diff --git a/.Rbuildignore b/.Rbuildignore
index b84fd208..5c2415db 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -1,6 +1,16 @@
 vignettes/jss835
 vignettes/RcppEigen-intro-nojss.*
 NEWS.org
-vignettes/jss.bst
+vignettes/rnw
 .travis.yml
 debian
+^.*\.Rproj$
+^\.Rproj\.user$
+.*\.tar\.gz$
+^patches
+^.editorconfig$
+^eigen-?\.?\.?
+^\.github
+^\.codecov.yml
+^\.covrignore
+^local
diff --git a/.codecov.yml b/.codecov.yml
new file mode 100644
index 00000000..9ec35288
--- /dev/null
+++ b/.codecov.yml
@@ -0,0 +1,17 @@
+comment: false
+coverage:
+  status:
+    project:
+      default:
+        target: 70%    # the (on purpose low) required coverage value
+        threshold: 2%  # the permitted delta in hitting the target
+    patch:
+      default:
+        target: 0%     # the (on purpose low) required coverage value
+
+#  layout: "header, diff, tree, changes"
+#  behavior: default
+#  require_changes: false  # if true: only post the comment if coverage changes
+#  branches: null
+#  flags: null
+#  paths: null
diff --git a/.covrignore b/.covrignore
new file mode 100644
index 00000000..3b88999b
--- /dev/null
+++ b/.covrignore
@@ -0,0 +1,2 @@
+inst/include/Eigen/*
+inst/include/unsupported/Eigen/*
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 00000000..9c5e0b52
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,23 @@
+# EditorConfig is awesome: http://EditorConfig.org
+
+# top-most EditorConfig file
+root = true
+
+# Unix-style newlines with a newline ending every file
+[*]
+end_of_line = lf
+insert_final_newline = true
+trim_trailing_whitespace = true
+
+# Matches multiple files with brace expansion notation
+# 4 space indentation
+[*.{c,cpp,h,hpp,R,r}]
+indent_style = space
+indent_size = 4
+
+# Tab indentation (no size specified)
+[Makefile]
+indent_style = tab
+
+[README.md]
+trim_trailing_whitespace = false
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 00000000..bcd742d9
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,43 @@
+# Run CI for R using https://eddelbuettel.github.io/r-ci/
+
+name: ci
+
+on:
+  push:
+  pull_request:
+
+env:
+  _R_CHECK_FORCE_SUGGESTS_: "false"
+
+jobs:
+  ci:
+    strategy:
+      matrix:
+        include:
+          - {os: macOS-latest}
+          - {os: ubuntu-latest}
+
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v6
+
+      - name: Setup
+        uses: eddelbuettel/github-actions/r-ci@master
+
+      - name: Dependencies
+        run: ./run.sh install_all
+
+      - name: Test
+        run: ./run.sh run_tests
+
+      - name: Verify OpenMP on macOS
+        if: ${{ matrix.os == 'macOS-latest' }}
+        run: R CMD INSTALL . && Rscript -e 'RcppEigen::EigenNbThreads()'
+
+      - name: Coverage
+        if: ${{ matrix.os == 'ubuntu-latest' }}
+        env:
+          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+        run: ./run.sh coverage
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..56843bca
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+.Rproj.user
+.Rhistory
+.RData
+.Ruserdata
+src/*.o
+src/*.so
+src/*.dll
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 0a0d626d..00000000
--- a/.travis.yml
+++ /dev/null
@@ -1,35 +0,0 @@
-# Sample .travis.yml for R projects.
-#
-# See https://github.com/craigcitro/r-travis
-#     https://github.com/eddelbuettel/r-travis/
-
-language: c
-
-env:
-  global:
-    - R_BUILD_ARGS="--no-build-vignettes --no-manual"
-    - R_CHECK_ARGS="--no-build-vignettes --no-manual --as-cran"
-
-script: 
-  - ./travis-tool.sh run_tests
-
-before_install:
-  - curl -OL http://raw.github.com/eddelbuettel/r-travis/master/scripts/travis-tool.sh
-  - chmod 755 ./travis-tool.sh
-  - ./travis-tool.sh bootstrap
-  - sudo add-apt-repository -y ppa:edd/misc
-  - sudo apt-get update -q
-
-install:
-  - ./travis-tool.sh install_aptget r-cran-rcpp r-cran-matrix r-cran-inline r-cran-runit r-cran-pkgkitten
-# Note: if Rcpp from Github is needed, use following line and remove Rcpp from previous line
-#  - ./travis-tool.sh install_github RcppCore/Rcpp
-
-after_failure:
-  - ./travis-tool.sh dump_logs
-
-notifications:
-  email:
-    on_success: change
-    on_failure: change
-
diff --git a/ChangeLog b/ChangeLog
index 5d551bbf..96656e32 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,639 @@
+2026-06-04  Dirk Eddelbuettel  <edd@debian.org>
+
+	* inst/include/Eigen/: Sync with upstream Eigen 5.0.1
+	* inst/include/unsupported/Eigen/: Idem
+
+	* inst/include/Eigen/CholmodSupport: Apply previous patch
+	* inst/include/Eigen/src/CholmodSupport/CholmodSupport.h: Idem
+	* inst/include/Eigen/src/Core/util/DisableStupidWarnings.h: Idem
+	* inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h: Idem
+	* inst/include/Eigen/src/Core/arch/AltiVec/PacketMath.h: New one-line patch
+
+2026-05-03  Dirk Eddelbuettel  <edd@debian.org>
+
+	* vignettes/rnw/RcppEigen-Introduction.Rnw: Moved, also update three
+	URLs to https
+	* vignettes/rmw/Makefile: Added to ensure proper bibtex processing
+	* vignettes/RcppEigen-Introduction.pdf: Use precompiled pdf
+	* vignettes/RcppEigen-Introduction.pdf.asis: Added
+	* DESCRIPTION (VignetteBuilder): Added Rcpp
+	* .Rbuildignore: Exclude vignettes/rnw
+
+2026-01-15  Dirk Eddelbuettel  <edd@debian.org>
+
+	* .github/workflows/ci.yaml: Switch to actions/checkout@v6
+
+2026-01-07  Dirk Eddelbuettel  <edd@debian.org>
+
+	* README.md: Another pair of URL updates for Eigen
+	* man/fastLm.Rd: Idem
+
+2025-12-30  Dirk Eddelbuettel  <edd@debian.org>
+
+	* inst/tinytest/test_misc.R: Additional tests predicate
+
+	* R/init.R: Some additional #nocov tags
+
+2025-12-29  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION: Roll micro release and date
+
+2025-12-28  Dirk Eddelbuettel  <edd@debian.org>
+
+	* configure.ac: Detect OpenMP compile and link options
+	* configure.: Idem
+	* R/init.R: Set multithreading core count on startup, show startup
+	message, add two helper accessor functions
+	* src/Makevars.in: Support OpenMP use for multithreading
+	* src/Makevars.win: Idem
+	* src/RcppEigen.cpp: Add thread setter, add typed version getter,
+	edits to old version getter
+	* src/RcppExport.cpp: Export new compiled functions
+	* R/RcppExport.R: Idem
+	* NAMESPACE: Export several helper functions
+	* man/RcppEigen-package.Rd: Document thread count setting options
+	* man/RcppEigen_throttle_cores.Rd: Document additional helpers
+	* cleanup: Minor extension
+
+2025-12-26  Dirk Eddelbuettel  <edd@debian.org>
+
+	* inst/include/Eigen/src/CholmodSupport/CholmodSupport.h: Comment-out
+	another SuiteSparse reference
+
+	* patches/eigen-5.0.0.diff: Updated against git commit of Eigen 5.0.0 sync
+
+2025-12-25  Dirk Eddelbuettel  <edd@debian.org>
+
+	* inst/include/Eigen/: Sync with upstream Eigen 5.0.0 (aka 3.5.0)
+	* inst/include/unsupported/Eigen/: Idem
+
+	* patches/eigen-5.0.0.diff: Summary of changes applied
+
+	* inst/include/RcppEigenForward.h: No longer use deprecated type
+	* inst/include/RcppEigenWrap.h: Idem
+	* inst/tinytest/cpp/rcppeigen.cpp: Adjust test to non-deprecated map
+	* inst/tinytest/cpp/sparse.cpp: Skip test of deprecated type
+	* inst/tinytest/test_sparse.R: Idem
+
+	* src/fastLm.cpp (SVD::SVD): Update to current form of option setting
+
+2025-08-26  Dirk Eddelbuettel  <edd@debian.org>
+
+	* inst/tinytest/test_misc.R: Add minimal test for EigenNbThreads()
+
+	* inst/include/RcppEigenWrap.h: Add #nocov comments
+
+2025-08-23  Dirk Eddelbuettel  <edd@debian.org>
+
+	* .github/workflows/ci.yaml (jobs): Re-enable coverage
+
+2025-04-04  Dirk Eddelbuettel  <edd@debian.org>
+
+	* .github/workflows/ci.yaml: Switch to r-ci with included bootstrap
+
+2024-08-23  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Version, Date): CRAN Release 0.3.4.0.2
+	* inst/NEWS.Rd: Release 0.3.4.0.2
+
+	* DESCRIPTION (Authors@R): Correct two ORCID tag typos
+
+2024-08-14  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Version, Date): CRAN Release 0.3.4.0.1
+	* inst/NEWS.Rd: Release 0.3.4.0.1
+
+2024-08-13  Dirk Eddelbuettel  <edd@debian.org>
+
+	* inst/include/Eigen/src/misc/blas.h: Conditionally comment out
+	xerbla which is now also provided by r-devel (CRAN request)
+
+2024-08-12  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Authors@R): Added, with ORCID IDs
+
+2024-05-16  Dirk Eddelbuettel  <edd@debian.org>
+
+	* README.md: Use tinyverse.netlify.app for dependency badge
+
+2024-03-01  Dirk Eddelbuettel  <edd@debian.org>
+
+	* .github/workflows/ci.yaml (jobs): Update to actions/checkout@v4,
+	add r-ci-setup actions
+
+2024-02-28  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Version, Date): Release 0.3.4.0.0
+
+2024-02-12  Tomas Kalibera  <tomas.kalibera@gmail.com>
+
+	* inst/include/unsupported/Eigen/CXX11/src/ThreadPool/ThreadYield.h:
+	Support clang on Windows by including 'sched.h' header
+
+2024-01-17  Yixuan Qiu  <yixuan.qiu@cos.name>
+
+	* DESCRIPTION (Version, Date): Release candidate 0.3.3.99.0
+
+	* inst/include/Eigen: Upgraded to Eigen 3.4.0
+	* inst/include/unsupported/Eigen: Idem
+	* patches/eigen-3.4.0.diff: Carried local CRAN patches forward
+	* patches/howToDiff.md: Idem
+
+2023-11-01  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Version, Date): CRAN Release 0.3.3.9.4
+	* inst/NEWS.Rd: Release 0.3.3.9.4
+
+2023-10-05  Mikael Jagan  <jaganmn@mcmaster.ca>
+
+	* DESCRIPTION: Package 'Matrix' is now only a Suggests:
+	* NAMESPACE: Remove unconditional imports from package Matrix
+
+	* inst/include/Eigen/CholmodSupport: No longer need to include
+	RcppEigenCholmod.h
+	* inst/include/Eigen/src/CholmodSupport/CholmodSupport.h: Small
+	wrapper adjustments conditional on Matrix use
+	* inst/include/RcppEigenCholmod.h: Updated
+	* inst/include/RcppEigenForward.h: Simplified
+	* inst/include/RcppEigenWrap.h: Ditto
+	* inst/include/RcppEigenStubs.cpp: New shorter helper
+	* inst/include/RcppEigenStubs.h: Removed
+
+2023-07-21  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Date, Version): Roll micro version and date
+
+	* README.md: Add r-universe badge
+
+2023-07-20  Dirk Eddelbuettel  <edd@debian.org>
+
+	* src/RcppEigen.cpp (EigenNbThreads): Add simple threads reporter
+
+	* R/fastLm.R (fastLmPure): Simpler call to `fastLm_Impl()`
+
+	* src/init.c: Replaced by auto-generated section in RcppExports.cpp
+	* src/RcppExports.cpp: Regenerated
+	* R/RcppExports.R: Idem
+
+	* src/Makevars: Document possible use of '-fopenmp'
+	* src/Makevars.win: Idem
+
+2023-04-18  Dirk Eddelbuettel  <edd@debian.org>
+
+	* README.md: Use app.codecov.io as base for codecov link
+
+2023-03-10  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Date, Version): Roll minor version
+
+	* R/RcppEigen.package.skeleton.R: No longer set Imports: RcppEigen in
+	DESCRIPTION and NAMESPACE
+
+2023-02-12  Dirk Eddelbuettel  <edd@debian.org>
+
+	* inst/CITATION: Convert to bibentry() style with person()
+
+2022-11-04  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Version, Date): CRAN Release 0.3.3.9.3
+	* inst/NEWS.Rd: Release 0.3.3.9.3
+
+	* src/init.c: Add 'void' for proper prototype pleasing clang-15
+
+	* R/fastLm.R (summary.fastLm,print.fastLm): Refer to correct and full
+	variable name df.residual in the returned object
+
+	* .github/workflows/ci.yaml (jobs): Update to actions/checkout@v3
+
+2022-09-15  Jonah Gabry  <jgabry@gmail.com>
+
+	* inst/skeleton/rcppeigen_hello_world.cpp: Correct typo
+
+2022-04-08  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Depends): Add a versioned dependency on R 3.6.0 or
+	later because of our use of FCONE to support USE_FC_LEN_T
+
+2022-04-05  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Version, Date): CRAN Release 0.3.3.9.2
+	* inst/NEWS.Rd: Release 0.3.3.9.2
+
+2022-04-04  Dirk Eddelbuettel  <edd@debian.org>
+
+	* src/fastLm.cpp: Add FCONE in two calls for improved Fortran and C
+	character interface per Writing R Extensions, Section 6.6.1
+
+2022-01-16  Dirk Eddelbuettel  <edd@debian.org>
+
+	* inst/tinytest/test_wrap.R: Added (optional) large memory wrap tests
+ 	* inst/tinytest/cpp/wrap.cpp: Added C++ part of test
+
+	* .codecov.yml: Added to not trigger PR fail for small additions
+
+2022-01-16  Mikael Jagan  <jaganmn@mcmaster.ca>
+
+	* inst/include/RcppEigenWrap.h: Refine use plain dense wrap() change
+
+2022-01-15  Mikael Jagan  <jaganmn@mcmaster.ca>
+
+	* inst/include/RcppEigenWrap.h: Use R_xlen_t for vectors rows + cols
+
+2021-12-29  Dirk Eddelbuettel  <edd@debian.org>
+
+	* README.md: Add total downloads badge
+
+2021-12-08  Dirk Eddelbuettel  <edd@debian.org>
+
+	* README.md: Remove unused continuous integration artifact and badge
+
+2021-10-13  Dirk Eddelbuettel  <edd@debian.org>
+
+	* inst/CITATION: Refinment of doi use
+
+2021-10-10  Dirk Eddelbuettel  <edd@debian.org>
+
+	* inst/CITATION: Switch JSS url to doi form per JSS request
+	* README.md: Idem
+
+2021-07-19  Dirk Eddelbuettel  <edd@debian.org>
+
+	* inst/include/RcppEigenWrap.h: Two more #nocov tags
+	* src/fastLm.cpp: One more #nocov tag
+
+2021-07-18  Dirk Eddelbuettel  <edd@debian.org>
+
+	* inst/tinytest/test_fastLm.R: Add tests for summary
+	* src/fastLm.cpp: Add a few #nocov tags
+
+2021-07-17  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Date, Version): Roll minor version
+
+	* R/RcppEigen.package.skeleton.R (RcppEigen.package.skeleton): Also
+	import RcppEigen in DESCRIPTION
+
+2021-06-07  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Date, Version): Roll minor version
+
+	* inst/tinytest/test_misc.R: New test file
+	* inst/tinytest/test_fastLm.R: Added tests
+
+	* R/fastLm.R: Add single one-line nocov tag
+
+	* .Rbuildignore: Add .covignore and .codecov.yml
+
+2021-06-06  Dirk Eddelbuettel  <edd@debian.org>
+
+	* README.md: Added coverage badge
+
+	* .github/workflows/ci.yaml (jobs): Turn on coverage
+	* .covrignore: Added
+	* .codecov.yml (ignore): Idem
+
+	* R/RcppEigen.package.skeleton.R: Set nocov
+
+2021-05-09  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (URL): Add GitHub repo to URL field
+
+2021-01-02  Dirk Eddelbuettel  <edd@debian.org>
+
+	* R/RcppEigen.package.skeleton.R: Wrap any() around grepl()
+
+2020-12-25  Dirk Eddelbuettel  <edd@debian.org>
+
+	* .github/workflows/ci.yaml: Small tweaks to CI YAML file
+
+2020-12-17  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Version, Date): CRAN Release 0.3.3.9.1 (following
+	coordinated update of reverse dependency StanHeaders)
+	* inst/NEWS.Rd: Release 0.3.3.9.1
+
+2020-12-14  Dirk Eddelbuettel  <edd@debian.org>
+
+	* .github/workflows/ci.yaml: Add CI runner using r-ci
+	* README.md: Add new badge
+
+2020-12-05  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Version, Date): Release 0.3.3.9.0
+
+	* inst/include/Eigen: Upgraded to Eigen 3.3.9
+	* inst/include/unsupported/Eigen: Idem
+	* patches/eigen-3.3.9.diff: Carried local CRAN patches forward
+
+	* .travis.yml: Switch to r-ci using focal and bspm
+
+	* README.md: Updated URLs to https and/or redirect location
+	* inst/CITATION: Idem
+	* man/RcppEigen-package.Rd: Idem
+	* man/fastLm.Rd: Idem
+
+2020-08-16  Dirk Eddelbuettel  <edd@debian.org>
+
+	* README.md: Add JSS badge
+
+2020-03-29  Dirk Eddelbuettel  <edd@debian.org>
+
+	* README.md: Added commit badge, edited
+
+2020-01-22  Dirk Eddelbuettel  <edd@debian.org>
+
+	* README.md: README.md: Add a Debian badge
+
+2019-11-16  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Version, Date): Release 0.3.3.7.0
+
+	* inst/include/Eigen: Upgraded to Eigen 3.3.7
+	* inst/include/unsupported/Eigen: Idem
+	* patches/eigen-3.3.7.diff: Carried local CRAN patches forward
+
+2019-11-01  Dirk Eddelbuettel  <edd@debian.org>
+
+	* R/unit.test.R (compile_unit_tests): Removed as no longer needed
+
+2019-10-31  Dirk Eddelbuettel  <edd@debian.org>
+
+	* inst/tinytest/test_transform.R: Switch from RUnit to tinytest
+	* inst/tinytest/test_wrap.R: Idem
+
+	* inst/tinytest/cpp/transform.cpp: Added using Rcpp Attributes
+	* inst/tinytest/cpp/wrap.cpp: Idem
+
+	* inst/tinytest/cpp/sparse.cpp: More idiomatic Eigen code
+	* inst/tinytest/test_sparse.R: Idem
+
+	* .editorconfig: Added
+
+2019-10-30  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Suggests): Switch from RUnit to tinytest
+	* .travis.yml (install): Ditto
+
+	* tests/tinytest.R: Converted to tinytest
+
+        * test_sparse.R: Converted to tinytest
+	* cpp/sparse.cpp: Added using Rcpp Attributes
+
+2019-10-29  Dirk Eddelbuettel  <edd@debian.org>
+
+        * test_fastLm.R: Converted to tinytest
+        * test_RcppEigen.R: Idem
+        * test_solution.R: Idem
+
+	* cpp/rcppeigen.cpp: Added using Rcpp Attributes
+        * cpp/solution.cpp: Idem
+
+2019-10-28  Dirk Eddelbuettel  <edd@debian.org>
+
+	* tests/tinytest.R: Renamed from tests/doRUnit.R
+	* inst/tinytest/test_fastLm.R: Renamed from inst/unitTests/runit*
+	* inst/tinytest/test_RcppEigen.R: Idem
+	* inst/tinytest/test_solution.R: Idem
+	* inst/tinytest/test_sparse.R: Idem
+	* inst/tinytest/test_transform.R: Idem
+	* inst/tinytest/test_wrap.R: Idem
+
+2019-10-13  Dirk Eddelbuettel  <edd@debian.org>
+
+	* README.md: Added CRAN + BioConductor badges for reverse depends,
+	add StackOverflow badge to recommend help searches under 'Rcpp'
+
+	* DESCRIPTION (Version, Date): Roll minor version
+
+	* R/RcppEigen.package.skeleton.R (RcppEigen.package.skeleton): Test for
+	example_code outside of haveKitten test; remove force argument if unused
+
+2019-05-24  Dirk Eddelbuettel  <edd@debian.org>
+
+	* vignettes/RcppEigen-Introduction.Rnw: Update vignette to use
+	RcppEigen:::eigen_version() instead of .Call()
+
+2019-03-29  Dirk Eddelbuettel  <edd@debian.org>
+
+	* inst/skeleton/rcppeigen_hello_world.cpp: Rework first example to
+	not rely on Eigen RNG which R CMD check would complain about.
+
+2019-02-16  James Joseph Balamuta <balamut2@illinois.edu>
+
+	* R/runit.RcppEigen.R: Removed listing RcppEigen in Imports during
+	skeleton package creation.
+
+2018-11-24  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Version, Date): Release 0.3.3.5.0
+
+2018-11-23  Dirk Eddelbuettel  <edd@debian.org>
+
+	* inst/include/Eigen: Updated to release 3.3.5
+	* inst/include/unsupported: Idem
+	* patches/eigen-3.3.5.diff: Diff of local patches
+
+2018-09-02  Dirk Eddelbuettel  <edd@debian.org>
+
+	* .travis.yml: Switch Travis CI to R 3.5 repo
+
+2018-05-30  Michael Weylandt  <michael.weylandt@gmail.com>
+
+	* inst/examples/lmBenchmark.R: Update benchmark script to use
+	microbenchmark and to use exposed fastLm functions from Rcpp
+	packages rather than invoking .Call directly
+
+2018-05-25  Ralf Stubner  <ralf.stubner@daqana.com>
+
+	* inst/include/RcppEigenWrap.h: Use Rf_xlength and R_xlen_t to
+	support long vectors
+
+2018-02-07  Dirk Eddelbuettel  <edd@debian.org>
+
+	* patches/eigen-3.3.4.diff: Diff to upstream Eigen 3.3.4,
+	cherry-picked in the 3.3.4 branch of Yixuan
+
+2018-02-05  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Version, Date): Release 0.3.3.4.0
+
+	* inst/include/Eigen/src/Core/arch/CUDA/Half.h (Eigen): Condition use
+	of long long and unsigned long on C++11
+
+	* inst/include/Eigen/src/Core/arch/SSE/Complex.h (Eigen): Comment-out
+	use of diagnostic-suppressing pragma for gcc + clang to satisfy CRAN
+	* inst/include/Eigen/src/Core/util/DisableStupidWarnings.h: Idem
+
+2018-02-04  Yixuan Qiu  <yixuan.qiu@cos.name>
+
+	[ In RcppEigen 0.3.3.* branch ]
+	* inst/include/Eigen: Updated to the upstream 3.3 branch of Eigen
+	based on version 3.3.4
+	* inst/include/unsupported: Idem
+	* DESCRIPTION: Idem
+	* README.md: Idem
+
+	* inst/include/Eigen/src/Core/util/DisableStupidWarnings.h:
+	Patch from upstream Eigen that has not been ported to the 3.3 branch
+	(cf GitHub issue #48)
+
+2017-11-19  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Version, Date): Release 0.3.3.3.1
+
+	* R/inline.R: Use '::' not ':::' for Rcpp.plugin.maker
+
+2017-08-26  Dirk Eddelbuettel  <edd@debian.org>
+
+	* .travis.yml (before_install): Use https for curl fetch
+
+2017-06-06  Yu Gong  <armgong@yahoo.com>
+
+	* inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h:
+	Also permit compilation under Haiku-OS
+
+2017-05-28  Dirk Eddelbuettel  <edd@debian.org>
+
+	* inst/examples/lmBenchmark.R (do_bench): Remove spurious argument in
+	call to RcppEigen:::Eigen_SSE() (cf github issue #44)
+
+2017-04-29  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION: Release 0.3.3.3.0
+
+	[ which again owes a very big thank you to Yixuan for doing the work! ]
+
+2017-04-27  Dirk Eddelbuettel  <edd@debian.org>
+
+	* .travis.yml: Switch to using run.sh for Travis CI
+
+2017-04-27  Yixuan Qiu  <yixuan.qiu@cos.name>
+
+	[ In RcppEigen 0.3.3.* branch ]
+
+	* inst/include/Eigen/src/Core/arch/CUDA/Half.h: Fixed compiler warning
+	on 'long long' type in C++ 98 mode
+
+2017-03-14  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Version, Date): Release 0.3.2.9.1
+
+	* src/init.c (R_init_RcppEigen): Call R_registerRoutines()
+	and R_useDynamicSymbols()
+
+	* NAMESPACE: Use .registration=TRUE on useDynLib
+
+	* R/fastLm.R (fastLmPure): Remove erroneous fourth argument from .Call
+
+2017-03-13  Martin Maechler  <maechler@r-project.org>
+
+	* inst/include/RcppEigenCholmod.h: Synchronize with Matrix package
+
+2017-02-21  Yixuan Qiu  <yixuan.qiu@cos.name>
+
+	[ In RcppEigen 0.3.3.* branch ]
+
+	* inst/include/Eigen: Updated to release 3.3.3 of Eigen
+	* inst/include/unsupported: Idem
+	* DESCRIPTION: Idem
+	* README.md: Idem
+
+2017-01-20  Yixuan Qiu  <yixuan.qiu@cos.name>
+
+	[ In RcppEigen 0.3.3.* branch ]
+
+	* inst/include/Eigen: Updated to release 3.3.2 of Eigen
+	* inst/include/unsupported: Idem
+	* DESCRIPTION: Idem
+	* README.md: Idem
+
+	* inst/unitTests/runit.RcppEigen.R, inst/unitTests/runit.sparse.R:
+	Explicitly convert matrix size to `double` type such that Rcpp can
+	properly return the value to R, thanks to ChingChuan and Dirk
+
+2017-01-20  ChingChuan Chen  <zw12356@gmail.com>
+
+	[ In RcppEigen 0.3.3.* branch ]
+
+	* inst/include/Eigen: Updated to release 3.3.1 of Eigen
+	* inst/include/unsupported: Idem
+	* DESCRIPTION: Idem
+
+	* inst/examples/lmBenchmark.R: Fixed function names
+
+2016-12-22  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (URL, BugReports): Added / updated
+
+2016-11-12  Yixuan Qiu  <yixuan.qiu@cos.name>
+
+	[ In RcppEigen 0.3.3.* branch ]
+
+	* inst/include/Eigen: Updated to release 3.3.0 of Eigen
+	* inst/include/unsupported: Idem
+	* DESCRIPTION: Idem
+	* README.md: Idem
+
+	* inst/include/RcppEigenForward.h, inst/include/RcppEigenWrap.h:
+	Added exporters for the new Map<SparseMatrix<T> > type
+	* inst/unitTests/runit.sparse.R: Unit tests for the new type
+
+2016-08-20  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION: Release 0.3.2.9.0 with big thanks to Yixuan for doing
+	the work!
+
+2016-08-19  Yixuan Qiu  <yixuan.qiu@cos.name>
+
+	* inst/include/Eigen: Updated to release 3.2.9 of Eigen
+
+	* README.md: Updated version number and fixed the NOTE from CRAN URL
+	check
+
+2016-04-30  Dirk Eddelbuettel  <edd@debian.org>
+
+	* README.md: Expanded
+
+2016-04-28  James Joseph Balamuta <balamut2@illinois.edu>
+
+	* inst/include/RcppEigenWrap.h: Added an exporter class for
+	Map::RowVectorX<t> per http://stackoverflow.com/questions/36920689/
+	* inst/include/unitTests/runit.RcppEigen.R: Added row exporter unit
+	test.
+
+2016-02-29  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Version): Release 0.3.2.8.1
+	* inst/NEWS.Rd: Release 0.3.2.8.1
+	* debian/*: Changes for Debian release of 0.3.2.8.1
+
+2016-02-28  Yixuan Qiu  <yixuan.qiu@cos.name>
+
+	* inst/include/Eigen/src/SparseCore/CompressedStorage.h,
+	  inst/include/Eigen/src/SparseCore/SparseBlock.h,
+	  inst/include/Eigen/src/SparseCore/SparseMatrix.h,
+	  inst/include/Eigen/src/SparseCore/SparseRedux.h,
+	  inst/include/Eigen/src/SparseCore/SparseVector.h:
+	Another patch from upstream to fix UBSAN null pointer errors
+
+2016-02-23  Dirk Eddelbuettel  <edd@debian.org>
+
+	* DESCRIPTION (Version): Release 0.3.2.8.0
+	* DESCRIPTION (Author): Added Yixuan Qiu
+
+	* inst/NEWS.Rd: Release 0.3.2.8.0
+
+2016-02-22  Yixuan Qiu  <yixuan.qiu@cos.name>
+
+	* inst/include/Eigen: Updated to release 3.2.8 of Eigen
+	* inst/include/unsupported: Idem
+	* DESCRIPTION: Idem
+	* README.md: Idem
+
+	* inst/include/Eigen/src/SparseCore/CompressedStorage.h,
+	inst/include/Eigen/src/Core/util/Memory.h: Applied patch from
+	upstream to fix UBSAN errors
+
 2016-01-20  Dirk Eddelbuettel  <edd@debian.org>
 
 	* DESCRIPTION: Release 0.3.2.7.0
diff --git a/DESCRIPTION b/DESCRIPTION
index bd6c00fe..cf12c938 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,32 +1,39 @@
 Package: RcppEigen
 Type: Package
 Title: 'Rcpp' Integration for the 'Eigen' Templated Linear Algebra Library
-Version: 0.3.2.7.0
-Date: 2016-01-19
-Author: Douglas Bates, Dirk Eddelbuettel and Romain Francois;
- the authors of Eigen for the included version of Eigen
-Maintainer: Dirk Eddelbuettel <edd@debian.org>
+Version: 0.4.9.9-2
+Date: 2026-06-04
+Authors@R: c(person("Doug", "Bates", role = "aut",
+                    comment = c(ORCID = "0000-0001-8316-9503")),
+             person("Dirk", "Eddelbuettel", role = c("aut", "cre"), email = "edd@debian.org",
+                    comment = c(ORCID = "0000-0001-6419-907X")),
+             person("Romain", "Francois", role = "aut",
+                    comment = c(ORCID = "0000-0002-2444-4226")),
+             person("Yixuan", "Qiu", role = "aut",
+                    comment = c(ORCID = "0000-0003-0109-6692")),
+             person("Authors of", "Eigen", role = "cph",
+                    comment = "Authorship and copyright in included Eigen library as detailed in inst/COPYRIGHTS"))
 Copyright: See the file COPYRIGHTS for various Eigen copyright details
 Description: R and 'Eigen' integration using 'Rcpp'.
- 'Eigen' is a C++ template library for linear algebra: matrices,
- vectors, numerical solvers and related algorithms.  It supports dense
- and sparse matrices on integer, floating point and complex numbers,
- decompositions of such matrices, and solutions of linear systems. Its
- performance on many algorithms is comparable with some of the best
- implementations based on 'Lapack' and level-3 'BLAS'.
- .
- The 'RcppEigen' package includes the header files from the 'Eigen' C++
- template library (currently version 3.2.7). Thus users do not need to
- install 'Eigen' itself in order to use 'RcppEigen'.
- .
- Since version 3.1.1, 'Eigen' is licensed under the Mozilla Public License
- (version 2); earlier version were licensed under the GNU LGPL version 3 or
- later. 'RcppEigen' (the 'Rcpp' bindings/bridge to 'Eigen') is licensed under
- the GNU GPL version 2 or later, as is the rest of 'Rcpp'.
+ 'Eigen' is a C++ template library for linear algebra: matrices, vectors,
+ numerical solvers and related algorithms.  It supports dense and sparse
+ matrices on integer, floating point and complex numbers, decompositions of
+ such matrices, and solutions of linear systems. Its performance on many
+ algorithms is comparable with some of the best implementations based on
+ 'Lapack' and level-3 'BLAS'. The 'RcppEigen' package includes the header
+ files from the 'Eigen' C++ template library. Thus users do not need to
+ install 'Eigen' itself in order to use 'RcppEigen'. Since version 3.1.1,
+ 'Eigen' is licensed under the Mozilla Public License (version 2); earlier
+ version were licensed under the GNU LGPL version 3 or later. 'RcppEigen'
+ (the 'Rcpp' bindings/bridge to 'Eigen') is licensed under the GNU GPL
+ version 2 or later, as is the rest of 'Rcpp'.
 License: GPL (>= 2) | file LICENSE
-Depends: R (>= 2.15.1)
 LazyLoad: yes
+Depends: R (>= 3.6.0)
 LinkingTo: Rcpp
-Imports: Matrix (>= 1.1-0), Rcpp (>= 0.11.0), stats, utils
-Suggests: inline, RUnit, pkgKitten
-URL: http://eigen.tuxfamily.org
+Imports: Rcpp (>= 0.11.0), stats, utils
+Suggests: Matrix, inline, tinytest, pkgKitten, microbenchmark
+URL: https://github.com/RcppCore/RcppEigen, https://dirk.eddelbuettel.com/code/rcpp.eigen.html
+BugReports: https://github.com/RcppCore/RcppEigen/issues
+RoxygenNote: 6.0.1
+VignetteBuilder: Rcpp
diff --git a/NAMESPACE b/NAMESPACE
index a1f4c4f5..1270ac43 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,18 +1,20 @@
-useDynLib(RcppEigen)
+useDynLib("RcppEigen", .registration=TRUE)
 
-importClassesFrom("Matrix", "dgCMatrix", "dgeMatrix", "dsCMatrix", "dtCMatrix")
-importFrom(Rcpp, "evalCpp")
-importFrom(utils, "packageDescription", "package.skeleton")
-importFrom(stats, "model.frame", "model.matrix", "model.response", "fitted", "coef", "printCoefmat", "pt")
-#exportPattern("^[[:alpha:]]+")
-export(fastLm,
-       fastLmPure,
-       RcppEigen.package.skeleton
+importFrom("Rcpp", "evalCpp")
+importFrom("utils", "packageDescription", "package.skeleton", "packageVersion")
+importFrom("stats", "model.frame", "model.matrix", "model.response", "fitted", "coef", "printCoefmat", "pt", "na.omit")
+export("fastLm",
+       "fastLmPure",
+       "RcppEigen.package.skeleton",
+       "EigenNbThreads",
+       "EigenSetNbThreads",
+       "RcppEigen_throttle_cores",
+       "RcppEigen_reset_cores"
        )
 
-S3method(fastLm, default)
-S3method(fastLm, formula)
-S3method(predict, fastLm)
-S3method(print, fastLm)
-S3method(summary, fastLm)
-S3method(print, summary.fastLm)
+S3method("fastLm", "default")
+S3method("fastLm", "formula")
+S3method("predict", "fastLm")
+S3method("print", "fastLm")
+S3method("summary", "fastLm")
+S3method("print", "summary.fastLm")
diff --git a/R/RcppEigen.package.skeleton.R b/R/RcppEigen.package.skeleton.R
index acef346d..184919b2 100644
--- a/R/RcppEigen.package.skeleton.R
+++ b/R/RcppEigen.package.skeleton.R
@@ -1,6 +1,6 @@
 ## RcppEigen.package.skeleton.R: makes a skeleton for a package that wants to use RcppEigen
 ##
-## Copyright (C) 2011 - 2015  Douglas Bates, Dirk Eddelbuettel and Romain Francois
+## Copyright (C) 2011 - 2023  Douglas Bates, Dirk Eddelbuettel and Romain Francois
 ##
 ## This file is part of RcppEigen.
 ##
@@ -19,12 +19,12 @@
 
 RcppEigen.package.skeleton <- function(name= "anRpackage", list = character(),
                                        environment = .GlobalEnv,
-                                       path = ".", force = FALSE, 
-                                       code_files = character(), 
+                                       path = ".", force = FALSE,
+                                       code_files = character(),
                                        example_code = TRUE) {
-	
-    env <- parent.frame(1)
-	
+
+    env <- parent.frame(1)              # #nocov start
+
     if (!length(list)) {
         fake <- TRUE
         assign("Rcpp.fake.fun", function() {}, envir = env)
@@ -40,51 +40,53 @@ RcppEigen.package.skeleton <- function(name= "anRpackage", list = character(),
     ## first let the traditional version do its business
     call <- match.call()
     call[[1]] <- skelFunUsed
+    if ("example_code" %in% names(call)) {
+        call[["example_code"]] <- NULL    # remove the example_code argument
+    }
     if (! haveKitten) {                 # in the package.skeleton() case
-        if ("example_code" %in% names(call)) {
-            call[["example_code"]] <- NULL    # remove the example_code argument
-        }
         if (fake) {
             call[["list"]] <- "Rcpp.fake.fun"
         }
+    } else {
+        if (force) {
+            call[["force"]] <- NULL
+        }
     }
-	
+
     tryCatch(eval(call, envir = env),
              error = function(e) {
                  cat(paste(e, "\n")) # print error
                  stop(paste("error while calling `", skelFunName, "`", sep=""))
              })
-	
+
     message("\nAdding RcppEigen settings")
-	
-    ## now pick things up 
+
+    ## now pick things up
     root <- file.path(path, name)
-	
+
     ## Add Rcpp to the DESCRIPTION
     DESCRIPTION <- file.path(root, "DESCRIPTION")
     if (file.exists(DESCRIPTION)) {
-        x <- cbind(read.dcf(DESCRIPTION), 
-                   "Imports" = sprintf("Rcpp (>= %s), RcppEigen (>= %s)", 
-                   packageDescription("Rcpp")[["Version"]], 
-                   packageDescription("RcppEigen")[["Version"]]), 
+        x <- cbind(read.dcf(DESCRIPTION),
+                   "Imports" = sprintf("Rcpp (>= %s)",
+                   packageDescription("Rcpp")[["Version"]]),
                    "LinkingTo" = "Rcpp, RcppEigen")
         write.dcf(x, file = DESCRIPTION)
-        message(" >> added Imports: Rcpp, RcppEigen")
+        message(" >> added Imports: Rcpp")
         message(" >> added LinkingTo: Rcpp, RcppEigen")
     }
-	
-    ## add a useDynLib to NAMESPACE, 
+
+    ## add a useDynLib to NAMESPACE,
     NAMESPACE <- file.path(root, "NAMESPACE")
     lines <- readLines(NAMESPACE)
-    if (! grepl("useDynLib", lines)) {
+    if (!any(grepl("useDynLib", lines))) {
         lines <- c(sprintf("useDynLib(%s)", name),
-                   "import(RcppEigen)",
                    "importFrom(Rcpp, evalCpp)",        ## ensures Rcpp instantiation
                    lines)
         writeLines(lines, con = NAMESPACE)
         message(" >> added useDynLib and importFrom directives to NAMESPACE")
     }
-	
+
     ## lay things out in the src directory
     src <- file.path(root, "src")
     if (!file.exists(src)) {
@@ -100,13 +102,13 @@ RcppEigen.package.skeleton <- function(name= "anRpackage", list = character(),
         file.copy(file.path(skeleton, "Makevars"), Makevars)
         message(" >> added Makevars file with RcppEigen settings")
     }
-	
+
     Makevars.win <- file.path(src, "Makevars.win")
     if (!file.exists(Makevars.win)) {
         file.copy(file.path(skeleton, "Makevars.win"), Makevars.win)
         message(" >> added Makevars.win file with RcppEigen settings")
     }
-		
+
     if (example_code) {
         file.copy(file.path(skeleton, "rcppeigen_hello_world.cpp"), src)
         message(" >> added example src file using Eigen classes")
@@ -116,13 +118,12 @@ RcppEigen.package.skeleton <- function(name= "anRpackage", list = character(),
 	Rcpp::compileAttributes(root)
         message(" >> invoked Rcpp::compileAttributes to create wrappers")
     }
-    
+
     if (fake) {
         rm("Rcpp.fake.fun", envir = env)
         unlink(file.path(root, "R"  , "Rcpp.fake.fun.R"))
         unlink(file.path(root, "man", "Rcpp.fake.fun.Rd"))
     }
-	
-    invisible(NULL)
-}
 
+    invisible(NULL) 						# #nocov end
+}
diff --git a/R/RcppExports.R b/R/RcppExports.R
index 52a03303..27fba4ff 100644
--- a/R/RcppExports.R
+++ b/R/RcppExports.R
@@ -1,15 +1,29 @@
-# This file was generated by Rcpp::compileAttributes
+# Generated by using Rcpp::compileAttributes() -> do not edit by hand
 # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
-fastLm_Impl <- function(X, y, type) {
-    .Call('RcppEigen_fastLm_Impl', PACKAGE = 'RcppEigen', X, y, type)
+eigen_version <- function(single) {
+    .Call(`_RcppEigen_eigen_version`, single)
 }
 
-eigen_version <- function(single) {
-    .Call('RcppEigen_eigen_version', PACKAGE = 'RcppEigen', single)
+eigen_version_typed <- function() {
+    .Call(`_RcppEigen_eigen_version_typed`)
 }
 
 Eigen_SSE <- function() {
-    .Call('RcppEigen_Eigen_SSE', PACKAGE = 'RcppEigen')
+    .Call(`_RcppEigen_Eigen_SSE`)
+}
+
+#' @rdname RcppEigen_throttle_cores
+EigenNbThreads <- function() {
+    .Call(`_RcppEigen_EigenNbThreads`)
+}
+
+#' @rdname RcppEigen_throttle_cores
+EigenSetNbThreads <- function(n) {
+    invisible(.Call(`_RcppEigen_EigenSetNbThreads`, n))
+}
+
+fastLm_Impl <- function(X, y, type) {
+    .Call(`_RcppEigen_fastLm_Impl`, X, y, type)
 }
 
diff --git a/R/fastLm.R b/R/fastLm.R
index 2616465a..691e8e06 100644
--- a/R/fastLm.R
+++ b/R/fastLm.R
@@ -1,6 +1,6 @@
 ## fastLm.R: Rcpp/Eigen implementation of lm()
 ##
-## Copyright (C)  2011 - 2015  Douglas Bates, Dirk Eddelbuettel and Romain Francois
+## Copyright (C)  2011 - 2023  Douglas Bates, Dirk Eddelbuettel and Romain Francois
 ##
 ## This file is part of RcppEigen.
 ##
@@ -21,7 +21,7 @@ fastLmPure <- function(X, y, method = 0L) {
 
     stopifnot(is.matrix(X), is.numeric(y), NROW(y)==nrow(X))
 
-    .Call("RcppEigen_fastLm_Impl", X, y, method, colnames(X), PACKAGE="RcppEigen")
+    fastLm_Impl(X, y, method)
 }
 
 fastLm <- function(X, ...) UseMethod("fastLm")
@@ -54,7 +54,7 @@ summary.fastLm <- function(object, ...) {
     object$coefficients <- cbind(Estimate     = coef,
                                  "Std. Error" = se,
                                  "t value"    = tval,
-                                 "Pr(>|t|)"   = 2*pt(-abs(tval), df=object$df))
+                                 "Pr(>|t|)"   = 2*pt(-abs(tval), df=object$df.residual))
 
     ## cf src/stats/R/lm.R and case with no weights and an intercept
     f <- object$fitted.values
@@ -66,7 +66,7 @@ summary.fastLm <- function(object, ...) {
     object$r.squared <- mss/(mss + rss)
     df.int <- if (object$intercept) 1L else 0L
     n <- length(f)
-    rdf <- object$df
+    rdf <- object$df.residual
     object$adj.r.squared <- 1 - (1 - object$r.squared) * ((n - df.int)/rdf)
     class(object) <- "summary.fastLm"
     object
@@ -82,7 +82,7 @@ print.summary.fastLm <- function(x, ...) {
 
     printCoefmat(x$coefficients, P.values=TRUE, has.Pvalue=TRUE, ...)
     cat("\nResidual standard error: ", formatC(x$s, digits=digits), " on ",
-        formatC(x$df), " degrees of freedom\n", sep="")
+        formatC(x$df.residual), " degrees of freedom\n", sep="")
     cat("Multiple R-squared: ", formatC(x$r.squared, digits=digits),
         ",\tAdjusted R-squared: ",formatC(x$adj.r.squared, digits=digits),
         "\n", sep="")
@@ -109,7 +109,7 @@ predict.fastLm <- function(object, newdata=NULL, ...) {
         if (!is.null(object$formula)) {
             x <- model.matrix(object$formula, newdata)
         } else {
-            x <- newdata
+            x <- newdata			# #nocov
         }
         y <- as.vector(x %*% coef(object))
     }
diff --git a/R/init.R b/R/init.R
new file mode 100644
index 00000000..6d87356a
--- /dev/null
+++ b/R/init.R
@@ -0,0 +1,61 @@
+## init.R: Startup
+##
+## Copyright (C)  2025  Dirk Eddelbuettel
+##
+## This file is part of RcppEigen.
+##
+## RcppEigen is free software: you can redistribute it and/or modify it
+## under the terms of the GNU General Public License as published by
+## the Free Software Foundation, either version 2 of the License, or
+## (at your option) any later version.
+##
+## RcppEigen is distributed in the hope that it will be useful, but
+## WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with RcppEigen.  If not, see <http://www.gnu.org/licenses/>.
+
+.pkgenv <- new.env(parent=emptyenv())
+
+.onLoad <- function(libname, pkgname) {						# nocov start
+    ## simple fallback: 'Ncpus' (if set) or else all cpus seen by OpenMP
+    ncores <- getOption("Ncpus", EigenNbThreads())
+    ## consider OMP_THREAD_LIMIT (cf Writing R Extensions), gets NA if envvar unset
+    ompcores <- as.integer(Sys.getenv("OMP_THREAD_LIMIT"))
+    ## keep the smaller value, omitting NA
+    ncores <- min(na.omit(c(ncores, ompcores)))
+    .pkgenv[["nb_threads"]] <- ncores
+    RcppEigen_throttle_cores(ncores)
+}
+
+.onAttach <- function(libname, pkgname) {
+    if (interactive()) {
+        packageStartupMessage("RcppEigen ", packageVersion("RcppEigen"),
+                              " using ", .pkgenv[["nb_threads"]], " cores. See ",
+                              "'help(\"RcppEigen-package\")' for details.")
+    }
+}
+
+##' Throttle (or Reset) (Rcpp)Eigen Core Usage
+##'
+##' Helper functions to throttle use of cores by RcppEigen-internal code.
+##' On package load, the initial value is saved and used to reset the value.
+##' @param n Integer value of desired cores, default is the value set at package
+##' startup reflecting the smallest value among the total number of available
+##' cores (or one if compiled without OpenMP support), the value of option
+##' \code{Ncpus} and the value of environment variable \code{OMP_THREAD_LIMIT}.
+##' @return Only \code{EigenNbThreads()} returns a value, the current value of
+##' the number of cores used. The other functions are invoked for their side
+##' effect of affecting the count of cores used.
+##' @seealso \code{\link{RcppEigen-package}}
+RcppEigen_throttle_cores <- function(n) {
+    if (missing(n)) n <- .pkgenv[["nb_threads"]]
+    EigenSetNbThreads(n)
+}
+
+##' @rdname RcppEigen_throttle_cores
+RcppEigen_reset_cores <- function() {
+    EigenSetNbThreads(.pkgenv[["nb_threads"]])				# nocov end
+}
diff --git a/R/inline.R b/R/inline.R
index 3d28eb96..dcf2f755 100644
--- a/R/inline.R
+++ b/R/inline.R
@@ -1,4 +1,4 @@
-## Copyright (C)       2011 Douglas Bates, Dirk Eddelbuettel and Romain Francois
+## Copyright (C)  2011 - 2017  Douglas Bates, Dirk Eddelbuettel and Romain Francois
 ##
 ## This file is part of RcppEigen.
 ##
@@ -15,10 +15,8 @@
 ## You should have received a copy of the GNU General Public License
 ## along with RcppEigen.  If not, see <http://www.gnu.org/licenses/>.
 
-inlineCxxPlugin <-
-    Rcpp:::Rcpp.plugin.maker(
-                             include.before = "#include <RcppEigen.h>", 
-                             package        = "RcppEigen"
-#                             , LinkingTo      = c("RcppEigen", "Rcpp")
-                             )
+inlineCxxPlugin <- Rcpp::Rcpp.plugin.maker(include.before = "#include <RcppEigen.h>",
+                                           package        = "RcppEigen"
+#                                          , LinkingTo      = c("RcppEigen", "Rcpp")
+                                           )
 
diff --git a/R/unit.test.R b/R/unit.test.R
deleted file mode 100644
index 9d14f3a1..00000000
--- a/R/unit.test.R
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright (C)       2011 Douglas Bates, Dirk Eddelbuettel and Romain Francois
-#
-# This file is part of RcppEigen.
-#
-# RcppEigen is free software: you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 2 of the License, or
-# (at your option) any later version.
-#
-# RcppEigen is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with RcppEigen.  If not, see <http://www.gnu.org/licenses/>.
-
-compile_unit_tests <- function( definitions, includes = "", cxxargs = "" ){
-    signatures <- lapply(definitions, "[[", 1L)
-    bodies <- lapply(definitions, "[[", 2L)
-    cxxfunction <- get( "cxxfunction", asNamespace("inline" ) )
-    fun <- cxxfunction( signatures, bodies, plugin = "RcppEigen", 
-        includes = sprintf( "using namespace std;\n%s", paste( includes, collapse = "\n") ), 
-        cxxargs = cxxargs
-    )
-    fun
-}
diff --git a/README.md b/README.md
index cb5188f4..77f83609 100644
--- a/README.md
+++ b/README.md
@@ -1,16 +1,79 @@
-## RcppEigen
+## RcppEigen: R and Eigen via Rcpp
 
-[![Build Status](https://travis-ci.org/RcppCore/RcppEigen.svg)](https://travis-ci.org/RcppCore/RcppEigen) [![License](http://img.shields.io/badge/license-GPL%20%28%3E=%202%29-brightgreen.svg?style=flat)](http://www.gnu.org/licenses/gpl-2.0.html) [![License](http://img.shields.io/badge/license-MPL2-brightgreen.svg?style=flat)](http://www.mozilla.org/MPL/2.0/) [![CRAN](http://www.r-pkg.org/badges/version/RcppEigen)](http://cran.rstudio.com/package=RcppEigen) [![Downloads](http://cranlogs.r-pkg.org/badges/RcppEigen?color=brightgreen)](http://www.r-pkg.org/pkg/RcppEigen)
+[![CI](https://github.com/RcppCore/RcppEigen/workflows/ci/badge.svg)](https://github.com/RcppCore/RcppEigen/actions?query=workflow%3Aci)
+[![License](https://img.shields.io/badge/license-GPL%20%28%3E=%202%29-brightgreen.svg?style=flat)](https://www.gnu.org/licenses/gpl-2.0.html)
+[![License](https://img.shields.io/badge/license-MPL2-brightgreen.svg?style=flat)](https://www.mozilla.org/MPL/2.0/)
+[![CRAN](https://www.r-pkg.org/badges/version/RcppEigen)](https://cran.r-project.org/package=RcppEigen)
+[![r-universe](https://rcppcore.r-universe.dev/badges/RcppEigen)](https://rcppcore.r-universe.dev/RcppEigen)
+[![Dependencies](https://tinyverse.netlify.app/badge/RcppEigen)](https://cran.r-project.org/package=RcppEigen)
+[![Coverage Status](https://codecov.io/gh/RcppCore/RcppEigen/graph/badge.svg)](https://app.codecov.io/github/RcppCore/RcppEigen?branch=master)
+[![Debian package](https://img.shields.io/debian/v/r-cran-rcppeigen/sid?color=brightgreen)](https://packages.debian.org/sid/r-cran-rcppeigen)
+[![Last Commit](https://img.shields.io/github/last-commit/RcppCore/RcppEigen)](https://github.com/RcppCore/RcppEigen)
+[![Downloads (monthly)](https://cranlogs.r-pkg.org/badges/RcppEigen?color=brightgreen)](https://www.r-pkg.org:443/pkg/RcppEigen)
+[![Downloads (total)](https://cranlogs.r-pkg.org/badges/grand-total/RcppEigen?color=brightgreen)](https://www.r-pkg.org:443/pkg/RcppEigen)
+[![CRAN use](https://jangorecki.gitlab.io/rdeps/RcppEigen/CRAN_usage.svg?sanitize=true)](https://cran.r-project.org/package=RcppEigen)
+[![BioConductor use](https://jangorecki.gitlab.io/rdeps/RcppEigen/BioC_usage.svg?sanitize=true)](https://cran.r-project.org/package=RcppEigen)
+[![JSS](https://img.shields.io/badge/JSS-10.18637%2Fjss.v052.i05-brightgreen)](https://doi.org/10.18637/jss.v052.i05)
 
-[Rcpp](http://dirk.eddelbuettel.com/code/rcpp.html) Integration for the [Eigen](http://eigen.tuxfamily.org) Templated Linear Algebra Library
 
-[Eigen](http://eigen.tuxfamily.org) is a C++ template library for linear
-algebra: matrices, vectors, numerical solvers and related algorithms.  It
-supports dense and sparse matrices on integer, floating point and complex
-numbers, decompositions of such matrices, and solutions of linear
-systems. Its performance on many algorithms is comparable with some of the
-best implementations based on `Lapack` and level-3 `BLAS`.
+### Synopsis
 
-The RcppEigen package includes the header files from the Eigen C++
-template library (currently version 3.2.7). Thus users do not need to
-install Eigen itself in order to use RcppEigen.
+[Eigen](https://libeigen.gitlab.io/) is a C++ template library for linear algebra:
+matrices, vectors, numerical solvers and related algorithms.  It supports dense and sparse
+matrices on integer, floating point and complex numbers, decompositions of such matrices,
+and solutions of linear systems. Its performance on many algorithms is comparable with
+some of the best implementations based on `Lapack` and level-3 `BLAS`.
+
+RcppEigen provides an interface from R to and from [Eigen](https://libeigen.gitlab.io/) by
+using the facilities offered by the [Rcpp](http://dirk.eddelbuettel.com/code/rcpp.html)
+package for seamless R and C++ integration.
+
+### Examples
+
+A few examples are over at the [Rcpp Gallery](https://gallery.rcpp.org/tags/eigen/). A simple one is
+
+```c++
+#include <RcppEigen.h>
+
+// [[Rcpp::depends(RcppEigen)]]
+
+using Eigen::Map;                       // 'maps' rather than copies
+using Eigen::MatrixXd;                  // variable size matrix, double precision
+using Eigen::VectorXd;                  // variable size vector, double precision
+using Eigen::SelfAdjointEigenSolver;    // one of the eigenvalue solvers
+
+// [[Rcpp::export]]
+VectorXd getEigenValues(Map<MatrixXd> M) {
+    SelfAdjointEigenSolver<MatrixXd> es(M);
+    return es.eigenvalues();
+}
+```
+
+which can be turned into a function callable from R via a simple
+
+```
+sourceCpp("eigenExample.cpp")
+```
+
+due to the two Rcpp directives to use headers from the RcppEigen package, and to export
+the `getEigenValues()` function -- but read [the full
+post](https://gallery.rcpp.org/articles/eigen-eigenvalues/) for details.
+
+
+### Status
+
+The package is mature and under acthttps://libeigen.gitlab.io/ive development, following the
+[Eigen](https://libeigen.gitlab.io/) release cycle.
+
+### Documentation
+
+The package contains a pdf vignette which is a pre-print of the [paper by
+Bates and Eddelbuettel](https://doi.org/10.18637/jss.v052.i05) in JSS (2013, v52i05).
+
+### Authors
+
+Douglas Bates, Dirk Eddelbuettel, Romain Francois, and Yixuan Qiu
+
+### License
+
+GPL (>= 2)
diff --git a/cleanup b/cleanup
index b54b4aa6..8b956a2e 100755
--- a/cleanup
+++ b/cleanup
@@ -4,5 +4,6 @@ rm -f  src/*.o src/*.so \
        inst/doc/RcppEigen-unitTests.out \
        inst/doc/RcppEigen-unitTests.aux \
        inst/doc/RcppEigen-unitTests.log \
-       */*~ *~
+       */*~ *~ \
+       config.log config.status src/Makevars
 rm -rf autom4te.cache
diff --git a/configure b/configure
new file mode 100755
index 00000000..585e5b37
--- /dev/null
+++ b/configure
@@ -0,0 +1,4559 @@
+#! /bin/sh
+# Guess values for system-dependent variables and create Makefiles.
+# Generated by GNU Autoconf 2.72 for RcppEigen 0.3.4.0.2-1.
+#
+# Report bugs to <edd@debian.org>.
+#
+#
+# Copyright (C) 1992-1996, 1998-2017, 2020-2023 Free Software Foundation,
+# Inc.
+#
+#
+# This configure script is free software; the Free Software Foundation
+# gives unlimited permission to copy, distribute and modify it.
+## -------------------- ##
+## M4sh Initialization. ##
+## -------------------- ##
+
+# Be more Bourne compatible
+DUALCASE=1; export DUALCASE # for MKS sh
+if test ${ZSH_VERSION+y} && (emulate sh) >/dev/null 2>&1
+then :
+  emulate sh
+  NULLCMD=:
+  # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+  # is contrary to our usage.  Disable this feature.
+  alias -g '${1+"$@"}'='"$@"'
+  setopt NO_GLOB_SUBST
+else case e in #(
+  e) case `(set -o) 2>/dev/null` in #(
+  *posix*) :
+    set -o posix ;; #(
+  *) :
+     ;;
+esac ;;
+esac
+fi
+
+
+
+# Reset variables that may have inherited troublesome values from
+# the environment.
+
+# IFS needs to be set, to space, tab, and newline, in precisely that order.
+# (If _AS_PATH_WALK were called with IFS unset, it would have the
+# side effect of setting IFS to empty, thus disabling word splitting.)
+# Quoting is to prevent editors from complaining about space-tab.
+as_nl='
+'
+export as_nl
+IFS=" ""	$as_nl"
+
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# Ensure predictable behavior from utilities with locale-dependent output.
+LC_ALL=C
+export LC_ALL
+LANGUAGE=C
+export LANGUAGE
+
+# We cannot yet rely on "unset" to work, but we need these variables
+# to be unset--not just set to an empty or harmless value--now, to
+# avoid bugs in old shells (e.g. pre-3.0 UWIN ksh).  This construct
+# also avoids known problems related to "unset" and subshell syntax
+# in other old shells (e.g. bash 2.01 and pdksh 5.2.14).
+for as_var in BASH_ENV ENV MAIL MAILPATH CDPATH
+do eval test \${$as_var+y} \
+  && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || :
+done
+
+# Ensure that fds 0, 1, and 2 are open.
+if (exec 3>&0) 2>/dev/null; then :; else exec 0</dev/null; fi
+if (exec 3>&1) 2>/dev/null; then :; else exec 1>/dev/null; fi
+if (exec 3>&2)            ; then :; else exec 2>/dev/null; fi
+
+# The user is always right.
+if ${PATH_SEPARATOR+false} :; then
+  PATH_SEPARATOR=:
+  (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
+    (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
+      PATH_SEPARATOR=';'
+  }
+fi
+
+
+# Find who we are.  Look in the path if we contain no directory separator.
+as_myself=
+case $0 in #((
+  *[\\/]* ) as_myself=$0 ;;
+  *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
+    test -r "$as_dir$0" && as_myself=$as_dir$0 && break
+  done
+IFS=$as_save_IFS
+
+     ;;
+esac
+# We did not find ourselves, most probably we were run as 'sh COMMAND'
+# in which case we are not to be found in the path.
+if test "x$as_myself" = x; then
+  as_myself=$0
+fi
+if test ! -f "$as_myself"; then
+  printf "%s\n" "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
+  exit 1
+fi
+
+
+# Use a proper internal environment variable to ensure we don't fall
+  # into an infinite loop, continuously re-executing ourselves.
+  if test x"${_as_can_reexec}" != xno && test "x$CONFIG_SHELL" != x; then
+    _as_can_reexec=no; export _as_can_reexec;
+    # We cannot yet assume a decent shell, so we have to provide a
+# neutralization value for shells without unset; and this also
+# works around shells that cannot unset nonexistent variables.
+# Preserve -v and -x to the replacement shell.
+BASH_ENV=/dev/null
+ENV=/dev/null
+(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV
+case $- in # ((((
+  *v*x* | *x*v* ) as_opts=-vx ;;
+  *v* ) as_opts=-v ;;
+  *x* ) as_opts=-x ;;
+  * ) as_opts= ;;
+esac
+exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"}
+# Admittedly, this is quite paranoid, since all the known shells bail
+# out after a failed 'exec'.
+printf "%s\n" "$0: could not re-execute with $CONFIG_SHELL" >&2
+exit 255
+  fi
+  # We don't want this to propagate to other subprocesses.
+          { _as_can_reexec=; unset _as_can_reexec;}
+if test "x$CONFIG_SHELL" = x; then
+  as_bourne_compatible="if test \${ZSH_VERSION+y} && (emulate sh) >/dev/null 2>&1
+then :
+  emulate sh
+  NULLCMD=:
+  # Pre-4.2 versions of Zsh do word splitting on \${1+\"\$@\"}, which
+  # is contrary to our usage.  Disable this feature.
+  alias -g '\${1+\"\$@\"}'='\"\$@\"'
+  setopt NO_GLOB_SUBST
+else case e in #(
+  e) case \`(set -o) 2>/dev/null\` in #(
+  *posix*) :
+    set -o posix ;; #(
+  *) :
+     ;;
+esac ;;
+esac
+fi
+"
+  as_required="as_fn_return () { (exit \$1); }
+as_fn_success () { as_fn_return 0; }
+as_fn_failure () { as_fn_return 1; }
+as_fn_ret_success () { return 0; }
+as_fn_ret_failure () { return 1; }
+
+exitcode=0
+as_fn_success || { exitcode=1; echo as_fn_success failed.; }
+as_fn_failure && { exitcode=1; echo as_fn_failure succeeded.; }
+as_fn_ret_success || { exitcode=1; echo as_fn_ret_success failed.; }
+as_fn_ret_failure && { exitcode=1; echo as_fn_ret_failure succeeded.; }
+if ( set x; as_fn_ret_success y && test x = \"\$1\" )
+then :
+
+else case e in #(
+  e) exitcode=1; echo positional parameters were not saved. ;;
+esac
+fi
+test x\$exitcode = x0 || exit 1
+blah=\$(echo \$(echo blah))
+test x\"\$blah\" = xblah || exit 1
+test -x / || exit 1"
+  as_suggested="  as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO
+  as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO
+  eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" &&
+  test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1"
+  if (eval "$as_required") 2>/dev/null
+then :
+  as_have_required=yes
+else case e in #(
+  e) as_have_required=no ;;
+esac
+fi
+  if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null
+then :
+
+else case e in #(
+  e) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+as_found=false
+for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH
+do
+  IFS=$as_save_IFS
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
+  as_found=:
+  case $as_dir in #(
+	 /*)
+	   for as_base in sh bash ksh sh5; do
+	     # Try only shells that exist, to save several forks.
+	     as_shell=$as_dir$as_base
+	     if { test -f "$as_shell" || test -f "$as_shell.exe"; } &&
+		    as_run=a "$as_shell" -c "$as_bourne_compatible""$as_required" 2>/dev/null
+then :
+  CONFIG_SHELL=$as_shell as_have_required=yes
+		   if as_run=a "$as_shell" -c "$as_bourne_compatible""$as_suggested" 2>/dev/null
+then :
+  break 2
+fi
+fi
+	   done;;
+       esac
+  as_found=false
+done
+IFS=$as_save_IFS
+if $as_found
+then :
+
+else case e in #(
+  e) if { test -f "$SHELL" || test -f "$SHELL.exe"; } &&
+	      as_run=a "$SHELL" -c "$as_bourne_compatible""$as_required" 2>/dev/null
+then :
+  CONFIG_SHELL=$SHELL as_have_required=yes
+fi ;;
+esac
+fi
+
+
+      if test "x$CONFIG_SHELL" != x
+then :
+  export CONFIG_SHELL
+             # We cannot yet assume a decent shell, so we have to provide a
+# neutralization value for shells without unset; and this also
+# works around shells that cannot unset nonexistent variables.
+# Preserve -v and -x to the replacement shell.
+BASH_ENV=/dev/null
+ENV=/dev/null
+(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV
+case $- in # ((((
+  *v*x* | *x*v* ) as_opts=-vx ;;
+  *v* ) as_opts=-v ;;
+  *x* ) as_opts=-x ;;
+  * ) as_opts= ;;
+esac
+exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"}
+# Admittedly, this is quite paranoid, since all the known shells bail
+# out after a failed 'exec'.
+printf "%s\n" "$0: could not re-execute with $CONFIG_SHELL" >&2
+exit 255
+fi
+
+    if test x$as_have_required = xno
+then :
+  printf "%s\n" "$0: This script requires a shell more modern than all"
+  printf "%s\n" "$0: the shells that I found on your system."
+  if test ${ZSH_VERSION+y} ; then
+    printf "%s\n" "$0: In particular, zsh $ZSH_VERSION has bugs and should"
+    printf "%s\n" "$0: be upgraded to zsh 4.3.4 or later."
+  else
+    printf "%s\n" "$0: Please tell bug-autoconf@gnu.org and edd@debian.org
+$0: about your system, including any error possibly output
+$0: before this message. Then install a modern shell, or
+$0: manually run the script under such a shell if you do
+$0: have one."
+  fi
+  exit 1
+fi ;;
+esac
+fi
+fi
+SHELL=${CONFIG_SHELL-/bin/sh}
+export SHELL
+# Unset more variables known to interfere with behavior of common tools.
+CLICOLOR_FORCE= GREP_OPTIONS=
+unset CLICOLOR_FORCE GREP_OPTIONS
+
+## --------------------- ##
+## M4sh Shell Functions. ##
+## --------------------- ##
+# as_fn_unset VAR
+# ---------------
+# Portably unset VAR.
+as_fn_unset ()
+{
+  { eval $1=; unset $1;}
+}
+as_unset=as_fn_unset
+
+
+# as_fn_set_status STATUS
+# -----------------------
+# Set $? to STATUS, without forking.
+as_fn_set_status ()
+{
+  return $1
+} # as_fn_set_status
+
+# as_fn_exit STATUS
+# -----------------
+# Exit the shell with STATUS, even in a "trap 0" or "set -e" context.
+as_fn_exit ()
+{
+  set +e
+  as_fn_set_status $1
+  exit $1
+} # as_fn_exit
+
+# as_fn_mkdir_p
+# -------------
+# Create "$as_dir" as a directory, including parents if necessary.
+as_fn_mkdir_p ()
+{
+
+  case $as_dir in #(
+  -*) as_dir=./$as_dir;;
+  esac
+  test -d "$as_dir" || eval $as_mkdir_p || {
+    as_dirs=
+    while :; do
+      case $as_dir in #(
+      *\'*) as_qdir=`printf "%s\n" "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
+      *) as_qdir=$as_dir;;
+      esac
+      as_dirs="'$as_qdir' $as_dirs"
+      as_dir=`$as_dirname -- "$as_dir" ||
+$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$as_dir" : 'X\(//\)[^/]' \| \
+	 X"$as_dir" : 'X\(//\)$' \| \
+	 X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
+printf "%s\n" X"$as_dir" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+      test -d "$as_dir" && break
+    done
+    test -z "$as_dirs" || eval "mkdir $as_dirs"
+  } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir"
+
+
+} # as_fn_mkdir_p
+
+# as_fn_executable_p FILE
+# -----------------------
+# Test if FILE is an executable regular file.
+as_fn_executable_p ()
+{
+  test -f "$1" && test -x "$1"
+} # as_fn_executable_p
+# as_fn_append VAR VALUE
+# ----------------------
+# Append the text in VALUE to the end of the definition contained in VAR. Take
+# advantage of any shell optimizations that allow amortized linear growth over
+# repeated appends, instead of the typical quadratic growth present in naive
+# implementations.
+if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null
+then :
+  eval 'as_fn_append ()
+  {
+    eval $1+=\$2
+  }'
+else case e in #(
+  e) as_fn_append ()
+  {
+    eval $1=\$$1\$2
+  } ;;
+esac
+fi # as_fn_append
+
+# as_fn_arith ARG...
+# ------------------
+# Perform arithmetic evaluation on the ARGs, and store the result in the
+# global $as_val. Take advantage of shells that can avoid forks. The arguments
+# must be portable across $(()) and expr.
+if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null
+then :
+  eval 'as_fn_arith ()
+  {
+    as_val=$(( $* ))
+  }'
+else case e in #(
+  e) as_fn_arith ()
+  {
+    as_val=`expr "$@" || test $? -eq 1`
+  } ;;
+esac
+fi # as_fn_arith
+
+
+# as_fn_error STATUS ERROR [LINENO LOG_FD]
+# ----------------------------------------
+# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are
+# provided, also output the error to LOG_FD, referencing LINENO. Then exit the
+# script with STATUS, using 1 if that was 0.
+as_fn_error ()
+{
+  as_status=$1; test $as_status -eq 0 && as_status=1
+  if test "$4"; then
+    as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: $2" >&$4
+  fi
+  printf "%s\n" "$as_me: error: $2" >&2
+  as_fn_exit $as_status
+} # as_fn_error
+
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+   test "X`expr 00001 : '.*\(...\)'`" = X001; then
+  as_expr=expr
+else
+  as_expr=false
+fi
+
+if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then
+  as_basename=basename
+else
+  as_basename=false
+fi
+
+if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
+  as_dirname=dirname
+else
+  as_dirname=false
+fi
+
+as_me=`$as_basename -- "$0" ||
+$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
+	 X"$0" : 'X\(//\)$' \| \
+	 X"$0" : 'X\(/\)' \| . 2>/dev/null ||
+printf "%s\n" X/"$0" |
+    sed '/^.*\/\([^/][^/]*\)\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\/\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\/\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
+
+  as_lineno_1=$LINENO as_lineno_1a=$LINENO
+  as_lineno_2=$LINENO as_lineno_2a=$LINENO
+  eval 'test "x$as_lineno_1'$as_run'" != "x$as_lineno_2'$as_run'" &&
+  test "x`expr $as_lineno_1'$as_run' + 1`" = "x$as_lineno_2'$as_run'"' || {
+  # Blame Lee E. McMahon (1931-1989) for sed's syntax.  :-)
+  sed -n '
+    p
+    /[$]LINENO/=
+  ' <$as_myself |
+    sed '
+      t clear
+      :clear
+      s/[$]LINENO.*/&-/
+      t lineno
+      b
+      :lineno
+      N
+      :loop
+      s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/
+      t loop
+      s/-\n.*//
+    ' >$as_me.lineno &&
+  chmod +x "$as_me.lineno" ||
+    { printf "%s\n" "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; }
+
+  # If we had to re-execute with $CONFIG_SHELL, we're ensured to have
+  # already done that, so ensure we don't try to do so again and fall
+  # in an infinite loop.  This has already happened in practice.
+  _as_can_reexec=no; export _as_can_reexec
+  # Don't try to exec as it changes $[0], causing all sort of problems
+  # (the dirname of $[0] is not the place where we might find the
+  # original and so on.  Autoconf is especially sensitive to this).
+  . "./$as_me.lineno"
+  # Exit status is that of the last command.
+  exit
+}
+
+
+# Determine whether it's possible to make 'echo' print without a newline.
+# These variables are no longer used directly by Autoconf, but are AC_SUBSTed
+# for compatibility with existing Makefiles.
+ECHO_C= ECHO_N= ECHO_T=
+case `echo -n x` in #(((((
+-n*)
+  case `echo 'xy\c'` in
+  *c*) ECHO_T='	';;	# ECHO_T is single tab character.
+  xy)  ECHO_C='\c';;
+  *)   echo `echo ksh88 bug on AIX 6.1` > /dev/null
+       ECHO_T='	';;
+  esac;;
+*)
+  ECHO_N='-n';;
+esac
+
+# For backward compatibility with old third-party macros, we provide
+# the shell variables $as_echo and $as_echo_n.  New code should use
+# AS_ECHO(["message"]) and AS_ECHO_N(["message"]), respectively.
+as_echo='printf %s\n'
+as_echo_n='printf %s'
+
+rm -f conf$$ conf$$.exe conf$$.file
+if test -d conf$$.dir; then
+  rm -f conf$$.dir/conf$$.file
+else
+  rm -f conf$$.dir
+  mkdir conf$$.dir 2>/dev/null
+fi
+if (echo >conf$$.file) 2>/dev/null; then
+  if ln -s conf$$.file conf$$ 2>/dev/null; then
+    as_ln_s='ln -s'
+    # ... but there are two gotchas:
+    # 1) On MSYS, both 'ln -s file dir' and 'ln file dir' fail.
+    # 2) DJGPP < 2.04 has no symlinks; 'ln -s' creates a wrapper executable.
+    # In both cases, we have to default to 'cp -pR'.
+    ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
+      as_ln_s='cp -pR'
+  elif ln conf$$.file conf$$ 2>/dev/null; then
+    as_ln_s=ln
+  else
+    as_ln_s='cp -pR'
+  fi
+else
+  as_ln_s='cp -pR'
+fi
+rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
+rmdir conf$$.dir 2>/dev/null
+
+if mkdir -p . 2>/dev/null; then
+  as_mkdir_p='mkdir -p "$as_dir"'
+else
+  test -d ./-p && rmdir ./-p
+  as_mkdir_p=false
+fi
+
+as_test_x='test -x'
+as_executable_p=as_fn_executable_p
+
+# Sed expression to map a string onto a valid CPP name.
+as_sed_cpp="y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g"
+as_tr_cpp="eval sed '$as_sed_cpp'" # deprecated
+
+# Sed expression to map a string onto a valid variable name.
+as_sed_sh="y%*+%pp%;s%[^_$as_cr_alnum]%_%g"
+as_tr_sh="eval sed '$as_sed_sh'" # deprecated
+
+
+test -n "$DJDIR" || exec 7<&0 </dev/null
+exec 6>&1
+
+# Name of the host.
+# hostname on some systems (SVR3.2, old GNU/Linux) returns a bogus exit status,
+# so uname gets run too.
+ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q`
+
+#
+# Initializations.
+#
+ac_default_prefix=/usr/local
+ac_clean_files=
+ac_config_libobj_dir=.
+LIBOBJS=
+cross_compiling=no
+subdirs=
+MFLAGS=
+MAKEFLAGS=
+
+# Identity of this package.
+PACKAGE_NAME='RcppEigen'
+PACKAGE_TARNAME='rcppeigen'
+PACKAGE_VERSION='0.3.4.0.2-1'
+PACKAGE_STRING='RcppEigen 0.3.4.0.2-1'
+PACKAGE_BUGREPORT='edd@debian.org'
+PACKAGE_URL=''
+
+ac_subst_vars='LTLIBOBJS
+LIBOBJS
+PKG_LIBS
+PKG_CXXFLAGS
+OPENMP_FLAG
+CXXCPP
+OBJEXT
+EXEEXT
+ac_ct_CXX
+CPPFLAGS
+LDFLAGS
+CXXFLAGS
+CXX
+target_alias
+host_alias
+build_alias
+LIBS
+ECHO_T
+ECHO_N
+ECHO_C
+DEFS
+mandir
+localedir
+libdir
+psdir
+pdfdir
+dvidir
+htmldir
+infodir
+docdir
+oldincludedir
+includedir
+runstatedir
+localstatedir
+sharedstatedir
+sysconfdir
+datadir
+datarootdir
+libexecdir
+sbindir
+bindir
+program_transform_name
+prefix
+exec_prefix
+PACKAGE_URL
+PACKAGE_BUGREPORT
+PACKAGE_STRING
+PACKAGE_VERSION
+PACKAGE_TARNAME
+PACKAGE_NAME
+PATH_SEPARATOR
+SHELL'
+ac_subst_files=''
+ac_user_opts='
+enable_option_checking
+'
+      ac_precious_vars='build_alias
+host_alias
+target_alias
+CXX
+CXXFLAGS
+LDFLAGS
+LIBS
+CPPFLAGS
+CCC
+CXXCPP'
+
+
+# Initialize some variables set by options.
+ac_init_help=
+ac_init_version=false
+ac_unrecognized_opts=
+ac_unrecognized_sep=
+# The variables have the same names as the options, with
+# dashes changed to underlines.
+cache_file=/dev/null
+exec_prefix=NONE
+no_create=
+no_recursion=
+prefix=NONE
+program_prefix=NONE
+program_suffix=NONE
+program_transform_name=s,x,x,
+silent=
+site=
+srcdir=
+verbose=
+x_includes=NONE
+x_libraries=NONE
+
+# Installation directory options.
+# These are left unexpanded so users can "make install exec_prefix=/foo"
+# and all the variables that are supposed to be based on exec_prefix
+# by default will actually change.
+# Use braces instead of parens because sh, perl, etc. also accept them.
+# (The list follows the same order as the GNU Coding Standards.)
+bindir='${exec_prefix}/bin'
+sbindir='${exec_prefix}/sbin'
+libexecdir='${exec_prefix}/libexec'
+datarootdir='${prefix}/share'
+datadir='${datarootdir}'
+sysconfdir='${prefix}/etc'
+sharedstatedir='${prefix}/com'
+localstatedir='${prefix}/var'
+runstatedir='${localstatedir}/run'
+includedir='${prefix}/include'
+oldincludedir='/usr/include'
+docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
+infodir='${datarootdir}/info'
+htmldir='${docdir}'
+dvidir='${docdir}'
+pdfdir='${docdir}'
+psdir='${docdir}'
+libdir='${exec_prefix}/lib'
+localedir='${datarootdir}/locale'
+mandir='${datarootdir}/man'
+
+ac_prev=
+ac_dashdash=
+for ac_option
+do
+  # If the previous option needs an argument, assign it.
+  if test -n "$ac_prev"; then
+    eval $ac_prev=\$ac_option
+    ac_prev=
+    continue
+  fi
+
+  case $ac_option in
+  *=?*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;;
+  *=)   ac_optarg= ;;
+  *)    ac_optarg=yes ;;
+  esac
+
+  case $ac_dashdash$ac_option in
+  --)
+    ac_dashdash=yes ;;
+
+  -bindir | --bindir | --bindi | --bind | --bin | --bi)
+    ac_prev=bindir ;;
+  -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*)
+    bindir=$ac_optarg ;;
+
+  -build | --build | --buil | --bui | --bu)
+    ac_prev=build_alias ;;
+  -build=* | --build=* | --buil=* | --bui=* | --bu=*)
+    build_alias=$ac_optarg ;;
+
+  -cache-file | --cache-file | --cache-fil | --cache-fi \
+  | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c)
+    ac_prev=cache_file ;;
+  -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \
+  | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*)
+    cache_file=$ac_optarg ;;
+
+  --config-cache | -C)
+    cache_file=config.cache ;;
+
+  -datadir | --datadir | --datadi | --datad)
+    ac_prev=datadir ;;
+  -datadir=* | --datadir=* | --datadi=* | --datad=*)
+    datadir=$ac_optarg ;;
+
+  -datarootdir | --datarootdir | --datarootdi | --datarootd | --dataroot \
+  | --dataroo | --dataro | --datar)
+    ac_prev=datarootdir ;;
+  -datarootdir=* | --datarootdir=* | --datarootdi=* | --datarootd=* \
+  | --dataroot=* | --dataroo=* | --dataro=* | --datar=*)
+    datarootdir=$ac_optarg ;;
+
+  -disable-* | --disable-*)
+    ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'`
+    # Reject names that are not valid shell variable names.
+    expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+      as_fn_error $? "invalid feature name: '$ac_useropt'"
+    ac_useropt_orig=$ac_useropt
+    ac_useropt=`printf "%s\n" "$ac_useropt" | sed 's/[-+.]/_/g'`
+    case $ac_user_opts in
+      *"
+"enable_$ac_useropt"
+"*) ;;
+      *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--disable-$ac_useropt_orig"
+	 ac_unrecognized_sep=', ';;
+    esac
+    eval enable_$ac_useropt=no ;;
+
+  -docdir | --docdir | --docdi | --doc | --do)
+    ac_prev=docdir ;;
+  -docdir=* | --docdir=* | --docdi=* | --doc=* | --do=*)
+    docdir=$ac_optarg ;;
+
+  -dvidir | --dvidir | --dvidi | --dvid | --dvi | --dv)
+    ac_prev=dvidir ;;
+  -dvidir=* | --dvidir=* | --dvidi=* | --dvid=* | --dvi=* | --dv=*)
+    dvidir=$ac_optarg ;;
+
+  -enable-* | --enable-*)
+    ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'`
+    # Reject names that are not valid shell variable names.
+    expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+      as_fn_error $? "invalid feature name: '$ac_useropt'"
+    ac_useropt_orig=$ac_useropt
+    ac_useropt=`printf "%s\n" "$ac_useropt" | sed 's/[-+.]/_/g'`
+    case $ac_user_opts in
+      *"
+"enable_$ac_useropt"
+"*) ;;
+      *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig"
+	 ac_unrecognized_sep=', ';;
+    esac
+    eval enable_$ac_useropt=\$ac_optarg ;;
+
+  -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \
+  | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \
+  | --exec | --exe | --ex)
+    ac_prev=exec_prefix ;;
+  -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \
+  | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \
+  | --exec=* | --exe=* | --ex=*)
+    exec_prefix=$ac_optarg ;;
+
+  -gas | --gas | --ga | --g)
+    # Obsolete; use --with-gas.
+    with_gas=yes ;;
+
+  -help | --help | --hel | --he | -h)
+    ac_init_help=long ;;
+  -help=r* | --help=r* | --hel=r* | --he=r* | -hr*)
+    ac_init_help=recursive ;;
+  -help=s* | --help=s* | --hel=s* | --he=s* | -hs*)
+    ac_init_help=short ;;
+
+  -host | --host | --hos | --ho)
+    ac_prev=host_alias ;;
+  -host=* | --host=* | --hos=* | --ho=*)
+    host_alias=$ac_optarg ;;
+
+  -htmldir | --htmldir | --htmldi | --htmld | --html | --htm | --ht)
+    ac_prev=htmldir ;;
+  -htmldir=* | --htmldir=* | --htmldi=* | --htmld=* | --html=* | --htm=* \
+  | --ht=*)
+    htmldir=$ac_optarg ;;
+
+  -includedir | --includedir | --includedi | --included | --include \
+  | --includ | --inclu | --incl | --inc)
+    ac_prev=includedir ;;
+  -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \
+  | --includ=* | --inclu=* | --incl=* | --inc=*)
+    includedir=$ac_optarg ;;
+
+  -infodir | --infodir | --infodi | --infod | --info | --inf)
+    ac_prev=infodir ;;
+  -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*)
+    infodir=$ac_optarg ;;
+
+  -libdir | --libdir | --libdi | --libd)
+    ac_prev=libdir ;;
+  -libdir=* | --libdir=* | --libdi=* | --libd=*)
+    libdir=$ac_optarg ;;
+
+  -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \
+  | --libexe | --libex | --libe)
+    ac_prev=libexecdir ;;
+  -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \
+  | --libexe=* | --libex=* | --libe=*)
+    libexecdir=$ac_optarg ;;
+
+  -localedir | --localedir | --localedi | --localed | --locale)
+    ac_prev=localedir ;;
+  -localedir=* | --localedir=* | --localedi=* | --localed=* | --locale=*)
+    localedir=$ac_optarg ;;
+
+  -localstatedir | --localstatedir | --localstatedi | --localstated \
+  | --localstate | --localstat | --localsta | --localst | --locals)
+    ac_prev=localstatedir ;;
+  -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \
+  | --localstate=* | --localstat=* | --localsta=* | --localst=* | --locals=*)
+    localstatedir=$ac_optarg ;;
+
+  -mandir | --mandir | --mandi | --mand | --man | --ma | --m)
+    ac_prev=mandir ;;
+  -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*)
+    mandir=$ac_optarg ;;
+
+  -nfp | --nfp | --nf)
+    # Obsolete; use --without-fp.
+    with_fp=no ;;
+
+  -no-create | --no-create | --no-creat | --no-crea | --no-cre \
+  | --no-cr | --no-c | -n)
+    no_create=yes ;;
+
+  -no-recursion | --no-recursion | --no-recursio | --no-recursi \
+  | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r)
+    no_recursion=yes ;;
+
+  -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \
+  | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \
+  | --oldin | --oldi | --old | --ol | --o)
+    ac_prev=oldincludedir ;;
+  -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \
+  | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \
+  | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*)
+    oldincludedir=$ac_optarg ;;
+
+  -prefix | --prefix | --prefi | --pref | --pre | --pr | --p)
+    ac_prev=prefix ;;
+  -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*)
+    prefix=$ac_optarg ;;
+
+  -program-prefix | --program-prefix | --program-prefi | --program-pref \
+  | --program-pre | --program-pr | --program-p)
+    ac_prev=program_prefix ;;
+  -program-prefix=* | --program-prefix=* | --program-prefi=* \
+  | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*)
+    program_prefix=$ac_optarg ;;
+
+  -program-suffix | --program-suffix | --program-suffi | --program-suff \
+  | --program-suf | --program-su | --program-s)
+    ac_prev=program_suffix ;;
+  -program-suffix=* | --program-suffix=* | --program-suffi=* \
+  | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*)
+    program_suffix=$ac_optarg ;;
+
+  -program-transform-name | --program-transform-name \
+  | --program-transform-nam | --program-transform-na \
+  | --program-transform-n | --program-transform- \
+  | --program-transform | --program-transfor \
+  | --program-transfo | --program-transf \
+  | --program-trans | --program-tran \
+  | --progr-tra | --program-tr | --program-t)
+    ac_prev=program_transform_name ;;
+  -program-transform-name=* | --program-transform-name=* \
+  | --program-transform-nam=* | --program-transform-na=* \
+  | --program-transform-n=* | --program-transform-=* \
+  | --program-transform=* | --program-transfor=* \
+  | --program-transfo=* | --program-transf=* \
+  | --program-trans=* | --program-tran=* \
+  | --progr-tra=* | --program-tr=* | --program-t=*)
+    program_transform_name=$ac_optarg ;;
+
+  -pdfdir | --pdfdir | --pdfdi | --pdfd | --pdf | --pd)
+    ac_prev=pdfdir ;;
+  -pdfdir=* | --pdfdir=* | --pdfdi=* | --pdfd=* | --pdf=* | --pd=*)
+    pdfdir=$ac_optarg ;;
+
+  -psdir | --psdir | --psdi | --psd | --ps)
+    ac_prev=psdir ;;
+  -psdir=* | --psdir=* | --psdi=* | --psd=* | --ps=*)
+    psdir=$ac_optarg ;;
+
+  -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+  | -silent | --silent | --silen | --sile | --sil)
+    silent=yes ;;
+
+  -runstatedir | --runstatedir | --runstatedi | --runstated \
+  | --runstate | --runstat | --runsta | --runst | --runs \
+  | --run | --ru | --r)
+    ac_prev=runstatedir ;;
+  -runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \
+  | --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \
+  | --run=* | --ru=* | --r=*)
+    runstatedir=$ac_optarg ;;
+
+  -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
+    ac_prev=sbindir ;;
+  -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
+  | --sbi=* | --sb=*)
+    sbindir=$ac_optarg ;;
+
+  -sharedstatedir | --sharedstatedir | --sharedstatedi \
+  | --sharedstated | --sharedstate | --sharedstat | --sharedsta \
+  | --sharedst | --shareds | --shared | --share | --shar \
+  | --sha | --sh)
+    ac_prev=sharedstatedir ;;
+  -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \
+  | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \
+  | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \
+  | --sha=* | --sh=*)
+    sharedstatedir=$ac_optarg ;;
+
+  -site | --site | --sit)
+    ac_prev=site ;;
+  -site=* | --site=* | --sit=*)
+    site=$ac_optarg ;;
+
+  -srcdir | --srcdir | --srcdi | --srcd | --src | --sr)
+    ac_prev=srcdir ;;
+  -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*)
+    srcdir=$ac_optarg ;;
+
+  -sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \
+  | --syscon | --sysco | --sysc | --sys | --sy)
+    ac_prev=sysconfdir ;;
+  -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \
+  | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*)
+    sysconfdir=$ac_optarg ;;
+
+  -target | --target | --targe | --targ | --tar | --ta | --t)
+    ac_prev=target_alias ;;
+  -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*)
+    target_alias=$ac_optarg ;;
+
+  -v | -verbose | --verbose | --verbos | --verbo | --verb)
+    verbose=yes ;;
+
+  -version | --version | --versio | --versi | --vers | -V)
+    ac_init_version=: ;;
+
+  -with-* | --with-*)
+    ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'`
+    # Reject names that are not valid shell variable names.
+    expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+      as_fn_error $? "invalid package name: '$ac_useropt'"
+    ac_useropt_orig=$ac_useropt
+    ac_useropt=`printf "%s\n" "$ac_useropt" | sed 's/[-+.]/_/g'`
+    case $ac_user_opts in
+      *"
+"with_$ac_useropt"
+"*) ;;
+      *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig"
+	 ac_unrecognized_sep=', ';;
+    esac
+    eval with_$ac_useropt=\$ac_optarg ;;
+
+  -without-* | --without-*)
+    ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'`
+    # Reject names that are not valid shell variable names.
+    expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null &&
+      as_fn_error $? "invalid package name: '$ac_useropt'"
+    ac_useropt_orig=$ac_useropt
+    ac_useropt=`printf "%s\n" "$ac_useropt" | sed 's/[-+.]/_/g'`
+    case $ac_user_opts in
+      *"
+"with_$ac_useropt"
+"*) ;;
+      *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--without-$ac_useropt_orig"
+	 ac_unrecognized_sep=', ';;
+    esac
+    eval with_$ac_useropt=no ;;
+
+  --x)
+    # Obsolete; use --with-x.
+    with_x=yes ;;
+
+  -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \
+  | --x-incl | --x-inc | --x-in | --x-i)
+    ac_prev=x_includes ;;
+  -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \
+  | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*)
+    x_includes=$ac_optarg ;;
+
+  -x-libraries | --x-libraries | --x-librarie | --x-librari \
+  | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l)
+    ac_prev=x_libraries ;;
+  -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \
+  | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*)
+    x_libraries=$ac_optarg ;;
+
+  -*) as_fn_error $? "unrecognized option: '$ac_option'
+Try '$0 --help' for more information"
+    ;;
+
+  *=*)
+    ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='`
+    # Reject names that are not valid shell variable names.
+    case $ac_envvar in #(
+      '' | [0-9]* | *[!_$as_cr_alnum]* )
+      as_fn_error $? "invalid variable name: '$ac_envvar'" ;;
+    esac
+    eval $ac_envvar=\$ac_optarg
+    export $ac_envvar ;;
+
+  *)
+    # FIXME: should be removed in autoconf 3.0.
+    printf "%s\n" "$as_me: WARNING: you should use --build, --host, --target" >&2
+    expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null &&
+      printf "%s\n" "$as_me: WARNING: invalid host type: $ac_option" >&2
+    : "${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}"
+    ;;
+
+  esac
+done
+
+if test -n "$ac_prev"; then
+  ac_option=--`echo $ac_prev | sed 's/_/-/g'`
+  as_fn_error $? "missing argument to $ac_option"
+fi
+
+if test -n "$ac_unrecognized_opts"; then
+  case $enable_option_checking in
+    no) ;;
+    fatal) as_fn_error $? "unrecognized options: $ac_unrecognized_opts" ;;
+    *)     printf "%s\n" "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;;
+  esac
+fi
+
+# Check all directory arguments for consistency.
+for ac_var in	exec_prefix prefix bindir sbindir libexecdir datarootdir \
+		datadir sysconfdir sharedstatedir localstatedir includedir \
+		oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
+		libdir localedir mandir runstatedir
+do
+  eval ac_val=\$$ac_var
+  # Remove trailing slashes.
+  case $ac_val in
+    */ )
+      ac_val=`expr "X$ac_val" : 'X\(.*[^/]\)' \| "X$ac_val" : 'X\(.*\)'`
+      eval $ac_var=\$ac_val;;
+  esac
+  # Be sure to have absolute directory names.
+  case $ac_val in
+    [\\/$]* | ?:[\\/]* )  continue;;
+    NONE | '' ) case $ac_var in *prefix ) continue;; esac;;
+  esac
+  as_fn_error $? "expected an absolute directory name for --$ac_var: $ac_val"
+done
+
+# There might be people who depend on the old broken behavior: '$host'
+# used to hold the argument of --host etc.
+# FIXME: To remove some day.
+build=$build_alias
+host=$host_alias
+target=$target_alias
+
+# FIXME: To remove some day.
+if test "x$host_alias" != x; then
+  if test "x$build_alias" = x; then
+    cross_compiling=maybe
+  elif test "x$build_alias" != "x$host_alias"; then
+    cross_compiling=yes
+  fi
+fi
+
+ac_tool_prefix=
+test -n "$host_alias" && ac_tool_prefix=$host_alias-
+
+test "$silent" = yes && exec 6>/dev/null
+
+
+ac_pwd=`pwd` && test -n "$ac_pwd" &&
+ac_ls_di=`ls -di .` &&
+ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` ||
+  as_fn_error $? "working directory cannot be determined"
+test "X$ac_ls_di" = "X$ac_pwd_ls_di" ||
+  as_fn_error $? "pwd does not report name of working directory"
+
+
+# Find the source files, if location was not specified.
+if test -z "$srcdir"; then
+  ac_srcdir_defaulted=yes
+  # Try the directory containing this script, then the parent directory.
+  ac_confdir=`$as_dirname -- "$as_myself" ||
+$as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$as_myself" : 'X\(//\)[^/]' \| \
+	 X"$as_myself" : 'X\(//\)$' \| \
+	 X"$as_myself" : 'X\(/\)' \| . 2>/dev/null ||
+printf "%s\n" X"$as_myself" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+  srcdir=$ac_confdir
+  if test ! -r "$srcdir/$ac_unique_file"; then
+    srcdir=..
+  fi
+else
+  ac_srcdir_defaulted=no
+fi
+if test ! -r "$srcdir/$ac_unique_file"; then
+  test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .."
+  as_fn_error $? "cannot find sources ($ac_unique_file) in $srcdir"
+fi
+ac_msg="sources are in $srcdir, but 'cd $srcdir' does not work"
+ac_abs_confdir=`(
+	cd "$srcdir" && test -r "./$ac_unique_file" || as_fn_error $? "$ac_msg"
+	pwd)`
+# When building in place, set srcdir=.
+if test "$ac_abs_confdir" = "$ac_pwd"; then
+  srcdir=.
+fi
+# Remove unnecessary trailing slashes from srcdir.
+# Double slashes in file names in object file debugging info
+# mess up M-x gdb in Emacs.
+case $srcdir in
+*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;;
+esac
+for ac_var in $ac_precious_vars; do
+  eval ac_env_${ac_var}_set=\${${ac_var}+set}
+  eval ac_env_${ac_var}_value=\$${ac_var}
+  eval ac_cv_env_${ac_var}_set=\${${ac_var}+set}
+  eval ac_cv_env_${ac_var}_value=\$${ac_var}
+done
+
+#
+# Report the --help message.
+#
+if test "$ac_init_help" = "long"; then
+  # Omit some internal or obsolete options to make the list less imposing.
+  # This message is too long to be a string in the A/UX 3.1 sh.
+  cat <<_ACEOF
+'configure' configures RcppEigen 0.3.4.0.2-1 to adapt to many kinds of systems.
+
+Usage: $0 [OPTION]... [VAR=VALUE]...
+
+To assign environment variables (e.g., CC, CFLAGS...), specify them as
+VAR=VALUE.  See below for descriptions of some of the useful variables.
+
+Defaults for the options are specified in brackets.
+
+Configuration:
+  -h, --help              display this help and exit
+      --help=short        display options specific to this package
+      --help=recursive    display the short help of all the included packages
+  -V, --version           display version information and exit
+  -q, --quiet, --silent   do not print 'checking ...' messages
+      --cache-file=FILE   cache test results in FILE [disabled]
+  -C, --config-cache      alias for '--cache-file=config.cache'
+  -n, --no-create         do not create output files
+      --srcdir=DIR        find the sources in DIR [configure dir or '..']
+
+Installation directories:
+  --prefix=PREFIX         install architecture-independent files in PREFIX
+                          [$ac_default_prefix]
+  --exec-prefix=EPREFIX   install architecture-dependent files in EPREFIX
+                          [PREFIX]
+
+By default, 'make install' will install all the files in
+'$ac_default_prefix/bin', '$ac_default_prefix/lib' etc.  You can specify
+an installation prefix other than '$ac_default_prefix' using '--prefix',
+for instance '--prefix=\$HOME'.
+
+For better control, use the options below.
+
+Fine tuning of the installation directories:
+  --bindir=DIR            user executables [EPREFIX/bin]
+  --sbindir=DIR           system admin executables [EPREFIX/sbin]
+  --libexecdir=DIR        program executables [EPREFIX/libexec]
+  --sysconfdir=DIR        read-only single-machine data [PREFIX/etc]
+  --sharedstatedir=DIR    modifiable architecture-independent data [PREFIX/com]
+  --localstatedir=DIR     modifiable single-machine data [PREFIX/var]
+  --runstatedir=DIR       modifiable per-process data [LOCALSTATEDIR/run]
+  --libdir=DIR            object code libraries [EPREFIX/lib]
+  --includedir=DIR        C header files [PREFIX/include]
+  --oldincludedir=DIR     C header files for non-gcc [/usr/include]
+  --datarootdir=DIR       read-only arch.-independent data root [PREFIX/share]
+  --datadir=DIR           read-only architecture-independent data [DATAROOTDIR]
+  --infodir=DIR           info documentation [DATAROOTDIR/info]
+  --localedir=DIR         locale-dependent data [DATAROOTDIR/locale]
+  --mandir=DIR            man documentation [DATAROOTDIR/man]
+  --docdir=DIR            documentation root [DATAROOTDIR/doc/rcppeigen]
+  --htmldir=DIR           html documentation [DOCDIR]
+  --dvidir=DIR            dvi documentation [DOCDIR]
+  --pdfdir=DIR            pdf documentation [DOCDIR]
+  --psdir=DIR             ps documentation [DOCDIR]
+_ACEOF
+
+  cat <<\_ACEOF
+_ACEOF
+fi
+
+if test -n "$ac_init_help"; then
+  case $ac_init_help in
+     short | recursive ) echo "Configuration of RcppEigen 0.3.4.0.2-1:";;
+   esac
+  cat <<\_ACEOF
+
+Some influential environment variables:
+  CXX         C++ compiler command
+  CXXFLAGS    C++ compiler flags
+  LDFLAGS     linker flags, e.g. -L<lib dir> if you have libraries in a
+              nonstandard directory <lib dir>
+  LIBS        libraries to pass to the linker, e.g. -l<library>
+  CPPFLAGS    (Objective) C/C++ preprocessor flags, e.g. -I<include dir> if
+              you have headers in a nonstandard directory <include dir>
+  CXXCPP      C++ preprocessor
+
+Use these variables to override the choices made by 'configure' or to help
+it to find libraries and programs with nonstandard names/locations.
+
+Report bugs to <edd@debian.org>.
+_ACEOF
+ac_status=$?
+fi
+
+if test "$ac_init_help" = "recursive"; then
+  # If there are subdirs, report their specific --help.
+  for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue
+    test -d "$ac_dir" ||
+      { cd "$srcdir" && ac_pwd=`pwd` && srcdir=. && test -d "$ac_dir"; } ||
+      continue
+    ac_builddir=.
+
+case "$ac_dir" in
+.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;;
+*)
+  ac_dir_suffix=/`printf "%s\n" "$ac_dir" | sed 's|^\.[\\/]||'`
+  # A ".." for each directory in $ac_dir_suffix.
+  ac_top_builddir_sub=`printf "%s\n" "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'`
+  case $ac_top_builddir_sub in
+  "") ac_top_builddir_sub=. ac_top_build_prefix= ;;
+  *)  ac_top_build_prefix=$ac_top_builddir_sub/ ;;
+  esac ;;
+esac
+ac_abs_top_builddir=$ac_pwd
+ac_abs_builddir=$ac_pwd$ac_dir_suffix
+# for backward compatibility:
+ac_top_builddir=$ac_top_build_prefix
+
+case $srcdir in
+  .)  # We are building in place.
+    ac_srcdir=.
+    ac_top_srcdir=$ac_top_builddir_sub
+    ac_abs_top_srcdir=$ac_pwd ;;
+  [\\/]* | ?:[\\/]* )  # Absolute name.
+    ac_srcdir=$srcdir$ac_dir_suffix;
+    ac_top_srcdir=$srcdir
+    ac_abs_top_srcdir=$srcdir ;;
+  *) # Relative name.
+    ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix
+    ac_top_srcdir=$ac_top_build_prefix$srcdir
+    ac_abs_top_srcdir=$ac_pwd/$srcdir ;;
+esac
+ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix
+
+    cd "$ac_dir" || { ac_status=$?; continue; }
+    # Check for configure.gnu first; this name is used for a wrapper for
+    # Metaconfig's "Configure" on case-insensitive file systems.
+    if test -f "$ac_srcdir/configure.gnu"; then
+      echo &&
+      $SHELL "$ac_srcdir/configure.gnu" --help=recursive
+    elif test -f "$ac_srcdir/configure"; then
+      echo &&
+      $SHELL "$ac_srcdir/configure" --help=recursive
+    else
+      printf "%s\n" "$as_me: WARNING: no configuration information is in $ac_dir" >&2
+    fi || ac_status=$?
+    cd "$ac_pwd" || { ac_status=$?; break; }
+  done
+fi
+
+test -n "$ac_init_help" && exit $ac_status
+if $ac_init_version; then
+  cat <<\_ACEOF
+RcppEigen configure 0.3.4.0.2-1
+generated by GNU Autoconf 2.72
+
+Copyright (C) 2023 Free Software Foundation, Inc.
+This configure script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it.
+_ACEOF
+  exit
+fi
+
+## ------------------------ ##
+## Autoconf initialization. ##
+## ------------------------ ##
+
+# ac_fn_cxx_try_compile LINENO
+# ----------------------------
+# Try to compile conftest.$ac_ext, and return whether this succeeded.
+ac_fn_cxx_try_compile ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  rm -f conftest.$ac_objext conftest.beam
+  if { { ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+printf "%s\n" "$ac_try_echo"; } >&5
+  (eval "$ac_compile") 2>conftest.err
+  ac_status=$?
+  if test -s conftest.err; then
+    grep -v '^ *+' conftest.err >conftest.er1
+    cat conftest.er1 >&5
+    mv -f conftest.er1 conftest.err
+  fi
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } && {
+	 test -z "$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest.$ac_objext
+then :
+  ac_retval=0
+else case e in #(
+  e) printf "%s\n" "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+	ac_retval=1 ;;
+esac
+fi
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+  as_fn_set_status $ac_retval
+
+} # ac_fn_cxx_try_compile
+
+# ac_fn_cxx_try_cpp LINENO
+# ------------------------
+# Try to preprocess conftest.$ac_ext, and return whether this succeeded.
+ac_fn_cxx_try_cpp ()
+{
+  as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+  if { { ac_try="$ac_cpp conftest.$ac_ext"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+printf "%s\n" "$ac_try_echo"; } >&5
+  (eval "$ac_cpp conftest.$ac_ext") 2>conftest.err
+  ac_status=$?
+  if test -s conftest.err; then
+    grep -v '^ *+' conftest.err >conftest.er1
+    cat conftest.er1 >&5
+    mv -f conftest.er1 conftest.err
+  fi
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; } > conftest.i && {
+	 test -z "$ac_cxx_preproc_warn_flag$ac_cxx_werror_flag" ||
+	 test ! -s conftest.err
+       }
+then :
+  ac_retval=0
+else case e in #(
+  e) printf "%s\n" "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+    ac_retval=1 ;;
+esac
+fi
+  eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno
+  as_fn_set_status $ac_retval
+
+} # ac_fn_cxx_try_cpp
+ac_configure_args_raw=
+for ac_arg
+do
+  case $ac_arg in
+  *\'*)
+    ac_arg=`printf "%s\n" "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;;
+  esac
+  as_fn_append ac_configure_args_raw " '$ac_arg'"
+done
+
+case $ac_configure_args_raw in
+  *$as_nl*)
+    ac_safe_unquote= ;;
+  *)
+    ac_unsafe_z='|&;<>()$`\\"*?[ ''	' # This string ends in space, tab.
+    ac_unsafe_a="$ac_unsafe_z#~"
+    ac_safe_unquote="s/ '\\([^$ac_unsafe_a][^$ac_unsafe_z]*\\)'/ \\1/g"
+    ac_configure_args_raw=`      printf "%s\n" "$ac_configure_args_raw" | sed "$ac_safe_unquote"`;;
+esac
+
+cat >config.log <<_ACEOF
+This file contains any messages produced by compilers while
+running configure, to aid debugging if configure makes a mistake.
+
+It was created by RcppEigen $as_me 0.3.4.0.2-1, which was
+generated by GNU Autoconf 2.72.  Invocation command line was
+
+  $ $0$ac_configure_args_raw
+
+_ACEOF
+exec 5>>config.log
+{
+cat <<_ASUNAME
+## --------- ##
+## Platform. ##
+## --------- ##
+
+hostname = `(hostname || uname -n) 2>/dev/null | sed 1q`
+uname -m = `(uname -m) 2>/dev/null || echo unknown`
+uname -r = `(uname -r) 2>/dev/null || echo unknown`
+uname -s = `(uname -s) 2>/dev/null || echo unknown`
+uname -v = `(uname -v) 2>/dev/null || echo unknown`
+
+/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown`
+/bin/uname -X     = `(/bin/uname -X) 2>/dev/null     || echo unknown`
+
+/bin/arch              = `(/bin/arch) 2>/dev/null              || echo unknown`
+/usr/bin/arch -k       = `(/usr/bin/arch -k) 2>/dev/null       || echo unknown`
+/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown`
+/usr/bin/hostinfo      = `(/usr/bin/hostinfo) 2>/dev/null      || echo unknown`
+/bin/machine           = `(/bin/machine) 2>/dev/null           || echo unknown`
+/usr/bin/oslevel       = `(/usr/bin/oslevel) 2>/dev/null       || echo unknown`
+/bin/universe          = `(/bin/universe) 2>/dev/null          || echo unknown`
+
+_ASUNAME
+
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
+    printf "%s\n" "PATH: $as_dir"
+  done
+IFS=$as_save_IFS
+
+} >&5
+
+cat >&5 <<_ACEOF
+
+
+## ----------- ##
+## Core tests. ##
+## ----------- ##
+
+_ACEOF
+
+
+# Keep a trace of the command line.
+# Strip out --no-create and --no-recursion so they do not pile up.
+# Strip out --silent because we don't want to record it for future runs.
+# Also quote any args containing shell meta-characters.
+# Make two passes to allow for proper duplicate-argument suppression.
+ac_configure_args=
+ac_configure_args0=
+ac_configure_args1=
+ac_must_keep_next=false
+for ac_pass in 1 2
+do
+  for ac_arg
+  do
+    case $ac_arg in
+    -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;;
+    -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+    | -silent | --silent | --silen | --sile | --sil)
+      continue ;;
+    *\'*)
+      ac_arg=`printf "%s\n" "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;;
+    esac
+    case $ac_pass in
+    1) as_fn_append ac_configure_args0 " '$ac_arg'" ;;
+    2)
+      as_fn_append ac_configure_args1 " '$ac_arg'"
+      if test $ac_must_keep_next = true; then
+	ac_must_keep_next=false # Got value, back to normal.
+      else
+	case $ac_arg in
+	  *=* | --config-cache | -C | -disable-* | --disable-* \
+	  | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \
+	  | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \
+	  | -with-* | --with-* | -without-* | --without-* | --x)
+	    case "$ac_configure_args0 " in
+	      "$ac_configure_args1"*" '$ac_arg' "* ) continue ;;
+	    esac
+	    ;;
+	  -* ) ac_must_keep_next=true ;;
+	esac
+      fi
+      as_fn_append ac_configure_args " '$ac_arg'"
+      ;;
+    esac
+  done
+done
+{ ac_configure_args0=; unset ac_configure_args0;}
+{ ac_configure_args1=; unset ac_configure_args1;}
+
+# When interrupted or exit'd, cleanup temporary files, and complete
+# config.log.  We remove comments because anyway the quotes in there
+# would cause problems or look ugly.
+# WARNING: Use '\'' to represent an apostrophe within the trap.
+# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug.
+trap 'exit_status=$?
+  # Sanitize IFS.
+  IFS=" ""	$as_nl"
+  # Save into config.log some information that might help in debugging.
+  {
+    echo
+
+    printf "%s\n" "## ---------------- ##
+## Cache variables. ##
+## ---------------- ##"
+    echo
+    # The following way of writing the cache mishandles newlines in values,
+(
+  for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do
+    eval ac_val=\$$ac_var
+    case $ac_val in #(
+    *${as_nl}*)
+      case $ac_var in #(
+      *_cv_*) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5
+printf "%s\n" "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
+      esac
+      case $ac_var in #(
+      _ | IFS | as_nl) ;; #(
+      BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #(
+      *) { eval $ac_var=; unset $ac_var;} ;;
+      esac ;;
+    esac
+  done
+  (set) 2>&1 |
+    case $as_nl`(ac_space='\'' '\''; set) 2>&1` in #(
+    *${as_nl}ac_space=\ *)
+      sed -n \
+	"s/'\''/'\''\\\\'\'''\''/g;
+	  s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\''\\2'\''/p"
+      ;; #(
+    *)
+      sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p"
+      ;;
+    esac |
+    sort
+)
+    echo
+
+    printf "%s\n" "## ----------------- ##
+## Output variables. ##
+## ----------------- ##"
+    echo
+    for ac_var in $ac_subst_vars
+    do
+      eval ac_val=\$$ac_var
+      case $ac_val in
+      *\'\''*) ac_val=`printf "%s\n" "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
+      esac
+      printf "%s\n" "$ac_var='\''$ac_val'\''"
+    done | sort
+    echo
+
+    if test -n "$ac_subst_files"; then
+      printf "%s\n" "## ------------------- ##
+## File substitutions. ##
+## ------------------- ##"
+      echo
+      for ac_var in $ac_subst_files
+      do
+	eval ac_val=\$$ac_var
+	case $ac_val in
+	*\'\''*) ac_val=`printf "%s\n" "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;;
+	esac
+	printf "%s\n" "$ac_var='\''$ac_val'\''"
+      done | sort
+      echo
+    fi
+
+    if test -s confdefs.h; then
+      printf "%s\n" "## ----------- ##
+## confdefs.h. ##
+## ----------- ##"
+      echo
+      cat confdefs.h
+      echo
+    fi
+    test "$ac_signal" != 0 &&
+      printf "%s\n" "$as_me: caught signal $ac_signal"
+    printf "%s\n" "$as_me: exit $exit_status"
+  } >&5
+  rm -f core *.core core.conftest.* &&
+    rm -f -r conftest* confdefs* conf$$* $ac_clean_files &&
+    exit $exit_status
+' 0
+for ac_signal in 1 2 13 15; do
+  trap 'ac_signal='$ac_signal'; as_fn_exit 1' $ac_signal
+done
+ac_signal=0
+
+# confdefs.h avoids OS command line length limits that DEFS can exceed.
+rm -f -r conftest* confdefs.h
+
+printf "%s\n" "/* confdefs.h */" > confdefs.h
+
+# Predefined preprocessor variables.
+
+printf "%s\n" "#define PACKAGE_NAME \"$PACKAGE_NAME\"" >>confdefs.h
+
+printf "%s\n" "#define PACKAGE_TARNAME \"$PACKAGE_TARNAME\"" >>confdefs.h
+
+printf "%s\n" "#define PACKAGE_VERSION \"$PACKAGE_VERSION\"" >>confdefs.h
+
+printf "%s\n" "#define PACKAGE_STRING \"$PACKAGE_STRING\"" >>confdefs.h
+
+printf "%s\n" "#define PACKAGE_BUGREPORT \"$PACKAGE_BUGREPORT\"" >>confdefs.h
+
+printf "%s\n" "#define PACKAGE_URL \"$PACKAGE_URL\"" >>confdefs.h
+
+
+# Let the site file select an alternate cache file if it wants to.
+# Prefer an explicitly selected file to automatically selected ones.
+if test -n "$CONFIG_SITE"; then
+  ac_site_files="$CONFIG_SITE"
+elif test "x$prefix" != xNONE; then
+  ac_site_files="$prefix/share/config.site $prefix/etc/config.site"
+else
+  ac_site_files="$ac_default_prefix/share/config.site $ac_default_prefix/etc/config.site"
+fi
+
+for ac_site_file in $ac_site_files
+do
+  case $ac_site_file in #(
+  */*) :
+     ;; #(
+  *) :
+    ac_site_file=./$ac_site_file ;;
+esac
+  if test -f "$ac_site_file" && test -r "$ac_site_file"; then
+    { printf "%s\n" "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5
+printf "%s\n" "$as_me: loading site script $ac_site_file" >&6;}
+    sed 's/^/| /' "$ac_site_file" >&5
+    . "$ac_site_file" \
+      || { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in '$ac_pwd':" >&5
+printf "%s\n" "$as_me: error: in '$ac_pwd':" >&2;}
+as_fn_error $? "failed to load site script $ac_site_file
+See 'config.log' for more details" "$LINENO" 5; }
+  fi
+done
+
+if test -r "$cache_file"; then
+  # Some versions of bash will fail to source /dev/null (special files
+  # actually), so we avoid doing that.  DJGPP emulates it as a regular file.
+  if test /dev/null != "$cache_file" && test -f "$cache_file"; then
+    { printf "%s\n" "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5
+printf "%s\n" "$as_me: loading cache $cache_file" >&6;}
+    case $cache_file in
+      [\\/]* | ?:[\\/]* ) . "$cache_file";;
+      *)                      . "./$cache_file";;
+    esac
+  fi
+else
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5
+printf "%s\n" "$as_me: creating cache $cache_file" >&6;}
+  >$cache_file
+fi
+
+# Test code for whether the C++ compiler supports C++98 (global declarations)
+ac_cxx_conftest_cxx98_globals='
+// Does the compiler advertise C++98 conformance?
+#if !defined __cplusplus || __cplusplus < 199711L
+# error "Compiler does not advertise C++98 conformance"
+#endif
+
+// These inclusions are to reject old compilers that
+// lack the unsuffixed header files.
+#include <cstdlib>
+#include <exception>
+
+// <cassert> and <cstring> are *not* freestanding headers in C++98.
+extern void assert (int);
+namespace std {
+  extern int strcmp (const char *, const char *);
+}
+
+// Namespaces, exceptions, and templates were all added after "C++ 2.0".
+using std::exception;
+using std::strcmp;
+
+namespace {
+
+void test_exception_syntax()
+{
+  try {
+    throw "test";
+  } catch (const char *s) {
+    // Extra parentheses suppress a warning when building autoconf itself,
+    // due to lint rules shared with more typical C programs.
+    assert (!(strcmp) (s, "test"));
+  }
+}
+
+template <typename T> struct test_template
+{
+  T const val;
+  explicit test_template(T t) : val(t) {}
+  template <typename U> T add(U u) { return static_cast<T>(u) + val; }
+};
+
+} // anonymous namespace
+'
+
+# Test code for whether the C++ compiler supports C++98 (body of main)
+ac_cxx_conftest_cxx98_main='
+  assert (argc);
+  assert (! argv[0]);
+{
+  test_exception_syntax ();
+  test_template<double> tt (2.0);
+  assert (tt.add (4) == 6.0);
+  assert (true && !false);
+}
+'
+
+# Test code for whether the C++ compiler supports C++11 (global declarations)
+ac_cxx_conftest_cxx11_globals='
+// Does the compiler advertise C++ 2011 conformance?
+#if !defined __cplusplus || __cplusplus < 201103L
+# error "Compiler does not advertise C++11 conformance"
+#endif
+
+namespace cxx11test
+{
+  constexpr int get_val() { return 20; }
+
+  struct testinit
+  {
+    int i;
+    double d;
+  };
+
+  class delegate
+  {
+  public:
+    delegate(int n) : n(n) {}
+    delegate(): delegate(2354) {}
+
+    virtual int getval() { return this->n; };
+  protected:
+    int n;
+  };
+
+  class overridden : public delegate
+  {
+  public:
+    overridden(int n): delegate(n) {}
+    virtual int getval() override final { return this->n * 2; }
+  };
+
+  class nocopy
+  {
+  public:
+    nocopy(int i): i(i) {}
+    nocopy() = default;
+    nocopy(const nocopy&) = delete;
+    nocopy & operator=(const nocopy&) = delete;
+  private:
+    int i;
+  };
+
+  // for testing lambda expressions
+  template <typename Ret, typename Fn> Ret eval(Fn f, Ret v)
+  {
+    return f(v);
+  }
+
+  // for testing variadic templates and trailing return types
+  template <typename V> auto sum(V first) -> V
+  {
+    return first;
+  }
+  template <typename V, typename... Args> auto sum(V first, Args... rest) -> V
+  {
+    return first + sum(rest...);
+  }
+}
+'
+
+# Test code for whether the C++ compiler supports C++11 (body of main)
+ac_cxx_conftest_cxx11_main='
+{
+  // Test auto and decltype
+  auto a1 = 6538;
+  auto a2 = 48573953.4;
+  auto a3 = "String literal";
+
+  int total = 0;
+  for (auto i = a3; *i; ++i) { total += *i; }
+
+  decltype(a2) a4 = 34895.034;
+}
+{
+  // Test constexpr
+  short sa[cxx11test::get_val()] = { 0 };
+}
+{
+  // Test initializer lists
+  cxx11test::testinit il = { 4323, 435234.23544 };
+}
+{
+  // Test range-based for
+  int array[] = {9, 7, 13, 15, 4, 18, 12, 10, 5, 3,
+                 14, 19, 17, 8, 6, 20, 16, 2, 11, 1};
+  for (auto &x : array) { x += 23; }
+}
+{
+  // Test lambda expressions
+  using cxx11test::eval;
+  assert (eval ([](int x) { return x*2; }, 21) == 42);
+  double d = 2.0;
+  assert (eval ([&](double x) { return d += x; }, 3.0) == 5.0);
+  assert (d == 5.0);
+  assert (eval ([=](double x) mutable { return d += x; }, 4.0) == 9.0);
+  assert (d == 5.0);
+}
+{
+  // Test use of variadic templates
+  using cxx11test::sum;
+  auto a = sum(1);
+  auto b = sum(1, 2);
+  auto c = sum(1.0, 2.0, 3.0);
+}
+{
+  // Test constructor delegation
+  cxx11test::delegate d1;
+  cxx11test::delegate d2();
+  cxx11test::delegate d3(45);
+}
+{
+  // Test override and final
+  cxx11test::overridden o1(55464);
+}
+{
+  // Test nullptr
+  char *c = nullptr;
+}
+{
+  // Test template brackets
+  test_template<::test_template<int>> v(test_template<int>(12));
+}
+{
+  // Unicode literals
+  char const *utf8 = u8"UTF-8 string \u2500";
+  char16_t const *utf16 = u"UTF-8 string \u2500";
+  char32_t const *utf32 = U"UTF-32 string \u2500";
+}
+'
+
+# Test code for whether the C compiler supports C++11 (complete).
+ac_cxx_conftest_cxx11_program="${ac_cxx_conftest_cxx98_globals}
+${ac_cxx_conftest_cxx11_globals}
+
+int
+main (int argc, char **argv)
+{
+  int ok = 0;
+  ${ac_cxx_conftest_cxx98_main}
+  ${ac_cxx_conftest_cxx11_main}
+  return ok;
+}
+"
+
+# Test code for whether the C compiler supports C++98 (complete).
+ac_cxx_conftest_cxx98_program="${ac_cxx_conftest_cxx98_globals}
+int
+main (int argc, char **argv)
+{
+  int ok = 0;
+  ${ac_cxx_conftest_cxx98_main}
+  return ok;
+}
+"
+
+# Check that the precious variables saved in the cache have kept the same
+# value.
+ac_cache_corrupted=false
+for ac_var in $ac_precious_vars; do
+  eval ac_old_set=\$ac_cv_env_${ac_var}_set
+  eval ac_new_set=\$ac_env_${ac_var}_set
+  eval ac_old_val=\$ac_cv_env_${ac_var}_value
+  eval ac_new_val=\$ac_env_${ac_var}_value
+  case $ac_old_set,$ac_new_set in
+    set,)
+      { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: '$ac_var' was set to '$ac_old_val' in the previous run" >&5
+printf "%s\n" "$as_me: error: '$ac_var' was set to '$ac_old_val' in the previous run" >&2;}
+      ac_cache_corrupted=: ;;
+    ,set)
+      { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: '$ac_var' was not set in the previous run" >&5
+printf "%s\n" "$as_me: error: '$ac_var' was not set in the previous run" >&2;}
+      ac_cache_corrupted=: ;;
+    ,);;
+    *)
+      if test "x$ac_old_val" != "x$ac_new_val"; then
+	# differences in whitespace do not lead to failure.
+	ac_old_val_w=`echo x $ac_old_val`
+	ac_new_val_w=`echo x $ac_new_val`
+	if test "$ac_old_val_w" != "$ac_new_val_w"; then
+	  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: '$ac_var' has changed since the previous run:" >&5
+printf "%s\n" "$as_me: error: '$ac_var' has changed since the previous run:" >&2;}
+	  ac_cache_corrupted=:
+	else
+	  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in '$ac_var' since the previous run:" >&5
+printf "%s\n" "$as_me: warning: ignoring whitespace changes in '$ac_var' since the previous run:" >&2;}
+	  eval $ac_var=\$ac_old_val
+	fi
+	{ printf "%s\n" "$as_me:${as_lineno-$LINENO}:   former value:  '$ac_old_val'" >&5
+printf "%s\n" "$as_me:   former value:  '$ac_old_val'" >&2;}
+	{ printf "%s\n" "$as_me:${as_lineno-$LINENO}:   current value: '$ac_new_val'" >&5
+printf "%s\n" "$as_me:   current value: '$ac_new_val'" >&2;}
+      fi;;
+  esac
+  # Pass precious variables to config.status.
+  if test "$ac_new_set" = set; then
+    case $ac_new_val in
+    *\'*) ac_arg=$ac_var=`printf "%s\n" "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;;
+    *) ac_arg=$ac_var=$ac_new_val ;;
+    esac
+    case " $ac_configure_args " in
+      *" '$ac_arg' "*) ;; # Avoid dups.  Use of quotes ensures accuracy.
+      *) as_fn_append ac_configure_args " '$ac_arg'" ;;
+    esac
+  fi
+done
+if $ac_cache_corrupted; then
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in '$ac_pwd':" >&5
+printf "%s\n" "$as_me: error: in '$ac_pwd':" >&2;}
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5
+printf "%s\n" "$as_me: error: changes in the environment can compromise the build" >&2;}
+  as_fn_error $? "run '${MAKE-make} distclean' and/or 'rm $cache_file'
+	    and start over" "$LINENO" 5
+fi
+## -------------------- ##
+## Main body of script. ##
+## -------------------- ##
+
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
+
+## Set R_HOME, respecting an environment variable if one is set
+: ${R_HOME=$(R RHOME)}
+if test -z "${R_HOME}"; then
+    as_fn_error $? "Could not determine R_HOME." "$LINENO" 5
+fi
+
+## Use R to set CXX and CXXFLAGS
+CXX=$(${R_HOME}/bin/R CMD config CXX)
+CXXFLAGS=$("${R_HOME}/bin/R" CMD config CXXFLAGS)
+
+## We are using C++
+ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+
+
+
+
+
+
+
+ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+if test -z "$CXX"; then
+  if test -n "$CCC"; then
+    CXX=$CCC
+  else
+    if test -n "$ac_tool_prefix"; then
+  for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC clang++
+  do
+    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
+set dummy $ac_tool_prefix$ac_prog; ac_word=$2
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+printf %s "checking for $ac_word... " >&6; }
+if test ${ac_cv_prog_CXX+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) if test -n "$CXX"; then
+  ac_cv_prog_CXX="$CXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CXX="$ac_tool_prefix$ac_prog"
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi ;;
+esac
+fi
+CXX=$ac_cv_prog_CXX
+if test -n "$CXX"; then
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $CXX" >&5
+printf "%s\n" "$CXX" >&6; }
+else
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+fi
+
+
+    test -n "$CXX" && break
+  done
+fi
+if test -z "$CXX"; then
+  ac_ct_CXX=$CXX
+  for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC clang++
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+printf %s "checking for $ac_word... " >&6; }
+if test ${ac_cv_prog_ac_ct_CXX+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) if test -n "$ac_ct_CXX"; then
+  ac_cv_prog_ac_ct_CXX="$ac_ct_CXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_CXX="$ac_prog"
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi ;;
+esac
+fi
+ac_ct_CXX=$ac_cv_prog_ac_ct_CXX
+if test -n "$ac_ct_CXX"; then
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CXX" >&5
+printf "%s\n" "$ac_ct_CXX" >&6; }
+else
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+fi
+
+
+  test -n "$ac_ct_CXX" && break
+done
+
+  if test "x$ac_ct_CXX" = x; then
+    CXX="g++"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+printf "%s\n" "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    CXX=$ac_ct_CXX
+  fi
+fi
+
+  fi
+fi
+# Provide some information about the compiler.
+printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for C++ compiler version" >&5
+set X $ac_compile
+ac_compiler=$2
+for ac_option in --version -v -V -qversion; do
+  { { ac_try="$ac_compiler $ac_option >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+printf "%s\n" "$ac_try_echo"; } >&5
+  (eval "$ac_compiler $ac_option >&5") 2>conftest.err
+  ac_status=$?
+  if test -s conftest.err; then
+    sed '10a\
+... rest of stderr output deleted ...
+         10q' conftest.err >conftest.er1
+    cat conftest.er1 >&5
+  fi
+  rm -f conftest.er1 conftest.err
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+done
+
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main (void)
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+ac_clean_files_save=$ac_clean_files
+ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out"
+# Try to create an executable without -o first, disregard a.out.
+# It will help us diagnose broken compilers, and finding out an intuition
+# of exeext.
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the C++ compiler works" >&5
+printf %s "checking whether the C++ compiler works... " >&6; }
+ac_link_default=`printf "%s\n" "$ac_link" | sed 's/ -o *conftest[^ ]*//'`
+
+# The possible output files:
+ac_files="a.out conftest.exe conftest a.exe a_out.exe b.out conftest.*"
+
+ac_rmfiles=
+for ac_file in $ac_files
+do
+  case $ac_file in
+    *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;;
+    * ) ac_rmfiles="$ac_rmfiles $ac_file";;
+  esac
+done
+rm -f $ac_rmfiles
+
+if { { ac_try="$ac_link_default"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+printf "%s\n" "$ac_try_echo"; } >&5
+  (eval "$ac_link_default") 2>&5
+  ac_status=$?
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+then :
+  # Autoconf-2.13 could set the ac_cv_exeext variable to 'no'.
+# So ignore a value of 'no', otherwise this would lead to 'EXEEXT = no'
+# in a Makefile.  We should not override ac_cv_exeext if it was cached,
+# so that the user can short-circuit this test for compilers unknown to
+# Autoconf.
+for ac_file in $ac_files ''
+do
+  test -f "$ac_file" || continue
+  case $ac_file in
+    *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj )
+	;;
+    [ab].out )
+	# We found the default executable, but exeext='' is most
+	# certainly right.
+	break;;
+    *.* )
+	if test ${ac_cv_exeext+y} && test "$ac_cv_exeext" != no;
+	then :; else
+	   ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'`
+	fi
+	# We set ac_cv_exeext here because the later test for it is not
+	# safe: cross compilers may not add the suffix if given an '-o'
+	# argument, so we may need to know it at that point already.
+	# Even if this section looks crufty: it has the advantage of
+	# actually working.
+	break;;
+    * )
+	break;;
+  esac
+done
+test "$ac_cv_exeext" = no && ac_cv_exeext=
+
+else case e in #(
+  e) ac_file='' ;;
+esac
+fi
+if test -z "$ac_file"
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+printf "%s\n" "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+{ { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in '$ac_pwd':" >&5
+printf "%s\n" "$as_me: error: in '$ac_pwd':" >&2;}
+as_fn_error 77 "C++ compiler cannot create executables
+See 'config.log' for more details" "$LINENO" 5; }
+else case e in #(
+  e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; } ;;
+esac
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for C++ compiler default output file name" >&5
+printf %s "checking for C++ compiler default output file name... " >&6; }
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5
+printf "%s\n" "$ac_file" >&6; }
+ac_exeext=$ac_cv_exeext
+
+rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out
+ac_clean_files=$ac_clean_files_save
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for suffix of executables" >&5
+printf %s "checking for suffix of executables... " >&6; }
+if { { ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+printf "%s\n" "$ac_try_echo"; } >&5
+  (eval "$ac_link") 2>&5
+  ac_status=$?
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+then :
+  # If both 'conftest.exe' and 'conftest' are 'present' (well, observable)
+# catch 'conftest.exe'.  For instance with Cygwin, 'ls conftest' will
+# work properly (i.e., refer to 'conftest.exe'), while it won't with
+# 'rm'.
+for ac_file in conftest.exe conftest conftest.*; do
+  test -f "$ac_file" || continue
+  case $ac_file in
+    *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;;
+    *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'`
+	  break;;
+    * ) break;;
+  esac
+done
+else case e in #(
+  e) { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in '$ac_pwd':" >&5
+printf "%s\n" "$as_me: error: in '$ac_pwd':" >&2;}
+as_fn_error $? "cannot compute suffix of executables: cannot compile and link
+See 'config.log' for more details" "$LINENO" 5; } ;;
+esac
+fi
+rm -f conftest conftest$ac_cv_exeext
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_exeext" >&5
+printf "%s\n" "$ac_cv_exeext" >&6; }
+
+rm -f conftest.$ac_ext
+EXEEXT=$ac_cv_exeext
+ac_exeext=$EXEEXT
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <stdio.h>
+int
+main (void)
+{
+FILE *f = fopen ("conftest.out", "w");
+ if (!f)
+  return 1;
+ return ferror (f) || fclose (f) != 0;
+
+  ;
+  return 0;
+}
+_ACEOF
+ac_clean_files="$ac_clean_files conftest.out"
+# Check that the compiler produces executables we can run.  If not, either
+# the compiler is broken, or we cross compile.
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we are cross compiling" >&5
+printf %s "checking whether we are cross compiling... " >&6; }
+if test "$cross_compiling" != yes; then
+  { { ac_try="$ac_link"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+printf "%s\n" "$ac_try_echo"; } >&5
+  (eval "$ac_link") 2>&5
+  ac_status=$?
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+  if { ac_try='./conftest$ac_cv_exeext'
+  { { case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+printf "%s\n" "$ac_try_echo"; } >&5
+  (eval "$ac_try") 2>&5
+  ac_status=$?
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }; then
+    cross_compiling=no
+  else
+    if test "$cross_compiling" = maybe; then
+	cross_compiling=yes
+    else
+	{ { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in '$ac_pwd':" >&5
+printf "%s\n" "$as_me: error: in '$ac_pwd':" >&2;}
+as_fn_error 77 "cannot run C++ compiled programs.
+If you meant to cross compile, use '--host'.
+See 'config.log' for more details" "$LINENO" 5; }
+    fi
+  fi
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $cross_compiling" >&5
+printf "%s\n" "$cross_compiling" >&6; }
+
+rm -f conftest.$ac_ext conftest$ac_cv_exeext \
+  conftest.o conftest.obj conftest.out
+ac_clean_files=$ac_clean_files_save
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for suffix of object files" >&5
+printf %s "checking for suffix of object files... " >&6; }
+if test ${ac_cv_objext+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main (void)
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+rm -f conftest.o conftest.obj
+if { { ac_try="$ac_compile"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+printf "%s\n" "$ac_try_echo"; } >&5
+  (eval "$ac_compile") 2>&5
+  ac_status=$?
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+then :
+  for ac_file in conftest.o conftest.obj conftest.*; do
+  test -f "$ac_file" || continue;
+  case $ac_file in
+    *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM ) ;;
+    *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'`
+       break;;
+  esac
+done
+else case e in #(
+  e) printf "%s\n" "$as_me: failed program was:" >&5
+sed 's/^/| /' conftest.$ac_ext >&5
+
+{ { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in '$ac_pwd':" >&5
+printf "%s\n" "$as_me: error: in '$ac_pwd':" >&2;}
+as_fn_error $? "cannot compute suffix of object files: cannot compile
+See 'config.log' for more details" "$LINENO" 5; } ;;
+esac
+fi
+rm -f conftest.$ac_cv_objext conftest.$ac_ext ;;
+esac
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_objext" >&5
+printf "%s\n" "$ac_cv_objext" >&6; }
+OBJEXT=$ac_cv_objext
+ac_objext=$OBJEXT
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler supports GNU C++" >&5
+printf %s "checking whether the compiler supports GNU C++... " >&6; }
+if test ${ac_cv_cxx_compiler_gnu+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main (void)
+{
+#ifndef __GNUC__
+       choke me
+#endif
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"
+then :
+  ac_compiler_gnu=yes
+else case e in #(
+  e) ac_compiler_gnu=no ;;
+esac
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
+ac_cv_cxx_compiler_gnu=$ac_compiler_gnu
+ ;;
+esac
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_cxx_compiler_gnu" >&5
+printf "%s\n" "$ac_cv_cxx_compiler_gnu" >&6; }
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+
+if test $ac_compiler_gnu = yes; then
+  GXX=yes
+else
+  GXX=
+fi
+ac_test_CXXFLAGS=${CXXFLAGS+y}
+ac_save_CXXFLAGS=$CXXFLAGS
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether $CXX accepts -g" >&5
+printf %s "checking whether $CXX accepts -g... " >&6; }
+if test ${ac_cv_prog_cxx_g+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_save_cxx_werror_flag=$ac_cxx_werror_flag
+   ac_cxx_werror_flag=yes
+   ac_cv_prog_cxx_g=no
+   CXXFLAGS="-g"
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main (void)
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"
+then :
+  ac_cv_prog_cxx_g=yes
+else case e in #(
+  e) CXXFLAGS=""
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main (void)
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"
+then :
+
+else case e in #(
+  e) ac_cxx_werror_flag=$ac_save_cxx_werror_flag
+	 CXXFLAGS="-g"
+	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main (void)
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"
+then :
+  ac_cv_prog_cxx_g=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext ;;
+esac
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext ;;
+esac
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
+   ac_cxx_werror_flag=$ac_save_cxx_werror_flag ;;
+esac
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_g" >&5
+printf "%s\n" "$ac_cv_prog_cxx_g" >&6; }
+if test $ac_test_CXXFLAGS; then
+  CXXFLAGS=$ac_save_CXXFLAGS
+elif test $ac_cv_prog_cxx_g = yes; then
+  if test "$GXX" = yes; then
+    CXXFLAGS="-g -O2"
+  else
+    CXXFLAGS="-g"
+  fi
+else
+  if test "$GXX" = yes; then
+    CXXFLAGS="-O2"
+  else
+    CXXFLAGS=
+  fi
+fi
+ac_prog_cxx_stdcxx=no
+if test x$ac_prog_cxx_stdcxx = xno
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++11 features" >&5
+printf %s "checking for $CXX option to enable C++11 features... " >&6; }
+if test ${ac_cv_prog_cxx_cxx11+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_cv_prog_cxx_cxx11=no
+ac_save_CXX=$CXX
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$ac_cxx_conftest_cxx11_program
+_ACEOF
+for ac_arg in '' -std=gnu++11 -std=gnu++0x -std=c++11 -std=c++0x -qlanglvl=extended0x -AA
+do
+  CXX="$ac_save_CXX $ac_arg"
+  if ac_fn_cxx_try_compile "$LINENO"
+then :
+  ac_cv_prog_cxx_cxx11=$ac_arg
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam
+  test "x$ac_cv_prog_cxx_cxx11" != "xno" && break
+done
+rm -f conftest.$ac_ext
+CXX=$ac_save_CXX ;;
+esac
+fi
+
+if test "x$ac_cv_prog_cxx_cxx11" = xno
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5
+printf "%s\n" "unsupported" >&6; }
+else case e in #(
+  e) if test "x$ac_cv_prog_cxx_cxx11" = x
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: none needed" >&5
+printf "%s\n" "none needed" >&6; }
+else case e in #(
+  e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_cxx11" >&5
+printf "%s\n" "$ac_cv_prog_cxx_cxx11" >&6; }
+     CXX="$CXX $ac_cv_prog_cxx_cxx11" ;;
+esac
+fi
+  ac_cv_prog_cxx_stdcxx=$ac_cv_prog_cxx_cxx11
+  ac_prog_cxx_stdcxx=cxx11 ;;
+esac
+fi
+fi
+if test x$ac_prog_cxx_stdcxx = xno
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++98 features" >&5
+printf %s "checking for $CXX option to enable C++98 features... " >&6; }
+if test ${ac_cv_prog_cxx_cxx98+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_cv_prog_cxx_cxx98=no
+ac_save_CXX=$CXX
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$ac_cxx_conftest_cxx98_program
+_ACEOF
+for ac_arg in '' -std=gnu++98 -std=c++98 -qlanglvl=extended -AA
+do
+  CXX="$ac_save_CXX $ac_arg"
+  if ac_fn_cxx_try_compile "$LINENO"
+then :
+  ac_cv_prog_cxx_cxx98=$ac_arg
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam
+  test "x$ac_cv_prog_cxx_cxx98" != "xno" && break
+done
+rm -f conftest.$ac_ext
+CXX=$ac_save_CXX ;;
+esac
+fi
+
+if test "x$ac_cv_prog_cxx_cxx98" = xno
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5
+printf "%s\n" "unsupported" >&6; }
+else case e in #(
+  e) if test "x$ac_cv_prog_cxx_cxx98" = x
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: none needed" >&5
+printf "%s\n" "none needed" >&6; }
+else case e in #(
+  e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_cxx98" >&5
+printf "%s\n" "$ac_cv_prog_cxx_cxx98" >&6; }
+     CXX="$CXX $ac_cv_prog_cxx_cxx98" ;;
+esac
+fi
+  ac_cv_prog_cxx_stdcxx=$ac_cv_prog_cxx_cxx98
+  ac_prog_cxx_stdcxx=cxx98 ;;
+esac
+fi
+fi
+
+ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+
+
+ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking how to run the C++ preprocessor" >&5
+printf %s "checking how to run the C++ preprocessor... " >&6; }
+if test -z "$CXXCPP"; then
+  if test ${ac_cv_prog_CXXCPP+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e)     # Double quotes because $CXX needs to be expanded
+    for CXXCPP in "$CXX -E" cpp /lib/cpp
+    do
+      ac_preproc_ok=false
+for ac_cxx_preproc_warn_flag in '' yes
+do
+  # Use a header file that comes with gcc, so configuring glibc
+  # with a fresh cross-compiler works.
+  # On the NeXT, cc -E runs the code through the compiler's parser,
+  # not just through cpp. "Syntax error" is here to catch this case.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <limits.h>
+		     Syntax error
+_ACEOF
+if ac_fn_cxx_try_cpp "$LINENO"
+then :
+
+else case e in #(
+  e) # Broken: fails on valid input.
+continue ;;
+esac
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+
+  # OK, works on sane cases.  Now check whether nonexistent headers
+  # can be detected and how.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <ac_nonexistent.h>
+_ACEOF
+if ac_fn_cxx_try_cpp "$LINENO"
+then :
+  # Broken: success on invalid input.
+continue
+else case e in #(
+  e) # Passes both tests.
+ac_preproc_ok=:
+break ;;
+esac
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+
+done
+# Because of 'break', _AC_PREPROC_IFELSE's cleaning code was skipped.
+rm -f conftest.i conftest.err conftest.$ac_ext
+if $ac_preproc_ok
+then :
+  break
+fi
+
+    done
+    ac_cv_prog_CXXCPP=$CXXCPP
+   ;;
+esac
+fi
+  CXXCPP=$ac_cv_prog_CXXCPP
+else
+  ac_cv_prog_CXXCPP=$CXXCPP
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $CXXCPP" >&5
+printf "%s\n" "$CXXCPP" >&6; }
+ac_preproc_ok=false
+for ac_cxx_preproc_warn_flag in '' yes
+do
+  # Use a header file that comes with gcc, so configuring glibc
+  # with a fresh cross-compiler works.
+  # On the NeXT, cc -E runs the code through the compiler's parser,
+  # not just through cpp. "Syntax error" is here to catch this case.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <limits.h>
+		     Syntax error
+_ACEOF
+if ac_fn_cxx_try_cpp "$LINENO"
+then :
+
+else case e in #(
+  e) # Broken: fails on valid input.
+continue ;;
+esac
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+
+  # OK, works on sane cases.  Now check whether nonexistent headers
+  # can be detected and how.
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+#include <ac_nonexistent.h>
+_ACEOF
+if ac_fn_cxx_try_cpp "$LINENO"
+then :
+  # Broken: success on invalid input.
+continue
+else case e in #(
+  e) # Passes both tests.
+ac_preproc_ok=:
+break ;;
+esac
+fi
+rm -f conftest.err conftest.i conftest.$ac_ext
+
+done
+# Because of 'break', _AC_PREPROC_IFELSE's cleaning code was skipped.
+rm -f conftest.i conftest.err conftest.$ac_ext
+if $ac_preproc_ok
+then :
+
+else case e in #(
+  e) { { printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: in '$ac_pwd':" >&5
+printf "%s\n" "$as_me: error: in '$ac_pwd':" >&2;}
+as_fn_error $? "C++ preprocessor \"$CXXCPP\" fails sanity check
+See 'config.log' for more details" "$LINENO" 5; } ;;
+esac
+fi
+
+ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+
+
+
+ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+if test -z "$CXX"; then
+  if test -n "$CCC"; then
+    CXX=$CCC
+  else
+    if test -n "$ac_tool_prefix"; then
+  for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC clang++
+  do
+    # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args.
+set dummy $ac_tool_prefix$ac_prog; ac_word=$2
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+printf %s "checking for $ac_word... " >&6; }
+if test ${ac_cv_prog_CXX+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) if test -n "$CXX"; then
+  ac_cv_prog_CXX="$CXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CXX="$ac_tool_prefix$ac_prog"
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi ;;
+esac
+fi
+CXX=$ac_cv_prog_CXX
+if test -n "$CXX"; then
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $CXX" >&5
+printf "%s\n" "$CXX" >&6; }
+else
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+fi
+
+
+    test -n "$CXX" && break
+  done
+fi
+if test -z "$CXX"; then
+  ac_ct_CXX=$CXX
+  for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC clang++
+do
+  # Extract the first word of "$ac_prog", so it can be a program name with args.
+set dummy $ac_prog; ac_word=$2
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+printf %s "checking for $ac_word... " >&6; }
+if test ${ac_cv_prog_ac_ct_CXX+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) if test -n "$ac_ct_CXX"; then
+  ac_cv_prog_ac_ct_CXX="$ac_ct_CXX" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir$ac_word$ac_exec_ext"; then
+    ac_cv_prog_ac_ct_CXX="$ac_prog"
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: found $as_dir$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+fi ;;
+esac
+fi
+ac_ct_CXX=$ac_cv_prog_ac_ct_CXX
+if test -n "$ac_ct_CXX"; then
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CXX" >&5
+printf "%s\n" "$ac_ct_CXX" >&6; }
+else
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+fi
+
+
+  test -n "$ac_ct_CXX" && break
+done
+
+  if test "x$ac_ct_CXX" = x; then
+    CXX="g++"
+  else
+    case $cross_compiling:$ac_tool_warned in
+yes:)
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5
+printf "%s\n" "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;}
+ac_tool_warned=yes ;;
+esac
+    CXX=$ac_ct_CXX
+  fi
+fi
+
+  fi
+fi
+# Provide some information about the compiler.
+printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for C++ compiler version" >&5
+set X $ac_compile
+ac_compiler=$2
+for ac_option in --version -v -V -qversion; do
+  { { ac_try="$ac_compiler $ac_option >&5"
+case "(($ac_try" in
+  *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
+  *) ac_try_echo=$ac_try;;
+esac
+eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\""
+printf "%s\n" "$ac_try_echo"; } >&5
+  (eval "$ac_compiler $ac_option >&5") 2>conftest.err
+  ac_status=$?
+  if test -s conftest.err; then
+    sed '10a\
+... rest of stderr output deleted ...
+         10q' conftest.err >conftest.er1
+    cat conftest.er1 >&5
+  fi
+  rm -f conftest.er1 conftest.err
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }
+done
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler supports GNU C++" >&5
+printf %s "checking whether the compiler supports GNU C++... " >&6; }
+if test ${ac_cv_cxx_compiler_gnu+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main (void)
+{
+#ifndef __GNUC__
+       choke me
+#endif
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"
+then :
+  ac_compiler_gnu=yes
+else case e in #(
+  e) ac_compiler_gnu=no ;;
+esac
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
+ac_cv_cxx_compiler_gnu=$ac_compiler_gnu
+ ;;
+esac
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_cxx_compiler_gnu" >&5
+printf "%s\n" "$ac_cv_cxx_compiler_gnu" >&6; }
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+
+if test $ac_compiler_gnu = yes; then
+  GXX=yes
+else
+  GXX=
+fi
+ac_test_CXXFLAGS=${CXXFLAGS+y}
+ac_save_CXXFLAGS=$CXXFLAGS
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether $CXX accepts -g" >&5
+printf %s "checking whether $CXX accepts -g... " >&6; }
+if test ${ac_cv_prog_cxx_g+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_save_cxx_werror_flag=$ac_cxx_werror_flag
+   ac_cxx_werror_flag=yes
+   ac_cv_prog_cxx_g=no
+   CXXFLAGS="-g"
+   cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main (void)
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"
+then :
+  ac_cv_prog_cxx_g=yes
+else case e in #(
+  e) CXXFLAGS=""
+      cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main (void)
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"
+then :
+
+else case e in #(
+  e) ac_cxx_werror_flag=$ac_save_cxx_werror_flag
+	 CXXFLAGS="-g"
+	 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main (void)
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+if ac_fn_cxx_try_compile "$LINENO"
+then :
+  ac_cv_prog_cxx_g=yes
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext ;;
+esac
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext ;;
+esac
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
+   ac_cxx_werror_flag=$ac_save_cxx_werror_flag ;;
+esac
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_g" >&5
+printf "%s\n" "$ac_cv_prog_cxx_g" >&6; }
+if test $ac_test_CXXFLAGS; then
+  CXXFLAGS=$ac_save_CXXFLAGS
+elif test $ac_cv_prog_cxx_g = yes; then
+  if test "$GXX" = yes; then
+    CXXFLAGS="-g -O2"
+  else
+    CXXFLAGS="-g"
+  fi
+else
+  if test "$GXX" = yes; then
+    CXXFLAGS="-O2"
+  else
+    CXXFLAGS=
+  fi
+fi
+ac_prog_cxx_stdcxx=no
+if test x$ac_prog_cxx_stdcxx = xno
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++11 features" >&5
+printf %s "checking for $CXX option to enable C++11 features... " >&6; }
+if test ${ac_cv_prog_cxx_cxx11+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_cv_prog_cxx_cxx11=no
+ac_save_CXX=$CXX
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$ac_cxx_conftest_cxx11_program
+_ACEOF
+for ac_arg in '' -std=gnu++11 -std=gnu++0x -std=c++11 -std=c++0x -qlanglvl=extended0x -AA
+do
+  CXX="$ac_save_CXX $ac_arg"
+  if ac_fn_cxx_try_compile "$LINENO"
+then :
+  ac_cv_prog_cxx_cxx11=$ac_arg
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam
+  test "x$ac_cv_prog_cxx_cxx11" != "xno" && break
+done
+rm -f conftest.$ac_ext
+CXX=$ac_save_CXX ;;
+esac
+fi
+
+if test "x$ac_cv_prog_cxx_cxx11" = xno
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5
+printf "%s\n" "unsupported" >&6; }
+else case e in #(
+  e) if test "x$ac_cv_prog_cxx_cxx11" = x
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: none needed" >&5
+printf "%s\n" "none needed" >&6; }
+else case e in #(
+  e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_cxx11" >&5
+printf "%s\n" "$ac_cv_prog_cxx_cxx11" >&6; }
+     CXX="$CXX $ac_cv_prog_cxx_cxx11" ;;
+esac
+fi
+  ac_cv_prog_cxx_stdcxx=$ac_cv_prog_cxx_cxx11
+  ac_prog_cxx_stdcxx=cxx11 ;;
+esac
+fi
+fi
+if test x$ac_prog_cxx_stdcxx = xno
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++98 features" >&5
+printf %s "checking for $CXX option to enable C++98 features... " >&6; }
+if test ${ac_cv_prog_cxx_cxx98+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) ac_cv_prog_cxx_cxx98=no
+ac_save_CXX=$CXX
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+$ac_cxx_conftest_cxx98_program
+_ACEOF
+for ac_arg in '' -std=gnu++98 -std=c++98 -qlanglvl=extended -AA
+do
+  CXX="$ac_save_CXX $ac_arg"
+  if ac_fn_cxx_try_compile "$LINENO"
+then :
+  ac_cv_prog_cxx_cxx98=$ac_arg
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam
+  test "x$ac_cv_prog_cxx_cxx98" != "xno" && break
+done
+rm -f conftest.$ac_ext
+CXX=$ac_save_CXX ;;
+esac
+fi
+
+if test "x$ac_cv_prog_cxx_cxx98" = xno
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5
+printf "%s\n" "unsupported" >&6; }
+else case e in #(
+  e) if test "x$ac_cv_prog_cxx_cxx98" = x
+then :
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: none needed" >&5
+printf "%s\n" "none needed" >&6; }
+else case e in #(
+  e) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_cxx98" >&5
+printf "%s\n" "$ac_cv_prog_cxx_cxx98" >&6; }
+     CXX="$CXX $ac_cv_prog_cxx_cxx98" ;;
+esac
+fi
+  ac_cv_prog_cxx_stdcxx=$ac_cv_prog_cxx_cxx98
+  ac_prog_cxx_stdcxx=cxx98 ;;
+esac
+fi
+fi
+
+ac_ext=cpp
+ac_cpp='$CXXCPP $CPPFLAGS'
+ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_cxx_compiler_gnu
+
+
+## Rely on Sys.info() from R for cross-compilation cases
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking what system we are on" >&5
+printf %s "checking what system we are on... " >&6; }
+SYSKERNEL=$("${R_HOME}/bin/Rscript" --vanilla -e 'cat(Sys.info()["sysname"])')
+SYSMACHINE=$("${R_HOME}/bin/Rscript" --vanilla -e 'cat(Sys.info()["machine"])')
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: running ${SYSKERNEL} on ${SYSMACHINE}" >&5
+printf "%s\n" "running ${SYSKERNEL} on ${SYSMACHINE}" >&6; }
+
+
+## Default to not assuming OpenMP, but then test a variety of setups
+can_use_openmp="no"
+
+## Ensure TMPDIR is set.
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether we have a suitable tempdir" >&5
+printf %s "checking whether we have a suitable tempdir... " >&6; }
+TMPDIR=$("${R_HOME}/bin/R" --vanilla --slave -e "cat(dirname(tempdir()))")
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: ${TMPDIR}" >&5
+printf "%s\n" "${TMPDIR}" >&6; }
+
+## Create private directory in TMPDIR.
+BUILDDIR="${TMPDIR}/rcppeigen-$$-$RANDOM"
+mkdir -p "${BUILDDIR}"
+
+owd=$(pwd)
+cd "${BUILDDIR}"
+
+cat <<EOF > test-omp.cpp
+#include <omp.h>
+int main() {
+  return omp_get_num_threads();
+}
+EOF
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether on Linux" >&5
+printf %s "checking whether on Linux... " >&6; }
+if test x"${SYSKERNEL}" = x"Linux"; then
+   { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
+
+   ## Check if R is configured to compile OpenMP programs out-of-the-box.
+   if test x"${can_use_openmp}" = x"no"; then
+       { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether R CMD SHLIB can already compile OpenMP programs" >&5
+printf %s "checking whether R CMD SHLIB can already compile OpenMP programs... " >&6; }
+       "${R_HOME}/bin/R" CMD SHLIB test-omp.cpp >/dev/null 2>&1
+       if test x"$?" = x"0"; then
+           { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
+           can_use_openmp="yes"
+       else
+           { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+           can_use_openmp="no"
+       fi
+   fi
+
+   ## If needed, check if R is configured to compile OpenMP programs using -fopenmp
+   if test x"${can_use_openmp}" = x"no"; then
+       { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether R CMD SHLIB can compile OpenMP via -fopenmp" >&5
+printf %s "checking whether R CMD SHLIB can compile OpenMP via -fopenmp... " >&6; }
+       PKG_CXXFLAGS="${PKG_CXXFLAGS} -fopenmp" PKG_LIBS="${PKG_LIBS} -fopenmp" "${R_HOME}/bin/R" CMD SHLIB -fopenmp test-omp.cpp >/dev/null 2>&1
+       if test x"$?" = x"0"; then
+           { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
+           can_use_openmp="yes"
+           # keep any entries user may have set
+           PKG_CXXFLAGS="${PKG_CXXFLAGS} -fopenmp"
+           PKG_LIBS="${PKG_LIBS} -fopenmp"
+       else
+           { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+           can_use_openmp="no"
+       fi
+   fi
+else
+   { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+fi # if Linux
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether on macOS" >&5
+printf %s "checking whether on macOS... " >&6; }
+if test x"${SYSKERNEL}" = x"Darwin"; then
+   { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
+
+   ## Check if R is configured to compile OpenMP programs using -Xclang -fopenmp
+   if test x"${can_use_openmp}" = x"no"; then
+       { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether R CMD SHLIB can compile OpenMP programs using '-Xclang -fopenmp'" >&5
+printf %s "checking whether R CMD SHLIB can compile OpenMP programs using '-Xclang -fopenmp'... " >&6; }
+       PKG_CXXFLAGS="${PKG_CXXFLAGS} -Xclang -fopenmp" PKG_LIBS="${PKG_LIBS} -lomp" "${R_HOME}/bin/R" CMD SHLIB test-omp.cpp >/dev/null 2>&1
+       if test x"$?" = x"0"; then
+           { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
+           can_use_openmp="yes"
+           # keep any entries user may have set
+           PKG_CXXFLAGS="${PKG_CXXFLAGS} -Xclang -fopenmp"
+           PKG_LIBS="${PKG_LIBS} -lomp"
+       else
+           { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+           can_use_openmp="no"
+       fi
+   fi
+
+   ## Check if R is configured to compile OpenMP programs using -fopenmp (cf data.table #6409)
+   if test x"${can_use_openmp}" = x"no"; then
+       { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether R CMD SHLIB can compile OpenMP programs using '-fopenmp'" >&5
+printf %s "checking whether R CMD SHLIB can compile OpenMP programs using '-fopenmp'... " >&6; }
+       PKG_CXXFLAGS="${PKG_CXXFLAGS} -fopenmp" PKG_LIBS="${PKG_LIBS} -fopenmp" "${R_HOME}/bin/R" CMD SHLIB test-omp.cpp >/dev/null 2>&1
+       if test x"$?" = x"0"; then
+           { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
+           can_use_openmp="yes"
+           # keep any entries user may have set
+           PKG_CXXFLAGS="${PKG_CXXFLAGS} -fopenmp"
+           PKG_LIBS="${PKG_LIBS} -fopenmp"
+       else
+           { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+           can_use_openmp="no"
+       fi
+   fi
+
+   if test x"${can_use_openmp}" = x"no"; then
+       if test x"${SYSMACHINE}" = x"arm64"; then
+           HOMEBREW_PREFIX=/opt/homebrew
+       else
+           HOMEBREW_PREFIX=/usr/local
+       fi
+       if test -e "${HOMEBREW_PREFIX}/opt/libomp"; then
+           { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether R CMD SHLIB can use libomp at ${HOMEBREW_PREFIX}/opt/libomp" >&5
+printf %s "checking whether R CMD SHLIB can use libomp at ${HOMEBREW_PREFIX}/opt/libomp... " >&6; }
+           LIBOMP_INCLUDE="-I${HOMEBREW_PREFIX}/opt/libomp/include -Xclang -fopenmp"
+           LIBOMP_LINK="-L${HOMEBREW_PREFIX}/opt/libomp/lib -lomp"
+           PKG_CXXFLAGS="${PKG_CXXFLAGS} ${LIBOMP_INCLUDE}" PKG_LIBS="${PKG_LIBS} ${LIBOMP_LINK}" "${R_HOME}/bin/R" CMD SHLIB test-omp.cpp >/dev/null 2>&1
+           if test x"$?" = x"0"; then
+               { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+printf "%s\n" "yes" >&6; }
+               can_use_openmp="yes"
+               # keep any entries user may have set
+               PKG_CXXFLAGS="${PKG_CXXFLAGS} ${LIBOMP_INCLUDE}"
+               PKG_LIBS="${PKG_LIBS} ${LIBOMP_LINK}"
+           else
+               { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+               can_use_openmp="no"
+           fi
+       fi
+   fi
+else
+   { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: no" >&5
+printf "%s\n" "no" >&6; }
+fi # if macOS
+
+## Go back home.
+cd "${owd}"
+rm -rf "${BUILDDIR}"
+
+# Overall summary
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for OpenMP" >&5
+printf %s "checking for OpenMP... " >&6; }
+if test x"${can_use_openmp}" = x"yes"; then
+   { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: found and suitable" >&5
+printf "%s\n" "found and suitable" >&6; }
+   openmp_flag='$(SHLIB_OPENMP_CXXFLAGS)'
+else
+   { printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: missing so no OpenMP acceleration" >&5
+printf "%s\n" "missing so no OpenMP acceleration" >&6; }
+   openmp_flag=""
+fi
+
+## Now use all these
+OPENMP_FLAG="${openmp_flag}"
+
+PKG_CXXFLAGS="${PKG_CXXFLAGS}"
+
+PKG_LIBS="${PKG_LIBS}"
+
+ac_config_files="$ac_config_files src/Makevars"
+
+cat >confcache <<\_ACEOF
+# This file is a shell script that caches the results of configure
+# tests run on this system so they can be shared between configure
+# scripts and configure runs, see configure's option --config-cache.
+# It is not useful on other systems.  If it contains results you don't
+# want to keep, you may remove or edit it.
+#
+# config.status only pays attention to the cache file if you give it
+# the --recheck option to rerun configure.
+#
+# 'ac_cv_env_foo' variables (set or unset) will be overridden when
+# loading this file, other *unset* 'ac_cv_foo' will be assigned the
+# following values.
+
+_ACEOF
+
+# The following way of writing the cache mishandles newlines in values,
+# but we know of no workaround that is simple, portable, and efficient.
+# So, we kill variables containing newlines.
+# Ultrix sh set writes to stderr and can't be redirected directly,
+# and sets the high bit in the cache file unless we assign to the vars.
+(
+  for ac_var in `(set) 2>&1 | sed -n 's/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'`; do
+    eval ac_val=\$$ac_var
+    case $ac_val in #(
+    *${as_nl}*)
+      case $ac_var in #(
+      *_cv_*) { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5
+printf "%s\n" "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
+      esac
+      case $ac_var in #(
+      _ | IFS | as_nl) ;; #(
+      BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #(
+      *) { eval $ac_var=; unset $ac_var;} ;;
+      esac ;;
+    esac
+  done
+
+  (set) 2>&1 |
+    case $as_nl`(ac_space=' '; set) 2>&1` in #(
+    *${as_nl}ac_space=\ *)
+      # 'set' does not quote correctly, so add quotes: double-quote
+      # substitution turns \\\\ into \\, and sed turns \\ into \.
+      sed -n \
+	"s/'/'\\\\''/g;
+	  s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p"
+      ;; #(
+    *)
+      # 'set' quotes correctly as required by POSIX, so do not add quotes.
+      sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p"
+      ;;
+    esac |
+    sort
+) |
+  sed '
+     /^ac_cv_env_/b end
+     t clear
+     :clear
+     s/^\([^=]*\)=\(.*[{}].*\)$/test ${\1+y} || &/
+     t end
+     s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/
+     :end' >>confcache
+if diff "$cache_file" confcache >/dev/null 2>&1; then :; else
+  if test -w "$cache_file"; then
+    if test "x$cache_file" != "x/dev/null"; then
+      { printf "%s\n" "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5
+printf "%s\n" "$as_me: updating cache $cache_file" >&6;}
+      if test ! -f "$cache_file" || test -h "$cache_file"; then
+	cat confcache >"$cache_file"
+      else
+        case $cache_file in #(
+        */* | ?:*)
+	  mv -f confcache "$cache_file"$$ &&
+	  mv -f "$cache_file"$$ "$cache_file" ;; #(
+        *)
+	  mv -f confcache "$cache_file" ;;
+	esac
+      fi
+    fi
+  else
+    { printf "%s\n" "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5
+printf "%s\n" "$as_me: not updating unwritable cache $cache_file" >&6;}
+  fi
+fi
+rm -f confcache
+
+test "x$prefix" = xNONE && prefix=$ac_default_prefix
+# Let make expand exec_prefix.
+test "x$exec_prefix" = xNONE && exec_prefix='${prefix}'
+
+# Transform confdefs.h into DEFS.
+# Protect against shell expansion while executing Makefile rules.
+# Protect against Makefile macro expansion.
+#
+# If the first sed substitution is executed (which looks for macros that
+# take arguments), then branch to the quote section.  Otherwise,
+# look for a macro that doesn't take arguments.
+ac_script='
+:mline
+/\\$/{
+ N
+ s,\\\n,,
+ b mline
+}
+t clear
+:clear
+s/^[	 ]*#[	 ]*define[	 ][	 ]*\([^	 (][^	 (]*([^)]*)\)[	 ]*\(.*\)/-D\1=\2/g
+t quote
+s/^[	 ]*#[	 ]*define[	 ][	 ]*\([^	 ][^	 ]*\)[	 ]*\(.*\)/-D\1=\2/g
+t quote
+b any
+:quote
+s/[][	 `~#$^&*(){}\\|;'\''"<>?]/\\&/g
+s/\$/$$/g
+H
+:any
+${
+	g
+	s/^\n//
+	s/\n/ /g
+	p
+}
+'
+DEFS=`sed -n "$ac_script" confdefs.h`
+
+
+ac_libobjs=
+ac_ltlibobjs=
+U=
+for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue
+  # 1. Remove the extension, and $U if already installed.
+  ac_script='s/\$U\././;s/\.o$//;s/\.obj$//'
+  ac_i=`printf "%s\n" "$ac_i" | sed "$ac_script"`
+  # 2. Prepend LIBOBJDIR.  When used with automake>=1.10 LIBOBJDIR
+  #    will be set to the directory where LIBOBJS objects are built.
+  as_fn_append ac_libobjs " \${LIBOBJDIR}$ac_i\$U.$ac_objext"
+  as_fn_append ac_ltlibobjs " \${LIBOBJDIR}$ac_i"'$U.lo'
+done
+LIBOBJS=$ac_libobjs
+
+LTLIBOBJS=$ac_ltlibobjs
+
+
+
+: "${CONFIG_STATUS=./config.status}"
+ac_write_fail=0
+ac_clean_files_save=$ac_clean_files
+ac_clean_files="$ac_clean_files $CONFIG_STATUS"
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5
+printf "%s\n" "$as_me: creating $CONFIG_STATUS" >&6;}
+as_write_fail=0
+cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1
+#! $SHELL
+# Generated by $as_me.
+# Run this file to recreate the current configuration.
+# Compiler output produced by configure, useful for debugging
+# configure, is in config.log if it exists.
+
+debug=false
+ac_cs_recheck=false
+ac_cs_silent=false
+
+SHELL=\${CONFIG_SHELL-$SHELL}
+export SHELL
+_ASEOF
+cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1
+## -------------------- ##
+## M4sh Initialization. ##
+## -------------------- ##
+
+# Be more Bourne compatible
+DUALCASE=1; export DUALCASE # for MKS sh
+if test ${ZSH_VERSION+y} && (emulate sh) >/dev/null 2>&1
+then :
+  emulate sh
+  NULLCMD=:
+  # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which
+  # is contrary to our usage.  Disable this feature.
+  alias -g '${1+"$@"}'='"$@"'
+  setopt NO_GLOB_SUBST
+else case e in #(
+  e) case `(set -o) 2>/dev/null` in #(
+  *posix*) :
+    set -o posix ;; #(
+  *) :
+     ;;
+esac ;;
+esac
+fi
+
+
+
+# Reset variables that may have inherited troublesome values from
+# the environment.
+
+# IFS needs to be set, to space, tab, and newline, in precisely that order.
+# (If _AS_PATH_WALK were called with IFS unset, it would have the
+# side effect of setting IFS to empty, thus disabling word splitting.)
+# Quoting is to prevent editors from complaining about space-tab.
+as_nl='
+'
+export as_nl
+IFS=" ""	$as_nl"
+
+PS1='$ '
+PS2='> '
+PS4='+ '
+
+# Ensure predictable behavior from utilities with locale-dependent output.
+LC_ALL=C
+export LC_ALL
+LANGUAGE=C
+export LANGUAGE
+
+# We cannot yet rely on "unset" to work, but we need these variables
+# to be unset--not just set to an empty or harmless value--now, to
+# avoid bugs in old shells (e.g. pre-3.0 UWIN ksh).  This construct
+# also avoids known problems related to "unset" and subshell syntax
+# in other old shells (e.g. bash 2.01 and pdksh 5.2.14).
+for as_var in BASH_ENV ENV MAIL MAILPATH CDPATH
+do eval test \${$as_var+y} \
+  && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || :
+done
+
+# Ensure that fds 0, 1, and 2 are open.
+if (exec 3>&0) 2>/dev/null; then :; else exec 0</dev/null; fi
+if (exec 3>&1) 2>/dev/null; then :; else exec 1>/dev/null; fi
+if (exec 3>&2)            ; then :; else exec 2>/dev/null; fi
+
+# The user is always right.
+if ${PATH_SEPARATOR+false} :; then
+  PATH_SEPARATOR=:
+  (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && {
+    (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 ||
+      PATH_SEPARATOR=';'
+  }
+fi
+
+
+# Find who we are.  Look in the path if we contain no directory separator.
+as_myself=
+case $0 in #((
+  *[\\/]* ) as_myself=$0 ;;
+  *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  case $as_dir in #(((
+    '') as_dir=./ ;;
+    */) ;;
+    *) as_dir=$as_dir/ ;;
+  esac
+    test -r "$as_dir$0" && as_myself=$as_dir$0 && break
+  done
+IFS=$as_save_IFS
+
+     ;;
+esac
+# We did not find ourselves, most probably we were run as 'sh COMMAND'
+# in which case we are not to be found in the path.
+if test "x$as_myself" = x; then
+  as_myself=$0
+fi
+if test ! -f "$as_myself"; then
+  printf "%s\n" "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2
+  exit 1
+fi
+
+
+
+# as_fn_error STATUS ERROR [LINENO LOG_FD]
+# ----------------------------------------
+# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are
+# provided, also output the error to LOG_FD, referencing LINENO. Then exit the
+# script with STATUS, using 1 if that was 0.
+as_fn_error ()
+{
+  as_status=$1; test $as_status -eq 0 && as_status=1
+  if test "$4"; then
+    as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack
+    printf "%s\n" "$as_me:${as_lineno-$LINENO}: error: $2" >&$4
+  fi
+  printf "%s\n" "$as_me: error: $2" >&2
+  as_fn_exit $as_status
+} # as_fn_error
+
+
+# as_fn_set_status STATUS
+# -----------------------
+# Set $? to STATUS, without forking.
+as_fn_set_status ()
+{
+  return $1
+} # as_fn_set_status
+
+# as_fn_exit STATUS
+# -----------------
+# Exit the shell with STATUS, even in a "trap 0" or "set -e" context.
+as_fn_exit ()
+{
+  set +e
+  as_fn_set_status $1
+  exit $1
+} # as_fn_exit
+
+# as_fn_unset VAR
+# ---------------
+# Portably unset VAR.
+as_fn_unset ()
+{
+  { eval $1=; unset $1;}
+}
+as_unset=as_fn_unset
+
+# as_fn_append VAR VALUE
+# ----------------------
+# Append the text in VALUE to the end of the definition contained in VAR. Take
+# advantage of any shell optimizations that allow amortized linear growth over
+# repeated appends, instead of the typical quadratic growth present in naive
+# implementations.
+if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null
+then :
+  eval 'as_fn_append ()
+  {
+    eval $1+=\$2
+  }'
+else case e in #(
+  e) as_fn_append ()
+  {
+    eval $1=\$$1\$2
+  } ;;
+esac
+fi # as_fn_append
+
+# as_fn_arith ARG...
+# ------------------
+# Perform arithmetic evaluation on the ARGs, and store the result in the
+# global $as_val. Take advantage of shells that can avoid forks. The arguments
+# must be portable across $(()) and expr.
+if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null
+then :
+  eval 'as_fn_arith ()
+  {
+    as_val=$(( $* ))
+  }'
+else case e in #(
+  e) as_fn_arith ()
+  {
+    as_val=`expr "$@" || test $? -eq 1`
+  } ;;
+esac
+fi # as_fn_arith
+
+
+if expr a : '\(a\)' >/dev/null 2>&1 &&
+   test "X`expr 00001 : '.*\(...\)'`" = X001; then
+  as_expr=expr
+else
+  as_expr=false
+fi
+
+if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then
+  as_basename=basename
+else
+  as_basename=false
+fi
+
+if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then
+  as_dirname=dirname
+else
+  as_dirname=false
+fi
+
+as_me=`$as_basename -- "$0" ||
+$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \
+	 X"$0" : 'X\(//\)$' \| \
+	 X"$0" : 'X\(/\)' \| . 2>/dev/null ||
+printf "%s\n" X/"$0" |
+    sed '/^.*\/\([^/][^/]*\)\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\/\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\/\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+
+# Avoid depending upon Character Ranges.
+as_cr_letters='abcdefghijklmnopqrstuvwxyz'
+as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+as_cr_Letters=$as_cr_letters$as_cr_LETTERS
+as_cr_digits='0123456789'
+as_cr_alnum=$as_cr_Letters$as_cr_digits
+
+
+# Determine whether it's possible to make 'echo' print without a newline.
+# These variables are no longer used directly by Autoconf, but are AC_SUBSTed
+# for compatibility with existing Makefiles.
+ECHO_C= ECHO_N= ECHO_T=
+case `echo -n x` in #(((((
+-n*)
+  case `echo 'xy\c'` in
+  *c*) ECHO_T='	';;	# ECHO_T is single tab character.
+  xy)  ECHO_C='\c';;
+  *)   echo `echo ksh88 bug on AIX 6.1` > /dev/null
+       ECHO_T='	';;
+  esac;;
+*)
+  ECHO_N='-n';;
+esac
+
+# For backward compatibility with old third-party macros, we provide
+# the shell variables $as_echo and $as_echo_n.  New code should use
+# AS_ECHO(["message"]) and AS_ECHO_N(["message"]), respectively.
+as_echo='printf %s\n'
+as_echo_n='printf %s'
+
+rm -f conf$$ conf$$.exe conf$$.file
+if test -d conf$$.dir; then
+  rm -f conf$$.dir/conf$$.file
+else
+  rm -f conf$$.dir
+  mkdir conf$$.dir 2>/dev/null
+fi
+if (echo >conf$$.file) 2>/dev/null; then
+  if ln -s conf$$.file conf$$ 2>/dev/null; then
+    as_ln_s='ln -s'
+    # ... but there are two gotchas:
+    # 1) On MSYS, both 'ln -s file dir' and 'ln file dir' fail.
+    # 2) DJGPP < 2.04 has no symlinks; 'ln -s' creates a wrapper executable.
+    # In both cases, we have to default to 'cp -pR'.
+    ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe ||
+      as_ln_s='cp -pR'
+  elif ln conf$$.file conf$$ 2>/dev/null; then
+    as_ln_s=ln
+  else
+    as_ln_s='cp -pR'
+  fi
+else
+  as_ln_s='cp -pR'
+fi
+rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file
+rmdir conf$$.dir 2>/dev/null
+
+
+# as_fn_mkdir_p
+# -------------
+# Create "$as_dir" as a directory, including parents if necessary.
+as_fn_mkdir_p ()
+{
+
+  case $as_dir in #(
+  -*) as_dir=./$as_dir;;
+  esac
+  test -d "$as_dir" || eval $as_mkdir_p || {
+    as_dirs=
+    while :; do
+      case $as_dir in #(
+      *\'*) as_qdir=`printf "%s\n" "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'(
+      *) as_qdir=$as_dir;;
+      esac
+      as_dirs="'$as_qdir' $as_dirs"
+      as_dir=`$as_dirname -- "$as_dir" ||
+$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$as_dir" : 'X\(//\)[^/]' \| \
+	 X"$as_dir" : 'X\(//\)$' \| \
+	 X"$as_dir" : 'X\(/\)' \| . 2>/dev/null ||
+printf "%s\n" X"$as_dir" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+      test -d "$as_dir" && break
+    done
+    test -z "$as_dirs" || eval "mkdir $as_dirs"
+  } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir"
+
+
+} # as_fn_mkdir_p
+if mkdir -p . 2>/dev/null; then
+  as_mkdir_p='mkdir -p "$as_dir"'
+else
+  test -d ./-p && rmdir ./-p
+  as_mkdir_p=false
+fi
+
+
+# as_fn_executable_p FILE
+# -----------------------
+# Test if FILE is an executable regular file.
+as_fn_executable_p ()
+{
+  test -f "$1" && test -x "$1"
+} # as_fn_executable_p
+as_test_x='test -x'
+as_executable_p=as_fn_executable_p
+
+# Sed expression to map a string onto a valid CPP name.
+as_sed_cpp="y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g"
+as_tr_cpp="eval sed '$as_sed_cpp'" # deprecated
+
+# Sed expression to map a string onto a valid variable name.
+as_sed_sh="y%*+%pp%;s%[^_$as_cr_alnum]%_%g"
+as_tr_sh="eval sed '$as_sed_sh'" # deprecated
+
+
+exec 6>&1
+## ----------------------------------- ##
+## Main body of $CONFIG_STATUS script. ##
+## ----------------------------------- ##
+_ASEOF
+test $as_write_fail = 0 && chmod +x $CONFIG_STATUS || ac_write_fail=1
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+# Save the log message, to keep $0 and so on meaningful, and to
+# report actual input values of CONFIG_FILES etc. instead of their
+# values after options handling.
+ac_log="
+This file was extended by RcppEigen $as_me 0.3.4.0.2-1, which was
+generated by GNU Autoconf 2.72.  Invocation command line was
+
+  CONFIG_FILES    = $CONFIG_FILES
+  CONFIG_HEADERS  = $CONFIG_HEADERS
+  CONFIG_LINKS    = $CONFIG_LINKS
+  CONFIG_COMMANDS = $CONFIG_COMMANDS
+  $ $0 $@
+
+on `(hostname || uname -n) 2>/dev/null | sed 1q`
+"
+
+_ACEOF
+
+case $ac_config_files in *"
+"*) set x $ac_config_files; shift; ac_config_files=$*;;
+esac
+
+
+
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+# Files that config.status was made for.
+config_files="$ac_config_files"
+
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+ac_cs_usage="\
+'$as_me' instantiates files and other configuration actions
+from templates according to the current configuration.  Unless the files
+and actions are specified as TAGs, all are instantiated by default.
+
+Usage: $0 [OPTION]... [TAG]...
+
+  -h, --help       print this help, then exit
+  -V, --version    print version number and configuration settings, then exit
+      --config     print configuration, then exit
+  -q, --quiet, --silent
+                   do not print progress messages
+  -d, --debug      don't remove temporary files
+      --recheck    update $as_me by reconfiguring in the same conditions
+      --file=FILE[:TEMPLATE]
+                   instantiate the configuration file FILE
+
+Configuration files:
+$config_files
+
+Report bugs to <edd@debian.org>."
+
+_ACEOF
+ac_cs_config=`printf "%s\n" "$ac_configure_args" | sed "$ac_safe_unquote"`
+ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\''/g"`
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+ac_cs_config='$ac_cs_config_escaped'
+ac_cs_version="\\
+RcppEigen config.status 0.3.4.0.2-1
+configured by $0, generated by GNU Autoconf 2.72,
+  with options \\"\$ac_cs_config\\"
+
+Copyright (C) 2023 Free Software Foundation, Inc.
+This config.status script is free software; the Free Software Foundation
+gives unlimited permission to copy, distribute and modify it."
+
+ac_pwd='$ac_pwd'
+srcdir='$srcdir'
+test -n "\$AWK" || AWK=awk
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+# The default lists apply if the user does not specify any file.
+ac_need_defaults=:
+while test $# != 0
+do
+  case $1 in
+  --*=?*)
+    ac_option=`expr "X$1" : 'X\([^=]*\)='`
+    ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'`
+    ac_shift=:
+    ;;
+  --*=)
+    ac_option=`expr "X$1" : 'X\([^=]*\)='`
+    ac_optarg=
+    ac_shift=:
+    ;;
+  *)
+    ac_option=$1
+    ac_optarg=$2
+    ac_shift=shift
+    ;;
+  esac
+
+  case $ac_option in
+  # Handling of the options.
+  -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r)
+    ac_cs_recheck=: ;;
+  --version | --versio | --versi | --vers | --ver | --ve | --v | -V )
+    printf "%s\n" "$ac_cs_version"; exit ;;
+  --config | --confi | --conf | --con | --co | --c )
+    printf "%s\n" "$ac_cs_config"; exit ;;
+  --debug | --debu | --deb | --de | --d | -d )
+    debug=: ;;
+  --file | --fil | --fi | --f )
+    $ac_shift
+    case $ac_optarg in
+    *\'*) ac_optarg=`printf "%s\n" "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;;
+    '') as_fn_error $? "missing file argument" ;;
+    esac
+    as_fn_append CONFIG_FILES " '$ac_optarg'"
+    ac_need_defaults=false;;
+  --he | --h |  --help | --hel | -h )
+    printf "%s\n" "$ac_cs_usage"; exit ;;
+  -q | -quiet | --quiet | --quie | --qui | --qu | --q \
+  | -silent | --silent | --silen | --sile | --sil | --si | --s)
+    ac_cs_silent=: ;;
+
+  # This is an error.
+  -*) as_fn_error $? "unrecognized option: '$1'
+Try '$0 --help' for more information." ;;
+
+  *) as_fn_append ac_config_targets " $1"
+     ac_need_defaults=false ;;
+
+  esac
+  shift
+done
+
+ac_configure_extra_args=
+
+if $ac_cs_silent; then
+  exec 6>/dev/null
+  ac_configure_extra_args="$ac_configure_extra_args --silent"
+fi
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+if \$ac_cs_recheck; then
+  set X $SHELL '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion
+  shift
+  \printf "%s\n" "running CONFIG_SHELL=$SHELL \$*" >&6
+  CONFIG_SHELL='$SHELL'
+  export CONFIG_SHELL
+  exec "\$@"
+fi
+
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+exec 5>>config.log
+{
+  echo
+  sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX
+## Running $as_me. ##
+_ASBOX
+  printf "%s\n" "$ac_log"
+} >&5
+
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+
+# Handling of arguments.
+for ac_config_target in $ac_config_targets
+do
+  case $ac_config_target in
+    "src/Makevars") CONFIG_FILES="$CONFIG_FILES src/Makevars" ;;
+
+  *) as_fn_error $? "invalid argument: '$ac_config_target'" "$LINENO" 5;;
+  esac
+done
+
+
+# If the user did not use the arguments to specify the items to instantiate,
+# then the envvar interface is used.  Set only those that are not.
+# We use the long form for the default assignment because of an extremely
+# bizarre bug on SunOS 4.1.3.
+if $ac_need_defaults; then
+  test ${CONFIG_FILES+y} || CONFIG_FILES=$config_files
+fi
+
+# Have a temporary directory for convenience.  Make it in the build tree
+# simply because there is no reason against having it here, and in addition,
+# creating and moving files from /tmp can sometimes cause problems.
+# Hook for its removal unless debugging.
+# Note that there is a small window in which the directory will not be cleaned:
+# after its creation but before its name has been assigned to '$tmp'.
+$debug ||
+{
+  tmp= ac_tmp=
+  trap 'exit_status=$?
+  : "${ac_tmp:=$tmp}"
+  { test ! -d "$ac_tmp" || rm -fr "$ac_tmp"; } && exit $exit_status
+' 0
+  trap 'as_fn_exit 1' 1 2 13 15
+}
+# Create a (secure) tmp directory for tmp files.
+
+{
+  tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` &&
+  test -d "$tmp"
+}  ||
+{
+  tmp=./conf$$-$RANDOM
+  (umask 077 && mkdir "$tmp")
+} || as_fn_error $? "cannot create a temporary directory in ." "$LINENO" 5
+ac_tmp=$tmp
+
+# Set up the scripts for CONFIG_FILES section.
+# No need to generate them if there are no CONFIG_FILES.
+# This happens for instance with './config.status config.h'.
+if test -n "$CONFIG_FILES"; then
+
+
+ac_cr=`echo X | tr X '\015'`
+# On cygwin, bash can eat \r inside `` if the user requested igncr.
+# But we know of no other shell where ac_cr would be empty at this
+# point, so we can use a bashism as a fallback.
+if test "x$ac_cr" = x; then
+  eval ac_cr=\$\'\\r\'
+fi
+ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' </dev/null 2>/dev/null`
+if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then
+  ac_cs_awk_cr='\\r'
+else
+  ac_cs_awk_cr=$ac_cr
+fi
+
+echo 'BEGIN {' >"$ac_tmp/subs1.awk" &&
+_ACEOF
+
+
+{
+  echo "cat >conf$$subs.awk <<_ACEOF" &&
+  echo "$ac_subst_vars" | sed 's/.*/&!$&$ac_delim/' &&
+  echo "_ACEOF"
+} >conf$$subs.sh ||
+  as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5
+ac_delim_num=`echo "$ac_subst_vars" | grep -c '^'`
+ac_delim='%!_!# '
+for ac_last_try in false false false false false :; do
+  . ./conf$$subs.sh ||
+    as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5
+
+  ac_delim_n=`sed -n "s/.*$ac_delim\$/X/p" conf$$subs.awk | grep -c X`
+  if test $ac_delim_n = $ac_delim_num; then
+    break
+  elif $ac_last_try; then
+    as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5
+  else
+    ac_delim="$ac_delim!$ac_delim _$ac_delim!! "
+  fi
+done
+rm -f conf$$subs.sh
+
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+cat >>"\$ac_tmp/subs1.awk" <<\\_ACAWK &&
+_ACEOF
+sed -n '
+h
+s/^/S["/; s/!.*/"]=/
+p
+g
+s/^[^!]*!//
+:repl
+t repl
+s/'"$ac_delim"'$//
+t delim
+:nl
+h
+s/\(.\{148\}\)..*/\1/
+t more1
+s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/
+p
+n
+b repl
+:more1
+s/["\\]/\\&/g; s/^/"/; s/$/"\\/
+p
+g
+s/.\{148\}//
+t nl
+:delim
+h
+s/\(.\{148\}\)..*/\1/
+t more2
+s/["\\]/\\&/g; s/^/"/; s/$/"/
+p
+b
+:more2
+s/["\\]/\\&/g; s/^/"/; s/$/"\\/
+p
+g
+s/.\{148\}//
+t delim
+' <conf$$subs.awk | sed '
+/^[^""]/{
+  N
+  s/\n//
+}
+' >>$CONFIG_STATUS || ac_write_fail=1
+rm -f conf$$subs.awk
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+_ACAWK
+cat >>"\$ac_tmp/subs1.awk" <<_ACAWK &&
+  for (key in S) S_is_set[key] = 1
+  FS = ""
+
+}
+{
+  line = $ 0
+  nfields = split(line, field, "@")
+  substed = 0
+  len = length(field[1])
+  for (i = 2; i < nfields; i++) {
+    key = field[i]
+    keylen = length(key)
+    if (S_is_set[key]) {
+      value = S[key]
+      line = substr(line, 1, len) "" value "" substr(line, len + keylen + 3)
+      len += length(value) + length(field[++i])
+      substed = 1
+    } else
+      len += 1 + keylen
+  }
+
+  print line
+}
+
+_ACAWK
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+if sed "s/$ac_cr//" < /dev/null > /dev/null 2>&1; then
+  sed "s/$ac_cr\$//; s/$ac_cr/$ac_cs_awk_cr/g"
+else
+  cat
+fi < "$ac_tmp/subs1.awk" > "$ac_tmp/subs.awk" \
+  || as_fn_error $? "could not setup config files machinery" "$LINENO" 5
+_ACEOF
+
+# VPATH may cause trouble with some makes, so we remove sole $(srcdir),
+# ${srcdir} and @srcdir@ entries from VPATH if srcdir is ".", strip leading and
+# trailing colons and then remove the whole line if VPATH becomes empty
+# (actually we leave an empty line to preserve line numbers).
+if test "x$srcdir" = x.; then
+  ac_vpsub='/^[	 ]*VPATH[	 ]*=[	 ]*/{
+h
+s///
+s/^/:/
+s/[	 ]*$/:/
+s/:\$(srcdir):/:/g
+s/:\${srcdir}:/:/g
+s/:@srcdir@:/:/g
+s/^:*//
+s/:*$//
+x
+s/\(=[	 ]*\).*/\1/
+G
+s/\n//
+s/^[^=]*=[	 ]*$//
+}'
+fi
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+fi # test -n "$CONFIG_FILES"
+
+
+eval set X "  :F $CONFIG_FILES      "
+shift
+for ac_tag
+do
+  case $ac_tag in
+  :[FHLC]) ac_mode=$ac_tag; continue;;
+  esac
+  case $ac_mode$ac_tag in
+  :[FHL]*:*);;
+  :L* | :C*:*) as_fn_error $? "invalid tag '$ac_tag'" "$LINENO" 5;;
+  :[FH]-) ac_tag=-:-;;
+  :[FH]*) ac_tag=$ac_tag:$ac_tag.in;;
+  esac
+  ac_save_IFS=$IFS
+  IFS=:
+  set x $ac_tag
+  IFS=$ac_save_IFS
+  shift
+  ac_file=$1
+  shift
+
+  case $ac_mode in
+  :L) ac_source=$1;;
+  :[FH])
+    ac_file_inputs=
+    for ac_f
+    do
+      case $ac_f in
+      -) ac_f="$ac_tmp/stdin";;
+      *) # Look for the file first in the build tree, then in the source tree
+	 # (if the path is not absolute).  The absolute path cannot be DOS-style,
+	 # because $ac_f cannot contain ':'.
+	 test -f "$ac_f" ||
+	   case $ac_f in
+	   [\\/$]*) false;;
+	   *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";;
+	   esac ||
+	   as_fn_error 1 "cannot find input file: '$ac_f'" "$LINENO" 5;;
+      esac
+      case $ac_f in *\'*) ac_f=`printf "%s\n" "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac
+      as_fn_append ac_file_inputs " '$ac_f'"
+    done
+
+    # Let's still pretend it is 'configure' which instantiates (i.e., don't
+    # use $as_me), people would be surprised to read:
+    #    /* config.h.  Generated by config.status.  */
+    configure_input='Generated from '`
+	  printf "%s\n" "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g'
+	`' by configure.'
+    if test x"$ac_file" != x-; then
+      configure_input="$ac_file.  $configure_input"
+      { printf "%s\n" "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5
+printf "%s\n" "$as_me: creating $ac_file" >&6;}
+    fi
+    # Neutralize special characters interpreted by sed in replacement strings.
+    case $configure_input in #(
+    *\&* | *\|* | *\\* )
+       ac_sed_conf_input=`printf "%s\n" "$configure_input" |
+       sed 's/[\\\\&|]/\\\\&/g'`;; #(
+    *) ac_sed_conf_input=$configure_input;;
+    esac
+
+    case $ac_tag in
+    *:-:* | *:-) cat >"$ac_tmp/stdin" \
+      || as_fn_error $? "could not create $ac_file" "$LINENO" 5 ;;
+    esac
+    ;;
+  esac
+
+  ac_dir=`$as_dirname -- "$ac_file" ||
+$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \
+	 X"$ac_file" : 'X\(//\)[^/]' \| \
+	 X"$ac_file" : 'X\(//\)$' \| \
+	 X"$ac_file" : 'X\(/\)' \| . 2>/dev/null ||
+printf "%s\n" X"$ac_file" |
+    sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)[^/].*/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\/\)$/{
+	    s//\1/
+	    q
+	  }
+	  /^X\(\/\).*/{
+	    s//\1/
+	    q
+	  }
+	  s/.*/./; q'`
+  as_dir="$ac_dir"; as_fn_mkdir_p
+  ac_builddir=.
+
+case "$ac_dir" in
+.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;;
+*)
+  ac_dir_suffix=/`printf "%s\n" "$ac_dir" | sed 's|^\.[\\/]||'`
+  # A ".." for each directory in $ac_dir_suffix.
+  ac_top_builddir_sub=`printf "%s\n" "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'`
+  case $ac_top_builddir_sub in
+  "") ac_top_builddir_sub=. ac_top_build_prefix= ;;
+  *)  ac_top_build_prefix=$ac_top_builddir_sub/ ;;
+  esac ;;
+esac
+ac_abs_top_builddir=$ac_pwd
+ac_abs_builddir=$ac_pwd$ac_dir_suffix
+# for backward compatibility:
+ac_top_builddir=$ac_top_build_prefix
+
+case $srcdir in
+  .)  # We are building in place.
+    ac_srcdir=.
+    ac_top_srcdir=$ac_top_builddir_sub
+    ac_abs_top_srcdir=$ac_pwd ;;
+  [\\/]* | ?:[\\/]* )  # Absolute name.
+    ac_srcdir=$srcdir$ac_dir_suffix;
+    ac_top_srcdir=$srcdir
+    ac_abs_top_srcdir=$srcdir ;;
+  *) # Relative name.
+    ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix
+    ac_top_srcdir=$ac_top_build_prefix$srcdir
+    ac_abs_top_srcdir=$ac_pwd/$srcdir ;;
+esac
+ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix
+
+
+  case $ac_mode in
+  :F)
+  #
+  # CONFIG_FILE
+  #
+
+_ACEOF
+
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+# If the template does not know about datarootdir, expand it.
+# FIXME: This hack should be removed a few years after 2.60.
+ac_datarootdir_hack=; ac_datarootdir_seen=
+ac_sed_dataroot='
+/datarootdir/ {
+  p
+  q
+}
+/@datadir@/p
+/@docdir@/p
+/@infodir@/p
+/@localedir@/p
+/@mandir@/p'
+case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in
+*datarootdir*) ac_datarootdir_seen=yes;;
+*@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*)
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5
+printf "%s\n" "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;}
+_ACEOF
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+  ac_datarootdir_hack='
+  s&@datadir@&$datadir&g
+  s&@docdir@&$docdir&g
+  s&@infodir@&$infodir&g
+  s&@localedir@&$localedir&g
+  s&@mandir@&$mandir&g
+  s&\\\${datarootdir}&$datarootdir&g' ;;
+esac
+_ACEOF
+
+# Neutralize VPATH when '$srcdir' = '.'.
+# Shell code in configure.ac might set extrasub.
+# FIXME: do we really want to maintain this feature?
+cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
+ac_sed_extra="$ac_vpsub
+$extrasub
+_ACEOF
+cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
+:t
+/@[a-zA-Z_][a-zA-Z_0-9]*@/!b
+s|@configure_input@|$ac_sed_conf_input|;t t
+s&@top_builddir@&$ac_top_builddir_sub&;t t
+s&@top_build_prefix@&$ac_top_build_prefix&;t t
+s&@srcdir@&$ac_srcdir&;t t
+s&@abs_srcdir@&$ac_abs_srcdir&;t t
+s&@top_srcdir@&$ac_top_srcdir&;t t
+s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t
+s&@builddir@&$ac_builddir&;t t
+s&@abs_builddir@&$ac_abs_builddir&;t t
+s&@abs_top_builddir@&$ac_abs_top_builddir&;t t
+$ac_datarootdir_hack
+"
+eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$ac_tmp/subs.awk" \
+  >$ac_tmp/out || as_fn_error $? "could not create $ac_file" "$LINENO" 5
+
+test -z "$ac_datarootdir_hack$ac_datarootdir_seen" &&
+  { ac_out=`sed -n '/\${datarootdir}/p' "$ac_tmp/out"`; test -n "$ac_out"; } &&
+  { ac_out=`sed -n '/^[	 ]*datarootdir[	 ]*:*=/p' \
+      "$ac_tmp/out"`; test -z "$ac_out"; } &&
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable 'datarootdir'
+which seems to be undefined.  Please make sure it is defined" >&5
+printf "%s\n" "$as_me: WARNING: $ac_file contains a reference to the variable 'datarootdir'
+which seems to be undefined.  Please make sure it is defined" >&2;}
+
+  rm -f "$ac_tmp/stdin"
+  case $ac_file in
+  -) cat "$ac_tmp/out" && rm -f "$ac_tmp/out";;
+  *) rm -f "$ac_file" && mv "$ac_tmp/out" "$ac_file";;
+  esac \
+  || as_fn_error $? "could not create $ac_file" "$LINENO" 5
+ ;;
+
+
+
+  esac
+
+done # for ac_tag
+
+
+as_fn_exit 0
+_ACEOF
+ac_clean_files=$ac_clean_files_save
+
+test $ac_write_fail = 0 ||
+  as_fn_error $? "write failure creating $CONFIG_STATUS" "$LINENO" 5
+
+
+# configure is writing to config.log, and then calls config.status.
+# config.status does its own redirection, appending to config.log.
+# Unfortunately, on DOS this fails, as config.log is still kept open
+# by configure, so config.status won't be able to write to it; its
+# output is simply discarded.  So we exec the FD to /dev/null,
+# effectively closing config.log, so it can be properly (re)opened and
+# appended to by config.status.  When coming back to configure, we
+# need to make the FD available again.
+if test "$no_create" != yes; then
+  ac_cs_success=:
+  ac_config_status_args=
+  test "$silent" = yes &&
+    ac_config_status_args="$ac_config_status_args --quiet"
+  exec 5>/dev/null
+  $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false
+  exec 5>>config.log
+  # Use ||, not &&, to avoid exiting from the if with $? = 1, which
+  # would make configure fail if this is the last instruction.
+  $ac_cs_success || as_fn_exit 1
+fi
+if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then
+  { printf "%s\n" "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5
+printf "%s\n" "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;}
+fi
+
+
diff --git a/configure.ac b/configure.ac
new file mode 100644
index 00000000..af1d0238
--- /dev/null
+++ b/configure.ac
@@ -0,0 +1,182 @@
+
+##  RcppEigen configure.ac
+##
+##  'Rcpp' Integration for the 'Eigen' Templated Linear Algebra Library
+##
+##  Copyright (C) 2025  Dirk Eddelbuettel
+##
+##  Licensed under GPL-2 or later
+
+##  This is borrowed with love from RcppArmadillo, which itself drew a lot
+##  of inspiration from the data.table configuration.  We have it a little
+##  bit more easy here as all actual OpenMP-using code is inside Eigen.
+
+## require at least autoconf 2.69
+AC_PREREQ([2.69])
+
+## Process this file with autoconf to produce a configure script.
+AC_INIT([RcppEigen],[0.3.4.0.2-1],[edd@debian.org])
+
+## Set R_HOME, respecting an environment variable if one is set
+: ${R_HOME=$(R RHOME)}
+if test -z "${R_HOME}"; then
+    AC_MSG_ERROR([Could not determine R_HOME.])
+fi
+
+## Use R to set CXX and CXXFLAGS
+CXX=$(${R_HOME}/bin/R CMD config CXX)
+CXXFLAGS=$("${R_HOME}/bin/R" CMD config CXXFLAGS)
+
+## We are using C++
+AC_LANG(C++)
+AC_REQUIRE_CPP
+AC_PROG_CXX
+
+## Rely on Sys.info() from R for cross-compilation cases
+AC_MSG_CHECKING([what system we are on])
+SYSKERNEL=$("${R_HOME}/bin/Rscript" --vanilla -e 'cat(Sys.info()[["sysname"]])')
+SYSMACHINE=$("${R_HOME}/bin/Rscript" --vanilla -e 'cat(Sys.info()[["machine"]])')
+AC_MSG_RESULT([running ${SYSKERNEL} on ${SYSMACHINE}])
+
+
+## Default to not assuming OpenMP, but then test a variety of setups
+can_use_openmp="no"
+
+## Ensure TMPDIR is set.
+AC_MSG_CHECKING([whether we have a suitable tempdir])
+TMPDIR=$("${R_HOME}/bin/R" --vanilla --slave -e "cat(dirname(tempdir()))")
+AC_MSG_RESULT([${TMPDIR}])
+
+## Create private directory in TMPDIR.
+BUILDDIR="${TMPDIR}/rcppeigen-$$-$RANDOM"
+mkdir -p "${BUILDDIR}"
+
+owd=$(pwd)
+cd "${BUILDDIR}"
+
+cat <<EOF > test-omp.cpp
+#include <omp.h>
+int main() {
+  return omp_get_num_threads();
+}
+EOF
+
+AC_MSG_CHECKING([whether on Linux])
+if test x"${SYSKERNEL}" = x"Linux"; then
+   AC_MSG_RESULT([yes])
+
+   ## Check if R is configured to compile OpenMP programs out-of-the-box.
+   if test x"${can_use_openmp}" = x"no"; then
+       AC_MSG_CHECKING([whether R CMD SHLIB can already compile OpenMP programs])
+       "${R_HOME}/bin/R" CMD SHLIB test-omp.cpp >/dev/null 2>&1
+       if test x"$?" = x"0"; then
+           AC_MSG_RESULT([yes])
+           can_use_openmp="yes"
+       else
+           AC_MSG_RESULT([no])
+           can_use_openmp="no"
+       fi
+   fi
+
+   ## If needed, check if R is configured to compile OpenMP programs using -fopenmp
+   if test x"${can_use_openmp}" = x"no"; then
+       AC_MSG_CHECKING([whether R CMD SHLIB can compile OpenMP via -fopenmp])
+       PKG_CXXFLAGS="${PKG_CXXFLAGS} -fopenmp" PKG_LIBS="${PKG_LIBS} -fopenmp" "${R_HOME}/bin/R" CMD SHLIB -fopenmp test-omp.cpp >/dev/null 2>&1
+       if test x"$?" = x"0"; then
+           AC_MSG_RESULT([yes])
+           can_use_openmp="yes"
+           # keep any entries user may have set
+           PKG_CXXFLAGS="${PKG_CXXFLAGS} -fopenmp"
+           PKG_LIBS="${PKG_LIBS} -fopenmp"
+       else
+           AC_MSG_RESULT([no])
+           can_use_openmp="no"
+       fi
+   fi
+else
+   AC_MSG_RESULT([no])
+fi # if Linux
+
+AC_MSG_CHECKING([whether on macOS])
+if test x"${SYSKERNEL}" = x"Darwin"; then
+   AC_MSG_RESULT([yes])
+
+   ## Check if R is configured to compile OpenMP programs using -Xclang -fopenmp
+   if test x"${can_use_openmp}" = x"no"; then
+       AC_MSG_CHECKING([whether R CMD SHLIB can compile OpenMP programs using '-Xclang -fopenmp'])
+       PKG_CXXFLAGS="${PKG_CXXFLAGS} -Xclang -fopenmp" PKG_LIBS="${PKG_LIBS} -lomp" "${R_HOME}/bin/R" CMD SHLIB test-omp.cpp >/dev/null 2>&1
+       if test x"$?" = x"0"; then
+           AC_MSG_RESULT([yes])
+           can_use_openmp="yes"
+           # keep any entries user may have set
+           PKG_CXXFLAGS="${PKG_CXXFLAGS} -Xclang -fopenmp"
+           PKG_LIBS="${PKG_LIBS} -lomp"
+       else
+           AC_MSG_RESULT([no])
+           can_use_openmp="no"
+       fi
+   fi
+
+   ## Check if R is configured to compile OpenMP programs using -fopenmp (cf data.table #6409)
+   if test x"${can_use_openmp}" = x"no"; then
+       AC_MSG_CHECKING([whether R CMD SHLIB can compile OpenMP programs using '-fopenmp'])
+       PKG_CXXFLAGS="${PKG_CXXFLAGS} -fopenmp" PKG_LIBS="${PKG_LIBS} -fopenmp" "${R_HOME}/bin/R" CMD SHLIB test-omp.cpp >/dev/null 2>&1
+       if test x"$?" = x"0"; then
+           AC_MSG_RESULT([yes])
+           can_use_openmp="yes"
+           # keep any entries user may have set
+           PKG_CXXFLAGS="${PKG_CXXFLAGS} -fopenmp"
+           PKG_LIBS="${PKG_LIBS} -fopenmp"
+       else
+           AC_MSG_RESULT([no])
+           can_use_openmp="no"
+       fi
+   fi
+
+   if test x"${can_use_openmp}" = x"no"; then
+       if test x"${SYSMACHINE}" = x"arm64"; then
+           HOMEBREW_PREFIX=/opt/homebrew
+       else
+           HOMEBREW_PREFIX=/usr/local
+       fi
+       if test -e "${HOMEBREW_PREFIX}/opt/libomp"; then
+           AC_MSG_CHECKING([whether R CMD SHLIB can use libomp at ${HOMEBREW_PREFIX}/opt/libomp])
+           LIBOMP_INCLUDE="-I${HOMEBREW_PREFIX}/opt/libomp/include -Xclang -fopenmp"
+           LIBOMP_LINK="-L${HOMEBREW_PREFIX}/opt/libomp/lib -lomp"
+           PKG_CXXFLAGS="${PKG_CXXFLAGS} ${LIBOMP_INCLUDE}" PKG_LIBS="${PKG_LIBS} ${LIBOMP_LINK}" "${R_HOME}/bin/R" CMD SHLIB test-omp.cpp >/dev/null 2>&1
+           if test x"$?" = x"0"; then
+               AC_MSG_RESULT([yes])
+               can_use_openmp="yes"
+               # keep any entries user may have set
+               PKG_CXXFLAGS="${PKG_CXXFLAGS} ${LIBOMP_INCLUDE}"
+               PKG_LIBS="${PKG_LIBS} ${LIBOMP_LINK}"
+           else
+               AC_MSG_RESULT([no])
+               can_use_openmp="no"
+           fi
+       fi
+   fi
+else
+   AC_MSG_RESULT([no])
+fi # if macOS
+
+## Go back home.
+cd "${owd}"
+rm -rf "${BUILDDIR}"
+
+# Overall summary
+AC_MSG_CHECKING([for OpenMP])
+if test x"${can_use_openmp}" = x"yes"; then
+   AC_MSG_RESULT([found and suitable])
+   openmp_flag='$(SHLIB_OPENMP_CXXFLAGS)'
+else
+   AC_MSG_RESULT([missing so no OpenMP acceleration])
+   openmp_flag=""
+fi
+
+## Now use all these
+AC_SUBST([OPENMP_FLAG], ["${openmp_flag}"])
+AC_SUBST([PKG_CXXFLAGS], ["${PKG_CXXFLAGS}"])
+AC_SUBST([PKG_LIBS], ["${PKG_LIBS}"])
+AC_CONFIG_FILES([src/Makevars])
+AC_OUTPUT
diff --git a/debian/changelog b/debian/changelog
deleted file mode 100644
index 86640a6e..00000000
--- a/debian/changelog
+++ /dev/null
@@ -1,120 +0,0 @@
-r-cran-rcppeigen (0.3.2.7.0-1) unstable; urgency=low
-
-  * New upstream release
-
- -- Dirk Eddelbuettel <edd@debian.org>  Tue, 19 Jan 2016 20:35:19 -0600
-
-r-cran-rcppeigen (0.3.2.5.1-1) unstable; urgency=low
-
-  * New upstream release
-
- -- Dirk Eddelbuettel <edd@debian.org>  Wed, 23 Sep 2015 06:15:08 -0500
-
-r-cran-rcppeigen (0.3.2.5.0-1) unstable; urgency=low
-
-  * New upstream release
-
- -- Dirk Eddelbuettel <edd@debian.org>  Tue, 14 Jul 2015 17:57:58 -0500
-
-r-cran-rcppeigen (0.3.2.4.0-1) unstable; urgency=low
-
-  * New upstream release
-  
-  * debian/control: Add r-cran-pkgkitten to (Build-)Depends
-
- -- Dirk Eddelbuettel <edd@debian.org>  Thu, 26 Feb 2015 14:12:35 -0600
-
-r-cran-rcppeigen (0.3.2.3.0-1) unstable; urgency=low
-
-  * New upstream release
-  
-  * debian/control: Set Build-Depends: to current R version 
-  * debian/control: Set Standards-Version: to current version 
-
- -- Dirk Eddelbuettel <edd@debian.org>  Tue, 23 Dec 2014 14:46:16 -0600
-
-r-cran-rcppeigen (0.3.2.2.0-1) unstable; urgency=low
-
-  * New upstream release
-  
-  * debian/control: Set Build-Depends: to current R version 
-
- -- Dirk Eddelbuettel <edd@debian.org>  Thu, 21 Aug 2014 07:10:31 -0500
-
-r-cran-rcppeigen (0.3.2.1.2-1) unstable; urgency=low
-
-  * New upstream release
-  
-  * debian/control: Set Build-Depends: to current R version 
-
- -- Dirk Eddelbuettel <edd@debian.org>  Tue, 06 May 2014 18:08:48 -0500
-
-r-cran-rcppeigen (0.3.2.1.1-1) unstable; urgency=low
-
-  * New upstream release
-
-  * debian/control: Set Build-Depends: to current R version 
-
- -- Dirk Eddelbuettel <edd@debian.org>  Sat, 08 Mar 2014 08:10:32 -0600
-
-r-cran-rcppeigen (0.3.2.1.0-1) unstable; urgency=low
-
-  * New upstream release
-
- -- Dirk Eddelbuettel <edd@debian.org>  Mon, 03 Mar 2014 19:37:28 -0600
-
-r-cran-rcppeigen (0.3.2.0.3-1) unstable; urgency=low
-
-  * New upstream release
-
- -- Dirk Eddelbuettel <edd@debian.org>  Sat, 01 Mar 2014 19:09:07 -0600
-
-r-cran-rcppeigen (0.3.2.0.2-2) unstable; urgency=low
-
-  * Rebuilt as required for Rcpp 0.11.0
-  * debian/control: Added (Build-)Depends: on r-cran-rcpp (>= 0.11.0-1)
-
- -- Dirk Eddelbuettel <edd@debian.org>  Mon, 03 Feb 2014 20:27:49 -0600
-
-r-cran-rcppeigen (0.3.2.0.2-1) unstable; urgency=low
-
-  * New upstream release
-
- -- Dirk Eddelbuettel <edd@debian.org>  Sun, 26 Jan 2014 11:43:41 -0600
-
-r-cran-rcppeigen (0.3.2.0.1-1) unstable; urgency=low
-
-  * New upstream release
-
- -- Dirk Eddelbuettel <edd@debian.org>  Thu, 19 Dec 2013 19:35:07 -0600
-
-r-cran-rcppeigen (0.3.2.0-1) unstable; urgency=low
-
-  * New upstream release
-
- -- Dirk Eddelbuettel <edd@debian.org>  Wed, 13 Nov 2013 15:06:56 -0600
-
-r-cran-rcppeigen (0.3.1.2.3-2) unstable; urgency=low
-
-  * debian/control: Correcting Depends: on r-cran-rcppeigen
-
- -- Dirk Eddelbuettel <edd@debian.org>  Tue, 29 Oct 2013 16:43:18 -0500
-
-r-cran-rcppeigen (0.3.1.2.3-1) unstable; urgency=low
-
-  * New upstream release
-
-  * debian/control: Set Build-Depends: to current R version
-  * debian/control: Set Standards-Version: to current version 
-  
-  * debian/control: Update Build-Depends: to r-cran-matrix (>= 1.1-0)
-
- -- Dirk Eddelbuettel <edd@debian.org>  Tue, 29 Oct 2013 06:30:00 -0500
-
-r-cran-rcppeigen (0.3.1.2.1-1) unstable; urgency=low
-
-  * Initial Debian release 				(Closes: #722652)
-  
- -- Dirk Eddelbuettel <edd@debian.org>  Thu, 12 Sep 2013 20:59:34 -0500
-
-
diff --git a/debian/compat b/debian/compat
deleted file mode 100644
index 7f8f011e..00000000
--- a/debian/compat
+++ /dev/null
@@ -1 +0,0 @@
-7
diff --git a/debian/control b/debian/control
deleted file mode 100644
index 53d0d062..00000000
--- a/debian/control
+++ /dev/null
@@ -1,22 +0,0 @@
-Source: r-cran-rcppeigen
-Section: gnu-r
-Priority: optional
-Maintainer: Dirk Eddelbuettel <edd@debian.org>
-Build-Depends: debhelper (>= 7.0.0), r-base-dev (>= 3.2.3), cdbs, r-cran-rcpp (>= 0.11.0), r-cran-matrix (>= 1.1-0), r-cran-pkgkitten
-Standards-Version: 3.9.6
-
-Package: r-cran-rcppeigen
-Architecture: any
-Depends: ${shlibs:Depends}, ${misc:Depends}, ${R:Depends}, r-cran-rcpp (>= 0.11.0-1), r-cran-matrix (>= 1.1-0), r-cran-pkgkitten
-Description: GNU R package for Eigen templated linear algebra
- Eigen is a C++ template library for linear algebra: matrices, vectors,
- numerical solvers and related algorithms.  It supports dense
- and sparse matrices on integer, floating point and complex
- numbers, decompositions of such matrices, and solutions of
- linear systems. Its performance on many algorithms is
- comparable with some of the best implementations based on
- Lapack and level-3 BLAS.
- .
- The RcppEigen package includes the header files from the Eigen C++
- template library. Thus users do not need to install Eigen itself in
- order to use RcppEigen.
diff --git a/debian/copyright b/debian/copyright
deleted file mode 100644
index 70f072d6..00000000
--- a/debian/copyright
+++ /dev/null
@@ -1,581 +0,0 @@
-This is the Debian GNU/Linux r-cran-rcppeigen package of RcppEigen.
-The RcppEigen package provides GNU R integration of the Eigen
-templated linear algebra library by using Rcpp and was written by
-Douglas Bates, Romain Francois and Dirk Eddelbuettel.
-
-This package was created by Dirk Eddelbuettel <edd@debian.org>.
-The sources were downloaded from the main CRAN site
-	http://cran.r-project.org/src/contrib/
-and are also available from all CRAN mirrors as e.g.
-	http://cran.us.r-project.org/src/contrib/
-
-The package was renamed from its upstream name 'RcppEigen' to
-'r-cran-rcppeigen' to fit the pattern of CRAN (and non-CRAN) packages
-for R.
-
-
-
-Files: inst/include/Eigen/* inst/include/unsupported/Eigen/*
-Copyright: 2006 - 2012  Gael Guennebaud, Benoit Jacob
-License: MPL-2
-
-Files: unsupported/Eigen/FFT unsupported/Eigen/src/FFT/ei_fftw_impl.h unsupported/Eigen/src/FFT/ei_kissfft_impl.h
-Copyright: 2003 - 2009  Mark Borgerding
-License: MPL-2
-
-Files: inst/include/Eigen/src/SuperLUSupport/SuperLUSupport.h
-Copyright: 1994  Xerox Corporation
-License: 
-   THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
-   EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
-
-Files: inst/include/unsupported/Eigen/src/IterativeSolvers/IterationController.h
-Copyright: 1997 - 2001 Andrew Lumsdaine and Lie-Quan Lee
-License:
-   This file is part of the Iterative Template Library
-
-   You should have received a copy of the License Agreement for the
-   Iterative Template Library along with the software;  see the
-   file LICENSE.  
-
-   Permission to modify the code and to distribute modified code is
-   granted, provided the text of this NOTICE is retained, a notice that
-   the code was modified is included with the above COPYRIGHT NOTICE and
-   with the COPYRIGHT NOTICE in the LICENSE file, and that the LICENSE
-   file is distributed with the modified code.
-
-   LICENSOR MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED.
-   By way of example, but not limitation, Licensor MAKES NO
-   REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
-   PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE COMPONENTS
-   OR DOCUMENTATION WILL NOT INFRINGE ANY PATENTS, COPYRIGHTS, TRADEMARKS
-   OR OTHER RIGHTS.
-
-Files: inst/include/unsupported/Eigen/Splines inst/include/unsupported/Eigen/src/Splines/SplineFitting.h inst/include/unsupported/Eigen/src/Splines/SplineFwd.h inst/include/unsupported/Eigen/src/Splines/Spline.h inst/include/Eigen/src/Geometry/Umeyama.h inst/include/Eigen/src/StlSupport/details.h inst/include/Eigen/src/StlSupport/StdDeque.h inst/include/Eigen/src/StlSupport/StdList.h inst/include/Eigen/src/StlSupport/StdVector.h inst/include/Eigen/StdDeque inst/include/Eigen/StdList inst/include/Eigen/StdVector inst/include/Eigen/src/Core/DenseStorage.h inst/include/Eigen/src/Core/util/Memory.h inst/include/Eigen/src/Geometry/Transform.h
-Copyright: 2009 - 2011 Hauke Heibel <hauke.heibel@gmail.com>
-License: MPL-2
-
-Files: inst/include/Eigen/src/LU/arch/Inverse_SSE.h inst/include/Eigen/src/Cholesky/LLT_MKL.h inst/include/Eigen/src/Core/Assign_MKL.h inst/include/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h inst/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h inst/include/Eigen/src/Core/products/GeneralMatrixVector_MKL.h inst/include/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h inst/include/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h inst/include/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h inst/include/Eigen/src/Core/products/TriangularMatrixVector_MKL.h inst/include/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h inst/include/Eigen/src/Core/util/MKL_support.h inst/include/Eigen/src/Eigenvalues/ComplexSchur_MKL.h inst/include/Eigen/src/Eigenvalues/RealSchur_MKL.h inst/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h inst/include/Eigen/src/LU/PartialPivLU_MKL.h inst/include/Eigen/src/PardisoSupport/PardisoSupport.h inst/include/Eigen/src/QR/ColPivHouseholderQR_MKL.h inst/include/Eigen/src/QR/HouseholderQR_MKL.h inst/include/Eigen/src/SVD/JacobiSVD_MKL.h
-Copyright: 2001 Intel Corporation
-License:
-   Permition is granted to use, copy, distribute and prepare derivative works
-   of this library for any purpose and without fee, provided, that the above
-   copyright notice and this statement appear in all copies.
-   Intel makes no representations about the suitability of this software for
-   any purpose, and specifically disclaims all warranties.
-
-Files: inst/include/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
-Copyright: 2002 - 2007 Yves Renard
-License: LPGL-2.1
-
-Files: inst/include/Eigen/src/OrderingMethods/Amd.h
-Copyright: 2006 Timothy A. Davis
-License: LGPL-2.1
-
-Files: inst/include/Eigen/src/Core/arch/SSE/MathFunctions.h
-Copyright: 2007 Julien Pommier
-License: MPL-2
-
-Files: inst/include/Eigen/src/Core/Assign.h
-Copyright: 2007 Michael Olbrich
-License: MPL-2
-
-Files: inst/include/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h inst/include/unsupported/Eigen/src/Skyline/SkylineMatrix.h inst/include/unsupported/Eigen/src/Skyline/SkylineProduct.h inst/include/unsupported/Eigen/src/Skyline/SkylineStorage.h inst/include/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h inst/include/unsupported/Eigen/src/Skyline/SkylineUtil.h
-Copyright: 2008 - 2009 Guillaume Saupin <guillaume.saupin@cea.fr>;
-License: MPL-2
-
-Files: inst/include/Eigen/src/Core/arch/AltiVec/PacketMath.h inst/include/Eigen/src/Core/arch/NEON/PacketMath.h
-Copyright: 2008 Konstantinos Margaritis
-License: MPL-2
-
-Files: inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h inst/include/unsupported/Eigen/src/MatrixFunction.h inst/include/unsupported/Eigen/MatrixFunctions inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixFunctionAtomic.h inst/include/Eigen/src/Eigenvalues/ComplexEigenSolver.h inst/include/Eigen/src/Eigenvalues/ComplexSchur.h inst/include/Eigen/src/Eigenvalues/EigenSolver.h inst/include/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h inst/include/Eigen/src/Eigenvalues/HessenbergDecomposition.h inst/include/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h inst/include/Eigen/src/Eigenvalues/RealSchur.h inst/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h inst/include/Eigen/src/Eigenvalues/Tridiagonalization.h inst/include/unsupported/Eigen/src/MatrixFunctions/StemFunction.h inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
-Copyright: 2009 - 2010 Jitse Niesen <jitse@maths.leeds.ac.uk>
-License: MPL-2
-
-Files: inst/include/Eigen/src/Eigenvalues/ComplexEigenSolver.h inst/include/Eigen/src/Eigenvalues/ComplexSchur.h
-Copyright: 2009 Claire Maurice
-License: MPL-2
-
-Files: inst/include/unsupported/Eigen/BVH inst/include/unsupported/Eigen/src/BVH/BVAlgorithms.h inst/include/unsupported/Eigen/src/BVH/KdBVH.h
-Copyright: 2009 Ilya Baran <ibaran@mit.edu>
-License: MPL-2
-
-Files: inst/include/Eigen/src/Cholesky/LDLT.h
-Copyright: 2009 Keir Mierle <mierle@gmail.com>
-License: MPL-2
-
-Files: inst/include/Eigen/src/Geometry/Quaternion.h
-Copyright: 2009 Mathieu Gautier <mathieu.gautier@cea.fr>
-License: MPL-2
-
-Files: inst/include/Eigen/src/Geometry/arch/Geometry_SSE.h inst/include/unsupported/Eigen/src/MoreVectorization/MathFunctions.h
-Copyright: 2009 Rohit Garg <rpg.314@gmail.com>
-License: MPL-2
-
-Files: inst/include/unsupported/Eigen/NonLinearOptimization inst/include/unsupported/Eigen/NumericalDiff inst/include/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h inst/include/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h inst/include/unsupported/Eigen/src/NumericalDiff/NumericalDiff.h inst/include/Eigen/src/Core/util/Memory.h
-Copyright: 2009 - 2010 Thomas Capricelli <orzel@freehackers.org>
-License: MPL-2
-
-Files: inst/include/Eigen/src/SparseCore/SparseView.h inst/include/Eigen/src/Sparse/SparseView.h
-Copyright: 2010 Daniel Lowengrub <lowdanie@gmail.com>
-License: MPL-2
-
-Files: inst/include/unsupported/Eigen/src/Polynomials/Companion.h inst/include/unsupported/Eigen/src/Polynomials/PolynomialSolver.h inst/include/unsupported/Eigen/src/Polynomials/PolynomialUtils.h
-Copyright: 2010 Manuel Yguel <manuel.yguel@gmail.com>
-License: MPL-2
-
-Files: inst/include/Eigen/src/Householder/BlockHouseholder.h inst/include/Eigen/src/QR/HouseholderQR.h
-Copyright: 2010 Vincent Lejeune
-License: MPL-2
-
-Files: inst/include/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
-Copyright: 2011 Andreas Platen <andiplaten@gmx.de>
-License: MPL-2
-
-Files: inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
-Copyright: 2011 Chen-Pang He <jdh8@ms63.hinet.net>
-License: MPL-2
-
-Files: inst/include/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h inst/include/unsupported/Eigen/src/IterativeSolvers/GMRES.h
-Copyright: 2012 Kolja Brix <brix@igpm.rwth-aachen.de>
-License: MPL-2
-
-Files: inst/include/Eigen/src/Cholesky/LDLT.h
-Copyright: 2011 Timothy E. Holy <tim.holy@gmail.com>
-License: MPL-2
-
-Files: inst/include/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h inst/include/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h inst/include/Eigen/src/PaStiXSupport/PaStiXSupport.h inst/include/unsupported/Eigen/src/IterativeSolvers/Scaling.h inst/include/unsupported/Eigen/src/SparseExtra/MarketIO.h inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h 
-Copyright: 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
-License: MPL-2
-
-Files: *
-Copyright: 2011 - 2013  Douglas Bates, Romain Francois and Dirk Eddelbuettel
-License: GPL-2
-
-Files: debian/*
-Copyright: 2013  Dirk Eddelbuettel <edd@debian.org>
-License: GPL-2
-
-
-On a Debian GNU/Linux system, the GPL license (version 2) is included
-in the file /usr/share/common-licenses/GPL-2.
-
-On a Debian GNU/Linux system, the LPGL license (version 2.1) is included
-in the file /usr/share/common-licenses/LGPL-2.1.
-
-License: MPL-2
-    Mozilla Public License Version 2.0
-    ==================================
-    .
-    1. Definitions
-    --------------
-    .
-    1.1. "Contributor"
-    means each individual or legal entity that creates, contributes to
-    the creation of, or owns Covered Software.
-    .
-    1.2. "Contributor Version"
-    means the combination of the Contributions of others (if any) used
-    by a Contributor and that particular Contributor's Contribution.
-    .
-    1.3. "Contribution"
-    means Covered Software of a particular Contributor.
-    .
-    1.4. "Covered Software"
-    means Source Code Form to which the initial Contributor has attached
-    the notice in Exhibit A, the Executable Form of such Source Code
-    Form, and Modifications of such Source Code Form, in each case
-    including portions thereof.
-    .
-    1.5. "Incompatible With Secondary Licenses"
-    means
-    .
-    (a) that the initial Contributor has attached the notice described
-        in Exhibit B to the Covered Software; or
-    .
-    (b) that the Covered Software was made available under the terms of
-        version 1.1 or earlier of the License, but not also under the
-        terms of a Secondary License.
-    .
-    1.6. "Executable Form"
-    means any form of the work other than Source Code Form.
-    .
-    1.7. "Larger Work"
-    means a work that combines Covered Software with other material, in 
-    a separate file or files, that is not Covered Software.
-    .
-    1.8. "License"
-    means this document.
-    .
-    1.9. "Licensable"
-    means having the right to grant, to the maximum extent possible,
-    whether at the time of the initial grant or subsequently, any and
-    all of the rights conveyed by this License.
-    .
-    1.10. "Modifications"
-    means any of the following:
-    .
-    (a) any file in Source Code Form that results from an addition to,
-        deletion from, or modification of the contents of Covered
-        Software; or
-    .
-    (b) any new file in Source Code Form that contains any Covered
-        Software.
-    .
-    1.11. "Patent Claims" of a Contributor
-    means any patent claim(s), including without limitation, method,
-    process, and apparatus claims, in any patent Licensable by such
-    Contributor that would be infringed, but for the grant of the
-    License, by the making, using, selling, offering for sale, having
-    made, import, or transfer of either its Contributions or its
-    Contributor Version.
-    .
-    1.12. "Secondary License"
-    means either the GNU General Public License, Version 2.0, the GNU
-    Lesser General Public License, Version 2.1, the GNU Affero General
-    Public License, Version 3.0, or any later versions of those
-    licenses.
-    .
-    1.13. "Source Code Form"
-    means the form of the work preferred for making modifications.
-    .
-    1.14. "You" (or "Your")
-    means an individual or a legal entity exercising rights under this
-    License. For legal entities, "You" includes any entity that
-    controls, is controlled by, or is under common control with You. For
-    purposes of this definition, "control" means (a) the power, direct
-    or indirect, to cause the direction or management of such entity,
-    whether by contract or otherwise, or (b) ownership of more than
-    fifty percent (50%) of the outstanding shares or beneficial
-    ownership of such entity.
-    .
-    2. License Grants and Conditions
-    --------------------------------
-    .
-    2.1. Grants
-    .
-    Each Contributor hereby grants You a world-wide, royalty-free,
-    non-exclusive license:
-    .
-    (a) under intellectual property rights (other than patent or trademark)
-    Licensable by such Contributor to use, reproduce, make available,
-    modify, display, perform, distribute, and otherwise exploit its
-    Contributions, either on an unmodified basis, with Modifications, or
-    as part of a Larger Work; and
-    .
-    (b) under Patent Claims of such Contributor to make, use, sell, offer
-    for sale, have made, import, and otherwise transfer either its
-    Contributions or its Contributor Version.
-    .
-    2.2. Effective Date
-    .
-    The licenses granted in Section 2.1 with respect to any Contribution
-    become effective for each Contribution on the date the Contributor first
-    distributes such Contribution.
-    .
-    2.3. Limitations on Grant Scope
-    .
-    The licenses granted in this Section 2 are the only rights granted under
-    this License. No additional rights or licenses will be implied from the
-    distribution or licensing of Covered Software under this License.
-    Notwithstanding Section 2.1(b) above, no patent license is granted by a
-    Contributor:
-    .
-    (a) for any code that a Contributor has removed from Covered Software;
-    or
-    .
-    (b) for infringements caused by: (i) Your and any other third party's
-    modifications of Covered Software, or (ii) the combination of its
-    Contributions with other software (except as part of its Contributor
-    Version); or
-    .
-    (c) under Patent Claims infringed by Covered Software in the absence of
-    its Contributions.
-    .
-    This License does not grant any rights in the trademarks, service marks,
-    or logos of any Contributor (except as may be necessary to comply with
-    the notice requirements in Section 3.4).
-    .
-    2.4. Subsequent Licenses
-    .
-    No Contributor makes additional grants as a result of Your choice to
-    distribute the Covered Software under a subsequent version of this
-    License (see Section 10.2) or under the terms of a Secondary License (if
-    permitted under the terms of Section 3.3).
-    .
-    2.5. Representation
-    .
-    Each Contributor represents that the Contributor believes its
-    Contributions are its original creation(s) or it has sufficient rights
-    to grant the rights to its Contributions conveyed by this License.
-    .
-    2.6. Fair Use
-    .
-    This License is not intended to limit any rights You have under
-    applicable copyright doctrines of fair use, fair dealing, or other
-    equivalents.
-    .
-    2.7. Conditions
-    .
-    Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted
-    in Section 2.1.
-    .
-    3. Responsibilities
-    -------------------
-    .
-    3.1. Distribution of Source Form
-    .
-    All distribution of Covered Software in Source Code Form, including any
-    Modifications that You create or to which You contribute, must be under
-    the terms of this License. You must inform recipients that the Source
-    Code Form of the Covered Software is governed by the terms of this
-    License, and how they can obtain a copy of this License. You may not
-    attempt to alter or restrict the recipients' rights in the Source Code
-    Form.
-    .
-    3.2. Distribution of Executable Form
-    .
-    If You distribute Covered Software in Executable Form then:
-    .
-    (a) such Covered Software must also be made available in Source Code
-    Form, as described in Section 3.1, and You must inform recipients of
-    the Executable Form how they can obtain a copy of such Source Code
-    Form by reasonable means in a timely manner, at a charge no more
-    than the cost of distribution to the recipient; and
-    .
-    (b) You may distribute such Executable Form under the terms of this
-    License, or sublicense it under different terms, provided that the
-    license for the Executable Form does not attempt to limit or alter
-    the recipients' rights in the Source Code Form under this License.
-    .
-    3.3. Distribution of a Larger Work
-    .
-    You may create and distribute a Larger Work under terms of Your choice,
-    provided that You also comply with the requirements of this License for
-    the Covered Software. If the Larger Work is a combination of Covered
-    Software with a work governed by one or more Secondary Licenses, and the
-    Covered Software is not Incompatible With Secondary Licenses, this
-    License permits You to additionally distribute such Covered Software
-    under the terms of such Secondary License(s), so that the recipient of
-    the Larger Work may, at their option, further distribute the Covered
-    Software under the terms of either this License or such Secondary
-    License(s).
-    .
-    3.4. Notices
-    .
-    You may not remove or alter the substance of any license notices
-    (including copyright notices, patent notices, disclaimers of warranty,
-    or limitations of liability) contained within the Source Code Form of
-    the Covered Software, except that You may alter any license notices to
-    the extent required to remedy known factual inaccuracies.
-    .
-    3.5. Application of Additional Terms
-    .
-    You may choose to offer, and to charge a fee for, warranty, support,
-    indemnity or liability obligations to one or more recipients of Covered
-    Software. However, You may do so only on Your own behalf, and not on
-    behalf of any Contributor. You must make it absolutely clear that any
-    such warranty, support, indemnity, or liability obligation is offered by
-    You alone, and You hereby agree to indemnify every Contributor for any
-    liability incurred by such Contributor as a result of warranty, support,
-    indemnity or liability terms You offer. You may include additional
-    disclaimers of warranty and limitations of liability specific to any
-    jurisdiction.
-    .
-    4. Inability to Comply Due to Statute or Regulation
-    ---------------------------------------------------
-    .
-    If it is impossible for You to comply with any of the terms of this
-    License with respect to some or all of the Covered Software due to
-    statute, judicial order, or regulation then You must: (a) comply with
-    the terms of this License to the maximum extent possible; and (b)
-    describe the limitations and the code they affect. Such description must
-    be placed in a text file included with all distributions of the Covered
-    Software under this License. Except to the extent prohibited by statute
-    or regulation, such description must be sufficiently detailed for a
-    recipient of ordinary skill to be able to understand it.
-    .
-    5. Termination
-    --------------
-    .
-    5.1. The rights granted under this License will terminate automatically
-    if You fail to comply with any of its terms. However, if You become
-    compliant, then the rights granted under this License from a particular
-    Contributor are reinstated (a) provisionally, unless and until such
-    Contributor explicitly and finally terminates Your grants, and (b) on an
-    ongoing basis, if such Contributor fails to notify You of the
-    non-compliance by some reasonable means prior to 60 days after You have
-    come back into compliance. Moreover, Your grants from a particular
-    Contributor are reinstated on an ongoing basis if such Contributor
-    notifies You of the non-compliance by some reasonable means, this is the
-    first time You have received notice of non-compliance with this License
-    from such Contributor, and You become compliant prior to 30 days after
-    Your receipt of the notice.
-    .
-    5.2. If You initiate litigation against any entity by asserting a patent
-    infringement claim (excluding declaratory judgment actions,
-    counter-claims, and cross-claims) alleging that a Contributor Version
-    directly or indirectly infringes any patent, then the rights granted to
-    You by any and all Contributors for the Covered Software under Section
-    2.1 of this License shall terminate.
-    .
-    5.3. In the event of termination under Sections 5.1 or 5.2 above, all
-    end user license agreements (excluding distributors and resellers) which
-    have been validly granted by You or Your distributors under this License
-    prior to termination shall survive termination.
-    .
-    ************************************************************************
-    *                                                                      *
-    *  6. Disclaimer of Warranty                                           *
-    *  -------------------------                                           *
-    *                                                                      *
-    *  Covered Software is provided under this License on an "as is"       *
-    *  basis, without warranty of any kind, either expressed, implied, or  *
-    *  statutory, including, without limitation, warranties that the       *
-    *  Covered Software is free of defects, merchantable, fit for a        *
-    *  particular purpose or non-infringing. The entire risk as to the     *
-    *  quality and performance of the Covered Software is with You.        *
-    *  Should any Covered Software prove defective in any respect, You     *
-    *  (not any Contributor) assume the cost of any necessary servicing,   *
-    *  repair, or correction. This disclaimer of warranty constitutes an   *
-    *  essential part of this License. No use of any Covered Software is   *
-    *  authorized under this License except under this disclaimer.         *
-    *                                                                      *
-    ************************************************************************
-    .
-    ************************************************************************
-    *                                                                      *
-    *  7. Limitation of Liability                                          *
-    *  --------------------------                                          *
-    *                                                                      *
-    *  Under no circumstances and under no legal theory, whether tort      *
-    *  (including negligence), contract, or otherwise, shall any           *
-    *  Contributor, or anyone who distributes Covered Software as          *
-    *  permitted above, be liable to You for any direct, indirect,         *
-    *  special, incidental, or consequential damages of any character      *
-    *  including, without limitation, damages for lost profits, loss of    *
-    *  goodwill, work stoppage, computer failure or malfunction, or any    *
-    *  and all other commercial damages or losses, even if such party      *
-    *  shall have been informed of the possibility of such damages. This   *
-    *  limitation of liability shall not apply to liability for death or   *
-    *  personal injury resulting from such party's negligence to the       *
-    *  extent applicable law prohibits such limitation. Some               *
-    *  jurisdictions do not allow the exclusion or limitation of           *
-    *  incidental or consequential damages, so this exclusion and          *
-    *  limitation may not apply to You.                                    *
-    *                                                                      *
-    ************************************************************************
-    .
-    8. Litigation
-    -------------
-    .
-    Any litigation relating to this License may be brought only in the
-    courts of a jurisdiction where the defendant maintains its principal
-    place of business and such litigation shall be governed by laws of that
-    jurisdiction, without reference to its conflict-of-law provisions.
-    Nothing in this Section shall prevent a party's ability to bring
-    cross-claims or counter-claims.
-    .
-    9. Miscellaneous
-    ----------------
-    .
-    This License represents the complete agreement concerning the subject
-    matter hereof. If any provision of this License is held to be
-    unenforceable, such provision shall be reformed only to the extent
-    necessary to make it enforceable. Any law or regulation which provides
-    that the language of a contract shall be construed against the drafter
-    shall not be used to construe this License against a Contributor.
-    .
-    10. Versions of the License
-    ---------------------------
-    .
-    10.1. New Versions
-    .
-    Mozilla Foundation is the license steward. Except as provided in Section
-    10.3, no one other than the license steward has the right to modify or
-    publish new versions of this License. Each version will be given a
-    distinguishing version number.
-    .
-    10.2. Effect of New Versions
-    .
-    You may distribute the Covered Software under the terms of the version
-    of the License under which You originally received the Covered Software,
-    or under the terms of any subsequent version published by the license
-    steward.
-    .
-    10.3. Modified Versions
-    .
-    If you create software not governed by this License, and you want to
-    create a new license for such software, you may create and use a
-    modified version of this License if you rename the license and remove
-    any references to the name of the license steward (except to note that
-    such modified license differs from this License).
-    .
-    10.4. Distributing Source Code Form that is Incompatible With Secondary
-    Licenses
-    .
-    If You choose to distribute Source Code Form that is Incompatible With
-    Secondary Licenses under the terms of this version of the License, the
-    notice described in Exhibit B of this License must be attached.
-    .
-    Exhibit A - Source Code Form License Notice
-    -------------------------------------------
-    .
-    This Source Code Form is subject to the terms of the Mozilla Public
-    License, v. 2.0. If a copy of the MPL was not distributed with this
-    file, You can obtain one at http://mozilla.org/MPL/2.0/.
-    .
-    If it is not possible or desirable to put the notice in a particular
-    file, then You may include the notice in a location (such as a LICENSE
-    file in a relevant directory) where a recipient would be likely to look
-    for such a notice.
-    .
-    You may add additional accurate notices of copyright ownership.
-    .
-    Exhibit B - "Incompatible With Secondary Licenses" Notice
-    ---------------------------------------------------------
-    .
-    This Source Code Form is "Incompatible With Secondary Licenses", as
-    defined by the Mozilla Public License, v. 2.0.
-
-
-
-
-For reference, the upstream DESCRIPTION file is included below:
-
-  Package: RcppEigen
-  Type: Package
-  Title: Rcpp integration for the Eigen templated linear algebra library.
-  Version: 0.3.1.2.1
-  Date: 2013-01-14
-  Author: Douglas Bates, Romain Francois and Dirk Eddelbuettel
-  Maintainer: Douglas Bates <bates@stat.wisc.edu>
-  Description: R and Eigen integration using Rcpp.
-   .
-   Eigen is a C++ template library for linear algebra: matrices, vectors,
-          numerical solvers and related algorithms.  It supports dense
-          and sparse matrices on integer, floating point and complex
-          numbers, decompositions of such matrices, and solutions of
-          linear systems. Its performance on many algorithms is
-          comparable with some of the best implementations based on
-          Lapack and level-3 BLAS.
-   .
-   The RcppEigen package includes the header files from the Eigen C++
-          template library (currently version 3.1.0). Thus users do not
-          need to install Eigen itself in order to use RcppEigen.
-   .
-   Eigen is licensed under the GNU LGPL version 3 or later, and also under
-          the GNU GPL version 2 or later.  RcppEigen (the Rcpp
-          bindings/bridge to Eigen) is licensed under the GNU GPL version
-          2 or later, as is the rest of Rcpp.
-  License: GPL (>= 2)
-  Depends: Rcpp (>= 0.10.1), Matrix (>= 1.0-1), R(>= 2.14.0)
-  LazyLoad: yes
-  LinkingTo: Rcpp
-  Imports: Matrix
-  Suggests: inline, RUnit, testthat
-  URL: http://eigen.tuxfamily.org
-  Packaged: 2013-02-03 19:51:07 UTC; bates
-  Repository: CRAN
-  Date/Publication: 2013-02-04 08:02:53
-  
diff --git a/debian/r-cran-rcppeigen.lintian-overrides b/debian/r-cran-rcppeigen.lintian-overrides
deleted file mode 100644
index 84af5289..00000000
--- a/debian/r-cran-rcppeigen.lintian-overrides
+++ /dev/null
@@ -1,7 +0,0 @@
-r-cran-rcppeigen: binary-or-shlib-defines-rpath usr/lib/R/site-library/RcppEigen/libs/RcppEigen.so /usr/lib/R/site-library/Rcpp/lib
-r-cran-rcppeigen: hardening-no-relro usr/lib/R/site-library/RcppEigen/libs/RcppEigen.so
-r-cran-rcppeigen: hardening-no-fortify-functions usr/lib/R/site-library/RcppEigen/libs/RcppEigen.so
-r-cran-rcppeigen: script-not-executable usr/lib/R/site-library/RcppEigen/unitTests/runit.RcppEigen.R
-r-cran-rcppeigen: script-not-executable usr/lib/R/site-library/RcppEigen/unitTests/runit.fastLm.R
-r-cran-rcppeigen: script-not-executable usr/lib/R/site-library/RcppEigen/unitTests/runit.sparse.R
-r-cran-rcppeigen: extra-license-file usr/lib/R/site-library/RcppEigen/LICENSE
diff --git a/debian/rules b/debian/rules
deleted file mode 100755
index 1c53bdcf..00000000
--- a/debian/rules
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/usr/bin/make -f
-# 							-*- makefile -*-
-# debian/rules file for the Debian/GNU Linux r-cran-rcppeigen package
-# Copyright 2003-2013 by Dirk Eddelbuettel <edd@debian.org>
-
-include /usr/share/R/debian/r-cran.mk
diff --git a/debian/source/format b/debian/source/format
deleted file mode 100644
index d3827e75..00000000
--- a/debian/source/format
+++ /dev/null
@@ -1 +0,0 @@
-1.0
diff --git a/debian/watch b/debian/watch
deleted file mode 100644
index 897d3d46..00000000
--- a/debian/watch
+++ /dev/null
@@ -1,2 +0,0 @@
-version=3
-http://cran.r-project.org/src/contrib/RcppEigen_([-\d\.]*)\.tar.gz
diff --git a/inst/CITATION b/inst/CITATION
index ab5ecbce..ed8410b6 100644
--- a/inst/CITATION
+++ b/inst/CITATION
@@ -1,19 +1,13 @@
-citHeader("To cite RcppEigen in publications use:")
-
-citEntry(entry = "Article",
-  title        = "Fast and Elegant Numerical Linear Algebra Using the {RcppEigen} Package",
-  author       = personList(as.person("Douglas Bates"),
-                   as.person("Dirk Eddelbuettel")),
-  journal      = "Journal of Statistical Software",
-  year         = "2013",
-  volume       = "52",
-  number       = "5",
-  pages        = "1--24",
-  url          = "http://www.jstatsoft.org/v52/i05/",
-
-  textVersion  =
-  paste("Douglas Bates, Dirk Eddelbuettel (2013).",
-        "Fast and Elegant Numerical Linear Algebra Using the RcppEigen Package.",
-        "Journal of Statistical Software, 52(5), 1-24.",
-        "URL http://www.jstatsoft.org/v52/i05/.")
-)
+bibentry("Article",
+         title = "Fast and Elegant Numerical Linear Algebra Using the {RcppEigen} Package",
+         author = c(person("Douglas", "Bates",
+                           comment = c(ORCID = "0000-0001-8316-9503")),
+                    person("Dirk", "Eddelbuettel",
+                           email = "edd@debian.org",
+                           comment = c(ORCID = "0000-0001-6419-907X"))),
+         journal = "Journal of Statistical Software",
+         year = "2013",
+         volume = "52",
+         number = "5",
+         pages = "1--24",
+         doi = "10.18637/jss.v052.i05")
diff --git a/inst/NEWS.Rd b/inst/NEWS.Rd
index 7da2ea1f..3c15d3f1 100644
--- a/inst/NEWS.Rd
+++ b/inst/NEWS.Rd
@@ -1,11 +1,177 @@
 \name{NEWS}
-\title{News for Package 'RcppEigen'}
+\title{News for Package \pkg{RcppEigen}}
 \newcommand{\ghpr}{\href{https://github.com/RcppCore/RcppEigen/pull/#1}{##1}}
 \newcommand{\ghit}{\href{https://github.com/RcppCore/RcppEigen/issues/#1}{##1}}
 
+\section{Changes in RcppEigen version 0.3.4.0.2 (2024-08-23)}{
+  \itemize{
+    \item Correct two typos in the ORCID tag
+  }
+}
+
+\section{Changes in RcppEigen version 0.3.4.0.1 (2024-08-14)}{
+  \itemize{
+    \item Conditionally comment-out \code{xerbla} in \code{blas.h} as it is
+    now providedd by R-devel albeit with \code{FC_LEN_T} (per a CRAN request)
+    \item Minor package updates (continuous integration, badges)
+  }
+}
+
+\section{Changes in RcppEigen version 0.3.4.0.0 (2024-02-28)}{
+  \itemize{
+    \item The Eigen version has been upgrade to release 3.4.0 (Yixuan)
+    \item Extensive reverse-dependency checks ensure only three out of over
+    400 packages at CRAN are affected; PRs and patches helped other packages
+    \item The long-running branch also contains substantial contributions
+    from Mikael Jagan (for the lme4 interface) and Andrew Johnson (revdep PRs)
+  }
+}
+
+\section{Changes in RcppEigen version 0.3.3.9.4 (2023-11-01)}{
+  \itemize{
+    \item The CITATION file has been updated for the new \code{bibentry}
+    style.
+    \item The package skeleton generator has been updated and no longer sets
+    an Imports:.
+    \item Some README.md URLs and badged have been updated.
+    \item The use of \code{-fopenmp} has been documented in \code{Makevars},
+    and a simple thread-count reporting function has been added.
+    \item The old manual \code{src/init.c} has been replaced by an
+    autogenerated version, the \code{RcppExports} file have regenerated
+    \item The interface to package \pkg{Matrix} has been updated and
+    simplified thanks to an excllent patch by Mikael Jagan.
+    \item The new upload is coordinated with packages \pkg{lme4} and \pkg{OpenMx}.
+  }
+}
+
+\section{Changes in RcppEigen version 0.3.3.9.3 (2022-11-04)}{
+  \itemize{
+    \item The dependency on R is now versioned to 3.6.0 or later for
+    support for \code{USE_FC_LEN_T} from Fortran.
+    \item An old example typo was corrected (Jonah Gabry in \ghpr{114}).
+    \item The \code{fastLm} methods now reference \code{df.residual} by
+    its full name (Closes \ghit{115}).
+    \item A function prototype was updated for \code{clang-15}.
+    \item GitHub Actions were updated to checkout version 3.
+  }
+}
+
+\section{Changes in RcppEigen version 0.3.3.9.2 (2022-04-05)}{
+  \itemize{
+    \item Added test coverage in continuous integration
+    \item Added new tests to increase test coverage
+    \item Small improvement to the RcppEigen.package.skeleton() code
+    \item Small updates and edits to README.md and inst/CITATION
+    \item Use R_xlen_t for vector rows and columns (by Mikael Jagan)
+    \item Support USE_FC_LEN_T by adding FCONE to two dgesdd
+  }
+}
+
+\section{Changes in RcppEigen version 0.3.3.9.1 (2020-12-17)}{
+  \itemize{
+    \item Upgraded to Eigen 3.3.9 (Dirk in \ghpr{92} fixing \ghit{91}).
+    \item Added GitHub Actions CI using \code{run.sh} from r-ci (Dirk)
+  }
+}
+
+\section{Changes in RcppEigen version 0.3.3.7.0 (2019-11-16)}{
+  \itemize{
+    \item Fixed skeleton package creation listing RcppEigen under Imports
+    (James Balamuta in \ghpr{68} addressing \ghit{16}).
+    \item Small RNG use update to first example in skeleton package used
+    by package creation helper (Dirk addressing \ghit{69}).
+    \item Update vignette example to use RcppEigen:::eigen_version() (Dirk
+    addressing \ghit{71}).
+    \item Correct one RcppEigen.package.skeleton() corner case (Dirk in
+    \ghpr{77} fixing \ghit{75}).
+    \item Correct one usage case with \pkg{pkgKitten} (Dirk in \ghpr{78}).
+    \item The package now uses \pkg{tinytest} for unit tests (Dirk in
+    \ghpr{81}).
+    \item Upgraded to Eigen 3.3.7 (Dirk in \ghpr{82} fixing \ghit{80}).
+  }
+}
+
+\section{Changes in RcppEigen version 0.3.3.5.0 (2018-11-24)}{
+  \itemize{
+    \item Updated to version 3.3.5 of Eigen (Dirk in \ghpr{65})
+    \item Long vectors are now supported via \code{R_xlen_t} (Ralf
+    Stubner in \ghpr{55} fixing \ghit{54}).
+    \item The benchmarking example was updated in its use of
+    RcppArmadillo (Michael Weylandt in \ghpr{56}).
+  }
+}
+
+\section{Changes in RcppEigen version 0.3.3.4.0 (2018-02-05)}{
+  \itemize{
+    \item Updated to version 3.3.4 of Eigen (Yixuan in \ghpr{49})
+    \item Also carried over on new upstream (Yixuan, addressing
+    \ghit{48})
+    \item As before, condition \code{long long} use on C++11.
+    \item Pragmas for g++ & clang to suppress diagnostics messages are
+    disabled per CRAN Policy; use \code{-Wno-ignored-attributes} to quieten.
+  }
+}
+
+\section{Changes in RcppEigen version 0.3.3.3.1 (2017-11-19)}{
+  \itemize{
+    \item Compilation under Haiku-OS is now supported (Yu Gong in
+    \ghpr{45}).
+    \item The \code{Rcpp.plugin.maker} helper function is called via
+    \code{::} as it is in fact exported (yet we had old code using
+    \code{:::}).
+    \item A spurious argument was removed from an example call.
+    \item Travis CI now uses https to fetch the test runner script.
+  }
+}
+
+\section{Changes in RcppEigen version 0.3.3.3.0 (2017-04-29)}{
+  \itemize{
+    \item Updated to version 3.3.3 of Eigen
+    \item Fixed incorrect function names in the examples, thanks to
+    ChingChuan Chen
+    \item The class \code{MappedSparseMatrix<T>} has been deprecated since
+    Eigen 3.3.0. The new structure \code{Map<SparseMatrix<T> >} should be used
+    instead
+    \item Exporters for the new type \code{Map<SparseMatrix<T> >} were added
+    \item Travis CI is now driven via \code{run.sh} from our forked r-travis
+  }
+}
+
+\section{Changes in RcppEigen version 0.3.2.9.1 (2017-03-14)}{
+  \itemize{
+    \item Synchronize CholMod header file with Matrix package to ensure
+    binary compatibility on all platforms (Martin Maechler in \ghpr{42})
+    \item Added file \code{init.c} with calls to \code{R_registerRoutines()}
+    \code{and R_useDynamicSymbols()}; also use \code{.registration=TRUE}
+    in \code{useDynLib} in \code{NAMESPACE}
+  }
+}
+
+\section{Changes in RcppEigen version 0.3.2.9.0 (2016-08-20)}{
+  \itemize{
+    \item Updated to version 3.2.9 of Eigen (PR \ghpr{37} by Yixuan
+    closing \ghit{36} from Bob Carpenter of the Stan team)
+    \item An exporter for \code{RowVectorX} was added (thanks to PR \ghpr{32}
+    by James Balamuta)
+  }
+}
+
+\section{Changes in RcppEigen version 0.3.2.8.1 (2016-02-29)}{
+  \itemize{
+    \item Applied another upstream UBSAN fix (PR \ghpr{30} by Yixuan)
+  }
+}
+
+\section{Changes in RcppEigen version 0.3.2.8.0 (2016-02-23)}{
+  \itemize{
+    \item Updated to version 3.2.8 of Eigen (PR \ghpr{29} by Yixuan)
+  }
+}
+
 \section{Changes in RcppEigen version 0.3.2.7.0 (2016-01-18)}{
   \itemize{
     \item Updated to version 3.2.7 of Eigen
+    \item One unit test file tightened to please R-devel CMD check
     \item The fastLm example will not include the Lapack header if MKL
     is defined (thanks to PR \ghpr{25} by Alexey Stukalow)
   }
@@ -34,7 +200,7 @@
     \CRANpkg{pkgKitten} if available
   }
 }
-    
+
 \section{Changes in RcppEigen version 0.3.2.3.0 (2014-12-22)}{
   \itemize{
     \item Updated to version 3.2.3 of Eigen
@@ -65,7 +231,7 @@
     the cpu id comparison, with thanks to Gael Guennebaud for the patch
   }
 }
-    
+
 \section{Changes in RcppEigen version 0.3.2.1.1 (2014-03-06)}{
   \itemize{
     \item Better \code{ifdef} on one directory entry feature, with
@@ -83,7 +249,7 @@
   \itemize{
     \item Updated and extended \code{RcppEigen.package.skeleton()} to
     use several examples via \CRANpkg{Rcpp} Attributes; also removed the
-    deprecated \code{namespace} argument 
+    deprecated \code{namespace} argument
     \item Updated skeleton package example for \CRANpkg{Rcpp} 0.11.0 or
     later by removing needed for linking with user library
     \item Updated files \code{DESCRIPTION}, \code{NAMESPACE},
@@ -108,7 +274,7 @@
     \item Applied two small patches to deal with non-g++ compilrs
     \item Clarifications concerning license and authorship of
     Eigen (as opposed to RcppEigen) code added to \code{DESCRIPTION} at
-    the request of CRAN 
+    the request of CRAN
   }
 }
 
@@ -137,7 +303,7 @@
     which cannot be changed to a ColMajor form.
     \item Because of changes in R, -DNDEBUG is automatic. One must
     override it with -UNDEBUG in the local ~/.R/Makevars to activate the
-    debugging code. 
+    debugging code.
     \item New (unexported) functions CxxFlags() and RcppEigenCxxFlags()
     for use in Makefiles
     \item Fixes related to Rcpp 0.10.*
@@ -190,5 +356,3 @@
     some difficulty with combining testthat, inline and R CMD check.
   }
 }
- 
-  
diff --git a/inst/doc/code.R b/inst/doc/code.R
deleted file mode 100644
index 634ebb91..00000000
--- a/inst/doc/code.R
+++ /dev/null
@@ -1,237 +0,0 @@
-library(inline)
-library(RcppEigen)
-
-incl <- '
-using   Eigen::LLT;
-using   Eigen::Lower;
-using   Eigen::Map;
-using   Eigen::MatrixXd;
-using   Eigen::MatrixXi;
-using   Eigen::Upper;
-using   Eigen::VectorXd;
-typedef Map<MatrixXd>  MapMatd;
-typedef Map<MatrixXi>  MapMati;
-typedef Map<VectorXd>  MapVecd;
-inline  MatrixXd AtA(const MatrixXd& A) {
-    int    n(A.cols());
-    return   MatrixXd(n,n).setZero().selfadjointView<Lower>()
-             .rankUpdate(A.adjoint());
-}
-inline  MatrixXd AAt(const MatrixXd& A) {
-    int    n(A.cols());
-    return   MatrixXd(n,n).setZero().selfadjointView<Lower>()
-             .rankUpdate(A);
-}
-'
-
-
-## section 3.1
-(A <- matrix(1:6, ncol=2))
-str(A)
-
-transCpp <-'
-using Eigen::Map;
-using Eigen::MatrixXi;
-                 // Map the integer matrix AA from R
-const Map<MatrixXi>  A(as<Map<MatrixXi> >(AA));
-                 // evaluate and return the transpose of A
-const MatrixXi      At(A.transpose());
-return wrap(At);
-'
-
-ftrans <- cxxfunction(signature(AA="matrix"), transCpp, plugin="RcppEigen")
-(At <- ftrans(A))
-stopifnot(all.equal(At, t(A)))
-
-
-
-## section 3.2
-prodCpp <- '
-typedef Eigen::Map<Eigen::MatrixXi>   MapMati;
-const MapMati    B(as<MapMati>(BB));
-const MapMati    C(as<MapMati>(CC));
-return List::create(Named("B %*% C")         = B * C,
-                    Named("crossprod(B, C)") = B.adjoint() * C);
-'
-
-fprod <- cxxfunction(signature(BB = "matrix", CC = "matrix"), prodCpp, "RcppEigen")
-B <- matrix(1:4, ncol=2)
-C <- matrix(6:1, nrow=2)
-str(fp <- fprod(B, C))
-stopifnot(all.equal(fp[[1]], B %*% C), all.equal(fp[[2]], crossprod(B, C)))
-
-
-
-## section 3.3
-
-crossprodCpp <- '
-using Eigen::Map;
-using Eigen::MatrixXi;
-using Eigen::Lower;
-
-const Map<MatrixXi> A(as<Map<MatrixXi> >(AA));
-const int           m(A.rows()), n(A.cols());
-MatrixXi          AtA(MatrixXi(n, n).setZero().
-                      selfadjointView<Lower>().rankUpdate(A.adjoint()));
-MatrixXi          AAt(MatrixXi(m, m).setZero().
-                      selfadjointView<Lower>().rankUpdate(A));
-
-return List::create(Named("crossprod(A)")  = AtA,
-                    Named("tcrossprod(A)") = AAt);
-'
-fcprd <- cxxfunction(signature(AA = "matrix"), crossprodCpp, "RcppEigen")
-str(crp <- fcprd(A))
-stopifnot(all.equal(crp[[1]], crossprod(A)),
-          all.equal(crp[[2]], tcrossprod(A)))
-
-
-
-## section 3.4
-
-storage.mode(A) <- "double"
-
-cholCpp <- '
-const  LLT<MatrixXd> llt(AtA(as<MapMatd>(AA)));
-return List::create(Named("L") = MatrixXd(llt.matrixL()),
-                    Named("R") = MatrixXd(llt.matrixU()));
-'
-
-fchol <- cxxfunction(signature(AA = "matrix"), cholCpp, "RcppEigen", incl)
-(ll <- fchol(A))
-stopifnot(all.equal(ll[[2]], chol(crossprod(A))))
-
-
-# section 3.5
-
-cholDetCpp <- '
-const MatrixXd      ata(AtA(as<MapMatd>(AA)));
-const double       detL(MatrixXd(ata.llt().matrixL()).diagonal().prod());
-const VectorXd     Dvec(ata.ldlt().vectorD());
-return List::create(Named("d1") = detL * detL,
-                    Named("d2") = Dvec.prod(),
-                    Named("ld") = Dvec.array().log().sum());
-'
-
-fdet <- cxxfunction(signature(AA = "matrix"), cholDetCpp, "RcppEigen", incl)
-unlist(ll <- fdet(A))
-
-
-## section 4.1
-lltLSCpp <- '
-const MapMatd         X(as<MapMatd>(XX));
-const MapVecd         y(as<MapVecd>(yy));
-const int             n(X.rows()), p(X.cols());
-const LLT<MatrixXd> llt(AtA(X));
-const VectorXd  betahat(llt.solve(X.adjoint() * y));
-const VectorXd   fitted(X * betahat);
-const VectorXd    resid(y - fitted);
-const int            df(n - p);
-const double        ssq(resid.squaredNorm() / double(df));
-const MatrixXd     vcov(ssq * llt.solve(MatrixXd::Identity(p, p)));
-return     List::create(Named("coefficients")   = betahat,
-                        Named("fitted.values")  = fitted,
-                        Named("residuals")      = resid,
-                        Named("s")              = sqrt(ssq),
-                        Named("df.residual")    = df,
-                        Named("rank")           = p,
-                        Named("vcov")           = vcov);
-'
-
-lltLS <- cxxfunction(signature(XX = "matrix", yy = "numeric"),
-                     lltLSCpp, "RcppEigen", incl)
-data(trees, package="datasets")
-str(lltFit <- with(trees, lltLS(cbind(1, log(Girth)), log(Volume))))
-str(lmFit <- with(trees, lm.fit(cbind(1, log(Girth)), log(Volume))))
-for (nm in c("coefficients", "residuals", "fitted.values", "rank"))
-    stopifnot(all.equal(lltFit[[nm]], unname(lmFit[[nm]])))
-stopifnot(all.equal(unname(vcov(lm(log(Volume) ~ log(Girth), trees))),
-                    lltFit$vcov))
-
-## section 4.3
-
-dd <- data.frame(f1 = gl(4, 6, labels = LETTERS[1:4]),
-                 f2 = gl(3, 2, labels = letters[1:3]))[-(7:8), ]
-xtabs(~ f2 + f1, dd)                    # one missing cell
-mm <- model.matrix(~ f1 * f2, dd)
-kappa(mm)         # large condition number, indicating rank deficiency
-rcond(mm)         # alternative evaluation, the reciprocal condition number
-(c(rank=qr(mm)$rank, p=ncol(mm))) # rank as computed in R's qr function
-set.seed(1)
-dd$y <- mm %*% seq_len(ncol(mm)) + rnorm(nrow(mm), sd = 0.1)
-                         # lm detects the rank deficiency
-fm1 <- lm(y ~ f1 * f2, dd)
-writeLines(capture.output(print(summary(fm1), signif.stars=FALSE))[9:22])
-
-
-## section 4.6
-print(summary(fmPQR <- fastLm(y ~ f1 * f2, dd)), signif.stars=FALSE)
-all.equal(coef(fm1), coef(fmPQR))
-all.equal(unname(fitted(fm1)), fitted(fmPQR))
-all.equal(unname(residuals(fm1)), residuals(fmPQR))
-
-
-print(summary(fmSVD <- fastLm(y ~ f1 * f2, dd, method=4L)), signif.stars=FALSE)
-all.equal(coef(fm1), coef(fmSVD))
-all.equal(unname(fitted(fm1)), fitted(fmSVD))
-all.equal(unname(residuals(fm1)), residuals(fmSVD))
-
-
-fmVLV <- fastLm(y ~ f1 * f2, dd, method=5L)
-all.equal(coef(fmSVD), coef(fmVLV))
-
-
-## section 5
-
-badtransCpp <- '
-const MapMati  A(as<MapMati>(AA));
-return wrap(A.transpose());
-'
-
-Ai <- matrix(1:6, ncol=2L)
-ftrans2 <- cxxfunction(signature(AA = "matrix"), badtransCpp, "RcppEigen", incl)
-(At <- ftrans2(Ai))
-all.equal(At, t(Ai))
-
-
-
-## section 6
-sparseProdCpp <- '
-using Eigen::MappedSparseMatrix;
-using Eigen::SparseMatrix;
-
-const MappedSparseMatrix<double>  A(as<MappedSparseMatrix<double> >(AA));
-const MapVecd                     y(as<MapVecd>(yy));
-const SparseMatrix<double>       At(A.adjoint());
-return List::create(Named("At")  = At,
-                    Named("Aty") = At * y);
-'
-
-sparse1 <- cxxfunction(signature(AA = "dgCMatrix", yy = "numeric"),
-                       sparseProdCpp, "RcppEigen", incl)
-data(KNex, package="Matrix")
-rr <- sparse1(KNex$mm, KNex$y)
-stopifnot(all.equal(rr$At, t(KNex$mm)),
-          all.equal(rr$Aty, as.vector(crossprod(KNex$mm, KNex$y))))
-
-
-sparseLSCpp <- '
-typedef Eigen::MappedSparseMatrix<double>  MSpMat;
-typedef Eigen::SparseMatrix<double>         SpMat;
-typedef Eigen::SimplicialLDLT<SpMat>       SpChol;
-
-const SpMat      At(as<MSpMat>(AA).adjoint());
-const VectorXd  Aty(At * as<MapVecd>(yy));
-const SpChol     Ch(At * At.adjoint());
-if (Ch.info() != Eigen::Success) return R_NilValue;
-return List::create(Named("betahat")  = Ch.solve(Aty),
-                    Named("perm")     = Ch.permutationP().indices());
-'
-
-sparse2 <- cxxfunction(signature(AA = "dgCMatrix", yy = "numeric"),
-                       sparseLSCpp, "RcppEigen", incl)
-str(rr <-  sparse2(KNex$mm, KNex$y))
-res <- as.vector(solve(Ch <- Cholesky(crossprod(KNex$mm)),
-                       crossprod(KNex$mm, KNex$y)))
-stopifnot(all.equal(rr$betahat, res))
-
-all(rr$perm == Ch@perm) # fill-reducing permutations are different
diff --git a/inst/doc/unitTests/RcppEigen-unitTests.R b/inst/doc/unitTests/RcppEigen-unitTests.R
deleted file mode 100644
index 06bfdb40..00000000
--- a/inst/doc/unitTests/RcppEigen-unitTests.R
+++ /dev/null
@@ -1,20 +0,0 @@
-pkg <- "RcppEigen"
-
-# load this package
-require( pkg, character.only = TRUE )
-
-#load RUnit
-runit <- "RUnit" ; require( runit, character.only = TRUE )
-if( file.exists( "unitTests-results" ) ){ unlink("unitTests-results", recursive = TRUE ) }
-dir.create( "unitTests-results" ) 
-
-path <- system.file("unitTests", package = pkg)
-testSuite <- defineTestSuite(name=paste(pkg, "unit testing"), dirs = path)
-tests <- runTestSuite(testSuite)
-printHTMLProtocol(tests, fileName= sprintf( "unitTests-results/%s-unitTests.html" , pkg ) )
-printTextProtocol(tests, fileName= sprintf( "unitTests-results/%s-unitTests.txt"  , pkg ) )
-if( file.exists( "/tmp" ) ){
-	file.copy( sprintf( "unitTests-results/%s-unitTests.txt" , pkg ) , "/tmp", overwrite = TRUE )
-	file.copy( sprintf( "unitTests-results/%s-unitTests.html", pkg ) , "/tmp", overwrite = TRUE )
-}
-
diff --git a/inst/doc/unitTests/RcppEigen-unitTests.Rnw b/inst/doc/unitTests/RcppEigen-unitTests.Rnw
deleted file mode 100644
index e69de29b..00000000
diff --git a/inst/examples/lmBenchmark.R b/inst/examples/lmBenchmark.R
index dd493756..1c07303b 100644
--- a/inst/examples/lmBenchmark.R
+++ b/inst/examples/lmBenchmark.R
@@ -1,65 +1,82 @@
 ## lmBenchmark.R: Benchmark different implementations of linear model solutions
 ##
-## Copyright (C)  2011 Douglas Bates, Dirk Eddelbuettel and Romain Francois
+## Copyright (C)  2011 - 2017  Douglas Bates, Dirk Eddelbuettel and Romain Francois
 ##
 ## This file is part of RcppEigen.
 
 require("stats", character=TRUE, quietly=TRUE)
-require("rbenchmark", character=TRUE, quietly=TRUE)
 require("RcppEigen", character=TRUE, quietly=TRUE)
 
-## define different versions of lm
-exprs <- list()
-
-## These versions use rank-revealing decompositions and thus can
-## handle rank-deficient cases.
-
-                                        # default version used in lm()
-exprs$lm.fit <- expression(stats::lm.fit(mm, y))
-                                        # versions from RcppEigen
-## column-pivoted QR decomposition - similar to lm.fit
-exprs$PivQR <- expression(.Call("fastLm", mm, y, 0L, PACKAGE="RcppEigen"))
-## LDLt Cholesky decomposition with rank detection
-exprs$LDLt <- expression(.Call("fastLm", mm, y, 2L, PACKAGE="RcppEigen"))
-## SVD using the Lapack subroutine dgesdd and Eigen support
-exprs$GESDD <- expression(.Call("fastLm", mm, y, 6L, PACKAGE="RcppEigen"))
-## SVD (the JacobiSVD class from Eigen)
-exprs$SVD <- expression(.Call("fastLm", mm, y, 4L, PACKAGE="RcppEigen"))
-## eigenvalues and eigenvectors of X'X
-exprs$SymmEig <- expression(.Call("fastLm", mm, y, 5L, PACKAGE="RcppEigen"))
-
-## Non-rank-revealing decompositions.  These work fine except when
-## they don't.
-
-## Unpivoted  QR decomposition
-exprs$QR <- expression(.Call("fastLm", mm, y, 1L, PACKAGE="RcppEigen"))
-## LLt Cholesky decomposition
-exprs$LLt <- expression(.Call("fastLm", mm, y, 3L, PACKAGE="RcppEigen"))
-
-if (suppressMessages(require("RcppArmadillo", character=TRUE, quietly=TRUE))) {
-    exprs$arma <- expression(.Call("RcppArmadillo_fastLm", mm, y, PACKAGE="RcppArmadillo"))
-}
+if(require("microbenchmark", character=TRUE, quietly=TRUE)){
 
-if (suppressMessages(require("RcppGSL", character=TRUE, quietly=TRUE))) {
-    exprs$GSL <- expression(.Call("fastLm", mm, y, PACKAGE="RcppGSL"))
-}
+    ## define different versions of lm
+    exprs <- list()
 
-do_bench <- function(n=100000L, p=40L, nrep=20L, suppressSVD=(n > 100000L)) {
-    mm <- cbind(1, matrix(rnorm(n * (p - 1L)), nc=p-1L))
-    y <- rnorm(n)
-    if (suppressSVD) exprs <- exprs[!names(exprs) %in% c("SVD", "GSL")]
-    cat("lm benchmark for n = ", n, " and p = ", p, ": nrep = ", nrep, "\n", sep='')
-    do.call(benchmark, c(exprs,
-                         list(order="relative",
-                              columns = c("test", "relative",
-                              "elapsed", "user.self", "sys.self"),
-                              replications = nrep)))
-}
+    ## These versions use rank-revealing decompositions and thus can
+    ## handle rank-deficient cases.
+
+    # default version used in lm()
+    exprs["lm.fit"] <- alist(stats::lm.fit(mm, y))
+
+    # versions from RcppEigen
+    ## column-pivoted QR decomposition - similar to lm.fit
+    exprs["PivQR"] <- alist(RcppEigen::fastLmPure(mm, y, 0L))
+    ## LDLt Cholesky decomposition with rank detection
+    exprs["LDLt"] <- alist(RcppEigen::fastLmPure(mm, y, 2L))
+    ## SVD using the Lapack subroutine dgesdd and Eigen support
+    exprs["GESDD"] <- alist(RcppEigen::fastLmPure(mm, y, 6L))
+    ## SVD (the JacobiSVD class from Eigen)
+    exprs["SVD"] <- alist(RcppEigen::fastLmPure(mm, y, 4L))
+    ## eigenvalues and eigenvectors of X'X
+    exprs["SymmEig"] <- alist(RcppEigen::fastLmPure(mm, y, 5L))
+
+    ## Non-rank-revealing decompositions.  These work fine except when
+    ## they don't.
+
+    ## Unpivoted  QR decomposition
+    exprs["QR"] <- alist(RcppEigen::fastLmPure(mm, y, 1L))
+    ## LLt Cholesky decomposition
+    exprs["LLt"] <- alist(RcppEigen::fastLmPure(mm, y, 3L))
+
+    if (suppressMessages(require("RcppArmadillo", character=TRUE, quietly=TRUE))) {
+        exprs["arma"] <- alist(RcppArmadillo::fastLmPure(mm, y))
+    }
 
-print(do_bench())
+    if (suppressMessages(require("RcppGSL", character=TRUE, quietly=TRUE))) {
+        exprs["GSL"] <- alist(RcppGSL::fastLmPure(mm, y))
+    }
 
-sessionInfo()
+    do_bench <- function(n=100000L, p=40L, nrep=20L, suppressSVD=(n > 100000L)) {
+        mm <- cbind(1, matrix(rnorm(n * (p - 1L)), nc=p-1L))
+        y <- rnorm(n)
+        if (suppressSVD) exprs <- exprs[!names(exprs) %in% c("SVD", "GSL")]
 
-.Call("eigen_version", FALSE, PACKAGE="RcppEigen")
+        cat("lm benchmark for n = ", n, " and p = ", p, ": nrep = ", nrep, "\n", sep='')
+        cat("RcppEigen: Included Eigen version", paste(RcppEigen:::eigen_version(FALSE), collapse="."), "\n")
+        cat("RcppEigen: Eigen SSE support", RcppEigen:::Eigen_SSE(), "\n")
 
-.Call("Eigen_SSE", FALSE, PACKAGE="RcppEigen")
+        mb <- microbenchmark(list=exprs, times = nrep)
+
+        op <- options(microbenchmark.unit="relative")
+        on.exit(options(op))
+
+        mb_relative <- summary(mb)
+        levels(mb_relative$expr) <- names(exprs)
+
+        options(microbenchmark.unit=NULL)
+        mb_absolute <- summary(mb)
+        levels(mb_absolute$expr) <- names(exprs)
+
+        mb_combined <- merge(mb_relative[, c("expr", "median")],
+                             mb_absolute[, c("expr", "median")],
+                             by="expr")
+
+        colnames(mb_combined) <- c("Method",
+                                   "Relative",
+                                   paste0("Elapsed (", attr(mb_absolute, "unit"), ")"))
+
+        mb_combined[order(mb_combined$Relative),]
+    }
+
+    print(do_bench())
+}
diff --git a/inst/include/Eigen/AccelerateSupport b/inst/include/Eigen/AccelerateSupport
new file mode 100644
index 00000000..533be688
--- /dev/null
+++ b/inst/include/Eigen/AccelerateSupport
@@ -0,0 +1,52 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ACCELERATESUPPORT_MODULE_H
+#define EIGEN_ACCELERATESUPPORT_MODULE_H
+
+#include "SparseCore"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** \ingroup Support_modules
+ * \defgroup AccelerateSupport_Module AccelerateSupport module
+ *
+ * This module provides an interface to the Apple Accelerate library.
+ * It provides the seven following main factorization classes:
+ * - class AccelerateLLT: a Cholesky (LL^T) factorization.
+ * - class AccelerateLDLT: the default LDL^T factorization.
+ * - class AccelerateLDLTUnpivoted: a Cholesky-like LDL^T factorization with only 1x1 pivots and no pivoting
+ * - class AccelerateLDLTSBK: an LDL^T factorization with Supernode Bunch-Kaufman and static pivoting
+ * - class AccelerateLDLTTPP: an LDL^T factorization with full threshold partial pivoting
+ * - class AccelerateQR: a QR factorization
+ * - class AccelerateCholeskyAtA: a QR factorization without storing Q (equivalent to A^TA = R^T R)
+ *
+ * \code
+ * #include <Eigen/AccelerateSupport>
+ * \endcode
+ *
+ * In order to use this module, the Accelerate headers must be accessible from
+ * the include paths, and your binary must be linked to the Accelerate framework.
+ * The Accelerate library is only available on Apple hardware.
+ *
+ * Note that many of the algorithms can be influenced by the UpLo template
+ * argument. All matrices are assumed to be symmetric. For example, the following
+ * creates an LDLT factorization where your matrix is symmetric (implicit) and
+ * uses the lower triangle:
+ *
+ * \code
+ * AccelerateLDLT<SparseMatrix<float>, Lower> ldlt;
+ * \endcode
+ */
+
+// IWYU pragma: begin_exports
+#include "src/AccelerateSupport/AccelerateSupport.h"
+// IWYU pragma: end_exports
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif  // EIGEN_ACCELERATESUPPORT_MODULE_H
diff --git a/inst/include/Eigen/Array b/inst/include/Eigen/Array
deleted file mode 100644
index 3d004fb6..00000000
--- a/inst/include/Eigen/Array
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef EIGEN_ARRAY_MODULE_H
-#define EIGEN_ARRAY_MODULE_H
-
-// include Core first to handle Eigen2 support macros
-#include "Core"
-
-#ifndef EIGEN2_SUPPORT
-  #error The Eigen/Array header does no longer exist in Eigen3. All that functionality has moved to Eigen/Core.
-#endif
-
-#endif // EIGEN_ARRAY_MODULE_H
diff --git a/inst/include/Eigen/Cholesky b/inst/include/Eigen/Cholesky
index f727f5d8..b05ed827 100644
--- a/inst/include/Eigen/Cholesky
+++ b/inst/include/Eigen/Cholesky
@@ -1,32 +1,43 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_CHOLESKY_MODULE_H
 #define EIGEN_CHOLESKY_MODULE_H
 
 #include "Core"
+#include "Jacobi"
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
 /** \defgroup Cholesky_Module Cholesky module
-  *
-  *
-  *
-  * This module provides two variants of the Cholesky decomposition for selfadjoint (hermitian) matrices.
-  * Those decompositions are accessible via the following MatrixBase methods:
-  *  - MatrixBase::llt(),
-  *  - MatrixBase::ldlt()
-  *
-  * \code
-  * #include <Eigen/Cholesky>
-  * \endcode
-  */
+ *
+ *
+ *
+ * This module provides two variants of the Cholesky decomposition for selfadjoint (hermitian) matrices.
+ * Those decompositions are also accessible via the following methods:
+ *  - MatrixBase::llt()
+ *  - MatrixBase::ldlt()
+ *  - SelfAdjointView::llt()
+ *  - SelfAdjointView::ldlt()
+ *
+ * \code
+ * #include <Eigen/Cholesky>
+ * \endcode
+ */
 
-#include "src/misc/Solve.h"
+// IWYU pragma: begin_exports
 #include "src/Cholesky/LLT.h"
 #include "src/Cholesky/LDLT.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/Cholesky/LLT_MKL.h"
+#include "src/misc/lapacke_helpers.h"
+#include "src/Cholesky/LLT_LAPACKE.h"
 #endif
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_CHOLESKY_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
+#endif  // EIGEN_CHOLESKY_MODULE_H
diff --git a/inst/include/Eigen/CholmodSupport b/inst/include/Eigen/CholmodSupport
index be327fa2..bff39e6d 100644
--- a/inst/include/Eigen/CholmodSupport
+++ b/inst/include/Eigen/CholmodSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_CHOLMODSUPPORT_MODULE_H
 #define EIGEN_CHOLMODSUPPORT_MODULE_H
 
@@ -5,41 +12,37 @@
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
-extern "C" {
-  #include <RcppEigenCholmod.h>
-}
+#include <RcppEigenCholmod.h>
 
 /** \ingroup Support_modules
-  * \defgroup CholmodSupport_Module CholmodSupport module
-  *
-  * This module provides an interface to the Cholmod library which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
-  * It provides the two following main factorization classes:
-  * - class CholmodSupernodalLLT: a supernodal LLT Cholesky factorization.
-  * - class CholmodDecomposiiton: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of the underlying factorization method (supernodal or simplicial).
-  *
-  * For the sake of completeness, this module also propose the two following classes:
-  * - class CholmodSimplicialLLT
-  * - class CholmodSimplicialLDLT
-  * Note that these classes does not bring any particular advantage compared to the built-in
-  * SimplicialLLT and SimplicialLDLT factorization classes.
-  *
-  * \code
-  * #include <Eigen/CholmodSupport>
-  * \endcode
-  *
-  * In order to use this module, the cholmod headers must be accessible from the include paths, and your binary must be linked to the cholmod library and its dependencies.
-  * The dependencies depend on how cholmod has been compiled.
-  * For a cmake based project, you can use our FindCholmod.cmake module to help you in this task.
-  *
-  */
-
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
+ * \defgroup CholmodSupport_Module CholmodSupport module
+ *
+ * This module provides an interface to the Cholmod library which is part of the <a
+ * href="http://www.suitesparse.com">suitesparse</a> package. It provides the two following main factorization classes:
+ * - class CholmodSupernodalLLT: a supernodal LLT Cholesky factorization.
+ * - class CholmodDecomposition: a general L(D)LT Cholesky factorization with automatic or explicit runtime selection of
+ * the underlying factorization method (supernodal or simplicial).
+ *
+ * For the sake of completeness, this module also propose the two following classes:
+ * - class CholmodSimplicialLLT
+ * - class CholmodSimplicialLDLT
+ * Note that these classes does not bring any particular advantage compared to the built-in
+ * SimplicialLLT and SimplicialLDLT factorization classes.
+ *
+ * \code
+ * #include <Eigen/CholmodSupport>
+ * \endcode
+ *
+ * In order to use this module, the cholmod headers must be accessible from the include paths, and your binary must be
+ * linked to the cholmod library and its dependencies. The dependencies depend on how cholmod has been compiled. For a
+ * cmake based project, you can use our FindCholmod.cmake module to help you in this task.
+ *
+ */
+
+// IWYU pragma: begin_exports
 #include "src/CholmodSupport/CholmodSupport.h"
-
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_CHOLMODSUPPORT_MODULE_H
-
+#endif  // EIGEN_CHOLMODSUPPORT_MODULE_H
diff --git a/inst/include/Eigen/Core b/inst/include/Eigen/Core
index 509c529e..34838f5d 100644
--- a/inst/include/Eigen/Core
+++ b/inst/include/Eigen/Core
@@ -8,138 +8,71 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_CORE_H
-#define EIGEN_CORE_H
+#ifndef EIGEN_CORE_MODULE_H
+#define EIGEN_CORE_MODULE_H
 
-// first thing Eigen does: stop the compiler from committing suicide
+// Eigen version information.
+#include "Version"
+
+// first thing Eigen does: stop the compiler from reporting useless warnings.
 #include "src/Core/util/DisableStupidWarnings.h"
 
 // then include this file where all our macros are defined. It's really important to do it first because
-// it's where we do all the alignment settings (platform detection and honoring the user's will if he
-// defined e.g. EIGEN_DONT_ALIGN) so it needs to be done before we do anything with vectorization.
+// it's where we do all the compiler/OS/arch detections and define most defaults.
 #include "src/Core/util/Macros.h"
 
-// Disable the ipa-cp-clone optimization flag with MinGW 6.x or newer (enabled by default with -O3)
+// This detects SSE/AVX/NEON/etc. and configure alignment settings
+#include "src/Core/util/ConfigureVectorization.h"
+
+// We need cuda_runtime.h/hip_runtime.h to ensure that
+// the EIGEN_USING_STD macro works properly on the device side
+#if defined(EIGEN_CUDACC)
+#include <cuda_runtime.h>
+#elif defined(EIGEN_HIPCC)
+#include <hip/hip_runtime.h>
+#endif
+
+#ifdef EIGEN_EXCEPTIONS
+#include <new>
+#endif
+
+// Disable the ipa-cp-clone optimization flag with MinGW 6.x or older (enabled by default with -O3)
 // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=556 for details.
-#if defined(__MINGW32__) && EIGEN_GNUC_AT_LEAST(4,6)
-  #pragma GCC optimize ("-fno-ipa-cp-clone")
+#if EIGEN_COMP_MINGW && EIGEN_GNUC_STRICT_LESS_THAN(6, 0, 0)
+#pragma GCC optimize("-fno-ipa-cp-clone")
 #endif
 
+// Prevent ICC from specializing std::complex operators that silently fail
+// on device. This allows us to use our own device-compatible specializations
+// instead.
+#if EIGEN_COMP_ICC && defined(EIGEN_GPU_COMPILE_PHASE) && !defined(_OVERRIDE_COMPLEX_SPECIALIZATION_)
+#define _OVERRIDE_COMPLEX_SPECIALIZATION_ 1
+#endif
 #include <complex>
 
 // this include file manages BLAS and MKL related macros
 // and inclusion of their respective header files
 #include "src/Core/util/MKL_support.h"
 
-// if alignment is disabled, then disable vectorization. Note: EIGEN_ALIGN is the proper check, it takes into
-// account both the user's will (EIGEN_DONT_ALIGN) and our own platform checks
-#if !EIGEN_ALIGN
-  #ifndef EIGEN_DONT_VECTORIZE
-    #define EIGEN_DONT_VECTORIZE
-  #endif
-#endif
-
-#ifdef _MSC_VER
-  #include <malloc.h> // for _aligned_malloc -- need it regardless of whether vectorization is enabled
-  #if (_MSC_VER >= 1500) // 2008 or later
-    // Remember that usage of defined() in a #define is undefined by the standard.
-    // a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
-    #if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || defined(_M_X64)
-      #define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
-    #endif
-  #endif
-#else
-  // Remember that usage of defined() in a #define is undefined by the standard
-  #if (defined __SSE2__) && ( (!defined __GNUC__) || (defined __INTEL_COMPILER) || EIGEN_GNUC_AT_LEAST(4,2) )
-    #define EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC
-  #endif
-#endif
-
-#ifndef EIGEN_DONT_VECTORIZE
-
-  #if defined (EIGEN_SSE2_ON_NON_MSVC_BUT_NOT_OLD_GCC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
-
-    // Defines symbols for compile-time detection of which instructions are
-    // used.
-    // EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_SSE
-    #define EIGEN_VECTORIZE_SSE2
-
-    // Detect sse3/ssse3/sse4:
-    // gcc and icc defines __SSE3__, ...
-    // there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
-    // want to force the use of those instructions with msvc.
-    #ifdef __SSE3__
-      #define EIGEN_VECTORIZE_SSE3
-    #endif
-    #ifdef __SSSE3__
-      #define EIGEN_VECTORIZE_SSSE3
-    #endif
-    #ifdef __SSE4_1__
-      #define EIGEN_VECTORIZE_SSE4_1
-    #endif
-    #ifdef __SSE4_2__
-      #define EIGEN_VECTORIZE_SSE4_2
-    #endif
-
-    // include files
-
-    // This extern "C" works around a MINGW-w64 compilation issue
-    // https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
-    // In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
-    // However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
-    // with conflicting linkage.  The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
-    // so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
-    // notice that since these are C headers, the extern "C" is theoretically needed anyways.
-    extern "C" {
-      // In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
-      // Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
-      #if defined(__INTEL_COMPILER) && __INTEL_COMPILER >= 1110
-        #include <immintrin.h>
-      #else
-        #include <emmintrin.h>
-        #include <xmmintrin.h>
-        #ifdef  EIGEN_VECTORIZE_SSE3
-        #include <pmmintrin.h>
-        #endif
-        #ifdef EIGEN_VECTORIZE_SSSE3
-        #include <tmmintrin.h>
-        #endif
-        #ifdef EIGEN_VECTORIZE_SSE4_1
-        #include <smmintrin.h>
-        #endif
-        #ifdef EIGEN_VECTORIZE_SSE4_2
-        #include <nmmintrin.h>
-        #endif
-      #endif
-    } // end extern "C"
-  #elif defined __ALTIVEC__
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_ALTIVEC
-    #include <altivec.h>
-    // We need to #undef all these ugly tokens defined in <altivec.h>
-    // => use __vector instead of vector
-    #undef bool
-    #undef vector
-    #undef pixel
-  #elif defined  __ARM_NEON
-    #define EIGEN_VECTORIZE
-    #define EIGEN_VECTORIZE_NEON
-    #include <arm_neon.h>
-  #endif
+#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
+#define EIGEN_HAS_GPU_FP16
+#endif
+
+#if defined(EIGEN_HAS_CUDA_BF16) || defined(EIGEN_HAS_HIP_BF16)
+#define EIGEN_HAS_GPU_BF16
 #endif
 
 #if (defined _OPENMP) && (!defined EIGEN_DONT_PARALLELIZE)
-  #define EIGEN_HAS_OPENMP
+#define EIGEN_HAS_OPENMP
 #endif
 
 #ifdef EIGEN_HAS_OPENMP
+#include <atomic>
 #include <omp.h>
 #endif
 
 // MSVC for windows mobile does not have the errno.h file
-#if !(defined(_MSC_VER) && defined(_WIN32_WCE)) && !defined(__ARMCC_VERSION)
+#if !(EIGEN_COMP_MSVC && EIGEN_OS_WINCE) && !EIGEN_COMP_ARM
 #define EIGEN_HAS_ERRNO
 #endif
 
@@ -149,183 +82,315 @@
 #include <cstddef>
 #include <cstdlib>
 #include <cmath>
-#include <cassert>
 #include <functional>
+#ifndef EIGEN_NO_IO
+#include <sstream>
 #include <iosfwd>
+#endif
 #include <cstring>
 #include <string>
 #include <limits>
-#include <climits> // for CHAR_BIT
+#include <climits>  // for CHAR_BIT
 // for min/max:
 #include <algorithm>
 
+#include <array>
+#include <memory>
+#include <vector>
+
+// for std::is_nothrow_move_assignable
+#include <type_traits>
+
+// for std::this_thread::yield().
+#if !defined(EIGEN_USE_BLAS) && (defined(EIGEN_HAS_OPENMP) || defined(EIGEN_GEMM_THREADPOOL))
+#include <thread>
+#endif
+
+// for __cpp_lib feature test macros
+#if defined(__has_include) && __has_include(<version>)
+#include <version>
+#endif
+
+// for std::bit_cast()
+#if defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L
+#include <bit>
+#endif
+
 // for outputting debug info
 #ifdef EIGEN_DEBUG_ASSIGN
 #include <iostream>
 #endif
 
 // required for __cpuid, needs to be included after cmath
-#if defined(_MSC_VER) && (defined(_M_IX86)||defined(_M_X64)) && (!defined(_WIN32_WCE))
-  #include <intrin.h>
+// also required for _BitScanReverse on Windows on ARM
+#if EIGEN_COMP_MSVC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM64) && !EIGEN_OS_WINCE
+#include <intrin.h>
 #endif
 
-#if defined(_CPPUNWIND) || defined(__EXCEPTIONS)
-  #define EIGEN_EXCEPTIONS
+#if defined(EIGEN_USE_SYCL)
+#undef min
+#undef max
+#undef isnan
+#undef isinf
+#undef isfinite
+#include <CL/sycl.hpp>
+#include <map>
+#include <thread>
+#include <utility>
+#ifndef EIGEN_SYCL_LOCAL_THREAD_DIM0
+#define EIGEN_SYCL_LOCAL_THREAD_DIM0 16
 #endif
-
-#ifdef EIGEN_EXCEPTIONS
-  #include <new>
+#ifndef EIGEN_SYCL_LOCAL_THREAD_DIM1
+#define EIGEN_SYCL_LOCAL_THREAD_DIM1 16
 #endif
-
-/** \brief Namespace containing all symbols from the %Eigen library. */
-namespace Eigen {
-
-inline static const char *SimdInstructionSetsInUse(void) {
-#if defined(EIGEN_VECTORIZE_SSE4_2)
-  return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
-#elif defined(EIGEN_VECTORIZE_SSE4_1)
-  return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
-#elif defined(EIGEN_VECTORIZE_SSSE3)
-  return "SSE, SSE2, SSE3, SSSE3";
-#elif defined(EIGEN_VECTORIZE_SSE3)
-  return "SSE, SSE2, SSE3";
-#elif defined(EIGEN_VECTORIZE_SSE2)
-  return "SSE, SSE2";
-#elif defined(EIGEN_VECTORIZE_ALTIVEC)
-  return "AltiVec";
-#elif defined(EIGEN_VECTORIZE_NEON)
-  return "ARM NEON";
-#else
-  return "None";
-#endif
-}
-
-} // end namespace Eigen
-
-#define STAGE10_FULL_EIGEN2_API             10
-#define STAGE20_RESOLVE_API_CONFLICTS       20
-#define STAGE30_FULL_EIGEN3_API             30
-#define STAGE40_FULL_EIGEN3_STRICTNESS      40
-#define STAGE99_NO_EIGEN2_SUPPORT           99
-
-#if   defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE40_FULL_EIGEN3_STRICTNESS
-#elif defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE30_FULL_EIGEN3_API
-#elif defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE20_RESOLVE_API_CONFLICTS
-#elif defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API
-  #define EIGEN2_SUPPORT
-  #define EIGEN2_SUPPORT_STAGE STAGE10_FULL_EIGEN2_API
-#elif defined EIGEN2_SUPPORT
-  // default to stage 3, that's what it's always meant
-  #define EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API
-  #define EIGEN2_SUPPORT_STAGE STAGE30_FULL_EIGEN3_API
-#else
-  #define EIGEN2_SUPPORT_STAGE STAGE99_NO_EIGEN2_SUPPORT
 #endif
 
-#ifdef EIGEN2_SUPPORT
-#undef minor
+#if defined EIGEN2_SUPPORT_STAGE40_FULL_EIGEN3_STRICTNESS || defined EIGEN2_SUPPORT_STAGE30_FULL_EIGEN3_API || \
+    defined EIGEN2_SUPPORT_STAGE20_RESOLVE_API_CONFLICTS || defined EIGEN2_SUPPORT_STAGE10_FULL_EIGEN2_API ||  \
+    defined EIGEN2_SUPPORT
+// This will generate an error message:
+#error Eigen2-support is only available up to version 3.2. Please go to "http://eigen.tuxfamily.org/index.php?title=Eigen2" for further information
 #endif
 
-// we use size_t frequently and we'll never remember to prepend it with std:: everytime just to
+namespace Eigen {
+
+// we use size_t frequently and we'll never remember to prepend it with std:: every time just to
 // ensure QNX/QCC support
 using std::size_t;
-// gcc 4.6.0 wants std:: for ptrdiff_t 
+// gcc 4.6.0 wants std:: for ptrdiff_t
 using std::ptrdiff_t;
 
+}  // namespace Eigen
+
 /** \defgroup Core_Module Core module
-  * This is the main module of Eigen providing dense matrix and vector support
-  * (both fixed and dynamic size) with all the features corresponding to a BLAS library
-  * and much more...
-  *
-  * \code
-  * #include <Eigen/Core>
-  * \endcode
-  */
+ * This is the main module of Eigen providing dense matrix and vector support
+ * (both fixed and dynamic size) with all the features corresponding to a BLAS library
+ * and much more...
+ *
+ * \code
+ * #include <Eigen/Core>
+ * \endcode
+ */
+
+#ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
+#include "src/misc/lapacke.h"
+#endif
+#endif
 
+// IWYU pragma: begin_exports
 #include "src/Core/util/Constants.h"
-#include "src/Core/util/ForwardDeclarations.h"
 #include "src/Core/util/Meta.h"
+#include "src/Core/util/Assert.h"
+#include "src/Core/util/ForwardDeclarations.h"
 #include "src/Core/util/StaticAssert.h"
 #include "src/Core/util/XprHelper.h"
 #include "src/Core/util/Memory.h"
+#include "src/Core/util/IntegralConstant.h"
+#include "src/Core/util/Serializer.h"
+#include "src/Core/util/SymbolicIndex.h"
+#include "src/Core/util/EmulateArray.h"
+#include "src/Core/util/MoreMeta.h"
 
 #include "src/Core/NumTraits.h"
 #include "src/Core/MathFunctions.h"
+#include "src/Core/RandomImpl.h"
 #include "src/Core/GenericPacketMath.h"
+#include "src/Core/MathFunctionsImpl.h"
+#include "src/Core/arch/Default/ConjHelper.h"
+// Generic half float support
+#include "src/Core/arch/Default/Half.h"
+#include "src/Core/arch/Default/BFloat16.h"
+#include "src/Core/arch/Default/GenericPacketMathFunctionsFwd.h"
+
+#if defined EIGEN_VECTORIZE_AVX512
+#include "src/Core/arch/SSE/PacketMath.h"
+#include "src/Core/arch/SSE/Reductions.h"
+#include "src/Core/arch/AVX/PacketMath.h"
+#include "src/Core/arch/AVX/Reductions.h"
+#include "src/Core/arch/AVX512/PacketMath.h"
+#include "src/Core/arch/AVX512/Reductions.h"
+#if defined EIGEN_VECTORIZE_AVX512FP16
+#include "src/Core/arch/AVX512/PacketMathFP16.h"
+#endif
+#include "src/Core/arch/SSE/TypeCasting.h"
+#include "src/Core/arch/AVX/TypeCasting.h"
+#include "src/Core/arch/AVX512/TypeCasting.h"
+#if defined EIGEN_VECTORIZE_AVX512FP16
+#include "src/Core/arch/AVX512/TypeCastingFP16.h"
+#endif
+#include "src/Core/arch/SSE/Complex.h"
+#include "src/Core/arch/AVX/Complex.h"
+#include "src/Core/arch/AVX512/Complex.h"
+#include "src/Core/arch/SSE/MathFunctions.h"
+#include "src/Core/arch/AVX/MathFunctions.h"
+#include "src/Core/arch/AVX512/MathFunctions.h"
+#if defined EIGEN_VECTORIZE_AVX512FP16
+#include "src/Core/arch/AVX512/MathFunctionsFP16.h"
+#endif
+#include "src/Core/arch/AVX512/TrsmKernel.h"
+#elif defined EIGEN_VECTORIZE_AVX
+// Use AVX for floats and doubles, SSE for integers
+#include "src/Core/arch/SSE/PacketMath.h"
+#include "src/Core/arch/SSE/Reductions.h"
+#include "src/Core/arch/SSE/TypeCasting.h"
+#include "src/Core/arch/SSE/Complex.h"
+#include "src/Core/arch/AVX/PacketMath.h"
+#include "src/Core/arch/AVX/Reductions.h"
+#include "src/Core/arch/AVX/TypeCasting.h"
+#include "src/Core/arch/AVX/Complex.h"
+#include "src/Core/arch/SSE/MathFunctions.h"
+#include "src/Core/arch/AVX/MathFunctions.h"
+#elif defined EIGEN_VECTORIZE_SSE
+#include "src/Core/arch/SSE/PacketMath.h"
+#include "src/Core/arch/SSE/Reductions.h"
+#include "src/Core/arch/SSE/TypeCasting.h"
+#include "src/Core/arch/SSE/MathFunctions.h"
+#include "src/Core/arch/SSE/Complex.h"
+#endif
 
-#if defined EIGEN_VECTORIZE_SSE
-  #include "src/Core/arch/SSE/PacketMath.h"
-  #include "src/Core/arch/SSE/MathFunctions.h"
-  #include "src/Core/arch/SSE/Complex.h"
-#elif defined EIGEN_VECTORIZE_ALTIVEC
-  #include "src/Core/arch/AltiVec/PacketMath.h"
-  #include "src/Core/arch/AltiVec/Complex.h"
+#if defined(EIGEN_VECTORIZE_ALTIVEC) || defined(EIGEN_VECTORIZE_VSX)
+#include "src/Core/arch/AltiVec/PacketMath.h"
+#include "src/Core/arch/AltiVec/TypeCasting.h"
+#include "src/Core/arch/AltiVec/MathFunctions.h"
+#include "src/Core/arch/AltiVec/Complex.h"
 #elif defined EIGEN_VECTORIZE_NEON
-  #include "src/Core/arch/NEON/PacketMath.h"
-  #include "src/Core/arch/NEON/Complex.h"
+#include "src/Core/arch/NEON/PacketMath.h"
+#include "src/Core/arch/NEON/TypeCasting.h"
+#include "src/Core/arch/NEON/MathFunctions.h"
+#include "src/Core/arch/NEON/Complex.h"
+#elif defined EIGEN_VECTORIZE_LSX
+#include "src/Core/arch/LSX/PacketMath.h"
+#include "src/Core/arch/LSX/TypeCasting.h"
+#include "src/Core/arch/LSX/MathFunctions.h"
+#include "src/Core/arch/LSX/Complex.h"
+#elif defined EIGEN_VECTORIZE_SVE
+#include "src/Core/arch/SVE/PacketMath.h"
+#include "src/Core/arch/SVE/TypeCasting.h"
+#include "src/Core/arch/SVE/MathFunctions.h"
+#elif defined EIGEN_VECTORIZE_ZVECTOR
+#include "src/Core/arch/ZVector/PacketMath.h"
+#include "src/Core/arch/ZVector/MathFunctions.h"
+#include "src/Core/arch/ZVector/Complex.h"
+#elif defined EIGEN_VECTORIZE_MSA
+#include "src/Core/arch/MSA/PacketMath.h"
+#include "src/Core/arch/MSA/MathFunctions.h"
+#include "src/Core/arch/MSA/Complex.h"
+#elif defined EIGEN_VECTORIZE_HVX
+#include "src/Core/arch/HVX/PacketMath.h"
+#endif
+
+#if defined EIGEN_VECTORIZE_GPU
+#include "src/Core/arch/GPU/PacketMath.h"
+#include "src/Core/arch/GPU/MathFunctions.h"
+#include "src/Core/arch/GPU/TypeCasting.h"
+#endif
+
+#if defined(EIGEN_USE_SYCL)
+#include "src/Core/arch/SYCL/InteropHeaders.h"
+#if !defined(EIGEN_DONT_VECTORIZE_SYCL)
+#include "src/Core/arch/SYCL/PacketMath.h"
+#include "src/Core/arch/SYCL/MathFunctions.h"
+#include "src/Core/arch/SYCL/TypeCasting.h"
+#endif
 #endif
 
 #include "src/Core/arch/Default/Settings.h"
+// This file provides generic implementations valid for scalar as well
+#include "src/Core/arch/Default/GenericPacketMathFunctions.h"
+
+#include "src/Core/functors/TernaryFunctors.h"
+#include "src/Core/functors/BinaryFunctors.h"
+#include "src/Core/functors/UnaryFunctors.h"
+#include "src/Core/functors/NullaryFunctors.h"
+#include "src/Core/functors/StlFunctors.h"
+#include "src/Core/functors/AssignmentFunctors.h"
+
+// Specialized functors for GPU.
+#ifdef EIGEN_GPUCC
+#include "src/Core/arch/GPU/Complex.h"
+#endif
 
-#include "src/Core/Functors.h"
+// Specializations of vectorized activation functions for NEON.
+#ifdef EIGEN_VECTORIZE_NEON
+#include "src/Core/arch/NEON/UnaryFunctors.h"
+#endif
+
+#include "src/Core/util/IndexedViewHelper.h"
+#include "src/Core/util/ReshapedHelper.h"
+#include "src/Core/ArithmeticSequence.h"
+#ifndef EIGEN_NO_IO
+#include "src/Core/IO.h"
+#endif
 #include "src/Core/DenseCoeffsBase.h"
 #include "src/Core/DenseBase.h"
 #include "src/Core/MatrixBase.h"
 #include "src/Core/EigenBase.h"
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN // work around Doxygen bug triggered by Assign.h r814874
-                                // at least confirmed with Doxygen 1.5.5 and 1.5.6
-  #include "src/Core/Assign.h"
-#endif
+#include "src/Core/Product.h"
+#include "src/Core/CoreEvaluators.h"
+#include "src/Core/AssignEvaluator.h"
+#include "src/Core/RealView.h"
+#include "src/Core/Assign.h"
 
+#include "src/Core/ArrayBase.h"
 #include "src/Core/util/BlasUtil.h"
 #include "src/Core/DenseStorage.h"
 #include "src/Core/NestByValue.h"
-#include "src/Core/ForceAlignedAccess.h"
+
+// #include "src/Core/ForceAlignedAccess.h"
+
 #include "src/Core/ReturnByValue.h"
 #include "src/Core/NoAlias.h"
 #include "src/Core/PlainObjectBase.h"
 #include "src/Core/Matrix.h"
 #include "src/Core/Array.h"
+#include "src/Core/Fill.h"
+#include "src/Core/CwiseTernaryOp.h"
 #include "src/Core/CwiseBinaryOp.h"
 #include "src/Core/CwiseUnaryOp.h"
 #include "src/Core/CwiseNullaryOp.h"
 #include "src/Core/CwiseUnaryView.h"
 #include "src/Core/SelfCwiseBinaryOp.h"
+#include "src/Core/InnerProduct.h"
 #include "src/Core/Dot.h"
 #include "src/Core/StableNorm.h"
-#include "src/Core/MapBase.h"
 #include "src/Core/Stride.h"
+#include "src/Core/MapBase.h"
 #include "src/Core/Map.h"
+#include "src/Core/Ref.h"
 #include "src/Core/Block.h"
 #include "src/Core/VectorBlock.h"
-#include "src/Core/Ref.h"
+#include "src/Core/IndexedView.h"
+#include "src/Core/Reshaped.h"
 #include "src/Core/Transpose.h"
 #include "src/Core/DiagonalMatrix.h"
 #include "src/Core/Diagonal.h"
 #include "src/Core/DiagonalProduct.h"
-#include "src/Core/PermutationMatrix.h"
-#include "src/Core/Transpositions.h"
+#include "src/Core/SkewSymmetricMatrix3.h"
 #include "src/Core/Redux.h"
 #include "src/Core/Visitor.h"
+#include "src/Core/FindCoeff.h"
 #include "src/Core/Fuzzy.h"
-#include "src/Core/IO.h"
 #include "src/Core/Swap.h"
 #include "src/Core/CommaInitializer.h"
-#include "src/Core/Flagged.h"
-#include "src/Core/ProductBase.h"
 #include "src/Core/GeneralProduct.h"
+#include "src/Core/Solve.h"
+#include "src/Core/Inverse.h"
+#include "src/Core/SolverBase.h"
+#include "src/Core/PermutationMatrix.h"
+#include "src/Core/Transpositions.h"
 #include "src/Core/TriangularMatrix.h"
 #include "src/Core/SelfAdjointView.h"
 #include "src/Core/products/GeneralBlockPanelKernel.h"
+#include "src/Core/DeviceWrapper.h"
+#ifdef EIGEN_GEMM_THREADPOOL
+#include "ThreadPool"
+#endif
 #include "src/Core/products/Parallelizer.h"
-#include "src/Core/products/CoeffBasedProduct.h"
+#include "src/Core/ProductEvaluators.h"
 #include "src/Core/products/GeneralMatrixVector.h"
 #include "src/Core/products/GeneralMatrixMatrix.h"
 #include "src/Core/SolveTriangular.h"
@@ -340,37 +405,47 @@ using std::ptrdiff_t;
 #include "src/Core/products/TriangularSolverVector.h"
 #include "src/Core/BandMatrix.h"
 #include "src/Core/CoreIterators.h"
+#include "src/Core/ConditionEstimator.h"
+
+#if defined(EIGEN_VECTORIZE_VSX)
+#include "src/Core/arch/AltiVec/MatrixProduct.h"
+#elif defined EIGEN_VECTORIZE_NEON
+#include "src/Core/arch/NEON/GeneralBlockPanelKernel.h"
+#elif defined EIGEN_VECTORIZE_LSX
+#include "src/Core/arch/LSX/GeneralBlockPanelKernel.h"
+#endif
+
+#if defined(EIGEN_VECTORIZE_AVX512)
+#include "src/Core/arch/AVX512/GemmKernel.h"
+#endif
 
-#include "src/Core/BooleanRedux.h"
 #include "src/Core/Select.h"
 #include "src/Core/VectorwiseOp.h"
+#include "src/Core/PartialReduxEvaluator.h"
 #include "src/Core/Random.h"
 #include "src/Core/Replicate.h"
 #include "src/Core/Reverse.h"
-#include "src/Core/ArrayBase.h"
 #include "src/Core/ArrayWrapper.h"
+#include "src/Core/StlIterators.h"
 
 #ifdef EIGEN_USE_BLAS
-#include "src/Core/products/GeneralMatrixMatrix_MKL.h"
-#include "src/Core/products/GeneralMatrixVector_MKL.h"
-#include "src/Core/products/GeneralMatrixMatrixTriangular_MKL.h"
-#include "src/Core/products/SelfadjointMatrixMatrix_MKL.h"
-#include "src/Core/products/SelfadjointMatrixVector_MKL.h"
-#include "src/Core/products/TriangularMatrixMatrix_MKL.h"
-#include "src/Core/products/TriangularMatrixVector_MKL.h"
-#include "src/Core/products/TriangularSolverMatrix_MKL.h"
-#endif // EIGEN_USE_BLAS
+#include "src/Core/products/GeneralMatrixMatrix_BLAS.h"
+#include "src/Core/products/GeneralMatrixVector_BLAS.h"
+#include "src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h"
+#include "src/Core/products/SelfadjointMatrixMatrix_BLAS.h"
+#include "src/Core/products/SelfadjointMatrixVector_BLAS.h"
+#include "src/Core/products/TriangularMatrixMatrix_BLAS.h"
+#include "src/Core/products/TriangularMatrixVector_BLAS.h"
+#include "src/Core/products/TriangularSolverMatrix_BLAS.h"
+#endif  // EIGEN_USE_BLAS
 
 #ifdef EIGEN_USE_MKL_VML
 #include "src/Core/Assign_MKL.h"
 #endif
 
 #include "src/Core/GlobalFunctions.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#ifdef EIGEN2_SUPPORT
-#include "Eigen2Support"
-#endif
-
-#endif // EIGEN_CORE_H
+#endif  // EIGEN_CORE_MODULE_H
diff --git a/inst/include/Eigen/Eigen b/inst/include/Eigen/Eigen
index 19b40ea4..654c8dc6 100644
--- a/inst/include/Eigen/Eigen
+++ b/inst/include/Eigen/Eigen
@@ -1,2 +1,2 @@
 #include "Dense"
-//#include "Sparse"
+#include "Sparse"
diff --git a/inst/include/Eigen/Eigen2Support b/inst/include/Eigen/Eigen2Support
deleted file mode 100644
index 6aa009d2..00000000
--- a/inst/include/Eigen/Eigen2Support
+++ /dev/null
@@ -1,95 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2SUPPORT_H
-#define EIGEN2SUPPORT_H
-
-#if (!defined(EIGEN2_SUPPORT)) || (!defined(EIGEN_CORE_H))
-#error Eigen2 support must be enabled by defining EIGEN2_SUPPORT before including any Eigen header
-#endif
-
-#ifndef EIGEN_NO_EIGEN2_DEPRECATED_WARNING
-
-#if defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
-#warning "Eigen2 support is deprecated in Eigen 3.2.x and it will be removed in Eigen 3.3. (Define EIGEN_NO_EIGEN2_DEPRECATED_WARNING to disable this warning)"
-#else
-#pragma message ("Eigen2 support is deprecated in Eigen 3.2.x and it will be removed in Eigen 3.3. (Define EIGEN_NO_EIGEN2_DEPRECATED_WARNING to disable this warning)")
-#endif
-
-#endif // EIGEN_NO_EIGEN2_DEPRECATED_WARNING
-
-#include "src/Core/util/DisableStupidWarnings.h"
-
-/** \ingroup Support_modules
-  * \defgroup Eigen2Support_Module Eigen2 support module
-  *
-  * \warning Eigen2 support is deprecated in Eigen 3.2.x and it will be removed in Eigen 3.3.
-  *
-  * This module provides a couple of deprecated functions improving the compatibility with Eigen2.
-  * 
-  * To use it, define EIGEN2_SUPPORT before including any Eigen header
-  * \code
-  * #define EIGEN2_SUPPORT
-  * \endcode
-  *
-  */
-
-#include "src/Eigen2Support/Macros.h"
-#include "src/Eigen2Support/Memory.h"
-#include "src/Eigen2Support/Meta.h"
-#include "src/Eigen2Support/Lazy.h"
-#include "src/Eigen2Support/Cwise.h"
-#include "src/Eigen2Support/CwiseOperators.h"
-#include "src/Eigen2Support/TriangularSolver.h"
-#include "src/Eigen2Support/Block.h"
-#include "src/Eigen2Support/VectorBlock.h"
-#include "src/Eigen2Support/Minor.h"
-#include "src/Eigen2Support/MathFunctions.h"
-
-
-#include "src/Core/util/ReenableStupidWarnings.h"
-
-// Eigen2 used to include iostream
-#include<iostream>
-
-#define EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \
-using Eigen::Matrix##SizeSuffix##TypeSuffix; \
-using Eigen::Vector##SizeSuffix##TypeSuffix; \
-using Eigen::RowVector##SizeSuffix##TypeSuffix;
-
-#define EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(TypeSuffix) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 2) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 3) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 4) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, X) \
-
-#define EIGEN_USING_MATRIX_TYPEDEFS \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(i) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(f) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(d) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(cf) \
-EIGEN_USING_MATRIX_TYPEDEFS_FOR_TYPE(cd)
-
-#define USING_PART_OF_NAMESPACE_EIGEN \
-EIGEN_USING_MATRIX_TYPEDEFS \
-using Eigen::Matrix; \
-using Eigen::MatrixBase; \
-using Eigen::ei_random; \
-using Eigen::ei_real; \
-using Eigen::ei_imag; \
-using Eigen::ei_conj; \
-using Eigen::ei_abs; \
-using Eigen::ei_abs2; \
-using Eigen::ei_sqrt; \
-using Eigen::ei_exp; \
-using Eigen::ei_log; \
-using Eigen::ei_sin; \
-using Eigen::ei_cos;
-
-#endif // EIGEN2SUPPORT_H
diff --git a/inst/include/Eigen/Eigenvalues b/inst/include/Eigen/Eigenvalues
index 53c5a73a..3b0bdee1 100644
--- a/inst/include/Eigen/Eigenvalues
+++ b/inst/include/Eigen/Eigenvalues
@@ -1,30 +1,40 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_EIGENVALUES_MODULE_H
 #define EIGEN_EIGENVALUES_MODULE_H
 
 #include "Core"
 
-#include "src/Core/util/DisableStupidWarnings.h"
-
 #include "Cholesky"
 #include "Jacobi"
 #include "Householder"
 #include "LU"
 #include "Geometry"
 
+#include "src/Core/util/DisableStupidWarnings.h"
+
 /** \defgroup Eigenvalues_Module Eigenvalues module
-  *
-  *
-  *
-  * This module mainly provides various eigenvalue solvers.
-  * This module also provides some MatrixBase methods, including:
-  *  - MatrixBase::eigenvalues(),
-  *  - MatrixBase::operatorNorm()
-  *
-  * \code
-  * #include <Eigen/Eigenvalues>
-  * \endcode
-  */
+ *
+ *
+ *
+ * This module mainly provides various eigenvalue solvers.
+ * This module also provides some MatrixBase methods, including:
+ *  - MatrixBase::eigenvalues(),
+ *  - MatrixBase::operatorNorm()
+ *
+ * \code
+ * #include <Eigen/Eigenvalues>
+ * \endcode
+ */
+
+#include "src/misc/RealSvd2x2.h"
 
+// IWYU pragma: begin_exports
 #include "src/Eigenvalues/Tridiagonalization.h"
 #include "src/Eigenvalues/RealSchur.h"
 #include "src/Eigenvalues/EigenSolver.h"
@@ -37,12 +47,17 @@
 #include "src/Eigenvalues/GeneralizedEigenSolver.h"
 #include "src/Eigenvalues/MatrixBaseEigenvalues.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/Eigenvalues/RealSchur_MKL.h"
-#include "src/Eigenvalues/ComplexSchur_MKL.h"
-#include "src/Eigenvalues/SelfAdjointEigenSolver_MKL.h"
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
+#include "src/misc/lapacke.h"
+#endif
+#include "src/Eigenvalues/RealSchur_LAPACKE.h"
+#include "src/Eigenvalues/ComplexSchur_LAPACKE.h"
+#include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h"
 #endif
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_EIGENVALUES_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
+#endif  // EIGEN_EIGENVALUES_MODULE_H
diff --git a/inst/include/Eigen/Geometry b/inst/include/Eigen/Geometry
index efd9d450..efe3e1fa 100644
--- a/inst/include/Eigen/Geometry
+++ b/inst/include/Eigen/Geometry
@@ -1,63 +1,59 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_GEOMETRY_MODULE_H
 #define EIGEN_GEOMETRY_MODULE_H
 
 #include "Core"
 
-#include "src/Core/util/DisableStupidWarnings.h"
-
 #include "SVD"
 #include "LU"
 #include <limits>
 
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
+#include "src/Core/util/DisableStupidWarnings.h"
 
 /** \defgroup Geometry_Module Geometry module
-  *
-  *
-  *
-  * This module provides support for:
-  *  - fixed-size homogeneous transformations
-  *  - translation, scaling, 2D and 3D rotations
-  *  - quaternions
-  *  - \ref MatrixBase::cross() "cross product"
-  *  - \ref MatrixBase::unitOrthogonal() "orthognal vector generation"
-  *  - some linear components: parametrized-lines and hyperplanes
-  *
-  * \code
-  * #include <Eigen/Geometry>
-  * \endcode
-  */
-
+ *
+ * This module provides support for:
+ *  - fixed-size homogeneous transformations
+ *  - translation, scaling, 2D and 3D rotations
+ *  - \link Quaternion quaternions \endlink
+ *  - cross products (\ref MatrixBase::cross(), \ref MatrixBase::cross3())
+ *  - orthogonal vector generation (MatrixBase::unitOrthogonal)
+ *  - some linear components: \link ParametrizedLine parametrized-lines \endlink and \link Hyperplane hyperplanes \endlink
+ *  - \link AlignedBox axis aligned bounding boxes \endlink
+ *  - \link umeyama() least-square transformation fitting \endlink
+ * \code
+ * #include <Eigen/Geometry>
+ * \endcode
+ */
+
+// IWYU pragma: begin_exports
 #include "src/Geometry/OrthoMethods.h"
 #include "src/Geometry/EulerAngles.h"
-
-#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
-  #include "src/Geometry/Homogeneous.h"
-  #include "src/Geometry/RotationBase.h"
-  #include "src/Geometry/Rotation2D.h"
-  #include "src/Geometry/Quaternion.h"
-  #include "src/Geometry/AngleAxis.h"
-  #include "src/Geometry/Transform.h"
-  #include "src/Geometry/Translation.h"
-  #include "src/Geometry/Scaling.h"
-  #include "src/Geometry/Hyperplane.h"
-  #include "src/Geometry/ParametrizedLine.h"
-  #include "src/Geometry/AlignedBox.h"
-  #include "src/Geometry/Umeyama.h"
-
-  #if defined EIGEN_VECTORIZE_SSE
-    #include "src/Geometry/arch/Geometry_SSE.h"
-  #endif
-#endif
-
-#ifdef EIGEN2_SUPPORT
-#include "src/Eigen2Support/Geometry/All.h"
+#include "src/Geometry/Homogeneous.h"
+#include "src/Geometry/RotationBase.h"
+#include "src/Geometry/Rotation2D.h"
+#include "src/Geometry/Quaternion.h"
+#include "src/Geometry/AngleAxis.h"
+#include "src/Geometry/Transform.h"
+#include "src/Geometry/Translation.h"
+#include "src/Geometry/Scaling.h"
+#include "src/Geometry/Hyperplane.h"
+#include "src/Geometry/ParametrizedLine.h"
+#include "src/Geometry/AlignedBox.h"
+#include "src/Geometry/Umeyama.h"
+
+// Use the SSE optimized version whenever possible.
+#if (defined EIGEN_VECTORIZE_SSE) || (defined EIGEN_VECTORIZE_NEON)
+#include "src/Geometry/arch/Geometry_SIMD.h"
 #endif
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_GEOMETRY_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
-
+#endif  // EIGEN_GEOMETRY_MODULE_H
diff --git a/inst/include/Eigen/Householder b/inst/include/Eigen/Householder
index 6e348db5..5070e070 100644
--- a/inst/include/Eigen/Householder
+++ b/inst/include/Eigen/Householder
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_HOUSEHOLDER_MODULE_H
 #define EIGEN_HOUSEHOLDER_MODULE_H
 
@@ -6,18 +13,19 @@
 #include "src/Core/util/DisableStupidWarnings.h"
 
 /** \defgroup Householder_Module Householder module
-  * This module provides Householder transformations.
-  *
-  * \code
-  * #include <Eigen/Householder>
-  * \endcode
-  */
+ * This module provides Householder transformations.
+ *
+ * \code
+ * #include <Eigen/Householder>
+ * \endcode
+ */
 
+// IWYU pragma: begin_exports
 #include "src/Householder/Householder.h"
 #include "src/Householder/HouseholderSequence.h"
 #include "src/Householder/BlockHouseholder.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_HOUSEHOLDER_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
+#endif  // EIGEN_HOUSEHOLDER_MODULE_H
diff --git a/inst/include/Eigen/IterativeLinearSolvers b/inst/include/Eigen/IterativeLinearSolvers
index 0f4159dc..fe5159e9 100644
--- a/inst/include/Eigen/IterativeLinearSolvers
+++ b/inst/include/Eigen/IterativeLinearSolvers
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
 #define EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
 
@@ -6,35 +13,40 @@
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
-/** 
+/**
   * \defgroup IterativeLinearSolvers_Module IterativeLinearSolvers module
   *
-  * This module currently provides iterative methods to solve problems of the form \c A \c x = \c b, where \c A is a squared matrix, usually very large and sparse.
+  * This module currently provides iterative methods to solve problems of the form \c A \c x = \c b, where \c A is a
+  squared matrix, usually very large and sparse.
   * Those solvers are accessible via the following classes:
   *  - ConjugateGradient for selfadjoint (hermitian) matrices,
+  *  - LeastSquaresConjugateGradient for rectangular least-square problems,
   *  - BiCGSTAB for general square matrices.
   *
   * These iterative solvers are associated with some preconditioners:
   *  - IdentityPreconditioner - not really useful
-  *  - DiagonalPreconditioner - also called JAcobi preconditioner, work very well on diagonal dominant matrices.
-  *  - IncompleteILUT - incomplete LU factorization with dual thresholding
+  *  - DiagonalPreconditioner - also called Jacobi preconditioner, work very well on diagonal dominant matrices.
+  *  - IncompleteLUT - incomplete LU factorization with dual thresholding
   *
-  * Such problems can also be solved using the direct sparse decomposition modules: SparseCholesky, CholmodSupport, UmfPackSupport, SuperLUSupport.
+  * Such problems can also be solved using the direct sparse decomposition modules: SparseCholesky, CholmodSupport,
+  UmfPackSupport, SuperLUSupport, AccelerateSupport.
   *
-  * \code
-  * #include <Eigen/IterativeLinearSolvers>
-  * \endcode
+    \code
+    #include <Eigen/IterativeLinearSolvers>
+    \endcode
   */
 
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
+// IWYU pragma: begin_exports
+#include "src/IterativeLinearSolvers/SolveWithGuess.h"
 #include "src/IterativeLinearSolvers/IterativeSolverBase.h"
 #include "src/IterativeLinearSolvers/BasicPreconditioners.h"
 #include "src/IterativeLinearSolvers/ConjugateGradient.h"
+#include "src/IterativeLinearSolvers/LeastSquareConjugateGradient.h"
 #include "src/IterativeLinearSolvers/BiCGSTAB.h"
 #include "src/IterativeLinearSolvers/IncompleteLUT.h"
+#include "src/IterativeLinearSolvers/IncompleteCholesky.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
+#endif  // EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
diff --git a/inst/include/Eigen/Jacobi b/inst/include/Eigen/Jacobi
index ba8a4dc3..31eb36a7 100644
--- a/inst/include/Eigen/Jacobi
+++ b/inst/include/Eigen/Jacobi
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_JACOBI_MODULE_H
 #define EIGEN_JACOBI_MODULE_H
 
@@ -6,21 +13,21 @@
 #include "src/Core/util/DisableStupidWarnings.h"
 
 /** \defgroup Jacobi_Module Jacobi module
-  * This module provides Jacobi and Givens rotations.
-  *
-  * \code
-  * #include <Eigen/Jacobi>
-  * \endcode
-  *
-  * In addition to listed classes, it defines the two following MatrixBase methods to apply a Jacobi or Givens rotation:
-  *  - MatrixBase::applyOnTheLeft()
-  *  - MatrixBase::applyOnTheRight().
-  */
+ * This module provides Jacobi and Givens rotations.
+ *
+ * \code
+ * #include <Eigen/Jacobi>
+ * \endcode
+ *
+ * In addition to listed classes, it defines the two following MatrixBase methods to apply a Jacobi or Givens rotation:
+ *  - MatrixBase::applyOnTheLeft()
+ *  - MatrixBase::applyOnTheRight().
+ */
 
+// IWYU pragma: begin_exports
 #include "src/Jacobi/Jacobi.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_JACOBI_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
-
+#endif  // EIGEN_JACOBI_MODULE_H
diff --git a/inst/include/Eigen/KLUSupport b/inst/include/Eigen/KLUSupport
new file mode 100644
index 00000000..13959a3c
--- /dev/null
+++ b/inst/include/Eigen/KLUSupport
@@ -0,0 +1,43 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_KLUSUPPORT_MODULE_H
+#define EIGEN_KLUSUPPORT_MODULE_H
+
+#include "SparseCore"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+extern "C" {
+#include <btf.h>
+#include <klu.h>
+}
+
+/** \ingroup Support_modules
+ * \defgroup KLUSupport_Module KLUSupport module
+ *
+ * This module provides an interface to the KLU library which is part of the <a
+ * href="http://www.suitesparse.com">suitesparse</a> package. It provides the following factorization class:
+ * - class KLU: a sparse LU factorization, well-suited for circuit simulation.
+ *
+ * \code
+ * #include <Eigen/KLUSupport>
+ * \endcode
+ *
+ * In order to use this module, the klu and btf headers must be accessible from the include paths, and your binary must
+ * be linked to the klu library and its dependencies. The dependencies depend on how umfpack has been compiled. For a
+ * cmake based project, you can use our FindKLU.cmake module to help you in this task.
+ *
+ */
+
+// IWYU pragma: begin_exports
+#include "src/KLUSupport/KLUSupport.h"
+// IWYU pragma: end_exports
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif  // EIGEN_KLUSUPPORT_MODULE_H
diff --git a/inst/include/Eigen/LU b/inst/include/Eigen/LU
index db579550..d8044803 100644
--- a/inst/include/Eigen/LU
+++ b/inst/include/Eigen/LU
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_LU_MODULE_H
 #define EIGEN_LU_MODULE_H
 
@@ -6,36 +13,34 @@
 #include "src/Core/util/DisableStupidWarnings.h"
 
 /** \defgroup LU_Module LU module
-  * This module includes %LU decomposition and related notions such as matrix inversion and determinant.
-  * This module defines the following MatrixBase methods:
-  *  - MatrixBase::inverse()
-  *  - MatrixBase::determinant()
-  *
-  * \code
-  * #include <Eigen/LU>
-  * \endcode
-  */
-
-#include "src/misc/Solve.h"
+ * This module includes %LU decomposition and related notions such as matrix inversion and determinant.
+ * This module defines the following MatrixBase methods:
+ *  - MatrixBase::inverse()
+ *  - MatrixBase::determinant()
+ *
+ * \code
+ * #include <Eigen/LU>
+ * \endcode
+ */
+
 #include "src/misc/Kernel.h"
 #include "src/misc/Image.h"
+
+// IWYU pragma: begin_exports
 #include "src/LU/FullPivLU.h"
 #include "src/LU/PartialPivLU.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/LU/PartialPivLU_MKL.h"
+#include "src/misc/lapacke_helpers.h"
+#include "src/LU/PartialPivLU_LAPACKE.h"
 #endif
 #include "src/LU/Determinant.h"
-#include "src/LU/Inverse.h"
-
-#if defined EIGEN_VECTORIZE_SSE
-  #include "src/LU/arch/Inverse_SSE.h"
-#endif
+#include "src/LU/InverseImpl.h"
 
-#ifdef EIGEN2_SUPPORT
-  #include "src/Eigen2Support/LU.h"
+#if defined EIGEN_VECTORIZE_SSE || defined EIGEN_VECTORIZE_NEON
+#include "src/LU/arch/InverseSize4.h"
 #endif
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_LU_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
+#endif  // EIGEN_LU_MODULE_H
diff --git a/inst/include/Eigen/LeastSquares b/inst/include/Eigen/LeastSquares
deleted file mode 100644
index 35137c25..00000000
--- a/inst/include/Eigen/LeastSquares
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef EIGEN_REGRESSION_MODULE_H
-#define EIGEN_REGRESSION_MODULE_H
-
-#ifndef EIGEN2_SUPPORT
-#error LeastSquares is only available in Eigen2 support mode (define EIGEN2_SUPPORT)
-#endif
-
-// exclude from normal eigen3-only documentation
-#ifdef EIGEN2_SUPPORT
-
-#include "Core"
-
-#include "src/Core/util/DisableStupidWarnings.h"
-
-#include "Eigenvalues"
-#include "Geometry"
-
-/** \defgroup LeastSquares_Module LeastSquares module
-  * This module provides linear regression and related features.
-  *
-  * \code
-  * #include <Eigen/LeastSquares>
-  * \endcode
-  */
-
-#include "src/Eigen2Support/LeastSquares.h"
-
-#include "src/Core/util/ReenableStupidWarnings.h"
-
-#endif // EIGEN2_SUPPORT
-
-#endif // EIGEN_REGRESSION_MODULE_H
diff --git a/inst/include/Eigen/MetisSupport b/inst/include/Eigen/MetisSupport
index 6a113f7a..3636d3a0 100644
--- a/inst/include/Eigen/MetisSupport
+++ b/inst/include/Eigen/MetisSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_METISSUPPORT_MODULE_H
 #define EIGEN_METISSUPPORT_MODULE_H
 
@@ -9,20 +16,20 @@ extern "C" {
 #include <metis.h>
 }
 
-
 /** \ingroup Support_modules
-  * \defgroup MetisSupport_Module MetisSupport module
-  *
-  * \code
-  * #include <Eigen/MetisSupport>
-  * \endcode
-  * This module defines an interface to the METIS reordering package (http://glaros.dtc.umn.edu/gkhome/views/metis). 
-  * It can be used just as any other built-in method as explained in \link OrderingMethods_Module here. \endlink
-  */
-
-
+ * \defgroup MetisSupport_Module MetisSupport module
+ *
+ * \code
+ * #include <Eigen/MetisSupport>
+ * \endcode
+ * This module defines an interface to the METIS reordering package (http://glaros.dtc.umn.edu/gkhome/views/metis).
+ * It can be used just as any other built-in method as explained in \link OrderingMethods_Module here. \endlink
+ */
+
+// IWYU pragma: begin_exports
 #include "src/MetisSupport/MetisSupport.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_METISSUPPORT_MODULE_H
+#endif  // EIGEN_METISSUPPORT_MODULE_H
diff --git a/inst/include/Eigen/OrderingMethods b/inst/include/Eigen/OrderingMethods
index 7c0f1fff..01674194 100644
--- a/inst/include/Eigen/OrderingMethods
+++ b/inst/include/Eigen/OrderingMethods
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_ORDERINGMETHODS_MODULE_H
 #define EIGEN_ORDERINGMETHODS_MODULE_H
 
@@ -5,62 +12,62 @@
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
-/** 
-  * \defgroup OrderingMethods_Module OrderingMethods module
-  *
-  * This module is currently for internal use only
-  * 
-  * It defines various built-in and external ordering methods for sparse matrices. 
-  * They are typically used to reduce the number of elements during 
-  * the sparse matrix decomposition (LLT, LU, QR).
-  * Precisely, in a preprocessing step, a permutation matrix P is computed using 
-  * those ordering methods and applied to the columns of the matrix. 
-  * Using for instance the sparse Cholesky decomposition, it is expected that 
-  * the nonzeros elements in LLT(A*P) will be much smaller than that in LLT(A).
-  * 
-  * 
-  * Usage : 
-  * \code
-  * #include <Eigen/OrderingMethods>
-  * \endcode
-  * 
-  * A simple usage is as a template parameter in the sparse decomposition classes : 
-  * 
-  * \code 
-  * SparseLU<MatrixType, COLAMDOrdering<int> > solver;
-  * \endcode 
-  * 
-  * \code 
-  * SparseQR<MatrixType, COLAMDOrdering<int> > solver;
-  * \endcode
-  * 
-  * It is possible as well to call directly a particular ordering method for your own purpose, 
-  * \code 
-  * AMDOrdering<int> ordering;
-  * PermutationMatrix<Dynamic, Dynamic, int> perm;
-  * SparseMatrix<double> A; 
-  * //Fill the matrix ...
-  * 
-  * ordering(A, perm); // Call AMD
-  * \endcode
-  * 
-  * \note Some of these methods (like AMD or METIS), need the sparsity pattern 
-  * of the input matrix to be symmetric. When the matrix is structurally unsymmetric, 
-  * Eigen computes internally the pattern of \f$A^T*A\f$ before calling the method.
-  * If your matrix is already symmetric (at leat in structure), you can avoid that
-  * by calling the method with a SelfAdjointView type.
-  * 
-  * \code
-  *  // Call the ordering on the pattern of the lower triangular matrix A
-  * ordering(A.selfadjointView<Lower>(), perm);
-  * \endcode
-  */
+/**
+ * \defgroup OrderingMethods_Module OrderingMethods module
+ *
+ * This module is currently for internal use only
+ *
+ * It defines various built-in and external ordering methods for sparse matrices.
+ * They are typically used to reduce the number of elements during
+ * the sparse matrix decomposition (LLT, LU, QR).
+ * Precisely, in a preprocessing step, a permutation matrix P is computed using
+ * those ordering methods and applied to the columns of the matrix.
+ * Using for instance the sparse Cholesky decomposition, it is expected that
+ * the nonzeros elements in LLT(A*P) will be much smaller than that in LLT(A).
+ *
+ *
+ * Usage :
+ * \code
+ * #include <Eigen/OrderingMethods>
+ * \endcode
+ *
+ * A simple usage is as a template parameter in the sparse decomposition classes :
+ *
+ * \code
+ * SparseLU<MatrixType, COLAMDOrdering<int> > solver;
+ * \endcode
+ *
+ * \code
+ * SparseQR<MatrixType, COLAMDOrdering<int> > solver;
+ * \endcode
+ *
+ * It is possible as well to call directly a particular ordering method for your own purpose,
+ * \code
+ * AMDOrdering<int> ordering;
+ * PermutationMatrix<Dynamic, Dynamic, int> perm;
+ * SparseMatrix<double> A;
+ * //Fill the matrix ...
+ *
+ * ordering(A, perm); // Call AMD
+ * \endcode
+ *
+ * \note Some of these methods (like AMD or METIS), need the sparsity pattern
+ * of the input matrix to be symmetric. When the matrix is structurally unsymmetric,
+ * Eigen computes internally the pattern of \f$A^T*A\f$ before calling the method.
+ * If your matrix is already symmetric (at least in structure), you can avoid that
+ * by calling the method with a SelfAdjointView type.
+ *
+ * \code
+ *  // Call the ordering on the pattern of the lower triangular matrix A
+ * ordering(A.selfadjointView<Lower>(), perm);
+ * \endcode
+ */
 
-#ifndef EIGEN_MPL2_ONLY
+// IWYU pragma: begin_exports
 #include "src/OrderingMethods/Amd.h"
-#endif
-
 #include "src/OrderingMethods/Ordering.h"
+// IWYU pragma: end_exports
+
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_ORDERINGMETHODS_MODULE_H
+#endif  // EIGEN_ORDERINGMETHODS_MODULE_H
diff --git a/inst/include/Eigen/PaStiXSupport b/inst/include/Eigen/PaStiXSupport
index 7c616ee5..dd1cfcb1 100644
--- a/inst/include/Eigen/PaStiXSupport
+++ b/inst/include/Eigen/PaStiXSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_PASTIXSUPPORT_MODULE_H
 #define EIGEN_PASTIXSUPPORT_MODULE_H
 
@@ -5,7 +12,6 @@
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
-#include <complex.h>
 extern "C" {
 #include <pastix_nompi.h>
 #include <pastix.h>
@@ -16,31 +22,30 @@ extern "C" {
 #endif
 
 /** \ingroup Support_modules
-  * \defgroup PaStiXSupport_Module PaStiXSupport module
-  * 
-  * This module provides an interface to the <a href="http://pastix.gforge.inria.fr/">PaSTiX</a> library.
-  * PaSTiX is a general \b supernodal, \b parallel and \b opensource sparse solver.
-  * It provides the two following main factorization classes:
-  * - class PastixLLT : a supernodal, parallel LLt Cholesky factorization.
-  * - class PastixLDLT: a supernodal, parallel LDLt Cholesky factorization.
-  * - class PastixLU : a supernodal, parallel LU factorization (optimized for a symmetric pattern).
-  * 
-  * \code
-  * #include <Eigen/PaStiXSupport>
-  * \endcode
-  *
-  * In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be linked to the PaSTiX library and its dependencies.
-  * The dependencies depend on how PaSTiX has been compiled.
-  * For a cmake based project, you can use our FindPaSTiX.cmake module to help you in this task.
-  *
-  */
-
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
+ * \defgroup PaStiXSupport_Module PaStiXSupport module
+ *
+ * This module provides an interface to the <a href="http://pastix.gforge.inria.fr/">PaSTiX</a> library.
+ * PaSTiX is a general \b supernodal, \b parallel and \b opensource sparse solver.
+ * It provides the two following main factorization classes:
+ * - class PastixLLT : a supernodal, parallel LLt Cholesky factorization.
+ * - class PastixLDLT: a supernodal, parallel LDLt Cholesky factorization.
+ * - class PastixLU : a supernodal, parallel LU factorization (optimized for a symmetric pattern).
+ *
+ * \code
+ * #include <Eigen/PaStiXSupport>
+ * \endcode
+ *
+ * In order to use this module, the PaSTiX headers must be accessible from the include paths, and your binary must be
+ * linked to the PaSTiX library and its dependencies. This wrapper resuires PaStiX version 5.x compiled without MPI
+ * support. The dependencies depend on how PaSTiX has been compiled. For a cmake based project, you can use our
+ * FindPaSTiX.cmake module to help you in this task.
+ *
+ */
+
+// IWYU pragma: begin_exports
 #include "src/PaStiXSupport/PaStiXSupport.h"
-
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_PASTIXSUPPORT_MODULE_H
+#endif  // EIGEN_PASTIXSUPPORT_MODULE_H
diff --git a/inst/include/Eigen/PardisoSupport b/inst/include/Eigen/PardisoSupport
index 99330ce7..4aef5fb3 100644
--- a/inst/include/Eigen/PardisoSupport
+++ b/inst/include/Eigen/PardisoSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_PARDISOSUPPORT_MODULE_H
 #define EIGEN_PARDISOSUPPORT_MODULE_H
 
@@ -7,24 +14,25 @@
 
 #include <mkl_pardiso.h>
 
-#include <unsupported/Eigen/SparseExtra>
-
 /** \ingroup Support_modules
-  * \defgroup PardisoSupport_Module PardisoSupport module
-  *
-  * This module brings support for the Intel(R) MKL PARDISO direct sparse solvers.
-  *
-  * \code
-  * #include <Eigen/PardisoSupport>
-  * \endcode
-  *
-  * In order to use this module, the MKL headers must be accessible from the include paths, and your binary must be linked to the MKL library and its dependencies.
-  * See this \ref TopicUsingIntelMKL "page" for more information on MKL-Eigen integration.
-  * 
-  */
-
+ * \defgroup PardisoSupport_Module PardisoSupport module
+ *
+ * This module brings support for the Intel(R) MKL PARDISO direct sparse solvers.
+ *
+ * \code
+ * #include <Eigen/PardisoSupport>
+ * \endcode
+ *
+ * In order to use this module, the MKL headers must be accessible from the include paths, and your binary must be
+ * linked to the MKL library and its dependencies. See this \ref TopicUsingIntelMKL "page" for more information on
+ * MKL-Eigen integration.
+ *
+ */
+
+// IWYU pragma: begin_exports
 #include "src/PardisoSupport/PardisoSupport.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_PARDISOSUPPORT_MODULE_H
+#endif  // EIGEN_PARDISOSUPPORT_MODULE_H
diff --git a/inst/include/Eigen/QR b/inst/include/Eigen/QR
index ac5b0269..c38b453b 100644
--- a/inst/include/Eigen/QR
+++ b/inst/include/Eigen/QR
@@ -1,45 +1,48 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_QR_MODULE_H
 #define EIGEN_QR_MODULE_H
 
 #include "Core"
 
-#include "src/Core/util/DisableStupidWarnings.h"
-
 #include "Cholesky"
 #include "Jacobi"
 #include "Householder"
 
+#include "src/Core/util/DisableStupidWarnings.h"
+
 /** \defgroup QR_Module QR module
-  *
-  *
-  *
-  * This module provides various QR decompositions
-  * This module also provides some MatrixBase methods, including:
-  *  - MatrixBase::qr(),
-  *
-  * \code
-  * #include <Eigen/QR>
-  * \endcode
-  */
-
-#include "src/misc/Solve.h"
+ *
+ *
+ *
+ * This module provides various QR decompositions
+ * This module also provides some MatrixBase methods, including:
+ *  - MatrixBase::householderQr()
+ *  - MatrixBase::colPivHouseholderQr()
+ *  - MatrixBase::fullPivHouseholderQr()
+ *
+ * \code
+ * #include <Eigen/QR>
+ * \endcode
+ */
+
+// IWYU pragma: begin_exports
 #include "src/QR/HouseholderQR.h"
 #include "src/QR/FullPivHouseholderQR.h"
 #include "src/QR/ColPivHouseholderQR.h"
+#include "src/QR/CompleteOrthogonalDecomposition.h"
 #ifdef EIGEN_USE_LAPACKE
-#include "src/QR/HouseholderQR_MKL.h"
-#include "src/QR/ColPivHouseholderQR_MKL.h"
-#endif
-
-#ifdef EIGEN2_SUPPORT
-#include "src/Eigen2Support/QR.h"
+#include "src/misc/lapacke_helpers.h"
+#include "src/QR/HouseholderQR_LAPACKE.h"
+#include "src/QR/ColPivHouseholderQR_LAPACKE.h"
 #endif
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#ifdef EIGEN2_SUPPORT
-#include "Eigenvalues"
-#endif
-
-#endif // EIGEN_QR_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
+#endif  // EIGEN_QR_MODULE_H
diff --git a/inst/include/Eigen/QtAlignedMalloc b/inst/include/Eigen/QtAlignedMalloc
index 46f7d83b..585f8e81 100644
--- a/inst/include/Eigen/QtAlignedMalloc
+++ b/inst/include/Eigen/QtAlignedMalloc
@@ -1,3 +1,9 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #ifndef EIGEN_QTMALLOC_MODULE_H
 #define EIGEN_QTMALLOC_MODULE_H
@@ -8,20 +14,13 @@
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
-void *qMalloc(size_t size)
-{
-  return Eigen::internal::aligned_malloc(size);
-}
+void *qMalloc(std::size_t size) { return Eigen::internal::aligned_malloc(size); }
 
-void qFree(void *ptr)
-{
-  Eigen::internal::aligned_free(ptr);
-}
+void qFree(void *ptr) { Eigen::internal::aligned_free(ptr); }
 
-void *qRealloc(void *ptr, size_t size)
-{
-  void* newPtr = Eigen::internal::aligned_malloc(size);
-  memcpy(newPtr, ptr, size);
+void *qRealloc(void *ptr, std::size_t size) {
+  void *newPtr = Eigen::internal::aligned_malloc(size);
+  std::memcpy(newPtr, ptr, size);
   Eigen::internal::aligned_free(ptr);
   return newPtr;
 }
@@ -30,5 +29,4 @@ void *qRealloc(void *ptr, size_t size)
 
 #endif
 
-#endif // EIGEN_QTMALLOC_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
+#endif  // EIGEN_QTMALLOC_MODULE_H
diff --git a/inst/include/Eigen/SPQRSupport b/inst/include/Eigen/SPQRSupport
index 77016442..c01dbe00 100644
--- a/inst/include/Eigen/SPQRSupport
+++ b/inst/include/Eigen/SPQRSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPQRSUPPORT_MODULE_H
 #define EIGEN_SPQRSUPPORT_MODULE_H
 
@@ -8,22 +15,27 @@
 #include "SuiteSparseQR.hpp"
 
 /** \ingroup Support_modules
-  * \defgroup SPQRSupport_Module SuiteSparseQR module
-  * 
-  * This module provides an interface to the SPQR library, which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
-  *
-  * \code
-  * #include <Eigen/SPQRSupport>
-  * \endcode
-  *
-  * In order to use this module, the SPQR headers must be accessible from the include paths, and your binary must be linked to the SPQR library and its dependencies (Cholmod, AMD, COLAMD,...).
-  * For a cmake based project, you can use our FindSPQR.cmake and FindCholmod.Cmake modules
-  *
-  */
-
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-#include "src/CholmodSupport/CholmodSupport.h"
+ * \defgroup SPQRSupport_Module SuiteSparseQR module
+ *
+ * This module provides an interface to the SPQR library, which is part of the <a
+ * href="http://www.suitesparse.com">suitesparse</a> package.
+ *
+ * \code
+ * #include <Eigen/SPQRSupport>
+ * \endcode
+ *
+ * In order to use this module, the SPQR headers must be accessible from the include paths, and your binary must be
+ * linked to the SPQR library and its dependencies (Cholmod, AMD, COLAMD,...). For a cmake based project, you can use
+ * our FindSPQR.cmake and FindCholmod.Cmake modules
+ *
+ */
+
+#include "CholmodSupport"
+
+// IWYU pragma: begin_exports
 #include "src/SPQRSupport/SuiteSparseQRSupport.h"
+// IWYU pragma: end_exports
+
+#include "src/Core/util/ReenableStupidWarnings.h"
 
 #endif
diff --git a/inst/include/Eigen/SVD b/inst/include/Eigen/SVD
index fd310017..2a013f82 100644
--- a/inst/include/Eigen/SVD
+++ b/inst/include/Eigen/SVD
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SVD_MODULE_H
 #define EIGEN_SVD_MODULE_H
 
@@ -8,30 +15,42 @@
 #include "src/Core/util/DisableStupidWarnings.h"
 
 /** \defgroup SVD_Module SVD module
-  *
-  *
-  *
-  * This module provides SVD decomposition for matrices (both real and complex).
-  * This decomposition is accessible via the following MatrixBase method:
-  *  - MatrixBase::jacobiSvd()
-  *
-  * \code
-  * #include <Eigen/SVD>
-  * \endcode
-  */
+ *
+ *
+ *
+ * This module provides SVD decomposition for matrices (both real and complex).
+ * Two decomposition algorithms are provided:
+ *  - JacobiSVD implementing two-sided Jacobi iterations is numerically very accurate, fast for small matrices, but very
+ * slow for larger ones.
+ *  - BDCSVD implementing a recursive divide & conquer strategy on top of an upper-bidiagonalization which remains fast
+ * for large problems. These decompositions are accessible via the respective classes and following MatrixBase methods:
+ *  - MatrixBase::jacobiSvd()
+ *  - MatrixBase::bdcSvd()
+ *
+ * \code
+ * #include <Eigen/SVD>
+ * \endcode
+ */
 
-#include "src/misc/Solve.h"
+// IWYU pragma: begin_exports
+#include "src/misc/RealSvd2x2.h"
+#include "src/SVD/UpperBidiagonalization.h"
+#include "src/SVD/SVDBase.h"
 #include "src/SVD/JacobiSVD.h"
-#if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
-#include "src/SVD/JacobiSVD_MKL.h"
+#include "src/SVD/BDCSVD.h"
+#ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
+#include "src/misc/lapacke.h"
 #endif
-#include "src/SVD/UpperBidiagonalization.h"
-
-#ifdef EIGEN2_SUPPORT
-#include "src/Eigen2Support/SVD.h"
+#ifndef EIGEN_USE_LAPACKE_STRICT
+#include "src/SVD/JacobiSVD_LAPACKE.h"
+#endif
+#include "src/SVD/BDCSVD_LAPACKE.h"
 #endif
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_SVD_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
+#endif  // EIGEN_SVD_MODULE_H
diff --git a/inst/include/Eigen/Sparse b/inst/include/Eigen/Sparse
index 7cc9c091..4d0ee8bc 100644
--- a/inst/include/Eigen/Sparse
+++ b/inst/include/Eigen/Sparse
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPARSE_MODULE_H
 #define EIGEN_SPARSE_MODULE_H
 
@@ -11,9 +18,9 @@
   * - \ref SparseQR_Module
   * - \ref IterativeLinearSolvers_Module
   *
-  * \code
-  * #include <Eigen/Sparse>
-  * \endcode
+    \code
+    #include <Eigen/Sparse>
+    \endcode
   */
 
 #include "SparseCore"
@@ -23,5 +30,4 @@
 #include "SparseQR"
 #include "IterativeLinearSolvers"
 
-#endif // EIGEN_SPARSE_MODULE_H
-
+#endif  // EIGEN_SPARSE_MODULE_H
diff --git a/inst/include/Eigen/SparseCholesky b/inst/include/Eigen/SparseCholesky
index 9f5056aa..6abdcd66 100644
--- a/inst/include/Eigen/SparseCholesky
+++ b/inst/include/Eigen/SparseCholesky
@@ -15,33 +15,26 @@
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
-/** 
-  * \defgroup SparseCholesky_Module SparseCholesky module
-  *
-  * This module currently provides two variants of the direct sparse Cholesky decomposition for selfadjoint (hermitian) matrices.
-  * Those decompositions are accessible via the following classes:
-  *  - SimplicialLLt,
-  *  - SimplicialLDLt
-  *
-  * Such problems can also be solved using the ConjugateGradient solver from the IterativeLinearSolvers module.
-  *
-  * \code
-  * #include <Eigen/SparseCholesky>
-  * \endcode
-  */
-
-#ifdef EIGEN_MPL2_ONLY
-#error The SparseCholesky module has nothing to offer in MPL2 only mode
-#endif
-
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
+/**
+ * \defgroup SparseCholesky_Module SparseCholesky module
+ *
+ * This module currently provides two variants of the direct sparse Cholesky decomposition for selfadjoint (hermitian)
+ * matrices. Those decompositions are accessible via the following classes:
+ *  - SimplicialLLt,
+ *  - SimplicialLDLt
+ *
+ * Such problems can also be solved using the ConjugateGradient solver from the IterativeLinearSolvers module.
+ *
+ * \code
+ * #include <Eigen/SparseCholesky>
+ * \endcode
+ */
+
+// IWYU pragma: begin_exports
 #include "src/SparseCholesky/SimplicialCholesky.h"
-
-#ifndef EIGEN_MPL2_ONLY
 #include "src/SparseCholesky/SimplicialCholesky_impl.h"
-#endif
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_SPARSECHOLESKY_MODULE_H
+#endif  // EIGEN_SPARSECHOLESKY_MODULE_H
diff --git a/inst/include/Eigen/SparseCore b/inst/include/Eigen/SparseCore
index 24bcf015..56a9401a 100644
--- a/inst/include/Eigen/SparseCore
+++ b/inst/include/Eigen/SparseCore
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPARSECORE_MODULE_H
 #define EIGEN_SPARSECORE_MODULE_H
 
@@ -10,55 +17,54 @@
 #include <cstdlib>
 #include <cstring>
 #include <algorithm>
+#include <numeric>
 
-/** 
-  * \defgroup SparseCore_Module SparseCore module
-  *
-  * This module provides a sparse matrix representation, and basic associated matrix manipulations
-  * and operations.
-  *
-  * See the \ref TutorialSparse "Sparse tutorial"
-  *
-  * \code
-  * #include <Eigen/SparseCore>
-  * \endcode
-  *
-  * This module depends on: Core.
-  */
-
-namespace Eigen {
-
-/** The type used to identify a general sparse storage. */
-struct Sparse {};
-
-}
+/**
+ * \defgroup SparseCore_Module SparseCore module
+ *
+ * This module provides a sparse matrix representation, and basic associated matrix manipulations
+ * and operations.
+ *
+ * See the \ref TutorialSparse "Sparse tutorial"
+ *
+ * \code
+ * #include <Eigen/SparseCore>
+ * \endcode
+ *
+ * This module depends on: Core.
+ */
 
+// IWYU pragma: begin_exports
 #include "src/SparseCore/SparseUtil.h"
 #include "src/SparseCore/SparseMatrixBase.h"
+#include "src/SparseCore/SparseAssign.h"
 #include "src/SparseCore/CompressedStorage.h"
 #include "src/SparseCore/AmbiVector.h"
+#include "src/SparseCore/SparseCompressedBase.h"
 #include "src/SparseCore/SparseMatrix.h"
-#include "src/SparseCore/MappedSparseMatrix.h"
+#include "src/SparseCore/SparseMap.h"
 #include "src/SparseCore/SparseVector.h"
-#include "src/SparseCore/SparseBlock.h"
-#include "src/SparseCore/SparseTranspose.h"
+#include "src/SparseCore/SparseRef.h"
 #include "src/SparseCore/SparseCwiseUnaryOp.h"
 #include "src/SparseCore/SparseCwiseBinaryOp.h"
+#include "src/SparseCore/SparseTranspose.h"
+#include "src/SparseCore/SparseBlock.h"
 #include "src/SparseCore/SparseDot.h"
-#include "src/SparseCore/SparsePermutation.h"
 #include "src/SparseCore/SparseRedux.h"
-#include "src/SparseCore/SparseFuzzy.h"
+#include "src/SparseCore/SparseView.h"
+#include "src/SparseCore/SparseDiagonalProduct.h"
 #include "src/SparseCore/ConservativeSparseSparseProduct.h"
 #include "src/SparseCore/SparseSparseProductWithPruning.h"
 #include "src/SparseCore/SparseProduct.h"
 #include "src/SparseCore/SparseDenseProduct.h"
-#include "src/SparseCore/SparseDiagonalProduct.h"
-#include "src/SparseCore/SparseTriangularView.h"
 #include "src/SparseCore/SparseSelfAdjointView.h"
+#include "src/SparseCore/SparseTriangularView.h"
 #include "src/SparseCore/TriangularSolver.h"
-#include "src/SparseCore/SparseView.h"
+#include "src/SparseCore/SparsePermutation.h"
+#include "src/SparseCore/SparseFuzzy.h"
+#include "src/SparseCore/SparseSolverBase.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_SPARSECORE_MODULE_H
-
+#endif  // EIGEN_SPARSECORE_MODULE_H
diff --git a/inst/include/Eigen/SparseLU b/inst/include/Eigen/SparseLU
index 8527a49b..6faf1306 100644
--- a/inst/include/Eigen/SparseLU
+++ b/inst/include/Eigen/SparseLU
@@ -13,21 +13,19 @@
 
 #include "SparseCore"
 
-/** 
-  * \defgroup SparseLU_Module SparseLU module
-  * This module defines a supernodal factorization of general sparse matrices.
-  * The code is fully optimized for supernode-panel updates with specialized kernels.
-  * Please, see the documentation of the SparseLU class for more details.
-  */
-
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
+/**
+ * \defgroup SparseLU_Module SparseLU module
+ * This module defines a supernodal factorization of general sparse matrices.
+ * The code is fully optimized for supernode-panel updates with specialized kernels.
+ * Please, see the documentation of the SparseLU class for more details.
+ */
 
 // Ordering interface
 #include "OrderingMethods"
 
-#include "src/SparseLU/SparseLU_gemm_kernel.h"
+#include "src/Core/util/DisableStupidWarnings.h"
 
+// IWYU pragma: begin_exports
 #include "src/SparseLU/SparseLU_Structs.h"
 #include "src/SparseLU/SparseLU_SupernodalMatrix.h"
 #include "src/SparseLU/SparseLUImpl.h"
@@ -45,5 +43,8 @@
 #include "src/SparseLU/SparseLU_pruneL.h"
 #include "src/SparseLU/SparseLU_Utils.h"
 #include "src/SparseLU/SparseLU.h"
+// IWYU pragma: end_exports
+
+#include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_SPARSELU_MODULE_H
+#endif  // EIGEN_SPARSELU_MODULE_H
diff --git a/inst/include/Eigen/SparseQR b/inst/include/Eigen/SparseQR
index 4ee42065..b4f1cad6 100644
--- a/inst/include/Eigen/SparseQR
+++ b/inst/include/Eigen/SparseQR
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SPARSEQR_MODULE_H
 #define EIGEN_SPARSEQR_MODULE_H
 
@@ -6,27 +13,25 @@
 #include "src/Core/util/DisableStupidWarnings.h"
 
 /** \defgroup SparseQR_Module SparseQR module
-  * \brief Provides QR decomposition for sparse matrices
-  * 
-  * This module provides a simplicial version of the left-looking Sparse QR decomposition. 
-  * The columns of the input matrix should be reordered to limit the fill-in during the 
-  * decomposition. Built-in methods (COLAMD, AMD) or external  methods (METIS) can be used to this end.
-  * See the \link OrderingMethods_Module OrderingMethods\endlink module for the list 
-  * of built-in and external ordering methods.
-  * 
-  * \code
-  * #include <Eigen/SparseQR>
-  * \endcode
-  * 
-  * 
-  */
-
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
+ * \brief Provides QR decomposition for sparse matrices
+ *
+ * This module provides a simplicial version of the left-looking Sparse QR decomposition.
+ * The columns of the input matrix should be reordered to limit the fill-in during the
+ * decomposition. Built-in methods (COLAMD, AMD) or external  methods (METIS) can be used to this end.
+ * See the \link OrderingMethods_Module OrderingMethods\endlink module for the list
+ * of built-in and external ordering methods.
+ *
+ * \code
+ * #include <Eigen/SparseQR>
+ * \endcode
+ *
+ *
+ */
 
-#include "OrderingMethods"
+// IWYU pragma: begin_exports
 #include "src/SparseCore/SparseColEtree.h"
 #include "src/SparseQR/SparseQR.h"
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
diff --git a/inst/include/Eigen/StdDeque b/inst/include/Eigen/StdDeque
index f2723477..01e1d76f 100644
--- a/inst/include/Eigen/StdDeque
+++ b/inst/include/Eigen/StdDeque
@@ -14,14 +14,17 @@
 #include "Core"
 #include <deque>
 
-#if (defined(_MSC_VER) && defined(_WIN64)) /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && \
+    (EIGEN_MAX_STATIC_ALIGN_BYTES <= 16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */
 
 #define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...)
 
 #else
 
+// IWYU pragma: begin_exports
 #include "src/StlSupport/StdDeque.h"
+// IWYU pragma: end_exports
 
 #endif
 
-#endif // EIGEN_STDDEQUE_MODULE_H
+#endif  // EIGEN_STDDEQUE_MODULE_H
diff --git a/inst/include/Eigen/StdList b/inst/include/Eigen/StdList
index 225c1e18..1453c9f5 100644
--- a/inst/include/Eigen/StdList
+++ b/inst/include/Eigen/StdList
@@ -13,14 +13,17 @@
 #include "Core"
 #include <list>
 
-#if (defined(_MSC_VER) && defined(_WIN64)) /* MSVC auto aligns in 64 bit builds */    
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && \
+    (EIGEN_MAX_STATIC_ALIGN_BYTES <= 16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */
 
 #define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...)
 
 #else
 
+// IWYU pragma: begin_exports
 #include "src/StlSupport/StdList.h"
+// IWYU pragma: end_exports
 
 #endif
 
-#endif // EIGEN_STDLIST_MODULE_H
+#endif  // EIGEN_STDLIST_MODULE_H
diff --git a/inst/include/Eigen/StdVector b/inst/include/Eigen/StdVector
index 6b22627f..711a654e 100644
--- a/inst/include/Eigen/StdVector
+++ b/inst/include/Eigen/StdVector
@@ -14,14 +14,17 @@
 #include "Core"
 #include <vector>
 
-#if (defined(_MSC_VER) && defined(_WIN64)) /* MSVC auto aligns in 64 bit builds */
+#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && \
+    (EIGEN_MAX_STATIC_ALIGN_BYTES <= 16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */
 
 #define EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(...)
 
 #else
 
+// IWYU pragma: begin_exports
 #include "src/StlSupport/StdVector.h"
+// IWYU pragma: end_exports
 
 #endif
 
-#endif // EIGEN_STDVECTOR_MODULE_H
+#endif  // EIGEN_STDVECTOR_MODULE_H
diff --git a/inst/include/Eigen/SuperLUSupport b/inst/include/Eigen/SuperLUSupport
index 575e14fb..79e2222f 100644
--- a/inst/include/Eigen/SuperLUSupport
+++ b/inst/include/Eigen/SuperLUSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_SUPERLUSUPPORT_MODULE_H
 #define EIGEN_SUPERLUSUPPORT_MODULE_H
 
@@ -19,41 +26,45 @@ typedef int int_t;
 // If EMPTY was already defined then we don't undef it.
 
 #if defined(EIGEN_EMPTY_WAS_ALREADY_DEFINED)
-# undef EIGEN_EMPTY_WAS_ALREADY_DEFINED
+#undef EIGEN_EMPTY_WAS_ALREADY_DEFINED
 #elif defined(EMPTY)
-# undef EMPTY
+#undef EMPTY
 #endif
 
 #define SUPERLU_EMPTY (-1)
 
-namespace Eigen { struct SluMatrix; }
+namespace Eigen {
+struct SluMatrix;
+}
 
 /** \ingroup Support_modules
-  * \defgroup SuperLUSupport_Module SuperLUSupport module
-  *
-  * This module provides an interface to the <a href="http://crd-legacy.lbl.gov/~xiaoye/SuperLU/">SuperLU</a> library.
-  * It provides the following factorization class:
-  * - class SuperLU: a supernodal sequential LU factorization.
-  * - class SuperILU: a supernodal sequential incomplete LU factorization (to be used as a preconditioner for iterative methods).
-  *
-  * \warning When including this module, you have to use SUPERLU_EMPTY instead of EMPTY which is no longer defined because it is too polluting.
-  *
-  * \code
-  * #include <Eigen/SuperLUSupport>
-  * \endcode
-  *
-  * In order to use this module, the superlu headers must be accessible from the include paths, and your binary must be linked to the superlu library and its dependencies.
-  * The dependencies depend on how superlu has been compiled.
-  * For a cmake based project, you can use our FindSuperLU.cmake module to help you in this task.
-  *
-  */
-
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
+ * \defgroup SuperLUSupport_Module SuperLUSupport module
+ *
+ * This module provides an interface to the <a href="http://crd-legacy.lbl.gov/~xiaoye/SuperLU/">SuperLU</a> library.
+ * It provides the following factorization class:
+ * - class SuperLU: a supernodal sequential LU factorization.
+ * - class SuperILU: a supernodal sequential incomplete LU factorization (to be used as a preconditioner for iterative
+ * methods).
+ *
+ * \warning This wrapper requires at least versions 4.0 of SuperLU. The 3.x versions are not supported.
+ *
+ * \warning When including this module, you have to use SUPERLU_EMPTY instead of EMPTY which is no longer defined
+ * because it is too polluting.
+ *
+ * \code
+ * #include <Eigen/SuperLUSupport>
+ * \endcode
+ *
+ * In order to use this module, the superlu headers must be accessible from the include paths, and your binary must be
+ * linked to the superlu library and its dependencies. The dependencies depend on how superlu has been compiled. For a
+ * cmake based project, you can use our FindSuperLU.cmake module to help you in this task.
+ *
+ */
 
+// IWYU pragma: begin_exports
 #include "src/SuperLUSupport/SuperLUSupport.h"
-
+// IWYU pragma: end_exports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_SUPERLUSUPPORT_MODULE_H
+#endif  // EIGEN_SUPERLUSUPPORT_MODULE_H
diff --git a/inst/include/Eigen/ThreadPool b/inst/include/Eigen/ThreadPool
new file mode 100644
index 00000000..39e5d1ee
--- /dev/null
+++ b/inst/include/Eigen/ThreadPool
@@ -0,0 +1,80 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_THREADPOOL_MODULE_H
+#define EIGEN_THREADPOOL_MODULE_H
+
+#include "Core"
+
+#include "src/Core/util/DisableStupidWarnings.h"
+
+/** \defgroup ThreadPool_Module ThreadPool Module
+ *
+ * This module provides 2 threadpool implementations
+ *  - a simple reference implementation
+ *  - a faster non blocking implementation
+ *
+ * \code
+ * #include <Eigen/ThreadPool>
+ * \endcode
+ */
+
+#include <cstddef>
+#include <cstring>
+#include <time.h>
+
+#include <vector>
+#include <atomic>
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+#include <thread>
+#include <functional>
+#include <memory>
+#include <utility>
+
+// There are non-parenthesized calls to "max" in the  <unordered_map> header,
+// which trigger a check in test/main.h causing compilation to fail.
+// We work around the check here by removing the check for max in
+// the case where we have to emulate thread_local.
+#ifdef max
+#undef max
+#endif
+#include <unordered_map>
+
+#include "src/Core/util/Meta.h"
+#include "src/Core/util/MaxSizeVector.h"
+
+#ifndef EIGEN_MUTEX
+#define EIGEN_MUTEX std::mutex
+#endif
+#ifndef EIGEN_MUTEX_LOCK
+#define EIGEN_MUTEX_LOCK std::unique_lock<std::mutex>
+#endif
+#ifndef EIGEN_CONDVAR
+#define EIGEN_CONDVAR std::condition_variable
+#endif
+
+// IWYU pragma: begin_exports
+#include "src/ThreadPool/ThreadLocal.h"
+#include "src/ThreadPool/ThreadYield.h"
+#include "src/ThreadPool/ThreadCancel.h"
+#include "src/ThreadPool/EventCount.h"
+#include "src/ThreadPool/RunQueue.h"
+#include "src/ThreadPool/ThreadPoolInterface.h"
+#include "src/ThreadPool/ThreadEnvironment.h"
+#include "src/ThreadPool/Barrier.h"
+#include "src/ThreadPool/NonBlockingThreadPool.h"
+#include "src/ThreadPool/CoreThreadPoolDevice.h"
+#include "src/ThreadPool/ForkJoin.h"
+// IWYU pragma: end_exports
+
+#include "src/Core/util/ReenableStupidWarnings.h"
+
+#endif  // EIGEN_CXX11_THREADPOOL_MODULE_H
diff --git a/inst/include/Eigen/UmfPackSupport b/inst/include/Eigen/UmfPackSupport
index 984f64a8..126344cb 100644
--- a/inst/include/Eigen/UmfPackSupport
+++ b/inst/include/Eigen/UmfPackSupport
@@ -1,3 +1,10 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
 #ifndef EIGEN_UMFPACKSUPPORT_MODULE_H
 #define EIGEN_UMFPACKSUPPORT_MODULE_H
 
@@ -10,27 +17,26 @@ extern "C" {
 }
 
 /** \ingroup Support_modules
-  * \defgroup UmfPackSupport_Module UmfPackSupport module
-  *
-  * This module provides an interface to the UmfPack library which is part of the <a href="http://www.cise.ufl.edu/research/sparse/SuiteSparse/">suitesparse</a> package.
-  * It provides the following factorization class:
-  * - class UmfPackLU: a multifrontal sequential LU factorization.
-  *
-  * \code
-  * #include <Eigen/UmfPackSupport>
-  * \endcode
-  *
-  * In order to use this module, the umfpack headers must be accessible from the include paths, and your binary must be linked to the umfpack library and its dependencies.
-  * The dependencies depend on how umfpack has been compiled.
-  * For a cmake based project, you can use our FindUmfPack.cmake module to help you in this task.
-  *
-  */
-
-#include "src/misc/Solve.h"
-#include "src/misc/SparseSolve.h"
-
+ * \defgroup UmfPackSupport_Module UmfPackSupport module
+ *
+ * This module provides an interface to the UmfPack library which is part of the <a
+ * href="http://www.suitesparse.com">suitesparse</a> package. It provides the following factorization class:
+ * - class UmfPackLU: a multifrontal sequential LU factorization.
+ *
+ * \code
+ * #include <Eigen/UmfPackSupport>
+ * \endcode
+ *
+ * In order to use this module, the umfpack headers must be accessible from the include paths, and your binary must be
+ * linked to the umfpack library and its dependencies. The dependencies depend on how umfpack has been compiled. For a
+ * cmake based project, you can use our FindUmfPack.cmake module to help you in this task.
+ *
+ */
+
+// IWYU pragma: begin_exports
 #include "src/UmfPackSupport/UmfPackSupport.h"
+// IWYU pragma: endexports
 
 #include "src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_UMFPACKSUPPORT_MODULE_H
+#endif  // EIGEN_UMFPACKSUPPORT_MODULE_H
diff --git a/inst/include/Eigen/Version b/inst/include/Eigen/Version
new file mode 100644
index 00000000..9cfd71ff
--- /dev/null
+++ b/inst/include/Eigen/Version
@@ -0,0 +1,14 @@
+#ifndef EIGEN_VERSION_H
+#define EIGEN_VERSION_H
+
+// The "WORLD" version will forever remain "3" for the "Eigen3" library.
+#define EIGEN_WORLD_VERSION 3
+// As of Eigen3 5.0.0, we have moved to Semantic Versioning (semver.org).
+#define EIGEN_MAJOR_VERSION 5
+#define EIGEN_MINOR_VERSION 0
+#define EIGEN_PATCH_VERSION 1
+#define EIGEN_PRERELEASE_VERSION ""
+#define EIGEN_BUILD_VERSION ""
+#define EIGEN_VERSION_STRING "5.0.1"
+
+#endif  // EIGEN_VERSION_H
diff --git a/inst/include/Eigen/src/AccelerateSupport/AccelerateSupport.h b/inst/include/Eigen/src/AccelerateSupport/AccelerateSupport.h
new file mode 100644
index 00000000..13a26dfb
--- /dev/null
+++ b/inst/include/Eigen/src/AccelerateSupport/AccelerateSupport.h
@@ -0,0 +1,423 @@
+#ifndef EIGEN_ACCELERATESUPPORT_H
+#define EIGEN_ACCELERATESUPPORT_H
+
+#include <Accelerate/Accelerate.h>
+
+#include <Eigen/Sparse>
+
+namespace Eigen {
+
+template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool EnforceSquare_>
+class AccelerateImpl;
+
+/** \ingroup AccelerateSupport_Module
+ * \typedef AccelerateLLT
+ * \brief A direct Cholesky (LLT) factorization and solver based on Accelerate
+ *
+ * \warning Only single and double precision real scalar types are supported by Accelerate
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo_ additional information about the matrix structure. Default is Lower.
+ *
+ * \sa \ref TutorialSparseSolverConcept, class AccelerateLLT
+ */
+template <typename MatrixType, int UpLo = Lower>
+using AccelerateLLT = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationCholesky, true>;
+
+/** \ingroup AccelerateSupport_Module
+ * \typedef AccelerateLDLT
+ * \brief The default Cholesky (LDLT) factorization and solver based on Accelerate
+ *
+ * \warning Only single and double precision real scalar types are supported by Accelerate
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo_ additional information about the matrix structure. Default is Lower.
+ *
+ * \sa \ref TutorialSparseSolverConcept, class AccelerateLDLT
+ */
+template <typename MatrixType, int UpLo = Lower>
+using AccelerateLDLT = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationLDLT, true>;
+
+/** \ingroup AccelerateSupport_Module
+ * \typedef AccelerateLDLTUnpivoted
+ * \brief A direct Cholesky-like LDL^T factorization and solver based on Accelerate with only 1x1 pivots and no pivoting
+ *
+ * \warning Only single and double precision real scalar types are supported by Accelerate
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo_ additional information about the matrix structure. Default is Lower.
+ *
+ * \sa \ref TutorialSparseSolverConcept, class AccelerateLDLTUnpivoted
+ */
+template <typename MatrixType, int UpLo = Lower>
+using AccelerateLDLTUnpivoted = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationLDLTUnpivoted, true>;
+
+/** \ingroup AccelerateSupport_Module
+ * \typedef AccelerateLDLTSBK
+ * \brief A direct Cholesky (LDLT) factorization and solver based on Accelerate with Supernode Bunch-Kaufman and static
+ * pivoting
+ *
+ * \warning Only single and double precision real scalar types are supported by Accelerate
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo_ additional information about the matrix structure. Default is Lower.
+ *
+ * \sa \ref TutorialSparseSolverConcept, class AccelerateLDLTSBK
+ */
+template <typename MatrixType, int UpLo = Lower>
+using AccelerateLDLTSBK = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationLDLTSBK, true>;
+
+/** \ingroup AccelerateSupport_Module
+ * \typedef AccelerateLDLTTPP
+ * \brief A direct Cholesky (LDLT) factorization and solver based on Accelerate with full threshold partial pivoting
+ *
+ * \warning Only single and double precision real scalar types are supported by Accelerate
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo_ additional information about the matrix structure. Default is Lower.
+ *
+ * \sa \ref TutorialSparseSolverConcept, class AccelerateLDLTTPP
+ */
+template <typename MatrixType, int UpLo = Lower>
+using AccelerateLDLTTPP = AccelerateImpl<MatrixType, UpLo | Symmetric, SparseFactorizationLDLTTPP, true>;
+
+/** \ingroup AccelerateSupport_Module
+ * \typedef AccelerateQR
+ * \brief A QR factorization and solver based on Accelerate
+ *
+ * \warning Only single and double precision real scalar types are supported by Accelerate
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ *
+ * \sa \ref TutorialSparseSolverConcept, class AccelerateQR
+ */
+template <typename MatrixType>
+using AccelerateQR = AccelerateImpl<MatrixType, 0, SparseFactorizationQR, false>;
+
+/** \ingroup AccelerateSupport_Module
+ * \typedef AccelerateCholeskyAtA
+ * \brief A QR factorization and solver based on Accelerate without storing Q (equivalent to A^TA = R^T R)
+ *
+ * \warning Only single and double precision real scalar types are supported by Accelerate
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ *
+ * \sa \ref TutorialSparseSolverConcept, class AccelerateCholeskyAtA
+ */
+template <typename MatrixType>
+using AccelerateCholeskyAtA = AccelerateImpl<MatrixType, 0, SparseFactorizationCholeskyAtA, false>;
+
+namespace internal {
+template <typename T>
+struct AccelFactorizationDeleter {
+  void operator()(T* sym) {
+    if (sym) {
+      SparseCleanup(*sym);
+      delete sym;
+      sym = nullptr;
+    }
+  }
+};
+
+template <typename DenseVecT, typename DenseMatT, typename SparseMatT, typename NumFactT>
+struct SparseTypesTraitBase {
+  typedef DenseVecT AccelDenseVector;
+  typedef DenseMatT AccelDenseMatrix;
+  typedef SparseMatT AccelSparseMatrix;
+
+  typedef SparseOpaqueSymbolicFactorization SymbolicFactorization;
+  typedef NumFactT NumericFactorization;
+
+  typedef AccelFactorizationDeleter<SymbolicFactorization> SymbolicFactorizationDeleter;
+  typedef AccelFactorizationDeleter<NumericFactorization> NumericFactorizationDeleter;
+};
+
+template <typename Scalar>
+struct SparseTypesTrait {};
+
+template <>
+struct SparseTypesTrait<double> : SparseTypesTraitBase<DenseVector_Double, DenseMatrix_Double, SparseMatrix_Double,
+                                                       SparseOpaqueFactorization_Double> {};
+
+template <>
+struct SparseTypesTrait<float>
+    : SparseTypesTraitBase<DenseVector_Float, DenseMatrix_Float, SparseMatrix_Float, SparseOpaqueFactorization_Float> {
+};
+
+}  // end namespace internal
+
+template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool EnforceSquare_>
+class AccelerateImpl : public SparseSolverBase<AccelerateImpl<MatrixType_, UpLo_, Solver_, EnforceSquare_> > {
+ protected:
+  using Base = SparseSolverBase<AccelerateImpl>;
+  using Base::derived;
+  using Base::m_isInitialized;
+
+ public:
+  using Base::_solve_impl;
+
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  enum { ColsAtCompileTime = Dynamic, MaxColsAtCompileTime = Dynamic };
+  enum { UpLo = UpLo_ };
+
+  using AccelDenseVector = typename internal::SparseTypesTrait<Scalar>::AccelDenseVector;
+  using AccelDenseMatrix = typename internal::SparseTypesTrait<Scalar>::AccelDenseMatrix;
+  using AccelSparseMatrix = typename internal::SparseTypesTrait<Scalar>::AccelSparseMatrix;
+  using SymbolicFactorization = typename internal::SparseTypesTrait<Scalar>::SymbolicFactorization;
+  using NumericFactorization = typename internal::SparseTypesTrait<Scalar>::NumericFactorization;
+  using SymbolicFactorizationDeleter = typename internal::SparseTypesTrait<Scalar>::SymbolicFactorizationDeleter;
+  using NumericFactorizationDeleter = typename internal::SparseTypesTrait<Scalar>::NumericFactorizationDeleter;
+
+  AccelerateImpl() {
+    m_isInitialized = false;
+
+    auto check_flag_set = [](int value, int flag) { return ((value & flag) == flag); };
+
+    if (check_flag_set(UpLo_, Symmetric)) {
+      m_sparseKind = SparseSymmetric;
+      m_triType = (UpLo_ & Lower) ? SparseLowerTriangle : SparseUpperTriangle;
+    } else if (check_flag_set(UpLo_, UnitLower)) {
+      m_sparseKind = SparseUnitTriangular;
+      m_triType = SparseLowerTriangle;
+    } else if (check_flag_set(UpLo_, UnitUpper)) {
+      m_sparseKind = SparseUnitTriangular;
+      m_triType = SparseUpperTriangle;
+    } else if (check_flag_set(UpLo_, StrictlyLower)) {
+      m_sparseKind = SparseTriangular;
+      m_triType = SparseLowerTriangle;
+    } else if (check_flag_set(UpLo_, StrictlyUpper)) {
+      m_sparseKind = SparseTriangular;
+      m_triType = SparseUpperTriangle;
+    } else if (check_flag_set(UpLo_, Lower)) {
+      m_sparseKind = SparseTriangular;
+      m_triType = SparseLowerTriangle;
+    } else if (check_flag_set(UpLo_, Upper)) {
+      m_sparseKind = SparseTriangular;
+      m_triType = SparseUpperTriangle;
+    } else {
+      m_sparseKind = SparseOrdinary;
+      m_triType = (UpLo_ & Lower) ? SparseLowerTriangle : SparseUpperTriangle;
+    }
+
+    m_order = SparseOrderDefault;
+  }
+
+  explicit AccelerateImpl(const MatrixType& matrix) : AccelerateImpl() { compute(matrix); }
+
+  ~AccelerateImpl() {}
+
+  inline Index cols() const { return m_nCols; }
+  inline Index rows() const { return m_nRows; }
+
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+    return m_info;
+  }
+
+  void analyzePattern(const MatrixType& matrix);
+
+  void factorize(const MatrixType& matrix);
+
+  void compute(const MatrixType& matrix);
+
+  template <typename Rhs, typename Dest>
+  void _solve_impl(const MatrixBase<Rhs>& b, MatrixBase<Dest>& dest) const;
+
+  /** Sets the ordering algorithm to use. */
+  void setOrder(SparseOrder_t order) { m_order = order; }
+
+ private:
+  template <typename T>
+  void buildAccelSparseMatrix(const SparseMatrix<T>& a, AccelSparseMatrix& A, std::vector<long>& columnStarts) {
+    const Index nColumnsStarts = a.cols() + 1;
+
+    columnStarts.resize(nColumnsStarts);
+
+    for (Index i = 0; i < nColumnsStarts; i++) columnStarts[i] = a.outerIndexPtr()[i];
+
+    SparseAttributes_t attributes{};
+    attributes.transpose = false;
+    attributes.triangle = m_triType;
+    attributes.kind = m_sparseKind;
+
+    SparseMatrixStructure structure{};
+    structure.attributes = attributes;
+    structure.rowCount = static_cast<int>(a.rows());
+    structure.columnCount = static_cast<int>(a.cols());
+    structure.blockSize = 1;
+    structure.columnStarts = columnStarts.data();
+    structure.rowIndices = const_cast<int*>(a.innerIndexPtr());
+
+    A.structure = structure;
+    A.data = const_cast<T*>(a.valuePtr());
+  }
+
+  void doAnalysis(AccelSparseMatrix& A) {
+    m_numericFactorization.reset(nullptr);
+
+    SparseSymbolicFactorOptions opts{};
+    opts.control = SparseDefaultControl;
+    opts.orderMethod = m_order;
+    opts.order = nullptr;
+    opts.ignoreRowsAndColumns = nullptr;
+    opts.malloc = malloc;
+    opts.free = free;
+    opts.reportError = nullptr;
+
+    m_symbolicFactorization.reset(new SymbolicFactorization(SparseFactor(Solver_, A.structure, opts)));
+
+    SparseStatus_t status = m_symbolicFactorization->status;
+
+    updateInfoStatus(status);
+
+    if (status != SparseStatusOK) m_symbolicFactorization.reset(nullptr);
+  }
+
+  void doFactorization(AccelSparseMatrix& A) {
+    SparseStatus_t status = SparseStatusReleased;
+
+    if (m_symbolicFactorization) {
+      m_numericFactorization.reset(new NumericFactorization(SparseFactor(*m_symbolicFactorization, A)));
+
+      status = m_numericFactorization->status;
+
+      if (status != SparseStatusOK) m_numericFactorization.reset(nullptr);
+    }
+
+    updateInfoStatus(status);
+  }
+
+ protected:
+  void updateInfoStatus(SparseStatus_t status) const {
+    switch (status) {
+      case SparseStatusOK:
+        m_info = Success;
+        break;
+      case SparseFactorizationFailed:
+      case SparseMatrixIsSingular:
+        m_info = NumericalIssue;
+        break;
+      case SparseInternalError:
+      case SparseParameterError:
+      case SparseStatusReleased:
+      default:
+        m_info = InvalidInput;
+        break;
+    }
+  }
+
+  mutable ComputationInfo m_info;
+  Index m_nRows, m_nCols;
+  std::unique_ptr<SymbolicFactorization, SymbolicFactorizationDeleter> m_symbolicFactorization;
+  std::unique_ptr<NumericFactorization, NumericFactorizationDeleter> m_numericFactorization;
+  SparseKind_t m_sparseKind;
+  SparseTriangle_t m_triType;
+  SparseOrder_t m_order;
+};
+
+/** Computes the symbolic and numeric decomposition of matrix \a a */
+template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool EnforceSquare_>
+void AccelerateImpl<MatrixType_, UpLo_, Solver_, EnforceSquare_>::compute(const MatrixType& a) {
+  if (EnforceSquare_) eigen_assert(a.rows() == a.cols());
+
+  m_nRows = a.rows();
+  m_nCols = a.cols();
+
+  AccelSparseMatrix A{};
+  std::vector<long> columnStarts;
+
+  buildAccelSparseMatrix(a, A, columnStarts);
+
+  doAnalysis(A);
+
+  if (m_symbolicFactorization) doFactorization(A);
+
+  m_isInitialized = true;
+}
+
+/** Performs a symbolic decomposition on the sparsity pattern of matrix \a a.
+ *
+ * This function is particularly useful when solving for several problems having the same structure.
+ *
+ * \sa factorize()
+ */
+template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool EnforceSquare_>
+void AccelerateImpl<MatrixType_, UpLo_, Solver_, EnforceSquare_>::analyzePattern(const MatrixType& a) {
+  if (EnforceSquare_) eigen_assert(a.rows() == a.cols());
+
+  m_nRows = a.rows();
+  m_nCols = a.cols();
+
+  AccelSparseMatrix A{};
+  std::vector<long> columnStarts;
+
+  buildAccelSparseMatrix(a, A, columnStarts);
+
+  doAnalysis(A);
+
+  m_isInitialized = true;
+}
+
+/** Performs a numeric decomposition of matrix \a a.
+ *
+ * The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been
+ * performed.
+ *
+ * \sa analyzePattern()
+ */
+template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool EnforceSquare_>
+void AccelerateImpl<MatrixType_, UpLo_, Solver_, EnforceSquare_>::factorize(const MatrixType& a) {
+  eigen_assert(m_symbolicFactorization && "You must first call analyzePattern()");
+  eigen_assert(m_nRows == a.rows() && m_nCols == a.cols());
+
+  if (EnforceSquare_) eigen_assert(a.rows() == a.cols());
+
+  AccelSparseMatrix A{};
+  std::vector<long> columnStarts;
+
+  buildAccelSparseMatrix(a, A, columnStarts);
+
+  doFactorization(A);
+}
+
+template <typename MatrixType_, int UpLo_, SparseFactorization_t Solver_, bool EnforceSquare_>
+template <typename Rhs, typename Dest>
+void AccelerateImpl<MatrixType_, UpLo_, Solver_, EnforceSquare_>::_solve_impl(const MatrixBase<Rhs>& b,
+                                                                              MatrixBase<Dest>& x) const {
+  if (!m_numericFactorization) {
+    m_info = InvalidInput;
+    return;
+  }
+
+  eigen_assert(m_nRows == b.rows());
+  eigen_assert(((b.cols() == 1) || b.outerStride() == b.rows()));
+
+  SparseStatus_t status = SparseStatusOK;
+
+  Scalar* b_ptr = const_cast<Scalar*>(b.derived().data());
+  Scalar* x_ptr = const_cast<Scalar*>(x.derived().data());
+
+  AccelDenseMatrix xmat{};
+  xmat.attributes = SparseAttributes_t();
+  xmat.columnCount = static_cast<int>(x.cols());
+  xmat.rowCount = static_cast<int>(x.rows());
+  xmat.columnStride = xmat.rowCount;
+  xmat.data = x_ptr;
+
+  AccelDenseMatrix bmat{};
+  bmat.attributes = SparseAttributes_t();
+  bmat.columnCount = static_cast<int>(b.cols());
+  bmat.rowCount = static_cast<int>(b.rows());
+  bmat.columnStride = bmat.rowCount;
+  bmat.data = b_ptr;
+
+  SparseSolve(*m_numericFactorization, bmat, xmat);
+
+  updateInfoStatus(status);
+}
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_ACCELERATESUPPORT_H
diff --git a/inst/include/Eigen/src/AccelerateSupport/InternalHeaderCheck.h b/inst/include/Eigen/src/AccelerateSupport/InternalHeaderCheck.h
new file mode 100644
index 00000000..69bcff50
--- /dev/null
+++ b/inst/include/Eigen/src/AccelerateSupport/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_ACCELERATESUPPORT_MODULE_H
+#error "Please include Eigen/AccelerateSupport instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/Cholesky/InternalHeaderCheck.h b/inst/include/Eigen/src/Cholesky/InternalHeaderCheck.h
new file mode 100644
index 00000000..5de2b219
--- /dev/null
+++ b/inst/include/Eigen/src/Cholesky/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_CHOLESKY_MODULE_H
+#error "Please include Eigen/Cholesky instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/Cholesky/LDLT.h b/inst/include/Eigen/src/Cholesky/LDLT.h
index abd30bd9..b1d801d3 100644
--- a/inst/include/Eigen/src/Cholesky/LDLT.h
+++ b/inst/include/Eigen/src/Cholesky/LDLT.h
@@ -13,296 +13,314 @@
 #ifndef EIGEN_LDLT_H
 #define EIGEN_LDLT_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
-  template<typename MatrixType, int UpLo> struct LDLT_Traits;
+template <typename MatrixType_, int UpLo_>
+struct traits<LDLT<MatrixType_, UpLo_> > : traits<MatrixType_> {
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
+  enum { Flags = 0 };
+};
 
-  // PositiveSemiDef means positive semi-definite and non-zero; same for NegativeSemiDef
-  enum SignMatrix { PositiveSemiDef, NegativeSemiDef, ZeroSign, Indefinite };
-}
+template <typename MatrixType, int UpLo>
+struct LDLT_Traits;
+
+// PositiveSemiDef means positive semi-definite and non-zero; same for NegativeSemiDef
+enum SignMatrix { PositiveSemiDef, NegativeSemiDef, ZeroSign, Indefinite };
+}  // namespace internal
 
 /** \ingroup Cholesky_Module
-  *
-  * \class LDLT
-  *
-  * \brief Robust Cholesky decomposition of a matrix with pivoting
-  *
-  * \param MatrixType the type of the matrix of which to compute the LDL^T Cholesky decomposition
-  * \param UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
-  *             The other triangular part won't be read.
-  *
-  * Perform a robust Cholesky decomposition of a positive semidefinite or negative semidefinite
-  * matrix \f$ A \f$ such that \f$ A =  P^TLDL^*P \f$, where P is a permutation matrix, L
-  * is lower triangular with a unit diagonal and D is a diagonal matrix.
-  *
-  * The decomposition uses pivoting to ensure stability, so that L will have
-  * zeros in the bottom right rank(A) - n submatrix. Avoiding the square root
-  * on D also stabilizes the computation.
-  *
-  * Remember that Cholesky decompositions are not rank-revealing. Also, do not use a Cholesky
-  * decomposition to determine whether a system of equations has a solution.
-  *
-  * \sa MatrixBase::ldlt(), class LLT
-  */
-template<typename _MatrixType, int _UpLo> class LDLT
-{
-  public:
-    typedef _MatrixType MatrixType;
-    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options & ~RowMajorBit, // these are the options for the TmpMatrixType, we need a ColMajor matrix here!
-      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-      UpLo = _UpLo
-    };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef Matrix<Scalar, RowsAtCompileTime, 1, Options, MaxRowsAtCompileTime, 1> TmpMatrixType;
-
-    typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
-    typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime> PermutationType;
-
-    typedef internal::LDLT_Traits<MatrixType,UpLo> Traits;
-
-    /** \brief Default Constructor.
-      *
-      * The default constructor is useful in cases in which the user intends to
-      * perform decompositions via LDLT::compute(const MatrixType&).
-      */
-    LDLT() 
-      : m_matrix(), 
-        m_transpositions(), 
-        m_sign(internal::ZeroSign),
-        m_isInitialized(false) 
-    {}
-
-    /** \brief Default Constructor with memory preallocation
-      *
-      * Like the default constructor but with preallocation of the internal data
-      * according to the specified problem \a size.
-      * \sa LDLT()
-      */
-    LDLT(Index size)
+ *
+ * \class LDLT
+ *
+ * \brief Robust Cholesky decomposition of a matrix with pivoting
+ *
+ * \tparam MatrixType_ the type of the matrix of which to compute the LDL^T Cholesky decomposition
+ * \tparam UpLo_ the triangular part that will be used for the decomposition: Lower (default) or Upper.
+ *             The other triangular part won't be read.
+ *
+ * Perform a robust Cholesky decomposition of a positive semidefinite or negative semidefinite
+ * matrix \f$ A \f$ such that \f$ A =  P^TLDL^*P \f$, where P is a permutation matrix, L
+ * is lower triangular with a unit diagonal and D is a diagonal matrix.
+ *
+ * The decomposition uses pivoting to ensure stability, so that D will have
+ * zeros in the bottom right rank(A) - n submatrix. Avoiding the square root
+ * on D also stabilizes the computation.
+ *
+ * Remember that Cholesky decompositions are not rank-revealing. Also, do not use a Cholesky
+ * decomposition to determine whether a system of equations has a solution.
+ *
+ * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+ *
+ * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt(), class LLT
+ */
+template <typename MatrixType_, int UpLo_>
+class LDLT : public SolverBase<LDLT<MatrixType_, UpLo_> > {
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef SolverBase<LDLT> Base;
+  friend class SolverBase<LDLT>;
+
+  EIGEN_GENERIC_PUBLIC_INTERFACE(LDLT)
+  enum {
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
+    UpLo = UpLo_
+  };
+  typedef Matrix<Scalar, RowsAtCompileTime, 1, 0, MaxRowsAtCompileTime, 1> TmpMatrixType;
+
+  typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
+  typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime> PermutationType;
+
+  typedef internal::LDLT_Traits<MatrixType, UpLo> Traits;
+
+  /** \brief Default Constructor.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via LDLT::compute(const MatrixType&).
+   */
+  LDLT() : m_matrix(), m_transpositions(), m_sign(internal::ZeroSign), m_isInitialized(false) {}
+
+  /** \brief Default Constructor with memory preallocation
+   *
+   * Like the default constructor but with preallocation of the internal data
+   * according to the specified problem \a size.
+   * \sa LDLT()
+   */
+  explicit LDLT(Index size)
       : m_matrix(size, size),
         m_transpositions(size),
         m_temporary(size),
         m_sign(internal::ZeroSign),
-        m_isInitialized(false)
-    {}
-
-    /** \brief Constructor with decomposition
-      *
-      * This calculates the decomposition for the input \a matrix.
-      * \sa LDLT(Index size)
-      */
-    LDLT(const MatrixType& matrix)
+        m_isInitialized(false) {}
+
+  /** \brief Constructor with decomposition
+   *
+   * This calculates the decomposition for the input \a matrix.
+   *
+   * \sa LDLT(Index size)
+   */
+  template <typename InputType>
+  explicit LDLT(const EigenBase<InputType>& matrix)
       : m_matrix(matrix.rows(), matrix.cols()),
         m_transpositions(matrix.rows()),
         m_temporary(matrix.rows()),
         m_sign(internal::ZeroSign),
-        m_isInitialized(false)
-    {
-      compute(matrix);
-    }
-
-    /** Clear any existing decomposition
-     * \sa rankUpdate(w,sigma)
-     */
-    void setZero()
-    {
-      m_isInitialized = false;
-    }
-
-    /** \returns a view of the upper triangular matrix U */
-    inline typename Traits::MatrixU matrixU() const
-    {
-      eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      return Traits::getU(m_matrix);
-    }
-
-    /** \returns a view of the lower triangular matrix L */
-    inline typename Traits::MatrixL matrixL() const
-    {
-      eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      return Traits::getL(m_matrix);
-    }
-
-    /** \returns the permutation matrix P as a transposition sequence.
-      */
-    inline const TranspositionType& transpositionsP() const
-    {
-      eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      return m_transpositions;
-    }
-
-    /** \returns the coefficients of the diagonal matrix D */
-    inline Diagonal<const MatrixType> vectorD() const
-    {
-      eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      return m_matrix.diagonal();
-    }
+        m_isInitialized(false) {
+    compute(matrix.derived());
+  }
 
-    /** \returns true if the matrix is positive (semidefinite) */
-    inline bool isPositive() const
-    {
-      eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      return m_sign == internal::PositiveSemiDef || m_sign == internal::ZeroSign;
-    }
-    
-    #ifdef EIGEN2_SUPPORT
-    inline bool isPositiveDefinite() const
-    {
-      return isPositive();
-    }
-    #endif
+  /** \brief Constructs a LDLT factorization from a given matrix
+   *
+   * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c
+   * MatrixType is a Eigen::Ref.
+   *
+   * \sa LDLT(const EigenBase&)
+   */
+  template <typename InputType>
+  explicit LDLT(EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()),
+        m_transpositions(matrix.rows()),
+        m_temporary(matrix.rows()),
+        m_sign(internal::ZeroSign),
+        m_isInitialized(false) {
+    compute(matrix.derived());
+  }
 
-    /** \returns true if the matrix is negative (semidefinite) */
-    inline bool isNegative(void) const
-    {
-      eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      return m_sign == internal::NegativeSemiDef || m_sign == internal::ZeroSign;
-    }
+  /** Clear any existing decomposition
+   * \sa rankUpdate(w,sigma)
+   */
+  void setZero() { m_isInitialized = false; }
 
-    /** \returns a solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * This function also supports in-place solves using the syntax <tt>x = decompositionObject.solve(x)</tt> .
-      *
-      * \note_about_checking_solutions
-      *
-      * More precisely, this method solves \f$ A x = b \f$ using the decomposition \f$ A = P^T L D L^* P \f$
-      * by solving the systems \f$ P^T y_1 = b \f$, \f$ L y_2 = y_1 \f$, \f$ D y_3 = y_2 \f$, 
-      * \f$ L^* y_4 = y_3 \f$ and \f$ P x = y_4 \f$ in succession. If the matrix \f$ A \f$ is singular, then
-      * \f$ D \f$ will also be singular (all the other matrices are invertible). In that case, the
-      * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function
-      * computes the least-square solution of \f$ A x = b \f$ is \f$ A \f$ is singular.
-      *
-      * \sa MatrixBase::ldlt()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<LDLT, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      eigen_assert(m_matrix.rows()==b.rows()
-                && "LDLT::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<LDLT, Rhs>(*this, b.derived());
-    }
+  /** \returns a view of the upper triangular matrix U */
+  inline typename Traits::MatrixU matrixU() const {
+    eigen_assert(m_isInitialized && "LDLT is not initialized.");
+    return Traits::getU(m_matrix);
+  }
 
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived, typename ResultType>
-    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
-    {
-      *result = this->solve(b);
-      return true;
-    }
-    #endif
+  /** \returns a view of the lower triangular matrix L */
+  inline typename Traits::MatrixL matrixL() const {
+    eigen_assert(m_isInitialized && "LDLT is not initialized.");
+    return Traits::getL(m_matrix);
+  }
 
-    template<typename Derived>
-    bool solveInPlace(MatrixBase<Derived> &bAndX) const;
+  /** \returns the permutation matrix P as a transposition sequence.
+   */
+  inline const TranspositionType& transpositionsP() const {
+    eigen_assert(m_isInitialized && "LDLT is not initialized.");
+    return m_transpositions;
+  }
 
-    LDLT& compute(const MatrixType& matrix);
+  /** \returns the coefficients of the diagonal matrix D */
+  inline Diagonal<const MatrixType> vectorD() const {
+    eigen_assert(m_isInitialized && "LDLT is not initialized.");
+    return m_matrix.diagonal();
+  }
 
-    template <typename Derived>
-    LDLT& rankUpdate(const MatrixBase<Derived>& w, const RealScalar& alpha=1);
+  /** \returns true if the matrix is positive (semidefinite) */
+  inline bool isPositive() const {
+    eigen_assert(m_isInitialized && "LDLT is not initialized.");
+    return m_sign == internal::PositiveSemiDef || m_sign == internal::ZeroSign;
+  }
 
-    /** \returns the internal LDLT decomposition matrix
-      *
-      * TODO: document the storage layout
-      */
-    inline const MatrixType& matrixLDLT() const
-    {
-      eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      return m_matrix;
-    }
+  /** \returns true if the matrix is negative (semidefinite) */
+  inline bool isNegative(void) const {
+    eigen_assert(m_isInitialized && "LDLT is not initialized.");
+    return m_sign == internal::NegativeSemiDef || m_sign == internal::ZeroSign;
+  }
 
-    MatrixType reconstructedMatrix() const;
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  /** \returns a solution x of \f$ A x = b \f$ using the current decomposition of A.
+   *
+   * This function also supports in-place solves using the syntax <tt>x = decompositionObject.solve(x)</tt> .
+   *
+   * \note_about_checking_solutions
+   *
+   * More precisely, this method solves \f$ A x = b \f$ using the decomposition \f$ A = P^T L D L^* P \f$
+   * by solving the systems \f$ P^T y_1 = b \f$, \f$ L y_2 = y_1 \f$, \f$ D y_3 = y_2 \f$,
+   * \f$ L^* y_4 = y_3 \f$ and \f$ P x = y_4 \f$ in succession. If the matrix \f$ A \f$ is singular, then
+   * \f$ D \f$ will also be singular (all the other matrices are invertible). In that case, the
+   * least-square solution of \f$ D y_3 = y_2 \f$ is computed. This does not mean that this function
+   * computes the least-square solution of \f$ A x = b \f$ if \f$ A \f$ is singular.
+   *
+   * \sa MatrixBase::ldlt(), SelfAdjointView::ldlt()
+   */
+  template <typename Rhs>
+  inline const Solve<LDLT, Rhs> solve(const MatrixBase<Rhs>& b) const;
+#endif
+
+  template <typename Derived>
+  bool solveInPlace(MatrixBase<Derived>& bAndX) const;
+
+  template <typename InputType>
+  LDLT& compute(const EigenBase<InputType>& matrix);
+
+  /** \returns an estimate of the reciprocal condition number of the matrix of
+   *  which \c *this is the LDLT decomposition.
+   */
+  RealScalar rcond() const {
+    eigen_assert(m_isInitialized && "LDLT is not initialized.");
+    return internal::rcond_estimate_helper(m_l1_norm, *this);
+  }
 
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
+  template <typename Derived>
+  LDLT& rankUpdate(const MatrixBase<Derived>& w, const RealScalar& alpha = 1);
 
-    /** \brief Reports whether previous computation was successful.
-      *
-      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the matrix.appears to be negative.
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "LDLT is not initialized.");
-      return Success;
-    }
+  /** \returns the internal LDLT decomposition matrix
+   *
+   * TODO: document the storage layout
+   */
+  inline const MatrixType& matrixLDLT() const {
+    eigen_assert(m_isInitialized && "LDLT is not initialized.");
+    return m_matrix;
+  }
 
-  protected:
-    
-    static void check_template_parameters()
-    {
-      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
-    }
+  MatrixType reconstructedMatrix() const;
+
+  /** \returns the adjoint of \c *this, that is, a const reference to the decomposition itself as the underlying matrix
+   * is self-adjoint.
+   *
+   * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as:
+   * \code x = decomposition.adjoint().solve(b) \endcode
+   */
+  const LDLT& adjoint() const { return *this; }
+
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
+
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful,
+   *          \c NumericalIssue if the factorization failed because of a zero pivot.
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "LDLT is not initialized.");
+    return m_info;
+  }
 
-    /** \internal
-      * Used to compute and store the Cholesky decomposition A = L D L^* = U^* D U.
-      * The strict upper part is used during the decomposition, the strict lower
-      * part correspond to the coefficients of L (its diagonal is equal to 1 and
-      * is not stored), and the diagonal entries correspond to D.
-      */
-    MatrixType m_matrix;
-    TranspositionType m_transpositions;
-    TmpMatrixType m_temporary;
-    internal::SignMatrix m_sign;
-    bool m_isInitialized;
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  template <typename RhsType, typename DstType>
+  void _solve_impl(const RhsType& rhs, DstType& dst) const;
+
+  template <bool Conjugate, typename RhsType, typename DstType>
+  void _solve_impl_transposed(const RhsType& rhs, DstType& dst) const;
+#endif
+
+ protected:
+  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+
+  /** \internal
+   * Used to compute and store the Cholesky decomposition A = L D L^* = U^* D U.
+   * The strict upper part is used during the decomposition, the strict lower
+   * part correspond to the coefficients of L (its diagonal is equal to 1 and
+   * is not stored), and the diagonal entries correspond to D.
+   */
+  MatrixType m_matrix;
+  RealScalar m_l1_norm;
+  TranspositionType m_transpositions;
+  TmpMatrixType m_temporary;
+  internal::SignMatrix m_sign;
+  bool m_isInitialized;
+  ComputationInfo m_info;
 };
 
 namespace internal {
 
-template<int UpLo> struct ldlt_inplace;
+template <int UpLo>
+struct ldlt_inplace;
 
-template<> struct ldlt_inplace<Lower>
-{
-  template<typename MatrixType, typename TranspositionType, typename Workspace>
-  static bool unblocked(MatrixType& mat, TranspositionType& transpositions, Workspace& temp, SignMatrix& sign)
-  {
+template <>
+struct ldlt_inplace<Lower> {
+  template <typename MatrixType, typename TranspositionType, typename Workspace>
+  static bool unblocked(MatrixType& mat, TranspositionType& transpositions, Workspace& temp, SignMatrix& sign) {
     using std::abs;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    eigen_assert(mat.rows()==mat.cols());
+    typedef typename TranspositionType::StorageIndex IndexType;
+    eigen_assert(mat.rows() == mat.cols());
     const Index size = mat.rows();
+    bool found_zero_pivot = false;
+    bool ret = true;
 
-    if (size <= 1)
-    {
+    if (size <= 1) {
       transpositions.setIdentity();
-      if (numext::real(mat.coeff(0,0)) > 0) sign = PositiveSemiDef;
-      else if (numext::real(mat.coeff(0,0)) < 0) sign = NegativeSemiDef;
-      else sign = ZeroSign;
+      if (size == 0)
+        sign = ZeroSign;
+      else if (numext::real(mat.coeff(0, 0)) > static_cast<RealScalar>(0))
+        sign = PositiveSemiDef;
+      else if (numext::real(mat.coeff(0, 0)) < static_cast<RealScalar>(0))
+        sign = NegativeSemiDef;
+      else
+        sign = ZeroSign;
       return true;
     }
 
-    for (Index k = 0; k < size; ++k)
-    {
+    for (Index k = 0; k < size; ++k) {
       // Find largest diagonal element
       Index index_of_biggest_in_corner;
-      mat.diagonal().tail(size-k).cwiseAbs().maxCoeff(&index_of_biggest_in_corner);
+      mat.diagonal().tail(size - k).cwiseAbs().maxCoeff(&index_of_biggest_in_corner);
       index_of_biggest_in_corner += k;
 
-      transpositions.coeffRef(k) = index_of_biggest_in_corner;
-      if(k != index_of_biggest_in_corner)
-      {
+      transpositions.coeffRef(k) = IndexType(index_of_biggest_in_corner);
+      if (k != index_of_biggest_in_corner) {
         // apply the transposition while taking care to consider only
         // the lower triangular part
-        Index s = size-index_of_biggest_in_corner-1; // trailing size after the biggest element
+        Index s = size - index_of_biggest_in_corner - 1;  // trailing size after the biggest element
         mat.row(k).head(k).swap(mat.row(index_of_biggest_in_corner).head(k));
         mat.col(k).tail(s).swap(mat.col(index_of_biggest_in_corner).tail(s));
-        std::swap(mat.coeffRef(k,k),mat.coeffRef(index_of_biggest_in_corner,index_of_biggest_in_corner));
-        for(int i=k+1;i<index_of_biggest_in_corner;++i)
-        {
-          Scalar tmp = mat.coeffRef(i,k);
-          mat.coeffRef(i,k) = numext::conj(mat.coeffRef(index_of_biggest_in_corner,i));
-          mat.coeffRef(index_of_biggest_in_corner,i) = numext::conj(tmp);
+        std::swap(mat.coeffRef(k, k), mat.coeffRef(index_of_biggest_in_corner, index_of_biggest_in_corner));
+        for (Index i = k + 1; i < index_of_biggest_in_corner; ++i) {
+          Scalar tmp = mat.coeffRef(i, k);
+          mat.coeffRef(i, k) = numext::conj(mat.coeffRef(index_of_biggest_in_corner, i));
+          mat.coeffRef(index_of_biggest_in_corner, i) = numext::conj(tmp);
         }
-        if(NumTraits<Scalar>::IsComplex)
-          mat.coeffRef(index_of_biggest_in_corner,k) = numext::conj(mat.coeff(index_of_biggest_in_corner,k));
+        if (NumTraits<Scalar>::IsComplex)
+          mat.coeffRef(index_of_biggest_in_corner, k) = numext::conj(mat.coeff(index_of_biggest_in_corner, k));
       }
 
       // partition the matrix:
@@ -310,37 +328,57 @@ template<> struct ldlt_inplace<Lower>
       // lu  = A10 | A11 |  -
       //       A20 | A21 | A22
       Index rs = size - k - 1;
-      Block<MatrixType,Dynamic,1> A21(mat,k+1,k,rs,1);
-      Block<MatrixType,1,Dynamic> A10(mat,k,0,1,k);
-      Block<MatrixType,Dynamic,Dynamic> A20(mat,k+1,0,rs,k);
+      Block<MatrixType, Dynamic, 1> A21(mat, k + 1, k, rs, 1);
+      Block<MatrixType, 1, Dynamic> A10(mat, k, 0, 1, k);
+      Block<MatrixType, Dynamic, Dynamic> A20(mat, k + 1, 0, rs, k);
 
-      if(k>0)
-      {
+      if (k > 0) {
         temp.head(k) = mat.diagonal().real().head(k).asDiagonal() * A10.adjoint();
-        mat.coeffRef(k,k) -= (A10 * temp.head(k)).value();
-        if(rs>0)
-          A21.noalias() -= A20 * temp.head(k);
+        mat.coeffRef(k, k) -= (A10 * temp.head(k)).value();
+        if (rs > 0) A21.noalias() -= A20 * temp.head(k);
       }
-      
+
       // In some previous versions of Eigen (e.g., 3.2.1), the scaling was omitted if the pivot
-      // was smaller than the cutoff value. However, soince LDLT is not rank-revealing
-      // we should only make sure we do not introduce INF or NaN values.
-      // LAPACK also uses 0 as the cutoff value.
-      RealScalar realAkk = numext::real(mat.coeffRef(k,k));
-      if((rs>0) && (abs(realAkk) > RealScalar(0)))
+      // was smaller than the cutoff value. However, since LDLT is not rank-revealing
+      // we should only make sure that we do not introduce INF or NaN values.
+      // Remark that LAPACK also uses 0 as the cutoff value.
+      RealScalar realAkk = numext::real(mat.coeffRef(k, k));
+      bool pivot_is_valid = (abs(realAkk) > RealScalar(0));
+
+      if (k == 0 && !pivot_is_valid) {
+        // The entire diagonal is zero, there is nothing more to do
+        // except filling the transpositions, and checking whether the matrix is zero.
+        sign = ZeroSign;
+        for (Index j = 0; j < size; ++j) {
+          transpositions.coeffRef(j) = IndexType(j);
+          ret = ret && (mat.col(j).tail(size - j - 1).array() == Scalar(0)).all();
+        }
+        return ret;
+      }
+
+      if ((rs > 0) && pivot_is_valid)
         A21 /= realAkk;
+      else if (rs > 0)
+        ret = ret && (A21.array() == Scalar(0)).all();
+
+      if (found_zero_pivot && pivot_is_valid)
+        ret = false;  // factorization failed
+      else if (!pivot_is_valid)
+        found_zero_pivot = true;
 
       if (sign == PositiveSemiDef) {
-        if (realAkk < 0) sign = Indefinite;
+        if (realAkk < static_cast<RealScalar>(0)) sign = Indefinite;
       } else if (sign == NegativeSemiDef) {
-        if (realAkk > 0) sign = Indefinite;
+        if (realAkk > static_cast<RealScalar>(0)) sign = Indefinite;
       } else if (sign == ZeroSign) {
-        if (realAkk > 0) sign = PositiveSemiDef;
-        else if (realAkk < 0) sign = NegativeSemiDef;
+        if (realAkk > static_cast<RealScalar>(0))
+          sign = PositiveSemiDef;
+        else if (realAkk < static_cast<RealScalar>(0))
+          sign = NegativeSemiDef;
       }
     }
 
-    return true;
+    return ret;
   }
 
   // Reference for the algorithm: Davis and Hager, "Multiple Rank
@@ -350,108 +388,116 @@ template<> struct ldlt_inplace<Lower>
   // original matrix is not of full rank.
   // Here only rank-1 updates are implemented, to reduce the
   // requirement for intermediate storage and improve accuracy
-  template<typename MatrixType, typename WDerived>
-  static bool updateInPlace(MatrixType& mat, MatrixBase<WDerived>& w, const typename MatrixType::RealScalar& sigma=1)
-  {
+  template <typename MatrixType, typename WDerived>
+  static bool updateInPlace(MatrixType& mat, MatrixBase<WDerived>& w,
+                            const typename MatrixType::RealScalar& sigma = 1) {
     using numext::isfinite;
     typedef typename MatrixType::Scalar Scalar;
     typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
 
     const Index size = mat.rows();
-    eigen_assert(mat.cols() == size && w.size()==size);
+    eigen_assert(mat.cols() == size && w.size() == size);
 
     RealScalar alpha = 1;
 
     // Apply the update
-    for (Index j = 0; j < size; j++)
-    {
+    for (Index j = 0; j < size; j++) {
       // Check for termination due to an original decomposition of low-rank
-      if (!(isfinite)(alpha))
-        break;
+      if (!(isfinite)(alpha)) break;
 
       // Update the diagonal terms
-      RealScalar dj = numext::real(mat.coeff(j,j));
+      RealScalar dj = numext::real(mat.coeff(j, j));
       Scalar wj = w.coeff(j);
-      RealScalar swj2 = sigma*numext::abs2(wj);
-      RealScalar gamma = dj*alpha + swj2;
-
-      mat.coeffRef(j,j) += swj2/alpha;
-      alpha += swj2/dj;
+      RealScalar swj2 = sigma * numext::abs2(wj);
+      RealScalar gamma = dj * alpha + swj2;
 
+      mat.coeffRef(j, j) += swj2 / alpha;
+      alpha += swj2 / dj;
 
       // Update the terms of L
-      Index rs = size-j-1;
+      Index rs = size - j - 1;
       w.tail(rs) -= wj * mat.col(j).tail(rs);
-      if(gamma != 0)
-        mat.col(j).tail(rs) += (sigma*numext::conj(wj)/gamma)*w.tail(rs);
+      if (!numext::is_exactly_zero(gamma)) mat.col(j).tail(rs) += (sigma * numext::conj(wj) / gamma) * w.tail(rs);
     }
     return true;
   }
 
-  template<typename MatrixType, typename TranspositionType, typename Workspace, typename WType>
-  static bool update(MatrixType& mat, const TranspositionType& transpositions, Workspace& tmp, const WType& w, const typename MatrixType::RealScalar& sigma=1)
-  {
+  template <typename MatrixType, typename TranspositionType, typename Workspace, typename WType>
+  static bool update(MatrixType& mat, const TranspositionType& transpositions, Workspace& tmp, const WType& w,
+                     const typename MatrixType::RealScalar& sigma = 1) {
     // Apply the permutation to the input w
     tmp = transpositions * w;
 
-    return ldlt_inplace<Lower>::updateInPlace(mat,tmp,sigma);
+    return ldlt_inplace<Lower>::updateInPlace(mat, tmp, sigma);
   }
 };
 
-template<> struct ldlt_inplace<Upper>
-{
-  template<typename MatrixType, typename TranspositionType, typename Workspace>
-  static EIGEN_STRONG_INLINE bool unblocked(MatrixType& mat, TranspositionType& transpositions, Workspace& temp, SignMatrix& sign)
-  {
+template <>
+struct ldlt_inplace<Upper> {
+  template <typename MatrixType, typename TranspositionType, typename Workspace>
+  static EIGEN_STRONG_INLINE bool unblocked(MatrixType& mat, TranspositionType& transpositions, Workspace& temp,
+                                            SignMatrix& sign) {
     Transpose<MatrixType> matt(mat);
     return ldlt_inplace<Lower>::unblocked(matt, transpositions, temp, sign);
   }
 
-  template<typename MatrixType, typename TranspositionType, typename Workspace, typename WType>
-  static EIGEN_STRONG_INLINE bool update(MatrixType& mat, TranspositionType& transpositions, Workspace& tmp, WType& w, const typename MatrixType::RealScalar& sigma=1)
-  {
+  template <typename MatrixType, typename TranspositionType, typename Workspace, typename WType>
+  static EIGEN_STRONG_INLINE bool update(MatrixType& mat, TranspositionType& transpositions, Workspace& tmp, WType& w,
+                                         const typename MatrixType::RealScalar& sigma = 1) {
     Transpose<MatrixType> matt(mat);
     return ldlt_inplace<Lower>::update(matt, transpositions, tmp, w.conjugate(), sigma);
   }
 };
 
-template<typename MatrixType> struct LDLT_Traits<MatrixType,Lower>
-{
+template <typename MatrixType>
+struct LDLT_Traits<MatrixType, Lower> {
   typedef const TriangularView<const MatrixType, UnitLower> MatrixL;
   typedef const TriangularView<const typename MatrixType::AdjointReturnType, UnitUpper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m; }
-  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); }
 };
 
-template<typename MatrixType> struct LDLT_Traits<MatrixType,Upper>
-{
+template <typename MatrixType>
+struct LDLT_Traits<MatrixType, Upper> {
   typedef const TriangularView<const typename MatrixType::AdjointReturnType, UnitLower> MatrixL;
   typedef const TriangularView<const MatrixType, UnitUpper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m.adjoint(); }
-  static inline MatrixU getU(const MatrixType& m) { return m; }
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m.adjoint()); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m); }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
 /** Compute / recompute the LDLT decomposition A = L D L^* = U^* D U of \a matrix
-  */
-template<typename MatrixType, int _UpLo>
-LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const MatrixType& a)
-{
-  check_template_parameters();
-  
-  eigen_assert(a.rows()==a.cols());
+ */
+template <typename MatrixType, int UpLo_>
+template <typename InputType>
+LDLT<MatrixType, UpLo_>& LDLT<MatrixType, UpLo_>::compute(const EigenBase<InputType>& a) {
+  eigen_assert(a.rows() == a.cols());
   const Index size = a.rows();
 
-  m_matrix = a;
+  m_matrix = a.derived();
+
+  // Compute matrix L1 norm = max abs column sum.
+  m_l1_norm = RealScalar(0);
+  // TODO move this code to SelfAdjointView
+  for (Index col = 0; col < size; ++col) {
+    RealScalar abs_col_sum;
+    if (UpLo_ == Lower)
+      abs_col_sum =
+          m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>();
+    else
+      abs_col_sum =
+          m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>();
+    if (abs_col_sum > m_l1_norm) m_l1_norm = abs_col_sum;
+  }
 
   m_transpositions.resize(size);
   m_isInitialized = false;
   m_temporary.resize(size);
   m_sign = internal::ZeroSign;
 
-  internal::ldlt_inplace<UpLo>::unblocked(m_matrix, m_transpositions, m_temporary, m_sign);
+  m_info = internal::ldlt_inplace<UpLo>::unblocked(m_matrix, m_transpositions, m_temporary, m_sign) ? Success
+                                                                                                    : NumericalIssue;
 
   m_isInitialized = true;
   return *this;
@@ -459,27 +505,24 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::compute(const MatrixType& a)
 
 /** Update the LDLT decomposition:  given A = L D L^T, efficiently compute the decomposition of A + sigma w w^T.
  * \param w a vector to be incorporated into the decomposition.
- * \param sigma a scalar, +1 for updates and -1 for "downdates," which correspond to removing previously-added column vectors. Optional; default value is +1.
- * \sa setZero()
-  */
-template<typename MatrixType, int _UpLo>
-template<typename Derived>
-LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Derived>& w, const typename LDLT<MatrixType,_UpLo>::RealScalar& sigma)
-{
+ * \param sigma a scalar, +1 for updates and -1 for "downdates," which correspond to removing previously-added column
+ * vectors. Optional; default value is +1. \sa setZero()
+ */
+template <typename MatrixType, int UpLo_>
+template <typename Derived>
+LDLT<MatrixType, UpLo_>& LDLT<MatrixType, UpLo_>::rankUpdate(
+    const MatrixBase<Derived>& w, const typename LDLT<MatrixType, UpLo_>::RealScalar& sigma) {
+  typedef typename TranspositionType::StorageIndex IndexType;
   const Index size = w.rows();
-  if (m_isInitialized)
-  {
-    eigen_assert(m_matrix.rows()==size);
-  }
-  else
-  {    
-    m_matrix.resize(size,size);
+  if (m_isInitialized) {
+    eigen_assert(m_matrix.rows() == size);
+  } else {
+    m_matrix.resize(size, size);
     m_matrix.setZero();
     m_transpositions.resize(size);
-    for (Index i = 0; i < size; i++)
-      m_transpositions.coeffRef(i) = i;
+    for (Index i = 0; i < size; i++) m_transpositions.coeffRef(i) = IndexType(i);
     m_temporary.resize(size);
-    m_sign = sigma>=0 ? internal::PositiveSemiDef : internal::NegativeSemiDef;
+    m_sign = sigma >= 0 ? internal::PositiveSemiDef : internal::NegativeSemiDef;
     m_isInitialized = true;
   }
 
@@ -488,71 +531,68 @@ LDLT<MatrixType,_UpLo>& LDLT<MatrixType,_UpLo>::rankUpdate(const MatrixBase<Deri
   return *this;
 }
 
-namespace internal {
-template<typename _MatrixType, int _UpLo, typename Rhs>
-struct solve_retval<LDLT<_MatrixType,_UpLo>, Rhs>
-  : solve_retval_base<LDLT<_MatrixType,_UpLo>, Rhs>
-{
-  typedef LDLT<_MatrixType,_UpLo> LDLTType;
-  EIGEN_MAKE_SOLVE_HELPERS(LDLTType,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    eigen_assert(rhs().rows() == dec().matrixLDLT().rows());
-    // dst = P b
-    dst = dec().transpositionsP() * rhs();
-
-    // dst = L^-1 (P b)
-    dec().matrixL().solveInPlace(dst);
-
-    // dst = D^-1 (L^-1 P b)
-    // more precisely, use pseudo-inverse of D (see bug 241)
-    using std::abs;
-    using std::max;
-    typedef typename LDLTType::MatrixType MatrixType;
-    typedef typename LDLTType::RealScalar RealScalar;
-    const typename Diagonal<const MatrixType>::RealReturnType vectorD(dec().vectorD());
-    // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon
-    // as motivated by LAPACK's xGELSS:
-    // RealScalar tolerance = (max)(vectorD.array().abs().maxCoeff() *NumTraits<RealScalar>::epsilon(),RealScalar(1) / NumTraits<RealScalar>::highest());
-    // However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest
-    // diagonal element is not well justified and to numerical issues in some cases.
-    // Moreover, Lapack's xSYTRS routines use 0 for the tolerance.
-    RealScalar tolerance = RealScalar(1) / NumTraits<RealScalar>::highest();
-    
-    for (Index i = 0; i < vectorD.size(); ++i) {
-      if(abs(vectorD(i)) > tolerance)
-        dst.row(i) /= vectorD(i);
-      else
-        dst.row(i).setZero();
-    }
-
-    // dst = L^-T (D^-1 L^-1 P b)
-    dec().matrixU().solveInPlace(dst);
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template <typename MatrixType_, int UpLo_>
+template <typename RhsType, typename DstType>
+void LDLT<MatrixType_, UpLo_>::_solve_impl(const RhsType& rhs, DstType& dst) const {
+  _solve_impl_transposed<true>(rhs, dst);
+}
 
-    // dst = P^-1 (L^-T D^-1 L^-1 P b) = A^-1 b
-    dst = dec().transpositionsP().transpose() * dst;
+template <typename MatrixType_, int UpLo_>
+template <bool Conjugate, typename RhsType, typename DstType>
+void LDLT<MatrixType_, UpLo_>::_solve_impl_transposed(const RhsType& rhs, DstType& dst) const {
+  // dst = P b
+  dst = m_transpositions * rhs;
+
+  // dst = L^-1 (P b)
+  // dst = L^-*T (P b)
+  matrixL().template conjugateIf<!Conjugate>().solveInPlace(dst);
+
+  // dst = D^-* (L^-1 P b)
+  // dst = D^-1 (L^-*T P b)
+  // more precisely, use pseudo-inverse of D (see bug 241)
+  using std::abs;
+  const typename Diagonal<const MatrixType>::RealReturnType vecD(vectorD());
+  // In some previous versions, tolerance was set to the max of 1/highest (or rather numeric_limits::min())
+  // and the maximal diagonal entry * epsilon as motivated by LAPACK's xGELSS:
+  // RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits<RealScalar>::epsilon(),RealScalar(1)
+  // / NumTraits<RealScalar>::highest()); However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the
+  // highest diagonal element is not well justified and leads to numerical issues in some cases. Moreover, Lapack's
+  // xSYTRS routines use 0 for the tolerance. Using numeric_limits::min() gives us more robustness to denormals.
+  RealScalar tolerance = (std::numeric_limits<RealScalar>::min)();
+  for (Index i = 0; i < vecD.size(); ++i) {
+    if (abs(vecD(i)) > tolerance)
+      dst.row(i) /= vecD(i);
+    else
+      dst.row(i).setZero();
   }
-};
+
+  // dst = L^-* (D^-* L^-1 P b)
+  // dst = L^-T (D^-1 L^-*T P b)
+  matrixL().transpose().template conjugateIf<Conjugate>().solveInPlace(dst);
+
+  // dst = P^T (L^-* D^-* L^-1 P b) = A^-1 b
+  // dst = P^-T (L^-T D^-1 L^-*T P b) = A^-1 b
+  dst = m_transpositions.transpose() * dst;
 }
+#endif
 
 /** \internal use x = ldlt_object.solve(x);
-  *
-  * This is the \em in-place version of solve().
-  *
-  * \param bAndX represents both the right-hand side matrix b and result x.
-  *
-  * \returns true always! If you need to check for existence of solutions, use another decomposition like LU, QR, or SVD.
-  *
-  * This version avoids a copy when the right hand side matrix b is not
-  * needed anymore.
-  *
-  * \sa LDLT::solve(), MatrixBase::ldlt()
-  */
-template<typename MatrixType,int _UpLo>
-template<typename Derived>
-bool LDLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
-{
+ *
+ * This is the \em in-place version of solve().
+ *
+ * \param bAndX represents both the right-hand side matrix b and result x.
+ *
+ * \returns true always! If you need to check for existence of solutions, use another decomposition like LU, QR, or SVD.
+ *
+ * This version avoids a copy when the right hand side matrix b is not
+ * needed anymore.
+ *
+ * \sa LDLT::solve(), MatrixBase::ldlt()
+ */
+template <typename MatrixType, int UpLo_>
+template <typename Derived>
+bool LDLT<MatrixType, UpLo_>::solveInPlace(MatrixBase<Derived>& bAndX) const {
   eigen_assert(m_isInitialized && "LDLT is not initialized.");
   eigen_assert(m_matrix.rows() == bAndX.rows());
 
@@ -564,12 +604,11 @@ bool LDLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
 /** \returns the matrix represented by the decomposition,
  * i.e., it returns the product: P^T L D L^* P.
  * This function is provided for debug purpose. */
-template<typename MatrixType, int _UpLo>
-MatrixType LDLT<MatrixType,_UpLo>::reconstructedMatrix() const
-{
+template <typename MatrixType, int UpLo_>
+MatrixType LDLT<MatrixType, UpLo_>::reconstructedMatrix() const {
   eigen_assert(m_isInitialized && "LDLT is not initialized.");
   const Index size = m_matrix.rows();
-  MatrixType res(size,size);
+  MatrixType res(size, size);
 
   // P
   res.setIdentity();
@@ -587,25 +626,24 @@ MatrixType LDLT<MatrixType,_UpLo>::reconstructedMatrix() const
 }
 
 /** \cholesky_module
-  * \returns the Cholesky decomposition with full pivoting without square root of \c *this
-  */
-template<typename MatrixType, unsigned int UpLo>
+ * \returns the Cholesky decomposition with full pivoting without square root of \c *this
+ * \sa MatrixBase::ldlt()
+ */
+template <typename MatrixType, unsigned int UpLo>
 inline const LDLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
-SelfAdjointView<MatrixType, UpLo>::ldlt() const
-{
-  return LDLT<PlainObject,UpLo>(m_matrix);
+SelfAdjointView<MatrixType, UpLo>::ldlt() const {
+  return LDLT<PlainObject, UpLo>(m_matrix);
 }
 
 /** \cholesky_module
-  * \returns the Cholesky decomposition with full pivoting without square root of \c *this
-  */
-template<typename Derived>
-inline const LDLT<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::ldlt() const
-{
+ * \returns the Cholesky decomposition with full pivoting without square root of \c *this
+ * \sa SelfAdjointView::ldlt()
+ */
+template <typename Derived>
+inline const LDLT<typename MatrixBase<Derived>::PlainObject> MatrixBase<Derived>::ldlt() const {
   return LDLT<PlainObject>(derived());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_LDLT_H
+#endif  // EIGEN_LDLT_H
diff --git a/inst/include/Eigen/src/Cholesky/LLT.h b/inst/include/Eigen/src/Cholesky/LLT.h
index 7c11a2dc..7fa4fa2a 100644
--- a/inst/include/Eigen/src/Cholesky/LLT.h
+++ b/inst/include/Eigen/src/Cholesky/LLT.h
@@ -10,392 +10,411 @@
 #ifndef EIGEN_LLT_H
 #define EIGEN_LLT_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-namespace internal{
-template<typename MatrixType, int UpLo> struct LLT_Traits;
-}
+namespace Eigen {
 
-/** \ingroup Cholesky_Module
-  *
-  * \class LLT
-  *
-  * \brief Standard Cholesky decomposition (LL^T) of a matrix and associated features
-  *
-  * \param MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition
-  * \param UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper.
-  *             The other triangular part won't be read.
-  *
-  * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite
-  * matrix A such that A = LL^* = U^*U, where L is lower triangular.
-  *
-  * While the Cholesky decomposition is particularly useful to solve selfadjoint problems like  D^*D x = b,
-  * for that purpose, we recommend the Cholesky decomposition without square root which is more stable
-  * and even faster. Nevertheless, this standard Cholesky decomposition remains useful in many other
-  * situations like generalised eigen problems with hermitian matrices.
-  *
-  * Remember that Cholesky decompositions are not rank-revealing. This LLT decomposition is only stable on positive definite matrices,
-  * use LDLT instead for the semidefinite case. Also, do not use a Cholesky decomposition to determine whether a system of equations
-  * has a solution.
-  *
-  * Example: \include LLT_example.cpp
-  * Output: \verbinclude LLT_example.out
-  *    
-  * \sa MatrixBase::llt(), class LDLT
-  */
- /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH)
-  * Note that during the decomposition, only the upper triangular part of A is considered. Therefore,
-  * the strict lower part does not have to store correct values.
-  */
-template<typename _MatrixType, int _UpLo> class LLT
-{
-  public:
-    typedef _MatrixType MatrixType;
-    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
-      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
-    };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
-
-    enum {
-      PacketSize = internal::packet_traits<Scalar>::size,
-      AlignmentMask = int(PacketSize)-1,
-      UpLo = _UpLo
-    };
-
-    typedef internal::LLT_Traits<MatrixType,UpLo> Traits;
-
-    /**
-      * \brief Default Constructor.
-      *
-      * The default constructor is useful in cases in which the user intends to
-      * perform decompositions via LLT::compute(const MatrixType&).
-      */
-    LLT() : m_matrix(), m_isInitialized(false) {}
-
-    /** \brief Default Constructor with memory preallocation
-      *
-      * Like the default constructor but with preallocation of the internal data
-      * according to the specified problem \a size.
-      * \sa LLT()
-      */
-    LLT(Index size) : m_matrix(size, size),
-                    m_isInitialized(false) {}
-
-    LLT(const MatrixType& matrix)
-      : m_matrix(matrix.rows(), matrix.cols()),
-        m_isInitialized(false)
-    {
-      compute(matrix);
-    }
+namespace internal {
 
-    /** \returns a view of the upper triangular matrix U */
-    inline typename Traits::MatrixU matrixU() const
-    {
-      eigen_assert(m_isInitialized && "LLT is not initialized.");
-      return Traits::getU(m_matrix);
-    }
+template <typename MatrixType_, int UpLo_>
+struct traits<LLT<MatrixType_, UpLo_> > : traits<MatrixType_> {
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
+  enum { Flags = 0 };
+};
 
-    /** \returns a view of the lower triangular matrix L */
-    inline typename Traits::MatrixL matrixL() const
-    {
-      eigen_assert(m_isInitialized && "LLT is not initialized.");
-      return Traits::getL(m_matrix);
-    }
+template <typename MatrixType, int UpLo>
+struct LLT_Traits;
+}  // namespace internal
 
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * Since this LLT class assumes anyway that the matrix A is invertible, the solution
-      * theoretically exists and is unique regardless of b.
-      *
-      * Example: \include LLT_solve.cpp
-      * Output: \verbinclude LLT_solve.out
-      *
-      * \sa solveInPlace(), MatrixBase::llt()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<LLT, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LLT is not initialized.");
-      eigen_assert(m_matrix.rows()==b.rows()
-                && "LLT::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<LLT, Rhs>(*this, b.derived());
-    }
+/** \ingroup Cholesky_Module
+ *
+ * \class LLT
+ *
+ * \brief Standard Cholesky decomposition (LL^T) of a matrix and associated features
+ *
+ * \tparam MatrixType_ the type of the matrix of which we are computing the LL^T Cholesky decomposition
+ * \tparam UpLo_ the triangular part that will be used for the decomposition: Lower (default) or Upper.
+ *               The other triangular part won't be read.
+ *
+ * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite
+ * matrix A such that A = LL^* = U^*U, where L is lower triangular.
+ *
+ * While the Cholesky decomposition is particularly useful to solve selfadjoint problems like  D^*D x = b,
+ * for that purpose, we recommend the Cholesky decomposition without square root which is more stable
+ * and even faster. Nevertheless, this standard Cholesky decomposition remains useful in many other
+ * situations like generalised eigen problems with hermitian matrices.
+ *
+ * Remember that Cholesky decompositions are not rank-revealing. This LLT decomposition is only stable on positive
+ * definite matrices, use LDLT instead for the semidefinite case. Also, do not use a Cholesky decomposition to determine
+ * whether a system of equations has a solution.
+ *
+ * Example: \include LLT_example.cpp
+ * Output: \verbinclude LLT_example.out
+ *
+ * \b Performance: for best performance, it is recommended to use a column-major storage format
+ * with the Lower triangular part (the default), or, equivalently, a row-major storage format
+ * with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization
+ * step, and rank-updates can be up to 3 times slower.
+ *
+ * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+ *
+ * Note that during the decomposition, only the lower (or upper, as defined by UpLo_) triangular part of A is
+ * considered. Therefore, the strict lower part does not have to store correct values.
+ *
+ * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT
+ */
+template <typename MatrixType_, int UpLo_>
+class LLT : public SolverBase<LLT<MatrixType_, UpLo_> > {
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef SolverBase<LLT> Base;
+  friend class SolverBase<LLT>;
+
+  EIGEN_GENERIC_PUBLIC_INTERFACE(LLT)
+  enum { MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime };
+
+  enum { PacketSize = internal::packet_traits<Scalar>::size, AlignmentMask = int(PacketSize) - 1, UpLo = UpLo_ };
+
+  typedef internal::LLT_Traits<MatrixType, UpLo> Traits;
+
+  /**
+   * \brief Default Constructor.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via LLT::compute(const MatrixType&).
+   */
+  LLT() : m_matrix(), m_isInitialized(false) {}
+
+  /** \brief Default Constructor with memory preallocation
+   *
+   * Like the default constructor but with preallocation of the internal data
+   * according to the specified problem \a size.
+   * \sa LLT()
+   */
+  explicit LLT(Index size) : m_matrix(size, size), m_isInitialized(false) {}
+
+  template <typename InputType>
+  explicit LLT(const EigenBase<InputType>& matrix) : m_matrix(matrix.rows(), matrix.cols()), m_isInitialized(false) {
+    compute(matrix.derived());
+  }
 
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived, typename ResultType>
-    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
-    {
-      *result = this->solve(b);
-      return true;
-    }
-    
-    bool isPositiveDefinite() const { return true; }
-    #endif
-
-    template<typename Derived>
-    void solveInPlace(MatrixBase<Derived> &bAndX) const;
-
-    LLT& compute(const MatrixType& matrix);
-
-    /** \returns the LLT decomposition matrix
-      *
-      * TODO: document the storage layout
-      */
-    inline const MatrixType& matrixLLT() const
-    {
-      eigen_assert(m_isInitialized && "LLT is not initialized.");
-      return m_matrix;
-    }
+  /** \brief Constructs a LLT factorization from a given matrix
+   *
+   * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
+   * \c MatrixType is a Eigen::Ref.
+   *
+   * \sa LLT(const EigenBase&)
+   */
+  template <typename InputType>
+  explicit LLT(EigenBase<InputType>& matrix) : m_matrix(matrix.derived()), m_isInitialized(false) {
+    compute(matrix.derived());
+  }
 
-    MatrixType reconstructedMatrix() const;
+  /** \returns a view of the upper triangular matrix U */
+  inline typename Traits::MatrixU matrixU() const {
+    eigen_assert(m_isInitialized && "LLT is not initialized.");
+    return Traits::getU(m_matrix);
+  }
 
+  /** \returns a view of the lower triangular matrix L */
+  inline typename Traits::MatrixL matrixL() const {
+    eigen_assert(m_isInitialized && "LLT is not initialized.");
+    return Traits::getL(m_matrix);
+  }
 
-    /** \brief Reports whether previous computation was successful.
-      *
-      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the matrix.appears to be negative.
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "LLT is not initialized.");
-      return m_info;
-    }
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
+   *
+   * Since this LLT class assumes anyway that the matrix A is invertible, the solution
+   * theoretically exists and is unique regardless of b.
+   *
+   * Example: \include LLT_solve.cpp
+   * Output: \verbinclude LLT_solve.out
+   *
+   * \sa solveInPlace(), MatrixBase::llt(), SelfAdjointView::llt()
+   */
+  template <typename Rhs>
+  inline const Solve<LLT, Rhs> solve(const MatrixBase<Rhs>& b) const;
+#endif
+
+  template <typename Derived>
+  void solveInPlace(const MatrixBase<Derived>& bAndX) const;
+
+  template <typename InputType>
+  LLT& compute(const EigenBase<InputType>& matrix);
+
+  /** \returns an estimate of the reciprocal condition number of the matrix of
+   *  which \c *this is the Cholesky decomposition.
+   */
+  RealScalar rcond() const {
+    eigen_assert(m_isInitialized && "LLT is not initialized.");
+    eigen_assert(m_info == Success && "LLT failed because matrix appears to be negative");
+    return internal::rcond_estimate_helper(m_l1_norm, *this);
+  }
 
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
+  /** \returns the LLT decomposition matrix
+   *
+   * TODO: document the storage layout
+   */
+  inline const MatrixType& matrixLLT() const {
+    eigen_assert(m_isInitialized && "LLT is not initialized.");
+    return m_matrix;
+  }
 
-    template<typename VectorType>
-    LLT rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
+  MatrixType reconstructedMatrix() const;
 
-  protected:
-    
-    static void check_template_parameters()
-    {
-      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
-    }
-    
-    /** \internal
-      * Used to compute and store L
-      * The strict upper part is not used and even not initialized.
-      */
-    MatrixType m_matrix;
-    bool m_isInitialized;
-    ComputationInfo m_info;
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful,
+   *          \c NumericalIssue if the matrix.appears not to be positive definite.
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "LLT is not initialized.");
+    return m_info;
+  }
+
+  /** \returns the adjoint of \c *this, that is, a const reference to the decomposition itself as the underlying matrix
+   * is self-adjoint.
+   *
+   * This method is provided for compatibility with other matrix decompositions, thus enabling generic code such as:
+   * \code x = decomposition.adjoint().solve(b) \endcode
+   */
+  const LLT& adjoint() const noexcept { return *this; }
+
+  constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  constexpr Index cols() const noexcept { return m_matrix.cols(); }
+
+  template <typename VectorType>
+  LLT& rankUpdate(const VectorType& vec, const RealScalar& sigma = 1);
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  template <typename RhsType, typename DstType>
+  void _solve_impl(const RhsType& rhs, DstType& dst) const;
+
+  template <bool Conjugate, typename RhsType, typename DstType>
+  void _solve_impl_transposed(const RhsType& rhs, DstType& dst) const;
+#endif
+
+ protected:
+  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+
+  /** \internal
+   * Used to compute and store L
+   * The strict upper part is not used and even not initialized.
+   */
+  MatrixType m_matrix;
+  RealScalar m_l1_norm;
+  bool m_isInitialized;
+  ComputationInfo m_info;
 };
 
 namespace internal {
 
-template<typename Scalar, int UpLo> struct llt_inplace;
+template <typename Scalar, int UpLo>
+struct llt_inplace;
 
-template<typename MatrixType, typename VectorType>
-static typename MatrixType::Index llt_rank_update_lower(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma)
-{
+template <typename MatrixType, typename VectorType>
+static Index llt_rank_update_lower(MatrixType& mat, const VectorType& vec,
+                                   const typename MatrixType::RealScalar& sigma) {
   using std::sqrt;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::ColXpr ColXpr;
-  typedef typename internal::remove_all<ColXpr>::type ColXprCleaned;
+  typedef internal::remove_all_t<ColXpr> ColXprCleaned;
   typedef typename ColXprCleaned::SegmentReturnType ColXprSegment;
-  typedef Matrix<Scalar,Dynamic,1> TempVectorType;
+  typedef Matrix<Scalar, Dynamic, 1> TempVectorType;
   typedef typename TempVectorType::SegmentReturnType TempVecSegment;
 
   Index n = mat.cols();
-  eigen_assert(mat.rows()==n && vec.size()==n);
+  eigen_assert(mat.rows() == n && vec.size() == n);
 
   TempVectorType temp;
 
-  if(sigma>0)
-  {
+  if (sigma > 0) {
     // This version is based on Givens rotations.
     // It is faster than the other one below, but only works for updates,
     // i.e., for sigma > 0
     temp = sqrt(sigma) * vec;
 
-    for(Index i=0; i<n; ++i)
-    {
+    for (Index i = 0; i < n; ++i) {
       JacobiRotation<Scalar> g;
-      g.makeGivens(mat(i,i), -temp(i), &mat(i,i));
+      g.makeGivens(mat(i, i), -temp(i), &mat(i, i));
 
-      Index rs = n-i-1;
-      if(rs>0)
-      {
+      Index rs = n - i - 1;
+      if (rs > 0) {
         ColXprSegment x(mat.col(i).tail(rs));
         TempVecSegment y(temp.tail(rs));
         apply_rotation_in_the_plane(x, y, g);
       }
     }
-  }
-  else
-  {
+  } else {
     temp = vec;
     RealScalar beta = 1;
-    for(Index j=0; j<n; ++j)
-    {
-      RealScalar Ljj = numext::real(mat.coeff(j,j));
+    for (Index j = 0; j < n; ++j) {
+      RealScalar Ljj = numext::real(mat.coeff(j, j));
       RealScalar dj = numext::abs2(Ljj);
       Scalar wj = temp.coeff(j);
-      RealScalar swj2 = sigma*numext::abs2(wj);
-      RealScalar gamma = dj*beta + swj2;
+      RealScalar swj2 = sigma * numext::abs2(wj);
+      RealScalar gamma = dj * beta + swj2;
 
-      RealScalar x = dj + swj2/beta;
-      if (x<=RealScalar(0))
-        return j;
+      RealScalar x = dj + swj2 / beta;
+      if (x <= RealScalar(0)) return j;
       RealScalar nLjj = sqrt(x);
-      mat.coeffRef(j,j) = nLjj;
-      beta += swj2/dj;
+      mat.coeffRef(j, j) = nLjj;
+      beta += swj2 / dj;
 
       // Update the terms of L
-      Index rs = n-j-1;
-      if(rs)
-      {
-        temp.tail(rs) -= (wj/Ljj) * mat.col(j).tail(rs);
-        if(gamma != 0)
-          mat.col(j).tail(rs) = (nLjj/Ljj) * mat.col(j).tail(rs) + (nLjj * sigma*numext::conj(wj)/gamma)*temp.tail(rs);
+      Index rs = n - j - 1;
+      if (rs) {
+        temp.tail(rs) -= (wj / Ljj) * mat.col(j).tail(rs);
+        if (!numext::is_exactly_zero(gamma))
+          mat.col(j).tail(rs) =
+              (nLjj / Ljj) * mat.col(j).tail(rs) + (nLjj * sigma * numext::conj(wj) / gamma) * temp.tail(rs);
       }
     }
   }
   return -1;
 }
 
-template<typename Scalar> struct llt_inplace<Scalar, Lower>
-{
+template <typename Scalar>
+struct llt_inplace<Scalar, Lower> {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename MatrixType>
-  static typename MatrixType::Index unblocked(MatrixType& mat)
-  {
+  template <typename MatrixType>
+  static Index unblocked(MatrixType& mat) {
     using std::sqrt;
-    typedef typename MatrixType::Index Index;
-    
-    eigen_assert(mat.rows()==mat.cols());
+
+    eigen_assert(mat.rows() == mat.cols());
     const Index size = mat.rows();
-    for(Index k = 0; k < size; ++k)
-    {
-      Index rs = size-k-1; // remaining size
-
-      Block<MatrixType,Dynamic,1> A21(mat,k+1,k,rs,1);
-      Block<MatrixType,1,Dynamic> A10(mat,k,0,1,k);
-      Block<MatrixType,Dynamic,Dynamic> A20(mat,k+1,0,rs,k);
-
-      RealScalar x = numext::real(mat.coeff(k,k));
-      if (k>0) x -= A10.squaredNorm();
-      if (x<=RealScalar(0))
-        return k;
-      mat.coeffRef(k,k) = x = sqrt(x);
-      if (k>0 && rs>0) A21.noalias() -= A20 * A10.adjoint();
-      if (rs>0) A21 /= x;
+    for (Index k = 0; k < size; ++k) {
+      Index rs = size - k - 1;  // remaining size
+
+      Block<MatrixType, Dynamic, 1> A21(mat, k + 1, k, rs, 1);
+      Block<MatrixType, 1, Dynamic> A10(mat, k, 0, 1, k);
+      Block<MatrixType, Dynamic, Dynamic> A20(mat, k + 1, 0, rs, k);
+
+      RealScalar x = numext::real(mat.coeff(k, k));
+      if (k > 0) x -= A10.squaredNorm();
+      if (x <= RealScalar(0)) return k;
+      mat.coeffRef(k, k) = x = sqrt(x);
+      if (k > 0 && rs > 0) A21.noalias() -= A20 * A10.adjoint();
+      if (rs > 0) A21 /= x;
     }
     return -1;
   }
 
-  template<typename MatrixType>
-  static typename MatrixType::Index blocked(MatrixType& m)
-  {
-    typedef typename MatrixType::Index Index;
-    eigen_assert(m.rows()==m.cols());
+  template <typename MatrixType>
+  static Index blocked(MatrixType& m) {
+    eigen_assert(m.rows() == m.cols());
     Index size = m.rows();
-    if(size<32)
-      return unblocked(m);
+    if (size < 32) return unblocked(m);
 
-    Index blockSize = size/8;
-    blockSize = (blockSize/16)*16;
-    blockSize = (std::min)((std::max)(blockSize,Index(8)), Index(128));
+    Index blockSize = size / 8;
+    blockSize = (blockSize / 16) * 16;
+    blockSize = (std::min)((std::max)(blockSize, Index(8)), Index(128));
 
-    for (Index k=0; k<size; k+=blockSize)
-    {
+    for (Index k = 0; k < size; k += blockSize) {
       // partition the matrix:
       //       A00 |  -  |  -
       // lu  = A10 | A11 |  -
       //       A20 | A21 | A22
-      Index bs = (std::min)(blockSize, size-k);
+      Index bs = (std::min)(blockSize, size - k);
       Index rs = size - k - bs;
-      Block<MatrixType,Dynamic,Dynamic> A11(m,k,   k,   bs,bs);
-      Block<MatrixType,Dynamic,Dynamic> A21(m,k+bs,k,   rs,bs);
-      Block<MatrixType,Dynamic,Dynamic> A22(m,k+bs,k+bs,rs,rs);
+      Block<MatrixType, Dynamic, Dynamic> A11(m, k, k, bs, bs);
+      Block<MatrixType, Dynamic, Dynamic> A21(m, k + bs, k, rs, bs);
+      Block<MatrixType, Dynamic, Dynamic> A22(m, k + bs, k + bs, rs, rs);
 
       Index ret;
-      if((ret=unblocked(A11))>=0) return k+ret;
-      if(rs>0) A11.adjoint().template triangularView<Upper>().template solveInPlace<OnTheRight>(A21);
-      if(rs>0) A22.template selfadjointView<Lower>().rankUpdate(A21,-1); // bottleneck
+      if ((ret = unblocked(A11)) >= 0) return k + ret;
+      if (rs > 0) A11.adjoint().template triangularView<Upper>().template solveInPlace<OnTheRight>(A21);
+      if (rs > 0)
+        A22.template selfadjointView<Lower>().rankUpdate(A21,
+                                                         typename NumTraits<RealScalar>::Literal(-1));  // bottleneck
     }
     return -1;
   }
 
-  template<typename MatrixType, typename VectorType>
-  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
-  {
+  template <typename MatrixType, typename VectorType>
+  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma) {
     return Eigen::internal::llt_rank_update_lower(mat, vec, sigma);
   }
 };
-  
-template<typename Scalar> struct llt_inplace<Scalar, Upper>
-{
+
+template <typename Scalar>
+struct llt_inplace<Scalar, Upper> {
   typedef typename NumTraits<Scalar>::Real RealScalar;
 
-  template<typename MatrixType>
-  static EIGEN_STRONG_INLINE typename MatrixType::Index unblocked(MatrixType& mat)
-  {
+  template <typename MatrixType>
+  static EIGEN_STRONG_INLINE Index unblocked(MatrixType& mat) {
     Transpose<MatrixType> matt(mat);
     return llt_inplace<Scalar, Lower>::unblocked(matt);
   }
-  template<typename MatrixType>
-  static EIGEN_STRONG_INLINE typename MatrixType::Index blocked(MatrixType& mat)
-  {
+  template <typename MatrixType>
+  static EIGEN_STRONG_INLINE Index blocked(MatrixType& mat) {
     Transpose<MatrixType> matt(mat);
     return llt_inplace<Scalar, Lower>::blocked(matt);
   }
-  template<typename MatrixType, typename VectorType>
-  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma)
-  {
+  template <typename MatrixType, typename VectorType>
+  static Index rankUpdate(MatrixType& mat, const VectorType& vec, const RealScalar& sigma) {
     Transpose<MatrixType> matt(mat);
     return llt_inplace<Scalar, Lower>::rankUpdate(matt, vec.conjugate(), sigma);
   }
 };
 
-template<typename MatrixType> struct LLT_Traits<MatrixType,Lower>
-{
+template <typename MatrixType>
+struct LLT_Traits<MatrixType, Lower> {
   typedef const TriangularView<const MatrixType, Lower> MatrixL;
   typedef const TriangularView<const typename MatrixType::AdjointReturnType, Upper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m; }
-  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
-  static bool inplace_decomposition(MatrixType& m)
-  { return llt_inplace<typename MatrixType::Scalar, Lower>::blocked(m)==-1; }
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m.adjoint()); }
+  static bool inplace_decomposition(MatrixType& m) {
+    return llt_inplace<typename MatrixType::Scalar, Lower>::blocked(m) == -1;
+  }
 };
 
-template<typename MatrixType> struct LLT_Traits<MatrixType,Upper>
-{
+template <typename MatrixType>
+struct LLT_Traits<MatrixType, Upper> {
   typedef const TriangularView<const typename MatrixType::AdjointReturnType, Lower> MatrixL;
   typedef const TriangularView<const MatrixType, Upper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m.adjoint(); }
-  static inline MatrixU getU(const MatrixType& m) { return m; }
-  static bool inplace_decomposition(MatrixType& m)
-  { return llt_inplace<typename MatrixType::Scalar, Upper>::blocked(m)==-1; }
+  static inline MatrixL getL(const MatrixType& m) { return MatrixL(m.adjoint()); }
+  static inline MatrixU getU(const MatrixType& m) { return MatrixU(m); }
+  static bool inplace_decomposition(MatrixType& m) {
+    return llt_inplace<typename MatrixType::Scalar, Upper>::blocked(m) == -1;
+  }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
 /** Computes / recomputes the Cholesky decomposition A = LL^* = U^*U of \a matrix
-  *
-  * \returns a reference to *this
-  *
-  * Example: \include TutorialLinAlgComputeTwice.cpp
-  * Output: \verbinclude TutorialLinAlgComputeTwice.out
-  */
-template<typename MatrixType, int _UpLo>
-LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const MatrixType& a)
-{
-  check_template_parameters();
-  
-  eigen_assert(a.rows()==a.cols());
+ *
+ * \returns a reference to *this
+ *
+ * Example: \include TutorialLinAlgComputeTwice.cpp
+ * Output: \verbinclude TutorialLinAlgComputeTwice.out
+ */
+template <typename MatrixType, int UpLo_>
+template <typename InputType>
+LLT<MatrixType, UpLo_>& LLT<MatrixType, UpLo_>::compute(const EigenBase<InputType>& a) {
+  eigen_assert(a.rows() == a.cols());
   const Index size = a.rows();
   m_matrix.resize(size, size);
-  m_matrix = a;
+  if (!internal::is_same_dense(m_matrix, a.derived())) m_matrix = a.derived();
+
+  // Compute matrix L1 norm = max abs column sum.
+  m_l1_norm = RealScalar(0);
+  // TODO move this code to SelfAdjointView
+  for (Index col = 0; col < size; ++col) {
+    RealScalar abs_col_sum;
+    if (UpLo_ == Lower)
+      abs_col_sum =
+          m_matrix.col(col).tail(size - col).template lpNorm<1>() + m_matrix.row(col).head(col).template lpNorm<1>();
+    else
+      abs_col_sum =
+          m_matrix.col(col).head(col).template lpNorm<1>() + m_matrix.row(col).tail(size - col).template lpNorm<1>();
+    if (abs_col_sum > m_l1_norm) m_l1_norm = abs_col_sum;
+  }
 
   m_isInitialized = true;
   bool ok = Traits::inplace_decomposition(m_matrix);
@@ -405,60 +424,59 @@ LLT<MatrixType,_UpLo>& LLT<MatrixType,_UpLo>::compute(const MatrixType& a)
 }
 
 /** Performs a rank one update (or dowdate) of the current decomposition.
-  * If A = LL^* before the rank one update,
-  * then after it we have LL^* = A + sigma * v v^* where \a v must be a vector
-  * of same dimension.
-  */
-template<typename _MatrixType, int _UpLo>
-template<typename VectorType>
-LLT<_MatrixType,_UpLo> LLT<_MatrixType,_UpLo>::rankUpdate(const VectorType& v, const RealScalar& sigma)
-{
+ * If A = LL^* before the rank one update,
+ * then after it we have LL^* = A + sigma * v v^* where \a v must be a vector
+ * of same dimension.
+ */
+template <typename MatrixType_, int UpLo_>
+template <typename VectorType>
+LLT<MatrixType_, UpLo_>& LLT<MatrixType_, UpLo_>::rankUpdate(const VectorType& v, const RealScalar& sigma) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType);
-  eigen_assert(v.size()==m_matrix.cols());
+  eigen_assert(v.size() == m_matrix.cols());
   eigen_assert(m_isInitialized);
-  if(internal::llt_inplace<typename MatrixType::Scalar, UpLo>::rankUpdate(m_matrix,v,sigma)>=0)
+  if (internal::llt_inplace<typename MatrixType::Scalar, UpLo>::rankUpdate(m_matrix, v, sigma) >= 0)
     m_info = NumericalIssue;
   else
     m_info = Success;
 
   return *this;
 }
-    
-namespace internal {
-template<typename _MatrixType, int UpLo, typename Rhs>
-struct solve_retval<LLT<_MatrixType, UpLo>, Rhs>
-  : solve_retval_base<LLT<_MatrixType, UpLo>, Rhs>
-{
-  typedef LLT<_MatrixType,UpLo> LLTType;
-  EIGEN_MAKE_SOLVE_HELPERS(LLTType,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dst = rhs();
-    dec().solveInPlace(dst);
-  }
-};
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template <typename MatrixType_, int UpLo_>
+template <typename RhsType, typename DstType>
+void LLT<MatrixType_, UpLo_>::_solve_impl(const RhsType& rhs, DstType& dst) const {
+  _solve_impl_transposed<true>(rhs, dst);
+}
+
+template <typename MatrixType_, int UpLo_>
+template <bool Conjugate, typename RhsType, typename DstType>
+void LLT<MatrixType_, UpLo_>::_solve_impl_transposed(const RhsType& rhs, DstType& dst) const {
+  dst = rhs;
+
+  matrixL().template conjugateIf<!Conjugate>().solveInPlace(dst);
+  matrixU().template conjugateIf<!Conjugate>().solveInPlace(dst);
 }
+#endif
 
 /** \internal use x = llt_object.solve(x);
-  * 
-  * This is the \em in-place version of solve().
-  *
-  * \param bAndX represents both the right-hand side matrix b and result x.
-  *
-  * \returns true always! If you need to check for existence of solutions, use another decomposition like LU, QR, or SVD.
-  *
-  * This version avoids a copy when the right hand side matrix b is not
-  * needed anymore.
-  *
-  * \sa LLT::solve(), MatrixBase::llt()
-  */
-template<typename MatrixType, int _UpLo>
-template<typename Derived>
-void LLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
-{
+ *
+ * This is the \em in-place version of solve().
+ *
+ * \param bAndX represents both the right-hand side matrix b and result x.
+ *
+ * This version avoids a copy when the right hand side matrix b is not needed anymore.
+ *
+ * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
+ * This function will const_cast it, so constness isn't honored here.
+ *
+ * \sa LLT::solve(), MatrixBase::llt()
+ */
+template <typename MatrixType, int UpLo_>
+template <typename Derived>
+void LLT<MatrixType, UpLo_>::solveInPlace(const MatrixBase<Derived>& bAndX) const {
   eigen_assert(m_isInitialized && "LLT is not initialized.");
-  eigen_assert(m_matrix.rows()==bAndX.rows());
+  eigen_assert(m_matrix.rows() == bAndX.rows());
   matrixL().solveInPlace(bAndX);
   matrixU().solveInPlace(bAndX);
 }
@@ -466,33 +484,31 @@ void LLT<MatrixType,_UpLo>::solveInPlace(MatrixBase<Derived> &bAndX) const
 /** \returns the matrix represented by the decomposition,
  * i.e., it returns the product: L L^*.
  * This function is provided for debug purpose. */
-template<typename MatrixType, int _UpLo>
-MatrixType LLT<MatrixType,_UpLo>::reconstructedMatrix() const
-{
+template <typename MatrixType, int UpLo_>
+MatrixType LLT<MatrixType, UpLo_>::reconstructedMatrix() const {
   eigen_assert(m_isInitialized && "LLT is not initialized.");
   return matrixL() * matrixL().adjoint().toDenseMatrix();
 }
 
 /** \cholesky_module
-  * \returns the LLT decomposition of \c *this
-  */
-template<typename Derived>
-inline const LLT<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::llt() const
-{
+ * \returns the LLT decomposition of \c *this
+ * \sa SelfAdjointView::llt()
+ */
+template <typename Derived>
+inline const LLT<typename MatrixBase<Derived>::PlainObject> MatrixBase<Derived>::llt() const {
   return LLT<PlainObject>(derived());
 }
 
 /** \cholesky_module
-  * \returns the LLT decomposition of \c *this
-  */
-template<typename MatrixType, unsigned int UpLo>
-inline const LLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo>
-SelfAdjointView<MatrixType, UpLo>::llt() const
-{
-  return LLT<PlainObject,UpLo>(m_matrix);
+ * \returns the LLT decomposition of \c *this
+ * \sa SelfAdjointView::llt()
+ */
+template <typename MatrixType, unsigned int UpLo>
+inline const LLT<typename SelfAdjointView<MatrixType, UpLo>::PlainObject, UpLo> SelfAdjointView<MatrixType, UpLo>::llt()
+    const {
+  return LLT<PlainObject, UpLo>(m_matrix);
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_LLT_H
+#endif  // EIGEN_LLT_H
diff --git a/inst/include/Eigen/src/Cholesky/LLT_LAPACKE.h b/inst/include/Eigen/src/Cholesky/LLT_LAPACKE.h
new file mode 100644
index 00000000..cb55b156
--- /dev/null
+++ b/inst/include/Eigen/src/Cholesky/LLT_LAPACKE.h
@@ -0,0 +1,124 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to LAPACKe
+ *     LLt decomposition based on LAPACKE_?potrf function.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_LLT_LAPACKE_H
+#define EIGEN_LLT_LAPACKE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+namespace lapacke_helpers {
+// -------------------------------------------------------------------------------------------------------------------
+//        Dispatch for rank update handling upper and lower parts
+// -------------------------------------------------------------------------------------------------------------------
+
+template <UpLoType Mode>
+struct rank_update {};
+
+template <>
+struct rank_update<Lower> {
+  template <typename MatrixType, typename VectorType>
+  static Index run(MatrixType &mat, const VectorType &vec, const typename MatrixType::RealScalar &sigma) {
+    return Eigen::internal::llt_rank_update_lower(mat, vec, sigma);
+  }
+};
+
+template <>
+struct rank_update<Upper> {
+  template <typename MatrixType, typename VectorType>
+  static Index run(MatrixType &mat, const VectorType &vec, const typename MatrixType::RealScalar &sigma) {
+    Transpose<MatrixType> matt(mat);
+    return Eigen::internal::llt_rank_update_lower(matt, vec.conjugate(), sigma);
+  }
+};
+
+// -------------------------------------------------------------------------------------------------------------------
+//        Generic lapacke llt implementation that hands of to the dispatches
+// -------------------------------------------------------------------------------------------------------------------
+
+template <typename Scalar, UpLoType Mode>
+struct lapacke_llt {
+  EIGEN_STATIC_ASSERT(((Mode == Lower) || (Mode == Upper)), MODE_MUST_BE_UPPER_OR_LOWER)
+  template <typename MatrixType>
+  static Index blocked(MatrixType &m) {
+    eigen_assert(m.rows() == m.cols());
+    if (m.rows() == 0) {
+      return -1;
+    }
+    /* Set up parameters for ?potrf */
+    lapack_int size = to_lapack(m.rows());
+    lapack_int matrix_order = lapack_storage_of(m);
+    constexpr char uplo = Mode == Upper ? 'U' : 'L';
+    Scalar *a = &(m.coeffRef(0, 0));
+    lapack_int lda = to_lapack(m.outerStride());
+
+    lapack_int info = potrf(matrix_order, uplo, size, to_lapack(a), lda);
+    info = (info == 0) ? -1 : info > 0 ? info - 1 : size;
+    return info;
+  }
+
+  template <typename MatrixType, typename VectorType>
+  static Index rankUpdate(MatrixType &mat, const VectorType &vec, const typename MatrixType::RealScalar &sigma) {
+    return rank_update<Mode>::run(mat, vec, sigma);
+  }
+};
+}  // namespace lapacke_helpers
+// end namespace lapacke_helpers
+
+/*
+ * Here, we just put the generic implementation from lapacke_llt into a full specialization of the llt_inplace
+ * type. By being a full specialization, the versions defined here thus get precedence over the generic implementation
+ * in LLT.h for double, float and complex double, complex float types.
+ */
+
+#define EIGEN_LAPACKE_LLT(EIGTYPE)                                                             \
+  template <>                                                                                  \
+  struct llt_inplace<EIGTYPE, Lower> : public lapacke_helpers::lapacke_llt<EIGTYPE, Lower> {}; \
+  template <>                                                                                  \
+  struct llt_inplace<EIGTYPE, Upper> : public lapacke_helpers::lapacke_llt<EIGTYPE, Upper> {};
+
+EIGEN_LAPACKE_LLT(double)
+EIGEN_LAPACKE_LLT(float)
+EIGEN_LAPACKE_LLT(std::complex<double>)
+EIGEN_LAPACKE_LLT(std::complex<float>)
+
+#undef EIGEN_LAPACKE_LLT
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_LLT_LAPACKE_H
diff --git a/inst/include/Eigen/src/Cholesky/LLT_MKL.h b/inst/include/Eigen/src/Cholesky/LLT_MKL.h
deleted file mode 100644
index 66675d74..00000000
--- a/inst/include/Eigen/src/Cholesky/LLT_MKL.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- Copyright (c) 2011, Intel Corporation. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
- * Neither the name of Intel Corporation nor the names of its contributors may
-   be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
- *     LLt decomposition based on LAPACKE_?potrf function.
- ********************************************************************************
-*/
-
-#ifndef EIGEN_LLT_MKL_H
-#define EIGEN_LLT_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
-#include <iostream>
-
-namespace Eigen { 
-
-namespace internal {
-
-template<typename Scalar> struct mkl_llt;
-
-#define EIGEN_MKL_LLT(EIGTYPE, MKLTYPE, MKLPREFIX) \
-template<> struct mkl_llt<EIGTYPE> \
-{ \
-  template<typename MatrixType> \
-  static inline typename MatrixType::Index potrf(MatrixType& m, char uplo) \
-  { \
-    lapack_int matrix_order; \
-    lapack_int size, lda, info, StorageOrder; \
-    EIGTYPE* a; \
-    eigen_assert(m.rows()==m.cols()); \
-    /* Set up parameters for ?potrf */ \
-    size = m.rows(); \
-    StorageOrder = MatrixType::Flags&RowMajorBit?RowMajor:ColMajor; \
-    matrix_order = StorageOrder==RowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
-    a = &(m.coeffRef(0,0)); \
-    lda = m.outerStride(); \
-\
-    info = LAPACKE_##MKLPREFIX##potrf( matrix_order, uplo, size, (MKLTYPE*)a, lda ); \
-    info = (info==0) ? -1 : info>0 ? info-1 : size; \
-    return info; \
-  } \
-}; \
-template<> struct llt_inplace<EIGTYPE, Lower> \
-{ \
-  template<typename MatrixType> \
-  static typename MatrixType::Index blocked(MatrixType& m) \
-  { \
-    return mkl_llt<EIGTYPE>::potrf(m, 'L'); \
-  } \
-  template<typename MatrixType, typename VectorType> \
-  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
-  { return Eigen::internal::llt_rank_update_lower(mat, vec, sigma); } \
-}; \
-template<> struct llt_inplace<EIGTYPE, Upper> \
-{ \
-  template<typename MatrixType> \
-  static typename MatrixType::Index blocked(MatrixType& m) \
-  { \
-    return mkl_llt<EIGTYPE>::potrf(m, 'U'); \
-  } \
-  template<typename MatrixType, typename VectorType> \
-  static typename MatrixType::Index rankUpdate(MatrixType& mat, const VectorType& vec, const typename MatrixType::RealScalar& sigma) \
-  { \
-    Transpose<MatrixType> matt(mat); \
-    return llt_inplace<EIGTYPE, Lower>::rankUpdate(matt, vec.conjugate(), sigma); \
-  } \
-};
-
-EIGEN_MKL_LLT(double, double, d)
-EIGEN_MKL_LLT(float, float, s)
-EIGEN_MKL_LLT(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_LLT(scomplex, MKL_Complex8, c)
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_LLT_MKL_H
diff --git a/inst/include/Eigen/src/CholmodSupport/CholmodSupport.h b/inst/include/Eigen/src/CholmodSupport/CholmodSupport.h
index 99dbe171..d5c39a63 100644
--- a/inst/include/Eigen/src/CholmodSupport/CholmodSupport.h
+++ b/inst/include/Eigen/src/CholmodSupport/CholmodSupport.h
@@ -10,598 +10,725 @@
 #ifndef EIGEN_CHOLMODSUPPORT_H
 #define EIGEN_CHOLMODSUPPORT_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#ifndef R_MATRIX_CHOLMOD
+# define R_MATRIX_CHOLMOD(_NAME_) cholmod_ ## _NAME_
+#endif
+
+namespace Eigen {
 
 namespace internal {
 
-template<typename Scalar, typename CholmodType>
-void cholmod_configure_matrix(CholmodType& mat)
-{
-  if (internal::is_same<Scalar,float>::value)
-  {
-    mat.xtype = CHOLMOD_REAL;
-    mat.dtype = CHOLMOD_SINGLE;
-  }
-  else if (internal::is_same<Scalar,double>::value)
-  {
+template <typename Scalar>
+struct cholmod_configure_matrix;
+
+template <>
+struct cholmod_configure_matrix<double> {
+  template <typename CholmodType>
+  static void run(CholmodType& mat) {
     mat.xtype = CHOLMOD_REAL;
     mat.dtype = CHOLMOD_DOUBLE;
   }
-  else if (internal::is_same<Scalar,std::complex<float> >::value)
-  {
-    mat.xtype = CHOLMOD_COMPLEX;
-    mat.dtype = CHOLMOD_SINGLE;
-  }
-  else if (internal::is_same<Scalar,std::complex<double> >::value)
-  {
+};
+
+template <>
+struct cholmod_configure_matrix<std::complex<double> > {
+  template <typename CholmodType>
+  static void run(CholmodType& mat) {
     mat.xtype = CHOLMOD_COMPLEX;
     mat.dtype = CHOLMOD_DOUBLE;
   }
-  else
-  {
-    eigen_assert(false && "Scalar type not supported by CHOLMOD");
-  }
-}
+};
+
+// Other scalar types are not yet supported by Cholmod
+// template<> struct cholmod_configure_matrix<float> {
+//   template<typename CholmodType>
+//   static void run(CholmodType& mat) {
+//     mat.xtype = CHOLMOD_REAL;
+//     mat.dtype = CHOLMOD_SINGLE;
+//   }
+// };
+//
+// template<> struct cholmod_configure_matrix<std::complex<float> > {
+//   template<typename CholmodType>
+//   static void run(CholmodType& mat) {
+//     mat.xtype = CHOLMOD_COMPLEX;
+//     mat.dtype = CHOLMOD_SINGLE;
+//   }
+// };
 
-} // namespace internal
+}  // namespace internal
 
 /** Wraps the Eigen sparse matrix \a mat into a Cholmod sparse matrix object.
-  * Note that the data are shared.
-  */
-template<typename _Scalar, int _Options, typename _Index>
-cholmod_sparse viewAsCholmod(SparseMatrix<_Scalar,_Options,_Index>& mat)
-{
+ * Note that the data are shared.
+ */
+template <typename Scalar_, int Options_, typename StorageIndex_>
+cholmod_sparse viewAsCholmod(Ref<SparseMatrix<Scalar_, Options_, StorageIndex_> > mat) {
   cholmod_sparse res;
-  res.nzmax   = mat.nonZeros();
-  res.nrow    = mat.rows();;
-  res.ncol    = mat.cols();
-  res.p       = mat.outerIndexPtr();
-  res.i       = mat.innerIndexPtr();
-  res.x       = mat.valuePtr();
-  res.z       = 0;
-  res.sorted  = 1;
-  if(mat.isCompressed())
-  {
-    res.packed  = 1;
+  res.nzmax = mat.nonZeros();
+  res.nrow = mat.rows();
+  res.ncol = mat.cols();
+  res.p = mat.outerIndexPtr();
+  res.i = mat.innerIndexPtr();
+  res.x = mat.valuePtr();
+  res.z = 0;
+  res.sorted = 1;
+  if (mat.isCompressed()) {
+    res.packed = 1;
     res.nz = 0;
-  }
-  else
-  {
-    res.packed  = 0;
+  } else {
+    res.packed = 0;
     res.nz = mat.innerNonZeroPtr();
   }
 
-  res.dtype   = 0;
-  res.stype   = -1;
-  
-  if (internal::is_same<_Index,int>::value)
-  {
+  res.dtype = 0;
+  res.stype = -1;
+
+  if (internal::is_same<StorageIndex_, int>::value) {
     res.itype = CHOLMOD_INT;
-  }
-  else if (internal::is_same<_Index,SuiteSparse_long>::value)
-  {
-    res.itype = CHOLMOD_LONG;
-  }
-  else
-  {
+  // } else if (internal::is_same<StorageIndex_, SuiteSparse_long>::value) {
+  //   res.itype = CHOLMOD_LONG;
+  } else {
     eigen_assert(false && "Index type not supported yet");
   }
 
   // setup res.xtype
-  internal::cholmod_configure_matrix<_Scalar>(res);
-  
+  internal::cholmod_configure_matrix<Scalar_>::run(res);
+
   res.stype = 0;
-  
+
+  return res;
+}
+
+template <typename Scalar_, int Options_, typename Index_>
+const cholmod_sparse viewAsCholmod(const SparseMatrix<Scalar_, Options_, Index_>& mat) {
+  cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<Scalar_, Options_, Index_> >(mat.const_cast_derived()));
   return res;
 }
 
-template<typename _Scalar, int _Options, typename _Index>
-const cholmod_sparse viewAsCholmod(const SparseMatrix<_Scalar,_Options,_Index>& mat)
-{
-  cholmod_sparse res = viewAsCholmod(mat.const_cast_derived());
+template <typename Scalar_, int Options_, typename Index_>
+const cholmod_sparse viewAsCholmod(const SparseVector<Scalar_, Options_, Index_>& mat) {
+  cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<Scalar_, Options_, Index_> >(mat.const_cast_derived()));
   return res;
 }
 
 /** Returns a view of the Eigen sparse matrix \a mat as Cholmod sparse matrix.
-  * The data are not copied but shared. */
-template<typename _Scalar, int _Options, typename _Index, unsigned int UpLo>
-cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<SparseMatrix<_Scalar,_Options,_Index>, UpLo>& mat)
-{
-  cholmod_sparse res = viewAsCholmod(mat.matrix().const_cast_derived());
-  
-  if(UpLo==Upper) res.stype =  1;
-  if(UpLo==Lower) res.stype = -1;
+ * The data are not copied but shared. */
+template <typename Scalar_, int Options_, typename Index_, unsigned int UpLo>
+cholmod_sparse viewAsCholmod(const SparseSelfAdjointView<const SparseMatrix<Scalar_, Options_, Index_>, UpLo>& mat) {
+  cholmod_sparse res = viewAsCholmod(Ref<SparseMatrix<Scalar_, Options_, Index_> >(mat.matrix().const_cast_derived()));
+
+  if (UpLo == Upper) res.stype = 1;
+  if (UpLo == Lower) res.stype = -1;
+  // swap stype for rowmajor matrices (only works for real matrices)
+  EIGEN_STATIC_ASSERT((Options_ & RowMajorBit) == 0 || NumTraits<Scalar_>::IsComplex == 0,
+                      THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+  if (Options_ & RowMajorBit) res.stype *= -1;
 
   return res;
 }
 
 /** Returns a view of the Eigen \b dense matrix \a mat as Cholmod dense matrix.
-  * The data are not copied but shared. */
-template<typename Derived>
-cholmod_dense viewAsCholmod(MatrixBase<Derived>& mat)
-{
-  EIGEN_STATIC_ASSERT((internal::traits<Derived>::Flags&RowMajorBit)==0,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+ * The data are not copied but shared. */
+template <typename Derived>
+cholmod_dense viewAsCholmod(MatrixBase<Derived>& mat) {
+  EIGEN_STATIC_ASSERT((internal::traits<Derived>::Flags & RowMajorBit) == 0,
+                      THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
   typedef typename Derived::Scalar Scalar;
 
   cholmod_dense res;
-  res.nrow   = mat.rows();
-  res.ncol   = mat.cols();
-  res.nzmax  = res.nrow * res.ncol;
-  res.d      = Derived::IsVectorAtCompileTime ? mat.derived().size() : mat.derived().outerStride();
-  res.x      = (void*)(mat.derived().data());
-  res.z      = 0;
+  res.nrow = mat.rows();
+  res.ncol = mat.cols();
+  res.nzmax = res.nrow * res.ncol;
+  res.d = Derived::IsVectorAtCompileTime ? mat.derived().size() : mat.derived().outerStride();
+  res.x = (void*)(mat.derived().data());
+  res.z = 0;
 
-  internal::cholmod_configure_matrix<Scalar>(res);
+  internal::cholmod_configure_matrix<Scalar>::run(res);
 
   return res;
 }
 
 /** Returns a view of the Cholmod sparse matrix \a cm as an Eigen sparse matrix.
-  * The data are not copied but shared. */
-template<typename Scalar, int Flags, typename Index>
-MappedSparseMatrix<Scalar,Flags,Index> viewAsEigen(cholmod_sparse& cm)
-{
-  return MappedSparseMatrix<Scalar,Flags,Index>
-         (cm.nrow, cm.ncol, static_cast<Index*>(cm.p)[cm.ncol],
-          static_cast<Index*>(cm.p), static_cast<Index*>(cm.i),static_cast<Scalar*>(cm.x) );
+ * The data are not copied but shared. */
+template <typename Scalar, typename StorageIndex>
+Map<const SparseMatrix<Scalar, ColMajor, StorageIndex> > viewAsEigen(cholmod_sparse& cm) {
+  return Map<const SparseMatrix<Scalar, ColMajor, StorageIndex> >(
+      cm.nrow, cm.ncol, static_cast<StorageIndex*>(cm.p)[cm.ncol], static_cast<StorageIndex*>(cm.p),
+      static_cast<StorageIndex*>(cm.i), static_cast<Scalar*>(cm.x));
 }
 
-enum CholmodMode {
-  CholmodAuto, CholmodSimplicialLLt, CholmodSupernodalLLt, CholmodLDLt
-};
+/** Returns a view of the Cholmod sparse matrix factor \a cm as an Eigen sparse matrix.
+ * The data are not copied but shared. */
+template <typename Scalar, typename StorageIndex>
+Map<const SparseMatrix<Scalar, ColMajor, StorageIndex> > viewAsEigen(cholmod_factor& cm) {
+  return Map<const SparseMatrix<Scalar, ColMajor, StorageIndex> >(
+      cm.n, cm.n, static_cast<StorageIndex*>(cm.p)[cm.n], static_cast<StorageIndex*>(cm.p),
+      static_cast<StorageIndex*>(cm.i), static_cast<Scalar*>(cm.x));
+}
 
+namespace internal {
+
+// template specializations for int and long that call the correct cholmod method
+
+#define EIGEN_CHOLMOD_SPECIALIZE0(ret, name)                        \
+  template <typename StorageIndex_>                                 \
+  inline ret cm_##name(cholmod_common& Common) {                    \
+    return R_MATRIX_CHOLMOD(name)(&Common);			    \
+  }
+
+#define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1)                    \
+  template <typename StorageIndex_>                                     \
+  inline ret cm_##name(t1& a1, cholmod_common& Common) {                \
+    return R_MATRIX_CHOLMOD(name) (&a1, &Common);			\
+  }
+
+EIGEN_CHOLMOD_SPECIALIZE0(int, start)
+EIGEN_CHOLMOD_SPECIALIZE0(int, finish)
+
+EIGEN_CHOLMOD_SPECIALIZE1(int, free_factor, cholmod_factor*, L)
+EIGEN_CHOLMOD_SPECIALIZE1(int, free_dense, cholmod_dense*, X)
+EIGEN_CHOLMOD_SPECIALIZE1(int, free_sparse, cholmod_sparse*, A)
+
+EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A)
+EIGEN_CHOLMOD_SPECIALIZE1(cholmod_sparse*, factor_to_sparse, cholmod_factor, L)
+
+template <typename StorageIndex_>
+inline cholmod_dense* cm_solve(int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common& Common) {
+  return R_MATRIX_CHOLMOD(solve) (sys, &L, &B, &Common);
+}
+// template <>
+// inline cholmod_dense* cm_solve<SuiteSparse_long>(int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common& Common) {
+//   return cholmod_l_solve(sys, &L, &B, &Common);
+// }
+
+template <typename StorageIndex_>
+inline cholmod_sparse* cm_spsolve(int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common& Common) {
+  return R_MATRIX_CHOLMOD(spsolve) (sys, &L, &B, &Common);
+}
+// template <>
+// inline cholmod_sparse* cm_spsolve<SuiteSparse_long>(int sys, cholmod_factor& L, cholmod_sparse& B,
+//                                                     cholmod_common& Common) {
+//   return cholmod_l_spsolve(sys, &L, &B, &Common);
+// }
+
+template <typename StorageIndex_>
+inline int cm_factorize_p(cholmod_sparse* A, double beta[2], StorageIndex_* fset, std::size_t fsize, cholmod_factor* L,
+                          cholmod_common& Common) {
+  return R_MATRIX_CHOLMOD(factorize_p) (A, beta, fset, fsize, L, &Common);
+}
+// template <>
+// inline int cm_factorize_p<SuiteSparse_long>(cholmod_sparse* A, double beta[2], SuiteSparse_long* fset,
+//                                             std::size_t fsize, cholmod_factor* L, cholmod_common& Common) {
+//   return cholmod_l_factorize_p(A, beta, fset, fsize, L, &Common);
+// }
+
+#undef EIGEN_CHOLMOD_SPECIALIZE0
+#undef EIGEN_CHOLMOD_SPECIALIZE1
+
+}  // namespace internal
+
+enum CholmodMode { CholmodAuto, CholmodSimplicialLLt, CholmodSupernodalLLt, CholmodLDLt };
 
 /** \ingroup CholmodSupport_Module
-  * \class CholmodBase
-  * \brief The base class for the direct Cholesky factorization of Cholmod
-  * \sa class CholmodSupernodalLLT, class CholmodSimplicialLDLT, class CholmodSimplicialLLT
-  */
-template<typename _MatrixType, int _UpLo, typename Derived>
-class CholmodBase : internal::noncopyable
-{
-  public:
-    typedef _MatrixType MatrixType;
-    enum { UpLo = _UpLo };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef MatrixType CholMatrixType;
-    typedef typename MatrixType::Index Index;
-
-  public:
-
-    CholmodBase()
-      : m_cholmodFactor(0), m_info(Success), m_isInitialized(false)
-    {
-      m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
-      cholmod_start(&m_cholmod);
-    }
+ * \class CholmodBase
+ * \brief The base class for the direct Cholesky factorization of Cholmod
+ * \sa class CholmodSupernodalLLT, class CholmodSimplicialLDLT, class CholmodSimplicialLLT
+ */
+template <typename MatrixType_, int UpLo_, typename Derived>
+class CholmodBase : public SparseSolverBase<Derived> {
+ protected:
+  typedef SparseSolverBase<Derived> Base;
+  using Base::derived;
+  using Base::m_isInitialized;
+
+ public:
+  typedef MatrixType_ MatrixType;
+  enum { UpLo = UpLo_ };
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef MatrixType CholMatrixType;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  enum { ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime };
+
+ public:
+  CholmodBase() : m_cholmodFactor(0), m_info(Success), m_factorizationIsOk(false), m_analysisIsOk(false) {
+    EIGEN_STATIC_ASSERT((internal::is_same<double, RealScalar>::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY);
+    m_shiftOffset[0] = m_shiftOffset[1] = 0.0;
+    internal::cm_start<StorageIndex>(m_cholmod);
+  }
 
-    CholmodBase(const MatrixType& matrix)
-      : m_cholmodFactor(0), m_info(Success), m_isInitialized(false)
-    {
-      m_shiftOffset[0] = m_shiftOffset[1] = RealScalar(0.0);
-      cholmod_start(&m_cholmod);
-      compute(matrix);
-    }
+  explicit CholmodBase(const MatrixType& matrix)
+      : m_cholmodFactor(0), m_info(Success), m_factorizationIsOk(false), m_analysisIsOk(false) {
+    EIGEN_STATIC_ASSERT((internal::is_same<double, RealScalar>::value), CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY);
+    m_shiftOffset[0] = m_shiftOffset[1] = 0.0;
+    internal::cm_start<StorageIndex>(m_cholmod);
+    compute(matrix);
+  }
 
-    ~CholmodBase()
-    {
-      if(m_cholmodFactor)
-        cholmod_free_factor(&m_cholmodFactor, &m_cholmod);
-      cholmod_finish(&m_cholmod);
-    }
-    
-    inline Index cols() const { return m_cholmodFactor->n; }
-    inline Index rows() const { return m_cholmodFactor->n; }
-    
-    Derived& derived() { return *static_cast<Derived*>(this); }
-    const Derived& derived() const { return *static_cast<const Derived*>(this); }
-    
-    /** \brief Reports whether previous computation was successful.
-      *
-      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the matrix.appears to be negative.
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
-      return m_info;
-    }
+  ~CholmodBase() {
+    if (m_cholmodFactor) internal::cm_free_factor<StorageIndex>(m_cholmodFactor, m_cholmod);
+    internal::cm_finish<StorageIndex>(m_cholmod);
+  }
 
-    /** Computes the sparse Cholesky decomposition of \a matrix */
-    Derived& compute(const MatrixType& matrix)
-    {
-      analyzePattern(matrix);
-      factorize(matrix);
-      return derived();
-    }
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<CholmodBase, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LLT is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "CholmodDecomposition::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<CholmodBase, Rhs>(*this, b.derived());
-    }
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<CholmodBase, Rhs>
-    solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LLT is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "CholmodDecomposition::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<CholmodBase, Rhs>(*this, b.derived());
-    }
-    
-    /** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
-      *
-      * This function is particularly useful when solving for several problems having the same structure.
-      * 
-      * \sa factorize()
-      */
-    void analyzePattern(const MatrixType& matrix)
-    {
-      if(m_cholmodFactor)
-      {
-        cholmod_free_factor(&m_cholmodFactor, &m_cholmod);
-        m_cholmodFactor = 0;
-      }
-      cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
-      m_cholmodFactor = cholmod_analyze(&A, &m_cholmod);
-      
-      this->m_isInitialized = true;
-      this->m_info = Success;
-      m_analysisIsOk = true;
-      m_factorizationIsOk = false;
+  inline StorageIndex cols() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
+  inline StorageIndex rows() const { return internal::convert_index<StorageIndex, Index>(m_cholmodFactor->n); }
+
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful,
+   *          \c NumericalIssue if the matrix.appears to be negative.
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+    return m_info;
+  }
+
+  /** Computes the sparse Cholesky decomposition of \a matrix */
+  Derived& compute(const MatrixType& matrix) {
+    analyzePattern(matrix);
+    factorize(matrix);
+    return derived();
+  }
+
+  /** Performs a symbolic decomposition on the sparsity pattern of \a matrix.
+   *
+   * This function is particularly useful when solving for several problems having the same structure.
+   *
+   * \sa factorize()
+   */
+  void analyzePattern(const MatrixType& matrix) {
+    if (m_cholmodFactor) {
+      internal::cm_free_factor<StorageIndex>(m_cholmodFactor, m_cholmod);
+      m_cholmodFactor = 0;
     }
-    
-    /** Performs a numeric decomposition of \a matrix
-      *
-      * The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been performed.
-      *
-      * \sa analyzePattern()
-      */
-    void factorize(const MatrixType& matrix)
-    {
-      eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
-      cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
-      cholmod_factorize_p(&A, m_shiftOffset, 0, 0, m_cholmodFactor, &m_cholmod);
-      
-      // If the factorization failed, minor is the column at which it did. On success minor == n.
-      this->m_info = (m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue);
-      m_factorizationIsOk = true;
+    cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
+    m_cholmodFactor = internal::cm_analyze<StorageIndex>(A, m_cholmod);
+
+    this->m_isInitialized = true;
+    this->m_info = Success;
+    m_analysisIsOk = true;
+    m_factorizationIsOk = false;
+  }
+
+  /** Performs a numeric decomposition of \a matrix
+   *
+   * The given matrix must have the same sparsity pattern as the matrix on which the symbolic decomposition has been
+   * performed.
+   *
+   * \sa analyzePattern()
+   */
+  void factorize(const MatrixType& matrix) {
+    eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
+    cholmod_sparse A = viewAsCholmod(matrix.template selfadjointView<UpLo>());
+    internal::cm_factorize_p<StorageIndex>(&A, m_shiftOffset, 0, 0, m_cholmodFactor, m_cholmod);
+
+    // If the factorization failed, either the input matrix was zero (so m_cholmodFactor == nullptr), or minor is the
+    // column at which it failed. On success minor == n.
+    this->m_info =
+        (m_cholmodFactor != nullptr && m_cholmodFactor->minor == m_cholmodFactor->n ? Success : NumericalIssue);
+    m_factorizationIsOk = true;
+  }
+
+  /** Returns a reference to the Cholmod's configuration structure to get a full control over the performed operations.
+   *  See the Cholmod user guide for details. */
+  cholmod_common& cholmod() { return m_cholmod; }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  /** \internal */
+  template <typename Rhs, typename Dest>
+  void _solve_impl(const MatrixBase<Rhs>& b, MatrixBase<Dest>& dest) const {
+    eigen_assert(m_factorizationIsOk &&
+                 "The decomposition is not in a valid state for solving, you must first call either compute() or "
+                 "symbolic()/numeric()");
+    const Index size = m_cholmodFactor->n;
+    EIGEN_UNUSED_VARIABLE(size);
+    eigen_assert(size == b.rows());
+
+    // Cholmod needs column-major storage without inner-stride, which corresponds to the default behavior of Ref.
+    Ref<const Matrix<typename Rhs::Scalar, Dynamic, Dynamic, ColMajor> > b_ref(b.derived());
+
+    cholmod_dense b_cd = viewAsCholmod(b_ref);
+    cholmod_dense* x_cd = internal::cm_solve<StorageIndex>(CHOLMOD_A, *m_cholmodFactor, b_cd, m_cholmod);
+    if (!x_cd) {
+      this->m_info = NumericalIssue;
+      return;
     }
-    
-    /** Returns a reference to the Cholmod's configuration structure to get a full control over the performed operations.
-     *  See the Cholmod user guide for details. */
-    cholmod_common& cholmod() { return m_cholmod; }
-    
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** \internal */
-    template<typename Rhs,typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
-    {
-      eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
-      const Index size = m_cholmodFactor->n;
-      EIGEN_UNUSED_VARIABLE(size);
-      eigen_assert(size==b.rows());
-
-      // note: cd stands for Cholmod Dense
-      Rhs& b_ref(b.const_cast_derived());
-      cholmod_dense b_cd = viewAsCholmod(b_ref);
-      cholmod_dense* x_cd = cholmod_solve(CHOLMOD_A, m_cholmodFactor, &b_cd, &m_cholmod);
-      if(!x_cd)
-      {
-        this->m_info = NumericalIssue;
-      }
-      // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
-      dest = Matrix<Scalar,Dest::RowsAtCompileTime,Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),b.rows(),b.cols());
-      cholmod_free_dense(&x_cd, &m_cholmod);
+    // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
+    // NOTE Actually, the copy can be avoided by calling cholmod_solve2 instead of cholmod_solve
+    dest = Matrix<Scalar, Dest::RowsAtCompileTime, Dest::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x),
+                                                                                 b.rows(), b.cols());
+    internal::cm_free_dense<StorageIndex>(x_cd, m_cholmod);
+  }
+
+  /** \internal */
+  template <typename RhsDerived, typename DestDerived>
+  void _solve_impl(const SparseMatrixBase<RhsDerived>& b, SparseMatrixBase<DestDerived>& dest) const {
+    eigen_assert(m_factorizationIsOk &&
+                 "The decomposition is not in a valid state for solving, you must first call either compute() or "
+                 "symbolic()/numeric()");
+    const Index size = m_cholmodFactor->n;
+    EIGEN_UNUSED_VARIABLE(size);
+    eigen_assert(size == b.rows());
+
+    // note: cs stands for Cholmod Sparse
+    Ref<SparseMatrix<typename RhsDerived::Scalar, ColMajor, typename RhsDerived::StorageIndex> > b_ref(
+        b.const_cast_derived());
+    cholmod_sparse b_cs = viewAsCholmod(b_ref);
+    cholmod_sparse* x_cs = internal::cm_spsolve<StorageIndex>(CHOLMOD_A, *m_cholmodFactor, b_cs, m_cholmod);
+    if (!x_cs) {
+      this->m_info = NumericalIssue;
+      return;
     }
-    
-    /** \internal */
-    template<typename RhsScalar, int RhsOptions, typename RhsIndex, typename DestScalar, int DestOptions, typename DestIndex>
-    void _solve(const SparseMatrix<RhsScalar,RhsOptions,RhsIndex> &b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
-    {
-      eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
-      const Index size = m_cholmodFactor->n;
-      EIGEN_UNUSED_VARIABLE(size);
-      eigen_assert(size==b.rows());
-
-      // note: cs stands for Cholmod Sparse
-      cholmod_sparse b_cs = viewAsCholmod(b);
-      cholmod_sparse* x_cs = cholmod_spsolve(CHOLMOD_A, m_cholmodFactor, &b_cs, &m_cholmod);
-      if(!x_cs)
-      {
-        this->m_info = NumericalIssue;
+    // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
+    // NOTE cholmod_spsolve in fact just calls the dense solver for blocks of 4 columns at a time (similar to Eigen's
+    // sparse solver)
+    dest.derived() = viewAsEigen<typename DestDerived::Scalar, typename DestDerived::StorageIndex>(*x_cs);
+    internal::cm_free_sparse<StorageIndex>(x_cs, m_cholmod);
+  }
+#endif  // EIGEN_PARSED_BY_DOXYGEN
+
+  /** Sets the shift parameter that will be used to adjust the diagonal coefficients during the numerical factorization.
+   *
+   * During the numerical factorization, an offset term is added to the diagonal coefficients:\n
+   * \c d_ii = \a offset + \c d_ii
+   *
+   * The default is \a offset=0.
+   *
+   * \returns a reference to \c *this.
+   */
+  Derived& setShift(const RealScalar& offset) {
+    m_shiftOffset[0] = double(offset);
+    return derived();
+  }
+
+  /** \returns the determinant of the underlying matrix from the current factorization */
+  Scalar determinant() const {
+    using std::exp;
+    return exp(logDeterminant());
+  }
+
+  /** \returns the log determinant of the underlying matrix from the current factorization */
+  Scalar logDeterminant() const {
+    using numext::real;
+    using std::log;
+    eigen_assert(m_factorizationIsOk &&
+                 "The decomposition is not in a valid state for solving, you must first call either compute() or "
+                 "symbolic()/numeric()");
+
+    RealScalar logDet = 0;
+    Scalar* x = static_cast<Scalar*>(m_cholmodFactor->x);
+    if (m_cholmodFactor->is_super) {
+      // Supernodal factorization stored as a packed list of dense column-major blocks,
+      // as described by the following structure:
+
+      // super[k] == index of the first column of the j-th super node
+      StorageIndex* super = static_cast<StorageIndex*>(m_cholmodFactor->super);
+      // pi[k] == offset to the description of row indices
+      StorageIndex* pi = static_cast<StorageIndex*>(m_cholmodFactor->pi);
+      // px[k] == offset to the respective dense block
+      StorageIndex* px = static_cast<StorageIndex*>(m_cholmodFactor->px);
+
+      Index nb_super_nodes = m_cholmodFactor->nsuper;
+      for (Index k = 0; k < nb_super_nodes; ++k) {
+        StorageIndex ncols = super[k + 1] - super[k];
+        StorageIndex nrows = pi[k + 1] - pi[k];
+
+        Map<const Array<Scalar, 1, Dynamic>, 0, InnerStride<> > sk(x + px[k], ncols, InnerStride<>(nrows + 1));
+        logDet += sk.real().log().sum();
       }
-      // TODO optimize this copy by swapping when possible (be careful with alignment, etc.)
-      dest = viewAsEigen<DestScalar,DestOptions,DestIndex>(*x_cs);
-      cholmod_free_sparse(&x_cs, &m_cholmod);
+    } else {
+      // Simplicial factorization stored as standard CSC matrix.
+      StorageIndex* p = static_cast<StorageIndex*>(m_cholmodFactor->p);
+      Index size = m_cholmodFactor->n;
+      for (Index k = 0; k < size; ++k) logDet += log(real(x[p[k]]));
     }
-    #endif // EIGEN_PARSED_BY_DOXYGEN
-    
-    
-    /** Sets the shift parameter that will be used to adjust the diagonal coefficients during the numerical factorization.
-      *
-      * During the numerical factorization, an offset term is added to the diagonal coefficients:\n
-      * \c d_ii = \a offset + \c d_ii
-      *
-      * The default is \a offset=0.
-      *
-      * \returns a reference to \c *this.
-      */
-    Derived& setShift(const RealScalar& offset)
-    {
-      m_shiftOffset[0] = offset;
-      return derived();
-    }
-    
-    template<typename Stream>
-    void dumpMemory(Stream& /*s*/)
-    {}
-    
-  protected:
-    mutable cholmod_common m_cholmod;
-    cholmod_factor* m_cholmodFactor;
-    RealScalar m_shiftOffset[2];
-    mutable ComputationInfo m_info;
-    bool m_isInitialized;
-    int m_factorizationIsOk;
-    int m_analysisIsOk;
+    if (m_cholmodFactor->is_ll) logDet *= 2.0;
+    return logDet;
+  }
+
+  template <typename Stream>
+  void dumpMemory(Stream& /*s*/) {}
+
+ protected:
+  mutable cholmod_common m_cholmod;
+  cholmod_factor* m_cholmodFactor;
+  double m_shiftOffset[2];
+  mutable ComputationInfo m_info;
+  int m_factorizationIsOk;
+  int m_analysisIsOk;
 };
 
 /** \ingroup CholmodSupport_Module
-  * \class CholmodSimplicialLLT
-  * \brief A simplicial direct Cholesky (LLT) factorization and solver based on Cholmod
-  *
-  * This class allows to solve for A.X = B sparse linear problems via a simplicial LL^T Cholesky factorization
-  * using the Cholmod library.
-  * This simplicial variant is equivalent to Eigen's built-in SimplicialLLT class. Therefore, it has little practical interest.
-  * The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices
-  * X and B can be either dense or sparse.
-  *
-  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
-  *               or Upper. Default is Lower.
-  *
-  * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
-  *
-  * \sa \ref TutorialSparseDirectSolvers, class CholmodSupernodalLLT, class SimplicialLLT
-  */
-template<typename _MatrixType, int _UpLo = Lower>
-class CholmodSimplicialLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT<_MatrixType, _UpLo> >
-{
-    typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLLT> Base;
-    using Base::m_cholmod;
-    
-  public:
-    
-    typedef _MatrixType MatrixType;
-    
-    CholmodSimplicialLLT() : Base() { init(); }
-
-    CholmodSimplicialLLT(const MatrixType& matrix) : Base()
-    {
-      init();
-      Base::compute(matrix);
-    }
+ * \class CholmodSimplicialLLT
+ * \brief A simplicial direct Cholesky (LLT) factorization and solver based on Cholmod
+ *
+ * This class allows to solve for A.X = B sparse linear problems via a simplicial LL^T Cholesky factorization
+ * using the Cholmod library.
+ * This simplicial variant is equivalent to Eigen's built-in SimplicialLLT class. Therefore, it has little practical
+ * interest. The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices X and B can be
+ * either dense or sparse.
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo_ the triangular part that will be used for the computations. It can be Lower
+ *               or Upper. Default is Lower.
+ *
+ * \implsparsesolverconcept
+ *
+ * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non
+ * compressed.
+ *
+ * \warning Only double precision real and complex scalar types are supported by Cholmod.
+ *
+ * \sa \ref TutorialSparseSolverConcept, class CholmodSupernodalLLT, class SimplicialLLT
+ */
+template <typename MatrixType_, int UpLo_ = Lower>
+class CholmodSimplicialLLT : public CholmodBase<MatrixType_, UpLo_, CholmodSimplicialLLT<MatrixType_, UpLo_> > {
+  typedef CholmodBase<MatrixType_, UpLo_, CholmodSimplicialLLT> Base;
+  using Base::m_cholmod;
+
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef TriangularView<const MatrixType, Eigen::Lower> MatrixL;
+  typedef TriangularView<const typename MatrixType::AdjointReturnType, Eigen::Upper> MatrixU;
+
+  CholmodSimplicialLLT() : Base() { init(); }
+
+  CholmodSimplicialLLT(const MatrixType& matrix) : Base() {
+    init();
+    this->compute(matrix);
+  }
 
-    ~CholmodSimplicialLLT() {}
-  protected:
-    void init()
-    {
-      m_cholmod.final_asis = 0;
-      m_cholmod.supernodal = CHOLMOD_SIMPLICIAL;
-      m_cholmod.final_ll = 1;
-    }
-};
+  ~CholmodSimplicialLLT() {}
 
+  /** \returns an expression of the factor L */
+  inline MatrixL matrixL() const { return viewAsEigen<Scalar, StorageIndex>(*Base::m_cholmodFactor); }
 
-/** \ingroup CholmodSupport_Module
-  * \class CholmodSimplicialLDLT
-  * \brief A simplicial direct Cholesky (LDLT) factorization and solver based on Cholmod
-  *
-  * This class allows to solve for A.X = B sparse linear problems via a simplicial LDL^T Cholesky factorization
-  * using the Cholmod library.
-  * This simplicial variant is equivalent to Eigen's built-in SimplicialLDLT class. Therefore, it has little practical interest.
-  * The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices
-  * X and B can be either dense or sparse.
-  *
-  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
-  *               or Upper. Default is Lower.
-  *
-  * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
-  *
-  * \sa \ref TutorialSparseDirectSolvers, class CholmodSupernodalLLT, class SimplicialLDLT
-  */
-template<typename _MatrixType, int _UpLo = Lower>
-class CholmodSimplicialLDLT : public CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT<_MatrixType, _UpLo> >
-{
-    typedef CholmodBase<_MatrixType, _UpLo, CholmodSimplicialLDLT> Base;
-    using Base::m_cholmod;
-    
-  public:
-    
-    typedef _MatrixType MatrixType;
-    
-    CholmodSimplicialLDLT() : Base() { init(); }
-
-    CholmodSimplicialLDLT(const MatrixType& matrix) : Base()
-    {
-      init();
-      Base::compute(matrix);
-    }
+  /** \returns an expression of the factor U (= L^*) */
+  inline MatrixU matrixU() const { return matrixL().adjoint(); }
 
-    ~CholmodSimplicialLDLT() {}
-  protected:
-    void init()
-    {
-      m_cholmod.final_asis = 1;
-      m_cholmod.supernodal = CHOLMOD_SIMPLICIAL;
-    }
+ protected:
+  void init() {
+    m_cholmod.final_asis = 0;
+    m_cholmod.supernodal = CHOLMOD_SIMPLICIAL;
+    m_cholmod.final_ll = 1;
+  }
 };
 
 /** \ingroup CholmodSupport_Module
-  * \class CholmodSupernodalLLT
-  * \brief A supernodal Cholesky (LLT) factorization and solver based on Cholmod
-  *
-  * This class allows to solve for A.X = B sparse linear problems via a supernodal LL^T Cholesky factorization
-  * using the Cholmod library.
-  * This supernodal variant performs best on dense enough problems, e.g., 3D FEM, or very high order 2D FEM.
-  * The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices
-  * X and B can be either dense or sparse.
-  *
-  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
-  *               or Upper. Default is Lower.
-  *
-  * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
-  *
-  * \sa \ref TutorialSparseDirectSolvers
-  */
-template<typename _MatrixType, int _UpLo = Lower>
-class CholmodSupernodalLLT : public CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT<_MatrixType, _UpLo> >
-{
-    typedef CholmodBase<_MatrixType, _UpLo, CholmodSupernodalLLT> Base;
-    using Base::m_cholmod;
-    
-  public:
-    
-    typedef _MatrixType MatrixType;
-    
-    CholmodSupernodalLLT() : Base() { init(); }
-
-    CholmodSupernodalLLT(const MatrixType& matrix) : Base()
-    {
-      init();
-      Base::compute(matrix);
-    }
+ * \class CholmodSimplicialLDLT
+ * \brief A simplicial direct Cholesky (LDLT) factorization and solver based on Cholmod
+ *
+ * This class allows to solve for A.X = B sparse linear problems via a simplicial LDL^T Cholesky factorization
+ * using the Cholmod library.
+ * This simplicial variant is equivalent to Eigen's built-in SimplicialLDLT class. Therefore, it has little practical
+ * interest. The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices X and B can be
+ * either dense or sparse.
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo_ the triangular part that will be used for the computations. It can be Lower
+ *               or Upper. Default is Lower.
+ *
+ * \implsparsesolverconcept
+ *
+ * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non
+ * compressed.
+ *
+ * \warning Only double precision real and complex scalar types are supported by Cholmod.
+ *
+ * \sa \ref TutorialSparseSolverConcept, class CholmodSupernodalLLT, class SimplicialLDLT
+ */
+template <typename MatrixType_, int UpLo_ = Lower>
+class CholmodSimplicialLDLT : public CholmodBase<MatrixType_, UpLo_, CholmodSimplicialLDLT<MatrixType_, UpLo_> > {
+  typedef CholmodBase<MatrixType_, UpLo_, CholmodSimplicialLDLT> Base;
+  using Base::m_cholmod;
+
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+  typedef TriangularView<const MatrixType, Eigen::UnitLower> MatrixL;
+  typedef TriangularView<const typename MatrixType::AdjointReturnType, Eigen::UnitUpper> MatrixU;
+
+  CholmodSimplicialLDLT() : Base() { init(); }
+
+  CholmodSimplicialLDLT(const MatrixType& matrix) : Base() {
+    init();
+    this->compute(matrix);
+  }
+
+  ~CholmodSimplicialLDLT() {}
+
+  /** \returns a vector expression of the diagonal D */
+  inline VectorType vectorD() const {
+    auto cholmodL = viewAsEigen<Scalar, StorageIndex>(*Base::m_cholmodFactor);
+
+    VectorType D{cholmodL.rows()};
 
-    ~CholmodSupernodalLLT() {}
-  protected:
-    void init()
-    {
-      m_cholmod.final_asis = 1;
-      m_cholmod.supernodal = CHOLMOD_SUPERNODAL;
+    for (Index k = 0; k < cholmodL.outerSize(); ++k) {
+      typename decltype(cholmodL)::InnerIterator it{cholmodL, k};
+      D(k) = it.value();
     }
+
+    return D;
+  }
+
+  /** \returns an expression of the factor L */
+  inline MatrixL matrixL() const { return viewAsEigen<Scalar, StorageIndex>(*Base::m_cholmodFactor); }
+
+  /** \returns an expression of the factor U (= L^*) */
+  inline MatrixU matrixU() const { return matrixL().adjoint(); }
+
+ protected:
+  void init() {
+    m_cholmod.final_asis = 1;
+    m_cholmod.supernodal = CHOLMOD_SIMPLICIAL;
+  }
 };
 
 /** \ingroup CholmodSupport_Module
-  * \class CholmodDecomposition
-  * \brief A general Cholesky factorization and solver based on Cholmod
-  *
-  * This class allows to solve for A.X = B sparse linear problems via a LL^T or LDL^T Cholesky factorization
-  * using the Cholmod library. The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices
-  * X and B can be either dense or sparse.
-  *
-  * This variant permits to change the underlying Cholesky method at runtime.
-  * On the other hand, it does not provide access to the result of the factorization.
-  * The default is to let Cholmod automatically choose between a simplicial and supernodal factorization.
-  *
-  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
-  *               or Upper. Default is Lower.
-  *
-  * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non compressed.
-  *
-  * \sa \ref TutorialSparseDirectSolvers
-  */
-template<typename _MatrixType, int _UpLo = Lower>
-class CholmodDecomposition : public CholmodBase<_MatrixType, _UpLo, CholmodDecomposition<_MatrixType, _UpLo> >
-{
-    typedef CholmodBase<_MatrixType, _UpLo, CholmodDecomposition> Base;
-    using Base::m_cholmod;
-    
-  public:
-    
-    typedef _MatrixType MatrixType;
-    
-    CholmodDecomposition() : Base() { init(); }
-
-    CholmodDecomposition(const MatrixType& matrix) : Base()
-    {
-      init();
-      Base::compute(matrix);
-    }
+ * \class CholmodSupernodalLLT
+ * \brief A supernodal Cholesky (LLT) factorization and solver based on Cholmod
+ *
+ * This class allows to solve for A.X = B sparse linear problems via a supernodal LL^T Cholesky factorization
+ * using the Cholmod library.
+ * This supernodal variant performs best on dense enough problems, e.g., 3D FEM, or very high order 2D FEM.
+ * The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices
+ * X and B can be either dense or sparse.
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo_ the triangular part that will be used for the computations. It can be Lower
+ *               or Upper. Default is Lower.
+ *
+ * \implsparsesolverconcept
+ *
+ * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non
+ * compressed.
+ *
+ * \warning Only double precision real and complex scalar types are supported by Cholmod.
+ *
+ * \sa \ref TutorialSparseSolverConcept
+ */
+template <typename MatrixType_, int UpLo_ = Lower>
+class CholmodSupernodalLLT : public CholmodBase<MatrixType_, UpLo_, CholmodSupernodalLLT<MatrixType_, UpLo_> > {
+  typedef CholmodBase<MatrixType_, UpLo_, CholmodSupernodalLLT> Base;
+  using Base::m_cholmod;
+
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+
+  CholmodSupernodalLLT() : Base() { init(); }
+
+  CholmodSupernodalLLT(const MatrixType& matrix) : Base() {
+    init();
+    this->compute(matrix);
+  }
 
-    ~CholmodDecomposition() {}
-    
-    void setMode(CholmodMode mode)
-    {
-      switch(mode)
-      {
-        case CholmodAuto:
-          m_cholmod.final_asis = 1;
-          m_cholmod.supernodal = CHOLMOD_AUTO;
-          break;
-        case CholmodSimplicialLLt:
-          m_cholmod.final_asis = 0;
-          m_cholmod.supernodal = CHOLMOD_SIMPLICIAL;
-          m_cholmod.final_ll = 1;
-          break;
-        case CholmodSupernodalLLt:
-          m_cholmod.final_asis = 1;
-          m_cholmod.supernodal = CHOLMOD_SUPERNODAL;
-          break;
-        case CholmodLDLt:
-          m_cholmod.final_asis = 1;
-          m_cholmod.supernodal = CHOLMOD_SIMPLICIAL;
-          break;
-        default:
-          break;
-      }
-    }
-  protected:
-    void init()
-    {
-      m_cholmod.final_asis = 1;
-      m_cholmod.supernodal = CHOLMOD_AUTO;
-    }
-};
+  ~CholmodSupernodalLLT() {}
 
-namespace internal {
-  
-template<typename _MatrixType, int _UpLo, typename Derived, typename Rhs>
-struct solve_retval<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
-  : solve_retval_base<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
-{
-  typedef CholmodBase<_MatrixType,_UpLo,Derived> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
+  /** \returns an expression of the factor L */
+  inline MatrixType matrixL() const {
+    // Convert Cholmod factor's supernodal storage format to Eigen's CSC storage format
+    cholmod_sparse* cholmodL = internal::cm_factor_to_sparse(*Base::m_cholmodFactor, m_cholmod);
+    MatrixType L = viewAsEigen<Scalar, StorageIndex>(*cholmodL);
+    internal::cm_free_sparse<StorageIndex>(cholmodL, m_cholmod);
+
+    return L;
   }
-};
 
-template<typename _MatrixType, int _UpLo, typename Derived, typename Rhs>
-struct sparse_solve_retval<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
-  : sparse_solve_retval_base<CholmodBase<_MatrixType,_UpLo,Derived>, Rhs>
-{
-  typedef CholmodBase<_MatrixType,_UpLo,Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
+  /** \returns an expression of the factor U (= L^*) */
+  inline MatrixType matrixU() const { return matrixL().adjoint(); }
 
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
+ protected:
+  void init() {
+    m_cholmod.final_asis = 1;
+    m_cholmod.supernodal = CHOLMOD_SUPERNODAL;
   }
 };
 
-} // end namespace internal
+/** \ingroup CholmodSupport_Module
+ * \class CholmodDecomposition
+ * \brief A general Cholesky factorization and solver based on Cholmod
+ *
+ * This class allows to solve for A.X = B sparse linear problems via a LL^T or LDL^T Cholesky factorization
+ * using the Cholmod library. The sparse matrix A must be selfadjoint and positive definite. The vectors or matrices
+ * X and B can be either dense or sparse.
+ *
+ * This variant permits to change the underlying Cholesky method at runtime.
+ * On the other hand, it does not provide access to the result of the factorization.
+ * The default is to let Cholmod automatically choose between a simplicial and supernodal factorization.
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo_ the triangular part that will be used for the computations. It can be Lower
+ *               or Upper. Default is Lower.
+ *
+ * \implsparsesolverconcept
+ *
+ * This class supports all kind of SparseMatrix<>: row or column major; upper, lower, or both; compressed or non
+ * compressed.
+ *
+ * \warning Only double precision real and complex scalar types are supported by Cholmod.
+ *
+ * \sa \ref TutorialSparseSolverConcept
+ */
+template <typename MatrixType_, int UpLo_ = Lower>
+class CholmodDecomposition : public CholmodBase<MatrixType_, UpLo_, CholmodDecomposition<MatrixType_, UpLo_> > {
+  typedef CholmodBase<MatrixType_, UpLo_, CholmodDecomposition> Base;
+  using Base::m_cholmod;
+
+ public:
+  typedef MatrixType_ MatrixType;
+
+  CholmodDecomposition() : Base() { init(); }
+
+  CholmodDecomposition(const MatrixType& matrix) : Base() {
+    init();
+    this->compute(matrix);
+  }
+
+  ~CholmodDecomposition() {}
+
+  void setMode(CholmodMode mode) {
+    switch (mode) {
+      case CholmodAuto:
+        m_cholmod.final_asis = 1;
+        m_cholmod.supernodal = CHOLMOD_AUTO;
+        break;
+      case CholmodSimplicialLLt:
+        m_cholmod.final_asis = 0;
+        m_cholmod.supernodal = CHOLMOD_SIMPLICIAL;
+        m_cholmod.final_ll = 1;
+        break;
+      case CholmodSupernodalLLt:
+        m_cholmod.final_asis = 1;
+        m_cholmod.supernodal = CHOLMOD_SUPERNODAL;
+        break;
+      case CholmodLDLt:
+        m_cholmod.final_asis = 1;
+        m_cholmod.supernodal = CHOLMOD_SIMPLICIAL;
+        break;
+      default:
+        break;
+    }
+  }
+
+ protected:
+  void init() {
+    m_cholmod.final_asis = 1;
+    m_cholmod.supernodal = CHOLMOD_AUTO;
+  }
+};
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_CHOLMODSUPPORT_H
+#endif  // EIGEN_CHOLMODSUPPORT_H
diff --git a/inst/include/Eigen/src/CholmodSupport/InternalHeaderCheck.h b/inst/include/Eigen/src/CholmodSupport/InternalHeaderCheck.h
new file mode 100644
index 00000000..0fb3abc7
--- /dev/null
+++ b/inst/include/Eigen/src/CholmodSupport/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_CHOLMODSUPPORT_MODULE_H
+#error "Please include Eigen/CholmodSupport instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/Core/ArithmeticSequence.h b/inst/include/Eigen/src/Core/ArithmeticSequence.h
new file mode 100644
index 00000000..ae6373dd
--- /dev/null
+++ b/inst/include/Eigen/src/Core/ArithmeticSequence.h
@@ -0,0 +1,239 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARITHMETIC_SEQUENCE_H
+#define EIGEN_ARITHMETIC_SEQUENCE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+// Helper to cleanup the type of the increment:
+template <typename T>
+struct cleanup_seq_incr {
+  typedef typename cleanup_index_type<T, DynamicIndex>::type type;
+};
+
+}  // namespace internal
+
+//--------------------------------------------------------------------------------
+// seq(first,last,incr) and seqN(first,size,incr)
+//--------------------------------------------------------------------------------
+
+template <typename FirstType = Index, typename SizeType = Index, typename IncrType = internal::FixedInt<1> >
+class ArithmeticSequence;
+
+template <typename FirstType, typename SizeType, typename IncrType>
+ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
+                   typename internal::cleanup_index_type<SizeType>::type,
+                   typename internal::cleanup_seq_incr<IncrType>::type>
+seqN(FirstType first, SizeType size, IncrType incr);
+
+/** \class ArithmeticSequence
+ * \ingroup Core_Module
+ *
+ * This class represents an arithmetic progression \f$ a_0, a_1, a_2, ..., a_{n-1}\f$ defined by
+ * its \em first value \f$ a_0 \f$, its \em size (aka length) \em n, and the \em increment (aka stride)
+ * that is equal to \f$ a_{i+1}-a_{i}\f$ for any \em i.
+ *
+ * It is internally used as the return type of the Eigen::seq and Eigen::seqN functions, and as the input arguments
+ * of DenseBase::operator()(const RowIndices&, const ColIndices&), and most of the time this is the
+ * only way it is used.
+ *
+ * \tparam FirstType type of the first element, usually an Index,
+ *                   but internally it can be a symbolic expression
+ * \tparam SizeType type representing the size of the sequence, usually an Index
+ *                  or a compile time integral constant. Internally, it can also be a symbolic expression
+ * \tparam IncrType type of the increment, can be a runtime Index, or a compile time integral constant (default is
+ * compile-time 1)
+ *
+ * \sa Eigen::seq, Eigen::seqN, DenseBase::operator()(const RowIndices&, const ColIndices&), class IndexedView
+ */
+template <typename FirstType, typename SizeType, typename IncrType>
+class ArithmeticSequence {
+ public:
+  constexpr ArithmeticSequence() = default;
+  constexpr ArithmeticSequence(FirstType first, SizeType size) : m_first(first), m_size(size) {}
+  constexpr ArithmeticSequence(FirstType first, SizeType size, IncrType incr)
+      : m_first(first), m_size(size), m_incr(incr) {}
+
+  enum {
+    // SizeAtCompileTime = internal::get_fixed_value<SizeType>::value,
+    IncrAtCompileTime = internal::get_fixed_value<IncrType, DynamicIndex>::value
+  };
+
+  /** \returns the size, i.e., number of elements, of the sequence */
+  constexpr Index size() const { return m_size; }
+
+  /** \returns the first element \f$ a_0 \f$ in the sequence */
+  constexpr Index first() const { return m_first; }
+
+  /** \returns the value \f$ a_i \f$ at index \a i in the sequence. */
+  constexpr Index operator[](Index i) const { return m_first + i * m_incr; }
+
+  constexpr const FirstType& firstObject() const { return m_first; }
+  constexpr const SizeType& sizeObject() const { return m_size; }
+  constexpr const IncrType& incrObject() const { return m_incr; }
+
+ protected:
+  FirstType m_first;
+  SizeType m_size;
+  IncrType m_incr;
+
+ public:
+  constexpr auto reverse() const -> decltype(Eigen::seqN(m_first + (m_size + fix<-1>()) * m_incr, m_size, -m_incr)) {
+    return seqN(m_first + (m_size + fix<-1>()) * m_incr, m_size, -m_incr);
+  }
+};
+
+/** \returns an ArithmeticSequence starting at \a first, of length \a size, and increment \a incr
+ *
+ * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
+template <typename FirstType, typename SizeType, typename IncrType>
+ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
+                   typename internal::cleanup_index_type<SizeType>::type,
+                   typename internal::cleanup_seq_incr<IncrType>::type>
+seqN(FirstType first, SizeType size, IncrType incr) {
+  return ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
+                            typename internal::cleanup_index_type<SizeType>::type,
+                            typename internal::cleanup_seq_incr<IncrType>::type>(first, size, incr);
+}
+
+/** \returns an ArithmeticSequence starting at \a first, of length \a size, and unit increment
+ *
+ * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType) */
+template <typename FirstType, typename SizeType>
+ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
+                   typename internal::cleanup_index_type<SizeType>::type>
+seqN(FirstType first, SizeType size) {
+  return ArithmeticSequence<typename internal::cleanup_index_type<FirstType>::type,
+                            typename internal::cleanup_index_type<SizeType>::type>(first, size);
+}
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+
+/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and with positive (or negative) increment \a
+ * incr
+ *
+ * It is essentially an alias to:
+ * \code
+ * seqN(f, (l-f+incr)/incr, incr);
+ * \endcode
+ *
+ * \sa seqN(FirstType,SizeType,IncrType), seq(FirstType,LastType)
+ */
+template <typename FirstType, typename LastType, typename IncrType>
+auto seq(FirstType f, LastType l, IncrType incr);
+
+/** \returns an ArithmeticSequence starting at \a f, up (or down) to \a l, and unit increment
+ *
+ * It is essentially an alias to:
+ * \code
+ * seqN(f,l-f+1);
+ * \endcode
+ *
+ * \sa seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType)
+ */
+template <typename FirstType, typename LastType>
+auto seq(FirstType f, LastType l);
+
+#else  // EIGEN_PARSED_BY_DOXYGEN
+
+template <typename FirstType, typename LastType>
+auto seq(FirstType f, LastType l)
+    -> decltype(seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+                     (typename internal::cleanup_index_type<LastType>::type(l) -
+                      typename internal::cleanup_index_type<FirstType>::type(f) + fix<1>()))) {
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+              (typename internal::cleanup_index_type<LastType>::type(l) -
+               typename internal::cleanup_index_type<FirstType>::type(f) + fix<1>()));
+}
+
+template <typename FirstType, typename LastType, typename IncrType>
+auto seq(FirstType f, LastType l, IncrType incr)
+    -> decltype(seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+                     (typename internal::cleanup_index_type<LastType>::type(l) -
+                      typename internal::cleanup_index_type<FirstType>::type(f) +
+                      typename internal::cleanup_seq_incr<IncrType>::type(incr)) /
+                         typename internal::cleanup_seq_incr<IncrType>::type(incr),
+                     typename internal::cleanup_seq_incr<IncrType>::type(incr))) {
+  typedef typename internal::cleanup_seq_incr<IncrType>::type CleanedIncrType;
+  return seqN(typename internal::cleanup_index_type<FirstType>::type(f),
+              (typename internal::cleanup_index_type<LastType>::type(l) -
+               typename internal::cleanup_index_type<FirstType>::type(f) + CleanedIncrType(incr)) /
+                  CleanedIncrType(incr),
+              CleanedIncrType(incr));
+}
+
+#endif  // EIGEN_PARSED_BY_DOXYGEN
+
+namespace placeholders {
+
+/** \cpp11
+ * \returns a symbolic ArithmeticSequence representing the last \a size elements with increment \a incr.
+ *
+ * It is a shortcut for: \code seqN(last-(size-fix<1>)*incr, size, incr) \endcode
+ *
+ * \sa lastN(SizeType), seqN(FirstType,SizeType), seq(FirstType,LastType,IncrType) */
+template <typename SizeType, typename IncrType>
+auto lastN(SizeType size, IncrType incr)
+    -> decltype(seqN(Eigen::placeholders::last - (size - fix<1>()) * incr, size, incr)) {
+  return seqN(Eigen::placeholders::last - (size - fix<1>()) * incr, size, incr);
+}
+
+/** \cpp11
+ * \returns a symbolic ArithmeticSequence representing the last \a size elements with a unit increment.
+ *
+ *  It is a shortcut for: \code seq(last+fix<1>-size, last) \endcode
+ *
+ * \sa lastN(SizeType,IncrType, seqN(FirstType,SizeType), seq(FirstType,LastType) */
+template <typename SizeType>
+auto lastN(SizeType size) -> decltype(seqN(Eigen::placeholders::last + fix<1>() - size, size)) {
+  return seqN(Eigen::placeholders::last + fix<1>() - size, size);
+}
+
+}  // namespace placeholders
+
+/** \namespace Eigen::indexing
+  * \ingroup Core_Module
+  *
+  * The sole purpose of this namespace is to be able to import all functions
+  * and symbols that are expected to be used within operator() for indexing
+  * and slicing. If you already imported the whole Eigen namespace:
+  * \code using namespace Eigen; \endcode
+  * then you are already all set. Otherwise, if you don't want/cannot import
+  * the whole Eigen namespace, the following line:
+  * \code using namespace Eigen::indexing; \endcode
+  * is equivalent to:
+  * \code
+  using Eigen::fix;
+  using Eigen::seq;
+  using Eigen::seqN;
+  using Eigen::placeholders::all;
+  using Eigen::placeholders::last;
+  using Eigen::placeholders::lastN;  // c++11 only
+  using Eigen::placeholders::lastp1;
+  \endcode
+  */
+namespace indexing {
+using Eigen::fix;
+using Eigen::seq;
+using Eigen::seqN;
+using Eigen::placeholders::all;
+using Eigen::placeholders::last;
+using Eigen::placeholders::lastN;
+using Eigen::placeholders::lastp1;
+}  // namespace indexing
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_ARITHMETIC_SEQUENCE_H
diff --git a/inst/include/Eigen/src/Core/Array.h b/inst/include/Eigen/src/Core/Array.h
index 0b9c38c8..57f3186b 100644
--- a/inst/include/Eigen/src/Core/Array.h
+++ b/inst/include/Eigen/src/Core/Array.h
@@ -10,314 +10,367 @@
 #ifndef EIGEN_ARRAY_H
 #define EIGEN_ARRAY_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
-/** \class Array 
-  * \ingroup Core_Module
-  *
-  * \brief General-purpose arrays with easy API for coefficient-wise operations
-  *
-  * The %Array class is very similar to the Matrix class. It provides
-  * general-purpose one- and two-dimensional arrays. The difference between the
-  * %Array and the %Matrix class is primarily in the API: the API for the
-  * %Array class provides easy access to coefficient-wise operations, while the
-  * API for the %Matrix class provides easy access to linear-algebra
-  * operations.
-  *
-  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN.
-  *
-  * \sa \ref TutorialArrayClass, \ref TopicClassHierarchy
-  */
 namespace internal {
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct traits<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > : traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-{
+template <typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
+struct traits<Array<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>>
+    : traits<Matrix<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>> {
   typedef ArrayXpr XprKind;
-  typedef ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > XprBase;
+  typedef ArrayBase<Array<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>> XprBase;
 };
-}
-
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-class Array
-  : public PlainObjectBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-{
-  public:
-
-    typedef PlainObjectBase<Array> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(Array)
-
-    enum { Options = _Options };
-    typedef typename Base::PlainObject PlainObject;
-
-  protected:
-    template <typename Derived, typename OtherDerived, bool IsVector>
-    friend struct internal::conservative_resize_like_impl;
-
-    using Base::m_storage;
-
-  public:
-
-    using Base::base;
-    using Base::coeff;
-    using Base::coeffRef;
-
-    /**
-      * The usage of
-      *   using Base::operator=;
-      * fails on MSVC. Since the code below is working with GCC and MSVC, we skipped
-      * the usage of 'using'. This should be done only for operator=.
-      */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Array& operator=(const EigenBase<OtherDerived> &other)
-    {
-      return Base::operator=(other);
-    }
-
-    /** Copies the value of the expression \a other into \c *this with automatic resizing.
-      *
-      * *this might be resized to match the dimensions of \a other. If *this was a null matrix (not already initialized),
-      * it will be initialized.
-      *
-      * Note that copying a row-vector into a vector (and conversely) is allowed.
-      * The resizing, if any, is then done in the appropriate way so that row-vectors
-      * remain row-vectors and vectors remain vectors.
-      */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Array& operator=(const ArrayBase<OtherDerived>& other)
-    {
-      return Base::_set(other);
-    }
-
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    EIGEN_STRONG_INLINE Array& operator=(const Array& other)
-    {
-      return Base::_set(other);
-    }
-
-    /** Default constructor.
-      *
-      * For fixed-size matrices, does nothing.
-      *
-      * For dynamic-size matrices, creates an empty matrix of size 0. Does not allocate any array. Such a matrix
-      * is called a null matrix. This constructor is the unique way to create null matrices: resizing
-      * a matrix to 0 is not supported.
-      *
-      * \sa resize(Index,Index)
-      */
-    EIGEN_STRONG_INLINE Array() : Base()
-    {
-      Base::_check_template_params();
-      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
-    }
-
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    // FIXME is it still needed ??
-    /** \internal */
-    Array(internal::constructor_without_unaligned_array_assert)
-      : Base(internal::constructor_without_unaligned_array_assert())
-    {
-      Base::_check_template_params();
-      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
-    }
+}  // namespace internal
+
+/** \class Array
+ * \ingroup Core_Module
+ *
+ * \brief General-purpose arrays with easy API for coefficient-wise operations
+ *
+ * The %Array class is very similar to the Matrix class. It provides
+ * general-purpose one- and two-dimensional arrays. The difference between the
+ * %Array and the %Matrix class is primarily in the API: the API for the
+ * %Array class provides easy access to coefficient-wise operations, while the
+ * API for the %Matrix class provides easy access to linear-algebra
+ * operations.
+ *
+ * See documentation of class Matrix for detailed information on the template parameters
+ * storage layout.
+ *
+ * This class can be extended with the help of the plugin mechanism described on the page
+ * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_ARRAY_PLUGIN.
+ *
+ * \sa \blank \ref TutorialArrayClass, \ref TopicClassHierarchy
+ */
+template <typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
+class Array : public PlainObjectBase<Array<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>> {
+ public:
+  typedef PlainObjectBase<Array> Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(Array)
+
+  enum { Options = Options_ };
+  typedef typename Base::PlainObject PlainObject;
+
+ protected:
+  template <typename Derived, typename OtherDerived, bool IsVector>
+  friend struct internal::conservative_resize_like_impl;
+
+  using Base::m_storage;
+
+ public:
+  using Base::base;
+  using Base::coeff;
+  using Base::coeffRef;
+
+  /**
+   * The usage of
+   *   using Base::operator=;
+   * fails on MSVC. Since the code below is working with GCC and MSVC, we skipped
+   * the usage of 'using'. This should be done only for operator=.
+   */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array& operator=(const EigenBase<OtherDerived>& other) {
+    return Base::operator=(other);
+  }
+
+  /** Set all the entries to \a value.
+   * \sa DenseBase::setConstant(), DenseBase::fill()
+   */
+  /* This overload is needed because the usage of
+   *   using Base::operator=;
+   * fails on MSVC. Since the code below is working with GCC and MSVC, we skipped
+   * the usage of 'using'. This should be done only for operator=.
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array& operator=(const Scalar& value) {
+    Base::setConstant(value);
+    return *this;
+  }
+
+  /** Copies the value of the expression \a other into \c *this with automatic resizing.
+   *
+   * *this might be resized to match the dimensions of \a other. If *this was a null matrix (not already initialized),
+   * it will be initialized.
+   *
+   * Note that copying a row-vector into a vector (and conversely) is allowed.
+   * The resizing, if any, is then done in the appropriate way so that row-vectors
+   * remain row-vectors and vectors remain vectors.
+   */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array& operator=(const DenseBase<OtherDerived>& other) {
+    return Base::_set(other);
+  }
+
+  /**
+   * \brief Assigns arrays to each other.
+   *
+   * \note This is a special case of the templated operator=. Its purpose is
+   * to prevent a default operator= from hiding the templated operator=.
+   *
+   * \callgraph
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array& operator=(const Array& other) { return Base::_set(other); }
+
+  /** Default constructor.
+   *
+   * For fixed-size matrices, does nothing.
+   *
+   * For dynamic-size matrices, creates an empty matrix of size 0. Does not allocate any array. Such a matrix
+   * is called a null matrix. This constructor is the unique way to create null matrices: resizing
+   * a matrix to 0 is not supported.
+   *
+   * \sa resize(Index,Index)
+   */
+#ifdef EIGEN_INITIALIZE_COEFFS
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Array() : Base() { EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
+#else
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Array() = default;
 #endif
+  /** \brief Move constructor */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Array(Array&&) = default;
+  EIGEN_DEVICE_FUNC Array& operator=(Array&& other) noexcept(std::is_nothrow_move_assignable<Scalar>::value) {
+    Base::operator=(std::move(other));
+    return *this;
+  }
+
+  /** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients.
+   *
+   * \only_for_vectors
+   *
+   * This constructor is for 1D array or vectors with more than 4 coefficients.
+   *
+   * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this
+   * constructor must match the the fixed number of rows (resp. columns) of \c *this.
+   *
+   *
+   * Example: \include Array_variadic_ctor_cxx11.cpp
+   * Output: \verbinclude Array_variadic_ctor_cxx11.out
+   *
+   * \sa Array(const std::initializer_list<std::initializer_list<Scalar>>&)
+   * \sa Array(const Scalar&), Array(const Scalar&,const Scalar&)
+   */
+  template <typename... ArgTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3,
+                                              const ArgTypes&... args)
+      : Base(a0, a1, a2, a3, args...) {}
+
+  /** \brief Constructs an array and initializes it from the coefficients given as initializer-lists grouped by row.
+   * \cpp11
+   *
+   * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
+   *
+   * Example: \include Array_initializer_list_23_cxx11.cpp
+   * Output: \verbinclude Array_initializer_list_23_cxx11.out
+   *
+   * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is
+   * triggered.
+   *
+   * In the case of a compile-time column 1D array, implicit transposition from a single row is allowed.
+   * Therefore <code> Array<int,Dynamic,1>{{1,2,3,4,5}}</code> is legal and the more verbose syntax
+   * <code>Array<int,Dynamic,1>{{1},{2},{3},{4},{5}}</code> can be avoided:
+   *
+   * Example: \include Array_initializer_list_vector_cxx11.cpp
+   * Output: \verbinclude Array_initializer_list_vector_cxx11.out
+   *
+   * In the case of fixed-sized arrays, the initializer list sizes must exactly match the array sizes,
+   * and implicit transposition is allowed for compile-time 1D arrays only.
+   *
+   * \sa  Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Array(
+      const std::initializer_list<std::initializer_list<Scalar>>& list)
+      : Base(list) {}
 
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
-    Array(Array&& other)
-      : Base(std::move(other))
-    {
-      Base::_check_template_params();
-      if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
-        Base::_set_noalias(other);
-    }
-    Array& operator=(Array&& other)
-    {
-      other.swap(*this);
-      return *this;
-    }
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Array(const T& x) {
+    Base::template _init1<T>(x);
+  }
+
+  template <typename T0, typename T1>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const T0& val0, const T1& val1) {
+    this->template _init2<T0, T1>(val0, val1);
+  }
+
+#else
+  /** \brief Constructs a fixed-sized array initialized with coefficients starting at \a data */
+  EIGEN_DEVICE_FUNC explicit Array(const Scalar* data);
+  /** Constructs a vector or row-vector with given dimension. \only_for_vectors
+   *
+   * Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
+   * it is redundant to pass the dimension here, so it makes more sense to use the default
+   * constructor Array() instead.
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Array(Index dim);
+  /** constructs an initialized 1x1 Array with the given coefficient
+   * \sa const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args */
+  Array(const Scalar& value);
+  /** constructs an uninitialized array with \a rows rows and \a cols columns.
+   *
+   * This is useful for dynamic-size arrays. For fixed-size arrays,
+   * it is redundant to pass these parameters, so one should use the default constructor
+   * Array() instead. */
+  Array(Index rows, Index cols);
+  /** constructs an initialized 2D vector with given coefficients
+   * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args) */
+  Array(const Scalar& val0, const Scalar& val1);
+#endif  // end EIGEN_PARSED_BY_DOXYGEN
+
+  /** constructs an initialized 3D vector with given coefficients
+   * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2) {
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Array, 3)
+    m_storage.data()[0] = val0;
+    m_storage.data()[1] = val1;
+    m_storage.data()[2] = val2;
+  }
+  /** constructs an initialized 4D vector with given coefficients
+   * \sa Array(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3, const ArgTypes&... args)
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2,
+                                              const Scalar& val3) {
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Array, 4)
+    m_storage.data()[0] = val0;
+    m_storage.data()[1] = val1;
+    m_storage.data()[2] = val2;
+    m_storage.data()[3] = val3;
+  }
+
+  /** Copy constructor */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Array(const Array&) = default;
+
+ private:
+  struct PrivateType {};
+
+ public:
+  /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Array(
+      const EigenBase<OtherDerived>& other,
+      std::enable_if_t<internal::is_convertible<typename OtherDerived::Scalar, Scalar>::value, PrivateType> =
+          PrivateType())
+      : Base(other.derived()) {}
+
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return 1; }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return this->innerSize(); }
+
+#ifdef EIGEN_ARRAY_PLUGIN
+#include EIGEN_ARRAY_PLUGIN
 #endif
 
-    /** Constructs a vector or row-vector with given dimension. \only_for_vectors
-      *
-      * Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
-      * it is redundant to pass the dimension here, so it makes more sense to use the default
-      * constructor Matrix() instead.
-      */
-    EIGEN_STRONG_INLINE explicit Array(Index dim)
-      : Base(dim, RowsAtCompileTime == 1 ? 1 : dim, ColsAtCompileTime == 1 ? 1 : dim)
-    {
-      Base::_check_template_params();
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Array)
-      eigen_assert(dim >= 0);
-      eigen_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == dim);
-      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
-    }
-
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename T0, typename T1>
-    EIGEN_STRONG_INLINE Array(const T0& val0, const T1& val1)
-    {
-      Base::_check_template_params();
-      this->template _init2<T0,T1>(val0, val1);
-    }
-    #else
-    /** constructs an uninitialized matrix with \a rows rows and \a cols columns.
-      *
-      * This is useful for dynamic-size matrices. For fixed-size matrices,
-      * it is redundant to pass these parameters, so one should use the default constructor
-      * Matrix() instead. */
-    Array(Index rows, Index cols);
-    /** constructs an initialized 2D vector with given coefficients */
-    Array(const Scalar& val0, const Scalar& val1);
-    #endif
-
-    /** constructs an initialized 3D vector with given coefficients */
-    EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2)
-    {
-      Base::_check_template_params();
-      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Array, 3)
-      m_storage.data()[0] = val0;
-      m_storage.data()[1] = val1;
-      m_storage.data()[2] = val2;
-    }
-    /** constructs an initialized 4D vector with given coefficients */
-    EIGEN_STRONG_INLINE Array(const Scalar& val0, const Scalar& val1, const Scalar& val2, const Scalar& val3)
-    {
-      Base::_check_template_params();
-      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Array, 4)
-      m_storage.data()[0] = val0;
-      m_storage.data()[1] = val1;
-      m_storage.data()[2] = val2;
-      m_storage.data()[3] = val3;
-    }
-
-    explicit Array(const Scalar *data);
-
-    /** Constructor copying the value of the expression \a other */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Array(const ArrayBase<OtherDerived>& other)
-             : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
-    /** Copy constructor */
-    EIGEN_STRONG_INLINE Array(const Array& other)
-            : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
-    /** Copy constructor with in-place evaluation */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Array(const ReturnByValue<OtherDerived>& other)
-    {
-      Base::_check_template_params();
-      Base::resize(other.rows(), other.cols());
-      other.evalTo(*this);
-    }
-
-    /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Array(const EigenBase<OtherDerived> &other)
-      : Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
-    {
-      Base::_check_template_params();
-      Base::_resize_to_match(other);
-      *this = other;
-    }
-
-    /** Override MatrixBase::swap() since for dynamic-sized matrices of same type it is enough to swap the
-      * data pointers.
-      */
-    template<typename OtherDerived>
-    void swap(ArrayBase<OtherDerived> const & other)
-    { this->_swap(other.derived()); }
-
-    inline Index innerStride() const { return 1; }
-    inline Index outerStride() const { return this->innerSize(); }
-
-    #ifdef EIGEN_ARRAY_PLUGIN
-    #include EIGEN_ARRAY_PLUGIN
-    #endif
-
-  private:
-
-    template<typename MatrixType, typename OtherDerived, bool SwapPointers>
-    friend struct internal::matrix_swap_impl;
+ private:
+  template <typename MatrixType, typename OtherDerived, bool SwapPointers>
+  friend struct internal::matrix_swap_impl;
 };
 
 /** \defgroup arraytypedefs Global array typedefs
-  * \ingroup Core_Module
-  *
-  * Eigen defines several typedef shortcuts for most common 1D and 2D array types.
-  *
-  * The general patterns are the following:
-  *
-  * \c ArrayRowsColsType where \c Rows and \c Cols can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size,
-  * and where \c Type can be \c i for integer, \c f for float, \c d for double, \c cf for complex float, \c cd
-  * for complex double.
-  *
-  * For example, \c Array33d is a fixed-size 3x3 array type of doubles, and \c ArrayXXf is a dynamic-size matrix of floats.
-  *
-  * There are also \c ArraySizeType which are self-explanatory. For example, \c Array4cf is
-  * a fixed-size 1D array of 4 complex floats.
-  *
-  * \sa class Array
-  */
-
-#define EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, Size, SizeSuffix)   \
-/** \ingroup arraytypedefs */                                    \
-typedef Array<Type, Size, Size> Array##SizeSuffix##SizeSuffix##TypeSuffix;  \
-/** \ingroup arraytypedefs */                                    \
-typedef Array<Type, Size, 1>    Array##SizeSuffix##TypeSuffix;
-
-#define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, Size)         \
-/** \ingroup arraytypedefs */                                    \
-typedef Array<Type, Size, Dynamic> Array##Size##X##TypeSuffix;  \
-/** \ingroup arraytypedefs */                                    \
-typedef Array<Type, Dynamic, Size> Array##X##Size##TypeSuffix;
+ * \ingroup Core_Module
+ *
+ * %Eigen defines several typedef shortcuts for most common 1D and 2D array types.
+ *
+ * The general patterns are the following:
+ *
+ * \c ArrayRowsColsType where \c Rows and \c Cols can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for
+ * dynamic size, and where \c Type can be \c i for integer, \c f for float, \c d for double, \c cf for complex float, \c
+ * cd for complex double.
+ *
+ * For example, \c Array33d is a fixed-size 3x3 array type of doubles, and \c ArrayXXf is a dynamic-size matrix of
+ * floats.
+ *
+ * There are also \c ArraySizeType which are self-explanatory. For example, \c Array4cf is
+ * a fixed-size 1D array of 4 complex floats.
+ *
+ * With \cpp11, template alias are also defined for common sizes.
+ * They follow the same pattern as above except that the scalar type suffix is replaced by a
+ * template parameter, i.e.:
+ *   - `ArrayRowsCols<Type>` where `Rows` and `Cols` can be \c 2,\c 3,\c 4, or \c X for fixed or dynamic size.
+ *   - `ArraySize<Type>` where `Size` can be \c 2,\c 3,\c 4 or \c X for fixed or dynamic size 1D arrays.
+ *
+ * \sa class Array
+ */
+
+#define EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, Size, SizeSuffix)        \
+  /** \ingroup arraytypedefs */                                              \
+  typedef Array<Type, Size, Size> Array##SizeSuffix##SizeSuffix##TypeSuffix; \
+  /** \ingroup arraytypedefs */                                              \
+  typedef Array<Type, Size, 1> Array##SizeSuffix##TypeSuffix;
+
+#define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, Size)  \
+  /** \ingroup arraytypedefs */                                  \
+  typedef Array<Type, Size, Dynamic> Array##Size##X##TypeSuffix; \
+  /** \ingroup arraytypedefs */                                  \
+  typedef Array<Type, Dynamic, Size> Array##X##Size##TypeSuffix;
 
 #define EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(Type, TypeSuffix) \
-EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, 2, 2) \
-EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, 3, 3) \
-EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, 4, 4) \
-EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, Dynamic, X) \
-EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, 2) \
-EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, 3) \
-EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, 4)
-
-EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(int,                  i)
-EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(float,                f)
-EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(double,               d)
-EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex<float>,  cf)
+  EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, 2, 2)           \
+  EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, 3, 3)           \
+  EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, 4, 4)           \
+  EIGEN_MAKE_ARRAY_TYPEDEFS(Type, TypeSuffix, Dynamic, X)     \
+  EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, 2)        \
+  EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, 3)        \
+  EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Type, TypeSuffix, 4)
+
+EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(int, i)
+EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(float, f)
+EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(double, d)
+EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex<float>, cf)
 EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES(std::complex<double>, cd)
 
 #undef EIGEN_MAKE_ARRAY_TYPEDEFS_ALL_SIZES
 #undef EIGEN_MAKE_ARRAY_TYPEDEFS
+#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS
+
+#define EIGEN_MAKE_ARRAY_TYPEDEFS(Size, SizeSuffix)              \
+  /** \ingroup arraytypedefs */                                  \
+  /** \brief \cpp11 */                                           \
+  template <typename Type>                                       \
+  using Array##SizeSuffix##SizeSuffix = Array<Type, Size, Size>; \
+  /** \ingroup arraytypedefs */                                  \
+  /** \brief \cpp11 */                                           \
+  template <typename Type>                                       \
+  using Array##SizeSuffix = Array<Type, Size, 1>;
+
+#define EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(Size)        \
+  /** \ingroup arraytypedefs */                      \
+  /** \brief \cpp11 */                               \
+  template <typename Type>                           \
+  using Array##Size##X = Array<Type, Size, Dynamic>; \
+  /** \ingroup arraytypedefs */                      \
+  /** \brief \cpp11 */                               \
+  template <typename Type>                           \
+  using Array##X##Size = Array<Type, Dynamic, Size>;
+
+EIGEN_MAKE_ARRAY_TYPEDEFS(2, 2)
+EIGEN_MAKE_ARRAY_TYPEDEFS(3, 3)
+EIGEN_MAKE_ARRAY_TYPEDEFS(4, 4)
+EIGEN_MAKE_ARRAY_TYPEDEFS(Dynamic, X)
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(2)
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(3)
+EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS(4)
 
-#undef EIGEN_MAKE_ARRAY_TYPEDEFS_LARGE
+#undef EIGEN_MAKE_ARRAY_TYPEDEFS
+#undef EIGEN_MAKE_ARRAY_FIXED_TYPEDEFS
 
 #define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, SizeSuffix) \
-using Eigen::Matrix##SizeSuffix##TypeSuffix; \
-using Eigen::Vector##SizeSuffix##TypeSuffix; \
-using Eigen::RowVector##SizeSuffix##TypeSuffix;
-
-#define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(TypeSuffix) \
-EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 2) \
-EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 3) \
-EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 4) \
-EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, X) \
-
-#define EIGEN_USING_ARRAY_TYPEDEFS \
-EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(i) \
-EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(f) \
-EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(d) \
-EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(cf) \
-EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(cd)
-
-} // end namespace Eigen
-
-#endif // EIGEN_ARRAY_H
+  using Eigen::Matrix##SizeSuffix##TypeSuffix;                               \
+  using Eigen::Vector##SizeSuffix##TypeSuffix;                               \
+  using Eigen::RowVector##SizeSuffix##TypeSuffix;
+
+#define EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(TypeSuffix)       \
+  EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 2) \
+  EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 3) \
+  EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, 4) \
+  EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE_AND_SIZE(TypeSuffix, X)
+
+#define EIGEN_USING_ARRAY_TYPEDEFS        \
+  EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(i)  \
+  EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(f)  \
+  EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(d)  \
+  EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(cf) \
+  EIGEN_USING_ARRAY_TYPEDEFS_FOR_TYPE(cd)
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_ARRAY_H
diff --git a/inst/include/Eigen/src/Core/ArrayBase.h b/inst/include/Eigen/src/Core/ArrayBase.h
index 33ff5537..8465f54f 100644
--- a/inst/include/Eigen/src/Core/ArrayBase.h
+++ b/inst/include/Eigen/src/Core/ArrayBase.h
@@ -10,217 +10,204 @@
 #ifndef EIGEN_ARRAYBASE_H
 #define EIGEN_ARRAYBASE_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-template<typename ExpressionType> class MatrixWrapper;
+namespace Eigen {
 
-/** \class ArrayBase
-  * \ingroup Core_Module
-  *
-  * \brief Base class for all 1D and 2D array, and related expressions
-  *
-  * An array is similar to a dense vector or matrix. While matrices are mathematical
-  * objects with well defined linear algebra operators, an array is just a collection
-  * of scalar values arranged in a one or two dimensionnal fashion. As the main consequence,
-  * all operations applied to an array are performed coefficient wise. Furthermore,
-  * arrays support scalar math functions of the c++ standard library (e.g., std::sin(x)), and convenient
-  * constructors allowing to easily write generic code working for both scalar values
-  * and arrays.
-  *
-  * This class is the base that is inherited by all array expression types.
-  *
-  * \tparam Derived is the derived type, e.g., an array or an expression type.
-  *
-  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_ARRAYBASE_PLUGIN.
-  *
-  * \sa class MatrixBase, \ref TopicClassHierarchy
-  */
-template<typename Derived> class ArrayBase
-  : public DenseBase<Derived>
-{
-  public:
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** The base class for a given storage type. */
-    typedef ArrayBase StorageBaseType;
-
-    typedef ArrayBase Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl;
-
-    typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
-    typedef typename internal::traits<Derived>::Scalar Scalar;
-    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-
-    typedef DenseBase<Derived> Base;
-    using Base::operator*;
-    using Base::RowsAtCompileTime;
-    using Base::ColsAtCompileTime;
-    using Base::SizeAtCompileTime;
-    using Base::MaxRowsAtCompileTime;
-    using Base::MaxColsAtCompileTime;
-    using Base::MaxSizeAtCompileTime;
-    using Base::IsVectorAtCompileTime;
-    using Base::Flags;
-    using Base::CoeffReadCost;
-
-    using Base::derived;
-    using Base::const_cast_derived;
-    using Base::rows;
-    using Base::cols;
-    using Base::size;
-    using Base::coeff;
-    using Base::coeffRef;
-    using Base::lazyAssign;
-    using Base::operator=;
-    using Base::operator+=;
-    using Base::operator-=;
-    using Base::operator*=;
-    using Base::operator/=;
-
-    typedef typename Base::CoeffReturnType CoeffReturnType;
-
-#endif // not EIGEN_PARSED_BY_DOXYGEN
+template <typename ExpressionType>
+class MatrixWrapper;
 
+/** \class ArrayBase
+ * \ingroup Core_Module
+ *
+ * \brief Base class for all 1D and 2D array, and related expressions
+ *
+ * An array is similar to a dense vector or matrix. While matrices are mathematical
+ * objects with well defined linear algebra operators, an array is just a collection
+ * of scalar values arranged in a one or two dimensional fashion. As the main consequence,
+ * all operations applied to an array are performed coefficient wise. Furthermore,
+ * arrays support scalar math functions of the c++ standard library (e.g., std::sin(x)), and convenient
+ * constructors allowing to easily write generic code working for both scalar values
+ * and arrays.
+ *
+ * This class is the base that is inherited by all array expression types.
+ *
+ * \tparam Derived is the derived type, e.g., an array or an expression type.
+ *
+ * This class can be extended with the help of the plugin mechanism described on the page
+ * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_ARRAYBASE_PLUGIN.
+ *
+ * \sa class MatrixBase, \ref TopicClassHierarchy
+ */
+template <typename Derived>
+class ArrayBase : public DenseBase<Derived> {
+ public:
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** \internal the plain matrix type corresponding to this expression. Note that is not necessarily
-      * exactly the return type of eval(): in the case of plain matrices, the return type of eval() is a const
-      * reference to a matrix, not a matrix! It is however guaranteed that the return type of eval() is either
-      * PlainObject or const PlainObject&.
-      */
-    typedef Array<typename internal::traits<Derived>::Scalar,
-                internal::traits<Derived>::RowsAtCompileTime,
-                internal::traits<Derived>::ColsAtCompileTime,
-                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
-                internal::traits<Derived>::MaxRowsAtCompileTime,
-                internal::traits<Derived>::MaxColsAtCompileTime
-          > PlainObject;
-
-
-    /** \internal Represents a matrix with all coefficients equal to one another*/
-    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
-#endif // not EIGEN_PARSED_BY_DOXYGEN
+  /** The base class for a given storage type. */
+  typedef ArrayBase StorageBaseType;
+
+  typedef ArrayBase Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl;
+
+  typedef typename internal::traits<Derived>::StorageKind StorageKind;
+  typedef typename internal::traits<Derived>::Scalar Scalar;
+  typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  typedef DenseBase<Derived> Base;
+  using Base::ColsAtCompileTime;
+  using Base::Flags;
+  using Base::IsVectorAtCompileTime;
+  using Base::MaxColsAtCompileTime;
+  using Base::MaxRowsAtCompileTime;
+  using Base::MaxSizeAtCompileTime;
+  using Base::RowsAtCompileTime;
+  using Base::SizeAtCompileTime;
+
+  using Base::coeff;
+  using Base::coeffRef;
+  using Base::cols;
+  using Base::const_cast_derived;
+  using Base::derived;
+  using Base::lazyAssign;
+  using Base::rows;
+  using Base::size;
+  using Base::operator-;
+  using Base::operator=;
+  using Base::operator+=;
+  using Base::operator-=;
+  using Base::operator*=;
+  using Base::operator/=;
+
+  typedef typename Base::CoeffReturnType CoeffReturnType;
+
+  typedef typename Base::PlainObject PlainObject;
+
+  /** \internal Represents a matrix with all coefficients equal to one another*/
+  typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> ConstantReturnType;
+#endif  // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::ArrayBase
-#   include "../plugins/CommonCwiseUnaryOps.h"
-#   include "../plugins/MatrixCwiseUnaryOps.h"
-#   include "../plugins/ArrayCwiseUnaryOps.h"
-#   include "../plugins/CommonCwiseBinaryOps.h"
-#   include "../plugins/MatrixCwiseBinaryOps.h"
-#   include "../plugins/ArrayCwiseBinaryOps.h"
-#   ifdef EIGEN_ARRAYBASE_PLUGIN
-#     include EIGEN_ARRAYBASE_PLUGIN
-#   endif
+#define EIGEN_DOC_UNARY_ADDONS(X, Y)
+#include "../plugins/MatrixCwiseUnaryOps.inc"
+#include "../plugins/ArrayCwiseUnaryOps.inc"
+#include "../plugins/CommonCwiseBinaryOps.inc"
+#include "../plugins/MatrixCwiseBinaryOps.inc"
+#include "../plugins/ArrayCwiseBinaryOps.inc"
+#ifdef EIGEN_ARRAYBASE_PLUGIN
+#include EIGEN_ARRAYBASE_PLUGIN
+#endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
-
-    /** Special case of the template operator=, in order to prevent the compiler
-      * from generating a default operator= (issue hit with g++ 4.1)
-      */
-    Derived& operator=(const ArrayBase& other)
-    {
-      return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
-    }
-
-    Derived& operator+=(const Scalar& scalar)
-    { return *this = derived() + scalar; }
-    Derived& operator-=(const Scalar& scalar)
-    { return *this = derived() - scalar; }
-
-    template<typename OtherDerived>
-    Derived& operator+=(const ArrayBase<OtherDerived>& other);
-    template<typename OtherDerived>
-    Derived& operator-=(const ArrayBase<OtherDerived>& other);
-
-    template<typename OtherDerived>
-    Derived& operator*=(const ArrayBase<OtherDerived>& other);
-
-    template<typename OtherDerived>
-    Derived& operator/=(const ArrayBase<OtherDerived>& other);
-
-  public:
-    ArrayBase<Derived>& array() { return *this; }
-    const ArrayBase<Derived>& array() const { return *this; }
-
-    /** \returns an \link Eigen::MatrixBase Matrix \endlink expression of this array
-      * \sa MatrixBase::array() */
-    MatrixWrapper<Derived> matrix() { return derived(); }
-    const MatrixWrapper<const Derived> matrix() const { return derived(); }
-
-//     template<typename Dest>
-//     inline void evalTo(Dest& dst) const { dst = matrix(); }
-
-  protected:
-    ArrayBase() : Base() {}
-
-  private:
-    explicit ArrayBase(Index);
-    ArrayBase(Index,Index);
-    template<typename OtherDerived> explicit ArrayBase(const ArrayBase<OtherDerived>&);
-  protected:
-    // mixing arrays and matrices is not legal
-    template<typename OtherDerived> Derived& operator+=(const MatrixBase<OtherDerived>& )
-    {EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar))==-1,YOU_CANNOT_MIX_ARRAYS_AND_MATRICES); return *this;}
-    // mixing arrays and matrices is not legal
-    template<typename OtherDerived> Derived& operator-=(const MatrixBase<OtherDerived>& )
-    {EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar))==-1,YOU_CANNOT_MIX_ARRAYS_AND_MATRICES); return *this;}
+#undef EIGEN_DOC_UNARY_ADDONS
+
+  /** Special case of the template operator=, in order to prevent the compiler
+   * from generating a default operator= (issue hit with g++ 4.1)
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const ArrayBase& other) {
+    internal::call_assignment(derived(), other.derived());
+    return derived();
+  }
+
+  /** Set all the entries to \a value.
+   * \sa DenseBase::setConstant(), DenseBase::fill() */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Scalar& value) {
+    Base::setConstant(value);
+    return derived();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const Scalar& other) {
+    internal::call_assignment(this->derived(), PlainObject::Constant(rows(), cols(), other),
+                              internal::add_assign_op<Scalar, Scalar>());
+    return derived();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const Scalar& other) {
+    internal::call_assignment(this->derived(), PlainObject::Constant(rows(), cols(), other),
+                              internal::sub_assign_op<Scalar, Scalar>());
+    return derived();
+  }
+
+  /** replaces \c *this by \c *this + \a other.
+   *
+   * \returns a reference to \c *this
+   */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const ArrayBase<OtherDerived>& other) {
+    call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar, typename OtherDerived::Scalar>());
+    return derived();
+  }
+
+  /** replaces \c *this by \c *this - \a other.
+   *
+   * \returns a reference to \c *this
+   */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const ArrayBase<OtherDerived>& other) {
+    call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar, typename OtherDerived::Scalar>());
+    return derived();
+  }
+
+  /** replaces \c *this by \c *this * \a other coefficient wise.
+   *
+   * \returns a reference to \c *this
+   */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator*=(const ArrayBase<OtherDerived>& other) {
+    call_assignment(derived(), other.derived(), internal::mul_assign_op<Scalar, typename OtherDerived::Scalar>());
+    return derived();
+  }
+
+  /** replaces \c *this by \c *this / \a other coefficient wise.
+   *
+   * \returns a reference to \c *this
+   */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator/=(const ArrayBase<OtherDerived>& other) {
+    call_assignment(derived(), other.derived(), internal::div_assign_op<Scalar, typename OtherDerived::Scalar>());
+    return derived();
+  }
+
+ public:
+  EIGEN_DEVICE_FUNC ArrayBase<Derived>& array() { return *this; }
+  EIGEN_DEVICE_FUNC const ArrayBase<Derived>& array() const { return *this; }
+
+  /** \returns an \link Eigen::MatrixBase Matrix \endlink expression of this array
+   * \sa MatrixBase::array() */
+  EIGEN_DEVICE_FUNC MatrixWrapper<Derived> matrix() { return MatrixWrapper<Derived>(derived()); }
+  EIGEN_DEVICE_FUNC const MatrixWrapper<const Derived> matrix() const {
+    return MatrixWrapper<const Derived>(derived());
+  }
+
+  //     template<typename Dest>
+  //     inline void evalTo(Dest& dst) const { dst = matrix(); }
+
+ protected:
+  EIGEN_DEFAULT_COPY_CONSTRUCTOR(ArrayBase)
+  EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(ArrayBase)
+
+ private:
+  explicit ArrayBase(Index);
+  ArrayBase(Index, Index);
+  template <typename OtherDerived>
+  explicit ArrayBase(const ArrayBase<OtherDerived>&);
+
+ protected:
+  // mixing arrays and matrices is not legal
+  template <typename OtherDerived>
+  Derived& operator+=(const MatrixBase<OtherDerived>&) {
+    EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar)) == -1,
+                        YOU_CANNOT_MIX_ARRAYS_AND_MATRICES);
+    return *this;
+  }
+  // mixing arrays and matrices is not legal
+  template <typename OtherDerived>
+  Derived& operator-=(const MatrixBase<OtherDerived>&) {
+    EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar)) == -1,
+                        YOU_CANNOT_MIX_ARRAYS_AND_MATRICES);
+    return *this;
+  }
 };
 
-/** replaces \c *this by \c *this - \a other.
-  *
-  * \returns a reference to \c *this
-  */
-template<typename Derived>
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
-ArrayBase<Derived>::operator-=(const ArrayBase<OtherDerived> &other)
-{
-  SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
-  return derived();
-}
-
-/** replaces \c *this by \c *this + \a other.
-  *
-  * \returns a reference to \c *this
-  */
-template<typename Derived>
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
-ArrayBase<Derived>::operator+=(const ArrayBase<OtherDerived>& other)
-{
-  SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
-  return derived();
-}
-
-/** replaces \c *this by \c *this * \a other coefficient wise.
-  *
-  * \returns a reference to \c *this
-  */
-template<typename Derived>
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
-ArrayBase<Derived>::operator*=(const ArrayBase<OtherDerived>& other)
-{
-  SelfCwiseBinaryOp<internal::scalar_product_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
-  return derived();
-}
-
-/** replaces \c *this by \c *this / \a other coefficient wise.
-  *
-  * \returns a reference to \c *this
-  */
-template<typename Derived>
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
-ArrayBase<Derived>::operator/=(const ArrayBase<OtherDerived>& other)
-{
-  SelfCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
-  return derived();
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_ARRAYBASE_H
+}  // end namespace Eigen
+
+#endif  // EIGEN_ARRAYBASE_H
diff --git a/inst/include/Eigen/src/Core/ArrayWrapper.h b/inst/include/Eigen/src/Core/ArrayWrapper.h
index b4641e2a..c9a194e9 100644
--- a/inst/include/Eigen/src/Core/ArrayWrapper.h
+++ b/inst/include/Eigen/src/Core/ArrayWrapper.h
@@ -10,255 +10,156 @@
 #ifndef EIGEN_ARRAYWRAPPER_H
 #define EIGEN_ARRAYWRAPPER_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \class ArrayWrapper
-  * \ingroup Core_Module
-  *
-  * \brief Expression of a mathematical vector or matrix as an array object
-  *
-  * This class is the return type of MatrixBase::array(), and most of the time
-  * this is the only way it is use.
-  *
-  * \sa MatrixBase::array(), class MatrixWrapper
-  */
+ * \ingroup Core_Module
+ *
+ * \brief Expression of a mathematical vector or matrix as an array object
+ *
+ * This class is the return type of MatrixBase::array(), and most of the time
+ * this is the only way it is use.
+ *
+ * \sa MatrixBase::array(), class MatrixWrapper
+ */
 
 namespace internal {
-template<typename ExpressionType>
-struct traits<ArrayWrapper<ExpressionType> >
-  : public traits<typename remove_all<typename ExpressionType::Nested>::type >
-{
+template <typename ExpressionType>
+struct traits<ArrayWrapper<ExpressionType> > : public traits<remove_all_t<typename ExpressionType::Nested> > {
   typedef ArrayXpr XprKind;
   // Let's remove NestByRefBit
   enum {
-    Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
-    Flags = Flags0 & ~NestByRefBit
+    Flags0 = traits<remove_all_t<typename ExpressionType::Nested> >::Flags,
+    LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
+    Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
   };
 };
-}
-
-template<typename ExpressionType>
-class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> >
-{
-  public:
-    typedef ArrayBase<ArrayWrapper> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(ArrayWrapper)
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ArrayWrapper)
-
-    typedef typename internal::conditional<
-                       internal::is_lvalue<ExpressionType>::value,
-                       Scalar,
-                       const Scalar
-                     >::type ScalarWithConstIfNotLvalue;
-
-    typedef typename internal::nested<ExpressionType>::type NestedExpressionType;
-
-    inline ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
-
-    inline Index rows() const { return m_expression.rows(); }
-    inline Index cols() const { return m_expression.cols(); }
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    inline Index innerStride() const { return m_expression.innerStride(); }
-
-    inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
-    inline const Scalar* data() const { return m_expression.data(); }
-
-    inline CoeffReturnType coeff(Index rowId, Index colId) const
-    {
-      return m_expression.coeff(rowId, colId);
-    }
-
-    inline Scalar& coeffRef(Index rowId, Index colId)
-    {
-      return m_expression.const_cast_derived().coeffRef(rowId, colId);
-    }
-
-    inline const Scalar& coeffRef(Index rowId, Index colId) const
-    {
-      return m_expression.const_cast_derived().coeffRef(rowId, colId);
-    }
-
-    inline CoeffReturnType coeff(Index index) const
-    {
-      return m_expression.coeff(index);
-    }
-
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    inline const Scalar& coeffRef(Index index) const
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index rowId, Index colId) const
-    {
-      return m_expression.template packet<LoadMode>(rowId, colId);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(rowId, colId, val);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return m_expression.template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& val)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(index, val);
-    }
-
-    template<typename Dest>
-    inline void evalTo(Dest& dst) const { dst = m_expression; }
-
-    const typename internal::remove_all<NestedExpressionType>::type& 
-    nestedExpression() const 
-    {
-      return m_expression;
-    }
-
-    /** Forwards the resizing request to the nested expression
-      * \sa DenseBase::resize(Index)  */
-    void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
-    /** Forwards the resizing request to the nested expression
-      * \sa DenseBase::resize(Index,Index)*/
-    void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }
-
-  protected:
-    NestedExpressionType m_expression;
+}  // namespace internal
+
+template <typename ExpressionType>
+class ArrayWrapper : public ArrayBase<ArrayWrapper<ExpressionType> > {
+ public:
+  typedef ArrayBase<ArrayWrapper> Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(ArrayWrapper)
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ArrayWrapper)
+  typedef internal::remove_all_t<ExpressionType> NestedExpression;
+
+  typedef std::conditional_t<internal::is_lvalue<ExpressionType>::value, Scalar, const Scalar>
+      ScalarWithConstIfNotLvalue;
+
+  typedef typename internal::ref_selector<ExpressionType>::non_const_type NestedExpressionType;
+
+  using Base::coeffRef;
+
+  EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE ArrayWrapper(ExpressionType& matrix) : m_expression(matrix) {}
+
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_expression.outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_expression.innerStride(); }
+
+  EIGEN_DEVICE_FUNC constexpr ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
+  EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_expression.data(); }
+
+  EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index rowId, Index colId) const {
+    return m_expression.coeffRef(rowId, colId);
+  }
+
+  EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const { return m_expression.coeffRef(index); }
+
+  template <typename Dest>
+  EIGEN_DEVICE_FUNC inline void evalTo(Dest& dst) const {
+    dst = m_expression;
+  }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<NestedExpressionType>& nestedExpression() const {
+    return m_expression;
+  }
+
+  /** Forwards the resizing request to the nested expression
+   * \sa DenseBase::resize(Index)  */
+  EIGEN_DEVICE_FUNC void resize(Index newSize) { m_expression.resize(newSize); }
+  /** Forwards the resizing request to the nested expression
+   * \sa DenseBase::resize(Index,Index)*/
+  EIGEN_DEVICE_FUNC void resize(Index rows, Index cols) { m_expression.resize(rows, cols); }
+
+ protected:
+  NestedExpressionType m_expression;
 };
 
 /** \class MatrixWrapper
-  * \ingroup Core_Module
-  *
-  * \brief Expression of an array as a mathematical vector or matrix
-  *
-  * This class is the return type of ArrayBase::matrix(), and most of the time
-  * this is the only way it is use.
-  *
-  * \sa MatrixBase::matrix(), class ArrayWrapper
-  */
+ * \ingroup Core_Module
+ *
+ * \brief Expression of an array as a mathematical vector or matrix
+ *
+ * This class is the return type of ArrayBase::matrix(), and most of the time
+ * this is the only way it is use.
+ *
+ * \sa MatrixBase::matrix(), class ArrayWrapper
+ */
 
 namespace internal {
-template<typename ExpressionType>
-struct traits<MatrixWrapper<ExpressionType> >
- : public traits<typename remove_all<typename ExpressionType::Nested>::type >
-{
+template <typename ExpressionType>
+struct traits<MatrixWrapper<ExpressionType> > : public traits<remove_all_t<typename ExpressionType::Nested> > {
   typedef MatrixXpr XprKind;
   // Let's remove NestByRefBit
   enum {
-    Flags0 = traits<typename remove_all<typename ExpressionType::Nested>::type >::Flags,
-    Flags = Flags0 & ~NestByRefBit
+    Flags0 = traits<remove_all_t<typename ExpressionType::Nested> >::Flags,
+    LvalueBitFlag = is_lvalue<ExpressionType>::value ? LvalueBit : 0,
+    Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag
   };
 };
-}
-
-template<typename ExpressionType>
-class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> >
-{
-  public:
-    typedef MatrixBase<MatrixWrapper<ExpressionType> > Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(MatrixWrapper)
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(MatrixWrapper)
-
-    typedef typename internal::conditional<
-                       internal::is_lvalue<ExpressionType>::value,
-                       Scalar,
-                       const Scalar
-                     >::type ScalarWithConstIfNotLvalue;
-
-    typedef typename internal::nested<ExpressionType>::type NestedExpressionType;
-
-    inline MatrixWrapper(ExpressionType& a_matrix) : m_expression(a_matrix) {}
-
-    inline Index rows() const { return m_expression.rows(); }
-    inline Index cols() const { return m_expression.cols(); }
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    inline Index innerStride() const { return m_expression.innerStride(); }
-
-    inline ScalarWithConstIfNotLvalue* data() { return m_expression.const_cast_derived().data(); }
-    inline const Scalar* data() const { return m_expression.data(); }
-
-    inline CoeffReturnType coeff(Index rowId, Index colId) const
-    {
-      return m_expression.coeff(rowId, colId);
-    }
-
-    inline Scalar& coeffRef(Index rowId, Index colId)
-    {
-      return m_expression.const_cast_derived().coeffRef(rowId, colId);
-    }
-
-    inline const Scalar& coeffRef(Index rowId, Index colId) const
-    {
-      return m_expression.derived().coeffRef(rowId, colId);
-    }
-
-    inline CoeffReturnType coeff(Index index) const
-    {
-      return m_expression.coeff(index);
-    }
-
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    inline const Scalar& coeffRef(Index index) const
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index rowId, Index colId) const
-    {
-      return m_expression.template packet<LoadMode>(rowId, colId);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(rowId, colId, val);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return m_expression.template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& val)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(index, val);
-    }
-
-    const typename internal::remove_all<NestedExpressionType>::type& 
-    nestedExpression() const 
-    {
-      return m_expression;
-    }
-
-    /** Forwards the resizing request to the nested expression
-      * \sa DenseBase::resize(Index)  */
-    void resize(Index newSize) { m_expression.const_cast_derived().resize(newSize); }
-    /** Forwards the resizing request to the nested expression
-      * \sa DenseBase::resize(Index,Index)*/
-    void resize(Index nbRows, Index nbCols) { m_expression.const_cast_derived().resize(nbRows,nbCols); }
-
-  protected:
-    NestedExpressionType m_expression;
+}  // namespace internal
+
+template <typename ExpressionType>
+class MatrixWrapper : public MatrixBase<MatrixWrapper<ExpressionType> > {
+ public:
+  typedef MatrixBase<MatrixWrapper<ExpressionType> > Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(MatrixWrapper)
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(MatrixWrapper)
+  typedef internal::remove_all_t<ExpressionType> NestedExpression;
+
+  typedef std::conditional_t<internal::is_lvalue<ExpressionType>::value, Scalar, const Scalar>
+      ScalarWithConstIfNotLvalue;
+
+  typedef typename internal::ref_selector<ExpressionType>::non_const_type NestedExpressionType;
+
+  using Base::coeffRef;
+
+  EIGEN_DEVICE_FUNC explicit inline MatrixWrapper(ExpressionType& matrix) : m_expression(matrix) {}
+
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_expression.outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_expression.innerStride(); }
+
+  EIGEN_DEVICE_FUNC constexpr ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
+  EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_expression.data(); }
+
+  EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index rowId, Index colId) const {
+    return m_expression.derived().coeffRef(rowId, colId);
+  }
+
+  EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const { return m_expression.coeffRef(index); }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<NestedExpressionType>& nestedExpression() const {
+    return m_expression;
+  }
+
+  /** Forwards the resizing request to the nested expression
+   * \sa DenseBase::resize(Index)  */
+  EIGEN_DEVICE_FUNC void resize(Index newSize) { m_expression.resize(newSize); }
+  /** Forwards the resizing request to the nested expression
+   * \sa DenseBase::resize(Index,Index)*/
+  EIGEN_DEVICE_FUNC void resize(Index rows, Index cols) { m_expression.resize(rows, cols); }
+
+ protected:
+  NestedExpressionType m_expression;
 };
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_ARRAYWRAPPER_H
+#endif  // EIGEN_ARRAYWRAPPER_H
diff --git a/inst/include/Eigen/src/Core/Assign.h b/inst/include/Eigen/src/Core/Assign.h
index f4817317..4b30f7bb 100644
--- a/inst/include/Eigen/src/Core/Assign.h
+++ b/inst/include/Eigen/src/Core/Assign.h
@@ -12,579 +12,69 @@
 #ifndef EIGEN_ASSIGN_H
 #define EIGEN_ASSIGN_H
 
-namespace Eigen {
-
-namespace internal {
-
-/***************************************************************************
-* Part 1 : the logic deciding a strategy for traversal and unrolling       *
-***************************************************************************/
-
-template <typename Derived, typename OtherDerived>
-struct assign_traits
-{
-public:
-  enum {
-    DstIsAligned = Derived::Flags & AlignedBit,
-    DstHasDirectAccess = Derived::Flags & DirectAccessBit,
-    SrcIsAligned = OtherDerived::Flags & AlignedBit,
-    JointAlignment = bool(DstIsAligned) && bool(SrcIsAligned) ? Aligned : Unaligned
-  };
-
-private:
-  enum {
-    InnerSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::SizeAtCompileTime)
-              : int(Derived::Flags)&RowMajorBit ? int(Derived::ColsAtCompileTime)
-              : int(Derived::RowsAtCompileTime),
-    InnerMaxSize = int(Derived::IsVectorAtCompileTime) ? int(Derived::MaxSizeAtCompileTime)
-              : int(Derived::Flags)&RowMajorBit ? int(Derived::MaxColsAtCompileTime)
-              : int(Derived::MaxRowsAtCompileTime),
-    MaxSizeAtCompileTime = Derived::SizeAtCompileTime,
-    PacketSize = packet_traits<typename Derived::Scalar>::size
-  };
-
-  enum {
-    StorageOrdersAgree = (int(Derived::IsRowMajor) == int(OtherDerived::IsRowMajor)),
-    MightVectorize = StorageOrdersAgree
-                  && (int(Derived::Flags) & int(OtherDerived::Flags) & ActualPacketAccessBit),
-    MayInnerVectorize  = MightVectorize && int(InnerSize)!=Dynamic && int(InnerSize)%int(PacketSize)==0
-                       && int(DstIsAligned) && int(SrcIsAligned),
-    MayLinearize = StorageOrdersAgree && (int(Derived::Flags) & int(OtherDerived::Flags) & LinearAccessBit),
-    MayLinearVectorize = MightVectorize && MayLinearize && DstHasDirectAccess
-                       && (DstIsAligned || MaxSizeAtCompileTime == Dynamic),
-      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
-         so it's only good for large enough sizes. */
-    MaySliceVectorize  = MightVectorize && DstHasDirectAccess
-                       && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=3*PacketSize)
-      /* slice vectorization can be slow, so we only want it if the slices are big, which is
-         indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
-         in a fixed-size matrix */
-  };
-
-public:
-  enum {
-    Traversal = int(MayInnerVectorize)  ? int(InnerVectorizedTraversal)
-              : int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
-              : int(MaySliceVectorize)  ? int(SliceVectorizedTraversal)
-              : int(MayLinearize)       ? int(LinearTraversal)
-                                        : int(DefaultTraversal),
-    Vectorized = int(Traversal) == InnerVectorizedTraversal
-              || int(Traversal) == LinearVectorizedTraversal
-              || int(Traversal) == SliceVectorizedTraversal
-  };
-
-private:
-  enum {
-    UnrollingLimit      = EIGEN_UNROLLING_LIMIT * (Vectorized ? int(PacketSize) : 1),
-    MayUnrollCompletely = int(Derived::SizeAtCompileTime) != Dynamic
-                       && int(OtherDerived::CoeffReadCost) != Dynamic
-                       && int(Derived::SizeAtCompileTime) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit),
-    MayUnrollInner      = int(InnerSize) != Dynamic
-                       && int(OtherDerived::CoeffReadCost) != Dynamic
-                       && int(InnerSize) * int(OtherDerived::CoeffReadCost) <= int(UnrollingLimit)
-  };
-
-public:
-  enum {
-    Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal))
-                ? (
-                    int(MayUnrollCompletely) ? int(CompleteUnrolling)
-                  : int(MayUnrollInner)      ? int(InnerUnrolling)
-                                             : int(NoUnrolling)
-                  )
-              : int(Traversal) == int(LinearVectorizedTraversal)
-                ? ( bool(MayUnrollCompletely) && bool(DstIsAligned) ? int(CompleteUnrolling) : int(NoUnrolling) )
-              : int(Traversal) == int(LinearTraversal)
-                ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling) : int(NoUnrolling) )
-              : int(NoUnrolling)
-  };
-
-#ifdef EIGEN_DEBUG_ASSIGN
-  static void debug()
-  {
-    EIGEN_DEBUG_VAR(DstIsAligned)
-    EIGEN_DEBUG_VAR(SrcIsAligned)
-    EIGEN_DEBUG_VAR(JointAlignment)
-    EIGEN_DEBUG_VAR(InnerSize)
-    EIGEN_DEBUG_VAR(InnerMaxSize)
-    EIGEN_DEBUG_VAR(PacketSize)
-    EIGEN_DEBUG_VAR(StorageOrdersAgree)
-    EIGEN_DEBUG_VAR(MightVectorize)
-    EIGEN_DEBUG_VAR(MayLinearize)
-    EIGEN_DEBUG_VAR(MayInnerVectorize)
-    EIGEN_DEBUG_VAR(MayLinearVectorize)
-    EIGEN_DEBUG_VAR(MaySliceVectorize)
-    EIGEN_DEBUG_VAR(Traversal)
-    EIGEN_DEBUG_VAR(UnrollingLimit)
-    EIGEN_DEBUG_VAR(MayUnrollCompletely)
-    EIGEN_DEBUG_VAR(MayUnrollInner)
-    EIGEN_DEBUG_VAR(Unrolling)
-  }
-#endif
-};
-
-/***************************************************************************
-* Part 2 : meta-unrollers
-***************************************************************************/
-
-/************************
-*** Default traversal ***
-************************/
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_DefaultTraversal_CompleteUnrolling
-{
-  enum {
-    outer = Index / Derived1::InnerSizeAtCompileTime,
-    inner = Index % Derived1::InnerSizeAtCompileTime
-  };
-
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    dst.copyCoeffByOuterInner(outer, inner, src);
-    assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
-};
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_DefaultTraversal_InnerUnrolling
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src, typename Derived1::Index outer)
-  {
-    dst.copyCoeffByOuterInner(outer, Index, src);
-    assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src, outer);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &, typename Derived1::Index) {}
-};
-
-/***********************
-*** Linear traversal ***
-***********************/
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_LinearTraversal_CompleteUnrolling
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    dst.copyCoeff(Index, src);
-    assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, Index+1, Stop>::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
-};
-
-/**************************
-*** Inner vectorization ***
-**************************/
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_innervec_CompleteUnrolling
-{
-  enum {
-    outer = Index / Derived1::InnerSizeAtCompileTime,
-    inner = Index % Derived1::InnerSizeAtCompileTime,
-    JointAlignment = assign_traits<Derived1,Derived2>::JointAlignment
-  };
-
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    dst.template copyPacketByOuterInner<Derived2, Aligned, JointAlignment>(outer, inner, src);
-    assign_innervec_CompleteUnrolling<Derived1, Derived2,
-      Index+packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_innervec_CompleteUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &) {}
-};
-
-template<typename Derived1, typename Derived2, int Index, int Stop>
-struct assign_innervec_InnerUnrolling
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src, typename Derived1::Index outer)
-  {
-    dst.template copyPacketByOuterInner<Derived2, Aligned, Aligned>(outer, Index, src);
-    assign_innervec_InnerUnrolling<Derived1, Derived2,
-      Index+packet_traits<typename Derived1::Scalar>::size, Stop>::run(dst, src, outer);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Stop>
-struct assign_innervec_InnerUnrolling<Derived1, Derived2, Stop, Stop>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &, const Derived2 &, typename Derived1::Index) {}
-};
-
-/***************************************************************************
-* Part 3 : implementation of all cases
-***************************************************************************/
-
-template<typename Derived1, typename Derived2,
-         int Traversal = assign_traits<Derived1, Derived2>::Traversal,
-         int Unrolling = assign_traits<Derived1, Derived2>::Unrolling,
-         int Version = Specialized>
-struct assign_impl;
-
-/************************
-*** Default traversal ***
-************************/
-
-template<typename Derived1, typename Derived2, int Unrolling, int Version>
-struct assign_impl<Derived1, Derived2, InvalidTraversal, Unrolling, Version>
-{
-  static inline void run(Derived1 &, const Derived2 &) { }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, DefaultTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index innerSize = dst.innerSize();
-    const Index outerSize = dst.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer)
-      for(Index inner = 0; inner < innerSize; ++inner)
-        dst.copyCoeffByOuterInner(outer, inner, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, DefaultTraversal, CompleteUnrolling, Version>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
-      ::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, DefaultTraversal, InnerUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index outerSize = dst.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer)
-      assign_DefaultTraversal_InnerUnrolling<Derived1, Derived2, 0, Derived1::InnerSizeAtCompileTime>
-        ::run(dst, src, outer);
-  }
-};
-
-/***********************
-*** Linear traversal ***
-***********************/
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, LinearTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index size = dst.size();
-    for(Index i = 0; i < size; ++i)
-      dst.copyCoeff(i, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, LinearTraversal, CompleteUnrolling, Version>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    assign_LinearTraversal_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
-      ::run(dst, src);
-  }
-};
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-/**************************
-*** Inner vectorization ***
-**************************/
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index innerSize = dst.innerSize();
-    const Index outerSize = dst.outerSize();
-    const Index packetSize = packet_traits<typename Derived1::Scalar>::size;
-    for(Index outer = 0; outer < outerSize; ++outer)
-      for(Index inner = 0; inner < innerSize; inner+=packetSize)
-        dst.template copyPacketByOuterInner<Derived2, Aligned, Aligned>(outer, inner, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, CompleteUnrolling, Version>
-{
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, Derived1::SizeAtCompileTime>
-      ::run(dst, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, InnerVectorizedTraversal, InnerUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index outerSize = dst.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer)
-      assign_innervec_InnerUnrolling<Derived1, Derived2, 0, Derived1::InnerSizeAtCompileTime>
-        ::run(dst, src, outer);
-  }
-};
-
-/***************************
-*** Linear vectorization ***
-***************************/
-
-template <bool IsAligned = false>
-struct unaligned_assign_impl
-{
-  template <typename Derived, typename OtherDerived>
-  static EIGEN_STRONG_INLINE void run(const Derived&, OtherDerived&, typename Derived::Index, typename Derived::Index) {}
-};
-
-template <>
-struct unaligned_assign_impl<false>
-{
-  // MSVC must not inline this functions. If it does, it fails to optimize the
-  // packet access path.
-#ifdef _MSC_VER
-  template <typename Derived, typename OtherDerived>
-  static EIGEN_DONT_INLINE void run(const Derived& src, OtherDerived& dst, typename Derived::Index start, typename Derived::Index end)
-#else
-  template <typename Derived, typename OtherDerived>
-  static EIGEN_STRONG_INLINE void run(const Derived& src, OtherDerived& dst, typename Derived::Index start, typename Derived::Index end)
-#endif
-  {
-    for (typename Derived::Index index = start; index < end; ++index)
-      dst.copyCoeff(index, src);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, LinearVectorizedTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    const Index size = dst.size();
-    typedef packet_traits<typename Derived1::Scalar> PacketTraits;
-    enum {
-      packetSize = PacketTraits::size,
-      dstAlignment = PacketTraits::AlignedOnScalar ? Aligned : int(assign_traits<Derived1,Derived2>::DstIsAligned) ,
-      srcAlignment = assign_traits<Derived1,Derived2>::JointAlignment
-    };
-    const Index alignedStart = assign_traits<Derived1,Derived2>::DstIsAligned ? 0
-                             : internal::first_aligned(&dst.coeffRef(0), size);
-    const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;
-
-    unaligned_assign_impl<assign_traits<Derived1,Derived2>::DstIsAligned!=0>::run(src,dst,0,alignedStart);
-
-    for(Index index = alignedStart; index < alignedEnd; index += packetSize)
-    {
-      dst.template copyPacket<Derived2, dstAlignment, srcAlignment>(index, src);
-    }
-
-    unaligned_assign_impl<>::run(src,dst,alignedEnd,size);
-  }
-};
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, LinearVectorizedTraversal, CompleteUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static EIGEN_STRONG_INLINE void run(Derived1 &dst, const Derived2 &src)
-  {
-    enum { size = Derived1::SizeAtCompileTime,
-           packetSize = packet_traits<typename Derived1::Scalar>::size,
-           alignedSize = (size/packetSize)*packetSize };
-
-    assign_innervec_CompleteUnrolling<Derived1, Derived2, 0, alignedSize>::run(dst, src);
-    assign_DefaultTraversal_CompleteUnrolling<Derived1, Derived2, alignedSize, size>::run(dst, src);
-  }
-};
-
-/**************************
-*** Slice vectorization ***
-***************************/
-
-template<typename Derived1, typename Derived2, int Version>
-struct assign_impl<Derived1, Derived2, SliceVectorizedTraversal, NoUnrolling, Version>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    typedef typename Derived1::Scalar Scalar;
-    typedef packet_traits<Scalar> PacketTraits;
-    enum {
-      packetSize = PacketTraits::size,
-      alignable = PacketTraits::AlignedOnScalar,
-      dstIsAligned = assign_traits<Derived1,Derived2>::DstIsAligned,
-      dstAlignment = alignable ? Aligned : int(dstIsAligned),
-      srcAlignment = assign_traits<Derived1,Derived2>::JointAlignment
-    };
-    const Scalar *dst_ptr = &dst.coeffRef(0,0);
-    if((!bool(dstIsAligned)) && (size_t(dst_ptr) % sizeof(Scalar))>0)
-    {
-      // the pointer is not aligend-on scalar, so alignment is not possible
-      return assign_impl<Derived1,Derived2,DefaultTraversal,NoUnrolling>::run(dst, src);
-    }
-    const Index packetAlignedMask = packetSize - 1;
-    const Index innerSize = dst.innerSize();
-    const Index outerSize = dst.outerSize();
-    const Index alignedStep = alignable ? (packetSize - dst.outerStride() % packetSize) & packetAlignedMask : 0;
-    Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned(dst_ptr, innerSize);
-
-    for(Index outer = 0; outer < outerSize; ++outer)
-    {
-      const Index alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
-      // do the non-vectorizable part of the assignment
-      for(Index inner = 0; inner<alignedStart ; ++inner)
-        dst.copyCoeffByOuterInner(outer, inner, src);
-
-      // do the vectorizable part of the assignment
-      for(Index inner = alignedStart; inner<alignedEnd; inner+=packetSize)
-        dst.template copyPacketByOuterInner<Derived2, dstAlignment, Unaligned>(outer, inner, src);
-
-      // do the non-vectorizable part of the assignment
-      for(Index inner = alignedEnd; inner<innerSize ; ++inner)
-        dst.copyCoeffByOuterInner(outer, inner, src);
-
-      alignedStart = std::min<Index>((alignedStart+alignedStep)%packetSize, innerSize);
-    }
-  }
-};
-
-} // end namespace internal
-
-/***************************************************************************
-* Part 4 : implementation of DenseBase methods
-***************************************************************************/
+namespace Eigen {
 
-template<typename Derived>
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>
-  ::lazyAssign(const DenseBase<OtherDerived>& other)
-{
-  enum{
-    SameType = internal::is_same<typename Derived::Scalar,typename OtherDerived::Scalar>::value
-  };
+template <typename Derived>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::lazyAssign(const DenseBase<OtherDerived>& other) {
+  enum { SameType = internal::is_same<typename Derived::Scalar, typename OtherDerived::Scalar>::value };
 
   EIGEN_STATIC_ASSERT_LVALUE(Derived)
-  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived,OtherDerived)
-  EIGEN_STATIC_ASSERT(SameType,YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived, OtherDerived)
+  EIGEN_STATIC_ASSERT(
+      SameType,
+      YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
 
-#ifdef EIGEN_DEBUG_ASSIGN
-  internal::assign_traits<Derived, OtherDerived>::debug();
-#endif
   eigen_assert(rows() == other.rows() && cols() == other.cols());
-  internal::assign_impl<Derived, OtherDerived, int(SameType) ? int(internal::assign_traits<Derived, OtherDerived>::Traversal)
-                                                       : int(InvalidTraversal)>::run(derived(),other.derived());
-#ifndef EIGEN_NO_DEBUG
-  checkTransposeAliasing(other.derived());
-#endif
+  internal::call_assignment_no_alias(derived(), other.derived());
+
   return derived();
 }
 
-namespace internal {
-
-template<typename Derived, typename OtherDerived,
-         bool EvalBeforeAssigning = (int(internal::traits<OtherDerived>::Flags) & EvalBeforeAssigningBit) != 0,
-         bool NeedToTranspose = ((int(Derived::RowsAtCompileTime) == 1 && int(OtherDerived::ColsAtCompileTime) == 1)
-                              |   // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
-                                  // revert to || as soon as not needed anymore.
-                                  (int(Derived::ColsAtCompileTime) == 1 && int(OtherDerived::RowsAtCompileTime) == 1))
-                              && int(Derived::SizeAtCompileTime) != 1>
-struct assign_selector;
-
-template<typename Derived, typename OtherDerived>
-struct assign_selector<Derived,OtherDerived,false,false> {
-  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.derived()); }
-  template<typename ActualDerived, typename ActualOtherDerived>
-  static EIGEN_STRONG_INLINE Derived& evalTo(ActualDerived& dst, const ActualOtherDerived& other) { other.evalTo(dst); return dst; }
-};
-template<typename Derived, typename OtherDerived>
-struct assign_selector<Derived,OtherDerived,true,false> {
-  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.eval()); }
-};
-template<typename Derived, typename OtherDerived>
-struct assign_selector<Derived,OtherDerived,false,true> {
-  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose()); }
-  template<typename ActualDerived, typename ActualOtherDerived>
-  static EIGEN_STRONG_INLINE Derived& evalTo(ActualDerived& dst, const ActualOtherDerived& other) { Transpose<ActualDerived> dstTrans(dst); other.evalTo(dstTrans); return dst; }
-};
-template<typename Derived, typename OtherDerived>
-struct assign_selector<Derived,OtherDerived,true,true> {
-  static EIGEN_STRONG_INLINE Derived& run(Derived& dst, const OtherDerived& other) { return dst.lazyAssign(other.transpose().eval()); }
-};
-
-} // end namespace internal
-
-template<typename Derived>
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
-{
-  return internal::assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
+template <typename Derived>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase<OtherDerived>& other) {
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
-template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase& other)
-{
-  return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator=(const DenseBase& other) {
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
-template<typename Derived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const MatrixBase& other)
-{
-  return internal::assign_selector<Derived,Derived>::run(derived(), other.derived());
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const MatrixBase& other) {
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
-template<typename Derived>
+template <typename Derived>
 template <typename OtherDerived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const DenseBase<OtherDerived>& other)
-{
-  return internal::assign_selector<Derived,OtherDerived>::run(derived(), other.derived());
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const DenseBase<OtherDerived>& other) {
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
-template<typename Derived>
+template <typename Derived>
 template <typename OtherDerived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const EigenBase<OtherDerived>& other)
-{
-  return internal::assign_selector<Derived,OtherDerived,false>::evalTo(derived(), other.derived());
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const EigenBase<OtherDerived>& other) {
+  internal::call_assignment(derived(), other.derived());
+  return derived();
 }
 
-template<typename Derived>
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
-{
-  return internal::assign_selector<Derived,OtherDerived,false>::evalTo(derived(), other.derived());
+template <typename Derived>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator=(
+    const ReturnByValue<OtherDerived>& other) {
+  other.derived().evalTo(derived());
+  return derived();
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_ASSIGN_H
+#endif  // EIGEN_ASSIGN_H
diff --git a/inst/include/Eigen/src/Core/AssignEvaluator.h b/inst/include/Eigen/src/Core/AssignEvaluator.h
new file mode 100644
index 00000000..36f0a9d7
--- /dev/null
+++ b/inst/include/Eigen/src/Core/AssignEvaluator.h
@@ -0,0 +1,1057 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ASSIGN_EVALUATOR_H
+#define EIGEN_ASSIGN_EVALUATOR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// This implementation is based on Assign.h
+
+namespace internal {
+
+/***************************************************************************
+ * Part 1 : the logic deciding a strategy for traversal and unrolling       *
+ ***************************************************************************/
+
+// copy_using_evaluator_traits is based on assign_traits
+
+template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc, int MaxPacketSize = Dynamic>
+struct copy_using_evaluator_traits {
+  using Src = typename SrcEvaluator::XprType;
+  using Dst = typename DstEvaluator::XprType;
+  using DstScalar = typename Dst::Scalar;
+
+  static constexpr int DstFlags = DstEvaluator::Flags;
+  static constexpr int SrcFlags = SrcEvaluator::Flags;
+
+ public:
+  static constexpr int DstAlignment = DstEvaluator::Alignment;
+  static constexpr int SrcAlignment = SrcEvaluator::Alignment;
+  static constexpr int JointAlignment = plain_enum_min(DstAlignment, SrcAlignment);
+  static constexpr bool DstHasDirectAccess = bool(DstFlags & DirectAccessBit);
+  static constexpr bool SrcIsRowMajor = bool(SrcFlags & RowMajorBit);
+  static constexpr bool DstIsRowMajor = bool(DstFlags & RowMajorBit);
+  static constexpr bool IsVectorAtCompileTime = Dst::IsVectorAtCompileTime;
+  static constexpr int RowsAtCompileTime = size_prefer_fixed(Src::RowsAtCompileTime, Dst::RowsAtCompileTime);
+  static constexpr int ColsAtCompileTime = size_prefer_fixed(Src::ColsAtCompileTime, Dst::ColsAtCompileTime);
+  static constexpr int SizeAtCompileTime = size_at_compile_time(RowsAtCompileTime, ColsAtCompileTime);
+  static constexpr int MaxRowsAtCompileTime =
+      min_size_prefer_fixed(Src::MaxRowsAtCompileTime, Dst::MaxRowsAtCompileTime);
+  static constexpr int MaxColsAtCompileTime =
+      min_size_prefer_fixed(Src::MaxColsAtCompileTime, Dst::MaxColsAtCompileTime);
+  static constexpr int MaxSizeAtCompileTime =
+      min_size_prefer_fixed(Src::MaxSizeAtCompileTime, Dst::MaxSizeAtCompileTime);
+  static constexpr int InnerSizeAtCompileTime = IsVectorAtCompileTime ? SizeAtCompileTime
+                                                : DstIsRowMajor       ? ColsAtCompileTime
+                                                                      : RowsAtCompileTime;
+  static constexpr int MaxInnerSizeAtCompileTime = IsVectorAtCompileTime ? MaxSizeAtCompileTime
+                                                   : DstIsRowMajor       ? MaxColsAtCompileTime
+                                                                         : MaxRowsAtCompileTime;
+  static constexpr int RestrictedInnerSize = min_size_prefer_fixed(MaxInnerSizeAtCompileTime, MaxPacketSize);
+  static constexpr int RestrictedLinearSize = min_size_prefer_fixed(MaxSizeAtCompileTime, MaxPacketSize);
+  static constexpr int OuterStride = outer_stride_at_compile_time<Dst>::ret;
+
+  // TODO distinguish between linear traversal and inner-traversals
+  using LinearPacketType = typename find_best_packet<DstScalar, RestrictedLinearSize>::type;
+  using InnerPacketType = typename find_best_packet<DstScalar, RestrictedInnerSize>::type;
+
+  static constexpr int LinearPacketSize = unpacket_traits<LinearPacketType>::size;
+  static constexpr int InnerPacketSize = unpacket_traits<InnerPacketType>::size;
+
+ public:
+  static constexpr int LinearRequiredAlignment = unpacket_traits<LinearPacketType>::alignment;
+  static constexpr int InnerRequiredAlignment = unpacket_traits<InnerPacketType>::alignment;
+
+ private:
+  static constexpr bool StorageOrdersAgree = DstIsRowMajor == SrcIsRowMajor;
+  static constexpr bool MightVectorize = StorageOrdersAgree && bool(DstFlags & SrcFlags & ActualPacketAccessBit) &&
+                                         bool(functor_traits<AssignFunc>::PacketAccess);
+  static constexpr bool MayInnerVectorize = MightVectorize && (InnerSizeAtCompileTime != Dynamic) &&
+                                            (InnerSizeAtCompileTime % InnerPacketSize == 0) &&
+                                            (OuterStride != Dynamic) && (OuterStride % InnerPacketSize == 0) &&
+                                            (EIGEN_UNALIGNED_VECTORIZE || JointAlignment >= InnerRequiredAlignment);
+  static constexpr bool MayLinearize = StorageOrdersAgree && (DstFlags & SrcFlags & LinearAccessBit);
+  static constexpr bool MayLinearVectorize =
+      MightVectorize && MayLinearize && DstHasDirectAccess &&
+      (EIGEN_UNALIGNED_VECTORIZE || (DstAlignment >= LinearRequiredAlignment) || MaxSizeAtCompileTime == Dynamic) &&
+      (MaxSizeAtCompileTime == Dynamic || MaxSizeAtCompileTime >= LinearPacketSize);
+  /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
+     so it's only good for large enough sizes. */
+  static constexpr int InnerSizeThreshold = (EIGEN_UNALIGNED_VECTORIZE ? 1 : 3) * InnerPacketSize;
+  static constexpr bool MaySliceVectorize =
+      MightVectorize && DstHasDirectAccess &&
+      (MaxInnerSizeAtCompileTime == Dynamic || MaxInnerSizeAtCompileTime >= InnerSizeThreshold);
+  /* slice vectorization can be slow, so we only want it if the slices are big, which is
+     indicated by InnerMaxSize rather than InnerSize, think of the case of a dynamic block
+     in a fixed-size matrix
+     However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it */
+
+ public:
+  static constexpr int Traversal = SizeAtCompileTime == 0 ? AllAtOnceTraversal
+                                   : (MayLinearVectorize && (LinearPacketSize > InnerPacketSize))
+                                       ? LinearVectorizedTraversal
+                                   : MayInnerVectorize  ? InnerVectorizedTraversal
+                                   : MayLinearVectorize ? LinearVectorizedTraversal
+                                   : MaySliceVectorize  ? SliceVectorizedTraversal
+                                   : MayLinearize       ? LinearTraversal
+                                                        : DefaultTraversal;
+  static constexpr bool Vectorized = Traversal == InnerVectorizedTraversal || Traversal == LinearVectorizedTraversal ||
+                                     Traversal == SliceVectorizedTraversal;
+
+  using PacketType = std::conditional_t<Traversal == LinearVectorizedTraversal, LinearPacketType, InnerPacketType>;
+
+ private:
+  static constexpr int ActualPacketSize = Vectorized ? unpacket_traits<PacketType>::size : 1;
+  static constexpr int UnrollingLimit = EIGEN_UNROLLING_LIMIT * ActualPacketSize;
+  static constexpr int CoeffReadCost = int(DstEvaluator::CoeffReadCost) + int(SrcEvaluator::CoeffReadCost);
+  static constexpr bool MayUnrollCompletely =
+      (SizeAtCompileTime != Dynamic) && (SizeAtCompileTime * CoeffReadCost <= UnrollingLimit);
+  static constexpr bool MayUnrollInner =
+      (InnerSizeAtCompileTime != Dynamic) && (InnerSizeAtCompileTime * CoeffReadCost <= UnrollingLimit);
+
+ public:
+  static constexpr int Unrolling =
+      (Traversal == InnerVectorizedTraversal || Traversal == DefaultTraversal)
+          ? (MayUnrollCompletely ? CompleteUnrolling
+             : MayUnrollInner    ? InnerUnrolling
+                                 : NoUnrolling)
+      : Traversal == LinearVectorizedTraversal
+          ? (MayUnrollCompletely && (EIGEN_UNALIGNED_VECTORIZE || (DstAlignment >= LinearRequiredAlignment))
+                 ? CompleteUnrolling
+                 : NoUnrolling)
+      : Traversal == LinearTraversal ? (MayUnrollCompletely ? CompleteUnrolling : NoUnrolling)
+#if EIGEN_UNALIGNED_VECTORIZE
+      : Traversal == SliceVectorizedTraversal ? (MayUnrollInner ? InnerUnrolling : NoUnrolling)
+#endif
+                                              : NoUnrolling;
+  static constexpr bool UsePacketSegment = has_packet_segment<PacketType>::value;
+
+#ifdef EIGEN_DEBUG_ASSIGN
+  static void debug() {
+    std::cerr << "DstXpr: " << typeid(typename DstEvaluator::XprType).name() << std::endl;
+    std::cerr << "SrcXpr: " << typeid(typename SrcEvaluator::XprType).name() << std::endl;
+    std::cerr.setf(std::ios::hex, std::ios::basefield);
+    std::cerr << "DstFlags"
+              << " = " << DstFlags << " (" << demangle_flags(DstFlags) << " )" << std::endl;
+    std::cerr << "SrcFlags"
+              << " = " << SrcFlags << " (" << demangle_flags(SrcFlags) << " )" << std::endl;
+    std::cerr.unsetf(std::ios::hex);
+    EIGEN_DEBUG_VAR(DstAlignment)
+    EIGEN_DEBUG_VAR(SrcAlignment)
+    EIGEN_DEBUG_VAR(LinearRequiredAlignment)
+    EIGEN_DEBUG_VAR(InnerRequiredAlignment)
+    EIGEN_DEBUG_VAR(JointAlignment)
+    EIGEN_DEBUG_VAR(InnerSizeAtCompileTime)
+    EIGEN_DEBUG_VAR(MaxInnerSizeAtCompileTime)
+    EIGEN_DEBUG_VAR(LinearPacketSize)
+    EIGEN_DEBUG_VAR(InnerPacketSize)
+    EIGEN_DEBUG_VAR(ActualPacketSize)
+    EIGEN_DEBUG_VAR(StorageOrdersAgree)
+    EIGEN_DEBUG_VAR(MightVectorize)
+    EIGEN_DEBUG_VAR(MayLinearize)
+    EIGEN_DEBUG_VAR(MayInnerVectorize)
+    EIGEN_DEBUG_VAR(MayLinearVectorize)
+    EIGEN_DEBUG_VAR(MaySliceVectorize)
+    std::cerr << "Traversal"
+              << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
+    EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost)
+    EIGEN_DEBUG_VAR(DstEvaluator::CoeffReadCost)
+    EIGEN_DEBUG_VAR(Dst::SizeAtCompileTime)
+    EIGEN_DEBUG_VAR(UnrollingLimit)
+    EIGEN_DEBUG_VAR(MayUnrollCompletely)
+    EIGEN_DEBUG_VAR(MayUnrollInner)
+    std::cerr << "Unrolling"
+              << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl;
+    std::cerr << std::endl;
+  }
+#endif
+};
+
+/***************************************************************************
+ * Part 2 : meta-unrollers
+ ***************************************************************************/
+
+/************************
+*** Default traversal ***
+************************/
+
+template <typename Kernel, int Index_, int Stop>
+struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling {
+  static constexpr int Outer = Index_ / Kernel::AssignmentTraits::InnerSizeAtCompileTime;
+  static constexpr int Inner = Index_ % Kernel::AssignmentTraits::InnerSizeAtCompileTime;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
+    kernel.assignCoeffByOuterInner(Outer, Inner);
+    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Index_ + 1, Stop>::run(kernel);
+  }
+};
+
+template <typename Kernel, int Stop>
+struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Stop, Stop> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}
+};
+
+template <typename Kernel, int Index_, int Stop>
+struct copy_using_evaluator_DefaultTraversal_InnerUnrolling {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index outer) {
+    kernel.assignCoeffByOuterInner(outer, Index_);
+    copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Index_ + 1, Stop>::run(kernel, outer);
+  }
+};
+
+template <typename Kernel, int Stop>
+struct copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Stop, Stop> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}
+};
+
+/***********************
+*** Linear traversal ***
+***********************/
+
+template <typename Kernel, int Index_, int Stop>
+struct copy_using_evaluator_LinearTraversal_CompleteUnrolling {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
+    kernel.assignCoeff(Index_);
+    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index_ + 1, Stop>::run(kernel);
+  }
+};
+
+template <typename Kernel, int Stop>
+struct copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Stop, Stop> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}
+};
+
+/**************************
+*** Inner vectorization ***
+**************************/
+
+template <typename Kernel, int Index_, int Stop>
+struct copy_using_evaluator_innervec_CompleteUnrolling {
+  using PacketType = typename Kernel::PacketType;
+  static constexpr int Outer = Index_ / Kernel::AssignmentTraits::InnerSizeAtCompileTime;
+  static constexpr int Inner = Index_ % Kernel::AssignmentTraits::InnerSizeAtCompileTime;
+  static constexpr int NextIndex = Index_ + unpacket_traits<PacketType>::size;
+  static constexpr int SrcAlignment = Kernel::AssignmentTraits::SrcAlignment;
+  static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
+    kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(Outer, Inner);
+    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
+  }
+};
+
+template <typename Kernel, int Stop>
+struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}
+};
+
+template <typename Kernel, int Index_, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_InnerUnrolling {
+  using PacketType = typename Kernel::PacketType;
+  static constexpr int NextIndex = Index_ + unpacket_traits<PacketType>::size;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel, Index outer) {
+    kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Index_);
+    copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop, SrcAlignment, DstAlignment>::run(kernel,
+                                                                                                           outer);
+  }
+};
+
+template <typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop, SrcAlignment, DstAlignment> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}
+};
+
+template <typename Kernel, int Start, int Stop, int SrcAlignment, int DstAlignment, bool UsePacketSegment>
+struct copy_using_evaluator_innervec_segment {
+  using PacketType = typename Kernel::PacketType;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel, Index outer) {
+    kernel.template assignPacketSegmentByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Start, 0,
+                                                                                            Stop - Start);
+  }
+};
+
+template <typename Kernel, int Start, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_segment<Kernel, Start, Stop, SrcAlignment, DstAlignment,
+                                             /*UsePacketSegment*/ false>
+    : copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Start, Stop> {};
+
+template <typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_segment<Kernel, Stop, Stop, SrcAlignment, DstAlignment,
+                                             /*UsePacketSegment*/ true> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}
+};
+
+template <typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
+struct copy_using_evaluator_innervec_segment<Kernel, Stop, Stop, SrcAlignment, DstAlignment,
+                                             /*UsePacketSegment*/ false> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&, Index) {}
+};
+
+/***************************************************************************
+ * Part 3 : implementation of all cases
+ ***************************************************************************/
+
+// dense_assignment_loop is based on assign_impl
+
+template <typename Kernel, int Traversal = Kernel::AssignmentTraits::Traversal,
+          int Unrolling = Kernel::AssignmentTraits::Unrolling>
+struct dense_assignment_loop_impl;
+
+template <typename Kernel, int Traversal = Kernel::AssignmentTraits::Traversal,
+          int Unrolling = Kernel::AssignmentTraits::Unrolling>
+struct dense_assignment_loop {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
+#ifdef __cpp_lib_is_constant_evaluated
+    if (internal::is_constant_evaluated())
+      dense_assignment_loop_impl<Kernel, Traversal == AllAtOnceTraversal ? AllAtOnceTraversal : DefaultTraversal,
+                                 NoUnrolling>::run(kernel);
+    else
+#endif
+      dense_assignment_loop_impl<Kernel, Traversal, Unrolling>::run(kernel);
+  }
+};
+
+/************************
+***** Special Cases *****
+************************/
+
+// Zero-sized assignment is a no-op.
+template <typename Kernel, int Unrolling>
+struct dense_assignment_loop_impl<Kernel, AllAtOnceTraversal, Unrolling> {
+  static constexpr int SizeAtCompileTime = Kernel::AssignmentTraits::SizeAtCompileTime;
+
+  EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE constexpr run(Kernel& /*kernel*/) {
+    EIGEN_STATIC_ASSERT(SizeAtCompileTime == 0, EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT)
+  }
+};
+
+/************************
+*** Default traversal ***
+************************/
+
+template <typename Kernel>
+struct dense_assignment_loop_impl<Kernel, DefaultTraversal, NoUnrolling> {
+  EIGEN_DEVICE_FUNC static void EIGEN_STRONG_INLINE constexpr run(Kernel& kernel) {
+    for (Index outer = 0; outer < kernel.outerSize(); ++outer) {
+      for (Index inner = 0; inner < kernel.innerSize(); ++inner) {
+        kernel.assignCoeffByOuterInner(outer, inner);
+      }
+    }
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_impl<Kernel, DefaultTraversal, CompleteUnrolling> {
+  static constexpr int SizeAtCompileTime = Kernel::AssignmentTraits::SizeAtCompileTime;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
+    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, 0, SizeAtCompileTime>::run(kernel);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_impl<Kernel, DefaultTraversal, InnerUnrolling> {
+  static constexpr int InnerSizeAtCompileTime = Kernel::AssignmentTraits::InnerSizeAtCompileTime;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
+    const Index outerSize = kernel.outerSize();
+    for (Index outer = 0; outer < outerSize; ++outer)
+      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, InnerSizeAtCompileTime>::run(kernel, outer);
+  }
+};
+
+/***************************
+*** Linear vectorization ***
+***************************/
+
+// The goal of unaligned_dense_assignment_loop is simply to factorize the handling
+// of the non vectorizable beginning and ending parts
+
+template <typename PacketType, int DstAlignment, int SrcAlignment, bool UsePacketSegment, bool Skip>
+struct unaligned_dense_assignment_loop {
+  // if Skip == true, then do nothing
+  template <typename Kernel>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& /*kernel*/, Index /*start*/, Index /*end*/) {}
+  template <typename Kernel>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& /*kernel*/, Index /*outer*/,
+                                                                  Index /*innerStart*/, Index /*innerEnd*/) {}
+};
+
+template <typename PacketType, int DstAlignment, int SrcAlignment>
+struct unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, /*UsePacketSegment*/ true,
+                                       /*Skip*/ false> {
+  template <typename Kernel>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index start, Index end) {
+    Index count = end - start;
+    eigen_assert(count <= unpacket_traits<PacketType>::size);
+    if (count > 0) kernel.template assignPacketSegment<DstAlignment, SrcAlignment, PacketType>(start, 0, count);
+  }
+  template <typename Kernel>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index outer, Index start, Index end) {
+    Index count = end - start;
+    eigen_assert(count <= unpacket_traits<PacketType>::size);
+    if (count > 0)
+      kernel.template assignPacketSegmentByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, start, 0, count);
+  }
+};
+
+template <typename PacketType, int DstAlignment, int SrcAlignment>
+struct unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, /*UsePacketSegment*/ false,
+                                       /*Skip*/ false> {
+  template <typename Kernel>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index start, Index end) {
+    for (Index index = start; index < end; ++index) kernel.assignCoeff(index);
+  }
+  template <typename Kernel>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Index outer, Index innerStart,
+                                                                  Index innerEnd) {
+    for (Index inner = innerStart; inner < innerEnd; ++inner) kernel.assignCoeffByOuterInner(outer, inner);
+  }
+};
+
+template <typename Kernel, int Index_, int Stop>
+struct copy_using_evaluator_linearvec_CompleteUnrolling {
+  using PacketType = typename Kernel::PacketType;
+  static constexpr int SrcAlignment = Kernel::AssignmentTraits::SrcAlignment;
+  static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
+  static constexpr int NextIndex = Index_ + unpacket_traits<PacketType>::size;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
+    kernel.template assignPacket<DstAlignment, SrcAlignment, PacketType>(Index_);
+    copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
+  }
+};
+
+template <typename Kernel, int Stop>
+struct copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, Stop, Stop> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}
+};
+
+template <typename Kernel, int Index_, int Stop, bool UsePacketSegment>
+struct copy_using_evaluator_linearvec_segment {
+  using PacketType = typename Kernel::PacketType;
+  static constexpr int SrcAlignment = Kernel::AssignmentTraits::SrcAlignment;
+  static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
+    kernel.template assignPacketSegment<DstAlignment, SrcAlignment, PacketType>(Index_, 0, Stop - Index_);
+  }
+};
+
+template <typename Kernel, int Index_, int Stop>
+struct copy_using_evaluator_linearvec_segment<Kernel, Index_, Stop, /*UsePacketSegment*/ false>
+    : copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index_, Stop> {};
+
+template <typename Kernel, int Stop>
+struct copy_using_evaluator_linearvec_segment<Kernel, Stop, Stop, /*UsePacketSegment*/ true> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}
+};
+
+template <typename Kernel, int Stop>
+struct copy_using_evaluator_linearvec_segment<Kernel, Stop, Stop, /*UsePacketSegment*/ false> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel&) {}
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_impl<Kernel, LinearVectorizedTraversal, NoUnrolling> {
+  using Scalar = typename Kernel::Scalar;
+  using PacketType = typename Kernel::PacketType;
+  static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+  static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
+  static constexpr int DstAlignment = plain_enum_max(Kernel::AssignmentTraits::DstAlignment, alignof(Scalar));
+  static constexpr int RequestedAlignment = unpacket_traits<PacketType>::alignment;
+  static constexpr bool Alignable =
+      (DstAlignment >= RequestedAlignment) || ((RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0);
+  static constexpr int Alignment = Alignable ? RequestedAlignment : DstAlignment;
+  static constexpr bool DstIsAligned = DstAlignment >= Alignment;
+  static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
+
+  using head_loop =
+      unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, DstIsAligned>;
+  using tail_loop = unaligned_dense_assignment_loop<PacketType, Alignment, SrcAlignment, UsePacketSegment, false>;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
+    const Index size = kernel.size();
+    const Index alignedStart = DstIsAligned ? 0 : first_aligned<Alignment>(kernel.dstDataPtr(), size);
+    const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize);
+
+    head_loop::run(kernel, 0, alignedStart);
+
+    for (Index index = alignedStart; index < alignedEnd; index += PacketSize)
+      kernel.template assignPacket<Alignment, SrcAlignment, PacketType>(index);
+
+    tail_loop::run(kernel, alignedEnd, size);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_impl<Kernel, LinearVectorizedTraversal, CompleteUnrolling> {
+  using PacketType = typename Kernel::PacketType;
+  static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+  static constexpr int Size = Kernel::AssignmentTraits::SizeAtCompileTime;
+  static constexpr int AlignedSize = numext::round_down(Size, PacketSize);
+  static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
+    copy_using_evaluator_linearvec_CompleteUnrolling<Kernel, 0, AlignedSize>::run(kernel);
+    copy_using_evaluator_linearvec_segment<Kernel, AlignedSize, Size, UsePacketSegment>::run(kernel);
+  }
+};
+
+/**************************
+*** Inner vectorization ***
+**************************/
+
+template <typename Kernel>
+struct dense_assignment_loop_impl<Kernel, InnerVectorizedTraversal, NoUnrolling> {
+  using PacketType = typename Kernel::PacketType;
+  static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+  static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
+  static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
+    const Index innerSize = kernel.innerSize();
+    const Index outerSize = kernel.outerSize();
+    for (Index outer = 0; outer < outerSize; ++outer)
+      for (Index inner = 0; inner < innerSize; inner += PacketSize)
+        kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_impl<Kernel, InnerVectorizedTraversal, CompleteUnrolling> {
+  static constexpr int SizeAtCompileTime = Kernel::AssignmentTraits::SizeAtCompileTime;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
+    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, SizeAtCompileTime>::run(kernel);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_impl<Kernel, InnerVectorizedTraversal, InnerUnrolling> {
+  static constexpr int InnerSize = Kernel::AssignmentTraits::InnerSizeAtCompileTime;
+  static constexpr int SrcAlignment = Kernel::AssignmentTraits::SrcAlignment;
+  static constexpr int DstAlignment = Kernel::AssignmentTraits::DstAlignment;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel) {
+    const Index outerSize = kernel.outerSize();
+    for (Index outer = 0; outer < outerSize; ++outer)
+      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, InnerSize, SrcAlignment, DstAlignment>::run(kernel,
+                                                                                                          outer);
+  }
+};
+
+/***********************
+*** Linear traversal ***
+***********************/
+
+template <typename Kernel>
+struct dense_assignment_loop_impl<Kernel, LinearTraversal, NoUnrolling> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
+    const Index size = kernel.size();
+    for (Index i = 0; i < size; ++i) kernel.assignCoeff(i);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_impl<Kernel, LinearTraversal, CompleteUnrolling> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
+    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, Kernel::AssignmentTraits::SizeAtCompileTime>::run(
+        kernel);
+  }
+};
+
+/**************************
+*** Slice vectorization ***
+***************************/
+
+template <typename Kernel>
+struct dense_assignment_loop_impl<Kernel, SliceVectorizedTraversal, NoUnrolling> {
+  using Scalar = typename Kernel::Scalar;
+  using PacketType = typename Kernel::PacketType;
+  static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+  static constexpr int SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
+  static constexpr int DstAlignment = plain_enum_max(Kernel::AssignmentTraits::DstAlignment, alignof(Scalar));
+  static constexpr int RequestedAlignment = unpacket_traits<PacketType>::alignment;
+  static constexpr bool Alignable =
+      (DstAlignment >= RequestedAlignment) || ((RequestedAlignment - DstAlignment) % sizeof(Scalar) == 0);
+  static constexpr int Alignment = Alignable ? RequestedAlignment : DstAlignment;
+  static constexpr bool DstIsAligned = DstAlignment >= Alignment;
+  static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
+
+  using head_loop = unaligned_dense_assignment_loop<PacketType, DstAlignment, Unaligned, UsePacketSegment, !Alignable>;
+  using tail_loop = unaligned_dense_assignment_loop<PacketType, Alignment, Unaligned, UsePacketSegment, false>;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
+    const Scalar* dst_ptr = kernel.dstDataPtr();
+    const Index innerSize = kernel.innerSize();
+    const Index outerSize = kernel.outerSize();
+    const Index alignedStep = Alignable ? (PacketSize - kernel.outerStride() % PacketSize) % PacketSize : 0;
+    Index alignedStart = ((!Alignable) || DstIsAligned) ? 0 : internal::first_aligned<Alignment>(dst_ptr, innerSize);
+
+    for (Index outer = 0; outer < outerSize; ++outer) {
+      const Index alignedEnd = alignedStart + numext::round_down(innerSize - alignedStart, PacketSize);
+
+      head_loop::run(kernel, outer, 0, alignedStart);
+
+      // do the vectorizable part of the assignment
+      for (Index inner = alignedStart; inner < alignedEnd; inner += PacketSize)
+        kernel.template assignPacketByOuterInner<Alignment, Unaligned, PacketType>(outer, inner);
+
+      tail_loop::run(kernel, outer, alignedEnd, innerSize);
+
+      alignedStart = numext::mini((alignedStart + alignedStep) % PacketSize, innerSize);
+    }
+  }
+};
+
+#if EIGEN_UNALIGNED_VECTORIZE
+template <typename Kernel>
+struct dense_assignment_loop_impl<Kernel, SliceVectorizedTraversal, InnerUnrolling> {
+  using PacketType = typename Kernel::PacketType;
+  static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+  static constexpr int InnerSize = Kernel::AssignmentTraits::InnerSizeAtCompileTime;
+  static constexpr int VectorizableSize = numext::round_down(InnerSize, PacketSize);
+  static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
+
+  using packet_loop = copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, VectorizableSize, Unaligned, Unaligned>;
+  using packet_segment_loop = copy_using_evaluator_innervec_segment<Kernel, VectorizableSize, InnerSize, Unaligned,
+                                                                    Unaligned, UsePacketSegment>;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel) {
+    for (Index outer = 0; outer < kernel.outerSize(); ++outer) {
+      packet_loop::run(kernel, outer);
+      packet_segment_loop::run(kernel, outer);
+    }
+  }
+};
+#endif
+
+/***************************************************************************
+ * Part 4 : Generic dense assignment kernel
+ ***************************************************************************/
+
+// This class generalize the assignment of a coefficient (or packet) from one dense evaluator
+// to another dense writable evaluator.
+// It is parametrized by the two evaluators, and the actual assignment functor.
+// This abstraction level permits to keep the evaluation loops as simple and as generic as possible.
+// One can customize the assignment using this generic dense_assignment_kernel with different
+// functors, or by completely overloading it, by-passing a functor.
+template <typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version = Specialized>
+class generic_dense_assignment_kernel {
+ protected:
+  typedef typename DstEvaluatorTypeT::XprType DstXprType;
+  typedef typename SrcEvaluatorTypeT::XprType SrcXprType;
+
+ public:
+  typedef DstEvaluatorTypeT DstEvaluatorType;
+  typedef SrcEvaluatorTypeT SrcEvaluatorType;
+  typedef typename DstEvaluatorType::Scalar Scalar;
+  typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
+  typedef typename AssignmentTraits::PacketType PacketType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr generic_dense_assignment_kernel(DstEvaluatorType& dst,
+                                                                                  const SrcEvaluatorType& src,
+                                                                                  const Functor& func,
+                                                                                  DstXprType& dstExpr)
+      : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr) {
+#ifdef EIGEN_DEBUG_ASSIGN
+    AssignmentTraits::debug();
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return m_dstExpr.size(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerSize() const noexcept { return m_dstExpr.innerSize(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerSize() const noexcept { return m_dstExpr.outerSize(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_dstExpr.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_dstExpr.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_dstExpr.outerStride(); }
+
+  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() noexcept { return m_dst; }
+  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const noexcept { return m_src; }
+
+  /// Assign src(row,col) to dst(row,col) through the assignment functor.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeff(Index row, Index col) {
+    m_functor.assignCoeff(m_dst.coeffRef(row, col), m_src.coeff(row, col));
+  }
+
+  /// \sa assignCoeff(Index,Index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index) {
+    m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
+  }
+
+  /// \sa assignCoeff(Index,Index)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeffByOuterInner(Index outer, Index inner) {
+    Index row = rowIndexByOuterInner(outer, inner);
+    Index col = colIndexByOuterInner(outer, inner);
+    assignCoeff(row, col);
+  }
+
+  template <int StoreMode, int LoadMode, typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index row, Index col) {
+    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row, col),
+                                               m_src.template packet<LoadMode, Packet>(row, col));
+  }
+
+  template <int StoreMode, int LoadMode, typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index index) {
+    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode, Packet>(index));
+  }
+
+  template <int StoreMode, int LoadMode, typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner) {
+    Index row = rowIndexByOuterInner(outer, inner);
+    Index col = colIndexByOuterInner(outer, inner);
+    assignPacket<StoreMode, LoadMode, Packet>(row, col);
+  }
+
+  template <int StoreMode, int LoadMode, typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegment(Index row, Index col, Index begin, Index count) {
+    m_functor.template assignPacketSegment<StoreMode>(
+        &m_dst.coeffRef(row, col), m_src.template packetSegment<LoadMode, Packet>(row, col, begin, count), begin,
+        count);
+  }
+
+  template <int StoreMode, int LoadMode, typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegment(Index index, Index begin, Index count) {
+    m_functor.template assignPacketSegment<StoreMode>(
+        &m_dst.coeffRef(index), m_src.template packetSegment<LoadMode, Packet>(index, begin, count), begin, count);
+  }
+
+  template <int StoreMode, int LoadMode, typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketSegmentByOuterInner(Index outer, Index inner, Index begin,
+                                                                             Index count) {
+    Index row = rowIndexByOuterInner(outer, inner);
+    Index col = colIndexByOuterInner(outer, inner);
+    assignPacketSegment<StoreMode, LoadMode, Packet>(row, col, begin, count);
+  }
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr Index rowIndexByOuterInner(Index outer, Index inner) {
+    typedef typename DstEvaluatorType::ExpressionTraits Traits;
+    return int(Traits::RowsAtCompileTime) == 1          ? 0
+           : int(Traits::ColsAtCompileTime) == 1        ? inner
+           : int(DstEvaluatorType::Flags) & RowMajorBit ? outer
+                                                        : inner;
+  }
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr Index colIndexByOuterInner(Index outer, Index inner) {
+    typedef typename DstEvaluatorType::ExpressionTraits Traits;
+    return int(Traits::ColsAtCompileTime) == 1          ? 0
+           : int(Traits::RowsAtCompileTime) == 1        ? inner
+           : int(DstEvaluatorType::Flags) & RowMajorBit ? inner
+                                                        : outer;
+  }
+
+  EIGEN_DEVICE_FUNC const Scalar* dstDataPtr() const { return m_dstExpr.data(); }
+
+ protected:
+  DstEvaluatorType& m_dst;
+  const SrcEvaluatorType& m_src;
+  const Functor& m_functor;
+  // TODO find a way to avoid the needs of the original expression
+  DstXprType& m_dstExpr;
+};
+
+// Special kernel used when computing small products whose operands have dynamic dimensions.  It ensures that the
+// PacketSize used is no larger than 4, thereby increasing the chance that vectorized instructions will be used
+// when computing the product.
+
+template <typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor>
+class restricted_packet_dense_assignment_kernel
+    : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn> {
+ protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, BuiltIn> Base;
+
+ public:
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::DstXprType DstXprType;
+  typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, 4> AssignmentTraits;
+  typedef typename AssignmentTraits::PacketType PacketType;
+
+  EIGEN_DEVICE_FUNC restricted_packet_dense_assignment_kernel(DstEvaluatorTypeT& dst, const SrcEvaluatorTypeT& src,
+                                                              const Functor& func, DstXprType& dstExpr)
+      : Base(dst, src, func, dstExpr) {}
+};
+
+/***************************************************************************
+ * Part 5 : Entry point for dense rectangular assignment
+ ***************************************************************************/
+
+template <typename DstXprType, typename SrcXprType, typename Functor>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize_if_allowed(DstXprType& dst, const SrcXprType& src,
+                                                                       const Functor& /*func*/) {
+  EIGEN_ONLY_USED_FOR_DEBUG(dst);
+  EIGEN_ONLY_USED_FOR_DEBUG(src);
+  eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+}
+
+template <typename DstXprType, typename SrcXprType, typename T1, typename T2>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize_if_allowed(DstXprType& dst, const SrcXprType& src,
+                                                                       const internal::assign_op<T1, T2>& /*func*/) {
+  Index dstRows = src.rows();
+  Index dstCols = src.cols();
+  if (((dst.rows() != dstRows) || (dst.cols() != dstCols))) dst.resize(dstRows, dstCols);
+  eigen_assert(dst.rows() == dstRows && dst.cols() == dstCols);
+}
+
+template <typename DstXprType, typename SrcXprType, typename Functor>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src,
+                                                                                const Functor& func) {
+  typedef evaluator<DstXprType> DstEvaluatorType;
+  typedef evaluator<SrcXprType> SrcEvaluatorType;
+
+  SrcEvaluatorType srcEvaluator(src);
+
+  // NOTE To properly handle A = (A*A.transpose())/s with A rectangular,
+  // we need to resize the destination after the source evaluator has been created.
+  resize_if_allowed(dst, src, func);
+
+  DstEvaluatorType dstEvaluator(dst);
+
+  typedef generic_dense_assignment_kernel<DstEvaluatorType, SrcEvaluatorType, Functor> Kernel;
+  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
+
+  dense_assignment_loop<Kernel>::run(kernel);
+}
+
+template <typename DstXprType, typename SrcXprType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src) {
+  call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>());
+}
+
+/***************************************************************************
+ * Part 6 : Generic assignment
+ ***************************************************************************/
+
+// Based on the respective shapes of the destination and source,
+// the class AssignmentKind determine the kind of assignment mechanism.
+// AssignmentKind must define a Kind typedef.
+template <typename DstShape, typename SrcShape>
+struct AssignmentKind;
+
+// Assignment kind defined in this file:
+struct Dense2Dense {};
+struct EigenBase2EigenBase {};
+
+template <typename, typename>
+struct AssignmentKind {
+  typedef EigenBase2EigenBase Kind;
+};
+template <>
+struct AssignmentKind<DenseShape, DenseShape> {
+  typedef Dense2Dense Kind;
+};
+
+// This is the main assignment class
+template <typename DstXprType, typename SrcXprType, typename Functor,
+          typename Kind = typename AssignmentKind<typename evaluator_traits<DstXprType>::Shape,
+                                                  typename evaluator_traits<SrcXprType>::Shape>::Kind,
+          typename EnableIf = void>
+struct Assignment;
+
+// The only purpose of this call_assignment() function is to deal with noalias() / "assume-aliasing" and automatic
+// transposition. Indeed, I (Gael) think that this concept of "assume-aliasing" was a mistake, and it makes thing quite
+// complicated. So this intermediate function removes everything related to "assume-aliasing" such that Assignment does
+// not has to bother about these annoying details.
+
+template <typename Dst, typename Src>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment(Dst& dst, const Src& src) {
+  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar, typename Src::Scalar>());
+}
+template <typename Dst, typename Src>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_assignment(const Dst& dst, const Src& src) {
+  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar, typename Src::Scalar>());
+}
+
+// Deal with "assume-aliasing"
+template <typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment(
+    Dst& dst, const Src& src, const Func& func, std::enable_if_t<evaluator_assume_aliasing<Src>::value, void*> = 0) {
+  typename plain_matrix_type<Src>::type tmp(src);
+  call_assignment_no_alias(dst, tmp, func);
+}
+
+template <typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment(
+    Dst& dst, const Src& src, const Func& func, std::enable_if_t<!evaluator_assume_aliasing<Src>::value, void*> = 0) {
+  call_assignment_no_alias(dst, src, func);
+}
+
+// by-pass "assume-aliasing"
+// When there is no aliasing, we require that 'dst' has been properly resized
+template <typename Dst, template <typename> class StorageBase, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment(NoAlias<Dst, StorageBase>& dst, const Src& src,
+                                                                     const Func& func) {
+  call_assignment_no_alias(dst.expression(), src, func);
+}
+
+template <typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias(Dst& dst, const Src& src,
+                                                                              const Func& func) {
+  enum {
+    NeedToTranspose = ((int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1) ||
+                       (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)) &&
+                      int(Dst::SizeAtCompileTime) != 1
+  };
+
+  typedef std::conditional_t<NeedToTranspose, Transpose<Dst>, Dst> ActualDstTypeCleaned;
+  typedef std::conditional_t<NeedToTranspose, Transpose<Dst>, Dst&> ActualDstType;
+  ActualDstType actualDst(dst);
+
+  // TODO check whether this is the right place to perform these checks:
+  EIGEN_STATIC_ASSERT_LVALUE(Dst)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned, Src)
+  EIGEN_CHECK_BINARY_COMPATIBILIY(Func, typename ActualDstTypeCleaned::Scalar, typename Src::Scalar);
+
+  Assignment<ActualDstTypeCleaned, Src, Func>::run(actualDst, src, func);
+}
+
+template <typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_restricted_packet_assignment_no_alias(Dst& dst, const Src& src,
+                                                                                      const Func& func) {
+  typedef evaluator<Dst> DstEvaluatorType;
+  typedef evaluator<Src> SrcEvaluatorType;
+  typedef restricted_packet_dense_assignment_kernel<DstEvaluatorType, SrcEvaluatorType, Func> Kernel;
+
+  EIGEN_STATIC_ASSERT_LVALUE(Dst)
+  EIGEN_CHECK_BINARY_COMPATIBILIY(Func, typename Dst::Scalar, typename Src::Scalar);
+
+  SrcEvaluatorType srcEvaluator(src);
+  resize_if_allowed(dst, src, func);
+
+  DstEvaluatorType dstEvaluator(dst);
+  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
+
+  dense_assignment_loop<Kernel>::run(kernel);
+}
+
+template <typename Dst, typename Src>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias(Dst& dst, const Src& src) {
+  call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar, typename Src::Scalar>());
+}
+
+template <typename Dst, typename Src, typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src,
+                                                                                           const Func& func) {
+  // TODO check whether this is the right place to perform these checks:
+  EIGEN_STATIC_ASSERT_LVALUE(Dst)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst, Src)
+  EIGEN_CHECK_BINARY_COMPATIBILIY(Func, typename Dst::Scalar, typename Src::Scalar);
+
+  Assignment<Dst, Src, Func>::run(dst, src, func);
+}
+template <typename Dst, typename Src>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src) {
+  call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar, typename Src::Scalar>());
+}
+
+// forward declaration
+template <typename Dst, typename Src>
+EIGEN_DEVICE_FUNC void check_for_aliasing(const Dst& dst, const Src& src);
+
+// Generic Dense to Dense assignment
+// Note that the last template argument "Weak" is needed to make it possible to perform
+// both partial specialization+SFINAE without ambiguous specialization
+template <typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
+struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr void run(DstXprType& dst, const SrcXprType& src,
+                                                                  const Functor& func) {
+#ifndef EIGEN_NO_DEBUG
+    if (!internal::is_constant_evaluated()) {
+      internal::check_for_aliasing(dst, src);
+    }
+#endif
+
+    call_dense_assignment_loop(dst, src, func);
+  }
+};
+
+template <typename DstXprType, typename SrcPlainObject, typename Weak>
+struct Assignment<DstXprType, CwiseNullaryOp<scalar_constant_op<typename DstXprType::Scalar>, SrcPlainObject>,
+                  assign_op<typename DstXprType::Scalar, typename DstXprType::Scalar>, Dense2Dense, Weak> {
+  using Scalar = typename DstXprType::Scalar;
+  using NullaryOp = scalar_constant_op<Scalar>;
+  using SrcXprType = CwiseNullaryOp<NullaryOp, SrcPlainObject>;
+  using Functor = assign_op<Scalar, Scalar>;
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src,
+                                                        const Functor& /*func*/) {
+    eigen_fill_impl<DstXprType>::run(dst, src);
+  }
+};
+
+template <typename DstXprType, typename SrcPlainObject, typename Weak>
+struct Assignment<DstXprType, CwiseNullaryOp<scalar_zero_op<typename DstXprType::Scalar>, SrcPlainObject>,
+                  assign_op<typename DstXprType::Scalar, typename DstXprType::Scalar>, Dense2Dense, Weak> {
+  using Scalar = typename DstXprType::Scalar;
+  using NullaryOp = scalar_zero_op<Scalar>;
+  using SrcXprType = CwiseNullaryOp<NullaryOp, SrcPlainObject>;
+  using Functor = assign_op<Scalar, Scalar>;
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src,
+                                                        const Functor& /*func*/) {
+    eigen_zero_impl<DstXprType>::run(dst, src);
+  }
+};
+
+// Generic assignment through evalTo.
+// TODO: not sure we have to keep that one, but it helps porting current code to new evaluator mechanism.
+// Note that the last template argument "Weak" is needed to make it possible to perform
+// both partial specialization+SFINAE without ambiguous specialization
+template <typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
+struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(
+      DstXprType& dst, const SrcXprType& src,
+      const internal::assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>& /*func*/) {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if ((dst.rows() != dstRows) || (dst.cols() != dstCols)) dst.resize(dstRows, dstCols);
+
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    src.evalTo(dst);
+  }
+
+  // NOTE The following two functions are templated to avoid their instantiation if not needed
+  //      This is needed because some expressions supports evalTo only and/or have 'void' as scalar type.
+  template <typename SrcScalarType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(
+      DstXprType& dst, const SrcXprType& src,
+      const internal::add_assign_op<typename DstXprType::Scalar, SrcScalarType>& /*func*/) {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if ((dst.rows() != dstRows) || (dst.cols() != dstCols)) dst.resize(dstRows, dstCols);
+
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    src.addTo(dst);
+  }
+
+  template <typename SrcScalarType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(
+      DstXprType& dst, const SrcXprType& src,
+      const internal::sub_assign_op<typename DstXprType::Scalar, SrcScalarType>& /*func*/) {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if ((dst.rows() != dstRows) || (dst.cols() != dstCols)) dst.resize(dstRows, dstCols);
+
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    src.subTo(dst);
+  }
+};
+
+}  // namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_ASSIGN_EVALUATOR_H
diff --git a/inst/include/Eigen/src/Core/Assign_MKL.h b/inst/include/Eigen/src/Core/Assign_MKL.h
index 7772951b..7636445c 100644
--- a/inst/include/Eigen/src/Core/Assign_MKL.h
+++ b/inst/include/Eigen/src/Core/Assign_MKL.h
@@ -1,5 +1,6 @@
 /*
  Copyright (c) 2011, Intel Corporation. All rights reserved.
+ Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 
  Redistribution and use in source and binary forms, with or without modification,
  are permitted provided that the following conditions are met:
@@ -33,192 +34,150 @@
 #ifndef EIGEN_ASSIGN_VML_H
 #define EIGEN_ASSIGN_VML_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-namespace internal {
-
-template<typename Op> struct vml_call
-{ enum { IsSupported = 0 }; };
-
-template<typename Dst, typename Src, typename UnaryOp>
-class vml_assign_traits
-{
-  private:
-    enum {
-      DstHasDirectAccess = Dst::Flags & DirectAccessBit,
-      SrcHasDirectAccess = Src::Flags & DirectAccessBit,
-
-      StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Src::IsRowMajor)),
-      InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime)
-                : int(Dst::Flags)&RowMajorBit ? int(Dst::ColsAtCompileTime)
-                : int(Dst::RowsAtCompileTime),
-      InnerMaxSize  = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
-                    : int(Dst::Flags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
-                    : int(Dst::MaxRowsAtCompileTime),
-      MaxSizeAtCompileTime = Dst::SizeAtCompileTime,
-
-      MightEnableVml =  vml_call<UnaryOp>::IsSupported && StorageOrdersAgree && DstHasDirectAccess && SrcHasDirectAccess
-                     && Src::InnerStrideAtCompileTime==1 && Dst::InnerStrideAtCompileTime==1,
-      MightLinearize = MightEnableVml && (int(Dst::Flags) & int(Src::Flags) & LinearAccessBit),
-      VmlSize = MightLinearize ? MaxSizeAtCompileTime : InnerMaxSize,
-      LargeEnough = VmlSize==Dynamic || VmlSize>=EIGEN_MKL_VML_THRESHOLD,
-      MayEnableVml = MightEnableVml && LargeEnough,
-      MayLinearize = MayEnableVml && MightLinearize
-    };
-  public:
-    enum {
-      Traversal = MayLinearize ? LinearVectorizedTraversal
-                : MayEnableVml ? InnerVectorizedTraversal
-                : DefaultTraversal
-    };
-};
+namespace Eigen {
 
-template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling,
-         int VmlTraversal = vml_assign_traits<Derived1, Derived2, UnaryOp>::Traversal >
-struct vml_assign_impl
-  : assign_impl<Derived1, Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>
-{
-};
+namespace internal {
 
-template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling>
-struct vml_assign_impl<Derived1, Derived2, UnaryOp, Traversal, Unrolling, InnerVectorizedTraversal>
-{
-  typedef typename Derived1::Scalar Scalar;
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1& dst, const CwiseUnaryOp<UnaryOp, Derived2>& src)
-  {
-    // in case we want to (or have to) skip VML at runtime we can call:
-    // assign_impl<Derived1,Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>::run(dst,src);
-    const Index innerSize = dst.innerSize();
-    const Index outerSize = dst.outerSize();
-    for(Index outer = 0; outer < outerSize; ++outer) {
-      const Scalar *src_ptr = src.IsRowMajor ?  &(src.nestedExpression().coeffRef(outer,0)) :
-                                                &(src.nestedExpression().coeffRef(0, outer));
-      Scalar *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer,0)) : &(dst.coeffRef(0, outer));
-      vml_call<UnaryOp>::run(src.functor(), innerSize, src_ptr, dst_ptr );
-    }
-  }
-};
+template <typename Dst, typename Src>
+class vml_assign_traits {
+ private:
+  enum {
+    DstHasDirectAccess = Dst::Flags & DirectAccessBit,
+    SrcHasDirectAccess = Src::Flags & DirectAccessBit,
+    StorageOrdersAgree = (int(Dst::IsRowMajor) == int(Src::IsRowMajor)),
+    InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime)
+                : int(Dst::Flags) & RowMajorBit ? int(Dst::ColsAtCompileTime)
+                                                : int(Dst::RowsAtCompileTime),
+    InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
+                   : int(Dst::Flags) & RowMajorBit ? int(Dst::MaxColsAtCompileTime)
+                                                   : int(Dst::MaxRowsAtCompileTime),
+    MaxSizeAtCompileTime = Dst::SizeAtCompileTime,
+
+    MightEnableVml = bool(StorageOrdersAgree) && bool(DstHasDirectAccess) && bool(SrcHasDirectAccess) &&
+                     Src::InnerStrideAtCompileTime == 1 && Dst::InnerStrideAtCompileTime == 1,
+    MightLinearize = bool(MightEnableVml) && (int(Dst::Flags) & int(Src::Flags) & LinearAccessBit),
+    VmlSize = bool(MightLinearize) ? MaxSizeAtCompileTime : InnerMaxSize,
+    LargeEnough = (VmlSize == Dynamic) || VmlSize >= EIGEN_MKL_VML_THRESHOLD
+  };
 
-template<typename Derived1, typename Derived2, typename UnaryOp, int Traversal, int Unrolling>
-struct vml_assign_impl<Derived1, Derived2, UnaryOp, Traversal, Unrolling, LinearVectorizedTraversal>
-{
-  static inline void run(Derived1& dst, const CwiseUnaryOp<UnaryOp, Derived2>& src)
-  {
-    // in case we want to (or have to) skip VML at runtime we can call:
-    // assign_impl<Derived1,Eigen::CwiseUnaryOp<UnaryOp, Derived2>,Traversal,Unrolling,BuiltIn>::run(dst,src);
-    vml_call<UnaryOp>::run(src.functor(), dst.size(), src.nestedExpression().data(), dst.data() );
-  }
+ public:
+  enum { EnableVml = MightEnableVml && LargeEnough, Traversal = MightLinearize ? LinearTraversal : DefaultTraversal };
 };
 
-// Macroses
-
-#define EIGEN_MKL_VML_SPECIALIZE_ASSIGN(TRAVERSAL,UNROLLING) \
-  template<typename Derived1, typename Derived2, typename UnaryOp> \
-  struct assign_impl<Derived1, Eigen::CwiseUnaryOp<UnaryOp, Derived2>, TRAVERSAL, UNROLLING, Specialized>  {  \
-    static inline void run(Derived1 &dst, const Eigen::CwiseUnaryOp<UnaryOp, Derived2> &src) { \
-      vml_assign_impl<Derived1,Derived2,UnaryOp,TRAVERSAL,UNROLLING>::run(dst, src); \
-    } \
-  };
-
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(DefaultTraversal,InnerUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(InnerVectorizedTraversal,InnerUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearVectorizedTraversal,CompleteUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(LinearVectorizedTraversal,NoUnrolling)
-EIGEN_MKL_VML_SPECIALIZE_ASSIGN(SliceVectorizedTraversal,NoUnrolling)
-
-
-#if !defined (EIGEN_FAST_MATH) || (EIGEN_FAST_MATH != 1)
-#define  EIGEN_MKL_VML_MODE VML_HA
+#define EIGEN_PP_EXPAND(ARG) ARG
+#if !defined(EIGEN_FAST_MATH) || (EIGEN_FAST_MATH != 1)
+#define EIGEN_VMLMODE_EXPAND_xLA , VML_HA
 #else
-#define  EIGEN_MKL_VML_MODE VML_LA
+#define EIGEN_VMLMODE_EXPAND_xLA , VML_LA
 #endif
 
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE)     \
-  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > {               \
-    enum { IsSupported = 1 };                                                    \
-    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& /*func*/,        \
-                            int size, const EIGENTYPE* src, EIGENTYPE* dst) {    \
-      VMLOP(size, (const VMLTYPE*)src, (VMLTYPE*)dst);                           \
-    }                                                                            \
-  };
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE)  \
-  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > {               \
-    enum { IsSupported = 1 };                                                    \
-    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& /*func*/,        \
-                            int size, const EIGENTYPE* src, EIGENTYPE* dst) {    \
-      MKL_INT64 vmlMode = EIGEN_MKL_VML_MODE;                                    \
-      VMLOP(size, (const VMLTYPE*)src, (VMLTYPE*)dst, vmlMode);                  \
-    }                                                                            \
+#define EIGEN_VMLMODE_EXPAND_x_
+
+#define EIGEN_VMLMODE_PREFIX_xLA vm
+#define EIGEN_VMLMODE_PREFIX_x_ v
+#define EIGEN_VMLMODE_PREFIX(VMLMODE) EIGEN_CAT(EIGEN_VMLMODE_PREFIX_x, VMLMODE)
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                      \
+  template <typename DstXprType, typename SrcXprNested>                                                    \
+  struct Assignment<DstXprType, CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested>,              \
+                    assign_op<EIGENTYPE, EIGENTYPE>, Dense2Dense,                                          \
+                    std::enable_if_t<vml_assign_traits<DstXprType, SrcXprNested>::EnableVml>> {            \
+    typedef CwiseUnaryOp<scalar_##EIGENOP##_op<EIGENTYPE>, SrcXprNested> SrcXprType;                       \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE, EIGENTYPE> &func) { \
+      resize_if_allowed(dst, src, func);                                                                   \
+      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                  \
+      if (vml_assign_traits<DstXprType, SrcXprNested>::Traversal == (int)LinearTraversal) {                \
+        VMLOP(dst.size(), (const VMLTYPE *)src.nestedExpression().data(),                                  \
+              (VMLTYPE *)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE));                     \
+      } else {                                                                                             \
+        const Index outerSize = dst.outerSize();                                                           \
+        for (Index outer = 0; outer < outerSize; ++outer) {                                                \
+          const EIGENTYPE *src_ptr = src.IsRowMajor ? &(src.nestedExpression().coeffRef(outer, 0))         \
+                                                    : &(src.nestedExpression().coeffRef(0, outer));        \
+          EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer, 0)) : &(dst.coeffRef(0, outer));     \
+          VMLOP(dst.innerSize(), (const VMLTYPE *)src_ptr,                                                 \
+                (VMLTYPE *)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE));                      \
+        }                                                                                                  \
+      }                                                                                                    \
+    }                                                                                                      \
   };
 
-#define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE)       \
-  template<> struct vml_call< scalar_##EIGENOP##_op<EIGENTYPE> > {               \
-    enum { IsSupported = 1 };                                                    \
-    static inline void run( const scalar_##EIGENOP##_op<EIGENTYPE>& func,        \
-                          int size, const EIGENTYPE* src, EIGENTYPE* dst) {      \
-      EIGENTYPE exponent = func.m_exponent;                                      \
-      MKL_INT64 vmlMode = EIGEN_MKL_VML_MODE;                                    \
-      VMLOP(&size, (const VMLTYPE*)src, (const VMLTYPE*)&exponent,               \
-                        (VMLTYPE*)dst, &vmlMode);                                \
-    }                                                                            \
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP, VMLMODE)                                                \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE), s##VMLOP), float, float, VMLMODE) \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE), d##VMLOP), double, double, VMLMODE)
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(EIGENOP, VMLOP, VMLMODE)                                   \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE), c##VMLOP), scomplex, \
+                                   MKL_Complex8, VMLMODE)                                                 \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, EIGEN_CAT(EIGEN_VMLMODE_PREFIX(VMLMODE), z##VMLOP), dcomplex, \
+                                   MKL_Complex16, VMLMODE)
+
+#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS(EIGENOP, VMLOP, VMLMODE) \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP, VMLMODE)  \
+  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(EIGENOP, VMLOP, VMLMODE)
+
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sin, Sin, LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(asin, Asin, LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sinh, Sinh, LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(cos, Cos, LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(acos, Acos, LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(cosh, Cosh, LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(tan, Tan, LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(atan, Atan, LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(tanh, Tanh, LA)
+// EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs,   Abs,    _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(exp, Exp, LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(log, Ln, LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(log10, Log10, LA)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS(sqrt, Sqrt, _)
+
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(square, Sqr, _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_CPLX(arg, Arg, _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(round, Round, _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(floor, Floor, _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _)
+EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(cbrt, Cbrt, _)
+
+#define EIGEN_MKL_VML_DECLARE_POW_CALL(EIGENOP, VMLOP, EIGENTYPE, VMLTYPE, VMLMODE)                        \
+  template <typename DstXprType, typename SrcXprNested, typename Plain>                                    \
+  struct Assignment<DstXprType,                                                                            \
+                    CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE, EIGENTYPE>, SrcXprNested,               \
+                                  const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>, Plain>>,   \
+                    assign_op<EIGENTYPE, EIGENTYPE>, Dense2Dense,                                          \
+                    std::enable_if_t<vml_assign_traits<DstXprType, SrcXprNested>::EnableVml>> {            \
+    typedef CwiseBinaryOp<scalar_##EIGENOP##_op<EIGENTYPE, EIGENTYPE>, SrcXprNested,                       \
+                          const CwiseNullaryOp<internal::scalar_constant_op<EIGENTYPE>, Plain>>            \
+        SrcXprType;                                                                                        \
+    static void run(DstXprType &dst, const SrcXprType &src, const assign_op<EIGENTYPE, EIGENTYPE> &func) { \
+      resize_if_allowed(dst, src, func);                                                                   \
+      eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());                                  \
+      VMLTYPE exponent = reinterpret_cast<const VMLTYPE &>(src.rhs().functor().m_other);                   \
+      if (vml_assign_traits<DstXprType, SrcXprNested>::Traversal == LinearTraversal) {                     \
+        VMLOP(dst.size(), (const VMLTYPE *)src.lhs().data(), exponent,                                     \
+              (VMLTYPE *)dst.data() EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE));                     \
+      } else {                                                                                             \
+        const Index outerSize = dst.outerSize();                                                           \
+        for (Index outer = 0; outer < outerSize; ++outer) {                                                \
+          const EIGENTYPE *src_ptr =                                                                       \
+              src.IsRowMajor ? &(src.lhs().coeffRef(outer, 0)) : &(src.lhs().coeffRef(0, outer));          \
+          EIGENTYPE *dst_ptr = dst.IsRowMajor ? &(dst.coeffRef(outer, 0)) : &(dst.coeffRef(0, outer));     \
+          VMLOP(dst.innerSize(), (const VMLTYPE *)src_ptr, exponent,                                       \
+                (VMLTYPE *)dst_ptr EIGEN_PP_EXPAND(EIGEN_VMLMODE_EXPAND_x##VMLMODE));                      \
+        }                                                                                                  \
+      }                                                                                                    \
+    }                                                                                                      \
   };
 
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP)                   \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vs##VMLOP, float, float)             \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vd##VMLOP, double, double)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX(EIGENOP, VMLOP)                \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vc##VMLOP, scomplex, MKL_Complex8)   \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL(EIGENOP, vz##VMLOP, dcomplex, MKL_Complex16)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS(EIGENOP, VMLOP)                        \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(EIGENOP, VMLOP)                         \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX(EIGENOP, VMLOP)
-
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL_LA(EIGENOP, VMLOP)                \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vms##VMLOP, float, float)         \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmd##VMLOP, double, double)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX_LA(EIGENOP, VMLOP)             \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmc##VMLOP, scomplex, MKL_Complex8)  \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALL_LA(EIGENOP, vmz##VMLOP, dcomplex, MKL_Complex16)
-
-#define EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(EIGENOP, VMLOP)                     \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL_LA(EIGENOP, VMLOP)                      \
-  EIGEN_MKL_VML_DECLARE_UNARY_CALLS_COMPLEX_LA(EIGENOP, VMLOP)
-
-
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(sin,  Sin)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(asin, Asin)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(cos,  Cos)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(acos, Acos)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(tan,  Tan)
-//EIGEN_MKL_VML_DECLARE_UNARY_CALLS(abs,  Abs)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(exp,  Exp)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(log,  Ln)
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_LA(sqrt, Sqrt)
-
-EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(square, Sqr)
-
-// The vm*powx functions are not avaibale in the windows version of MKL.
-#ifndef _WIN32
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmspowx_, float, float)
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmdpowx_, double, double)
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmcpowx_, scomplex, MKL_Complex8)
-EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmzpowx_, dcomplex, MKL_Complex16)
-#endif
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmsPowx, float, float, LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmdPowx, double, double, LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmcPowx, scomplex, MKL_Complex8, LA)
+EIGEN_MKL_VML_DECLARE_POW_CALL(pow, vmzPowx, dcomplex, MKL_Complex16, LA)
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_ASSIGN_VML_H
+#endif  // EIGEN_ASSIGN_VML_H
diff --git a/inst/include/Eigen/src/Core/BandMatrix.h b/inst/include/Eigen/src/Core/BandMatrix.h
index ffd7fe8b..57b03229 100644
--- a/inst/include/Eigen/src/Core/BandMatrix.h
+++ b/inst/include/Eigen/src/Core/BandMatrix.h
@@ -10,325 +10,329 @@
 #ifndef EIGEN_BANDMATRIX_H
 #define EIGEN_BANDMATRIX_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
-template<typename Derived>
-class BandMatrixBase : public EigenBase<Derived>
-{
-  public:
+template <typename Derived>
+class BandMatrixBase : public EigenBase<Derived> {
+ public:
+  enum {
+    Flags = internal::traits<Derived>::Flags,
+    CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
+    RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+    ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+    MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
+    Supers = internal::traits<Derived>::Supers,
+    Subs = internal::traits<Derived>::Subs,
+    Options = internal::traits<Derived>::Options
+  };
+  typedef typename internal::traits<Derived>::Scalar Scalar;
+  typedef Matrix<Scalar, RowsAtCompileTime, ColsAtCompileTime> DenseMatrixType;
+  typedef typename DenseMatrixType::StorageIndex StorageIndex;
+  typedef typename internal::traits<Derived>::CoefficientsType CoefficientsType;
+  typedef EigenBase<Derived> Base;
 
-    enum {
-      Flags = internal::traits<Derived>::Flags,
-      CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
-      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
-      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
-      MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
-      Supers = internal::traits<Derived>::Supers,
-      Subs   = internal::traits<Derived>::Subs,
-      Options = internal::traits<Derived>::Options
-    };
-    typedef typename internal::traits<Derived>::Scalar Scalar;
-    typedef Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime> DenseMatrixType;
-    typedef typename DenseMatrixType::Index Index;
-    typedef typename internal::traits<Derived>::CoefficientsType CoefficientsType;
-    typedef EigenBase<Derived> Base;
+ protected:
+  enum {
+    DataRowsAtCompileTime = ((Supers != Dynamic) && (Subs != Dynamic)) ? 1 + Supers + Subs : Dynamic,
+    SizeAtCompileTime = min_size_prefer_dynamic(RowsAtCompileTime, ColsAtCompileTime)
+  };
 
-  protected:
+ public:
+  using Base::cols;
+  using Base::derived;
+  using Base::rows;
+
+  /** \returns the number of super diagonals */
+  inline Index supers() const { return derived().supers(); }
+
+  /** \returns the number of sub diagonals */
+  inline Index subs() const { return derived().subs(); }
+
+  /** \returns an expression of the underlying coefficient matrix */
+  inline const CoefficientsType& coeffs() const { return derived().coeffs(); }
+
+  /** \returns an expression of the underlying coefficient matrix */
+  inline CoefficientsType& coeffs() { return derived().coeffs(); }
+
+  /** \returns a vector expression of the \a i -th column,
+   * only the meaningful part is returned.
+   * \warning the internal storage must be column major. */
+  inline Block<CoefficientsType, Dynamic, 1> col(Index i) {
+    EIGEN_STATIC_ASSERT((int(Options) & int(RowMajor)) == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+    Index start = 0;
+    Index len = coeffs().rows();
+    if (i <= supers()) {
+      start = supers() - i;
+      len = (std::min)(rows(), std::max<Index>(0, coeffs().rows() - (supers() - i)));
+    } else if (i >= rows() - subs())
+      len = std::max<Index>(0, coeffs().rows() - (i + 1 - rows() + subs()));
+    return Block<CoefficientsType, Dynamic, 1>(coeffs(), start, i, len, 1);
+  }
+
+  /** \returns a vector expression of the main diagonal */
+  inline Block<CoefficientsType, 1, SizeAtCompileTime> diagonal() {
+    return Block<CoefficientsType, 1, SizeAtCompileTime>(coeffs(), supers(), 0, 1, (std::min)(rows(), cols()));
+  }
+
+  /** \returns a vector expression of the main diagonal (const version) */
+  inline const Block<const CoefficientsType, 1, SizeAtCompileTime> diagonal() const {
+    return Block<const CoefficientsType, 1, SizeAtCompileTime>(coeffs(), supers(), 0, 1, (std::min)(rows(), cols()));
+  }
+
+  template <int Index>
+  struct DiagonalIntReturnType {
     enum {
-      DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic))
-                            ? 1 + Supers + Subs
-                            : Dynamic,
-      SizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime,ColsAtCompileTime)
-    };
-
-  public:
-    
-    using Base::derived;
-    using Base::rows;
-    using Base::cols;
-
-    /** \returns the number of super diagonals */
-    inline Index supers() const { return derived().supers(); }
-
-    /** \returns the number of sub diagonals */
-    inline Index subs() const { return derived().subs(); }
-    
-    /** \returns an expression of the underlying coefficient matrix */
-    inline const CoefficientsType& coeffs() const { return derived().coeffs(); }
-    
-    /** \returns an expression of the underlying coefficient matrix */
-    inline CoefficientsType& coeffs() { return derived().coeffs(); }
-
-    /** \returns a vector expression of the \a i -th column,
-      * only the meaningful part is returned.
-      * \warning the internal storage must be column major. */
-    inline Block<CoefficientsType,Dynamic,1> col(Index i)
-    {
-      EIGEN_STATIC_ASSERT((Options&RowMajor)==0,THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
-      Index start = 0;
-      Index len = coeffs().rows();
-      if (i<=supers())
-      {
-        start = supers()-i;
-        len = (std::min)(rows(),std::max<Index>(0,coeffs().rows() - (supers()-i)));
-      }
-      else if (i>=rows()-subs())
-        len = std::max<Index>(0,coeffs().rows() - (i + 1 - rows() + subs()));
-      return Block<CoefficientsType,Dynamic,1>(coeffs(), start, i, len, 1);
-    }
-
-    /** \returns a vector expression of the main diagonal */
-    inline Block<CoefficientsType,1,SizeAtCompileTime> diagonal()
-    { return Block<CoefficientsType,1,SizeAtCompileTime>(coeffs(),supers(),0,1,(std::min)(rows(),cols())); }
-
-    /** \returns a vector expression of the main diagonal (const version) */
-    inline const Block<const CoefficientsType,1,SizeAtCompileTime> diagonal() const
-    { return Block<const CoefficientsType,1,SizeAtCompileTime>(coeffs(),supers(),0,1,(std::min)(rows(),cols())); }
-
-    template<int Index> struct DiagonalIntReturnType {
-      enum {
-        ReturnOpposite = (Options&SelfAdjoint) && (((Index)>0 && Supers==0) || ((Index)<0 && Subs==0)),
-        Conjugate = ReturnOpposite && NumTraits<Scalar>::IsComplex,
-        ActualIndex = ReturnOpposite ? -Index : Index,
-        DiagonalSize = (RowsAtCompileTime==Dynamic || ColsAtCompileTime==Dynamic)
-                     ? Dynamic
-                     : (ActualIndex<0
-                     ? EIGEN_SIZE_MIN_PREFER_DYNAMIC(ColsAtCompileTime, RowsAtCompileTime + ActualIndex)
-                     : EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime, ColsAtCompileTime - ActualIndex))
-      };
-      typedef Block<CoefficientsType,1, DiagonalSize> BuildType;
-      typedef typename internal::conditional<Conjugate,
-                 CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>,BuildType >,
-                 BuildType>::type Type;
+      ReturnOpposite =
+          (int(Options) & int(SelfAdjoint)) && (((Index) > 0 && Supers == 0) || ((Index) < 0 && Subs == 0)),
+      Conjugate = ReturnOpposite && NumTraits<Scalar>::IsComplex,
+      ActualIndex = ReturnOpposite ? -Index : Index,
+      DiagonalSize =
+          (RowsAtCompileTime == Dynamic || ColsAtCompileTime == Dynamic)
+              ? Dynamic
+              : (ActualIndex < 0 ? min_size_prefer_dynamic(ColsAtCompileTime, RowsAtCompileTime + ActualIndex)
+                                 : min_size_prefer_dynamic(RowsAtCompileTime, ColsAtCompileTime - ActualIndex))
     };
+    typedef Block<CoefficientsType, 1, DiagonalSize> BuildType;
+    typedef std::conditional_t<Conjugate, CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, BuildType>, BuildType>
+        Type;
+  };
 
-    /** \returns a vector expression of the \a N -th sub or super diagonal */
-    template<int N> inline typename DiagonalIntReturnType<N>::Type diagonal()
-    {
-      return typename DiagonalIntReturnType<N>::BuildType(coeffs(), supers()-N, (std::max)(0,N), 1, diagonalLength(N));
-    }
-
-    /** \returns a vector expression of the \a N -th sub or super diagonal */
-    template<int N> inline const typename DiagonalIntReturnType<N>::Type diagonal() const
-    {
-      return typename DiagonalIntReturnType<N>::BuildType(coeffs(), supers()-N, (std::max)(0,N), 1, diagonalLength(N));
-    }
-
-    /** \returns a vector expression of the \a i -th sub or super diagonal */
-    inline Block<CoefficientsType,1,Dynamic> diagonal(Index i)
-    {
-      eigen_assert((i<0 && -i<=subs()) || (i>=0 && i<=supers()));
-      return Block<CoefficientsType,1,Dynamic>(coeffs(), supers()-i, std::max<Index>(0,i), 1, diagonalLength(i));
-    }
-
-    /** \returns a vector expression of the \a i -th sub or super diagonal */
-    inline const Block<const CoefficientsType,1,Dynamic> diagonal(Index i) const
-    {
-      eigen_assert((i<0 && -i<=subs()) || (i>=0 && i<=supers()));
-      return Block<const CoefficientsType,1,Dynamic>(coeffs(), supers()-i, std::max<Index>(0,i), 1, diagonalLength(i));
-    }
-    
-    template<typename Dest> inline void evalTo(Dest& dst) const
-    {
-      dst.resize(rows(),cols());
-      dst.setZero();
-      dst.diagonal() = diagonal();
-      for (Index i=1; i<=supers();++i)
-        dst.diagonal(i) = diagonal(i);
-      for (Index i=1; i<=subs();++i)
-        dst.diagonal(-i) = diagonal(-i);
-    }
-
-    DenseMatrixType toDenseMatrix() const
-    {
-      DenseMatrixType res(rows(),cols());
-      evalTo(res);
-      return res;
-    }
-
-  protected:
-
-    inline Index diagonalLength(Index i) const
-    { return i<0 ? (std::min)(cols(),rows()+i) : (std::min)(rows(),cols()-i); }
+  /** \returns a vector expression of the \a N -th sub or super diagonal */
+  template <int N>
+  inline typename DiagonalIntReturnType<N>::Type diagonal() {
+    return typename DiagonalIntReturnType<N>::BuildType(coeffs(), supers() - N, (std::max)(0, N), 1, diagonalLength(N));
+  }
+
+  /** \returns a vector expression of the \a N -th sub or super diagonal */
+  template <int N>
+  inline const typename DiagonalIntReturnType<N>::Type diagonal() const {
+    return typename DiagonalIntReturnType<N>::BuildType(coeffs(), supers() - N, (std::max)(0, N), 1, diagonalLength(N));
+  }
+
+  /** \returns a vector expression of the \a i -th sub or super diagonal */
+  inline Block<CoefficientsType, 1, Dynamic> diagonal(Index i) {
+    eigen_assert((i < 0 && -i <= subs()) || (i >= 0 && i <= supers()));
+    return Block<CoefficientsType, 1, Dynamic>(coeffs(), supers() - i, std::max<Index>(0, i), 1, diagonalLength(i));
+  }
+
+  /** \returns a vector expression of the \a i -th sub or super diagonal */
+  inline const Block<const CoefficientsType, 1, Dynamic> diagonal(Index i) const {
+    eigen_assert((i < 0 && -i <= subs()) || (i >= 0 && i <= supers()));
+    return Block<const CoefficientsType, 1, Dynamic>(coeffs(), supers() - i, std::max<Index>(0, i), 1,
+                                                     diagonalLength(i));
+  }
+
+  template <typename Dest>
+  inline void evalTo(Dest& dst) const {
+    dst.resize(rows(), cols());
+    dst.setZero();
+    dst.diagonal() = diagonal();
+    for (Index i = 1; i <= supers(); ++i) dst.diagonal(i) = diagonal(i);
+    for (Index i = 1; i <= subs(); ++i) dst.diagonal(-i) = diagonal(-i);
+  }
+
+  DenseMatrixType toDenseMatrix() const {
+    DenseMatrixType res(rows(), cols());
+    evalTo(res);
+    return res;
+  }
+
+ protected:
+  inline Index diagonalLength(Index i) const {
+    return i < 0 ? (std::min)(cols(), rows() + i) : (std::min)(rows(), cols() - i);
+  }
 };
 
 /**
-  * \class BandMatrix
-  * \ingroup Core_Module
-  *
-  * \brief Represents a rectangular matrix with a banded storage
-  *
-  * \param _Scalar Numeric type, i.e. float, double, int
-  * \param Rows Number of rows, or \b Dynamic
-  * \param Cols Number of columns, or \b Dynamic
-  * \param Supers Number of super diagonal
-  * \param Subs Number of sub diagonal
-  * \param _Options A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint
-  *                 The former controls \ref TopicStorageOrders "storage order", and defaults to
-  *                 column-major. The latter controls whether the matrix represents a selfadjoint 
-  *                 matrix in which case either Supers of Subs have to be null.
-  *
-  * \sa class TridiagonalMatrix
-  */
-
-template<typename _Scalar, int _Rows, int _Cols, int _Supers, int _Subs, int _Options>
-struct traits<BandMatrix<_Scalar,_Rows,_Cols,_Supers,_Subs,_Options> >
-{
-  typedef _Scalar Scalar;
+ * \class BandMatrix
+ * \ingroup Core_Module
+ *
+ * \brief Represents a rectangular matrix with a banded storage
+ *
+ * \tparam Scalar_ Numeric type, i.e. float, double, int
+ * \tparam Rows_ Number of rows, or \b Dynamic
+ * \tparam Cols_ Number of columns, or \b Dynamic
+ * \tparam Supers_ Number of super diagonal
+ * \tparam Subs_ Number of sub diagonal
+ * \tparam Options_ A combination of either \b #RowMajor or \b #ColMajor, and of \b #SelfAdjoint
+ *                  The former controls \ref TopicStorageOrders "storage order", and defaults to
+ *                  column-major. The latter controls whether the matrix represents a selfadjoint
+ *                  matrix in which case either Supers of Subs have to be null.
+ *
+ * \sa class TridiagonalMatrix
+ */
+
+template <typename Scalar_, int Rows_, int Cols_, int Supers_, int Subs_, int Options_>
+struct traits<BandMatrix<Scalar_, Rows_, Cols_, Supers_, Subs_, Options_> > {
+  typedef Scalar_ Scalar;
   typedef Dense StorageKind;
-  typedef DenseIndex Index;
+  typedef Eigen::Index StorageIndex;
   enum {
     CoeffReadCost = NumTraits<Scalar>::ReadCost,
-    RowsAtCompileTime = _Rows,
-    ColsAtCompileTime = _Cols,
-    MaxRowsAtCompileTime = _Rows,
-    MaxColsAtCompileTime = _Cols,
+    RowsAtCompileTime = Rows_,
+    ColsAtCompileTime = Cols_,
+    MaxRowsAtCompileTime = Rows_,
+    MaxColsAtCompileTime = Cols_,
     Flags = LvalueBit,
-    Supers = _Supers,
-    Subs = _Subs,
-    Options = _Options,
-    DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic)) ? 1 + Supers + Subs : Dynamic
+    Supers = Supers_,
+    Subs = Subs_,
+    Options = Options_,
+    DataRowsAtCompileTime = ((Supers != Dynamic) && (Subs != Dynamic)) ? 1 + Supers + Subs : Dynamic
   };
-  typedef Matrix<Scalar,DataRowsAtCompileTime,ColsAtCompileTime,Options&RowMajor?RowMajor:ColMajor> CoefficientsType;
+  typedef Matrix<Scalar, DataRowsAtCompileTime, ColsAtCompileTime, int(Options) & int(RowMajor) ? RowMajor : ColMajor>
+      CoefficientsType;
 };
 
-template<typename _Scalar, int Rows, int Cols, int Supers, int Subs, int Options>
-class BandMatrix : public BandMatrixBase<BandMatrix<_Scalar,Rows,Cols,Supers,Subs,Options> >
-{
-  public:
+template <typename Scalar_, int Rows, int Cols, int Supers, int Subs, int Options>
+class BandMatrix : public BandMatrixBase<BandMatrix<Scalar_, Rows, Cols, Supers, Subs, Options> > {
+ public:
+  typedef typename internal::traits<BandMatrix>::Scalar Scalar;
+  typedef typename internal::traits<BandMatrix>::StorageIndex StorageIndex;
+  typedef typename internal::traits<BandMatrix>::CoefficientsType CoefficientsType;
 
-    typedef typename internal::traits<BandMatrix>::Scalar Scalar;
-    typedef typename internal::traits<BandMatrix>::Index Index;
-    typedef typename internal::traits<BandMatrix>::CoefficientsType CoefficientsType;
+  explicit inline BandMatrix(Index rows = Rows, Index cols = Cols, Index supers = Supers, Index subs = Subs)
+      : m_coeffs(1 + supers + subs, cols), m_rows(rows), m_supers(supers), m_subs(subs) {}
 
-    inline BandMatrix(Index rows=Rows, Index cols=Cols, Index supers=Supers, Index subs=Subs)
-      : m_coeffs(1+supers+subs,cols),
-        m_rows(rows), m_supers(supers), m_subs(subs)
-    {
-    }
+  /** \returns the number of columns */
+  constexpr Index rows() const { return m_rows.value(); }
 
-    /** \returns the number of columns */
-    inline Index rows() const { return m_rows.value(); }
+  /** \returns the number of rows */
+  constexpr Index cols() const { return m_coeffs.cols(); }
 
-    /** \returns the number of rows */
-    inline Index cols() const { return m_coeffs.cols(); }
+  /** \returns the number of super diagonals */
+  constexpr Index supers() const { return m_supers.value(); }
 
-    /** \returns the number of super diagonals */
-    inline Index supers() const { return m_supers.value(); }
+  /** \returns the number of sub diagonals */
+  constexpr Index subs() const { return m_subs.value(); }
 
-    /** \returns the number of sub diagonals */
-    inline Index subs() const { return m_subs.value(); }
+  inline const CoefficientsType& coeffs() const { return m_coeffs; }
+  inline CoefficientsType& coeffs() { return m_coeffs; }
 
-    inline const CoefficientsType& coeffs() const { return m_coeffs; }
-    inline CoefficientsType& coeffs() { return m_coeffs; }
-
-  protected:
-
-    CoefficientsType m_coeffs;
-    internal::variable_if_dynamic<Index, Rows>   m_rows;
-    internal::variable_if_dynamic<Index, Supers> m_supers;
-    internal::variable_if_dynamic<Index, Subs>   m_subs;
+ protected:
+  CoefficientsType m_coeffs;
+  internal::variable_if_dynamic<Index, Rows> m_rows;
+  internal::variable_if_dynamic<Index, Supers> m_supers;
+  internal::variable_if_dynamic<Index, Subs> m_subs;
 };
 
-template<typename _CoefficientsType,int _Rows, int _Cols, int _Supers, int _Subs,int _Options>
+template <typename CoefficientsType_, int Rows_, int Cols_, int Supers_, int Subs_, int Options_>
 class BandMatrixWrapper;
 
-template<typename _CoefficientsType,int _Rows, int _Cols, int _Supers, int _Subs,int _Options>
-struct traits<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Options> >
-{
-  typedef typename _CoefficientsType::Scalar Scalar;
-  typedef typename _CoefficientsType::StorageKind StorageKind;
-  typedef typename _CoefficientsType::Index Index;
+template <typename CoefficientsType_, int Rows_, int Cols_, int Supers_, int Subs_, int Options_>
+struct traits<BandMatrixWrapper<CoefficientsType_, Rows_, Cols_, Supers_, Subs_, Options_> > {
+  typedef typename CoefficientsType_::Scalar Scalar;
+  typedef typename CoefficientsType_::StorageKind StorageKind;
+  typedef typename CoefficientsType_::StorageIndex StorageIndex;
   enum {
-    CoeffReadCost = internal::traits<_CoefficientsType>::CoeffReadCost,
-    RowsAtCompileTime = _Rows,
-    ColsAtCompileTime = _Cols,
-    MaxRowsAtCompileTime = _Rows,
-    MaxColsAtCompileTime = _Cols,
+    CoeffReadCost = internal::traits<CoefficientsType_>::CoeffReadCost,
+    RowsAtCompileTime = Rows_,
+    ColsAtCompileTime = Cols_,
+    MaxRowsAtCompileTime = Rows_,
+    MaxColsAtCompileTime = Cols_,
     Flags = LvalueBit,
-    Supers = _Supers,
-    Subs = _Subs,
-    Options = _Options,
-    DataRowsAtCompileTime = ((Supers!=Dynamic) && (Subs!=Dynamic)) ? 1 + Supers + Subs : Dynamic
+    Supers = Supers_,
+    Subs = Subs_,
+    Options = Options_,
+    DataRowsAtCompileTime = ((Supers != Dynamic) && (Subs != Dynamic)) ? 1 + Supers + Subs : Dynamic
   };
-  typedef _CoefficientsType CoefficientsType;
+  typedef CoefficientsType_ CoefficientsType;
 };
 
-template<typename _CoefficientsType,int _Rows, int _Cols, int _Supers, int _Subs,int _Options>
-class BandMatrixWrapper : public BandMatrixBase<BandMatrixWrapper<_CoefficientsType,_Rows,_Cols,_Supers,_Subs,_Options> >
-{
-  public:
+template <typename CoefficientsType_, int Rows_, int Cols_, int Supers_, int Subs_, int Options_>
+class BandMatrixWrapper
+    : public BandMatrixBase<BandMatrixWrapper<CoefficientsType_, Rows_, Cols_, Supers_, Subs_, Options_> > {
+ public:
+  typedef typename internal::traits<BandMatrixWrapper>::Scalar Scalar;
+  typedef typename internal::traits<BandMatrixWrapper>::CoefficientsType CoefficientsType;
+  typedef typename internal::traits<BandMatrixWrapper>::StorageIndex StorageIndex;
 
-    typedef typename internal::traits<BandMatrixWrapper>::Scalar Scalar;
-    typedef typename internal::traits<BandMatrixWrapper>::CoefficientsType CoefficientsType;
-    typedef typename internal::traits<BandMatrixWrapper>::Index Index;
+  explicit inline BandMatrixWrapper(const CoefficientsType& coeffs, Index rows = Rows_, Index cols = Cols_,
+                                    Index supers = Supers_, Index subs = Subs_)
+      : m_coeffs(coeffs), m_rows(rows), m_supers(supers), m_subs(subs) {
+    EIGEN_UNUSED_VARIABLE(cols);
+    // eigen_assert(coeffs.cols()==cols() && (supers()+subs()+1)==coeffs.rows());
+  }
 
-    inline BandMatrixWrapper(const CoefficientsType& coeffs, Index rows=_Rows, Index cols=_Cols, Index supers=_Supers, Index subs=_Subs)
-      : m_coeffs(coeffs),
-        m_rows(rows), m_supers(supers), m_subs(subs)
-    {
-      EIGEN_UNUSED_VARIABLE(cols);
-      //internal::assert(coeffs.cols()==cols() && (supers()+subs()+1)==coeffs.rows());
-    }
+  /** \returns the number of columns */
+  constexpr Index rows() const { return m_rows.value(); }
 
-    /** \returns the number of columns */
-    inline Index rows() const { return m_rows.value(); }
+  /** \returns the number of rows */
+  constexpr Index cols() const { return m_coeffs.cols(); }
 
-    /** \returns the number of rows */
-    inline Index cols() const { return m_coeffs.cols(); }
+  /** \returns the number of super diagonals */
+  constexpr Index supers() const { return m_supers.value(); }
 
-    /** \returns the number of super diagonals */
-    inline Index supers() const { return m_supers.value(); }
+  /** \returns the number of sub diagonals */
+  constexpr Index subs() const { return m_subs.value(); }
 
-    /** \returns the number of sub diagonals */
-    inline Index subs() const { return m_subs.value(); }
+  inline const CoefficientsType& coeffs() const { return m_coeffs; }
 
-    inline const CoefficientsType& coeffs() const { return m_coeffs; }
+ protected:
+  const CoefficientsType& m_coeffs;
+  internal::variable_if_dynamic<Index, Rows_> m_rows;
+  internal::variable_if_dynamic<Index, Supers_> m_supers;
+  internal::variable_if_dynamic<Index, Subs_> m_subs;
+};
 
-  protected:
+/**
+ * \class TridiagonalMatrix
+ * \ingroup Core_Module
+ *
+ * \brief Represents a tridiagonal matrix with a compact banded storage
+ *
+ * \tparam Scalar Numeric type, i.e. float, double, int
+ * \tparam Size Number of rows and cols, or \b Dynamic
+ * \tparam Options Can be 0 or \b SelfAdjoint
+ *
+ * \sa class BandMatrix
+ */
+template <typename Scalar, int Size, int Options>
+class TridiagonalMatrix : public BandMatrix<Scalar, Size, Size, Options & SelfAdjoint ? 0 : 1, 1, Options | RowMajor> {
+  typedef BandMatrix<Scalar, Size, Size, Options & SelfAdjoint ? 0 : 1, 1, Options | RowMajor> Base;
+  typedef typename Base::StorageIndex StorageIndex;
+
+ public:
+  explicit TridiagonalMatrix(Index size = Size) : Base(size, size, Options & SelfAdjoint ? 0 : 1, 1) {}
+
+  inline typename Base::template DiagonalIntReturnType<1>::Type super() { return Base::template diagonal<1>(); }
+  inline const typename Base::template DiagonalIntReturnType<1>::Type super() const {
+    return Base::template diagonal<1>();
+  }
+  inline typename Base::template DiagonalIntReturnType<-1>::Type sub() { return Base::template diagonal<-1>(); }
+  inline const typename Base::template DiagonalIntReturnType<-1>::Type sub() const {
+    return Base::template diagonal<-1>();
+  }
+
+ protected:
+};
 
-    const CoefficientsType& m_coeffs;
-    internal::variable_if_dynamic<Index, _Rows>   m_rows;
-    internal::variable_if_dynamic<Index, _Supers> m_supers;
-    internal::variable_if_dynamic<Index, _Subs>   m_subs;
+struct BandShape {};
+
+template <typename Scalar_, int Rows_, int Cols_, int Supers_, int Subs_, int Options_>
+struct evaluator_traits<BandMatrix<Scalar_, Rows_, Cols_, Supers_, Subs_, Options_> >
+    : public evaluator_traits_base<BandMatrix<Scalar_, Rows_, Cols_, Supers_, Subs_, Options_> > {
+  typedef BandShape Shape;
 };
 
-/**
-  * \class TridiagonalMatrix
-  * \ingroup Core_Module
-  *
-  * \brief Represents a tridiagonal matrix with a compact banded storage
-  *
-  * \param _Scalar Numeric type, i.e. float, double, int
-  * \param Size Number of rows and cols, or \b Dynamic
-  * \param _Options Can be 0 or \b SelfAdjoint
-  *
-  * \sa class BandMatrix
-  */
-template<typename Scalar, int Size, int Options>
-class TridiagonalMatrix : public BandMatrix<Scalar,Size,Size,Options&SelfAdjoint?0:1,1,Options|RowMajor>
-{
-    typedef BandMatrix<Scalar,Size,Size,Options&SelfAdjoint?0:1,1,Options|RowMajor> Base;
-    typedef typename Base::Index Index;
-  public:
-    TridiagonalMatrix(Index size = Size) : Base(size,size,Options&SelfAdjoint?0:1,1) {}
-
-    inline typename Base::template DiagonalIntReturnType<1>::Type super()
-    { return Base::template diagonal<1>(); }
-    inline const typename Base::template DiagonalIntReturnType<1>::Type super() const
-    { return Base::template diagonal<1>(); }
-    inline typename Base::template DiagonalIntReturnType<-1>::Type sub()
-    { return Base::template diagonal<-1>(); }
-    inline const typename Base::template DiagonalIntReturnType<-1>::Type sub() const
-    { return Base::template diagonal<-1>(); }
-  protected:
+template <typename CoefficientsType_, int Rows_, int Cols_, int Supers_, int Subs_, int Options_>
+struct evaluator_traits<BandMatrixWrapper<CoefficientsType_, Rows_, Cols_, Supers_, Subs_, Options_> >
+    : public evaluator_traits_base<BandMatrixWrapper<CoefficientsType_, Rows_, Cols_, Supers_, Subs_, Options_> > {
+  typedef BandShape Shape;
+};
+
+template <>
+struct AssignmentKind<DenseShape, BandShape> {
+  typedef EigenBase2EigenBase Kind;
 };
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_BANDMATRIX_H
+#endif  // EIGEN_BANDMATRIX_H
diff --git a/inst/include/Eigen/src/Core/Block.h b/inst/include/Eigen/src/Core/Block.h
index 82789444..39abff71 100644
--- a/inst/include/Eigen/src/Core/Block.h
+++ b/inst/include/Eigen/src/Core/Block.h
@@ -11,396 +11,419 @@
 #ifndef EIGEN_BLOCK_H
 #define EIGEN_BLOCK_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-/** \class Block
-  * \ingroup Core_Module
-  *
-  * \brief Expression of a fixed-size or dynamic-size block
-  *
-  * \param XprType the type of the expression in which we are taking a block
-  * \param BlockRows the number of rows of the block we are taking at compile time (optional)
-  * \param BlockCols the number of columns of the block we are taking at compile time (optional)
-  *
-  * This class represents an expression of either a fixed-size or dynamic-size block. It is the return
-  * type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and
-  * most of the time this is the only way it is used.
-  *
-  * However, if you want to directly maniputate block expressions,
-  * for instance if you want to write a function returning such an expression, you
-  * will need to use this class.
-  *
-  * Here is an example illustrating the dynamic case:
-  * \include class_Block.cpp
-  * Output: \verbinclude class_Block.out
-  *
-  * \note Even though this expression has dynamic size, in the case where \a XprType
-  * has fixed size, this expression inherits a fixed maximal size which means that evaluating
-  * it does not cause a dynamic memory allocation.
-  *
-  * Here is an example illustrating the fixed-size case:
-  * \include class_FixedBlock.cpp
-  * Output: \verbinclude class_FixedBlock.out
-  *
-  * \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock
-  */
+namespace Eigen {
 
 namespace internal {
-template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
-struct traits<Block<XprType, BlockRows, BlockCols, InnerPanel> > : traits<XprType>
-{
-  typedef typename traits<XprType>::Scalar Scalar;
-  typedef typename traits<XprType>::StorageKind StorageKind;
-  typedef typename traits<XprType>::XprKind XprKind;
-  typedef typename nested<XprType>::type XprTypeNested;
-  typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
-  enum{
-    MatrixRows = traits<XprType>::RowsAtCompileTime,
-    MatrixCols = traits<XprType>::ColsAtCompileTime,
+template <typename XprType_, int BlockRows, int BlockCols, bool InnerPanel_>
+struct traits<Block<XprType_, BlockRows, BlockCols, InnerPanel_>> : traits<XprType_> {
+  typedef typename traits<XprType_>::Scalar Scalar;
+  typedef typename traits<XprType_>::StorageKind StorageKind;
+  typedef typename traits<XprType_>::XprKind XprKind;
+  typedef typename ref_selector<XprType_>::type XprTypeNested;
+  typedef std::remove_reference_t<XprTypeNested> XprTypeNested_;
+  enum {
+    MatrixRows = traits<XprType_>::RowsAtCompileTime,
+    MatrixCols = traits<XprType_>::ColsAtCompileTime,
     RowsAtCompileTime = MatrixRows == 0 ? 0 : BlockRows,
     ColsAtCompileTime = MatrixCols == 0 ? 0 : BlockCols,
-    MaxRowsAtCompileTime = BlockRows==0 ? 0
-                         : RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime)
-                         : int(traits<XprType>::MaxRowsAtCompileTime),
-    MaxColsAtCompileTime = BlockCols==0 ? 0
-                         : ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime)
-                         : int(traits<XprType>::MaxColsAtCompileTime),
-    XprTypeIsRowMajor = (int(traits<XprType>::Flags)&RowMajorBit) != 0,
-    IsDense = is_same<StorageKind,Dense>::value,
-    IsRowMajor = (IsDense&&MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
-               : (IsDense&&MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
-               : XprTypeIsRowMajor,
+    MaxRowsAtCompileTime = BlockRows == 0                 ? 0
+                           : RowsAtCompileTime != Dynamic ? int(RowsAtCompileTime)
+                                                          : int(traits<XprType_>::MaxRowsAtCompileTime),
+    MaxColsAtCompileTime = BlockCols == 0                 ? 0
+                           : ColsAtCompileTime != Dynamic ? int(ColsAtCompileTime)
+                                                          : int(traits<XprType_>::MaxColsAtCompileTime),
+
+    XprTypeIsRowMajor = (int(traits<XprType_>::Flags) & RowMajorBit) != 0,
+    IsRowMajor = (MaxRowsAtCompileTime == 1 && MaxColsAtCompileTime != 1)   ? 1
+                 : (MaxColsAtCompileTime == 1 && MaxRowsAtCompileTime != 1) ? 0
+                                                                            : XprTypeIsRowMajor,
     HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor),
     InnerSize = IsRowMajor ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
-    InnerStrideAtCompileTime = HasSameStorageOrderAsXprType
-                             ? int(inner_stride_at_compile_time<XprType>::ret)
-                             : int(outer_stride_at_compile_time<XprType>::ret),
-    OuterStrideAtCompileTime = HasSameStorageOrderAsXprType
-                             ? int(outer_stride_at_compile_time<XprType>::ret)
-                             : int(inner_stride_at_compile_time<XprType>::ret),
-    MaskPacketAccessBit = (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0)
-                       && (InnerStrideAtCompileTime == 1)
-                        ? PacketAccessBit : 0,
-    MaskAlignedBit = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16) == 0)) ? AlignedBit : 0,
-    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (traits<XprType>::Flags&LinearAccessBit))) ? LinearAccessBit : 0,
-    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
+    InnerStrideAtCompileTime = HasSameStorageOrderAsXprType ? int(inner_stride_at_compile_time<XprType_>::ret)
+                                                            : int(outer_stride_at_compile_time<XprType_>::ret),
+    OuterStrideAtCompileTime = HasSameStorageOrderAsXprType ? int(outer_stride_at_compile_time<XprType_>::ret)
+                                                            : int(inner_stride_at_compile_time<XprType_>::ret),
+
+    // FIXME, this traits is rather specialized for dense object and it needs to be cleaned further
+    FlagsLvalueBit = is_lvalue<XprType_>::value ? LvalueBit : 0,
     FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
-    Flags0 = traits<XprType>::Flags & ( (HereditaryBits & ~RowMajorBit) |
-                                        DirectAccessBit |
-                                        MaskPacketAccessBit |
-                                        MaskAlignedBit),
-    Flags = Flags0 | FlagsLinearAccessBit | FlagsLvalueBit | FlagsRowMajorBit
+    Flags = (traits<XprType_>::Flags & (DirectAccessBit | (InnerPanel_ ? CompressedAccessBit : 0))) | FlagsLvalueBit |
+            FlagsRowMajorBit,
+    // FIXME DirectAccessBit should not be handled by expressions
+    //
+    // Alignment is needed by MapBase's assertions
+    // We can sefely set it to false here. Internal alignment errors will be detected by an eigen_internal_assert in the
+    // respective evaluator
+    Alignment = 0,
+    InnerPanel = InnerPanel_ ? 1 : 0
   };
 };
 
-template<typename XprType, int BlockRows=Dynamic, int BlockCols=Dynamic, bool InnerPanel = false,
-         bool HasDirectAccess = internal::has_direct_access<XprType>::ret> class BlockImpl_dense;
-         
-} // end namespace internal
-
-template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, typename StorageKind> class BlockImpl;
-
-template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel> class Block
-  : public BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, typename internal::traits<XprType>::StorageKind>
-{
-    typedef BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, typename internal::traits<XprType>::StorageKind> Impl;
-  public:
-    //typedef typename Impl::Base Base;
-    typedef Impl Base;
-    EIGEN_GENERIC_PUBLIC_INTERFACE(Block)
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block)
-  
-    /** Column or Row constructor
-      */
-    inline Block(XprType& xpr, Index i) : Impl(xpr,i)
-    {
-      eigen_assert( (i>=0) && (
-          ((BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) && i<xpr.rows())
-        ||((BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) && i<xpr.cols())));
-    }
-
-    /** Fixed-size constructor
-      */
-    inline Block(XprType& xpr, Index a_startRow, Index a_startCol)
-      : Impl(xpr, a_startRow, a_startCol)
-    {
-      EIGEN_STATIC_ASSERT(RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic,THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
-      eigen_assert(a_startRow >= 0 && BlockRows >= 1 && a_startRow + BlockRows <= xpr.rows()
-             && a_startCol >= 0 && BlockCols >= 1 && a_startCol + BlockCols <= xpr.cols());
-    }
-
-    /** Dynamic-size constructor
-      */
-    inline Block(XprType& xpr,
-          Index a_startRow, Index a_startCol,
-          Index blockRows, Index blockCols)
-      : Impl(xpr, a_startRow, a_startCol, blockRows, blockCols)
-    {
-      eigen_assert((RowsAtCompileTime==Dynamic || RowsAtCompileTime==blockRows)
-          && (ColsAtCompileTime==Dynamic || ColsAtCompileTime==blockCols));
-      eigen_assert(a_startRow >= 0 && blockRows >= 0 && a_startRow  <= xpr.rows() - blockRows
-          && a_startCol >= 0 && blockCols >= 0 && a_startCol <= xpr.cols() - blockCols);
-    }
+template <typename XprType, int BlockRows = Dynamic, int BlockCols = Dynamic, bool InnerPanel = false,
+          bool HasDirectAccess = internal::has_direct_access<XprType>::ret>
+class BlockImpl_dense;
+
+}  // end namespace internal
+
+template <typename XprType, int BlockRows, int BlockCols, bool InnerPanel, typename StorageKind>
+class BlockImpl;
+
+/** \class Block
+ * \ingroup Core_Module
+ *
+ * \brief Expression of a fixed-size or dynamic-size block
+ *
+ * \tparam XprType the type of the expression in which we are taking a block
+ * \tparam BlockRows the number of rows of the block we are taking at compile time (optional)
+ * \tparam BlockCols the number of columns of the block we are taking at compile time (optional)
+ * \tparam InnerPanel is true, if the block maps to a set of rows of a row major matrix or
+ *         to set of columns of a column major matrix (optional). The parameter allows to determine
+ *         at compile time whether aligned access is possible on the block expression.
+ *
+ * This class represents an expression of either a fixed-size or dynamic-size block. It is the return
+ * type of DenseBase::block(Index,Index,Index,Index) and DenseBase::block<int,int>(Index,Index) and
+ * most of the time this is the only way it is used.
+ *
+ * However, if you want to directly manipulate block expressions,
+ * for instance if you want to write a function returning such an expression, you
+ * will need to use this class.
+ *
+ * Here is an example illustrating the dynamic case:
+ * \include class_Block.cpp
+ * Output: \verbinclude class_Block.out
+ *
+ * \note Even though this expression has dynamic size, in the case where \a XprType
+ * has fixed size, this expression inherits a fixed maximal size which means that evaluating
+ * it does not cause a dynamic memory allocation.
+ *
+ * Here is an example illustrating the fixed-size case:
+ * \include class_FixedBlock.cpp
+ * Output: \verbinclude class_FixedBlock.out
+ *
+ * \sa DenseBase::block(Index,Index,Index,Index), DenseBase::block(Index,Index), class VectorBlock
+ */
+template <typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
+class Block
+    : public BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, typename internal::traits<XprType>::StorageKind> {
+  typedef BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, typename internal::traits<XprType>::StorageKind> Impl;
+  using BlockHelper = internal::block_xpr_helper<Block>;
+
+ public:
+  // typedef typename Impl::Base Base;
+  typedef Impl Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(Block)
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Block)
+
+  typedef internal::remove_all_t<XprType> NestedExpression;
+
+  /** Column or Row constructor
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Block(XprType& xpr, Index i) : Impl(xpr, i) {
+    eigen_assert((i >= 0) && (((BlockRows == 1) && (BlockCols == XprType::ColsAtCompileTime) && i < xpr.rows()) ||
+                              ((BlockRows == XprType::RowsAtCompileTime) && (BlockCols == 1) && i < xpr.cols())));
+  }
+
+  /** Fixed-size constructor
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Block(XprType& xpr, Index startRow, Index startCol)
+      : Impl(xpr, startRow, startCol) {
+    EIGEN_STATIC_ASSERT(RowsAtCompileTime != Dynamic && ColsAtCompileTime != Dynamic,
+                        THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
+    eigen_assert(startRow >= 0 && BlockRows >= 0 && startRow + BlockRows <= xpr.rows() && startCol >= 0 &&
+                 BlockCols >= 0 && startCol + BlockCols <= xpr.cols());
+  }
+
+  /** Dynamic-size constructor
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Block(XprType& xpr, Index startRow, Index startCol, Index blockRows,
+                                              Index blockCols)
+      : Impl(xpr, startRow, startCol, blockRows, blockCols) {
+    eigen_assert((RowsAtCompileTime == Dynamic || RowsAtCompileTime == blockRows) &&
+                 (ColsAtCompileTime == Dynamic || ColsAtCompileTime == blockCols));
+    eigen_assert(startRow >= 0 && blockRows >= 0 && startRow <= xpr.rows() - blockRows && startCol >= 0 &&
+                 blockCols >= 0 && startCol <= xpr.cols() - blockCols);
+  }
+
+  // convert nested blocks (e.g. Block<Block<MatrixType>>) to a simple block expression (Block<MatrixType>)
+
+  using ConstUnwindReturnType = Block<const typename BlockHelper::BaseType, BlockRows, BlockCols, InnerPanel>;
+  using UnwindReturnType = Block<typename BlockHelper::BaseType, BlockRows, BlockCols, InnerPanel>;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ConstUnwindReturnType unwind() const {
+    return ConstUnwindReturnType(BlockHelper::base(*this), BlockHelper::row(*this, 0), BlockHelper::col(*this, 0),
+                                 this->rows(), this->cols());
+  }
+
+  template <typename T = Block, typename EnableIf = std::enable_if_t<!std::is_const<T>::value>>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UnwindReturnType unwind() {
+    return UnwindReturnType(BlockHelper::base(*this), BlockHelper::row(*this, 0), BlockHelper::col(*this, 0),
+                            this->rows(), this->cols());
+  }
 };
-         
-// The generic default implementation for dense block simplu forward to the internal::BlockImpl_dense
+
+// The generic default implementation for dense block simply forward to the internal::BlockImpl_dense
 // that must be specialized for direct and non-direct access...
-template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
+template <typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
 class BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, Dense>
-  : public internal::BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel>
-{
-    typedef internal::BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel> Impl;
-    typedef typename XprType::Index Index;
-  public:
-    typedef Impl Base;
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
-    inline BlockImpl(XprType& xpr, Index i) : Impl(xpr,i) {}
-    inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol) : Impl(xpr, a_startRow, a_startCol) {}
-    inline BlockImpl(XprType& xpr, Index a_startRow, Index a_startCol, Index blockRows, Index blockCols)
-      : Impl(xpr, a_startRow, a_startCol, blockRows, blockCols) {}
+    : public internal::BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel> {
+  typedef internal::BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel> Impl;
+  typedef typename XprType::StorageIndex StorageIndex;
+
+ public:
+  typedef Impl Base;
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index i) : Impl(xpr, i) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol)
+      : Impl(xpr, startRow, startCol) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows,
+                                                  Index blockCols)
+      : Impl(xpr, startRow, startCol, blockRows, blockCols) {}
 };
 
 namespace internal {
 
 /** \internal Internal implementation of dense Blocks in the general case. */
-template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool HasDirectAccess> class BlockImpl_dense
-  : public internal::dense_xpr_base<Block<XprType, BlockRows, BlockCols, InnerPanel> >::type
-{
-    typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
-  public:
+template <typename XprType, int BlockRows, int BlockCols, bool InnerPanel, bool HasDirectAccess>
+class BlockImpl_dense : public internal::dense_xpr_base<Block<XprType, BlockRows, BlockCols, InnerPanel>>::type {
+  typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+  typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested;
 
-    typedef typename internal::dense_xpr_base<BlockType>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(BlockType)
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl_dense)
+ public:
+  typedef typename internal::dense_xpr_base<BlockType>::type Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(BlockType)
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl_dense)
 
-    class InnerIterator;
+  // class InnerIterator; // FIXME apparently never used
 
-    /** Column or Row constructor
-      */
-    inline BlockImpl_dense(XprType& xpr, Index i)
+  /** Column or Row constructor
+   */
+  EIGEN_DEVICE_FUNC inline BlockImpl_dense(XprType& xpr, Index i)
       : m_xpr(xpr),
         // It is a row if and only if BlockRows==1 and BlockCols==XprType::ColsAtCompileTime,
         // and it is a column if and only if BlockRows==XprType::RowsAtCompileTime and BlockCols==1,
         // all other cases are invalid.
         // The case a 1x1 matrix seems ambiguous, but the result is the same anyway.
-        m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0),
-        m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0),
-        m_blockRows(BlockRows==1 ? 1 : xpr.rows()),
-        m_blockCols(BlockCols==1 ? 1 : xpr.cols())
-    {}
-
-    /** Fixed-size constructor
-      */
-    inline BlockImpl_dense(XprType& xpr, Index a_startRow, Index a_startCol)
-      : m_xpr(xpr), m_startRow(a_startRow), m_startCol(a_startCol),
-                    m_blockRows(BlockRows), m_blockCols(BlockCols)
-    {}
-
-    /** Dynamic-size constructor
-      */
-    inline BlockImpl_dense(XprType& xpr,
-          Index a_startRow, Index a_startCol,
-          Index blockRows, Index blockCols)
-      : m_xpr(xpr), m_startRow(a_startRow), m_startCol(a_startCol),
-                    m_blockRows(blockRows), m_blockCols(blockCols)
-    {}
-
-    inline Index rows() const { return m_blockRows.value(); }
-    inline Index cols() const { return m_blockCols.value(); }
-
-    inline Scalar& coeffRef(Index rowId, Index colId)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(XprType)
-      return m_xpr.const_cast_derived()
-               .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
-    }
-
-    inline const Scalar& coeffRef(Index rowId, Index colId) const
-    {
-      return m_xpr.derived()
-               .coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
-    }
-
-    EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const
-    {
-      return m_xpr.coeff(rowId + m_startRow.value(), colId + m_startCol.value());
-    }
-
-    inline Scalar& coeffRef(Index index)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(XprType)
-      return m_xpr.const_cast_derived()
-             .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-                       m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
-    }
-
-    inline const Scalar& coeffRef(Index index) const
-    {
-      return m_xpr.const_cast_derived()
-             .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+        m_startRow((BlockRows == 1) && (BlockCols == XprType::ColsAtCompileTime) ? i : 0),
+        m_startCol((BlockRows == XprType::RowsAtCompileTime) && (BlockCols == 1) ? i : 0),
+        m_blockRows(BlockRows == 1 ? 1 : xpr.rows()),
+        m_blockCols(BlockCols == 1 ? 1 : xpr.cols()) {}
+
+  /** Fixed-size constructor
+   */
+  EIGEN_DEVICE_FUNC inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
+      : m_xpr(xpr), m_startRow(startRow), m_startCol(startCol), m_blockRows(BlockRows), m_blockCols(BlockCols) {}
+
+  /** Dynamic-size constructor
+   */
+  EIGEN_DEVICE_FUNC inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol, Index blockRows,
+                                           Index blockCols)
+      : m_xpr(xpr), m_startRow(startRow), m_startCol(startCol), m_blockRows(blockRows), m_blockCols(blockCols) {}
+
+  EIGEN_DEVICE_FUNC inline Index rows() const { return m_blockRows.value(); }
+  EIGEN_DEVICE_FUNC inline Index cols() const { return m_blockCols.value(); }
+
+  EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index rowId, Index colId) {
+    EIGEN_STATIC_ASSERT_LVALUE(XprType)
+    return m_xpr.coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
+  }
+
+  EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index rowId, Index colId) const {
+    return m_xpr.derived().coeffRef(rowId + m_startRow.value(), colId + m_startCol.value());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const {
+    return m_xpr.coeff(rowId + m_startRow.value(), colId + m_startCol.value());
+  }
+
+  EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index) {
+    EIGEN_STATIC_ASSERT_LVALUE(XprType)
+    return m_xpr.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                          m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+  }
+
+  EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const {
+    return m_xpr.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                          m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+  }
+
+  EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const {
+    return m_xpr.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
                        m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
-    }
-
-    inline const CoeffReturnType coeff(Index index) const
-    {
-      return m_xpr
-             .coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-                    m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
-    }
-
-    template<int LoadMode>
-    inline PacketScalar packet(Index rowId, Index colId) const
-    {
-      return m_xpr.template packet<Unaligned>
-              (rowId + m_startRow.value(), colId + m_startCol.value());
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
-    {
-      m_xpr.const_cast_derived().template writePacket<Unaligned>
-              (rowId + m_startRow.value(), colId + m_startCol.value(), val);
-    }
-
-    template<int LoadMode>
-    inline PacketScalar packet(Index index) const
-    {
-      return m_xpr.template packet<Unaligned>
-              (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-               m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& val)
-    {
-      m_xpr.const_cast_derived().template writePacket<Unaligned>
-         (m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-          m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0), val);
-    }
-
-    #ifdef EIGEN_PARSED_BY_DOXYGEN
-    /** \sa MapBase::data() */
-    inline const Scalar* data() const;
-    inline Index innerStride() const;
-    inline Index outerStride() const;
-    #endif
-
-    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const 
-    { 
-      return m_xpr; 
-    }
-      
-    Index startRow() const 
-    { 
-      return m_startRow.value(); 
-    }
-      
-    Index startCol() const 
-    { 
-      return m_startCol.value(); 
-    }
-
-  protected:
-
-    const typename XprType::Nested m_xpr;
-    const internal::variable_if_dynamic<Index, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
-    const internal::variable_if_dynamic<Index, XprType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
-    const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_blockRows;
-    const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_blockCols;
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC inline PacketScalar packet(Index rowId, Index colId) const {
+    return m_xpr.template packet<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value());
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC inline void writePacket(Index rowId, Index colId, const PacketScalar& val) {
+    m_xpr.template writePacket<Unaligned>(rowId + m_startRow.value(), colId + m_startCol.value(), val);
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC inline PacketScalar packet(Index index) const {
+    return m_xpr.template packet<Unaligned>(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                                            m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC inline void writePacket(Index index, const PacketScalar& val) {
+    m_xpr.template writePacket<Unaligned>(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                                          m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0), val);
+  }
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  /** \sa MapBase::data() */
+  EIGEN_DEVICE_FUNC constexpr const Scalar* data() const;
+  EIGEN_DEVICE_FUNC inline Index innerStride() const;
+  EIGEN_DEVICE_FUNC inline Index outerStride() const;
+#endif
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<XprTypeNested>& nestedExpression() const {
+    return m_xpr;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startRow() const noexcept { return m_startRow.value(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startCol() const noexcept { return m_startCol.value(); }
+
+ protected:
+  XprTypeNested m_xpr;
+  const internal::variable_if_dynamic<StorageIndex, (XprType::RowsAtCompileTime == 1 && BlockRows == 1) ? 0 : Dynamic>
+      m_startRow;
+  const internal::variable_if_dynamic<StorageIndex, (XprType::ColsAtCompileTime == 1 && BlockCols == 1) ? 0 : Dynamic>
+      m_startCol;
+  const internal::variable_if_dynamic<StorageIndex, RowsAtCompileTime> m_blockRows;
+  const internal::variable_if_dynamic<StorageIndex, ColsAtCompileTime> m_blockCols;
 };
 
 /** \internal Internal implementation of dense Blocks in the direct access case.*/
-template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
-class BlockImpl_dense<XprType,BlockRows,BlockCols, InnerPanel,true>
-  : public MapBase<Block<XprType, BlockRows, BlockCols, InnerPanel> >
-{
-    typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
-  public:
-
-    typedef MapBase<BlockType> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(BlockType)
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl_dense)
-
-    /** Column or Row constructor
-      */
-    inline BlockImpl_dense(XprType& xpr, Index i)
-      : Base(internal::const_cast_ptr(&xpr.coeffRef(
-              (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0,
-              (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0)),
-             BlockRows==1 ? 1 : xpr.rows(),
-             BlockCols==1 ? 1 : xpr.cols()),
-        m_xpr(xpr)
-    {
-      init();
-    }
-
-    /** Fixed-size constructor
-      */
-    inline BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
-      : Base(internal::const_cast_ptr(&xpr.coeffRef(startRow,startCol))), m_xpr(xpr)
-    {
-      init();
-    }
-
-    /** Dynamic-size constructor
-      */
-    inline BlockImpl_dense(XprType& xpr,
-          Index startRow, Index startCol,
-          Index blockRows, Index blockCols)
-      : Base(internal::const_cast_ptr(&xpr.coeffRef(startRow,startCol)), blockRows, blockCols),
-        m_xpr(xpr)
-    {
-      init();
-    }
-
-    const typename internal::remove_all<typename XprType::Nested>::type& nestedExpression() const 
-    { 
-      return m_xpr; 
-    }
-      
-    /** \sa MapBase::innerStride() */
-    inline Index innerStride() const
-    {
-      return internal::traits<BlockType>::HasSameStorageOrderAsXprType
-             ? m_xpr.innerStride()
-             : m_xpr.outerStride();
-    }
-
-    /** \sa MapBase::outerStride() */
-    inline Index outerStride() const
-    {
-      return m_outerStride;
-    }
-
-  #ifndef __SUNPRO_CC
+template <typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
+class BlockImpl_dense<XprType, BlockRows, BlockCols, InnerPanel, true>
+    : public MapBase<Block<XprType, BlockRows, BlockCols, InnerPanel>> {
+  typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
+  typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested;
+  enum { XprTypeIsRowMajor = (int(traits<XprType>::Flags) & RowMajorBit) != 0 };
+
+  /** \internal Returns base+offset (unless base is null, in which case returns null).
+   * Adding an offset to nullptr is undefined behavior, so we must avoid it.
+   */
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE static Scalar* add_to_nullable_pointer(Scalar* base, Index offset) {
+    return base != nullptr ? base + offset : nullptr;
+  }
+
+ public:
+  typedef MapBase<BlockType> Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(BlockType)
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl_dense)
+
+  /** Column or Row constructor
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl_dense(XprType& xpr, Index i)
+      : Base((BlockRows == 0 || BlockCols == 0)
+                 ? nullptr
+                 : add_to_nullable_pointer(
+                       xpr.data(),
+                       i * (((BlockRows == 1) && (BlockCols == XprType::ColsAtCompileTime) && (!XprTypeIsRowMajor)) ||
+                                    ((BlockRows == XprType::RowsAtCompileTime) && (BlockCols == 1) &&
+                                     (XprTypeIsRowMajor))
+                                ? xpr.innerStride()
+                                : xpr.outerStride())),
+             BlockRows == 1 ? 1 : xpr.rows(), BlockCols == 1 ? 1 : xpr.cols()),
+        m_xpr(xpr),
+        m_startRow((BlockRows == 1) && (BlockCols == XprType::ColsAtCompileTime) ? i : 0),
+        m_startCol((BlockRows == XprType::RowsAtCompileTime) && (BlockCols == 1) ? i : 0) {
+    init();
+  }
+
+  /** Fixed-size constructor
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl_dense(XprType& xpr, Index startRow, Index startCol)
+      : Base((BlockRows == 0 || BlockCols == 0)
+                 ? nullptr
+                 : add_to_nullable_pointer(xpr.data(),
+                                           xpr.innerStride() * (XprTypeIsRowMajor ? startCol : startRow) +
+                                               xpr.outerStride() * (XprTypeIsRowMajor ? startRow : startCol))),
+        m_xpr(xpr),
+        m_startRow(startRow),
+        m_startCol(startCol) {
+    init();
+  }
+
+  /** Dynamic-size constructor
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl_dense(XprType& xpr, Index startRow, Index startCol, Index blockRows,
+                                                        Index blockCols)
+      : Base((blockRows == 0 || blockCols == 0)
+                 ? nullptr
+                 : add_to_nullable_pointer(xpr.data(),
+                                           xpr.innerStride() * (XprTypeIsRowMajor ? startCol : startRow) +
+                                               xpr.outerStride() * (XprTypeIsRowMajor ? startRow : startCol)),
+             blockRows, blockCols),
+        m_xpr(xpr),
+        m_startRow(startRow),
+        m_startCol(startCol) {
+    init();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<XprTypeNested>& nestedExpression() const noexcept {
+    return m_xpr;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE XprType& nestedExpression() { return m_xpr; }
+
+  /** \sa MapBase::innerStride() */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index innerStride() const noexcept {
+    return internal::traits<BlockType>::HasSameStorageOrderAsXprType ? m_xpr.innerStride() : m_xpr.outerStride();
+  }
+
+  /** \sa MapBase::outerStride() */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index outerStride() const noexcept {
+    return internal::traits<BlockType>::HasSameStorageOrderAsXprType ? m_xpr.outerStride() : m_xpr.innerStride();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startRow() const noexcept { return m_startRow.value(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr StorageIndex startCol() const noexcept { return m_startCol.value(); }
+
+#ifndef __SUNPRO_CC
   // FIXME sunstudio is not friendly with the above friend...
   // META-FIXME there is no 'friend' keyword around here. Is this obsolete?
-  protected:
-  #endif
-
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** \internal used by allowAligned() */
-    inline BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows, Index blockCols)
-      : Base(data, blockRows, blockCols), m_xpr(xpr)
-    {
-      init();
-    }
-    #endif
-
-  protected:
-    void init()
-    {
-      m_outerStride = internal::traits<BlockType>::HasSameStorageOrderAsXprType
-                    ? m_xpr.outerStride()
-                    : m_xpr.innerStride();
-    }
-
-    typename XprType::Nested m_xpr;
-    Index m_outerStride;
+ protected:
+#endif
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  /** \internal used by allowAligned() */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockImpl_dense(XprType& xpr, const Scalar* data, Index blockRows,
+                                                        Index blockCols)
+      : Base(data, blockRows, blockCols), m_xpr(xpr) {
+    init();
+  }
+#endif
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void init() {
+    m_outerStride =
+        internal::traits<BlockType>::HasSameStorageOrderAsXprType ? m_xpr.outerStride() : m_xpr.innerStride();
+  }
+
+  XprTypeNested m_xpr;
+  const internal::variable_if_dynamic<StorageIndex, (XprType::RowsAtCompileTime == 1 && BlockRows == 1) ? 0 : Dynamic>
+      m_startRow;
+  const internal::variable_if_dynamic<StorageIndex, (XprType::ColsAtCompileTime == 1 && BlockCols == 1) ? 0 : Dynamic>
+      m_startCol;
+  Index m_outerStride;
 };
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_BLOCK_H
+#endif  // EIGEN_BLOCK_H
diff --git a/inst/include/Eigen/src/Core/BooleanRedux.h b/inst/include/Eigen/src/Core/BooleanRedux.h
deleted file mode 100644
index be9f48a8..00000000
--- a/inst/include/Eigen/src/Core/BooleanRedux.h
+++ /dev/null
@@ -1,154 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_ALLANDANY_H
-#define EIGEN_ALLANDANY_H
-
-namespace Eigen { 
-
-namespace internal {
-
-template<typename Derived, int UnrollCount>
-struct all_unroller
-{
-  enum {
-    col = (UnrollCount-1) / Derived::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived::RowsAtCompileTime
-  };
-
-  static inline bool run(const Derived &mat)
-  {
-    return all_unroller<Derived, UnrollCount-1>::run(mat) && mat.coeff(row, col);
-  }
-};
-
-template<typename Derived>
-struct all_unroller<Derived, 0>
-{
-  static inline bool run(const Derived &/*mat*/) { return true; }
-};
-
-template<typename Derived>
-struct all_unroller<Derived, Dynamic>
-{
-  static inline bool run(const Derived &) { return false; }
-};
-
-template<typename Derived, int UnrollCount>
-struct any_unroller
-{
-  enum {
-    col = (UnrollCount-1) / Derived::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived::RowsAtCompileTime
-  };
-
-  static inline bool run(const Derived &mat)
-  {
-    return any_unroller<Derived, UnrollCount-1>::run(mat) || mat.coeff(row, col);
-  }
-};
-
-template<typename Derived>
-struct any_unroller<Derived, 0>
-{
-  static inline bool run(const Derived & /*mat*/) { return false; }
-};
-
-template<typename Derived>
-struct any_unroller<Derived, Dynamic>
-{
-  static inline bool run(const Derived &) { return false; }
-};
-
-} // end namespace internal
-
-/** \returns true if all coefficients are true
-  *
-  * Example: \include MatrixBase_all.cpp
-  * Output: \verbinclude MatrixBase_all.out
-  *
-  * \sa any(), Cwise::operator<()
-  */
-template<typename Derived>
-inline bool DenseBase<Derived>::all() const
-{
-  enum {
-    unroll = SizeAtCompileTime != Dynamic
-          && CoeffReadCost != Dynamic
-          && NumTraits<Scalar>::AddCost != Dynamic
-          && SizeAtCompileTime * (CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
-  };
-  if(unroll)
-    return internal::all_unroller<Derived, unroll ? int(SizeAtCompileTime) : Dynamic>::run(derived());
-  else
-  {
-    for(Index j = 0; j < cols(); ++j)
-      for(Index i = 0; i < rows(); ++i)
-        if (!coeff(i, j)) return false;
-    return true;
-  }
-}
-
-/** \returns true if at least one coefficient is true
-  *
-  * \sa all()
-  */
-template<typename Derived>
-inline bool DenseBase<Derived>::any() const
-{
-  enum {
-    unroll = SizeAtCompileTime != Dynamic
-          && CoeffReadCost != Dynamic
-          && NumTraits<Scalar>::AddCost != Dynamic
-          && SizeAtCompileTime * (CoeffReadCost + NumTraits<Scalar>::AddCost) <= EIGEN_UNROLLING_LIMIT
-  };
-  if(unroll)
-    return internal::any_unroller<Derived, unroll ? int(SizeAtCompileTime) : Dynamic>::run(derived());
-  else
-  {
-    for(Index j = 0; j < cols(); ++j)
-      for(Index i = 0; i < rows(); ++i)
-        if (coeff(i, j)) return true;
-    return false;
-  }
-}
-
-/** \returns the number of coefficients which evaluate to true
-  *
-  * \sa all(), any()
-  */
-template<typename Derived>
-inline typename DenseBase<Derived>::Index DenseBase<Derived>::count() const
-{
-  return derived().template cast<bool>().template cast<Index>().sum();
-}
-
-/** \returns true is \c *this contains at least one Not A Number (NaN).
-  *
-  * \sa allFinite()
-  */
-template<typename Derived>
-inline bool DenseBase<Derived>::hasNaN() const
-{
-  return !((derived().array()==derived().array()).all());
-}
-
-/** \returns true if \c *this contains only finite numbers, i.e., no NaN and no +/-INF values.
-  *
-  * \sa hasNaN()
-  */
-template<typename Derived>
-inline bool DenseBase<Derived>::allFinite() const
-{
-  return !((derived()-derived()).hasNaN());
-}
-    
-} // end namespace Eigen
-
-#endif // EIGEN_ALLANDANY_H
diff --git a/inst/include/Eigen/src/Core/CommaInitializer.h b/inst/include/Eigen/src/Core/CommaInitializer.h
index a036d8c3..c4141179 100644
--- a/inst/include/Eigen/src/Core/CommaInitializer.h
+++ b/inst/include/Eigen/src/Core/CommaInitializer.h
@@ -11,43 +11,46 @@
 #ifndef EIGEN_COMMAINITIALIZER_H
 #define EIGEN_COMMAINITIALIZER_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \class CommaInitializer
-  * \ingroup Core_Module
-  *
-  * \brief Helper class used by the comma initializer operator
-  *
-  * This class is internally used to implement the comma initializer feature. It is
-  * the return type of MatrixBase::operator<<, and most of the time this is the only
-  * way it is used.
-  *
-  * \sa \ref MatrixBaseCommaInitRef "MatrixBase::operator<<", CommaInitializer::finished()
-  */
-template<typename XprType>
-struct CommaInitializer
-{
+ * \ingroup Core_Module
+ *
+ * \brief Helper class used by the comma initializer operator
+ *
+ * This class is internally used to implement the comma initializer feature. It is
+ * the return type of MatrixBase::operator<<, and most of the time this is the only
+ * way it is used.
+ *
+ * \sa \blank \ref MatrixBaseCommaInitRef "MatrixBase::operator<<", CommaInitializer::finished()
+ */
+template <typename XprType>
+struct CommaInitializer {
   typedef typename XprType::Scalar Scalar;
-  typedef typename XprType::Index Index;
 
-  inline CommaInitializer(XprType& xpr, const Scalar& s)
-    : m_xpr(xpr), m_row(0), m_col(1), m_currentBlockRows(1)
-  {
-    m_xpr.coeffRef(0,0) = s;
+  EIGEN_DEVICE_FUNC inline CommaInitializer(XprType& xpr, const Scalar& s)
+      : m_xpr(xpr), m_row(0), m_col(1), m_currentBlockRows(1) {
+    eigen_assert(m_xpr.rows() > 0 && m_xpr.cols() > 0 && "Cannot comma-initialize a 0x0 matrix (operator<<)");
+    m_xpr.coeffRef(0, 0) = s;
   }
 
-  template<typename OtherDerived>
-  inline CommaInitializer(XprType& xpr, const DenseBase<OtherDerived>& other)
-    : m_xpr(xpr), m_row(0), m_col(other.cols()), m_currentBlockRows(other.rows())
-  {
-    m_xpr.block(0, 0, other.rows(), other.cols()) = other;
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC inline CommaInitializer(XprType& xpr, const DenseBase<OtherDerived>& other)
+      : m_xpr(xpr), m_row(0), m_col(other.cols()), m_currentBlockRows(other.rows()) {
+    eigen_assert(m_xpr.rows() >= other.rows() && m_xpr.cols() >= other.cols() &&
+                 "Cannot comma-initialize a 0x0 matrix (operator<<)");
+    m_xpr.template block<OtherDerived::RowsAtCompileTime, OtherDerived::ColsAtCompileTime>(0, 0, other.rows(),
+                                                                                           other.cols()) = other;
   }
 
-  /* Copy/Move constructor which transfers ownership. This is crucial in 
+  /* Copy/Move constructor which transfers ownership. This is crucial in
    * absence of return value optimization to avoid assertions during destruction. */
   // FIXME in C++11 mode this could be replaced by a proper RValue constructor
-  inline CommaInitializer(const CommaInitializer& o)
-  : m_xpr(o.m_xpr), m_row(o.m_row), m_col(o.m_col), m_currentBlockRows(o.m_currentBlockRows) {
+  EIGEN_DEVICE_FUNC inline CommaInitializer(const CommaInitializer& o)
+      : m_xpr(o.m_xpr), m_row(o.m_row), m_col(o.m_col), m_currentBlockRows(o.m_currentBlockRows) {
     // Mark original object as finished. In absence of R-value references we need to const_cast:
     const_cast<CommaInitializer&>(o).m_row = m_xpr.rows();
     const_cast<CommaInitializer&>(o).m_col = m_xpr.cols();
@@ -55,100 +58,92 @@ struct CommaInitializer
   }
 
   /* inserts a scalar value in the target matrix */
-  CommaInitializer& operator,(const Scalar& s)
-  {
-    if (m_col==m_xpr.cols())
-    {
-      m_row+=m_currentBlockRows;
+  EIGEN_DEVICE_FUNC CommaInitializer &operator,(const Scalar& s) {
+    if (m_col == m_xpr.cols()) {
+      m_row += m_currentBlockRows;
       m_col = 0;
       m_currentBlockRows = 1;
-      eigen_assert(m_row<m_xpr.rows()
-        && "Too many rows passed to comma initializer (operator<<)");
+      eigen_assert(m_row < m_xpr.rows() && "Too many rows passed to comma initializer (operator<<)");
     }
-    eigen_assert(m_col<m_xpr.cols()
-      && "Too many coefficients passed to comma initializer (operator<<)");
-    eigen_assert(m_currentBlockRows==1);
+    eigen_assert(m_col < m_xpr.cols() && "Too many coefficients passed to comma initializer (operator<<)");
+    eigen_assert(m_currentBlockRows == 1);
     m_xpr.coeffRef(m_row, m_col++) = s;
     return *this;
   }
 
   /* inserts a matrix expression in the target matrix */
-  template<typename OtherDerived>
-  CommaInitializer& operator,(const DenseBase<OtherDerived>& other)
-  {
-    if(other.cols()==0 || other.rows()==0)
-      return *this;
-    if (m_col==m_xpr.cols())
-    {
-      m_row+=m_currentBlockRows;
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC CommaInitializer &operator,(const DenseBase<OtherDerived>& other) {
+    if (m_col == m_xpr.cols() && (other.cols() != 0 || other.rows() != m_currentBlockRows)) {
+      m_row += m_currentBlockRows;
       m_col = 0;
       m_currentBlockRows = other.rows();
-      eigen_assert(m_row+m_currentBlockRows<=m_xpr.rows()
-        && "Too many rows passed to comma initializer (operator<<)");
+      eigen_assert(m_row + m_currentBlockRows <= m_xpr.rows() &&
+                   "Too many rows passed to comma initializer (operator<<)");
     }
-    eigen_assert(m_col<m_xpr.cols()
-      && "Too many coefficients passed to comma initializer (operator<<)");
-    eigen_assert(m_currentBlockRows==other.rows());
-    if (OtherDerived::SizeAtCompileTime != Dynamic)
-      m_xpr.template block<OtherDerived::RowsAtCompileTime != Dynamic ? OtherDerived::RowsAtCompileTime : 1,
-                              OtherDerived::ColsAtCompileTime != Dynamic ? OtherDerived::ColsAtCompileTime : 1>
-                    (m_row, m_col) = other;
-    else
-      m_xpr.block(m_row, m_col, other.rows(), other.cols()) = other;
+    eigen_assert((m_col + other.cols() <= m_xpr.cols()) &&
+                 "Too many coefficients passed to comma initializer (operator<<)");
+    eigen_assert(m_currentBlockRows == other.rows());
+    m_xpr.template block<OtherDerived::RowsAtCompileTime, OtherDerived::ColsAtCompileTime>(m_row, m_col, other.rows(),
+                                                                                           other.cols()) = other;
     m_col += other.cols();
     return *this;
   }
 
-  inline ~CommaInitializer()
+  EIGEN_DEVICE_FUNC inline ~CommaInitializer()
+#if defined VERIFY_RAISES_ASSERT && (!defined EIGEN_NO_ASSERTION_CHECKING) && defined EIGEN_EXCEPTIONS
+      noexcept(false)  // Eigen::eigen_assert_exception
+#endif
   {
-    eigen_assert((m_row+m_currentBlockRows) == m_xpr.rows()
-         && m_col == m_xpr.cols()
-         && "Too few coefficients passed to comma initializer (operator<<)");
+    finished();
   }
 
   /** \returns the built matrix once all its coefficients have been set.
-    * Calling finished is 100% optional. Its purpose is to write expressions
-    * like this:
-    * \code
-    * quaternion.fromRotationMatrix((Matrix3f() << axis0, axis1, axis2).finished());
-    * \endcode
-    */
-  inline XprType& finished() { return m_xpr; }
-
-  XprType& m_xpr;   // target expression
-  Index m_row;              // current row id
-  Index m_col;              // current col id
-  Index m_currentBlockRows; // current block height
+   * Calling finished is 100% optional. Its purpose is to write expressions
+   * like this:
+   * \code
+   * quaternion.fromRotationMatrix((Matrix3f() << axis0, axis1, axis2).finished());
+   * \endcode
+   */
+  EIGEN_DEVICE_FUNC inline XprType& finished() {
+    eigen_assert(((m_row + m_currentBlockRows) == m_xpr.rows() || m_xpr.cols() == 0) && m_col == m_xpr.cols() &&
+                 "Too few coefficients passed to comma initializer (operator<<)");
+    return m_xpr;
+  }
+
+  XprType& m_xpr;            // target expression
+  Index m_row;               // current row id
+  Index m_col;               // current col id
+  Index m_currentBlockRows;  // current block height
 };
 
 /** \anchor MatrixBaseCommaInitRef
-  * Convenient operator to set the coefficients of a matrix.
-  *
-  * The coefficients must be provided in a row major order and exactly match
-  * the size of the matrix. Otherwise an assertion is raised.
-  *
-  * Example: \include MatrixBase_set.cpp
-  * Output: \verbinclude MatrixBase_set.out
-  * 
-  * \note According the c++ standard, the argument expressions of this comma initializer are evaluated in arbitrary order.
-  *
-  * \sa CommaInitializer::finished(), class CommaInitializer
-  */
-template<typename Derived>
-inline CommaInitializer<Derived> DenseBase<Derived>::operator<< (const Scalar& s)
-{
+ * Convenient operator to set the coefficients of a matrix.
+ *
+ * The coefficients must be provided in a row major order and exactly match
+ * the size of the matrix. Otherwise an assertion is raised.
+ *
+ * Example: \include MatrixBase_set.cpp
+ * Output: \verbinclude MatrixBase_set.out
+ *
+ * \note According the c++ standard, the argument expressions of this comma initializer are evaluated in arbitrary
+ * order.
+ *
+ * \sa CommaInitializer::finished(), class CommaInitializer
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived> DenseBase<Derived>::operator<<(const Scalar& s) {
   return CommaInitializer<Derived>(*static_cast<Derived*>(this), s);
 }
 
 /** \sa operator<<(const Scalar&) */
-template<typename Derived>
-template<typename OtherDerived>
-inline CommaInitializer<Derived>
-DenseBase<Derived>::operator<<(const DenseBase<OtherDerived>& other)
-{
-  return CommaInitializer<Derived>(*static_cast<Derived *>(this), other);
+template <typename Derived>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline CommaInitializer<Derived> DenseBase<Derived>::operator<<(
+    const DenseBase<OtherDerived>& other) {
+  return CommaInitializer<Derived>(*static_cast<Derived*>(this), other);
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_COMMAINITIALIZER_H
+#endif  // EIGEN_COMMAINITIALIZER_H
diff --git a/inst/include/Eigen/src/Core/ConditionEstimator.h b/inst/include/Eigen/src/Core/ConditionEstimator.h
new file mode 100644
index 00000000..dd1770b1
--- /dev/null
+++ b/inst/include/Eigen/src/Core/ConditionEstimator.h
@@ -0,0 +1,173 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Rasmus Munk Larsen (rmlarsen@google.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CONDITIONESTIMATOR_H
+#define EIGEN_CONDITIONESTIMATOR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Vector, typename RealVector, bool IsComplex>
+struct rcond_compute_sign {
+  static inline Vector run(const Vector& v) {
+    const RealVector v_abs = v.cwiseAbs();
+    return (v_abs.array() == static_cast<typename Vector::RealScalar>(0))
+        .select(Vector::Ones(v.size()), v.cwiseQuotient(v_abs));
+  }
+};
+
+// Partial specialization to avoid elementwise division for real vectors.
+template <typename Vector>
+struct rcond_compute_sign<Vector, Vector, false> {
+  static inline Vector run(const Vector& v) {
+    return (v.array() < static_cast<typename Vector::RealScalar>(0))
+        .select(-Vector::Ones(v.size()), Vector::Ones(v.size()));
+  }
+};
+
+/**
+ * \returns an estimate of ||inv(matrix)||_1 given a decomposition of
+ * \a matrix that implements .solve() and .adjoint().solve() methods.
+ *
+ * This function implements Algorithms 4.1 and 5.1 from
+ *   http://www.maths.manchester.ac.uk/~higham/narep/narep135.pdf
+ * which also forms the basis for the condition number estimators in
+ * LAPACK. Since at most 10 calls to the solve method of dec are
+ * performed, the total cost is O(dims^2), as opposed to O(dims^3)
+ * needed to compute the inverse matrix explicitly.
+ *
+ * The most common usage is in estimating the condition number
+ * ||matrix||_1 * ||inv(matrix)||_1. The first term ||matrix||_1 can be
+ * computed directly in O(n^2) operations.
+ *
+ * Supports the following decompositions: FullPivLU, PartialPivLU, LDLT, and
+ * LLT.
+ *
+ * \sa FullPivLU, PartialPivLU, LDLT, LLT.
+ */
+template <typename Decomposition>
+typename Decomposition::RealScalar rcond_invmatrix_L1_norm_estimate(const Decomposition& dec) {
+  typedef typename Decomposition::MatrixType MatrixType;
+  typedef typename Decomposition::Scalar Scalar;
+  typedef typename Decomposition::RealScalar RealScalar;
+  typedef typename internal::plain_col_type<MatrixType>::type Vector;
+  typedef typename internal::plain_col_type<MatrixType, RealScalar>::type RealVector;
+  const bool is_complex = (NumTraits<Scalar>::IsComplex != 0);
+
+  eigen_assert(dec.rows() == dec.cols());
+  const Index n = dec.rows();
+  if (n == 0) return 0;
+
+    // Disable Index to float conversion warning
+#ifdef __INTEL_COMPILER
+#pragma warning push
+#pragma warning(disable : 2259)
+#endif
+  Vector v = dec.solve(Vector::Ones(n) / Scalar(n));
+#ifdef __INTEL_COMPILER
+#pragma warning pop
+#endif
+
+  // lower_bound is a lower bound on
+  //   ||inv(matrix)||_1  = sup_v ||inv(matrix) v||_1 / ||v||_1
+  // and is the objective maximized by the ("super-") gradient ascent
+  // algorithm below.
+  RealScalar lower_bound = v.template lpNorm<1>();
+  if (n == 1) return lower_bound;
+
+  // Gradient ascent algorithm follows: We know that the optimum is achieved at
+  // one of the simplices v = e_i, so in each iteration we follow a
+  // super-gradient to move towards the optimal one.
+  RealScalar old_lower_bound = lower_bound;
+  Vector sign_vector(n);
+  Vector old_sign_vector;
+  Index v_max_abs_index = -1;
+  Index old_v_max_abs_index = v_max_abs_index;
+  for (int k = 0; k < 4; ++k) {
+    sign_vector = internal::rcond_compute_sign<Vector, RealVector, is_complex>::run(v);
+    if (k > 0 && !is_complex && sign_vector == old_sign_vector) {
+      // Break if the solution stagnated.
+      break;
+    }
+    // v_max_abs_index = argmax |real( inv(matrix)^T * sign_vector )|
+    v = dec.adjoint().solve(sign_vector);
+    v.real().cwiseAbs().maxCoeff(&v_max_abs_index);
+    if (v_max_abs_index == old_v_max_abs_index) {
+      // Break if the solution stagnated.
+      break;
+    }
+    // Move to the new simplex e_j, where j = v_max_abs_index.
+    v = dec.solve(Vector::Unit(n, v_max_abs_index));  // v = inv(matrix) * e_j.
+    lower_bound = v.template lpNorm<1>();
+    if (lower_bound <= old_lower_bound) {
+      // Break if the gradient step did not increase the lower_bound.
+      break;
+    }
+    if (!is_complex) {
+      old_sign_vector = sign_vector;
+    }
+    old_v_max_abs_index = v_max_abs_index;
+    old_lower_bound = lower_bound;
+  }
+  // The following calculates an independent estimate of ||matrix||_1 by
+  // multiplying matrix by a vector with entries of slowly increasing
+  // magnitude and alternating sign:
+  //   v_i = (-1)^{i} (1 + (i / (dim-1))), i = 0,...,dim-1.
+  // This improvement to Hager's algorithm above is due to Higham. It was
+  // added to make the algorithm more robust in certain corner cases where
+  // large elements in the matrix might otherwise escape detection due to
+  // exact cancellation (especially when op and op_adjoint correspond to a
+  // sequence of backsubstitutions and permutations), which could cause
+  // Hager's algorithm to vastly underestimate ||matrix||_1.
+  Scalar alternating_sign(RealScalar(1));
+  for (Index i = 0; i < n; ++i) {
+    // The static_cast is needed when Scalar is a complex and RealScalar implements expression templates
+    v[i] = alternating_sign * static_cast<RealScalar>(RealScalar(1) + (RealScalar(i) / (RealScalar(n - 1))));
+    alternating_sign = -alternating_sign;
+  }
+  v = dec.solve(v);
+  const RealScalar alternate_lower_bound = (2 * v.template lpNorm<1>()) / (3 * RealScalar(n));
+  return numext::maxi(lower_bound, alternate_lower_bound);
+}
+
+/** \brief Reciprocal condition number estimator.
+ *
+ * Computing a decomposition of a dense matrix takes O(n^3) operations, while
+ * this method estimates the condition number quickly and reliably in O(n^2)
+ * operations.
+ *
+ * \returns an estimate of the reciprocal condition number
+ * (1 / (||matrix||_1 * ||inv(matrix)||_1)) of matrix, given ||matrix||_1 and
+ * its decomposition. Supports the following decompositions: FullPivLU,
+ * PartialPivLU, LDLT, and LLT.
+ *
+ * \sa FullPivLU, PartialPivLU, LDLT, LLT.
+ */
+template <typename Decomposition>
+typename Decomposition::RealScalar rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm,
+                                                         const Decomposition& dec) {
+  typedef typename Decomposition::RealScalar RealScalar;
+  eigen_assert(dec.rows() == dec.cols());
+  if (dec.rows() == 0) return NumTraits<RealScalar>::infinity();
+  if (numext::is_exactly_zero(matrix_norm)) return RealScalar(0);
+  if (dec.rows() == 1) return RealScalar(1);
+  const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec);
+  return (numext::is_exactly_zero(inverse_matrix_norm) ? RealScalar(0)
+                                                       : (RealScalar(1) / inverse_matrix_norm) / matrix_norm);
+}
+
+}  // namespace internal
+
+}  // namespace Eigen
+
+#endif
diff --git a/inst/include/Eigen/src/Core/CoreEvaluators.h b/inst/include/Eigen/src/Core/CoreEvaluators.h
new file mode 100644
index 00000000..60857e2c
--- /dev/null
+++ b/inst/include/Eigen/src/Core/CoreEvaluators.h
@@ -0,0 +1,2018 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COREEVALUATORS_H
+#define EIGEN_COREEVALUATORS_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+// This class returns the evaluator kind from the expression storage kind.
+// Default assumes index based accessors
+template <typename StorageKind>
+struct storage_kind_to_evaluator_kind {
+  typedef IndexBased Kind;
+};
+
+// This class returns the evaluator shape from the expression storage kind.
+// It can be Dense, Sparse, Triangular, Diagonal, SelfAdjoint, Band, etc.
+template <typename StorageKind>
+struct storage_kind_to_shape;
+
+template <>
+struct storage_kind_to_shape<Dense> {
+  typedef DenseShape Shape;
+};
+template <>
+struct storage_kind_to_shape<SolverStorage> {
+  typedef SolverShape Shape;
+};
+template <>
+struct storage_kind_to_shape<PermutationStorage> {
+  typedef PermutationShape Shape;
+};
+template <>
+struct storage_kind_to_shape<TranspositionsStorage> {
+  typedef TranspositionsShape Shape;
+};
+
+// Evaluators have to be specialized with respect to various criteria such as:
+//  - storage/structure/shape
+//  - scalar type
+//  - etc.
+// Therefore, we need specialization of evaluator providing additional template arguments for each kind of evaluators.
+// We currently distinguish the following kind of evaluators:
+// - unary_evaluator    for expressions taking only one arguments (CwiseUnaryOp, CwiseUnaryView, Transpose,
+// MatrixWrapper, ArrayWrapper, Reverse, Replicate)
+// - binary_evaluator   for expression taking two arguments (CwiseBinaryOp)
+// - ternary_evaluator   for expression taking three arguments (CwiseTernaryOp)
+// - product_evaluator  for linear algebra products (Product); special case of binary_evaluator because it requires
+// additional tags for dispatching.
+// - mapbase_evaluator  for Map, Block, Ref
+// - block_evaluator    for Block (special dispatching to a mapbase_evaluator or unary_evaluator)
+
+template <typename T, typename Arg1Kind = typename evaluator_traits<typename T::Arg1>::Kind,
+          typename Arg2Kind = typename evaluator_traits<typename T::Arg2>::Kind,
+          typename Arg3Kind = typename evaluator_traits<typename T::Arg3>::Kind,
+          typename Arg1Scalar = typename traits<typename T::Arg1>::Scalar,
+          typename Arg2Scalar = typename traits<typename T::Arg2>::Scalar,
+          typename Arg3Scalar = typename traits<typename T::Arg3>::Scalar>
+struct ternary_evaluator;
+
+template <typename T, typename LhsKind = typename evaluator_traits<typename T::Lhs>::Kind,
+          typename RhsKind = typename evaluator_traits<typename T::Rhs>::Kind,
+          typename LhsScalar = typename traits<typename T::Lhs>::Scalar,
+          typename RhsScalar = typename traits<typename T::Rhs>::Scalar>
+struct binary_evaluator;
+
+template <typename T, typename Kind = typename evaluator_traits<typename T::NestedExpression>::Kind,
+          typename Scalar = typename T::Scalar>
+struct unary_evaluator;
+
+// evaluator_traits<T> contains traits for evaluator<T>
+
+template <typename T>
+struct evaluator_traits_base {
+  // by default, get evaluator kind and shape from storage
+  typedef typename storage_kind_to_evaluator_kind<typename traits<T>::StorageKind>::Kind Kind;
+  typedef typename storage_kind_to_shape<typename traits<T>::StorageKind>::Shape Shape;
+};
+
+// Default evaluator traits
+template <typename T>
+struct evaluator_traits : public evaluator_traits_base<T> {};
+
+template <typename T, typename Shape = typename evaluator_traits<T>::Shape>
+struct evaluator_assume_aliasing {
+  static const bool value = false;
+};
+
+// By default, we assume a unary expression:
+template <typename T>
+struct evaluator : public unary_evaluator<T> {
+  typedef unary_evaluator<T> Base;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const T& xpr) : Base(xpr) {}
+};
+
+// TODO: Think about const-correctness
+template <typename T>
+struct evaluator<const T> : evaluator<T> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const T& xpr) : evaluator<T>(xpr) {}
+};
+
+// ---------- base class for all evaluators ----------
+
+template <typename ExpressionType>
+struct evaluator_base {
+  // TODO that's not very nice to have to propagate all these traits. They are currently only needed to handle
+  // outer,inner indices.
+  typedef traits<ExpressionType> ExpressionTraits;
+
+  enum { Alignment = 0 };
+  // noncopyable:
+  // Don't make this class inherit noncopyable as this kills EBO (Empty Base Optimization)
+  // and make complex evaluator much larger than then should do.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator_base() = default;
+
+ private:
+  EIGEN_DEVICE_FUNC evaluator_base(const evaluator_base&);
+  EIGEN_DEVICE_FUNC const evaluator_base& operator=(const evaluator_base&);
+};
+
+// -------------------- Matrix and Array --------------------
+//
+// evaluator<PlainObjectBase> is a common base class for the
+// Matrix and Array evaluators.
+// Here we directly specialize evaluator. This is not really a unary expression, and it is, by definition, dense,
+// so no need for more sophisticated dispatching.
+
+// this helper permits to completely eliminate m_outerStride if it is known at compiletime.
+template <typename Scalar, int OuterStride>
+class plainobjectbase_evaluator_data {
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride)
+      : data(ptr) {
+#ifndef EIGEN_INTERNAL_DEBUGGING
+    EIGEN_UNUSED_VARIABLE(outerStride);
+#endif
+    eigen_internal_assert(outerStride == OuterStride);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index outerStride() const noexcept { return OuterStride; }
+  const Scalar* data;
+};
+
+template <typename Scalar>
+class plainobjectbase_evaluator_data<Scalar, Dynamic> {
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plainobjectbase_evaluator_data(const Scalar* ptr, Index outerStride)
+      : data(ptr), m_outerStride(outerStride) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index outerStride() const { return m_outerStride; }
+  const Scalar* data;
+
+ protected:
+  Index m_outerStride;
+};
+
+template <typename Derived>
+struct evaluator<PlainObjectBase<Derived>> : evaluator_base<Derived> {
+  typedef PlainObjectBase<Derived> PlainObjectType;
+  typedef typename PlainObjectType::Scalar Scalar;
+  typedef typename PlainObjectType::CoeffReturnType CoeffReturnType;
+
+  enum {
+    IsRowMajor = PlainObjectType::IsRowMajor,
+    IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime,
+    RowsAtCompileTime = PlainObjectType::RowsAtCompileTime,
+    ColsAtCompileTime = PlainObjectType::ColsAtCompileTime,
+
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    Flags = traits<Derived>::EvaluatorFlags,
+    Alignment = traits<Derived>::Alignment
+  };
+  enum {
+    // We do not need to know the outer stride for vectors
+    OuterStrideAtCompileTime = IsVectorAtCompileTime ? 0
+                               : int(IsRowMajor)     ? ColsAtCompileTime
+                                                     : RowsAtCompileTime
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator() : m_d(0, OuterStrideAtCompileTime) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr explicit evaluator(const PlainObjectType& m)
+      : m_d(m.data(), IsVectorAtCompileTime ? 0 : m.outerStride()) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index row, Index col) const {
+    return coeff(getIndex(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index index) const { return m_d.data[index]; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index row, Index col) {
+    return coeffRef(getIndex(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) {
+    return const_cast<Scalar*>(m_d.data)[index];
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    return packet<LoadMode, PacketType>(getIndex(row, col));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+    return ploadt<PacketType, LoadMode>(m_d.data + index);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
+    writePacket<StoreMode, PacketType>(getIndex(row, col), x);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
+    pstoret<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return packetSegment<LoadMode, PacketType>(getIndex(row, col), begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return ploadtSegment<PacketType, LoadMode>(m_d.data + index, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+                                                                Index count) {
+    writePacketSegment<StoreMode, PacketType>(getIndex(row, col), x, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+                                                                Index count) {
+    pstoretSegment<Scalar, PacketType, StoreMode>(const_cast<Scalar*>(m_d.data) + index, x, begin, count);
+  }
+
+ protected:
+  plainobjectbase_evaluator_data<Scalar, OuterStrideAtCompileTime> m_d;
+
+ private:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index constexpr getIndex(Index row, Index col) const {
+    return IsRowMajor ? row * m_d.outerStride() + col : row + col * m_d.outerStride();
+  }
+};
+
+template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct evaluator<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>
+    : evaluator<PlainObjectBase<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>> {
+  typedef Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator() = default;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr explicit evaluator(const XprType& m)
+      : evaluator<PlainObjectBase<XprType>>(m) {}
+};
+
+template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct evaluator<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>
+    : evaluator<PlainObjectBase<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>> {
+  typedef Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> XprType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr evaluator() = default;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr explicit evaluator(const XprType& m)
+      : evaluator<PlainObjectBase<XprType>>(m) {}
+};
+
+// -------------------- Transpose --------------------
+
+template <typename ArgType>
+struct unary_evaluator<Transpose<ArgType>, IndexBased> : evaluator_base<Transpose<ArgType>> {
+  typedef Transpose<ArgType> XprType;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    Flags = evaluator<ArgType>::Flags ^ RowMajorBit,
+    Alignment = evaluator<ArgType>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& t) : m_argImpl(t.nestedExpression()) {}
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+    return m_argImpl.coeff(col, row);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_argImpl.coeff(index); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) { return m_argImpl.coeffRef(col, row); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename XprType::Scalar& coeffRef(Index index) {
+    return m_argImpl.coeffRef(index);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    return m_argImpl.template packet<LoadMode, PacketType>(col, row);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+    return m_argImpl.template packet<LoadMode, PacketType>(index);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
+    m_argImpl.template writePacket<StoreMode, PacketType>(col, row, x);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
+    m_argImpl.template writePacket<StoreMode, PacketType>(index, x);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(col, row, begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+                                                                Index count) {
+    m_argImpl.template writePacketSegment<StoreMode, PacketType>(col, row, x, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+                                                                Index count) {
+    m_argImpl.template writePacketSegment<StoreMode, PacketType>(index, x, begin, count);
+  }
+
+ protected:
+  evaluator<ArgType> m_argImpl;
+};
+
+// -------------------- CwiseNullaryOp --------------------
+// Like Matrix and Array, this is not really a unary expression, so we directly specialize evaluator.
+// Likewise, there is not need to more sophisticated dispatching here.
+
+template <typename Scalar, typename NullaryOp, bool has_nullary = has_nullary_operator<NullaryOp>::value,
+          bool has_unary = has_unary_operator<NullaryOp>::value,
+          bool has_binary = has_binary_operator<NullaryOp>::value>
+struct nullary_wrapper {
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const {
+    return op(i, j);
+  }
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const {
+    return op(i);
+  }
+
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const {
+    return op.template packetOp<T>(i, j);
+  }
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const {
+    return op.template packetOp<T>(i);
+  }
+};
+
+template <typename Scalar, typename NullaryOp>
+struct nullary_wrapper<Scalar, NullaryOp, true, false, false> {
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType = 0, IndexType = 0) const {
+    return op();
+  }
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType = 0, IndexType = 0) const {
+    return op.template packetOp<T>();
+  }
+};
+
+template <typename Scalar, typename NullaryOp>
+struct nullary_wrapper<Scalar, NullaryOp, false, false, true> {
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j = 0) const {
+    return op(i, j);
+  }
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j = 0) const {
+    return op.template packetOp<T>(i, j);
+  }
+};
+
+// We need the following specialization for vector-only functors assigned to a runtime vector,
+// for instance, using linspace and assigning a RowVectorXd to a MatrixXd or even a row of a MatrixXd.
+// In this case, i==0 and j is used for the actual iteration.
+template <typename Scalar, typename NullaryOp>
+struct nullary_wrapper<Scalar, NullaryOp, false, true, false> {
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const {
+    eigen_assert(i == 0 || j == 0);
+    return op(i + j);
+  }
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const {
+    eigen_assert(i == 0 || j == 0);
+    return op.template packetOp<T>(i + j);
+  }
+
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const {
+    return op(i);
+  }
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const {
+    return op.template packetOp<T>(i);
+  }
+};
+
+template <typename Scalar, typename NullaryOp>
+struct nullary_wrapper<Scalar, NullaryOp, false, false, false> {};
+
+#if 0 && EIGEN_COMP_MSVC > 0
+// Disable this ugly workaround. This is now handled in traits<Ref>::match,
+// but this piece of code might still become handly if some other weird compilation
+// errors pop up again.
+
+// MSVC exhibits a weird compilation error when
+// compiling:
+//    Eigen::MatrixXf A = MatrixXf::Random(3,3);
+//    Ref<const MatrixXf> R = 2.f*A;
+// and that has_*ary_operator<scalar_constant_op<float>> have not been instantiated yet.
+// The "problem" is that evaluator<2.f*A> is instantiated by traits<Ref>::match<2.f*A>
+// and at that time has_*ary_operator<T> returns true regardless of T.
+// Then nullary_wrapper is badly instantiated as nullary_wrapper<.,.,true,true,true>.
+// The trick is thus to defer the proper instantiation of nullary_wrapper when coeff(),
+// and packet() are really instantiated as implemented below:
+
+// This is a simple wrapper around Index to enforce the re-instantiation of
+// has_*ary_operator when needed.
+template<typename T> struct nullary_wrapper_workaround_msvc {
+  nullary_wrapper_workaround_msvc(const T&);
+  operator T()const;
+};
+
+template<typename Scalar,typename NullaryOp>
+struct nullary_wrapper<Scalar,NullaryOp,true,true,true>
+{
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i, IndexType j) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().operator()(op,i,j);
+  }
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const NullaryOp& op, IndexType i) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().operator()(op,i);
+  }
+
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i, IndexType j) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().template packetOp<T>(op,i,j);
+  }
+  template <typename T, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T packetOp(const NullaryOp& op, IndexType i) const {
+    return nullary_wrapper<Scalar,NullaryOp,
+    has_nullary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_unary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value,
+    has_binary_operator<NullaryOp,nullary_wrapper_workaround_msvc<IndexType> >::value>().template packetOp<T>(op,i);
+  }
+};
+#endif  // MSVC workaround
+
+template <typename NullaryOp, typename PlainObjectType>
+struct evaluator<CwiseNullaryOp<NullaryOp, PlainObjectType>>
+    : evaluator_base<CwiseNullaryOp<NullaryOp, PlainObjectType>> {
+  typedef CwiseNullaryOp<NullaryOp, PlainObjectType> XprType;
+  typedef remove_all_t<PlainObjectType> PlainObjectTypeCleaned;
+
+  enum {
+    CoeffReadCost = functor_traits<NullaryOp>::Cost,
+
+    Flags = (evaluator<PlainObjectTypeCleaned>::Flags &
+             (HereditaryBits | (functor_has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0) |
+              (functor_traits<NullaryOp>::PacketAccess ? PacketAccessBit : 0))) |
+            (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit),
+    Alignment = AlignedMax
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& n) : m_functor(n.functor()), m_wrapper() {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(IndexType row, IndexType col) const {
+    return m_wrapper(m_functor, row, col);
+  }
+
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(IndexType index) const {
+    return m_wrapper(m_functor, index);
+  }
+
+  template <int LoadMode, typename PacketType, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(IndexType row, IndexType col) const {
+    return m_wrapper.template packetOp<PacketType>(m_functor, row, col);
+  }
+
+  template <int LoadMode, typename PacketType, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(IndexType index) const {
+    return m_wrapper.template packetOp<PacketType>(m_functor, index);
+  }
+
+  template <int LoadMode, typename PacketType, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(IndexType row, IndexType col, Index /*begin*/,
+                                                                 Index /*count*/) const {
+    return packet<LoadMode, PacketType, IndexType>(row, col);
+  }
+
+  template <int LoadMode, typename PacketType, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(IndexType index, Index /*begin*/,
+                                                                 Index /*count*/) const {
+    return packet<LoadMode, PacketType, IndexType>(index);
+  }
+
+ protected:
+  const NullaryOp m_functor;
+  const nullary_wrapper<CoeffReturnType, NullaryOp> m_wrapper;
+};
+
+// -------------------- CwiseUnaryOp --------------------
+
+template <typename UnaryOp, typename ArgType>
+struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IndexBased> : evaluator_base<CwiseUnaryOp<UnaryOp, ArgType>> {
+  typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;
+
+  enum {
+    CoeffReadCost = int(evaluator<ArgType>::CoeffReadCost) + int(functor_traits<UnaryOp>::Cost),
+
+    Flags = evaluator<ArgType>::Flags &
+            (HereditaryBits | LinearAccessBit | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0)),
+    Alignment = evaluator<ArgType>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& op) : m_d(op) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+    return m_d.func()(m_d.argImpl.coeff(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return m_d.func()(m_d.argImpl.coeff(index));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    return m_d.func().packetOp(m_d.argImpl.template packet<LoadMode, PacketType>(row, col));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+    return m_d.func().packetOp(m_d.argImpl.template packet<LoadMode, PacketType>(index));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return m_d.func().packetOp(m_d.argImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return m_d.func().packetOp(m_d.argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count));
+  }
+
+ protected:
+  // this helper permits to completely eliminate the functor if it is empty
+  struct Data {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Data(const XprType& xpr)
+        : op(xpr.functor()), argImpl(xpr.nestedExpression()) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryOp& func() const { return op; }
+    UnaryOp op;
+    evaluator<ArgType> argImpl;
+  };
+
+  Data m_d;
+};
+
+// ----------------------- Casting ---------------------
+
+template <typename SrcType, typename DstType, typename ArgType>
+struct unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, IndexBased> {
+  using CastOp = core_cast_op<SrcType, DstType>;
+  using XprType = CwiseUnaryOp<CastOp, ArgType>;
+
+  // Use the largest packet type by default
+  using SrcPacketType = typename packet_traits<SrcType>::type;
+  static constexpr int SrcPacketSize = unpacket_traits<SrcPacketType>::size;
+  static constexpr int SrcPacketBytes = SrcPacketSize * sizeof(SrcType);
+
+  enum {
+    CoeffReadCost = int(evaluator<ArgType>::CoeffReadCost) + int(functor_traits<CastOp>::Cost),
+    PacketAccess = functor_traits<CastOp>::PacketAccess,
+    ActualPacketAccessBit = PacketAccess ? PacketAccessBit : 0,
+    Flags = evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | ActualPacketAccessBit),
+    IsRowMajor = (evaluator<ArgType>::Flags & RowMajorBit),
+    Alignment = evaluator<ArgType>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& xpr)
+      : m_argImpl(xpr.nestedExpression()), m_rows(xpr.rows()), m_cols(xpr.cols()) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<CastOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  template <typename DstPacketType>
+  using AltSrcScalarOp = std::enable_if_t<(unpacket_traits<DstPacketType>::size < SrcPacketSize &&
+                                           !find_packet_by_size<SrcType, unpacket_traits<DstPacketType>::size>::value),
+                                          bool>;
+  template <typename DstPacketType>
+  using SrcPacketArgs1 =
+      std::enable_if_t<(find_packet_by_size<SrcType, unpacket_traits<DstPacketType>::size>::value), bool>;
+  template <typename DstPacketType>
+  using SrcPacketArgs2 = std::enable_if_t<(unpacket_traits<DstPacketType>::size) == (2 * SrcPacketSize), bool>;
+  template <typename DstPacketType>
+  using SrcPacketArgs4 = std::enable_if_t<(unpacket_traits<DstPacketType>::size) == (4 * SrcPacketSize), bool>;
+  template <typename DstPacketType>
+  using SrcPacketArgs8 = std::enable_if_t<(unpacket_traits<DstPacketType>::size) == (8 * SrcPacketSize), bool>;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index row, Index col, Index begin, Index count) const {
+    return IsRowMajor ? (col + count + begin <= cols()) : (row + count + begin <= rows());
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_array_bounds(Index index, Index begin, Index count) const {
+    return index + count + begin <= size();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SrcType srcCoeff(Index row, Index col, Index offset) const {
+    Index actualRow = IsRowMajor ? row : row + offset;
+    Index actualCol = IsRowMajor ? col + offset : col;
+    return m_argImpl.coeff(actualRow, actualCol);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SrcType srcCoeff(Index index, Index offset) const {
+    Index actualIndex = index + offset;
+    return m_argImpl.coeff(actualIndex);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstType coeff(Index row, Index col) const {
+    return cast<SrcType, DstType>(srcCoeff(row, col, 0));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstType coeff(Index index) const {
+    return cast<SrcType, DstType>(srcCoeff(index, 0));
+  }
+
+  template <int LoadMode, typename PacketType = SrcPacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacket(Index row, Index col, Index offset) const {
+    constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    Index packetOffset = offset * PacketSize;
+    Index actualRow = IsRowMajor ? row : row + packetOffset;
+    Index actualCol = IsRowMajor ? col + packetOffset : col;
+    eigen_assert(check_array_bounds(actualRow, actualCol, 0, PacketSize) && "Array index out of bounds");
+    return m_argImpl.template packet<LoadMode, PacketType>(actualRow, actualCol);
+  }
+  template <int LoadMode, typename PacketType = SrcPacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacket(Index index, Index offset) const {
+    constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    Index packetOffset = offset * PacketSize;
+    Index actualIndex = index + packetOffset;
+    eigen_assert(check_array_bounds(actualIndex, 0, PacketSize) && "Array index out of bounds");
+    return m_argImpl.template packet<LoadMode, PacketType>(actualIndex);
+  }
+  template <int LoadMode, typename PacketType = SrcPacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacketSegment(Index row, Index col, Index begin, Index count,
+                                                                    Index offset) const {
+    constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    Index packetOffset = offset * PacketSize;
+    Index actualRow = IsRowMajor ? row : row + packetOffset;
+    Index actualCol = IsRowMajor ? col + packetOffset : col;
+    eigen_assert(check_array_bounds(actualRow, actualCol, begin, count) && "Array index out of bounds");
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(actualRow, actualCol, begin, count);
+  }
+  template <int LoadMode, typename PacketType = SrcPacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType srcPacketSegment(Index index, Index begin, Index count,
+                                                                    Index offset) const {
+    constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    Index packetOffset = offset * PacketSize;
+    Index actualIndex = index + packetOffset;
+    eigen_assert(check_array_bounds(actualIndex, begin, count) && "Array index out of bounds");
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(actualIndex, begin, count);
+  }
+
+  template <int NumPackets, int LoadMode, typename PacketType = SrcPacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketBlock<PacketType, NumPackets> srcPacketSegmentHelper(Index row, Index col,
+                                                                                                   Index begin,
+                                                                                                   Index count) const {
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<PacketType, NumPackets> packets;
+    for (Index i = 0; i < NumPackets; i++) packets.packet[i] = pzero(PacketType());
+    Index offset = begin / SrcPacketSize;
+    Index actualBegin = begin % SrcPacketSize;
+    for (; offset < NumPackets; offset++) {
+      Index actualCount = numext::mini(SrcPacketSize - actualBegin, count);
+      packets.packet[offset] = srcPacketSegment<SrcLoadMode>(row, col, actualBegin, actualCount, offset);
+      if (count == actualCount) break;
+      actualBegin = 0;
+      count -= actualCount;
+    }
+    return packets;
+  }
+  template <int NumPackets, int LoadMode, typename PacketType = SrcPacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketBlock<PacketType, NumPackets> srcPacketSegmentHelper(Index index,
+                                                                                                   Index begin,
+                                                                                                   Index count) const {
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<PacketType, NumPackets> packets;
+    for (Index i = 0; i < NumPackets; i++) packets.packet[i] = pzero(PacketType());
+    Index offset = begin / SrcPacketSize;
+    Index actualBegin = begin % SrcPacketSize;
+    for (; offset < NumPackets; offset++) {
+      Index actualCount = numext::mini(SrcPacketSize - actualBegin, count);
+      packets.packet[offset] = srcPacketSegment<SrcLoadMode>(index, actualBegin, actualCount, offset);
+      if (count == actualCount) break;
+      actualBegin = 0;
+      count -= actualCount;
+    }
+    return packets;
+  }
+
+  // There is no source packet type with equal or fewer elements than DstPacketType.
+  // This is problematic as the evaluation loop may attempt to access data outside the bounds of the array.
+  // For example, consider the cast utilizing pcast<Packet4f,Packet2d> with an array of size 4: {0.0f,1.0f,2.0f,3.0f}.
+  // The first iteration of the evaluation loop will load 16 bytes: {0.0f,1.0f,2.0f,3.0f} and cast to {0.0,1.0}, which
+  // is acceptable. The second iteration will load 16 bytes: {2.0f,3.0f,?,?}, which is outside the bounds of the array.
+  template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const {
+    constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+    constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+    constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+    return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(row, col, 0, DstPacketSize, 0));
+  }
+  // Use the source packet type with the same size as DstPacketType, if it exists
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const {
+    constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+    using SizedSrcPacketType = typename find_packet_by_size<SrcType, DstPacketSize>::type;
+    constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+    constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+    return pcast<SizedSrcPacketType, DstPacketType>(srcPacket<SrcLoadMode, SizedSrcPacketType>(row, col, 0));
+  }
+  // unpacket_traits<DstPacketType>::size == 2 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs2<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const {
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    return pcast<SrcPacketType, DstPacketType>(srcPacket<SrcLoadMode>(row, col, 0),
+                                               srcPacket<SrcLoadMode>(row, col, 1));
+  }
+  // unpacket_traits<DstPacketType>::size == 4 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs4<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const {
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    return pcast<SrcPacketType, DstPacketType>(srcPacket<SrcLoadMode>(row, col, 0), srcPacket<SrcLoadMode>(row, col, 1),
+                                               srcPacket<SrcLoadMode>(row, col, 2),
+                                               srcPacket<SrcLoadMode>(row, col, 3));
+  }
+  // unpacket_traits<DstPacketType>::size == 8 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs8<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index row, Index col) const {
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    return pcast<SrcPacketType, DstPacketType>(
+        srcPacket<SrcLoadMode>(row, col, 0), srcPacket<SrcLoadMode>(row, col, 1), srcPacket<SrcLoadMode>(row, col, 2),
+        srcPacket<SrcLoadMode>(row, col, 3), srcPacket<SrcLoadMode>(row, col, 4), srcPacket<SrcLoadMode>(row, col, 5),
+        srcPacket<SrcLoadMode>(row, col, 6), srcPacket<SrcLoadMode>(row, col, 7));
+  }
+
+  // packetSegment variants
+  template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+                                                                    Index count) const {
+    constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+    constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+    constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+    return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(row, col, begin, count, 0));
+  }
+  // Use the source packet type with the same size as DstPacketType, if it exists
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+                                                                    Index count) const {
+    constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+    using SizedSrcPacketType = typename find_packet_by_size<SrcType, DstPacketSize>::type;
+    constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+    constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+    return pcast<SizedSrcPacketType, DstPacketType>(
+        srcPacketSegment<SrcLoadMode, SizedSrcPacketType>(row, col, begin, count, 0));
+  }
+  // unpacket_traits<DstPacketType>::size == 2 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs2<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+                                                                    Index count) const {
+    constexpr int NumPackets = 2;
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<SrcPacketType, NumPackets> packets =
+        srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count);
+    return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1]);
+  }
+  // unpacket_traits<DstPacketType>::size == 4 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs4<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+                                                                    Index count) const {
+    constexpr int NumPackets = 4;
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<SrcPacketType, NumPackets> packets =
+        srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count);
+    return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
+                                               packets.packet[3]);
+  }
+  // unpacket_traits<DstPacketType>::size == 8 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs8<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index row, Index col, Index begin,
+                                                                    Index count) const {
+    constexpr int NumPackets = 8;
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<SrcPacketType, NumPackets> packets =
+        srcPacketSegmentHelper<NumPackets, SrcLoadMode>(row, col, begin, count);
+    return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
+                                               packets.packet[3], packets.packet[4], packets.packet[5],
+                                               packets.packet[6], packets.packet[7]);
+  }
+
+  // Analogous routines for linear access.
+  template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
+    constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+    constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+    constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+    return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(index, 0, DstPacketSize, 0));
+  }
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
+    constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+    using SizedSrcPacketType = typename find_packet_by_size<SrcType, DstPacketSize>::type;
+    constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+    constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+    return pcast<SizedSrcPacketType, DstPacketType>(srcPacket<SrcLoadMode, SizedSrcPacketType>(index, 0));
+  }
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs2<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    return pcast<SrcPacketType, DstPacketType>(srcPacket<SrcLoadMode>(index, 0), srcPacket<SrcLoadMode>(index, 1));
+  }
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs4<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    return pcast<SrcPacketType, DstPacketType>(srcPacket<SrcLoadMode>(index, 0), srcPacket<SrcLoadMode>(index, 1),
+                                               srcPacket<SrcLoadMode>(index, 2), srcPacket<SrcLoadMode>(index, 3));
+  }
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs8<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packet(Index index) const {
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    return pcast<SrcPacketType, DstPacketType>(srcPacket<SrcLoadMode>(index, 0), srcPacket<SrcLoadMode>(index, 1),
+                                               srcPacket<SrcLoadMode>(index, 2), srcPacket<SrcLoadMode>(index, 3),
+                                               srcPacket<SrcLoadMode>(index, 4), srcPacket<SrcLoadMode>(index, 5),
+                                               srcPacket<SrcLoadMode>(index, 6), srcPacket<SrcLoadMode>(index, 7));
+  }
+
+  // packetSegment variants
+  template <int LoadMode, typename DstPacketType, AltSrcScalarOp<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+    constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+    constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+    constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+    return pcast<SrcPacketType, DstPacketType>(srcPacketSegment<SrcLoadMode>(index, begin, count, 0));
+  }
+  // Use the source packet type with the same size as DstPacketType, if it exists
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs1<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+    constexpr int DstPacketSize = unpacket_traits<DstPacketType>::size;
+    using SizedSrcPacketType = typename find_packet_by_size<SrcType, DstPacketSize>::type;
+    constexpr int SrcBytesIncrement = DstPacketSize * sizeof(SrcType);
+    constexpr int SrcLoadMode = plain_enum_min(SrcBytesIncrement, LoadMode);
+    return pcast<SizedSrcPacketType, DstPacketType>(
+        srcPacketSegment<SrcLoadMode, SizedSrcPacketType>(index, begin, count, 0));
+  }
+  // unpacket_traits<DstPacketType>::size == 2 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs2<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+    constexpr int NumPackets = 2;
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<SrcPacketType, NumPackets> packets =
+        srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count);
+    return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1]);
+  }
+  // unpacket_traits<DstPacketType>::size == 4 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs4<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+    constexpr int NumPackets = 4;
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<SrcPacketType, NumPackets> packets =
+        srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count);
+    return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
+                                               packets.packet[3]);
+  }
+  // unpacket_traits<DstPacketType>::size == 8 * SrcPacketSize
+  template <int LoadMode, typename DstPacketType, SrcPacketArgs8<DstPacketType> = true>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DstPacketType packetSegment(Index index, Index begin, Index count) const {
+    constexpr int NumPackets = 8;
+    constexpr int SrcLoadMode = plain_enum_min(SrcPacketBytes, LoadMode);
+    PacketBlock<SrcPacketType, NumPackets> packets =
+        srcPacketSegmentHelper<NumPackets, SrcLoadMode>(index, begin, count);
+    return pcast<SrcPacketType, DstPacketType>(packets.packet[0], packets.packet[1], packets.packet[2],
+                                               packets.packet[3], packets.packet[4], packets.packet[5],
+                                               packets.packet[6], packets.packet[7]);
+  }
+
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_rows; }
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_cols; }
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_rows * m_cols; }
+
+ protected:
+  const evaluator<ArgType> m_argImpl;
+  const variable_if_dynamic<Index, XprType::RowsAtCompileTime> m_rows;
+  const variable_if_dynamic<Index, XprType::ColsAtCompileTime> m_cols;
+};
+
+// -------------------- CwiseTernaryOp --------------------
+
+// this is a ternary expression
+template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
+struct evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>>
+    : public ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>> {
+  typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
+  typedef ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>> Base;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
+struct ternary_evaluator<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>, IndexBased, IndexBased>
+    : evaluator_base<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>> {
+  typedef CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3> XprType;
+
+  enum {
+    CoeffReadCost = int(evaluator<Arg1>::CoeffReadCost) + int(evaluator<Arg2>::CoeffReadCost) +
+                    int(evaluator<Arg3>::CoeffReadCost) + int(functor_traits<TernaryOp>::Cost),
+
+    Arg1Flags = evaluator<Arg1>::Flags,
+    Arg2Flags = evaluator<Arg2>::Flags,
+    Arg3Flags = evaluator<Arg3>::Flags,
+    SameType = is_same<typename Arg1::Scalar, typename Arg2::Scalar>::value &&
+               is_same<typename Arg1::Scalar, typename Arg3::Scalar>::value,
+    StorageOrdersAgree = (int(Arg1Flags) & RowMajorBit) == (int(Arg2Flags) & RowMajorBit) &&
+                         (int(Arg1Flags) & RowMajorBit) == (int(Arg3Flags) & RowMajorBit),
+    Flags0 = (int(Arg1Flags) | int(Arg2Flags) | int(Arg3Flags)) &
+             (HereditaryBits |
+              (int(Arg1Flags) & int(Arg2Flags) & int(Arg3Flags) &
+               ((StorageOrdersAgree ? LinearAccessBit : 0) |
+                (functor_traits<TernaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)))),
+    Flags = (Flags0 & ~RowMajorBit) | (Arg1Flags & RowMajorBit),
+    Alignment = plain_enum_min(plain_enum_min(evaluator<Arg1>::Alignment, evaluator<Arg2>::Alignment),
+                               evaluator<Arg3>::Alignment)
+  };
+
+  EIGEN_DEVICE_FUNC explicit ternary_evaluator(const XprType& xpr) : m_d(xpr) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<TernaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+    return m_d.func()(m_d.arg1Impl.coeff(row, col), m_d.arg2Impl.coeff(row, col), m_d.arg3Impl.coeff(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return m_d.func()(m_d.arg1Impl.coeff(index), m_d.arg2Impl.coeff(index), m_d.arg3Impl.coeff(index));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    return m_d.func().packetOp(m_d.arg1Impl.template packet<LoadMode, PacketType>(row, col),
+                               m_d.arg2Impl.template packet<LoadMode, PacketType>(row, col),
+                               m_d.arg3Impl.template packet<LoadMode, PacketType>(row, col));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+    return m_d.func().packetOp(m_d.arg1Impl.template packet<LoadMode, PacketType>(index),
+                               m_d.arg2Impl.template packet<LoadMode, PacketType>(index),
+                               m_d.arg3Impl.template packet<LoadMode, PacketType>(index));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return m_d.func().packetOp(m_d.arg1Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+                               m_d.arg2Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+                               m_d.arg3Impl.template packetSegment<LoadMode, PacketType>(row, col, begin, count));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return m_d.func().packetOp(m_d.arg1Impl.template packetSegment<LoadMode, PacketType>(index, begin, count),
+                               m_d.arg2Impl.template packetSegment<LoadMode, PacketType>(index, begin, count),
+                               m_d.arg3Impl.template packetSegment<LoadMode, PacketType>(index, begin, count));
+  }
+
+ protected:
+  // this helper permits to completely eliminate the functor if it is empty
+  struct Data {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Data(const XprType& xpr)
+        : op(xpr.functor()), arg1Impl(xpr.arg1()), arg2Impl(xpr.arg2()), arg3Impl(xpr.arg3()) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TernaryOp& func() const { return op; }
+    TernaryOp op;
+    evaluator<Arg1> arg1Impl;
+    evaluator<Arg2> arg2Impl;
+    evaluator<Arg3> arg3Impl;
+  };
+
+  Data m_d;
+};
+
+template <typename Arg1, typename Arg2, typename Scalar, typename CmpLhsType, typename CmpRhsType, ComparisonName cmp>
+struct scalar_boolean_select_spec {
+  using DummyTernaryOp = scalar_boolean_select_op<Scalar, Scalar, bool>;
+  using DummyArg3 = CwiseBinaryOp<scalar_cmp_op<Scalar, Scalar, cmp, false>, CmpLhsType, CmpRhsType>;
+  using DummyXprType = CwiseTernaryOp<DummyTernaryOp, Arg1, Arg2, DummyArg3>;
+
+  // only use the typed comparison if it is vectorized
+  static constexpr bool UseTyped = functor_traits<scalar_cmp_op<Scalar, Scalar, cmp, true>>::PacketAccess;
+  using CondScalar = std::conditional_t<UseTyped, Scalar, bool>;
+
+  using TernaryOp = scalar_boolean_select_op<Scalar, Scalar, CondScalar>;
+  using Arg3 = CwiseBinaryOp<scalar_cmp_op<Scalar, Scalar, cmp, UseTyped>, CmpLhsType, CmpRhsType>;
+  using XprType = CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>;
+
+  using Base = ternary_evaluator<XprType>;
+};
+
+// specialization for expressions like (a < b).select(c, d) to enable full vectorization
+template <typename Arg1, typename Arg2, typename Scalar, typename CmpLhsType, typename CmpRhsType, ComparisonName cmp>
+struct evaluator<CwiseTernaryOp<scalar_boolean_select_op<Scalar, Scalar, bool>, Arg1, Arg2,
+                                CwiseBinaryOp<scalar_cmp_op<Scalar, Scalar, cmp, false>, CmpLhsType, CmpRhsType>>>
+    : public scalar_boolean_select_spec<Arg1, Arg2, Scalar, CmpLhsType, CmpRhsType, cmp>::Base {
+  using Helper = scalar_boolean_select_spec<Arg1, Arg2, Scalar, CmpLhsType, CmpRhsType, cmp>;
+  using Base = typename Helper::Base;
+  using DummyXprType = typename Helper::DummyXprType;
+  using Arg3 = typename Helper::Arg3;
+  using XprType = typename Helper::XprType;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const DummyXprType& xpr)
+      : Base(XprType(xpr.arg1(), xpr.arg2(), Arg3(xpr.arg3().lhs(), xpr.arg3().rhs()))) {}
+};
+
+// -------------------- CwiseBinaryOp --------------------
+
+// this is a binary expression
+template <typename BinaryOp, typename Lhs, typename Rhs>
+struct evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>> : public binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>> {
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>> Base;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+template <typename BinaryOp, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IndexBased>
+    : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs>> {
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+
+  enum {
+    CoeffReadCost =
+        int(evaluator<Lhs>::CoeffReadCost) + int(evaluator<Rhs>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
+
+    LhsFlags = evaluator<Lhs>::Flags,
+    RhsFlags = evaluator<Rhs>::Flags,
+    SameType = is_same<typename Lhs::Scalar, typename Rhs::Scalar>::value,
+    StorageOrdersAgree = (int(LhsFlags) & RowMajorBit) == (int(RhsFlags) & RowMajorBit),
+    Flags0 = (int(LhsFlags) | int(RhsFlags)) &
+             (HereditaryBits |
+              (int(LhsFlags) & int(RhsFlags) &
+               ((StorageOrdersAgree ? LinearAccessBit : 0) |
+                (functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)))),
+    Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit),
+    Alignment = plain_enum_min(evaluator<Lhs>::Alignment, evaluator<Rhs>::Alignment)
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit binary_evaluator(const XprType& xpr) : m_d(xpr) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+    return m_d.func()(m_d.lhsImpl.coeff(row, col), m_d.rhsImpl.coeff(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return m_d.func()(m_d.lhsImpl.coeff(index), m_d.rhsImpl.coeff(index));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    return m_d.func().packetOp(m_d.lhsImpl.template packet<LoadMode, PacketType>(row, col),
+                               m_d.rhsImpl.template packet<LoadMode, PacketType>(row, col));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+    return m_d.func().packetOp(m_d.lhsImpl.template packet<LoadMode, PacketType>(index),
+                               m_d.rhsImpl.template packet<LoadMode, PacketType>(index));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return m_d.func().packetOp(m_d.lhsImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+                               m_d.rhsImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return m_d.func().packetOp(m_d.lhsImpl.template packetSegment<LoadMode, PacketType>(index, begin, count),
+                               m_d.rhsImpl.template packetSegment<LoadMode, PacketType>(index, begin, count));
+  }
+
+ protected:
+  // this helper permits to completely eliminate the functor if it is empty
+  struct Data {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Data(const XprType& xpr)
+        : op(xpr.functor()), lhsImpl(xpr.lhs()), rhsImpl(xpr.rhs()) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const BinaryOp& func() const { return op; }
+    BinaryOp op;
+    evaluator<Lhs> lhsImpl;
+    evaluator<Rhs> rhsImpl;
+  };
+
+  Data m_d;
+};
+
+// -------------------- CwiseUnaryView --------------------
+
+template <typename UnaryOp, typename ArgType, typename StrideType>
+struct unary_evaluator<CwiseUnaryView<UnaryOp, ArgType, StrideType>, IndexBased>
+    : evaluator_base<CwiseUnaryView<UnaryOp, ArgType, StrideType>> {
+  typedef CwiseUnaryView<UnaryOp, ArgType, StrideType> XprType;
+
+  enum {
+    CoeffReadCost = int(evaluator<ArgType>::CoeffReadCost) + int(functor_traits<UnaryOp>::Cost),
+
+    Flags = (evaluator<ArgType>::Flags & (HereditaryBits | LinearAccessBit | DirectAccessBit)),
+
+    Alignment = 0  // FIXME it is not very clear why alignment is necessarily lost...
+  };
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) : m_d(op) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+    return m_d.func()(m_d.argImpl.coeff(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return m_d.func()(m_d.argImpl.coeff(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+    return m_d.func()(m_d.argImpl.coeffRef(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    return m_d.func()(m_d.argImpl.coeffRef(index));
+  }
+
+ protected:
+  // this helper permits to completely eliminate the functor if it is empty
+  struct Data {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Data(const XprType& xpr)
+        : op(xpr.functor()), argImpl(xpr.nestedExpression()) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryOp& func() const { return op; }
+    UnaryOp op;
+    evaluator<ArgType> argImpl;
+  };
+
+  Data m_d;
+};
+
+// -------------------- Map --------------------
+
+// FIXME perhaps the PlainObjectType could be provided by Derived::PlainObject ?
+// but that might complicate template specialization
+template <typename Derived, typename PlainObjectType>
+struct mapbase_evaluator;
+
+template <typename Derived, typename PlainObjectType>
+struct mapbase_evaluator : evaluator_base<Derived> {
+  typedef Derived XprType;
+  typedef typename XprType::PointerType PointerType;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  enum {
+    IsRowMajor = XprType::RowsAtCompileTime,
+    ColsAtCompileTime = XprType::ColsAtCompileTime,
+    CoeffReadCost = NumTraits<Scalar>::ReadCost
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit mapbase_evaluator(const XprType& map)
+      : m_data(const_cast<PointerType>(map.data())),
+        m_innerStride(map.innerStride()),
+        m_outerStride(map.outerStride()) {
+    EIGEN_STATIC_ASSERT(check_implication((evaluator<Derived>::Flags & PacketAccessBit) != 0,
+                                          inner_stride_at_compile_time<Derived>::ret == 1),
+                        PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+    return m_data[col * colStride() + row * rowStride()];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return m_data[index * m_innerStride.value()];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+    return m_data[col * colStride() + row * rowStride()];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return m_data[index * m_innerStride.value()]; }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
+    return ploadt<PacketType, LoadMode>(ptr);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+    return ploadt<PacketType, LoadMode>(m_data + index * m_innerStride.value());
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
+    pstoret<Scalar, PacketType, StoreMode>(ptr, x);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
+    pstoret<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
+    return ploadtSegment<PacketType, LoadMode>(ptr, begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return ploadtSegment<PacketType, LoadMode>(m_data + index * m_innerStride.value(), begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+                                                                Index count) {
+    PointerType ptr = m_data + row * rowStride() + col * colStride();
+    pstoretSegment<Scalar, PacketType, StoreMode>(ptr, x, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+                                                                Index count) {
+    pstoretSegment<Scalar, PacketType, StoreMode>(m_data + index * m_innerStride.value(), x, begin, count);
+  }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rowStride() const noexcept {
+    return XprType::IsRowMajor ? m_outerStride.value() : m_innerStride.value();
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index colStride() const noexcept {
+    return XprType::IsRowMajor ? m_innerStride.value() : m_outerStride.value();
+  }
+
+  PointerType m_data;
+  const variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_innerStride;
+  const variable_if_dynamic<Index, XprType::OuterStrideAtCompileTime> m_outerStride;
+};
+
+template <typename PlainObjectType, int MapOptions, typename StrideType>
+struct evaluator<Map<PlainObjectType, MapOptions, StrideType>>
+    : public mapbase_evaluator<Map<PlainObjectType, MapOptions, StrideType>, PlainObjectType> {
+  typedef Map<PlainObjectType, MapOptions, StrideType> XprType;
+  typedef typename XprType::Scalar Scalar;
+  // TODO: should check for smaller packet types once we can handle multi-sized packet types
+  typedef typename packet_traits<Scalar>::type PacketScalar;
+
+  enum {
+    InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
+                                   ? int(PlainObjectType::InnerStrideAtCompileTime)
+                                   : int(StrideType::InnerStrideAtCompileTime),
+    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
+                                   ? int(PlainObjectType::OuterStrideAtCompileTime)
+                                   : int(StrideType::OuterStrideAtCompileTime),
+    HasNoInnerStride = InnerStrideAtCompileTime == 1,
+    HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0,
+    HasNoStride = HasNoInnerStride && HasNoOuterStride,
+    IsDynamicSize = PlainObjectType::SizeAtCompileTime == Dynamic,
+
+    PacketAccessMask = bool(HasNoInnerStride) ? ~int(0) : ~int(PacketAccessBit),
+    LinearAccessMask =
+        bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime) ? ~int(0) : ~int(LinearAccessBit),
+    Flags = int(evaluator<PlainObjectType>::Flags) & (LinearAccessMask & PacketAccessMask),
+
+    Alignment = int(MapOptions) & int(AlignedMask)
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& map) : mapbase_evaluator<XprType, PlainObjectType>(map) {}
+};
+
+// -------------------- Ref --------------------
+
+template <typename PlainObjectType, int RefOptions, typename StrideType>
+struct evaluator<Ref<PlainObjectType, RefOptions, StrideType>>
+    : public mapbase_evaluator<Ref<PlainObjectType, RefOptions, StrideType>, PlainObjectType> {
+  typedef Ref<PlainObjectType, RefOptions, StrideType> XprType;
+
+  enum {
+    Flags = evaluator<Map<PlainObjectType, RefOptions, StrideType>>::Flags,
+    Alignment = evaluator<Map<PlainObjectType, RefOptions, StrideType>>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& ref)
+      : mapbase_evaluator<XprType, PlainObjectType>(ref) {}
+};
+
+// -------------------- Block --------------------
+
+template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel,
+          bool HasDirectAccess = has_direct_access<ArgType>::ret>
+struct block_evaluator;
+
+template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>>
+    : block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> {
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+  typedef typename XprType::Scalar Scalar;
+  // TODO: should check for smaller packet types once we can handle multi-sized packet types
+  typedef typename packet_traits<Scalar>::type PacketScalar;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+
+    RowsAtCompileTime = traits<XprType>::RowsAtCompileTime,
+    ColsAtCompileTime = traits<XprType>::ColsAtCompileTime,
+    MaxRowsAtCompileTime = traits<XprType>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = traits<XprType>::MaxColsAtCompileTime,
+
+    ArgTypeIsRowMajor = (int(evaluator<ArgType>::Flags) & RowMajorBit) != 0,
+    IsRowMajor = (MaxRowsAtCompileTime == 1 && MaxColsAtCompileTime != 1)   ? 1
+                 : (MaxColsAtCompileTime == 1 && MaxRowsAtCompileTime != 1) ? 0
+                                                                            : ArgTypeIsRowMajor,
+    HasSameStorageOrderAsArgType = (IsRowMajor == ArgTypeIsRowMajor),
+    InnerSize = IsRowMajor ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
+    InnerStrideAtCompileTime = HasSameStorageOrderAsArgType ? int(inner_stride_at_compile_time<ArgType>::ret)
+                                                            : int(outer_stride_at_compile_time<ArgType>::ret),
+    OuterStrideAtCompileTime = HasSameStorageOrderAsArgType ? int(outer_stride_at_compile_time<ArgType>::ret)
+                                                            : int(inner_stride_at_compile_time<ArgType>::ret),
+    MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0,
+
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 ||
+                            (InnerPanel && (evaluator<ArgType>::Flags & LinearAccessBit)))
+                               ? LinearAccessBit
+                               : 0,
+    FlagsRowMajorBit = XprType::Flags & RowMajorBit,
+    Flags0 = evaluator<ArgType>::Flags & ((HereditaryBits & ~RowMajorBit) | DirectAccessBit | MaskPacketAccessBit),
+    Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit,
+
+    PacketAlignment = unpacket_traits<PacketScalar>::alignment,
+    Alignment0 = (InnerPanel && (OuterStrideAtCompileTime != Dynamic) && (OuterStrideAtCompileTime != 0) &&
+                  (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0))
+                     ? int(PacketAlignment)
+                     : 0,
+    Alignment = plain_enum_min(evaluator<ArgType>::Alignment, Alignment0)
+  };
+  typedef block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel> block_evaluator_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& block) : block_evaluator_type(block) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+};
+
+// no direct-access => dispatch to a unary evaluator
+template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /*HasDirectAccess*/ false>
+    : unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>> {
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit block_evaluator(const XprType& block)
+      : unary_evaluator<XprType>(block) {}
+};
+
+template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IndexBased>
+    : evaluator_base<Block<ArgType, BlockRows, BlockCols, InnerPanel>> {
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& block)
+      : m_argImpl(block.nestedExpression()),
+        m_startRow(block.startRow()),
+        m_startCol(block.startCol()),
+        m_linear_offset(ForwardLinearAccess
+                            ? (ArgType::IsRowMajor
+                                   ? block.startRow() * block.nestedExpression().cols() + block.startCol()
+                                   : block.startCol() * block.nestedExpression().rows() + block.startRow())
+                            : 0) {}
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  enum {
+    RowsAtCompileTime = XprType::RowsAtCompileTime,
+    ForwardLinearAccess = (InnerPanel || int(XprType::IsRowMajor) == int(ArgType::IsRowMajor)) &&
+                          bool(evaluator<ArgType>::Flags & LinearAccessBit)
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+    return m_argImpl.coeff(m_startRow.value() + row, m_startCol.value() + col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return linear_coeff_impl(index, bool_constant<ForwardLinearAccess>());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+    return m_argImpl.coeffRef(m_startRow.value() + row, m_startCol.value() + col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    return linear_coeffRef_impl(index, bool_constant<ForwardLinearAccess>());
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    return m_argImpl.template packet<LoadMode, PacketType>(m_startRow.value() + row, m_startCol.value() + col);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+    if (ForwardLinearAccess)
+      return m_argImpl.template packet<LoadMode, PacketType>(m_linear_offset.value() + index);
+    else
+      return packet<LoadMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
+    return m_argImpl.template writePacket<StoreMode, PacketType>(m_startRow.value() + row, m_startCol.value() + col, x);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
+    if (ForwardLinearAccess)
+      return m_argImpl.template writePacket<StoreMode, PacketType>(m_linear_offset.value() + index, x);
+    else
+      return writePacket<StoreMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0,
+                                                x);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(m_startRow.value() + row, m_startCol.value() + col,
+                                                                  begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    if (ForwardLinearAccess)
+      return m_argImpl.template packetSegment<LoadMode, PacketType>(m_linear_offset.value() + index, begin, count);
+    else
+      return packetSegment<LoadMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0,
+                                                 begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+                                                                Index count) {
+    return m_argImpl.template writePacketSegment<StoreMode, PacketType>(m_startRow.value() + row,
+                                                                        m_startCol.value() + col, x, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+                                                                Index count) {
+    if (ForwardLinearAccess)
+      return m_argImpl.template writePacketSegment<StoreMode, PacketType>(m_linear_offset.value() + index, x, begin,
+                                                                          count);
+    else
+      return writePacketSegment<StoreMode, PacketType>(RowsAtCompileTime == 1 ? 0 : index,
+                                                       RowsAtCompileTime == 1 ? index : 0, x, begin, count);
+  }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType
+  linear_coeff_impl(Index index, internal::true_type /* ForwardLinearAccess */) const {
+    return m_argImpl.coeff(m_linear_offset.value() + index);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType
+  linear_coeff_impl(Index index, internal::false_type /* not ForwardLinearAccess */) const {
+    return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& linear_coeffRef_impl(Index index,
+                                                                     internal::true_type /* ForwardLinearAccess */) {
+    return m_argImpl.coeffRef(m_linear_offset.value() + index);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& linear_coeffRef_impl(
+      Index index, internal::false_type /* not ForwardLinearAccess */) {
+    return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0);
+  }
+
+  evaluator<ArgType> m_argImpl;
+  const variable_if_dynamic<Index, (ArgType::RowsAtCompileTime == 1 && BlockRows == 1) ? 0 : Dynamic> m_startRow;
+  const variable_if_dynamic<Index, (ArgType::ColsAtCompileTime == 1 && BlockCols == 1) ? 0 : Dynamic> m_startCol;
+  const variable_if_dynamic<Index, ForwardLinearAccess ? Dynamic : 0> m_linear_offset;
+};
+
+// TODO: This evaluator does not actually use the child evaluator;
+// all action is via the data() as returned by the Block expression.
+
+template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct block_evaluator<ArgType, BlockRows, BlockCols, InnerPanel, /* HasDirectAccess */ true>
+    : mapbase_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>,
+                        typename Block<ArgType, BlockRows, BlockCols, InnerPanel>::PlainObject> {
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+  typedef typename XprType::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit block_evaluator(const XprType& block)
+      : mapbase_evaluator<XprType, typename XprType::PlainObject>(block) {
+    eigen_internal_assert((internal::is_constant_evaluated() ||
+                           (std::uintptr_t(block.data()) % plain_enum_max(1, evaluator<XprType>::Alignment)) == 0) &&
+                          "data is not aligned");
+  }
+};
+
+// -------------------- Replicate --------------------
+
+template <typename ArgType, int RowFactor, int ColFactor>
+struct unary_evaluator<Replicate<ArgType, RowFactor, ColFactor>>
+    : evaluator_base<Replicate<ArgType, RowFactor, ColFactor>> {
+  typedef Replicate<ArgType, RowFactor, ColFactor> XprType;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  enum { Factor = (RowFactor == Dynamic || ColFactor == Dynamic) ? Dynamic : RowFactor * ColFactor };
+  typedef typename nested_eval<ArgType, Factor>::type ArgTypeNested;
+  typedef remove_all_t<ArgTypeNested> ArgTypeNestedCleaned;
+
+  enum {
+    CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
+    LinearAccessMask = XprType::IsVectorAtCompileTime ? LinearAccessBit : 0,
+    Flags = (evaluator<ArgTypeNestedCleaned>::Flags & (HereditaryBits | LinearAccessMask) & ~RowMajorBit) |
+            (traits<XprType>::Flags & RowMajorBit),
+
+    Alignment = evaluator<ArgTypeNestedCleaned>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& replicate)
+      : m_arg(replicate.nestedExpression()),
+        m_argImpl(m_arg),
+        m_rows(replicate.nestedExpression().rows()),
+        m_cols(replicate.nestedExpression().cols()) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+    // try to avoid using modulo; this is a pure optimization strategy
+    const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value();
+    const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value();
+
+    return m_argImpl.coeff(actual_row, actual_col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    // try to avoid using modulo; this is a pure optimization strategy
+    const Index actual_index = traits<XprType>::RowsAtCompileTime == 1
+                                   ? (ColFactor == 1 ? index : index % m_cols.value())
+                                   : (RowFactor == 1 ? index : index % m_rows.value());
+
+    return m_argImpl.coeff(actual_index);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value();
+    const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value();
+
+    return m_argImpl.template packet<LoadMode, PacketType>(actual_row, actual_col);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+    const Index actual_index = traits<XprType>::RowsAtCompileTime == 1
+                                   ? (ColFactor == 1 ? index : index % m_cols.value())
+                                   : (RowFactor == 1 ? index : index % m_rows.value());
+
+    return m_argImpl.template packet<LoadMode, PacketType>(actual_index);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    const Index actual_row = traits<XprType>::RowsAtCompileTime == 1 ? 0 : RowFactor == 1 ? row : row % m_rows.value();
+    const Index actual_col = traits<XprType>::ColsAtCompileTime == 1 ? 0 : ColFactor == 1 ? col : col % m_cols.value();
+
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(actual_row, actual_col, begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    const Index actual_index = traits<XprType>::RowsAtCompileTime == 1
+                                   ? (ColFactor == 1 ? index : index % m_cols.value())
+                                   : (RowFactor == 1 ? index : index % m_rows.value());
+
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(actual_index, begin, count);
+  }
+
+ protected:
+  const ArgTypeNested m_arg;
+  evaluator<ArgTypeNestedCleaned> m_argImpl;
+  const variable_if_dynamic<Index, ArgType::RowsAtCompileTime> m_rows;
+  const variable_if_dynamic<Index, ArgType::ColsAtCompileTime> m_cols;
+};
+
+// -------------------- MatrixWrapper and ArrayWrapper --------------------
+//
+// evaluator_wrapper_base<T> is a common base class for the
+// MatrixWrapper and ArrayWrapper evaluators.
+
+template <typename XprType>
+struct evaluator_wrapper_base : evaluator_base<XprType> {
+  typedef remove_all_t<typename XprType::NestedExpressionType> ArgType;
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    Flags = evaluator<ArgType>::Flags,
+    Alignment = evaluator<ArgType>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator_wrapper_base(const ArgType& arg) : m_argImpl(arg) {}
+
+  typedef typename ArgType::Scalar Scalar;
+  typedef typename ArgType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+    return m_argImpl.coeff(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_argImpl.coeff(index); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) { return m_argImpl.coeffRef(row, col); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return m_argImpl.coeffRef(index); }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    return m_argImpl.template packet<LoadMode, PacketType>(row, col);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+    return m_argImpl.template packet<LoadMode, PacketType>(index);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
+    m_argImpl.template writePacket<StoreMode>(row, col, x);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
+    m_argImpl.template writePacket<StoreMode>(index, x);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    return m_argImpl.template packetSegment<LoadMode, PacketType>(index, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+                                                                Index count) {
+    m_argImpl.template writePacketSegment<StoreMode>(row, col, x, begin, count);
+  }
+
+  template <int StoreMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+                                                                Index count) {
+    m_argImpl.template writePacketSegment<StoreMode>(index, x, begin, count);
+  }
+
+ protected:
+  evaluator<ArgType> m_argImpl;
+};
+
+template <typename TArgType>
+struct unary_evaluator<MatrixWrapper<TArgType>> : evaluator_wrapper_base<MatrixWrapper<TArgType>> {
+  typedef MatrixWrapper<TArgType> XprType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& wrapper)
+      : evaluator_wrapper_base<MatrixWrapper<TArgType>>(wrapper.nestedExpression()) {}
+};
+
+template <typename TArgType>
+struct unary_evaluator<ArrayWrapper<TArgType>> : evaluator_wrapper_base<ArrayWrapper<TArgType>> {
+  typedef ArrayWrapper<TArgType> XprType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& wrapper)
+      : evaluator_wrapper_base<ArrayWrapper<TArgType>>(wrapper.nestedExpression()) {}
+};
+
+// -------------------- Reverse --------------------
+
+// defined in Reverse.h:
+template <typename PacketType, bool ReversePacket>
+struct reverse_packet_cond;
+
+template <typename ArgType, int Direction>
+struct unary_evaluator<Reverse<ArgType, Direction>> : evaluator_base<Reverse<ArgType, Direction>> {
+  typedef Reverse<ArgType, Direction> XprType;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  enum {
+    IsRowMajor = XprType::IsRowMajor,
+    IsColMajor = !IsRowMajor,
+    ReverseRow = (Direction == Vertical) || (Direction == BothDirections),
+    ReverseCol = (Direction == Horizontal) || (Direction == BothDirections),
+    ReversePacket = (Direction == BothDirections) || ((Direction == Vertical) && IsColMajor) ||
+                    ((Direction == Horizontal) && IsRowMajor),
+
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+
+    // let's enable LinearAccess only with vectorization because of the product overhead
+    // FIXME enable DirectAccess with negative strides?
+    Flags0 = evaluator<ArgType>::Flags,
+    LinearAccess =
+        ((Direction == BothDirections) && (int(Flags0) & PacketAccessBit)) ||
+                ((ReverseRow && XprType::ColsAtCompileTime == 1) || (ReverseCol && XprType::RowsAtCompileTime == 1))
+            ? LinearAccessBit
+            : 0,
+
+    Flags = int(Flags0) & (HereditaryBits | PacketAccessBit | LinearAccess),
+
+    Alignment = 0  // FIXME in some rare cases, Alignment could be preserved, like a Vector4f.
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& reverse)
+      : m_argImpl(reverse.nestedExpression()),
+        m_rows(ReverseRow ? reverse.nestedExpression().rows() : 1),
+        m_cols(ReverseCol ? reverse.nestedExpression().cols() : 1) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+    return m_argImpl.coeff(ReverseRow ? m_rows.value() - row - 1 : row, ReverseCol ? m_cols.value() - col - 1 : col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return m_argImpl.coeff(m_rows.value() * m_cols.value() - index - 1);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+    return m_argImpl.coeffRef(ReverseRow ? m_rows.value() - row - 1 : row, ReverseCol ? m_cols.value() - col - 1 : col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    return m_argImpl.coeffRef(m_rows.value() * m_cols.value() - index - 1);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
+    static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
+    using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
+
+    Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
+    Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
+
+    return reverse_packet::run(m_argImpl.template packet<LoadMode, PacketType>(actualRow, actualCol));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+
+    Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
+
+    return preverse(m_argImpl.template packet<LoadMode, PacketType>(actualIndex));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index row, Index col, const PacketType& x) {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
+    static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
+    using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
+
+    Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
+    Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
+
+    m_argImpl.template writePacket<LoadMode>(actualRow, actualCol, reverse_packet::run(x));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+
+    Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
+
+    m_argImpl.template writePacket<LoadMode>(actualIndex, preverse(x));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
+    static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
+    using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
+
+    Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
+    Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
+    Index actualBegin = ReversePacket ? (PacketSize - count - begin) : begin;
+
+    return reverse_packet::run(
+        m_argImpl.template packetSegment<LoadMode, PacketType>(actualRow, actualCol, actualBegin, count));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+
+    Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
+    Index actualBegin = PacketSize - count - begin;
+
+    return preverse(m_argImpl.template packetSegment<LoadMode, PacketType>(actualIndex, actualBegin, count));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index row, Index col, const PacketType& x, Index begin,
+                                                                Index count) {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+    static constexpr int OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1;
+    static constexpr int OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1;
+    using reverse_packet = reverse_packet_cond<PacketType, ReversePacket>;
+
+    Index actualRow = ReverseRow ? m_rows.value() - row - OffsetRow : row;
+    Index actualCol = ReverseCol ? m_cols.value() - col - OffsetCol : col;
+    Index actualBegin = ReversePacket ? (PacketSize - count - begin) : begin;
+
+    m_argImpl.template writePacketSegment<LoadMode>(actualRow, actualCol, reverse_packet::run(x), actualBegin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacketSegment(Index index, const PacketType& x, Index begin,
+                                                                Index count) {
+    static constexpr int PacketSize = unpacket_traits<PacketType>::size;
+
+    Index actualIndex = m_rows.value() * m_cols.value() - index - PacketSize;
+    Index actualBegin = PacketSize - count - begin;
+
+    m_argImpl.template writePacketSegment<LoadMode>(actualIndex, preverse(x), actualBegin, count);
+  }
+
+ protected:
+  evaluator<ArgType> m_argImpl;
+
+  // If we do not reverse rows, then we do not need to know the number of rows; same for columns
+  // Nonetheless, in this case it is important to set to 1 such that the coeff(index) method works fine for vectors.
+  const variable_if_dynamic<Index, ReverseRow ? ArgType::RowsAtCompileTime : 1> m_rows;
+  const variable_if_dynamic<Index, ReverseCol ? ArgType::ColsAtCompileTime : 1> m_cols;
+};
+
+// -------------------- Diagonal --------------------
+
+template <typename ArgType, int DiagIndex>
+struct evaluator<Diagonal<ArgType, DiagIndex>> : evaluator_base<Diagonal<ArgType, DiagIndex>> {
+  typedef Diagonal<ArgType, DiagIndex> XprType;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+
+    Flags =
+        (unsigned int)(evaluator<ArgType>::Flags & (HereditaryBits | DirectAccessBit) & ~RowMajorBit) | LinearAccessBit,
+
+    Alignment = 0
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& diagonal)
+      : m_argImpl(diagonal.nestedExpression()), m_index(diagonal.index()) {}
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index) const {
+    return m_argImpl.coeff(row + rowOffset(), row + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return m_argImpl.coeff(index + rowOffset(), index + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index) {
+    return m_argImpl.coeffRef(row + rowOffset(), row + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    return m_argImpl.coeffRef(index + rowOffset(), index + colOffset());
+  }
+
+ protected:
+  evaluator<ArgType> m_argImpl;
+  const variable_if_dynamicindex<Index, XprType::DiagIndex> m_index;
+
+ private:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rowOffset() const {
+    return m_index.value() > 0 ? 0 : -m_index.value();
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index colOffset() const {
+    return m_index.value() > 0 ? m_index.value() : 0;
+  }
+};
+
+//----------------------------------------------------------------------
+// deprecated code
+//----------------------------------------------------------------------
+
+// -------------------- EvalToTemp --------------------
+
+// expression class for evaluating nested expression to a temporary
+
+template <typename ArgType>
+class EvalToTemp;
+
+template <typename ArgType>
+struct traits<EvalToTemp<ArgType>> : public traits<ArgType> {};
+
+template <typename ArgType>
+class EvalToTemp : public dense_xpr_base<EvalToTemp<ArgType>>::type {
+ public:
+  typedef typename dense_xpr_base<EvalToTemp>::type Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(EvalToTemp)
+
+  explicit EvalToTemp(const ArgType& arg) : m_arg(arg) {}
+
+  const ArgType& arg() const { return m_arg; }
+
+  constexpr Index rows() const noexcept { return m_arg.rows(); }
+
+  constexpr Index cols() const noexcept { return m_arg.cols(); }
+
+ private:
+  const ArgType& m_arg;
+};
+
+template <typename ArgType>
+struct evaluator<EvalToTemp<ArgType>> : public evaluator<typename ArgType::PlainObject> {
+  typedef EvalToTemp<ArgType> XprType;
+  typedef typename ArgType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : m_result(xpr.arg()) {
+    internal::construct_at<Base>(this, m_result);
+  }
+
+  // This constructor is used when nesting an EvalTo evaluator in another evaluator
+  EIGEN_DEVICE_FUNC evaluator(const ArgType& arg) : m_result(arg) { internal::construct_at<Base>(this, m_result); }
+
+ protected:
+  PlainObject m_result;
+};
+
+}  // namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_COREEVALUATORS_H
diff --git a/inst/include/Eigen/src/Core/CoreIterators.h b/inst/include/Eigen/src/Core/CoreIterators.h
index 6da4683d..f62cf238 100644
--- a/inst/include/Eigen/src/Core/CoreIterators.h
+++ b/inst/include/Eigen/src/Core/CoreIterators.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,52 +10,132 @@
 #ifndef EIGEN_COREITERATORS_H
 #define EIGEN_COREITERATORS_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /* This file contains the respective InnerIterator definition of the expressions defined in Eigen/Core
  */
 
-/** \ingroup SparseCore_Module
-  * \class InnerIterator
-  * \brief An InnerIterator allows to loop over the element of a sparse (or dense) matrix or expression
-  *
-  * todo
-  */
-
-// generic version for dense matrix and expressions
-template<typename Derived> class DenseBase<Derived>::InnerIterator
-{
-  protected:
-    typedef typename Derived::Scalar Scalar;
-    typedef typename Derived::Index Index;
-
-    enum { IsRowMajor = (Derived::Flags&RowMajorBit)==RowMajorBit };
-  public:
-    EIGEN_STRONG_INLINE InnerIterator(const Derived& expr, Index outer)
-      : m_expression(expr), m_inner(0), m_outer(outer), m_end(expr.innerSize())
-    {}
-
-    EIGEN_STRONG_INLINE Scalar value() const
-    {
-      return (IsRowMajor) ? m_expression.coeff(m_outer, m_inner)
-                          : m_expression.coeff(m_inner, m_outer);
-    }
-
-    EIGEN_STRONG_INLINE InnerIterator& operator++() { m_inner++; return *this; }
-
-    EIGEN_STRONG_INLINE Index index() const { return m_inner; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-    EIGEN_STRONG_INLINE operator bool() const { return m_inner < m_end && m_inner>=0; }
-
-  protected:
-    const Derived& m_expression;
-    Index m_inner;
-    const Index m_outer;
-    const Index m_end;
+namespace internal {
+
+template <typename XprType, typename EvaluatorKind>
+class inner_iterator_selector;
+
+}
+
+/** \class InnerIterator
+ * \brief An InnerIterator allows to loop over the element of any matrix expression.
+ *
+ * \warning To be used with care because an evaluator is constructed every time an InnerIterator iterator is
+ * constructed.
+ *
+ * TODO: add a usage example
+ */
+template <typename XprType>
+class InnerIterator {
+ protected:
+  typedef internal::inner_iterator_selector<XprType, typename internal::evaluator_traits<XprType>::Kind> IteratorType;
+  typedef internal::evaluator<XprType> EvaluatorType;
+  typedef typename internal::traits<XprType>::Scalar Scalar;
+
+ public:
+  /** Construct an iterator over the \a outerId -th row or column of \a xpr */
+  InnerIterator(const XprType &xpr, const Index &outerId) : m_eval(xpr), m_iter(m_eval, outerId, xpr.innerSize()) {}
+
+  /// \returns the value of the current coefficient.
+  EIGEN_STRONG_INLINE Scalar value() const { return m_iter.value(); }
+  /** Increment the iterator \c *this to the next non-zero coefficient.
+   * Explicit zeros are not skipped over. To skip explicit zeros, see class SparseView
+   */
+  EIGEN_STRONG_INLINE InnerIterator &operator++() {
+    m_iter.operator++();
+    return *this;
+  }
+  EIGEN_STRONG_INLINE InnerIterator &operator+=(Index i) {
+    m_iter.operator+=(i);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE InnerIterator operator+(Index i) {
+    InnerIterator result(*this);
+    result += i;
+    return result;
+  }
+
+  /// \returns the column or row index of the current coefficient.
+  EIGEN_STRONG_INLINE Index index() const { return m_iter.index(); }
+  /// \returns the row index of the current coefficient.
+  EIGEN_STRONG_INLINE Index row() const { return m_iter.row(); }
+  /// \returns the column index of the current coefficient.
+  EIGEN_STRONG_INLINE Index col() const { return m_iter.col(); }
+  /// \returns \c true if the iterator \c *this still references a valid coefficient.
+  EIGEN_STRONG_INLINE operator bool() const { return m_iter; }
+
+ protected:
+  EvaluatorType m_eval;
+  IteratorType m_iter;
+
+ private:
+  // If you get here, then you're not using the right InnerIterator type, e.g.:
+  //   SparseMatrix<double,RowMajor> A;
+  //   SparseMatrix<double>::InnerIterator it(A,0);
+  template <typename T>
+  InnerIterator(const EigenBase<T> &, Index outer);
 };
 
-} // end namespace Eigen
+namespace internal {
+
+// Generic inner iterator implementation for dense objects
+template <typename XprType>
+class inner_iterator_selector<XprType, IndexBased> {
+ protected:
+  typedef evaluator<XprType> EvaluatorType;
+  typedef typename traits<XprType>::Scalar Scalar;
+  enum { IsRowMajor = (XprType::Flags & RowMajorBit) == RowMajorBit };
+
+ public:
+  EIGEN_STRONG_INLINE inner_iterator_selector(const EvaluatorType &eval, const Index &outerId, const Index &innerSize)
+      : m_eval(eval), m_inner(0), m_outer(outerId), m_end(innerSize) {}
+
+  EIGEN_STRONG_INLINE Scalar value() const {
+    return (IsRowMajor) ? m_eval.coeff(m_outer, m_inner) : m_eval.coeff(m_inner, m_outer);
+  }
+
+  EIGEN_STRONG_INLINE inner_iterator_selector &operator++() {
+    m_inner++;
+    return *this;
+  }
+
+  EIGEN_STRONG_INLINE Index index() const { return m_inner; }
+  inline Index row() const { return IsRowMajor ? m_outer : index(); }
+  inline Index col() const { return IsRowMajor ? index() : m_outer; }
+
+  EIGEN_STRONG_INLINE operator bool() const { return m_inner < m_end && m_inner >= 0; }
+
+ protected:
+  const EvaluatorType &m_eval;
+  Index m_inner;
+  const Index m_outer;
+  const Index m_end;
+};
+
+// For iterator-based evaluator, inner-iterator is already implemented as
+// evaluator<>::InnerIterator
+template <typename XprType>
+class inner_iterator_selector<XprType, IteratorBased> : public evaluator<XprType>::InnerIterator {
+ protected:
+  typedef typename evaluator<XprType>::InnerIterator Base;
+  typedef evaluator<XprType> EvaluatorType;
+
+ public:
+  EIGEN_STRONG_INLINE inner_iterator_selector(const EvaluatorType &eval, const Index &outerId,
+                                              const Index & /*innerSize*/)
+      : Base(eval, outerId) {}
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
 
-#endif // EIGEN_COREITERATORS_H
+#endif  // EIGEN_COREITERATORS_H
diff --git a/inst/include/Eigen/src/Core/CwiseBinaryOp.h b/inst/include/Eigen/src/Core/CwiseBinaryOp.h
index 519a866e..e2b2da5a 100644
--- a/inst/include/Eigen/src/Core/CwiseBinaryOp.h
+++ b/inst/include/Eigen/src/Core/CwiseBinaryOp.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,35 +11,17 @@
 #ifndef EIGEN_CWISE_BINARY_OP_H
 #define EIGEN_CWISE_BINARY_OP_H
 
-namespace Eigen {
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-/** \class CwiseBinaryOp
-  * \ingroup Core_Module
-  *
-  * \brief Generic expression where a coefficient-wise binary operator is applied to two expressions
-  *
-  * \param BinaryOp template functor implementing the operator
-  * \param Lhs the type of the left-hand side
-  * \param Rhs the type of the right-hand side
-  *
-  * This class represents an expression  where a coefficient-wise binary operator is applied to two expressions.
-  * It is the return type of binary operators, by which we mean only those binary operators where
-  * both the left-hand side and the right-hand side are Eigen expressions.
-  * For example, the return type of matrix1+matrix2 is a CwiseBinaryOp.
-  *
-  * Most of the time, this is the only way that it is used, so you typically don't have to name
-  * CwiseBinaryOp types explicitly.
-  *
-  * \sa MatrixBase::binaryExpr(const MatrixBase<OtherDerived> &,const CustomBinaryOp &) const, class CwiseUnaryOp, class CwiseNullaryOp
-  */
+namespace Eigen {
 
 namespace internal {
-template<typename BinaryOp, typename Lhs, typename Rhs>
-struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
-{
+template <typename BinaryOp, typename Lhs, typename Rhs>
+struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs>> {
   // we must not inherit from traits<Lhs> since it has
   // the potential to cause problems with MSVC
-  typedef typename remove_all<Lhs>::type Ancestor;
+  typedef remove_all_t<Lhs> Ancestor;
   typedef typename traits<Ancestor>::XprKind XprKind;
   enum {
     RowsAtCompileTime = traits<Ancestor>::RowsAtCompileTime,
@@ -50,181 +32,135 @@ struct traits<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
 
   // even though we require Lhs and Rhs to have the same scalar type (see CwiseBinaryOp constructor),
   // we still want to handle the case when the result type is different.
-  typedef typename result_of<
-                     BinaryOp(
-                       typename Lhs::Scalar,
-                       typename Rhs::Scalar
-                     )
-                   >::type Scalar;
-  typedef typename promote_storage_type<typename traits<Lhs>::StorageKind,
-                                           typename traits<Rhs>::StorageKind>::ret StorageKind;
-  typedef typename promote_index_type<typename traits<Lhs>::Index,
-                                         typename traits<Rhs>::Index>::type Index;
+  typedef typename result_of<BinaryOp(const typename Lhs::Scalar&, const typename Rhs::Scalar&)>::type Scalar;
+  typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind, typename traits<Rhs>::StorageKind,
+                                              BinaryOp>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<Lhs>::StorageIndex, typename traits<Rhs>::StorageIndex>::type
+      StorageIndex;
   typedef typename Lhs::Nested LhsNested;
   typedef typename Rhs::Nested RhsNested;
-  typedef typename remove_reference<LhsNested>::type _LhsNested;
-  typedef typename remove_reference<RhsNested>::type _RhsNested;
+  typedef std::remove_reference_t<LhsNested> LhsNested_;
+  typedef std::remove_reference_t<RhsNested> RhsNested_;
   enum {
-    LhsCoeffReadCost = _LhsNested::CoeffReadCost,
-    RhsCoeffReadCost = _RhsNested::CoeffReadCost,
-    LhsFlags = _LhsNested::Flags,
-    RhsFlags = _RhsNested::Flags,
-    SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value,
-    StorageOrdersAgree = (int(Lhs::Flags)&RowMajorBit)==(int(Rhs::Flags)&RowMajorBit),
-    Flags0 = (int(LhsFlags) | int(RhsFlags)) & (
-        HereditaryBits
-      | (int(LhsFlags) & int(RhsFlags) &
-           ( AlignedBit
-           | (StorageOrdersAgree ? LinearAccessBit : 0)
-           | (functor_traits<BinaryOp>::PacketAccess && StorageOrdersAgree && SameType ? PacketAccessBit : 0)
-           )
-        )
-     ),
-    Flags = (Flags0 & ~RowMajorBit) | (LhsFlags & RowMajorBit),
-    Cost0 = EIGEN_ADD_COST(LhsCoeffReadCost,RhsCoeffReadCost),
-    CoeffReadCost = EIGEN_ADD_COST(Cost0,functor_traits<BinaryOp>::Cost)
+    Flags = cwise_promote_storage_order<typename traits<Lhs>::StorageKind, typename traits<Rhs>::StorageKind,
+                                        LhsNested_::Flags & RowMajorBit, RhsNested_::Flags & RowMajorBit>::value
   };
 };
-} // end namespace internal
-
-// we require Lhs and Rhs to have the same scalar type. Currently there is no example of a binary functor
-// that would take two operands of different types. If there were such an example, then this check should be
-// moved to the BinaryOp functors, on a per-case basis. This would however require a change in the BinaryOp functors, as
-// currently they take only one typename Scalar template parameter.
-// It is tempting to always allow mixing different types but remember that this is often impossible in the vectorized paths.
-// So allowing mixing different types gives very unexpected errors when enabling vectorization, when the user tries to
-// add together a float matrix and a double matrix.
-#define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP,LHS,RHS) \
-  EIGEN_STATIC_ASSERT((internal::functor_is_product_like<BINOP>::ret \
-                        ? int(internal::scalar_product_traits<LHS, RHS>::Defined) \
-                        : int(internal::is_same<LHS, RHS>::value)), \
-    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-
-template<typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind>
+}  // end namespace internal
+
+template <typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind>
 class CwiseBinaryOpImpl;
 
-template<typename BinaryOp, typename Lhs, typename Rhs>
-class CwiseBinaryOp : internal::no_assignment_operator,
-  public CwiseBinaryOpImpl<
-          BinaryOp, Lhs, Rhs,
-          typename internal::promote_storage_type<typename internal::traits<Lhs>::StorageKind,
-                                           typename internal::traits<Rhs>::StorageKind>::ret>
-{
-  public:
-
-    typedef typename CwiseBinaryOpImpl<
-        BinaryOp, Lhs, Rhs,
-        typename internal::promote_storage_type<typename internal::traits<Lhs>::StorageKind,
-                                         typename internal::traits<Rhs>::StorageKind>::ret>::Base Base;
-    EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseBinaryOp)
-
-    typedef typename internal::nested<Lhs>::type LhsNested;
-    typedef typename internal::nested<Rhs>::type RhsNested;
-    typedef typename internal::remove_reference<LhsNested>::type _LhsNested;
-    typedef typename internal::remove_reference<RhsNested>::type _RhsNested;
-
-    EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs, const BinaryOp& func = BinaryOp())
-      : m_lhs(aLhs), m_rhs(aRhs), m_functor(func)
-    {
-      EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename Rhs::Scalar);
-      // require the sizes to match
-      EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs, Rhs)
-      eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const {
-      // return the fixed size type if available to enable compile time optimizations
-      if (internal::traits<typename internal::remove_all<LhsNested>::type>::RowsAtCompileTime==Dynamic)
-        return m_rhs.rows();
-      else
-        return m_lhs.rows();
-    }
-    EIGEN_STRONG_INLINE Index cols() const {
-      // return the fixed size type if available to enable compile time optimizations
-      if (internal::traits<typename internal::remove_all<LhsNested>::type>::ColsAtCompileTime==Dynamic)
-        return m_rhs.cols();
-      else
-        return m_lhs.cols();
-    }
-
-    /** \returns the left hand side nested expression */
-    const _LhsNested& lhs() const { return m_lhs; }
-    /** \returns the right hand side nested expression */
-    const _RhsNested& rhs() const { return m_rhs; }
-    /** \returns the functor representing the binary operation */
-    const BinaryOp& functor() const { return m_functor; }
-
-  protected:
-    LhsNested m_lhs;
-    RhsNested m_rhs;
-    const BinaryOp m_functor;
+/** \class CwiseBinaryOp
+ * \ingroup Core_Module
+ *
+ * \brief Generic expression where a coefficient-wise binary operator is applied to two expressions
+ *
+ * \tparam BinaryOp template functor implementing the operator
+ * \tparam LhsType the type of the left-hand side
+ * \tparam RhsType the type of the right-hand side
+ *
+ * This class represents an expression  where a coefficient-wise binary operator is applied to two expressions.
+ * It is the return type of binary operators, by which we mean only those binary operators where
+ * both the left-hand side and the right-hand side are Eigen expressions.
+ * For example, the return type of matrix1+matrix2 is a CwiseBinaryOp.
+ *
+ * Most of the time, this is the only way that it is used, so you typically don't have to name
+ * CwiseBinaryOp types explicitly.
+ *
+ * \sa MatrixBase::binaryExpr(const MatrixBase<OtherDerived> &,const CustomBinaryOp &) const, class CwiseUnaryOp, class
+ * CwiseNullaryOp
+ */
+template <typename BinaryOp, typename LhsType, typename RhsType>
+class CwiseBinaryOp : public CwiseBinaryOpImpl<BinaryOp, LhsType, RhsType,
+                                               typename internal::cwise_promote_storage_type<
+                                                   typename internal::traits<LhsType>::StorageKind,
+                                                   typename internal::traits<RhsType>::StorageKind, BinaryOp>::ret>,
+                      internal::no_assignment_operator {
+ public:
+  typedef internal::remove_all_t<BinaryOp> Functor;
+  typedef internal::remove_all_t<LhsType> Lhs;
+  typedef internal::remove_all_t<RhsType> Rhs;
+
+  typedef typename CwiseBinaryOpImpl<
+      BinaryOp, LhsType, RhsType,
+      typename internal::cwise_promote_storage_type<typename internal::traits<LhsType>::StorageKind,
+                                                    typename internal::traits<Rhs>::StorageKind, BinaryOp>::ret>::Base
+      Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseBinaryOp)
+
+  EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp, typename Lhs::Scalar, typename Rhs::Scalar)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs, Rhs)
+
+  typedef typename internal::ref_selector<LhsType>::type LhsNested;
+  typedef typename internal::ref_selector<RhsType>::type RhsNested;
+  typedef std::remove_reference_t<LhsNested> LhsNested_;
+  typedef std::remove_reference_t<RhsNested> RhsNested_;
+
+#if EIGEN_COMP_MSVC
+  // Required for Visual Studio or the Copy constructor will probably not get inlined!
+  EIGEN_STRONG_INLINE CwiseBinaryOp(const CwiseBinaryOp<BinaryOp, LhsType, RhsType>&) = default;
+#endif
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CwiseBinaryOp(const Lhs& aLhs, const Rhs& aRhs,
+                                                      const BinaryOp& func = BinaryOp())
+      : m_lhs(aLhs), m_rhs(aRhs), m_functor(func) {
+    eigen_assert(aLhs.rows() == aRhs.rows() && aLhs.cols() == aRhs.cols());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept {
+    // return the fixed size type if available to enable compile time optimizations
+    return internal::traits<internal::remove_all_t<LhsNested>>::RowsAtCompileTime == Dynamic ? m_rhs.rows()
+                                                                                             : m_lhs.rows();
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept {
+    // return the fixed size type if available to enable compile time optimizations
+    return internal::traits<internal::remove_all_t<LhsNested>>::ColsAtCompileTime == Dynamic ? m_rhs.cols()
+                                                                                             : m_lhs.cols();
+  }
+
+  /** \returns the left hand side nested expression */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const LhsNested_& lhs() const { return m_lhs; }
+  /** \returns the right hand side nested expression */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const RhsNested_& rhs() const { return m_rhs; }
+  /** \returns the functor representing the binary operation */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const BinaryOp& functor() const { return m_functor; }
+
+ protected:
+  LhsNested m_lhs;
+  RhsNested m_rhs;
+  const BinaryOp m_functor;
 };
 
-template<typename BinaryOp, typename Lhs, typename Rhs>
-class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Dense>
-  : public internal::dense_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
-{
-    typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> Derived;
-  public:
-
-    typedef typename internal::dense_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE( Derived )
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
-    {
-      return derived().functor()(derived().lhs().coeff(rowId, colId),
-                                 derived().rhs().coeff(rowId, colId));
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
-    {
-      return derived().functor().packetOp(derived().lhs().template packet<LoadMode>(rowId, colId),
-                                          derived().rhs().template packet<LoadMode>(rowId, colId));
-    }
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
-    {
-      return derived().functor()(derived().lhs().coeff(index),
-                                 derived().rhs().coeff(index));
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
-    {
-      return derived().functor().packetOp(derived().lhs().template packet<LoadMode>(index),
-                                          derived().rhs().template packet<LoadMode>(index));
-    }
+// Generic API dispatcher
+template <typename BinaryOp, typename Lhs, typename Rhs, typename StorageKind>
+class CwiseBinaryOpImpl : public internal::generic_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs>>::type {
+ public:
+  typedef typename internal::generic_xpr_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs>>::type Base;
 };
 
 /** replaces \c *this by \c *this - \a other.
-  *
-  * \returns a reference to \c *this
-  */
-template<typename Derived>
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
-MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived> &other)
-{
-  SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+ *
+ * \returns a reference to \c *this
+ */
+template <typename Derived>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator-=(const MatrixBase<OtherDerived>& other) {
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar, typename OtherDerived::Scalar>());
   return derived();
 }
 
 /** replaces \c *this by \c *this + \a other.
-  *
-  * \returns a reference to \c *this
-  */
-template<typename Derived>
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
-MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other)
-{
-  SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, Derived, OtherDerived> tmp(derived());
-  tmp = other.derived();
+ *
+ * \returns a reference to \c *this
+ */
+template <typename Derived>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::operator+=(const MatrixBase<OtherDerived>& other) {
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar, typename OtherDerived::Scalar>());
   return derived();
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_CWISE_BINARY_OP_H
+#endif  // EIGEN_CWISE_BINARY_OP_H
diff --git a/inst/include/Eigen/src/Core/CwiseNullaryOp.h b/inst/include/Eigen/src/Core/CwiseNullaryOp.h
index a93bab2d..084f503f 100644
--- a/inst/include/Eigen/src/Core/CwiseNullaryOp.h
+++ b/inst/include/Eigen/src/Core/CwiseNullaryOp.h
@@ -10,15 +10,26 @@
 #ifndef EIGEN_CWISE_NULLARY_OP_H
 #define EIGEN_CWISE_NULLARY_OP_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
+namespace internal {
+template <typename NullaryOp, typename PlainObjectType>
+struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectType> {
+  enum { Flags = traits<PlainObjectType>::Flags & RowMajorBit };
+};
+
+}  // namespace internal
+
 /** \class CwiseNullaryOp
   * \ingroup Core_Module
   *
   * \brief Generic expression of a matrix where all coefficients are defined by a functor
   *
-  * \param NullaryOp template functor implementing the operator
-  * \param PlainObjectType the underlying plain matrix/array type
+  * \tparam NullaryOp template functor implementing the operator
+  * \tparam PlainObjectType the underlying plain matrix/array type
   *
   * This class represents an expression of a generic nullary operator.
   * It is the return type of the Ones(), Zero(), Constant(), Identity() and Random() methods,
@@ -27,708 +38,773 @@ namespace Eigen {
   * However, if you want to write a function returning such an expression, you
   * will need to use this class.
   *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr()
-  */
-
-namespace internal {
-template<typename NullaryOp, typename PlainObjectType>
-struct traits<CwiseNullaryOp<NullaryOp, PlainObjectType> > : traits<PlainObjectType>
-{
-  enum {
-    Flags = (traits<PlainObjectType>::Flags
-      & (  HereditaryBits
-         | (functor_has_linear_access<NullaryOp>::ret ? LinearAccessBit : 0)
-         | (functor_traits<NullaryOp>::PacketAccess ? PacketAccessBit : 0)))
-      | (functor_traits<NullaryOp>::IsRepeatable ? 0 : EvalBeforeNestingBit),
-    CoeffReadCost = functor_traits<NullaryOp>::Cost
-  };
-};
-}
-
-template<typename NullaryOp, typename PlainObjectType>
-class CwiseNullaryOp : internal::no_assignment_operator,
-  public internal::dense_xpr_base< CwiseNullaryOp<NullaryOp, PlainObjectType> >::type
-{
-  public:
-
-    typedef typename internal::dense_xpr_base<CwiseNullaryOp>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(CwiseNullaryOp)
-
-    CwiseNullaryOp(Index nbRows, Index nbCols, const NullaryOp& func = NullaryOp())
-      : m_rows(nbRows), m_cols(nbCols), m_functor(func)
-    {
-      eigen_assert(nbRows >= 0
-            && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == nbRows)
-            &&  nbCols >= 0
-            && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == nbCols));
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return m_rows.value(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_cols.value(); }
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
-    {
-      return m_functor(rowId, colId);
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
-    {
-      return m_functor.packetOp(rowId, colId);
-    }
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
-    {
-      return m_functor(index);
-    }
+  * The functor NullaryOp must expose one of the following method:
+    <table class="manual">
+    <tr            ><td>\c operator()() </td><td>if the procedural generation does not depend on the coefficient entries
+  (e.g., random numbers)</td></tr> <tr class="alt"><td>\c operator()(Index i)</td><td>if the procedural generation makes
+  sense for vectors only and that it depends on the coefficient index \c i (e.g., linspace) </td></tr> <tr ><td>\c
+  operator()(Index i,Index j)</td><td>if the procedural generation depends on the matrix coordinates \c i, \c j (e.g.,
+  to generate a checkerboard with 0 and 1)</td></tr>
+    </table>
+  * It is also possible to expose the last two operators if the generation makes sense for matrices but can be optimized
+  for vectors.
+  *
+  * See DenseBase::NullaryExpr(Index,const CustomNullaryOp&) for an example binding
+  * C++11 random number generators.
+  *
+  * A nullary expression can also be used to implement custom sophisticated matrix manipulations
+  * that cannot be covered by the existing set of natively supported matrix manipulations.
+  * See this \ref TopicCustomizing_NullaryExpr "page" for some examples and additional explanations
+  * on the behavior of CwiseNullaryOp.
+  *
+  * \sa class CwiseUnaryOp, class CwiseBinaryOp, DenseBase::NullaryExpr
+  */
+template <typename NullaryOp, typename PlainObjectType>
+class CwiseNullaryOp : public internal::dense_xpr_base<CwiseNullaryOp<NullaryOp, PlainObjectType> >::type,
+                       internal::no_assignment_operator {
+ public:
+  typedef typename internal::dense_xpr_base<CwiseNullaryOp>::type Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(CwiseNullaryOp)
+
+  EIGEN_DEVICE_FUNC CwiseNullaryOp(Index rows, Index cols, const NullaryOp& func = NullaryOp())
+      : m_rows(rows), m_cols(cols), m_functor(func) {
+    eigen_assert(rows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows) && cols >= 0 &&
+                 (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols));
+  }
+  EIGEN_DEVICE_FUNC CwiseNullaryOp(Index size, const NullaryOp& func = NullaryOp())
+      : CwiseNullaryOp(RowsAtCompileTime == 1 ? 1 : size, RowsAtCompileTime == 1 ? size : 1, func) {
+    EIGEN_STATIC_ASSERT(CwiseNullaryOp::IsVectorAtCompileTime, YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX);
+  }
 
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
-    {
-      return m_functor.packetOp(index);
-    }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows.value(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols.value(); }
 
-    /** \returns the functor representing the nullary operation */
-    const NullaryOp& functor() const { return m_functor; }
+  /** \returns the functor representing the nullary operation */
+  EIGEN_DEVICE_FUNC const NullaryOp& functor() const { return m_functor; }
 
-  protected:
-    const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_rows;
-    const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_cols;
-    const NullaryOp m_functor;
+ protected:
+  const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_rows;
+  const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_cols;
+  const NullaryOp m_functor;
 };
 
-
 /** \returns an expression of a matrix defined by a custom functor \a func
-  *
-  * The parameters \a rows and \a cols are the number of rows and of columns of
-  * the returned matrix. Must be compatible with this MatrixBase type.
-  *
-  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
-  * it is redundant to pass \a rows and \a cols as arguments, so Zero() should be used
-  * instead.
-  *
-  * The template parameter \a CustomNullaryOp is the type of the functor.
-  *
-  * \sa class CwiseNullaryOp
-  */
-template<typename Derived>
-template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
-DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func)
-{
-  return CwiseNullaryOp<CustomNullaryOp, Derived>(rows, cols, func);
+ *
+ * The parameters \a rows and \a cols are the number of rows and of columns of
+ * the returned matrix. Must be compatible with this MatrixBase type.
+ *
+ * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+ * it is redundant to pass \a rows and \a cols as arguments, so NullaryExpr(const CustomNullaryOp&) should be used
+ * instead.
+ *
+ * The template parameter \a CustomNullaryOp is the type of the functor.
+ *
+ * \sa class CwiseNullaryOp
+ */
+template <typename Derived>
+template <typename CustomNullaryOp>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+#else
+    const CwiseNullaryOp<CustomNullaryOp, PlainObject>
+#endif
+    DenseBase<Derived>::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func) {
+  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(rows, cols, func);
 }
 
 /** \returns an expression of a matrix defined by a custom functor \a func
-  *
-  * The parameter \a size is the size of the returned vector.
-  * Must be compatible with this MatrixBase type.
-  *
-  * \only_for_vectors
-  *
-  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
-  * it is redundant to pass \a size as argument, so Zero() should be used
-  * instead.
-  *
-  * The template parameter \a CustomNullaryOp is the type of the functor.
-  *
-  * \sa class CwiseNullaryOp
-  */
-template<typename Derived>
-template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
-DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func)
-{
+ *
+ * The parameter \a size is the size of the returned vector.
+ * Must be compatible with this MatrixBase type.
+ *
+ * \only_for_vectors
+ *
+ * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
+ * it is redundant to pass \a size as argument, so NullaryExpr(const CustomNullaryOp&) should be used
+ * instead.
+ *
+ * The template parameter \a CustomNullaryOp is the type of the functor.
+ *
+ * Here is an example with C++11 random generators: \include random_cpp11.cpp
+ * Output: \verbinclude random_cpp11.out
+ *
+ * \sa class CwiseNullaryOp
+ */
+template <typename Derived>
+template <typename CustomNullaryOp>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+#else
+    const CwiseNullaryOp<CustomNullaryOp, PlainObject>
+#endif
+    DenseBase<Derived>::NullaryExpr(Index size, const CustomNullaryOp& func) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  if(RowsAtCompileTime == 1) return CwiseNullaryOp<CustomNullaryOp, Derived>(1, size, func);
-  else return CwiseNullaryOp<CustomNullaryOp, Derived>(size, 1, func);
+  if (RowsAtCompileTime == 1)
+    return CwiseNullaryOp<CustomNullaryOp, PlainObject>(1, size, func);
+  else
+    return CwiseNullaryOp<CustomNullaryOp, PlainObject>(size, 1, func);
 }
 
 /** \returns an expression of a matrix defined by a custom functor \a func
-  *
-  * This variant is only for fixed-size DenseBase types. For dynamic-size types, you
-  * need to use the variants taking size arguments.
-  *
-  * The template parameter \a CustomNullaryOp is the type of the functor.
-  *
-  * \sa class CwiseNullaryOp
-  */
-template<typename Derived>
-template<typename CustomNullaryOp>
-EIGEN_STRONG_INLINE const CwiseNullaryOp<CustomNullaryOp, Derived>
-DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func)
-{
-  return CwiseNullaryOp<CustomNullaryOp, Derived>(RowsAtCompileTime, ColsAtCompileTime, func);
+ *
+ * This variant is only for fixed-size DenseBase types. For dynamic-size types, you
+ * need to use the variants taking size arguments.
+ *
+ * The template parameter \a CustomNullaryOp is the type of the functor.
+ *
+ * \sa class CwiseNullaryOp
+ */
+template <typename Derived>
+template <typename CustomNullaryOp>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    const CwiseNullaryOp<CustomNullaryOp, typename DenseBase<Derived>::PlainObject>
+#else
+    const CwiseNullaryOp<CustomNullaryOp, PlainObject>
+#endif
+    DenseBase<Derived>::NullaryExpr(const CustomNullaryOp& func) {
+  return CwiseNullaryOp<CustomNullaryOp, PlainObject>(RowsAtCompileTime, ColsAtCompileTime, func);
 }
 
 /** \returns an expression of a constant matrix of value \a value
-  *
-  * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
-  * the returned matrix. Must be compatible with this DenseBase type.
-  *
-  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
-  * it is redundant to pass \a nbRows and \a nbCols as arguments, so Zero() should be used
-  * instead.
-  *
-  * The template parameter \a CustomNullaryOp is the type of the functor.
-  *
-  * \sa class CwiseNullaryOp
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Constant(Index nbRows, Index nbCols, const Scalar& value)
-{
-  return DenseBase<Derived>::NullaryExpr(nbRows, nbCols, internal::scalar_constant_op<Scalar>(value));
+ *
+ * The parameters \a rows and \a cols are the number of rows and of columns of
+ * the returned matrix. Must be compatible with this DenseBase type.
+ *
+ * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+ * it is redundant to pass \a rows and \a cols as arguments, so Constant(const Scalar&) should be used
+ * instead.
+ *
+ * The template parameter \a CustomNullaryOp is the type of the functor.
+ *
+ * \sa class CwiseNullaryOp
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Constant(Index rows, Index cols, const Scalar& value) {
+  return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_constant_op<Scalar>(value));
 }
 
 /** \returns an expression of a constant matrix of value \a value
-  *
-  * The parameter \a size is the size of the returned vector.
-  * Must be compatible with this DenseBase type.
-  *
-  * \only_for_vectors
-  *
-  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
-  * it is redundant to pass \a size as argument, so Zero() should be used
-  * instead.
-  *
-  * The template parameter \a CustomNullaryOp is the type of the functor.
-  *
-  * \sa class CwiseNullaryOp
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Constant(Index size, const Scalar& value)
-{
+ *
+ * The parameter \a size is the size of the returned vector.
+ * Must be compatible with this DenseBase type.
+ *
+ * \only_for_vectors
+ *
+ * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
+ * it is redundant to pass \a size as argument, so Constant(const Scalar&) should be used
+ * instead.
+ *
+ * The template parameter \a CustomNullaryOp is the type of the functor.
+ *
+ * \sa class CwiseNullaryOp
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Constant(Index size, const Scalar& value) {
   return DenseBase<Derived>::NullaryExpr(size, internal::scalar_constant_op<Scalar>(value));
 }
 
 /** \returns an expression of a constant matrix of value \a value
-  *
-  * This variant is only for fixed-size DenseBase types. For dynamic-size types, you
-  * need to use the variants taking size arguments.
-  *
-  * The template parameter \a CustomNullaryOp is the type of the functor.
-  *
-  * \sa class CwiseNullaryOp
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Constant(const Scalar& value)
-{
+ *
+ * This variant is only for fixed-size DenseBase types. For dynamic-size types, you
+ * need to use the variants taking size arguments.
+ *
+ * The template parameter \a CustomNullaryOp is the type of the functor.
+ *
+ * \sa class CwiseNullaryOp
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
+DenseBase<Derived>::Constant(const Scalar& value) {
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-  return DenseBase<Derived>::NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_constant_op<Scalar>(value));
+  return DenseBase<Derived>::NullaryExpr(RowsAtCompileTime, ColsAtCompileTime,
+                                         internal::scalar_constant_op<Scalar>(value));
+}
+
+/** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(Index,const Scalar&,const Scalar&)
+ *
+ * \only_for_vectors
+ *
+ * Example: \include DenseBase_LinSpaced_seq_deprecated.cpp
+ * Output: \verbinclude DenseBase_LinSpaced_seq_deprecated.out
+ *
+ * \sa LinSpaced(Index,const Scalar&, const Scalar&), setLinSpaced(Index,const Scalar&,const Scalar&)
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high) {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar>(low, high, size));
+}
+
+/** \deprecated because of accuracy loss. In Eigen 3.3, it is an alias for LinSpaced(const Scalar&,const Scalar&)
+ *
+ * \sa LinSpaced(const Scalar&, const Scalar&)
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high) {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime,
+                                         internal::linspaced_op<Scalar>(low, high, Derived::SizeAtCompileTime));
 }
 
 /**
-  * \brief Sets a linearly space vector.
-  *
-  * The function generates 'size' equally spaced values in the closed interval [low,high].
-  * This particular version of LinSpaced() uses sequential access, i.e. vector access is
-  * assumed to be a(0), a(1), ..., a(size). This assumption allows for better vectorization
-  * and yields faster code than the random access version.
-  *
-  * When size is set to 1, a vector of length 1 containing 'high' is returned.
-  *
-  * \only_for_vectors
-  *
-  * Example: \include DenseBase_LinSpaced_seq.cpp
-  * Output: \verbinclude DenseBase_LinSpaced_seq.out
-  *
-  * \sa setLinSpaced(Index,const Scalar&,const Scalar&), LinSpaced(Index,Scalar,Scalar), CwiseNullaryOp
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::SequentialLinSpacedReturnType
-DenseBase<Derived>::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high)
-{
+ * \brief Sets a linearly spaced vector.
+ *
+ * The function generates 'size' equally spaced values in the closed interval [low,high].
+ * When size is set to 1, a vector of length 1 containing 'high' is returned.
+ *
+ * \only_for_vectors
+ *
+ * Example: \include DenseBase_LinSpaced.cpp
+ * Output: \verbinclude DenseBase_LinSpaced.out
+ *
+ * For integer scalar types, an even spacing is possible if and only if the length of the range,
+ * i.e., \c high-low is a scalar multiple of \c size-1, or if \c size is a scalar multiple of the
+ * number of values \c high-low+1 (meaning each value can be repeated the same number of time).
+ * If one of these two considions is not satisfied, then \c high is lowered to the largest value
+ * satisfying one of this constraint.
+ * Here are some examples:
+ *
+ * Example: \include DenseBase_LinSpacedInt.cpp
+ * Output: \verbinclude DenseBase_LinSpacedInt.out
+ *
+ * \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,false>(low,high,size));
+  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar>(low, high, size));
 }
 
 /**
-  * \copydoc DenseBase::LinSpaced(Sequential_t, Index, const Scalar&, const Scalar&)
-  * Special version for fixed size types which does not require the size parameter.
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::SequentialLinSpacedReturnType
-DenseBase<Derived>::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high)
-{
+ * \copydoc DenseBase::LinSpaced(Index, const DenseBase::Scalar&, const DenseBase::Scalar&)
+ * Special version for fixed size types which does not require the size parameter.
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
+DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,false>(low,high,Derived::SizeAtCompileTime));
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime,
+                                         internal::linspaced_op<Scalar>(low, high, Derived::SizeAtCompileTime));
 }
 
-/**
-  * \brief Sets a linearly space vector.
-  *
-  * The function generates 'size' equally spaced values in the closed interval [low,high].
-  * When size is set to 1, a vector of length 1 containing 'high' is returned.
-  *
-  * \only_for_vectors
-  *
-  * Example: \include DenseBase_LinSpaced.cpp
-  * Output: \verbinclude DenseBase_LinSpaced.out
-  *
-  * \sa setLinSpaced(Index,const Scalar&,const Scalar&), LinSpaced(Sequential_t,Index,const Scalar&,const Scalar&,Index), CwiseNullaryOp
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
-DenseBase<Derived>::LinSpaced(Index size, const Scalar& low, const Scalar& high)
-{
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessEqualSpacedReturnType
+DenseBase<Derived>::EqualSpaced(Index size, const Scalar& low, const Scalar& step) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return DenseBase<Derived>::NullaryExpr(size, internal::linspaced_op<Scalar,true>(low,high,size));
+  return DenseBase<Derived>::NullaryExpr(size, internal::equalspaced_op<Scalar>(low, step));
 }
 
-/**
-  * \copydoc DenseBase::LinSpaced(Index, const Scalar&, const Scalar&)
-  * Special version for fixed size types which does not require the size parameter.
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessLinSpacedReturnType
-DenseBase<Derived>::LinSpaced(const Scalar& low, const Scalar& high)
-{
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::RandomAccessEqualSpacedReturnType
+DenseBase<Derived>::EqualSpaced(const Scalar& low, const Scalar& step) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::linspaced_op<Scalar,true>(low,high,Derived::SizeAtCompileTime));
+  return DenseBase<Derived>::NullaryExpr(Derived::SizeAtCompileTime, internal::equalspaced_op<Scalar>(low, step));
 }
 
 /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */
-template<typename Derived>
-bool DenseBase<Derived>::isApproxToConstant
-(const Scalar& val, const RealScalar& prec) const
-{
-  for(Index j = 0; j < cols(); ++j)
-    for(Index i = 0; i < rows(); ++i)
-      if(!internal::isApprox(this->coeff(i, j), val, prec))
-        return false;
+template <typename Derived>
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApproxToConstant(const Scalar& val, const RealScalar& prec) const {
+  typename internal::nested_eval<Derived, 1>::type self(derived());
+  for (Index j = 0; j < cols(); ++j)
+    for (Index i = 0; i < rows(); ++i)
+      if (!internal::isApprox(self.coeff(i, j), val, prec)) return false;
   return true;
 }
 
 /** This is just an alias for isApproxToConstant().
-  *
-  * \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */
-template<typename Derived>
-bool DenseBase<Derived>::isConstant
-(const Scalar& val, const RealScalar& prec) const
-{
+ *
+ * \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */
+template <typename Derived>
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isConstant(const Scalar& val, const RealScalar& prec) const {
   return isApproxToConstant(val, prec);
 }
 
 /** Alias for setConstant(): sets all coefficients in this expression to \a val.
-  *
-  * \sa setConstant(), Constant(), class CwiseNullaryOp
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val)
-{
+ *
+ * \sa setConstant(), Constant(), class CwiseNullaryOp
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase<Derived>::fill(const Scalar& val) {
   setConstant(val);
 }
 
-/** Sets all coefficients in this expression to \a value.
-  *
-  * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val)
-{
-  return derived() = Constant(rows(), cols(), val);
-}
-
-/** Resizes to the given \a size, and sets all coefficients in this expression to the given \a value.
-  *
-  * \only_for_vectors
-  *
-  * Example: \include Matrix_setConstant_int.cpp
-  * Output: \verbinclude Matrix_setConstant_int.out
-  *
-  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val)
-{
+/** Sets all coefficients in this expression to value \a val.
+ *
+ * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(),
+ * Constant(), class CwiseNullaryOp, setZero(), setOnes()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setConstant(const Scalar& val) {
+  internal::eigen_fill_impl<Derived>::run(derived(), val);
+  return derived();
+}
+
+/** Resizes to the given \a size, and sets all coefficients in this expression to the given value \a val.
+ *
+ * \only_for_vectors
+ *
+ * Example: \include Matrix_setConstant_int.cpp
+ * Output: \verbinclude Matrix_setConstant_int.out
+ *
+ * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp,
+ * MatrixBase::Constant(const Scalar&)
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setConstant(Index size, const Scalar& val) {
   resize(size);
   return setConstant(val);
 }
 
-/** Resizes to the given size, and sets all coefficients in this expression to the given \a value.
-  *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
-  * \param val the value to which all coefficients are set
-  *
-  * Example: \include Matrix_setConstant_int_int.cpp
-  * Output: \verbinclude Matrix_setConstant_int_int.out
-  *
-  * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&)
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setConstant(Index nbRows, Index nbCols, const Scalar& val)
-{
-  resize(nbRows, nbCols);
+/** Resizes to the given size, and sets all coefficients in this expression to the given value \a val.
+ *
+ * \param rows the new number of rows
+ * \param cols the new number of columns
+ * \param val the value to which all coefficients are set
+ *
+ * Example: \include Matrix_setConstant_int_int.cpp
+ * Output: \verbinclude Matrix_setConstant_int_int.out
+ *
+ * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp,
+ * MatrixBase::Constant(const Scalar&)
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setConstant(Index rows, Index cols,
+                                                                                     const Scalar& val) {
+  resize(rows, cols);
   return setConstant(val);
 }
 
+/** Resizes to the given size, changing only the number of columns, and sets all
+ * coefficients in this expression to the given value \a val. For the parameter
+ * of type NoChange_t, just pass the special value \c NoChange.
+ *
+ * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp,
+ * MatrixBase::Constant(const Scalar&)
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setConstant(NoChange_t, Index cols,
+                                                                                     const Scalar& val) {
+  return setConstant(rows(), cols, val);
+}
+
+/** Resizes to the given size, changing only the number of rows, and sets all
+ * coefficients in this expression to the given value \a val. For the parameter
+ * of type NoChange_t, just pass the special value \c NoChange.
+ *
+ * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp,
+ * MatrixBase::Constant(const Scalar&)
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setConstant(Index rows, NoChange_t,
+                                                                                     const Scalar& val) {
+  return setConstant(rows, cols(), val);
+}
+
 /**
-  * \brief Sets a linearly space vector.
-  *
-  * The function generates 'size' equally spaced values in the closed interval [low,high].
-  * When size is set to 1, a vector of length 1 containing 'high' is returned.
-  *
-  * \only_for_vectors
-  *
-  * Example: \include DenseBase_setLinSpaced.cpp
-  * Output: \verbinclude DenseBase_setLinSpaced.out
-  *
-  * \sa CwiseNullaryOp
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high)
-{
+ * \brief Sets a linearly spaced vector.
+ *
+ * The function generates 'size' equally spaced values in the closed interval [low,high].
+ * When size is set to 1, a vector of length 1 containing 'high' is returned.
+ *
+ * \only_for_vectors
+ *
+ * Example: \include DenseBase_setLinSpaced.cpp
+ * Output: \verbinclude DenseBase_setLinSpaced.out
+ *
+ * For integer scalar types, do not miss the explanations on the definition
+ * of \link LinSpaced(Index,const Scalar&,const Scalar&) even spacing \endlink.
+ *
+ * \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(Index newSize, const Scalar& low,
+                                                                                const Scalar& high) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar,false>(low,high,newSize));
+  return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op<Scalar>(low, high, newSize));
 }
 
 /**
-  * \brief Sets a linearly space vector.
-  *
-  * The function fill *this with equally spaced values in the closed interval [low,high].
-  * When size is set to 1, a vector of length 1 containing 'high' is returned.
-  *
-  * \only_for_vectors
-  *
-  * \sa setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high)
-{
+ * \brief Sets a linearly spaced vector.
+ *
+ * The function fills \c *this with equally spaced values in the closed interval [low,high].
+ * When size is set to 1, a vector of length 1 containing 'high' is returned.
+ *
+ * \only_for_vectors
+ *
+ * For integer scalar types, do not miss the explanations on the definition
+ * of \link LinSpaced(Index,const Scalar&,const Scalar&) even spacing \endlink.
+ *
+ * \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setLinSpaced(const Scalar& low, const Scalar& high) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return setLinSpaced(size(), low, high);
 }
 
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setEqualSpaced(Index newSize, const Scalar& low,
+                                                                                  const Scalar& step) {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return derived() = Derived::NullaryExpr(newSize, internal::equalspaced_op<Scalar>(low, step));
+}
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setEqualSpaced(const Scalar& low,
+                                                                                  const Scalar& step) {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return setEqualSpaced(size(), low, step);
+}
+
 // zero:
 
 /** \returns an expression of a zero matrix.
-  *
-  * The parameters \a rows and \a cols are the number of rows and of columns of
-  * the returned matrix. Must be compatible with this MatrixBase type.
-  *
-  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
-  * it is redundant to pass \a rows and \a cols as arguments, so Zero() should be used
-  * instead.
-  *
-  * Example: \include MatrixBase_zero_int_int.cpp
-  * Output: \verbinclude MatrixBase_zero_int_int.out
-  *
-  * \sa Zero(), Zero(Index)
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Zero(Index nbRows, Index nbCols)
-{
-  return Constant(nbRows, nbCols, Scalar(0));
+ *
+ * The parameters \a rows and \a cols are the number of rows and of columns of
+ * the returned matrix. Must be compatible with this MatrixBase type.
+ *
+ * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+ * it is redundant to pass \a rows and \a cols as arguments, so Zero() should be used
+ * instead.
+ *
+ * Example: \include MatrixBase_zero_int_int.cpp
+ * Output: \verbinclude MatrixBase_zero_int_int.out
+ *
+ * \sa Zero(), Zero(Index)
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ZeroReturnType DenseBase<Derived>::Zero(
+    Index rows, Index cols) {
+  return ZeroReturnType(rows, cols);
 }
 
 /** \returns an expression of a zero vector.
-  *
-  * The parameter \a size is the size of the returned vector.
-  * Must be compatible with this MatrixBase type.
-  *
-  * \only_for_vectors
-  *
-  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
-  * it is redundant to pass \a size as argument, so Zero() should be used
-  * instead.
-  *
-  * Example: \include MatrixBase_zero_int.cpp
-  * Output: \verbinclude MatrixBase_zero_int.out
-  *
-  * \sa Zero(), Zero(Index,Index)
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Zero(Index size)
-{
-  return Constant(size, Scalar(0));
+ *
+ * The parameter \a size is the size of the returned vector.
+ * Must be compatible with this MatrixBase type.
+ *
+ * \only_for_vectors
+ *
+ * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
+ * it is redundant to pass \a size as argument, so Zero() should be used
+ * instead.
+ *
+ * Example: \include MatrixBase_zero_int.cpp
+ * Output: \verbinclude MatrixBase_zero_int.out
+ *
+ * \sa Zero(), Zero(Index,Index)
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ZeroReturnType DenseBase<Derived>::Zero(
+    Index size) {
+  return ZeroReturnType(size);
 }
 
 /** \returns an expression of a fixed-size zero matrix or vector.
-  *
-  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
-  * need to use the variants taking size arguments.
-  *
-  * Example: \include MatrixBase_zero.cpp
-  * Output: \verbinclude MatrixBase_zero.out
-  *
-  * \sa Zero(Index), Zero(Index,Index)
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Zero()
-{
-  return Constant(Scalar(0));
+ *
+ * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
+ * need to use the variants taking size arguments.
+ *
+ * Example: \include MatrixBase_zero.cpp
+ * Output: \verbinclude MatrixBase_zero.out
+ *
+ * \sa Zero(Index), Zero(Index,Index)
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ZeroReturnType DenseBase<Derived>::Zero() {
+  return ZeroReturnType(RowsAtCompileTime, ColsAtCompileTime);
 }
 
 /** \returns true if *this is approximately equal to the zero matrix,
-  *          within the precision given by \a prec.
-  *
-  * Example: \include MatrixBase_isZero.cpp
-  * Output: \verbinclude MatrixBase_isZero.out
-  *
-  * \sa class CwiseNullaryOp, Zero()
-  */
-template<typename Derived>
-bool DenseBase<Derived>::isZero(const RealScalar& prec) const
-{
-  for(Index j = 0; j < cols(); ++j)
-    for(Index i = 0; i < rows(); ++i)
-      if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<Scalar>(1), prec))
-        return false;
+ *          within the precision given by \a prec.
+ *
+ * Example: \include MatrixBase_isZero.cpp
+ * Output: \verbinclude MatrixBase_isZero.out
+ *
+ * \sa class CwiseNullaryOp, Zero()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isZero(const RealScalar& prec) const {
+  typename internal::nested_eval<Derived, 1>::type self(derived());
+  for (Index j = 0; j < cols(); ++j)
+    for (Index i = 0; i < rows(); ++i)
+      if (!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<Scalar>(1), prec)) return false;
   return true;
 }
 
 /** Sets all coefficients in this expression to zero.
-  *
-  * Example: \include MatrixBase_setZero.cpp
-  * Output: \verbinclude MatrixBase_setZero.out
-  *
-  * \sa class CwiseNullaryOp, Zero()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero()
-{
-  return setConstant(Scalar(0));
+ *
+ * Example: \include MatrixBase_setZero.cpp
+ * Output: \verbinclude MatrixBase_setZero.out
+ *
+ * \sa class CwiseNullaryOp, Zero()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setZero() {
+  internal::eigen_zero_impl<Derived>::run(derived());
+  return derived();
 }
 
 /** Resizes to the given \a size, and sets all coefficients in this expression to zero.
-  *
-  * \only_for_vectors
-  *
-  * Example: \include Matrix_setZero_int.cpp
-  * Output: \verbinclude Matrix_setZero_int.out
-  *
-  * \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setZero(Index newSize)
-{
+ *
+ * \only_for_vectors
+ *
+ * Example: \include Matrix_setZero_int.cpp
+ * Output: \verbinclude Matrix_setZero_int.out
+ *
+ * \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setZero(Index newSize) {
   resize(newSize);
-  return setConstant(Scalar(0));
+  return setZero();
 }
 
 /** Resizes to the given size, and sets all coefficients in this expression to zero.
-  *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
-  *
-  * Example: \include Matrix_setZero_int_int.cpp
-  * Output: \verbinclude Matrix_setZero_int_int.out
-  *
-  * \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setZero(Index nbRows, Index nbCols)
-{
-  resize(nbRows, nbCols);
-  return setConstant(Scalar(0));
+ *
+ * \param rows the new number of rows
+ * \param cols the new number of columns
+ *
+ * Example: \include Matrix_setZero_int_int.cpp
+ * Output: \verbinclude Matrix_setZero_int_int.out
+ *
+ * \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setZero(Index rows, Index cols) {
+  resize(rows, cols);
+  return setZero();
+}
+
+/** Resizes to the given size, changing only the number of columns, and sets all
+ * coefficients in this expression to zero. For the parameter of type NoChange_t,
+ * just pass the special value \c NoChange.
+ *
+ * \sa DenseBase::setZero(), setZero(Index), setZero(Index, Index), setZero(Index, NoChange_t), class CwiseNullaryOp,
+ * DenseBase::Zero()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setZero(NoChange_t, Index cols) {
+  return setZero(rows(), cols);
+}
+
+/** Resizes to the given size, changing only the number of rows, and sets all
+ * coefficients in this expression to zero. For the parameter of type NoChange_t,
+ * just pass the special value \c NoChange.
+ *
+ * \sa DenseBase::setZero(), setZero(Index), setZero(Index, Index), setZero(NoChange_t, Index), class CwiseNullaryOp,
+ * DenseBase::Zero()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setZero(Index rows, NoChange_t) {
+  return setZero(rows, cols());
 }
 
 // ones:
 
 /** \returns an expression of a matrix where all coefficients equal one.
-  *
-  * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
-  * the returned matrix. Must be compatible with this MatrixBase type.
-  *
-  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
-  * it is redundant to pass \a rows and \a cols as arguments, so Ones() should be used
-  * instead.
-  *
-  * Example: \include MatrixBase_ones_int_int.cpp
-  * Output: \verbinclude MatrixBase_ones_int_int.out
-  *
-  * \sa Ones(), Ones(Index), isOnes(), class Ones
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Ones(Index nbRows, Index nbCols)
-{
-  return Constant(nbRows, nbCols, Scalar(1));
+ *
+ * The parameters \a rows and \a cols are the number of rows and of columns of
+ * the returned matrix. Must be compatible with this MatrixBase type.
+ *
+ * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+ * it is redundant to pass \a rows and \a cols as arguments, so Ones() should be used
+ * instead.
+ *
+ * Example: \include MatrixBase_ones_int_int.cpp
+ * Output: \verbinclude MatrixBase_ones_int_int.out
+ *
+ * \sa Ones(), Ones(Index), isOnes(), class Ones
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType DenseBase<Derived>::Ones(
+    Index rows, Index cols) {
+  return Constant(rows, cols, Scalar(1));
 }
 
 /** \returns an expression of a vector where all coefficients equal one.
-  *
-  * The parameter \a newSize is the size of the returned vector.
-  * Must be compatible with this MatrixBase type.
-  *
-  * \only_for_vectors
-  *
-  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
-  * it is redundant to pass \a size as argument, so Ones() should be used
-  * instead.
-  *
-  * Example: \include MatrixBase_ones_int.cpp
-  * Output: \verbinclude MatrixBase_ones_int.out
-  *
-  * \sa Ones(), Ones(Index,Index), isOnes(), class Ones
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Ones(Index newSize)
-{
+ *
+ * The parameter \a newSize is the size of the returned vector.
+ * Must be compatible with this MatrixBase type.
+ *
+ * \only_for_vectors
+ *
+ * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
+ * it is redundant to pass \a size as argument, so Ones() should be used
+ * instead.
+ *
+ * Example: \include MatrixBase_ones_int.cpp
+ * Output: \verbinclude MatrixBase_ones_int.out
+ *
+ * \sa Ones(), Ones(Index,Index), isOnes(), class Ones
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType DenseBase<Derived>::Ones(
+    Index newSize) {
   return Constant(newSize, Scalar(1));
 }
 
 /** \returns an expression of a fixed-size matrix or vector where all coefficients equal one.
-  *
-  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
-  * need to use the variants taking size arguments.
-  *
-  * Example: \include MatrixBase_ones.cpp
-  * Output: \verbinclude MatrixBase_ones.out
-  *
-  * \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType
-DenseBase<Derived>::Ones()
-{
+ *
+ * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
+ * need to use the variants taking size arguments.
+ *
+ * Example: \include MatrixBase_ones.cpp
+ * Output: \verbinclude MatrixBase_ones.out
+ *
+ * \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstantReturnType DenseBase<Derived>::Ones() {
   return Constant(Scalar(1));
 }
 
 /** \returns true if *this is approximately equal to the matrix where all coefficients
-  *          are equal to 1, within the precision given by \a prec.
-  *
-  * Example: \include MatrixBase_isOnes.cpp
-  * Output: \verbinclude MatrixBase_isOnes.out
-  *
-  * \sa class CwiseNullaryOp, Ones()
-  */
-template<typename Derived>
-bool DenseBase<Derived>::isOnes
-(const RealScalar& prec) const
-{
+ *          are equal to 1, within the precision given by \a prec.
+ *
+ * Example: \include MatrixBase_isOnes.cpp
+ * Output: \verbinclude MatrixBase_isOnes.out
+ *
+ * \sa class CwiseNullaryOp, Ones()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isOnes(const RealScalar& prec) const {
   return isApproxToConstant(Scalar(1), prec);
 }
 
 /** Sets all coefficients in this expression to one.
-  *
-  * Example: \include MatrixBase_setOnes.cpp
-  * Output: \verbinclude MatrixBase_setOnes.out
-  *
-  * \sa class CwiseNullaryOp, Ones()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes()
-{
+ *
+ * Example: \include MatrixBase_setOnes.cpp
+ * Output: \verbinclude MatrixBase_setOnes.out
+ *
+ * \sa class CwiseNullaryOp, Ones()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::setOnes() {
   return setConstant(Scalar(1));
 }
 
 /** Resizes to the given \a newSize, and sets all coefficients in this expression to one.
-  *
-  * \only_for_vectors
-  *
-  * Example: \include Matrix_setOnes_int.cpp
-  * Output: \verbinclude Matrix_setOnes_int.out
-  *
-  * \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setOnes(Index newSize)
-{
+ *
+ * \only_for_vectors
+ *
+ * Example: \include Matrix_setOnes_int.cpp
+ * Output: \verbinclude Matrix_setOnes_int.out
+ *
+ * \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setOnes(Index newSize) {
   resize(newSize);
   return setConstant(Scalar(1));
 }
 
 /** Resizes to the given size, and sets all coefficients in this expression to one.
-  *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
-  *
-  * Example: \include Matrix_setOnes_int_int.cpp
-  * Output: \verbinclude Matrix_setOnes_int_int.out
-  *
-  * \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setOnes(Index nbRows, Index nbCols)
-{
-  resize(nbRows, nbCols);
+ *
+ * \param rows the new number of rows
+ * \param cols the new number of columns
+ *
+ * Example: \include Matrix_setOnes_int_int.cpp
+ * Output: \verbinclude Matrix_setOnes_int_int.out
+ *
+ * \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setOnes(Index rows, Index cols) {
+  resize(rows, cols);
   return setConstant(Scalar(1));
 }
 
+/** Resizes to the given size, changing only the number of rows, and sets all
+ * coefficients in this expression to one. For the parameter of type NoChange_t,
+ * just pass the special value \c NoChange.
+ *
+ * \sa MatrixBase::setOnes(), setOnes(Index), setOnes(Index, Index), setOnes(NoChange_t, Index), class CwiseNullaryOp,
+ * MatrixBase::Ones()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setOnes(Index rows, NoChange_t) {
+  return setOnes(rows, cols());
+}
+
+/** Resizes to the given size, changing only the number of columns, and sets all
+ * coefficients in this expression to one. For the parameter of type NoChange_t,
+ * just pass the special value \c NoChange.
+ *
+ * \sa MatrixBase::setOnes(), setOnes(Index), setOnes(Index, Index), setOnes(Index, NoChange_t) class CwiseNullaryOp,
+ * MatrixBase::Ones()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setOnes(NoChange_t, Index cols) {
+  return setOnes(rows(), cols);
+}
+
 // Identity:
 
 /** \returns an expression of the identity matrix (not necessarily square).
-  *
-  * The parameters \a nbRows and \a nbCols are the number of rows and of columns of
-  * the returned matrix. Must be compatible with this MatrixBase type.
-  *
-  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
-  * it is redundant to pass \a rows and \a cols as arguments, so Identity() should be used
-  * instead.
-  *
-  * Example: \include MatrixBase_identity_int_int.cpp
-  * Output: \verbinclude MatrixBase_identity_int_int.out
-  *
-  * \sa Identity(), setIdentity(), isIdentity()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
-MatrixBase<Derived>::Identity(Index nbRows, Index nbCols)
-{
-  return DenseBase<Derived>::NullaryExpr(nbRows, nbCols, internal::scalar_identity_op<Scalar>());
+ *
+ * The parameters \a rows and \a cols are the number of rows and of columns of
+ * the returned matrix. Must be compatible with this MatrixBase type.
+ *
+ * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+ * it is redundant to pass \a rows and \a cols as arguments, so Identity() should be used
+ * instead.
+ *
+ * Example: \include MatrixBase_identity_int_int.cpp
+ * Output: \verbinclude MatrixBase_identity_int_int.out
+ *
+ * \sa Identity(), setIdentity(), isIdentity()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+MatrixBase<Derived>::Identity(Index rows, Index cols) {
+  return DenseBase<Derived>::NullaryExpr(rows, cols, internal::scalar_identity_op<Scalar>());
 }
 
 /** \returns an expression of the identity matrix (not necessarily square).
-  *
-  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
-  * need to use the variant taking size arguments.
-  *
-  * Example: \include MatrixBase_identity.cpp
-  * Output: \verbinclude MatrixBase_identity.out
-  *
-  * \sa Identity(Index,Index), setIdentity(), isIdentity()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
-MatrixBase<Derived>::Identity()
-{
+ *
+ * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
+ * need to use the variant taking size arguments.
+ *
+ * Example: \include MatrixBase_identity.cpp
+ * Output: \verbinclude MatrixBase_identity.out
+ *
+ * \sa Identity(Index,Index), setIdentity(), isIdentity()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::IdentityReturnType
+MatrixBase<Derived>::Identity() {
   EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
   return MatrixBase<Derived>::NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_identity_op<Scalar>());
 }
 
 /** \returns true if *this is approximately equal to the identity matrix
-  *          (not necessarily square),
-  *          within the precision given by \a prec.
-  *
-  * Example: \include MatrixBase_isIdentity.cpp
-  * Output: \verbinclude MatrixBase_isIdentity.out
-  *
-  * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), setIdentity()
-  */
-template<typename Derived>
-bool MatrixBase<Derived>::isIdentity
-(const RealScalar& prec) const
-{
-  for(Index j = 0; j < cols(); ++j)
-  {
-    for(Index i = 0; i < rows(); ++i)
-    {
-      if(i == j)
-      {
-        if(!internal::isApprox(this->coeff(i, j), static_cast<Scalar>(1), prec))
-          return false;
-      }
-      else
-      {
-        if(!internal::isMuchSmallerThan(this->coeff(i, j), static_cast<RealScalar>(1), prec))
-          return false;
+ *          (not necessarily square),
+ *          within the precision given by \a prec.
+ *
+ * Example: \include MatrixBase_isIdentity.cpp
+ * Output: \verbinclude MatrixBase_isIdentity.out
+ *
+ * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), setIdentity()
+ */
+template <typename Derived>
+bool MatrixBase<Derived>::isIdentity(const RealScalar& prec) const {
+  typename internal::nested_eval<Derived, 1>::type self(derived());
+  for (Index j = 0; j < cols(); ++j) {
+    for (Index i = 0; i < rows(); ++i) {
+      if (i == j) {
+        if (!internal::isApprox(self.coeff(i, j), static_cast<Scalar>(1), prec)) return false;
+      } else {
+        if (!internal::isMuchSmallerThan(self.coeff(i, j), static_cast<RealScalar>(1), prec)) return false;
       }
     }
   }
@@ -737,128 +813,163 @@ bool MatrixBase<Derived>::isIdentity
 
 namespace internal {
 
-template<typename Derived, bool Big = (Derived::SizeAtCompileTime>=16)>
-struct setIdentity_impl
-{
-  static EIGEN_STRONG_INLINE Derived& run(Derived& m)
-  {
+template <typename Derived, bool Big = (Derived::SizeAtCompileTime >= 16)>
+struct setIdentity_impl {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Derived& run(Derived& m) {
     return m = Derived::Identity(m.rows(), m.cols());
   }
 };
 
-template<typename Derived>
-struct setIdentity_impl<Derived, true>
-{
-  typedef typename Derived::Index Index;
-  static EIGEN_STRONG_INLINE Derived& run(Derived& m)
-  {
+template <typename Derived>
+struct setIdentity_impl<Derived, true> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Derived& run(Derived& m) {
     m.setZero();
-    const Index size = (std::min)(m.rows(), m.cols());
-    for(Index i = 0; i < size; ++i) m.coeffRef(i,i) = typename Derived::Scalar(1);
+    const Index size = numext::mini(m.rows(), m.cols());
+    for (Index i = 0; i < size; ++i) m.coeffRef(i, i) = typename Derived::Scalar(1);
     return m;
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
 /** Writes the identity expression (not necessarily square) into *this.
-  *
-  * Example: \include MatrixBase_setIdentity.cpp
-  * Output: \verbinclude MatrixBase_setIdentity.out
-  *
-  * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity()
-{
+ *
+ * Example: \include MatrixBase_setIdentity.cpp
+ * Output: \verbinclude MatrixBase_setIdentity.out
+ *
+ * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity() {
   return internal::setIdentity_impl<Derived>::run(derived());
 }
 
 /** \brief Resizes to the given size, and writes the identity expression (not necessarily square) into *this.
-  *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
-  *
-  * Example: \include Matrix_setIdentity_int_int.cpp
-  * Output: \verbinclude Matrix_setIdentity_int_int.out
-  *
-  * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index nbRows, Index nbCols)
-{
-  derived().resize(nbRows, nbCols);
+ *
+ * \param rows the new number of rows
+ * \param cols the new number of columns
+ *
+ * Example: \include Matrix_setIdentity_int_int.cpp
+ * Output: \verbinclude Matrix_setIdentity_int_int.out
+ *
+ * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setIdentity(Index rows, Index cols) {
+  derived().resize(rows, cols);
   return setIdentity();
 }
 
 /** \returns an expression of the i-th unit (basis) vector.
-  *
-  * \only_for_vectors
-  *
-  * \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index newSize, Index i)
-{
+ *
+ * \only_for_vectors
+ *
+ * \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(
+    Index newSize, Index i) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i);
+  return BasisReturnType(SquareMatrixType::Identity(newSize, newSize), i);
 }
 
 /** \returns an expression of the i-th unit (basis) vector.
-  *
-  * \only_for_vectors
-  *
-  * This variant is for fixed-size vector only.
-  *
-  * \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(Index i)
-{
+ *
+ * \only_for_vectors
+ *
+ * This variant is for fixed-size vector only.
+ *
+ * \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::Unit(
+    Index i) {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return BasisReturnType(SquareMatrixType::Identity(),i);
+  return BasisReturnType(SquareMatrixType::Identity(), i);
 }
 
 /** \returns an expression of the X axis unit vector (1{,0}^*)
-  *
-  * \only_for_vectors
-  *
-  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX()
-{ return Derived::Unit(0); }
+ *
+ * \only_for_vectors
+ *
+ * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(),
+ * MatrixBase::UnitW()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitX() {
+  return Derived::Unit(0);
+}
 
 /** \returns an expression of the Y axis unit vector (0,1{,0}^*)
-  *
-  * \only_for_vectors
-  *
-  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY()
-{ return Derived::Unit(1); }
+ *
+ * \only_for_vectors
+ *
+ * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(),
+ * MatrixBase::UnitW()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitY() {
+  return Derived::Unit(1);
+}
 
 /** \returns an expression of the Z axis unit vector (0,0,1{,0}^*)
-  *
-  * \only_for_vectors
-  *
-  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ()
-{ return Derived::Unit(2); }
+ *
+ * \only_for_vectors
+ *
+ * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(),
+ * MatrixBase::UnitW()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitZ() {
+  return Derived::Unit(2);
+}
 
 /** \returns an expression of the W axis unit vector (0,0,0,1)
-  *
-  * \only_for_vectors
-  *
-  * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW()
-{ return Derived::Unit(3); }
-
-} // end namespace Eigen
-
-#endif // EIGEN_CWISE_NULLARY_OP_H
+ *
+ * \only_for_vectors
+ *
+ * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(),
+ * MatrixBase::UnitW()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::BasisReturnType MatrixBase<Derived>::UnitW() {
+  return Derived::Unit(3);
+}
+
+/** \brief Set the coefficients of \c *this to the i-th unit (basis) vector
+ *
+ * \param i index of the unique coefficient to be set to 1
+ *
+ * \only_for_vectors
+ *
+ * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index i) {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  eigen_assert(i < size());
+  derived().setZero();
+  derived().coeffRef(i) = Scalar(1);
+  return derived();
+}
+
+/** \brief Resizes to the given \a newSize, and writes the i-th unit (basis) vector into *this.
+ *
+ * \param newSize the new size of the vector
+ * \param i index of the unique coefficient to be set to 1
+ *
+ * \only_for_vectors
+ *
+ * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Unit(Index,Index)
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase<Derived>::setUnit(Index newSize, Index i) {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  eigen_assert(i < newSize);
+  derived().resize(newSize);
+  return setUnit(i);
+}
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CWISE_NULLARY_OP_H
diff --git a/inst/include/Eigen/src/Core/CwiseTernaryOp.h b/inst/include/Eigen/src/Core/CwiseTernaryOp.h
new file mode 100644
index 00000000..9bb0d407
--- /dev/null
+++ b/inst/include/Eigen/src/Core/CwiseTernaryOp.h
@@ -0,0 +1,171 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CWISE_TERNARY_OP_H
+#define EIGEN_CWISE_TERNARY_OP_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
+struct traits<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>> {
+  // we must not inherit from traits<Arg1> since it has
+  // the potential to cause problems with MSVC
+  typedef remove_all_t<Arg1> Ancestor;
+  typedef typename traits<Ancestor>::XprKind XprKind;
+  enum {
+    RowsAtCompileTime = traits<Ancestor>::RowsAtCompileTime,
+    ColsAtCompileTime = traits<Ancestor>::ColsAtCompileTime,
+    MaxRowsAtCompileTime = traits<Ancestor>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = traits<Ancestor>::MaxColsAtCompileTime
+  };
+
+  // even though we require Arg1, Arg2, and Arg3 to have the same scalar type
+  // (see CwiseTernaryOp constructor),
+  // we still want to handle the case when the result type is different.
+  typedef typename result_of<TernaryOp(const typename Arg1::Scalar&, const typename Arg2::Scalar&,
+                                       const typename Arg3::Scalar&)>::type Scalar;
+
+  typedef typename internal::traits<Arg1>::StorageKind StorageKind;
+  typedef typename internal::traits<Arg1>::StorageIndex StorageIndex;
+
+  typedef typename Arg1::Nested Arg1Nested;
+  typedef typename Arg2::Nested Arg2Nested;
+  typedef typename Arg3::Nested Arg3Nested;
+  typedef std::remove_reference_t<Arg1Nested> Arg1Nested_;
+  typedef std::remove_reference_t<Arg2Nested> Arg2Nested_;
+  typedef std::remove_reference_t<Arg3Nested> Arg3Nested_;
+  enum { Flags = Arg1Nested_::Flags & RowMajorBit };
+};
+}  // end namespace internal
+
+template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3, typename StorageKind>
+class CwiseTernaryOpImpl;
+
+/** \class CwiseTernaryOp
+ * \ingroup Core_Module
+ *
+ * \brief Generic expression where a coefficient-wise ternary operator is
+ * applied to two expressions
+ *
+ * \tparam TernaryOp template functor implementing the operator
+ * \tparam Arg1Type the type of the first argument
+ * \tparam Arg2Type the type of the second argument
+ * \tparam Arg3Type the type of the third argument
+ *
+ * This class represents an expression where a coefficient-wise ternary
+ * operator is applied to three expressions.
+ * It is the return type of ternary operators, by which we mean only those
+ * ternary operators where
+ * all three arguments are Eigen expressions.
+ * For example, the return type of betainc(matrix1, matrix2, matrix3) is a
+ * CwiseTernaryOp.
+ *
+ * Most of the time, this is the only way that it is used, so you typically
+ * don't have to name
+ * CwiseTernaryOp types explicitly.
+ *
+ * \sa MatrixBase::ternaryExpr(const MatrixBase<Argument2> &, const
+ * MatrixBase<Argument3> &, const CustomTernaryOp &) const, class CwiseBinaryOp,
+ * class CwiseUnaryOp, class CwiseNullaryOp
+ */
+template <typename TernaryOp, typename Arg1Type, typename Arg2Type, typename Arg3Type>
+class CwiseTernaryOp : public CwiseTernaryOpImpl<TernaryOp, Arg1Type, Arg2Type, Arg3Type,
+                                                 typename internal::traits<Arg1Type>::StorageKind>,
+                       internal::no_assignment_operator {
+ public:
+  typedef internal::remove_all_t<Arg1Type> Arg1;
+  typedef internal::remove_all_t<Arg2Type> Arg2;
+  typedef internal::remove_all_t<Arg3Type> Arg3;
+
+  // require the sizes to match
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Arg1, Arg2)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Arg1, Arg3)
+
+  // The index types should match
+  EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+                                         typename internal::traits<Arg2Type>::StorageKind>::value),
+                      STORAGE_KIND_MUST_MATCH)
+  EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+                                         typename internal::traits<Arg3Type>::StorageKind>::value),
+                      STORAGE_KIND_MUST_MATCH)
+
+  typedef typename CwiseTernaryOpImpl<TernaryOp, Arg1Type, Arg2Type, Arg3Type,
+                                      typename internal::traits<Arg1Type>::StorageKind>::Base Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseTernaryOp)
+
+  typedef typename internal::ref_selector<Arg1Type>::type Arg1Nested;
+  typedef typename internal::ref_selector<Arg2Type>::type Arg2Nested;
+  typedef typename internal::ref_selector<Arg3Type>::type Arg3Nested;
+  typedef std::remove_reference_t<Arg1Nested> Arg1Nested_;
+  typedef std::remove_reference_t<Arg2Nested> Arg2Nested_;
+  typedef std::remove_reference_t<Arg3Nested> Arg3Nested_;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CwiseTernaryOp(const Arg1& a1, const Arg2& a2, const Arg3& a3,
+                                                       const TernaryOp& func = TernaryOp())
+      : m_arg1(a1), m_arg2(a2), m_arg3(a3), m_functor(func) {
+    eigen_assert(a1.rows() == a2.rows() && a1.cols() == a2.cols() && a1.rows() == a3.rows() && a1.cols() == a3.cols());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const {
+    // return the fixed size type if available to enable compile time
+    // optimizations
+    if (internal::traits<internal::remove_all_t<Arg1Nested>>::RowsAtCompileTime == Dynamic &&
+        internal::traits<internal::remove_all_t<Arg2Nested>>::RowsAtCompileTime == Dynamic)
+      return m_arg3.rows();
+    else if (internal::traits<internal::remove_all_t<Arg1Nested>>::RowsAtCompileTime == Dynamic &&
+             internal::traits<internal::remove_all_t<Arg3Nested>>::RowsAtCompileTime == Dynamic)
+      return m_arg2.rows();
+    else
+      return m_arg1.rows();
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const {
+    // return the fixed size type if available to enable compile time
+    // optimizations
+    if (internal::traits<internal::remove_all_t<Arg1Nested>>::ColsAtCompileTime == Dynamic &&
+        internal::traits<internal::remove_all_t<Arg2Nested>>::ColsAtCompileTime == Dynamic)
+      return m_arg3.cols();
+    else if (internal::traits<internal::remove_all_t<Arg1Nested>>::ColsAtCompileTime == Dynamic &&
+             internal::traits<internal::remove_all_t<Arg3Nested>>::ColsAtCompileTime == Dynamic)
+      return m_arg2.cols();
+    else
+      return m_arg1.cols();
+  }
+
+  /** \returns the first argument nested expression */
+  EIGEN_DEVICE_FUNC const Arg1Nested_& arg1() const { return m_arg1; }
+  /** \returns the first argument nested expression */
+  EIGEN_DEVICE_FUNC const Arg2Nested_& arg2() const { return m_arg2; }
+  /** \returns the third argument nested expression */
+  EIGEN_DEVICE_FUNC const Arg3Nested_& arg3() const { return m_arg3; }
+  /** \returns the functor representing the ternary operation */
+  EIGEN_DEVICE_FUNC const TernaryOp& functor() const { return m_functor; }
+
+ protected:
+  Arg1Nested m_arg1;
+  Arg2Nested m_arg2;
+  Arg3Nested m_arg3;
+  const TernaryOp m_functor;
+};
+
+// Generic API dispatcher
+template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3, typename StorageKind>
+class CwiseTernaryOpImpl : public internal::generic_xpr_base<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>>::type {
+ public:
+  typedef typename internal::generic_xpr_base<CwiseTernaryOp<TernaryOp, Arg1, Arg2, Arg3>>::type Base;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CWISE_TERNARY_OP_H
diff --git a/inst/include/Eigen/src/Core/CwiseUnaryOp.h b/inst/include/Eigen/src/Core/CwiseUnaryOp.h
index f7ee60e9..94ec1a0f 100644
--- a/inst/include/Eigen/src/Core/CwiseUnaryOp.h
+++ b/inst/include/Eigen/src/Core/CwiseUnaryOp.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,116 +11,81 @@
 #ifndef EIGEN_CWISE_UNARY_OP_H
 #define EIGEN_CWISE_UNARY_OP_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-/** \class CwiseUnaryOp
-  * \ingroup Core_Module
-  *
-  * \brief Generic expression where a coefficient-wise unary operator is applied to an expression
-  *
-  * \param UnaryOp template functor implementing the operator
-  * \param XprType the type of the expression to which we are applying the unary operator
-  *
-  * This class represents an expression where a unary operator is applied to an expression.
-  * It is the return type of all operations taking exactly 1 input expression, regardless of the
-  * presence of other inputs such as scalars. For example, the operator* in the expression 3*matrix
-  * is considered unary, because only the right-hand side is an expression, and its
-  * return type is a specialization of CwiseUnaryOp.
-  *
-  * Most of the time, this is the only way that it is used, so you typically don't have to name
-  * CwiseUnaryOp types explicitly.
-  *
-  * \sa MatrixBase::unaryExpr(const CustomUnaryOp &) const, class CwiseBinaryOp, class CwiseNullaryOp
-  */
+namespace Eigen {
 
 namespace internal {
-template<typename UnaryOp, typename XprType>
-struct traits<CwiseUnaryOp<UnaryOp, XprType> >
- : traits<XprType>
-{
-  typedef typename result_of<
-                     UnaryOp(typename XprType::Scalar)
-                   >::type Scalar;
+template <typename UnaryOp, typename XprType>
+struct traits<CwiseUnaryOp<UnaryOp, XprType> > : traits<XprType> {
+  typedef typename result_of<UnaryOp(const typename XprType::Scalar&)>::type Scalar;
   typedef typename XprType::Nested XprTypeNested;
-  typedef typename remove_reference<XprTypeNested>::type _XprTypeNested;
-  enum {
-    Flags = _XprTypeNested::Flags & (
-      HereditaryBits | LinearAccessBit | AlignedBit
-      | (functor_traits<UnaryOp>::PacketAccess ? PacketAccessBit : 0)),
-    CoeffReadCost = EIGEN_ADD_COST(_XprTypeNested::CoeffReadCost, functor_traits<UnaryOp>::Cost)
-  };
+  typedef std::remove_reference_t<XprTypeNested> XprTypeNested_;
+  enum { Flags = XprTypeNested_::Flags & RowMajorBit };
 };
-}
+}  // namespace internal
 
-template<typename UnaryOp, typename XprType, typename StorageKind>
+template <typename UnaryOp, typename XprType, typename StorageKind>
 class CwiseUnaryOpImpl;
 
-template<typename UnaryOp, typename XprType>
-class CwiseUnaryOp : internal::no_assignment_operator,
-  public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal::traits<XprType>::StorageKind>
-{
-  public:
-
-    typedef typename CwiseUnaryOpImpl<UnaryOp, XprType,typename internal::traits<XprType>::StorageKind>::Base Base;
-    EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryOp)
-
-    inline CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
+/** \class CwiseUnaryOp
+ * \ingroup Core_Module
+ *
+ * \brief Generic expression where a coefficient-wise unary operator is applied to an expression
+ *
+ * \tparam UnaryOp template functor implementing the operator
+ * \tparam XprType the type of the expression to which we are applying the unary operator
+ *
+ * This class represents an expression where a unary operator is applied to an expression.
+ * It is the return type of all operations taking exactly 1 input expression, regardless of the
+ * presence of other inputs such as scalars. For example, the operator* in the expression 3*matrix
+ * is considered unary, because only the right-hand side is an expression, and its
+ * return type is a specialization of CwiseUnaryOp.
+ *
+ * Most of the time, this is the only way that it is used, so you typically don't have to name
+ * CwiseUnaryOp types explicitly.
+ *
+ * \sa MatrixBase::unaryExpr(const CustomUnaryOp &) const, class CwiseBinaryOp, class CwiseNullaryOp
+ */
+template <typename UnaryOp, typename XprType>
+class CwiseUnaryOp : public CwiseUnaryOpImpl<UnaryOp, XprType, typename internal::traits<XprType>::StorageKind>,
+                     internal::no_assignment_operator {
+ public:
+  typedef typename CwiseUnaryOpImpl<UnaryOp, XprType, typename internal::traits<XprType>::StorageKind>::Base Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryOp)
+  typedef typename internal::ref_selector<XprType>::type XprTypeNested;
+  typedef internal::remove_all_t<XprType> NestedExpression;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit CwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
       : m_xpr(xpr), m_functor(func) {}
 
-    EIGEN_STRONG_INLINE Index rows() const { return m_xpr.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_xpr.cols(); }
 
-    /** \returns the functor representing the unary operation */
-    const UnaryOp& functor() const { return m_functor; }
+  /** \returns the functor representing the unary operation */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryOp& functor() const { return m_functor; }
 
-    /** \returns the nested expression */
-    const typename internal::remove_all<typename XprType::Nested>::type&
-    nestedExpression() const { return m_xpr; }
+  /** \returns the nested expression */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<XprTypeNested>& nestedExpression() const {
+    return m_xpr;
+  }
 
-    /** \returns the nested expression */
-    typename internal::remove_all<typename XprType::Nested>::type&
-    nestedExpression() { return m_xpr.const_cast_derived(); }
+  /** \returns the nested expression */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::remove_all_t<XprTypeNested>& nestedExpression() { return m_xpr; }
 
-  protected:
-    typename XprType::Nested m_xpr;
-    const UnaryOp m_functor;
+ protected:
+  XprTypeNested m_xpr;
+  const UnaryOp m_functor;
 };
 
-// This is the generic implementation for dense storage.
-// It can be used for any expression types implementing the dense concept.
-template<typename UnaryOp, typename XprType>
-class CwiseUnaryOpImpl<UnaryOp,XprType,Dense>
-  : public internal::dense_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type
-{
-  public:
-
-    typedef CwiseUnaryOp<UnaryOp, XprType> Derived;
-    typedef typename internal::dense_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index rowId, Index colId) const
-    {
-      return derived().functor()(derived().nestedExpression().coeff(rowId, colId));
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
-    {
-      return derived().functor().packetOp(derived().nestedExpression().template packet<LoadMode>(rowId, colId));
-    }
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
-    {
-      return derived().functor()(derived().nestedExpression().coeff(index));
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
-    {
-      return derived().functor().packetOp(derived().nestedExpression().template packet<LoadMode>(index));
-    }
+// Generic API dispatcher
+template <typename UnaryOp, typename XprType, typename StorageKind>
+class CwiseUnaryOpImpl : public internal::generic_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type {
+ public:
+  typedef typename internal::generic_xpr_base<CwiseUnaryOp<UnaryOp, XprType> >::type Base;
 };
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_CWISE_UNARY_OP_H
+#endif  // EIGEN_CWISE_UNARY_OP_H
diff --git a/inst/include/Eigen/src/Core/CwiseUnaryView.h b/inst/include/Eigen/src/Core/CwiseUnaryView.h
index b2638d32..7dd7623f 100644
--- a/inst/include/Eigen/src/Core/CwiseUnaryView.h
+++ b/inst/include/Eigen/src/Core/CwiseUnaryView.h
@@ -10,130 +10,158 @@
 #ifndef EIGEN_CWISE_UNARY_VIEW_H
 #define EIGEN_CWISE_UNARY_VIEW_H
 
-namespace Eigen {
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-/** \class CwiseUnaryView
-  * \ingroup Core_Module
-  *
-  * \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector
-  *
-  * \param ViewOp template functor implementing the view
-  * \param MatrixType the type of the matrix we are applying the unary operator
-  *
-  * This class represents a lvalue expression of a generic unary view operator of a matrix or a vector.
-  * It is the return type of real() and imag(), and most of the time this is the only way it is used.
-  *
-  * \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp
-  */
+namespace Eigen {
 
 namespace internal {
-template<typename ViewOp, typename MatrixType>
-struct traits<CwiseUnaryView<ViewOp, MatrixType> >
- : traits<MatrixType>
-{
-  typedef typename result_of<
-                     ViewOp(typename traits<MatrixType>::Scalar)
-                   >::type Scalar;
+template <typename ViewOp, typename MatrixType, typename StrideType>
+struct traits<CwiseUnaryView<ViewOp, MatrixType, StrideType> > : traits<MatrixType> {
+  typedef typename result_of<ViewOp(typename traits<MatrixType>::Scalar&)>::type1 ScalarRef;
+  static_assert(std::is_reference<ScalarRef>::value, "Views must return a reference type.");
+  typedef remove_all_t<ScalarRef> Scalar;
   typedef typename MatrixType::Nested MatrixTypeNested;
-  typedef typename remove_all<MatrixTypeNested>::type _MatrixTypeNested;
+  typedef remove_all_t<MatrixTypeNested> MatrixTypeNested_;
   enum {
-    Flags = (traits<_MatrixTypeNested>::Flags & (HereditaryBits | LvalueBit | LinearAccessBit | DirectAccessBit)),
-    CoeffReadCost = traits<_MatrixTypeNested>::CoeffReadCost + functor_traits<ViewOp>::Cost,
-    MatrixTypeInnerStride =  inner_stride_at_compile_time<MatrixType>::ret,
+    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags =
+        traits<MatrixTypeNested_>::Flags &
+        (RowMajorBit | FlagsLvalueBit | DirectAccessBit),  // FIXME DirectAccessBit should not be handled by expressions
+    MatrixTypeInnerStride = inner_stride_at_compile_time<MatrixType>::ret,
     // need to cast the sizeof's from size_t to int explicitly, otherwise:
     // "error: no integral type can represent all of the enumerator values
-    InnerStrideAtCompileTime = MatrixTypeInnerStride == Dynamic
-                             ? int(Dynamic)
-                             : int(MatrixTypeInnerStride) * int(sizeof(typename traits<MatrixType>::Scalar) / sizeof(Scalar)),
-    OuterStrideAtCompileTime = outer_stride_at_compile_time<MatrixType>::ret == Dynamic
-                             ? int(Dynamic)
-                             : outer_stride_at_compile_time<MatrixType>::ret * int(sizeof(typename traits<MatrixType>::Scalar) / sizeof(Scalar))
+    InnerStrideAtCompileTime =
+        StrideType::InnerStrideAtCompileTime == 0
+            ? (MatrixTypeInnerStride == Dynamic
+                   ? int(Dynamic)
+                   : int(MatrixTypeInnerStride) * int(sizeof(typename traits<MatrixType>::Scalar) / sizeof(Scalar)))
+            : int(StrideType::InnerStrideAtCompileTime),
+
+    OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
+                                   ? (outer_stride_at_compile_time<MatrixType>::ret == Dynamic
+                                          ? int(Dynamic)
+                                          : outer_stride_at_compile_time<MatrixType>::ret *
+                                                int(sizeof(typename traits<MatrixType>::Scalar) / sizeof(Scalar)))
+                                   : int(StrideType::OuterStrideAtCompileTime)
   };
 };
-}
 
-template<typename ViewOp, typename MatrixType, typename StorageKind>
-class CwiseUnaryViewImpl;
+// Generic API dispatcher
+template <typename ViewOp, typename XprType, typename StrideType, typename StorageKind,
+          bool Mutable = !std::is_const<XprType>::value>
+class CwiseUnaryViewImpl : public generic_xpr_base<CwiseUnaryView<ViewOp, XprType, StrideType> >::type {
+ public:
+  typedef typename generic_xpr_base<CwiseUnaryView<ViewOp, XprType, StrideType> >::type Base;
+};
 
-template<typename ViewOp, typename MatrixType>
-class CwiseUnaryView : public CwiseUnaryViewImpl<ViewOp, MatrixType, typename internal::traits<MatrixType>::StorageKind>
-{
-  public:
+template <typename ViewOp, typename MatrixType, typename StrideType>
+class CwiseUnaryViewImpl<ViewOp, MatrixType, StrideType, Dense, false>
+    : public dense_xpr_base<CwiseUnaryView<ViewOp, MatrixType, StrideType> >::type {
+ public:
+  typedef CwiseUnaryView<ViewOp, MatrixType, StrideType> Derived;
+  typedef typename dense_xpr_base<CwiseUnaryView<ViewOp, MatrixType, StrideType> >::type Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryViewImpl)
+
+  EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(this->coeffRef(0)); }
+
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const {
+    return StrideType::InnerStrideAtCompileTime != 0 ? int(StrideType::InnerStrideAtCompileTime)
+                                                     : derived().nestedExpression().innerStride() *
+                                                           sizeof(typename traits<MatrixType>::Scalar) / sizeof(Scalar);
+  }
+
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const {
+    return StrideType::OuterStrideAtCompileTime != 0 ? int(StrideType::OuterStrideAtCompileTime)
+                                                     : derived().nestedExpression().outerStride() *
+                                                           sizeof(typename traits<MatrixType>::Scalar) / sizeof(Scalar);
+  }
+
+ protected:
+  EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(CwiseUnaryViewImpl)
+
+  // Allow const access to coeffRef for the case of direct access being enabled.
+  EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const {
+    return internal::evaluator<Derived>(derived()).coeffRef(index);
+  }
+
+  EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index row, Index col) const {
+    return internal::evaluator<Derived>(derived()).coeffRef(row, col);
+  }
+};
 
-    typedef typename CwiseUnaryViewImpl<ViewOp, MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
-    EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryView)
+template <typename ViewOp, typename MatrixType, typename StrideType>
+class CwiseUnaryViewImpl<ViewOp, MatrixType, StrideType, Dense, true>
+    : public CwiseUnaryViewImpl<ViewOp, MatrixType, StrideType, Dense, false> {
+ public:
+  typedef CwiseUnaryViewImpl<ViewOp, MatrixType, StrideType, Dense, false> Base;
+  typedef CwiseUnaryView<ViewOp, MatrixType, StrideType> Derived;
+  EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryViewImpl)
 
-    inline CwiseUnaryView(const MatrixType& mat, const ViewOp& func = ViewOp())
-      : m_matrix(mat), m_functor(func) {}
+  using Base::data;
+  EIGEN_DEVICE_FUNC inline Scalar* data() { return &(this->coeffRef(0)); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+    return internal::evaluator<Derived>(derived()).coeffRef(row, col);
+  }
 
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    return internal::evaluator<Derived>(derived()).coeffRef(index);
+  }
 
-    EIGEN_STRONG_INLINE Index rows() const { return m_matrix.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_matrix.cols(); }
+ protected:
+  EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(CwiseUnaryViewImpl)
+};
 
-    /** \returns the functor representing unary operation */
-    const ViewOp& functor() const { return m_functor; }
+}  // namespace internal
 
-    /** \returns the nested expression */
-    const typename internal::remove_all<typename MatrixType::Nested>::type&
-    nestedExpression() const { return m_matrix; }
+/** \class CwiseUnaryView
+ * \ingroup Core_Module
+ *
+ * \brief Generic lvalue expression of a coefficient-wise unary operator of a matrix or a vector
+ *
+ * \tparam ViewOp template functor implementing the view
+ * \tparam MatrixType the type of the matrix we are applying the unary operator
+ *
+ * This class represents a lvalue expression of a generic unary view operator of a matrix or a vector.
+ * It is the return type of real() and imag(), and most of the time this is the only way it is used.
+ *
+ * \sa MatrixBase::unaryViewExpr(const CustomUnaryOp &) const, class CwiseUnaryOp
+ */
+template <typename ViewOp, typename MatrixType, typename StrideType>
+class CwiseUnaryView : public internal::CwiseUnaryViewImpl<ViewOp, MatrixType, StrideType,
+                                                           typename internal::traits<MatrixType>::StorageKind> {
+ public:
+  typedef typename internal::CwiseUnaryViewImpl<ViewOp, MatrixType, StrideType,
+                                                typename internal::traits<MatrixType>::StorageKind>::Base Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(CwiseUnaryView)
+  typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
+  typedef internal::remove_all_t<MatrixType> NestedExpression;
+
+  explicit EIGEN_DEVICE_FUNC inline CwiseUnaryView(MatrixType& mat, const ViewOp& func = ViewOp())
+      : m_matrix(mat), m_functor(func) {}
 
-    /** \returns the nested expression */
-    typename internal::remove_all<typename MatrixType::Nested>::type&
-    nestedExpression() { return m_matrix.const_cast_derived(); }
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryView)
 
-  protected:
-    // FIXME changed from MatrixType::Nested because of a weird compilation error with sun CC
-    typename internal::nested<MatrixType>::type m_matrix;
-    ViewOp m_functor;
-};
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_matrix.cols(); }
+
+  /** \returns the functor representing unary operation */
+  EIGEN_DEVICE_FUNC const ViewOp& functor() const { return m_functor; }
+
+  /** \returns the nested expression */
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<MatrixTypeNested>& nestedExpression() const { return m_matrix; }
+
+  /** \returns the nested expression */
+  EIGEN_DEVICE_FUNC std::remove_reference_t<MatrixTypeNested>& nestedExpression() { return m_matrix; }
 
-template<typename ViewOp, typename MatrixType>
-class CwiseUnaryViewImpl<ViewOp,MatrixType,Dense>
-  : public internal::dense_xpr_base< CwiseUnaryView<ViewOp, MatrixType> >::type
-{
-  public:
-
-    typedef CwiseUnaryView<ViewOp, MatrixType> Derived;
-    typedef typename internal::dense_xpr_base< CwiseUnaryView<ViewOp, MatrixType> >::type Base;
-
-    EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(CwiseUnaryViewImpl)
-    
-    inline Scalar* data() { return &coeffRef(0); }
-    inline const Scalar* data() const { return &coeff(0); }
-
-    inline Index innerStride() const
-    {
-      return derived().nestedExpression().innerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
-    }
-
-    inline Index outerStride() const
-    {
-      return derived().nestedExpression().outerStride() * sizeof(typename internal::traits<MatrixType>::Scalar) / sizeof(Scalar);
-    }
-
-    EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const
-    {
-      return derived().functor()(derived().nestedExpression().coeff(row, col));
-    }
-
-    EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
-    {
-      return derived().functor()(derived().nestedExpression().coeff(index));
-    }
-
-    EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col)
-    {
-      return derived().functor()(const_cast_derived().nestedExpression().coeffRef(row, col));
-    }
-
-    EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
-    {
-      return derived().functor()(const_cast_derived().nestedExpression().coeffRef(index));
-    }
+ protected:
+  MatrixTypeNested m_matrix;
+  ViewOp m_functor;
 };
 
-} // end namespace Eigen
+}  // namespace Eigen
 
-#endif // EIGEN_CWISE_UNARY_VIEW_H
+#endif  // EIGEN_CWISE_UNARY_VIEW_H
diff --git a/inst/include/Eigen/src/Core/DenseBase.h b/inst/include/Eigen/src/Core/DenseBase.h
index 4b371b07..c81e1d10 100644
--- a/inst/include/Eigen/src/Core/DenseBase.h
+++ b/inst/include/Eigen/src/Core/DenseBase.h
@@ -11,511 +11,663 @@
 #ifndef EIGEN_DENSEBASE_H
 #define EIGEN_DENSEBASE_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
-namespace internal {
-  
 // The index type defined by EIGEN_DEFAULT_DENSE_INDEX_TYPE must be a signed type.
-// This dummy function simply aims at checking that at compile time.
-static inline void check_DenseIndex_is_signed() {
-  EIGEN_STATIC_ASSERT(NumTraits<DenseIndex>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE); 
-}
+EIGEN_STATIC_ASSERT(NumTraits<DenseIndex>::IsSigned, THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE)
 
-} // end namespace internal
-  
 /** \class DenseBase
-  * \ingroup Core_Module
-  *
-  * \brief Base class for all dense matrices, vectors, and arrays
-  *
-  * This class is the base that is inherited by all dense objects (matrix, vector, arrays,
-  * and related expression types). The common Eigen API for dense objects is contained in this class.
-  *
-  * \tparam Derived is the derived type, e.g., a matrix type or an expression.
-  *
-  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_DENSEBASE_PLUGIN.
-  *
-  * \sa \ref TopicClassHierarchy
-  */
-template<typename Derived> class DenseBase
+ * \ingroup Core_Module
+ *
+ * \brief Base class for all dense matrices, vectors, and arrays
+ *
+ * This class is the base that is inherited by all dense objects (matrix, vector, arrays,
+ * and related expression types). The common Eigen API for dense objects is contained in this class.
+ *
+ * \tparam Derived is the derived type, e.g., a matrix type or an expression.
+ *
+ * This class can be extended with the help of the plugin mechanism described on the page
+ * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_DENSEBASE_PLUGIN.
+ *
+ * \sa \blank \ref TopicClassHierarchy
+ */
+template <typename Derived>
+class DenseBase
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-  : public internal::special_scalar_op_base<Derived, typename internal::traits<Derived>::Scalar,
-                                            typename NumTraits<typename internal::traits<Derived>::Scalar>::Real,
-                                            DenseCoeffsBase<Derived> >
+    : public DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value>
 #else
-  : public DenseCoeffsBase<Derived>
-#endif // not EIGEN_PARSED_BY_DOXYGEN
+    : public DenseCoeffsBase<Derived, DirectWriteAccessors>
+#endif  // not EIGEN_PARSED_BY_DOXYGEN
 {
-  public:
-
-    class InnerIterator;
-
-    typedef typename internal::traits<Derived>::StorageKind StorageKind;
-
-    /** \brief The type of indices 
-      * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
-      * \sa \ref TopicPreprocessorDirectives.
-      */
-    typedef typename internal::traits<Derived>::Index Index; 
-
-    typedef typename internal::traits<Derived>::Scalar Scalar;
-    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef internal::special_scalar_op_base<Derived,Scalar,RealScalar, DenseCoeffsBase<Derived> > Base;
-
-    using Base::operator*;
-    using Base::derived;
-    using Base::const_cast_derived;
-    using Base::rows;
-    using Base::cols;
-    using Base::size;
-    using Base::rowIndexByOuterInner;
-    using Base::colIndexByOuterInner;
-    using Base::coeff;
-    using Base::coeffByOuterInner;
-    using Base::packet;
-    using Base::packetByOuterInner;
-    using Base::writePacket;
-    using Base::writePacketByOuterInner;
-    using Base::coeffRef;
-    using Base::coeffRefByOuterInner;
-    using Base::copyCoeff;
-    using Base::copyCoeffByOuterInner;
-    using Base::copyPacket;
-    using Base::copyPacketByOuterInner;
-    using Base::operator();
-    using Base::operator[];
-    using Base::x;
-    using Base::y;
-    using Base::z;
-    using Base::w;
-    using Base::stride;
-    using Base::innerStride;
-    using Base::outerStride;
-    using Base::rowStride;
-    using Base::colStride;
-    typedef typename Base::CoeffReturnType CoeffReturnType;
-
-    enum {
-
-      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
-        /**< The number of rows at compile-time. This is just a copy of the value provided
-          * by the \a Derived type. If a value is not known at compile-time,
-          * it is set to the \a Dynamic constant.
-          * \sa MatrixBase::rows(), MatrixBase::cols(), ColsAtCompileTime, SizeAtCompileTime */
-
-      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
-        /**< The number of columns at compile-time. This is just a copy of the value provided
-          * by the \a Derived type. If a value is not known at compile-time,
-          * it is set to the \a Dynamic constant.
-          * \sa MatrixBase::rows(), MatrixBase::cols(), RowsAtCompileTime, SizeAtCompileTime */
-
-
-      SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
-                                                   internal::traits<Derived>::ColsAtCompileTime>::ret),
-        /**< This is equal to the number of coefficients, i.e. the number of
-          * rows times the number of columns, or to \a Dynamic if this is not
-          * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
-
-      MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
-        /**< This value is equal to the maximum possible number of rows that this expression
-          * might have. If this expression might have an arbitrarily high number of rows,
-          * this value is set to \a Dynamic.
-          *
-          * This value is useful to know when evaluating an expression, in order to determine
-          * whether it is possible to avoid doing a dynamic memory allocation.
-          *
-          * \sa RowsAtCompileTime, MaxColsAtCompileTime, MaxSizeAtCompileTime
-          */
-
-      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
-        /**< This value is equal to the maximum possible number of columns that this expression
-          * might have. If this expression might have an arbitrarily high number of columns,
-          * this value is set to \a Dynamic.
-          *
-          * This value is useful to know when evaluating an expression, in order to determine
-          * whether it is possible to avoid doing a dynamic memory allocation.
-          *
-          * \sa ColsAtCompileTime, MaxRowsAtCompileTime, MaxSizeAtCompileTime
-          */
-
-      MaxSizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::MaxRowsAtCompileTime,
-                                                      internal::traits<Derived>::MaxColsAtCompileTime>::ret),
-        /**< This value is equal to the maximum possible number of coefficients that this expression
-          * might have. If this expression might have an arbitrarily high number of coefficients,
-          * this value is set to \a Dynamic.
-          *
-          * This value is useful to know when evaluating an expression, in order to determine
-          * whether it is possible to avoid doing a dynamic memory allocation.
-          *
-          * \sa SizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime
-          */
-
-      IsVectorAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime == 1
-                           || internal::traits<Derived>::MaxColsAtCompileTime == 1,
-        /**< This is set to true if either the number of rows or the number of
-          * columns is known at compile-time to be equal to 1. Indeed, in that case,
-          * we are dealing with a column-vector (if there is only one column) or with
-          * a row-vector (if there is only one row). */
-
-      Flags = internal::traits<Derived>::Flags,
-        /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
-          * constructed from this one. See the \ref flags "list of flags".
-          */
-
-      IsRowMajor = int(Flags) & RowMajorBit, /**< True if this expression has row-major storage order. */
-
-      InnerSizeAtCompileTime = int(IsVectorAtCompileTime) ? int(SizeAtCompileTime)
-                             : int(IsRowMajor) ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
-
-      CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
-        /**< This is a rough measure of how expensive it is to read one coefficient from
-          * this expression.
-          */
-
-      InnerStrideAtCompileTime = internal::inner_stride_at_compile_time<Derived>::ret,
-      OuterStrideAtCompileTime = internal::outer_stride_at_compile_time<Derived>::ret
-    };
-
-    enum { ThisConstantIsPrivateInPlainObjectBase };
-
-    /** \returns the number of nonzero coefficients which is in practice the number
-      * of stored coefficients. */
-    inline Index nonZeros() const { return size(); }
-
-    /** \returns the outer size.
-      *
-      * \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension
-      * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a
-      * column-major matrix, and the number of rows for a row-major matrix. */
-    Index outerSize() const
-    {
-      return IsVectorAtCompileTime ? 1
-           : int(IsRowMajor) ? this->rows() : this->cols();
-    }
-
-    /** \returns the inner size.
-      *
-      * \note For a vector, this is just the size. For a matrix (non-vector), this is the minor dimension
-      * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a 
-      * column-major matrix, and the number of columns for a row-major matrix. */
-    Index innerSize() const
-    {
-      return IsVectorAtCompileTime ? this->size()
-           : int(IsRowMajor) ? this->cols() : this->rows();
-    }
-
-    /** Only plain matrices/arrays, not expressions, may be resized; therefore the only useful resize methods are
-      * Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does
-      * nothing else.
-      */
-    void resize(Index newSize)
-    {
-      EIGEN_ONLY_USED_FOR_DEBUG(newSize);
-      eigen_assert(newSize == this->size()
-                && "DenseBase::resize() does not actually allow to resize.");
-    }
-    /** Only plain matrices/arrays, not expressions, may be resized; therefore the only useful resize methods are
-      * Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and does
-      * nothing else.
-      */
-    void resize(Index nbRows, Index nbCols)
-    {
-      EIGEN_ONLY_USED_FOR_DEBUG(nbRows);
-      EIGEN_ONLY_USED_FOR_DEBUG(nbCols);
-      eigen_assert(nbRows == this->rows() && nbCols == this->cols()
-                && "DenseBase::resize() does not actually allow to resize.");
-    }
+ public:
+  /** Inner iterator type to iterate over the coefficients of a row or column.
+   * \sa class InnerIterator
+   */
+  typedef Eigen::InnerIterator<Derived> InnerIterator;
+
+  typedef typename internal::traits<Derived>::StorageKind StorageKind;
+
+  /**
+   * \brief The type used to store indices
+   * \details This typedef is relevant for types that store multiple indices such as
+   *          PermutationMatrix or Transpositions, otherwise it defaults to Eigen::Index
+   * \sa \blank \ref TopicPreprocessorDirectives, Eigen::Index, SparseMatrixBase.
+   */
+  typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
+
+  /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex<float>, etc. */
+  typedef typename internal::traits<Derived>::Scalar Scalar;
+
+  /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex<float>, etc.
+   *
+   * It is an alias for the Scalar type */
+  typedef Scalar value_type;
+
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef DenseCoeffsBase<Derived, internal::accessors_level<Derived>::value> Base;
+
+  using Base::coeff;
+  using Base::coeffByOuterInner;
+  using Base::colIndexByOuterInner;
+  using Base::cols;
+  using Base::const_cast_derived;
+  using Base::derived;
+  using Base::rowIndexByOuterInner;
+  using Base::rows;
+  using Base::size;
+  using Base::operator();
+  using Base::operator[];
+  using Base::colStride;
+  using Base::innerStride;
+  using Base::outerStride;
+  using Base::rowStride;
+  using Base::stride;
+  using Base::w;
+  using Base::x;
+  using Base::y;
+  using Base::z;
+  typedef typename Base::CoeffReturnType CoeffReturnType;
+
+  enum {
+
+    RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+    /**< The number of rows at compile-time. This is just a copy of the value provided
+     * by the \a Derived type. If a value is not known at compile-time,
+     * it is set to the \a Dynamic constant.
+     * \sa MatrixBase::rows(), MatrixBase::cols(), ColsAtCompileTime, SizeAtCompileTime */
+
+    ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+    /**< The number of columns at compile-time. This is just a copy of the value provided
+     * by the \a Derived type. If a value is not known at compile-time,
+     * it is set to the \a Dynamic constant.
+     * \sa MatrixBase::rows(), MatrixBase::cols(), RowsAtCompileTime, SizeAtCompileTime */
+
+    SizeAtCompileTime = (internal::size_of_xpr_at_compile_time<Derived>::ret),
+    /**< This is equal to the number of coefficients, i.e. the number of
+     * rows times the number of columns, or to \a Dynamic if this is not
+     * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
+
+    MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
+    /**< This value is equal to the maximum possible number of rows that this expression
+     * might have. If this expression might have an arbitrarily high number of rows,
+     * this value is set to \a Dynamic.
+     *
+     * This value is useful to know when evaluating an expression, in order to determine
+     * whether it is possible to avoid doing a dynamic memory allocation.
+     *
+     * \sa RowsAtCompileTime, MaxColsAtCompileTime, MaxSizeAtCompileTime
+     */
+
+    MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
+    /**< This value is equal to the maximum possible number of columns that this expression
+     * might have. If this expression might have an arbitrarily high number of columns,
+     * this value is set to \a Dynamic.
+     *
+     * This value is useful to know when evaluating an expression, in order to determine
+     * whether it is possible to avoid doing a dynamic memory allocation.
+     *
+     * \sa ColsAtCompileTime, MaxRowsAtCompileTime, MaxSizeAtCompileTime
+     */
+
+    MaxSizeAtCompileTime = internal::size_at_compile_time(internal::traits<Derived>::MaxRowsAtCompileTime,
+                                                          internal::traits<Derived>::MaxColsAtCompileTime),
+    /**< This value is equal to the maximum possible number of coefficients that this expression
+     * might have. If this expression might have an arbitrarily high number of coefficients,
+     * this value is set to \a Dynamic.
+     *
+     * This value is useful to know when evaluating an expression, in order to determine
+     * whether it is possible to avoid doing a dynamic memory allocation.
+     *
+     * \sa SizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime
+     */
+
+    IsVectorAtCompileTime =
+        internal::traits<Derived>::RowsAtCompileTime == 1 || internal::traits<Derived>::ColsAtCompileTime == 1,
+    /**< This is set to true if either the number of rows or the number of
+     * columns is known at compile-time to be equal to 1. Indeed, in that case,
+     * we are dealing with a column-vector (if there is only one column) or with
+     * a row-vector (if there is only one row). */
+
+    NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0
+                    : bool(IsVectorAtCompileTime)  ? 1
+                                                   : 2,
+    /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors,
+     * and 2 for matrices.
+     */
+
+    Flags = internal::traits<Derived>::Flags,
+    /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
+     * constructed from this one. See the \ref flags "list of flags".
+     */
+
+    IsRowMajor = int(Flags) & RowMajorBit, /**< True if this expression has row-major storage order. */
+
+    InnerSizeAtCompileTime = int(IsVectorAtCompileTime) ? int(SizeAtCompileTime)
+                             : int(IsRowMajor)          ? int(ColsAtCompileTime)
+                                                        : int(RowsAtCompileTime),
+
+    InnerStrideAtCompileTime = internal::inner_stride_at_compile_time<Derived>::ret,
+    OuterStrideAtCompileTime = internal::outer_stride_at_compile_time<Derived>::ret
+  };
+
+  typedef typename internal::find_best_packet<Scalar, SizeAtCompileTime>::type PacketScalar;
+
+  enum { IsPlainObjectBase = 0 };
+
+  /** The plain matrix type corresponding to this expression.
+   * \sa PlainObject */
+  typedef Matrix<typename internal::traits<Derived>::Scalar, internal::traits<Derived>::RowsAtCompileTime,
+                 internal::traits<Derived>::ColsAtCompileTime,
+                 AutoAlign | (internal::traits<Derived>::Flags & RowMajorBit ? RowMajor : ColMajor),
+                 internal::traits<Derived>::MaxRowsAtCompileTime, internal::traits<Derived>::MaxColsAtCompileTime>
+      PlainMatrix;
+
+  /** The plain array type corresponding to this expression.
+   * \sa PlainObject */
+  typedef Array<typename internal::traits<Derived>::Scalar, internal::traits<Derived>::RowsAtCompileTime,
+                internal::traits<Derived>::ColsAtCompileTime,
+                AutoAlign | (internal::traits<Derived>::Flags & RowMajorBit ? RowMajor : ColMajor),
+                internal::traits<Derived>::MaxRowsAtCompileTime, internal::traits<Derived>::MaxColsAtCompileTime>
+      PlainArray;
+
+  /** \brief The plain matrix or array type corresponding to this expression.
+   *
+   * This is not necessarily exactly the return type of eval(). In the case of plain matrices,
+   * the return type of eval() is a const reference to a matrix, not a matrix! It is however guaranteed
+   * that the return type of eval() is either PlainObject or const PlainObject&.
+   */
+  typedef std::conditional_t<internal::is_same<typename internal::traits<Derived>::XprKind, MatrixXpr>::value,
+                             PlainMatrix, PlainArray>
+      PlainObject;
+
+  /** \returns the outer size.
+   *
+   * \note For a vector, this returns just 1. For a matrix (non-vector), this is the major dimension
+   * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of columns for a
+   * column-major matrix, and the number of rows for a row-major matrix. */
+  EIGEN_DEVICE_FUNC constexpr Index outerSize() const {
+    return IsVectorAtCompileTime ? 1 : int(IsRowMajor) ? this->rows() : this->cols();
+  }
+
+  /** \returns the inner size.
+   *
+   * \note For a vector, this is just the size. For a matrix (non-vector), this is the minor dimension
+   * with respect to the \ref TopicStorageOrders "storage order", i.e., the number of rows for a
+   * column-major matrix, and the number of columns for a row-major matrix. */
+  EIGEN_DEVICE_FUNC constexpr Index innerSize() const {
+    return IsVectorAtCompileTime ? this->size() : int(IsRowMajor) ? this->cols() : this->rows();
+  }
+
+  /** Only plain matrices/arrays, not expressions, may be resized; therefore the only useful resize methods are
+   * Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and
+   * does nothing else.
+   */
+  EIGEN_DEVICE_FUNC void resize(Index newSize) {
+    EIGEN_ONLY_USED_FOR_DEBUG(newSize);
+    eigen_assert(newSize == this->size() && "DenseBase::resize() does not actually allow to resize.");
+  }
+  /** Only plain matrices/arrays, not expressions, may be resized; therefore the only useful resize methods are
+   * Matrix::resize() and Array::resize(). The present method only asserts that the new size equals the old size, and
+   * does nothing else.
+   */
+  EIGEN_DEVICE_FUNC void resize(Index rows, Index cols) {
+    EIGEN_ONLY_USED_FOR_DEBUG(rows);
+    EIGEN_ONLY_USED_FOR_DEBUG(cols);
+    eigen_assert(rows == this->rows() && cols == this->cols() &&
+                 "DenseBase::resize() does not actually allow to resize.");
+  }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
+  /** \internal Represents a matrix with all coefficients equal to one another*/
+  typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> ConstantReturnType;
+  /** \internal Represents a matrix with all coefficients equal to zero*/
+  typedef CwiseNullaryOp<internal::scalar_zero_op<Scalar>, PlainObject> ZeroReturnType;
+  /** \internal \deprecated Represents a vector with linearly spaced coefficients that allows sequential access only. */
+  EIGEN_DEPRECATED typedef CwiseNullaryOp<internal::linspaced_op<Scalar>, PlainObject> SequentialLinSpacedReturnType;
+  /** \internal Represents a vector with linearly spaced coefficients that allows random access. */
+  typedef CwiseNullaryOp<internal::linspaced_op<Scalar>, PlainObject> RandomAccessLinSpacedReturnType;
+  /** \internal Represents a vector with equally spaced coefficients that allows random access. */
+  typedef CwiseNullaryOp<internal::equalspaced_op<Scalar>, PlainObject> RandomAccessEqualSpacedReturnType;
+  /** \internal the return type of MatrixBase::eigenvalues() */
+  typedef Matrix<typename NumTraits<typename internal::traits<Derived>::Scalar>::Real,
+                 internal::traits<Derived>::ColsAtCompileTime, 1>
+      EigenvaluesReturnType;
+
+#endif  // not EIGEN_PARSED_BY_DOXYGEN
+
+  /** Copies \a other into *this. \returns a reference to *this. */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other);
+
+  /** Special case of the template operator=, in order to prevent the compiler
+   * from generating a default operator= (issue hit with g++ 4.1)
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase& other);
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC Derived& operator=(const EigenBase<OtherDerived>& other);
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC Derived& operator+=(const EigenBase<OtherDerived>& other);
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC Derived& operator-=(const EigenBase<OtherDerived>& other);
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC Derived& operator=(const ReturnByValue<OtherDerived>& func);
+
+  /** \internal
+   * Copies \a other into *this without evaluating other. \returns a reference to *this. */
+  template <typename OtherDerived>
+  /** \deprecated */
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC Derived& lazyAssign(const DenseBase<OtherDerived>& other);
+
+  EIGEN_DEVICE_FUNC CommaInitializer<Derived> operator<<(const Scalar& s);
+
+  template <unsigned int Added, unsigned int Removed>
+  /** \deprecated it now returns \c *this */
+  EIGEN_DEPRECATED const Derived& flagged() const {
+    return derived();
+  }
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC CommaInitializer<Derived> operator<<(const DenseBase<OtherDerived>& other);
+
+  typedef Transpose<Derived> TransposeReturnType;
+  EIGEN_DEVICE_FUNC TransposeReturnType transpose();
+  typedef Transpose<const Derived> ConstTransposeReturnType;
+  EIGEN_DEVICE_FUNC const ConstTransposeReturnType transpose() const;
+  EIGEN_DEVICE_FUNC void transposeInPlace();
+
+  EIGEN_DEVICE_FUNC static const ConstantReturnType Constant(Index rows, Index cols, const Scalar& value);
+  EIGEN_DEVICE_FUNC static const ConstantReturnType Constant(Index size, const Scalar& value);
+  EIGEN_DEVICE_FUNC static const ConstantReturnType Constant(const Scalar& value);
+
+  EIGEN_DEPRECATED_WITH_REASON("The method may result in accuracy loss. Use .EqualSpaced() instead.")
+  EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Sequential_t, Index size, const Scalar& low,
+                                                                           const Scalar& high);
+  EIGEN_DEPRECATED_WITH_REASON("The method may result in accuracy loss. Use .EqualSpaced() instead.")
+  EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Sequential_t, const Scalar& low,
+                                                                           const Scalar& high);
+
+  EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(Index size, const Scalar& low,
+                                                                           const Scalar& high);
+  EIGEN_DEVICE_FUNC static const RandomAccessLinSpacedReturnType LinSpaced(const Scalar& low, const Scalar& high);
+
+  EIGEN_DEVICE_FUNC static const RandomAccessEqualSpacedReturnType EqualSpaced(Index size, const Scalar& low,
+                                                                               const Scalar& step);
+  EIGEN_DEVICE_FUNC static const RandomAccessEqualSpacedReturnType EqualSpaced(const Scalar& low, const Scalar& step);
+
+  template <typename CustomNullaryOp>
+  EIGEN_DEVICE_FUNC static const CwiseNullaryOp<CustomNullaryOp, PlainObject> NullaryExpr(Index rows, Index cols,
+                                                                                          const CustomNullaryOp& func);
+  template <typename CustomNullaryOp>
+  EIGEN_DEVICE_FUNC static const CwiseNullaryOp<CustomNullaryOp, PlainObject> NullaryExpr(Index size,
+                                                                                          const CustomNullaryOp& func);
+  template <typename CustomNullaryOp>
+  EIGEN_DEVICE_FUNC static const CwiseNullaryOp<CustomNullaryOp, PlainObject> NullaryExpr(const CustomNullaryOp& func);
+
+  EIGEN_DEVICE_FUNC static const ZeroReturnType Zero(Index rows, Index cols);
+  EIGEN_DEVICE_FUNC static const ZeroReturnType Zero(Index size);
+  EIGEN_DEVICE_FUNC static const ZeroReturnType Zero();
+  EIGEN_DEVICE_FUNC static const ConstantReturnType Ones(Index rows, Index cols);
+  EIGEN_DEVICE_FUNC static const ConstantReturnType Ones(Index size);
+  EIGEN_DEVICE_FUNC static const ConstantReturnType Ones();
+
+  EIGEN_DEVICE_FUNC void fill(const Scalar& value);
+  EIGEN_DEVICE_FUNC Derived& setConstant(const Scalar& value);
+  EIGEN_DEVICE_FUNC Derived& setLinSpaced(Index size, const Scalar& low, const Scalar& high);
+  EIGEN_DEVICE_FUNC Derived& setLinSpaced(const Scalar& low, const Scalar& high);
+  EIGEN_DEVICE_FUNC Derived& setEqualSpaced(Index size, const Scalar& low, const Scalar& step);
+  EIGEN_DEVICE_FUNC Derived& setEqualSpaced(const Scalar& low, const Scalar& step);
+  EIGEN_DEVICE_FUNC Derived& setZero();
+  EIGEN_DEVICE_FUNC Derived& setOnes();
+  EIGEN_DEVICE_FUNC Derived& setRandom();
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC bool isApprox(const DenseBase<OtherDerived>& other,
+                                  const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+  EIGEN_DEVICE_FUNC bool isMuchSmallerThan(const RealScalar& other,
+                                           const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC bool isMuchSmallerThan(const DenseBase<OtherDerived>& other,
+                                           const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+
+  EIGEN_DEVICE_FUNC bool isApproxToConstant(const Scalar& value,
+                                            const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+  EIGEN_DEVICE_FUNC bool isConstant(const Scalar& value,
+                                    const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+  EIGEN_DEVICE_FUNC bool isZero(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+  EIGEN_DEVICE_FUNC bool isOnes(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+
+  EIGEN_DEVICE_FUNC inline bool hasNaN() const;
+  EIGEN_DEVICE_FUNC inline bool allFinite() const;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator*=(const Scalar& other);
+  template <bool Enable = !internal::is_same<Scalar, RealScalar>::value, typename = std::enable_if_t<Enable>>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator*=(const RealScalar& other);
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator/=(const Scalar& other);
+  template <bool Enable = !internal::is_same<Scalar, RealScalar>::value, typename = std::enable_if_t<Enable>>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator/=(const RealScalar& other);
+
+  typedef internal::add_const_on_value_type_t<typename internal::eval<Derived>::type> EvalReturnType;
+  /** \returns the matrix or vector obtained by evaluating this expression.
+   *
+   * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
+   * a const reference, in order to avoid a useless copy.
+   *
+   * \warning Be careful with eval() and the auto C++ keyword, as detailed in this \link TopicPitfalls_auto_keyword page
+   * \endlink.
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvalReturnType eval() const {
+    // Even though MSVC does not honor strong inlining when the return type
+    // is a dynamic matrix, we desperately need strong inlining for fixed
+    // size types on MSVC.
+    return typename internal::eval<Derived>::type(derived());
+  }
+
+  /** swaps *this with the expression \a other.
+   *
+   */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(const DenseBase<OtherDerived>& other) {
+    EIGEN_STATIC_ASSERT(!OtherDerived::IsPlainObjectBase, THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+    eigen_assert(rows() == other.rows() && cols() == other.cols());
+    call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
+  }
+
+  /** swaps *this with the matrix or array \a other.
+   *
+   */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(PlainObjectBase<OtherDerived>& other) {
+    eigen_assert(rows() == other.rows() && cols() == other.cols());
+    call_assignment(derived(), other.derived(), internal::swap_assign_op<Scalar>());
+  }
+
+  EIGEN_DEVICE_FUNC inline const NestByValue<Derived> nestByValue() const;
+  EIGEN_DEVICE_FUNC inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
+  EIGEN_DEVICE_FUNC inline ForceAlignedAccess<Derived> forceAlignedAccess();
+  template <bool Enable>
+  EIGEN_DEVICE_FUNC inline const std::conditional_t<Enable, ForceAlignedAccess<Derived>, Derived&>
+  forceAlignedAccessIf() const;
+  template <bool Enable>
+  EIGEN_DEVICE_FUNC inline std::conditional_t<Enable, ForceAlignedAccess<Derived>, Derived&> forceAlignedAccessIf();
+
+  EIGEN_DEVICE_FUNC Scalar sum() const;
+  EIGEN_DEVICE_FUNC Scalar mean() const;
+  EIGEN_DEVICE_FUNC Scalar trace() const;
+
+  EIGEN_DEVICE_FUNC Scalar prod() const;
+
+  template <int NaNPropagation>
+  EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar minCoeff() const;
+  template <int NaNPropagation>
+  EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar maxCoeff() const;
+
+  // By default, the fastest version with undefined NaN propagation semantics is
+  // used.
+  // TODO(rmlarsen): Replace with default template argument when we move to
+  // c++11 or beyond.
+  EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar minCoeff() const {
+    return minCoeff<PropagateFast>();
+  }
+  EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar maxCoeff() const {
+    return maxCoeff<PropagateFast>();
+  }
+
+  template <int NaNPropagation, typename IndexType>
+  EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const;
+  template <int NaNPropagation, typename IndexType>
+  EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const;
+  template <int NaNPropagation, typename IndexType>
+  EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const;
+  template <int NaNPropagation, typename IndexType>
+  EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const;
+
+  // TODO(rmlarsen): Replace these methods with a default template argument.
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const {
+    return minCoeff<PropagateFast>(row, col);
+  }
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const {
+    return maxCoeff<PropagateFast>(row, col);
+  }
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const {
+    return minCoeff<PropagateFast>(index);
+  }
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const {
+    return maxCoeff<PropagateFast>(index);
+  }
+
+  template <typename BinaryOp>
+  EIGEN_DEVICE_FUNC Scalar redux(const BinaryOp& func) const;
+
+  template <typename Visitor>
+  EIGEN_DEVICE_FUNC void visit(Visitor& func) const;
+
+  /** \returns a WithFormat proxy object allowing to print a matrix the with given
+   * format \a fmt.
+   *
+   * See class IOFormat for some examples.
+   *
+   * \sa class IOFormat, class WithFormat
+   */
+  inline const WithFormat<Derived> format(const IOFormat& fmt) const { return WithFormat<Derived>(derived(), fmt); }
+
+  /** \returns the unique coefficient of a 1x1 expression */
+  EIGEN_DEVICE_FUNC CoeffReturnType value() const {
+    EIGEN_STATIC_ASSERT_SIZE_1x1(Derived) eigen_assert(this->rows() == 1 && this->cols() == 1);
+    return derived().coeff(0, 0);
+  }
+
+  EIGEN_DEVICE_FUNC bool all() const;
+  EIGEN_DEVICE_FUNC bool any() const;
+  EIGEN_DEVICE_FUNC Index count() const;
+
+  typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
+  typedef const VectorwiseOp<const Derived, Horizontal> ConstRowwiseReturnType;
+  typedef VectorwiseOp<Derived, Vertical> ColwiseReturnType;
+  typedef const VectorwiseOp<const Derived, Vertical> ConstColwiseReturnType;
+
+  /** \returns a VectorwiseOp wrapper of *this for broadcasting and partial reductions
+   *
+   * Example: \include MatrixBase_rowwise.cpp
+   * Output: \verbinclude MatrixBase_rowwise.out
+   *
+   * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+   */
+  // Code moved here due to a CUDA compiler bug
+  EIGEN_DEVICE_FUNC inline ConstRowwiseReturnType rowwise() const { return ConstRowwiseReturnType(derived()); }
+  EIGEN_DEVICE_FUNC RowwiseReturnType rowwise();
+
+  /** \returns a VectorwiseOp wrapper of *this broadcasting and partial reductions
+   *
+   * Example: \include MatrixBase_colwise.cpp
+   * Output: \verbinclude MatrixBase_colwise.out
+   *
+   * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+   */
+  EIGEN_DEVICE_FUNC inline ConstColwiseReturnType colwise() const { return ConstColwiseReturnType(derived()); }
+  EIGEN_DEVICE_FUNC ColwiseReturnType colwise();
+
+  typedef CwiseNullaryOp<internal::scalar_random_op<Scalar>, PlainObject> RandomReturnType;
+  static const RandomReturnType Random(Index rows, Index cols);
+  static const RandomReturnType Random(Index size);
+  static const RandomReturnType Random();
+
+  template <typename ThenDerived, typename ElseDerived>
+  inline EIGEN_DEVICE_FUNC
+      CwiseTernaryOp<internal::scalar_boolean_select_op<typename DenseBase<ThenDerived>::Scalar,
+                                                        typename DenseBase<ElseDerived>::Scalar, Scalar>,
+                     ThenDerived, ElseDerived, Derived>
+      select(const DenseBase<ThenDerived>& thenMatrix, const DenseBase<ElseDerived>& elseMatrix) const;
+
+  template <typename ThenDerived>
+  inline EIGEN_DEVICE_FUNC
+      CwiseTernaryOp<internal::scalar_boolean_select_op<typename DenseBase<ThenDerived>::Scalar,
+                                                        typename DenseBase<ThenDerived>::Scalar, Scalar>,
+                     ThenDerived, typename DenseBase<ThenDerived>::ConstantReturnType, Derived>
+      select(const DenseBase<ThenDerived>& thenMatrix, const typename DenseBase<ThenDerived>::Scalar& elseScalar) const;
+
+  template <typename ElseDerived>
+  inline EIGEN_DEVICE_FUNC
+      CwiseTernaryOp<internal::scalar_boolean_select_op<typename DenseBase<ElseDerived>::Scalar,
+                                                        typename DenseBase<ElseDerived>::Scalar, Scalar>,
+                     typename DenseBase<ElseDerived>::ConstantReturnType, ElseDerived, Derived>
+      select(const typename DenseBase<ElseDerived>::Scalar& thenScalar, const DenseBase<ElseDerived>& elseMatrix) const;
+
+  template <int p>
+  RealScalar lpNorm() const;
+
+  template <int RowFactor, int ColFactor>
+  EIGEN_DEVICE_FUNC const Replicate<Derived, RowFactor, ColFactor> replicate() const;
+  /**
+   * \return an expression of the replication of \c *this
+   *
+   * Example: \include MatrixBase_replicate_int_int.cpp
+   * Output: \verbinclude MatrixBase_replicate_int_int.out
+   *
+   * \sa VectorwiseOp::replicate(), DenseBase::replicate<int,int>(), class Replicate
+   */
+  // Code moved here due to a CUDA compiler bug
+  EIGEN_DEVICE_FUNC const Replicate<Derived, Dynamic, Dynamic> replicate(Index rowFactor, Index colFactor) const {
+    return Replicate<Derived, Dynamic, Dynamic>(derived(), rowFactor, colFactor);
+  }
+
+  typedef Reverse<Derived, BothDirections> ReverseReturnType;
+  typedef const Reverse<const Derived, BothDirections> ConstReverseReturnType;
+  EIGEN_DEVICE_FUNC ReverseReturnType reverse();
+  /** This is the const version of reverse(). */
+  // Code moved here due to a CUDA compiler bug
+  EIGEN_DEVICE_FUNC ConstReverseReturnType reverse() const { return ConstReverseReturnType(derived()); }
+  EIGEN_DEVICE_FUNC void reverseInPlace();
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  /** STL-like <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">RandomAccessIterator</a>
+   * iterator type as returned by the begin() and end() methods.
+   */
+  typedef random_access_iterator_type iterator;
+  /** This is the const version of iterator (aka read-only) */
+  typedef random_access_iterator_type const_iterator;
+#else
+  typedef std::conditional_t<(Flags & DirectAccessBit) == DirectAccessBit,
+                             internal::pointer_based_stl_iterator<Derived>,
+                             internal::generic_randaccess_stl_iterator<Derived> >
+      iterator_type;
 
-    /** \internal Represents a matrix with all coefficients equal to one another*/
-    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
-    /** \internal Represents a vector with linearly spaced coefficients that allows sequential access only. */
-    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,false>,Derived> SequentialLinSpacedReturnType;
-    /** \internal Represents a vector with linearly spaced coefficients that allows random access. */
-    typedef CwiseNullaryOp<internal::linspaced_op<Scalar,true>,Derived> RandomAccessLinSpacedReturnType;
-    /** \internal the return type of MatrixBase::eigenvalues() */
-    typedef Matrix<typename NumTraits<typename internal::traits<Derived>::Scalar>::Real, internal::traits<Derived>::ColsAtCompileTime, 1> EigenvaluesReturnType;
-
-#endif // not EIGEN_PARSED_BY_DOXYGEN
-
-    /** Copies \a other into *this. \returns a reference to *this. */
-    template<typename OtherDerived>
-    Derived& operator=(const DenseBase<OtherDerived>& other);
-
-    /** Special case of the template operator=, in order to prevent the compiler
-      * from generating a default operator= (issue hit with g++ 4.1)
-      */
-    Derived& operator=(const DenseBase& other);
-
-    template<typename OtherDerived>
-    Derived& operator=(const EigenBase<OtherDerived> &other);
-
-    template<typename OtherDerived>
-    Derived& operator+=(const EigenBase<OtherDerived> &other);
-
-    template<typename OtherDerived>
-    Derived& operator-=(const EigenBase<OtherDerived> &other);
-
-    template<typename OtherDerived>
-    Derived& operator=(const ReturnByValue<OtherDerived>& func);
-
-    /** \internal Copies \a other into *this without evaluating other. \returns a reference to *this. */
-    template<typename OtherDerived>
-    Derived& lazyAssign(const DenseBase<OtherDerived>& other);
-
-    /** \internal Evaluates \a other into *this. \returns a reference to *this. */
-    template<typename OtherDerived>
-    Derived& lazyAssign(const ReturnByValue<OtherDerived>& other);
-
-    CommaInitializer<Derived> operator<< (const Scalar& s);
+  typedef std::conditional_t<(Flags & DirectAccessBit) == DirectAccessBit,
+                             internal::pointer_based_stl_iterator<const Derived>,
+                             internal::generic_randaccess_stl_iterator<const Derived> >
+      const_iterator_type;
 
-    template<unsigned int Added,unsigned int Removed>
-    const Flagged<Derived, Added, Removed> flagged() const;
+  // Stl-style iterators are supported only for vectors.
 
-    template<typename OtherDerived>
-    CommaInitializer<Derived> operator<< (const DenseBase<OtherDerived>& other);
+  typedef std::conditional_t<IsVectorAtCompileTime, iterator_type, void> iterator;
 
-    Eigen::Transpose<Derived> transpose();
-	typedef typename internal::add_const<Transpose<const Derived> >::type ConstTransposeReturnType;
-    ConstTransposeReturnType transpose() const;
-    void transposeInPlace();
-#ifndef EIGEN_NO_DEBUG
-  protected:
-    template<typename OtherDerived>
-    void checkTransposeAliasing(const OtherDerived& other) const;
-  public:
+  typedef std::conditional_t<IsVectorAtCompileTime, const_iterator_type, void> const_iterator;
 #endif
 
+  inline iterator begin();
+  inline const_iterator begin() const;
+  inline const_iterator cbegin() const;
+  inline iterator end();
+  inline const_iterator end() const;
+  inline const_iterator cend() const;
+
+  using RealViewReturnType = std::conditional_t<NumTraits<Scalar>::IsComplex, RealView<Derived>, Derived&>;
+  using ConstRealViewReturnType =
+      std::conditional_t<NumTraits<Scalar>::IsComplex, RealView<const Derived>, const Derived&>;
 
-    static const ConstantReturnType
-    Constant(Index rows, Index cols, const Scalar& value);
-    static const ConstantReturnType
-    Constant(Index size, const Scalar& value);
-    static const ConstantReturnType
-    Constant(const Scalar& value);
-
-    static const SequentialLinSpacedReturnType
-    LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high);
-    static const RandomAccessLinSpacedReturnType
-    LinSpaced(Index size, const Scalar& low, const Scalar& high);
-    static const SequentialLinSpacedReturnType
-    LinSpaced(Sequential_t, const Scalar& low, const Scalar& high);
-    static const RandomAccessLinSpacedReturnType
-    LinSpaced(const Scalar& low, const Scalar& high);
-
-    template<typename CustomNullaryOp>
-    static const CwiseNullaryOp<CustomNullaryOp, Derived>
-    NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func);
-    template<typename CustomNullaryOp>
-    static const CwiseNullaryOp<CustomNullaryOp, Derived>
-    NullaryExpr(Index size, const CustomNullaryOp& func);
-    template<typename CustomNullaryOp>
-    static const CwiseNullaryOp<CustomNullaryOp, Derived>
-    NullaryExpr(const CustomNullaryOp& func);
-
-    static const ConstantReturnType Zero(Index rows, Index cols);
-    static const ConstantReturnType Zero(Index size);
-    static const ConstantReturnType Zero();
-    static const ConstantReturnType Ones(Index rows, Index cols);
-    static const ConstantReturnType Ones(Index size);
-    static const ConstantReturnType Ones();
-
-    void fill(const Scalar& value);
-    Derived& setConstant(const Scalar& value);
-    Derived& setLinSpaced(Index size, const Scalar& low, const Scalar& high);
-    Derived& setLinSpaced(const Scalar& low, const Scalar& high);
-    Derived& setZero();
-    Derived& setOnes();
-    Derived& setRandom();
-
-    template<typename OtherDerived>
-    bool isApprox(const DenseBase<OtherDerived>& other,
-                  const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    bool isMuchSmallerThan(const RealScalar& other,
-                           const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    template<typename OtherDerived>
-    bool isMuchSmallerThan(const DenseBase<OtherDerived>& other,
-                           const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-
-    bool isApproxToConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    bool isConstant(const Scalar& value, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    bool isZero(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    bool isOnes(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    
-    inline bool hasNaN() const;
-    inline bool allFinite() const;
-
-    inline Derived& operator*=(const Scalar& other);
-    inline Derived& operator/=(const Scalar& other);
-
-    typedef typename internal::add_const_on_value_type<typename internal::eval<Derived>::type>::type EvalReturnType;
-    /** \returns the matrix or vector obtained by evaluating this expression.
-      *
-      * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
-      * a const reference, in order to avoid a useless copy.
-      */
-    EIGEN_STRONG_INLINE EvalReturnType eval() const
-    {
-      // Even though MSVC does not honor strong inlining when the return type
-      // is a dynamic matrix, we desperately need strong inlining for fixed
-      // size types on MSVC.
-      return typename internal::eval<Derived>::type(derived());
-    }
-
-    /** swaps *this with the expression \a other.
-      *
-      */
-    template<typename OtherDerived>
-    void swap(const DenseBase<OtherDerived>& other,
-              int = OtherDerived::ThisConstantIsPrivateInPlainObjectBase)
-    {
-      SwapWrapper<Derived>(derived()).lazyAssign(other.derived());
-    }
-
-    /** swaps *this with the matrix or array \a other.
-      *
-      */
-    template<typename OtherDerived>
-    void swap(PlainObjectBase<OtherDerived>& other)
-    {
-      SwapWrapper<Derived>(derived()).lazyAssign(other.derived());
-    }
-
-
-    inline const NestByValue<Derived> nestByValue() const;
-    inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
-    inline ForceAlignedAccess<Derived> forceAlignedAccess();
-    template<bool Enable> inline const typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf() const;
-    template<bool Enable> inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();
-
-    Scalar sum() const;
-    Scalar mean() const;
-    Scalar trace() const;
-
-    Scalar prod() const;
-
-    typename internal::traits<Derived>::Scalar minCoeff() const;
-    typename internal::traits<Derived>::Scalar maxCoeff() const;
-
-    template<typename IndexType>
-    typename internal::traits<Derived>::Scalar minCoeff(IndexType* row, IndexType* col) const;
-    template<typename IndexType>
-    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* row, IndexType* col) const;
-    template<typename IndexType>
-    typename internal::traits<Derived>::Scalar minCoeff(IndexType* index) const;
-    template<typename IndexType>
-    typename internal::traits<Derived>::Scalar maxCoeff(IndexType* index) const;
-
-    template<typename BinaryOp>
-    typename internal::result_of<BinaryOp(typename internal::traits<Derived>::Scalar)>::type
-    redux(const BinaryOp& func) const;
-
-    template<typename Visitor>
-    void visit(Visitor& func) const;
-
-    inline const WithFormat<Derived> format(const IOFormat& fmt) const;
-
-    /** \returns the unique coefficient of a 1x1 expression */
-    CoeffReturnType value() const
-    {
-      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
-      eigen_assert(this->rows() == 1 && this->cols() == 1);
-      return derived().coeff(0,0);
-    }
-
-    bool all(void) const;
-    bool any(void) const;
-    Index count() const;
-
-    typedef VectorwiseOp<Derived, Horizontal> RowwiseReturnType;
-    typedef const VectorwiseOp<const Derived, Horizontal> ConstRowwiseReturnType;
-    typedef VectorwiseOp<Derived, Vertical> ColwiseReturnType;
-    typedef const VectorwiseOp<const Derived, Vertical> ConstColwiseReturnType;
-
-    ConstRowwiseReturnType rowwise() const;
-    RowwiseReturnType rowwise();
-    ConstColwiseReturnType colwise() const;
-    ColwiseReturnType colwise();
-
-    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random(Index rows, Index cols);
-    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random(Index size);
-    static const CwiseNullaryOp<internal::scalar_random_op<Scalar>,Derived> Random();
-
-    template<typename ThenDerived,typename ElseDerived>
-    const Select<Derived,ThenDerived,ElseDerived>
-    select(const DenseBase<ThenDerived>& thenMatrix,
-           const DenseBase<ElseDerived>& elseMatrix) const;
-
-    template<typename ThenDerived>
-    inline const Select<Derived,ThenDerived, typename ThenDerived::ConstantReturnType>
-    select(const DenseBase<ThenDerived>& thenMatrix, const typename ThenDerived::Scalar& elseScalar) const;
-
-    template<typename ElseDerived>
-    inline const Select<Derived, typename ElseDerived::ConstantReturnType, ElseDerived >
-    select(const typename ElseDerived::Scalar& thenScalar, const DenseBase<ElseDerived>& elseMatrix) const;
-
-    template<int p> RealScalar lpNorm() const;
-
-    template<int RowFactor, int ColFactor>
-    inline const Replicate<Derived,RowFactor,ColFactor> replicate() const;
-    
-    typedef Replicate<Derived,Dynamic,Dynamic> ReplicateReturnType;
-    inline const ReplicateReturnType replicate(Index rowFacor,Index colFactor) const;
-
-    typedef Reverse<Derived, BothDirections> ReverseReturnType;
-    typedef const Reverse<const Derived, BothDirections> ConstReverseReturnType;
-    ReverseReturnType reverse();
-    ConstReverseReturnType reverse() const;
-    void reverseInPlace();
+  EIGEN_DEVICE_FUNC RealViewReturnType realView();
+  EIGEN_DEVICE_FUNC ConstRealViewReturnType realView() const;
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::DenseBase
-#   include "../plugins/BlockMethods.h"
-#   ifdef EIGEN_DENSEBASE_PLUGIN
-#     include EIGEN_DENSEBASE_PLUGIN
-#   endif
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
+#define EIGEN_DOC_UNARY_ADDONS(X, Y)
+#include "../plugins/CommonCwiseUnaryOps.inc"
+#include "../plugins/BlockMethods.inc"
+#include "../plugins/IndexedViewMethods.inc"
+#include "../plugins/ReshapedMethods.inc"
+#ifdef EIGEN_DENSEBASE_PLUGIN
+#include EIGEN_DENSEBASE_PLUGIN
+#endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
-
-#ifdef EIGEN2_SUPPORT
-
-    Block<Derived> corner(CornerType type, Index cRows, Index cCols);
-    const Block<Derived> corner(CornerType type, Index cRows, Index cCols) const;
-    template<int CRows, int CCols>
-    Block<Derived, CRows, CCols> corner(CornerType type);
-    template<int CRows, int CCols>
-    const Block<Derived, CRows, CCols> corner(CornerType type) const;
-
-#endif // EIGEN2_SUPPORT
-
-
-    // disable the use of evalTo for dense objects with a nice compilation error
-    template<typename Dest> inline void evalTo(Dest& ) const
-    {
-      EIGEN_STATIC_ASSERT((internal::is_same<Dest,void>::value),THE_EVAL_EVALTO_FUNCTION_SHOULD_NEVER_BE_CALLED_FOR_DENSE_OBJECTS);
-    }
-
-  protected:
-    /** Default constructor. Do nothing. */
-    DenseBase()
-    {
-      /* Just checks for self-consistency of the flags.
-       * Only do it when debugging Eigen, as this borders on paranoiac and could slow compilation down
-       */
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
+#undef EIGEN_DOC_UNARY_ADDONS
+
+  // disable the use of evalTo for dense objects with a nice compilation error
+  template <typename Dest>
+  EIGEN_DEVICE_FUNC inline void evalTo(Dest&) const {
+    EIGEN_STATIC_ASSERT((internal::is_same<Dest, void>::value),
+                        THE_EVAL_EVALTO_FUNCTION_SHOULD_NEVER_BE_CALLED_FOR_DENSE_OBJECTS);
+  }
+
+ protected:
+  EIGEN_DEFAULT_COPY_CONSTRUCTOR(DenseBase)
+  /** Default constructor. Do nothing. */
 #ifdef EIGEN_INTERNAL_DEBUGGING
-      EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, int(IsRowMajor))
-                        && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, int(!IsRowMajor))),
-                          INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION)
+  EIGEN_DEVICE_FUNC constexpr DenseBase() {
+    /* Just checks for self-consistency of the flags.
+     * Only do it when debugging Eigen, as this borders on paranoia and could slow compilation down
+     */
+    EIGEN_STATIC_ASSERT(
+        (internal::check_implication(MaxRowsAtCompileTime == 1 && MaxColsAtCompileTime != 1, int(IsRowMajor)) &&
+         internal::check_implication(MaxColsAtCompileTime == 1 && MaxRowsAtCompileTime != 1, int(!IsRowMajor))),
+        INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION)
+  }
+#else
+  EIGEN_DEVICE_FUNC constexpr DenseBase() = default;
 #endif
-    }
 
-  private:
-    explicit DenseBase(int);
-    DenseBase(int,int);
-    template<typename OtherDerived> explicit DenseBase(const DenseBase<OtherDerived>&);
+ private:
+  EIGEN_DEVICE_FUNC explicit DenseBase(int);
+  EIGEN_DEVICE_FUNC DenseBase(int, int);
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC explicit DenseBase(const DenseBase<OtherDerived>&);
 };
 
-} // end namespace Eigen
+/** Free-function swap.
+ */
+template <typename DerivedA, typename DerivedB>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    // Use forwarding references to capture all combinations of cv-qualified l+r-value cases.
+    std::enable_if_t<std::is_base_of<DenseBase<std::decay_t<DerivedA>>, std::decay_t<DerivedA>>::value &&
+                         std::is_base_of<DenseBase<std::decay_t<DerivedB>>, std::decay_t<DerivedB>>::value,
+                     void>
+    swap(DerivedA&& a, DerivedB&& b) {
+  a.swap(b);
+}
+
+}  // end namespace Eigen
 
-#endif // EIGEN_DENSEBASE_H
+#endif  // EIGEN_DENSEBASE_H
diff --git a/inst/include/Eigen/src/Core/DenseCoeffsBase.h b/inst/include/Eigen/src/Core/DenseCoeffsBase.h
index 3c890f21..377df574 100644
--- a/inst/include/Eigen/src/Core/DenseCoeffsBase.h
+++ b/inst/include/Eigen/src/Core/DenseCoeffsBase.h
@@ -10,745 +10,559 @@
 #ifndef EIGEN_DENSECOEFFSBASE_H
 #define EIGEN_DENSECOEFFSBASE_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
-template<typename T> struct add_const_on_value_type_if_arithmetic
-{
-  typedef typename conditional<is_arithmetic<T>::value, T, typename add_const_on_value_type<T>::type>::type type;
+template <typename T>
+struct add_const_on_value_type_if_arithmetic {
+  typedef std::conditional_t<is_arithmetic<T>::value, T, add_const_on_value_type_t<T>> type;
 };
-}
+}  // namespace internal
 
 /** \brief Base class providing read-only coefficient access to matrices and arrays.
-  * \ingroup Core_Module
-  * \tparam Derived Type of the derived class
-  * \tparam #ReadOnlyAccessors Constant indicating read-only access
-  *
-  * This class defines the \c operator() \c const function and friends, which can be used to read specific
-  * entries of a matrix or array.
-  * 
-  * \sa DenseCoeffsBase<Derived, WriteAccessors>, DenseCoeffsBase<Derived, DirectAccessors>,
-  *     \ref TopicClassHierarchy
-  */
-template<typename Derived>
-class DenseCoeffsBase<Derived,ReadOnlyAccessors> : public EigenBase<Derived>
-{
-  public:
-    typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
-    typedef typename internal::traits<Derived>::Scalar Scalar;
-    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
-
-    // Explanation for this CoeffReturnType typedef.
-    // - This is the return type of the coeff() method.
-    // - The LvalueBit means exactly that we can offer a coeffRef() method, which means exactly that we can get references
-    // to coeffs, which means exactly that we can have coeff() return a const reference (as opposed to returning a value).
-    // - The is_artihmetic check is required since "const int", "const double", etc. will cause warnings on some systems
-    // while the declaration of "const T", where T is a non arithmetic type does not. Always returning "const Scalar&" is
-    // not possible, since the underlying expressions might not offer a valid address the reference could be referring to.
-    typedef typename internal::conditional<bool(internal::traits<Derived>::Flags&LvalueBit),
-                         const Scalar&,
-                         typename internal::conditional<internal::is_arithmetic<Scalar>::value, Scalar, const Scalar>::type
-                     >::type CoeffReturnType;
-
-    typedef typename internal::add_const_on_value_type_if_arithmetic<
-                         typename internal::packet_traits<Scalar>::type
-                     >::type PacketReturnType;
-
-    typedef EigenBase<Derived> Base;
-    using Base::rows;
-    using Base::cols;
-    using Base::size;
-    using Base::derived;
-
-    EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) const
-    {
-      return int(Derived::RowsAtCompileTime) == 1 ? 0
-          : int(Derived::ColsAtCompileTime) == 1 ? inner
-          : int(Derived::Flags)&RowMajorBit ? outer
-          : inner;
-    }
-
-    EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner) const
-    {
-      return int(Derived::ColsAtCompileTime) == 1 ? 0
-          : int(Derived::RowsAtCompileTime) == 1 ? inner
-          : int(Derived::Flags)&RowMajorBit ? inner
-          : outer;
-    }
-
-    /** Short version: don't use this function, use
-      * \link operator()(Index,Index) const \endlink instead.
-      *
-      * Long version: this function is similar to
-      * \link operator()(Index,Index) const \endlink, but without the assertion.
-      * Use this for limiting the performance cost of debugging code when doing
-      * repeated coefficient access. Only use this when it is guaranteed that the
-      * parameters \a row and \a col are in range.
-      *
-      * If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
-      * function equivalent to \link operator()(Index,Index) const \endlink.
-      *
-      * \sa operator()(Index,Index) const, coeffRef(Index,Index), coeff(Index) const
-      */
-    EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const
-    {
-      eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      return derived().coeff(row, col);
-    }
-
-    EIGEN_STRONG_INLINE CoeffReturnType coeffByOuterInner(Index outer, Index inner) const
-    {
-      return coeff(rowIndexByOuterInner(outer, inner),
-                   colIndexByOuterInner(outer, inner));
-    }
-
-    /** \returns the coefficient at given the given row and column.
-      *
-      * \sa operator()(Index,Index), operator[](Index)
-      */
-    EIGEN_STRONG_INLINE CoeffReturnType operator()(Index row, Index col) const
-    {
-      eigen_assert(row >= 0 && row < rows()
-          && col >= 0 && col < cols());
-      return derived().coeff(row, col);
-    }
-
-    /** Short version: don't use this function, use
-      * \link operator[](Index) const \endlink instead.
-      *
-      * Long version: this function is similar to
-      * \link operator[](Index) const \endlink, but without the assertion.
-      * Use this for limiting the performance cost of debugging code when doing
-      * repeated coefficient access. Only use this when it is guaranteed that the
-      * parameter \a index is in range.
-      *
-      * If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
-      * function equivalent to \link operator[](Index) const \endlink.
-      *
-      * \sa operator[](Index) const, coeffRef(Index), coeff(Index,Index) const
-      */
-
-    EIGEN_STRONG_INLINE CoeffReturnType
-    coeff(Index index) const
-    {
-      eigen_internal_assert(index >= 0 && index < size());
-      return derived().coeff(index);
-    }
-
-
-    /** \returns the coefficient at given index.
-      *
-      * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
-      *
-      * \sa operator[](Index), operator()(Index,Index) const, x() const, y() const,
-      * z() const, w() const
-      */
-
-    EIGEN_STRONG_INLINE CoeffReturnType
-    operator[](Index index) const
-    {
-      #ifndef EIGEN2_SUPPORT
-      EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
-                          THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
-      #endif
-      eigen_assert(index >= 0 && index < size());
-      return derived().coeff(index);
-    }
-
-    /** \returns the coefficient at given index.
-      *
-      * This is synonymous to operator[](Index) const.
-      *
-      * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
-      *
-      * \sa operator[](Index), operator()(Index,Index) const, x() const, y() const,
-      * z() const, w() const
-      */
-
-    EIGEN_STRONG_INLINE CoeffReturnType
-    operator()(Index index) const
-    {
-      eigen_assert(index >= 0 && index < size());
-      return derived().coeff(index);
-    }
-
-    /** equivalent to operator[](0).  */
-
-    EIGEN_STRONG_INLINE CoeffReturnType
-    x() const { return (*this)[0]; }
-
-    /** equivalent to operator[](1).  */
-
-    EIGEN_STRONG_INLINE CoeffReturnType
-    y() const { return (*this)[1]; }
-
-    /** equivalent to operator[](2).  */
-
-    EIGEN_STRONG_INLINE CoeffReturnType
-    z() const { return (*this)[2]; }
-
-    /** equivalent to operator[](3).  */
-
-    EIGEN_STRONG_INLINE CoeffReturnType
-    w() const { return (*this)[3]; }
-
-    /** \internal
-      * \returns the packet of coefficients starting at the given row and column. It is your responsibility
-      * to ensure that a packet really starts there. This method is only available on expressions having the
-      * PacketAccessBit.
-      *
-      * The \a LoadMode parameter may have the value \a #Aligned or \a #Unaligned. Its effect is to select
-      * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
-      * starting at an address which is a multiple of the packet size.
-      */
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketReturnType packet(Index row, Index col) const
-    {
-      eigen_internal_assert(row >= 0 && row < rows()
-                      && col >= 0 && col < cols());
-      return derived().template packet<LoadMode>(row,col);
-    }
-
-
-    /** \internal */
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketReturnType packetByOuterInner(Index outer, Index inner) const
-    {
-      return packet<LoadMode>(rowIndexByOuterInner(outer, inner),
-                              colIndexByOuterInner(outer, inner));
-    }
-
-    /** \internal
-      * \returns the packet of coefficients starting at the given index. It is your responsibility
-      * to ensure that a packet really starts there. This method is only available on expressions having the
-      * PacketAccessBit and the LinearAccessBit.
-      *
-      * The \a LoadMode parameter may have the value \a #Aligned or \a #Unaligned. Its effect is to select
-      * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
-      * starting at an address which is a multiple of the packet size.
-      */
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
-    {
-      eigen_internal_assert(index >= 0 && index < size());
-      return derived().template packet<LoadMode>(index);
-    }
-
-  protected:
-    // explanation: DenseBase is doing "using ..." on the methods from DenseCoeffsBase.
-    // But some methods are only available in the DirectAccess case.
-    // So we add dummy methods here with these names, so that "using... " doesn't fail.
-    // It's not private so that the child class DenseBase can access them, and it's not public
-    // either since it's an implementation detail, so has to be protected.
-    void coeffRef();
-    void coeffRefByOuterInner();
-    void writePacket();
-    void writePacketByOuterInner();
-    void copyCoeff();
-    void copyCoeffByOuterInner();
-    void copyPacket();
-    void copyPacketByOuterInner();
-    void stride();
-    void innerStride();
-    void outerStride();
-    void rowStride();
-    void colStride();
+ * \ingroup Core_Module
+ * \tparam Derived Type of the derived class
+ *
+ * \note #ReadOnlyAccessors Constant indicating read-only access
+ *
+ * This class defines the \c operator() \c const function and friends, which can be used to read specific
+ * entries of a matrix or array.
+ *
+ * \sa DenseCoeffsBase<Derived, WriteAccessors>, DenseCoeffsBase<Derived, DirectAccessors>,
+ *     \ref TopicClassHierarchy
+ */
+template <typename Derived>
+class DenseCoeffsBase<Derived, ReadOnlyAccessors> : public EigenBase<Derived> {
+ public:
+  typedef typename internal::traits<Derived>::StorageKind StorageKind;
+  typedef typename internal::traits<Derived>::Scalar Scalar;
+  typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+
+  // Explanation for this CoeffReturnType typedef.
+  // - This is the return type of the coeff() method.
+  // - The LvalueBit means exactly that we can offer a coeffRef() method, which means exactly that we can get references
+  // to coeffs, which means exactly that we can have coeff() return a const reference (as opposed to returning a value).
+  // - The DirectAccessBit means exactly that the underlying data of coefficients can be directly accessed as a plain
+  // strided array, which means exactly that the underlying data of coefficients does exist in memory, which means
+  // exactly that the coefficients is const-referencable, which means exactly that we can have coeff() return a const
+  // reference. For example, Map<const Matrix> have DirectAccessBit but not LvalueBit, so that Map<const Matrix>.coeff()
+  // does points to a const Scalar& which exists in memory, while does not allow coeffRef() as it would not provide a
+  // lvalue. Notice that DirectAccessBit and LvalueBit are mutually orthogonal.
+  // - The is_arithmetic check is required since "const int", "const double", etc. will cause warnings on some systems
+  // while the declaration of "const T", where T is a non arithmetic type does not. Always returning "const Scalar&" is
+  // not possible, since the underlying expressions might not offer a valid address the reference could be referring to.
+  typedef std::conditional_t<bool(internal::traits<Derived>::Flags&(LvalueBit | DirectAccessBit)), const Scalar&,
+                             std::conditional_t<internal::is_arithmetic<Scalar>::value, Scalar, const Scalar>>
+      CoeffReturnType;
+
+  typedef typename internal::add_const_on_value_type_if_arithmetic<typename internal::packet_traits<Scalar>::type>::type
+      PacketReturnType;
+
+  typedef EigenBase<Derived> Base;
+  using Base::cols;
+  using Base::derived;
+  using Base::rows;
+  using Base::size;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner) const {
+    return int(Derived::RowsAtCompileTime) == 1   ? 0
+           : int(Derived::ColsAtCompileTime) == 1 ? inner
+           : int(Derived::Flags) & RowMajorBit    ? outer
+                                                  : inner;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner) const {
+    return int(Derived::ColsAtCompileTime) == 1   ? 0
+           : int(Derived::RowsAtCompileTime) == 1 ? inner
+           : int(Derived::Flags) & RowMajorBit    ? inner
+                                                  : outer;
+  }
+
+  /** Short version: don't use this function, use
+   * \link operator()(Index,Index) const \endlink instead.
+   *
+   * Long version: this function is similar to
+   * \link operator()(Index,Index) const \endlink, but without the assertion.
+   * Use this for limiting the performance cost of debugging code when doing
+   * repeated coefficient access. Only use this when it is guaranteed that the
+   * parameters \a row and \a col are in range.
+   *
+   * If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
+   * function equivalent to \link operator()(Index,Index) const \endlink.
+   *
+   * \sa operator()(Index,Index) const, coeffRef(Index,Index), coeff(Index) const
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index row, Index col) const {
+    eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
+    return internal::evaluator<Derived>(derived()).coeff(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeffByOuterInner(Index outer, Index inner) const {
+    return coeff(rowIndexByOuterInner(outer, inner), colIndexByOuterInner(outer, inner));
+  }
+
+  /** \returns the coefficient at given the given row and column.
+   *
+   * \sa operator()(Index,Index), operator[](Index)
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType operator()(Index row, Index col) const {
+    eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
+    return coeff(row, col);
+  }
+
+  /** Short version: don't use this function, use
+   * \link operator[](Index) const \endlink instead.
+   *
+   * Long version: this function is similar to
+   * \link operator[](Index) const \endlink, but without the assertion.
+   * Use this for limiting the performance cost of debugging code when doing
+   * repeated coefficient access. Only use this when it is guaranteed that the
+   * parameter \a index is in range.
+   *
+   * If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
+   * function equivalent to \link operator[](Index) const \endlink.
+   *
+   * \sa operator[](Index) const, coeffRef(Index), coeff(Index,Index) const
+   */
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType coeff(Index index) const {
+    EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
+                        THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
+    eigen_internal_assert(index >= 0 && index < size());
+    return internal::evaluator<Derived>(derived()).coeff(index);
+  }
+
+  /** \returns the coefficient at given index.
+   *
+   * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
+   *
+   * \sa operator[](Index), operator()(Index,Index) const, x() const, y() const,
+   * z() const, w() const
+   */
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType operator[](Index index) const {
+    EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
+                        THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
+    eigen_assert(index >= 0 && index < size());
+    return coeff(index);
+  }
+
+  /** \returns the coefficient at given index.
+   *
+   * This is synonymous to operator[](Index) const.
+   *
+   * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
+   *
+   * \sa operator[](Index), operator()(Index,Index) const, x() const, y() const,
+   * z() const, w() const
+   */
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType operator()(Index index) const {
+    eigen_assert(index >= 0 && index < size());
+    return coeff(index);
+  }
+
+  /** equivalent to operator[](0).  */
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType x() const { return (*this)[0]; }
+
+  /** equivalent to operator[](1).  */
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType y() const {
+    EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 2, OUT_OF_RANGE_ACCESS);
+    return (*this)[1];
+  }
+
+  /** equivalent to operator[](2).  */
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType z() const {
+    EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 3, OUT_OF_RANGE_ACCESS);
+    return (*this)[2];
+  }
+
+  /** equivalent to operator[](3).  */
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr CoeffReturnType w() const {
+    EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 4, OUT_OF_RANGE_ACCESS);
+    return (*this)[3];
+  }
+
+  /** \internal
+   * \returns the packet of coefficients starting at the given row and column. It is your responsibility
+   * to ensure that a packet really starts there. This method is only available on expressions having the
+   * PacketAccessBit.
+   *
+   * The \a LoadMode parameter may have the value \a #Aligned or \a #Unaligned. Its effect is to select
+   * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
+   * starting at an address which is a multiple of the packet size.
+   */
+
+  template <int LoadMode>
+  EIGEN_STRONG_INLINE PacketReturnType packet(Index row, Index col) const {
+    typedef typename internal::packet_traits<Scalar>::type DefaultPacketType;
+    eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
+    return internal::evaluator<Derived>(derived()).template packet<LoadMode, DefaultPacketType>(row, col);
+  }
+
+  /** \internal */
+  template <int LoadMode>
+  EIGEN_STRONG_INLINE PacketReturnType packetByOuterInner(Index outer, Index inner) const {
+    return packet<LoadMode>(rowIndexByOuterInner(outer, inner), colIndexByOuterInner(outer, inner));
+  }
+
+  /** \internal
+   * \returns the packet of coefficients starting at the given index. It is your responsibility
+   * to ensure that a packet really starts there. This method is only available on expressions having the
+   * PacketAccessBit and the LinearAccessBit.
+   *
+   * The \a LoadMode parameter may have the value \a #Aligned or \a #Unaligned. Its effect is to select
+   * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
+   * starting at an address which is a multiple of the packet size.
+   */
+
+  template <int LoadMode>
+  EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
+                        THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
+    typedef typename internal::packet_traits<Scalar>::type DefaultPacketType;
+    eigen_internal_assert(index >= 0 && index < size());
+    return internal::evaluator<Derived>(derived()).template packet<LoadMode, DefaultPacketType>(index);
+  }
+
+ protected:
+  // explanation: DenseBase is doing "using ..." on the methods from DenseCoeffsBase.
+  // But some methods are only available in the DirectAccess case.
+  // So we add dummy methods here with these names, so that "using... " doesn't fail.
+  // It's not private so that the child class DenseBase can access them, and it's not public
+  // either since it's an implementation detail, so has to be protected.
+  void coeffRef();
+  void coeffRefByOuterInner();
+  void writePacket();
+  void writePacketByOuterInner();
+  void copyCoeff();
+  void copyCoeffByOuterInner();
+  void copyPacket();
+  void copyPacketByOuterInner();
+  void stride();
+  void innerStride();
+  void outerStride();
+  void rowStride();
+  void colStride();
 };
 
 /** \brief Base class providing read/write coefficient access to matrices and arrays.
-  * \ingroup Core_Module
-  * \tparam Derived Type of the derived class
-  * \tparam #WriteAccessors Constant indicating read/write access
-  *
-  * This class defines the non-const \c operator() function and friends, which can be used to write specific
-  * entries of a matrix or array. This class inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which
-  * defines the const variant for reading specific entries.
-  * 
-  * \sa DenseCoeffsBase<Derived, DirectAccessors>, \ref TopicClassHierarchy
-  */
-template<typename Derived>
-class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived, ReadOnlyAccessors>
-{
-  public:
-
-    typedef DenseCoeffsBase<Derived, ReadOnlyAccessors> Base;
-
-    typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
-    typedef typename internal::traits<Derived>::Scalar Scalar;
-    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-
-    using Base::coeff;
-    using Base::rows;
-    using Base::cols;
-    using Base::size;
-    using Base::derived;
-    using Base::rowIndexByOuterInner;
-    using Base::colIndexByOuterInner;
-    using Base::operator[];
-    using Base::operator();
-    using Base::x;
-    using Base::y;
-    using Base::z;
-    using Base::w;
-
-    /** Short version: don't use this function, use
-      * \link operator()(Index,Index) \endlink instead.
-      *
-      * Long version: this function is similar to
-      * \link operator()(Index,Index) \endlink, but without the assertion.
-      * Use this for limiting the performance cost of debugging code when doing
-      * repeated coefficient access. Only use this when it is guaranteed that the
-      * parameters \a row and \a col are in range.
-      *
-      * If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
-      * function equivalent to \link operator()(Index,Index) \endlink.
-      *
-      * \sa operator()(Index,Index), coeff(Index, Index) const, coeffRef(Index)
-      */
-    EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col)
-    {
-      eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      return derived().coeffRef(row, col);
-    }
-
-    EIGEN_STRONG_INLINE Scalar&
-    coeffRefByOuterInner(Index outer, Index inner)
-    {
-      return coeffRef(rowIndexByOuterInner(outer, inner),
-                      colIndexByOuterInner(outer, inner));
-    }
-
-    /** \returns a reference to the coefficient at given the given row and column.
-      *
-      * \sa operator[](Index)
-      */
-
-    EIGEN_STRONG_INLINE Scalar&
-    operator()(Index row, Index col)
-    {
-      eigen_assert(row >= 0 && row < rows()
-          && col >= 0 && col < cols());
-      return derived().coeffRef(row, col);
-    }
-
-
-    /** Short version: don't use this function, use
-      * \link operator[](Index) \endlink instead.
-      *
-      * Long version: this function is similar to
-      * \link operator[](Index) \endlink, but without the assertion.
-      * Use this for limiting the performance cost of debugging code when doing
-      * repeated coefficient access. Only use this when it is guaranteed that the
-      * parameters \a row and \a col are in range.
-      *
-      * If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
-      * function equivalent to \link operator[](Index) \endlink.
-      *
-      * \sa operator[](Index), coeff(Index) const, coeffRef(Index,Index)
-      */
-
-    EIGEN_STRONG_INLINE Scalar&
-    coeffRef(Index index)
-    {
-      eigen_internal_assert(index >= 0 && index < size());
-      return derived().coeffRef(index);
-    }
-
-    /** \returns a reference to the coefficient at given index.
-      *
-      * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
-      *
-      * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
-      */
-
-    EIGEN_STRONG_INLINE Scalar&
-    operator[](Index index)
-    {
-      #ifndef EIGEN2_SUPPORT
-      EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
-                          THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
-      #endif
-      eigen_assert(index >= 0 && index < size());
-      return derived().coeffRef(index);
-    }
-
-    /** \returns a reference to the coefficient at given index.
-      *
-      * This is synonymous to operator[](Index).
-      *
-      * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
-      *
-      * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
-      */
-
-    EIGEN_STRONG_INLINE Scalar&
-    operator()(Index index)
-    {
-      eigen_assert(index >= 0 && index < size());
-      return derived().coeffRef(index);
-    }
-
-    /** equivalent to operator[](0).  */
-
-    EIGEN_STRONG_INLINE Scalar&
-    x() { return (*this)[0]; }
-
-    /** equivalent to operator[](1).  */
-
-    EIGEN_STRONG_INLINE Scalar&
-    y() { return (*this)[1]; }
-
-    /** equivalent to operator[](2).  */
-
-    EIGEN_STRONG_INLINE Scalar&
-    z() { return (*this)[2]; }
-
-    /** equivalent to operator[](3).  */
-
-    EIGEN_STRONG_INLINE Scalar&
-    w() { return (*this)[3]; }
-
-    /** \internal
-      * Stores the given packet of coefficients, at the given row and column of this expression. It is your responsibility
-      * to ensure that a packet really starts there. This method is only available on expressions having the
-      * PacketAccessBit.
-      *
-      * The \a LoadMode parameter may have the value \a #Aligned or \a #Unaligned. Its effect is to select
-      * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
-      * starting at an address which is a multiple of the packet size.
-      */
-
-    template<int StoreMode>
-    EIGEN_STRONG_INLINE void writePacket
-    (Index row, Index col, const typename internal::packet_traits<Scalar>::type& val)
-    {
-      eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      derived().template writePacket<StoreMode>(row,col,val);
-    }
-
-
-    /** \internal */
-    template<int StoreMode>
-    EIGEN_STRONG_INLINE void writePacketByOuterInner
-    (Index outer, Index inner, const typename internal::packet_traits<Scalar>::type& val)
-    {
-      writePacket<StoreMode>(rowIndexByOuterInner(outer, inner),
-                            colIndexByOuterInner(outer, inner),
-                            val);
-    }
-
-    /** \internal
-      * Stores the given packet of coefficients, at the given index in this expression. It is your responsibility
-      * to ensure that a packet really starts there. This method is only available on expressions having the
-      * PacketAccessBit and the LinearAccessBit.
-      *
-      * The \a LoadMode parameter may have the value \a Aligned or \a Unaligned. Its effect is to select
-      * the appropriate vectorization instruction. Aligned access is faster, but is only possible for packets
-      * starting at an address which is a multiple of the packet size.
-      */
-    template<int StoreMode>
-    EIGEN_STRONG_INLINE void writePacket
-    (Index index, const typename internal::packet_traits<Scalar>::type& val)
-    {
-      eigen_internal_assert(index >= 0 && index < size());
-      derived().template writePacket<StoreMode>(index,val);
-    }
-
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-
-    /** \internal Copies the coefficient at position (row,col) of other into *this.
-      *
-      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
-      * with usual assignments.
-      *
-      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
-      */
-
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE void copyCoeff(Index row, Index col, const DenseBase<OtherDerived>& other)
-    {
-      eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      derived().coeffRef(row, col) = other.derived().coeff(row, col);
-    }
-
-    /** \internal Copies the coefficient at the given index of other into *this.
-      *
-      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
-      * with usual assignments.
-      *
-      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
-      */
-
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
-    {
-      eigen_internal_assert(index >= 0 && index < size());
-      derived().coeffRef(index) = other.derived().coeff(index);
-    }
-
-
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE void copyCoeffByOuterInner(Index outer, Index inner, const DenseBase<OtherDerived>& other)
-    {
-      const Index row = rowIndexByOuterInner(outer,inner);
-      const Index col = colIndexByOuterInner(outer,inner);
-      // derived() is important here: copyCoeff() may be reimplemented in Derived!
-      derived().copyCoeff(row, col, other);
-    }
-
-    /** \internal Copies the packet at position (row,col) of other into *this.
-      *
-      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
-      * with usual assignments.
-      *
-      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
-      */
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    EIGEN_STRONG_INLINE void copyPacket(Index row, Index col, const DenseBase<OtherDerived>& other)
-    {
-      eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      derived().template writePacket<StoreMode>(row, col,
-        other.derived().template packet<LoadMode>(row, col));
-    }
-
-    /** \internal Copies the packet at the given index of other into *this.
-      *
-      * This method is overridden in SwapWrapper, allowing swap() assignments to share 99% of their code
-      * with usual assignments.
-      *
-      * Outside of this internal usage, this method has probably no usefulness. It is hidden in the public API dox.
-      */
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    EIGEN_STRONG_INLINE void copyPacket(Index index, const DenseBase<OtherDerived>& other)
-    {
-      eigen_internal_assert(index >= 0 && index < size());
-      derived().template writePacket<StoreMode>(index,
-        other.derived().template packet<LoadMode>(index));
-    }
-
-    /** \internal */
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    EIGEN_STRONG_INLINE void copyPacketByOuterInner(Index outer, Index inner, const DenseBase<OtherDerived>& other)
-    {
-      const Index row = rowIndexByOuterInner(outer,inner);
-      const Index col = colIndexByOuterInner(outer,inner);
-      // derived() is important here: copyCoeff() may be reimplemented in Derived!
-      derived().template copyPacket< OtherDerived, StoreMode, LoadMode>(row, col, other);
-    }
-#endif
+ * \ingroup Core_Module
+ * \tparam Derived Type of the derived class
+ *
+ * \note #WriteAccessors Constant indicating read/write access
+ *
+ * This class defines the non-const \c operator() function and friends, which can be used to write specific
+ * entries of a matrix or array. This class inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which
+ * defines the const variant for reading specific entries.
+ *
+ * \sa DenseCoeffsBase<Derived, DirectAccessors>, \ref TopicClassHierarchy
+ */
+template <typename Derived>
+class DenseCoeffsBase<Derived, WriteAccessors> : public DenseCoeffsBase<Derived, ReadOnlyAccessors> {
+ public:
+  typedef DenseCoeffsBase<Derived, ReadOnlyAccessors> Base;
+
+  typedef typename internal::traits<Derived>::StorageKind StorageKind;
+  typedef typename internal::traits<Derived>::Scalar Scalar;
+  typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  using Base::coeff;
+  using Base::colIndexByOuterInner;
+  using Base::cols;
+  using Base::derived;
+  using Base::rowIndexByOuterInner;
+  using Base::rows;
+  using Base::size;
+  using Base::operator[];
+  using Base::operator();
+  using Base::w;
+  using Base::x;
+  using Base::y;
+  using Base::z;
+
+  /** Short version: don't use this function, use
+   * \link operator()(Index,Index) \endlink instead.
+   *
+   * Long version: this function is similar to
+   * \link operator()(Index,Index) \endlink, but without the assertion.
+   * Use this for limiting the performance cost of debugging code when doing
+   * repeated coefficient access. Only use this when it is guaranteed that the
+   * parameters \a row and \a col are in range.
+   *
+   * If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
+   * function equivalent to \link operator()(Index,Index) \endlink.
+   *
+   * \sa operator()(Index,Index), coeff(Index, Index) const, coeffRef(Index)
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index row, Index col) {
+    eigen_internal_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
+    return internal::evaluator<Derived>(derived()).coeffRef(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRefByOuterInner(Index outer, Index inner) {
+    return coeffRef(rowIndexByOuterInner(outer, inner), colIndexByOuterInner(outer, inner));
+  }
+
+  /** \returns a reference to the coefficient at given the given row and column.
+   *
+   * \sa operator[](Index)
+   */
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& operator()(Index row, Index col) {
+    eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
+    return coeffRef(row, col);
+  }
+
+  /** Short version: don't use this function, use
+   * \link operator[](Index) \endlink instead.
+   *
+   * Long version: this function is similar to
+   * \link operator[](Index) \endlink, but without the assertion.
+   * Use this for limiting the performance cost of debugging code when doing
+   * repeated coefficient access. Only use this when it is guaranteed that the
+   * parameters \a row and \a col are in range.
+   *
+   * If EIGEN_INTERNAL_DEBUGGING is defined, an assertion will be made, making this
+   * function equivalent to \link operator[](Index) \endlink.
+   *
+   * \sa operator[](Index), coeff(Index) const, coeffRef(Index,Index)
+   */
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) {
+    EIGEN_STATIC_ASSERT(internal::evaluator<Derived>::Flags & LinearAccessBit,
+                        THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS)
+    eigen_internal_assert(index >= 0 && index < size());
+    return internal::evaluator<Derived>(derived()).coeffRef(index);
+  }
+
+  /** \returns a reference to the coefficient at given index.
+   *
+   * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
+   *
+   * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
+   */
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& operator[](Index index) {
+    EIGEN_STATIC_ASSERT(Derived::IsVectorAtCompileTime,
+                        THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD)
+    eigen_assert(index >= 0 && index < size());
+    return coeffRef(index);
+  }
+
+  /** \returns a reference to the coefficient at given index.
+   *
+   * This is synonymous to operator[](Index).
+   *
+   * This method is allowed only for vector expressions, and for matrix expressions having the LinearAccessBit.
+   *
+   * \sa operator[](Index) const, operator()(Index,Index), x(), y(), z(), w()
+   */
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& operator()(Index index) {
+    eigen_assert(index >= 0 && index < size());
+    return coeffRef(index);
+  }
+
+  /** equivalent to operator[](0).  */
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& x() { return (*this)[0]; }
+
+  /** equivalent to operator[](1).  */
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& y() {
+    EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 2, OUT_OF_RANGE_ACCESS);
+    return (*this)[1];
+  }
+
+  /** equivalent to operator[](2).  */
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& z() {
+    EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 3, OUT_OF_RANGE_ACCESS);
+    return (*this)[2];
+  }
+
+  /** equivalent to operator[](3).  */
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& w() {
+    EIGEN_STATIC_ASSERT(Derived::SizeAtCompileTime == -1 || Derived::SizeAtCompileTime >= 4, OUT_OF_RANGE_ACCESS);
+    return (*this)[3];
+  }
 };
 
 /** \brief Base class providing direct read-only coefficient access to matrices and arrays.
-  * \ingroup Core_Module
-  * \tparam Derived Type of the derived class
-  * \tparam #DirectAccessors Constant indicating direct access
-  *
-  * This class defines functions to work with strides which can be used to access entries directly. This class
-  * inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which defines functions to access entries read-only using
-  * \c operator() .
-  *
-  * \sa \ref TopicClassHierarchy
-  */
-template<typename Derived>
-class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived, ReadOnlyAccessors>
-{
-  public:
-
-    typedef DenseCoeffsBase<Derived, ReadOnlyAccessors> Base;
-    typedef typename internal::traits<Derived>::Index Index;
-    typedef typename internal::traits<Derived>::Scalar Scalar;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-
-    using Base::rows;
-    using Base::cols;
-    using Base::size;
-    using Base::derived;
-
-    /** \returns the pointer increment between two consecutive elements within a slice in the inner direction.
-      *
-      * \sa outerStride(), rowStride(), colStride()
-      */
-    inline Index innerStride() const
-    {
-      return derived().innerStride();
-    }
-
-    /** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns
-      *          in a column-major matrix).
-      *
-      * \sa innerStride(), rowStride(), colStride()
-      */
-    inline Index outerStride() const
-    {
-      return derived().outerStride();
-    }
-
-    // FIXME shall we remove it ?
-    inline Index stride() const
-    {
-      return Derived::IsVectorAtCompileTime ? innerStride() : outerStride();
-    }
-
-    /** \returns the pointer increment between two consecutive rows.
-      *
-      * \sa innerStride(), outerStride(), colStride()
-      */
-    inline Index rowStride() const
-    {
-      return Derived::IsRowMajor ? outerStride() : innerStride();
-    }
-
-    /** \returns the pointer increment between two consecutive columns.
-      *
-      * \sa innerStride(), outerStride(), rowStride()
-      */
-    inline Index colStride() const
-    {
-      return Derived::IsRowMajor ? innerStride() : outerStride();
-    }
+ * \ingroup Core_Module
+ * \tparam Derived Type of the derived class
+ *
+ * \note #DirectAccessors Constant indicating direct access
+ *
+ * This class defines functions to work with strides which can be used to access entries directly. This class
+ * inherits DenseCoeffsBase<Derived, ReadOnlyAccessors> which defines functions to access entries read-only using
+ * \c operator() .
+ *
+ * \sa \blank \ref TopicClassHierarchy
+ */
+template <typename Derived>
+class DenseCoeffsBase<Derived, DirectAccessors> : public DenseCoeffsBase<Derived, ReadOnlyAccessors> {
+ public:
+  typedef DenseCoeffsBase<Derived, ReadOnlyAccessors> Base;
+  typedef typename internal::traits<Derived>::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  using Base::cols;
+  using Base::derived;
+  using Base::rows;
+  using Base::size;
+
+  /** \returns the pointer increment between two consecutive elements within a slice in the inner direction.
+   *
+   * \sa outerStride(), rowStride(), colStride()
+   */
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const { return derived().innerStride(); }
+
+  /** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns
+   *          in a column-major matrix).
+   *
+   * \sa innerStride(), rowStride(), colStride()
+   */
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const { return derived().outerStride(); }
+
+  // FIXME shall we remove it ?
+  constexpr Index stride() const { return Derived::IsVectorAtCompileTime ? innerStride() : outerStride(); }
+
+  /** \returns the pointer increment between two consecutive rows.
+   *
+   * \sa innerStride(), outerStride(), colStride()
+   */
+  EIGEN_DEVICE_FUNC constexpr Index rowStride() const { return Derived::IsRowMajor ? outerStride() : innerStride(); }
+
+  /** \returns the pointer increment between two consecutive columns.
+   *
+   * \sa innerStride(), outerStride(), rowStride()
+   */
+  EIGEN_DEVICE_FUNC constexpr Index colStride() const { return Derived::IsRowMajor ? innerStride() : outerStride(); }
 };
 
 /** \brief Base class providing direct read/write coefficient access to matrices and arrays.
-  * \ingroup Core_Module
-  * \tparam Derived Type of the derived class
-  * \tparam #DirectWriteAccessors Constant indicating direct access
-  *
-  * This class defines functions to work with strides which can be used to access entries directly. This class
-  * inherits DenseCoeffsBase<Derived, WriteAccessors> which defines functions to access entries read/write using
-  * \c operator().
-  *
-  * \sa \ref TopicClassHierarchy
-  */
-template<typename Derived>
-class DenseCoeffsBase<Derived, DirectWriteAccessors>
-  : public DenseCoeffsBase<Derived, WriteAccessors>
-{
-  public:
-
-    typedef DenseCoeffsBase<Derived, WriteAccessors> Base;
-    typedef typename internal::traits<Derived>::Index Index;
-    typedef typename internal::traits<Derived>::Scalar Scalar;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-
-    using Base::rows;
-    using Base::cols;
-    using Base::size;
-    using Base::derived;
-
-    /** \returns the pointer increment between two consecutive elements within a slice in the inner direction.
-      *
-      * \sa outerStride(), rowStride(), colStride()
-      */
-    inline Index innerStride() const
-    {
-      return derived().innerStride();
-    }
-
-    /** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns
-      *          in a column-major matrix).
-      *
-      * \sa innerStride(), rowStride(), colStride()
-      */
-    inline Index outerStride() const
-    {
-      return derived().outerStride();
-    }
-
-    // FIXME shall we remove it ?
-    inline Index stride() const
-    {
-      return Derived::IsVectorAtCompileTime ? innerStride() : outerStride();
-    }
-
-    /** \returns the pointer increment between two consecutive rows.
-      *
-      * \sa innerStride(), outerStride(), colStride()
-      */
-    inline Index rowStride() const
-    {
-      return Derived::IsRowMajor ? outerStride() : innerStride();
-    }
-
-    /** \returns the pointer increment between two consecutive columns.
-      *
-      * \sa innerStride(), outerStride(), rowStride()
-      */
-    inline Index colStride() const
-    {
-      return Derived::IsRowMajor ? innerStride() : outerStride();
-    }
+ * \ingroup Core_Module
+ * \tparam Derived Type of the derived class
+ *
+ * \note #DirectWriteAccessors Constant indicating direct access
+ *
+ * This class defines functions to work with strides which can be used to access entries directly. This class
+ * inherits DenseCoeffsBase<Derived, WriteAccessors> which defines functions to access entries read/write using
+ * \c operator().
+ *
+ * \sa \blank \ref TopicClassHierarchy
+ */
+template <typename Derived>
+class DenseCoeffsBase<Derived, DirectWriteAccessors> : public DenseCoeffsBase<Derived, WriteAccessors> {
+ public:
+  typedef DenseCoeffsBase<Derived, WriteAccessors> Base;
+  typedef typename internal::traits<Derived>::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  using Base::cols;
+  using Base::derived;
+  using Base::rows;
+  using Base::size;
+
+  /** \returns the pointer increment between two consecutive elements within a slice in the inner direction.
+   *
+   * \sa outerStride(), rowStride(), colStride()
+   */
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return derived().innerStride(); }
+
+  /** \returns the pointer increment between two consecutive inner slices (for example, between two consecutive columns
+   *          in a column-major matrix).
+   *
+   * \sa innerStride(), rowStride(), colStride()
+   */
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return derived().outerStride(); }
+
+  // FIXME shall we remove it ?
+  constexpr Index stride() const noexcept { return Derived::IsVectorAtCompileTime ? innerStride() : outerStride(); }
+
+  /** \returns the pointer increment between two consecutive rows.
+   *
+   * \sa innerStride(), outerStride(), colStride()
+   */
+  EIGEN_DEVICE_FUNC constexpr Index rowStride() const noexcept {
+    return Derived::IsRowMajor ? outerStride() : innerStride();
+  }
+
+  /** \returns the pointer increment between two consecutive columns.
+   *
+   * \sa innerStride(), outerStride(), rowStride()
+   */
+  EIGEN_DEVICE_FUNC constexpr Index colStride() const noexcept {
+    return Derived::IsRowMajor ? innerStride() : outerStride();
+  }
 };
 
 namespace internal {
 
-template<typename Derived, bool JustReturnZero>
-struct first_aligned_impl
-{
-  static inline typename Derived::Index run(const Derived&)
-  { return 0; }
+template <int Alignment, typename Derived, bool JustReturnZero>
+struct first_aligned_impl {
+  static constexpr Index run(const Derived&) noexcept { return 0; }
 };
 
-template<typename Derived>
-struct first_aligned_impl<Derived, false>
-{
-  static inline typename Derived::Index run(const Derived& m)
-  {
-    return internal::first_aligned(&m.const_cast_derived().coeffRef(0,0), m.size());
-  }
+template <int Alignment, typename Derived>
+struct first_aligned_impl<Alignment, Derived, false> {
+  static inline Index run(const Derived& m) { return internal::first_aligned<Alignment>(m.data(), m.size()); }
 };
 
-/** \internal \returns the index of the first element of the array that is well aligned for vectorization.
-  *
-  * There is also the variant first_aligned(const Scalar*, Integer) defined in Memory.h. See it for more
-  * documentation.
-  */
-template<typename Derived>
-static inline typename Derived::Index first_aligned(const Derived& m)
-{
-  return first_aligned_impl
-          <Derived, (Derived::Flags & AlignedBit) || !(Derived::Flags & DirectAccessBit)>
-          ::run(m);
+/** \internal \returns the index of the first element of the array stored by \a m that is properly aligned with respect
+ * to \a Alignment for vectorization.
+ *
+ * \tparam Alignment requested alignment in Bytes.
+ *
+ * There is also the variant first_aligned(const Scalar*, Integer) defined in Memory.h. See it for more
+ * documentation.
+ */
+template <int Alignment, typename Derived>
+static inline Index first_aligned(const DenseBase<Derived>& m) {
+  enum { ReturnZero = (int(evaluator<Derived>::Alignment) >= Alignment) || !(Derived::Flags & DirectAccessBit) };
+  return first_aligned_impl<Alignment, Derived, ReturnZero>::run(m.derived());
+}
+
+template <typename Derived>
+static inline Index first_default_aligned(const DenseBase<Derived>& m) {
+  typedef typename Derived::Scalar Scalar;
+  typedef typename packet_traits<Scalar>::type DefaultPacketType;
+  return internal::first_aligned<int(unpacket_traits<DefaultPacketType>::alignment), Derived>(m);
 }
 
-template<typename Derived, bool HasDirectAccess = has_direct_access<Derived>::ret>
-struct inner_stride_at_compile_time
-{
+template <typename Derived, bool HasDirectAccess = has_direct_access<Derived>::ret>
+struct inner_stride_at_compile_time {
   enum { ret = traits<Derived>::InnerStrideAtCompileTime };
 };
 
-template<typename Derived>
-struct inner_stride_at_compile_time<Derived, false>
-{
+template <typename Derived>
+struct inner_stride_at_compile_time<Derived, false> {
   enum { ret = 0 };
 };
 
-template<typename Derived, bool HasDirectAccess = has_direct_access<Derived>::ret>
-struct outer_stride_at_compile_time
-{
+template <typename Derived, bool HasDirectAccess = has_direct_access<Derived>::ret>
+struct outer_stride_at_compile_time {
   enum { ret = traits<Derived>::OuterStrideAtCompileTime };
 };
 
-template<typename Derived>
-struct outer_stride_at_compile_time<Derived, false>
-{
+template <typename Derived>
+struct outer_stride_at_compile_time<Derived, false> {
   enum { ret = 0 };
 };
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_DENSECOEFFSBASE_H
+#endif  // EIGEN_DENSECOEFFSBASE_H
diff --git a/inst/include/Eigen/src/Core/DenseStorage.h b/inst/include/Eigen/src/Core/DenseStorage.h
index 568493cb..45c8779a 100644
--- a/inst/include/Eigen/src/Core/DenseStorage.h
+++ b/inst/include/Eigen/src/Core/DenseStorage.h
@@ -3,7 +3,7 @@
 //
 // Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
+// Copyright (C) 2010-2013 Hauke Heibel <hauke.heibel@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -13,422 +13,561 @@
 #define EIGEN_MATRIXSTORAGE_H
 
 #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_DENSE_STORAGE_CTOR_PLUGIN;
+#define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) \
+  X;                                                \
+  EIGEN_DENSE_STORAGE_CTOR_PLUGIN;
 #else
-  #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
+#define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X)
 #endif
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
 
-struct constructor_without_unaligned_array_assert {};
+#if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
+#define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(Alignment)
+#else
+#define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(Alignment)                                        \
+  eigen_assert((is_constant_evaluated() || (std::uintptr_t(array) % Alignment == 0)) &&     \
+               "this assertion is explained here: "                                         \
+               "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
+               " **** READ THIS WEB PAGE !!! ****");
+#endif
 
-template<typename T, int Size> void check_static_allocation_size()
-{
-  // if EIGEN_STACK_ALLOCATION_LIMIT is defined to 0, then no limit
-  #if EIGEN_STACK_ALLOCATION_LIMIT
-  EIGEN_STATIC_ASSERT(Size * sizeof(T) <= EIGEN_STACK_ALLOCATION_LIMIT, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG);
-  #endif
-}
+#if EIGEN_STACK_ALLOCATION_LIMIT
+#define EIGEN_MAKE_STACK_ALLOCATION_ASSERT(X) \
+  EIGEN_STATIC_ASSERT(X <= EIGEN_STACK_ALLOCATION_LIMIT, OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG)
+#else
+#define EIGEN_MAKE_STACK_ALLOCATION_ASSERT(X)
+#endif
 
 /** \internal
-  * Static array. If the MatrixOrArrayOptions require auto-alignment, the array will be automatically aligned:
-  * to 16 bytes boundary if the total size is a multiple of 16 bytes.
-  */
-template <typename T, int Size, int MatrixOrArrayOptions,
-          int Alignment = (MatrixOrArrayOptions&DontAlign) ? 0
-                        : (((Size*sizeof(T))%16)==0) ? 16
-                        : 0 >
-struct plain_array
-{
-  T array[Size];
-
-  plain_array() 
-  { 
-    check_static_allocation_size<T,Size>();
-  }
+ * Static array. If the MatrixOrArrayOptions require auto-alignment, the array will be automatically aligned:
+ * to 16 bytes boundary if the total size is a multiple of 16 bytes.
+ */
 
-  plain_array(constructor_without_unaligned_array_assert) 
-  { 
-    check_static_allocation_size<T,Size>();
+template <typename T, int Size, int MatrixOrArrayOptions,
+          int Alignment = (MatrixOrArrayOptions & DontAlign) ? 0 : compute_default_alignment<T, Size>::value>
+struct plain_array {
+  EIGEN_ALIGN_TO_BOUNDARY(Alignment) T array[Size];
+#if defined(EIGEN_NO_DEBUG) || defined(EIGEN_TESTING_PLAINOBJECT_CTOR)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plain_array() = default;
+#else
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plain_array() {
+    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(Alignment)
+    EIGEN_MAKE_STACK_ALLOCATION_ASSERT(Size * sizeof(T))
   }
+#endif
 };
 
-#if defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
-  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask)
-#elif EIGEN_GNUC_AT_LEAST(4,7) 
-  // GCC 4.7 is too aggressive in its optimizations and remove the alignement test based on the fact the array is declared to be aligned.
-  // See this bug report: http://gcc.gnu.org/bugzilla/show_bug.cgi?id=53900
-  // Hiding the origin of the array pointer behind a function argument seems to do the trick even if the function is inlined:
-  template<typename PtrType>
-  EIGEN_ALWAYS_INLINE PtrType eigen_unaligned_array_assert_workaround_gcc47(PtrType array) { return array; }
-  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-    eigen_assert((reinterpret_cast<size_t>(eigen_unaligned_array_assert_workaround_gcc47(array)) & sizemask) == 0 \
-              && "this assertion is explained here: " \
-              "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
-              " **** READ THIS WEB PAGE !!! ****");
+template <typename T, int Size, int MatrixOrArrayOptions>
+struct plain_array<T, Size, MatrixOrArrayOptions, 0> {
+  // on some 32-bit platforms, stack-allocated arrays are aligned to 4 bytes, not the preferred alignment of T
+  EIGEN_ALIGN_TO_BOUNDARY(alignof(T)) T array[Size];
+#if defined(EIGEN_NO_DEBUG) || defined(EIGEN_TESTING_PLAINOBJECT_CTOR)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plain_array() = default;
 #else
-  #define EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(sizemask) \
-    eigen_assert((reinterpret_cast<size_t>(array) & sizemask) == 0 \
-              && "this assertion is explained here: " \
-              "http://eigen.tuxfamily.org/dox-devel/group__TopicUnalignedArrayAssert.html" \
-              " **** READ THIS WEB PAGE !!! ****");
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr plain_array() { EIGEN_MAKE_STACK_ALLOCATION_ASSERT(Size * sizeof(T)) }
 #endif
+};
 
-template <typename T, int Size, int MatrixOrArrayOptions>
-struct plain_array<T, Size, MatrixOrArrayOptions, 16>
-{
-  EIGEN_USER_ALIGN16 T array[Size];
+template <typename T, int Size, int Options, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap_plain_array(plain_array<T, Size, Options, Alignment>& a,
+                                                                      plain_array<T, Size, Options, Alignment>& b,
+                                                                      Index a_size, Index b_size) {
+  Index common_size = numext::mini(a_size, b_size);
+  std::swap_ranges(a.array, a.array + common_size, b.array);
+  if (a_size > b_size)
+    smart_copy(a.array + common_size, a.array + a_size, b.array + common_size);
+  else if (b_size > a_size)
+    smart_copy(b.array + common_size, b.array + b_size, a.array + common_size);
+}
 
-  plain_array() 
-  { 
-    EIGEN_MAKE_UNALIGNED_ARRAY_ASSERT(0xf);
-    check_static_allocation_size<T,Size>();
-  }
+template <typename T, int Size, int Rows, int Cols, int Options>
+class DenseStorage_impl {
+  plain_array<T, Size, Options> m_data;
 
-  plain_array(constructor_without_unaligned_array_assert) 
-  { 
-    check_static_allocation_size<T,Size>();
+ public:
+#ifndef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
+#else
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
   }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size)
+    smart_copy(other.m_data.array, other.m_data.array + Size, m_data.array);
+  }
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index /*size*/, Index /*rows*/, Index /*cols*/) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) {
+    numext::swap(m_data, other.m_data);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index /*rows*/,
+                                                                          Index /*cols*/) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index /*rows*/, Index /*cols*/) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return Rows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return Cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return Rows * Cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data.array; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data.array; }
 };
+template <typename T, int Size, int Cols, int Options>
+class DenseStorage_impl<T, Size, Dynamic, Cols, Options> {
+  plain_array<T, Size, Options> m_data;
+  Index m_rows = 0;
 
-template <typename T, int MatrixOrArrayOptions, int Alignment>
-struct plain_array<T, 0, MatrixOrArrayOptions, Alignment>
-{
-  EIGEN_USER_ALIGN16 T array[1];
-  plain_array() {}
-  plain_array(constructor_without_unaligned_array_assert) {}
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other)
+      : m_rows(other.m_rows) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = other.size())
+    smart_copy(other.m_data.array, other.m_data.array + other.size(), m_data.array);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index size, Index rows, Index /*cols*/)
+      : m_rows(rows) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+    EIGEN_UNUSED_VARIABLE(size)
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl& other) {
+    smart_copy(other.m_data.array, other.m_data.array + other.size(), m_data.array);
+    m_rows = other.m_rows;
+    return *this;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) {
+    swap_plain_array(m_data, other.m_data, size(), other.size());
+    numext::swap(m_rows, other.m_rows);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index rows, Index /*cols*/) {
+    m_rows = rows;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index rows, Index /*cols*/) {
+    m_rows = rows;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return Cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return m_rows * Cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data.array; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data.array; }
 };
+template <typename T, int Size, int Rows, int Options>
+class DenseStorage_impl<T, Size, Rows, Dynamic, Options> {
+  plain_array<T, Size, Options> m_data;
+  Index m_cols = 0;
 
-} // end namespace internal
-
-/** \internal
-  *
-  * \class DenseStorage
-  * \ingroup Core_Module
-  *
-  * \brief Stores the data of a matrix
-  *
-  * This class stores the data of fixed-size, dynamic-size or mixed matrices
-  * in a way as compact as possible.
-  *
-  * \sa Matrix
-  */
-template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseStorage;
-
-// purely fixed-size matrix
-template<typename T, int Size, int _Rows, int _Cols, int _Options> class DenseStorage
-{
-    internal::plain_array<T,Size,_Options> m_data;
-  public:
-    DenseStorage() {}
-    DenseStorage(internal::constructor_without_unaligned_array_assert)
-      : m_data(internal::constructor_without_unaligned_array_assert()) {}
-    DenseStorage(const DenseStorage& other) : m_data(other.m_data) {}
-    DenseStorage& operator=(const DenseStorage& other)
-    {
-      if (this != &other) m_data = other.m_data;
-      return *this;
-    }
-    DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
-    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); }
-    static DenseIndex rows(void) {return _Rows;}
-    static DenseIndex cols(void) {return _Cols;}
-    void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
-    void resize(DenseIndex,DenseIndex,DenseIndex) {}
-    const T *data() const { return m_data.array; }
-    T *data() { return m_data.array; }
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other)
+      : m_cols(other.m_cols) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = other.size())
+    smart_copy(other.m_data.array, other.m_data.array + other.size(), m_data.array);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index size, Index /*rows*/, Index cols)
+      : m_cols(cols) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+    EIGEN_UNUSED_VARIABLE(size)
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl& other) {
+    smart_copy(other.m_data.array, other.m_data.array + other.size(), m_data.array);
+    m_cols = other.m_cols;
+    return *this;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) {
+    swap_plain_array(m_data, other.m_data, size(), other.size());
+    numext::swap(m_cols, other.m_cols);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index /*rows*/, Index cols) {
+    m_cols = cols;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index /*rows*/, Index cols) {
+    m_cols = cols;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return Rows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return Rows * m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data.array; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data.array; }
 };
+template <typename T, int Size, int Options>
+class DenseStorage_impl<T, Size, Dynamic, Dynamic, Options> {
+  plain_array<T, Size, Options> m_data;
+  Index m_rows = 0;
+  Index m_cols = 0;
 
-// null matrix
-template<typename T, int _Rows, int _Cols, int _Options> class DenseStorage<T, 0, _Rows, _Cols, _Options>
-{
-  public:
-    DenseStorage() {}
-    DenseStorage(internal::constructor_without_unaligned_array_assert) {}
-    DenseStorage(const DenseStorage&) {}
-    DenseStorage& operator=(const DenseStorage&) { return *this; }
-    DenseStorage(DenseIndex,DenseIndex,DenseIndex) {}
-    void swap(DenseStorage& ) {}
-    static DenseIndex rows(void) {return _Rows;}
-    static DenseIndex cols(void) {return _Cols;}
-    void conservativeResize(DenseIndex,DenseIndex,DenseIndex) {}
-    void resize(DenseIndex,DenseIndex,DenseIndex) {}
-    const T *data() const { return 0; }
-    T *data() { return 0; }
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other)
+      : m_rows(other.m_rows), m_cols(other.m_cols) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = other.size())
+    smart_copy(other.m_data.array, other.m_data.array + other.size(), m_data.array);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index size, Index rows, Index cols)
+      : m_rows(rows), m_cols(cols) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+    EIGEN_UNUSED_VARIABLE(size)
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl& other) {
+    smart_copy(other.m_data.array, other.m_data.array + other.size(), m_data.array);
+    m_rows = other.m_rows;
+    m_cols = other.m_cols;
+    return *this;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) {
+    swap_plain_array(m_data, other.m_data, size(), other.size());
+    numext::swap(m_rows, other.m_rows);
+    numext::swap(m_cols, other.m_cols);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index rows, Index cols) {
+    m_rows = rows;
+    m_cols = cols;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index rows, Index cols) {
+    m_rows = rows;
+    m_cols = cols;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return m_rows * m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data.array; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data.array; }
 };
+// null matrix variants
+template <typename T, int Rows, int Cols, int Options>
+class DenseStorage_impl<T, 0, Rows, Cols, Options> {
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index /*size*/, Index /*rows*/, Index /*cols*/) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl&) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index /*rows*/,
+                                                                          Index /*cols*/) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index /*rows*/, Index /*cols*/) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return Rows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return Cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return Rows * Cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return nullptr; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return nullptr; }
+};
+template <typename T, int Cols, int Options>
+class DenseStorage_impl<T, 0, Dynamic, Cols, Options> {
+  Index m_rows = 0;
 
-// more specializations for null matrices; these are necessary to resolve ambiguities
-template<typename T, int _Options> class DenseStorage<T, 0, Dynamic, Dynamic, _Options>
-: public DenseStorage<T, 0, 0, 0, _Options> { };
-
-template<typename T, int _Rows, int _Options> class DenseStorage<T, 0, _Rows, Dynamic, _Options>
-: public DenseStorage<T, 0, 0, 0, _Options> { };
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index /*size*/, Index rows, Index /*cols*/)
+      : m_rows(rows) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) noexcept {
+    numext::swap(m_rows, other.m_rows);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index rows, Index /*cols*/) {
+    m_rows = rows;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index rows, Index /*cols*/) {
+    m_rows = rows;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return Cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return m_rows * Cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return nullptr; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return nullptr; }
+};
+template <typename T, int Rows, int Options>
+class DenseStorage_impl<T, 0, Rows, Dynamic, Options> {
+  Index m_cols = 0;
 
-template<typename T, int _Cols, int _Options> class DenseStorage<T, 0, Dynamic, _Cols, _Options>
-: public DenseStorage<T, 0, 0, 0, _Options> { };
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index /*size*/, Index /*rows*/, Index cols)
+      : m_cols(cols) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) noexcept {
+    numext::swap(m_cols, other.m_cols);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index /*rows*/, Index cols) {
+    m_cols = cols;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index /*rows*/, Index cols) {
+    m_cols = cols;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return Rows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return Rows * m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return nullptr; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return nullptr; }
+};
+template <typename T, int Options>
+class DenseStorage_impl<T, 0, Dynamic, Dynamic, Options> {
+  Index m_rows = 0;
+  Index m_cols = 0;
 
-// dynamic-size matrix with fixed-size storage
-template<typename T, int Size, int _Options> class DenseStorage<T, Size, Dynamic, Dynamic, _Options>
-{
-    internal::plain_array<T,Size,_Options> m_data;
-    DenseIndex m_rows;
-    DenseIndex m_cols;
-  public:
-    DenseStorage() : m_rows(0), m_cols(0) {}
-    DenseStorage(internal::constructor_without_unaligned_array_assert)
-      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0), m_cols(0) {}
-    DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows), m_cols(other.m_cols) {}
-    DenseStorage& operator=(const DenseStorage& other)
-    {
-      if (this != &other)
-      {
-        m_data = other.m_data;
-        m_rows = other.m_rows;
-        m_cols = other.m_cols;
-      }
-      return *this;
-    }
-    DenseStorage(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) : m_rows(nbRows), m_cols(nbCols) {}
-    void swap(DenseStorage& other)
-    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
-    DenseIndex rows() const {return m_rows;}
-    DenseIndex cols() const {return m_cols;}
-    void conservativeResize(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) { m_rows = nbRows; m_cols = nbCols; }
-    void resize(DenseIndex, DenseIndex nbRows, DenseIndex nbCols) { m_rows = nbRows; m_cols = nbCols; }
-    const T *data() const { return m_data.array; }
-    T *data() { return m_data.array; }
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index /*size*/, Index rows, Index cols)
+      : m_rows(rows), m_cols(cols) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) noexcept {
+    numext::swap(m_rows, other.m_rows);
+    numext::swap(m_cols, other.m_cols);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index /*size*/, Index rows, Index cols) {
+    m_rows = rows;
+    m_cols = cols;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index /*size*/, Index rows, Index cols) {
+    m_rows = rows;
+    m_cols = cols;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return m_rows * m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return nullptr; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return nullptr; }
 };
+// fixed-size matrix with dynamic memory allocation not currently supported
+template <typename T, int Rows, int Cols, int Options>
+class DenseStorage_impl<T, Dynamic, Rows, Cols, Options> {};
+// dynamic-sized variants
+template <typename T, int Cols, int Options>
+class DenseStorage_impl<T, Dynamic, Dynamic, Cols, Options> {
+  static constexpr bool Align = (Options & DontAlign) == 0;
+  T* m_data = nullptr;
+  Index m_rows = 0;
 
-// dynamic-size matrix with fixed-size storage and fixed width
-template<typename T, int Size, int _Cols, int _Options> class DenseStorage<T, Size, Dynamic, _Cols, _Options>
-{
-    internal::plain_array<T,Size,_Options> m_data;
-    DenseIndex m_rows;
-  public:
-    DenseStorage() : m_rows(0) {}
-    DenseStorage(internal::constructor_without_unaligned_array_assert)
-      : m_data(internal::constructor_without_unaligned_array_assert()), m_rows(0) {}
-    DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_rows(other.m_rows) {}
-    DenseStorage& operator=(const DenseStorage& other)
-    {
-      if (this != &other)
-      {
-        m_data = other.m_data;
-        m_rows = other.m_rows;
-      }
-      return *this;
+ public:
+  static constexpr int Size = Dynamic;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other)
+      : m_data(conditional_aligned_new_auto<T, Align>(other.size())), m_rows(other.m_rows) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = other.size())
+    smart_copy(other.m_data, other.m_data + other.size(), m_data);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index size, Index rows, Index /*cols*/)
+      : m_data(conditional_aligned_new_auto<T, Align>(size)), m_rows(rows) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(DenseStorage_impl&& other) noexcept
+      : m_data(other.m_data), m_rows(other.m_rows) {
+    other.m_data = nullptr;
+    other.m_rows = 0;
+  }
+  EIGEN_DEVICE_FUNC ~DenseStorage_impl() { conditional_aligned_delete_auto<T, Align>(m_data, size()); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl& other) {
+    resize(other.size(), other.rows(), other.cols());
+    smart_copy(other.m_data, other.m_data + other.size(), m_data);
+    return *this;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(DenseStorage_impl&& other) noexcept {
+    this->swap(other);
+    return *this;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) noexcept {
+    numext::swap(m_data, other.m_data);
+    numext::swap(m_rows, other.m_rows);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index size, Index rows, Index /*cols*/) {
+    m_data = conditional_aligned_realloc_new_auto<T, Align>(m_data, size, this->size());
+    m_rows = rows;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index size, Index rows, Index /*cols*/) {
+    Index oldSize = this->size();
+    if (oldSize != size) {
+      conditional_aligned_delete_auto<T, Align>(m_data, oldSize);
+      m_data = conditional_aligned_new_auto<T, Align>(size);
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
     }
-    DenseStorage(DenseIndex, DenseIndex nbRows, DenseIndex) : m_rows(nbRows) {}
-    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
-    DenseIndex rows(void) const {return m_rows;}
-    DenseIndex cols(void) const {return _Cols;}
-    void conservativeResize(DenseIndex, DenseIndex nbRows, DenseIndex) { m_rows = nbRows; }
-    void resize(DenseIndex, DenseIndex nbRows, DenseIndex) { m_rows = nbRows; }
-    const T *data() const { return m_data.array; }
-    T *data() { return m_data.array; }
+    m_rows = rows;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return Cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return m_rows * Cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data; }
 };
+template <typename T, int Rows, int Options>
+class DenseStorage_impl<T, Dynamic, Rows, Dynamic, Options> {
+  static constexpr bool Align = (Options & DontAlign) == 0;
+  T* m_data = nullptr;
+  Index m_cols = 0;
 
-// dynamic-size matrix with fixed-size storage and fixed height
-template<typename T, int Size, int _Rows, int _Options> class DenseStorage<T, Size, _Rows, Dynamic, _Options>
-{
-    internal::plain_array<T,Size,_Options> m_data;
-    DenseIndex m_cols;
-  public:
-    DenseStorage() : m_cols(0) {}
-    DenseStorage(internal::constructor_without_unaligned_array_assert)
-      : m_data(internal::constructor_without_unaligned_array_assert()), m_cols(0) {}
-    DenseStorage(const DenseStorage& other) : m_data(other.m_data), m_cols(other.m_cols) {}
-    DenseStorage& operator=(const DenseStorage& other)
-    {
-      if (this != &other)
-      {
-        m_data = other.m_data;
-        m_cols = other.m_cols;
-      }
-      return *this;
+ public:
+  static constexpr int Size = Dynamic;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other)
+      : m_data(conditional_aligned_new_auto<T, Align>(other.size())), m_cols(other.m_cols) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = other.size())
+    smart_copy(other.m_data, other.m_data + other.size(), m_data);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index size, Index /*rows*/, Index cols)
+      : m_data(conditional_aligned_new_auto<T, Align>(size)), m_cols(cols) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(DenseStorage_impl&& other) noexcept
+      : m_data(other.m_data), m_cols(other.m_cols) {
+    other.m_data = nullptr;
+    other.m_cols = 0;
+  }
+  EIGEN_DEVICE_FUNC ~DenseStorage_impl() { conditional_aligned_delete_auto<T, Align>(m_data, size()); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl& other) {
+    resize(other.size(), other.rows(), other.cols());
+    smart_copy(other.m_data, other.m_data + other.size(), m_data);
+    return *this;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(DenseStorage_impl&& other) noexcept {
+    this->swap(other);
+    return *this;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) noexcept {
+    numext::swap(m_data, other.m_data);
+    numext::swap(m_cols, other.m_cols);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index size, Index /*rows*/, Index cols) {
+    m_data = conditional_aligned_realloc_new_auto<T, Align>(m_data, size, this->size());
+    m_cols = cols;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index size, Index /*rows*/, Index cols) {
+    Index oldSize = this->size();
+    if (oldSize != size) {
+      conditional_aligned_delete_auto<T, Align>(m_data, oldSize);
+      m_data = conditional_aligned_new_auto<T, Align>(size);
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
     }
-    DenseStorage(DenseIndex, DenseIndex, DenseIndex nbCols) : m_cols(nbCols) {}
-    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
-    DenseIndex rows(void) const {return _Rows;}
-    DenseIndex cols(void) const {return m_cols;}
-    void conservativeResize(DenseIndex, DenseIndex, DenseIndex nbCols) { m_cols = nbCols; }
-    void resize(DenseIndex, DenseIndex, DenseIndex nbCols) { m_cols = nbCols; }
-    const T *data() const { return m_data.array; }
-    T *data() { return m_data.array; }
+    m_cols = cols;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return Rows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return Rows * m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data; }
 };
+template <typename T, int Options>
+class DenseStorage_impl<T, Dynamic, Dynamic, Dynamic, Options> {
+  static constexpr bool Align = (Options & DontAlign) == 0;
+  T* m_data = nullptr;
+  Index m_rows = 0;
+  Index m_cols = 0;
 
-// purely dynamic matrix.
-template<typename T, int _Options> class DenseStorage<T, Dynamic, Dynamic, Dynamic, _Options>
-{
-    T *m_data;
-    DenseIndex m_rows;
-    DenseIndex m_cols;
-  public:
-    DenseStorage() : m_data(0), m_rows(0), m_cols(0) {}
-    DenseStorage(internal::constructor_without_unaligned_array_assert)
-       : m_data(0), m_rows(0), m_cols(0) {}
-    DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
-      : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows), m_cols(nbCols)
-    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
-    DenseStorage(DenseStorage&& other)
-      : m_data(std::move(other.m_data))
-      , m_rows(std::move(other.m_rows))
-      , m_cols(std::move(other.m_cols))
-    {
-      other.m_data = nullptr;
-    }
-    DenseStorage& operator=(DenseStorage&& other)
-    {
-      using std::swap;
-      swap(m_data, other.m_data);
-      swap(m_rows, other.m_rows);
-      swap(m_cols, other.m_cols);
-      return *this;
-    }
-#endif
-    ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols); }
-    void swap(DenseStorage& other)
-    { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); std::swap(m_cols,other.m_cols); }
-    DenseIndex rows(void) const {return m_rows;}
-    DenseIndex cols(void) const {return m_cols;}
-    void conservativeResize(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
-    {
-      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*m_cols);
-      m_rows = nbRows;
-      m_cols = nbCols;
-    }
-    void resize(DenseIndex size, DenseIndex nbRows, DenseIndex nbCols)
-    {
-      if(size != m_rows*m_cols)
-      {
-        internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, m_rows*m_cols);
-        if (size)
-          m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
-        else
-          m_data = 0;
-        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
-      }
-      m_rows = nbRows;
-      m_cols = nbCols;
+ public:
+  static constexpr int Size = Dynamic;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(const DenseStorage_impl& other)
+      : m_data(conditional_aligned_new_auto<T, Align>(other.size())), m_rows(other.m_rows), m_cols(other.m_cols) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = other.size())
+    smart_copy(other.m_data, other.m_data + other.size(), m_data);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(Index size, Index rows, Index cols)
+      : m_data(conditional_aligned_new_auto<T, Align>(size)), m_rows(rows), m_cols(cols) {
+    EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl(DenseStorage_impl&& other) noexcept
+      : m_data(other.m_data), m_rows(other.m_rows), m_cols(other.m_cols) {
+    other.m_data = nullptr;
+    other.m_rows = 0;
+    other.m_cols = 0;
+  }
+  EIGEN_DEVICE_FUNC ~DenseStorage_impl() { conditional_aligned_delete_auto<T, Align>(m_data, size()); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(const DenseStorage_impl& other) {
+    resize(other.size(), other.rows(), other.cols());
+    smart_copy(other.m_data, other.m_data + other.size(), m_data);
+    return *this;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage_impl& operator=(DenseStorage_impl&& other) noexcept {
+    this->swap(other);
+    return *this;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void swap(DenseStorage_impl& other) noexcept {
+    numext::swap(m_data, other.m_data);
+    numext::swap(m_rows, other.m_rows);
+    numext::swap(m_cols, other.m_cols);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void conservativeResize(Index size, Index rows, Index cols) {
+    m_data = conditional_aligned_realloc_new_auto<T, Align>(m_data, size, this->size());
+    m_rows = rows;
+    m_cols = cols;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index size, Index rows, Index cols) {
+    Index oldSize = this->size();
+    if (oldSize != size) {
+      conditional_aligned_delete_auto<T, Align>(m_data, oldSize);
+      m_data = conditional_aligned_new_auto<T, Align>(size);
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
     }
-    const T *data() const { return m_data; }
-    T *data() { return m_data; }
-  private:
-    DenseStorage(const DenseStorage&);
-    DenseStorage& operator=(const DenseStorage&);
+    m_rows = rows;
+    m_cols = cols;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const { return m_rows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const { return m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index size() const { return m_rows * m_cols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return m_data; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return m_data; }
+};
+template <typename T, int Size, int Rows, int Cols>
+struct use_default_move {
+  static constexpr bool DynamicObject = Size == Dynamic;
+  static constexpr bool TrivialObject =
+      (!NumTraits<T>::RequireInitialization) && (Rows >= 0) && (Cols >= 0) && (Size == Rows * Cols);
+  static constexpr bool value = DynamicObject || TrivialObject;
 };
+}  // end namespace internal
 
-// matrix with dynamic width and fixed height (so that matrix has dynamic size).
-template<typename T, int _Rows, int _Options> class DenseStorage<T, Dynamic, _Rows, Dynamic, _Options>
-{
-    T *m_data;
-    DenseIndex m_cols;
-  public:
-    DenseStorage() : m_data(0), m_cols(0) {}
-    DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_cols(0) {}
-    DenseStorage(DenseIndex size, DenseIndex, DenseIndex nbCols) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_cols(nbCols)
-    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
-    DenseStorage(DenseStorage&& other)
-      : m_data(std::move(other.m_data))
-      , m_cols(std::move(other.m_cols))
-    {
-      other.m_data = nullptr;
-    }
-    DenseStorage& operator=(DenseStorage&& other)
-    {
-      using std::swap;
-      swap(m_data, other.m_data);
-      swap(m_cols, other.m_cols);
-      return *this;
-    }
-#endif
-    ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols); }
-    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_cols,other.m_cols); }
-    static DenseIndex rows(void) {return _Rows;}
-    DenseIndex cols(void) const {return m_cols;}
-    void conservativeResize(DenseIndex size, DenseIndex, DenseIndex nbCols)
-    {
-      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, _Rows*m_cols);
-      m_cols = nbCols;
-    }
-    EIGEN_STRONG_INLINE void resize(DenseIndex size, DenseIndex, DenseIndex nbCols)
-    {
-      if(size != _Rows*m_cols)
-      {
-        internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Rows*m_cols);
-        if (size)
-          m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
-        else
-          m_data = 0;
-        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
-      }
-      m_cols = nbCols;
-    }
-    const T *data() const { return m_data; }
-    T *data() { return m_data; }
-  private:
-    DenseStorage(const DenseStorage&);
-    DenseStorage& operator=(const DenseStorage&);
+/** \internal
+ *
+ * \class DenseStorage_impl
+ * \ingroup Core_Module
+ *
+ * \brief Stores the data of a matrix
+ *
+ * This class stores the data of fixed-size, dynamic-size or mixed matrices
+ * in a way as compact as possible.
+ *
+ * \sa Matrix
+ */
+template <typename T, int Size, int Rows, int Cols, int Options,
+          bool Trivial = internal::use_default_move<T, Size, Rows, Cols>::value>
+class DenseStorage : public internal::DenseStorage_impl<T, Size, Rows, Cols, Options> {
+  using Base = internal::DenseStorage_impl<T, Size, Rows, Cols, Options>;
+
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage(const DenseStorage&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage(Index size, Index rows, Index cols)
+      : Base(size, rows, cols) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage& operator=(const DenseStorage&) = default;
+  // if DenseStorage meets the requirements of use_default_move, then use the move construction and move assignment
+  // operation defined in DenseStorage_impl, or the compiler-generated version if none is defined
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage(DenseStorage&&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage& operator=(DenseStorage&&) = default;
 };
+template <typename T, int Size, int Rows, int Cols, int Options>
+class DenseStorage<T, Size, Rows, Cols, Options, false>
+    : public internal::DenseStorage_impl<T, Size, Rows, Cols, Options> {
+  using Base = internal::DenseStorage_impl<T, Size, Rows, Cols, Options>;
 
-// matrix with dynamic height and fixed width (so that matrix has dynamic size).
-template<typename T, int _Cols, int _Options> class DenseStorage<T, Dynamic, Dynamic, _Cols, _Options>
-{
-    T *m_data;
-    DenseIndex m_rows;
-  public:
-    DenseStorage() : m_data(0), m_rows(0) {}
-    DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(0), m_rows(0) {}
-    DenseStorage(DenseIndex size, DenseIndex nbRows, DenseIndex) : m_data(internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size)), m_rows(nbRows)
-    { EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN }
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
-    DenseStorage(DenseStorage&& other)
-      : m_data(std::move(other.m_data))
-      , m_rows(std::move(other.m_rows))
-    {
-      other.m_data = nullptr;
-    }
-    DenseStorage& operator=(DenseStorage&& other)
-    {
-      using std::swap;
-      swap(m_data, other.m_data);
-      swap(m_rows, other.m_rows);
-      return *this;
-    }
-#endif
-    ~DenseStorage() { internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows); }
-    void swap(DenseStorage& other) { std::swap(m_data,other.m_data); std::swap(m_rows,other.m_rows); }
-    DenseIndex rows(void) const {return m_rows;}
-    static DenseIndex cols(void) {return _Cols;}
-    void conservativeResize(DenseIndex size, DenseIndex nbRows, DenseIndex)
-    {
-      m_data = internal::conditional_aligned_realloc_new_auto<T,(_Options&DontAlign)==0>(m_data, size, m_rows*_Cols);
-      m_rows = nbRows;
-    }
-    EIGEN_STRONG_INLINE void resize(DenseIndex size, DenseIndex nbRows, DenseIndex)
-    {
-      if(size != m_rows*_Cols)
-      {
-        internal::conditional_aligned_delete_auto<T,(_Options&DontAlign)==0>(m_data, _Cols*m_rows);
-        if (size)
-          m_data = internal::conditional_aligned_new_auto<T,(_Options&DontAlign)==0>(size);
-        else
-          m_data = 0;
-        EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN
-      }
-      m_rows = nbRows;
-    }
-    const T *data() const { return m_data; }
-    T *data() { return m_data; }
-  private:
-    DenseStorage(const DenseStorage&);
-    DenseStorage& operator=(const DenseStorage&);
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage(const DenseStorage&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage(Index size, Index rows, Index cols)
+      : Base(size, rows, cols) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage& operator=(const DenseStorage&) = default;
+  // if DenseStorage does not meet the requirements of use_default_move, then defer to the copy construction and copy
+  // assignment behavior
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage(DenseStorage&& other)
+      : DenseStorage(static_cast<const DenseStorage&>(other)) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr DenseStorage& operator=(DenseStorage&& other) {
+    *this = other;
+    return *this;
+  }
 };
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_MATRIX_H
+#endif  // EIGEN_MATRIX_H
diff --git a/inst/include/Eigen/src/Core/DeviceWrapper.h b/inst/include/Eigen/src/Core/DeviceWrapper.h
new file mode 100644
index 00000000..012dce10
--- /dev/null
+++ b/inst/include/Eigen/src/Core/DeviceWrapper.h
@@ -0,0 +1,153 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_DEVICEWRAPPER_H
+#define EIGEN_DEVICEWRAPPER_H
+
+namespace Eigen {
+template <typename Derived, typename Device>
+struct DeviceWrapper {
+  using Base = EigenBase<internal::remove_all_t<Derived>>;
+  using Scalar = typename Derived::Scalar;
+
+  EIGEN_DEVICE_FUNC DeviceWrapper(Base& xpr, Device& device) : m_xpr(xpr.derived()), m_device(device) {}
+  EIGEN_DEVICE_FUNC DeviceWrapper(const Base& xpr, Device& device) : m_xpr(xpr.derived()), m_device(device) {}
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived>& other) {
+    using AssignOp = internal::assign_op<Scalar, typename OtherDerived::Scalar>;
+    internal::call_assignment(*this, other.derived(), AssignOp());
+    return m_xpr;
+  }
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const EigenBase<OtherDerived>& other) {
+    using AddAssignOp = internal::add_assign_op<Scalar, typename OtherDerived::Scalar>;
+    internal::call_assignment(*this, other.derived(), AddAssignOp());
+    return m_xpr;
+  }
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const EigenBase<OtherDerived>& other) {
+    using SubAssignOp = internal::sub_assign_op<Scalar, typename OtherDerived::Scalar>;
+    internal::call_assignment(*this, other.derived(), SubAssignOp());
+    return m_xpr;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& derived() { return m_xpr; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Device& device() { return m_device; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NoAlias<DeviceWrapper, EigenBase> noalias() {
+    return NoAlias<DeviceWrapper, EigenBase>(*this);
+  }
+
+  Derived& m_xpr;
+  Device& m_device;
+};
+
+namespace internal {
+
+// this is where we differentiate between lazy assignment and specialized kernels (e.g. matrix products)
+template <typename DstXprType, typename SrcXprType, typename Functor, typename Device,
+          typename Kind = typename AssignmentKind<typename evaluator_traits<DstXprType>::Shape,
+                                                  typename evaluator_traits<SrcXprType>::Shape>::Kind,
+          typename EnableIf = void>
+struct AssignmentWithDevice;
+
+// unless otherwise specified, use the default product implementation
+template <typename DstXprType, typename Lhs, typename Rhs, int Options, typename Functor, typename Device,
+          typename Weak>
+struct AssignmentWithDevice<DstXprType, Product<Lhs, Rhs, Options>, Functor, Device, Dense2Dense, Weak> {
+  using SrcXprType = Product<Lhs, Rhs, Options>;
+  using Base = Assignment<DstXprType, SrcXprType, Functor>;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src, const Functor& func,
+                                                        Device&) {
+    Base::run(dst, src, func);
+  }
+};
+
+// specialization for coeffcient-wise assignment
+template <typename DstXprType, typename SrcXprType, typename Functor, typename Device, typename Weak>
+struct AssignmentWithDevice<DstXprType, SrcXprType, Functor, Device, Dense2Dense, Weak> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src, const Functor& func,
+                                                        Device& device) {
+#ifndef EIGEN_NO_DEBUG
+    internal::check_for_aliasing(dst, src);
+#endif
+
+    call_dense_assignment_loop(dst, src, func, device);
+  }
+};
+
+// this allows us to use the default evaluation scheme if it is not specialized for the device
+template <typename Kernel, typename Device, int Traversal = Kernel::AssignmentTraits::Traversal,
+          int Unrolling = Kernel::AssignmentTraits::Unrolling>
+struct dense_assignment_loop_with_device {
+  using Base = dense_assignment_loop<Kernel, Traversal, Unrolling>;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Kernel& kernel, Device&) { Base::run(kernel); }
+};
+
+// entry point for a generic expression with device
+template <typename Dst, typename Src, typename Func, typename Device>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_assignment_no_alias(DeviceWrapper<Dst, Device> dst,
+                                                                              const Src& src, const Func& func) {
+  enum {
+    NeedToTranspose = ((int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1) ||
+                       (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)) &&
+                      int(Dst::SizeAtCompileTime) != 1
+  };
+
+  using ActualDstTypeCleaned = std::conditional_t<NeedToTranspose, Transpose<Dst>, Dst>;
+  using ActualDstType = std::conditional_t<NeedToTranspose, Transpose<Dst>, Dst&>;
+  ActualDstType actualDst(dst.derived());
+
+  // TODO check whether this is the right place to perform these checks:
+  EIGEN_STATIC_ASSERT_LVALUE(Dst)
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned, Src)
+  EIGEN_CHECK_BINARY_COMPATIBILIY(Func, typename ActualDstTypeCleaned::Scalar, typename Src::Scalar);
+
+  // this provides a mechanism for specializing simple assignments, matrix products, etc
+  AssignmentWithDevice<ActualDstTypeCleaned, Src, Func, Device>::run(actualDst, src, func, dst.device());
+}
+
+// copy and pasted from AssignEvaluator except forward device to kernel
+template <typename DstXprType, typename SrcXprType, typename Functor, typename Device>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src,
+                                                                                const Functor& func, Device& device) {
+  using DstEvaluatorType = evaluator<DstXprType>;
+  using SrcEvaluatorType = evaluator<SrcXprType>;
+
+  SrcEvaluatorType srcEvaluator(src);
+
+  // NOTE To properly handle A = (A*A.transpose())/s with A rectangular,
+  // we need to resize the destination after the source evaluator has been created.
+  resize_if_allowed(dst, src, func);
+
+  DstEvaluatorType dstEvaluator(dst);
+
+  using Kernel = generic_dense_assignment_kernel<DstEvaluatorType, SrcEvaluatorType, Functor>;
+
+  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
+
+  dense_assignment_loop_with_device<Kernel, Device>::run(kernel, device);
+}
+
+}  // namespace internal
+
+template <typename Derived>
+template <typename Device>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DeviceWrapper<Derived, Device> EigenBase<Derived>::device(Device& device) {
+  return DeviceWrapper<Derived, Device>(derived(), device);
+}
+
+template <typename Derived>
+template <typename Device>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DeviceWrapper<const Derived, Device> EigenBase<Derived>::device(
+    Device& device) const {
+  return DeviceWrapper<const Derived, Device>(derived(), device);
+}
+}  // namespace Eigen
+#endif
diff --git a/inst/include/Eigen/src/Core/Diagonal.h b/inst/include/Eigen/src/Core/Diagonal.h
index 68cf6d4b..ff8611c6 100644
--- a/inst/include/Eigen/src/Core/Diagonal.h
+++ b/inst/include/Eigen/src/Core/Diagonal.h
@@ -11,227 +11,209 @@
 #ifndef EIGEN_DIAGONAL_H
 #define EIGEN_DIAGONAL_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \class Diagonal
-  * \ingroup Core_Module
-  *
-  * \brief Expression of a diagonal/subdiagonal/superdiagonal in a matrix
-  *
-  * \param MatrixType the type of the object in which we are taking a sub/main/super diagonal
-  * \param DiagIndex the index of the sub/super diagonal. The default is 0 and it means the main diagonal.
-  *              A positive value means a superdiagonal, a negative value means a subdiagonal.
-  *              You can also use Dynamic so the index can be set at runtime.
-  *
-  * The matrix is not required to be square.
-  *
-  * This class represents an expression of the main diagonal, or any sub/super diagonal
-  * of a square matrix. It is the return type of MatrixBase::diagonal() and MatrixBase::diagonal(Index) and most of the
-  * time this is the only way it is used.
-  *
-  * \sa MatrixBase::diagonal(), MatrixBase::diagonal(Index)
-  */
+ * \ingroup Core_Module
+ *
+ * \brief Expression of a diagonal/subdiagonal/superdiagonal in a matrix
+ *
+ * \tparam MatrixType the type of the object in which we are taking a sub/main/super diagonal
+ * \tparam DiagIndex the index of the sub/super diagonal. The default is 0 and it means the main diagonal.
+ *              A positive value means a superdiagonal, a negative value means a subdiagonal.
+ *              You can also use DynamicIndex so the index can be set at runtime.
+ *
+ * The matrix is not required to be square.
+ *
+ * This class represents an expression of the main diagonal, or any sub/super diagonal
+ * of a square matrix. It is the return type of MatrixBase::diagonal() and MatrixBase::diagonal(Index) and most of the
+ * time this is the only way it is used.
+ *
+ * \sa MatrixBase::diagonal(), MatrixBase::diagonal(Index)
+ */
 
 namespace internal {
-template<typename MatrixType, int DiagIndex>
-struct traits<Diagonal<MatrixType,DiagIndex> >
- : traits<MatrixType>
-{
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
-  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
+template <typename MatrixType, int DiagIndex>
+struct traits<Diagonal<MatrixType, DiagIndex> > : traits<MatrixType> {
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
+  typedef std::remove_reference_t<MatrixTypeNested> MatrixTypeNested_;
   typedef typename MatrixType::StorageKind StorageKind;
   enum {
-    RowsAtCompileTime = (int(DiagIndex) == DynamicIndex || int(MatrixType::SizeAtCompileTime) == Dynamic) ? Dynamic
-                      : (EIGEN_PLAIN_ENUM_MIN(MatrixType::RowsAtCompileTime - EIGEN_PLAIN_ENUM_MAX(-DiagIndex, 0),
-                                              MatrixType::ColsAtCompileTime - EIGEN_PLAIN_ENUM_MAX( DiagIndex, 0))),
+    RowsAtCompileTime = (int(DiagIndex) == DynamicIndex || int(MatrixType::SizeAtCompileTime) == Dynamic)
+                            ? Dynamic
+                            : (plain_enum_min(MatrixType::RowsAtCompileTime - plain_enum_max(-DiagIndex, 0),
+                                              MatrixType::ColsAtCompileTime - plain_enum_max(DiagIndex, 0))),
     ColsAtCompileTime = 1,
-    MaxRowsAtCompileTime = int(MatrixType::MaxSizeAtCompileTime) == Dynamic ? Dynamic
-                         : DiagIndex == DynamicIndex ? EIGEN_SIZE_MIN_PREFER_FIXED(MatrixType::MaxRowsAtCompileTime,
-                                                                              MatrixType::MaxColsAtCompileTime)
-                         : (EIGEN_PLAIN_ENUM_MIN(MatrixType::MaxRowsAtCompileTime - EIGEN_PLAIN_ENUM_MAX(-DiagIndex, 0),
-                                                 MatrixType::MaxColsAtCompileTime - EIGEN_PLAIN_ENUM_MAX( DiagIndex, 0))),
+    MaxRowsAtCompileTime =
+        int(MatrixType::MaxSizeAtCompileTime) == Dynamic ? Dynamic
+        : DiagIndex == DynamicIndex
+            ? min_size_prefer_fixed(MatrixType::MaxRowsAtCompileTime, MatrixType::MaxColsAtCompileTime)
+            : (plain_enum_min(MatrixType::MaxRowsAtCompileTime - plain_enum_max(-DiagIndex, 0),
+                              MatrixType::MaxColsAtCompileTime - plain_enum_max(DiagIndex, 0))),
     MaxColsAtCompileTime = 1,
     MaskLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
-    Flags = (unsigned int)_MatrixTypeNested::Flags & (HereditaryBits | LinearAccessBit | MaskLvalueBit | DirectAccessBit) & ~RowMajorBit,
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost,
+    Flags = (unsigned int)MatrixTypeNested_::Flags & (RowMajorBit | MaskLvalueBit | DirectAccessBit) &
+            ~RowMajorBit,  // FIXME DirectAccessBit should not be handled by expressions
     MatrixTypeOuterStride = outer_stride_at_compile_time<MatrixType>::ret,
-    InnerStrideAtCompileTime = MatrixTypeOuterStride == Dynamic ? Dynamic : MatrixTypeOuterStride+1,
+    InnerStrideAtCompileTime = MatrixTypeOuterStride == Dynamic ? Dynamic : MatrixTypeOuterStride + 1,
     OuterStrideAtCompileTime = 0
   };
 };
-}
-
-template<typename MatrixType, int _DiagIndex> class Diagonal
-   : public internal::dense_xpr_base< Diagonal<MatrixType,_DiagIndex> >::type
-{
-  public:
-
-    enum { DiagIndex = _DiagIndex };
-    typedef typename internal::dense_xpr_base<Diagonal>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)
-
-    inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {}
-
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)
-
-    inline Index rows() const
-    { return m_index.value()<0 ? (std::min<Index>)(m_matrix.cols(),m_matrix.rows()+m_index.value()) : (std::min<Index>)(m_matrix.rows(),m_matrix.cols()-m_index.value()); }
-
-    inline Index cols() const { return 1; }
-
-    inline Index innerStride() const
-    {
-      return m_matrix.outerStride() + 1;
-    }
-
-    inline Index outerStride() const
-    {
-      return 0;
-    }
-
-    typedef typename internal::conditional<
-                       internal::is_lvalue<MatrixType>::value,
-                       Scalar,
-                       const Scalar
-                     >::type ScalarWithConstIfNotLvalue;
-
-    inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
-    inline const Scalar* data() const { return &(m_matrix.const_cast_derived().coeffRef(rowOffset(), colOffset())); }
-
-    inline Scalar& coeffRef(Index row, Index)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
-    }
-
-    inline const Scalar& coeffRef(Index row, Index) const
-    {
-      return m_matrix.const_cast_derived().coeffRef(row+rowOffset(), row+colOffset());
-    }
-
-    inline CoeffReturnType coeff(Index row, Index) const
-    {
-      return m_matrix.coeff(row+rowOffset(), row+colOffset());
-    }
-
-    inline Scalar& coeffRef(Index idx)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
-    }
-
-    inline const Scalar& coeffRef(Index idx) const
-    {
-      return m_matrix.const_cast_derived().coeffRef(idx+rowOffset(), idx+colOffset());
-    }
-
-    inline CoeffReturnType coeff(Index idx) const
-    {
-      return m_matrix.coeff(idx+rowOffset(), idx+colOffset());
-    }
-
-    const typename internal::remove_all<typename MatrixType::Nested>::type& 
-    nestedExpression() const 
-    {
-      return m_matrix;
-    }
-
-    int index() const
-    {
-      return m_index.value();
-    }
-
-  protected:
-    typename MatrixType::Nested m_matrix;
-    const internal::variable_if_dynamicindex<Index, DiagIndex> m_index;
-
-  private:
-    // some compilers may fail to optimize std::max etc in case of compile-time constants...
-    EIGEN_STRONG_INLINE Index absDiagIndex() const { return m_index.value()>0 ? m_index.value() : -m_index.value(); }
-    EIGEN_STRONG_INLINE Index rowOffset() const { return m_index.value()>0 ? 0 : -m_index.value(); }
-    EIGEN_STRONG_INLINE Index colOffset() const { return m_index.value()>0 ? m_index.value() : 0; }
-    // triger a compile time error is someone try to call packet
-    template<int LoadMode> typename MatrixType::PacketReturnType packet(Index) const;
-    template<int LoadMode> typename MatrixType::PacketReturnType packet(Index,Index) const;
+}  // namespace internal
+
+template <typename MatrixType, int DiagIndex_>
+class Diagonal : public internal::dense_xpr_base<Diagonal<MatrixType, DiagIndex_> >::type {
+ public:
+  enum { DiagIndex = DiagIndex_ };
+  typedef typename internal::dense_xpr_base<Diagonal>::type Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal)
+
+  EIGEN_DEVICE_FUNC explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex)
+      : m_matrix(matrix), m_index(a_index) {
+    eigen_assert(a_index <= m_matrix.cols() && -a_index <= m_matrix.rows());
+  }
+
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal)
+
+  EIGEN_DEVICE_FUNC inline Index rows() const {
+    return m_index.value() < 0 ? numext::mini<Index>(m_matrix.cols(), m_matrix.rows() + m_index.value())
+                               : numext::mini<Index>(m_matrix.rows(), m_matrix.cols() - m_index.value());
+  }
+
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return 1; }
+
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_matrix.outerStride() + 1; }
+
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return 0; }
+
+  typedef std::conditional_t<internal::is_lvalue<MatrixType>::value, Scalar, const Scalar> ScalarWithConstIfNotLvalue;
+
+  EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue* data() { return &(m_matrix.coeffRef(rowOffset(), colOffset())); }
+  EIGEN_DEVICE_FUNC inline const Scalar* data() const { return &(m_matrix.coeffRef(rowOffset(), colOffset())); }
+
+  EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index) {
+    EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+    return m_matrix.coeffRef(row + rowOffset(), row + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index row, Index) const {
+    return m_matrix.coeffRef(row + rowOffset(), row + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index row, Index) const {
+    return m_matrix.coeff(row + rowOffset(), row + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index idx) {
+    EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
+    return m_matrix.coeffRef(idx + rowOffset(), idx + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index idx) const {
+    return m_matrix.coeffRef(idx + rowOffset(), idx + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC inline CoeffReturnType coeff(Index idx) const {
+    return m_matrix.coeff(idx + rowOffset(), idx + colOffset());
+  }
+
+  EIGEN_DEVICE_FUNC inline const internal::remove_all_t<typename MatrixType::Nested>& nestedExpression() const {
+    return m_matrix;
+  }
+
+  EIGEN_DEVICE_FUNC inline Index index() const { return m_index.value(); }
+
+ protected:
+  typename internal::ref_selector<MatrixType>::non_const_type m_matrix;
+  const internal::variable_if_dynamicindex<Index, DiagIndex> m_index;
+
+ private:
+  // some compilers may fail to optimize std::max etc in case of compile-time constants...
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index absDiagIndex() const noexcept {
+    return m_index.value() > 0 ? m_index.value() : -m_index.value();
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rowOffset() const noexcept {
+    return m_index.value() > 0 ? 0 : -m_index.value();
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index colOffset() const noexcept {
+    return m_index.value() > 0 ? m_index.value() : 0;
+  }
+  // trigger a compile-time error if someone try to call packet
+  template <int LoadMode>
+  typename MatrixType::PacketReturnType packet(Index) const;
+  template <int LoadMode>
+  typename MatrixType::PacketReturnType packet(Index, Index) const;
 };
 
 /** \returns an expression of the main diagonal of the matrix \c *this
-  *
-  * \c *this is not required to be square.
-  *
-  * Example: \include MatrixBase_diagonal.cpp
-  * Output: \verbinclude MatrixBase_diagonal.out
-  *
-  * \sa class Diagonal */
-template<typename Derived>
-inline typename MatrixBase<Derived>::DiagonalReturnType
-MatrixBase<Derived>::diagonal()
-{
-  return derived();
+ *
+ * \c *this is not required to be square.
+ *
+ * Example: \include MatrixBase_diagonal.cpp
+ * Output: \verbinclude MatrixBase_diagonal.out
+ *
+ * \sa class Diagonal */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::DiagonalReturnType MatrixBase<Derived>::diagonal() {
+  return DiagonalReturnType(derived());
 }
 
 /** This is the const version of diagonal(). */
-template<typename Derived>
-inline typename MatrixBase<Derived>::ConstDiagonalReturnType
-MatrixBase<Derived>::diagonal() const
-{
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::ConstDiagonalReturnType MatrixBase<Derived>::diagonal()
+    const {
   return ConstDiagonalReturnType(derived());
 }
 
 /** \returns an expression of the \a DiagIndex-th sub or super diagonal of the matrix \c *this
-  *
-  * \c *this is not required to be square.
-  *
-  * The template parameter \a DiagIndex represent a super diagonal if \a DiagIndex > 0
-  * and a sub diagonal otherwise. \a DiagIndex == 0 is equivalent to the main diagonal.
-  *
-  * Example: \include MatrixBase_diagonal_int.cpp
-  * Output: \verbinclude MatrixBase_diagonal_int.out
-  *
-  * \sa MatrixBase::diagonal(), class Diagonal */
-template<typename Derived>
-inline typename MatrixBase<Derived>::DiagonalDynamicIndexReturnType
-MatrixBase<Derived>::diagonal(Index index)
-{
-  return DiagonalDynamicIndexReturnType(derived(), index);
+ *
+ * \c *this is not required to be square.
+ *
+ * The template parameter \a DiagIndex represent a super diagonal if \a DiagIndex > 0
+ * and a sub diagonal otherwise. \a DiagIndex == 0 is equivalent to the main diagonal.
+ *
+ * Example: \include MatrixBase_diagonal_int.cpp
+ * Output: \verbinclude MatrixBase_diagonal_int.out
+ *
+ * \sa MatrixBase::diagonal(), class Diagonal */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline Diagonal<Derived, DynamicIndex> MatrixBase<Derived>::diagonal(Index index) {
+  return Diagonal<Derived, DynamicIndex>(derived(), index);
 }
 
 /** This is the const version of diagonal(Index). */
-template<typename Derived>
-inline typename MatrixBase<Derived>::ConstDiagonalDynamicIndexReturnType
-MatrixBase<Derived>::diagonal(Index index) const
-{
-  return ConstDiagonalDynamicIndexReturnType(derived(), index);
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline const Diagonal<const Derived, DynamicIndex> MatrixBase<Derived>::diagonal(Index index) const {
+  return Diagonal<const Derived, DynamicIndex>(derived(), index);
 }
 
 /** \returns an expression of the \a DiagIndex-th sub or super diagonal of the matrix \c *this
-  *
-  * \c *this is not required to be square.
-  *
-  * The template parameter \a DiagIndex represent a super diagonal if \a DiagIndex > 0
-  * and a sub diagonal otherwise. \a DiagIndex == 0 is equivalent to the main diagonal.
-  *
-  * Example: \include MatrixBase_diagonal_template_int.cpp
-  * Output: \verbinclude MatrixBase_diagonal_template_int.out
-  *
-  * \sa MatrixBase::diagonal(), class Diagonal */
-template<typename Derived>
-template<int Index>
-inline typename MatrixBase<Derived>::template DiagonalIndexReturnType<Index>::Type
-MatrixBase<Derived>::diagonal()
-{
-  return derived();
+ *
+ * \c *this is not required to be square.
+ *
+ * The template parameter \a DiagIndex represent a super diagonal if \a DiagIndex > 0
+ * and a sub diagonal otherwise. \a DiagIndex == 0 is equivalent to the main diagonal.
+ *
+ * Example: \include MatrixBase_diagonal_template_int.cpp
+ * Output: \verbinclude MatrixBase_diagonal_template_int.out
+ *
+ * \sa MatrixBase::diagonal(), class Diagonal */
+template <typename Derived>
+template <int Index_>
+EIGEN_DEVICE_FUNC inline Diagonal<Derived, Index_> MatrixBase<Derived>::diagonal() {
+  return Diagonal<Derived, Index_>(derived());
 }
 
 /** This is the const version of diagonal<int>(). */
-template<typename Derived>
-template<int Index>
-inline typename MatrixBase<Derived>::template ConstDiagonalIndexReturnType<Index>::Type
-MatrixBase<Derived>::diagonal() const
-{
-  return derived();
+template <typename Derived>
+template <int Index_>
+EIGEN_DEVICE_FUNC inline const Diagonal<const Derived, Index_> MatrixBase<Derived>::diagonal() const {
+  return Diagonal<const Derived, Index_>(derived());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_DIAGONAL_H
+#endif  // EIGEN_DIAGONAL_H
diff --git a/inst/include/Eigen/src/Core/DiagonalMatrix.h b/inst/include/Eigen/src/Core/DiagonalMatrix.h
index e6c220f4..52630d92 100644
--- a/inst/include/Eigen/src/Core/DiagonalMatrix.h
+++ b/inst/include/Eigen/src/Core/DiagonalMatrix.h
@@ -11,303 +11,410 @@
 #ifndef EIGEN_DIAGONALMATRIX_H
 #define EIGEN_DIAGONALMATRIX_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \class DiagonalBase
+ * \ingroup Core_Module
+ *
+ * \brief Base class for diagonal matrices and expressions
+ *
+ * This is the base class that is inherited by diagonal matrix and related expression
+ * types, which internally use a vector for storing the diagonal entries. Diagonal
+ * types always represent square matrices.
+ *
+ * \tparam Derived is the derived type, a DiagonalMatrix or DiagonalWrapper.
+ *
+ * \sa class DiagonalMatrix, class DiagonalWrapper
+ */
+template <typename Derived>
+class DiagonalBase : public EigenBase<Derived> {
+ public:
+  typedef typename internal::traits<Derived>::DiagonalVectorType DiagonalVectorType;
+  typedef typename DiagonalVectorType::Scalar Scalar;
+  typedef typename DiagonalVectorType::RealScalar RealScalar;
+  typedef typename internal::traits<Derived>::StorageKind StorageKind;
+  typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-template<typename Derived>
-class DiagonalBase : public EigenBase<Derived>
-{
-  public:
-    typedef typename internal::traits<Derived>::DiagonalVectorType DiagonalVectorType;
-    typedef typename DiagonalVectorType::Scalar Scalar;
-    typedef typename DiagonalVectorType::RealScalar RealScalar;
-    typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
-
-    enum {
-      RowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
-      ColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
-      MaxRowsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
-      MaxColsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
-      IsVectorAtCompileTime = 0,
-      Flags = 0
-    };
-
-    typedef Matrix<Scalar, RowsAtCompileTime, ColsAtCompileTime, 0, MaxRowsAtCompileTime, MaxColsAtCompileTime> DenseMatrixType;
-    typedef DenseMatrixType DenseType;
-    typedef DiagonalMatrix<Scalar,DiagonalVectorType::SizeAtCompileTime,DiagonalVectorType::MaxSizeAtCompileTime> PlainObject;
-
-    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
-    inline Derived& derived() { return *static_cast<Derived*>(this); }
-
-    DenseMatrixType toDenseMatrix() const { return derived(); }
-    template<typename DenseDerived>
-    void evalTo(MatrixBase<DenseDerived> &other) const;
-    template<typename DenseDerived>
-    void addTo(MatrixBase<DenseDerived> &other) const
-    { other.diagonal() += diagonal(); }
-    template<typename DenseDerived>
-    void subTo(MatrixBase<DenseDerived> &other) const
-    { other.diagonal() -= diagonal(); }
-
-    inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
-    inline DiagonalVectorType& diagonal() { return derived().diagonal(); }
-
-    inline Index rows() const { return diagonal().size(); }
-    inline Index cols() const { return diagonal().size(); }
-
-    /** \returns the diagonal matrix product of \c *this by the matrix \a matrix.
-      */
-    template<typename MatrixDerived>
-    const DiagonalProduct<MatrixDerived, Derived, OnTheLeft>
-    operator*(const MatrixBase<MatrixDerived> &matrix) const
-    {
-      return DiagonalProduct<MatrixDerived, Derived, OnTheLeft>(matrix.derived(), derived());
-    }
+  enum {
+    RowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
+    ColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
+    MaxRowsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
+    MaxColsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
+    IsVectorAtCompileTime = 0,
+    Flags = NoPreferredStorageOrderBit
+  };
 
-    inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const DiagonalVectorType> >
-    inverse() const
-    {
-      return diagonal().cwiseInverse();
-    }
-    
-    inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> >
-    operator*(const Scalar& scalar) const
-    {
-      return diagonal() * scalar;
-    }
-    friend inline const DiagonalWrapper<const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DiagonalVectorType> >
-    operator*(const Scalar& scalar, const DiagonalBase& other)
-    {
-      return other.diagonal() * scalar;
-    }
-    
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived>
-    bool isApprox(const DiagonalBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return diagonal().isApprox(other.diagonal(), precision);
-    }
-    template<typename OtherDerived>
-    bool isApprox(const MatrixBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return toDenseMatrix().isApprox(other, precision);
-    }
-    #endif
-};
+  typedef Matrix<Scalar, RowsAtCompileTime, ColsAtCompileTime, 0, MaxRowsAtCompileTime, MaxColsAtCompileTime>
+      DenseMatrixType;
+  typedef DenseMatrixType DenseType;
+  typedef DiagonalMatrix<Scalar, DiagonalVectorType::SizeAtCompileTime, DiagonalVectorType::MaxSizeAtCompileTime>
+      PlainObject;
+
+  /** \returns a reference to the derived object. */
+  EIGEN_DEVICE_FUNC inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+  /** \returns a const reference to the derived object. */
+  EIGEN_DEVICE_FUNC inline Derived& derived() { return *static_cast<Derived*>(this); }
+
+  /**
+   * Constructs a dense matrix from \c *this. Note, this directly returns a dense matrix type,
+   * not an expression.
+   * \returns A dense matrix, with its diagonal entries set from the the derived object. */
+  EIGEN_DEVICE_FUNC DenseMatrixType toDenseMatrix() const { return derived(); }
+
+  /** \returns a reference to the derived object's vector of diagonal coefficients. */
+  EIGEN_DEVICE_FUNC inline const DiagonalVectorType& diagonal() const { return derived().diagonal(); }
+  /** \returns a const reference to the derived object's vector of diagonal coefficients. */
+  EIGEN_DEVICE_FUNC inline DiagonalVectorType& diagonal() { return derived().diagonal(); }
+
+  /** \returns the value of the coefficient as if \c *this was a dense matrix. */
+  EIGEN_DEVICE_FUNC inline Scalar coeff(Index row, Index col) const {
+    eigen_assert(row >= 0 && col >= 0 && row < rows() && col <= cols());
+    return row == col ? diagonal().coeff(row) : Scalar(0);
+  }
 
-template<typename Derived>
-template<typename DenseDerived>
-void DiagonalBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
-{
-  other.setZero();
-  other.diagonal() = diagonal();
-}
-#endif
+  /** \returns the number of rows. */
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return diagonal().size(); }
+  /** \returns the number of columns. */
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return diagonal().size(); }
+
+  /** \returns the diagonal matrix product of \c *this by the dense matrix, \a matrix */
+  template <typename MatrixDerived>
+  EIGEN_DEVICE_FUNC const Product<Derived, MatrixDerived, LazyProduct> operator*(
+      const MatrixBase<MatrixDerived>& matrix) const {
+    return Product<Derived, MatrixDerived, LazyProduct>(derived(), matrix.derived());
+  }
+
+  template <typename OtherDerived>
+  using DiagonalProductReturnType = DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(
+      DiagonalVectorType, typename OtherDerived::DiagonalVectorType, product)>;
+
+  /** \returns the diagonal matrix product of \c *this by the diagonal matrix \a other */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC const DiagonalProductReturnType<OtherDerived> operator*(
+      const DiagonalBase<OtherDerived>& other) const {
+    return diagonal().cwiseProduct(other.diagonal()).asDiagonal();
+  }
+
+  using DiagonalInverseReturnType =
+      DiagonalWrapper<const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const DiagonalVectorType>>;
+
+  /** \returns the inverse \c *this. Computed as the coefficient-wise inverse of the diagonal. */
+  EIGEN_DEVICE_FUNC inline const DiagonalInverseReturnType inverse() const {
+    return diagonal().cwiseInverse().asDiagonal();
+  }
+
+  using DiagonalScaleReturnType =
+      DiagonalWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DiagonalVectorType, Scalar, product)>;
+
+  /** \returns the product of \c *this by the scalar \a scalar */
+  EIGEN_DEVICE_FUNC inline const DiagonalScaleReturnType operator*(const Scalar& scalar) const {
+    return (diagonal() * scalar).asDiagonal();
+  }
+
+  using ScaleDiagonalReturnType =
+      DiagonalWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar, DiagonalVectorType, product)>;
+
+  /** \returns the product of a scalar and the diagonal matrix \a other */
+  EIGEN_DEVICE_FUNC friend inline const ScaleDiagonalReturnType operator*(const Scalar& scalar,
+                                                                          const DiagonalBase& other) {
+    return (scalar * other.diagonal()).asDiagonal();
+  }
+
+  template <typename OtherDerived>
+  using DiagonalSumReturnType = DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(
+      DiagonalVectorType, typename OtherDerived::DiagonalVectorType, sum)>;
+
+  /** \returns the sum of \c *this and the diagonal matrix \a other */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC inline const DiagonalSumReturnType<OtherDerived> operator+(
+      const DiagonalBase<OtherDerived>& other) const {
+    return (diagonal() + other.diagonal()).asDiagonal();
+  }
+
+  template <typename OtherDerived>
+  using DiagonalDifferenceReturnType = DiagonalWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(
+      DiagonalVectorType, typename OtherDerived::DiagonalVectorType, difference)>;
+
+  /** \returns the difference of \c *this and the diagonal matrix \a other */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC inline const DiagonalDifferenceReturnType<OtherDerived> operator-(
+      const DiagonalBase<OtherDerived>& other) const {
+    return (diagonal() - other.diagonal()).asDiagonal();
+  }
+};
 
 /** \class DiagonalMatrix
-  * \ingroup Core_Module
-  *
-  * \brief Represents a diagonal matrix with its storage
-  *
-  * \param _Scalar the type of coefficients
-  * \param SizeAtCompileTime the dimension of the matrix, or Dynamic
-  * \param MaxSizeAtCompileTime the dimension of the matrix, or Dynamic. This parameter is optional and defaults
-  *        to SizeAtCompileTime. Most of the time, you do not need to specify it.
-  *
-  * \sa class DiagonalWrapper
-  */
+ * \ingroup Core_Module
+ *
+ * \brief Represents a diagonal matrix with its storage
+ *
+ * \tparam Scalar_ the type of coefficients
+ * \tparam SizeAtCompileTime the dimension of the matrix, or Dynamic
+ * \tparam MaxSizeAtCompileTime the dimension of the matrix, or Dynamic. This parameter is optional and defaults
+ *        to SizeAtCompileTime. Most of the time, you do not need to specify it.
+ *
+ * \sa class DiagonalBase, class DiagonalWrapper
+ */
 
 namespace internal {
-template<typename _Scalar, int SizeAtCompileTime, int MaxSizeAtCompileTime>
-struct traits<DiagonalMatrix<_Scalar,SizeAtCompileTime,MaxSizeAtCompileTime> >
- : traits<Matrix<_Scalar,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
-{
-  typedef Matrix<_Scalar,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1> DiagonalVectorType;
-  typedef Dense StorageKind;
-  typedef DenseIndex Index;
-  enum {
-    Flags = LvalueBit
-  };
+template <typename Scalar_, int SizeAtCompileTime, int MaxSizeAtCompileTime>
+struct traits<DiagonalMatrix<Scalar_, SizeAtCompileTime, MaxSizeAtCompileTime>>
+    : traits<Matrix<Scalar_, SizeAtCompileTime, SizeAtCompileTime, 0, MaxSizeAtCompileTime, MaxSizeAtCompileTime>> {
+  typedef Matrix<Scalar_, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> DiagonalVectorType;
+  typedef DiagonalShape StorageKind;
+  enum { Flags = LvalueBit | NoPreferredStorageOrderBit | NestByRefBit };
 };
-}
-template<typename _Scalar, int SizeAtCompileTime, int MaxSizeAtCompileTime>
-class DiagonalMatrix
-  : public DiagonalBase<DiagonalMatrix<_Scalar,SizeAtCompileTime,MaxSizeAtCompileTime> >
-{
-  public:
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    typedef typename internal::traits<DiagonalMatrix>::DiagonalVectorType DiagonalVectorType;
-    typedef const DiagonalMatrix& Nested;
-    typedef _Scalar Scalar;
-    typedef typename internal::traits<DiagonalMatrix>::StorageKind StorageKind;
-    typedef typename internal::traits<DiagonalMatrix>::Index Index;
-    #endif
-
-  protected:
-
-    DiagonalVectorType m_diagonal;
-
-  public:
-
-    /** const version of diagonal(). */
-    inline const DiagonalVectorType& diagonal() const { return m_diagonal; }
-    /** \returns a reference to the stored vector of diagonal coefficients. */
-    inline DiagonalVectorType& diagonal() { return m_diagonal; }
-
-    /** Default constructor without initialization */
-    inline DiagonalMatrix() {}
-
-    /** Constructs a diagonal matrix with given dimension  */
-    inline DiagonalMatrix(Index dim) : m_diagonal(dim) {}
-
-    /** 2D constructor. */
-    inline DiagonalMatrix(const Scalar& x, const Scalar& y) : m_diagonal(x,y) {}
-
-    /** 3D constructor. */
-    inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x,y,z) {}
-
-    /** Copy constructor. */
-    template<typename OtherDerived>
-    inline DiagonalMatrix(const DiagonalBase<OtherDerived>& other) : m_diagonal(other.diagonal()) {}
-
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** copy constructor. prevent a default copy constructor from hiding the other templated constructor */
-    inline DiagonalMatrix(const DiagonalMatrix& other) : m_diagonal(other.diagonal()) {}
-    #endif
-
-    /** generic constructor from expression of the diagonal coefficients */
-    template<typename OtherDerived>
-    explicit inline DiagonalMatrix(const MatrixBase<OtherDerived>& other) : m_diagonal(other)
-    {}
-
-    /** Copy operator. */
-    template<typename OtherDerived>
-    DiagonalMatrix& operator=(const DiagonalBase<OtherDerived>& other)
-    {
-      m_diagonal = other.diagonal();
-      return *this;
-    }
+}  // namespace internal
+template <typename Scalar_, int SizeAtCompileTime, int MaxSizeAtCompileTime>
+class DiagonalMatrix : public DiagonalBase<DiagonalMatrix<Scalar_, SizeAtCompileTime, MaxSizeAtCompileTime>> {
+ public:
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  typedef typename internal::traits<DiagonalMatrix>::DiagonalVectorType DiagonalVectorType;
+  typedef const DiagonalMatrix& Nested;
+  typedef Scalar_ Scalar;
+  typedef typename internal::traits<DiagonalMatrix>::StorageKind StorageKind;
+  typedef typename internal::traits<DiagonalMatrix>::StorageIndex StorageIndex;
+#endif
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    DiagonalMatrix& operator=(const DiagonalMatrix& other)
-    {
-      m_diagonal = other.diagonal();
-      return *this;
-    }
-    #endif
-
-    /** Resizes to given size. */
-    inline void resize(Index size) { m_diagonal.resize(size); }
-    /** Sets all coefficients to zero. */
-    inline void setZero() { m_diagonal.setZero(); }
-    /** Resizes and sets all coefficients to zero. */
-    inline void setZero(Index size) { m_diagonal.setZero(size); }
-    /** Sets this matrix to be the identity matrix of the current size. */
-    inline void setIdentity() { m_diagonal.setOnes(); }
-    /** Sets this matrix to be the identity matrix of the given size. */
-    inline void setIdentity(Index size) { m_diagonal.setOnes(size); }
+ protected:
+  DiagonalVectorType m_diagonal;
+
+ public:
+  /** const version of diagonal(). */
+  EIGEN_DEVICE_FUNC inline const DiagonalVectorType& diagonal() const { return m_diagonal; }
+  /** \returns a reference to the stored vector of diagonal coefficients. */
+  EIGEN_DEVICE_FUNC inline DiagonalVectorType& diagonal() { return m_diagonal; }
+
+  /** Default constructor without initialization */
+  EIGEN_DEVICE_FUNC inline DiagonalMatrix() {}
+
+  /** Constructs a diagonal matrix with given dimension  */
+  EIGEN_DEVICE_FUNC explicit inline DiagonalMatrix(Index dim) : m_diagonal(dim) {}
+
+  /** 2D constructor. */
+  EIGEN_DEVICE_FUNC inline DiagonalMatrix(const Scalar& x, const Scalar& y) : m_diagonal(x, y) {}
+
+  /** 3D constructor. */
+  EIGEN_DEVICE_FUNC inline DiagonalMatrix(const Scalar& x, const Scalar& y, const Scalar& z) : m_diagonal(x, y, z) {}
+
+  /** \brief Construct a diagonal matrix with fixed size from an arbitrary number of coefficients.
+   *
+   * \warning To construct a diagonal matrix of fixed size, the number of values passed to this
+   * constructor must match the fixed dimension of \c *this.
+   *
+   * \sa DiagonalMatrix(const Scalar&, const Scalar&)
+   * \sa DiagonalMatrix(const Scalar&, const Scalar&, const Scalar&)
+   */
+  template <typename... ArgTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DiagonalMatrix(const Scalar& a0, const Scalar& a1, const Scalar& a2,
+                                                       const ArgTypes&... args)
+      : m_diagonal(a0, a1, a2, args...) {}
+
+  /** \brief Constructs a DiagonalMatrix and initializes it by elements given by an initializer list of initializer
+   * lists \cpp11
+   */
+  EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE DiagonalMatrix(
+      const std::initializer_list<std::initializer_list<Scalar>>& list)
+      : m_diagonal(list) {}
+
+  /** \brief Constructs a DiagonalMatrix from an r-value diagonal vector type */
+  EIGEN_DEVICE_FUNC explicit inline DiagonalMatrix(DiagonalVectorType&& diag) : m_diagonal(std::move(diag)) {}
+
+  /** Copy constructor. */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC inline DiagonalMatrix(const DiagonalBase<OtherDerived>& other) : m_diagonal(other.diagonal()) {}
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  /** copy constructor. prevent a default copy constructor from hiding the other templated constructor */
+  inline DiagonalMatrix(const DiagonalMatrix& other) : m_diagonal(other.diagonal()) {}
+#endif
+
+  /** generic constructor from expression of the diagonal coefficients */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC explicit inline DiagonalMatrix(const MatrixBase<OtherDerived>& other) : m_diagonal(other) {}
+
+  /** Copy operator. */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC DiagonalMatrix& operator=(const DiagonalBase<OtherDerived>& other) {
+    m_diagonal = other.diagonal();
+    return *this;
+  }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  /** This is a special case of the templated operator=. Its purpose is to
+   * prevent a default operator= from hiding the templated operator=.
+   */
+  EIGEN_DEVICE_FUNC DiagonalMatrix& operator=(const DiagonalMatrix& other) {
+    m_diagonal = other.diagonal();
+    return *this;
+  }
+#endif
+
+  typedef DiagonalWrapper<const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, DiagonalVectorType>>
+      InitializeReturnType;
+
+  typedef DiagonalWrapper<const CwiseNullaryOp<internal::scalar_zero_op<Scalar>, DiagonalVectorType>>
+      ZeroInitializeReturnType;
+
+  /** Initializes a diagonal matrix of size SizeAtCompileTime with coefficients set to zero */
+  EIGEN_DEVICE_FUNC static const ZeroInitializeReturnType Zero() { return DiagonalVectorType::Zero().asDiagonal(); }
+  /** Initializes a diagonal matrix of size dim with coefficients set to zero */
+  EIGEN_DEVICE_FUNC static const ZeroInitializeReturnType Zero(Index size) {
+    return DiagonalVectorType::Zero(size).asDiagonal();
+  }
+  /** Initializes a identity matrix of size SizeAtCompileTime */
+  EIGEN_DEVICE_FUNC static const InitializeReturnType Identity() { return DiagonalVectorType::Ones().asDiagonal(); }
+  /** Initializes a identity matrix of size dim */
+  EIGEN_DEVICE_FUNC static const InitializeReturnType Identity(Index size) {
+    return DiagonalVectorType::Ones(size).asDiagonal();
+  }
+
+  /** Resizes to given size. */
+  EIGEN_DEVICE_FUNC inline void resize(Index size) { m_diagonal.resize(size); }
+  /** Sets all coefficients to zero. */
+  EIGEN_DEVICE_FUNC inline void setZero() { m_diagonal.setZero(); }
+  /** Resizes and sets all coefficients to zero. */
+  EIGEN_DEVICE_FUNC inline void setZero(Index size) { m_diagonal.setZero(size); }
+  /** Sets this matrix to be the identity matrix of the current size. */
+  EIGEN_DEVICE_FUNC inline void setIdentity() { m_diagonal.setOnes(); }
+  /** Sets this matrix to be the identity matrix of the given size. */
+  EIGEN_DEVICE_FUNC inline void setIdentity(Index size) { m_diagonal.setOnes(size); }
 };
 
 /** \class DiagonalWrapper
-  * \ingroup Core_Module
-  *
-  * \brief Expression of a diagonal matrix
-  *
-  * \param _DiagonalVectorType the type of the vector of diagonal coefficients
-  *
-  * This class is an expression of a diagonal matrix, but not storing its own vector of diagonal coefficients,
-  * instead wrapping an existing vector expression. It is the return type of MatrixBase::asDiagonal()
-  * and most of the time this is the only way that it is used.
-  *
-  * \sa class DiagonalMatrix, class DiagonalBase, MatrixBase::asDiagonal()
-  */
+ * \ingroup Core_Module
+ *
+ * \brief Expression of a diagonal matrix
+ *
+ * \tparam DiagonalVectorType_ the type of the vector of diagonal coefficients
+ *
+ * This class is an expression of a diagonal matrix, but not storing its own vector of diagonal coefficients,
+ * instead wrapping an existing vector expression. It is the return type of MatrixBase::asDiagonal()
+ * and most of the time this is the only way that it is used.
+ *
+ * \sa class DiagonalMatrix, class DiagonalBase, MatrixBase::asDiagonal()
+ */
 
 namespace internal {
-template<typename _DiagonalVectorType>
-struct traits<DiagonalWrapper<_DiagonalVectorType> >
-{
-  typedef _DiagonalVectorType DiagonalVectorType;
+template <typename DiagonalVectorType_>
+struct traits<DiagonalWrapper<DiagonalVectorType_>> {
+  typedef DiagonalVectorType_ DiagonalVectorType;
   typedef typename DiagonalVectorType::Scalar Scalar;
-  typedef typename DiagonalVectorType::Index Index;
-  typedef typename DiagonalVectorType::StorageKind StorageKind;
+  typedef typename DiagonalVectorType::StorageIndex StorageIndex;
+  typedef DiagonalShape StorageKind;
+  typedef typename traits<DiagonalVectorType>::XprKind XprKind;
   enum {
     RowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
     ColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
-    MaxRowsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
-    MaxColsAtCompileTime = DiagonalVectorType::SizeAtCompileTime,
-    Flags =  traits<DiagonalVectorType>::Flags & LvalueBit
+    MaxRowsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
+    MaxColsAtCompileTime = DiagonalVectorType::MaxSizeAtCompileTime,
+    Flags = (traits<DiagonalVectorType>::Flags & LvalueBit) | NoPreferredStorageOrderBit
   };
 };
-}
+}  // namespace internal
 
-template<typename _DiagonalVectorType>
-class DiagonalWrapper
-  : public DiagonalBase<DiagonalWrapper<_DiagonalVectorType> >, internal::no_assignment_operator
-{
-  public:
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    typedef _DiagonalVectorType DiagonalVectorType;
-    typedef DiagonalWrapper Nested;
-    #endif
+template <typename DiagonalVectorType_>
+class DiagonalWrapper : public DiagonalBase<DiagonalWrapper<DiagonalVectorType_>>, internal::no_assignment_operator {
+ public:
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  typedef DiagonalVectorType_ DiagonalVectorType;
+  typedef DiagonalWrapper Nested;
+#endif
 
-    /** Constructor from expression of diagonal coefficients to wrap. */
-    inline DiagonalWrapper(DiagonalVectorType& a_diagonal) : m_diagonal(a_diagonal) {}
+  /** Constructor from expression of diagonal coefficients to wrap. */
+  EIGEN_DEVICE_FUNC explicit inline DiagonalWrapper(DiagonalVectorType& a_diagonal) : m_diagonal(a_diagonal) {}
 
-    /** \returns a const reference to the wrapped expression of diagonal coefficients. */
-    const DiagonalVectorType& diagonal() const { return m_diagonal; }
+  /** \returns a const reference to the wrapped expression of diagonal coefficients. */
+  EIGEN_DEVICE_FUNC const DiagonalVectorType& diagonal() const { return m_diagonal; }
 
-  protected:
-    typename DiagonalVectorType::Nested m_diagonal;
+ protected:
+  typename DiagonalVectorType::Nested m_diagonal;
 };
 
 /** \returns a pseudo-expression of a diagonal matrix with *this as vector of diagonal coefficients
-  *
-  * \only_for_vectors
-  *
-  * Example: \include MatrixBase_asDiagonal.cpp
-  * Output: \verbinclude MatrixBase_asDiagonal.out
-  *
-  * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal()
-  **/
-template<typename Derived>
-inline const DiagonalWrapper<const Derived>
-MatrixBase<Derived>::asDiagonal() const
-{
-  return derived();
+ *
+ * \only_for_vectors
+ *
+ * Example: \include MatrixBase_asDiagonal.cpp
+ * Output: \verbinclude MatrixBase_asDiagonal.out
+ *
+ * \sa class DiagonalWrapper, class DiagonalMatrix, diagonal(), isDiagonal()
+ **/
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline const DiagonalWrapper<const Derived> MatrixBase<Derived>::asDiagonal() const {
+  return DiagonalWrapper<const Derived>(derived());
 }
 
 /** \returns true if *this is approximately equal to a diagonal matrix,
-  *          within the precision given by \a prec.
-  *
-  * Example: \include MatrixBase_isDiagonal.cpp
-  * Output: \verbinclude MatrixBase_isDiagonal.out
-  *
-  * \sa asDiagonal()
-  */
-template<typename Derived>
-bool MatrixBase<Derived>::isDiagonal(const RealScalar& prec) const
-{
-  using std::abs;
-  if(cols() != rows()) return false;
+ *          within the precision given by \a prec.
+ *
+ * Example: \include MatrixBase_isDiagonal.cpp
+ * Output: \verbinclude MatrixBase_isDiagonal.out
+ *
+ * \sa asDiagonal()
+ */
+template <typename Derived>
+bool MatrixBase<Derived>::isDiagonal(const RealScalar& prec) const {
+  if (cols() != rows()) return false;
   RealScalar maxAbsOnDiagonal = static_cast<RealScalar>(-1);
-  for(Index j = 0; j < cols(); ++j)
-  {
-    RealScalar absOnDiagonal = abs(coeff(j,j));
-    if(absOnDiagonal > maxAbsOnDiagonal) maxAbsOnDiagonal = absOnDiagonal;
+  for (Index j = 0; j < cols(); ++j) {
+    RealScalar absOnDiagonal = numext::abs(coeff(j, j));
+    if (absOnDiagonal > maxAbsOnDiagonal) maxAbsOnDiagonal = absOnDiagonal;
   }
-  for(Index j = 0; j < cols(); ++j)
-    for(Index i = 0; i < j; ++i)
-    {
-      if(!internal::isMuchSmallerThan(coeff(i, j), maxAbsOnDiagonal, prec)) return false;
-      if(!internal::isMuchSmallerThan(coeff(j, i), maxAbsOnDiagonal, prec)) return false;
+  for (Index j = 0; j < cols(); ++j)
+    for (Index i = 0; i < j; ++i) {
+      if (!internal::isMuchSmallerThan(coeff(i, j), maxAbsOnDiagonal, prec)) return false;
+      if (!internal::isMuchSmallerThan(coeff(j, i), maxAbsOnDiagonal, prec)) return false;
     }
   return true;
 }
 
-} // end namespace Eigen
+namespace internal {
+
+template <>
+struct storage_kind_to_shape<DiagonalShape> {
+  typedef DiagonalShape Shape;
+};
+
+struct Diagonal2Dense {};
+
+template <>
+struct AssignmentKind<DenseShape, DiagonalShape> {
+  typedef Diagonal2Dense Kind;
+};
+
+// Diagonal matrix to Dense assignment
+template <typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Dense> {
+  static EIGEN_DEVICE_FUNC void run(
+      DstXprType& dst, const SrcXprType& src,
+      const internal::assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>& /*func*/) {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if ((dst.rows() != dstRows) || (dst.cols() != dstCols)) dst.resize(dstRows, dstCols);
+
+    dst.setZero();
+    dst.diagonal() = src.diagonal();
+  }
+
+  static EIGEN_DEVICE_FUNC void run(
+      DstXprType& dst, const SrcXprType& src,
+      const internal::add_assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>& /*func*/) {
+    dst.diagonal() += src.diagonal();
+  }
+
+  static EIGEN_DEVICE_FUNC void run(
+      DstXprType& dst, const SrcXprType& src,
+      const internal::sub_assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>& /*func*/) {
+    dst.diagonal() -= src.diagonal();
+  }
+};
+
+}  // namespace internal
+
+}  // end namespace Eigen
 
-#endif // EIGEN_DIAGONALMATRIX_H
+#endif  // EIGEN_DIAGONALMATRIX_H
diff --git a/inst/include/Eigen/src/Core/DiagonalProduct.h b/inst/include/Eigen/src/Core/DiagonalProduct.h
index cc6b536e..bd0feeac 100644
--- a/inst/include/Eigen/src/Core/DiagonalProduct.h
+++ b/inst/include/Eigen/src/Core/DiagonalProduct.h
@@ -11,121 +11,20 @@
 #ifndef EIGEN_DIAGONALPRODUCT_H
 #define EIGEN_DIAGONALPRODUCT_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-namespace internal {
-template<typename MatrixType, typename DiagonalType, int ProductOrder>
-struct traits<DiagonalProduct<MatrixType, DiagonalType, ProductOrder> >
- : traits<MatrixType>
-{
-  typedef typename scalar_product_traits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
-  enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-
-    _StorageOrder = MatrixType::Flags & RowMajorBit ? RowMajor : ColMajor,
-    _ScalarAccessOnDiag =  !((int(_StorageOrder) == ColMajor && int(ProductOrder) == OnTheLeft)
-                          ||(int(_StorageOrder) == RowMajor && int(ProductOrder) == OnTheRight)),
-    _SameTypes = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
-    // FIXME currently we need same types, but in the future the next rule should be the one
-    //_Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && ((!_PacketOnDiag) || (_SameTypes && bool(int(DiagonalType::DiagonalVectorType::Flags)&PacketAccessBit))),
-    _Vectorizable = bool(int(MatrixType::Flags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagonalType::DiagonalVectorType::Flags)&PacketAccessBit))),
-    _LinearAccessMask = (RowsAtCompileTime==1 || ColsAtCompileTime==1) ? LinearAccessBit : 0,
-
-    Flags = ((HereditaryBits|_LinearAccessMask|AlignedBit) & (unsigned int)(MatrixType::Flags)) | (_Vectorizable ? PacketAccessBit : 0),//(int(MatrixType::Flags)&int(DiagonalType::DiagonalVectorType::Flags)&AlignedBit),
-    Cost0 = EIGEN_ADD_COST(NumTraits<Scalar>::MulCost, MatrixType::CoeffReadCost),
-    CoeffReadCost = EIGEN_ADD_COST(Cost0,DiagonalType::DiagonalVectorType::CoeffReadCost)
-  };
-};
-}
-
-template<typename MatrixType, typename DiagonalType, int ProductOrder>
-class DiagonalProduct : internal::no_assignment_operator,
-                        public MatrixBase<DiagonalProduct<MatrixType, DiagonalType, ProductOrder> >
-{
-  public:
-
-    typedef MatrixBase<DiagonalProduct> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(DiagonalProduct)
-
-    inline DiagonalProduct(const MatrixType& matrix, const DiagonalType& diagonal)
-      : m_matrix(matrix), m_diagonal(diagonal)
-    {
-      eigen_assert(diagonal.diagonal().size() == (ProductOrder == OnTheLeft ? matrix.rows() : matrix.cols()));
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return m_matrix.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_matrix.cols(); }
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
-    {
-      return m_diagonal.diagonal().coeff(ProductOrder == OnTheLeft ? row : col) * m_matrix.coeff(row, col);
-    }
-    
-    EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const
-    {
-      enum {
-        StorageOrder = int(MatrixType::Flags) & RowMajorBit ? RowMajor : ColMajor
-      };
-      return coeff(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index row, Index col) const
-    {
-      enum {
-        StorageOrder = Flags & RowMajorBit ? RowMajor : ColMajor
-      };
-      const Index indexInDiagonalVector = ProductOrder == OnTheLeft ? row : col;
-      return packet_impl<LoadMode>(row,col,indexInDiagonalVector,typename internal::conditional<
-        ((int(StorageOrder) == RowMajor && int(ProductOrder) == OnTheLeft)
-       ||(int(StorageOrder) == ColMajor && int(ProductOrder) == OnTheRight)), internal::true_type, internal::false_type>::type());
-    }
-    
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index idx) const
-    {
-      enum {
-        StorageOrder = int(MatrixType::Flags) & RowMajorBit ? RowMajor : ColMajor
-      };
-      return packet<LoadMode>(int(StorageOrder)==ColMajor?idx:0,int(StorageOrder)==ColMajor?0:idx);
-    }
-
-  protected:
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, internal::true_type) const
-    {
-      return internal::pmul(m_matrix.template packet<LoadMode>(row, col),
-                     internal::pset1<PacketScalar>(m_diagonal.diagonal().coeff(id)));
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet_impl(Index row, Index col, Index id, internal::false_type) const
-    {
-      enum {
-        InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
-        DiagonalVectorPacketLoadMode = (LoadMode == Aligned && (((InnerSize%16) == 0) || (int(DiagonalType::DiagonalVectorType::Flags)&AlignedBit)==AlignedBit) ? Aligned : Unaligned)
-      };
-      return internal::pmul(m_matrix.template packet<LoadMode>(row, col),
-                     m_diagonal.diagonal().template packet<DiagonalVectorPacketLoadMode>(id));
-    }
-
-    typename MatrixType::Nested m_matrix;
-    typename DiagonalType::Nested m_diagonal;
-};
+namespace Eigen {
 
 /** \returns the diagonal matrix product of \c *this by the diagonal matrix \a diagonal.
-  */
-template<typename Derived>
-template<typename DiagonalDerived>
-inline const DiagonalProduct<Derived, DiagonalDerived, OnTheRight>
-MatrixBase<Derived>::operator*(const DiagonalBase<DiagonalDerived> &a_diagonal) const
-{
-  return DiagonalProduct<Derived, DiagonalDerived, OnTheRight>(derived(), a_diagonal.derived());
+ */
+template <typename Derived>
+template <typename DiagonalDerived>
+EIGEN_DEVICE_FUNC inline const Product<Derived, DiagonalDerived, LazyProduct> MatrixBase<Derived>::operator*(
+    const DiagonalBase<DiagonalDerived> &a_diagonal) const {
+  return Product<Derived, DiagonalDerived, LazyProduct>(derived(), a_diagonal.derived());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_DIAGONALPRODUCT_H
+#endif  // EIGEN_DIAGONALPRODUCT_H
diff --git a/inst/include/Eigen/src/Core/Dot.h b/inst/include/Eigen/src/Core/Dot.h
index 9d7651f1..059527c8 100644
--- a/inst/include/Eigen/src/Core/Dot.h
+++ b/inst/include/Eigen/src/Core/Dot.h
@@ -10,254 +10,259 @@
 #ifndef EIGEN_DOT_H
 #define EIGEN_DOT_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
-// helper function for dot(). The problem is that if we put that in the body of dot(), then upon calling dot
-// with mismatched types, the compiler emits errors about failing to instantiate cwiseProduct BEFORE
-// looking at the static assertions. Thus this is a trick to get better compile errors.
-template<typename T, typename U,
-// the NeedToTranspose condition here is taken straight from Assign.h
-         bool NeedToTranspose = T::IsVectorAtCompileTime
-                && U::IsVectorAtCompileTime
-                && ((int(T::RowsAtCompileTime) == 1 && int(U::ColsAtCompileTime) == 1)
-                      |  // FIXME | instead of || to please GCC 4.4.0 stupid warning "suggest parentheses around &&".
-                         // revert to || as soon as not needed anymore.
-                    (int(T::ColsAtCompileTime) == 1 && int(U::RowsAtCompileTime) == 1))
->
-struct dot_nocheck
-{
-  typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
-  static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
-  {
-    return a.template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
+template <typename Derived, typename Scalar = typename traits<Derived>::Scalar>
+struct squared_norm_impl {
+  using Real = typename NumTraits<Scalar>::Real;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Real run(const Derived& a) {
+    Scalar result = a.unaryExpr(squared_norm_functor<Scalar>()).sum();
+    return numext::real(result) + numext::imag(result);
   }
 };
 
-template<typename T, typename U>
-struct dot_nocheck<T, U, true>
-{
-  typedef typename scalar_product_traits<typename traits<T>::Scalar,typename traits<U>::Scalar>::ReturnType ResScalar;
-  static inline ResScalar run(const MatrixBase<T>& a, const MatrixBase<U>& b)
-  {
-    return a.transpose().template binaryExpr<scalar_conj_product_op<typename traits<T>::Scalar,typename traits<U>::Scalar> >(b).sum();
-  }
+template <typename Derived>
+struct squared_norm_impl<Derived, bool> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Derived& a) { return a.any(); }
 };
 
-} // end namespace internal
-
-/** \returns the dot product of *this with other.
-  *
-  * \only_for_vectors
-  *
-  * \note If the scalar type is complex numbers, then this function returns the hermitian
-  * (sesquilinear) dot product, conjugate-linear in the first variable and linear in the
-  * second variable.
-  *
-  * \sa squaredNorm(), norm()
-  */
-template<typename Derived>
-template<typename OtherDerived>
-typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
-MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
-  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
-  typedef internal::scalar_conj_product_op<Scalar,typename OtherDerived::Scalar> func;
-  EIGEN_CHECK_BINARY_COMPATIBILIY(func,Scalar,typename OtherDerived::Scalar);
-
-  eigen_assert(size() == other.size());
-
-  return internal::dot_nocheck<Derived,OtherDerived>::run(*this, other);
-}
-
-#ifdef EIGEN2_SUPPORT
-/** \returns the dot product of *this with other, with the Eigen2 convention that the dot product is linear in the first variable
-  * (conjugating the second variable). Of course this only makes a difference in the complex case.
-  *
-  * This method is only available in EIGEN2_SUPPORT mode.
-  *
-  * \only_for_vectors
-  *
-  * \sa dot()
-  */
-template<typename Derived>
-template<typename OtherDerived>
-typename internal::traits<Derived>::Scalar
-MatrixBase<Derived>::eigen2_dot(const MatrixBase<OtherDerived>& other) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
-  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
-  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
-    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-
-  eigen_assert(size() == other.size());
+}  // end namespace internal
 
-  return internal::dot_nocheck<OtherDerived,Derived>::run(other,*this);
+/** \fn MatrixBase::dot
+ * \returns the dot product of *this with other.
+ *
+ * \only_for_vectors
+ *
+ * \note If the scalar type is complex numbers, then this function returns the hermitian
+ * (sesquilinear) dot product, conjugate-linear in the first variable and linear in the
+ * second variable.
+ *
+ * \sa squaredNorm(), norm()
+ */
+template <typename Derived>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,
+                                  typename internal::traits<OtherDerived>::Scalar>::ReturnType
+    MatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const {
+  return internal::dot_impl<Derived, OtherDerived>::run(derived(), other.derived());
 }
-#endif
-
 
 //---------- implementation of L2 norm and related functions ----------
 
-/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the Frobenius norm.
-  * In both cases, it consists in the sum of the square of all the matrix entries.
-  * For vectors, this is also equals to the dot product of \c *this with itself.
-  *
-  * \sa dot(), norm()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::squaredNorm() const
-{
-  return numext::real((*this).cwiseAbs2().sum());
+/** \returns, for vectors, the squared \em l2 norm of \c *this, and for matrices the squared Frobenius norm.
+ * In both cases, it consists in the sum of the square of all the matrix entries.
+ * For vectors, this is also equals to the dot product of \c *this with itself.
+ *
+ * \sa dot(), norm(), lpNorm()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+MatrixBase<Derived>::squaredNorm() const {
+  return internal::squared_norm_impl<Derived>::run(derived());
 }
 
 /** \returns, for vectors, the \em l2 norm of \c *this, and for matrices the Frobenius norm.
-  * In both cases, it consists in the square root of the sum of the square of all the matrix entries.
-  * For vectors, this is also equals to the square root of the dot product of \c *this with itself.
-  *
-  * \sa dot(), squaredNorm()
-  */
-template<typename Derived>
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::norm() const
-{
-  using std::sqrt;
-  return sqrt(squaredNorm());
+ * In both cases, it consists in the square root of the sum of the square of all the matrix entries.
+ * For vectors, this is also equals to the square root of the dot product of \c *this with itself.
+ *
+ * \sa lpNorm(), dot(), squaredNorm()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+MatrixBase<Derived>::norm() const {
+  return numext::sqrt(squaredNorm());
 }
 
-/** \returns an expression of the quotient of *this by its own norm.
-  *
-  * \only_for_vectors
-  *
-  * \sa norm(), normalize()
-  */
-template<typename Derived>
-inline const typename MatrixBase<Derived>::PlainObject
-MatrixBase<Derived>::normalized() const
-{
-  typedef typename internal::nested<Derived>::type Nested;
-  typedef typename internal::remove_reference<Nested>::type _Nested;
-  _Nested n(derived());
-  return n / n.norm();
+/** \returns an expression of the quotient of \c *this by its own norm.
+ *
+ * \warning If the input vector is too small (i.e., this->norm()==0),
+ *          then this function returns a copy of the input.
+ *
+ * \only_for_vectors
+ *
+ * \sa norm(), normalize()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject MatrixBase<Derived>::normalized()
+    const {
+  typedef typename internal::nested_eval<Derived, 2>::type Nested_;
+  Nested_ n(derived());
+  RealScalar z = n.squaredNorm();
+  // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
+  if (z > RealScalar(0))
+    return n / numext::sqrt(z);
+  else
+    return n;
 }
 
 /** Normalizes the vector, i.e. divides it by its own norm.
-  *
-  * \only_for_vectors
-  *
-  * \sa norm(), normalized()
-  */
-template<typename Derived>
-inline void MatrixBase<Derived>::normalize()
-{
-  *this /= norm();
+ *
+ * \only_for_vectors
+ *
+ * \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged.
+ *
+ * \sa norm(), normalized()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::normalize() {
+  RealScalar z = squaredNorm();
+  // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU
+  if (z > RealScalar(0)) derived() /= numext::sqrt(z);
+}
+
+/** \returns an expression of the quotient of \c *this by its own norm while avoiding underflow and overflow.
+ *
+ * \only_for_vectors
+ *
+ * This method is analogue to the normalized() method, but it reduces the risk of
+ * underflow and overflow when computing the norm.
+ *
+ * \warning If the input vector is too small (i.e., this->norm()==0),
+ *          then this function returns a copy of the input.
+ *
+ * \sa stableNorm(), stableNormalize(), normalized()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase<Derived>::PlainObject
+MatrixBase<Derived>::stableNormalized() const {
+  typedef typename internal::nested_eval<Derived, 3>::type Nested_;
+  Nested_ n(derived());
+  RealScalar w = n.cwiseAbs().maxCoeff();
+  RealScalar z = (n / w).squaredNorm();
+  if (z > RealScalar(0))
+    return n / (numext::sqrt(z) * w);
+  else
+    return n;
+}
+
+/** Normalizes the vector while avoid underflow and overflow
+ *
+ * \only_for_vectors
+ *
+ * This method is analogue to the normalize() method, but it reduces the risk of
+ * underflow and overflow when computing the norm.
+ *
+ * \warning If the input vector is too small (i.e., this->norm()==0), then \c *this is left unchanged.
+ *
+ * \sa stableNorm(), stableNormalized(), normalize()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void MatrixBase<Derived>::stableNormalize() {
+  RealScalar w = cwiseAbs().maxCoeff();
+  RealScalar z = (derived() / w).squaredNorm();
+  if (z > RealScalar(0)) derived() /= numext::sqrt(z) * w;
 }
 
 //---------- implementation of other norms ----------
 
 namespace internal {
 
-template<typename Derived, int p>
-struct lpNorm_selector
-{
+template <typename Derived, int p>
+struct lpNorm_selector {
   typedef typename NumTraits<typename traits<Derived>::Scalar>::Real RealScalar;
-  static inline RealScalar run(const MatrixBase<Derived>& m)
-  {
-    using std::pow;
-    return pow(m.cwiseAbs().array().pow(p).sum(), RealScalar(1)/p);
+  EIGEN_DEVICE_FUNC static inline RealScalar run(const MatrixBase<Derived>& m) {
+    EIGEN_USING_STD(pow)
+    return pow(m.cwiseAbs().array().pow(p).sum(), RealScalar(1) / p);
   }
 };
 
-template<typename Derived>
-struct lpNorm_selector<Derived, 1>
-{
-  static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
-  {
+template <typename Derived>
+struct lpNorm_selector<Derived, 1> {
+  EIGEN_DEVICE_FUNC static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(
+      const MatrixBase<Derived>& m) {
     return m.cwiseAbs().sum();
   }
 };
 
-template<typename Derived>
-struct lpNorm_selector<Derived, 2>
-{
-  static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
-  {
+template <typename Derived>
+struct lpNorm_selector<Derived, 2> {
+  EIGEN_DEVICE_FUNC static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(
+      const MatrixBase<Derived>& m) {
     return m.norm();
   }
 };
 
-template<typename Derived>
-struct lpNorm_selector<Derived, Infinity>
-{
-  static inline typename NumTraits<typename traits<Derived>::Scalar>::Real run(const MatrixBase<Derived>& m)
-  {
+template <typename Derived>
+struct lpNorm_selector<Derived, Infinity> {
+  typedef typename NumTraits<typename traits<Derived>::Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC static inline RealScalar run(const MatrixBase<Derived>& m) {
+    if (Derived::SizeAtCompileTime == 0 || (Derived::SizeAtCompileTime == Dynamic && m.size() == 0))
+      return RealScalar(0);
     return m.cwiseAbs().maxCoeff();
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
-/** \returns the \f$ \ell^p \f$ norm of *this, that is, returns the p-th root of the sum of the p-th powers of the absolute values
-  *          of the coefficients of *this. If \a p is the special value \a Eigen::Infinity, this function returns the \f$ \ell^\infty \f$
-  *          norm, that is the maximum of the absolute values of the coefficients of *this.
-  *
-  * \sa norm()
-  */
-template<typename Derived>
-template<int p>
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
-MatrixBase<Derived>::lpNorm() const
-{
+/** \returns the \b coefficient-wise \f$ \ell^p \f$ norm of \c *this, that is, returns the p-th root of the sum of the
+ * p-th powers of the absolute values of the coefficients of \c *this. If \a p is the special value \a Eigen::Infinity,
+ * this function returns the \f$ \ell^\infty \f$ norm, that is the maximum of the absolute values of the coefficients of
+ * \c *this.
+ *
+ * In all cases, if \c *this is empty, then the value 0 is returned.
+ *
+ * \note For matrices, this function does not compute the <a
+ * href="https://en.wikipedia.org/wiki/Operator_norm">operator-norm</a>. That is, if \c *this is a matrix, then its
+ * coefficients are interpreted as a 1D vector. Nonetheless, you can easily compute the 1-norm and \f$\infty\f$-norm
+ * matrix operator norms using \link TutorialReductionsVisitorsBroadcastingReductionsNorm partial reductions \endlink.
+ *
+ * \sa norm()
+ */
+template <typename Derived>
+template <int p>
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_DEVICE_FUNC inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
+#else
+EIGEN_DEVICE_FUNC MatrixBase<Derived>::RealScalar
+#endif
+MatrixBase<Derived>::lpNorm() const {
   return internal::lpNorm_selector<Derived, p>::run(*this);
 }
 
 //---------- implementation of isOrthogonal / isUnitary ----------
 
 /** \returns true if *this is approximately orthogonal to \a other,
-  *          within the precision given by \a prec.
-  *
-  * Example: \include MatrixBase_isOrthogonal.cpp
-  * Output: \verbinclude MatrixBase_isOrthogonal.out
-  */
-template<typename Derived>
-template<typename OtherDerived>
-bool MatrixBase<Derived>::isOrthogonal
-(const MatrixBase<OtherDerived>& other, const RealScalar& prec) const
-{
-  typename internal::nested<Derived,2>::type nested(derived());
-  typename internal::nested<OtherDerived,2>::type otherNested(other.derived());
+ *          within the precision given by \a prec.
+ *
+ * Example: \include MatrixBase_isOrthogonal.cpp
+ * Output: \verbinclude MatrixBase_isOrthogonal.out
+ */
+template <typename Derived>
+template <typename OtherDerived>
+bool MatrixBase<Derived>::isOrthogonal(const MatrixBase<OtherDerived>& other, const RealScalar& prec) const {
+  typename internal::nested_eval<Derived, 2>::type nested(derived());
+  typename internal::nested_eval<OtherDerived, 2>::type otherNested(other.derived());
   return numext::abs2(nested.dot(otherNested)) <= prec * prec * nested.squaredNorm() * otherNested.squaredNorm();
 }
 
 /** \returns true if *this is approximately an unitary matrix,
-  *          within the precision given by \a prec. In the case where the \a Scalar
-  *          type is real numbers, a unitary matrix is an orthogonal matrix, whence the name.
-  *
-  * \note This can be used to check whether a family of vectors forms an orthonormal basis.
-  *       Indeed, \c m.isUnitary() returns true if and only if the columns (equivalently, the rows) of m form an
-  *       orthonormal basis.
-  *
-  * Example: \include MatrixBase_isUnitary.cpp
-  * Output: \verbinclude MatrixBase_isUnitary.out
-  */
-template<typename Derived>
-bool MatrixBase<Derived>::isUnitary(const RealScalar& prec) const
-{
-  typename Derived::Nested nested(derived());
-  for(Index i = 0; i < cols(); ++i)
-  {
-    if(!internal::isApprox(nested.col(i).squaredNorm(), static_cast<RealScalar>(1), prec))
-      return false;
-    for(Index j = 0; j < i; ++j)
-      if(!internal::isMuchSmallerThan(nested.col(i).dot(nested.col(j)), static_cast<Scalar>(1), prec))
-        return false;
+ *          within the precision given by \a prec. In the case where the \a Scalar
+ *          type is real numbers, a unitary matrix is an orthogonal matrix, whence the name.
+ *
+ * \note This can be used to check whether a family of vectors forms an orthonormal basis.
+ *       Indeed, \c m.isUnitary() returns true if and only if the columns (equivalently, the rows) of m form an
+ *       orthonormal basis.
+ *
+ * Example: \include MatrixBase_isUnitary.cpp
+ * Output: \verbinclude MatrixBase_isUnitary.out
+ */
+template <typename Derived>
+bool MatrixBase<Derived>::isUnitary(const RealScalar& prec) const {
+  typename internal::nested_eval<Derived, 1>::type self(derived());
+  for (Index i = 0; i < cols(); ++i) {
+    if (!internal::isApprox(self.col(i).squaredNorm(), static_cast<RealScalar>(1), prec)) return false;
+    for (Index j = 0; j < i; ++j)
+      if (!internal::isMuchSmallerThan(self.col(i).dot(self.col(j)), static_cast<Scalar>(1), prec)) return false;
   }
   return true;
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_DOT_H
+#endif  // EIGEN_DOT_H
diff --git a/inst/include/Eigen/src/Core/EigenBase.h b/inst/include/Eigen/src/Core/EigenBase.h
index fadb4585..c9a6e88e 100644
--- a/inst/include/Eigen/src/Core/EigenBase.h
+++ b/inst/include/Eigen/src/Core/EigenBase.h
@@ -11,121 +11,139 @@
 #ifndef EIGEN_EIGENBASE_H
 #define EIGEN_EIGENBASE_H
 
-namespace Eigen {
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-/** Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
-  *
-  * In other words, an EigenBase object is an object that can be copied into a MatrixBase.
-  *
-  * Besides MatrixBase-derived classes, this also includes special matrix classes such as diagonal matrices, etc.
-  *
-  * Notice that this class is trivial, it is only used to disambiguate overloaded functions.
-  *
-  * \sa \ref TopicClassHierarchy
-  */
-template<typename Derived> struct EigenBase
-{
-//   typedef typename internal::plain_matrix_type<Derived>::type PlainObject;
+namespace Eigen {
 
+/** \class EigenBase
+ * \ingroup Core_Module
+ *
+ * Common base class for all classes T such that MatrixBase has an operator=(T) and a constructor MatrixBase(T).
+ *
+ * In other words, an EigenBase object is an object that can be copied into a MatrixBase.
+ *
+ * Besides MatrixBase-derived classes, this also includes special matrix classes such as diagonal matrices, etc.
+ *
+ * Notice that this class is trivial, it is only used to disambiguate overloaded functions.
+ *
+ * \sa \blank \ref TopicClassHierarchy
+ */
+template <typename Derived>
+struct EigenBase {
+  //   typedef typename internal::plain_matrix_type<Derived>::type PlainObject;
+
+  /** \brief The interface type of indices
+   * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
+   * \sa StorageIndex, \ref TopicPreprocessorDirectives.
+   * DEPRECATED: Since Eigen 3.3, its usage is deprecated. Use Eigen::Index instead.
+   * Deprecation is not marked with a doxygen comment because there are too many existing usages to add the deprecation
+   * attribute.
+   */
+  typedef Eigen::Index Index;
+
+  // FIXME is it needed?
   typedef typename internal::traits<Derived>::StorageKind StorageKind;
-  typedef typename internal::traits<Derived>::Index Index;
 
   /** \returns a reference to the derived object */
-  Derived& derived() { return *static_cast<Derived*>(this); }
+  EIGEN_DEVICE_FUNC constexpr Derived& derived() { return *static_cast<Derived*>(this); }
   /** \returns a const reference to the derived object */
-  const Derived& derived() const { return *static_cast<const Derived*>(this); }
+  EIGEN_DEVICE_FUNC constexpr const Derived& derived() const { return *static_cast<const Derived*>(this); }
 
-  inline Derived& const_cast_derived() const
-  { return *static_cast<Derived*>(const_cast<EigenBase*>(this)); }
-  inline const Derived& const_derived() const
-  { return *static_cast<const Derived*>(this); }
+  EIGEN_DEVICE_FUNC inline constexpr Derived& const_cast_derived() const {
+    return *static_cast<Derived*>(const_cast<EigenBase*>(this));
+  }
+  EIGEN_DEVICE_FUNC inline const Derived& const_derived() const { return *static_cast<const Derived*>(this); }
 
   /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
-  inline Index rows() const { return derived().rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return derived().rows(); }
   /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
-  inline Index cols() const { return derived().cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return derived().cols(); }
   /** \returns the number of coefficients, which is rows()*cols().
-    * \sa rows(), cols(), SizeAtCompileTime. */
-  inline Index size() const { return rows() * cols(); }
+   * \sa rows(), cols(), SizeAtCompileTime. */
+  EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return rows() * cols(); }
 
   /** \internal Don't use it, but do the equivalent: \code dst = *this; \endcode */
-  template<typename Dest> inline void evalTo(Dest& dst) const
-  { derived().evalTo(dst); }
+  template <typename Dest>
+  EIGEN_DEVICE_FUNC inline void evalTo(Dest& dst) const {
+    derived().evalTo(dst);
+  }
 
   /** \internal Don't use it, but do the equivalent: \code dst += *this; \endcode */
-  template<typename Dest> inline void addTo(Dest& dst) const
-  {
+  template <typename Dest>
+  EIGEN_DEVICE_FUNC inline void addTo(Dest& dst) const {
     // This is the default implementation,
     // derived class can reimplement it in a more optimized way.
-    typename Dest::PlainObject res(rows(),cols());
+    typename Dest::PlainObject res(rows(), cols());
     evalTo(res);
     dst += res;
   }
 
   /** \internal Don't use it, but do the equivalent: \code dst -= *this; \endcode */
-  template<typename Dest> inline void subTo(Dest& dst) const
-  {
+  template <typename Dest>
+  EIGEN_DEVICE_FUNC inline void subTo(Dest& dst) const {
     // This is the default implementation,
     // derived class can reimplement it in a more optimized way.
-    typename Dest::PlainObject res(rows(),cols());
+    typename Dest::PlainObject res(rows(), cols());
     evalTo(res);
     dst -= res;
   }
 
   /** \internal Don't use it, but do the equivalent: \code dst.applyOnTheRight(*this); \endcode */
-  template<typename Dest> inline void applyThisOnTheRight(Dest& dst) const
-  {
+  template <typename Dest>
+  EIGEN_DEVICE_FUNC inline void applyThisOnTheRight(Dest& dst) const {
     // This is the default implementation,
     // derived class can reimplement it in a more optimized way.
     dst = dst * this->derived();
   }
 
   /** \internal Don't use it, but do the equivalent: \code dst.applyOnTheLeft(*this); \endcode */
-  template<typename Dest> inline void applyThisOnTheLeft(Dest& dst) const
-  {
+  template <typename Dest>
+  EIGEN_DEVICE_FUNC inline void applyThisOnTheLeft(Dest& dst) const {
     // This is the default implementation,
     // derived class can reimplement it in a more optimized way.
     dst = this->derived() * dst;
   }
 
+  template <typename Device>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DeviceWrapper<Derived, Device> device(Device& device);
+  template <typename Device>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DeviceWrapper<const Derived, Device> device(Device& device) const;
 };
 
 /***************************************************************************
-* Implementation of matrix base methods
-***************************************************************************/
+ * Implementation of matrix base methods
+ ***************************************************************************/
 
 /** \brief Copies the generic expression \a other into *this.
-  *
-  * \details The expression must provide a (templated) evalTo(Derived& dst) const
-  * function which does the actual job. In practice, this allows any user to write
-  * its own special matrix without having to modify MatrixBase
-  *
-  * \returns a reference to *this.
-  */
-template<typename Derived>
-template<typename OtherDerived>
-Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived> &other)
-{
-  other.derived().evalTo(derived());
+ *
+ * \details The expression must provide a (templated) evalTo(Derived& dst) const
+ * function which does the actual job. In practice, this allows any user to write
+ * its own special matrix without having to modify MatrixBase
+ *
+ * \returns a reference to *this.
+ */
+template <typename Derived>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator=(const EigenBase<OtherDerived>& other) {
+  call_assignment(derived(), other.derived());
   return derived();
 }
 
-template<typename Derived>
-template<typename OtherDerived>
-Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived> &other)
-{
-  other.derived().addTo(derived());
+template <typename Derived>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator+=(const EigenBase<OtherDerived>& other) {
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar, typename OtherDerived::Scalar>());
   return derived();
 }
 
-template<typename Derived>
-template<typename OtherDerived>
-Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived> &other)
-{
-  other.derived().subTo(derived());
+template <typename Derived>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator-=(const EigenBase<OtherDerived>& other) {
+  call_assignment(derived(), other.derived(), internal::sub_assign_op<Scalar, typename OtherDerived::Scalar>());
   return derived();
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_EIGENBASE_H
+#endif  // EIGEN_EIGENBASE_H
diff --git a/inst/include/Eigen/src/Core/Fill.h b/inst/include/Eigen/src/Core/Fill.h
new file mode 100644
index 00000000..779ef26a
--- /dev/null
+++ b/inst/include/Eigen/src/Core/Fill.h
@@ -0,0 +1,138 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Charles Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FILL_H
+#define EIGEN_FILL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Xpr>
+struct eigen_fill_helper : std::false_type {};
+
+template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct eigen_fill_helper<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>> : std::true_type {};
+
+template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct eigen_fill_helper<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>> : std::true_type {};
+
+template <typename Xpr, int BlockRows, int BlockCols>
+struct eigen_fill_helper<Block<Xpr, BlockRows, BlockCols, /*InnerPanel*/ true>> : eigen_fill_helper<Xpr> {};
+
+template <typename Xpr, int BlockRows, int BlockCols>
+struct eigen_fill_helper<Block<Xpr, BlockRows, BlockCols, /*InnerPanel*/ false>>
+    : std::integral_constant<bool, eigen_fill_helper<Xpr>::value &&
+                                       (Xpr::IsRowMajor ? (BlockRows == 1) : (BlockCols == 1))> {};
+
+template <typename Xpr, int Options>
+struct eigen_fill_helper<Map<Xpr, Options, Stride<0, 0>>> : eigen_fill_helper<Xpr> {};
+
+template <typename Xpr, int Options, int OuterStride_>
+struct eigen_fill_helper<Map<Xpr, Options, Stride<OuterStride_, 0>>>
+    : std::integral_constant<bool, eigen_fill_helper<Xpr>::value &&
+                                       enum_eq_not_dynamic(OuterStride_, Xpr::InnerSizeAtCompileTime)> {};
+
+template <typename Xpr, int Options, int OuterStride_>
+struct eigen_fill_helper<Map<Xpr, Options, Stride<OuterStride_, 1>>>
+    : eigen_fill_helper<Map<Xpr, Options, Stride<OuterStride_, 0>>> {};
+
+template <typename Xpr, int Options, int InnerStride_>
+struct eigen_fill_helper<Map<Xpr, Options, InnerStride<InnerStride_>>>
+    : eigen_fill_helper<Map<Xpr, Options, Stride<0, InnerStride_>>> {};
+
+template <typename Xpr, int Options, int OuterStride_>
+struct eigen_fill_helper<Map<Xpr, Options, OuterStride<OuterStride_>>>
+    : eigen_fill_helper<Map<Xpr, Options, Stride<OuterStride_, 0>>> {};
+
+template <typename Xpr>
+struct eigen_fill_impl<Xpr, /*use_fill*/ false> {
+  using Scalar = typename Xpr::Scalar;
+  using Func = scalar_constant_op<Scalar>;
+  using PlainObject = typename Xpr::PlainObject;
+  using Constant = typename PlainObject::ConstantReturnType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Xpr& dst, const Scalar& val) {
+    const Constant src(dst.rows(), dst.cols(), val);
+    run(dst, src);
+  }
+  template <typename SrcXpr>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Xpr& dst, const SrcXpr& src) {
+    call_dense_assignment_loop(dst, src, assign_op<Scalar, Scalar>());
+  }
+};
+
+#if EIGEN_COMP_MSVC || defined(EIGEN_GPU_COMPILE_PHASE)
+template <typename Xpr>
+struct eigen_fill_impl<Xpr, /*use_fill*/ true> : eigen_fill_impl<Xpr, /*use_fill*/ false> {};
+#else
+template <typename Xpr>
+struct eigen_fill_impl<Xpr, /*use_fill*/ true> {
+  using Scalar = typename Xpr::Scalar;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst, const Scalar& val) {
+    const Scalar val_copy = val;
+    using std::fill_n;
+    fill_n(dst.data(), dst.size(), val_copy);
+  }
+  template <typename SrcXpr>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst, const SrcXpr& src) {
+    resize_if_allowed(dst, src, assign_op<Scalar, Scalar>());
+    const Scalar& val = src.functor()();
+    run(dst, val);
+  }
+};
+#endif
+
+template <typename Xpr>
+struct eigen_memset_helper {
+  static constexpr bool value =
+      std::is_trivially_copyable<typename Xpr::Scalar>::value && eigen_fill_helper<Xpr>::value;
+};
+
+template <typename Xpr>
+struct eigen_zero_impl<Xpr, /*use_memset*/ false> {
+  using Scalar = typename Xpr::Scalar;
+  using PlainObject = typename Xpr::PlainObject;
+  using Zero = typename PlainObject::ZeroReturnType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Xpr& dst) {
+    const Zero src(dst.rows(), dst.cols());
+    run(dst, src);
+  }
+  template <typename SrcXpr>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void run(Xpr& dst, const SrcXpr& src) {
+    call_dense_assignment_loop(dst, src, assign_op<Scalar, Scalar>());
+  }
+};
+
+template <typename Xpr>
+struct eigen_zero_impl<Xpr, /*use_memset*/ true> {
+  using Scalar = typename Xpr::Scalar;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst) {
+    const std::ptrdiff_t num_bytes = dst.size() * static_cast<std::ptrdiff_t>(sizeof(Scalar));
+    if (num_bytes <= 0) return;
+    void* dst_ptr = static_cast<void*>(dst.data());
+#ifndef EIGEN_NO_DEBUG
+    eigen_assert((dst_ptr != nullptr) && "null pointer dereference error!");
+#endif
+    EIGEN_USING_STD(memset);
+    memset(dst_ptr, 0, static_cast<std::size_t>(num_bytes));
+  }
+  template <typename SrcXpr>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Xpr& dst, const SrcXpr& src) {
+    resize_if_allowed(dst, src, assign_op<Scalar, Scalar>());
+    run(dst);
+  }
+};
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_FILL_H
diff --git a/inst/include/Eigen/src/Core/FindCoeff.h b/inst/include/Eigen/src/Core/FindCoeff.h
new file mode 100644
index 00000000..0102e8af
--- /dev/null
+++ b/inst/include/Eigen/src/Core/FindCoeff.h
@@ -0,0 +1,464 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FIND_COEFF_H
+#define EIGEN_FIND_COEFF_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Scalar, int NaNPropagation, bool IsInteger = NumTraits<Scalar>::IsInteger>
+struct max_coeff_functor {
+  EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
+    return candidate > incumbent;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
+    return pcmp_lt(incumbent, candidate);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
+    return predux_max(a);
+  }
+};
+
+template <typename Scalar>
+struct max_coeff_functor<Scalar, PropagateNaN, false> {
+  EIGEN_DEVICE_FUNC inline Scalar compareCoeff(const Scalar& incumbent, const Scalar& candidate) {
+    return (candidate > incumbent) || ((candidate != candidate) && (incumbent == incumbent));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) {
+    return pandnot(pcmp_lt_or_nan(incumbent, candidate), pisnan(incumbent));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
+    return predux_max<PropagateNaN>(a);
+  }
+};
+
+template <typename Scalar>
+struct max_coeff_functor<Scalar, PropagateNumbers, false> {
+  EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
+    return (candidate > incumbent) || ((candidate == candidate) && (incumbent != incumbent));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
+    return pandnot(pcmp_lt_or_nan(incumbent, candidate), pisnan(candidate));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
+    return predux_max<PropagateNumbers>(a);
+  }
+};
+
+template <typename Scalar, int NaNPropagation, bool IsInteger = NumTraits<Scalar>::IsInteger>
+struct min_coeff_functor {
+  EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
+    return candidate < incumbent;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
+    return pcmp_lt(candidate, incumbent);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
+    return predux_min(a);
+  }
+};
+
+template <typename Scalar>
+struct min_coeff_functor<Scalar, PropagateNaN, false> {
+  EIGEN_DEVICE_FUNC inline Scalar compareCoeff(const Scalar& incumbent, const Scalar& candidate) {
+    return (candidate < incumbent) || ((candidate != candidate) && (incumbent == incumbent));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) {
+    return pandnot(pcmp_lt_or_nan(candidate, incumbent), pisnan(incumbent));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
+    return predux_min<PropagateNaN>(a);
+  }
+};
+
+template <typename Scalar>
+struct min_coeff_functor<Scalar, PropagateNumbers, false> {
+  EIGEN_DEVICE_FUNC inline bool compareCoeff(const Scalar& incumbent, const Scalar& candidate) const {
+    return (candidate < incumbent) || ((candidate == candidate) && (incumbent != incumbent));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet comparePacket(const Packet& incumbent, const Packet& candidate) const {
+    return pandnot(pcmp_lt_or_nan(candidate, incumbent), pisnan(candidate));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Scalar predux(const Packet& a) const {
+    return predux_min<PropagateNumbers>(a);
+  }
+};
+
+template <typename Scalar>
+struct min_max_traits {
+  static constexpr bool PacketAccess = packet_traits<Scalar>::Vectorizable;
+};
+template <typename Scalar, int NaNPropagation>
+struct functor_traits<max_coeff_functor<Scalar, NaNPropagation>> : min_max_traits<Scalar> {};
+template <typename Scalar, int NaNPropagation>
+struct functor_traits<min_coeff_functor<Scalar, NaNPropagation>> : min_max_traits<Scalar> {};
+
+template <typename Evaluator, typename Func, bool Linear, bool Vectorize>
+struct find_coeff_loop;
+template <typename Evaluator, typename Func>
+struct find_coeff_loop<Evaluator, Func, /*Linear*/ false, /*Vectorize*/ false> {
+  using Scalar = typename Evaluator::Scalar;
+  static EIGEN_DEVICE_FUNC inline void run(const Evaluator& eval, Func& func, Scalar& res, Index& outer, Index& inner) {
+    Index outerSize = eval.outerSize();
+    Index innerSize = eval.innerSize();
+
+    /* initialization performed in calling function */
+    /* result = eval.coeff(0, 0); */
+    /* outer = 0; */
+    /* inner = 0; */
+
+    for (Index j = 0; j < outerSize; j++) {
+      for (Index i = 0; i < innerSize; i++) {
+        Scalar xprCoeff = eval.coeffByOuterInner(j, i);
+        bool newRes = func.compareCoeff(res, xprCoeff);
+        if (newRes) {
+          outer = j;
+          inner = i;
+          res = xprCoeff;
+        }
+      }
+    }
+  }
+};
+template <typename Evaluator, typename Func>
+struct find_coeff_loop<Evaluator, Func, /*Linear*/ true, /*Vectorize*/ false> {
+  using Scalar = typename Evaluator::Scalar;
+  static EIGEN_DEVICE_FUNC inline void run(const Evaluator& eval, Func& func, Scalar& res, Index& index) {
+    Index size = eval.size();
+
+    /* initialization performed in calling function */
+    /* result = eval.coeff(0); */
+    /* index = 0; */
+
+    for (Index k = 0; k < size; k++) {
+      Scalar xprCoeff = eval.coeff(k);
+      bool newRes = func.compareCoeff(res, xprCoeff);
+      if (newRes) {
+        index = k;
+        res = xprCoeff;
+      }
+    }
+  }
+};
+template <typename Evaluator, typename Func>
+struct find_coeff_loop<Evaluator, Func, /*Linear*/ false, /*Vectorize*/ true> {
+  using ScalarImpl = find_coeff_loop<Evaluator, Func, false, false>;
+  using Scalar = typename Evaluator::Scalar;
+  using Packet = typename Evaluator::Packet;
+  static constexpr int PacketSize = unpacket_traits<Packet>::size;
+  static EIGEN_DEVICE_FUNC inline void run(const Evaluator& eval, Func& func, Scalar& result, Index& outer,
+                                           Index& inner) {
+    Index outerSize = eval.outerSize();
+    Index innerSize = eval.innerSize();
+    Index packetEnd = numext::round_down(innerSize, PacketSize);
+
+    /* initialization performed in calling function */
+    /* result = eval.coeff(0, 0); */
+    /* outer = 0; */
+    /* inner = 0; */
+
+    bool checkPacket = false;
+
+    for (Index j = 0; j < outerSize; j++) {
+      Packet resultPacket = pset1<Packet>(result);
+      for (Index i = 0; i < packetEnd; i += PacketSize) {
+        Packet xprPacket = eval.template packetByOuterInner<Unaligned, Packet>(j, i);
+        if (predux_any(func.comparePacket(resultPacket, xprPacket))) {
+          outer = j;
+          inner = i;
+          result = func.predux(xprPacket);
+          resultPacket = pset1<Packet>(result);
+          checkPacket = true;
+        }
+      }
+
+      for (Index i = packetEnd; i < innerSize; i++) {
+        Scalar xprCoeff = eval.coeffByOuterInner(j, i);
+        if (func.compareCoeff(result, xprCoeff)) {
+          outer = j;
+          inner = i;
+          result = xprCoeff;
+          checkPacket = false;
+        }
+      }
+    }
+
+    if (checkPacket) {
+      result = eval.coeffByOuterInner(outer, inner);
+      Index i_end = inner + PacketSize;
+      for (Index i = inner; i < i_end; i++) {
+        Scalar xprCoeff = eval.coeffByOuterInner(outer, i);
+        if (func.compareCoeff(result, xprCoeff)) {
+          inner = i;
+          result = xprCoeff;
+        }
+      }
+    }
+  }
+};
+template <typename Evaluator, typename Func>
+struct find_coeff_loop<Evaluator, Func, /*Linear*/ true, /*Vectorize*/ true> {
+  using ScalarImpl = find_coeff_loop<Evaluator, Func, true, false>;
+  using Scalar = typename Evaluator::Scalar;
+  using Packet = typename Evaluator::Packet;
+  static constexpr int PacketSize = unpacket_traits<Packet>::size;
+  static constexpr int Alignment = Evaluator::Alignment;
+
+  static EIGEN_DEVICE_FUNC inline void run(const Evaluator& eval, Func& func, Scalar& result, Index& index) {
+    Index size = eval.size();
+    Index packetEnd = numext::round_down(size, PacketSize);
+
+    /* initialization performed in calling function */
+    /* result = eval.coeff(0); */
+    /* index = 0; */
+
+    Packet resultPacket = pset1<Packet>(result);
+    bool checkPacket = false;
+
+    for (Index k = 0; k < packetEnd; k += PacketSize) {
+      Packet xprPacket = eval.template packet<Alignment, Packet>(k);
+      if (predux_any(func.comparePacket(resultPacket, xprPacket))) {
+        index = k;
+        result = func.predux(xprPacket);
+        resultPacket = pset1<Packet>(result);
+        checkPacket = true;
+      }
+    }
+
+    for (Index k = packetEnd; k < size; k++) {
+      Scalar xprCoeff = eval.coeff(k);
+      if (func.compareCoeff(result, xprCoeff)) {
+        index = k;
+        result = xprCoeff;
+        checkPacket = false;
+      }
+    }
+
+    if (checkPacket) {
+      result = eval.coeff(index);
+      Index k_end = index + PacketSize;
+      for (Index k = index; k < k_end; k++) {
+        Scalar xprCoeff = eval.coeff(k);
+        if (func.compareCoeff(result, xprCoeff)) {
+          index = k;
+          result = xprCoeff;
+        }
+      }
+    }
+  }
+};
+
+template <typename Derived>
+struct find_coeff_evaluator : public evaluator<Derived> {
+  using Base = evaluator<Derived>;
+  using Scalar = typename Derived::Scalar;
+  using Packet = typename packet_traits<Scalar>::type;
+  static constexpr int Flags = Base::Flags;
+  static constexpr bool IsRowMajor = bool(Flags & RowMajorBit);
+  EIGEN_DEVICE_FUNC inline find_coeff_evaluator(const Derived& xpr) : Base(xpr), m_xpr(xpr) {}
+
+  EIGEN_DEVICE_FUNC inline Scalar coeffByOuterInner(Index outer, Index inner) const {
+    Index row = IsRowMajor ? outer : inner;
+    Index col = IsRowMajor ? inner : outer;
+    return Base::coeff(row, col);
+  }
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC inline PacketType packetByOuterInner(Index outer, Index inner) const {
+    Index row = IsRowMajor ? outer : inner;
+    Index col = IsRowMajor ? inner : outer;
+    return Base::template packet<LoadMode, PacketType>(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC inline Index innerSize() const { return m_xpr.innerSize(); }
+  EIGEN_DEVICE_FUNC inline Index outerSize() const { return m_xpr.outerSize(); }
+  EIGEN_DEVICE_FUNC inline Index size() const { return m_xpr.size(); }
+
+  const Derived& m_xpr;
+};
+
+template <typename Derived, typename Func>
+struct find_coeff_impl {
+  using Evaluator = find_coeff_evaluator<Derived>;
+  static constexpr int Flags = Evaluator::Flags;
+  static constexpr int Alignment = Evaluator::Alignment;
+  static constexpr bool IsRowMajor = Derived::IsRowMajor;
+  static constexpr int MaxInnerSizeAtCompileTime =
+      IsRowMajor ? Derived::MaxColsAtCompileTime : Derived::MaxRowsAtCompileTime;
+  static constexpr int MaxSizeAtCompileTime = Derived::MaxSizeAtCompileTime;
+
+  using Scalar = typename Derived::Scalar;
+  using Packet = typename Evaluator::Packet;
+
+  static constexpr int PacketSize = unpacket_traits<Packet>::size;
+  static constexpr bool Linearize = bool(Flags & LinearAccessBit);
+  static constexpr bool DontVectorize =
+      enum_lt_not_dynamic(Linearize ? MaxSizeAtCompileTime : MaxInnerSizeAtCompileTime, PacketSize);
+  static constexpr bool Vectorize =
+      !DontVectorize && bool(Flags & PacketAccessBit) && functor_traits<Func>::PacketAccess;
+
+  using Loop = find_coeff_loop<Evaluator, Func, Linearize, Vectorize>;
+
+  template <bool ForwardLinearAccess = Linearize, std::enable_if_t<!ForwardLinearAccess, bool> = true>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& xpr, Func& func, Scalar& res, Index& outer,
+                                                        Index& inner) {
+    Evaluator eval(xpr);
+    Loop::run(eval, func, res, outer, inner);
+  }
+  template <bool ForwardLinearAccess = Linearize, std::enable_if_t<ForwardLinearAccess, bool> = true>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& xpr, Func& func, Scalar& res, Index& outer,
+                                                        Index& inner) {
+    // where possible, use the linear loop and back-calculate the outer and inner indices
+    Index index = 0;
+    run(xpr, func, res, index);
+    outer = index / xpr.innerSize();
+    inner = index % xpr.innerSize();
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& xpr, Func& func, Scalar& res, Index& index) {
+    Evaluator eval(xpr);
+    Loop::run(eval, func, res, index);
+  }
+};
+
+template <typename Derived, typename IndexType, typename Func>
+EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar findCoeff(const DenseBase<Derived>& mat, Func& func,
+                                                                       IndexType* rowPtr, IndexType* colPtr) {
+  eigen_assert(mat.rows() > 0 && mat.cols() > 0 && "you are using an empty matrix");
+  using Scalar = typename DenseBase<Derived>::Scalar;
+  using FindCoeffImpl = internal::find_coeff_impl<Derived, Func>;
+  Index outer = 0;
+  Index inner = 0;
+  Scalar res = mat.coeff(0, 0);
+  FindCoeffImpl::run(mat.derived(), func, res, outer, inner);
+  *rowPtr = internal::convert_index<IndexType>(Derived::IsRowMajor ? outer : inner);
+  if (colPtr) *colPtr = internal::convert_index<IndexType>(Derived::IsRowMajor ? inner : outer);
+  return res;
+}
+
+template <typename Derived, typename IndexType, typename Func>
+EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar findCoeff(const DenseBase<Derived>& mat, Func& func,
+                                                                       IndexType* indexPtr) {
+  eigen_assert(mat.size() > 0 && "you are using an empty matrix");
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  using Scalar = typename DenseBase<Derived>::Scalar;
+  using FindCoeffImpl = internal::find_coeff_impl<Derived, Func>;
+  Index index = 0;
+  Scalar res = mat.coeff(0);
+  FindCoeffImpl::run(mat.derived(), func, res, index);
+  *indexPtr = internal::convert_index<IndexType>(index);
+  return res;
+}
+
+}  // namespace internal
+
+/** \fn DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
+ * \returns the minimum of all coefficients of *this and puts in *row and *col its location.
+ *
+ * If there are multiple coefficients with the same extreme value, the location of the first instance is returned.
+ *
+ * In case \c *this contains NaN, NaNPropagation determines the behavior:
+ *   NaNPropagation == PropagateFast : undefined
+ *   NaNPropagation == PropagateNaN : result is NaN
+ *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+ * \warning the matrix must be not empty, otherwise an assertion is triggered.
+ *
+ * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visit(), DenseBase::minCoeff()
+ */
+template <typename Derived>
+template <int NaNPropagation, typename IndexType>
+EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::minCoeff(IndexType* rowPtr,
+                                                                                          IndexType* colPtr) const {
+  using Func = internal::min_coeff_functor<Scalar, NaNPropagation>;
+  Func func;
+  return internal::findCoeff(derived(), func, rowPtr, colPtr);
+}
+
+/** \returns the minimum of all coefficients of *this and puts in *index its location.
+ *
+ * If there are multiple coefficients with the same extreme value, the location of the first instance is returned.
+ *
+ * In case \c *this contains NaN, NaNPropagation determines the behavior:
+ *   NaNPropagation == PropagateFast : undefined
+ *   NaNPropagation == PropagateNaN : result is NaN
+ *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+ * \warning the matrix must be not empty, otherwise an assertion is triggered.
+ *
+ * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visit(),
+ * DenseBase::minCoeff()
+ */
+template <typename Derived>
+template <int NaNPropagation, typename IndexType>
+EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::minCoeff(IndexType* indexPtr) const {
+  using Func = internal::min_coeff_functor<Scalar, NaNPropagation>;
+  Func func;
+  return internal::findCoeff(derived(), func, indexPtr);
+}
+
+/** \fn DenseBase<Derived>::maxCoeff(IndexType* rowId, IndexType* colId) const
+ * \returns the maximum of all coefficients of *this and puts in *row and *col its location.
+ *
+ * If there are multiple coefficients with the same extreme value, the location of the first instance is returned.
+ *
+ * In case \c *this contains NaN, NaNPropagation determines the behavior:
+ *   NaNPropagation == PropagateFast : undefined
+ *   NaNPropagation == PropagateNaN : result is NaN
+ *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+ * \warning the matrix must be not empty, otherwise an assertion is triggered.
+ *
+ * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visit(), DenseBase::maxCoeff()
+ */
+template <typename Derived>
+template <int NaNPropagation, typename IndexType>
+EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::maxCoeff(IndexType* rowPtr,
+                                                                                          IndexType* colPtr) const {
+  using Func = internal::max_coeff_functor<Scalar, NaNPropagation>;
+  Func func;
+  return internal::findCoeff(derived(), func, rowPtr, colPtr);
+}
+
+/** \returns the maximum of all coefficients of *this and puts in *index its location.
+ *
+ * If there are multiple coefficients with the same extreme value, the location of the first instance is returned.
+ *
+ * In case \c *this contains NaN, NaNPropagation determines the behavior:
+ *   NaNPropagation == PropagateFast : undefined
+ *   NaNPropagation == PropagateNaN : result is NaN
+ *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+ * \warning the matrix must be not empty, otherwise an assertion is triggered.
+ *
+ * \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(),
+ * DenseBase::maxCoeff()
+ */
+template <typename Derived>
+template <int NaNPropagation, typename IndexType>
+EIGEN_DEVICE_FUNC typename internal::traits<Derived>::Scalar DenseBase<Derived>::maxCoeff(IndexType* indexPtr) const {
+  using Func = internal::max_coeff_functor<Scalar, NaNPropagation>;
+  Func func;
+  return internal::findCoeff(derived(), func, indexPtr);
+}
+
+}  // namespace Eigen
+
+#endif  // EIGEN_FIND_COEFF_H
diff --git a/inst/include/Eigen/src/Core/Flagged.h b/inst/include/Eigen/src/Core/Flagged.h
deleted file mode 100644
index 1f2955fc..00000000
--- a/inst/include/Eigen/src/Core/Flagged.h
+++ /dev/null
@@ -1,140 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_FLAGGED_H
-#define EIGEN_FLAGGED_H
-
-namespace Eigen { 
-
-/** \class Flagged
-  * \ingroup Core_Module
-  *
-  * \brief Expression with modified flags
-  *
-  * \param ExpressionType the type of the object of which we are modifying the flags
-  * \param Added the flags added to the expression
-  * \param Removed the flags removed from the expression (has priority over Added).
-  *
-  * This class represents an expression whose flags have been modified.
-  * It is the return type of MatrixBase::flagged()
-  * and most of the time this is the only way it is used.
-  *
-  * \sa MatrixBase::flagged()
-  */
-
-namespace internal {
-template<typename ExpressionType, unsigned int Added, unsigned int Removed>
-struct traits<Flagged<ExpressionType, Added, Removed> > : traits<ExpressionType>
-{
-  enum { Flags = (ExpressionType::Flags | Added) & ~Removed };
-};
-}
-
-template<typename ExpressionType, unsigned int Added, unsigned int Removed> class Flagged
-  : public MatrixBase<Flagged<ExpressionType, Added, Removed> >
-{
-  public:
-
-    typedef MatrixBase<Flagged> Base;
-    
-    EIGEN_DENSE_PUBLIC_INTERFACE(Flagged)
-    typedef typename internal::conditional<internal::must_nest_by_value<ExpressionType>::ret,
-        ExpressionType, const ExpressionType&>::type ExpressionTypeNested;
-    typedef typename ExpressionType::InnerIterator InnerIterator;
-
-    inline Flagged(const ExpressionType& matrix) : m_matrix(matrix) {}
-
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-    inline Index outerStride() const { return m_matrix.outerStride(); }
-    inline Index innerStride() const { return m_matrix.innerStride(); }
-
-    inline CoeffReturnType coeff(Index row, Index col) const
-    {
-      return m_matrix.coeff(row, col);
-    }
-
-    inline CoeffReturnType coeff(Index index) const
-    {
-      return m_matrix.coeff(index);
-    }
-    
-    inline const Scalar& coeffRef(Index row, Index col) const
-    {
-      return m_matrix.const_cast_derived().coeffRef(row, col);
-    }
-
-    inline const Scalar& coeffRef(Index index) const
-    {
-      return m_matrix.const_cast_derived().coeffRef(index);
-    }
-
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      return m_matrix.const_cast_derived().coeffRef(row, col);
-    }
-
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_matrix.const_cast_derived().coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index row, Index col) const
-    {
-      return m_matrix.template packet<LoadMode>(row, col);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index row, Index col, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(row, col, x);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return m_matrix.template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(index, x);
-    }
-
-    const ExpressionType& _expression() const { return m_matrix; }
-
-    template<typename OtherDerived>
-    typename ExpressionType::PlainObject solveTriangular(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived>
-    void solveTriangularInPlace(const MatrixBase<OtherDerived>& other) const;
-
-  protected:
-    ExpressionTypeNested m_matrix;
-};
-
-/** \returns an expression of *this with added and removed flags
-  *
-  * This is mostly for internal use.
-  *
-  * \sa class Flagged
-  */
-template<typename Derived>
-template<unsigned int Added,unsigned int Removed>
-inline const Flagged<Derived, Added, Removed>
-DenseBase<Derived>::flagged() const
-{
-  return derived();
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_FLAGGED_H
diff --git a/inst/include/Eigen/src/Core/ForceAlignedAccess.h b/inst/include/Eigen/src/Core/ForceAlignedAccess.h
index 807c7a29..55beab35 100644
--- a/inst/include/Eigen/src/Core/ForceAlignedAccess.h
+++ b/inst/include/Eigen/src/Core/ForceAlignedAccess.h
@@ -10,137 +10,118 @@
 #ifndef EIGEN_FORCEALIGNEDACCESS_H
 #define EIGEN_FORCEALIGNEDACCESS_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
 /** \class ForceAlignedAccess
-  * \ingroup Core_Module
-  *
-  * \brief Enforce aligned packet loads and stores regardless of what is requested
-  *
-  * \param ExpressionType the type of the object of which we are forcing aligned packet access
-  *
-  * This class is the return type of MatrixBase::forceAlignedAccess()
-  * and most of the time this is the only way it is used.
-  *
-  * \sa MatrixBase::forceAlignedAccess()
-  */
+ * \ingroup Core_Module
+ *
+ * \brief Enforce aligned packet loads and stores regardless of what is requested
+ *
+ * \param ExpressionType the type of the object of which we are forcing aligned packet access
+ *
+ * This class is the return type of MatrixBase::forceAlignedAccess()
+ * and most of the time this is the only way it is used.
+ *
+ * \sa MatrixBase::forceAlignedAccess()
+ */
 
 namespace internal {
-template<typename ExpressionType>
-struct traits<ForceAlignedAccess<ExpressionType> > : public traits<ExpressionType>
-{};
-}
+template <typename ExpressionType>
+struct traits<ForceAlignedAccess<ExpressionType>> : public traits<ExpressionType> {};
+}  // namespace internal
+
+template <typename ExpressionType>
+class ForceAlignedAccess : public internal::dense_xpr_base<ForceAlignedAccess<ExpressionType>>::type {
+ public:
+  typedef typename internal::dense_xpr_base<ForceAlignedAccess>::type Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(ForceAlignedAccess)
+
+  EIGEN_DEVICE_FUNC explicit inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
+
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_expression.outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_expression.innerStride(); }
+
+  EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index row, Index col) const {
+    return m_expression.coeff(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col) {
+    return m_expression.const_cast_derived().coeffRef(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const { return m_expression.coeff(index); }
+
+  EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index) { return m_expression.const_cast_derived().coeffRef(index); }
+
+  template <int LoadMode>
+  inline const PacketScalar packet(Index row, Index col) const {
+    return m_expression.template packet<Aligned>(row, col);
+  }
+
+  template <int LoadMode>
+  inline void writePacket(Index row, Index col, const PacketScalar& x) {
+    m_expression.const_cast_derived().template writePacket<Aligned>(row, col, x);
+  }
+
+  template <int LoadMode>
+  inline const PacketScalar packet(Index index) const {
+    return m_expression.template packet<Aligned>(index);
+  }
+
+  template <int LoadMode>
+  inline void writePacket(Index index, const PacketScalar& x) {
+    m_expression.const_cast_derived().template writePacket<Aligned>(index, x);
+  }
+
+  EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
+
+ protected:
+  const ExpressionType& m_expression;
 
-template<typename ExpressionType> class ForceAlignedAccess
-  : public internal::dense_xpr_base< ForceAlignedAccess<ExpressionType> >::type
-{
-  public:
-
-    typedef typename internal::dense_xpr_base<ForceAlignedAccess>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(ForceAlignedAccess)
-
-    inline ForceAlignedAccess(const ExpressionType& matrix) : m_expression(matrix) {}
-
-    inline Index rows() const { return m_expression.rows(); }
-    inline Index cols() const { return m_expression.cols(); }
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    inline Index innerStride() const { return m_expression.innerStride(); }
-
-    inline const CoeffReturnType coeff(Index row, Index col) const
-    {
-      return m_expression.coeff(row, col);
-    }
-
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      return m_expression.const_cast_derived().coeffRef(row, col);
-    }
-
-    inline const CoeffReturnType coeff(Index index) const
-    {
-      return m_expression.coeff(index);
-    }
-
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index row, Index col) const
-    {
-      return m_expression.template packet<Aligned>(row, col);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index row, Index col, const PacketScalar& x)
-    {
-      m_expression.const_cast_derived().template writePacket<Aligned>(row, col, x);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return m_expression.template packet<Aligned>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
-    {
-      m_expression.const_cast_derived().template writePacket<Aligned>(index, x);
-    }
-
-    operator const ExpressionType&() const { return m_expression; }
-
-  protected:
-    const ExpressionType& m_expression;
-
-  private:
-    ForceAlignedAccess& operator=(const ForceAlignedAccess&);
+ private:
+  ForceAlignedAccess& operator=(const ForceAlignedAccess&);
 };
 
 /** \returns an expression of *this with forced aligned access
-  * \sa forceAlignedAccessIf(),class ForceAlignedAccess
-  */
-template<typename Derived>
-inline const ForceAlignedAccess<Derived>
-MatrixBase<Derived>::forceAlignedAccess() const
-{
+ * \sa forceAlignedAccessIf(),class ForceAlignedAccess
+ */
+template <typename Derived>
+inline const ForceAlignedAccess<Derived> MatrixBase<Derived>::forceAlignedAccess() const {
   return ForceAlignedAccess<Derived>(derived());
 }
 
 /** \returns an expression of *this with forced aligned access
-  * \sa forceAlignedAccessIf(), class ForceAlignedAccess
-  */
-template<typename Derived>
-inline ForceAlignedAccess<Derived>
-MatrixBase<Derived>::forceAlignedAccess()
-{
+ * \sa forceAlignedAccessIf(), class ForceAlignedAccess
+ */
+template <typename Derived>
+inline ForceAlignedAccess<Derived> MatrixBase<Derived>::forceAlignedAccess() {
   return ForceAlignedAccess<Derived>(derived());
 }
 
 /** \returns an expression of *this with forced aligned access if \a Enable is true.
-  * \sa forceAlignedAccess(), class ForceAlignedAccess
-  */
-template<typename Derived>
-template<bool Enable>
-inline typename internal::add_const_on_value_type<typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type>::type
-MatrixBase<Derived>::forceAlignedAccessIf() const
-{
-  return derived();
+ * \sa forceAlignedAccess(), class ForceAlignedAccess
+ */
+template <typename Derived>
+template <bool Enable>
+inline add_const_on_value_type_t<std::conditional_t<Enable, ForceAlignedAccess<Derived>, Derived&>>
+MatrixBase<Derived>::forceAlignedAccessIf() const {
+  return derived();  // FIXME This should not work but apparently is never used
 }
 
 /** \returns an expression of *this with forced aligned access if \a Enable is true.
-  * \sa forceAlignedAccess(), class ForceAlignedAccess
-  */
-template<typename Derived>
-template<bool Enable>
-inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type
-MatrixBase<Derived>::forceAlignedAccessIf()
-{
-  return derived();
+ * \sa forceAlignedAccess(), class ForceAlignedAccess
+ */
+template <typename Derived>
+template <bool Enable>
+inline std::conditional_t<Enable, ForceAlignedAccess<Derived>, Derived&> MatrixBase<Derived>::forceAlignedAccessIf() {
+  return derived();  // FIXME This should not work but apparently is never used
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_FORCEALIGNEDACCESS_H
+#endif  // EIGEN_FORCEALIGNEDACCESS_H
diff --git a/inst/include/Eigen/src/Core/Functors.h b/inst/include/Eigen/src/Core/Functors.h
deleted file mode 100644
index 5f14c658..00000000
--- a/inst/include/Eigen/src/Core/Functors.h
+++ /dev/null
@@ -1,1026 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_FUNCTORS_H
-#define EIGEN_FUNCTORS_H
-
-namespace Eigen {
-
-namespace internal {
-
-// associative functors:
-
-/** \internal
-  * \brief Template functor to compute the sum of two scalars
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::operator+, class VectorwiseOp, MatrixBase::sum()
-  */
-template<typename Scalar> struct scalar_sum_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sum_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a + b; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::padd(a,b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
-  { return internal::predux(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_sum_op<Scalar> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasAdd
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the product of two scalars
-  *
-  * \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux()
-  */
-template<typename LhsScalar,typename RhsScalar> struct scalar_product_op {
-  enum {
-    // TODO vectorize mixed product
-    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
-  };
-  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_product_op)
-  EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a * b; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pmul(a,b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const result_type predux(const Packet& a) const
-  { return internal::predux_mul(a); }
-};
-template<typename LhsScalar,typename RhsScalar>
-struct functor_traits<scalar_product_op<LhsScalar,RhsScalar> > {
-  enum {
-    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost)/2, // rough estimate!
-    PacketAccess = scalar_product_op<LhsScalar,RhsScalar>::Vectorizable
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the conjugate product of two scalars
-  *
-  * This is a short cut for conj(x) * y which is needed for optimization purpose; in Eigen2 support mode, this becomes x * conj(y)
-  */
-template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op {
-
-  enum {
-    Conj = NumTraits<LhsScalar>::IsComplex
-  };
-  
-  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
-  
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_conj_product_op)
-  EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const
-  { return conj_helper<LhsScalar,RhsScalar,Conj,false>().pmul(a,b); }
-  
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return conj_helper<Packet,Packet,Conj,false>().pmul(a,b); }
-};
-template<typename LhsScalar,typename RhsScalar>
-struct functor_traits<scalar_conj_product_op<LhsScalar,RhsScalar> > {
-  enum {
-    Cost = NumTraits<LhsScalar>::MulCost,
-    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMul
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the min of two scalars
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff()
-  */
-template<typename Scalar> struct scalar_min_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_min_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { using std::min; return (min)(a, b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pmin(a,b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
-  { return internal::predux_min(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_min_op<Scalar> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasMin
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the max of two scalars
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff()
-  */
-template<typename Scalar> struct scalar_max_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_max_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { using std::max; return (max)(a, b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pmax(a,b); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Scalar predux(const Packet& a) const
-  { return internal::predux_max(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_max_op<Scalar> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasMax
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the hypot of two scalars
-  *
-  * \sa MatrixBase::stableNorm(), class Redux
-  */
-template<typename Scalar> struct scalar_hypot_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op)
-//   typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const
-  {
-    using std::max;
-    using std::min;
-    using std::sqrt;
-    Scalar p = (max)(_x, _y);
-    Scalar q = (min)(_x, _y);
-    Scalar qp = q/p;
-    return p * sqrt(Scalar(1) + qp*qp);
-  }
-};
-template<typename Scalar>
-struct functor_traits<scalar_hypot_op<Scalar> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess=0 };
-};
-
-/** \internal
-  * \brief Template functor to compute the pow of two scalars
-  */
-template<typename Scalar, typename OtherScalar> struct scalar_binary_pow_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_binary_pow_op)
-  inline Scalar operator() (const Scalar& a, const OtherScalar& b) const { return numext::pow(a, b); }
-};
-template<typename Scalar, typename OtherScalar>
-struct functor_traits<scalar_binary_pow_op<Scalar,OtherScalar> > {
-  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
-};
-
-// other binary functors:
-
-/** \internal
-  * \brief Template functor to compute the difference of two scalars
-  *
-  * \sa class CwiseBinaryOp, MatrixBase::operator-
-  */
-template<typename Scalar> struct scalar_difference_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_difference_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& b) const { return a - b; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::psub(a,b); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_difference_op<Scalar> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasSub
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the quotient of two scalars
-  *
-  * \sa class CwiseBinaryOp, Cwise::operator/()
-  */
-template<typename LhsScalar,typename RhsScalar> struct scalar_quotient_op {
-  enum {
-    // TODO vectorize mixed product
-    Vectorizable = is_same<LhsScalar,RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv
-  };
-  typedef typename scalar_product_traits<LhsScalar,RhsScalar>::ReturnType result_type;
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_quotient_op)
-  EIGEN_STRONG_INLINE const result_type operator() (const LhsScalar& a, const RhsScalar& b) const { return a / b; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const
-  { return internal::pdiv(a,b); }
-};
-template<typename LhsScalar,typename RhsScalar>
-struct functor_traits<scalar_quotient_op<LhsScalar,RhsScalar> > {
-  enum {
-    Cost = (NumTraits<LhsScalar>::MulCost + NumTraits<RhsScalar>::MulCost), // rough estimate!
-    PacketAccess = scalar_quotient_op<LhsScalar,RhsScalar>::Vectorizable
-  };
-};
-
-
-
-/** \internal
-  * \brief Template functor to compute the and of two booleans
-  *
-  * \sa class CwiseBinaryOp, ArrayBase::operator&&
-  */
-struct scalar_boolean_and_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_and_op)
-  EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a && b; }
-};
-template<> struct functor_traits<scalar_boolean_and_op> {
-  enum {
-    Cost = NumTraits<bool>::AddCost,
-    PacketAccess = false
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the or of two booleans
-  *
-  * \sa class CwiseBinaryOp, ArrayBase::operator||
-  */
-struct scalar_boolean_or_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_boolean_or_op)
-  EIGEN_STRONG_INLINE bool operator() (const bool& a, const bool& b) const { return a || b; }
-};
-template<> struct functor_traits<scalar_boolean_or_op> {
-  enum {
-    Cost = NumTraits<bool>::AddCost,
-    PacketAccess = false
-  };
-};
-
-/** \internal
-  * \brief Template functors for comparison of two scalars
-  * \todo Implement packet-comparisons
-  */
-template<typename Scalar, ComparisonName cmp> struct scalar_cmp_op;
-
-template<typename Scalar, ComparisonName cmp>
-struct functor_traits<scalar_cmp_op<Scalar, cmp> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = false
-  };
-};
-
-template<ComparisonName Cmp, typename Scalar>
-struct result_of<scalar_cmp_op<Scalar, Cmp>(Scalar,Scalar)> {
-  typedef bool type;
-};
-
-
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_EQ> {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a==b;}
-};
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_LT> {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a<b;}
-};
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_LE> {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a<=b;}
-};
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_UNORD> {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return !(a<=b || b<=a);}
-};
-template<typename Scalar> struct scalar_cmp_op<Scalar, cmp_NEQ> {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cmp_op)
-  EIGEN_STRONG_INLINE bool operator()(const Scalar& a, const Scalar& b) const {return a!=b;}
-};
-
-// unary functors:
-
-/** \internal
-  * \brief Template functor to compute the opposite of a scalar
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::operator-
-  */
-template<typename Scalar> struct scalar_opposite_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_opposite_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { return -a; }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pnegate(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_opposite_op<Scalar> >
-{ enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasNegate };
-};
-
-/** \internal
-  * \brief Template functor to compute the absolute value of a scalar
-  *
-  * \sa class CwiseUnaryOp, Cwise::abs
-  */
-template<typename Scalar> struct scalar_abs_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { using std::abs; return abs(a); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pabs(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_abs_op<Scalar> >
-{
-  enum {
-    Cost = NumTraits<Scalar>::AddCost,
-    PacketAccess = packet_traits<Scalar>::HasAbs
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the squared absolute value of a scalar
-  *
-  * \sa class CwiseUnaryOp, Cwise::abs2
-  */
-template<typename Scalar> struct scalar_abs2_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_abs2_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE const result_type operator() (const Scalar& a) const { return numext::abs2(a); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pmul(a,a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_abs2_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasAbs2 }; };
-
-/** \internal
-  * \brief Template functor to compute the conjugate of a complex value
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::conjugate()
-  */
-template<typename Scalar> struct scalar_conjugate_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_conjugate_op)
-  EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a) const { using numext::conj; return conj(a); }
-  template<typename Packet>
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const { return internal::pconj(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_conjugate_op<Scalar> >
-{
-  enum {
-    Cost = NumTraits<Scalar>::IsComplex ? NumTraits<Scalar>::AddCost : 0,
-    PacketAccess = packet_traits<Scalar>::HasConj
-  };
-};
-
-/** \internal
-  * \brief Template functor to cast a scalar to another type
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::cast()
-  */
-template<typename Scalar, typename NewType>
-struct scalar_cast_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cast_op)
-  typedef NewType result_type;
-  EIGEN_STRONG_INLINE const NewType operator() (const Scalar& a) const { return cast<Scalar, NewType>(a); }
-};
-template<typename Scalar, typename NewType>
-struct functor_traits<scalar_cast_op<Scalar,NewType> >
-{ enum { Cost = is_same<Scalar, NewType>::value ? 0 : NumTraits<NewType>::AddCost, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to extract the real part of a complex
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::real()
-  */
-template<typename Scalar>
-struct scalar_real_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::real(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_real_op<Scalar> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to extract the imaginary part of a complex
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::imag()
-  */
-template<typename Scalar>
-struct scalar_imag_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE result_type operator() (const Scalar& a) const { return numext::imag(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_imag_op<Scalar> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to extract the real part of a complex as a reference
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::real()
-  */
-template<typename Scalar>
-struct scalar_real_ref_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_real_ref_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::real_ref(*const_cast<Scalar*>(&a)); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_real_ref_op<Scalar> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to extract the imaginary part of a complex as a reference
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::imag()
-  */
-template<typename Scalar>
-struct scalar_imag_ref_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_imag_ref_op)
-  typedef typename NumTraits<Scalar>::Real result_type;
-  EIGEN_STRONG_INLINE result_type& operator() (const Scalar& a) const { return numext::imag_ref(*const_cast<Scalar*>(&a)); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_imag_ref_op<Scalar> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-/** \internal
-  *
-  * \brief Template functor to compute the exponential of a scalar
-  *
-  * \sa class CwiseUnaryOp, Cwise::exp()
-  */
-template<typename Scalar> struct scalar_exp_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_exp_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::exp; return exp(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::pexp(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_exp_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasExp }; };
-
-/** \internal
-  *
-  * \brief Template functor to compute the logarithm of a scalar
-  *
-  * \sa class CwiseUnaryOp, Cwise::log()
-  */
-template<typename Scalar> struct scalar_log_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_log_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::log; return log(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::plog(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_log_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog }; };
-
-/** \internal
-  * \brief Template functor to multiply a scalar by a fixed other one
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::operator*, MatrixBase::operator/
-  */
-/* NOTE why doing the pset1() in packetOp *is* an optimization ?
- * indeed it seems better to declare m_other as a Packet and do the pset1() once
- * in the constructor. However, in practice:
- *  - GCC does not like m_other as a Packet and generate a load every time it needs it
- *  - on the other hand GCC is able to moves the pset1() outside the loop :)
- *  - simpler code ;)
- * (ICC and gcc 4.4 seems to perform well in both cases, the issue is visible with y = a*x + b*y)
- */
-template<typename Scalar>
-struct scalar_multiple_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  // FIXME default copy constructors seems bugged with std::complex<>
-  EIGEN_STRONG_INLINE scalar_multiple_op(const scalar_multiple_op& other) : m_other(other.m_other) { }
-  EIGEN_STRONG_INLINE scalar_multiple_op(const Scalar& other) : m_other(other) { }
-  EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a * m_other; }
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pmul(a, pset1<Packet>(m_other)); }
-  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_multiple_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
-
-template<typename Scalar1, typename Scalar2>
-struct scalar_multiple2_op {
-  typedef typename scalar_product_traits<Scalar1,Scalar2>::ReturnType result_type;
-  EIGEN_STRONG_INLINE scalar_multiple2_op(const scalar_multiple2_op& other) : m_other(other.m_other) { }
-  EIGEN_STRONG_INLINE scalar_multiple2_op(const Scalar2& other) : m_other(other) { }
-  EIGEN_STRONG_INLINE result_type operator() (const Scalar1& a) const { return a * m_other; }
-  typename add_const_on_value_type<typename NumTraits<Scalar2>::Nested>::type m_other;
-};
-template<typename Scalar1,typename Scalar2>
-struct functor_traits<scalar_multiple2_op<Scalar1,Scalar2> >
-{ enum { Cost = NumTraits<Scalar1>::MulCost, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to divide a scalar by a fixed other one
-  *
-  * This functor is used to implement the quotient of a matrix by
-  * a scalar where the scalar type is not necessarily a floating point type.
-  *
-  * \sa class CwiseUnaryOp, MatrixBase::operator/
-  */
-template<typename Scalar>
-struct scalar_quotient1_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  // FIXME default copy constructors seems bugged with std::complex<>
-  EIGEN_STRONG_INLINE scalar_quotient1_op(const scalar_quotient1_op& other) : m_other(other.m_other) { }
-  EIGEN_STRONG_INLINE scalar_quotient1_op(const Scalar& other) : m_other(other) {}
-  EIGEN_STRONG_INLINE Scalar operator() (const Scalar& a) const { return a / m_other; }
-  EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const
-  { return internal::pdiv(a, pset1<Packet>(m_other)); }
-  typename add_const_on_value_type<typename NumTraits<Scalar>::Nested>::type m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_quotient1_op<Scalar> >
-{ enum { Cost = 2 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
-
-// nullary functors
-
-template<typename Scalar>
-struct scalar_constant_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  EIGEN_STRONG_INLINE scalar_constant_op(const scalar_constant_op& other) : m_other(other.m_other) { }
-  EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& other) : m_other(other) { }
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index, Index = 0) const { return m_other; }
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index, Index = 0) const { return internal::pset1<Packet>(m_other); }
-  const Scalar m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_constant_op<Scalar> >
-// FIXME replace this packet test by a safe one
-{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::Vectorizable, IsRepeatable = true }; };
-
-template<typename Scalar> struct scalar_identity_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_identity_op)
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const { return row==col ? Scalar(1) : Scalar(0); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_identity_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true }; };
-
-template <typename Scalar, bool RandomAccess> struct linspaced_op_impl;
-
-// linear access for packet ops:
-// 1) initialization
-//   base = [low, ..., low] + ([step, ..., step] * [-size, ..., 0])
-// 2) each step (where size is 1 for coeff access or PacketSize for packet access)
-//   base += [size*step, ..., size*step]
-//
-// TODO: Perhaps it's better to initialize lazily (so not in the constructor but in packetOp)
-//       in order to avoid the padd() in operator() ?
-template <typename Scalar>
-struct linspaced_op_impl<Scalar,false>
-{
-  typedef typename packet_traits<Scalar>::type Packet;
-
-  linspaced_op_impl(const Scalar& low, const Scalar& step) :
-  m_low(low), m_step(step),
-  m_packetStep(pset1<Packet>(packet_traits<Scalar>::size*step)),
-  m_base(padd(pset1<Packet>(low), pmul(pset1<Packet>(step),plset<Scalar>(-packet_traits<Scalar>::size)))) {}
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index i) const 
-  { 
-    m_base = padd(m_base, pset1<Packet>(m_step));
-    return m_low+Scalar(i)*m_step; 
-  }
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index) const { return m_base = padd(m_base,m_packetStep); }
-
-  const Scalar m_low;
-  const Scalar m_step;
-  const Packet m_packetStep;
-  mutable Packet m_base;
-};
-
-// random access for packet ops:
-// 1) each step
-//   [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
-template <typename Scalar>
-struct linspaced_op_impl<Scalar,true>
-{
-  typedef typename packet_traits<Scalar>::type Packet;
-
-  linspaced_op_impl(const Scalar& low, const Scalar& step) :
-  m_low(low), m_step(step),
-  m_lowPacket(pset1<Packet>(m_low)), m_stepPacket(pset1<Packet>(m_step)), m_interPacket(plset<Scalar>(0)) {}
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return m_low+i*m_step; }
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index i) const
-  { return internal::padd(m_lowPacket, pmul(m_stepPacket, padd(pset1<Packet>(Scalar(i)),m_interPacket))); }
-
-  const Scalar m_low;
-  const Scalar m_step;
-  const Packet m_lowPacket;
-  const Packet m_stepPacket;
-  const Packet m_interPacket;
-};
-
-// ----- Linspace functor ----------------------------------------------------------------
-
-// Forward declaration (we default to random access which does not really give
-// us a speed gain when using packet access but it allows to use the functor in
-// nested expressions).
-template <typename Scalar, bool RandomAccess = true> struct linspaced_op;
-template <typename Scalar, bool RandomAccess> struct functor_traits< linspaced_op<Scalar,RandomAccess> >
-{ enum { Cost = 1, PacketAccess = packet_traits<Scalar>::HasSetLinear, IsRepeatable = true }; };
-template <typename Scalar, bool RandomAccess> struct linspaced_op
-{
-  typedef typename packet_traits<Scalar>::type Packet;
-  linspaced_op(const Scalar& low, const Scalar& high, DenseIndex num_steps) : impl((num_steps==1 ? high : low), (num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1))) {}
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index i) const { return impl(i); }
-
-  // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since
-  // there row==0 and col is used for the actual iteration.
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Scalar operator() (Index row, Index col) const 
-  {
-    eigen_assert(col==0 || row==0);
-    return impl(col + row);
-  }
-
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index i) const { return impl.packetOp(i); }
-
-  // We need this function when assigning e.g. a RowVectorXd to a MatrixXd since
-  // there row==0 and col is used for the actual iteration.
-  template<typename Index>
-  EIGEN_STRONG_INLINE const Packet packetOp(Index row, Index col) const
-  {
-    eigen_assert(col==0 || row==0);
-    return impl.packetOp(col + row);
-  }
-
-  // This proxy object handles the actual required temporaries, the different
-  // implementations (random vs. sequential access) as well as the
-  // correct piping to size 2/4 packet operations.
-  const linspaced_op_impl<Scalar,RandomAccess> impl;
-};
-
-// all functors allow linear access, except scalar_identity_op. So we fix here a quick meta
-// to indicate whether a functor allows linear access, just always answering 'yes' except for
-// scalar_identity_op.
-// FIXME move this to functor_traits adding a functor_default
-template<typename Functor> struct functor_has_linear_access { enum { ret = 1 }; };
-template<typename Scalar> struct functor_has_linear_access<scalar_identity_op<Scalar> > { enum { ret = 0 }; };
-
-// In Eigen, any binary op (Product, CwiseBinaryOp) require the Lhs and Rhs to have the same scalar type, except for multiplication
-// where the mixing of different types is handled by scalar_product_traits
-// In particular, real * complex<real> is allowed.
-// FIXME move this to functor_traits adding a functor_default
-template<typename Functor> struct functor_is_product_like { enum { ret = 0 }; };
-template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
-template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_conj_product_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
-template<typename LhsScalar,typename RhsScalar> struct functor_is_product_like<scalar_quotient_op<LhsScalar,RhsScalar> > { enum { ret = 1 }; };
-
-
-/** \internal
-  * \brief Template functor to add a scalar to a fixed other one
-  * \sa class CwiseUnaryOp, Array::operator+
-  */
-/* If you wonder why doing the pset1() in packetOp() is an optimization check scalar_multiple_op */
-template<typename Scalar>
-struct scalar_add_op {
-  typedef typename packet_traits<Scalar>::type Packet;
-  // FIXME default copy constructors seems bugged with std::complex<>
-  inline scalar_add_op(const scalar_add_op& other) : m_other(other.m_other) { }
-  inline scalar_add_op(const Scalar& other) : m_other(other) { }
-  inline Scalar operator() (const Scalar& a) const { return a + m_other; }
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::padd(a, pset1<Packet>(m_other)); }
-  const Scalar m_other;
-};
-template<typename Scalar>
-struct functor_traits<scalar_add_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAdd }; };
-
-/** \internal
-  * \brief Template functor to compute the square root of a scalar
-  * \sa class CwiseUnaryOp, Cwise::sqrt()
-  */
-template<typename Scalar> struct scalar_sqrt_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sqrt_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::sqrt; return sqrt(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::psqrt(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_sqrt_op<Scalar> >
-{ enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasSqrt
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the cosine of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::cos()
-  */
-template<typename Scalar> struct scalar_cos_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cos_op)
-  inline Scalar operator() (const Scalar& a) const { using std::cos; return cos(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::pcos(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_cos_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasCos
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the sine of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::sin()
-  */
-template<typename Scalar> struct scalar_sin_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_sin_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::sin; return sin(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::psin(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_sin_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasSin
-  };
-};
-
-
-/** \internal
-  * \brief Template functor to compute the tan of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::tan()
-  */
-template<typename Scalar> struct scalar_tan_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_tan_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::tan; return tan(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::ptan(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_tan_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasTan
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the arc cosine of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::acos()
-  */
-template<typename Scalar> struct scalar_acos_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_acos_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::acos; return acos(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::pacos(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_acos_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasACos
-  };
-};
-
-/** \internal
-  * \brief Template functor to compute the arc sine of a scalar
-  * \sa class CwiseUnaryOp, ArrayBase::asin()
-  */
-template<typename Scalar> struct scalar_asin_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_asin_op)
-  inline const Scalar operator() (const Scalar& a) const { using std::asin; return asin(a); }
-  typedef typename packet_traits<Scalar>::type Packet;
-  inline Packet packetOp(const Packet& a) const { return internal::pasin(a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_asin_op<Scalar> >
-{
-  enum {
-    Cost = 5 * NumTraits<Scalar>::MulCost,
-    PacketAccess = packet_traits<Scalar>::HasASin
-  };
-};
-
-/** \internal
-  * \brief Template functor to raise a scalar to a power
-  * \sa class CwiseUnaryOp, Cwise::pow
-  */
-template<typename Scalar>
-struct scalar_pow_op {
-  // FIXME default copy constructors seems bugged with std::complex<>
-  inline scalar_pow_op(const scalar_pow_op& other) : m_exponent(other.m_exponent) { }
-  inline scalar_pow_op(const Scalar& exponent) : m_exponent(exponent) {}
-  inline Scalar operator() (const Scalar& a) const { return numext::pow(a, m_exponent); }
-  const Scalar m_exponent;
-};
-template<typename Scalar>
-struct functor_traits<scalar_pow_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false }; };
-
-/** \internal
-  * \brief Template functor to compute the quotient between a scalar and array entries.
-  * \sa class CwiseUnaryOp, Cwise::inverse()
-  */
-template<typename Scalar>
-struct scalar_inverse_mult_op {
-  scalar_inverse_mult_op(const Scalar& other) : m_other(other) {}
-  inline Scalar operator() (const Scalar& a) const { return m_other / a; }
-  template<typename Packet>
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::pdiv(pset1<Packet>(m_other),a); }
-  Scalar m_other;
-};
-
-/** \internal
-  * \brief Template functor to compute the inverse of a scalar
-  * \sa class CwiseUnaryOp, Cwise::inverse()
-  */
-template<typename Scalar>
-struct scalar_inverse_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_inverse_op)
-  inline Scalar operator() (const Scalar& a) const { return Scalar(1)/a; }
-  template<typename Packet>
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::pdiv(pset1<Packet>(Scalar(1)),a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_inverse_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasDiv }; };
-
-/** \internal
-  * \brief Template functor to compute the square of a scalar
-  * \sa class CwiseUnaryOp, Cwise::square()
-  */
-template<typename Scalar>
-struct scalar_square_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_square_op)
-  inline Scalar operator() (const Scalar& a) const { return a*a; }
-  template<typename Packet>
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::pmul(a,a); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_square_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
-
-/** \internal
-  * \brief Template functor to compute the cube of a scalar
-  * \sa class CwiseUnaryOp, Cwise::cube()
-  */
-template<typename Scalar>
-struct scalar_cube_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_cube_op)
-  inline Scalar operator() (const Scalar& a) const { return a*a*a; }
-  template<typename Packet>
-  inline const Packet packetOp(const Packet& a) const
-  { return internal::pmul(a,pmul(a,a)); }
-};
-template<typename Scalar>
-struct functor_traits<scalar_cube_op<Scalar> >
-{ enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
-
-// default functor traits for STL functors:
-
-template<typename T>
-struct functor_traits<std::multiplies<T> >
-{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::divides<T> >
-{ enum { Cost = NumTraits<T>::MulCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::plus<T> >
-{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::minus<T> >
-{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::negate<T> >
-{ enum { Cost = NumTraits<T>::AddCost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::logical_or<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::logical_and<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::logical_not<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::greater<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::less<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::greater_equal<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::less_equal<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::equal_to<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::not_equal_to<T> >
-{ enum { Cost = 1, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::binder2nd<T> >
-{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::binder1st<T> >
-{ enum { Cost = functor_traits<T>::Cost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::unary_negate<T> >
-{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
-
-template<typename T>
-struct functor_traits<std::binary_negate<T> >
-{ enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false }; };
-
-#ifdef EIGEN_STDEXT_SUPPORT
-
-template<typename T0,typename T1>
-struct functor_traits<std::project1st<T0,T1> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-template<typename T0,typename T1>
-struct functor_traits<std::project2nd<T0,T1> >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-template<typename T0,typename T1>
-struct functor_traits<std::select2nd<std::pair<T0,T1> > >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-template<typename T0,typename T1>
-struct functor_traits<std::select1st<std::pair<T0,T1> > >
-{ enum { Cost = 0, PacketAccess = false }; };
-
-template<typename T0,typename T1>
-struct functor_traits<std::unary_compose<T0,T1> >
-{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost, PacketAccess = false }; };
-
-template<typename T0,typename T1,typename T2>
-struct functor_traits<std::binary_compose<T0,T1,T2> >
-{ enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost + functor_traits<T2>::Cost, PacketAccess = false }; };
-
-#endif // EIGEN_STDEXT_SUPPORT
-
-// allow to add new functors and specializations of functor_traits from outside Eigen.
-// this macro is really needed because functor_traits must be specialized after it is declared but before it is used...
-#ifdef EIGEN_FUNCTORS_PLUGIN
-#include EIGEN_FUNCTORS_PLUGIN
-#endif
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_FUNCTORS_H
diff --git a/inst/include/Eigen/src/Core/Fuzzy.h b/inst/include/Eigen/src/Core/Fuzzy.h
index fe63bd29..ed6b4ffe 100644
--- a/inst/include/Eigen/src/Core/Fuzzy.h
+++ b/inst/include/Eigen/src/Core/Fuzzy.h
@@ -11,140 +11,122 @@
 #ifndef EIGEN_FUZZY_H
 #define EIGEN_FUZZY_H
 
-namespace Eigen { 
-
-namespace internal
-{
-
-template<typename Derived, typename OtherDerived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
-struct isApprox_selector
-{
-  static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec)
-  {
-    using std::min;
-    typename internal::nested<Derived,2>::type nested(x);
-    typename internal::nested<OtherDerived,2>::type otherNested(y);
-    return (nested - otherNested).cwiseAbs2().sum() <= prec * prec * (min)(nested.cwiseAbs2().sum(), otherNested.cwiseAbs2().sum());
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Derived, typename OtherDerived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
+struct isApprox_selector {
+  EIGEN_DEVICE_FUNC static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec) {
+    typename internal::nested_eval<Derived, 2>::type nested(x);
+    typename internal::nested_eval<OtherDerived, 2>::type otherNested(y);
+    return (nested.matrix() - otherNested.matrix()).cwiseAbs2().sum() <=
+           prec * prec * numext::mini(nested.cwiseAbs2().sum(), otherNested.cwiseAbs2().sum());
   }
 };
 
-template<typename Derived, typename OtherDerived>
-struct isApprox_selector<Derived, OtherDerived, true>
-{
-  static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar&)
-  {
+template <typename Derived, typename OtherDerived>
+struct isApprox_selector<Derived, OtherDerived, true> {
+  EIGEN_DEVICE_FUNC static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar&) {
     return x.matrix() == y.matrix();
   }
 };
 
-template<typename Derived, typename OtherDerived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
-struct isMuchSmallerThan_object_selector
-{
-  static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec)
-  {
+template <typename Derived, typename OtherDerived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
+struct isMuchSmallerThan_object_selector {
+  EIGEN_DEVICE_FUNC static bool run(const Derived& x, const OtherDerived& y, const typename Derived::RealScalar& prec) {
     return x.cwiseAbs2().sum() <= numext::abs2(prec) * y.cwiseAbs2().sum();
   }
 };
 
-template<typename Derived, typename OtherDerived>
-struct isMuchSmallerThan_object_selector<Derived, OtherDerived, true>
-{
-  static bool run(const Derived& x, const OtherDerived&, const typename Derived::RealScalar&)
-  {
+template <typename Derived, typename OtherDerived>
+struct isMuchSmallerThan_object_selector<Derived, OtherDerived, true> {
+  EIGEN_DEVICE_FUNC static bool run(const Derived& x, const OtherDerived&, const typename Derived::RealScalar&) {
     return x.matrix() == Derived::Zero(x.rows(), x.cols()).matrix();
   }
 };
 
-template<typename Derived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
-struct isMuchSmallerThan_scalar_selector
-{
-  static bool run(const Derived& x, const typename Derived::RealScalar& y, const typename Derived::RealScalar& prec)
-  {
+template <typename Derived, bool is_integer = NumTraits<typename Derived::Scalar>::IsInteger>
+struct isMuchSmallerThan_scalar_selector {
+  EIGEN_DEVICE_FUNC static bool run(const Derived& x, const typename Derived::RealScalar& y,
+                                    const typename Derived::RealScalar& prec) {
     return x.cwiseAbs2().sum() <= numext::abs2(prec * y);
   }
 };
 
-template<typename Derived>
-struct isMuchSmallerThan_scalar_selector<Derived, true>
-{
-  static bool run(const Derived& x, const typename Derived::RealScalar&, const typename Derived::RealScalar&)
-  {
+template <typename Derived>
+struct isMuchSmallerThan_scalar_selector<Derived, true> {
+  EIGEN_DEVICE_FUNC static bool run(const Derived& x, const typename Derived::RealScalar&,
+                                    const typename Derived::RealScalar&) {
     return x.matrix() == Derived::Zero(x.rows(), x.cols()).matrix();
   }
 };
 
-} // end namespace internal
-
+}  // end namespace internal
 
 /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-  * determined by \a prec.
-  *
-  * \note The fuzzy compares are done multiplicatively. Two vectors \f$ v \f$ and \f$ w \f$
-  * are considered to be approximately equal within precision \f$ p \f$ if
-  * \f[ \Vert v - w \Vert \leqslant p\,\min(\Vert v\Vert, \Vert w\Vert). \f]
-  * For matrices, the comparison is done using the Hilbert-Schmidt norm (aka Frobenius norm
-  * L2 norm).
-  *
-  * \note Because of the multiplicativeness of this comparison, one can't use this function
-  * to check whether \c *this is approximately equal to the zero matrix or vector.
-  * Indeed, \c isApprox(zero) returns false unless \c *this itself is exactly the zero matrix
-  * or vector. If you want to test whether \c *this is zero, use internal::isMuchSmallerThan(const
-  * RealScalar&, RealScalar) instead.
-  *
-  * \sa internal::isMuchSmallerThan(const RealScalar&, RealScalar) const
-  */
-template<typename Derived>
-template<typename OtherDerived>
-bool DenseBase<Derived>::isApprox(
-  const DenseBase<OtherDerived>& other,
-  const RealScalar& prec
-) const
-{
+ * determined by \a prec.
+ *
+ * \note The fuzzy compares are done multiplicatively. Two vectors \f$ v \f$ and \f$ w \f$
+ * are considered to be approximately equal within precision \f$ p \f$ if
+ * \f[ \Vert v - w \Vert \leqslant p\,\min(\Vert v\Vert, \Vert w\Vert). \f]
+ * For matrices, the comparison is done using the Hilbert-Schmidt norm (aka Frobenius norm
+ * L2 norm).
+ *
+ * \note Because of the multiplicativeness of this comparison, one can't use this function
+ * to check whether \c *this is approximately equal to the zero matrix or vector.
+ * Indeed, \c isApprox(zero) returns false unless \c *this itself is exactly the zero matrix
+ * or vector. If you want to test whether \c *this is zero, use internal::isMuchSmallerThan(const
+ * RealScalar&, RealScalar) instead.
+ *
+ * \sa internal::isMuchSmallerThan(const RealScalar&, RealScalar) const
+ */
+template <typename Derived>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isApprox(const DenseBase<OtherDerived>& other,
+                                                    const RealScalar& prec) const {
   return internal::isApprox_selector<Derived, OtherDerived>::run(derived(), other.derived(), prec);
 }
 
 /** \returns \c true if the norm of \c *this is much smaller than \a other,
-  * within the precision determined by \a prec.
-  *
-  * \note The fuzzy compares are done multiplicatively. A vector \f$ v \f$ is
-  * considered to be much smaller than \f$ x \f$ within precision \f$ p \f$ if
-  * \f[ \Vert v \Vert \leqslant p\,\vert x\vert. \f]
-  *
-  * For matrices, the comparison is done using the Hilbert-Schmidt norm. For this reason,
-  * the value of the reference scalar \a other should come from the Hilbert-Schmidt norm
-  * of a reference matrix of same dimensions.
-  *
-  * \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const
-  */
-template<typename Derived>
-bool DenseBase<Derived>::isMuchSmallerThan(
-  const typename NumTraits<Scalar>::Real& other,
-  const RealScalar& prec
-) const
-{
+ * within the precision determined by \a prec.
+ *
+ * \note The fuzzy compares are done multiplicatively. A vector \f$ v \f$ is
+ * considered to be much smaller than \f$ x \f$ within precision \f$ p \f$ if
+ * \f[ \Vert v \Vert \leqslant p\,\vert x\vert. \f]
+ *
+ * For matrices, the comparison is done using the Hilbert-Schmidt norm. For this reason,
+ * the value of the reference scalar \a other should come from the Hilbert-Schmidt norm
+ * of a reference matrix of same dimensions.
+ *
+ * \sa isApprox(), isMuchSmallerThan(const DenseBase<OtherDerived>&, RealScalar) const
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(const typename NumTraits<Scalar>::Real& other,
+                                                             const RealScalar& prec) const {
   return internal::isMuchSmallerThan_scalar_selector<Derived>::run(derived(), other, prec);
 }
 
 /** \returns \c true if the norm of \c *this is much smaller than the norm of \a other,
-  * within the precision determined by \a prec.
-  *
-  * \note The fuzzy compares are done multiplicatively. A vector \f$ v \f$ is
-  * considered to be much smaller than a vector \f$ w \f$ within precision \f$ p \f$ if
-  * \f[ \Vert v \Vert \leqslant p\,\Vert w\Vert. \f]
-  * For matrices, the comparison is done using the Hilbert-Schmidt norm.
-  *
-  * \sa isApprox(), isMuchSmallerThan(const RealScalar&, RealScalar) const
-  */
-template<typename Derived>
-template<typename OtherDerived>
-bool DenseBase<Derived>::isMuchSmallerThan(
-  const DenseBase<OtherDerived>& other,
-  const RealScalar& prec
-) const
-{
+ * within the precision determined by \a prec.
+ *
+ * \note The fuzzy compares are done multiplicatively. A vector \f$ v \f$ is
+ * considered to be much smaller than a vector \f$ w \f$ within precision \f$ p \f$ if
+ * \f[ \Vert v \Vert \leqslant p\,\Vert w\Vert. \f]
+ * For matrices, the comparison is done using the Hilbert-Schmidt norm.
+ *
+ * \sa isApprox(), isMuchSmallerThan(const RealScalar&, RealScalar) const
+ */
+template <typename Derived>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC bool DenseBase<Derived>::isMuchSmallerThan(const DenseBase<OtherDerived>& other,
+                                                             const RealScalar& prec) const {
   return internal::isMuchSmallerThan_object_selector<Derived, OtherDerived>::run(derived(), other.derived(), prec);
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_FUZZY_H
+#endif  // EIGEN_FUZZY_H
diff --git a/inst/include/Eigen/src/Core/GeneralProduct.h b/inst/include/Eigen/src/Core/GeneralProduct.h
index 0eae5299..e4c51d2a 100644
--- a/inst/include/Eigen/src/Core/GeneralProduct.h
+++ b/inst/include/Eigen/src/Core/GeneralProduct.h
@@ -11,174 +11,187 @@
 #ifndef EIGEN_GENERAL_PRODUCT_H
 #define EIGEN_GENERAL_PRODUCT_H
 
-namespace Eigen { 
-
-/** \class GeneralProduct
-  * \ingroup Core_Module
-  *
-  * \brief Expression of the product of two general matrices or vectors
-  *
-  * \param LhsNested the type used to store the left-hand side
-  * \param RhsNested the type used to store the right-hand side
-  * \param ProductMode the type of the product
-  *
-  * This class represents an expression of the product of two general matrices.
-  * We call a general matrix, a dense matrix with full storage. For instance,
-  * This excludes triangular, selfadjoint, and sparse matrices.
-  * It is the return type of the operator* between general matrices. Its template
-  * arguments are determined automatically by ProductReturnType. Therefore,
-  * GeneralProduct should never be used direclty. To determine the result type of a
-  * function which involves a matrix product, use ProductReturnType::Type.
-  *
-  * \sa ProductReturnType, MatrixBase::operator*(const MatrixBase<OtherDerived>&)
-  */
-template<typename Lhs, typename Rhs, int ProductType = internal::product_type<Lhs,Rhs>::value>
-class GeneralProduct;
-
-enum {
-  Large = 2,
-  Small = 3
-};
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+enum { Large = 2, Small = 3 };
+
+// Define the threshold value to fallback from the generic matrix-matrix product
+// implementation (heavy) to the lightweight coeff-based product one.
+// See generic_product_impl<Lhs,Rhs,DenseShape,DenseShape,GemmProduct>
+// in products/GeneralMatrixMatrix.h for more details.
+// TODO This threshold should also be used in the compile-time selector below.
+#ifndef EIGEN_GEMM_TO_COEFFBASED_THRESHOLD
+// This default value has been obtained on a Haswell architecture.
+#define EIGEN_GEMM_TO_COEFFBASED_THRESHOLD 20
+#endif
 
 namespace internal {
 
-template<int Rows, int Cols, int Depth> struct product_type_selector;
+template <int Rows, int Cols, int Depth>
+struct product_type_selector;
 
-template<int Size, int MaxSize> struct product_size_category
-{
-  enum { is_large = MaxSize == Dynamic ||
-                    Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD,
-         value = is_large  ? Large
-               : Size == 1 ? 1
-                           : Small
+template <int Size, int MaxSize>
+struct product_size_category {
+  enum {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+    is_large = MaxSize == Dynamic || Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD ||
+               (Size == Dynamic && MaxSize >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD),
+#else
+    is_large = 0,
+#endif
+    value = is_large    ? Large
+            : Size == 1 ? 1
+                        : Small
   };
 };
 
-template<typename Lhs, typename Rhs> struct product_type
-{
-  typedef typename remove_all<Lhs>::type _Lhs;
-  typedef typename remove_all<Rhs>::type _Rhs;
+template <typename Lhs, typename Rhs>
+struct product_type {
+  typedef remove_all_t<Lhs> Lhs_;
+  typedef remove_all_t<Rhs> Rhs_;
   enum {
-    MaxRows  = _Lhs::MaxRowsAtCompileTime,
-    Rows  = _Lhs::RowsAtCompileTime,
-    MaxCols  = _Rhs::MaxColsAtCompileTime,
-    Cols  = _Rhs::ColsAtCompileTime,
-    MaxDepth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::MaxColsAtCompileTime,
-                                           _Rhs::MaxRowsAtCompileTime),
-    Depth = EIGEN_SIZE_MIN_PREFER_FIXED(_Lhs::ColsAtCompileTime,
-                                        _Rhs::RowsAtCompileTime),
-    LargeThreshold = EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+    MaxRows = traits<Lhs_>::MaxRowsAtCompileTime,
+    Rows = traits<Lhs_>::RowsAtCompileTime,
+    MaxCols = traits<Rhs_>::MaxColsAtCompileTime,
+    Cols = traits<Rhs_>::ColsAtCompileTime,
+    MaxDepth = min_size_prefer_fixed(traits<Lhs_>::MaxColsAtCompileTime, traits<Rhs_>::MaxRowsAtCompileTime),
+    Depth = min_size_prefer_fixed(traits<Lhs_>::ColsAtCompileTime, traits<Rhs_>::RowsAtCompileTime)
   };
 
   // the splitting into different lines of code here, introducing the _select enums and the typedef below,
   // is to work around an internal compiler error with gcc 4.1 and 4.2.
-private:
+ private:
   enum {
-    rows_select = product_size_category<Rows,MaxRows>::value,
-    cols_select = product_size_category<Cols,MaxCols>::value,
-    depth_select = product_size_category<Depth,MaxDepth>::value
+    rows_select = product_size_category<Rows, MaxRows>::value,
+    cols_select = product_size_category<Cols, MaxCols>::value,
+    depth_select = product_size_category<Depth, MaxDepth>::value
   };
   typedef product_type_selector<rows_select, cols_select, depth_select> selector;
 
-public:
-  enum {
-    value = selector::ret
-  };
+ public:
+  enum { value = selector::ret, ret = selector::ret };
 #ifdef EIGEN_DEBUG_PRODUCT
-  static void debug()
-  {
-      EIGEN_DEBUG_VAR(Rows);
-      EIGEN_DEBUG_VAR(Cols);
-      EIGEN_DEBUG_VAR(Depth);
-      EIGEN_DEBUG_VAR(rows_select);
-      EIGEN_DEBUG_VAR(cols_select);
-      EIGEN_DEBUG_VAR(depth_select);
-      EIGEN_DEBUG_VAR(value);
+  static void debug() {
+    EIGEN_DEBUG_VAR(Rows);
+    EIGEN_DEBUG_VAR(Cols);
+    EIGEN_DEBUG_VAR(Depth);
+    EIGEN_DEBUG_VAR(rows_select);
+    EIGEN_DEBUG_VAR(cols_select);
+    EIGEN_DEBUG_VAR(depth_select);
+    EIGEN_DEBUG_VAR(value);
   }
 #endif
 };
 
-
 /* The following allows to select the kind of product at compile time
  * based on the three dimensions of the product.
  * This is a compile time mapping from {1,Small,Large}^3 -> {product types} */
 // FIXME I'm not sure the current mapping is the ideal one.
-template<int M, int N>  struct product_type_selector<M,N,1>              { enum { ret = OuterProduct }; };
-template<int Depth>     struct product_type_selector<1,    1,    Depth>  { enum { ret = InnerProduct }; };
-template<>              struct product_type_selector<1,    1,    1>      { enum { ret = InnerProduct }; };
-template<>              struct product_type_selector<Small,1,    Small>  { enum { ret = CoeffBasedProductMode }; };
-template<>              struct product_type_selector<1,    Small,Small>  { enum { ret = CoeffBasedProductMode }; };
-template<>              struct product_type_selector<Small,Small,Small>  { enum { ret = CoeffBasedProductMode }; };
-template<>              struct product_type_selector<Small, Small, 1>    { enum { ret = LazyCoeffBasedProductMode }; };
-template<>              struct product_type_selector<Small, Large, 1>    { enum { ret = LazyCoeffBasedProductMode }; };
-template<>              struct product_type_selector<Large, Small, 1>    { enum { ret = LazyCoeffBasedProductMode }; };
-template<>              struct product_type_selector<1,    Large,Small>  { enum { ret = CoeffBasedProductMode }; };
-template<>              struct product_type_selector<1,    Large,Large>  { enum { ret = GemvProduct }; };
-template<>              struct product_type_selector<1,    Small,Large>  { enum { ret = CoeffBasedProductMode }; };
-template<>              struct product_type_selector<Large,1,    Small>  { enum { ret = CoeffBasedProductMode }; };
-template<>              struct product_type_selector<Large,1,    Large>  { enum { ret = GemvProduct }; };
-template<>              struct product_type_selector<Small,1,    Large>  { enum { ret = CoeffBasedProductMode }; };
-template<>              struct product_type_selector<Small,Small,Large>  { enum { ret = GemmProduct }; };
-template<>              struct product_type_selector<Large,Small,Large>  { enum { ret = GemmProduct }; };
-template<>              struct product_type_selector<Small,Large,Large>  { enum { ret = GemmProduct }; };
-template<>              struct product_type_selector<Large,Large,Large>  { enum { ret = GemmProduct }; };
-template<>              struct product_type_selector<Large,Small,Small>  { enum { ret = GemmProduct }; };
-template<>              struct product_type_selector<Small,Large,Small>  { enum { ret = GemmProduct }; };
-template<>              struct product_type_selector<Large,Large,Small>  { enum { ret = GemmProduct }; };
-
-} // end namespace internal
-
-/** \class ProductReturnType
-  * \ingroup Core_Module
-  *
-  * \brief Helper class to get the correct and optimized returned type of operator*
-  *
-  * \param Lhs the type of the left-hand side
-  * \param Rhs the type of the right-hand side
-  * \param ProductMode the type of the product (determined automatically by internal::product_mode)
-  *
-  * This class defines the typename Type representing the optimized product expression
-  * between two matrix expressions. In practice, using ProductReturnType<Lhs,Rhs>::Type
-  * is the recommended way to define the result type of a function returning an expression
-  * which involve a matrix product. The class Product should never be
-  * used directly.
-  *
-  * \sa class Product, MatrixBase::operator*(const MatrixBase<OtherDerived>&)
-  */
-template<typename Lhs, typename Rhs, int ProductType>
-struct ProductReturnType
-{
-  // TODO use the nested type to reduce instanciations ????
-//   typedef typename internal::nested<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
-//   typedef typename internal::nested<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
-
-  typedef GeneralProduct<Lhs/*Nested*/, Rhs/*Nested*/, ProductType> Type;
-};
-
-template<typename Lhs, typename Rhs>
-struct ProductReturnType<Lhs,Rhs,CoeffBasedProductMode>
-{
-  typedef typename internal::nested<Lhs, Rhs::ColsAtCompileTime, typename internal::plain_matrix_type<Lhs>::type >::type LhsNested;
-  typedef typename internal::nested<Rhs, Lhs::RowsAtCompileTime, typename internal::plain_matrix_type<Rhs>::type >::type RhsNested;
-  typedef CoeffBasedProduct<LhsNested, RhsNested, EvalBeforeAssigningBit | EvalBeforeNestingBit> Type;
-};
-
-template<typename Lhs, typename Rhs>
-struct ProductReturnType<Lhs,Rhs,LazyCoeffBasedProductMode>
-{
-  typedef typename internal::nested<Lhs, Rhs::ColsAtCompileTime, typename internal::plain_matrix_type<Lhs>::type >::type LhsNested;
-  typedef typename internal::nested<Rhs, Lhs::RowsAtCompileTime, typename internal::plain_matrix_type<Rhs>::type >::type RhsNested;
-  typedef CoeffBasedProduct<LhsNested, RhsNested, NestByRefBit> Type;
-};
-
-// this is a workaround for sun CC
-template<typename Lhs, typename Rhs>
-struct LazyProductReturnType : public ProductReturnType<Lhs,Rhs,LazyCoeffBasedProductMode>
-{};
+template <int M, int N>
+struct product_type_selector<M, N, 1> {
+  enum { ret = OuterProduct };
+};
+template <int M>
+struct product_type_selector<M, 1, 1> {
+  enum { ret = LazyCoeffBasedProductMode };
+};
+template <int N>
+struct product_type_selector<1, N, 1> {
+  enum { ret = LazyCoeffBasedProductMode };
+};
+template <int Depth>
+struct product_type_selector<1, 1, Depth> {
+  enum { ret = InnerProduct };
+};
+template <>
+struct product_type_selector<1, 1, 1> {
+  enum { ret = InnerProduct };
+};
+template <>
+struct product_type_selector<Small, 1, Small> {
+  enum { ret = CoeffBasedProductMode };
+};
+template <>
+struct product_type_selector<1, Small, Small> {
+  enum { ret = CoeffBasedProductMode };
+};
+template <>
+struct product_type_selector<Small, Small, Small> {
+  enum { ret = CoeffBasedProductMode };
+};
+template <>
+struct product_type_selector<Small, Small, 1> {
+  enum { ret = LazyCoeffBasedProductMode };
+};
+template <>
+struct product_type_selector<Small, Large, 1> {
+  enum { ret = LazyCoeffBasedProductMode };
+};
+template <>
+struct product_type_selector<Large, Small, 1> {
+  enum { ret = LazyCoeffBasedProductMode };
+};
+template <>
+struct product_type_selector<1, Large, Small> {
+  enum { ret = CoeffBasedProductMode };
+};
+template <>
+struct product_type_selector<1, Large, Large> {
+  enum { ret = GemvProduct };
+};
+template <>
+struct product_type_selector<1, Small, Large> {
+  enum { ret = CoeffBasedProductMode };
+};
+template <>
+struct product_type_selector<Large, 1, Small> {
+  enum { ret = CoeffBasedProductMode };
+};
+template <>
+struct product_type_selector<Large, 1, Large> {
+  enum { ret = GemvProduct };
+};
+template <>
+struct product_type_selector<Small, 1, Large> {
+  enum { ret = CoeffBasedProductMode };
+};
+template <>
+struct product_type_selector<Small, Small, Large> {
+  enum { ret = GemmProduct };
+};
+template <>
+struct product_type_selector<Large, Small, Large> {
+  enum { ret = GemmProduct };
+};
+template <>
+struct product_type_selector<Small, Large, Large> {
+  enum { ret = GemmProduct };
+};
+template <>
+struct product_type_selector<Large, Large, Large> {
+  enum { ret = GemmProduct };
+};
+template <>
+struct product_type_selector<Large, Small, Small> {
+  enum { ret = CoeffBasedProductMode };
+};
+template <>
+struct product_type_selector<Small, Large, Small> {
+  enum { ret = CoeffBasedProductMode };
+};
+template <>
+struct product_type_selector<Large, Large, Small> {
+  enum { ret = GemmProduct };
+};
+
+}  // end namespace internal
 
 /***********************************************************************
-*  Implementation of Inner Vector Vector Product
-***********************************************************************/
+ *  Implementation of Inner Vector Vector Product
+ ***********************************************************************/
 
 // FIXME : maybe the "inner product" could return a Scalar
 // instead of a 1x1 matrix ??
@@ -187,122 +200,13 @@ struct LazyProductReturnType : public ProductReturnType<Lhs,Rhs,LazyCoeffBasedPr
 // product ends up to a row-vector times col-vector product... To tackle this use
 // case, we could have a specialization for Block<MatrixType,1,1> with: operator=(Scalar x);
 
-namespace internal {
-
-template<typename Lhs, typename Rhs>
-struct traits<GeneralProduct<Lhs,Rhs,InnerProduct> >
- : traits<Matrix<typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1> >
-{};
-
-}
-
-template<typename Lhs, typename Rhs>
-class GeneralProduct<Lhs, Rhs, InnerProduct>
-  : internal::no_assignment_operator,
-    public Matrix<typename internal::scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1>
-{
-    typedef Matrix<typename internal::scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType,1,1> Base;
-  public:
-    GeneralProduct(const Lhs& lhs, const Rhs& rhs)
-    {
-      EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::RealScalar, typename Rhs::RealScalar>::value),
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-
-      Base::coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum();
-    }
-
-    /** Convertion to scalar */
-    operator const typename Base::Scalar() const {
-      return Base::coeff(0,0);
-    }
-};
-
 /***********************************************************************
-*  Implementation of Outer Vector Vector Product
-***********************************************************************/
-
-namespace internal {
-
-// Column major
-template<typename ProductType, typename Dest, typename Func>
-EIGEN_DONT_INLINE void outer_product_selector_run(const ProductType& prod, Dest& dest, const Func& func, const false_type&)
-{
-  typedef typename Dest::Index Index;
-  // FIXME make sure lhs is sequentially stored
-  // FIXME not very good if rhs is real and lhs complex while alpha is real too
-  const Index cols = dest.cols();
-  for (Index j=0; j<cols; ++j)
-    func(dest.col(j), prod.rhs().coeff(0,j) * prod.lhs());
-}
-
-// Row major
-template<typename ProductType, typename Dest, typename Func>
-EIGEN_DONT_INLINE void outer_product_selector_run(const ProductType& prod, Dest& dest, const Func& func, const true_type&) {
-  typedef typename Dest::Index Index;
-  // FIXME make sure rhs is sequentially stored
-  // FIXME not very good if lhs is real and rhs complex while alpha is real too
-  const Index rows = dest.rows();
-  for (Index i=0; i<rows; ++i)
-    func(dest.row(i), prod.lhs().coeff(i,0) * prod.rhs());
-}
-
-template<typename Lhs, typename Rhs>
-struct traits<GeneralProduct<Lhs,Rhs,OuterProduct> >
- : traits<ProductBase<GeneralProduct<Lhs,Rhs,OuterProduct>, Lhs, Rhs> >
-{};
-
-}
-
-template<typename Lhs, typename Rhs>
-class GeneralProduct<Lhs, Rhs, OuterProduct>
-  : public ProductBase<GeneralProduct<Lhs,Rhs,OuterProduct>, Lhs, Rhs>
-{
-    template<typename T> struct is_row_major : internal::conditional<(int(T::Flags)&RowMajorBit), internal::true_type, internal::false_type>::type {};
-    
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
-
-    GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {
-      EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::RealScalar, typename Rhs::RealScalar>::value),
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-    }
-    
-    struct set  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived()  = src; } };
-    struct add  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() += src; } };
-    struct sub  { template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const { dst.const_cast_derived() -= src; } };
-    struct adds {
-      Scalar m_scale;
-      adds(const Scalar& s) : m_scale(s) {}
-      template<typename Dst, typename Src> void operator()(const Dst& dst, const Src& src) const {
-        dst.const_cast_derived() += m_scale * src;
-      }
-    };
-    
-    template<typename Dest>
-    inline void evalTo(Dest& dest) const {
-      internal::outer_product_selector_run(*this, dest, set(), is_row_major<Dest>());
-    }
-    
-    template<typename Dest>
-    inline void addTo(Dest& dest) const {
-      internal::outer_product_selector_run(*this, dest, add(), is_row_major<Dest>());
-    }
-
-    template<typename Dest>
-    inline void subTo(Dest& dest) const {
-      internal::outer_product_selector_run(*this, dest, sub(), is_row_major<Dest>());
-    }
-
-    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
-    {
-      internal::outer_product_selector_run(*this, dest, adds(alpha), is_row_major<Dest>());
-    }
-};
+ *  Implementation of Outer Vector Vector Product
+ ***********************************************************************/
 
 /***********************************************************************
-*  Implementation of General Matrix Vector Product
-***********************************************************************/
+ *  Implementation of General Matrix Vector Product
+ ***********************************************************************/
 
 /*  According to the shape/flags of the matrix we have to distinghish 3 different cases:
  *   1 - the matrix is col-major, BLAS compatible and M is large => call fast BLAS-like colmajor routine
@@ -313,323 +217,303 @@ class GeneralProduct<Lhs, Rhs, OuterProduct>
  */
 namespace internal {
 
-template<typename Lhs, typename Rhs>
-struct traits<GeneralProduct<Lhs,Rhs,GemvProduct> >
- : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemvProduct>, Lhs, Rhs> >
-{};
+template <int Side, int StorageOrder, bool BlasCompatible>
+struct gemv_dense_selector;
 
-template<int Side, int StorageOrder, bool BlasCompatible>
-struct gemv_selector;
+}  // end namespace internal
 
-} // end namespace internal
-
-template<typename Lhs, typename Rhs>
-class GeneralProduct<Lhs, Rhs, GemvProduct>
-  : public ProductBase<GeneralProduct<Lhs,Rhs,GemvProduct>, Lhs, Rhs>
-{
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
-
-    typedef typename Lhs::Scalar LhsScalar;
-    typedef typename Rhs::Scalar RhsScalar;
+namespace internal {
 
-    GeneralProduct(const Lhs& a_lhs, const Rhs& a_rhs) : Base(a_lhs,a_rhs)
-    {
-//       EIGEN_STATIC_ASSERT((internal::is_same<typename Lhs::Scalar, typename Rhs::Scalar>::value),
-//         YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-    }
+template <typename Scalar, int Size, int MaxSize, bool Cond>
+struct gemv_static_vector_if;
 
-    enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
-    typedef typename internal::conditional<int(Side)==OnTheRight,_LhsNested,_RhsNested>::type MatrixType;
+template <typename Scalar, int Size, int MaxSize>
+struct gemv_static_vector_if<Scalar, Size, MaxSize, false> {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Scalar* data() {
+    eigen_internal_assert(false && "should never be called");
+    return 0;
+  }
+};
 
-    template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
-    {
-      eigen_assert(m_lhs.rows() == dst.rows() && m_rhs.cols() == dst.cols());
-      internal::gemv_selector<Side,(int(MatrixType::Flags)&RowMajorBit) ? RowMajor : ColMajor,
-                       bool(internal::blas_traits<MatrixType>::HasUsableDirectAccess)>::run(*this, dst, alpha);
-    }
+template <typename Scalar, int Size>
+struct gemv_static_vector_if<Scalar, Size, Dynamic, true> {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Scalar* data() { return 0; }
 };
 
-namespace internal {
+template <typename Scalar, int Size, int MaxSize>
+struct gemv_static_vector_if<Scalar, Size, MaxSize, true> {
+#if EIGEN_MAX_STATIC_ALIGN_BYTES != 0
+  internal::plain_array<Scalar, internal::min_size_prefer_fixed(Size, MaxSize), 0, AlignedMax> m_data;
+  EIGEN_STRONG_INLINE constexpr Scalar* data() { return m_data.array; }
+#else
+  // Some architectures cannot align on the stack,
+  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
+  internal::plain_array<Scalar, internal::min_size_prefer_fixed(Size, MaxSize) + EIGEN_MAX_ALIGN_BYTES, 0> m_data;
+  EIGEN_STRONG_INLINE constexpr Scalar* data() {
+    return reinterpret_cast<Scalar*>((std::uintptr_t(m_data.array) & ~(std::size_t(EIGEN_MAX_ALIGN_BYTES - 1))) +
+                                     EIGEN_MAX_ALIGN_BYTES);
+  }
+#endif
+};
 
 // The vector is on the left => transposition
-template<int StorageOrder, bool BlasCompatible>
-struct gemv_selector<OnTheLeft,StorageOrder,BlasCompatible>
-{
-  template<typename ProductType, typename Dest>
-  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
-  {
+template <int StorageOrder, bool BlasCompatible>
+struct gemv_dense_selector<OnTheLeft, StorageOrder, BlasCompatible> {
+  template <typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs& lhs, const Rhs& rhs, Dest& dest, const typename Dest::Scalar& alpha) {
     Transpose<Dest> destT(dest);
     enum { OtherStorageOrder = StorageOrder == RowMajor ? ColMajor : RowMajor };
-    gemv_selector<OnTheRight,OtherStorageOrder,BlasCompatible>
-      ::run(GeneralProduct<Transpose<const typename ProductType::_RhsNested>,Transpose<const typename ProductType::_LhsNested>, GemvProduct>
-        (prod.rhs().transpose(), prod.lhs().transpose()), destT, alpha);
+    gemv_dense_selector<OnTheRight, OtherStorageOrder, BlasCompatible>::run(rhs.transpose(), lhs.transpose(), destT,
+                                                                            alpha);
   }
 };
 
-template<typename Scalar,int Size,int MaxSize,bool Cond> struct gemv_static_vector_if;
+template <>
+struct gemv_dense_selector<OnTheRight, ColMajor, true> {
+  template <typename Lhs, typename Rhs, typename Dest>
+  static inline void run(const Lhs& lhs, const Rhs& rhs, Dest& dest, const typename Dest::Scalar& alpha) {
+    typedef typename Lhs::Scalar LhsScalar;
+    typedef typename Rhs::Scalar RhsScalar;
+    typedef typename Dest::Scalar ResScalar;
 
-template<typename Scalar,int Size,int MaxSize>
-struct gemv_static_vector_if<Scalar,Size,MaxSize,false>
-{
-  EIGEN_STRONG_INLINE  Scalar* data() { eigen_internal_assert(false && "should never be called"); return 0; }
-};
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
 
-template<typename Scalar,int Size>
-struct gemv_static_vector_if<Scalar,Size,Dynamic,true>
-{
-  EIGEN_STRONG_INLINE Scalar* data() { return 0; }
-};
+    typedef Map<Matrix<ResScalar, Dynamic, 1>, plain_enum_min(AlignedMax, internal::packet_traits<ResScalar>::size)>
+        MappedDest;
 
-template<typename Scalar,int Size,int MaxSize>
-struct gemv_static_vector_if<Scalar,Size,MaxSize,true>
-{
-  #if EIGEN_ALIGN_STATICALLY
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize),0> m_data;
-  EIGEN_STRONG_INLINE Scalar* data() { return m_data.array; }
-  #else
-  // Some architectures cannot align on the stack,
-  // => let's manually enforce alignment by allocating more data and return the address of the first aligned element.
-  enum {
-    ForceAlignment  = internal::packet_traits<Scalar>::Vectorizable,
-    PacketSize      = internal::packet_traits<Scalar>::size
-  };
-  internal::plain_array<Scalar,EIGEN_SIZE_MIN_PREFER_FIXED(Size,MaxSize)+(ForceAlignment?PacketSize:0),0> m_data;
-  EIGEN_STRONG_INLINE Scalar* data() {
-    return ForceAlignment
-            ? reinterpret_cast<Scalar*>((reinterpret_cast<size_t>(m_data.array) & ~(size_t(15))) + 16)
-            : m_data.array;
-  }
-  #endif
-};
-
-template<> struct gemv_selector<OnTheRight,ColMajor,true>
-{
-  template<typename ProductType, typename Dest>
-  static inline void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
-  {
-    typedef typename ProductType::Index Index;
-    typedef typename ProductType::LhsScalar   LhsScalar;
-    typedef typename ProductType::RhsScalar   RhsScalar;
-    typedef typename ProductType::Scalar      ResScalar;
-    typedef typename ProductType::RealScalar  RealScalar;
-    typedef typename ProductType::ActualLhsType ActualLhsType;
-    typedef typename ProductType::ActualRhsType ActualRhsType;
-    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
-    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
-
-    ActualLhsType actualLhs = LhsBlasTraits::extract(prod.lhs());
-    ActualRhsType actualRhs = RhsBlasTraits::extract(prod.rhs());
-
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
-                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+    ActualLhsType actualLhs = LhsBlasTraits::extract(lhs);
+    ActualRhsType actualRhs = RhsBlasTraits::extract(rhs);
+
+    ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs);
+
+    // make sure Dest is a compile-time vector type (bug 1166)
+    typedef std::conditional_t<Dest::IsVectorAtCompileTime, Dest, typename Dest::ColXpr> ActualDest;
 
     enum {
       // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
       // on, the other hand it is good for the cache to pack the vector anyways...
-      EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1,
+      EvalToDestAtCompileTime = (ActualDest::InnerStrideAtCompileTime == 1),
       ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
-      MightCannotUseDest = (Dest::InnerStrideAtCompileTime!=1) || ComplexByReal
+      MightCannotUseDest = ((!EvalToDestAtCompileTime) || ComplexByReal) && (ActualDest::MaxSizeAtCompileTime != 0)
     };
 
-    gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
-
-    bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
-    bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
-    
-    RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
-
-    ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
-                                                  evalToDest ? dest.data() : static_dest.data());
-    
-    if(!evalToDest)
-    {
-      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = dest.size();
-      EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      #endif
-      if(!alphaIsCompatible)
-      {
-        MappedDest(actualDestPtr, dest.size()).setZero();
-        compatibleAlpha = RhsScalar(1);
+    typedef const_blas_data_mapper<LhsScalar, Index, ColMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar, Index, RowMajor> RhsMapper;
+    RhsScalar compatibleAlpha = get_factor<ResScalar, RhsScalar>::run(actualAlpha);
+
+    if (!MightCannotUseDest) {
+      // shortcut if we are sure to be able to use dest directly,
+      // this ease the compiler to generate cleaner and more optimzized code for most common cases
+      general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, LhsBlasTraits::NeedToConjugate, RhsScalar,
+                                    RhsMapper, RhsBlasTraits::NeedToConjugate>::run(actualLhs.rows(), actualLhs.cols(),
+                                                                                    LhsMapper(actualLhs.data(),
+                                                                                              actualLhs.outerStride()),
+                                                                                    RhsMapper(actualRhs.data(),
+                                                                                              actualRhs.innerStride()),
+                                                                                    dest.data(), 1, compatibleAlpha);
+    } else {
+      gemv_static_vector_if<ResScalar, ActualDest::SizeAtCompileTime, ActualDest::MaxSizeAtCompileTime,
+                            MightCannotUseDest>
+          static_dest;
+
+      const bool alphaIsCompatible = (!ComplexByReal) || (numext::is_exactly_zero(numext::imag(actualAlpha)));
+      const bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
+
+      ei_declare_aligned_stack_constructed_variable(ResScalar, actualDestPtr, dest.size(),
+                                                    evalToDest ? dest.data() : static_dest.data());
+
+      if (!evalToDest) {
+#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+        constexpr int Size = Dest::SizeAtCompileTime;
+        Index size = dest.size();
+        EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+#endif
+        if (!alphaIsCompatible) {
+          MappedDest(actualDestPtr, dest.size()).setZero();
+          compatibleAlpha = RhsScalar(1);
+        } else
+          MappedDest(actualDestPtr, dest.size()) = dest;
       }
-      else
-        MappedDest(actualDestPtr, dest.size()) = dest;
-    }
 
-    general_matrix_vector_product
-      <Index,LhsScalar,ColMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
-        actualLhs.rows(), actualLhs.cols(),
-        actualLhs.data(), actualLhs.outerStride(),
-        actualRhs.data(), actualRhs.innerStride(),
-        actualDestPtr, 1,
-        compatibleAlpha);
-
-    if (!evalToDest)
-    {
-      if(!alphaIsCompatible)
-        dest += actualAlpha * MappedDest(actualDestPtr, dest.size());
-      else
-        dest = MappedDest(actualDestPtr, dest.size());
+      general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, LhsBlasTraits::NeedToConjugate, RhsScalar,
+                                    RhsMapper, RhsBlasTraits::NeedToConjugate>::run(actualLhs.rows(), actualLhs.cols(),
+                                                                                    LhsMapper(actualLhs.data(),
+                                                                                              actualLhs.outerStride()),
+                                                                                    RhsMapper(actualRhs.data(),
+                                                                                              actualRhs.innerStride()),
+                                                                                    actualDestPtr, 1, compatibleAlpha);
+
+      if (!evalToDest) {
+        if (!alphaIsCompatible)
+          dest.matrix() += actualAlpha * MappedDest(actualDestPtr, dest.size());
+        else
+          dest = MappedDest(actualDestPtr, dest.size());
+      }
     }
   }
 };
 
-template<> struct gemv_selector<OnTheRight,RowMajor,true>
-{
-  template<typename ProductType, typename Dest>
-  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
-  {
-    typedef typename ProductType::LhsScalar LhsScalar;
-    typedef typename ProductType::RhsScalar RhsScalar;
-    typedef typename ProductType::Scalar    ResScalar;
-    typedef typename ProductType::Index Index;
-    typedef typename ProductType::ActualLhsType ActualLhsType;
-    typedef typename ProductType::ActualRhsType ActualRhsType;
-    typedef typename ProductType::_ActualRhsType _ActualRhsType;
-    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
-    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
+template <>
+struct gemv_dense_selector<OnTheRight, RowMajor, true> {
+  template <typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs& lhs, const Rhs& rhs, Dest& dest, const typename Dest::Scalar& alpha) {
+    typedef typename Lhs::Scalar LhsScalar;
+    typedef typename Rhs::Scalar RhsScalar;
+    typedef typename Dest::Scalar ResScalar;
+
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    typedef internal::remove_all_t<ActualRhsType> ActualRhsTypeCleaned;
 
-    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
-    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+    std::add_const_t<ActualLhsType> actualLhs = LhsBlasTraits::extract(lhs);
+    std::add_const_t<ActualRhsType> actualRhs = RhsBlasTraits::extract(rhs);
 
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
-                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
+    ResScalar actualAlpha = combine_scalar_factors(alpha, lhs, rhs);
 
     enum {
       // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
       // on, the other hand it is good for the cache to pack the vector anyways...
-      DirectlyUseRhs = _ActualRhsType::InnerStrideAtCompileTime==1
+      DirectlyUseRhs =
+          ActualRhsTypeCleaned::InnerStrideAtCompileTime == 1 || ActualRhsTypeCleaned::MaxSizeAtCompileTime == 0
     };
 
-    gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
+    gemv_static_vector_if<RhsScalar, ActualRhsTypeCleaned::SizeAtCompileTime,
+                          ActualRhsTypeCleaned::MaxSizeAtCompileTime, !DirectlyUseRhs>
+        static_rhs;
 
-    ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(),
+    ei_declare_aligned_stack_constructed_variable(
+        RhsScalar, actualRhsPtr, actualRhs.size(),
         DirectlyUseRhs ? const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data());
 
-    if(!DirectlyUseRhs)
-    {
-      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = actualRhs.size();
+    if (!DirectlyUseRhs) {
+#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      constexpr int Size = ActualRhsTypeCleaned::SizeAtCompileTime;
+      Index size = actualRhs.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      #endif
-      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+#endif
+      Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
     }
 
-    general_matrix_vector_product
-      <Index,LhsScalar,RowMajor,LhsBlasTraits::NeedToConjugate,RhsScalar,RhsBlasTraits::NeedToConjugate>::run(
-        actualLhs.rows(), actualLhs.cols(),
-        actualLhs.data(), actualLhs.outerStride(),
-        actualRhsPtr, 1,
-        dest.data(), dest.innerStride(),
-        actualAlpha);
+    typedef const_blas_data_mapper<LhsScalar, Index, RowMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar, Index, ColMajor> RhsMapper;
+    general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, LhsBlasTraits::NeedToConjugate, RhsScalar,
+                                  RhsMapper, RhsBlasTraits::NeedToConjugate>::
+        run(actualLhs.rows(), actualLhs.cols(), LhsMapper(actualLhs.data(), actualLhs.outerStride()),
+            RhsMapper(actualRhsPtr, 1), dest.data(),
+            dest.col(0).innerStride(),  // NOTE  if dest is not a vector at compile-time, then dest.innerStride() might
+                                        // be wrong. (bug 1166)
+            actualAlpha);
   }
 };
 
-template<> struct gemv_selector<OnTheRight,ColMajor,false>
-{
-  template<typename ProductType, typename Dest>
-  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
-  {
-    typedef typename Dest::Index Index;
-    // TODO makes sure dest is sequentially stored in memory, otherwise use a temp
-    const Index size = prod.rhs().rows();
-    for(Index k=0; k<size; ++k)
-      dest += (alpha*prod.rhs().coeff(k)) * prod.lhs().col(k);
+template <>
+struct gemv_dense_selector<OnTheRight, ColMajor, false> {
+  template <typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs& lhs, const Rhs& rhs, Dest& dest, const typename Dest::Scalar& alpha) {
+    EIGEN_STATIC_ASSERT((!nested_eval<Lhs, 1>::Evaluate),
+                        EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE);
+    // TODO if rhs is large enough it might be beneficial to make sure that dest is sequentially stored in memory,
+    // otherwise use a temp
+    typename nested_eval<Rhs, 1>::type actual_rhs(rhs);
+    const Index size = rhs.rows();
+    for (Index k = 0; k < size; ++k) dest += (alpha * actual_rhs.coeff(k)) * lhs.col(k);
   }
 };
 
-template<> struct gemv_selector<OnTheRight,RowMajor,false>
-{
-  template<typename ProductType, typename Dest>
-  static void run(const ProductType& prod, Dest& dest, const typename ProductType::Scalar& alpha)
-  {
-    typedef typename Dest::Index Index;
-    // TODO makes sure rhs is sequentially stored in memory, otherwise use a temp
-    const Index rows = prod.rows();
-    for(Index i=0; i<rows; ++i)
-      dest.coeffRef(i) += alpha * (prod.lhs().row(i).cwiseProduct(prod.rhs().transpose())).sum();
+template <>
+struct gemv_dense_selector<OnTheRight, RowMajor, false> {
+  template <typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs& lhs, const Rhs& rhs, Dest& dest, const typename Dest::Scalar& alpha) {
+    EIGEN_STATIC_ASSERT((!nested_eval<Lhs, 1>::Evaluate),
+                        EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE);
+    typename nested_eval<Rhs, Lhs::RowsAtCompileTime>::type actual_rhs(rhs);
+    const Index rows = dest.rows();
+    for (Index i = 0; i < rows; ++i)
+      dest.coeffRef(i) += alpha * (lhs.row(i).cwiseProduct(actual_rhs.transpose())).sum();
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
 /***************************************************************************
-* Implementation of matrix base methods
-***************************************************************************/
+ * Implementation of matrix base methods
+ ***************************************************************************/
 
 /** \returns the matrix product of \c *this and \a other.
-  *
-  * \note If instead of the matrix product you want the coefficient-wise product, see Cwise::operator*().
-  *
-  * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
-  */
-template<typename Derived>
-template<typename OtherDerived>
-inline const typename ProductReturnType<Derived, OtherDerived>::Type
-MatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const
-{
+ *
+ * \note If instead of the matrix product you want the coefficient-wise product, see Cwise::operator*().
+ *
+ * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*()
+ */
+template <typename Derived>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Product<Derived, OtherDerived> MatrixBase<Derived>::operator*(
+    const MatrixBase<OtherDerived>& other) const {
   // A note regarding the function declaration: In MSVC, this function will sometimes
   // not be inlined since DenseStorage is an unwindable object for dynamic
   // matrices and product types are holding a member to store the result.
   // Thus it does not help tagging this function with EIGEN_STRONG_INLINE.
   enum {
-    ProductIsValid =  Derived::ColsAtCompileTime==Dynamic
-                   || OtherDerived::RowsAtCompileTime==Dynamic
-                   || int(Derived::ColsAtCompileTime)==int(OtherDerived::RowsAtCompileTime),
+    ProductIsValid = Derived::ColsAtCompileTime == Dynamic || OtherDerived::RowsAtCompileTime == Dynamic ||
+                     int(Derived::ColsAtCompileTime) == int(OtherDerived::RowsAtCompileTime),
     AreVectors = Derived::IsVectorAtCompileTime && OtherDerived::IsVectorAtCompileTime,
-    SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(Derived,OtherDerived)
+    SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(Derived, OtherDerived)
   };
   // note to the lost user:
   //    * for a dot product use: v1.dot(v2)
   //    * for a coeff-wise product use: v1.cwiseProduct(v2)
-  EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes),
-    INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
+  EIGEN_STATIC_ASSERT(
+      ProductIsValid || !(AreVectors && SameSizes),
+      INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
   EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors),
-    INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
+                      INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
   EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
 #ifdef EIGEN_DEBUG_PRODUCT
-  internal::product_type<Derived,OtherDerived>::debug();
+  internal::product_type<Derived, OtherDerived>::debug();
 #endif
-  return typename ProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
+
+  return Product<Derived, OtherDerived>(derived(), other.derived());
 }
 
 /** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation.
-  *
-  * The returned product will behave like any other expressions: the coefficients of the product will be
-  * computed once at a time as requested. This might be useful in some extremely rare cases when only
-  * a small and no coherent fraction of the result's coefficients have to be computed.
-  *
-  * \warning This version of the matrix product can be much much slower. So use it only if you know
-  * what you are doing and that you measured a true speed improvement.
-  *
-  * \sa operator*(const MatrixBase&)
-  */
-template<typename Derived>
-template<typename OtherDerived>
-const typename LazyProductReturnType<Derived,OtherDerived>::Type
-MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived> &other) const
-{
+ *
+ * The returned product will behave like any other expressions: the coefficients of the product will be
+ * computed once at a time as requested. This might be useful in some extremely rare cases when only
+ * a small and no coherent fraction of the result's coefficients have to be computed.
+ *
+ * \warning This version of the matrix product can be much much slower. So use it only if you know
+ * what you are doing and that you measured a true speed improvement.
+ *
+ * \sa operator*(const MatrixBase&)
+ */
+template <typename Derived>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Product<Derived, OtherDerived, LazyProduct>
+MatrixBase<Derived>::lazyProduct(const MatrixBase<OtherDerived>& other) const {
   enum {
-    ProductIsValid =  Derived::ColsAtCompileTime==Dynamic
-                   || OtherDerived::RowsAtCompileTime==Dynamic
-                   || int(Derived::ColsAtCompileTime)==int(OtherDerived::RowsAtCompileTime),
+    ProductIsValid = Derived::ColsAtCompileTime == Dynamic || OtherDerived::RowsAtCompileTime == Dynamic ||
+                     int(Derived::ColsAtCompileTime) == int(OtherDerived::RowsAtCompileTime),
     AreVectors = Derived::IsVectorAtCompileTime && OtherDerived::IsVectorAtCompileTime,
-    SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(Derived,OtherDerived)
+    SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(Derived, OtherDerived)
   };
   // note to the lost user:
   //    * for a dot product use: v1.dot(v2)
   //    * for a coeff-wise product use: v1.cwiseProduct(v2)
-  EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes),
-    INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
+  EIGEN_STATIC_ASSERT(
+      ProductIsValid || !(AreVectors && SameSizes),
+      INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
   EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors),
-    INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
+                      INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
   EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
 
-  return typename LazyProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
+  return Product<Derived, OtherDerived, LazyProduct>(derived(), other.derived());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_PRODUCT_H
+#endif  // EIGEN_PRODUCT_H
diff --git a/inst/include/Eigen/src/Core/GenericPacketMath.h b/inst/include/Eigen/src/Core/GenericPacketMath.h
index 5f783ebe..64e11231 100644
--- a/inst/include/Eigen/src/Core/GenericPacketMath.h
+++ b/inst/include/Eigen/src/Core/GenericPacketMath.h
@@ -11,17 +11,20 @@
 #ifndef EIGEN_GENERIC_PACKET_MATH_H
 #define EIGEN_GENERIC_PACKET_MATH_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
 
 /** \internal
-  * \file GenericPacketMath.h
-  *
-  * Default implementation for types not supported by the vectorization.
-  * In practice these functions are provided to make easier the writing
-  * of generic vectorized code.
-  */
+ * \file GenericPacketMath.h
+ *
+ * Default implementation for types not supported by the vectorization.
+ * In practice these functions are provided to make easier the writing
+ * of generic vectorized code.
+ */
 
 #ifndef EIGEN_DEBUG_ALIGNED_LOAD
 #define EIGEN_DEBUG_ALIGNED_LOAD
@@ -39,312 +42,1661 @@ namespace internal {
 #define EIGEN_DEBUG_UNALIGNED_STORE
 #endif
 
-struct default_packet_traits
-{
+struct default_packet_traits {
   enum {
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
+    // Ops that are implemented for most types.
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
     HasNegate = 1,
-    HasAbs    = 1,
-    HasAbs2   = 1,
-    HasMin    = 1,
-    HasMax    = 1,
-    HasConj   = 1,
+    HasAbs = 1,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
     HasSetLinear = 1,
-
-    HasDiv    = 0,
-    HasSqrt   = 0,
-    HasExp    = 0,
-    HasLog    = 0,
-    HasPow    = 0,
-
-    HasSin    = 0,
-    HasCos    = 0,
-    HasTan    = 0,
-    HasASin   = 0,
-    HasACos   = 0,
-    HasATan   = 0
+    HasSign = 1,
+    // By default, the nearest integer functions (rint, round, floor, ceil, trunc) are enabled for all scalar and packet
+    // types
+    HasRound = 1,
+
+    HasArg = 0,
+    HasAbsDiff = 0,
+    HasBlend = 0,
+    // This flag is used to indicate whether packet comparison is supported.
+    // pcmp_eq and pcmp_lt should be defined for it to be true.
+    HasCmp = 0,
+
+    HasDiv = 0,
+    HasReciprocal = 0,
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasCbrt = 0,
+    HasExp = 0,
+    HasExpm1 = 0,
+    HasLog = 0,
+    HasLog1p = 0,
+    HasLog10 = 0,
+    HasPow = 0,
+    HasSin = 0,
+    HasCos = 0,
+    HasTan = 0,
+    HasASin = 0,
+    HasACos = 0,
+    HasATan = 0,
+    HasATanh = 0,
+    HasSinh = 0,
+    HasCosh = 0,
+    HasTanh = 0,
+    HasLGamma = 0,
+    HasDiGamma = 0,
+    HasZeta = 0,
+    HasPolygamma = 0,
+    HasErf = 0,
+    HasErfc = 0,
+    HasNdtri = 0,
+    HasBessel = 0,
+    HasIGamma = 0,
+    HasIGammaDerA = 0,
+    HasGammaSampleDerAlpha = 0,
+    HasIGammac = 0,
+    HasBetaInc = 0
   };
 };
 
-template<typename T> struct packet_traits : default_packet_traits
-{
+template <typename T>
+struct packet_traits : default_packet_traits {
   typedef T type;
+  typedef T half;
   enum {
     Vectorizable = 0,
     size = 1,
-    AlignedOnScalar = 0
+    AlignedOnScalar = 0,
   };
   enum {
-    HasAdd    = 0,
-    HasSub    = 0,
-    HasMul    = 0,
+    HasAdd = 0,
+    HasSub = 0,
+    HasMul = 0,
     HasNegate = 0,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
-    HasConj   = 0,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasConj = 0,
     HasSetLinear = 0
   };
 };
 
+template <typename T>
+struct packet_traits<const T> : packet_traits<T> {};
+
+template <typename T>
+struct unpacket_traits {
+  typedef T type;
+  typedef T half;
+  typedef typename numext::get_integer_by_size<sizeof(T)>::signed_type integer_packet;
+  enum {
+    size = 1,
+    alignment = alignof(T),
+    vectorizable = false,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <typename T>
+struct unpacket_traits<const T> : unpacket_traits<T> {};
+
+/** \internal A convenience utility for determining if the type is a scalar.
+ * This is used to enable some generic packet implementations.
+ */
+template <typename Packet>
+struct is_scalar {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  enum { value = internal::is_same<Packet, Scalar>::value };
+};
+
+// automatically and succinctly define combinations of pcast<SrcPacket,TgtPacket> when
+// 1) the packets are the same type, or
+// 2) the packets differ only in sign.
+// In both of these cases, preinterpret (bit_cast) is equivalent to pcast (static_cast)
+template <typename SrcPacket, typename TgtPacket,
+          bool Scalar = is_scalar<SrcPacket>::value && is_scalar<TgtPacket>::value>
+struct is_degenerate_helper : is_same<SrcPacket, TgtPacket> {};
+template <>
+struct is_degenerate_helper<int8_t, uint8_t, true> : std::true_type {};
+template <>
+struct is_degenerate_helper<int16_t, uint16_t, true> : std::true_type {};
+template <>
+struct is_degenerate_helper<int32_t, uint32_t, true> : std::true_type {};
+template <>
+struct is_degenerate_helper<int64_t, uint64_t, true> : std::true_type {};
+
+template <typename SrcPacket, typename TgtPacket>
+struct is_degenerate_helper<SrcPacket, TgtPacket, false> {
+  using SrcScalar = typename unpacket_traits<SrcPacket>::type;
+  static constexpr int SrcSize = unpacket_traits<SrcPacket>::size;
+  using TgtScalar = typename unpacket_traits<TgtPacket>::type;
+  static constexpr int TgtSize = unpacket_traits<TgtPacket>::size;
+  static constexpr bool value = is_degenerate_helper<SrcScalar, TgtScalar, true>::value && (SrcSize == TgtSize);
+};
+
+// is_degenerate<T1,T2>::value == is_degenerate<T2,T1>::value
+template <typename SrcPacket, typename TgtPacket>
+struct is_degenerate {
+  static constexpr bool value =
+      is_degenerate_helper<SrcPacket, TgtPacket>::value || is_degenerate_helper<TgtPacket, SrcPacket>::value;
+};
+
+template <typename Packet>
+struct is_half {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  static constexpr int Size = unpacket_traits<Packet>::size;
+  using DefaultPacket = typename packet_traits<Scalar>::type;
+  static constexpr int DefaultSize = unpacket_traits<DefaultPacket>::size;
+  static constexpr bool value = Size != 1 && Size < DefaultSize;
+};
+
+template <typename Src, typename Tgt>
+struct type_casting_traits {
+  enum {
+    VectorizedCast =
+        is_degenerate<Src, Tgt>::value && packet_traits<Src>::Vectorizable && packet_traits<Tgt>::Vectorizable,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+// provides a succinct template to define vectorized casting traits with respect to the largest accessible packet types
+template <typename Src, typename Tgt>
+struct vectorized_type_casting_traits {
+  enum : int {
+    DefaultSrcPacketSize = packet_traits<Src>::size,
+    DefaultTgtPacketSize = packet_traits<Tgt>::size,
+    VectorizedCast = 1,
+    SrcCoeffRatio = plain_enum_max(DefaultTgtPacketSize / DefaultSrcPacketSize, 1),
+    TgtCoeffRatio = plain_enum_max(DefaultSrcPacketSize / DefaultTgtPacketSize, 1)
+  };
+};
+
+/** \internal Wrapper to ensure that multiple packet types can map to the same
+    same underlying vector type. */
+template <typename T, int unique_id = 0>
+struct eigen_packet_wrapper {
+  EIGEN_ALWAYS_INLINE operator T&() { return m_val; }
+  EIGEN_ALWAYS_INLINE operator const T&() const { return m_val; }
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper() = default;
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper(const T& v) : m_val(v) {}
+  EIGEN_ALWAYS_INLINE eigen_packet_wrapper& operator=(const T& v) {
+    m_val = v;
+    return *this;
+  }
+
+  T m_val;
+};
+
+template <typename Target, typename Packet, bool IsSame = is_same<Target, Packet>::value>
+struct preinterpret_generic;
+
+template <typename Target, typename Packet>
+struct preinterpret_generic<Target, Packet, false> {
+  // the packets are not the same, attempt scalar bit_cast
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Target run(const Packet& a) {
+    return numext::bit_cast<Target, Packet>(a);
+  }
+};
+
+template <typename Packet>
+struct preinterpret_generic<Packet, Packet, true> {
+  // the packets are the same type: do nothing
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& a) { return a; }
+};
+
+template <typename ComplexPacket>
+struct preinterpret_generic<typename unpacket_traits<ComplexPacket>::as_real, ComplexPacket, false> {
+  using RealPacket = typename unpacket_traits<ComplexPacket>::as_real;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RealPacket run(const ComplexPacket& a) { return a.v; }
+};
+
+/** \internal \returns reinterpret_cast<Target>(a) */
+template <typename Target, typename Packet>
+EIGEN_DEVICE_FUNC inline Target preinterpret(const Packet& a) {
+  return preinterpret_generic<Target, Packet>::run(a);
+}
+
+template <typename SrcPacket, typename TgtPacket, bool Degenerate = is_degenerate<SrcPacket, TgtPacket>::value,
+          bool TgtIsHalf = is_half<TgtPacket>::value>
+struct pcast_generic;
+
+template <typename SrcPacket, typename TgtPacket>
+struct pcast_generic<SrcPacket, TgtPacket, false, false> {
+  // the packets are not degenerate: attempt scalar static_cast
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket run(const SrcPacket& a) {
+    return cast_impl<SrcPacket, TgtPacket>::run(a);
+  }
+};
+
+template <typename Packet>
+struct pcast_generic<Packet, Packet, true, false> {
+  // the packets are the same: do nothing
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& a) { return a; }
+};
+
+template <typename SrcPacket, typename TgtPacket, bool TgtIsHalf>
+struct pcast_generic<SrcPacket, TgtPacket, true, TgtIsHalf> {
+  // the packets are degenerate: preinterpret is equivalent to pcast
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket run(const SrcPacket& a) { return preinterpret<TgtPacket>(a); }
+};
+
+/** \internal \returns static_cast<TgtType>(a) (coeff-wise) */
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket pcast(const SrcPacket& a) {
+  return pcast_generic<SrcPacket, TgtPacket>::run(a);
+}
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket pcast(const SrcPacket& a, const SrcPacket& b) {
+  return pcast_generic<SrcPacket, TgtPacket>::run(a, b);
+}
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket pcast(const SrcPacket& a, const SrcPacket& b, const SrcPacket& c,
+                                         const SrcPacket& d) {
+  return pcast_generic<SrcPacket, TgtPacket>::run(a, b, c, d);
+}
+template <typename SrcPacket, typename TgtPacket>
+EIGEN_DEVICE_FUNC inline TgtPacket pcast(const SrcPacket& a, const SrcPacket& b, const SrcPacket& c, const SrcPacket& d,
+                                         const SrcPacket& e, const SrcPacket& f, const SrcPacket& g,
+                                         const SrcPacket& h) {
+  return pcast_generic<SrcPacket, TgtPacket>::run(a, b, c, d, e, f, g, h);
+}
+
+template <typename SrcPacket, typename TgtPacket>
+struct pcast_generic<SrcPacket, TgtPacket, false, true> {
+  // TgtPacket is a half packet of some other type
+  // perform cast and truncate result
+  using DefaultTgtPacket = typename is_half<TgtPacket>::DefaultPacket;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket run(const SrcPacket& a) {
+    return preinterpret<TgtPacket>(pcast<SrcPacket, DefaultTgtPacket>(a));
+  }
+};
+
 /** \internal \returns a + b (coeff-wise) */
-template<typename Packet> inline Packet
-padd(const Packet& a,
-        const Packet& b) { return a+b; }
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet padd(const Packet& a, const Packet& b) {
+  return a + b;
+}
+// Avoid compiler warning for boolean algebra.
+template <>
+EIGEN_DEVICE_FUNC inline bool padd(const bool& a, const bool& b) {
+  return a || b;
+}
+
+/** \internal \returns a packet version of \a *from, (un-aligned masked add)
+ * There is no generic implementation. We only have implementations for specialized
+ * cases. Generic case should not be called.
+ */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline std::enable_if_t<unpacket_traits<Packet>::masked_fpops_available, Packet> padd(
+    const Packet& a, const Packet& b, typename unpacket_traits<Packet>::mask_t umask);
 
 /** \internal \returns a - b (coeff-wise) */
-template<typename Packet> inline Packet
-psub(const Packet& a,
-        const Packet& b) { return a-b; }
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet psub(const Packet& a, const Packet& b) {
+  return a - b;
+}
 
 /** \internal \returns -a (coeff-wise) */
-template<typename Packet> inline Packet
-pnegate(const Packet& a) { return -a; }
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pnegate(const Packet& a) {
+  EIGEN_STATIC_ASSERT((!is_same<typename unpacket_traits<Packet>::type, bool>::value),
+                      NEGATE IS NOT DEFINED FOR BOOLEAN TYPES)
+  return numext::negate(a);
+}
 
 /** \internal \returns conj(a) (coeff-wise) */
-template<typename Packet> inline Packet
-pconj(const Packet& a) { return numext::conj(a); }
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pconj(const Packet& a) {
+  return numext::conj(a);
+}
 
 /** \internal \returns a * b (coeff-wise) */
-template<typename Packet> inline Packet
-pmul(const Packet& a,
-        const Packet& b) { return a*b; }
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pmul(const Packet& a, const Packet& b) {
+  return a * b;
+}
+// Avoid compiler warning for boolean algebra.
+template <>
+EIGEN_DEVICE_FUNC inline bool pmul(const bool& a, const bool& b) {
+  return a && b;
+}
 
 /** \internal \returns a / b (coeff-wise) */
-template<typename Packet> inline Packet
-pdiv(const Packet& a,
-        const Packet& b) { return a/b; }
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pdiv(const Packet& a, const Packet& b) {
+  return a / b;
+}
+// Avoid compiler warning for boolean algebra.
+template <>
+EIGEN_DEVICE_FUNC inline bool pdiv(const bool& a, const bool& b) {
+  return a && b;
+}
 
-/** \internal \returns the min of \a a and \a b  (coeff-wise) */
-template<typename Packet> inline Packet
-pmin(const Packet& a,
-        const Packet& b) { using std::min; return (min)(a, b); }
+// In the generic packet case, memset to all one bits.
+template <typename Packet, typename EnableIf = void>
+struct ptrue_impl {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/) {
+    Packet b;
+    memset(static_cast<void*>(&b), 0xff, sizeof(Packet));
+    return b;
+  }
+};
 
-/** \internal \returns the max of \a a and \a b  (coeff-wise) */
-template<typename Packet> inline Packet
-pmax(const Packet& a,
-        const Packet& b) { using std::max; return (max)(a, b); }
+// Use a value of one for scalars.
+template <typename Scalar>
+struct ptrue_impl<Scalar, std::enable_if_t<is_scalar<Scalar>::value>> {
+  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar&) { return Scalar(1); }
+};
 
-/** \internal \returns the absolute value of \a a */
-template<typename Packet> inline Packet
-pabs(const Packet& a) { using std::abs; return abs(a); }
+// For booleans, we can only directly set a valid `bool` value to avoid UB.
+template <>
+struct ptrue_impl<bool, void> {
+  static EIGEN_DEVICE_FUNC inline bool run(const bool&) { return true; }
+};
+
+/** \internal \returns one bits. */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet ptrue(const Packet& a) {
+  return ptrue_impl<Packet>::run(a);
+}
+
+// In the general packet case, memset to zero.
+template <typename Packet, typename EnableIf = void>
+struct pzero_impl {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& /*a*/) {
+    Packet b;
+    memset(static_cast<void*>(&b), 0x00, sizeof(Packet));
+    return b;
+  }
+};
+
+// For scalars, explicitly set to Scalar(0), since the underlying representation
+// for zero may not consist of all-zero bits.
+template <typename T>
+struct pzero_impl<T, std::enable_if_t<is_scalar<T>::value>> {
+  static EIGEN_DEVICE_FUNC inline T run(const T& /*a*/) { return T(0); }
+};
+
+/** \internal \returns packet of zeros */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pzero(const Packet& a) {
+  return pzero_impl<Packet>::run(a);
+}
+
+template <typename T>
+struct bit_and {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a & b; }
+};
+
+template <typename T>
+struct bit_or {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a | b; }
+};
+
+template <typename T>
+struct bit_xor {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a, const T& b) const { return a ^ b; }
+};
+
+template <typename T>
+struct bit_not {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE T operator()(const T& a) const { return ~a; }
+};
+
+template <>
+struct bit_and<bool> {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const { return a && b; }
+};
+
+template <>
+struct bit_or<bool> {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const { return a || b; }
+};
+
+template <>
+struct bit_xor<bool> {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a, const bool& b) const { return a != b; }
+};
+
+template <>
+struct bit_not<bool> {
+  EIGEN_DEVICE_FUNC constexpr EIGEN_ALWAYS_INLINE bool operator()(const bool& a) const { return !a; }
+};
+
+// Use operators &, |, ^, ~.
+template <typename T>
+struct operator_bitwise_helper {
+  EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) { return bit_and<T>()(a, b); }
+  EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { return bit_or<T>()(a, b); }
+  EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) { return bit_xor<T>()(a, b); }
+  EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { return bit_not<T>()(a); }
+};
+
+// Apply binary operations byte-by-byte
+template <typename T>
+struct bytewise_bitwise_helper {
+  EIGEN_DEVICE_FUNC static inline T bitwise_and(const T& a, const T& b) {
+    return binary(a, b, bit_and<unsigned char>());
+  }
+  EIGEN_DEVICE_FUNC static inline T bitwise_or(const T& a, const T& b) { return binary(a, b, bit_or<unsigned char>()); }
+  EIGEN_DEVICE_FUNC static inline T bitwise_xor(const T& a, const T& b) {
+    return binary(a, b, bit_xor<unsigned char>());
+  }
+  EIGEN_DEVICE_FUNC static inline T bitwise_not(const T& a) { return unary(a, bit_not<unsigned char>()); }
+
+ private:
+  template <typename Op>
+  EIGEN_DEVICE_FUNC static inline T unary(const T& a, Op op) {
+    const unsigned char* a_ptr = reinterpret_cast<const unsigned char*>(&a);
+    T c;
+    unsigned char* c_ptr = reinterpret_cast<unsigned char*>(&c);
+    for (size_t i = 0; i < sizeof(T); ++i) {
+      *c_ptr++ = op(*a_ptr++);
+    }
+    return c;
+  }
+
+  template <typename Op>
+  EIGEN_DEVICE_FUNC static inline T binary(const T& a, const T& b, Op op) {
+    const unsigned char* a_ptr = reinterpret_cast<const unsigned char*>(&a);
+    const unsigned char* b_ptr = reinterpret_cast<const unsigned char*>(&b);
+    T c;
+    unsigned char* c_ptr = reinterpret_cast<unsigned char*>(&c);
+    for (size_t i = 0; i < sizeof(T); ++i) {
+      *c_ptr++ = op(*a_ptr++, *b_ptr++);
+    }
+    return c;
+  }
+};
+
+// In the general case, use byte-by-byte manipulation.
+template <typename T, typename EnableIf = void>
+struct bitwise_helper : public bytewise_bitwise_helper<T> {};
+
+// For integers or non-trivial scalars, use binary operators.
+template <typename T>
+struct bitwise_helper<T, typename std::enable_if_t<is_scalar<T>::value &&
+                                                   (NumTraits<T>::IsInteger || NumTraits<T>::RequireInitialization)>>
+    : public operator_bitwise_helper<T> {};
 
 /** \internal \returns the bitwise and of \a a and \a b */
-template<typename Packet> inline Packet
-pand(const Packet& a, const Packet& b) { return a & b; }
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pand(const Packet& a, const Packet& b) {
+  return bitwise_helper<Packet>::bitwise_and(a, b);
+}
 
 /** \internal \returns the bitwise or of \a a and \a b */
-template<typename Packet> inline Packet
-por(const Packet& a, const Packet& b) { return a | b; }
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet por(const Packet& a, const Packet& b) {
+  return bitwise_helper<Packet>::bitwise_or(a, b);
+}
 
 /** \internal \returns the bitwise xor of \a a and \a b */
-template<typename Packet> inline Packet
-pxor(const Packet& a, const Packet& b) { return a ^ b; }
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pxor(const Packet& a, const Packet& b) {
+  return bitwise_helper<Packet>::bitwise_xor(a, b);
+}
+
+/** \internal \returns the bitwise not of \a a */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pnot(const Packet& a) {
+  return bitwise_helper<Packet>::bitwise_not(a);
+}
+
+/** \internal \returns the bitwise and of \a a and not \a b */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pandnot(const Packet& a, const Packet& b) {
+  return pand(a, pnot(b));
+}
+
+/** \internal \returns a < b as a bit mask */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pcmp_lt(const Packet& a, const Packet& b) {
+  return a < b ? ptrue(a) : pzero(a);
+}
+
+/** \internal \returns a == b as a bit mask */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pcmp_eq(const Packet& a, const Packet& b) {
+  return a == b ? ptrue(a) : pzero(a);
+}
+
+/** \internal \returns a <= b as a bit mask */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pcmp_le(const Packet& a, const Packet& b) {
+  return por(pcmp_eq(a, b), pcmp_lt(a, b));
+}
+
+/** \internal \returns a < b or a==NaN or b==NaN as a bit mask */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pcmp_lt_or_nan(const Packet& a, const Packet& b) {
+  return a >= b ? pzero(a) : ptrue(a);
+}
+
+// In the general case, use bitwise select.
+template <typename Packet, bool is_scalar = is_scalar<Packet>::value>
+struct pselect_impl {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) {
+    return por(pand(a, mask), pandnot(b, mask));
+  }
+};
+
+// For scalars, use ternary select.
+template <typename Packet>
+struct pselect_impl<Packet, true> {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& mask, const Packet& a, const Packet& b) {
+    return numext::select(mask, a, b);
+  }
+};
+
+/** \internal \returns \a or \b for each field in packet according to \mask */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pselect(const Packet& mask, const Packet& a, const Packet& b) {
+  return pselect_impl<Packet>::run(mask, a, b);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline bool pselect<bool>(const bool& cond, const bool& a, const bool& b) {
+  return cond ? a : b;
+}
+
+/** \internal \returns the min or of \a a and \a b (coeff-wise)
+    If either \a a or \a b are NaN, the result is implementation defined. */
+template <int NaNPropagation, bool IsInteger>
+struct pminmax_impl {
+  template <typename Packet, typename Op>
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
+    return op(a, b);
+  }
+};
 
-/** \internal \returns the bitwise andnot of \a a and \a b */
-template<typename Packet> inline Packet
-pandnot(const Packet& a, const Packet& b) { return a & (!b); }
+/** \internal \returns the min or max of \a a and \a b (coeff-wise)
+    If either \a a or \a b are NaN, NaN is returned. */
+template <>
+struct pminmax_impl<PropagateNaN, false> {
+  template <typename Packet, typename Op>
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
+    Packet not_nan_mask_a = pcmp_eq(a, a);
+    Packet not_nan_mask_b = pcmp_eq(b, b);
+    return pselect(not_nan_mask_a, pselect(not_nan_mask_b, op(a, b), b), a);
+  }
+};
+
+/** \internal \returns the min or max of \a a and \a b (coeff-wise)
+    If both \a a and \a b are NaN, NaN is returned.
+    Equivalent to std::fmin(a, b).  */
+template <>
+struct pminmax_impl<PropagateNumbers, false> {
+  template <typename Packet, typename Op>
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a, const Packet& b, Op op) {
+    Packet not_nan_mask_a = pcmp_eq(a, a);
+    Packet not_nan_mask_b = pcmp_eq(b, b);
+    return pselect(not_nan_mask_a, pselect(not_nan_mask_b, op(a, b), a), b);
+  }
+};
 
-/** \internal \returns a packet version of \a *from, from must be 16 bytes aligned */
-template<typename Packet> inline Packet
-pload(const typename unpacket_traits<Packet>::type* from) { return *from; }
+#define EIGEN_BINARY_OP_NAN_PROPAGATION(Type, Func) [](const Type& aa, const Type& bb) { return Func(aa, bb); }
+
+/** \internal \returns the min of \a a and \a b  (coeff-wise).
+    If \a a or \b b is NaN, the return value is implementation defined. */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pmin(const Packet& a, const Packet& b) {
+  return numext::mini(a, b);
+}
+
+/** \internal \returns the min of \a a and \a b  (coeff-wise).
+    NaNPropagation determines the NaN propagation semantics. */
+template <int NaNPropagation, typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pmin(const Packet& a, const Packet& b) {
+  constexpr bool IsInteger = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger;
+  return pminmax_impl<NaNPropagation, IsInteger>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmin<Packet>)));
+}
+
+/** \internal \returns the max of \a a and \a b  (coeff-wise)
+    If \a a or \b b is NaN, the return value is implementation defined. */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pmax(const Packet& a, const Packet& b) {
+  return numext::maxi(a, b);
+}
+
+/** \internal \returns the max of \a a and \a b  (coeff-wise).
+    NaNPropagation determines the NaN propagation semantics. */
+template <int NaNPropagation, typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pmax(const Packet& a, const Packet& b) {
+  constexpr bool IsInteger = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger;
+  return pminmax_impl<NaNPropagation, IsInteger>::run(a, b, EIGEN_BINARY_OP_NAN_PROPAGATION(Packet, (pmax<Packet>)));
+}
+
+/** \internal \returns the absolute value of \a a */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pabs(const Packet& a) {
+  return numext::abs(a);
+}
+template <>
+EIGEN_DEVICE_FUNC inline unsigned int pabs(const unsigned int& a) {
+  return a;
+}
+template <>
+EIGEN_DEVICE_FUNC inline unsigned long pabs(const unsigned long& a) {
+  return a;
+}
+template <>
+EIGEN_DEVICE_FUNC inline unsigned long long pabs(const unsigned long long& a) {
+  return a;
+}
+
+/** \internal \returns the addsub value of \a a,b */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet paddsub(const Packet& a, const Packet& b) {
+  return pselect(peven_mask(a), padd(a, b), psub(a, b));
+}
+
+/** \internal \returns the phase angle of \a a */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet parg(const Packet& a) {
+  using numext::arg;
+  return arg(a);
+}
+
+/** \internal \returns \a a arithmetically shifted by N bits to the right */
+template <int N, typename T>
+EIGEN_DEVICE_FUNC inline T parithmetic_shift_right(const T& a) {
+  return numext::arithmetic_shift_right(a, N);
+}
+
+/** \internal \returns \a a logically shifted by N bits to the right */
+template <int N, typename T>
+EIGEN_DEVICE_FUNC inline T plogical_shift_right(const T& a) {
+  return numext::logical_shift_right(a, N);
+}
+
+/** \internal \returns \a a shifted by N bits to the left */
+template <int N, typename T>
+EIGEN_DEVICE_FUNC inline T plogical_shift_left(const T& a) {
+  return numext::logical_shift_left(a, N);
+}
+
+/** \internal \returns the significant and exponent of the underlying floating point numbers
+ * See https://en.cppreference.com/w/cpp/numeric/math/frexp
+ */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pfrexp(const Packet& a, Packet& exponent) {
+  int exp;
+  EIGEN_USING_STD(frexp);
+  Packet result = static_cast<Packet>(frexp(a, &exp));
+  exponent = static_cast<Packet>(exp);
+  return result;
+}
+
+/** \internal \returns a * 2^((int)exponent)
+ * See https://en.cppreference.com/w/cpp/numeric/math/ldexp
+ */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pldexp(const Packet& a, const Packet& exponent) {
+  EIGEN_USING_STD(ldexp)
+  return static_cast<Packet>(ldexp(a, static_cast<int>(exponent)));
+}
+
+/** \internal \returns the min of \a a and \a b  (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pabsdiff(const Packet& a, const Packet& b) {
+  return pselect(pcmp_lt(a, b), psub(b, a), psub(a, b));
+}
+
+/** \internal \returns a packet version of \a *from, from must be properly aligned */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pload(const typename unpacket_traits<Packet>::type* from) {
+  return *from;
+}
+
+/** \internal \returns n elements of a packet version of \a *from, from must be properly aligned
+ * offset indicates the starting element in which to load and
+ * offset + n <= unpacket_traits::size
+ * All elements before offset and after the last element loaded will initialized with zero */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pload_partial(const typename unpacket_traits<Packet>::type* from, const Index n,
+                                              const Index offset = 0) {
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  EIGEN_ALIGN_MAX Scalar elements[packet_size] = {Scalar(0)};
+  for (Index i = offset; i < numext::mini(n + offset, packet_size); i++) {
+    elements[i] = from[i - offset];
+  }
+  return pload<Packet>(elements);
+}
 
 /** \internal \returns a packet version of \a *from, (un-aligned load) */
-template<typename Packet> inline Packet
-ploadu(const typename unpacket_traits<Packet>::type* from) { return *from; }
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet ploadu(const typename unpacket_traits<Packet>::type* from) {
+  return *from;
+}
 
-/** \internal \returns a packet with elements of \a *from duplicated.
-  * For instance, for a packet of 8 elements, 4 scalar will be read from \a *from and
-  * duplicated to form: {from[0],from[0],from[1],from[1],,from[2],from[2],,from[3],from[3]}
-  * Currently, this function is only used for scalar * complex products.
+/** \internal \returns n elements of a packet version of \a *from, (un-aligned load)
+ * All elements after the last element loaded will initialized with zero */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet ploadu_partial(const typename unpacket_traits<Packet>::type* from, const Index n,
+                                               const Index offset = 0) {
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  EIGEN_ALIGN_MAX Scalar elements[packet_size] = {Scalar(0)};
+  for (Index i = offset; i < numext::mini(n + offset, packet_size); i++) {
+    elements[i] = from[i - offset];
+  }
+  return pload<Packet>(elements);
+}
+
+/** \internal \returns a packet version of \a *from, (un-aligned masked load)
+ * There is no generic implementation. We only have implementations for specialized
+ * cases. Generic case should not be called.
  */
-template<typename Packet> inline Packet
-ploaddup(const typename unpacket_traits<Packet>::type* from) { return *from; }
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline std::enable_if_t<unpacket_traits<Packet>::masked_load_available, Packet> ploadu(
+    const typename unpacket_traits<Packet>::type* from, typename unpacket_traits<Packet>::mask_t umask);
 
 /** \internal \returns a packet with constant coefficients \a a, e.g.: (a,a,a,a) */
-template<typename Packet> inline Packet
-pset1(const typename unpacket_traits<Packet>::type& a) { return a; }
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pset1(const typename unpacket_traits<Packet>::type& a) {
+  return a;
+}
+
+/** \internal \returns a packet with constant coefficients set from bits */
+template <typename Packet, typename BitsType>
+EIGEN_DEVICE_FUNC inline Packet pset1frombits(BitsType a);
+
+/** \internal \returns a packet with constant coefficients \a a[0], e.g.: (a[0],a[0],a[0],a[0]) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pload1(const typename unpacket_traits<Packet>::type* a) {
+  return pset1<Packet>(*a);
+}
+
+/** \internal \returns a packet with elements of \a *from duplicated.
+ * For instance, for a packet of 8 elements, 4 scalars will be read from \a *from and
+ * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]}
+ * Currently, this function is only used for scalar * complex products.
+ */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet ploaddup(const typename unpacket_traits<Packet>::type* from) {
+  return *from;
+}
+
+/** \internal \returns a packet with elements of \a *from quadrupled.
+ * For instance, for a packet of 8 elements, 2 scalars will be read from \a *from and
+ * replicated to form: {from[0],from[0],from[0],from[0],from[1],from[1],from[1],from[1]}
+ * Currently, this function is only used in matrix products.
+ * For packet-size smaller or equal to 4, this function is equivalent to pload1
+ */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet ploadquad(const typename unpacket_traits<Packet>::type* from) {
+  return pload1<Packet>(from);
+}
+
+/** \internal equivalent to
+ * \code
+ * a0 = pload1(a+0);
+ * a1 = pload1(a+1);
+ * a2 = pload1(a+2);
+ * a3 = pload1(a+3);
+ * \endcode
+ * \sa pset1, pload1, ploaddup, pbroadcast2
+ */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline void pbroadcast4(const typename unpacket_traits<Packet>::type* a, Packet& a0, Packet& a1,
+                                          Packet& a2, Packet& a3) {
+  a0 = pload1<Packet>(a + 0);
+  a1 = pload1<Packet>(a + 1);
+  a2 = pload1<Packet>(a + 2);
+  a3 = pload1<Packet>(a + 3);
+}
+
+/** \internal equivalent to
+ * \code
+ * a0 = pload1(a+0);
+ * a1 = pload1(a+1);
+ * \endcode
+ * \sa pset1, pload1, ploaddup, pbroadcast4
+ */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline void pbroadcast2(const typename unpacket_traits<Packet>::type* a, Packet& a0, Packet& a1) {
+  a0 = pload1<Packet>(a + 0);
+  a1 = pload1<Packet>(a + 1);
+}
 
 /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */
-template<typename Scalar> inline typename packet_traits<Scalar>::type
-plset(const Scalar& a) { return a; }
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet plset(const typename unpacket_traits<Packet>::type& a) {
+  return a;
+}
+
+template <typename Packet, typename EnableIf = void>
+struct peven_mask_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet&) {
+    typedef typename unpacket_traits<Packet>::type Scalar;
+    const size_t n = unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n];
+    for (size_t i = 0; i < n; ++i) {
+      memset(elements + i, ((i & 1) == 0 ? 0xff : 0), sizeof(Scalar));
+    }
+    return ploadu<Packet>(elements);
+  }
+};
+
+template <typename Scalar>
+struct peven_mask_impl<Scalar, std::enable_if_t<is_scalar<Scalar>::value>> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run(const Scalar&) { return Scalar(1); }
+};
+
+/** \internal \returns a packet with constant coefficients \a a, e.g.: (x, 0, x, 0),
+     where x is the value of all 1-bits. */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet peven_mask(const Packet& a) {
+  return peven_mask_impl<Packet>::run(a);
+}
 
-/** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */
-template<typename Scalar, typename Packet> inline void pstore(Scalar* to, const Packet& from)
-{ (*to) = from; }
+/** \internal copy the packet \a from to \a *to, \a to must be properly aligned */
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline void pstore(Scalar* to, const Packet& from) {
+  (*to) = from;
+}
+
+/** \internal copy n elements of the packet \a from to \a *to, \a to must be properly aligned
+ * offset indicates the starting element in which to store and
+ * offset + n <= unpacket_traits::size */
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline void pstore_partial(Scalar* to, const Packet& from, const Index n, const Index offset = 0) {
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
+  EIGEN_ALIGN_MAX Scalar elements[packet_size];
+  pstore<Scalar>(elements, from);
+  for (Index i = 0; i < numext::mini(n, packet_size - offset); i++) {
+    to[i] = elements[i + offset];
+  }
+}
 
 /** \internal copy the packet \a from to \a *to, (un-aligned store) */
-template<typename Scalar, typename Packet> inline void pstoreu(Scalar* to, const Packet& from)
-{ (*to) = from; }
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline void pstoreu(Scalar* to, const Packet& from) {
+  (*to) = from;
+}
 
-/** \internal tries to do cache prefetching of \a addr */
-template<typename Scalar> inline void prefetch(const Scalar* addr)
-{
-#if !defined(_MSC_VER)
-__builtin_prefetch(addr);
-#endif
+/** \internal copy n elements of the packet \a from to \a *to, (un-aligned store) */
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline void pstoreu_partial(Scalar* to, const Packet& from, const Index n, const Index offset = 0) {
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
+  EIGEN_ALIGN_MAX Scalar elements[packet_size];
+  pstore<Scalar>(elements, from);
+  for (Index i = 0; i < numext::mini(n, packet_size - offset); i++) {
+    to[i] = elements[i + offset];
+  }
 }
 
-/** \internal \returns the first element of a packet */
-template<typename Packet> inline typename unpacket_traits<Packet>::type pfirst(const Packet& a)
-{ return a; }
+/** \internal copy the packet \a from to \a *to, (un-aligned store with a mask)
+ * There is no generic implementation. We only have implementations for specialized
+ * cases. Generic case should not be called.
+ */
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline std::enable_if_t<unpacket_traits<Packet>::masked_store_available, void> pstoreu(
+    Scalar* to, const Packet& from, typename unpacket_traits<Packet>::mask_t umask);
 
-/** \internal \returns a packet where the element i contains the sum of the packet of \a vec[i] */
-template<typename Packet> inline Packet
-preduxp(const Packet* vecs) { return vecs[0]; }
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pgather(const Scalar* from, Index /*stride*/) {
+  return ploadu<Packet>(from);
+}
 
-/** \internal \returns the sum of the elements of \a a*/
-template<typename Packet> inline typename unpacket_traits<Packet>::type predux(const Packet& a)
-{ return a; }
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pgather_partial(const Scalar* from, Index stride, const Index n) {
+  const Index packet_size = unpacket_traits<Packet>::size;
+  EIGEN_ALIGN_MAX Scalar elements[packet_size] = {Scalar(0)};
+  for (Index i = 0; i < numext::mini(n, packet_size); i++) {
+    elements[i] = from[i * stride];
+  }
+  return pload<Packet>(elements);
+}
 
-/** \internal \returns the product of the elements of \a a*/
-template<typename Packet> inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a)
-{ return a; }
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index /*stride*/) {
+  pstore(to, from);
+}
 
-/** \internal \returns the min of the elements of \a a*/
-template<typename Packet> inline typename unpacket_traits<Packet>::type predux_min(const Packet& a)
-{ return a; }
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline void pscatter_partial(Scalar* to, const Packet& from, Index stride, const Index n) {
+  const Index packet_size = unpacket_traits<Packet>::size;
+  EIGEN_ALIGN_MAX Scalar elements[packet_size];
+  pstore<Scalar>(elements, from);
+  for (Index i = 0; i < numext::mini(n, packet_size); i++) {
+    to[i * stride] = elements[i];
+  }
+}
 
-/** \internal \returns the max of the elements of \a a*/
-template<typename Packet> inline typename unpacket_traits<Packet>::type predux_max(const Packet& a)
-{ return a; }
+/** \internal tries to do cache prefetching of \a addr */
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline void prefetch(const Scalar* addr) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+  // do nothing
+#elif defined(EIGEN_CUDA_ARCH)
+#if defined(__LP64__) || EIGEN_OS_WIN64
+  // 64-bit pointer operand constraint for inlined asm
+  asm(" prefetch.L1 [ %1 ];" : "=l"(addr) : "l"(addr));
+#else
+  // 32-bit pointer operand constraint for inlined asm
+  asm(" prefetch.L1 [ %1 ];" : "=r"(addr) : "r"(addr));
+#endif
+#elif (!EIGEN_COMP_MSVC) && (EIGEN_COMP_GNUC || EIGEN_COMP_CLANG || EIGEN_COMP_ICC)
+  __builtin_prefetch(addr);
+#endif
+}
 
 /** \internal \returns the reversed elements of \a a*/
-template<typename Packet> inline Packet preverse(const Packet& a)
-{ return a; }
-
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& a) {
+  return a;
+}
 
 /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */
-template<typename Packet> inline Packet pcplxflip(const Packet& a)
-{
-  // FIXME: uncomment the following in case we drop the internal imag and real functions.
-//   using std::imag;
-//   using std::real;
-  return Packet(imag(a),real(a));
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a) {
+  return Packet(numext::imag(a), numext::real(a));
 }
 
 /**************************
-* Special math functions
-***************************/
+ * Special math functions
+ ***************************/
+
+/** \internal \returns isnan(a) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pisnan(const Packet& a) {
+  return pandnot(ptrue(a), pcmp_eq(a, a));
+}
+
+/** \internal \returns isinf(a) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pisinf(const Packet& a) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  constexpr Scalar inf = NumTraits<Scalar>::infinity();
+  return pcmp_eq(pabs(a), pset1<Packet>(inf));
+}
 
 /** \internal \returns the sine of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psin(const Packet& a) { using std::sin; return sin(a); }
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin(const Packet& a) {
+  EIGEN_USING_STD(sin);
+  return sin(a);
+}
 
 /** \internal \returns the cosine of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pcos(const Packet& a) { using std::cos; return cos(a); }
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos(const Packet& a) {
+  EIGEN_USING_STD(cos);
+  return cos(a);
+}
 
 /** \internal \returns the tan of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet ptan(const Packet& a) { using std::tan; return tan(a); }
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptan(const Packet& a) {
+  EIGEN_USING_STD(tan);
+  return tan(a);
+}
 
 /** \internal \returns the arc sine of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pasin(const Packet& a) { using std::asin; return asin(a); }
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin(const Packet& a) {
+  EIGEN_USING_STD(asin);
+  return asin(a);
+}
 
 /** \internal \returns the arc cosine of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pacos(const Packet& a) { using std::acos; return acos(a); }
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos(const Packet& a) {
+  EIGEN_USING_STD(acos);
+  return acos(a);
+}
+
+/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psinh(const Packet& a) {
+  EIGEN_USING_STD(sinh);
+  return sinh(a);
+}
+
+/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcosh(const Packet& a) {
+  EIGEN_USING_STD(cosh);
+  return cosh(a);
+}
+
+/** \internal \returns the arc tangent of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan(const Packet& a) {
+  EIGEN_USING_STD(atan);
+  return atan(a);
+}
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptanh(const Packet& a) {
+  EIGEN_USING_STD(tanh);
+  return tanh(a);
+}
+
+/** \internal \returns the arc tangent of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh(const Packet& a) {
+  EIGEN_USING_STD(atanh);
+  return atanh(a);
+}
 
 /** \internal \returns the exp of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet pexp(const Packet& a) { using std::exp; return exp(a); }
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp(const Packet& a) {
+  return numext::exp(a);
+}
+
+/** \internal \returns the exp2 of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp2(const Packet& a) {
+  return numext::exp2(a);
+}
+
+/** \internal \returns the expm1 of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexpm1(const Packet& a) {
+  return numext::expm1(a);
+}
 
 /** \internal \returns the log of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet plog(const Packet& a) { using std::log; return log(a); }
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog(const Packet& a) {
+  EIGEN_USING_STD(log);
+  return log(a);
+}
+
+/** \internal \returns the log1p of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog1p(const Packet& a) {
+  return numext::log1p(a);
+}
+
+/** \internal \returns the log10 of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog10(const Packet& a) {
+  EIGEN_USING_STD(log10);
+  return log10(a);
+}
+
+/** \internal \returns the log2 of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2(const Packet& a) {
+  using Scalar = typename internal::unpacket_traits<Packet>::type;
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  return pmul(pset1<Packet>(Scalar(RealScalar(EIGEN_LOG2E))), plog(a));
+}
 
 /** \internal \returns the square-root of \a a (coeff-wise) */
-template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-Packet psqrt(const Packet& a) { using std::sqrt; return sqrt(a); }
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt(const Packet& a) {
+  return numext::sqrt(a);
+}
 
-/***************************************************************************
-* The following functions might not have to be overwritten for vectorized types
-***************************************************************************/
-
-/** \internal copy a packet with constant coeficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned */
-// NOTE: this function must really be templated on the packet type (think about different packet types for the same scalar type)
-template<typename Packet>
-inline void pstore1(typename unpacket_traits<Packet>::type* to, const typename unpacket_traits<Packet>::type& a)
-{
-  pstore(to, pset1<Packet>(a));
+/** \internal \returns the cube-root of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt(const Packet& a) {
+  return numext::cbrt(a);
+}
+
+template <typename Packet, bool IsScalar = is_scalar<Packet>::value,
+          bool IsInteger = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>
+struct nearest_integer_packetop_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_floor(const Packet& x) { return numext::floor(x); }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_ceil(const Packet& x) { return numext::ceil(x); }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_rint(const Packet& x) { return numext::rint(x); }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_round(const Packet& x) { return numext::round(x); }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_trunc(const Packet& x) { return numext::trunc(x); }
+};
+
+/** \internal \returns the rounded value of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pround(const Packet& a) {
+  return nearest_integer_packetop_impl<Packet>::run_round(a);
+}
+
+/** \internal \returns the floor of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pfloor(const Packet& a) {
+  return nearest_integer_packetop_impl<Packet>::run_floor(a);
+}
+
+/** \internal \returns the rounded value of \a a (coeff-wise) with current
+ * rounding mode */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet print(const Packet& a) {
+  return nearest_integer_packetop_impl<Packet>::run_rint(a);
+}
+
+/** \internal \returns the ceil of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pceil(const Packet& a) {
+  return nearest_integer_packetop_impl<Packet>::run_ceil(a);
+}
+
+/** \internal \returns the truncation of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet ptrunc(const Packet& a) {
+  return nearest_integer_packetop_impl<Packet>::run_trunc(a);
+}
+
+template <typename Packet, typename EnableIf = void>
+struct psign_impl {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) { return numext::sign(a); }
+};
+
+/** \internal \returns the sign of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet psign(const Packet& a) {
+  return psign_impl<Packet>::run(a);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline bool psign(const bool& a) {
+  return a;
+}
+
+/** \internal \returns the first element of a packet */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type pfirst(const Packet& a) {
+  return a;
+}
+
+/** \internal \returns the sum of the elements of upper and lower half of \a a if \a a is larger than 4.
+ * For a packet {a0, a1, a2, a3, a4, a5, a6, a7}, it returns a half packet {a0+a4, a1+a5, a2+a6, a3+a7}
+ * For packet-size smaller or equal to 4, this boils down to a noop.
+ */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline std::conditional_t<(unpacket_traits<Packet>::size % 8) == 0,
+                                            typename unpacket_traits<Packet>::half, Packet>
+predux_half_dowto4(const Packet& a) {
+  return a;
+}
+
+// Slow generic implementation of Packet reduction.
+template <typename Packet, typename Op>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_helper(const Packet& a, Op op) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  const size_t n = unpacket_traits<Packet>::size;
+  EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Scalar elements[n];
+  pstoreu<Scalar>(elements, a);
+  for (size_t k = n / 2; k > 0; k /= 2) {
+    for (size_t i = 0; i < k; ++i) {
+      elements[i] = op(elements[i], elements[i + k]);
+    }
+  }
+  return elements[0];
+}
+
+/** \internal \returns the sum of the elements of \a a*/
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux(const Packet& a) {
+  return a;
 }
 
+/** \internal \returns the product of the elements of \a a */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_mul(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmul<Scalar>)));
+}
+
+/** \internal \returns the min of the elements of \a a */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<Scalar>)));
+}
+
+/** \internal \returns the max of the elements of \a a */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<Scalar>)));
+}
+
+template <int NaNPropagation, typename Packet>
+struct predux_min_max_helper_impl {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  static constexpr bool UsePredux_ = NaNPropagation == PropagateFast || NumTraits<Scalar>::IsInteger;
+  template <bool UsePredux = UsePredux_, std::enable_if_t<!UsePredux, bool> = true>
+  static EIGEN_DEVICE_FUNC inline Scalar run_min(const Packet& a) {
+    return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmin<NaNPropagation, Scalar>)));
+  }
+  template <bool UsePredux = UsePredux_, std::enable_if_t<!UsePredux, bool> = true>
+  static EIGEN_DEVICE_FUNC inline Scalar run_max(const Packet& a) {
+    return predux_helper(a, EIGEN_BINARY_OP_NAN_PROPAGATION(Scalar, (pmax<NaNPropagation, Scalar>)));
+  }
+  template <bool UsePredux = UsePredux_, std::enable_if_t<UsePredux, bool> = true>
+  static EIGEN_DEVICE_FUNC inline Scalar run_min(const Packet& a) {
+    return predux_min(a);
+  }
+  template <bool UsePredux = UsePredux_, std::enable_if_t<UsePredux, bool> = true>
+  static EIGEN_DEVICE_FUNC inline Scalar run_max(const Packet& a) {
+    return predux_max(a);
+  }
+};
+
+template <int NaNPropagation, typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_min(const Packet& a) {
+  return predux_min_max_helper_impl<NaNPropagation, Packet>::run_min(a);
+}
+
+template <int NaNPropagation, typename Packet>
+EIGEN_DEVICE_FUNC inline typename unpacket_traits<Packet>::type predux_max(const Packet& a) {
+  return predux_min_max_helper_impl<NaNPropagation, Packet>::run_max(a);
+}
+
+#undef EIGEN_BINARY_OP_NAN_PROPAGATION
+
+/** \internal \returns true if all coeffs of \a a means "true"
+ * It is supposed to be called on values returned by pcmp_*.
+ */
+// not needed yet
+// template<typename Packet> EIGEN_DEVICE_FUNC inline bool predux_all(const Packet& a)
+// { return bool(a); }
+
+/** \internal \returns true if any coeffs of \a a means "true"
+ * It is supposed to be called on values returned by pcmp_*.
+ */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline bool predux_any(const Packet& a) {
+  // Dirty but generic implementation where "true" is assumed to be non 0 and all the sames.
+  // It is expected that "true" is either:
+  //  - Scalar(1)
+  //  - bits full of ones (NaN for floats),
+  //  - or first bit equals to 1 (1 for ints, smallest denormal for floats).
+  // For all these cases, taking the sum is just fine, and this boils down to a no-op for scalars.
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  return numext::not_equal_strict(predux(a), Scalar(0));
+}
+
+/***************************************************************************
+ * The following functions might not have to be overwritten for vectorized types
+ ***************************************************************************/
+
+template <typename Packet, typename EnableIf = void>
+struct pmadd_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pmadd(const Packet& a, const Packet& b, const Packet& c) {
+    return padd(pmul(a, b), c);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pmsub(const Packet& a, const Packet& b, const Packet& c) {
+    return psub(pmul(a, b), c);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pnmadd(const Packet& a, const Packet& b, const Packet& c) {
+    return psub(c, pmul(a, b));
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pnmsub(const Packet& a, const Packet& b, const Packet& c) {
+    return pnegate(pmadd(a, b, c));
+  }
+};
+
+template <typename Scalar>
+struct pmadd_impl<Scalar, std::enable_if_t<is_scalar<Scalar>::value && NumTraits<Scalar>::IsSigned>> {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pmadd(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return numext::madd<Scalar>(a, b, c);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pmsub(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return numext::madd<Scalar>(a, b, Scalar(-c));
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pnmadd(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return numext::madd<Scalar>(Scalar(-a), b, c);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar pnmsub(const Scalar& a, const Scalar& b, const Scalar& c) {
+    return -Scalar(numext::madd<Scalar>(a, b, c));
+  }
+};
+
+// Multiply-add instructions.
 /** \internal \returns a * b + c (coeff-wise) */
-template<typename Packet> inline Packet
-pmadd(const Packet&  a,
-         const Packet&  b,
-         const Packet&  c)
-{ return padd(pmul(a, b),c); }
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pmadd(const Packet& a, const Packet& b, const Packet& c) {
+  return pmadd_impl<Packet>::pmadd(a, b, c);
+}
+
+/** \internal \returns a * b - c (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pmsub(const Packet& a, const Packet& b, const Packet& c) {
+  return pmadd_impl<Packet>::pmsub(a, b, c);
+}
+
+/** \internal \returns -(a * b) + c (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pnmadd(const Packet& a, const Packet& b, const Packet& c) {
+  return pmadd_impl<Packet>::pnmadd(a, b, c);
+}
+
+/** \internal \returns -((a * b + c) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pnmsub(const Packet& a, const Packet& b, const Packet& c) {
+  return pmadd_impl<Packet>::pnmsub(a, b, c);
+}
+
+/** \internal copy a packet with constant coefficient \a a (e.g., [a,a,a,a]) to \a *to. \a to must be 16 bytes aligned
+ */
+// NOTE: this function must really be templated on the packet type (think about different packet types for the same
+// scalar type)
+template <typename Packet>
+inline void pstore1(typename unpacket_traits<Packet>::type* to, const typename unpacket_traits<Packet>::type& a) {
+  pstore(to, pset1<Packet>(a));
+}
 
 /** \internal \returns a packet version of \a *from.
-  * If LoadMode equals #Aligned, \a from must be 16 bytes aligned */
-template<typename Packet, int LoadMode>
-inline Packet ploadt(const typename unpacket_traits<Packet>::type* from)
-{
-  if(LoadMode == Aligned)
+ * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+template <typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt(const typename unpacket_traits<Packet>::type* from) {
+  if (Alignment >= unpacket_traits<Packet>::alignment)
     return pload<Packet>(from);
   else
     return ploadu<Packet>(from);
 }
 
+/** \internal \returns n elements of a packet version of \a *from.
+ * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+template <typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_partial(const typename unpacket_traits<Packet>::type* from,
+                                                            const Index n, const Index offset = 0) {
+  if (Alignment >= unpacket_traits<Packet>::alignment)
+    return pload_partial<Packet>(from, n, offset);
+  else
+    return ploadu_partial<Packet>(from, n, offset);
+}
+
 /** \internal copy the packet \a from to \a *to.
-  * If StoreMode equals #Aligned, \a to must be 16 bytes aligned */
-template<typename Scalar, typename Packet, int LoadMode>
-inline void pstoret(Scalar* to, const Packet& from)
-{
-  if(LoadMode == Aligned)
+ * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+template <typename Scalar, typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& from) {
+  if (Alignment >= unpacket_traits<Packet>::alignment)
     pstore(to, from);
   else
     pstoreu(to, from);
 }
 
-/** \internal default implementation of palign() allowing partial specialization */
-template<int Offset,typename PacketType>
-struct palign_impl
-{
-  // by default data are aligned, so there is nothing to be done :)
-  static inline void run(PacketType&, const PacketType&) {}
-};
-
-/** \internal update \a first using the concatenation of the packet_size minus \a Offset last elements
-  * of \a first and \a Offset first elements of \a second.
-  * 
-  * This function is currently only used to optimize matrix-vector products on unligned matrices.
-  * It takes 2 packets that represent a contiguous memory array, and returns a packet starting
-  * at the position \a Offset. For instance, for packets of 4 elements, we have:
-  *  Input:
-  *  - first = {f0,f1,f2,f3}
-  *  - second = {s0,s1,s2,s3}
-  * Output: 
-  *   - if Offset==0 then {f0,f1,f2,f3}
-  *   - if Offset==1 then {f1,f2,f3,s0}
-  *   - if Offset==2 then {f2,f3,s0,s1}
-  *   - if Offset==3 then {f3,s0,s1,s3}
-  */
-template<int Offset,typename PacketType>
-inline void palign(PacketType& first, const PacketType& second)
-{
-  palign_impl<Offset,PacketType>::run(first,second);
+/** \internal copy n elements of the packet \a from to \a *to.
+ * The pointer \a from must be aligned on a \a Alignment bytes boundary. */
+template <typename Scalar, typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret_partial(Scalar* to, const Packet& from, const Index n,
+                                                           const Index offset = 0) {
+  if (Alignment >= unpacket_traits<Packet>::alignment)
+    pstore_partial(to, from, n, offset);
+  else
+    pstoreu_partial(to, from, n, offset);
+}
+
+/** \internal \returns a packet version of \a *from.
+ * Unlike ploadt, ploadt_ro takes advantage of the read-only memory path on the
+ * hardware if available to speedup the loading of data that won't be modified
+ * by the current computation.
+ */
+template <typename Packet, int LoadMode>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits<Packet>::type* from) {
+  return ploadt<Packet, LoadMode>(from);
 }
 
 /***************************************************************************
-* Fast complex products (GCC generates a function call which is very slow)
-***************************************************************************/
+ * Fast complex products (GCC generates a function call which is very slow)
+ ***************************************************************************/
+
+// Eigen+CUDA does not support complexes.
+#if !defined(EIGEN_GPUCC)
 
-template<> inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b)
-{ return std::complex<float>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
+template <>
+inline std::complex<float> pmul(const std::complex<float>& a, const std::complex<float>& b) {
+  return std::complex<float>(a.real() * b.real() - a.imag() * b.imag(), a.imag() * b.real() + a.real() * b.imag());
+}
+
+template <>
+inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b) {
+  return std::complex<double>(a.real() * b.real() - a.imag() * b.imag(), a.imag() * b.real() + a.real() * b.imag());
+}
 
-template<> inline std::complex<double> pmul(const std::complex<double>& a, const std::complex<double>& b)
-{ return std::complex<double>(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); }
+#endif
+
+/***************************************************************************
+ * PacketBlock, that is a collection of N packets where the number of words
+ * in the packet is a multiple of N.
+ ***************************************************************************/
+template <typename Packet, int N = unpacket_traits<Packet>::size>
+struct PacketBlock {
+  Packet packet[N];
+};
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet, 1>& /*kernel*/) {
+  // Nothing to do in the scalar case, i.e. a 1x1 matrix.
+}
+
+/***************************************************************************
+ * Selector, i.e. vector of N boolean values used to select (i.e. blend)
+ * words from 2 packets.
+ ***************************************************************************/
+template <size_t N>
+struct Selector {
+  bool select[N];
+};
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pblend(const Selector<unpacket_traits<Packet>::size>& ifPacket,
+                                       const Packet& thenPacket, const Packet& elsePacket) {
+  return ifPacket.select[0] ? thenPacket : elsePacket;
+}
+
+/** \internal \returns 1 / a (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet preciprocal(const Packet& a) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  return pdiv(pset1<Packet>(Scalar(1)), a);
+}
+
+/** \internal \returns the reciprocal square-root of \a a (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet prsqrt(const Packet& a) {
+  return preciprocal<Packet>(psqrt(a));
+}
+
+template <typename Packet, bool IsScalar = is_scalar<Packet>::value,
+          bool IsInteger = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>
+struct psignbit_impl;
+template <typename Packet, bool IsInteger>
+struct psignbit_impl<Packet, true, IsInteger> {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static constexpr Packet run(const Packet& a) { return numext::signbit(a); }
+};
+template <typename Packet>
+struct psignbit_impl<Packet, false, false> {
+  // generic implementation if not specialized in PacketMath.h
+  // slower than arithmetic shift
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static Packet run(const Packet& a) {
+    const Packet cst_pos_one = pset1<Packet>(Scalar(1));
+    const Packet cst_neg_one = pset1<Packet>(Scalar(-1));
+    return pcmp_eq(por(pand(a, cst_neg_one), cst_pos_one), cst_neg_one);
+  }
+};
+template <typename Packet>
+struct psignbit_impl<Packet, false, true> {
+  // generic implementation for integer packets
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static constexpr Packet run(const Packet& a) { return pcmp_lt(a, pzero(a)); }
+};
+/** \internal \returns the sign bit of \a a as a bitmask*/
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr Packet psignbit(const Packet& a) {
+  return psignbit_impl<Packet>::run(a);
+}
+
+/** \internal \returns the 2-argument arc tangent of \a y and \a x (coeff-wise) */
+template <typename Packet, std::enable_if_t<is_scalar<Packet>::value, int> = 0>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet patan2(const Packet& y, const Packet& x) {
+  return numext::atan2(y, x);
+}
+
+/** \internal \returns the 2-argument arc tangent of \a y and \a x (coeff-wise) */
+template <typename Packet, std::enable_if_t<!is_scalar<Packet>::value, int> = 0>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet patan2(const Packet& y, const Packet& x) {
+  typedef typename internal::unpacket_traits<Packet>::type Scalar;
+
+  // See https://en.cppreference.com/w/cpp/numeric/math/atan2
+  // for how corner cases are supposed to be handled according to the
+  // IEEE floating-point standard (IEC 60559).
+  const Packet kSignMask = pset1<Packet>(-Scalar(0));
+  const Packet kZero = pzero(x);
+  const Packet kOne = pset1<Packet>(Scalar(1));
+  const Packet kPi = pset1<Packet>(Scalar(EIGEN_PI));
+
+  const Packet x_has_signbit = psignbit(x);
+  const Packet y_signmask = pand(y, kSignMask);
+  const Packet x_signmask = pand(x, kSignMask);
+  const Packet result_signmask = pxor(y_signmask, x_signmask);
+  const Packet shift = por(pand(x_has_signbit, kPi), y_signmask);
+
+  const Packet x_and_y_are_same = pcmp_eq(pabs(x), pabs(y));
+  const Packet x_and_y_are_zero = pcmp_eq(por(x, y), kZero);
+
+  Packet arg = pdiv(y, x);
+  arg = pselect(x_and_y_are_same, por(kOne, result_signmask), arg);
+  arg = pselect(x_and_y_are_zero, result_signmask, arg);
+
+  Packet result = patan(arg);
+  result = padd(result, shift);
+  return result;
+}
+
+/** \internal \returns the argument of \a a as a complex number */
+template <typename Packet, std::enable_if_t<is_scalar<Packet>::value, int> = 0>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pcarg(const Packet& a) {
+  return Packet(numext::arg(a));
+}
+
+/** \internal \returns the argument of \a a as a complex number */
+template <typename Packet, std::enable_if_t<!is_scalar<Packet>::value, int> = 0>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pcarg(const Packet& a) {
+  EIGEN_STATIC_ASSERT(NumTraits<typename unpacket_traits<Packet>::type>::IsComplex,
+                      THIS METHOD IS FOR COMPLEX TYPES ONLY)
+  using RealPacket = typename unpacket_traits<Packet>::as_real;
+  // a                                              // r     i    r     i    ...
+  RealPacket aflip = pcplxflip(a).v;                // i     r    i     r    ...
+  RealPacket result = patan2(aflip, a.v);           // atan2 crap atan2 crap ...
+  return (Packet)pand(result, peven_mask(result));  // atan2 0    atan2 0    ...
+}
+
+/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements
+ * outside this range are not defined. \a *from does not need to be aligned, and can be null if \a count is zero.*/
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet ploaduSegment(const typename unpacket_traits<Packet>::type* from, Index begin,
+                                              Index count) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  constexpr Index PacketSize = unpacket_traits<Packet>::size;
+  eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range");
+  Scalar aux[PacketSize] = {};
+  for (Index k = begin; k < begin + count; k++) {
+    aux[k] = from[k];
+  }
+  return ploadu<Packet>(aux);
+}
+
+/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements
+ * outside this range are not defined. \a *from must be aligned, and cannot be null.*/
+template <typename Packet>
+EIGEN_DEVICE_FUNC inline Packet ploadSegment(const typename unpacket_traits<Packet>::type* from, Index begin,
+                                             Index count) {
+  return ploaduSegment<Packet>(from, begin, count);
+}
+
+/** \internal copy the packet \a from in the range [begin, begin + count) to \a *to.
+Elements outside of the range [begin, begin + count) are not defined. \a *to does not need to be aligned, and can be
+null if \a count is zero.*/
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline void pstoreuSegment(Scalar* to, const Packet& from, Index begin, Index count) {
+  constexpr Index PacketSize = unpacket_traits<Packet>::size;
+  eigen_assert((begin >= 0 && count >= 0 && begin + count <= PacketSize) && "invalid range");
+  Scalar aux[PacketSize];
+  pstoreu<Scalar, Packet>(aux, from);
+  for (Index k = begin; k < begin + count; k++) {
+    to[k] = aux[k];
+  }
+}
+
+/** \internal copy the packet \a from in the range [begin, begin + count) to \a *to.
+Elements outside of the range [begin, begin + count) are not defined. \a *to must be aligned, and cannot be
+null.*/
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline void pstoreSegment(Scalar* to, const Packet& from, Index begin, Index count) {
+  return pstoreuSegment(to, from, begin, count);
+}
+
+/** \internal \returns a packet populated with values in the range [begin, begin + count). Elements
+ * outside this range are not defined.*/
+template <typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC inline Packet ploadtSegment(const typename unpacket_traits<Packet>::type* from, Index begin,
+                                              Index count) {
+  constexpr int RequiredAlignment = unpacket_traits<Packet>::alignment;
+  if (Alignment >= RequiredAlignment) {
+    return ploadSegment<Packet>(from, begin, count);
+  } else {
+    return ploaduSegment<Packet>(from, begin, count);
+  }
+}
+
+/** \internal copy the packet \a from in the range [begin, begin + count) to \a *to.
+Elements outside of the range [begin, begin + count) are not defined.*/
+template <typename Scalar, typename Packet, int Alignment>
+EIGEN_DEVICE_FUNC inline void pstoretSegment(Scalar* to, const Packet& from, Index begin, Index count) {
+  constexpr int RequiredAlignment = unpacket_traits<Packet>::alignment;
+  if (Alignment >= RequiredAlignment) {
+    pstoreSegment<Scalar, Packet>(to, from, begin, count);
+  } else {
+    pstoreuSegment<Scalar, Packet>(to, from, begin, count);
+  }
+}
+
+#ifndef EIGEN_NO_IO
+
+template <typename Packet>
+class StreamablePacket {
+ public:
+  using Scalar = typename unpacket_traits<Packet>::type;
+  StreamablePacket(const Packet& packet) { pstoreu(v_, packet); }
+
+  friend std::ostream& operator<<(std::ostream& os, const StreamablePacket& packet) {
+    os << "{" << packet.v_[0];
+    for (int i = 1; i < unpacket_traits<Packet>::size; ++i) {
+      os << "," << packet.v_[i];
+    }
+    os << "}";
+    return os;
+  }
+
+ private:
+  Scalar v_[unpacket_traits<Packet>::size];
+};
+
+/**
+ * \internal \returns an intermediary that can be used to ostream packets, e.g. for debugging.
+ */
+template <typename Packet>
+StreamablePacket<Packet> postream(const Packet& packet) {
+  return StreamablePacket<Packet>(packet);
+}
 
-} // end namespace internal
+#endif  // EIGEN_NO_IO
 
-} // end namespace Eigen
+}  // end namespace internal
 
-#endif // EIGEN_GENERIC_PACKET_MATH_H
+}  // end namespace Eigen
 
+#endif  // EIGEN_GENERIC_PACKET_MATH_H
diff --git a/inst/include/Eigen/src/Core/GlobalFunctions.h b/inst/include/Eigen/src/Core/GlobalFunctions.h
index 2acf9772..df1098e2 100644
--- a/inst/include/Eigen/src/Core/GlobalFunctions.h
+++ b/inst/include/Eigen/src/Core/GlobalFunctions.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2010-2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,82 +11,220 @@
 #ifndef EIGEN_GLOBAL_FUNCTIONS_H
 #define EIGEN_GLOBAL_FUNCTIONS_H
 
-#define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME,FUNCTOR) \
-  template<typename Derived> \
-  inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> \
-  NAME(const Eigen::ArrayBase<Derived>& x) { \
-    return x.derived(); \
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+
+#define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME, FUNCTOR, DOC_OP, DOC_DETAILS)                                    \
+  /** \returns an expression of the coefficient-wise DOC_OP of \a x                                             \
+                                                                                                              \ \
+    DOC_DETAILS                                                                                                 \
+                                                                                                              \ \
+    \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_##NAME">Math functions</a>, class CwiseUnaryOp   \
+    */                                                                                                          \
+  template <typename Derived>                                                                                   \
+  inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> NAME(     \
+      const Eigen::ArrayBase<Derived>& x);
+
+#else
+
+#define EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(NAME, FUNCTOR, DOC_OP, DOC_DETAILS)                                    \
+  template <typename Derived>                                                                                   \
+  inline const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived>(NAME)(    \
+      const Eigen::ArrayBase<Derived>& x) {                                                                     \
+    return Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived>(x.derived()); \
   }
 
-#define EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(NAME,FUNCTOR) \
-  \
-  template<typename Derived> \
-  struct NAME##_retval<ArrayBase<Derived> > \
-  { \
+#endif  // EIGEN_PARSED_BY_DOXYGEN
+
+#define EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(NAME, FUNCTOR)                                                  \
+                                                                                                               \
+  template <typename Derived>                                                                                  \
+  struct NAME##_retval<ArrayBase<Derived> > {                                                                  \
     typedef const Eigen::CwiseUnaryOp<Eigen::internal::FUNCTOR<typename Derived::Scalar>, const Derived> type; \
-  }; \
-  template<typename Derived> \
-  struct NAME##_impl<ArrayBase<Derived> > \
-  { \
-    static inline typename NAME##_retval<ArrayBase<Derived> >::type run(const Eigen::ArrayBase<Derived>& x) \
-    { \
-      return x.derived(); \
-    } \
+  };                                                                                                           \
+  template <typename Derived>                                                                                  \
+  struct NAME##_impl<ArrayBase<Derived> > {                                                                    \
+    static inline typename NAME##_retval<ArrayBase<Derived> >::type run(const Eigen::ArrayBase<Derived>& x) {  \
+      return typename NAME##_retval<ArrayBase<Derived> >::type(x.derived());                                   \
+    }                                                                                                          \
   };
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-namespace Eigen
-{
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(real,scalar_real_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(imag,scalar_imag_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(conj,scalar_conjugate_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sin,scalar_sin_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cos,scalar_cos_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin,scalar_asin_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos,scalar_acos_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan,scalar_tan_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp,scalar_exp_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log,scalar_log_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs,scalar_abs_op)
-  EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt,scalar_sqrt_op)
-  
-  template<typename Derived>
-  inline const Eigen::CwiseUnaryOp<Eigen::internal::scalar_pow_op<typename Derived::Scalar>, const Derived>
-  pow(const Eigen::ArrayBase<Derived>& x, const typename Derived::Scalar& exponent) {
-    return x.derived().pow(exponent);
-  }
+namespace Eigen {
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(real, scalar_real_op, real part,\sa ArrayBase::real)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(imag, scalar_imag_op, imaginary part,\sa ArrayBase::imag)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(conj, scalar_conjugate_op, complex conjugate,\sa ArrayBase::conjugate)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(inverse, scalar_inverse_op, inverse,\sa ArrayBase::inverse)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sin, scalar_sin_op, sine,\sa ArrayBase::sin)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cos, scalar_cos_op, cosine,\sa ArrayBase::cos)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tan, scalar_tan_op, tangent,\sa ArrayBase::tan)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atan, scalar_atan_op, arc - tangent,\sa ArrayBase::atan)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asin, scalar_asin_op, arc - sine,\sa ArrayBase::asin)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acos, scalar_acos_op, arc - consine,\sa ArrayBase::acos)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sinh, scalar_sinh_op, hyperbolic sine,\sa ArrayBase::sinh)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cosh, scalar_cosh_op, hyperbolic cosine,\sa ArrayBase::cosh)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(tanh, scalar_tanh_op, hyperbolic tangent,\sa ArrayBase::tanh)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(asinh, scalar_asinh_op, inverse hyperbolic sine,\sa ArrayBase::asinh)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(acosh, scalar_acosh_op, inverse hyperbolic cosine,\sa ArrayBase::acosh)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(atanh, scalar_atanh_op, inverse hyperbolic tangent,\sa ArrayBase::atanh)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(logistic, scalar_logistic_op, logistic function,\sa ArrayBase::logistic)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(lgamma, scalar_lgamma_op,
+                                 natural logarithm of the gamma function,\sa ArrayBase::lgamma)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(digamma, scalar_digamma_op, derivative of lgamma,\sa ArrayBase::digamma)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erf, scalar_erf_op, error function,\sa ArrayBase::erf)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(erfc, scalar_erfc_op, complement error function,\sa ArrayBase::erfc)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(ndtri, scalar_ndtri_op, inverse normal distribution function,\sa ArrayBase::ndtri)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp, scalar_exp_op, exponential,\sa ArrayBase::exp)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(exp2, scalar_exp2_op, exponential,\sa ArrayBase::exp2)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(expm1, scalar_expm1_op, exponential of a value minus 1,\sa ArrayBase::expm1)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log, scalar_log_op, natural logarithm,\sa Eigen::log10 DOXCOMMA ArrayBase::log)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log1p, scalar_log1p_op, natural logarithm of 1 plus the value,\sa ArrayBase::log1p)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log10, scalar_log10_op, base 10 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log10)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(log2, scalar_log2_op, base 2 logarithm,\sa Eigen::log DOXCOMMA ArrayBase::log2)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs, scalar_abs_op, absolute value,\sa ArrayBase::abs DOXCOMMA MatrixBase::cwiseAbs)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(abs2, scalar_abs2_op,
+                                 squared absolute value,\sa ArrayBase::abs2 DOXCOMMA MatrixBase::cwiseAbs2)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(arg, scalar_arg_op, complex argument,\sa ArrayBase::arg DOXCOMMA MatrixBase::cwiseArg)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(carg, scalar_carg_op,
+                                 complex argument, \sa ArrayBase::carg DOXCOMMA MatrixBase::cwiseCArg)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sqrt, scalar_sqrt_op, square root,\sa ArrayBase::sqrt DOXCOMMA MatrixBase::cwiseSqrt)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cbrt, scalar_cbrt_op, cube root,\sa ArrayBase::cbrt DOXCOMMA MatrixBase::cwiseCbrt)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rsqrt, scalar_rsqrt_op, reciprocal square root,\sa ArrayBase::rsqrt)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(square, scalar_square_op,
+                                 square(power 2),\sa Eigen::abs2 DOXCOMMA Eigen::pow DOXCOMMA ArrayBase::square)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(cube, scalar_cube_op, cube(power 3),\sa Eigen::pow DOXCOMMA ArrayBase::cube)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(rint, scalar_rint_op,
+                                 nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(round, scalar_round_op,
+                                 nearest integer,\sa Eigen::floor DOXCOMMA Eigen::ceil DOXCOMMA ArrayBase::round)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(
+    floor, scalar_floor_op, nearest integer not greater than the given value,\sa Eigen::ceil DOXCOMMA ArrayBase::floor)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(
+    ceil, scalar_ceil_op, nearest integer not less than the given value,\sa Eigen::floor DOXCOMMA ArrayBase::ceil)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(trunc, scalar_trunc_op,
+                                 nearest integer not greater in magnitude than the given value,\sa Eigen::trunc DOXCOMMA
+                                     ArrayBase::trunc)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(
+    isnan, scalar_isnan_op, not -a - number test,\sa Eigen::isinf DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isnan)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(
+    isinf, scalar_isinf_op, infinite value test,\sa Eigen::isnan DOXCOMMA Eigen::isfinite DOXCOMMA ArrayBase::isinf)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(isfinite, scalar_isfinite_op,
+                                 finite value test,\sa Eigen::isinf DOXCOMMA Eigen::isnan DOXCOMMA ArrayBase::isfinite)
+EIGEN_ARRAY_DECLARE_GLOBAL_UNARY(sign, scalar_sign_op, sign(or 0),\sa ArrayBase::sign)
 
-  template<typename Derived>
-  inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const Derived, const Derived>
-  pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<Derived>& exponents) 
-  {
-    return Eigen::CwiseBinaryOp<Eigen::internal::scalar_binary_pow_op<typename Derived::Scalar, typename Derived::Scalar>, const Derived, const Derived>(
-      x.derived(),
-      exponents.derived()
-    );
-  }
-  
-  /**
-  * \brief Component-wise division of a scalar by array elements.
-  **/
-  template <typename Derived>
-  inline const Eigen::CwiseUnaryOp<Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>, const Derived>
-    operator/(const typename Derived::Scalar& s, const Eigen::ArrayBase<Derived>& a)
-  {
-    return Eigen::CwiseUnaryOp<Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>, const Derived>(
-      a.derived(),
-      Eigen::internal::scalar_inverse_mult_op<typename Derived::Scalar>(s)  
-    );
-  }
+template <typename Derived, typename ScalarExponent>
+using GlobalUnaryPowReturnType = std::enable_if_t<
+    !internal::is_arithmetic<typename NumTraits<Derived>::Real>::value &&
+        internal::is_arithmetic<typename NumTraits<ScalarExponent>::Real>::value,
+    CwiseUnaryOp<internal::scalar_unary_pow_op<typename Derived::Scalar, ScalarExponent>, const Derived> >;
 
-  namespace internal
-  {
-    EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(real,scalar_real_op)
-    EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(imag,scalar_imag_op)
-    EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(abs2,scalar_abs2_op)
-  }
+/** \returns an expression of the coefficient-wise power of \a x to the given constant \a exponent.
+ *
+ * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type of the given
+ * expression (\c Derived::Scalar).
+ *
+ * \sa ArrayBase::pow()
+ *
+ * \relates ArrayBase
+ */
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+template <typename Derived, typename ScalarExponent>
+EIGEN_DEVICE_FUNC inline const GlobalUnaryPowReturnType<Derived, ScalarExponent> pow(const Eigen::ArrayBase<Derived>& x,
+                                                                                     const ScalarExponent& exponent);
+#else
+template <typename Derived, typename ScalarExponent>
+EIGEN_DEVICE_FUNC inline const GlobalUnaryPowReturnType<Derived, ScalarExponent> pow(const Eigen::ArrayBase<Derived>& x,
+                                                                                     const ScalarExponent& exponent) {
+  return GlobalUnaryPowReturnType<Derived, ScalarExponent>(
+      x.derived(), internal::scalar_unary_pow_op<typename Derived::Scalar, ScalarExponent>(exponent));
 }
+#endif
+
+/** \returns an expression of the coefficient-wise power of \a x to the given array of \a exponents.
+ *
+ * This function computes the coefficient-wise power.
+ *
+ * Example: \include Cwise_array_power_array.cpp
+ * Output: \verbinclude Cwise_array_power_array.out
+ *
+ * \sa ArrayBase::pow()
+ *
+ * \relates ArrayBase
+ */
+template <typename Derived, typename ExponentDerived>
+inline const Eigen::CwiseBinaryOp<
+    Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived,
+    const ExponentDerived>
+pow(const Eigen::ArrayBase<Derived>& x, const Eigen::ArrayBase<ExponentDerived>& exponents) {
+  return Eigen::CwiseBinaryOp<
+      Eigen::internal::scalar_pow_op<typename Derived::Scalar, typename ExponentDerived::Scalar>, const Derived,
+      const ExponentDerived>(x.derived(), exponents.derived());
+}
+
+/** \returns an expression of the coefficient-wise power of the scalar \a x to the given array of \a exponents.
+ *
+ * This function computes the coefficient-wise power between a scalar and an array of exponents.
+ *
+ * \tparam Scalar is the scalar type of \a x. It must be compatible with the scalar type of the given array expression
+ * (\c Derived::Scalar).
+ *
+ * Example: \include Cwise_scalar_power_array.cpp
+ * Output: \verbinclude Cwise_scalar_power_array.out
+ *
+ * \sa ArrayBase::pow()
+ *
+ * \relates ArrayBase
+ */
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+template <typename Scalar, typename Derived>
+inline const CwiseBinaryOp<internal::scalar_pow_op<Scalar, Derived::Scalar>, Constant<Scalar>, Derived> pow(
+    const Scalar& x, const Eigen::ArrayBase<Derived>& x);
+#else
+template <typename Scalar, typename Derived>
+EIGEN_DEVICE_FUNC inline const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(
+    typename internal::promote_scalar_arg<typename Derived::Scalar EIGEN_COMMA Scalar EIGEN_COMMA
+                                              EIGEN_SCALAR_BINARY_SUPPORTED(pow, Scalar,
+                                                                            typename Derived::Scalar)>::type,
+    Derived, pow) pow(const Scalar& x, const Eigen::ArrayBase<Derived>& exponents) {
+  typedef
+      typename internal::promote_scalar_arg<typename Derived::Scalar, Scalar,
+                                            EIGEN_SCALAR_BINARY_SUPPORTED(pow, Scalar, typename Derived::Scalar)>::type
+          PromotedScalar;
+  return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedScalar, Derived, pow)(
+      typename internal::plain_constant_type<Derived, PromotedScalar>::type(
+          exponents.derived().rows(), exponents.derived().cols(), internal::scalar_constant_op<PromotedScalar>(x)),
+      exponents.derived());
+}
+#endif
+
+/** \returns an expression of the coefficient-wise atan2(\a x, \a y). \a x and \a y must be of the same type.
+ *
+ * This function computes the coefficient-wise atan2().
+ *
+ * \sa ArrayBase::atan2()
+ *
+ * \relates ArrayBase
+ */
+template <typename LhsDerived, typename RhsDerived>
+inline const std::enable_if_t<
+    std::is_same<typename LhsDerived::Scalar, typename RhsDerived::Scalar>::value,
+    Eigen::CwiseBinaryOp<Eigen::internal::scalar_atan2_op<typename LhsDerived::Scalar, typename RhsDerived::Scalar>,
+                         const LhsDerived, const RhsDerived> >
+atan2(const Eigen::ArrayBase<LhsDerived>& x, const Eigen::ArrayBase<RhsDerived>& exponents) {
+  return Eigen::CwiseBinaryOp<
+      Eigen::internal::scalar_atan2_op<typename LhsDerived::Scalar, typename RhsDerived::Scalar>, const LhsDerived,
+      const RhsDerived>(x.derived(), exponents.derived());
+}
+
+namespace internal {
+EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(real, scalar_real_op)
+EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(imag, scalar_imag_op)
+EIGEN_ARRAY_DECLARE_GLOBAL_EIGEN_UNARY(abs2, scalar_abs2_op)
+}  // namespace internal
+}  // namespace Eigen
 
-// TODO: cleanly disable those functions that are not supported on Array (numext::real_ref, internal::random, internal::isApprox...)
+// TODO: cleanly disable those functions that are not supported on Array (numext::real_ref, internal::random,
+// internal::isApprox...)
 
-#endif // EIGEN_GLOBAL_FUNCTIONS_H
+#endif  // EIGEN_GLOBAL_FUNCTIONS_H
diff --git a/inst/include/Eigen/src/Core/IO.h b/inst/include/Eigen/src/Core/IO.h
index 8d4bc59e..0a1b583d 100644
--- a/inst/include/Eigen/src/Core/IO.h
+++ b/inst/include/Eigen/src/Core/IO.h
@@ -11,55 +11,65 @@
 #ifndef EIGEN_IO_H
 #define EIGEN_IO_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 enum { DontAlignCols = 1 };
-enum { StreamPrecision = -1,
-       FullPrecision = -2 };
+enum { StreamPrecision = -1, FullPrecision = -2 };
 
 namespace internal {
-template<typename Derived>
-std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& fmt);
+template <typename Derived>
+std::ostream& print_matrix(std::ostream& s, const Derived& _m, const IOFormat& fmt);
 }
 
 /** \class IOFormat
-  * \ingroup Core_Module
-  *
-  * \brief Stores a set of parameters controlling the way matrices are printed
-  *
-  * List of available parameters:
-  *  - \b precision number of digits for floating point values, or one of the special constants \c StreamPrecision and \c FullPrecision.
-  *                 The default is the special value \c StreamPrecision which means to use the
-  *                 stream's own precision setting, as set for instance using \c cout.precision(3). The other special value
-  *                 \c FullPrecision means that the number of digits will be computed to match the full precision of each floating-point
-  *                 type.
-  *  - \b flags an OR-ed combination of flags, the default value is 0, the only currently available flag is \c DontAlignCols which
-  *             allows to disable the alignment of columns, resulting in faster code.
-  *  - \b coeffSeparator string printed between two coefficients of the same row
-  *  - \b rowSeparator string printed between two rows
-  *  - \b rowPrefix string printed at the beginning of each row
-  *  - \b rowSuffix string printed at the end of each row
-  *  - \b matPrefix string printed at the beginning of the matrix
-  *  - \b matSuffix string printed at the end of the matrix
-  *
-  * Example: \include IOFormat.cpp
-  * Output: \verbinclude IOFormat.out
-  *
-  * \sa DenseBase::format(), class WithFormat
-  */
-struct IOFormat
-{
-  /** Default contructor, see class IOFormat for the meaning of the parameters */
-  IOFormat(int _precision = StreamPrecision, int _flags = 0,
-    const std::string& _coeffSeparator = " ",
-    const std::string& _rowSeparator = "\n", const std::string& _rowPrefix="", const std::string& _rowSuffix="",
-    const std::string& _matPrefix="", const std::string& _matSuffix="")
-  : matPrefix(_matPrefix), matSuffix(_matSuffix), rowPrefix(_rowPrefix), rowSuffix(_rowSuffix), rowSeparator(_rowSeparator),
-    rowSpacer(""), coeffSeparator(_coeffSeparator), precision(_precision), flags(_flags)
-  {
-    int i = int(matSuffix.length())-1;
-    while (i>=0 && matSuffix[i]!='\n')
-    {
+ * \ingroup Core_Module
+ *
+ * \brief Stores a set of parameters controlling the way matrices are printed
+ *
+ * List of available parameters:
+ *  - \b precision number of digits for floating point values, or one of the special constants \c StreamPrecision and \c
+ * FullPrecision. The default is the special value \c StreamPrecision which means to use the stream's own precision
+ * setting, as set for instance using \c cout.precision(3). The other special value \c FullPrecision means that the
+ * number of digits will be computed to match the full precision of each floating-point type.
+ *  - \b flags an OR-ed combination of flags, the default value is 0, the only currently available flag is \c
+ * DontAlignCols which allows to disable the alignment of columns, resulting in faster code.
+ *  - \b coeffSeparator string printed between two coefficients of the same row
+ *  - \b rowSeparator string printed between two rows
+ *  - \b rowPrefix string printed at the beginning of each row
+ *  - \b rowSuffix string printed at the end of each row
+ *  - \b matPrefix string printed at the beginning of the matrix
+ *  - \b matSuffix string printed at the end of the matrix
+ *  - \b fill character printed to fill the empty space in aligned columns
+ *
+ * Example: \include IOFormat.cpp
+ * Output: \verbinclude IOFormat.out
+ *
+ * \sa DenseBase::format(), class WithFormat
+ */
+struct IOFormat {
+  /** Default constructor, see class IOFormat for the meaning of the parameters */
+  IOFormat(int _precision = StreamPrecision, int _flags = 0, const std::string& _coeffSeparator = " ",
+           const std::string& _rowSeparator = "\n", const std::string& _rowPrefix = "",
+           const std::string& _rowSuffix = "", const std::string& _matPrefix = "", const std::string& _matSuffix = "",
+           const char _fill = ' ')
+      : matPrefix(_matPrefix),
+        matSuffix(_matSuffix),
+        rowPrefix(_rowPrefix),
+        rowSuffix(_rowSuffix),
+        rowSeparator(_rowSeparator),
+        rowSpacer(""),
+        coeffSeparator(_coeffSeparator),
+        fill(_fill),
+        precision(_precision),
+        flags(_flags) {
+    // TODO check if rowPrefix, rowSuffix or rowSeparator contains a newline
+    // don't add rowSpacer if columns are not to be aligned
+    if ((flags & DontAlignCols)) return;
+    int i = int(matPrefix.length()) - 1;
+    while (i >= 0 && matPrefix[i] != '\n') {
       rowSpacer += ' ';
       i--;
     }
@@ -67,184 +77,157 @@ struct IOFormat
   std::string matPrefix, matSuffix;
   std::string rowPrefix, rowSuffix, rowSeparator, rowSpacer;
   std::string coeffSeparator;
+  char fill;
   int precision;
   int flags;
 };
 
 /** \class WithFormat
-  * \ingroup Core_Module
-  *
-  * \brief Pseudo expression providing matrix output with given format
-  *
-  * \param ExpressionType the type of the object on which IO stream operations are performed
-  *
-  * This class represents an expression with stream operators controlled by a given IOFormat.
-  * It is the return type of DenseBase::format()
-  * and most of the time this is the only way it is used.
-  *
-  * See class IOFormat for some examples.
-  *
-  * \sa DenseBase::format(), class IOFormat
-  */
-template<typename ExpressionType>
-class WithFormat
-{
-  public:
-
-    WithFormat(const ExpressionType& matrix, const IOFormat& format)
-      : m_matrix(matrix), m_format(format)
-    {}
-
-    friend std::ostream & operator << (std::ostream & s, const WithFormat& wf)
-    {
-      return internal::print_matrix(s, wf.m_matrix.eval(), wf.m_format);
-    }
+ * \ingroup Core_Module
+ *
+ * \brief Pseudo expression providing matrix output with given format
+ *
+ * \tparam ExpressionType the type of the object on which IO stream operations are performed
+ *
+ * This class represents an expression with stream operators controlled by a given IOFormat.
+ * It is the return type of DenseBase::format()
+ * and most of the time this is the only way it is used.
+ *
+ * See class IOFormat for some examples.
+ *
+ * \sa DenseBase::format(), class IOFormat
+ */
+template <typename ExpressionType>
+class WithFormat {
+ public:
+  WithFormat(const ExpressionType& matrix, const IOFormat& format) : m_matrix(matrix), m_format(format) {}
+
+  friend std::ostream& operator<<(std::ostream& s, const WithFormat& wf) {
+    return internal::print_matrix(s, wf.m_matrix.eval(), wf.m_format);
+  }
 
-  protected:
-    const typename ExpressionType::Nested m_matrix;
-    IOFormat m_format;
+ protected:
+  typename ExpressionType::Nested m_matrix;
+  IOFormat m_format;
 };
 
-/** \returns a WithFormat proxy object allowing to print a matrix the with given
-  * format \a fmt.
-  *
-  * See class IOFormat for some examples.
-  *
-  * \sa class IOFormat, class WithFormat
-  */
-template<typename Derived>
-inline const WithFormat<Derived>
-DenseBase<Derived>::format(const IOFormat& fmt) const
-{
-  return WithFormat<Derived>(derived(), fmt);
-}
-
 namespace internal {
 
-template<typename Scalar, bool IsInteger>
-struct significant_decimals_default_impl
-{
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline int run()
-  {
-    using std::ceil;
-    using std::log;
-    return cast<RealScalar,int>(ceil(-log(NumTraits<RealScalar>::epsilon())/log(RealScalar(10))));
-  }
+// NOTE: This helper is kept for backward compatibility with previous code specializing
+//       this internal::significant_decimals_impl structure. In the future we should directly
+//       call max_digits10().
+template <typename Scalar>
+struct significant_decimals_impl {
+  static inline int run() { return NumTraits<Scalar>::max_digits10(); }
 };
 
-template<typename Scalar>
-struct significant_decimals_default_impl<Scalar, true>
-{
-  static inline int run()
-  {
-    return 0;
-  }
-};
-
-template<typename Scalar>
-struct significant_decimals_impl
-  : significant_decimals_default_impl<Scalar, NumTraits<Scalar>::IsInteger>
-{};
-
 /** \internal
-  * print the matrix \a _m to the output stream \a s using the output format \a fmt */
-template<typename Derived>
-std::ostream & print_matrix(std::ostream & s, const Derived& _m, const IOFormat& fmt)
-{
-  if(_m.size() == 0)
-  {
+ * print the matrix \a _m to the output stream \a s using the output format \a fmt */
+template <typename Derived>
+std::ostream& print_matrix(std::ostream& s, const Derived& _m, const IOFormat& fmt) {
+  using internal::is_same;
+
+  if (_m.size() == 0) {
     s << fmt.matPrefix << fmt.matSuffix;
     return s;
   }
-  
+
   typename Derived::Nested m = _m;
   typedef typename Derived::Scalar Scalar;
-  typedef typename Derived::Index Index;
+  typedef std::conditional_t<is_same<Scalar, char>::value || is_same<Scalar, unsigned char>::value ||
+                                 is_same<Scalar, numext::int8_t>::value || is_same<Scalar, numext::uint8_t>::value,
+                             int,
+                             std::conditional_t<is_same<Scalar, std::complex<char> >::value ||
+                                                    is_same<Scalar, std::complex<unsigned char> >::value ||
+                                                    is_same<Scalar, std::complex<numext::int8_t> >::value ||
+                                                    is_same<Scalar, std::complex<numext::uint8_t> >::value,
+                                                std::complex<int>, const Scalar&> >
+      PrintType;
 
   Index width = 0;
 
   std::streamsize explicit_precision;
-  if(fmt.precision == StreamPrecision)
-  {
+  if (fmt.precision == StreamPrecision) {
     explicit_precision = 0;
-  }
-  else if(fmt.precision == FullPrecision)
-  {
-    if (NumTraits<Scalar>::IsInteger)
-    {
+  } else if (fmt.precision == FullPrecision) {
+    if (NumTraits<Scalar>::IsInteger) {
       explicit_precision = 0;
-    }
-    else
-    {
+    } else {
       explicit_precision = significant_decimals_impl<Scalar>::run();
     }
-  }
-  else
-  {
+  } else {
     explicit_precision = fmt.precision;
   }
 
   std::streamsize old_precision = 0;
-  if(explicit_precision) old_precision = s.precision(explicit_precision);
+  if (explicit_precision) old_precision = s.precision(explicit_precision);
 
   bool align_cols = !(fmt.flags & DontAlignCols);
-  if(align_cols)
-  {
+  if (align_cols) {
     // compute the largest width
-    for(Index j = 0; j < m.cols(); ++j)
-      for(Index i = 0; i < m.rows(); ++i)
-      {
+    for (Index j = 0; j < m.cols(); ++j)
+      for (Index i = 0; i < m.rows(); ++i) {
         std::stringstream sstr;
         sstr.copyfmt(s);
-        sstr << m.coeff(i,j);
+        sstr << static_cast<PrintType>(m.coeff(i, j));
         width = std::max<Index>(width, Index(sstr.str().length()));
       }
   }
+  std::streamsize old_width = s.width();
+  char old_fill_character = s.fill();
   s << fmt.matPrefix;
-  for(Index i = 0; i < m.rows(); ++i)
-  {
-    if (i)
-      s << fmt.rowSpacer;
+  for (Index i = 0; i < m.rows(); ++i) {
+    if (i) s << fmt.rowSpacer;
     s << fmt.rowPrefix;
-    if(width) s.width(width);
-    s << m.coeff(i, 0);
-    for(Index j = 1; j < m.cols(); ++j)
-    {
+    if (width) {
+      s.fill(fmt.fill);
+      s.width(width);
+    }
+    s << static_cast<PrintType>(m.coeff(i, 0));
+    for (Index j = 1; j < m.cols(); ++j) {
       s << fmt.coeffSeparator;
-      if (width) s.width(width);
-      s << m.coeff(i, j);
+      if (width) {
+        s.fill(fmt.fill);
+        s.width(width);
+      }
+      s << static_cast<PrintType>(m.coeff(i, j));
     }
     s << fmt.rowSuffix;
-    if( i < m.rows() - 1)
-      s << fmt.rowSeparator;
+    if (i < m.rows() - 1) s << fmt.rowSeparator;
   }
   s << fmt.matSuffix;
-  if(explicit_precision) s.precision(old_precision);
+  if (explicit_precision) s.precision(old_precision);
+  if (width) {
+    s.fill(old_fill_character);
+    s.width(old_width);
+  }
   return s;
 }
 
-} // end namespace internal
+}  // end namespace internal
 
 /** \relates DenseBase
-  *
-  * Outputs the matrix, to the given stream.
-  *
-  * If you wish to print the matrix with a format different than the default, use DenseBase::format().
-  *
-  * It is also possible to change the default format by defining EIGEN_DEFAULT_IO_FORMAT before including Eigen headers.
-  * If not defined, this will automatically be defined to Eigen::IOFormat(), that is the Eigen::IOFormat with default parameters.
-  *
-  * \sa DenseBase::format()
-  */
-template<typename Derived>
-std::ostream & operator <<
-(std::ostream & s,
- const DenseBase<Derived> & m)
-{
+ *
+ * Outputs the matrix, to the given stream.
+ *
+ * If you wish to print the matrix with a format different than the default, use DenseBase::format().
+ *
+ * It is also possible to change the default format by defining EIGEN_DEFAULT_IO_FORMAT before including Eigen headers.
+ * If not defined, this will automatically be defined to Eigen::IOFormat(), that is the Eigen::IOFormat with default
+ * parameters.
+ *
+ * \sa DenseBase::format()
+ */
+template <typename Derived>
+std::ostream& operator<<(std::ostream& s, const DenseBase<Derived>& m) {
   return internal::print_matrix(s, m.eval(), EIGEN_DEFAULT_IO_FORMAT);
 }
 
-} // end namespace Eigen
+template <typename Derived>
+std::ostream& operator<<(std::ostream& s, const DiagonalBase<Derived>& m) {
+  return internal::print_matrix(s, m.derived(), EIGEN_DEFAULT_IO_FORMAT);
+}
+
+}  // end namespace Eigen
 
-#endif // EIGEN_IO_H
+#endif  // EIGEN_IO_H
diff --git a/inst/include/Eigen/src/Core/IndexedView.h b/inst/include/Eigen/src/Core/IndexedView.h
new file mode 100644
index 00000000..10562c19
--- /dev/null
+++ b/inst/include/Eigen/src/Core/IndexedView.h
@@ -0,0 +1,321 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INDEXED_VIEW_H
+#define EIGEN_INDEXED_VIEW_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename XprType, typename RowIndices, typename ColIndices>
+struct traits<IndexedView<XprType, RowIndices, ColIndices>> : traits<XprType> {
+  enum {
+    RowsAtCompileTime = int(IndexedViewHelper<RowIndices>::SizeAtCompileTime),
+    ColsAtCompileTime = int(IndexedViewHelper<ColIndices>::SizeAtCompileTime),
+    MaxRowsAtCompileTime = RowsAtCompileTime,
+    MaxColsAtCompileTime = ColsAtCompileTime,
+
+    XprTypeIsRowMajor = (int(traits<XprType>::Flags) & RowMajorBit) != 0,
+    IsRowMajor = (MaxRowsAtCompileTime == 1 && MaxColsAtCompileTime != 1)   ? 1
+                 : (MaxColsAtCompileTime == 1 && MaxRowsAtCompileTime != 1) ? 0
+                                                                            : XprTypeIsRowMajor,
+
+    RowIncr = int(IndexedViewHelper<RowIndices>::IncrAtCompileTime),
+    ColIncr = int(IndexedViewHelper<ColIndices>::IncrAtCompileTime),
+    InnerIncr = IsRowMajor ? ColIncr : RowIncr,
+    OuterIncr = IsRowMajor ? RowIncr : ColIncr,
+
+    HasSameStorageOrderAsXprType = (IsRowMajor == XprTypeIsRowMajor),
+    XprInnerStride = HasSameStorageOrderAsXprType ? int(inner_stride_at_compile_time<XprType>::ret)
+                                                  : int(outer_stride_at_compile_time<XprType>::ret),
+    XprOuterstride = HasSameStorageOrderAsXprType ? int(outer_stride_at_compile_time<XprType>::ret)
+                                                  : int(inner_stride_at_compile_time<XprType>::ret),
+
+    InnerSize = XprTypeIsRowMajor ? ColsAtCompileTime : RowsAtCompileTime,
+    IsBlockAlike = InnerIncr == 1 && OuterIncr == 1,
+    IsInnerPannel = HasSameStorageOrderAsXprType &&
+                    is_same<AllRange<InnerSize>, std::conditional_t<XprTypeIsRowMajor, ColIndices, RowIndices>>::value,
+
+    InnerStrideAtCompileTime =
+        InnerIncr < 0 || InnerIncr == DynamicIndex || XprInnerStride == Dynamic || InnerIncr == Undefined
+            ? Dynamic
+            : XprInnerStride * InnerIncr,
+    OuterStrideAtCompileTime =
+        OuterIncr < 0 || OuterIncr == DynamicIndex || XprOuterstride == Dynamic || OuterIncr == Undefined
+            ? Dynamic
+            : XprOuterstride * OuterIncr,
+
+    ReturnAsScalar = is_single_range<RowIndices>::value && is_single_range<ColIndices>::value,
+    ReturnAsBlock = (!ReturnAsScalar) && IsBlockAlike,
+    ReturnAsIndexedView = (!ReturnAsScalar) && (!ReturnAsBlock),
+
+    // FIXME we deal with compile-time strides if and only if we have DirectAccessBit flag,
+    // but this is too strict regarding negative strides...
+    DirectAccessMask = (int(InnerIncr) != Undefined && int(OuterIncr) != Undefined && InnerIncr >= 0 && OuterIncr >= 0)
+                           ? DirectAccessBit
+                           : 0,
+    FlagsRowMajorBit = IsRowMajor ? RowMajorBit : 0,
+    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+    Flags = (traits<XprType>::Flags & (HereditaryBits | DirectAccessMask)) | FlagsLvalueBit | FlagsRowMajorBit |
+            FlagsLinearAccessBit
+  };
+
+  typedef Block<XprType, RowsAtCompileTime, ColsAtCompileTime, IsInnerPannel> BlockType;
+};
+
+template <typename XprType, typename RowIndices, typename ColIndices, typename StorageKind, bool DirectAccess>
+class IndexedViewImpl;
+
+}  // namespace internal
+
+/** \class IndexedView
+ * \ingroup Core_Module
+ *
+ * \brief Expression of a non-sequential sub-matrix defined by arbitrary sequences of row and column indices
+ *
+ * \tparam XprType the type of the expression in which we are taking the intersections of sub-rows and sub-columns
+ * \tparam RowIndices the type of the object defining the sequence of row indices
+ * \tparam ColIndices the type of the object defining the sequence of column indices
+ *
+ * This class represents an expression of a sub-matrix (or sub-vector) defined as the intersection
+ * of sub-sets of rows and columns, that are themself defined by generic sequences of row indices \f$
+ * \{r_0,r_1,..r_{m-1}\} \f$ and column indices \f$ \{c_0,c_1,..c_{n-1} \}\f$. Let \f$ A \f$  be the nested matrix, then
+ * the resulting matrix \f$ B \f$ has \c m rows and \c n columns, and its entries are given by: \f$ B(i,j) = A(r_i,c_j)
+ * \f$.
+ *
+ * The \c RowIndices and \c ColIndices types must be compatible with the following API:
+ * \code
+ * <integral type> operator[](Index) const;
+ * Index size() const;
+ * \endcode
+ *
+ * Typical supported types thus include:
+ *  - std::vector<int>
+ *  - std::valarray<int>
+ *  - std::array<int>
+ *  - Eigen::ArrayXi
+ *  - decltype(ArrayXi::LinSpaced(...))
+ *  - Any view/expressions of the previous types
+ *  - Eigen::ArithmeticSequence
+ *  - Eigen::internal::AllRange     (helper for Eigen::placeholders::all)
+ *  - Eigen::internal::SingleRange  (helper for single index)
+ *  - etc.
+ *
+ * In typical usages of %Eigen, this class should never be used directly. It is the return type of
+ * DenseBase::operator()(const RowIndices&, const ColIndices&).
+ *
+ * \sa class Block
+ */
+template <typename XprType, typename RowIndices, typename ColIndices>
+class IndexedView
+    : public internal::IndexedViewImpl<XprType, RowIndices, ColIndices, typename internal::traits<XprType>::StorageKind,
+                                       (internal::traits<IndexedView<XprType, RowIndices, ColIndices>>::Flags &
+                                        DirectAccessBit) != 0> {
+ public:
+  typedef typename internal::IndexedViewImpl<
+      XprType, RowIndices, ColIndices, typename internal::traits<XprType>::StorageKind,
+      (internal::traits<IndexedView<XprType, RowIndices, ColIndices>>::Flags & DirectAccessBit) != 0>
+      Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(IndexedView)
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(IndexedView)
+
+  template <typename T0, typename T1>
+  IndexedView(XprType& xpr, const T0& rowIndices, const T1& colIndices) : Base(xpr, rowIndices, colIndices) {}
+};
+
+namespace internal {
+
+// Generic API dispatcher
+template <typename XprType, typename RowIndices, typename ColIndices, typename StorageKind, bool DirectAccess>
+class IndexedViewImpl : public internal::generic_xpr_base<IndexedView<XprType, RowIndices, ColIndices>>::type {
+ public:
+  typedef typename internal::generic_xpr_base<IndexedView<XprType, RowIndices, ColIndices>>::type Base;
+  typedef typename internal::ref_selector<XprType>::non_const_type MatrixTypeNested;
+  typedef internal::remove_all_t<XprType> NestedExpression;
+  typedef typename XprType::Scalar Scalar;
+
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(IndexedViewImpl)
+
+  template <typename T0, typename T1>
+  IndexedViewImpl(XprType& xpr, const T0& rowIndices, const T1& colIndices)
+      : m_xpr(xpr), m_rowIndices(rowIndices), m_colIndices(colIndices) {}
+
+  /** \returns number of rows */
+  Index rows() const { return IndexedViewHelper<RowIndices>::size(m_rowIndices); }
+
+  /** \returns number of columns */
+  Index cols() const { return IndexedViewHelper<ColIndices>::size(m_colIndices); }
+
+  /** \returns the nested expression */
+  const internal::remove_all_t<XprType>& nestedExpression() const { return m_xpr; }
+
+  /** \returns the nested expression */
+  std::remove_reference_t<XprType>& nestedExpression() { return m_xpr; }
+
+  /** \returns a const reference to the object storing/generating the row indices */
+  const RowIndices& rowIndices() const { return m_rowIndices; }
+
+  /** \returns a const reference to the object storing/generating the column indices */
+  const ColIndices& colIndices() const { return m_colIndices; }
+
+  constexpr Scalar& coeffRef(Index rowId, Index colId) {
+    return nestedExpression().coeffRef(m_rowIndices[rowId], m_colIndices[colId]);
+  }
+
+  constexpr const Scalar& coeffRef(Index rowId, Index colId) const {
+    return nestedExpression().coeffRef(m_rowIndices[rowId], m_colIndices[colId]);
+  }
+
+ protected:
+  MatrixTypeNested m_xpr;
+  RowIndices m_rowIndices;
+  ColIndices m_colIndices;
+};
+
+template <typename XprType, typename RowIndices, typename ColIndices, typename StorageKind>
+class IndexedViewImpl<XprType, RowIndices, ColIndices, StorageKind, true>
+    : public IndexedViewImpl<XprType, RowIndices, ColIndices, StorageKind, false> {
+ public:
+  using Base = internal::IndexedViewImpl<XprType, RowIndices, ColIndices,
+                                         typename internal::traits<XprType>::StorageKind, false>;
+  using Derived = IndexedView<XprType, RowIndices, ColIndices>;
+
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(IndexedViewImpl)
+
+  template <typename T0, typename T1>
+  IndexedViewImpl(XprType& xpr, const T0& rowIndices, const T1& colIndices) : Base(xpr, rowIndices, colIndices) {}
+
+  Index rowIncrement() const {
+    if (traits<Derived>::RowIncr != DynamicIndex && traits<Derived>::RowIncr != Undefined) {
+      return traits<Derived>::RowIncr;
+    }
+    return IndexedViewHelper<RowIndices>::incr(this->rowIndices());
+  }
+  Index colIncrement() const {
+    if (traits<Derived>::ColIncr != DynamicIndex && traits<Derived>::ColIncr != Undefined) {
+      return traits<Derived>::ColIncr;
+    }
+    return IndexedViewHelper<ColIndices>::incr(this->colIndices());
+  }
+
+  Index innerIncrement() const { return traits<Derived>::IsRowMajor ? colIncrement() : rowIncrement(); }
+
+  Index outerIncrement() const { return traits<Derived>::IsRowMajor ? rowIncrement() : colIncrement(); }
+
+  std::decay_t<typename XprType::Scalar>* data() {
+    Index row_offset = this->rowIndices()[0] * this->nestedExpression().rowStride();
+    Index col_offset = this->colIndices()[0] * this->nestedExpression().colStride();
+    return this->nestedExpression().data() + row_offset + col_offset;
+  }
+
+  const std::decay_t<typename XprType::Scalar>* data() const {
+    Index row_offset = this->rowIndices()[0] * this->nestedExpression().rowStride();
+    Index col_offset = this->colIndices()[0] * this->nestedExpression().colStride();
+    return this->nestedExpression().data() + row_offset + col_offset;
+  }
+
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept {
+    if (traits<Derived>::InnerStrideAtCompileTime != Dynamic) {
+      return traits<Derived>::InnerStrideAtCompileTime;
+    }
+    return innerIncrement() * this->nestedExpression().innerStride();
+  }
+
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept {
+    if (traits<Derived>::OuterStrideAtCompileTime != Dynamic) {
+      return traits<Derived>::OuterStrideAtCompileTime;
+    }
+    return outerIncrement() * this->nestedExpression().outerStride();
+  }
+};
+
+template <typename ArgType, typename RowIndices, typename ColIndices>
+struct unary_evaluator<IndexedView<ArgType, RowIndices, ColIndices>, IndexBased>
+    : evaluator_base<IndexedView<ArgType, RowIndices, ColIndices>> {
+  typedef IndexedView<ArgType, RowIndices, ColIndices> XprType;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost /* TODO + cost of row/col index */,
+
+    FlagsLinearAccessBit =
+        (traits<XprType>::RowsAtCompileTime == 1 || traits<XprType>::ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+
+    FlagsRowMajorBit = traits<XprType>::FlagsRowMajorBit,
+
+    Flags = (evaluator<ArgType>::Flags & (HereditaryBits & ~RowMajorBit /*| LinearAccessBit | DirectAccessBit*/)) |
+            FlagsLinearAccessBit | FlagsRowMajorBit,
+
+    Alignment = 0
+  };
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+    eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() &&
+                 m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols());
+    return m_argImpl.coeff(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+    eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() &&
+                 m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols());
+    return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    EIGEN_STATIC_ASSERT_LVALUE(XprType)
+    Index row = XprType::RowsAtCompileTime == 1 ? 0 : index;
+    Index col = XprType::RowsAtCompileTime == 1 ? index : 0;
+    eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() &&
+                 m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols());
+    return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeffRef(Index index) const {
+    Index row = XprType::RowsAtCompileTime == 1 ? 0 : index;
+    Index col = XprType::RowsAtCompileTime == 1 ? index : 0;
+    eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() &&
+                 m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols());
+    return m_argImpl.coeffRef(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index index) const {
+    Index row = XprType::RowsAtCompileTime == 1 ? 0 : index;
+    Index col = XprType::RowsAtCompileTime == 1 ? index : 0;
+    eigen_assert(m_xpr.rowIndices()[row] >= 0 && m_xpr.rowIndices()[row] < m_xpr.nestedExpression().rows() &&
+                 m_xpr.colIndices()[col] >= 0 && m_xpr.colIndices()[col] < m_xpr.nestedExpression().cols());
+    return m_argImpl.coeff(m_xpr.rowIndices()[row], m_xpr.colIndices()[col]);
+  }
+
+ protected:
+  evaluator<ArgType> m_argImpl;
+  const XprType& m_xpr;
+};
+
+// Catch assignments to an IndexedView.
+template <typename ArgType, typename RowIndices, typename ColIndices>
+struct evaluator_assume_aliasing<IndexedView<ArgType, RowIndices, ColIndices>> {
+  static const bool value = true;
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_INDEXED_VIEW_H
diff --git a/inst/include/Eigen/src/Core/InnerProduct.h b/inst/include/Eigen/src/Core/InnerProduct.h
new file mode 100644
index 00000000..686ad137
--- /dev/null
+++ b/inst/include/Eigen/src/Core/InnerProduct.h
@@ -0,0 +1,260 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INNER_PRODUCT_EVAL_H
+#define EIGEN_INNER_PRODUCT_EVAL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+// recursively searches for the largest simd type that does not exceed Size, or the smallest if no such type exists
+template <typename Scalar, int Size, typename Packet = typename packet_traits<Scalar>::type,
+          bool Stop =
+              (unpacket_traits<Packet>::size <= Size) || is_same<Packet, typename unpacket_traits<Packet>::half>::value>
+struct find_inner_product_packet_helper;
+
+template <typename Scalar, int Size, typename Packet>
+struct find_inner_product_packet_helper<Scalar, Size, Packet, false> {
+  using type = typename find_inner_product_packet_helper<Scalar, Size, typename unpacket_traits<Packet>::half>::type;
+};
+
+template <typename Scalar, int Size, typename Packet>
+struct find_inner_product_packet_helper<Scalar, Size, Packet, true> {
+  using type = Packet;
+};
+
+template <typename Scalar, int Size>
+struct find_inner_product_packet : find_inner_product_packet_helper<Scalar, Size> {};
+
+template <typename Scalar>
+struct find_inner_product_packet<Scalar, Dynamic> {
+  using type = typename packet_traits<Scalar>::type;
+};
+
+template <typename Lhs, typename Rhs>
+struct inner_product_assert {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Lhs)
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Rhs)
+  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Lhs, Rhs)
+#ifndef EIGEN_NO_DEBUG
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, const Rhs& rhs) {
+    eigen_assert((lhs.size() == rhs.size()) && "Inner product: lhs and rhs vectors must have same size");
+  }
+#else
+  static EIGEN_DEVICE_FUNC void run(const Lhs&, const Rhs&) {}
+#endif
+};
+
+template <typename Func, typename Lhs, typename Rhs>
+struct inner_product_evaluator {
+  static constexpr int LhsFlags = evaluator<Lhs>::Flags;
+  static constexpr int RhsFlags = evaluator<Rhs>::Flags;
+  static constexpr int SizeAtCompileTime = size_prefer_fixed(Lhs::SizeAtCompileTime, Rhs::SizeAtCompileTime);
+  static constexpr int MaxSizeAtCompileTime =
+      min_size_prefer_fixed(Lhs::MaxSizeAtCompileTime, Rhs::MaxSizeAtCompileTime);
+  static constexpr int LhsAlignment = evaluator<Lhs>::Alignment;
+  static constexpr int RhsAlignment = evaluator<Rhs>::Alignment;
+
+  using Scalar = typename Func::result_type;
+  using Packet = typename find_inner_product_packet<Scalar, SizeAtCompileTime>::type;
+
+  static constexpr bool Vectorize =
+      bool(LhsFlags & RhsFlags & PacketAccessBit) && Func::PacketAccess &&
+      ((MaxSizeAtCompileTime == Dynamic) || (unpacket_traits<Packet>::size <= MaxSizeAtCompileTime));
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit inner_product_evaluator(const Lhs& lhs, const Rhs& rhs,
+                                                                         Func func = Func())
+      : m_func(func), m_lhs(lhs), m_rhs(rhs), m_size(lhs.size()) {
+    inner_product_assert<Lhs, Rhs>::run(lhs, rhs);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_size.value(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index index) const {
+    return m_func.coeff(m_lhs.coeff(index), m_rhs.coeff(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(const Scalar& value, Index index) const {
+    return m_func.coeff(value, m_lhs.coeff(index), m_rhs.coeff(index));
+  }
+
+  template <typename PacketType, int LhsMode = LhsAlignment, int RhsMode = RhsAlignment>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+    return m_func.packet(m_lhs.template packet<LhsMode, PacketType>(index),
+                         m_rhs.template packet<RhsMode, PacketType>(index));
+  }
+
+  template <typename PacketType, int LhsMode = LhsAlignment, int RhsMode = RhsAlignment>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(const PacketType& value, Index index) const {
+    return m_func.packet(value, m_lhs.template packet<LhsMode, PacketType>(index),
+                         m_rhs.template packet<RhsMode, PacketType>(index));
+  }
+
+  const Func m_func;
+  const evaluator<Lhs> m_lhs;
+  const evaluator<Rhs> m_rhs;
+  const variable_if_dynamic<Index, SizeAtCompileTime> m_size;
+};
+
+template <typename Evaluator, bool Vectorize = Evaluator::Vectorize>
+struct inner_product_impl;
+
+// scalar loop
+template <typename Evaluator>
+struct inner_product_impl<Evaluator, false> {
+  using Scalar = typename Evaluator::Scalar;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval) {
+    const Index size = eval.size();
+    if (size == 0) return Scalar(0);
+
+    Scalar result = eval.coeff(0);
+    for (Index k = 1; k < size; k++) {
+      result = eval.coeff(result, k);
+    }
+
+    return result;
+  }
+};
+
+// vector loop
+template <typename Evaluator>
+struct inner_product_impl<Evaluator, true> {
+  using UnsignedIndex = std::make_unsigned_t<Index>;
+  using Scalar = typename Evaluator::Scalar;
+  using Packet = typename Evaluator::Packet;
+  static constexpr int PacketSize = unpacket_traits<Packet>::size;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval) {
+    const UnsignedIndex size = static_cast<UnsignedIndex>(eval.size());
+    if (size < PacketSize) return inner_product_impl<Evaluator, false>::run(eval);
+
+    const UnsignedIndex packetEnd = numext::round_down(size, PacketSize);
+    const UnsignedIndex quadEnd = numext::round_down(size, 4 * PacketSize);
+    const UnsignedIndex numPackets = size / PacketSize;
+    const UnsignedIndex numRemPackets = (packetEnd - quadEnd) / PacketSize;
+
+    Packet presult0, presult1, presult2, presult3;
+
+    presult0 = eval.template packet<Packet>(0 * PacketSize);
+    if (numPackets >= 2) presult1 = eval.template packet<Packet>(1 * PacketSize);
+    if (numPackets >= 3) presult2 = eval.template packet<Packet>(2 * PacketSize);
+    if (numPackets >= 4) {
+      presult3 = eval.template packet<Packet>(3 * PacketSize);
+
+      for (UnsignedIndex k = 4 * PacketSize; k < quadEnd; k += 4 * PacketSize) {
+        presult0 = eval.packet(presult0, k + 0 * PacketSize);
+        presult1 = eval.packet(presult1, k + 1 * PacketSize);
+        presult2 = eval.packet(presult2, k + 2 * PacketSize);
+        presult3 = eval.packet(presult3, k + 3 * PacketSize);
+      }
+
+      if (numRemPackets >= 1) presult0 = eval.packet(presult0, quadEnd + 0 * PacketSize);
+      if (numRemPackets >= 2) presult1 = eval.packet(presult1, quadEnd + 1 * PacketSize);
+      if (numRemPackets == 3) presult2 = eval.packet(presult2, quadEnd + 2 * PacketSize);
+
+      presult2 = padd(presult2, presult3);
+    }
+
+    if (numPackets >= 3) presult1 = padd(presult1, presult2);
+    if (numPackets >= 2) presult0 = padd(presult0, presult1);
+
+    Scalar result = predux(presult0);
+    for (UnsignedIndex k = packetEnd; k < size; k++) {
+      result = eval.coeff(result, k);
+    }
+
+    return result;
+  }
+};
+
+template <typename Scalar, bool Conj>
+struct conditional_conj;
+
+template <typename Scalar>
+struct conditional_conj<Scalar, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(const Scalar& a) { return numext::conj(a); }
+  template <typename Packet>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packet(const Packet& a) {
+    return pconj(a);
+  }
+};
+
+template <typename Scalar>
+struct conditional_conj<Scalar, false> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(const Scalar& a) { return a; }
+  template <typename Packet>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packet(const Packet& a) {
+    return a;
+  }
+};
+
+template <typename LhsScalar, typename RhsScalar, bool Conj>
+struct scalar_inner_product_op {
+  using result_type = typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType;
+  using conj_helper = conditional_conj<LhsScalar, Conj>;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type coeff(const LhsScalar& a, const RhsScalar& b) const {
+    return (conj_helper::coeff(a) * b);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type coeff(const result_type& accum, const LhsScalar& a,
+                                                          const RhsScalar& b) const {
+    return (conj_helper::coeff(a) * b) + accum;
+  }
+  static constexpr bool PacketAccess = false;
+};
+
+// Partial specialization for packet access if and only if
+// LhsScalar == RhsScalar == ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType.
+template <typename Scalar, bool Conj>
+struct scalar_inner_product_op<
+    Scalar,
+    typename std::enable_if<internal::is_same<typename ScalarBinaryOpTraits<Scalar, Scalar>::ReturnType, Scalar>::value,
+                            Scalar>::type,
+    Conj> {
+  using result_type = Scalar;
+  using conj_helper = conditional_conj<Scalar, Conj>;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(const Scalar& a, const Scalar& b) const {
+    return pmul(conj_helper::coeff(a), b);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(const Scalar& accum, const Scalar& a, const Scalar& b) const {
+    return pmadd(conj_helper::coeff(a), b, accum);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packet(const Packet& a, const Packet& b) const {
+    return pmul(conj_helper::packet(a), b);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packet(const Packet& accum, const Packet& a, const Packet& b) const {
+    return pmadd(conj_helper::packet(a), b, accum);
+  }
+  static constexpr bool PacketAccess = packet_traits<Scalar>::HasMul && packet_traits<Scalar>::HasAdd;
+};
+
+template <typename Lhs, typename Rhs, bool Conj>
+struct default_inner_product_impl {
+  using LhsScalar = typename traits<Lhs>::Scalar;
+  using RhsScalar = typename traits<Rhs>::Scalar;
+  using Op = scalar_inner_product_op<LhsScalar, RhsScalar, Conj>;
+  using Evaluator = inner_product_evaluator<Op, Lhs, Rhs>;
+  using result_type = typename Evaluator::Scalar;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type run(const MatrixBase<Lhs>& a, const MatrixBase<Rhs>& b) {
+    Evaluator eval(a.derived(), b.derived(), Op());
+    return inner_product_impl<Evaluator>::run(eval);
+  }
+};
+
+template <typename Lhs, typename Rhs>
+struct dot_impl : default_inner_product_impl<Lhs, Rhs, true> {};
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_INNER_PRODUCT_EVAL_H
diff --git a/inst/include/Eigen/src/Core/InternalHeaderCheck.h b/inst/include/Eigen/src/Core/InternalHeaderCheck.h
new file mode 100644
index 00000000..1cea572d
--- /dev/null
+++ b/inst/include/Eigen/src/Core/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_CORE_MODULE_H
+#error "Please include Eigen/Core instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/Core/Inverse.h b/inst/include/Eigen/src/Core/Inverse.h
new file mode 100644
index 00000000..79fc3ab6
--- /dev/null
+++ b/inst/include/Eigen/src/Core/Inverse.h
@@ -0,0 +1,108 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INVERSE_H
+#define EIGEN_INVERSE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+template <typename XprType, typename StorageKind>
+class InverseImpl;
+
+namespace internal {
+
+template <typename XprType>
+struct traits<Inverse<XprType> > : traits<typename XprType::PlainObject> {
+  typedef typename XprType::PlainObject PlainObject;
+  typedef traits<PlainObject> BaseTraits;
+  enum { Flags = BaseTraits::Flags & RowMajorBit };
+};
+
+}  // end namespace internal
+
+/** \class Inverse
+ *
+ * \brief Expression of the inverse of another expression
+ *
+ * \tparam XprType the type of the expression we are taking the inverse
+ *
+ * This class represents an abstract expression of A.inverse()
+ * and most of the time this is the only way it is used.
+ *
+ */
+template <typename XprType>
+class Inverse : public InverseImpl<XprType, typename internal::traits<XprType>::StorageKind> {
+ public:
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::ref_selector<XprType>::type XprTypeNested;
+  typedef internal::remove_all_t<XprTypeNested> XprTypeNestedCleaned;
+  typedef typename internal::ref_selector<Inverse>::type Nested;
+  typedef internal::remove_all_t<XprType> NestedExpression;
+
+  explicit EIGEN_DEVICE_FUNC Inverse(const XprType& xpr) : m_xpr(xpr) {}
+
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_xpr.rows(); }
+
+  EIGEN_DEVICE_FUNC const XprTypeNestedCleaned& nestedExpression() const { return m_xpr; }
+
+ protected:
+  XprTypeNested m_xpr;
+};
+
+// Generic API dispatcher
+template <typename XprType, typename StorageKind>
+class InverseImpl : public internal::generic_xpr_base<Inverse<XprType> >::type {
+ public:
+  typedef typename internal::generic_xpr_base<Inverse<XprType> >::type Base;
+  typedef typename XprType::Scalar Scalar;
+
+ private:
+  Scalar coeff(Index row, Index col) const;
+  Scalar coeff(Index i) const;
+};
+
+namespace internal {
+
+/** \internal
+ * \brief Default evaluator for Inverse expression.
+ *
+ * This default evaluator for Inverse expression simply evaluate the inverse into a temporary
+ * by a call to internal::call_assignment_no_alias.
+ * Therefore, inverse implementers only have to specialize Assignment<Dst,Inverse<...>, ...> for
+ * there own nested expression.
+ *
+ * \sa class Inverse
+ */
+template <typename ArgType>
+struct unary_evaluator<Inverse<ArgType> > : public evaluator<typename Inverse<ArgType>::PlainObject> {
+  typedef Inverse<ArgType> InverseType;
+  typedef typename InverseType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  enum { Flags = Base::Flags | EvalBeforeNestingBit };
+
+  EIGEN_DEVICE_FUNC unary_evaluator(const InverseType& inv_xpr) : m_result(inv_xpr.rows(), inv_xpr.cols()) {
+    internal::construct_at<Base>(this, m_result);
+    internal::call_assignment_no_alias(m_result, inv_xpr);
+  }
+
+ protected:
+  PlainObject m_result;
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_INVERSE_H
diff --git a/inst/include/Eigen/src/Core/Map.h b/inst/include/Eigen/src/Core/Map.h
index f804c89d..c740da72 100644
--- a/inst/include/Eigen/src/Core/Map.h
+++ b/inst/include/Eigen/src/Core/Map.h
@@ -11,182 +11,143 @@
 #ifndef EIGEN_MAP_H
 #define EIGEN_MAP_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-/** \class Map
-  * \ingroup Core_Module
-  *
-  * \brief A matrix or vector expression mapping an existing array of data.
-  *
-  * \tparam PlainObjectType the equivalent matrix type of the mapped data
-  * \tparam MapOptions specifies whether the pointer is \c #Aligned, or \c #Unaligned.
-  *                The default is \c #Unaligned.
-  * \tparam StrideType optionally specifies strides. By default, Map assumes the memory layout
-  *                   of an ordinary, contiguous array. This can be overridden by specifying strides.
-  *                   The type passed here must be a specialization of the Stride template, see examples below.
-  *
-  * This class represents a matrix or vector expression mapping an existing array of data.
-  * It can be used to let Eigen interface without any overhead with non-Eigen data structures,
-  * such as plain C arrays or structures from other libraries. By default, it assumes that the
-  * data is laid out contiguously in memory. You can however override this by explicitly specifying
-  * inner and outer strides.
-  *
-  * Here's an example of simply mapping a contiguous array as a \ref TopicStorageOrders "column-major" matrix:
-  * \include Map_simple.cpp
-  * Output: \verbinclude Map_simple.out
-  *
-  * If you need to map non-contiguous arrays, you can do so by specifying strides:
-  *
-  * Here's an example of mapping an array as a vector, specifying an inner stride, that is, the pointer
-  * increment between two consecutive coefficients. Here, we're specifying the inner stride as a compile-time
-  * fixed value.
-  * \include Map_inner_stride.cpp
-  * Output: \verbinclude Map_inner_stride.out
-  *
-  * Here's an example of mapping an array while specifying an outer stride. Here, since we're mapping
-  * as a column-major matrix, 'outer stride' means the pointer increment between two consecutive columns.
-  * Here, we're specifying the outer stride as a runtime parameter. Note that here \c OuterStride<> is
-  * a short version of \c OuterStride<Dynamic> because the default template parameter of OuterStride
-  * is  \c Dynamic
-  * \include Map_outer_stride.cpp
-  * Output: \verbinclude Map_outer_stride.out
-  *
-  * For more details and for an example of specifying both an inner and an outer stride, see class Stride.
-  *
-  * \b Tip: to change the array of data mapped by a Map object, you can use the C++
-  * placement new syntax:
-  *
-  * Example: \include Map_placement_new.cpp
-  * Output: \verbinclude Map_placement_new.out
-  *
-  * This class is the return type of PlainObjectBase::Map() but can also be used directly.
-  *
-  * \sa PlainObjectBase::Map(), \ref TopicStorageOrders
-  */
+namespace Eigen {
 
 namespace internal {
-template<typename PlainObjectType, int MapOptions, typename StrideType>
-struct traits<Map<PlainObjectType, MapOptions, StrideType> >
-  : public traits<PlainObjectType>
-{
+template <typename PlainObjectType, int MapOptions, typename StrideType>
+struct traits<Map<PlainObjectType, MapOptions, StrideType> > : public traits<PlainObjectType> {
   typedef traits<PlainObjectType> TraitsBase;
-  typedef typename PlainObjectType::Index Index;
-  typedef typename PlainObjectType::Scalar Scalar;
   enum {
+    PlainObjectTypeInnerSize = ((traits<PlainObjectType>::Flags & RowMajorBit) == RowMajorBit)
+                                   ? PlainObjectType::ColsAtCompileTime
+                                   : PlainObjectType::RowsAtCompileTime,
+
     InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0
-                             ? int(PlainObjectType::InnerStrideAtCompileTime)
-                             : int(StrideType::InnerStrideAtCompileTime),
+                                   ? int(PlainObjectType::InnerStrideAtCompileTime)
+                                   : int(StrideType::InnerStrideAtCompileTime),
     OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0
-                             ? int(PlainObjectType::OuterStrideAtCompileTime)
-                             : int(StrideType::OuterStrideAtCompileTime),
-    HasNoInnerStride = InnerStrideAtCompileTime == 1,
-    HasNoOuterStride = StrideType::OuterStrideAtCompileTime == 0,
-    HasNoStride = HasNoInnerStride && HasNoOuterStride,
-    IsAligned = bool(EIGEN_ALIGN) && ((int(MapOptions)&Aligned)==Aligned),
-    IsDynamicSize = PlainObjectType::SizeAtCompileTime==Dynamic,
-    KeepsPacketAccess = bool(HasNoInnerStride)
-                        && ( bool(IsDynamicSize)
-                           || HasNoOuterStride
-                           || ( OuterStrideAtCompileTime!=Dynamic
-                           && ((static_cast<int>(sizeof(Scalar))*OuterStrideAtCompileTime)%16)==0 ) ),
+                                   ? (InnerStrideAtCompileTime == Dynamic || PlainObjectTypeInnerSize == Dynamic
+                                          ? Dynamic
+                                          : int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize))
+                                   : int(StrideType::OuterStrideAtCompileTime),
+    Alignment = int(MapOptions) & int(AlignedMask),
     Flags0 = TraitsBase::Flags & (~NestByRefBit),
-    Flags1 = IsAligned ? (int(Flags0) | AlignedBit) : (int(Flags0) & ~AlignedBit),
-    Flags2 = (bool(HasNoStride) || bool(PlainObjectType::IsVectorAtCompileTime))
-           ? int(Flags1) : int(Flags1 & ~LinearAccessBit),
-    Flags3 = is_lvalue<PlainObjectType>::value ? int(Flags2) : (int(Flags2) & ~LvalueBit),
-    Flags = KeepsPacketAccess ? int(Flags3) : (int(Flags3) & ~PacketAccessBit)
+    Flags = is_lvalue<PlainObjectType>::value ? int(Flags0) : (int(Flags0) & ~LvalueBit)
   };
-private:
-  enum { Options }; // Expressions don't have Options
-};
-}
-
-template<typename PlainObjectType, int MapOptions, typename StrideType> class Map
-  : public MapBase<Map<PlainObjectType, MapOptions, StrideType> >
-{
-  public:
-
-    typedef MapBase<Map> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(Map)
-
-    typedef typename Base::PointerType PointerType;
-#if EIGEN2_SUPPORT_STAGE <= STAGE30_FULL_EIGEN3_API
-    typedef const Scalar* PointerArgType;
-    inline PointerType cast_to_pointer_type(PointerArgType ptr) { return const_cast<PointerType>(ptr); }
-#else
-    typedef PointerType PointerArgType;
-    inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; }
-#endif
-
-    inline Index innerStride() const
-    {
-      return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
-    }
-
-    inline Index outerStride() const
-    {
-      return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
-           : IsVectorAtCompileTime ? this->size()
-           : int(Flags)&RowMajorBit ? this->cols()
-           : this->rows();
-    }
-
-    /** Constructor in the fixed-size case.
-      *
-      * \param dataPtr pointer to the array to map
-      * \param a_stride optional Stride object, passing the strides.
-      */
-    inline Map(PointerArgType dataPtr, const StrideType& a_stride = StrideType())
-      : Base(cast_to_pointer_type(dataPtr)), m_stride(a_stride)
-    {
-      PlainObjectType::Base::_check_template_params();
-    }
-
-    /** Constructor in the dynamic-size vector case.
-      *
-      * \param dataPtr pointer to the array to map
-      * \param a_size the size of the vector expression
-      * \param a_stride optional Stride object, passing the strides.
-      */
-    inline Map(PointerArgType dataPtr, Index a_size, const StrideType& a_stride = StrideType())
-      : Base(cast_to_pointer_type(dataPtr), a_size), m_stride(a_stride)
-    {
-      PlainObjectType::Base::_check_template_params();
-    }
-
-    /** Constructor in the dynamic-size matrix case.
-      *
-      * \param dataPtr pointer to the array to map
-      * \param nbRows the number of rows of the matrix expression
-      * \param nbCols the number of columns of the matrix expression
-      * \param a_stride optional Stride object, passing the strides.
-      */
-    inline Map(PointerArgType dataPtr, Index nbRows, Index nbCols, const StrideType& a_stride = StrideType())
-      : Base(cast_to_pointer_type(dataPtr), nbRows, nbCols), m_stride(a_stride)
-    {
-      PlainObjectType::Base::_check_template_params();
-    }
-
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
-
-  protected:
-    StrideType m_stride;
-};
 
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-inline Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>
-  ::Array(const Scalar *data)
-{
-  this->_set_noalias(Eigen::Map<const Array>(data));
-}
+ private:
+  enum { Options };  // Expressions don't have Options
+};
+}  // namespace internal
 
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-inline Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>
-  ::Matrix(const Scalar *data)
-{
-  this->_set_noalias(Eigen::Map<const Matrix>(data));
-}
+/** \class Map
+ * \ingroup Core_Module
+ *
+ * \brief A matrix or vector expression mapping an existing array of data.
+ *
+ * \tparam PlainObjectType the equivalent matrix type of the mapped data
+ * \tparam MapOptions specifies the pointer alignment in bytes. It can be: \c #Aligned128, \c #Aligned64, \c #Aligned32,
+ * \c #Aligned16, \c #Aligned8 or \c #Unaligned. The default is \c #Unaligned. \tparam StrideType optionally specifies
+ * strides. By default, Map assumes the memory layout of an ordinary, contiguous array. This can be overridden by
+ * specifying strides. The type passed here must be a specialization of the Stride template, see examples below.
+ *
+ * This class represents a matrix or vector expression mapping an existing array of data.
+ * It can be used to let Eigen interface without any overhead with non-Eigen data structures,
+ * such as plain C arrays or structures from other libraries. By default, it assumes that the
+ * data is laid out contiguously in memory. You can however override this by explicitly specifying
+ * inner and outer strides.
+ *
+ * Here's an example of simply mapping a contiguous array as a \ref TopicStorageOrders "column-major" matrix:
+ * \include Map_simple.cpp
+ * Output: \verbinclude Map_simple.out
+ *
+ * If you need to map non-contiguous arrays, you can do so by specifying strides:
+ *
+ * Here's an example of mapping an array as a vector, specifying an inner stride, that is, the pointer
+ * increment between two consecutive coefficients. Here, we're specifying the inner stride as a compile-time
+ * fixed value.
+ * \include Map_inner_stride.cpp
+ * Output: \verbinclude Map_inner_stride.out
+ *
+ * Here's an example of mapping an array while specifying an outer stride. Here, since we're mapping
+ * as a column-major matrix, 'outer stride' means the pointer increment between two consecutive columns.
+ * Here, we're specifying the outer stride as a runtime parameter. Note that here \c OuterStride<> is
+ * a short version of \c OuterStride<Dynamic> because the default template parameter of OuterStride
+ * is  \c Dynamic
+ * \include Map_outer_stride.cpp
+ * Output: \verbinclude Map_outer_stride.out
+ *
+ * For more details and for an example of specifying both an inner and an outer stride, see class Stride.
+ *
+ * \b Tip: to change the array of data mapped by a Map object, you can use the C++
+ * placement new syntax:
+ *
+ * Example: \include Map_placement_new.cpp
+ * Output: \verbinclude Map_placement_new.out
+ *
+ * This class is the return type of PlainObjectBase::Map() but can also be used directly.
+ *
+ * \sa PlainObjectBase::Map(), \ref TopicStorageOrders
+ */
+template <typename PlainObjectType, int MapOptions, typename StrideType>
+class Map : public MapBase<Map<PlainObjectType, MapOptions, StrideType> > {
+ public:
+  typedef MapBase<Map> Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(Map)
+
+  typedef typename Base::PointerType PointerType;
+  typedef PointerType PointerArgType;
+  EIGEN_DEVICE_FUNC inline PointerType cast_to_pointer_type(PointerArgType ptr) { return ptr; }
+
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const {
+    return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
+  }
+
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const {
+    return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
+           : internal::traits<Map>::OuterStrideAtCompileTime != Dynamic
+               ? Index(internal::traits<Map>::OuterStrideAtCompileTime)
+           : IsVectorAtCompileTime    ? (this->size() * innerStride())
+           : int(Flags) & RowMajorBit ? (this->cols() * innerStride())
+                                      : (this->rows() * innerStride());
+  }
+
+  /** Constructor in the fixed-size case.
+   *
+   * \param dataPtr pointer to the array to map
+   * \param stride optional Stride object, passing the strides.
+   */
+  EIGEN_DEVICE_FUNC explicit inline Map(PointerArgType dataPtr, const StrideType& stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr)), m_stride(stride) {}
+
+  /** Constructor in the dynamic-size vector case.
+   *
+   * \param dataPtr pointer to the array to map
+   * \param size the size of the vector expression
+   * \param stride optional Stride object, passing the strides.
+   */
+  EIGEN_DEVICE_FUNC inline Map(PointerArgType dataPtr, Index size, const StrideType& stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr), size), m_stride(stride) {}
+
+  /** Constructor in the dynamic-size matrix case.
+   *
+   * \param dataPtr pointer to the array to map
+   * \param rows the number of rows of the matrix expression
+   * \param cols the number of columns of the matrix expression
+   * \param stride optional Stride object, passing the strides.
+   */
+  EIGEN_DEVICE_FUNC inline Map(PointerArgType dataPtr, Index rows, Index cols, const StrideType& stride = StrideType())
+      : Base(cast_to_pointer_type(dataPtr), rows, cols), m_stride(stride) {}
+
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
+
+ protected:
+  StrideType m_stride;
+};
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_MAP_H
+#endif  // EIGEN_MAP_H
diff --git a/inst/include/Eigen/src/Core/MapBase.h b/inst/include/Eigen/src/Core/MapBase.h
index a9828f7f..5e3d746b 100644
--- a/inst/include/Eigen/src/Core/MapBase.h
+++ b/inst/include/Eigen/src/Core/MapBase.h
@@ -11,237 +11,273 @@
 #ifndef EIGEN_MAPBASE_H
 #define EIGEN_MAPBASE_H
 
-#define EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived) \
-      EIGEN_STATIC_ASSERT((int(internal::traits<Derived>::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \
-                          YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT)
-
-namespace Eigen { 
-
-/** \class MapBase
-  * \ingroup Core_Module
-  *
-  * \brief Base class for Map and Block expression with direct access
-  *
-  * \sa class Map, class Block
-  */
-template<typename Derived> class MapBase<Derived, ReadOnlyAccessors>
-  : public internal::dense_xpr_base<Derived>::type
-{
-  public:
-
-    typedef typename internal::dense_xpr_base<Derived>::type Base;
-    enum {
-      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
-      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
-      SizeAtCompileTime = Base::SizeAtCompileTime
-    };
-
-    typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
-    typedef typename internal::traits<Derived>::Scalar Scalar;
-    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename internal::conditional<
-                         bool(internal::is_lvalue<Derived>::value),
-                         Scalar *,
-                         const Scalar *>::type
-                     PointerType;
-
-    using Base::derived;
-//    using Base::RowsAtCompileTime;
-//    using Base::ColsAtCompileTime;
-//    using Base::SizeAtCompileTime;
-    using Base::MaxRowsAtCompileTime;
-    using Base::MaxColsAtCompileTime;
-    using Base::MaxSizeAtCompileTime;
-    using Base::IsVectorAtCompileTime;
-    using Base::Flags;
-    using Base::IsRowMajor;
-
-    using Base::rows;
-    using Base::cols;
-    using Base::size;
-    using Base::coeff;
-    using Base::coeffRef;
-    using Base::lazyAssign;
-    using Base::eval;
-
-    using Base::innerStride;
-    using Base::outerStride;
-    using Base::rowStride;
-    using Base::colStride;
-
-    // bug 217 - compile error on ICC 11.1
-    using Base::operator=;
-
-    typedef typename Base::CoeffReturnType CoeffReturnType;
-
-    inline Index rows() const { return m_rows.value(); }
-    inline Index cols() const { return m_cols.value(); }
-
-    /** Returns a pointer to the first coefficient of the matrix or vector.
-      *
-      * \note When addressing this data, make sure to honor the strides returned by innerStride() and outerStride().
-      *
-      * \sa innerStride(), outerStride()
-      */
-    inline const Scalar* data() const { return m_data; }
-
-    inline const Scalar& coeff(Index rowId, Index colId) const
-    {
-      return m_data[colId * colStride() + rowId * rowStride()];
-    }
-
-    inline const Scalar& coeff(Index index) const
-    {
-      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
-      return m_data[index * innerStride()];
-    }
-
-    inline const Scalar& coeffRef(Index rowId, Index colId) const
-    {
-      return this->m_data[colId * colStride() + rowId * rowStride()];
-    }
-
-    inline const Scalar& coeffRef(Index index) const
-    {
-      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
-      return this->m_data[index * innerStride()];
-    }
-
-    template<int LoadMode>
-    inline PacketScalar packet(Index rowId, Index colId) const
-    {
-      return internal::ploadt<PacketScalar, LoadMode>
-               (m_data + (colId * colStride() + rowId * rowStride()));
-    }
-
-    template<int LoadMode>
-    inline PacketScalar packet(Index index) const
-    {
-      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
-      return internal::ploadt<PacketScalar, LoadMode>(m_data + index * innerStride());
-    }
-
-    explicit inline MapBase(PointerType dataPtr) : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime)
-    {
-      EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
-      checkSanity();
-    }
-
-    inline MapBase(PointerType dataPtr, Index vecSize)
-            : m_data(dataPtr),
-              m_rows(RowsAtCompileTime == Dynamic ? vecSize : Index(RowsAtCompileTime)),
-              m_cols(ColsAtCompileTime == Dynamic ? vecSize : Index(ColsAtCompileTime))
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-      eigen_assert(vecSize >= 0);
-      eigen_assert(dataPtr == 0 || SizeAtCompileTime == Dynamic || SizeAtCompileTime == vecSize);
-      checkSanity();
-    }
-
-    inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols)
-            : m_data(dataPtr), m_rows(nbRows), m_cols(nbCols)
-    {
-      eigen_assert( (dataPtr == 0)
-              || (   nbRows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == nbRows)
-                  && nbCols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == nbCols)));
-      checkSanity();
-    }
-
-  protected:
-
-    void checkSanity() const
-    {
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(internal::traits<Derived>::Flags&PacketAccessBit,
-                                        internal::inner_stride_at_compile_time<Derived>::ret==1),
-                          PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1);
-      eigen_assert(EIGEN_IMPLIES(internal::traits<Derived>::Flags&AlignedBit, (size_t(m_data) % 16) == 0)
-                   && "input pointer is not aligned on a 16 byte boundary");
-    }
-
-    PointerType m_data;
-    const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_rows;
-    const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_cols;
+#define EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)                                                               \
+  EIGEN_STATIC_ASSERT((int(internal::evaluator<Derived>::Flags) & LinearAccessBit) || Derived::IsVectorAtCompileTime, \
+                      YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT)
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \ingroup Core_Module
+ *
+ * \brief Base class for dense Map and Block expression with direct access
+ *
+ * This base class provides the const low-level accessors (e.g. coeff, coeffRef) of dense
+ * Map and Block objects with direct access.
+ * Typical users do not have to directly deal with this class.
+ *
+ * This class can be extended by through the macro plugin \c EIGEN_MAPBASE_PLUGIN.
+ * See \link TopicCustomizing_Plugins customizing Eigen \endlink for details.
+ *
+ * The \c Derived class has to provide the following two methods describing the memory layout:
+ *  \code Index innerStride() const; \endcode
+ *  \code Index outerStride() const; \endcode
+ *
+ * \sa class Map, class Block
+ */
+template <typename Derived>
+class MapBase<Derived, ReadOnlyAccessors> : public internal::dense_xpr_base<Derived>::type {
+ public:
+  typedef typename internal::dense_xpr_base<Derived>::type Base;
+  enum {
+    RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+    ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+    InnerStrideAtCompileTime = internal::traits<Derived>::InnerStrideAtCompileTime,
+    SizeAtCompileTime = Base::SizeAtCompileTime
+  };
+
+  typedef typename internal::traits<Derived>::StorageKind StorageKind;
+  typedef typename internal::traits<Derived>::Scalar Scalar;
+  typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef std::conditional_t<bool(internal::is_lvalue<Derived>::value), Scalar*, const Scalar*> PointerType;
+
+  using Base::derived;
+  //    using Base::RowsAtCompileTime;
+  //    using Base::ColsAtCompileTime;
+  //    using Base::SizeAtCompileTime;
+  using Base::Flags;
+  using Base::IsRowMajor;
+  using Base::IsVectorAtCompileTime;
+  using Base::MaxColsAtCompileTime;
+  using Base::MaxRowsAtCompileTime;
+  using Base::MaxSizeAtCompileTime;
+
+  using Base::coeff;
+  using Base::coeffRef;
+  using Base::cols;
+  using Base::eval;
+  using Base::lazyAssign;
+  using Base::rows;
+  using Base::size;
+
+  using Base::colStride;
+  using Base::innerStride;
+  using Base::outerStride;
+  using Base::rowStride;
+
+  // bug 217 - compile error on ICC 11.1
+  using Base::operator=;
+
+  typedef typename Base::CoeffReturnType CoeffReturnType;
+
+  /** \copydoc DenseBase::rows() */
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_rows.value(); }
+  /** \copydoc DenseBase::cols() */
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_cols.value(); }
+
+  /** Returns a pointer to the first coefficient of the matrix or vector.
+   *
+   * \note When addressing this data, make sure to honor the strides returned by innerStride() and outerStride().
+   *
+   * \sa innerStride(), outerStride()
+   */
+  EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_data; }
+
+  /** \copydoc PlainObjectBase::coeff(Index,Index) const */
+  EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index rowId, Index colId) const {
+    return m_data[colId * colStride() + rowId * rowStride()];
+  }
+
+  /** \copydoc PlainObjectBase::coeff(Index) const */
+  EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index index) const {
+    EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
+    return m_data[index * innerStride()];
+  }
+
+  /** \copydoc PlainObjectBase::coeffRef(Index,Index) const */
+  EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index rowId, Index colId) const {
+    return this->m_data[colId * colStride() + rowId * rowStride()];
+  }
+
+  /** \copydoc PlainObjectBase::coeffRef(Index) const */
+  EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const {
+    EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
+    return this->m_data[index * innerStride()];
+  }
+
+  /** \internal */
+  template <int LoadMode>
+  inline PacketScalar packet(Index rowId, Index colId) const {
+    return internal::ploadt<PacketScalar, LoadMode>(m_data + (colId * colStride() + rowId * rowStride()));
+  }
+
+  /** \internal */
+  template <int LoadMode>
+  inline PacketScalar packet(Index index) const {
+    EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
+    return internal::ploadt<PacketScalar, LoadMode>(m_data + index * innerStride());
+  }
+
+  /** \internal Constructor for fixed size matrices or vectors */
+  EIGEN_DEVICE_FUNC explicit inline MapBase(PointerType dataPtr)
+      : m_data(dataPtr), m_rows(RowsAtCompileTime), m_cols(ColsAtCompileTime) {
+    EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived)
+    checkSanity<Derived>();
+  }
+
+  /** \internal Constructor for dynamically sized vectors */
+  EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index vecSize)
+      : m_data(dataPtr),
+        m_rows(RowsAtCompileTime == Dynamic ? vecSize : Index(RowsAtCompileTime)),
+        m_cols(ColsAtCompileTime == Dynamic ? vecSize : Index(ColsAtCompileTime)) {
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+    eigen_assert(vecSize >= 0);
+    eigen_assert(dataPtr == 0 || SizeAtCompileTime == Dynamic || SizeAtCompileTime == vecSize);
+    checkSanity<Derived>();
+  }
+
+  /** \internal Constructor for dynamically sized matrices */
+  EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index rows, Index cols)
+      : m_data(dataPtr), m_rows(rows), m_cols(cols) {
+    eigen_assert((dataPtr == 0) || (rows >= 0 && (RowsAtCompileTime == Dynamic || RowsAtCompileTime == rows) &&
+                                    cols >= 0 && (ColsAtCompileTime == Dynamic || ColsAtCompileTime == cols)));
+    checkSanity<Derived>();
+  }
+
+#ifdef EIGEN_MAPBASE_PLUGIN
+#include EIGEN_MAPBASE_PLUGIN
+#endif
+
+ protected:
+  EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
+  EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)
+
+  template <typename T>
+  EIGEN_DEVICE_FUNC void checkSanity(std::enable_if_t<(internal::traits<T>::Alignment > 0), void*> = 0) const {
+// Temporary macro to allow scalars to not be properly aligned.  This is while we sort out failures
+// in TensorFlow Lite that are currently relying on this UB.
+#ifndef EIGEN_ALLOW_UNALIGNED_SCALARS
+    // Pointer must be aligned to the Scalar type, otherwise we get UB.
+    eigen_assert((std::uintptr_t(m_data) % alignof(Scalar) == 0) && "data is not scalar-aligned");
+#endif
+#if EIGEN_MAX_ALIGN_BYTES > 0
+    // innerStride() is not set yet when this function is called, so we optimistically assume the lowest plausible
+    // value:
+    const Index minInnerStride = InnerStrideAtCompileTime == Dynamic ? 1 : Index(InnerStrideAtCompileTime);
+    EIGEN_ONLY_USED_FOR_DEBUG(minInnerStride);
+    eigen_assert((((std::uintptr_t(m_data) % internal::traits<Derived>::Alignment) == 0) ||
+                  (cols() * rows() * minInnerStride * sizeof(Scalar)) < internal::traits<Derived>::Alignment) &&
+                 "data is not aligned");
+#endif
+  }
+
+  template <typename T>
+  EIGEN_DEVICE_FUNC void checkSanity(std::enable_if_t<internal::traits<T>::Alignment == 0, void*> = 0) const {
+#ifndef EIGEN_ALLOW_UNALIGNED_SCALARS
+    // Pointer must be aligned to the Scalar type, otherwise we get UB.
+    eigen_assert((std::uintptr_t(m_data) % alignof(Scalar) == 0) && "data is not scalar-aligned");
+#endif
+  }
+
+  PointerType m_data;
+  const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_rows;
+  const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_cols;
 };
 
-template<typename Derived> class MapBase<Derived, WriteAccessors>
-  : public MapBase<Derived, ReadOnlyAccessors>
-{
-    typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase;
-  public:
-
-    typedef MapBase<Derived, ReadOnlyAccessors> Base;
-
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::PacketScalar PacketScalar;
-    typedef typename Base::Index Index;
-    typedef typename Base::PointerType PointerType;
-
-    using Base::derived;
-    using Base::rows;
-    using Base::cols;
-    using Base::size;
-    using Base::coeff;
-    using Base::coeffRef;
-
-    using Base::innerStride;
-    using Base::outerStride;
-    using Base::rowStride;
-    using Base::colStride;
-
-    typedef typename internal::conditional<
-                    internal::is_lvalue<Derived>::value,
-                    Scalar,
-                    const Scalar
-                  >::type ScalarWithConstIfNotLvalue;
-
-    inline const Scalar* data() const { return this->m_data; }
-    inline ScalarWithConstIfNotLvalue* data() { return this->m_data; } // no const-cast here so non-const-correct code will give a compile error
-
-    inline ScalarWithConstIfNotLvalue& coeffRef(Index row, Index col)
-    {
-      return this->m_data[col * colStride() + row * rowStride()];
-    }
-
-    inline ScalarWithConstIfNotLvalue& coeffRef(Index index)
-    {
-      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
-      return this->m_data[index * innerStride()];
-    }
-
-    template<int StoreMode>
-    inline void writePacket(Index row, Index col, const PacketScalar& val)
-    {
-      internal::pstoret<Scalar, PacketScalar, StoreMode>
-               (this->m_data + (col * colStride() + row * rowStride()), val);
-    }
-
-    template<int StoreMode>
-    inline void writePacket(Index index, const PacketScalar& val)
-    {
-      EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
-      internal::pstoret<Scalar, PacketScalar, StoreMode>
-                (this->m_data + index * innerStride(), val);
-    }
-
-    explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {}
-    inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {}
-    inline MapBase(PointerType dataPtr, Index nbRows, Index nbCols) : Base(dataPtr, nbRows, nbCols) {}
-
-    Derived& operator=(const MapBase& other)
-    {
-      ReadOnlyMapBase::Base::operator=(other);
-      return derived();
-    }
-
-    // In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base,
-    // see bugs 821 and 920.
-    using ReadOnlyMapBase::Base::operator=;
+/** \ingroup Core_Module
+ *
+ * \brief Base class for non-const dense Map and Block expression with direct access
+ *
+ * This base class provides the non-const low-level accessors (e.g. coeff and coeffRef) of
+ * dense Map and Block objects with direct access.
+ * It inherits MapBase<Derived, ReadOnlyAccessors> which defines the const variant for reading specific entries.
+ *
+ * \sa class Map, class Block
+ */
+template <typename Derived>
+class MapBase<Derived, WriteAccessors> : public MapBase<Derived, ReadOnlyAccessors> {
+  typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase;
+
+ public:
+  typedef MapBase<Derived, ReadOnlyAccessors> Base;
+
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::PacketScalar PacketScalar;
+  typedef typename Base::StorageIndex StorageIndex;
+  typedef typename Base::PointerType PointerType;
+
+  using Base::coeff;
+  using Base::coeffRef;
+  using Base::cols;
+  using Base::derived;
+  using Base::rows;
+  using Base::size;
+
+  using Base::colStride;
+  using Base::innerStride;
+  using Base::outerStride;
+  using Base::rowStride;
+
+  typedef std::conditional_t<internal::is_lvalue<Derived>::value, Scalar, const Scalar> ScalarWithConstIfNotLvalue;
+
+  EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return this->m_data; }
+  EIGEN_DEVICE_FUNC constexpr ScalarWithConstIfNotLvalue* data() {
+    return this->m_data;
+  }  // no const-cast here so non-const-correct code will give a compile error
+
+  EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue& coeffRef(Index row, Index col) {
+    return this->m_data[col * colStride() + row * rowStride()];
+  }
+
+  EIGEN_DEVICE_FUNC inline ScalarWithConstIfNotLvalue& coeffRef(Index index) {
+    EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
+    return this->m_data[index * innerStride()];
+  }
+
+  template <int StoreMode>
+  inline void writePacket(Index row, Index col, const PacketScalar& val) {
+    internal::pstoret<Scalar, PacketScalar, StoreMode>(this->m_data + (col * colStride() + row * rowStride()), val);
+  }
+
+  template <int StoreMode>
+  inline void writePacket(Index index, const PacketScalar& val) {
+    EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS(Derived)
+    internal::pstoret<Scalar, PacketScalar, StoreMode>(this->m_data + index * innerStride(), val);
+  }
+
+  EIGEN_DEVICE_FUNC explicit inline MapBase(PointerType dataPtr) : Base(dataPtr) {}
+  EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index vecSize) : Base(dataPtr, vecSize) {}
+  EIGEN_DEVICE_FUNC inline MapBase(PointerType dataPtr, Index rows, Index cols) : Base(dataPtr, rows, cols) {}
+
+  EIGEN_DEVICE_FUNC Derived& operator=(const MapBase& other) {
+    ReadOnlyMapBase::Base::operator=(other);
+    return derived();
+  }
+
+  // In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base,
+  // see bugs 821 and 920.
+  using ReadOnlyMapBase::Base::operator=;
+
+ protected:
+  EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase)
+  EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase)
 };
 
 #undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_MAPBASE_H
+#endif  // EIGEN_MAPBASE_H
diff --git a/inst/include/Eigen/src/Core/MathFunctions.h b/inst/include/Eigen/src/Core/MathFunctions.h
index adf2f9c5..5e36ce84 100644
--- a/inst/include/Eigen/src/Core/MathFunctions.h
+++ b/inst/include/Eigen/src/Core/MathFunctions.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,719 +11,2022 @@
 #ifndef EIGEN_MATHFUNCTIONS_H
 #define EIGEN_MATHFUNCTIONS_H
 
+// TODO this should better be moved to NumTraits
+// Source: WolframAlpha
+#define EIGEN_PI 3.141592653589793238462643383279502884197169399375105820974944592307816406L
+#define EIGEN_LOG2E 1.442695040888963407359924681001892137426645954152985934135449406931109219L
+#define EIGEN_LN2 0.693147180559945309417232121458176568075500134360255254120680009493393621L
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
 
-/** \internal \struct global_math_functions_filtering_base
-  *
-  * What it does:
-  * Defines a typedef 'type' as follows:
-  * - if type T has a member typedef Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl, then
-  *   global_math_functions_filtering_base<T>::type is a typedef for it.
-  * - otherwise, global_math_functions_filtering_base<T>::type is a typedef for T.
-  *
-  * How it's used:
-  * To allow to defined the global math functions (like sin...) in certain cases, like the Array expressions.
-  * When you do sin(array1+array2), the object array1+array2 has a complicated expression type, all what you want to know
-  * is that it inherits ArrayBase. So we implement a partial specialization of sin_impl for ArrayBase<Derived>.
-  * So we must make sure to use sin_impl<ArrayBase<Derived> > and not sin_impl<Derived>, otherwise our partial specialization
-  * won't be used. How does sin know that? That's exactly what global_math_functions_filtering_base tells it.
-  *
-  * How it's implemented:
-  * SFINAE in the style of enable_if. Highly susceptible of breaking compilers. With GCC, it sure does work, but if you replace
-  * the typename dummy by an integer template parameter, it doesn't work anymore!
-  */
-
-template<typename T, typename dummy = void>
-struct global_math_functions_filtering_base
-{
+/** \internal \class global_math_functions_filtering_base
+ *
+ * What it does:
+ * Defines a typedef 'type' as follows:
+ * - if type T has a member typedef Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl, then
+ *   global_math_functions_filtering_base<T>::type is a typedef for it.
+ * - otherwise, global_math_functions_filtering_base<T>::type is a typedef for T.
+ *
+ * How it's used:
+ * To allow to defined the global math functions (like sin...) in certain cases, like the Array expressions.
+ * When you do sin(array1+array2), the object array1+array2 has a complicated expression type, all what you want to know
+ * is that it inherits ArrayBase. So we implement a partial specialization of sin_impl for ArrayBase<Derived>.
+ * So we must make sure to use sin_impl<ArrayBase<Derived> > and not sin_impl<Derived>, otherwise our partial
+ * specialization won't be used. How does sin know that? That's exactly what global_math_functions_filtering_base tells
+ * it.
+ *
+ * How it's implemented:
+ * SFINAE in the style of enable_if. Highly susceptible of breaking compilers. With GCC, it sure does work, but if you
+ * replace the typename dummy by an integer template parameter, it doesn't work anymore!
+ */
+
+template <typename T, typename dummy = void>
+struct global_math_functions_filtering_base {
   typedef T type;
 };
 
-template<typename T> struct always_void { typedef void type; };
+template <typename T>
+struct always_void {
+  typedef void type;
+};
 
-template<typename T>
-struct global_math_functions_filtering_base
-  <T,
-   typename always_void<typename T::Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl>::type
-  >
-{
+template <typename T>
+struct global_math_functions_filtering_base<
+    T, typename always_void<typename T::Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl>::type> {
   typedef typename T::Eigen_BaseClassForSpecializationOfGlobalMathFuncImpl type;
 };
 
-#define EIGEN_MATHFUNC_IMPL(func, scalar) Eigen::internal::func##_impl<typename Eigen::internal::global_math_functions_filtering_base<scalar>::type>
-#define EIGEN_MATHFUNC_RETVAL(func, scalar) typename Eigen::internal::func##_retval<typename Eigen::internal::global_math_functions_filtering_base<scalar>::type>::type
+#define EIGEN_MATHFUNC_IMPL(func, scalar) \
+  Eigen::internal::func##_impl<typename Eigen::internal::global_math_functions_filtering_base<scalar>::type>
+#define EIGEN_MATHFUNC_RETVAL(func, scalar) \
+  typename Eigen::internal::func##_retval<  \
+      typename Eigen::internal::global_math_functions_filtering_base<scalar>::type>::type
 
 /****************************************************************************
-* Implementation of real                                                 *
-****************************************************************************/
+ * Implementation of real                                                 *
+ ****************************************************************************/
 
-template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
-struct real_default_impl
-{
+template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct real_default_impl {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline RealScalar run(const Scalar& x)
-  {
-    return x;
-  }
+  EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) { return x; }
 };
 
-template<typename Scalar>
-struct real_default_impl<Scalar,true>
-{
+template <typename Scalar>
+struct real_default_impl<Scalar, true> {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline RealScalar run(const Scalar& x)
-  {
+  EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) {
     using std::real;
     return real(x);
   }
 };
 
-template<typename Scalar> struct real_impl : real_default_impl<Scalar> {};
+template <typename Scalar>
+struct real_impl : real_default_impl<Scalar> {};
 
-template<typename Scalar>
-struct real_retval
-{
-  typedef typename NumTraits<Scalar>::Real type;
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+template <typename T>
+struct real_impl<std::complex<T>> {
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC static inline T run(const std::complex<T>& x) { return x.real(); }
 };
+#endif
 
+template <typename Scalar>
+struct real_retval {
+  typedef typename NumTraits<Scalar>::Real type;
+};
 
 /****************************************************************************
-* Implementation of imag                                                 *
-****************************************************************************/
+ * Implementation of imag                                                 *
+ ****************************************************************************/
 
-template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
-struct imag_default_impl
-{
+template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct imag_default_impl {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline RealScalar run(const Scalar&)
-  {
-    return RealScalar(0);
-  }
+  EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar&) { return RealScalar(0); }
 };
 
-template<typename Scalar>
-struct imag_default_impl<Scalar,true>
-{
+template <typename Scalar>
+struct imag_default_impl<Scalar, true> {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline RealScalar run(const Scalar& x)
-  {
+  EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) {
     using std::imag;
     return imag(x);
   }
 };
 
-template<typename Scalar> struct imag_impl : imag_default_impl<Scalar> {};
+template <typename Scalar>
+struct imag_impl : imag_default_impl<Scalar> {};
 
-template<typename Scalar>
-struct imag_retval
-{
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+template <typename T>
+struct imag_impl<std::complex<T>> {
+  typedef T RealScalar;
+  EIGEN_DEVICE_FUNC static inline T run(const std::complex<T>& x) { return x.imag(); }
+};
+#endif
+
+template <typename Scalar>
+struct imag_retval {
   typedef typename NumTraits<Scalar>::Real type;
 };
 
 /****************************************************************************
-* Implementation of real_ref                                             *
-****************************************************************************/
+ * Implementation of real_ref                                             *
+ ****************************************************************************/
 
-template<typename Scalar>
-struct real_ref_impl
-{
+template <typename Scalar>
+struct real_ref_impl {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline RealScalar& run(Scalar& x)
-  {
-    return reinterpret_cast<RealScalar*>(&x)[0];
-  }
-  static inline const RealScalar& run(const Scalar& x)
-  {
+  EIGEN_DEVICE_FUNC static inline RealScalar& run(Scalar& x) { return reinterpret_cast<RealScalar*>(&x)[0]; }
+  EIGEN_DEVICE_FUNC static inline const RealScalar& run(const Scalar& x) {
     return reinterpret_cast<const RealScalar*>(&x)[0];
   }
 };
 
-template<typename Scalar>
-struct real_ref_retval
-{
-  typedef typename NumTraits<Scalar>::Real & type;
+template <typename Scalar>
+struct real_ref_retval {
+  typedef typename NumTraits<Scalar>::Real& type;
 };
 
 /****************************************************************************
-* Implementation of imag_ref                                             *
-****************************************************************************/
+ * Implementation of imag_ref                                             *
+ ****************************************************************************/
 
-template<typename Scalar, bool IsComplex>
-struct imag_ref_default_impl
-{
+template <typename Scalar, bool IsComplex>
+struct imag_ref_default_impl {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline RealScalar& run(Scalar& x)
-  {
-    return reinterpret_cast<RealScalar*>(&x)[1];
-  }
-  static inline const RealScalar& run(const Scalar& x)
-  {
-    return reinterpret_cast<RealScalar*>(&x)[1];
+  EIGEN_DEVICE_FUNC static inline RealScalar& run(Scalar& x) { return reinterpret_cast<RealScalar*>(&x)[1]; }
+  EIGEN_DEVICE_FUNC static inline const RealScalar& run(const Scalar& x) {
+    return reinterpret_cast<const RealScalar*>(&x)[1];
   }
 };
 
-template<typename Scalar>
-struct imag_ref_default_impl<Scalar, false>
-{
-  static inline Scalar run(Scalar&)
-  {
-    return Scalar(0);
-  }
-  static inline const Scalar run(const Scalar&)
-  {
-    return Scalar(0);
-  }
+template <typename Scalar>
+struct imag_ref_default_impl<Scalar, false> {
+  EIGEN_DEVICE_FUNC constexpr static Scalar run(Scalar&) { return Scalar(0); }
+  EIGEN_DEVICE_FUNC constexpr static const Scalar run(const Scalar&) { return Scalar(0); }
 };
 
-template<typename Scalar>
+template <typename Scalar>
 struct imag_ref_impl : imag_ref_default_impl<Scalar, NumTraits<Scalar>::IsComplex> {};
 
-template<typename Scalar>
-struct imag_ref_retval
-{
-  typedef typename NumTraits<Scalar>::Real & type;
+template <typename Scalar>
+struct imag_ref_retval {
+  typedef typename NumTraits<Scalar>::Real& type;
 };
 
+}  // namespace internal
+
+namespace numext {
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(real, Scalar) real(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(real, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline internal::add_const_on_value_type_t<EIGEN_MATHFUNC_RETVAL(real_ref, Scalar)> real_ref(
+    const Scalar& x) {
+  return internal::real_ref_impl<Scalar>::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) real_ref(Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(real_ref, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(imag, Scalar) imag(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar select(const Scalar& mask, const Scalar& a, const Scalar& b) {
+  return numext::is_exactly_zero(mask) ? b : a;
+}
+
+}  // namespace numext
+
+namespace internal {
+
 /****************************************************************************
-* Implementation of conj                                                 *
-****************************************************************************/
+ * Implementation of conj                                                 *
+ ****************************************************************************/
 
-template<typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
-struct conj_impl
-{
-  static inline Scalar run(const Scalar& x)
-  {
-    return x;
-  }
+template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct conj_default_impl {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) { return x; }
 };
 
-template<typename Scalar>
-struct conj_impl<Scalar,true>
-{
-  static inline Scalar run(const Scalar& x)
-  {
+template <typename Scalar>
+struct conj_default_impl<Scalar, true> {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) {
     using std::conj;
     return conj(x);
   }
 };
 
-template<typename Scalar>
-struct conj_retval
-{
+template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct conj_impl : conj_default_impl<Scalar, IsComplex> {};
+
+template <typename Scalar>
+struct conj_retval {
   typedef Scalar type;
 };
 
 /****************************************************************************
-* Implementation of abs2                                                 *
-****************************************************************************/
+ * Implementation of abs2                                                 *
+ ****************************************************************************/
 
-template<typename Scalar>
-struct abs2_impl
+template <typename Scalar, bool IsComplex>
+struct abs2_impl_default {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) { return x * x; }
+};
+
+template <typename Scalar>
+struct abs2_impl_default<Scalar, true>  // IsComplex
 {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline RealScalar run(const Scalar& x)
-  {
-    return x*x;
+  EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) {
+    return numext::real(x) * numext::real(x) + numext::imag(x) * numext::imag(x);
   }
 };
 
-template<typename RealScalar>
-struct abs2_impl<std::complex<RealScalar> >
-{
-  static inline RealScalar run(const std::complex<RealScalar>& x)
-  {
-    return real(x)*real(x) + imag(x)*imag(x);
+template <typename Scalar>
+struct abs2_impl {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) {
+    return abs2_impl_default<Scalar, NumTraits<Scalar>::IsComplex>::run(x);
   }
 };
 
-template<typename Scalar>
-struct abs2_retval
-{
+template <typename Scalar>
+struct abs2_retval {
   typedef typename NumTraits<Scalar>::Real type;
 };
 
 /****************************************************************************
-* Implementation of norm1                                                *
-****************************************************************************/
+ * Implementation of sqrt/rsqrt                                             *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct sqrt_impl {
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE Scalar run(const Scalar& x) {
+    EIGEN_USING_STD(sqrt);
+    return sqrt(x);
+  }
+};
+
+// Complex sqrt defined in MathFunctionsImpl.h.
+template <typename ComplexT>
+EIGEN_DEVICE_FUNC ComplexT complex_sqrt(const ComplexT& a_x);
+
+// Custom implementation is faster than `std::sqrt`, works on
+// GPU, and correctly handles special cases (unlike MSVC).
+template <typename T>
+struct sqrt_impl<std::complex<T>> {
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE std::complex<T> run(const std::complex<T>& x) { return complex_sqrt(x); }
+};
+
+template <typename Scalar>
+struct sqrt_retval {
+  typedef Scalar type;
+};
+
+// Default implementation relies on numext::sqrt, at bottom of file.
+template <typename T>
+struct rsqrt_impl;
+
+// Complex rsqrt defined in MathFunctionsImpl.h.
+template <typename ComplexT>
+EIGEN_DEVICE_FUNC ComplexT complex_rsqrt(const ComplexT& a_x);
+
+template <typename T>
+struct rsqrt_impl<std::complex<T>> {
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE std::complex<T> run(const std::complex<T>& x) {
+    return complex_rsqrt(x);
+  }
+};
+
+template <typename Scalar>
+struct rsqrt_retval {
+  typedef Scalar type;
+};
+
+/****************************************************************************
+ * Implementation of norm1                                                *
+ ****************************************************************************/
 
-template<typename Scalar, bool IsComplex>
-struct norm1_default_impl
-{
+template <typename Scalar, bool IsComplex>
+struct norm1_default_impl;
+
+template <typename Scalar>
+struct norm1_default_impl<Scalar, true> {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline RealScalar run(const Scalar& x)
-  {
-    using std::abs;
-    return abs(real(x)) + abs(imag(x));
+  EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) {
+    EIGEN_USING_STD(abs);
+    return abs(numext::real(x)) + abs(numext::imag(x));
   }
 };
 
-template<typename Scalar>
-struct norm1_default_impl<Scalar, false>
-{
-  static inline Scalar run(const Scalar& x)
-  {
-    using std::abs;
+template <typename Scalar>
+struct norm1_default_impl<Scalar, false> {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) {
+    EIGEN_USING_STD(abs);
     return abs(x);
   }
 };
 
-template<typename Scalar>
+template <typename Scalar>
 struct norm1_impl : norm1_default_impl<Scalar, NumTraits<Scalar>::IsComplex> {};
 
-template<typename Scalar>
-struct norm1_retval
-{
+template <typename Scalar>
+struct norm1_retval {
   typedef typename NumTraits<Scalar>::Real type;
 };
 
 /****************************************************************************
-* Implementation of hypot                                                *
-****************************************************************************/
+ * Implementation of hypot                                                *
+ ****************************************************************************/
 
-template<typename Scalar>
-struct hypot_impl
-{
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline RealScalar run(const Scalar& x, const Scalar& y)
-  {
-    using std::max;
-    using std::min;
-    using std::abs;
-    using std::sqrt;
-    RealScalar _x = abs(x);
-    RealScalar _y = abs(y);
-    RealScalar p = (max)(_x, _y);
-    if(p==RealScalar(0)) return RealScalar(0);
-    RealScalar q = (min)(_x, _y);
-    RealScalar qp = q/p;
-    return p * sqrt(RealScalar(1) + qp*qp);
-  }
-};
-
-template<typename Scalar>
-struct hypot_retval
-{
+template <typename Scalar>
+struct hypot_impl;
+
+template <typename Scalar>
+struct hypot_retval {
   typedef typename NumTraits<Scalar>::Real type;
 };
 
 /****************************************************************************
-* Implementation of cast                                                 *
-****************************************************************************/
+ * Implementation of cast                                                 *
+ ****************************************************************************/
 
-template<typename OldType, typename NewType>
-struct cast_impl
-{
-  static inline NewType run(const OldType& x)
-  {
-    return static_cast<NewType>(x);
+template <typename OldType, typename NewType, typename EnableIf = void>
+struct cast_impl {
+  EIGEN_DEVICE_FUNC static inline NewType run(const OldType& x) { return static_cast<NewType>(x); }
+};
+
+template <typename OldType>
+struct cast_impl<OldType, bool> {
+  EIGEN_DEVICE_FUNC static inline bool run(const OldType& x) { return x != OldType(0); }
+};
+
+// Casting from S -> Complex<T> leads to an implicit conversion from S to T,
+// generating warnings on clang.  Here we explicitly cast the real component.
+template <typename OldType, typename NewType>
+struct cast_impl<OldType, NewType,
+                 typename std::enable_if_t<!NumTraits<OldType>::IsComplex && NumTraits<NewType>::IsComplex>> {
+  EIGEN_DEVICE_FUNC static inline NewType run(const OldType& x) {
+    typedef typename NumTraits<NewType>::Real NewReal;
+    return static_cast<NewType>(static_cast<NewReal>(x));
   }
 };
 
 // here, for once, we're plainly returning NewType: we don't want cast to do weird things.
 
-template<typename OldType, typename NewType>
-inline NewType cast(const OldType& x)
-{
+template <typename OldType, typename NewType>
+EIGEN_DEVICE_FUNC inline NewType cast(const OldType& x) {
   return cast_impl<OldType, NewType>::run(x);
 }
 
 /****************************************************************************
-* Implementation of atanh2                                                *
-****************************************************************************/
-
-template<typename Scalar, bool IsInteger>
-struct atanh2_default_impl
-{
-  typedef Scalar retval;
+ * Implementation of arg                                                     *
+ ****************************************************************************/
+
+// Visual Studio 2017 has a bug where arg(float) returns 0 for negative inputs.
+// This seems to be fixed in VS 2019.
+#if (!EIGEN_COMP_MSVC || EIGEN_COMP_MSVC >= 1920)
+// std::arg is only defined for types of std::complex, or integer types or float/double/long double
+template <typename Scalar, bool HasStdImpl = NumTraits<Scalar>::IsComplex || is_integral<Scalar>::value ||
+                                             is_same<Scalar, float>::value || is_same<Scalar, double>::value ||
+                                             is_same<Scalar, long double>::value>
+struct arg_default_impl;
+
+template <typename Scalar>
+struct arg_default_impl<Scalar, true> {
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline Scalar run(const Scalar& x, const Scalar& y)
-  {
-    using std::abs;
-    using std::log;
-    using std::sqrt;
-    Scalar z = x / y;
-    if (y == Scalar(0) || abs(z) > sqrt(NumTraits<RealScalar>::epsilon()))
-      return RealScalar(0.5) * log((y + x) / (y - x));
-    else
-      return z + z*z*z / RealScalar(3);
+  EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) {
+    // There is no official ::arg on device in CUDA/HIP, so we always need to use std::arg.
+    using std::arg;
+    return static_cast<RealScalar>(arg(x));
   }
 };
 
-template<typename Scalar>
-struct atanh2_default_impl<Scalar, true>
-{
-  static inline Scalar run(const Scalar&, const Scalar&)
-  {
-    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
-    return Scalar(0);
+// Must be non-complex floating-point type (e.g. half/bfloat16).
+template <typename Scalar>
+struct arg_default_impl<Scalar, false> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) {
+    return (x < Scalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0);
+  }
+};
+#else
+template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct arg_default_impl {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) {
+    return (x < RealScalar(0)) ? RealScalar(EIGEN_PI) : RealScalar(0);
   }
 };
 
-template<typename Scalar>
-struct atanh2_impl : atanh2_default_impl<Scalar, NumTraits<Scalar>::IsInteger> {};
+template <typename Scalar>
+struct arg_default_impl<Scalar, true> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) {
+    EIGEN_USING_STD(arg);
+    return arg(x);
+  }
+};
+#endif
+template <typename Scalar>
+struct arg_impl : arg_default_impl<Scalar> {};
 
-template<typename Scalar>
-struct atanh2_retval
-{
-  typedef Scalar type;
+template <typename Scalar>
+struct arg_retval {
+  typedef typename NumTraits<Scalar>::Real type;
 };
 
 /****************************************************************************
-* Implementation of pow                                                  *
-****************************************************************************/
+ * Implementation of expm1                                                   *
+ ****************************************************************************/
+
+// This implementation is based on GSL Math's expm1.
+namespace std_fallback {
+// fallback expm1 implementation in case there is no expm1(Scalar) function in namespace of Scalar,
+// or that there is no suitable std::expm1 function available. Implementation
+// attributed to Kahan. See: http://www.plunk.org/~hatch/rightway.php.
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline Scalar expm1(const Scalar& x) {
+  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+  typedef typename NumTraits<Scalar>::Real RealScalar;
 
-template<typename Scalar, bool IsInteger>
-struct pow_default_impl
-{
-  typedef Scalar retval;
-  static inline Scalar run(const Scalar& x, const Scalar& y)
-  {
-    using std::pow;
-    return pow(x, y);
+  EIGEN_USING_STD(exp);
+  Scalar u = exp(x);
+  if (numext::equal_strict(u, Scalar(1))) {
+    return x;
+  }
+  Scalar um1 = u - RealScalar(1);
+  if (numext::equal_strict(um1, Scalar(-1))) {
+    return RealScalar(-1);
   }
-};
 
-template<typename Scalar>
-struct pow_default_impl<Scalar, true>
-{
-  static inline Scalar run(Scalar x, Scalar y)
-  {
-    Scalar res(1);
-    eigen_assert(!NumTraits<Scalar>::IsSigned || y >= 0);
-    if(y & 1) res *= x;
-    y >>= 1;
-    while(y)
-    {
-      x *= x;
-      if(y&1) res *= x;
-      y >>= 1;
-    }
-    return res;
+  EIGEN_USING_STD(log);
+  Scalar logu = log(u);
+  return numext::equal_strict(u, logu) ? u : (u - RealScalar(1)) * x / logu;
+}
+}  // namespace std_fallback
+
+template <typename Scalar>
+struct expm1_impl {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+    EIGEN_USING_STD(expm1);
+    return expm1(x);
   }
 };
 
-template<typename Scalar>
-struct pow_impl : pow_default_impl<Scalar, NumTraits<Scalar>::IsInteger> {};
-
-template<typename Scalar>
-struct pow_retval
-{
+template <typename Scalar>
+struct expm1_retval {
   typedef Scalar type;
 };
 
 /****************************************************************************
-* Implementation of random                                               *
-****************************************************************************/
+ * Implementation of log                                                     *
+ ****************************************************************************/
+
+// Complex log defined in MathFunctionsImpl.h.
+template <typename ComplexT>
+EIGEN_DEVICE_FUNC ComplexT complex_log(const ComplexT& z);
+
+template <typename Scalar>
+struct log_impl {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) {
+    EIGEN_USING_STD(log);
+    return static_cast<Scalar>(log(x));
+  }
+};
+
+template <typename Scalar>
+struct log_impl<std::complex<Scalar>> {
+  EIGEN_DEVICE_FUNC static inline std::complex<Scalar> run(const std::complex<Scalar>& z) { return complex_log(z); }
+};
 
-template<typename Scalar,
-         bool IsComplex,
-         bool IsInteger>
-struct random_default_impl {};
+/****************************************************************************
+ * Implementation of log1p                                                   *
+ ****************************************************************************/
+
+namespace std_fallback {
+// fallback log1p implementation in case there is no log1p(Scalar) function in namespace of Scalar,
+// or that there is no suitable std::log1p function available
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline Scalar log1p(const Scalar& x) {
+  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  EIGEN_USING_STD(log);
+  Scalar x1p = RealScalar(1) + x;
+  Scalar log_1p = log_impl<Scalar>::run(x1p);
+  const bool is_small = numext::equal_strict(x1p, Scalar(1));
+  const bool is_inf = numext::equal_strict(x1p, log_1p);
+  return (is_small || is_inf) ? x : x * (log_1p / (x1p - RealScalar(1)));
+}
+}  // namespace std_fallback
 
-template<typename Scalar>
-struct random_impl : random_default_impl<Scalar, NumTraits<Scalar>::IsComplex, NumTraits<Scalar>::IsInteger> {};
+template <typename Scalar>
+struct log1p_impl {
+  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
 
-template<typename Scalar>
-struct random_retval
-{
-  typedef Scalar type;
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& x) {
+    EIGEN_USING_STD(log1p);
+    return log1p(x);
+  }
 };
 
-template<typename Scalar> inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random(const Scalar& x, const Scalar& y);
-template<typename Scalar> inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random();
+// Specialization for complex types that are not supported by std::log1p.
+template <typename RealScalar>
+struct log1p_impl<std::complex<RealScalar>> {
+  EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
 
-template<typename Scalar>
-struct random_default_impl<Scalar, false, false>
-{
-  static inline Scalar run(const Scalar& x, const Scalar& y)
-  {
-    return x + (y-x) * Scalar(std::rand()) / Scalar(RAND_MAX);
+  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(const std::complex<RealScalar>& x) {
+    return std_fallback::log1p(x);
   }
-  static inline Scalar run()
-  {
-    return run(Scalar(NumTraits<Scalar>::IsSigned ? -1 : 0), Scalar(1));
+};
+
+template <typename Scalar>
+struct log1p_retval {
+  typedef Scalar type;
+};
+
+/****************************************************************************
+ * Implementation of pow                                                  *
+ ****************************************************************************/
+
+template <typename ScalarX, typename ScalarY,
+          bool IsInteger = NumTraits<ScalarX>::IsInteger && NumTraits<ScalarY>::IsInteger>
+struct pow_impl {
+  // typedef Scalar retval;
+  typedef typename ScalarBinaryOpTraits<ScalarX, ScalarY, internal::scalar_pow_op<ScalarX, ScalarY>>::ReturnType
+      result_type;
+  static EIGEN_DEVICE_FUNC inline result_type run(const ScalarX& x, const ScalarY& y) {
+    EIGEN_USING_STD(pow);
+    return pow(x, y);
   }
 };
 
-enum {
-  floor_log2_terminate,
-  floor_log2_move_up,
-  floor_log2_move_down,
-  floor_log2_bogus
+template <typename ScalarX, typename ScalarY>
+struct pow_impl<ScalarX, ScalarY, true> {
+  typedef ScalarX result_type;
+  static EIGEN_DEVICE_FUNC inline ScalarX run(ScalarX x, ScalarY y) {
+    ScalarX res(1);
+    eigen_assert(!NumTraits<ScalarY>::IsSigned || y >= 0);
+    if (y & 1) res *= x;
+    y >>= 1;
+    while (y) {
+      x *= x;
+      if (y & 1) res *= x;
+      y >>= 1;
+    }
+    return res;
+  }
 };
 
-template<unsigned int n, int lower, int upper> struct floor_log2_selector
-{
-  enum { middle = (lower + upper) / 2,
-         value = (upper <= lower + 1) ? int(floor_log2_terminate)
-               : (n < (1 << middle)) ? int(floor_log2_move_down)
-               : (n==0) ? int(floor_log2_bogus)
-               : int(floor_log2_move_up)
+enum { meta_floor_log2_terminate, meta_floor_log2_move_up, meta_floor_log2_move_down, meta_floor_log2_bogus };
+
+template <unsigned int n, int lower, int upper>
+struct meta_floor_log2_selector {
+  enum {
+    middle = (lower + upper) / 2,
+    value = (upper <= lower + 1)  ? int(meta_floor_log2_terminate)
+            : (n < (1 << middle)) ? int(meta_floor_log2_move_down)
+            : (n == 0)            ? int(meta_floor_log2_bogus)
+                                  : int(meta_floor_log2_move_up)
   };
 };
 
-template<unsigned int n,
-         int lower = 0,
-         int upper = sizeof(unsigned int) * CHAR_BIT - 1,
-         int selector = floor_log2_selector<n, lower, upper>::value>
-struct floor_log2 {};
+template <unsigned int n, int lower = 0, int upper = sizeof(unsigned int) * CHAR_BIT - 1,
+          int selector = meta_floor_log2_selector<n, lower, upper>::value>
+struct meta_floor_log2 {};
 
-template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_move_down>
-{
-  enum { value = floor_log2<n, lower, floor_log2_selector<n, lower, upper>::middle>::value };
+template <unsigned int n, int lower, int upper>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_down> {
+  enum { value = meta_floor_log2<n, lower, meta_floor_log2_selector<n, lower, upper>::middle>::value };
 };
 
-template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_move_up>
-{
-  enum { value = floor_log2<n, floor_log2_selector<n, lower, upper>::middle, upper>::value };
+template <unsigned int n, int lower, int upper>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_move_up> {
+  enum { value = meta_floor_log2<n, meta_floor_log2_selector<n, lower, upper>::middle, upper>::value };
 };
 
-template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_terminate>
-{
-  enum { value = (n >= ((unsigned int)(1) << (lower+1))) ? lower+1 : lower };
+template <unsigned int n, int lower, int upper>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_terminate> {
+  enum { value = (n >= ((unsigned int)(1) << (lower + 1))) ? lower + 1 : lower };
 };
 
-template<unsigned int n, int lower, int upper>
-struct floor_log2<n, lower, upper, floor_log2_bogus>
-{
+template <unsigned int n, int lower, int upper>
+struct meta_floor_log2<n, lower, upper, meta_floor_log2_bogus> {
   // no value, error at compile time
 };
 
-template<typename Scalar>
-struct random_default_impl<Scalar, false, true>
-{
-  typedef typename NumTraits<Scalar>::NonInteger NonInteger;
+template <typename BitsType, typename EnableIf = void>
+struct count_bits_impl {
+  static_assert(std::is_integral<BitsType>::value && std::is_unsigned<BitsType>::value,
+                "BitsType must be an unsigned integer");
+  static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+    int n = CHAR_BIT * sizeof(BitsType);
+    int shift = n / 2;
+    while (bits > 0 && shift > 0) {
+      BitsType y = bits >> shift;
+      if (y > 0) {
+        n -= shift;
+        bits = y;
+      }
+      shift /= 2;
+    }
+    if (shift == 0) {
+      --n;
+    }
+    return n;
+  }
 
-  static inline Scalar run(const Scalar& x, const Scalar& y)
-  {
-    return x + Scalar((NonInteger(y)-x+1) * std::rand() / (RAND_MAX + NonInteger(1)));
+  static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+    int n = CHAR_BIT * sizeof(BitsType);
+    int shift = n / 2;
+    while (bits > 0 && shift > 0) {
+      BitsType y = bits << shift;
+      if (y > 0) {
+        n -= shift;
+        bits = y;
+      }
+      shift /= 2;
+    }
+    if (shift == 0) {
+      --n;
+    }
+    return n;
   }
+};
 
-  static inline Scalar run()
-  {
-#ifdef EIGEN_MAKING_DOCS
-    return run(Scalar(NumTraits<Scalar>::IsSigned ? -10 : 0), Scalar(10));
-#else
-    enum { rand_bits = floor_log2<(unsigned int)(RAND_MAX)+1>::value,
-           scalar_bits = sizeof(Scalar) * CHAR_BIT,
-           shift = EIGEN_PLAIN_ENUM_MAX(0, int(rand_bits) - int(scalar_bits)),
-           offset = NumTraits<Scalar>::IsSigned ? (1 << (EIGEN_PLAIN_ENUM_MIN(rand_bits,scalar_bits)-1)) : 0
-    };
-    return Scalar((std::rand() >> shift) - offset);
-#endif
+// Count leading zeros.
+template <typename BitsType>
+EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+  return count_bits_impl<BitsType>::clz(bits);
+}
+
+// Count trailing zeros.
+template <typename BitsType>
+EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+  return count_bits_impl<BitsType>::ctz(bits);
+}
+
+#if EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
+
+template <typename BitsType>
+struct count_bits_impl<
+    BitsType, std::enable_if_t<std::is_integral<BitsType>::value && sizeof(BitsType) <= sizeof(unsigned int)>> {
+  static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+  static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+    static constexpr int kLeadingBitsOffset = (sizeof(unsigned int) - sizeof(BitsType)) * CHAR_BIT;
+    return bits == 0 ? kNumBits : __builtin_clz(static_cast<unsigned int>(bits)) - kLeadingBitsOffset;
+  }
+
+  static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+    return bits == 0 ? kNumBits : __builtin_ctz(static_cast<unsigned int>(bits));
   }
 };
 
-template<typename Scalar>
-struct random_default_impl<Scalar, true, false>
-{
-  static inline Scalar run(const Scalar& x, const Scalar& y)
-  {
-    return Scalar(random(real(x), real(y)),
-                  random(imag(x), imag(y)));
+template <typename BitsType>
+struct count_bits_impl<BitsType,
+                       std::enable_if_t<std::is_integral<BitsType>::value && sizeof(unsigned int) < sizeof(BitsType) &&
+                                        sizeof(BitsType) <= sizeof(unsigned long)>> {
+  static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+  static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+    static constexpr int kLeadingBitsOffset = (sizeof(unsigned long) - sizeof(BitsType)) * CHAR_BIT;
+    return bits == 0 ? kNumBits : __builtin_clzl(static_cast<unsigned long>(bits)) - kLeadingBitsOffset;
   }
-  static inline Scalar run()
-  {
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    return Scalar(random<RealScalar>(), random<RealScalar>());
+
+  static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+    return bits == 0 ? kNumBits : __builtin_ctzl(static_cast<unsigned long>(bits));
   }
 };
 
-template<typename Scalar>
-inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random(const Scalar& x, const Scalar& y)
-{
-  return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(x, y);
-}
+template <typename BitsType>
+struct count_bits_impl<BitsType,
+                       std::enable_if_t<std::is_integral<BitsType>::value && sizeof(unsigned long) < sizeof(BitsType) &&
+                                        sizeof(BitsType) <= sizeof(unsigned long long)>> {
+  static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+  static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+    static constexpr int kLeadingBitsOffset = (sizeof(unsigned long long) - sizeof(BitsType)) * CHAR_BIT;
+    return bits == 0 ? kNumBits : __builtin_clzll(static_cast<unsigned long long>(bits)) - kLeadingBitsOffset;
+  }
 
-template<typename Scalar>
-inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random()
-{
-  return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
-}
+  static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+    return bits == 0 ? kNumBits : __builtin_ctzll(static_cast<unsigned long long>(bits));
+  }
+};
 
-} // end namespace internal
+#elif EIGEN_COMP_MSVC
 
-/****************************************************************************
-* Generic math function                                                    *
-****************************************************************************/
+template <typename BitsType>
+struct count_bits_impl<
+    BitsType, std::enable_if_t<std::is_integral<BitsType>::value && sizeof(BitsType) <= sizeof(unsigned long)>> {
+  static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+  static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+    unsigned long out;
+    _BitScanReverse(&out, static_cast<unsigned long>(bits));
+    return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast<int>(out);
+  }
 
-namespace numext {
+  static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+    unsigned long out;
+    _BitScanForward(&out, static_cast<unsigned long>(bits));
+    return bits == 0 ? kNumBits : static_cast<int>(out);
+  }
+};
 
-template<typename Scalar>
-inline EIGEN_MATHFUNC_RETVAL(real, Scalar) real(const Scalar& x)
-{
-  return EIGEN_MATHFUNC_IMPL(real, Scalar)::run(x);
-}  
+#ifdef _WIN64
+
+template <typename BitsType>
+struct count_bits_impl<BitsType,
+                       std::enable_if_t<std::is_integral<BitsType>::value && sizeof(unsigned long) < sizeof(BitsType) &&
+                                        sizeof(BitsType) <= sizeof(__int64)>> {
+  static constexpr int kNumBits = static_cast<int>(sizeof(BitsType) * CHAR_BIT);
+  static EIGEN_DEVICE_FUNC inline int clz(BitsType bits) {
+    unsigned long out;
+    _BitScanReverse64(&out, static_cast<unsigned __int64>(bits));
+    return bits == 0 ? kNumBits : (kNumBits - 1) - static_cast<int>(out);
+  }
 
-template<typename Scalar>
-inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) >::type real_ref(const Scalar& x)
-{
-  return internal::real_ref_impl<Scalar>::run(x);
-}
+  static EIGEN_DEVICE_FUNC inline int ctz(BitsType bits) {
+    unsigned long out;
+    _BitScanForward64(&out, static_cast<unsigned __int64>(bits));
+    return bits == 0 ? kNumBits : static_cast<int>(out);
+  }
+};
 
-template<typename Scalar>
-inline EIGEN_MATHFUNC_RETVAL(real_ref, Scalar) real_ref(Scalar& x)
-{
-  return EIGEN_MATHFUNC_IMPL(real_ref, Scalar)::run(x);
-}
+#endif  // _WIN64
 
-template<typename Scalar>
-inline EIGEN_MATHFUNC_RETVAL(imag, Scalar) imag(const Scalar& x)
-{
-  return EIGEN_MATHFUNC_IMPL(imag, Scalar)::run(x);
-}
+#endif  // EIGEN_COMP_GNUC || EIGEN_COMP_CLANG
 
-template<typename Scalar>
-inline typename internal::add_const_on_value_type< EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) >::type imag_ref(const Scalar& x)
-{
-  return internal::imag_ref_impl<Scalar>::run(x);
-}
+template <typename BitsType>
+struct log_2_impl {
+  static constexpr int kTotalBits = sizeof(BitsType) * CHAR_BIT;
+  static EIGEN_DEVICE_FUNC inline int run_ceil(const BitsType& x) {
+    const int n = kTotalBits - clz(x);
+    bool power_of_two = (x & (x - 1)) == 0;
+    return x == 0 ? 0 : power_of_two ? (n - 1) : n;
+  }
+  static EIGEN_DEVICE_FUNC inline int run_floor(const BitsType& x) {
+    const int n = kTotalBits - clz(x);
+    return x == 0 ? 0 : n - 1;
+  }
+};
 
-template<typename Scalar>
-inline EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) imag_ref(Scalar& x)
-{
-  return EIGEN_MATHFUNC_IMPL(imag_ref, Scalar)::run(x);
+template <typename BitsType>
+int log2_ceil(const BitsType& x) {
+  return log_2_impl<BitsType>::run_ceil(x);
 }
 
-template<typename Scalar>
-inline EIGEN_MATHFUNC_RETVAL(conj, Scalar) conj(const Scalar& x)
-{
-  return EIGEN_MATHFUNC_IMPL(conj, Scalar)::run(x);
+template <typename BitsType>
+int log2_floor(const BitsType& x) {
+  return log_2_impl<BitsType>::run_floor(x);
 }
 
-template<typename Scalar>
-inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x)
-{
-  return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
-}
+// Implementation of is* functions
 
-template<typename Scalar>
-inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x)
-{
-  return EIGEN_MATHFUNC_IMPL(norm1, Scalar)::run(x);
+template <typename T>
+EIGEN_DEVICE_FUNC std::enable_if_t<!(std::numeric_limits<T>::has_infinity || std::numeric_limits<T>::has_quiet_NaN ||
+                                     std::numeric_limits<T>::has_signaling_NaN),
+                                   bool>
+isfinite_impl(const T&) {
+  return true;
 }
 
-template<typename Scalar>
-inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& y)
-{
-  return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
+template <typename T>
+EIGEN_DEVICE_FUNC std::enable_if_t<(std::numeric_limits<T>::has_infinity || std::numeric_limits<T>::has_quiet_NaN ||
+                                    std::numeric_limits<T>::has_signaling_NaN) &&
+                                       (!NumTraits<T>::IsComplex),
+                                   bool>
+isfinite_impl(const T& x) {
+  EIGEN_USING_STD(isfinite);
+  return isfinite EIGEN_NOT_A_MACRO(x);
 }
 
-template<typename Scalar>
-inline EIGEN_MATHFUNC_RETVAL(atanh2, Scalar) atanh2(const Scalar& x, const Scalar& y)
-{
-  return EIGEN_MATHFUNC_IMPL(atanh2, Scalar)::run(x, y);
+template <typename T>
+EIGEN_DEVICE_FUNC std::enable_if_t<!std::numeric_limits<T>::has_infinity, bool> isinf_impl(const T&) {
+  return false;
 }
 
-template<typename Scalar>
-inline EIGEN_MATHFUNC_RETVAL(pow, Scalar) pow(const Scalar& x, const Scalar& y)
-{
-  return EIGEN_MATHFUNC_IMPL(pow, Scalar)::run(x, y);
+template <typename T>
+EIGEN_DEVICE_FUNC std::enable_if_t<(std::numeric_limits<T>::has_infinity && !NumTraits<T>::IsComplex), bool> isinf_impl(
+    const T& x) {
+  EIGEN_USING_STD(isinf);
+  return isinf EIGEN_NOT_A_MACRO(x);
 }
 
-// std::isfinite is non standard, so let's define our own version,
-// even though it is not very efficient.
-template<typename T> bool (isfinite)(const T& x)
-{
-  return x<NumTraits<T>::highest() && x>NumTraits<T>::lowest();
+template <typename T>
+EIGEN_DEVICE_FUNC
+std::enable_if_t<!(std::numeric_limits<T>::has_quiet_NaN || std::numeric_limits<T>::has_signaling_NaN), bool>
+isnan_impl(const T&) {
+  return false;
 }
 
-} // end namespace numext
+template <typename T>
+EIGEN_DEVICE_FUNC std::enable_if_t<
+    (std::numeric_limits<T>::has_quiet_NaN || std::numeric_limits<T>::has_signaling_NaN) && (!NumTraits<T>::IsComplex),
+    bool>
+isnan_impl(const T& x) {
+  EIGEN_USING_STD(isnan);
+  return isnan EIGEN_NOT_A_MACRO(x);
+}
 
-namespace internal {
+// The following overload are defined at the end of this file
+template <typename T>
+EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex<T>& x);
+template <typename T>
+EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x);
+template <typename T>
+EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x);
+template <typename T>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_float(const T& a_x);
 
 /****************************************************************************
-* Implementation of fuzzy comparisons                                       *
-****************************************************************************/
+ * Implementation of sign                                                 *
+ ****************************************************************************/
+template <typename Scalar, bool IsComplex = (NumTraits<Scalar>::IsComplex != 0),
+          bool IsInteger = (NumTraits<Scalar>::IsInteger != 0)>
+struct sign_impl {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& a) { return Scalar((a > Scalar(0)) - (a < Scalar(0))); }
+};
 
-template<typename Scalar,
-         bool IsComplex,
-         bool IsInteger>
-struct scalar_fuzzy_default_impl {};
+template <typename Scalar>
+struct sign_impl<Scalar, false, false> {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& a) {
+    return (isnan_impl<Scalar>)(a) ? a : Scalar((a > Scalar(0)) - (a < Scalar(0)));
+  }
+};
 
-template<typename Scalar>
-struct scalar_fuzzy_default_impl<Scalar, false, false>
-{
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar>
-  static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
-  {
-    using std::abs;
-    return abs(x) <= abs(y) * prec;
-  }
-  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
-  {
-    using std::min;
-    using std::abs;
-    return abs(x - y) <= (min)(abs(x), abs(y)) * prec;
-  }
-  static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar& prec)
-  {
-    return x <= y || isApprox(x, y, prec);
+template <typename Scalar, bool IsInteger>
+struct sign_impl<Scalar, true, IsInteger> {
+  EIGEN_DEVICE_FUNC static inline Scalar run(const Scalar& a) {
+    using real_type = typename NumTraits<Scalar>::Real;
+    EIGEN_USING_STD(abs);
+    real_type aa = abs(a);
+    if (aa == real_type(0)) return Scalar(0);
+    aa = real_type(1) / aa;
+    return Scalar(numext::real(a) * aa, numext::imag(a) * aa);
   }
 };
 
-template<typename Scalar>
-struct scalar_fuzzy_default_impl<Scalar, false, true>
-{
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar>
-  static inline bool isMuchSmallerThan(const Scalar& x, const Scalar&, const RealScalar&)
-  {
-    return x == Scalar(0);
+// The sign function for bool is the identity.
+template <>
+struct sign_impl<bool, false, true> {
+  EIGEN_DEVICE_FUNC static inline bool run(const bool& a) { return a; }
+};
+
+template <typename Scalar>
+struct sign_retval {
+  typedef Scalar type;
+};
+
+// suppress "unary minus operator applied to unsigned type, result still unsigned" warnings on MSVC
+// note: `0 - a` is distinct from `-a` when Scalar is a floating point type and `a` is zero
+
+template <typename Scalar, bool IsInteger = NumTraits<Scalar>::IsInteger>
+struct negate_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar run(const Scalar& a) { return -a; }
+};
+
+template <typename Scalar>
+struct negate_impl<Scalar, true> {
+  EIGEN_STATIC_ASSERT((!is_same<Scalar, bool>::value), NEGATE IS NOT DEFINED FOR BOOLEAN TYPES)
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar run(const Scalar& a) { return Scalar(0) - a; }
+};
+
+template <typename Scalar>
+struct negate_retval {
+  typedef Scalar type;
+};
+
+template <typename Scalar, bool IsInteger = NumTraits<typename unpacket_traits<Scalar>::type>::IsInteger>
+struct nearest_integer_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_floor(const Scalar& x) {
+    EIGEN_USING_STD(floor) return floor(x);
   }
-  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar&)
-  {
-    return x == y;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_ceil(const Scalar& x) {
+    EIGEN_USING_STD(ceil) return ceil(x);
   }
-  static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar&)
-  {
-    return x <= y;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_rint(const Scalar& x) {
+    EIGEN_USING_STD(rint) return rint(x);
   }
-};
-
-template<typename Scalar>
-struct scalar_fuzzy_default_impl<Scalar, true, false>
-{
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  template<typename OtherScalar>
-  static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y, const RealScalar& prec)
-  {
-    return numext::abs2(x) <= numext::abs2(y) * prec * prec;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_round(const Scalar& x) {
+    EIGEN_USING_STD(round) return round(x);
   }
-  static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec)
-  {
-    using std::min;
-    return numext::abs2(x - y) <= (min)(numext::abs2(x), numext::abs2(y)) * prec * prec;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_trunc(const Scalar& x) {
+    EIGEN_USING_STD(trunc) return trunc(x);
   }
 };
+template <typename Scalar>
+struct nearest_integer_impl<Scalar, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_floor(const Scalar& x) { return x; }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_ceil(const Scalar& x) { return x; }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_rint(const Scalar& x) { return x; }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_round(const Scalar& x) { return x; }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_trunc(const Scalar& x) { return x; }
+};
 
-template<typename Scalar>
-struct scalar_fuzzy_impl : scalar_fuzzy_default_impl<Scalar, NumTraits<Scalar>::IsComplex, NumTraits<Scalar>::IsInteger> {};
+// Extra namespace to prevent leaking std::fma into Eigen::internal.
+namespace has_fma_detail {
 
-template<typename Scalar, typename OtherScalar>
-inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y,
-                                   typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
-{
-  return scalar_fuzzy_impl<Scalar>::template isMuchSmallerThan<OtherScalar>(x, y, precision);
-}
+template <typename T, typename EnableIf = void>
+struct has_fma_impl : public std::false_type {};
 
-template<typename Scalar>
-inline bool isApprox(const Scalar& x, const Scalar& y,
-                          typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
-{
+using std::fma;
+
+template <typename T>
+struct has_fma_impl<
+    T, std::enable_if_t<std::is_same<T, decltype(fma(std::declval<T>(), std::declval<T>(), std::declval<T>()))>::value>>
+    : public std::true_type {};
+
+}  // namespace has_fma_detail
+
+template <typename T>
+struct has_fma : public has_fma_detail::has_fma_impl<T> {};
+
+// Default implementation.
+template <typename T, typename Enable = void>
+struct fma_impl {
+  static_assert(has_fma<T>::value, "No function fma(...) for type.  Please provide an implementation.");
+};
+
+// STD or ADL version if it exists.
+template <typename T>
+struct fma_impl<T, std::enable_if_t<has_fma<T>::value>> {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T run(const T& a, const T& b, const T& c) {
+    using std::fma;
+    return fma(a, b, c);
+  }
+};
+
+#if defined(EIGEN_GPUCC)
+template <>
+struct has_fma<float> : public true_type {};
+
+template <>
+struct fma_impl<float, void> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float run(const float& a, const float& b, const float& c) {
+    return ::fmaf(a, b, c);
+  }
+};
+
+template <>
+struct has_fma<double> : public true_type {};
+
+template <>
+struct fma_impl<double, void> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double run(const double& a, const double& b, const double& c) {
+    return ::fma(a, b, c);
+  }
+};
+#endif
+
+// Basic multiply-add.
+template <typename Scalar, typename EnableIf = void>
+struct madd_impl {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run(const Scalar& x, const Scalar& y, const Scalar& z) {
+    return x * y + z;
+  }
+};
+
+#if EIGEN_SCALAR_MADD_USE_FMA
+template <typename Scalar>
+struct madd_impl<Scalar, std::enable_if_t<has_fma<Scalar>::value>> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run(const Scalar& x, const Scalar& y, const Scalar& z) {
+    return fma_impl<Scalar>::run(x, y, z);
+  }
+};
+#endif
+
+}  // end namespace internal
+
+/****************************************************************************
+ * Generic math functions                                                    *
+ ****************************************************************************/
+
+namespace numext {
+
+#if (!defined(EIGEN_GPUCC) || defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) {
+  EIGEN_USING_STD(min)
+  return min EIGEN_NOT_A_MACRO(x, y);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) {
+  EIGEN_USING_STD(max)
+  return max EIGEN_NOT_A_MACRO(x, y);
+}
+#else
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T mini(const T& x, const T& y) {
+  return y < x ? y : x;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float mini(const float& x, const float& y) {
+  return fminf(x, y);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double mini(const double& x, const double& y) {
+  return fmin(x, y);
+}
+
+#ifndef EIGEN_GPU_COMPILE_PHASE
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double mini(const long double& x, const long double& y) {
+#if defined(EIGEN_HIPCC)
+  // no "fminl" on HIP yet
+  return (x < y) ? x : y;
+#else
+  return fminl(x, y);
+#endif
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T maxi(const T& x, const T& y) {
+  return x < y ? y : x;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float maxi(const float& x, const float& y) {
+  return fmaxf(x, y);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double maxi(const double& x, const double& y) {
+  return fmax(x, y);
+}
+#ifndef EIGEN_GPU_COMPILE_PHASE
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double maxi(const long double& x, const long double& y) {
+#if defined(EIGEN_HIPCC)
+  // no "fmaxl" on HIP yet
+  return (x > y) ? x : y;
+#else
+  return fmaxl(x, y);
+#endif
+}
+#endif
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+
+#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_char)    \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_short)   \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_int)     \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
+#define SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_char)    \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_short)   \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_int)     \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_long)
+#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar)     \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort)    \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_uint)      \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
+#define SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uchar)     \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ushort)    \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_uint)      \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_ulong)
+#define SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(NAME, FUNC)  \
+  SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY(NAME, FUNC)
+#define SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(NAME, FUNC)  \
+  SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY(NAME, FUNC) \
+  SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY(NAME, FUNC)
+#define SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(NAME, FUNC)     \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
+  SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, cl::sycl::cl_double)
+#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(NAME, FUNC)     \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_float) \
+  SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, cl::sycl::cl_double)
+#define SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(NAME, FUNC, RET_TYPE) \
+  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_float)       \
+  SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, cl::sycl::cl_double)
+
+#define SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE)     \
+  template <>                                                              \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE& x) { \
+    return cl::sycl::FUNC(x);                                              \
+  }
+
+#define SYCL_SPECIALIZE_UNARY_FUNC(NAME, FUNC, TYPE) SYCL_SPECIALIZE_GEN_UNARY_FUNC(NAME, FUNC, TYPE, TYPE)
+
+#define SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE1, ARG_TYPE2)            \
+  template <>                                                                                   \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE RET_TYPE NAME(const ARG_TYPE1& x, const ARG_TYPE2& y) { \
+    return cl::sycl::FUNC(x, y);                                                                \
+  }
+
+#define SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE) \
+  SYCL_SPECIALIZE_GEN1_BINARY_FUNC(NAME, FUNC, RET_TYPE, ARG_TYPE, ARG_TYPE)
+
+#define SYCL_SPECIALIZE_BINARY_FUNC(NAME, FUNC, TYPE) SYCL_SPECIALIZE_GEN2_BINARY_FUNC(NAME, FUNC, TYPE, TYPE)
+
+SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(mini, min)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(mini, fmin)
+SYCL_SPECIALIZE_INTEGER_TYPES_BINARY(maxi, max)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(maxi, fmax)
+
+#endif
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(arg, Scalar) arg(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(arg, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline internal::add_const_on_value_type_t<EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar)> imag_ref(
+    const Scalar& x) {
+  return internal::imag_ref_impl<Scalar>::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(imag_ref, Scalar) imag_ref(Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(imag_ref, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(conj, Scalar) conj(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(conj, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(sign, Scalar) sign(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(sign, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(negate, Scalar) negate(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(negate, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x);
+}
+
+EIGEN_DEVICE_FUNC inline bool abs2(bool x) { return x; }
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T absdiff(const T& x, const T& y) {
+  return x > y ? x - y : y - x;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float absdiff(const float& x, const float& y) {
+  return fabsf(x - y);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double absdiff(const double& x, const double& y) {
+  return fabs(x - y);
+}
+
+// HIP and CUDA do not support long double.
+#ifndef EIGEN_GPU_COMPILE_PHASE
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE long double absdiff(const long double& x, const long double& y) {
+  return fabsl(x - y);
+}
+#endif
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(norm1, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(hypot, Scalar) hypot(const Scalar& x, const Scalar& y) {
+  return EIGEN_MATHFUNC_IMPL(hypot, Scalar)::run(x, y);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(hypot, hypot)
+#endif
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(log1p, Scalar) log1p(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(log1p, Scalar)::run(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log1p, log1p)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log1p(const float& x) {
+  return ::log1pf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double log1p(const double& x) {
+  return ::log1p(x);
+}
+#endif
+
+template <typename ScalarX, typename ScalarY>
+EIGEN_DEVICE_FUNC inline typename internal::pow_impl<ScalarX, ScalarY>::result_type pow(const ScalarX& x,
+                                                                                        const ScalarY& y) {
+  return internal::pow_impl<ScalarX, ScalarY>::run(x, y);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(pow, pow)
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC bool(isnan)(const T& x) {
+  return internal::isnan_impl(x);
+}
+template <typename T>
+EIGEN_DEVICE_FUNC bool(isinf)(const T& x) {
+  return internal::isinf_impl(x);
+}
+template <typename T>
+EIGEN_DEVICE_FUNC bool(isfinite)(const T& x) {
+  return internal::isfinite_impl(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isnan, isnan, bool)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isinf, isinf, bool)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE(isfinite, isfinite, bool)
+#endif
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar rint(const Scalar& x) {
+  return internal::nearest_integer_impl<Scalar>::run_rint(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar round(const Scalar& x) {
+  return internal::nearest_integer_impl<Scalar>::run_round(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar(floor)(const Scalar& x) {
+  return internal::nearest_integer_impl<Scalar>::run_floor(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar(ceil)(const Scalar& x) {
+  return internal::nearest_integer_impl<Scalar>::run_ceil(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar(trunc)(const Scalar& x) {
+  return internal::nearest_integer_impl<Scalar>::run_trunc(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(round, round)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(floor, floor)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(ceil, ceil)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(trunc, trunc)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float floor(const float& x) {
+  return ::floorf(x);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double floor(const double& x) {
+  return ::floor(x);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float ceil(const float& x) {
+  return ::ceilf(x);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double ceil(const double& x) {
+  return ::ceil(x);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float trunc(const float& x) {
+  return ::truncf(x);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double trunc(const double& x) {
+  return ::trunc(x);
+}
+#endif
+
+// Integer division with rounding up.
+// T is assumed to be an integer type with a>=0, and b>0
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr T div_ceil(T a, T b) {
+  using UnsignedT = typename internal::make_unsigned<T>::type;
+  EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
+  // Note: explicitly declaring a and b as non-negative values allows the compiler to use better optimizations
+  const UnsignedT ua = UnsignedT(a);
+  const UnsignedT ub = UnsignedT(b);
+  // Note: This form is used because it cannot overflow.
+  return ua == 0 ? 0 : (ua - 1) / ub + 1;
+}
+
+// Integer round down to nearest power of b
+// T is assumed to be an integer type with a>=0, and b>0
+template <typename T, typename U>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr T round_down(T a, U b) {
+  using UnsignedT = typename internal::make_unsigned<T>::type;
+  using UnsignedU = typename internal::make_unsigned<U>::type;
+  EIGEN_STATIC_ASSERT((NumTraits<T>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
+  EIGEN_STATIC_ASSERT((NumTraits<U>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
+  // Note: explicitly declaring a and b as non-negative values allows the compiler to use better optimizations
+  const UnsignedT ua = UnsignedT(a);
+  const UnsignedU ub = UnsignedU(b);
+  return ub * (ua / ub);
+}
+
+/** Log base 2 for 32 bits positive integers.
+ * Conveniently returns 0 for x==0. */
+constexpr int log2(int x) {
+  unsigned int v(x);
+  constexpr int table[32] = {0, 9,  1,  10, 13, 21, 2,  29, 11, 14, 16, 18, 22, 25, 3, 30,
+                             8, 12, 20, 28, 15, 17, 24, 7,  19, 27, 23, 6,  26, 5,  4, 31};
+  v |= v >> 1;
+  v |= v >> 2;
+  v |= v >> 4;
+  v |= v >> 8;
+  v |= v >> 16;
+  return table[(v * 0x07C4ACDDU) >> 27];
+}
+
+/** \returns the square root of \a x.
+ *
+ * It is essentially equivalent to
+ * \code using std::sqrt; return sqrt(x); \endcode
+ * but slightly faster for float/double and some compilers (e.g., gcc), thanks to
+ * specializations when SSE is enabled.
+ *
+ * It's usage is justified in performance critical functions, like norm/normalize.
+ */
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EIGEN_MATHFUNC_RETVAL(sqrt, Scalar) sqrt(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(sqrt, Scalar)::run(x);
+}
+
+// Boolean specialization, avoids implicit float to bool conversion (-Wimplicit-conversion-floating-point-to-bool).
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC bool sqrt<bool>(const bool& x) {
+  return x;
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sqrt, sqrt)
+#endif
+
+/** \returns the cube root of \a x. **/
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!NumTraits<T>::IsComplex, T> cbrt(const T& x) {
+  EIGEN_USING_STD(cbrt);
+  return static_cast<T>(cbrt(x));
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<NumTraits<T>::IsComplex, T> cbrt(const T& x) {
+  EIGEN_USING_STD(pow);
+  return pow(x, typename NumTraits<T>::Real(1.0 / 3.0));
+}
+
+/** \returns the reciprocal square root of \a x. **/
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T rsqrt(const T& x) {
+  return internal::rsqrt_impl<T>::run(x);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T log(const T& x) {
+  return internal::log_impl<T>::run(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(log, log)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float log(const float& x) {
+  return ::logf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double log(const double& x) {
+  return ::log(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE std::enable_if_t<NumTraits<T>::IsSigned || NumTraits<T>::IsComplex, typename NumTraits<T>::Real>
+abs(const T& x) {
+  EIGEN_USING_STD(abs);
+  return abs(x);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE std::enable_if_t<!(NumTraits<T>::IsSigned || NumTraits<T>::IsComplex), typename NumTraits<T>::Real>
+abs(const T& x) {
+  return x;
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_INTEGER_TYPES_UNARY(abs, abs)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(abs, fabs)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float& x) {
+  return ::fabsf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double abs(const double& x) {
+  return ::fabs(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const std::complex<float>& x) {
+  return ::hypotf(x.real(), x.imag());
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double abs(const std::complex<double>& x) {
+  return ::hypot(x.real(), x.imag());
+}
+#endif
+
+template <typename Scalar, bool IsInteger = NumTraits<Scalar>::IsInteger, bool IsSigned = NumTraits<Scalar>::IsSigned>
+struct signbit_impl;
+template <typename Scalar>
+struct signbit_impl<Scalar, false, true> {
+  static constexpr size_t Size = sizeof(Scalar);
+  static constexpr size_t Shift = (CHAR_BIT * Size) - 1;
+  using intSize_t = typename get_integer_by_size<Size>::signed_type;
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static Scalar run(const Scalar& x) {
+    intSize_t a = bit_cast<intSize_t, Scalar>(x);
+    a = a >> Shift;
+    Scalar result = bit_cast<Scalar, intSize_t>(a);
+    return result;
+  }
+};
+template <typename Scalar>
+struct signbit_impl<Scalar, true, true> {
+  static constexpr size_t Size = sizeof(Scalar);
+  static constexpr size_t Shift = (CHAR_BIT * Size) - 1;
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static constexpr Scalar run(const Scalar& x) { return x >> Shift; }
+};
+template <typename Scalar>
+struct signbit_impl<Scalar, true, false> {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static constexpr Scalar run(const Scalar&) { return Scalar(0); }
+};
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static constexpr Scalar signbit(const Scalar& x) {
+  return signbit_impl<Scalar>::run(x);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T exp(const T& x) {
+  EIGEN_USING_STD(exp);
+  return exp(x);
+}
+
+// MSVC screws up some edge-cases for std::exp(complex).
+#ifdef EIGEN_COMP_MSVC
+template <typename RealScalar>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<RealScalar> exp(const std::complex<RealScalar>& x) {
+  EIGEN_USING_STD(exp);
+  // If z is (x,±∞) (for any finite x), the result is (NaN,NaN) and FE_INVALID is raised.
+  // If z is (x,NaN) (for any finite x), the result is (NaN,NaN) and FE_INVALID may be raised.
+  if ((isfinite)(real_ref(x)) && !(isfinite)(imag_ref(x))) {
+    return std::complex<RealScalar>(NumTraits<RealScalar>::quiet_NaN(), NumTraits<RealScalar>::quiet_NaN());
+  }
+  // If z is (+∞,±∞), the result is (±∞,NaN) and FE_INVALID is raised (the sign of the real part is unspecified)
+  // If z is (+∞,NaN), the result is (±∞,NaN) (the sign of the real part is unspecified)
+  if ((real_ref(x) == NumTraits<RealScalar>::infinity() && !(isfinite)(imag_ref(x)))) {
+    return std::complex<RealScalar>(NumTraits<RealScalar>::infinity(), NumTraits<RealScalar>::quiet_NaN());
+  }
+  return exp(x);
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp, exp)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp(const float& x) {
+  return ::expf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double exp(const double& x) {
+  return ::exp(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<float> exp(const std::complex<float>& x) {
+  float com = ::expf(x.real());
+  float res_real = com * ::cosf(x.imag());
+  float res_imag = com * ::sinf(x.imag());
+  return std::complex<float>(res_real, res_imag);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<double> exp(const std::complex<double>& x) {
+  double com = ::exp(x.real());
+  double res_real = com * ::cos(x.imag());
+  double res_imag = com * ::sin(x.imag());
+  return std::complex<double>(res_real, res_imag);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T exp2(const T& x) {
+  EIGEN_USING_STD(exp2);
+  return exp2(x);
+}
+
+// MSVC screws up some edge-cases for std::exp2(complex).
+#ifdef EIGEN_COMP_MSVC
+template <typename RealScalar>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<RealScalar> exp2(const std::complex<RealScalar>& x) {
+  EIGEN_USING_STD(exp);
+  // If z is (x,±∞) (for any finite x), the result is (NaN,NaN) and FE_INVALID is raised.
+  // If z is (x,NaN) (for any finite x), the result is (NaN,NaN) and FE_INVALID may be raised.
+  if ((isfinite)(real_ref(x)) && !(isfinite)(imag_ref(x))) {
+    return std::complex<RealScalar>(NumTraits<RealScalar>::quiet_NaN(), NumTraits<RealScalar>::quiet_NaN());
+  }
+  // If z is (+∞,±∞), the result is (±∞,NaN) and FE_INVALID is raised (the sign of the real part is unspecified)
+  // If z is (+∞,NaN), the result is (±∞,NaN) (the sign of the real part is unspecified)
+  if ((real_ref(x) == NumTraits<RealScalar>::infinity() && !(isfinite)(imag_ref(x)))) {
+    return std::complex<RealScalar>(NumTraits<RealScalar>::infinity(), NumTraits<RealScalar>::quiet_NaN());
+  }
+  return exp2(x);
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(exp2, exp2)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float exp2(const float& x) {
+  return ::exp2f(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double exp2(const double& x) {
+  return ::exp2(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<float> exp2(const std::complex<float>& x) {
+  float com = ::exp2f(x.real());
+  float res_real = com * ::cosf(static_cast<float>(EIGEN_LN2) * x.imag());
+  float res_imag = com * ::sinf(static_cast<float>(EIGEN_LN2) * x.imag());
+  return std::complex<float>(res_real, res_imag);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::complex<double> exp2(const std::complex<double>& x) {
+  double com = ::exp2(x.real());
+  double res_real = com * ::cos(static_cast<double>(EIGEN_LN2) * x.imag());
+  double res_imag = com * ::sin(static_cast<double>(EIGEN_LN2) * x.imag());
+  return std::complex<double>(res_real, res_imag);
+}
+#endif
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(expm1, Scalar) expm1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(expm1, Scalar)::run(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(expm1, expm1)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float expm1(const float& x) {
+  return ::expm1f(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double expm1(const double& x) {
+  return ::expm1(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cos(const T& x) {
+  EIGEN_USING_STD(cos);
+  return cos(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cos, cos)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cos(const float& x) {
+  return ::cosf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double cos(const double& x) {
+  return ::cos(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sin(const T& x) {
+  EIGEN_USING_STD(sin);
+  return sin(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sin, sin)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sin(const float& x) {
+  return ::sinf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double sin(const double& x) {
+  return ::sin(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tan(const T& x) {
+  EIGEN_USING_STD(tan);
+  return tan(x);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tan, tan)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tan(const float& x) {
+  return ::tanf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double tan(const double& x) {
+  return ::tan(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T acos(const T& x) {
+  EIGEN_USING_STD(acos);
+  return acos(x);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T acosh(const T& x) {
+  EIGEN_USING_STD(acosh);
+  return static_cast<T>(acosh(x));
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acos, acos)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(acosh, acosh)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float acos(const float& x) {
+  return ::acosf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double acos(const double& x) {
+  return ::acos(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T asin(const T& x) {
+  EIGEN_USING_STD(asin);
+  return asin(x);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T asinh(const T& x) {
+  EIGEN_USING_STD(asinh);
+  return static_cast<T>(asinh(x));
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asin, asin)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(asinh, asinh)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float asin(const float& x) {
+  return ::asinf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double asin(const double& x) {
+  return ::asin(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T atan(const T& x) {
+  EIGEN_USING_STD(atan);
+  return static_cast<T>(atan(x));
+}
+
+template <typename T, std::enable_if_t<!NumTraits<T>::IsComplex, int> = 0>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T atan2(const T& y, const T& x) {
+  EIGEN_USING_STD(atan2);
+  return static_cast<T>(atan2(y, x));
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T atanh(const T& x) {
+  EIGEN_USING_STD(atanh);
+  return static_cast<T>(atanh(x));
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atan, atan)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(atanh, atanh)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float atan(const float& x) {
+  return ::atanf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double atan(const double& x) {
+  return ::atan(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T cosh(const T& x) {
+  EIGEN_USING_STD(cosh);
+  return static_cast<T>(cosh(x));
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(cosh, cosh)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float cosh(const float& x) {
+  return ::coshf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double cosh(const double& x) {
+  return ::cosh(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T sinh(const T& x) {
+  EIGEN_USING_STD(sinh);
+  return static_cast<T>(sinh(x));
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(sinh, sinh)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sinh(const float& x) {
+  return ::sinhf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double sinh(const double& x) {
+  return ::sinh(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T tanh(const T& x) {
+  EIGEN_USING_STD(tanh);
+  return tanh(x);
+}
+
+#if (!defined(EIGEN_GPUCC)) && EIGEN_FAST_MATH && !defined(SYCL_DEVICE_ONLY)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(float x) { return internal::ptanh_float(x); }
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_UNARY(tanh, tanh)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float tanh(const float& x) {
+  return ::tanhf(x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double tanh(const double& x) {
+  return ::tanh(x);
+}
+#endif
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T fmod(const T& a, const T& b) {
+  EIGEN_USING_STD(fmod);
+  return fmod(a, b);
+}
+
+#if defined(SYCL_DEVICE_ONLY)
+SYCL_SPECIALIZE_FLOATING_TYPES_BINARY(fmod, fmod)
+#endif
+
+#if defined(EIGEN_GPUCC)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float fmod(const float& a, const float& b) {
+  return ::fmodf(a, b);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double fmod(const double& a, const double& b) {
+  return ::fmod(a, b);
+}
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_BINARY
+#undef SYCL_SPECIALIZE_SIGNED_INTEGER_TYPES_UNARY
+#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_BINARY
+#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
+#undef SYCL_SPECIALIZE_INTEGER_TYPES_BINARY
+#undef SYCL_SPECIALIZE_UNSIGNED_INTEGER_TYPES_UNARY
+#undef SYCL_SPECIALIZE_FLOATING_TYPES_BINARY
+#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY
+#undef SYCL_SPECIALIZE_FLOATING_TYPES_UNARY_FUNC_RET_TYPE
+#undef SYCL_SPECIALIZE_GEN_UNARY_FUNC
+#undef SYCL_SPECIALIZE_UNARY_FUNC
+#undef SYCL_SPECIALIZE_GEN1_BINARY_FUNC
+#undef SYCL_SPECIALIZE_GEN2_BINARY_FUNC
+#undef SYCL_SPECIALIZE_BINARY_FUNC
+#endif
+
+template <typename Scalar, typename Enable = std::enable_if_t<std::is_integral<Scalar>::value>>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar logical_shift_left(const Scalar& a, int n) {
+  return a << n;
+}
+
+template <typename Scalar, typename Enable = std::enable_if_t<std::is_integral<Scalar>::value>>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar logical_shift_right(const Scalar& a, int n) {
+  using UnsignedScalar = typename numext::get_integer_by_size<sizeof(Scalar)>::unsigned_type;
+  return bit_cast<Scalar, UnsignedScalar>(bit_cast<UnsignedScalar, Scalar>(a) >> n);
+}
+
+template <typename Scalar, typename Enable = std::enable_if_t<std::is_integral<Scalar>::value>>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar arithmetic_shift_right(const Scalar& a, int n) {
+  using SignedScalar = typename numext::get_integer_by_size<sizeof(Scalar)>::signed_type;
+  return bit_cast<Scalar, SignedScalar>(bit_cast<SignedScalar, Scalar>(a) >> n);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar fma(const Scalar& x, const Scalar& y, const Scalar& z) {
+  return internal::fma_impl<Scalar>::run(x, y, z);
+}
+
+// Multiply-add.
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar madd(const Scalar& x, const Scalar& y, const Scalar& z) {
+  return internal::madd_impl<Scalar>::run(x, y, z);
+}
+
+}  // end namespace numext
+
+namespace internal {
+
+template <typename T>
+EIGEN_DEVICE_FUNC bool isfinite_impl(const std::complex<T>& x) {
+  return (numext::isfinite)(numext::real(x)) && (numext::isfinite)(numext::imag(x));
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC bool isnan_impl(const std::complex<T>& x) {
+  return (numext::isnan)(numext::real(x)) || (numext::isnan)(numext::imag(x));
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC bool isinf_impl(const std::complex<T>& x) {
+  return ((numext::isinf)(numext::real(x)) || (numext::isinf)(numext::imag(x))) && (!(numext::isnan)(x));
+}
+
+/****************************************************************************
+ * Implementation of fuzzy comparisons                                       *
+ ****************************************************************************/
+
+template <typename Scalar, bool IsComplex, bool IsInteger>
+struct scalar_fuzzy_default_impl {};
+
+template <typename Scalar>
+struct scalar_fuzzy_default_impl<Scalar, false, false> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  template <typename OtherScalar>
+  EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y,
+                                                         const RealScalar& prec) {
+    return numext::abs(x) <= numext::abs(y) * prec;
+  }
+  EIGEN_DEVICE_FUNC static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec) {
+    return numext::abs(x - y) <= numext::mini(numext::abs(x), numext::abs(y)) * prec;
+  }
+  EIGEN_DEVICE_FUNC static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar& prec) {
+    return x <= y || isApprox(x, y, prec);
+  }
+};
+
+template <typename Scalar>
+struct scalar_fuzzy_default_impl<Scalar, false, true> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  template <typename OtherScalar>
+  EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(const Scalar& x, const Scalar&, const RealScalar&) {
+    return x == Scalar(0);
+  }
+  EIGEN_DEVICE_FUNC static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar&) { return x == y; }
+  EIGEN_DEVICE_FUNC static inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y, const RealScalar&) {
+    return x <= y;
+  }
+};
+
+template <typename Scalar>
+struct scalar_fuzzy_default_impl<Scalar, true, false> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  template <typename OtherScalar>
+  EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(const Scalar& x, const OtherScalar& y,
+                                                         const RealScalar& prec) {
+    return numext::abs2(x) <= numext::abs2(y) * prec * prec;
+  }
+  EIGEN_DEVICE_FUNC static inline bool isApprox(const Scalar& x, const Scalar& y, const RealScalar& prec) {
+    return numext::abs2(x - y) <= numext::mini(numext::abs2(x), numext::abs2(y)) * prec * prec;
+  }
+};
+
+template <typename Scalar>
+struct scalar_fuzzy_impl
+    : scalar_fuzzy_default_impl<Scalar, NumTraits<Scalar>::IsComplex, NumTraits<Scalar>::IsInteger> {};
+
+template <typename Scalar, typename OtherScalar>
+EIGEN_DEVICE_FUNC inline bool isMuchSmallerThan(
+    const Scalar& x, const OtherScalar& y,
+    const typename NumTraits<Scalar>::Real& precision = NumTraits<Scalar>::dummy_precision()) {
+  return scalar_fuzzy_impl<Scalar>::template isMuchSmallerThan<OtherScalar>(x, y, precision);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline bool isApprox(
+    const Scalar& x, const Scalar& y,
+    const typename NumTraits<Scalar>::Real& precision = NumTraits<Scalar>::dummy_precision()) {
   return scalar_fuzzy_impl<Scalar>::isApprox(x, y, precision);
 }
 
-template<typename Scalar>
-inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y,
-                                    typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
-{
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline bool isApproxOrLessThan(
+    const Scalar& x, const Scalar& y,
+    const typename NumTraits<Scalar>::Real& precision = NumTraits<Scalar>::dummy_precision()) {
   return scalar_fuzzy_impl<Scalar>::isApproxOrLessThan(x, y, precision);
 }
 
@@ -730,39 +2034,70 @@ inline bool isApproxOrLessThan(const Scalar& x, const Scalar& y,
 ***  The special case of the  bool type ***
 ******************************************/
 
-template<> struct random_impl<bool>
-{
-  static inline bool run()
-  {
-    return random<int>(0,1)==0 ? false : true;
-  }
-};
-
-template<> struct scalar_fuzzy_impl<bool>
-{
+template <>
+struct scalar_fuzzy_impl<bool> {
   typedef bool RealScalar;
-  
-  template<typename OtherScalar>
-  static inline bool isMuchSmallerThan(const bool& x, const bool&, const bool&)
-  {
+
+  template <typename OtherScalar>
+  EIGEN_DEVICE_FUNC static inline bool isMuchSmallerThan(const bool& x, const bool&, const bool&) {
     return !x;
   }
-  
-  static inline bool isApprox(bool x, bool y, bool)
-  {
-    return x == y;
-  }
 
-  static inline bool isApproxOrLessThan(const bool& x, const bool& y, const bool&)
-  {
+  EIGEN_DEVICE_FUNC static inline bool isApprox(bool x, bool y, bool) { return x == y; }
+
+  EIGEN_DEVICE_FUNC static inline bool isApproxOrLessThan(const bool& x, const bool& y, const bool&) {
     return (!x) || y;
   }
-  
 };
 
-  
-} // end namespace internal
+}  // end namespace internal
+
+// Default implementations that rely on other numext implementations
+namespace internal {
+
+// Specialization for complex types that are not supported by std::expm1.
+template <typename RealScalar>
+struct expm1_impl<std::complex<RealScalar>> {
+  EIGEN_STATIC_ASSERT_NON_INTEGER(RealScalar)
+
+  EIGEN_DEVICE_FUNC static inline std::complex<RealScalar> run(const std::complex<RealScalar>& x) {
+    RealScalar xr = x.real();
+    RealScalar xi = x.imag();
+    // expm1(z) = exp(z) - 1
+    //          = exp(x +  i * y) - 1
+    //          = exp(x) * (cos(y) + i * sin(y)) - 1
+    //          = exp(x) * cos(y) - 1 + i * exp(x) * sin(y)
+    // Imag(expm1(z)) = exp(x) * sin(y)
+    // Real(expm1(z)) = exp(x) * cos(y) - 1
+    //          = exp(x) * cos(y) - 1.
+    //          = expm1(x) + exp(x) * (cos(y) - 1)
+    //          = expm1(x) + exp(x) * (2 * sin(y / 2) ** 2)
+    RealScalar erm1 = numext::expm1<RealScalar>(xr);
+    RealScalar er = erm1 + RealScalar(1.);
+    RealScalar sin2 = numext::sin(xi / RealScalar(2.));
+    sin2 = sin2 * sin2;
+    RealScalar s = numext::sin(xi);
+    RealScalar real_part = erm1 - RealScalar(2.) * er * sin2;
+    return std::complex<RealScalar>(real_part, er * s);
+  }
+};
+
+template <typename T>
+struct rsqrt_impl {
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE T run(const T& x) { return T(1) / numext::sqrt(x); }
+};
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+template <typename T>
+struct conj_impl<std::complex<T>, true> {
+  EIGEN_DEVICE_FUNC static inline std::complex<T> run(const std::complex<T>& x) {
+    return std::complex<T>(numext::real(x), -numext::imag(x));
+  }
+};
+#endif
+
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_MATHFUNCTIONS_H
+#endif  // EIGEN_MATHFUNCTIONS_H
diff --git a/inst/include/Eigen/src/Core/MathFunctionsImpl.h b/inst/include/Eigen/src/Core/MathFunctionsImpl.h
new file mode 100644
index 00000000..c4b5da3c
--- /dev/null
+++ b/inst/include/Eigen/src/Core/MathFunctionsImpl.h
@@ -0,0 +1,263 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATHFUNCTIONSIMPL_H
+#define EIGEN_MATHFUNCTIONSIMPL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal Fast reciprocal using Newton-Raphson's method.
+
+ Preconditions:
+   1. The starting guess provided in approx_a_recip must have at least half
+      the leading mantissa bits in the correct result, such that a single
+      Newton-Raphson step is sufficient to get within 1-2 ulps of the correct
+      result.
+   2. If a is zero, approx_a_recip must be infinite with the same sign as a.
+   3. If a is infinite, approx_a_recip must be zero with the same sign as a.
+
+   If the preconditions are satisfied, which they are for the _*_rcp_ps
+   instructions on x86, the result has a maximum relative error of 2 ulps,
+   and correctly handles reciprocals of zero, infinity, and NaN.
+*/
+template <typename Packet, int Steps>
+struct generic_reciprocal_newton_step {
+  static_assert(Steps > 0, "Steps must be at least 1.");
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(const Packet& a, const Packet& approx_a_recip) {
+    using Scalar = typename unpacket_traits<Packet>::type;
+    const Packet two = pset1<Packet>(Scalar(2));
+    // Refine the approximation using one Newton-Raphson step:
+    //   x_{i} = x_{i-1} * (2 - a * x_{i-1})
+    const Packet x = generic_reciprocal_newton_step<Packet, Steps - 1>::run(a, approx_a_recip);
+    const Packet tmp = pnmadd(a, x, two);
+    // If tmp is NaN, it means that a is either +/-0 or +/-Inf.
+    // In this case return the approximation directly.
+    const Packet is_not_nan = pcmp_eq(tmp, tmp);
+    return pselect(is_not_nan, pmul(x, tmp), x);
+  }
+};
+
+template <typename Packet>
+struct generic_reciprocal_newton_step<Packet, 0> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(const Packet& /*unused*/, const Packet& approx_rsqrt) {
+    return approx_rsqrt;
+  }
+};
+
+/** \internal Fast reciprocal sqrt using Newton-Raphson's method.
+
+ Preconditions:
+   1. The starting guess provided in approx_a_recip must have at least half
+      the leading mantissa bits in the correct result, such that a single
+      Newton-Raphson step is sufficient to get within 1-2 ulps of the correct
+      result.
+   2. If a is zero, approx_a_recip must be infinite with the same sign as a.
+   3. If a is infinite, approx_a_recip must be zero with the same sign as a.
+
+   If the preconditions are satisfied, which they are for the _*_rcp_ps
+   instructions on x86, the result has a maximum relative error of 2 ulps,
+   and correctly handles zero, infinity, and NaN. Positive denormals are
+   treated as zero.
+*/
+template <typename Packet, int Steps>
+struct generic_rsqrt_newton_step {
+  static_assert(Steps > 0, "Steps must be at least 1.");
+  using Scalar = typename unpacket_traits<Packet>::type;
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(const Packet& a, const Packet& approx_rsqrt) {
+    const Scalar kMinusHalf = Scalar(-1) / Scalar(2);
+    const Packet cst_minus_half = pset1<Packet>(kMinusHalf);
+    const Packet cst_minus_one = pset1<Packet>(Scalar(-1));
+
+    Packet inv_sqrt = approx_rsqrt;
+    for (int step = 0; step < Steps; ++step) {
+      // Refine the approximation using one Newton-Raphson step:
+      // h_n = (x * inv_sqrt) * inv_sqrt - 1 (so that h_n is nearly 0).
+      // inv_sqrt = inv_sqrt - 0.5 * inv_sqrt * h_n
+      Packet r2 = pmul(a, inv_sqrt);
+      Packet half_r = pmul(inv_sqrt, cst_minus_half);
+      Packet h_n = pmadd(r2, inv_sqrt, cst_minus_one);
+      inv_sqrt = pmadd(half_r, h_n, inv_sqrt);
+    }
+
+    // If x is NaN, then either:
+    // 1) the input is NaN
+    // 2) zero and infinity were multiplied
+    // In either of these cases, return approx_rsqrt
+    return pselect(pisnan(inv_sqrt), approx_rsqrt, inv_sqrt);
+  }
+};
+
+template <typename Packet>
+struct generic_rsqrt_newton_step<Packet, 0> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(const Packet& /*unused*/, const Packet& approx_rsqrt) {
+    return approx_rsqrt;
+  }
+};
+
+/** \internal Fast sqrt using Newton-Raphson's method.
+
+ Preconditions:
+   1. The starting guess for the reciprocal sqrt provided in approx_rsqrt must
+      have at least half the leading mantissa bits in the correct result, such
+      that a single Newton-Raphson step is sufficient to get within 1-2 ulps of
+      the correct result.
+   2. If a is zero, approx_rsqrt must be infinite.
+   3. If a is infinite, approx_rsqrt must be zero.
+
+   If the preconditions are satisfied, which they are for the _*_rsqrt_ps
+   instructions on x86, the result has a maximum relative error of 2 ulps,
+   and correctly handles zero and infinity, and NaN. Positive denormal inputs
+   are treated as zero.
+*/
+template <typename Packet, int Steps = 1>
+struct generic_sqrt_newton_step {
+  static_assert(Steps > 0, "Steps must be at least 1.");
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(const Packet& a, const Packet& approx_rsqrt) {
+    using Scalar = typename unpacket_traits<Packet>::type;
+    const Packet one_point_five = pset1<Packet>(Scalar(1.5));
+    const Packet minus_half = pset1<Packet>(Scalar(-0.5));
+    // If a is inf or zero, return a directly.
+    const Packet inf_mask = pcmp_eq(a, pset1<Packet>(NumTraits<Scalar>::infinity()));
+    const Packet return_a = por(pcmp_eq(a, pzero(a)), inf_mask);
+    // Do a single step of Newton's iteration for reciprocal square root:
+    //   x_{n+1} = x_n * (1.5 + (-0.5 * x_n) * (a * x_n))).
+    // The Newton's step is computed this way to avoid over/under-flows.
+    Packet rsqrt = pmul(approx_rsqrt, pmadd(pmul(minus_half, approx_rsqrt), pmul(a, approx_rsqrt), one_point_five));
+    for (int step = 1; step < Steps; ++step) {
+      rsqrt = pmul(rsqrt, pmadd(pmul(minus_half, rsqrt), pmul(a, rsqrt), one_point_five));
+    }
+
+    // Return sqrt(x) = x * rsqrt(x) for non-zero finite positive arguments.
+    // Return a itself for 0 or +inf, NaN for negative arguments.
+    return pselect(return_a, a, pmul(a, rsqrt));
+  }
+};
+
+template <typename RealScalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y) {
+  // IEEE IEC 6059 special cases.
+  if ((numext::isinf)(x) || (numext::isinf)(y)) return NumTraits<RealScalar>::infinity();
+  if ((numext::isnan)(x) || (numext::isnan)(y)) return NumTraits<RealScalar>::quiet_NaN();
+
+  EIGEN_USING_STD(sqrt);
+  RealScalar p, qp;
+  p = numext::maxi(x, y);
+  if (numext::is_exactly_zero(p)) return RealScalar(0);
+  qp = numext::mini(y, x) / p;
+  return p * sqrt(RealScalar(1) + qp * qp);
+}
+
+template <typename Scalar>
+struct hypot_impl {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static EIGEN_DEVICE_FUNC inline RealScalar run(const Scalar& x, const Scalar& y) {
+    EIGEN_USING_STD(abs);
+    return positive_real_hypot<RealScalar>(abs(x), abs(y));
+  }
+};
+
+// Generic complex sqrt implementation that correctly handles corner cases
+// according to https://en.cppreference.com/w/cpp/numeric/complex/sqrt
+template <typename ComplexT>
+EIGEN_DEVICE_FUNC ComplexT complex_sqrt(const ComplexT& z) {
+  // Computes the principal sqrt of the input.
+  //
+  // For a complex square root of the number x + i*y. We want to find real
+  // numbers u and v such that
+  //    (u + i*v)^2 = x + i*y  <=>
+  //    u^2 - v^2 + i*2*u*v = x + i*v.
+  // By equating the real and imaginary parts we get:
+  //    u^2 - v^2 = x
+  //    2*u*v = y.
+  //
+  // For x >= 0, this has the numerically stable solution
+  //    u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
+  //    v = y / (2 * u)
+  // and for x < 0,
+  //    v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
+  //    u = y / (2 * v)
+  //
+  // Letting w = sqrt(0.5 * (|x| + |z|)),
+  //   if x == 0: u = w, v = sign(y) * w
+  //   if x > 0:  u = w, v = y / (2 * w)
+  //   if x < 0:  u = |y| / (2 * w), v = sign(y) * w
+  using T = typename NumTraits<ComplexT>::Real;
+  const T x = numext::real(z);
+  const T y = numext::imag(z);
+  const T zero = T(0);
+  const T w = numext::sqrt(T(0.5) * (numext::abs(x) + numext::hypot(x, y)));
+
+  return (numext::isinf)(y)           ? ComplexT(NumTraits<T>::infinity(), y)
+         : numext::is_exactly_zero(x) ? ComplexT(w, y < zero ? -w : w)
+         : x > zero                   ? ComplexT(w, y / (2 * w))
+                                      : ComplexT(numext::abs(y) / (2 * w), y < zero ? -w : w);
+}
+
+// Generic complex rsqrt implementation.
+template <typename ComplexT>
+EIGEN_DEVICE_FUNC ComplexT complex_rsqrt(const ComplexT& z) {
+  // Computes the principal reciprocal sqrt of the input.
+  //
+  // For a complex reciprocal square root of the number z = x + i*y. We want to
+  // find real numbers u and v such that
+  //    (u + i*v)^2 = 1 / (x + i*y)  <=>
+  //    u^2 - v^2 + i*2*u*v = x/|z|^2 - i*v/|z|^2.
+  // By equating the real and imaginary parts we get:
+  //    u^2 - v^2 = x/|z|^2
+  //    2*u*v = y/|z|^2.
+  //
+  // For x >= 0, this has the numerically stable solution
+  //    u = sqrt(0.5 * (x + |z|)) / |z|
+  //    v = -y / (2 * u * |z|)
+  // and for x < 0,
+  //    v = -sign(y) * sqrt(0.5 * (-x + |z|)) / |z|
+  //    u = -y / (2 * v * |z|)
+  //
+  // Letting w = sqrt(0.5 * (|x| + |z|)),
+  //   if x == 0: u = w / |z|, v = -sign(y) * w / |z|
+  //   if x > 0:  u = w / |z|, v = -y / (2 * w * |z|)
+  //   if x < 0:  u = |y| / (2 * w * |z|), v = -sign(y) * w / |z|
+  using T = typename NumTraits<ComplexT>::Real;
+  const T x = numext::real(z);
+  const T y = numext::imag(z);
+  const T zero = T(0);
+
+  const T abs_z = numext::hypot(x, y);
+  const T w = numext::sqrt(T(0.5) * (numext::abs(x) + abs_z));
+  const T woz = w / abs_z;
+  // Corner cases consistent with 1/sqrt(z) on gcc/clang.
+  return numext::is_exactly_zero(abs_z)               ? ComplexT(NumTraits<T>::infinity(), NumTraits<T>::quiet_NaN())
+         : ((numext::isinf)(x) || (numext::isinf)(y)) ? ComplexT(zero, zero)
+         : numext::is_exactly_zero(x)                 ? ComplexT(woz, y < zero ? woz : -woz)
+         : x > zero                                   ? ComplexT(woz, -y / (2 * w * abs_z))
+                    : ComplexT(numext::abs(y) / (2 * w * abs_z), y < zero ? woz : -woz);
+}
+
+template <typename ComplexT>
+EIGEN_DEVICE_FUNC ComplexT complex_log(const ComplexT& z) {
+  // Computes complex log.
+  using T = typename NumTraits<ComplexT>::Real;
+  T a = numext::abs(z);
+  EIGEN_USING_STD(atan2);
+  T b = atan2(z.imag(), z.real());
+  return ComplexT(numext::log(a), b);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATHFUNCTIONSIMPL_H
diff --git a/inst/include/Eigen/src/Core/Matrix.h b/inst/include/Eigen/src/Core/Matrix.h
index 02be142d..a2c8eba5 100644
--- a/inst/include/Eigen/src/Core/Matrix.h
+++ b/inst/include/Eigen/src/Core/Matrix.h
@@ -11,410 +11,524 @@
 #ifndef EIGEN_MATRIX_H
 #define EIGEN_MATRIX_H
 
-namespace Eigen {
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-/** \class Matrix
-  * \ingroup Core_Module
-  *
-  * \brief The matrix class, also used for vectors and row-vectors
-  *
-  * The %Matrix class is the work-horse for all \em dense (\ref dense "note") matrices and vectors within Eigen.
-  * Vectors are matrices with one column, and row-vectors are matrices with one row.
-  *
-  * The %Matrix class encompasses \em both fixed-size and dynamic-size objects (\ref fixedsize "note").
-  *
-  * The first three template parameters are required:
-  * \tparam _Scalar \anchor matrix_tparam_scalar Numeric type, e.g. float, double, int or std::complex<float>.
-  *                 User defined sclar types are supported as well (see \ref user_defined_scalars "here").
-  * \tparam _Rows Number of rows, or \b Dynamic
-  * \tparam _Cols Number of columns, or \b Dynamic
-  *
-  * The remaining template parameters are optional -- in most cases you don't have to worry about them.
-  * \tparam _Options \anchor matrix_tparam_options A combination of either \b #RowMajor or \b #ColMajor, and of either
-  *                 \b #AutoAlign or \b #DontAlign.
-  *                 The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter controls alignment, which is required
-  *                 for vectorization. It defaults to aligning matrices except for fixed sizes that aren't a multiple of the packet size.
-  * \tparam _MaxRows Maximum number of rows. Defaults to \a _Rows (\ref maxrows "note").
-  * \tparam _MaxCols Maximum number of columns. Defaults to \a _Cols (\ref maxrows "note").
-  *
-  * Eigen provides a number of typedefs covering the usual cases. Here are some examples:
-  *
-  * \li \c Matrix2d is a 2x2 square matrix of doubles (\c Matrix<double, 2, 2>)
-  * \li \c Vector4f is a vector of 4 floats (\c Matrix<float, 4, 1>)
-  * \li \c RowVector3i is a row-vector of 3 ints (\c Matrix<int, 1, 3>)
-  *
-  * \li \c MatrixXf is a dynamic-size matrix of floats (\c Matrix<float, Dynamic, Dynamic>)
-  * \li \c VectorXf is a dynamic-size vector of floats (\c Matrix<float, Dynamic, 1>)
-  *
-  * \li \c Matrix2Xf is a partially fixed-size (dynamic-size) matrix of floats (\c Matrix<float, 2, Dynamic>)
-  * \li \c MatrixX3d is a partially dynamic-size (fixed-size) matrix of double (\c Matrix<double, Dynamic, 3>)
-  *
-  * See \link matrixtypedefs this page \endlink for a complete list of predefined \em %Matrix and \em Vector typedefs.
-  *
-  * You can access elements of vectors and matrices using normal subscripting:
-  *
-  * \code
-  * Eigen::VectorXd v(10);
-  * v[0] = 0.1;
-  * v[1] = 0.2;
-  * v(0) = 0.3;
-  * v(1) = 0.4;
-  *
-  * Eigen::MatrixXi m(10, 10);
-  * m(0, 1) = 1;
-  * m(0, 2) = 2;
-  * m(0, 3) = 3;
-  * \endcode
-  *
-  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_MATRIX_PLUGIN.
-  *
-  * <i><b>Some notes:</b></i>
-  *
-  * <dl>
-  * <dt><b>\anchor dense Dense versus sparse:</b></dt>
-  * <dd>This %Matrix class handles dense, not sparse matrices and vectors. For sparse matrices and vectors, see the Sparse module.
-  *
-  * Dense matrices and vectors are plain usual arrays of coefficients. All the coefficients are stored, in an ordinary contiguous array.
-  * This is unlike Sparse matrices and vectors where the coefficients are stored as a list of nonzero coefficients.</dd>
-  *
-  * <dt><b>\anchor fixedsize Fixed-size versus dynamic-size:</b></dt>
-  * <dd>Fixed-size means that the numbers of rows and columns are known are compile-time. In this case, Eigen allocates the array
-  * of coefficients as a fixed-size array, as a class member. This makes sense for very small matrices, typically up to 4x4, sometimes up
-  * to 16x16. Larger matrices should be declared as dynamic-size even if one happens to know their size at compile-time.
-  *
-  * Dynamic-size means that the numbers of rows or columns are not necessarily known at compile-time. In this case they are runtime
-  * variables, and the array of coefficients is allocated dynamically on the heap.
-  *
-  * Note that \em dense matrices, be they Fixed-size or Dynamic-size, <em>do not</em> expand dynamically in the sense of a std::map.
-  * If you want this behavior, see the Sparse module.</dd>
-  *
-  * <dt><b>\anchor maxrows _MaxRows and _MaxCols:</b></dt>
-  * <dd>In most cases, one just leaves these parameters to the default values.
-  * These parameters mean the maximum size of rows and columns that the matrix may have. They are useful in cases
-  * when the exact numbers of rows and columns are not known are compile-time, but it is known at compile-time that they cannot
-  * exceed a certain value. This happens when taking dynamic-size blocks inside fixed-size matrices: in this case _MaxRows and _MaxCols
-  * are the dimensions of the original matrix, while _Rows and _Cols are Dynamic.</dd>
-  * </dl>
-  *
-  * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy, 
-  * \ref TopicStorageOrders 
-  */
+namespace Eigen {
 
 namespace internal {
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct traits<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-{
-  typedef _Scalar Scalar;
+template <typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
+struct traits<Matrix<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>> {
+ private:
+  constexpr static int size = internal::size_at_compile_time(Rows_, Cols_);
+  typedef typename find_best_packet<Scalar_, size>::type PacketScalar;
+  enum {
+    row_major_bit = Options_ & RowMajor ? RowMajorBit : 0,
+    is_dynamic_size_storage = MaxRows_ == Dynamic || MaxCols_ == Dynamic,
+    max_size = is_dynamic_size_storage ? Dynamic : MaxRows_ * MaxCols_,
+    default_alignment = compute_default_alignment<Scalar_, max_size>::value,
+    actual_alignment = ((Options_ & DontAlign) == 0) ? default_alignment : 0,
+    required_alignment = unpacket_traits<PacketScalar>::alignment,
+    packet_access_bit = (packet_traits<Scalar_>::Vectorizable &&
+                         (EIGEN_UNALIGNED_VECTORIZE || (int(actual_alignment) >= int(required_alignment))))
+                            ? PacketAccessBit
+                            : 0
+  };
+
+ public:
+  typedef Scalar_ Scalar;
   typedef Dense StorageKind;
-  typedef DenseIndex Index;
+  typedef Eigen::Index StorageIndex;
   typedef MatrixXpr XprKind;
   enum {
-    RowsAtCompileTime = _Rows,
-    ColsAtCompileTime = _Cols,
-    MaxRowsAtCompileTime = _MaxRows,
-    MaxColsAtCompileTime = _MaxCols,
-    Flags = compute_matrix_flags<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>::ret,
-    CoeffReadCost = NumTraits<Scalar>::ReadCost,
-    Options = _Options,
+    RowsAtCompileTime = Rows_,
+    ColsAtCompileTime = Cols_,
+    MaxRowsAtCompileTime = MaxRows_,
+    MaxColsAtCompileTime = MaxCols_,
+    Flags = compute_matrix_flags(Options_),
+    Options = Options_,
     InnerStrideAtCompileTime = 1,
-    OuterStrideAtCompileTime = (Options&RowMajor) ? ColsAtCompileTime : RowsAtCompileTime
+    OuterStrideAtCompileTime = (int(Options) & int(RowMajor)) ? ColsAtCompileTime : RowsAtCompileTime,
+
+    // FIXME, the following flag in only used to define NeedsToAlign in PlainObjectBase
+    EvaluatorFlags = LinearAccessBit | DirectAccessBit | packet_access_bit | row_major_bit,
+    Alignment = actual_alignment
   };
 };
-}
-
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-class Matrix
-  : public PlainObjectBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-{
-  public:
-
-    /** \brief Base class typedef.
-      * \sa PlainObjectBase
-      */
-    typedef PlainObjectBase<Matrix> Base;
-
-    enum { Options = _Options };
-
-    EIGEN_DENSE_PUBLIC_INTERFACE(Matrix)
-
-    typedef typename Base::PlainObject PlainObject;
-
-    using Base::base;
-    using Base::coeffRef;
-
-    /**
-      * \brief Assigns matrices to each other.
-      *
-      * \note This is a special case of the templated operator=. Its purpose is
-      * to prevent a default operator= from hiding the templated operator=.
-      *
-      * \callgraph
-      */
-    EIGEN_STRONG_INLINE Matrix& operator=(const Matrix& other)
-    {
-      return Base::_set(other);
-    }
-
-    /** \internal
-      * \brief Copies the value of the expression \a other into \c *this with automatic resizing.
-      *
-      * *this might be resized to match the dimensions of \a other. If *this was a null matrix (not already initialized),
-      * it will be initialized.
-      *
-      * Note that copying a row-vector into a vector (and conversely) is allowed.
-      * The resizing, if any, is then done in the appropriate way so that row-vectors
-      * remain row-vectors and vectors remain vectors.
-      */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Matrix& operator=(const MatrixBase<OtherDerived>& other)
-    {
-      return Base::_set(other);
-    }
-
-    /* Here, doxygen failed to copy the brief information when using \copydoc */
-
-    /**
-      * \brief Copies the generic expression \a other into *this.
-      * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
-      */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Matrix& operator=(const EigenBase<OtherDerived> &other)
-    {
-      return Base::operator=(other);
-    }
-
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Matrix& operator=(const ReturnByValue<OtherDerived>& func)
-    {
-      return Base::operator=(func);
-    }
-
-    /** \brief Default constructor.
-      *
-      * For fixed-size matrices, does nothing.
-      *
-      * For dynamic-size matrices, creates an empty matrix of size 0. Does not allocate any array. Such a matrix
-      * is called a null matrix. This constructor is the unique way to create null matrices: resizing
-      * a matrix to 0 is not supported.
-      *
-      * \sa resize(Index,Index)
-      */
-    EIGEN_STRONG_INLINE Matrix() : Base()
-    {
-      Base::_check_template_params();
-      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
-    }
-
-    // FIXME is it still needed
-    Matrix(internal::constructor_without_unaligned_array_assert)
-      : Base(internal::constructor_without_unaligned_array_assert())
-    { Base::_check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
-
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
-    Matrix(Matrix&& other)
-      : Base(std::move(other))
-    {
-      Base::_check_template_params();
-      if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic)
-        Base::_set_noalias(other);
-    }
-    Matrix& operator=(Matrix&& other)
-    {
-      other.swap(*this);
-      return *this;
-    }
+}  // namespace internal
+
+/** \class Matrix
+ * \ingroup Core_Module
+ *
+ * \brief The matrix class, also used for vectors and row-vectors
+ *
+ * The %Matrix class is the work-horse for all \em dense (\ref dense "note") matrices and vectors within Eigen.
+ * Vectors are matrices with one column, and row-vectors are matrices with one row.
+ *
+ * The %Matrix class encompasses \em both fixed-size and dynamic-size objects (\ref fixedsize "note").
+ *
+ * The first three template parameters are required:
+ * \tparam Scalar_ Numeric type, e.g. float, double, int or std::complex<float>.
+ *                 User defined scalar types are supported as well (see \ref user_defined_scalars "here").
+ * \tparam Rows_ Number of rows, or \b Dynamic
+ * \tparam Cols_ Number of columns, or \b Dynamic
+ *
+ * The remaining template parameters are optional -- in most cases you don't have to worry about them.
+ * \tparam Options_ A combination of either \b #RowMajor or \b #ColMajor, and of either
+ *                 \b #AutoAlign or \b #DontAlign.
+ *                 The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter
+ * controls alignment, which is required for vectorization. It defaults to aligning matrices except for fixed sizes that
+ * aren't a multiple of the packet size. \tparam MaxRows_ Maximum number of rows. Defaults to \a Rows_ (\ref maxrows
+ * "note"). \tparam MaxCols_ Maximum number of columns. Defaults to \a Cols_ (\ref maxrows "note").
+ *
+ * Eigen provides a number of typedefs covering the usual cases. Here are some examples:
+ *
+ * \li \c Matrix2d is a 2x2 square matrix of doubles (\c Matrix<double, 2, 2>)
+ * \li \c Vector4f is a vector of 4 floats (\c Matrix<float, 4, 1>)
+ * \li \c RowVector3i is a row-vector of 3 ints (\c Matrix<int, 1, 3>)
+ *
+ * \li \c MatrixXf is a dynamic-size matrix of floats (\c Matrix<float, Dynamic, Dynamic>)
+ * \li \c VectorXf is a dynamic-size vector of floats (\c Matrix<float, Dynamic, 1>)
+ *
+ * \li \c Matrix2Xf is a partially fixed-size (dynamic-size) matrix of floats (\c Matrix<float, 2, Dynamic>)
+ * \li \c MatrixX3d is a partially dynamic-size (fixed-size) matrix of double (\c Matrix<double, Dynamic, 3>)
+ *
+ * See \link matrixtypedefs this page \endlink for a complete list of predefined \em %Matrix and \em Vector typedefs.
+ *
+ * You can access elements of vectors and matrices using normal subscripting:
+ *
+ * \code
+ * Eigen::VectorXd v(10);
+ * v[0] = 0.1;
+ * v[1] = 0.2;
+ * v(0) = 0.3;
+ * v(1) = 0.4;
+ *
+ * Eigen::MatrixXi m(10, 10);
+ * m(0, 1) = 1;
+ * m(0, 2) = 2;
+ * m(0, 3) = 3;
+ * \endcode
+ *
+ * This class can be extended with the help of the plugin mechanism described on the page
+ * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_MATRIX_PLUGIN.
+ *
+ * <i><b>Some notes:</b></i>
+ *
+ * <dl>
+ * <dt><b>\anchor dense Dense versus sparse:</b></dt>
+ * <dd>This %Matrix class handles dense, not sparse matrices and vectors. For sparse matrices and vectors, see the
+ * Sparse module.
+ *
+ * Dense matrices and vectors are plain usual arrays of coefficients. All the coefficients are stored, in an ordinary
+ * contiguous array. This is unlike Sparse matrices and vectors where the coefficients are stored as a list of nonzero
+ * coefficients.</dd>
+ *
+ * <dt><b>\anchor fixedsize Fixed-size versus dynamic-size:</b></dt>
+ * <dd>Fixed-size means that the numbers of rows and columns are known at compile-time. In this case, Eigen allocates
+ * the array of coefficients as a fixed-size array, as a class member. This makes sense for very small matrices,
+ * typically up to 4x4, sometimes up to 16x16. Larger matrices should be declared as dynamic-size even if one happens to
+ * know their size at compile-time.
+ *
+ * Dynamic-size means that the numbers of rows or columns are not necessarily known at compile-time. In this case they
+ * are runtime variables, and the array of coefficients is allocated dynamically on the heap.
+ *
+ * Note that \em dense matrices, be they Fixed-size or Dynamic-size, <em>do not</em> expand dynamically in the sense of
+ * a std::map. If you want this behavior, see the Sparse module.</dd>
+ *
+ * <dt><b>\anchor maxrows MaxRows_ and MaxCols_:</b></dt>
+ * <dd>In most cases, one just leaves these parameters to the default values.
+ * These parameters mean the maximum size of rows and columns that the matrix may have. They are useful in cases
+ * when the exact numbers of rows and columns are not known at compile-time, but it is known at compile-time that they
+ * cannot exceed a certain value. This happens when taking dynamic-size blocks inside fixed-size matrices: in this case
+ * MaxRows_ and MaxCols_ are the dimensions of the original matrix, while Rows_ and Cols_ are Dynamic.</dd>
+ * </dl>
+ *
+ * <i><b>ABI and storage layout</b></i>
+ *
+ * The table below summarizes the ABI of some possible Matrix instances which is fixed thorough the lifetime of Eigen 3.
+ * <table  class="manual">
+ * <tr><th>Matrix type</th><th>Equivalent C structure</th></tr>
+ * <tr><td>\code Matrix<T,Dynamic,Dynamic> \endcode</td><td>\code
+ * struct {
+ *   T *data;                  // with (size_t(data)%EIGEN_MAX_ALIGN_BYTES)==0
+ *   Eigen::Index rows, cols;
+ *  };
+ * \endcode</td></tr>
+ * <tr class="alt"><td>\code
+ * Matrix<T,Dynamic,1>
+ * Matrix<T,1,Dynamic> \endcode</td><td>\code
+ * struct {
+ *   T *data;                  // with (size_t(data)%EIGEN_MAX_ALIGN_BYTES)==0
+ *   Eigen::Index size;
+ *  };
+ * \endcode</td></tr>
+ * <tr><td>\code Matrix<T,Rows,Cols> \endcode</td><td>\code
+ * struct {
+ *   T data[Rows*Cols];        // with (size_t(data)%A(Rows*Cols*sizeof(T)))==0
+ *  };
+ * \endcode</td></tr>
+ * <tr class="alt"><td>\code Matrix<T,Dynamic,Dynamic,0,MaxRows,MaxCols> \endcode</td><td>\code
+ * struct {
+ *   T data[MaxRows*MaxCols];  // with (size_t(data)%A(MaxRows*MaxCols*sizeof(T)))==0
+ *   Eigen::Index rows, cols;
+ *  };
+ * \endcode</td></tr>
+ * </table>
+ * Note that in this table Rows, Cols, MaxRows and MaxCols are all positive integers. A(S) is defined to the largest
+ * possible power-of-two smaller to EIGEN_MAX_STATIC_ALIGN_BYTES.
+ *
+ * \see MatrixBase for the majority of the API methods for matrices, \ref TopicClassHierarchy,
+ * \ref TopicStorageOrders
+ */
+
+template <typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
+class Matrix : public PlainObjectBase<Matrix<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>> {
+ public:
+  /** \brief Base class typedef.
+   * \sa PlainObjectBase
+   */
+  typedef PlainObjectBase<Matrix> Base;
+
+  enum { Options = Options_ };
+
+  EIGEN_DENSE_PUBLIC_INTERFACE(Matrix)
+
+  typedef typename Base::PlainObject PlainObject;
+
+  using Base::base;
+  using Base::coeffRef;
+
+  /**
+   * \brief Assigns matrices to each other.
+   *
+   * \note This is a special case of the templated operator=. Its purpose is
+   * to prevent a default operator= from hiding the templated operator=.
+   *
+   * \callgraph
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix& operator=(const Matrix& other) { return Base::_set(other); }
+
+  /** \internal
+   * \brief Copies the value of the expression \a other into \c *this with automatic resizing.
+   *
+   * *this might be resized to match the dimensions of \a other. If *this was a null matrix (not already initialized),
+   * it will be initialized.
+   *
+   * Note that copying a row-vector into a vector (and conversely) is allowed.
+   * The resizing, if any, is then done in the appropriate way so that row-vectors
+   * remain row-vectors and vectors remain vectors.
+   */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix& operator=(const DenseBase<OtherDerived>& other) {
+    return Base::_set(other);
+  }
+
+  /**
+   * \brief Copies the generic expression \a other into *this.
+   * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
+   */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix& operator=(const EigenBase<OtherDerived>& other) {
+    return Base::operator=(other);
+  }
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix& operator=(const ReturnByValue<OtherDerived>& func) {
+    return Base::operator=(func);
+  }
+
+  /** \brief Default constructor.
+   *
+   * For fixed-size matrices, does nothing.
+   *
+   * For dynamic-size matrices, creates an empty matrix of size 0. Does not allocate any array. Such a matrix
+   * is called a null matrix. This constructor is the unique way to create null matrices: resizing
+   * a matrix to 0 is not supported.
+   *
+   * \sa resize(Index,Index)
+   */
+#if defined(EIGEN_INITIALIZE_COEFFS)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix() { EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED }
+#else
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix() = default;
+#endif
+  /** \brief Move constructor */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix(Matrix&&) = default;
+  /** \brief Moves the matrix into the other one.
+   *
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix& operator=(Matrix&& other) noexcept(
+      std::is_nothrow_move_assignable<Scalar>::value) {
+    Base::operator=(std::move(other));
+    return *this;
+  }
+
+  /** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients.
+   *
+   * \only_for_vectors
+   *
+   * This constructor is for 1D array or vectors with more than 4 coefficients.
+   *
+   * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this
+   * constructor must match the the fixed number of rows (resp. columns) of \c *this.
+   *
+   *
+   * Example: \include Matrix_variadic_ctor_cxx11.cpp
+   * Output: \verbinclude Matrix_variadic_ctor_cxx11.out
+   *
+   * \sa Matrix(const std::initializer_list<std::initializer_list<Scalar>>&)
+   */
+  template <typename... ArgTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2, const Scalar& a3,
+                                               const ArgTypes&... args)
+      : Base(a0, a1, a2, a3, args...) {}
+
+  /** \brief Constructs a Matrix and initializes it from the coefficients given as initializer-lists grouped by row.
+   * \cpp11
+   * \anchor matrix_initializer_list
+   *
+   * In the general case, the constructor takes a list of rows, each row being represented as a list of coefficients:
+   *
+   * Example: \include Matrix_initializer_list_23_cxx11.cpp
+   * Output: \verbinclude Matrix_initializer_list_23_cxx11.out
+   *
+   * Each of the inner initializer lists must contain the exact same number of elements, otherwise an assertion is
+   * triggered.
+   *
+   * In the case of a compile-time column vector, implicit transposition from a single row is allowed.
+   * Therefore <code>VectorXd{{1,2,3,4,5}}</code> is legal and the more verbose syntax
+   * <code>RowVectorXd{{1},{2},{3},{4},{5}}</code> can be avoided:
+   *
+   * Example: \include Matrix_initializer_list_vector_cxx11.cpp
+   * Output: \verbinclude Matrix_initializer_list_vector_cxx11.out
+   *
+   * In the case of fixed-sized matrices, the initializer list sizes must exactly match the matrix sizes,
+   * and implicit transposition is allowed for compile-time vectors only.
+   *
+   * \sa Matrix(const Scalar& a0, const Scalar& a1, const Scalar& a2,  const Scalar& a3, const ArgTypes&... args)
+   */
+  EIGEN_DEVICE_FUNC explicit constexpr EIGEN_STRONG_INLINE Matrix(
+      const std::initializer_list<std::initializer_list<Scalar>>& list)
+      : Base(list) {}
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+  // This constructor is for both 1x1 matrices and dynamic vectors
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Matrix(const T& x) {
+    Base::template _init1<T>(x);
+  }
+
+  template <typename T0, typename T1>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y) {
+    Base::template _init2<T0, T1>(x, y);
+  }
+
+#else
+  /** \brief Constructs a fixed-sized matrix initialized with coefficients starting at \a data */
+  EIGEN_DEVICE_FUNC explicit Matrix(const Scalar* data);
+
+  /** \brief Constructs a vector or row-vector with given dimension. \only_for_vectors
+   *
+   * This is useful for dynamic-size vectors. For fixed-size vectors,
+   * it is redundant to pass these parameters, so one should use the default constructor
+   * Matrix() instead.
+   *
+   * \warning This constructor is disabled for fixed-size \c 1x1 matrices. For instance,
+   * calling Matrix<double,1,1>(1) will call the initialization constructor: Matrix(const Scalar&).
+   * For fixed-size \c 1x1 matrices it is therefore recommended to use the default
+   * constructor Matrix() instead, especially when using one of the non standard
+   * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
+   */
+  EIGEN_STRONG_INLINE explicit Matrix(Index dim);
+  /** \brief Constructs an initialized 1x1 matrix with the given coefficient
+   * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...) */
+  Matrix(const Scalar& x);
+  /** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns.
+   *
+   * This is useful for dynamic-size matrices. For fixed-size matrices,
+   * it is redundant to pass these parameters, so one should use the default constructor
+   * Matrix() instead.
+   *
+   * \warning This constructor is disabled for fixed-size \c 1x2 and \c 2x1 vectors. For instance,
+   * calling Matrix2f(2,1) will call the initialization constructor: Matrix(const Scalar& x, const Scalar& y).
+   * For fixed-size \c 1x2 or \c 2x1 vectors it is therefore recommended to use the default
+   * constructor Matrix() instead, especially when using one of the non standard
+   * \c EIGEN_INITIALIZE_MATRICES_BY_{ZERO,\c NAN} macros (see \ref TopicPreprocessorDirectives).
+   */
+  EIGEN_DEVICE_FUNC Matrix(Index rows, Index cols);
+
+  /** \brief Constructs an initialized 2D vector with given coefficients
+   * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...) */
+  Matrix(const Scalar& x, const Scalar& y);
+#endif  // end EIGEN_PARSED_BY_DOXYGEN
+
+  /** \brief Constructs an initialized 3D vector with given coefficients
+   * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...)
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z) {
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Matrix, 3)
+    m_storage.data()[0] = x;
+    m_storage.data()[1] = y;
+    m_storage.data()[2] = z;
+  }
+  /** \brief Constructs an initialized 4D vector with given coefficients
+   * \sa Matrix(const Scalar&, const Scalar&, const Scalar&,  const Scalar&, const ArgTypes&...)
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w) {
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Matrix, 4)
+    m_storage.data()[0] = x;
+    m_storage.data()[1] = y;
+    m_storage.data()[2] = z;
+    m_storage.data()[3] = w;
+  }
+
+  /** \brief Copy constructor */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Matrix(const Matrix&) = default;
+
+  /** \brief Copy constructor for generic expressions.
+   * \sa MatrixBase::operator=(const EigenBase<OtherDerived>&)
+   */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Matrix(const EigenBase<OtherDerived>& other) : Base(other.derived()) {}
+
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return 1; }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return this->innerSize(); }
+
+  /////////// Geometry module ///////////
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC explicit Matrix(const RotationBase<OtherDerived, ColsAtCompileTime>& r);
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC Matrix& operator=(const RotationBase<OtherDerived, ColsAtCompileTime>& r);
+
+// allow to extend Matrix outside Eigen
+#ifdef EIGEN_MATRIX_PLUGIN
+#include EIGEN_MATRIX_PLUGIN
 #endif
 
-    /** \brief Constructs a vector or row-vector with given dimension. \only_for_vectors
-      *
-      * Note that this is only useful for dynamic-size vectors. For fixed-size vectors,
-      * it is redundant to pass the dimension here, so it makes more sense to use the default
-      * constructor Matrix() instead.
-      */
-    EIGEN_STRONG_INLINE explicit Matrix(Index dim)
-      : Base(dim, RowsAtCompileTime == 1 ? 1 : dim, ColsAtCompileTime == 1 ? 1 : dim)
-    {
-      Base::_check_template_params();
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(Matrix)
-      eigen_assert(dim >= 0);
-      eigen_assert(SizeAtCompileTime == Dynamic || SizeAtCompileTime == dim);
-      EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
-    }
-
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename T0, typename T1>
-    EIGEN_STRONG_INLINE Matrix(const T0& x, const T1& y)
-    {
-      Base::_check_template_params();
-      Base::template _init2<T0,T1>(x, y);
-    }
-    #else
-    /** \brief Constructs an uninitialized matrix with \a rows rows and \a cols columns.
-      *
-      * This is useful for dynamic-size matrices. For fixed-size matrices,
-      * it is redundant to pass these parameters, so one should use the default constructor
-      * Matrix() instead. */
-    Matrix(Index rows, Index cols);
-    /** \brief Constructs an initialized 2D vector with given coefficients */
-    Matrix(const Scalar& x, const Scalar& y);
-    #endif
-
-    /** \brief Constructs an initialized 3D vector with given coefficients */
-    EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z)
-    {
-      Base::_check_template_params();
-      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Matrix, 3)
-      m_storage.data()[0] = x;
-      m_storage.data()[1] = y;
-      m_storage.data()[2] = z;
-    }
-    /** \brief Constructs an initialized 4D vector with given coefficients */
-    EIGEN_STRONG_INLINE Matrix(const Scalar& x, const Scalar& y, const Scalar& z, const Scalar& w)
-    {
-      Base::_check_template_params();
-      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Matrix, 4)
-      m_storage.data()[0] = x;
-      m_storage.data()[1] = y;
-      m_storage.data()[2] = z;
-      m_storage.data()[3] = w;
-    }
-
-    explicit Matrix(const Scalar *data);
-
-    /** \brief Constructor copying the value of the expression \a other */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Matrix(const MatrixBase<OtherDerived>& other)
-             : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      // This test resides here, to bring the error messages closer to the user. Normally, these checks
-      // are performed deeply within the library, thus causing long and scary error traces.
-      EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
-    /** \brief Copy constructor */
-    EIGEN_STRONG_INLINE Matrix(const Matrix& other)
-            : Base(other.rows() * other.cols(), other.rows(), other.cols())
-    {
-      Base::_check_template_params();
-      Base::_set_noalias(other);
-    }
-    /** \brief Copy constructor with in-place evaluation */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Matrix(const ReturnByValue<OtherDerived>& other)
-    {
-      Base::_check_template_params();
-      Base::resize(other.rows(), other.cols());
-      other.evalTo(*this);
-    }
-
-    /** \brief Copy constructor for generic expressions.
-      * \sa MatrixBase::operator=(const EigenBase<OtherDerived>&)
-      */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Matrix(const EigenBase<OtherDerived> &other)
-      : Base(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
-    {
-      Base::_check_template_params();
-      Base::_resize_to_match(other);
-      // FIXME/CHECK: isn't *this = other.derived() more efficient. it allows to
-      //              go for pure _set() implementations, right?
-      *this = other;
-    }
-
-    /** \internal
-      * \brief Override MatrixBase::swap() since for dynamic-sized matrices
-      * of same type it is enough to swap the data pointers.
-      */
-    template<typename OtherDerived>
-    void swap(MatrixBase<OtherDerived> const & other)
-    { this->_swap(other.derived()); }
-
-    inline Index innerStride() const { return 1; }
-    inline Index outerStride() const { return this->innerSize(); }
-
-    /////////// Geometry module ///////////
-
-    template<typename OtherDerived>
-    explicit Matrix(const RotationBase<OtherDerived,ColsAtCompileTime>& r);
-    template<typename OtherDerived>
-    Matrix& operator=(const RotationBase<OtherDerived,ColsAtCompileTime>& r);
-
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived>
-    explicit Matrix(const eigen2_RotationBase<OtherDerived,ColsAtCompileTime>& r);
-    template<typename OtherDerived>
-    Matrix& operator=(const eigen2_RotationBase<OtherDerived,ColsAtCompileTime>& r);
-    #endif
-
-    // allow to extend Matrix outside Eigen
-    #ifdef EIGEN_MATRIX_PLUGIN
-    #include EIGEN_MATRIX_PLUGIN
-    #endif
-
-  protected:
-    template <typename Derived, typename OtherDerived, bool IsVector>
-    friend struct internal::conservative_resize_like_impl;
-
-    using Base::m_storage;
+ protected:
+  template <typename Derived, typename OtherDerived, bool IsVector>
+  friend struct internal::conservative_resize_like_impl;
+
+  using Base::m_storage;
 };
 
 /** \defgroup matrixtypedefs Global matrix typedefs
-  *
-  * \ingroup Core_Module
-  *
-  * Eigen defines several typedef shortcuts for most common matrix and vector types.
-  *
-  * The general patterns are the following:
-  *
-  * \c MatrixSizeType where \c Size can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size,
-  * and where \c Type can be \c i for integer, \c f for float, \c d for double, \c cf for complex float, \c cd
-  * for complex double.
-  *
-  * For example, \c Matrix3d is a fixed-size 3x3 matrix type of doubles, and \c MatrixXf is a dynamic-size matrix of floats.
-  *
-  * There are also \c VectorSizeType and \c RowVectorSizeType which are self-explanatory. For example, \c Vector4cf is
-  * a fixed-size vector of 4 complex floats.
-  *
-  * \sa class Matrix
-  */
-
-#define EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Size, SizeSuffix)   \
-/** \ingroup matrixtypedefs */                                    \
-typedef Matrix<Type, Size, Size> Matrix##SizeSuffix##TypeSuffix;  \
-/** \ingroup matrixtypedefs */                                    \
-typedef Matrix<Type, Size, 1>    Vector##SizeSuffix##TypeSuffix;  \
-/** \ingroup matrixtypedefs */                                    \
-typedef Matrix<Type, 1, Size>    RowVector##SizeSuffix##TypeSuffix;
-
-#define EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, Size)         \
-/** \ingroup matrixtypedefs */                                    \
-typedef Matrix<Type, Size, Dynamic> Matrix##Size##X##TypeSuffix;  \
-/** \ingroup matrixtypedefs */                                    \
-typedef Matrix<Type, Dynamic, Size> Matrix##X##Size##TypeSuffix;
+ *
+ * \ingroup Core_Module
+ *
+ * %Eigen defines several typedef shortcuts for most common matrix and vector types.
+ *
+ * The general patterns are the following:
+ *
+ * \c MatrixSizeType where \c Size can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size,
+ * and where \c Type can be \c i for integer, \c f for float, \c d for double, \c cf for complex float, \c cd
+ * for complex double.
+ *
+ * For example, \c Matrix3d is a fixed-size 3x3 matrix type of doubles, and \c MatrixXf is a dynamic-size matrix of
+ * floats.
+ *
+ * There are also \c VectorSizeType and \c RowVectorSizeType which are self-explanatory. For example, \c Vector4cf is
+ * a fixed-size vector of 4 complex floats.
+ *
+ * With \cpp11, template alias are also defined for common sizes.
+ * They follow the same pattern as above except that the scalar type suffix is replaced by a
+ * template parameter, i.e.:
+ *   - `MatrixSize<Type>` where `Size` can be \c 2,\c 3,\c 4 for fixed size square matrices or \c X for dynamic size.
+ *   - `MatrixXSize<Type>` and `MatrixSizeX<Type>` where `Size` can be \c 2,\c 3,\c 4 for hybrid dynamic/fixed matrices.
+ *   - `VectorSize<Type>` and `RowVectorSize<Type>` for column and row vectors.
+ *
+ * With \cpp11, you can also use fully generic column and row vector types: `Vector<Type,Size>` and
+ * `RowVector<Type,Size>`.
+ *
+ * \sa class Matrix
+ */
+
+#define EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Size, SizeSuffix)    \
+  /** \ingroup matrixtypedefs */                                   \
+  /** \brief `Size`&times;`Size` matrix of type `Type`. */         \
+  typedef Matrix<Type, Size, Size> Matrix##SizeSuffix##TypeSuffix; \
+  /** \ingroup matrixtypedefs */                                   \
+  /** \brief `Size`&times;`1` vector of type `Type`. */            \
+  typedef Matrix<Type, Size, 1> Vector##SizeSuffix##TypeSuffix;    \
+  /** \ingroup matrixtypedefs */                                   \
+  /** \brief `1`&times;`Size` vector of type `Type`. */            \
+  typedef Matrix<Type, 1, Size> RowVector##SizeSuffix##TypeSuffix;
+
+#define EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, Size)          \
+  /** \ingroup matrixtypedefs */                                   \
+  /** \brief `Size`&times;`Dynamic` matrix of type `Type`. */      \
+  typedef Matrix<Type, Size, Dynamic> Matrix##Size##X##TypeSuffix; \
+  /** \ingroup matrixtypedefs */                                   \
+  /** \brief `Dynamic`&times;`Size` matrix of type `Type`. */      \
+  typedef Matrix<Type, Dynamic, Size> Matrix##X##Size##TypeSuffix;
 
 #define EIGEN_MAKE_TYPEDEFS_ALL_SIZES(Type, TypeSuffix) \
-EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 2, 2) \
-EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 3, 3) \
-EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 4, 4) \
-EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Dynamic, X) \
-EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, 2) \
-EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, 3) \
-EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, 4)
-
-EIGEN_MAKE_TYPEDEFS_ALL_SIZES(int,                  i)
-EIGEN_MAKE_TYPEDEFS_ALL_SIZES(float,                f)
-EIGEN_MAKE_TYPEDEFS_ALL_SIZES(double,               d)
-EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex<float>,  cf)
+  EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 2, 2)           \
+  EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 3, 3)           \
+  EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 4, 4)           \
+  EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Dynamic, X)     \
+  EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, 2)        \
+  EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, 3)        \
+  EIGEN_MAKE_FIXED_TYPEDEFS(Type, TypeSuffix, 4)
+
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(int, i)
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(float, f)
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(double, d)
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex<float>, cf)
 EIGEN_MAKE_TYPEDEFS_ALL_SIZES(std::complex<double>, cd)
 
 #undef EIGEN_MAKE_TYPEDEFS_ALL_SIZES
 #undef EIGEN_MAKE_TYPEDEFS
 #undef EIGEN_MAKE_FIXED_TYPEDEFS
 
-} // end namespace Eigen
+#define EIGEN_MAKE_TYPEDEFS(Size, SizeSuffix)                    \
+  /** \ingroup matrixtypedefs */                                 \
+  /** \brief \cpp11 `Size`&times;`Size` matrix of type `Type`.*/ \
+  template <typename Type>                                       \
+  using Matrix##SizeSuffix = Matrix<Type, Size, Size>;           \
+  /** \ingroup matrixtypedefs */                                 \
+  /** \brief \cpp11 `Size`&times;`1` vector of type `Type`.*/    \
+  template <typename Type>                                       \
+  using Vector##SizeSuffix = Matrix<Type, Size, 1>;              \
+  /** \ingroup matrixtypedefs */                                 \
+  /** \brief \cpp11 `1`&times;`Size` vector of type `Type`.*/    \
+  template <typename Type>                                       \
+  using RowVector##SizeSuffix = Matrix<Type, 1, Size>;
+
+#define EIGEN_MAKE_FIXED_TYPEDEFS(Size)                              \
+  /** \ingroup matrixtypedefs */                                     \
+  /** \brief \cpp11 `Size`&times;`Dynamic` matrix of type `Type` */  \
+  template <typename Type>                                           \
+  using Matrix##Size##X = Matrix<Type, Size, Dynamic>;               \
+  /** \ingroup matrixtypedefs */                                     \
+  /** \brief \cpp11 `Dynamic`&times;`Size` matrix of type `Type`. */ \
+  template <typename Type>                                           \
+  using Matrix##X##Size = Matrix<Type, Dynamic, Size>;
+
+EIGEN_MAKE_TYPEDEFS(2, 2)
+EIGEN_MAKE_TYPEDEFS(3, 3)
+EIGEN_MAKE_TYPEDEFS(4, 4)
+EIGEN_MAKE_TYPEDEFS(Dynamic, X)
+EIGEN_MAKE_FIXED_TYPEDEFS(2)
+EIGEN_MAKE_FIXED_TYPEDEFS(3)
+EIGEN_MAKE_FIXED_TYPEDEFS(4)
+
+/** \ingroup matrixtypedefs
+ * \brief \cpp11 `Size`&times;`1` vector of type `Type`. */
+template <typename Type, int Size>
+using Vector = Matrix<Type, Size, 1>;
+
+/** \ingroup matrixtypedefs
+ * \brief \cpp11 `1`&times;`Size` vector of type `Type`. */
+template <typename Type, int Size>
+using RowVector = Matrix<Type, 1, Size>;
+
+#undef EIGEN_MAKE_TYPEDEFS
+#undef EIGEN_MAKE_FIXED_TYPEDEFS
+
+}  // end namespace Eigen
 
-#endif // EIGEN_MATRIX_H
+#endif  // EIGEN_MATRIX_H
diff --git a/inst/include/Eigen/src/Core/MatrixBase.h b/inst/include/Eigen/src/Core/MatrixBase.h
index e83ef4dc..045993d4 100644
--- a/inst/include/Eigen/src/Core/MatrixBase.h
+++ b/inst/include/Eigen/src/Core/MatrixBase.h
@@ -11,6 +11,9 @@
 #ifndef EIGEN_MATRIXBASE_H
 #define EIGEN_MATRIXBASE_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
 /** \class MatrixBase
@@ -41,523 +44,502 @@ namespace Eigen {
   * \endcode
   *
   * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_MATRIXBASE_PLUGIN.
+  * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_MATRIXBASE_PLUGIN.
   *
-  * \sa \ref TopicClassHierarchy
+  * \sa \blank \ref TopicClassHierarchy
   */
-template<typename Derived> class MatrixBase
-  : public DenseBase<Derived>
-{
-  public:
+template <typename Derived>
+class MatrixBase : public DenseBase<Derived> {
+ public:
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-    typedef MatrixBase StorageBaseType;
-    typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
-    typedef typename internal::traits<Derived>::Scalar Scalar;
-    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-
-    typedef DenseBase<Derived> Base;
-    using Base::RowsAtCompileTime;
-    using Base::ColsAtCompileTime;
-    using Base::SizeAtCompileTime;
-    using Base::MaxRowsAtCompileTime;
-    using Base::MaxColsAtCompileTime;
-    using Base::MaxSizeAtCompileTime;
-    using Base::IsVectorAtCompileTime;
-    using Base::Flags;
-    using Base::CoeffReadCost;
-
-    using Base::derived;
-    using Base::const_cast_derived;
-    using Base::rows;
-    using Base::cols;
-    using Base::size;
-    using Base::coeff;
-    using Base::coeffRef;
-    using Base::lazyAssign;
-    using Base::eval;
-    using Base::operator+=;
-    using Base::operator-=;
-    using Base::operator*=;
-    using Base::operator/=;
-
-    typedef typename Base::CoeffReturnType CoeffReturnType;
-    typedef typename Base::ConstTransposeReturnType ConstTransposeReturnType;
-    typedef typename Base::RowXpr RowXpr;
-    typedef typename Base::ColXpr ColXpr;
-#endif // not EIGEN_PARSED_BY_DOXYGEN
+  typedef MatrixBase StorageBaseType;
+  typedef typename internal::traits<Derived>::StorageKind StorageKind;
+  typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
+  typedef typename internal::traits<Derived>::Scalar Scalar;
+  typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  typedef DenseBase<Derived> Base;
+  using Base::ColsAtCompileTime;
+  using Base::Flags;
+  using Base::IsVectorAtCompileTime;
+  using Base::MaxColsAtCompileTime;
+  using Base::MaxRowsAtCompileTime;
+  using Base::MaxSizeAtCompileTime;
+  using Base::RowsAtCompileTime;
+  using Base::SizeAtCompileTime;
+
+  using Base::coeff;
+  using Base::coeffRef;
+  using Base::cols;
+  using Base::const_cast_derived;
+  using Base::derived;
+  using Base::eval;
+  using Base::lazyAssign;
+  using Base::rows;
+  using Base::size;
+  using Base::operator-;
+  using Base::operator+=;
+  using Base::operator-=;
+  using Base::operator*=;
+  using Base::operator/=;
+
+  typedef typename Base::CoeffReturnType CoeffReturnType;
+  typedef typename Base::ConstTransposeReturnType ConstTransposeReturnType;
+  typedef typename Base::RowXpr RowXpr;
+  typedef typename Base::ColXpr ColXpr;
+#endif  // not EIGEN_PARSED_BY_DOXYGEN
 
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  /** type of the equivalent square matrix */
+  typedef Matrix<Scalar, internal::max_size_prefer_dynamic(RowsAtCompileTime, ColsAtCompileTime),
+                 internal::max_size_prefer_dynamic(RowsAtCompileTime, ColsAtCompileTime)>
+      SquareMatrixType;
+#endif  // not EIGEN_PARSED_BY_DOXYGEN
 
+  /** \returns the size of the main diagonal, which is min(rows(),cols()).
+   * \sa rows(), cols(), SizeAtCompileTime. */
+  EIGEN_DEVICE_FUNC inline Index diagonalSize() const { return (numext::mini)(rows(), cols()); }
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** type of the equivalent square matrix */
-    typedef Matrix<Scalar,EIGEN_SIZE_MAX(RowsAtCompileTime,ColsAtCompileTime),
-                          EIGEN_SIZE_MAX(RowsAtCompileTime,ColsAtCompileTime)> SquareMatrixType;
-#endif // not EIGEN_PARSED_BY_DOXYGEN
-
-    /** \returns the size of the main diagonal, which is min(rows(),cols()).
-      * \sa rows(), cols(), SizeAtCompileTime. */
-    inline Index diagonalSize() const { return (std::min)(rows(),cols()); }
-
-    /** \brief The plain matrix type corresponding to this expression.
-      *
-      * This is not necessarily exactly the return type of eval(). In the case of plain matrices,
-      * the return type of eval() is a const reference to a matrix, not a matrix! It is however guaranteed
-      * that the return type of eval() is either PlainObject or const PlainObject&.
-      */
-    typedef Matrix<typename internal::traits<Derived>::Scalar,
-                internal::traits<Derived>::RowsAtCompileTime,
-                internal::traits<Derived>::ColsAtCompileTime,
-                AutoAlign | (internal::traits<Derived>::Flags&RowMajorBit ? RowMajor : ColMajor),
-                internal::traits<Derived>::MaxRowsAtCompileTime,
-                internal::traits<Derived>::MaxColsAtCompileTime
-          > PlainObject;
+  typedef typename Base::PlainObject PlainObject;
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** \internal Represents a matrix with all coefficients equal to one another*/
-    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Derived> ConstantReturnType;
-    /** \internal the return type of MatrixBase::adjoint() */
-    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
-                        CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, ConstTransposeReturnType>,
-                        ConstTransposeReturnType
-                     >::type AdjointReturnType;
-    /** \internal Return type of eigenvalues() */
-    typedef Matrix<std::complex<RealScalar>, internal::traits<Derived>::ColsAtCompileTime, 1, ColMajor> EigenvaluesReturnType;
-    /** \internal the return type of identity */
-    typedef CwiseNullaryOp<internal::scalar_identity_op<Scalar>,Derived> IdentityReturnType;
-    /** \internal the return type of unit vectors */
-    typedef Block<const CwiseNullaryOp<internal::scalar_identity_op<Scalar>, SquareMatrixType>,
-                  internal::traits<Derived>::RowsAtCompileTime,
-                  internal::traits<Derived>::ColsAtCompileTime> BasisReturnType;
-#endif // not EIGEN_PARSED_BY_DOXYGEN
+  /** \internal Represents a matrix with all coefficients equal to one another*/
+  typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> ConstantReturnType;
+  /** \internal the return type of MatrixBase::adjoint() */
+  typedef std::conditional_t<NumTraits<Scalar>::IsComplex,
+                             CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, ConstTransposeReturnType>,
+                             ConstTransposeReturnType>
+      AdjointReturnType;
+  /** \internal Return type of eigenvalues() */
+  typedef Matrix<internal::make_complex_t<Scalar>, internal::traits<Derived>::ColsAtCompileTime, 1, ColMajor>
+      EigenvaluesReturnType;
+  /** \internal the return type of identity */
+  typedef CwiseNullaryOp<internal::scalar_identity_op<Scalar>, PlainObject> IdentityReturnType;
+  /** \internal the return type of unit vectors */
+  typedef Block<const CwiseNullaryOp<internal::scalar_identity_op<Scalar>, SquareMatrixType>,
+                internal::traits<Derived>::RowsAtCompileTime, internal::traits<Derived>::ColsAtCompileTime>
+      BasisReturnType;
+#endif  // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::MatrixBase
-#   include "../plugins/CommonCwiseUnaryOps.h"
-#   include "../plugins/CommonCwiseBinaryOps.h"
-#   include "../plugins/MatrixCwiseUnaryOps.h"
-#   include "../plugins/MatrixCwiseBinaryOps.h"
-#   ifdef EIGEN_MATRIXBASE_PLUGIN
-#     include EIGEN_MATRIXBASE_PLUGIN
-#   endif
-#undef EIGEN_CURRENT_STORAGE_BASE_CLASS
-
-    /** Special case of the template operator=, in order to prevent the compiler
-      * from generating a default operator= (issue hit with g++ 4.1)
-      */
-    Derived& operator=(const MatrixBase& other);
-
-    // We cannot inherit here via Base::operator= since it is causing
-    // trouble with MSVC.
-
-    template <typename OtherDerived>
-    Derived& operator=(const DenseBase<OtherDerived>& other);
-
-    template <typename OtherDerived>
-    Derived& operator=(const EigenBase<OtherDerived>& other);
-
-    template<typename OtherDerived>
-    Derived& operator=(const ReturnByValue<OtherDerived>& other);
-
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    Derived& lazyAssign(const ProductBase<ProductDerived, Lhs,Rhs>& other);
-
-    template<typename MatrixPower, typename Lhs, typename Rhs>
-    Derived& lazyAssign(const MatrixPowerProduct<MatrixPower, Lhs,Rhs>& other);
-
-    template<typename OtherDerived>
-    Derived& operator+=(const MatrixBase<OtherDerived>& other);
-    template<typename OtherDerived>
-    Derived& operator-=(const MatrixBase<OtherDerived>& other);
-
-    template<typename OtherDerived>
-    const typename ProductReturnType<Derived,OtherDerived>::Type
-    operator*(const MatrixBase<OtherDerived> &other) const;
-
-    template<typename OtherDerived>
-    const typename LazyProductReturnType<Derived,OtherDerived>::Type
-    lazyProduct(const MatrixBase<OtherDerived> &other) const;
-
-    template<typename OtherDerived>
-    Derived& operator*=(const EigenBase<OtherDerived>& other);
-
-    template<typename OtherDerived>
-    void applyOnTheLeft(const EigenBase<OtherDerived>& other);
-
-    template<typename OtherDerived>
-    void applyOnTheRight(const EigenBase<OtherDerived>& other);
-
-    template<typename DiagonalDerived>
-    const DiagonalProduct<Derived, DiagonalDerived, OnTheRight>
-    operator*(const DiagonalBase<DiagonalDerived> &diagonal) const;
-
-    template<typename OtherDerived>
-    typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType
-    dot(const MatrixBase<OtherDerived>& other) const;
-
-    #ifdef EIGEN2_SUPPORT
-      template<typename OtherDerived>
-      Scalar eigen2_dot(const MatrixBase<OtherDerived>& other) const;
-    #endif
-
-    RealScalar squaredNorm() const;
-    RealScalar norm() const;
-    RealScalar stableNorm() const;
-    RealScalar blueNorm() const;
-    RealScalar hypotNorm() const;
-    const PlainObject normalized() const;
-    void normalize();
-
-    const AdjointReturnType adjoint() const;
-    void adjointInPlace();
-
-    typedef Diagonal<Derived> DiagonalReturnType;
-    DiagonalReturnType diagonal();
-    typedef typename internal::add_const<Diagonal<const Derived> >::type ConstDiagonalReturnType;
-    ConstDiagonalReturnType diagonal() const;
-
-    template<int Index> struct DiagonalIndexReturnType { typedef Diagonal<Derived,Index> Type; };
-    template<int Index> struct ConstDiagonalIndexReturnType { typedef const Diagonal<const Derived,Index> Type; };
-
-    template<int Index> typename DiagonalIndexReturnType<Index>::Type diagonal();
-    template<int Index> typename ConstDiagonalIndexReturnType<Index>::Type diagonal() const;
-    
-    typedef Diagonal<Derived,DynamicIndex> DiagonalDynamicIndexReturnType;
-    typedef typename internal::add_const<Diagonal<const Derived,DynamicIndex> >::type ConstDiagonalDynamicIndexReturnType;
-
-    DiagonalDynamicIndexReturnType diagonal(Index index);
-    ConstDiagonalDynamicIndexReturnType diagonal(Index index) const;
-
-    #ifdef EIGEN2_SUPPORT
-    template<unsigned int Mode> typename internal::eigen2_part_return_type<Derived, Mode>::type part();
-    template<unsigned int Mode> const typename internal::eigen2_part_return_type<Derived, Mode>::type part() const;
-    
-    // huuuge hack. make Eigen2's matrix.part<Diagonal>() work in eigen3. Problem: Diagonal is now a class template instead
-    // of an integer constant. Solution: overload the part() method template wrt template parameters list.
-    template<template<typename T, int N> class U>
-    const DiagonalWrapper<ConstDiagonalReturnType> part() const
-    { return diagonal().asDiagonal(); }
-    #endif // EIGEN2_SUPPORT
-
-    template<unsigned int Mode> struct TriangularViewReturnType { typedef TriangularView<Derived, Mode> Type; };
-    template<unsigned int Mode> struct ConstTriangularViewReturnType { typedef const TriangularView<const Derived, Mode> Type; };
-
-    template<unsigned int Mode> typename TriangularViewReturnType<Mode>::Type triangularView();
-    template<unsigned int Mode> typename ConstTriangularViewReturnType<Mode>::Type triangularView() const;
-
-    template<unsigned int UpLo> struct SelfAdjointViewReturnType { typedef SelfAdjointView<Derived, UpLo> Type; };
-    template<unsigned int UpLo> struct ConstSelfAdjointViewReturnType { typedef const SelfAdjointView<const Derived, UpLo> Type; };
-
-    template<unsigned int UpLo> typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
-    template<unsigned int UpLo> typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;
-
-    const SparseView<Derived> sparseView(const Scalar& m_reference = Scalar(0),
-                                         const typename NumTraits<Scalar>::Real& m_epsilon = NumTraits<Scalar>::dummy_precision()) const;
-    static const IdentityReturnType Identity();
-    static const IdentityReturnType Identity(Index rows, Index cols);
-    static const BasisReturnType Unit(Index size, Index i);
-    static const BasisReturnType Unit(Index i);
-    static const BasisReturnType UnitX();
-    static const BasisReturnType UnitY();
-    static const BasisReturnType UnitZ();
-    static const BasisReturnType UnitW();
-
-    const DiagonalWrapper<const Derived> asDiagonal() const;
-    const PermutationWrapper<const Derived> asPermutation() const;
-
-    Derived& setIdentity();
-    Derived& setIdentity(Index rows, Index cols);
-
-    bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-
-    bool isUpperTriangular(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    bool isLowerTriangular(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-
-    template<typename OtherDerived>
-    bool isOrthogonal(const MatrixBase<OtherDerived>& other,
-                      const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-    bool isUnitary(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
-
-    /** \returns true if each coefficients of \c *this and \a other are all exactly equal.
-      * \warning When using floating point scalar values you probably should rather use a
-      *          fuzzy comparison such as isApprox()
-      * \sa isApprox(), operator!= */
-    template<typename OtherDerived>
-    inline bool operator==(const MatrixBase<OtherDerived>& other) const
-    { return cwiseEqual(other).all(); }
-
-    /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.
-      * \warning When using floating point scalar values you probably should rather use a
-      *          fuzzy comparison such as isApprox()
-      * \sa isApprox(), operator== */
-    template<typename OtherDerived>
-    inline bool operator!=(const MatrixBase<OtherDerived>& other) const
-    { return cwiseNotEqual(other).any(); }
-
-    NoAlias<Derived,Eigen::MatrixBase > noalias();
-
-    inline const ForceAlignedAccess<Derived> forceAlignedAccess() const;
-    inline ForceAlignedAccess<Derived> forceAlignedAccess();
-    template<bool Enable> inline typename internal::add_const_on_value_type<typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type>::type forceAlignedAccessIf() const;
-    template<bool Enable> inline typename internal::conditional<Enable,ForceAlignedAccess<Derived>,Derived&>::type forceAlignedAccessIf();
-
-    Scalar trace() const;
-
-/////////// Array module ///////////
-
-    template<int p> RealScalar lpNorm() const;
-
-    MatrixBase<Derived>& matrix() { return *this; }
-    const MatrixBase<Derived>& matrix() const { return *this; }
-
-    /** \returns an \link Eigen::ArrayBase Array \endlink expression of this matrix
-      * \sa ArrayBase::matrix() */
-    ArrayWrapper<Derived> array() { return derived(); }
-    const ArrayWrapper<const Derived> array() const { return derived(); }
-
-/////////// LU module ///////////
-
-    const FullPivLU<PlainObject> fullPivLu() const;
-    const PartialPivLU<PlainObject> partialPivLu() const;
-
-    #if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
-    const LU<PlainObject> lu() const;
-    #endif
-
-    #ifdef EIGEN2_SUPPORT
-    const LU<PlainObject> eigen2_lu() const;
-    #endif
-
-    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
-    const PartialPivLU<PlainObject> lu() const;
-    #endif
-    
-    #ifdef EIGEN2_SUPPORT
-    template<typename ResultType>
-    void computeInverse(MatrixBase<ResultType> *result) const {
-      *result = this->inverse();
-    }
-    #endif
-
-    const internal::inverse_impl<Derived> inverse() const;
-    template<typename ResultType>
-    void computeInverseAndDetWithCheck(
-      ResultType& inverse,
-      typename ResultType::Scalar& determinant,
-      bool& invertible,
-      const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
-    ) const;
-    template<typename ResultType>
-    void computeInverseWithCheck(
-      ResultType& inverse,
-      bool& invertible,
-      const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()
-    ) const;
-    Scalar determinant() const;
-
-/////////// Cholesky module ///////////
-
-    const LLT<PlainObject>  llt() const;
-    const LDLT<PlainObject> ldlt() const;
-
-/////////// QR module ///////////
-
-    const HouseholderQR<PlainObject> householderQr() const;
-    const ColPivHouseholderQR<PlainObject> colPivHouseholderQr() const;
-    const FullPivHouseholderQR<PlainObject> fullPivHouseholderQr() const;
-    
-    #ifdef EIGEN2_SUPPORT
-    const QR<PlainObject> qr() const;
-    #endif
-
-    EigenvaluesReturnType eigenvalues() const;
-    RealScalar operatorNorm() const;
-
-/////////// SVD module ///////////
-
-    JacobiSVD<PlainObject> jacobiSvd(unsigned int computationOptions = 0) const;
-
-    #ifdef EIGEN2_SUPPORT
-    SVD<PlainObject> svd() const;
-    #endif
-
-/////////// Geometry module ///////////
-
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /// \internal helper struct to form the return type of the cross product
-    template<typename OtherDerived> struct cross_product_return_type {
-      typedef typename internal::scalar_product_traits<typename internal::traits<Derived>::Scalar,typename internal::traits<OtherDerived>::Scalar>::ReturnType Scalar;
-      typedef Matrix<Scalar,MatrixBase::RowsAtCompileTime,MatrixBase::ColsAtCompileTime> type;
-    };
-    #endif // EIGEN_PARSED_BY_DOXYGEN
-    template<typename OtherDerived>
-    typename cross_product_return_type<OtherDerived>::type
-    cross(const MatrixBase<OtherDerived>& other) const;
-    template<typename OtherDerived>
-    PlainObject cross3(const MatrixBase<OtherDerived>& other) const;
-    PlainObject unitOrthogonal(void) const;
-    Matrix<Scalar,3,1> eulerAngles(Index a0, Index a1, Index a2) const;
-    
-    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
-    ScalarMultipleReturnType operator*(const UniformScaling<Scalar>& s) const;
-    // put this as separate enum value to work around possible GCC 4.3 bug (?)
-    enum { HomogeneousReturnTypeDirection = ColsAtCompileTime==1?Vertical:Horizontal };
-    typedef Homogeneous<Derived, HomogeneousReturnTypeDirection> HomogeneousReturnType;
-    HomogeneousReturnType homogeneous() const;
-    #endif
-    
-    enum {
-      SizeMinusOne = SizeAtCompileTime==Dynamic ? Dynamic : SizeAtCompileTime-1
-    };
-    typedef Block<const Derived,
-                  internal::traits<Derived>::ColsAtCompileTime==1 ? SizeMinusOne : 1,
-                  internal::traits<Derived>::ColsAtCompileTime==1 ? 1 : SizeMinusOne> ConstStartMinusOne;
-    typedef CwiseUnaryOp<internal::scalar_quotient1_op<typename internal::traits<Derived>::Scalar>,
-                const ConstStartMinusOne > HNormalizedReturnType;
-
-    const HNormalizedReturnType hnormalized() const;
-
-////////// Householder module ///////////
-
-    void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
-    template<typename EssentialPart>
-    void makeHouseholder(EssentialPart& essential,
-                         Scalar& tau, RealScalar& beta) const;
-    template<typename EssentialPart>
-    void applyHouseholderOnTheLeft(const EssentialPart& essential,
-                                   const Scalar& tau,
-                                   Scalar* workspace);
-    template<typename EssentialPart>
-    void applyHouseholderOnTheRight(const EssentialPart& essential,
-                                    const Scalar& tau,
-                                    Scalar* workspace);
-
-///////// Jacobi module /////////
-
-    template<typename OtherScalar>
-    void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j);
-    template<typename OtherScalar>
-    void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
-
-///////// SparseCore module /////////
-
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE const typename SparseMatrixBase<OtherDerived>::template CwiseProductDenseReturnType<Derived>::Type
-    cwiseProduct(const SparseMatrixBase<OtherDerived> &other) const
-    {
-      return other.cwiseProduct(derived());
-    }
-
-///////// MatrixFunctions module /////////
-
-    typedef typename internal::stem_function<Scalar>::type StemFunction;
-    const MatrixExponentialReturnValue<Derived> exp() const;
-    const MatrixFunctionReturnValue<Derived> matrixFunction(StemFunction f) const;
-    const MatrixFunctionReturnValue<Derived> cosh() const;
-    const MatrixFunctionReturnValue<Derived> sinh() const;
-    const MatrixFunctionReturnValue<Derived> cos() const;
-    const MatrixFunctionReturnValue<Derived> sin() const;
-    const MatrixSquareRootReturnValue<Derived> sqrt() const;
-    const MatrixLogarithmReturnValue<Derived> log() const;
-    const MatrixPowerReturnValue<Derived> pow(const RealScalar& p) const;
-
-#ifdef EIGEN2_SUPPORT
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    Derived& operator+=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
-                                      EvalBeforeAssigningBit>& other);
-
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    Derived& operator-=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
-                                      EvalBeforeAssigningBit>& other);
-
-    /** \deprecated because .lazy() is deprecated
-      * Overloaded for cache friendly product evaluation */
-    template<typename OtherDerived>
-    Derived& lazyAssign(const Flagged<OtherDerived, 0, EvalBeforeAssigningBit>& other)
-    { return lazyAssign(other._expression()); }
-
-    template<unsigned int Added>
-    const Flagged<Derived, Added, 0> marked() const;
-    const Flagged<Derived, 0, EvalBeforeAssigningBit> lazy() const;
-
-    inline const Cwise<Derived> cwise() const;
-    inline Cwise<Derived> cwise();
-
-    VectorBlock<Derived> start(Index size);
-    const VectorBlock<const Derived> start(Index size) const;
-    VectorBlock<Derived> end(Index size);
-    const VectorBlock<const Derived> end(Index size) const;
-    template<int Size> VectorBlock<Derived,Size> start();
-    template<int Size> const VectorBlock<const Derived,Size> start() const;
-    template<int Size> VectorBlock<Derived,Size> end();
-    template<int Size> const VectorBlock<const Derived,Size> end() const;
-
-    Minor<Derived> minor(Index row, Index col);
-    const Minor<Derived> minor(Index row, Index col) const;
+#define EIGEN_DOC_UNARY_ADDONS(X, Y)
+#include "../plugins/CommonCwiseBinaryOps.inc"
+#include "../plugins/MatrixCwiseUnaryOps.inc"
+#include "../plugins/MatrixCwiseBinaryOps.inc"
+#ifdef EIGEN_MATRIXBASE_PLUGIN
+#include EIGEN_MATRIXBASE_PLUGIN
 #endif
-
-  protected:
-    MatrixBase() : Base() {}
-
-  private:
-    explicit MatrixBase(int);
-    MatrixBase(int,int);
-    template<typename OtherDerived> explicit MatrixBase(const MatrixBase<OtherDerived>&);
-  protected:
-    // mixing arrays and matrices is not legal
-    template<typename OtherDerived> Derived& operator+=(const ArrayBase<OtherDerived>& )
-    {EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar))==-1,YOU_CANNOT_MIX_ARRAYS_AND_MATRICES); return *this;}
-    // mixing arrays and matrices is not legal
-    template<typename OtherDerived> Derived& operator-=(const ArrayBase<OtherDerived>& )
-    {EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar))==-1,YOU_CANNOT_MIX_ARRAYS_AND_MATRICES); return *this;}
+#undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#undef EIGEN_DOC_UNARY_ADDONS
+
+  /** Special case of the template operator=, in order to prevent the compiler
+   * from generating a default operator= (issue hit with g++ 4.1)
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const MatrixBase& other);
+
+  // We cannot inherit here via Base::operator= since it is causing
+  // trouble with MSVC.
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other);
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC Derived& operator=(const EigenBase<OtherDerived>& other);
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC Derived& operator=(const ReturnByValue<OtherDerived>& other);
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator+=(const MatrixBase<OtherDerived>& other);
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const MatrixBase<OtherDerived>& other);
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC const Product<Derived, OtherDerived> operator*(const MatrixBase<OtherDerived>& other) const;
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC const Product<Derived, OtherDerived, LazyProduct> lazyProduct(
+      const MatrixBase<OtherDerived>& other) const;
+
+  template <typename OtherDerived>
+  Derived& operator*=(const EigenBase<OtherDerived>& other);
+
+  template <typename OtherDerived>
+  void applyOnTheLeft(const EigenBase<OtherDerived>& other);
+
+  template <typename OtherDerived>
+  void applyOnTheRight(const EigenBase<OtherDerived>& other);
+
+  template <typename DiagonalDerived>
+  EIGEN_DEVICE_FUNC const Product<Derived, DiagonalDerived, LazyProduct> operator*(
+      const DiagonalBase<DiagonalDerived>& diagonal) const;
+
+  template <typename SkewDerived>
+  EIGEN_DEVICE_FUNC const Product<Derived, SkewDerived, LazyProduct> operator*(
+      const SkewSymmetricBase<SkewDerived>& skew) const;
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,
+                                                  typename internal::traits<OtherDerived>::Scalar>::ReturnType
+  dot(const MatrixBase<OtherDerived>& other) const;
+
+  EIGEN_DEVICE_FUNC RealScalar squaredNorm() const;
+  EIGEN_DEVICE_FUNC RealScalar norm() const;
+  RealScalar stableNorm() const;
+  RealScalar blueNorm() const;
+  RealScalar hypotNorm() const;
+  EIGEN_DEVICE_FUNC const PlainObject normalized() const;
+  EIGEN_DEVICE_FUNC const PlainObject stableNormalized() const;
+  EIGEN_DEVICE_FUNC void normalize();
+  EIGEN_DEVICE_FUNC void stableNormalize();
+
+  EIGEN_DEVICE_FUNC const AdjointReturnType adjoint() const;
+  EIGEN_DEVICE_FUNC void adjointInPlace();
+
+  typedef Diagonal<Derived> DiagonalReturnType;
+  EIGEN_DEVICE_FUNC DiagonalReturnType diagonal();
+
+  typedef Diagonal<const Derived> ConstDiagonalReturnType;
+  EIGEN_DEVICE_FUNC const ConstDiagonalReturnType diagonal() const;
+
+  template <int Index>
+  EIGEN_DEVICE_FUNC Diagonal<Derived, Index> diagonal();
+
+  template <int Index>
+  EIGEN_DEVICE_FUNC const Diagonal<const Derived, Index> diagonal() const;
+
+  EIGEN_DEVICE_FUNC Diagonal<Derived, DynamicIndex> diagonal(Index index);
+  EIGEN_DEVICE_FUNC const Diagonal<const Derived, DynamicIndex> diagonal(Index index) const;
+
+  template <unsigned int Mode>
+  struct TriangularViewReturnType {
+    typedef TriangularView<Derived, Mode> Type;
+  };
+  template <unsigned int Mode>
+  struct ConstTriangularViewReturnType {
+    typedef const TriangularView<const Derived, Mode> Type;
+  };
+
+  template <unsigned int Mode>
+  EIGEN_DEVICE_FUNC typename TriangularViewReturnType<Mode>::Type triangularView();
+  template <unsigned int Mode>
+  EIGEN_DEVICE_FUNC typename ConstTriangularViewReturnType<Mode>::Type triangularView() const;
+
+  template <unsigned int UpLo>
+  struct SelfAdjointViewReturnType {
+    typedef SelfAdjointView<Derived, UpLo> Type;
+  };
+  template <unsigned int UpLo>
+  struct ConstSelfAdjointViewReturnType {
+    typedef const SelfAdjointView<const Derived, UpLo> Type;
+  };
+
+  template <unsigned int UpLo>
+  EIGEN_DEVICE_FUNC typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
+  template <unsigned int UpLo>
+  EIGEN_DEVICE_FUNC typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;
+
+  const SparseView<Derived> sparseView(
+      const Scalar& m_reference = Scalar(0),
+      const typename NumTraits<Scalar>::Real& m_epsilon = NumTraits<Scalar>::dummy_precision()) const;
+  EIGEN_DEVICE_FUNC static const IdentityReturnType Identity();
+  EIGEN_DEVICE_FUNC static const IdentityReturnType Identity(Index rows, Index cols);
+  EIGEN_DEVICE_FUNC static const BasisReturnType Unit(Index size, Index i);
+  EIGEN_DEVICE_FUNC static const BasisReturnType Unit(Index i);
+  EIGEN_DEVICE_FUNC static const BasisReturnType UnitX();
+  EIGEN_DEVICE_FUNC static const BasisReturnType UnitY();
+  EIGEN_DEVICE_FUNC static const BasisReturnType UnitZ();
+  EIGEN_DEVICE_FUNC static const BasisReturnType UnitW();
+
+  EIGEN_DEVICE_FUNC const DiagonalWrapper<const Derived> asDiagonal() const;
+  const PermutationWrapper<const Derived> asPermutation() const;
+  EIGEN_DEVICE_FUNC const SkewSymmetricWrapper<const Derived> asSkewSymmetric() const;
+
+  EIGEN_DEVICE_FUNC Derived& setIdentity();
+  EIGEN_DEVICE_FUNC Derived& setIdentity(Index rows, Index cols);
+  EIGEN_DEVICE_FUNC Derived& setUnit(Index i);
+  EIGEN_DEVICE_FUNC Derived& setUnit(Index newSize, Index i);
+
+  bool isIdentity(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+  bool isDiagonal(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+
+  bool isUpperTriangular(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+  bool isLowerTriangular(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+
+  bool isSkewSymmetric(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+
+  template <typename OtherDerived>
+  bool isOrthogonal(const MatrixBase<OtherDerived>& other,
+                    const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+  bool isUnitary(const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+
+  /** \returns true if each coefficients of \c *this and \a other are all exactly equal.
+   * \warning When using floating point scalar values you probably should rather use a
+   *          fuzzy comparison such as isApprox()
+   * \sa isApprox(), operator!= */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase<OtherDerived>& other) const {
+    return (this->rows() == other.rows()) && (this->cols() == other.cols()) && cwiseEqual(other).all();
+  }
+
+  /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.
+   * \warning When using floating point scalar values you probably should rather use a
+   *          fuzzy comparison such as isApprox()
+   * \sa isApprox(), operator== */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase<OtherDerived>& other) const {
+    return !(*this == other);
+  }
+
+  NoAlias<Derived, Eigen::MatrixBase> EIGEN_DEVICE_FUNC noalias();
+
+  // TODO forceAlignedAccess is temporarily disabled
+  // Need to find a nicer workaround.
+  inline const Derived& forceAlignedAccess() const { return derived(); }
+  inline Derived& forceAlignedAccess() { return derived(); }
+  template <bool Enable>
+  inline const Derived& forceAlignedAccessIf() const {
+    return derived();
+  }
+  template <bool Enable>
+  inline Derived& forceAlignedAccessIf() {
+    return derived();
+  }
+
+  EIGEN_DEVICE_FUNC Scalar trace() const;
+
+  template <int p>
+  EIGEN_DEVICE_FUNC RealScalar lpNorm() const;
+
+  EIGEN_DEVICE_FUNC MatrixBase<Derived>& matrix() { return *this; }
+  EIGEN_DEVICE_FUNC const MatrixBase<Derived>& matrix() const { return *this; }
+
+  /** \returns an \link Eigen::ArrayBase Array \endlink expression of this matrix
+   * \sa ArrayBase::matrix() */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ArrayWrapper<Derived> array() { return ArrayWrapper<Derived>(derived()); }
+  /** \returns a const \link Eigen::ArrayBase Array \endlink expression of this matrix
+   * \sa ArrayBase::matrix() */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ArrayWrapper<const Derived> array() const {
+    return ArrayWrapper<const Derived>(derived());
+  }
+
+  /////////// LU module ///////////
+
+  template <typename PermutationIndex = DefaultPermutationIndex>
+  inline const FullPivLU<PlainObject, PermutationIndex> fullPivLu() const;
+  template <typename PermutationIndex = DefaultPermutationIndex>
+  inline const PartialPivLU<PlainObject, PermutationIndex> partialPivLu() const;
+
+  template <typename PermutationIndex = DefaultPermutationIndex>
+  inline const PartialPivLU<PlainObject, PermutationIndex> lu() const;
+
+  EIGEN_DEVICE_FUNC inline const Inverse<Derived> inverse() const;
+
+  template <typename ResultType>
+  inline void computeInverseAndDetWithCheck(
+      ResultType& inverse, typename ResultType::Scalar& determinant, bool& invertible,
+      const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()) const;
+
+  template <typename ResultType>
+  inline void computeInverseWithCheck(
+      ResultType& inverse, bool& invertible,
+      const RealScalar& absDeterminantThreshold = NumTraits<Scalar>::dummy_precision()) const;
+
+  EIGEN_DEVICE_FUNC Scalar determinant() const;
+
+  /////////// Cholesky module ///////////
+
+  inline const LLT<PlainObject> llt() const;
+  inline const LDLT<PlainObject> ldlt() const;
+
+  /////////// QR module ///////////
+
+  inline const HouseholderQR<PlainObject> householderQr() const;
+  template <typename PermutationIndex = DefaultPermutationIndex>
+  inline const ColPivHouseholderQR<PlainObject, PermutationIndex> colPivHouseholderQr() const;
+  template <typename PermutationIndex = DefaultPermutationIndex>
+  inline const FullPivHouseholderQR<PlainObject, PermutationIndex> fullPivHouseholderQr() const;
+  template <typename PermutationIndex = DefaultPermutationIndex>
+  inline const CompleteOrthogonalDecomposition<PlainObject, PermutationIndex> completeOrthogonalDecomposition() const;
+
+  /////////// Eigenvalues module ///////////
+
+  inline EigenvaluesReturnType eigenvalues() const;
+  inline RealScalar operatorNorm() const;
+
+  /////////// SVD module ///////////
+
+  template <int Options = 0>
+  inline JacobiSVD<PlainObject, Options> jacobiSvd() const;
+  template <int Options = 0>
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using method's template parameter.")
+  inline JacobiSVD<PlainObject, Options> jacobiSvd(unsigned int computationOptions) const;
+
+  template <int Options = 0>
+  inline BDCSVD<PlainObject, Options> bdcSvd() const;
+  template <int Options = 0>
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using method's template parameter.")
+  inline BDCSVD<PlainObject, Options> bdcSvd(unsigned int computationOptions) const;
+
+  /////////// Geometry module ///////////
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC inline typename internal::cross_impl<Derived, OtherDerived>::return_type cross(
+      const MatrixBase<OtherDerived>& other) const;
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC inline PlainObject cross3(const MatrixBase<OtherDerived>& other) const;
+
+  EIGEN_DEVICE_FUNC inline PlainObject unitOrthogonal(void) const;
+
+  EIGEN_DEPRECATED_WITH_REASON("Use .canonicalEulerAngles() instead.")
+  EIGEN_DEVICE_FUNC inline Matrix<Scalar, 3, 1> eulerAngles(Index a0, Index a1, Index a2) const;
+
+  EIGEN_DEVICE_FUNC inline Matrix<Scalar, 3, 1> canonicalEulerAngles(Index a0, Index a1, Index a2) const;
+
+  // put this as separate enum value to work around possible GCC 4.3 bug (?)
+  enum {
+    HomogeneousReturnTypeDirection =
+        ColsAtCompileTime == 1 && RowsAtCompileTime == 1
+            ? ((internal::traits<Derived>::Flags & RowMajorBit) == RowMajorBit ? Horizontal : Vertical)
+        : ColsAtCompileTime == 1 ? Vertical
+                                 : Horizontal
+  };
+  typedef Homogeneous<Derived, HomogeneousReturnTypeDirection> HomogeneousReturnType;
+  EIGEN_DEVICE_FUNC inline HomogeneousReturnType homogeneous() const;
+
+  enum { SizeMinusOne = SizeAtCompileTime == Dynamic ? Dynamic : SizeAtCompileTime - 1 };
+  typedef Block<const Derived, internal::traits<Derived>::ColsAtCompileTime == 1 ? SizeMinusOne : 1,
+                internal::traits<Derived>::ColsAtCompileTime == 1 ? 1 : SizeMinusOne>
+      ConstStartMinusOne;
+  typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(ConstStartMinusOne, Scalar, quotient) HNormalizedReturnType;
+  EIGEN_DEVICE_FUNC inline const HNormalizedReturnType hnormalized() const;
+
+  ////////// Householder module ///////////
+
+  EIGEN_DEVICE_FUNC void makeHouseholderInPlace(Scalar& tau, RealScalar& beta);
+  template <typename EssentialPart>
+  EIGEN_DEVICE_FUNC void makeHouseholder(EssentialPart& essential, Scalar& tau, RealScalar& beta) const;
+  template <typename EssentialPart>
+  EIGEN_DEVICE_FUNC void applyHouseholderOnTheLeft(const EssentialPart& essential, const Scalar& tau,
+                                                   Scalar* workspace);
+  template <typename EssentialPart>
+  EIGEN_DEVICE_FUNC void applyHouseholderOnTheRight(const EssentialPart& essential, const Scalar& tau,
+                                                    Scalar* workspace);
+
+  ///////// Jacobi module /////////
+
+  template <typename OtherScalar>
+  EIGEN_DEVICE_FUNC void applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j);
+  template <typename OtherScalar>
+  EIGEN_DEVICE_FUNC void applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j);
+
+  ///////// SparseCore module /////////
+
+  template <typename OtherDerived>
+  EIGEN_STRONG_INLINE const typename SparseMatrixBase<OtherDerived>::template CwiseProductDenseReturnType<Derived>::Type
+  cwiseProduct(const SparseMatrixBase<OtherDerived>& other) const {
+    return other.cwiseProduct(derived());
+  }
+
+  ///////// MatrixFunctions module /////////
+
+  typedef typename internal::stem_function<Scalar>::type StemFunction;
+#define EIGEN_MATRIX_FUNCTION(ReturnType, Name, Description)                                                        \
+  /** \returns an expression of the matrix Description of \c *this. \brief This function requires the <a            \
+   * href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>. To compute the \
+   * coefficient-wise Description use ArrayBase::##Name . */                                                        \
+  const ReturnType<Derived> Name() const;
+#define EIGEN_MATRIX_FUNCTION_1(ReturnType, Name, Description, Argument)                                            \
+  /** \returns an expression of the matrix Description of \c *this. \brief This function requires the <a            \
+   * href="unsupported/group__MatrixFunctions__Module.html"> unsupported MatrixFunctions module</a>. To compute the \
+   * coefficient-wise Description use ArrayBase::##Name . */                                                        \
+  const ReturnType<Derived> Name(Argument) const;
+
+  EIGEN_MATRIX_FUNCTION(MatrixExponentialReturnValue, exp, exponential)
+  /** \brief Helper function for the <a href="unsupported/group__MatrixFunctions__Module.html"> unsupported
+   * MatrixFunctions module</a>.*/
+  const MatrixFunctionReturnValue<Derived> matrixFunction(StemFunction f) const;
+  EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine)
+  EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine)
+  EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, atanh, inverse hyperbolic cosine)
+  EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, acosh, inverse hyperbolic cosine)
+  EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, asinh, inverse hyperbolic sine)
+  EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine)
+  EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine)
+  EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root)
+  EIGEN_MATRIX_FUNCTION(MatrixLogarithmReturnValue, log, logarithm)
+  EIGEN_MATRIX_FUNCTION_1(MatrixPowerReturnValue, pow, power to \c p, const RealScalar& p)
+  EIGEN_MATRIX_FUNCTION_1(MatrixComplexPowerReturnValue, pow, power to \c p, const internal::make_complex_t<Scalar>& p)
+
+ protected:
+  EIGEN_DEFAULT_COPY_CONSTRUCTOR(MatrixBase)
+  EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MatrixBase)
+
+ private:
+  EIGEN_DEVICE_FUNC explicit MatrixBase(int);
+  EIGEN_DEVICE_FUNC MatrixBase(int, int);
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC explicit MatrixBase(const MatrixBase<OtherDerived>&);
+
+ protected:
+  // mixing arrays and matrices is not legal
+  template <typename OtherDerived>
+  Derived& operator+=(const ArrayBase<OtherDerived>&) {
+    EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar)) == -1,
+                        YOU_CANNOT_MIX_ARRAYS_AND_MATRICES);
+    return *this;
+  }
+  // mixing arrays and matrices is not legal
+  template <typename OtherDerived>
+  Derived& operator-=(const ArrayBase<OtherDerived>&) {
+    EIGEN_STATIC_ASSERT(std::ptrdiff_t(sizeof(typename OtherDerived::Scalar)) == -1,
+                        YOU_CANNOT_MIX_ARRAYS_AND_MATRICES);
+    return *this;
+  }
 };
 
-
 /***************************************************************************
-* Implementation of matrix base methods
-***************************************************************************/
+ * Implementation of matrix base methods
+ ***************************************************************************/
 
 /** replaces \c *this by \c *this * \a other.
-  *
-  * \returns a reference to \c *this
-  *
-  * Example: \include MatrixBase_applyOnTheRight.cpp
-  * Output: \verbinclude MatrixBase_applyOnTheRight.out
-  */
-template<typename Derived>
-template<typename OtherDerived>
-inline Derived&
-MatrixBase<Derived>::operator*=(const EigenBase<OtherDerived> &other)
-{
+ *
+ * \returns a reference to \c *this
+ *
+ * Example: \include MatrixBase_applyOnTheRight.cpp
+ * Output: \verbinclude MatrixBase_applyOnTheRight.out
+ */
+template <typename Derived>
+template <typename OtherDerived>
+inline Derived& MatrixBase<Derived>::operator*=(const EigenBase<OtherDerived>& other) {
   other.derived().applyThisOnTheRight(derived());
   return derived();
 }
 
 /** replaces \c *this by \c *this * \a other. It is equivalent to MatrixBase::operator*=().
-  *
-  * Example: \include MatrixBase_applyOnTheRight.cpp
-  * Output: \verbinclude MatrixBase_applyOnTheRight.out
-  */
-template<typename Derived>
-template<typename OtherDerived>
-inline void MatrixBase<Derived>::applyOnTheRight(const EigenBase<OtherDerived> &other)
-{
+ *
+ * Example: \include MatrixBase_applyOnTheRight.cpp
+ * Output: \verbinclude MatrixBase_applyOnTheRight.out
+ */
+template <typename Derived>
+template <typename OtherDerived>
+inline void MatrixBase<Derived>::applyOnTheRight(const EigenBase<OtherDerived>& other) {
   other.derived().applyThisOnTheRight(derived());
 }
 
 /** replaces \c *this by \a other * \c *this.
-  *
-  * Example: \include MatrixBase_applyOnTheLeft.cpp
-  * Output: \verbinclude MatrixBase_applyOnTheLeft.out
-  */
-template<typename Derived>
-template<typename OtherDerived>
-inline void MatrixBase<Derived>::applyOnTheLeft(const EigenBase<OtherDerived> &other)
-{
+ *
+ * Example: \include MatrixBase_applyOnTheLeft.cpp
+ * Output: \verbinclude MatrixBase_applyOnTheLeft.out
+ */
+template <typename Derived>
+template <typename OtherDerived>
+inline void MatrixBase<Derived>::applyOnTheLeft(const EigenBase<OtherDerived>& other) {
   other.derived().applyThisOnTheLeft(derived());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_MATRIXBASE_H
+#endif  // EIGEN_MATRIXBASE_H
diff --git a/inst/include/Eigen/src/Core/NestByValue.h b/inst/include/Eigen/src/Core/NestByValue.h
index a893b176..2ce83a8c 100644
--- a/inst/include/Eigen/src/Core/NestByValue.h
+++ b/inst/include/Eigen/src/Core/NestByValue.h
@@ -11,101 +11,81 @@
 #ifndef EIGEN_NESTBYVALUE_H
 #define EIGEN_NESTBYVALUE_H
 
-namespace Eigen {
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-/** \class NestByValue
-  * \ingroup Core_Module
-  *
-  * \brief Expression which must be nested by value
-  *
-  * \param ExpressionType the type of the object of which we are requiring nesting-by-value
-  *
-  * This class is the return type of MatrixBase::nestByValue()
-  * and most of the time this is the only way it is used.
-  *
-  * \sa MatrixBase::nestByValue()
-  */
+namespace Eigen {
 
 namespace internal {
-template<typename ExpressionType>
-struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType>
-{};
-}
+template <typename ExpressionType>
+struct traits<NestByValue<ExpressionType> > : public traits<ExpressionType> {
+  enum { Flags = traits<ExpressionType>::Flags & ~NestByRefBit };
+};
+}  // namespace internal
 
-template<typename ExpressionType> class NestByValue
-  : public internal::dense_xpr_base< NestByValue<ExpressionType> >::type
-{
-  public:
-
-    typedef typename internal::dense_xpr_base<NestByValue>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(NestByValue)
-
-    inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
-
-    inline Index rows() const { return m_expression.rows(); }
-    inline Index cols() const { return m_expression.cols(); }
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    inline Index innerStride() const { return m_expression.innerStride(); }
-
-    inline const CoeffReturnType coeff(Index row, Index col) const
-    {
-      return m_expression.coeff(row, col);
-    }
-
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      return m_expression.const_cast_derived().coeffRef(row, col);
-    }
-
-    inline const CoeffReturnType coeff(Index index) const
-    {
-      return m_expression.coeff(index);
-    }
-
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index row, Index col) const
-    {
-      return m_expression.template packet<LoadMode>(row, col);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index row, Index col, const PacketScalar& x)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(row, col, x);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return m_expression.template packet<LoadMode>(index);
-    }
-
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
-    {
-      m_expression.const_cast_derived().template writePacket<LoadMode>(index, x);
-    }
-
-    operator const ExpressionType&() const { return m_expression; }
-
-  protected:
-    const ExpressionType m_expression;
+/** \class NestByValue
+ * \ingroup Core_Module
+ *
+ * \brief Expression which must be nested by value
+ *
+ * \tparam ExpressionType the type of the object of which we are requiring nesting-by-value
+ *
+ * This class is the return type of MatrixBase::nestByValue()
+ * and most of the time this is the only way it is used.
+ *
+ * \sa MatrixBase::nestByValue()
+ */
+template <typename ExpressionType>
+class NestByValue : public internal::dense_xpr_base<NestByValue<ExpressionType> >::type {
+ public:
+  typedef typename internal::dense_xpr_base<NestByValue>::type Base;
+  static constexpr bool HasDirectAccess = internal::has_direct_access<ExpressionType>::ret;
+
+  EIGEN_DENSE_PUBLIC_INTERFACE(NestByValue)
+
+  EIGEN_DEVICE_FUNC explicit inline NestByValue(const ExpressionType& matrix) : m_expression(matrix) {}
+
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_expression.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_expression.cols(); }
+
+  EIGEN_DEVICE_FUNC operator const ExpressionType&() const { return m_expression; }
+
+  EIGEN_DEVICE_FUNC const ExpressionType& nestedExpression() const { return m_expression; }
+
+  EIGEN_DEVICE_FUNC typename std::enable_if<HasDirectAccess, const Scalar*>::type data() const {
+    return m_expression.data();
+  }
+
+  EIGEN_DEVICE_FUNC typename std::enable_if<HasDirectAccess, Index>::type innerStride() const {
+    return m_expression.innerStride();
+  }
+
+  EIGEN_DEVICE_FUNC typename std::enable_if<HasDirectAccess, Index>::type outerStride() const {
+    return m_expression.outerStride();
+  }
+
+ protected:
+  const ExpressionType m_expression;
 };
 
 /** \returns an expression of the temporary version of *this.
-  */
-template<typename Derived>
-inline const NestByValue<Derived>
-DenseBase<Derived>::nestByValue() const
-{
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline const NestByValue<Derived> DenseBase<Derived>::nestByValue() const {
   return NestByValue<Derived>(derived());
 }
 
-} // end namespace Eigen
+namespace internal {
+
+// Evaluator of Solve -> eval into a temporary
+template <typename ArgType>
+struct evaluator<NestByValue<ArgType> > : public evaluator<ArgType> {
+  typedef evaluator<ArgType> Base;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const NestByValue<ArgType>& xpr) : Base(xpr.nestedExpression()) {}
+};
+}  // namespace internal
+
+}  // end namespace Eigen
 
-#endif // EIGEN_NESTBYVALUE_H
+#endif  // EIGEN_NESTBYVALUE_H
diff --git a/inst/include/Eigen/src/Core/NoAlias.h b/inst/include/Eigen/src/Core/NoAlias.h
index 768bfb18..b6c72091 100644
--- a/inst/include/Eigen/src/Core/NoAlias.h
+++ b/inst/include/Eigen/src/Core/NoAlias.h
@@ -10,125 +10,93 @@
 #ifndef EIGEN_NOALIAS_H
 #define EIGEN_NOALIAS_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
 /** \class NoAlias
-  * \ingroup Core_Module
-  *
-  * \brief Pseudo expression providing an operator = assuming no aliasing
-  *
-  * \param ExpressionType the type of the object on which to do the lazy assignment
-  *
-  * This class represents an expression with special assignment operators
-  * assuming no aliasing between the target expression and the source expression.
-  * More precisely it alloas to bypass the EvalBeforeAssignBit flag of the source expression.
-  * It is the return type of MatrixBase::noalias()
-  * and most of the time this is the only way it is used.
-  *
-  * \sa MatrixBase::noalias()
-  */
-template<typename ExpressionType, template <typename> class StorageBase>
-class NoAlias
-{
-    typedef typename ExpressionType::Scalar Scalar;
-  public:
-    NoAlias(ExpressionType& expression) : m_expression(expression) {}
-
-    /** Behaves like MatrixBase::lazyAssign(other)
-      * \sa MatrixBase::lazyAssign() */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE ExpressionType& operator=(const StorageBase<OtherDerived>& other)
-    { return internal::assign_selector<ExpressionType,OtherDerived,false>::run(m_expression,other.derived()); }
-
-    /** \sa MatrixBase::operator+= */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE ExpressionType& operator+=(const StorageBase<OtherDerived>& other)
-    {
-      typedef SelfCwiseBinaryOp<internal::scalar_sum_op<Scalar>, ExpressionType, OtherDerived> SelfAdder;
-      SelfAdder tmp(m_expression);
-      typedef typename internal::nested<OtherDerived>::type OtherDerivedNested;
-      typedef typename internal::remove_all<OtherDerivedNested>::type _OtherDerivedNested;
-      internal::assign_selector<SelfAdder,_OtherDerivedNested,false>::run(tmp,OtherDerivedNested(other.derived()));
-      return m_expression;
-    }
-
-    /** \sa MatrixBase::operator-= */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE ExpressionType& operator-=(const StorageBase<OtherDerived>& other)
-    {
-      typedef SelfCwiseBinaryOp<internal::scalar_difference_op<Scalar>, ExpressionType, OtherDerived> SelfAdder;
-      SelfAdder tmp(m_expression);
-      typedef typename internal::nested<OtherDerived>::type OtherDerivedNested;
-      typedef typename internal::remove_all<OtherDerivedNested>::type _OtherDerivedNested;
-      internal::assign_selector<SelfAdder,_OtherDerivedNested,false>::run(tmp,OtherDerivedNested(other.derived()));
-      return m_expression;
-    }
+ * \ingroup Core_Module
+ *
+ * \brief Pseudo expression providing an operator = assuming no aliasing
+ *
+ * \tparam ExpressionType the type of the object on which to do the lazy assignment
+ *
+ * This class represents an expression with special assignment operators
+ * assuming no aliasing between the target expression and the source expression.
+ * More precisely it alloas to bypass the EvalBeforeAssignBit flag of the source expression.
+ * It is the return type of MatrixBase::noalias()
+ * and most of the time this is the only way it is used.
+ *
+ * \sa MatrixBase::noalias()
+ */
+template <typename ExpressionType, template <typename> class StorageBase>
+class NoAlias {
+ public:
+  typedef typename ExpressionType::Scalar Scalar;
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE ExpressionType& operator+=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    { other.derived().addTo(m_expression); return m_expression; }
+  EIGEN_DEVICE_FUNC explicit NoAlias(ExpressionType& expression) : m_expression(expression) {}
 
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE ExpressionType& operator-=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    { other.derived().subTo(m_expression); return m_expression; }
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ExpressionType& operator=(const StorageBase<OtherDerived>& other) {
+    call_assignment_no_alias(m_expression, other.derived(),
+                             internal::assign_op<Scalar, typename OtherDerived::Scalar>());
+    return m_expression;
+  }
 
-    template<typename Lhs, typename Rhs, int NestingFlags>
-    EIGEN_STRONG_INLINE ExpressionType& operator+=(const CoeffBasedProduct<Lhs,Rhs,NestingFlags>& other)
-    { return m_expression.derived() += CoeffBasedProduct<Lhs,Rhs,NestByRefBit>(other.lhs(), other.rhs()); }
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ExpressionType& operator+=(const StorageBase<OtherDerived>& other) {
+    call_assignment_no_alias(m_expression, other.derived(),
+                             internal::add_assign_op<Scalar, typename OtherDerived::Scalar>());
+    return m_expression;
+  }
 
-    template<typename Lhs, typename Rhs, int NestingFlags>
-    EIGEN_STRONG_INLINE ExpressionType& operator-=(const CoeffBasedProduct<Lhs,Rhs,NestingFlags>& other)
-    { return m_expression.derived() -= CoeffBasedProduct<Lhs,Rhs,NestByRefBit>(other.lhs(), other.rhs()); }
-    
-    template<typename OtherDerived>
-    ExpressionType& operator=(const ReturnByValue<OtherDerived>& func)
-    { return m_expression = func; }
-#endif
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ExpressionType& operator-=(const StorageBase<OtherDerived>& other) {
+    call_assignment_no_alias(m_expression, other.derived(),
+                             internal::sub_assign_op<Scalar, typename OtherDerived::Scalar>());
+    return m_expression;
+  }
 
-    ExpressionType& expression() const
-    {
-      return m_expression;
-    }
+  EIGEN_DEVICE_FUNC ExpressionType& expression() const { return m_expression; }
 
-  protected:
-    ExpressionType& m_expression;
+ protected:
+  ExpressionType& m_expression;
 };
 
 /** \returns a pseudo expression of \c *this with an operator= assuming
-  * no aliasing between \c *this and the source expression.
-  *
-  * More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag.
-  * Currently, even though several expressions may alias, only product
-  * expressions have this flag. Therefore, noalias() is only usefull when
-  * the source expression contains a matrix product.
-  *
-  * Here are some examples where noalias is usefull:
-  * \code
-  * D.noalias()  = A * B;
-  * D.noalias() += A.transpose() * B;
-  * D.noalias() -= 2 * A * B.adjoint();
-  * \endcode
-  *
-  * On the other hand the following example will lead to a \b wrong result:
-  * \code
-  * A.noalias() = A * B;
-  * \endcode
-  * because the result matrix A is also an operand of the matrix product. Therefore,
-  * there is no alternative than evaluating A * B in a temporary, that is the default
-  * behavior when you write:
-  * \code
-  * A = A * B;
-  * \endcode
-  *
-  * \sa class NoAlias
-  */
-template<typename Derived>
-NoAlias<Derived,MatrixBase> MatrixBase<Derived>::noalias()
-{
-  return derived();
+ * no aliasing between \c *this and the source expression.
+ *
+ * More precisely, noalias() allows to bypass the EvalBeforeAssignBit flag.
+ * Currently, even though several expressions may alias, only product
+ * expressions have this flag. Therefore, noalias() is only useful when
+ * the source expression contains a matrix product.
+ *
+ * Here are some examples where noalias is useful:
+ * \code
+ * D.noalias()  = A * B;
+ * D.noalias() += A.transpose() * B;
+ * D.noalias() -= 2 * A * B.adjoint();
+ * \endcode
+ *
+ * On the other hand the following example will lead to a \b wrong result:
+ * \code
+ * A.noalias() = A * B;
+ * \endcode
+ * because the result matrix A is also an operand of the matrix product. Therefore,
+ * there is no alternative than evaluating A * B in a temporary, that is the default
+ * behavior when you write:
+ * \code
+ * A = A * B;
+ * \endcode
+ *
+ * \sa class NoAlias
+ */
+template <typename Derived>
+NoAlias<Derived, MatrixBase> EIGEN_DEVICE_FUNC MatrixBase<Derived>::noalias() {
+  return NoAlias<Derived, Eigen::MatrixBase>(derived());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_NOALIAS_H
+#endif  // EIGEN_NOALIAS_H
diff --git a/inst/include/Eigen/src/Core/NumTraits.h b/inst/include/Eigen/src/Core/NumTraits.h
index bac9e50b..bf41c3bb 100644
--- a/inst/include/Eigen/src/Core/NumTraits.h
+++ b/inst/include/Eigen/src/Core/NumTraits.h
@@ -10,46 +10,183 @@
 #ifndef EIGEN_NUMTRAITS_H
 #define EIGEN_NUMTRAITS_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
-/** \class NumTraits
-  * \ingroup Core_Module
-  *
-  * \brief Holds information about the various numeric (i.e. scalar) types allowed by Eigen.
-  *
-  * \param T the numeric type at hand
-  *
-  * This class stores enums, typedefs and static methods giving information about a numeric type.
-  *
-  * The provided data consists of:
-  * \li A typedef \a Real, giving the "real part" type of \a T. If \a T is already real,
-  *     then \a Real is just a typedef to \a T. If \a T is \c std::complex<U> then \a Real
-  *     is a typedef to \a U.
-  * \li A typedef \a NonInteger, giving the type that should be used for operations producing non-integral values,
-  *     such as quotients, square roots, etc. If \a T is a floating-point type, then this typedef just gives
-  *     \a T again. Note however that many Eigen functions such as internal::sqrt simply refuse to
-  *     take integers. Outside of a few cases, Eigen doesn't do automatic type promotion. Thus, this typedef is
-  *     only intended as a helper for code that needs to explicitly promote types.
-  * \li A typedef \a Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
-  *     this means, just use \a T here.
-  * \li An enum value \a IsComplex. It is equal to 1 if \a T is a \c std::complex
-  *     type, and to 0 otherwise.
-  * \li An enum value \a IsInteger. It is equal to \c 1 if \a T is an integer type such as \c int,
-  *     and to \c 0 otherwise.
-  * \li Enum values ReadCost, AddCost and MulCost representing a rough estimate of the number of CPU cycles needed
-  *     to by move / add / mul instructions respectively, assuming the data is already stored in CPU registers.
-  *     Stay vague here. No need to do architecture-specific stuff.
-  * \li An enum value \a IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
-  * \li An enum value \a RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must
-  *     be called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise.
-  * \li An epsilon() function which, unlike std::numeric_limits::epsilon(), returns a \a Real instead of a \a T.
-  * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default
-  *     value by the fuzzy comparison operators.
-  * \li highest() and lowest() functions returning the highest and lowest possible values respectively.
-  */
-
-template<typename T> struct GenericNumTraits
+namespace internal {
+
+// default implementation of digits(), based on numeric_limits if specialized,
+// 0 for integer types, and log2(epsilon()) otherwise.
+template <typename T, bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
+          bool is_integer = NumTraits<T>::IsInteger>
+struct default_digits_impl {
+  EIGEN_DEVICE_FUNC constexpr static int run() { return std::numeric_limits<T>::digits; }
+};
+
+template <typename T>
+struct default_digits_impl<T, false, false>  // Floating point
+{
+  EIGEN_DEVICE_FUNC constexpr static int run() {
+    using std::ceil;
+    using std::log2;
+    typedef typename NumTraits<T>::Real Real;
+    return int(ceil(-log2(NumTraits<Real>::epsilon())));
+  }
+};
+
+template <typename T>
+struct default_digits_impl<T, false, true>  // Integer
 {
+  EIGEN_DEVICE_FUNC constexpr static int run() { return 0; }
+};
+
+// default implementation of digits10(), based on numeric_limits if specialized,
+// 0 for integer types, and floor((digits()-1)*log10(2)) otherwise.
+template <typename T, bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
+          bool is_integer = NumTraits<T>::IsInteger>
+struct default_digits10_impl {
+  EIGEN_DEVICE_FUNC constexpr static int run() { return std::numeric_limits<T>::digits10; }
+};
+
+template <typename T>
+struct default_digits10_impl<T, false, false>  // Floating point
+{
+  EIGEN_DEVICE_FUNC constexpr static int run() {
+    using std::floor;
+    using std::log10;
+    typedef typename NumTraits<T>::Real Real;
+    return int(floor((internal::default_digits_impl<Real>::run() - 1) * log10(2)));
+  }
+};
+
+template <typename T>
+struct default_digits10_impl<T, false, true>  // Integer
+{
+  EIGEN_DEVICE_FUNC constexpr static int run() { return 0; }
+};
+
+// default implementation of max_digits10(), based on numeric_limits if specialized,
+// 0 for integer types, and log10(2) * digits() + 1 otherwise.
+template <typename T, bool use_numeric_limits = std::numeric_limits<T>::is_specialized,
+          bool is_integer = NumTraits<T>::IsInteger>
+struct default_max_digits10_impl {
+  EIGEN_DEVICE_FUNC constexpr static int run() { return std::numeric_limits<T>::max_digits10; }
+};
+
+template <typename T>
+struct default_max_digits10_impl<T, false, false>  // Floating point
+{
+  EIGEN_DEVICE_FUNC constexpr static int run() {
+    using std::ceil;
+    using std::log10;
+    typedef typename NumTraits<T>::Real Real;
+    return int(ceil(internal::default_digits_impl<Real>::run() * log10(2) + 1));
+  }
+};
+
+template <typename T>
+struct default_max_digits10_impl<T, false, true>  // Integer
+{
+  EIGEN_DEVICE_FUNC constexpr static int run() { return 0; }
+};
+
+}  // end namespace internal
+
+namespace numext {
+
+/** \internal bit-wise cast without changing the underlying bit representation. */
+#if defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L
+template <typename Tgt, typename Src>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Tgt bit_cast(const Src& src) {
+  return std::bit_cast<Tgt>(src);
+}
+#elif EIGEN_HAS_BUILTIN(__builtin_bit_cast)
+template <typename Tgt, typename Src>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Tgt bit_cast(const Src& src) {
+  EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Src>::value, THIS_TYPE_IS_NOT_SUPPORTED)
+  EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Tgt>::value, THIS_TYPE_IS_NOT_SUPPORTED)
+  EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED)
+  return __builtin_bit_cast(Tgt, src);
+}
+#else
+template <typename Tgt, typename Src>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Tgt bit_cast(const Src& src) {
+  // The behaviour of memcpy is not specified for non-trivially copyable types
+  EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Src>::value, THIS_TYPE_IS_NOT_SUPPORTED)
+  EIGEN_STATIC_ASSERT(std::is_trivially_copyable<Tgt>::value && std::is_default_constructible<Tgt>::value,
+                      THIS_TYPE_IS_NOT_SUPPORTED)
+  EIGEN_STATIC_ASSERT(sizeof(Src) == sizeof(Tgt), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  Tgt tgt;
+  // Load src into registers first. This allows the memcpy to be elided by CUDA.
+  const Src staged = src;
+  EIGEN_USING_STD(memcpy)
+  memcpy(static_cast<void*>(&tgt), static_cast<const void*>(&staged), sizeof(Tgt));
+  return tgt;
+}
+#endif
+}  // namespace numext
+
+// clang-format off
+/** \class NumTraits
+ * \ingroup Core_Module
+ *
+ * \brief Holds information about the various numeric (i.e. scalar) types allowed by Eigen.
+ *
+ * \tparam T the numeric type at hand
+ *
+ * This class stores enums, typedefs and static methods giving information about a numeric type.
+ *
+ * The provided data consists of:
+ * \li A typedef \c Real, giving the "real part" type of \a T. If \a T is already real,
+ *     then \c Real is just a typedef to \a T. If \a T is `std::complex<U>` then \c Real
+ *     is a typedef to \a U.
+ * \li A typedef \c NonInteger, giving the type that should be used for operations producing non-integral values,
+ *     such as quotients, square roots, etc. If \a T is a floating-point type, then this typedef just gives
+ *     \a T again. Note however that many Eigen functions such as internal::sqrt simply refuse to
+ *     take integers. Outside of a few cases, Eigen doesn't do automatic type promotion. Thus, this typedef is
+ *     only intended as a helper for code that needs to explicitly promote types.
+ * \li A typedef \c Literal giving the type to use for numeric literals such as "2" or "0.5". For instance, for
+ *     `std::complex<U>`, Literal is defined as \a U. Of course, this type must be fully compatible with \a T. In doubt,
+ *     just use \a T here.
+ * \li A typedef \c Nested giving the type to use to nest a value inside of the expression tree. If you don't know what
+ *     this means, just use \a T here.
+ * \li An enum value \c IsComplex. It is equal to 1 if \a T is a \c std::complex type, and to 0 otherwise.
+ * \li An enum value \c IsInteger. It is equal to \c 1 if \a T is an integer type such as \c int, and to \c 0 otherwise.
+ * \li Enum values \c ReadCost, \c AddCost and \c MulCost representing a rough estimate of the number of CPU cycles needed to by
+ *     move / add / mul instructions respectively, assuming the data is already stored in CPU registers. Stay vague here.
+ *     No need to do architecture-specific stuff. If you don't know what this means, just use \c Eigen::HugeCost.
+ * \li An enum value \c IsSigned. It is equal to \c 1 if \a T is a signed type and to 0 if \a T is unsigned.
+ * \li An enum value \c RequireInitialization. It is equal to \c 1 if the constructor of the numeric type \a T must be
+ *     called, and to 0 if it is safe not to call it. Default is 0 if \a T is an arithmetic type, and 1 otherwise.
+ * \li An epsilon() function which, unlike <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/epsilon">
+ *     `std::numeric_limits::epsilon()`</a>, it returns a \c Real instead of a \a T.
+ * \li A dummy_precision() function returning a weak epsilon value. It is mainly used as a default value by the fuzzy
+ *     comparison operators.
+ * \li highest() and lowest() functions returning the highest and lowest possible values respectively.
+ * \li digits() function returning the number of radix digits (non-sign digits for integers, mantissa for floating-point).
+ *     This is the analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits">
+ *     `std::numeric_limits<T>::digits`</a> which is used as the default implementation if specialized.
+ * \li digits10() function returning the number of decimal digits that can be represented without change. This is the
+ *     analogue of <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/digits10">
+ *     `std::numeric_limits<T>::digits10`</a> which is used as the default implementation if specialized.
+ * \li max_digits10() function returning the number of decimal digits required to uniquely represent all distinct values
+ *     of the type. This is the analogue of <a
+ *     href="http://en.cppreference.com/w/cpp/types/numeric_limits/max_digits10">`std::numeric_limits<T>::max_digits10`</a>
+ *     which is used as the default implementation if specialized.
+ * \li min_exponent() and max_exponent() functions returning the highest and lowest possible values, respectively,
+ *     such that the radix raised to the power exponent-1 is a normalized floating-point number.  These are equivalent
+ *     to <a href="http://en.cppreference.com/w/cpp/types/numeric_limits/min_exponent">
+ *     `std::numeric_limits<T>::min_exponent`</a>/<a
+ *     href="http://en.cppreference.com/w/cpp/types/numeric_limits/max_exponent">`std::numeric_limits<T>::max_exponent`</a>.
+ * \li infinity() function returning a representation of positive infinity, if available.
+ * \li quiet_NaN() function returning a non-signaling "not-a-number", if available.
+ */
+// clang-format on
+template <typename T>
+struct GenericNumTraits {
   enum {
     IsInteger = std::numeric_limits<T>::is_integer,
     IsSigned = std::numeric_limits<T>::is_signed,
@@ -61,90 +198,138 @@ template<typename T> struct GenericNumTraits
   };
 
   typedef T Real;
-  typedef typename internal::conditional<
-                     IsInteger,
-                     typename internal::conditional<sizeof(T)<=2, float, double>::type,
-                     T
-                   >::type NonInteger;
+  typedef std::conditional_t<IsInteger, std::conditional_t<sizeof(T) <= 2, float, double>, T> NonInteger;
   typedef T Nested;
+  typedef T Literal;
+
+  EIGEN_DEVICE_FUNC constexpr static Real epsilon() { return numext::numeric_limits<T>::epsilon(); }
+
+  EIGEN_DEVICE_FUNC constexpr static int digits10() { return internal::default_digits10_impl<T>::run(); }
+
+  EIGEN_DEVICE_FUNC constexpr static int max_digits10() { return internal::default_max_digits10_impl<T>::run(); }
 
-  static inline Real epsilon() { return std::numeric_limits<T>::epsilon(); }
-  static inline Real dummy_precision()
-  {
+  EIGEN_DEVICE_FUNC constexpr static int digits() { return internal::default_digits_impl<T>::run(); }
+
+  EIGEN_DEVICE_FUNC constexpr static int min_exponent() { return numext::numeric_limits<T>::min_exponent; }
+
+  EIGEN_DEVICE_FUNC constexpr static int max_exponent() { return numext::numeric_limits<T>::max_exponent; }
+
+  EIGEN_DEVICE_FUNC constexpr static Real dummy_precision() {
     // make sure to override this for floating-point types
     return Real(0);
   }
-  static inline T highest() { return (std::numeric_limits<T>::max)(); }
-  static inline T lowest()  { return IsInteger ? (std::numeric_limits<T>::min)() : (-(std::numeric_limits<T>::max)()); }
-  
-#ifdef EIGEN2_SUPPORT
-  enum {
-    HasFloatingPoint = !IsInteger
-  };
-  typedef NonInteger FloatingPoint;
-#endif
+
+  EIGEN_DEVICE_FUNC constexpr static T highest() { return (numext::numeric_limits<T>::max)(); }
+
+  EIGEN_DEVICE_FUNC constexpr static T lowest() { return (numext::numeric_limits<T>::lowest)(); }
+
+  EIGEN_DEVICE_FUNC constexpr static T infinity() { return numext::numeric_limits<T>::infinity(); }
+
+  EIGEN_DEVICE_FUNC constexpr static T quiet_NaN() { return numext::numeric_limits<T>::quiet_NaN(); }
 };
 
-template<typename T> struct NumTraits : GenericNumTraits<T>
-{};
+template <typename T>
+struct NumTraits : GenericNumTraits<T> {};
 
-template<> struct NumTraits<float>
-  : GenericNumTraits<float>
-{
-  static inline float dummy_precision() { return 1e-5f; }
+template <>
+struct NumTraits<float> : GenericNumTraits<float> {
+  EIGEN_DEVICE_FUNC constexpr static float dummy_precision() { return 1e-5f; }
 };
 
-template<> struct NumTraits<double> : GenericNumTraits<double>
-{
-  static inline double dummy_precision() { return 1e-12; }
+template <>
+struct NumTraits<double> : GenericNumTraits<double> {
+  EIGEN_DEVICE_FUNC constexpr static double dummy_precision() { return 1e-12; }
 };
 
-template<> struct NumTraits<long double>
-  : GenericNumTraits<long double>
-{
-  static inline long double dummy_precision() { return 1e-15l; }
+// GPU devices treat `long double` as `double`.
+#ifndef EIGEN_GPU_COMPILE_PHASE
+template <>
+struct NumTraits<long double> : GenericNumTraits<long double> {
+  EIGEN_DEVICE_FUNC constexpr static long double dummy_precision() { return static_cast<long double>(1e-15l); }
+
+#if defined(EIGEN_ARCH_PPC) && (__LDBL_MANT_DIG__ == 106)
+  // PowerPC double double causes issues with some values
+  EIGEN_DEVICE_FUNC constexpr static long double epsilon() {
+    // 2^(-(__LDBL_MANT_DIG__)+1)
+    return static_cast<long double>(2.4651903288156618919116517665087e-32l);
+  }
+#endif
 };
+#endif
 
-template<typename _Real> struct NumTraits<std::complex<_Real> >
-  : GenericNumTraits<std::complex<_Real> >
-{
-  typedef _Real Real;
+template <typename Real_>
+struct NumTraits<std::complex<Real_> > : GenericNumTraits<std::complex<Real_> > {
+  typedef Real_ Real;
+  typedef typename NumTraits<Real_>::Literal Literal;
   enum {
     IsComplex = 1,
-    RequireInitialization = NumTraits<_Real>::RequireInitialization,
-    ReadCost = 2 * NumTraits<_Real>::ReadCost,
+    IsSigned = NumTraits<Real_>::IsSigned,
+    RequireInitialization = NumTraits<Real_>::RequireInitialization,
+    ReadCost = 2 * NumTraits<Real_>::ReadCost,
     AddCost = 2 * NumTraits<Real>::AddCost,
     MulCost = 4 * NumTraits<Real>::MulCost + 2 * NumTraits<Real>::AddCost
   };
 
-  static inline Real epsilon() { return NumTraits<Real>::epsilon(); }
-  static inline Real dummy_precision() { return NumTraits<Real>::dummy_precision(); }
+  EIGEN_DEVICE_FUNC constexpr static Real epsilon() { return NumTraits<Real>::epsilon(); }
+  EIGEN_DEVICE_FUNC constexpr static Real dummy_precision() { return NumTraits<Real>::dummy_precision(); }
+  EIGEN_DEVICE_FUNC constexpr static int digits10() { return NumTraits<Real>::digits10(); }
+  EIGEN_DEVICE_FUNC constexpr static int max_digits10() { return NumTraits<Real>::max_digits10(); }
 };
 
-template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
-struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> >
-{
+template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct NumTraits<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> > {
   typedef Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols> ArrayType;
   typedef typename NumTraits<Scalar>::Real RealScalar;
   typedef Array<RealScalar, Rows, Cols, Options, MaxRows, MaxCols> Real;
   typedef typename NumTraits<Scalar>::NonInteger NonIntegerScalar;
   typedef Array<NonIntegerScalar, Rows, Cols, Options, MaxRows, MaxCols> NonInteger;
-  typedef ArrayType & Nested;
-  
+  typedef ArrayType& Nested;
+  typedef typename NumTraits<Scalar>::Literal Literal;
+
   enum {
     IsComplex = NumTraits<Scalar>::IsComplex,
     IsInteger = NumTraits<Scalar>::IsInteger,
-    IsSigned  = NumTraits<Scalar>::IsSigned,
+    IsSigned = NumTraits<Scalar>::IsSigned,
     RequireInitialization = 1,
-    ReadCost = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::ReadCost,
-    AddCost  = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::AddCost,
-    MulCost  = ArrayType::SizeAtCompileTime==Dynamic ? Dynamic : ArrayType::SizeAtCompileTime * NumTraits<Scalar>::MulCost
+    ReadCost = ArrayType::SizeAtCompileTime == Dynamic
+                   ? HugeCost
+                   : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::ReadCost),
+    AddCost = ArrayType::SizeAtCompileTime == Dynamic ? HugeCost
+                                                      : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::AddCost),
+    MulCost = ArrayType::SizeAtCompileTime == Dynamic ? HugeCost
+                                                      : ArrayType::SizeAtCompileTime * int(NumTraits<Scalar>::MulCost)
   };
-  
-  static inline RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
-  static inline RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
+
+  EIGEN_DEVICE_FUNC constexpr static RealScalar epsilon() { return NumTraits<RealScalar>::epsilon(); }
+  EIGEN_DEVICE_FUNC constexpr static RealScalar dummy_precision() { return NumTraits<RealScalar>::dummy_precision(); }
+
+  constexpr static int digits10() { return NumTraits<Scalar>::digits10(); }
+  constexpr static int max_digits10() { return NumTraits<Scalar>::max_digits10(); }
+};
+
+template <>
+struct NumTraits<std::string> : GenericNumTraits<std::string> {
+  enum { RequireInitialization = 1, ReadCost = HugeCost, AddCost = HugeCost, MulCost = HugeCost };
+
+  constexpr static int digits10() { return 0; }
+  constexpr static int max_digits10() { return 0; }
+
+ private:
+  static inline std::string epsilon();
+  static inline std::string dummy_precision();
+  static inline std::string lowest();
+  static inline std::string highest();
+  static inline std::string infinity();
+  static inline std::string quiet_NaN();
 };
 
-} // end namespace Eigen
+// Empty specialization for void to allow template specialization based on NumTraits<T>::Real with T==void and SFINAE.
+template <>
+struct NumTraits<void> {};
+
+template <>
+struct NumTraits<bool> : GenericNumTraits<bool> {};
+
+}  // end namespace Eigen
 
-#endif // EIGEN_NUMTRAITS_H
+#endif  // EIGEN_NUMTRAITS_H
diff --git a/inst/include/Eigen/src/Core/PartialReduxEvaluator.h b/inst/include/Eigen/src/Core/PartialReduxEvaluator.h
new file mode 100644
index 00000000..1f638f9a
--- /dev/null
+++ b/inst/include/Eigen/src/Core/PartialReduxEvaluator.h
@@ -0,0 +1,253 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2011-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PARTIALREDUX_H
+#define EIGEN_PARTIALREDUX_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/***************************************************************************
+ *
+ * This file provides evaluators for partial reductions.
+ * There are two modes:
+ *
+ *  - scalar path: simply calls the respective function on the column or row.
+ *    -> nothing special here, all the tricky part is handled by the return
+ *       types of VectorwiseOp's members. They embed the functor calling the
+ *       respective DenseBase's member function.
+ *
+ *  - vectorized path: implements a packet-wise reductions followed by
+ *    some (optional) processing of the outcome, e.g., division by n for mean.
+ *
+ * For the vectorized path let's observe that the packet-size and outer-unrolling
+ * are both decided by the assignment logic. So all we have to do is to decide
+ * on the inner unrolling.
+ *
+ * For the unrolling, we can reuse "internal::redux_vec_unroller" from Redux.h,
+ * but be need to be careful to specify correct increment.
+ *
+ ***************************************************************************/
+
+/* logic deciding a strategy for unrolling of vectorized paths */
+template <typename Func, typename Evaluator>
+struct packetwise_redux_traits {
+  enum {
+    OuterSize = int(Evaluator::IsRowMajor) ? Evaluator::RowsAtCompileTime : Evaluator::ColsAtCompileTime,
+    Cost = OuterSize == Dynamic ? HugeCost
+                                : OuterSize * Evaluator::CoeffReadCost + (OuterSize - 1) * functor_traits<Func>::Cost,
+    Unrolling = Cost <= EIGEN_UNROLLING_LIMIT ? CompleteUnrolling : NoUnrolling
+  };
+};
+
+/* Value to be returned when size==0 , by default let's return 0 */
+template <typename PacketType, typename Func>
+EIGEN_DEVICE_FUNC PacketType packetwise_redux_empty_value(const Func&) {
+  const typename unpacket_traits<PacketType>::type zero(0);
+  return pset1<PacketType>(zero);
+}
+
+/* For products the default is 1 */
+template <typename PacketType, typename Scalar>
+EIGEN_DEVICE_FUNC PacketType packetwise_redux_empty_value(const scalar_product_op<Scalar, Scalar>&) {
+  return pset1<PacketType>(Scalar(1));
+}
+
+/* Perform the actual reduction */
+template <typename Func, typename Evaluator, int Unrolling = packetwise_redux_traits<Func, Evaluator>::Unrolling>
+struct packetwise_redux_impl;
+
+/* Perform the actual reduction with unrolling */
+template <typename Func, typename Evaluator>
+struct packetwise_redux_impl<Func, Evaluator, CompleteUnrolling> {
+  typedef redux_novec_unroller<Func, Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
+  typedef typename Evaluator::Scalar Scalar;
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE PacketType run(const Evaluator& eval, const Func& func, Index /*size*/) {
+    return redux_vec_unroller<Func, Evaluator, 0,
+                              packetwise_redux_traits<Func, Evaluator>::OuterSize>::template run<PacketType>(eval,
+                                                                                                             func);
+  }
+};
+
+/* Add a specialization of redux_vec_unroller for size==0 at compiletime.
+ * This specialization is not required for general reductions, which is
+ * why it is defined here.
+ */
+template <typename Func, typename Evaluator, Index Start>
+struct redux_vec_unroller<Func, Evaluator, Start, 0> {
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE PacketType run(const Evaluator&, const Func& f) {
+    return packetwise_redux_empty_value<PacketType>(f);
+  }
+};
+
+/* Perform the actual reduction for dynamic sizes */
+template <typename Func, typename Evaluator>
+struct packetwise_redux_impl<Func, Evaluator, NoUnrolling> {
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC static PacketType run(const Evaluator& eval, const Func& func, Index size) {
+    if (size == 0) return packetwise_redux_empty_value<PacketType>(func);
+
+    const Index size4 = 1 + numext::round_down(size - 1, 4);
+    PacketType p = eval.template packetByOuterInner<Unaligned, PacketType>(0, 0);
+    // This loop is optimized for instruction pipelining:
+    // - each iteration generates two independent instructions
+    // - thanks to branch prediction and out-of-order execution we have independent instructions across loops
+    for (Index i = 1; i < size4; i += 4)
+      p = func.packetOp(
+          p, func.packetOp(func.packetOp(eval.template packetByOuterInner<Unaligned, PacketType>(i + 0, 0),
+                                         eval.template packetByOuterInner<Unaligned, PacketType>(i + 1, 0)),
+                           func.packetOp(eval.template packetByOuterInner<Unaligned, PacketType>(i + 2, 0),
+                                         eval.template packetByOuterInner<Unaligned, PacketType>(i + 3, 0))));
+    for (Index i = size4; i < size; ++i)
+      p = func.packetOp(p, eval.template packetByOuterInner<Unaligned, PacketType>(i, 0));
+    return p;
+  }
+};
+
+template <typename Func, typename Evaluator>
+struct packetwise_segment_redux_impl {
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC static PacketType run(const Evaluator& eval, const Func& func, Index size, Index begin,
+                                          Index count) {
+    if (size == 0) return packetwise_redux_empty_value<PacketType>(func);
+
+    PacketType p = eval.template packetSegmentByOuterInner<Unaligned, PacketType>(0, 0, begin, count);
+    for (Index i = 1; i < size; ++i)
+      p = func.packetOp(p, eval.template packetSegmentByOuterInner<Unaligned, PacketType>(i, 0, begin, count));
+    return p;
+  }
+};
+
+template <typename ArgType, typename MemberOp, int Direction>
+struct evaluator<PartialReduxExpr<ArgType, MemberOp, Direction> >
+    : evaluator_base<PartialReduxExpr<ArgType, MemberOp, Direction> > {
+  typedef PartialReduxExpr<ArgType, MemberOp, Direction> XprType;
+  typedef typename internal::nested_eval<ArgType, 1>::type ArgTypeNested;
+  typedef add_const_on_value_type_t<ArgTypeNested> ConstArgTypeNested;
+  typedef internal::remove_all_t<ArgTypeNested> ArgTypeNestedCleaned;
+  typedef typename ArgType::Scalar InputScalar;
+  typedef typename XprType::Scalar Scalar;
+  enum {
+    TraversalSize = Direction == int(Vertical) ? int(ArgType::RowsAtCompileTime) : int(ArgType::ColsAtCompileTime)
+  };
+  typedef typename MemberOp::template Cost<int(TraversalSize)> CostOpType;
+  enum {
+    CoeffReadCost = TraversalSize == Dynamic ? HugeCost
+                    : TraversalSize == 0
+                        ? 1
+                        : int(TraversalSize) * int(evaluator<ArgType>::CoeffReadCost) + int(CostOpType::value),
+
+    ArgFlags_ = evaluator<ArgType>::Flags,
+
+    Vectorizable_ = bool(int(ArgFlags_) & PacketAccessBit) && bool(MemberOp::Vectorizable) &&
+                    (Direction == int(Vertical) ? bool(ArgFlags_ & RowMajorBit) : (ArgFlags_ & RowMajorBit) == 0) &&
+                    (TraversalSize != 0),
+
+    Flags = (traits<XprType>::Flags & RowMajorBit) | (evaluator<ArgType>::Flags & (HereditaryBits & (~RowMajorBit))) |
+            (Vectorizable_ ? PacketAccessBit : 0) | LinearAccessBit,
+
+    Alignment = 0  // FIXME this will need to be improved once PartialReduxExpr is vectorized
+  };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType xpr) : m_arg(xpr.nestedExpression()), m_functor(xpr.functor()) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(TraversalSize == Dynamic ? HugeCost
+                                                             : (TraversalSize == 0 ? 1 : int(CostOpType::value)));
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index i, Index j) const {
+    return coeff(Direction == Vertical ? j : i);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index index) const {
+    return m_functor(m_arg.template subVector<DirectionType(Direction)>(index));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index i, Index j) const {
+    return packet<LoadMode, PacketType>(Direction == Vertical ? j : i);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC PacketType packet(Index idx) const {
+    static constexpr int PacketSize = internal::unpacket_traits<PacketType>::size;
+    static constexpr int PanelRows = Direction == Vertical ? ArgType::RowsAtCompileTime : PacketSize;
+    static constexpr int PanelCols = Direction == Vertical ? PacketSize : ArgType::ColsAtCompileTime;
+    using PanelType = Block<const ArgTypeNestedCleaned, PanelRows, PanelCols, true /* InnerPanel */>;
+    using PanelEvaluator = typename internal::redux_evaluator<PanelType>;
+    using BinaryOp = typename MemberOp::BinaryOp;
+    using Impl = internal::packetwise_redux_impl<BinaryOp, PanelEvaluator>;
+
+    // FIXME
+    // See bug 1612, currently if PacketSize==1 (i.e. complex<double> with 128bits registers) then the storage-order of
+    // panel get reversed and methods like packetByOuterInner do not make sense anymore in this context. So let's just
+    // by pass "vectorization" in this case:
+    if (PacketSize == 1) return internal::pset1<PacketType>(coeff(idx));
+
+    Index startRow = Direction == Vertical ? 0 : idx;
+    Index startCol = Direction == Vertical ? idx : 0;
+    Index numRows = Direction == Vertical ? m_arg.rows() : PacketSize;
+    Index numCols = Direction == Vertical ? PacketSize : m_arg.cols();
+
+    PanelType panel(m_arg, startRow, startCol, numRows, numCols);
+    PanelEvaluator panel_eval(panel);
+    PacketType p = Impl::template run<PacketType>(panel_eval, m_functor.binaryFunc(), m_arg.outerSize());
+    return p;
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index i, Index j, Index begin, Index count) const {
+    return packetSegment<LoadMode, PacketType>(Direction == Vertical ? j : i, begin, count);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC PacketType packetSegment(Index idx, Index begin, Index count) const {
+    static constexpr int PanelRows = Direction == Vertical ? ArgType::RowsAtCompileTime : Dynamic;
+    static constexpr int PanelCols = Direction == Vertical ? Dynamic : ArgType::ColsAtCompileTime;
+    using PanelType = Block<const ArgTypeNestedCleaned, PanelRows, PanelCols, true /* InnerPanel */>;
+    using PanelEvaluator = typename internal::redux_evaluator<PanelType>;
+    using BinaryOp = typename MemberOp::BinaryOp;
+    using Impl = internal::packetwise_segment_redux_impl<BinaryOp, PanelEvaluator>;
+
+    Index startRow = Direction == Vertical ? 0 : idx;
+    Index startCol = Direction == Vertical ? idx : 0;
+    Index numRows = Direction == Vertical ? m_arg.rows() : begin + count;
+    Index numCols = Direction == Vertical ? begin + count : m_arg.cols();
+
+    PanelType panel(m_arg, startRow, startCol, numRows, numCols);
+    PanelEvaluator panel_eval(panel);
+    PacketType p = Impl::template run<PacketType>(panel_eval, m_functor.binaryFunc(), m_arg.outerSize(), begin, count);
+    return p;
+  }
+
+ protected:
+  ConstArgTypeNested m_arg;
+  const MemberOp m_functor;
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_PARTIALREDUX_H
diff --git a/inst/include/Eigen/src/Core/PermutationMatrix.h b/inst/include/Eigen/src/Core/PermutationMatrix.h
index 85ffae26..77133545 100644
--- a/inst/include/Eigen/src/Core/PermutationMatrix.h
+++ b/inst/include/Eigen/src/Core/PermutationMatrix.h
@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2009-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -11,711 +11,545 @@
 #ifndef EIGEN_PERMUTATIONMATRIX_H
 #define EIGEN_PERMUTATIONMATRIX_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-template<int RowCol,typename IndicesType,typename MatrixType, typename StorageKind> class PermutedImpl;
-
-/** \class PermutationBase
-  * \ingroup Core_Module
-  *
-  * \brief Base class for permutations
-  *
-  * \param Derived the derived class
-  *
-  * This class is the base class for all expressions representing a permutation matrix,
-  * internally stored as a vector of integers.
-  * The convention followed here is that if \f$ \sigma \f$ is a permutation, the corresponding permutation matrix
-  * \f$ P_\sigma \f$ is such that if \f$ (e_1,\ldots,e_p) \f$ is the canonical basis, we have:
-  *  \f[ P_\sigma(e_i) = e_{\sigma(i)}. \f]
-  * This convention ensures that for any two permutations \f$ \sigma, \tau \f$, we have:
-  *  \f[ P_{\sigma\circ\tau} = P_\sigma P_\tau. \f]
-  *
-  * Permutation matrices are square and invertible.
-  *
-  * Notice that in addition to the member functions and operators listed here, there also are non-member
-  * operator* to multiply any kind of permutation object with any kind of matrix expression (MatrixBase)
-  * on either side.
-  *
-  * \sa class PermutationMatrix, class PermutationWrapper
-  */
+namespace Eigen {
 
 namespace internal {
 
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed=false>
-struct permut_matrix_product_retval;
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed=false>
-struct permut_sparsematrix_product_retval;
-enum PermPermProduct_t {PermPermProduct};
-
-} // end namespace internal
-
-template<typename Derived>
-class PermutationBase : public EigenBase<Derived>
-{
-    typedef internal::traits<Derived> Traits;
-    typedef EigenBase<Derived> Base;
-  public:
-
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    typedef typename Traits::IndicesType IndicesType;
-    enum {
-      Flags = Traits::Flags,
-      CoeffReadCost = Traits::CoeffReadCost,
-      RowsAtCompileTime = Traits::RowsAtCompileTime,
-      ColsAtCompileTime = Traits::ColsAtCompileTime,
-      MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = Traits::MaxColsAtCompileTime
-    };
-    typedef typename Traits::Scalar Scalar;
-    typedef typename Traits::Index Index;
-    typedef Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime,0,MaxRowsAtCompileTime,MaxColsAtCompileTime>
-            DenseMatrixType;
-    typedef PermutationMatrix<IndicesType::SizeAtCompileTime,IndicesType::MaxSizeAtCompileTime,Index>
-            PlainPermutationType;
-    using Base::derived;
-    #endif
-
-    /** Copies the other permutation into *this */
-    template<typename OtherDerived>
-    Derived& operator=(const PermutationBase<OtherDerived>& other)
-    {
-      indices() = other.indices();
-      return derived();
-    }
-
-    /** Assignment from the Transpositions \a tr */
-    template<typename OtherDerived>
-    Derived& operator=(const TranspositionsBase<OtherDerived>& tr)
-    {
-      setIdentity(tr.size());
-      for(Index k=size()-1; k>=0; --k)
-        applyTranspositionOnTheRight(k,tr.coeff(k));
-      return derived();
-    }
+enum PermPermProduct_t { PermPermProduct };
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Derived& operator=(const PermutationBase& other)
-    {
-      indices() = other.indices();
-      return derived();
-    }
-    #endif
+}  // end namespace internal
 
-    /** \returns the number of rows */
-    inline Index rows() const { return Index(indices().size()); }
-
-    /** \returns the number of columns */
-    inline Index cols() const { return Index(indices().size()); }
-
-    /** \returns the size of a side of the respective square matrix, i.e., the number of indices */
-    inline Index size() const { return Index(indices().size()); }
+/** \class PermutationBase
+ * \ingroup Core_Module
+ *
+ * \brief Base class for permutations
+ *
+ * \tparam Derived the derived class
+ *
+ * This class is the base class for all expressions representing a permutation matrix,
+ * internally stored as a vector of integers.
+ * The convention followed here is that if \f$ \sigma \f$ is a permutation, the corresponding permutation matrix
+ * \f$ P_\sigma \f$ is such that if \f$ (e_1,\ldots,e_p) \f$ is the canonical basis, we have:
+ *  \f[ P_\sigma(e_i) = e_{\sigma(i)}. \f]
+ * This convention ensures that for any two permutations \f$ \sigma, \tau \f$, we have:
+ *  \f[ P_{\sigma\circ\tau} = P_\sigma P_\tau. \f]
+ *
+ * Permutation matrices are square and invertible.
+ *
+ * Notice that in addition to the member functions and operators listed here, there also are non-member
+ * operator* to multiply any kind of permutation object with any kind of matrix expression (MatrixBase)
+ * on either side.
+ *
+ * \sa class PermutationMatrix, class PermutationWrapper
+ */
+template <typename Derived>
+class PermutationBase : public EigenBase<Derived> {
+  typedef internal::traits<Derived> Traits;
+  typedef EigenBase<Derived> Base;
+
+ public:
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  typedef typename Traits::IndicesType IndicesType;
+  enum {
+    Flags = Traits::Flags,
+    RowsAtCompileTime = Traits::RowsAtCompileTime,
+    ColsAtCompileTime = Traits::ColsAtCompileTime,
+    MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = Traits::MaxColsAtCompileTime
+  };
+  typedef typename Traits::StorageIndex StorageIndex;
+  typedef Matrix<StorageIndex, RowsAtCompileTime, ColsAtCompileTime, 0, MaxRowsAtCompileTime, MaxColsAtCompileTime>
+      DenseMatrixType;
+  typedef PermutationMatrix<IndicesType::SizeAtCompileTime, IndicesType::MaxSizeAtCompileTime, StorageIndex>
+      PlainPermutationType;
+  typedef PlainPermutationType PlainObject;
+  using Base::derived;
+  typedef Inverse<Derived> InverseReturnType;
+  typedef void Scalar;
+#endif
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename DenseDerived>
-    void evalTo(MatrixBase<DenseDerived>& other) const
-    {
-      other.setZero();
-      for (int i=0; i<rows();++i)
-        other.coeffRef(indices().coeff(i),i) = typename DenseDerived::Scalar(1);
-    }
-    #endif
-
-    /** \returns a Matrix object initialized from this permutation matrix. Notice that it
-      * is inefficient to return this Matrix object by value. For efficiency, favor using
-      * the Matrix constructor taking EigenBase objects.
-      */
-    DenseMatrixType toDenseMatrix() const
-    {
-      return derived();
-    }
+  /** Copies the other permutation into *this */
+  template <typename OtherDerived>
+  Derived& operator=(const PermutationBase<OtherDerived>& other) {
+    indices() = other.indices();
+    return derived();
+  }
 
-    /** const version of indices(). */
-    const IndicesType& indices() const { return derived().indices(); }
-    /** \returns a reference to the stored array representing the permutation. */
-    IndicesType& indices() { return derived().indices(); }
+  /** Assignment from the Transpositions \a tr */
+  template <typename OtherDerived>
+  Derived& operator=(const TranspositionsBase<OtherDerived>& tr) {
+    setIdentity(tr.size());
+    for (Index k = size() - 1; k >= 0; --k) applyTranspositionOnTheRight(k, tr.coeff(k));
+    return derived();
+  }
 
-    /** Resizes to given size.
-      */
-    inline void resize(Index newSize)
-    {
-      indices().resize(newSize);
-    }
+  /** \returns the number of rows */
+  inline EIGEN_DEVICE_FUNC Index rows() const { return Index(indices().size()); }
 
-    /** Sets *this to be the identity permutation matrix */
-    void setIdentity()
-    {
-      for(Index i = 0; i < size(); ++i)
-        indices().coeffRef(i) = i;
-    }
+  /** \returns the number of columns */
+  inline EIGEN_DEVICE_FUNC Index cols() const { return Index(indices().size()); }
 
-    /** Sets *this to be the identity permutation matrix of given size.
-      */
-    void setIdentity(Index newSize)
-    {
-      resize(newSize);
-      setIdentity();
-    }
+  /** \returns the size of a side of the respective square matrix, i.e., the number of indices */
+  inline EIGEN_DEVICE_FUNC Index size() const { return Index(indices().size()); }
 
-    /** Multiplies *this by the transposition \f$(ij)\f$ on the left.
-      *
-      * \returns a reference to *this.
-      *
-      * \warning This is much slower than applyTranspositionOnTheRight(int,int):
-      * this has linear complexity and requires a lot of branching.
-      *
-      * \sa applyTranspositionOnTheRight(int,int)
-      */
-    Derived& applyTranspositionOnTheLeft(Index i, Index j)
-    {
-      eigen_assert(i>=0 && j>=0 && i<size() && j<size());
-      for(Index k = 0; k < size(); ++k)
-      {
-        if(indices().coeff(k) == i) indices().coeffRef(k) = j;
-        else if(indices().coeff(k) == j) indices().coeffRef(k) = i;
-      }
-      return derived();
-    }
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  template <typename DenseDerived>
+  void evalTo(MatrixBase<DenseDerived>& other) const {
+    other.setZero();
+    for (Index i = 0; i < rows(); ++i) other.coeffRef(indices().coeff(i), i) = typename DenseDerived::Scalar(1);
+  }
+#endif
 
-    /** Multiplies *this by the transposition \f$(ij)\f$ on the right.
-      *
-      * \returns a reference to *this.
-      *
-      * This is a fast operation, it only consists in swapping two indices.
-      *
-      * \sa applyTranspositionOnTheLeft(int,int)
-      */
-    Derived& applyTranspositionOnTheRight(Index i, Index j)
-    {
-      eigen_assert(i>=0 && j>=0 && i<size() && j<size());
-      std::swap(indices().coeffRef(i), indices().coeffRef(j));
-      return derived();
+  /** \returns a Matrix object initialized from this permutation matrix. Notice that it
+   * is inefficient to return this Matrix object by value. For efficiency, favor using
+   * the Matrix constructor taking EigenBase objects.
+   */
+  DenseMatrixType toDenseMatrix() const { return derived(); }
+
+  /** \returns the plain matrix representation of the permutation. */
+  DenseMatrixType eval() const { return toDenseMatrix(); }
+
+  /** const version of indices(). */
+  const IndicesType& indices() const { return derived().indices(); }
+  /** \returns a reference to the stored array representing the permutation. */
+  IndicesType& indices() { return derived().indices(); }
+
+  /** Resizes to given size.
+   */
+  inline void resize(Index newSize) { indices().resize(newSize); }
+
+  /** Sets *this to be the identity permutation matrix */
+  void setIdentity() {
+    StorageIndex n = StorageIndex(size());
+    for (StorageIndex i = 0; i < n; ++i) indices().coeffRef(i) = i;
+  }
+
+  /** Sets *this to be the identity permutation matrix of given size.
+   */
+  void setIdentity(Index newSize) {
+    resize(newSize);
+    setIdentity();
+  }
+
+  /** Multiplies *this by the transposition \f$(ij)\f$ on the left.
+   *
+   * \returns a reference to *this.
+   *
+   * \warning This is much slower than applyTranspositionOnTheRight(Index,Index):
+   * this has linear complexity and requires a lot of branching.
+   *
+   * \sa applyTranspositionOnTheRight(Index,Index)
+   */
+  Derived& applyTranspositionOnTheLeft(Index i, Index j) {
+    eigen_assert(i >= 0 && j >= 0 && i < size() && j < size());
+    for (Index k = 0; k < size(); ++k) {
+      if (indices().coeff(k) == i)
+        indices().coeffRef(k) = StorageIndex(j);
+      else if (indices().coeff(k) == j)
+        indices().coeffRef(k) = StorageIndex(i);
     }
+    return derived();
+  }
+
+  /** Multiplies *this by the transposition \f$(ij)\f$ on the right.
+   *
+   * \returns a reference to *this.
+   *
+   * This is a fast operation, it only consists in swapping two indices.
+   *
+   * \sa applyTranspositionOnTheLeft(Index,Index)
+   */
+  Derived& applyTranspositionOnTheRight(Index i, Index j) {
+    eigen_assert(i >= 0 && j >= 0 && i < size() && j < size());
+    std::swap(indices().coeffRef(i), indices().coeffRef(j));
+    return derived();
+  }
+
+  /** \returns the inverse permutation matrix.
+   *
+   * \note \blank \note_try_to_help_rvo
+   */
+  inline InverseReturnType inverse() const { return InverseReturnType(derived()); }
+  /** \returns the transpose permutation matrix.
+   *
+   * \note \blank \note_try_to_help_rvo
+   */
+  inline InverseReturnType transpose() const { return InverseReturnType(derived()); }
+
+  /**** multiplication helpers to hopefully get RVO ****/
 
-    /** \returns the inverse permutation matrix.
-      *
-      * \note \note_try_to_help_rvo
-      */
-    inline Transpose<PermutationBase> inverse() const
-    { return derived(); }
-    /** \returns the tranpose permutation matrix.
-      *
-      * \note \note_try_to_help_rvo
-      */
-    inline Transpose<PermutationBase> transpose() const
-    { return derived(); }
-
-    /**** multiplication helpers to hopefully get RVO ****/
-
-  
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-  protected:
-    template<typename OtherDerived>
-    void assignTranspose(const PermutationBase<OtherDerived>& other)
-    {
-      for (int i=0; i<rows();++i) indices().coeffRef(other.indices().coeff(i)) = i;
-    }
-    template<typename Lhs,typename Rhs>
-    void assignProduct(const Lhs& lhs, const Rhs& rhs)
-    {
-      eigen_assert(lhs.cols() == rhs.rows());
-      for (int i=0; i<rows();++i) indices().coeffRef(i) = lhs.indices().coeff(rhs.indices().coeff(i));
-    }
+ protected:
+  template <typename OtherDerived>
+  void assignTranspose(const PermutationBase<OtherDerived>& other) {
+    for (Index i = 0; i < rows(); ++i) indices().coeffRef(other.indices().coeff(i)) = i;
+  }
+  template <typename Lhs, typename Rhs>
+  void assignProduct(const Lhs& lhs, const Rhs& rhs) {
+    eigen_assert(lhs.cols() == rhs.rows());
+    for (Index i = 0; i < rows(); ++i) indices().coeffRef(i) = lhs.indices().coeff(rhs.indices().coeff(i));
+  }
 #endif
 
-  public:
-
-    /** \returns the product permutation matrix.
-      *
-      * \note \note_try_to_help_rvo
-      */
-    template<typename Other>
-    inline PlainPermutationType operator*(const PermutationBase<Other>& other) const
-    { return PlainPermutationType(internal::PermPermProduct, derived(), other.derived()); }
-
-    /** \returns the product of a permutation with another inverse permutation.
-      *
-      * \note \note_try_to_help_rvo
-      */
-    template<typename Other>
-    inline PlainPermutationType operator*(const Transpose<PermutationBase<Other> >& other) const
-    { return PlainPermutationType(internal::PermPermProduct, *this, other.eval()); }
-
-    /** \returns the product of an inverse permutation with another permutation.
-      *
-      * \note \note_try_to_help_rvo
-      */
-    template<typename Other> friend
-    inline PlainPermutationType operator*(const Transpose<PermutationBase<Other> >& other, const PermutationBase& perm)
-    { return PlainPermutationType(internal::PermPermProduct, other.eval(), perm); }
-    
-    /** \returns the determinant of the permutation matrix, which is either 1 or -1 depending on the parity of the permutation.
-      *
-      * This function is O(\c n) procedure allocating a buffer of \c n booleans.
-      */
-    Index determinant() const
-    {
-      Index res = 1;
-      Index n = size();
-      Matrix<bool,RowsAtCompileTime,1,0,MaxRowsAtCompileTime> mask(n);
-      mask.fill(false);
-      Index r = 0;
-      while(r < n)
-      {
-        // search for the next seed
-        while(r<n && mask[r]) r++;
-        if(r>=n)
-          break;
-        // we got one, let's follow it until we are back to the seed
-        Index k0 = r++;
-        mask.coeffRef(k0) = true;
-        for(Index k=indices().coeff(k0); k!=k0; k=indices().coeff(k))
-        {
-          mask.coeffRef(k) = true;
-          res = -res;
-        }
+ public:
+  /** \returns the product permutation matrix.
+   *
+   * \note \blank \note_try_to_help_rvo
+   */
+  template <typename Other>
+  inline PlainPermutationType operator*(const PermutationBase<Other>& other) const {
+    return PlainPermutationType(internal::PermPermProduct, derived(), other.derived());
+  }
+
+  /** \returns the product of a permutation with another inverse permutation.
+   *
+   * \note \blank \note_try_to_help_rvo
+   */
+  template <typename Other>
+  inline PlainPermutationType operator*(const InverseImpl<Other, PermutationStorage>& other) const {
+    return PlainPermutationType(internal::PermPermProduct, *this, other.eval());
+  }
+
+  /** \returns the product of an inverse permutation with another permutation.
+   *
+   * \note \blank \note_try_to_help_rvo
+   */
+  template <typename Other>
+  friend inline PlainPermutationType operator*(const InverseImpl<Other, PermutationStorage>& other,
+                                               const PermutationBase& perm) {
+    return PlainPermutationType(internal::PermPermProduct, other.eval(), perm);
+  }
+
+  /** \returns the determinant of the permutation matrix, which is either 1 or -1 depending on the parity of the
+   * permutation.
+   *
+   * This function is O(\c n) procedure allocating a buffer of \c n booleans.
+   */
+  Index determinant() const {
+    Index res = 1;
+    Index n = size();
+    Matrix<bool, RowsAtCompileTime, 1, 0, MaxRowsAtCompileTime> mask(n);
+    mask.fill(false);
+    Index r = 0;
+    while (r < n) {
+      // search for the next seed
+      while (r < n && mask[r]) r++;
+      if (r >= n) break;
+      // we got one, let's follow it until we are back to the seed
+      Index k0 = r++;
+      mask.coeffRef(k0) = true;
+      for (Index k = indices().coeff(k0); k != k0; k = indices().coeff(k)) {
+        mask.coeffRef(k) = true;
+        res = -res;
       }
-      return res;
     }
+    return res;
+  }
 
-  protected:
-
+ protected:
 };
 
-/** \class PermutationMatrix
-  * \ingroup Core_Module
-  *
-  * \brief Permutation matrix
-  *
-  * \param SizeAtCompileTime the number of rows/cols, or Dynamic
-  * \param MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
-  * \param IndexType the interger type of the indices
-  *
-  * This class represents a permutation matrix, internally stored as a vector of integers.
-  *
-  * \sa class PermutationBase, class PermutationWrapper, class DiagonalMatrix
-  */
-
 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType> >
- : traits<Matrix<IndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
-{
-  typedef IndexType Index;
-  typedef Matrix<IndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+template <int SizeAtCompileTime, int MaxSizeAtCompileTime, typename StorageIndex_>
+struct traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, StorageIndex_> >
+    : traits<
+          Matrix<StorageIndex_, SizeAtCompileTime, SizeAtCompileTime, 0, MaxSizeAtCompileTime, MaxSizeAtCompileTime> > {
+  typedef PermutationStorage StorageKind;
+  typedef Matrix<StorageIndex_, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+  typedef StorageIndex_ StorageIndex;
+  typedef void Scalar;
 };
-}
+}  // namespace internal
 
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-class PermutationMatrix : public PermutationBase<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType> >
-{
-    typedef PermutationBase<PermutationMatrix> Base;
-    typedef internal::traits<PermutationMatrix> Traits;
-  public:
-
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    typedef typename Traits::IndicesType IndicesType;
-    #endif
-
-    inline PermutationMatrix()
-    {}
-
-    /** Constructs an uninitialized permutation matrix of given size.
-      */
-    inline PermutationMatrix(int size) : m_indices(size)
-    {}
-
-    /** Copy constructor. */
-    template<typename OtherDerived>
-    inline PermutationMatrix(const PermutationBase<OtherDerived>& other)
-      : m_indices(other.indices()) {}
-
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** Standard copy constructor. Defined only to prevent a default copy constructor
-      * from hiding the other templated constructor */
-    inline PermutationMatrix(const PermutationMatrix& other) : m_indices(other.indices()) {}
-    #endif
-
-    /** Generic constructor from expression of the indices. The indices
-      * array has the meaning that the permutations sends each integer i to indices[i].
-      *
-      * \warning It is your responsibility to check that the indices array that you passes actually
-      * describes a permutation, i.e., each value between 0 and n-1 occurs exactly once, where n is the
-      * array's size.
-      */
-    template<typename Other>
-    explicit inline PermutationMatrix(const MatrixBase<Other>& a_indices) : m_indices(a_indices)
-    {}
-
-    /** Convert the Transpositions \a tr to a permutation matrix */
-    template<typename Other>
-    explicit PermutationMatrix(const TranspositionsBase<Other>& tr)
-      : m_indices(tr.size())
-    {
-      *this = tr;
-    }
+/** \class PermutationMatrix
+ * \ingroup Core_Module
+ *
+ * \brief Permutation matrix
+ *
+ * \tparam SizeAtCompileTime the number of rows/cols, or Dynamic
+ * \tparam MaxSizeAtCompileTime the maximum number of rows/cols, or Dynamic. This optional parameter defaults to
+ * SizeAtCompileTime. Most of the time, you should not have to specify it. \tparam StorageIndex_ the integer type of the
+ * indices
+ *
+ * This class represents a permutation matrix, internally stored as a vector of integers.
+ *
+ * \sa class PermutationBase, class PermutationWrapper, class DiagonalMatrix
+ */
+template <int SizeAtCompileTime, int MaxSizeAtCompileTime, typename StorageIndex_>
+class PermutationMatrix
+    : public PermutationBase<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, StorageIndex_> > {
+  typedef PermutationBase<PermutationMatrix> Base;
+  typedef internal::traits<PermutationMatrix> Traits;
+
+ public:
+  typedef const PermutationMatrix& Nested;
 
-    /** Copies the other permutation into *this */
-    template<typename Other>
-    PermutationMatrix& operator=(const PermutationBase<Other>& other)
-    {
-      m_indices = other.indices();
-      return *this;
-    }
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  typedef typename Traits::IndicesType IndicesType;
+  typedef typename Traits::StorageIndex StorageIndex;
+#endif
 
-    /** Assignment from the Transpositions \a tr */
-    template<typename Other>
-    PermutationMatrix& operator=(const TranspositionsBase<Other>& tr)
-    {
-      return Base::operator=(tr.derived());
-    }
+  inline PermutationMatrix() {}
+
+  /** Constructs an uninitialized permutation matrix of given size.
+   */
+  explicit inline PermutationMatrix(Index size) : m_indices(size) {
+    eigen_internal_assert(size <= NumTraits<StorageIndex>::highest());
+  }
+
+  /** Copy constructor. */
+  template <typename OtherDerived>
+  inline PermutationMatrix(const PermutationBase<OtherDerived>& other) : m_indices(other.indices()) {}
+
+  /** Generic constructor from expression of the indices. The indices
+   * array has the meaning that the permutations sends each integer i to indices[i].
+   *
+   * \warning It is your responsibility to check that the indices array that you passes actually
+   * describes a permutation, i.e., each value between 0 and n-1 occurs exactly once, where n is the
+   * array's size.
+   */
+  template <typename Other>
+  explicit inline PermutationMatrix(const MatrixBase<Other>& indices) : m_indices(indices) {}
+
+  /** Convert the Transpositions \a tr to a permutation matrix */
+  template <typename Other>
+  explicit PermutationMatrix(const TranspositionsBase<Other>& tr) : m_indices(tr.size()) {
+    *this = tr;
+  }
+
+  /** Copies the other permutation into *this */
+  template <typename Other>
+  PermutationMatrix& operator=(const PermutationBase<Other>& other) {
+    m_indices = other.indices();
+    return *this;
+  }
+
+  /** Assignment from the Transpositions \a tr */
+  template <typename Other>
+  PermutationMatrix& operator=(const TranspositionsBase<Other>& tr) {
+    return Base::operator=(tr.derived());
+  }
+
+  /** const version of indices(). */
+  const IndicesType& indices() const { return m_indices; }
+  /** \returns a reference to the stored array representing the permutation. */
+  IndicesType& indices() { return m_indices; }
+
+  /**** multiplication helpers to hopefully get RVO ****/
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    PermutationMatrix& operator=(const PermutationMatrix& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  template <typename Other>
+  PermutationMatrix(const InverseImpl<Other, PermutationStorage>& other)
+      : m_indices(other.derived().nestedExpression().size()) {
+    eigen_internal_assert(m_indices.size() <= NumTraits<StorageIndex>::highest());
+    StorageIndex end = StorageIndex(m_indices.size());
+    for (StorageIndex i = 0; i < end; ++i)
+      m_indices.coeffRef(other.derived().nestedExpression().indices().coeff(i)) = i;
+  }
+  template <typename Lhs, typename Rhs>
+  PermutationMatrix(internal::PermPermProduct_t, const Lhs& lhs, const Rhs& rhs) : m_indices(lhs.indices().size()) {
+    Base::assignProduct(lhs, rhs);
+  }
+#endif
 
-    /** const version of indices(). */
-    const IndicesType& indices() const { return m_indices; }
-    /** \returns a reference to the stored array representing the permutation. */
-    IndicesType& indices() { return m_indices; }
+ protected:
+  IndicesType m_indices;
+};
 
+namespace internal {
+template <int SizeAtCompileTime, int MaxSizeAtCompileTime, typename StorageIndex_, int PacketAccess_>
+struct traits<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, StorageIndex_>, PacketAccess_> >
+    : traits<
+          Matrix<StorageIndex_, SizeAtCompileTime, SizeAtCompileTime, 0, MaxSizeAtCompileTime, MaxSizeAtCompileTime> > {
+  typedef PermutationStorage StorageKind;
+  typedef Map<const Matrix<StorageIndex_, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1>, PacketAccess_> IndicesType;
+  typedef StorageIndex_ StorageIndex;
+  typedef void Scalar;
+};
+}  // namespace internal
 
-    /**** multiplication helpers to hopefully get RVO ****/
+template <int SizeAtCompileTime, int MaxSizeAtCompileTime, typename StorageIndex_, int PacketAccess_>
+class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, StorageIndex_>, PacketAccess_>
+    : public PermutationBase<
+          Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, StorageIndex_>, PacketAccess_> > {
+  typedef PermutationBase<Map> Base;
+  typedef internal::traits<Map> Traits;
 
+ public:
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename Other>
-    PermutationMatrix(const Transpose<PermutationBase<Other> >& other)
-      : m_indices(other.nestedPermutation().size())
-    {
-      for (int i=0; i<m_indices.size();++i) m_indices.coeffRef(other.nestedPermutation().indices().coeff(i)) = i;
-    }
-    template<typename Lhs,typename Rhs>
-    PermutationMatrix(internal::PermPermProduct_t, const Lhs& lhs, const Rhs& rhs)
-      : m_indices(lhs.indices().size())
-    {
-      Base::assignProduct(lhs,rhs);
-    }
+  typedef typename Traits::IndicesType IndicesType;
+  typedef typename IndicesType::Scalar StorageIndex;
 #endif
 
-  protected:
+  inline Map(const StorageIndex* indicesPtr) : m_indices(indicesPtr) {}
 
-    IndicesType m_indices;
-};
+  inline Map(const StorageIndex* indicesPtr, Index size) : m_indices(indicesPtr, size) {}
 
+  /** Copies the other permutation into *this */
+  template <typename Other>
+  Map& operator=(const PermutationBase<Other>& other) {
+    return Base::operator=(other.derived());
+  }
 
-namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
-struct traits<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess> >
- : traits<Matrix<IndexType,SizeAtCompileTime,SizeAtCompileTime,0,MaxSizeAtCompileTime,MaxSizeAtCompileTime> >
-{
-  typedef IndexType Index;
-  typedef Map<const Matrix<IndexType, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1>, _PacketAccess> IndicesType;
-};
-}
-
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
-class Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess>
-  : public PermutationBase<Map<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, IndexType>,_PacketAccess> >
-{
-    typedef PermutationBase<Map> Base;
-    typedef internal::traits<Map> Traits;
-  public:
-
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
-    #endif
-
-    inline Map(const Index* indicesPtr)
-      : m_indices(indicesPtr)
-    {}
-
-    inline Map(const Index* indicesPtr, Index size)
-      : m_indices(indicesPtr,size)
-    {}
-
-    /** Copies the other permutation into *this */
-    template<typename Other>
-    Map& operator=(const PermutationBase<Other>& other)
-    { return Base::operator=(other.derived()); }
-
-    /** Assignment from the Transpositions \a tr */
-    template<typename Other>
-    Map& operator=(const TranspositionsBase<Other>& tr)
-    { return Base::operator=(tr.derived()); }
-
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Map& operator=(const Map& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
+  /** Assignment from the Transpositions \a tr */
+  template <typename Other>
+  Map& operator=(const TranspositionsBase<Other>& tr) {
+    return Base::operator=(tr.derived());
+  }
 
-    /** const version of indices(). */
-    const IndicesType& indices() const { return m_indices; }
-    /** \returns a reference to the stored array representing the permutation. */
-    IndicesType& indices() { return m_indices; }
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  /** This is a special case of the templated operator=. Its purpose is to
+   * prevent a default operator= from hiding the templated operator=.
+   */
+  Map& operator=(const Map& other) {
+    m_indices = other.m_indices;
+    return *this;
+  }
+#endif
 
-  protected:
+  /** const version of indices(). */
+  const IndicesType& indices() const { return m_indices; }
+  /** \returns a reference to the stored array representing the permutation. */
+  IndicesType& indices() { return m_indices; }
 
-    IndicesType m_indices;
+ protected:
+  IndicesType m_indices;
 };
 
-/** \class PermutationWrapper
-  * \ingroup Core_Module
-  *
-  * \brief Class to view a vector of integers as a permutation matrix
-  *
-  * \param _IndicesType the type of the vector of integer (can be any compatible expression)
-  *
-  * This class allows to view any vector expression of integers as a permutation matrix.
-  *
-  * \sa class PermutationBase, class PermutationMatrix
-  */
-
-struct PermutationStorage {};
-
-template<typename _IndicesType> class TranspositionsWrapper;
+template <typename IndicesType_>
+class TranspositionsWrapper;
 namespace internal {
-template<typename _IndicesType>
-struct traits<PermutationWrapper<_IndicesType> >
-{
+template <typename IndicesType_>
+struct traits<PermutationWrapper<IndicesType_> > {
   typedef PermutationStorage StorageKind;
-  typedef typename _IndicesType::Scalar Scalar;
-  typedef typename _IndicesType::Scalar Index;
-  typedef _IndicesType IndicesType;
+  typedef void Scalar;
+  typedef typename IndicesType_::Scalar StorageIndex;
+  typedef IndicesType_ IndicesType;
   enum {
-    RowsAtCompileTime = _IndicesType::SizeAtCompileTime,
-    ColsAtCompileTime = _IndicesType::SizeAtCompileTime,
-    MaxRowsAtCompileTime = IndicesType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = IndicesType::MaxColsAtCompileTime,
-    Flags = 0,
-    CoeffReadCost = _IndicesType::CoeffReadCost
+    RowsAtCompileTime = IndicesType_::SizeAtCompileTime,
+    ColsAtCompileTime = IndicesType_::SizeAtCompileTime,
+    MaxRowsAtCompileTime = IndicesType::MaxSizeAtCompileTime,
+    MaxColsAtCompileTime = IndicesType::MaxSizeAtCompileTime,
+    Flags = 0
   };
 };
-}
-
-template<typename _IndicesType>
-class PermutationWrapper : public PermutationBase<PermutationWrapper<_IndicesType> >
-{
-    typedef PermutationBase<PermutationWrapper> Base;
-    typedef internal::traits<PermutationWrapper> Traits;
-  public:
+}  // namespace internal
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    typedef typename Traits::IndicesType IndicesType;
-    #endif
-
-    inline PermutationWrapper(const IndicesType& a_indices)
-      : m_indices(a_indices)
-    {}
+/** \class PermutationWrapper
+ * \ingroup Core_Module
+ *
+ * \brief Class to view a vector of integers as a permutation matrix
+ *
+ * \tparam IndicesType_ the type of the vector of integer (can be any compatible expression)
+ *
+ * This class allows to view any vector expression of integers as a permutation matrix.
+ *
+ * \sa class PermutationBase, class PermutationMatrix
+ */
+template <typename IndicesType_>
+class PermutationWrapper : public PermutationBase<PermutationWrapper<IndicesType_> > {
+  typedef PermutationBase<PermutationWrapper> Base;
+  typedef internal::traits<PermutationWrapper> Traits;
+
+ public:
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  typedef typename Traits::IndicesType IndicesType;
+#endif
 
-    /** const version of indices(). */
-    const typename internal::remove_all<typename IndicesType::Nested>::type&
-    indices() const { return m_indices; }
+  inline PermutationWrapper(const IndicesType& indices) : m_indices(indices) {}
 
-  protected:
+  /** const version of indices(). */
+  const internal::remove_all_t<typename IndicesType::Nested>& indices() const { return m_indices; }
 
-    typename IndicesType::Nested m_indices;
+ protected:
+  typename IndicesType::Nested m_indices;
 };
 
 /** \returns the matrix with the permutation applied to the columns.
-  */
-template<typename Derived, typename PermutationDerived>
-inline const internal::permut_matrix_product_retval<PermutationDerived, Derived, OnTheRight>
-operator*(const MatrixBase<Derived>& matrix,
-          const PermutationBase<PermutationDerived> &permutation)
-{
-  return internal::permut_matrix_product_retval
-           <PermutationDerived, Derived, OnTheRight>
-           (permutation.derived(), matrix.derived());
+ */
+template <typename MatrixDerived, typename PermutationDerived>
+EIGEN_DEVICE_FUNC const Product<MatrixDerived, PermutationDerived, DefaultProduct> operator*(
+    const MatrixBase<MatrixDerived>& matrix, const PermutationBase<PermutationDerived>& permutation) {
+  return Product<MatrixDerived, PermutationDerived, DefaultProduct>(matrix.derived(), permutation.derived());
 }
 
 /** \returns the matrix with the permutation applied to the rows.
-  */
-template<typename Derived, typename PermutationDerived>
-inline const internal::permut_matrix_product_retval
-               <PermutationDerived, Derived, OnTheLeft>
-operator*(const PermutationBase<PermutationDerived> &permutation,
-          const MatrixBase<Derived>& matrix)
-{
-  return internal::permut_matrix_product_retval
-           <PermutationDerived, Derived, OnTheLeft>
-           (permutation.derived(), matrix.derived());
+ */
+template <typename PermutationDerived, typename MatrixDerived>
+EIGEN_DEVICE_FUNC const Product<PermutationDerived, MatrixDerived, DefaultProduct> operator*(
+    const PermutationBase<PermutationDerived>& permutation, const MatrixBase<MatrixDerived>& matrix) {
+  return Product<PermutationDerived, MatrixDerived, DefaultProduct>(permutation.derived(), matrix.derived());
 }
 
-namespace internal {
+template <typename PermutationType>
+class InverseImpl<PermutationType, PermutationStorage> : public EigenBase<Inverse<PermutationType> > {
+  typedef typename PermutationType::PlainPermutationType PlainPermutationType;
+  typedef internal::traits<PermutationType> PermTraits;
 
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
-struct traits<permut_matrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
-{
-  typedef typename MatrixType::PlainObject ReturnType;
-};
+ protected:
+  InverseImpl() {}
 
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
-struct permut_matrix_product_retval
- : public ReturnByValue<permut_matrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
-{
-    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-    typedef typename MatrixType::Index Index;
-
-    permut_matrix_product_retval(const PermutationType& perm, const MatrixType& matrix)
-      : m_permutation(perm), m_matrix(matrix)
-    {}
-
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-
-    template<typename Dest> inline void evalTo(Dest& dst) const
-    {
-      const Index n = Side==OnTheLeft ? rows() : cols();
-      // FIXME we need an is_same for expression that is not sensitive to constness. For instance
-      // is_same_xpr<Block<const Matrix>, Block<Matrix> >::value should be true.
-      if(    is_same<MatrixTypeNestedCleaned,Dest>::value
-          && blas_traits<MatrixTypeNestedCleaned>::HasUsableDirectAccess
-          && blas_traits<Dest>::HasUsableDirectAccess
-          && extract_data(dst) == extract_data(m_matrix))
-      {
-        // apply the permutation inplace
-        Matrix<bool,PermutationType::RowsAtCompileTime,1,0,PermutationType::MaxRowsAtCompileTime> mask(m_permutation.size());
-        mask.fill(false);
-        Index r = 0;
-        while(r < m_permutation.size())
-        {
-          // search for the next seed
-          while(r<m_permutation.size() && mask[r]) r++;
-          if(r>=m_permutation.size())
-            break;
-          // we got one, let's follow it until we are back to the seed
-          Index k0 = r++;
-          Index kPrev = k0;
-          mask.coeffRef(k0) = true;
-          for(Index k=m_permutation.indices().coeff(k0); k!=k0; k=m_permutation.indices().coeff(k))
-          {
-                  Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>(dst, k)
-            .swap(Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
-                       (dst,((Side==OnTheLeft) ^ Transposed) ? k0 : kPrev));
-
-            mask.coeffRef(k) = true;
-            kPrev = k;
-          }
-        }
-      }
-      else
-      {
-        for(int i = 0; i < n; ++i)
-        {
-          Block<Dest, Side==OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side==OnTheRight ? 1 : Dest::ColsAtCompileTime>
-               (dst, ((Side==OnTheLeft) ^ Transposed) ? m_permutation.indices().coeff(i) : i)
-
-          =
-
-          Block<const MatrixTypeNestedCleaned,Side==OnTheLeft ? 1 : MatrixType::RowsAtCompileTime,Side==OnTheRight ? 1 : MatrixType::ColsAtCompileTime>
-               (m_matrix, ((Side==OnTheRight) ^ Transposed) ? m_permutation.indices().coeff(i) : i);
-        }
-      }
-    }
-
-  protected:
-    const PermutationType& m_permutation;
-    typename MatrixType::Nested m_matrix;
-};
-
-/* Template partial specialization for transposed/inverse permutations */
-
-template<typename Derived>
-struct traits<Transpose<PermutationBase<Derived> > >
- : traits<Derived>
-{};
-
-} // end namespace internal
-
-template<typename Derived>
-class Transpose<PermutationBase<Derived> >
-  : public EigenBase<Transpose<PermutationBase<Derived> > >
-{
-    typedef Derived PermutationType;
-    typedef typename PermutationType::IndicesType IndicesType;
-    typedef typename PermutationType::PlainPermutationType PlainPermutationType;
-  public:
-
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    typedef internal::traits<PermutationType> Traits;
-    typedef typename Derived::DenseMatrixType DenseMatrixType;
-    enum {
-      Flags = Traits::Flags,
-      CoeffReadCost = Traits::CoeffReadCost,
-      RowsAtCompileTime = Traits::RowsAtCompileTime,
-      ColsAtCompileTime = Traits::ColsAtCompileTime,
-      MaxRowsAtCompileTime = Traits::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = Traits::MaxColsAtCompileTime
-    };
-    typedef typename Traits::Scalar Scalar;
-    #endif
-
-    Transpose(const PermutationType& p) : m_permutation(p) {}
-
-    inline int rows() const { return m_permutation.rows(); }
-    inline int cols() const { return m_permutation.cols(); }
-
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename DenseDerived>
-    void evalTo(MatrixBase<DenseDerived>& other) const
-    {
-      other.setZero();
-      for (int i=0; i<rows();++i)
-        other.coeffRef(i, m_permutation.indices().coeff(i)) = typename DenseDerived::Scalar(1);
-    }
-    #endif
+ public:
+  typedef Inverse<PermutationType> InverseType;
+  using EigenBase<Inverse<PermutationType> >::derived;
 
-    /** \return the equivalent permutation matrix */
-    PlainPermutationType eval() const { return *this; }
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  typedef typename PermutationType::DenseMatrixType DenseMatrixType;
+  enum {
+    RowsAtCompileTime = PermTraits::RowsAtCompileTime,
+    ColsAtCompileTime = PermTraits::ColsAtCompileTime,
+    MaxRowsAtCompileTime = PermTraits::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = PermTraits::MaxColsAtCompileTime
+  };
+#endif
 
-    DenseMatrixType toDenseMatrix() const { return *this; }
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  template <typename DenseDerived>
+  void evalTo(MatrixBase<DenseDerived>& other) const {
+    other.setZero();
+    for (Index i = 0; i < derived().rows(); ++i)
+      other.coeffRef(i, derived().nestedExpression().indices().coeff(i)) = typename DenseDerived::Scalar(1);
+  }
+#endif
 
-    /** \returns the matrix with the inverse permutation applied to the columns.
-      */
-    template<typename OtherDerived> friend
-    inline const internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheRight, true>
-    operator*(const MatrixBase<OtherDerived>& matrix, const Transpose& trPerm)
-    {
-      return internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheRight, true>(trPerm.m_permutation, matrix.derived());
-    }
+  /** \return the equivalent permutation matrix */
+  PlainPermutationType eval() const { return derived(); }
+
+  DenseMatrixType toDenseMatrix() const { return derived(); }
+
+  /** \returns the matrix with the inverse permutation applied to the columns.
+   */
+  template <typename OtherDerived>
+  friend const Product<OtherDerived, InverseType, DefaultProduct> operator*(const MatrixBase<OtherDerived>& matrix,
+                                                                            const InverseType& trPerm) {
+    return Product<OtherDerived, InverseType, DefaultProduct>(matrix.derived(), trPerm.derived());
+  }
+
+  /** \returns the matrix with the inverse permutation applied to the rows.
+   */
+  template <typename OtherDerived>
+  const Product<InverseType, OtherDerived, DefaultProduct> operator*(const MatrixBase<OtherDerived>& matrix) const {
+    return Product<InverseType, OtherDerived, DefaultProduct>(derived(), matrix.derived());
+  }
+};
 
-    /** \returns the matrix with the inverse permutation applied to the rows.
-      */
-    template<typename OtherDerived>
-    inline const internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheLeft, true>
-    operator*(const MatrixBase<OtherDerived>& matrix) const
-    {
-      return internal::permut_matrix_product_retval<PermutationType, OtherDerived, OnTheLeft, true>(m_permutation, matrix.derived());
-    }
+template <typename Derived>
+const PermutationWrapper<const Derived> MatrixBase<Derived>::asPermutation() const {
+  return derived();
+}
 
-    const PermutationType& nestedPermutation() const { return m_permutation; }
+namespace internal {
 
-  protected:
-    const PermutationType& m_permutation;
+template <>
+struct AssignmentKind<DenseShape, PermutationShape> {
+  typedef EigenBase2EigenBase Kind;
 };
 
-template<typename Derived>
-const PermutationWrapper<const Derived> MatrixBase<Derived>::asPermutation() const
-{
-  return derived();
-}
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_PERMUTATIONMATRIX_H
+#endif  // EIGEN_PERMUTATIONMATRIX_H
diff --git a/inst/include/Eigen/src/Core/PlainObjectBase.h b/inst/include/Eigen/src/Core/PlainObjectBase.h
index a4e4af4a..a78305e2 100644
--- a/inst/include/Eigen/src/Core/PlainObjectBase.h
+++ b/inst/include/Eigen/src/Core/PlainObjectBase.h
@@ -12,727 +12,922 @@
 #define EIGEN_DENSESTORAGEBASE_H
 
 #if defined(EIGEN_INITIALIZE_MATRICES_BY_ZERO)
-# define EIGEN_INITIALIZE_COEFFS
-# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(int i=0;i<base().size();++i) coeffRef(i)=Scalar(0);
+#define EIGEN_INITIALIZE_COEFFS
+#define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED \
+  for (Index i = 0; i < base().size(); ++i) coeffRef(i) = Scalar(0);
 #elif defined(EIGEN_INITIALIZE_MATRICES_BY_NAN)
-# define EIGEN_INITIALIZE_COEFFS
-# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED for(int i=0;i<base().size();++i) coeffRef(i)=std::numeric_limits<Scalar>::quiet_NaN();
+#define EIGEN_INITIALIZE_COEFFS
+#define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED \
+  for (Index i = 0; i < base().size(); ++i) coeffRef(i) = std::numeric_limits<Scalar>::quiet_NaN();
 #else
-# undef EIGEN_INITIALIZE_COEFFS
-# define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+#undef EIGEN_INITIALIZE_COEFFS
+#define EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
 #endif
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
 
-template<int MaxSizeAtCompileTime> struct check_rows_cols_for_overflow {
-  template<typename Index>
-  static EIGEN_ALWAYS_INLINE void run(Index, Index)
-  {
+#ifndef EIGEN_NO_DEBUG
+template <int MaxSizeAtCompileTime, int MaxRowsAtCompileTime, int MaxColsAtCompileTime>
+struct check_rows_cols_for_overflow {
+  EIGEN_STATIC_ASSERT(MaxRowsAtCompileTime* MaxColsAtCompileTime == MaxSizeAtCompileTime,
+                      YOU MADE A PROGRAMMING MISTAKE)
+  template <typename Index>
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE constexpr void run(Index, Index) {}
+};
+
+template <int MaxRowsAtCompileTime>
+struct check_rows_cols_for_overflow<Dynamic, MaxRowsAtCompileTime, Dynamic> {
+  template <typename Index>
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE constexpr void run(Index, Index cols) {
+    constexpr Index MaxIndex = NumTraits<Index>::highest();
+    bool error = cols > (MaxIndex / MaxRowsAtCompileTime);
+    if (error) throw_std_bad_alloc();
+  }
+};
+
+template <int MaxColsAtCompileTime>
+struct check_rows_cols_for_overflow<Dynamic, Dynamic, MaxColsAtCompileTime> {
+  template <typename Index>
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE constexpr void run(Index rows, Index) {
+    constexpr Index MaxIndex = NumTraits<Index>::highest();
+    bool error = rows > (MaxIndex / MaxColsAtCompileTime);
+    if (error) throw_std_bad_alloc();
   }
 };
 
-template<> struct check_rows_cols_for_overflow<Dynamic> {
-  template<typename Index>
-  static EIGEN_ALWAYS_INLINE void run(Index rows, Index cols)
-  {
-    // http://hg.mozilla.org/mozilla-central/file/6c8a909977d3/xpcom/ds/CheckedInt.h#l242
-    // we assume Index is signed
-    Index max_index = (size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed
-    bool error = (rows == 0 || cols == 0) ? false
-               : (rows > max_index / cols);
-    if (error)
-      throw_std_bad_alloc();
+template <>
+struct check_rows_cols_for_overflow<Dynamic, Dynamic, Dynamic> {
+  template <typename Index>
+  EIGEN_DEVICE_FUNC static EIGEN_ALWAYS_INLINE constexpr void run(Index rows, Index cols) {
+    constexpr Index MaxIndex = NumTraits<Index>::highest();
+    bool error = cols == 0 ? false : (rows > (MaxIndex / cols));
+    if (error) throw_std_bad_alloc();
   }
 };
+#endif
 
-template <typename Derived,
-          typename OtherDerived = Derived,
+template <typename Derived, typename OtherDerived = Derived,
           bool IsVector = bool(Derived::IsVectorAtCompileTime) && bool(OtherDerived::IsVectorAtCompileTime)>
 struct conservative_resize_like_impl;
 
-template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers> struct matrix_swap_impl;
+template <typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
+struct matrix_swap_impl;
 
-} // end namespace internal
+}  // end namespace internal
 
 /** \class PlainObjectBase
-  * \brief %Dense storage base class for matrices and arrays.
-  *
-  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_PLAINOBJECTBASE_PLUGIN.
-  *
-  * \sa \ref TopicClassHierarchy
-  */
-#ifdef EIGEN_PARSED_BY_DOXYGEN
-namespace internal {
+ * \ingroup Core_Module
+ * \brief %Dense storage base class for matrices and arrays.
+ *
+ * This class can be extended with the help of the plugin mechanism described on the page
+ * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_PLAINOBJECTBASE_PLUGIN.
+ *
+ * \tparam Derived is the derived type, e.g., a Matrix or Array
+ *
+ * \sa \ref TopicClassHierarchy
+ */
+template <typename Derived>
+class PlainObjectBase : public internal::dense_xpr_base<Derived>::type {
+ public:
+  enum { Options = internal::traits<Derived>::Options };
+  typedef typename internal::dense_xpr_base<Derived>::type Base;
+
+  typedef typename internal::traits<Derived>::StorageKind StorageKind;
+  typedef typename internal::traits<Derived>::Scalar Scalar;
+
+  typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Derived DenseType;
+
+  using Base::ColsAtCompileTime;
+  using Base::Flags;
+  using Base::IsVectorAtCompileTime;
+  using Base::MaxColsAtCompileTime;
+  using Base::MaxRowsAtCompileTime;
+  using Base::MaxSizeAtCompileTime;
+  using Base::RowsAtCompileTime;
+  using Base::SizeAtCompileTime;
+
+  typedef Eigen::Map<Derived, Unaligned> MapType;
+  typedef const Eigen::Map<const Derived, Unaligned> ConstMapType;
+  typedef Eigen::Map<Derived, AlignedMax> AlignedMapType;
+  typedef const Eigen::Map<const Derived, AlignedMax> ConstAlignedMapType;
+  template <typename StrideType>
+  struct StridedMapType {
+    typedef Eigen::Map<Derived, Unaligned, StrideType> type;
+  };
+  template <typename StrideType>
+  struct StridedConstMapType {
+    typedef Eigen::Map<const Derived, Unaligned, StrideType> type;
+  };
+  template <typename StrideType>
+  struct StridedAlignedMapType {
+    typedef Eigen::Map<Derived, AlignedMax, StrideType> type;
+  };
+  template <typename StrideType>
+  struct StridedConstAlignedMapType {
+    typedef Eigen::Map<const Derived, AlignedMax, StrideType> type;
+  };
+
+ protected:
+  DenseStorage<Scalar, Base::MaxSizeAtCompileTime, Base::RowsAtCompileTime, Base::ColsAtCompileTime, Options> m_storage;
+
+ public:
+  enum { NeedsToAlign = (SizeAtCompileTime != Dynamic) && (internal::traits<Derived>::Alignment > 0) };
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
+
+  EIGEN_STATIC_ASSERT(internal::check_implication(MaxRowsAtCompileTime == 1 && MaxColsAtCompileTime != 1,
+                                                  (int(Options) & RowMajor) == RowMajor),
+                      INVALID_MATRIX_TEMPLATE_PARAMETERS)
+  EIGEN_STATIC_ASSERT(internal::check_implication(MaxColsAtCompileTime == 1 && MaxRowsAtCompileTime != 1,
+                                                  (int(Options) & RowMajor) == 0),
+                      INVALID_MATRIX_TEMPLATE_PARAMETERS)
+  EIGEN_STATIC_ASSERT((RowsAtCompileTime == Dynamic) || (RowsAtCompileTime >= 0), INVALID_MATRIX_TEMPLATE_PARAMETERS)
+  EIGEN_STATIC_ASSERT((ColsAtCompileTime == Dynamic) || (ColsAtCompileTime >= 0), INVALID_MATRIX_TEMPLATE_PARAMETERS)
+  EIGEN_STATIC_ASSERT((MaxRowsAtCompileTime == Dynamic) || (MaxRowsAtCompileTime >= 0),
+                      INVALID_MATRIX_TEMPLATE_PARAMETERS)
+  EIGEN_STATIC_ASSERT((MaxColsAtCompileTime == Dynamic) || (MaxColsAtCompileTime >= 0),
+                      INVALID_MATRIX_TEMPLATE_PARAMETERS)
+  EIGEN_STATIC_ASSERT((MaxRowsAtCompileTime == RowsAtCompileTime || RowsAtCompileTime == Dynamic),
+                      INVALID_MATRIX_TEMPLATE_PARAMETERS)
+  EIGEN_STATIC_ASSERT((MaxColsAtCompileTime == ColsAtCompileTime || ColsAtCompileTime == Dynamic),
+                      INVALID_MATRIX_TEMPLATE_PARAMETERS)
+  EIGEN_STATIC_ASSERT(((Options & (DontAlign | RowMajor)) == Options), INVALID_MATRIX_TEMPLATE_PARAMETERS)
+
+  EIGEN_DEVICE_FUNC Base& base() { return *static_cast<Base*>(this); }
+  EIGEN_DEVICE_FUNC const Base& base() const { return *static_cast<const Base*>(this); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_storage.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_storage.cols(); }
+
+  /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index,Index) const
+   * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+   *
+   * See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const for details. */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar& coeff(Index rowId, Index colId) const {
+    if (Flags & RowMajorBit)
+      return m_storage.data()[colId + rowId * m_storage.cols()];
+    else  // column-major
+      return m_storage.data()[rowId + colId * m_storage.rows()];
+  }
+
+  /** This is an overloaded version of DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const
+   * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+   *
+   * See DenseCoeffsBase<Derived,ReadOnlyAccessors>::coeff(Index) const for details. */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar& coeff(Index index) const {
+    return m_storage.data()[index];
+  }
+
+  /** This is an overloaded version of DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index) const
+   * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+   *
+   * See DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index,Index) const for details. */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index rowId, Index colId) {
+    if (Flags & RowMajorBit)
+      return m_storage.data()[colId + rowId * m_storage.cols()];
+    else  // column-major
+      return m_storage.data()[rowId + colId * m_storage.rows()];
+  }
+
+  /** This is an overloaded version of DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index) const
+   * provided to by-pass the creation of an evaluator of the expression, thus saving compilation efforts.
+   *
+   * See DenseCoeffsBase<Derived,WriteAccessors>::coeffRef(Index) const for details. */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Scalar& coeffRef(Index index) { return m_storage.data()[index]; }
+
+  /** This is the const version of coeffRef(Index,Index) which is thus synonym of coeff(Index,Index).
+   * It is provided for convenience. */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar& coeffRef(Index rowId, Index colId) const {
+    if (Flags & RowMajorBit)
+      return m_storage.data()[colId + rowId * m_storage.cols()];
+    else  // column-major
+      return m_storage.data()[rowId + colId * m_storage.rows()];
+  }
+
+  /** This is the const version of coeffRef(Index) which is thus synonym of coeff(Index).
+   * It is provided for convenience. */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar& coeffRef(Index index) const {
+    return m_storage.data()[index];
+  }
+
+  /** \internal */
+  template <int LoadMode>
+  EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const {
+    return internal::ploadt<PacketScalar, LoadMode>(
+        m_storage.data() + (Flags & RowMajorBit ? colId + rowId * m_storage.cols() : rowId + colId * m_storage.rows()));
+  }
+
+  /** \internal */
+  template <int LoadMode>
+  EIGEN_STRONG_INLINE PacketScalar packet(Index index) const {
+    return internal::ploadt<PacketScalar, LoadMode>(m_storage.data() + index);
+  }
+
+  /** \internal */
+  template <int StoreMode>
+  EIGEN_STRONG_INLINE void writePacket(Index rowId, Index colId, const PacketScalar& val) {
+    internal::pstoret<Scalar, PacketScalar, StoreMode>(
+        m_storage.data() + (Flags & RowMajorBit ? colId + rowId * m_storage.cols() : rowId + colId * m_storage.rows()),
+        val);
+  }
 
-// this is a warkaround to doxygen not being able to understand the inheritence logic
-// when it is hidden by the dense_xpr_base helper struct.
-template<typename Derived> struct dense_xpr_base_dispatcher_for_doxygen;// : public MatrixBase<Derived> {};
-/** This class is just a workaround for Doxygen and it does not not actually exist. */
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct dense_xpr_base_dispatcher_for_doxygen<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-    : public MatrixBase<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
-/** This class is just a workaround for Doxygen and it does not not actually exist. */
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct dense_xpr_base_dispatcher_for_doxygen<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> >
-    : public ArrayBase<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols> > {};
-
-} // namespace internal
-
-template<typename Derived>
-class PlainObjectBase : public internal::dense_xpr_base_dispatcher_for_doxygen<Derived>
+  /** \internal */
+  template <int StoreMode>
+  EIGEN_STRONG_INLINE void writePacket(Index index, const PacketScalar& val) {
+    internal::pstoret<Scalar, PacketScalar, StoreMode>(m_storage.data() + index, val);
+  }
+
+  /** \returns a const pointer to the data array of this matrix */
+  EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_storage.data(); }
+
+  /** \returns a pointer to the data array of this matrix */
+  EIGEN_DEVICE_FUNC constexpr Scalar* data() { return m_storage.data(); }
+
+  /** Resizes \c *this to a \a rows x \a cols matrix.
+   *
+   * This method is intended for dynamic-size matrices, although it is legal to call it on any
+   * matrix as long as fixed dimensions are left unchanged. If you only want to change the number
+   * of rows and/or of columns, you can use resize(NoChange_t, Index), resize(Index, NoChange_t).
+   *
+   * If the current number of coefficients of \c *this exactly matches the
+   * product \a rows * \a cols, then no memory allocation is performed and
+   * the current values are left unchanged. In all other cases, including
+   * shrinking, the data is reallocated and all previous values are lost.
+   *
+   * Example: \include Matrix_resize_int_int.cpp
+   * Output: \verbinclude Matrix_resize_int_int.out
+   *
+   * \sa resize(Index) for vectors, resize(NoChange_t, Index), resize(Index, NoChange_t)
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void resize(Index rows, Index cols) {
+    eigen_assert(internal::check_implication(RowsAtCompileTime != Dynamic, rows == RowsAtCompileTime) &&
+                 internal::check_implication(ColsAtCompileTime != Dynamic, cols == ColsAtCompileTime) &&
+                 internal::check_implication(RowsAtCompileTime == Dynamic && MaxRowsAtCompileTime != Dynamic,
+                                             rows <= MaxRowsAtCompileTime) &&
+                 internal::check_implication(ColsAtCompileTime == Dynamic && MaxColsAtCompileTime != Dynamic,
+                                             cols <= MaxColsAtCompileTime) &&
+                 rows >= 0 && cols >= 0 && "Invalid sizes when resizing a matrix or array.");
+#ifndef EIGEN_NO_DEBUG
+    internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime>::run(rows,
+                                                                                                                  cols);
+#endif
+#ifdef EIGEN_INITIALIZE_COEFFS
+    Index size = rows * cols;
+    bool size_changed = size != this->size();
+    m_storage.resize(size, rows, cols);
+    if (size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
 #else
-template<typename Derived>
-class PlainObjectBase : public internal::dense_xpr_base<Derived>::type
+    m_storage.resize(rows * cols, rows, cols);
 #endif
-{
-  public:
-    enum { Options = internal::traits<Derived>::Options };
-    typedef typename internal::dense_xpr_base<Derived>::type Base;
-
-    typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
-    typedef typename internal::traits<Derived>::Scalar Scalar;
-    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef Derived DenseType;
-
-    using Base::RowsAtCompileTime;
-    using Base::ColsAtCompileTime;
-    using Base::SizeAtCompileTime;
-    using Base::MaxRowsAtCompileTime;
-    using Base::MaxColsAtCompileTime;
-    using Base::MaxSizeAtCompileTime;
-    using Base::IsVectorAtCompileTime;
-    using Base::Flags;
-
-    template<typename PlainObjectType, int MapOptions, typename StrideType> friend class Eigen::Map;
-    friend  class Eigen::Map<Derived, Unaligned>;
-    typedef Eigen::Map<Derived, Unaligned>  MapType;
-    friend  class Eigen::Map<const Derived, Unaligned>;
-    typedef const Eigen::Map<const Derived, Unaligned> ConstMapType;
-    friend  class Eigen::Map<Derived, Aligned>;
-    typedef Eigen::Map<Derived, Aligned> AlignedMapType;
-    friend  class Eigen::Map<const Derived, Aligned>;
-    typedef const Eigen::Map<const Derived, Aligned> ConstAlignedMapType;
-    template<typename StrideType> struct StridedMapType { typedef Eigen::Map<Derived, Unaligned, StrideType> type; };
-    template<typename StrideType> struct StridedConstMapType { typedef Eigen::Map<const Derived, Unaligned, StrideType> type; };
-    template<typename StrideType> struct StridedAlignedMapType { typedef Eigen::Map<Derived, Aligned, StrideType> type; };
-    template<typename StrideType> struct StridedConstAlignedMapType { typedef Eigen::Map<const Derived, Aligned, StrideType> type; };
-
-  protected:
-    DenseStorage<Scalar, Base::MaxSizeAtCompileTime, Base::RowsAtCompileTime, Base::ColsAtCompileTime, Options> m_storage;
-
-  public:
-    enum { NeedsToAlign = SizeAtCompileTime != Dynamic && (internal::traits<Derived>::Flags & AlignedBit) != 0 };
-    EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
-
-    Base& base() { return *static_cast<Base*>(this); }
-    const Base& base() const { return *static_cast<const Base*>(this); }
-
-    EIGEN_STRONG_INLINE Index rows() const { return m_storage.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_storage.cols(); }
-
-    EIGEN_STRONG_INLINE const Scalar& coeff(Index rowId, Index colId) const
-    {
-      if(Flags & RowMajorBit)
-        return m_storage.data()[colId + rowId * m_storage.cols()];
-      else // column-major
-        return m_storage.data()[rowId + colId * m_storage.rows()];
-    }
-
-    EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const
-    {
-      return m_storage.data()[index];
-    }
+  }
 
-    EIGEN_STRONG_INLINE Scalar& coeffRef(Index rowId, Index colId)
-    {
-      if(Flags & RowMajorBit)
-        return m_storage.data()[colId + rowId * m_storage.cols()];
-      else // column-major
-        return m_storage.data()[rowId + colId * m_storage.rows()];
-    }
+  /** Resizes \c *this to a vector of length \a size
+   *
+   * \only_for_vectors. This method does not work for
+   * partially dynamic matrices when the static dimension is anything other
+   * than 1. For example it will not work with Matrix<double, 2, Dynamic>.
+   *
+   * Example: \include Matrix_resize_int.cpp
+   * Output: \verbinclude Matrix_resize_int.out
+   *
+   * \sa resize(Index,Index), resize(NoChange_t, Index), resize(Index, NoChange_t)
+   */
+  EIGEN_DEVICE_FUNC constexpr void resize(Index size) {
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(PlainObjectBase)
+    eigen_assert(((SizeAtCompileTime == Dynamic && (MaxSizeAtCompileTime == Dynamic || size <= MaxSizeAtCompileTime)) ||
+                  SizeAtCompileTime == size) &&
+                 size >= 0);
+#ifdef EIGEN_INITIALIZE_COEFFS
+    bool size_changed = size != this->size();
+#endif
+    if (RowsAtCompileTime == 1)
+      m_storage.resize(size, 1, size);
+    else
+      m_storage.resize(size, size, 1);
+#ifdef EIGEN_INITIALIZE_COEFFS
+    if (size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+#endif
+  }
 
-    EIGEN_STRONG_INLINE Scalar& coeffRef(Index index)
-    {
-      return m_storage.data()[index];
-    }
+  /** Resizes the matrix, changing only the number of columns. For the parameter of type NoChange_t, just pass the
+   * special value \c NoChange as in the example below.
+   *
+   * Example: \include Matrix_resize_NoChange_int.cpp
+   * Output: \verbinclude Matrix_resize_NoChange_int.out
+   *
+   * \sa resize(Index,Index)
+   */
+  EIGEN_DEVICE_FUNC constexpr void resize(NoChange_t, Index cols) { resize(rows(), cols); }
+
+  /** Resizes the matrix, changing only the number of rows. For the parameter of type NoChange_t, just pass the special
+   * value \c NoChange as in the example below.
+   *
+   * Example: \include Matrix_resize_int_NoChange.cpp
+   * Output: \verbinclude Matrix_resize_int_NoChange.out
+   *
+   * \sa resize(Index,Index)
+   */
+  EIGEN_DEVICE_FUNC constexpr void resize(Index rows, NoChange_t) { resize(rows, cols()); }
+
+  /** Resizes \c *this to have the same dimensions as \a other.
+   * Takes care of doing all the checking that's needed.
+   *
+   * Note that copying a row-vector into a vector (and conversely) is allowed.
+   * The resizing, if any, is then done in the appropriate way so that row-vectors
+   * remain row-vectors and vectors remain vectors.
+   */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resizeLike(const EigenBase<OtherDerived>& _other) {
+    const OtherDerived& other = _other.derived();
+#ifndef EIGEN_NO_DEBUG
+    internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime, MaxRowsAtCompileTime, MaxColsAtCompileTime>::run(
+        other.rows(), other.cols());
+#endif
+    const Index othersize = other.rows() * other.cols();
+    if (RowsAtCompileTime == 1) {
+      eigen_assert(other.rows() == 1 || other.cols() == 1);
+      resize(1, othersize);
+    } else if (ColsAtCompileTime == 1) {
+      eigen_assert(other.rows() == 1 || other.cols() == 1);
+      resize(othersize, 1);
+    } else
+      resize(other.rows(), other.cols());
+  }
 
-    EIGEN_STRONG_INLINE const Scalar& coeffRef(Index rowId, Index colId) const
-    {
-      if(Flags & RowMajorBit)
-        return m_storage.data()[colId + rowId * m_storage.cols()];
-      else // column-major
-        return m_storage.data()[rowId + colId * m_storage.rows()];
-    }
+  /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
+   *
+   * The method is intended for matrices of dynamic size. If you only want to change the number
+   * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or
+   * conservativeResize(Index, NoChange_t).
+   *
+   * Matrices are resized relative to the top-left element. In case values need to be
+   * appended to the matrix they will be uninitialized.
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void conservativeResize(Index rows, Index cols) {
+    internal::conservative_resize_like_impl<Derived>::run(*this, rows, cols);
+  }
 
-    EIGEN_STRONG_INLINE const Scalar& coeffRef(Index index) const
-    {
-      return m_storage.data()[index];
-    }
+  /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
+   *
+   * As opposed to conservativeResize(Index rows, Index cols), this version leaves
+   * the number of columns unchanged.
+   *
+   * In case the matrix is growing, new rows will be uninitialized.
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void conservativeResize(Index rows, NoChange_t) {
+    // Note: see the comment in conservativeResize(Index,Index)
+    conservativeResize(rows, cols());
+  }
 
-    /** \internal */
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index rowId, Index colId) const
-    {
-      return internal::ploadt<PacketScalar, LoadMode>
-               (m_storage.data() + (Flags & RowMajorBit
-                                   ? colId + rowId * m_storage.cols()
-                                   : rowId + colId * m_storage.rows()));
-    }
+  /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
+   *
+   * As opposed to conservativeResize(Index rows, Index cols), this version leaves
+   * the number of rows unchanged.
+   *
+   * In case the matrix is growing, new columns will be uninitialized.
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void conservativeResize(NoChange_t, Index cols) {
+    // Note: see the comment in conservativeResize(Index,Index)
+    conservativeResize(rows(), cols);
+  }
 
-    /** \internal */
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE PacketScalar packet(Index index) const
-    {
-      return internal::ploadt<PacketScalar, LoadMode>(m_storage.data() + index);
-    }
+  /** Resizes the vector to \a size while retaining old values.
+   *
+   * \only_for_vectors. This method does not work for
+   * partially dynamic matrices when the static dimension is anything other
+   * than 1. For example it will not work with Matrix<double, 2, Dynamic>.
+   *
+   * When values are appended, they will be uninitialized.
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void conservativeResize(Index size) {
+    internal::conservative_resize_like_impl<Derived>::run(*this, size);
+  }
 
-    /** \internal */
-    template<int StoreMode>
-    EIGEN_STRONG_INLINE void writePacket(Index rowId, Index colId, const PacketScalar& val)
-    {
-      internal::pstoret<Scalar, PacketScalar, StoreMode>
-              (m_storage.data() + (Flags & RowMajorBit
-                                   ? colId + rowId * m_storage.cols()
-                                   : rowId + colId * m_storage.rows()), val);
-    }
+  /** Resizes the matrix to \a rows x \a cols of \c other, while leaving old values untouched.
+   *
+   * The method is intended for matrices of dynamic size. If you only want to change the number
+   * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or
+   * conservativeResize(Index, NoChange_t).
+   *
+   * Matrices are resized relative to the top-left element. In case values need to be
+   * appended to the matrix they will copied from \c other.
+   */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void conservativeResizeLike(const DenseBase<OtherDerived>& other) {
+    internal::conservative_resize_like_impl<Derived, OtherDerived>::run(*this, other);
+  }
 
-    /** \internal */
-    template<int StoreMode>
-    EIGEN_STRONG_INLINE void writePacket(Index index, const PacketScalar& val)
-    {
-      internal::pstoret<Scalar, PacketScalar, StoreMode>(m_storage.data() + index, val);
-    }
+  /** This is a special case of the templated operator=. Its purpose is to
+   * prevent a default operator= from hiding the templated operator=.
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Derived& operator=(const PlainObjectBase& other) {
+    return _set(other);
+  }
 
-    /** \returns a const pointer to the data array of this matrix */
-    EIGEN_STRONG_INLINE const Scalar *data() const
-    { return m_storage.data(); }
-
-    /** \returns a pointer to the data array of this matrix */
-    EIGEN_STRONG_INLINE Scalar *data()
-    { return m_storage.data(); }
-
-    /** Resizes \c *this to a \a rows x \a cols matrix.
-      *
-      * This method is intended for dynamic-size matrices, although it is legal to call it on any
-      * matrix as long as fixed dimensions are left unchanged. If you only want to change the number
-      * of rows and/or of columns, you can use resize(NoChange_t, Index), resize(Index, NoChange_t).
-      *
-      * If the current number of coefficients of \c *this exactly matches the
-      * product \a rows * \a cols, then no memory allocation is performed and
-      * the current values are left unchanged. In all other cases, including
-      * shrinking, the data is reallocated and all previous values are lost.
-      *
-      * Example: \include Matrix_resize_int_int.cpp
-      * Output: \verbinclude Matrix_resize_int_int.out
-      *
-      * \sa resize(Index) for vectors, resize(NoChange_t, Index), resize(Index, NoChange_t)
-      */
-    EIGEN_STRONG_INLINE void resize(Index nbRows, Index nbCols)
-    {
-      eigen_assert(   EIGEN_IMPLIES(RowsAtCompileTime!=Dynamic,nbRows==RowsAtCompileTime)
-                   && EIGEN_IMPLIES(ColsAtCompileTime!=Dynamic,nbCols==ColsAtCompileTime)
-                   && EIGEN_IMPLIES(RowsAtCompileTime==Dynamic && MaxRowsAtCompileTime!=Dynamic,nbRows<=MaxRowsAtCompileTime)
-                   && EIGEN_IMPLIES(ColsAtCompileTime==Dynamic && MaxColsAtCompileTime!=Dynamic,nbCols<=MaxColsAtCompileTime)
-                   && nbRows>=0 && nbCols>=0 && "Invalid sizes when resizing a matrix or array.");
-      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(nbRows, nbCols);
-      #ifdef EIGEN_INITIALIZE_COEFFS
-        Index size = nbRows*nbCols;
-        bool size_changed = size != this->size();
-        m_storage.resize(size, nbRows, nbCols);
-        if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
-      #else
-        internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(nbRows, nbCols);
-        m_storage.resize(nbRows*nbCols, nbRows, nbCols);
-      #endif
-    }
+  /** \sa MatrixBase::lazyAssign() */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& lazyAssign(const DenseBase<OtherDerived>& other) {
+    _resize_to_match(other);
+    return Base::lazyAssign(other.derived());
+  }
 
-    /** Resizes \c *this to a vector of length \a size
-      *
-      * \only_for_vectors. This method does not work for
-      * partially dynamic matrices when the static dimension is anything other
-      * than 1. For example it will not work with Matrix<double, 2, Dynamic>.
-      *
-      * Example: \include Matrix_resize_int.cpp
-      * Output: \verbinclude Matrix_resize_int.out
-      *
-      * \sa resize(Index,Index), resize(NoChange_t, Index), resize(Index, NoChange_t)
-      */
-    inline void resize(Index size)
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(PlainObjectBase)
-      eigen_assert(((SizeAtCompileTime == Dynamic && (MaxSizeAtCompileTime==Dynamic || size<=MaxSizeAtCompileTime)) || SizeAtCompileTime == size) && size>=0);
-      #ifdef EIGEN_INITIALIZE_COEFFS
-        bool size_changed = size != this->size();
-      #endif
-      if(RowsAtCompileTime == 1)
-        m_storage.resize(size, 1, size);
-      else
-        m_storage.resize(size, size, 1);
-      #ifdef EIGEN_INITIALIZE_COEFFS
-        if(size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
-      #endif
-    }
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const ReturnByValue<OtherDerived>& func) {
+    resize(func.rows(), func.cols());
+    return Base::operator=(func);
+  }
 
-    /** Resizes the matrix, changing only the number of columns. For the parameter of type NoChange_t, just pass the special value \c NoChange
-      * as in the example below.
-      *
-      * Example: \include Matrix_resize_NoChange_int.cpp
-      * Output: \verbinclude Matrix_resize_NoChange_int.out
-      *
-      * \sa resize(Index,Index)
-      */
-    inline void resize(NoChange_t, Index nbCols)
-    {
-      resize(rows(), nbCols);
-    }
+  // Prevent user from trying to instantiate PlainObjectBase objects
+  // by making all its constructor protected. See bug 1074.
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr PlainObjectBase() = default;
+  /** \brief Move constructor */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr PlainObjectBase(PlainObjectBase&&) = default;
+  /** \brief Move assignment operator */
+  EIGEN_DEVICE_FUNC constexpr PlainObjectBase& operator=(PlainObjectBase&& other) noexcept {
+    m_storage = std::move(other.m_storage);
+    return *this;
+  }
 
-    /** Resizes the matrix, changing only the number of rows. For the parameter of type NoChange_t, just pass the special value \c NoChange
-      * as in the example below.
-      *
-      * Example: \include Matrix_resize_int_NoChange.cpp
-      * Output: \verbinclude Matrix_resize_int_NoChange.out
-      *
-      * \sa resize(Index,Index)
-      */
-    inline void resize(Index nbRows, NoChange_t)
-    {
-      resize(nbRows, cols());
-    }
+  /** Copy constructor */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr PlainObjectBase(const PlainObjectBase&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PlainObjectBase(Index size, Index rows, Index cols)
+      : m_storage(size, rows, cols) {}
+
+  /** \brief Construct a row of column vector with fixed size from an arbitrary number of coefficients.
+   *
+   * \only_for_vectors
+   *
+   * This constructor is for 1D array or vectors with more than 4 coefficients.
+   *
+   * \warning To construct a column (resp. row) vector of fixed length, the number of values passed to this
+   * constructor must match the the fixed number of rows (resp. columns) of \c *this.
+   */
+  template <typename... ArgTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PlainObjectBase(const Scalar& a0, const Scalar& a1, const Scalar& a2,
+                                                        const Scalar& a3, const ArgTypes&... args)
+      : m_storage() {
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, sizeof...(args) + 4);
+    m_storage.data()[0] = a0;
+    m_storage.data()[1] = a1;
+    m_storage.data()[2] = a2;
+    m_storage.data()[3] = a3;
+    Index i = 4;
+    auto x = {(m_storage.data()[i++] = args, 0)...};
+    static_cast<void>(x);
+  }
 
-    /** Resizes \c *this to have the same dimensions as \a other.
-      * Takes care of doing all the checking that's needed.
-      *
-      * Note that copying a row-vector into a vector (and conversely) is allowed.
-      * The resizing, if any, is then done in the appropriate way so that row-vectors
-      * remain row-vectors and vectors remain vectors.
-      */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE void resizeLike(const EigenBase<OtherDerived>& _other)
-    {
-      const OtherDerived& other = _other.derived();
-      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(other.rows(), other.cols());
-      const Index othersize = other.rows()*other.cols();
-      if(RowsAtCompileTime == 1)
-      {
-        eigen_assert(other.rows() == 1 || other.cols() == 1);
-        resize(1, othersize);
+  /** \brief Constructs a Matrix or Array and initializes it by elements given by an initializer list of initializer
+   * lists
+   */
+  EIGEN_DEVICE_FUNC explicit constexpr EIGEN_STRONG_INLINE PlainObjectBase(
+      const std::initializer_list<std::initializer_list<Scalar>>& list)
+      : m_storage() {
+    size_t list_size = 0;
+    if (list.begin() != list.end()) {
+      list_size = list.begin()->size();
+    }
+
+    // This is to allow syntax like VectorXi {{1, 2, 3, 4}}
+    if (ColsAtCompileTime == 1 && list.size() == 1) {
+      eigen_assert(list_size == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
+      resize(list_size, ColsAtCompileTime);
+      if (list.begin()->begin() != nullptr) {
+        Index index = 0;
+        for (const Scalar& e : *list.begin()) {
+          coeffRef(index++) = e;
+        }
       }
-      else if(ColsAtCompileTime == 1)
-      {
-        eigen_assert(other.rows() == 1 || other.cols() == 1);
-        resize(othersize, 1);
+    } else {
+      eigen_assert(list.size() == static_cast<size_t>(RowsAtCompileTime) || RowsAtCompileTime == Dynamic);
+      eigen_assert(list_size == static_cast<size_t>(ColsAtCompileTime) || ColsAtCompileTime == Dynamic);
+      resize(list.size(), list_size);
+
+      Index row_index = 0;
+      for (const std::initializer_list<Scalar>& row : list) {
+        eigen_assert(list_size == row.size());
+        Index col_index = 0;
+        for (const Scalar& e : row) {
+          coeffRef(row_index, col_index) = e;
+          ++col_index;
+        }
+        ++row_index;
       }
-      else resize(other.rows(), other.cols());
-    }
-
-    /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
-      *
-      * The method is intended for matrices of dynamic size. If you only want to change the number
-      * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or
-      * conservativeResize(Index, NoChange_t).
-      *
-      * Matrices are resized relative to the top-left element. In case values need to be 
-      * appended to the matrix they will be uninitialized.
-      */
-    EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, Index nbCols)
-    {
-      internal::conservative_resize_like_impl<Derived>::run(*this, nbRows, nbCols);
     }
+  }
 
-    /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
-      *
-      * As opposed to conservativeResize(Index rows, Index cols), this version leaves
-      * the number of columns unchanged.
-      *
-      * In case the matrix is growing, new rows will be uninitialized.
-      */
-    EIGEN_STRONG_INLINE void conservativeResize(Index nbRows, NoChange_t)
-    {
-      // Note: see the comment in conservativeResize(Index,Index)
-      conservativeResize(nbRows, cols());
-    }
+  /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PlainObjectBase(const DenseBase<OtherDerived>& other) : m_storage() {
+    resizeLike(other);
+    _set_noalias(other);
+  }
 
-    /** Resizes the matrix to \a rows x \a cols while leaving old values untouched.
-      *
-      * As opposed to conservativeResize(Index rows, Index cols), this version leaves
-      * the number of rows unchanged.
-      *
-      * In case the matrix is growing, new columns will be uninitialized.
-      */
-    EIGEN_STRONG_INLINE void conservativeResize(NoChange_t, Index nbCols)
-    {
-      // Note: see the comment in conservativeResize(Index,Index)
-      conservativeResize(rows(), nbCols);
-    }
+  /** \sa PlainObjectBase::operator=(const EigenBase<OtherDerived>&) */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PlainObjectBase(const EigenBase<OtherDerived>& other) : m_storage() {
+    resizeLike(other);
+    *this = other.derived();
+  }
+  /** \brief Copy constructor with in-place evaluation */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PlainObjectBase(const ReturnByValue<OtherDerived>& other) {
+    // FIXME this does not automatically transpose vectors if necessary
+    resize(other.rows(), other.cols());
+    other.evalTo(this->derived());
+  }
 
-    /** Resizes the vector to \a size while retaining old values.
-      *
-      * \only_for_vectors. This method does not work for
-      * partially dynamic matrices when the static dimension is anything other
-      * than 1. For example it will not work with Matrix<double, 2, Dynamic>.
-      *
-      * When values are appended, they will be uninitialized.
-      */
-    EIGEN_STRONG_INLINE void conservativeResize(Index size)
-    {
-      internal::conservative_resize_like_impl<Derived>::run(*this, size);
-    }
+ public:
+  /** \brief Copies the generic expression \a other into *this.
+   * \copydetails DenseBase::operator=(const EigenBase<OtherDerived> &other)
+   */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived>& other) {
+    _resize_to_match(other);
+    Base::operator=(other.derived());
+    return this->derived();
+  }
 
-    /** Resizes the matrix to \a rows x \a cols of \c other, while leaving old values untouched.
-      *
-      * The method is intended for matrices of dynamic size. If you only want to change the number
-      * of rows and/or of columns, you can use conservativeResize(NoChange_t, Index) or
-      * conservativeResize(Index, NoChange_t).
-      *
-      * Matrices are resized relative to the top-left element. In case values need to be 
-      * appended to the matrix they will copied from \c other.
-      */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE void conservativeResizeLike(const DenseBase<OtherDerived>& other)
-    {
-      internal::conservative_resize_like_impl<Derived,OtherDerived>::run(*this, other);
-    }
+  /** \name Map
+   * These are convenience functions returning Map objects. The Map() static functions return unaligned Map objects,
+   * while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
+   * \a data pointers.
+   *
+   * Here is an example using strides:
+   * \include Matrix_Map_stride.cpp
+   * Output: \verbinclude Matrix_Map_stride.out
+   *
+   * \see class Map
+   */
+  ///@{
+  static inline ConstMapType Map(const Scalar* data) { return ConstMapType(data); }
+  static inline MapType Map(Scalar* data) { return MapType(data); }
+  static inline ConstMapType Map(const Scalar* data, Index size) { return ConstMapType(data, size); }
+  static inline MapType Map(Scalar* data, Index size) { return MapType(data, size); }
+  static inline ConstMapType Map(const Scalar* data, Index rows, Index cols) { return ConstMapType(data, rows, cols); }
+  static inline MapType Map(Scalar* data, Index rows, Index cols) { return MapType(data, rows, cols); }
+
+  static inline ConstAlignedMapType MapAligned(const Scalar* data) { return ConstAlignedMapType(data); }
+  static inline AlignedMapType MapAligned(Scalar* data) { return AlignedMapType(data); }
+  static inline ConstAlignedMapType MapAligned(const Scalar* data, Index size) {
+    return ConstAlignedMapType(data, size);
+  }
+  static inline AlignedMapType MapAligned(Scalar* data, Index size) { return AlignedMapType(data, size); }
+  static inline ConstAlignedMapType MapAligned(const Scalar* data, Index rows, Index cols) {
+    return ConstAlignedMapType(data, rows, cols);
+  }
+  static inline AlignedMapType MapAligned(Scalar* data, Index rows, Index cols) {
+    return AlignedMapType(data, rows, cols);
+  }
 
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    EIGEN_STRONG_INLINE Derived& operator=(const PlainObjectBase& other)
-    {
-      return _set(other);
-    }
+  template <int Outer, int Inner>
+  static inline typename StridedConstMapType<Stride<Outer, Inner>>::type Map(const Scalar* data,
+                                                                             const Stride<Outer, Inner>& stride) {
+    return typename StridedConstMapType<Stride<Outer, Inner>>::type(data, stride);
+  }
+  template <int Outer, int Inner>
+  static inline typename StridedMapType<Stride<Outer, Inner>>::type Map(Scalar* data,
+                                                                        const Stride<Outer, Inner>& stride) {
+    return typename StridedMapType<Stride<Outer, Inner>>::type(data, stride);
+  }
+  template <int Outer, int Inner>
+  static inline typename StridedConstMapType<Stride<Outer, Inner>>::type Map(const Scalar* data, Index size,
+                                                                             const Stride<Outer, Inner>& stride) {
+    return typename StridedConstMapType<Stride<Outer, Inner>>::type(data, size, stride);
+  }
+  template <int Outer, int Inner>
+  static inline typename StridedMapType<Stride<Outer, Inner>>::type Map(Scalar* data, Index size,
+                                                                        const Stride<Outer, Inner>& stride) {
+    return typename StridedMapType<Stride<Outer, Inner>>::type(data, size, stride);
+  }
+  template <int Outer, int Inner>
+  static inline typename StridedConstMapType<Stride<Outer, Inner>>::type Map(const Scalar* data, Index rows, Index cols,
+                                                                             const Stride<Outer, Inner>& stride) {
+    return typename StridedConstMapType<Stride<Outer, Inner>>::type(data, rows, cols, stride);
+  }
+  template <int Outer, int Inner>
+  static inline typename StridedMapType<Stride<Outer, Inner>>::type Map(Scalar* data, Index rows, Index cols,
+                                                                        const Stride<Outer, Inner>& stride) {
+    return typename StridedMapType<Stride<Outer, Inner>>::type(data, rows, cols, stride);
+  }
 
-    /** \sa MatrixBase::lazyAssign() */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Derived& lazyAssign(const DenseBase<OtherDerived>& other)
-    {
-      _resize_to_match(other);
-      return Base::lazyAssign(other.derived());
-    }
+  template <int Outer, int Inner>
+  static inline typename StridedConstAlignedMapType<Stride<Outer, Inner>>::type MapAligned(
+      const Scalar* data, const Stride<Outer, Inner>& stride) {
+    return typename StridedConstAlignedMapType<Stride<Outer, Inner>>::type(data, stride);
+  }
+  template <int Outer, int Inner>
+  static inline typename StridedAlignedMapType<Stride<Outer, Inner>>::type MapAligned(
+      Scalar* data, const Stride<Outer, Inner>& stride) {
+    return typename StridedAlignedMapType<Stride<Outer, Inner>>::type(data, stride);
+  }
+  template <int Outer, int Inner>
+  static inline typename StridedConstAlignedMapType<Stride<Outer, Inner>>::type MapAligned(
+      const Scalar* data, Index size, const Stride<Outer, Inner>& stride) {
+    return typename StridedConstAlignedMapType<Stride<Outer, Inner>>::type(data, size, stride);
+  }
+  template <int Outer, int Inner>
+  static inline typename StridedAlignedMapType<Stride<Outer, Inner>>::type MapAligned(
+      Scalar* data, Index size, const Stride<Outer, Inner>& stride) {
+    return typename StridedAlignedMapType<Stride<Outer, Inner>>::type(data, size, stride);
+  }
+  template <int Outer, int Inner>
+  static inline typename StridedConstAlignedMapType<Stride<Outer, Inner>>::type MapAligned(
+      const Scalar* data, Index rows, Index cols, const Stride<Outer, Inner>& stride) {
+    return typename StridedConstAlignedMapType<Stride<Outer, Inner>>::type(data, rows, cols, stride);
+  }
+  template <int Outer, int Inner>
+  static inline typename StridedAlignedMapType<Stride<Outer, Inner>>::type MapAligned(
+      Scalar* data, Index rows, Index cols, const Stride<Outer, Inner>& stride) {
+    return typename StridedAlignedMapType<Stride<Outer, Inner>>::type(data, rows, cols, stride);
+  }
+  ///@}
+
+  using Base::setConstant;
+  EIGEN_DEVICE_FUNC Derived& setConstant(Index size, const Scalar& val);
+  EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, Index cols, const Scalar& val);
+  EIGEN_DEVICE_FUNC Derived& setConstant(NoChange_t, Index cols, const Scalar& val);
+  EIGEN_DEVICE_FUNC Derived& setConstant(Index rows, NoChange_t, const Scalar& val);
+
+  using Base::setZero;
+  EIGEN_DEVICE_FUNC Derived& setZero(Index size);
+  EIGEN_DEVICE_FUNC Derived& setZero(Index rows, Index cols);
+  EIGEN_DEVICE_FUNC Derived& setZero(NoChange_t, Index cols);
+  EIGEN_DEVICE_FUNC Derived& setZero(Index rows, NoChange_t);
+
+  using Base::setOnes;
+  EIGEN_DEVICE_FUNC Derived& setOnes(Index size);
+  EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, Index cols);
+  EIGEN_DEVICE_FUNC Derived& setOnes(NoChange_t, Index cols);
+  EIGEN_DEVICE_FUNC Derived& setOnes(Index rows, NoChange_t);
+
+  using Base::setRandom;
+  Derived& setRandom(Index size);
+  Derived& setRandom(Index rows, Index cols);
+  Derived& setRandom(NoChange_t, Index cols);
+  Derived& setRandom(Index rows, NoChange_t);
+
+#ifdef EIGEN_PLAINOBJECTBASE_PLUGIN
+#include EIGEN_PLAINOBJECTBASE_PLUGIN
+#endif
 
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Derived& operator=(const ReturnByValue<OtherDerived>& func)
-    {
-      resize(func.rows(), func.cols());
-      return Base::operator=(func);
-    }
+ protected:
+  /** \internal Resizes *this in preparation for assigning \a other to it.
+   * Takes care of doing all the checking that's needed.
+   *
+   * Note that copying a row-vector into a vector (and conversely) is allowed.
+   * The resizing, if any, is then done in the appropriate way so that row-vectors
+   * remain row-vectors and vectors remain vectors.
+   */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _resize_to_match(const EigenBase<OtherDerived>& other) {
+#ifdef EIGEN_NO_AUTOMATIC_RESIZING
+    eigen_assert((this->size() == 0 || (IsVectorAtCompileTime ? (this->size() == other.size())
+                                                              : (rows() == other.rows() && cols() == other.cols()))) &&
+                 "Size mismatch. Automatic resizing is disabled because EIGEN_NO_AUTOMATIC_RESIZING is defined");
+    EIGEN_ONLY_USED_FOR_DEBUG(other);
+#else
+    resizeLike(other);
+#endif
+  }
 
-    EIGEN_STRONG_INLINE PlainObjectBase() : m_storage()
-    {
-//       _check_template_params();
-//       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
-    }
+  /**
+   * \brief Copies the value of the expression \a other into \c *this with automatic resizing.
+   *
+   * *this might be resized to match the dimensions of \a other. If *this was a null matrix (not already initialized),
+   * it will be initialized.
+   *
+   * Note that copying a row-vector into a vector (and conversely) is allowed.
+   * The resizing, if any, is then done in the appropriate way so that row-vectors
+   * remain row-vectors and vectors remain vectors.
+   *
+   * \sa operator=(const MatrixBase<OtherDerived>&), _set_noalias()
+   *
+   * \internal
+   */
+  // aliasing is dealt once in internal::call_assignment
+  // so at this stage we have to assume aliasing... and resising has to be done later.
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Derived& _set(const DenseBase<OtherDerived>& other) {
+    internal::call_assignment(this->derived(), other.derived());
+    return this->derived();
+  }
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    // FIXME is it still needed ?
-    /** \internal */
-    PlainObjectBase(internal::constructor_without_unaligned_array_assert)
-      : m_storage(internal::constructor_without_unaligned_array_assert())
-    {
-//       _check_template_params(); EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
-    }
-#endif
+  /** \internal Like _set() but additionally makes the assumption that no aliasing effect can happen (which
+   * is the case when creating a new matrix) so one can enforce lazy evaluation.
+   *
+   * \sa operator=(const MatrixBase<OtherDerived>&), _set()
+   */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Derived& _set_noalias(const DenseBase<OtherDerived>& other) {
+    // I don't think we need this resize call since the lazyAssign will anyways resize
+    // and lazyAssign will be called by the assign selector.
+    //_resize_to_match(other);
+    // the 'false' below means to enforce lazy evaluation. We don't use lazyAssign() because
+    // it wouldn't allow to copy a row-vector into a column-vector.
+    internal::call_assignment_no_alias(this->derived(), other.derived(),
+                                       internal::assign_op<Scalar, typename OtherDerived::Scalar>());
+    return this->derived();
+  }
 
-#ifdef EIGEN_HAVE_RVALUE_REFERENCES
-    PlainObjectBase(PlainObjectBase&& other)
-      : m_storage( std::move(other.m_storage) )
-    {
-    }
+  template <typename T0, typename T1>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init2(Index rows, Index cols,
+                                                    std::enable_if_t<Base::SizeAtCompileTime != 2, T0>* = 0) {
+    EIGEN_STATIC_ASSERT(internal::is_valid_index_type<T0>::value && internal::is_valid_index_type<T1>::value,
+                        T0 AND T1 MUST BE INTEGER TYPES)
+    resize(rows, cols);
+  }
 
-    PlainObjectBase& operator=(PlainObjectBase&& other)
-    {
-      using std::swap;
-      swap(m_storage, other.m_storage);
-      return *this;
-    }
-#endif
+  template <typename T0, typename T1>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init2(const T0& val0, const T1& val1,
+                                                    std::enable_if_t<Base::SizeAtCompileTime == 2, T0>* = 0) {
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
+    m_storage.data()[0] = Scalar(val0);
+    m_storage.data()[1] = Scalar(val1);
+  }
 
-    /** Copy constructor */
-    EIGEN_STRONG_INLINE PlainObjectBase(const PlainObjectBase& other)
-      : m_storage()
-    {
-      _check_template_params();
-      lazyAssign(other);
-    }
+  template <typename T0, typename T1>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init2(
+      const Index& val0, const Index& val1,
+      std::enable_if_t<(!internal::is_same<Index, Scalar>::value) && (internal::is_same<T0, Index>::value) &&
+                           (internal::is_same<T1, Index>::value) && Base::SizeAtCompileTime == 2,
+                       T1>* = 0) {
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
+    m_storage.data()[0] = Scalar(val0);
+    m_storage.data()[1] = Scalar(val1);
+  }
 
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE PlainObjectBase(const DenseBase<OtherDerived> &other)
-      : m_storage()
-    {
-      _check_template_params();
-      lazyAssign(other);
-    }
+  // The argument is convertible to the Index type and we either have a non 1x1 Matrix, or a dynamic-sized Array,
+  // then the argument is meant to be the size of the object.
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(
+      Index size,
+      std::enable_if_t<(Base::SizeAtCompileTime != 1 || !internal::is_convertible<T, Scalar>::value) &&
+                           ((!internal::is_same<typename internal::traits<Derived>::XprKind, ArrayXpr>::value ||
+                             Base::SizeAtCompileTime == Dynamic)),
+                       T>* = 0) {
+    // NOTE MSVC 2008 complains if we directly put bool(NumTraits<T>::IsInteger) as the EIGEN_STATIC_ASSERT argument.
+    const bool is_integer_alike = internal::is_valid_index_type<T>::value;
+    EIGEN_UNUSED_VARIABLE(is_integer_alike);
+    EIGEN_STATIC_ASSERT(is_integer_alike, FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
+    resize(size);
+  }
 
-    EIGEN_STRONG_INLINE PlainObjectBase(Index a_size, Index nbRows, Index nbCols)
-      : m_storage(a_size, nbRows, nbCols)
-    {
-//       _check_template_params();
-//       EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
-    }
+  // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar
+  // type can be implicitly converted)
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(
+      const Scalar& val0,
+      std::enable_if_t<Base::SizeAtCompileTime == 1 && internal::is_convertible<T, Scalar>::value, T>* = 0) {
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
+    m_storage.data()[0] = val0;
+  }
 
-    /** \copydoc MatrixBase::operator=(const EigenBase<OtherDerived>&)
-      */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Derived& operator=(const EigenBase<OtherDerived> &other)
-    {
-      _resize_to_match(other);
-      Base::operator=(other.derived());
-      return this->derived();
-    }
+  // We have a 1x1 matrix/array => the argument is interpreted as the value of the unique coefficient (case where scalar
+  // type match the index type)
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(
+      const Index& val0,
+      std::enable_if_t<(!internal::is_same<Index, Scalar>::value) && (internal::is_same<Index, T>::value) &&
+                           Base::SizeAtCompileTime == 1 && internal::is_convertible<T, Scalar>::value,
+                       T*>* = 0) {
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 1)
+    m_storage.data()[0] = Scalar(val0);
+  }
 
-    /** \sa MatrixBase::operator=(const EigenBase<OtherDerived>&) */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE PlainObjectBase(const EigenBase<OtherDerived> &other)
-      : m_storage(other.derived().rows() * other.derived().cols(), other.derived().rows(), other.derived().cols())
-    {
-      _check_template_params();
-      internal::check_rows_cols_for_overflow<MaxSizeAtCompileTime>::run(other.derived().rows(), other.derived().cols());
-      Base::operator=(other.derived());
-    }
+  // Initialize a fixed size matrix from a pointer to raw data
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Scalar* data) {
+    this->_set_noalias(ConstMapType(data));
+  }
 
-    /** \name Map
-      * These are convenience functions returning Map objects. The Map() static functions return unaligned Map objects,
-      * while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned
-      * \a data pointers.
-      *
-      * \see class Map
-      */
-    //@{
-    static inline ConstMapType Map(const Scalar* data)
-    { return ConstMapType(data); }
-    static inline MapType Map(Scalar* data)
-    { return MapType(data); }
-    static inline ConstMapType Map(const Scalar* data, Index size)
-    { return ConstMapType(data, size); }
-    static inline MapType Map(Scalar* data, Index size)
-    { return MapType(data, size); }
-    static inline ConstMapType Map(const Scalar* data, Index rows, Index cols)
-    { return ConstMapType(data, rows, cols); }
-    static inline MapType Map(Scalar* data, Index rows, Index cols)
-    { return MapType(data, rows, cols); }
-
-    static inline ConstAlignedMapType MapAligned(const Scalar* data)
-    { return ConstAlignedMapType(data); }
-    static inline AlignedMapType MapAligned(Scalar* data)
-    { return AlignedMapType(data); }
-    static inline ConstAlignedMapType MapAligned(const Scalar* data, Index size)
-    { return ConstAlignedMapType(data, size); }
-    static inline AlignedMapType MapAligned(Scalar* data, Index size)
-    { return AlignedMapType(data, size); }
-    static inline ConstAlignedMapType MapAligned(const Scalar* data, Index rows, Index cols)
-    { return ConstAlignedMapType(data, rows, cols); }
-    static inline AlignedMapType MapAligned(Scalar* data, Index rows, Index cols)
-    { return AlignedMapType(data, rows, cols); }
-
-    template<int Outer, int Inner>
-    static inline typename StridedConstMapType<Stride<Outer, Inner> >::type Map(const Scalar* data, const Stride<Outer, Inner>& stride)
-    { return typename StridedConstMapType<Stride<Outer, Inner> >::type(data, stride); }
-    template<int Outer, int Inner>
-    static inline typename StridedMapType<Stride<Outer, Inner> >::type Map(Scalar* data, const Stride<Outer, Inner>& stride)
-    { return typename StridedMapType<Stride<Outer, Inner> >::type(data, stride); }
-    template<int Outer, int Inner>
-    static inline typename StridedConstMapType<Stride<Outer, Inner> >::type Map(const Scalar* data, Index size, const Stride<Outer, Inner>& stride)
-    { return typename StridedConstMapType<Stride<Outer, Inner> >::type(data, size, stride); }
-    template<int Outer, int Inner>
-    static inline typename StridedMapType<Stride<Outer, Inner> >::type Map(Scalar* data, Index size, const Stride<Outer, Inner>& stride)
-    { return typename StridedMapType<Stride<Outer, Inner> >::type(data, size, stride); }
-    template<int Outer, int Inner>
-    static inline typename StridedConstMapType<Stride<Outer, Inner> >::type Map(const Scalar* data, Index rows, Index cols, const Stride<Outer, Inner>& stride)
-    { return typename StridedConstMapType<Stride<Outer, Inner> >::type(data, rows, cols, stride); }
-    template<int Outer, int Inner>
-    static inline typename StridedMapType<Stride<Outer, Inner> >::type Map(Scalar* data, Index rows, Index cols, const Stride<Outer, Inner>& stride)
-    { return typename StridedMapType<Stride<Outer, Inner> >::type(data, rows, cols, stride); }
-
-    template<int Outer, int Inner>
-    static inline typename StridedConstAlignedMapType<Stride<Outer, Inner> >::type MapAligned(const Scalar* data, const Stride<Outer, Inner>& stride)
-    { return typename StridedConstAlignedMapType<Stride<Outer, Inner> >::type(data, stride); }
-    template<int Outer, int Inner>
-    static inline typename StridedAlignedMapType<Stride<Outer, Inner> >::type MapAligned(Scalar* data, const Stride<Outer, Inner>& stride)
-    { return typename StridedAlignedMapType<Stride<Outer, Inner> >::type(data, stride); }
-    template<int Outer, int Inner>
-    static inline typename StridedConstAlignedMapType<Stride<Outer, Inner> >::type MapAligned(const Scalar* data, Index size, const Stride<Outer, Inner>& stride)
-    { return typename StridedConstAlignedMapType<Stride<Outer, Inner> >::type(data, size, stride); }
-    template<int Outer, int Inner>
-    static inline typename StridedAlignedMapType<Stride<Outer, Inner> >::type MapAligned(Scalar* data, Index size, const Stride<Outer, Inner>& stride)
-    { return typename StridedAlignedMapType<Stride<Outer, Inner> >::type(data, size, stride); }
-    template<int Outer, int Inner>
-    static inline typename StridedConstAlignedMapType<Stride<Outer, Inner> >::type MapAligned(const Scalar* data, Index rows, Index cols, const Stride<Outer, Inner>& stride)
-    { return typename StridedConstAlignedMapType<Stride<Outer, Inner> >::type(data, rows, cols, stride); }
-    template<int Outer, int Inner>
-    static inline typename StridedAlignedMapType<Stride<Outer, Inner> >::type MapAligned(Scalar* data, Index rows, Index cols, const Stride<Outer, Inner>& stride)
-    { return typename StridedAlignedMapType<Stride<Outer, Inner> >::type(data, rows, cols, stride); }
-    //@}
-
-    using Base::setConstant;
-    Derived& setConstant(Index size, const Scalar& value);
-    Derived& setConstant(Index rows, Index cols, const Scalar& value);
-
-    using Base::setZero;
-    Derived& setZero(Index size);
-    Derived& setZero(Index rows, Index cols);
-
-    using Base::setOnes;
-    Derived& setOnes(Index size);
-    Derived& setOnes(Index rows, Index cols);
-
-    using Base::setRandom;
-    Derived& setRandom(Index size);
-    Derived& setRandom(Index rows, Index cols);
-
-    #ifdef EIGEN_PLAINOBJECTBASE_PLUGIN
-    #include EIGEN_PLAINOBJECTBASE_PLUGIN
-    #endif
-
-  protected:
-    /** \internal Resizes *this in preparation for assigning \a other to it.
-      * Takes care of doing all the checking that's needed.
-      *
-      * Note that copying a row-vector into a vector (and conversely) is allowed.
-      * The resizing, if any, is then done in the appropriate way so that row-vectors
-      * remain row-vectors and vectors remain vectors.
-      */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE void _resize_to_match(const EigenBase<OtherDerived>& other)
-    {
-      #ifdef EIGEN_NO_AUTOMATIC_RESIZING
-      eigen_assert((this->size()==0 || (IsVectorAtCompileTime ? (this->size() == other.size())
-                 : (rows() == other.rows() && cols() == other.cols())))
-        && "Size mismatch. Automatic resizing is disabled because EIGEN_NO_AUTOMATIC_RESIZING is defined");
-      EIGEN_ONLY_USED_FOR_DEBUG(other);
-      if(this->size()==0)
-        resizeLike(other);
-      #else
-      resizeLike(other);
-      #endif
-    }
+  // Initialize an arbitrary matrix from a dense expression
+  template <typename T, typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const DenseBase<OtherDerived>& other) {
+    this->_set_noalias(other);
+  }
 
-    /**
-      * \brief Copies the value of the expression \a other into \c *this with automatic resizing.
-      *
-      * *this might be resized to match the dimensions of \a other. If *this was a null matrix (not already initialized),
-      * it will be initialized.
-      *
-      * Note that copying a row-vector into a vector (and conversely) is allowed.
-      * The resizing, if any, is then done in the appropriate way so that row-vectors
-      * remain row-vectors and vectors remain vectors.
-      *
-      * \sa operator=(const MatrixBase<OtherDerived>&), _set_noalias()
-      *
-      * \internal
-      */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Derived& _set(const DenseBase<OtherDerived>& other)
-    {
-      _set_selector(other.derived(), typename internal::conditional<static_cast<bool>(int(OtherDerived::Flags) & EvalBeforeAssigningBit), internal::true_type, internal::false_type>::type());
-      return this->derived();
-    }
+  // Initialize an arbitrary matrix from an object convertible to the Derived type.
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Derived& other) {
+    this->_set_noalias(other);
+  }
 
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE void _set_selector(const OtherDerived& other, const internal::true_type&) { _set_noalias(other.eval()); }
+  // Initialize an arbitrary matrix from a generic Eigen expression
+  template <typename T, typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const EigenBase<OtherDerived>& other) {
+    this->derived() = other;
+  }
 
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE void _set_selector(const OtherDerived& other, const internal::false_type&) { _set_noalias(other); }
+  template <typename T, typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const ReturnByValue<OtherDerived>& other) {
+    resize(other.rows(), other.cols());
+    other.evalTo(this->derived());
+  }
 
-    /** \internal Like _set() but additionally makes the assumption that no aliasing effect can happen (which
-      * is the case when creating a new matrix) so one can enforce lazy evaluation.
-      *
-      * \sa operator=(const MatrixBase<OtherDerived>&), _set()
-      */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE Derived& _set_noalias(const DenseBase<OtherDerived>& other)
-    {
-      // I don't think we need this resize call since the lazyAssign will anyways resize
-      // and lazyAssign will be called by the assign selector.
-      //_resize_to_match(other);
-      // the 'false' below means to enforce lazy evaluation. We don't use lazyAssign() because
-      // it wouldn't allow to copy a row-vector into a column-vector.
-      return internal::assign_selector<Derived,OtherDerived,false>::run(this->derived(), other.derived());
-    }
+  template <typename T, typename OtherDerived, int ColsAtCompileTime>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const RotationBase<OtherDerived, ColsAtCompileTime>& r) {
+    this->derived() = r;
+  }
 
-    template<typename T0, typename T1>
-    EIGEN_STRONG_INLINE void _init2(Index nbRows, Index nbCols, typename internal::enable_if<Base::SizeAtCompileTime!=2,T0>::type* = 0)
-    {
-      EIGEN_STATIC_ASSERT(bool(NumTraits<T0>::IsInteger) &&
-                          bool(NumTraits<T1>::IsInteger),
-                          FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED)
-      resize(nbRows,nbCols);
-    }
-    template<typename T0, typename T1>
-    EIGEN_STRONG_INLINE void _init2(const Scalar& val0, const Scalar& val1, typename internal::enable_if<Base::SizeAtCompileTime==2,T0>::type* = 0)
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(PlainObjectBase, 2)
-      m_storage.data()[0] = val0;
-      m_storage.data()[1] = val1;
-    }
+  // For fixed-size Array<Scalar,...>
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(
+      const Scalar& val0,
+      std::enable_if_t<Base::SizeAtCompileTime != Dynamic && Base::SizeAtCompileTime != 1 &&
+                           internal::is_convertible<T, Scalar>::value &&
+                           internal::is_same<typename internal::traits<Derived>::XprKind, ArrayXpr>::value,
+                       T>* = 0) {
+    Base::setConstant(val0);
+  }
 
-    template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
-    friend struct internal::matrix_swap_impl;
+  // For fixed-size Array<Index,...>
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(
+      const Index& val0,
+      std::enable_if_t<(!internal::is_same<Index, Scalar>::value) && (internal::is_same<Index, T>::value) &&
+                           Base::SizeAtCompileTime != Dynamic && Base::SizeAtCompileTime != 1 &&
+                           internal::is_convertible<T, Scalar>::value &&
+                           internal::is_same<typename internal::traits<Derived>::XprKind, ArrayXpr>::value,
+                       T*>* = 0) {
+    Base::setConstant(val0);
+  }
 
-    /** \internal generic implementation of swap for dense storage since for dynamic-sized matrices of same type it is enough to swap the
-      * data pointers.
-      */
-    template<typename OtherDerived>
-    void _swap(DenseBase<OtherDerived> const & other)
-    {
-      enum { SwapPointers = internal::is_same<Derived, OtherDerived>::value && Base::SizeAtCompileTime==Dynamic };
-      internal::matrix_swap_impl<Derived, OtherDerived, bool(SwapPointers)>::run(this->derived(), other.const_cast_derived());
-    }
+  template <typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
+  friend struct internal::matrix_swap_impl;
 
-  public:
+ public:
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-    static EIGEN_STRONG_INLINE void _check_template_params()
-    {
-      EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, (Options&RowMajor)==RowMajor)
-                        && EIGEN_IMPLIES(MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1, (Options&RowMajor)==0)
-                        && ((RowsAtCompileTime == Dynamic) || (RowsAtCompileTime >= 0))
-                        && ((ColsAtCompileTime == Dynamic) || (ColsAtCompileTime >= 0))
-                        && ((MaxRowsAtCompileTime == Dynamic) || (MaxRowsAtCompileTime >= 0))
-                        && ((MaxColsAtCompileTime == Dynamic) || (MaxColsAtCompileTime >= 0))
-                        && (MaxRowsAtCompileTime == RowsAtCompileTime || RowsAtCompileTime==Dynamic)
-                        && (MaxColsAtCompileTime == ColsAtCompileTime || ColsAtCompileTime==Dynamic)
-                        && (Options & (DontAlign|RowMajor)) == Options),
-        INVALID_MATRIX_TEMPLATE_PARAMETERS)
-    }
-#endif
+  /** \internal
+   * \brief Override DenseBase::swap() since for dynamic-sized matrices
+   * of same type it is enough to swap the data pointers.
+   */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(DenseBase<OtherDerived>& other) {
+    enum {SwapPointers = internal::is_same<Derived, OtherDerived>::value && Base::SizeAtCompileTime == Dynamic};
+    internal::matrix_swap_impl<Derived, OtherDerived, bool(SwapPointers)>::run(this->derived(), other.derived());
+  }
 
-private:
-    enum { ThisConstantIsPrivateInPlainObjectBase };
+  /** \internal
+   * \brief const version forwarded to DenseBase::swap
+   */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(DenseBase<OtherDerived> const& other) {
+    Base::swap(other.derived());
+  }
+
+  enum {IsPlainObjectBase = 1};
+#endif
+ public:
+  // These apparently need to be down here for nvcc+icc to prevent duplicate
+  // Map symbol.
+  template <typename PlainObjectType, int MapOptions, typename StrideType>
+  friend class Eigen::Map;
+  friend class Eigen::Map<Derived, Unaligned>;
+  friend class Eigen::Map<const Derived, Unaligned>;
+#if EIGEN_MAX_ALIGN_BYTES > 0
+  // for EIGEN_MAX_ALIGN_BYTES==0, AlignedMax==Unaligned, and many compilers generate warnings for friend-ing a class
+  // twice.
+  friend class Eigen::Map<Derived, AlignedMax>;
+  friend class Eigen::Map<const Derived, AlignedMax>;
+#endif
 };
 
 namespace internal {
 
 template <typename Derived, typename OtherDerived, bool IsVector>
-struct conservative_resize_like_impl
-{
-  typedef typename Derived::Index Index;
-  static void run(DenseBase<Derived>& _this, Index rows, Index cols)
-  {
+struct conservative_resize_like_impl {
+  static constexpr bool IsRelocatable = std::is_trivially_copyable<typename Derived::Scalar>::value;
+  static void run(DenseBase<Derived>& _this, Index rows, Index cols) {
     if (_this.rows() == rows && _this.cols() == cols) return;
     EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
 
-    if ( ( Derived::IsRowMajor && _this.cols() == cols) || // row-major and we change only the number of rows
-         (!Derived::IsRowMajor && _this.rows() == rows) )  // column-major and we change only the number of columns
-    {
-      internal::check_rows_cols_for_overflow<Derived::MaxSizeAtCompileTime>::run(rows, cols);
-      _this.derived().m_storage.conservativeResize(rows*cols,rows,cols);
-    }
-    else
+    if (IsRelocatable &&
+        ((Derived::IsRowMajor && _this.cols() == cols) ||  // row-major and we change only the number of rows
+         (!Derived::IsRowMajor && _this.rows() == rows)))  // column-major and we change only the number of columns
     {
+#ifndef EIGEN_NO_DEBUG
+      internal::check_rows_cols_for_overflow<Derived::MaxSizeAtCompileTime, Derived::MaxRowsAtCompileTime,
+                                             Derived::MaxColsAtCompileTime>::run(rows, cols);
+#endif
+      _this.derived().m_storage.conservativeResize(rows * cols, rows, cols);
+    } else {
       // The storage order does not allow us to use reallocation.
-      typename Derived::PlainObject tmp(rows,cols);
-      const Index common_rows = (std::min)(rows, _this.rows());
-      const Index common_cols = (std::min)(cols, _this.cols());
-      tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
+      Derived tmp(rows, cols);
+      const Index common_rows = numext::mini(rows, _this.rows());
+      const Index common_cols = numext::mini(cols, _this.cols());
+      tmp.block(0, 0, common_rows, common_cols) = _this.block(0, 0, common_rows, common_cols);
       _this.derived().swap(tmp);
     }
   }
 
-  static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other)
-  {
+  static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other) {
     if (_this.rows() == other.rows() && _this.cols() == other.cols()) return;
 
     // Note: Here is space for improvement. Basically, for conservativeResize(Index,Index),
@@ -743,24 +938,24 @@ struct conservative_resize_like_impl
     EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(Derived)
     EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(OtherDerived)
 
-    if ( ( Derived::IsRowMajor && _this.cols() == other.cols()) || // row-major and we change only the number of rows
-         (!Derived::IsRowMajor && _this.rows() == other.rows()) )  // column-major and we change only the number of columns
+    if (IsRelocatable &&
+        ((Derived::IsRowMajor && _this.cols() == other.cols()) ||  // row-major and we change only the number of rows
+         (!Derived::IsRowMajor &&
+          _this.rows() == other.rows())))  // column-major and we change only the number of columns
     {
       const Index new_rows = other.rows() - _this.rows();
       const Index new_cols = other.cols() - _this.cols();
-      _this.derived().m_storage.conservativeResize(other.size(),other.rows(),other.cols());
-      if (new_rows>0)
+      _this.derived().m_storage.conservativeResize(other.size(), other.rows(), other.cols());
+      if (new_rows > 0)
         _this.bottomRightCorner(new_rows, other.cols()) = other.bottomRows(new_rows);
-      else if (new_cols>0)
+      else if (new_cols > 0)
         _this.bottomRightCorner(other.rows(), new_cols) = other.rightCols(new_cols);
-    }
-    else
-    {
+    } else {
       // The storage order does not allow us to use reallocation.
-      typename Derived::PlainObject tmp(other);
-      const Index common_rows = (std::min)(tmp.rows(), _this.rows());
-      const Index common_cols = (std::min)(tmp.cols(), _this.cols());
-      tmp.block(0,0,common_rows,common_cols) = _this.block(0,0,common_rows,common_cols);
+      Derived tmp(other);
+      const Index common_rows = numext::mini(tmp.rows(), _this.rows());
+      const Index common_cols = numext::mini(tmp.cols(), _this.cols());
+      tmp.block(0, 0, common_rows, common_cols) = _this.block(0, 0, common_rows, common_cols);
       _this.derived().swap(tmp);
     }
   }
@@ -769,54 +964,51 @@ struct conservative_resize_like_impl
 // Here, the specialization for vectors inherits from the general matrix case
 // to allow calling .conservativeResize(rows,cols) on vectors.
 template <typename Derived, typename OtherDerived>
-struct conservative_resize_like_impl<Derived,OtherDerived,true>
-  : conservative_resize_like_impl<Derived,OtherDerived,false>
-{
-  using conservative_resize_like_impl<Derived,OtherDerived,false>::run;
-  
-  typedef typename Derived::Index Index;
-  static void run(DenseBase<Derived>& _this, Index size)
-  {
-    const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : size;
-    const Index new_cols = Derived::RowsAtCompileTime==1 ? size : 1;
-    _this.derived().m_storage.conservativeResize(size,new_rows,new_cols);
-  }
-
-  static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other)
-  {
+struct conservative_resize_like_impl<Derived, OtherDerived, true>
+    : conservative_resize_like_impl<Derived, OtherDerived, false> {
+  typedef conservative_resize_like_impl<Derived, OtherDerived, false> Base;
+  using Base::IsRelocatable;
+  using Base::run;
+
+  static void run(DenseBase<Derived>& _this, Index size) {
+    const Index new_rows = Derived::RowsAtCompileTime == 1 ? 1 : size;
+    const Index new_cols = Derived::RowsAtCompileTime == 1 ? size : 1;
+    if (IsRelocatable)
+      _this.derived().m_storage.conservativeResize(size, new_rows, new_cols);
+    else
+      Base::run(_this.derived(), new_rows, new_cols);
+  }
+
+  static void run(DenseBase<Derived>& _this, const DenseBase<OtherDerived>& other) {
     if (_this.rows() == other.rows() && _this.cols() == other.cols()) return;
 
     const Index num_new_elements = other.size() - _this.size();
 
-    const Index new_rows = Derived::RowsAtCompileTime==1 ? 1 : other.rows();
-    const Index new_cols = Derived::RowsAtCompileTime==1 ? other.cols() : 1;
-    _this.derived().m_storage.conservativeResize(other.size(),new_rows,new_cols);
+    const Index new_rows = Derived::RowsAtCompileTime == 1 ? 1 : other.rows();
+    const Index new_cols = Derived::RowsAtCompileTime == 1 ? other.cols() : 1;
+    if (IsRelocatable)
+      _this.derived().m_storage.conservativeResize(other.size(), new_rows, new_cols);
+    else
+      Base::run(_this.derived(), new_rows, new_cols);
 
-    if (num_new_elements > 0)
-      _this.tail(num_new_elements) = other.tail(num_new_elements);
+    if (num_new_elements > 0) _this.tail(num_new_elements) = other.tail(num_new_elements);
   }
 };
 
-template<typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
-struct matrix_swap_impl
-{
-  static inline void run(MatrixTypeA& a, MatrixTypeB& b)
-  {
-    a.base().swap(b);
-  }
+template <typename MatrixTypeA, typename MatrixTypeB, bool SwapPointers>
+struct matrix_swap_impl {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(MatrixTypeA& a, MatrixTypeB& b) { a.base().swap(b); }
 };
 
-template<typename MatrixTypeA, typename MatrixTypeB>
-struct matrix_swap_impl<MatrixTypeA, MatrixTypeB, true>
-{
-  static inline void run(MatrixTypeA& a, MatrixTypeB& b)
-  {
+template <typename MatrixTypeA, typename MatrixTypeB>
+struct matrix_swap_impl<MatrixTypeA, MatrixTypeB, true> {
+  EIGEN_DEVICE_FUNC static inline void run(MatrixTypeA& a, MatrixTypeB& b) {
     static_cast<typename MatrixTypeA::Base&>(a).m_storage.swap(static_cast<typename MatrixTypeB::Base&>(b).m_storage);
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_DENSESTORAGEBASE_H
+#endif  // EIGEN_DENSESTORAGEBASE_H
diff --git a/inst/include/Eigen/src/Core/Product.h b/inst/include/Eigen/src/Core/Product.h
new file mode 100644
index 00000000..e16c7cc9
--- /dev/null
+++ b/inst/include/Eigen/src/Core/Product.h
@@ -0,0 +1,307 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PRODUCT_H
+#define EIGEN_PRODUCT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+template <typename Lhs, typename Rhs, int Option, typename StorageKind>
+class ProductImpl;
+
+namespace internal {
+
+template <typename Lhs, typename Rhs, int Option>
+struct traits<Product<Lhs, Rhs, Option>> {
+  typedef remove_all_t<Lhs> LhsCleaned;
+  typedef remove_all_t<Rhs> RhsCleaned;
+  typedef traits<LhsCleaned> LhsTraits;
+  typedef traits<RhsCleaned> RhsTraits;
+
+  typedef MatrixXpr XprKind;
+
+  typedef typename ScalarBinaryOpTraits<typename traits<LhsCleaned>::Scalar,
+                                        typename traits<RhsCleaned>::Scalar>::ReturnType Scalar;
+  typedef typename product_promote_storage_type<typename LhsTraits::StorageKind, typename RhsTraits::StorageKind,
+                                                internal::product_type<Lhs, Rhs>::ret>::ret StorageKind;
+  typedef typename promote_index_type<typename LhsTraits::StorageIndex, typename RhsTraits::StorageIndex>::type
+      StorageIndex;
+
+  enum {
+    RowsAtCompileTime = LhsTraits::RowsAtCompileTime,
+    ColsAtCompileTime = RhsTraits::ColsAtCompileTime,
+    MaxRowsAtCompileTime = LhsTraits::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = RhsTraits::MaxColsAtCompileTime,
+
+    // FIXME: only needed by GeneralMatrixMatrixTriangular
+    InnerSize = min_size_prefer_fixed(LhsTraits::ColsAtCompileTime, RhsTraits::RowsAtCompileTime),
+
+    // The storage order is somewhat arbitrary here. The correct one will be determined through the evaluator.
+    Flags = (MaxRowsAtCompileTime == 1 && MaxColsAtCompileTime != 1)   ? RowMajorBit
+            : (MaxColsAtCompileTime == 1 && MaxRowsAtCompileTime != 1) ? 0
+            : (((LhsTraits::Flags & NoPreferredStorageOrderBit) && (RhsTraits::Flags & RowMajorBit)) ||
+               ((RhsTraits::Flags & NoPreferredStorageOrderBit) && (LhsTraits::Flags & RowMajorBit)))
+                ? RowMajorBit
+                : NoPreferredStorageOrderBit
+  };
+};
+
+struct TransposeProductEnum {
+  // convenience enumerations to specialize transposed products
+  enum : int {
+    Default = 0x00,
+    Matrix = 0x01,
+    Permutation = 0x02,
+    MatrixMatrix = (Matrix << 8) | Matrix,
+    MatrixPermutation = (Matrix << 8) | Permutation,
+    PermutationMatrix = (Permutation << 8) | Matrix
+  };
+};
+template <typename Xpr>
+struct TransposeKind {
+  static constexpr int Kind = is_matrix_base_xpr<Xpr>::value        ? TransposeProductEnum::Matrix
+                              : is_permutation_base_xpr<Xpr>::value ? TransposeProductEnum::Permutation
+                                                                    : TransposeProductEnum::Default;
+};
+
+template <typename Lhs, typename Rhs>
+struct TransposeProductKind {
+  static constexpr int Kind = (TransposeKind<Lhs>::Kind << 8) | TransposeKind<Rhs>::Kind;
+};
+
+template <typename Lhs, typename Rhs, int Option, int Kind = TransposeProductKind<Lhs, Rhs>::Kind>
+struct product_transpose_helper {
+  // by default, don't optimize the transposed product
+  using Derived = Product<Lhs, Rhs, Option>;
+  using Scalar = typename Derived::Scalar;
+  using TransposeType = Transpose<const Derived>;
+  using ConjugateTransposeType = CwiseUnaryOp<scalar_conjugate_op<Scalar>, TransposeType>;
+  using AdjointType = std::conditional_t<NumTraits<Scalar>::IsComplex, ConjugateTransposeType, TransposeType>;
+
+  // return (lhs * rhs)^T
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TransposeType run_transpose(const Derived& derived) {
+    return TransposeType(derived);
+  }
+  // return (lhs * rhs)^H
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AdjointType run_adjoint(const Derived& derived) {
+    return AdjointType(TransposeType(derived));
+  }
+};
+
+template <typename Lhs, typename Rhs, int Option>
+struct product_transpose_helper<Lhs, Rhs, Option, TransposeProductEnum::MatrixMatrix> {
+  // expand the transposed matrix-matrix product
+  using Derived = Product<Lhs, Rhs, Option>;
+
+  using LhsScalar = typename traits<Lhs>::Scalar;
+  using LhsTransposeType = typename DenseBase<Lhs>::ConstTransposeReturnType;
+  using LhsConjugateTransposeType = CwiseUnaryOp<scalar_conjugate_op<LhsScalar>, LhsTransposeType>;
+  using LhsAdjointType =
+      std::conditional_t<NumTraits<LhsScalar>::IsComplex, LhsConjugateTransposeType, LhsTransposeType>;
+
+  using RhsScalar = typename traits<Rhs>::Scalar;
+  using RhsTransposeType = typename DenseBase<Rhs>::ConstTransposeReturnType;
+  using RhsConjugateTransposeType = CwiseUnaryOp<scalar_conjugate_op<RhsScalar>, RhsTransposeType>;
+  using RhsAdjointType =
+      std::conditional_t<NumTraits<RhsScalar>::IsComplex, RhsConjugateTransposeType, RhsTransposeType>;
+
+  using TransposeType = Product<RhsTransposeType, LhsTransposeType, Option>;
+  using AdjointType = Product<RhsAdjointType, LhsAdjointType, Option>;
+
+  // return rhs^T * lhs^T
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TransposeType run_transpose(const Derived& derived) {
+    return TransposeType(RhsTransposeType(derived.rhs()), LhsTransposeType(derived.lhs()));
+  }
+  // return rhs^H * lhs^H
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AdjointType run_adjoint(const Derived& derived) {
+    return AdjointType(RhsAdjointType(RhsTransposeType(derived.rhs())),
+                       LhsAdjointType(LhsTransposeType(derived.lhs())));
+  }
+};
+template <typename Lhs, typename Rhs, int Option>
+struct product_transpose_helper<Lhs, Rhs, Option, TransposeProductEnum::PermutationMatrix> {
+  // expand the transposed permutation-matrix product
+  using Derived = Product<Lhs, Rhs, Option>;
+
+  using LhsInverseType = typename PermutationBase<Lhs>::InverseReturnType;
+
+  using RhsScalar = typename traits<Rhs>::Scalar;
+  using RhsTransposeType = typename DenseBase<Rhs>::ConstTransposeReturnType;
+  using RhsConjugateTransposeType = CwiseUnaryOp<scalar_conjugate_op<RhsScalar>, RhsTransposeType>;
+  using RhsAdjointType =
+      std::conditional_t<NumTraits<RhsScalar>::IsComplex, RhsConjugateTransposeType, RhsTransposeType>;
+
+  using TransposeType = Product<RhsTransposeType, LhsInverseType, Option>;
+  using AdjointType = Product<RhsAdjointType, LhsInverseType, Option>;
+
+  // return rhs^T * lhs^-1
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TransposeType run_transpose(const Derived& derived) {
+    return TransposeType(RhsTransposeType(derived.rhs()), LhsInverseType(derived.lhs()));
+  }
+  // return rhs^H * lhs^-1
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AdjointType run_adjoint(const Derived& derived) {
+    return AdjointType(RhsAdjointType(RhsTransposeType(derived.rhs())), LhsInverseType(derived.lhs()));
+  }
+};
+template <typename Lhs, typename Rhs, int Option>
+struct product_transpose_helper<Lhs, Rhs, Option, TransposeProductEnum::MatrixPermutation> {
+  // expand the transposed matrix-permutation product
+  using Derived = Product<Lhs, Rhs, Option>;
+
+  using LhsScalar = typename traits<Lhs>::Scalar;
+  using LhsTransposeType = typename DenseBase<Lhs>::ConstTransposeReturnType;
+  using LhsConjugateTransposeType = CwiseUnaryOp<scalar_conjugate_op<LhsScalar>, LhsTransposeType>;
+  using LhsAdjointType =
+      std::conditional_t<NumTraits<LhsScalar>::IsComplex, LhsConjugateTransposeType, LhsTransposeType>;
+
+  using RhsInverseType = typename PermutationBase<Rhs>::InverseReturnType;
+
+  using TransposeType = Product<RhsInverseType, LhsTransposeType, Option>;
+  using AdjointType = Product<RhsInverseType, LhsAdjointType, Option>;
+
+  // return rhs^-1 * lhs^T
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TransposeType run_transpose(const Derived& derived) {
+    return TransposeType(RhsInverseType(derived.rhs()), LhsTransposeType(derived.lhs()));
+  }
+  // return rhs^-1 * lhs^H
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AdjointType run_adjoint(const Derived& derived) {
+    return AdjointType(RhsInverseType(derived.rhs()), LhsAdjointType(LhsTransposeType(derived.lhs())));
+  }
+};
+
+}  // end namespace internal
+
+/** \class Product
+ * \ingroup Core_Module
+ *
+ * \brief Expression of the product of two arbitrary matrices or vectors
+ *
+ * \tparam Lhs_ the type of the left-hand side expression
+ * \tparam Rhs_ the type of the right-hand side expression
+ *
+ * This class represents an expression of the product of two arbitrary matrices.
+ *
+ * The other template parameters are:
+ * \tparam Option     can be DefaultProduct, AliasFreeProduct, or LazyProduct
+ *
+ */
+template <typename Lhs_, typename Rhs_, int Option>
+class Product
+    : public ProductImpl<Lhs_, Rhs_, Option,
+                         typename internal::product_promote_storage_type<
+                             typename internal::traits<Lhs_>::StorageKind, typename internal::traits<Rhs_>::StorageKind,
+                             internal::product_type<Lhs_, Rhs_>::ret>::ret> {
+ public:
+  typedef Lhs_ Lhs;
+  typedef Rhs_ Rhs;
+
+  typedef
+      typename ProductImpl<Lhs, Rhs, Option,
+                           typename internal::product_promote_storage_type<
+                               typename internal::traits<Lhs>::StorageKind, typename internal::traits<Rhs>::StorageKind,
+                               internal::product_type<Lhs, Rhs>::ret>::ret>::Base Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(Product)
+
+  typedef typename internal::ref_selector<Lhs>::type LhsNested;
+  typedef typename internal::ref_selector<Rhs>::type RhsNested;
+  typedef internal::remove_all_t<LhsNested> LhsNestedCleaned;
+  typedef internal::remove_all_t<RhsNested> RhsNestedCleaned;
+
+  using TransposeReturnType = typename internal::product_transpose_helper<Lhs, Rhs, Option>::TransposeType;
+  using AdjointReturnType = typename internal::product_transpose_helper<Lhs, Rhs, Option>::AdjointType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Product(const Lhs& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) {
+    eigen_assert(lhs.cols() == rhs.rows() && "invalid matrix product" &&
+                 "if you wanted a coeff-wise or a dot product use the respective explicit functions");
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_lhs.rows(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_rhs.cols(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const LhsNestedCleaned& lhs() const { return m_lhs; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const RhsNestedCleaned& rhs() const { return m_rhs; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TransposeReturnType transpose() const {
+    return internal::product_transpose_helper<Lhs, Rhs, Option>::run_transpose(*this);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AdjointReturnType adjoint() const {
+    return internal::product_transpose_helper<Lhs, Rhs, Option>::run_adjoint(*this);
+  }
+
+ protected:
+  LhsNested m_lhs;
+  RhsNested m_rhs;
+};
+
+namespace internal {
+
+template <typename Lhs, typename Rhs, int Option, int ProductTag = internal::product_type<Lhs, Rhs>::ret>
+class dense_product_base : public internal::dense_xpr_base<Product<Lhs, Rhs, Option>>::type {};
+
+/** Conversion to scalar for inner-products */
+template <typename Lhs, typename Rhs, int Option>
+class dense_product_base<Lhs, Rhs, Option, InnerProduct>
+    : public internal::dense_xpr_base<Product<Lhs, Rhs, Option>>::type {
+  typedef Product<Lhs, Rhs, Option> ProductXpr;
+  typedef typename internal::dense_xpr_base<ProductXpr>::type Base;
+
+ public:
+  using Base::derived;
+  typedef typename Base::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator const Scalar() const {
+    return internal::evaluator<ProductXpr>(derived()).coeff(0, 0);
+  }
+};
+
+}  // namespace internal
+
+// Generic API dispatcher
+template <typename Lhs, typename Rhs, int Option, typename StorageKind>
+class ProductImpl : public internal::generic_xpr_base<Product<Lhs, Rhs, Option>, MatrixXpr, StorageKind>::type {
+ public:
+  typedef typename internal::generic_xpr_base<Product<Lhs, Rhs, Option>, MatrixXpr, StorageKind>::type Base;
+};
+
+template <typename Lhs, typename Rhs, int Option>
+class ProductImpl<Lhs, Rhs, Option, Dense> : public internal::dense_product_base<Lhs, Rhs, Option> {
+  typedef Product<Lhs, Rhs, Option> Derived;
+
+ public:
+  typedef typename internal::dense_product_base<Lhs, Rhs, Option> Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
+ protected:
+  enum {
+    IsOneByOne = (RowsAtCompileTime == 1 || RowsAtCompileTime == Dynamic) &&
+                 (ColsAtCompileTime == 1 || ColsAtCompileTime == Dynamic),
+    EnableCoeff = IsOneByOne || Option == LazyProduct
+  };
+
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const {
+    EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
+    eigen_assert((Option == LazyProduct) || (this->rows() == 1 && this->cols() == 1));
+
+    return internal::evaluator<Derived>(derived()).coeff(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index i) const {
+    EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS);
+    eigen_assert((Option == LazyProduct) || (this->rows() == 1 && this->cols() == 1));
+
+    return internal::evaluator<Derived>(derived()).coeff(i);
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_PRODUCT_H
diff --git a/inst/include/Eigen/src/Core/ProductBase.h b/inst/include/Eigen/src/Core/ProductBase.h
deleted file mode 100644
index cf74470a..00000000
--- a/inst/include/Eigen/src/Core/ProductBase.h
+++ /dev/null
@@ -1,290 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_PRODUCTBASE_H
-#define EIGEN_PRODUCTBASE_H
-
-namespace Eigen { 
-
-/** \class ProductBase
-  * \ingroup Core_Module
-  *
-  */
-
-namespace internal {
-template<typename Derived, typename _Lhs, typename _Rhs>
-struct traits<ProductBase<Derived,_Lhs,_Rhs> >
-{
-  typedef MatrixXpr XprKind;
-  typedef typename remove_all<_Lhs>::type Lhs;
-  typedef typename remove_all<_Rhs>::type Rhs;
-  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
-  typedef typename promote_storage_type<typename traits<Lhs>::StorageKind,
-                                           typename traits<Rhs>::StorageKind>::ret StorageKind;
-  typedef typename promote_index_type<typename traits<Lhs>::Index,
-                                         typename traits<Rhs>::Index>::type Index;
-  enum {
-    RowsAtCompileTime = traits<Lhs>::RowsAtCompileTime,
-    ColsAtCompileTime = traits<Rhs>::ColsAtCompileTime,
-    MaxRowsAtCompileTime = traits<Lhs>::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = traits<Rhs>::MaxColsAtCompileTime,
-    Flags = (MaxRowsAtCompileTime==1 ? RowMajorBit : 0)
-          | EvalBeforeNestingBit | EvalBeforeAssigningBit | NestByRefBit,
-                  // Note that EvalBeforeNestingBit and NestByRefBit
-                  // are not used in practice because nested is overloaded for products
-    CoeffReadCost = 0 // FIXME why is it needed ?
-  };
-};
-}
-
-#define EIGEN_PRODUCT_PUBLIC_INTERFACE(Derived) \
-  typedef ProductBase<Derived, Lhs, Rhs > Base; \
-  EIGEN_DENSE_PUBLIC_INTERFACE(Derived) \
-  typedef typename Base::LhsNested LhsNested; \
-  typedef typename Base::_LhsNested _LhsNested; \
-  typedef typename Base::LhsBlasTraits LhsBlasTraits; \
-  typedef typename Base::ActualLhsType ActualLhsType; \
-  typedef typename Base::_ActualLhsType _ActualLhsType; \
-  typedef typename Base::RhsNested RhsNested; \
-  typedef typename Base::_RhsNested _RhsNested; \
-  typedef typename Base::RhsBlasTraits RhsBlasTraits; \
-  typedef typename Base::ActualRhsType ActualRhsType; \
-  typedef typename Base::_ActualRhsType _ActualRhsType; \
-  using Base::m_lhs; \
-  using Base::m_rhs;
-
-template<typename Derived, typename Lhs, typename Rhs>
-class ProductBase : public MatrixBase<Derived>
-{
-  public:
-    typedef MatrixBase<Derived> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(ProductBase)
-    
-    typedef typename Lhs::Nested LhsNested;
-    typedef typename internal::remove_all<LhsNested>::type _LhsNested;
-    typedef internal::blas_traits<_LhsNested> LhsBlasTraits;
-    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
-    typedef typename internal::remove_all<ActualLhsType>::type _ActualLhsType;
-    typedef typename internal::traits<Lhs>::Scalar LhsScalar;
-
-    typedef typename Rhs::Nested RhsNested;
-    typedef typename internal::remove_all<RhsNested>::type _RhsNested;
-    typedef internal::blas_traits<_RhsNested> RhsBlasTraits;
-    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
-    typedef typename internal::remove_all<ActualRhsType>::type _ActualRhsType;
-    typedef typename internal::traits<Rhs>::Scalar RhsScalar;
-
-    // Diagonal of a product: no need to evaluate the arguments because they are going to be evaluated only once
-    typedef CoeffBasedProduct<LhsNested, RhsNested, 0> FullyLazyCoeffBaseProductType;
-
-  public:
-
-#ifndef EIGEN_NO_MALLOC
-    typedef typename Base::PlainObject BasePlainObject;
-    typedef Matrix<Scalar,RowsAtCompileTime==1?1:Dynamic,ColsAtCompileTime==1?1:Dynamic,BasePlainObject::Options> DynPlainObject;
-    typedef typename internal::conditional<(BasePlainObject::SizeAtCompileTime==Dynamic) || (BasePlainObject::SizeAtCompileTime*int(sizeof(Scalar)) < int(EIGEN_STACK_ALLOCATION_LIMIT)),
-                                           BasePlainObject, DynPlainObject>::type PlainObject;
-#else
-    typedef typename Base::PlainObject PlainObject;
-#endif
-
-    ProductBase(const Lhs& a_lhs, const Rhs& a_rhs)
-      : m_lhs(a_lhs), m_rhs(a_rhs)
-    {
-      eigen_assert(a_lhs.cols() == a_rhs.rows()
-        && "invalid matrix product"
-        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
-    }
-
-    inline Index rows() const { return m_lhs.rows(); }
-    inline Index cols() const { return m_rhs.cols(); }
-
-    template<typename Dest>
-    inline void evalTo(Dest& dst) const { dst.setZero(); scaleAndAddTo(dst,Scalar(1)); }
-
-    template<typename Dest>
-    inline void addTo(Dest& dst) const { scaleAndAddTo(dst,Scalar(1)); }
-
-    template<typename Dest>
-    inline void subTo(Dest& dst) const { scaleAndAddTo(dst,Scalar(-1)); }
-
-    template<typename Dest>
-    inline void scaleAndAddTo(Dest& dst, const Scalar& alpha) const { derived().scaleAndAddTo(dst,alpha); }
-
-    const _LhsNested& lhs() const { return m_lhs; }
-    const _RhsNested& rhs() const { return m_rhs; }
-
-    // Implicit conversion to the nested type (trigger the evaluation of the product)
-    operator const PlainObject& () const
-    {
-      m_result.resize(m_lhs.rows(), m_rhs.cols());
-      derived().evalTo(m_result);
-      return m_result;
-    }
-
-    const Diagonal<const FullyLazyCoeffBaseProductType,0> diagonal() const
-    { return FullyLazyCoeffBaseProductType(m_lhs, m_rhs); }
-
-    template<int Index>
-    const Diagonal<FullyLazyCoeffBaseProductType,Index> diagonal() const
-    { return FullyLazyCoeffBaseProductType(m_lhs, m_rhs); }
-
-    const Diagonal<FullyLazyCoeffBaseProductType,Dynamic> diagonal(Index index) const
-    { return FullyLazyCoeffBaseProductType(m_lhs, m_rhs).diagonal(index); }
-
-    // restrict coeff accessors to 1x1 expressions. No need to care about mutators here since this isnt a Lvalue expression
-    typename Base::CoeffReturnType coeff(Index row, Index col) const
-    {
-#ifdef EIGEN2_SUPPORT
-      return lhs().row(row).cwiseProduct(rhs().col(col).transpose()).sum();
-#else
-      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
-      eigen_assert(this->rows() == 1 && this->cols() == 1);
-      Matrix<Scalar,1,1> result = *this;
-      return result.coeff(row,col);
-#endif
-    }
-
-    typename Base::CoeffReturnType coeff(Index i) const
-    {
-      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
-      eigen_assert(this->rows() == 1 && this->cols() == 1);
-      Matrix<Scalar,1,1> result = *this;
-      return result.coeff(i);
-    }
-
-    const Scalar& coeffRef(Index row, Index col) const
-    {
-      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
-      eigen_assert(this->rows() == 1 && this->cols() == 1);
-      return derived().coeffRef(row,col);
-    }
-
-    const Scalar& coeffRef(Index i) const
-    {
-      EIGEN_STATIC_ASSERT_SIZE_1x1(Derived)
-      eigen_assert(this->rows() == 1 && this->cols() == 1);
-      return derived().coeffRef(i);
-    }
-
-  protected:
-
-    LhsNested m_lhs;
-    RhsNested m_rhs;
-
-    mutable PlainObject m_result;
-};
-
-// here we need to overload the nested rule for products
-// such that the nested type is a const reference to a plain matrix
-namespace internal {
-template<typename Lhs, typename Rhs, int Mode, int N, typename PlainObject>
-struct nested<GeneralProduct<Lhs,Rhs,Mode>, N, PlainObject>
-{
-  typedef typename GeneralProduct<Lhs,Rhs,Mode>::PlainObject const& type;
-};
-template<typename Lhs, typename Rhs, int Mode, int N, typename PlainObject>
-struct nested<const GeneralProduct<Lhs,Rhs,Mode>, N, PlainObject>
-{
-  typedef typename GeneralProduct<Lhs,Rhs,Mode>::PlainObject const& type;
-};
-}
-
-template<typename NestedProduct>
-class ScaledProduct;
-
-// Note that these two operator* functions are not defined as member
-// functions of ProductBase, because, otherwise we would have to
-// define all overloads defined in MatrixBase. Furthermore, Using
-// "using Base::operator*" would not work with MSVC.
-//
-// Also note that here we accept any compatible scalar types
-template<typename Derived,typename Lhs,typename Rhs>
-const ScaledProduct<Derived>
-operator*(const ProductBase<Derived,Lhs,Rhs>& prod, const typename Derived::Scalar& x)
-{ return ScaledProduct<Derived>(prod.derived(), x); }
-
-template<typename Derived,typename Lhs,typename Rhs>
-typename internal::enable_if<!internal::is_same<typename Derived::Scalar,typename Derived::RealScalar>::value,
-                      const ScaledProduct<Derived> >::type
-operator*(const ProductBase<Derived,Lhs,Rhs>& prod, const typename Derived::RealScalar& x)
-{ return ScaledProduct<Derived>(prod.derived(), x); }
-
-
-template<typename Derived,typename Lhs,typename Rhs>
-const ScaledProduct<Derived>
-operator*(const typename Derived::Scalar& x,const ProductBase<Derived,Lhs,Rhs>& prod)
-{ return ScaledProduct<Derived>(prod.derived(), x); }
-
-template<typename Derived,typename Lhs,typename Rhs>
-typename internal::enable_if<!internal::is_same<typename Derived::Scalar,typename Derived::RealScalar>::value,
-                      const ScaledProduct<Derived> >::type
-operator*(const typename Derived::RealScalar& x,const ProductBase<Derived,Lhs,Rhs>& prod)
-{ return ScaledProduct<Derived>(prod.derived(), x); }
-
-namespace internal {
-template<typename NestedProduct>
-struct traits<ScaledProduct<NestedProduct> >
- : traits<ProductBase<ScaledProduct<NestedProduct>,
-                         typename NestedProduct::_LhsNested,
-                         typename NestedProduct::_RhsNested> >
-{
-  typedef typename traits<NestedProduct>::StorageKind StorageKind;
-};
-}
-
-template<typename NestedProduct>
-class ScaledProduct
-  : public ProductBase<ScaledProduct<NestedProduct>,
-                       typename NestedProduct::_LhsNested,
-                       typename NestedProduct::_RhsNested>
-{
-  public:
-    typedef ProductBase<ScaledProduct<NestedProduct>,
-                       typename NestedProduct::_LhsNested,
-                       typename NestedProduct::_RhsNested> Base;
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::PlainObject PlainObject;
-//     EIGEN_PRODUCT_PUBLIC_INTERFACE(ScaledProduct)
-
-    ScaledProduct(const NestedProduct& prod, const Scalar& x)
-    : Base(prod.lhs(),prod.rhs()), m_prod(prod), m_alpha(x) {}
-
-    template<typename Dest>
-    inline void evalTo(Dest& dst) const { dst.setZero(); scaleAndAddTo(dst, Scalar(1)); }
-
-    template<typename Dest>
-    inline void addTo(Dest& dst) const { scaleAndAddTo(dst, Scalar(1)); }
-
-    template<typename Dest>
-    inline void subTo(Dest& dst) const { scaleAndAddTo(dst, Scalar(-1)); }
-
-    template<typename Dest>
-    inline void scaleAndAddTo(Dest& dst, const Scalar& a_alpha) const { m_prod.derived().scaleAndAddTo(dst,a_alpha * m_alpha); }
-
-    const Scalar& alpha() const { return m_alpha; }
-    
-  protected:
-    const NestedProduct& m_prod;
-    Scalar m_alpha;
-};
-
-/** \internal
-  * Overloaded to perform an efficient C = (A*B).lazy() */
-template<typename Derived>
-template<typename ProductDerived, typename Lhs, typename Rhs>
-Derived& MatrixBase<Derived>::lazyAssign(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-{
-  other.derived().evalTo(derived());
-  return derived();
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_PRODUCTBASE_H
diff --git a/inst/include/Eigen/src/Core/ProductEvaluators.h b/inst/include/Eigen/src/Core/ProductEvaluators.h
new file mode 100644
index 00000000..be55be5e
--- /dev/null
+++ b/inst/include/Eigen/src/Core/ProductEvaluators.h
@@ -0,0 +1,1287 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PRODUCTEVALUATORS_H
+#define EIGEN_PRODUCTEVALUATORS_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+ * Evaluator of a product expression.
+ * Since products require special treatments to handle all possible cases,
+ * we simply defer the evaluation logic to a product_evaluator class
+ * which offers more partial specialization possibilities.
+ *
+ * \sa class product_evaluator
+ */
+template <typename Lhs, typename Rhs, int Options>
+struct evaluator<Product<Lhs, Rhs, Options>> : public product_evaluator<Product<Lhs, Rhs, Options>> {
+  typedef Product<Lhs, Rhs, Options> XprType;
+  typedef product_evaluator<XprType> Base;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+// Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B"
+// TODO we should apply that rule only if that's really helpful
+template <typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
+struct evaluator_assume_aliasing<CwiseBinaryOp<internal::scalar_product_op<Scalar1, Scalar2>,
+                                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                                               const Product<Lhs, Rhs, DefaultProduct>>> {
+  static const bool value = true;
+};
+template <typename Lhs, typename Rhs, typename Scalar1, typename Scalar2, typename Plain1>
+struct evaluator<CwiseBinaryOp<internal::scalar_product_op<Scalar1, Scalar2>,
+                               const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                               const Product<Lhs, Rhs, DefaultProduct>>>
+    : public evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1, Lhs, product), Rhs, DefaultProduct>> {
+  typedef CwiseBinaryOp<internal::scalar_product_op<Scalar1, Scalar2>,
+                        const CwiseNullaryOp<internal::scalar_constant_op<Scalar1>, Plain1>,
+                        const Product<Lhs, Rhs, DefaultProduct>>
+      XprType;
+  typedef evaluator<Product<EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar1, Lhs, product), Rhs, DefaultProduct>> Base;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
+      : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs()) {}
+};
+
+template <typename Lhs, typename Rhs, int DiagIndex>
+struct evaluator<Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex>>
+    : public evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>> {
+  typedef Diagonal<const Product<Lhs, Rhs, DefaultProduct>, DiagIndex> XprType;
+  typedef evaluator<Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>> Base;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr)
+      : Base(Diagonal<const Product<Lhs, Rhs, LazyProduct>, DiagIndex>(
+            Product<Lhs, Rhs, LazyProduct>(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()), xpr.index())) {}
+};
+
+// Helper class to perform a matrix product with the destination at hand.
+// Depending on the sizes of the factors, there are different evaluation strategies
+// as controlled by internal::product_type.
+template <typename Lhs, typename Rhs, typename LhsShape = typename evaluator_traits<Lhs>::Shape,
+          typename RhsShape = typename evaluator_traits<Rhs>::Shape,
+          int ProductType = internal::product_type<Lhs, Rhs>::value>
+struct generic_product_impl;
+
+template <typename Lhs, typename Rhs>
+struct evaluator_assume_aliasing<Product<Lhs, Rhs, DefaultProduct>> {
+  static const bool value = true;
+};
+
+// This is the default evaluator implementation for products:
+// It creates a temporary and call generic_product_impl
+template <typename Lhs, typename Rhs, int Options, int ProductTag, typename LhsShape, typename RhsShape>
+struct product_evaluator<Product<Lhs, Rhs, Options>, ProductTag, LhsShape, RhsShape>
+    : public evaluator<typename Product<Lhs, Rhs, Options>::PlainObject> {
+  typedef Product<Lhs, Rhs, Options> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+  enum { Flags = Base::Flags | EvalBeforeNestingBit };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit product_evaluator(const XprType& xpr)
+      : m_result(xpr.rows(), xpr.cols()) {
+    internal::construct_at<Base>(this, m_result);
+
+    // FIXME shall we handle nested_eval here?,
+    // if so, then we must take care at removing the call to nested_eval in the specializations (e.g., in
+    // permutation_matrix_product, transposition_matrix_product, etc.)
+    //     typedef typename internal::nested_eval<Lhs,Rhs::ColsAtCompileTime>::type LhsNested;
+    //     typedef typename internal::nested_eval<Rhs,Lhs::RowsAtCompileTime>::type RhsNested;
+    //     typedef internal::remove_all_t<LhsNested> LhsNestedCleaned;
+    //     typedef internal::remove_all_t<RhsNested> RhsNestedCleaned;
+    //
+    //     const LhsNested lhs(xpr.lhs());
+    //     const RhsNested rhs(xpr.rhs());
+    //
+    //     generic_product_impl<LhsNestedCleaned, RhsNestedCleaned>::evalTo(m_result, lhs, rhs);
+
+    generic_product_impl<Lhs, Rhs, LhsShape, RhsShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs());
+  }
+
+ protected:
+  PlainObject m_result;
+};
+
+// The following three shortcuts are enabled only if the scalar types match exactly.
+// TODO: we could enable them for different scalar types when the product is not vectorized.
+
+// Dense = Product
+template <typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs, Rhs, Options>, internal::assign_op<Scalar, Scalar>, Dense2Dense,
+                  std::enable_if_t<(Options == DefaultProduct || Options == AliasFreeProduct)>> {
+  typedef Product<Lhs, Rhs, Options> SrcXprType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src,
+                                                        const internal::assign_op<Scalar, Scalar>&) {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if ((dst.rows() != dstRows) || (dst.cols() != dstCols)) dst.resize(dstRows, dstCols);
+    // FIXME shall we handle nested_eval here?
+    generic_product_impl<Lhs, Rhs>::evalTo(dst, src.lhs(), src.rhs());
+  }
+};
+
+// Dense += Product
+template <typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs, Rhs, Options>, internal::add_assign_op<Scalar, Scalar>, Dense2Dense,
+                  std::enable_if_t<(Options == DefaultProduct || Options == AliasFreeProduct)>> {
+  typedef Product<Lhs, Rhs, Options> SrcXprType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src,
+                                                        const internal::add_assign_op<Scalar, Scalar>&) {
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    // FIXME shall we handle nested_eval here?
+    generic_product_impl<Lhs, Rhs>::addTo(dst, src.lhs(), src.rhs());
+  }
+};
+
+// Dense -= Product
+template <typename DstXprType, typename Lhs, typename Rhs, int Options, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs, Rhs, Options>, internal::sub_assign_op<Scalar, Scalar>, Dense2Dense,
+                  std::enable_if_t<(Options == DefaultProduct || Options == AliasFreeProduct)>> {
+  typedef Product<Lhs, Rhs, Options> SrcXprType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src,
+                                                        const internal::sub_assign_op<Scalar, Scalar>&) {
+    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
+    // FIXME shall we handle nested_eval here?
+    generic_product_impl<Lhs, Rhs>::subTo(dst, src.lhs(), src.rhs());
+  }
+};
+
+// Dense ?= scalar * Product
+// TODO we should apply that rule if that's really helpful
+// for instance, this is not good for inner products
+template <typename DstXprType, typename Lhs, typename Rhs, typename AssignFunc, typename Scalar, typename ScalarBis,
+          typename Plain>
+struct Assignment<DstXprType,
+                  CwiseBinaryOp<internal::scalar_product_op<ScalarBis, Scalar>,
+                                const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>, Plain>,
+                                const Product<Lhs, Rhs, DefaultProduct>>,
+                  AssignFunc, Dense2Dense> {
+  typedef CwiseBinaryOp<internal::scalar_product_op<ScalarBis, Scalar>,
+                        const CwiseNullaryOp<internal::scalar_constant_op<ScalarBis>, Plain>,
+                        const Product<Lhs, Rhs, DefaultProduct>>
+      SrcXprType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src,
+                                                        const AssignFunc& func) {
+    call_assignment_no_alias(dst, (src.lhs().functor().m_other * src.rhs().lhs()) * src.rhs().rhs(), func);
+  }
+};
+
+//----------------------------------------
+// Catch "Dense ?= xpr + Product<>" expression to save one temporary
+// FIXME we could probably enable these rules for any product, i.e., not only Dense and DefaultProduct
+
+template <typename OtherXpr, typename Lhs, typename Rhs>
+struct evaluator_assume_aliasing<
+    CwiseBinaryOp<
+        internal::scalar_sum_op<typename OtherXpr::Scalar, typename Product<Lhs, Rhs, DefaultProduct>::Scalar>,
+        const OtherXpr, const Product<Lhs, Rhs, DefaultProduct>>,
+    DenseShape> {
+  static const bool value = true;
+};
+
+template <typename OtherXpr, typename Lhs, typename Rhs>
+struct evaluator_assume_aliasing<
+    CwiseBinaryOp<
+        internal::scalar_difference_op<typename OtherXpr::Scalar, typename Product<Lhs, Rhs, DefaultProduct>::Scalar>,
+        const OtherXpr, const Product<Lhs, Rhs, DefaultProduct>>,
+    DenseShape> {
+  static const bool value = true;
+};
+
+template <typename DstXprType, typename OtherXpr, typename ProductType, typename Func1, typename Func2>
+struct assignment_from_xpr_op_product {
+  template <typename SrcXprType, typename InitialFunc>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType& dst, const SrcXprType& src,
+                                                        const InitialFunc& /*func*/) {
+    call_assignment_no_alias(dst, src.lhs(), Func1());
+    call_assignment_no_alias(dst, src.rhs(), Func2());
+  }
+};
+
+#define EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(ASSIGN_OP, BINOP, ASSIGN_OP2)                             \
+  template <typename DstXprType, typename OtherXpr, typename Lhs, typename Rhs, typename DstScalar, \
+            typename SrcScalar, typename OtherScalar, typename ProdScalar>                          \
+  struct Assignment<DstXprType,                                                                     \
+                    CwiseBinaryOp<internal::BINOP<OtherScalar, ProdScalar>, const OtherXpr,         \
+                                  const Product<Lhs, Rhs, DefaultProduct>>,                         \
+                    internal::ASSIGN_OP<DstScalar, SrcScalar>, Dense2Dense>                         \
+      : assignment_from_xpr_op_product<DstXprType, OtherXpr, Product<Lhs, Rhs, DefaultProduct>,     \
+                                       internal::ASSIGN_OP<DstScalar, OtherScalar>,                 \
+                                       internal::ASSIGN_OP2<DstScalar, ProdScalar>> {}
+
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op, scalar_sum_op, add_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op, scalar_sum_op, add_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op, scalar_sum_op, sub_assign_op);
+
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(assign_op, scalar_difference_op, sub_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(add_assign_op, scalar_difference_op, sub_assign_op);
+EIGEN_CATCH_ASSIGN_XPR_OP_PRODUCT(sub_assign_op, scalar_difference_op, add_assign_op);
+
+//----------------------------------------
+
+template <typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, InnerProduct> {
+  using impl = default_inner_product_impl<Lhs, Rhs, false>;
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    dst.coeffRef(0, 0) = impl::run(lhs, rhs);
+  }
+
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    dst.coeffRef(0, 0) += impl::run(lhs, rhs);
+  }
+
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    dst.coeffRef(0, 0) -= impl::run(lhs, rhs);
+  }
+};
+
+/***********************************************************************
+ *  Implementation of outer dense * dense vector product
+ ***********************************************************************/
+
+// Column major result
+template <typename Dst, typename Lhs, typename Rhs, typename Func>
+void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func& func,
+                                                  const false_type&) {
+  evaluator<Rhs> rhsEval(rhs);
+  ei_declare_local_nested_eval(Lhs, lhs, Rhs::SizeAtCompileTime, actual_lhs);
+  // FIXME if cols is large enough, then it might be useful to make sure that lhs is sequentially stored
+  // FIXME not very good if rhs is real and lhs complex while alpha is real too
+  const Index cols = dst.cols();
+  for (Index j = 0; j < cols; ++j) func(dst.col(j), rhsEval.coeff(Index(0), j) * actual_lhs);
+}
+
+// Row major result
+template <typename Dst, typename Lhs, typename Rhs, typename Func>
+void EIGEN_DEVICE_FUNC outer_product_selector_run(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Func& func,
+                                                  const true_type&) {
+  evaluator<Lhs> lhsEval(lhs);
+  ei_declare_local_nested_eval(Rhs, rhs, Lhs::SizeAtCompileTime, actual_rhs);
+  // FIXME if rows is large enough, then it might be useful to make sure that rhs is sequentially stored
+  // FIXME not very good if lhs is real and rhs complex while alpha is real too
+  const Index rows = dst.rows();
+  for (Index i = 0; i < rows; ++i) func(dst.row(i), lhsEval.coeff(i, Index(0)) * actual_rhs);
+}
+
+template <typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, OuterProduct> {
+  template <typename T>
+  struct is_row_major : bool_constant<(int(T::Flags) & RowMajorBit)> {};
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+
+  // TODO it would be nice to be able to exploit our *_assign_op functors for that purpose
+  struct set {
+    template <typename Dst, typename Src>
+    EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const {
+      dst.const_cast_derived() = src;
+    }
+  };
+  struct add {
+    /** Add to dst. */
+    template <typename Dst, typename Src>
+    EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const {
+      dst.const_cast_derived() += src;
+    }
+  };
+  struct sub {
+    template <typename Dst, typename Src>
+    EIGEN_DEVICE_FUNC void operator()(const Dst& dst, const Src& src) const {
+      dst.const_cast_derived() -= src;
+    }
+  };
+  /** Scaled add. */
+  struct adds {
+    Scalar m_scale;
+    /** Constructor */
+    explicit adds(const Scalar& s) : m_scale(s) {}
+    /** Scaled add to dst. */
+    template <typename Dst, typename Src>
+    void EIGEN_DEVICE_FUNC operator()(const Dst& dst, const Src& src) const {
+      dst.const_cast_derived() += m_scale * src;
+    }
+  };
+
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major<Dst>());
+  }
+
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major<Dst>());
+  }
+
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major<Dst>());
+  }
+
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs,
+                                                                  const Scalar& alpha) {
+    internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major<Dst>());
+  }
+};
+
+// This base class provides default implementations for evalTo, addTo, subTo, in terms of scaleAndAddTo
+template <typename Lhs, typename Rhs, typename Derived>
+struct generic_product_impl_base {
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    dst.setZero();
+    scaleAndAddTo(dst, lhs, rhs, Scalar(1));
+  }
+
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    scaleAndAddTo(dst, lhs, rhs, Scalar(1));
+  }
+
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
+  }
+
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs,
+                                                                  const Scalar& alpha) {
+    Derived::scaleAndAddTo(dst, lhs, rhs, alpha);
+  }
+};
+
+template <typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, GemvProduct>
+    : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, GemvProduct>> {
+  typedef typename nested_eval<Lhs, 1>::type LhsNested;
+  typedef typename nested_eval<Rhs, 1>::type RhsNested;
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+  enum { Side = Lhs::IsVectorAtCompileTime ? OnTheLeft : OnTheRight };
+  typedef internal::remove_all_t<std::conditional_t<int(Side) == OnTheRight, LhsNested, RhsNested>> MatrixType;
+
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs,
+                                                                  const Scalar& alpha) {
+    // Fallback to inner product if both the lhs and rhs is a runtime vector.
+    if (lhs.rows() == 1 && rhs.cols() == 1) {
+      dst.coeffRef(0, 0) += alpha * lhs.row(0).conjugate().dot(rhs.col(0));
+      return;
+    }
+    LhsNested actual_lhs(lhs);
+    RhsNested actual_rhs(rhs);
+    internal::gemv_dense_selector<Side, (int(MatrixType::Flags) & RowMajorBit) ? RowMajor : ColMajor,
+                                  bool(internal::blas_traits<MatrixType>::HasUsableDirectAccess)>::run(actual_lhs,
+                                                                                                       actual_rhs, dst,
+                                                                                                       alpha);
+  }
+};
+
+template <typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, CoeffBasedProductMode> {
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    // Same as: dst.noalias() = lhs.lazyProduct(rhs);
+    // but easier on the compiler side
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op<typename Dst::Scalar, Scalar>());
+  }
+
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    // dst.noalias() += lhs.lazyProduct(rhs);
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar, Scalar>());
+  }
+
+  template <typename Dst>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    // dst.noalias() -= lhs.lazyProduct(rhs);
+    call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar, Scalar>());
+  }
+
+  // This is a special evaluation path called from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h
+  // This variant tries to extract scalar multiples from both the LHS and RHS and factor them out. For instance:
+  //   dst {,+,-}= (s1*A)*(B*s2)
+  // will be rewritten as:
+  //   dst {,+,-}= (s1*s2) * (A.lazyProduct(B))
+  // There are at least four benefits of doing so:
+  //  1 - huge performance gain for heap-allocated matrix types as it save costly allocations.
+  //  2 - it is faster than simply by-passing the heap allocation through stack allocation.
+  //  3 - it makes this fallback consistent with the heavy GEMM routine.
+  //  4 - it fully by-passes huge stack allocation attempts when multiplying huge fixed-size matrices.
+  //      (see https://stackoverflow.com/questions/54738495)
+  // For small fixed sizes matrices, however, the gains are less obvious, it is sometimes x2 faster, but sometimes x3
+  // slower, and the behavior depends also a lot on the compiler... This is why this re-writing strategy is currently
+  // enabled only when falling back from the main GEMM.
+  template <typename Dst, typename Func>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void eval_dynamic(Dst& dst, const Lhs& lhs, const Rhs& rhs,
+                                                                 const Func& func) {
+    enum {
+      HasScalarFactor = blas_traits<Lhs>::HasScalarFactor || blas_traits<Rhs>::HasScalarFactor,
+      ConjLhs = blas_traits<Lhs>::NeedToConjugate,
+      ConjRhs = blas_traits<Rhs>::NeedToConjugate
+    };
+    // FIXME: in c++11 this should be auto, and extractScalarFactor should also return auto
+    //        this is important for real*complex_mat
+    Scalar actualAlpha = combine_scalar_factors<Scalar>(lhs, rhs);
+
+    eval_dynamic_impl(dst, blas_traits<Lhs>::extract(lhs).template conjugateIf<ConjLhs>(),
+                      blas_traits<Rhs>::extract(rhs).template conjugateIf<ConjRhs>(), func, actualAlpha,
+                      bool_constant<HasScalarFactor>());
+  }
+
+ protected:
+  template <typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs,
+                                                                      const Func& func, const Scalar& s /* == 1 */,
+                                                                      false_type) {
+    EIGEN_UNUSED_VARIABLE(s);
+    eigen_internal_assert(numext::is_exactly_one(s));
+    call_restricted_packet_assignment_no_alias(dst, lhs.lazyProduct(rhs), func);
+  }
+
+  template <typename Dst, typename LhsT, typename RhsT, typename Func, typename Scalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void eval_dynamic_impl(Dst& dst, const LhsT& lhs, const RhsT& rhs,
+                                                                      const Func& func, const Scalar& s, true_type) {
+    call_restricted_packet_assignment_no_alias(dst, s * lhs.lazyProduct(rhs), func);
+  }
+};
+
+// This specialization enforces the use of a coefficient-based evaluation strategy
+template <typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, LazyCoeffBasedProductMode>
+    : generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, CoeffBasedProductMode> {};
+
+// Case 2: Evaluate coeff by coeff
+//
+// This is mostly taken from CoeffBasedProduct.h
+// The main difference is that we add an extra argument to the etor_product_*_impl::run() function
+// for the inner dimension of the product, because evaluator object do not know their size.
+
+template <int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
+struct etor_product_coeff_impl;
+
+template <int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl;
+
+template <typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, DenseShape>
+    : evaluator_base<Product<Lhs, Rhs, LazyProduct>> {
+  typedef Product<Lhs, Rhs, LazyProduct> XprType;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit product_evaluator(const XprType& xpr)
+      : m_lhs(xpr.lhs()),
+        m_rhs(xpr.rhs()),
+        m_lhsImpl(m_lhs),  // FIXME the creation of the evaluator objects should result in a no-op, but check that!
+        m_rhsImpl(m_rhs),  //       Moreover, they are only useful for the packet path, so we could completely disable
+                           //       them when not needed, or perhaps declare them on the fly on the packet method... We
+                           //       have experiment to check what's best.
+        m_innerDim(xpr.lhs().cols()) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::AddCost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+#if 0
+    std::cerr << "LhsOuterStrideBytes=  " << LhsOuterStrideBytes << "\n";
+    std::cerr << "RhsOuterStrideBytes=  " << RhsOuterStrideBytes << "\n";
+    std::cerr << "LhsAlignment=         " << LhsAlignment << "\n";
+    std::cerr << "RhsAlignment=         " << RhsAlignment << "\n";
+    std::cerr << "CanVectorizeLhs=      " << CanVectorizeLhs << "\n";
+    std::cerr << "CanVectorizeRhs=      " << CanVectorizeRhs << "\n";
+    std::cerr << "CanVectorizeInner=    " << CanVectorizeInner << "\n";
+    std::cerr << "EvalToRowMajor=       " << EvalToRowMajor << "\n";
+    std::cerr << "Alignment=            " << Alignment << "\n";
+    std::cerr << "Flags=                " << Flags << "\n";
+#endif
+  }
+
+  // Everything below here is taken from CoeffBasedProduct.h
+
+  typedef typename internal::nested_eval<Lhs, Rhs::ColsAtCompileTime>::type LhsNested;
+  typedef typename internal::nested_eval<Rhs, Lhs::RowsAtCompileTime>::type RhsNested;
+
+  typedef internal::remove_all_t<LhsNested> LhsNestedCleaned;
+  typedef internal::remove_all_t<RhsNested> RhsNestedCleaned;
+
+  typedef evaluator<LhsNestedCleaned> LhsEtorType;
+  typedef evaluator<RhsNestedCleaned> RhsEtorType;
+
+  enum {
+    RowsAtCompileTime = LhsNestedCleaned::RowsAtCompileTime,
+    ColsAtCompileTime = RhsNestedCleaned::ColsAtCompileTime,
+    InnerSize = min_size_prefer_fixed(LhsNestedCleaned::ColsAtCompileTime, RhsNestedCleaned::RowsAtCompileTime),
+    MaxRowsAtCompileTime = LhsNestedCleaned::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = RhsNestedCleaned::MaxColsAtCompileTime
+  };
+
+  typedef typename find_best_packet<Scalar, RowsAtCompileTime>::type LhsVecPacketType;
+  typedef typename find_best_packet<Scalar, ColsAtCompileTime>::type RhsVecPacketType;
+
+  enum {
+
+    LhsCoeffReadCost = LhsEtorType::CoeffReadCost,
+    RhsCoeffReadCost = RhsEtorType::CoeffReadCost,
+    CoeffReadCost = InnerSize == 0 ? NumTraits<Scalar>::ReadCost
+                    : InnerSize == Dynamic
+                        ? HugeCost
+                        : InnerSize * (NumTraits<Scalar>::MulCost + int(LhsCoeffReadCost) + int(RhsCoeffReadCost)) +
+                              (InnerSize - 1) * NumTraits<Scalar>::AddCost,
+
+    Unroll = CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
+
+    LhsFlags = LhsEtorType::Flags,
+    RhsFlags = RhsEtorType::Flags,
+
+    LhsRowMajor = LhsFlags & RowMajorBit,
+    RhsRowMajor = RhsFlags & RowMajorBit,
+
+    LhsVecPacketSize = unpacket_traits<LhsVecPacketType>::size,
+    RhsVecPacketSize = unpacket_traits<RhsVecPacketType>::size,
+
+    // Here, we don't care about alignment larger than the usable packet size.
+    LhsAlignment =
+        plain_enum_min(LhsEtorType::Alignment, LhsVecPacketSize* int(sizeof(typename LhsNestedCleaned::Scalar))),
+    RhsAlignment =
+        plain_enum_min(RhsEtorType::Alignment, RhsVecPacketSize* int(sizeof(typename RhsNestedCleaned::Scalar))),
+
+    SameType = is_same<typename LhsNestedCleaned::Scalar, typename RhsNestedCleaned::Scalar>::value,
+
+    CanVectorizeRhs = bool(RhsRowMajor) && (RhsFlags & PacketAccessBit) && (ColsAtCompileTime != 1),
+    CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit) && (RowsAtCompileTime != 1),
+
+    EvalToRowMajor = (MaxRowsAtCompileTime == 1 && MaxColsAtCompileTime != 1) ? 1
+                     : (MaxColsAtCompileTime == 1 && MaxRowsAtCompileTime != 1)
+                         ? 0
+                         : (bool(RhsRowMajor) && !CanVectorizeLhs),
+
+    Flags = ((int(LhsFlags) | int(RhsFlags)) & HereditaryBits & ~RowMajorBit) |
+            (EvalToRowMajor ? RowMajorBit : 0)
+            // TODO enable vectorization for mixed types
+            | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0) |
+            (XprType::IsVectorAtCompileTime ? LinearAccessBit : 0),
+
+    LhsOuterStrideBytes =
+        int(LhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename LhsNestedCleaned::Scalar)),
+    RhsOuterStrideBytes =
+        int(RhsNestedCleaned::OuterStrideAtCompileTime) * int(sizeof(typename RhsNestedCleaned::Scalar)),
+
+    Alignment = bool(CanVectorizeLhs)
+                    ? (LhsOuterStrideBytes <= 0 || (int(LhsOuterStrideBytes) % plain_enum_max(1, LhsAlignment)) != 0
+                           ? 0
+                           : LhsAlignment)
+                : bool(CanVectorizeRhs)
+                    ? (RhsOuterStrideBytes <= 0 || (int(RhsOuterStrideBytes) % plain_enum_max(1, RhsAlignment)) != 0
+                           ? 0
+                           : RhsAlignment)
+                    : 0,
+
+    /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
+     * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
+     * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
+     * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
+     */
+    CanVectorizeInner = SameType && LhsRowMajor && (!RhsRowMajor) &&
+                        (int(LhsFlags) & int(RhsFlags) & ActualPacketAccessBit) &&
+                        (int(InnerSize) % packet_traits<Scalar>::size == 0)
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index row, Index col) const {
+    return (m_lhs.row(row).transpose().cwiseProduct(m_rhs.col(col))).sum();
+  }
+
+  /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
+   * which is why we don't set the LinearAccessBit.
+   * TODO: this seems possible when the result is a vector
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index index) const {
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? 0 : index;
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? index : 0;
+    return (m_lhs.row(row).transpose().cwiseProduct(m_rhs.col(col))).sum();
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packet(Index row, Index col) const {
+    PacketType res;
+    typedef etor_product_packet_impl<bool(int(Flags) & RowMajorBit) ? RowMajor : ColMajor,
+                                     Unroll ? int(InnerSize) : Dynamic, LhsEtorType, RhsEtorType, PacketType, LoadMode>
+        PacketImpl;
+    PacketImpl::run(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res);
+    return res;
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packet(Index index) const {
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? 0 : index;
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? index : 0;
+    return packet<LoadMode, PacketType>(row, col);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetSegment(Index row, Index col, Index begin,
+                                                                       Index count) const {
+    PacketType res;
+    typedef etor_product_packet_impl<bool(int(Flags) & RowMajorBit) ? RowMajor : ColMajor,
+                                     Unroll ? int(InnerSize) : Dynamic, LhsEtorType, RhsEtorType, PacketType, LoadMode>
+        PacketImpl;
+    PacketImpl::run_segment(row, col, m_lhsImpl, m_rhsImpl, m_innerDim, res, begin, count);
+    return res;
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetSegment(Index index, Index begin, Index count) const {
+    const Index row = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? 0 : index;
+    const Index col = (RowsAtCompileTime == 1 || MaxRowsAtCompileTime == 1) ? index : 0;
+    return packetSegment<LoadMode, PacketType>(row, col, begin, count);
+  }
+
+ protected:
+  add_const_on_value_type_t<LhsNested> m_lhs;
+  add_const_on_value_type_t<RhsNested> m_rhs;
+
+  LhsEtorType m_lhsImpl;
+  RhsEtorType m_rhsImpl;
+
+  // TODO: Get rid of m_innerDim if known at compile time
+  Index m_innerDim;
+};
+
+template <typename Lhs, typename Rhs>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, LazyCoeffBasedProductMode, DenseShape, DenseShape>
+    : product_evaluator<Product<Lhs, Rhs, LazyProduct>, CoeffBasedProductMode, DenseShape, DenseShape> {
+  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
+  typedef Product<Lhs, Rhs, LazyProduct> BaseProduct;
+  typedef product_evaluator<BaseProduct, CoeffBasedProductMode, DenseShape, DenseShape> Base;
+  enum { Flags = Base::Flags | EvalBeforeNestingBit };
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit product_evaluator(const XprType& xpr)
+      : Base(BaseProduct(xpr.lhs(), xpr.rhs())) {}
+};
+
+/****************************************
+*** Coeff based product, Packet path  ***
+****************************************/
+
+template <int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                        Index innerDim, Packet& res) {
+    etor_product_packet_impl<RowMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs,
+                                                                                            innerDim, res);
+    res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex - 1))),
+                rhs.template packet<LoadMode, Packet>(Index(UnrollingIndex - 1), col), res);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index innerDim, Packet& res, Index begin, Index count) {
+    etor_product_packet_impl<RowMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run_segment(
+        row, col, lhs, rhs, innerDim, res, begin, count);
+    res = pmadd(pset1<Packet>(lhs.coeff(row, Index(UnrollingIndex - 1))),
+                rhs.template packetSegment<LoadMode, Packet>(Index(UnrollingIndex - 1), col, begin, count), res);
+  }
+};
+
+template <int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                        Index innerDim, Packet& res) {
+    etor_product_packet_impl<ColMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs,
+                                                                                            innerDim, res);
+    res = pmadd(lhs.template packet<LoadMode, Packet>(row, Index(UnrollingIndex - 1)),
+                pset1<Packet>(rhs.coeff(Index(UnrollingIndex - 1), col)), res);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index innerDim, Packet& res, Index begin, Index count) {
+    etor_product_packet_impl<ColMajor, UnrollingIndex - 1, Lhs, Rhs, Packet, LoadMode>::run_segment(
+        row, col, lhs, rhs, innerDim, res, begin, count);
+    res = pmadd(lhs.template packetSegment<LoadMode, Packet>(row, Index(UnrollingIndex - 1), begin, count),
+                pset1<Packet>(rhs.coeff(Index(UnrollingIndex - 1), col)), res);
+  }
+};
+
+template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                        Index /*innerDim*/, Packet& res) {
+    res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))), rhs.template packet<LoadMode, Packet>(Index(0), col));
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index /*innerDim*/, Packet& res, Index begin,
+                                                                Index count) {
+    res = pmul(pset1<Packet>(lhs.coeff(row, Index(0))),
+               rhs.template packetSegment<LoadMode, Packet>(Index(0), col, begin, count));
+  }
+};
+
+template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                        Index /*innerDim*/, Packet& res) {
+    res = pmul(lhs.template packet<LoadMode, Packet>(row, Index(0)), pset1<Packet>(rhs.coeff(Index(0), col)));
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index /*innerDim*/, Packet& res, Index begin,
+                                                                Index count) {
+    res = pmul(lhs.template packetSegment<LoadMode, Packet>(row, Index(0), begin, count),
+               pset1<Packet>(rhs.coeff(Index(0), col)));
+  }
+};
+
+template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/,
+                                                        const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/,
+                                                                const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res,
+                                                                Index /*begin*/, Index /*count*/) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+  }
+};
+
+template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/,
+                                                        const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/,
+                                                                const Rhs& /*rhs*/, Index /*innerDim*/, Packet& res,
+                                                                Index /*begin*/, Index /*count*/) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+  }
+};
+
+template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                        Index innerDim, Packet& res) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+    for (Index i = 0; i < innerDim; ++i)
+      res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode, Packet>(i, col), res);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index innerDim, Packet& res, Index begin, Index count) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+    for (Index i = 0; i < innerDim; ++i)
+      res = pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packetSegment<LoadMode, Packet>(i, col, begin, count),
+                  res);
+  }
+};
+
+template <typename Lhs, typename Rhs, typename Packet, int LoadMode>
+struct etor_product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                        Index innerDim, Packet& res) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+    for (Index i = 0; i < innerDim; ++i)
+      res = pmadd(lhs.template packet<LoadMode, Packet>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run_segment(Index row, Index col, const Lhs& lhs, const Rhs& rhs,
+                                                                Index innerDim, Packet& res, Index begin, Index count) {
+    res = pset1<Packet>(typename unpacket_traits<Packet>::type(0));
+    for (Index i = 0; i < innerDim; ++i)
+      res = pmadd(lhs.template packetSegment<LoadMode, Packet>(row, i, begin, count), pset1<Packet>(rhs.coeff(i, col)),
+                  res);
+  }
+};
+
+/***************************************************************************
+ * Triangular products
+ ***************************************************************************/
+template <int Mode, bool LhsIsTriangular, typename Lhs, bool LhsIsVector, typename Rhs, bool RhsIsVector>
+struct triangular_product_impl;
+
+template <typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs, Rhs, TriangularShape, DenseShape, ProductTag>
+    : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, TriangularShape, DenseShape, ProductTag>> {
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+
+  template <typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
+    triangular_product_impl<Lhs::Mode, true, typename Lhs::MatrixType, false, Rhs, Rhs::ColsAtCompileTime == 1>::run(
+        dst, lhs.nestedExpression(), rhs, alpha);
+  }
+};
+
+template <typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs, Rhs, DenseShape, TriangularShape, ProductTag>
+    : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, DenseShape, TriangularShape, ProductTag>> {
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+
+  template <typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
+    triangular_product_impl<Rhs::Mode, false, Lhs, Lhs::RowsAtCompileTime == 1, typename Rhs::MatrixType, false>::run(
+        dst, lhs, rhs.nestedExpression(), alpha);
+  }
+};
+
+/***************************************************************************
+ * SelfAdjoint products
+ ***************************************************************************/
+template <typename Lhs, int LhsMode, bool LhsIsVector, typename Rhs, int RhsMode, bool RhsIsVector>
+struct selfadjoint_product_impl;
+
+template <typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs, Rhs, SelfAdjointShape, DenseShape, ProductTag>
+    : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, SelfAdjointShape, DenseShape, ProductTag>> {
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
+    selfadjoint_product_impl<typename Lhs::MatrixType, Lhs::Mode, false, Rhs, 0, Rhs::ColsAtCompileTime == 1>::run(
+        dst, lhs.nestedExpression(), rhs, alpha);
+  }
+};
+
+template <typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs, Rhs, DenseShape, SelfAdjointShape, ProductTag>
+    : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, DenseShape, SelfAdjointShape, ProductTag>> {
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+
+  template <typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
+    selfadjoint_product_impl<Lhs, 0, Lhs::RowsAtCompileTime == 1, typename Rhs::MatrixType, Rhs::Mode, false>::run(
+        dst, lhs, rhs.nestedExpression(), alpha);
+  }
+};
+
+/***************************************************************************
+ * Diagonal products
+ ***************************************************************************/
+
+template <typename MatrixType, typename DiagonalType, typename Derived, int ProductOrder>
+struct diagonal_product_evaluator_base : evaluator_base<Derived> {
+  typedef typename ScalarBinaryOpTraits<typename MatrixType::Scalar, typename DiagonalType::Scalar>::ReturnType Scalar;
+
+ public:
+  enum {
+    CoeffReadCost = int(NumTraits<Scalar>::MulCost) + int(evaluator<MatrixType>::CoeffReadCost) +
+                    int(evaluator<DiagonalType>::CoeffReadCost),
+
+    MatrixFlags = evaluator<MatrixType>::Flags,
+    DiagFlags = evaluator<DiagonalType>::Flags,
+
+    StorageOrder_ = (Derived::MaxRowsAtCompileTime == 1 && Derived::MaxColsAtCompileTime != 1)   ? RowMajor
+                    : (Derived::MaxColsAtCompileTime == 1 && Derived::MaxRowsAtCompileTime != 1) ? ColMajor
+                    : MatrixFlags & RowMajorBit                                                  ? RowMajor
+                                                                                                 : ColMajor,
+    SameStorageOrder_ = int(StorageOrder_) == ((MatrixFlags & RowMajorBit) ? RowMajor : ColMajor),
+
+    ScalarAccessOnDiag_ = !((int(StorageOrder_) == ColMajor && int(ProductOrder) == OnTheLeft) ||
+                            (int(StorageOrder_) == RowMajor && int(ProductOrder) == OnTheRight)),
+    SameTypes_ = is_same<typename MatrixType::Scalar, typename DiagonalType::Scalar>::value,
+    // FIXME currently we need same types, but in the future the next rule should be the one
+    // Vectorizable_ = bool(int(MatrixFlags)&PacketAccessBit) && ((!_PacketOnDiag) || (SameTypes_ &&
+    // bool(int(DiagFlags)&PacketAccessBit))),
+    Vectorizable_ = bool(int(MatrixFlags) & PacketAccessBit) && SameTypes_ &&
+                    (SameStorageOrder_ || (MatrixFlags & LinearAccessBit) == LinearAccessBit) &&
+                    (ScalarAccessOnDiag_ || (bool(int(DiagFlags) & PacketAccessBit))),
+    LinearAccessMask_ =
+        (MatrixType::RowsAtCompileTime == 1 || MatrixType::ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+    Flags =
+        ((HereditaryBits | LinearAccessMask_) & (unsigned int)(MatrixFlags)) | (Vectorizable_ ? PacketAccessBit : 0),
+    Alignment = evaluator<MatrixType>::Alignment,
+
+    AsScalarProduct =
+        (DiagonalType::SizeAtCompileTime == 1) ||
+        (DiagonalType::SizeAtCompileTime == Dynamic && MatrixType::RowsAtCompileTime == 1 &&
+         ProductOrder == OnTheLeft) ||
+        (DiagonalType::SizeAtCompileTime == Dynamic && MatrixType::ColsAtCompileTime == 1 && ProductOrder == OnTheRight)
+  };
+
+  EIGEN_DEVICE_FUNC diagonal_product_evaluator_base(const MatrixType& mat, const DiagonalType& diag)
+      : m_diagImpl(diag), m_matImpl(mat) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(NumTraits<Scalar>::MulCost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const {
+    if (AsScalarProduct)
+      return m_diagImpl.coeff(0) * m_matImpl.coeff(idx);
+    else
+      return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx);
+  }
+
+ protected:
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::true_type) const {
+    return internal::pmul(m_matImpl.template packet<LoadMode, PacketType>(row, col),
+                          internal::pset1<PacketType>(m_diagImpl.coeff(id)));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_impl(Index row, Index col, Index id, internal::false_type) const {
+    enum {
+      InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
+      DiagonalPacketLoadMode = plain_enum_min(
+          LoadMode,
+          ((InnerSize % 16) == 0) ? int(Aligned16) : int(evaluator<DiagonalType>::Alignment))  // FIXME hardcoded 16!!
+    };
+    return internal::pmul(m_matImpl.template packet<LoadMode, PacketType>(row, col),
+                          m_diagImpl.template packet<DiagonalPacketLoadMode, PacketType>(id));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_segment_impl(Index row, Index col, Index id, Index begin, Index count,
+                                                     internal::true_type) const {
+    return internal::pmul(m_matImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+                          internal::pset1<PacketType>(m_diagImpl.coeff(id)));
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet_segment_impl(Index row, Index col, Index id, Index begin, Index count,
+                                                     internal::false_type) const {
+    enum {
+      InnerSize = (MatrixType::Flags & RowMajorBit) ? MatrixType::ColsAtCompileTime : MatrixType::RowsAtCompileTime,
+      DiagonalPacketLoadMode = plain_enum_min(
+          LoadMode,
+          ((InnerSize % 16) == 0) ? int(Aligned16) : int(evaluator<DiagonalType>::Alignment))  // FIXME hardcoded 16!!
+    };
+    return internal::pmul(m_matImpl.template packetSegment<LoadMode, PacketType>(row, col, begin, count),
+                          m_diagImpl.template packetSegment<DiagonalPacketLoadMode, PacketType>(id, begin, count));
+  }
+
+  evaluator<DiagonalType> m_diagImpl;
+  evaluator<MatrixType> m_matImpl;
+};
+
+// diagonal * dense
+template <typename Lhs, typename Rhs, int ProductKind, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DiagonalShape, DenseShape>
+    : diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>,
+                                      OnTheLeft> {
+  typedef diagonal_product_evaluator_base<Rhs, typename Lhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>,
+                                          OnTheLeft>
+      Base;
+  using Base::coeff;
+  using Base::m_diagImpl;
+  using Base::m_matImpl;
+  typedef typename Base::Scalar Scalar;
+
+  typedef Product<Lhs, Rhs, ProductKind> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef typename Lhs::DiagonalVectorType DiagonalType;
+
+  static constexpr int StorageOrder = Base::StorageOrder_;
+  using IsRowMajor_t = bool_constant<StorageOrder == RowMajor>;
+
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const {
+    return m_diagImpl.coeff(row) * m_matImpl.coeff(row, col);
+  }
+
+#ifndef EIGEN_GPUCC
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
+    // See also similar calls below.
+    return this->template packet_impl<LoadMode, PacketType>(row, col, row, IsRowMajor_t());
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index idx) const {
+    return packet<LoadMode, PacketType>(int(StorageOrder) == ColMajor ? idx : 0,
+                                        int(StorageOrder) == ColMajor ? 0 : idx);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    // FIXME: NVCC used to complain about the template keyword, but we have to check whether this is still the case.
+    // See also similar calls below.
+    return this->template packet_segment_impl<LoadMode, PacketType>(row, col, row, begin, count, IsRowMajor_t());
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packetSegment(Index idx, Index begin, Index count) const {
+    return packetSegment<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx,
+                                               begin, count);
+  }
+#endif
+};
+
+// dense * diagonal
+template <typename Lhs, typename Rhs, int ProductKind, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, ProductKind>, ProductTag, DenseShape, DiagonalShape>
+    : diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>,
+                                      OnTheRight> {
+  typedef diagonal_product_evaluator_base<Lhs, typename Rhs::DiagonalVectorType, Product<Lhs, Rhs, LazyProduct>,
+                                          OnTheRight>
+      Base;
+  using Base::coeff;
+  using Base::m_diagImpl;
+  using Base::m_matImpl;
+  typedef typename Base::Scalar Scalar;
+
+  typedef Product<Lhs, Rhs, ProductKind> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+
+  static constexpr int StorageOrder = Base::StorageOrder_;
+  using IsColMajor_t = bool_constant<StorageOrder == ColMajor>;
+
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs().diagonal()) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const {
+    return m_matImpl.coeff(row, col) * m_diagImpl.coeff(col);
+  }
+
+#ifndef EIGEN_GPUCC
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    return this->template packet_impl<LoadMode, PacketType>(row, col, col, IsColMajor_t());
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index idx) const {
+    return packet<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    return this->template packet_segment_impl<LoadMode, PacketType>(row, col, col, begin, count, IsColMajor_t());
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packetSegment(Index idx, Index begin, Index count) const {
+    return packetSegment<LoadMode, PacketType>(StorageOrder == ColMajor ? idx : 0, StorageOrder == ColMajor ? 0 : idx,
+                                               begin, count);
+  }
+#endif
+};
+
+/***************************************************************************
+ * Products with permutation matrices
+ ***************************************************************************/
+
+/** \internal
+ * \class permutation_matrix_product
+ * Internal helper class implementing the product between a permutation matrix and a matrix.
+ * This class is specialized for DenseShape below and for SparseShape in SparseCore/SparsePermutation.h
+ */
+template <typename ExpressionType, int Side, bool Transposed, typename ExpressionShape>
+struct permutation_matrix_product;
+
+template <typename ExpressionType, int Side, bool Transposed>
+struct permutation_matrix_product<ExpressionType, Side, Transposed, DenseShape> {
+  typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
+  typedef remove_all_t<MatrixType> MatrixTypeCleaned;
+
+  template <typename Dest, typename PermutationType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const PermutationType& perm,
+                                                        const ExpressionType& xpr) {
+    MatrixType mat(xpr);
+    const Index n = Side == OnTheLeft ? mat.rows() : mat.cols();
+    // FIXME we need an is_same for expression that is not sensitive to constness. For instance
+    // is_same_xpr<Block<const Matrix>, Block<Matrix> >::value should be true.
+    // if(is_same<MatrixTypeCleaned,Dest>::value && extract_data(dst) == extract_data(mat))
+    if (is_same_dense(dst, mat)) {
+      // apply the permutation inplace
+      Matrix<bool, PermutationType::RowsAtCompileTime, 1, 0, PermutationType::MaxRowsAtCompileTime> mask(perm.size());
+      mask.fill(false);
+      Index r = 0;
+      while (r < perm.size()) {
+        // search for the next seed
+        while (r < perm.size() && mask[r]) r++;
+        if (r >= perm.size()) break;
+        // we got one, let's follow it until we are back to the seed
+        Index k0 = r++;
+        Index kPrev = k0;
+        mask.coeffRef(k0) = true;
+        for (Index k = perm.indices().coeff(k0); k != k0; k = perm.indices().coeff(k)) {
+          Block<Dest, Side == OnTheLeft ? 1 : Dest::RowsAtCompileTime,
+                Side == OnTheRight ? 1 : Dest::ColsAtCompileTime>(dst, k)
+              .swap(Block < Dest, Side == OnTheLeft ? 1 : Dest::RowsAtCompileTime,
+                    Side == OnTheRight
+                        ? 1
+                        : Dest::ColsAtCompileTime > (dst, ((Side == OnTheLeft) ^ Transposed) ? k0 : kPrev));
+
+          mask.coeffRef(k) = true;
+          kPrev = k;
+        }
+      }
+    } else {
+      for (Index i = 0; i < n; ++i) {
+        Block<Dest, Side == OnTheLeft ? 1 : Dest::RowsAtCompileTime, Side == OnTheRight ? 1 : Dest::ColsAtCompileTime>(
+            dst, ((Side == OnTheLeft) ^ Transposed) ? perm.indices().coeff(i) : i)
+
+            =
+
+                Block < const MatrixTypeCleaned,
+            Side == OnTheLeft ? 1 : MatrixTypeCleaned::RowsAtCompileTime,
+            Side == OnTheRight ? 1
+                               : MatrixTypeCleaned::ColsAtCompileTime >
+                                     (mat, ((Side == OnTheRight) ^ Transposed) ? perm.indices().coeff(i) : i);
+      }
+    }
+  }
+};
+
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, PermutationShape, MatrixShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) {
+    permutation_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
+  }
+};
+
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, MatrixShape, PermutationShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) {
+    permutation_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
+  }
+};
+
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Inverse<Lhs>, Rhs, PermutationShape, MatrixShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Inverse<Lhs>& lhs, const Rhs& rhs) {
+    permutation_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
+  }
+};
+
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Inverse<Rhs>, MatrixShape, PermutationShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Inverse<Rhs>& rhs) {
+    permutation_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
+  }
+};
+
+/***************************************************************************
+ * Products with transpositions matrices
+ ***************************************************************************/
+
+// FIXME could we unify Transpositions and Permutation into a single "shape"??
+
+/** \internal
+ * \class transposition_matrix_product
+ * Internal helper class implementing the product between a permutation matrix and a matrix.
+ */
+template <typename ExpressionType, int Side, bool Transposed, typename ExpressionShape>
+struct transposition_matrix_product {
+  typedef typename nested_eval<ExpressionType, 1>::type MatrixType;
+  typedef remove_all_t<MatrixType> MatrixTypeCleaned;
+
+  template <typename Dest, typename TranspositionType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Dest& dst, const TranspositionType& tr,
+                                                        const ExpressionType& xpr) {
+    MatrixType mat(xpr);
+    typedef typename TranspositionType::StorageIndex StorageIndex;
+    const Index size = tr.size();
+    StorageIndex j = 0;
+
+    if (!is_same_dense(dst, mat)) dst = mat;
+
+    for (Index k = (Transposed ? size - 1 : 0); Transposed ? k >= 0 : k < size; Transposed ? --k : ++k)
+      if (Index(j = tr.coeff(k)) != k) {
+        if (Side == OnTheLeft)
+          dst.row(k).swap(dst.row(j));
+        else if (Side == OnTheRight)
+          dst.col(k).swap(dst.col(j));
+      }
+  }
+};
+
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, TranspositionsShape, MatrixShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) {
+    transposition_matrix_product<Rhs, OnTheLeft, false, MatrixShape>::run(dst, lhs, rhs);
+  }
+};
+
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, MatrixShape, TranspositionsShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) {
+    transposition_matrix_product<Lhs, OnTheRight, false, MatrixShape>::run(dst, rhs, lhs);
+  }
+};
+
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Transpose<Lhs>, Rhs, TranspositionsShape, MatrixShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Transpose<Lhs>& lhs, const Rhs& rhs) {
+    transposition_matrix_product<Rhs, OnTheLeft, true, MatrixShape>::run(dst, lhs.nestedExpression(), rhs);
+  }
+};
+
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Transpose<Rhs>, MatrixShape, TranspositionsShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Transpose<Rhs>& rhs) {
+    transposition_matrix_product<Lhs, OnTheRight, true, MatrixShape>::run(dst, rhs.nestedExpression(), lhs);
+  }
+};
+
+/***************************************************************************
+ * skew symmetric products
+ * for now we just call the generic implementation
+ ***************************************************************************/
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, SkewSymmetricShape, MatrixShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) {
+    generic_product_impl<typename Lhs::DenseMatrixType, Rhs, DenseShape, MatrixShape, ProductTag>::evalTo(dst, lhs,
+                                                                                                          rhs);
+  }
+};
+
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, MatrixShape, SkewSymmetricShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) {
+    generic_product_impl<Lhs, typename Rhs::DenseMatrixType, MatrixShape, DenseShape, ProductTag>::evalTo(dst, lhs,
+                                                                                                          rhs);
+  }
+};
+
+template <typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs, Rhs, SkewSymmetricShape, SkewSymmetricShape, ProductTag> {
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) {
+    generic_product_impl<typename Lhs::DenseMatrixType, typename Rhs::DenseMatrixType, DenseShape, DenseShape,
+                         ProductTag>::evalTo(dst, lhs, rhs);
+  }
+};
+
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, MatrixShape, HomogeneousShape, ProductTag>
+    : generic_product_impl<Lhs, typename Rhs::PlainObject, MatrixShape, DenseShape, ProductTag> {};
+
+template <typename Lhs, typename Rhs, int ProductTag, typename MatrixShape>
+struct generic_product_impl<Lhs, Rhs, HomogeneousShape, MatrixShape, ProductTag>
+    : generic_product_impl<typename Lhs::PlainObject, Rhs, DenseShape, MatrixShape, ProductTag> {};
+
+template <typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs, Rhs, PermutationShape, HomogeneousShape, ProductTag>
+    : generic_product_impl<Lhs, Rhs, PermutationShape, DenseShape, ProductTag> {};
+
+template <typename Lhs, typename Rhs, int ProductTag>
+struct generic_product_impl<Lhs, Rhs, HomogeneousShape, PermutationShape, ProductTag>
+    : generic_product_impl<Lhs, Rhs, DenseShape, PermutationShape, ProductTag> {};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_PRODUCT_EVALUATORS_H
diff --git a/inst/include/Eigen/src/Core/Random.h b/inst/include/Eigen/src/Core/Random.h
index 480fea40..f8a54356 100644
--- a/inst/include/Eigen/src/Core/Random.h
+++ b/inst/include/Eigen/src/Core/Random.h
@@ -10,143 +10,198 @@
 #ifndef EIGEN_RANDOM_H
 #define EIGEN_RANDOM_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
-template<typename Scalar> struct scalar_random_op {
-  EIGEN_EMPTY_STRUCT_CTOR(scalar_random_op)
-  template<typename Index>
-  inline const Scalar operator() (Index, Index = 0) const { return random<Scalar>(); }
+template <typename Scalar>
+struct scalar_random_op {
+  inline const Scalar operator()() const { return random<Scalar>(); }
 };
 
-template<typename Scalar>
-struct functor_traits<scalar_random_op<Scalar> >
-{ enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false, IsRepeatable = false }; };
+template <typename Scalar>
+struct functor_traits<scalar_random_op<Scalar> > {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false, IsRepeatable = false };
+};
 
-} // end namespace internal
+}  // end namespace internal
 
 /** \returns a random matrix expression
-  *
-  * The parameters \a rows and \a cols are the number of rows and of columns of
-  * the returned matrix. Must be compatible with this MatrixBase type.
-  *
-  * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
-  * it is redundant to pass \a rows and \a cols as arguments, so Random() should be used
-  * instead.
-  *
-  * Example: \include MatrixBase_random_int_int.cpp
-  * Output: \verbinclude MatrixBase_random_int_int.out
-  *
-  * This expression has the "evaluate before nesting" flag so that it will be evaluated into
-  * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
-  * behavior with expressions involving random matrices.
-  *
-  * \sa MatrixBase::setRandom(), MatrixBase::Random(Index), MatrixBase::Random()
-  */
-template<typename Derived>
-inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
-DenseBase<Derived>::Random(Index rows, Index cols)
-{
+ *
+ * Numbers are uniformly spread through their whole definition range for integer types,
+ * and in the [-1:1] range for floating point scalar types.
+ *
+ * The parameters \a rows and \a cols are the number of rows and of columns of
+ * the returned matrix. Must be compatible with this MatrixBase type.
+ *
+ * \not_reentrant
+ *
+ * This variant is meant to be used for dynamic-size matrix types. For fixed-size types,
+ * it is redundant to pass \a rows and \a cols as arguments, so Random() should be used
+ * instead.
+ *
+ *
+ * Example: \include MatrixBase_random_int_int.cpp
+ * Output: \verbinclude MatrixBase_random_int_int.out
+ *
+ * This expression has the "evaluate before nesting" flag so that it will be evaluated into
+ * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
+ * behavior with expressions involving random matrices.
+ *
+ * See DenseBase::NullaryExpr(Index, const CustomNullaryOp&) for an example using C++11 random generators.
+ *
+ * \sa DenseBase::setRandom(), DenseBase::Random(Index), DenseBase::Random()
+ */
+template <typename Derived>
+inline const typename DenseBase<Derived>::RandomReturnType DenseBase<Derived>::Random(Index rows, Index cols) {
   return NullaryExpr(rows, cols, internal::scalar_random_op<Scalar>());
 }
 
 /** \returns a random vector expression
-  *
-  * The parameter \a size is the size of the returned vector.
-  * Must be compatible with this MatrixBase type.
-  *
-  * \only_for_vectors
-  *
-  * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
-  * it is redundant to pass \a size as argument, so Random() should be used
-  * instead.
-  *
-  * Example: \include MatrixBase_random_int.cpp
-  * Output: \verbinclude MatrixBase_random_int.out
-  *
-  * This expression has the "evaluate before nesting" flag so that it will be evaluated into
-  * a temporary vector whenever it is nested in a larger expression. This prevents unexpected
-  * behavior with expressions involving random matrices.
-  *
-  * \sa MatrixBase::setRandom(), MatrixBase::Random(Index,Index), MatrixBase::Random()
-  */
-template<typename Derived>
-inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
-DenseBase<Derived>::Random(Index size)
-{
+ *
+ * Numbers are uniformly spread through their whole definition range for integer types,
+ * and in the [-1:1] range for floating point scalar types.
+ *
+ * The parameter \a size is the size of the returned vector.
+ * Must be compatible with this MatrixBase type.
+ *
+ * \only_for_vectors
+ * \not_reentrant
+ *
+ * This variant is meant to be used for dynamic-size vector types. For fixed-size types,
+ * it is redundant to pass \a size as argument, so Random() should be used
+ * instead.
+ *
+ * Example: \include MatrixBase_random_int.cpp
+ * Output: \verbinclude MatrixBase_random_int.out
+ *
+ * This expression has the "evaluate before nesting" flag so that it will be evaluated into
+ * a temporary vector whenever it is nested in a larger expression. This prevents unexpected
+ * behavior with expressions involving random matrices.
+ *
+ * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random()
+ */
+template <typename Derived>
+inline const typename DenseBase<Derived>::RandomReturnType DenseBase<Derived>::Random(Index size) {
   return NullaryExpr(size, internal::scalar_random_op<Scalar>());
 }
 
 /** \returns a fixed-size random matrix or vector expression
-  *
-  * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
-  * need to use the variants taking size arguments.
-  *
-  * Example: \include MatrixBase_random.cpp
-  * Output: \verbinclude MatrixBase_random.out
-  *
-  * This expression has the "evaluate before nesting" flag so that it will be evaluated into
-  * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
-  * behavior with expressions involving random matrices.
-  *
-  * \sa MatrixBase::setRandom(), MatrixBase::Random(Index,Index), MatrixBase::Random(Index)
-  */
-template<typename Derived>
-inline const CwiseNullaryOp<internal::scalar_random_op<typename internal::traits<Derived>::Scalar>, Derived>
-DenseBase<Derived>::Random()
-{
+ *
+ * Numbers are uniformly spread through their whole definition range for integer types,
+ * and in the [-1:1] range for floating point scalar types.
+ *
+ * This variant is only for fixed-size MatrixBase types. For dynamic-size types, you
+ * need to use the variants taking size arguments.
+ *
+ * Example: \include MatrixBase_random.cpp
+ * Output: \verbinclude MatrixBase_random.out
+ *
+ * This expression has the "evaluate before nesting" flag so that it will be evaluated into
+ * a temporary matrix whenever it is nested in a larger expression. This prevents unexpected
+ * behavior with expressions involving random matrices.
+ *
+ * \not_reentrant
+ *
+ * \sa DenseBase::setRandom(), DenseBase::Random(Index,Index), DenseBase::Random(Index)
+ */
+template <typename Derived>
+inline const typename DenseBase<Derived>::RandomReturnType DenseBase<Derived>::Random() {
   return NullaryExpr(RowsAtCompileTime, ColsAtCompileTime, internal::scalar_random_op<Scalar>());
 }
 
 /** Sets all coefficients in this expression to random values.
-  *
-  * Example: \include MatrixBase_setRandom.cpp
-  * Output: \verbinclude MatrixBase_setRandom.out
-  *
-  * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index)
-  */
-template<typename Derived>
-inline Derived& DenseBase<Derived>::setRandom()
-{
+ *
+ * Numbers are uniformly spread through their whole definition range for integer types,
+ * and in the [-1:1] range for floating point scalar types.
+ *
+ * \not_reentrant
+ *
+ * Example: \include MatrixBase_setRandom.cpp
+ * Output: \verbinclude MatrixBase_setRandom.out
+ *
+ * \sa class CwiseNullaryOp, setRandom(Index), setRandom(Index,Index)
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline Derived& DenseBase<Derived>::setRandom() {
   return *this = Random(rows(), cols());
 }
 
 /** Resizes to the given \a newSize, and sets all coefficients in this expression to random values.
-  *
-  * \only_for_vectors
-  *
-  * Example: \include Matrix_setRandom_int.cpp
-  * Output: \verbinclude Matrix_setRandom_int.out
-  *
-  * \sa MatrixBase::setRandom(), setRandom(Index,Index), class CwiseNullaryOp, MatrixBase::Random()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setRandom(Index newSize)
-{
+ *
+ * Numbers are uniformly spread through their whole definition range for integer types,
+ * and in the [-1:1] range for floating point scalar types.
+ *
+ * \only_for_vectors
+ * \not_reentrant
+ *
+ * Example: \include Matrix_setRandom_int.cpp
+ * Output: \verbinclude Matrix_setRandom_int.out
+ *
+ * \sa DenseBase::setRandom(), setRandom(Index,Index), class CwiseNullaryOp, DenseBase::Random()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setRandom(Index newSize) {
   resize(newSize);
   return setRandom();
 }
 
 /** Resizes to the given size, and sets all coefficients in this expression to random values.
-  *
-  * \param nbRows the new number of rows
-  * \param nbCols the new number of columns
-  *
-  * Example: \include Matrix_setRandom_int_int.cpp
-  * Output: \verbinclude Matrix_setRandom_int_int.out
-  *
-  * \sa MatrixBase::setRandom(), setRandom(Index), class CwiseNullaryOp, MatrixBase::Random()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
-PlainObjectBase<Derived>::setRandom(Index nbRows, Index nbCols)
-{
-  resize(nbRows, nbCols);
+ *
+ * Numbers are uniformly spread through their whole definition range for integer types,
+ * and in the [-1:1] range for floating point scalar types.
+ *
+ * \not_reentrant
+ *
+ * \param rows the new number of rows
+ * \param cols the new number of columns
+ *
+ * Example: \include Matrix_setRandom_int_int.cpp
+ * Output: \verbinclude Matrix_setRandom_int_int.out
+ *
+ * \sa DenseBase::setRandom(), setRandom(Index), class CwiseNullaryOp, DenseBase::Random()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setRandom(Index rows, Index cols) {
+  resize(rows, cols);
   return setRandom();
 }
 
-} // end namespace Eigen
+/** Resizes to the given size, changing only the number of columns, and sets all
+ * coefficients in this expression to random values. For the parameter of type
+ * NoChange_t, just pass the special value \c NoChange.
+ *
+ * Numbers are uniformly spread through their whole definition range for integer types,
+ * and in the [-1:1] range for floating point scalar types.
+ *
+ * \not_reentrant
+ *
+ * \sa DenseBase::setRandom(), setRandom(Index), setRandom(Index, NoChange_t), class CwiseNullaryOp, DenseBase::Random()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setRandom(NoChange_t, Index cols) {
+  return setRandom(rows(), cols);
+}
+
+/** Resizes to the given size, changing only the number of rows, and sets all
+ * coefficients in this expression to random values. For the parameter of type
+ * NoChange_t, just pass the special value \c NoChange.
+ *
+ * Numbers are uniformly spread through their whole definition range for integer types,
+ * and in the [-1:1] range for floating point scalar types.
+ *
+ * \not_reentrant
+ *
+ * \sa DenseBase::setRandom(), setRandom(Index), setRandom(NoChange_t, Index), class CwiseNullaryOp, DenseBase::Random()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE Derived& PlainObjectBase<Derived>::setRandom(Index rows, NoChange_t) {
+  return setRandom(rows, cols());
+}
+
+}  // end namespace Eigen
 
-#endif // EIGEN_RANDOM_H
+#endif  // EIGEN_RANDOM_H
diff --git a/inst/include/Eigen/src/Core/RandomImpl.h b/inst/include/Eigen/src/Core/RandomImpl.h
new file mode 100644
index 00000000..1a82e625
--- /dev/null
+++ b/inst/include/Eigen/src/Core/RandomImpl.h
@@ -0,0 +1,262 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Charles Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_RANDOM_IMPL_H
+#define EIGEN_RANDOM_IMPL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/****************************************************************************
+ * Implementation of random                                               *
+ ****************************************************************************/
+
+template <typename Scalar, bool IsComplex, bool IsInteger>
+struct random_default_impl {};
+
+template <typename Scalar>
+struct random_impl : random_default_impl<Scalar, NumTraits<Scalar>::IsComplex, NumTraits<Scalar>::IsInteger> {};
+
+template <typename Scalar>
+struct random_retval {
+  typedef Scalar type;
+};
+
+template <typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random(const Scalar& x, const Scalar& y) {
+  return EIGEN_MATHFUNC_IMPL(random, Scalar)::run(x, y);
+}
+
+template <typename Scalar>
+inline EIGEN_MATHFUNC_RETVAL(random, Scalar) random() {
+  return EIGEN_MATHFUNC_IMPL(random, Scalar)::run();
+}
+
+// TODO: replace or provide alternatives to this, e.g. std::random_device
+struct eigen_random_device {
+  using ReturnType = int;
+  static constexpr int Entropy = meta_floor_log2<(unsigned int)(RAND_MAX) + 1>::value;
+  static constexpr ReturnType Highest = RAND_MAX;
+  static EIGEN_DEVICE_FUNC inline ReturnType run() { return std::rand(); }
+};
+
+// Fill a built-in unsigned integer with numRandomBits beginning with the least significant bit
+template <typename Scalar>
+struct random_bits_impl {
+  EIGEN_STATIC_ASSERT(std::is_unsigned<Scalar>::value, SCALAR MUST BE A BUILT - IN UNSIGNED INTEGER)
+  using RandomDevice = eigen_random_device;
+  using RandomReturnType = typename RandomDevice::ReturnType;
+  static constexpr int kEntropy = RandomDevice::Entropy;
+  static constexpr int kTotalBits = sizeof(Scalar) * CHAR_BIT;
+  // return a Scalar filled with numRandomBits beginning from the least significant bit
+  static EIGEN_DEVICE_FUNC inline Scalar run(int numRandomBits) {
+    eigen_assert((numRandomBits >= 0) && (numRandomBits <= kTotalBits));
+    const Scalar mask = Scalar(-1) >> ((kTotalBits - numRandomBits) & (kTotalBits - 1));
+    Scalar randomBits = 0;
+    for (int shift = 0; shift < numRandomBits; shift += kEntropy) {
+      RandomReturnType r = RandomDevice::run();
+      randomBits |= static_cast<Scalar>(r) << shift;
+    }
+    // clear the excess bits
+    randomBits &= mask;
+    return randomBits;
+  }
+};
+
+template <typename BitsType>
+EIGEN_DEVICE_FUNC inline BitsType getRandomBits(int numRandomBits) {
+  return random_bits_impl<BitsType>::run(numRandomBits);
+}
+
+// random implementation for a built-in floating point type
+template <typename Scalar, bool BuiltIn = std::is_floating_point<Scalar>::value>
+struct random_float_impl {
+  using BitsType = typename numext::get_integer_by_size<sizeof(Scalar)>::unsigned_type;
+  static constexpr EIGEN_DEVICE_FUNC inline int mantissaBits() {
+    const int digits = NumTraits<Scalar>::digits();
+    return digits - 1;
+  }
+  static EIGEN_DEVICE_FUNC inline Scalar run(int numRandomBits) {
+    eigen_assert(numRandomBits >= 0 && numRandomBits <= mantissaBits());
+    BitsType randomBits = getRandomBits<BitsType>(numRandomBits);
+    // if fewer than MantissaBits is requested, shift them to the left
+    randomBits <<= (mantissaBits() - numRandomBits);
+    // randomBits is in the half-open interval [2,4)
+    randomBits |= numext::bit_cast<BitsType>(Scalar(2));
+    // result is in the half-open interval [-1,1)
+    Scalar result = numext::bit_cast<Scalar>(randomBits) - Scalar(3);
+    return result;
+  }
+};
+// random implementation for a custom floating point type
+// uses double as the implementation with a mantissa with a size equal to either the target scalar's mantissa or that of
+// double, whichever is smaller
+template <typename Scalar>
+struct random_float_impl<Scalar, false> {
+  static EIGEN_DEVICE_FUNC inline int mantissaBits() {
+    const int digits = NumTraits<Scalar>::digits();
+    constexpr int kDoubleDigits = NumTraits<double>::digits();
+    return numext::mini(digits, kDoubleDigits) - 1;
+  }
+  static EIGEN_DEVICE_FUNC inline Scalar run(int numRandomBits) {
+    eigen_assert(numRandomBits >= 0 && numRandomBits <= mantissaBits());
+    Scalar result = static_cast<Scalar>(random_float_impl<double>::run(numRandomBits));
+    return result;
+  }
+};
+
+#if !EIGEN_COMP_NVCC
+// random implementation for long double
+// this specialization is not compatible with double-double scalars
+template <bool Specialize = (sizeof(long double) == 2 * sizeof(uint64_t)) &&
+                            ((std::numeric_limits<long double>::digits != (2 * std::numeric_limits<double>::digits)))>
+struct random_longdouble_impl {
+  static constexpr int Size = sizeof(long double);
+  static constexpr EIGEN_DEVICE_FUNC int mantissaBits() { return NumTraits<long double>::digits() - 1; }
+  static EIGEN_DEVICE_FUNC inline long double run(int numRandomBits) {
+    eigen_assert(numRandomBits >= 0 && numRandomBits <= mantissaBits());
+    EIGEN_USING_STD(memcpy);
+    int numLowBits = numext::mini(numRandomBits, 64);
+    int numHighBits = numext::maxi(numRandomBits - 64, 0);
+    uint64_t randomBits[2];
+    long double result = 2.0L;
+    memcpy(&randomBits, &result, Size);
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+    randomBits[0] |= getRandomBits<uint64_t>(numLowBits);
+    randomBits[1] |= getRandomBits<uint64_t>(numHighBits);
+#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+    randomBits[0] |= getRandomBits<uint64_t>(numHighBits);
+    randomBits[1] |= getRandomBits<uint64_t>(numLowBits);
+#else
+#error Unexpected or undefined __BYTE_ORDER__
+#endif
+    memcpy(&result, &randomBits, Size);
+    result -= 3.0L;
+    return result;
+  }
+};
+template <>
+struct random_longdouble_impl<false> {
+  static constexpr EIGEN_DEVICE_FUNC int mantissaBits() { return NumTraits<double>::digits() - 1; }
+  static EIGEN_DEVICE_FUNC inline long double run(int numRandomBits) {
+    return static_cast<long double>(random_float_impl<double>::run(numRandomBits));
+  }
+};
+template <>
+struct random_float_impl<long double> : random_longdouble_impl<> {};
+#endif
+
+template <typename Scalar>
+struct random_default_impl<Scalar, false, false> {
+  using Impl = random_float_impl<Scalar>;
+  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y, int numRandomBits) {
+    Scalar half_x = Scalar(0.5) * x;
+    Scalar half_y = Scalar(0.5) * y;
+    Scalar result = (half_x + half_y) + (half_y - half_x) * run(numRandomBits);
+    // result is in the half-open interval [x, y) -- provided that x < y
+    return result;
+  }
+  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y) {
+    return run(x, y, Impl::mantissaBits());
+  }
+  static EIGEN_DEVICE_FUNC inline Scalar run(int numRandomBits) { return Impl::run(numRandomBits); }
+  static EIGEN_DEVICE_FUNC inline Scalar run() { return run(Impl::mantissaBits()); }
+};
+
+template <typename Scalar, bool IsSigned = NumTraits<Scalar>::IsSigned, bool BuiltIn = std::is_integral<Scalar>::value>
+struct random_int_impl;
+
+// random implementation for a built-in unsigned integer type
+template <typename Scalar>
+struct random_int_impl<Scalar, false, true> {
+  static constexpr int kTotalBits = sizeof(Scalar) * CHAR_BIT;
+  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y) {
+    if (y <= x) return x;
+    Scalar range = y - x;
+    // handle edge case where [x,y] spans the entire range of Scalar
+    if (range == NumTraits<Scalar>::highest()) return run();
+    Scalar count = range + 1;
+    // calculate the number of random bits needed to fill range
+    int numRandomBits = log2_ceil(count);
+    Scalar randomBits;
+    do {
+      randomBits = getRandomBits<Scalar>(numRandomBits);
+      // if the random draw is outside [0, range), try again (rejection sampling)
+      // in the worst-case scenario, the probability of rejection is: 1/2 - 1/2^numRandomBits < 50%
+    } while (randomBits >= count);
+    Scalar result = x + randomBits;
+    return result;
+  }
+  static EIGEN_DEVICE_FUNC inline Scalar run() { return getRandomBits<Scalar>(kTotalBits); }
+};
+
+// random implementation for a built-in signed integer type
+template <typename Scalar>
+struct random_int_impl<Scalar, true, true> {
+  static constexpr int kTotalBits = sizeof(Scalar) * CHAR_BIT;
+  using BitsType = typename make_unsigned<Scalar>::type;
+  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y) {
+    if (y <= x) return x;
+    // Avoid overflow by representing `range` as an unsigned type
+    BitsType range = static_cast<BitsType>(y) - static_cast<BitsType>(x);
+    BitsType randomBits = random_int_impl<BitsType>::run(0, range);
+    // Avoid overflow in the case where `x` is negative and there is a large range so
+    // `randomBits` would also be negative if cast to `Scalar` first.
+    Scalar result = static_cast<Scalar>(static_cast<BitsType>(x) + randomBits);
+    return result;
+  }
+  static EIGEN_DEVICE_FUNC inline Scalar run() { return static_cast<Scalar>(getRandomBits<BitsType>(kTotalBits)); }
+};
+
+// todo: custom integers
+template <typename Scalar, bool IsSigned>
+struct random_int_impl<Scalar, IsSigned, false> {
+  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar&, const Scalar&) { return run(); }
+  static EIGEN_DEVICE_FUNC inline Scalar run() {
+    eigen_assert(std::false_type::value && "RANDOM FOR CUSTOM INTEGERS NOT YET SUPPORTED");
+    return Scalar(0);
+  }
+};
+
+template <typename Scalar>
+struct random_default_impl<Scalar, false, true> : random_int_impl<Scalar> {};
+
+template <>
+struct random_impl<bool> {
+  static EIGEN_DEVICE_FUNC inline bool run(const bool& x, const bool& y) {
+    if (y <= x) return x;
+    return run();
+  }
+  static EIGEN_DEVICE_FUNC inline bool run() { return getRandomBits<unsigned>(1) ? true : false; }
+};
+
+template <typename Scalar>
+struct random_default_impl<Scalar, true, false> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  using Impl = random_impl<RealScalar>;
+  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y, int numRandomBits) {
+    return Scalar(Impl::run(x.real(), y.real(), numRandomBits), Impl::run(x.imag(), y.imag(), numRandomBits));
+  }
+  static EIGEN_DEVICE_FUNC inline Scalar run(const Scalar& x, const Scalar& y) {
+    return Scalar(Impl::run(x.real(), y.real()), Impl::run(x.imag(), y.imag()));
+  }
+  static EIGEN_DEVICE_FUNC inline Scalar run(int numRandomBits) {
+    return Scalar(Impl::run(numRandomBits), Impl::run(numRandomBits));
+  }
+  static EIGEN_DEVICE_FUNC inline Scalar run() { return Scalar(Impl::run(), Impl::run()); }
+};
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_RANDOM_IMPL_H
diff --git a/inst/include/Eigen/src/Core/RealView.h b/inst/include/Eigen/src/Core/RealView.h
new file mode 100644
index 00000000..7ba42f9a
--- /dev/null
+++ b/inst/include/Eigen/src/Core/RealView.h
@@ -0,0 +1,250 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REALVIEW_H
+#define EIGEN_REALVIEW_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+// Vectorized assignment to RealView requires array-oriented access to the real and imaginary components.
+// From https://en.cppreference.com/w/cpp/numeric/complex.html:
+// For any pointer to an element of an array of std::complex<T> named p and any valid array index i,
+// reinterpret_cast<T*>(p)[2 * i] is the real part of the complex number p[i], and
+// reinterpret_cast<T*>(p)[2 * i + 1] is the imaginary part of the complex number p[i].
+
+template <typename ComplexScalar>
+struct complex_array_access : std::false_type {};
+template <>
+struct complex_array_access<std::complex<float>> : std::true_type {};
+template <>
+struct complex_array_access<std::complex<double>> : std::true_type {};
+template <>
+struct complex_array_access<std::complex<long double>> : std::true_type {};
+
+template <typename Xpr>
+struct traits<RealView<Xpr>> : public traits<Xpr> {
+  template <typename T>
+  static constexpr int double_size(T size, bool times_two) {
+    int size_as_int = int(size);
+    if (size_as_int == Dynamic) return Dynamic;
+    return times_two ? (2 * size_as_int) : size_as_int;
+  }
+  using Base = traits<Xpr>;
+  using ComplexScalar = typename Base::Scalar;
+  using Scalar = typename NumTraits<ComplexScalar>::Real;
+  static constexpr int ActualDirectAccessBit = complex_array_access<ComplexScalar>::value ? DirectAccessBit : 0;
+  static constexpr int ActualPacketAccessBit = packet_traits<Scalar>::Vectorizable ? PacketAccessBit : 0;
+  static constexpr int FlagMask =
+      ActualDirectAccessBit | ActualPacketAccessBit | HereditaryBits | LinearAccessBit | LvalueBit;
+  static constexpr int BaseFlags = int(evaluator<Xpr>::Flags) | int(Base::Flags);
+  static constexpr int Flags = BaseFlags & FlagMask;
+  static constexpr bool IsRowMajor = Flags & RowMajorBit;
+  static constexpr int RowsAtCompileTime = double_size(Base::RowsAtCompileTime, !IsRowMajor);
+  static constexpr int ColsAtCompileTime = double_size(Base::ColsAtCompileTime, IsRowMajor);
+  static constexpr int SizeAtCompileTime = size_at_compile_time(RowsAtCompileTime, ColsAtCompileTime);
+  static constexpr int MaxRowsAtCompileTime = double_size(Base::MaxRowsAtCompileTime, !IsRowMajor);
+  static constexpr int MaxColsAtCompileTime = double_size(Base::MaxColsAtCompileTime, IsRowMajor);
+  static constexpr int MaxSizeAtCompileTime = size_at_compile_time(MaxRowsAtCompileTime, MaxColsAtCompileTime);
+  static constexpr int OuterStrideAtCompileTime = double_size(outer_stride_at_compile_time<Xpr>::ret, true);
+  static constexpr int InnerStrideAtCompileTime = inner_stride_at_compile_time<Xpr>::ret;
+};
+
+template <typename Xpr>
+struct evaluator<RealView<Xpr>> : private evaluator<Xpr> {
+  using BaseEvaluator = evaluator<Xpr>;
+  using XprType = RealView<Xpr>;
+  using ExpressionTraits = traits<XprType>;
+  using ComplexScalar = typename ExpressionTraits::ComplexScalar;
+  using ComplexCoeffReturnType = typename BaseEvaluator::CoeffReturnType;
+  using Scalar = typename ExpressionTraits::Scalar;
+
+  static constexpr bool IsRowMajor = ExpressionTraits::IsRowMajor;
+  static constexpr int Flags = ExpressionTraits::Flags;
+  static constexpr int CoeffReadCost = BaseEvaluator::CoeffReadCost;
+  static constexpr int Alignment = BaseEvaluator::Alignment;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(XprType realView) : BaseEvaluator(realView.m_xpr) {}
+
+  template <bool Enable = std::is_reference<ComplexCoeffReturnType>::value, typename = std::enable_if_t<!Enable>>
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const {
+    ComplexCoeffReturnType cscalar = BaseEvaluator::coeff(IsRowMajor ? row : row / 2, IsRowMajor ? col / 2 : col);
+    Index p = (IsRowMajor ? col : row) & 1;
+    return p ? numext::real(cscalar) : numext::imag(cscalar);
+  }
+
+  template <bool Enable = std::is_reference<ComplexCoeffReturnType>::value, typename = std::enable_if_t<Enable>>
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index row, Index col) const {
+    ComplexCoeffReturnType cscalar = BaseEvaluator::coeff(IsRowMajor ? row : row / 2, IsRowMajor ? col / 2 : col);
+    Index p = (IsRowMajor ? col : row) & 1;
+    return reinterpret_cast<const Scalar(&)[2]>(cscalar)[p];
+  }
+
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index row, Index col) {
+    ComplexScalar& cscalar = BaseEvaluator::coeffRef(IsRowMajor ? row : row / 2, IsRowMajor ? col / 2 : col);
+    Index p = (IsRowMajor ? col : row) & 1;
+    return reinterpret_cast<Scalar(&)[2]>(cscalar)[p];
+  }
+
+  template <bool Enable = std::is_reference<ComplexCoeffReturnType>::value, typename = std::enable_if_t<!Enable>>
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index index) const {
+    ComplexCoeffReturnType cscalar = BaseEvaluator::coeff(index / 2);
+    Index p = index & 1;
+    return p ? numext::real(cscalar) : numext::imag(cscalar);
+  }
+
+  template <bool Enable = std::is_reference<ComplexCoeffReturnType>::value, typename = std::enable_if_t<Enable>>
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const {
+    ComplexCoeffReturnType cscalar = BaseEvaluator::coeff(index / 2);
+    Index p = index & 1;
+    return reinterpret_cast<const Scalar(&)[2]>(cscalar)[p];
+  }
+
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    ComplexScalar& cscalar = BaseEvaluator::coeffRef(index / 2);
+    Index p = index & 1;
+    return reinterpret_cast<Scalar(&)[2]>(cscalar)[p];
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    constexpr int RealPacketSize = unpacket_traits<PacketType>::size;
+    using ComplexPacket = typename find_packet_by_size<ComplexScalar, RealPacketSize / 2>::type;
+    EIGEN_STATIC_ASSERT((find_packet_by_size<ComplexScalar, RealPacketSize / 2>::value),
+                        MISSING COMPATIBLE COMPLEX PACKET TYPE)
+    eigen_assert(((IsRowMajor ? col : row) % 2 == 0) && "the inner index must be even");
+
+    Index crow = IsRowMajor ? row : row / 2;
+    Index ccol = IsRowMajor ? col / 2 : col;
+    ComplexPacket cpacket = BaseEvaluator::template packet<LoadMode, ComplexPacket>(crow, ccol);
+    return preinterpret<PacketType, ComplexPacket>(cpacket);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+    constexpr int RealPacketSize = unpacket_traits<PacketType>::size;
+    using ComplexPacket = typename find_packet_by_size<ComplexScalar, RealPacketSize / 2>::type;
+    EIGEN_STATIC_ASSERT((find_packet_by_size<ComplexScalar, RealPacketSize / 2>::value),
+                        MISSING COMPATIBLE COMPLEX PACKET TYPE)
+    eigen_assert((index % 2 == 0) && "the index must be even");
+
+    Index cindex = index / 2;
+    ComplexPacket cpacket = BaseEvaluator::template packet<LoadMode, ComplexPacket>(cindex);
+    return preinterpret<PacketType, ComplexPacket>(cpacket);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index row, Index col, Index begin, Index count) const {
+    constexpr int RealPacketSize = unpacket_traits<PacketType>::size;
+    using ComplexPacket = typename find_packet_by_size<ComplexScalar, RealPacketSize / 2>::type;
+    EIGEN_STATIC_ASSERT((find_packet_by_size<ComplexScalar, RealPacketSize / 2>::value),
+                        MISSING COMPATIBLE COMPLEX PACKET TYPE)
+    eigen_assert(((IsRowMajor ? col : row) % 2 == 0) && "the inner index must be even");
+    eigen_assert((begin % 2 == 0) && (count % 2 == 0) && "begin and count must be even");
+
+    Index crow = IsRowMajor ? row : row / 2;
+    Index ccol = IsRowMajor ? col / 2 : col;
+    Index cbegin = begin / 2;
+    Index ccount = count / 2;
+    ComplexPacket cpacket = BaseEvaluator::template packetSegment<LoadMode, ComplexPacket>(crow, ccol, cbegin, ccount);
+    return preinterpret<PacketType, ComplexPacket>(cpacket);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegment(Index index, Index begin, Index count) const {
+    constexpr int RealPacketSize = unpacket_traits<PacketType>::size;
+    using ComplexPacket = typename find_packet_by_size<ComplexScalar, RealPacketSize / 2>::type;
+    EIGEN_STATIC_ASSERT((find_packet_by_size<ComplexScalar, RealPacketSize / 2>::value),
+                        MISSING COMPATIBLE COMPLEX PACKET TYPE)
+    eigen_assert((index % 2 == 0) && "the index must be even");
+    eigen_assert((begin % 2 == 0) && (count % 2 == 0) && "begin and count must be even");
+
+    Index cindex = index / 2;
+    Index cbegin = begin / 2;
+    Index ccount = count / 2;
+    ComplexPacket cpacket = BaseEvaluator::template packetSegment<LoadMode, ComplexPacket>(cindex, cbegin, ccount);
+    return preinterpret<PacketType, ComplexPacket>(cpacket);
+  }
+};
+
+}  // namespace internal
+
+template <typename Xpr>
+class RealView : public internal::dense_xpr_base<RealView<Xpr>>::type {
+  using ExpressionTraits = internal::traits<RealView>;
+  EIGEN_STATIC_ASSERT(NumTraits<typename Xpr::Scalar>::IsComplex, SCALAR MUST BE COMPLEX)
+ public:
+  using Scalar = typename ExpressionTraits::Scalar;
+  using Nested = RealView;
+
+  EIGEN_DEVICE_FUNC explicit RealView(Xpr& xpr) : m_xpr(xpr) {}
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return Xpr::IsRowMajor ? m_xpr.rows() : 2 * m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return Xpr::IsRowMajor ? 2 * m_xpr.cols() : m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return 2 * m_xpr.size(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_xpr.innerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return 2 * m_xpr.outerStride(); }
+  EIGEN_DEVICE_FUNC void resize(Index rows, Index cols) {
+    m_xpr.resize(Xpr::IsRowMajor ? rows : rows / 2, Xpr::IsRowMajor ? cols / 2 : cols);
+  }
+  EIGEN_DEVICE_FUNC void resize(Index size) { m_xpr.resize(size / 2); }
+  EIGEN_DEVICE_FUNC Scalar* data() { return reinterpret_cast<Scalar*>(m_xpr.data()); }
+  EIGEN_DEVICE_FUNC const Scalar* data() const { return reinterpret_cast<const Scalar*>(m_xpr.data()); }
+
+  EIGEN_DEVICE_FUNC RealView(const RealView&) = default;
+
+  EIGEN_DEVICE_FUNC RealView& operator=(const RealView& other);
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC RealView& operator=(const RealView<OtherDerived>& other);
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC RealView& operator=(const DenseBase<OtherDerived>& other);
+
+ protected:
+  friend struct internal::evaluator<RealView<Xpr>>;
+  Xpr& m_xpr;
+};
+
+template <typename Xpr>
+EIGEN_DEVICE_FUNC RealView<Xpr>& RealView<Xpr>::operator=(const RealView& other) {
+  internal::call_assignment(*this, other);
+  return *this;
+}
+
+template <typename Xpr>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC RealView<Xpr>& RealView<Xpr>::operator=(const RealView<OtherDerived>& other) {
+  internal::call_assignment(*this, other);
+  return *this;
+}
+
+template <typename Xpr>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC RealView<Xpr>& RealView<Xpr>::operator=(const DenseBase<OtherDerived>& other) {
+  internal::call_assignment(*this, other.derived());
+  return *this;
+}
+
+template <typename Derived>
+EIGEN_DEVICE_FUNC typename DenseBase<Derived>::RealViewReturnType DenseBase<Derived>::realView() {
+  return RealViewReturnType(derived());
+}
+
+template <typename Derived>
+EIGEN_DEVICE_FUNC typename DenseBase<Derived>::ConstRealViewReturnType DenseBase<Derived>::realView() const {
+  return ConstRealViewReturnType(derived());
+}
+
+}  // namespace Eigen
+
+#endif  // EIGEN_REALVIEW_H
diff --git a/inst/include/Eigen/src/Core/Redux.h b/inst/include/Eigen/src/Core/Redux.h
index 9b8662a6..4e9ab0e4 100644
--- a/inst/include/Eigen/src/Core/Redux.h
+++ b/inst/include/Eigen/src/Core/Redux.h
@@ -11,7 +11,10 @@
 #ifndef EIGEN_REDUX_H
 #define EIGEN_REDUX_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
@@ -20,227 +23,300 @@ namespace internal {
 //  * factorize code
 
 /***************************************************************************
-* Part 1 : the logic deciding a strategy for vectorization and unrolling
-***************************************************************************/
+ * Part 1 : the logic deciding a strategy for vectorization and unrolling
+ ***************************************************************************/
 
-template<typename Func, typename Derived>
-struct redux_traits
-{
-public:
+template <typename Func, typename Evaluator>
+struct redux_traits {
+ public:
+  typedef typename find_best_packet<typename Evaluator::Scalar, Evaluator::SizeAtCompileTime>::type PacketType;
   enum {
-    PacketSize = packet_traits<typename Derived::Scalar>::size,
-    InnerMaxSize = int(Derived::IsRowMajor)
-                 ? Derived::MaxColsAtCompileTime
-                 : Derived::MaxRowsAtCompileTime
+    PacketSize = unpacket_traits<PacketType>::size,
+    InnerMaxSize = int(Evaluator::IsRowMajor) ? Evaluator::MaxColsAtCompileTime : Evaluator::MaxRowsAtCompileTime,
+    OuterMaxSize = int(Evaluator::IsRowMajor) ? Evaluator::MaxRowsAtCompileTime : Evaluator::MaxColsAtCompileTime,
+    SliceVectorizedWork = int(InnerMaxSize) == Dynamic   ? Dynamic
+                          : int(OuterMaxSize) == Dynamic ? (int(InnerMaxSize) >= int(PacketSize) ? Dynamic : 0)
+                                                         : (int(InnerMaxSize) / int(PacketSize)) * int(OuterMaxSize)
   };
 
   enum {
-    MightVectorize = (int(Derived::Flags)&ActualPacketAccessBit)
-                  && (functor_traits<Func>::PacketAccess),
-    MayLinearVectorize = MightVectorize && (int(Derived::Flags)&LinearAccessBit),
-    MaySliceVectorize  = MightVectorize && int(InnerMaxSize)>=3*PacketSize
+    MayLinearize = (int(Evaluator::Flags) & LinearAccessBit),
+    MightVectorize = (int(Evaluator::Flags) & ActualPacketAccessBit) && (functor_traits<Func>::PacketAccess),
+    MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize),
+    MaySliceVectorize = bool(MightVectorize) && (int(SliceVectorizedWork) == Dynamic || int(SliceVectorizedWork) >= 3)
   };
 
-public:
+ public:
   enum {
-    Traversal = int(MayLinearVectorize) ? int(LinearVectorizedTraversal)
-              : int(MaySliceVectorize)  ? int(SliceVectorizedTraversal)
-                                        : int(DefaultTraversal)
+    Traversal = int(MayLinearVectorize)  ? int(LinearVectorizedTraversal)
+                : int(MaySliceVectorize) ? int(SliceVectorizedTraversal)
+                : int(MayLinearize)      ? int(LinearTraversal)
+                                         : int(DefaultTraversal)
   };
 
-public:
+ public:
   enum {
-    Cost = (  Derived::SizeAtCompileTime == Dynamic
-           || Derived::CoeffReadCost == Dynamic
-           || (Derived::SizeAtCompileTime!=1 && functor_traits<Func>::Cost == Dynamic)
-           ) ? Dynamic
-           : Derived::SizeAtCompileTime * Derived::CoeffReadCost
-               + (Derived::SizeAtCompileTime-1) * functor_traits<Func>::Cost,
+    Cost = Evaluator::SizeAtCompileTime == Dynamic
+               ? HugeCost
+               : int(Evaluator::SizeAtCompileTime) * int(Evaluator::CoeffReadCost) +
+                     (Evaluator::SizeAtCompileTime - 1) * functor_traits<Func>::Cost,
     UnrollingLimit = EIGEN_UNROLLING_LIMIT * (int(Traversal) == int(DefaultTraversal) ? 1 : int(PacketSize))
   };
 
-public:
-  enum {
-    Unrolling = Cost != Dynamic && Cost <= UnrollingLimit
-              ? CompleteUnrolling
-              : NoUnrolling
-  };
+ public:
+  enum { Unrolling = Cost <= UnrollingLimit ? CompleteUnrolling : NoUnrolling };
+
+#ifdef EIGEN_DEBUG_ASSIGN
+  static void debug() {
+    std::cerr << "Xpr: " << typeid(typename Evaluator::XprType).name() << std::endl;
+    std::cerr.setf(std::ios::hex, std::ios::basefield);
+    EIGEN_DEBUG_VAR(Evaluator::Flags)
+    std::cerr.unsetf(std::ios::hex);
+    EIGEN_DEBUG_VAR(InnerMaxSize)
+    EIGEN_DEBUG_VAR(OuterMaxSize)
+    EIGEN_DEBUG_VAR(SliceVectorizedWork)
+    EIGEN_DEBUG_VAR(PacketSize)
+    EIGEN_DEBUG_VAR(MightVectorize)
+    EIGEN_DEBUG_VAR(MayLinearVectorize)
+    EIGEN_DEBUG_VAR(MaySliceVectorize)
+    std::cerr << "Traversal"
+              << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
+    EIGEN_DEBUG_VAR(UnrollingLimit)
+    std::cerr << "Unrolling"
+              << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl;
+    std::cerr << std::endl;
+  }
+#endif
 };
 
 /***************************************************************************
-* Part 2 : unrollers
-***************************************************************************/
+ * Part 2 : unrollers
+ ***************************************************************************/
 
 /*** no vectorization ***/
 
-template<typename Func, typename Derived, int Start, int Length>
-struct redux_novec_unroller
-{
-  enum {
-    HalfLength = Length/2
-  };
+template <typename Func, typename Evaluator, Index Start, Index Length>
+struct redux_novec_unroller {
+  static constexpr Index HalfLength = Length / 2;
 
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
 
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func& func)
-  {
-    return func(redux_novec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
-                redux_novec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func));
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval, const Func& func) {
+    return func(redux_novec_unroller<Func, Evaluator, Start, HalfLength>::run(eval, func),
+                redux_novec_unroller<Func, Evaluator, Start + HalfLength, Length - HalfLength>::run(eval, func));
   }
 };
 
-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 1>
-{
-  enum {
-    outer = Start / Derived::InnerSizeAtCompileTime,
-    inner = Start % Derived::InnerSizeAtCompileTime
-  };
+template <typename Func, typename Evaluator, Index Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 1> {
+  static constexpr Index outer = Start / Evaluator::InnerSizeAtCompileTime;
+  static constexpr Index inner = Start % Evaluator::InnerSizeAtCompileTime;
 
-  typedef typename Derived::Scalar Scalar;
+  typedef typename Evaluator::Scalar Scalar;
 
-  static EIGEN_STRONG_INLINE Scalar run(const Derived &mat, const Func&)
-  {
-    return mat.coeffByOuterInner(outer, inner);
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval, const Func&) {
+    return eval.coeffByOuterInner(outer, inner);
   }
 };
 
 // This is actually dead code and will never be called. It is required
 // to prevent false warnings regarding failed inlining though
 // for 0 length run() will never be called at all.
-template<typename Func, typename Derived, int Start>
-struct redux_novec_unroller<Func, Derived, Start, 0>
-{
-  typedef typename Derived::Scalar Scalar;
-  static EIGEN_STRONG_INLINE Scalar run(const Derived&, const Func&) { return Scalar(); }
+template <typename Func, typename Evaluator, Index Start>
+struct redux_novec_unroller<Func, Evaluator, Start, 0> {
+  typedef typename Evaluator::Scalar Scalar;
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
 };
 
-/*** vectorization ***/
+template <typename Func, typename Evaluator, Index Start, Index Length>
+struct redux_novec_linear_unroller {
+  static constexpr Index HalfLength = Length / 2;
 
-template<typename Func, typename Derived, int Start, int Length>
-struct redux_vec_unroller
-{
-  enum {
-    PacketSize = packet_traits<typename Derived::Scalar>::size,
-    HalfLength = Length/2
-  };
+  typedef typename Evaluator::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval, const Func& func) {
+    return func(redux_novec_linear_unroller<Func, Evaluator, Start, HalfLength>::run(eval, func),
+                redux_novec_linear_unroller<Func, Evaluator, Start + HalfLength, Length - HalfLength>::run(eval, func));
+  }
+};
+
+template <typename Func, typename Evaluator, Index Start>
+struct redux_novec_linear_unroller<Func, Evaluator, Start, 1> {
+  typedef typename Evaluator::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval, const Func&) {
+    return eval.coeff(Start);
+  }
+};
+
+// This is actually dead code and will never be called. It is required
+// to prevent false warnings regarding failed inlining though
+// for 0 length run() will never be called at all.
+template <typename Func, typename Evaluator, Index Start>
+struct redux_novec_linear_unroller<Func, Evaluator, Start, 0> {
+  typedef typename Evaluator::Scalar Scalar;
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Evaluator&, const Func&) { return Scalar(); }
+};
+
+/*** vectorization ***/
 
-  typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
+template <typename Func, typename Evaluator, Index Start, Index Length>
+struct redux_vec_unroller {
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE PacketType run(const Evaluator& eval, const Func& func) {
+    constexpr Index HalfLength = Length / 2;
 
-  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func& func)
-  {
     return func.packetOp(
-            redux_vec_unroller<Func, Derived, Start, HalfLength>::run(mat,func),
-            redux_vec_unroller<Func, Derived, Start+HalfLength, Length-HalfLength>::run(mat,func) );
+        redux_vec_unroller<Func, Evaluator, Start, HalfLength>::template run<PacketType>(eval, func),
+        redux_vec_unroller<Func, Evaluator, Start + HalfLength, Length - HalfLength>::template run<PacketType>(eval,
+                                                                                                               func));
   }
 };
 
-template<typename Func, typename Derived, int Start>
-struct redux_vec_unroller<Func, Derived, Start, 1>
-{
-  enum {
-    index = Start * packet_traits<typename Derived::Scalar>::size,
-    outer = index / int(Derived::InnerSizeAtCompileTime),
-    inner = index % int(Derived::InnerSizeAtCompileTime),
-    alignment = (Derived::Flags & AlignedBit) ? Aligned : Unaligned
-  };
+template <typename Func, typename Evaluator, Index Start>
+struct redux_vec_unroller<Func, Evaluator, Start, 1> {
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE PacketType run(const Evaluator& eval, const Func&) {
+    constexpr Index PacketSize = unpacket_traits<PacketType>::size;
+    constexpr Index index = Start * PacketSize;
+    constexpr Index outer = index / int(Evaluator::InnerSizeAtCompileTime);
+    constexpr Index inner = index % int(Evaluator::InnerSizeAtCompileTime);
+    constexpr int alignment = Evaluator::Alignment;
+
+    return eval.template packetByOuterInner<alignment, PacketType>(outer, inner);
+  }
+};
 
-  typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
+template <typename Func, typename Evaluator, Index Start, Index Length>
+struct redux_vec_linear_unroller {
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE PacketType run(const Evaluator& eval, const Func& func) {
+    constexpr Index HalfLength = Length / 2;
 
-  static EIGEN_STRONG_INLINE PacketScalar run(const Derived &mat, const Func&)
-  {
-    return mat.template packetByOuterInner<alignment>(outer, inner);
+    return func.packetOp(
+        redux_vec_linear_unroller<Func, Evaluator, Start, HalfLength>::template run<PacketType>(eval, func),
+        redux_vec_linear_unroller<Func, Evaluator, Start + HalfLength, Length - HalfLength>::template run<PacketType>(
+            eval, func));
+  }
+};
+
+template <typename Func, typename Evaluator, Index Start>
+struct redux_vec_linear_unroller<Func, Evaluator, Start, 1> {
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE PacketType run(const Evaluator& eval, const Func&) {
+    constexpr Index PacketSize = unpacket_traits<PacketType>::size;
+    constexpr Index index = (Start * PacketSize);
+    constexpr int alignment = Evaluator::Alignment;
+    return eval.template packet<alignment, PacketType>(index);
   }
 };
 
 /***************************************************************************
-* Part 3 : implementation of all cases
-***************************************************************************/
+ * Part 3 : implementation of all cases
+ ***************************************************************************/
 
-template<typename Func, typename Derived,
-         int Traversal = redux_traits<Func, Derived>::Traversal,
-         int Unrolling = redux_traits<Func, Derived>::Unrolling
->
+template <typename Func, typename Evaluator, int Traversal = redux_traits<Func, Evaluator>::Traversal,
+          int Unrolling = redux_traits<Func, Evaluator>::Unrolling>
 struct redux_impl;
 
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>
-{
-  typedef typename Derived::Scalar Scalar;
-  typedef typename Derived::Index Index;
-  static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
-  {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
-    Scalar res;
-    res = mat.coeffByOuterInner(0, 0);
-    for(Index i = 1; i < mat.innerSize(); ++i)
-      res = func(res, mat.coeffByOuterInner(0, i));
-    for(Index i = 1; i < mat.outerSize(); ++i)
-      for(Index j = 0; j < mat.innerSize(); ++j)
-        res = func(res, mat.coeffByOuterInner(i, j));
+template <typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling> {
+  typedef typename Evaluator::Scalar Scalar;
+
+  template <typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval, const Func& func, const XprType& xpr) {
+    eigen_assert(xpr.rows() > 0 && xpr.cols() > 0 && "you are using an empty matrix");
+    Scalar res = eval.coeffByOuterInner(0, 0);
+    for (Index i = 1; i < xpr.innerSize(); ++i) res = func(res, eval.coeffByOuterInner(0, i));
+    for (Index i = 1; i < xpr.outerSize(); ++i)
+      for (Index j = 0; j < xpr.innerSize(); ++j) res = func(res, eval.coeffByOuterInner(i, j));
+    return res;
+  }
+};
+
+template <typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearTraversal, NoUnrolling> {
+  typedef typename Evaluator::Scalar Scalar;
+
+  template <typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval, const Func& func, const XprType& xpr) {
+    eigen_assert(xpr.size() > 0 && "you are using an empty matrix");
+    Scalar res = eval.coeff(0);
+    for (Index k = 1; k < xpr.size(); ++k) res = func(res, eval.coeff(k));
     return res;
   }
 };
 
-template<typename Func, typename Derived>
-struct redux_impl<Func,Derived, DefaultTraversal, CompleteUnrolling>
-  : public redux_novec_unroller<Func,Derived, 0, Derived::SizeAtCompileTime>
-{};
-
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
-{
-  typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
-  typedef typename Derived::Index Index;
-
-  static Scalar run(const Derived& mat, const Func& func)
-  {
-    const Index size = mat.size();
-    eigen_assert(size && "you are using an empty matrix");
-    const Index packetSize = packet_traits<Scalar>::size;
-    const Index alignedStart = internal::first_aligned(mat);
-    enum {
-      alignment = bool(Derived::Flags & DirectAccessBit) || bool(Derived::Flags & AlignedBit)
-                ? Aligned : Unaligned
-    };
-    const Index alignedSize2 = ((size-alignedStart)/(2*packetSize))*(2*packetSize);
-    const Index alignedSize = ((size-alignedStart)/(packetSize))*(packetSize);
+template <typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, DefaultTraversal, CompleteUnrolling>
+    : redux_novec_unroller<Func, Evaluator, 0, Evaluator::SizeAtCompileTime> {
+  typedef redux_novec_unroller<Func, Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
+  typedef typename Evaluator::Scalar Scalar;
+  template <typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval, const Func& func,
+                                                          const XprType& /*xpr*/) {
+    return Base::run(eval, func);
+  }
+};
+
+template <typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearTraversal, CompleteUnrolling>
+    : redux_novec_linear_unroller<Func, Evaluator, 0, Evaluator::SizeAtCompileTime> {
+  typedef redux_novec_linear_unroller<Func, Evaluator, 0, Evaluator::SizeAtCompileTime> Base;
+  typedef typename Evaluator::Scalar Scalar;
+  template <typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval, const Func& func,
+                                                          const XprType& /*xpr*/) {
+    return Base::run(eval, func);
+  }
+};
+
+template <typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, NoUnrolling> {
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketScalar;
+
+  template <typename XprType>
+  static Scalar run(const Evaluator& eval, const Func& func, const XprType& xpr) {
+    const Index size = xpr.size();
+
+    constexpr Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
+    constexpr int packetAlignment = unpacket_traits<PacketScalar>::alignment;
+    constexpr int alignment0 =
+        (bool(Evaluator::Flags & DirectAccessBit) && bool(packet_traits<Scalar>::AlignedOnScalar))
+            ? int(packetAlignment)
+            : int(Unaligned);
+    constexpr int alignment = plain_enum_max(alignment0, Evaluator::Alignment);
+    const Index alignedStart = internal::first_default_aligned(xpr);
+    const Index alignedSize2 = ((size - alignedStart) / (2 * packetSize)) * (2 * packetSize);
+    const Index alignedSize = ((size - alignedStart) / (packetSize)) * (packetSize);
     const Index alignedEnd2 = alignedStart + alignedSize2;
-    const Index alignedEnd  = alignedStart + alignedSize;
+    const Index alignedEnd = alignedStart + alignedSize;
     Scalar res;
-    if(alignedSize)
-    {
-      PacketScalar packet_res0 = mat.template packet<alignment>(alignedStart);
-      if(alignedSize>packetSize) // we have at least two packets to partly unroll the loop
+    if (alignedSize) {
+      PacketScalar packet_res0 = eval.template packet<alignment, PacketScalar>(alignedStart);
+      if (alignedSize > packetSize)  // we have at least two packets to partly unroll the loop
       {
-        PacketScalar packet_res1 = mat.template packet<alignment>(alignedStart+packetSize);
-        for(Index index = alignedStart + 2*packetSize; index < alignedEnd2; index += 2*packetSize)
-        {
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment>(index));
-          packet_res1 = func.packetOp(packet_res1, mat.template packet<alignment>(index+packetSize));
+        PacketScalar packet_res1 = eval.template packet<alignment, PacketScalar>(alignedStart + packetSize);
+        for (Index index = alignedStart + 2 * packetSize; index < alignedEnd2; index += 2 * packetSize) {
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment, PacketScalar>(index));
+          packet_res1 = func.packetOp(packet_res1, eval.template packet<alignment, PacketScalar>(index + packetSize));
         }
 
-        packet_res0 = func.packetOp(packet_res0,packet_res1);
-        if(alignedEnd>alignedEnd2)
-          packet_res0 = func.packetOp(packet_res0, mat.template packet<alignment>(alignedEnd2));
+        packet_res0 = func.packetOp(packet_res0, packet_res1);
+        if (alignedEnd > alignedEnd2)
+          packet_res0 = func.packetOp(packet_res0, eval.template packet<alignment, PacketScalar>(alignedEnd2));
       }
       res = func.predux(packet_res0);
 
-      for(Index index = 0; index < alignedStart; ++index)
-        res = func(res,mat.coeff(index));
+      for (Index index = 0; index < alignedStart; ++index) res = func(res, eval.coeff(index));
 
-      for(Index index = alignedEnd; index < size; ++index)
-        res = func(res,mat.coeff(index));
-    }
-    else // too small to vectorize anything.
-         // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
+      for (Index index = alignedEnd; index < size; ++index) res = func(res, eval.coeff(index));
+    } else  // too small to vectorize anything.
+            // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
     {
-      res = mat.coeff(0);
-      for(Index index = 1; index < size; ++index)
-        res = func(res,mat.coeff(index));
+      res = eval.coeff(0);
+      for (Index index = 1; index < size; ++index) res = func(res, eval.coeff(index));
     }
 
     return res;
@@ -248,162 +324,212 @@ struct redux_impl<Func, Derived, LinearVectorizedTraversal, NoUnrolling>
 };
 
 // NOTE: for SliceVectorizedTraversal we simply bypass unrolling
-template<typename Func, typename Derived, int Unrolling>
-struct redux_impl<Func, Derived, SliceVectorizedTraversal, Unrolling>
-{
-  typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
-  typedef typename Derived::Index Index;
-
-  static Scalar run(const Derived& mat, const Func& func)
-  {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
-    const Index innerSize = mat.innerSize();
-    const Index outerSize = mat.outerSize();
-    enum {
-      packetSize = packet_traits<Scalar>::size
-    };
-    const Index packetedInnerSize = ((innerSize)/packetSize)*packetSize;
+template <typename Func, typename Evaluator, int Unrolling>
+struct redux_impl<Func, Evaluator, SliceVectorizedTraversal, Unrolling> {
+  typedef typename Evaluator::Scalar Scalar;
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
+
+  template <typename XprType>
+  EIGEN_DEVICE_FUNC static Scalar run(const Evaluator& eval, const Func& func, const XprType& xpr) {
+    eigen_assert(xpr.rows() > 0 && xpr.cols() > 0 && "you are using an empty matrix");
+    constexpr Index packetSize = redux_traits<Func, Evaluator>::PacketSize;
+    const Index innerSize = xpr.innerSize();
+    const Index outerSize = xpr.outerSize();
+    const Index packetedInnerSize = ((innerSize) / packetSize) * packetSize;
     Scalar res;
-    if(packetedInnerSize)
-    {
-      PacketScalar packet_res = mat.template packet<Unaligned>(0,0);
-      for(Index j=0; j<outerSize; ++j)
-        for(Index i=(j==0?packetSize:0); i<packetedInnerSize; i+=Index(packetSize))
-          packet_res = func.packetOp(packet_res, mat.template packetByOuterInner<Unaligned>(j,i));
+    if (packetedInnerSize) {
+      PacketType packet_res = eval.template packet<Unaligned, PacketType>(0, 0);
+      for (Index j = 0; j < outerSize; ++j)
+        for (Index i = (j == 0 ? packetSize : 0); i < packetedInnerSize; i += Index(packetSize))
+          packet_res = func.packetOp(packet_res, eval.template packetByOuterInner<Unaligned, PacketType>(j, i));
 
       res = func.predux(packet_res);
-      for(Index j=0; j<outerSize; ++j)
-        for(Index i=packetedInnerSize; i<innerSize; ++i)
-          res = func(res, mat.coeffByOuterInner(j,i));
-    }
-    else // too small to vectorize anything.
-         // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
+      for (Index j = 0; j < outerSize; ++j)
+        for (Index i = packetedInnerSize; i < innerSize; ++i) res = func(res, eval.coeffByOuterInner(j, i));
+    } else  // too small to vectorize anything.
+            // since this is dynamic-size hence inefficient anyway for such small sizes, don't try to optimize.
     {
-      res = redux_impl<Func, Derived, DefaultTraversal, NoUnrolling>::run(mat, func);
+      res = redux_impl<Func, Evaluator, DefaultTraversal, NoUnrolling>::run(eval, func, xpr);
     }
 
     return res;
   }
 };
 
-template<typename Func, typename Derived>
-struct redux_impl<Func, Derived, LinearVectorizedTraversal, CompleteUnrolling>
-{
-  typedef typename Derived::Scalar Scalar;
-  typedef typename packet_traits<Scalar>::type PacketScalar;
+template <typename Func, typename Evaluator>
+struct redux_impl<Func, Evaluator, LinearVectorizedTraversal, CompleteUnrolling> {
+  typedef typename Evaluator::Scalar Scalar;
+
+  typedef typename redux_traits<Func, Evaluator>::PacketType PacketType;
+  static constexpr Index PacketSize = redux_traits<Func, Evaluator>::PacketSize;
+  static constexpr Index Size = Evaluator::SizeAtCompileTime;
+  static constexpr Index VectorizedSize = (int(Size) / int(PacketSize)) * int(PacketSize);
+
+  template <typename XprType>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Evaluator& eval, const Func& func, const XprType& xpr) {
+    EIGEN_ONLY_USED_FOR_DEBUG(xpr)
+    eigen_assert(xpr.rows() > 0 && xpr.cols() > 0 && "you are using an empty matrix");
+    if (VectorizedSize > 0) {
+      Scalar res = func.predux(
+          redux_vec_linear_unroller<Func, Evaluator, 0, Size / PacketSize>::template run<PacketType>(eval, func));
+      if (VectorizedSize != Size)
+        res = func(
+            res, redux_novec_linear_unroller<Func, Evaluator, VectorizedSize, Size - VectorizedSize>::run(eval, func));
+      return res;
+    } else {
+      return redux_novec_linear_unroller<Func, Evaluator, 0, Size>::run(eval, func);
+    }
+  }
+};
+
+// evaluator adaptor
+template <typename XprType_>
+class redux_evaluator : public internal::evaluator<XprType_> {
+  typedef internal::evaluator<XprType_> Base;
+
+ public:
+  typedef XprType_ XprType;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit redux_evaluator(const XprType& xpr) : Base(xpr) {}
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename XprType::PacketScalar PacketScalar;
+
   enum {
-    PacketSize = packet_traits<Scalar>::size,
-    Size = Derived::SizeAtCompileTime,
-    VectorizedSize = (Size / PacketSize) * PacketSize
+    MaxRowsAtCompileTime = XprType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = XprType::MaxColsAtCompileTime,
+    // TODO we should not remove DirectAccessBit and rather find an elegant way to query the alignment offset at runtime
+    // from the evaluator
+    Flags = Base::Flags & ~DirectAccessBit,
+    IsRowMajor = XprType::IsRowMajor,
+    SizeAtCompileTime = XprType::SizeAtCompileTime,
+    InnerSizeAtCompileTime = XprType::InnerSizeAtCompileTime
   };
-  static EIGEN_STRONG_INLINE Scalar run(const Derived& mat, const Func& func)
-  {
-    eigen_assert(mat.rows()>0 && mat.cols()>0 && "you are using an empty matrix");
-    Scalar res = func.predux(redux_vec_unroller<Func, Derived, 0, Size / PacketSize>::run(mat,func));
-    if (VectorizedSize != Size)
-      res = func(res,redux_novec_unroller<Func, Derived, VectorizedSize, Size-VectorizedSize>::run(mat,func));
-    return res;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffByOuterInner(Index outer, Index inner) const {
+    return Base::coeff(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetByOuterInner(Index outer, Index inner) const {
+    return Base::template packet<LoadMode, PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketType packetSegmentByOuterInner(Index outer, Index inner, Index begin,
+                                                                             Index count) const {
+    return Base::template packetSegment<LoadMode, PacketType>(IsRowMajor ? outer : inner, IsRowMajor ? inner : outer,
+                                                              begin, count);
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
 /***************************************************************************
-* Part 4 : public API
-***************************************************************************/
-
+ * Part 4 : public API
+ ***************************************************************************/
 
 /** \returns the result of a full redux operation on the whole matrix or vector using \a func
-  *
-  * The template parameter \a BinaryOp is the type of the functor \a func which must be
-  * an associative operator. Both current STL and TR1 functor styles are handled.
-  *
-  * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise()
-  */
-template<typename Derived>
-template<typename Func>
-EIGEN_STRONG_INLINE typename internal::result_of<Func(typename internal::traits<Derived>::Scalar)>::type
-DenseBase<Derived>::redux(const Func& func) const
-{
-  typedef typename internal::remove_all<typename Derived::Nested>::type ThisNested;
-  return internal::redux_impl<Func, ThisNested>
-            ::run(derived(), func);
+ *
+ * The template parameter \a BinaryOp is the type of the functor \a func which must be
+ * an associative operator. Both current C++98 and C++11 functor styles are handled.
+ *
+ * \warning the matrix must be not empty, otherwise an assertion is triggered.
+ *
+ * \sa DenseBase::sum(), DenseBase::minCoeff(), DenseBase::maxCoeff(), MatrixBase::colwise(), MatrixBase::rowwise()
+ */
+template <typename Derived>
+template <typename Func>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar DenseBase<Derived>::redux(
+    const Func& func) const {
+  eigen_assert(this->rows() > 0 && this->cols() > 0 && "you are using an empty matrix");
+
+  typedef typename internal::redux_evaluator<Derived> ThisEvaluator;
+  ThisEvaluator thisEval(derived());
+
+  // The initial expression is passed to the reducer as an additional argument instead of
+  // passing it as a member of redux_evaluator to help
+  return internal::redux_impl<Func, ThisEvaluator>::run(thisEval, func, derived());
 }
 
 /** \returns the minimum of all coefficients of \c *this.
-  * \warning the result is undefined if \c *this contains NaN.
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
-DenseBase<Derived>::minCoeff() const
-{
-  return this->redux(Eigen::internal::scalar_min_op<Scalar>());
+ * In case \c *this contains NaN, NaNPropagation determines the behavior:
+ *   NaNPropagation == PropagateFast : undefined
+ *   NaNPropagation == PropagateNaN : result is NaN
+ *   NaNPropagation == PropagateNumbers : result is minimum of elements that are not NaN
+ * \warning the matrix must be not empty, otherwise an assertion is triggered.
+ */
+template <typename Derived>
+template <int NaNPropagation>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar DenseBase<Derived>::minCoeff() const {
+  return derived().redux(Eigen::internal::scalar_min_op<Scalar, Scalar, NaNPropagation>());
 }
 
 /** \returns the maximum of all coefficients of \c *this.
-  * \warning the result is undefined if \c *this contains NaN.
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
-DenseBase<Derived>::maxCoeff() const
-{
-  return this->redux(Eigen::internal::scalar_max_op<Scalar>());
+ * In case \c *this contains NaN, NaNPropagation determines the behavior:
+ *   NaNPropagation == PropagateFast : undefined
+ *   NaNPropagation == PropagateNaN : result is NaN
+ *   NaNPropagation == PropagateNumbers : result is maximum of elements that are not NaN
+ * \warning the matrix must be not empty, otherwise an assertion is triggered.
+ */
+template <typename Derived>
+template <int NaNPropagation>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar DenseBase<Derived>::maxCoeff() const {
+  return derived().redux(Eigen::internal::scalar_max_op<Scalar, Scalar, NaNPropagation>());
 }
 
-/** \returns the sum of all coefficients of *this
-  *
-  * \sa trace(), prod(), mean()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
-DenseBase<Derived>::sum() const
-{
-  if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
-    return Scalar(0);
-  return this->redux(Eigen::internal::scalar_sum_op<Scalar>());
+/** \returns the sum of all coefficients of \c *this
+ *
+ * If \c *this is empty, then the value 0 is returned.
+ *
+ * \sa trace(), prod(), mean()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar DenseBase<Derived>::sum() const {
+  if (SizeAtCompileTime == 0 || (SizeAtCompileTime == Dynamic && size() == 0)) return Scalar(0);
+  return derived().redux(Eigen::internal::scalar_sum_op<Scalar, Scalar>());
 }
 
 /** \returns the mean of all coefficients of *this
-*
-* \sa trace(), prod(), sum()
-*/
-template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
-DenseBase<Derived>::mean() const
-{
-  return Scalar(this->redux(Eigen::internal::scalar_sum_op<Scalar>())) / Scalar(this->size());
+ *
+ * \sa trace(), prod(), sum()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar DenseBase<Derived>::mean() const {
+#ifdef __INTEL_COMPILER
+#pragma warning push
+#pragma warning(disable : 2259)
+#endif
+  return Scalar(derived().redux(Eigen::internal::scalar_sum_op<Scalar, Scalar>())) / Scalar(this->size());
+#ifdef __INTEL_COMPILER
+#pragma warning pop
+#endif
 }
 
 /** \returns the product of all coefficients of *this
-  *
-  * Example: \include MatrixBase_prod.cpp
-  * Output: \verbinclude MatrixBase_prod.out
-  *
-  * \sa sum(), mean(), trace()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
-DenseBase<Derived>::prod() const
-{
-  if(SizeAtCompileTime==0 || (SizeAtCompileTime==Dynamic && size()==0))
-    return Scalar(1);
-  return this->redux(Eigen::internal::scalar_product_op<Scalar>());
+ *
+ * Example: \include MatrixBase_prod.cpp
+ * Output: \verbinclude MatrixBase_prod.out
+ *
+ * \sa sum(), mean(), trace()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar DenseBase<Derived>::prod() const {
+  if (SizeAtCompileTime == 0 || (SizeAtCompileTime == Dynamic && size() == 0)) return Scalar(1);
+  return derived().redux(Eigen::internal::scalar_product_op<Scalar>());
 }
 
 /** \returns the trace of \c *this, i.e. the sum of the coefficients on the main diagonal.
-  *
-  * \c *this can be any matrix, not necessarily square.
-  *
-  * \sa diagonal(), sum()
-  */
-template<typename Derived>
-EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar
-MatrixBase<Derived>::trace() const
-{
+ *
+ * \c *this can be any matrix, not necessarily square.
+ *
+ * \sa diagonal(), sum()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::traits<Derived>::Scalar MatrixBase<Derived>::trace() const {
   return derived().diagonal().sum();
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_REDUX_H
+#endif  // EIGEN_REDUX_H
diff --git a/inst/include/Eigen/src/Core/Ref.h b/inst/include/Eigen/src/Core/Ref.h
index 7a3becaf..30ec277d 100644
--- a/inst/include/Eigen/src/Core/Ref.h
+++ b/inst/include/Eigen/src/Core/Ref.h
@@ -10,269 +10,374 @@
 #ifndef EIGEN_REF_H
 #define EIGEN_REF_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-template<typename Derived> class RefBase;
-template<typename PlainObjectType, int Options = 0,
-         typename StrideType = typename internal::conditional<PlainObjectType::IsVectorAtCompileTime,InnerStride<1>,OuterStride<> >::type > class Ref;
-
-/** \class Ref
-  * \ingroup Core_Module
-  *
-  * \brief A matrix or vector expression mapping an existing expressions
-  *
-  * \tparam PlainObjectType the equivalent matrix type of the mapped data
-  * \tparam Options specifies whether the pointer is \c #Aligned, or \c #Unaligned.
-  *                The default is \c #Unaligned.
-  * \tparam StrideType optionally specifies strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1),
-  *                   but accept a variable outer stride (leading dimension).
-  *                   This can be overridden by specifying strides.
-  *                   The type passed here must be a specialization of the Stride template, see examples below.
-  *
-  * This class permits to write non template functions taking Eigen's object as parameters while limiting the number of copies.
-  * A Ref<> object can represent either a const expression or a l-value:
-  * \code
-  * // in-out argument:
-  * void foo1(Ref<VectorXf> x);
-  *
-  * // read-only const argument:
-  * void foo2(const Ref<const VectorXf>& x);
-  * \endcode
-  *
-  * In the in-out case, the input argument must satisfies the constraints of the actual Ref<> type, otherwise a compilation issue will be triggered.
-  * By default, a Ref<VectorXf> can reference any dense vector expression of float having a contiguous memory layout.
-  * Likewise, a Ref<MatrixXf> can reference any column major dense matrix expression of float whose column's elements are contiguously stored with
-  * the possibility to have a constant space inbetween each column, i.e.: the inner stride mmust be equal to 1, but the outer-stride (or leading dimension),
-  * can be greater than the number of rows.
-  *
-  * In the const case, if the input expression does not match the above requirement, then it is evaluated into a temporary before being passed to the function.
-  * Here are some examples:
-  * \code
-  * MatrixXf A;
-  * VectorXf a;
-  * foo1(a.head());             // OK
-  * foo1(A.col());              // OK
-  * foo1(A.row());              // compilation error because here innerstride!=1
-  * foo2(A.row());              // The row is copied into a contiguous temporary
-  * foo2(2*a);                  // The expression is evaluated into a temporary
-  * foo2(A.col().segment(2,4)); // No temporary
-  * \endcode
-  *
-  * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameter.
-  * Here is an example accepting an innerstride!=1:
-  * \code
-  * // in-out argument:
-  * void foo3(Ref<VectorXf,0,InnerStride<> > x);
-  * foo3(A.row());              // OK
-  * \endcode
-  * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to exploit vectorization, and will involved more
-  * expensive address computations even if the input is contiguously stored in memory. To overcome this issue, one might propose to overloads internally calling a
-  * template function, e.g.:
-  * \code
-  * // in the .h:
-  * void foo(const Ref<MatrixXf>& A);
-  * void foo(const Ref<MatrixXf,0,Stride<> >& A);
-  *
-  * // in the .cpp:
-  * template<typename TypeOfA> void foo_impl(const TypeOfA& A) {
-  *     ... // crazy code goes here
-  * }
-  * void foo(const Ref<MatrixXf>& A) { foo_impl(A); }
-  * void foo(const Ref<MatrixXf,0,Stride<> >& A) { foo_impl(A); }
-  * \endcode
-  *
-  *
-  * \sa PlainObjectBase::Map(), \ref TopicStorageOrders
-  */
+namespace Eigen {
 
 namespace internal {
 
-template<typename _PlainObjectType, int _Options, typename _StrideType>
-struct traits<Ref<_PlainObjectType, _Options, _StrideType> >
-  : public traits<Map<_PlainObjectType, _Options, _StrideType> >
-{
-  typedef _PlainObjectType PlainObjectType;
-  typedef _StrideType StrideType;
+template <typename PlainObjectType_, int Options_, typename StrideType_>
+struct traits<Ref<PlainObjectType_, Options_, StrideType_> >
+    : public traits<Map<PlainObjectType_, Options_, StrideType_> > {
+  typedef PlainObjectType_ PlainObjectType;
+  typedef StrideType_ StrideType;
   enum {
-    Options = _Options,
-    Flags = traits<Map<_PlainObjectType, _Options, _StrideType> >::Flags | NestByRefBit
+    Options = Options_,
+    Flags = traits<Map<PlainObjectType_, Options_, StrideType_> >::Flags | NestByRefBit,
+    Alignment = traits<Map<PlainObjectType_, Options_, StrideType_> >::Alignment,
+    InnerStrideAtCompileTime = traits<Map<PlainObjectType_, Options_, StrideType_> >::InnerStrideAtCompileTime,
+    OuterStrideAtCompileTime = traits<Map<PlainObjectType_, Options_, StrideType_> >::OuterStrideAtCompileTime
   };
 
-  template<typename Derived> struct match {
+  template <typename Derived>
+  struct match {
     enum {
+      IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime,
       HasDirectAccess = internal::has_direct_access<Derived>::ret,
-      StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)),
-      InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic)
-                      || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime)
-                      || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1),
-      OuterStrideMatch = Derived::IsVectorAtCompileTime
-                      || int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime),
-      AlignmentMatch = (_Options!=Aligned) || ((PlainObjectType::Flags&AlignedBit)==0) || ((traits<Derived>::Flags&AlignedBit)==AlignedBit),
+      StorageOrderMatch =
+          IsVectorAtCompileTime || ((PlainObjectType::Flags & RowMajorBit) == (Derived::Flags & RowMajorBit)),
+      InnerStrideMatch = int(InnerStrideAtCompileTime) == int(Dynamic) ||
+                         int(InnerStrideAtCompileTime) == int(Derived::InnerStrideAtCompileTime) ||
+                         (int(InnerStrideAtCompileTime) == 0 && int(Derived::InnerStrideAtCompileTime) == 1),
+      OuterStrideMatch = IsVectorAtCompileTime || int(OuterStrideAtCompileTime) == int(Dynamic) ||
+                         int(OuterStrideAtCompileTime) == int(Derived::OuterStrideAtCompileTime),
+      // NOTE, this indirection of evaluator<Derived>::Alignment is needed
+      // to workaround a very strange bug in MSVC related to the instantiation
+      // of has_*ary_operator in evaluator<CwiseNullaryOp>.
+      // This line is surprisingly very sensitive. For instance, simply adding parenthesis
+      // as "DerivedAlignment = (int(evaluator<Derived>::Alignment))," will make MSVC fail...
+      DerivedAlignment = int(evaluator<Derived>::Alignment),
+      AlignmentMatch = (int(traits<PlainObjectType>::Alignment) == int(Unaligned)) ||
+                       (DerivedAlignment >= int(Alignment)),  // FIXME the first condition is not very clear, it should
+                                                              // be replaced by the required alignment
       ScalarTypeMatch = internal::is_same<typename PlainObjectType::Scalar, typename Derived::Scalar>::value,
-      MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch && AlignmentMatch && ScalarTypeMatch
+      MatchAtCompileTime = HasDirectAccess && StorageOrderMatch && InnerStrideMatch && OuterStrideMatch &&
+                           AlignmentMatch && ScalarTypeMatch
     };
-    typedef typename internal::conditional<MatchAtCompileTime,internal::true_type,internal::false_type>::type type;
+    typedef std::conditional_t<MatchAtCompileTime, internal::true_type, internal::false_type> type;
   };
-  
 };
 
-template<typename Derived>
+template <typename Derived>
 struct traits<RefBase<Derived> > : public traits<Derived> {};
 
-}
+}  // namespace internal
 
-template<typename Derived> class RefBase
- : public MapBase<Derived>
-{
+template <typename Derived>
+class RefBase : public MapBase<Derived> {
   typedef typename internal::traits<Derived>::PlainObjectType PlainObjectType;
   typedef typename internal::traits<Derived>::StrideType StrideType;
 
-public:
-
+ public:
   typedef MapBase<Derived> Base;
   EIGEN_DENSE_PUBLIC_INTERFACE(RefBase)
 
-  inline Index innerStride() const
-  {
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const {
     return StrideType::InnerStrideAtCompileTime != 0 ? m_stride.inner() : 1;
   }
 
-  inline Index outerStride() const
-  {
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const {
     return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer()
-         : IsVectorAtCompileTime ? this->size()
-         : int(Flags)&RowMajorBit ? this->cols()
-         : this->rows();
+           : IsVectorAtCompileTime                   ? this->size()
+           : int(Flags) & RowMajorBit                ? this->cols()
+                                                     : this->rows();
   }
 
-  RefBase()
-    : Base(0,RowsAtCompileTime==Dynamic?0:RowsAtCompileTime,ColsAtCompileTime==Dynamic?0:ColsAtCompileTime),
-      // Stride<> does not allow default ctor for Dynamic strides, so let' initialize it with dummy values:
-      m_stride(StrideType::OuterStrideAtCompileTime==Dynamic?0:StrideType::OuterStrideAtCompileTime,
-               StrideType::InnerStrideAtCompileTime==Dynamic?0:StrideType::InnerStrideAtCompileTime)
-  {}
-  
+  EIGEN_DEVICE_FUNC RefBase()
+      : Base(0, RowsAtCompileTime == Dynamic ? 0 : RowsAtCompileTime,
+             ColsAtCompileTime == Dynamic ? 0 : ColsAtCompileTime),
+        // Stride<> does not allow default ctor for Dynamic strides, so let' initialize it with dummy values:
+        m_stride(StrideType::OuterStrideAtCompileTime == Dynamic ? 0 : StrideType::OuterStrideAtCompileTime,
+                 StrideType::InnerStrideAtCompileTime == Dynamic ? 0 : StrideType::InnerStrideAtCompileTime) {}
+
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(RefBase)
 
-protected:
+ protected:
+  typedef Stride<StrideType::OuterStrideAtCompileTime, StrideType::InnerStrideAtCompileTime> StrideBase;
 
-  typedef Stride<StrideType::OuterStrideAtCompileTime,StrideType::InnerStrideAtCompileTime> StrideBase;
+  // Resolves inner stride if default 0.
+  static EIGEN_DEVICE_FUNC constexpr Index resolveInnerStride(Index inner) { return inner == 0 ? 1 : inner; }
 
-  template<typename Expression>
-  void construct(Expression& expr)
-  {
-    if(PlainObjectType::RowsAtCompileTime==1)
-    {
-      eigen_assert(expr.rows()==1 || expr.cols()==1);
-      ::new (static_cast<Base*>(this)) Base(expr.data(), 1, expr.size());
+  // Resolves outer stride if default 0.
+  static EIGEN_DEVICE_FUNC constexpr Index resolveOuterStride(Index inner, Index outer, Index rows, Index cols,
+                                                              bool isVectorAtCompileTime, bool isRowMajor) {
+    return outer == 0 ? isVectorAtCompileTime ? inner * rows * cols : isRowMajor ? inner * cols : inner * rows : outer;
+  }
+
+  // Returns true if construction is valid, false if there is a stride mismatch,
+  // and fails if there is a size mismatch.
+  template <typename Expression>
+  EIGEN_DEVICE_FUNC bool construct(Expression& expr) {
+    // Check matrix sizes.  If this is a compile-time vector, we do allow
+    // implicitly transposing.
+    EIGEN_STATIC_ASSERT(EIGEN_PREDICATE_SAME_MATRIX_SIZE(PlainObjectType, Expression)
+                            // If it is a vector, the transpose sizes might match.
+                            || (PlainObjectType::IsVectorAtCompileTime &&
+                                ((int(PlainObjectType::RowsAtCompileTime) == Eigen::Dynamic ||
+                                  int(Expression::ColsAtCompileTime) == Eigen::Dynamic ||
+                                  int(PlainObjectType::RowsAtCompileTime) == int(Expression::ColsAtCompileTime)) &&
+                                 (int(PlainObjectType::ColsAtCompileTime) == Eigen::Dynamic ||
+                                  int(Expression::RowsAtCompileTime) == Eigen::Dynamic ||
+                                  int(PlainObjectType::ColsAtCompileTime) == int(Expression::RowsAtCompileTime)))),
+                        YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES)
+
+    // Determine runtime rows and columns.
+    Index rows = expr.rows();
+    Index cols = expr.cols();
+    if (PlainObjectType::RowsAtCompileTime == 1) {
+      eigen_assert(expr.rows() == 1 || expr.cols() == 1);
+      rows = 1;
+      cols = expr.size();
+    } else if (PlainObjectType::ColsAtCompileTime == 1) {
+      eigen_assert(expr.rows() == 1 || expr.cols() == 1);
+      rows = expr.size();
+      cols = 1;
     }
-    else if(PlainObjectType::ColsAtCompileTime==1)
-    {
-      eigen_assert(expr.rows()==1 || expr.cols()==1);
-      ::new (static_cast<Base*>(this)) Base(expr.data(), expr.size(), 1);
+    // Verify that the sizes are valid.
+    eigen_assert((PlainObjectType::RowsAtCompileTime == Dynamic) || (PlainObjectType::RowsAtCompileTime == rows));
+    eigen_assert((PlainObjectType::ColsAtCompileTime == Dynamic) || (PlainObjectType::ColsAtCompileTime == cols));
+
+    // If this is a vector, we might be transposing, which means that stride should swap.
+    const bool transpose = PlainObjectType::IsVectorAtCompileTime && (rows != expr.rows());
+    // If the storage format differs, we also need to swap the stride.
+    const bool row_major = ((PlainObjectType::Flags)&RowMajorBit) != 0;
+    const bool expr_row_major = (Expression::Flags & RowMajorBit) != 0;
+    const bool storage_differs = (row_major != expr_row_major);
+
+    const bool swap_stride = (transpose != storage_differs);
+
+    // Determine expr's actual strides, resolving any defaults if zero.
+    const Index expr_inner_actual = resolveInnerStride(expr.innerStride());
+    const Index expr_outer_actual = resolveOuterStride(expr_inner_actual, expr.outerStride(), expr.rows(), expr.cols(),
+                                                       Expression::IsVectorAtCompileTime != 0, expr_row_major);
+
+    // If this is a column-major row vector or row-major column vector, the inner-stride
+    // is arbitrary, so set it to either the compile-time inner stride or 1.
+    const bool row_vector = (rows == 1);
+    const bool col_vector = (cols == 1);
+    const Index inner_stride =
+        ((!row_major && row_vector) || (row_major && col_vector))
+            ? (StrideType::InnerStrideAtCompileTime > 0 ? Index(StrideType::InnerStrideAtCompileTime) : 1)
+        : swap_stride ? expr_outer_actual
+                      : expr_inner_actual;
+
+    // If this is a column-major column vector or row-major row vector, the outer-stride
+    // is arbitrary, so set it to either the compile-time outer stride or vector size.
+    const Index outer_stride =
+        ((!row_major && col_vector) || (row_major && row_vector))
+            ? (StrideType::OuterStrideAtCompileTime > 0 ? Index(StrideType::OuterStrideAtCompileTime)
+                                                        : rows * cols * inner_stride)
+        : swap_stride ? expr_inner_actual
+                      : expr_outer_actual;
+
+    // Check if given inner/outer strides are compatible with compile-time strides.
+    const bool inner_valid = (StrideType::InnerStrideAtCompileTime == Dynamic) ||
+                             (resolveInnerStride(Index(StrideType::InnerStrideAtCompileTime)) == inner_stride);
+    if (!inner_valid) {
+      return false;
     }
-    else
-      ::new (static_cast<Base*>(this)) Base(expr.data(), expr.rows(), expr.cols());
-    
-    if(Expression::IsVectorAtCompileTime && (!PlainObjectType::IsVectorAtCompileTime) && ((Expression::Flags&RowMajorBit)!=(PlainObjectType::Flags&RowMajorBit)))
-      ::new (&m_stride) StrideBase(expr.innerStride(), StrideType::InnerStrideAtCompileTime==0?0:1);
-    else
-      ::new (&m_stride) StrideBase(StrideType::OuterStrideAtCompileTime==0?0:expr.outerStride(),
-                                   StrideType::InnerStrideAtCompileTime==0?0:expr.innerStride());    
+
+    const bool outer_valid =
+        (StrideType::OuterStrideAtCompileTime == Dynamic) ||
+        (resolveOuterStride(inner_stride, Index(StrideType::OuterStrideAtCompileTime), rows, cols,
+                            PlainObjectType::IsVectorAtCompileTime != 0, row_major) == outer_stride);
+    if (!outer_valid) {
+      return false;
+    }
+
+    internal::construct_at<Base>(this, expr.data(), rows, cols);
+    internal::construct_at(&m_stride, (StrideType::OuterStrideAtCompileTime == 0) ? 0 : outer_stride,
+                           (StrideType::InnerStrideAtCompileTime == 0) ? 0 : inner_stride);
+    return true;
   }
 
   StrideBase m_stride;
 };
 
+/** \class Ref
+ * \ingroup Core_Module
+ *
+ * \brief A matrix or vector expression mapping an existing expression
+ *
+ * \tparam PlainObjectType the equivalent matrix type of the mapped data
+ * \tparam Options specifies the pointer alignment in bytes. It can be: \c #Aligned128, , \c #Aligned64, \c #Aligned32,
+ * \c #Aligned16, \c #Aligned8 or \c #Unaligned. The default is \c #Unaligned. \tparam StrideType optionally specifies
+ * strides. By default, Ref implies a contiguous storage along the inner dimension (inner stride==1), but accepts a
+ * variable outer stride (leading dimension). This can be overridden by specifying strides. The type passed here must be
+ * a specialization of the Stride template, see examples below.
+ *
+ * This class provides a way to write non-template functions taking Eigen objects as parameters while limiting the
+ * number of copies. A Ref<> object can represent either a const expression or a l-value: \code
+ * // in-out argument:
+ * void foo1(Ref<VectorXf> x);
+ *
+ * // read-only const argument:
+ * void foo2(const Ref<const VectorXf>& x);
+ * \endcode
+ *
+ * In the in-out case, the input argument must satisfy the constraints of the actual Ref<> type, otherwise a compilation
+ * issue will be triggered. By default, a Ref<VectorXf> can reference any dense vector expression of float having a
+ * contiguous memory layout. Likewise, a Ref<MatrixXf> can reference any column-major dense matrix expression of float
+ * whose column's elements are contiguously stored with the possibility to have a constant space in-between each column,
+ * i.e. the inner stride must be equal to 1, but the outer stride (or leading dimension) can be greater than the number
+ * of rows.
+ *
+ * In the const case, if the input expression does not match the above requirement, then it is evaluated into a
+ * temporary before being passed to the function. Here are some examples: \code MatrixXf A; VectorXf a; foo1(a.head());
+ * // OK foo1(A.col());              // OK foo1(A.row());              // Compilation error because here innerstride!=1
+ * foo2(A.row());              // Compilation error because A.row() is a 1xN object while foo2 is expecting a Nx1 object
+ * foo2(A.row().transpose());  // The row is copied into a contiguous temporary
+ * foo2(2*a);                  // The expression is evaluated into a temporary
+ * foo2(A.col().segment(2,4)); // No temporary
+ * \endcode
+ *
+ * The range of inputs that can be referenced without temporary can be enlarged using the last two template parameters.
+ * Here is an example accepting an innerstride!=1:
+ * \code
+ * // in-out argument:
+ * void foo3(Ref<VectorXf,0,InnerStride<> > x);
+ * foo3(A.row());              // OK
+ * \endcode
+ * The downside here is that the function foo3 might be significantly slower than foo1 because it won't be able to
+ * exploit vectorization, and will involve more expensive address computations even if the input is contiguously stored
+ * in memory. To overcome this issue, one might propose to overload internally calling a template function, e.g.: \code
+ * // in the .h:
+ * void foo(const Ref<MatrixXf>& A);
+ * void foo(const Ref<MatrixXf,0,Stride<> >& A);
+ *
+ * // in the .cpp:
+ * template<typename TypeOfA> void foo_impl(const TypeOfA& A) {
+ *     ... // crazy code goes here
+ * }
+ * void foo(const Ref<MatrixXf>& A) { foo_impl(A); }
+ * void foo(const Ref<MatrixXf,0,Stride<> >& A) { foo_impl(A); }
+ * \endcode
+ *
+ * See also the following stackoverflow questions for further references:
+ *  - <a href="http://stackoverflow.com/questions/21132538/correct-usage-of-the-eigenref-class">Correct usage of the
+ * Eigen::Ref<> class</a>
+ *
+ * \sa PlainObjectBase::Map(), \ref TopicStorageOrders
+ */
+template <typename PlainObjectType, int Options, typename StrideType>
+class Ref : public RefBase<Ref<PlainObjectType, Options, StrideType> > {
+ private:
+  typedef internal::traits<Ref> Traits;
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline Ref(
+      const PlainObjectBase<Derived>& expr,
+      std::enable_if_t<bool(Traits::template match<Derived>::MatchAtCompileTime), Derived>* = 0);
+
+ public:
+  typedef RefBase<Ref> Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(Ref)
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline Ref(
+      PlainObjectBase<Derived>& expr,
+      std::enable_if_t<bool(Traits::template match<Derived>::MatchAtCompileTime), Derived>* = 0) {
+    EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+    // Construction must pass since we will not create temporary storage in the non-const case.
+    const bool success = Base::construct(expr.derived());
+    EIGEN_UNUSED_VARIABLE(success)
+    eigen_assert(success);
+  }
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline Ref(
+      const DenseBase<Derived>& expr,
+      std::enable_if_t<bool(Traits::template match<Derived>::MatchAtCompileTime), Derived>* = 0)
+#else
+  /** Implicit constructor from any dense expression */
+  template <typename Derived>
+  inline Ref(DenseBase<Derived>& expr)
+#endif
+  {
+    EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+    EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+    EIGEN_STATIC_ASSERT(!Derived::IsPlainObjectBase, THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+    // Construction must pass since we will not create temporary storage in the non-const case.
+    const bool success = Base::construct(expr.const_cast_derived());
+    EIGEN_UNUSED_VARIABLE(success)
+    eigen_assert(success);
+  }
 
-template<typename PlainObjectType, int Options, typename StrideType> class Ref
-  : public RefBase<Ref<PlainObjectType, Options, StrideType> >
-{
-  private:
-    typedef internal::traits<Ref> Traits;
-    template<typename Derived>
-    inline Ref(const PlainObjectBase<Derived>& expr,
-               typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0);
-  public:
-
-    typedef RefBase<Ref> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(Ref)
-
-
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename Derived>
-    inline Ref(PlainObjectBase<Derived>& expr,
-               typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
-    {
-      EIGEN_STATIC_ASSERT(static_cast<bool>(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
-      Base::construct(expr.derived());
-    }
-    template<typename Derived>
-    inline Ref(const DenseBase<Derived>& expr,
-               typename internal::enable_if<bool(Traits::template match<Derived>::MatchAtCompileTime),Derived>::type* = 0)
-    #else
-    template<typename Derived>
-    inline Ref(DenseBase<Derived>& expr)
-    #endif
-    {
-      EIGEN_STATIC_ASSERT(static_cast<bool>(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
-      EIGEN_STATIC_ASSERT(static_cast<bool>(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
-      enum { THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY = Derived::ThisConstantIsPrivateInPlainObjectBase};
-      Base::construct(expr.const_cast_derived());
-    }
-
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Ref)
-
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Ref)
 };
 
 // this is the const ref version
-template<typename TPlainObjectType, int Options, typename StrideType> class Ref<const TPlainObjectType, Options, StrideType>
-  : public RefBase<Ref<const TPlainObjectType, Options, StrideType> >
-{
-    typedef internal::traits<Ref> Traits;
-  public:
-
-    typedef RefBase<Ref> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(Ref)
-
-    template<typename Derived>
-    inline Ref(const DenseBase<Derived>& expr,
-               typename internal::enable_if<bool(Traits::template match<Derived>::ScalarTypeMatch),Derived>::type* = 0)
-    {
-//      std::cout << match_helper<Derived>::HasDirectAccess << "," << match_helper<Derived>::OuterStrideMatch << "," << match_helper<Derived>::InnerStrideMatch << "\n";
-//      std::cout << int(StrideType::OuterStrideAtCompileTime) << " - " << int(Derived::OuterStrideAtCompileTime) << "\n";
-//      std::cout << int(StrideType::InnerStrideAtCompileTime) << " - " << int(Derived::InnerStrideAtCompileTime) << "\n";
-      construct(expr.derived(), typename Traits::template match<Derived>::type());
-    }
-    
-    inline Ref(const Ref& other) : Base(other) {
-      // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
-    }
+template <typename TPlainObjectType, int Options, typename StrideType>
+class Ref<const TPlainObjectType, Options, StrideType>
+    : public RefBase<Ref<const TPlainObjectType, Options, StrideType> > {
+  typedef internal::traits<Ref> Traits;
+
+  static constexpr bool may_map_m_object_successfully =
+      (static_cast<int>(StrideType::InnerStrideAtCompileTime) == 0 ||
+       static_cast<int>(StrideType::InnerStrideAtCompileTime) == 1 ||
+       static_cast<int>(StrideType::InnerStrideAtCompileTime) == Dynamic) &&
+      (TPlainObjectType::IsVectorAtCompileTime || static_cast<int>(StrideType::OuterStrideAtCompileTime) == 0 ||
+       static_cast<int>(StrideType::OuterStrideAtCompileTime) == Dynamic ||
+       static_cast<int>(StrideType::OuterStrideAtCompileTime) ==
+           static_cast<int>(TPlainObjectType::InnerSizeAtCompileTime) ||
+       static_cast<int>(TPlainObjectType::InnerSizeAtCompileTime) == Dynamic);
+
+ public:
+  typedef RefBase<Ref> Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(Ref)
+
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline Ref(const DenseBase<Derived>& expr,
+                               std::enable_if_t<bool(Traits::template match<Derived>::ScalarTypeMatch), Derived>* = 0) {
+    //      std::cout << match_helper<Derived>::HasDirectAccess << "," << match_helper<Derived>::OuterStrideMatch << ","
+    //      << match_helper<Derived>::InnerStrideMatch << "\n"; std::cout << int(StrideType::OuterStrideAtCompileTime)
+    //      << " - " << int(Derived::OuterStrideAtCompileTime) << "\n"; std::cout <<
+    //      int(StrideType::InnerStrideAtCompileTime) << " - " << int(Derived::InnerStrideAtCompileTime) << "\n";
+    EIGEN_STATIC_ASSERT(Traits::template match<Derived>::type::value || may_map_m_object_successfully,
+                        STORAGE_LAYOUT_DOES_NOT_MATCH);
+    construct(expr.derived(), typename Traits::template match<Derived>::type());
+  }
 
-    template<typename OtherRef>
-    inline Ref(const RefBase<OtherRef>& other) {
-      construct(other.derived(), typename Traits::template match<OtherRef>::type());
-    }
+  EIGEN_DEVICE_FUNC inline Ref(const Ref& other) : Base(other) {
+    // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
+  }
 
-  protected:
+  EIGEN_DEVICE_FUNC inline Ref(Ref&& other) {
+    if (other.data() == other.m_object.data()) {
+      m_object = std::move(other.m_object);
+      Base::construct(m_object);
+    } else
+      Base::construct(other);
+  }
 
-    template<typename Expression>
-    void construct(const Expression& expr,internal::true_type)
-    {
-      Base::construct(expr);
-    }
+  template <typename OtherRef>
+  EIGEN_DEVICE_FUNC inline Ref(const RefBase<OtherRef>& other) {
+    EIGEN_STATIC_ASSERT(Traits::template match<OtherRef>::type::value || may_map_m_object_successfully,
+                        STORAGE_LAYOUT_DOES_NOT_MATCH);
+    construct(other.derived(), typename Traits::template match<OtherRef>::type());
+  }
 
-    template<typename Expression>
-    void construct(const Expression& expr, internal::false_type)
-    {
-      m_object.lazyAssign(expr);
-      Base::construct(m_object);
+ protected:
+  template <typename Expression>
+  EIGEN_DEVICE_FUNC void construct(const Expression& expr, internal::true_type) {
+    // Check if we can use the underlying expr's storage directly, otherwise call the copy version.
+    if (!Base::construct(expr)) {
+      construct(expr, internal::false_type());
     }
+  }
+
+  template <typename Expression>
+  EIGEN_DEVICE_FUNC void construct(const Expression& expr, internal::false_type) {
+    internal::call_assignment_no_alias(m_object, expr, internal::assign_op<Scalar, Scalar>());
+    const bool success = Base::construct(m_object);
+    EIGEN_ONLY_USED_FOR_DEBUG(success)
+    eigen_assert(success);
+  }
 
-  protected:
-    TPlainObjectType m_object;
+ protected:
+  TPlainObjectType m_object;
 };
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_REF_H
+#endif  // EIGEN_REF_H
diff --git a/inst/include/Eigen/src/Core/Replicate.h b/inst/include/Eigen/src/Core/Replicate.h
index ac4537c1..34150452 100644
--- a/inst/include/Eigen/src/Core/Replicate.h
+++ b/inst/include/Eigen/src/Core/Replicate.h
@@ -10,168 +10,121 @@
 #ifndef EIGEN_REPLICATE_H
 #define EIGEN_REPLICATE_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-/**
-  * \class Replicate
-  * \ingroup Core_Module
-  *
-  * \brief Expression of the multiple replication of a matrix or vector
-  *
-  * \param MatrixType the type of the object we are replicating
-  *
-  * This class represents an expression of the multiple replication of a matrix or vector.
-  * It is the return type of DenseBase::replicate() and most of the time
-  * this is the only way it is used.
-  *
-  * \sa DenseBase::replicate()
-  */
+namespace Eigen {
 
 namespace internal {
-template<typename MatrixType,int RowFactor,int ColFactor>
-struct traits<Replicate<MatrixType,RowFactor,ColFactor> >
- : traits<MatrixType>
-{
+template <typename MatrixType, int RowFactor, int ColFactor>
+struct traits<Replicate<MatrixType, RowFactor, ColFactor> > : traits<MatrixType> {
   typedef typename MatrixType::Scalar Scalar;
   typedef typename traits<MatrixType>::StorageKind StorageKind;
   typedef typename traits<MatrixType>::XprKind XprKind;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
+  typedef std::remove_reference_t<MatrixTypeNested> MatrixTypeNested_;
   enum {
-    Factor = (RowFactor==Dynamic || ColFactor==Dynamic) ? Dynamic : RowFactor*ColFactor
-  };
-  typedef typename nested<MatrixType,Factor>::type MatrixTypeNested;
-  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
-  enum {
-    RowsAtCompileTime = RowFactor==Dynamic || int(MatrixType::RowsAtCompileTime)==Dynamic
-                      ? Dynamic
-                      : RowFactor * MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = ColFactor==Dynamic || int(MatrixType::ColsAtCompileTime)==Dynamic
-                      ? Dynamic
-                      : ColFactor * MatrixType::ColsAtCompileTime,
-   //FIXME we don't propagate the max sizes !!!
+    RowsAtCompileTime = RowFactor == Dynamic || int(MatrixType::RowsAtCompileTime) == Dynamic
+                            ? Dynamic
+                            : RowFactor * MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = ColFactor == Dynamic || int(MatrixType::ColsAtCompileTime) == Dynamic
+                            ? Dynamic
+                            : ColFactor * MatrixType::ColsAtCompileTime,
+    // FIXME we don't propagate the max sizes !!!
     MaxRowsAtCompileTime = RowsAtCompileTime,
     MaxColsAtCompileTime = ColsAtCompileTime,
-    IsRowMajor = MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1 ? 1
-               : MaxColsAtCompileTime==1 && MaxRowsAtCompileTime!=1 ? 0
-               : (MatrixType::Flags & RowMajorBit) ? 1 : 0,
-    Flags = (_MatrixTypeNested::Flags & HereditaryBits & ~RowMajorBit) | (IsRowMajor ? RowMajorBit : 0),
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
-  };
-};
-}
+    IsRowMajor = MaxRowsAtCompileTime == 1 && MaxColsAtCompileTime != 1   ? 1
+                 : MaxColsAtCompileTime == 1 && MaxRowsAtCompileTime != 1 ? 0
+                 : (MatrixType::Flags & RowMajorBit)                      ? 1
+                                                                          : 0,
 
-template<typename MatrixType,int RowFactor,int ColFactor> class Replicate
-  : public internal::dense_xpr_base< Replicate<MatrixType,RowFactor,ColFactor> >::type
-{
-    typedef typename internal::traits<Replicate>::MatrixTypeNested MatrixTypeNested;
-    typedef typename internal::traits<Replicate>::_MatrixTypeNested _MatrixTypeNested;
-  public:
-
-    typedef typename internal::dense_xpr_base<Replicate>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(Replicate)
-
-    template<typename OriginalMatrixType>
-    inline explicit Replicate(const OriginalMatrixType& a_matrix)
-      : m_matrix(a_matrix), m_rowFactor(RowFactor), m_colFactor(ColFactor)
-    {
-      EIGEN_STATIC_ASSERT((internal::is_same<typename internal::remove_const<MatrixType>::type,OriginalMatrixType>::value),
-                          THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
-      eigen_assert(RowFactor!=Dynamic && ColFactor!=Dynamic);
-    }
-
-    template<typename OriginalMatrixType>
-    inline Replicate(const OriginalMatrixType& a_matrix, Index rowFactor, Index colFactor)
-      : m_matrix(a_matrix), m_rowFactor(rowFactor), m_colFactor(colFactor)
-    {
-      EIGEN_STATIC_ASSERT((internal::is_same<typename internal::remove_const<MatrixType>::type,OriginalMatrixType>::value),
-                          THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
-    }
-
-    inline Index rows() const { return m_matrix.rows() * m_rowFactor.value(); }
-    inline Index cols() const { return m_matrix.cols() * m_colFactor.value(); }
-
-    inline Scalar coeff(Index rowId, Index colId) const
-    {
-      // try to avoid using modulo; this is a pure optimization strategy
-      const Index actual_row  = internal::traits<MatrixType>::RowsAtCompileTime==1 ? 0
-                            : RowFactor==1 ? rowId
-                            : rowId%m_matrix.rows();
-      const Index actual_col  = internal::traits<MatrixType>::ColsAtCompileTime==1 ? 0
-                            : ColFactor==1 ? colId
-                            : colId%m_matrix.cols();
-
-      return m_matrix.coeff(actual_row, actual_col);
-    }
-    template<int LoadMode>
-    inline PacketScalar packet(Index rowId, Index colId) const
-    {
-      const Index actual_row  = internal::traits<MatrixType>::RowsAtCompileTime==1 ? 0
-                            : RowFactor==1 ? rowId
-                            : rowId%m_matrix.rows();
-      const Index actual_col  = internal::traits<MatrixType>::ColsAtCompileTime==1 ? 0
-                            : ColFactor==1 ? colId
-                            : colId%m_matrix.cols();
-
-      return m_matrix.template packet<LoadMode>(actual_row, actual_col);
-    }
-
-    const _MatrixTypeNested& nestedExpression() const
-    { 
-      return m_matrix; 
-    }
-
-  protected:
-    MatrixTypeNested m_matrix;
-    const internal::variable_if_dynamic<Index, RowFactor> m_rowFactor;
-    const internal::variable_if_dynamic<Index, ColFactor> m_colFactor;
+    // FIXME enable DirectAccess with negative strides?
+    Flags = IsRowMajor ? RowMajorBit : 0
+  };
 };
+}  // namespace internal
 
 /**
-  * \return an expression of the replication of \c *this
-  *
-  * Example: \include MatrixBase_replicate.cpp
-  * Output: \verbinclude MatrixBase_replicate.out
-  *
-  * \sa VectorwiseOp::replicate(), DenseBase::replicate(Index,Index), class Replicate
-  */
-template<typename Derived>
-template<int RowFactor, int ColFactor>
-const Replicate<Derived,RowFactor,ColFactor>
-DenseBase<Derived>::replicate() const
-{
-  return Replicate<Derived,RowFactor,ColFactor>(derived());
-}
+ * \class Replicate
+ * \ingroup Core_Module
+ *
+ * \brief Expression of the multiple replication of a matrix or vector
+ *
+ * \tparam MatrixType the type of the object we are replicating
+ * \tparam RowFactor number of repetitions at compile time along the vertical direction, can be Dynamic.
+ * \tparam ColFactor number of repetitions at compile time along the horizontal direction, can be Dynamic.
+ *
+ * This class represents an expression of the multiple replication of a matrix or vector.
+ * It is the return type of DenseBase::replicate() and most of the time
+ * this is the only way it is used.
+ *
+ * \sa DenseBase::replicate()
+ */
+template <typename MatrixType, int RowFactor, int ColFactor>
+class Replicate : public internal::dense_xpr_base<Replicate<MatrixType, RowFactor, ColFactor> >::type {
+  typedef typename internal::traits<Replicate>::MatrixTypeNested MatrixTypeNested;
+  typedef typename internal::traits<Replicate>::MatrixTypeNested_ MatrixTypeNested_;
+
+ public:
+  typedef typename internal::dense_xpr_base<Replicate>::type Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(Replicate)
+  typedef internal::remove_all_t<MatrixType> NestedExpression;
+
+  template <typename OriginalMatrixType>
+  EIGEN_DEVICE_FUNC inline explicit Replicate(const OriginalMatrixType& matrix)
+      : m_matrix(matrix), m_rowFactor(RowFactor), m_colFactor(ColFactor) {
+    EIGEN_STATIC_ASSERT((internal::is_same<std::remove_const_t<MatrixType>, OriginalMatrixType>::value),
+                        THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
+    eigen_assert(RowFactor != Dynamic && ColFactor != Dynamic);
+  }
+
+  template <typename OriginalMatrixType>
+  EIGEN_DEVICE_FUNC inline Replicate(const OriginalMatrixType& matrix, Index rowFactor, Index colFactor)
+      : m_matrix(matrix), m_rowFactor(rowFactor), m_colFactor(colFactor) {
+    EIGEN_STATIC_ASSERT((internal::is_same<std::remove_const_t<MatrixType>, OriginalMatrixType>::value),
+                        THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE)
+  }
+
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return m_matrix.rows() * m_rowFactor.value(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return m_matrix.cols() * m_colFactor.value(); }
+
+  EIGEN_DEVICE_FUNC const MatrixTypeNested_& nestedExpression() const { return m_matrix; }
+
+ protected:
+  MatrixTypeNested m_matrix;
+  const internal::variable_if_dynamic<Index, RowFactor> m_rowFactor;
+  const internal::variable_if_dynamic<Index, ColFactor> m_colFactor;
+};
 
 /**
-  * \return an expression of the replication of \c *this
-  *
-  * Example: \include MatrixBase_replicate_int_int.cpp
-  * Output: \verbinclude MatrixBase_replicate_int_int.out
-  *
-  * \sa VectorwiseOp::replicate(), DenseBase::replicate<int,int>(), class Replicate
-  */
-template<typename Derived>
-const typename DenseBase<Derived>::ReplicateReturnType
-DenseBase<Derived>::replicate(Index rowFactor,Index colFactor) const
-{
-  return Replicate<Derived,Dynamic,Dynamic>(derived(),rowFactor,colFactor);
+ * \return an expression of the replication of \c *this
+ *
+ * Example: \include MatrixBase_replicate.cpp
+ * Output: \verbinclude MatrixBase_replicate.out
+ *
+ * \sa VectorwiseOp::replicate(), DenseBase::replicate(Index,Index), class Replicate
+ */
+template <typename Derived>
+template <int RowFactor, int ColFactor>
+EIGEN_DEVICE_FUNC const Replicate<Derived, RowFactor, ColFactor> DenseBase<Derived>::replicate() const {
+  return Replicate<Derived, RowFactor, ColFactor>(derived());
 }
 
 /**
-  * \return an expression of the replication of each column (or row) of \c *this
-  *
-  * Example: \include DirectionWise_replicate_int.cpp
-  * Output: \verbinclude DirectionWise_replicate_int.out
-  *
-  * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate
-  */
-template<typename ExpressionType, int Direction>
-const typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
-VectorwiseOp<ExpressionType,Direction>::replicate(Index factor) const
-{
-  return typename VectorwiseOp<ExpressionType,Direction>::ReplicateReturnType
-          (_expression(),Direction==Vertical?factor:1,Direction==Horizontal?factor:1);
+ * \return an expression of the replication of each column (or row) of \c *this
+ *
+ * Example: \include DirectionWise_replicate_int.cpp
+ * Output: \verbinclude DirectionWise_replicate_int.out
+ *
+ * \sa VectorwiseOp::replicate(), DenseBase::replicate(), class Replicate
+ */
+template <typename ExpressionType, int Direction>
+EIGEN_DEVICE_FUNC const typename VectorwiseOp<ExpressionType, Direction>::ReplicateReturnType
+VectorwiseOp<ExpressionType, Direction>::replicate(Index factor) const {
+  return typename VectorwiseOp<ExpressionType, Direction>::ReplicateReturnType(
+      _expression(), Direction == Vertical ? factor : 1, Direction == Horizontal ? factor : 1);
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_REPLICATE_H
+#endif  // EIGEN_REPLICATE_H
diff --git a/inst/include/Eigen/src/Core/Reshaped.h b/inst/include/Eigen/src/Core/Reshaped.h
new file mode 100644
index 00000000..22acdc0b
--- /dev/null
+++ b/inst/include/Eigen/src/Core/Reshaped.h
@@ -0,0 +1,398 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2014 yoco <peter.xiau@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_RESHAPED_H
+#define EIGEN_RESHAPED_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \class Reshaped
+ * \ingroup Core_Module
+ *
+ * \brief Expression of a fixed-size or dynamic-size reshape
+ *
+ * \tparam XprType the type of the expression in which we are taking a reshape
+ * \tparam Rows the number of rows of the reshape we are taking at compile time (optional)
+ * \tparam Cols the number of columns of the reshape we are taking at compile time (optional)
+ * \tparam Order can be ColMajor or RowMajor, default is ColMajor.
+ *
+ * This class represents an expression of either a fixed-size or dynamic-size reshape.
+ * It is the return type of DenseBase::reshaped(NRowsType,NColsType) and
+ * most of the time this is the only way it is used.
+ *
+ * If you want to directly manipulate reshaped expressions,
+ * for instance if you want to write a function returning such an expression,
+ * it is advised to use the \em auto keyword for such use cases.
+ *
+ * Here is an example illustrating the dynamic case:
+ * \include class_Reshaped.cpp
+ * Output: \verbinclude class_Reshaped.out
+ *
+ * Here is an example illustrating the fixed-size case:
+ * \include class_FixedReshaped.cpp
+ * Output: \verbinclude class_FixedReshaped.out
+ *
+ * \sa DenseBase::reshaped(NRowsType,NColsType)
+ */
+
+namespace internal {
+
+template <typename XprType, int Rows, int Cols, int Order>
+struct traits<Reshaped<XprType, Rows, Cols, Order> > : traits<XprType> {
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef typename traits<XprType>::StorageKind StorageKind;
+  typedef typename traits<XprType>::XprKind XprKind;
+  enum {
+    MatrixRows = traits<XprType>::RowsAtCompileTime,
+    MatrixCols = traits<XprType>::ColsAtCompileTime,
+    RowsAtCompileTime = Rows,
+    ColsAtCompileTime = Cols,
+    MaxRowsAtCompileTime = Rows,
+    MaxColsAtCompileTime = Cols,
+    XpxStorageOrder = ((int(traits<XprType>::Flags) & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor,
+    ReshapedStorageOrder = (RowsAtCompileTime == 1 && ColsAtCompileTime != 1)   ? RowMajor
+                           : (ColsAtCompileTime == 1 && RowsAtCompileTime != 1) ? ColMajor
+                                                                                : XpxStorageOrder,
+    HasSameStorageOrderAsXprType = (ReshapedStorageOrder == XpxStorageOrder),
+    InnerSize = (ReshapedStorageOrder == int(RowMajor)) ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
+    InnerStrideAtCompileTime = HasSameStorageOrderAsXprType ? int(inner_stride_at_compile_time<XprType>::ret) : Dynamic,
+    OuterStrideAtCompileTime = Dynamic,
+
+    HasDirectAccess = internal::has_direct_access<XprType>::ret && (Order == int(XpxStorageOrder)) &&
+                      ((evaluator<XprType>::Flags & LinearAccessBit) == LinearAccessBit),
+
+    MaskPacketAccessBit =
+        (InnerSize == Dynamic || (InnerSize % packet_traits<Scalar>::size) == 0) && (InnerStrideAtCompileTime == 1)
+            ? PacketAccessBit
+            : 0,
+    // MaskAlignedBit = ((OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % 16)
+    // == 0)) ? AlignedBit : 0,
+    FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1) ? LinearAccessBit : 0,
+    FlagsLvalueBit = is_lvalue<XprType>::value ? LvalueBit : 0,
+    FlagsRowMajorBit = (ReshapedStorageOrder == int(RowMajor)) ? RowMajorBit : 0,
+    FlagsDirectAccessBit = HasDirectAccess ? DirectAccessBit : 0,
+    Flags0 = traits<XprType>::Flags & ((HereditaryBits & ~RowMajorBit) | MaskPacketAccessBit),
+
+    Flags = (Flags0 | FlagsLinearAccessBit | FlagsLvalueBit | FlagsRowMajorBit | FlagsDirectAccessBit)
+  };
+};
+
+template <typename XprType, int Rows, int Cols, int Order, bool HasDirectAccess>
+class ReshapedImpl_dense;
+
+}  // end namespace internal
+
+template <typename XprType, int Rows, int Cols, int Order, typename StorageKind>
+class ReshapedImpl;
+
+template <typename XprType, int Rows, int Cols, int Order>
+class Reshaped : public ReshapedImpl<XprType, Rows, Cols, Order, typename internal::traits<XprType>::StorageKind> {
+  typedef ReshapedImpl<XprType, Rows, Cols, Order, typename internal::traits<XprType>::StorageKind> Impl;
+
+ public:
+  // typedef typename Impl::Base Base;
+  typedef Impl Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(Reshaped)
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reshaped)
+
+  /** Fixed-size constructor
+   */
+  EIGEN_DEVICE_FUNC inline Reshaped(XprType& xpr) : Impl(xpr) {
+    EIGEN_STATIC_ASSERT(RowsAtCompileTime != Dynamic && ColsAtCompileTime != Dynamic,
+                        THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE)
+    eigen_assert(Rows * Cols == xpr.rows() * xpr.cols());
+  }
+
+  /** Dynamic-size constructor
+   */
+  EIGEN_DEVICE_FUNC inline Reshaped(XprType& xpr, Index reshapeRows, Index reshapeCols)
+      : Impl(xpr, reshapeRows, reshapeCols) {
+    eigen_assert((RowsAtCompileTime == Dynamic || RowsAtCompileTime == reshapeRows) &&
+                 (ColsAtCompileTime == Dynamic || ColsAtCompileTime == reshapeCols));
+    eigen_assert(reshapeRows * reshapeCols == xpr.rows() * xpr.cols());
+  }
+};
+
+// The generic default implementation for dense reshape simply forward to the internal::ReshapedImpl_dense
+// that must be specialized for direct and non-direct access...
+template <typename XprType, int Rows, int Cols, int Order>
+class ReshapedImpl<XprType, Rows, Cols, Order, Dense>
+    : public internal::ReshapedImpl_dense<XprType, Rows, Cols, Order,
+                                          internal::traits<Reshaped<XprType, Rows, Cols, Order> >::HasDirectAccess> {
+  typedef internal::ReshapedImpl_dense<XprType, Rows, Cols, Order,
+                                       internal::traits<Reshaped<XprType, Rows, Cols, Order> >::HasDirectAccess>
+      Impl;
+
+ public:
+  typedef Impl Base;
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl)
+  EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr) : Impl(xpr) {}
+  EIGEN_DEVICE_FUNC inline ReshapedImpl(XprType& xpr, Index reshapeRows, Index reshapeCols)
+      : Impl(xpr, reshapeRows, reshapeCols) {}
+};
+
+namespace internal {
+
+/** \internal Internal implementation of dense Reshaped in the general case. */
+template <typename XprType, int Rows, int Cols, int Order>
+class ReshapedImpl_dense<XprType, Rows, Cols, Order, false>
+    : public internal::dense_xpr_base<Reshaped<XprType, Rows, Cols, Order> >::type {
+  typedef Reshaped<XprType, Rows, Cols, Order> ReshapedType;
+
+ public:
+  typedef typename internal::dense_xpr_base<ReshapedType>::type Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType)
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense)
+
+  typedef typename internal::ref_selector<XprType>::non_const_type MatrixTypeNested;
+  typedef internal::remove_all_t<XprType> NestedExpression;
+
+  class InnerIterator;
+
+  /** Fixed-size constructor
+   */
+  EIGEN_DEVICE_FUNC inline ReshapedImpl_dense(XprType& xpr) : m_xpr(xpr), m_rows(Rows), m_cols(Cols) {}
+
+  /** Dynamic-size constructor
+   */
+  EIGEN_DEVICE_FUNC inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols)
+      : m_xpr(xpr), m_rows(nRows), m_cols(nCols) {}
+
+  EIGEN_DEVICE_FUNC Index rows() const { return m_rows; }
+  EIGEN_DEVICE_FUNC Index cols() const { return m_cols; }
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  /** \sa MapBase::data() */
+  EIGEN_DEVICE_FUNC constexpr const Scalar* data() const;
+  EIGEN_DEVICE_FUNC inline Index innerStride() const;
+  EIGEN_DEVICE_FUNC inline Index outerStride() const;
+#endif
+
+  /** \returns the nested expression */
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<XprType>& nestedExpression() const { return m_xpr; }
+
+  /** \returns the nested expression */
+  EIGEN_DEVICE_FUNC std::remove_reference_t<XprType>& nestedExpression() { return m_xpr; }
+
+ protected:
+  MatrixTypeNested m_xpr;
+  const internal::variable_if_dynamic<Index, Rows> m_rows;
+  const internal::variable_if_dynamic<Index, Cols> m_cols;
+};
+
+/** \internal Internal implementation of dense Reshaped in the direct access case. */
+template <typename XprType, int Rows, int Cols, int Order>
+class ReshapedImpl_dense<XprType, Rows, Cols, Order, true> : public MapBase<Reshaped<XprType, Rows, Cols, Order> > {
+  typedef Reshaped<XprType, Rows, Cols, Order> ReshapedType;
+  typedef typename internal::ref_selector<XprType>::non_const_type XprTypeNested;
+
+ public:
+  typedef MapBase<ReshapedType> Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(ReshapedType)
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(ReshapedImpl_dense)
+
+  /** Fixed-size constructor
+   */
+  EIGEN_DEVICE_FUNC inline ReshapedImpl_dense(XprType& xpr) : Base(xpr.data()), m_xpr(xpr) {}
+
+  /** Dynamic-size constructor
+   */
+  EIGEN_DEVICE_FUNC inline ReshapedImpl_dense(XprType& xpr, Index nRows, Index nCols)
+      : Base(xpr.data(), nRows, nCols), m_xpr(xpr) {}
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<XprTypeNested>& nestedExpression() const { return m_xpr; }
+
+  EIGEN_DEVICE_FUNC XprType& nestedExpression() { return m_xpr; }
+
+  /** \sa MapBase::innerStride() */
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const { return m_xpr.innerStride(); }
+
+  /** \sa MapBase::outerStride() */
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const {
+    return (((Flags & RowMajorBit) == RowMajorBit) ? this->cols() : this->rows()) * m_xpr.innerStride();
+  }
+
+ protected:
+  XprTypeNested m_xpr;
+};
+
+// Evaluators
+template <typename ArgType, int Rows, int Cols, int Order, bool HasDirectAccess>
+struct reshaped_evaluator;
+
+template <typename ArgType, int Rows, int Cols, int Order>
+struct evaluator<Reshaped<ArgType, Rows, Cols, Order> >
+    : reshaped_evaluator<ArgType, Rows, Cols, Order, traits<Reshaped<ArgType, Rows, Cols, Order> >::HasDirectAccess> {
+  typedef Reshaped<ArgType, Rows, Cols, Order> XprType;
+  typedef typename XprType::Scalar Scalar;
+  // TODO: should check for smaller packet types
+  typedef typename packet_traits<Scalar>::type PacketScalar;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    HasDirectAccess = traits<XprType>::HasDirectAccess,
+
+    //     RowsAtCompileTime = traits<XprType>::RowsAtCompileTime,
+    //     ColsAtCompileTime = traits<XprType>::ColsAtCompileTime,
+    //     MaxRowsAtCompileTime = traits<XprType>::MaxRowsAtCompileTime,
+    //     MaxColsAtCompileTime = traits<XprType>::MaxColsAtCompileTime,
+    //
+    //     InnerStrideAtCompileTime = traits<XprType>::HasSameStorageOrderAsXprType
+    //                              ? int(inner_stride_at_compile_time<ArgType>::ret)
+    //                              : Dynamic,
+    //     OuterStrideAtCompileTime = Dynamic,
+
+    FlagsLinearAccessBit =
+        (traits<XprType>::RowsAtCompileTime == 1 || traits<XprType>::ColsAtCompileTime == 1 || HasDirectAccess)
+            ? LinearAccessBit
+            : 0,
+    FlagsRowMajorBit = (traits<XprType>::ReshapedStorageOrder == int(RowMajor)) ? RowMajorBit : 0,
+    FlagsDirectAccessBit = HasDirectAccess ? DirectAccessBit : 0,
+    Flags0 = evaluator<ArgType>::Flags & (HereditaryBits & ~RowMajorBit),
+    Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit | FlagsDirectAccessBit,
+
+    PacketAlignment = unpacket_traits<PacketScalar>::alignment,
+    Alignment = evaluator<ArgType>::Alignment
+  };
+  typedef reshaped_evaluator<ArgType, Rows, Cols, Order, HasDirectAccess> reshaped_evaluator_type;
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : reshaped_evaluator_type(xpr) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+};
+
+template <typename ArgType, int Rows, int Cols, int Order>
+struct reshaped_evaluator<ArgType, Rows, Cols, Order, /* HasDirectAccess */ false>
+    : evaluator_base<Reshaped<ArgType, Rows, Cols, Order> > {
+  typedef Reshaped<ArgType, Rows, Cols, Order> XprType;
+
+  enum {
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost /* TODO + cost of index computations */,
+
+    Flags = (evaluator<ArgType>::Flags & (HereditaryBits /*| LinearAccessBit | DirectAccessBit*/)),
+
+    Alignment = 0
+  };
+
+  EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_xpr(xpr) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  typedef std::pair<Index, Index> RowCol;
+
+  EIGEN_DEVICE_FUNC inline RowCol index_remap(Index rowId, Index colId) const {
+    if (Order == ColMajor) {
+      const Index nth_elem_idx = colId * m_xpr.rows() + rowId;
+      return RowCol(nth_elem_idx % m_xpr.nestedExpression().rows(), nth_elem_idx / m_xpr.nestedExpression().rows());
+    } else {
+      const Index nth_elem_idx = colId + rowId * m_xpr.cols();
+      return RowCol(nth_elem_idx / m_xpr.nestedExpression().cols(), nth_elem_idx % m_xpr.nestedExpression().cols());
+    }
+  }
+
+  EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index rowId, Index colId) {
+    EIGEN_STATIC_ASSERT_LVALUE(XprType)
+    const RowCol row_col = index_remap(rowId, colId);
+    return m_argImpl.coeffRef(row_col.first, row_col.second);
+  }
+
+  EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index rowId, Index colId) const {
+    const RowCol row_col = index_remap(rowId, colId);
+    return m_argImpl.coeffRef(row_col.first, row_col.second);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CoeffReturnType coeff(Index rowId, Index colId) const {
+    const RowCol row_col = index_remap(rowId, colId);
+    return m_argImpl.coeff(row_col.first, row_col.second);
+  }
+
+  EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index index) {
+    EIGEN_STATIC_ASSERT_LVALUE(XprType)
+    const RowCol row_col = index_remap(Rows == 1 ? 0 : index, Rows == 1 ? index : 0);
+    return m_argImpl.coeffRef(row_col.first, row_col.second);
+  }
+
+  EIGEN_DEVICE_FUNC inline const Scalar& coeffRef(Index index) const {
+    const RowCol row_col = index_remap(Rows == 1 ? 0 : index, Rows == 1 ? index : 0);
+    return m_argImpl.coeffRef(row_col.first, row_col.second);
+  }
+
+  EIGEN_DEVICE_FUNC inline const CoeffReturnType coeff(Index index) const {
+    const RowCol row_col = index_remap(Rows == 1 ? 0 : index, Rows == 1 ? index : 0);
+    return m_argImpl.coeff(row_col.first, row_col.second);
+  }
+#if 0
+  EIGEN_DEVICE_FUNC
+  template<int LoadMode>
+  inline PacketScalar packet(Index rowId, Index colId) const
+  {
+    const RowCol row_col = index_remap(rowId, colId);
+    return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second);
+
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC
+  inline void writePacket(Index rowId, Index colId, const PacketScalar& val)
+  {
+    const RowCol row_col = index_remap(rowId, colId);
+    m_argImpl.const_cast_derived().template writePacket<Unaligned>
+            (row_col.first, row_col.second, val);
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC
+  inline PacketScalar packet(Index index) const
+  {
+    const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index,
+                                        RowsAtCompileTime == 1 ? index : 0);
+    return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second);
+  }
+
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC
+  inline void writePacket(Index index, const PacketScalar& val)
+  {
+    const RowCol row_col = index_remap(RowsAtCompileTime == 1 ? 0 : index,
+                                        RowsAtCompileTime == 1 ? index : 0);
+    return m_argImpl.template packet<Unaligned>(row_col.first, row_col.second, val);
+  }
+#endif
+ protected:
+  evaluator<ArgType> m_argImpl;
+  const XprType& m_xpr;
+};
+
+template <typename ArgType, int Rows, int Cols, int Order>
+struct reshaped_evaluator<ArgType, Rows, Cols, Order, /* HasDirectAccess */ true>
+    : mapbase_evaluator<Reshaped<ArgType, Rows, Cols, Order>,
+                        typename Reshaped<ArgType, Rows, Cols, Order>::PlainObject> {
+  typedef Reshaped<ArgType, Rows, Cols, Order> XprType;
+  typedef typename XprType::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC explicit reshaped_evaluator(const XprType& xpr)
+      : mapbase_evaluator<XprType, typename XprType::PlainObject>(xpr) {
+    // TODO: for the 3.4 release, this should be turned to an internal assertion, but let's keep it as is for the beta
+    // lifetime
+    eigen_assert(((std::uintptr_t(xpr.data()) % plain_enum_max(1, evaluator<XprType>::Alignment)) == 0) &&
+                 "data is not aligned");
+  }
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_RESHAPED_H
diff --git a/inst/include/Eigen/src/Core/ReturnByValue.h b/inst/include/Eigen/src/Core/ReturnByValue.h
index f635598d..892c193b 100644
--- a/inst/include/Eigen/src/Core/ReturnByValue.h
+++ b/inst/include/Eigen/src/Core/ReturnByValue.h
@@ -11,25 +11,20 @@
 #ifndef EIGEN_RETURNBYVALUE_H
 #define EIGEN_RETURNBYVALUE_H
 
-namespace Eigen {
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-/** \class ReturnByValue
-  * \ingroup Core_Module
-  *
-  */
+namespace Eigen {
 
 namespace internal {
 
-template<typename Derived>
-struct traits<ReturnByValue<Derived> >
-  : public traits<typename traits<Derived>::ReturnType>
-{
+template <typename Derived>
+struct traits<ReturnByValue<Derived> > : public traits<typename traits<Derived>::ReturnType> {
   enum {
     // We're disabling the DirectAccess because e.g. the constructor of
     // the Block-with-DirectAccess expression requires to have a coeffRef method.
     // Also, we don't want to have to implement the stride stuff.
-    Flags = (traits<typename traits<Derived>::ReturnType>::Flags
-             | EvalBeforeNestingBit) & ~DirectAccessBit
+    Flags = (traits<typename traits<Derived>::ReturnType>::Flags | EvalBeforeNestingBit) & ~DirectAccessBit
   };
 };
 
@@ -38,62 +33,79 @@ struct traits<ReturnByValue<Derived> >
  * So internal::nested always gives the plain return matrix type.
  *
  * FIXME: I don't understand why we need this specialization: isn't this taken care of by the EvalBeforeNestingBit ??
+ * Answer: EvalBeforeNestingBit should be deprecated since we have the evaluators
  */
-template<typename Derived,int n,typename PlainObject>
-struct nested<ReturnByValue<Derived>, n, PlainObject>
-{
+template <typename Derived, int n, typename PlainObject>
+struct nested_eval<ReturnByValue<Derived>, n, PlainObject> {
   typedef typename traits<Derived>::ReturnType type;
 };
 
-} // end namespace internal
+}  // end namespace internal
 
-template<typename Derived> class ReturnByValue
-  : internal::no_assignment_operator, public internal::dense_xpr_base< ReturnByValue<Derived> >::type
-{
-  public:
-    typedef typename internal::traits<Derived>::ReturnType ReturnType;
+/** \class ReturnByValue
+ * \ingroup Core_Module
+ *
+ */
+template <typename Derived>
+class ReturnByValue : public internal::dense_xpr_base<ReturnByValue<Derived> >::type, internal::no_assignment_operator {
+ public:
+  typedef typename internal::traits<Derived>::ReturnType ReturnType;
 
-    typedef typename internal::dense_xpr_base<ReturnByValue>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(ReturnByValue)
+  typedef typename internal::dense_xpr_base<ReturnByValue>::type Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(ReturnByValue)
 
-    template<typename Dest>
-    inline void evalTo(Dest& dst) const
-    { static_cast<const Derived*>(this)->evalTo(dst); }
-    inline Index rows() const { return static_cast<const Derived*>(this)->rows(); }
-    inline Index cols() const { return static_cast<const Derived*>(this)->cols(); }
+  template <typename Dest>
+  EIGEN_DEVICE_FUNC inline void evalTo(Dest& dst) const {
+    static_cast<const Derived*>(this)->evalTo(dst);
+  }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return static_cast<const Derived*>(this)->rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return static_cast<const Derived*>(this)->cols(); }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-#define Unusable YOU_ARE_TRYING_TO_ACCESS_A_SINGLE_COEFFICIENT_IN_A_SPECIAL_EXPRESSION_WHERE_THAT_IS_NOT_ALLOWED_BECAUSE_THAT_WOULD_BE_INEFFICIENT
-    class Unusable{
-      Unusable(const Unusable&) {}
-      Unusable& operator=(const Unusable&) {return *this;}
-    };
-    const Unusable& coeff(Index) const { return *reinterpret_cast<const Unusable*>(this); }
-    const Unusable& coeff(Index,Index) const { return *reinterpret_cast<const Unusable*>(this); }
-    Unusable& coeffRef(Index) { return *reinterpret_cast<Unusable*>(this); }
-    Unusable& coeffRef(Index,Index) { return *reinterpret_cast<Unusable*>(this); }
-    template<int LoadMode>  Unusable& packet(Index) const;
-    template<int LoadMode>  Unusable& packet(Index, Index) const;
+#define Unusable \
+  YOU_ARE_TRYING_TO_ACCESS_A_SINGLE_COEFFICIENT_IN_A_SPECIAL_EXPRESSION_WHERE_THAT_IS_NOT_ALLOWED_BECAUSE_THAT_WOULD_BE_INEFFICIENT
+  class Unusable {
+    Unusable(const Unusable&) {}
+    Unusable& operator=(const Unusable&) { return *this; }
+  };
+  const Unusable& coeff(Index) const { return *reinterpret_cast<const Unusable*>(this); }
+  const Unusable& coeff(Index, Index) const { return *reinterpret_cast<const Unusable*>(this); }
+  Unusable& coeffRef(Index) { return *reinterpret_cast<Unusable*>(this); }
+  Unusable& coeffRef(Index, Index) { return *reinterpret_cast<Unusable*>(this); }
+#undef Unusable
 #endif
 };
 
-template<typename Derived>
-template<typename OtherDerived>
-Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other)
-{
+template <typename Derived>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC Derived& DenseBase<Derived>::operator=(const ReturnByValue<OtherDerived>& other) {
   other.evalTo(derived());
   return derived();
 }
 
-template<typename Derived>
-template<typename OtherDerived>
-Derived& DenseBase<Derived>::lazyAssign(const ReturnByValue<OtherDerived>& other)
-{
-  other.evalTo(derived());
-  return derived();
-}
+namespace internal {
+
+// Expression is evaluated in a temporary; default implementation of Assignment is bypassed so that
+// when a ReturnByValue expression is assigned, the evaluator is not constructed.
+// TODO: Finalize port to new regime; ReturnByValue should not exist in the expression world
+
+template <typename Derived>
+struct evaluator<ReturnByValue<Derived> > : public evaluator<typename internal::traits<Derived>::ReturnType> {
+  typedef ReturnByValue<Derived> XprType;
+  typedef typename internal::traits<Derived>::ReturnType PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : m_result(xpr.rows(), xpr.cols()) {
+    internal::construct_at<Base>(this, m_result);
+    xpr.evalTo(m_result);
+  }
+
+ protected:
+  PlainObject m_result;
+};
 
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_RETURNBYVALUE_H
+#endif  // EIGEN_RETURNBYVALUE_H
diff --git a/inst/include/Eigen/src/Core/Reverse.h b/inst/include/Eigen/src/Core/Reverse.h
index e30ae3d2..d11ba167 100644
--- a/inst/include/Eigen/src/Core/Reverse.h
+++ b/inst/include/Eigen/src/Core/Reverse.h
@@ -12,213 +12,191 @@
 #ifndef EIGEN_REVERSE_H
 #define EIGEN_REVERSE_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-/** \class Reverse
-  * \ingroup Core_Module
-  *
-  * \brief Expression of the reverse of a vector or matrix
-  *
-  * \param MatrixType the type of the object of which we are taking the reverse
-  *
-  * This class represents an expression of the reverse of a vector.
-  * It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse()
-  * and most of the time this is the only way it is used.
-  *
-  * \sa MatrixBase::reverse(), VectorwiseOp::reverse()
-  */
+namespace Eigen {
 
 namespace internal {
 
-template<typename MatrixType, int Direction>
-struct traits<Reverse<MatrixType, Direction> >
- : traits<MatrixType>
-{
+template <typename MatrixType, int Direction>
+struct traits<Reverse<MatrixType, Direction> > : traits<MatrixType> {
   typedef typename MatrixType::Scalar Scalar;
   typedef typename traits<MatrixType>::StorageKind StorageKind;
   typedef typename traits<MatrixType>::XprKind XprKind;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
-  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
+  typedef std::remove_reference_t<MatrixTypeNested> MatrixTypeNested_;
   enum {
     RowsAtCompileTime = MatrixType::RowsAtCompileTime,
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-
-    // let's enable LinearAccess only with vectorization because of the product overhead
-    LinearAccess = ( (Direction==BothDirections) && (int(_MatrixTypeNested::Flags)&PacketAccessBit) )
-                 ? LinearAccessBit : 0,
-
-    Flags = int(_MatrixTypeNested::Flags) & (HereditaryBits | LvalueBit | PacketAccessBit | LinearAccess),
-
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+    Flags = MatrixTypeNested_::Flags & (RowMajorBit | LvalueBit)
   };
 };
 
-template<typename PacketScalar, bool ReversePacket> struct reverse_packet_cond
-{
-  static inline PacketScalar run(const PacketScalar& x) { return preverse(x); }
+template <typename PacketType, bool ReversePacket>
+struct reverse_packet_cond {
+  static inline PacketType run(const PacketType& x) { return preverse(x); }
 };
 
-template<typename PacketScalar> struct reverse_packet_cond<PacketScalar,false>
-{
-  static inline PacketScalar run(const PacketScalar& x) { return x; }
+template <typename PacketType>
+struct reverse_packet_cond<PacketType, false> {
+  static inline PacketType run(const PacketType& x) { return x; }
 };
 
-} // end namespace internal 
-
-template<typename MatrixType, int Direction> class Reverse
-  : public internal::dense_xpr_base< Reverse<MatrixType, Direction> >::type
-{
-  public:
-
-    typedef typename internal::dense_xpr_base<Reverse>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(Reverse)
-    using Base::IsRowMajor;
-
-    // next line is necessary because otherwise const version of operator()
-    // is hidden by non-const version defined in this file
-    using Base::operator(); 
-
-  protected:
-    enum {
-      PacketSize = internal::packet_traits<Scalar>::size,
-      IsColMajor = !IsRowMajor,
-      ReverseRow = (Direction == Vertical)   || (Direction == BothDirections),
-      ReverseCol = (Direction == Horizontal) || (Direction == BothDirections),
-      OffsetRow  = ReverseRow && IsColMajor ? PacketSize : 1,
-      OffsetCol  = ReverseCol && IsRowMajor ? PacketSize : 1,
-      ReversePacket = (Direction == BothDirections)
-                    || ((Direction == Vertical)   && IsColMajor)
-                    || ((Direction == Horizontal) && IsRowMajor)
-    };
-    typedef internal::reverse_packet_cond<PacketScalar,ReversePacket> reverse_packet;
-  public:
-
-    inline Reverse(const MatrixType& matrix) : m_matrix(matrix) { }
-
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse)
-
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-
-    inline Index innerStride() const
-    {
-      return -m_matrix.innerStride();
-    }
+}  // end namespace internal
 
-    inline Scalar& operator()(Index row, Index col)
-    {
-      eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
-      return coeffRef(row, col);
-    }
+/** \class Reverse
+ * \ingroup Core_Module
+ *
+ * \brief Expression of the reverse of a vector or matrix
+ *
+ * \tparam MatrixType the type of the object of which we are taking the reverse
+ * \tparam Direction defines the direction of the reverse operation, can be Vertical, Horizontal, or BothDirections
+ *
+ * This class represents an expression of the reverse of a vector.
+ * It is the return type of MatrixBase::reverse() and VectorwiseOp::reverse()
+ * and most of the time this is the only way it is used.
+ *
+ * \sa MatrixBase::reverse(), VectorwiseOp::reverse()
+ */
+template <typename MatrixType, int Direction>
+class Reverse : public internal::dense_xpr_base<Reverse<MatrixType, Direction> >::type {
+ public:
+  typedef typename internal::dense_xpr_base<Reverse>::type Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(Reverse)
+  typedef internal::remove_all_t<MatrixType> NestedExpression;
+  using Base::IsRowMajor;
+
+ protected:
+  enum {
+    PacketSize = internal::packet_traits<Scalar>::size,
+    IsColMajor = !IsRowMajor,
+    ReverseRow = (Direction == Vertical) || (Direction == BothDirections),
+    ReverseCol = (Direction == Horizontal) || (Direction == BothDirections),
+    OffsetRow = ReverseRow && IsColMajor ? PacketSize : 1,
+    OffsetCol = ReverseCol && IsRowMajor ? PacketSize : 1,
+    ReversePacket = (Direction == BothDirections) || ((Direction == Vertical) && IsColMajor) ||
+                    ((Direction == Horizontal) && IsRowMajor)
+  };
+  typedef internal::reverse_packet_cond<PacketScalar, ReversePacket> reverse_packet;
 
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      return m_matrix.const_cast_derived().coeffRef(ReverseRow ? m_matrix.rows() - row - 1 : row,
-                                                    ReverseCol ? m_matrix.cols() - col - 1 : col);
-    }
+ public:
+  EIGEN_DEVICE_FUNC explicit inline Reverse(const MatrixType& matrix) : m_matrix(matrix) {}
 
-    inline CoeffReturnType coeff(Index row, Index col) const
-    {
-      return m_matrix.coeff(ReverseRow ? m_matrix.rows() - row - 1 : row,
-                            ReverseCol ? m_matrix.cols() - col - 1 : col);
-    }
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Reverse)
 
-    inline CoeffReturnType coeff(Index index) const
-    {
-      return m_matrix.coeff(m_matrix.size() - index - 1);
-    }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
 
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_matrix.const_cast_derived().coeffRef(m_matrix.size() - index - 1);
-    }
+  EIGEN_DEVICE_FUNC inline Index innerStride() const { return -m_matrix.innerStride(); }
 
-    inline Scalar& operator()(Index index)
-    {
-      eigen_assert(index >= 0 && index < m_matrix.size());
-      return coeffRef(index);
-    }
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename MatrixType::Nested>& nestedExpression() const {
+    return m_matrix;
+  }
 
-    template<int LoadMode>
-    inline const PacketScalar packet(Index row, Index col) const
-    {
-      return reverse_packet::run(m_matrix.template packet<LoadMode>(
-                                    ReverseRow ? m_matrix.rows() - row - OffsetRow : row,
-                                    ReverseCol ? m_matrix.cols() - col - OffsetCol : col));
-    }
+ protected:
+  typename MatrixType::Nested m_matrix;
+};
 
-    template<int LoadMode>
-    inline void writePacket(Index row, Index col, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(
-                                      ReverseRow ? m_matrix.rows() - row - OffsetRow : row,
-                                      ReverseCol ? m_matrix.cols() - col - OffsetCol : col,
-                                      reverse_packet::run(x));
-    }
+/** \returns an expression of the reverse of *this.
+ *
+ * Example: \include MatrixBase_reverse.cpp
+ * Output: \verbinclude MatrixBase_reverse.out
+ *
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ReverseReturnType DenseBase<Derived>::reverse() {
+  return ReverseReturnType(derived());
+}
 
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return internal::preverse(m_matrix.template packet<LoadMode>( m_matrix.size() - index - PacketSize ));
-    }
+// reverse const overload moved DenseBase.h due to a CUDA compiler bug
 
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
-    {
-      m_matrix.const_cast_derived().template writePacket<LoadMode>(m_matrix.size() - index - PacketSize, internal::preverse(x));
+/** This is the "in place" version of reverse: it reverses \c *this.
+ *
+ * In most cases it is probably better to simply use the reversed expression
+ * of a matrix. However, when reversing the matrix data itself is really needed,
+ * then this "in-place" version is probably the right choice because it provides
+ * the following additional benefits:
+ *  - less error prone: doing the same operation with .reverse() requires special care:
+ *    \code m = m.reverse().eval(); \endcode
+ *  - this API enables reverse operations without the need for a temporary
+ *  - it allows future optimizations (cache friendliness, etc.)
+ *
+ * \sa VectorwiseOp::reverseInPlace(), reverse() */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::reverseInPlace() {
+  constexpr int HalfRowsAtCompileTime = RowsAtCompileTime == Dynamic ? Dynamic : RowsAtCompileTime / 2;
+  constexpr int HalfColsAtCompileTime = ColsAtCompileTime == Dynamic ? Dynamic : ColsAtCompileTime / 2;
+  if (cols() > rows()) {
+    Index half = cols() / 2;
+    this->template leftCols<HalfColsAtCompileTime>(half).swap(
+        this->template rightCols<HalfColsAtCompileTime>(half).reverse());
+    if ((cols() % 2) == 1) {
+      Index half2 = rows() / 2;
+      col(half).template head<HalfRowsAtCompileTime>(half2).swap(
+          col(half).template tail<HalfRowsAtCompileTime>(half2).reverse());
     }
-
-    const typename internal::remove_all<typename MatrixType::Nested>::type& 
-    nestedExpression() const 
-    {
-      return m_matrix;
+  } else {
+    Index half = rows() / 2;
+    this->template topRows<HalfRowsAtCompileTime>(half).swap(
+        this->template bottomRows<HalfRowsAtCompileTime>(half).reverse());
+    if ((rows() % 2) == 1) {
+      Index half2 = cols() / 2;
+      row(half).template head<HalfColsAtCompileTime>(half2).swap(
+          row(half).template tail<HalfColsAtCompileTime>(half2).reverse());
     }
+  }
+}
 
-  protected:
-    typename MatrixType::Nested m_matrix;
-};
+namespace internal {
 
-/** \returns an expression of the reverse of *this.
-  *
-  * Example: \include MatrixBase_reverse.cpp
-  * Output: \verbinclude MatrixBase_reverse.out
-  *
-  */
-template<typename Derived>
-inline typename DenseBase<Derived>::ReverseReturnType
-DenseBase<Derived>::reverse()
-{
-  return derived();
-}
+template <int Direction>
+struct vectorwise_reverse_inplace_impl;
+
+template <>
+struct vectorwise_reverse_inplace_impl<Vertical> {
+  template <typename ExpressionType>
+  static void run(ExpressionType& xpr) {
+    constexpr Index HalfAtCompileTime =
+        ExpressionType::RowsAtCompileTime == Dynamic ? Dynamic : ExpressionType::RowsAtCompileTime / 2;
+    Index half = xpr.rows() / 2;
+    xpr.template topRows<HalfAtCompileTime>(half).swap(
+        xpr.template bottomRows<HalfAtCompileTime>(half).colwise().reverse());
+  }
+};
 
-/** This is the const version of reverse(). */
-template<typename Derived>
-inline const typename DenseBase<Derived>::ConstReverseReturnType
-DenseBase<Derived>::reverse() const
-{
-  return derived();
-}
+template <>
+struct vectorwise_reverse_inplace_impl<Horizontal> {
+  template <typename ExpressionType>
+  static void run(ExpressionType& xpr) {
+    constexpr Index HalfAtCompileTime =
+        ExpressionType::ColsAtCompileTime == Dynamic ? Dynamic : ExpressionType::ColsAtCompileTime / 2;
+    Index half = xpr.cols() / 2;
+    xpr.template leftCols<HalfAtCompileTime>(half).swap(
+        xpr.template rightCols<HalfAtCompileTime>(half).rowwise().reverse());
+  }
+};
 
-/** This is the "in place" version of reverse: it reverses \c *this.
-  *
-  * In most cases it is probably better to simply use the reversed expression
-  * of a matrix. However, when reversing the matrix data itself is really needed,
-  * then this "in-place" version is probably the right choice because it provides
-  * the following additional features:
-  *  - less error prone: doing the same operation with .reverse() requires special care:
-  *    \code m = m.reverse().eval(); \endcode
-  *  - this API allows to avoid creating a temporary (the current implementation creates a temporary, but that could be avoided using swap)
-  *  - it allows future optimizations (cache friendliness, etc.)
-  *
-  * \sa reverse() */
-template<typename Derived>
-inline void DenseBase<Derived>::reverseInPlace()
-{
-  derived() = derived().reverse().eval();
+}  // end namespace internal
+
+/** This is the "in place" version of VectorwiseOp::reverse: it reverses each column or row of \c *this.
+ *
+ * In most cases it is probably better to simply use the reversed expression
+ * of a matrix. However, when reversing the matrix data itself is really needed,
+ * then this "in-place" version is probably the right choice because it provides
+ * the following additional benefits:
+ *  - less error prone: doing the same operation with .reverse() requires special care:
+ *    \code m = m.reverse().eval(); \endcode
+ *  - this API enables reverse operations without the need for a temporary
+ *
+ * \sa DenseBase::reverseInPlace(), reverse() */
+template <typename ExpressionType, int Direction>
+EIGEN_DEVICE_FUNC void VectorwiseOp<ExpressionType, Direction>::reverseInPlace() {
+  internal::vectorwise_reverse_inplace_impl<Direction>::run(m_matrix);
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_REVERSE_H
+#endif  // EIGEN_REVERSE_H
diff --git a/inst/include/Eigen/src/Core/Select.h b/inst/include/Eigen/src/Core/Select.h
index 87993bbb..61a67c2f 100644
--- a/inst/include/Eigen/src/Core/Select.h
+++ b/inst/include/Eigen/src/Core/Select.h
@@ -10,153 +10,83 @@
 #ifndef EIGEN_SELECT_H
 #define EIGEN_SELECT_H
 
-namespace Eigen { 
-
-/** \class Select
-  * \ingroup Core_Module
-  *
-  * \brief Expression of a coefficient wise version of the C++ ternary operator ?:
-  *
-  * \param ConditionMatrixType the type of the \em condition expression which must be a boolean matrix
-  * \param ThenMatrixType the type of the \em then expression
-  * \param ElseMatrixType the type of the \em else expression
-  *
-  * This class represents an expression of a coefficient wise version of the C++ ternary operator ?:.
-  * It is the return type of DenseBase::select() and most of the time this is the only way it is used.
-  *
-  * \sa DenseBase::select(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&) const
-  */
-
-namespace internal {
-template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
-struct traits<Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >
- : traits<ThenMatrixType>
-{
-  typedef typename traits<ThenMatrixType>::Scalar Scalar;
-  typedef Dense StorageKind;
-  typedef typename traits<ThenMatrixType>::XprKind XprKind;
-  typedef typename ConditionMatrixType::Nested ConditionMatrixNested;
-  typedef typename ThenMatrixType::Nested ThenMatrixNested;
-  typedef typename ElseMatrixType::Nested ElseMatrixNested;
-  enum {
-    RowsAtCompileTime = ConditionMatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = ConditionMatrixType::ColsAtCompileTime,
-    MaxRowsAtCompileTime = ConditionMatrixType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = ConditionMatrixType::MaxColsAtCompileTime,
-    Flags = (unsigned int)ThenMatrixType::Flags & ElseMatrixType::Flags & HereditaryBits,
-    CoeffReadCost = traits<typename remove_all<ConditionMatrixNested>::type>::CoeffReadCost
-                  + EIGEN_SIZE_MAX(traits<typename remove_all<ThenMatrixNested>::type>::CoeffReadCost,
-                                   traits<typename remove_all<ElseMatrixNested>::type>::CoeffReadCost)
-  };
-};
-}
-
-template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
-class Select : internal::no_assignment_operator,
-  public internal::dense_xpr_base< Select<ConditionMatrixType, ThenMatrixType, ElseMatrixType> >::type
-{
-  public:
-
-    typedef typename internal::dense_xpr_base<Select>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(Select)
-
-    Select(const ConditionMatrixType& a_conditionMatrix,
-           const ThenMatrixType& a_thenMatrix,
-           const ElseMatrixType& a_elseMatrix)
-      : m_condition(a_conditionMatrix), m_then(a_thenMatrix), m_else(a_elseMatrix)
-    {
-      eigen_assert(m_condition.rows() == m_then.rows() && m_condition.rows() == m_else.rows());
-      eigen_assert(m_condition.cols() == m_then.cols() && m_condition.cols() == m_else.cols());
-    }
-
-    Index rows() const { return m_condition.rows(); }
-    Index cols() const { return m_condition.cols(); }
-
-    const Scalar coeff(Index i, Index j) const
-    {
-      if (m_condition.coeff(i,j))
-        return m_then.coeff(i,j);
-      else
-        return m_else.coeff(i,j);
-    }
-
-    const Scalar coeff(Index i) const
-    {
-      if (m_condition.coeff(i))
-        return m_then.coeff(i);
-      else
-        return m_else.coeff(i);
-    }
-
-    const ConditionMatrixType& conditionMatrix() const
-    {
-      return m_condition;
-    }
-
-    const ThenMatrixType& thenMatrix() const
-    {
-      return m_then;
-    }
-
-    const ElseMatrixType& elseMatrix() const
-    {
-      return m_else;
-    }
-
-  protected:
-    typename ConditionMatrixType::Nested m_condition;
-    typename ThenMatrixType::Nested m_then;
-    typename ElseMatrixType::Nested m_else;
-};
-
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \typedef Select
+ * \ingroup Core_Module
+ *
+ * \brief Expression of a coefficient wise version of the C++ ternary operator ?:
+ *
+ * \tparam ConditionMatrixType the type of the \em condition expression which must be a boolean matrix
+ * \tparam ThenMatrixType the type of the \em then expression
+ * \tparam ElseMatrixType the type of the \em else expression
+ *
+ * This type represents an expression of a coefficient wise version of the C++ ternary operator ?:.
+ * It is the return type of DenseBase::select() and most of the time this is the only way it is used.
+ *
+ * \sa DenseBase::select(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&) const
+ */
+template <typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType>
+using Select = CwiseTernaryOp<internal::scalar_boolean_select_op<typename DenseBase<ThenMatrixType>::Scalar,
+                                                                 typename DenseBase<ElseMatrixType>::Scalar,
+                                                                 typename DenseBase<ConditionMatrixType>::Scalar>,
+                              ThenMatrixType, ElseMatrixType, ConditionMatrixType>;
 
 /** \returns a matrix where each coefficient (i,j) is equal to \a thenMatrix(i,j)
-  * if \c *this(i,j), and \a elseMatrix(i,j) otherwise.
-  *
-  * Example: \include MatrixBase_select.cpp
-  * Output: \verbinclude MatrixBase_select.out
-  *
-  * \sa class Select
-  */
-template<typename Derived>
-template<typename ThenDerived,typename ElseDerived>
-inline const Select<Derived,ThenDerived,ElseDerived>
-DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
-                            const DenseBase<ElseDerived>& elseMatrix) const
-{
-  return Select<Derived,ThenDerived,ElseDerived>(derived(), thenMatrix.derived(), elseMatrix.derived());
+ * if \c *this(i,j) != Scalar(0), and \a elseMatrix(i,j) otherwise.
+ *
+ * Example: \include MatrixBase_select.cpp
+ * Output: \verbinclude MatrixBase_select.out
+ *
+ * \sa typedef Select
+ */
+template <typename Derived>
+template <typename ThenDerived, typename ElseDerived>
+inline EIGEN_DEVICE_FUNC CwiseTernaryOp<
+    internal::scalar_boolean_select_op<typename DenseBase<ThenDerived>::Scalar, typename DenseBase<ElseDerived>::Scalar,
+                                       typename DenseBase<Derived>::Scalar>,
+    ThenDerived, ElseDerived, Derived>
+DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix, const DenseBase<ElseDerived>& elseMatrix) const {
+  return Select<Derived, ThenDerived, ElseDerived>(thenMatrix.derived(), elseMatrix.derived(), derived());
 }
-
 /** Version of DenseBase::select(const DenseBase&, const DenseBase&) with
-  * the \em else expression being a scalar value.
-  *
-  * \sa DenseBase::select(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&) const, class Select
-  */
-template<typename Derived>
-template<typename ThenDerived>
-inline const Select<Derived,ThenDerived, typename ThenDerived::ConstantReturnType>
+ * the \em else expression being a scalar value.
+ *
+ * \sa typedef Select
+ */
+template <typename Derived>
+template <typename ThenDerived>
+inline EIGEN_DEVICE_FUNC CwiseTernaryOp<
+    internal::scalar_boolean_select_op<typename DenseBase<ThenDerived>::Scalar, typename DenseBase<ThenDerived>::Scalar,
+                                       typename DenseBase<Derived>::Scalar>,
+    ThenDerived, typename DenseBase<ThenDerived>::ConstantReturnType, Derived>
 DenseBase<Derived>::select(const DenseBase<ThenDerived>& thenMatrix,
-                           const typename ThenDerived::Scalar& elseScalar) const
-{
-  return Select<Derived,ThenDerived,typename ThenDerived::ConstantReturnType>(
-    derived(), thenMatrix.derived(), ThenDerived::Constant(rows(),cols(),elseScalar));
+                           const typename DenseBase<ThenDerived>::Scalar& elseScalar) const {
+  using ElseConstantType = typename DenseBase<ThenDerived>::ConstantReturnType;
+  return Select<Derived, ThenDerived, ElseConstantType>(thenMatrix.derived(),
+                                                        ElseConstantType(rows(), cols(), elseScalar), derived());
 }
-
 /** Version of DenseBase::select(const DenseBase&, const DenseBase&) with
-  * the \em then expression being a scalar value.
-  *
-  * \sa DenseBase::select(const DenseBase<ThenDerived>&, const DenseBase<ElseDerived>&) const, class Select
-  */
-template<typename Derived>
-template<typename ElseDerived>
-inline const Select<Derived, typename ElseDerived::ConstantReturnType, ElseDerived >
-DenseBase<Derived>::select(const typename ElseDerived::Scalar& thenScalar,
-                           const DenseBase<ElseDerived>& elseMatrix) const
-{
-  return Select<Derived,typename ElseDerived::ConstantReturnType,ElseDerived>(
-    derived(), ElseDerived::Constant(rows(),cols(),thenScalar), elseMatrix.derived());
+ * the \em then expression being a scalar value.
+ *
+ * \sa typedef Select
+ */
+template <typename Derived>
+template <typename ElseDerived>
+inline EIGEN_DEVICE_FUNC CwiseTernaryOp<
+    internal::scalar_boolean_select_op<typename DenseBase<ElseDerived>::Scalar, typename DenseBase<ElseDerived>::Scalar,
+                                       typename DenseBase<Derived>::Scalar>,
+    typename DenseBase<ElseDerived>::ConstantReturnType, ElseDerived, Derived>
+DenseBase<Derived>::select(const typename DenseBase<ElseDerived>::Scalar& thenScalar,
+                           const DenseBase<ElseDerived>& elseMatrix) const {
+  using ThenConstantType = typename DenseBase<ElseDerived>::ConstantReturnType;
+  return Select<Derived, ThenConstantType, ElseDerived>(ThenConstantType(rows(), cols(), thenScalar),
+                                                        elseMatrix.derived(), derived());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SELECT_H
+#endif  // EIGEN_SELECT_H
diff --git a/inst/include/Eigen/src/Core/SelfAdjointView.h b/inst/include/Eigen/src/Core/SelfAdjointView.h
index 6fa7cd15..16f0e751 100644
--- a/inst/include/Eigen/src/Core/SelfAdjointView.h
+++ b/inst/include/Eigen/src/Core/SelfAdjointView.h
@@ -10,305 +10,320 @@
 #ifndef EIGEN_SELFADJOINTMATRIX_H
 #define EIGEN_SELFADJOINTMATRIX_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \class SelfAdjointView
-  * \ingroup Core_Module
-  *
-  *
-  * \brief Expression of a selfadjoint matrix from a triangular part of a dense matrix
-  *
-  * \param MatrixType the type of the dense matrix storing the coefficients
-  * \param TriangularPart can be either \c #Lower or \c #Upper
-  *
-  * This class is an expression of a sefladjoint matrix from a triangular part of a matrix
-  * with given dense storage of the coefficients. It is the return type of MatrixBase::selfadjointView()
-  * and most of the time this is the only way that it is used.
-  *
-  * \sa class TriangularBase, MatrixBase::selfadjointView()
-  */
+ * \ingroup Core_Module
+ *
+ *
+ * \brief Expression of a selfadjoint matrix from a triangular part of a dense matrix
+ *
+ * \tparam MatrixType the type of the dense matrix storing the coefficients
+ * \tparam TriangularPart can be either \c #Lower or \c #Upper
+ *
+ * This class is an expression of a sefladjoint matrix from a triangular part of a matrix
+ * with given dense storage of the coefficients. It is the return type of MatrixBase::selfadjointView()
+ * and most of the time this is the only way that it is used.
+ *
+ * \sa class TriangularBase, MatrixBase::selfadjointView()
+ */
 
 namespace internal {
-template<typename MatrixType, unsigned int UpLo>
-struct traits<SelfAdjointView<MatrixType, UpLo> > : traits<MatrixType>
-{
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
-  typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
+template <typename MatrixType, unsigned int UpLo>
+struct traits<SelfAdjointView<MatrixType, UpLo> > : traits<MatrixType> {
+  typedef typename ref_selector<MatrixType>::non_const_type MatrixTypeNested;
+  typedef remove_all_t<MatrixTypeNested> MatrixTypeNestedCleaned;
   typedef MatrixType ExpressionType;
-  typedef typename MatrixType::PlainObject DenseMatrixType;
+  typedef typename MatrixType::PlainObject FullMatrixType;
   enum {
     Mode = UpLo | SelfAdjoint,
-    Flags =  MatrixTypeNestedCleaned::Flags & (HereditaryBits)
-           & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)), // FIXME these flags should be preserved
-    CoeffReadCost = MatrixTypeNestedCleaned::CoeffReadCost
+    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags = MatrixTypeNestedCleaned::Flags & (HereditaryBits | FlagsLvalueBit) &
+            (~(PacketAccessBit | DirectAccessBit | LinearAccessBit))  // FIXME these flags should be preserved
   };
 };
-}
-
-template <typename Lhs, int LhsMode, bool LhsIsVector,
-          typename Rhs, int RhsMode, bool RhsIsVector>
-struct SelfadjointProductMatrix;
-
-// FIXME could also be called SelfAdjointWrapper to be consistent with DiagonalWrapper ??
-template<typename MatrixType, unsigned int UpLo> class SelfAdjointView
-  : public TriangularBase<SelfAdjointView<MatrixType, UpLo> >
-{
-  public:
-
-    typedef TriangularBase<SelfAdjointView> Base;
-    typedef typename internal::traits<SelfAdjointView>::MatrixTypeNested MatrixTypeNested;
-    typedef typename internal::traits<SelfAdjointView>::MatrixTypeNestedCleaned MatrixTypeNestedCleaned;
-
-    /** \brief The type of coefficients in this matrix */
-    typedef typename internal::traits<SelfAdjointView>::Scalar Scalar; 
-
-    typedef typename MatrixType::Index Index;
-
-    enum {
-      Mode = internal::traits<SelfAdjointView>::Mode
-    };
-    typedef typename MatrixType::PlainObject PlainObject;
-
-    inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix)
-    {}
-
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-    inline Index outerStride() const { return m_matrix.outerStride(); }
-    inline Index innerStride() const { return m_matrix.innerStride(); }
-
-    /** \sa MatrixBase::coeff()
-      * \warning the coordinates must fit into the referenced triangular part
-      */
-    inline Scalar coeff(Index row, Index col) const
-    {
-      Base::check_coordinates_internal(row, col);
-      return m_matrix.coeff(row, col);
-    }
-
-    /** \sa MatrixBase::coeffRef()
-      * \warning the coordinates must fit into the referenced triangular part
-      */
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      Base::check_coordinates_internal(row, col);
-      return m_matrix.const_cast_derived().coeffRef(row, col);
-    }
-
-    /** \internal */
-    const MatrixTypeNestedCleaned& _expression() const { return m_matrix; }
-
-    const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
-    MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); }
-
-    /** Efficient self-adjoint matrix times vector/matrix product */
-    template<typename OtherDerived>
-    SelfadjointProductMatrix<MatrixType,Mode,false,OtherDerived,0,OtherDerived::IsVectorAtCompileTime>
-    operator*(const MatrixBase<OtherDerived>& rhs) const
-    {
-      return SelfadjointProductMatrix
-              <MatrixType,Mode,false,OtherDerived,0,OtherDerived::IsVectorAtCompileTime>
-              (m_matrix, rhs.derived());
-    }
-
-    /** Efficient vector/matrix times self-adjoint matrix product */
-    template<typename OtherDerived> friend
-    SelfadjointProductMatrix<OtherDerived,0,OtherDerived::IsVectorAtCompileTime,MatrixType,Mode,false>
-    operator*(const MatrixBase<OtherDerived>& lhs, const SelfAdjointView& rhs)
-    {
-      return SelfadjointProductMatrix
-              <OtherDerived,0,OtherDerived::IsVectorAtCompileTime,MatrixType,Mode,false>
-              (lhs.derived(),rhs.m_matrix);
-    }
-
-    /** Perform a symmetric rank 2 update of the selfadjoint matrix \c *this:
-      * \f$ this = this + \alpha u v^* + conj(\alpha) v u^* \f$
-      * \returns a reference to \c *this
-      *
-      * The vectors \a u and \c v \b must be column vectors, however they can be
-      * a adjoint expression without any overhead. Only the meaningful triangular
-      * part of the matrix is updated, the rest is left unchanged.
-      *
-      * \sa rankUpdate(const MatrixBase<DerivedU>&, Scalar)
-      */
-    template<typename DerivedU, typename DerivedV>
-    SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha = Scalar(1));
-
-    /** Perform a symmetric rank K update of the selfadjoint matrix \c *this:
-      * \f$ this = this + \alpha ( u u^* ) \f$ where \a u is a vector or matrix.
-      *
-      * \returns a reference to \c *this
-      *
-      * Note that to perform \f$ this = this + \alpha ( u^* u ) \f$ you can simply
-      * call this function with u.adjoint().
-      *
-      * \sa rankUpdate(const MatrixBase<DerivedU>&, const MatrixBase<DerivedV>&, Scalar)
-      */
-    template<typename DerivedU>
-    SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));
-
-/////////// Cholesky module ///////////
-
-    const LLT<PlainObject, UpLo> llt() const;
-    const LDLT<PlainObject, UpLo> ldlt() const;
-
-/////////// Eigenvalue module ///////////
-
-    /** Real part of #Scalar */
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    /** Return type of eigenvalues() */
-    typedef Matrix<RealScalar, internal::traits<MatrixType>::ColsAtCompileTime, 1> EigenvaluesReturnType;
-
-    EigenvaluesReturnType eigenvalues() const;
-    RealScalar operatorNorm() const;
-    
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived>
-    SelfAdjointView& operator=(const MatrixBase<OtherDerived>& other)
-    {
-      enum {
-        OtherPart = UpLo == Upper ? StrictlyLower : StrictlyUpper
-      };
-      m_matrix.const_cast_derived().template triangularView<UpLo>() = other;
-      m_matrix.const_cast_derived().template triangularView<OtherPart>() = other.adjoint();
-      return *this;
-    }
-    template<typename OtherMatrixType, unsigned int OtherMode>
-    SelfAdjointView& operator=(const TriangularView<OtherMatrixType, OtherMode>& other)
-    {
-      enum {
-        OtherPart = UpLo == Upper ? StrictlyLower : StrictlyUpper
-      };
-      m_matrix.const_cast_derived().template triangularView<UpLo>() = other.toDenseMatrix();
-      m_matrix.const_cast_derived().template triangularView<OtherPart>() = other.toDenseMatrix().adjoint();
-      return *this;
-    }
-    #endif
-
-  protected:
-    MatrixTypeNested m_matrix;
-};
+}  // namespace internal
 
+template <typename MatrixType_, unsigned int UpLo>
+class SelfAdjointView : public TriangularBase<SelfAdjointView<MatrixType_, UpLo> > {
+ public:
+  EIGEN_STATIC_ASSERT(UpLo == Lower || UpLo == Upper, SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY)
 
-// template<typename OtherDerived, typename MatrixType, unsigned int UpLo>
-// internal::selfadjoint_matrix_product_returntype<OtherDerived,SelfAdjointView<MatrixType,UpLo> >
-// operator*(const MatrixBase<OtherDerived>& lhs, const SelfAdjointView<MatrixType,UpLo>& rhs)
-// {
-//   return internal::matrix_selfadjoint_product_returntype<OtherDerived,SelfAdjointView<MatrixType,UpLo> >(lhs.derived(),rhs);
-// }
-
-// selfadjoint to dense matrix
+  typedef MatrixType_ MatrixType;
+  typedef TriangularBase<SelfAdjointView> Base;
+  typedef typename internal::traits<SelfAdjointView>::MatrixTypeNested MatrixTypeNested;
+  typedef typename internal::traits<SelfAdjointView>::MatrixTypeNestedCleaned MatrixTypeNestedCleaned;
+  typedef MatrixTypeNestedCleaned NestedExpression;
 
-namespace internal {
+  /** \brief The type of coefficients in this matrix */
+  typedef typename internal::traits<SelfAdjointView>::Scalar Scalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef internal::remove_all_t<typename MatrixType::ConjugateReturnType> MatrixConjugateReturnType;
+  typedef SelfAdjointView<std::add_const_t<MatrixType>, UpLo> ConstSelfAdjointView;
 
-template<typename Derived1, typename Derived2, int UnrollCount, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), UnrollCount, ClearOpposite>
-{
   enum {
-    col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
+    Mode = internal::traits<SelfAdjointView>::Mode,
+    Flags = internal::traits<SelfAdjointView>::Flags,
+    TransposeMode = ((int(Mode) & int(Upper)) ? Lower : 0) | ((int(Mode) & int(Lower)) ? Upper : 0)
   };
+  typedef typename MatrixType::PlainObject PlainObject;
+
+  EIGEN_DEVICE_FUNC explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix) {}
 
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Upper), UnrollCount-1, ClearOpposite>::run(dst, src);
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return m_matrix.outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return m_matrix.innerStride(); }
 
-    if(row == col)
-      dst.coeffRef(row, col) = numext::real(src.coeff(row, col));
-    else if(row < col)
-      dst.coeffRef(col, row) = numext::conj(dst.coeffRef(row, col) = src.coeff(row, col));
+  /** \sa MatrixBase::coeff()
+   * \warning the coordinates must fit into the referenced triangular part
+   */
+  EIGEN_DEVICE_FUNC inline Scalar coeff(Index row, Index col) const {
+    Base::check_coordinates_internal(row, col);
+    return m_matrix.coeff(row, col);
   }
-};
 
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, 0, ClearOpposite>
-{
-  static inline void run(Derived1 &, const Derived2 &) {}
-};
+  /** \sa MatrixBase::coeffRef()
+   * \warning the coordinates must fit into the referenced triangular part
+   */
+  EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col) {
+    EIGEN_STATIC_ASSERT_LVALUE(SelfAdjointView);
+    Base::check_coordinates_internal(row, col);
+    return m_matrix.coeffRef(row, col);
+  }
 
-template<typename Derived1, typename Derived2, int UnrollCount, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), UnrollCount, ClearOpposite>
-{
-  enum {
-    col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
-  };
+  /** \internal */
+  EIGEN_DEVICE_FUNC const MatrixTypeNestedCleaned& _expression() const { return m_matrix; }
 
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    triangular_assignment_selector<Derived1, Derived2, (SelfAdjoint|Lower), UnrollCount-1, ClearOpposite>::run(dst, src);
+  EIGEN_DEVICE_FUNC const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
+  EIGEN_DEVICE_FUNC MatrixTypeNestedCleaned& nestedExpression() { return m_matrix; }
 
-    if(row == col)
-      dst.coeffRef(row, col) = numext::real(src.coeff(row, col));
-    else if(row > col)
-      dst.coeffRef(col, row) = numext::conj(dst.coeffRef(row, col) = src.coeff(row, col));
+  /** Efficient triangular matrix times vector/matrix product */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC const Product<SelfAdjointView, OtherDerived> operator*(const MatrixBase<OtherDerived>& rhs) const {
+    return Product<SelfAdjointView, OtherDerived>(*this, rhs.derived());
   }
-};
 
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Lower, 0, ClearOpposite>
-{
-  static inline void run(Derived1 &, const Derived2 &) {}
-};
+  /** Efficient vector/matrix times triangular matrix product */
+  template <typename OtherDerived>
+  friend EIGEN_DEVICE_FUNC const Product<OtherDerived, SelfAdjointView> operator*(const MatrixBase<OtherDerived>& lhs,
+                                                                                  const SelfAdjointView& rhs) {
+    return Product<OtherDerived, SelfAdjointView>(lhs.derived(), rhs);
+  }
+
+  friend EIGEN_DEVICE_FUNC const
+      SelfAdjointView<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar, MatrixType, product), UpLo>
+      operator*(const Scalar& s, const SelfAdjointView& mat) {
+    return (s * mat.nestedExpression()).template selfadjointView<UpLo>();
+  }
+
+  /** Perform a symmetric rank 2 update of the selfadjoint matrix \c *this:
+   * \f$ this = this + \alpha u v^* + conj(\alpha) v u^* \f$
+   * \returns a reference to \c *this
+   *
+   * The vectors \a u and \c v \b must be column vectors, however they can be
+   * a adjoint expression without any overhead. Only the meaningful triangular
+   * part of the matrix is updated, the rest is left unchanged.
+   *
+   * \sa rankUpdate(const MatrixBase<DerivedU>&, Scalar)
+   */
+  template <typename DerivedU, typename DerivedV>
+  EIGEN_DEVICE_FUNC SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v,
+                                                const Scalar& alpha = Scalar(1));
+
+  /** Perform a symmetric rank K update of the selfadjoint matrix \c *this:
+   * \f$ this = this + \alpha ( u u^* ) \f$ where \a u is a vector or matrix.
+   *
+   * \returns a reference to \c *this
+   *
+   * Note that to perform \f$ this = this + \alpha ( u^* u ) \f$ you can simply
+   * call this function with u.adjoint().
+   *
+   * \sa rankUpdate(const MatrixBase<DerivedU>&, const MatrixBase<DerivedV>&, Scalar)
+   */
+  template <typename DerivedU>
+  EIGEN_DEVICE_FUNC SelfAdjointView& rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));
+
+  /** \returns an expression of a triangular view extracted from the current selfadjoint view of a given triangular part
+   *
+   * The parameter \a TriMode can have the following values: \c #Upper, \c #StrictlyUpper, \c #UnitUpper,
+   * \c #Lower, \c #StrictlyLower, \c #UnitLower.
+   *
+   * If \c TriMode references the same triangular part than \c *this, then this method simply return a \c TriangularView
+   * of the nested expression, otherwise, the nested expression is first transposed, thus returning a \c
+   * TriangularView<Transpose<MatrixType>> object.
+   *
+   * \sa MatrixBase::triangularView(), class TriangularView
+   */
+  template <unsigned int TriMode>
+  EIGEN_DEVICE_FUNC
+      std::conditional_t<(TriMode & (Upper | Lower)) == (UpLo & (Upper | Lower)), TriangularView<MatrixType, TriMode>,
+                         TriangularView<typename MatrixType::AdjointReturnType, TriMode> >
+      triangularView() const {
+    std::conditional_t<(TriMode & (Upper | Lower)) == (UpLo & (Upper | Lower)), MatrixType&,
+                       typename MatrixType::ConstTransposeReturnType>
+        tmp1(m_matrix);
+    std::conditional_t<(TriMode & (Upper | Lower)) == (UpLo & (Upper | Lower)), MatrixType&,
+                       typename MatrixType::AdjointReturnType>
+        tmp2(tmp1);
+    return std::conditional_t<(TriMode & (Upper | Lower)) == (UpLo & (Upper | Lower)),
+                              TriangularView<MatrixType, TriMode>,
+                              TriangularView<typename MatrixType::AdjointReturnType, TriMode> >(tmp2);
+  }
+
+  typedef SelfAdjointView<const MatrixConjugateReturnType, UpLo> ConjugateReturnType;
+  /** \sa MatrixBase::conjugate() const */
+  EIGEN_DEVICE_FUNC inline const ConjugateReturnType conjugate() const {
+    return ConjugateReturnType(m_matrix.conjugate());
+  }
 
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Upper, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      for(Index i = 0; i < j; ++i)
-      {
-        dst.copyCoeff(i, j, src);
-        dst.coeffRef(j,i) = numext::conj(dst.coeff(i,j));
-      }
-      dst.copyCoeff(j, j, src);
-    }
+  /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+   *           returns \c *this otherwise.
+   */
+  template <bool Cond>
+  EIGEN_DEVICE_FUNC inline std::conditional_t<Cond, ConjugateReturnType, ConstSelfAdjointView> conjugateIf() const {
+    typedef std::conditional_t<Cond, ConjugateReturnType, ConstSelfAdjointView> ReturnType;
+    return ReturnType(m_matrix.template conjugateIf<Cond>());
   }
+
+  typedef SelfAdjointView<const typename MatrixType::AdjointReturnType, TransposeMode> AdjointReturnType;
+  /** \sa MatrixBase::adjoint() const */
+  EIGEN_DEVICE_FUNC inline const AdjointReturnType adjoint() const { return AdjointReturnType(m_matrix.adjoint()); }
+
+  typedef SelfAdjointView<typename MatrixType::TransposeReturnType, TransposeMode> TransposeReturnType;
+  /** \sa MatrixBase::transpose() */
+  template <class Dummy = int>
+  EIGEN_DEVICE_FUNC inline TransposeReturnType transpose(
+      std::enable_if_t<Eigen::internal::is_lvalue<MatrixType>::value, Dummy*> = nullptr) {
+    typename MatrixType::TransposeReturnType tmp(m_matrix);
+    return TransposeReturnType(tmp);
+  }
+
+  typedef SelfAdjointView<const typename MatrixType::ConstTransposeReturnType, TransposeMode> ConstTransposeReturnType;
+  /** \sa MatrixBase::transpose() const */
+  EIGEN_DEVICE_FUNC inline const ConstTransposeReturnType transpose() const {
+    return ConstTransposeReturnType(m_matrix.transpose());
+  }
+
+  /** \returns a const expression of the main diagonal of the matrix \c *this
+   *
+   * This method simply returns the diagonal of the nested expression, thus by-passing the SelfAdjointView decorator.
+   *
+   * \sa MatrixBase::diagonal(), class Diagonal */
+  EIGEN_DEVICE_FUNC typename MatrixType::ConstDiagonalReturnType diagonal() const {
+    return typename MatrixType::ConstDiagonalReturnType(m_matrix);
+  }
+
+  /////////// Cholesky module ///////////
+
+  const LLT<PlainObject, UpLo> llt() const;
+  const LDLT<PlainObject, UpLo> ldlt() const;
+
+  /////////// Eigenvalue module ///////////
+
+  /** Real part of #Scalar */
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  /** Return type of eigenvalues() */
+  typedef Matrix<RealScalar, internal::traits<MatrixType>::ColsAtCompileTime, 1> EigenvaluesReturnType;
+
+  EIGEN_DEVICE_FUNC EigenvaluesReturnType eigenvalues() const;
+  EIGEN_DEVICE_FUNC RealScalar operatorNorm() const;
+
+ protected:
+  MatrixTypeNested m_matrix;
+};
+
+// template<typename OtherDerived, typename MatrixType, unsigned int UpLo>
+// internal::selfadjoint_matrix_product_returntype<OtherDerived,SelfAdjointView<MatrixType,UpLo> >
+// operator*(const MatrixBase<OtherDerived>& lhs, const SelfAdjointView<MatrixType,UpLo>& rhs)
+// {
+//   return internal::matrix_selfadjoint_product_returntype<OtherDerived,SelfAdjointView<MatrixType,UpLo>
+//   >(lhs.derived(),rhs);
+// }
+
+// selfadjoint to dense matrix
+
+namespace internal {
+
+// TODO currently a selfadjoint expression has the form SelfAdjointView<.,.>
+//      in the future selfadjoint-ness should be defined by the expression traits
+//      such that Transpose<SelfAdjointView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to
+//      make it work)
+template <typename MatrixType, unsigned int Mode>
+struct evaluator_traits<SelfAdjointView<MatrixType, Mode> > {
+  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+  typedef SelfAdjointShape Shape;
 };
 
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, SelfAdjoint|Lower, Dynamic, ClearOpposite>
-{
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-  typedef typename Derived1::Index Index;
-    for(Index i = 0; i < dst.rows(); ++i)
-    {
-      for(Index j = 0; j < i; ++j)
-      {
-        dst.copyCoeff(i, j, src);
-        dst.coeffRef(j,i) = numext::conj(dst.coeff(i,j));
-      }
-      dst.copyCoeff(i, i, src);
-    }
+template <int UpLo, int SetOpposite, typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor,
+          int Version>
+class triangular_dense_assignment_kernel<UpLo, SelfAdjoint, SetOpposite, DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor,
+                                         Version>
+    : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version> {
+ protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version> Base;
+  typedef typename Base::DstXprType DstXprType;
+  typedef typename Base::SrcXprType SrcXprType;
+  using Base::m_dst;
+  using Base::m_functor;
+  using Base::m_src;
+
+ public:
+  typedef typename Base::DstEvaluatorType DstEvaluatorType;
+  typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::AssignmentTraits AssignmentTraits;
+
+  EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType& dst, const SrcEvaluatorType& src,
+                                                       const Functor& func, DstXprType& dstExpr)
+      : Base(dst, src, func, dstExpr) {}
+
+  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col) {
+    eigen_internal_assert(row != col);
+    Scalar tmp = m_src.coeff(row, col);
+    m_functor.assignCoeff(m_dst.coeffRef(row, col), tmp);
+    m_functor.assignCoeff(m_dst.coeffRef(col, row), numext::conj(tmp));
   }
+
+  EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id) { Base::assignCoeff(id, id); }
+
+  EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index, Index) { eigen_internal_assert(false && "should never be called"); }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
 /***************************************************************************
-* Implementation of MatrixBase methods
-***************************************************************************/
-
-template<typename Derived>
-template<unsigned int UpLo>
-typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
-MatrixBase<Derived>::selfadjointView() const
-{
-  return derived();
+ * Implementation of MatrixBase methods
+ ***************************************************************************/
+
+/** This is the const version of MatrixBase::selfadjointView() */
+template <typename Derived>
+template <unsigned int UpLo>
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
+MatrixBase<Derived>::selfadjointView() const {
+  return typename ConstSelfAdjointViewReturnType<UpLo>::Type(derived());
 }
 
-template<typename Derived>
-template<unsigned int UpLo>
-typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
-MatrixBase<Derived>::selfadjointView()
-{
-  return derived();
+/** \returns an expression of a symmetric/self-adjoint view extracted from the upper or lower triangular part of the
+ * current matrix
+ *
+ * The parameter \a UpLo can be either \c #Upper or \c #Lower
+ *
+ * Example: \include MatrixBase_selfadjointView.cpp
+ * Output: \verbinclude MatrixBase_selfadjointView.out
+ *
+ * \sa class SelfAdjointView
+ */
+template <typename Derived>
+template <unsigned int UpLo>
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
+MatrixBase<Derived>::selfadjointView() {
+  return typename SelfAdjointViewReturnType<UpLo>::Type(derived());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SELFADJOINTMATRIX_H
+#endif  // EIGEN_SELFADJOINTMATRIX_H
diff --git a/inst/include/Eigen/src/Core/SelfCwiseBinaryOp.h b/inst/include/Eigen/src/Core/SelfCwiseBinaryOp.h
index 0956475a..1bc03737 100644
--- a/inst/include/Eigen/src/Core/SelfCwiseBinaryOp.h
+++ b/inst/include/Eigen/src/Core/SelfCwiseBinaryOp.h
@@ -10,182 +10,41 @@
 #ifndef EIGEN_SELFCWISEBINARYOP_H
 #define EIGEN_SELFCWISEBINARYOP_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-/** \class SelfCwiseBinaryOp
-  * \ingroup Core_Module
-  *
-  * \internal
-  *
-  * \brief Internal helper class for optimizing operators like +=, -=
-  *
-  * This is a pseudo expression class re-implementing the copyCoeff/copyPacket
-  * method to directly performs a +=/-= operations in an optimal way. In particular,
-  * this allows to make sure that the input/output data are loaded only once using
-  * aligned packet loads.
-  *
-  * \sa class SwapWrapper for a similar trick.
-  */
+namespace Eigen {
 
-namespace internal {
-template<typename BinaryOp, typename Lhs, typename Rhs>
-struct traits<SelfCwiseBinaryOp<BinaryOp,Lhs,Rhs> >
-  : traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >
-{
-  enum {
-    // Note that it is still a good idea to preserve the DirectAccessBit
-    // so that assign can correctly align the data.
-    Flags = traits<CwiseBinaryOp<BinaryOp,Lhs,Rhs> >::Flags | (Lhs::Flags&DirectAccessBit) | (Lhs::Flags&LvalueBit),
-    OuterStrideAtCompileTime = Lhs::OuterStrideAtCompileTime,
-    InnerStrideAtCompileTime = Lhs::InnerStrideAtCompileTime
-  };
-};
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const Scalar& other) {
+  using ConstantExpr = typename internal::plain_constant_type<Derived, Scalar>::type;
+  using Op = internal::mul_assign_op<Scalar>;
+  internal::call_assignment(derived(), ConstantExpr(rows(), cols(), other), Op());
+  return derived();
 }
 
-template<typename BinaryOp, typename Lhs, typename Rhs> class SelfCwiseBinaryOp
-  : public internal::dense_xpr_base< SelfCwiseBinaryOp<BinaryOp, Lhs, Rhs> >::type
-{
-  public:
-
-    typedef typename internal::dense_xpr_base<SelfCwiseBinaryOp>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(SelfCwiseBinaryOp)
-
-    typedef typename internal::packet_traits<Scalar>::type Packet;
-
-    inline SelfCwiseBinaryOp(Lhs& xpr, const BinaryOp& func = BinaryOp()) : m_matrix(xpr), m_functor(func) {}
-
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-    inline Index outerStride() const { return m_matrix.outerStride(); }
-    inline Index innerStride() const { return m_matrix.innerStride(); }
-    inline const Scalar* data() const { return m_matrix.data(); }
-
-    // note that this function is needed by assign to correctly align loads/stores
-    // TODO make Assign use .data()
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(Lhs)
-      return m_matrix.const_cast_derived().coeffRef(row, col);
-    }
-    inline const Scalar& coeffRef(Index row, Index col) const
-    {
-      return m_matrix.coeffRef(row, col);
-    }
-
-    // note that this function is needed by assign to correctly align loads/stores
-    // TODO make Assign use .data()
-    inline Scalar& coeffRef(Index index)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(Lhs)
-      return m_matrix.const_cast_derived().coeffRef(index);
-    }
-    inline const Scalar& coeffRef(Index index) const
-    {
-      return m_matrix.const_cast_derived().coeffRef(index);
-    }
-
-    template<typename OtherDerived>
-    void copyCoeff(Index row, Index col, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(row >= 0 && row < rows()
-                         && col >= 0 && col < cols());
-      Scalar& tmp = m_matrix.coeffRef(row,col);
-      tmp = m_functor(tmp, _other.coeff(row,col));
-    }
-
-    template<typename OtherDerived>
-    void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(index >= 0 && index < m_matrix.size());
-      Scalar& tmp = m_matrix.coeffRef(index);
-      tmp = m_functor(tmp, _other.coeff(index));
-    }
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    void copyPacket(Index row, Index col, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(row >= 0 && row < rows()
-                        && col >= 0 && col < cols());
-      m_matrix.template writePacket<StoreMode>(row, col,
-        m_functor.packetOp(m_matrix.template packet<StoreMode>(row, col),_other.template packet<LoadMode>(row, col)) );
-    }
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    void copyPacket(Index index, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(index >= 0 && index < m_matrix.size());
-      m_matrix.template writePacket<StoreMode>(index,
-        m_functor.packetOp(m_matrix.template packet<StoreMode>(index),_other.template packet<LoadMode>(index)) );
-    }
-
-    // reimplement lazyAssign to handle complex *= real
-    // see CwiseBinaryOp ctor for details
-    template<typename RhsDerived>
-    EIGEN_STRONG_INLINE SelfCwiseBinaryOp& lazyAssign(const DenseBase<RhsDerived>& rhs)
-    {
-      EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Lhs,RhsDerived)
-      EIGEN_CHECK_BINARY_COMPATIBILIY(BinaryOp,typename Lhs::Scalar,typename RhsDerived::Scalar);
-      
-    #ifdef EIGEN_DEBUG_ASSIGN
-      internal::assign_traits<SelfCwiseBinaryOp, RhsDerived>::debug();
-    #endif
-      eigen_assert(rows() == rhs.rows() && cols() == rhs.cols());
-      internal::assign_impl<SelfCwiseBinaryOp, RhsDerived>::run(*this,rhs.derived());
-    #ifndef EIGEN_NO_DEBUG
-      this->checkTransposeAliasing(rhs.derived());
-    #endif
-      return *this;
-    }
-    
-    // overloaded to honor evaluation of special matrices
-    // maybe another solution would be to not use SelfCwiseBinaryOp
-    // at first...
-    SelfCwiseBinaryOp& operator=(const Rhs& _rhs)
-    {
-      typename internal::nested<Rhs>::type rhs(_rhs);
-      return Base::operator=(rhs);
-    }
-
-    Lhs& expression() const 
-    { 
-      return m_matrix;
-    }
-
-    const BinaryOp& functor() const 
-    { 
-      return m_functor;
-    }
-
-  protected:
-    Lhs& m_matrix;
-    const BinaryOp& m_functor;
-
-  private:
-    SelfCwiseBinaryOp& operator=(const SelfCwiseBinaryOp&);
-};
+template <typename Derived>
+template <bool Enable, typename>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator*=(const RealScalar& other) {
+  realView() *= other;
+  return derived();
+}
 
-template<typename Derived>
-inline Derived& DenseBase<Derived>::operator*=(const Scalar& other)
-{
-  typedef typename Derived::PlainObject PlainObject;
-  SelfCwiseBinaryOp<internal::scalar_product_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
-  tmp = PlainObject::Constant(rows(),cols(),other);
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const Scalar& other) {
+  using ConstantExpr = typename internal::plain_constant_type<Derived, Scalar>::type;
+  using Op = internal::div_assign_op<Scalar>;
+  internal::call_assignment(derived(), ConstantExpr(rows(), cols(), other), Op());
   return derived();
 }
 
-template<typename Derived>
-inline Derived& DenseBase<Derived>::operator/=(const Scalar& other)
-{
-  typedef typename Derived::PlainObject PlainObject;
-  SelfCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, Derived, typename PlainObject::ConstantReturnType> tmp(derived());
-  tmp = PlainObject::Constant(rows(),cols(), other);
+template <typename Derived>
+template <bool Enable, typename>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase<Derived>::operator/=(const RealScalar& other) {
+  realView() /= other;
   return derived();
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SELFCWISEBINARYOP_H
+#endif  // EIGEN_SELFCWISEBINARYOP_H
diff --git a/inst/include/Eigen/src/Core/SkewSymmetricMatrix3.h b/inst/include/Eigen/src/Core/SkewSymmetricMatrix3.h
new file mode 100644
index 00000000..3545afc7
--- /dev/null
+++ b/inst/include/Eigen/src/Core/SkewSymmetricMatrix3.h
@@ -0,0 +1,382 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SKEWSYMMETRICMATRIX3_H
+#define EIGEN_SKEWSYMMETRICMATRIX3_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \class SkewSymmetricBase
+ * \ingroup Core_Module
+ *
+ * \brief Base class for skew symmetric matrices and expressions
+ *
+ * This is the base class that is inherited by SkewSymmetricMatrix3 and related expression
+ * types, which internally use a three vector for storing the entries. SkewSymmetric
+ * types always represent square three times three matrices.
+ *
+ * This implementations follows class DiagonalMatrix
+ *
+ * \tparam Derived is the derived type, a SkewSymmetricMatrix3 or SkewSymmetricWrapper.
+ *
+ * \sa class SkewSymmetricMatrix3, class SkewSymmetricWrapper
+ */
+template <typename Derived>
+class SkewSymmetricBase : public EigenBase<Derived> {
+ public:
+  typedef typename internal::traits<Derived>::SkewSymmetricVectorType SkewSymmetricVectorType;
+  typedef typename SkewSymmetricVectorType::Scalar Scalar;
+  typedef typename SkewSymmetricVectorType::RealScalar RealScalar;
+  typedef typename internal::traits<Derived>::StorageKind StorageKind;
+  typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
+
+  enum {
+    RowsAtCompileTime = SkewSymmetricVectorType::SizeAtCompileTime,
+    ColsAtCompileTime = SkewSymmetricVectorType::SizeAtCompileTime,
+    MaxRowsAtCompileTime = SkewSymmetricVectorType::MaxSizeAtCompileTime,
+    MaxColsAtCompileTime = SkewSymmetricVectorType::MaxSizeAtCompileTime,
+    IsVectorAtCompileTime = 0,
+    Flags = NoPreferredStorageOrderBit
+  };
+
+  typedef Matrix<Scalar, RowsAtCompileTime, ColsAtCompileTime, 0, MaxRowsAtCompileTime, MaxColsAtCompileTime>
+      DenseMatrixType;
+  typedef DenseMatrixType DenseType;
+  typedef SkewSymmetricMatrix3<Scalar> PlainObject;
+
+  /** \returns a reference to the derived object. */
+  EIGEN_DEVICE_FUNC inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+  /** \returns a const reference to the derived object. */
+  EIGEN_DEVICE_FUNC inline Derived& derived() { return *static_cast<Derived*>(this); }
+
+  /**
+   * Constructs a dense matrix from \c *this. Note, this directly returns a dense matrix type,
+   * not an expression.
+   * \returns A dense matrix, with its entries set from the the derived object. */
+  EIGEN_DEVICE_FUNC DenseMatrixType toDenseMatrix() const { return derived(); }
+
+  /** Determinant vanishes */
+  EIGEN_DEVICE_FUNC constexpr Scalar determinant() const { return 0; }
+
+  /** A.transpose() = -A */
+  EIGEN_DEVICE_FUNC PlainObject transpose() const { return (-vector()).asSkewSymmetric(); }
+
+  /** \returns the exponential of this matrix using Rodrigues’ formula */
+  EIGEN_DEVICE_FUNC DenseMatrixType exponential() const {
+    DenseMatrixType retVal = DenseMatrixType::Identity();
+    const SkewSymmetricVectorType& v = vector();
+    if (v.isZero()) {
+      return retVal;
+    }
+    const Scalar norm2 = v.squaredNorm();
+    const Scalar norm = numext::sqrt(norm2);
+    retVal += ((((1 - numext::cos(norm)) / norm2) * derived()) * derived()) +
+              (numext::sin(norm) / norm) * derived().toDenseMatrix();
+    return retVal;
+  }
+
+  /** \returns a reference to the derived object's vector of coefficients. */
+  EIGEN_DEVICE_FUNC inline const SkewSymmetricVectorType& vector() const { return derived().vector(); }
+  /** \returns a const reference to the derived object's vector of coefficients. */
+  EIGEN_DEVICE_FUNC inline SkewSymmetricVectorType& vector() { return derived().vector(); }
+
+  /** \returns the number of rows. */
+  EIGEN_DEVICE_FUNC constexpr Index rows() const { return 3; }
+  /** \returns the number of columns. */
+  EIGEN_DEVICE_FUNC constexpr Index cols() const { return 3; }
+
+  /** \returns the matrix product of \c *this by the dense matrix, \a matrix */
+  template <typename MatrixDerived>
+  EIGEN_DEVICE_FUNC Product<Derived, MatrixDerived, LazyProduct> operator*(
+      const MatrixBase<MatrixDerived>& matrix) const {
+    return Product<Derived, MatrixDerived, LazyProduct>(derived(), matrix.derived());
+  }
+
+  /** \returns the matrix product of \c *this by the skew symmetric matrix, \a matrix */
+  template <typename MatrixDerived>
+  EIGEN_DEVICE_FUNC Product<Derived, MatrixDerived, LazyProduct> operator*(
+      const SkewSymmetricBase<MatrixDerived>& matrix) const {
+    return Product<Derived, MatrixDerived, LazyProduct>(derived(), matrix.derived());
+  }
+
+  template <typename OtherDerived>
+  using SkewSymmetricProductReturnType = SkewSymmetricWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(
+      SkewSymmetricVectorType, typename OtherDerived::SkewSymmetricVectorType, product)>;
+
+  /** \returns the wedge product of \c *this by the skew symmetric matrix \a other
+   *  A wedge B = AB - BA */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC SkewSymmetricProductReturnType<OtherDerived> wedge(
+      const SkewSymmetricBase<OtherDerived>& other) const {
+    return vector().cross(other.vector()).asSkewSymmetric();
+  }
+
+  using SkewSymmetricScaleReturnType =
+      SkewSymmetricWrapper<const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(SkewSymmetricVectorType, Scalar, product)>;
+
+  /** \returns the product of \c *this by the scalar \a scalar */
+  EIGEN_DEVICE_FUNC inline SkewSymmetricScaleReturnType operator*(const Scalar& scalar) const {
+    return (vector() * scalar).asSkewSymmetric();
+  }
+
+  using ScaleSkewSymmetricReturnType =
+      SkewSymmetricWrapper<const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(Scalar, SkewSymmetricVectorType, product)>;
+
+  /** \returns the product of a scalar and the skew symmetric matrix \a other */
+  EIGEN_DEVICE_FUNC friend inline ScaleSkewSymmetricReturnType operator*(const Scalar& scalar,
+                                                                         const SkewSymmetricBase& other) {
+    return (scalar * other.vector()).asSkewSymmetric();
+  }
+
+  template <typename OtherDerived>
+  using SkewSymmetricSumReturnType = SkewSymmetricWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(
+      SkewSymmetricVectorType, typename OtherDerived::SkewSymmetricVectorType, sum)>;
+
+  /** \returns the sum of \c *this and the skew symmetric matrix \a other */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC inline SkewSymmetricSumReturnType<OtherDerived> operator+(
+      const SkewSymmetricBase<OtherDerived>& other) const {
+    return (vector() + other.vector()).asSkewSymmetric();
+  }
+
+  template <typename OtherDerived>
+  using SkewSymmetricDifferenceReturnType = SkewSymmetricWrapper<const EIGEN_CWISE_BINARY_RETURN_TYPE(
+      SkewSymmetricVectorType, typename OtherDerived::SkewSymmetricVectorType, difference)>;
+
+  /** \returns the difference of \c *this and the skew symmetric matrix \a other */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC inline SkewSymmetricDifferenceReturnType<OtherDerived> operator-(
+      const SkewSymmetricBase<OtherDerived>& other) const {
+    return (vector() - other.vector()).asSkewSymmetric();
+  }
+};
+
+/** \class SkewSymmetricMatrix3
+ * \ingroup Core_Module
+ *
+ * \brief Represents a 3x3 skew symmetric matrix with its storage
+ *
+ * \tparam Scalar_ the type of coefficients
+ *
+ * \sa class SkewSymmetricBase, class SkewSymmetricWrapper
+ */
+
+namespace internal {
+template <typename Scalar_>
+struct traits<SkewSymmetricMatrix3<Scalar_>> : traits<Matrix<Scalar_, 3, 3, 0, 3, 3>> {
+  typedef Matrix<Scalar_, 3, 1, 0, 3, 1> SkewSymmetricVectorType;
+  typedef SkewSymmetricShape StorageKind;
+  enum { Flags = LvalueBit | NoPreferredStorageOrderBit | NestByRefBit };
+};
+}  // namespace internal
+template <typename Scalar_>
+class SkewSymmetricMatrix3 : public SkewSymmetricBase<SkewSymmetricMatrix3<Scalar_>> {
+ public:
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  typedef typename internal::traits<SkewSymmetricMatrix3>::SkewSymmetricVectorType SkewSymmetricVectorType;
+  typedef const SkewSymmetricMatrix3& Nested;
+  typedef Scalar_ Scalar;
+  typedef typename internal::traits<SkewSymmetricMatrix3>::StorageKind StorageKind;
+  typedef typename internal::traits<SkewSymmetricMatrix3>::StorageIndex StorageIndex;
+#endif
+
+ protected:
+  SkewSymmetricVectorType m_vector;
+
+ public:
+  /** const version of vector(). */
+  EIGEN_DEVICE_FUNC inline const SkewSymmetricVectorType& vector() const { return m_vector; }
+  /** \returns a reference to the stored vector of coefficients. */
+  EIGEN_DEVICE_FUNC inline SkewSymmetricVectorType& vector() { return m_vector; }
+
+  /** Default constructor without initialization */
+  EIGEN_DEVICE_FUNC inline SkewSymmetricMatrix3() {}
+
+  /** Constructor from three scalars */
+  EIGEN_DEVICE_FUNC inline SkewSymmetricMatrix3(const Scalar& x, const Scalar& y, const Scalar& z)
+      : m_vector(x, y, z) {}
+
+  /** \brief Constructs a SkewSymmetricMatrix3 from an r-value vector type */
+  EIGEN_DEVICE_FUNC explicit inline SkewSymmetricMatrix3(SkewSymmetricVectorType&& vec) : m_vector(std::move(vec)) {}
+
+  /** generic constructor from expression of the coefficients */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC explicit inline SkewSymmetricMatrix3(const MatrixBase<OtherDerived>& other) : m_vector(other) {}
+
+  /** Copy constructor. */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC inline SkewSymmetricMatrix3(const SkewSymmetricBase<OtherDerived>& other)
+      : m_vector(other.vector()) {}
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  /** copy constructor. prevent a default copy constructor from hiding the other templated constructor */
+  inline SkewSymmetricMatrix3(const SkewSymmetricMatrix3& other) : m_vector(other.vector()) {}
+#endif
+
+  /** Copy operator. */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC SkewSymmetricMatrix3& operator=(const SkewSymmetricBase<OtherDerived>& other) {
+    m_vector = other.vector();
+    return *this;
+  }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  /** This is a special case of the templated operator=. Its purpose is to
+   * prevent a default operator= from hiding the templated operator=.
+   */
+  EIGEN_DEVICE_FUNC SkewSymmetricMatrix3& operator=(const SkewSymmetricMatrix3& other) {
+    m_vector = other.vector();
+    return *this;
+  }
+#endif
+
+  typedef SkewSymmetricWrapper<const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, SkewSymmetricVectorType>>
+      InitializeReturnType;
+
+  /** Initializes a skew symmetric matrix with coefficients set to zero */
+  EIGEN_DEVICE_FUNC static InitializeReturnType Zero() { return SkewSymmetricVectorType::Zero().asSkewSymmetric(); }
+
+  /** Sets all coefficients to zero. */
+  EIGEN_DEVICE_FUNC inline void setZero() { m_vector.setZero(); }
+};
+
+/** \class SkewSymmetricWrapper
+ * \ingroup Core_Module
+ *
+ * \brief Expression of a skew symmetric matrix
+ *
+ * \tparam SkewSymmetricVectorType_ the type of the vector of coefficients
+ *
+ * This class is an expression of a skew symmetric matrix, but not storing its own vector of coefficients,
+ * instead wrapping an existing vector expression. It is the return type of MatrixBase::asSkewSymmetric()
+ * and most of the time this is the only way that it is used.
+ *
+ * \sa class SkewSymmetricMatrix3, class SkewSymmetricBase, MatrixBase::asSkewSymmetric()
+ */
+
+namespace internal {
+template <typename SkewSymmetricVectorType_>
+struct traits<SkewSymmetricWrapper<SkewSymmetricVectorType_>> {
+  typedef SkewSymmetricVectorType_ SkewSymmetricVectorType;
+  typedef typename SkewSymmetricVectorType::Scalar Scalar;
+  typedef typename SkewSymmetricVectorType::StorageIndex StorageIndex;
+  typedef SkewSymmetricShape StorageKind;
+  typedef typename traits<SkewSymmetricVectorType>::XprKind XprKind;
+  enum {
+    RowsAtCompileTime = SkewSymmetricVectorType::SizeAtCompileTime,
+    ColsAtCompileTime = SkewSymmetricVectorType::SizeAtCompileTime,
+    MaxRowsAtCompileTime = SkewSymmetricVectorType::MaxSizeAtCompileTime,
+    MaxColsAtCompileTime = SkewSymmetricVectorType::MaxSizeAtCompileTime,
+    Flags = (traits<SkewSymmetricVectorType>::Flags & LvalueBit) | NoPreferredStorageOrderBit
+  };
+};
+}  // namespace internal
+
+template <typename SkewSymmetricVectorType_>
+class SkewSymmetricWrapper : public SkewSymmetricBase<SkewSymmetricWrapper<SkewSymmetricVectorType_>>,
+                             internal::no_assignment_operator {
+ public:
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  typedef SkewSymmetricVectorType_ SkewSymmetricVectorType;
+  typedef SkewSymmetricWrapper Nested;
+#endif
+
+  /** Constructor from expression of coefficients to wrap. */
+  EIGEN_DEVICE_FUNC explicit inline SkewSymmetricWrapper(SkewSymmetricVectorType& a_vector) : m_vector(a_vector) {}
+
+  /** \returns a const reference to the wrapped expression of coefficients. */
+  EIGEN_DEVICE_FUNC const SkewSymmetricVectorType& vector() const { return m_vector; }
+
+ protected:
+  typename SkewSymmetricVectorType::Nested m_vector;
+};
+
+/** \returns a pseudo-expression of a skew symmetric matrix with *this as vector of coefficients
+ *
+ * \only_for_vectors
+ *
+ * \sa class SkewSymmetricWrapper, class SkewSymmetricMatrix3, vector(), isSkewSymmetric()
+ **/
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline const SkewSymmetricWrapper<const Derived> MatrixBase<Derived>::asSkewSymmetric() const {
+  return SkewSymmetricWrapper<const Derived>(derived());
+}
+
+/** \returns true if *this is approximately equal to a skew symmetric matrix,
+ *          within the precision given by \a prec.
+ */
+template <typename Derived>
+bool MatrixBase<Derived>::isSkewSymmetric(const RealScalar& prec) const {
+  if (cols() != rows()) return false;
+  return (this->transpose() + *this).isZero(prec);
+}
+
+/** \returns the matrix product of \c *this by the skew symmetric matrix \a skew.
+ */
+template <typename Derived>
+template <typename SkewDerived>
+EIGEN_DEVICE_FUNC inline const Product<Derived, SkewDerived, LazyProduct> MatrixBase<Derived>::operator*(
+    const SkewSymmetricBase<SkewDerived>& skew) const {
+  return Product<Derived, SkewDerived, LazyProduct>(derived(), skew.derived());
+}
+
+namespace internal {
+
+template <>
+struct storage_kind_to_shape<SkewSymmetricShape> {
+  typedef SkewSymmetricShape Shape;
+};
+
+struct SkewSymmetric2Dense {};
+
+template <>
+struct AssignmentKind<DenseShape, SkewSymmetricShape> {
+  typedef SkewSymmetric2Dense Kind;
+};
+
+// SkewSymmetric matrix to Dense assignment
+template <typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, SkewSymmetric2Dense> {
+  EIGEN_DEVICE_FUNC static void run(
+      DstXprType& dst, const SrcXprType& src,
+      const internal::assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>& /*func*/) {
+    if ((dst.rows() != 3) || (dst.cols() != 3)) {
+      dst.resize(3, 3);
+    }
+    dst.diagonal().setZero();
+    const typename SrcXprType::SkewSymmetricVectorType v = src.vector();
+    dst(0, 1) = -v(2);
+    dst(1, 0) = v(2);
+    dst(0, 2) = v(1);
+    dst(2, 0) = -v(1);
+    dst(1, 2) = -v(0);
+    dst(2, 1) = v(0);
+  }
+  EIGEN_DEVICE_FUNC static void run(
+      DstXprType& dst, const SrcXprType& src,
+      const internal::add_assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>& /*func*/) {
+    dst.vector() += src.vector();
+  }
+
+  EIGEN_DEVICE_FUNC static void run(
+      DstXprType& dst, const SrcXprType& src,
+      const internal::sub_assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>& /*func*/) {
+    dst.vector() -= src.vector();
+  }
+};
+
+}  // namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_SKEWSYMMETRICMATRIX3_H
diff --git a/inst/include/Eigen/src/Core/Solve.h b/inst/include/Eigen/src/Core/Solve.h
new file mode 100644
index 00000000..aa514100
--- /dev/null
+++ b/inst/include/Eigen/src/Core/Solve.h
@@ -0,0 +1,174 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SOLVE_H
+#define EIGEN_SOLVE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+template <typename Decomposition, typename RhsType, typename StorageKind>
+class SolveImpl;
+
+/** \class Solve
+ * \ingroup Core_Module
+ *
+ * \brief Pseudo expression representing a solving operation
+ *
+ * \tparam Decomposition the type of the matrix or decomposition object
+ * \tparam Rhstype the type of the right-hand side
+ *
+ * This class represents an expression of A.solve(B)
+ * and most of the time this is the only way it is used.
+ *
+ */
+namespace internal {
+
+// this solve_traits class permits to determine the evaluation type with respect to storage kind (Dense vs Sparse)
+template <typename Decomposition, typename RhsType, typename StorageKind>
+struct solve_traits;
+
+template <typename Decomposition, typename RhsType>
+struct solve_traits<Decomposition, RhsType, Dense> {
+  typedef typename make_proper_matrix_type<typename RhsType::Scalar, Decomposition::ColsAtCompileTime,
+                                           RhsType::ColsAtCompileTime, RhsType::PlainObject::Options,
+                                           Decomposition::MaxColsAtCompileTime, RhsType::MaxColsAtCompileTime>::type
+      PlainObject;
+};
+
+template <typename Decomposition, typename RhsType>
+struct traits<Solve<Decomposition, RhsType> >
+    : traits<
+          typename solve_traits<Decomposition, RhsType, typename internal::traits<RhsType>::StorageKind>::PlainObject> {
+  typedef typename solve_traits<Decomposition, RhsType, typename internal::traits<RhsType>::StorageKind>::PlainObject
+      PlainObject;
+  typedef typename promote_index_type<typename Decomposition::StorageIndex, typename RhsType::StorageIndex>::type
+      StorageIndex;
+  typedef traits<PlainObject> BaseTraits;
+  enum { Flags = BaseTraits::Flags & RowMajorBit, CoeffReadCost = HugeCost };
+};
+
+}  // namespace internal
+
+template <typename Decomposition, typename RhsType>
+class Solve : public SolveImpl<Decomposition, RhsType, typename internal::traits<RhsType>::StorageKind> {
+ public:
+  typedef typename internal::traits<Solve>::PlainObject PlainObject;
+  typedef typename internal::traits<Solve>::StorageIndex StorageIndex;
+
+  Solve(const Decomposition &dec, const RhsType &rhs) : m_dec(dec), m_rhs(rhs) {}
+
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_dec.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); }
+
+  EIGEN_DEVICE_FUNC const Decomposition &dec() const { return m_dec; }
+  EIGEN_DEVICE_FUNC const RhsType &rhs() const { return m_rhs; }
+
+ protected:
+  const Decomposition &m_dec;
+  const typename internal::ref_selector<RhsType>::type m_rhs;
+};
+
+// Specialization of the Solve expression for dense results
+template <typename Decomposition, typename RhsType>
+class SolveImpl<Decomposition, RhsType, Dense> : public MatrixBase<Solve<Decomposition, RhsType> > {
+  typedef Solve<Decomposition, RhsType> Derived;
+
+ public:
+  typedef MatrixBase<Solve<Decomposition, RhsType> > Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(Derived)
+
+ private:
+  Scalar coeff(Index row, Index col) const;
+  Scalar coeff(Index i) const;
+};
+
+// Generic API dispatcher
+template <typename Decomposition, typename RhsType, typename StorageKind>
+class SolveImpl : public internal::generic_xpr_base<Solve<Decomposition, RhsType>, MatrixXpr, StorageKind>::type {
+ public:
+  typedef typename internal::generic_xpr_base<Solve<Decomposition, RhsType>, MatrixXpr, StorageKind>::type Base;
+};
+
+namespace internal {
+
+// Evaluator of Solve -> eval into a temporary
+template <typename Decomposition, typename RhsType>
+struct evaluator<Solve<Decomposition, RhsType> >
+    : public evaluator<typename Solve<Decomposition, RhsType>::PlainObject> {
+  typedef Solve<Decomposition, RhsType> SolveType;
+  typedef typename SolveType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  enum { Flags = Base::Flags | EvalBeforeNestingBit };
+
+  EIGEN_DEVICE_FUNC explicit evaluator(const SolveType &solve) : m_result(solve.rows(), solve.cols()) {
+    internal::construct_at<Base>(this, m_result);
+    solve.dec()._solve_impl(solve.rhs(), m_result);
+  }
+
+ protected:
+  PlainObject m_result;
+};
+
+// Specialization for "dst = dec.solve(rhs)"
+// NOTE we need to specialize it for Dense2Dense to avoid ambiguous specialization error and a Sparse2Sparse
+// specialization must exist somewhere
+template <typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<DecType, RhsType>, internal::assign_op<Scalar, Scalar>, Dense2Dense> {
+  typedef Solve<DecType, RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar, Scalar> &) {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if ((dst.rows() != dstRows) || (dst.cols() != dstCols)) dst.resize(dstRows, dstCols);
+
+    src.dec()._solve_impl(src.rhs(), dst);
+  }
+};
+
+// Specialization for "dst = dec.transpose().solve(rhs)"
+template <typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<Transpose<const DecType>, RhsType>, internal::assign_op<Scalar, Scalar>,
+                  Dense2Dense> {
+  typedef Solve<Transpose<const DecType>, RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar, Scalar> &) {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if ((dst.rows() != dstRows) || (dst.cols() != dstCols)) dst.resize(dstRows, dstCols);
+
+    src.dec().nestedExpression().template _solve_impl_transposed<false>(src.rhs(), dst);
+  }
+};
+
+// Specialization for "dst = dec.adjoint().solve(rhs)"
+template <typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<
+    DstXprType,
+    Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,
+          RhsType>,
+    internal::assign_op<Scalar, Scalar>, Dense2Dense> {
+  typedef Solve<CwiseUnaryOp<internal::scalar_conjugate_op<typename DecType::Scalar>, const Transpose<const DecType> >,
+                RhsType>
+      SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar, Scalar> &) {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if ((dst.rows() != dstRows) || (dst.cols() != dstCols)) dst.resize(dstRows, dstCols);
+
+    src.dec().nestedExpression().nestedExpression().template _solve_impl_transposed<true>(src.rhs(), dst);
+  }
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_SOLVE_H
diff --git a/inst/include/Eigen/src/Core/SolveTriangular.h b/inst/include/Eigen/src/Core/SolveTriangular.h
index ef17f288..9d318742 100644
--- a/inst/include/Eigen/src/Core/SolveTriangular.h
+++ b/inst/include/Eigen/src/Core/SolveTriangular.h
@@ -10,251 +10,228 @@
 #ifndef EIGEN_SOLVETRIANGULAR_H
 #define EIGEN_SOLVETRIANGULAR_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 // Forward declarations:
 // The following two routines are implemented in the products/TriangularSolver*.h files
-template<typename LhsScalar, typename RhsScalar, typename Index, int Side, int Mode, bool Conjugate, int StorageOrder>
+template <typename LhsScalar, typename RhsScalar, typename Index, int Side, int Mode, bool Conjugate, int StorageOrder>
 struct triangular_solve_vector;
 
-template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder, int OtherStorageOrder>
+template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder,
+          int OtherStorageOrder, int OtherInnerStride>
 struct triangular_solve_matrix;
 
 // small helper struct extracting some traits on the underlying solver operation
-template<typename Lhs, typename Rhs, int Side>
-class trsolve_traits
-{
-  private:
-    enum {
-      RhsIsVectorAtCompileTime = (Side==OnTheLeft ? Rhs::ColsAtCompileTime : Rhs::RowsAtCompileTime)==1
-    };
-  public:
-    enum {
-      Unrolling   = (RhsIsVectorAtCompileTime && Rhs::SizeAtCompileTime != Dynamic && Rhs::SizeAtCompileTime <= 8)
-                  ? CompleteUnrolling : NoUnrolling,
-      RhsVectors  = RhsIsVectorAtCompileTime ? 1 : Dynamic
-    };
+template <typename Lhs, typename Rhs, int Side>
+class trsolve_traits {
+ private:
+  enum { RhsIsVectorAtCompileTime = (Side == OnTheLeft ? Rhs::ColsAtCompileTime : Rhs::RowsAtCompileTime) == 1 };
+
+ public:
+  enum {
+    Unrolling = (RhsIsVectorAtCompileTime && Rhs::SizeAtCompileTime != Dynamic && Rhs::SizeAtCompileTime <= 8)
+                    ? CompleteUnrolling
+                    : NoUnrolling,
+    RhsVectors = RhsIsVectorAtCompileTime ? 1 : Dynamic
+  };
 };
 
-template<typename Lhs, typename Rhs,
-  int Side, // can be OnTheLeft/OnTheRight
-  int Mode, // can be Upper/Lower | UnitDiag
-  int Unrolling = trsolve_traits<Lhs,Rhs,Side>::Unrolling,
-  int RhsVectors = trsolve_traits<Lhs,Rhs,Side>::RhsVectors
-  >
+template <typename Lhs, typename Rhs,
+          int Side,  // can be OnTheLeft/OnTheRight
+          int Mode,  // can be Upper/Lower | UnitDiag
+          int Unrolling = trsolve_traits<Lhs, Rhs, Side>::Unrolling,
+          int RhsVectors = trsolve_traits<Lhs, Rhs, Side>::RhsVectors>
 struct triangular_solver_selector;
 
-template<typename Lhs, typename Rhs, int Side, int Mode>
-struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,1>
-{
+template <typename Lhs, typename Rhs, int Side, int Mode>
+struct triangular_solver_selector<Lhs, Rhs, Side, Mode, NoUnrolling, 1> {
   typedef typename Lhs::Scalar LhsScalar;
   typedef typename Rhs::Scalar RhsScalar;
   typedef blas_traits<Lhs> LhsProductTraits;
   typedef typename LhsProductTraits::ExtractType ActualLhsType;
-  typedef Map<Matrix<RhsScalar,Dynamic,1>, Aligned> MappedRhs;
-  static void run(const Lhs& lhs, Rhs& rhs)
-  {
+  typedef Map<Matrix<RhsScalar, Dynamic, 1>, Aligned> MappedRhs;
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) {
     ActualLhsType actualLhs = LhsProductTraits::extract(lhs);
 
     // FIXME find a way to allow an inner stride if packet_traits<Scalar>::size==1
 
-    bool useRhsDirectly = Rhs::InnerStrideAtCompileTime==1 || rhs.innerStride()==1;
+    bool useRhsDirectly = Rhs::InnerStrideAtCompileTime == 1 || rhs.innerStride() == 1;
 
-    ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhs,rhs.size(),
-                                                  (useRhsDirectly ? rhs.data() : 0));
-                                                  
-    if(!useRhsDirectly)
-      MappedRhs(actualRhs,rhs.size()) = rhs;
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, actualRhs, rhs.size(), (useRhsDirectly ? rhs.data() : 0));
 
-    triangular_solve_vector<LhsScalar, RhsScalar, typename Lhs::Index, Side, Mode, LhsProductTraits::NeedToConjugate,
-                            (int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor>
-      ::run(actualLhs.cols(), actualLhs.data(), actualLhs.outerStride(), actualRhs);
+    if (!useRhsDirectly) MappedRhs(actualRhs, rhs.size()) = rhs;
 
-    if(!useRhsDirectly)
-      rhs = MappedRhs(actualRhs, rhs.size());
+    triangular_solve_vector<LhsScalar, RhsScalar, Index, Side, Mode, LhsProductTraits::NeedToConjugate,
+                            (int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor>::run(actualLhs.cols(),
+                                                                                        actualLhs.data(),
+                                                                                        actualLhs.outerStride(),
+                                                                                        actualRhs);
+
+    if (!useRhsDirectly) rhs = MappedRhs(actualRhs, rhs.size());
   }
 };
 
 // the rhs is a matrix
-template<typename Lhs, typename Rhs, int Side, int Mode>
-struct triangular_solver_selector<Lhs,Rhs,Side,Mode,NoUnrolling,Dynamic>
-{
+template <typename Lhs, typename Rhs, int Side, int Mode>
+struct triangular_solver_selector<Lhs, Rhs, Side, Mode, NoUnrolling, Dynamic> {
   typedef typename Rhs::Scalar Scalar;
-  typedef typename Rhs::Index Index;
   typedef blas_traits<Lhs> LhsProductTraits;
   typedef typename LhsProductTraits::DirectLinearAccessType ActualLhsType;
 
-  static void run(const Lhs& lhs, Rhs& rhs)
-  {
-    typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsProductTraits::extract(lhs);
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) {
+    add_const_on_value_type_t<ActualLhsType> actualLhs = LhsProductTraits::extract(lhs);
 
     const Index size = lhs.rows();
-    const Index othersize = Side==OnTheLeft? rhs.cols() : rhs.rows();
-
-    typedef internal::gemm_blocking_space<(Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
-              Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxRowsAtCompileTime,4> BlockingType;
-
-    BlockingType blocking(rhs.rows(), rhs.cols(), size);
-
-    triangular_solve_matrix<Scalar,Index,Side,Mode,LhsProductTraits::NeedToConjugate,(int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
-                               (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor>
-      ::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.outerStride(), blocking);
+    const Index othersize = Side == OnTheLeft ? rhs.cols() : rhs.rows();
+
+    typedef internal::gemm_blocking_space<(Rhs::Flags & RowMajorBit) ? RowMajor : ColMajor, Scalar, Scalar,
+                                          Rhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime,
+                                          Lhs::MaxRowsAtCompileTime, 4>
+        BlockingType;
+
+    // Nothing to solve.
+    if (actualLhs.size() == 0 || rhs.size() == 0) {
+      return;
+    }
+
+    BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false);
+
+    triangular_solve_matrix<Scalar, Index, Side, Mode, LhsProductTraits::NeedToConjugate,
+                            (int(Lhs::Flags) & RowMajorBit) ? RowMajor : ColMajor,
+                            (Rhs::Flags & RowMajorBit) ? RowMajor : ColMajor,
+                            Rhs::InnerStrideAtCompileTime>::run(size, othersize, &actualLhs.coeffRef(0, 0),
+                                                                actualLhs.outerStride(), &rhs.coeffRef(0, 0),
+                                                                rhs.innerStride(), rhs.outerStride(), blocking);
   }
 };
 
 /***************************************************************************
-* meta-unrolling implementation
-***************************************************************************/
+ * meta-unrolling implementation
+ ***************************************************************************/
 
-template<typename Lhs, typename Rhs, int Mode, int Index, int Size,
-         bool Stop = Index==Size>
+template <typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size, bool Stop = LoopIndex == Size>
 struct triangular_solver_unroller;
 
-template<typename Lhs, typename Rhs, int Mode, int Index, int Size>
-struct triangular_solver_unroller<Lhs,Rhs,Mode,Index,Size,false> {
+template <typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size>
+struct triangular_solver_unroller<Lhs, Rhs, Mode, LoopIndex, Size, false> {
   enum {
-    IsLower = ((Mode&Lower)==Lower),
-    I = IsLower ? Index : Size - Index - 1,
-    S = IsLower ? 0     : I+1
+    IsLower = ((Mode & Lower) == Lower),
+    DiagIndex = IsLower ? LoopIndex : Size - LoopIndex - 1,
+    StartIndex = IsLower ? 0 : DiagIndex + 1
   };
-  static void run(const Lhs& lhs, Rhs& rhs)
-  {
-    if (Index>0)
-      rhs.coeffRef(I) -= lhs.row(I).template segment<Index>(S).transpose()
-                         .cwiseProduct(rhs.template segment<Index>(S)).sum();
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) {
+    if (LoopIndex > 0)
+      rhs.coeffRef(DiagIndex) -= lhs.row(DiagIndex)
+                                     .template segment<LoopIndex>(StartIndex)
+                                     .transpose()
+                                     .cwiseProduct(rhs.template segment<LoopIndex>(StartIndex))
+                                     .sum();
 
-    if(!(Mode & UnitDiag))
-      rhs.coeffRef(I) /= lhs.coeff(I,I);
+    if (!(Mode & UnitDiag)) rhs.coeffRef(DiagIndex) /= lhs.coeff(DiagIndex, DiagIndex);
 
-    triangular_solver_unroller<Lhs,Rhs,Mode,Index+1,Size>::run(lhs,rhs);
+    triangular_solver_unroller<Lhs, Rhs, Mode, LoopIndex + 1, Size>::run(lhs, rhs);
   }
 };
 
-template<typename Lhs, typename Rhs, int Mode, int Index, int Size>
-struct triangular_solver_unroller<Lhs,Rhs,Mode,Index,Size,true> {
-  static void run(const Lhs&, Rhs&) {}
+template <typename Lhs, typename Rhs, int Mode, int LoopIndex, int Size>
+struct triangular_solver_unroller<Lhs, Rhs, Mode, LoopIndex, Size, true> {
+  static EIGEN_DEVICE_FUNC void run(const Lhs&, Rhs&) {}
 };
 
-template<typename Lhs, typename Rhs, int Mode>
-struct triangular_solver_selector<Lhs,Rhs,OnTheLeft,Mode,CompleteUnrolling,1> {
-  static void run(const Lhs& lhs, Rhs& rhs)
-  { triangular_solver_unroller<Lhs,Rhs,Mode,0,Rhs::SizeAtCompileTime>::run(lhs,rhs); }
+template <typename Lhs, typename Rhs, int Mode>
+struct triangular_solver_selector<Lhs, Rhs, OnTheLeft, Mode, CompleteUnrolling, 1> {
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) {
+    triangular_solver_unroller<Lhs, Rhs, Mode, 0, Rhs::SizeAtCompileTime>::run(lhs, rhs);
+  }
 };
 
-template<typename Lhs, typename Rhs, int Mode>
-struct triangular_solver_selector<Lhs,Rhs,OnTheRight,Mode,CompleteUnrolling,1> {
-  static void run(const Lhs& lhs, Rhs& rhs)
-  {
+template <typename Lhs, typename Rhs, int Mode>
+struct triangular_solver_selector<Lhs, Rhs, OnTheRight, Mode, CompleteUnrolling, 1> {
+  static EIGEN_DEVICE_FUNC void run(const Lhs& lhs, Rhs& rhs) {
     Transpose<const Lhs> trLhs(lhs);
     Transpose<Rhs> trRhs(rhs);
-    
-    triangular_solver_unroller<Transpose<const Lhs>,Transpose<Rhs>,
-                              ((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag),
-                              0,Rhs::SizeAtCompileTime>::run(trLhs,trRhs);
+
+    triangular_solver_unroller<Transpose<const Lhs>, Transpose<Rhs>,
+                               ((Mode & Upper) == Upper ? Lower : Upper) | (Mode & UnitDiag), 0,
+                               Rhs::SizeAtCompileTime>::run(trLhs, trRhs);
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
 /***************************************************************************
-* TriangularView methods
-***************************************************************************/
-
-/** "in-place" version of TriangularView::solve() where the result is written in \a other
-  *
-  * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
-  * This function will const_cast it, so constness isn't honored here.
-  *
-  * See TriangularView:solve() for the details.
-  */
-template<typename MatrixType, unsigned int Mode>
-template<int Side, typename OtherDerived>
-void TriangularView<MatrixType,Mode>::solveInPlace(const MatrixBase<OtherDerived>& _other) const
-{
+ * TriangularView methods
+ ***************************************************************************/
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template <typename MatrixType, unsigned int Mode>
+template <int Side, typename OtherDerived>
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::solveInPlace(
+    const MatrixBase<OtherDerived>& _other) const {
   OtherDerived& other = _other.const_cast_derived();
-  eigen_assert( cols() == rows() && ((Side==OnTheLeft && cols() == other.rows()) || (Side==OnTheRight && cols() == other.cols())) );
-  eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
+  eigen_assert(derived().cols() == derived().rows() && ((Side == OnTheLeft && derived().cols() == other.rows()) ||
+                                                        (Side == OnTheRight && derived().cols() == other.cols())));
+  eigen_assert((!(int(Mode) & int(ZeroDiag))) && bool(int(Mode) & (int(Upper) | int(Lower))));
+  // If solving for a 0x0 matrix, nothing to do, simply return.
+  if (derived().cols() == 0) return;
 
-  enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit  && OtherDerived::IsVectorAtCompileTime };
-  typedef typename internal::conditional<copy,
-    typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::type OtherCopy;
+  enum {
+    copy = (internal::traits<OtherDerived>::Flags & RowMajorBit) && OtherDerived::IsVectorAtCompileTime &&
+           OtherDerived::SizeAtCompileTime != 1
+  };
+  typedef std::conditional_t<copy, typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>
+      OtherCopy;
   OtherCopy otherCopy(other);
 
-  internal::triangular_solver_selector<MatrixType, typename internal::remove_reference<OtherCopy>::type,
-    Side, Mode>::run(nestedExpression(), otherCopy);
+  internal::triangular_solver_selector<MatrixType, std::remove_reference_t<OtherCopy>, Side, Mode>::run(
+      derived().nestedExpression(), otherCopy);
 
-  if (copy)
-    other = otherCopy;
+  if (copy) other = otherCopy;
 }
 
-/** \returns the product of the inverse of \c *this with \a other, \a *this being triangular.
-  *
-  * This function computes the inverse-matrix matrix product inverse(\c *this) * \a other if
-  * \a Side==OnTheLeft (the default), or the right-inverse-multiply  \a other * inverse(\c *this) if
-  * \a Side==OnTheRight.
-  *
-  * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
-  * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
-  * is an upper (resp. lower) triangular matrix.
-  *
-  * Example: \include MatrixBase_marked.cpp
-  * Output: \verbinclude MatrixBase_marked.out
-  *
-  * This function returns an expression of the inverse-multiply and can works in-place if it is assigned
-  * to the same matrix or vector \a other.
-  *
-  * For users coming from BLAS, this function (and more specifically solveInPlace()) offer
-  * all the operations supported by the \c *TRSV and \c *TRSM BLAS routines.
-  *
-  * \sa TriangularView::solveInPlace()
-  */
-template<typename Derived, unsigned int Mode>
-template<int Side, typename Other>
-const internal::triangular_solve_retval<Side,TriangularView<Derived,Mode>,Other>
-TriangularView<Derived,Mode>::solve(const MatrixBase<Other>& other) const
-{
-  return internal::triangular_solve_retval<Side,TriangularView,Other>(*this, other.derived());
+template <typename Derived, unsigned int Mode>
+template <int Side, typename Other>
+const internal::triangular_solve_retval<Side, TriangularView<Derived, Mode>, Other>
+TriangularViewImpl<Derived, Mode, Dense>::solve(const MatrixBase<Other>& other) const {
+  return internal::triangular_solve_retval<Side, TriangularViewType, Other>(derived(), other.derived());
 }
+#endif
 
 namespace internal {
 
-
-template<int Side, typename TriangularType, typename Rhs>
-struct traits<triangular_solve_retval<Side, TriangularType, Rhs> >
-{
+template <int Side, typename TriangularType, typename Rhs>
+struct traits<triangular_solve_retval<Side, TriangularType, Rhs> > {
   typedef typename internal::plain_matrix_type_column_major<Rhs>::type ReturnType;
 };
 
-template<int Side, typename TriangularType, typename Rhs> struct triangular_solve_retval
- : public ReturnByValue<triangular_solve_retval<Side, TriangularType, Rhs> >
-{
-  typedef typename remove_all<typename Rhs::Nested>::type RhsNestedCleaned;
+template <int Side, typename TriangularType, typename Rhs>
+struct triangular_solve_retval : public ReturnByValue<triangular_solve_retval<Side, TriangularType, Rhs> > {
+  typedef remove_all_t<typename Rhs::Nested> RhsNestedCleaned;
   typedef ReturnByValue<triangular_solve_retval> Base;
-  typedef typename Base::Index Index;
 
-  triangular_solve_retval(const TriangularType& tri, const Rhs& rhs)
-    : m_triangularMatrix(tri), m_rhs(rhs)
-  {}
+  triangular_solve_retval(const TriangularType& tri, const Rhs& rhs) : m_triangularMatrix(tri), m_rhs(rhs) {}
 
-  inline Index rows() const { return m_rhs.rows(); }
-  inline Index cols() const { return m_rhs.cols(); }
+  constexpr Index rows() const noexcept { return m_rhs.rows(); }
+  constexpr Index cols() const noexcept { return m_rhs.cols(); }
 
-  template<typename Dest> inline void evalTo(Dest& dst) const
-  {
-    if(!(is_same<RhsNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_rhs)))
-      dst = m_rhs;
+  template <typename Dest>
+  inline void evalTo(Dest& dst) const {
+    if (!is_same_dense(dst, m_rhs)) dst = m_rhs;
     m_triangularMatrix.template solveInPlace<Side>(dst);
   }
 
-  protected:
-    const TriangularType& m_triangularMatrix;
-    typename Rhs::Nested m_rhs;
+ protected:
+  const TriangularType& m_triangularMatrix;
+  typename Rhs::Nested m_rhs;
 };
 
-} // namespace internal
+}  // namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SOLVETRIANGULAR_H
+#endif  // EIGEN_SOLVETRIANGULAR_H
diff --git a/inst/include/Eigen/src/Core/SolverBase.h b/inst/include/Eigen/src/Core/SolverBase.h
new file mode 100644
index 00000000..5a6dfd42
--- /dev/null
+++ b/inst/include/Eigen/src/Core/SolverBase.h
@@ -0,0 +1,167 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SOLVERBASE_H
+#define EIGEN_SOLVERBASE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Derived>
+struct solve_assertion {
+  template <bool Transpose_, typename Rhs>
+  static void run(const Derived& solver, const Rhs& b) {
+    solver.template _check_solve_assertion<Transpose_>(b);
+  }
+};
+
+template <typename Derived>
+struct solve_assertion<Transpose<Derived>> {
+  typedef Transpose<Derived> type;
+
+  template <bool Transpose_, typename Rhs>
+  static void run(const type& transpose, const Rhs& b) {
+    internal::solve_assertion<internal::remove_all_t<Derived>>::template run<true>(transpose.nestedExpression(), b);
+  }
+};
+
+template <typename Scalar, typename Derived>
+struct solve_assertion<CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived>>> {
+  typedef CwiseUnaryOp<Eigen::internal::scalar_conjugate_op<Scalar>, const Transpose<Derived>> type;
+
+  template <bool Transpose_, typename Rhs>
+  static void run(const type& adjoint, const Rhs& b) {
+    internal::solve_assertion<internal::remove_all_t<Transpose<Derived>>>::template run<true>(
+        adjoint.nestedExpression(), b);
+  }
+};
+}  // end namespace internal
+
+/** \class SolverBase
+ * \brief A base class for matrix decomposition and solvers
+ *
+ * \tparam Derived the actual type of the decomposition/solver.
+ *
+ * Any matrix decomposition inheriting this base class provide the following API:
+ *
+ * \code
+ * MatrixType A, b, x;
+ * DecompositionType dec(A);
+ * x = dec.solve(b);             // solve A   * x = b
+ * x = dec.transpose().solve(b); // solve A^T * x = b
+ * x = dec.adjoint().solve(b);   // solve A'  * x = b
+ * \endcode
+ *
+ * \warning Currently, any other usage of transpose() and adjoint() are not supported and will produce compilation
+ * errors.
+ *
+ * \sa class PartialPivLU, class FullPivLU, class HouseholderQR, class ColPivHouseholderQR, class FullPivHouseholderQR,
+ * class CompleteOrthogonalDecomposition, class LLT, class LDLT, class SVDBase
+ */
+template <typename Derived>
+class SolverBase : public EigenBase<Derived> {
+ public:
+  typedef EigenBase<Derived> Base;
+  typedef typename internal::traits<Derived>::Scalar Scalar;
+  typedef Scalar CoeffReturnType;
+
+  template <typename Derived_>
+  friend struct internal::solve_assertion;
+
+  ComputationInfo info() const {
+    // CRTP static dispatch: Calls the 'info()' method on the derived class.
+    // Derived must implement 'ComputationInfo info() const'.
+    // If not implemented, name lookup falls back to this base method, causing
+    // infinite recursion (detectable by -Winfinite-recursion).
+    return derived().info();
+  }
+
+  enum {
+    RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+    ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+    SizeAtCompileTime = (internal::size_of_xpr_at_compile_time<Derived>::ret),
+    MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
+    MaxSizeAtCompileTime = internal::size_at_compile_time(internal::traits<Derived>::MaxRowsAtCompileTime,
+                                                          internal::traits<Derived>::MaxColsAtCompileTime),
+    IsVectorAtCompileTime =
+        internal::traits<Derived>::MaxRowsAtCompileTime == 1 || internal::traits<Derived>::MaxColsAtCompileTime == 1,
+    NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0
+                    : bool(IsVectorAtCompileTime)  ? 1
+                                                   : 2
+  };
+
+  /** Default constructor */
+  SolverBase() {}
+
+  ~SolverBase() {}
+
+  using Base::derived;
+
+  /** \returns an expression of the solution x of \f$ A x = b \f$ using the current decomposition of A.
+   */
+  template <typename Rhs>
+  inline const Solve<Derived, Rhs> solve(const MatrixBase<Rhs>& b) const {
+    internal::solve_assertion<internal::remove_all_t<Derived>>::template run<false>(derived(), b);
+    return Solve<Derived, Rhs>(derived(), b.derived());
+  }
+
+  /** \internal the return type of transpose() */
+  typedef Transpose<const Derived> ConstTransposeReturnType;
+  /** \returns an expression of the transposed of the factored matrix.
+   *
+   * A typical usage is to solve for the transposed problem A^T x = b:
+   * \code x = dec.transpose().solve(b); \endcode
+   *
+   * \sa adjoint(), solve()
+   */
+  inline const ConstTransposeReturnType transpose() const { return ConstTransposeReturnType(derived()); }
+
+  /** \internal the return type of adjoint() */
+  typedef std::conditional_t<NumTraits<Scalar>::IsComplex,
+                             CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const ConstTransposeReturnType>,
+                             const ConstTransposeReturnType>
+      AdjointReturnType;
+  /** \returns an expression of the adjoint of the factored matrix
+   *
+   * A typical usage is to solve for the adjoint problem A' x = b:
+   * \code x = dec.adjoint().solve(b); \endcode
+   *
+   * For real scalar types, this function is equivalent to transpose().
+   *
+   * \sa transpose(), solve()
+   */
+  inline const AdjointReturnType adjoint() const { return AdjointReturnType(derived().transpose()); }
+
+ protected:
+  template <bool Transpose_, typename Rhs>
+  void _check_solve_assertion(const Rhs& b) const {
+    EIGEN_ONLY_USED_FOR_DEBUG(b);
+    eigen_assert(derived().m_isInitialized && "Solver is not initialized.");
+    eigen_assert((Transpose_ ? derived().cols() : derived().rows()) == b.rows() &&
+                 "SolverBase::solve(): invalid number of rows of the right hand side matrix b");
+  }
+};
+
+namespace internal {
+
+template <typename Derived>
+struct generic_xpr_base<Derived, MatrixXpr, SolverStorage> {
+  typedef SolverBase<Derived> type;
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_SOLVERBASE_H
diff --git a/inst/include/Eigen/src/Core/StableNorm.h b/inst/include/Eigen/src/Core/StableNorm.h
index 389d9427..711ee3fb 100644
--- a/inst/include/Eigen/src/Core/StableNorm.h
+++ b/inst/include/Eigen/src/Core/StableNorm.h
@@ -10,194 +10,208 @@
 #ifndef EIGEN_STABLENORM_H
 #define EIGEN_STABLENORM_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
-template<typename ExpressionType, typename Scalar>
-inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& scale, Scalar& invScale)
-{
-  using std::max;
+template <typename ExpressionType, typename Scalar>
+inline void stable_norm_kernel(const ExpressionType& bl, Scalar& ssq, Scalar& scale, Scalar& invScale) {
   Scalar maxCoeff = bl.cwiseAbs().maxCoeff();
-  
-  if (maxCoeff>scale)
-  {
-    ssq = ssq * numext::abs2(scale/maxCoeff);
-    Scalar tmp = Scalar(1)/maxCoeff;
-    if(tmp > NumTraits<Scalar>::highest())
-    {
+
+  if (maxCoeff > scale) {
+    ssq = ssq * numext::abs2(scale / maxCoeff);
+    Scalar tmp = Scalar(1) / maxCoeff;
+    if (tmp > NumTraits<Scalar>::highest()) {
       invScale = NumTraits<Scalar>::highest();
-      scale = Scalar(1)/invScale;
-    }
-    else
+      scale = Scalar(1) / invScale;
+    } else if (maxCoeff > NumTraits<Scalar>::highest())  // we got a INF
     {
+      invScale = Scalar(1);
+      scale = maxCoeff;
+    } else {
       scale = maxCoeff;
       invScale = tmp;
     }
+  } else if (maxCoeff != maxCoeff)  // we got a NaN
+  {
+    scale = maxCoeff;
   }
-  
+
   // TODO if the maxCoeff is much much smaller than the current scale,
   // then we can neglect this sub vector
-  if(scale>Scalar(0)) // if scale==0, then bl is 0 
-    ssq += (bl*invScale).squaredNorm();
+  if (scale > Scalar(0))  // if scale==0, then bl is 0
+    ssq += (bl * invScale).squaredNorm();
 }
 
-template<typename Derived>
-inline typename NumTraits<typename traits<Derived>::Scalar>::Real
-blueNorm_impl(const EigenBase<Derived>& _vec)
-{
-  typedef typename Derived::RealScalar RealScalar;  
-  typedef typename Derived::Index Index;
-  using std::pow;
-  using std::min;
-  using std::max;
+template <typename VectorType, typename RealScalar>
+void stable_norm_impl_inner_step(const VectorType& vec, RealScalar& ssq, RealScalar& scale, RealScalar& invScale) {
+  const Index blockSize = 4096;
+
+  Index n = vec.size();
+  Index blockEnd = numext::round_down(n, blockSize);
+  for (Index i = 0; i < blockEnd; i += blockSize) {
+    internal::stable_norm_kernel(vec.template segment<blockSize>(i), ssq, scale, invScale);
+  }
+  if (n > blockEnd) {
+    internal::stable_norm_kernel(vec.tail(n - blockEnd), ssq, scale, invScale);
+  }
+}
+
+template <typename VectorType>
+typename VectorType::RealScalar stable_norm_impl(const VectorType& vec,
+                                                 std::enable_if_t<VectorType::IsVectorAtCompileTime>* = 0) {
+  using std::abs;
   using std::sqrt;
+
+  Index n = vec.size();
+  if (EIGEN_PREDICT_FALSE(n == 1)) return abs(vec.coeff(0));
+
+  typedef typename VectorType::RealScalar RealScalar;
+  RealScalar scale(0);
+  RealScalar invScale(1);
+  RealScalar ssq(0);  // sum of squares
+
+  stable_norm_impl_inner_step(vec, ssq, scale, invScale);
+
+  return scale * sqrt(ssq);
+}
+
+template <typename MatrixType>
+typename MatrixType::RealScalar stable_norm_impl(const MatrixType& mat,
+                                                 std::enable_if_t<!MatrixType::IsVectorAtCompileTime>* = 0) {
+  using std::sqrt;
+
+  typedef typename MatrixType::RealScalar RealScalar;
+  RealScalar scale(0);
+  RealScalar invScale(1);
+  RealScalar ssq(0);  // sum of squares
+
+  for (Index j = 0; j < mat.outerSize(); ++j) stable_norm_impl_inner_step(mat.innerVector(j), ssq, scale, invScale);
+  return scale * sqrt(ssq);
+}
+
+template <typename Derived>
+inline typename NumTraits<typename traits<Derived>::Scalar>::Real blueNorm_impl(const EigenBase<Derived>& _vec) {
+  typedef typename Derived::RealScalar RealScalar;
   using std::abs;
+  using std::pow;
+  using std::sqrt;
+
+  // This program calculates the machine-dependent constants
+  // bl, b2, slm, s2m, relerr overfl
+  // from the "basic" machine-dependent numbers
+  // nbig, ibeta, it, iemin, iemax, rbig.
+  // The following define the basic machine-dependent constants.
+  // For portability, the PORT subprograms "ilmaeh" and "rlmach"
+  // are used. For any specific computer, each of the assignment
+  // statements can be replaced
+  static const int ibeta = std::numeric_limits<RealScalar>::radix;  // base for floating-point numbers
+  static const int it = NumTraits<RealScalar>::digits();            // number of base-beta digits in mantissa
+  static const int iemin = NumTraits<RealScalar>::min_exponent();   // minimum exponent
+  static const int iemax = NumTraits<RealScalar>::max_exponent();   // maximum exponent
+  static const RealScalar rbig = NumTraits<RealScalar>::highest();  // largest floating-point number
+  static const RealScalar b1 =
+      RealScalar(pow(RealScalar(ibeta), RealScalar(-((1 - iemin) / 2))));  // lower boundary of midrange
+  static const RealScalar b2 =
+      RealScalar(pow(RealScalar(ibeta), RealScalar((iemax + 1 - it) / 2)));  // upper boundary of midrange
+  static const RealScalar s1m =
+      RealScalar(pow(RealScalar(ibeta), RealScalar((2 - iemin) / 2)));  // scaling factor for lower range
+  static const RealScalar s2m =
+      RealScalar(pow(RealScalar(ibeta), RealScalar(-((iemax + it) / 2))));  // scaling factor for upper range
+  static const RealScalar eps = RealScalar(pow(double(ibeta), 1 - it));
+  static const RealScalar relerr = sqrt(eps);  // tolerance for neglecting asml
+
   const Derived& vec(_vec.derived());
-  static bool initialized = false;
-  static RealScalar b1, b2, s1m, s2m, overfl, rbig, relerr;
-  if(!initialized)
-  {
-    int ibeta, it, iemin, iemax, iexp;
-    RealScalar eps;
-    // This program calculates the machine-dependent constants
-    // bl, b2, slm, s2m, relerr overfl
-    // from the "basic" machine-dependent numbers
-    // nbig, ibeta, it, iemin, iemax, rbig.
-    // The following define the basic machine-dependent constants.
-    // For portability, the PORT subprograms "ilmaeh" and "rlmach"
-    // are used. For any specific computer, each of the assignment
-    // statements can be replaced
-    ibeta = std::numeric_limits<RealScalar>::radix;                 // base for floating-point numbers
-    it    = std::numeric_limits<RealScalar>::digits;                // number of base-beta digits in mantissa
-    iemin = std::numeric_limits<RealScalar>::min_exponent;          // minimum exponent
-    iemax = std::numeric_limits<RealScalar>::max_exponent;          // maximum exponent
-    rbig  = (std::numeric_limits<RealScalar>::max)();               // largest floating-point number
-
-    iexp  = -((1-iemin)/2);
-    b1    = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // lower boundary of midrange
-    iexp  = (iemax + 1 - it)/2;
-    b2    = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // upper boundary of midrange
-
-    iexp  = (2-iemin)/2;
-    s1m   = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // scaling factor for lower range
-    iexp  = - ((iemax+it)/2);
-    s2m   = RealScalar(pow(RealScalar(ibeta),RealScalar(iexp)));    // scaling factor for upper range
-
-    overfl  = rbig*s2m;                                             // overflow boundary for abig
-    eps     = RealScalar(pow(double(ibeta), 1-it));
-    relerr  = sqrt(eps);                                            // tolerance for neglecting asml
-    initialized = true;
-  }
   Index n = vec.size();
   RealScalar ab2 = b2 / RealScalar(n);
   RealScalar asml = RealScalar(0);
   RealScalar amed = RealScalar(0);
   RealScalar abig = RealScalar(0);
-  for(typename Derived::InnerIterator it(vec, 0); it; ++it)
-  {
-    RealScalar ax = abs(it.value());
-    if(ax > ab2)     abig += numext::abs2(ax*s2m);
-    else if(ax < b1) asml += numext::abs2(ax*s1m);
-    else             amed += numext::abs2(ax);
+
+  for (Index j = 0; j < vec.outerSize(); ++j) {
+    for (typename Derived::InnerIterator iter(vec, j); iter; ++iter) {
+      RealScalar ax = abs(iter.value());
+      if (ax > ab2)
+        abig += numext::abs2(ax * s2m);
+      else if (ax < b1)
+        asml += numext::abs2(ax * s1m);
+      else
+        amed += numext::abs2(ax);
+    }
   }
-  if(abig > RealScalar(0))
-  {
+  if (amed != amed) return amed;  // we got a NaN
+  if (abig > RealScalar(0)) {
     abig = sqrt(abig);
-    if(abig > overfl)
-    {
-      return rbig;
-    }
-    if(amed > RealScalar(0))
-    {
-      abig = abig/s2m;
+    if (abig > rbig)  // overflow, or *this contains INF values
+      return abig;    // return INF
+    if (amed > RealScalar(0)) {
+      abig = abig / s2m;
       amed = sqrt(amed);
-    }
-    else
-      return abig/s2m;
-  }
-  else if(asml > RealScalar(0))
-  {
-    if (amed > RealScalar(0))
-    {
+    } else
+      return abig / s2m;
+  } else if (asml > RealScalar(0)) {
+    if (amed > RealScalar(0)) {
       abig = sqrt(amed);
       amed = sqrt(asml) / s1m;
-    }
-    else
-      return sqrt(asml)/s1m;
-  }
-  else
+    } else
+      return sqrt(asml) / s1m;
+  } else
     return sqrt(amed);
-  asml = (min)(abig, amed);
-  abig = (max)(abig, amed);
-  if(asml <= abig*relerr)
+  asml = numext::mini(abig, amed);
+  abig = numext::maxi(abig, amed);
+  if (asml <= abig * relerr)
     return abig;
   else
-    return abig * sqrt(RealScalar(1) + numext::abs2(asml/abig));
+    return abig * sqrt(RealScalar(1) + numext::abs2(asml / abig));
 }
 
-} // end namespace internal
+}  // end namespace internal
 
 /** \returns the \em l2 norm of \c *this avoiding underflow and overflow.
-  * This version use a blockwise two passes algorithm:
-  *  1 - find the absolute largest coefficient \c s
-  *  2 - compute \f$ s \Vert \frac{*this}{s} \Vert \f$ in a standard way
-  *
-  * For architecture/scalar types supporting vectorization, this version
-  * is faster than blueNorm(). Otherwise the blueNorm() is much faster.
-  *
-  * \sa norm(), blueNorm(), hypotNorm()
-  */
-template<typename Derived>
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
-MatrixBase<Derived>::stableNorm() const
-{
-  using std::min;
-  using std::sqrt;
-  const Index blockSize = 4096;
-  RealScalar scale(0);
-  RealScalar invScale(1);
-  RealScalar ssq(0); // sum of square
-  enum {
-    Alignment = (int(Flags)&DirectAccessBit) || (int(Flags)&AlignedBit) ? 1 : 0
-  };
-  Index n = size();
-  Index bi = internal::first_aligned(derived());
-  if (bi>0)
-    internal::stable_norm_kernel(this->head(bi), ssq, scale, invScale);
-  for (; bi<n; bi+=blockSize)
-    internal::stable_norm_kernel(this->segment(bi,(min)(blockSize, n - bi)).template forceAlignedAccessIf<Alignment>(), ssq, scale, invScale);
-  return scale * sqrt(ssq);
+ * This version use a blockwise two passes algorithm:
+ *  1 - find the absolute largest coefficient \c s
+ *  2 - compute \f$ s \Vert \frac{*this}{s} \Vert \f$ in a standard way
+ *
+ * For architecture/scalar types supporting vectorization, this version
+ * is faster than blueNorm(). Otherwise the blueNorm() is much faster.
+ *
+ * \sa norm(), blueNorm(), hypotNorm()
+ */
+template <typename Derived>
+inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::stableNorm() const {
+  return internal::stable_norm_impl(derived());
 }
 
 /** \returns the \em l2 norm of \c *this using the Blue's algorithm.
-  * A Portable Fortran Program to Find the Euclidean Norm of a Vector,
-  * ACM TOMS, Vol 4, Issue 1, 1978.
-  *
-  * For architecture/scalar types without vectorization, this version
-  * is much faster than stableNorm(). Otherwise the stableNorm() is faster.
-  *
-  * \sa norm(), stableNorm(), hypotNorm()
-  */
-template<typename Derived>
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
-MatrixBase<Derived>::blueNorm() const
-{
+ * A Portable Fortran Program to Find the Euclidean Norm of a Vector,
+ * ACM TOMS, Vol 4, Issue 1, 1978.
+ *
+ * For architecture/scalar types without vectorization, this version
+ * is much faster than stableNorm(). Otherwise the stableNorm() is faster.
+ *
+ * \sa norm(), stableNorm(), hypotNorm()
+ */
+template <typename Derived>
+inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::blueNorm() const {
   return internal::blueNorm_impl(*this);
 }
 
-/** \returns the \em l2 norm of \c *this avoiding undeflow and overflow.
-  * This version use a concatenation of hypot() calls, and it is very slow.
-  *
-  * \sa norm(), stableNorm()
-  */
-template<typename Derived>
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
-MatrixBase<Derived>::hypotNorm() const
-{
-  return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
+/** \returns the \em l2 norm of \c *this avoiding underflow and overflow.
+ * This version use a concatenation of hypot() calls, and it is very slow.
+ *
+ * \sa norm(), stableNorm()
+ */
+template <typename Derived>
+inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real MatrixBase<Derived>::hypotNorm() const {
+  if (size() == 1)
+    return numext::abs(coeff(0, 0));
+  else
+    return this->cwiseAbs().redux(internal::scalar_hypot_op<RealScalar>());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_STABLENORM_H
+#endif  // EIGEN_STABLENORM_H
diff --git a/inst/include/Eigen/src/Core/StlIterators.h b/inst/include/Eigen/src/Core/StlIterators.h
new file mode 100644
index 00000000..a24d4c23
--- /dev/null
+++ b/inst/include/Eigen/src/Core/StlIterators.h
@@ -0,0 +1,619 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STLITERATORS_H
+#define EIGEN_STLITERATORS_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename IteratorType>
+struct indexed_based_stl_iterator_traits;
+
+template <typename Derived>
+class indexed_based_stl_iterator_base {
+ protected:
+  typedef indexed_based_stl_iterator_traits<Derived> traits;
+  typedef typename traits::XprType XprType;
+  typedef indexed_based_stl_iterator_base<typename traits::non_const_iterator> non_const_iterator;
+  typedef indexed_based_stl_iterator_base<typename traits::const_iterator> const_iterator;
+  typedef std::conditional_t<internal::is_const<XprType>::value, non_const_iterator, const_iterator> other_iterator;
+  // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class:
+  friend class indexed_based_stl_iterator_base<typename traits::const_iterator>;
+  friend class indexed_based_stl_iterator_base<typename traits::non_const_iterator>;
+
+ public:
+  typedef Index difference_type;
+  typedef std::random_access_iterator_tag iterator_category;
+
+  indexed_based_stl_iterator_base() noexcept : mp_xpr(0), m_index(0) {}
+  indexed_based_stl_iterator_base(XprType& xpr, Index index) noexcept : mp_xpr(&xpr), m_index(index) {}
+
+  indexed_based_stl_iterator_base(const non_const_iterator& other) noexcept
+      : mp_xpr(other.mp_xpr), m_index(other.m_index) {}
+
+  indexed_based_stl_iterator_base& operator=(const non_const_iterator& other) {
+    mp_xpr = other.mp_xpr;
+    m_index = other.m_index;
+    return *this;
+  }
+
+  Derived& operator++() {
+    ++m_index;
+    return derived();
+  }
+  Derived& operator--() {
+    --m_index;
+    return derived();
+  }
+
+  Derived operator++(int) {
+    Derived prev(derived());
+    operator++();
+    return prev;
+  }
+  Derived operator--(int) {
+    Derived prev(derived());
+    operator--();
+    return prev;
+  }
+
+  friend Derived operator+(const indexed_based_stl_iterator_base& a, Index b) {
+    Derived ret(a.derived());
+    ret += b;
+    return ret;
+  }
+  friend Derived operator-(const indexed_based_stl_iterator_base& a, Index b) {
+    Derived ret(a.derived());
+    ret -= b;
+    return ret;
+  }
+  friend Derived operator+(Index a, const indexed_based_stl_iterator_base& b) {
+    Derived ret(b.derived());
+    ret += a;
+    return ret;
+  }
+  friend Derived operator-(Index a, const indexed_based_stl_iterator_base& b) {
+    Derived ret(b.derived());
+    ret -= a;
+    return ret;
+  }
+
+  Derived& operator+=(Index b) {
+    m_index += b;
+    return derived();
+  }
+  Derived& operator-=(Index b) {
+    m_index -= b;
+    return derived();
+  }
+
+  difference_type operator-(const indexed_based_stl_iterator_base& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index - other.m_index;
+  }
+
+  difference_type operator-(const other_iterator& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index - other.m_index;
+  }
+
+  bool operator==(const indexed_based_stl_iterator_base& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index == other.m_index;
+  }
+  bool operator!=(const indexed_based_stl_iterator_base& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index != other.m_index;
+  }
+  bool operator<(const indexed_based_stl_iterator_base& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index < other.m_index;
+  }
+  bool operator<=(const indexed_based_stl_iterator_base& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index <= other.m_index;
+  }
+  bool operator>(const indexed_based_stl_iterator_base& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index > other.m_index;
+  }
+  bool operator>=(const indexed_based_stl_iterator_base& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index >= other.m_index;
+  }
+
+  bool operator==(const other_iterator& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index == other.m_index;
+  }
+  bool operator!=(const other_iterator& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index != other.m_index;
+  }
+  bool operator<(const other_iterator& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index < other.m_index;
+  }
+  bool operator<=(const other_iterator& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index <= other.m_index;
+  }
+  bool operator>(const other_iterator& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index > other.m_index;
+  }
+  bool operator>=(const other_iterator& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index >= other.m_index;
+  }
+
+ protected:
+  Derived& derived() { return static_cast<Derived&>(*this); }
+  const Derived& derived() const { return static_cast<const Derived&>(*this); }
+
+  XprType* mp_xpr;
+  Index m_index;
+};
+
+template <typename Derived>
+class indexed_based_stl_reverse_iterator_base {
+ protected:
+  typedef indexed_based_stl_iterator_traits<Derived> traits;
+  typedef typename traits::XprType XprType;
+  typedef indexed_based_stl_reverse_iterator_base<typename traits::non_const_iterator> non_const_iterator;
+  typedef indexed_based_stl_reverse_iterator_base<typename traits::const_iterator> const_iterator;
+  typedef std::conditional_t<internal::is_const<XprType>::value, non_const_iterator, const_iterator> other_iterator;
+  // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class:
+  friend class indexed_based_stl_reverse_iterator_base<typename traits::const_iterator>;
+  friend class indexed_based_stl_reverse_iterator_base<typename traits::non_const_iterator>;
+
+ public:
+  typedef Index difference_type;
+  typedef std::random_access_iterator_tag iterator_category;
+
+  indexed_based_stl_reverse_iterator_base() : mp_xpr(0), m_index(0) {}
+  indexed_based_stl_reverse_iterator_base(XprType& xpr, Index index) : mp_xpr(&xpr), m_index(index) {}
+
+  indexed_based_stl_reverse_iterator_base(const non_const_iterator& other)
+      : mp_xpr(other.mp_xpr), m_index(other.m_index) {}
+
+  indexed_based_stl_reverse_iterator_base& operator=(const non_const_iterator& other) {
+    mp_xpr = other.mp_xpr;
+    m_index = other.m_index;
+    return *this;
+  }
+
+  Derived& operator++() {
+    --m_index;
+    return derived();
+  }
+  Derived& operator--() {
+    ++m_index;
+    return derived();
+  }
+
+  Derived operator++(int) {
+    Derived prev(derived());
+    operator++();
+    return prev;
+  }
+  Derived operator--(int) {
+    Derived prev(derived());
+    operator--();
+    return prev;
+  }
+
+  friend Derived operator+(const indexed_based_stl_reverse_iterator_base& a, Index b) {
+    Derived ret(a.derived());
+    ret += b;
+    return ret;
+  }
+  friend Derived operator-(const indexed_based_stl_reverse_iterator_base& a, Index b) {
+    Derived ret(a.derived());
+    ret -= b;
+    return ret;
+  }
+  friend Derived operator+(Index a, const indexed_based_stl_reverse_iterator_base& b) {
+    Derived ret(b.derived());
+    ret += a;
+    return ret;
+  }
+  friend Derived operator-(Index a, const indexed_based_stl_reverse_iterator_base& b) {
+    Derived ret(b.derived());
+    ret -= a;
+    return ret;
+  }
+
+  Derived& operator+=(Index b) {
+    m_index -= b;
+    return derived();
+  }
+  Derived& operator-=(Index b) {
+    m_index += b;
+    return derived();
+  }
+
+  difference_type operator-(const indexed_based_stl_reverse_iterator_base& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return other.m_index - m_index;
+  }
+
+  difference_type operator-(const other_iterator& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return other.m_index - m_index;
+  }
+
+  bool operator==(const indexed_based_stl_reverse_iterator_base& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index == other.m_index;
+  }
+  bool operator!=(const indexed_based_stl_reverse_iterator_base& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index != other.m_index;
+  }
+  bool operator<(const indexed_based_stl_reverse_iterator_base& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index > other.m_index;
+  }
+  bool operator<=(const indexed_based_stl_reverse_iterator_base& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index >= other.m_index;
+  }
+  bool operator>(const indexed_based_stl_reverse_iterator_base& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index < other.m_index;
+  }
+  bool operator>=(const indexed_based_stl_reverse_iterator_base& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index <= other.m_index;
+  }
+
+  bool operator==(const other_iterator& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index == other.m_index;
+  }
+  bool operator!=(const other_iterator& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index != other.m_index;
+  }
+  bool operator<(const other_iterator& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index > other.m_index;
+  }
+  bool operator<=(const other_iterator& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index >= other.m_index;
+  }
+  bool operator>(const other_iterator& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index < other.m_index;
+  }
+  bool operator>=(const other_iterator& other) const {
+    eigen_assert(mp_xpr == other.mp_xpr);
+    return m_index <= other.m_index;
+  }
+
+ protected:
+  Derived& derived() { return static_cast<Derived&>(*this); }
+  const Derived& derived() const { return static_cast<const Derived&>(*this); }
+
+  XprType* mp_xpr;
+  Index m_index;
+};
+
+template <typename XprType>
+class pointer_based_stl_iterator {
+  enum { is_lvalue = internal::is_lvalue<XprType>::value };
+  typedef pointer_based_stl_iterator<std::remove_const_t<XprType>> non_const_iterator;
+  typedef pointer_based_stl_iterator<std::add_const_t<XprType>> const_iterator;
+  typedef std::conditional_t<internal::is_const<XprType>::value, non_const_iterator, const_iterator> other_iterator;
+  // NOTE: in C++03 we cannot declare friend classes through typedefs because we need to write friend class:
+  friend class pointer_based_stl_iterator<std::add_const_t<XprType>>;
+  friend class pointer_based_stl_iterator<std::remove_const_t<XprType>>;
+
+ public:
+  typedef Index difference_type;
+  typedef typename XprType::Scalar value_type;
+#if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_concepts) && __cpp_lib_concepts >= 202002L
+  typedef std::conditional_t<XprType::InnerStrideAtCompileTime == 1, std::contiguous_iterator_tag,
+                             std::random_access_iterator_tag>
+      iterator_category;
+#else
+  typedef std::random_access_iterator_tag iterator_category;
+#endif
+  typedef std::conditional_t<bool(is_lvalue), value_type*, const value_type*> pointer;
+  typedef std::conditional_t<bool(is_lvalue), value_type&, const value_type&> reference;
+
+  pointer_based_stl_iterator() noexcept : m_ptr(0) {}
+  pointer_based_stl_iterator(XprType& xpr, Index index) noexcept : m_incr(xpr.innerStride()) {
+    m_ptr = xpr.data() + index * m_incr.value();
+  }
+
+  pointer_based_stl_iterator(const non_const_iterator& other) noexcept : m_ptr(other.m_ptr), m_incr(other.m_incr) {}
+
+  pointer_based_stl_iterator& operator=(const non_const_iterator& other) noexcept {
+    m_ptr = other.m_ptr;
+    m_incr.setValue(other.m_incr);
+    return *this;
+  }
+
+  reference operator*() const { return *m_ptr; }
+  reference operator[](Index i) const { return *(m_ptr + i * m_incr.value()); }
+  pointer operator->() const { return m_ptr; }
+
+  pointer_based_stl_iterator& operator++() {
+    m_ptr += m_incr.value();
+    return *this;
+  }
+  pointer_based_stl_iterator& operator--() {
+    m_ptr -= m_incr.value();
+    return *this;
+  }
+
+  pointer_based_stl_iterator operator++(int) {
+    pointer_based_stl_iterator prev(*this);
+    operator++();
+    return prev;
+  }
+  pointer_based_stl_iterator operator--(int) {
+    pointer_based_stl_iterator prev(*this);
+    operator--();
+    return prev;
+  }
+
+  friend pointer_based_stl_iterator operator+(const pointer_based_stl_iterator& a, Index b) {
+    pointer_based_stl_iterator ret(a);
+    ret += b;
+    return ret;
+  }
+  friend pointer_based_stl_iterator operator-(const pointer_based_stl_iterator& a, Index b) {
+    pointer_based_stl_iterator ret(a);
+    ret -= b;
+    return ret;
+  }
+  friend pointer_based_stl_iterator operator+(Index a, const pointer_based_stl_iterator& b) {
+    pointer_based_stl_iterator ret(b);
+    ret += a;
+    return ret;
+  }
+  friend pointer_based_stl_iterator operator-(Index a, const pointer_based_stl_iterator& b) {
+    pointer_based_stl_iterator ret(b);
+    ret -= a;
+    return ret;
+  }
+
+  pointer_based_stl_iterator& operator+=(Index b) {
+    m_ptr += b * m_incr.value();
+    return *this;
+  }
+  pointer_based_stl_iterator& operator-=(Index b) {
+    m_ptr -= b * m_incr.value();
+    return *this;
+  }
+
+  difference_type operator-(const pointer_based_stl_iterator& other) const {
+    return (m_ptr - other.m_ptr) / m_incr.value();
+  }
+
+  difference_type operator-(const other_iterator& other) const { return (m_ptr - other.m_ptr) / m_incr.value(); }
+
+  bool operator==(const pointer_based_stl_iterator& other) const { return m_ptr == other.m_ptr; }
+  bool operator!=(const pointer_based_stl_iterator& other) const { return m_ptr != other.m_ptr; }
+  bool operator<(const pointer_based_stl_iterator& other) const { return m_ptr < other.m_ptr; }
+  bool operator<=(const pointer_based_stl_iterator& other) const { return m_ptr <= other.m_ptr; }
+  bool operator>(const pointer_based_stl_iterator& other) const { return m_ptr > other.m_ptr; }
+  bool operator>=(const pointer_based_stl_iterator& other) const { return m_ptr >= other.m_ptr; }
+
+  bool operator==(const other_iterator& other) const { return m_ptr == other.m_ptr; }
+  bool operator!=(const other_iterator& other) const { return m_ptr != other.m_ptr; }
+  bool operator<(const other_iterator& other) const { return m_ptr < other.m_ptr; }
+  bool operator<=(const other_iterator& other) const { return m_ptr <= other.m_ptr; }
+  bool operator>(const other_iterator& other) const { return m_ptr > other.m_ptr; }
+  bool operator>=(const other_iterator& other) const { return m_ptr >= other.m_ptr; }
+
+ protected:
+  pointer m_ptr;
+  internal::variable_if_dynamic<Index, XprType::InnerStrideAtCompileTime> m_incr;
+};
+
+template <typename XprType_>
+struct indexed_based_stl_iterator_traits<generic_randaccess_stl_iterator<XprType_>> {
+  typedef XprType_ XprType;
+  typedef generic_randaccess_stl_iterator<std::remove_const_t<XprType>> non_const_iterator;
+  typedef generic_randaccess_stl_iterator<std::add_const_t<XprType>> const_iterator;
+};
+
+template <typename XprType>
+class generic_randaccess_stl_iterator
+    : public indexed_based_stl_iterator_base<generic_randaccess_stl_iterator<XprType>> {
+ public:
+  typedef typename XprType::Scalar value_type;
+
+ protected:
+  enum {
+    has_direct_access = (internal::traits<XprType>::Flags & DirectAccessBit) ? 1 : 0,
+    is_lvalue = internal::is_lvalue<XprType>::value
+  };
+
+  typedef indexed_based_stl_iterator_base<generic_randaccess_stl_iterator> Base;
+  using Base::m_index;
+  using Base::mp_xpr;
+
+  // TODO currently const Transpose/Reshape expressions never returns const references,
+  // so lets return by value too.
+  // typedef std::conditional_t<bool(has_direct_access), const value_type&, const value_type> read_only_ref_t;
+  typedef const value_type read_only_ref_t;
+
+ public:
+  typedef std::conditional_t<bool(is_lvalue), value_type*, const value_type*> pointer;
+  typedef std::conditional_t<bool(is_lvalue), value_type&, read_only_ref_t> reference;
+
+  generic_randaccess_stl_iterator() : Base() {}
+  generic_randaccess_stl_iterator(XprType& xpr, Index index) : Base(xpr, index) {}
+  generic_randaccess_stl_iterator(const typename Base::non_const_iterator& other) : Base(other) {}
+  using Base::operator=;
+
+  reference operator*() const { return (*mp_xpr)(m_index); }
+  reference operator[](Index i) const { return (*mp_xpr)(m_index + i); }
+  pointer operator->() const { return &((*mp_xpr)(m_index)); }
+};
+
+template <typename XprType_, DirectionType Direction>
+struct indexed_based_stl_iterator_traits<subvector_stl_iterator<XprType_, Direction>> {
+  typedef XprType_ XprType;
+  typedef subvector_stl_iterator<std::remove_const_t<XprType>, Direction> non_const_iterator;
+  typedef subvector_stl_iterator<std::add_const_t<XprType>, Direction> const_iterator;
+};
+
+template <typename XprType, DirectionType Direction>
+class subvector_stl_iterator : public indexed_based_stl_iterator_base<subvector_stl_iterator<XprType, Direction>> {
+ protected:
+  enum { is_lvalue = internal::is_lvalue<XprType>::value };
+
+  typedef indexed_based_stl_iterator_base<subvector_stl_iterator> Base;
+  using Base::m_index;
+  using Base::mp_xpr;
+
+  typedef std::conditional_t<Direction == Vertical, typename XprType::ColXpr, typename XprType::RowXpr> SubVectorType;
+  typedef std::conditional_t<Direction == Vertical, typename XprType::ConstColXpr, typename XprType::ConstRowXpr>
+      ConstSubVectorType;
+
+ public:
+  typedef std::conditional_t<bool(is_lvalue), SubVectorType, ConstSubVectorType> reference;
+  typedef typename reference::PlainObject value_type;
+
+ private:
+  class subvector_stl_iterator_ptr {
+   public:
+    subvector_stl_iterator_ptr(const reference& subvector) : m_subvector(subvector) {}
+    reference* operator->() { return &m_subvector; }
+
+   private:
+    reference m_subvector;
+  };
+
+ public:
+  typedef subvector_stl_iterator_ptr pointer;
+
+  subvector_stl_iterator() : Base() {}
+  subvector_stl_iterator(XprType& xpr, Index index) : Base(xpr, index) {}
+
+  reference operator*() const { return (*mp_xpr).template subVector<Direction>(m_index); }
+  reference operator[](Index i) const { return (*mp_xpr).template subVector<Direction>(m_index + i); }
+  pointer operator->() const { return (*mp_xpr).template subVector<Direction>(m_index); }
+};
+
+template <typename XprType_, DirectionType Direction>
+struct indexed_based_stl_iterator_traits<subvector_stl_reverse_iterator<XprType_, Direction>> {
+  typedef XprType_ XprType;
+  typedef subvector_stl_reverse_iterator<std::remove_const_t<XprType>, Direction> non_const_iterator;
+  typedef subvector_stl_reverse_iterator<std::add_const_t<XprType>, Direction> const_iterator;
+};
+
+template <typename XprType, DirectionType Direction>
+class subvector_stl_reverse_iterator
+    : public indexed_based_stl_reverse_iterator_base<subvector_stl_reverse_iterator<XprType, Direction>> {
+ protected:
+  enum { is_lvalue = internal::is_lvalue<XprType>::value };
+
+  typedef indexed_based_stl_reverse_iterator_base<subvector_stl_reverse_iterator> Base;
+  using Base::m_index;
+  using Base::mp_xpr;
+
+  typedef std::conditional_t<Direction == Vertical, typename XprType::ColXpr, typename XprType::RowXpr> SubVectorType;
+  typedef std::conditional_t<Direction == Vertical, typename XprType::ConstColXpr, typename XprType::ConstRowXpr>
+      ConstSubVectorType;
+
+ public:
+  typedef std::conditional_t<bool(is_lvalue), SubVectorType, ConstSubVectorType> reference;
+  typedef typename reference::PlainObject value_type;
+
+ private:
+  class subvector_stl_reverse_iterator_ptr {
+   public:
+    subvector_stl_reverse_iterator_ptr(const reference& subvector) : m_subvector(subvector) {}
+    reference* operator->() { return &m_subvector; }
+
+   private:
+    reference m_subvector;
+  };
+
+ public:
+  typedef subvector_stl_reverse_iterator_ptr pointer;
+
+  subvector_stl_reverse_iterator() : Base() {}
+  subvector_stl_reverse_iterator(XprType& xpr, Index index) : Base(xpr, index) {}
+
+  reference operator*() const { return (*mp_xpr).template subVector<Direction>(m_index); }
+  reference operator[](Index i) const { return (*mp_xpr).template subVector<Direction>(m_index + i); }
+  pointer operator->() const { return (*mp_xpr).template subVector<Direction>(m_index); }
+};
+
+}  // namespace internal
+
+/** returns an iterator to the first element of the 1D vector or array
+ * \only_for_vectors
+ * \sa end(), cbegin()
+ */
+template <typename Derived>
+inline typename DenseBase<Derived>::iterator DenseBase<Derived>::begin() {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  return iterator(derived(), 0);
+}
+
+/** const version of begin() */
+template <typename Derived>
+inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::begin() const {
+  return cbegin();
+}
+
+/** returns a read-only const_iterator to the first element of the 1D vector or array
+ * \only_for_vectors
+ * \sa cend(), begin()
+ */
+template <typename Derived>
+inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::cbegin() const {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  return const_iterator(derived(), 0);
+}
+
+/** returns an iterator to the element following the last element of the 1D vector or array
+ * \only_for_vectors
+ * \sa begin(), cend()
+ */
+template <typename Derived>
+inline typename DenseBase<Derived>::iterator DenseBase<Derived>::end() {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  return iterator(derived(), size());
+}
+
+/** const version of end() */
+template <typename Derived>
+inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::end() const {
+  return cend();
+}
+
+/** returns a read-only const_iterator to the element following the last element of the 1D vector or array
+ * \only_for_vectors
+ * \sa begin(), cend()
+ */
+template <typename Derived>
+inline typename DenseBase<Derived>::const_iterator DenseBase<Derived>::cend() const {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  return const_iterator(derived(), size());
+}
+
+}  // namespace Eigen
+
+#endif  // EIGEN_STLITERATORS_H
diff --git a/inst/include/Eigen/src/Core/Stride.h b/inst/include/Eigen/src/Core/Stride.h
index 1e3f5fe9..692f0a1c 100644
--- a/inst/include/Eigen/src/Core/Stride.h
+++ b/inst/include/Eigen/src/Core/Stride.h
@@ -10,99 +10,105 @@
 #ifndef EIGEN_STRIDE_H
 #define EIGEN_STRIDE_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \class Stride
-  * \ingroup Core_Module
-  *
-  * \brief Holds strides information for Map
-  *
-  * This class holds the strides information for mapping arrays with strides with class Map.
-  *
-  * It holds two values: the inner stride and the outer stride.
-  *
-  * The inner stride is the pointer increment between two consecutive entries within a given row of a
-  * row-major matrix or within a given column of a column-major matrix.
-  *
-  * The outer stride is the pointer increment between two consecutive rows of a row-major matrix or
-  * between two consecutive columns of a column-major matrix.
-  *
-  * These two values can be passed either at compile-time as template parameters, or at runtime as
-  * arguments to the constructor.
-  *
-  * Indeed, this class takes two template parameters:
-  *  \param _OuterStrideAtCompileTime the outer stride, or Dynamic if you want to specify it at runtime.
-  *  \param _InnerStrideAtCompileTime the inner stride, or Dynamic if you want to specify it at runtime.
-  *
-  * Here is an example:
-  * \include Map_general_stride.cpp
-  * Output: \verbinclude Map_general_stride.out
-  *
-  * \sa class InnerStride, class OuterStride, \ref TopicStorageOrders
-  */
-template<int _OuterStrideAtCompileTime, int _InnerStrideAtCompileTime>
-class Stride
-{
-  public:
-    typedef DenseIndex Index;
-    enum {
-      InnerStrideAtCompileTime = _InnerStrideAtCompileTime,
-      OuterStrideAtCompileTime = _OuterStrideAtCompileTime
-    };
-
-    /** Default constructor, for use when strides are fixed at compile time */
-    Stride()
-      : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime)
-    {
-      eigen_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic);
-    }
-
-    /** Constructor allowing to pass the strides at runtime */
-    Stride(Index outerStride, Index innerStride)
-      : m_outer(outerStride), m_inner(innerStride)
-    {
-      eigen_assert(innerStride>=0 && outerStride>=0);
-    }
-
-    /** Copy constructor */
-    Stride(const Stride& other)
-      : m_outer(other.outer()), m_inner(other.inner())
-    {}
-
-    /** \returns the outer stride */
-    inline Index outer() const { return m_outer.value(); }
-    /** \returns the inner stride */
-    inline Index inner() const { return m_inner.value(); }
-
-  protected:
-    internal::variable_if_dynamic<Index, OuterStrideAtCompileTime> m_outer;
-    internal::variable_if_dynamic<Index, InnerStrideAtCompileTime> m_inner;
+ * \ingroup Core_Module
+ *
+ * \brief Holds strides information for Map
+ *
+ * This class holds the strides information for mapping arrays with strides with class Map.
+ *
+ * It holds two values: the inner stride and the outer stride.
+ *
+ * The inner stride is the pointer increment between two consecutive entries within a given row of a
+ * row-major matrix or within a given column of a column-major matrix.
+ *
+ * The outer stride is the pointer increment between two consecutive rows of a row-major matrix or
+ * between two consecutive columns of a column-major matrix.
+ *
+ * These two values can be passed either at compile-time as template parameters, or at runtime as
+ * arguments to the constructor.
+ *
+ * Indeed, this class takes two template parameters:
+ *  \tparam OuterStrideAtCompileTime_ the outer stride, or Dynamic if you want to specify it at runtime.
+ *  \tparam InnerStrideAtCompileTime_ the inner stride, or Dynamic if you want to specify it at runtime.
+ *
+ * Here is an example:
+ * \include Map_general_stride.cpp
+ * Output: \verbinclude Map_general_stride.out
+ *
+ * Both strides can be negative. However, a negative stride of -1 cannot be specified at compile time
+ * because of the ambiguity with Dynamic which is defined to -1 (historically, negative strides were
+ * not allowed).
+ *
+ * Note that for compile-time vectors (ColsAtCompileTime==1 or RowsAtCompile==1),
+ * the inner stride is the pointer increment between two consecutive elements,
+ * regardless of storage layout.
+ *
+ * \sa class InnerStride, class OuterStride, \ref TopicStorageOrders
+ */
+template <int OuterStrideAtCompileTime_, int InnerStrideAtCompileTime_>
+class Stride {
+ public:
+  typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
+  enum { InnerStrideAtCompileTime = InnerStrideAtCompileTime_, OuterStrideAtCompileTime = OuterStrideAtCompileTime_ };
+
+  /** Default constructor, for use when strides are fixed at compile time */
+  EIGEN_DEVICE_FUNC Stride() : m_outer(OuterStrideAtCompileTime), m_inner(InnerStrideAtCompileTime) {
+    // FIXME: for Eigen 4 we should use DynamicIndex instead of Dynamic.
+    // FIXME: for Eigen 4 we should also unify this API with fix<>
+    eigen_assert(InnerStrideAtCompileTime != Dynamic && OuterStrideAtCompileTime != Dynamic);
+  }
+
+  /** Constructor allowing to pass the strides at runtime */
+  EIGEN_DEVICE_FUNC Stride(Index outerStride, Index innerStride) : m_outer(outerStride), m_inner(innerStride) {}
+
+  /** Copy constructor */
+  EIGEN_DEVICE_FUNC Stride(const Stride& other) : m_outer(other.outer()), m_inner(other.inner()) {}
+
+  /** Copy assignment operator */
+  EIGEN_DEVICE_FUNC Stride& operator=(const Stride& other) {
+    m_outer.setValue(other.outer());
+    m_inner.setValue(other.inner());
+    return *this;
+  }
+
+  /** \returns the outer stride */
+  EIGEN_DEVICE_FUNC constexpr Index outer() const { return m_outer.value(); }
+  /** \returns the inner stride */
+  EIGEN_DEVICE_FUNC constexpr Index inner() const { return m_inner.value(); }
+
+ protected:
+  internal::variable_if_dynamic<Index, OuterStrideAtCompileTime> m_outer;
+  internal::variable_if_dynamic<Index, InnerStrideAtCompileTime> m_inner;
 };
 
 /** \brief Convenience specialization of Stride to specify only an inner stride
-  * See class Map for some examples */
-template<int Value = Dynamic>
-class InnerStride : public Stride<0, Value>
-{
-    typedef Stride<0, Value> Base;
-  public:
-    typedef DenseIndex Index;
-    InnerStride() : Base() {}
-    InnerStride(Index v) : Base(0, v) {}
+ * See class Map for some examples */
+template <int Value>
+class InnerStride : public Stride<0, Value> {
+  typedef Stride<0, Value> Base;
+
+ public:
+  EIGEN_DEVICE_FUNC InnerStride() : Base() {}
+  EIGEN_DEVICE_FUNC InnerStride(Index v) : Base(0, v) {}  // FIXME making this explicit could break valid code
 };
 
 /** \brief Convenience specialization of Stride to specify only an outer stride
-  * See class Map for some examples */
-template<int Value = Dynamic>
-class OuterStride : public Stride<Value, 0>
-{
-    typedef Stride<Value, 0> Base;
-  public:
-    typedef DenseIndex Index;
-    OuterStride() : Base() {}
-    OuterStride(Index v) : Base(v,0) {}
+ * See class Map for some examples */
+template <int Value>
+class OuterStride : public Stride<Value, 0> {
+  typedef Stride<Value, 0> Base;
+
+ public:
+  EIGEN_DEVICE_FUNC OuterStride() : Base() {}
+  EIGEN_DEVICE_FUNC OuterStride(Index v) : Base(v, 0) {}  // FIXME making this explicit could break valid code
 };
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_STRIDE_H
+#endif  // EIGEN_STRIDE_H
diff --git a/inst/include/Eigen/src/Core/Swap.h b/inst/include/Eigen/src/Core/Swap.h
index bf58bd59..dd825e90 100644
--- a/inst/include/Eigen/src/Core/Swap.h
+++ b/inst/include/Eigen/src/Core/Swap.h
@@ -10,117 +10,90 @@
 #ifndef EIGEN_SWAP_H
 #define EIGEN_SWAP_H
 
-namespace Eigen { 
-
-/** \class SwapWrapper
-  * \ingroup Core_Module
-  *
-  * \internal
-  *
-  * \brief Internal helper class for swapping two expressions
-  */
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
 namespace internal {
-template<typename ExpressionType>
-struct traits<SwapWrapper<ExpressionType> > : traits<ExpressionType> {};
-}
-
-template<typename ExpressionType> class SwapWrapper
-  : public internal::dense_xpr_base<SwapWrapper<ExpressionType> >::type
-{
-  public:
-
-    typedef typename internal::dense_xpr_base<SwapWrapper>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(SwapWrapper)
-    typedef typename internal::packet_traits<Scalar>::type Packet;
-
-    inline SwapWrapper(ExpressionType& xpr) : m_expression(xpr) {}
-
-    inline Index rows() const { return m_expression.rows(); }
-    inline Index cols() const { return m_expression.cols(); }
-    inline Index outerStride() const { return m_expression.outerStride(); }
-    inline Index innerStride() const { return m_expression.innerStride(); }
-    
-    typedef typename internal::conditional<
-                       internal::is_lvalue<ExpressionType>::value,
-                       Scalar,
-                       const Scalar
-                     >::type ScalarWithConstIfNotLvalue;
-                     
-    inline ScalarWithConstIfNotLvalue* data() { return m_expression.data(); }
-    inline const Scalar* data() const { return m_expression.data(); }
-
-    inline Scalar& coeffRef(Index rowId, Index colId)
-    {
-      return m_expression.const_cast_derived().coeffRef(rowId, colId);
-    }
-
-    inline Scalar& coeffRef(Index index)
-    {
-      return m_expression.const_cast_derived().coeffRef(index);
-    }
-
-    inline Scalar& coeffRef(Index rowId, Index colId) const
-    {
-      return m_expression.coeffRef(rowId, colId);
-    }
-
-    inline Scalar& coeffRef(Index index) const
-    {
-      return m_expression.coeffRef(index);
-    }
-
-    template<typename OtherDerived>
-    void copyCoeff(Index rowId, Index colId, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(rowId >= 0 && rowId < rows()
-                         && colId >= 0 && colId < cols());
-      Scalar tmp = m_expression.coeff(rowId, colId);
-      m_expression.coeffRef(rowId, colId) = _other.coeff(rowId, colId);
-      _other.coeffRef(rowId, colId) = tmp;
-    }
-
-    template<typename OtherDerived>
-    void copyCoeff(Index index, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(index >= 0 && index < m_expression.size());
-      Scalar tmp = m_expression.coeff(index);
-      m_expression.coeffRef(index) = _other.coeff(index);
-      _other.coeffRef(index) = tmp;
-    }
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    void copyPacket(Index rowId, Index colId, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(rowId >= 0 && rowId < rows()
-                        && colId >= 0 && colId < cols());
-      Packet tmp = m_expression.template packet<StoreMode>(rowId, colId);
-      m_expression.template writePacket<StoreMode>(rowId, colId,
-        _other.template packet<LoadMode>(rowId, colId)
-      );
-      _other.template writePacket<LoadMode>(rowId, colId, tmp);
-    }
-
-    template<typename OtherDerived, int StoreMode, int LoadMode>
-    void copyPacket(Index index, const DenseBase<OtherDerived>& other)
-    {
-      OtherDerived& _other = other.const_cast_derived();
-      eigen_internal_assert(index >= 0 && index < m_expression.size());
-      Packet tmp = m_expression.template packet<StoreMode>(index);
-      m_expression.template writePacket<StoreMode>(index,
-        _other.template packet<LoadMode>(index)
-      );
-      _other.template writePacket<LoadMode>(index, tmp);
-    }
-
-    ExpressionType& expression() const { return m_expression; }
-
-  protected:
-    ExpressionType& m_expression;
+
+// Overload default assignPacket behavior for swapping them
+template <typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT>
+class generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT,
+                                      swap_assign_op<typename DstEvaluatorTypeT::Scalar>, Specialized>
+    : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT,
+                                             swap_assign_op<typename DstEvaluatorTypeT::Scalar>, BuiltIn> {
+ protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT,
+                                          swap_assign_op<typename DstEvaluatorTypeT::Scalar>, BuiltIn>
+      Base;
+  using Base::m_dst;
+  using Base::m_functor;
+  using Base::m_src;
+
+ public:
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::DstXprType DstXprType;
+  typedef swap_assign_op<Scalar> Functor;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE generic_dense_assignment_kernel(DstEvaluatorTypeT &dst,
+                                                                        const SrcEvaluatorTypeT &src,
+                                                                        const Functor &func, DstXprType &dstExpr)
+      : Base(dst, src, func, dstExpr) {}
+
+  template <int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE void assignPacket(Index row, Index col) {
+    PacketType tmp = m_src.template packet<LoadMode, PacketType>(row, col);
+    const_cast<SrcEvaluatorTypeT &>(m_src).template writePacket<LoadMode>(
+        row, col, m_dst.template packet<StoreMode, PacketType>(row, col));
+    m_dst.template writePacket<StoreMode>(row, col, tmp);
+  }
+
+  template <int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE void assignPacket(Index index) {
+    PacketType tmp = m_src.template packet<LoadMode, PacketType>(index);
+    const_cast<SrcEvaluatorTypeT &>(m_src).template writePacket<LoadMode>(
+        index, m_dst.template packet<StoreMode, PacketType>(index));
+    m_dst.template writePacket<StoreMode>(index, tmp);
+  }
+
+  // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I
+  // mean no CRTP (Gael)
+  template <int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner) {
+    Index row = Base::rowIndexByOuterInner(outer, inner);
+    Index col = Base::colIndexByOuterInner(outer, inner);
+    assignPacket<StoreMode, LoadMode, PacketType>(row, col);
+  }
+
+  template <int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE void assignPacketSegment(Index row, Index col, Index begin, Index count) {
+    PacketType tmp = m_src.template packetSegment<LoadMode, PacketType>(row, col, begin, count);
+    const_cast<SrcEvaluatorTypeT &>(m_src).template writePacketSegment<LoadMode>(
+        row, col, m_dst.template packetSegment<StoreMode, PacketType>(row, col, begin, count), begin, count);
+    m_dst.template writePacketSegment<StoreMode>(row, col, tmp, begin, count);
+  }
+
+  template <int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE void assignPacketSegment(Index index, Index begin, Index count) {
+    PacketType tmp = m_src.template packetSegment<LoadMode, PacketType>(index, begin, count);
+    const_cast<SrcEvaluatorTypeT &>(m_src).template writePacketSegment<LoadMode>(
+        index, m_dst.template packetSegment<StoreMode, PacketType>(index, begin, count), begin, count);
+    m_dst.template writePacketSegment<StoreMode>(index, tmp, begin, count);
+  }
+
+  // TODO find a simple way not to have to copy/paste this function from generic_dense_assignment_kernel, by simple I
+  // mean no CRTP (Gael)
+  template <int StoreMode, int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE void assignPacketSegmentByOuterInner(Index outer, Index inner, Index begin, Index count) {
+    Index row = Base::rowIndexByOuterInner(outer, inner);
+    Index col = Base::colIndexByOuterInner(outer, inner);
+    assignPacketSegment<StoreMode, LoadMode, PacketType>(row, col, begin, count);
+  }
 };
 
-} // end namespace Eigen
+}  // namespace internal
+
+}  // end namespace Eigen
 
-#endif // EIGEN_SWAP_H
+#endif  // EIGEN_SWAP_H
diff --git a/inst/include/Eigen/src/Core/Transpose.h b/inst/include/Eigen/src/Core/Transpose.h
index 22096ea2..0676a252 100644
--- a/inst/include/Eigen/src/Core/Transpose.h
+++ b/inst/include/Eigen/src/Core/Transpose.h
@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -11,317 +11,343 @@
 #ifndef EIGEN_TRANSPOSE_H
 #define EIGEN_TRANSPOSE_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-/** \class Transpose
-  * \ingroup Core_Module
-  *
-  * \brief Expression of the transpose of a matrix
-  *
-  * \param MatrixType the type of the object of which we are taking the transpose
-  *
-  * This class represents an expression of the transpose of a matrix.
-  * It is the return type of MatrixBase::transpose() and MatrixBase::adjoint()
-  * and most of the time this is the only way it is used.
-  *
-  * \sa MatrixBase::transpose(), MatrixBase::adjoint()
-  */
+namespace Eigen {
 
 namespace internal {
-template<typename MatrixType>
-struct traits<Transpose<MatrixType> > : traits<MatrixType>
-{
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
-  typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedPlain;
-  typedef typename traits<MatrixType>::StorageKind StorageKind;
-  typedef typename traits<MatrixType>::XprKind XprKind;
+template <typename MatrixType>
+struct traits<Transpose<MatrixType> > : public traits<MatrixType> {
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
+  typedef std::remove_reference_t<MatrixTypeNested> MatrixTypeNestedPlain;
   enum {
     RowsAtCompileTime = MatrixType::ColsAtCompileTime,
     ColsAtCompileTime = MatrixType::RowsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxColsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
-    Flags0 = MatrixTypeNestedPlain::Flags & ~(LvalueBit | NestByRefBit),
+    Flags0 = traits<MatrixTypeNestedPlain>::Flags & ~(LvalueBit | NestByRefBit),
     Flags1 = Flags0 | FlagsLvalueBit,
     Flags = Flags1 ^ RowMajorBit,
-    CoeffReadCost = MatrixTypeNestedPlain::CoeffReadCost,
     InnerStrideAtCompileTime = inner_stride_at_compile_time<MatrixType>::ret,
     OuterStrideAtCompileTime = outer_stride_at_compile_time<MatrixType>::ret
   };
 };
-}
-
-template<typename MatrixType, typename StorageKind> class TransposeImpl;
-
-template<typename MatrixType> class Transpose
-  : public TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>
-{
-  public:
-
-    typedef typename TransposeImpl<MatrixType,typename internal::traits<MatrixType>::StorageKind>::Base Base;
-    EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose)
-
-    inline Transpose(MatrixType& a_matrix) : m_matrix(a_matrix) {}
+}  // namespace internal
 
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
+template <typename MatrixType, typename StorageKind>
+class TransposeImpl;
 
-    inline Index rows() const { return m_matrix.cols(); }
-    inline Index cols() const { return m_matrix.rows(); }
+/** \class Transpose
+ * \ingroup Core_Module
+ *
+ * \brief Expression of the transpose of a matrix
+ *
+ * \tparam MatrixType the type of the object of which we are taking the transpose
+ *
+ * This class represents an expression of the transpose of a matrix.
+ * It is the return type of MatrixBase::transpose() and MatrixBase::adjoint()
+ * and most of the time this is the only way it is used.
+ *
+ * \sa MatrixBase::transpose(), MatrixBase::adjoint()
+ */
+template <typename MatrixType>
+class Transpose : public TransposeImpl<MatrixType, typename internal::traits<MatrixType>::StorageKind> {
+ public:
+  typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
+
+  typedef typename TransposeImpl<MatrixType, typename internal::traits<MatrixType>::StorageKind>::Base Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(Transpose)
+  typedef internal::remove_all_t<MatrixType> NestedExpression;
+
+  EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE Transpose(MatrixType& matrix) : m_matrix(matrix) {}
+
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Transpose)
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index rows() const noexcept { return m_matrix.cols(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index cols() const noexcept { return m_matrix.rows(); }
+
+  /** \returns the nested expression */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<MatrixTypeNested>& nestedExpression() const {
+    return m_matrix;
+  }
 
-    /** \returns the nested expression */
-    const typename internal::remove_all<typename MatrixType::Nested>::type&
-    nestedExpression() const { return m_matrix; }
+  /** \returns the nested expression */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::remove_reference_t<MatrixTypeNested>& nestedExpression() {
+    return m_matrix;
+  }
 
-    /** \returns the nested expression */
-    typename internal::remove_all<typename MatrixType::Nested>::type&
-    nestedExpression() { return m_matrix.const_cast_derived(); }
+  /** \internal */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(Index nrows, Index ncols) { m_matrix.resize(ncols, nrows); }
 
-  protected:
-    typename MatrixType::Nested m_matrix;
+ protected:
+  typename internal::ref_selector<MatrixType>::non_const_type m_matrix;
 };
 
 namespace internal {
 
-template<typename MatrixType, bool HasDirectAccess = has_direct_access<MatrixType>::ret>
-struct TransposeImpl_base
-{
+template <typename MatrixType, bool HasDirectAccess = has_direct_access<MatrixType>::ret>
+struct TransposeImpl_base {
   typedef typename dense_xpr_base<Transpose<MatrixType> >::type type;
 };
 
-template<typename MatrixType>
-struct TransposeImpl_base<MatrixType, false>
-{
+template <typename MatrixType>
+struct TransposeImpl_base<MatrixType, false> {
   typedef typename dense_xpr_base<Transpose<MatrixType> >::type type;
 };
 
-} // end namespace internal
-
-template<typename MatrixType> class TransposeImpl<MatrixType,Dense>
-  : public internal::TransposeImpl_base<MatrixType>::type
-{
-  public:
-
-    typedef typename internal::TransposeImpl_base<MatrixType>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(Transpose<MatrixType>)
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl)
-
-    inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
-    inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
-
-    typedef typename internal::conditional<
-                       internal::is_lvalue<MatrixType>::value,
-                       Scalar,
-                       const Scalar
-                     >::type ScalarWithConstIfNotLvalue;
+}  // end namespace internal
 
-    inline ScalarWithConstIfNotLvalue* data() { return derived().nestedExpression().data(); }
-    inline const Scalar* data() const { return derived().nestedExpression().data(); }
+// Generic API dispatcher
+template <typename XprType, typename StorageKind>
+class TransposeImpl : public internal::generic_xpr_base<Transpose<XprType> >::type {
+ public:
+  typedef typename internal::generic_xpr_base<Transpose<XprType> >::type Base;
+};
 
-    inline ScalarWithConstIfNotLvalue& coeffRef(Index rowId, Index colId)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return derived().nestedExpression().const_cast_derived().coeffRef(colId, rowId);
-    }
+template <typename MatrixType>
+class TransposeImpl<MatrixType, Dense> : public internal::TransposeImpl_base<MatrixType>::type {
+ public:
+  typedef typename internal::TransposeImpl_base<MatrixType>::type Base;
+  using Base::coeffRef;
+  EIGEN_DENSE_PUBLIC_INTERFACE(Transpose<MatrixType>)
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TransposeImpl)
 
-    inline ScalarWithConstIfNotLvalue& coeffRef(Index index)
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return derived().nestedExpression().const_cast_derived().coeffRef(index);
-    }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index innerStride() const { return derived().nestedExpression().innerStride(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outerStride() const { return derived().nestedExpression().outerStride(); }
 
-    inline const Scalar& coeffRef(Index rowId, Index colId) const
-    {
-      return derived().nestedExpression().coeffRef(colId, rowId);
-    }
+  typedef std::conditional_t<internal::is_lvalue<MatrixType>::value, Scalar, const Scalar> ScalarWithConstIfNotLvalue;
 
-    inline const Scalar& coeffRef(Index index) const
-    {
-      return derived().nestedExpression().coeffRef(index);
-    }
-
-    inline CoeffReturnType coeff(Index rowId, Index colId) const
-    {
-      return derived().nestedExpression().coeff(colId, rowId);
-    }
-
-    inline CoeffReturnType coeff(Index index) const
-    {
-      return derived().nestedExpression().coeff(index);
-    }
-
-    template<int LoadMode>
-    inline const PacketScalar packet(Index rowId, Index colId) const
-    {
-      return derived().nestedExpression().template packet<LoadMode>(colId, rowId);
-    }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr ScalarWithConstIfNotLvalue* data() {
+    return derived().nestedExpression().data();
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const Scalar* data() const {
+    return derived().nestedExpression().data();
+  }
 
-    template<int LoadMode>
-    inline void writePacket(Index rowId, Index colId, const PacketScalar& x)
-    {
-      derived().nestedExpression().const_cast_derived().template writePacket<LoadMode>(colId, rowId, x);
-    }
+  // FIXME: shall we keep the const version of coeffRef?
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeffRef(Index rowId, Index colId) const {
+    return derived().nestedExpression().coeffRef(colId, rowId);
+  }
 
-    template<int LoadMode>
-    inline const PacketScalar packet(Index index) const
-    {
-      return derived().nestedExpression().template packet<LoadMode>(index);
-    }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeffRef(Index index) const {
+    return derived().nestedExpression().coeffRef(index);
+  }
 
-    template<int LoadMode>
-    inline void writePacket(Index index, const PacketScalar& x)
-    {
-      derived().nestedExpression().const_cast_derived().template writePacket<LoadMode>(index, x);
-    }
+ protected:
+  EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TransposeImpl)
 };
 
 /** \returns an expression of the transpose of *this.
-  *
-  * Example: \include MatrixBase_transpose.cpp
-  * Output: \verbinclude MatrixBase_transpose.out
-  *
-  * \warning If you want to replace a matrix by its own transpose, do \b NOT do this:
-  * \code
-  * m = m.transpose(); // bug!!! caused by aliasing effect
-  * \endcode
-  * Instead, use the transposeInPlace() method:
-  * \code
-  * m.transposeInPlace();
-  * \endcode
-  * which gives Eigen good opportunities for optimization, or alternatively you can also do:
-  * \code
-  * m = m.transpose().eval();
-  * \endcode
-  *
-  * \sa transposeInPlace(), adjoint() */
-template<typename Derived>
-inline Transpose<Derived>
-DenseBase<Derived>::transpose()
-{
-  return derived();
+ *
+ * Example: \include MatrixBase_transpose.cpp
+ * Output: \verbinclude MatrixBase_transpose.out
+ *
+ * \warning If you want to replace a matrix by its own transpose, do \b NOT do this:
+ * \code
+ * m = m.transpose(); // bug!!! caused by aliasing effect
+ * \endcode
+ * Instead, use the transposeInPlace() method:
+ * \code
+ * m.transposeInPlace();
+ * \endcode
+ * which gives Eigen good opportunities for optimization, or alternatively you can also do:
+ * \code
+ * m = m.transpose().eval();
+ * \endcode
+ *
+ * \sa transposeInPlace(), adjoint() */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename DenseBase<Derived>::TransposeReturnType DenseBase<Derived>::transpose() {
+  return TransposeReturnType(derived());
 }
 
 /** This is the const version of transpose().
-  *
-  * Make sure you read the warning for transpose() !
-  *
-  * \sa transposeInPlace(), adjoint() */
-template<typename Derived>
-inline typename DenseBase<Derived>::ConstTransposeReturnType
-DenseBase<Derived>::transpose() const
-{
+ *
+ * Make sure you read the warning for transpose() !
+ *
+ * \sa transposeInPlace(), adjoint() */
+template <typename Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase<Derived>::ConstTransposeReturnType
+DenseBase<Derived>::transpose() const {
   return ConstTransposeReturnType(derived());
 }
 
 /** \returns an expression of the adjoint (i.e. conjugate transpose) of *this.
-  *
-  * Example: \include MatrixBase_adjoint.cpp
-  * Output: \verbinclude MatrixBase_adjoint.out
-  *
-  * \warning If you want to replace a matrix by its own adjoint, do \b NOT do this:
-  * \code
-  * m = m.adjoint(); // bug!!! caused by aliasing effect
-  * \endcode
-  * Instead, use the adjointInPlace() method:
-  * \code
-  * m.adjointInPlace();
-  * \endcode
-  * which gives Eigen good opportunities for optimization, or alternatively you can also do:
-  * \code
-  * m = m.adjoint().eval();
-  * \endcode
-  *
-  * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */
-template<typename Derived>
-inline const typename MatrixBase<Derived>::AdjointReturnType
-MatrixBase<Derived>::adjoint() const
-{
-  return this->transpose(); // in the complex case, the .conjugate() is be implicit here
-                            // due to implicit conversion to return type
+ *
+ * Example: \include MatrixBase_adjoint.cpp
+ * Output: \verbinclude MatrixBase_adjoint.out
+ *
+ * \warning If you want to replace a matrix by its own adjoint, do \b NOT do this:
+ * \code
+ * m = m.adjoint(); // bug!!! caused by aliasing effect
+ * \endcode
+ * Instead, use the adjointInPlace() method:
+ * \code
+ * m.adjointInPlace();
+ * \endcode
+ * which gives Eigen good opportunities for optimization, or alternatively you can also do:
+ * \code
+ * m = m.adjoint().eval();
+ * \endcode
+ *
+ * \sa adjointInPlace(), transpose(), conjugate(), class Transpose, class internal::scalar_conjugate_op */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::AdjointReturnType MatrixBase<Derived>::adjoint() const {
+  return AdjointReturnType(this->transpose());
 }
 
 /***************************************************************************
-* "in place" transpose implementation
-***************************************************************************/
+ * "in place" transpose implementation
+ ***************************************************************************/
 
 namespace internal {
 
-template<typename MatrixType,
-  bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) && MatrixType::RowsAtCompileTime!=Dynamic>
+template <typename MatrixType,
+          bool IsSquare = (MatrixType::RowsAtCompileTime == MatrixType::ColsAtCompileTime) &&
+                          MatrixType::RowsAtCompileTime != Dynamic,
+          bool MatchPacketSize =
+              (int(MatrixType::RowsAtCompileTime) == int(internal::packet_traits<typename MatrixType::Scalar>::size)) &&
+              (internal::evaluator<MatrixType>::Flags & PacketAccessBit)>
 struct inplace_transpose_selector;
 
-template<typename MatrixType>
-struct inplace_transpose_selector<MatrixType,true> { // square matrix
+template <typename MatrixType>
+struct inplace_transpose_selector<MatrixType, true, false> {  // square matrix
   static void run(MatrixType& m) {
-    m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
+    m.matrix().template triangularView<StrictlyUpper>().swap(
+        m.matrix().transpose().template triangularView<StrictlyUpper>());
   }
 };
 
-template<typename MatrixType>
-struct inplace_transpose_selector<MatrixType,false> { // non square matrix
+template <typename MatrixType>
+struct inplace_transpose_selector<MatrixType, true, true> {  // PacketSize x PacketSize
+  static void run(MatrixType& m) {
+    typedef typename MatrixType::Scalar Scalar;
+    typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
+    const Index PacketSize = internal::packet_traits<Scalar>::size;
+    const Index Alignment = internal::evaluator<MatrixType>::Alignment;
+    PacketBlock<Packet> A;
+    for (Index i = 0; i < PacketSize; ++i) A.packet[i] = m.template packetByOuterInner<Alignment>(i, 0);
+    internal::ptranspose(A);
+    for (Index i = 0; i < PacketSize; ++i)
+      m.template writePacket<Alignment>(m.rowIndexByOuterInner(i, 0), m.colIndexByOuterInner(i, 0), A.packet[i]);
+  }
+};
+
+template <typename MatrixType, Index Alignment>
+void BlockedInPlaceTranspose(MatrixType& m) {
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename internal::packet_traits<typename MatrixType::Scalar>::type Packet;
+  const Index PacketSize = internal::packet_traits<Scalar>::size;
+  eigen_assert(m.rows() == m.cols());
+  int row_start = 0;
+  for (; row_start + PacketSize <= m.rows(); row_start += PacketSize) {
+    for (int col_start = row_start; col_start + PacketSize <= m.cols(); col_start += PacketSize) {
+      PacketBlock<Packet> A;
+      if (row_start == col_start) {
+        for (Index i = 0; i < PacketSize; ++i)
+          A.packet[i] = m.template packetByOuterInner<Alignment>(row_start + i, col_start);
+        internal::ptranspose(A);
+        for (Index i = 0; i < PacketSize; ++i)
+          m.template writePacket<Alignment>(m.rowIndexByOuterInner(row_start + i, col_start),
+                                            m.colIndexByOuterInner(row_start + i, col_start), A.packet[i]);
+      } else {
+        PacketBlock<Packet> B;
+        for (Index i = 0; i < PacketSize; ++i) {
+          A.packet[i] = m.template packetByOuterInner<Alignment>(row_start + i, col_start);
+          B.packet[i] = m.template packetByOuterInner<Alignment>(col_start + i, row_start);
+        }
+        internal::ptranspose(A);
+        internal::ptranspose(B);
+        for (Index i = 0; i < PacketSize; ++i) {
+          m.template writePacket<Alignment>(m.rowIndexByOuterInner(row_start + i, col_start),
+                                            m.colIndexByOuterInner(row_start + i, col_start), B.packet[i]);
+          m.template writePacket<Alignment>(m.rowIndexByOuterInner(col_start + i, row_start),
+                                            m.colIndexByOuterInner(col_start + i, row_start), A.packet[i]);
+        }
+      }
+    }
+  }
+  for (Index row = row_start; row < m.rows(); ++row) {
+    m.matrix().row(row).head(row).swap(m.matrix().col(row).head(row).transpose());
+  }
+}
+
+template <typename MatrixType, bool MatchPacketSize>
+struct inplace_transpose_selector<MatrixType, false, MatchPacketSize> {  // non square or dynamic matrix
   static void run(MatrixType& m) {
-    if (m.rows()==m.cols())
-      m.matrix().template triangularView<StrictlyUpper>().swap(m.matrix().transpose());
-    else
+    typedef typename MatrixType::Scalar Scalar;
+    if (m.rows() == m.cols()) {
+      const Index PacketSize = internal::packet_traits<Scalar>::size;
+      if (!NumTraits<Scalar>::IsComplex && m.rows() >= PacketSize) {
+        if ((m.rows() % PacketSize) == 0)
+          BlockedInPlaceTranspose<MatrixType, internal::evaluator<MatrixType>::Alignment>(m);
+        else
+          BlockedInPlaceTranspose<MatrixType, Unaligned>(m);
+      } else {
+        m.matrix().template triangularView<StrictlyUpper>().swap(
+            m.matrix().transpose().template triangularView<StrictlyUpper>());
+      }
+    } else {
       m = m.transpose().eval();
+    }
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
 /** This is the "in place" version of transpose(): it replaces \c *this by its own transpose.
-  * Thus, doing
-  * \code
-  * m.transposeInPlace();
-  * \endcode
-  * has the same effect on m as doing
-  * \code
-  * m = m.transpose().eval();
-  * \endcode
-  * and is faster and also safer because in the latter line of code, forgetting the eval() results
-  * in a bug caused by \ref TopicAliasing "aliasing".
-  *
-  * Notice however that this method is only useful if you want to replace a matrix by its own transpose.
-  * If you just need the transpose of a matrix, use transpose().
-  *
-  * \note if the matrix is not square, then \c *this must be a resizable matrix. 
-  * This excludes (non-square) fixed-size matrices, block-expressions and maps.
-  *
-  * \sa transpose(), adjoint(), adjointInPlace() */
-template<typename Derived>
-inline void DenseBase<Derived>::transposeInPlace()
-{
-  eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic))
-               && "transposeInPlace() called on a non-square non-resizable matrix");
+ * Thus, doing
+ * \code
+ * m.transposeInPlace();
+ * \endcode
+ * has the same effect on m as doing
+ * \code
+ * m = m.transpose().eval();
+ * \endcode
+ * and is faster and also safer because in the latter line of code, forgetting the eval() results
+ * in a bug caused by \ref TopicAliasing "aliasing".
+ *
+ * Notice however that this method is only useful if you want to replace a matrix by its own transpose.
+ * If you just need the transpose of a matrix, use transpose().
+ *
+ * \note if the matrix is not square, then \c *this must be a resizable matrix.
+ * This excludes (non-square) fixed-size matrices, block-expressions and maps.
+ *
+ * \sa transpose(), adjoint(), adjointInPlace() */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline void DenseBase<Derived>::transposeInPlace() {
+  eigen_assert((rows() == cols() || (RowsAtCompileTime == Dynamic && ColsAtCompileTime == Dynamic)) &&
+               "transposeInPlace() called on a non-square non-resizable matrix");
   internal::inplace_transpose_selector<Derived>::run(derived());
 }
 
 /***************************************************************************
-* "in place" adjoint implementation
-***************************************************************************/
+ * "in place" adjoint implementation
+ ***************************************************************************/
 
 /** This is the "in place" version of adjoint(): it replaces \c *this by its own transpose.
-  * Thus, doing
-  * \code
-  * m.adjointInPlace();
-  * \endcode
-  * has the same effect on m as doing
-  * \code
-  * m = m.adjoint().eval();
-  * \endcode
-  * and is faster and also safer because in the latter line of code, forgetting the eval() results
-  * in a bug caused by aliasing.
-  *
-  * Notice however that this method is only useful if you want to replace a matrix by its own adjoint.
-  * If you just need the adjoint of a matrix, use adjoint().
-  *
-  * \note if the matrix is not square, then \c *this must be a resizable matrix.
-  * This excludes (non-square) fixed-size matrices, block-expressions and maps.
-  *
-  * \sa transpose(), adjoint(), transposeInPlace() */
-template<typename Derived>
-inline void MatrixBase<Derived>::adjointInPlace()
-{
+ * Thus, doing
+ * \code
+ * m.adjointInPlace();
+ * \endcode
+ * has the same effect on m as doing
+ * \code
+ * m = m.adjoint().eval();
+ * \endcode
+ * and is faster and also safer because in the latter line of code, forgetting the eval() results
+ * in a bug caused by aliasing.
+ *
+ * Notice however that this method is only useful if you want to replace a matrix by its own adjoint.
+ * If you just need the adjoint of a matrix, use adjoint().
+ *
+ * \note if the matrix is not square, then \c *this must be a resizable matrix.
+ * This excludes (non-square) fixed-size matrices, block-expressions and maps.
+ *
+ * \sa transpose(), adjoint(), transposeInPlace() */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::adjointInPlace() {
   derived() = adjoint().eval();
 }
 
@@ -331,44 +357,34 @@ inline void MatrixBase<Derived>::adjointInPlace()
 
 namespace internal {
 
-template<typename BinOp,typename NestedXpr,typename Rhs>
-struct blas_traits<SelfCwiseBinaryOp<BinOp,NestedXpr,Rhs> >
- : blas_traits<NestedXpr>
-{
-  typedef SelfCwiseBinaryOp<BinOp,NestedXpr,Rhs> XprType;
-  static inline const XprType extract(const XprType& x) { return x; }
-};
-
-template<bool DestIsTransposed, typename OtherDerived>
-struct check_transpose_aliasing_compile_time_selector
-{
+template <bool DestIsTransposed, typename OtherDerived>
+struct check_transpose_aliasing_compile_time_selector {
   enum { ret = bool(blas_traits<OtherDerived>::IsTransposed) != DestIsTransposed };
 };
 
-template<bool DestIsTransposed, typename BinOp, typename DerivedA, typename DerivedB>
-struct check_transpose_aliasing_compile_time_selector<DestIsTransposed,CwiseBinaryOp<BinOp,DerivedA,DerivedB> >
-{
-  enum { ret =    bool(blas_traits<DerivedA>::IsTransposed) != DestIsTransposed
-               || bool(blas_traits<DerivedB>::IsTransposed) != DestIsTransposed
+template <bool DestIsTransposed, typename BinOp, typename DerivedA, typename DerivedB>
+struct check_transpose_aliasing_compile_time_selector<DestIsTransposed, CwiseBinaryOp<BinOp, DerivedA, DerivedB> > {
+  enum {
+    ret = bool(blas_traits<DerivedA>::IsTransposed) != DestIsTransposed ||
+          bool(blas_traits<DerivedB>::IsTransposed) != DestIsTransposed
   };
 };
 
-template<typename Scalar, bool DestIsTransposed, typename OtherDerived>
-struct check_transpose_aliasing_run_time_selector
-{
-  static bool run(const Scalar* dest, const OtherDerived& src)
-  {
-    return (bool(blas_traits<OtherDerived>::IsTransposed) != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src));
+template <typename Scalar, bool DestIsTransposed, typename OtherDerived>
+struct check_transpose_aliasing_run_time_selector {
+  EIGEN_DEVICE_FUNC static bool run(const Scalar* dest, const OtherDerived& src) {
+    return (bool(blas_traits<OtherDerived>::IsTransposed) != DestIsTransposed) &&
+           (dest != 0 && dest == (const Scalar*)extract_data(src));
   }
 };
 
-template<typename Scalar, bool DestIsTransposed, typename BinOp, typename DerivedA, typename DerivedB>
-struct check_transpose_aliasing_run_time_selector<Scalar,DestIsTransposed,CwiseBinaryOp<BinOp,DerivedA,DerivedB> >
-{
-  static bool run(const Scalar* dest, const CwiseBinaryOp<BinOp,DerivedA,DerivedB>& src)
-  {
-    return ((blas_traits<DerivedA>::IsTransposed != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src.lhs())))
-        || ((blas_traits<DerivedB>::IsTransposed != DestIsTransposed) && (dest!=0 && dest==(const Scalar*)extract_data(src.rhs())));
+template <typename Scalar, bool DestIsTransposed, typename BinOp, typename DerivedA, typename DerivedB>
+struct check_transpose_aliasing_run_time_selector<Scalar, DestIsTransposed, CwiseBinaryOp<BinOp, DerivedA, DerivedB> > {
+  EIGEN_DEVICE_FUNC static bool run(const Scalar* dest, const CwiseBinaryOp<BinOp, DerivedA, DerivedB>& src) {
+    return ((blas_traits<DerivedA>::IsTransposed != DestIsTransposed) &&
+            (dest != 0 && dest == (const Scalar*)extract_data(src.lhs()))) ||
+           ((blas_traits<DerivedB>::IsTransposed != DestIsTransposed) &&
+            (dest != 0 && dest == (const Scalar*)extract_data(src.rhs())));
   }
 };
 
@@ -378,42 +394,34 @@ struct check_transpose_aliasing_run_time_selector<Scalar,DestIsTransposed,CwiseB
 // known at compile time to be false, and using that, we can avoid generating the code of the assert again
 // and again for all these expressions that don't need it.
 
-template<typename Derived, typename OtherDerived,
-         bool MightHaveTransposeAliasing
-                 = check_transpose_aliasing_compile_time_selector
-                     <blas_traits<Derived>::IsTransposed,OtherDerived>::ret
-        >
-struct checkTransposeAliasing_impl
-{
-    static void run(const Derived& dst, const OtherDerived& other)
-    {
-        eigen_assert((!check_transpose_aliasing_run_time_selector
-                      <typename Derived::Scalar,blas_traits<Derived>::IsTransposed,OtherDerived>
-                      ::run(extract_data(dst), other))
-          && "aliasing detected during transposition, use transposeInPlace() "
-             "or evaluate the rhs into a temporary using .eval()");
-
-    }
+template <typename Derived, typename OtherDerived,
+          bool MightHaveTransposeAliasing =
+              check_transpose_aliasing_compile_time_selector<blas_traits<Derived>::IsTransposed, OtherDerived>::ret>
+struct checkTransposeAliasing_impl {
+  EIGEN_DEVICE_FUNC static void run(const Derived& dst, const OtherDerived& other) {
+    eigen_assert(
+        (!check_transpose_aliasing_run_time_selector<typename Derived::Scalar, blas_traits<Derived>::IsTransposed,
+                                                     OtherDerived>::run(extract_data(dst), other)) &&
+        "aliasing detected during transposition, use transposeInPlace() "
+        "or evaluate the rhs into a temporary using .eval()");
+  }
 };
 
-template<typename Derived, typename OtherDerived>
-struct checkTransposeAliasing_impl<Derived, OtherDerived, false>
-{
-    static void run(const Derived&, const OtherDerived&)
-    {
-    }
+template <typename Derived, typename OtherDerived>
+struct checkTransposeAliasing_impl<Derived, OtherDerived, false> {
+  EIGEN_DEVICE_FUNC static void run(const Derived&, const OtherDerived&) {}
 };
 
-} // end namespace internal
-
-template<typename Derived>
-template<typename OtherDerived>
-void DenseBase<Derived>::checkTransposeAliasing(const OtherDerived& other) const
-{
-    internal::checkTransposeAliasing_impl<Derived, OtherDerived>::run(derived(), other);
+template <typename Dst, typename Src>
+EIGEN_DEVICE_FUNC inline void check_for_aliasing(const Dst& dst, const Src& src) {
+  if ((!Dst::IsVectorAtCompileTime) && dst.rows() > 1 && dst.cols() > 1)
+    internal::checkTransposeAliasing_impl<Dst, Src>::run(dst, src);
 }
-#endif
 
-} // end namespace Eigen
+}  // end namespace internal
+
+#endif  // EIGEN_NO_DEBUG
+
+}  // end namespace Eigen
 
-#endif // EIGEN_TRANSPOSE_H
+#endif  // EIGEN_TRANSPOSE_H
diff --git a/inst/include/Eigen/src/Core/Transpositions.h b/inst/include/Eigen/src/Core/Transpositions.h
index e4ba0756..f6dd2584 100644
--- a/inst/include/Eigen/src/Core/Transpositions.h
+++ b/inst/include/Eigen/src/Core/Transpositions.h
@@ -10,427 +10,314 @@
 #ifndef EIGEN_TRANSPOSITIONS_H
 #define EIGEN_TRANSPOSITIONS_H
 
-namespace Eigen { 
-
-/** \class Transpositions
-  * \ingroup Core_Module
-  *
-  * \brief Represents a sequence of transpositions (row/column interchange)
-  *
-  * \param SizeAtCompileTime the number of transpositions, or Dynamic
-  * \param MaxSizeAtCompileTime the maximum number of transpositions, or Dynamic. This optional parameter defaults to SizeAtCompileTime. Most of the time, you should not have to specify it.
-  *
-  * This class represents a permutation transformation as a sequence of \em n transpositions
-  * \f$[T_{n-1} \ldots T_{i} \ldots T_{0}]\f$. It is internally stored as a vector of integers \c indices.
-  * Each transposition \f$ T_{i} \f$ applied on the left of a matrix (\f$ T_{i} M\f$) interchanges
-  * the rows \c i and \c indices[i] of the matrix \c M.
-  * A transposition applied on the right (e.g., \f$ M T_{i}\f$) yields a column interchange.
-  *
-  * Compared to the class PermutationMatrix, such a sequence of transpositions is what is
-  * computed during a decomposition with pivoting, and it is faster when applying the permutation in-place.
-  * 
-  * To apply a sequence of transpositions to a matrix, simply use the operator * as in the following example:
-  * \code
-  * Transpositions tr;
-  * MatrixXf mat;
-  * mat = tr * mat;
-  * \endcode
-  * In this example, we detect that the matrix appears on both side, and so the transpositions
-  * are applied in-place without any temporary or extra copy.
-  *
-  * \sa class PermutationMatrix
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+template <typename Derived>
+class TranspositionsBase {
+  typedef internal::traits<Derived> Traits;
+
+ public:
+  typedef typename Traits::IndicesType IndicesType;
+  typedef typename IndicesType::Scalar StorageIndex;
+  typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
+
+  EIGEN_DEVICE_FUNC Derived& derived() { return *static_cast<Derived*>(this); }
+  EIGEN_DEVICE_FUNC const Derived& derived() const { return *static_cast<const Derived*>(this); }
+
+  /** Copies the \a other transpositions into \c *this */
+  template <typename OtherDerived>
+  Derived& operator=(const TranspositionsBase<OtherDerived>& other) {
+    indices() = other.indices();
+    return derived();
+  }
+
+  /** \returns the number of transpositions */
+  EIGEN_DEVICE_FUNC Index size() const { return indices().size(); }
+  /** \returns the number of rows of the equivalent permutation matrix */
+  EIGEN_DEVICE_FUNC Index rows() const { return indices().size(); }
+  /** \returns the number of columns of the equivalent permutation matrix */
+  EIGEN_DEVICE_FUNC Index cols() const { return indices().size(); }
+
+  /** Direct access to the underlying index vector */
+  EIGEN_DEVICE_FUNC inline const StorageIndex& coeff(Index i) const { return indices().coeff(i); }
+  /** Direct access to the underlying index vector */
+  inline StorageIndex& coeffRef(Index i) { return indices().coeffRef(i); }
+  /** Direct access to the underlying index vector */
+  inline const StorageIndex& operator()(Index i) const { return indices()(i); }
+  /** Direct access to the underlying index vector */
+  inline StorageIndex& operator()(Index i) { return indices()(i); }
+  /** Direct access to the underlying index vector */
+  inline const StorageIndex& operator[](Index i) const { return indices()(i); }
+  /** Direct access to the underlying index vector */
+  inline StorageIndex& operator[](Index i) { return indices()(i); }
+
+  /** const version of indices(). */
+  EIGEN_DEVICE_FUNC const IndicesType& indices() const { return derived().indices(); }
+  /** \returns a reference to the stored array representing the transpositions. */
+  EIGEN_DEVICE_FUNC IndicesType& indices() { return derived().indices(); }
+
+  /** Resizes to given size. */
+  inline void resize(Index newSize) { indices().resize(newSize); }
+
+  /** Sets \c *this to represents an identity transformation */
+  void setIdentity() {
+    for (StorageIndex i = 0; i < indices().size(); ++i) coeffRef(i) = i;
+  }
+
+  // FIXME: do we want such methods ?
+  // might be useful when the target matrix expression is complex, e.g.:
+  // object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..);
+  /*
+  template<typename MatrixType>
+  void applyForwardToRows(MatrixType& mat) const
+  {
+    for(Index k=0 ; k<size() ; ++k)
+      if(m_indices(k)!=k)
+        mat.row(k).swap(mat.row(m_indices(k)));
+  }
+
+  template<typename MatrixType>
+  void applyBackwardToRows(MatrixType& mat) const
+  {
+    for(Index k=size()-1 ; k>=0 ; --k)
+      if(m_indices(k)!=k)
+        mat.row(k).swap(mat.row(m_indices(k)));
+  }
   */
 
-namespace internal {
-template<typename TranspositionType, typename MatrixType, int Side, bool Transposed=false> struct transposition_matrix_product_retval;
-}
+  /** \returns the inverse transformation */
+  inline Transpose<TranspositionsBase> inverse() const { return Transpose<TranspositionsBase>(derived()); }
 
-template<typename Derived>
-class TranspositionsBase
-{
-    typedef internal::traits<Derived> Traits;
-    
-  public:
-
-    typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
-
-    Derived& derived() { return *static_cast<Derived*>(this); }
-    const Derived& derived() const { return *static_cast<const Derived*>(this); }
-
-    /** Copies the \a other transpositions into \c *this */
-    template<typename OtherDerived>
-    Derived& operator=(const TranspositionsBase<OtherDerived>& other)
-    {
-      indices() = other.indices();
-      return derived();
-    }
-
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Derived& operator=(const TranspositionsBase& other)
-    {
-      indices() = other.indices();
-      return derived();
-    }
-    #endif
-
-    /** \returns the number of transpositions */
-    inline Index size() const { return indices().size(); }
-
-    /** Direct access to the underlying index vector */
-    inline const Index& coeff(Index i) const { return indices().coeff(i); }
-    /** Direct access to the underlying index vector */
-    inline Index& coeffRef(Index i) { return indices().coeffRef(i); }
-    /** Direct access to the underlying index vector */
-    inline const Index& operator()(Index i) const { return indices()(i); }
-    /** Direct access to the underlying index vector */
-    inline Index& operator()(Index i) { return indices()(i); }
-    /** Direct access to the underlying index vector */
-    inline const Index& operator[](Index i) const { return indices()(i); }
-    /** Direct access to the underlying index vector */
-    inline Index& operator[](Index i) { return indices()(i); }
-
-    /** const version of indices(). */
-    const IndicesType& indices() const { return derived().indices(); }
-    /** \returns a reference to the stored array representing the transpositions. */
-    IndicesType& indices() { return derived().indices(); }
-
-    /** Resizes to given size. */
-    inline void resize(int newSize)
-    {
-      indices().resize(newSize);
-    }
-
-    /** Sets \c *this to represents an identity transformation */
-    void setIdentity()
-    {
-      for(int i = 0; i < indices().size(); ++i)
-        coeffRef(i) = i;
-    }
-
-    // FIXME: do we want such methods ?
-    // might be usefull when the target matrix expression is complex, e.g.:
-    // object.matrix().block(..,..,..,..) = trans * object.matrix().block(..,..,..,..);
-    /*
-    template<typename MatrixType>
-    void applyForwardToRows(MatrixType& mat) const
-    {
-      for(Index k=0 ; k<size() ; ++k)
-        if(m_indices(k)!=k)
-          mat.row(k).swap(mat.row(m_indices(k)));
-    }
-
-    template<typename MatrixType>
-    void applyBackwardToRows(MatrixType& mat) const
-    {
-      for(Index k=size()-1 ; k>=0 ; --k)
-        if(m_indices(k)!=k)
-          mat.row(k).swap(mat.row(m_indices(k)));
-    }
-    */
-
-    /** \returns the inverse transformation */
-    inline Transpose<TranspositionsBase> inverse() const
-    { return Transpose<TranspositionsBase>(derived()); }
-
-    /** \returns the tranpose transformation */
-    inline Transpose<TranspositionsBase> transpose() const
-    { return Transpose<TranspositionsBase>(derived()); }
-
-  protected:
+  /** \returns the transpose transformation */
+  inline Transpose<TranspositionsBase> transpose() const { return Transpose<TranspositionsBase>(derived()); }
+
+ protected:
 };
 
 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-struct traits<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType> >
-{
-  typedef IndexType Index;
-  typedef Matrix<Index, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+template <int SizeAtCompileTime, int MaxSizeAtCompileTime, typename StorageIndex_>
+struct traits<Transpositions<SizeAtCompileTime, MaxSizeAtCompileTime, StorageIndex_> >
+    : traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, StorageIndex_> > {
+  typedef Matrix<StorageIndex_, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1> IndicesType;
+  typedef TranspositionsStorage StorageKind;
 };
-}
+}  // namespace internal
 
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType>
-class Transpositions : public TranspositionsBase<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType> >
-{
-    typedef internal::traits<Transpositions> Traits;
-  public:
-
-    typedef TranspositionsBase<Transpositions> Base;
-    typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
-
-    inline Transpositions() {}
-
-    /** Copy constructor. */
-    template<typename OtherDerived>
-    inline Transpositions(const TranspositionsBase<OtherDerived>& other)
-      : m_indices(other.indices()) {}
-
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** Standard copy constructor. Defined only to prevent a default copy constructor
-      * from hiding the other templated constructor */
-    inline Transpositions(const Transpositions& other) : m_indices(other.indices()) {}
-    #endif
-
-    /** Generic constructor from expression of the transposition indices. */
-    template<typename Other>
-    explicit inline Transpositions(const MatrixBase<Other>& a_indices) : m_indices(a_indices)
-    {}
-
-    /** Copies the \a other transpositions into \c *this */
-    template<typename OtherDerived>
-    Transpositions& operator=(const TranspositionsBase<OtherDerived>& other)
-    {
-      return Base::operator=(other);
-    }
-
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Transpositions& operator=(const Transpositions& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
-
-    /** Constructs an uninitialized permutation matrix of given size.
-      */
-    inline Transpositions(Index size) : m_indices(size)
-    {}
-
-    /** const version of indices(). */
-    const IndicesType& indices() const { return m_indices; }
-    /** \returns a reference to the stored array representing the transpositions. */
-    IndicesType& indices() { return m_indices; }
-
-  protected:
-
-    IndicesType m_indices;
+/** \class Transpositions
+ * \ingroup Core_Module
+ *
+ * \brief Represents a sequence of transpositions (row/column interchange)
+ *
+ * \tparam SizeAtCompileTime the number of transpositions, or Dynamic
+ * \tparam MaxSizeAtCompileTime the maximum number of transpositions, or Dynamic. This optional parameter defaults to
+ * SizeAtCompileTime. Most of the time, you should not have to specify it.
+ *
+ * This class represents a permutation transformation as a sequence of \em n transpositions
+ * \f$[T_{n-1} \ldots T_{i} \ldots T_{0}]\f$. It is internally stored as a vector of integers \c indices.
+ * Each transposition \f$ T_{i} \f$ applied on the left of a matrix (\f$ T_{i} M\f$) interchanges
+ * the rows \c i and \c indices[i] of the matrix \c M.
+ * A transposition applied on the right (e.g., \f$ M T_{i}\f$) yields a column interchange.
+ *
+ * Compared to the class PermutationMatrix, such a sequence of transpositions is what is
+ * computed during a decomposition with pivoting, and it is faster when applying the permutation in-place.
+ *
+ * To apply a sequence of transpositions to a matrix, simply use the operator * as in the following example:
+ * \code
+ * Transpositions tr;
+ * MatrixXf mat;
+ * mat = tr * mat;
+ * \endcode
+ * In this example, we detect that the matrix appears on both side, and so the transpositions
+ * are applied in-place without any temporary or extra copy.
+ *
+ * \sa class PermutationMatrix
+ */
+
+template <int SizeAtCompileTime, int MaxSizeAtCompileTime, typename StorageIndex_>
+class Transpositions
+    : public TranspositionsBase<Transpositions<SizeAtCompileTime, MaxSizeAtCompileTime, StorageIndex_> > {
+  typedef internal::traits<Transpositions> Traits;
+
+ public:
+  typedef TranspositionsBase<Transpositions> Base;
+  typedef typename Traits::IndicesType IndicesType;
+  typedef typename IndicesType::Scalar StorageIndex;
+
+  inline Transpositions() {}
+
+  /** Copy constructor. */
+  template <typename OtherDerived>
+  inline Transpositions(const TranspositionsBase<OtherDerived>& other) : m_indices(other.indices()) {}
+
+  /** Generic constructor from expression of the transposition indices. */
+  template <typename Other>
+  explicit inline Transpositions(const MatrixBase<Other>& indices) : m_indices(indices) {}
+
+  /** Copies the \a other transpositions into \c *this */
+  template <typename OtherDerived>
+  Transpositions& operator=(const TranspositionsBase<OtherDerived>& other) {
+    return Base::operator=(other);
+  }
+
+  /** Constructs an uninitialized permutation matrix of given size.
+   */
+  inline Transpositions(Index size) : m_indices(size) {}
+
+  /** const version of indices(). */
+  EIGEN_DEVICE_FUNC const IndicesType& indices() const { return m_indices; }
+  /** \returns a reference to the stored array representing the transpositions. */
+  EIGEN_DEVICE_FUNC IndicesType& indices() { return m_indices; }
+
+ protected:
+  IndicesType m_indices;
 };
 
-
 namespace internal {
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int _PacketAccess>
-struct traits<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,_PacketAccess> >
-{
-  typedef IndexType Index;
-  typedef Map<const Matrix<Index,SizeAtCompileTime,1,0,MaxSizeAtCompileTime,1>, _PacketAccess> IndicesType;
+template <int SizeAtCompileTime, int MaxSizeAtCompileTime, typename StorageIndex_, int PacketAccess_>
+struct traits<Map<Transpositions<SizeAtCompileTime, MaxSizeAtCompileTime, StorageIndex_>, PacketAccess_> >
+    : traits<PermutationMatrix<SizeAtCompileTime, MaxSizeAtCompileTime, StorageIndex_> > {
+  typedef Map<const Matrix<StorageIndex_, SizeAtCompileTime, 1, 0, MaxSizeAtCompileTime, 1>, PacketAccess_> IndicesType;
+  typedef StorageIndex_ StorageIndex;
+  typedef TranspositionsStorage StorageKind;
 };
-}
-
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime, typename IndexType, int PacketAccess>
-class Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,PacketAccess>
- : public TranspositionsBase<Map<Transpositions<SizeAtCompileTime,MaxSizeAtCompileTime,IndexType>,PacketAccess> >
-{
-    typedef internal::traits<Map> Traits;
-  public:
-
-    typedef TranspositionsBase<Map> Base;
-    typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
-
-    inline Map(const Index* indicesPtr)
-      : m_indices(indicesPtr)
-    {}
-
-    inline Map(const Index* indicesPtr, Index size)
-      : m_indices(indicesPtr,size)
-    {}
-
-    /** Copies the \a other transpositions into \c *this */
-    template<typename OtherDerived>
-    Map& operator=(const TranspositionsBase<OtherDerived>& other)
-    {
-      return Base::operator=(other);
-    }
-
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    Map& operator=(const Map& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
-
-    /** const version of indices(). */
-    const IndicesType& indices() const { return m_indices; }
-    
-    /** \returns a reference to the stored array representing the transpositions. */
-    IndicesType& indices() { return m_indices; }
-
-  protected:
-
-    IndicesType m_indices;
+}  // namespace internal
+
+template <int SizeAtCompileTime, int MaxSizeAtCompileTime, typename StorageIndex_, int PacketAccess>
+class Map<Transpositions<SizeAtCompileTime, MaxSizeAtCompileTime, StorageIndex_>, PacketAccess>
+    : public TranspositionsBase<
+          Map<Transpositions<SizeAtCompileTime, MaxSizeAtCompileTime, StorageIndex_>, PacketAccess> > {
+  typedef internal::traits<Map> Traits;
+
+ public:
+  typedef TranspositionsBase<Map> Base;
+  typedef typename Traits::IndicesType IndicesType;
+  typedef typename IndicesType::Scalar StorageIndex;
+
+  explicit inline Map(const StorageIndex* indicesPtr) : m_indices(indicesPtr) {}
+
+  inline Map(const StorageIndex* indicesPtr, Index size) : m_indices(indicesPtr, size) {}
+
+  /** Copies the \a other transpositions into \c *this */
+  template <typename OtherDerived>
+  Map& operator=(const TranspositionsBase<OtherDerived>& other) {
+    return Base::operator=(other);
+  }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  /** This is a special case of the templated operator=. Its purpose is to
+   * prevent a default operator= from hiding the templated operator=.
+   */
+  Map& operator=(const Map& other) {
+    m_indices = other.m_indices;
+    return *this;
+  }
+#endif
+
+  /** const version of indices(). */
+  EIGEN_DEVICE_FUNC const IndicesType& indices() const { return m_indices; }
+
+  /** \returns a reference to the stored array representing the transpositions. */
+  EIGEN_DEVICE_FUNC IndicesType& indices() { return m_indices; }
+
+ protected:
+  IndicesType m_indices;
 };
 
 namespace internal {
-template<typename _IndicesType>
-struct traits<TranspositionsWrapper<_IndicesType> >
-{
-  typedef typename _IndicesType::Scalar Index;
-  typedef _IndicesType IndicesType;
+template <typename IndicesType_>
+struct traits<TranspositionsWrapper<IndicesType_> > : traits<PermutationWrapper<IndicesType_> > {
+  typedef TranspositionsStorage StorageKind;
 };
-}
+}  // namespace internal
+
+template <typename IndicesType_>
+class TranspositionsWrapper : public TranspositionsBase<TranspositionsWrapper<IndicesType_> > {
+  typedef internal::traits<TranspositionsWrapper> Traits;
+
+ public:
+  typedef TranspositionsBase<TranspositionsWrapper> Base;
+  typedef typename Traits::IndicesType IndicesType;
+  typedef typename IndicesType::Scalar StorageIndex;
+
+  explicit inline TranspositionsWrapper(IndicesType& indices) : m_indices(indices) {}
 
-template<typename _IndicesType>
-class TranspositionsWrapper
- : public TranspositionsBase<TranspositionsWrapper<_IndicesType> >
-{
-    typedef internal::traits<TranspositionsWrapper> Traits;
-  public:
-
-    typedef TranspositionsBase<TranspositionsWrapper> Base;
-    typedef typename Traits::IndicesType IndicesType;
-    typedef typename IndicesType::Scalar Index;
-
-    inline TranspositionsWrapper(IndicesType& a_indices)
-      : m_indices(a_indices)
-    {}
-
-    /** Copies the \a other transpositions into \c *this */
-    template<typename OtherDerived>
-    TranspositionsWrapper& operator=(const TranspositionsBase<OtherDerived>& other)
-    {
-      return Base::operator=(other);
-    }
-
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is a special case of the templated operator=. Its purpose is to
-      * prevent a default operator= from hiding the templated operator=.
-      */
-    TranspositionsWrapper& operator=(const TranspositionsWrapper& other)
-    {
-      m_indices = other.m_indices;
-      return *this;
-    }
-    #endif
-
-    /** const version of indices(). */
-    const IndicesType& indices() const { return m_indices; }
-
-    /** \returns a reference to the stored array representing the transpositions. */
-    IndicesType& indices() { return m_indices; }
-
-  protected:
-
-    const typename IndicesType::Nested m_indices;
+  /** Copies the \a other transpositions into \c *this */
+  template <typename OtherDerived>
+  TranspositionsWrapper& operator=(const TranspositionsBase<OtherDerived>& other) {
+    return Base::operator=(other);
+  }
+
+  /** const version of indices(). */
+  EIGEN_DEVICE_FUNC const IndicesType& indices() const { return m_indices; }
+
+  /** \returns a reference to the stored array representing the transpositions. */
+  EIGEN_DEVICE_FUNC IndicesType& indices() { return m_indices; }
+
+ protected:
+  typename IndicesType::Nested m_indices;
 };
 
 /** \returns the \a matrix with the \a transpositions applied to the columns.
-  */
-template<typename Derived, typename TranspositionsDerived>
-inline const internal::transposition_matrix_product_retval<TranspositionsDerived, Derived, OnTheRight>
-operator*(const MatrixBase<Derived>& matrix,
-          const TranspositionsBase<TranspositionsDerived> &transpositions)
-{
-  return internal::transposition_matrix_product_retval
-           <TranspositionsDerived, Derived, OnTheRight>
-           (transpositions.derived(), matrix.derived());
+ */
+template <typename MatrixDerived, typename TranspositionsDerived>
+EIGEN_DEVICE_FUNC const Product<MatrixDerived, TranspositionsDerived, AliasFreeProduct> operator*(
+    const MatrixBase<MatrixDerived>& matrix, const TranspositionsBase<TranspositionsDerived>& transpositions) {
+  return Product<MatrixDerived, TranspositionsDerived, AliasFreeProduct>(matrix.derived(), transpositions.derived());
 }
 
 /** \returns the \a matrix with the \a transpositions applied to the rows.
-  */
-template<typename Derived, typename TranspositionDerived>
-inline const internal::transposition_matrix_product_retval
-               <TranspositionDerived, Derived, OnTheLeft>
-operator*(const TranspositionsBase<TranspositionDerived> &transpositions,
-          const MatrixBase<Derived>& matrix)
-{
-  return internal::transposition_matrix_product_retval
-           <TranspositionDerived, Derived, OnTheLeft>
-           (transpositions.derived(), matrix.derived());
+ */
+template <typename TranspositionsDerived, typename MatrixDerived>
+EIGEN_DEVICE_FUNC const Product<TranspositionsDerived, MatrixDerived, AliasFreeProduct> operator*(
+    const TranspositionsBase<TranspositionsDerived>& transpositions, const MatrixBase<MatrixDerived>& matrix) {
+  return Product<TranspositionsDerived, MatrixDerived, AliasFreeProduct>(transpositions.derived(), matrix.derived());
 }
 
+// Template partial specialization for transposed/inverse transpositions
+
 namespace internal {
 
-template<typename TranspositionType, typename MatrixType, int Side, bool Transposed>
-struct traits<transposition_matrix_product_retval<TranspositionType, MatrixType, Side, Transposed> >
-{
-  typedef typename MatrixType::PlainObject ReturnType;
-};
+template <typename Derived>
+struct traits<Transpose<TranspositionsBase<Derived> > > : traits<Derived> {};
 
-template<typename TranspositionType, typename MatrixType, int Side, bool Transposed>
-struct transposition_matrix_product_retval
- : public ReturnByValue<transposition_matrix_product_retval<TranspositionType, MatrixType, Side, Transposed> >
-{
-    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-    typedef typename TranspositionType::Index Index;
-
-    transposition_matrix_product_retval(const TranspositionType& tr, const MatrixType& matrix)
-      : m_transpositions(tr), m_matrix(matrix)
-    {}
-
-    inline int rows() const { return m_matrix.rows(); }
-    inline int cols() const { return m_matrix.cols(); }
-
-    template<typename Dest> inline void evalTo(Dest& dst) const
-    {
-      const int size = m_transpositions.size();
-      Index j = 0;
-
-      if(!(is_same<MatrixTypeNestedCleaned,Dest>::value && extract_data(dst) == extract_data(m_matrix)))
-        dst = m_matrix;
-
-      for(int k=(Transposed?size-1:0) ; Transposed?k>=0:k<size ; Transposed?--k:++k)
-        if((j=m_transpositions.coeff(k))!=k)
-        {
-          if(Side==OnTheLeft)
-            dst.row(k).swap(dst.row(j));
-          else if(Side==OnTheRight)
-            dst.col(k).swap(dst.col(j));
-        }
-    }
-
-  protected:
-    const TranspositionType& m_transpositions;
-    typename MatrixType::Nested m_matrix;
-};
+}  // end namespace internal
+
+template <typename TranspositionsDerived>
+class Transpose<TranspositionsBase<TranspositionsDerived> > {
+  typedef TranspositionsDerived TranspositionType;
+  typedef typename TranspositionType::IndicesType IndicesType;
+
+ public:
+  explicit Transpose(const TranspositionType& t) : m_transpositions(t) {}
+
+  EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return m_transpositions.size(); }
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_transpositions.size(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_transpositions.size(); }
+
+  /** \returns the \a matrix with the inverse transpositions applied to the columns.
+   */
+  template <typename OtherDerived>
+  friend const Product<OtherDerived, Transpose, AliasFreeProduct> operator*(const MatrixBase<OtherDerived>& matrix,
+                                                                            const Transpose& trt) {
+    return Product<OtherDerived, Transpose, AliasFreeProduct>(matrix.derived(), trt);
+  }
+
+  /** \returns the \a matrix with the inverse transpositions applied to the rows.
+   */
+  template <typename OtherDerived>
+  const Product<Transpose, OtherDerived, AliasFreeProduct> operator*(const MatrixBase<OtherDerived>& matrix) const {
+    return Product<Transpose, OtherDerived, AliasFreeProduct>(*this, matrix.derived());
+  }
+
+  EIGEN_DEVICE_FUNC const TranspositionType& nestedExpression() const { return m_transpositions; }
 
-} // end namespace internal
-
-/* Template partial specialization for transposed/inverse transpositions */
-
-template<typename TranspositionsDerived>
-class Transpose<TranspositionsBase<TranspositionsDerived> >
-{
-    typedef TranspositionsDerived TranspositionType;
-    typedef typename TranspositionType::IndicesType IndicesType;
-  public:
-
-    Transpose(const TranspositionType& t) : m_transpositions(t) {}
-
-    inline int size() const { return m_transpositions.size(); }
-
-    /** \returns the \a matrix with the inverse transpositions applied to the columns.
-      */
-    template<typename Derived> friend
-    inline const internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheRight, true>
-    operator*(const MatrixBase<Derived>& matrix, const Transpose& trt)
-    {
-      return internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheRight, true>(trt.m_transpositions, matrix.derived());
-    }
-
-    /** \returns the \a matrix with the inverse transpositions applied to the rows.
-      */
-    template<typename Derived>
-    inline const internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheLeft, true>
-    operator*(const MatrixBase<Derived>& matrix) const
-    {
-      return internal::transposition_matrix_product_retval<TranspositionType, Derived, OnTheLeft, true>(m_transpositions, matrix.derived());
-    }
-
-  protected:
-    const TranspositionType& m_transpositions;
+ protected:
+  const TranspositionType& m_transpositions;
 };
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_TRANSPOSITIONS_H
+#endif  // EIGEN_TRANSPOSITIONS_H
diff --git a/inst/include/Eigen/src/Core/TriangularMatrix.h b/inst/include/Eigen/src/Core/TriangularMatrix.h
index 4d65392c..27ad78ec 100644
--- a/inst/include/Eigen/src/Core/TriangularMatrix.h
+++ b/inst/include/Eigen/src/Core/TriangularMatrix.h
@@ -11,829 +11,890 @@
 #ifndef EIGEN_TRIANGULARMATRIX_H
 #define EIGEN_TRIANGULARMATRIX_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
-  
-template<int Side, typename TriangularType, typename Rhs> struct triangular_solve_retval;
-  
+
+template <int Side, typename TriangularType, typename Rhs>
+struct triangular_solve_retval;
+
 }
 
-/** \internal
-  *
-  * \class TriangularBase
-  * \ingroup Core_Module
-  *
-  * \brief Base class for triangular part in a matrix
-  */
-template<typename Derived> class TriangularBase : public EigenBase<Derived>
-{
-  public:
-
-    enum {
-      Mode = internal::traits<Derived>::Mode,
-      CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
-      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
-      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
-      MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime
-    };
-    typedef typename internal::traits<Derived>::Scalar Scalar;
-    typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
-    typedef typename internal::traits<Derived>::DenseMatrixType DenseMatrixType;
-    typedef DenseMatrixType DenseType;
-
-    inline TriangularBase() { eigen_assert(!((Mode&UnitDiag) && (Mode&ZeroDiag))); }
-
-    inline Index rows() const { return derived().rows(); }
-    inline Index cols() const { return derived().cols(); }
-    inline Index outerStride() const { return derived().outerStride(); }
-    inline Index innerStride() const { return derived().innerStride(); }
-
-    inline Scalar coeff(Index row, Index col) const  { return derived().coeff(row,col); }
-    inline Scalar& coeffRef(Index row, Index col) { return derived().coeffRef(row,col); }
-
-    /** \see MatrixBase::copyCoeff(row,col)
-      */
-    template<typename Other>
-    EIGEN_STRONG_INLINE void copyCoeff(Index row, Index col, Other& other)
-    {
-      derived().coeffRef(row, col) = other.coeff(row, col);
-    }
+/** \class TriangularBase
+ * \ingroup Core_Module
+ *
+ * \brief Base class for triangular part in a matrix
+ */
+template <typename Derived>
+class TriangularBase : public EigenBase<Derived> {
+ public:
+  enum {
+    Mode = internal::traits<Derived>::Mode,
+    RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+    ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+    MaxRowsAtCompileTime = internal::traits<Derived>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = internal::traits<Derived>::MaxColsAtCompileTime,
 
-    inline Scalar operator()(Index row, Index col) const
-    {
-      check_coordinates(row, col);
-      return coeff(row,col);
-    }
-    inline Scalar& operator()(Index row, Index col)
-    {
-      check_coordinates(row, col);
-      return coeffRef(row,col);
-    }
+    SizeAtCompileTime = (internal::size_of_xpr_at_compile_time<Derived>::ret),
+    /**< This is equal to the number of coefficients, i.e. the number of
+     * rows times the number of columns, or to \a Dynamic if this is not
+     * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
-    inline Derived& derived() { return *static_cast<Derived*>(this); }
-    #endif // not EIGEN_PARSED_BY_DOXYGEN
-
-    template<typename DenseDerived>
-    void evalTo(MatrixBase<DenseDerived> &other) const;
-    template<typename DenseDerived>
-    void evalToLazy(MatrixBase<DenseDerived> &other) const;
-
-    DenseMatrixType toDenseMatrix() const
-    {
-      DenseMatrixType res(rows(), cols());
-      evalToLazy(res);
-      return res;
-    }
+    MaxSizeAtCompileTime = internal::size_at_compile_time(internal::traits<Derived>::MaxRowsAtCompileTime,
+                                                          internal::traits<Derived>::MaxColsAtCompileTime)
 
-  protected:
-
-    void check_coordinates(Index row, Index col) const
-    {
-      EIGEN_ONLY_USED_FOR_DEBUG(row);
-      EIGEN_ONLY_USED_FOR_DEBUG(col);
-      eigen_assert(col>=0 && col<cols() && row>=0 && row<rows());
-      const int mode = int(Mode) & ~SelfAdjoint;
-      EIGEN_ONLY_USED_FOR_DEBUG(mode);
-      eigen_assert((mode==Upper && col>=row)
-                || (mode==Lower && col<=row)
-                || ((mode==StrictlyUpper || mode==UnitUpper) && col>row)
-                || ((mode==StrictlyLower || mode==UnitLower) && col<row));
-    }
+  };
+  typedef typename internal::traits<Derived>::Scalar Scalar;
+  typedef typename internal::traits<Derived>::StorageKind StorageKind;
+  typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
+  typedef typename internal::traits<Derived>::FullMatrixType DenseMatrixType;
+  typedef DenseMatrixType DenseType;
+  typedef Derived const& Nested;
+
+  EIGEN_DEVICE_FUNC inline TriangularBase() {
+    eigen_assert(!((int(Mode) & int(UnitDiag)) && (int(Mode) & int(ZeroDiag))));
+  }
 
-    #ifdef EIGEN_INTERNAL_DEBUGGING
-    void check_coordinates_internal(Index row, Index col) const
-    {
-      check_coordinates(row, col);
-    }
-    #else
-    void check_coordinates_internal(Index , Index ) const {}
-    #endif
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return derived().rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return derived().cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index outerStride() const noexcept { return derived().outerStride(); }
+  EIGEN_DEVICE_FUNC constexpr Index innerStride() const noexcept { return derived().innerStride(); }
+
+  // dummy resize function
+  EIGEN_DEVICE_FUNC void resize(Index rows, Index cols) {
+    EIGEN_UNUSED_VARIABLE(rows);
+    EIGEN_UNUSED_VARIABLE(cols);
+    eigen_assert(rows == this->rows() && cols == this->cols());
+  }
+
+  EIGEN_DEVICE_FUNC inline Scalar coeff(Index row, Index col) const { return derived().coeff(row, col); }
+  EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col) { return derived().coeffRef(row, col); }
 
+  /** \see MatrixBase::copyCoeff(row,col)
+   */
+  template <typename Other>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void copyCoeff(Index row, Index col, Other& other) {
+    derived().coeffRef(row, col) = other.coeff(row, col);
+  }
+
+  EIGEN_DEVICE_FUNC inline Scalar operator()(Index row, Index col) const {
+    check_coordinates(row, col);
+    return coeff(row, col);
+  }
+  EIGEN_DEVICE_FUNC inline Scalar& operator()(Index row, Index col) {
+    check_coordinates(row, col);
+    return coeffRef(row, col);
+  }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  EIGEN_DEVICE_FUNC inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+  EIGEN_DEVICE_FUNC inline Derived& derived() { return *static_cast<Derived*>(this); }
+#endif  // not EIGEN_PARSED_BY_DOXYGEN
+
+  template <typename DenseDerived>
+  EIGEN_DEVICE_FUNC void evalTo(MatrixBase<DenseDerived>& other) const;
+  template <typename DenseDerived>
+  EIGEN_DEVICE_FUNC void evalToLazy(MatrixBase<DenseDerived>& other) const;
+
+  EIGEN_DEVICE_FUNC DenseMatrixType toDenseMatrix() const {
+    DenseMatrixType res(rows(), cols());
+    evalToLazy(res);
+    return res;
+  }
+
+ protected:
+  void check_coordinates(Index row, Index col) const {
+    EIGEN_ONLY_USED_FOR_DEBUG(row);
+    EIGEN_ONLY_USED_FOR_DEBUG(col);
+    eigen_assert(col >= 0 && col < cols() && row >= 0 && row < rows());
+    const int mode = int(Mode) & ~SelfAdjoint;
+    EIGEN_ONLY_USED_FOR_DEBUG(mode);
+    eigen_assert((mode == Upper && col >= row) || (mode == Lower && col <= row) ||
+                 ((mode == StrictlyUpper || mode == UnitUpper) && col > row) ||
+                 ((mode == StrictlyLower || mode == UnitLower) && col < row));
+  }
+
+#ifdef EIGEN_INTERNAL_DEBUGGING
+  void check_coordinates_internal(Index row, Index col) const { check_coordinates(row, col); }
+#else
+  void check_coordinates_internal(Index, Index) const {}
+#endif
 };
 
 /** \class TriangularView
-  * \ingroup Core_Module
-  *
-  * \brief Base class for triangular part in a matrix
-  *
-  * \param MatrixType the type of the object in which we are taking the triangular part
-  * \param Mode the kind of triangular matrix expression to construct. Can be #Upper,
-  *             #Lower, #UnitUpper, #UnitLower, #StrictlyUpper, or #StrictlyLower.
-  *             This is in fact a bit field; it must have either #Upper or #Lower, 
-  *             and additionnaly it may have #UnitDiag or #ZeroDiag or neither.
-  *
-  * This class represents a triangular part of a matrix, not necessarily square. Strictly speaking, for rectangular
-  * matrices one should speak of "trapezoid" parts. This class is the return type
-  * of MatrixBase::triangularView() and most of the time this is the only way it is used.
-  *
-  * \sa MatrixBase::triangularView()
-  */
+ * \ingroup Core_Module
+ *
+ * \brief Expression of a triangular part in a matrix
+ *
+ * \tparam MatrixType the type of the object in which we are taking the triangular part
+ * \tparam Mode the kind of triangular matrix expression to construct. Can be #Upper,
+ *             #Lower, #UnitUpper, #UnitLower, #StrictlyUpper, or #StrictlyLower.
+ *             This is in fact a bit field; it must have either #Upper or #Lower,
+ *             and additionally it may have #UnitDiag or #ZeroDiag or neither.
+ *
+ * This class represents a triangular part of a matrix, not necessarily square. Strictly speaking, for rectangular
+ * matrices one should speak of "trapezoid" parts. This class is the return type
+ * of MatrixBase::triangularView() and SparseMatrixBase::triangularView(), and most of the time this is the only way it
+ * is used.
+ *
+ * \sa MatrixBase::triangularView()
+ */
 namespace internal {
-template<typename MatrixType, unsigned int _Mode>
-struct traits<TriangularView<MatrixType, _Mode> > : traits<MatrixType>
-{
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
-  typedef typename remove_reference<MatrixTypeNested>::type MatrixTypeNestedNonRef;
-  typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
+template <typename MatrixType, unsigned int Mode_>
+struct traits<TriangularView<MatrixType, Mode_>> : traits<MatrixType> {
+  typedef typename ref_selector<MatrixType>::non_const_type MatrixTypeNested;
+  typedef std::remove_reference_t<MatrixTypeNested> MatrixTypeNestedNonRef;
+  typedef remove_all_t<MatrixTypeNested> MatrixTypeNestedCleaned;
+  typedef typename MatrixType::PlainObject FullMatrixType;
   typedef MatrixType ExpressionType;
-  typedef typename MatrixType::PlainObject DenseMatrixType;
   enum {
-    Mode = _Mode,
-    Flags = (MatrixTypeNestedCleaned::Flags & (HereditaryBits) & (~(PacketAccessBit | DirectAccessBit | LinearAccessBit))) | Mode,
-    CoeffReadCost = MatrixTypeNestedCleaned::CoeffReadCost
+    Mode = Mode_,
+    FlagsLvalueBit = is_lvalue<MatrixType>::value ? LvalueBit : 0,
+    Flags = (MatrixTypeNestedCleaned::Flags & (HereditaryBits | FlagsLvalueBit) &
+             (~(PacketAccessBit | DirectAccessBit | LinearAccessBit)))
   };
 };
-}
+}  // namespace internal
 
-template<int Mode, bool LhsIsTriangular,
-         typename Lhs, bool LhsIsVector,
-         typename Rhs, bool RhsIsVector>
-struct TriangularProduct;
-
-template<typename _MatrixType, unsigned int _Mode> class TriangularView
-  : public TriangularBase<TriangularView<_MatrixType, _Mode> >
-{
-  public:
-
-    typedef TriangularBase<TriangularView> Base;
-    typedef typename internal::traits<TriangularView>::Scalar Scalar;
-
-    typedef _MatrixType MatrixType;
-    typedef typename internal::traits<TriangularView>::DenseMatrixType DenseMatrixType;
-    typedef DenseMatrixType PlainObject;
-
-  protected:
-    typedef typename internal::traits<TriangularView>::MatrixTypeNested MatrixTypeNested;
-    typedef typename internal::traits<TriangularView>::MatrixTypeNestedNonRef MatrixTypeNestedNonRef;
-    typedef typename internal::traits<TriangularView>::MatrixTypeNestedCleaned MatrixTypeNestedCleaned;
-
-    typedef typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type MatrixConjugateReturnType;
-    
-  public:
-    using Base::evalToLazy;
-  
-
-    typedef typename internal::traits<TriangularView>::StorageKind StorageKind;
-    typedef typename internal::traits<TriangularView>::Index Index;
-
-    enum {
-      Mode = _Mode,
-      TransposeMode = (Mode & Upper ? Lower : 0)
-                    | (Mode & Lower ? Upper : 0)
-                    | (Mode & (UnitDiag))
-                    | (Mode & (ZeroDiag))
-    };
-
-    inline TriangularView(const MatrixType& matrix) : m_matrix(matrix)
-    {}
-
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-    inline Index outerStride() const { return m_matrix.outerStride(); }
-    inline Index innerStride() const { return m_matrix.innerStride(); }
-
-    /** \sa MatrixBase::operator+=() */
-    template<typename Other> TriangularView&  operator+=(const DenseBase<Other>& other) { return *this = m_matrix + other.derived(); }
-    /** \sa MatrixBase::operator-=() */
-    template<typename Other> TriangularView&  operator-=(const DenseBase<Other>& other) { return *this = m_matrix - other.derived(); }
-    /** \sa MatrixBase::operator*=() */
-    TriangularView&  operator*=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = m_matrix * other; }
-    /** \sa MatrixBase::operator/=() */
-    TriangularView&  operator/=(const typename internal::traits<MatrixType>::Scalar& other) { return *this = m_matrix / other; }
-
-    /** \sa MatrixBase::fill() */
-    void fill(const Scalar& value) { setConstant(value); }
-    /** \sa MatrixBase::setConstant() */
-    TriangularView& setConstant(const Scalar& value)
-    { return *this = MatrixType::Constant(rows(), cols(), value); }
-    /** \sa MatrixBase::setZero() */
-    TriangularView& setZero() { return setConstant(Scalar(0)); }
-    /** \sa MatrixBase::setOnes() */
-    TriangularView& setOnes() { return setConstant(Scalar(1)); }
-
-    /** \sa MatrixBase::coeff()
-      * \warning the coordinates must fit into the referenced triangular part
-      */
-    inline Scalar coeff(Index row, Index col) const
-    {
-      Base::check_coordinates_internal(row, col);
-      return m_matrix.coeff(row, col);
-    }
+template <typename MatrixType_, unsigned int Mode_, typename StorageKind>
+class TriangularViewImpl;
 
-    /** \sa MatrixBase::coeffRef()
-      * \warning the coordinates must fit into the referenced triangular part
-      */
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      Base::check_coordinates_internal(row, col);
-      return m_matrix.const_cast_derived().coeffRef(row, col);
-    }
-
-    const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
-    MatrixTypeNestedCleaned& nestedExpression() { return *const_cast<MatrixTypeNestedCleaned*>(&m_matrix); }
+template <typename MatrixType_, unsigned int Mode_>
+class TriangularView
+    : public TriangularViewImpl<MatrixType_, Mode_, typename internal::traits<MatrixType_>::StorageKind> {
+ public:
+  typedef TriangularViewImpl<MatrixType_, Mode_, typename internal::traits<MatrixType_>::StorageKind> Base;
+  typedef typename internal::traits<TriangularView>::Scalar Scalar;
+  typedef MatrixType_ MatrixType;
 
-    /** Assigns a triangular matrix to a triangular part of a dense matrix */
-    template<typename OtherDerived>
-    TriangularView& operator=(const TriangularBase<OtherDerived>& other);
+ protected:
+  typedef typename internal::traits<TriangularView>::MatrixTypeNested MatrixTypeNested;
+  typedef typename internal::traits<TriangularView>::MatrixTypeNestedNonRef MatrixTypeNestedNonRef;
 
-    template<typename OtherDerived>
-    TriangularView& operator=(const MatrixBase<OtherDerived>& other);
+  typedef internal::remove_all_t<typename MatrixType::ConjugateReturnType> MatrixConjugateReturnType;
+  typedef TriangularView<std::add_const_t<MatrixType>, Mode_> ConstTriangularView;
 
-    TriangularView& operator=(const TriangularView& other)
-    { return *this = other.nestedExpression(); }
+ public:
+  typedef typename internal::traits<TriangularView>::StorageKind StorageKind;
+  typedef typename internal::traits<TriangularView>::MatrixTypeNestedCleaned NestedExpression;
 
-    template<typename OtherDerived>
-    void lazyAssign(const TriangularBase<OtherDerived>& other);
+  enum {
+    Mode = Mode_,
+    Flags = internal::traits<TriangularView>::Flags,
+    TransposeMode = (int(Mode) & int(Upper) ? Lower : 0) | (int(Mode) & int(Lower) ? Upper : 0) |
+                    (int(Mode) & int(UnitDiag)) | (int(Mode) & int(ZeroDiag)),
+    IsVectorAtCompileTime = false
+  };
 
-    template<typename OtherDerived>
-    void lazyAssign(const MatrixBase<OtherDerived>& other);
+  EIGEN_DEVICE_FUNC explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix) {}
 
-    /** \sa MatrixBase::conjugate() */
-    inline TriangularView<MatrixConjugateReturnType,Mode> conjugate()
-    { return m_matrix.conjugate(); }
-    /** \sa MatrixBase::conjugate() const */
-    inline const TriangularView<MatrixConjugateReturnType,Mode> conjugate() const
-    { return m_matrix.conjugate(); }
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView)
 
-    /** \sa MatrixBase::adjoint() const */
-    inline const TriangularView<const typename MatrixType::AdjointReturnType,TransposeMode> adjoint() const
-    { return m_matrix.adjoint(); }
+  /** \copydoc EigenBase::rows() */
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  /** \copydoc EigenBase::cols() */
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
 
-    /** \sa MatrixBase::transpose() */
-    inline TriangularView<Transpose<MatrixType>,TransposeMode> transpose()
-    {
-      EIGEN_STATIC_ASSERT_LVALUE(MatrixType)
-      return m_matrix.const_cast_derived().transpose();
-    }
-    /** \sa MatrixBase::transpose() const */
-    inline const TriangularView<Transpose<MatrixType>,TransposeMode> transpose() const
-    {
-      return m_matrix.transpose();
-    }
+  /** \returns a const reference to the nested expression */
+  EIGEN_DEVICE_FUNC const NestedExpression& nestedExpression() const { return m_matrix; }
 
-    /** Efficient triangular matrix times vector/matrix product */
-    template<typename OtherDerived>
-    TriangularProduct<Mode, true, MatrixType, false, OtherDerived, OtherDerived::ColsAtCompileTime==1>
-    operator*(const MatrixBase<OtherDerived>& rhs) const
-    {
-      return TriangularProduct
-              <Mode, true, MatrixType, false, OtherDerived, OtherDerived::ColsAtCompileTime==1>
-              (m_matrix, rhs.derived());
-    }
+  /** \returns a reference to the nested expression */
+  EIGEN_DEVICE_FUNC NestedExpression& nestedExpression() { return m_matrix; }
 
-    /** Efficient vector/matrix times triangular matrix product */
-    template<typename OtherDerived> friend
-    TriangularProduct<Mode, false, OtherDerived, OtherDerived::RowsAtCompileTime==1, MatrixType, false>
-    operator*(const MatrixBase<OtherDerived>& lhs, const TriangularView& rhs)
-    {
-      return TriangularProduct
-              <Mode, false, OtherDerived, OtherDerived::RowsAtCompileTime==1, MatrixType, false>
-              (lhs.derived(),rhs.m_matrix);
-    }
-
-    #ifdef EIGEN2_SUPPORT
-    template<typename OtherDerived>
-    struct eigen2_product_return_type
-    {
-      typedef typename TriangularView<MatrixType,Mode>::DenseMatrixType DenseMatrixType;
-      typedef typename OtherDerived::PlainObject::DenseType OtherPlainObject;
-      typedef typename ProductReturnType<DenseMatrixType, OtherPlainObject>::Type ProdRetType;
-      typedef typename ProdRetType::PlainObject type;
-    };
-    template<typename OtherDerived>
-    const typename eigen2_product_return_type<OtherDerived>::type
-    operator*(const EigenBase<OtherDerived>& rhs) const
-    {
-      typename OtherDerived::PlainObject::DenseType rhsPlainObject;
-      rhs.evalTo(rhsPlainObject);
-      return this->toDenseMatrix() * rhsPlainObject;
-    }
-    template<typename OtherMatrixType>
-    bool isApprox(const TriangularView<OtherMatrixType, Mode>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return this->toDenseMatrix().isApprox(other.toDenseMatrix(), precision);
-    }
-    template<typename OtherDerived>
-    bool isApprox(const MatrixBase<OtherDerived>& other, typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision()) const
-    {
-      return this->toDenseMatrix().isApprox(other, precision);
-    }
-    #endif // EIGEN2_SUPPORT
+  typedef TriangularView<const MatrixConjugateReturnType, Mode> ConjugateReturnType;
+  /** \sa MatrixBase::conjugate() const */
+  EIGEN_DEVICE_FUNC inline const ConjugateReturnType conjugate() const {
+    return ConjugateReturnType(m_matrix.conjugate());
+  }
 
-    template<int Side, typename Other>
-    inline const internal::triangular_solve_retval<Side,TriangularView, Other>
-    solve(const MatrixBase<Other>& other) const;
+  /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+   *           returns \c *this otherwise.
+   */
+  template <bool Cond>
+  EIGEN_DEVICE_FUNC inline std::conditional_t<Cond, ConjugateReturnType, ConstTriangularView> conjugateIf() const {
+    typedef std::conditional_t<Cond, ConjugateReturnType, ConstTriangularView> ReturnType;
+    return ReturnType(m_matrix.template conjugateIf<Cond>());
+  }
 
-    template<int Side, typename OtherDerived>
-    void solveInPlace(const MatrixBase<OtherDerived>& other) const;
+  typedef TriangularView<const typename MatrixType::AdjointReturnType, TransposeMode> AdjointReturnType;
+  /** \sa MatrixBase::adjoint() const */
+  EIGEN_DEVICE_FUNC inline const AdjointReturnType adjoint() const { return AdjointReturnType(m_matrix.adjoint()); }
+
+  typedef TriangularView<typename MatrixType::TransposeReturnType, TransposeMode> TransposeReturnType;
+  /** \sa MatrixBase::transpose() */
+  template <class Dummy = int>
+  EIGEN_DEVICE_FUNC inline TransposeReturnType transpose(
+      std::enable_if_t<Eigen::internal::is_lvalue<MatrixType>::value, Dummy*> = nullptr) {
+    typename MatrixType::TransposeReturnType tmp(m_matrix);
+    return TransposeReturnType(tmp);
+  }
 
-    template<typename Other>
-    inline const internal::triangular_solve_retval<OnTheLeft,TriangularView, Other> 
-    solve(const MatrixBase<Other>& other) const
-    { return solve<OnTheLeft>(other); }
+  typedef TriangularView<const typename MatrixType::ConstTransposeReturnType, TransposeMode> ConstTransposeReturnType;
+  /** \sa MatrixBase::transpose() const */
+  EIGEN_DEVICE_FUNC inline const ConstTransposeReturnType transpose() const {
+    return ConstTransposeReturnType(m_matrix.transpose());
+  }
 
-    template<typename OtherDerived>
-    void solveInPlace(const MatrixBase<OtherDerived>& other) const
-    { return solveInPlace<OnTheLeft>(other); }
+  template <typename Other>
+  EIGEN_DEVICE_FUNC inline const Solve<TriangularView, Other> solve(const MatrixBase<Other>& other) const {
+    return Solve<TriangularView, Other>(*this, other.derived());
+  }
 
-    const SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView() const
-    {
-      EIGEN_STATIC_ASSERT((Mode&UnitDiag)==0,PROGRAMMING_ERROR);
-      return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
-    }
-    SelfAdjointView<MatrixTypeNestedNonRef,Mode> selfadjointView()
-    {
-      EIGEN_STATIC_ASSERT((Mode&UnitDiag)==0,PROGRAMMING_ERROR);
-      return SelfAdjointView<MatrixTypeNestedNonRef,Mode>(m_matrix);
-    }
+// workaround MSVC ICE
+#if EIGEN_COMP_MSVC
+  template <int Side, typename Other>
+  EIGEN_DEVICE_FUNC inline const internal::triangular_solve_retval<Side, TriangularView, Other> solve(
+      const MatrixBase<Other>& other) const {
+    return Base::template solve<Side>(other);
+  }
+#else
+  using Base::solve;
+#endif
 
-    template<typename OtherDerived>
-    void swap(TriangularBase<OtherDerived> const & other)
-    {
-      TriangularView<SwapWrapper<MatrixType>,Mode>(const_cast<MatrixType&>(m_matrix)).lazyAssign(other.derived());
-    }
+  /** \returns a selfadjoint view of the referenced triangular part which must be either \c #Upper or \c #Lower.
+   *
+   * This is a shortcut for \code this->nestedExpression().selfadjointView<(*this)::Mode>() \endcode
+   * \sa MatrixBase::selfadjointView() */
+  EIGEN_DEVICE_FUNC SelfAdjointView<MatrixTypeNestedNonRef, Mode> selfadjointView() {
+    EIGEN_STATIC_ASSERT((Mode & (UnitDiag | ZeroDiag)) == 0, PROGRAMMING_ERROR);
+    return SelfAdjointView<MatrixTypeNestedNonRef, Mode>(m_matrix);
+  }
 
-    template<typename OtherDerived>
-    void swap(MatrixBase<OtherDerived> const & other)
-    {
-      SwapWrapper<MatrixType> swaper(const_cast<MatrixType&>(m_matrix));
-      TriangularView<SwapWrapper<MatrixType>,Mode>(swaper).lazyAssign(other.derived());
-    }
+  /** This is the const version of selfadjointView() */
+  EIGEN_DEVICE_FUNC const SelfAdjointView<MatrixTypeNestedNonRef, Mode> selfadjointView() const {
+    EIGEN_STATIC_ASSERT((Mode & (UnitDiag | ZeroDiag)) == 0, PROGRAMMING_ERROR);
+    return SelfAdjointView<MatrixTypeNestedNonRef, Mode>(m_matrix);
+  }
 
-    Scalar determinant() const
-    {
-      if (Mode & UnitDiag)
-        return 1;
-      else if (Mode & ZeroDiag)
-        return 0;
-      else
-        return m_matrix.diagonal().prod();
-    }
-    
-    // TODO simplify the following:
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE TriangularView& operator=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    {
-      setZero();
-      return assignProduct(other.derived(),1);
-    }
-    
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE TriangularView& operator+=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    {
-      return assignProduct(other.derived(),1);
-    }
-    
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE TriangularView& operator-=(const ProductBase<ProductDerived, Lhs,Rhs>& other)
-    {
-      return assignProduct(other.derived(),-1);
-    }
-    
-    
-    template<typename ProductDerived>
-    EIGEN_STRONG_INLINE TriangularView& operator=(const ScaledProduct<ProductDerived>& other)
-    {
-      setZero();
-      return assignProduct(other.derived(),other.alpha());
-    }
-    
-    template<typename ProductDerived>
-    EIGEN_STRONG_INLINE TriangularView& operator+=(const ScaledProduct<ProductDerived>& other)
-    {
-      return assignProduct(other.derived(),other.alpha());
-    }
-    
-    template<typename ProductDerived>
-    EIGEN_STRONG_INLINE TriangularView& operator-=(const ScaledProduct<ProductDerived>& other)
-    {
-      return assignProduct(other.derived(),-other.alpha());
-    }
-    
-  protected:
-    
-    template<typename ProductDerived, typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE TriangularView& assignProduct(const ProductBase<ProductDerived, Lhs,Rhs>& prod, const Scalar& alpha);
-    
-    template<int Mode, bool LhsIsTriangular,
-         typename Lhs, bool LhsIsVector,
-         typename Rhs, bool RhsIsVector>
-    EIGEN_STRONG_INLINE TriangularView& assignProduct(const TriangularProduct<Mode, LhsIsTriangular, Lhs, LhsIsVector, Rhs, RhsIsVector>& prod, const Scalar& alpha)
-    {
-      lazyAssign(alpha*prod.eval());
-      return *this;
-    }
+  /** \returns the determinant of the triangular matrix
+   * \sa MatrixBase::determinant() */
+  EIGEN_DEVICE_FUNC Scalar determinant() const {
+    if (Mode & UnitDiag)
+      return 1;
+    else if (Mode & ZeroDiag)
+      return 0;
+    else
+      return m_matrix.diagonal().prod();
+  }
 
-    MatrixTypeNested m_matrix;
+ protected:
+  MatrixTypeNested m_matrix;
 };
 
-/***************************************************************************
-* Implementation of triangular evaluation/assignment
-***************************************************************************/
+/** \ingroup Core_Module
+ *
+ * \brief Base class for a triangular part in a \b dense matrix
+ *
+ * This class is an abstract base class of class TriangularView, and objects of type TriangularViewImpl cannot be
+ * instantiated. It extends class TriangularView with additional methods which available for dense expressions only.
+ *
+ * \sa class TriangularView, MatrixBase::triangularView()
+ */
+template <typename MatrixType_, unsigned int Mode_>
+class TriangularViewImpl<MatrixType_, Mode_, Dense> : public TriangularBase<TriangularView<MatrixType_, Mode_>> {
+ public:
+  typedef TriangularView<MatrixType_, Mode_> TriangularViewType;
+
+  typedef TriangularBase<TriangularViewType> Base;
+  typedef typename internal::traits<TriangularViewType>::Scalar Scalar;
+
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::PlainObject DenseMatrixType;
+  typedef DenseMatrixType PlainObject;
 
-namespace internal {
+ public:
+  using Base::derived;
+  using Base::evalToLazy;
 
-template<typename Derived1, typename Derived2, unsigned int Mode, int UnrollCount, bool ClearOpposite>
-struct triangular_assignment_selector
-{
-  enum {
-    col = (UnrollCount-1) / Derived1::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived1::RowsAtCompileTime
-  };
-  
-  typedef typename Derived1::Scalar Scalar;
+  typedef typename internal::traits<TriangularViewType>::StorageKind StorageKind;
 
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    triangular_assignment_selector<Derived1, Derived2, Mode, UnrollCount-1, ClearOpposite>::run(dst, src);
-
-    eigen_assert( Mode == Upper || Mode == Lower
-            || Mode == StrictlyUpper || Mode == StrictlyLower
-            || Mode == UnitUpper || Mode == UnitLower);
-    if((Mode == Upper && row <= col)
-    || (Mode == Lower && row >= col)
-    || (Mode == StrictlyUpper && row < col)
-    || (Mode == StrictlyLower && row > col)
-    || (Mode == UnitUpper && row < col)
-    || (Mode == UnitLower && row > col))
-      dst.copyCoeff(row, col, src);
-    else if(ClearOpposite)
-    {
-      if (Mode&UnitDiag && row==col)
-        dst.coeffRef(row, col) = Scalar(1);
-      else
-        dst.coeffRef(row, col) = Scalar(0);
-    }
-  }
-};
+  enum { Mode = Mode_, Flags = internal::traits<TriangularViewType>::Flags };
 
-// prevent buggy user code from causing an infinite recursion
-template<typename Derived1, typename Derived2, unsigned int Mode, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, Mode, 0, ClearOpposite>
-{
-  static inline void run(Derived1 &, const Derived2 &) {}
-};
+  /** \returns the outer-stride of the underlying dense matrix
+   * \sa DenseCoeffsBase::outerStride() */
+  EIGEN_DEVICE_FUNC inline Index outerStride() const { return derived().nestedExpression().outerStride(); }
+  /** \returns the inner-stride of the underlying dense matrix
+   * \sa DenseCoeffsBase::innerStride() */
+  EIGEN_DEVICE_FUNC inline Index innerStride() const { return derived().nestedExpression().innerStride(); }
 
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, Upper, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  typedef typename Derived1::Scalar Scalar;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      Index maxi = (std::min)(j, dst.rows()-1);
-      for(Index i = 0; i <= maxi; ++i)
-        dst.copyCoeff(i, j, src);
-      if (ClearOpposite)
-        for(Index i = maxi+1; i < dst.rows(); ++i)
-          dst.coeffRef(i, j) = Scalar(0);
-    }
+  /** \sa MatrixBase::operator+=() */
+  template <typename Other>
+  EIGEN_DEVICE_FUNC TriangularViewType& operator+=(const DenseBase<Other>& other) {
+    internal::call_assignment_no_alias(derived(), other.derived(),
+                                       internal::add_assign_op<Scalar, typename Other::Scalar>());
+    return derived();
   }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, Lower, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      for(Index i = j; i < dst.rows(); ++i)
-        dst.copyCoeff(i, j, src);
-      Index maxi = (std::min)(j, dst.rows());
-      if (ClearOpposite)
-        for(Index i = 0; i < maxi; ++i)
-          dst.coeffRef(i, j) = static_cast<typename Derived1::Scalar>(0);
-    }
+  /** \sa MatrixBase::operator-=() */
+  template <typename Other>
+  EIGEN_DEVICE_FUNC TriangularViewType& operator-=(const DenseBase<Other>& other) {
+    internal::call_assignment_no_alias(derived(), other.derived(),
+                                       internal::sub_assign_op<Scalar, typename Other::Scalar>());
+    return derived();
   }
-};
 
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, StrictlyUpper, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  typedef typename Derived1::Scalar Scalar;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      Index maxi = (std::min)(j, dst.rows());
-      for(Index i = 0; i < maxi; ++i)
-        dst.copyCoeff(i, j, src);
-      if (ClearOpposite)
-        for(Index i = maxi; i < dst.rows(); ++i)
-          dst.coeffRef(i, j) = Scalar(0);
-    }
+  /** \sa MatrixBase::operator*=() */
+  EIGEN_DEVICE_FUNC TriangularViewType& operator*=(const typename internal::traits<MatrixType>::Scalar& other) {
+    return *this = derived().nestedExpression() * other;
   }
-};
-
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, StrictlyLower, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      for(Index i = j+1; i < dst.rows(); ++i)
-        dst.copyCoeff(i, j, src);
-      Index maxi = (std::min)(j, dst.rows()-1);
-      if (ClearOpposite)
-        for(Index i = 0; i <= maxi; ++i)
-          dst.coeffRef(i, j) = static_cast<typename Derived1::Scalar>(0);
-    }
+  /** \sa DenseBase::operator/=() */
+  EIGEN_DEVICE_FUNC TriangularViewType& operator/=(const typename internal::traits<MatrixType>::Scalar& other) {
+    return *this = derived().nestedExpression() / other;
   }
-};
 
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, UnitUpper, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      Index maxi = (std::min)(j, dst.rows());
-      for(Index i = 0; i < maxi; ++i)
-        dst.copyCoeff(i, j, src);
-      if (ClearOpposite)
-      {
-        for(Index i = maxi+1; i < dst.rows(); ++i)
-          dst.coeffRef(i, j) = 0;
-      }
-    }
-    dst.diagonal().setOnes();
+  /** \sa MatrixBase::fill() */
+  EIGEN_DEVICE_FUNC void fill(const Scalar& value) { setConstant(value); }
+  /** \sa MatrixBase::setConstant() */
+  EIGEN_DEVICE_FUNC TriangularViewType& setConstant(const Scalar& value) {
+    return *this = MatrixType::Constant(derived().rows(), derived().cols(), value);
   }
-};
-template<typename Derived1, typename Derived2, bool ClearOpposite>
-struct triangular_assignment_selector<Derived1, Derived2, UnitLower, Dynamic, ClearOpposite>
-{
-  typedef typename Derived1::Index Index;
-  static inline void run(Derived1 &dst, const Derived2 &src)
-  {
-    for(Index j = 0; j < dst.cols(); ++j)
-    {
-      Index maxi = (std::min)(j, dst.rows());
-      for(Index i = maxi+1; i < dst.rows(); ++i)
-        dst.copyCoeff(i, j, src);
-      if (ClearOpposite)
-      {
-        for(Index i = 0; i < maxi; ++i)
-          dst.coeffRef(i, j) = 0;
-      }
-    }
-    dst.diagonal().setOnes();
+  /** \sa MatrixBase::setZero() */
+  EIGEN_DEVICE_FUNC TriangularViewType& setZero() { return setConstant(Scalar(0)); }
+  /** \sa MatrixBase::setOnes() */
+  EIGEN_DEVICE_FUNC TriangularViewType& setOnes() { return setConstant(Scalar(1)); }
+
+  /** \sa MatrixBase::coeff()
+   * \warning the coordinates must fit into the referenced triangular part
+   */
+  EIGEN_DEVICE_FUNC inline Scalar coeff(Index row, Index col) const {
+    Base::check_coordinates_internal(row, col);
+    return derived().nestedExpression().coeff(row, col);
   }
-};
-
-} // end namespace internal
 
-// FIXME should we keep that possibility
-template<typename MatrixType, unsigned int Mode>
-template<typename OtherDerived>
-inline TriangularView<MatrixType, Mode>&
-TriangularView<MatrixType, Mode>::operator=(const MatrixBase<OtherDerived>& other)
-{
-  if(OtherDerived::Flags & EvalBeforeAssigningBit)
-  {
-    typename internal::plain_matrix_type<OtherDerived>::type other_evaluated(other.rows(), other.cols());
-    other_evaluated.template triangularView<Mode>().lazyAssign(other.derived());
-    lazyAssign(other_evaluated);
+  /** \sa MatrixBase::coeffRef()
+   * \warning the coordinates must fit into the referenced triangular part
+   */
+  EIGEN_DEVICE_FUNC inline Scalar& coeffRef(Index row, Index col) {
+    EIGEN_STATIC_ASSERT_LVALUE(TriangularViewType);
+    Base::check_coordinates_internal(row, col);
+    return derived().nestedExpression().coeffRef(row, col);
   }
-  else
-    lazyAssign(other.derived());
-  return *this;
-}
 
-// FIXME should we keep that possibility
-template<typename MatrixType, unsigned int Mode>
-template<typename OtherDerived>
-void TriangularView<MatrixType, Mode>::lazyAssign(const MatrixBase<OtherDerived>& other)
-{
-  enum {
-    unroll = MatrixType::SizeAtCompileTime != Dynamic
-          && internal::traits<OtherDerived>::CoeffReadCost != Dynamic
-          && MatrixType::SizeAtCompileTime*internal::traits<OtherDerived>::CoeffReadCost/2 <= EIGEN_UNROLLING_LIMIT
-  };
-  eigen_assert(m_matrix.rows() == other.rows() && m_matrix.cols() == other.cols());
+  /** Assigns a triangular matrix to a triangular part of a dense matrix */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC TriangularViewType& operator=(const TriangularBase<OtherDerived>& other);
 
-  internal::triangular_assignment_selector
-    <MatrixType, OtherDerived, int(Mode),
-    unroll ? int(MatrixType::SizeAtCompileTime) : Dynamic,
-    false // do not change the opposite triangular part
-    >::run(m_matrix.const_cast_derived(), other.derived());
-}
+  /** Shortcut for\code *this = other.other.triangularView<(*this)::Mode>() \endcode */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC TriangularViewType& operator=(const MatrixBase<OtherDerived>& other);
 
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  EIGEN_DEVICE_FUNC TriangularViewType& operator=(const TriangularViewImpl& other) {
+    return *this = other.derived().nestedExpression();
+  }
 
+  template <typename OtherDerived>
+  /** \deprecated */
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC void lazyAssign(const TriangularBase<OtherDerived>& other);
 
-template<typename MatrixType, unsigned int Mode>
-template<typename OtherDerived>
-inline TriangularView<MatrixType, Mode>&
-TriangularView<MatrixType, Mode>::operator=(const TriangularBase<OtherDerived>& other)
-{
-  eigen_assert(Mode == int(OtherDerived::Mode));
-  if(internal::traits<OtherDerived>::Flags & EvalBeforeAssigningBit)
-  {
-    typename OtherDerived::DenseMatrixType other_evaluated(other.rows(), other.cols());
-    other_evaluated.template triangularView<Mode>().lazyAssign(other.derived().nestedExpression());
-    lazyAssign(other_evaluated);
-  }
-  else
-    lazyAssign(other.derived().nestedExpression());
-  return *this;
-}
+  template <typename OtherDerived>
+  /** \deprecated */
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC void lazyAssign(const MatrixBase<OtherDerived>& other);
+#endif
 
-template<typename MatrixType, unsigned int Mode>
-template<typename OtherDerived>
-void TriangularView<MatrixType, Mode>::lazyAssign(const TriangularBase<OtherDerived>& other)
-{
-  enum {
-    unroll = MatrixType::SizeAtCompileTime != Dynamic
-                   && internal::traits<OtherDerived>::CoeffReadCost != Dynamic
-                   && MatrixType::SizeAtCompileTime * internal::traits<OtherDerived>::CoeffReadCost / 2
-                        <= EIGEN_UNROLLING_LIMIT
-  };
-  eigen_assert(m_matrix.rows() == other.rows() && m_matrix.cols() == other.cols());
+  /** Efficient triangular matrix times vector/matrix product */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC const Product<TriangularViewType, OtherDerived> operator*(
+      const MatrixBase<OtherDerived>& rhs) const {
+    return Product<TriangularViewType, OtherDerived>(derived(), rhs.derived());
+  }
 
-  internal::triangular_assignment_selector
-    <MatrixType, OtherDerived, int(Mode),
-    unroll ? int(MatrixType::SizeAtCompileTime) : Dynamic,
-    false // preserve the opposite triangular part
-    >::run(m_matrix.const_cast_derived(), other.derived().nestedExpression());
-}
+  /** Efficient vector/matrix times triangular matrix product */
+  template <typename OtherDerived>
+  friend EIGEN_DEVICE_FUNC const Product<OtherDerived, TriangularViewType> operator*(
+      const MatrixBase<OtherDerived>& lhs, const TriangularViewImpl& rhs) {
+    return Product<OtherDerived, TriangularViewType>(lhs.derived(), rhs.derived());
+  }
 
-/***************************************************************************
-* Implementation of TriangularBase methods
-***************************************************************************/
+  /** \returns the product of the inverse of \c *this with \a other, \a *this being triangular.
+   *
+   * This function computes the inverse-matrix matrix product inverse(\c *this) * \a other if
+   * \a Side==OnTheLeft (the default), or the right-inverse-multiply  \a other * inverse(\c *this) if
+   * \a Side==OnTheRight.
+   *
+   * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
+   *
+   * The matrix \c *this must be triangular and invertible (i.e., all the coefficients of the
+   * diagonal must be non zero). It works as a forward (resp. backward) substitution if \c *this
+   * is an upper (resp. lower) triangular matrix.
+   *
+   * Example: \include Triangular_solve.cpp
+   * Output: \verbinclude Triangular_solve.out
+   *
+   * This function returns an expression of the inverse-multiply and can works in-place if it is assigned
+   * to the same matrix or vector \a other.
+   *
+   * For users coming from BLAS, this function (and more specifically solveInPlace()) offer
+   * all the operations supported by the \c *TRSV and \c *TRSM BLAS routines.
+   *
+   * \sa TriangularView::solveInPlace()
+   */
+  template <int Side, typename Other>
+  inline const internal::triangular_solve_retval<Side, TriangularViewType, Other> solve(
+      const MatrixBase<Other>& other) const;
+
+  /** "in-place" version of TriangularView::solve() where the result is written in \a other
+   *
+   * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here.
+   * This function will const_cast it, so constness isn't honored here.
+   *
+   * Note that the template parameter \c Side can be omitted, in which case \c Side==OnTheLeft
+   *
+   * See TriangularView:solve() for the details.
+   */
+  template <int Side, typename OtherDerived>
+  EIGEN_DEVICE_FUNC void solveInPlace(const MatrixBase<OtherDerived>& other) const;
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC void solveInPlace(const MatrixBase<OtherDerived>& other) const {
+    return solveInPlace<OnTheLeft>(other);
+  }
 
-/** Assigns a triangular or selfadjoint matrix to a dense matrix.
-  * If the matrix is triangular, the opposite part is set to zero. */
-template<typename Derived>
-template<typename DenseDerived>
-void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived> &other) const
-{
-  if(internal::traits<Derived>::Flags & EvalBeforeAssigningBit)
+  /** Swaps the coefficients of the common triangular parts of two matrices */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+      void
+      swap(TriangularBase<OtherDerived>& other)
+#else
+      void
+      swap(TriangularBase<OtherDerived> const& other)
+#endif
   {
-    typename internal::plain_matrix_type<Derived>::type other_evaluated(rows(), cols());
-    evalToLazy(other_evaluated);
-    other.derived().swap(other_evaluated);
+    EIGEN_STATIC_ASSERT_LVALUE(OtherDerived);
+    call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
   }
-  else
-    evalToLazy(other.derived());
-}
 
-/** Assigns a triangular or selfadjoint matrix to a dense matrix.
-  * If the matrix is triangular, the opposite part is set to zero. */
-template<typename Derived>
-template<typename DenseDerived>
-void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived> &other) const
-{
-  enum {
-    unroll = DenseDerived::SizeAtCompileTime != Dynamic
-                   && internal::traits<Derived>::CoeffReadCost != Dynamic
-                   && DenseDerived::SizeAtCompileTime * internal::traits<Derived>::CoeffReadCost / 2
-                        <= EIGEN_UNROLLING_LIMIT
-  };
-  other.derived().resize(this->rows(), this->cols());
-
-  internal::triangular_assignment_selector
-    <DenseDerived, typename internal::traits<Derived>::MatrixTypeNestedCleaned, Derived::Mode,
-    unroll ? int(DenseDerived::SizeAtCompileTime) : Dynamic,
-    true // clear the opposite triangular part
-    >::run(other.derived(), derived().nestedExpression());
-}
+  /** Shortcut for \code (*this).swap(other.triangularView<(*this)::Mode>()) \endcode */
+  template <typename OtherDerived>
+  /** \deprecated */
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC void swap(MatrixBase<OtherDerived> const& other) {
+    EIGEN_STATIC_ASSERT_LVALUE(OtherDerived);
+    call_assignment(derived(), other.const_cast_derived(), internal::swap_assign_op<Scalar>());
+  }
 
-/***************************************************************************
-* Implementation of TriangularView methods
-***************************************************************************/
+  template <typename RhsType, typename DstType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _solve_impl(const RhsType& rhs, DstType& dst) const {
+    if (!internal::is_same_dense(dst, rhs)) dst = rhs;
+    this->solveInPlace(dst);
+  }
 
-/***************************************************************************
-* Implementation of MatrixBase methods
-***************************************************************************/
+  template <typename ProductType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha,
+                                                                           bool beta);
 
-#ifdef EIGEN2_SUPPORT
+ protected:
+  EIGEN_DEFAULT_COPY_CONSTRUCTOR(TriangularViewImpl)
+  EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TriangularViewImpl)
+};
 
-// implementation of part<>(), including the SelfAdjoint case.
+/***************************************************************************
+ * Implementation of triangular evaluation/assignment
+ ***************************************************************************/
 
-namespace internal {
-template<typename MatrixType, unsigned int Mode>
-struct eigen2_part_return_type
-{
-  typedef TriangularView<MatrixType, Mode> type;
-};
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+// FIXME should we keep that possibility
+template <typename MatrixType, unsigned int Mode>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>& TriangularViewImpl<MatrixType, Mode, Dense>::operator=(
+    const MatrixBase<OtherDerived>& other) {
+  internal::call_assignment_no_alias(derived(), other.derived(),
+                                     internal::assign_op<Scalar, typename OtherDerived::Scalar>());
+  return derived();
+}
 
-template<typename MatrixType>
-struct eigen2_part_return_type<MatrixType, SelfAdjoint>
-{
-  typedef SelfAdjointView<MatrixType, Upper> type;
-};
+// FIXME should we keep that possibility
+template <typename MatrixType, unsigned int Mode>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(const MatrixBase<OtherDerived>& other) {
+  internal::call_assignment_no_alias(derived(), other.template triangularView<Mode>());
 }
 
-/** \deprecated use MatrixBase::triangularView() */
-template<typename Derived>
-template<unsigned int Mode>
-const typename internal::eigen2_part_return_type<Derived, Mode>::type MatrixBase<Derived>::part() const
-{
+template <typename MatrixType, unsigned int Mode>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline TriangularView<MatrixType, Mode>& TriangularViewImpl<MatrixType, Mode, Dense>::operator=(
+    const TriangularBase<OtherDerived>& other) {
+  eigen_assert(Mode == int(OtherDerived::Mode));
+  internal::call_assignment(derived(), other.derived());
   return derived();
 }
 
-/** \deprecated use MatrixBase::triangularView() */
-template<typename Derived>
-template<unsigned int Mode>
-typename internal::eigen2_part_return_type<Derived, Mode>::type MatrixBase<Derived>::part()
-{
-  return derived();
+template <typename MatrixType, unsigned int Mode>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC void TriangularViewImpl<MatrixType, Mode, Dense>::lazyAssign(
+    const TriangularBase<OtherDerived>& other) {
+  eigen_assert(Mode == int(OtherDerived::Mode));
+  internal::call_assignment_no_alias(derived(), other.derived());
 }
 #endif
 
+/***************************************************************************
+ * Implementation of TriangularBase methods
+ ***************************************************************************/
+
+/** Assigns a triangular or selfadjoint matrix to a dense matrix.
+ * If the matrix is triangular, the opposite part is set to zero. */
+template <typename Derived>
+template <typename DenseDerived>
+EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalTo(MatrixBase<DenseDerived>& other) const {
+  evalToLazy(other.derived());
+}
+
+/***************************************************************************
+ * Implementation of TriangularView methods
+ ***************************************************************************/
+
+/***************************************************************************
+ * Implementation of MatrixBase methods
+ ***************************************************************************/
+
 /**
-  * \returns an expression of a triangular view extracted from the current matrix
-  *
-  * The parameter \a Mode can have the following values: \c #Upper, \c #StrictlyUpper, \c #UnitUpper,
-  * \c #Lower, \c #StrictlyLower, \c #UnitLower.
-  *
-  * Example: \include MatrixBase_extract.cpp
-  * Output: \verbinclude MatrixBase_extract.out
-  *
-  * \sa class TriangularView
-  */
-template<typename Derived>
-template<unsigned int Mode>
-typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
-MatrixBase<Derived>::triangularView()
-{
-  return derived();
+ * \returns an expression of a triangular view extracted from the current matrix
+ *
+ * The parameter \a Mode can have the following values: \c #Upper, \c #StrictlyUpper, \c #UnitUpper,
+ * \c #Lower, \c #StrictlyLower, \c #UnitLower.
+ *
+ * Example: \include MatrixBase_triangularView.cpp
+ * Output: \verbinclude MatrixBase_triangularView.out
+ *
+ * \sa class TriangularView
+ */
+template <typename Derived>
+template <unsigned int Mode>
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template TriangularViewReturnType<Mode>::Type
+MatrixBase<Derived>::triangularView() {
+  return typename TriangularViewReturnType<Mode>::Type(derived());
 }
 
 /** This is the const version of MatrixBase::triangularView() */
-template<typename Derived>
-template<unsigned int Mode>
-typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
-MatrixBase<Derived>::triangularView() const
-{
-  return derived();
+template <typename Derived>
+template <unsigned int Mode>
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::template ConstTriangularViewReturnType<Mode>::Type
+MatrixBase<Derived>::triangularView() const {
+  return typename ConstTriangularViewReturnType<Mode>::Type(derived());
 }
 
 /** \returns true if *this is approximately equal to an upper triangular matrix,
-  *          within the precision given by \a prec.
-  *
-  * \sa isLowerTriangular()
-  */
-template<typename Derived>
-bool MatrixBase<Derived>::isUpperTriangular(const RealScalar& prec) const
-{
-  using std::abs;
+ *          within the precision given by \a prec.
+ *
+ * \sa isLowerTriangular()
+ */
+template <typename Derived>
+bool MatrixBase<Derived>::isUpperTriangular(const RealScalar& prec) const {
   RealScalar maxAbsOnUpperPart = static_cast<RealScalar>(-1);
-  for(Index j = 0; j < cols(); ++j)
-  {
-    Index maxi = (std::min)(j, rows()-1);
-    for(Index i = 0; i <= maxi; ++i)
-    {
-      RealScalar absValue = abs(coeff(i,j));
-      if(absValue > maxAbsOnUpperPart) maxAbsOnUpperPart = absValue;
+  for (Index j = 0; j < cols(); ++j) {
+    Index maxi = numext::mini(j, rows() - 1);
+    for (Index i = 0; i <= maxi; ++i) {
+      RealScalar absValue = numext::abs(coeff(i, j));
+      if (absValue > maxAbsOnUpperPart) maxAbsOnUpperPart = absValue;
     }
   }
   RealScalar threshold = maxAbsOnUpperPart * prec;
-  for(Index j = 0; j < cols(); ++j)
-    for(Index i = j+1; i < rows(); ++i)
-      if(abs(coeff(i, j)) > threshold) return false;
+  for (Index j = 0; j < cols(); ++j)
+    for (Index i = j + 1; i < rows(); ++i)
+      if (numext::abs(coeff(i, j)) > threshold) return false;
   return true;
 }
 
 /** \returns true if *this is approximately equal to a lower triangular matrix,
-  *          within the precision given by \a prec.
-  *
-  * \sa isUpperTriangular()
-  */
-template<typename Derived>
-bool MatrixBase<Derived>::isLowerTriangular(const RealScalar& prec) const
-{
-  using std::abs;
+ *          within the precision given by \a prec.
+ *
+ * \sa isUpperTriangular()
+ */
+template <typename Derived>
+bool MatrixBase<Derived>::isLowerTriangular(const RealScalar& prec) const {
   RealScalar maxAbsOnLowerPart = static_cast<RealScalar>(-1);
-  for(Index j = 0; j < cols(); ++j)
-    for(Index i = j; i < rows(); ++i)
-    {
-      RealScalar absValue = abs(coeff(i,j));
-      if(absValue > maxAbsOnLowerPart) maxAbsOnLowerPart = absValue;
+  for (Index j = 0; j < cols(); ++j)
+    for (Index i = j; i < rows(); ++i) {
+      RealScalar absValue = numext::abs(coeff(i, j));
+      if (absValue > maxAbsOnLowerPart) maxAbsOnLowerPart = absValue;
     }
   RealScalar threshold = maxAbsOnLowerPart * prec;
-  for(Index j = 1; j < cols(); ++j)
-  {
-    Index maxi = (std::min)(j, rows()-1);
-    for(Index i = 0; i < maxi; ++i)
-      if(abs(coeff(i, j)) > threshold) return false;
+  for (Index j = 1; j < cols(); ++j) {
+    Index maxi = numext::mini(j, rows() - 1);
+    for (Index i = 0; i < maxi; ++i)
+      if (numext::abs(coeff(i, j)) > threshold) return false;
   }
   return true;
 }
 
-} // end namespace Eigen
+/***************************************************************************
+****************************************************************************
+* Evaluators and Assignment of triangular expressions
+***************************************************************************
+***************************************************************************/
+
+namespace internal {
+
+// TODO currently a triangular expression has the form TriangularView<.,.>
+//      in the future triangular-ness should be defined by the expression traits
+//      such that Transpose<TriangularView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to make
+//      it work)
+template <typename MatrixType, unsigned int Mode>
+struct evaluator_traits<TriangularView<MatrixType, Mode>> {
+  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+  typedef typename glue_shapes<typename evaluator_traits<MatrixType>::Shape, TriangularShape>::type Shape;
+};
+
+template <typename MatrixType, unsigned int Mode>
+struct unary_evaluator<TriangularView<MatrixType, Mode>, IndexBased> : evaluator<internal::remove_all_t<MatrixType>> {
+  typedef TriangularView<MatrixType, Mode> XprType;
+  typedef evaluator<internal::remove_all_t<MatrixType>> Base;
+  EIGEN_DEVICE_FUNC unary_evaluator(const XprType& xpr) : Base(xpr.nestedExpression()) {}
+};
+
+// Additional assignment kinds:
+struct Triangular2Triangular {};
+struct Triangular2Dense {};
+struct Dense2Triangular {};
+
+template <typename Kernel, unsigned int Mode, int UnrollCount, bool ClearOpposite>
+struct triangular_assignment_loop;
+
+/** \internal Specialization of the dense assignment kernel for triangular matrices.
+ * The main difference is that the triangular, diagonal, and opposite parts are processed through three different
+ * functions. \tparam UpLo must be either Lower or Upper \tparam Mode must be either 0, UnitDiag, ZeroDiag, or
+ * SelfAdjoint
+ */
+template <int UpLo, int Mode, int SetOpposite, typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor,
+          int Version = Specialized>
+class triangular_dense_assignment_kernel
+    : public generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version> {
+ protected:
+  typedef generic_dense_assignment_kernel<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor, Version> Base;
+  typedef typename Base::DstXprType DstXprType;
+  typedef typename Base::SrcXprType SrcXprType;
+  using Base::m_dst;
+  using Base::m_functor;
+  using Base::m_src;
+
+ public:
+  typedef typename Base::DstEvaluatorType DstEvaluatorType;
+  typedef typename Base::SrcEvaluatorType SrcEvaluatorType;
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::AssignmentTraits AssignmentTraits;
+
+  EIGEN_DEVICE_FUNC triangular_dense_assignment_kernel(DstEvaluatorType& dst, const SrcEvaluatorType& src,
+                                                       const Functor& func, DstXprType& dstExpr)
+      : Base(dst, src, func, dstExpr) {}
+
+#ifdef EIGEN_INTERNAL_DEBUGGING
+  EIGEN_DEVICE_FUNC void assignCoeff(Index row, Index col) {
+    eigen_internal_assert(row != col);
+    Base::assignCoeff(row, col);
+  }
+#else
+  using Base::assignCoeff;
+#endif
+
+  EIGEN_DEVICE_FUNC void assignDiagonalCoeff(Index id) {
+    if (Mode == UnitDiag && SetOpposite)
+      m_functor.assignCoeff(m_dst.coeffRef(id, id), Scalar(1));
+    else if (Mode == ZeroDiag && SetOpposite)
+      m_functor.assignCoeff(m_dst.coeffRef(id, id), Scalar(0));
+    else if (Mode == 0)
+      Base::assignCoeff(id, id);
+  }
+
+  EIGEN_DEVICE_FUNC void assignOppositeCoeff(Index row, Index col) {
+    eigen_internal_assert(row != col);
+    if (SetOpposite) m_functor.assignCoeff(m_dst.coeffRef(row, col), Scalar(0));
+  }
+};
+
+template <int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType, typename Functor>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src,
+                                                                           const Functor& func) {
+  typedef evaluator<DstXprType> DstEvaluatorType;
+  typedef evaluator<SrcXprType> SrcEvaluatorType;
+
+  SrcEvaluatorType srcEvaluator(src);
+
+  Index dstRows = src.rows();
+  Index dstCols = src.cols();
+  if ((dst.rows() != dstRows) || (dst.cols() != dstCols)) dst.resize(dstRows, dstCols);
+  DstEvaluatorType dstEvaluator(dst);
+
+  typedef triangular_dense_assignment_kernel<Mode&(Lower | Upper), Mode&(UnitDiag | ZeroDiag | SelfAdjoint),
+                                             SetOpposite, DstEvaluatorType, SrcEvaluatorType, Functor>
+      Kernel;
+  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());
+
+  enum {
+    unroll = DstXprType::SizeAtCompileTime != Dynamic && SrcEvaluatorType::CoeffReadCost < HugeCost &&
+             DstXprType::SizeAtCompileTime *
+                     (int(DstEvaluatorType::CoeffReadCost) + int(SrcEvaluatorType::CoeffReadCost)) / 2 <=
+                 EIGEN_UNROLLING_LIMIT
+  };
+
+  triangular_assignment_loop<Kernel, Mode, unroll ? int(DstXprType::SizeAtCompileTime) : Dynamic, SetOpposite>::run(
+      kernel);
+}
+
+template <int Mode, bool SetOpposite, typename DstXprType, typename SrcXprType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_triangular_assignment_loop(DstXprType& dst, const SrcXprType& src) {
+  call_triangular_assignment_loop<Mode, SetOpposite>(
+      dst, src, internal::assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>());
+}
+
+template <>
+struct AssignmentKind<TriangularShape, TriangularShape> {
+  typedef Triangular2Triangular Kind;
+};
+template <>
+struct AssignmentKind<DenseShape, TriangularShape> {
+  typedef Triangular2Dense Kind;
+};
+template <>
+struct AssignmentKind<TriangularShape, DenseShape> {
+  typedef Dense2Triangular Kind;
+};
+
+template <typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Triangular> {
+  EIGEN_DEVICE_FUNC static void run(DstXprType& dst, const SrcXprType& src, const Functor& func) {
+    eigen_assert(int(DstXprType::Mode) == int(SrcXprType::Mode));
+
+    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
+  }
+};
+
+template <typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Triangular2Dense> {
+  EIGEN_DEVICE_FUNC static void run(DstXprType& dst, const SrcXprType& src, const Functor& func) {
+    call_triangular_assignment_loop<SrcXprType::Mode, (int(SrcXprType::Mode) & int(SelfAdjoint)) == 0>(dst, src, func);
+  }
+};
+
+template <typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Dense2Triangular> {
+  EIGEN_DEVICE_FUNC static void run(DstXprType& dst, const SrcXprType& src, const Functor& func) {
+    call_triangular_assignment_loop<DstXprType::Mode, false>(dst, src, func);
+  }
+};
+
+template <typename Kernel, unsigned int Mode, int UnrollCount, bool SetOpposite>
+struct triangular_assignment_loop {
+  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
+  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
+  typedef typename DstEvaluatorType::XprType DstXprType;
+
+  enum {
+    col = (UnrollCount - 1) / DstXprType::RowsAtCompileTime,
+    row = (UnrollCount - 1) % DstXprType::RowsAtCompileTime
+  };
+
+  typedef typename Kernel::Scalar Scalar;
+
+  EIGEN_DEVICE_FUNC static inline void run(Kernel& kernel) {
+    triangular_assignment_loop<Kernel, Mode, UnrollCount - 1, SetOpposite>::run(kernel);
+
+    if (row == col)
+      kernel.assignDiagonalCoeff(row);
+    else if (((Mode & Lower) && row > col) || ((Mode & Upper) && row < col))
+      kernel.assignCoeff(row, col);
+    else if (SetOpposite)
+      kernel.assignOppositeCoeff(row, col);
+  }
+};
+
+// prevent buggy user code from causing an infinite recursion
+template <typename Kernel, unsigned int Mode, bool SetOpposite>
+struct triangular_assignment_loop<Kernel, Mode, 0, SetOpposite> {
+  EIGEN_DEVICE_FUNC static inline void run(Kernel&) {}
+};
+
+// TODO: experiment with a recursive assignment procedure splitting the current
+//       triangular part into one rectangular and two triangular parts.
+
+template <typename Kernel, unsigned int Mode, bool SetOpposite>
+struct triangular_assignment_loop<Kernel, Mode, Dynamic, SetOpposite> {
+  typedef typename Kernel::Scalar Scalar;
+  EIGEN_DEVICE_FUNC static inline void run(Kernel& kernel) {
+    for (Index j = 0; j < kernel.cols(); ++j) {
+      Index maxi = numext::mini(j, kernel.rows());
+      Index i = 0;
+      if (((Mode & Lower) && SetOpposite) || (Mode & Upper)) {
+        for (; i < maxi; ++i)
+          if (Mode & Upper)
+            kernel.assignCoeff(i, j);
+          else
+            kernel.assignOppositeCoeff(i, j);
+      } else
+        i = maxi;
+
+      if (i < kernel.rows())  // then i==j
+        kernel.assignDiagonalCoeff(i++);
+
+      if (((Mode & Upper) && SetOpposite) || (Mode & Lower)) {
+        for (; i < kernel.rows(); ++i)
+          if (Mode & Lower)
+            kernel.assignCoeff(i, j);
+          else
+            kernel.assignOppositeCoeff(i, j);
+      }
+    }
+  }
+};
+
+}  // end namespace internal
+
+/** Assigns a triangular or selfadjoint matrix to a dense matrix.
+ * If the matrix is triangular, the opposite part is set to zero. */
+template <typename Derived>
+template <typename DenseDerived>
+EIGEN_DEVICE_FUNC void TriangularBase<Derived>::evalToLazy(MatrixBase<DenseDerived>& other) const {
+  other.derived().resize(this->rows(), this->cols());
+  internal::call_triangular_assignment_loop<Derived::Mode,
+                                            (int(Derived::Mode) & int(SelfAdjoint)) == 0 /* SetOpposite */>(
+      other.derived(), derived().nestedExpression());
+}
+
+namespace internal {
+
+// Triangular = Product
+template <typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs, Rhs, DefaultProduct>,
+                  internal::assign_op<Scalar, typename Product<Lhs, Rhs, DefaultProduct>::Scalar>, Dense2Triangular> {
+  typedef Product<Lhs, Rhs, DefaultProduct> SrcXprType;
+  static void run(DstXprType& dst, const SrcXprType& src,
+                  const internal::assign_op<Scalar, typename SrcXprType::Scalar>&) {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if ((dst.rows() != dstRows) || (dst.cols() != dstCols)) dst.resize(dstRows, dstCols);
+
+    dst._assignProduct(src, Scalar(1), false);
+  }
+};
+
+// Triangular += Product
+template <typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs, Rhs, DefaultProduct>,
+                  internal::add_assign_op<Scalar, typename Product<Lhs, Rhs, DefaultProduct>::Scalar>,
+                  Dense2Triangular> {
+  typedef Product<Lhs, Rhs, DefaultProduct> SrcXprType;
+  static void run(DstXprType& dst, const SrcXprType& src,
+                  const internal::add_assign_op<Scalar, typename SrcXprType::Scalar>&) {
+    dst._assignProduct(src, Scalar(1), true);
+  }
+};
+
+// Triangular -= Product
+template <typename DstXprType, typename Lhs, typename Rhs, typename Scalar>
+struct Assignment<DstXprType, Product<Lhs, Rhs, DefaultProduct>,
+                  internal::sub_assign_op<Scalar, typename Product<Lhs, Rhs, DefaultProduct>::Scalar>,
+                  Dense2Triangular> {
+  typedef Product<Lhs, Rhs, DefaultProduct> SrcXprType;
+  static void run(DstXprType& dst, const SrcXprType& src,
+                  const internal::sub_assign_op<Scalar, typename SrcXprType::Scalar>&) {
+    dst._assignProduct(src, Scalar(-1), true);
+  }
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
 
-#endif // EIGEN_TRIANGULARMATRIX_H
+#endif  // EIGEN_TRIANGULARMATRIX_H
diff --git a/inst/include/Eigen/src/Core/VectorBlock.h b/inst/include/Eigen/src/Core/VectorBlock.h
index 1a7330f3..5ac13eb8 100644
--- a/inst/include/Eigen/src/Core/VectorBlock.h
+++ b/inst/include/Eigen/src/Core/VectorBlock.h
@@ -11,85 +11,73 @@
 #ifndef EIGEN_VECTORBLOCK_H
 #define EIGEN_VECTORBLOCK_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-/** \class VectorBlock
-  * \ingroup Core_Module
-  *
-  * \brief Expression of a fixed-size or dynamic-size sub-vector
-  *
-  * \param VectorType the type of the object in which we are taking a sub-vector
-  * \param Size size of the sub-vector we are taking at compile time (optional)
-  *
-  * This class represents an expression of either a fixed-size or dynamic-size sub-vector.
-  * It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment<int>(Index) and
-  * most of the time this is the only way it is used.
-  *
-  * However, if you want to directly maniputate sub-vector expressions,
-  * for instance if you want to write a function returning such an expression, you
-  * will need to use this class.
-  *
-  * Here is an example illustrating the dynamic case:
-  * \include class_VectorBlock.cpp
-  * Output: \verbinclude class_VectorBlock.out
-  *
-  * \note Even though this expression has dynamic size, in the case where \a VectorType
-  * has fixed size, this expression inherits a fixed maximal size which means that evaluating
-  * it does not cause a dynamic memory allocation.
-  *
-  * Here is an example illustrating the fixed-size case:
-  * \include class_FixedVectorBlock.cpp
-  * Output: \verbinclude class_FixedVectorBlock.out
-  *
-  * \sa class Block, DenseBase::segment(Index,Index,Index,Index), DenseBase::segment(Index,Index)
-  */
+namespace Eigen {
 
 namespace internal {
-template<typename VectorType, int Size>
+template <typename VectorType, int Size>
 struct traits<VectorBlock<VectorType, Size> >
-  : public traits<Block<VectorType,
-                     traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
-                     traits<VectorType>::Flags & RowMajorBit ? Size : 1> >
-{
-};
-}
+    : public traits<Block<VectorType, traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
+                          traits<VectorType>::Flags & RowMajorBit ? Size : 1> > {};
+}  // namespace internal
 
-template<typename VectorType, int Size> class VectorBlock
-  : public Block<VectorType,
-                     internal::traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
-                     internal::traits<VectorType>::Flags & RowMajorBit ? Size : 1>
-{
-    typedef Block<VectorType,
-                     internal::traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
-                     internal::traits<VectorType>::Flags & RowMajorBit ? Size : 1> Base;
-    enum {
-      IsColVector = !(internal::traits<VectorType>::Flags & RowMajorBit)
-    };
-  public:
-    EIGEN_DENSE_PUBLIC_INTERFACE(VectorBlock)
+/** \class VectorBlock
+ * \ingroup Core_Module
+ *
+ * \brief Expression of a fixed-size or dynamic-size sub-vector
+ *
+ * \tparam VectorType the type of the object in which we are taking a sub-vector
+ * \tparam Size size of the sub-vector we are taking at compile time (optional)
+ *
+ * This class represents an expression of either a fixed-size or dynamic-size sub-vector.
+ * It is the return type of DenseBase::segment(Index,Index) and DenseBase::segment<int>(Index) and
+ * most of the time this is the only way it is used.
+ *
+ * However, if you want to directly manipulate sub-vector expressions,
+ * for instance if you want to write a function returning such an expression, you
+ * will need to use this class.
+ *
+ * Here is an example illustrating the dynamic case:
+ * \include class_VectorBlock.cpp
+ * Output: \verbinclude class_VectorBlock.out
+ *
+ * \note Even though this expression has dynamic size, in the case where \a VectorType
+ * has fixed size, this expression inherits a fixed maximal size which means that evaluating
+ * it does not cause a dynamic memory allocation.
+ *
+ * Here is an example illustrating the fixed-size case:
+ * \include class_FixedVectorBlock.cpp
+ * Output: \verbinclude class_FixedVectorBlock.out
+ *
+ * \sa class Block, DenseBase::segment(Index,Index,Index,Index), DenseBase::segment(Index,Index)
+ */
+template <typename VectorType, int Size>
+class VectorBlock : public Block<VectorType, internal::traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
+                                 internal::traits<VectorType>::Flags & RowMajorBit ? Size : 1> {
+  typedef Block<VectorType, internal::traits<VectorType>::Flags & RowMajorBit ? 1 : Size,
+                internal::traits<VectorType>::Flags & RowMajorBit ? Size : 1>
+      Base;
+  enum { IsColVector = !(internal::traits<VectorType>::Flags & RowMajorBit) };
 
-    using Base::operator=;
+ public:
+  EIGEN_DENSE_PUBLIC_INTERFACE(VectorBlock)
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock)
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(VectorBlock)
 
-    /** Dynamic-size constructor
-      */
-    inline VectorBlock(VectorType& vector, Index start, Index size)
-      : Base(vector,
-             IsColVector ? start : 0, IsColVector ? 0 : start,
-             IsColVector ? size  : 1, IsColVector ? 1 : size)
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock);
-    }
+  /** Dynamic-size constructor
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE VectorBlock(VectorType& vector, Index start, Index size)
+      : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start, IsColVector ? size : 1, IsColVector ? 1 : size) {
+  }
 
-    /** Fixed-size constructor
-      */
-    inline VectorBlock(VectorType& vector, Index start)
-      : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start)
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorBlock);
-    }
+  /** Fixed-size constructor
+   */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE VectorBlock(VectorType& vector, Index start)
+      : Base(vector, IsColVector ? start : 0, IsColVector ? 0 : start) {}
 };
 
+}  // end namespace Eigen
 
-} // end namespace Eigen
-
-#endif // EIGEN_VECTORBLOCK_H
+#endif  // EIGEN_VECTORBLOCK_H
diff --git a/inst/include/Eigen/src/Core/VectorwiseOp.h b/inst/include/Eigen/src/Core/VectorwiseOp.h
index d5ab0366..688b49b6 100644
--- a/inst/include/Eigen/src/Core/VectorwiseOp.h
+++ b/inst/include/Eigen/src/Core/VectorwiseOp.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,632 +11,723 @@
 #ifndef EIGEN_PARTIAL_REDUX_H
 #define EIGEN_PARTIAL_REDUX_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \class PartialReduxExpr
-  * \ingroup Core_Module
-  *
-  * \brief Generic expression of a partially reduxed matrix
-  *
-  * \tparam MatrixType the type of the matrix we are applying the redux operation
-  * \tparam MemberOp type of the member functor
-  * \tparam Direction indicates the direction of the redux (#Vertical or #Horizontal)
-  *
-  * This class represents an expression of a partial redux operator of a matrix.
-  * It is the return type of some VectorwiseOp functions,
-  * and most of the time this is the only way it is used.
-  *
-  * \sa class VectorwiseOp
-  */
-
-template< typename MatrixType, typename MemberOp, int Direction>
+ * \ingroup Core_Module
+ *
+ * \brief Generic expression of a partially reduxed matrix
+ *
+ * \tparam MatrixType the type of the matrix we are applying the redux operation
+ * \tparam MemberOp type of the member functor
+ * \tparam Direction indicates the direction of the redux (#Vertical or #Horizontal)
+ *
+ * This class represents an expression of a partial redux operator of a matrix.
+ * It is the return type of some VectorwiseOp functions,
+ * and most of the time this is the only way it is used.
+ *
+ * \sa class VectorwiseOp
+ */
+
+template <typename MatrixType, typename MemberOp, int Direction>
 class PartialReduxExpr;
 
 namespace internal {
-template<typename MatrixType, typename MemberOp, int Direction>
-struct traits<PartialReduxExpr<MatrixType, MemberOp, Direction> >
- : traits<MatrixType>
-{
+
+template <typename MatrixType, typename MemberOp, int Direction>
+struct traits<PartialReduxExpr<MatrixType, MemberOp, Direction> > : traits<MatrixType> {
   typedef typename MemberOp::result_type Scalar;
   typedef typename traits<MatrixType>::StorageKind StorageKind;
   typedef typename traits<MatrixType>::XprKind XprKind;
   typedef typename MatrixType::Scalar InputScalar;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
-  typedef typename remove_all<MatrixTypeNested>::type _MatrixTypeNested;
-  enum {
-    RowsAtCompileTime = Direction==Vertical   ? 1 : MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::ColsAtCompileTime,
-    MaxRowsAtCompileTime = Direction==Vertical   ? 1 : MatrixType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = Direction==Horizontal ? 1 : MatrixType::MaxColsAtCompileTime,
-    Flags0 = (unsigned int)_MatrixTypeNested::Flags & HereditaryBits,
-    Flags = (Flags0 & ~RowMajorBit) | (RowsAtCompileTime == 1 ? RowMajorBit : 0),
-    TraversalSize = Direction==Vertical ? MatrixType::RowsAtCompileTime :  MatrixType::ColsAtCompileTime
-  };
-  #if EIGEN_GNUC_AT_LEAST(3,4)
-  typedef typename MemberOp::template Cost<InputScalar,int(TraversalSize)> CostOpType;
-  #else
-  typedef typename MemberOp::template Cost<InputScalar,TraversalSize> CostOpType;
-  #endif
   enum {
-    CoeffReadCost = TraversalSize==Dynamic ? Dynamic
-                  : TraversalSize * traits<_MatrixTypeNested>::CoeffReadCost + int(CostOpType::value)
+    RowsAtCompileTime = Direction == Vertical ? 1 : MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = Direction == Horizontal ? 1 : MatrixType::ColsAtCompileTime,
+    MaxRowsAtCompileTime = Direction == Vertical ? 1 : MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = Direction == Horizontal ? 1 : MatrixType::MaxColsAtCompileTime,
+    Flags = RowsAtCompileTime == 1 ? RowMajorBit : 0,
+    TraversalSize = Direction == Vertical ? MatrixType::RowsAtCompileTime : MatrixType::ColsAtCompileTime
   };
 };
-}
-
-template< typename MatrixType, typename MemberOp, int Direction>
-class PartialReduxExpr : internal::no_assignment_operator,
-  public internal::dense_xpr_base< PartialReduxExpr<MatrixType, MemberOp, Direction> >::type
-{
-  public:
+}  // namespace internal
 
-    typedef typename internal::dense_xpr_base<PartialReduxExpr>::type Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(PartialReduxExpr)
-    typedef typename internal::traits<PartialReduxExpr>::MatrixTypeNested MatrixTypeNested;
-    typedef typename internal::traits<PartialReduxExpr>::_MatrixTypeNested _MatrixTypeNested;
+template <typename MatrixType, typename MemberOp, int Direction>
+class PartialReduxExpr : public internal::dense_xpr_base<PartialReduxExpr<MatrixType, MemberOp, Direction> >::type,
+                         internal::no_assignment_operator {
+ public:
+  typedef typename internal::dense_xpr_base<PartialReduxExpr>::type Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(PartialReduxExpr)
 
-    PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
+  EIGEN_DEVICE_FUNC explicit PartialReduxExpr(const MatrixType& mat, const MemberOp& func = MemberOp())
       : m_matrix(mat), m_functor(func) {}
 
-    Index rows() const { return (Direction==Vertical   ? 1 : m_matrix.rows()); }
-    Index cols() const { return (Direction==Horizontal ? 1 : m_matrix.cols()); }
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index i, Index j) const
-    {
-      if (Direction==Vertical)
-        return m_functor(m_matrix.col(j));
-      else
-        return m_functor(m_matrix.row(i));
-    }
-
-    const Scalar coeff(Index index) const
-    {
-      if (Direction==Vertical)
-        return m_functor(m_matrix.col(index));
-      else
-        return m_functor(m_matrix.row(index));
-    }
-
-  protected:
-    MatrixTypeNested m_matrix;
-    const MemberOp m_functor;
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return (Direction == Vertical ? 1 : m_matrix.rows()); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return (Direction == Horizontal ? 1 : m_matrix.cols()); }
+
+  EIGEN_DEVICE_FUNC typename MatrixType::Nested nestedExpression() const { return m_matrix; }
+
+  EIGEN_DEVICE_FUNC const MemberOp& functor() const { return m_functor; }
+
+ protected:
+  typename MatrixType::Nested m_matrix;
+  const MemberOp m_functor;
 };
 
-#define EIGEN_MEMBER_FUNCTOR(MEMBER,COST)                               \
-  template <typename ResultType>                                        \
-  struct member_##MEMBER {                                              \
-    EIGEN_EMPTY_STRUCT_CTOR(member_##MEMBER)                            \
-    typedef ResultType result_type;                                     \
-    template<typename Scalar, int Size> struct Cost                     \
-    { enum { value = COST }; };                                         \
-    template<typename XprType>                                          \
-    EIGEN_STRONG_INLINE ResultType operator()(const XprType& mat) const \
-    { return mat.MEMBER(); } \
+template <typename A, typename B>
+struct partial_redux_dummy_func;
+
+#define EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER, COST, VECTORIZABLE, BINARYOP)              \
+  template <typename ResultType, typename Scalar>                                           \
+  struct member_##MEMBER {                                                                  \
+    typedef ResultType result_type;                                                         \
+    typedef BINARYOP<Scalar, Scalar> BinaryOp;                                              \
+    template <int Size>                                                                     \
+    struct Cost {                                                                           \
+      enum { value = COST };                                                                \
+    };                                                                                      \
+    enum { Vectorizable = VECTORIZABLE };                                                   \
+    template <typename XprType>                                                             \
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType operator()(const XprType& mat) const { \
+      return mat.MEMBER();                                                                  \
+    }                                                                                       \
+    BinaryOp binaryFunc() const { return BinaryOp(); }                                      \
   }
 
+#define EIGEN_MEMBER_FUNCTOR(MEMBER, COST) EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(MEMBER, COST, 0, partial_redux_dummy_func)
+
 namespace internal {
 
-EIGEN_MEMBER_FUNCTOR(squaredNorm, Size * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
-EIGEN_MEMBER_FUNCTOR(norm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
-EIGEN_MEMBER_FUNCTOR(stableNorm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
-EIGEN_MEMBER_FUNCTOR(blueNorm, (Size+5) * NumTraits<Scalar>::MulCost + (Size-1)*NumTraits<Scalar>::AddCost);
-EIGEN_MEMBER_FUNCTOR(hypotNorm, (Size-1) * functor_traits<scalar_hypot_op<Scalar> >::Cost );
-EIGEN_MEMBER_FUNCTOR(sum, (Size-1)*NumTraits<Scalar>::AddCost);
-EIGEN_MEMBER_FUNCTOR(mean, (Size-1)*NumTraits<Scalar>::AddCost + NumTraits<Scalar>::MulCost);
-EIGEN_MEMBER_FUNCTOR(minCoeff, (Size-1)*NumTraits<Scalar>::AddCost);
-EIGEN_MEMBER_FUNCTOR(maxCoeff, (Size-1)*NumTraits<Scalar>::AddCost);
-EIGEN_MEMBER_FUNCTOR(all, (Size-1)*NumTraits<Scalar>::AddCost);
-EIGEN_MEMBER_FUNCTOR(any, (Size-1)*NumTraits<Scalar>::AddCost);
-EIGEN_MEMBER_FUNCTOR(count, (Size-1)*NumTraits<Scalar>::AddCost);
-EIGEN_MEMBER_FUNCTOR(prod, (Size-1)*NumTraits<Scalar>::MulCost);
-
-
-template <typename BinaryOp, typename Scalar>
+EIGEN_MEMBER_FUNCTOR(norm, (Size + 5) * NumTraits<Scalar>::MulCost + (Size - 1) * NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(stableNorm, (Size + 5) * NumTraits<Scalar>::MulCost + (Size - 1) * NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(blueNorm, (Size + 5) * NumTraits<Scalar>::MulCost + (Size - 1) * NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(hypotNorm, (Size - 1) * functor_traits<scalar_hypot_op<Scalar> >::Cost);
+EIGEN_MEMBER_FUNCTOR(all, (Size - 1) * NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(any, (Size - 1) * NumTraits<Scalar>::AddCost);
+EIGEN_MEMBER_FUNCTOR(count, (Size - 1) * NumTraits<Scalar>::AddCost);
+
+EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(sum, (Size - 1) * NumTraits<Scalar>::AddCost, 1, internal::scalar_sum_op);
+EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(minCoeff, (Size - 1) * NumTraits<Scalar>::AddCost, 1, internal::scalar_min_op);
+EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(maxCoeff, (Size - 1) * NumTraits<Scalar>::AddCost, 1, internal::scalar_max_op);
+EIGEN_MAKE_PARTIAL_REDUX_FUNCTOR(prod, (Size - 1) * NumTraits<Scalar>::MulCost, 1, internal::scalar_product_op);
+
+template <int p, typename ResultType, typename Scalar>
+struct member_lpnorm {
+  typedef ResultType result_type;
+  enum { Vectorizable = 0 };
+  template <int Size>
+  struct Cost {
+    enum { value = (Size + 5) * NumTraits<Scalar>::MulCost + (Size - 1) * NumTraits<Scalar>::AddCost };
+  };
+  EIGEN_DEVICE_FUNC member_lpnorm() {}
+  template <typename XprType>
+  EIGEN_DEVICE_FUNC inline ResultType operator()(const XprType& mat) const {
+    return mat.template lpNorm<p>();
+  }
+};
+
+template <typename BinaryOpT, typename Scalar>
 struct member_redux {
-  typedef typename result_of<
-                     BinaryOp(Scalar)
-                   >::type  result_type;
-  template<typename _Scalar, int Size> struct Cost
-  { enum { value = (Size-1) * functor_traits<BinaryOp>::Cost }; };
-  member_redux(const BinaryOp func) : m_functor(func) {}
-  template<typename Derived>
-  inline result_type operator()(const DenseBase<Derived>& mat) const
-  { return mat.redux(m_functor); }
+  typedef BinaryOpT BinaryOp;
+  typedef typename result_of<BinaryOp(const Scalar&, const Scalar&)>::type result_type;
+
+  enum { Vectorizable = functor_traits<BinaryOp>::PacketAccess };
+  template <int Size>
+  struct Cost {
+    enum { value = (Size - 1) * functor_traits<BinaryOp>::Cost };
+  };
+  EIGEN_DEVICE_FUNC explicit member_redux(const BinaryOp func) : m_functor(func) {}
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline result_type operator()(const DenseBase<Derived>& mat) const {
+    return mat.redux(m_functor);
+  }
+  const BinaryOp& binaryFunc() const { return m_functor; }
   const BinaryOp m_functor;
 };
-}
+
+template <typename Scalar>
+struct scalar_replace_zero_with_one_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& x) const {
+    return numext::is_exactly_zero(x) ? Scalar(1) : x;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return pselect(pcmp_eq(x, pzero(x)), pset1<Packet>(Scalar(1)), x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_replace_zero_with_one_op<Scalar>> {
+  enum { Cost = 1, PacketAccess = packet_traits<Scalar>::HasCmp };
+};
+
+}  // namespace internal
 
 /** \class VectorwiseOp
-  * \ingroup Core_Module
-  *
-  * \brief Pseudo expression providing partial reduction operations
-  *
-  * \param ExpressionType the type of the object on which to do partial reductions
-  * \param Direction indicates the direction of the redux (#Vertical or #Horizontal)
-  *
-  * This class represents a pseudo expression with partial reduction features.
-  * It is the return type of DenseBase::colwise() and DenseBase::rowwise()
-  * and most of the time this is the only way it is used.
-  *
-  * Example: \include MatrixBase_colwise.cpp
-  * Output: \verbinclude MatrixBase_colwise.out
-  *
-  * \sa DenseBase::colwise(), DenseBase::rowwise(), class PartialReduxExpr
-  */
-template<typename ExpressionType, int Direction> class VectorwiseOp
-{
-  public:
-
-    typedef typename ExpressionType::Scalar Scalar;
-    typedef typename ExpressionType::RealScalar RealScalar;
-    typedef typename ExpressionType::Index Index;
-    typedef typename internal::conditional<internal::must_nest_by_value<ExpressionType>::ret,
-        ExpressionType, ExpressionType&>::type ExpressionTypeNested;
-    typedef typename internal::remove_all<ExpressionTypeNested>::type ExpressionTypeNestedCleaned;
-
-    template<template<typename _Scalar> class Functor,
-                      typename Scalar=typename internal::traits<ExpressionType>::Scalar> struct ReturnType
-    {
-      typedef PartialReduxExpr<ExpressionType,
-                               Functor<Scalar>,
-                               Direction
-                              > Type;
-    };
-
-    template<typename BinaryOp> struct ReduxReturnType
-    {
-      typedef PartialReduxExpr<ExpressionType,
-                               internal::member_redux<BinaryOp,typename internal::traits<ExpressionType>::Scalar>,
-                               Direction
-                              > Type;
-    };
-
-    enum {
-      IsVertical   = (Direction==Vertical) ? 1 : 0,
-      IsHorizontal = (Direction==Horizontal) ? 1 : 0
-    };
-
-  protected:
-
-    /** \internal
-      * \returns the i-th subvector according to the \c Direction */
-    typedef typename internal::conditional<Direction==Vertical,
-                               typename ExpressionType::ColXpr,
-                               typename ExpressionType::RowXpr>::type SubVector;
-    SubVector subVector(Index i)
-    {
-      return SubVector(m_matrix.derived(),i);
-    }
-
-    /** \internal
-      * \returns the number of subvectors in the direction \c Direction */
-    Index subVectors() const
-    { return Direction==Vertical?m_matrix.cols():m_matrix.rows(); }
-
-    template<typename OtherDerived> struct ExtendedType {
-      typedef Replicate<OtherDerived,
-                        Direction==Vertical   ? 1 : ExpressionType::RowsAtCompileTime,
-                        Direction==Horizontal ? 1 : ExpressionType::ColsAtCompileTime> Type;
-    };
-
-    /** \internal
-      * Replicates a vector to match the size of \c *this */
-    template<typename OtherDerived>
-    typename ExtendedType<OtherDerived>::Type
-    extendedTo(const DenseBase<OtherDerived>& other) const
-    {
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Vertical, OtherDerived::MaxColsAtCompileTime==1),
-                          YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED)
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Horizontal, OtherDerived::MaxRowsAtCompileTime==1),
-                          YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED)
-      return typename ExtendedType<OtherDerived>::Type
-                      (other.derived(),
-                       Direction==Vertical   ? 1 : m_matrix.rows(),
-                       Direction==Horizontal ? 1 : m_matrix.cols());
-    }
-    
-    template<typename OtherDerived> struct OppositeExtendedType {
-      typedef Replicate<OtherDerived,
-                        Direction==Horizontal ? 1 : ExpressionType::RowsAtCompileTime,
-                        Direction==Vertical   ? 1 : ExpressionType::ColsAtCompileTime> Type;
-    };
-
-    /** \internal
-      * Replicates a vector in the opposite direction to match the size of \c *this */
-    template<typename OtherDerived>
-    typename OppositeExtendedType<OtherDerived>::Type
-    extendedToOpposite(const DenseBase<OtherDerived>& other) const
-    {
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Horizontal, OtherDerived::MaxColsAtCompileTime==1),
-                          YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED)
-      EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(Direction==Vertical, OtherDerived::MaxRowsAtCompileTime==1),
-                          YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED)
-      return typename OppositeExtendedType<OtherDerived>::Type
-                      (other.derived(),
-                       Direction==Horizontal  ? 1 : m_matrix.rows(),
-                       Direction==Vertical    ? 1 : m_matrix.cols());
-    }
-
-  public:
-
-    inline VectorwiseOp(ExpressionType& matrix) : m_matrix(matrix) {}
-
-    /** \internal */
-    inline const ExpressionType& _expression() const { return m_matrix; }
-
-    /** \returns a row or column vector expression of \c *this reduxed by \a func
-      *
-      * The template parameter \a BinaryOp is the type of the functor
-      * of the custom redux operator. Note that func must be an associative operator.
-      *
-      * \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise()
-      */
-    template<typename BinaryOp>
-    const typename ReduxReturnType<BinaryOp>::Type
-    redux(const BinaryOp& func = BinaryOp()) const
-    { return typename ReduxReturnType<BinaryOp>::Type(_expression(), func); }
-
-    /** \returns a row (or column) vector expression of the smallest coefficient
-      * of each column (or row) of the referenced expression.
-      * 
-      * \warning the result is undefined if \c *this contains NaN.
-      *
-      * Example: \include PartialRedux_minCoeff.cpp
-      * Output: \verbinclude PartialRedux_minCoeff.out
-      *
-      * \sa DenseBase::minCoeff() */
-    const typename ReturnType<internal::member_minCoeff>::Type minCoeff() const
-    { return _expression(); }
-
-    /** \returns a row (or column) vector expression of the largest coefficient
-      * of each column (or row) of the referenced expression.
-      * 
-      * \warning the result is undefined if \c *this contains NaN.
-      *
-      * Example: \include PartialRedux_maxCoeff.cpp
-      * Output: \verbinclude PartialRedux_maxCoeff.out
-      *
-      * \sa DenseBase::maxCoeff() */
-    const typename ReturnType<internal::member_maxCoeff>::Type maxCoeff() const
-    { return _expression(); }
-
-    /** \returns a row (or column) vector expression of the squared norm
-      * of each column (or row) of the referenced expression.
-      *
-      * Example: \include PartialRedux_squaredNorm.cpp
-      * Output: \verbinclude PartialRedux_squaredNorm.out
-      *
-      * \sa DenseBase::squaredNorm() */
-    const typename ReturnType<internal::member_squaredNorm,RealScalar>::Type squaredNorm() const
-    { return _expression(); }
-
-    /** \returns a row (or column) vector expression of the norm
-      * of each column (or row) of the referenced expression.
-      *
-      * Example: \include PartialRedux_norm.cpp
-      * Output: \verbinclude PartialRedux_norm.out
-      *
-      * \sa DenseBase::norm() */
-    const typename ReturnType<internal::member_norm,RealScalar>::Type norm() const
-    { return _expression(); }
-
-
-    /** \returns a row (or column) vector expression of the norm
-      * of each column (or row) of the referenced expression, using
-      * blue's algorithm.
-      *
-      * \sa DenseBase::blueNorm() */
-    const typename ReturnType<internal::member_blueNorm,RealScalar>::Type blueNorm() const
-    { return _expression(); }
-
-
-    /** \returns a row (or column) vector expression of the norm
-      * of each column (or row) of the referenced expression, avoiding
-      * underflow and overflow.
-      *
-      * \sa DenseBase::stableNorm() */
-    const typename ReturnType<internal::member_stableNorm,RealScalar>::Type stableNorm() const
-    { return _expression(); }
-
-
-    /** \returns a row (or column) vector expression of the norm
-      * of each column (or row) of the referenced expression, avoiding
-      * underflow and overflow using a concatenation of hypot() calls.
-      *
-      * \sa DenseBase::hypotNorm() */
-    const typename ReturnType<internal::member_hypotNorm,RealScalar>::Type hypotNorm() const
-    { return _expression(); }
-
-    /** \returns a row (or column) vector expression of the sum
-      * of each column (or row) of the referenced expression.
-      *
-      * Example: \include PartialRedux_sum.cpp
-      * Output: \verbinclude PartialRedux_sum.out
-      *
-      * \sa DenseBase::sum() */
-    const typename ReturnType<internal::member_sum>::Type sum() const
-    { return _expression(); }
-
-    /** \returns a row (or column) vector expression of the mean
-    * of each column (or row) of the referenced expression.
-    *
-    * \sa DenseBase::mean() */
-    const typename ReturnType<internal::member_mean>::Type mean() const
-    { return _expression(); }
-
-    /** \returns a row (or column) vector expression representing
-      * whether \b all coefficients of each respective column (or row) are \c true.
-      *
-      * \sa DenseBase::all() */
-    const typename ReturnType<internal::member_all>::Type all() const
-    { return _expression(); }
-
-    /** \returns a row (or column) vector expression representing
-      * whether \b at \b least one coefficient of each respective column (or row) is \c true.
-      *
-      * \sa DenseBase::any() */
-    const typename ReturnType<internal::member_any>::Type any() const
-    { return _expression(); }
-
-    /** \returns a row (or column) vector expression representing
-      * the number of \c true coefficients of each respective column (or row).
-      *
-      * Example: \include PartialRedux_count.cpp
-      * Output: \verbinclude PartialRedux_count.out
-      *
-      * \sa DenseBase::count() */
-    const PartialReduxExpr<ExpressionType, internal::member_count<Index>, Direction> count() const
-    { return _expression(); }
-
-    /** \returns a row (or column) vector expression of the product
-      * of each column (or row) of the referenced expression.
-      *
-      * Example: \include PartialRedux_prod.cpp
-      * Output: \verbinclude PartialRedux_prod.out
-      *
-      * \sa DenseBase::prod() */
-    const typename ReturnType<internal::member_prod>::Type prod() const
-    { return _expression(); }
-
-
-    /** \returns a matrix expression
-      * where each column (or row) are reversed.
-      *
-      * Example: \include Vectorwise_reverse.cpp
-      * Output: \verbinclude Vectorwise_reverse.out
-      *
-      * \sa DenseBase::reverse() */
-    const Reverse<ExpressionType, Direction> reverse() const
-    { return Reverse<ExpressionType, Direction>( _expression() ); }
-
-    typedef Replicate<ExpressionType,Direction==Vertical?Dynamic:1,Direction==Horizontal?Dynamic:1> ReplicateReturnType;
-    const ReplicateReturnType replicate(Index factor) const;
-
-    /**
-      * \return an expression of the replication of each column (or row) of \c *this
-      *
-      * Example: \include DirectionWise_replicate.cpp
-      * Output: \verbinclude DirectionWise_replicate.out
-      *
-      * \sa VectorwiseOp::replicate(Index), DenseBase::replicate(), class Replicate
-      */
-    // NOTE implemented here because of sunstudio's compilation errors
-    template<int Factor> const Replicate<ExpressionType,(IsVertical?Factor:1),(IsHorizontal?Factor:1)>
-    replicate(Index factor = Factor) const
-    {
-      return Replicate<ExpressionType,Direction==Vertical?Factor:1,Direction==Horizontal?Factor:1>
-          (_expression(),Direction==Vertical?factor:1,Direction==Horizontal?factor:1);
-    }
-
-/////////// Artithmetic operators ///////////
-
-    /** Copies the vector \a other to each subvector of \c *this */
-    template<typename OtherDerived>
-    ExpressionType& operator=(const DenseBase<OtherDerived>& other)
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
-      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
-      //eigen_assert((m_matrix.isNull()) == (other.isNull())); FIXME
-      return const_cast<ExpressionType&>(m_matrix = extendedTo(other.derived()));
-    }
-
-    /** Adds the vector \a other to each subvector of \c *this */
-    template<typename OtherDerived>
-    ExpressionType& operator+=(const DenseBase<OtherDerived>& other)
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
-      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
-      return const_cast<ExpressionType&>(m_matrix += extendedTo(other.derived()));
-    }
-
-    /** Substracts the vector \a other to each subvector of \c *this */
-    template<typename OtherDerived>
-    ExpressionType& operator-=(const DenseBase<OtherDerived>& other)
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
-      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
-      return const_cast<ExpressionType&>(m_matrix -= extendedTo(other.derived()));
-    }
-
-    /** Multiples each subvector of \c *this by the vector \a other */
-    template<typename OtherDerived>
-    ExpressionType& operator*=(const DenseBase<OtherDerived>& other)
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
-      EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
-      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
-      m_matrix *= extendedTo(other.derived());
-      return const_cast<ExpressionType&>(m_matrix);
-    }
-
-    /** Divides each subvector of \c *this by the vector \a other */
-    template<typename OtherDerived>
-    ExpressionType& operator/=(const DenseBase<OtherDerived>& other)
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
-      EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
-      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
-      m_matrix /= extendedTo(other.derived());
-      return const_cast<ExpressionType&>(m_matrix);
-    }
-
-    /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */
-    template<typename OtherDerived> EIGEN_STRONG_INLINE
-    CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
-                  const ExpressionTypeNestedCleaned,
-                  const typename ExtendedType<OtherDerived>::Type>
-    operator+(const DenseBase<OtherDerived>& other) const
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
-      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
-      return m_matrix + extendedTo(other.derived());
-    }
-
-    /** Returns the expression of the difference between each subvector of \c *this and the vector \a other */
-    template<typename OtherDerived>
-    CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
-                  const ExpressionTypeNestedCleaned,
-                  const typename ExtendedType<OtherDerived>::Type>
-    operator-(const DenseBase<OtherDerived>& other) const
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
-      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
-      return m_matrix - extendedTo(other.derived());
-    }
-
-    /** Returns the expression where each subvector is the product of the vector \a other
-      * by the corresponding subvector of \c *this */
-    template<typename OtherDerived> EIGEN_STRONG_INLINE
-    CwiseBinaryOp<internal::scalar_product_op<Scalar>,
-                  const ExpressionTypeNestedCleaned,
-                  const typename ExtendedType<OtherDerived>::Type>
-    operator*(const DenseBase<OtherDerived>& other) const
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
-      EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
-      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
-      return m_matrix * extendedTo(other.derived());
-    }
-
-    /** Returns the expression where each subvector is the quotient of the corresponding
-      * subvector of \c *this by the vector \a other */
-    template<typename OtherDerived>
-    CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
-                  const ExpressionTypeNestedCleaned,
-                  const typename ExtendedType<OtherDerived>::Type>
-    operator/(const DenseBase<OtherDerived>& other) const
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
-      EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
-      EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
-      return m_matrix / extendedTo(other.derived());
-    }
-    
-    /** \returns an expression where each column of row of the referenced matrix are normalized.
-      * The referenced matrix is \b not modified.
-      * \sa MatrixBase::normalized(), normalize()
-      */
-    CwiseBinaryOp<internal::scalar_quotient_op<Scalar>,
-                  const ExpressionTypeNestedCleaned,
-                  const typename OppositeExtendedType<typename ReturnType<internal::member_norm,RealScalar>::Type>::Type>
-    normalized() const { return m_matrix.cwiseQuotient(extendedToOpposite(this->norm())); }
-    
-    
-    /** Normalize in-place each row or columns of the referenced matrix.
-      * \sa MatrixBase::normalize(), normalized()
-      */
-    void normalize() {
-      m_matrix = this->normalized();
-    }
-
-/////////// Geometry module ///////////
-
-    #if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
-    Homogeneous<ExpressionType,Direction> homogeneous() const;
-    #endif
-
-    typedef typename ExpressionType::PlainObject CrossReturnType;
-    template<typename OtherDerived>
-    const CrossReturnType cross(const MatrixBase<OtherDerived>& other) const;
-
-    enum {
-      HNormalized_Size = Direction==Vertical ? internal::traits<ExpressionType>::RowsAtCompileTime
+ * \ingroup Core_Module
+ *
+ * \brief Pseudo expression providing broadcasting and partial reduction operations
+ *
+ * \tparam ExpressionType the type of the object on which to do partial reductions
+ * \tparam Direction indicates whether to operate on columns (#Vertical) or rows (#Horizontal)
+ *
+ * This class represents a pseudo expression with broadcasting and partial reduction features.
+ * It is the return type of DenseBase::colwise() and DenseBase::rowwise()
+ * and most of the time this is the only way it is explicitly used.
+ *
+ * To understand the logic of rowwise/colwise expression, let's consider a generic case `A.colwise().foo()`
+ * where `foo` is any method of `VectorwiseOp`. This expression is equivalent to applying `foo()` to each
+ * column of `A` and then re-assemble the outputs in a matrix expression:
+ * \code [A.col(0).foo(), A.col(1).foo(), ..., A.col(A.cols()-1).foo()] \endcode
+ *
+ * Example: \include MatrixBase_colwise.cpp
+ * Output: \verbinclude MatrixBase_colwise.out
+ *
+ * The begin() and end() methods are obviously exceptions to the previous rule as they
+ * return STL-compatible begin/end iterators to the rows or columns of the nested expression.
+ * Typical use cases include for-range-loop and calls to STL algorithms:
+ *
+ * Example: \include MatrixBase_colwise_iterator_cxx11.cpp
+ * Output: \verbinclude MatrixBase_colwise_iterator_cxx11.out
+ *
+ * For a partial reduction on an empty input, some rules apply.
+ * For the sake of clarity, let's consider a vertical reduction:
+ *   - If the number of columns is zero, then a 1x0 row-major vector expression is returned.
+ *   - Otherwise, if the number of rows is zero, then
+ *       - a row vector of zeros is returned for sum-like reductions (sum, squaredNorm, norm, etc.)
+ *       - a row vector of ones is returned for a product reduction (e.g., <code>MatrixXd(n,0).colwise().prod()</code>)
+ *       - an assert is triggered for all other reductions (minCoeff,maxCoeff,redux(bin_op))
+ *
+ * \sa DenseBase::colwise(), DenseBase::rowwise(), class PartialReduxExpr
+ */
+template <typename ExpressionType, int Direction>
+class VectorwiseOp {
+ public:
+  typedef typename ExpressionType::Scalar Scalar;
+  typedef typename ExpressionType::RealScalar RealScalar;
+  typedef internal::remove_all_t<ExpressionType> ExpressionTypeCleaned;
+
+  template <template <typename OutScalar, typename InputScalar> class Functor, typename ReturnScalar = Scalar>
+  struct ReturnType {
+    typedef PartialReduxExpr<ExpressionType, Functor<ReturnScalar, Scalar>, Direction> Type;
+  };
+
+  template <typename BinaryOp>
+  struct ReduxReturnType {
+    typedef PartialReduxExpr<ExpressionType, internal::member_redux<BinaryOp, Scalar>, Direction> Type;
+  };
+
+  enum { isVertical = (Direction == Vertical) ? 1 : 0, isHorizontal = (Direction == Horizontal) ? 1 : 0 };
+
+ protected:
+  template <typename OtherDerived>
+  struct ExtendedType {
+    typedef Replicate<OtherDerived, isVertical ? 1 : ExpressionType::RowsAtCompileTime,
+                      isHorizontal ? 1 : ExpressionType::ColsAtCompileTime>
+        Type;
+  };
+
+  /** \internal
+   * Replicates a vector to match the size of \c *this */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC typename ExtendedType<OtherDerived>::Type extendedTo(const DenseBase<OtherDerived>& other) const {
+    EIGEN_STATIC_ASSERT(internal::check_implication(isVertical, OtherDerived::MaxColsAtCompileTime == 1),
+                        YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED)
+    EIGEN_STATIC_ASSERT(internal::check_implication(isHorizontal, OtherDerived::MaxRowsAtCompileTime == 1),
+                        YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED)
+    return typename ExtendedType<OtherDerived>::Type(other.derived(), isVertical ? 1 : m_matrix.rows(),
+                                                     isHorizontal ? 1 : m_matrix.cols());
+  }
+
+  template <typename OtherDerived>
+  struct OppositeExtendedType {
+    typedef Replicate<OtherDerived, isHorizontal ? 1 : ExpressionType::RowsAtCompileTime,
+                      isVertical ? 1 : ExpressionType::ColsAtCompileTime>
+        Type;
+  };
+
+  /** \internal
+   * Replicates a vector in the opposite direction to match the size of \c *this */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC typename OppositeExtendedType<OtherDerived>::Type extendedToOpposite(
+      const DenseBase<OtherDerived>& other) const {
+    EIGEN_STATIC_ASSERT(internal::check_implication(isHorizontal, OtherDerived::MaxColsAtCompileTime == 1),
+                        YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED)
+    EIGEN_STATIC_ASSERT(internal::check_implication(isVertical, OtherDerived::MaxRowsAtCompileTime == 1),
+                        YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED)
+    return typename OppositeExtendedType<OtherDerived>::Type(other.derived(), isHorizontal ? 1 : m_matrix.rows(),
+                                                             isVertical ? 1 : m_matrix.cols());
+  }
+
+ public:
+  EIGEN_DEVICE_FUNC explicit inline VectorwiseOp(ExpressionType& matrix) : m_matrix(matrix) {}
+
+  /** \internal */
+  EIGEN_DEVICE_FUNC inline const ExpressionType& _expression() const { return m_matrix; }
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  /** STL-like <a href="https://en.cppreference.com/w/cpp/named_req/RandomAccessIterator">RandomAccessIterator</a>
+   * iterator type over the columns or rows as returned by the begin() and end() methods.
+   */
+  random_access_iterator_type iterator;
+  /** This is the const version of iterator (aka read-only) */
+  random_access_iterator_type const_iterator;
+#else
+  typedef internal::subvector_stl_iterator<ExpressionType, DirectionType(Direction)> iterator;
+  typedef internal::subvector_stl_iterator<const ExpressionType, DirectionType(Direction)> const_iterator;
+  typedef internal::subvector_stl_reverse_iterator<ExpressionType, DirectionType(Direction)> reverse_iterator;
+  typedef internal::subvector_stl_reverse_iterator<const ExpressionType, DirectionType(Direction)>
+      const_reverse_iterator;
+#endif
+
+  /** returns an iterator to the first row (rowwise) or column (colwise) of the nested expression.
+   * \sa end(), cbegin()
+   */
+  iterator begin() { return iterator(m_matrix, 0); }
+  /** const version of begin() */
+  const_iterator begin() const { return const_iterator(m_matrix, 0); }
+  /** const version of begin() */
+  const_iterator cbegin() const { return const_iterator(m_matrix, 0); }
+
+  /** returns a reverse iterator to the last row (rowwise) or column (colwise) of the nested expression.
+   * \sa rend(), crbegin()
+   */
+  reverse_iterator rbegin() {
+    return reverse_iterator(m_matrix, m_matrix.template subVectors<DirectionType(Direction)>() - 1);
+  }
+  /** const version of rbegin() */
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(m_matrix, m_matrix.template subVectors<DirectionType(Direction)>() - 1);
+  }
+  /** const version of rbegin() */
+  const_reverse_iterator crbegin() const {
+    return const_reverse_iterator(m_matrix, m_matrix.template subVectors<DirectionType(Direction)>() - 1);
+  }
+
+  /** returns an iterator to the row (resp. column) following the last row (resp. column) of the nested expression
+   * \sa begin(), cend()
+   */
+  iterator end() { return iterator(m_matrix, m_matrix.template subVectors<DirectionType(Direction)>()); }
+  /** const version of end() */
+  const_iterator end() const {
+    return const_iterator(m_matrix, m_matrix.template subVectors<DirectionType(Direction)>());
+  }
+  /** const version of end() */
+  const_iterator cend() const {
+    return const_iterator(m_matrix, m_matrix.template subVectors<DirectionType(Direction)>());
+  }
+
+  /** returns a reverse iterator to the row (resp. column) before the first row (resp. column) of the nested expression
+   * \sa begin(), cend()
+   */
+  reverse_iterator rend() { return reverse_iterator(m_matrix, -1); }
+  /** const version of rend() */
+  const_reverse_iterator rend() const { return const_reverse_iterator(m_matrix, -1); }
+  /** const version of rend() */
+  const_reverse_iterator crend() const { return const_reverse_iterator(m_matrix, -1); }
+
+  /** \returns a row or column vector expression of \c *this reduxed by \a func
+   *
+   * The template parameter \a BinaryOp is the type of the functor
+   * of the custom redux operator. Note that func must be an associative operator.
+   *
+   * \warning the size along the reduction direction must be strictly positive,
+   *          otherwise an assertion is triggered.
+   *
+   * \sa class VectorwiseOp, DenseBase::colwise(), DenseBase::rowwise()
+   */
+  template <typename BinaryOp>
+  EIGEN_DEVICE_FUNC const typename ReduxReturnType<BinaryOp>::Type redux(const BinaryOp& func = BinaryOp()) const {
+    eigen_assert(redux_length() > 0 && "you are using an empty matrix");
+    return typename ReduxReturnType<BinaryOp>::Type(_expression(), internal::member_redux<BinaryOp, Scalar>(func));
+  }
+
+  typedef typename ReturnType<internal::member_minCoeff>::Type MinCoeffReturnType;
+  typedef typename ReturnType<internal::member_maxCoeff>::Type MaxCoeffReturnType;
+  typedef PartialReduxExpr<const CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const ExpressionTypeCleaned>,
+                           internal::member_sum<RealScalar, RealScalar>, Direction>
+      SquaredNormReturnType;
+  typedef CwiseUnaryOp<internal::scalar_sqrt_op<RealScalar>, const SquaredNormReturnType> NormReturnType;
+  typedef typename ReturnType<internal::member_blueNorm, RealScalar>::Type BlueNormReturnType;
+  typedef typename ReturnType<internal::member_stableNorm, RealScalar>::Type StableNormReturnType;
+  typedef typename ReturnType<internal::member_hypotNorm, RealScalar>::Type HypotNormReturnType;
+  typedef typename ReturnType<internal::member_sum>::Type SumReturnType;
+  typedef EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(SumReturnType, Scalar, quotient) MeanReturnType;
+  typedef typename ReturnType<internal::member_all, bool>::Type AllReturnType;
+  typedef typename ReturnType<internal::member_any, bool>::Type AnyReturnType;
+  typedef PartialReduxExpr<ExpressionType, internal::member_count<Index, Scalar>, Direction> CountReturnType;
+  typedef typename ReturnType<internal::member_prod>::Type ProdReturnType;
+  typedef Reverse<const ExpressionType, Direction> ConstReverseReturnType;
+  typedef Reverse<ExpressionType, Direction> ReverseReturnType;
+
+  template <int p>
+  struct LpNormReturnType {
+    typedef PartialReduxExpr<ExpressionType, internal::member_lpnorm<p, RealScalar, Scalar>, Direction> Type;
+  };
+
+  /** \returns a row (or column) vector expression of the smallest coefficient
+   * of each column (or row) of the referenced expression.
+   *
+   * \warning the size along the reduction direction must be strictly positive,
+   *          otherwise an assertion is triggered.
+   *
+   * \warning the result is undefined if \c *this contains NaN.
+   *
+   * Example: \include PartialRedux_minCoeff.cpp
+   * Output: \verbinclude PartialRedux_minCoeff.out
+   *
+   * \sa DenseBase::minCoeff() */
+  EIGEN_DEVICE_FUNC const MinCoeffReturnType minCoeff() const {
+    eigen_assert(redux_length() > 0 && "you are using an empty matrix");
+    return MinCoeffReturnType(_expression());
+  }
+
+  /** \returns a row (or column) vector expression of the largest coefficient
+   * of each column (or row) of the referenced expression.
+   *
+   * \warning the size along the reduction direction must be strictly positive,
+   *          otherwise an assertion is triggered.
+   *
+   * \warning the result is undefined if \c *this contains NaN.
+   *
+   * Example: \include PartialRedux_maxCoeff.cpp
+   * Output: \verbinclude PartialRedux_maxCoeff.out
+   *
+   * \sa DenseBase::maxCoeff() */
+  EIGEN_DEVICE_FUNC const MaxCoeffReturnType maxCoeff() const {
+    eigen_assert(redux_length() > 0 && "you are using an empty matrix");
+    return MaxCoeffReturnType(_expression());
+  }
+
+  /** \returns a row (or column) vector expression of the squared norm
+   * of each column (or row) of the referenced expression.
+   * This is a vector with real entries, even if the original matrix has complex entries.
+   *
+   * Example: \include PartialRedux_squaredNorm.cpp
+   * Output: \verbinclude PartialRedux_squaredNorm.out
+   *
+   * \sa DenseBase::squaredNorm() */
+  EIGEN_DEVICE_FUNC const SquaredNormReturnType squaredNorm() const {
+    return SquaredNormReturnType(m_matrix.cwiseAbs2());
+  }
+
+  /** \returns a row (or column) vector expression of the norm
+   * of each column (or row) of the referenced expression.
+   * This is a vector with real entries, even if the original matrix has complex entries.
+   *
+   * Example: \include PartialRedux_norm.cpp
+   * Output: \verbinclude PartialRedux_norm.out
+   *
+   * \sa DenseBase::norm() */
+  EIGEN_DEVICE_FUNC const NormReturnType norm() const { return NormReturnType(squaredNorm()); }
+
+  /** \returns a row (or column) vector expression of the norm
+   * of each column (or row) of the referenced expression.
+   * This is a vector with real entries, even if the original matrix has complex entries.
+   *
+   * Example: \include PartialRedux_norm.cpp
+   * Output: \verbinclude PartialRedux_norm.out
+   *
+   * \sa DenseBase::norm() */
+  template <int p>
+  EIGEN_DEVICE_FUNC const typename LpNormReturnType<p>::Type lpNorm() const {
+    return typename LpNormReturnType<p>::Type(_expression());
+  }
+
+  /** \returns a row (or column) vector expression of the norm
+   * of each column (or row) of the referenced expression, using
+   * Blue's algorithm.
+   * This is a vector with real entries, even if the original matrix has complex entries.
+   *
+   * \sa DenseBase::blueNorm() */
+  EIGEN_DEVICE_FUNC const BlueNormReturnType blueNorm() const { return BlueNormReturnType(_expression()); }
+
+  /** \returns a row (or column) vector expression of the norm
+   * of each column (or row) of the referenced expression, avoiding
+   * underflow and overflow.
+   * This is a vector with real entries, even if the original matrix has complex entries.
+   *
+   * \sa DenseBase::stableNorm() */
+  EIGEN_DEVICE_FUNC const StableNormReturnType stableNorm() const { return StableNormReturnType(_expression()); }
+
+  /** \returns a row (or column) vector expression of the norm
+   * of each column (or row) of the referenced expression, avoiding
+   * underflow and overflow using a concatenation of hypot() calls.
+   * This is a vector with real entries, even if the original matrix has complex entries.
+   *
+   * \sa DenseBase::hypotNorm() */
+  EIGEN_DEVICE_FUNC const HypotNormReturnType hypotNorm() const { return HypotNormReturnType(_expression()); }
+
+  /** \returns a row (or column) vector expression of the sum
+   * of each column (or row) of the referenced expression.
+   *
+   * Example: \include PartialRedux_sum.cpp
+   * Output: \verbinclude PartialRedux_sum.out
+   *
+   * \sa DenseBase::sum() */
+  EIGEN_DEVICE_FUNC const SumReturnType sum() const { return SumReturnType(_expression()); }
+
+  /** \returns a row (or column) vector expression of the mean
+   * of each column (or row) of the referenced expression.
+   *
+   * \sa DenseBase::mean() */
+  EIGEN_DEVICE_FUNC const MeanReturnType mean() const {
+    return sum() / Scalar(Direction == Vertical ? m_matrix.rows() : m_matrix.cols());
+  }
+
+  /** \returns a row (or column) vector expression representing
+   * whether \b all coefficients of each respective column (or row) are \c true.
+   * This expression can be assigned to a vector with entries of type \c bool.
+   *
+   * \sa DenseBase::all() */
+  EIGEN_DEVICE_FUNC const AllReturnType all() const { return AllReturnType(_expression()); }
+
+  /** \returns a row (or column) vector expression representing
+   * whether \b at \b least one coefficient of each respective column (or row) is \c true.
+   * This expression can be assigned to a vector with entries of type \c bool.
+   *
+   * \sa DenseBase::any() */
+  EIGEN_DEVICE_FUNC const AnyReturnType any() const { return AnyReturnType(_expression()); }
+
+  /** \returns a row (or column) vector expression representing
+   * the number of \c true coefficients of each respective column (or row).
+   * This expression can be assigned to a vector whose entries have the same type as is used to
+   * index entries of the original matrix; for dense matrices, this is \c std::ptrdiff_t .
+   *
+   * Example: \include PartialRedux_count.cpp
+   * Output: \verbinclude PartialRedux_count.out
+   *
+   * \sa DenseBase::count() */
+  EIGEN_DEVICE_FUNC const CountReturnType count() const { return CountReturnType(_expression()); }
+
+  /** \returns a row (or column) vector expression of the product
+   * of each column (or row) of the referenced expression.
+   *
+   * Example: \include PartialRedux_prod.cpp
+   * Output: \verbinclude PartialRedux_prod.out
+   *
+   * \sa DenseBase::prod() */
+  EIGEN_DEVICE_FUNC const ProdReturnType prod() const { return ProdReturnType(_expression()); }
+
+  /** \returns a matrix expression
+   * where each column (or row) are reversed.
+   *
+   * Example: \include Vectorwise_reverse.cpp
+   * Output: \verbinclude Vectorwise_reverse.out
+   *
+   * \sa DenseBase::reverse() */
+  EIGEN_DEVICE_FUNC const ConstReverseReturnType reverse() const { return ConstReverseReturnType(_expression()); }
+
+  /** \returns a writable matrix expression
+   * where each column (or row) are reversed.
+   *
+   * \sa reverse() const */
+  EIGEN_DEVICE_FUNC ReverseReturnType reverse() { return ReverseReturnType(_expression()); }
+
+  typedef Replicate<ExpressionType, (isVertical ? Dynamic : 1), (isHorizontal ? Dynamic : 1)> ReplicateReturnType;
+  EIGEN_DEVICE_FUNC const ReplicateReturnType replicate(Index factor) const;
+
+  /**
+   * \return an expression of the replication of each column (or row) of \c *this
+   *
+   * Example: \include DirectionWise_replicate.cpp
+   * Output: \verbinclude DirectionWise_replicate.out
+   *
+   * \sa VectorwiseOp::replicate(Index), DenseBase::replicate(), class Replicate
+   */
+  // NOTE implemented here because of sunstudio's compilation errors
+  // isVertical*Factor+isHorizontal instead of (isVertical?Factor:1) to handle CUDA bug with ternary operator
+  template <int Factor>
+  const Replicate<ExpressionType, isVertical * Factor + isHorizontal,
+                  isHorizontal * Factor + isVertical> EIGEN_DEVICE_FUNC
+  replicate(Index factor = Factor) const {
+    return Replicate<ExpressionType, (isVertical ? Factor : 1), (isHorizontal ? Factor : 1)>(
+        _expression(), isVertical ? factor : 1, isHorizontal ? factor : 1);
+  }
+
+  /////////// Artithmetic operators ///////////
+
+  /** Copies the vector \a other to each subvector of \c *this */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC ExpressionType& operator=(const DenseBase<OtherDerived>& other) {
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+    EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+    // eigen_assert((m_matrix.isNull()) == (other.isNull())); FIXME
+    return m_matrix = extendedTo(other.derived());
+  }
+
+  /** Adds the vector \a other to each subvector of \c *this */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC ExpressionType& operator+=(const DenseBase<OtherDerived>& other) {
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+    EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+    return m_matrix += extendedTo(other.derived());
+  }
+
+  /** Subtracts the vector \a other to each subvector of \c *this */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC ExpressionType& operator-=(const DenseBase<OtherDerived>& other) {
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+    EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+    return m_matrix -= extendedTo(other.derived());
+  }
+
+  /** Multiplies each subvector of \c *this by the vector \a other */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC ExpressionType& operator*=(const DenseBase<OtherDerived>& other) {
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+    EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
+    EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+    m_matrix *= extendedTo(other.derived());
+    return m_matrix;
+  }
+
+  /** Divides each subvector of \c *this by the vector \a other */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC ExpressionType& operator/=(const DenseBase<OtherDerived>& other) {
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+    EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
+    EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+    m_matrix /= extendedTo(other.derived());
+    return m_matrix;
+  }
+
+  /** Returns the expression of the sum of the vector \a other to each subvector of \c *this */
+  template <typename OtherDerived>
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC
+      CwiseBinaryOp<internal::scalar_sum_op<Scalar, typename OtherDerived::Scalar>, const ExpressionTypeCleaned,
+                    const typename ExtendedType<OtherDerived>::Type>
+      operator+(const DenseBase<OtherDerived>& other) const {
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+    EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+    return m_matrix + extendedTo(other.derived());
+  }
+
+  /** Returns the expression of the difference between each subvector of \c *this and the vector \a other */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC CwiseBinaryOp<internal::scalar_difference_op<Scalar, typename OtherDerived::Scalar>,
+                                  const ExpressionTypeCleaned, const typename ExtendedType<OtherDerived>::Type>
+  operator-(const DenseBase<OtherDerived>& other) const {
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+    EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+    return m_matrix - extendedTo(other.derived());
+  }
+
+  /** Returns the expression where each subvector is the product of the vector \a other
+   * by the corresponding subvector of \c *this */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC CwiseBinaryOp<internal::scalar_product_op<Scalar, typename OtherDerived::Scalar>,
+                                  const ExpressionTypeCleaned, const typename ExtendedType<OtherDerived>::Type>
+  operator*(const DenseBase<OtherDerived>& other) const {
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+    EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
+    EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+    return m_matrix * extendedTo(other.derived());
+  }
+
+  /** Returns the expression where each subvector is the quotient of the corresponding
+   * subvector of \c *this by the vector \a other */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC CwiseBinaryOp<internal::scalar_quotient_op<Scalar, typename OtherDerived::Scalar>,
+                                  const ExpressionTypeCleaned, const typename ExtendedType<OtherDerived>::Type>
+  operator/(const DenseBase<OtherDerived>& other) const {
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+    EIGEN_STATIC_ASSERT_ARRAYXPR(ExpressionType)
+    EIGEN_STATIC_ASSERT_SAME_XPR_KIND(ExpressionType, OtherDerived)
+    return m_matrix / extendedTo(other.derived());
+  }
+
+  using Normalized_NonzeroNormType =
+      CwiseUnaryOp<internal::scalar_replace_zero_with_one_op<Scalar>, const NormReturnType>;
+  using NormalizedReturnType = CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const ExpressionTypeCleaned,
+                                             const typename OppositeExtendedType<Normalized_NonzeroNormType>::Type>;
+
+  /** \returns an expression where each column (or row) of the referenced matrix are normalized.
+   * The referenced matrix is \b not modified.
+   *
+   * \warning If the input columns (or rows) are too small (i.e., their norm equals to 0), they remain unchanged in the
+   *          resulting expression.
+   *
+   * \sa MatrixBase::normalized(), normalize()
+   */
+  EIGEN_DEVICE_FUNC NormalizedReturnType normalized() const {
+    return m_matrix.cwiseQuotient(extendedToOpposite(Normalized_NonzeroNormType(this->norm())));
+  }
+
+  /** Normalize in-place each row or columns of the referenced matrix.
+   *
+   * \warning If the input columns (or rows) are too small (i.e., their norm equals to 0), they are left unchanged.
+   *
+   * \sa MatrixBase::normalized(), normalize()
+   */
+  EIGEN_DEVICE_FUNC void normalize() { m_matrix = this->normalized(); }
+
+  EIGEN_DEVICE_FUNC inline void reverseInPlace();
+
+  /////////// Geometry module ///////////
+
+  typedef Homogeneous<ExpressionType, Direction> HomogeneousReturnType;
+  EIGEN_DEVICE_FUNC HomogeneousReturnType homogeneous() const;
+
+  typedef typename ExpressionType::PlainObject CrossReturnType;
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC const CrossReturnType cross(const MatrixBase<OtherDerived>& other) const;
+
+  enum {
+    HNormalized_Size = Direction == Vertical ? internal::traits<ExpressionType>::RowsAtCompileTime
                                              : internal::traits<ExpressionType>::ColsAtCompileTime,
-      HNormalized_SizeMinusOne = HNormalized_Size==Dynamic ? Dynamic : HNormalized_Size-1
-    };
-    typedef Block<const ExpressionType,
-                  Direction==Vertical   ? int(HNormalized_SizeMinusOne)
-                                        : int(internal::traits<ExpressionType>::RowsAtCompileTime),
-                  Direction==Horizontal ? int(HNormalized_SizeMinusOne)
+    HNormalized_SizeMinusOne = HNormalized_Size == Dynamic ? Dynamic : HNormalized_Size - 1
+  };
+  typedef Block<const ExpressionType,
+                Direction == Vertical ? int(HNormalized_SizeMinusOne)
+                                      : int(internal::traits<ExpressionType>::RowsAtCompileTime),
+                Direction == Horizontal ? int(HNormalized_SizeMinusOne)
                                         : int(internal::traits<ExpressionType>::ColsAtCompileTime)>
-            HNormalized_Block;
-    typedef Block<const ExpressionType,
-                  Direction==Vertical   ? 1 : int(internal::traits<ExpressionType>::RowsAtCompileTime),
-                  Direction==Horizontal ? 1 : int(internal::traits<ExpressionType>::ColsAtCompileTime)>
-            HNormalized_Factors;
-    typedef CwiseBinaryOp<internal::scalar_quotient_op<typename internal::traits<ExpressionType>::Scalar>,
-                const HNormalized_Block,
-                const Replicate<HNormalized_Factors,
-                  Direction==Vertical   ? HNormalized_SizeMinusOne : 1,
-                  Direction==Horizontal ? HNormalized_SizeMinusOne : 1> >
-            HNormalizedReturnType;
-
-    const HNormalizedReturnType hnormalized() const;
-
-  protected:
-    ExpressionTypeNested m_matrix;
+      HNormalized_Block;
+  typedef Block<const ExpressionType,
+                Direction == Vertical ? 1 : int(internal::traits<ExpressionType>::RowsAtCompileTime),
+                Direction == Horizontal ? 1 : int(internal::traits<ExpressionType>::ColsAtCompileTime)>
+      HNormalized_Factors;
+  typedef CwiseBinaryOp<internal::scalar_quotient_op<typename internal::traits<ExpressionType>::Scalar>,
+                        const HNormalized_Block,
+                        const Replicate<HNormalized_Factors, Direction == Vertical ? HNormalized_SizeMinusOne : 1,
+                                        Direction == Horizontal ? HNormalized_SizeMinusOne : 1> >
+      HNormalizedReturnType;
+
+  EIGEN_DEVICE_FUNC const HNormalizedReturnType hnormalized() const;
+
+#ifdef EIGEN_VECTORWISEOP_PLUGIN
+#include EIGEN_VECTORWISEOP_PLUGIN
+#endif
+
+ protected:
+  EIGEN_DEVICE_FUNC Index redux_length() const { return Direction == Vertical ? m_matrix.rows() : m_matrix.cols(); }
+  ExpressionType& m_matrix;
 };
 
-/** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
-  *
-  * Example: \include MatrixBase_colwise.cpp
-  * Output: \verbinclude MatrixBase_colwise.out
-  *
-  * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
-  */
-template<typename Derived>
-inline const typename DenseBase<Derived>::ConstColwiseReturnType
-DenseBase<Derived>::colwise() const
-{
-  return derived();
-}
+// const colwise moved to DenseBase.h due to CUDA compiler bug
 
 /** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
-  *
-  * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
-  */
-template<typename Derived>
-inline typename DenseBase<Derived>::ColwiseReturnType
-DenseBase<Derived>::colwise()
-{
-  return derived();
+ *
+ * \sa rowwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::ColwiseReturnType DenseBase<Derived>::colwise() {
+  return ColwiseReturnType(derived());
 }
 
-/** \returns a VectorwiseOp wrapper of *this providing additional partial reduction operations
-  *
-  * Example: \include MatrixBase_rowwise.cpp
-  * Output: \verbinclude MatrixBase_rowwise.out
-  *
-  * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
-  */
-template<typename Derived>
-inline const typename DenseBase<Derived>::ConstRowwiseReturnType
-DenseBase<Derived>::rowwise() const
-{
-  return derived();
-}
+// const rowwise moved to DenseBase.h due to CUDA compiler bug
 
 /** \returns a writable VectorwiseOp wrapper of *this providing additional partial reduction operations
-  *
-  * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
-  */
-template<typename Derived>
-inline typename DenseBase<Derived>::RowwiseReturnType
-DenseBase<Derived>::rowwise()
-{
-  return derived();
+ *
+ * \sa colwise(), class VectorwiseOp, \ref TutorialReductionsVisitorsBroadcasting
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline typename DenseBase<Derived>::RowwiseReturnType DenseBase<Derived>::rowwise() {
+  return RowwiseReturnType(derived());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_PARTIAL_REDUX_H
+#endif  // EIGEN_PARTIAL_REDUX_H
diff --git a/inst/include/Eigen/src/Core/Visitor.h b/inst/include/Eigen/src/Core/Visitor.h
index 64867b7a..e1d2ca52 100644
--- a/inst/include/Eigen/src/Core/Visitor.h
+++ b/inst/include/Eigen/src/Core/Visitor.h
@@ -10,228 +10,528 @@
 #ifndef EIGEN_VISITOR_H
 #define EIGEN_VISITOR_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
-template<typename Visitor, typename Derived, int UnrollCount>
-struct visitor_impl
-{
-  enum {
-    col = (UnrollCount-1) / Derived::RowsAtCompileTime,
-    row = (UnrollCount-1) % Derived::RowsAtCompileTime
-  };
+template <typename Visitor, typename Derived, int UnrollCount,
+          bool Vectorize = (Derived::PacketAccess && functor_traits<Visitor>::PacketAccess), bool LinearAccess = false,
+          bool ShortCircuitEvaluation = false>
+struct visitor_impl;
 
-  static inline void run(const Derived &mat, Visitor& visitor)
-  {
-    visitor_impl<Visitor, Derived, UnrollCount-1>::run(mat, visitor);
-    visitor(mat.coeff(row, col), row, col);
-  }
+template <typename Visitor, bool ShortCircuitEvaluation = false>
+struct short_circuit_eval_impl {
+  // if short circuit evaluation is not used, do nothing
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Visitor&) { return false; }
+};
+template <typename Visitor>
+struct short_circuit_eval_impl<Visitor, true> {
+  // if short circuit evaluation is used, check the visitor
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(const Visitor& visitor) { return visitor.done(); }
 };
 
-template<typename Visitor, typename Derived>
-struct visitor_impl<Visitor, Derived, 1>
-{
-  static inline void run(const Derived &mat, Visitor& visitor)
-  {
-    return visitor.init(mat.coeff(0, 0), 0, 0);
+// unrolled inner-outer traversal
+template <typename Visitor, typename Derived, int UnrollCount, bool Vectorize, bool ShortCircuitEvaluation>
+struct visitor_impl<Visitor, Derived, UnrollCount, Vectorize, false, ShortCircuitEvaluation> {
+  // don't use short circuit evaluation for unrolled version
+  using Scalar = typename Derived::Scalar;
+  using Packet = typename packet_traits<Scalar>::type;
+  static constexpr bool RowMajor = Derived::IsRowMajor;
+  static constexpr int RowsAtCompileTime = Derived::RowsAtCompileTime;
+  static constexpr int ColsAtCompileTime = Derived::ColsAtCompileTime;
+  static constexpr int PacketSize = packet_traits<Scalar>::size;
+
+  static constexpr bool CanVectorize(int K) {
+    constexpr int InnerSizeAtCompileTime = RowMajor ? ColsAtCompileTime : RowsAtCompileTime;
+    if (InnerSizeAtCompileTime < PacketSize) return false;
+    return Vectorize && (InnerSizeAtCompileTime - (K % InnerSizeAtCompileTime) >= PacketSize);
+  }
+
+  template <int K = 0, bool Empty = (K == UnrollCount), std::enable_if_t<Empty, bool> = true>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived&, Visitor&) {}
+
+  template <int K = 0, bool Empty = (K == UnrollCount), bool Initialize = (K == 0), bool DoVectorOp = CanVectorize(K),
+            std::enable_if_t<!Empty && Initialize && !DoVectorOp, bool> = true>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& mat, Visitor& visitor) {
+    visitor.init(mat.coeff(0, 0), 0, 0);
+    run<1>(mat, visitor);
+  }
+
+  template <int K = 0, bool Empty = (K == UnrollCount), bool Initialize = (K == 0), bool DoVectorOp = CanVectorize(K),
+            std::enable_if_t<!Empty && !Initialize && !DoVectorOp, bool> = true>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& mat, Visitor& visitor) {
+    static constexpr int R = RowMajor ? (K / ColsAtCompileTime) : (K % RowsAtCompileTime);
+    static constexpr int C = RowMajor ? (K % ColsAtCompileTime) : (K / RowsAtCompileTime);
+    visitor(mat.coeff(R, C), R, C);
+    run<K + 1>(mat, visitor);
+  }
+
+  template <int K = 0, bool Empty = (K == UnrollCount), bool Initialize = (K == 0), bool DoVectorOp = CanVectorize(K),
+            std::enable_if_t<!Empty && Initialize && DoVectorOp, bool> = true>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& mat, Visitor& visitor) {
+    Packet P = mat.template packet<Packet>(0, 0);
+    visitor.initpacket(P, 0, 0);
+    run<PacketSize>(mat, visitor);
+  }
+
+  template <int K = 0, bool Empty = (K == UnrollCount), bool Initialize = (K == 0), bool DoVectorOp = CanVectorize(K),
+            std::enable_if_t<!Empty && !Initialize && DoVectorOp, bool> = true>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& mat, Visitor& visitor) {
+    static constexpr int R = RowMajor ? (K / ColsAtCompileTime) : (K % RowsAtCompileTime);
+    static constexpr int C = RowMajor ? (K % ColsAtCompileTime) : (K / RowsAtCompileTime);
+    Packet P = mat.template packet<Packet>(R, C);
+    visitor.packet(P, R, C);
+    run<K + PacketSize>(mat, visitor);
   }
 };
 
-template<typename Visitor, typename Derived>
-struct visitor_impl<Visitor, Derived, Dynamic>
-{
-  typedef typename Derived::Index Index;
-  static inline void run(const Derived& mat, Visitor& visitor)
-  {
-    visitor.init(mat.coeff(0,0), 0, 0);
-    for(Index i = 1; i < mat.rows(); ++i)
-      visitor(mat.coeff(i, 0), i, 0);
-    for(Index j = 1; j < mat.cols(); ++j)
-      for(Index i = 0; i < mat.rows(); ++i)
-        visitor(mat.coeff(i, j), i, j);
+// unrolled linear traversal
+template <typename Visitor, typename Derived, int UnrollCount, bool Vectorize, bool ShortCircuitEvaluation>
+struct visitor_impl<Visitor, Derived, UnrollCount, Vectorize, true, ShortCircuitEvaluation> {
+  // don't use short circuit evaluation for unrolled version
+  using Scalar = typename Derived::Scalar;
+  using Packet = typename packet_traits<Scalar>::type;
+  static constexpr int PacketSize = packet_traits<Scalar>::size;
+
+  static constexpr bool CanVectorize(int K) { return Vectorize && ((UnrollCount - K) >= PacketSize); }
+
+  // empty
+  template <int K = 0, bool Empty = (K == UnrollCount), std::enable_if_t<Empty, bool> = true>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived&, Visitor&) {}
+
+  // scalar initialization
+  template <int K = 0, bool Empty = (K == UnrollCount), bool Initialize = (K == 0), bool DoVectorOp = CanVectorize(K),
+            std::enable_if_t<!Empty && Initialize && !DoVectorOp, bool> = true>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& mat, Visitor& visitor) {
+    visitor.init(mat.coeff(0), 0);
+    run<1>(mat, visitor);
+  }
+
+  // scalar iteration
+  template <int K = 0, bool Empty = (K == UnrollCount), bool Initialize = (K == 0), bool DoVectorOp = CanVectorize(K),
+            std::enable_if_t<!Empty && !Initialize && !DoVectorOp, bool> = true>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& mat, Visitor& visitor) {
+    visitor(mat.coeff(K), K);
+    run<K + 1>(mat, visitor);
+  }
+
+  // vector initialization
+  template <int K = 0, bool Empty = (K == UnrollCount), bool Initialize = (K == 0), bool DoVectorOp = CanVectorize(K),
+            std::enable_if_t<!Empty && Initialize && DoVectorOp, bool> = true>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& mat, Visitor& visitor) {
+    Packet P = mat.template packet<Packet>(0);
+    visitor.initpacket(P, 0);
+    run<PacketSize>(mat, visitor);
+  }
+
+  // vector iteration
+  template <int K = 0, bool Empty = (K == UnrollCount), bool Initialize = (K == 0), bool DoVectorOp = CanVectorize(K),
+            std::enable_if_t<!Empty && !Initialize && DoVectorOp, bool> = true>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& mat, Visitor& visitor) {
+    Packet P = mat.template packet<Packet>(K);
+    visitor.packet(P, K);
+    run<K + PacketSize>(mat, visitor);
   }
 };
 
-} // end namespace internal
+// dynamic scalar outer-inner traversal
+template <typename Visitor, typename Derived, bool ShortCircuitEvaluation>
+struct visitor_impl<Visitor, Derived, Dynamic, /*Vectorize=*/false, /*LinearAccess=*/false, ShortCircuitEvaluation> {
+  using short_circuit = short_circuit_eval_impl<Visitor, ShortCircuitEvaluation>;
+  static constexpr bool RowMajor = Derived::IsRowMajor;
 
-/** Applies the visitor \a visitor to the whole coefficients of the matrix or vector.
-  *
-  * The template parameter \a Visitor is the type of the visitor and provides the following interface:
-  * \code
-  * struct MyVisitor {
-  *   // called for the first coefficient
-  *   void init(const Scalar& value, Index i, Index j);
-  *   // called for all other coefficients
-  *   void operator() (const Scalar& value, Index i, Index j);
-  * };
-  * \endcode
-  *
-  * \note compared to one or two \em for \em loops, visitors offer automatic
-  * unrolling for small fixed size matrix.
-  *
-  * \sa minCoeff(Index*,Index*), maxCoeff(Index*,Index*), DenseBase::redux()
-  */
-template<typename Derived>
-template<typename Visitor>
-void DenseBase<Derived>::visit(Visitor& visitor) const
-{
-  enum { unroll = SizeAtCompileTime != Dynamic
-                   && CoeffReadCost != Dynamic
-                   && (SizeAtCompileTime == 1 || internal::functor_traits<Visitor>::Cost != Dynamic)
-                   && SizeAtCompileTime * CoeffReadCost + (SizeAtCompileTime-1) * internal::functor_traits<Visitor>::Cost
-                      <= EIGEN_UNROLLING_LIMIT };
-  return internal::visitor_impl<Visitor, Derived,
-      unroll ? int(SizeAtCompileTime) : Dynamic
-    >::run(derived(), visitor);
-}
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& mat, Visitor& visitor) {
+    const Index innerSize = RowMajor ? mat.cols() : mat.rows();
+    const Index outerSize = RowMajor ? mat.rows() : mat.cols();
+    if (innerSize == 0 || outerSize == 0) return;
+    {
+      visitor.init(mat.coeff(0, 0), 0, 0);
+      if (short_circuit::run(visitor)) return;
+      for (Index i = 1; i < innerSize; ++i) {
+        Index r = RowMajor ? 0 : i;
+        Index c = RowMajor ? i : 0;
+        visitor(mat.coeff(r, c), r, c);
+        if EIGEN_PREDICT_FALSE (short_circuit::run(visitor)) return;
+      }
+    }
+    for (Index j = 1; j < outerSize; j++) {
+      for (Index i = 0; i < innerSize; ++i) {
+        Index r = RowMajor ? j : i;
+        Index c = RowMajor ? i : j;
+        visitor(mat.coeff(r, c), r, c);
+        if EIGEN_PREDICT_FALSE (short_circuit::run(visitor)) return;
+      }
+    }
+  }
+};
 
-namespace internal {
+// dynamic vectorized outer-inner traversal
+template <typename Visitor, typename Derived, bool ShortCircuitEvaluation>
+struct visitor_impl<Visitor, Derived, Dynamic, /*Vectorize=*/true, /*LinearAccess=*/false, ShortCircuitEvaluation> {
+  using Scalar = typename Derived::Scalar;
+  using Packet = typename packet_traits<Scalar>::type;
+  static constexpr int PacketSize = packet_traits<Scalar>::size;
+  using short_circuit = short_circuit_eval_impl<Visitor, ShortCircuitEvaluation>;
+  static constexpr bool RowMajor = Derived::IsRowMajor;
 
-/** \internal
-  * \brief Base class to implement min and max visitors
-  */
-template <typename Derived>
-struct coeff_visitor
-{
-  typedef typename Derived::Index Index;
-  typedef typename Derived::Scalar Scalar;
-  Index row, col;
-  Scalar res;
-  inline void init(const Scalar& value, Index i, Index j)
-  {
-    res = value;
-    row = i;
-    col = j;
-  }
-};
-
-/** \internal
-  * \brief Visitor computing the min coefficient with its value and coordinates
-  *
-  * \sa DenseBase::minCoeff(Index*, Index*)
-  */
-template <typename Derived>
-struct min_coeff_visitor : coeff_visitor<Derived>
-{
-  typedef typename Derived::Index Index;
-  typedef typename Derived::Scalar Scalar;
-  void operator() (const Scalar& value, Index i, Index j)
-  {
-    if(value < this->res)
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& mat, Visitor& visitor) {
+    const Index innerSize = RowMajor ? mat.cols() : mat.rows();
+    const Index outerSize = RowMajor ? mat.rows() : mat.cols();
+    if (innerSize == 0 || outerSize == 0) return;
     {
-      this->res = value;
-      this->row = i;
-      this->col = j;
+      Index i = 0;
+      if (innerSize < PacketSize) {
+        visitor.init(mat.coeff(0, 0), 0, 0);
+        i = 1;
+      } else {
+        Packet p = mat.template packet<Packet>(0, 0);
+        visitor.initpacket(p, 0, 0);
+        i = PacketSize;
+      }
+      if EIGEN_PREDICT_FALSE (short_circuit::run(visitor)) return;
+      for (; i + PacketSize - 1 < innerSize; i += PacketSize) {
+        Index r = RowMajor ? 0 : i;
+        Index c = RowMajor ? i : 0;
+        Packet p = mat.template packet<Packet>(r, c);
+        visitor.packet(p, r, c);
+        if EIGEN_PREDICT_FALSE (short_circuit::run(visitor)) return;
+      }
+      for (; i < innerSize; ++i) {
+        Index r = RowMajor ? 0 : i;
+        Index c = RowMajor ? i : 0;
+        visitor(mat.coeff(r, c), r, c);
+        if EIGEN_PREDICT_FALSE (short_circuit::run(visitor)) return;
+      }
+    }
+    for (Index j = 1; j < outerSize; j++) {
+      Index i = 0;
+      for (; i + PacketSize - 1 < innerSize; i += PacketSize) {
+        Index r = RowMajor ? j : i;
+        Index c = RowMajor ? i : j;
+        Packet p = mat.template packet<Packet>(r, c);
+        visitor.packet(p, r, c);
+        if EIGEN_PREDICT_FALSE (short_circuit::run(visitor)) return;
+      }
+      for (; i < innerSize; ++i) {
+        Index r = RowMajor ? j : i;
+        Index c = RowMajor ? i : j;
+        visitor(mat.coeff(r, c), r, c);
+        if EIGEN_PREDICT_FALSE (short_circuit::run(visitor)) return;
+      }
     }
   }
 };
 
-template<typename Scalar>
-struct functor_traits<min_coeff_visitor<Scalar> > {
-  enum {
-    Cost = NumTraits<Scalar>::AddCost
-  };
+// dynamic scalar linear traversal
+template <typename Visitor, typename Derived, bool ShortCircuitEvaluation>
+struct visitor_impl<Visitor, Derived, Dynamic, /*Vectorize=*/false, /*LinearAccess=*/true, ShortCircuitEvaluation> {
+  using short_circuit = short_circuit_eval_impl<Visitor, ShortCircuitEvaluation>;
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& mat, Visitor& visitor) {
+    const Index size = mat.size();
+    if (size == 0) return;
+    visitor.init(mat.coeff(0), 0);
+    if EIGEN_PREDICT_FALSE (short_circuit::run(visitor)) return;
+    for (Index k = 1; k < size; k++) {
+      visitor(mat.coeff(k), k);
+      if EIGEN_PREDICT_FALSE (short_circuit::run(visitor)) return;
+    }
+  }
 };
 
-/** \internal
-  * \brief Visitor computing the max coefficient with its value and coordinates
-  *
-  * \sa DenseBase::maxCoeff(Index*, Index*)
-  */
-template <typename Derived>
-struct max_coeff_visitor : coeff_visitor<Derived>
-{
-  typedef typename Derived::Index Index;
-  typedef typename Derived::Scalar Scalar;
-  void operator() (const Scalar& value, Index i, Index j)
-  {
-    if(value > this->res)
-    {
-      this->res = value;
-      this->row = i;
-      this->col = j;
+// dynamic vectorized linear traversal
+template <typename Visitor, typename Derived, bool ShortCircuitEvaluation>
+struct visitor_impl<Visitor, Derived, Dynamic, /*Vectorize=*/true, /*LinearAccess=*/true, ShortCircuitEvaluation> {
+  using Scalar = typename Derived::Scalar;
+  using Packet = typename packet_traits<Scalar>::type;
+  static constexpr int PacketSize = packet_traits<Scalar>::size;
+  using short_circuit = short_circuit_eval_impl<Visitor, ShortCircuitEvaluation>;
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const Derived& mat, Visitor& visitor) {
+    const Index size = mat.size();
+    if (size == 0) return;
+    Index k = 0;
+    if (size < PacketSize) {
+      visitor.init(mat.coeff(0), 0);
+      k = 1;
+    } else {
+      Packet p = mat.template packet<Packet>(k);
+      visitor.initpacket(p, k);
+      k = PacketSize;
+    }
+    if EIGEN_PREDICT_FALSE (short_circuit::run(visitor)) return;
+    for (; k + PacketSize - 1 < size; k += PacketSize) {
+      Packet p = mat.template packet<Packet>(k);
+      visitor.packet(p, k);
+      if EIGEN_PREDICT_FALSE (short_circuit::run(visitor)) return;
+    }
+    for (; k < size; k++) {
+      visitor(mat.coeff(k), k);
+      if EIGEN_PREDICT_FALSE (short_circuit::run(visitor)) return;
     }
   }
 };
 
-template<typename Scalar>
-struct functor_traits<max_coeff_visitor<Scalar> > {
+// evaluator adaptor
+template <typename XprType>
+class visitor_evaluator {
+ public:
+  typedef evaluator<XprType> Evaluator;
+  typedef typename XprType::Scalar Scalar;
+  using Packet = typename packet_traits<Scalar>::type;
+  typedef std::remove_const_t<typename XprType::CoeffReturnType> CoeffReturnType;
+
+  static constexpr bool PacketAccess = static_cast<bool>(Evaluator::Flags & PacketAccessBit);
+  static constexpr bool LinearAccess = static_cast<bool>(Evaluator::Flags & LinearAccessBit);
+  static constexpr bool IsRowMajor = static_cast<bool>(XprType::IsRowMajor);
+  static constexpr int RowsAtCompileTime = XprType::RowsAtCompileTime;
+  static constexpr int ColsAtCompileTime = XprType::ColsAtCompileTime;
+  static constexpr int XprAlignment = Evaluator::Alignment;
+  static constexpr int CoeffReadCost = Evaluator::CoeffReadCost;
+
+  EIGEN_DEVICE_FUNC explicit visitor_evaluator(const XprType& xpr) : m_evaluator(xpr), m_xpr(xpr) {}
+
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_xpr.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_xpr.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index size() const noexcept { return m_xpr.size(); }
+  // outer-inner access
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+    return m_evaluator.coeff(row, col);
+  }
+  template <typename Packet, int Alignment = Unaligned>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packet(Index row, Index col) const {
+    return m_evaluator.template packet<Alignment, Packet>(row, col);
+  }
+  // linear access
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_evaluator.coeff(index); }
+  template <typename Packet, int Alignment = XprAlignment>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packet(Index index) const {
+    return m_evaluator.template packet<Alignment, Packet>(index);
+  }
+
+ protected:
+  Evaluator m_evaluator;
+  const XprType& m_xpr;
+};
+
+template <typename Derived, typename Visitor, bool ShortCircuitEvaulation>
+struct visit_impl {
+  using Evaluator = visitor_evaluator<Derived>;
+  using Scalar = typename DenseBase<Derived>::Scalar;
+
+  static constexpr bool IsRowMajor = DenseBase<Derived>::IsRowMajor;
+  static constexpr int SizeAtCompileTime = DenseBase<Derived>::SizeAtCompileTime;
+  static constexpr int RowsAtCompileTime = DenseBase<Derived>::RowsAtCompileTime;
+  static constexpr int ColsAtCompileTime = DenseBase<Derived>::ColsAtCompileTime;
+  static constexpr int InnerSizeAtCompileTime = IsRowMajor ? ColsAtCompileTime : RowsAtCompileTime;
+  static constexpr int OuterSizeAtCompileTime = IsRowMajor ? RowsAtCompileTime : ColsAtCompileTime;
+
+  static constexpr bool LinearAccess =
+      Evaluator::LinearAccess && static_cast<bool>(functor_traits<Visitor>::LinearAccess);
+  static constexpr bool Vectorize = Evaluator::PacketAccess && static_cast<bool>(functor_traits<Visitor>::PacketAccess);
+
+  static constexpr int PacketSize = packet_traits<Scalar>::size;
+  static constexpr int VectorOps =
+      Vectorize ? (LinearAccess ? (SizeAtCompileTime / PacketSize)
+                                : (OuterSizeAtCompileTime * (InnerSizeAtCompileTime / PacketSize)))
+                : 0;
+  static constexpr int ScalarOps = SizeAtCompileTime - (VectorOps * PacketSize);
+  // treat vector op and scalar op as same cost for unroll logic
+  static constexpr int TotalOps = VectorOps + ScalarOps;
+
+  static constexpr int UnrollCost = int(Evaluator::CoeffReadCost) + int(functor_traits<Visitor>::Cost);
+  static constexpr bool Unroll = (SizeAtCompileTime != Dynamic) && ((TotalOps * UnrollCost) <= EIGEN_UNROLLING_LIMIT);
+  static constexpr int UnrollCount = Unroll ? int(SizeAtCompileTime) : Dynamic;
+
+  using impl = visitor_impl<Visitor, Evaluator, UnrollCount, Vectorize, LinearAccess, ShortCircuitEvaulation>;
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(const DenseBase<Derived>& mat, Visitor& visitor) {
+    Evaluator evaluator(mat.derived());
+    impl::run(evaluator, visitor);
+  }
+};
+
+}  // end namespace internal
+
+/** Applies the visitor \a visitor to the whole coefficients of the matrix or vector.
+ *
+ * The template parameter \a Visitor is the type of the visitor and provides the following interface:
+ * \code
+ * struct MyVisitor {
+ *   // called for the first coefficient
+ *   void init(const Scalar& value, Index i, Index j);
+ *   // called for all other coefficients
+ *   void operator() (const Scalar& value, Index i, Index j);
+ * };
+ * \endcode
+ *
+ * \note compared to one or two \em for \em loops, visitors offer automatic
+ * unrolling for small fixed size matrix.
+ *
+ * \note if the matrix is empty, then the visitor is left unchanged.
+ *
+ * \sa minCoeff(Index*,Index*), maxCoeff(Index*,Index*), DenseBase::redux()
+ */
+template <typename Derived>
+template <typename Visitor>
+EIGEN_DEVICE_FUNC void DenseBase<Derived>::visit(Visitor& visitor) const {
+  using impl = internal::visit_impl<Derived, Visitor, /*ShortCircuitEvaulation*/ false>;
+  impl::run(derived(), visitor);
+}
+
+namespace internal {
+
+template <typename Scalar>
+struct all_visitor {
+  using result_type = bool;
+  using Packet = typename packet_traits<Scalar>::type;
+  EIGEN_DEVICE_FUNC inline void init(const Scalar& value, Index, Index) { res = (value != Scalar(0)); }
+  EIGEN_DEVICE_FUNC inline void init(const Scalar& value, Index) { res = (value != Scalar(0)); }
+  EIGEN_DEVICE_FUNC inline bool all_predux(const Packet& p) const { return !predux_any(pcmp_eq(p, pzero(p))); }
+  EIGEN_DEVICE_FUNC inline void initpacket(const Packet& p, Index, Index) { res = all_predux(p); }
+  EIGEN_DEVICE_FUNC inline void initpacket(const Packet& p, Index) { res = all_predux(p); }
+  EIGEN_DEVICE_FUNC inline void operator()(const Scalar& value, Index, Index) { res = res && (value != Scalar(0)); }
+  EIGEN_DEVICE_FUNC inline void operator()(const Scalar& value, Index) { res = res && (value != Scalar(0)); }
+  EIGEN_DEVICE_FUNC inline void packet(const Packet& p, Index, Index) { res = res && all_predux(p); }
+  EIGEN_DEVICE_FUNC inline void packet(const Packet& p, Index) { res = res && all_predux(p); }
+  EIGEN_DEVICE_FUNC inline bool done() const { return !res; }
+  bool res = true;
+};
+template <typename Scalar>
+struct functor_traits<all_visitor<Scalar>> {
+  enum { Cost = NumTraits<Scalar>::ReadCost, LinearAccess = true, PacketAccess = packet_traits<Scalar>::HasCmp };
+};
+
+template <typename Scalar>
+struct any_visitor {
+  using result_type = bool;
+  using Packet = typename packet_traits<Scalar>::type;
+  EIGEN_DEVICE_FUNC inline void init(const Scalar& value, Index, Index) { res = (value != Scalar(0)); }
+  EIGEN_DEVICE_FUNC inline void init(const Scalar& value, Index) { res = (value != Scalar(0)); }
+  EIGEN_DEVICE_FUNC inline bool any_predux(const Packet& p) const {
+    return predux_any(pandnot(ptrue(p), pcmp_eq(p, pzero(p))));
+  }
+  EIGEN_DEVICE_FUNC inline void initpacket(const Packet& p, Index, Index) { res = any_predux(p); }
+  EIGEN_DEVICE_FUNC inline void initpacket(const Packet& p, Index) { res = any_predux(p); }
+  EIGEN_DEVICE_FUNC inline void operator()(const Scalar& value, Index, Index) { res = res || (value != Scalar(0)); }
+  EIGEN_DEVICE_FUNC inline void operator()(const Scalar& value, Index) { res = res || (value != Scalar(0)); }
+  EIGEN_DEVICE_FUNC inline void packet(const Packet& p, Index, Index) { res = res || any_predux(p); }
+  EIGEN_DEVICE_FUNC inline void packet(const Packet& p, Index) { res = res || any_predux(p); }
+  EIGEN_DEVICE_FUNC inline bool done() const { return res; }
+  bool res = false;
+};
+template <typename Scalar>
+struct functor_traits<any_visitor<Scalar>> {
+  enum { Cost = NumTraits<Scalar>::ReadCost, LinearAccess = true, PacketAccess = packet_traits<Scalar>::HasCmp };
+};
+
+template <typename Scalar>
+struct count_visitor {
+  using result_type = Index;
+  using Packet = typename packet_traits<Scalar>::type;
+  EIGEN_DEVICE_FUNC inline void init(const Scalar& value, Index, Index) { res = value != Scalar(0) ? 1 : 0; }
+  EIGEN_DEVICE_FUNC inline void init(const Scalar& value, Index) { res = value != Scalar(0) ? 1 : 0; }
+  EIGEN_DEVICE_FUNC inline Index count_redux(const Packet& p) const {
+    const Packet cst_one = pset1<Packet>(Scalar(1));
+    Packet true_vals = pandnot(cst_one, pcmp_eq(p, pzero(p)));
+    Scalar num_true = predux(true_vals);
+    return static_cast<Index>(num_true);
+  }
+  EIGEN_DEVICE_FUNC inline void initpacket(const Packet& p, Index, Index) { res = count_redux(p); }
+  EIGEN_DEVICE_FUNC inline void initpacket(const Packet& p, Index) { res = count_redux(p); }
+  EIGEN_DEVICE_FUNC inline void operator()(const Scalar& value, Index, Index) {
+    if (value != Scalar(0)) res++;
+  }
+  EIGEN_DEVICE_FUNC inline void operator()(const Scalar& value, Index) {
+    if (value != Scalar(0)) res++;
+  }
+  EIGEN_DEVICE_FUNC inline void packet(const Packet& p, Index, Index) { res += count_redux(p); }
+  EIGEN_DEVICE_FUNC inline void packet(const Packet& p, Index) { res += count_redux(p); }
+  Index res = 0;
+};
+
+template <typename Scalar>
+struct functor_traits<count_visitor<Scalar>> {
   enum {
-    Cost = NumTraits<Scalar>::AddCost
+    Cost = NumTraits<Scalar>::AddCost,
+    LinearAccess = true,
+    // predux is problematic for bool
+    PacketAccess = packet_traits<Scalar>::HasCmp && packet_traits<Scalar>::HasAdd && !is_same<Scalar, bool>::value
   };
 };
 
-} // end namespace internal
-
-/** \returns the minimum of all coefficients of *this and puts in *row and *col its location.
-  * \warning the result is undefined if \c *this contains NaN.
-  *
-  * \sa DenseBase::minCoeff(Index*), DenseBase::maxCoeff(Index*,Index*), DenseBase::visitor(), DenseBase::minCoeff()
-  */
-template<typename Derived>
-template<typename IndexType>
-typename internal::traits<Derived>::Scalar
-DenseBase<Derived>::minCoeff(IndexType* rowId, IndexType* colId) const
-{
-  internal::min_coeff_visitor<Derived> minVisitor;
-  this->visit(minVisitor);
-  *rowId = minVisitor.row;
-  if (colId) *colId = minVisitor.col;
-  return minVisitor.res;
+template <typename Derived, bool AlwaysTrue = NumTraits<typename traits<Derived>::Scalar>::IsInteger>
+struct all_finite_impl {
+  static EIGEN_DEVICE_FUNC inline bool run(const Derived& /*derived*/) { return true; }
+};
+#if !defined(__FINITE_MATH_ONLY__) || !(__FINITE_MATH_ONLY__)
+template <typename Derived>
+struct all_finite_impl<Derived, false> {
+  static EIGEN_DEVICE_FUNC inline bool run(const Derived& derived) { return derived.array().isFiniteTyped().all(); }
+};
+#endif
+
+}  // end namespace internal
+
+/** \returns true if all coefficients are true
+ *
+ * Example: \include MatrixBase_all.cpp
+ * Output: \verbinclude MatrixBase_all.out
+ *
+ * \sa any(), Cwise::operator<()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::all() const {
+  using Visitor = internal::all_visitor<Scalar>;
+  using impl = internal::visit_impl<Derived, Visitor, /*ShortCircuitEvaulation*/ true>;
+  Visitor visitor;
+  impl::run(derived(), visitor);
+  return visitor.res;
 }
 
-/** \returns the minimum of all coefficients of *this and puts in *index its location.
-  * \warning the result is undefined if \c *this contains NaN. 
-  *
-  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::minCoeff()
-  */
-template<typename Derived>
-template<typename IndexType>
-typename internal::traits<Derived>::Scalar
-DenseBase<Derived>::minCoeff(IndexType* index) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  internal::min_coeff_visitor<Derived> minVisitor;
-  this->visit(minVisitor);
-  *index = (RowsAtCompileTime==1) ? minVisitor.col : minVisitor.row;
-  return minVisitor.res;
+/** \returns true if at least one coefficient is true
+ *
+ * \sa all()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::any() const {
+  using Visitor = internal::any_visitor<Scalar>;
+  using impl = internal::visit_impl<Derived, Visitor, /*ShortCircuitEvaulation*/ true>;
+  Visitor visitor;
+  impl::run(derived(), visitor);
+  return visitor.res;
 }
 
-/** \returns the maximum of all coefficients of *this and puts in *row and *col its location.
-  * \warning the result is undefined if \c *this contains NaN. 
-  *
-  * \sa DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff()
-  */
-template<typename Derived>
-template<typename IndexType>
-typename internal::traits<Derived>::Scalar
-DenseBase<Derived>::maxCoeff(IndexType* rowPtr, IndexType* colPtr) const
-{
-  internal::max_coeff_visitor<Derived> maxVisitor;
-  this->visit(maxVisitor);
-  *rowPtr = maxVisitor.row;
-  if (colPtr) *colPtr = maxVisitor.col;
-  return maxVisitor.res;
+/** \returns the number of coefficients which evaluate to true
+ *
+ * \sa all(), any()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC Index DenseBase<Derived>::count() const {
+  using Visitor = internal::count_visitor<Scalar>;
+  using impl = internal::visit_impl<Derived, Visitor, /*ShortCircuitEvaulation*/ false>;
+  Visitor visitor;
+  impl::run(derived(), visitor);
+  return visitor.res;
 }
 
-/** \returns the maximum of all coefficients of *this and puts in *index its location.
-  * \warning the result is undefined if \c *this contains NaN.
-  *
-  * \sa DenseBase::maxCoeff(IndexType*,IndexType*), DenseBase::minCoeff(IndexType*,IndexType*), DenseBase::visitor(), DenseBase::maxCoeff()
-  */
-template<typename Derived>
-template<typename IndexType>
-typename internal::traits<Derived>::Scalar
-DenseBase<Derived>::maxCoeff(IndexType* index) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  internal::max_coeff_visitor<Derived> maxVisitor;
-  this->visit(maxVisitor);
-  *index = (RowsAtCompileTime==1) ? maxVisitor.col : maxVisitor.row;
-  return maxVisitor.res;
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::hasNaN() const {
+  return derived().cwiseTypedNotEqual(derived()).any();
+}
+
+/** \returns true if \c *this contains only finite numbers, i.e., no NaN and no +/-INF values.
+ *
+ * \sa hasNaN()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline bool DenseBase<Derived>::allFinite() const {
+  return internal::all_finite_impl<Derived>::run(derived());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_VISITOR_H
+#endif  // EIGEN_VISITOR_H
diff --git a/inst/include/Eigen/src/Core/arch/AVX/Complex.h b/inst/include/Eigen/src/Core/arch/AVX/Complex.h
new file mode 100644
index 00000000..a4a87c4f
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AVX/Complex.h
@@ -0,0 +1,565 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_AVX_H
+#define EIGEN_COMPLEX_AVX_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet4cf {
+  EIGEN_STRONG_INLINE Packet4cf() {}
+  EIGEN_STRONG_INLINE explicit Packet4cf(const __m256& a) : v(a) {}
+  __m256 v;
+};
+
+#ifndef EIGEN_VECTORIZE_AVX512
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
+  typedef Packet4cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasSqrt = 1,
+    HasLog = 1,
+    HasExp = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0
+  };
+};
+#endif
+
+template <>
+struct unpacket_traits<Packet4cf> {
+  typedef std::complex<float> type;
+  typedef Packet2cf half;
+  typedef Packet8f as_real;
+  enum {
+    size = 4,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet4cf padd<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
+  return Packet4cf(_mm256_add_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf psub<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
+  return Packet4cf(_mm256_sub_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pnegate(const Packet4cf& a) {
+  return Packet4cf(pnegate(a.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pconj(const Packet4cf& a) {
+  const __m256 mask = _mm256_castsi256_ps(_mm256_setr_epi32(0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000,
+                                                            0x80000000, 0x00000000, 0x80000000));
+  return Packet4cf(_mm256_xor_ps(a.v, mask));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& a, const Packet4cf& b) {
+  __m256 tmp1 = _mm256_mul_ps(_mm256_movehdup_ps(a.v), _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1)));
+  __m256 tmp2 = _mm256_moveldup_ps(a.v);
+#ifdef EIGEN_VECTORIZE_FMA
+  __m256 result = _mm256_fmaddsub_ps(tmp2, b.v, tmp1);
+#else
+  __m256 result = _mm256_addsub_ps(_mm256_mul_ps(tmp2, b.v), tmp1);
+#endif
+  return Packet4cf(result);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cf pcmp_eq(const Packet4cf& a, const Packet4cf& b) {
+  __m256 eq = _mm256_cmp_ps(a.v, b.v, _CMP_EQ_OQ);
+  return Packet4cf(_mm256_and_ps(eq, _mm256_permute_ps(eq, 0xb1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cf ptrue<Packet4cf>(const Packet4cf& a) {
+  return Packet4cf(ptrue(Packet8f(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pand<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
+  return Packet4cf(_mm256_and_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf por<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
+  return Packet4cf(_mm256_or_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pxor<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
+  return Packet4cf(_mm256_xor_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pandnot<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
+  return Packet4cf(_mm256_andnot_ps(b.v, a.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cf pload<Packet4cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet4cf(_mm256_load_ps(&numext::real_ref(*from)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf ploadu<Packet4cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cf(_mm256_loadu_ps(&numext::real_ref(*from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cf pset1<Packet4cf>(const std::complex<float>& from) {
+  const float re = std::real(from);
+  const float im = std::imag(from);
+  return Packet4cf(_mm256_set_ps(im, re, im, re, im, re, im, re));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cf ploaddup<Packet4cf>(const std::complex<float>* from) {
+  // FIXME The following might be optimized using _mm256_movedup_pd
+  Packet2cf a = ploaddup<Packet2cf>(from);
+  Packet2cf b = ploaddup<Packet2cf>(from + 1);
+  return Packet4cf(_mm256_insertf128_ps(_mm256_castps128_ps256(a.v), b.v, 1));
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(&numext::real_ref(*to), from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet4cf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(&numext::real_ref(*to), from.v);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4cf pgather<std::complex<float>, Packet4cf>(const std::complex<float>* from,
+                                                                           Index stride) {
+  return Packet4cf(_mm256_set_ps(std::imag(from[3 * stride]), std::real(from[3 * stride]), std::imag(from[2 * stride]),
+                                 std::real(from[2 * stride]), std::imag(from[1 * stride]), std::real(from[1 * stride]),
+                                 std::imag(from[0 * stride]), std::real(from[0 * stride])));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from,
+                                                                       Index stride) {
+  __m128 low = _mm256_extractf128_ps(from.v, 0);
+  to[stride * 0] =
+      std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 0)), _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1)));
+  to[stride * 1] =
+      std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(low, low, 2)), _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3)));
+
+  __m128 high = _mm256_extractf128_ps(from.v, 1);
+  to[stride * 2] =
+      std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 0)), _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1)));
+  to[stride * 3] =
+      std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(high, high, 2)), _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3)));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet4cf>(const Packet4cf& a) {
+  return pfirst(Packet2cf(_mm256_castps256_ps128(a.v)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cf preverse(const Packet4cf& a) {
+  __m128 low = _mm256_extractf128_ps(a.v, 0);
+  __m128 high = _mm256_extractf128_ps(a.v, 1);
+  __m128d lowd = _mm_castps_pd(low);
+  __m128d highd = _mm_castps_pd(high);
+  low = _mm_castpd_ps(_mm_shuffle_pd(lowd, lowd, 0x1));
+  high = _mm_castpd_ps(_mm_shuffle_pd(highd, highd, 0x1));
+  __m256 result = _mm256_setzero_ps();
+  result = _mm256_insertf128_ps(result, low, 1);
+  result = _mm256_insertf128_ps(result, high, 0);
+  return Packet4cf(result);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet4cf>(const Packet4cf& a) {
+  return predux(padd(Packet2cf(_mm256_extractf128_ps(a.v, 0)), Packet2cf(_mm256_extractf128_ps(a.v, 1))));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet4cf>(const Packet4cf& a) {
+  return predux_mul(pmul(Packet2cf(_mm256_extractf128_ps(a.v, 0)), Packet2cf(_mm256_extractf128_ps(a.v, 1))));
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf, Packet8f)
+
+template <>
+EIGEN_STRONG_INLINE Packet4cf pdiv<Packet4cf>(const Packet4cf& a, const Packet4cf& b) {
+  return pdiv_complex(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cf pcplxflip<Packet4cf>(const Packet4cf& x) {
+  return Packet4cf(_mm256_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0, 1)));
+}
+
+//---------- double ----------
+struct Packet2cd {
+  EIGEN_STRONG_INLINE Packet2cd() {}
+  EIGEN_STRONG_INLINE explicit Packet2cd(const __m256d& a) : v(a) {}
+  __m256d v;
+};
+
+#ifndef EIGEN_VECTORIZE_AVX512
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
+  typedef Packet2cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 2,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasSqrt = 1,
+    HasLog = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0
+  };
+};
+#endif
+
+template <>
+struct unpacket_traits<Packet2cd> {
+  typedef std::complex<double> type;
+  typedef Packet1cd half;
+  typedef Packet4d as_real;
+  enum {
+    size = 2,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet2cd padd<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+  return Packet2cd(_mm256_add_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd psub<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+  return Packet2cd(_mm256_sub_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pnegate(const Packet2cd& a) {
+  return Packet2cd(pnegate(a.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pconj(const Packet2cd& a) {
+  const __m256d mask = _mm256_castsi256_pd(_mm256_set_epi32(0x80000000, 0x0, 0x0, 0x0, 0x80000000, 0x0, 0x0, 0x0));
+  return Packet2cd(_mm256_xor_pd(a.v, mask));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& a, const Packet2cd& b) {
+  __m256d tmp1 = _mm256_mul_pd(_mm256_permute_pd(a.v, 0xF), _mm256_permute_pd(b.v, 0x5));
+  __m256d tmp2 = _mm256_movedup_pd(a.v);
+#ifdef EIGEN_VECTORIZE_FMA
+  __m256d result = _mm256_fmaddsub_pd(tmp2, b.v, tmp1);
+#else
+  __m256d result = _mm256_addsub_pd(_mm256_mul_pd(tmp2, b.v), tmp1);
+#endif
+  return Packet2cd(result);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cd pcmp_eq(const Packet2cd& a, const Packet2cd& b) {
+  __m256d eq = _mm256_cmp_pd(a.v, b.v, _CMP_EQ_OQ);
+  return Packet2cd(pand(eq, _mm256_permute_pd(eq, 0x5)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cd ptrue<Packet2cd>(const Packet2cd& a) {
+  return Packet2cd(ptrue(Packet4d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pand<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+  return Packet2cd(_mm256_and_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd por<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+  return Packet2cd(_mm256_or_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pxor<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+  return Packet2cd(_mm256_xor_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pandnot<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+  return Packet2cd(_mm256_andnot_pd(b.v, a.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cd pload<Packet2cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cd(_mm256_load_pd((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd ploadu<Packet2cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cd(_mm256_loadu_pd((const double*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cd pset1<Packet2cd>(const std::complex<double>& from) {
+  // in case casting to a __m128d* is really not safe, then we can still fallback to this version: (much slower though)
+  //   return Packet2cd(_mm256_loadu2_m128d((const double*)&from,(const double*)&from));
+  return Packet2cd(_mm256_broadcast_pd((const __m128d*)(const void*)&from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cd ploaddup<Packet2cd>(const std::complex<double>* from) {
+  return pset1<Packet2cd>(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet2cd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd((double*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet2cd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd((double*)to, from.v);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cd pgather<std::complex<double>, Packet2cd>(const std::complex<double>* from,
+                                                                            Index stride) {
+  return Packet2cd(_mm256_set_pd(std::imag(from[1 * stride]), std::real(from[1 * stride]), std::imag(from[0 * stride]),
+                                 std::real(from[0 * stride])));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from,
+                                                                        Index stride) {
+  __m128d low = _mm256_extractf128_pd(from.v, 0);
+  to[stride * 0] = std::complex<double>(_mm_cvtsd_f64(low), _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1)));
+  __m128d high = _mm256_extractf128_pd(from.v, 1);
+  to[stride * 1] = std::complex<double>(_mm_cvtsd_f64(high), _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet2cd>(const Packet2cd& a) {
+  __m128d low = _mm256_extractf128_pd(a.v, 0);
+  EIGEN_ALIGN16 double res[2];
+  _mm_store_pd(res, low);
+  return std::complex<double>(res[0], res[1]);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cd preverse(const Packet2cd& a) {
+  __m256d result = _mm256_permute2f128_pd(a.v, a.v, 1);
+  return Packet2cd(result);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet2cd>(const Packet2cd& a) {
+  return predux(padd(Packet1cd(_mm256_extractf128_pd(a.v, 0)), Packet1cd(_mm256_extractf128_pd(a.v, 1))));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet2cd>(const Packet2cd& a) {
+  return predux(pmul(Packet1cd(_mm256_extractf128_pd(a.v, 0)), Packet1cd(_mm256_extractf128_pd(a.v, 1))));
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd, Packet4d)
+
+template <>
+EIGEN_STRONG_INLINE Packet2cd pdiv<Packet2cd>(const Packet2cd& a, const Packet2cd& b) {
+  return pdiv_complex(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cd pcplxflip<Packet2cd>(const Packet2cd& x) {
+  return Packet2cd(_mm256_shuffle_pd(x.v, x.v, 0x5));
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4cf, 4>& kernel) {
+  __m256d P0 = _mm256_castps_pd(kernel.packet[0].v);
+  __m256d P1 = _mm256_castps_pd(kernel.packet[1].v);
+  __m256d P2 = _mm256_castps_pd(kernel.packet[2].v);
+  __m256d P3 = _mm256_castps_pd(kernel.packet[3].v);
+
+  __m256d T0 = _mm256_shuffle_pd(P0, P1, 15);
+  __m256d T1 = _mm256_shuffle_pd(P0, P1, 0);
+  __m256d T2 = _mm256_shuffle_pd(P2, P3, 15);
+  __m256d T3 = _mm256_shuffle_pd(P2, P3, 0);
+
+  kernel.packet[1].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 32));
+  kernel.packet[3].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T0, T2, 49));
+  kernel.packet[0].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 32));
+  kernel.packet[2].v = _mm256_castpd_ps(_mm256_permute2f128_pd(T1, T3, 49));
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cd, 2>& kernel) {
+  __m256d tmp = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 0 + (2 << 4));
+  kernel.packet[1].v = _mm256_permute2f128_pd(kernel.packet[0].v, kernel.packet[1].v, 1 + (3 << 4));
+  kernel.packet[0].v = tmp;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cd psqrt<Packet2cd>(const Packet2cd& a) {
+  return psqrt_complex<Packet2cd>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cf psqrt<Packet4cf>(const Packet4cf& a) {
+  return psqrt_complex<Packet4cf>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cd plog<Packet2cd>(const Packet2cd& a) {
+  return plog_complex<Packet2cd>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cf plog<Packet4cf>(const Packet4cf& a) {
+  return plog_complex<Packet4cf>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cf pexp<Packet4cf>(const Packet4cf& a) {
+  return pexp_complex<Packet4cf>(a);
+}
+
+#ifdef EIGEN_VECTORIZE_FMA
+// std::complex<float>
+template <>
+EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& a, const Packet4cf& b, const Packet4cf& c) {
+  __m256 a_odd = _mm256_movehdup_ps(a.v);
+  __m256 a_even = _mm256_moveldup_ps(a.v);
+  __m256 b_swap = _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
+  __m256 result = _mm256_fmaddsub_ps(a_even, b.v, _mm256_fmaddsub_ps(a_odd, b_swap, c.v));
+  return Packet4cf(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pmsub(const Packet4cf& a, const Packet4cf& b, const Packet4cf& c) {
+  __m256 a_odd = _mm256_movehdup_ps(a.v);
+  __m256 a_even = _mm256_moveldup_ps(a.v);
+  __m256 b_swap = _mm256_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
+  __m256 result = _mm256_fmaddsub_ps(a_even, b.v, _mm256_fmsubadd_ps(a_odd, b_swap, c.v));
+  return Packet4cf(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pnmadd(const Packet4cf& a, const Packet4cf& b, const Packet4cf& c) {
+  return pnegate(pmsub(a, b, c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cf pnmsub(const Packet4cf& a, const Packet4cf& b, const Packet4cf& c) {
+  return pnegate(pmadd(a, b, c));
+}
+// std::complex<double>
+template <>
+EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& a, const Packet2cd& b, const Packet2cd& c) {
+  __m256d a_odd = _mm256_permute_pd(a.v, 0xF);
+  __m256d a_even = _mm256_movedup_pd(a.v);
+  __m256d b_swap = _mm256_permute_pd(b.v, 0x5);
+  __m256d result = _mm256_fmaddsub_pd(a_even, b.v, _mm256_fmaddsub_pd(a_odd, b_swap, c.v));
+  return Packet2cd(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pmsub(const Packet2cd& a, const Packet2cd& b, const Packet2cd& c) {
+  __m256d a_odd = _mm256_permute_pd(a.v, 0xF);
+  __m256d a_even = _mm256_movedup_pd(a.v);
+  __m256d b_swap = _mm256_permute_pd(b.v, 0x5);
+  __m256d result = _mm256_fmaddsub_pd(a_even, b.v, _mm256_fmsubadd_pd(a_odd, b_swap, c.v));
+  return Packet2cd(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pnmadd(const Packet2cd& a, const Packet2cd& b, const Packet2cd& c) {
+  return pnegate(pmsub(a, b, c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cd pnmsub(const Packet2cd& a, const Packet2cd& b, const Packet2cd& c) {
+  return pnegate(pmadd(a, b, c));
+}
+#endif
+
+/*---------------- load/store segment support ----------------*/
+
+/*---------------- std::complex<float> ----------------*/
+
+template <>
+struct has_packet_segment<Packet2cf> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet4cf> : std::true_type {};
+
+template <>
+inline Packet2cf ploaduSegment<Packet2cf>(const std::complex<float>* from, Index begin, Index count) {
+  return (Packet2cf)_mm_maskload_ps(&numext::real_ref(*from), segment_mask_2x64(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from, Index begin,
+                                                           Index count) {
+  _mm_maskstore_ps(&numext::real_ref(*to), segment_mask_2x64(begin, count), from.v);
+}
+
+template <>
+inline Packet4cf ploaduSegment<Packet4cf>(const std::complex<float>* from, Index begin, Index count) {
+  return (Packet4cf)_mm256_maskload_ps(&numext::real_ref(*from), segment_mask_4x64(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<std::complex<float>, Packet4cf>(std::complex<float>* to, const Packet4cf& from, Index begin,
+                                                           Index count) {
+  _mm256_maskstore_ps(&numext::real_ref(*to), segment_mask_4x64(begin, count), from.v);
+}
+
+/*---------------- std::complex<double> ----------------*/
+
+template <>
+struct has_packet_segment<Packet2cd> : std::true_type {};
+
+template <>
+inline Packet2cd ploaduSegment<Packet2cd>(const std::complex<double>* from, Index begin, Index count) {
+  return (Packet2cd)_mm256_maskload_pd(&numext::real_ref(*from), segment_mask_4x64(2 * begin, 2 * count));
+}
+
+template <>
+inline void pstoreuSegment<std::complex<double>, Packet2cd>(std::complex<double>* to, const Packet2cd& from,
+                                                            Index begin, Index count) {
+  _mm256_maskstore_pd(&numext::real_ref(*to), segment_mask_4x64(2 * begin, 2 * count), from.v);
+}
+
+/*---------------- end load/store segment support ----------------*/
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_COMPLEX_AVX_H
diff --git a/inst/include/Eigen/src/Core/arch/AVX/MathFunctions.h b/inst/include/Eigen/src/Core/arch/AVX/MathFunctions.h
new file mode 100644
index 00000000..5b7285f9
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AVX/MathFunctions.h
@@ -0,0 +1,130 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_AVX_H
+#define EIGEN_MATH_FUNCTIONS_AVX_H
+
+/* The sin and cos functions of this file are loosely derived from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet8f)
+
+EIGEN_DOUBLE_PACKET_FUNCTION(atanh, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(log, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(log2, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(exp, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(tanh, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(cbrt, Packet4d)
+#ifdef EIGEN_VECTORIZE_AVX2
+EIGEN_DOUBLE_PACKET_FUNCTION(sin, Packet4d)
+EIGEN_DOUBLE_PACKET_FUNCTION(cos, Packet4d)
+#endif
+EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet4d)
+EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet4d)
+
+// Notice that for newer processors, it is counterproductive to use Newton
+// iteration for square root. In particular, Skylake and Zen2 processors
+// have approximately doubled throughput of the _mm_sqrt_ps instruction
+// compared to their predecessors.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f psqrt<Packet8f>(const Packet8f& _x) {
+  return _mm256_sqrt_ps(_x);
+}
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4d psqrt<Packet4d>(const Packet4d& _x) {
+  return _mm256_sqrt_pd(_x);
+}
+
+// Even on Skylake, using Newton iteration is a win for reciprocal square root.
+#if EIGEN_FAST_MATH
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8f prsqrt<Packet8f>(const Packet8f& a) {
+  // _mm256_rsqrt_ps returns -inf for negative denormals.
+  // _mm512_rsqrt**_ps returns -NaN for negative denormals.  We may want
+  // consistency here.
+  // const Packet8f rsqrt = pselect(pcmp_lt(a, pzero(a)),
+  //                                pset1<Packet8f>(-NumTraits<float>::quiet_NaN()),
+  //                                _mm256_rsqrt_ps(a));
+  return generic_rsqrt_newton_step<Packet8f, /*Steps=*/1>::run(a, _mm256_rsqrt_ps(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f preciprocal<Packet8f>(const Packet8f& a) {
+  return generic_reciprocal_newton_step<Packet8f, /*Steps=*/1>::run(a, _mm256_rcp_ps(a));
+}
+
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pfrexp(const Packet8h& a, Packet8h& exponent) {
+  Packet8f fexponent;
+  const Packet8h out = float2half(pfrexp<Packet8f>(half2float(a), fexponent));
+  exponent = float2half(fexponent);
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pldexp(const Packet8h& a, const Packet8h& exponent) {
+  return float2half(pldexp<Packet8f>(half2float(a), half2float(exponent)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pfrexp(const Packet8bf& a, Packet8bf& exponent) {
+  Packet8f fexponent;
+  const Packet8bf out = F32ToBf16(pfrexp<Packet8f>(Bf16ToF32(a), fexponent));
+  exponent = F32ToBf16(fexponent);
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pldexp(const Packet8bf& a, const Packet8bf& exponent) {
+  return F32ToBf16(pldexp<Packet8f>(Bf16ToF32(a), Bf16ToF32(exponent)));
+}
+
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pcos)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexp)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexp2)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pexpm1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog1p)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, plog2)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, preciprocal)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, prsqrt)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psin)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, psqrt)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, ptanh)
+
+#ifndef EIGEN_VECTORIZE_AVX512FP16
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pcos)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pexp2)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pexpm1)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, plog)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, plog1p)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, plog2)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, preciprocal)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, prsqrt)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, psin)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, psqrt)
+F16_PACKET_FUNCTION(Packet8f, Packet8h, ptanh)
+#endif
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_AVX_H
diff --git a/inst/include/Eigen/src/Core/arch/AVX/PacketMath.h b/inst/include/Eigen/src/Core/arch/AVX/PacketMath.h
new file mode 100644
index 00000000..b1dfb07f
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AVX/PacketMath.h
@@ -0,0 +1,3081 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner (benoit.steiner.goog@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_AVX_H
+#define EIGEN_PACKET_MATH_AVX_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#if !defined(EIGEN_VECTORIZE_AVX512) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
+#endif
+
+#ifdef EIGEN_VECTORIZE_FMA
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+#endif
+
+typedef __m256 Packet8f;
+typedef eigen_packet_wrapper<__m256i, 0> Packet8i;
+typedef __m256d Packet4d;
+#ifndef EIGEN_VECTORIZE_AVX512FP16
+typedef eigen_packet_wrapper<__m128i, 2> Packet8h;
+#endif
+typedef eigen_packet_wrapper<__m128i, 3> Packet8bf;
+typedef eigen_packet_wrapper<__m256i, 4> Packet8ui;
+
+#ifdef EIGEN_VECTORIZE_AVX2
+// Start from 3 to be compatible with AVX512
+typedef eigen_packet_wrapper<__m256i, 3> Packet4l;
+typedef eigen_packet_wrapper<__m256i, 5> Packet4ul;
+#endif
+
+template <>
+struct is_arithmetic<__m256> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<__m256i> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<__m256d> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<Packet8i> {
+  enum { value = true };
+};
+// Note that `Packet8ui` uses the underlying type `__m256i`, which is
+// interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic
+// operations used in `GenericPacketMath.h`.
+template <>
+struct is_arithmetic<Packet8ui> {
+  enum { value = false };
+};
+#ifndef EIGEN_VECTORIZE_AVX512FP16
+template <>
+struct is_arithmetic<Packet8h> {
+  enum { value = true };
+};
+#endif
+template <>
+struct is_arithmetic<Packet8bf> {
+  enum { value = true };
+};
+#ifdef EIGEN_VECTORIZE_AVX2
+template <>
+struct is_arithmetic<Packet4l> {
+  enum { value = true };
+};
+// Note that `Packet4ul` uses the underlying type `__m256i`, which is
+// interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic
+// operations used in `GenericPacketMath.h`.
+template <>
+struct is_arithmetic<Packet4ul> {
+  enum { value = false };
+};
+#endif
+
+// Use the packet_traits defined in AVX512/PacketMath.h instead if we're going
+// to leverage AVX512 instructions.
+#ifndef EIGEN_VECTORIZE_AVX512
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet8f type;
+  typedef Packet4f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasReciprocal = EIGEN_FAST_MATH,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasACos = 1,
+    HasASin = 1,
+    HasATan = 1,
+    HasATanh = 1,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasExp = 1,
+    HasPow = 1,
+    HasNdtri = 1,
+    HasBessel = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasCbrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
+    HasBlend = 1
+  };
+};
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef Packet4d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+
+    HasCmp = 1,
+    HasDiv = 1,
+#ifdef EIGEN_VECTORIZE_AVX2
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+#endif
+    HasTanh = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasErf = 1,
+    HasErfc = 1,
+    HasExp = 1,
+    HasPow = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasCbrt = 1,
+    HasATan = 1,
+    HasATanh = 1,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+  typedef Packet8h type;
+  // There is no half-size packet for Packet8h.
+  typedef Packet8h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasAbs2 = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasBlend = 0,
+    HasBessel = 1,
+    HasNdtri = 1
+  };
+};
+
+template <>
+struct packet_traits<bfloat16> : default_packet_traits {
+  typedef Packet8bf type;
+  // There is no half-size packet for current Packet8bf.
+  // TODO: support as SSE path.
+  typedef Packet8bf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasAbs2 = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasBlend = 0,
+    HasBessel = 1,
+    HasNdtri = 1
+  };
+};
+
+template <>
+struct packet_traits<int> : default_packet_traits {
+  typedef Packet8i type;
+  typedef Packet4i half;
+  enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, HasDiv = 1, size = 8 };
+};
+template <>
+struct packet_traits<uint32_t> : default_packet_traits {
+  typedef Packet8ui type;
+  typedef Packet4ui half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+
+    HasDiv = 0,
+    HasNegate = 0,
+    HasSqrt = 0,
+
+    HasCmp = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasShift = 1
+  };
+};
+
+#ifdef EIGEN_VECTORIZE_AVX2
+template <>
+struct packet_traits<int64_t> : default_packet_traits {
+  typedef Packet4l type;
+  typedef Packet2l half;
+  enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, size = 4 };
+};
+template <>
+struct packet_traits<uint64_t> : default_packet_traits {
+  typedef Packet4ul type;
+  // There is no half-size packet for current Packet4ul.
+  // TODO: support as SSE path.
+  typedef Packet4ul half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+
+    // HasMin = 0,
+    // HasMax = 0,
+    HasDiv = 0,
+    HasBlend = 0,
+    HasTranspose = 0,
+    HasNegate = 0,
+    HasSqrt = 0,
+    HasCmp = 1,
+    HasShift = 1
+  };
+};
+#endif
+
+#endif
+
+template <>
+struct scalar_div_cost<float, true> {
+  enum { value = 14 };
+};
+template <>
+struct scalar_div_cost<double, true> {
+  enum { value = 16 };
+};
+
+template <>
+struct unpacket_traits<Packet8f> {
+  typedef float type;
+  typedef Packet4f half;
+  typedef Packet8i integer_packet;
+  typedef uint8_t mask_t;
+  enum {
+    size = 8,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = true,
+    masked_store_available = true
+#ifdef EIGEN_VECTORIZE_AVX512
+    ,
+    masked_fpops_available = true
+#endif
+  };
+};
+template <>
+struct unpacket_traits<Packet4d> {
+  typedef double type;
+  typedef Packet2d half;
+#ifdef EIGEN_VECTORIZE_AVX2
+  typedef Packet4l integer_packet;
+#endif
+  enum {
+    size = 4,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet8i> {
+  typedef int type;
+  typedef Packet4i half;
+  enum {
+    size = 8,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet8ui> {
+  typedef uint32_t type;
+  typedef Packet4ui half;
+  enum {
+    size = 8,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+#ifdef EIGEN_VECTORIZE_AVX2
+template <>
+struct unpacket_traits<Packet4l> {
+  typedef int64_t type;
+  typedef Packet2l half;
+  enum {
+    size = 4,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet4ul> {
+  typedef uint64_t type;
+  typedef Packet4ul half;
+  enum {
+    size = 4,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+#endif
+template <>
+struct unpacket_traits<Packet8bf> {
+  typedef bfloat16 type;
+  typedef Packet8bf half;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+// Helper function for bit packing snippet of low precision comparison.
+// It packs the flags from 16x16 to 8x16.
+EIGEN_STRONG_INLINE __m128i Pack16To8(Packet8f rf) {
+  return _mm_packs_epi32(_mm256_extractf128_si256(_mm256_castps_si256(rf), 0),
+                         _mm256_extractf128_si256(_mm256_castps_si256(rf), 1));
+}
+
+#ifdef EIGEN_VECTORIZE_AVX2
+template <>
+EIGEN_STRONG_INLINE Packet4l pset1<Packet4l>(const int64_t& from) {
+  return _mm256_set1_epi64x(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul pset1<Packet4ul>(const uint64_t& from) {
+  return _mm256_set1_epi64x(numext::bit_cast<uint64_t>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l pzero(const Packet4l& /*a*/) {
+  return _mm256_setzero_si256();
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul pzero(const Packet4ul& /*a*/) {
+  return _mm256_setzero_si256();
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l peven_mask(const Packet4l& /*a*/) {
+  return _mm256_set_epi64x(0ll, -1ll, 0ll, -1ll);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul peven_mask(const Packet4ul& /*a*/) {
+  return _mm256_set_epi64x(0ll, -1ll, 0ll, -1ll);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l pload1<Packet4l>(const int64_t* from) {
+  return _mm256_set1_epi64x(*from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul pload1<Packet4ul>(const uint64_t* from) {
+  return _mm256_set1_epi64x(*from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l padd<Packet4l>(const Packet4l& a, const Packet4l& b) {
+  return _mm256_add_epi64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul padd<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
+  return _mm256_add_epi64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l plset<Packet4l>(const int64_t& a) {
+  return padd(pset1<Packet4l>(a), Packet4l(_mm256_set_epi64x(3ll, 2ll, 1ll, 0ll)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul plset<Packet4ul>(const uint64_t& a) {
+  return padd(pset1<Packet4ul>(a), Packet4ul(_mm256_set_epi64x(3ll, 2ll, 1ll, 0ll)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l psub<Packet4l>(const Packet4l& a, const Packet4l& b) {
+  return _mm256_sub_epi64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul psub<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
+  return _mm256_sub_epi64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l pnegate(const Packet4l& a) {
+  return psub(pzero(a), a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l pconj(const Packet4l& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l pcmp_le(const Packet4l& a, const Packet4l& b) {
+  return _mm256_xor_si256(_mm256_cmpgt_epi64(a, b), _mm256_set1_epi32(-1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul pcmp_le(const Packet4ul& a, const Packet4ul& b) {
+  return (Packet4ul)pcmp_le((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
+                            (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l pcmp_lt(const Packet4l& a, const Packet4l& b) {
+  return _mm256_cmpgt_epi64(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul pcmp_lt(const Packet4ul& a, const Packet4ul& b) {
+  return (Packet4ul)pcmp_lt((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
+                            (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l pcmp_eq(const Packet4l& a, const Packet4l& b) {
+  return _mm256_cmpeq_epi64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul pcmp_eq(const Packet4ul& a, const Packet4ul& b) {
+  return _mm256_cmpeq_epi64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l ptrue<Packet4l>(const Packet4l& a) {
+  return _mm256_cmpeq_epi64(a, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul ptrue<Packet4ul>(const Packet4ul& a) {
+  return _mm256_cmpeq_epi64(a, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l pand<Packet4l>(const Packet4l& a, const Packet4l& b) {
+  return _mm256_and_si256(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l por<Packet4l>(const Packet4l& a, const Packet4l& b) {
+  return _mm256_or_si256(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l pxor<Packet4l>(const Packet4l& a, const Packet4l& b) {
+  return _mm256_xor_si256(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul pxor<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
+  return _mm256_xor_si256(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l pandnot<Packet4l>(const Packet4l& a, const Packet4l& b) {
+  return _mm256_andnot_si256(b, a);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4l plogical_shift_right(Packet4l a) {
+  return _mm256_srli_epi64(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4l plogical_shift_left(Packet4l a) {
+  return _mm256_slli_epi64(a, N);
+}
+#ifdef EIGEN_VECTORIZE_AVX512FP16
+template <int N>
+EIGEN_STRONG_INLINE Packet4l parithmetic_shift_right(Packet4l a) {
+  return _mm256_srai_epi64(a, N);
+}
+#else
+template <int N>
+EIGEN_STRONG_INLINE std::enable_if_t<(N == 0), Packet4l> parithmetic_shift_right(Packet4l a) {
+  return a;
+}
+template <int N>
+EIGEN_STRONG_INLINE std::enable_if_t<(N > 0) && (N < 32), Packet4l> parithmetic_shift_right(Packet4l a) {
+  __m256i hi_word = _mm256_srai_epi32(a, N);
+  __m256i lo_word = _mm256_srli_epi64(a, N);
+  return _mm256_blend_epi32(hi_word, lo_word, 0b01010101);
+}
+template <int N>
+EIGEN_STRONG_INLINE std::enable_if_t<(N >= 32) && (N < 63), Packet4l> parithmetic_shift_right(Packet4l a) {
+  __m256i hi_word = _mm256_srai_epi32(a, 31);
+  __m256i lo_word = _mm256_shuffle_epi32(_mm256_srai_epi32(a, N - 32), (shuffle_mask<1, 1, 3, 3>::mask));
+  return _mm256_blend_epi32(hi_word, lo_word, 0b01010101);
+}
+template <int N>
+EIGEN_STRONG_INLINE std::enable_if_t<(N == 63), Packet4l> parithmetic_shift_right(Packet4l a) {
+  return _mm256_cmpgt_epi64(_mm256_setzero_si256(), a);
+}
+template <int N>
+EIGEN_STRONG_INLINE std::enable_if_t<(N < 0) || (N > 63), Packet4l> parithmetic_shift_right(Packet4l a) {
+  return parithmetic_shift_right<int(N & 63)>(a);
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE Packet4l pload<Packet4l>(const int64_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul pload<Packet4ul>(const uint64_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l ploadu<Packet4l>(const int64_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul ploadu<Packet4ul>(const uint64_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+}
+// Loads 2 int64_ts from memory a returns the packet {a0, a0, a1, a1}
+template <>
+EIGEN_STRONG_INLINE Packet4l ploaddup<Packet4l>(const int64_t* from) {
+  const Packet4l a = _mm256_castsi128_si256(_mm_loadu_si128(reinterpret_cast<const __m128i*>(from)));
+  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 1, 0, 1, 2, 3, 2, 3));
+}
+// Loads 2 uint64_ts from memory a returns the packet {a0, a0, a1, a1}
+template <>
+EIGEN_STRONG_INLINE Packet4ul ploaddup<Packet4ul>(const uint64_t* from) {
+  const Packet4ul a = _mm256_castsi128_si256(_mm_loadu_si128(reinterpret_cast<const __m128i*>(from)));
+  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 1, 0, 1, 2, 3, 2, 3));
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet4l& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet4ul& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet4l& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet4ul& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet4l pgather<int64_t, Packet4l>(const int64_t* from, Index stride) {
+  return _mm256_set_epi64x(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet4ul pgather<uint64_t, Packet4ul>(const uint64_t* from, Index stride) {
+  return _mm256_set_epi64x(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<int64_t, Packet4l>(int64_t* to, const Packet4l& from, Index stride) {
+  __m128i low = _mm256_extractf128_si256(from, 0);
+  to[stride * 0] = _mm_extract_epi64_0(low);
+  to[stride * 1] = _mm_extract_epi64_1(low);
+
+  __m128i high = _mm256_extractf128_si256(from, 1);
+  to[stride * 2] = _mm_extract_epi64_0(high);
+  to[stride * 3] = _mm_extract_epi64_1(high);
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<uint64_t, Packet4ul>(uint64_t* to, const Packet4ul& from, Index stride) {
+  __m128i low = _mm256_extractf128_si256(from, 0);
+  to[stride * 0] = _mm_extract_epi64_0(low);
+  to[stride * 1] = _mm_extract_epi64_1(low);
+
+  __m128i high = _mm256_extractf128_si256(from, 1);
+  to[stride * 2] = _mm_extract_epi64_0(high);
+  to[stride * 3] = _mm_extract_epi64_1(high);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet4l>(int64_t* to, const int64_t& a) {
+  Packet4l pa = pset1<Packet4l>(a);
+  pstore(to, pa);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet4ul>(uint64_t* to, const uint64_t& a) {
+  Packet4ul pa = pset1<Packet4ul>(a);
+  pstore(to, pa);
+}
+template <>
+EIGEN_STRONG_INLINE int64_t pfirst<Packet4l>(const Packet4l& a) {
+  return _mm_extract_epi64_0(_mm256_castsi256_si128(a));
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t pfirst<Packet4ul>(const Packet4ul& a) {
+  return _mm_extract_epi64_0(_mm256_castsi256_si128(a));
+}
+
+#define MM256_SHUFFLE_EPI64(A, B, M) _mm256_shuffle_pd(_mm256_castsi256_pd(A), _mm256_castsi256_pd(B), M)
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4l, 4>& kernel) {
+  __m256d T0 = MM256_SHUFFLE_EPI64(kernel.packet[0], kernel.packet[1], 15);
+  __m256d T1 = MM256_SHUFFLE_EPI64(kernel.packet[0], kernel.packet[1], 0);
+  __m256d T2 = MM256_SHUFFLE_EPI64(kernel.packet[2], kernel.packet[3], 15);
+  __m256d T3 = MM256_SHUFFLE_EPI64(kernel.packet[2], kernel.packet[3], 0);
+
+  kernel.packet[1] = _mm256_castpd_si256(_mm256_permute2f128_pd(T0, T2, 32));
+  kernel.packet[3] = _mm256_castpd_si256(_mm256_permute2f128_pd(T0, T2, 49));
+  kernel.packet[0] = _mm256_castpd_si256(_mm256_permute2f128_pd(T1, T3, 32));
+  kernel.packet[2] = _mm256_castpd_si256(_mm256_permute2f128_pd(T1, T3, 49));
+}
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4ul, 4>& kernel) {
+  ptranspose((PacketBlock<Packet4l, 4>&)kernel);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l pmin<Packet4l>(const Packet4l& a, const Packet4l& b) {
+  __m256i cmp = _mm256_cmpgt_epi64(a, b);
+  __m256i a_min = _mm256_andnot_si256(cmp, a);
+  __m256i b_min = _mm256_and_si256(cmp, b);
+  return Packet4l(_mm256_or_si256(a_min, b_min));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul pmin<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
+  return padd((Packet4ul)pmin((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
+                              (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL))),
+              pset1<Packet4ul>(0x8000000000000000UL));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l pmax<Packet4l>(const Packet4l& a, const Packet4l& b) {
+  __m256i cmp = _mm256_cmpgt_epi64(a, b);
+  __m256i a_min = _mm256_and_si256(cmp, a);
+  __m256i b_min = _mm256_andnot_si256(cmp, b);
+  return Packet4l(_mm256_or_si256(a_min, b_min));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul pmax<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
+  return padd((Packet4ul)pmax((Packet4l)psub(a, pset1<Packet4ul>(0x8000000000000000UL)),
+                              (Packet4l)psub(b, pset1<Packet4ul>(0x8000000000000000UL))),
+              pset1<Packet4ul>(0x8000000000000000UL));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l pabs<Packet4l>(const Packet4l& a) {
+  Packet4l pz = pzero<Packet4l>(a);
+  Packet4l cmp = _mm256_cmpgt_epi64(a, pz);
+  return psub(cmp, pxor(a, cmp));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul pabs<Packet4ul>(const Packet4ul& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4l pmul<Packet4l>(const Packet4l& a, const Packet4l& b) {
+  // 64-bit mul requires avx512, so do this with 32-bit multiplication
+  __m256i upper32_a = _mm256_srli_epi64(a, 32);
+  __m256i upper32_b = _mm256_srli_epi64(b, 32);
+
+  // upper * lower
+  __m256i mul1 = _mm256_mul_epu32(upper32_a, b);
+  __m256i mul2 = _mm256_mul_epu32(upper32_b, a);
+  // Gives us both upper*upper and lower*lower
+  __m256i mul3 = _mm256_mul_epu32(a, b);
+
+  __m256i high = _mm256_slli_epi64(_mm256_add_epi64(mul1, mul2), 32);
+  return _mm256_add_epi64(high, mul3);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul pmul<Packet4ul>(const Packet4ul& a, const Packet4ul& b) {
+  return (Packet4ul)pmul<Packet4l>((Packet4l)a, (Packet4l)b);
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) {
+  return _mm256_set1_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pset1<Packet4d>(const double& from) {
+  return _mm256_set1_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pset1<Packet8i>(const int& from) {
+  return _mm256_set1_epi32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pset1<Packet8ui>(const uint32_t& from) {
+  return _mm256_set1_epi32(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pset1frombits<Packet8f>(unsigned int from) {
+  return _mm256_castsi256_ps(pset1<Packet8i>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pset1frombits<Packet4d>(uint64_t from) {
+  return _mm256_castsi256_pd(_mm256_set1_epi64x(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pzero(const Packet8f& /*a*/) {
+  return _mm256_setzero_ps();
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pzero(const Packet4d& /*a*/) {
+  return _mm256_setzero_pd();
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pzero(const Packet8i& /*a*/) {
+  return _mm256_setzero_si256();
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pzero(const Packet8ui& /*a*/) {
+  return _mm256_setzero_si256();
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f peven_mask(const Packet8f& /*a*/) {
+  return _mm256_castsi256_ps(_mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i peven_mask(const Packet8i& /*a*/) {
+  return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui peven_mask(const Packet8ui& /*a*/) {
+  return _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d peven_mask(const Packet4d& /*a*/) {
+  return _mm256_castsi256_pd(_mm256_set_epi32(0, 0, -1, -1, 0, 0, -1, -1));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pload1<Packet8f>(const float* from) {
+  return _mm256_broadcast_ss(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pload1<Packet4d>(const double* from) {
+  return _mm256_broadcast_sd(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return _mm256_add_ps(a, b);
+}
+#ifdef EIGEN_VECTORIZE_AVX512
+template <>
+EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b, uint8_t umask) {
+  __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
+  return _mm512_castps512_ps256(_mm512_maskz_add_ps(mask, _mm512_castps256_ps512(a), _mm512_castps256_ps512(b)));
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE Packet4d padd<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return _mm256_add_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i padd<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_add_epi32(a, b);
+#else
+  __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui padd<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_add_epi32(a, b);
+#else
+  __m128i lo = _mm_add_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f plset<Packet8f>(const float& a) {
+  return padd(pset1<Packet8f>(a), _mm256_set_ps(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d plset<Packet4d>(const double& a) {
+  return padd(pset1<Packet4d>(a), _mm256_set_pd(3.0, 2.0, 1.0, 0.0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i plset<Packet8i>(const int& a) {
+  return padd(pset1<Packet8i>(a), (Packet8i)_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui plset<Packet8ui>(const uint32_t& a) {
+  return padd(pset1<Packet8ui>(a), (Packet8ui)_mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return _mm256_sub_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d psub<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return _mm256_sub_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i psub<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_sub_epi32(a, b);
+#else
+  __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui psub<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_sub_epi32(a, b);
+#else
+  __m128i lo = _mm_sub_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_sub_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) {
+  const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000));
+  return _mm256_xor_ps(a, mask);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pnegate(const Packet4d& a) {
+  const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000ULL));
+  return _mm256_xor_pd(a, mask);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pnegate(const Packet8i& a) {
+  return psub(pzero(a), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pconj(const Packet8f& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pconj(const Packet4d& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pconj(const Packet8i& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return _mm256_mul_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pmul<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return _mm256_mul_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pmul<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_mullo_epi32(a, b);
+#else
+  const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pmul<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_mullo_epi32(a, b);
+#else
+  const __m128i lo = _mm_mullo_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  const __m128i hi = _mm_mullo_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pdiv<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return _mm256_div_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pdiv<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return _mm256_div_pd(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8i pdiv<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX512
+  return _mm512_cvttpd_epi32(_mm512_div_pd(_mm512_cvtepi32_pd(a), _mm512_cvtepi32_pd(b)));
+#else
+  Packet4i lo = pdiv<Packet4i>(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  Packet4i hi = pdiv<Packet4i>(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+#endif
+}
+
+#ifdef EIGEN_VECTORIZE_FMA
+template <>
+EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
+  return _mm256_fmadd_ps(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
+  return _mm256_fmadd_pd(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pmsub(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
+  return _mm256_fmsub_ps(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d pmsub(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
+  return _mm256_fmsub_pd(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pnmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
+  return _mm256_fnmadd_ps(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d pnmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
+  return _mm256_fnmadd_pd(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pnmsub(const Packet8f& a, const Packet8f& b, const Packet8f& c) {
+  return _mm256_fnmsub_ps(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d pnmsub(const Packet4d& a, const Packet4d& b, const Packet4d& c) {
+  return _mm256_fnmsub_pd(a, b, c);
+}
+
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) {
+  return _mm256_cmp_ps(a, b, _CMP_LE_OQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) {
+  return _mm256_cmp_ps(a, b, _CMP_LT_OQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) {
+  return _mm256_cmp_ps(a, b, _CMP_NGE_UQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) {
+  return _mm256_cmp_ps(a, b, _CMP_EQ_OQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pisnan(const Packet8f& a) {
+  return _mm256_cmp_ps(a, a, _CMP_UNORD_Q);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d pcmp_le(const Packet4d& a, const Packet4d& b) {
+  return _mm256_cmp_pd(a, b, _CMP_LE_OQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pcmp_lt(const Packet4d& a, const Packet4d& b) {
+  return _mm256_cmp_pd(a, b, _CMP_LT_OQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pcmp_lt_or_nan(const Packet4d& a, const Packet4d& b) {
+  return _mm256_cmp_pd(a, b, _CMP_NGE_UQ);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pcmp_eq(const Packet4d& a, const Packet4d& b) {
+  return _mm256_cmp_pd(a, b, _CMP_EQ_OQ);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8i pcmp_le(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_xor_si256(_mm256_cmpgt_epi32(a, b), _mm256_set1_epi32(-1));
+#else
+  __m128i lo = _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  lo = _mm_xor_si128(lo, _mm_set1_epi32(-1));
+  __m128i hi = _mm_cmpgt_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  hi = _mm_xor_si128(hi, _mm_set1_epi32(-1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pcmp_lt(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_cmpgt_epi32(b, a);
+#else
+  __m128i lo = _mm_cmpgt_epi32(_mm256_extractf128_si256(b, 0), _mm256_extractf128_si256(a, 0));
+  __m128i hi = _mm_cmpgt_epi32(_mm256_extractf128_si256(b, 1), _mm256_extractf128_si256(a, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pcmp_eq(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_cmpeq_epi32(a, b);
+#else
+  __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pcmp_eq(const Packet8ui& a, const Packet8ui& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_cmpeq_epi32(a, b);
+#else
+  __m128i lo = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_cmpeq_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pmin<Packet8f>(const Packet8f& a, const Packet8f& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
+  // There appears to be a bug in GCC, by which the optimizer may flip
+  // the argument order in calls to _mm_min_ps/_mm_max_ps, so we have to
+  // resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+  // see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+  Packet8f res;
+  asm("vminps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
+  return res;
+#else
+  // Arguments are swapped to match NaN propagation behavior of std::min.
+  return _mm256_min_ps(b, a);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pmin<Packet4d>(const Packet4d& a, const Packet4d& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
+  // See pmin above
+  Packet4d res;
+  asm("vminpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
+  return res;
+#else
+  // Arguments are swapped to match NaN propagation behavior of std::min.
+  return _mm256_min_pd(b, a);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pmin<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_min_epi32(a, b);
+#else
+  __m128i lo = _mm_min_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_min_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pmin<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_min_epu32(a, b);
+#else
+  __m128i lo = _mm_min_epu32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_min_epu32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pmax<Packet8f>(const Packet8f& a, const Packet8f& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
+  // See pmin above
+  Packet8f res;
+  asm("vmaxps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
+  return res;
+#else
+  // Arguments are swapped to match NaN propagation behavior of std::max.
+  return _mm256_max_ps(b, a);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pmax<Packet4d>(const Packet4d& a, const Packet4d& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
+  // See pmin above
+  Packet4d res;
+  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
+  return res;
+#else
+  // Arguments are swapped to match NaN propagation behavior of std::max.
+  return _mm256_max_pd(b, a);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pmax<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_max_epi32(a, b);
+#else
+  __m128i lo = _mm_max_epi32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_max_epi32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pmax<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_max_epu32(a, b);
+#else
+  __m128i lo = _mm_max_epu32(_mm256_extractf128_si256(a, 0), _mm256_extractf128_si256(b, 0));
+  __m128i hi = _mm_max_epu32(_mm256_extractf128_si256(a, 1), _mm256_extractf128_si256(b, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+#ifdef EIGEN_VECTORIZE_AVX2
+template <>
+EIGEN_STRONG_INLINE Packet8i psign(const Packet8i& a) {
+  return _mm256_sign_epi32(_mm256_set1_epi32(1), a);
+}
+#endif
+
+// Add specializations for min/max with prescribed NaN propagation.
+template <>
+EIGEN_STRONG_INLINE Packet8f pmin<PropagateNumbers, Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet8f>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pmin<PropagateNumbers, Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet4d>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pmax<PropagateNumbers, Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet8f>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pmax<PropagateNumbers, Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet4d>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pmin<PropagateNaN, Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet8f>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pmin<PropagateNaN, Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet4d>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pmax<PropagateNaN, Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet8f>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pmax<PropagateNaN, Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet4d>);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f print<Packet8f>(const Packet8f& a) {
+  return _mm256_round_ps(a, _MM_FROUND_CUR_DIRECTION);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d print<Packet4d>(const Packet4d& a) {
+  return _mm256_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pceil<Packet8f>(const Packet8f& a) {
+  return _mm256_ceil_ps(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pceil<Packet4d>(const Packet4d& a) {
+  return _mm256_ceil_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pfloor<Packet8f>(const Packet8f& a) {
+  return _mm256_floor_ps(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pfloor<Packet4d>(const Packet4d& a) {
+  return _mm256_floor_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f ptrunc<Packet8f>(const Packet8f& a) {
+  return _mm256_round_ps(a, _MM_FROUND_TRUNC);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d ptrunc<Packet4d>(const Packet4d& a) {
+  return _mm256_round_pd(a, _MM_FROUND_TRUNC);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8i ptrue<Packet8i>(const Packet8i& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  // vpcmpeqd has lower latency than the more general vcmpps
+  return _mm256_cmpeq_epi32(a, a);
+#else
+  const __m256 b = _mm256_castsi256_ps(a);
+  return _mm256_castps_si256(_mm256_cmp_ps(b, b, _CMP_TRUE_UQ));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f ptrue<Packet8f>(const Packet8f& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  // vpcmpeqd has lower latency than the more general vcmpps
+  const __m256i b = _mm256_castps_si256(a);
+  return _mm256_castsi256_ps(_mm256_cmpeq_epi32(b, b));
+#else
+  return _mm256_cmp_ps(a, a, _CMP_TRUE_UQ);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d ptrue<Packet4d>(const Packet4d& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  // vpcmpeqq has lower latency than the more general vcmppd
+  const __m256i b = _mm256_castpd_si256(a);
+  return _mm256_castsi256_pd(_mm256_cmpeq_epi64(b, b));
+#else
+  return _mm256_cmp_pd(a, a, _CMP_TRUE_UQ);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pand<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return _mm256_and_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pand<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return _mm256_and_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pand<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_and_si256(a, b);
+#else
+  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pand<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_and_si256(a, b);
+#else
+  return _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f por<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return _mm256_or_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d por<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return _mm256_or_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i por<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_or_si256(a, b);
+#else
+  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui por<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_or_si256(a, b);
+#else
+  return _mm256_castps_si256(_mm256_or_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pxor<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return _mm256_xor_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pxor<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return _mm256_xor_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pxor<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_xor_si256(a, b);
+#else
+  return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pxor<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_xor_si256(a, b);
+#else
+  return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pandnot<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return _mm256_andnot_ps(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pandnot<Packet4d>(const Packet4d& a, const Packet4d& b) {
+  return _mm256_andnot_pd(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pandnot<Packet8i>(const Packet8i& a, const Packet8i& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_andnot_si256(b, a);
+#else
+  return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a)));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pandnot<Packet8ui>(const Packet8ui& a, const Packet8ui& b) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_andnot_si256(b, a);
+#else
+  return _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a)));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8ui pcmp_lt(const Packet8ui& a, const Packet8ui& b) {
+  return pxor(pcmp_eq(a, pmax(a, b)), ptrue(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pcmp_le(const Packet8ui& a, const Packet8ui& b) {
+  return pcmp_eq(a, pmin(a, b));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pround<Packet8f>(const Packet8f& a) {
+  const Packet8f mask = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x80000000u));
+  const Packet8f prev0dot5 = pset1frombits<Packet8f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
+  return _mm256_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pround<Packet4d>(const Packet4d& a) {
+  const Packet4d mask = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
+  const Packet4d prev0dot5 = pset1frombits<Packet4d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
+  return _mm256_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pselect<Packet8f>(const Packet8f& mask, const Packet8f& a, const Packet8f& b) {
+  return _mm256_blendv_ps(b, a, mask);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pselect<Packet8i>(const Packet8i& mask, const Packet8i& a, const Packet8i& b) {
+  return _mm256_castps_si256(
+      _mm256_blendv_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pselect<Packet8ui>(const Packet8ui& mask, const Packet8ui& a, const Packet8ui& b) {
+  return _mm256_castps_si256(
+      _mm256_blendv_ps(_mm256_castsi256_ps(b), _mm256_castsi256_ps(a), _mm256_castsi256_ps(mask)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d pselect<Packet4d>(const Packet4d& mask, const Packet4d& a, const Packet4d& b) {
+  return _mm256_blendv_pd(b, a, mask);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8i parithmetic_shift_right(Packet8i a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_srai_epi32(a, N);
+#else
+  __m128i lo = _mm_srai_epi32(_mm256_extractf128_si256(a, 0), N);
+  __m128i hi = _mm_srai_epi32(_mm256_extractf128_si256(a, 1), N);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8i plogical_shift_right(Packet8i a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_srli_epi32(a, N);
+#else
+  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(a, 0), N);
+  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(a, 1), N);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8i plogical_shift_left(Packet8i a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_slli_epi32(a, N);
+#else
+  __m128i lo = _mm_slli_epi32(_mm256_extractf128_si256(a, 0), N);
+  __m128i hi = _mm_slli_epi32(_mm256_extractf128_si256(a, 1), N);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8ui parithmetic_shift_right(Packet8ui a) {
+  return (Packet8ui)plogical_shift_right<N>((Packet8i)a);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8ui plogical_shift_right(Packet8ui a) {
+  return (Packet8ui)plogical_shift_right<N>((Packet8i)a);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8ui plogical_shift_left(Packet8ui a) {
+  return (Packet8ui)plogical_shift_left<N>((Packet8i)a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pload<Packet4d>(const double* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pload<Packet8i>(const int* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pload<Packet8ui>(const uint32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d ploadu<Packet4d>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i ploadu<Packet8i>(const int* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui ploadu<Packet8ui>(const uint32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from, uint8_t umask) {
+#ifdef EIGEN_VECTORIZE_AVX512
+  __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_castps512_ps256(_mm512_maskz_loadu_ps(mask, from));
+#else
+  Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
+  const Packet8i bit_mask =
+      _mm256_set_epi32(0xffffff7f, 0xffffffbf, 0xffffffdf, 0xffffffef, 0xfffffff7, 0xfffffffb, 0xfffffffd, 0xfffffffe);
+  mask = por<Packet8i>(mask, bit_mask);
+  mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_maskload_ps(from, mask);
+#endif
+}
+
+// Loads 4 floats from memory a returns the packet {a0, a0  a1, a1, a2, a2, a3, a3}
+template <>
+EIGEN_STRONG_INLINE Packet8f ploaddup<Packet8f>(const float* from) {
+  // TODO try to find a way to avoid the need of a temporary register
+  //   Packet8f tmp  = _mm256_castps128_ps256(_mm_loadu_ps(from));
+  //   tmp = _mm256_insertf128_ps(tmp, _mm_movehl_ps(_mm256_castps256_ps128(tmp),_mm256_castps256_ps128(tmp)), 1);
+  //   return _mm256_unpacklo_ps(tmp,tmp);
+
+  // _mm256_insertf128_ps is very slow on Haswell, thus:
+  Packet8f tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
+  // mimic an "inplace" permutation of the lower 128bits using a blend
+  tmp = _mm256_blend_ps(
+      tmp, _mm256_castps128_ps256(_mm_permute_ps(_mm256_castps256_ps128(tmp), _MM_SHUFFLE(1, 0, 1, 0))), 15);
+  // then we can perform a consistent permutation on the global register to get everything in shape:
+  return _mm256_permute_ps(tmp, _MM_SHUFFLE(3, 3, 2, 2));
+}
+// Loads 2 doubles from memory a returns the packet {a0, a0, a1, a1}
+template <>
+EIGEN_STRONG_INLINE Packet4d ploaddup<Packet4d>(const double* from) {
+  Packet4d tmp = _mm256_broadcast_pd((const __m128d*)(const void*)from);
+  return _mm256_permute_pd(tmp, 3 << 2);
+}
+// Loads 4 integers from memory a returns the packet {a0, a0, a1, a1, a2, a2, a3, a3}
+template <>
+EIGEN_STRONG_INLINE Packet8i ploaddup<Packet8i>(const int* from) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  const Packet8i a = _mm256_castsi128_si256(ploadu<Packet4i>(from));
+  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
+#else
+  __m256 tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
+  // mimic an "inplace" permutation of the lower 128bits using a blend
+  tmp = _mm256_blend_ps(
+      tmp, _mm256_castps128_ps256(_mm_permute_ps(_mm256_castps256_ps128(tmp), _MM_SHUFFLE(1, 0, 1, 0))), 15);
+  // then we can perform a consistent permutation on the global register to get everything in shape:
+  return _mm256_castps_si256(_mm256_permute_ps(tmp, _MM_SHUFFLE(3, 3, 2, 2)));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui ploaddup<Packet8ui>(const uint32_t* from) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  const Packet8ui a = _mm256_castsi128_si256(ploadu<Packet4ui>(from));
+  return _mm256_permutevar8x32_epi32(a, _mm256_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3));
+#else
+  __m256 tmp = _mm256_broadcast_ps((const __m128*)(const void*)from);
+  // mimic an "inplace" permutation of the lower 128bits using a blend
+  tmp = _mm256_blend_ps(
+      tmp, _mm256_castps128_ps256(_mm_permute_ps(_mm256_castps256_ps128(tmp), _MM_SHUFFLE(1, 0, 1, 0))), 15);
+  // then we can perform a consistent permutation on the global register to get
+  // everything in shape:
+  return _mm256_castps_si256(_mm256_permute_ps(tmp, _MM_SHUFFLE(3, 3, 2, 2)));
+#endif
+}
+
+// Loads 2 floats from memory a returns the packet {a0, a0  a0, a0, a1, a1, a1, a1}
+template <>
+EIGEN_STRONG_INLINE Packet8f ploadquad<Packet8f>(const float* from) {
+  Packet8f tmp = _mm256_castps128_ps256(_mm_broadcast_ss(from));
+  return _mm256_insertf128_ps(tmp, _mm_broadcast_ss(from + 1), 1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i ploadquad<Packet8i>(const int* from) {
+  return _mm256_insertf128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from + 1)), 1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui ploadquad<Packet8ui>(const uint32_t* from) {
+  return _mm256_insertf128_si256(_mm256_set1_epi32(*from), _mm_set1_epi32(*(from + 1)), 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ps(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet4d& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_pd(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet8i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet8ui& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ps(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet4d& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_pd(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet8i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet8ui& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from, uint8_t umask) {
+#ifdef EIGEN_VECTORIZE_AVX512
+  __mmask16 mask = static_cast<__mmask16>(umask & 0x00FF);
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_mask_storeu_ps(to, mask, _mm512_castps256_ps512(from));
+#else
+  Packet8i mask = _mm256_set1_epi8(static_cast<char>(umask));
+  const Packet8i bit_mask =
+      _mm256_set_epi32(0x7f7f7f7f, 0xbfbfbfbf, 0xdfdfdfdf, 0xefefefef, 0xf7f7f7f7, 0xfbfbfbfb, 0xfdfdfdfd, 0xfefefefe);
+  mask = por<Packet8i>(mask, bit_mask);
+  mask = pcmp_eq<Packet8i>(mask, _mm256_set1_epi32(0xffffffff));
+#if EIGEN_COMP_MSVC
+  // MSVC sometimes seems to use a bogus mask with maskstore.
+  const __m256i ifrom = _mm256_castps_si256(from);
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 0), _mm256_extractf128_si256(mask, 0),
+                                                  reinterpret_cast<char*>(to));
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_maskmoveu_si128(_mm256_extractf128_si256(ifrom, 1), _mm256_extractf128_si256(mask, 1),
+                                                  reinterpret_cast<char*>(to + 4));
+#else
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_maskstore_ps(to, mask, from);
+#endif
+#endif
+}
+
+// NOTE: leverage _mm256_i32gather_ps and _mm256_i32gather_pd if AVX2 instructions are available
+// NOTE: for the record the following seems to be slower: return _mm256_i32gather_ps(from, _mm256_set1_epi32(stride),
+// 4);
+template <>
+EIGEN_DEVICE_FUNC inline Packet8f pgather<float, Packet8f>(const float* from, Index stride) {
+  return _mm256_set_ps(from[7 * stride], from[6 * stride], from[5 * stride], from[4 * stride], from[3 * stride],
+                       from[2 * stride], from[1 * stride], from[0 * stride]);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet4d pgather<double, Packet4d>(const double* from, Index stride) {
+  return _mm256_set_pd(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet8i pgather<int, Packet8i>(const int* from, Index stride) {
+  return _mm256_set_epi32(from[7 * stride], from[6 * stride], from[5 * stride], from[4 * stride], from[3 * stride],
+                          from[2 * stride], from[1 * stride], from[0 * stride]);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet8ui pgather<uint32_t, Packet8ui>(const uint32_t* from, Index stride) {
+  return (Packet8ui)pgather<int, Packet8i>((int*)from, stride);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) {
+  __m128 low = _mm256_extractf128_ps(from, 0);
+  to[stride * 0] = _mm_cvtss_f32(low);
+  to[stride * 1] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 1));
+  to[stride * 2] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 2));
+  to[stride * 3] = _mm_cvtss_f32(_mm_shuffle_ps(low, low, 3));
+
+  __m128 high = _mm256_extractf128_ps(from, 1);
+  to[stride * 4] = _mm_cvtss_f32(high);
+  to[stride * 5] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 1));
+  to[stride * 6] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 2));
+  to[stride * 7] = _mm_cvtss_f32(_mm_shuffle_ps(high, high, 3));
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet4d>(double* to, const Packet4d& from, Index stride) {
+  __m128d low = _mm256_extractf128_pd(from, 0);
+  to[stride * 0] = _mm_cvtsd_f64(low);
+  to[stride * 1] = _mm_cvtsd_f64(_mm_shuffle_pd(low, low, 1));
+  __m128d high = _mm256_extractf128_pd(from, 1);
+  to[stride * 2] = _mm_cvtsd_f64(high);
+  to[stride * 3] = _mm_cvtsd_f64(_mm_shuffle_pd(high, high, 1));
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<int, Packet8i>(int* to, const Packet8i& from, Index stride) {
+  __m128i low = _mm256_extractf128_si256(from, 0);
+  to[stride * 0] = _mm_extract_epi32(low, 0);
+  to[stride * 1] = _mm_extract_epi32(low, 1);
+  to[stride * 2] = _mm_extract_epi32(low, 2);
+  to[stride * 3] = _mm_extract_epi32(low, 3);
+
+  __m128i high = _mm256_extractf128_si256(from, 1);
+  to[stride * 4] = _mm_extract_epi32(high, 0);
+  to[stride * 5] = _mm_extract_epi32(high, 1);
+  to[stride * 6] = _mm_extract_epi32(high, 2);
+  to[stride * 7] = _mm_extract_epi32(high, 3);
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<uint32_t, Packet8ui>(uint32_t* to, const Packet8ui& from, Index stride) {
+  pscatter<int, Packet8i>((int*)to, (Packet8i)from, stride);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet8f>(float* to, const float& a) {
+  Packet8f pa = pset1<Packet8f>(a);
+  pstore(to, pa);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet4d>(double* to, const double& a) {
+  Packet4d pa = pset1<Packet4d>(a);
+  pstore(to, pa);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet8i>(int* to, const int& a) {
+  Packet8i pa = pset1<Packet8i>(a);
+  pstore(to, pa);
+}
+
+#ifndef EIGEN_VECTORIZE_AVX512
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet8f>(const Packet8f& a) {
+  return _mm_cvtss_f32(_mm256_castps256_ps128(a));
+}
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet4d>(const Packet4d& a) {
+  return _mm_cvtsd_f64(_mm256_castpd256_pd128(a));
+}
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet8i>(const Packet8i& a) {
+  return _mm_cvtsi128_si32(_mm256_castsi256_si128(a));
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet8ui>(const Packet8ui& a) {
+  return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm256_castsi256_si128(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) {
+  __m256 tmp = _mm256_shuffle_ps(a, a, 0x1b);
+  return _mm256_permute2f128_ps(tmp, tmp, 1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a) {
+  __m256d tmp = _mm256_shuffle_pd(a, a, 5);
+  return _mm256_permute2f128_pd(tmp, tmp, 1);
+#if 0
+  // This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd
+  // exhibit the same latency/throughput, but it is here for future reference/benchmarking...
+  __m256d swap_halves = _mm256_permute2f128_pd(a,a,1);
+    return _mm256_permute_pd(swap_halves,5);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i preverse(const Packet8i& a) {
+  return _mm256_castps_si256(preverse(_mm256_castsi256_ps(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui preverse(const Packet8ui& a) {
+  return _mm256_castps_si256(preverse(_mm256_castsi256_ps(a)));
+}
+
+#ifdef EIGEN_VECTORIZE_AVX2
+template <>
+EIGEN_STRONG_INLINE Packet4l preverse(const Packet4l& a) {
+  return _mm256_castpd_si256(preverse(_mm256_castsi256_pd(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul preverse(const Packet4ul& a) {
+  return _mm256_castpd_si256(preverse(_mm256_castsi256_pd(a)));
+}
+#endif
+
+// pabs should be ok
+template <>
+EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
+  const Packet8f mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
+  return _mm256_and_ps(a, mask);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d pabs(const Packet4d& a) {
+  const Packet4d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFF));
+  return _mm256_and_pd(a, mask);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i pabs(const Packet8i& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_abs_epi32(a);
+#else
+  __m128i lo = _mm_abs_epi32(_mm256_extractf128_si256(a, 0));
+  __m128i hi = _mm_abs_epi32(_mm256_extractf128_si256(a, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui pabs(const Packet8ui& a) {
+  return a;
+}
+
+#ifndef EIGEN_VECTORIZE_AVX512FP16
+template <>
+EIGEN_STRONG_INLINE Packet8h psignbit(const Packet8h& a) {
+  return _mm_cmpgt_epi16(_mm_setzero_si128(), a);
+}
+#endif  // EIGEN_VECTORIZE_AVX512FP16
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
+  return _mm_cmpgt_epi16(_mm_setzero_si128(), a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f psignbit(const Packet8f& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_castsi256_ps(_mm256_cmpgt_epi32(_mm256_setzero_si256(), _mm256_castps_si256(a)));
+#else
+  return _mm256_castsi256_ps(parithmetic_shift_right<31>(Packet8i(_mm256_castps_si256(a))));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8ui psignbit(const Packet8ui& /*unused*/) {
+  return _mm256_setzero_si256();
+}
+#ifdef EIGEN_VECTORIZE_AVX2
+template <>
+EIGEN_STRONG_INLINE Packet4d psignbit(const Packet4d& a) {
+  return _mm256_castsi256_pd(_mm256_cmpgt_epi64(_mm256_setzero_si256(), _mm256_castpd_si256(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ul psignbit(const Packet4ul& /*unused*/) {
+  return _mm256_setzero_si256();
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pfrexp<Packet8f>(const Packet8f& a, Packet8f& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+// Extract exponent without existence of Packet4l.
+template <>
+EIGEN_STRONG_INLINE Packet4d pfrexp_generic_get_biased_exponent(const Packet4d& a) {
+  const Packet4d cst_exp_mask = pset1frombits<Packet4d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+  __m256i a_expo = _mm256_castpd_si256(pand(a, cst_exp_mask));
+#ifdef EIGEN_VECTORIZE_AVX2
+  a_expo = _mm256_srli_epi64(a_expo, 52);
+  __m128i lo = _mm256_extractf128_si256(a_expo, 0);
+  __m128i hi = _mm256_extractf128_si256(a_expo, 1);
+#else
+  __m128i lo = _mm256_extractf128_si256(a_expo, 0);
+  __m128i hi = _mm256_extractf128_si256(a_expo, 1);
+  lo = _mm_srli_epi64(lo, 52);
+  hi = _mm_srli_epi64(hi, 52);
+#endif
+  Packet2d exponent_lo = _mm_cvtepi32_pd(vec4i_swizzle1(lo, 0, 2, 1, 3));
+  Packet2d exponent_hi = _mm_cvtepi32_pd(vec4i_swizzle1(hi, 0, 2, 1, 3));
+  Packet4d exponent = _mm256_insertf128_pd(_mm256_setzero_pd(), exponent_lo, 0);
+  exponent = _mm256_insertf128_pd(exponent, exponent_hi, 1);
+  return exponent;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d pfrexp<Packet4d>(const Packet4d& a, Packet4d& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pldexp<Packet8f>(const Packet8f& a, const Packet8f& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d pldexp<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
+  // Clamp exponent to [-2099, 2099]
+  const Packet4d max_exponent = pset1<Packet4d>(2099.0);
+  const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+
+  // Split 2^e into four factors and multiply.
+  const Packet4i bias = pset1<Packet4i>(1023);
+  Packet4i b = parithmetic_shift_right<2>(e);  // floor(e/4)
+
+  // 2^b
+  Packet4i hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3);
+  Packet4i lo = _mm_slli_epi64(hi, 52);
+  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
+  Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
+  Packet4d out = pmul(pmul(pmul(a, c), c), c);  // a * 2^(3b)
+
+  // 2^(e - 3b)
+  b = psub(psub(psub(e, b), b), b);  // e - 3b
+  hi = vec4i_swizzle1(padd(b, bias), 0, 2, 1, 3);
+  lo = _mm_slli_epi64(hi, 52);
+  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
+  c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
+  out = pmul(out, c);  // a * 2^e
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d pldexp_fast<Packet4d>(const Packet4d& a, const Packet4d& exponent) {
+  // Clamp exponent to [-1024, 1024]
+  const Packet4d min_exponent = pset1<Packet4d>(-1023.0);
+  const Packet4d max_exponent = pset1<Packet4d>(1024.0);
+  const Packet4i e = _mm256_cvtpd_epi32(pmin(pmax(exponent, min_exponent), max_exponent));
+  const Packet4i bias = pset1<Packet4i>(1023);
+
+  // 2^e
+  Packet4i hi = vec4i_swizzle1(padd(e, bias), 0, 2, 1, 3);
+  const Packet4i lo = _mm_slli_epi64(hi, 52);
+  hi = _mm_slli_epi64(_mm_srli_epi64(hi, 32), 52);
+  const Packet4d c = _mm256_castsi256_pd(_mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1));
+  return pmul(a, c);  // a * 2^e
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f predux_half_dowto4<Packet8f>(const Packet8f& a) {
+  return _mm_add_ps(_mm256_castps256_ps128(a), _mm256_extractf128_ps(a, 1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i predux_half_dowto4<Packet8i>(const Packet8i& a) {
+  return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui predux_half_dowto4<Packet8ui>(const Packet8ui& a) {
+  return _mm_add_epi32(_mm256_castsi256_si128(a), _mm256_extractf128_si256(a, 1));
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
+  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+  __m256 T4 = _mm256_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
+  __m256 T5 = _mm256_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
+  __m256 T6 = _mm256_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
+  __m256 T7 = _mm256_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
+  __m256 S0 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256 S1 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+  __m256 S2 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256 S3 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
+  __m256 S4 = _mm256_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256 S5 = _mm256_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
+  __m256 S6 = _mm256_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256 S7 = _mm256_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
+  kernel.packet[0] = _mm256_permute2f128_ps(S0, S4, 0x20);
+  kernel.packet[1] = _mm256_permute2f128_ps(S1, S5, 0x20);
+  kernel.packet[2] = _mm256_permute2f128_ps(S2, S6, 0x20);
+  kernel.packet[3] = _mm256_permute2f128_ps(S3, S7, 0x20);
+  kernel.packet[4] = _mm256_permute2f128_ps(S0, S4, 0x31);
+  kernel.packet[5] = _mm256_permute2f128_ps(S1, S5, 0x31);
+  kernel.packet[6] = _mm256_permute2f128_ps(S2, S6, 0x31);
+  kernel.packet[7] = _mm256_permute2f128_ps(S3, S7, 0x31);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8f, 4>& kernel) {
+  __m256 T0 = _mm256_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m256 T1 = _mm256_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m256 T2 = _mm256_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m256 T3 = _mm256_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+
+  __m256 S0 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256 S1 = _mm256_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+  __m256 S2 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256 S3 = _mm256_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
+
+  kernel.packet[0] = _mm256_permute2f128_ps(S0, S1, 0x20);
+  kernel.packet[1] = _mm256_permute2f128_ps(S2, S3, 0x20);
+  kernel.packet[2] = _mm256_permute2f128_ps(S0, S1, 0x31);
+  kernel.packet[3] = _mm256_permute2f128_ps(S2, S3, 0x31);
+}
+
+#define MM256_SHUFFLE_EPI32(A, B, M) \
+  _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B), M))
+
+#ifndef EIGEN_VECTORIZE_AVX2
+#define MM256_UNPACKLO_EPI32(A, B) \
+  _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B)))
+#define MM256_UNPACKHI_EPI32(A, B) \
+  _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(A), _mm256_castsi256_ps(B)))
+#else
+#define MM256_UNPACKLO_EPI32(A, B) _mm256_unpacklo_epi32(A, B)
+#define MM256_UNPACKHI_EPI32(A, B) _mm256_unpackhi_epi32(A, B)
+#endif
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8i, 8>& kernel) {
+  __m256i T0 = MM256_UNPACKLO_EPI32(kernel.packet[0], kernel.packet[1]);
+  __m256i T1 = MM256_UNPACKHI_EPI32(kernel.packet[0], kernel.packet[1]);
+  __m256i T2 = MM256_UNPACKLO_EPI32(kernel.packet[2], kernel.packet[3]);
+  __m256i T3 = MM256_UNPACKHI_EPI32(kernel.packet[2], kernel.packet[3]);
+  __m256i T4 = MM256_UNPACKLO_EPI32(kernel.packet[4], kernel.packet[5]);
+  __m256i T5 = MM256_UNPACKHI_EPI32(kernel.packet[4], kernel.packet[5]);
+  __m256i T6 = MM256_UNPACKLO_EPI32(kernel.packet[6], kernel.packet[7]);
+  __m256i T7 = MM256_UNPACKHI_EPI32(kernel.packet[6], kernel.packet[7]);
+  __m256i S0 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256i S1 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+  __m256i S2 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256i S3 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
+  __m256i S4 = MM256_SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256i S5 = MM256_SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
+  __m256i S6 = MM256_SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256i S7 = MM256_SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
+  kernel.packet[0] = _mm256_permute2f128_si256(S0, S4, 0x20);
+  kernel.packet[1] = _mm256_permute2f128_si256(S1, S5, 0x20);
+  kernel.packet[2] = _mm256_permute2f128_si256(S2, S6, 0x20);
+  kernel.packet[3] = _mm256_permute2f128_si256(S3, S7, 0x20);
+  kernel.packet[4] = _mm256_permute2f128_si256(S0, S4, 0x31);
+  kernel.packet[5] = _mm256_permute2f128_si256(S1, S5, 0x31);
+  kernel.packet[6] = _mm256_permute2f128_si256(S2, S6, 0x31);
+  kernel.packet[7] = _mm256_permute2f128_si256(S3, S7, 0x31);
+}
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8ui, 8>& kernel) {
+  ptranspose((PacketBlock<Packet8i, 8>&)kernel);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8i, 4>& kernel) {
+  __m256i T0 = MM256_UNPACKLO_EPI32(kernel.packet[0], kernel.packet[1]);
+  __m256i T1 = MM256_UNPACKHI_EPI32(kernel.packet[0], kernel.packet[1]);
+  __m256i T2 = MM256_UNPACKLO_EPI32(kernel.packet[2], kernel.packet[3]);
+  __m256i T3 = MM256_UNPACKHI_EPI32(kernel.packet[2], kernel.packet[3]);
+
+  __m256i S0 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256i S1 = MM256_SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+  __m256i S2 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+  __m256i S3 = MM256_SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
+
+  kernel.packet[0] = _mm256_permute2f128_si256(S0, S1, 0x20);
+  kernel.packet[1] = _mm256_permute2f128_si256(S2, S3, 0x20);
+  kernel.packet[2] = _mm256_permute2f128_si256(S0, S1, 0x31);
+  kernel.packet[3] = _mm256_permute2f128_si256(S2, S3, 0x31);
+}
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8ui, 4>& kernel) {
+  ptranspose((PacketBlock<Packet8i, 4>&)kernel);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4d, 4>& kernel) {
+  __m256d T0 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 15);
+  __m256d T1 = _mm256_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
+  __m256d T2 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 15);
+  __m256d T3 = _mm256_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
+
+  kernel.packet[1] = _mm256_permute2f128_pd(T0, T2, 32);
+  kernel.packet[3] = _mm256_permute2f128_pd(T0, T2, 49);
+  kernel.packet[0] = _mm256_permute2f128_pd(T1, T3, 32);
+  kernel.packet[2] = _mm256_permute2f128_pd(T1, T3, 49);
+}
+
+EIGEN_STRONG_INLINE __m256i avx_blend_mask(const Selector<4>& ifPacket) {
+  return _mm256_set_epi64x(0 - ifPacket.select[3], 0 - ifPacket.select[2], 0 - ifPacket.select[1],
+                           0 - ifPacket.select[0]);
+}
+
+EIGEN_STRONG_INLINE __m256i avx_blend_mask(const Selector<8>& ifPacket) {
+  return _mm256_set_epi32(0 - ifPacket.select[7], 0 - ifPacket.select[6], 0 - ifPacket.select[5],
+                          0 - ifPacket.select[4], 0 - ifPacket.select[3], 0 - ifPacket.select[2],
+                          0 - ifPacket.select[1], 0 - ifPacket.select[0]);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pblend(const Selector<8>& ifPacket, const Packet8f& thenPacket,
+                                    const Packet8f& elsePacket) {
+  const __m256 true_mask = _mm256_castsi256_ps(avx_blend_mask(ifPacket));
+  return pselect<Packet8f>(true_mask, thenPacket, elsePacket);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d pblend(const Selector<4>& ifPacket, const Packet4d& thenPacket,
+                                    const Packet4d& elsePacket) {
+  const __m256d true_mask = _mm256_castsi256_pd(avx_blend_mask(ifPacket));
+  return pselect<Packet4d>(true_mask, thenPacket, elsePacket);
+}
+
+// Packet math for Eigen::half
+#ifndef EIGEN_VECTORIZE_AVX512FP16
+template <>
+struct unpacket_traits<Packet8h> {
+  typedef Eigen::half type;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet8h half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
+  return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
+  return numext::bit_cast<Eigen::half>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
+  return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
+  _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ploaddup<Packet8h>(const Eigen::half* from) {
+  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
+  const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
+  const numext::uint16_t d = numext::bit_cast<numext::uint16_t>(from[3]);
+  return _mm_set_epi16(d, d, c, c, b, b, a, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ploadquad<Packet8h>(const Eigen::half* from) {
+  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
+  return _mm_set_epi16(b, b, b, b, a, a, a, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ptrue(const Packet8h& a) {
+  return _mm_cmpeq_epi32(a, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pabs(const Packet8h& a) {
+  const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm_andnot_si128(sign_mask, a);
+}
+
+EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
+#ifdef EIGEN_HAS_FP16_C
+  return _mm256_cvtph_ps(a);
+#else
+  Eigen::internal::Packet8f pp = _mm256_castsi256_ps(
+      _mm256_insertf128_si256(_mm256_castsi128_si256(half2floatsse(a)), half2floatsse(_mm_srli_si128(a, 8)), 1));
+  return pp;
+#endif
+}
+
+EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
+#ifdef EIGEN_HAS_FP16_C
+  return _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT);
+#else
+  __m128i lo = float2half(_mm256_extractf128_ps(a, 0));
+  __m128i hi = float2half(_mm256_extractf128_ps(a, 1));
+  return _mm_packus_epi32(lo, hi);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmin<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return float2half(pmin<Packet8f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmax<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return float2half(pmax<Packet8f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h plset<Packet8h>(const half& a) {
+  return float2half(plset<Packet8f>(static_cast<float>(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a, const Packet8h& b) {
+  // in some cases Packet4i is a wrapper around __m128i, so we either need to
+  // cast to Packet4i to directly call the intrinsics as below:
+  return _mm_or_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a, const Packet8h& b) {
+  return _mm_xor_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a, const Packet8h& b) {
+  return _mm_and_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a, const Packet8h& b) {
+  return _mm_andnot_si128(b, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
+  return _mm_blendv_epi8(b, a, mask);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pround<Packet8h>(const Packet8h& a) {
+  return float2half(pround<Packet8f>(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h print<Packet8h>(const Packet8h& a) {
+  return float2half(print<Packet8f>(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pceil<Packet8h>(const Packet8h& a) {
+  return float2half(pceil<Packet8f>(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) {
+  return float2half(pfloor<Packet8f>(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ptrunc<Packet8h>(const Packet8h& a) {
+  return float2half(ptrunc<Packet8f>(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pisinf<Packet8h>(const Packet8h& a) {
+  constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
+  constexpr uint16_t kAbsMask = (1 << 15) - 1;
+  return _mm_cmpeq_epi16(_mm_and_si128(a.m_val, _mm_set1_epi16(kAbsMask)), _mm_set1_epi16(kInf));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pisnan<Packet8h>(const Packet8h& a) {
+  constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
+  constexpr uint16_t kAbsMask = (1 << 15) - 1;
+  return _mm_cmpgt_epi16(_mm_and_si128(a.m_val, _mm_set1_epi16(kAbsMask)), _mm_set1_epi16(kInf));
+}
+
+// convert the sign-magnitude representation to two's complement
+EIGEN_STRONG_INLINE __m128i pmaptosigned(const __m128i& a) {
+  constexpr uint16_t kAbsMask = (1 << 15) - 1;
+  // if 'a' has the sign bit set, clear the sign bit and negate the result as if it were an integer
+  return _mm_sign_epi16(_mm_and_si128(a, _mm_set1_epi16(kAbsMask)), a);
+}
+
+// return true if both `a` and `b` are not NaN
+EIGEN_STRONG_INLINE Packet8h pisordered(const Packet8h& a, const Packet8h& b) {
+  constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
+  constexpr uint16_t kAbsMask = (1 << 15) - 1;
+  __m128i abs_a = _mm_and_si128(a.m_val, _mm_set1_epi16(kAbsMask));
+  __m128i abs_b = _mm_and_si128(b.m_val, _mm_set1_epi16(kAbsMask));
+  // check if both `abs_a <= kInf` and `abs_b <= kInf` by checking if max(abs_a, abs_b) <= kInf
+  // SSE has no `lesser or equal` instruction for integers, but comparing against kInf + 1 accomplishes the same goal
+  return _mm_cmplt_epi16(_mm_max_epu16(abs_a, abs_b), _mm_set1_epi16(kInf + 1));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a, const Packet8h& b) {
+  __m128i isOrdered = pisordered(a, b);
+  __m128i isEqual = _mm_cmpeq_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
+  return _mm_and_si128(isOrdered, isEqual);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a, const Packet8h& b) {
+  __m128i isOrdered = pisordered(a, b);
+  __m128i isGreater = _mm_cmpgt_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
+  return _mm_andnot_si128(isGreater, isOrdered);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a, const Packet8h& b) {
+  __m128i isOrdered = pisordered(a, b);
+  __m128i isLess = _mm_cmplt_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
+  return _mm_and_si128(isOrdered, isLess);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a, const Packet8h& b) {
+  __m128i isUnordered = por(pisnan(a), pisnan(b));
+  __m128i isLess = _mm_cmplt_epi16(pmaptosigned(a.m_val), pmaptosigned(b.m_val));
+  return _mm_or_si128(isUnordered, isLess);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnegate(const Packet8h& a) {
+  Packet8h sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm_xor_si128(a, sign_mask);
+}
+
+#ifndef EIGEN_VECTORIZE_AVX512FP16
+template <>
+EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = padd(af, bf);
+  return float2half(rf);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = psub(af, bf);
+  return float2half(rf);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = pmul(af, bf);
+  return float2half(rf);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmadd<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return float2half(pmadd(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmsub<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return float2half(pmsub(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmadd<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return float2half(pnmadd(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmsub<Packet8h>(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return float2half(pnmsub(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  Packet8f af = half2float(a);
+  Packet8f bf = half2float(b);
+  Packet8f rf = pdiv(af, bf);
+  return float2half(rf);
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride) {
+  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0 * stride]);
+  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1 * stride]);
+  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2 * stride]);
+  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3 * stride]);
+  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4 * stride]);
+  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5 * stride]);
+  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6 * stride]);
+  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7 * stride]);
+  return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride) {
+  EIGEN_ALIGN32 Eigen::half aux[8];
+  pstore(aux, from);
+  to[stride * 0] = aux[0];
+  to[stride * 1] = aux[1];
+  to[stride * 2] = aux[2];
+  to[stride * 3] = aux[3];
+  to[stride * 4] = aux[4];
+  to[stride * 5] = aux[5];
+  to[stride * 6] = aux[6];
+  to[stride * 7] = aux[7];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) {
+  __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+  return _mm_shuffle_epi8(a, m);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 8>& kernel) {
+  __m128i a = kernel.packet[0];
+  __m128i b = kernel.packet[1];
+  __m128i c = kernel.packet[2];
+  __m128i d = kernel.packet[3];
+  __m128i e = kernel.packet[4];
+  __m128i f = kernel.packet[5];
+  __m128i g = kernel.packet[6];
+  __m128i h = kernel.packet[7];
+
+  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
+  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
+  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
+  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
+  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
+  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
+  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
+  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
+
+  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
+  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
+  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
+  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
+  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
+  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
+  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
+  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
+
+  __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
+  __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
+
+  kernel.packet[0] = a0b0c0d0e0f0g0h0;
+  kernel.packet[1] = a1b1c1d1e1f1g1h1;
+  kernel.packet[2] = a2b2c2d2e2f2g2h2;
+  kernel.packet[3] = a3b3c3d3e3f3g3h3;
+  kernel.packet[4] = a4b4c4d4e4f4g4h4;
+  kernel.packet[5] = a5b5c5d5e5f5g5h5;
+  kernel.packet[6] = a6b6c6d6e6f6g6h6;
+  kernel.packet[7] = a7b7c7d7e7f7g7h7;
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 4>& kernel) {
+  EIGEN_ALIGN32 Eigen::half in[4][8];
+  pstore<Eigen::half>(in[0], kernel.packet[0]);
+  pstore<Eigen::half>(in[1], kernel.packet[1]);
+  pstore<Eigen::half>(in[2], kernel.packet[2]);
+  pstore<Eigen::half>(in[3], kernel.packet[3]);
+
+  EIGEN_ALIGN32 Eigen::half out[4][8];
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      out[i][j] = in[j][2 * i];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j + 4] = in[j][2 * i + 1];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet8h>(out[0]);
+  kernel.packet[1] = pload<Packet8h>(out[1]);
+  kernel.packet[2] = pload<Packet8h>(out[2]);
+  kernel.packet[3] = pload<Packet8h>(out[3]);
+}
+
+#endif
+
+// BFloat16 implementation.
+
+EIGEN_STRONG_INLINE Packet8f Bf16ToF32(const Packet8bf& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  __m256i extend = _mm256_cvtepu16_epi32(a);
+  return _mm256_castsi256_ps(_mm256_slli_epi32(extend, 16));
+#else
+  __m128i lo = _mm_cvtepu16_epi32(a);
+  __m128i hi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
+  __m128i lo_shift = _mm_slli_epi32(lo, 16);
+  __m128i hi_shift = _mm_slli_epi32(hi, 16);
+  return _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(lo_shift), hi_shift, 1));
+#endif
+}
+
+// Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm.
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16(const Packet8f& a) {
+  __m256i input = _mm256_castps_si256(a);
+
+#ifdef EIGEN_VECTORIZE_AVX2
+  // uint32_t lsb = (input >> 16);
+  __m256i t = _mm256_srli_epi32(input, 16);
+  // uint32_t lsb = lsb & 1;
+  t = _mm256_and_si256(t, _mm256_set1_epi32(1));
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  t = _mm256_add_epi32(t, _mm256_set1_epi32(0x7fff));
+  // input += rounding_bias;
+  t = _mm256_add_epi32(t, input);
+  // input = input >> 16;
+  t = _mm256_srli_epi32(t, 16);
+  // Check NaN before converting back to bf16
+  __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
+  __m256i nan = _mm256_set1_epi32(0x7fc0);
+  t = _mm256_blendv_epi8(nan, t, _mm256_castps_si256(mask));
+  // output = numext::bit_cast<uint16_t>(input);
+  return _mm_packus_epi32(_mm256_extractf128_si256(t, 0), _mm256_extractf128_si256(t, 1));
+#else
+  // uint32_t lsb = (input >> 16);
+  __m128i lo = _mm_srli_epi32(_mm256_extractf128_si256(input, 0), 16);
+  __m128i hi = _mm_srli_epi32(_mm256_extractf128_si256(input, 1), 16);
+  // uint32_t lsb = lsb & 1;
+  lo = _mm_and_si128(lo, _mm_set1_epi32(1));
+  hi = _mm_and_si128(hi, _mm_set1_epi32(1));
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  lo = _mm_add_epi32(lo, _mm_set1_epi32(0x7fff));
+  hi = _mm_add_epi32(hi, _mm_set1_epi32(0x7fff));
+  // input += rounding_bias;
+  lo = _mm_add_epi32(lo, _mm256_extractf128_si256(input, 0));
+  hi = _mm_add_epi32(hi, _mm256_extractf128_si256(input, 1));
+  // input = input >> 16;
+  lo = _mm_srli_epi32(lo, 16);
+  hi = _mm_srli_epi32(hi, 16);
+  // Check NaN before converting back to bf16
+  __m256 mask = _mm256_cmp_ps(a, a, _CMP_ORD_Q);
+  __m128i nan = _mm_set1_epi32(0x7fc0);
+  lo = _mm_blendv_epi8(nan, lo, _mm_castps_si128(_mm256_castps256_ps128(mask)));
+  hi = _mm_blendv_epi8(nan, hi, _mm_castps_si128(_mm256_extractf128_ps(mask, 1)));
+  // output = numext::bit_cast<uint16_t>(input);
+  return _mm_packus_epi32(lo, hi);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
+  return _mm_set1_epi16(numext::bit_cast<numext::uint16_t>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 pfirst<Packet8bf>(const Packet8bf& from) {
+  return numext::bit_cast<bfloat16>(static_cast<numext::uint16_t>(_mm_extract_epi16(from, 0)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
+  return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
+  _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
+  _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from) {
+  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
+  const numext::uint16_t c = numext::bit_cast<numext::uint16_t>(from[2]);
+  const numext::uint16_t d = numext::bit_cast<numext::uint16_t>(from[3]);
+  return _mm_set_epi16(d, d, c, c, b, b, a, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from) {
+  const numext::uint16_t a = numext::bit_cast<numext::uint16_t>(from[0]);
+  const numext::uint16_t b = numext::bit_cast<numext::uint16_t>(from[1]);
+  return _mm_set_epi16(b, b, b, b, a, a, a, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf ptrue(const Packet8bf& a) {
+  return _mm_cmpeq_epi32(a, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
+  const __m128i sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm_andnot_si128(sign_mask, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return F32ToBf16(pmin<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return F32ToBf16(pmax<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
+  return F32ToBf16(plset<Packet8f>(static_cast<float>(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf por(const Packet8bf& a, const Packet8bf& b) {
+  return _mm_or_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pxor(const Packet8bf& a, const Packet8bf& b) {
+  return _mm_xor_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pand(const Packet8bf& a, const Packet8bf& b) {
+  return _mm_and_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pandnot(const Packet8bf& a, const Packet8bf& b) {
+  return _mm_andnot_si128(b, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pselect(const Packet8bf& mask, const Packet8bf& a, const Packet8bf& b) {
+  return _mm_blendv_epi8(b, a, mask);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a) {
+  return F32ToBf16(pround<Packet8f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
+  return F32ToBf16(print<Packet8f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
+  return F32ToBf16(pceil<Packet8f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
+  return F32ToBf16(pfloor<Packet8f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf ptrunc<Packet8bf>(const Packet8bf& a) {
+  return F32ToBf16(ptrunc<Packet8f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
+  return Pack16To8(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
+  return Pack16To8(pcmp_le(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
+  return Pack16To8(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
+  return Pack16To8(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pconj(const Packet8bf& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnegate(const Packet8bf& a) {
+  Packet8bf sign_mask = _mm_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm_xor_si128(a, sign_mask);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return F32ToBf16(padd<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return F32ToBf16(psub<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return F32ToBf16(pmul<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmadd<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  return F32ToBf16(pmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmsub<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  return F32ToBf16(pmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnmadd<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  return F32ToBf16(pnmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnmsub<Packet8bf>(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  return F32ToBf16(pnmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return F32ToBf16(pdiv<Packet8f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride) {
+  const numext::uint16_t s0 = numext::bit_cast<numext::uint16_t>(from[0 * stride]);
+  const numext::uint16_t s1 = numext::bit_cast<numext::uint16_t>(from[1 * stride]);
+  const numext::uint16_t s2 = numext::bit_cast<numext::uint16_t>(from[2 * stride]);
+  const numext::uint16_t s3 = numext::bit_cast<numext::uint16_t>(from[3 * stride]);
+  const numext::uint16_t s4 = numext::bit_cast<numext::uint16_t>(from[4 * stride]);
+  const numext::uint16_t s5 = numext::bit_cast<numext::uint16_t>(from[5 * stride]);
+  const numext::uint16_t s6 = numext::bit_cast<numext::uint16_t>(from[6 * stride]);
+  const numext::uint16_t s7 = numext::bit_cast<numext::uint16_t>(from[7 * stride]);
+  return _mm_set_epi16(s7, s6, s5, s4, s3, s2, s1, s0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from, Index stride) {
+  EIGEN_ALIGN32 bfloat16 aux[8];
+  pstore(aux, from);
+  to[stride * 0] = aux[0];
+  to[stride * 1] = aux[1];
+  to[stride * 2] = aux[2];
+  to[stride * 3] = aux[3];
+  to[stride * 4] = aux[4];
+  to[stride * 5] = aux[5];
+  to[stride * 6] = aux[6];
+  to[stride * 7] = aux[7];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) {
+  __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+  return _mm_shuffle_epi8(a, m);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8bf, 8>& kernel) {
+  __m128i a = kernel.packet[0];
+  __m128i b = kernel.packet[1];
+  __m128i c = kernel.packet[2];
+  __m128i d = kernel.packet[3];
+  __m128i e = kernel.packet[4];
+  __m128i f = kernel.packet[5];
+  __m128i g = kernel.packet[6];
+  __m128i h = kernel.packet[7];
+
+  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
+  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
+  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
+  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
+  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
+  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
+  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
+  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
+
+  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
+  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
+  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
+  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
+  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
+  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
+  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
+  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
+
+  kernel.packet[0] = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
+  kernel.packet[1] = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
+  kernel.packet[2] = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
+  kernel.packet[3] = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
+  kernel.packet[4] = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
+  kernel.packet[5] = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
+  kernel.packet[6] = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
+  kernel.packet[7] = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
+  __m128i a = kernel.packet[0];
+  __m128i b = kernel.packet[1];
+  __m128i c = kernel.packet[2];
+  __m128i d = kernel.packet[3];
+
+  __m128i ab_03 = _mm_unpacklo_epi16(a, b);
+  __m128i cd_03 = _mm_unpacklo_epi16(c, d);
+  __m128i ab_47 = _mm_unpackhi_epi16(a, b);
+  __m128i cd_47 = _mm_unpackhi_epi16(c, d);
+
+  kernel.packet[0] = _mm_unpacklo_epi32(ab_03, cd_03);
+  kernel.packet[1] = _mm_unpackhi_epi32(ab_03, cd_03);
+  kernel.packet[2] = _mm_unpacklo_epi32(ab_47, cd_47);
+  kernel.packet[3] = _mm_unpackhi_epi32(ab_47, cd_47);
+}
+
+/*---------------- load/store segment support ----------------*/
+
+// returns a mask of 8-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m128i segment_mask_4x8(Index begin, Index count) {
+  eigen_assert(begin >= 0 && begin + count <= 4);
+  long long mask = 1;
+  mask <<= CHAR_BIT * count;
+  mask--;
+  mask <<= CHAR_BIT * begin;
+#if !EIGEN_ARCH_x86_64
+  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&mask));
+#else
+  return _mm_cvtsi64_si128(mask);
+#endif
+}
+
+// returns a mask of 8-bit elements (at most 8) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m128i segment_mask_8x8(Index begin, Index count) {
+  eigen_assert(begin >= 0 && begin + count <= 8);
+  long long mask = 1;
+  // avoid UB when count == 8
+  mask <<= (CHAR_BIT / 2) * count;
+  mask <<= (CHAR_BIT / 2) * count;
+  mask--;
+  mask <<= CHAR_BIT * begin;
+#if !EIGEN_ARCH_x86_64
+  return _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&mask));
+#else
+  return _mm_cvtsi64_si128(mask);
+#endif
+}
+
+// returns a mask of 32-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m128i segment_mask_4x32(Index begin, Index count) {
+  eigen_assert(begin >= 0 && begin + count <= 4);
+  return _mm_cvtepi8_epi32(segment_mask_4x8(begin, count));
+}
+
+// returns a mask of 64-bit elements (at most 2) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m128i segment_mask_2x64(Index begin, Index count) {
+  eigen_assert(begin >= 0 && begin + count <= 2);
+  return _mm_cvtepi8_epi64(segment_mask_4x8(begin, count));
+}
+
+// returns a mask of 32-bit elements (at most 8) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m256i segment_mask_8x32(Index begin, Index count) {
+  __m128i mask_epi8 = segment_mask_8x8(begin, count);
+#ifdef EIGEN_VECTORIZE_AVX2
+  __m256i mask_epi32 = _mm256_cvtepi8_epi32(mask_epi8);
+#else
+  __m128i mask_epi32_lo = _mm_cvtepi8_epi32(mask_epi8);
+  __m128i mask_epi32_hi = _mm_cvtepi8_epi32(_mm_srli_epi64(mask_epi8, 32));
+  __m256i mask_epi32 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask_epi32_lo), mask_epi32_hi, 1);
+#endif
+  return mask_epi32;
+}
+
+// returns a mask of 64-bit elements (at most 4) that are all 1's in the range [begin, begin + count) and 0 elsewhere.
+inline __m256i segment_mask_4x64(Index begin, Index count) {
+  __m128i mask_epi8 = segment_mask_4x8(begin, count);
+#ifdef EIGEN_VECTORIZE_AVX2
+  __m256i mask_epi64 = _mm256_cvtepi8_epi64(mask_epi8);
+#else
+  __m128i mask_epi64_lo = _mm_cvtepi8_epi64(mask_epi8);
+  __m128i mask_epi64_hi = _mm_cvtepi8_epi64(_mm_srli_epi64(mask_epi8, 16));
+  __m256i mask_epi64 = _mm256_insertf128_si256(_mm256_castsi128_si256(mask_epi64_lo), mask_epi64_hi, 1);
+#endif
+  return mask_epi64;
+}
+
+/*---------------- float ----------------*/
+
+template <>
+struct has_packet_segment<Packet4f> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet8f> : std::true_type {};
+
+template <>
+inline Packet4f ploaduSegment<Packet4f>(const float* from, Index begin, Index count) {
+  return _mm_maskload_ps(from, segment_mask_4x32(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<float, Packet4f>(float* to, const Packet4f& from, Index begin, Index count) {
+  _mm_maskstore_ps(to, segment_mask_4x32(begin, count), from);
+}
+
+template <>
+inline Packet8f ploaduSegment<Packet8f>(const float* from, Index begin, Index count) {
+  return _mm256_maskload_ps(from, segment_mask_8x32(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<float, Packet8f>(float* to, const Packet8f& from, Index begin, Index count) {
+  _mm256_maskstore_ps(to, segment_mask_8x32(begin, count), from);
+}
+
+/*---------------- int32 ----------------*/
+
+template <>
+struct has_packet_segment<Packet4i> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet8i> : std::true_type {};
+
+#ifdef EIGEN_VECTORIZE_AVX2
+
+template <>
+inline Packet4i ploaduSegment<Packet4i>(const int* from, Index begin, Index count) {
+  return _mm_maskload_epi32(from, segment_mask_4x32(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<int, Packet4i>(int* to, const Packet4i& from, Index begin, Index count) {
+  _mm_maskstore_epi32(to, segment_mask_4x32(begin, count), from);
+}
+
+template <>
+inline Packet8i ploaduSegment<Packet8i>(const int* from, Index begin, Index count) {
+  return _mm256_maskload_epi32(from, segment_mask_8x32(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<int, Packet8i>(int* to, const Packet8i& from, Index begin, Index count) {
+  _mm256_maskstore_epi32(to, segment_mask_8x32(begin, count), from);
+}
+
+#else
+
+template <>
+inline Packet4i ploaduSegment<Packet4i>(const int* from, Index begin, Index count) {
+  return _mm_castps_si128(ploaduSegment<Packet4f>(reinterpret_cast<const float*>(from), begin, count));
+}
+
+template <>
+inline void pstoreuSegment<int, Packet4i>(int* to, const Packet4i& from, Index begin, Index count) {
+  pstoreuSegment<float, Packet4f>(reinterpret_cast<float*>(to), _mm_castsi128_ps(from), begin, count);
+}
+
+template <>
+inline Packet8i ploaduSegment<Packet8i>(const int* from, Index begin, Index count) {
+  return _mm256_castps_si256(ploaduSegment<Packet8f>(reinterpret_cast<const float*>(from), begin, count));
+}
+
+template <>
+inline void pstoreuSegment<int, Packet8i>(int* to, const Packet8i& from, Index begin, Index count) {
+  pstoreuSegment<float, Packet8f>(reinterpret_cast<float*>(to), _mm256_castsi256_ps(from), begin, count);
+}
+
+#endif
+
+/*---------------- uint32 ----------------*/
+
+template <>
+struct has_packet_segment<Packet4ui> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet8ui> : std::true_type {};
+
+template <>
+inline Packet4ui ploaduSegment<Packet4ui>(const uint32_t* from, Index begin, Index count) {
+  return Packet4ui(ploaduSegment<Packet4i>(reinterpret_cast<const int*>(from), begin, count));
+}
+
+template <>
+inline void pstoreuSegment<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index begin, Index count) {
+  pstoreuSegment<int, Packet4i>(reinterpret_cast<int*>(to), Packet4i(from), begin, count);
+}
+
+template <>
+inline Packet8ui ploaduSegment<Packet8ui>(const uint32_t* from, Index begin, Index count) {
+  return Packet8ui(ploaduSegment<Packet8i>(reinterpret_cast<const int*>(from), begin, count));
+}
+
+template <>
+inline void pstoreuSegment<uint32_t, Packet8ui>(uint32_t* to, const Packet8ui& from, Index begin, Index count) {
+  pstoreuSegment<int, Packet8i>(reinterpret_cast<int*>(to), Packet8i(from), begin, count);
+}
+
+/*---------------- double ----------------*/
+
+template <>
+struct has_packet_segment<Packet2d> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet4d> : std::true_type {};
+
+template <>
+inline Packet2d ploaduSegment<Packet2d>(const double* from, Index begin, Index count) {
+  return _mm_maskload_pd(from, segment_mask_2x64(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<double, Packet2d>(double* to, const Packet2d& from, Index begin, Index count) {
+  _mm_maskstore_pd(to, segment_mask_2x64(begin, count), from);
+}
+
+template <>
+inline Packet4d ploaduSegment<Packet4d>(const double* from, Index begin, Index count) {
+  return _mm256_maskload_pd(from, segment_mask_4x64(begin, count));
+}
+
+template <>
+inline void pstoreuSegment<double, Packet4d>(double* to, const Packet4d& from, Index begin, Index count) {
+  _mm256_maskstore_pd(to, segment_mask_4x64(begin, count), from);
+}
+
+#ifdef EIGEN_VECTORIZE_AVX2
+
+/*---------------- int64_t ----------------*/
+
+template <>
+struct has_packet_segment<Packet2l> : std::true_type {};
+
+template <>
+struct has_packet_segment<Packet4l> : std::true_type {};
+
+template <>
+inline Packet2l ploaduSegment<Packet2l>(const int64_t* from, Index begin, Index count) {
+  return _mm_maskload_epi64(reinterpret_cast<const long long*>(from), segment_mask_2x64(begin, count));
+}
+template <>
+inline void pstoreuSegment<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index begin, Index count) {
+  _mm_maskstore_epi64(reinterpret_cast<long long*>(to), segment_mask_2x64(begin, count), from);
+}
+template <>
+inline Packet4l ploaduSegment<Packet4l>(const int64_t* from, Index begin, Index count) {
+  return _mm256_maskload_epi64(reinterpret_cast<const long long*>(from), segment_mask_4x64(begin, count));
+}
+template <>
+inline void pstoreuSegment<int64_t, Packet4l>(int64_t* to, const Packet4l& from, Index begin, Index count) {
+  _mm256_maskstore_epi64(reinterpret_cast<long long*>(to), segment_mask_4x64(begin, count), from);
+}
+
+/*---------------- uint64_t ----------------*/
+
+template <>
+struct has_packet_segment<Packet4ul> : std::true_type {};
+
+template <>
+inline Packet4ul ploaduSegment<Packet4ul>(const uint64_t* from, Index begin, Index count) {
+  return Packet4ul(ploaduSegment<Packet4l>(reinterpret_cast<const int64_t*>(from), begin, count));
+}
+template <>
+inline void pstoreuSegment<uint64_t, Packet4ul>(uint64_t* to, const Packet4ul& from, Index begin, Index count) {
+  pstoreuSegment<int64_t, Packet4l>(reinterpret_cast<int64_t*>(to), Packet4l(from), begin, count);
+}
+#endif
+
+/*---------------- end load/store segment support ----------------*/
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_AVX_H
diff --git a/inst/include/Eigen/src/Core/arch/AVX/Reductions.h b/inst/include/Eigen/src/Core/arch/AVX/Reductions.h
new file mode 100644
index 00000000..237617c5
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AVX/Reductions.h
@@ -0,0 +1,353 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REDUCTIONS_AVX_H
+#define EIGEN_REDUCTIONS_AVX_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8i -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE int predux(const Packet8i& a) {
+  Packet4i lo = _mm256_castsi256_si128(a);
+  Packet4i hi = _mm256_extractf128_si256(a, 1);
+  return predux(padd(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux_mul(const Packet8i& a) {
+  Packet4i lo = _mm256_castsi256_si128(a);
+  Packet4i hi = _mm256_extractf128_si256(a, 1);
+  return predux_mul(pmul(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux_min(const Packet8i& a) {
+  Packet4i lo = _mm256_castsi256_si128(a);
+  Packet4i hi = _mm256_extractf128_si256(a, 1);
+  return predux_min(pmin(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux_max(const Packet8i& a) {
+  Packet4i lo = _mm256_castsi256_si128(a);
+  Packet4i hi = _mm256_extractf128_si256(a, 1);
+  return predux_max(pmax(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8i& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_movemask_epi8(a) != 0x0;
+#else
+  return _mm256_movemask_ps(_mm256_castsi256_ps(a)) != 0x0;
+#endif
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8ui -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE uint32_t predux(const Packet8ui& a) {
+  Packet4ui lo = _mm256_castsi256_si128(a);
+  Packet4ui hi = _mm256_extractf128_si256(a, 1);
+  return predux(padd(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet8ui& a) {
+  Packet4ui lo = _mm256_castsi256_si128(a);
+  Packet4ui hi = _mm256_extractf128_si256(a, 1);
+  return predux_mul(pmul(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min(const Packet8ui& a) {
+  Packet4ui lo = _mm256_castsi256_si128(a);
+  Packet4ui hi = _mm256_extractf128_si256(a, 1);
+  return predux_min(pmin(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max(const Packet8ui& a) {
+  Packet4ui lo = _mm256_castsi256_si128(a);
+  Packet4ui hi = _mm256_extractf128_si256(a, 1);
+  return predux_max(pmax(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8ui& a) {
+#ifdef EIGEN_VECTORIZE_AVX2
+  return _mm256_movemask_epi8(a) != 0x0;
+#else
+  return _mm256_movemask_ps(_mm256_castsi256_ps(a)) != 0x0;
+#endif
+}
+
+#ifdef EIGEN_VECTORIZE_AVX2
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4l -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE int64_t predux(const Packet4l& a) {
+  Packet2l lo = _mm256_castsi256_si128(a);
+  Packet2l hi = _mm256_extractf128_si256(a, 1);
+  return predux(padd(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4l& a) {
+  return _mm256_movemask_pd(_mm256_castsi256_pd(a)) != 0x0;
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4ul -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE uint64_t predux(const Packet4ul& a) {
+  return static_cast<uint64_t>(predux(Packet4l(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4ul& a) {
+  return _mm256_movemask_pd(_mm256_castsi256_pd(a)) != 0x0;
+}
+
+#endif
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8f -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE float predux(const Packet8f& a) {
+  Packet4f lo = _mm256_castps256_ps128(a);
+  Packet4f hi = _mm256_extractf128_ps(a, 1);
+  return predux(padd(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_mul(const Packet8f& a) {
+  Packet4f lo = _mm256_castps256_ps128(a);
+  Packet4f hi = _mm256_extractf128_ps(a, 1);
+  return predux_mul(pmul(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) {
+  Packet4f lo = _mm256_castps256_ps128(a);
+  Packet4f hi = _mm256_extractf128_ps(a, 1);
+  return predux_min(pmin(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PropagateNumbers>(const Packet8f& a) {
+  Packet4f lo = _mm256_castps256_ps128(a);
+  Packet4f hi = _mm256_extractf128_ps(a, 1);
+  return predux_min<PropagateNumbers>(pmin<PropagateNumbers>(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PropagateNaN>(const Packet8f& a) {
+  Packet4f lo = _mm256_castps256_ps128(a);
+  Packet4f hi = _mm256_extractf128_ps(a, 1);
+  return predux_min<PropagateNaN>(pmin<PropagateNaN>(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) {
+  Packet4f lo = _mm256_castps256_ps128(a);
+  Packet4f hi = _mm256_extractf128_ps(a, 1);
+  return predux_max(pmax(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PropagateNumbers>(const Packet8f& a) {
+  Packet4f lo = _mm256_castps256_ps128(a);
+  Packet4f hi = _mm256_extractf128_ps(a, 1);
+  return predux_max<PropagateNumbers>(pmax<PropagateNumbers>(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PropagateNaN>(const Packet8f& a) {
+  Packet4f lo = _mm256_castps256_ps128(a);
+  Packet4f hi = _mm256_extractf128_ps(a, 1);
+  return predux_max<PropagateNaN>(pmax<PropagateNaN>(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8f& a) {
+  return _mm256_movemask_ps(a) != 0x0;
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4d -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE double predux(const Packet4d& a) {
+  Packet2d lo = _mm256_castpd256_pd128(a);
+  Packet2d hi = _mm256_extractf128_pd(a, 1);
+  return predux(padd(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_mul(const Packet4d& a) {
+  Packet2d lo = _mm256_castpd256_pd128(a);
+  Packet2d hi = _mm256_extractf128_pd(a, 1);
+  return predux_mul(pmul(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min(const Packet4d& a) {
+  Packet2d lo = _mm256_castpd256_pd128(a);
+  Packet2d hi = _mm256_extractf128_pd(a, 1);
+  return predux_min(pmin(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min<PropagateNumbers>(const Packet4d& a) {
+  Packet2d lo = _mm256_castpd256_pd128(a);
+  Packet2d hi = _mm256_extractf128_pd(a, 1);
+  return predux_min<PropagateNumbers>(pmin<PropagateNumbers>(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min<PropagateNaN>(const Packet4d& a) {
+  Packet2d lo = _mm256_castpd256_pd128(a);
+  Packet2d hi = _mm256_extractf128_pd(a, 1);
+  return predux_min<PropagateNaN>(pmin<PropagateNaN>(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max(const Packet4d& a) {
+  Packet2d lo = _mm256_castpd256_pd128(a);
+  Packet2d hi = _mm256_extractf128_pd(a, 1);
+  return predux_max(pmax(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max<PropagateNumbers>(const Packet4d& a) {
+  Packet2d lo = _mm256_castpd256_pd128(a);
+  Packet2d hi = _mm256_extractf128_pd(a, 1);
+  return predux_max<PropagateNumbers>(pmax<PropagateNumbers>(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max<PropagateNaN>(const Packet4d& a) {
+  Packet2d lo = _mm256_castpd256_pd128(a);
+  Packet2d hi = _mm256_extractf128_pd(a, 1);
+  return predux_max<PropagateNaN>(pmax<PropagateNaN>(lo, hi));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4d& a) {
+  return _mm256_movemask_pd(a) != 0x0;
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8h -- -- -- -- -- -- -- -- -- -- -- -- */
+#ifndef EIGEN_VECTORIZE_AVX512FP16
+
+template <>
+EIGEN_STRONG_INLINE half predux(const Packet8h& a) {
+  return static_cast<half>(predux(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_mul(const Packet8h& a) {
+  return static_cast<half>(predux_mul(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min(const Packet8h& a) {
+  return static_cast<half>(predux_min(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<PropagateNumbers>(const Packet8h& a) {
+  return static_cast<half>(predux_min<PropagateNumbers>(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<PropagateNaN>(const Packet8h& a) {
+  return static_cast<half>(predux_min<PropagateNaN>(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max(const Packet8h& a) {
+  return static_cast<half>(predux_max(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max<PropagateNumbers>(const Packet8h& a) {
+  return static_cast<half>(predux_max<PropagateNumbers>(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max<PropagateNaN>(const Packet8h& a) {
+  return static_cast<half>(predux_max<PropagateNaN>(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8h& a) {
+  return _mm_movemask_epi8(a) != 0;
+}
+#endif  // EIGEN_VECTORIZE_AVX512FP16
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8bf -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux<Packet8f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_mul<Packet8f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_min(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<PropagateNumbers>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_min<PropagateNumbers>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<PropagateNaN>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_min<PropagateNaN>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_max<Packet8f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max<PropagateNumbers>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_max<PropagateNumbers>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max<PropagateNaN>(const Packet8bf& a) {
+  return static_cast<bfloat16>(predux_max<PropagateNaN>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8bf& a) {
+  return _mm_movemask_epi8(a) != 0;
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_REDUCTIONS_AVX_H
diff --git a/inst/include/Eigen/src/Core/arch/AVX/TypeCasting.h b/inst/include/Eigen/src/Core/arch/AVX/TypeCasting.h
new file mode 100644
index 00000000..767e2d55
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AVX/TypeCasting.h
@@ -0,0 +1,308 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_AVX_H
+#define EIGEN_TYPE_CASTING_AVX_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_VECTORIZE_AVX512
+template <>
+struct type_casting_traits<float, bool> : vectorized_type_casting_traits<float, bool> {};
+template <>
+struct type_casting_traits<bool, float> : vectorized_type_casting_traits<bool, float> {};
+
+template <>
+struct type_casting_traits<float, int> : vectorized_type_casting_traits<float, int> {};
+template <>
+struct type_casting_traits<int, float> : vectorized_type_casting_traits<int, float> {};
+
+template <>
+struct type_casting_traits<float, double> : vectorized_type_casting_traits<float, double> {};
+template <>
+struct type_casting_traits<double, float> : vectorized_type_casting_traits<double, float> {};
+
+template <>
+struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {};
+template <>
+struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};
+
+template <>
+struct type_casting_traits<half, float> : vectorized_type_casting_traits<half, float> {};
+template <>
+struct type_casting_traits<float, half> : vectorized_type_casting_traits<float, half> {};
+
+template <>
+struct type_casting_traits<bfloat16, float> : vectorized_type_casting_traits<bfloat16, float> {};
+template <>
+struct type_casting_traits<float, bfloat16> : vectorized_type_casting_traits<float, bfloat16> {};
+
+#ifdef EIGEN_VECTORIZE_AVX2
+template <>
+struct type_casting_traits<double, int64_t> : vectorized_type_casting_traits<double, int64_t> {};
+template <>
+struct type_casting_traits<int64_t, double> : vectorized_type_casting_traits<int64_t, double> {};
+#endif
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet16b pcast<Packet8f, Packet16b>(const Packet8f& a, const Packet8f& b) {
+  __m256 nonzero_a = _mm256_cmp_ps(a, pzero(a), _CMP_NEQ_UQ);
+  __m256 nonzero_b = _mm256_cmp_ps(b, pzero(b), _CMP_NEQ_UQ);
+  constexpr char kFF = '\255';
+#ifndef EIGEN_VECTORIZE_AVX2
+  __m128i shuffle_mask128_a_lo = _mm_set_epi8(kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, 12, 8, 4, 0);
+  __m128i shuffle_mask128_a_hi = _mm_set_epi8(kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, 12, 8, 4, 0, kFF, kFF, kFF, kFF);
+  __m128i shuffle_mask128_b_lo = _mm_set_epi8(kFF, kFF, kFF, kFF, 12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF);
+  __m128i shuffle_mask128_b_hi = _mm_set_epi8(12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF);
+  __m128i a_hi = _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castps_si256(nonzero_a), 1), shuffle_mask128_a_hi);
+  __m128i a_lo = _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castps_si256(nonzero_a), 0), shuffle_mask128_a_lo);
+  __m128i b_hi = _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castps_si256(nonzero_b), 1), shuffle_mask128_b_hi);
+  __m128i b_lo = _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castps_si256(nonzero_b), 0), shuffle_mask128_b_lo);
+  __m128i merged = _mm_or_si128(_mm_or_si128(b_lo, b_hi), _mm_or_si128(a_lo, a_hi));
+  return _mm_and_si128(merged, _mm_set1_epi8(1));
+#else
+  __m256i a_shuffle_mask = _mm256_set_epi8(kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, 12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF,
+                                           kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, 12, 8, 4, 0);
+  __m256i b_shuffle_mask = _mm256_set_epi8(12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF,
+                                           kFF, kFF, kFF, 12, 8, 4, 0, kFF, kFF, kFF, kFF, kFF, kFF, kFF, kFF);
+  __m256i a_shuff = _mm256_shuffle_epi8(_mm256_castps_si256(nonzero_a), a_shuffle_mask);
+  __m256i b_shuff = _mm256_shuffle_epi8(_mm256_castps_si256(nonzero_b), b_shuffle_mask);
+  __m256i a_or_b = _mm256_or_si256(a_shuff, b_shuff);
+  __m256i merged = _mm256_or_si256(a_or_b, _mm256_castsi128_si256(_mm256_extractf128_si256(a_or_b, 1)));
+  return _mm256_castsi256_si128(_mm256_and_si256(merged, _mm256_set1_epi8(1)));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet16b, Packet8f>(const Packet16b& a) {
+  const __m256 cst_one = _mm256_set1_ps(1.0f);
+#ifdef EIGEN_VECTORIZE_AVX2
+  __m256i a_extended = _mm256_cvtepi8_epi32(a);
+  __m256i abcd_efgh = _mm256_cmpeq_epi32(a_extended, _mm256_setzero_si256());
+#else
+  __m128i abcd_efhg_ijkl_mnop = _mm_cmpeq_epi8(a, _mm_setzero_si128());
+  __m128i aabb_ccdd_eeff_gghh = _mm_unpacklo_epi8(abcd_efhg_ijkl_mnop, abcd_efhg_ijkl_mnop);
+  __m128i aaaa_bbbb_cccc_dddd = _mm_unpacklo_epi8(aabb_ccdd_eeff_gghh, aabb_ccdd_eeff_gghh);
+  __m128i eeee_ffff_gggg_hhhh = _mm_unpackhi_epi8(aabb_ccdd_eeff_gghh, aabb_ccdd_eeff_gghh);
+  __m256i abcd_efgh = _mm256_setr_m128i(aaaa_bbbb_cccc_dddd, eeee_ffff_gggg_hhhh);
+#endif
+  __m256 result = _mm256_andnot_ps(_mm256_castsi256_ps(abcd_efgh), cst_one);
+  return result;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8i pcast<Packet8f, Packet8i>(const Packet8f& a) {
+  return _mm256_cvttps_epi32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8i pcast<Packet4d, Packet8i>(const Packet4d& a, const Packet4d& b) {
+  return _mm256_set_m128i(_mm256_cvttpd_epi32(b), _mm256_cvttpd_epi32(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4d, Packet4i>(const Packet4d& a) {
+  return _mm256_cvttpd_epi32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet8i, Packet8f>(const Packet8i& a) {
+  return _mm256_cvtepi32_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet4d, Packet8f>(const Packet4d& a, const Packet4d& b) {
+  return _mm256_set_m128(_mm256_cvtpd_ps(b), _mm256_cvtpd_ps(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4d, Packet4f>(const Packet4d& a) {
+  return _mm256_cvtpd_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d pcast<Packet8i, Packet4d>(const Packet8i& a) {
+  return _mm256_cvtepi32_pd(_mm256_castsi256_si128(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d pcast<Packet4i, Packet4d>(const Packet4i& a) {
+  return _mm256_cvtepi32_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d pcast<Packet8f, Packet4d>(const Packet8f& a) {
+  return _mm256_cvtps_pd(_mm256_castps256_ps128(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d pcast<Packet4f, Packet4d>(const Packet4f& a) {
+  return _mm256_cvtps_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i, Packet8f>(const Packet8f& a) {
+  return _mm256_castps_si256(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f, Packet8i>(const Packet8i& a) {
+  return _mm256_castsi256_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8ui preinterpret<Packet8ui, Packet8i>(const Packet8i& a) {
+  return Packet8ui(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i, Packet8ui>(const Packet8ui& a) {
+  return Packet8i(a);
+}
+
+// truncation operations
+
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet8f>(const Packet8f& a) {
+  return _mm256_castps256_ps128(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4d>(const Packet4d& a) {
+  return _mm256_castpd256_pd128(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet8i>(const Packet8i& a) {
+  return _mm256_castsi256_si128(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet8ui>(const Packet8ui& a) {
+  return _mm256_castsi256_si128(a);
+}
+
+#ifdef EIGEN_VECTORIZE_AVX2
+template <>
+EIGEN_STRONG_INLINE Packet4l pcast<Packet4d, Packet4l>(const Packet4d& a) {
+#if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVS512VL)
+  return _mm256_cvttpd_epi64(a);
+#else
+
+  // if 'a' exceeds the numerical limits of int64_t, the behavior is undefined
+
+  // e <= 0 corresponds to |a| < 1, which should result in zero. incidentally, intel intrinsics with shift arguments
+  // greater than or equal to 64 produce zero. furthermore, negative shifts appear to be interpreted as large positive
+  // shifts (two's complement), which also result in zero. therefore, e does not need to be clamped to [0, 64)
+
+  constexpr int kTotalBits = sizeof(double) * CHAR_BIT, kMantissaBits = std::numeric_limits<double>::digits - 1,
+                kExponentBits = kTotalBits - kMantissaBits - 1, kBias = (1 << (kExponentBits - 1)) - 1;
+
+  const __m256i cst_one = _mm256_set1_epi64x(1);
+  const __m256i cst_total_bits = _mm256_set1_epi64x(kTotalBits);
+  const __m256i cst_bias = _mm256_set1_epi64x(kBias);
+
+  __m256i a_bits = _mm256_castpd_si256(a);
+  // shift left by 1 to clear the sign bit, and shift right by kMantissaBits + 1 to recover biased exponent
+  __m256i biased_e = _mm256_srli_epi64(_mm256_slli_epi64(a_bits, 1), kMantissaBits + 1);
+  __m256i e = _mm256_sub_epi64(biased_e, cst_bias);
+
+  // shift to the left by kExponentBits + 1 to clear the sign and exponent bits
+  __m256i shifted_mantissa = _mm256_slli_epi64(a_bits, kExponentBits + 1);
+  // shift to the right by kTotalBits - e to convert the significand to an integer
+  __m256i result_significand = _mm256_srlv_epi64(shifted_mantissa, _mm256_sub_epi64(cst_total_bits, e));
+
+  // add the implied bit
+  __m256i result_exponent = _mm256_sllv_epi64(cst_one, e);
+  // e <= 0 is interpreted as a large positive shift (2's complement), which also conveniently results in zero
+  __m256i result = _mm256_add_epi64(result_significand, result_exponent);
+  // handle negative arguments
+  __m256i sign_mask = _mm256_cmpgt_epi64(_mm256_setzero_si256(), a_bits);
+  result = _mm256_sub_epi64(_mm256_xor_si256(result, sign_mask), sign_mask);
+  return result;
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d pcast<Packet4l, Packet4d>(const Packet4l& a) {
+#if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVS512VL)
+  return _mm256_cvtepi64_pd(a);
+#else
+  int64_t aux[4];
+  pstoreu(aux, a);
+  return _mm256_set_pd(static_cast<double>(aux[3]), static_cast<double>(aux[2]), static_cast<double>(aux[1]),
+                       static_cast<double>(aux[0]));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d pcast<Packet2l, Packet4d>(const Packet2l& a, const Packet2l& b) {
+  return _mm256_set_m128d((pcast<Packet2l, Packet2d>(b)), (pcast<Packet2l, Packet2d>(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4ul preinterpret<Packet4ul, Packet4l>(const Packet4l& a) {
+  return Packet4ul(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4l preinterpret<Packet4l, Packet4ul>(const Packet4ul& a) {
+  return Packet4l(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4l preinterpret<Packet4l, Packet4d>(const Packet4d& a) {
+  return _mm256_castpd_si256(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d preinterpret<Packet4d, Packet4l>(const Packet4l& a) {
+  return _mm256_castsi256_pd(a);
+}
+
+// truncation operations
+template <>
+EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet4l>(const Packet4l& a) {
+  return _mm256_castsi256_si128(a);
+}
+#endif
+
+#ifndef EIGEN_VECTORIZE_AVX512FP16
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
+  return half2float(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
+  return float2half(a);
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet8bf, Packet8f>(const Packet8bf& a) {
+  return Bf16ToF32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcast<Packet8f, Packet8bf>(const Packet8f& a) {
+  return F32ToBf16(a);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_TYPE_CASTING_AVX_H
diff --git a/inst/include/Eigen/src/Core/arch/AVX512/Complex.h b/inst/include/Eigen/src/Core/arch/AVX512/Complex.h
new file mode 100644
index 00000000..b70c7fef
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AVX512/Complex.h
@@ -0,0 +1,472 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_AVX512_H
+#define EIGEN_COMPLEX_AVX512_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet8cf {
+  EIGEN_STRONG_INLINE Packet8cf() {}
+  EIGEN_STRONG_INLINE explicit Packet8cf(const __m512& a) : v(a) {}
+  __m512 v;
+};
+
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
+  typedef Packet8cf type;
+  typedef Packet4cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasSqrt = 1,
+    HasLog = 1,
+    HasExp = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0
+  };
+};
+
+template <>
+struct unpacket_traits<Packet8cf> {
+  typedef std::complex<float> type;
+  typedef Packet4cf half;
+  typedef Packet16f as_real;
+  enum {
+    size = 8,
+    alignment = unpacket_traits<Packet16f>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf ptrue<Packet8cf>(const Packet8cf& a) {
+  return Packet8cf(ptrue(Packet16f(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf padd<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+  return Packet8cf(_mm512_add_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf psub<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+  return Packet8cf(_mm512_sub_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf pnegate(const Packet8cf& a) {
+  return Packet8cf(pnegate(a.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf pconj(const Packet8cf& a) {
+  const __m512 mask = _mm512_castsi512_ps(_mm512_setr_epi32(
+      0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000,
+      0x80000000, 0x00000000, 0x80000000, 0x00000000, 0x80000000, 0x00000000, 0x80000000));
+  return Packet8cf(pxor(a.v, mask));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf pmul<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+  __m512 tmp2 = _mm512_mul_ps(_mm512_movehdup_ps(a.v), _mm512_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1)));
+  return Packet8cf(_mm512_fmaddsub_ps(_mm512_moveldup_ps(a.v), b.v, tmp2));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf pand<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+  return Packet8cf(pand(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf por<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+  return Packet8cf(por(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf pxor<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+  return Packet8cf(pxor(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf pandnot<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+  return Packet8cf(pandnot(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf pcmp_eq(const Packet8cf& a, const Packet8cf& b) {
+  __m512 eq = pcmp_eq<Packet16f>(a.v, b.v);
+  return Packet8cf(pand(eq, _mm512_permute_ps(eq, 0xB1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf pload<Packet8cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet8cf(pload<Packet16f>(&numext::real_ref(*from)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf ploadu<Packet8cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet8cf(ploadu<Packet16f>(&numext::real_ref(*from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf pset1<Packet8cf>(const std::complex<float>& from) {
+  const float re = std::real(from);
+  const float im = std::imag(from);
+  return Packet8cf(_mm512_set_ps(im, re, im, re, im, re, im, re, im, re, im, re, im, re, im, re));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf ploaddup<Packet8cf>(const std::complex<float>* from) {
+  return Packet8cf(_mm512_castpd_ps(ploaddup<Packet8d>((const double*)(const void*)from)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8cf ploadquad<Packet8cf>(const std::complex<float>* from) {
+  return Packet8cf(_mm512_castpd_ps(ploadquad<Packet8d>((const double*)(const void*)from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet8cf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet8cf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet8cf pgather<std::complex<float>, Packet8cf>(const std::complex<float>* from,
+                                                                           Index stride) {
+  return Packet8cf(_mm512_castpd_ps(pgather<double, Packet8d>((const double*)(const void*)from, stride)));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet8cf>(std::complex<float>* to, const Packet8cf& from,
+                                                                       Index stride) {
+  pscatter((double*)(void*)to, _mm512_castps_pd(from.v), stride);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet8cf>(const Packet8cf& a) {
+  return pfirst(Packet2cf(_mm512_castps512_ps128(a.v)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf preverse(const Packet8cf& a) {
+  return Packet8cf(_mm512_castsi512_ps(_mm512_permutexvar_epi64(
+      _mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), _mm512_castps_si512(a.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet8cf>(const Packet8cf& a) {
+  return predux(padd(Packet4cf(extract256<0>(a.v)), Packet4cf(extract256<1>(a.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet8cf>(const Packet8cf& a) {
+  return predux_mul(pmul(Packet4cf(extract256<0>(a.v)), Packet4cf(extract256<1>(a.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cf predux_half_dowto4<Packet8cf>(const Packet8cf& a) {
+  __m256 lane0 = extract256<0>(a.v);
+  __m256 lane1 = extract256<1>(a.v);
+  __m256 res = _mm256_add_ps(lane0, lane1);
+  return Packet4cf(res);
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet8cf, Packet16f)
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf pdiv<Packet8cf>(const Packet8cf& a, const Packet8cf& b) {
+  return pdiv_complex(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf pcplxflip<Packet8cf>(const Packet8cf& x) {
+  return Packet8cf(_mm512_shuffle_ps(x.v, x.v, _MM_SHUFFLE(2, 3, 0, 1)));
+}
+
+//---------- double ----------
+struct Packet4cd {
+  EIGEN_STRONG_INLINE Packet4cd() {}
+  EIGEN_STRONG_INLINE explicit Packet4cd(const __m512d& a) : v(a) {}
+  __m512d v;
+};
+
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
+  typedef Packet4cd type;
+  typedef Packet2cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 4,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasSqrt = 1,
+    HasLog = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0
+  };
+};
+
+template <>
+struct unpacket_traits<Packet4cd> {
+  typedef std::complex<double> type;
+  typedef Packet2cd half;
+  typedef Packet8d as_real;
+  enum {
+    size = 4,
+    alignment = unpacket_traits<Packet8d>::alignment,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd padd<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+  return Packet4cd(_mm512_add_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd psub<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+  return Packet4cd(_mm512_sub_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd pnegate(const Packet4cd& a) {
+  return Packet4cd(pnegate(a.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd pconj(const Packet4cd& a) {
+  const __m512d mask = _mm512_castsi512_pd(_mm512_set_epi32(0x80000000, 0x0, 0x0, 0x0, 0x80000000, 0x0, 0x0, 0x0,
+                                                            0x80000000, 0x0, 0x0, 0x0, 0x80000000, 0x0, 0x0, 0x0));
+  return Packet4cd(pxor(a.v, mask));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd pmul<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+  __m512d tmp1 = _mm512_shuffle_pd(a.v, a.v, 0x0);
+  __m512d tmp2 = _mm512_shuffle_pd(a.v, a.v, 0xFF);
+  __m512d tmp3 = _mm512_shuffle_pd(b.v, b.v, 0x55);
+  __m512d odd = _mm512_mul_pd(tmp2, tmp3);
+  return Packet4cd(_mm512_fmaddsub_pd(tmp1, b.v, odd));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd ptrue<Packet4cd>(const Packet4cd& a) {
+  return Packet4cd(ptrue(Packet8d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd pand<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+  return Packet4cd(pand(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd por<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+  return Packet4cd(por(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd pxor<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+  return Packet4cd(pxor(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd pandnot<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+  return Packet4cd(pandnot(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd pcmp_eq(const Packet4cd& a, const Packet4cd& b) {
+  __m512d eq = pcmp_eq<Packet8d>(a.v, b.v);
+  return Packet4cd(pand(eq, _mm512_permute_pd(eq, 0x55)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd pload<Packet4cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet4cd(pload<Packet8d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4cd ploadu<Packet4cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet4cd(ploadu<Packet8d>((const double*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd pset1<Packet4cd>(const std::complex<double>& from) {
+  return Packet4cd(_mm512_castps_pd(_mm512_broadcast_f32x4(_mm_castpd_ps(pset1<Packet1cd>(from).v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd ploaddup<Packet4cd>(const std::complex<double>* from) {
+  return Packet4cd(
+      _mm512_insertf64x4(_mm512_castpd256_pd512(ploaddup<Packet2cd>(from).v), ploaddup<Packet2cd>(from + 1).v, 1));
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet4cd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet4cd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4cd pgather<std::complex<double>, Packet4cd>(const std::complex<double>* from,
+                                                                            Index stride) {
+  return Packet4cd(_mm512_insertf64x4(
+      _mm512_castpd256_pd512(_mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from + 0 * stride).v),
+                                                  ploadu<Packet1cd>(from + 1 * stride).v, 1)),
+      _mm256_insertf128_pd(_mm256_castpd128_pd256(ploadu<Packet1cd>(from + 2 * stride).v),
+                           ploadu<Packet1cd>(from + 3 * stride).v, 1),
+      1));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet4cd>(std::complex<double>* to, const Packet4cd& from,
+                                                                        Index stride) {
+  __m512i fromi = _mm512_castpd_si512(from.v);
+  double* tod = (double*)(void*)to;
+  _mm_storeu_pd(tod + 0 * stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi, 0)));
+  _mm_storeu_pd(tod + 2 * stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi, 1)));
+  _mm_storeu_pd(tod + 4 * stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi, 2)));
+  _mm_storeu_pd(tod + 6 * stride, _mm_castsi128_pd(_mm512_extracti32x4_epi32(fromi, 3)));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet4cd>(const Packet4cd& a) {
+  __m128d low = extract128<0>(a.v);
+  EIGEN_ALIGN16 double res[2];
+  _mm_store_pd(res, low);
+  return std::complex<double>(res[0], res[1]);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd preverse(const Packet4cd& a) {
+  return Packet4cd(_mm512_shuffle_f64x2(a.v, a.v, (shuffle_mask<3, 2, 1, 0>::mask)));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet4cd>(const Packet4cd& a) {
+  return predux(padd(Packet2cd(_mm512_extractf64x4_pd(a.v, 0)), Packet2cd(_mm512_extractf64x4_pd(a.v, 1))));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet4cd>(const Packet4cd& a) {
+  return predux_mul(pmul(Packet2cd(_mm512_extractf64x4_pd(a.v, 0)), Packet2cd(_mm512_extractf64x4_pd(a.v, 1))));
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cd, Packet8d)
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd pdiv<Packet4cd>(const Packet4cd& a, const Packet4cd& b) {
+  return pdiv_complex(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd pcplxflip<Packet4cd>(const Packet4cd& x) {
+  return Packet4cd(_mm512_permute_pd(x.v, 0x55));
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8cf, 4>& kernel) {
+  PacketBlock<Packet8d, 4> pb;
+
+  pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
+  pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
+  pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
+  pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
+  ptranspose(pb);
+  kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
+  kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
+  kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
+  kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8cf, 8>& kernel) {
+  PacketBlock<Packet8d, 8> pb;
+
+  pb.packet[0] = _mm512_castps_pd(kernel.packet[0].v);
+  pb.packet[1] = _mm512_castps_pd(kernel.packet[1].v);
+  pb.packet[2] = _mm512_castps_pd(kernel.packet[2].v);
+  pb.packet[3] = _mm512_castps_pd(kernel.packet[3].v);
+  pb.packet[4] = _mm512_castps_pd(kernel.packet[4].v);
+  pb.packet[5] = _mm512_castps_pd(kernel.packet[5].v);
+  pb.packet[6] = _mm512_castps_pd(kernel.packet[6].v);
+  pb.packet[7] = _mm512_castps_pd(kernel.packet[7].v);
+  ptranspose(pb);
+  kernel.packet[0].v = _mm512_castpd_ps(pb.packet[0]);
+  kernel.packet[1].v = _mm512_castpd_ps(pb.packet[1]);
+  kernel.packet[2].v = _mm512_castpd_ps(pb.packet[2]);
+  kernel.packet[3].v = _mm512_castpd_ps(pb.packet[3]);
+  kernel.packet[4].v = _mm512_castpd_ps(pb.packet[4]);
+  kernel.packet[5].v = _mm512_castpd_ps(pb.packet[5]);
+  kernel.packet[6].v = _mm512_castpd_ps(pb.packet[6]);
+  kernel.packet[7].v = _mm512_castpd_ps(pb.packet[7]);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4cd, 4>& kernel) {
+  __m512d T0 =
+      _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<0, 1, 0, 1>::mask));  // [a0 a1 b0 b1]
+  __m512d T1 =
+      _mm512_shuffle_f64x2(kernel.packet[0].v, kernel.packet[1].v, (shuffle_mask<2, 3, 2, 3>::mask));  // [a2 a3 b2 b3]
+  __m512d T2 =
+      _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<0, 1, 0, 1>::mask));  // [c0 c1 d0 d1]
+  __m512d T3 =
+      _mm512_shuffle_f64x2(kernel.packet[2].v, kernel.packet[3].v, (shuffle_mask<2, 3, 2, 3>::mask));  // [c2 c3 d2 d3]
+
+  kernel.packet[3] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<1, 3, 1, 3>::mask)));  // [a3 b3 c3 d3]
+  kernel.packet[2] = Packet4cd(_mm512_shuffle_f64x2(T1, T3, (shuffle_mask<0, 2, 0, 2>::mask)));  // [a2 b2 c2 d2]
+  kernel.packet[1] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<1, 3, 1, 3>::mask)));  // [a1 b1 c1 d1]
+  kernel.packet[0] = Packet4cd(_mm512_shuffle_f64x2(T0, T2, (shuffle_mask<0, 2, 0, 2>::mask)));  // [a0 b0 c0 d0]
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd psqrt<Packet4cd>(const Packet4cd& a) {
+  return psqrt_complex<Packet4cd>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf psqrt<Packet8cf>(const Packet8cf& a) {
+  return psqrt_complex<Packet8cf>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4cd plog<Packet4cd>(const Packet4cd& a) {
+  return plog_complex<Packet4cd>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf plog<Packet8cf>(const Packet8cf& a) {
+  return plog_complex<Packet8cf>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8cf pexp<Packet8cf>(const Packet8cf& a) {
+  return pexp_complex<Packet8cf>(a);
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_COMPLEX_AVX512_H
diff --git a/inst/include/Eigen/src/Core/arch/AVX512/GemmKernel.h b/inst/include/Eigen/src/Core/arch/AVX512/GemmKernel.h
new file mode 100644
index 00000000..e06b83c9
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AVX512/GemmKernel.h
@@ -0,0 +1,1245 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2022 Intel Corporation
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CORE_ARCH_AVX512_GEMM_KERNEL_H
+#define EIGEN_CORE_ARCH_AVX512_GEMM_KERNEL_H
+
+#if EIGEN_COMP_MSVC
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+#include <immintrin.h>
+#include <type_traits>
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+#if !defined(EIGEN_USE_AVX512_GEMM_KERNELS)
+#define EIGEN_USE_AVX512_GEMM_KERNELS 1
+#endif
+
+#define SECOND_FETCH (32)
+#if (EIGEN_COMP_GNUC_STRICT != 0) && !defined(EIGEN_ARCH_AVX512_GEMM_KERNEL_USE_LESS_A_REGS)
+// Use less registers to load A elements to workaround compiler spills. Loose a
+// bit of performance (less than ~2%).
+#define EIGEN_ARCH_AVX512_GEMM_KERNEL_USE_LESS_A_REGS
+#endif
+
+namespace Eigen {
+namespace internal {
+
+template <typename Scalar, bool is_unit_inc>
+class gemm_class {
+  using vec = typename packet_traits<Scalar>::type;
+  using vec_ymm = typename unpacket_traits<vec>::half;
+  using vec_xmm = typename unpacket_traits<vec_ymm>::half;
+  using umask_t = typename unpacket_traits<vec>::mask_t;
+
+  static constexpr bool is_f32 = sizeof(Scalar) == sizeof(float);
+  static constexpr bool is_f64 = sizeof(Scalar) == sizeof(double);
+
+#ifndef EIGEN_ARCH_AVX512_GEMM_KERNEL_USE_LESS_A_REGS
+  static constexpr bool use_less_a_regs = !is_unit_inc;
+#else
+  static constexpr bool use_less_a_regs = true;
+#endif
+#ifndef EIGEN_ARCH_AVX512_GEMM_KERNEL_USE_LESS_B_REGS
+  static constexpr bool use_less_b_regs = !is_unit_inc;
+#else
+  static constexpr bool use_less_b_regs = true;
+#endif
+
+  static constexpr int a_regs[] = {0, 1, 2, use_less_a_regs ? 0 : 3, use_less_a_regs ? 1 : 4, use_less_a_regs ? 2 : 5};
+  static constexpr int b_regs[] = {6, use_less_b_regs ? 6 : 7};
+  static constexpr int c_regs[] = {
+      8, 16, 24, 9, 17, 25, 10, 18, 26, 11, 19, 27, 12, 20, 28, 13, 21, 29, 14, 22, 30, 15, 23, 31,
+  };
+
+  static constexpr int alpha_load_reg = 0;
+  static constexpr int c_load_regs[] = {1, 2, 6};
+
+  static constexpr int a_shift = 128;
+  static constexpr int b_shift = 128;
+
+  static constexpr int nelems_in_cache_line = is_f32 ? 16 : 8;
+  static constexpr int a_prefetch_size = nelems_in_cache_line * 2;
+  static constexpr int b_prefetch_size = nelems_in_cache_line * 8;
+
+  vec zmm[32];
+  umask_t mask;
+
+  // gemm arguments.
+  Index m;
+  const Index n, k, ldc;
+  const Index inc;
+  const Scalar *alpha;
+
+  const Scalar *a, *b;
+  Scalar *c;
+
+  const bool is_alpha1;
+  const bool is_beta0;
+
+  const Index a_stride, b_stride;
+  const Index a_off, b_off;
+
+  EIGEN_ALWAYS_INLINE void prefetch_a(const Scalar *a_addr) {
+    _mm_prefetch((char *)(a_prefetch_size + a_addr - a_shift), _MM_HINT_T0);
+  }
+
+  EIGEN_ALWAYS_INLINE void prefetch_b(const Scalar *b_addr) {
+    _mm_prefetch((char *)(b_prefetch_size + b_addr - b_shift), _MM_HINT_T0);
+  }
+
+  EIGEN_ALWAYS_INLINE void prefetch_x(const Scalar *x_addr) { _mm_prefetch((char *)(x_addr - a_shift), _MM_HINT_T2); }
+
+  EIGEN_ALWAYS_INLINE void prefetch_c(const Scalar *c_addr) {
+#if defined(__PRFCHW__) && __PRFCHW__ == 1
+    _m_prefetchw((void *)c_addr);
+#else
+    _mm_prefetch((char *)c_addr, _MM_HINT_T0);
+#endif
+  }
+
+  template <int nelems>
+  EIGEN_ALWAYS_INLINE void a_load(vec &a_reg, const Scalar *a_addr) {
+    switch (nelems * sizeof(*a_addr) * 8) {
+      default:
+      case 512 * 3:
+        a_reg = ploadu<vec>(a_addr);
+        break;
+      case 512 * 2:
+        a_reg = ploadu<vec>(a_addr);
+        break;
+      case 512 * 1:
+        a_reg = ploadu<vec>(a_addr);
+        break;
+      case 256 * 1:
+        a_reg = preinterpret<vec>(_mm512_broadcast_f64x4(ploadu<Packet4d>(reinterpret_cast<const double *>(a_addr))));
+        break;
+      case 128 * 1:
+        a_reg = preinterpret<vec>(_mm512_broadcast_f32x4(ploadu<Packet4f>(reinterpret_cast<const float *>(a_addr))));
+        break;
+      case 64 * 1:
+        a_reg = preinterpret<vec>(pload1<Packet8d>(reinterpret_cast<const double *>(a_addr)));
+        break;
+      case 32 * 1:
+        a_reg = pload1<vec>(a_addr);
+        break;
+    }
+  }
+
+  EIGEN_ALWAYS_INLINE void b_load(vec &b_reg, const Scalar *b_addr) { b_reg = pload1<vec>(b_addr); }
+
+  template <int nelems>
+  EIGEN_ALWAYS_INLINE void c_store(Scalar *mem, vec &src) {
+    if (is_unit_inc) {
+      switch (nelems * sizeof(*mem) * 8) {
+        default:
+        case 512 * 3:
+          pstoreu(mem, src);
+          break;
+        case 512 * 2:
+          pstoreu(mem, src);
+          break;
+        case 512 * 1:
+          pstoreu(mem, src);
+          break;
+        case 256 * 1:
+          pstoreu(mem, preinterpret<vec_ymm>(src));
+          break;
+        case 128 * 1:
+          pstoreu(mem, preinterpret<vec_xmm>(src));
+          break;
+        case 64 * 1:
+          pstorel(mem, preinterpret<vec_xmm>(src));
+          break;
+        case 32 * 1:
+          pstores(mem, preinterpret<vec_xmm>(src));
+          break;
+      }
+    } else {
+      switch (nelems * sizeof(*mem) * 8) {
+        default:
+        case 512 * 3:
+          pscatter(mem, src, inc);
+          break;
+        case 512 * 2:
+          pscatter(mem, src, inc);
+          break;
+        case 512 * 1:
+          pscatter(mem, src, inc);
+          break;
+        case 256 * 1:
+          pscatter(mem, src, inc, mask);
+          break;
+        case 128 * 1:
+          pscatter(mem, src, inc, mask);
+          break;
+        case 64 * 1:
+          pscatter(mem, src, inc, mask);
+          break;
+        case 32 * 1:
+          pscatter(mem, src, inc, mask);
+          break;
+      }
+    }
+  }
+
+  template <int nelems>
+  EIGEN_ALWAYS_INLINE void vaddm(vec &dst, const Scalar *mem, vec &src, vec &reg) {
+    if (is_unit_inc) {
+      switch (nelems * sizeof(*mem) * 8) {
+        default:
+        case 512 * 3:
+          dst = padd(src, ploadu<vec>(mem));
+          break;
+        case 512 * 2:
+          dst = padd(src, ploadu<vec>(mem));
+          break;
+        case 512 * 1:
+          dst = padd(src, ploadu<vec>(mem));
+          break;
+        case 256 * 1:
+          dst = preinterpret<vec>(padd(preinterpret<vec_ymm>(src), ploadu<vec_ymm>(mem)));
+          break;
+        case 128 * 1:
+          dst = preinterpret<vec>(padd(preinterpret<vec_xmm>(src), ploadu<vec_xmm>(mem)));
+          break;
+        case 64 * 1:
+          dst = preinterpret<vec>(padd(preinterpret<vec_xmm>(src), ploadl<vec_xmm>(mem)));
+          break;
+        case 32 * 1:
+          dst = preinterpret<vec>(padds(preinterpret<vec_xmm>(src), ploads<vec_xmm>(mem)));
+          break;
+      }
+    } else {
+      // Zero out scratch register
+      reg = pzero(reg);
+
+      switch (nelems * sizeof(*mem) * 8) {
+        default:
+        case 512 * 3:
+          reg = pgather<Scalar, vec>(mem, inc);
+          dst = padd(src, reg);
+          break;
+        case 512 * 2:
+          reg = pgather<Scalar, vec>(mem, inc);
+          dst = padd(src, reg);
+          break;
+        case 512 * 1:
+          reg = pgather<Scalar, vec>(mem, inc);
+          dst = padd(src, reg);
+          break;
+        case 256 * 1:
+          reg = preinterpret<vec>(pgather<Scalar, vec_ymm>(mem, inc));
+          dst = preinterpret<vec>(padd(preinterpret<vec_ymm>(src), preinterpret<vec_ymm>(reg)));
+          break;
+        case 128 * 1:
+          reg = preinterpret<vec>(pgather<Scalar, vec_xmm>(mem, inc));
+          dst = preinterpret<vec>(padd(preinterpret<vec_xmm>(src), preinterpret<vec_xmm>(reg)));
+          break;
+        case 64 * 1:
+          if (is_f32) {
+            reg = pgather(reg, mem, inc, mask);
+            dst = preinterpret<vec>(padd(preinterpret<vec_xmm>(src), preinterpret<vec_xmm>(reg)));
+          } else {
+            dst = preinterpret<vec>(padd(preinterpret<vec_xmm>(src), ploadl<vec_xmm>(mem)));
+          }
+          break;
+        case 32 * 1:
+          dst = preinterpret<vec>(padds(preinterpret<vec_xmm>(src), ploads<vec_xmm>(mem)));
+          break;
+      }
+    }
+  }
+
+  EIGEN_STRONG_INLINE void vfmadd(vec &dst, const vec &src1, const vec &src2) {
+    dst = pmadd(src1, src2, dst);
+
+#if (EIGEN_COMP_GNUC != 0) || (EIGEN_COMP_CLANG != 0)
+    // Workaround register spills for gcc and clang
+    __asm__("#" : [dst] "+v"(dst) : [src1] "%v"(src1), [src2] "v"(src2));
+#endif
+  }
+
+  template <int nelems>
+  EIGEN_ALWAYS_INLINE void vfmaddm(vec &dst, const Scalar *mem, vec &src, vec &scale, vec &reg) {
+    if (is_unit_inc) {
+      switch (nelems * sizeof(*mem) * 8) {
+        default:
+        case 512 * 3:
+          dst = pmadd(scale, src, ploadu<vec>(mem));
+          break;
+        case 512 * 2:
+          dst = pmadd(scale, src, ploadu<vec>(mem));
+          break;
+        case 512 * 1:
+          dst = pmadd(scale, src, ploadu<vec>(mem));
+          break;
+        case 256 * 1:
+          dst =
+              preinterpret<vec>(pmadd(preinterpret<vec_ymm>(scale), preinterpret<vec_ymm>(src), ploadu<vec_ymm>(mem)));
+          break;
+        case 128 * 1:
+          dst =
+              preinterpret<vec>(pmadd(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), ploadu<vec_xmm>(mem)));
+          break;
+        case 64 * 1:
+          dst =
+              preinterpret<vec>(pmadd(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), ploadl<vec_xmm>(mem)));
+          break;
+        case 32 * 1:
+          dst =
+              preinterpret<vec>(pmadds(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), ploads<vec_xmm>(mem)));
+          break;
+      }
+    } else {
+      // Zero out scratch register
+      reg = pzero(reg);
+
+      switch (nelems * sizeof(*mem) * 8) {
+        default:
+        case 512 * 3:
+          reg = pgather<Scalar, vec>(mem, inc);
+          dst = pmadd(scale, src, reg);
+          break;
+        case 512 * 2:
+          reg = pgather<Scalar, vec>(mem, inc);
+          dst = pmadd(scale, src, reg);
+          break;
+        case 512 * 1:
+          reg = pgather<Scalar, vec>(mem, inc);
+          dst = pmadd(scale, src, reg);
+          break;
+        case 256 * 1:
+          reg = preinterpret<vec>(pgather<Scalar, vec_ymm>(mem, inc));
+          dst = preinterpret<vec>(
+              pmadd(preinterpret<vec_ymm>(scale), preinterpret<vec_ymm>(src), preinterpret<vec_ymm>(reg)));
+          break;
+        case 128 * 1:
+          reg = preinterpret<vec>(pgather<Scalar, vec_xmm>(mem, inc));
+          dst = preinterpret<vec>(
+              pmadd(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), preinterpret<vec_xmm>(reg)));
+          break;
+        case 64 * 1:
+          if (is_f32) {
+            reg = pgather(reg, mem, inc, mask);
+            dst = preinterpret<vec>(
+                pmadd(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), preinterpret<vec_xmm>(reg)));
+          } else {
+            dst = preinterpret<vec>(
+                pmadd(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), ploadl<vec_xmm>(mem)));
+          }
+          break;
+        case 32 * 1:
+          dst =
+              preinterpret<vec>(pmadds(preinterpret<vec_xmm>(scale), preinterpret<vec_xmm>(src), ploads<vec_xmm>(mem)));
+          break;
+      }
+    }
+  }
+
+  template <int j, int endX, int i, int endY, int nelems>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(j > endX) || (i > endY)> a_loads(const Scalar *ao) {
+    EIGEN_UNUSED_VARIABLE(ao);
+  }
+
+  template <int j, int endX, int i, int endY, int nelems>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(j <= endX) && (i <= endY)> a_loads(const Scalar *ao) {
+    if (j < endX) {
+      if (i < endY) {
+        auto &a_reg = zmm[a_regs[i + (j % 2) * 3]];
+        const Scalar *a_addr = ao + nelems * j + nelems_in_cache_line * i - a_shift;
+        a_load<nelems>(a_reg, a_addr);
+
+        a_loads<j, endX, i + 1, endY, nelems>(ao);
+      } else {
+        a_loads<j + 1, endX, 0, endY, nelems>(ao);
+      }
+    }
+  }
+
+  template <int un, int max_b_unroll, int i, int um_vecs, int a_unroll, int b_unroll>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(un > max_b_unroll) || (i > um_vecs)> prefetch_cs(const Scalar *co1,
+                                                                                         const Scalar *co2) {
+    EIGEN_UNUSED_VARIABLE(co1);
+    EIGEN_UNUSED_VARIABLE(co2);
+  }
+
+  /* C prefetch loop structure.
+   * for (int un = 0; un < 8; un++) {
+   *     if (b_unroll >= un + 1) {
+   *         if (un == 4) co2 = co1 + 4 * ldc;
+   *
+   *         for (int i = 0; i < um_vecs; i++) {
+   *             Scalar *co = (un + 1 <= 4) ? co1 : co2;
+   *             auto co_off = (un % 4) * ldc + a_unroll - 1 + i * nelems_in_cache_line * sizeof *co;
+   *             prefetch_c(co + co_off);
+   *         }
+   *     }
+   * }
+   */
+
+  template <int un, int max_b_unroll, int i, int um_vecs, int a_unroll, int b_unroll>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(un <= max_b_unroll) && (i <= um_vecs)> prefetch_cs(Scalar *&co1, Scalar *&co2) {
+    if (un < max_b_unroll) {
+      if (b_unroll >= un + 1) {
+        if (un == 4 && i == 0) co2 = co1 + 4 * ldc;
+
+        if (i < um_vecs) {
+          Scalar *co = (un + 1 <= 4) ? co1 : co2;
+          auto co_off = (un % 4) * ldc + a_unroll - 1 + i * nelems_in_cache_line * sizeof *co;
+          prefetch_c(co + co_off);
+
+          prefetch_cs<un, max_b_unroll, i + 1, um_vecs, a_unroll, b_unroll>(co1, co2);
+        } else {
+          prefetch_cs<un + 1, max_b_unroll, 0, um_vecs, a_unroll, b_unroll>(co1, co2);
+        }
+
+      } else {
+        prefetch_cs<un + 1, max_b_unroll, 0, um_vecs, a_unroll, b_unroll>(co1, co2);
+      }
+    }
+  }
+
+  // load_c
+  template <int i, int um_vecs, int idx, int nelems>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(i > um_vecs)> scale_load_c(const Scalar *cox, vec &alpha_reg) {
+    EIGEN_UNUSED_VARIABLE(cox);
+    EIGEN_UNUSED_VARIABLE(alpha_reg);
+  }
+
+  template <int i, int um_vecs, int idx, int nelems>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(i <= um_vecs)> scale_load_c(const Scalar *cox, vec &alpha_reg) {
+    if (i < um_vecs) {
+      auto &c_reg = zmm[c_regs[i + idx * 3]];
+      auto &c_load_reg = zmm[c_load_regs[i % 3]];
+      auto c_mem = cox;
+      if (is_unit_inc)
+        c_mem += i * nelems_in_cache_line;
+      else
+        c_mem += i * nelems_in_cache_line * inc;
+
+      if (!is_beta0 && is_alpha1)
+        vaddm<nelems>(c_reg, c_mem, c_reg, c_load_reg);
+      else if (!is_beta0 && !is_alpha1)
+        vfmaddm<nelems>(c_reg, c_mem, c_reg, alpha_reg, c_load_reg);
+      else if (is_beta0 && !is_alpha1)
+        c_reg = pmul(alpha_reg, c_reg);
+
+      scale_load_c<i + 1, um_vecs, idx, nelems>(cox, alpha_reg);
+    }
+  }
+
+  // store_c
+  template <int i, int um_vecs, int idx, int nelems>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(i > um_vecs)> write_c(Scalar *cox) {
+    EIGEN_UNUSED_VARIABLE(cox);
+  }
+
+  template <int i, int um_vecs, int idx, int nelems>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(i <= um_vecs)> write_c(Scalar *cox) {
+    if (i < um_vecs) {
+      auto &c_reg = zmm[c_regs[i + idx * 3]];
+      auto c_mem = cox;
+      if (is_unit_inc)
+        c_mem += i * nelems_in_cache_line;
+      else
+        c_mem += i * nelems_in_cache_line * inc;
+
+      c_store<nelems>(c_mem, c_reg);
+      c_reg = pzero(c_reg);
+
+      write_c<i + 1, um_vecs, idx, nelems>(cox);
+    }
+  }
+
+  /*  C update loop structure.
+   *  co2 = co1 + ldc;
+   *
+   *  auto &alpha_reg = zmm[alpha_load_reg];
+   *  if (!is_alpha1) alpha_reg = pload1<vec>(alpha);
+   *
+   *  int idx = 0;
+   *  for (pow = 1; pow <= 8; pow <<= 1) {
+   *
+   *      if (b_unroll >= pow) {
+   *          for (count = 1; count < (pow + 1) / 2 + 1;  count++) {
+   *              if (pow >= 4) co2 += ldc;
+   *
+   *              const Scalar *cox = (idx == 0) ? co1 : co2;
+   *
+   *              const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
+   *              scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg);
+   *              write_c<0, um_vecs, idx, a_unroll>(cox);
+   *
+   *              idx++;
+   *          }
+   *      }
+   *  }
+   *
+   *  if (b_unroll == 1)
+   *      co1 += ldc;
+   *  else
+   *      co1 = co2 + ldc;
+   */
+
+  template <int pow, int a_unroll, int idx>
+  EIGEN_ALWAYS_INLINE void c_update_1count(Scalar *&cox) {
+    if (pow >= 4) cox += ldc;
+
+    const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
+    auto &alpha_reg = zmm[alpha_load_reg];
+
+    scale_load_c<0, um_vecs, idx, a_unroll>(cox, alpha_reg);
+    write_c<0, um_vecs, idx, a_unroll>(cox);
+  }
+
+  template <int pow, int a_unroll>
+  EIGEN_ALWAYS_INLINE void c_update_1pow(Scalar *&co1, Scalar *&co2) {
+    constexpr int idx = pow / 2;
+    Scalar *&cox = idx == 0 ? co1 : co2;
+
+    constexpr int max_count = (pow + 1) / 2;
+    static_assert(max_count <= 4, "Unsupported max_count.");
+
+    if (1 <= max_count) c_update_1count<pow, a_unroll, idx + 0>(cox);
+    if (2 <= max_count) c_update_1count<pow, a_unroll, idx + 1>(cox);
+    if (3 <= max_count) c_update_1count<pow, a_unroll, idx + 2>(cox);
+    if (4 <= max_count) c_update_1count<pow, a_unroll, idx + 3>(cox);
+  }
+
+  template <int max_b_unroll, int a_unroll, int b_unroll>
+  EIGEN_ALWAYS_INLINE void c_update(Scalar *&co1, Scalar *&co2) {
+    auto &alpha_reg = zmm[alpha_load_reg];
+
+    co2 = co1 + ldc;
+    if (!is_alpha1) alpha_reg = pload1<vec>(alpha);
+    if (!is_unit_inc && a_unroll < nelems_in_cache_line) mask = static_cast<umask_t>((1ull << a_unroll) - 1);
+
+    static_assert(max_b_unroll <= 8, "Unsupported max_b_unroll");
+
+    if (1 <= max_b_unroll && 1 <= b_unroll) c_update_1pow<1, a_unroll>(co1, co2);
+    if (2 <= max_b_unroll && 2 <= b_unroll) c_update_1pow<2, a_unroll>(co1, co2);
+    if (4 <= max_b_unroll && 4 <= b_unroll) c_update_1pow<4, a_unroll>(co1, co2);
+    if (8 <= max_b_unroll && 8 <= b_unroll) c_update_1pow<8, a_unroll>(co1, co2);
+
+    if (b_unroll == 1)
+      co1 += ldc;
+    else
+      co1 = co2 + ldc;
+  }
+
+  // compute
+  template <int um, int um_vecs, int idx, int uk, bool fetch_x, bool ktail>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(um > um_vecs)> compute(const Scalar *ao, const Scalar *bo, int &fetchA_idx,
+                                                               int &fetchB_idx, vec &b_reg) {
+    EIGEN_UNUSED_VARIABLE(ao);
+    EIGEN_UNUSED_VARIABLE(bo);
+    EIGEN_UNUSED_VARIABLE(fetchA_idx);
+    EIGEN_UNUSED_VARIABLE(fetchB_idx);
+    EIGEN_UNUSED_VARIABLE(b_reg);
+  }
+
+  template <int um, int um_vecs, int idx, int uk, bool fetch_x, bool ktail>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(um <= um_vecs)> compute(const Scalar *ao, const Scalar *bo, int &fetchA_idx,
+                                                                int &fetchB_idx, vec &b_reg) {
+    if (um < um_vecs) {
+      auto &c_reg = zmm[c_regs[um + idx * 3]];
+      auto &a_reg = zmm[a_regs[um + (uk % 2) * 3]];
+
+      vfmadd(c_reg, a_reg, b_reg);
+
+      if (!fetch_x && um == 0 &&
+          (((idx == 0 || idx == 6) && (uk % 2 == 0 || is_f64 || ktail)) ||
+           (idx == 3 && (uk % 2 == 1 || is_f64 || ktail)))) {
+        prefetch_a(ao + nelems_in_cache_line * fetchA_idx);
+        fetchA_idx++;
+      }
+
+      if (um == 0 && idx == 1 && (uk % 2 == 0 || is_f64 || ktail)) {
+        prefetch_b(bo + nelems_in_cache_line * fetchB_idx);
+        fetchB_idx++;
+      }
+
+      compute<um + 1, um_vecs, idx, uk, fetch_x, ktail>(ao, bo, fetchA_idx, fetchB_idx, b_reg);
+    }
+  }
+
+  // load_a
+  template <int um, int um_vecs, int uk, int nelems, bool ktail>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(um > um_vecs)> load_a(const Scalar *ao) {
+    EIGEN_UNUSED_VARIABLE(ao);
+  }
+
+  template <int um, int um_vecs, int uk, int nelems, bool ktail>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(um <= um_vecs)> load_a(const Scalar *ao) {
+    if (um < um_vecs) {
+      auto &a_reg = zmm[a_regs[um + (uk % 2) * 3]];
+      const Scalar *a_addr = ao + nelems * (1 + !ktail * !use_less_a_regs + uk) + nelems_in_cache_line * um - a_shift;
+      a_load<nelems>(a_reg, a_addr);
+
+      load_a<um + 1, um_vecs, uk, nelems, ktail>(ao);
+    }
+  }
+  template <int uk, int pow, int count, int um_vecs, int b_unroll, bool ktail, bool fetch_x, bool c_fetch>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(count > (pow + 1) / 2)> innerkernel_1pow(const Scalar *&aa,
+                                                                                 const Scalar *const &ao,
+                                                                                 const Scalar *const &bo, Scalar *&co2,
+                                                                                 int &fetchA_idx, int &fetchB_idx) {
+    EIGEN_UNUSED_VARIABLE(aa);
+    EIGEN_UNUSED_VARIABLE(ao);
+    EIGEN_UNUSED_VARIABLE(bo);
+    EIGEN_UNUSED_VARIABLE(co2);
+    EIGEN_UNUSED_VARIABLE(fetchA_idx);
+    EIGEN_UNUSED_VARIABLE(fetchB_idx);
+  }
+
+  template <int uk, int pow, int count, int um_vecs, int b_unroll, bool ktail, bool fetch_x, bool c_fetch>
+  EIGEN_ALWAYS_INLINE std::enable_if_t<(count <= (pow + 1) / 2)> innerkernel_1pow(const Scalar *&aa,
+                                                                                  const Scalar *const &ao,
+                                                                                  const Scalar *const &bo, Scalar *&co2,
+                                                                                  int &fetchA_idx, int &fetchB_idx) {
+    const int idx = (pow / 2) + count;
+
+    if (count < (pow + 1) / 2) {
+      auto &b_reg = zmm[b_regs[idx % 2]];
+
+      if (fetch_x && uk == 3 && idx == 0) prefetch_x(aa);
+      if (fetch_x && uk == 3 && idx == 4) aa += 8;
+
+      if (b_unroll >= pow) {
+        compute<0, um_vecs, idx, uk, fetch_x, ktail>(ao, bo, fetchA_idx, fetchB_idx, b_reg);
+
+        const Scalar *b_addr = bo + b_unroll * uk + idx + 1 + (b_unroll > 1) * !use_less_b_regs - b_shift;
+        b_load(b_reg, b_addr);
+      }
+
+      // Go to the next count.
+      innerkernel_1pow<uk, pow, count + 1, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx,
+                                                                                       fetchB_idx);
+
+    } else {
+      // Maybe prefetch C data after count-loop.
+      if (pow == 2 && c_fetch) {
+        if (uk % 3 == 0 && uk > 0) {
+          co2 += ldc;
+        } else {
+          prefetch_c(co2 + (uk % 3) * nelems_in_cache_line);
+        }
+      }
+    }
+  }
+
+  template <int uk, int max_b_unroll, int a_unroll, int b_unroll, bool ktail, bool fetch_x, bool c_fetch,
+            bool no_a_preload = false>
+  EIGEN_ALWAYS_INLINE void innerkernel_1uk(const Scalar *&aa, const Scalar *const &ao, const Scalar *const &bo,
+                                           Scalar *&co2, int &fetchA_idx, int &fetchB_idx) {
+    const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
+
+    if (max_b_unroll >= 1)
+      innerkernel_1pow<uk, 1, 0, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, fetchB_idx);
+    if (max_b_unroll >= 2)
+      innerkernel_1pow<uk, 2, 0, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, fetchB_idx);
+    if (max_b_unroll >= 4)
+      innerkernel_1pow<uk, 4, 0, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, fetchB_idx);
+    if (max_b_unroll >= 8)
+      innerkernel_1pow<uk, 8, 0, um_vecs, b_unroll, ktail, fetch_x, c_fetch>(aa, ao, bo, co2, fetchA_idx, fetchB_idx);
+
+    // Load A after pow-loop. Skip this at the end to prevent running over the buffer
+    if (!no_a_preload) load_a<0, um_vecs, uk, a_unroll, ktail>(ao);
+  }
+
+  /*  Inner kernel loop structure.
+   *  for (int uk = 0; uk < kfactor; uk++) {
+   *      int idx = 0;
+   *
+   *      for (pow = 1; pow < max_b_unroll << 1; pow <<= 1) {
+   *          for (int count = 0; count < (pow + 1) / 2; count++) {
+   *              auto &b_reg = zmm[b_regs[idx % 2]];
+   *
+   *              if (fetch_x && uk == 3 && idx == 0) prefetch_x(aa);
+   *              if (fetch_x && uk == 3 && idx == 4) aa += 8;
+   *
+   *              if (b_unroll >= pow) {
+   *                  compute<0, um_vecs, idx, uk, fetchx, ktail>(ao, bo, fetchA_idx, fetchB_idx, b_reg);
+   *
+   *                  const Scalar *b_addr = bo + b_unroll * uk + idx + 1 + (b_unroll > 1) - b_shift ;
+   *                  b_load(b_reg, b_addr);
+   *              }
+   *              idx++;
+   *          }
+   *
+   *          Maybe prefetch C data.
+   *          if (pow == 2 && c_fetch) {
+   *              if (uk % 3 == 0 && uk > 0) {
+   *                  co2 += ldc;
+   *              } else {
+   *                  prefetch_c(co2 + (uk % 3) * nelems_in_cache_line);
+   *              }
+   *          }
+   *      }
+   *
+   *      Load A.
+   *      load_a<0, um_vecs, uk, ktail, a_unroll>(ao);
+   *  }
+   *
+   *  Advance A/B pointers after uk-loop.
+   *  ao += a_unroll * kfactor;
+   *  bo += b_unroll * kfactor;
+   */
+
+  template <int a_unroll, int b_unroll, int k_factor, int max_b_unroll, int max_k_factor, bool c_fetch,
+            bool no_a_preload = false>
+  EIGEN_ALWAYS_INLINE void innerkernel(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co2) {
+    int fetchA_idx = 0;
+    int fetchB_idx = 0;
+
+    const bool fetch_x = k_factor == max_k_factor;
+    const bool ktail = k_factor == 1;
+
+    static_assert(k_factor <= 4 && k_factor > 0, "innerkernel maximum k_factor supported is 4");
+    static_assert(no_a_preload == false || (no_a_preload == true && k_factor == 1),
+                  "skipping a preload only allowed when k unroll is 1");
+
+    if (k_factor > 0)
+      innerkernel_1uk<0, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(
+          aa, ao, bo, co2, fetchA_idx, fetchB_idx);
+    if (k_factor > 1)
+      innerkernel_1uk<1, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(
+          aa, ao, bo, co2, fetchA_idx, fetchB_idx);
+    if (k_factor > 2)
+      innerkernel_1uk<2, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(
+          aa, ao, bo, co2, fetchA_idx, fetchB_idx);
+    if (k_factor > 3)
+      innerkernel_1uk<3, max_b_unroll, a_unroll, b_unroll, ktail, fetch_x, c_fetch, no_a_preload>(
+          aa, ao, bo, co2, fetchA_idx, fetchB_idx);
+
+    // Advance A/B pointers after uk-loop.
+    ao += a_unroll * k_factor;
+    bo += b_unroll * k_factor;
+  }
+
+  template <int a_unroll, int b_unroll, int max_b_unroll>
+  EIGEN_ALWAYS_INLINE void kloop(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
+    const int um_vecs = numext::div_ceil(a_unroll, nelems_in_cache_line);
+    if (!use_less_a_regs && k > 1)
+      a_loads<0, 2, 0, um_vecs, a_unroll>(ao);
+    else
+      a_loads<0, 1, 0, um_vecs, a_unroll>(ao);
+
+    b_load(zmm[b_regs[0]], bo - b_shift + 0);
+    if (!use_less_b_regs) b_load(zmm[b_regs[1]], bo - b_shift + 1);
+
+#ifndef SECOND_FETCH
+    prefetch_cs<0, max_b_unroll, 0, um_vecs, a_unroll, b_unroll>(co1, co2);
+#endif  // SECOND_FETCH
+
+    // Unrolling k-loop by a factor of 4.
+    const int max_k_factor = 4;
+    Index kRem = k % max_k_factor;
+    Index k_ = k - kRem;
+    if (k_ >= max_k_factor) {
+      k_ -= max_k_factor;
+      kRem += max_k_factor;
+    }
+    Index loop_count = k_ / max_k_factor;
+
+    if (loop_count > 0) {
+#ifdef SECOND_FETCH
+      loop_count -= SECOND_FETCH;
+#endif
+      while (loop_count > 0) {
+        innerkernel<a_unroll, b_unroll, max_k_factor, max_b_unroll, max_k_factor, 0>(aa, ao, bo, co2);
+        loop_count--;
+      }
+#ifdef SECOND_FETCH
+      co2 = co1 + nelems_in_cache_line - 1;
+
+      loop_count += b_unroll;
+      while (loop_count > 0) {
+        innerkernel<a_unroll, b_unroll, max_k_factor, max_b_unroll, max_k_factor, 1>(aa, ao, bo, co2);
+        loop_count--;
+      }
+
+      loop_count += SECOND_FETCH - b_unroll;
+      while (loop_count > 0) {
+        innerkernel<a_unroll, b_unroll, max_k_factor, max_b_unroll, max_k_factor, 0>(aa, ao, bo, co2);
+        loop_count--;
+      }
+#endif
+    }
+
+    // k-loop remainder handling.
+    loop_count = kRem;
+    while (loop_count > 1) {
+      innerkernel<a_unroll, b_unroll, 1, max_b_unroll, max_k_factor, 0>(aa, ao, bo, co2);
+      loop_count--;
+    }
+    if (loop_count > 0) {
+      innerkernel<a_unroll, b_unroll, 1, max_b_unroll, max_k_factor, 0, true>(aa, ao, bo, co2);
+    }
+
+    // Update C matrix.
+    c_update<max_b_unroll, a_unroll, b_unroll>(co1, co2);
+  }
+
+  template <int a_unroll, int b_unroll, int max_b_unroll>
+  EIGEN_ALWAYS_INLINE void nloop(const Scalar *&aa, const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
+    // Set A matrix pointer.
+    ao = a + a_off * a_unroll;
+
+    // Set B matrix pointer if needed.
+    bo += b_unroll * b_off;
+
+    kloop<a_unroll, b_unroll, max_b_unroll>(aa, ao, bo, co1, co2);
+
+    // Advance B matrix pointer if needed.
+    bo += b_unroll * (b_stride - k - b_off);
+
+    // Advance prefetch A pointer.
+    aa += 16;
+  }
+
+  template <int a_unroll, int max_a_unroll, int max_b_unroll>
+  EIGEN_ALWAYS_INLINE void mloop(const Scalar *&ao, const Scalar *&bo, Scalar *&co1, Scalar *&co2) {
+    // Set prefetch A pointers.
+    const Scalar *aa = a + a_unroll * a_stride;
+
+    // Set C matrix pointers.
+    co1 = c;
+    if (a_unroll >= max_a_unroll) co2 = c + 2 * ldc;
+    if (is_unit_inc)
+      c += a_unroll;
+    else
+      c += a_unroll * inc;
+
+    // Set B matrix pointer.
+    bo = b;
+
+    // Main n-loop.
+    for (Index i = n / max_b_unroll; i > 0; i--) nloop<a_unroll, max_b_unroll, max_b_unroll>(aa, ao, bo, co1, co2);
+
+    // n-remainders.
+    if (n & 4 && max_b_unroll > 4) nloop<a_unroll, 4, max_b_unroll>(aa, ao, bo, co1, co2);
+#if 0
+        if (n & 2 && max_b_unroll > 2) nloop<a_unroll, 2, max_b_unroll>(aa, ao, bo, co1, co2);
+        if (n & 1 && max_b_unroll > 1) nloop<a_unroll, 1, max_b_unroll>(aa, ao, bo, co1, co2);
+#else
+    // Copy kernels don't support tails of n = 2 for single/double precision.
+    // Loop over ones.
+    int n_rem = 2 * ((n & 2) != 0) + 1 * ((n & 1) != 0);
+    while (n_rem > 0) {
+      nloop<a_unroll, 1, max_b_unroll>(aa, ao, bo, co1, co2);
+      n_rem--;
+    }
+#endif
+
+    // Advance A matrix pointer.
+    a = ao + a_unroll * (a_stride - k - a_off);
+  }
+
+ public:
+  // Compute kernel unrolling C matrix by max_a_unroll x max_b_unroll.
+  template <int max_a_unroll, int max_b_unroll>
+  EIGEN_ALWAYS_INLINE void compute_kern() {
+    a -= -a_shift;
+    b -= -b_shift;
+
+    const Scalar *ao = nullptr;
+    const Scalar *bo = nullptr;
+    Scalar *co1 = nullptr;
+    Scalar *co2 = nullptr;
+
+    // Main m-loop.
+    for (; m >= max_a_unroll; m -= max_a_unroll) mloop<max_a_unroll, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
+
+    // m-remainders.
+    if (m & 32 && max_a_unroll > 32) mloop<32, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
+    if (m & 16 && max_a_unroll > 16) mloop<16, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
+    if (m & 8 && max_a_unroll > 8) mloop<8, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
+    if (m & 4 && max_a_unroll > 4) mloop<4, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
+    if (m & 2 && max_a_unroll > 2 && is_f64) mloop<2, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
+    if (m & 1 && max_a_unroll > 1 && is_f64) mloop<1, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
+
+    // Copy kernels don't support tails of m = 2 for single precision.
+    // Loop over ones.
+    if (is_f32) {
+      int m_rem = 2 * ((m & 2) != 0) + 1 * ((m & 1) != 0);
+      while (m_rem > 0) {
+        mloop<1, max_a_unroll, max_b_unroll>(ao, bo, co1, co2);
+        m_rem--;
+      }
+    }
+  }
+
+  gemm_class(Index m_, Index n_, Index k_, Index ldc_, Index inc_, const Scalar *alpha_, const Scalar *a_,
+             const Scalar *b_, Scalar *c_, bool is_alpha1_, bool is_beta0_, Index a_stride_, Index b_stride_,
+             Index a_off_, Index b_off_)
+      : m(m_),
+        n(n_),
+        k(k_),
+        ldc(ldc_),
+        inc(inc_),
+        alpha(alpha_),
+        a(a_),
+        b(b_),
+        c(c_),
+        is_alpha1(is_alpha1_),
+        is_beta0(is_beta0_),
+        a_stride(a_stride_),
+        b_stride(b_stride_),
+        a_off(a_off_),
+        b_off(b_off_) {
+    // Zero out all accumulation registers.
+    zmm[8] = pzero(zmm[8]);
+    zmm[9] = pzero(zmm[9]);
+    zmm[10] = pzero(zmm[10]);
+    zmm[11] = pzero(zmm[11]);
+    zmm[12] = pzero(zmm[12]);
+    zmm[13] = pzero(zmm[13]);
+    zmm[14] = pzero(zmm[14]);
+    zmm[15] = pzero(zmm[15]);
+    zmm[16] = pzero(zmm[16]);
+    zmm[17] = pzero(zmm[17]);
+    zmm[18] = pzero(zmm[18]);
+    zmm[19] = pzero(zmm[19]);
+    zmm[20] = pzero(zmm[20]);
+    zmm[21] = pzero(zmm[21]);
+    zmm[22] = pzero(zmm[22]);
+    zmm[23] = pzero(zmm[23]);
+    zmm[24] = pzero(zmm[24]);
+    zmm[25] = pzero(zmm[25]);
+    zmm[26] = pzero(zmm[26]);
+    zmm[27] = pzero(zmm[27]);
+    zmm[28] = pzero(zmm[28]);
+    zmm[29] = pzero(zmm[29]);
+    zmm[30] = pzero(zmm[30]);
+    zmm[31] = pzero(zmm[31]);
+  }
+};
+
+// Compute kernel with max unroll support of:
+//   Single precision:
+//     max_a_unroll: 48, 32, 16, 8, 4, 2, 1
+//     max_b_unroll: 8, 4, 2, 1
+//   Double precision:
+//     max_a_unroll: 24, 16, 8, 4, 2, 1
+//     max_b_unroll: 8, 4, 2, 1
+template <typename Scalar, int max_a_unroll, int max_b_unroll, bool is_alpha1, bool is_beta0, bool is_unit_inc>
+EIGEN_DONT_INLINE void gemm_kern_avx512(Index m, Index n, Index k, Scalar *alpha, const Scalar *a, const Scalar *b,
+                                        Scalar *c, Index ldc, Index inc = 1, Index a_stride = -1, Index b_stride = -1,
+                                        Index a_off = 0, Index b_off = 0) {
+  if (a_stride == -1) a_stride = k;
+  if (b_stride == -1) b_stride = k;
+
+  gemm_class<Scalar, is_unit_inc> g(m, n, k, ldc, inc, alpha, a, b, c, is_alpha1, is_beta0, a_stride, b_stride, a_off,
+                                    b_off);
+  g.template compute_kern<max_a_unroll, max_b_unroll>();
+}
+
+// Template specializations of GEBP kernels with nr = 8.
+#if EIGEN_USE_AVX512_GEMM_KERNELS
+template <bool ConjLhs_, bool ConjRhs_, int PacketSize_>
+class gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Target, PacketSize_>
+    : public gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_> {
+  using Base = gebp_traits<float, float, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_>;
+
+ public:
+  enum { nr = Base::Vectorizable ? 8 : 4 };
+};
+
+template <bool ConjLhs_, bool ConjRhs_, int PacketSize_>
+class gebp_traits<double, double, ConjLhs_, ConjRhs_, Architecture::Target, PacketSize_>
+    : public gebp_traits<double, double, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_> {
+  using Base = gebp_traits<double, double, ConjLhs_, ConjRhs_, Architecture::Generic, PacketSize_>;
+
+ public:
+  enum { nr = Base::Vectorizable ? 8 : 4 };
+};
+
+template <typename Scalar, typename Index, typename DataMapper, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, 8, ColMajor, Conjugate, PanelMode> {
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  enum { PacketSize = packet_traits<Scalar>::size };
+  EIGEN_DONT_INLINE void operator()(Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0);
+};
+
+template <typename Scalar, typename Index, typename DataMapper, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, 8, ColMajor, Conjugate, PanelMode>::operator()(
+    Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride, Index offset) {
+  constexpr int nr = 8;
+  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
+  EIGEN_UNUSED_VARIABLE(stride);
+  EIGEN_UNUSED_VARIABLE(offset);
+  eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
+  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+  Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
+  Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
+  Index count = 0;
+  const Index peeled_k = (depth / PacketSize) * PacketSize;
+  if (nr >= 8) {
+    for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
+      // skip what we have before
+      if (PanelMode) count += 8 * offset;
+      const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+      const LinearMapper dm4 = rhs.getLinearMapper(0, j2 + 4);
+      const LinearMapper dm5 = rhs.getLinearMapper(0, j2 + 5);
+      const LinearMapper dm6 = rhs.getLinearMapper(0, j2 + 6);
+      const LinearMapper dm7 = rhs.getLinearMapper(0, j2 + 7);
+      Index k = 0;
+      if ((PacketSize % 8) == 0)  // TODO enable vectorized transposition for PacketSize==4
+      {
+        for (; k < peeled_k; k += PacketSize) {
+          PacketBlock<Packet, (PacketSize % 8) == 0 ? 8 : PacketSize> kernel;
+
+          kernel.packet[0] = dm0.template loadPacket<Packet>(k);
+          kernel.packet[1] = dm1.template loadPacket<Packet>(k);
+          kernel.packet[2] = dm2.template loadPacket<Packet>(k);
+          kernel.packet[3] = dm3.template loadPacket<Packet>(k);
+          kernel.packet[4] = dm4.template loadPacket<Packet>(k);
+          kernel.packet[5] = dm5.template loadPacket<Packet>(k);
+          kernel.packet[6] = dm6.template loadPacket<Packet>(k);
+          kernel.packet[7] = dm7.template loadPacket<Packet>(k);
+
+          ptranspose(kernel);
+
+          pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel.packet[0]));
+          pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1 % PacketSize]));
+          pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2 % PacketSize]));
+          pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3 % PacketSize]));
+          pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel.packet[4 % PacketSize]));
+          pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel.packet[5 % PacketSize]));
+          pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel.packet[6 % PacketSize]));
+          pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel.packet[7 % PacketSize]));
+          count += 8 * PacketSize;
+        }
+      }
+      for (; k < depth; k++) {
+        blockB[count + 0] = cj(dm0(k));
+        blockB[count + 1] = cj(dm1(k));
+        blockB[count + 2] = cj(dm2(k));
+        blockB[count + 3] = cj(dm3(k));
+        blockB[count + 4] = cj(dm4(k));
+        blockB[count + 5] = cj(dm5(k));
+        blockB[count + 6] = cj(dm6(k));
+        blockB[count + 7] = cj(dm7(k));
+        count += 8;
+      }
+      // skip what we have after
+      if (PanelMode) count += 8 * (stride - offset - depth);
+    }
+  }
+
+  if (nr >= 4) {
+    for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
+      // skip what we have before
+      if (PanelMode) count += 4 * offset;
+      const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+      Index k = 0;
+      if ((PacketSize % 4) == 0)  // TODO enable vectorized transposition for PacketSize==2 ??
+      {
+        for (; k < peeled_k; k += PacketSize) {
+          PacketBlock<Packet, (PacketSize % 4) == 0 ? 4 : PacketSize> kernel;
+          kernel.packet[0] = dm0.template loadPacket<Packet>(k);
+          kernel.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
+          kernel.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
+          kernel.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
+          ptranspose(kernel);
+          pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel.packet[0]));
+          pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1 % PacketSize]));
+          pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2 % PacketSize]));
+          pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3 % PacketSize]));
+          count += 4 * PacketSize;
+        }
+      }
+      for (; k < depth; k++) {
+        blockB[count + 0] = cj(dm0(k));
+        blockB[count + 1] = cj(dm1(k));
+        blockB[count + 2] = cj(dm2(k));
+        blockB[count + 3] = cj(dm3(k));
+        count += 4;
+      }
+      // skip what we have after
+      if (PanelMode) count += 4 * (stride - offset - depth);
+    }
+  }
+
+  // copy the remaining columns one at a time (nr==1)
+  for (Index j2 = packet_cols4; j2 < cols; ++j2) {
+    if (PanelMode) count += offset;
+    const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
+    for (Index k = 0; k < depth; k++) {
+      blockB[count] = cj(dm0(k));
+      count += 1;
+    }
+    if (PanelMode) count += (stride - offset - depth);
+  }
+}
+
+template <typename Scalar, typename Index, typename DataMapper, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, 8, RowMajor, Conjugate, PanelMode> {
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  enum {
+    PacketSize = packet_traits<Scalar>::size,
+    HalfPacketSize = unpacket_traits<HalfPacket>::size,
+    QuarterPacketSize = unpacket_traits<QuarterPacket>::size
+  };
+  EIGEN_DONT_INLINE void operator()(Scalar *blockB, const DataMapper &rhs, Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0) {
+    constexpr int nr = 8;
+    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
+    EIGEN_UNUSED_VARIABLE(stride);
+    EIGEN_UNUSED_VARIABLE(offset);
+    eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
+    const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
+    const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
+    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+    Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
+    Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
+    Index count = 0;
+
+    if (nr >= 8) {
+      for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
+        // skip what we have before
+        if (PanelMode) count += 8 * offset;
+        for (Index k = 0; k < depth; k++) {
+          if (PacketSize == 8) {
+            // Packet A = ploadu<Packet>(&rhs.data()[k*rhs.stride() + j2]);
+            Packet A = rhs.template loadPacket<Packet>(k, j2);
+            pstoreu(blockB + count, cj.pconj(A));
+          } else if (HasHalf && HalfPacketSize == 8) {
+            HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
+            pstoreu(blockB + count, cj.pconj(A));
+          } else if (HasQuarter && QuarterPacketSize == 8) {
+            QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
+            pstoreu(blockB + count, cj.pconj(A));
+          } else if (PacketSize == 4) {
+            // Packet A = ploadu<Packet>(&rhs.data()[k*rhs.stride() + j2]);
+            // Packet B = ploadu<Packet>(&rhs.data()[k*rhs.stride() + j2 + PacketSize]);
+            Packet A = rhs.template loadPacket<Packet>(k, j2);
+            Packet B = rhs.template loadPacket<Packet>(k, j2 + PacketSize);
+            pstoreu(blockB + count, cj.pconj(A));
+            pstoreu(blockB + count + PacketSize, cj.pconj(B));
+          } else {
+            // const Scalar* b0 = &rhs.data()[k*rhs.stride() + j2];
+            const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
+            blockB[count + 0] = cj(dm0(0));
+            blockB[count + 1] = cj(dm0(1));
+            blockB[count + 2] = cj(dm0(2));
+            blockB[count + 3] = cj(dm0(3));
+            blockB[count + 4] = cj(dm0(4));
+            blockB[count + 5] = cj(dm0(5));
+            blockB[count + 6] = cj(dm0(6));
+            blockB[count + 7] = cj(dm0(7));
+          }
+          count += 8;
+        }
+        // skip what we have after
+        if (PanelMode) count += 8 * (stride - offset - depth);
+      }
+    }
+
+    if (nr >= 4) {
+      for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
+        // skip what we have before
+        if (PanelMode) count += 4 * offset;
+        for (Index k = 0; k < depth; k++) {
+          if (PacketSize == 4) {
+            Packet A = rhs.template loadPacket<Packet>(k, j2);
+            pstoreu(blockB + count, cj.pconj(A));
+            count += PacketSize;
+          } else if (HasHalf && HalfPacketSize == 4) {
+            HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
+            pstoreu(blockB + count, cj.pconj(A));
+            count += HalfPacketSize;
+          } else if (HasQuarter && QuarterPacketSize == 4) {
+            QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
+            pstoreu(blockB + count, cj.pconj(A));
+            count += QuarterPacketSize;
+          } else {
+            const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
+            blockB[count + 0] = cj(dm0(0));
+            blockB[count + 1] = cj(dm0(1));
+            blockB[count + 2] = cj(dm0(2));
+            blockB[count + 3] = cj(dm0(3));
+            count += 4;
+          }
+        }
+        // skip what we have after
+        if (PanelMode) count += 4 * (stride - offset - depth);
+      }
+    }
+    // copy the remaining columns one at a time (nr==1)
+    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
+      if (PanelMode) count += offset;
+      for (Index k = 0; k < depth; k++) {
+        blockB[count] = cj(rhs(k, j2));
+        count += 1;
+      }
+      if (PanelMode) count += stride - offset - depth;
+    }
+  }
+};
+
+template <typename Scalar, typename Index, typename DataMapper, int mr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<Scalar, Scalar, Index, DataMapper, mr, 8, ConjugateLhs, ConjugateRhs> {
+  EIGEN_ALWAYS_INLINE void operator()(const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index rows,
+                                      Index depth, Index cols, Scalar alpha, Index strideA = -1, Index strideB = -1,
+                                      Index offsetA = 0, Index offsetB = 0);
+};
+
+template <typename Scalar, typename Index, typename DataMapper, int mr, bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE void gebp_kernel<Scalar, Scalar, Index, DataMapper, mr, 8, ConjugateLhs, ConjugateRhs>::operator()(
+    const DataMapper &res, const Scalar *blockA, const Scalar *blockB, Index rows, Index depth, Index cols,
+    Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  if (res.incr() == 1) {
+    if (alpha == 1) {
+      gemm_kern_avx512<Scalar, mr, 8, true, false, true>(rows, cols, depth, &alpha, blockA, blockB,
+                                                         (Scalar *)res.data(), res.stride(), res.incr(), strideA,
+                                                         strideB, offsetA, offsetB);
+    } else {
+      gemm_kern_avx512<Scalar, mr, 8, false, false, true>(rows, cols, depth, &alpha, blockA, blockB,
+                                                          (Scalar *)res.data(), res.stride(), res.incr(), strideA,
+                                                          strideB, offsetA, offsetB);
+    }
+  } else {
+    if (alpha == 1) {
+      gemm_kern_avx512<Scalar, mr, 8, true, false, false>(rows, cols, depth, &alpha, blockA, blockB,
+                                                          (Scalar *)res.data(), res.stride(), res.incr(), strideA,
+                                                          strideB, offsetA, offsetB);
+    } else {
+      gemm_kern_avx512<Scalar, mr, 8, false, false, false>(rows, cols, depth, &alpha, blockA, blockB,
+                                                           (Scalar *)res.data(), res.stride(), res.incr(), strideA,
+                                                           strideB, offsetA, offsetB);
+    }
+  }
+}
+#endif  // EIGEN_USE_AVX512_GEMM_KERNELS
+
+}  // namespace internal
+}  // namespace Eigen
+
+#undef SECOND_FETCH
+
+#endif  // EIGEN_CORE_ARCH_AVX512_GEMM_KERNEL_H
diff --git a/inst/include/Eigen/src/Core/arch/AVX512/MathFunctions.h b/inst/include/Eigen/src/Core/arch/AVX512/MathFunctions.h
new file mode 100644
index 00000000..04499a0c
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AVX512/MathFunctions.h
@@ -0,0 +1,141 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Pedro Gonnet (pedro.gonnet@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
+#define THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet16f)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet8d)
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pfrexp(const Packet16h& a, Packet16h& exponent) {
+  Packet16f fexponent;
+  const Packet16h out = float2half(pfrexp<Packet16f>(half2float(a), fexponent));
+  exponent = float2half(fexponent);
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pldexp(const Packet16h& a, const Packet16h& exponent) {
+  return float2half(pldexp<Packet16f>(half2float(a), half2float(exponent)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pfrexp(const Packet16bf& a, Packet16bf& exponent) {
+  Packet16f fexponent;
+  const Packet16bf out = F32ToBf16(pfrexp<Packet16f>(Bf16ToF32(a), fexponent));
+  exponent = F32ToBf16(fexponent);
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pldexp(const Packet16bf& a, const Packet16bf& exponent) {
+  return F32ToBf16(pldexp<Packet16f>(Bf16ToF32(a), Bf16ToF32(exponent)));
+}
+
+#if EIGEN_FAST_MATH
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f psqrt<Packet16f>(const Packet16f& x) {
+  return generic_sqrt_newton_step<Packet16f>::run(x, _mm512_rsqrt14_ps(x));
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d psqrt<Packet8d>(const Packet8d& x) {
+#ifdef EIGEN_VECTORIZE_AVX512ER
+  return generic_sqrt_newton_step<Packet8d, /*Steps=*/1>::run(x, _mm512_rsqrt28_pd(x));
+#else
+  return generic_sqrt_newton_step<Packet8d, /*Steps=*/2>::run(x, _mm512_rsqrt14_pd(x));
+#endif
+}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet16f psqrt<Packet16f>(const Packet16f& x) {
+  return _mm512_sqrt_ps(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d psqrt<Packet8d>(const Packet8d& x) {
+  return _mm512_sqrt_pd(x);
+}
+#endif
+
+// prsqrt for float.
+#if defined(EIGEN_VECTORIZE_AVX512ER)
+template <>
+EIGEN_STRONG_INLINE Packet16f prsqrt<Packet16f>(const Packet16f& x) {
+  return _mm512_rsqrt28_ps(x);
+}
+#elif EIGEN_FAST_MATH
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16f prsqrt<Packet16f>(const Packet16f& x) {
+  return generic_rsqrt_newton_step<Packet16f, /*Steps=*/1>::run(x, _mm512_rsqrt14_ps(x));
+}
+#endif
+
+// prsqrt for double.
+#if EIGEN_FAST_MATH
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet8d prsqrt<Packet8d>(const Packet8d& x) {
+#ifdef EIGEN_VECTORIZE_AVX512ER
+  return generic_rsqrt_newton_step<Packet8d, /*Steps=*/1>::run(x, _mm512_rsqrt28_pd(x));
+#else
+  return generic_rsqrt_newton_step<Packet8d, /*Steps=*/2>::run(x, _mm512_rsqrt14_pd(x));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f preciprocal<Packet16f>(const Packet16f& a) {
+#ifdef EIGEN_VECTORIZE_AVX512ER
+  return _mm512_rcp28_ps(a);
+#else
+  return generic_reciprocal_newton_step<Packet16f, /*Steps=*/1>::run(a, _mm512_rcp14_ps(a));
+#endif
+}
+#endif
+
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pcos)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexp2)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pexpm1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog1p)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, plog2)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, preciprocal)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, prsqrt)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psin)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, psqrt)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, ptanh)
+
+#ifndef EIGEN_VECTORIZE_AVX512FP16
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pcos)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pexp2)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pexpm1)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, plog)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, plog1p)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, plog2)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, preciprocal)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, prsqrt)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, psin)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, psqrt)
+F16_PACKET_FUNCTION(Packet16f, Packet16h, ptanh)
+#endif  // EIGEN_VECTORIZE_AVX512FP16
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // THIRD_PARTY_EIGEN3_EIGEN_SRC_CORE_ARCH_AVX512_MATHFUNCTIONS_H_
diff --git a/inst/include/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h b/inst/include/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h
new file mode 100644
index 00000000..240ade43
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AVX512/MathFunctionsFP16.h
@@ -0,0 +1,75 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 The Eigen Authors.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_FP16_AVX512_H
+#define EIGEN_MATH_FUNCTIONS_FP16_AVX512_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+EIGEN_STRONG_INLINE Packet32h combine2Packet16h(const Packet16h& a, const Packet16h& b) {
+  __m512i result = _mm512_castsi256_si512(_mm256_castph_si256(a));
+  result = _mm512_inserti64x4(result, _mm256_castph_si256(b), 1);
+  return _mm512_castsi512_ph(result);
+}
+
+EIGEN_STRONG_INLINE void extract2Packet16h(const Packet32h& x, Packet16h& a, Packet16h& b) {
+  a = _mm256_castsi256_ph(_mm512_castsi512_si256(_mm512_castph_si512(x)));
+  b = _mm256_castsi256_ph(_mm512_extracti64x4_epi64(_mm512_castph_si512(x), 1));
+}
+
+#define _EIGEN_GENERATE_FP16_MATH_FUNCTION(func)                      \
+  template <>                                                         \
+  EIGEN_STRONG_INLINE Packet8h func<Packet8h>(const Packet8h& a) {    \
+    return float2half(func(half2float(a)));                           \
+  }                                                                   \
+                                                                      \
+  template <>                                                         \
+  EIGEN_STRONG_INLINE Packet16h func<Packet16h>(const Packet16h& a) { \
+    return float2half(func(half2float(a)));                           \
+  }                                                                   \
+                                                                      \
+  template <>                                                         \
+  EIGEN_STRONG_INLINE Packet32h func<Packet32h>(const Packet32h& a) { \
+    Packet16h low;                                                    \
+    Packet16h high;                                                   \
+    extract2Packet16h(a, low, high);                                  \
+    return combine2Packet16h(func(low), func(high));                  \
+  }
+
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(psin)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(pcos)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(plog)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(plog2)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(plog1p)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(pexp)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(pexpm1)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(pexp2)
+_EIGEN_GENERATE_FP16_MATH_FUNCTION(ptanh)
+#undef _EIGEN_GENERATE_FP16_MATH_FUNCTION
+
+// pfrexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pfrexp<Packet32h>(const Packet32h& a, Packet32h& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+// pldexp
+template <>
+EIGEN_STRONG_INLINE Packet32h pldexp<Packet32h>(const Packet32h& a, const Packet32h& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_FP16_AVX512_H
\ No newline at end of file
diff --git a/inst/include/Eigen/src/Core/arch/AVX512/PacketMath.h b/inst/include/Eigen/src/Core/arch/AVX512/PacketMath.h
new file mode 100644
index 00000000..b76c8a77
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AVX512/PacketMath.h
@@ -0,0 +1,3146 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner (benoit.steiner.goog@gmail.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_AVX512_H
+#define EIGEN_PACKET_MATH_AVX512_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#endif
+
+#ifdef EIGEN_VECTORIZE_FMA
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+#endif
+
+typedef __m512 Packet16f;
+typedef __m512i Packet16i;
+typedef __m512d Packet8d;
+typedef eigen_packet_wrapper<__m512i, 1> Packet8l;
+#ifndef EIGEN_VECTORIZE_AVX512FP16
+typedef eigen_packet_wrapper<__m256i, 1> Packet16h;
+#endif
+typedef eigen_packet_wrapper<__m256i, 2> Packet16bf;
+
+typedef eigen_packet_wrapper<__m512i, 6> Packet32s;
+typedef eigen_packet_wrapper<__m256i, 6> Packet16s;
+typedef eigen_packet_wrapper<__m128i, 6> Packet8s;
+
+template <>
+struct is_arithmetic<__m512> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<__m512i> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<__m512d> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<Packet8l> {
+  enum { value = true };
+};
+
+#ifndef EIGEN_VECTORIZE_AVX512FP16
+template <>
+struct is_arithmetic<Packet16h> {
+  enum { value = true };
+};
+
+template <>
+struct packet_traits<half> : default_packet_traits {
+  typedef Packet16h type;
+  // There is no half-size packet for Packet16h.
+  typedef Packet16h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasAbs2 = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExp = 1,
+    HasExpm1 = 1,
+    HasBessel = 1,
+    HasNdtri = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasBlend = 0
+  };
+};
+#endif
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet16f type;
+  typedef Packet8f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+
+    HasAbs = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasBlend = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasACos = 1,
+    HasASin = 1,
+    HasATan = 1,
+    HasATanh = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasCbrt = 1,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasNdtri = 1,
+    HasBessel = 1,
+    HasExp = 1,
+    HasPow = 1,
+    HasReciprocal = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
+    HasCmp = 1,
+    HasDiv = 1
+  };
+};
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef Packet8d type;
+  typedef Packet4d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasBlend = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasCbrt = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+    HasPow = 1,
+    HasATan = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
+    HasATanh = 1,
+    HasCmp = 1,
+    HasDiv = 1
+  };
+};
+
+template <>
+struct packet_traits<int> : default_packet_traits {
+  typedef Packet16i type;
+  typedef Packet8i half;
+  enum { Vectorizable = 1, AlignedOnScalar = 1, HasBlend = 0, HasCmp = 1, HasDiv = 1, size = 16 };
+};
+
+template <>
+struct packet_traits<int64_t> : default_packet_traits {
+  typedef Packet8l type;
+  typedef Packet4l half;
+  enum { Vectorizable = 1, AlignedOnScalar = 1, HasCmp = 1, size = 8 };
+};
+
+template <>
+struct unpacket_traits<Packet16f> {
+  typedef float type;
+  typedef Packet8f half;
+  typedef Packet16i integer_packet;
+  typedef uint16_t mask_t;
+  enum {
+    size = 16,
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = true,
+    masked_store_available = true,
+    masked_fpops_available = true
+  };
+};
+template <>
+struct unpacket_traits<Packet8d> {
+  typedef double type;
+  typedef Packet4d half;
+  typedef Packet8l integer_packet;
+  typedef uint8_t mask_t;
+  enum {
+    size = 8,
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = true,
+    masked_store_available = true,
+    masked_fpops_available = true
+  };
+};
+template <>
+struct unpacket_traits<Packet16i> {
+  typedef int type;
+  typedef Packet8i half;
+  enum {
+    size = 16,
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet8l> {
+  typedef int64_t type;
+  typedef Packet4l half;
+  enum {
+    size = 8,
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+#ifndef EIGEN_VECTORIZE_AVX512FP16
+template <>
+struct unpacket_traits<Packet16h> {
+  typedef Eigen::half type;
+  typedef Packet8h half;
+  enum {
+    size = 16,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+#endif
+
+template <>
+struct unpacket_traits<Packet32s> {
+  typedef numext::int16_t type;
+  typedef Packet16s half;
+  enum {
+    size = 32,
+    alignment = Aligned64,
+    vectorizable = false,
+  };
+};
+
+template <>
+struct unpacket_traits<Packet16s> {
+  typedef numext::int16_t type;
+  typedef Packet8s half;
+  enum {
+    size = 16,
+    alignment = Aligned32,
+    vectorizable = false,
+  };
+};
+
+template <>
+struct unpacket_traits<Packet8s> {
+  typedef numext::int16_t type;
+  typedef Packet8s half;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = false,
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
+  return _mm512_set1_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pset1<Packet8d>(const double& from) {
+  return _mm512_set1_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pset1<Packet16i>(const int& from) {
+  return _mm512_set1_epi32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8l pset1<Packet8l>(const int64_t& from) {
+  return _mm512_set1_epi64(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pset1frombits<Packet16f>(unsigned int from) {
+  return _mm512_castsi512_ps(_mm512_set1_epi32(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pset1frombits<Packet8d>(const numext::uint64_t from) {
+  return _mm512_castsi512_pd(_mm512_set1_epi64(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pzero(const Packet16f& /*a*/) {
+  return _mm512_setzero_ps();
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pzero(const Packet8d& /*a*/) {
+  return _mm512_setzero_pd();
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pzero(const Packet16i& /*a*/) {
+  return _mm512_setzero_si512();
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8l pzero(const Packet8l& /*a*/) {
+  return _mm512_setzero_si512();
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f peven_mask(const Packet16f& /*a*/) {
+  return _mm512_castsi512_ps(_mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i peven_mask(const Packet16i& /*a*/) {
+  return _mm512_set_epi32(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d peven_mask(const Packet8d& /*a*/) {
+  return _mm512_castsi512_pd(_mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8l peven_mask(const Packet8l& /*a*/) {
+  return _mm512_set_epi32(0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, -1);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pload1<Packet16f>(const float* from) {
+#if (EIGEN_COMP_GNUC != 0) || (EIGEN_COMP_CLANG != 0)
+  // Inline asm here helps reduce some register spilling in TRSM kernels.
+  // See note in unrolls::gemm::microKernel in TrsmKernel.h
+  Packet16f ret;
+  __asm__("vbroadcastss %[mem], %[dst]" : [dst] "=v"(ret) : [mem] "m"(*from));
+  return ret;
+#else
+  return _mm512_broadcastss_ps(_mm_load_ps1(from));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pload1<Packet8d>(const double* from) {
+#if (EIGEN_COMP_GNUC != 0) || (EIGEN_COMP_CLANG != 0)
+  Packet8d ret;
+  __asm__("vbroadcastsd %[mem], %[dst]" : [dst] "=v"(ret) : [mem] "m"(*from));
+  return ret;
+#else
+  return _mm512_set1_pd(*from);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f plset<Packet16f>(const float& a) {
+  return _mm512_add_ps(_mm512_set1_ps(a), _mm512_set_ps(15.0f, 14.0f, 13.0f, 12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f,
+                                                        6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d plset<Packet8d>(const double& a) {
+  return _mm512_add_pd(_mm512_set1_pd(a), _mm512_set_pd(7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 0.0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i plset<Packet16i>(const int& a) {
+  return _mm512_add_epi32(_mm512_set1_epi32(a), _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8l plset<Packet8l>(const int64_t& a) {
+  return _mm512_add_epi64(_mm512_set1_epi64(a), _mm512_set_epi64(7, 6, 5, 4, 3, 2, 1, 0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return _mm512_add_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return _mm512_add_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i padd<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_add_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8l padd<Packet8l>(const Packet8l& a, const Packet8l& b) {
+  return _mm512_add_epi64(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b, uint16_t umask) {
+  __mmask16 mask = static_cast<__mmask16>(umask);
+  return _mm512_maskz_add_ps(mask, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d padd<Packet8d>(const Packet8d& a, const Packet8d& b, uint8_t umask) {
+  __mmask8 mask = static_cast<__mmask8>(umask);
+  return _mm512_maskz_add_pd(mask, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return _mm512_sub_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d psub<Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return _mm512_sub_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i psub<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_sub_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8l psub<Packet8l>(const Packet8l& a, const Packet8l& b) {
+  return _mm512_sub_epi64(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
+  // NOTE: MSVC seems to struggle with _mm512_set1_epi32, leading to random results.
+  //       The intel docs give it a relatively high latency as well, so we're probably
+  //       better off with using _mm512_set_epi32 directly anyways.
+  const __m512i mask =
+      _mm512_set_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000,
+                       0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000);
+  return _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(a), mask));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pnegate(const Packet8d& a) {
+  const __m512i mask =
+      _mm512_set_epi64(0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL,
+                       0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL, 0x8000000000000000ULL);
+  return _mm512_castsi512_pd(_mm512_xor_epi64(_mm512_castpd_si512(a), mask));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pnegate(const Packet16i& a) {
+  return _mm512_sub_epi32(_mm512_setzero_si512(), a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8l pnegate(const Packet8l& a) {
+  return _mm512_sub_epi64(_mm512_setzero_si512(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pconj(const Packet16f& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pconj(const Packet8d& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pconj(const Packet16i& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8l pconj(const Packet8l& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return _mm512_mul_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pmul<Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return _mm512_mul_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pmul<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_mullo_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8l pmul<Packet8l>(const Packet8l& a, const Packet8l& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_mullo_epi64(a, b);
+#else
+  return _mm512_mullox_epi64(a, b);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pdiv<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return _mm512_div_ps(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pdiv<Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return _mm512_div_pd(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i pdiv<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  Packet8i q_lo = pdiv<Packet8i>(_mm512_extracti64x4_epi64(a, 0), _mm512_extracti64x4_epi64(b, 0));
+  Packet8i q_hi = pdiv<Packet8i>(_mm512_extracti64x4_epi64(a, 1), _mm512_extracti64x4_epi64(b, 1));
+  return _mm512_inserti64x4(_mm512_castsi256_si512(q_lo), q_hi, 1);
+}
+
+#ifdef EIGEN_VECTORIZE_FMA
+template <>
+EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
+  return _mm512_fmadd_ps(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
+  return _mm512_fmadd_pd(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pmsub(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
+  return _mm512_fmsub_ps(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pmsub(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
+  return _mm512_fmsub_pd(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pnmadd(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
+  return _mm512_fnmadd_ps(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pnmadd(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
+  return _mm512_fnmadd_pd(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pnmsub(const Packet16f& a, const Packet16f& b, const Packet16f& c) {
+  return _mm512_fnmsub_ps(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pnmsub(const Packet8d& a, const Packet8d& b, const Packet8d& c) {
+  return _mm512_fnmsub_pd(a, b, c);
+}
+#endif
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet16f pselect(const Packet16f& mask, const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask16 = _mm512_cmpeq_epi32_mask(_mm512_castps_si512(mask), _mm512_setzero_epi32());
+  return _mm512_mask_blend_ps(mask16, a, b);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet16i pselect(const Packet16i& mask, const Packet16i& a, const Packet16i& b) {
+  __mmask16 mask16 = _mm512_cmpeq_epi32_mask(mask, _mm512_setzero_epi32());
+  return _mm512_mask_blend_epi32(mask16, a, b);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet8l pselect(const Packet8l& mask, const Packet8l& a, const Packet8l& b) {
+  __mmask8 mask8 = _mm512_cmpeq_epi64_mask(mask, _mm512_setzero_si512());
+  return _mm512_mask_blend_epi64(mask8, a, b);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet8d pselect(const Packet8d& mask, const Packet8d& a, const Packet8d& b) {
+  __mmask8 mask8 = _mm512_cmp_epi64_mask(_mm512_castpd_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
+  return _mm512_mask_blend_pd(mask8, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pmin<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm512_min_ps(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pmin<Packet8d>(const Packet8d& a, const Packet8d& b) {
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm512_min_pd(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pmin<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_min_epi32(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8l pmin<Packet8l>(const Packet8l& a, const Packet8l& b) {
+  return _mm512_min_epi64(b, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pmax<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm512_max_ps(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pmax<Packet8d>(const Packet8d& a, const Packet8d& b) {
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm512_max_pd(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pmax<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_max_epi32(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8l pmax<Packet8l>(const Packet8l& a, const Packet8l& b) {
+  return _mm512_max_epi64(b, a);
+}
+
+// Add specializations for min/max with prescribed NaN propagation.
+template <>
+EIGEN_STRONG_INLINE Packet16f pmin<PropagateNumbers, Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet16f>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pmin<PropagateNumbers, Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet8d>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pmax<PropagateNumbers, Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet16f>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pmax<PropagateNumbers, Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet8d>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pmin<PropagateNaN, Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet16f>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pmin<PropagateNaN, Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet8d>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pmax<PropagateNaN, Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet16f>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pmax<PropagateNaN, Packet8d>(const Packet8d& a, const Packet8d& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet8d>);
+}
+
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+template <int I_>
+EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
+  return _mm512_extractf32x8_ps(x, I_);
+}
+template <int I_>
+EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
+  return _mm512_extractf64x2_pd(x, I_);
+}
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
+  return _mm512_insertf32x8(_mm512_castps256_ps512(a), b, 1);
+}
+EIGEN_STRONG_INLINE Packet16i cat256i(Packet8i a, Packet8i b) {
+  return _mm512_inserti32x8(_mm512_castsi256_si512(a), b, 1);
+}
+#else
+// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
+template <int I_>
+EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) {
+  return _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(x), I_));
+}
+
+// AVX512F does not define _mm512_extractf64x2_pd to extract _m128 from _m512
+template <int I_>
+EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) {
+  return _mm_castsi128_pd(_mm512_extracti32x4_epi32(_mm512_castpd_si512(x), I_));
+}
+
+EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) {
+  return _mm512_castsi512_ps(
+      _mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)), _mm256_castps_si256(b), 1));
+}
+EIGEN_STRONG_INLINE Packet16i cat256i(Packet8i a, Packet8i b) {
+  return _mm512_inserti64x4(_mm512_castsi256_si512(a), b, 1);
+}
+#endif
+
+// Helper function for bit packing snippet of low precision comparison.
+// It packs the flags from 32x16 to 16x16.
+EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) {
+  // Split data into small pieces and handle with AVX instructions
+  // to guarantee internal order of vector.
+  // Operation:
+  //   dst[15:0]    := Saturate16(rf[31:0])
+  //   dst[31:16]   := Saturate16(rf[63:32])
+  //   ...
+  //   dst[255:240] := Saturate16(rf[255:224])
+  __m256i lo = _mm256_castps_si256(extract256<0>(rf));
+  __m256i hi = _mm256_castps_si256(extract256<1>(rf));
+  __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0), _mm256_extractf128_si256(lo, 1));
+  __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0), _mm256_extractf128_si256(hi, 1));
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pisnan(const Packet16f& a) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, a, _CMP_UNORD_Q);
+  return _mm512_castsi512_ps(_mm512_maskz_set1_epi32(mask, int32_t(-1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_EQ_OQ);
+  return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LE_OQ);
+  return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_LT_OQ);
+  return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
+  __mmask16 mask = _mm512_cmp_ps_mask(a, b, _CMP_NGE_UQ);
+  return _mm512_castsi512_ps(_mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i pcmp_eq(const Packet16i& a, const Packet16i& b) {
+  __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_EQ);
+  return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pcmp_le(const Packet16i& a, const Packet16i& b) {
+  __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LE);
+  return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pcmp_lt(const Packet16i& a, const Packet16i& b) {
+  __mmask16 mask = _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_LT);
+  return _mm512_mask_set1_epi32(_mm512_setzero_epi32(), mask, int32_t(-1));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8l pcmp_eq(const Packet8l& a, const Packet8l& b) {
+  __mmask8 mask = _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_EQ);
+  return _mm512_mask_set1_epi64(_mm512_setzero_si512(), mask, int64_t(-1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8l pcmp_le(const Packet8l& a, const Packet8l& b) {
+  __mmask8 mask = _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_LE);
+  return _mm512_mask_set1_epi64(_mm512_setzero_si512(), mask, int64_t(-1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8l pcmp_lt(const Packet8l& a, const Packet8l& b) {
+  __mmask8 mask = _mm512_cmp_epi64_mask(a, b, _MM_CMPINT_LT);
+  return _mm512_mask_set1_epi64(_mm512_setzero_si512(), mask, int64_t(-1));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_eq(const Packet8d& a, const Packet8d& b) {
+  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_EQ_OQ);
+  return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_le(const Packet8d& a, const Packet8d& b) {
+  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LE_OQ);
+  return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_lt(const Packet8d& a, const Packet8d& b) {
+  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ);
+  return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pcmp_lt_or_nan(const Packet8d& a, const Packet8d& b) {
+  __mmask8 mask = _mm512_cmp_pd_mask(a, b, _CMP_NGE_UQ);
+  return _mm512_castsi512_pd(_mm512_mask_set1_epi64(_mm512_setzero_epi32(), mask, 0xffffffffffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f print<Packet16f>(const Packet16f& a) {
+  return _mm512_roundscale_ps(a, _MM_FROUND_CUR_DIRECTION);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d print<Packet8d>(const Packet8d& a) {
+  return _mm512_roundscale_pd(a, _MM_FROUND_CUR_DIRECTION);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pceil<Packet16f>(const Packet16f& a) {
+  return _mm512_roundscale_ps(a, _MM_FROUND_TO_POS_INF);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pceil<Packet8d>(const Packet8d& a) {
+  return _mm512_roundscale_pd(a, _MM_FROUND_TO_POS_INF);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pfloor<Packet16f>(const Packet16f& a) {
+  return _mm512_roundscale_ps(a, _MM_FROUND_TO_NEG_INF);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pfloor<Packet8d>(const Packet8d& a) {
+  return _mm512_roundscale_pd(a, _MM_FROUND_TO_NEG_INF);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f ptrunc<Packet16f>(const Packet16f& a) {
+  return _mm512_roundscale_ps(a, _MM_FROUND_TO_ZERO);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d ptrunc<Packet8d>(const Packet8d& a) {
+  return _mm512_roundscale_pd(a, _MM_FROUND_TO_ZERO);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i ptrue<Packet16i>(const Packet16i& /*a*/) {
+  return _mm512_set1_epi32(int32_t(-1));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8l ptrue<Packet8l>(const Packet8l& /*a*/) {
+  return _mm512_set1_epi64(int64_t(-1));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f ptrue<Packet16f>(const Packet16f& a) {
+  return _mm512_castsi512_ps(ptrue<Packet16i>(_mm512_castps_si512(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d ptrue<Packet8d>(const Packet8d& a) {
+  return _mm512_castsi512_pd(ptrue<Packet16i>(_mm512_castpd_si512(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i pand<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_and_si512(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8l pand<Packet8l>(const Packet8l& a, const Packet8l& b) {
+  return _mm512_and_si512(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pand<Packet16f>(const Packet16f& a, const Packet16f& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_and_ps(a, b);
+#else
+  return _mm512_castsi512_ps(pand(_mm512_castps_si512(a), _mm512_castps_si512(b)));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pand<Packet8d>(const Packet8d& a, const Packet8d& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_and_pd(a, b);
+#else
+  Packet8d res = _mm512_undefined_pd();
+  Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0);
+  res = _mm512_insertf64x4(res, _mm256_and_pd(lane0_a, lane0_b), 0);
+
+  Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1);
+  Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1);
+  return _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i por<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_or_si512(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8l por<Packet8l>(const Packet8l& a, const Packet8l& b) {
+  return _mm512_or_si512(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f por<Packet16f>(const Packet16f& a, const Packet16f& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_or_ps(a, b);
+#else
+  return _mm512_castsi512_ps(por(_mm512_castps_si512(a), _mm512_castps_si512(b)));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d por<Packet8d>(const Packet8d& a, const Packet8d& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_or_pd(a, b);
+#else
+  return _mm512_castsi512_pd(por(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i pxor<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_xor_si512(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8l pxor<Packet8l>(const Packet8l& a, const Packet8l& b) {
+  return _mm512_xor_si512(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pxor<Packet16f>(const Packet16f& a, const Packet16f& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_xor_ps(a, b);
+#else
+  return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a), _mm512_castps_si512(b)));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pxor<Packet8d>(const Packet8d& a, const Packet8d& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_xor_pd(a, b);
+#else
+  return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i pandnot<Packet16i>(const Packet16i& a, const Packet16i& b) {
+  return _mm512_andnot_si512(b, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8l pandnot<Packet8l>(const Packet8l& a, const Packet8l& b) {
+  return _mm512_andnot_si512(b, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pandnot<Packet16f>(const Packet16f& a, const Packet16f& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_andnot_ps(b, a);
+#else
+  return _mm512_castsi512_ps(pandnot(_mm512_castps_si512(a), _mm512_castps_si512(b)));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pandnot<Packet8d>(const Packet8d& a, const Packet8d& b) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_andnot_pd(b, a);
+#else
+  return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a), _mm512_castpd_si512(b)));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pround<Packet16f>(const Packet16f& a) {
+  // Work-around for default std::round rounding mode.
+  const Packet16f mask = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x80000000u));
+  const Packet16f prev0dot5 = pset1frombits<Packet16f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
+  return _mm512_roundscale_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pround<Packet8d>(const Packet8d& a) {
+  // Work-around for default std::round rounding mode.
+  const Packet8d mask = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
+  const Packet8d prev0dot5 = pset1frombits<Packet8d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
+  return _mm512_roundscale_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) {
+  return _mm512_srai_epi32(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) {
+  return _mm512_srli_epi32(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) {
+  return _mm512_slli_epi32(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8l parithmetic_shift_right(Packet8l a) {
+  return _mm512_srai_epi64(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8l plogical_shift_right(Packet8l a) {
+  return _mm512_srli_epi64(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8l plogical_shift_left(Packet8l a) {
+  return _mm512_slli_epi64(a, N);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pload<Packet8d>(const double* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pload<Packet16i>(const int* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_epi64(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8l pload<Packet8l>(const int64_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_epi64(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d ploadu<Packet8d>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i ploadu<Packet16i>(const int* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_epi32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8l ploadu<Packet8l>(const int64_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_epi64(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from, uint16_t umask) {
+  __mmask16 mask = static_cast<__mmask16>(umask);
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_maskz_loadu_ps(mask, from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d ploadu<Packet8d>(const double* from, uint8_t umask) {
+  __mmask8 mask = static_cast<__mmask8>(umask);
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_maskz_loadu_pd(mask, from);
+}
+
+// Loads 8 floats from memory a returns the packet
+// {a0, a0  a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
+template <>
+EIGEN_STRONG_INLINE Packet16f ploaddup<Packet16f>(const float* from) {
+  // an unaligned load is required here as there is no requirement
+  // on the alignment of input pointer 'from'
+  __m256i low_half = _mm256_castps_si256(_mm256_loadu_ps(from));
+  __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
+  __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
+  return pairs;
+}
+
+// Loads 4 doubles from memory a returns the packet {a0, a0,  a1, a1, a2, a2, a3,
+// a3}
+template <>
+EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) {
+  Packet8d tmp = _mm512_castpd256_pd512(ploadu<Packet4d>(from));
+  const Packet8l scatter_mask = _mm512_set_epi64(3, 3, 2, 2, 1, 1, 0, 0);
+  return _mm512_permutexvar_pd(scatter_mask, tmp);
+}
+
+// Loads 4 int64_t from memory a returns the packet {a0, a0,  a1, a1, a2, a2, a3,
+// a3}
+template <>
+EIGEN_STRONG_INLINE Packet8l ploaddup<Packet8l>(const int64_t* from) {
+  Packet8l tmp = _mm512_castsi256_si512(ploadu<Packet4l>(from));
+  const Packet8l scatter_mask = _mm512_set_epi64(3, 3, 2, 2, 1, 1, 0, 0);
+  return _mm512_permutexvar_epi64(scatter_mask, tmp);
+}
+
+// Loads 8 integers from memory and returns the packet
+// {a0, a0  a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7}
+template <>
+EIGEN_STRONG_INLINE Packet16i ploaddup<Packet16i>(const int* from) {
+  __m256i low_half = _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
+  __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half));
+  __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0));
+  return _mm512_castps_si512(pairs);
+}
+
+// Loads 4 floats from memory a returns the packet
+// {a0, a0  a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
+template <>
+EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) {
+  Packet16f tmp = _mm512_castps128_ps512(ploadu<Packet4f>(from));
+  const Packet16i scatter_mask = _mm512_set_epi32(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
+  return _mm512_permutexvar_ps(scatter_mask, tmp);
+}
+
+// Loads 2 doubles from memory a returns the packet
+// {a0, a0  a0, a0, a1, a1, a1, a1}
+template <>
+EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) {
+  __m256d lane0 = _mm256_set1_pd(*from);
+  __m256d lane1 = _mm256_set1_pd(*(from + 1));
+  __m512d tmp = _mm512_undefined_pd();
+  tmp = _mm512_insertf64x4(tmp, lane0, 0);
+  return _mm512_insertf64x4(tmp, lane1, 1);
+}
+
+// Loads 2 int64_t from memory a returns the packet
+// {a0, a0  a0, a0, a1, a1, a1, a1}
+template <>
+EIGEN_STRONG_INLINE Packet8l ploadquad<Packet8l>(const int64_t* from) {
+  __m256i lane0 = _mm256_set1_epi64x(*from);
+  __m256i lane1 = _mm256_set1_epi64x(*(from + 1));
+  __m512i tmp = _mm512_undefined_epi32();
+  tmp = _mm512_inserti64x4(tmp, lane0, 0);
+  return _mm512_inserti64x4(tmp, lane1, 1);
+}
+
+// Loads 4 integers from memory and returns the packet
+// {a0, a0  a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3}
+template <>
+EIGEN_STRONG_INLINE Packet16i ploadquad<Packet16i>(const int* from) {
+  Packet16i tmp = _mm512_castsi128_si512(ploadu<Packet4i>(from));
+  const Packet16i scatter_mask = _mm512_set_epi32(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
+  return _mm512_permutexvar_epi32(scatter_mask, tmp);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ps(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet8d& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_pd(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet16i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_epi32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet8l& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_epi64(to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ps(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet8d& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_pd(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_epi32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet8l& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_epi64(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from, uint16_t umask) {
+  __mmask16 mask = static_cast<__mmask16>(umask);
+  EIGEN_DEBUG_UNALIGNED_STORE return _mm512_mask_storeu_ps(to, mask, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet8d& from, uint8_t umask) {
+  __mmask8 mask = static_cast<__mmask8>(umask);
+  EIGEN_DEBUG_UNALIGNED_STORE return _mm512_mask_storeu_pd(to, mask, from);
+}
+
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline Packet pgather(const Packet& src, const Scalar* from, Index stride,
+                                        typename unpacket_traits<Packet>::mask_t umask);
+template <>
+EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const Packet16f& src, const float* from, Index stride,
+                                                             uint16_t umask) {
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
+  Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
+  __mmask16 mask = static_cast<__mmask16>(umask);
+
+  return _mm512_mask_i32gather_ps(src, mask, indices, from, 4);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const Packet8d& src, const double* from, Index stride,
+                                                            uint8_t umask) {
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
+  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
+  __mmask8 mask = static_cast<__mmask8>(umask);
+
+  return _mm512_mask_i32gather_pd(src, mask, indices, from, 8);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from, Index stride) {
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
+  Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
+
+  return _mm512_i32gather_ps(indices, from, 4);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from, Index stride) {
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
+  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
+
+  return _mm512_i32gather_pd(indices, from, 8);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet8l pgather<int64_t, Packet8l>(const int64_t* from, Index stride) {
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
+  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
+
+  return _mm512_i32gather_epi64(indices, from, 8);
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet16i pgather<int, Packet16i>(const int* from, Index stride) {
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
+  Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
+  return _mm512_i32gather_epi32(indices, from, 4);
+}
+
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC inline void pscatter(Scalar* to, const Packet& from, Index stride,
+                                       typename unpacket_traits<Packet>::mask_t umask);
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride,
+                                                         uint16_t umask) {
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
+  Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
+  __mmask16 mask = static_cast<__mmask16>(umask);
+  _mm512_mask_i32scatter_ps(to, mask, indices, from, 4);
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to, const Packet8d& from, Index stride,
+                                                         uint8_t umask) {
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
+  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
+  __mmask8 mask = static_cast<__mmask8>(umask);
+  _mm512_mask_i32scatter_pd(to, mask, indices, from, 8);
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) {
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
+  Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
+  _mm512_i32scatter_ps(to, indices, from, 4);
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet8d>(double* to, const Packet8d& from, Index stride) {
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
+  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
+  _mm512_i32scatter_pd(to, indices, from, 8);
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<int64_t, Packet8l>(int64_t* to, const Packet8l& from, Index stride) {
+  Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride));
+  Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier);
+  _mm512_i32scatter_epi64(to, indices, from, 8);
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<int, Packet16i>(int* to, const Packet16i& from, Index stride) {
+  Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride));
+  Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
+  Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier);
+  _mm512_i32scatter_epi32(to, indices, from, 4);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet16f>(float* to, const float& a) {
+  Packet16f pa = pset1<Packet16f>(a);
+  pstore(to, pa);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet8d>(double* to, const double& a) {
+  Packet8d pa = pset1<Packet8d>(a);
+  pstore(to, pa);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet16i>(int* to, const int& a) {
+  Packet16i pa = pset1<Packet16i>(a);
+  pstore(to, pa);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet8l>(int64_t* to, const int64_t& a) {
+  Packet8l pa = pset1<Packet8l>(a);
+  pstore(to, pa);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet16f>(const Packet16f& a) {
+  return _mm512_cvtss_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet8d>(const Packet8d& a) {
+  return _mm512_cvtsd_f64(a);
+}
+template <>
+EIGEN_STRONG_INLINE int64_t pfirst<Packet8l>(const Packet8l& a) {
+  int64_t x = _mm_extract_epi64_0(_mm512_extracti32x4_epi32(a, 0));
+  return x;
+}
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet16i>(const Packet16i& a) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(11, 0, 0)
+  return _mm_cvtsi128_si32(_mm512_castsi512_si128(a));
+#else
+  return _mm512_cvtsi512_si32(a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) {
+  return _mm512_permutexvar_ps(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a) {
+  return _mm512_permutexvar_pd(_mm512_set_epi32(0, 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i preverse(const Packet16i& a) {
+  return _mm512_permutexvar_epi32(_mm512_set_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8l preverse(const Packet8l& a) {
+  return _mm512_permutexvar_epi64(_mm512_set_epi64(0, 1, 2, 3, 4, 5, 6, 7), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
+  // _mm512_abs_ps intrinsic not found, so hack around it
+  return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) {
+  // _mm512_abs_ps intrinsic not found, so hack around it
+  return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a), _mm512_set1_epi64(0x7fffffffffffffff)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16i pabs(const Packet16i& a) {
+  return _mm512_abs_epi32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8l pabs(const Packet8l& a) {
+  return _mm512_abs_epi64(a);
+}
+
+#ifndef EIGEN_VECTORIZE_AVX512FP16
+template <>
+EIGEN_STRONG_INLINE Packet16h psignbit(const Packet16h& a) {
+  return _mm256_srai_epi16(a, 15);
+}
+#endif  // EIGEN_VECTORIZE_AVX512FP16
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf psignbit(const Packet16bf& a) {
+  return _mm256_srai_epi16(a, 15);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f psignbit(const Packet16f& a) {
+  return _mm512_castsi512_ps(_mm512_srai_epi32(_mm512_castps_si512(a), 31));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d psignbit(const Packet8d& a) {
+  return _mm512_castsi512_pd(_mm512_srai_epi64(_mm512_castpd_si512(a), 63));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pfrexp<Packet16f>(const Packet16f& a, Packet16f& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+// Extract exponent without existence of Packet8l.
+template <>
+EIGEN_STRONG_INLINE Packet8d pfrexp_generic_get_biased_exponent(const Packet8d& a) {
+  const Packet8d cst_exp_mask = pset1frombits<Packet8d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  return _mm512_cvtepi64_pd(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52));
+#else
+  return _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(_mm512_srli_epi64(_mm512_castpd_si512(pand(a, cst_exp_mask)), 52)));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pfrexp<Packet8d>(const Packet8d& a, Packet8d& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pldexp<Packet16f>(const Packet16f& a, const Packet16f& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pldexp<Packet8d>(const Packet8d& a, const Packet8d& exponent) {
+  // Clamp exponent to [-2099, 2099]
+  const Packet8d max_exponent = pset1<Packet8d>(2099.0);
+  const Packet8i e = _mm512_cvtpd_epi32(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+
+  // Split 2^e into four factors and multiply.
+  const Packet8i bias = pset1<Packet8i>(1023);
+  Packet8i b = parithmetic_shift_right<2>(e);  // floor(e/4)
+
+  // 2^b
+  const Packet8i permute_idx = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
+  Packet8i hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx);
+  Packet8i lo = _mm256_slli_epi64(hi, 52);
+  hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
+  Packet8d c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
+  Packet8d out = pmul(pmul(pmul(a, c), c), c);  // a * 2^(3b)
+
+  // 2^(e - 3b)
+  b = psub(psub(psub(e, b), b), b);  // e - 3b
+  hi = _mm256_permutevar8x32_epi32(padd(b, bias), permute_idx);
+  lo = _mm256_slli_epi64(hi, 52);
+  hi = _mm256_slli_epi64(_mm256_srli_epi64(hi, 32), 52);
+  c = _mm512_castsi512_pd(_mm512_inserti64x4(_mm512_castsi256_si512(lo), hi, 1));
+  out = pmul(out, c);  // a * 2^e
+  return out;
+}
+
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512
+#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)        \
+  __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \
+  __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1)
+
+// AVX512F does not define _mm512_extracti32x8_epi32 to extract _m256i from _m512i
+#define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT)            \
+  __m256i OUTPUT##_0 = _mm512_extracti32x8_epi32(INPUT, 0); \
+  __m256i OUTPUT##_1 = _mm512_extracti32x8_epi32(INPUT, 1)
+#else
+#define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT)                                                     \
+  __m256 OUTPUT##_0 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 0)), \
+                                           _mm512_extractf32x4_ps(INPUT, 1), 1);                     \
+  __m256 OUTPUT##_1 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(INPUT, 2)), \
+                                           _mm512_extractf32x4_ps(INPUT, 3), 1)
+
+#define EIGEN_EXTRACT_8i_FROM_16i(INPUT, OUTPUT)                                                            \
+  __m256i OUTPUT##_0 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 0)), \
+                                               _mm512_extracti32x4_epi32(INPUT, 1), 1);                     \
+  __m256i OUTPUT##_1 = _mm256_insertf128_si256(_mm256_castsi128_si256(_mm512_extracti32x4_epi32(INPUT, 2)), \
+                                               _mm512_extracti32x4_epi32(INPUT, 3), 1)
+#endif
+
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \
+  OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1);
+
+#define EIGEN_INSERT_8i_INTO_16i(OUTPUT, INPUTA, INPUTB) \
+  OUTPUT = _mm512_inserti32x8(_mm512_castsi256_si512(INPUTA), INPUTB, 1);
+#else
+#define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB)                    \
+  OUTPUT = _mm512_undefined_ps();                                           \
+  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \
+  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \
+  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \
+  OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3);
+
+#define EIGEN_INSERT_8i_INTO_16i(OUTPUT, INPUTA, INPUTB)                       \
+  OUTPUT = _mm512_undefined_epi32();                                           \
+  OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTA, 0), 0); \
+  OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTA, 1), 1); \
+  OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTB, 0), 2); \
+  OUTPUT = _mm512_inserti32x4(OUTPUT, _mm256_extractf128_si256(INPUTB, 1), 3);
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet8f predux_half_dowto4<Packet16f>(const Packet16f& a) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  __m256 lane0 = _mm512_extractf32x8_ps(a, 0);
+  __m256 lane1 = _mm512_extractf32x8_ps(a, 1);
+  return _mm256_add_ps(lane0, lane1);
+#else
+  __m128 lane0 = _mm512_extractf32x4_ps(a, 0);
+  __m128 lane1 = _mm512_extractf32x4_ps(a, 1);
+  __m128 lane2 = _mm512_extractf32x4_ps(a, 2);
+  __m128 lane3 = _mm512_extractf32x4_ps(a, 3);
+  __m128 sum0 = _mm_add_ps(lane0, lane2);
+  __m128 sum1 = _mm_add_ps(lane1, lane3);
+  return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet4d predux_half_dowto4<Packet8d>(const Packet8d& a) {
+  __m256d lane0 = _mm512_extractf64x4_pd(a, 0);
+  __m256d lane1 = _mm512_extractf64x4_pd(a, 1);
+  return _mm256_add_pd(lane0, lane1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8i predux_half_dowto4<Packet16i>(const Packet16i& a) {
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+  __m256i lane0 = _mm512_extracti32x8_epi32(a, 0);
+  __m256i lane1 = _mm512_extracti32x8_epi32(a, 1);
+  return _mm256_add_epi32(lane0, lane1);
+#else
+  __m128i lane0 = _mm512_extracti32x4_epi32(a, 0);
+  __m128i lane1 = _mm512_extracti32x4_epi32(a, 1);
+  __m128i lane2 = _mm512_extracti32x4_epi32(a, 2);
+  __m128i lane3 = _mm512_extracti32x4_epi32(a, 3);
+  __m128i sum0 = _mm_add_epi32(lane0, lane2);
+  __m128i sum1 = _mm_add_epi32(lane1, lane3);
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(sum0), sum1, 1);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4l predux_half_dowto4<Packet8l>(const Packet8l& a) {
+  __m256i lane0 = _mm512_extracti64x4_epi64(a, 0);
+  __m256i lane1 = _mm512_extracti64x4_epi64(a, 1);
+  return _mm256_add_epi64(lane0, lane1);
+}
+
+#define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \
+  EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
+  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+  __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
+  __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
+  __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
+  __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
+  __m512 T8 = _mm512_unpacklo_ps(kernel.packet[8], kernel.packet[9]);
+  __m512 T9 = _mm512_unpackhi_ps(kernel.packet[8], kernel.packet[9]);
+  __m512 T10 = _mm512_unpacklo_ps(kernel.packet[10], kernel.packet[11]);
+  __m512 T11 = _mm512_unpackhi_ps(kernel.packet[10], kernel.packet[11]);
+  __m512 T12 = _mm512_unpacklo_ps(kernel.packet[12], kernel.packet[13]);
+  __m512 T13 = _mm512_unpackhi_ps(kernel.packet[12], kernel.packet[13]);
+  __m512 T14 = _mm512_unpacklo_ps(kernel.packet[14], kernel.packet[15]);
+  __m512 T15 = _mm512_unpackhi_ps(kernel.packet[14], kernel.packet[15]);
+  __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S4 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S5 = _mm512_shuffle_ps(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S6 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S7 = _mm512_shuffle_ps(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S8 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S9 = _mm512_shuffle_ps(T8, T10, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S10 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S11 = _mm512_shuffle_ps(T9, T11, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S12 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S13 = _mm512_shuffle_ps(T12, T14, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S14 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S15 = _mm512_shuffle_ps(T13, T15, _MM_SHUFFLE(3, 2, 3, 2));
+
+  EIGEN_EXTRACT_8f_FROM_16f(S0, S0);
+  EIGEN_EXTRACT_8f_FROM_16f(S1, S1);
+  EIGEN_EXTRACT_8f_FROM_16f(S2, S2);
+  EIGEN_EXTRACT_8f_FROM_16f(S3, S3);
+  EIGEN_EXTRACT_8f_FROM_16f(S4, S4);
+  EIGEN_EXTRACT_8f_FROM_16f(S5, S5);
+  EIGEN_EXTRACT_8f_FROM_16f(S6, S6);
+  EIGEN_EXTRACT_8f_FROM_16f(S7, S7);
+  EIGEN_EXTRACT_8f_FROM_16f(S8, S8);
+  EIGEN_EXTRACT_8f_FROM_16f(S9, S9);
+  EIGEN_EXTRACT_8f_FROM_16f(S10, S10);
+  EIGEN_EXTRACT_8f_FROM_16f(S11, S11);
+  EIGEN_EXTRACT_8f_FROM_16f(S12, S12);
+  EIGEN_EXTRACT_8f_FROM_16f(S13, S13);
+  EIGEN_EXTRACT_8f_FROM_16f(S14, S14);
+  EIGEN_EXTRACT_8f_FROM_16f(S15, S15);
+
+  PacketBlock<Packet8f, 32> tmp;
+
+  tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S4_0, 0x20);
+  tmp.packet[1] = _mm256_permute2f128_ps(S1_0, S5_0, 0x20);
+  tmp.packet[2] = _mm256_permute2f128_ps(S2_0, S6_0, 0x20);
+  tmp.packet[3] = _mm256_permute2f128_ps(S3_0, S7_0, 0x20);
+  tmp.packet[4] = _mm256_permute2f128_ps(S0_0, S4_0, 0x31);
+  tmp.packet[5] = _mm256_permute2f128_ps(S1_0, S5_0, 0x31);
+  tmp.packet[6] = _mm256_permute2f128_ps(S2_0, S6_0, 0x31);
+  tmp.packet[7] = _mm256_permute2f128_ps(S3_0, S7_0, 0x31);
+
+  tmp.packet[8] = _mm256_permute2f128_ps(S0_1, S4_1, 0x20);
+  tmp.packet[9] = _mm256_permute2f128_ps(S1_1, S5_1, 0x20);
+  tmp.packet[10] = _mm256_permute2f128_ps(S2_1, S6_1, 0x20);
+  tmp.packet[11] = _mm256_permute2f128_ps(S3_1, S7_1, 0x20);
+  tmp.packet[12] = _mm256_permute2f128_ps(S0_1, S4_1, 0x31);
+  tmp.packet[13] = _mm256_permute2f128_ps(S1_1, S5_1, 0x31);
+  tmp.packet[14] = _mm256_permute2f128_ps(S2_1, S6_1, 0x31);
+  tmp.packet[15] = _mm256_permute2f128_ps(S3_1, S7_1, 0x31);
+
+  // Second set of _m256 outputs
+  tmp.packet[16] = _mm256_permute2f128_ps(S8_0, S12_0, 0x20);
+  tmp.packet[17] = _mm256_permute2f128_ps(S9_0, S13_0, 0x20);
+  tmp.packet[18] = _mm256_permute2f128_ps(S10_0, S14_0, 0x20);
+  tmp.packet[19] = _mm256_permute2f128_ps(S11_0, S15_0, 0x20);
+  tmp.packet[20] = _mm256_permute2f128_ps(S8_0, S12_0, 0x31);
+  tmp.packet[21] = _mm256_permute2f128_ps(S9_0, S13_0, 0x31);
+  tmp.packet[22] = _mm256_permute2f128_ps(S10_0, S14_0, 0x31);
+  tmp.packet[23] = _mm256_permute2f128_ps(S11_0, S15_0, 0x31);
+
+  tmp.packet[24] = _mm256_permute2f128_ps(S8_1, S12_1, 0x20);
+  tmp.packet[25] = _mm256_permute2f128_ps(S9_1, S13_1, 0x20);
+  tmp.packet[26] = _mm256_permute2f128_ps(S10_1, S14_1, 0x20);
+  tmp.packet[27] = _mm256_permute2f128_ps(S11_1, S15_1, 0x20);
+  tmp.packet[28] = _mm256_permute2f128_ps(S8_1, S12_1, 0x31);
+  tmp.packet[29] = _mm256_permute2f128_ps(S9_1, S13_1, 0x31);
+  tmp.packet[30] = _mm256_permute2f128_ps(S10_1, S14_1, 0x31);
+  tmp.packet[31] = _mm256_permute2f128_ps(S11_1, S15_1, 0x31);
+
+  // Pack them into the output
+  PACK_OUTPUT(kernel.packet, tmp.packet, 0, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 1, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 2, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 3, 16);
+
+  PACK_OUTPUT(kernel.packet, tmp.packet, 4, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 5, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 6, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 7, 16);
+
+  PACK_OUTPUT(kernel.packet, tmp.packet, 8, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 9, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 10, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 11, 16);
+
+  PACK_OUTPUT(kernel.packet, tmp.packet, 12, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 13, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 14, 16);
+  PACK_OUTPUT(kernel.packet, tmp.packet, 15, 16);
+}
+#define PACK_OUTPUT_2(OUTPUT, INPUT, INDEX, STRIDE) \
+  EIGEN_INSERT_8f_INTO_16f(OUTPUT[INDEX], INPUT[2 * INDEX], INPUT[2 * INDEX + STRIDE]);
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 8>& kernel) {
+  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+  __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
+  __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
+  __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
+  __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
+
+  kernel.packet[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T0), _mm512_castps_pd(T2)));
+  kernel.packet[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T0), _mm512_castps_pd(T2)));
+  kernel.packet[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T1), _mm512_castps_pd(T3)));
+  kernel.packet[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T1), _mm512_castps_pd(T3)));
+  kernel.packet[4] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T4), _mm512_castps_pd(T6)));
+  kernel.packet[5] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T4), _mm512_castps_pd(T6)));
+  kernel.packet[6] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T5), _mm512_castps_pd(T7)));
+  kernel.packet[7] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T5), _mm512_castps_pd(T7)));
+
+  T0 = _mm512_shuffle_f32x4(kernel.packet[0], kernel.packet[4], 0x44);
+  T1 = _mm512_shuffle_f32x4(kernel.packet[0], kernel.packet[4], 0xee);
+  T2 = _mm512_shuffle_f32x4(kernel.packet[1], kernel.packet[5], 0x44);
+  T3 = _mm512_shuffle_f32x4(kernel.packet[1], kernel.packet[5], 0xee);
+  T4 = _mm512_shuffle_f32x4(kernel.packet[2], kernel.packet[6], 0x44);
+  T5 = _mm512_shuffle_f32x4(kernel.packet[2], kernel.packet[6], 0xee);
+  T6 = _mm512_shuffle_f32x4(kernel.packet[3], kernel.packet[7], 0x44);
+  T7 = _mm512_shuffle_f32x4(kernel.packet[3], kernel.packet[7], 0xee);
+
+  kernel.packet[0] = _mm512_shuffle_f32x4(T0, T2, 0x88);
+  kernel.packet[2] = _mm512_shuffle_f32x4(T0, T2, 0xdd);
+  kernel.packet[1] = _mm512_shuffle_f32x4(T4, T6, 0x88);
+  kernel.packet[3] = _mm512_shuffle_f32x4(T4, T6, 0xdd);
+  kernel.packet[4] = _mm512_shuffle_f32x4(T1, T3, 0x88);
+  kernel.packet[6] = _mm512_shuffle_f32x4(T1, T3, 0xdd);
+  kernel.packet[5] = _mm512_shuffle_f32x4(T5, T7, 0x88);
+  kernel.packet[7] = _mm512_shuffle_f32x4(T5, T7, 0xdd);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
+  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+
+  __m512 S0 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S1 = _mm512_shuffle_ps(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512 S2 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512 S3 = _mm512_shuffle_ps(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
+
+  EIGEN_EXTRACT_8f_FROM_16f(S0, S0);
+  EIGEN_EXTRACT_8f_FROM_16f(S1, S1);
+  EIGEN_EXTRACT_8f_FROM_16f(S2, S2);
+  EIGEN_EXTRACT_8f_FROM_16f(S3, S3);
+
+  PacketBlock<Packet8f, 8> tmp;
+
+  tmp.packet[0] = _mm256_permute2f128_ps(S0_0, S1_0, 0x20);
+  tmp.packet[1] = _mm256_permute2f128_ps(S2_0, S3_0, 0x20);
+  tmp.packet[2] = _mm256_permute2f128_ps(S0_0, S1_0, 0x31);
+  tmp.packet[3] = _mm256_permute2f128_ps(S2_0, S3_0, 0x31);
+
+  tmp.packet[4] = _mm256_permute2f128_ps(S0_1, S1_1, 0x20);
+  tmp.packet[5] = _mm256_permute2f128_ps(S2_1, S3_1, 0x20);
+  tmp.packet[6] = _mm256_permute2f128_ps(S0_1, S1_1, 0x31);
+  tmp.packet[7] = _mm256_permute2f128_ps(S2_1, S3_1, 0x31);
+
+  PACK_OUTPUT_2(kernel.packet, tmp.packet, 0, 1);
+  PACK_OUTPUT_2(kernel.packet, tmp.packet, 1, 1);
+  PACK_OUTPUT_2(kernel.packet, tmp.packet, 2, 1);
+  PACK_OUTPUT_2(kernel.packet, tmp.packet, 3, 1);
+}
+
+#define PACK_OUTPUT_SQ_D(OUTPUT, INPUT, INDEX, STRIDE)                \
+  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX], 0); \
+  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[INDEX + STRIDE], 1);
+
+#define PACK_OUTPUT_D(OUTPUT, INPUT, INDEX, STRIDE)                         \
+  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \
+  OUTPUT[INDEX] = _mm512_insertf64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
+
+#define PACK_OUTPUT_L(OUTPUT, INPUT, INDEX, STRIDE)                         \
+  OUTPUT[INDEX] = _mm512_inserti64x4(OUTPUT[INDEX], INPUT[(2 * INDEX)], 0); \
+  OUTPUT[INDEX] = _mm512_inserti64x4(OUTPUT[INDEX], INPUT[(2 * INDEX) + STRIDE], 1);
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 4>& kernel) {
+  __m512d T0 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0);
+  __m512d T1 = _mm512_shuffle_pd(kernel.packet[0], kernel.packet[1], 0xff);
+  __m512d T2 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0);
+  __m512d T3 = _mm512_shuffle_pd(kernel.packet[2], kernel.packet[3], 0xff);
+
+  PacketBlock<Packet4d, 8> tmp;
+
+  tmp.packet[0] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), _mm512_extractf64x4_pd(T2, 0), 0x20);
+  tmp.packet[1] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), _mm512_extractf64x4_pd(T3, 0), 0x20);
+  tmp.packet[2] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 0), _mm512_extractf64x4_pd(T2, 0), 0x31);
+  tmp.packet[3] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 0), _mm512_extractf64x4_pd(T3, 0), 0x31);
+
+  tmp.packet[4] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), _mm512_extractf64x4_pd(T2, 1), 0x20);
+  tmp.packet[5] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), _mm512_extractf64x4_pd(T3, 1), 0x20);
+  tmp.packet[6] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T0, 1), _mm512_extractf64x4_pd(T2, 1), 0x31);
+  tmp.packet[7] = _mm256_permute2f128_pd(_mm512_extractf64x4_pd(T1, 1), _mm512_extractf64x4_pd(T3, 1), 0x31);
+
+  PACK_OUTPUT_D(kernel.packet, tmp.packet, 0, 1);
+  PACK_OUTPUT_D(kernel.packet, tmp.packet, 1, 1);
+  PACK_OUTPUT_D(kernel.packet, tmp.packet, 2, 1);
+  PACK_OUTPUT_D(kernel.packet, tmp.packet, 3, 1);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8d, 8>& kernel) {
+  __m512d T0 = _mm512_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
+  __m512d T1 = _mm512_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
+  __m512d T2 = _mm512_unpacklo_pd(kernel.packet[2], kernel.packet[3]);
+  __m512d T3 = _mm512_unpackhi_pd(kernel.packet[2], kernel.packet[3]);
+  __m512d T4 = _mm512_unpacklo_pd(kernel.packet[4], kernel.packet[5]);
+  __m512d T5 = _mm512_unpackhi_pd(kernel.packet[4], kernel.packet[5]);
+  __m512d T6 = _mm512_unpacklo_pd(kernel.packet[6], kernel.packet[7]);
+  __m512d T7 = _mm512_unpackhi_pd(kernel.packet[6], kernel.packet[7]);
+
+  kernel.packet[0] = _mm512_permutex_pd(T2, 0x4E);
+  kernel.packet[0] = _mm512_mask_blend_pd(0xCC, T0, kernel.packet[0]);
+  kernel.packet[2] = _mm512_permutex_pd(T0, 0x4E);
+  kernel.packet[2] = _mm512_mask_blend_pd(0xCC, kernel.packet[2], T2);
+  kernel.packet[1] = _mm512_permutex_pd(T3, 0x4E);
+  kernel.packet[1] = _mm512_mask_blend_pd(0xCC, T1, kernel.packet[1]);
+  kernel.packet[3] = _mm512_permutex_pd(T1, 0x4E);
+  kernel.packet[3] = _mm512_mask_blend_pd(0xCC, kernel.packet[3], T3);
+  kernel.packet[4] = _mm512_permutex_pd(T6, 0x4E);
+  kernel.packet[4] = _mm512_mask_blend_pd(0xCC, T4, kernel.packet[4]);
+  kernel.packet[6] = _mm512_permutex_pd(T4, 0x4E);
+  kernel.packet[6] = _mm512_mask_blend_pd(0xCC, kernel.packet[6], T6);
+  kernel.packet[5] = _mm512_permutex_pd(T7, 0x4E);
+  kernel.packet[5] = _mm512_mask_blend_pd(0xCC, T5, kernel.packet[5]);
+  kernel.packet[7] = _mm512_permutex_pd(T5, 0x4E);
+  kernel.packet[7] = _mm512_mask_blend_pd(0xCC, kernel.packet[7], T7);
+
+  T0 = _mm512_shuffle_f64x2(kernel.packet[4], kernel.packet[4], 0x4E);
+  T0 = _mm512_mask_blend_pd(0xF0, kernel.packet[0], T0);
+  T4 = _mm512_shuffle_f64x2(kernel.packet[0], kernel.packet[0], 0x4E);
+  T4 = _mm512_mask_blend_pd(0xF0, T4, kernel.packet[4]);
+  T1 = _mm512_shuffle_f64x2(kernel.packet[5], kernel.packet[5], 0x4E);
+  T1 = _mm512_mask_blend_pd(0xF0, kernel.packet[1], T1);
+  T5 = _mm512_shuffle_f64x2(kernel.packet[1], kernel.packet[1], 0x4E);
+  T5 = _mm512_mask_blend_pd(0xF0, T5, kernel.packet[5]);
+  T2 = _mm512_shuffle_f64x2(kernel.packet[6], kernel.packet[6], 0x4E);
+  T2 = _mm512_mask_blend_pd(0xF0, kernel.packet[2], T2);
+  T6 = _mm512_shuffle_f64x2(kernel.packet[2], kernel.packet[2], 0x4E);
+  T6 = _mm512_mask_blend_pd(0xF0, T6, kernel.packet[6]);
+  T3 = _mm512_shuffle_f64x2(kernel.packet[7], kernel.packet[7], 0x4E);
+  T3 = _mm512_mask_blend_pd(0xF0, kernel.packet[3], T3);
+  T7 = _mm512_shuffle_f64x2(kernel.packet[3], kernel.packet[3], 0x4E);
+  T7 = _mm512_mask_blend_pd(0xF0, T7, kernel.packet[7]);
+
+  kernel.packet[0] = T0;
+  kernel.packet[1] = T1;
+  kernel.packet[2] = T2;
+  kernel.packet[3] = T3;
+  kernel.packet[4] = T4;
+  kernel.packet[5] = T5;
+  kernel.packet[6] = T6;
+  kernel.packet[7] = T7;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8l, 4>& kernel) {
+  __m512i T0 = _mm512_castpd_si512(
+      _mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[0]), _mm512_castsi512_pd(kernel.packet[1]), 0));
+  __m512i T1 = _mm512_castpd_si512(
+      _mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[0]), _mm512_castsi512_pd(kernel.packet[1]), 0xff));
+  __m512i T2 = _mm512_castpd_si512(
+      _mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[2]), _mm512_castsi512_pd(kernel.packet[3]), 0));
+  __m512i T3 = _mm512_castpd_si512(
+      _mm512_shuffle_pd(_mm512_castsi512_pd(kernel.packet[2]), _mm512_castsi512_pd(kernel.packet[3]), 0xff));
+
+  PacketBlock<Packet4l, 8> tmp;
+
+  tmp.packet[0] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 0), _mm512_extracti64x4_epi64(T2, 0), 0x20);
+  tmp.packet[1] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 0), _mm512_extracti64x4_epi64(T3, 0), 0x20);
+  tmp.packet[2] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 0), _mm512_extracti64x4_epi64(T2, 0), 0x31);
+  tmp.packet[3] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 0), _mm512_extracti64x4_epi64(T3, 0), 0x31);
+
+  tmp.packet[4] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 1), _mm512_extracti64x4_epi64(T2, 1), 0x20);
+  tmp.packet[5] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 1), _mm512_extracti64x4_epi64(T3, 1), 0x20);
+  tmp.packet[6] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T0, 1), _mm512_extracti64x4_epi64(T2, 1), 0x31);
+  tmp.packet[7] = _mm256_permute2x128_si256(_mm512_extracti64x4_epi64(T1, 1), _mm512_extracti64x4_epi64(T3, 1), 0x31);
+
+  PACK_OUTPUT_L(kernel.packet, tmp.packet, 0, 1);
+  PACK_OUTPUT_L(kernel.packet, tmp.packet, 1, 1);
+  PACK_OUTPUT_L(kernel.packet, tmp.packet, 2, 1);
+  PACK_OUTPUT_L(kernel.packet, tmp.packet, 3, 1);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8l, 8>& kernel) {
+  __m512i T0 = _mm512_unpacklo_epi64(kernel.packet[0], kernel.packet[1]);
+  __m512i T1 = _mm512_unpackhi_epi64(kernel.packet[0], kernel.packet[1]);
+  __m512i T2 = _mm512_unpacklo_epi64(kernel.packet[2], kernel.packet[3]);
+  __m512i T3 = _mm512_unpackhi_epi64(kernel.packet[2], kernel.packet[3]);
+  __m512i T4 = _mm512_unpacklo_epi64(kernel.packet[4], kernel.packet[5]);
+  __m512i T5 = _mm512_unpackhi_epi64(kernel.packet[4], kernel.packet[5]);
+  __m512i T6 = _mm512_unpacklo_epi64(kernel.packet[6], kernel.packet[7]);
+  __m512i T7 = _mm512_unpackhi_epi64(kernel.packet[6], kernel.packet[7]);
+
+  kernel.packet[0] = _mm512_permutex_epi64(T2, 0x4E);
+  kernel.packet[0] = _mm512_mask_blend_epi64(0xCC, T0, kernel.packet[0]);
+  kernel.packet[2] = _mm512_permutex_epi64(T0, 0x4E);
+  kernel.packet[2] = _mm512_mask_blend_epi64(0xCC, kernel.packet[2], T2);
+  kernel.packet[1] = _mm512_permutex_epi64(T3, 0x4E);
+  kernel.packet[1] = _mm512_mask_blend_epi64(0xCC, T1, kernel.packet[1]);
+  kernel.packet[3] = _mm512_permutex_epi64(T1, 0x4E);
+  kernel.packet[3] = _mm512_mask_blend_epi64(0xCC, kernel.packet[3], T3);
+  kernel.packet[4] = _mm512_permutex_epi64(T6, 0x4E);
+  kernel.packet[4] = _mm512_mask_blend_epi64(0xCC, T4, kernel.packet[4]);
+  kernel.packet[6] = _mm512_permutex_epi64(T4, 0x4E);
+  kernel.packet[6] = _mm512_mask_blend_epi64(0xCC, kernel.packet[6], T6);
+  kernel.packet[5] = _mm512_permutex_epi64(T7, 0x4E);
+  kernel.packet[5] = _mm512_mask_blend_epi64(0xCC, T5, kernel.packet[5]);
+  kernel.packet[7] = _mm512_permutex_epi64(T5, 0x4E);
+  kernel.packet[7] = _mm512_mask_blend_epi64(0xCC, kernel.packet[7], T7);
+
+  T0 = _mm512_shuffle_i64x2(kernel.packet[4], kernel.packet[4], 0x4E);
+  T0 = _mm512_mask_blend_epi64(0xF0, kernel.packet[0], T0);
+  T4 = _mm512_shuffle_i64x2(kernel.packet[0], kernel.packet[0], 0x4E);
+  T4 = _mm512_mask_blend_epi64(0xF0, T4, kernel.packet[4]);
+  T1 = _mm512_shuffle_i64x2(kernel.packet[5], kernel.packet[5], 0x4E);
+  T1 = _mm512_mask_blend_epi64(0xF0, kernel.packet[1], T1);
+  T5 = _mm512_shuffle_i64x2(kernel.packet[1], kernel.packet[1], 0x4E);
+  T5 = _mm512_mask_blend_epi64(0xF0, T5, kernel.packet[5]);
+  T2 = _mm512_shuffle_i64x2(kernel.packet[6], kernel.packet[6], 0x4E);
+  T2 = _mm512_mask_blend_epi64(0xF0, kernel.packet[2], T2);
+  T6 = _mm512_shuffle_i64x2(kernel.packet[2], kernel.packet[2], 0x4E);
+  T6 = _mm512_mask_blend_epi64(0xF0, T6, kernel.packet[6]);
+  T3 = _mm512_shuffle_i64x2(kernel.packet[7], kernel.packet[7], 0x4E);
+  T3 = _mm512_mask_blend_epi64(0xF0, kernel.packet[3], T3);
+  T7 = _mm512_shuffle_i64x2(kernel.packet[3], kernel.packet[3], 0x4E);
+  T7 = _mm512_mask_blend_epi64(0xF0, T7, kernel.packet[7]);
+
+  kernel.packet[0] = T0;
+  kernel.packet[1] = T1;
+  kernel.packet[2] = T2;
+  kernel.packet[3] = T3;
+  kernel.packet[4] = T4;
+  kernel.packet[5] = T5;
+  kernel.packet[6] = T6;
+  kernel.packet[7] = T7;
+}
+
+#define PACK_OUTPUT_I32(OUTPUT, INPUT, INDEX, STRIDE) \
+  EIGEN_INSERT_8i_INTO_16i(OUTPUT[INDEX], INPUT[INDEX], INPUT[INDEX + STRIDE]);
+
+#define PACK_OUTPUT_I32_2(OUTPUT, INPUT, INDEX, STRIDE) \
+  EIGEN_INSERT_8i_INTO_16i(OUTPUT[INDEX], INPUT[2 * INDEX], INPUT[2 * INDEX + STRIDE]);
+
+#define SHUFFLE_EPI32(A, B, M) _mm512_castps_si512(_mm512_shuffle_ps(_mm512_castsi512_ps(A), _mm512_castsi512_ps(B), M))
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16i, 16>& kernel) {
+  __m512i T0 = _mm512_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
+  __m512i T1 = _mm512_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
+  __m512i T2 = _mm512_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
+  __m512i T3 = _mm512_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
+  __m512i T4 = _mm512_unpacklo_epi32(kernel.packet[4], kernel.packet[5]);
+  __m512i T5 = _mm512_unpackhi_epi32(kernel.packet[4], kernel.packet[5]);
+  __m512i T6 = _mm512_unpacklo_epi32(kernel.packet[6], kernel.packet[7]);
+  __m512i T7 = _mm512_unpackhi_epi32(kernel.packet[6], kernel.packet[7]);
+  __m512i T8 = _mm512_unpacklo_epi32(kernel.packet[8], kernel.packet[9]);
+  __m512i T9 = _mm512_unpackhi_epi32(kernel.packet[8], kernel.packet[9]);
+  __m512i T10 = _mm512_unpacklo_epi32(kernel.packet[10], kernel.packet[11]);
+  __m512i T11 = _mm512_unpackhi_epi32(kernel.packet[10], kernel.packet[11]);
+  __m512i T12 = _mm512_unpacklo_epi32(kernel.packet[12], kernel.packet[13]);
+  __m512i T13 = _mm512_unpackhi_epi32(kernel.packet[12], kernel.packet[13]);
+  __m512i T14 = _mm512_unpacklo_epi32(kernel.packet[14], kernel.packet[15]);
+  __m512i T15 = _mm512_unpackhi_epi32(kernel.packet[14], kernel.packet[15]);
+  __m512i S0 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512i S1 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512i S2 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512i S3 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512i S4 = SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512i S5 = SHUFFLE_EPI32(T4, T6, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512i S6 = SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512i S7 = SHUFFLE_EPI32(T5, T7, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512i S8 = SHUFFLE_EPI32(T8, T10, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512i S9 = SHUFFLE_EPI32(T8, T10, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512i S10 = SHUFFLE_EPI32(T9, T11, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512i S11 = SHUFFLE_EPI32(T9, T11, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512i S12 = SHUFFLE_EPI32(T12, T14, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512i S13 = SHUFFLE_EPI32(T12, T14, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512i S14 = SHUFFLE_EPI32(T13, T15, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512i S15 = SHUFFLE_EPI32(T13, T15, _MM_SHUFFLE(3, 2, 3, 2));
+
+  EIGEN_EXTRACT_8i_FROM_16i(S0, S0);
+  EIGEN_EXTRACT_8i_FROM_16i(S1, S1);
+  EIGEN_EXTRACT_8i_FROM_16i(S2, S2);
+  EIGEN_EXTRACT_8i_FROM_16i(S3, S3);
+  EIGEN_EXTRACT_8i_FROM_16i(S4, S4);
+  EIGEN_EXTRACT_8i_FROM_16i(S5, S5);
+  EIGEN_EXTRACT_8i_FROM_16i(S6, S6);
+  EIGEN_EXTRACT_8i_FROM_16i(S7, S7);
+  EIGEN_EXTRACT_8i_FROM_16i(S8, S8);
+  EIGEN_EXTRACT_8i_FROM_16i(S9, S9);
+  EIGEN_EXTRACT_8i_FROM_16i(S10, S10);
+  EIGEN_EXTRACT_8i_FROM_16i(S11, S11);
+  EIGEN_EXTRACT_8i_FROM_16i(S12, S12);
+  EIGEN_EXTRACT_8i_FROM_16i(S13, S13);
+  EIGEN_EXTRACT_8i_FROM_16i(S14, S14);
+  EIGEN_EXTRACT_8i_FROM_16i(S15, S15);
+
+  PacketBlock<Packet8i, 32> tmp;
+
+  tmp.packet[0] = _mm256_permute2f128_si256(S0_0, S4_0, 0x20);
+  tmp.packet[1] = _mm256_permute2f128_si256(S1_0, S5_0, 0x20);
+  tmp.packet[2] = _mm256_permute2f128_si256(S2_0, S6_0, 0x20);
+  tmp.packet[3] = _mm256_permute2f128_si256(S3_0, S7_0, 0x20);
+  tmp.packet[4] = _mm256_permute2f128_si256(S0_0, S4_0, 0x31);
+  tmp.packet[5] = _mm256_permute2f128_si256(S1_0, S5_0, 0x31);
+  tmp.packet[6] = _mm256_permute2f128_si256(S2_0, S6_0, 0x31);
+  tmp.packet[7] = _mm256_permute2f128_si256(S3_0, S7_0, 0x31);
+
+  tmp.packet[8] = _mm256_permute2f128_si256(S0_1, S4_1, 0x20);
+  tmp.packet[9] = _mm256_permute2f128_si256(S1_1, S5_1, 0x20);
+  tmp.packet[10] = _mm256_permute2f128_si256(S2_1, S6_1, 0x20);
+  tmp.packet[11] = _mm256_permute2f128_si256(S3_1, S7_1, 0x20);
+  tmp.packet[12] = _mm256_permute2f128_si256(S0_1, S4_1, 0x31);
+  tmp.packet[13] = _mm256_permute2f128_si256(S1_1, S5_1, 0x31);
+  tmp.packet[14] = _mm256_permute2f128_si256(S2_1, S6_1, 0x31);
+  tmp.packet[15] = _mm256_permute2f128_si256(S3_1, S7_1, 0x31);
+
+  // Second set of _m256 outputs
+  tmp.packet[16] = _mm256_permute2f128_si256(S8_0, S12_0, 0x20);
+  tmp.packet[17] = _mm256_permute2f128_si256(S9_0, S13_0, 0x20);
+  tmp.packet[18] = _mm256_permute2f128_si256(S10_0, S14_0, 0x20);
+  tmp.packet[19] = _mm256_permute2f128_si256(S11_0, S15_0, 0x20);
+  tmp.packet[20] = _mm256_permute2f128_si256(S8_0, S12_0, 0x31);
+  tmp.packet[21] = _mm256_permute2f128_si256(S9_0, S13_0, 0x31);
+  tmp.packet[22] = _mm256_permute2f128_si256(S10_0, S14_0, 0x31);
+  tmp.packet[23] = _mm256_permute2f128_si256(S11_0, S15_0, 0x31);
+
+  tmp.packet[24] = _mm256_permute2f128_si256(S8_1, S12_1, 0x20);
+  tmp.packet[25] = _mm256_permute2f128_si256(S9_1, S13_1, 0x20);
+  tmp.packet[26] = _mm256_permute2f128_si256(S10_1, S14_1, 0x20);
+  tmp.packet[27] = _mm256_permute2f128_si256(S11_1, S15_1, 0x20);
+  tmp.packet[28] = _mm256_permute2f128_si256(S8_1, S12_1, 0x31);
+  tmp.packet[29] = _mm256_permute2f128_si256(S9_1, S13_1, 0x31);
+  tmp.packet[30] = _mm256_permute2f128_si256(S10_1, S14_1, 0x31);
+  tmp.packet[31] = _mm256_permute2f128_si256(S11_1, S15_1, 0x31);
+
+  // Pack them into the output
+  PACK_OUTPUT_I32(kernel.packet, tmp.packet, 0, 16);
+  PACK_OUTPUT_I32(kernel.packet, tmp.packet, 1, 16);
+  PACK_OUTPUT_I32(kernel.packet, tmp.packet, 2, 16);
+  PACK_OUTPUT_I32(kernel.packet, tmp.packet, 3, 16);
+
+  PACK_OUTPUT_I32(kernel.packet, tmp.packet, 4, 16);
+  PACK_OUTPUT_I32(kernel.packet, tmp.packet, 5, 16);
+  PACK_OUTPUT_I32(kernel.packet, tmp.packet, 6, 16);
+  PACK_OUTPUT_I32(kernel.packet, tmp.packet, 7, 16);
+
+  PACK_OUTPUT_I32(kernel.packet, tmp.packet, 8, 16);
+  PACK_OUTPUT_I32(kernel.packet, tmp.packet, 9, 16);
+  PACK_OUTPUT_I32(kernel.packet, tmp.packet, 10, 16);
+  PACK_OUTPUT_I32(kernel.packet, tmp.packet, 11, 16);
+
+  PACK_OUTPUT_I32(kernel.packet, tmp.packet, 12, 16);
+  PACK_OUTPUT_I32(kernel.packet, tmp.packet, 13, 16);
+  PACK_OUTPUT_I32(kernel.packet, tmp.packet, 14, 16);
+  PACK_OUTPUT_I32(kernel.packet, tmp.packet, 15, 16);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16i, 4>& kernel) {
+  __m512i T0 = _mm512_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
+  __m512i T1 = _mm512_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
+  __m512i T2 = _mm512_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
+  __m512i T3 = _mm512_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
+
+  __m512i S0 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512i S1 = SHUFFLE_EPI32(T0, T2, _MM_SHUFFLE(3, 2, 3, 2));
+  __m512i S2 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(1, 0, 1, 0));
+  __m512i S3 = SHUFFLE_EPI32(T1, T3, _MM_SHUFFLE(3, 2, 3, 2));
+
+  EIGEN_EXTRACT_8i_FROM_16i(S0, S0);
+  EIGEN_EXTRACT_8i_FROM_16i(S1, S1);
+  EIGEN_EXTRACT_8i_FROM_16i(S2, S2);
+  EIGEN_EXTRACT_8i_FROM_16i(S3, S3);
+
+  PacketBlock<Packet8i, 8> tmp;
+
+  tmp.packet[0] = _mm256_permute2f128_si256(S0_0, S1_0, 0x20);
+  tmp.packet[1] = _mm256_permute2f128_si256(S2_0, S3_0, 0x20);
+  tmp.packet[2] = _mm256_permute2f128_si256(S0_0, S1_0, 0x31);
+  tmp.packet[3] = _mm256_permute2f128_si256(S2_0, S3_0, 0x31);
+
+  tmp.packet[4] = _mm256_permute2f128_si256(S0_1, S1_1, 0x20);
+  tmp.packet[5] = _mm256_permute2f128_si256(S2_1, S3_1, 0x20);
+  tmp.packet[6] = _mm256_permute2f128_si256(S0_1, S1_1, 0x31);
+  tmp.packet[7] = _mm256_permute2f128_si256(S2_1, S3_1, 0x31);
+
+  PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 0, 1);
+  PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 1, 1);
+  PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 2, 1);
+  PACK_OUTPUT_I32_2(kernel.packet, tmp.packet, 3, 1);
+}
+
+template <size_t N>
+EIGEN_STRONG_INLINE int avx512_blend_mask(const Selector<N>& ifPacket) {
+  alignas(__m128i) uint8_t aux[sizeof(__m128i)];
+  for (size_t i = 0; i < N; i++) aux[i] = static_cast<uint8_t>(ifPacket.select[i]);
+  __m128i paux = _mm_sub_epi8(_mm_setzero_si128(), _mm_load_si128(reinterpret_cast<const __m128i*>(aux)));
+  return _mm_movemask_epi8(paux);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& ifPacket, const Packet16f& thenPacket,
+                                     const Packet16f& elsePacket) {
+  __mmask16 m = avx512_blend_mask(ifPacket);
+  return _mm512_mask_blend_ps(m, elsePacket, thenPacket);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket, const Packet8d& thenPacket,
+                                    const Packet8d& elsePacket) {
+  __mmask8 m = avx512_blend_mask(ifPacket);
+  return _mm512_mask_blend_pd(m, elsePacket, thenPacket);
+}
+
+// Packet math for Eigen::half
+#ifndef EIGEN_VECTORIZE_AVX512FP16
+template <>
+EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
+  return _mm256_set1_epi16(from.x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
+  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from, 0)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
+  return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
+  return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
+  // (void*) -> workaround clang warning:
+  // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
+  EIGEN_DEBUG_ALIGNED_STORE
+  _mm256_store_si256((__m256i*)(void*)to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
+  // (void*) -> workaround clang warning:
+  // cast from 'Eigen::half *' to '__m256i *' increases required alignment from 2 to 32
+  EIGEN_DEBUG_UNALIGNED_STORE
+  _mm256_storeu_si256((__m256i*)(void*)to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h ploaddup<Packet16h>(const Eigen::half* from) {
+  unsigned short a = from[0].x;
+  unsigned short b = from[1].x;
+  unsigned short c = from[2].x;
+  unsigned short d = from[3].x;
+  unsigned short e = from[4].x;
+  unsigned short f = from[5].x;
+  unsigned short g = from[6].x;
+  unsigned short h = from[7].x;
+  return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h ploadquad(const Eigen::half* from) {
+  unsigned short a = from[0].x;
+  unsigned short b = from[1].x;
+  unsigned short c = from[2].x;
+  unsigned short d = from[3].x;
+  return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
+}
+
+EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { return _mm512_cvtph_ps(a); }
+
+EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
+  return _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h ptrue(const Packet16h& a) {
+  return Packet16h(ptrue(Packet8i(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pabs(const Packet16h& a) {
+  const __m256i sign_mask = _mm256_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm256_andnot_si256(sign_mask, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return float2half(pmin<Packet16f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return float2half(pmax<Packet16f>(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h plset<Packet16h>(const half& a) {
+  return float2half(plset<Packet16f>(static_cast<float>(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a, const Packet16h& b) {
+  // in some cases Packet8i is a wrapper around __m256i, so we need to
+  // cast to Packet8i to call the correct overload.
+  return Packet16h(por(Packet8i(a), Packet8i(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a, const Packet16h& b) {
+  return Packet16h(pxor(Packet8i(a), Packet8i(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a, const Packet16h& b) {
+  return Packet16h(pand(Packet8i(a), Packet8i(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a, const Packet16h& b) {
+  return Packet16h(pandnot(Packet8i(a), Packet8i(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) {
+  return _mm256_blendv_epi8(b, a, mask);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(const Packet16h& a) {
+  return float2half(pround<Packet16f>(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h print<Packet16h>(const Packet16h& a) {
+  return float2half(print<Packet16f>(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(const Packet16h& a) {
+  return float2half(pceil<Packet16f>(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) {
+  return float2half(pfloor<Packet16f>(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h ptrunc<Packet16h>(const Packet16h& a) {
+  return float2half(ptrunc<Packet16f>(half2float(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  return Pack32To16(pcmp_eq(af, bf));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a, const Packet16h& b) {
+  return Pack32To16(pcmp_le(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a, const Packet16h& b) {
+  return Pack32To16(pcmp_lt(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a, const Packet16h& b) {
+  return Pack32To16(pcmp_lt_or_nan(half2float(a), half2float(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pconj(const Packet16h& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnegate(const Packet16h& a) {
+  Packet16h sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
+  return _mm256_xor_si256(a, sign_mask);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = padd(af, bf);
+  return float2half(rf);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = psub(af, bf);
+  return float2half(rf);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = pmul(af, bf);
+  return float2half(rf);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  Packet16f af = half2float(a);
+  Packet16f bf = half2float(b);
+  Packet16f rf = pdiv(af, bf);
+  return float2half(rf);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmadd<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return float2half(pmadd(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmsub<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return float2half(pmsub(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnmadd<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return float2half(pnmadd(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnmsub<Packet16h>(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return float2half(pnmsub(half2float(a), half2float(b), half2float(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
+  Packet8h lane0 = _mm256_extractf128_si256(a, 0);
+  Packet8h lane1 = _mm256_extractf128_si256(a, 1);
+  return padd<Packet8h>(lane0, lane1);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) {
+  __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+  return _mm256_insertf128_si256(_mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(a, 1), m)),
+                                 _mm_shuffle_epi8(_mm256_extractf128_si256(a, 0), m), 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride) {
+  return _mm256_set_epi16(from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
+                          from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x,
+                          from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x,
+                          from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride) {
+  EIGEN_ALIGN64 half aux[16];
+  pstore(aux, from);
+  to[stride * 0] = aux[0];
+  to[stride * 1] = aux[1];
+  to[stride * 2] = aux[2];
+  to[stride * 3] = aux[3];
+  to[stride * 4] = aux[4];
+  to[stride * 5] = aux[5];
+  to[stride * 6] = aux[6];
+  to[stride * 7] = aux[7];
+  to[stride * 8] = aux[8];
+  to[stride * 9] = aux[9];
+  to[stride * 10] = aux[10];
+  to[stride * 11] = aux[11];
+  to[stride * 12] = aux[12];
+  to[stride * 13] = aux[13];
+  to[stride * 14] = aux[14];
+  to[stride * 15] = aux[15];
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 16>& kernel) {
+  __m256i a = kernel.packet[0];
+  __m256i b = kernel.packet[1];
+  __m256i c = kernel.packet[2];
+  __m256i d = kernel.packet[3];
+  __m256i e = kernel.packet[4];
+  __m256i f = kernel.packet[5];
+  __m256i g = kernel.packet[6];
+  __m256i h = kernel.packet[7];
+  __m256i i = kernel.packet[8];
+  __m256i j = kernel.packet[9];
+  __m256i k = kernel.packet[10];
+  __m256i l = kernel.packet[11];
+  __m256i m = kernel.packet[12];
+  __m256i n = kernel.packet[13];
+  __m256i o = kernel.packet[14];
+  __m256i p = kernel.packet[15];
+
+  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
+  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
+  __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
+  __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
+  __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
+  __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
+  __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
+  __m256i op_07 = _mm256_unpacklo_epi16(o, p);
+
+  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
+  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
+  __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
+  __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
+  __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
+  __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
+  __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
+  __m256i op_8f = _mm256_unpackhi_epi16(o, p);
+
+  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
+  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
+  __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
+  __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
+  __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
+  __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
+  __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
+  __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
+
+  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
+  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
+  __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
+  __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
+  __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
+  __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
+  __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
+  __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
+
+  __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
+  __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
+  __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
+  __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
+  __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
+  __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
+  __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
+  __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
+  __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
+  __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
+  __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
+  __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
+  __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
+  __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
+  __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
+  __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
+
+  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
+  __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
+  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
+  __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
+
+  kernel.packet[0] = a_p_0;
+  kernel.packet[1] = a_p_1;
+  kernel.packet[2] = a_p_2;
+  kernel.packet[3] = a_p_3;
+  kernel.packet[4] = a_p_4;
+  kernel.packet[5] = a_p_5;
+  kernel.packet[6] = a_p_6;
+  kernel.packet[7] = a_p_7;
+  kernel.packet[8] = a_p_8;
+  kernel.packet[9] = a_p_9;
+  kernel.packet[10] = a_p_a;
+  kernel.packet[11] = a_p_b;
+  kernel.packet[12] = a_p_c;
+  kernel.packet[13] = a_p_d;
+  kernel.packet[14] = a_p_e;
+  kernel.packet[15] = a_p_f;
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 8>& kernel) {
+  EIGEN_ALIGN64 half in[8][16];
+  pstore<half>(in[0], kernel.packet[0]);
+  pstore<half>(in[1], kernel.packet[1]);
+  pstore<half>(in[2], kernel.packet[2]);
+  pstore<half>(in[3], kernel.packet[3]);
+  pstore<half>(in[4], kernel.packet[4]);
+  pstore<half>(in[5], kernel.packet[5]);
+  pstore<half>(in[6], kernel.packet[6]);
+  pstore<half>(in[7], kernel.packet[7]);
+
+  EIGEN_ALIGN64 half out[8][16];
+
+  for (int i = 0; i < 8; ++i) {
+    for (int j = 0; j < 8; ++j) {
+      out[i][j] = in[j][2 * i];
+    }
+    for (int j = 0; j < 8; ++j) {
+      out[i][j + 8] = in[j][2 * i + 1];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet16h>(out[0]);
+  kernel.packet[1] = pload<Packet16h>(out[1]);
+  kernel.packet[2] = pload<Packet16h>(out[2]);
+  kernel.packet[3] = pload<Packet16h>(out[3]);
+  kernel.packet[4] = pload<Packet16h>(out[4]);
+  kernel.packet[5] = pload<Packet16h>(out[5]);
+  kernel.packet[6] = pload<Packet16h>(out[6]);
+  kernel.packet[7] = pload<Packet16h>(out[7]);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 4>& kernel) {
+  EIGEN_ALIGN64 half in[4][16];
+  pstore<half>(in[0], kernel.packet[0]);
+  pstore<half>(in[1], kernel.packet[1]);
+  pstore<half>(in[2], kernel.packet[2]);
+  pstore<half>(in[3], kernel.packet[3]);
+
+  EIGEN_ALIGN64 half out[4][16];
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      out[i][j] = in[j][4 * i];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j + 4] = in[j][4 * i + 1];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j + 8] = in[j][4 * i + 2];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j + 12] = in[j][4 * i + 3];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet16h>(out[0]);
+  kernel.packet[1] = pload<Packet16h>(out[1]);
+  kernel.packet[2] = pload<Packet16h>(out[2]);
+  kernel.packet[3] = pload<Packet16h>(out[3]);
+}
+
+#endif  // EIGEN_VECTORIZE_AVX512FP16
+
+template <>
+struct is_arithmetic<Packet16bf> {
+  enum { value = true };
+};
+
+template <>
+struct packet_traits<bfloat16> : default_packet_traits {
+  typedef Packet16bf type;
+  typedef Packet8bf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+    HasBlend = 0,
+    HasInsert = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+    HasLog = 1,  // Currently fails test with bad accuracy.
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasNdtri = 1,
+    HasBessel = 1,
+#endif
+    HasExp = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasCmp = 1,
+    HasDiv = 1
+  };
+};
+
+template <>
+struct unpacket_traits<Packet16bf> {
+  typedef bfloat16 type;
+  enum {
+    size = 16,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet8bf half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pset1<Packet16bf>(const bfloat16& from) {
+  return _mm256_set1_epi16(from.value);
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 pfirst<Packet16bf>(const Packet16bf& from) {
+  bfloat16 t;
+  t.value = static_cast<unsigned short>(_mm256_extract_epi16(from, 0));
+  return t;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pload<Packet16bf>(const bfloat16* from) {
+  return _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf ploadu<Packet16bf>(const bfloat16* from) {
+  return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet16bf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE
+  _mm256_store_si256(reinterpret_cast<__m256i*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet16bf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE
+  _mm256_storeu_si256(reinterpret_cast<__m256i*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf ploaddup<Packet16bf>(const bfloat16* from) {
+  unsigned short a = from[0].value;
+  unsigned short b = from[1].value;
+  unsigned short c = from[2].value;
+  unsigned short d = from[3].value;
+  unsigned short e = from[4].value;
+  unsigned short f = from[5].value;
+  unsigned short g = from[6].value;
+  unsigned short h = from[7].value;
+  return _mm256_set_epi16(h, h, g, g, f, f, e, e, d, d, c, c, b, b, a, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf ploadquad(const bfloat16* from) {
+  unsigned short a = from[0].value;
+  unsigned short b = from[1].value;
+  unsigned short c = from[2].value;
+  unsigned short d = from[3].value;
+  return _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
+}
+
+EIGEN_STRONG_INLINE Packet16f Bf16ToF32(const Packet16bf& a) {
+  return _mm512_castsi512_ps(_mm512_slli_epi32(_mm512_cvtepu16_epi32(a), 16));
+}
+
+// Convert float to bfloat16 according to round-to-nearest-even/denormals algorithm.
+EIGEN_STRONG_INLINE Packet16bf F32ToBf16(const Packet16f& a) {
+  Packet16bf r;
+
+#if defined(EIGEN_VECTORIZE_AVX512BF16) && EIGEN_GNUC_STRICT_AT_LEAST(10, 1, 0)
+  // Since GCC 10.1 supports avx512bf16 and C style explicit cast
+  // (C++ static_cast is not supported yet), do conversion via intrinsic
+  // and register path for performance.
+  r = (__m256i)(_mm512_cvtneps_pbh(a));
+
+#else
+  __m512i t;
+  __m512i input = _mm512_castps_si512(a);
+  __m512i nan = _mm512_set1_epi32(0x7fc0);
+
+  // uint32_t lsb = (input >> 16) & 1;
+  t = _mm512_and_si512(_mm512_srli_epi32(input, 16), _mm512_set1_epi32(1));
+  // uint32_t rounding_bias = 0x7fff + lsb;
+  t = _mm512_add_epi32(t, _mm512_set1_epi32(0x7fff));
+  // input += rounding_bias;
+  t = _mm512_add_epi32(t, input);
+  // input = input >> 16;
+  t = _mm512_srli_epi32(t, 16);
+
+  // Check NaN before converting back to bf16
+  __mmask16 mask = _mm512_cmp_ps_mask(a, a, _CMP_ORD_Q);
+
+  t = _mm512_mask_blend_epi32(mask, nan, t);
+  // output.value = static_cast<uint16_t>(input);
+  r = _mm512_cvtepi32_epi16(t);
+#endif  // EIGEN_VECTORIZE_AVX512BF16
+
+  return r;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf ptrue(const Packet16bf& a) {
+  return Packet16bf(ptrue<Packet8i>(Packet8i(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf por(const Packet16bf& a, const Packet16bf& b) {
+  return Packet16bf(por<Packet8i>(Packet8i(a), Packet8i(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pxor(const Packet16bf& a, const Packet16bf& b) {
+  return Packet16bf(pxor<Packet8i>(Packet8i(a), Packet8i(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pand(const Packet16bf& a, const Packet16bf& b) {
+  return Packet16bf(pand<Packet8i>(Packet8i(a), Packet8i(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pandnot(const Packet16bf& a, const Packet16bf& b) {
+  return Packet16bf(pandnot<Packet8i>(Packet8i(a), Packet8i(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pselect(const Packet16bf& mask, const Packet16bf& a, const Packet16bf& b) {
+  // Input mask is expected to be all 0/1, handle it with 8-bit
+  // intrinsic for performance.
+  return _mm256_blendv_epi8(b, a, mask);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pround<Packet16bf>(const Packet16bf& a) {
+  return F32ToBf16(pround<Packet16f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf print<Packet16bf>(const Packet16bf& a) {
+  return F32ToBf16(print<Packet16f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pceil<Packet16bf>(const Packet16bf& a) {
+  return F32ToBf16(pceil<Packet16f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pfloor<Packet16bf>(const Packet16bf& a) {
+  return F32ToBf16(pfloor<Packet16f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf ptrunc<Packet16bf>(const Packet16bf& a) {
+  return F32ToBf16(ptrunc<Packet16f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pcmp_eq(const Packet16bf& a, const Packet16bf& b) {
+  return Pack32To16(pcmp_eq(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pcmp_le(const Packet16bf& a, const Packet16bf& b) {
+  return Pack32To16(pcmp_le(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pcmp_lt(const Packet16bf& a, const Packet16bf& b) {
+  return Pack32To16(pcmp_lt(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pcmp_lt_or_nan(const Packet16bf& a, const Packet16bf& b) {
+  return Pack32To16(pcmp_lt_or_nan(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pnegate(const Packet16bf& a) {
+  Packet16bf sign_mask = _mm256_set1_epi16(static_cast<unsigned short>(0x8000));
+  return _mm256_xor_si256(a, sign_mask);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pconj(const Packet16bf& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pabs(const Packet16bf& a) {
+  const __m256i sign_mask = _mm256_set1_epi16(static_cast<numext::uint16_t>(0x8000));
+  return _mm256_andnot_si256(sign_mask, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf padd<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
+  return F32ToBf16(padd<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf psub<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
+  return F32ToBf16(psub<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pmul<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
+  return F32ToBf16(pmul(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pmadd<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
+  return F32ToBf16(pmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pmsub<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
+  return F32ToBf16(pmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pnmadd<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
+  return F32ToBf16(pnmadd(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pnmsub<Packet16bf>(const Packet16bf& a, const Packet16bf& b, const Packet16bf& c) {
+  return F32ToBf16(pnmsub(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pdiv<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
+  return F32ToBf16(pdiv<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pmin<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
+  return F32ToBf16(pmin<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pmax<Packet16bf>(const Packet16bf& a, const Packet16bf& b) {
+  return F32ToBf16(pmax<Packet16f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf plset<Packet16bf>(const bfloat16& a) {
+  return F32ToBf16(plset<Packet16f>(static_cast<float>(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf predux_half_dowto4<Packet16bf>(const Packet16bf& a) {
+  Packet8bf lane0 = _mm256_extractf128_si256(a, 0);
+  Packet8bf lane1 = _mm256_extractf128_si256(a, 1);
+  return padd<Packet8bf>(lane0, lane1);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf preverse(const Packet16bf& a) {
+  __m256i m = _mm256_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7,
+                               4, 5, 2, 3, 0, 1);
+
+  Packet16bf res;
+  // Swap hi and lo first because shuffle is in 128-bit lanes.
+  res = _mm256_permute2x128_si256(a, a, 1);
+  // Shuffle 8-bit values in src within 2*128-bit lanes.
+  return _mm256_shuffle_epi8(res, m);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pgather<bfloat16, Packet16bf>(const bfloat16* from, Index stride) {
+  return _mm256_set_epi16(
+      from[15 * stride].value, from[14 * stride].value, from[13 * stride].value, from[12 * stride].value,
+      from[11 * stride].value, from[10 * stride].value, from[9 * stride].value, from[8 * stride].value,
+      from[7 * stride].value, from[6 * stride].value, from[5 * stride].value, from[4 * stride].value,
+      from[3 * stride].value, from[2 * stride].value, from[1 * stride].value, from[0 * stride].value);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet16bf>(bfloat16* to, const Packet16bf& from, Index stride) {
+  EIGEN_ALIGN64 bfloat16 aux[16];
+  pstore(aux, from);
+  to[stride * 0] = aux[0];
+  to[stride * 1] = aux[1];
+  to[stride * 2] = aux[2];
+  to[stride * 3] = aux[3];
+  to[stride * 4] = aux[4];
+  to[stride * 5] = aux[5];
+  to[stride * 6] = aux[6];
+  to[stride * 7] = aux[7];
+  to[stride * 8] = aux[8];
+  to[stride * 9] = aux[9];
+  to[stride * 10] = aux[10];
+  to[stride * 11] = aux[11];
+  to[stride * 12] = aux[12];
+  to[stride * 13] = aux[13];
+  to[stride * 14] = aux[14];
+  to[stride * 15] = aux[15];
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf, 16>& kernel) {
+  __m256i a = kernel.packet[0];
+  __m256i b = kernel.packet[1];
+  __m256i c = kernel.packet[2];
+  __m256i d = kernel.packet[3];
+  __m256i e = kernel.packet[4];
+  __m256i f = kernel.packet[5];
+  __m256i g = kernel.packet[6];
+  __m256i h = kernel.packet[7];
+  __m256i i = kernel.packet[8];
+  __m256i j = kernel.packet[9];
+  __m256i k = kernel.packet[10];
+  __m256i l = kernel.packet[11];
+  __m256i m = kernel.packet[12];
+  __m256i n = kernel.packet[13];
+  __m256i o = kernel.packet[14];
+  __m256i p = kernel.packet[15];
+
+  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
+  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
+  __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
+  __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
+  __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
+  __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
+  __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
+  __m256i op_07 = _mm256_unpacklo_epi16(o, p);
+
+  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
+  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
+  __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
+  __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
+  __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
+  __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
+  __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
+  __m256i op_8f = _mm256_unpackhi_epi16(o, p);
+
+  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
+  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
+  __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
+  __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
+  __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
+  __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
+  __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
+  __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
+
+  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
+  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
+  __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
+  __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
+  __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
+  __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
+  __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
+  __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
+
+  __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
+  __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
+  __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
+  __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
+  __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
+  __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
+  __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
+  __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
+  __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
+  __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
+  __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
+  __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
+  __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
+  __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
+  __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
+  __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
+
+  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
+  kernel.packet[0] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
+  kernel.packet[1] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+  kernel.packet[2] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+  kernel.packet[3] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+  kernel.packet[4] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+  kernel.packet[5] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+  kernel.packet[6] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+  kernel.packet[7] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+  kernel.packet[8] = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+  kernel.packet[9] = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+  kernel.packet[10] = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+  kernel.packet[11] = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+  kernel.packet[12] = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+  kernel.packet[13] = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+  kernel.packet[14] = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
+  kernel.packet[15] = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16bf, 4>& kernel) {
+  __m256i a = kernel.packet[0];
+  __m256i b = kernel.packet[1];
+  __m256i c = kernel.packet[2];
+  __m256i d = kernel.packet[3];
+
+  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
+  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
+  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
+  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
+
+  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
+  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
+  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
+  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
+
+  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
+  kernel.packet[0] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x20);
+  kernel.packet[1] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x20);
+  kernel.packet[2] = _mm256_permute2x128_si256(abcd_03, abcd_47, 0x31);
+  kernel.packet[3] = _mm256_permute2x128_si256(abcd_8b, abcd_cf, 0x31);
+}
+
+// Minimal implementation of 16-bit int packets for use in pfrexp, pldexp.
+
+template <>
+EIGEN_STRONG_INLINE Packet32s pset1<Packet32s>(const numext::int16_t& x) {
+  return _mm512_set1_epi16(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s pset1<Packet16s>(const numext::int16_t& x) {
+  return _mm256_set1_epi16(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const numext::int16_t& x) {
+  return _mm_set1_epi16(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet32s>(numext::int16_t* out, const Packet32s& x) {
+  EIGEN_DEBUG_ALIGNED_STORE
+  _mm512_store_epi32(out, x);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet16s>(numext::int16_t* out, const Packet16s& x) {
+  EIGEN_DEBUG_ALIGNED_STORE
+#if defined(EIGEN_VECTORIZE_AVX512F) && defined(EIGEN_VECTORIZE_AVX512VL)
+  _mm256_store_epi32(out, x);
+#else
+  _mm256_store_si256(reinterpret_cast<__m256i*>(out), x);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int16_t, Packet8s>(numext::int16_t* out, const Packet8s& x) {
+  EIGEN_DEBUG_ALIGNED_STORE
+#if defined(EIGEN_VECTORIZE_AVX512F) && defined(EIGEN_VECTORIZE_AVX512VL)
+  _mm256_store_epi32(out, x);
+#else
+  _mm_store_si128(reinterpret_cast<__m128i*>(out), x);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet32s>(numext::int16_t* out, const Packet32s& x) {
+  EIGEN_DEBUG_UNALIGNED_STORE
+  _mm512_storeu_epi32(out, x);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet16s>(numext::int16_t* out, const Packet16s& x) {
+  EIGEN_DEBUG_UNALIGNED_STORE
+  _mm256_storeu_epi32(out, x);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int16_t, Packet8s>(numext::int16_t* out, const Packet8s& x) {
+  EIGEN_DEBUG_UNALIGNED_STORE
+  _mm_storeu_epi32(out, x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s padd(const Packet32s& a, const Packet32s& b) {
+  return _mm512_add_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s padd(const Packet16s& a, const Packet16s& b) {
+  return _mm256_add_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s padd(const Packet8s& a, const Packet8s& b) {
+  return _mm_add_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s psub(const Packet32s& a, const Packet32s& b) {
+  return _mm512_sub_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s psub(const Packet16s& a, const Packet16s& b) {
+  return _mm256_sub_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s psub(const Packet8s& a, const Packet8s& b) {
+  return _mm_sub_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s pmul(const Packet32s& a, const Packet32s& b) {
+  return _mm512_mullo_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s pmul(const Packet16s& a, const Packet16s& b) {
+  return _mm256_mullo_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s pmul(const Packet8s& a, const Packet8s& b) {
+  return _mm_mullo_epi16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s pnegate(const Packet32s& a) {
+  return _mm512_sub_epi16(_mm512_setzero_si512(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16s pnegate(const Packet16s& a) {
+  return _mm256_sub_epi16(_mm256_setzero_si256(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
+  return _mm_sub_epi16(_mm_setzero_si128(), a);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet32s parithmetic_shift_right(Packet32s a) {
+  return _mm512_srai_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet16s parithmetic_shift_right(Packet16s a) {
+  return _mm256_srai_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) {
+  return _mm_srai_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet32s plogical_shift_left(Packet32s a) {
+  return _mm512_slli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet16s plogical_shift_left(Packet16s a) {
+  return _mm256_slli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) {
+  return _mm_slli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet32s plogical_shift_right(Packet32s a) {
+  return _mm512_srli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet16s plogical_shift_right(Packet16s a) {
+  return _mm256_srli_epi16(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a) {
+  return _mm_srli_epi16(a, N);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_AVX512_H
diff --git a/inst/include/Eigen/src/Core/arch/AVX512/PacketMathFP16.h b/inst/include/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
new file mode 100644
index 00000000..a040bbea
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AVX512/PacketMathFP16.h
@@ -0,0 +1,1413 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 The Eigen Authors.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_FP16_AVX512_H
+#define EIGEN_PACKET_MATH_FP16_AVX512_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+typedef __m512h Packet32h;
+typedef __m256h Packet16h;
+typedef __m128h Packet8h;
+
+template <>
+struct is_arithmetic<Packet8h> {
+  enum { value = true };
+};
+
+template <>
+struct packet_traits<half> : default_packet_traits {
+  typedef Packet32h type;
+  typedef Packet16h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 32,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasAbs2 = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExp = 1,
+    HasExpm1 = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    // These ones should be implemented in future
+    HasBessel = 0,
+    HasNdtri = 0,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = 0,  // EIGEN_FAST_MATH,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct unpacket_traits<Packet32h> {
+  typedef Eigen::half type;
+  typedef Packet16h half;
+  typedef Packet32s integer_packet;
+  enum {
+    size = 32,
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet16h> {
+  typedef Eigen::half type;
+  typedef Packet8h half;
+  typedef Packet16s integer_packet;
+  enum {
+    size = 16,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet8h> {
+  typedef Eigen::half type;
+  typedef Packet8h half;
+  typedef Packet8s integer_packet;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+// Conversions
+
+EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) { return _mm512_cvtxph_ps(a); }
+
+EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) { return _mm256_cvtxph_ps(a); }
+
+EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) { return _mm512_cvtxps_ph(a); }
+
+EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) { return _mm256_cvtxps_ph(a); }
+
+// Memory functions
+
+// pset1
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pset1<Packet32h>(const Eigen::half& from) {
+  return _mm512_set1_ph(from.x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
+  return _mm256_set1_ph(from.x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
+  return _mm_set1_ph(from.x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pzero(const Packet32h& /*a*/) {
+  return _mm512_setzero_ph();
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pzero(const Packet16h& /*a*/) {
+  return _mm256_setzero_ph();
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pzero(const Packet8h& /*a*/) {
+  return _mm_setzero_ph();
+}
+
+// pset1frombits
+template <>
+EIGEN_STRONG_INLINE Packet32h pset1frombits<Packet32h>(unsigned short from) {
+  return _mm512_castsi512_ph(_mm512_set1_epi16(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pset1frombits<Packet16h>(unsigned short from) {
+  return _mm256_castsi256_ph(_mm256_set1_epi16(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pset1frombits<Packet8h>(unsigned short from) {
+  return _mm_castsi128_ph(_mm_set1_epi16(from));
+}
+
+// pfirst
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet32h>(const Packet32h& from) {
+  return Eigen::half(_mm512_cvtsh_h(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
+  return Eigen::half(_mm256_cvtsh_h(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
+  return Eigen::half(_mm_cvtsh_h(from));
+}
+
+// pload
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pload<Packet32h>(const Eigen::half* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ph(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm256_load_ph(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ph(from);
+}
+
+// ploadu
+
+template <>
+EIGEN_STRONG_INLINE Packet32h ploadu<Packet32h>(const Eigen::half* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm512_loadu_ph(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm256_loadu_ph(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_ph(from);
+}
+
+// pstore
+
+template <>
+EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet32h& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm512_store_ph(to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm256_store_ph(to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet8h& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_ph(to, from);
+}
+
+// pstoreu
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet32h& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm512_storeu_ph(to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm256_storeu_ph(to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet8h& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ph(to, from);
+}
+
+// ploaddup
+template <>
+EIGEN_STRONG_INLINE Packet32h ploaddup<Packet32h>(const Eigen::half* from) {
+  __m512h a = _mm512_castph256_ph512(_mm256_loadu_ph(from));
+  return _mm512_permutexvar_ph(_mm512_set_epi16(15, 15, 14, 14, 13, 13, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, 6, 6,
+                                                5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0),
+                               a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h ploaddup<Packet16h>(const Eigen::half* from) {
+  __m256h a = _mm256_castph128_ph256(_mm_loadu_ph(from));
+  return _mm256_permutexvar_ph(_mm256_set_epi16(7, 7, 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0, 0), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ploaddup<Packet8h>(const Eigen::half* from) {
+  return _mm_set_ph(from[3].x, from[3].x, from[2].x, from[2].x, from[1].x, from[1].x, from[0].x, from[0].x);
+}
+
+// ploadquad
+template <>
+EIGEN_STRONG_INLINE Packet32h ploadquad<Packet32h>(const Eigen::half* from) {
+  __m512h a = _mm512_castph128_ph512(_mm_loadu_ph(from));
+  return _mm512_permutexvar_ph(
+      _mm512_set_epi16(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0),
+      a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h ploadquad<Packet16h>(const Eigen::half* from) {
+  return _mm256_set_ph(from[3].x, from[3].x, from[3].x, from[3].x, from[2].x, from[2].x, from[2].x, from[2].x,
+                       from[1].x, from[1].x, from[1].x, from[1].x, from[0].x, from[0].x, from[0].x, from[0].x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ploadquad<Packet8h>(const Eigen::half* from) {
+  return _mm_set_ph(from[1].x, from[1].x, from[1].x, from[1].x, from[0].x, from[0].x, from[0].x, from[0].x);
+}
+
+// pabs
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pabs<Packet32h>(const Packet32h& a) {
+  return _mm512_abs_ph(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pabs<Packet16h>(const Packet16h& a) {
+  return _mm256_abs_ph(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pabs<Packet8h>(const Packet8h& a) {
+  return _mm_abs_ph(a);
+}
+
+// psignbit
+
+template <>
+EIGEN_STRONG_INLINE Packet32h psignbit<Packet32h>(const Packet32h& a) {
+  return _mm512_castsi512_ph(_mm512_srai_epi16(_mm512_castph_si512(a), 15));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h psignbit<Packet16h>(const Packet16h& a) {
+  return _mm256_castsi256_ph(_mm256_srai_epi16(_mm256_castph_si256(a), 15));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h psignbit<Packet8h>(const Packet8h& a) {
+  return _mm_castsi128_ph(_mm_srai_epi16(_mm_castph_si128(a), 15));
+}
+
+// pmin
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmin<Packet32h>(const Packet32h& a, const Packet32h& b) {
+  return _mm512_min_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmin<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return _mm256_min_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmin<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return _mm_min_ph(a, b);
+}
+
+// pmax
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmax<Packet32h>(const Packet32h& a, const Packet32h& b) {
+  return _mm512_max_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmax<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return _mm256_max_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmax<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return _mm_max_ph(a, b);
+}
+
+// plset
+template <>
+EIGEN_STRONG_INLINE Packet32h plset<Packet32h>(const half& a) {
+  return _mm512_add_ph(pset1<Packet32h>(a), _mm512_set_ph(31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
+                                                          16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h plset<Packet16h>(const half& a) {
+  return _mm256_add_ph(pset1<Packet16h>(a), _mm256_set_ph(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h plset<Packet8h>(const half& a) {
+  return _mm_add_ph(pset1<Packet8h>(a), _mm_set_ph(7, 6, 5, 4, 3, 2, 1, 0));
+}
+
+// por
+
+template <>
+EIGEN_STRONG_INLINE Packet32h por(const Packet32h& a, const Packet32h& b) {
+  return _mm512_castsi512_ph(_mm512_or_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h por(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castsi256_ph(_mm256_or_si256(_mm256_castph_si256(a), _mm256_castph_si256(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h por(const Packet8h& a, const Packet8h& b) {
+  return _mm_castsi128_ph(_mm_or_si128(_mm_castph_si128(a), _mm_castph_si128(b)));
+}
+
+// pxor
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pxor(const Packet32h& a, const Packet32h& b) {
+  return _mm512_castsi512_ph(_mm512_xor_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pxor(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castsi256_ph(_mm256_xor_si256(_mm256_castph_si256(a), _mm256_castph_si256(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pxor(const Packet8h& a, const Packet8h& b) {
+  return _mm_castsi128_ph(_mm_xor_si128(_mm_castph_si128(a), _mm_castph_si128(b)));
+}
+
+// pand
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pand(const Packet32h& a, const Packet32h& b) {
+  return _mm512_castsi512_ph(_mm512_and_si512(_mm512_castph_si512(a), _mm512_castph_si512(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pand(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castsi256_ph(_mm256_and_si256(_mm256_castph_si256(a), _mm256_castph_si256(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pand(const Packet8h& a, const Packet8h& b) {
+  return _mm_castsi128_ph(_mm_and_si128(_mm_castph_si128(a), _mm_castph_si128(b)));
+}
+
+// pandnot
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pandnot(const Packet32h& a, const Packet32h& b) {
+  return _mm512_castsi512_ph(_mm512_andnot_si512(_mm512_castph_si512(b), _mm512_castph_si512(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pandnot(const Packet16h& a, const Packet16h& b) {
+  return _mm256_castsi256_ph(_mm256_andnot_si256(_mm256_castph_si256(b), _mm256_castph_si256(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pandnot(const Packet8h& a, const Packet8h& b) {
+  return _mm_castsi128_ph(_mm_andnot_si128(_mm_castph_si128(b), _mm_castph_si128(a)));
+}
+
+// pselect
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet32h pselect(const Packet32h& mask, const Packet32h& a, const Packet32h& b) {
+  __mmask32 mask32 = _mm512_cmp_epi16_mask(_mm512_castph_si512(mask), _mm512_setzero_epi32(), _MM_CMPINT_EQ);
+  return _mm512_mask_blend_ph(mask32, a, b);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet16h pselect(const Packet16h& mask, const Packet16h& a, const Packet16h& b) {
+  __mmask16 mask16 = _mm256_cmp_epi16_mask(_mm256_castph_si256(mask), _mm256_setzero_si256(), _MM_CMPINT_EQ);
+  return _mm256_mask_blend_ph(mask16, a, b);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet8h pselect(const Packet8h& mask, const Packet8h& a, const Packet8h& b) {
+  __mmask8 mask8 = _mm_cmp_epi16_mask(_mm_castph_si128(mask), _mm_setzero_si128(), _MM_CMPINT_EQ);
+  return _mm_mask_blend_ph(mask8, a, b);
+}
+
+// pcmp_eq
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_eq(const Packet32h& a, const Packet32h& b) {
+  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_EQ_OQ);
+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_eq(const Packet16h& a, const Packet16h& b) {
+  __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_EQ_OQ);
+  return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_eq(const Packet8h& a, const Packet8h& b) {
+  __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_EQ_OQ);
+  return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+// pcmp_le
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_le(const Packet32h& a, const Packet32h& b) {
+  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LE_OQ);
+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_le(const Packet16h& a, const Packet16h& b) {
+  __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_LE_OQ);
+  return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_le(const Packet8h& a, const Packet8h& b) {
+  __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_LE_OQ);
+  return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+// pcmp_lt
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_lt(const Packet32h& a, const Packet32h& b) {
+  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_LT_OQ);
+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_lt(const Packet16h& a, const Packet16h& b) {
+  __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_LT_OQ);
+  return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_lt(const Packet8h& a, const Packet8h& b) {
+  __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_LT_OQ);
+  return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+// pcmp_lt_or_nan
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcmp_lt_or_nan(const Packet32h& a, const Packet32h& b) {
+  __mmask32 mask = _mm512_cmp_ph_mask(a, b, _CMP_NGE_UQ);
+  return _mm512_castsi512_ph(_mm512_mask_set1_epi16(_mm512_set1_epi16(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcmp_lt_or_nan(const Packet16h& a, const Packet16h& b) {
+  __mmask16 mask = _mm256_cmp_ph_mask(a, b, _CMP_NGE_UQ);
+  return _mm256_castsi256_ph(_mm256_mask_set1_epi16(_mm256_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pcmp_lt_or_nan(const Packet8h& a, const Packet8h& b) {
+  __mmask8 mask = _mm_cmp_ph_mask(a, b, _CMP_NGE_UQ);
+  return _mm_castsi128_ph(_mm_mask_set1_epi16(_mm_set1_epi32(0), mask, static_cast<short>(0xffffu)));
+}
+
+// padd
+
+template <>
+EIGEN_STRONG_INLINE Packet32h padd<Packet32h>(const Packet32h& a, const Packet32h& b) {
+  return _mm512_add_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return _mm256_add_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return _mm_add_ph(a, b);
+}
+
+// psub
+
+template <>
+EIGEN_STRONG_INLINE Packet32h psub<Packet32h>(const Packet32h& a, const Packet32h& b) {
+  return _mm512_sub_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h psub<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return _mm256_sub_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h psub<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return _mm_sub_ph(a, b);
+}
+
+// pmul
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmul<Packet32h>(const Packet32h& a, const Packet32h& b) {
+  return _mm512_mul_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return _mm256_mul_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return _mm_mul_ph(a, b);
+}
+
+// pdiv
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pdiv<Packet32h>(const Packet32h& a, const Packet32h& b) {
+  return _mm512_div_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pdiv<Packet16h>(const Packet16h& a, const Packet16h& b) {
+  return _mm256_div_ph(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pdiv<Packet8h>(const Packet8h& a, const Packet8h& b) {
+  return _mm_div_ph(a, b);
+  ;
+}
+
+// pround
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pround<Packet32h>(const Packet32h& a) {
+  // Work-around for default std::round rounding mode.
+
+  // Mask for the sign bit.
+  const Packet32h signMask =
+      pset1frombits<Packet32h>(static_cast<numext::uint16_t>(static_cast<std::uint16_t>(0x8000u)));
+  // The largest half-precision float less than 0.5.
+  const Packet32h prev0dot5 = pset1frombits<Packet32h>(static_cast<numext::uint16_t>(0x37FFu));
+
+  return _mm512_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pround<Packet16h>(const Packet16h& a) {
+  // Work-around for default std::round rounding mode.
+
+  // Mask for the sign bit.
+  const Packet16h signMask =
+      pset1frombits<Packet16h>(static_cast<numext::uint16_t>(static_cast<std::uint16_t>(0x8000u)));
+  // The largest half-precision float less than 0.5.
+  const Packet16h prev0dot5 = pset1frombits<Packet16h>(static_cast<numext::uint16_t>(0x37FFu));
+
+  return _mm256_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pround<Packet8h>(const Packet8h& a) {
+  // Work-around for default std::round rounding mode.
+
+  // Mask for the sign bit.
+  const Packet8h signMask = pset1frombits<Packet8h>(static_cast<numext::uint16_t>(static_cast<std::uint16_t>(0x8000u)));
+  // The largest half-precision float less than 0.5.
+  const Packet8h prev0dot5 = pset1frombits<Packet8h>(static_cast<numext::uint16_t>(0x37FFu));
+
+  return _mm_roundscale_ph(padd(por(pand(a, signMask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+// print
+
+template <>
+EIGEN_STRONG_INLINE Packet32h print<Packet32h>(const Packet32h& a) {
+  return _mm512_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h print<Packet16h>(const Packet16h& a) {
+  return _mm256_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h print<Packet8h>(const Packet8h& a) {
+  return _mm_roundscale_ph(a, _MM_FROUND_CUR_DIRECTION);
+}
+
+// pceil
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pceil<Packet32h>(const Packet32h& a) {
+  return _mm512_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pceil<Packet16h>(const Packet16h& a) {
+  return _mm256_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pceil<Packet8h>(const Packet8h& a) {
+  return _mm_roundscale_ph(a, _MM_FROUND_TO_POS_INF);
+}
+
+// pfloor
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pfloor<Packet32h>(const Packet32h& a) {
+  return _mm512_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pfloor<Packet16h>(const Packet16h& a) {
+  return _mm256_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pfloor<Packet8h>(const Packet8h& a) {
+  return _mm_roundscale_ph(a, _MM_FROUND_TO_NEG_INF);
+}
+
+// ptrunc
+
+template <>
+EIGEN_STRONG_INLINE Packet32h ptrunc<Packet32h>(const Packet32h& a) {
+  return _mm512_roundscale_ph(a, _MM_FROUND_TO_ZERO);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h ptrunc<Packet16h>(const Packet16h& a) {
+  return _mm256_roundscale_ph(a, _MM_FROUND_TO_ZERO);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h ptrunc<Packet8h>(const Packet8h& a) {
+  return _mm_roundscale_ph(a, _MM_FROUND_TO_ZERO);
+}
+
+// predux
+template <>
+EIGEN_STRONG_INLINE half predux<Packet32h>(const Packet32h& a) {
+  return half(_mm512_reduce_add_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& a) {
+  return half(_mm256_reduce_add_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux<Packet8h>(const Packet8h& a) {
+  return half(_mm_reduce_add_ph(a));
+}
+
+// predux_half_dowto4
+template <>
+EIGEN_STRONG_INLINE Packet16h predux_half_dowto4<Packet32h>(const Packet32h& a) {
+  const __m512i bits = _mm512_castph_si512(a);
+  Packet16h lo = _mm256_castsi256_ph(_mm512_castsi512_si256(bits));
+  Packet16h hi = _mm256_castsi256_ph(_mm512_extracti64x4_epi64(bits, 1));
+  return padd(lo, hi);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h predux_half_dowto4<Packet16h>(const Packet16h& a) {
+  Packet8h lo = _mm_castsi128_ph(_mm256_castsi256_si128(_mm256_castph_si256(a)));
+  Packet8h hi = _mm_castps_ph(_mm256_extractf128_ps(_mm256_castph_ps(a), 1));
+  return padd(lo, hi);
+}
+
+// predux_max
+
+template <>
+EIGEN_STRONG_INLINE half predux_max<Packet32h>(const Packet32h& a) {
+  return half(_mm512_reduce_max_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max<Packet16h>(const Packet16h& a) {
+  return half(_mm256_reduce_max_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max<Packet8h>(const Packet8h& a) {
+  return half(_mm_reduce_max_ph(a));
+}
+
+// predux_min
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<Packet32h>(const Packet32h& a) {
+  return half(_mm512_reduce_min_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<Packet16h>(const Packet16h& a) {
+  return half(_mm256_reduce_min_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<Packet8h>(const Packet8h& a) {
+  return half(_mm_reduce_min_ph(a));
+}
+
+// predux_mul
+
+template <>
+EIGEN_STRONG_INLINE half predux_mul<Packet32h>(const Packet32h& a) {
+  return half(_mm512_reduce_mul_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_mul<Packet16h>(const Packet16h& a) {
+  return half(_mm256_reduce_mul_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_mul<Packet8h>(const Packet8h& a) {
+  return half(_mm_reduce_mul_ph(a));
+}
+
+#ifdef EIGEN_VECTORIZE_FMA
+
+// pmadd
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+  return _mm512_fmadd_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return _mm256_fmadd_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return _mm_fmadd_ph(a, b, c);
+}
+
+// pmsub
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+  return _mm512_fmsub_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return _mm256_fmsub_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return _mm_fmsub_ph(a, b, c);
+}
+
+// pnmadd
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pnmadd(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+  return _mm512_fnmadd_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnmadd(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return _mm256_fnmadd_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmadd(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return _mm_fnmadd_ph(a, b, c);
+}
+
+// pnmsub
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pnmsub(const Packet32h& a, const Packet32h& b, const Packet32h& c) {
+  return _mm512_fnmsub_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnmsub(const Packet16h& a, const Packet16h& b, const Packet16h& c) {
+  return _mm256_fnmsub_ph(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnmsub(const Packet8h& a, const Packet8h& b, const Packet8h& c) {
+  return _mm_fnmsub_ph(a, b, c);
+}
+
+#endif
+
+// pnegate
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pnegate<Packet32h>(const Packet32h& a) {
+  return _mm512_castsi512_ph(
+      _mm512_xor_si512(_mm512_castph_si512(a), _mm512_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pnegate<Packet16h>(const Packet16h& a) {
+  return _mm256_castsi256_ph(
+      _mm256_xor_si256(_mm256_castph_si256(a), _mm256_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pnegate<Packet8h>(const Packet8h& a) {
+  return _mm_castsi128_ph(_mm_xor_si128(_mm_castph_si128(a), _mm_set1_epi16(static_cast<std::uint16_t>(0x8000u))));
+}
+
+// pconj
+
+// Nothing, packets are real.
+
+// psqrt
+
+template <>
+EIGEN_STRONG_INLINE Packet32h psqrt<Packet32h>(const Packet32h& a) {
+  return generic_sqrt_newton_step<Packet32h>::run(a, _mm512_rsqrt_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h psqrt<Packet16h>(const Packet16h& a) {
+  return generic_sqrt_newton_step<Packet16h>::run(a, _mm256_rsqrt_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h psqrt<Packet8h>(const Packet8h& a) {
+  return generic_sqrt_newton_step<Packet8h>::run(a, _mm_rsqrt_ph(a));
+}
+
+// prsqrt
+
+template <>
+EIGEN_STRONG_INLINE Packet32h prsqrt<Packet32h>(const Packet32h& a) {
+  return generic_rsqrt_newton_step<Packet32h, /*Steps=*/1>::run(a, _mm512_rsqrt_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h prsqrt<Packet16h>(const Packet16h& a) {
+  return generic_rsqrt_newton_step<Packet16h, /*Steps=*/1>::run(a, _mm256_rsqrt_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h prsqrt<Packet8h>(const Packet8h& a) {
+  return generic_rsqrt_newton_step<Packet8h, /*Steps=*/1>::run(a, _mm_rsqrt_ph(a));
+}
+
+// preciprocal
+
+template <>
+EIGEN_STRONG_INLINE Packet32h preciprocal<Packet32h>(const Packet32h& a) {
+  return generic_reciprocal_newton_step<Packet32h, /*Steps=*/1>::run(a, _mm512_rcp_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h preciprocal<Packet16h>(const Packet16h& a) {
+  return generic_reciprocal_newton_step<Packet16h, /*Steps=*/1>::run(a, _mm256_rcp_ph(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h preciprocal<Packet8h>(const Packet8h& a) {
+  return generic_reciprocal_newton_step<Packet8h, /*Steps=*/1>::run(a, _mm_rcp_ph(a));
+}
+
+// ptranspose
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 32>& a) {
+  __m512i t[32];
+
+  EIGEN_UNROLL_LOOP
+  for (int i = 0; i < 16; i++) {
+    t[2 * i] = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
+    t[2 * i + 1] =
+        _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2 * i]), _mm512_castph_si512(a.packet[2 * i + 1]));
+  }
+
+  __m512i p[32];
+
+  EIGEN_UNROLL_LOOP
+  for (int i = 0; i < 8; i++) {
+    p[4 * i] = _mm512_unpacklo_epi32(t[4 * i], t[4 * i + 2]);
+    p[4 * i + 1] = _mm512_unpackhi_epi32(t[4 * i], t[4 * i + 2]);
+    p[4 * i + 2] = _mm512_unpacklo_epi32(t[4 * i + 1], t[4 * i + 3]);
+    p[4 * i + 3] = _mm512_unpackhi_epi32(t[4 * i + 1], t[4 * i + 3]);
+  }
+
+  __m512i q[32];
+
+  EIGEN_UNROLL_LOOP
+  for (int i = 0; i < 4; i++) {
+    q[8 * i] = _mm512_unpacklo_epi64(p[8 * i], p[8 * i + 4]);
+    q[8 * i + 1] = _mm512_unpackhi_epi64(p[8 * i], p[8 * i + 4]);
+    q[8 * i + 2] = _mm512_unpacklo_epi64(p[8 * i + 1], p[8 * i + 5]);
+    q[8 * i + 3] = _mm512_unpackhi_epi64(p[8 * i + 1], p[8 * i + 5]);
+    q[8 * i + 4] = _mm512_unpacklo_epi64(p[8 * i + 2], p[8 * i + 6]);
+    q[8 * i + 5] = _mm512_unpackhi_epi64(p[8 * i + 2], p[8 * i + 6]);
+    q[8 * i + 6] = _mm512_unpacklo_epi64(p[8 * i + 3], p[8 * i + 7]);
+    q[8 * i + 7] = _mm512_unpackhi_epi64(p[8 * i + 3], p[8 * i + 7]);
+  }
+
+  __m512i f[32];
+
+#define PACKET32H_TRANSPOSE_HELPER(X, Y)                                                            \
+  do {                                                                                              \
+    f[Y * 8] = _mm512_inserti32x4(f[Y * 8], _mm512_extracti32x4_epi32(q[X * 8], Y), X);             \
+    f[Y * 8 + 1] = _mm512_inserti32x4(f[Y * 8 + 1], _mm512_extracti32x4_epi32(q[X * 8 + 1], Y), X); \
+    f[Y * 8 + 2] = _mm512_inserti32x4(f[Y * 8 + 2], _mm512_extracti32x4_epi32(q[X * 8 + 2], Y), X); \
+    f[Y * 8 + 3] = _mm512_inserti32x4(f[Y * 8 + 3], _mm512_extracti32x4_epi32(q[X * 8 + 3], Y), X); \
+    f[Y * 8 + 4] = _mm512_inserti32x4(f[Y * 8 + 4], _mm512_extracti32x4_epi32(q[X * 8 + 4], Y), X); \
+    f[Y * 8 + 5] = _mm512_inserti32x4(f[Y * 8 + 5], _mm512_extracti32x4_epi32(q[X * 8 + 5], Y), X); \
+    f[Y * 8 + 6] = _mm512_inserti32x4(f[Y * 8 + 6], _mm512_extracti32x4_epi32(q[X * 8 + 6], Y), X); \
+    f[Y * 8 + 7] = _mm512_inserti32x4(f[Y * 8 + 7], _mm512_extracti32x4_epi32(q[X * 8 + 7], Y), X); \
+  } while (false);
+
+  PACKET32H_TRANSPOSE_HELPER(0, 0);
+  PACKET32H_TRANSPOSE_HELPER(1, 1);
+  PACKET32H_TRANSPOSE_HELPER(2, 2);
+  PACKET32H_TRANSPOSE_HELPER(3, 3);
+
+  PACKET32H_TRANSPOSE_HELPER(1, 0);
+  PACKET32H_TRANSPOSE_HELPER(2, 0);
+  PACKET32H_TRANSPOSE_HELPER(3, 0);
+  PACKET32H_TRANSPOSE_HELPER(2, 1);
+  PACKET32H_TRANSPOSE_HELPER(3, 1);
+  PACKET32H_TRANSPOSE_HELPER(3, 2);
+
+  PACKET32H_TRANSPOSE_HELPER(0, 1);
+  PACKET32H_TRANSPOSE_HELPER(0, 2);
+  PACKET32H_TRANSPOSE_HELPER(0, 3);
+  PACKET32H_TRANSPOSE_HELPER(1, 2);
+  PACKET32H_TRANSPOSE_HELPER(1, 3);
+  PACKET32H_TRANSPOSE_HELPER(2, 3);
+
+#undef PACKET32H_TRANSPOSE_HELPER
+
+  EIGEN_UNROLL_LOOP
+  for (int i = 0; i < 32; i++) {
+    a.packet[i] = _mm512_castsi512_ph(f[i]);
+  }
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet32h, 4>& a) {
+  __m512i p0, p1, p2, p3, t0, t1, t2, t3, a0, a1, a2, a3;
+  t0 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
+  t1 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[0]), _mm512_castph_si512(a.packet[1]));
+  t2 = _mm512_unpacklo_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
+  t3 = _mm512_unpackhi_epi16(_mm512_castph_si512(a.packet[2]), _mm512_castph_si512(a.packet[3]));
+
+  p0 = _mm512_unpacklo_epi32(t0, t2);
+  p1 = _mm512_unpackhi_epi32(t0, t2);
+  p2 = _mm512_unpacklo_epi32(t1, t3);
+  p3 = _mm512_unpackhi_epi32(t1, t3);
+
+  a0 = p0;
+  a1 = p1;
+  a2 = p2;
+  a3 = p3;
+
+  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p1, 0), 1);
+  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p0, 1), 0);
+
+  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p2, 0), 2);
+  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p0, 2), 0);
+
+  a0 = _mm512_inserti32x4(a0, _mm512_extracti32x4_epi32(p3, 0), 3);
+  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p0, 3), 0);
+
+  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p2, 1), 2);
+  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p1, 2), 1);
+
+  a2 = _mm512_inserti32x4(a2, _mm512_extracti32x4_epi32(p3, 2), 3);
+  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p2, 3), 2);
+
+  a1 = _mm512_inserti32x4(a1, _mm512_extracti32x4_epi32(p3, 1), 3);
+  a3 = _mm512_inserti32x4(a3, _mm512_extracti32x4_epi32(p1, 3), 1);
+
+  a.packet[0] = _mm512_castsi512_ph(a0);
+  a.packet[1] = _mm512_castsi512_ph(a1);
+  a.packet[2] = _mm512_castsi512_ph(a2);
+  a.packet[3] = _mm512_castsi512_ph(a3);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 16>& kernel) {
+  __m256i a = _mm256_castph_si256(kernel.packet[0]);
+  __m256i b = _mm256_castph_si256(kernel.packet[1]);
+  __m256i c = _mm256_castph_si256(kernel.packet[2]);
+  __m256i d = _mm256_castph_si256(kernel.packet[3]);
+  __m256i e = _mm256_castph_si256(kernel.packet[4]);
+  __m256i f = _mm256_castph_si256(kernel.packet[5]);
+  __m256i g = _mm256_castph_si256(kernel.packet[6]);
+  __m256i h = _mm256_castph_si256(kernel.packet[7]);
+  __m256i i = _mm256_castph_si256(kernel.packet[8]);
+  __m256i j = _mm256_castph_si256(kernel.packet[9]);
+  __m256i k = _mm256_castph_si256(kernel.packet[10]);
+  __m256i l = _mm256_castph_si256(kernel.packet[11]);
+  __m256i m = _mm256_castph_si256(kernel.packet[12]);
+  __m256i n = _mm256_castph_si256(kernel.packet[13]);
+  __m256i o = _mm256_castph_si256(kernel.packet[14]);
+  __m256i p = _mm256_castph_si256(kernel.packet[15]);
+
+  __m256i ab_07 = _mm256_unpacklo_epi16(a, b);
+  __m256i cd_07 = _mm256_unpacklo_epi16(c, d);
+  __m256i ef_07 = _mm256_unpacklo_epi16(e, f);
+  __m256i gh_07 = _mm256_unpacklo_epi16(g, h);
+  __m256i ij_07 = _mm256_unpacklo_epi16(i, j);
+  __m256i kl_07 = _mm256_unpacklo_epi16(k, l);
+  __m256i mn_07 = _mm256_unpacklo_epi16(m, n);
+  __m256i op_07 = _mm256_unpacklo_epi16(o, p);
+
+  __m256i ab_8f = _mm256_unpackhi_epi16(a, b);
+  __m256i cd_8f = _mm256_unpackhi_epi16(c, d);
+  __m256i ef_8f = _mm256_unpackhi_epi16(e, f);
+  __m256i gh_8f = _mm256_unpackhi_epi16(g, h);
+  __m256i ij_8f = _mm256_unpackhi_epi16(i, j);
+  __m256i kl_8f = _mm256_unpackhi_epi16(k, l);
+  __m256i mn_8f = _mm256_unpackhi_epi16(m, n);
+  __m256i op_8f = _mm256_unpackhi_epi16(o, p);
+
+  __m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
+  __m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
+  __m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
+  __m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
+  __m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
+  __m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
+  __m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
+  __m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
+
+  __m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
+  __m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
+  __m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
+  __m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
+  __m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
+  __m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
+  __m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
+  __m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
+
+  __m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
+  __m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
+  __m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
+  __m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
+  __m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
+  __m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
+  __m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
+  __m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
+  __m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
+  __m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
+  __m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
+  __m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
+  __m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
+  __m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
+  __m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
+  __m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
+
+  // NOTE: no unpacklo/hi instr in this case, so using permute instr.
+  __m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x20);
+  __m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x20);
+  __m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x20);
+  __m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x20);
+  __m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x20);
+  __m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x20);
+  __m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x20);
+  __m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x20);
+  __m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, 0x31);
+  __m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, 0x31);
+  __m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, 0x31);
+  __m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, 0x31);
+  __m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, 0x31);
+  __m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, 0x31);
+  __m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, 0x31);
+  __m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, 0x31);
+
+  kernel.packet[0] = _mm256_castsi256_ph(a_p_0);
+  kernel.packet[1] = _mm256_castsi256_ph(a_p_1);
+  kernel.packet[2] = _mm256_castsi256_ph(a_p_2);
+  kernel.packet[3] = _mm256_castsi256_ph(a_p_3);
+  kernel.packet[4] = _mm256_castsi256_ph(a_p_4);
+  kernel.packet[5] = _mm256_castsi256_ph(a_p_5);
+  kernel.packet[6] = _mm256_castsi256_ph(a_p_6);
+  kernel.packet[7] = _mm256_castsi256_ph(a_p_7);
+  kernel.packet[8] = _mm256_castsi256_ph(a_p_8);
+  kernel.packet[9] = _mm256_castsi256_ph(a_p_9);
+  kernel.packet[10] = _mm256_castsi256_ph(a_p_a);
+  kernel.packet[11] = _mm256_castsi256_ph(a_p_b);
+  kernel.packet[12] = _mm256_castsi256_ph(a_p_c);
+  kernel.packet[13] = _mm256_castsi256_ph(a_p_d);
+  kernel.packet[14] = _mm256_castsi256_ph(a_p_e);
+  kernel.packet[15] = _mm256_castsi256_ph(a_p_f);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 8>& kernel) {
+  EIGEN_ALIGN64 half in[8][16];
+  pstore<half>(in[0], kernel.packet[0]);
+  pstore<half>(in[1], kernel.packet[1]);
+  pstore<half>(in[2], kernel.packet[2]);
+  pstore<half>(in[3], kernel.packet[3]);
+  pstore<half>(in[4], kernel.packet[4]);
+  pstore<half>(in[5], kernel.packet[5]);
+  pstore<half>(in[6], kernel.packet[6]);
+  pstore<half>(in[7], kernel.packet[7]);
+
+  EIGEN_ALIGN64 half out[8][16];
+
+  for (int i = 0; i < 8; ++i) {
+    for (int j = 0; j < 8; ++j) {
+      out[i][j] = in[j][2 * i];
+    }
+    for (int j = 0; j < 8; ++j) {
+      out[i][j + 8] = in[j][2 * i + 1];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet16h>(out[0]);
+  kernel.packet[1] = pload<Packet16h>(out[1]);
+  kernel.packet[2] = pload<Packet16h>(out[2]);
+  kernel.packet[3] = pload<Packet16h>(out[3]);
+  kernel.packet[4] = pload<Packet16h>(out[4]);
+  kernel.packet[5] = pload<Packet16h>(out[5]);
+  kernel.packet[6] = pload<Packet16h>(out[6]);
+  kernel.packet[7] = pload<Packet16h>(out[7]);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16h, 4>& kernel) {
+  EIGEN_ALIGN64 half in[4][16];
+  pstore<half>(in[0], kernel.packet[0]);
+  pstore<half>(in[1], kernel.packet[1]);
+  pstore<half>(in[2], kernel.packet[2]);
+  pstore<half>(in[3], kernel.packet[3]);
+
+  EIGEN_ALIGN64 half out[4][16];
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      out[i][j] = in[j][4 * i];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j + 4] = in[j][4 * i + 1];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j + 8] = in[j][4 * i + 2];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j + 12] = in[j][4 * i + 3];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet16h>(out[0]);
+  kernel.packet[1] = pload<Packet16h>(out[1]);
+  kernel.packet[2] = pload<Packet16h>(out[2]);
+  kernel.packet[3] = pload<Packet16h>(out[3]);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 8>& kernel) {
+  __m128i a = _mm_castph_si128(kernel.packet[0]);
+  __m128i b = _mm_castph_si128(kernel.packet[1]);
+  __m128i c = _mm_castph_si128(kernel.packet[2]);
+  __m128i d = _mm_castph_si128(kernel.packet[3]);
+  __m128i e = _mm_castph_si128(kernel.packet[4]);
+  __m128i f = _mm_castph_si128(kernel.packet[5]);
+  __m128i g = _mm_castph_si128(kernel.packet[6]);
+  __m128i h = _mm_castph_si128(kernel.packet[7]);
+
+  __m128i a03b03 = _mm_unpacklo_epi16(a, b);
+  __m128i c03d03 = _mm_unpacklo_epi16(c, d);
+  __m128i e03f03 = _mm_unpacklo_epi16(e, f);
+  __m128i g03h03 = _mm_unpacklo_epi16(g, h);
+  __m128i a47b47 = _mm_unpackhi_epi16(a, b);
+  __m128i c47d47 = _mm_unpackhi_epi16(c, d);
+  __m128i e47f47 = _mm_unpackhi_epi16(e, f);
+  __m128i g47h47 = _mm_unpackhi_epi16(g, h);
+
+  __m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
+  __m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
+  __m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
+  __m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
+  __m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
+  __m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
+  __m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
+  __m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
+
+  __m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
+  __m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
+  __m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
+  __m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
+  __m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
+
+  kernel.packet[0] = _mm_castsi128_ph(a0b0c0d0e0f0g0h0);
+  kernel.packet[1] = _mm_castsi128_ph(a1b1c1d1e1f1g1h1);
+  kernel.packet[2] = _mm_castsi128_ph(a2b2c2d2e2f2g2h2);
+  kernel.packet[3] = _mm_castsi128_ph(a3b3c3d3e3f3g3h3);
+  kernel.packet[4] = _mm_castsi128_ph(a4b4c4d4e4f4g4h4);
+  kernel.packet[5] = _mm_castsi128_ph(a5b5c5d5e5f5g5h5);
+  kernel.packet[6] = _mm_castsi128_ph(a6b6c6d6e6f6g6h6);
+  kernel.packet[7] = _mm_castsi128_ph(a7b7c7d7e7f7g7h7);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8h, 4>& kernel) {
+  EIGEN_ALIGN32 Eigen::half in[4][8];
+  pstore<Eigen::half>(in[0], kernel.packet[0]);
+  pstore<Eigen::half>(in[1], kernel.packet[1]);
+  pstore<Eigen::half>(in[2], kernel.packet[2]);
+  pstore<Eigen::half>(in[3], kernel.packet[3]);
+
+  EIGEN_ALIGN32 Eigen::half out[4][8];
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      out[i][j] = in[j][2 * i];
+    }
+    for (int j = 0; j < 4; ++j) {
+      out[i][j + 4] = in[j][2 * i + 1];
+    }
+  }
+
+  kernel.packet[0] = pload<Packet8h>(out[0]);
+  kernel.packet[1] = pload<Packet8h>(out[1]);
+  kernel.packet[2] = pload<Packet8h>(out[2]);
+  kernel.packet[3] = pload<Packet8h>(out[3]);
+}
+
+// preverse
+
+template <>
+EIGEN_STRONG_INLINE Packet32h preverse(const Packet32h& a) {
+  return _mm512_permutexvar_ph(_mm512_set_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+                                                20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31),
+                               a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h preverse(const Packet16h& a) {
+  __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+  return _mm256_castsi256_ph(_mm256_insertf128_si256(
+      _mm256_castsi128_si256(_mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castph_si256(a), 1), m)),
+      _mm_shuffle_epi8(_mm256_extractf128_si256(_mm256_castph_si256(a), 0), m), 1));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h preverse(const Packet8h& a) {
+  __m128i m = _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
+  return _mm_castsi128_ph(_mm_shuffle_epi8(_mm_castph_si128(a), m));
+}
+
+// pscatter
+
+template <>
+EIGEN_STRONG_INLINE void pscatter<half, Packet32h>(half* to, const Packet32h& from, Index stride) {
+  EIGEN_ALIGN64 half aux[32];
+  pstore(aux, from);
+
+  EIGEN_UNROLL_LOOP
+  for (int i = 0; i < 32; i++) {
+    to[stride * i] = aux[i];
+  }
+}
+template <>
+EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride) {
+  EIGEN_ALIGN64 half aux[16];
+  pstore(aux, from);
+  to[stride * 0] = aux[0];
+  to[stride * 1] = aux[1];
+  to[stride * 2] = aux[2];
+  to[stride * 3] = aux[3];
+  to[stride * 4] = aux[4];
+  to[stride * 5] = aux[5];
+  to[stride * 6] = aux[6];
+  to[stride * 7] = aux[7];
+  to[stride * 8] = aux[8];
+  to[stride * 9] = aux[9];
+  to[stride * 10] = aux[10];
+  to[stride * 11] = aux[11];
+  to[stride * 12] = aux[12];
+  to[stride * 13] = aux[13];
+  to[stride * 14] = aux[14];
+  to[stride * 15] = aux[15];
+}
+
+template <>
+EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride) {
+  EIGEN_ALIGN32 Eigen::half aux[8];
+  pstore(aux, from);
+  to[stride * 0] = aux[0];
+  to[stride * 1] = aux[1];
+  to[stride * 2] = aux[2];
+  to[stride * 3] = aux[3];
+  to[stride * 4] = aux[4];
+  to[stride * 5] = aux[5];
+  to[stride * 6] = aux[6];
+  to[stride * 7] = aux[7];
+}
+
+// pgather
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pgather<Eigen::half, Packet32h>(const Eigen::half* from, Index stride) {
+  return _mm512_set_ph(from[31 * stride].x, from[30 * stride].x, from[29 * stride].x, from[28 * stride].x,
+                       from[27 * stride].x, from[26 * stride].x, from[25 * stride].x, from[24 * stride].x,
+                       from[23 * stride].x, from[22 * stride].x, from[21 * stride].x, from[20 * stride].x,
+                       from[19 * stride].x, from[18 * stride].x, from[17 * stride].x, from[16 * stride].x,
+                       from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
+                       from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x,
+                       from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x,
+                       from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride) {
+  return _mm256_set_ph(from[15 * stride].x, from[14 * stride].x, from[13 * stride].x, from[12 * stride].x,
+                       from[11 * stride].x, from[10 * stride].x, from[9 * stride].x, from[8 * stride].x,
+                       from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x,
+                       from[3 * stride].x, from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride) {
+  return _mm_set_ph(from[7 * stride].x, from[6 * stride].x, from[5 * stride].x, from[4 * stride].x, from[3 * stride].x,
+                    from[2 * stride].x, from[1 * stride].x, from[0 * stride].x);
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_FP16_AVX512_H
diff --git a/inst/include/Eigen/src/Core/arch/AVX512/Reductions.h b/inst/include/Eigen/src/Core/arch/AVX512/Reductions.h
new file mode 100644
index 00000000..f7b4c25a
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AVX512/Reductions.h
@@ -0,0 +1,297 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REDUCTIONS_AVX512_H
+#define EIGEN_REDUCTIONS_AVX512_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16i -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE int predux(const Packet16i& a) {
+  return _mm512_reduce_add_epi32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux_mul(const Packet16i& a) {
+  return _mm512_reduce_mul_epi32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux_min(const Packet16i& a) {
+  return _mm512_reduce_min_epi32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux_max(const Packet16i& a) {
+  return _mm512_reduce_max_epi32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet16i& a) {
+  return _mm512_reduce_or_epi32(a) != 0;
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8l -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE int64_t predux(const Packet8l& a) {
+  return _mm512_reduce_add_epi64(a);
+}
+
+#if EIGEN_COMP_MSVC
+// MSVC's _mm512_reduce_mul_epi64 is borked, at least up to and including 1939.
+//    alignas(64) int64_t data[] = { 1,1,-1,-1,1,-1,-1,-1 };
+//    int64_t out = _mm512_reduce_mul_epi64(_mm512_load_epi64(data));
+// produces garbage: 4294967295.  It seems to happen whenever the output is supposed to be negative.
+// Fall back to a manual approach:
+template <>
+EIGEN_STRONG_INLINE int64_t predux_mul(const Packet8l& a) {
+  Packet4l lane0 = _mm512_extracti64x4_epi64(a, 0);
+  Packet4l lane1 = _mm512_extracti64x4_epi64(a, 1);
+  return predux_mul(pmul(lane0, lane1));
+}
+#else
+template <>
+EIGEN_STRONG_INLINE int64_t predux_mul<Packet8l>(const Packet8l& a) {
+  return _mm512_reduce_mul_epi64(a);
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE int64_t predux_min(const Packet8l& a) {
+  return _mm512_reduce_min_epi64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE int64_t predux_max(const Packet8l& a) {
+  return _mm512_reduce_max_epi64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8l& a) {
+  return _mm512_reduce_or_epi64(a) != 0;
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16f -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE float predux(const Packet16f& a) {
+  return _mm512_reduce_add_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_mul(const Packet16f& a) {
+  return _mm512_reduce_mul_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) {
+  return _mm512_reduce_min_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PropagateNumbers>(const Packet16f& a) {
+  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
+  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
+  return predux_min<PropagateNumbers>(pmin<PropagateNumbers>(lane0, lane1));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PropagateNaN>(const Packet16f& a) {
+  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
+  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
+  return predux_min<PropagateNaN>(pmin<PropagateNaN>(lane0, lane1));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) {
+  return _mm512_reduce_max_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PropagateNumbers>(const Packet16f& a) {
+  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
+  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
+  return predux_max<PropagateNumbers>(pmax<PropagateNumbers>(lane0, lane1));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PropagateNaN>(const Packet16f& a) {
+  Packet8f lane0 = _mm512_extractf32x8_ps(a, 0);
+  Packet8f lane1 = _mm512_extractf32x8_ps(a, 1);
+  return predux_max<PropagateNaN>(pmax<PropagateNaN>(lane0, lane1));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet16f& a) {
+  return _mm512_reduce_or_epi32(_mm512_castps_si512(a)) != 0;
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet8d -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE double predux(const Packet8d& a) {
+  return _mm512_reduce_add_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_mul(const Packet8d& a) {
+  return _mm512_reduce_mul_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min(const Packet8d& a) {
+  return _mm512_reduce_min_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min<PropagateNumbers>(const Packet8d& a) {
+  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
+  return predux_min<PropagateNumbers>(pmin<PropagateNumbers>(lane0, lane1));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min<PropagateNaN>(const Packet8d& a) {
+  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
+  return predux_min<PropagateNaN>(pmin<PropagateNaN>(lane0, lane1));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) {
+  return _mm512_reduce_max_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max<PropagateNumbers>(const Packet8d& a) {
+  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
+  return predux_max<PropagateNumbers>(pmax<PropagateNumbers>(lane0, lane1));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max<PropagateNaN>(const Packet8d& a) {
+  Packet4d lane0 = _mm512_extractf64x4_pd(a, 0);
+  Packet4d lane1 = _mm512_extractf64x4_pd(a, 1);
+  return predux_max<PropagateNaN>(pmax<PropagateNaN>(lane0, lane1));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8d& a) {
+  return _mm512_reduce_or_epi64(_mm512_castpd_si512(a)) != 0;
+}
+
+#ifndef EIGEN_VECTORIZE_AVX512FP16
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16h -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE half predux(const Packet16h& from) {
+  return half(predux(half2float(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_mul(const Packet16h& from) {
+  return half(predux_mul(half2float(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min(const Packet16h& from) {
+  return half(predux_min(half2float(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<PropagateNumbers>(const Packet16h& from) {
+  return half(predux_min<PropagateNumbers>(half2float(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_min<PropagateNaN>(const Packet16h& from) {
+  return half(predux_min<PropagateNaN>(half2float(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max(const Packet16h& from) {
+  return half(predux_max(half2float(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max<PropagateNumbers>(const Packet16h& from) {
+  return half(predux_max<PropagateNumbers>(half2float(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE half predux_max<PropagateNaN>(const Packet16h& from) {
+  return half(predux_max<PropagateNaN>(half2float(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet16h& a) {
+  return predux_any<Packet8i>(a.m_val);
+}
+#endif
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16bf -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux<Packet16f>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_mul(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_mul<Packet16f>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_min<Packet16f>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<PropagateNumbers>(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_min<PropagateNumbers>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<PropagateNaN>(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_min<PropagateNaN>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_max(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max<PropagateNumbers>(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_max<PropagateNumbers>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max<PropagateNaN>(const Packet16bf& from) {
+  return static_cast<bfloat16>(predux_max<PropagateNaN>(Bf16ToF32(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet16bf& a) {
+  return predux_any<Packet8i>(a.m_val);
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_REDUCTIONS_AVX512_H
diff --git a/inst/include/Eigen/src/Core/arch/AVX512/TrsmKernel.h b/inst/include/Eigen/src/Core/arch/AVX512/TrsmKernel.h
new file mode 100644
index 00000000..c763b5fe
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AVX512/TrsmKernel.h
@@ -0,0 +1,1167 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2022 Intel Corporation
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CORE_ARCH_AVX512_TRSM_KERNEL_H
+#define EIGEN_CORE_ARCH_AVX512_TRSM_KERNEL_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+#if !defined(EIGEN_USE_AVX512_TRSM_KERNELS)
+#define EIGEN_USE_AVX512_TRSM_KERNELS 1
+#endif
+
+// TRSM kernels currently unconditionally rely on malloc with AVX512.
+// Disable them if malloc is explicitly disabled at compile-time.
+#ifdef EIGEN_NO_MALLOC
+#undef EIGEN_USE_AVX512_TRSM_KERNELS
+#define EIGEN_USE_AVX512_TRSM_KERNELS 0
+#endif
+
+#if EIGEN_USE_AVX512_TRSM_KERNELS
+#if !defined(EIGEN_USE_AVX512_TRSM_R_KERNELS)
+#define EIGEN_USE_AVX512_TRSM_R_KERNELS 1
+#endif
+#if !defined(EIGEN_USE_AVX512_TRSM_L_KERNELS)
+#define EIGEN_USE_AVX512_TRSM_L_KERNELS 1
+#endif
+#else  // EIGEN_USE_AVX512_TRSM_KERNELS == 0
+#define EIGEN_USE_AVX512_TRSM_R_KERNELS 0
+#define EIGEN_USE_AVX512_TRSM_L_KERNELS 0
+#endif
+
+// Need this for some std::min calls.
+#ifdef min
+#undef min
+#endif
+
+namespace Eigen {
+namespace internal {
+
+#define EIGEN_AVX_MAX_NUM_ACC (int64_t(24))
+#define EIGEN_AVX_MAX_NUM_ROW (int64_t(8))  // Denoted L in code.
+#define EIGEN_AVX_MAX_K_UNROL (int64_t(4))
+#define EIGEN_AVX_B_LOAD_SETS (int64_t(2))
+#define EIGEN_AVX_MAX_A_BCAST (int64_t(2))
+typedef Packet16f vecFullFloat;
+typedef Packet8d vecFullDouble;
+typedef Packet8f vecHalfFloat;
+typedef Packet4d vecHalfDouble;
+
+// Compile-time unrolls are implemented here.
+// Note: this depends on macros and typedefs above.
+#include "TrsmUnrolls.inc"
+
+#if (EIGEN_USE_AVX512_TRSM_KERNELS) && (EIGEN_COMP_CLANG != 0)
+/**
+ * For smaller problem sizes, and certain compilers, using the optimized kernels trsmKernelL/R directly
+ * is faster than the packed versions in TriangularSolverMatrix.h.
+ *
+ * The current heuristic is based on having having all arrays used in the largest gemm-update
+ * in triSolve fit in roughly L2Cap (percentage) of the L2 cache. These cutoffs are a bit conservative and could be
+ * larger for some trsm cases.
+ * The formula:
+ *
+ *   (L*M + M*N + L*N)*sizeof(Scalar) < L2Cache*L2Cap
+ *
+ *  L = number of rows to solve at a time
+ *  N = number of rhs
+ *  M = Dimension of triangular matrix
+ *
+ */
+#if !defined(EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS)
+#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS 1
+#endif
+
+#if EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS
+
+#if EIGEN_USE_AVX512_TRSM_R_KERNELS
+#if !defined(EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS)
+#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS 1
+#endif  // !defined(EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS)
+#endif
+
+#if EIGEN_USE_AVX512_TRSM_L_KERNELS
+#if !defined(EIGEN_ENABLE_AVX512_NOCOPY_TRSM_L_CUTOFFS)
+#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_L_CUTOFFS 1
+#endif
+#endif  // EIGEN_USE_AVX512_TRSM_L_KERNELS
+
+#else  // EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS == 0
+#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS 0
+#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_L_CUTOFFS 0
+#endif  // EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS
+
+template <typename Scalar>
+int64_t avx512_trsm_cutoff(int64_t L2Size, int64_t N, double L2Cap) {
+  const int64_t U3 = 3 * packet_traits<Scalar>::size;
+  const int64_t MaxNb = 5 * U3;
+  int64_t Nb = std::min(MaxNb, N);
+  double cutoff_d =
+      (((L2Size * L2Cap) / (sizeof(Scalar))) - (EIGEN_AVX_MAX_NUM_ROW)*Nb) / ((EIGEN_AVX_MAX_NUM_ROW) + Nb);
+  int64_t cutoff_l = static_cast<int64_t>(cutoff_d);
+  return (cutoff_l / EIGEN_AVX_MAX_NUM_ROW) * EIGEN_AVX_MAX_NUM_ROW;
+}
+#else  // !(EIGEN_USE_AVX512_TRSM_KERNELS) || !(EIGEN_COMP_CLANG != 0)
+#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_CUTOFFS 0
+#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS 0
+#define EIGEN_ENABLE_AVX512_NOCOPY_TRSM_L_CUTOFFS 0
+#endif
+
+/**
+ * Used by gemmKernel for the case A/B row-major and C col-major.
+ */
+template <typename Scalar, typename vec, int64_t unrollM, int64_t unrollN, bool remM, bool remN>
+EIGEN_ALWAYS_INLINE void transStoreC(PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, Scalar *C_arr,
+                                     int64_t LDC, int64_t remM_ = 0, int64_t remN_ = 0) {
+  EIGEN_UNUSED_VARIABLE(remN_);
+  EIGEN_UNUSED_VARIABLE(remM_);
+  using urolls = unrolls::trans<Scalar>;
+
+  constexpr int64_t U3 = urolls::PacketSize * 3;
+  constexpr int64_t U2 = urolls::PacketSize * 2;
+  constexpr int64_t U1 = urolls::PacketSize * 1;
+
+  static_assert(unrollN == U1 || unrollN == U2 || unrollN == U3, "unrollN should be a multiple of PacketSize");
+  static_assert(unrollM == EIGEN_AVX_MAX_NUM_ROW, "unrollM should be equal to EIGEN_AVX_MAX_NUM_ROW");
+
+  urolls::template transpose<unrollN, 0>(zmm);
+  EIGEN_IF_CONSTEXPR(unrollN > U2) urolls::template transpose<unrollN, 2>(zmm);
+  EIGEN_IF_CONSTEXPR(unrollN > U1) urolls::template transpose<unrollN, 1>(zmm);
+
+  static_assert((remN && unrollN == U1) || !remN, "When handling N remainder set unrollN=U1");
+  EIGEN_IF_CONSTEXPR(!remN) {
+    urolls::template storeC<std::min(unrollN, U1), unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+    EIGEN_IF_CONSTEXPR(unrollN > U1) {
+      constexpr int64_t unrollN_ = std::min(unrollN - U1, U1);
+      urolls::template storeC<unrollN_, unrollN, 1, remM>(C_arr + U1 * LDC, LDC, zmm, remM_);
+    }
+    EIGEN_IF_CONSTEXPR(unrollN > U2) {
+      constexpr int64_t unrollN_ = std::min(unrollN - U2, U1);
+      urolls::template storeC<unrollN_, unrollN, 2, remM>(C_arr + U2 * LDC, LDC, zmm, remM_);
+    }
+  }
+  else {
+    EIGEN_IF_CONSTEXPR((std::is_same<Scalar, float>::value)) {
+      // Note: without "if constexpr" this section of code will also be
+      // parsed by the compiler so each of the storeC will still be instantiated.
+      // We use enable_if in aux_storeC to set it to an empty function for
+      // these cases.
+      if (remN_ == 15)
+        urolls::template storeC<15, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 14)
+        urolls::template storeC<14, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 13)
+        urolls::template storeC<13, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 12)
+        urolls::template storeC<12, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 11)
+        urolls::template storeC<11, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 10)
+        urolls::template storeC<10, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 9)
+        urolls::template storeC<9, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 8)
+        urolls::template storeC<8, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 7)
+        urolls::template storeC<7, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 6)
+        urolls::template storeC<6, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 5)
+        urolls::template storeC<5, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 4)
+        urolls::template storeC<4, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 3)
+        urolls::template storeC<3, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 2)
+        urolls::template storeC<2, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 1)
+        urolls::template storeC<1, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+    }
+    else {
+      if (remN_ == 7)
+        urolls::template storeC<7, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 6)
+        urolls::template storeC<6, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 5)
+        urolls::template storeC<5, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 4)
+        urolls::template storeC<4, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 3)
+        urolls::template storeC<3, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 2)
+        urolls::template storeC<2, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+      else if (remN_ == 1)
+        urolls::template storeC<1, unrollN, 0, remM>(C_arr, LDC, zmm, remM_);
+    }
+  }
+}
+
+/**
+ * GEMM like operation for trsm panel updates.
+ * Computes: C -= A*B
+ * K must be multiple of 4.
+ *
+ * Unrolls used are {1,2,4,8}x{U1,U2,U3};
+ * For good performance we want K to be large with M/N relatively small, but also large enough
+ * to use the {8,U3} unroll block.
+ *
+ * isARowMajor: is A_arr row-major?
+ * isCRowMajor: is C_arr row-major? (B_arr is assumed to be row-major).
+ * isAdd: C += A*B or C -= A*B (used by trsm)
+ * handleKRem: Handle arbitrary K? This is not needed for trsm.
+ */
+template <typename Scalar, bool isARowMajor, bool isCRowMajor, bool isAdd, bool handleKRem>
+void gemmKernel(Scalar *A_arr, Scalar *B_arr, Scalar *C_arr, int64_t M, int64_t N, int64_t K, int64_t LDA, int64_t LDB,
+                int64_t LDC) {
+  using urolls = unrolls::gemm<Scalar, isAdd>;
+  constexpr int64_t U3 = urolls::PacketSize * 3;
+  constexpr int64_t U2 = urolls::PacketSize * 2;
+  constexpr int64_t U1 = urolls::PacketSize * 1;
+  using vec = typename std::conditional<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>::type;
+  int64_t N_ = (N / U3) * U3;
+  int64_t M_ = (M / EIGEN_AVX_MAX_NUM_ROW) * EIGEN_AVX_MAX_NUM_ROW;
+  int64_t K_ = (K / EIGEN_AVX_MAX_K_UNROL) * EIGEN_AVX_MAX_K_UNROL;
+  int64_t j = 0;
+  for (; j < N_; j += U3) {
+    constexpr int64_t EIGEN_AVX_MAX_B_LOAD = EIGEN_AVX_B_LOAD_SETS * 3;
+    int64_t i = 0;
+    for (; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)], *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<3, EIGEN_AVX_MAX_NUM_ROW>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 3, EIGEN_AVX_MAX_NUM_ROW, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
+      }
+      EIGEN_IF_CONSTEXPR(handleKRem) {
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 3, EIGEN_AVX_MAX_NUM_ROW, 1, EIGEN_AVX_B_LOAD_SETS * 3,
+                                       EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+          B_t += LDB;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
+        }
+      }
+      EIGEN_IF_CONSTEXPR(isCRowMajor) {
+        urolls::template updateC<3, EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<3, EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i * LDC + j], LDC, zmm);
+      }
+      else {
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U3, false, false>(zmm, &C_arr[i + j * LDC], LDC);
+      }
+    }
+    if (M - i >= 4) {  // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<3, 4>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 3, 4, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_B_LOAD_SETS * 3,
+                                     EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
+      }
+      EIGEN_IF_CONSTEXPR(handleKRem) {
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 3, 4, 1, EIGEN_AVX_B_LOAD_SETS * 3, EIGEN_AVX_MAX_A_BCAST>(
+              B_t, A_t, LDB, LDA, zmm);
+          B_t += LDB;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
+        }
+      }
+      EIGEN_IF_CONSTEXPR(isCRowMajor) {
+        urolls::template updateC<3, 4>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<3, 4>(&C_arr[i * LDC + j], LDC, zmm);
+      }
+      else {
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U3, true, false>(zmm, &C_arr[i + j * LDC], LDC, 4);
+      }
+      i += 4;
+    }
+    if (M - i >= 2) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<3, 2>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 3, 2, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_B_LOAD_SETS * 3,
+                                     EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
+      }
+      EIGEN_IF_CONSTEXPR(handleKRem) {
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 3, 2, 1, EIGEN_AVX_B_LOAD_SETS * 3, EIGEN_AVX_MAX_A_BCAST>(
+              B_t, A_t, LDB, LDA, zmm);
+          B_t += LDB;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
+        }
+      }
+      EIGEN_IF_CONSTEXPR(isCRowMajor) {
+        urolls::template updateC<3, 2>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<3, 2>(&C_arr[i * LDC + j], LDC, zmm);
+      }
+      else {
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U3, true, false>(zmm, &C_arr[i + j * LDC], LDC, 2);
+      }
+      i += 2;
+    }
+    if (M - i > 0) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<3, 1>(zmm);
+      {
+        for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+          urolls::template microKernel<isARowMajor, 3, 1, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_B_LOAD_SETS * 3, 1>(
+              B_t, A_t, LDB, LDA, zmm);
+          B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+          else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
+        }
+        EIGEN_IF_CONSTEXPR(handleKRem) {
+          for (int64_t k = K_; k < K; k++) {
+            urolls::template microKernel<isARowMajor, 3, 1, 1, EIGEN_AVX_B_LOAD_SETS * 3, 1>(B_t, A_t, LDB, LDA, zmm);
+            B_t += LDB;
+            EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+            else A_t += LDA;
+          }
+        }
+        EIGEN_IF_CONSTEXPR(isCRowMajor) {
+          urolls::template updateC<3, 1>(&C_arr[i * LDC + j], LDC, zmm);
+          urolls::template storeC<3, 1>(&C_arr[i * LDC + j], LDC, zmm);
+        }
+        else {
+          transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U3, true, false>(zmm, &C_arr[i + j * LDC], LDC, 1);
+        }
+      }
+    }
+  }
+  if (N - j >= U2) {
+    constexpr int64_t EIGEN_AVX_MAX_B_LOAD = EIGEN_AVX_B_LOAD_SETS * 2;
+    int64_t i = 0;
+    for (; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)], *B_t = &B_arr[0 * LDB + j];
+      EIGEN_IF_CONSTEXPR(isCRowMajor) B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<2, EIGEN_AVX_MAX_NUM_ROW>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 2, EIGEN_AVX_MAX_NUM_ROW, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
+      }
+      EIGEN_IF_CONSTEXPR(handleKRem) {
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 2, EIGEN_AVX_MAX_NUM_ROW, 1, EIGEN_AVX_MAX_B_LOAD,
+                                       EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+          B_t += LDB;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
+        }
+      }
+      EIGEN_IF_CONSTEXPR(isCRowMajor) {
+        urolls::template updateC<2, EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<2, EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i * LDC + j], LDC, zmm);
+      }
+      else {
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U2, false, false>(zmm, &C_arr[i + j * LDC], LDC);
+      }
+    }
+    if (M - i >= 4) {  // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<2, 4>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 2, 4, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
+      }
+      EIGEN_IF_CONSTEXPR(handleKRem) {
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 2, 4, 1, EIGEN_AVX_MAX_B_LOAD, EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB,
+                                                                                                          LDA, zmm);
+          B_t += LDB;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
+        }
+      }
+      EIGEN_IF_CONSTEXPR(isCRowMajor) {
+        urolls::template updateC<2, 4>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<2, 4>(&C_arr[i * LDC + j], LDC, zmm);
+      }
+      else {
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U2, true, false>(zmm, &C_arr[i + j * LDC], LDC, 4);
+      }
+      i += 4;
+    }
+    if (M - i >= 2) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<2, 2>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 2, 2, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
+      }
+      EIGEN_IF_CONSTEXPR(handleKRem) {
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 2, 2, 1, EIGEN_AVX_MAX_B_LOAD, EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB,
+                                                                                                          LDA, zmm);
+          B_t += LDB;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
+        }
+      }
+      EIGEN_IF_CONSTEXPR(isCRowMajor) {
+        urolls::template updateC<2, 2>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<2, 2>(&C_arr[i * LDC + j], LDC, zmm);
+      }
+      else {
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U2, true, false>(zmm, &C_arr[i + j * LDC], LDC, 2);
+      }
+      i += 2;
+    }
+    if (M - i > 0) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<2, 1>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 2, 1, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD, 1>(B_t, A_t, LDB,
+                                                                                                        LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
+      }
+      EIGEN_IF_CONSTEXPR(handleKRem) {
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 2, 1, 1, EIGEN_AVX_MAX_B_LOAD, 1>(B_t, A_t, LDB, LDA, zmm);
+          B_t += LDB;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
+        }
+      }
+      EIGEN_IF_CONSTEXPR(isCRowMajor) {
+        urolls::template updateC<2, 1>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<2, 1>(&C_arr[i * LDC + j], LDC, zmm);
+      }
+      else {
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U2, true, false>(zmm, &C_arr[i + j * LDC], LDC, 1);
+      }
+    }
+    j += U2;
+  }
+  if (N - j >= U1) {
+    constexpr int64_t EIGEN_AVX_MAX_B_LOAD = EIGEN_AVX_B_LOAD_SETS * 1;
+    int64_t i = 0;
+    for (; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)], *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<1, EIGEN_AVX_MAX_NUM_ROW>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 1, EIGEN_AVX_MAX_NUM_ROW, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
+      }
+      EIGEN_IF_CONSTEXPR(handleKRem) {
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 1, EIGEN_AVX_MAX_NUM_ROW, 1, EIGEN_AVX_B_LOAD_SETS * 1,
+                                       EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+          B_t += LDB;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
+        }
+      }
+      EIGEN_IF_CONSTEXPR(isCRowMajor) {
+        urolls::template updateC<1, EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<1, EIGEN_AVX_MAX_NUM_ROW>(&C_arr[i * LDC + j], LDC, zmm);
+      }
+      else {
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U1, false, false>(zmm, &C_arr[i + j * LDC], LDC);
+      }
+    }
+    if (M - i >= 4) {  // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<1, 4>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 1, 4, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
+      }
+      EIGEN_IF_CONSTEXPR(handleKRem) {
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 1, 4, 1, EIGEN_AVX_MAX_B_LOAD, EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB,
+                                                                                                          LDA, zmm);
+          B_t += LDB;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
+        }
+      }
+      EIGEN_IF_CONSTEXPR(isCRowMajor) {
+        urolls::template updateC<1, 4>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<1, 4>(&C_arr[i * LDC + j], LDC, zmm);
+      }
+      else {
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U1, true, false>(zmm, &C_arr[i + j * LDC], LDC, 4);
+      }
+      i += 4;
+    }
+    if (M - i >= 2) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<1, 2>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 1, 2, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB, LDA, zmm);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
+      }
+      EIGEN_IF_CONSTEXPR(handleKRem) {
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 1, 2, 1, EIGEN_AVX_MAX_B_LOAD, EIGEN_AVX_MAX_A_BCAST>(B_t, A_t, LDB,
+                                                                                                          LDA, zmm);
+          B_t += LDB;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
+        }
+      }
+      EIGEN_IF_CONSTEXPR(isCRowMajor) {
+        urolls::template updateC<1, 2>(&C_arr[i * LDC + j], LDC, zmm);
+        urolls::template storeC<1, 2>(&C_arr[i * LDC + j], LDC, zmm);
+      }
+      else {
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U1, true, false>(zmm, &C_arr[i + j * LDC], LDC, 2);
+      }
+      i += 2;
+    }
+    if (M - i > 0) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<1, 1>(zmm);
+      {
+        for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+          urolls::template microKernel<isARowMajor, 1, 1, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD, 1>(B_t, A_t, LDB,
+                                                                                                          LDA, zmm);
+          B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+          else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
+        }
+        EIGEN_IF_CONSTEXPR(handleKRem) {
+          for (int64_t k = K_; k < K; k++) {
+            urolls::template microKernel<isARowMajor, 1, 1, 1, EIGEN_AVX_B_LOAD_SETS * 1, 1>(B_t, A_t, LDB, LDA, zmm);
+            B_t += LDB;
+            EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+            else A_t += LDA;
+          }
+        }
+        EIGEN_IF_CONSTEXPR(isCRowMajor) {
+          urolls::template updateC<1, 1>(&C_arr[i * LDC + j], LDC, zmm);
+          urolls::template storeC<1, 1>(&C_arr[i * LDC + j], LDC, zmm);
+        }
+        else {
+          transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U1, true, false>(zmm, &C_arr[i + j * LDC], LDC, 1);
+        }
+      }
+    }
+    j += U1;
+  }
+  if (N - j > 0) {
+    constexpr int64_t EIGEN_AVX_MAX_B_LOAD = EIGEN_AVX_B_LOAD_SETS * 1;
+    int64_t i = 0;
+    for (; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<1, EIGEN_AVX_MAX_NUM_ROW>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 1, EIGEN_AVX_MAX_NUM_ROW, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST, true>(B_t, A_t, LDB, LDA, zmm, N - j);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
+      }
+      EIGEN_IF_CONSTEXPR(handleKRem) {
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 1, EIGEN_AVX_MAX_NUM_ROW, 1, EIGEN_AVX_MAX_B_LOAD,
+                                       EIGEN_AVX_MAX_A_BCAST, true>(B_t, A_t, LDB, LDA, zmm, N - j);
+          B_t += LDB;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
+        }
+      }
+      EIGEN_IF_CONSTEXPR(isCRowMajor) {
+        urolls::template updateC<1, EIGEN_AVX_MAX_NUM_ROW, true>(&C_arr[i * LDC + j], LDC, zmm, N - j);
+        urolls::template storeC<1, EIGEN_AVX_MAX_NUM_ROW, true>(&C_arr[i * LDC + j], LDC, zmm, N - j);
+      }
+      else {
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U1, false, true>(zmm, &C_arr[i + j * LDC], LDC, 0, N - j);
+      }
+    }
+    if (M - i >= 4) {  // Note: this block assumes EIGEN_AVX_MAX_NUM_ROW = 8. Should be removed otherwise
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<1, 4>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 1, 4, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST, true>(B_t, A_t, LDB, LDA, zmm, N - j);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
+      }
+      EIGEN_IF_CONSTEXPR(handleKRem) {
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 1, 4, 1, EIGEN_AVX_MAX_B_LOAD, EIGEN_AVX_MAX_A_BCAST, true>(
+              B_t, A_t, LDB, LDA, zmm, N - j);
+          B_t += LDB;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
+        }
+      }
+      EIGEN_IF_CONSTEXPR(isCRowMajor) {
+        urolls::template updateC<1, 4, true>(&C_arr[i * LDC + j], LDC, zmm, N - j);
+        urolls::template storeC<1, 4, true>(&C_arr[i * LDC + j], LDC, zmm, N - j);
+      }
+      else {
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U1, true, true>(zmm, &C_arr[i + j * LDC], LDC, 4, N - j);
+      }
+      i += 4;
+    }
+    if (M - i >= 2) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<1, 2>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 1, 2, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD,
+                                     EIGEN_AVX_MAX_A_BCAST, true>(B_t, A_t, LDB, LDA, zmm, N - j);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
+      }
+      EIGEN_IF_CONSTEXPR(handleKRem) {
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 1, 2, 1, EIGEN_AVX_MAX_B_LOAD, EIGEN_AVX_MAX_A_BCAST, true>(
+              B_t, A_t, LDB, LDA, zmm, N - j);
+          B_t += LDB;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
+        }
+      }
+      EIGEN_IF_CONSTEXPR(isCRowMajor) {
+        urolls::template updateC<1, 2, true>(&C_arr[i * LDC + j], LDC, zmm, N - j);
+        urolls::template storeC<1, 2, true>(&C_arr[i * LDC + j], LDC, zmm, N - j);
+      }
+      else {
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U1, true, true>(zmm, &C_arr[i + j * LDC], LDC, 2, N - j);
+      }
+      i += 2;
+    }
+    if (M - i > 0) {
+      Scalar *A_t = &A_arr[idA<isARowMajor>(i, 0, LDA)];
+      Scalar *B_t = &B_arr[0 * LDB + j];
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> zmm;
+      urolls::template setzero<1, 1>(zmm);
+      for (int64_t k = 0; k < K_; k += EIGEN_AVX_MAX_K_UNROL) {
+        urolls::template microKernel<isARowMajor, 1, 1, EIGEN_AVX_MAX_K_UNROL, EIGEN_AVX_MAX_B_LOAD, 1, true>(
+            B_t, A_t, LDB, LDA, zmm, N - j);
+        B_t += EIGEN_AVX_MAX_K_UNROL * LDB;
+        EIGEN_IF_CONSTEXPR(isARowMajor) A_t += EIGEN_AVX_MAX_K_UNROL;
+        else A_t += EIGEN_AVX_MAX_K_UNROL * LDA;
+      }
+      EIGEN_IF_CONSTEXPR(handleKRem) {
+        for (int64_t k = K_; k < K; k++) {
+          urolls::template microKernel<isARowMajor, 1, 1, 1, EIGEN_AVX_MAX_B_LOAD, 1, true>(B_t, A_t, LDB, LDA, zmm,
+                                                                                            N - j);
+          B_t += LDB;
+          EIGEN_IF_CONSTEXPR(isARowMajor) A_t++;
+          else A_t += LDA;
+        }
+      }
+      EIGEN_IF_CONSTEXPR(isCRowMajor) {
+        urolls::template updateC<1, 1, true>(&C_arr[i * LDC + j], LDC, zmm, N - j);
+        urolls::template storeC<1, 1, true>(&C_arr[i * LDC + j], LDC, zmm, N - j);
+      }
+      else {
+        transStoreC<Scalar, vec, EIGEN_AVX_MAX_NUM_ROW, U1, true, true>(zmm, &C_arr[i + j * LDC], LDC, 1, N - j);
+      }
+    }
+  }
+}
+
+/**
+ * Triangular solve kernel with A on left with K number of rhs. dim(A) = unrollM
+ *
+ * unrollM: dimension of A matrix (triangular matrix). unrollM should be <= EIGEN_AVX_MAX_NUM_ROW
+ * isFWDSolve: is forward solve?
+ * isUnitDiag: is the diagonal of A all ones?
+ * The B matrix (RHS) is assumed to be row-major
+ */
+template <typename Scalar, typename vec, int64_t unrollM, bool isARowMajor, bool isFWDSolve, bool isUnitDiag>
+EIGEN_ALWAYS_INLINE void triSolveKernel(Scalar *A_arr, Scalar *B_arr, int64_t K, int64_t LDA, int64_t LDB) {
+  static_assert(unrollM <= EIGEN_AVX_MAX_NUM_ROW, "unrollM should be equal to EIGEN_AVX_MAX_NUM_ROW");
+  using urolls = unrolls::trsm<Scalar>;
+  constexpr int64_t U3 = urolls::PacketSize * 3;
+  constexpr int64_t U2 = urolls::PacketSize * 2;
+  constexpr int64_t U1 = urolls::PacketSize * 1;
+
+  PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> RHSInPacket;
+  PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> AInPacket;
+
+  int64_t k = 0;
+  while (K - k >= U3) {
+    urolls::template loadRHS<isFWDSolve, unrollM, 3>(B_arr + k, LDB, RHSInPacket);
+    urolls::template triSolveMicroKernel<isARowMajor, isFWDSolve, isUnitDiag, unrollM, 3>(A_arr, LDA, RHSInPacket,
+                                                                                          AInPacket);
+    urolls::template storeRHS<isFWDSolve, unrollM, 3>(B_arr + k, LDB, RHSInPacket);
+    k += U3;
+  }
+  if (K - k >= U2) {
+    urolls::template loadRHS<isFWDSolve, unrollM, 2>(B_arr + k, LDB, RHSInPacket);
+    urolls::template triSolveMicroKernel<isARowMajor, isFWDSolve, isUnitDiag, unrollM, 2>(A_arr, LDA, RHSInPacket,
+                                                                                          AInPacket);
+    urolls::template storeRHS<isFWDSolve, unrollM, 2>(B_arr + k, LDB, RHSInPacket);
+    k += U2;
+  }
+  if (K - k >= U1) {
+    urolls::template loadRHS<isFWDSolve, unrollM, 1>(B_arr + k, LDB, RHSInPacket);
+    urolls::template triSolveMicroKernel<isARowMajor, isFWDSolve, isUnitDiag, unrollM, 1>(A_arr, LDA, RHSInPacket,
+                                                                                          AInPacket);
+    urolls::template storeRHS<isFWDSolve, unrollM, 1>(B_arr + k, LDB, RHSInPacket);
+    k += U1;
+  }
+  if (K - k > 0) {
+    // Handle remaining number of RHS
+    urolls::template loadRHS<isFWDSolve, unrollM, 1, true>(B_arr + k, LDB, RHSInPacket, K - k);
+    urolls::template triSolveMicroKernel<isARowMajor, isFWDSolve, isUnitDiag, unrollM, 1>(A_arr, LDA, RHSInPacket,
+                                                                                          AInPacket);
+    urolls::template storeRHS<isFWDSolve, unrollM, 1, true>(B_arr + k, LDB, RHSInPacket, K - k);
+  }
+}
+
+/**
+ * Triangular solve routine with A on left and dimension of at most L with K number of rhs. This is essentially
+ * a wrapper for triSolveMicrokernel for M = {1,2,3,4,5,6,7,8}.
+ *
+ * isFWDSolve: is forward solve?
+ * isUnitDiag: is the diagonal of A all ones?
+ * The B matrix (RHS) is assumed to be row-major
+ */
+template <typename Scalar, bool isARowMajor, bool isFWDSolve, bool isUnitDiag>
+void triSolveKernelLxK(Scalar *A_arr, Scalar *B_arr, int64_t M, int64_t K, int64_t LDA, int64_t LDB) {
+  // Note: this assumes EIGEN_AVX_MAX_NUM_ROW = 8. Unrolls should be adjusted
+  // accordingly if EIGEN_AVX_MAX_NUM_ROW is smaller.
+  using vec = typename std::conditional<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>::type;
+  if (M == 8)
+    triSolveKernel<Scalar, vec, 8, isARowMajor, isFWDSolve, isUnitDiag>(A_arr, B_arr, K, LDA, LDB);
+  else if (M == 7)
+    triSolveKernel<Scalar, vec, 7, isARowMajor, isFWDSolve, isUnitDiag>(A_arr, B_arr, K, LDA, LDB);
+  else if (M == 6)
+    triSolveKernel<Scalar, vec, 6, isARowMajor, isFWDSolve, isUnitDiag>(A_arr, B_arr, K, LDA, LDB);
+  else if (M == 5)
+    triSolveKernel<Scalar, vec, 5, isARowMajor, isFWDSolve, isUnitDiag>(A_arr, B_arr, K, LDA, LDB);
+  else if (M == 4)
+    triSolveKernel<Scalar, vec, 4, isARowMajor, isFWDSolve, isUnitDiag>(A_arr, B_arr, K, LDA, LDB);
+  else if (M == 3)
+    triSolveKernel<Scalar, vec, 3, isARowMajor, isFWDSolve, isUnitDiag>(A_arr, B_arr, K, LDA, LDB);
+  else if (M == 2)
+    triSolveKernel<Scalar, vec, 2, isARowMajor, isFWDSolve, isUnitDiag>(A_arr, B_arr, K, LDA, LDB);
+  else if (M == 1)
+    triSolveKernel<Scalar, vec, 1, isARowMajor, isFWDSolve, isUnitDiag>(A_arr, B_arr, K, LDA, LDB);
+  return;
+}
+
+/**
+ * This routine is used to copy B to/from a temporary array (row-major) for cases where B is column-major.
+ *
+ * toTemp: true => copy to temporary array, false => copy from temporary array
+ * remM: true = need to handle remainder values for M (M < EIGEN_AVX_MAX_NUM_ROW)
+ *
+ */
+template <typename Scalar, bool toTemp = true, bool remM = false>
+EIGEN_ALWAYS_INLINE void copyBToRowMajor(Scalar *B_arr, int64_t LDB, int64_t K, Scalar *B_temp, int64_t LDB_,
+                                         int64_t remM_ = 0) {
+  EIGEN_UNUSED_VARIABLE(remM_);
+  using urolls = unrolls::transB<Scalar>;
+  using vecHalf = typename std::conditional<std::is_same<Scalar, float>::value, vecHalfFloat, vecFullDouble>::type;
+  PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> ymm;
+  constexpr int64_t U3 = urolls::PacketSize * 3;
+  constexpr int64_t U2 = urolls::PacketSize * 2;
+  constexpr int64_t U1 = urolls::PacketSize * 1;
+  int64_t K_ = K / U3 * U3;
+  int64_t k = 0;
+
+  for (; k < K_; k += U3) {
+    urolls::template transB_kernel<U3, toTemp, remM>(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_);
+    B_temp += U3;
+  }
+  if (K - k >= U2) {
+    urolls::template transB_kernel<U2, toTemp, remM>(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_);
+    B_temp += U2;
+    k += U2;
+  }
+  if (K - k >= U1) {
+    urolls::template transB_kernel<U1, toTemp, remM>(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_);
+    B_temp += U1;
+    k += U1;
+  }
+  EIGEN_IF_CONSTEXPR(U1 > 8) {
+    // Note: without "if constexpr" this section of code will also be
+    // parsed by the compiler so there is an additional check in {load/store}BBlock
+    // to make sure the counter is not non-negative.
+    if (K - k >= 8) {
+      urolls::template transB_kernel<8, toTemp, remM>(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_);
+      B_temp += 8;
+      k += 8;
+    }
+  }
+  EIGEN_IF_CONSTEXPR(U1 > 4) {
+    // Note: without "if constexpr" this section of code will also be
+    // parsed by the compiler so there is an additional check in {load/store}BBlock
+    // to make sure the counter is not non-negative.
+    if (K - k >= 4) {
+      urolls::template transB_kernel<4, toTemp, remM>(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_);
+      B_temp += 4;
+      k += 4;
+    }
+  }
+  if (K - k >= 2) {
+    urolls::template transB_kernel<2, toTemp, remM>(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_);
+    B_temp += 2;
+    k += 2;
+  }
+  if (K - k >= 1) {
+    urolls::template transB_kernel<1, toTemp, remM>(B_arr + k * LDB, LDB, B_temp, LDB_, ymm, remM_);
+    B_temp += 1;
+    k += 1;
+  }
+}
+
+/**
+ * Main triangular solve driver
+ *
+ * Triangular solve with A on the left.
+ * Scalar:    Scalar precision, only float/double is supported.
+ * isARowMajor:  is A row-major?
+ * isBRowMajor:  is B row-major?
+ * isFWDSolve:   is this forward solve or backward (true => forward)?
+ * isUnitDiag: is diagonal of A unit or nonunit (true => A has unit diagonal)?
+ *
+ * M: dimension of A
+ * numRHS: number of right hand sides (coincides with K dimension for gemm updates)
+ *
+ * Here are the mapping between the different TRSM cases (col-major) and triSolve:
+ *
+ * LLN (left , lower, A non-transposed) ::  isARowMajor=false, isBRowMajor=false, isFWDSolve=true
+ * LUT (left , upper, A transposed)     ::  isARowMajor=true,  isBRowMajor=false, isFWDSolve=true
+ * LUN (left , upper, A non-transposed) ::  isARowMajor=false, isBRowMajor=false, isFWDSolve=false
+ * LLT (left , lower, A transposed)     ::  isARowMajor=true,  isBRowMajor=false, isFWDSolve=false
+ * RUN (right, upper, A non-transposed) ::  isARowMajor=true,  isBRowMajor=true,  isFWDSolve=true
+ * RLT (right, lower, A transposed)     ::  isARowMajor=false, isBRowMajor=true,  isFWDSolve=true
+ * RUT (right, upper, A transposed)     ::  isARowMajor=false, isBRowMajor=true,  isFWDSolve=false
+ * RLN (right, lower, A non-transposed) ::  isARowMajor=true,  isBRowMajor=true,  isFWDSolve=false
+ *
+ * Note: For RXX cases M,numRHS should be swapped.
+ *
+ */
+template <typename Scalar, bool isARowMajor = true, bool isBRowMajor = true, bool isFWDSolve = true,
+          bool isUnitDiag = false>
+void triSolve(Scalar *A_arr, Scalar *B_arr, int64_t M, int64_t numRHS, int64_t LDA, int64_t LDB) {
+  constexpr int64_t psize = packet_traits<Scalar>::size;
+  /**
+   * The values for kB, numM were determined experimentally.
+   * kB: Number of RHS we process at a time.
+   * numM: number of rows of B we will store in a temporary array (see below.) This should be a multiple of L.
+   *
+   * kB was determined by initially setting kB = numRHS and benchmarking triSolve (TRSM-RUN case)
+   * performance with M=numRHS.
+   * It was observed that performance started to drop around M=numRHS=240. This is likely machine dependent.
+   *
+   * numM was chosen "arbitrarily". It should be relatively small so B_temp is not too large, but it should be
+   * large enough to allow GEMM updates to have larger "K"s (see below.) No benchmarking has been done so far to
+   * determine optimal values for numM.
+   */
+  constexpr int64_t kB = (3 * psize) * 5;  // 5*U3
+  constexpr int64_t numM = 8 * EIGEN_AVX_MAX_NUM_ROW;
+
+  int64_t sizeBTemp = 0;
+  Scalar *B_temp = NULL;
+  EIGEN_IF_CONSTEXPR(!isBRowMajor) {
+    /**
+     * If B is col-major, we copy it to a fixed-size temporary array of size at most ~numM*kB and
+     * transpose it to row-major. Call the solve routine, and copy+transpose it back to the original array.
+     * The updated row-major copy of B is reused in the GEMM updates.
+     */
+    sizeBTemp = (((std::min(kB, numRHS) + psize - 1) / psize + 4) * psize) * numM;
+  }
+
+  EIGEN_IF_CONSTEXPR(!isBRowMajor) B_temp = (Scalar *)handmade_aligned_malloc(sizeof(Scalar) * sizeBTemp, 64);
+
+  for (int64_t k = 0; k < numRHS; k += kB) {
+    int64_t bK = numRHS - k > kB ? kB : numRHS - k;
+    int64_t M_ = (M / EIGEN_AVX_MAX_NUM_ROW) * EIGEN_AVX_MAX_NUM_ROW, gemmOff = 0;
+
+    // bK rounded up to next multiple of L=EIGEN_AVX_MAX_NUM_ROW. When B_temp is used, we solve for bkL RHS
+    // instead of bK RHS in triSolveKernelLxK.
+    int64_t bkL = ((bK + (EIGEN_AVX_MAX_NUM_ROW - 1)) / EIGEN_AVX_MAX_NUM_ROW) * EIGEN_AVX_MAX_NUM_ROW;
+    const int64_t numScalarPerCache = 64 / sizeof(Scalar);
+    // Leading dimension of B_temp, will be a multiple of the cache line size.
+    int64_t LDT = ((bkL + (numScalarPerCache - 1)) / numScalarPerCache) * numScalarPerCache;
+    int64_t offsetBTemp = 0;
+    for (int64_t i = 0; i < M_; i += EIGEN_AVX_MAX_NUM_ROW) {
+      EIGEN_IF_CONSTEXPR(!isBRowMajor) {
+        int64_t indA_i = isFWDSolve ? i : M - 1 - i;
+        int64_t indB_i = isFWDSolve ? i : M - (i + EIGEN_AVX_MAX_NUM_ROW);
+        int64_t offB_1 = isFWDSolve ? offsetBTemp : sizeBTemp - EIGEN_AVX_MAX_NUM_ROW * LDT - offsetBTemp;
+        int64_t offB_2 = isFWDSolve ? offsetBTemp : sizeBTemp - LDT - offsetBTemp;
+        // Copy values from B to B_temp.
+        copyBToRowMajor<Scalar, true, false>(B_arr + indB_i + k * LDB, LDB, bK, B_temp + offB_1, LDT);
+        // Triangular solve with a small block of A and long horizontal blocks of B (or B_temp if B col-major)
+        triSolveKernelLxK<Scalar, isARowMajor, isFWDSolve, isUnitDiag>(
+            &A_arr[idA<isARowMajor>(indA_i, indA_i, LDA)], B_temp + offB_2, EIGEN_AVX_MAX_NUM_ROW, bkL, LDA, LDT);
+        // Copy values from B_temp back to B. B_temp will be reused in gemm call below.
+        copyBToRowMajor<Scalar, false, false>(B_arr + indB_i + k * LDB, LDB, bK, B_temp + offB_1, LDT);
+
+        offsetBTemp += EIGEN_AVX_MAX_NUM_ROW * LDT;
+      }
+      else {
+        int64_t ind = isFWDSolve ? i : M - 1 - i;
+        triSolveKernelLxK<Scalar, isARowMajor, isFWDSolve, isUnitDiag>(
+            &A_arr[idA<isARowMajor>(ind, ind, LDA)], B_arr + k + ind * LDB, EIGEN_AVX_MAX_NUM_ROW, bK, LDA, LDB);
+      }
+      if (i + EIGEN_AVX_MAX_NUM_ROW < M_) {
+        /**
+         * For the GEMM updates, we want "K" (K=i+8 in this case) to be large as soon as possible
+         * to reuse the accumulators in GEMM as much as possible. So we only update 8xbK blocks of
+         * B as follows:
+         *
+         *        A             B
+         *     __
+         *    |__|__           |__|
+         *    |__|__|__        |__|
+         *    |__|__|__|__     |__|
+         *    |********|__|    |**|
+         */
+        EIGEN_IF_CONSTEXPR(isBRowMajor) {
+          int64_t indA_i = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : M - (i + 2 * EIGEN_AVX_MAX_NUM_ROW);
+          int64_t indA_j = isFWDSolve ? 0 : M - (i + EIGEN_AVX_MAX_NUM_ROW);
+          int64_t indB_i = isFWDSolve ? 0 : M - (i + EIGEN_AVX_MAX_NUM_ROW);
+          int64_t indB_i2 = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : M - (i + 2 * EIGEN_AVX_MAX_NUM_ROW);
+          gemmKernel<Scalar, isARowMajor, isBRowMajor, false, false>(
+              &A_arr[idA<isARowMajor>(indA_i, indA_j, LDA)], B_arr + k + indB_i * LDB, B_arr + k + indB_i2 * LDB,
+              EIGEN_AVX_MAX_NUM_ROW, bK, i + EIGEN_AVX_MAX_NUM_ROW, LDA, LDB, LDB);
+        }
+        else {
+          if (offsetBTemp + EIGEN_AVX_MAX_NUM_ROW * LDT > sizeBTemp) {
+            /**
+             * Similar idea as mentioned above, but here we are limited by the number of updated values of B
+             * that can be stored (row-major) in B_temp.
+             *
+             * If there is not enough space to store the next batch of 8xbK of B in B_temp, we call GEMM
+             * update and partially update the remaining old values of B which depends on the new values
+             * of B stored in B_temp. These values are then no longer needed and can be overwritten.
+             */
+            int64_t indA_i = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : 0;
+            int64_t indA_j = isFWDSolve ? gemmOff : M - (i + EIGEN_AVX_MAX_NUM_ROW);
+            int64_t indB_i = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : 0;
+            int64_t offB_1 = isFWDSolve ? 0 : sizeBTemp - offsetBTemp;
+            gemmKernel<Scalar, isARowMajor, isBRowMajor, false, false>(
+                &A_arr[idA<isARowMajor>(indA_i, indA_j, LDA)], B_temp + offB_1, B_arr + indB_i + (k)*LDB,
+                M - (i + EIGEN_AVX_MAX_NUM_ROW), bK, i + EIGEN_AVX_MAX_NUM_ROW - gemmOff, LDA, LDT, LDB);
+            offsetBTemp = 0;
+            gemmOff = i + EIGEN_AVX_MAX_NUM_ROW;
+          } else {
+            /**
+             * If there is enough space in B_temp, we only update the next 8xbK values of B.
+             */
+            int64_t indA_i = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : M - (i + 2 * EIGEN_AVX_MAX_NUM_ROW);
+            int64_t indA_j = isFWDSolve ? gemmOff : M - (i + EIGEN_AVX_MAX_NUM_ROW);
+            int64_t indB_i = isFWDSolve ? i + EIGEN_AVX_MAX_NUM_ROW : M - (i + 2 * EIGEN_AVX_MAX_NUM_ROW);
+            int64_t offB_1 = isFWDSolve ? 0 : sizeBTemp - offsetBTemp;
+            gemmKernel<Scalar, isARowMajor, isBRowMajor, false, false>(
+                &A_arr[idA<isARowMajor>(indA_i, indA_j, LDA)], B_temp + offB_1, B_arr + indB_i + (k)*LDB,
+                EIGEN_AVX_MAX_NUM_ROW, bK, i + EIGEN_AVX_MAX_NUM_ROW - gemmOff, LDA, LDT, LDB);
+          }
+        }
+      }
+    }
+    // Handle M remainder..
+    int64_t bM = M - M_;
+    if (bM > 0) {
+      if (M_ > 0) {
+        EIGEN_IF_CONSTEXPR(isBRowMajor) {
+          int64_t indA_i = isFWDSolve ? M_ : 0;
+          int64_t indA_j = isFWDSolve ? 0 : bM;
+          int64_t indB_i = isFWDSolve ? 0 : bM;
+          int64_t indB_i2 = isFWDSolve ? M_ : 0;
+          gemmKernel<Scalar, isARowMajor, isBRowMajor, false, false>(
+              &A_arr[idA<isARowMajor>(indA_i, indA_j, LDA)], B_arr + k + indB_i * LDB, B_arr + k + indB_i2 * LDB, bM,
+              bK, M_, LDA, LDB, LDB);
+        }
+        else {
+          int64_t indA_i = isFWDSolve ? M_ : 0;
+          int64_t indA_j = isFWDSolve ? gemmOff : bM;
+          int64_t indB_i = isFWDSolve ? M_ : 0;
+          int64_t offB_1 = isFWDSolve ? 0 : sizeBTemp - offsetBTemp;
+          gemmKernel<Scalar, isARowMajor, isBRowMajor, false, false>(&A_arr[idA<isARowMajor>(indA_i, indA_j, LDA)],
+                                                                     B_temp + offB_1, B_arr + indB_i + (k)*LDB, bM, bK,
+                                                                     M_ - gemmOff, LDA, LDT, LDB);
+        }
+      }
+      EIGEN_IF_CONSTEXPR(!isBRowMajor) {
+        int64_t indA_i = isFWDSolve ? M_ : M - 1 - M_;
+        int64_t indB_i = isFWDSolve ? M_ : 0;
+        int64_t offB_1 = isFWDSolve ? 0 : (bM - 1) * bkL;
+        copyBToRowMajor<Scalar, true, true>(B_arr + indB_i + k * LDB, LDB, bK, B_temp, bkL, bM);
+        triSolveKernelLxK<Scalar, isARowMajor, isFWDSolve, isUnitDiag>(&A_arr[idA<isARowMajor>(indA_i, indA_i, LDA)],
+                                                                       B_temp + offB_1, bM, bkL, LDA, bkL);
+        copyBToRowMajor<Scalar, false, true>(B_arr + indB_i + k * LDB, LDB, bK, B_temp, bkL, bM);
+      }
+      else {
+        int64_t ind = isFWDSolve ? M_ : M - 1 - M_;
+        triSolveKernelLxK<Scalar, isARowMajor, isFWDSolve, isUnitDiag>(&A_arr[idA<isARowMajor>(ind, ind, LDA)],
+                                                                       B_arr + k + ind * LDB, bM, bK, LDA, LDB);
+      }
+    }
+  }
+
+  EIGEN_IF_CONSTEXPR(!isBRowMajor) handmade_aligned_free(B_temp);
+}
+
+// Template specializations of trsmKernelL/R for float/double and inner strides of 1.
+#if (EIGEN_USE_AVX512_TRSM_KERNELS)
+#if (EIGEN_USE_AVX512_TRSM_R_KERNELS)
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride,
+          bool Specialized>
+struct trsmKernelR;
+
+template <typename Index, int Mode, int TriStorageOrder>
+struct trsmKernelR<float, Index, Mode, false, TriStorageOrder, 1, true> {
+  static void kernel(Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
+                     Index otherStride);
+};
+
+template <typename Index, int Mode, int TriStorageOrder>
+struct trsmKernelR<double, Index, Mode, false, TriStorageOrder, 1, true> {
+  static void kernel(Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
+                     Index otherStride);
+};
+
+template <typename Index, int Mode, int TriStorageOrder>
+EIGEN_DONT_INLINE void trsmKernelR<float, Index, Mode, false, TriStorageOrder, 1, true>::kernel(
+    Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
+    Index otherStride) {
+  EIGEN_UNUSED_VARIABLE(otherIncr);
+#ifdef EIGEN_RUNTIME_NO_MALLOC
+  if (!is_malloc_allowed()) {
+    trsmKernelR<float, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
+        size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
+    return;
+  }
+#endif
+  triSolve<float, TriStorageOrder != RowMajor, true, (Mode & Lower) != Lower, (Mode & UnitDiag) != 0>(
+      const_cast<float *>(_tri), _other, size, otherSize, triStride, otherStride);
+}
+
+template <typename Index, int Mode, int TriStorageOrder>
+EIGEN_DONT_INLINE void trsmKernelR<double, Index, Mode, false, TriStorageOrder, 1, true>::kernel(
+    Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
+    Index otherStride) {
+  EIGEN_UNUSED_VARIABLE(otherIncr);
+#ifdef EIGEN_RUNTIME_NO_MALLOC
+  if (!is_malloc_allowed()) {
+    trsmKernelR<double, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
+        size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
+    return;
+  }
+#endif
+  triSolve<double, TriStorageOrder != RowMajor, true, (Mode & Lower) != Lower, (Mode & UnitDiag) != 0>(
+      const_cast<double *>(_tri), _other, size, otherSize, triStride, otherStride);
+}
+#endif  // (EIGEN_USE_AVX512_TRSM_R_KERNELS)
+
+// These trsm kernels require temporary memory allocation
+#if (EIGEN_USE_AVX512_TRSM_L_KERNELS)
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride,
+          bool Specialized = true>
+struct trsmKernelL;
+
+template <typename Index, int Mode, int TriStorageOrder>
+struct trsmKernelL<float, Index, Mode, false, TriStorageOrder, 1, true> {
+  static void kernel(Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
+                     Index otherStride);
+};
+
+template <typename Index, int Mode, int TriStorageOrder>
+struct trsmKernelL<double, Index, Mode, false, TriStorageOrder, 1, true> {
+  static void kernel(Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
+                     Index otherStride);
+};
+
+template <typename Index, int Mode, int TriStorageOrder>
+EIGEN_DONT_INLINE void trsmKernelL<float, Index, Mode, false, TriStorageOrder, 1, true>::kernel(
+    Index size, Index otherSize, const float *_tri, Index triStride, float *_other, Index otherIncr,
+    Index otherStride) {
+  EIGEN_UNUSED_VARIABLE(otherIncr);
+#ifdef EIGEN_RUNTIME_NO_MALLOC
+  if (!is_malloc_allowed()) {
+    trsmKernelL<float, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
+        size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
+    return;
+  }
+#endif
+  triSolve<float, TriStorageOrder == RowMajor, false, (Mode & Lower) == Lower, (Mode & UnitDiag) != 0>(
+      const_cast<float *>(_tri), _other, size, otherSize, triStride, otherStride);
+}
+
+template <typename Index, int Mode, int TriStorageOrder>
+EIGEN_DONT_INLINE void trsmKernelL<double, Index, Mode, false, TriStorageOrder, 1, true>::kernel(
+    Index size, Index otherSize, const double *_tri, Index triStride, double *_other, Index otherIncr,
+    Index otherStride) {
+  EIGEN_UNUSED_VARIABLE(otherIncr);
+#ifdef EIGEN_RUNTIME_NO_MALLOC
+  if (!is_malloc_allowed()) {
+    trsmKernelL<double, Index, Mode, false, TriStorageOrder, 1, /*Specialized=*/false>::kernel(
+        size, otherSize, _tri, triStride, _other, otherIncr, otherStride);
+    return;
+  }
+#endif
+  triSolve<double, TriStorageOrder == RowMajor, false, (Mode & Lower) == Lower, (Mode & UnitDiag) != 0>(
+      const_cast<double *>(_tri), _other, size, otherSize, triStride, otherStride);
+}
+#endif  // EIGEN_USE_AVX512_TRSM_L_KERNELS
+#endif  // EIGEN_USE_AVX512_TRSM_KERNELS
+}  // namespace internal
+}  // namespace Eigen
+#endif  // EIGEN_CORE_ARCH_AVX512_TRSM_KERNEL_H
diff --git a/inst/include/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc b/inst/include/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
new file mode 100644
index 00000000..3a5f68eb
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AVX512/TrsmUnrolls.inc
@@ -0,0 +1,1219 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2022 Intel Corporation
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CORE_ARCH_AVX512_TRSM_UNROLLS_H
+#define EIGEN_CORE_ARCH_AVX512_TRSM_UNROLLS_H
+
+template <bool isARowMajor = true>
+EIGEN_ALWAYS_INLINE int64_t idA(int64_t i, int64_t j, int64_t LDA) {
+  EIGEN_IF_CONSTEXPR(isARowMajor) return i * LDA + j;
+  else return i + j * LDA;
+}
+
+/**
+ * This namespace contains various classes used to generate compile-time unrolls which are
+ * used throughout the trsm/gemm kernels. The unrolls are characterized as for-loops (1-D), nested
+ * for-loops (2-D), or triple nested for-loops (3-D). Unrolls are generated using template recursion
+ *
+ * Example, the 2-D for-loop is unrolled recursively by first flattening to a 1-D loop.
+ *
+ * for(startI = 0; startI < endI; startI++)             for(startC = 0; startC < endI*endJ; startC++)
+ *   for(startJ = 0; startJ < endJ; startJ++)  ---->      startI = (startC)/(endJ)
+ *     func(startI,startJ)                                startJ = (startC)%(endJ)
+ *                                                        func(...)
+ *
+ * The 1-D loop can be unrolled recursively by using enable_if and defining an auxiliary function
+ * with a template parameter used as a counter.
+ *
+ * template <endI, endJ, counter>
+ * std::enable_if_t<(counter <= 0)>  <---- tail case.
+ * aux_func {}
+ *
+ * template <endI, endJ, counter>
+ * std::enable_if_t<(counter > 0)>   <---- actual for-loop
+ * aux_func {
+ *   startC = endI*endJ - counter
+ *   startI = (startC)/(endJ)
+ *   startJ = (startC)%(endJ)
+ *   func(startI, startJ)
+ *   aux_func<endI, endJ, counter-1>()
+ * }
+ *
+ * Note: Additional wrapper functions are provided for aux_func which hides the counter template
+ * parameter since counter usually depends on endI, endJ, etc...
+ *
+ * Conventions:
+ * 1) endX: specifies the terminal value for the for-loop, (ex: for(startX = 0; startX < endX; startX++))
+ *
+ * 2) rem, remM, remK template parameters are used for deciding whether to use masked operations for
+ *    handling remaining tails (when sizes are not multiples of PacketSize or EIGEN_AVX_MAX_NUM_ROW)
+ */
+namespace unrolls {
+
+template <int64_t N>
+EIGEN_ALWAYS_INLINE auto remMask(int64_t m) {
+  EIGEN_IF_CONSTEXPR(N == 16) { return 0xFFFF >> (16 - m); }
+  else EIGEN_IF_CONSTEXPR(N == 8) {
+    return 0xFF >> (8 - m);
+  }
+  else EIGEN_IF_CONSTEXPR(N == 4) {
+    return 0x0F >> (4 - m);
+  }
+  return 0;
+}
+
+template <typename Packet>
+EIGEN_ALWAYS_INLINE void trans8x8blocks(PacketBlock<Packet, 8> &kernel);
+
+template <>
+EIGEN_ALWAYS_INLINE void trans8x8blocks(PacketBlock<Packet16f, 8> &kernel) {
+  __m512 T0 = _mm512_unpacklo_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T1 = _mm512_unpackhi_ps(kernel.packet[0], kernel.packet[1]);
+  __m512 T2 = _mm512_unpacklo_ps(kernel.packet[2], kernel.packet[3]);
+  __m512 T3 = _mm512_unpackhi_ps(kernel.packet[2], kernel.packet[3]);
+  __m512 T4 = _mm512_unpacklo_ps(kernel.packet[4], kernel.packet[5]);
+  __m512 T5 = _mm512_unpackhi_ps(kernel.packet[4], kernel.packet[5]);
+  __m512 T6 = _mm512_unpacklo_ps(kernel.packet[6], kernel.packet[7]);
+  __m512 T7 = _mm512_unpackhi_ps(kernel.packet[6], kernel.packet[7]);
+
+  kernel.packet[0] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T0), _mm512_castps_pd(T2)));
+  kernel.packet[1] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T0), _mm512_castps_pd(T2)));
+  kernel.packet[2] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T1), _mm512_castps_pd(T3)));
+  kernel.packet[3] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T1), _mm512_castps_pd(T3)));
+  kernel.packet[4] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T4), _mm512_castps_pd(T6)));
+  kernel.packet[5] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T4), _mm512_castps_pd(T6)));
+  kernel.packet[6] = _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(T5), _mm512_castps_pd(T7)));
+  kernel.packet[7] = _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(T5), _mm512_castps_pd(T7)));
+
+  T0 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[4]), 0x4E));
+  T0 = _mm512_mask_blend_ps(0xF0F0, kernel.packet[0], T0);
+  T4 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[0]), 0x4E));
+  T4 = _mm512_mask_blend_ps(0xF0F0, T4, kernel.packet[4]);
+  T1 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[5]), 0x4E));
+  T1 = _mm512_mask_blend_ps(0xF0F0, kernel.packet[1], T1);
+  T5 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[1]), 0x4E));
+  T5 = _mm512_mask_blend_ps(0xF0F0, T5, kernel.packet[5]);
+  T2 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[6]), 0x4E));
+  T2 = _mm512_mask_blend_ps(0xF0F0, kernel.packet[2], T2);
+  T6 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[2]), 0x4E));
+  T6 = _mm512_mask_blend_ps(0xF0F0, T6, kernel.packet[6]);
+  T3 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[7]), 0x4E));
+  T3 = _mm512_mask_blend_ps(0xF0F0, kernel.packet[3], T3);
+  T7 = _mm512_castpd_ps(_mm512_permutex_pd(_mm512_castps_pd(kernel.packet[3]), 0x4E));
+  T7 = _mm512_mask_blend_ps(0xF0F0, T7, kernel.packet[7]);
+
+  kernel.packet[0] = T0;
+  kernel.packet[1] = T1;
+  kernel.packet[2] = T2;
+  kernel.packet[3] = T3;
+  kernel.packet[4] = T4;
+  kernel.packet[5] = T5;
+  kernel.packet[6] = T6;
+  kernel.packet[7] = T7;
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void trans8x8blocks(PacketBlock<Packet8d, 8> &kernel) {
+  ptranspose(kernel);
+}
+
+/***
+ * Unrolls for transposed C stores
+ */
+template <typename Scalar>
+class trans {
+ public:
+  using vec = typename std::conditional<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>::type;
+  using vecHalf = typename std::conditional<std::is_same<Scalar, float>::value, vecHalfFloat, vecFullDouble>::type;
+  static constexpr int64_t PacketSize = packet_traits<Scalar>::size;
+
+  /***********************************
+   * Auxiliary Functions for:
+   *  - storeC
+   ***********************************
+   */
+
+  /**
+   * aux_storeC
+   *
+   * 1-D unroll
+   *      for(startN = 0; startN < endN; startN++)
+   *
+   * (endN <= PacketSize) is required to handle the fp32 case, see comments in transStoreC
+   *
+   **/
+  template <int64_t endN, int64_t counter, int64_t unrollN, int64_t packetIndexOffset, bool remM>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0 && endN <= PacketSize)> aux_storeC(
+      Scalar *C_arr, int64_t LDC, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t remM_ = 0) {
+    constexpr int64_t counterReverse = endN - counter;
+    constexpr int64_t startN = counterReverse;
+
+    EIGEN_IF_CONSTEXPR(startN < EIGEN_AVX_MAX_NUM_ROW) {
+      EIGEN_IF_CONSTEXPR(remM) {
+        pstoreu<Scalar>(
+            C_arr + LDC * startN,
+            padd(ploadu<vecHalf>((const Scalar *)C_arr + LDC * startN, remMask<EIGEN_AVX_MAX_NUM_ROW>(remM_)),
+                 preinterpret<vecHalf>(zmm.packet[packetIndexOffset + (unrollN / PacketSize) * startN]),
+                 remMask<EIGEN_AVX_MAX_NUM_ROW>(remM_)),
+            remMask<EIGEN_AVX_MAX_NUM_ROW>(remM_));
+      }
+      else {
+        pstoreu<Scalar>(C_arr + LDC * startN,
+                        padd(ploadu<vecHalf>((const Scalar *)C_arr + LDC * startN),
+                             preinterpret<vecHalf>(zmm.packet[packetIndexOffset + (unrollN / PacketSize) * startN])));
+      }
+    }
+    else {  // This block is only needed for fp32 case
+      // Reinterpret as __m512 for _mm512_shuffle_f32x4
+      vecFullFloat zmm2vecFullFloat = preinterpret<vecFullFloat>(
+          zmm.packet[packetIndexOffset + (unrollN / PacketSize) * (startN - EIGEN_AVX_MAX_NUM_ROW)]);
+      // Swap lower and upper half of avx register.
+      zmm.packet[packetIndexOffset + (unrollN / PacketSize) * (startN - EIGEN_AVX_MAX_NUM_ROW)] =
+          preinterpret<vec>(_mm512_shuffle_f32x4(zmm2vecFullFloat, zmm2vecFullFloat, 0b01001110));
+
+      EIGEN_IF_CONSTEXPR(remM) {
+        pstoreu<Scalar>(
+            C_arr + LDC * startN,
+            padd(ploadu<vecHalf>((const Scalar *)C_arr + LDC * startN, remMask<EIGEN_AVX_MAX_NUM_ROW>(remM_)),
+                 preinterpret<vecHalf>(
+                     zmm.packet[packetIndexOffset + (unrollN / PacketSize) * (startN - EIGEN_AVX_MAX_NUM_ROW)])),
+            remMask<EIGEN_AVX_MAX_NUM_ROW>(remM_));
+      }
+      else {
+        pstoreu<Scalar>(
+            C_arr + LDC * startN,
+            padd(ploadu<vecHalf>((const Scalar *)C_arr + LDC * startN),
+                 preinterpret<vecHalf>(
+                     zmm.packet[packetIndexOffset + (unrollN / PacketSize) * (startN - EIGEN_AVX_MAX_NUM_ROW)])));
+      }
+    }
+    aux_storeC<endN, counter - 1, unrollN, packetIndexOffset, remM>(C_arr, LDC, zmm, remM_);
+  }
+
+  template <int64_t endN, int64_t counter, int64_t unrollN, int64_t packetIndexOffset, bool remM>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<!(counter > 0 && endN <= PacketSize)> aux_storeC(
+      Scalar *C_arr, int64_t LDC, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t remM_ = 0) {
+    EIGEN_UNUSED_VARIABLE(C_arr);
+    EIGEN_UNUSED_VARIABLE(LDC);
+    EIGEN_UNUSED_VARIABLE(zmm);
+    EIGEN_UNUSED_VARIABLE(remM_);
+  }
+
+  template <int64_t endN, int64_t unrollN, int64_t packetIndexOffset, bool remM>
+  static EIGEN_ALWAYS_INLINE void storeC(Scalar *C_arr, int64_t LDC,
+                                         PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
+                                         int64_t remM_ = 0) {
+    aux_storeC<endN, endN, unrollN, packetIndexOffset, remM>(C_arr, LDC, zmm, remM_);
+  }
+
+  /**
+   * Transposes LxunrollN row major block of matrices stored `EIGEN_AVX_MAX_NUM_ACC` zmm registers to
+   * "unrollN"xL ymm registers to be stored col-major into C.
+   *
+   *  For 8x48, the 8x48 block (row-major) is stored in zmm as follows:
+   *
+   *  ```
+   *  row0: zmm0 zmm1 zmm2
+   *  row1: zmm3 zmm4 zmm5
+   *    .
+   *    .
+   *  row7: zmm21 zmm22 zmm23
+   *
+   *  For 8x32, the 8x32 block (row-major) is stored in zmm as follows:
+   *
+   *  row0: zmm0 zmm1
+   *  row1: zmm2 zmm3
+   *    .
+   *    .
+   *  row7: zmm14 zmm15
+   * ```
+   *
+   * In general we will have {1,2,3} groups of avx registers each of size
+   * `EIGEN_AVX_MAX_NUM_ROW`. packetIndexOffset is used to select which "block" of
+   * avx registers are being transposed.
+   */
+  template <int64_t unrollN, int64_t packetIndexOffset>
+  static EIGEN_ALWAYS_INLINE void transpose(PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm) {
+    // Note: this assumes EIGEN_AVX_MAX_NUM_ROW = 8. Unrolls should be adjusted
+    // accordingly if EIGEN_AVX_MAX_NUM_ROW is smaller.
+    constexpr int64_t zmmStride = unrollN / PacketSize;
+    PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> r;
+    r.packet[0] = zmm.packet[packetIndexOffset + zmmStride * 0];
+    r.packet[1] = zmm.packet[packetIndexOffset + zmmStride * 1];
+    r.packet[2] = zmm.packet[packetIndexOffset + zmmStride * 2];
+    r.packet[3] = zmm.packet[packetIndexOffset + zmmStride * 3];
+    r.packet[4] = zmm.packet[packetIndexOffset + zmmStride * 4];
+    r.packet[5] = zmm.packet[packetIndexOffset + zmmStride * 5];
+    r.packet[6] = zmm.packet[packetIndexOffset + zmmStride * 6];
+    r.packet[7] = zmm.packet[packetIndexOffset + zmmStride * 7];
+    trans8x8blocks(r);
+    zmm.packet[packetIndexOffset + zmmStride * 0] = r.packet[0];
+    zmm.packet[packetIndexOffset + zmmStride * 1] = r.packet[1];
+    zmm.packet[packetIndexOffset + zmmStride * 2] = r.packet[2];
+    zmm.packet[packetIndexOffset + zmmStride * 3] = r.packet[3];
+    zmm.packet[packetIndexOffset + zmmStride * 4] = r.packet[4];
+    zmm.packet[packetIndexOffset + zmmStride * 5] = r.packet[5];
+    zmm.packet[packetIndexOffset + zmmStride * 6] = r.packet[6];
+    zmm.packet[packetIndexOffset + zmmStride * 7] = r.packet[7];
+  }
+};
+
+/**
+ * Unrolls for copyBToRowMajor
+ *
+ * Idea:
+ *  1) Load a block of right-hand sides to registers (using loadB).
+ *  2) Convert the block from column-major to row-major (transposeLxL)
+ *  3) Store the blocks from register either to a temp array (toTemp == true), or back to B (toTemp == false).
+ *
+ *  We use at most EIGEN_AVX_MAX_NUM_ACC avx registers to store the blocks of B. The remaining registers are
+ *  used as temps for transposing.
+ *
+ *  Blocks will be of size Lx{U1,U2,U3}. packetIndexOffset is used to index between these subblocks
+ *  For fp32, PacketSize = 2*EIGEN_AVX_MAX_NUM_ROW, so we reinterpret packets as packets half the size (zmm -> ymm).
+ */
+template <typename Scalar>
+class transB {
+ public:
+  using vec = typename std::conditional<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>::type;
+  using vecHalf = typename std::conditional<std::is_same<Scalar, float>::value, vecHalfFloat, vecFullDouble>::type;
+  static constexpr int64_t PacketSize = packet_traits<Scalar>::size;
+
+  /***********************************
+   * Auxiliary Functions for:
+   *  - loadB
+   *  - storeB
+   *  - loadBBlock
+   *  - storeBBlock
+   ***********************************
+   */
+
+  /**
+   * aux_loadB
+   *
+   * 1-D unroll
+   *      for(startN = 0; startN < endN; startN++)
+   **/
+  template <int64_t endN, int64_t counter, int64_t packetIndexOffset, bool remM, int64_t remN_>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_loadB(
+      Scalar *B_arr, int64_t LDB, PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
+      int64_t remM_ = 0) {
+    constexpr int64_t counterReverse = endN - counter;
+    constexpr int64_t startN = counterReverse;
+
+    EIGEN_IF_CONSTEXPR(remM) {
+      ymm.packet[packetIndexOffset + startN] =
+          ploadu<vecHalf>((const Scalar *)&B_arr[startN * LDB], remMask<EIGEN_AVX_MAX_NUM_ROW>(remM_));
+    }
+    else {
+      EIGEN_IF_CONSTEXPR(remN_ == 0) {
+        ymm.packet[packetIndexOffset + startN] = ploadu<vecHalf>((const Scalar *)&B_arr[startN * LDB]);
+      }
+      else ymm.packet[packetIndexOffset + startN] =
+          ploadu<vecHalf>((const Scalar *)&B_arr[startN * LDB], remMask<EIGEN_AVX_MAX_NUM_ROW>(remN_));
+    }
+
+    aux_loadB<endN, counter - 1, packetIndexOffset, remM, remN_>(B_arr, LDB, ymm, remM_);
+  }
+
+  template <int64_t endN, int64_t counter, int64_t packetIndexOffset, bool remM, int64_t remN_>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_loadB(
+      Scalar *B_arr, int64_t LDB, PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
+      int64_t remM_ = 0) {
+    EIGEN_UNUSED_VARIABLE(B_arr);
+    EIGEN_UNUSED_VARIABLE(LDB);
+    EIGEN_UNUSED_VARIABLE(ymm);
+    EIGEN_UNUSED_VARIABLE(remM_);
+  }
+
+  /**
+   * aux_storeB
+   *
+   * 1-D unroll
+   *      for(startN = 0; startN < endN; startN++)
+   **/
+  template <int64_t endN, int64_t counter, int64_t packetIndexOffset, bool remK, bool remM>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_storeB(
+      Scalar *B_arr, int64_t LDB, PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t rem_ = 0) {
+    constexpr int64_t counterReverse = endN - counter;
+    constexpr int64_t startN = counterReverse;
+
+    EIGEN_IF_CONSTEXPR(remK || remM) {
+      pstoreu<Scalar>(&B_arr[startN * LDB], ymm.packet[packetIndexOffset + startN],
+                      remMask<EIGEN_AVX_MAX_NUM_ROW>(rem_));
+    }
+    else {
+      pstoreu<Scalar>(&B_arr[startN * LDB], ymm.packet[packetIndexOffset + startN]);
+    }
+
+    aux_storeB<endN, counter - 1, packetIndexOffset, remK, remM>(B_arr, LDB, ymm, rem_);
+  }
+
+  template <int64_t endN, int64_t counter, int64_t packetIndexOffset, bool remK, bool remM>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_storeB(
+      Scalar *B_arr, int64_t LDB, PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(B_arr);
+    EIGEN_UNUSED_VARIABLE(LDB);
+    EIGEN_UNUSED_VARIABLE(ymm);
+    EIGEN_UNUSED_VARIABLE(rem_);
+  }
+
+  /**
+   * aux_loadBBlock
+   *
+   * 1-D unroll
+   *      for(startN = 0; startN < endN; startN += EIGEN_AVX_MAX_NUM_ROW)
+   **/
+  template <int64_t endN, int64_t counter, bool toTemp, bool remM, int64_t remN_>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_loadBBlock(
+      Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
+      PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t remM_ = 0) {
+    constexpr int64_t counterReverse = endN - counter;
+    constexpr int64_t startN = counterReverse;
+    transB::template loadB<EIGEN_AVX_MAX_NUM_ROW, startN, false, (toTemp ? 0 : remN_)>(&B_temp[startN], LDB_, ymm);
+    aux_loadBBlock<endN, counter - EIGEN_AVX_MAX_NUM_ROW, toTemp, remM, remN_>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+  }
+
+  template <int64_t endN, int64_t counter, bool toTemp, bool remM, int64_t remN_>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_loadBBlock(
+      Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
+      PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t remM_ = 0) {
+    EIGEN_UNUSED_VARIABLE(B_arr);
+    EIGEN_UNUSED_VARIABLE(LDB);
+    EIGEN_UNUSED_VARIABLE(B_temp);
+    EIGEN_UNUSED_VARIABLE(LDB_);
+    EIGEN_UNUSED_VARIABLE(ymm);
+    EIGEN_UNUSED_VARIABLE(remM_);
+  }
+
+  /**
+   * aux_storeBBlock
+   *
+   * 1-D unroll
+   *      for(startN = 0; startN < endN; startN += EIGEN_AVX_MAX_NUM_ROW)
+   **/
+  template <int64_t endN, int64_t counter, bool toTemp, bool remM, int64_t remK_>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_storeBBlock(
+      Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
+      PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t remM_ = 0) {
+    constexpr int64_t counterReverse = endN - counter;
+    constexpr int64_t startN = counterReverse;
+
+    EIGEN_IF_CONSTEXPR(toTemp) {
+      transB::template storeB<EIGEN_AVX_MAX_NUM_ROW, startN, remK_ != 0, false>(&B_temp[startN], LDB_, ymm, remK_);
+    }
+    else {
+      transB::template storeB<std::min(EIGEN_AVX_MAX_NUM_ROW, endN), startN, false, remM>(&B_arr[0 + startN * LDB], LDB,
+                                                                                          ymm, remM_);
+    }
+    aux_storeBBlock<endN, counter - EIGEN_AVX_MAX_NUM_ROW, toTemp, remM, remK_>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+  }
+
+  template <int64_t endN, int64_t counter, bool toTemp, bool remM, int64_t remK_>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_storeBBlock(
+      Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
+      PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm, int64_t remM_ = 0) {
+    EIGEN_UNUSED_VARIABLE(B_arr);
+    EIGEN_UNUSED_VARIABLE(LDB);
+    EIGEN_UNUSED_VARIABLE(B_temp);
+    EIGEN_UNUSED_VARIABLE(LDB_);
+    EIGEN_UNUSED_VARIABLE(ymm);
+    EIGEN_UNUSED_VARIABLE(remM_);
+  }
+
+  /********************************************************
+   * Wrappers for aux_XXXX to hide counter parameter
+   ********************************************************/
+
+  template <int64_t endN, int64_t packetIndexOffset, bool remM, int64_t remN_>
+  static EIGEN_ALWAYS_INLINE void loadB(Scalar *B_arr, int64_t LDB,
+                                        PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
+                                        int64_t remM_ = 0) {
+    aux_loadB<endN, endN, packetIndexOffset, remM, remN_>(B_arr, LDB, ymm, remM_);
+  }
+
+  template <int64_t endN, int64_t packetIndexOffset, bool remK, bool remM>
+  static EIGEN_ALWAYS_INLINE void storeB(Scalar *B_arr, int64_t LDB,
+                                         PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
+                                         int64_t rem_ = 0) {
+    aux_storeB<endN, endN, packetIndexOffset, remK, remM>(B_arr, LDB, ymm, rem_);
+  }
+
+  template <int64_t unrollN, bool toTemp, bool remM, int64_t remN_ = 0>
+  static EIGEN_ALWAYS_INLINE void loadBBlock(Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
+                                             PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
+                                             int64_t remM_ = 0) {
+    EIGEN_IF_CONSTEXPR(toTemp) { transB::template loadB<unrollN, 0, remM, 0>(&B_arr[0], LDB, ymm, remM_); }
+    else {
+      aux_loadBBlock<unrollN, unrollN, toTemp, remM, remN_>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+    }
+  }
+
+  template <int64_t unrollN, bool toTemp, bool remM, int64_t remK_>
+  static EIGEN_ALWAYS_INLINE void storeBBlock(Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
+                                              PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
+                                              int64_t remM_ = 0) {
+    aux_storeBBlock<unrollN, unrollN, toTemp, remM, remK_>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+  }
+
+  template <int64_t packetIndexOffset>
+  static EIGEN_ALWAYS_INLINE void transposeLxL(PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm) {
+    // Note: this assumes EIGEN_AVX_MAX_NUM_ROW = 8. Unrolls should be adjusted
+    // accordingly if EIGEN_AVX_MAX_NUM_ROW is smaller.
+    PacketBlock<vecHalf, EIGEN_AVX_MAX_NUM_ROW> r;
+    r.packet[0] = ymm.packet[packetIndexOffset + 0];
+    r.packet[1] = ymm.packet[packetIndexOffset + 1];
+    r.packet[2] = ymm.packet[packetIndexOffset + 2];
+    r.packet[3] = ymm.packet[packetIndexOffset + 3];
+    r.packet[4] = ymm.packet[packetIndexOffset + 4];
+    r.packet[5] = ymm.packet[packetIndexOffset + 5];
+    r.packet[6] = ymm.packet[packetIndexOffset + 6];
+    r.packet[7] = ymm.packet[packetIndexOffset + 7];
+    ptranspose(r);
+    ymm.packet[packetIndexOffset + 0] = r.packet[0];
+    ymm.packet[packetIndexOffset + 1] = r.packet[1];
+    ymm.packet[packetIndexOffset + 2] = r.packet[2];
+    ymm.packet[packetIndexOffset + 3] = r.packet[3];
+    ymm.packet[packetIndexOffset + 4] = r.packet[4];
+    ymm.packet[packetIndexOffset + 5] = r.packet[5];
+    ymm.packet[packetIndexOffset + 6] = r.packet[6];
+    ymm.packet[packetIndexOffset + 7] = r.packet[7];
+  }
+
+  template <int64_t unrollN, bool toTemp, bool remM>
+  static EIGEN_ALWAYS_INLINE void transB_kernel(Scalar *B_arr, int64_t LDB, Scalar *B_temp, int64_t LDB_,
+                                                PacketBlock<vecHalf, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &ymm,
+                                                int64_t remM_ = 0) {
+    constexpr int64_t U3 = PacketSize * 3;
+    constexpr int64_t U2 = PacketSize * 2;
+    constexpr int64_t U1 = PacketSize * 1;
+    /**
+     *  Unrolls needed for each case:
+     *   - AVX512 fp32 48 32 16 8 4 2 1
+     *   - AVX512 fp64 24 16 8  4 2 1
+     *
+     *  For fp32 L and U1 are 1:2 so for U3/U2 cases the loads/stores need to be split up.
+     */
+    EIGEN_IF_CONSTEXPR(unrollN == U3) {
+      // load LxU3 B col major, transpose LxU3 row major
+      constexpr int64_t maxUBlock = std::min(3 * EIGEN_AVX_MAX_NUM_ROW, U3);
+      transB::template loadBBlock<maxUBlock, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template transposeLxL<0 * EIGEN_AVX_MAX_NUM_ROW>(ymm);
+      transB::template transposeLxL<1 * EIGEN_AVX_MAX_NUM_ROW>(ymm);
+      transB::template transposeLxL<2 * EIGEN_AVX_MAX_NUM_ROW>(ymm);
+      transB::template storeBBlock<maxUBlock, toTemp, remM, 0>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+
+      EIGEN_IF_CONSTEXPR(maxUBlock < U3) {
+        transB::template loadBBlock<maxUBlock, toTemp, remM>(&B_arr[maxUBlock * LDB], LDB, &B_temp[maxUBlock], LDB_,
+                                                             ymm, remM_);
+        transB::template transposeLxL<0 * EIGEN_AVX_MAX_NUM_ROW>(ymm);
+        transB::template transposeLxL<1 * EIGEN_AVX_MAX_NUM_ROW>(ymm);
+        transB::template transposeLxL<2 * EIGEN_AVX_MAX_NUM_ROW>(ymm);
+        transB::template storeBBlock<maxUBlock, toTemp, remM, 0>(&B_arr[maxUBlock * LDB], LDB, &B_temp[maxUBlock], LDB_,
+                                                                 ymm, remM_);
+      }
+    }
+    else EIGEN_IF_CONSTEXPR(unrollN == U2) {
+      // load LxU2 B col major, transpose LxU2 row major
+      constexpr int64_t maxUBlock = std::min(3 * EIGEN_AVX_MAX_NUM_ROW, U2);
+      transB::template loadBBlock<maxUBlock, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template transposeLxL<0 * EIGEN_AVX_MAX_NUM_ROW>(ymm);
+      transB::template transposeLxL<1 * EIGEN_AVX_MAX_NUM_ROW>(ymm);
+      EIGEN_IF_CONSTEXPR(maxUBlock < U2) transB::template transposeLxL<2 * EIGEN_AVX_MAX_NUM_ROW>(ymm);
+      transB::template storeBBlock<maxUBlock, toTemp, remM, 0>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+
+      EIGEN_IF_CONSTEXPR(maxUBlock < U2) {
+        transB::template loadBBlock<EIGEN_AVX_MAX_NUM_ROW, toTemp, remM>(&B_arr[maxUBlock * LDB], LDB,
+                                                                         &B_temp[maxUBlock], LDB_, ymm, remM_);
+        transB::template transposeLxL<0>(ymm);
+        transB::template storeBBlock<EIGEN_AVX_MAX_NUM_ROW, toTemp, remM, 0>(&B_arr[maxUBlock * LDB], LDB,
+                                                                             &B_temp[maxUBlock], LDB_, ymm, remM_);
+      }
+    }
+    else EIGEN_IF_CONSTEXPR(unrollN == U1) {
+      // load LxU1 B col major, transpose LxU1 row major
+      transB::template loadBBlock<U1, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template transposeLxL<0>(ymm);
+      EIGEN_IF_CONSTEXPR(EIGEN_AVX_MAX_NUM_ROW < U1) { transB::template transposeLxL<1 * EIGEN_AVX_MAX_NUM_ROW>(ymm); }
+      transB::template storeBBlock<U1, toTemp, remM, 0>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+    }
+    else EIGEN_IF_CONSTEXPR(unrollN == 8 && U1 > 8) {
+      // load Lx4 B col major, transpose Lx4 row major
+      transB::template loadBBlock<8, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template transposeLxL<0>(ymm);
+      transB::template storeBBlock<8, toTemp, remM, 8>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+    }
+    else EIGEN_IF_CONSTEXPR(unrollN == 4 && U1 > 4) {
+      // load Lx4 B col major, transpose Lx4 row major
+      transB::template loadBBlock<4, toTemp, remM>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template transposeLxL<0>(ymm);
+      transB::template storeBBlock<4, toTemp, remM, 4>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+    }
+    else EIGEN_IF_CONSTEXPR(unrollN == 2) {
+      // load Lx2 B col major, transpose Lx2 row major
+      transB::template loadBBlock<2, toTemp, remM, 2>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template transposeLxL<0>(ymm);
+      transB::template storeBBlock<2, toTemp, remM, 2>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+    }
+    else EIGEN_IF_CONSTEXPR(unrollN == 1) {
+      // load Lx1 B col major, transpose Lx1 row major
+      transB::template loadBBlock<1, toTemp, remM, 1>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+      transB::template transposeLxL<0>(ymm);
+      transB::template storeBBlock<1, toTemp, remM, 1>(B_arr, LDB, B_temp, LDB_, ymm, remM_);
+    }
+  }
+};
+
+/**
+ * Unrolls for triSolveKernel
+ *
+ * Idea:
+ *  1) Load a block of right-hand sides to registers in RHSInPacket (using loadRHS).
+ *  2) Do triangular solve with RHSInPacket and a small block of A (triangular matrix)
+ *     stored in AInPacket (using triSolveMicroKernel).
+ *  3) Store final results (in avx registers) back into memory (using storeRHS).
+ *
+ *  RHSInPacket uses at most EIGEN_AVX_MAX_NUM_ACC avx registers and AInPacket uses at most
+ *  EIGEN_AVX_MAX_NUM_ROW registers.
+ */
+template <typename Scalar>
+class trsm {
+ public:
+  using vec = typename std::conditional<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>::type;
+  static constexpr int64_t PacketSize = packet_traits<Scalar>::size;
+
+  /***********************************
+   * Auxiliary Functions for:
+   *  - loadRHS
+   *  - storeRHS
+   *  - divRHSByDiag
+   *  - updateRHS
+   *  - triSolveMicroKernel
+   ************************************/
+  /**
+   * aux_loadRHS
+   *
+   * 2-D unroll
+   *      for(startM = 0; startM < endM; startM++)
+   *        for(startK = 0; startK < endK; startK++)
+   **/
+  template <bool isFWDSolve, int64_t endM, int64_t endK, int64_t counter, bool krem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_loadRHS(
+      Scalar *B_arr, int64_t LDB, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, int64_t rem = 0) {
+    constexpr int64_t counterReverse = endM * endK - counter;
+    constexpr int64_t startM = counterReverse / (endK);
+    constexpr int64_t startK = counterReverse % endK;
+
+    constexpr int64_t packetIndex = startM * endK + startK;
+    constexpr int64_t startM_ = isFWDSolve ? startM : -startM;
+    const int64_t rhsIndex = (startK * PacketSize) + startM_ * LDB;
+    EIGEN_IF_CONSTEXPR(krem) {
+      RHSInPacket.packet[packetIndex] = ploadu<vec>(&B_arr[rhsIndex], remMask<PacketSize>(rem));
+    }
+    else {
+      RHSInPacket.packet[packetIndex] = ploadu<vec>(&B_arr[rhsIndex]);
+    }
+    aux_loadRHS<isFWDSolve, endM, endK, counter - 1, krem>(B_arr, LDB, RHSInPacket, rem);
+  }
+
+  template <bool isFWDSolve, int64_t endM, int64_t endK, int64_t counter, bool krem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_loadRHS(
+      Scalar *B_arr, int64_t LDB, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, int64_t rem = 0) {
+    EIGEN_UNUSED_VARIABLE(B_arr);
+    EIGEN_UNUSED_VARIABLE(LDB);
+    EIGEN_UNUSED_VARIABLE(RHSInPacket);
+    EIGEN_UNUSED_VARIABLE(rem);
+  }
+
+  /**
+   * aux_storeRHS
+   *
+   * 2-D unroll
+   *      for(startM = 0; startM < endM; startM++)
+   *        for(startK = 0; startK < endK; startK++)
+   **/
+  template <bool isFWDSolve, int64_t endM, int64_t endK, int64_t counter, bool krem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_storeRHS(
+      Scalar *B_arr, int64_t LDB, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, int64_t rem = 0) {
+    constexpr int64_t counterReverse = endM * endK - counter;
+    constexpr int64_t startM = counterReverse / (endK);
+    constexpr int64_t startK = counterReverse % endK;
+
+    constexpr int64_t packetIndex = startM * endK + startK;
+    constexpr int64_t startM_ = isFWDSolve ? startM : -startM;
+    const int64_t rhsIndex = (startK * PacketSize) + startM_ * LDB;
+    EIGEN_IF_CONSTEXPR(krem) {
+      pstoreu<Scalar>(&B_arr[rhsIndex], RHSInPacket.packet[packetIndex], remMask<PacketSize>(rem));
+    }
+    else {
+      pstoreu<Scalar>(&B_arr[rhsIndex], RHSInPacket.packet[packetIndex]);
+    }
+    aux_storeRHS<isFWDSolve, endM, endK, counter - 1, krem>(B_arr, LDB, RHSInPacket, rem);
+  }
+
+  template <bool isFWDSolve, int64_t endM, int64_t endK, int64_t counter, bool krem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_storeRHS(
+      Scalar *B_arr, int64_t LDB, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, int64_t rem = 0) {
+    EIGEN_UNUSED_VARIABLE(B_arr);
+    EIGEN_UNUSED_VARIABLE(LDB);
+    EIGEN_UNUSED_VARIABLE(RHSInPacket);
+    EIGEN_UNUSED_VARIABLE(rem);
+  }
+
+  /**
+   * aux_divRHSByDiag
+   *
+   * currM may be -1, (currM >=0) in enable_if checks for this
+   *
+   * 1-D unroll
+   *      for(startK = 0; startK < endK; startK++)
+   **/
+  template <int64_t currM, int64_t endK, int64_t counter>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0 && currM >= 0)> aux_divRHSByDiag(
+      PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
+    constexpr int64_t counterReverse = endK - counter;
+    constexpr int64_t startK = counterReverse;
+
+    constexpr int64_t packetIndex = currM * endK + startK;
+    RHSInPacket.packet[packetIndex] = pmul(AInPacket.packet[currM], RHSInPacket.packet[packetIndex]);
+    aux_divRHSByDiag<currM, endK, counter - 1>(RHSInPacket, AInPacket);
+  }
+
+  template <int64_t currM, int64_t endK, int64_t counter>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<!(counter > 0 && currM >= 0)> aux_divRHSByDiag(
+      PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
+    EIGEN_UNUSED_VARIABLE(RHSInPacket);
+    EIGEN_UNUSED_VARIABLE(AInPacket);
+  }
+
+  /**
+   * aux_updateRHS
+   *
+   * 2-D unroll
+   *      for(startM = initM; startM < endM; startM++)
+   *        for(startK = 0; startK < endK; startK++)
+   **/
+  template <bool isARowMajor, bool isFWDSolve, bool isUnitDiag, int64_t initM, int64_t endM, int64_t endK,
+            int64_t counter, int64_t currentM>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_updateRHS(
+      Scalar *A_arr, int64_t LDA, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket,
+      PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
+    constexpr int64_t counterReverse = (endM - initM) * endK - counter;
+    constexpr int64_t startM = initM + counterReverse / (endK);
+    constexpr int64_t startK = counterReverse % endK;
+
+    // For each row of A, first update all corresponding RHS
+    constexpr int64_t packetIndex = startM * endK + startK;
+    EIGEN_IF_CONSTEXPR(currentM > 0) {
+      RHSInPacket.packet[packetIndex] =
+          pnmadd(AInPacket.packet[startM], RHSInPacket.packet[(currentM - 1) * endK + startK],
+                 RHSInPacket.packet[packetIndex]);
+    }
+
+    EIGEN_IF_CONSTEXPR(startK == endK - 1) {
+      // Once all RHS for previous row of A is updated, we broadcast the next element in the column A_{i, currentM}.
+      EIGEN_IF_CONSTEXPR(startM == currentM && !isUnitDiag) {
+        // If diagonal is not unit, we broadcast reciprocals of diagonals AinPacket.packet[currentM].
+        // This will be used in divRHSByDiag
+        EIGEN_IF_CONSTEXPR(isFWDSolve)
+        AInPacket.packet[currentM] = pset1<vec>(Scalar(1) / A_arr[idA<isARowMajor>(currentM, currentM, LDA)]);
+        else AInPacket.packet[currentM] = pset1<vec>(Scalar(1) / A_arr[idA<isARowMajor>(-currentM, -currentM, LDA)]);
+      }
+      else {
+        // Broadcast next off diagonal element of A
+        EIGEN_IF_CONSTEXPR(isFWDSolve)
+        AInPacket.packet[startM] = pset1<vec>(A_arr[idA<isARowMajor>(startM, currentM, LDA)]);
+        else AInPacket.packet[startM] = pset1<vec>(A_arr[idA<isARowMajor>(-startM, -currentM, LDA)]);
+      }
+    }
+
+    aux_updateRHS<isARowMajor, isFWDSolve, isUnitDiag, initM, endM, endK, counter - 1, currentM>(
+        A_arr, LDA, RHSInPacket, AInPacket);
+  }
+
+  template <bool isARowMajor, bool isFWDSolve, bool isUnitDiag, int64_t initM, int64_t endM, int64_t endK,
+            int64_t counter, int64_t currentM>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_updateRHS(
+      Scalar *A_arr, int64_t LDA, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket,
+      PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
+    EIGEN_UNUSED_VARIABLE(A_arr);
+    EIGEN_UNUSED_VARIABLE(LDA);
+    EIGEN_UNUSED_VARIABLE(RHSInPacket);
+    EIGEN_UNUSED_VARIABLE(AInPacket);
+  }
+
+  /**
+   * aux_triSolverMicroKernel
+   *
+   * 1-D unroll
+   *      for(startM = 0; startM < endM; startM++)
+   **/
+  template <bool isARowMajor, bool isFWDSolve, bool isUnitDiag, int64_t endM, int64_t counter, int64_t numK>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_triSolveMicroKernel(
+      Scalar *A_arr, int64_t LDA, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket,
+      PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
+    constexpr int64_t counterReverse = endM - counter;
+    constexpr int64_t startM = counterReverse;
+
+    constexpr int64_t currentM = startM;
+    // Divides the right-hand side in row startM, by digonal value of A
+    // broadcasted to AInPacket.packet[startM-1] in the previous iteration.
+    //
+    // Without "if constexpr" the compiler instantiates the case <-1, numK>
+    // this is handled with enable_if to prevent out-of-bound warnings
+    // from the compiler
+    EIGEN_IF_CONSTEXPR(!isUnitDiag && startM > 0)
+    trsm::template divRHSByDiag<startM - 1, numK>(RHSInPacket, AInPacket);
+
+    // After division, the rhs corresponding to subsequent rows of A can be partially updated
+    // We also broadcast the reciprocal of the next diagonal to AInPacket.packet[currentM] (if needed)
+    // to be used in the next iteration.
+    trsm::template updateRHS<isARowMajor, isFWDSolve, isUnitDiag, startM, endM, numK, currentM>(A_arr, LDA, RHSInPacket,
+                                                                                                AInPacket);
+
+    // Handle division for the RHS corresponding to the final row of A.
+    EIGEN_IF_CONSTEXPR(!isUnitDiag && startM == endM - 1)
+    trsm::template divRHSByDiag<startM, numK>(RHSInPacket, AInPacket);
+
+    aux_triSolveMicroKernel<isARowMajor, isFWDSolve, isUnitDiag, endM, counter - 1, numK>(A_arr, LDA, RHSInPacket,
+                                                                                          AInPacket);
+  }
+
+  template <bool isARowMajor, bool isFWDSolve, bool isUnitDiag, int64_t endM, int64_t counter, int64_t numK>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_triSolveMicroKernel(
+      Scalar *A_arr, int64_t LDA, PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket,
+      PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
+    EIGEN_UNUSED_VARIABLE(A_arr);
+    EIGEN_UNUSED_VARIABLE(LDA);
+    EIGEN_UNUSED_VARIABLE(RHSInPacket);
+    EIGEN_UNUSED_VARIABLE(AInPacket);
+  }
+
+  /********************************************************
+   * Wrappers for aux_XXXX to hide counter parameter
+   ********************************************************/
+
+  /**
+   * Load endMxendK block of B to RHSInPacket
+   * Masked loads are used for cases where endK is not a multiple of PacketSize
+   */
+  template <bool isFWDSolve, int64_t endM, int64_t endK, bool krem = false>
+  static EIGEN_ALWAYS_INLINE void loadRHS(Scalar *B_arr, int64_t LDB,
+                                          PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, int64_t rem = 0) {
+    aux_loadRHS<isFWDSolve, endM, endK, endM * endK, krem>(B_arr, LDB, RHSInPacket, rem);
+  }
+
+  /**
+   * Load endMxendK block of B to RHSInPacket
+   * Masked loads are used for cases where endK is not a multiple of PacketSize
+   */
+  template <bool isFWDSolve, int64_t endM, int64_t endK, bool krem = false>
+  static EIGEN_ALWAYS_INLINE void storeRHS(Scalar *B_arr, int64_t LDB,
+                                           PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket, int64_t rem = 0) {
+    aux_storeRHS<isFWDSolve, endM, endK, endM * endK, krem>(B_arr, LDB, RHSInPacket, rem);
+  }
+
+  /**
+   * Only used if Triangular matrix has non-unit diagonal values
+   */
+  template <int64_t currM, int64_t endK>
+  static EIGEN_ALWAYS_INLINE void divRHSByDiag(PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket,
+                                               PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
+    aux_divRHSByDiag<currM, endK, endK>(RHSInPacket, AInPacket);
+  }
+
+  /**
+   * Update right-hand sides (stored in avx registers)
+   * Traversing along the column A_{i,currentM}, where currentM <= i <= endM, and broadcasting each value to AInPacket.
+   **/
+  template <bool isARowMajor, bool isFWDSolve, bool isUnitDiag, int64_t startM, int64_t endM, int64_t endK,
+            int64_t currentM>
+  static EIGEN_ALWAYS_INLINE void updateRHS(Scalar *A_arr, int64_t LDA,
+                                            PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket,
+                                            PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
+    aux_updateRHS<isARowMajor, isFWDSolve, isUnitDiag, startM, endM, endK, (endM - startM) * endK, currentM>(
+        A_arr, LDA, RHSInPacket, AInPacket);
+  }
+
+  /**
+   * endM: dimension of A. 1 <= endM <= EIGEN_AVX_MAX_NUM_ROW
+   * numK: number of avx registers to use for each row of B (ex fp32: 48 rhs => 3 avx reg used). 1 <= endK <= 3.
+   * isFWDSolve: true => forward substitution, false => backwards substitution
+   * isUnitDiag: true => triangular matrix has unit diagonal.
+   */
+  template <bool isARowMajor, bool isFWDSolve, bool isUnitDiag, int64_t endM, int64_t numK>
+  static EIGEN_ALWAYS_INLINE void triSolveMicroKernel(Scalar *A_arr, int64_t LDA,
+                                                      PacketBlock<vec, EIGEN_AVX_MAX_NUM_ACC> &RHSInPacket,
+                                                      PacketBlock<vec, EIGEN_AVX_MAX_NUM_ROW> &AInPacket) {
+    static_assert(numK >= 1 && numK <= 3, "numK out of range");
+    aux_triSolveMicroKernel<isARowMajor, isFWDSolve, isUnitDiag, endM, endM, numK>(A_arr, LDA, RHSInPacket, AInPacket);
+  }
+};
+
+/**
+ * Unrolls for gemm kernel
+ *
+ * isAdd: true => C += A*B, false => C -= A*B
+ */
+template <typename Scalar, bool isAdd>
+class gemm {
+ public:
+  using vec = typename std::conditional<std::is_same<Scalar, float>::value, vecFullFloat, vecFullDouble>::type;
+  static constexpr int64_t PacketSize = packet_traits<Scalar>::size;
+
+  /***********************************
+   * Auxiliary Functions for:
+   *  - setzero
+   *  - updateC
+   *  - storeC
+   *  - startLoadB
+   *  - triSolveMicroKernel
+   ************************************/
+
+  /**
+   * aux_setzero
+   *
+   * 2-D unroll
+   *      for(startM = 0; startM < endM; startM++)
+   *        for(startN = 0; startN < endN; startN++)
+   **/
+  template <int64_t endM, int64_t endN, int64_t counter>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_setzero(
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm) {
+    constexpr int64_t counterReverse = endM * endN - counter;
+    constexpr int64_t startM = counterReverse / (endN);
+    constexpr int64_t startN = counterReverse % endN;
+
+    zmm.packet[startN * endM + startM] = pzero(zmm.packet[startN * endM + startM]);
+    aux_setzero<endM, endN, counter - 1>(zmm);
+  }
+
+  template <int64_t endM, int64_t endN, int64_t counter>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_setzero(
+      PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm) {
+    EIGEN_UNUSED_VARIABLE(zmm);
+  }
+
+  /**
+   * aux_updateC
+   *
+   * 2-D unroll
+   *      for(startM = 0; startM < endM; startM++)
+   *        for(startN = 0; startN < endN; startN++)
+   **/
+  template <int64_t endM, int64_t endN, int64_t counter, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_updateC(
+      Scalar *C_arr, int64_t LDC, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(rem_);
+    constexpr int64_t counterReverse = endM * endN - counter;
+    constexpr int64_t startM = counterReverse / (endN);
+    constexpr int64_t startN = counterReverse % endN;
+
+    EIGEN_IF_CONSTEXPR(rem)
+    zmm.packet[startN * endM + startM] =
+        padd(ploadu<vec>(&C_arr[(startN)*LDC + startM * PacketSize], remMask<PacketSize>(rem_)),
+             zmm.packet[startN * endM + startM], remMask<PacketSize>(rem_));
+    else zmm.packet[startN * endM + startM] =
+        padd(ploadu<vec>(&C_arr[(startN)*LDC + startM * PacketSize]), zmm.packet[startN * endM + startM]);
+    aux_updateC<endM, endN, counter - 1, rem>(C_arr, LDC, zmm, rem_);
+  }
+
+  template <int64_t endM, int64_t endN, int64_t counter, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_updateC(
+      Scalar *C_arr, int64_t LDC, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(C_arr);
+    EIGEN_UNUSED_VARIABLE(LDC);
+    EIGEN_UNUSED_VARIABLE(zmm);
+    EIGEN_UNUSED_VARIABLE(rem_);
+  }
+
+  /**
+   * aux_storeC
+   *
+   * 2-D unroll
+   *      for(startM = 0; startM < endM; startM++)
+   *        for(startN = 0; startN < endN; startN++)
+   **/
+  template <int64_t endM, int64_t endN, int64_t counter, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_storeC(
+      Scalar *C_arr, int64_t LDC, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(rem_);
+    constexpr int64_t counterReverse = endM * endN - counter;
+    constexpr int64_t startM = counterReverse / (endN);
+    constexpr int64_t startN = counterReverse % endN;
+
+    EIGEN_IF_CONSTEXPR(rem)
+    pstoreu<Scalar>(&C_arr[(startN)*LDC + startM * PacketSize], zmm.packet[startN * endM + startM],
+                    remMask<PacketSize>(rem_));
+    else pstoreu<Scalar>(&C_arr[(startN)*LDC + startM * PacketSize], zmm.packet[startN * endM + startM]);
+    aux_storeC<endM, endN, counter - 1, rem>(C_arr, LDC, zmm, rem_);
+  }
+
+  template <int64_t endM, int64_t endN, int64_t counter, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_storeC(
+      Scalar *C_arr, int64_t LDC, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(C_arr);
+    EIGEN_UNUSED_VARIABLE(LDC);
+    EIGEN_UNUSED_VARIABLE(zmm);
+    EIGEN_UNUSED_VARIABLE(rem_);
+  }
+
+  /**
+   * aux_startLoadB
+   *
+   * 1-D unroll
+   *      for(startL = 0; startL < endL; startL++)
+   **/
+  template <int64_t unrollM, int64_t unrollN, int64_t endL, int64_t counter, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_startLoadB(
+      Scalar *B_t, int64_t LDB, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(rem_);
+    constexpr int64_t counterReverse = endL - counter;
+    constexpr int64_t startL = counterReverse;
+
+    EIGEN_IF_CONSTEXPR(rem)
+    zmm.packet[unrollM * unrollN + startL] =
+        ploadu<vec>(&B_t[(startL / unrollM) * LDB + (startL % unrollM) * PacketSize], remMask<PacketSize>(rem_));
+    else zmm.packet[unrollM * unrollN + startL] =
+        ploadu<vec>(&B_t[(startL / unrollM) * LDB + (startL % unrollM) * PacketSize]);
+
+    aux_startLoadB<unrollM, unrollN, endL, counter - 1, rem>(B_t, LDB, zmm, rem_);
+  }
+
+  template <int64_t unrollM, int64_t unrollN, int64_t endL, int64_t counter, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_startLoadB(
+      Scalar *B_t, int64_t LDB, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(B_t);
+    EIGEN_UNUSED_VARIABLE(LDB);
+    EIGEN_UNUSED_VARIABLE(zmm);
+    EIGEN_UNUSED_VARIABLE(rem_);
+  }
+
+  /**
+   * aux_startBCastA
+   *
+   * 1-D unroll
+   *      for(startB = 0; startB < endB; startB++)
+   **/
+  template <bool isARowMajor, int64_t unrollM, int64_t unrollN, int64_t endB, int64_t counter, int64_t numLoad>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_startBCastA(
+      Scalar *A_t, int64_t LDA, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm) {
+    constexpr int64_t counterReverse = endB - counter;
+    constexpr int64_t startB = counterReverse;
+
+    zmm.packet[unrollM * unrollN + numLoad + startB] = pload1<vec>(&A_t[idA<isARowMajor>(startB, 0, LDA)]);
+
+    aux_startBCastA<isARowMajor, unrollM, unrollN, endB, counter - 1, numLoad>(A_t, LDA, zmm);
+  }
+
+  template <bool isARowMajor, int64_t unrollM, int64_t unrollN, int64_t endB, int64_t counter, int64_t numLoad>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_startBCastA(
+      Scalar *A_t, int64_t LDA, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm) {
+    EIGEN_UNUSED_VARIABLE(A_t);
+    EIGEN_UNUSED_VARIABLE(LDA);
+    EIGEN_UNUSED_VARIABLE(zmm);
+  }
+
+  /**
+   * aux_loadB
+   * currK: current K
+   *
+   * 1-D unroll
+   *      for(startM = 0; startM < endM; startM++)
+   **/
+  template <int64_t endM, int64_t counter, int64_t unrollN, int64_t currK, int64_t unrollK, int64_t numLoad,
+            int64_t numBCast, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_loadB(
+      Scalar *B_t, int64_t LDB, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(rem_);
+    if ((numLoad / endM + currK < unrollK)) {
+      constexpr int64_t counterReverse = endM - counter;
+      constexpr int64_t startM = counterReverse;
+
+      EIGEN_IF_CONSTEXPR(rem) {
+        zmm.packet[endM * unrollN + (startM + currK * endM) % numLoad] =
+            ploadu<vec>(&B_t[(numLoad / endM + currK) * LDB + startM * PacketSize], remMask<PacketSize>(rem_));
+      }
+      else {
+        zmm.packet[endM * unrollN + (startM + currK * endM) % numLoad] =
+            ploadu<vec>(&B_t[(numLoad / endM + currK) * LDB + startM * PacketSize]);
+      }
+
+      aux_loadB<endM, counter - 1, unrollN, currK, unrollK, numLoad, numBCast, rem>(B_t, LDB, zmm, rem_);
+    }
+  }
+
+  template <int64_t endM, int64_t counter, int64_t unrollN, int64_t currK, int64_t unrollK, int64_t numLoad,
+            int64_t numBCast, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_loadB(
+      Scalar *B_t, int64_t LDB, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm, int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(B_t);
+    EIGEN_UNUSED_VARIABLE(LDB);
+    EIGEN_UNUSED_VARIABLE(zmm);
+    EIGEN_UNUSED_VARIABLE(rem_);
+  }
+
+  /**
+   * aux_microKernel
+   *
+   * 3-D unroll
+   *      for(startM = 0; startM < endM; startM++)
+   *        for(startN = 0; startN < endN; startN++)
+   *          for(startK = 0; startK < endK; startK++)
+   **/
+  template <bool isARowMajor, int64_t endM, int64_t endN, int64_t endK, int64_t counter, int64_t numLoad,
+            int64_t numBCast, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter > 0)> aux_microKernel(
+      Scalar *B_t, Scalar *A_t, int64_t LDB, int64_t LDA, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
+      int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(rem_);
+    constexpr int64_t counterReverse = endM * endN * endK - counter;
+    constexpr int startK = counterReverse / (endM * endN);
+    constexpr int startN = (counterReverse / (endM)) % endN;
+    constexpr int startM = counterReverse % endM;
+
+    EIGEN_IF_CONSTEXPR(startK == 0 && startM == 0 && startN == 0) {
+      gemm::template startLoadB<endM, endN, numLoad, rem>(B_t, LDB, zmm, rem_);
+      gemm::template startBCastA<isARowMajor, endM, endN, numBCast, numLoad>(A_t, LDA, zmm);
+    }
+
+    {
+      // Interleave FMA and Bcast
+      EIGEN_IF_CONSTEXPR(isAdd) {
+        zmm.packet[startN * endM + startM] =
+            pmadd(zmm.packet[endM * endN + numLoad + (startN + startK * endN) % numBCast],
+                  zmm.packet[endM * endN + (startM + startK * endM) % numLoad], zmm.packet[startN * endM + startM]);
+      }
+      else {
+        zmm.packet[startN * endM + startM] =
+            pnmadd(zmm.packet[endM * endN + numLoad + (startN + startK * endN) % numBCast],
+                   zmm.packet[endM * endN + (startM + startK * endM) % numLoad], zmm.packet[startN * endM + startM]);
+      }
+      // Bcast
+      EIGEN_IF_CONSTEXPR(startM == endM - 1 && (numBCast + startN + startK * endN < endK * endN)) {
+        zmm.packet[endM * endN + numLoad + (startN + startK * endN) % numBCast] = pload1<vec>(&A_t[idA<isARowMajor>(
+            (numBCast + startN + startK * endN) % endN, (numBCast + startN + startK * endN) / endN, LDA)]);
+      }
+    }
+
+    // We have updated all accumulators, time to load next set of B's
+    EIGEN_IF_CONSTEXPR((startN == endN - 1) && (startM == endM - 1)) {
+      gemm::template loadB<endM, endN, startK, endK, numLoad, numBCast, rem>(B_t, LDB, zmm, rem_);
+    }
+    aux_microKernel<isARowMajor, endM, endN, endK, counter - 1, numLoad, numBCast, rem>(B_t, A_t, LDB, LDA, zmm, rem_);
+  }
+
+  template <bool isARowMajor, int64_t endM, int64_t endN, int64_t endK, int64_t counter, int64_t numLoad,
+            int64_t numBCast, bool rem>
+  static EIGEN_ALWAYS_INLINE std::enable_if_t<(counter <= 0)> aux_microKernel(
+      Scalar *B_t, Scalar *A_t, int64_t LDB, int64_t LDA, PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
+      int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(B_t);
+    EIGEN_UNUSED_VARIABLE(A_t);
+    EIGEN_UNUSED_VARIABLE(LDB);
+    EIGEN_UNUSED_VARIABLE(LDA);
+    EIGEN_UNUSED_VARIABLE(zmm);
+    EIGEN_UNUSED_VARIABLE(rem_);
+  }
+
+  /********************************************************
+   * Wrappers for aux_XXXX to hide counter parameter
+   ********************************************************/
+
+  template <int64_t endM, int64_t endN>
+  static EIGEN_ALWAYS_INLINE void setzero(PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm) {
+    aux_setzero<endM, endN, endM * endN>(zmm);
+  }
+
+  /**
+   * Ideally the compiler folds these into vaddp{s,d} with an embedded memory load.
+   */
+  template <int64_t endM, int64_t endN, bool rem = false>
+  static EIGEN_ALWAYS_INLINE void updateC(Scalar *C_arr, int64_t LDC,
+                                          PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
+                                          int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(rem_);
+    aux_updateC<endM, endN, endM * endN, rem>(C_arr, LDC, zmm, rem_);
+  }
+
+  template <int64_t endM, int64_t endN, bool rem = false>
+  static EIGEN_ALWAYS_INLINE void storeC(Scalar *C_arr, int64_t LDC,
+                                         PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
+                                         int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(rem_);
+    aux_storeC<endM, endN, endM * endN, rem>(C_arr, LDC, zmm, rem_);
+  }
+
+  /**
+   * Use numLoad registers for loading B at start of microKernel
+   */
+  template <int64_t unrollM, int64_t unrollN, int64_t endL, bool rem>
+  static EIGEN_ALWAYS_INLINE void startLoadB(Scalar *B_t, int64_t LDB,
+                                             PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
+                                             int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(rem_);
+    aux_startLoadB<unrollM, unrollN, endL, endL, rem>(B_t, LDB, zmm, rem_);
+  }
+
+  /**
+   * Use numBCast registers for broadcasting A at start of microKernel
+   */
+  template <bool isARowMajor, int64_t unrollM, int64_t unrollN, int64_t endB, int64_t numLoad>
+  static EIGEN_ALWAYS_INLINE void startBCastA(Scalar *A_t, int64_t LDA,
+                                              PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm) {
+    aux_startBCastA<isARowMajor, unrollM, unrollN, endB, endB, numLoad>(A_t, LDA, zmm);
+  }
+
+  /**
+   * Loads next set of B into vector registers between each K unroll.
+   */
+  template <int64_t endM, int64_t unrollN, int64_t currK, int64_t unrollK, int64_t numLoad, int64_t numBCast, bool rem>
+  static EIGEN_ALWAYS_INLINE void loadB(Scalar *B_t, int64_t LDB,
+                                        PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
+                                        int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(rem_);
+    aux_loadB<endM, endM, unrollN, currK, unrollK, numLoad, numBCast, rem>(B_t, LDB, zmm, rem_);
+  }
+
+  /**
+   * Generates a microkernel for gemm (row-major) with unrolls {1,2,4,8}x{U1,U2,U3} to compute C -= A*B.
+   * A matrix can be row/col-major. B matrix is assumed row-major.
+   *
+   * isARowMajor: is A row major
+   * endM: Number registers per row
+   * endN: Number of rows
+   * endK: Loop unroll for K.
+   * numLoad: Number of registers for loading B.
+   * numBCast: Number of registers for broadcasting A.
+   *
+   * Ex: microkernel<isARowMajor,0,3,0,4,0,4,6,2>: 8x48 unroll (24 accumulators), k unrolled 4 times,
+   * 6 register for loading B, 2 for broadcasting A.
+   *
+   * Note: Ideally the microkernel should not have any register spilling.
+   * The avx instruction counts should be:
+   *   - endK*endN vbroadcasts{s,d}
+   *   - endK*endM vmovup{s,d}
+   *   - endK*endN*endM FMAs
+   *
+   * From testing, there are no register spills with clang. There are register spills with GNU, which
+   * causes a performance hit.
+   */
+  template <bool isARowMajor, int64_t endM, int64_t endN, int64_t endK, int64_t numLoad, int64_t numBCast,
+            bool rem = false>
+  static EIGEN_ALWAYS_INLINE void microKernel(Scalar *B_t, Scalar *A_t, int64_t LDB, int64_t LDA,
+                                              PacketBlock<vec, EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS> &zmm,
+                                              int64_t rem_ = 0) {
+    EIGEN_UNUSED_VARIABLE(rem_);
+    aux_microKernel<isARowMajor, endM, endN, endK, endM * endN * endK, numLoad, numBCast, rem>(B_t, A_t, LDB, LDA, zmm,
+                                                                                               rem_);
+  }
+};
+}  // namespace unrolls
+
+#endif  // EIGEN_CORE_ARCH_AVX512_TRSM_UNROLLS_H
diff --git a/inst/include/Eigen/src/Core/arch/AVX512/TypeCasting.h b/inst/include/Eigen/src/Core/arch/AVX512/TypeCasting.h
new file mode 100644
index 00000000..fc55fd86
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AVX512/TypeCasting.h
@@ -0,0 +1,277 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2019 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_AVX512_H
+#define EIGEN_TYPE_CASTING_AVX512_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <>
+struct type_casting_traits<float, bool> : vectorized_type_casting_traits<float, bool> {};
+template <>
+struct type_casting_traits<bool, float> : vectorized_type_casting_traits<bool, float> {};
+
+template <>
+struct type_casting_traits<float, int> : vectorized_type_casting_traits<float, int> {};
+template <>
+struct type_casting_traits<int, float> : vectorized_type_casting_traits<int, float> {};
+
+template <>
+struct type_casting_traits<float, double> : vectorized_type_casting_traits<float, double> {};
+template <>
+struct type_casting_traits<double, float> : vectorized_type_casting_traits<double, float> {};
+
+template <>
+struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {};
+template <>
+struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};
+
+template <>
+struct type_casting_traits<double, int64_t> : vectorized_type_casting_traits<double, int64_t> {};
+template <>
+struct type_casting_traits<int64_t, double> : vectorized_type_casting_traits<int64_t, double> {};
+
+template <>
+struct type_casting_traits<half, float> : vectorized_type_casting_traits<half, float> {};
+template <>
+struct type_casting_traits<float, half> : vectorized_type_casting_traits<float, half> {};
+
+template <>
+struct type_casting_traits<bfloat16, float> : vectorized_type_casting_traits<bfloat16, float> {};
+template <>
+struct type_casting_traits<float, bfloat16> : vectorized_type_casting_traits<float, bfloat16> {};
+
+template <>
+EIGEN_STRONG_INLINE Packet16b pcast<Packet16f, Packet16b>(const Packet16f& a) {
+  __mmask16 mask = _mm512_cmpneq_ps_mask(a, pzero(a));
+  return _mm512_maskz_cvtepi32_epi8(mask, _mm512_set1_epi32(1));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet16b, Packet16f>(const Packet16b& a) {
+  return _mm512_cvtepi32_ps(_mm512_and_si512(_mm512_cvtepi8_epi32(a), _mm512_set1_epi32(1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i pcast<Packet16f, Packet16i>(const Packet16f& a) {
+  return _mm512_cvttps_epi32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pcast<Packet16f, Packet8d>(const Packet16f& a) {
+  return _mm512_cvtps_pd(_mm512_castps512_ps256(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pcast<Packet8f, Packet8d>(const Packet8f& a) {
+  return _mm512_cvtps_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8l pcast<Packet8d, Packet8l>(const Packet8d& a) {
+#if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVX512VL)
+  return _mm512_cvttpd_epi64(a);
+#else
+  constexpr int kTotalBits = sizeof(double) * CHAR_BIT, kMantissaBits = std::numeric_limits<double>::digits - 1,
+                kExponentBits = kTotalBits - kMantissaBits - 1, kBias = (1 << (kExponentBits - 1)) - 1;
+
+  const __m512i cst_one = _mm512_set1_epi64(1);
+  const __m512i cst_total_bits = _mm512_set1_epi64(kTotalBits);
+  const __m512i cst_bias = _mm512_set1_epi64(kBias);
+
+  __m512i a_bits = _mm512_castpd_si512(a);
+  // shift left by 1 to clear the sign bit, and shift right by kMantissaBits + 1 to recover biased exponent
+  __m512i biased_e = _mm512_srli_epi64(_mm512_slli_epi64(a_bits, 1), kMantissaBits + 1);
+  __m512i e = _mm512_sub_epi64(biased_e, cst_bias);
+
+  // shift to the left by kExponentBits + 1 to clear the sign and exponent bits
+  __m512i shifted_mantissa = _mm512_slli_epi64(a_bits, kExponentBits + 1);
+  // shift to the right by kTotalBits - e to convert the significand to an integer
+  __m512i result_significand = _mm512_srlv_epi64(shifted_mantissa, _mm512_sub_epi64(cst_total_bits, e));
+
+  // add the implied bit
+  __m512i result_exponent = _mm512_sllv_epi64(cst_one, e);
+  // e <= 0 is interpreted as a large positive shift (2's complement), which also conveniently results in zero
+  __m512i result = _mm512_add_epi64(result_significand, result_exponent);
+  // handle negative arguments
+  __mmask8 sign_mask = _mm512_cmplt_epi64_mask(a_bits, _mm512_setzero_si512());
+  result = _mm512_mask_sub_epi64(result, sign_mask, _mm512_setzero_si512(), result);
+  return result;
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet16i, Packet16f>(const Packet16i& a) {
+  return _mm512_cvtepi32_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pcast<Packet16i, Packet8d>(const Packet16i& a) {
+  return _mm512_cvtepi32_pd(_mm512_castsi512_si256(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pcast<Packet8i, Packet8d>(const Packet8i& a) {
+  return _mm512_cvtepi32_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d pcast<Packet8l, Packet8d>(const Packet8l& a) {
+#if defined(EIGEN_VECTORIZE_AVX512DQ) && defined(EIGEN_VECTORIZE_AVX512VL)
+  return _mm512_cvtepi64_pd(a);
+#else
+  EIGEN_ALIGN64 int64_t aux[8];
+  pstore(aux, a);
+  return _mm512_set_pd(static_cast<double>(aux[7]), static_cast<double>(aux[6]), static_cast<double>(aux[5]),
+                       static_cast<double>(aux[4]), static_cast<double>(aux[3]), static_cast<double>(aux[2]),
+                       static_cast<double>(aux[1]), static_cast<double>(aux[0]));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet8d, Packet16f>(const Packet8d& a, const Packet8d& b) {
+  return cat256(_mm512_cvtpd_ps(a), _mm512_cvtpd_ps(b));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i pcast<Packet8d, Packet16i>(const Packet8d& a, const Packet8d& b) {
+  return cat256i(_mm512_cvttpd_epi32(a), _mm512_cvttpd_epi32(b));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8i pcast<Packet8d, Packet8i>(const Packet8d& a) {
+  return _mm512_cvtpd_epi32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet8d, Packet8f>(const Packet8d& a) {
+  return _mm512_cvtpd_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16i preinterpret<Packet16i, Packet16f>(const Packet16f& a) {
+  return _mm512_castps_si512(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet16i>(const Packet16i& a) {
+  return _mm512_castsi512_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet16f>(const Packet16f& a) {
+  return _mm512_castps_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet8l>(const Packet8l& a) {
+  return _mm512_castsi512_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8l preinterpret<Packet8l, Packet8d>(const Packet8d& a) {
+  return _mm512_castpd_si512(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet8d>(const Packet8d& a) {
+  return _mm512_castpd_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f preinterpret<Packet8f, Packet16f>(const Packet16f& a) {
+  return _mm512_castps512_ps256(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet16f>(const Packet16f& a) {
+  return _mm512_castps512_ps128(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4d preinterpret<Packet4d, Packet8d>(const Packet8d& a) {
+  return _mm512_castpd512_pd256(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet8d>(const Packet8d& a) {
+  return _mm512_castpd512_pd128(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet8f>(const Packet8f& a) {
+  return _mm512_castps256_ps512(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f preinterpret<Packet16f, Packet4f>(const Packet4f& a) {
+  return _mm512_castps128_ps512(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet4d>(const Packet4d& a) {
+  return _mm512_castpd256_pd512(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8d preinterpret<Packet8d, Packet2d>(const Packet2d& a) {
+  return _mm512_castpd128_pd512(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8i preinterpret<Packet8i, Packet16i>(const Packet16i& a) {
+  return _mm512_castsi512_si256(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet16i>(const Packet16i& a) {
+  return _mm512_castsi512_si128(a);
+}
+
+#ifndef EIGEN_VECTORIZE_AVX512FP16
+template <>
+EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet16h>(const Packet16h& a) {
+  return _mm256_castsi256_si128(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
+  return half2float(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
+  return float2half(a);
+}
+
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf preinterpret<Packet8bf, Packet16bf>(const Packet16bf& a) {
+  return _mm256_castsi256_si128(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet16bf, Packet16f>(const Packet16bf& a) {
+  return Bf16ToF32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16bf pcast<Packet16f, Packet16bf>(const Packet16f& a) {
+  return F32ToBf16(a);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_TYPE_CASTING_AVX512_H
diff --git a/inst/include/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h b/inst/include/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h
new file mode 100644
index 00000000..f06f13df
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AVX512/TypeCastingFP16.h
@@ -0,0 +1,130 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 The Eigen Authors.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_FP16_AVX512_H
+#define EIGEN_TYPE_CASTING_FP16_AVX512_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+template <>
+EIGEN_STRONG_INLINE Packet32s preinterpret<Packet32s, Packet32h>(const Packet32h& a) {
+  return _mm512_castph_si512(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16s preinterpret<Packet16s, Packet16h>(const Packet16h& a) {
+  return _mm256_castph_si256(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s preinterpret<Packet8s, Packet8h>(const Packet8h& a) {
+  return _mm_castph_si128(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32h preinterpret<Packet32h, Packet32s>(const Packet32s& a) {
+  return _mm512_castsi512_ph(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16h preinterpret<Packet16h, Packet16s>(const Packet16s& a) {
+  return _mm256_castsi256_ph(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8h preinterpret<Packet8h, Packet8s>(const Packet8s& a) {
+  return _mm_castsi128_ph(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet16h, Packet16f>(const Packet16h& a) {
+  return half2float(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet8h, Packet8f>(const Packet8h& a) {
+  return half2float(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16h pcast<Packet16f, Packet16h>(const Packet16f& a) {
+  return float2half(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8h pcast<Packet8f, Packet8h>(const Packet8f& a) {
+  return float2half(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f pcast<Packet32h, Packet16f>(const Packet32h& a) {
+  // Discard second-half of input.
+  Packet16h low = _mm256_castpd_ph(_mm512_extractf64x4_pd(_mm512_castph_pd(a), 0));
+  return _mm512_cvtxph_ps(low);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcast<Packet16h, Packet8f>(const Packet16h& a) {
+  // Discard second-half of input.
+  Packet8h low = _mm_castps_ph(_mm256_extractf32x4_ps(_mm256_castph_ps(a), 0));
+  return _mm256_cvtxph_ps(low);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet8h, Packet4f>(const Packet8h& a) {
+  Packet8f full = _mm256_cvtxph_ps(a);
+  // Discard second-half of input.
+  return _mm256_extractf32x4_ps(full, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcast<Packet16f, Packet32h>(const Packet16f& a, const Packet16f& b) {
+  __m512 result = _mm512_castsi512_ps(_mm512_castsi256_si512(_mm256_castph_si256(_mm512_cvtxps_ph(a))));
+  result = _mm512_insertf32x8(result, _mm256_castph_ps(_mm512_cvtxps_ph(b)), 1);
+  return _mm512_castps_ph(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16h pcast<Packet8f, Packet16h>(const Packet8f& a, const Packet8f& b) {
+  __m256 result = _mm256_castsi256_ps(_mm256_castsi128_si256(_mm_castph_si128(_mm256_cvtxps_ph(a))));
+  result = _mm256_insertf32x4(result, _mm_castph_ps(_mm256_cvtxps_ph(b)), 1);
+  return _mm256_castps_ph(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8h pcast<Packet4f, Packet8h>(const Packet4f& a, const Packet4f& b) {
+  __m256 result = _mm256_castsi256_ps(_mm256_castsi128_si256(_mm_castps_si128(a)));
+  result = _mm256_insertf128_ps(result, b, 1);
+  return _mm256_cvtxps_ph(result);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32s pcast<Packet32h, Packet32s>(const Packet32h& a) {
+  return _mm512_cvtph_epi16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16s pcast<Packet16h, Packet16s>(const Packet16h& a) {
+  return _mm256_cvtph_epi16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet8h, Packet8s>(const Packet8h& a) {
+  return _mm_cvtph_epi16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32h pcast<Packet32s, Packet32h>(const Packet32s& a) {
+  return _mm512_cvtepi16_ph(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16h pcast<Packet16s, Packet16h>(const Packet16s& a) {
+  return _mm256_cvtepi16_ph(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8h pcast<Packet8s, Packet8h>(const Packet8s& a) {
+  return _mm_cvtepi16_ph(a);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_TYPE_CASTING_FP16_AVX512_H
diff --git a/inst/include/Eigen/src/Core/arch/AltiVec/Complex.h b/inst/include/Eigen/src/Core/arch/AltiVec/Complex.h
index 68d9a2bf..d6df59af 100644
--- a/inst/include/Eigen/src/Core/arch/AltiVec/Complex.h
+++ b/inst/include/Eigen/src/Core/arch/AltiVec/Complex.h
@@ -2,216 +2,652 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010-2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_COMPLEX_ALTIVEC_H
-#define EIGEN_COMPLEX_ALTIVEC_H
+#ifndef EIGEN_COMPLEX32_ALTIVEC_H
+#define EIGEN_COMPLEX32_ALTIVEC_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
 
 namespace Eigen {
 
 namespace internal {
 
-static Packet4ui  p4ui_CONJ_XOR = vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_ZERO_);//{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
-static Packet16uc p16uc_COMPLEX_RE   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
-static Packet16uc p16uc_COMPLEX_IM   = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
-static Packet16uc p16uc_COMPLEX_REV  = vec_sld(p16uc_REVERSE, p16uc_REVERSE, 8);//{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
-static Packet16uc p16uc_COMPLEX_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);//{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
-static Packet16uc p16uc_PSET_HI = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 1));//{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
-static Packet16uc p16uc_PSET_LO = (Packet16uc) vec_mergeh((Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 2), (Packet4ui) vec_splat((Packet4ui)p16uc_FORWARD, 3));//{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+inline Packet4ui p4ui_CONJ_XOR() {
+  return vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);  //{ 0x00000000, 0x80000000, 0x00000000, 0x80000000 };
+}
+#ifdef EIGEN_VECTORIZE_VSX
+#if defined(_BIG_ENDIAN)
+inline Packet2ul p2ul_CONJ_XOR1() {
+  return (Packet2ul)vec_sld((Packet4ui)p2d_MZERO, (Packet4ui)p2l_ZERO,
+                            8);  //{ 0x8000000000000000, 0x0000000000000000 };
+}
+inline Packet2ul p2ul_CONJ_XOR2() {
+  return (Packet2ul)vec_sld((Packet4ui)p2l_ZERO, (Packet4ui)p2d_MZERO,
+                            8);  //{ 0x8000000000000000, 0x0000000000000000 };
+}
+#else
+inline Packet2ul p2ul_CONJ_XOR1() {
+  return (Packet2ul)vec_sld((Packet4ui)p2l_ZERO, (Packet4ui)p2d_MZERO,
+                            8);  //{ 0x8000000000000000, 0x0000000000000000 };
+}
+inline Packet2ul p2ul_CONJ_XOR2() {
+  return (Packet2ul)vec_sld((Packet4ui)p2d_MZERO, (Packet4ui)p2l_ZERO,
+                            8);  //{ 0x8000000000000000, 0x0000000000000000 };
+}
+#endif
+#endif
 
 //---------- float ----------
-struct Packet2cf
-{
-  EIGEN_STRONG_INLINE Packet2cf() {}
+struct Packet2cf {
+  EIGEN_STRONG_INLINE explicit Packet2cf() {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
-  Packet4f  v;
+
+  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) {
+    Packet4f v1, v2;
+
+    // Permute and multiply the real parts of a and b
+    v1 = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
+    // Get the imaginary parts of a
+    v2 = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
+    // multiply a_re * b
+    v1 = vec_madd(v1, b.v, p4f_ZERO);
+    // multiply a_im * b and get the conjugate result
+    v2 = vec_madd(v2, b.v, p4f_ZERO);
+    v2 = reinterpret_cast<Packet4f>(pxor(v2, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR())));
+    // permute back to a proper order
+    v2 = vec_perm(v2, v2, p16uc_COMPLEX32_REV);
+
+    return Packet2cf(padd<Packet4f>(v1, v2));
+  }
+
+  EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
+    v = pmul(Packet2cf(*this), b).v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const { return Packet2cf(*this) *= b; }
+
+  EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const { return Packet2cf(*this) += b; }
+  EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const { return Packet2cf(*this) -= b; }
+  EIGEN_STRONG_INLINE Packet2cf operator-(void) const { return Packet2cf(-v); }
+
+  Packet4f v;
 };
 
-template<> struct packet_traits<std::complex<float> >  : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
   typedef Packet2cf type;
+  typedef Packet2cf half;
+  typedef Packet4f as_real;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
 
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
     HasNegate = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSqrt = 1,
+    HasLog = 1,
+    HasExp = 1,
+#ifdef EIGEN_VECTORIZE_VSX
+    HasBlend = 1,
+#endif
     HasSetLinear = 0
   };
 };
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+template <>
+struct unpacket_traits<Packet2cf> {
+  typedef std::complex<float> type;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet2cf half;
+  typedef Packet4f as_real;
+};
 
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
   Packet2cf res;
-  /* On AltiVec we cannot load 64-bit registers, so wa have to take care of alignment */
-  if((ptrdiff_t(&from) % 16) == 0)
-    res.v = pload<Packet4f>((const float *)&from);
+#ifdef EIGEN_VECTORIZE_VSX
+  // Load a single std::complex<float> from memory and duplicate
+  //
+  // Using pload would read past the end of the reference in this case
+  // Using vec_xl_len + vec_splat, generates poor assembly
+  __asm__("lxvdsx %x0,%y1" : "=wa"(res.v) : "Z"(from));
+#else
+  if ((std::ptrdiff_t(&from) % 16) == 0)
+    res.v = pload<Packet4f>((const float*)&from);
   else
-    res.v = ploadu<Packet4f>((const float *)&from);
-  res.v = vec_perm(res.v, res.v, p16uc_PSET_HI);
+    res.v = ploadu<Packet4f>((const float*)&from);
+  res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI);
+#endif
   return res;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_add(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_sub(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) { return Packet2cf((Packet4f)vec_xor((Packet4ui)a.v, p4ui_CONJ_XOR)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  Packet4f v1, v2;
-
-  // Permute and multiply the real parts of a and b
-  v1 = vec_perm(a.v, a.v, p16uc_COMPLEX_RE);
-  // Get the imaginary parts of a
-  v2 = vec_perm(a.v, a.v, p16uc_COMPLEX_IM);
-  // multiply a_re * b 
-  v1 = vec_madd(v1, b.v, p4f_ZERO);
-  // multiply a_im * b and get the conjugate result
-  v2 = vec_madd(v2, b.v, p4f_ZERO);
-  v2 = (Packet4f) vec_xor((Packet4ui)v2, p4ui_CONJ_XOR);
-  // permute back to a proper order
-  v2 = vec_perm(v2, v2, p16uc_COMPLEX_REV);
-  
-  return Packet2cf(vec_add(v1, v2));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_or(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_xor(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(vec_and(a.v, vec_nor(b.v,b.v))); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>*     from)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+  return Packet2cf(pload<Packet4f>((const float*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+  return Packet2cf(ploadu<Packet4f>((const float*)from));
+}
+template <>
+EIGEN_ALWAYS_INLINE Packet2cf pload_partial<Packet2cf>(const std::complex<float>* from, const Index n,
+                                                       const Index offset) {
+  return Packet2cf(pload_partial<Packet4f>((const float*)from, n * 2, offset * 2));
+}
+template <>
+EIGEN_ALWAYS_INLINE Packet2cf ploadu_partial<Packet2cf>(const std::complex<float>* from, const Index n,
+                                                        const Index offset) {
+  return Packet2cf(ploadu_partial<Packet4f>((const float*)from, n * 2, offset * 2));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
   return pset1<Packet2cf>(*from);
 }
 
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  pstore((float*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  pstoreu((float*)to, from.v);
+}
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<std::complex<float> >(std::complex<float>* to, const Packet2cf& from,
+                                                              const Index n, const Index offset) {
+  pstore_partial((float*)to, from.v, n * 2, offset * 2);
+}
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<float> >(std::complex<float>* to, const Packet2cf& from,
+                                                               const Index n, const Index offset) {
+  pstoreu_partial((float*)to, from.v, n * 2, offset * 2);
+}
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { vec_dstt((float *)addr, DST_CTRL(2,2,32), DST_CHAN); }
+EIGEN_STRONG_INLINE Packet2cf pload2(const std::complex<float>& from0, const std::complex<float>& from1) {
+  Packet4f res0, res1;
+#ifdef EIGEN_VECTORIZE_VSX
+  // Load two std::complex<float> from memory and combine
+  __asm__("lxsdx %x0,%y1" : "=wa"(res0) : "Z"(from0));
+  __asm__("lxsdx %x0,%y1" : "=wa"(res1) : "Z"(from1));
+#ifdef _BIG_ENDIAN
+  __asm__("xxpermdi %x0, %x1, %x2, 0" : "=wa"(res0) : "wa"(res0), "wa"(res1));
+#else
+  __asm__("xxpermdi %x0, %x2, %x1, 0" : "=wa"(res0) : "wa"(res0), "wa"(res1));
+#endif
+#else
+  *reinterpret_cast<std::complex<float>*>(&res0) = from0;
+  *reinterpret_cast<std::complex<float>*>(&res1) = from1;
+  res0 = vec_perm(res0, res1, p16uc_TRANSPOSE64_HI);
+#endif
+  return Packet2cf(res0);
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
-{
-  std::complex<float> EIGEN_ALIGN16 res[2];
-  pstore((float *)&res, a.v);
+template <>
+EIGEN_ALWAYS_INLINE Packet2cf pload_ignore<Packet2cf>(const std::complex<float>* from) {
+  Packet2cf res;
+  res.v = pload_ignore<Packet4f>(reinterpret_cast<const float*>(from));
+  return res;
+}
+
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_complex_size2(const Scalar* from, Index stride,
+                                                                   const Index n = 2) {
+  eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
+  EIGEN_ALIGN16 Scalar af[2];
+  for (Index i = 0; i < n; i++) {
+    af[i] = from[i * stride];
+  }
+  return pload_ignore<Packet>(af);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from,
+                                                                                        Index stride) {
+  return pgather_complex_size2<std::complex<float>, Packet2cf>(from, stride);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2cf
+pgather_partial<std::complex<float>, Packet2cf>(const std::complex<float>* from, Index stride, const Index n) {
+  return pgather_complex_size2<std::complex<float>, Packet2cf>(from, stride, n);
+}
+template <typename Scalar, typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_complex_size2(Scalar* to, const Packet& from, Index stride,
+                                                                  const Index n = 2) {
+  eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
+  EIGEN_ALIGN16 Scalar af[2];
+  pstore<Scalar>((Scalar*)af, from);
+  for (Index i = 0; i < n; i++) {
+    to[i * stride] = af[i];
+  }
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to,
+                                                                                    const Packet2cf& from,
+                                                                                    Index stride) {
+  pscatter_complex_size2<std::complex<float>, Packet2cf>(to, from, stride);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<std::complex<float>, Packet2cf>(std::complex<float>* to,
+                                                                                            const Packet2cf& from,
+                                                                                            Index stride,
+                                                                                            const Index n) {
+  pscatter_complex_size2<std::complex<float>, Packet2cf>(to, from, stride, n);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(a.v + b.v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(a.v - b.v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+  return Packet2cf(pnegate(a.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
+  return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR())));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(pand<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(por<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(pxor<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(pandnot<Packet4f>(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+  EIGEN_PPC_PREFETCH(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
+  EIGEN_ALIGN16 std::complex<float> res[2];
+  pstore((float*)&res, a.v);
 
   return res[0];
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
   Packet4f rev_a;
-  rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX_REV2);
+  rev_a = vec_sld(a.v, a.v, 8);
   return Packet2cf(rev_a);
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
   Packet4f b;
-  b = (Packet4f) vec_sld(a.v, a.v, 8);
-  b = padd(a.v, b);
-  return pfirst(Packet2cf(b));
+  b = vec_sld(a.v, a.v, 8);
+  b = padd<Packet4f>(a.v, b);
+  return pfirst<Packet2cf>(Packet2cf(b));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
-{
-  Packet4f b1, b2;
-  
-  b1 = (Packet4f) vec_sld(vecs[0].v, vecs[1].v, 8);
-  b2 = (Packet4f) vec_sld(vecs[1].v, vecs[0].v, 8);
-  b2 = (Packet4f) vec_sld(b2, b2, 8);
-  b2 = padd(b1, b2);
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
+  Packet4f b;
+  Packet2cf prod;
+  b = vec_sld(a.v, a.v, 8);
+  prod = pmul<Packet2cf>(a, Packet2cf(b));
 
-  return Packet2cf(b2);
+  return pfirst<Packet2cf>(prod);
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
-{
-  Packet4f b;
-  Packet2cf prod;
-  b = (Packet4f) vec_sld(a.v, a.v, 8);
-  prod = pmul(a, Packet2cf(b));
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
 
-  return pfirst(prod);
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return pdiv_complex(a, b);
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet2cf>
-{
-  static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
-  {
-    if (Offset==1)
-    {
-      first.v = vec_sld(first.v, second.v, 8);
-    }
-  }
-};
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x) {
+  return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV));
+}
 
-template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
+#ifdef EIGEN_VECTORIZE_VSX
+  Packet4f tmp = reinterpret_cast<Packet4f>(
+      vec_mergeh(reinterpret_cast<Packet2d>(kernel.packet[0].v), reinterpret_cast<Packet2d>(kernel.packet[1].v)));
+  kernel.packet[1].v = reinterpret_cast<Packet4f>(
+      vec_mergel(reinterpret_cast<Packet2d>(kernel.packet[0].v), reinterpret_cast<Packet2d>(kernel.packet[1].v)));
+#else
+  Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
+  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
+#endif
+  kernel.packet[0].v = tmp;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+  Packet4f eq = reinterpret_cast<Packet4f>(vec_cmpeq(a.v, b.v));
+  return Packet2cf(vec_and(eq, vec_perm(eq, eq, p16uc_COMPLEX32_REV)));
+}
+
+#ifdef EIGEN_VECTORIZE_VSX
+template <>
+EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
+                                     const Packet2cf& elsePacket) {
+  Packet2cf result;
+  result.v = reinterpret_cast<Packet4f>(
+      pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
+  return result;
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
+  return psqrt_complex<Packet2cf>(a);
+}
 
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return internal::pmul(a, pconj(b));
+template <>
+EIGEN_STRONG_INLINE Packet2cf plog<Packet2cf>(const Packet2cf& a) {
+  return plog_complex<Packet2cf>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pexp<Packet2cf>(const Packet2cf& a) {
+  return pexp_complex<Packet2cf>(a);
+}
+
+//---------- double ----------
+#ifdef EIGEN_VECTORIZE_VSX
+struct Packet1cd {
+  EIGEN_STRONG_INLINE Packet1cd() {}
+  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+
+  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) {
+    Packet2d a_re, a_im, v1, v2;
+
+    // Permute and multiply the real parts of a and b
+    a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
+    // Get the imaginary parts of a
+    a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
+    // multiply a_re * b
+    v1 = vec_madd(a_re, b.v, p2d_ZERO);
+    // multiply a_im * b and get the conjugate result
+    v2 = vec_madd(a_im, b.v, p2d_ZERO);
+    v2 = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(v2), reinterpret_cast<Packet4ui>(v2), 8));
+    v2 = pxor(v2, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR1()));
+
+    return Packet1cd(padd<Packet2d>(v1, v2));
   }
-};
 
-template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
+  EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
+    v = pmul(Packet1cd(*this), b).v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const { return Packet1cd(*this) *= b; }
 
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return internal::pmul(pconj(a), b);
+  EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
+    v = padd(v, b.v);
+    return *this;
   }
+  EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const { return Packet1cd(*this) += b; }
+  EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const { return Packet1cd(*this) -= b; }
+  EIGEN_STRONG_INLINE Packet1cd operator-(void) const { return Packet1cd(-v); }
+
+  Packet2d v;
 };
 
-template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
+  typedef Packet1cd type;
+  typedef Packet1cd half;
+  typedef Packet2d as_real;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 1,
 
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSqrt = 1,
+    HasLog = 1,
+    HasSetLinear = 0
+  };
+};
+
+template <>
+struct unpacket_traits<Packet1cd> {
+  typedef std::complex<double> type;
+  enum {
+    size = 1,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet1cd half;
+  typedef Packet2d as_real;
 };
 
-template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  // TODO optimize it for AltiVec
-  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
-  Packet4f s = vec_madd(b.v, b.v, p4f_ZERO);
-  return Packet2cf(pdiv(res.v, vec_add(s,vec_perm(s, s, p16uc_COMPLEX_REV))));
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+  return Packet1cd(pload<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+  return Packet1cd(ploadu<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_ALWAYS_INLINE Packet1cd pload_partial<Packet1cd>(const std::complex<double>* from, const Index n,
+                                                       const Index offset) {
+  return Packet1cd(pload_partial<Packet2d>((const double*)from, n * 2, offset * 2));
+}
+template <>
+EIGEN_ALWAYS_INLINE Packet1cd ploadu_partial<Packet1cd>(const std::complex<double>* from, const Index n,
+                                                        const Index offset) {
+  return Packet1cd(ploadu_partial<Packet2d>((const double*)from, n * 2, offset * 2));
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  pstore((double*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  pstoreu((double*)to, from.v);
+}
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<std::complex<double> >(std::complex<double>* to, const Packet1cd& from,
+                                                               const Index n, const Index offset) {
+  pstore_partial((double*)to, from.v, n * 2, offset * 2);
+}
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<std::complex<double> >(std::complex<double>* to, const Packet1cd& from,
+                                                                const Index n, const Index offset) {
+  pstoreu_partial((double*)to, from.v, n * 2, offset * 2);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd
+pset1<Packet1cd>(const std::complex<double>& from) { /* here we really have to use unaligned loads :( */
+  return ploadu<Packet1cd>(&from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd
+pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index) {
+  return pload<Packet1cd>(from);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet1cd
+pgather_partial<std::complex<double>, Packet1cd>(const std::complex<double>* from, Index, const Index) {
+  return pload<Packet1cd>(from);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to,
+                                                                                     const Packet1cd& from, Index) {
+  pstore<std::complex<double> >(to, from);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<std::complex<double>, Packet1cd>(std::complex<double>* to,
+                                                                                             const Packet1cd& from,
+                                                                                             Index, const Index) {
+  pstore<std::complex<double> >(to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(a.v + b.v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(a.v - b.v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+  return Packet1cd(pnegate(Packet2d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+  return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p2ul_CONJ_XOR2())));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(pand(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(por(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(pxor(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(pandnot(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+  return pset1<Packet1cd>(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+  EIGEN_PPC_PREFETCH(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
+  EIGEN_ALIGN16 std::complex<double> res[1];
+  pstore<std::complex<double> >(res, a);
+
+  return res[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
+  return pfirst(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
+  return pfirst(a);
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return pdiv_complex(a, b);
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
+  Packet2d tmp = vec_mergeh(kernel.packet[0].v, kernel.packet[1].v);
+  kernel.packet[1].v = vec_mergel(kernel.packet[0].v, kernel.packet[1].v);
+  kernel.packet[0].v = tmp;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
+  // Compare real and imaginary parts of a and b to get the mask vector:
+  // [re(a)==re(b), im(a)==im(b)]
+  Packet2d eq = reinterpret_cast<Packet2d>(vec_cmpeq(a.v, b.v));
+  // Swap real/imag elements in the mask in to get:
+  // [im(a)==im(b), re(a)==re(b)]
+  Packet2d eq_swapped =
+      reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(eq), reinterpret_cast<Packet4ui>(eq), 8));
+  // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped
+  return Packet1cd(vec_and(eq, eq_swapped));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
+  return psqrt_complex<Packet1cd>(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x)
-{
-  return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX_REV));
+template <>
+EIGEN_STRONG_INLINE Packet1cd plog<Packet1cd>(const Packet1cd& a) {
+  return plog_complex<Packet1cd>(a);
 }
 
-} // end namespace internal
+#endif  // __VSX__
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_COMPLEX_ALTIVEC_H
+#endif  // EIGEN_COMPLEX32_ALTIVEC_H
diff --git a/inst/include/Eigen/src/Core/arch/AltiVec/MathFunctions.h b/inst/include/Eigen/src/Core/arch/AltiVec/MathFunctions.h
new file mode 100644
index 00000000..c95ee388
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AltiVec/MathFunctions.h
@@ -0,0 +1,81 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_ALTIVEC_H
+#define EIGEN_MATH_FUNCTIONS_ALTIVEC_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet4f)
+#ifdef EIGEN_VECTORIZE_VSX
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet2d)
+#endif
+
+#ifdef EIGEN_VECTORIZE_VSX
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt<Packet4f>(const Packet4f& x) {
+  return vec_sqrt(x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d psqrt<Packet2d>(const Packet2d& x) {
+  return vec_sqrt(x);
+}
+
+#if !EIGEN_COMP_CLANG
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+  return pset1<Packet4f>(1.0f) / psqrt<Packet4f>(x);
+  //  vec_rsqrt returns different results from the generic version
+  //  return  vec_rsqrt(x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d prsqrt<Packet2d>(const Packet2d& x) {
+  return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
+  //  vec_rsqrt returns different results from the generic version
+  //  return  vec_rsqrt(x);
+}
+
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf psqrt<Packet8bf>(const Packet8bf& a) {
+  BF16_TO_F32_UNARY_OP_WRAPPER(psqrt<Packet4f>, a);
+}
+
+#if !EIGEN_COMP_CLANG
+template <>
+EIGEN_STRONG_INLINE Packet8bf prsqrt<Packet8bf>(const Packet8bf& a) {
+  BF16_TO_F32_UNARY_OP_WRAPPER(prsqrt<Packet4f>, a);
+}
+#endif
+#else
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt<Packet4f>(const Packet4f& x) {
+  Packet4f a;
+  for (Index i = 0; i < packet_traits<float>::size; i++) {
+    a[i] = numext::sqrt(x[i]);
+  }
+  return a;
+}
+#endif
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_ALTIVEC_H
diff --git a/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProduct.h b/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
new file mode 100644
index 00000000..47255261
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProduct.h
@@ -0,0 +1,3686 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com)
+// Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_PRODUCT_ALTIVEC_H
+#define EIGEN_MATRIX_PRODUCT_ALTIVEC_H
+
+#ifndef EIGEN_ALTIVEC_USE_CUSTOM_PACK
+#define EIGEN_ALTIVEC_USE_CUSTOM_PACK 1
+#endif
+
+#if !defined(EIGEN_ALTIVEC_DISABLE_MMA)
+#define EIGEN_ALTIVEC_DISABLE_MMA 0
+#endif
+
+// Check for MMA builtin support.
+#if !EIGEN_ALTIVEC_DISABLE_MMA && defined(__has_builtin)
+#if __has_builtin(__builtin_mma_assemble_acc)
+#define EIGEN_ALTIVEC_MMA_SUPPORT
+#endif
+#endif
+
+// Check if and how we should actually use MMA if supported.
+#if defined(EIGEN_ALTIVEC_MMA_SUPPORT)
+
+#if !defined(EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH)
+#define EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH 0
+#endif
+
+// Check if we want to enable dynamic dispatch. Not supported by LLVM.
+#if EIGEN_ALTIVEC_ENABLE_MMA_DYNAMIC_DISPATCH && !EIGEN_COMP_LLVM
+#define EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH 1
+// Otherwise, use MMA by default if available.
+#elif defined(__MMA__)
+#define EIGEN_ALTIVEC_MMA_ONLY 1
+#endif
+
+#endif  // EIGEN_ALTIVEC_MMA_SUPPORT
+
+#include "MatrixProductCommon.h"
+
+#if defined(EIGEN_ALTIVEC_MMA_ONLY) || defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
+#include "MatrixProductMMA.h"
+#endif
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/**************************
+ * Constants and typedefs *
+ **************************/
+template <typename Scalar>
+struct quad_traits {
+  typedef typename packet_traits<Scalar>::type vectortype;
+  typedef PacketBlock<vectortype, 4> type;
+  typedef vectortype rhstype;
+  enum { vectorsize = packet_traits<Scalar>::size, size = 4, rows = 4 };
+};
+
+template <>
+struct quad_traits<double> {
+  typedef Packet2d vectortype;
+  typedef PacketBlock<vectortype, 4> type;
+  typedef PacketBlock<Packet2d, 2> rhstype;
+  enum { vectorsize = packet_traits<double>::size, size = 2, rows = 4 };
+};
+
+template <>
+struct quad_traits<bfloat16> {
+  typedef Packet8bf vectortype;
+  typedef PacketBlock<vectortype, 4> type;
+  typedef vectortype rhstype;
+  enum { vectorsize = packet_traits<bfloat16>::size, size = 8, rows = 4 };
+};
+
+// MatrixProduct decomposes real/imaginary vectors into a real vector and an imaginary vector, this turned out
+// to be faster than Eigen's usual approach of having real/imaginary pairs on a single vector. This constants then
+// are responsible to extract from convert between Eigen's and MatrixProduct approach.
+
+const static Packet16uc p16uc_GETREAL32 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
+
+const static Packet16uc p16uc_GETIMAG32 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
+
+const static Packet16uc p16uc_GETREAL32b = {0, 1, 2, 3, 16, 17, 18, 19, 8, 9, 10, 11, 24, 25, 26, 27};
+
+const static Packet16uc p16uc_GETIMAG32b = {4, 5, 6, 7, 20, 21, 22, 23, 12, 13, 14, 15, 28, 29, 30, 31};
+
+/*********************************************
+ * Single precision real and complex packing *
+ * *******************************************/
+
+/**
+ * Symm packing is related to packing of symmetric adjoint blocks, as expected the packing leaves
+ * the diagonal real, whatever is below it is copied from the respective upper diagonal element and
+ * conjugated. There's no PanelMode available for symm packing.
+ *
+ * Packing in general is supposed to leave the lhs block and the rhs block easy to be read by gemm using
+ * its respective rank-update instructions. The float32/64 versions are different because at this moment
+ * the size of the accumulator is fixed at 512-bits so you can't have a 4x4 accumulator of 64-bit elements.
+ *
+ * As mentioned earlier MatrixProduct breaks complex numbers into a real vector and a complex vector so packing has
+ * to take that into account, at the moment, we run pack the real part and then the imaginary part, this is the main
+ * reason why packing for complex is broken down into several different parts, also the reason why we endup having a
+ * float32/64 and complex float32/64 version.
+ **/
+template <typename Scalar, int StorageOrder>
+EIGEN_ALWAYS_INLINE std::complex<Scalar> getAdjointVal(
+    Index i, Index j, const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder>& dt) {
+  std::complex<Scalar> v;
+  if (i < j) {
+    v.real(dt(j, i).real());
+    v.imag(-dt(j, i).imag());
+  } else if (i > j) {
+    v.real(dt(i, j).real());
+    v.imag(dt(i, j).imag());
+  } else {
+    v.real(dt(i, j).real());
+    v.imag((Scalar)0.0);
+  }
+  return v;
+}
+
+template <typename Scalar, int StorageOrder, int N>
+EIGEN_STRONG_INLINE void symm_pack_complex_rhs_helper(std::complex<Scalar>* blockB, const std::complex<Scalar>* _rhs,
+                                                      Index rhsStride, Index rows, Index cols, Index k2) {
+  const Index depth = k2 + rows;
+  const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder> rhs(_rhs, rhsStride);
+  const Index vectorSize = N * quad_traits<Scalar>::vectorsize;
+  const Index vectorDelta = vectorSize * rows;
+  Scalar* blockBf = reinterpret_cast<Scalar*>(blockB);
+
+  Index rir = 0, rii, j = 0;
+  for (; j + vectorSize <= cols; j += vectorSize) {
+    rii = rir + vectorDelta;
+
+    for (Index i = k2; i < depth; i++) {
+      for (Index k = 0; k < vectorSize; k++) {
+        std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(i, j + k, rhs);
+
+        blockBf[rir + k] = v.real();
+        blockBf[rii + k] = v.imag();
+      }
+      rir += vectorSize;
+      rii += vectorSize;
+    }
+
+    rir += vectorDelta;
+  }
+
+  for (; j < cols; j++) {
+    rii = rir + rows;
+
+    for (Index i = k2; i < depth; i++) {
+      std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(i, j, rhs);
+
+      blockBf[rir] = v.real();
+      blockBf[rii] = v.imag();
+
+      rir += 1;
+      rii += 1;
+    }
+
+    rir += rows;
+  }
+}
+
+template <typename Scalar, int StorageOrder>
+EIGEN_STRONG_INLINE void symm_pack_complex_lhs_helper(std::complex<Scalar>* blockA, const std::complex<Scalar>* _lhs,
+                                                      Index lhsStride, Index cols, Index rows) {
+  const Index depth = cols;
+  const_blas_data_mapper<std::complex<Scalar>, Index, StorageOrder> lhs(_lhs, lhsStride);
+  const Index vectorSize = quad_traits<Scalar>::vectorsize;
+  const Index vectorDelta = vectorSize * depth;
+  Scalar* blockAf = reinterpret_cast<Scalar*>(blockA);
+
+  Index rir = 0, rii, j = 0;
+  for (; j + vectorSize <= rows; j += vectorSize) {
+    rii = rir + vectorDelta;
+
+    for (Index i = 0; i < depth; i++) {
+      for (Index k = 0; k < vectorSize; k++) {
+        std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(j + k, i, lhs);
+
+        blockAf[rir + k] = v.real();
+        blockAf[rii + k] = v.imag();
+      }
+      rir += vectorSize;
+      rii += vectorSize;
+    }
+
+    rir += vectorDelta;
+  }
+
+  if (j < rows) {
+    rii = rir + ((rows - j) * depth);
+
+    for (Index i = 0; i < depth; i++) {
+      Index k = j;
+      for (; k < rows; k++) {
+        std::complex<Scalar> v = getAdjointVal<Scalar, StorageOrder>(k, i, lhs);
+
+        blockAf[rir] = v.real();
+        blockAf[rii] = v.imag();
+
+        rir += 1;
+        rii += 1;
+      }
+    }
+  }
+}
+
+template <typename Scalar, int StorageOrder, int N>
+EIGEN_STRONG_INLINE void symm_pack_rhs_helper(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows,
+                                              Index cols, Index k2) {
+  const Index depth = k2 + rows;
+  const_blas_data_mapper<Scalar, Index, StorageOrder> rhs(_rhs, rhsStride);
+  const Index vectorSize = quad_traits<Scalar>::vectorsize;
+
+  Index ri = 0, j = 0;
+  for (; j + N * vectorSize <= cols; j += N * vectorSize) {
+    Index i = k2;
+    for (; i < depth; i++) {
+      for (Index k = 0; k < N * vectorSize; k++) {
+        if (i <= j + k)
+          blockB[ri + k] = rhs(j + k, i);
+        else
+          blockB[ri + k] = rhs(i, j + k);
+      }
+      ri += N * vectorSize;
+    }
+  }
+
+  for (; j < cols; j++) {
+    for (Index i = k2; i < depth; i++) {
+      if (j <= i)
+        blockB[ri] = rhs(i, j);
+      else
+        blockB[ri] = rhs(j, i);
+      ri += 1;
+    }
+  }
+}
+
+template <typename Scalar, int StorageOrder>
+EIGEN_STRONG_INLINE void symm_pack_lhs_helper(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols,
+                                              Index rows) {
+  const Index depth = cols;
+  const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs, lhsStride);
+  const Index vectorSize = quad_traits<Scalar>::vectorsize;
+
+  Index ri = 0, j = 0;
+  for (; j + vectorSize <= rows; j += vectorSize) {
+    Index i = 0;
+
+    for (; i < depth; i++) {
+      for (Index k = 0; k < vectorSize; k++) {
+        if (i <= j + k)
+          blockA[ri + k] = lhs(j + k, i);
+        else
+          blockA[ri + k] = lhs(i, j + k);
+      }
+      ri += vectorSize;
+    }
+  }
+
+  if (j < rows) {
+    for (Index i = 0; i < depth; i++) {
+      Index k = j;
+      for (; k < rows; k++) {
+        if (i <= k)
+          blockA[ri] = lhs(k, i);
+        else
+          blockA[ri] = lhs(i, k);
+        ri += 1;
+      }
+    }
+  }
+}
+
+template <typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<std::complex<float>, Index, nr, StorageOrder> {
+  void operator()(std::complex<float>* blockB, const std::complex<float>* _rhs, Index rhsStride, Index rows, Index cols,
+                  Index k2) {
+    symm_pack_complex_rhs_helper<float, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
+  }
+};
+
+template <typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<std::complex<float>, Index, Pack1, Pack2_dummy, StorageOrder> {
+  void operator()(std::complex<float>* blockA, const std::complex<float>* _lhs, Index lhsStride, Index cols,
+                  Index rows) {
+    symm_pack_complex_lhs_helper<float, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
+  }
+};
+
+// *********** symm_pack std::complex<float64> ***********
+
+template <typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<std::complex<double>, Index, nr, StorageOrder> {
+  void operator()(std::complex<double>* blockB, const std::complex<double>* _rhs, Index rhsStride, Index rows,
+                  Index cols, Index k2) {
+    symm_pack_complex_rhs_helper<double, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
+  }
+};
+
+template <typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<std::complex<double>, Index, Pack1, Pack2_dummy, StorageOrder> {
+  void operator()(std::complex<double>* blockA, const std::complex<double>* _lhs, Index lhsStride, Index cols,
+                  Index rows) {
+    symm_pack_complex_lhs_helper<double, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
+  }
+};
+
+// *********** symm_pack float32 ***********
+template <typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<float, Index, nr, StorageOrder> {
+  void operator()(float* blockB, const float* _rhs, Index rhsStride, Index rows, Index cols, Index k2) {
+    symm_pack_rhs_helper<float, StorageOrder, 1>(blockB, _rhs, rhsStride, rows, cols, k2);
+  }
+};
+
+template <typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<float, Index, Pack1, Pack2_dummy, StorageOrder> {
+  void operator()(float* blockA, const float* _lhs, Index lhsStride, Index cols, Index rows) {
+    symm_pack_lhs_helper<float, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
+  }
+};
+
+// *********** symm_pack float64 ***********
+template <typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs<double, Index, nr, StorageOrder> {
+  void operator()(double* blockB, const double* _rhs, Index rhsStride, Index rows, Index cols, Index k2) {
+    symm_pack_rhs_helper<double, StorageOrder, 2>(blockB, _rhs, rhsStride, rows, cols, k2);
+  }
+};
+
+template <typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs<double, Index, Pack1, Pack2_dummy, StorageOrder> {
+  void operator()(double* blockA, const double* _lhs, Index lhsStride, Index cols, Index rows) {
+    symm_pack_lhs_helper<double, StorageOrder>(blockA, _lhs, lhsStride, cols, rows);
+  }
+};
+
+/**
+ * PanelMode
+ * Packing might be called several times before being multiplied by gebp_kernel, this happens because
+ * on special occasions it fills part of block with other parts of the matrix. Two variables control
+ * how PanelMode should behave: offset and stride. The idea is that those variables represent whatever
+ * is going to be the real offset and stride in the future and this is what you should obey. The process
+ * is to behave as you would with normal packing but leave the start of each part with the correct offset
+ * and the end as well respecting the real stride the block will have. Gebp is aware of both blocks stride
+ * and offset and behaves accordingly.
+ **/
+
+template <typename Scalar, typename Packet, int N>
+EIGEN_ALWAYS_INLINE void storeBlock(Scalar* to, PacketBlock<Packet, N>& block) {
+  const Index size = 16 / sizeof(Scalar);
+  pstore<Scalar>(to + (0 * size), block.packet[0]);
+  pstore<Scalar>(to + (1 * size), block.packet[1]);
+  if (N > 2) {
+    pstore<Scalar>(to + (2 * size), block.packet[2]);
+  }
+  if (N > 3) {
+    pstore<Scalar>(to + (3 * size), block.packet[3]);
+  }
+}
+
+// General template for lhs & rhs complex packing.
+template <typename Scalar, typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate,
+          bool PanelMode, bool UseLhs>
+struct dhs_cpack {
+  template <bool transpose>
+  EIGEN_ALWAYS_INLINE void dhs_cblock(PacketBlock<PacketC, 8>& cblock, PacketBlock<Packet, 4>& block,
+                                      Packet16uc permute) {
+    if (transpose) {
+      block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, permute);
+      block.packet[1] = vec_perm(cblock.packet[2].v, cblock.packet[3].v, permute);
+      block.packet[2] = vec_perm(cblock.packet[4].v, cblock.packet[5].v, permute);
+      block.packet[3] = vec_perm(cblock.packet[6].v, cblock.packet[7].v, permute);
+
+      Packet4f t0, t1, t2, t3;
+#ifdef EIGEN_VECTORIZE_VSX
+      t0 = reinterpret_cast<Packet>(
+          vec_mergeh(reinterpret_cast<Packet2ul>(block.packet[0]), reinterpret_cast<Packet2ul>(block.packet[1])));
+      t1 = reinterpret_cast<Packet>(
+          vec_mergel(reinterpret_cast<Packet2ul>(block.packet[0]), reinterpret_cast<Packet2ul>(block.packet[1])));
+      t2 = reinterpret_cast<Packet>(
+          vec_mergeh(reinterpret_cast<Packet2ul>(block.packet[2]), reinterpret_cast<Packet2ul>(block.packet[3])));
+      t3 = reinterpret_cast<Packet>(
+          vec_mergel(reinterpret_cast<Packet2ul>(block.packet[2]), reinterpret_cast<Packet2ul>(block.packet[3])));
+#else
+      t0 = reinterpret_cast<Packet>(vec_perm(block.packet[0], block.packet[1], p16uc_TRANSPOSE64_HI));
+      t1 = reinterpret_cast<Packet>(vec_perm(block.packet[0], block.packet[1], p16uc_TRANSPOSE64_LO));
+      t2 = reinterpret_cast<Packet>(vec_perm(block.packet[2], block.packet[3], p16uc_TRANSPOSE64_HI));
+      t3 = reinterpret_cast<Packet>(vec_perm(block.packet[2], block.packet[3], p16uc_TRANSPOSE64_LO));
+#endif
+
+      block.packet[0] = t0;
+      block.packet[1] = t1;
+      block.packet[2] = t2;
+      block.packet[3] = t3;
+    } else {
+      block.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[4].v, permute);
+      block.packet[1] = vec_perm(cblock.packet[1].v, cblock.packet[5].v, permute);
+      block.packet[2] = vec_perm(cblock.packet[2].v, cblock.packet[6].v, permute);
+      block.packet[3] = vec_perm(cblock.packet[3].v, cblock.packet[7].v, permute);
+    }
+  }
+
+  EIGEN_ALWAYS_INLINE void dhs_ccopy(Scalar* blockAt, const DataMapper& lhs2, Index& i, Index& rir, Index& rii,
+                                     Index depth, const Index vectorSize) {
+    PacketBlock<Packet, 4> blockr, blocki;
+    PacketBlock<PacketC, 8> cblock;
+
+    for (; i + vectorSize <= depth; i += vectorSize) {
+      if (UseLhs) {
+        bload<DataMapper, PacketC, 2, StorageOrder, true, 4>(cblock, lhs2, 0, i);
+      } else {
+        bload<DataMapper, PacketC, 2, StorageOrder, true, 4>(cblock, lhs2, i, 0);
+      }
+
+      if (((StorageOrder == RowMajor) && UseLhs) || (((StorageOrder == ColMajor) && !UseLhs))) {
+        dhs_cblock<true>(cblock, blockr, p16uc_GETREAL32b);
+        dhs_cblock<true>(cblock, blocki, p16uc_GETIMAG32b);
+      } else {
+        dhs_cblock<false>(cblock, blockr, p16uc_GETREAL32);
+        dhs_cblock<false>(cblock, blocki, p16uc_GETIMAG32);
+      }
+
+      if (Conjugate) {
+        blocki.packet[0] = -blocki.packet[0];
+        blocki.packet[1] = -blocki.packet[1];
+        blocki.packet[2] = -blocki.packet[2];
+        blocki.packet[3] = -blocki.packet[3];
+      }
+
+      storeBlock<Scalar, Packet, 4>(blockAt + rir, blockr);
+      storeBlock<Scalar, Packet, 4>(blockAt + rii, blocki);
+
+      rir += 4 * vectorSize;
+      rii += 4 * vectorSize;
+    }
+  }
+
+  EIGEN_STRONG_INLINE void operator()(std::complex<Scalar>* blockA, const DataMapper& lhs, Index depth, Index rows,
+                                      Index stride, Index offset) {
+    const Index vectorSize = quad_traits<Scalar>::vectorsize;
+    const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
+    Index rir = ((PanelMode) ? (vectorSize * offset) : 0), rii;
+    Scalar* blockAt = reinterpret_cast<Scalar*>(blockA);
+    Index j = 0;
+
+    for (; j + vectorSize <= rows; j += vectorSize) {
+      const DataMapper lhs2 = UseLhs ? lhs.getSubMapper(j, 0) : lhs.getSubMapper(0, j);
+      Index i = 0;
+
+      rii = rir + vectorDelta;
+
+      dhs_ccopy(blockAt, lhs2, i, rir, rii, depth, vectorSize);
+
+      for (; i < depth; i++) {
+        PacketBlock<Packet, 1> blockr, blocki;
+        PacketBlock<PacketC, 2> cblock;
+
+        if (((StorageOrder == ColMajor) && UseLhs) || (((StorageOrder == RowMajor) && !UseLhs))) {
+          if (UseLhs) {
+            cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i);
+            cblock.packet[1] = lhs2.template loadPacket<PacketC>(2, i);
+          } else {
+            cblock.packet[0] = lhs2.template loadPacket<PacketC>(i, 0);
+            cblock.packet[1] = lhs2.template loadPacket<PacketC>(i, 2);
+          }
+        } else {
+          if (UseLhs) {
+            cblock.packet[0] = pload2(lhs2(0, i), lhs2(1, i));
+            cblock.packet[1] = pload2(lhs2(2, i), lhs2(3, i));
+          } else {
+            cblock.packet[0] = pload2(lhs2(i, 0), lhs2(i, 1));
+            cblock.packet[1] = pload2(lhs2(i, 2), lhs2(i, 3));
+          }
+        }
+
+        blockr.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETREAL32);
+        blocki.packet[0] = vec_perm(cblock.packet[0].v, cblock.packet[1].v, p16uc_GETIMAG32);
+
+        if (Conjugate) {
+          blocki.packet[0] = -blocki.packet[0];
+        }
+
+        pstore<Scalar>(blockAt + rir, blockr.packet[0]);
+        pstore<Scalar>(blockAt + rii, blocki.packet[0]);
+
+        rir += vectorSize;
+        rii += vectorSize;
+      }
+
+      rir += ((PanelMode) ? (vectorSize * (2 * stride - depth)) : vectorDelta);
+    }
+
+    if (!UseLhs) {
+      if (PanelMode) rir -= (offset * (vectorSize - 1));
+
+      for (; j < rows; j++) {
+        const DataMapper lhs2 = lhs.getSubMapper(0, j);
+        rii = rir + ((PanelMode) ? stride : depth);
+
+        for (Index i = 0; i < depth; i++) {
+          blockAt[rir] = lhs2(i, 0).real();
+
+          if (Conjugate)
+            blockAt[rii] = -lhs2(i, 0).imag();
+          else
+            blockAt[rii] = lhs2(i, 0).imag();
+
+          rir += 1;
+          rii += 1;
+        }
+
+        rir += ((PanelMode) ? (2 * stride - depth) : depth);
+      }
+    } else {
+      if (j < rows) {
+        if (PanelMode) rir += (offset * (rows - j - vectorSize));
+        rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
+
+        for (Index i = 0; i < depth; i++) {
+          Index k = j;
+          for (; k < rows; k++) {
+            blockAt[rir] = lhs(k, i).real();
+
+            if (Conjugate)
+              blockAt[rii] = -lhs(k, i).imag();
+            else
+              blockAt[rii] = lhs(k, i).imag();
+
+            rir += 1;
+            rii += 1;
+          }
+        }
+      }
+    }
+  }
+};
+
+// General template for lhs & rhs packing.
+template <typename Scalar, typename DataMapper, typename Packet, int StorageOrder, bool PanelMode, bool UseLhs>
+struct dhs_pack {
+  template <Index n>
+  EIGEN_ALWAYS_INLINE void dhs_copy(Scalar* blockA, const DataMapper& lhs2, Index& i, Index& ri, Index depth,
+                                    const Index vectorSize) {
+    PacketBlock<Packet, 4> block[n];
+
+    for (; i + n * vectorSize <= depth; i += n * vectorSize) {
+      for (Index k = 0; k < n; k++) {
+        if (UseLhs) {
+          bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block[k], lhs2, 0, i + k * vectorSize);
+        } else {
+          bload<DataMapper, Packet, 4, StorageOrder, false, 4>(block[k], lhs2, i + k * vectorSize, 0);
+        }
+      }
+
+      if (((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs)) {
+        for (Index k = 0; k < n; k++) {
+          ptranspose(block[k]);
+        }
+      }
+
+      for (Index k = 0; k < n; k++) {
+        storeBlock<Scalar, Packet, 4>(blockA + ri + k * 4 * vectorSize, block[k]);
+      }
+
+      ri += n * 4 * vectorSize;
+    }
+  }
+
+  EIGEN_STRONG_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride,
+                                      Index offset) {
+    const Index vectorSize = quad_traits<Scalar>::vectorsize;
+    Index ri = 0, j = 0;
+
+    for (; j + vectorSize <= rows; j += vectorSize) {
+      const DataMapper lhs2 = UseLhs ? lhs.getSubMapper(j, 0) : lhs.getSubMapper(0, j);
+      Index i = 0;
+
+      if (PanelMode) ri += vectorSize * offset;
+
+      dhs_copy<4>(blockA, lhs2, i, ri, depth, vectorSize);
+      dhs_copy<2>(blockA, lhs2, i, ri, depth, vectorSize);
+      dhs_copy<1>(blockA, lhs2, i, ri, depth, vectorSize);
+
+      for (; i < depth; i++) {
+        if (((StorageOrder == RowMajor) && UseLhs) || ((StorageOrder == ColMajor) && !UseLhs)) {
+          if (UseLhs) {
+            blockA[ri + 0] = lhs2(0, i);
+            blockA[ri + 1] = lhs2(1, i);
+            blockA[ri + 2] = lhs2(2, i);
+            blockA[ri + 3] = lhs2(3, i);
+          } else {
+            blockA[ri + 0] = lhs2(i, 0);
+            blockA[ri + 1] = lhs2(i, 1);
+            blockA[ri + 2] = lhs2(i, 2);
+            blockA[ri + 3] = lhs2(i, 3);
+          }
+        } else {
+          Packet lhsV;
+          if (UseLhs) {
+            lhsV = lhs2.template loadPacket<Packet>(0, i);
+          } else {
+            lhsV = lhs2.template loadPacket<Packet>(i, 0);
+          }
+          pstore<Scalar>(blockA + ri, lhsV);
+        }
+
+        ri += vectorSize;
+      }
+
+      if (PanelMode) ri += vectorSize * (stride - offset - depth);
+    }
+
+    if (!UseLhs) {
+      if (PanelMode) ri += offset;
+
+      for (; j < rows; j++) {
+        const DataMapper lhs2 = lhs.getSubMapper(0, j);
+        for (Index i = 0; i < depth; i++) {
+          blockA[ri] = lhs2(i, 0);
+          ri += 1;
+        }
+
+        if (PanelMode) ri += stride - depth;
+      }
+    } else {
+      if (j < rows) {
+        if (PanelMode) ri += offset * (rows - j);
+
+        for (Index i = 0; i < depth; i++) {
+          Index k = j;
+          for (; k < rows; k++) {
+            blockA[ri] = lhs(k, i);
+            ri += 1;
+          }
+        }
+      }
+    }
+  }
+};
+
+// General template for lhs packing, float64 specialization.
+template <typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<double, DataMapper, Packet2d, StorageOrder, PanelMode, true> {
+  template <Index n>
+  EIGEN_ALWAYS_INLINE void dhs_copy(double* blockA, const DataMapper& lhs2, Index& i, Index& ri, Index depth,
+                                    const Index vectorSize) {
+    PacketBlock<Packet2d, 2> block[n];
+
+    for (; i + n * vectorSize <= depth; i += n * vectorSize) {
+      for (Index k = 0; k < n; k++) {
+        if (StorageOrder == RowMajor) {
+          block[k].packet[0] = lhs2.template loadPacket<Packet2d>(0, i + k * vectorSize);
+          block[k].packet[1] = lhs2.template loadPacket<Packet2d>(1, i + k * vectorSize);
+        } else {
+          block[k].packet[0] = lhs2.template loadPacket<Packet2d>(0, i + k * vectorSize + 0);
+          block[k].packet[1] = lhs2.template loadPacket<Packet2d>(0, i + k * vectorSize + 1);
+        }
+      }
+
+      if (StorageOrder == RowMajor) {
+        for (Index k = 0; k < n; k++) {
+          ptranspose(block[k]);
+        }
+      }
+
+      for (Index k = 0; k < n; k++) {
+        storeBlock<double, Packet2d, 2>(blockA + ri + k * 2 * vectorSize, block[k]);
+      }
+
+      ri += n * 2 * vectorSize;
+    }
+  }
+
+  EIGEN_STRONG_INLINE void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride,
+                                      Index offset) {
+    const Index vectorSize = quad_traits<double>::vectorsize;
+    Index ri = 0, j = 0;
+
+    for (; j + vectorSize <= rows; j += vectorSize) {
+      const DataMapper lhs2 = lhs.getSubMapper(j, 0);
+      Index i = 0;
+
+      if (PanelMode) ri += vectorSize * offset;
+
+      dhs_copy<4>(blockA, lhs2, i, ri, depth, vectorSize);
+      dhs_copy<2>(blockA, lhs2, i, ri, depth, vectorSize);
+      dhs_copy<1>(blockA, lhs2, i, ri, depth, vectorSize);
+
+      for (; i < depth; i++) {
+        if (StorageOrder == RowMajor) {
+          blockA[ri + 0] = lhs2(0, i);
+          blockA[ri + 1] = lhs2(1, i);
+        } else {
+          Packet2d lhsV = lhs2.template loadPacket<Packet2d>(0, i);
+          pstore<double>(blockA + ri, lhsV);
+        }
+
+        ri += vectorSize;
+      }
+
+      if (PanelMode) ri += vectorSize * (stride - offset - depth);
+    }
+
+    if (j < rows) {
+      if (PanelMode) ri += offset * (rows - j);
+
+      for (Index i = 0; i < depth; i++) {
+        Index k = j;
+        for (; k < rows; k++) {
+          blockA[ri] = lhs(k, i);
+          ri += 1;
+        }
+      }
+    }
+  }
+};
+
+// General template for rhs packing, float64 specialization.
+template <typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<double, DataMapper, Packet2d, StorageOrder, PanelMode, false> {
+  template <Index n>
+  EIGEN_ALWAYS_INLINE void dhs_copy(double* blockB, const DataMapper& rhs2, Index& i, Index& ri, Index depth,
+                                    const Index vectorSize) {
+    PacketBlock<Packet2d, 2> block1[n], block2[n];
+    PacketBlock<Packet2d, 4> block3[n];
+
+    for (; i + n * vectorSize <= depth; i += n * vectorSize) {
+      for (Index k = 0; k < n; k++) {
+        if (StorageOrder == ColMajor) {
+          block1[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize, 0);
+          block1[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize, 1);
+          block2[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize, 2);
+          block2[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize, 3);
+        } else {
+          block3[k].packet[0] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize + 0, 0);  //[a1 a2]
+          block3[k].packet[1] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize + 0, 2);  //[a3 a4]
+          block3[k].packet[2] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize + 1, 0);  //[b1 b2]
+          block3[k].packet[3] = rhs2.template loadPacket<Packet2d>(i + k * vectorSize + 1, 2);  //[b3 b4]
+        }
+      }
+
+      if (StorageOrder == ColMajor) {
+        for (Index k = 0; k < n; k++) {
+          ptranspose(block1[k]);
+          ptranspose(block2[k]);
+        }
+      }
+
+      for (Index k = 0; k < n; k++) {
+        if (StorageOrder == ColMajor) {
+          pstore<double>(blockB + ri + k * 4 * vectorSize, block1[k].packet[0]);
+          pstore<double>(blockB + ri + k * 4 * vectorSize + 2, block2[k].packet[0]);
+          pstore<double>(blockB + ri + k * 4 * vectorSize + 4, block1[k].packet[1]);
+          pstore<double>(blockB + ri + k * 4 * vectorSize + 6, block2[k].packet[1]);
+        } else {
+          storeBlock<double, Packet2d, 4>(blockB + ri + k * 4 * vectorSize, block3[k]);
+        }
+      }
+
+      ri += n * 4 * vectorSize;
+    }
+  }
+
+  EIGEN_STRONG_INLINE void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride,
+                                      Index offset) {
+    const Index vectorSize = quad_traits<double>::vectorsize;
+    Index ri = 0, j = 0;
+
+    for (; j + 2 * vectorSize <= cols; j += 2 * vectorSize) {
+      const DataMapper rhs2 = rhs.getSubMapper(0, j);
+      Index i = 0;
+
+      if (PanelMode) ri += offset * (2 * vectorSize);
+
+      dhs_copy<4>(blockB, rhs2, i, ri, depth, vectorSize);
+      dhs_copy<2>(blockB, rhs2, i, ri, depth, vectorSize);
+      dhs_copy<1>(blockB, rhs2, i, ri, depth, vectorSize);
+
+      for (; i < depth; i++) {
+        if (StorageOrder == ColMajor) {
+          blockB[ri + 0] = rhs2(i, 0);
+          blockB[ri + 1] = rhs2(i, 1);
+
+          ri += vectorSize;
+
+          blockB[ri + 0] = rhs2(i, 2);
+          blockB[ri + 1] = rhs2(i, 3);
+        } else {
+          Packet2d rhsV = rhs2.template loadPacket<Packet2d>(i, 0);
+          pstore<double>(blockB + ri, rhsV);
+
+          ri += vectorSize;
+
+          rhsV = rhs2.template loadPacket<Packet2d>(i, 2);
+          pstore<double>(blockB + ri, rhsV);
+        }
+        ri += vectorSize;
+      }
+
+      if (PanelMode) ri += (2 * vectorSize) * (stride - offset - depth);
+    }
+
+    if (PanelMode) ri += offset;
+
+    for (; j < cols; j++) {
+      const DataMapper rhs2 = rhs.getSubMapper(0, j);
+      for (Index i = 0; i < depth; i++) {
+        blockB[ri] = rhs2(i, 0);
+        ri += 1;
+      }
+
+      if (PanelMode) ri += stride - depth;
+    }
+  }
+};
+
+// General template for lhs packing, bfloat16 specialization.
+template <typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<bfloat16, DataMapper, Packet8bf, StorageOrder, PanelMode, true> {
+  EIGEN_STRONG_INLINE void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride,
+                                      Index offset) {
+    const Index vectorSize = quad_traits<bfloat16>::vectorsize;
+    Index ri = 0, j = 0;
+
+    for (; j + 2 * vectorSize <= rows; j += 2 * vectorSize) {
+      const DataMapper lhs2 = lhs.getSubMapper(j, 0);
+      Index i = 0;
+
+      if (PanelMode) ri += 2 * vectorSize * offset;
+
+      if (StorageOrder == ColMajor) {
+        for (; i + 2 <= depth; i += 2) {
+          PacketBlock<Packet8bf, 4> block;
+
+          block.packet[0] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
+          block.packet[1] = lhs2.template loadPacket<Packet8bf>(1 * vectorSize, i + 0);
+          block.packet[2] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 1);
+          block.packet[3] = lhs2.template loadPacket<Packet8bf>(1 * vectorSize, i + 1);
+
+          Packet8bf t0, t1;
+          t0 = vec_mergeh(block.packet[0].m_val, block.packet[2].m_val);
+          t1 = vec_mergel(block.packet[0].m_val, block.packet[2].m_val);
+          block.packet[2] = vec_mergeh(block.packet[1].m_val, block.packet[3].m_val);
+          block.packet[3] = vec_mergel(block.packet[1].m_val, block.packet[3].m_val);
+          block.packet[0] = t0;
+          block.packet[1] = t1;
+
+          storeBlock<bfloat16, Packet8bf, 4>(blockA + ri, block);
+
+          ri += 2 * 2 * vectorSize;
+        }
+        if (depth & 1) {
+          PacketBlock<Packet8bf, 2> block;
+
+          block.packet[0] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
+          block.packet[1] = lhs2.template loadPacket<Packet8bf>(1 * vectorSize, i + 0);
+
+          storeBlock<bfloat16, Packet8bf, 2>(blockA + ri, block);
+
+          ri += 2 * vectorSize;
+        }
+      } else {
+        for (; i + vectorSize <= depth; i += vectorSize) {
+          PacketBlock<Packet8bf, 8> block1, block2;
+
+          bload<DataMapper, Packet8bf, 8, StorageOrder, false, 8>(block1, lhs2, 0 * vectorSize, i);
+          bload<DataMapper, Packet8bf, 8, StorageOrder, false, 8>(block2, lhs2, 1 * vectorSize, i);
+
+          Packet4ui v1[8], v2[8];
+
+          v1[0] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[0].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
+          v1[1] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[0].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
+          v1[2] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[2].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
+          v1[3] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[2].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
+          v1[4] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[4].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
+          v1[5] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[4].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
+          v1[6] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[6].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
+          v1[7] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[6].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
+          v2[0] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[0].m_val),
+                             reinterpret_cast<Packet4ui>(block2.packet[1].m_val));
+          v2[1] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[0].m_val),
+                             reinterpret_cast<Packet4ui>(block2.packet[1].m_val));
+          v2[2] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[2].m_val),
+                             reinterpret_cast<Packet4ui>(block2.packet[3].m_val));
+          v2[3] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[2].m_val),
+                             reinterpret_cast<Packet4ui>(block2.packet[3].m_val));
+          v2[4] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[4].m_val),
+                             reinterpret_cast<Packet4ui>(block2.packet[5].m_val));
+          v2[5] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[4].m_val),
+                             reinterpret_cast<Packet4ui>(block2.packet[5].m_val));
+          v2[6] = vec_mergeh(reinterpret_cast<Packet4ui>(block2.packet[6].m_val),
+                             reinterpret_cast<Packet4ui>(block2.packet[7].m_val));
+          v2[7] = vec_mergel(reinterpret_cast<Packet4ui>(block2.packet[6].m_val),
+                             reinterpret_cast<Packet4ui>(block2.packet[7].m_val));
+
+#ifdef EIGEN_VECTORIZE_VSX
+          block1.packet[0] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v1[0]), reinterpret_cast<Packet2ul>(v1[2])));
+          block1.packet[2] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v1[0]), reinterpret_cast<Packet2ul>(v1[2])));
+          block1.packet[4] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v1[1]), reinterpret_cast<Packet2ul>(v1[3])));
+          block1.packet[6] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v1[1]), reinterpret_cast<Packet2ul>(v1[3])));
+          block1.packet[1] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v1[4]), reinterpret_cast<Packet2ul>(v1[6])));
+          block1.packet[3] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v1[4]), reinterpret_cast<Packet2ul>(v1[6])));
+          block1.packet[5] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v1[5]), reinterpret_cast<Packet2ul>(v1[7])));
+          block1.packet[7] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v1[5]), reinterpret_cast<Packet2ul>(v1[7])));
+          block2.packet[0] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v2[0]), reinterpret_cast<Packet2ul>(v2[2])));
+          block2.packet[2] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v2[0]), reinterpret_cast<Packet2ul>(v2[2])));
+          block2.packet[4] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v2[1]), reinterpret_cast<Packet2ul>(v2[3])));
+          block2.packet[6] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v2[1]), reinterpret_cast<Packet2ul>(v2[3])));
+          block2.packet[1] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v2[4]), reinterpret_cast<Packet2ul>(v2[6])));
+          block2.packet[3] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v2[4]), reinterpret_cast<Packet2ul>(v2[6])));
+          block2.packet[5] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v2[5]), reinterpret_cast<Packet2ul>(v2[7])));
+          block2.packet[7] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v2[5]), reinterpret_cast<Packet2ul>(v2[7])));
+#else
+          block1.packet[0] = reinterpret_cast<Packet8us>(vec_perm(v1[0], v1[2], p16uc_TRANSPOSE64_HI));
+          block1.packet[2] = reinterpret_cast<Packet8us>(vec_perm(v1[0], v1[2], p16uc_TRANSPOSE64_LO));
+          block1.packet[4] = reinterpret_cast<Packet8us>(vec_perm(v1[1], v1[3], p16uc_TRANSPOSE64_HI));
+          block1.packet[6] = reinterpret_cast<Packet8us>(vec_perm(v1[1], v1[3], p16uc_TRANSPOSE64_LO));
+          block1.packet[1] = reinterpret_cast<Packet8us>(vec_perm(v1[4], v1[6], p16uc_TRANSPOSE64_HI));
+          block1.packet[3] = reinterpret_cast<Packet8us>(vec_perm(v1[4], v1[6], p16uc_TRANSPOSE64_LO));
+          block1.packet[5] = reinterpret_cast<Packet8us>(vec_perm(v1[5], v1[7], p16uc_TRANSPOSE64_HI));
+          block1.packet[7] = reinterpret_cast<Packet8us>(vec_perm(v1[5], v1[7], p16uc_TRANSPOSE64_LO));
+          block2.packet[0] = reinterpret_cast<Packet8us>(vec_perm(v2[0], v2[2], p16uc_TRANSPOSE64_HI));
+          block2.packet[2] = reinterpret_cast<Packet8us>(vec_perm(v2[0], v2[2], p16uc_TRANSPOSE64_LO));
+          block2.packet[4] = reinterpret_cast<Packet8us>(vec_perm(v2[1], v2[3], p16uc_TRANSPOSE64_HI));
+          block2.packet[6] = reinterpret_cast<Packet8us>(vec_perm(v2[1], v2[3], p16uc_TRANSPOSE64_LO));
+          block2.packet[1] = reinterpret_cast<Packet8us>(vec_perm(v2[4], v2[6], p16uc_TRANSPOSE64_HI));
+          block2.packet[3] = reinterpret_cast<Packet8us>(vec_perm(v2[4], v2[6], p16uc_TRANSPOSE64_LO));
+          block2.packet[5] = reinterpret_cast<Packet8us>(vec_perm(v2[5], v2[7], p16uc_TRANSPOSE64_HI));
+          block2.packet[7] = reinterpret_cast<Packet8us>(vec_perm(v2[5], v2[7], p16uc_TRANSPOSE64_LO));
+#endif
+
+          for (Index M = 0; M < 8; M += 2) {
+            pstore<bfloat16>(blockA + ri + (0 * vectorSize) + (2 * vectorSize * M), block1.packet[M + 0]);
+            pstore<bfloat16>(blockA + ri + (1 * vectorSize) + (2 * vectorSize * M), block1.packet[M + 1]);
+            pstore<bfloat16>(blockA + ri + (2 * vectorSize) + (2 * vectorSize * M), block2.packet[M + 0]);
+            pstore<bfloat16>(blockA + ri + (3 * vectorSize) + (2 * vectorSize * M), block2.packet[M + 1]);
+          }
+
+          ri += 2 * vectorSize * vectorSize;
+        }
+        for (; i + 2 <= depth; i += 2) {
+          for (Index M = 0; M < 2 * vectorSize; M++) {
+            blockA[ri + (M * 2) + 0] = lhs2(M, i + 0);
+            blockA[ri + (M * 2) + 1] = lhs2(M, i + 1);
+          }
+
+          ri += 2 * 2 * vectorSize;
+        }
+        if (depth & 1) {
+          for (Index M = 0; M < 2 * vectorSize; M++) {
+            blockA[ri + M] = lhs2(M, i);
+          }
+          ri += 2 * vectorSize;
+        }
+      }
+
+      if (PanelMode) ri += 2 * vectorSize * (stride - offset - depth);
+    }
+    for (; j + vectorSize <= rows; j += vectorSize) {
+      const DataMapper lhs2 = lhs.getSubMapper(j, 0);
+      Index i = 0;
+
+      if (PanelMode) ri += vectorSize * offset;
+
+      if (StorageOrder == ColMajor) {
+        for (; i + 2 <= depth; i += 2) {
+          PacketBlock<Packet8bf, 2> block;
+
+          block.packet[0] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
+          block.packet[1] = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 1);
+
+          Packet8bf t0;
+          t0 = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
+          block.packet[1] = vec_mergel(block.packet[0].m_val, block.packet[1].m_val);
+          block.packet[0] = t0;
+
+          storeBlock<bfloat16, Packet8bf, 2>(blockA + ri, block);
+
+          ri += 2 * vectorSize;
+        }
+        if (depth & 1) {
+          Packet8bf lhsV = lhs2.template loadPacket<Packet8bf>(0 * vectorSize, i + 0);
+          pstore<bfloat16>(blockA + ri, lhsV);
+
+          ri += vectorSize;
+        }
+      } else {
+        for (; i + vectorSize <= depth; i += vectorSize) {
+          PacketBlock<Packet8bf, 8> block1;
+
+          bload<DataMapper, Packet8bf, 8, StorageOrder, false, 8>(block1, lhs2, 0 * vectorSize, i);
+
+          Packet4ui v1[8];
+
+          // This is transposing and interleaving data
+          v1[0] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[0].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
+          v1[1] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[0].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[1].m_val));
+          v1[2] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[2].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
+          v1[3] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[2].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[3].m_val));
+          v1[4] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[4].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
+          v1[5] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[4].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[5].m_val));
+          v1[6] = vec_mergeh(reinterpret_cast<Packet4ui>(block1.packet[6].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
+          v1[7] = vec_mergel(reinterpret_cast<Packet4ui>(block1.packet[6].m_val),
+                             reinterpret_cast<Packet4ui>(block1.packet[7].m_val));
+
+#ifdef EIGEN_VECTORIZE_VSX
+          block1.packet[0] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v1[0]), reinterpret_cast<Packet2ul>(v1[2])));
+          block1.packet[2] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v1[0]), reinterpret_cast<Packet2ul>(v1[2])));
+          block1.packet[4] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v1[1]), reinterpret_cast<Packet2ul>(v1[3])));
+          block1.packet[6] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v1[1]), reinterpret_cast<Packet2ul>(v1[3])));
+          block1.packet[1] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v1[4]), reinterpret_cast<Packet2ul>(v1[6])));
+          block1.packet[3] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v1[4]), reinterpret_cast<Packet2ul>(v1[6])));
+          block1.packet[5] = reinterpret_cast<Packet8us>(
+              vec_mergeh(reinterpret_cast<Packet2ul>(v1[5]), reinterpret_cast<Packet2ul>(v1[7])));
+          block1.packet[7] = reinterpret_cast<Packet8us>(
+              vec_mergel(reinterpret_cast<Packet2ul>(v1[5]), reinterpret_cast<Packet2ul>(v1[7])));
+#else
+          block1.packet[0] = reinterpret_cast<Packet8us>(vec_perm(v1[0], v1[2], p16uc_TRANSPOSE64_HI));
+          block1.packet[2] = reinterpret_cast<Packet8us>(vec_perm(v1[0], v1[2], p16uc_TRANSPOSE64_LO));
+          block1.packet[4] = reinterpret_cast<Packet8us>(vec_perm(v1[1], v1[3], p16uc_TRANSPOSE64_HI));
+          block1.packet[6] = reinterpret_cast<Packet8us>(vec_perm(v1[1], v1[3], p16uc_TRANSPOSE64_LO));
+          block1.packet[1] = reinterpret_cast<Packet8us>(vec_perm(v1[4], v1[6], p16uc_TRANSPOSE64_HI));
+          block1.packet[3] = reinterpret_cast<Packet8us>(vec_perm(v1[4], v1[6], p16uc_TRANSPOSE64_LO));
+          block1.packet[5] = reinterpret_cast<Packet8us>(vec_perm(v1[5], v1[7], p16uc_TRANSPOSE64_HI));
+          block1.packet[7] = reinterpret_cast<Packet8us>(vec_perm(v1[5], v1[7], p16uc_TRANSPOSE64_LO));
+#endif
+
+          for (Index M = 0; M < 8; M++) {
+            pstore<bfloat16>(blockA + ri + (vectorSize * M), block1.packet[M]);
+          }
+
+          ri += vectorSize * vectorSize;
+        }
+        for (; i + 2 <= depth; i += 2) {
+          for (Index M = 0; M < vectorSize; M++) {
+            blockA[ri + (M * 2) + 0] = lhs2(M, i + 0);
+            blockA[ri + (M * 2) + 1] = lhs2(M, i + 1);
+          }
+
+          ri += 2 * vectorSize;
+        }
+        if (depth & 1) {
+          for (Index M = 0; M < vectorSize; M++) {
+            blockA[ri + M] = lhs2(M, i);
+          }
+
+          ri += vectorSize;
+        }
+      }
+
+      if (PanelMode) ri += vectorSize * (stride - offset - depth);
+    }
+    if (j + 4 <= rows) {
+      const DataMapper lhs2 = lhs.getSubMapper(j, 0);
+      Index i = 0;
+
+      if (PanelMode) ri += 4 * offset;
+
+      for (; i + 2 <= depth; i += 2) {
+        if (StorageOrder == ColMajor) {
+          PacketBlock<Packet8bf, 2> block;
+
+          block.packet[0] = lhs2.template loadPacketPartial<Packet8bf>(0, i + 0, 4);
+          block.packet[1] = lhs2.template loadPacketPartial<Packet8bf>(0, i + 1, 4);
+
+          block.packet[0] = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
+
+          pstore<bfloat16>(blockA + ri, block.packet[0]);
+        } else {
+          blockA[ri + 0] = lhs2(0, i + 0);
+          blockA[ri + 1] = lhs2(0, i + 1);
+          blockA[ri + 2] = lhs2(1, i + 0);
+          blockA[ri + 3] = lhs2(1, i + 1);
+          blockA[ri + 4] = lhs2(2, i + 0);
+          blockA[ri + 5] = lhs2(2, i + 1);
+          blockA[ri + 6] = lhs2(3, i + 0);
+          blockA[ri + 7] = lhs2(3, i + 1);
+        }
+
+        ri += 2 * 4;
+      }
+      if (depth & 1) {
+        if (StorageOrder == ColMajor) {
+          Packet8bf lhsV = lhs2.template loadPacketPartial<Packet8bf>(0, i + 0, 4);
+
+          pstore_partial<bfloat16>(blockA + ri, lhsV, 4);
+        } else {
+          blockA[ri + 0] = lhs2(0, i);
+          blockA[ri + 1] = lhs2(1, i);
+          blockA[ri + 2] = lhs2(2, i);
+          blockA[ri + 3] = lhs2(3, i);
+        }
+
+        ri += 4;
+      }
+
+      if (PanelMode) ri += 4 * (stride - offset - depth);
+      j += 4;
+    }
+
+    if (j < rows) {
+      if (PanelMode) ri += offset * (rows - j);
+
+      Index i = 0;
+      for (; i + 2 <= depth; i += 2) {
+        Index k = j;
+        for (; k < rows; k++) {
+          blockA[ri + 0] = lhs(k, i + 0);
+          blockA[ri + 1] = lhs(k, i + 1);
+          ri += 2;
+        }
+      }
+      if (depth & 1) {
+        for (; j < rows; j++) {
+          blockA[ri] = lhs(j, i);
+          ri += 1;
+        }
+      }
+    }
+  }
+};
+
+// General template for rhs packing, bfloat16 specialization.
+template <typename DataMapper, int StorageOrder, bool PanelMode>
+struct dhs_pack<bfloat16, DataMapper, Packet8bf, StorageOrder, PanelMode, false> {
+  EIGEN_STRONG_INLINE void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride,
+                                      Index offset) {
+    const Index vectorSize = quad_traits<bfloat16>::vectorsize;
+    Index ri = 0, j = 0;
+
+    for (; j + 4 <= cols; j += 4) {
+      const DataMapper rhs2 = rhs.getSubMapper(0, j);
+      Index i = 0;
+
+      if (PanelMode) ri += 4 * offset;
+
+      for (; i + vectorSize <= depth; i += vectorSize) {
+        if (StorageOrder == ColMajor) {
+          PacketBlock<Packet8bf, 4> block;
+
+          bload<DataMapper, Packet8bf, 4, StorageOrder, false, 4>(block, rhs2, i, 0);
+
+          Packet4ui t0, t1, t2, t3;
+
+          t0 = vec_mergeh(reinterpret_cast<Packet4ui>(block.packet[0].m_val),
+                          reinterpret_cast<Packet4ui>(block.packet[1].m_val));
+          t1 = vec_mergel(reinterpret_cast<Packet4ui>(block.packet[0].m_val),
+                          reinterpret_cast<Packet4ui>(block.packet[1].m_val));
+          t2 = vec_mergeh(reinterpret_cast<Packet4ui>(block.packet[2].m_val),
+                          reinterpret_cast<Packet4ui>(block.packet[3].m_val));
+          t3 = vec_mergel(reinterpret_cast<Packet4ui>(block.packet[2].m_val),
+                          reinterpret_cast<Packet4ui>(block.packet[3].m_val));
+
+#ifdef EIGEN_VECTORIZE_VSX
+          block.packet[0] =
+              reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(t0), reinterpret_cast<Packet2ul>(t2)));
+          block.packet[1] =
+              reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(t0), reinterpret_cast<Packet2ul>(t2)));
+          block.packet[2] =
+              reinterpret_cast<Packet8us>(vec_mergeh(reinterpret_cast<Packet2ul>(t1), reinterpret_cast<Packet2ul>(t3)));
+          block.packet[3] =
+              reinterpret_cast<Packet8us>(vec_mergel(reinterpret_cast<Packet2ul>(t1), reinterpret_cast<Packet2ul>(t3)));
+#else
+          block.packet[0] = reinterpret_cast<Packet8us>(vec_perm(t0, t2, p16uc_TRANSPOSE64_HI));
+          block.packet[1] = reinterpret_cast<Packet8us>(vec_perm(t0, t2, p16uc_TRANSPOSE64_LO));
+          block.packet[2] = reinterpret_cast<Packet8us>(vec_perm(t1, t3, p16uc_TRANSPOSE64_HI));
+          block.packet[3] = reinterpret_cast<Packet8us>(vec_perm(t1, t3, p16uc_TRANSPOSE64_LO));
+#endif
+
+          storeBlock<bfloat16, Packet8bf, 4>(blockB + ri, block);
+        } else {
+          PacketBlock<Packet8bf, 8> block;
+
+          for (int M = 0; M < 8; M++) {
+            block.packet[M] = rhs2.template loadPacketPartial<Packet8bf>(i + M, 0, 4);
+          }
+
+          block.packet[0] = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
+          block.packet[1] = vec_mergeh(block.packet[2].m_val, block.packet[3].m_val);
+          block.packet[2] = vec_mergeh(block.packet[4].m_val, block.packet[5].m_val);
+          block.packet[3] = vec_mergeh(block.packet[6].m_val, block.packet[7].m_val);
+
+          const Index size = 16 / sizeof(bfloat16);
+
+          for (int M = 0; M < 4; M++) {
+            pstore<bfloat16>(blockB + ri + (M * size), block.packet[M]);
+          }
+        }
+
+        ri += 4 * vectorSize;
+      }
+      for (; i + 2 <= depth; i += 2) {
+        if (StorageOrder == ColMajor) {
+          blockB[ri + 0] = rhs2(i + 0, 0);
+          blockB[ri + 1] = rhs2(i + 1, 0);
+          blockB[ri + 2] = rhs2(i + 0, 1);
+          blockB[ri + 3] = rhs2(i + 1, 1);
+          blockB[ri + 4] = rhs2(i + 0, 2);
+          blockB[ri + 5] = rhs2(i + 1, 2);
+          blockB[ri + 6] = rhs2(i + 0, 3);
+          blockB[ri + 7] = rhs2(i + 1, 3);
+        } else {
+          PacketBlock<Packet8bf, 2> block;
+
+          for (int M = 0; M < 2; M++) {
+            block.packet[M] = rhs2.template loadPacketPartial<Packet8bf>(i + M, 0, 4);
+          }
+
+          block.packet[0] = vec_mergeh(block.packet[0].m_val, block.packet[1].m_val);
+
+          pstore<bfloat16>(blockB + ri, block.packet[0]);
+        }
+
+        ri += 4 * 2;
+      }
+      if (depth & 1) {
+        blockB[ri + 0] = rhs2(i, 0);
+        blockB[ri + 1] = rhs2(i, 1);
+        blockB[ri + 2] = rhs2(i, 2);
+        blockB[ri + 3] = rhs2(i, 3);
+
+        ri += 4;
+      }
+
+      if (PanelMode) ri += 4 * (stride - offset - depth);
+    }
+
+    if (j < cols) {
+      if (PanelMode) ri += offset * (cols - j);
+
+      Index i = 0;
+      for (; i + 2 <= depth; i += 2) {
+        Index k = j;
+        for (; k < cols; k++) {
+          blockB[ri + 0] = rhs(i + 0, k);
+          blockB[ri + 1] = rhs(i + 1, k);
+          ri += 2;
+        }
+      }
+      if (depth & 1) {
+        for (; j < cols; j++) {
+          blockB[ri] = rhs(i, j);
+          ri += 1;
+        }
+      }
+    }
+  }
+};
+
+// General template for lhs complex packing, float64 specialization.
+template <typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
+struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, true> {
+  EIGEN_ALWAYS_INLINE void dhs_ccopy(double* blockAt, const DataMapper& lhs2, Index& i, Index& rir, Index& rii,
+                                     Index depth, const Index vectorSize) {
+    PacketBlock<Packet, 2> blockr, blocki;
+    PacketBlock<PacketC, 4> cblock;
+
+    for (; i + vectorSize <= depth; i += vectorSize) {
+      if (StorageOrder == ColMajor) {
+        cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i + 0);  //[a1 a1i]
+        cblock.packet[1] = lhs2.template loadPacket<PacketC>(0, i + 1);  //[b1 b1i]
+
+        cblock.packet[2] = lhs2.template loadPacket<PacketC>(1, i + 0);  //[a2 a2i]
+        cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1);  //[b2 b2i]
+
+        blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[2].v);  //[a1 a2]
+        blockr.packet[1] = vec_mergeh(cblock.packet[1].v, cblock.packet[3].v);  //[b1 b2]
+
+        blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[2].v);
+        blocki.packet[1] = vec_mergel(cblock.packet[1].v, cblock.packet[3].v);
+      } else {
+        cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i);  //[a1 a1i]
+        cblock.packet[1] = lhs2.template loadPacket<PacketC>(1, i);  //[a2 a2i]
+
+        cblock.packet[2] = lhs2.template loadPacket<PacketC>(0, i + 1);  //[b1 b1i]
+        cblock.packet[3] = lhs2.template loadPacket<PacketC>(1, i + 1);  //[b2 b2i
+
+        blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v);  //[a1 a2]
+        blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v);  //[b1 b2]
+
+        blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
+        blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v);
+      }
+
+      if (Conjugate) {
+        blocki.packet[0] = -blocki.packet[0];
+        blocki.packet[1] = -blocki.packet[1];
+      }
+
+      storeBlock<double, Packet, 2>(blockAt + rir, blockr);
+      storeBlock<double, Packet, 2>(blockAt + rii, blocki);
+
+      rir += 2 * vectorSize;
+      rii += 2 * vectorSize;
+    }
+  }
+
+  EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows,
+                                      Index stride, Index offset) {
+    const Index vectorSize = quad_traits<double>::vectorsize;
+    const Index vectorDelta = vectorSize * ((PanelMode) ? stride : depth);
+    Index rir = ((PanelMode) ? (vectorSize * offset) : 0), rii;
+    double* blockAt = reinterpret_cast<double*>(blockA);
+    Index j = 0;
+
+    for (; j + vectorSize <= rows; j += vectorSize) {
+      const DataMapper lhs2 = lhs.getSubMapper(j, 0);
+      Index i = 0;
+
+      rii = rir + vectorDelta;
+
+      dhs_ccopy(blockAt, lhs2, i, rir, rii, depth, vectorSize);
+
+      for (; i < depth; i++) {
+        PacketBlock<Packet, 1> blockr, blocki;
+        PacketBlock<PacketC, 2> cblock;
+
+        cblock.packet[0] = lhs2.template loadPacket<PacketC>(0, i);
+        cblock.packet[1] = lhs2.template loadPacket<PacketC>(1, i);
+
+        blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v);
+        blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
+
+        if (Conjugate) {
+          blocki.packet[0] = -blocki.packet[0];
+        }
+
+        pstore<double>(blockAt + rir, blockr.packet[0]);
+        pstore<double>(blockAt + rii, blocki.packet[0]);
+
+        rir += vectorSize;
+        rii += vectorSize;
+      }
+
+      rir += ((PanelMode) ? (vectorSize * (2 * stride - depth)) : vectorDelta);
+    }
+
+    if (j < rows) {
+      if (PanelMode) rir += (offset * (rows - j - vectorSize));
+      rii = rir + (((PanelMode) ? stride : depth) * (rows - j));
+
+      for (Index i = 0; i < depth; i++) {
+        Index k = j;
+        for (; k < rows; k++) {
+          blockAt[rir] = lhs(k, i).real();
+
+          if (Conjugate)
+            blockAt[rii] = -lhs(k, i).imag();
+          else
+            blockAt[rii] = lhs(k, i).imag();
+
+          rir += 1;
+          rii += 1;
+        }
+      }
+    }
+  }
+};
+
+// General template for rhs complex packing, float64 specialization.
+template <typename DataMapper, typename Packet, typename PacketC, int StorageOrder, bool Conjugate, bool PanelMode>
+struct dhs_cpack<double, DataMapper, Packet, PacketC, StorageOrder, Conjugate, PanelMode, false> {
+  EIGEN_ALWAYS_INLINE void dhs_ccopy(double* blockBt, const DataMapper& rhs2, Index& i, Index& rir, Index& rii,
+                                     Index depth, const Index vectorSize) {
+    for (; i < depth; i++) {
+      PacketBlock<PacketC, 4> cblock;
+      PacketBlock<Packet, 2> blockr, blocki;
+
+      bload<DataMapper, PacketC, 2, ColMajor, false, 4>(cblock, rhs2, i, 0);
+
+      blockr.packet[0] = vec_mergeh(cblock.packet[0].v, cblock.packet[1].v);
+      blockr.packet[1] = vec_mergeh(cblock.packet[2].v, cblock.packet[3].v);
+
+      blocki.packet[0] = vec_mergel(cblock.packet[0].v, cblock.packet[1].v);
+      blocki.packet[1] = vec_mergel(cblock.packet[2].v, cblock.packet[3].v);
+
+      if (Conjugate) {
+        blocki.packet[0] = -blocki.packet[0];
+        blocki.packet[1] = -blocki.packet[1];
+      }
+
+      storeBlock<double, Packet, 2>(blockBt + rir, blockr);
+      storeBlock<double, Packet, 2>(blockBt + rii, blocki);
+
+      rir += 2 * vectorSize;
+      rii += 2 * vectorSize;
+    }
+  }
+
+  EIGEN_STRONG_INLINE void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols,
+                                      Index stride, Index offset) {
+    const Index vectorSize = quad_traits<double>::vectorsize;
+    const Index vectorDelta = 2 * vectorSize * ((PanelMode) ? stride : depth);
+    Index rir = ((PanelMode) ? (2 * vectorSize * offset) : 0), rii;
+    double* blockBt = reinterpret_cast<double*>(blockB);
+    Index j = 0;
+
+    for (; j + 2 * vectorSize <= cols; j += 2 * vectorSize) {
+      const DataMapper rhs2 = rhs.getSubMapper(0, j);
+      Index i = 0;
+
+      rii = rir + vectorDelta;
+
+      dhs_ccopy(blockBt, rhs2, i, rir, rii, depth, vectorSize);
+
+      rir += ((PanelMode) ? (2 * vectorSize * (2 * stride - depth)) : vectorDelta);
+    }
+
+    if (PanelMode) rir -= (offset * (2 * vectorSize - 1));
+
+    for (; j < cols; j++) {
+      const DataMapper rhs2 = rhs.getSubMapper(0, j);
+      rii = rir + ((PanelMode) ? stride : depth);
+
+      for (Index i = 0; i < depth; i++) {
+        blockBt[rir] = rhs2(i, 0).real();
+
+        if (Conjugate)
+          blockBt[rii] = -rhs2(i, 0).imag();
+        else
+          blockBt[rii] = rhs2(i, 0).imag();
+
+        rir += 1;
+        rii += 1;
+      }
+
+      rir += ((PanelMode) ? (2 * stride - depth) : depth);
+    }
+  }
+};
+
+/**************
+ * GEMM utils *
+ **************/
+
+// 512-bits rank1-update of acc. It can either positive or negative accumulate (useful for complex gemm).
+template <typename Packet, bool NegativeAccumulate, int N>
+EIGEN_ALWAYS_INLINE void pger_common(PacketBlock<Packet, N>* acc, const Packet& lhsV, const Packet* rhsV) {
+  if (NegativeAccumulate) {
+    for (int M = 0; M < N; M++) {
+      acc->packet[M] = vec_nmsub(lhsV, rhsV[M], acc->packet[M]);
+    }
+  } else {
+    for (int M = 0; M < N; M++) {
+      acc->packet[M] = vec_madd(lhsV, rhsV[M], acc->packet[M]);
+    }
+  }
+}
+
+template <int N, typename Scalar, typename Packet, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pger(PacketBlock<Packet, N>* acc, const Scalar* lhs, const Packet* rhsV) {
+  Packet lhsV = pload<Packet>(lhs);
+
+  pger_common<Packet, NegativeAccumulate, N>(acc, lhsV, rhsV);
+}
+
+// 512-bits rank1-update of complex acc. It takes decoupled accumulators as entries. It also takes cares of mixed types
+// real * complex and complex * real.
+template <int N, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void pgerc_common(PacketBlock<Packet, N>* accReal, PacketBlock<Packet, N>* accImag,
+                                      const Packet& lhsV, Packet& lhsVi, const Packet* rhsV, const Packet* rhsVi) {
+  pger_common<Packet, false, N>(accReal, lhsV, rhsV);
+  if (LhsIsReal) {
+    pger_common<Packet, ConjugateRhs, N>(accImag, lhsV, rhsVi);
+    EIGEN_UNUSED_VARIABLE(lhsVi);
+  } else {
+    if (!RhsIsReal) {
+      pger_common<Packet, ConjugateLhs == ConjugateRhs, N>(accReal, lhsVi, rhsVi);
+      pger_common<Packet, ConjugateRhs, N>(accImag, lhsV, rhsVi);
+    } else {
+      EIGEN_UNUSED_VARIABLE(rhsVi);
+    }
+    pger_common<Packet, ConjugateLhs, N>(accImag, lhsVi, rhsV);
+  }
+}
+
+template <int N, typename Scalar, typename Packet, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void pgerc(PacketBlock<Packet, N>* accReal, PacketBlock<Packet, N>* accImag, const Scalar* lhs_ptr,
+                               const Scalar* lhs_ptr_imag, const Packet* rhsV, const Packet* rhsVi) {
+  Packet lhsV = ploadLhs<Packet>(lhs_ptr);
+  Packet lhsVi;
+  if (!LhsIsReal)
+    lhsVi = ploadLhs<Packet>(lhs_ptr_imag);
+  else
+    EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+
+  pgerc_common<N, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(accReal, accImag, lhsV, lhsVi, rhsV, rhsVi);
+}
+
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet) * lhs) {
+  return ploadu<Packet>(lhs);
+}
+
+// Zero the accumulator on PacketBlock.
+template <typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bsetzero(PacketBlock<Packet, N>& acc) {
+  for (int M = 0; M < N; M++) {
+    acc.packet[M] = pset1<Packet>((__UNPACK_TYPE__(Packet))0);
+  }
+}
+
+template <typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscalec_common(PacketBlock<Packet, N>& acc, PacketBlock<Packet, N>& accZ,
+                                        const Packet& pAlpha) {
+  for (int M = 0; M < N; M++) {
+    acc.packet[M] = vec_mul(accZ.packet[M], pAlpha);
+  }
+}
+
+template <typename Packet, int N>
+EIGEN_ALWAYS_INLINE void band(PacketBlock<Packet, N>& acc, const Packet& pMask) {
+  for (int M = 0; M < N; M++) {
+    acc.packet[M] = pand<Packet>(acc.packet[M], pMask);
+  }
+}
+
+// Complex version of PacketBlock scaling.
+template <typename Packet, int N, bool mask>
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet, N>& aReal, PacketBlock<Packet, N>& aImag, const Packet& bReal,
+                                 const Packet& bImag, PacketBlock<Packet, N>& cReal, PacketBlock<Packet, N>& cImag,
+                                 const Packet& pMask) {
+  if (mask && (sizeof(__UNPACK_TYPE__(Packet)) == sizeof(float))) {
+    band<Packet, N>(aReal, pMask);
+    band<Packet, N>(aImag, pMask);
+  } else {
+    EIGEN_UNUSED_VARIABLE(pMask);
+  }
+
+  bscalec_common<Packet, N>(cReal, aReal, bReal);
+
+  bscalec_common<Packet, N>(cImag, aImag, bReal);
+
+  pger_common<Packet, true, N>(&cReal, bImag, aImag.packet);
+
+  pger_common<Packet, false, N>(&cImag, bImag, aReal.packet);
+}
+
+// Load a PacketBlock, the N parameters make tuning gemm easier so we can add more accumulators as needed.
+//
+// full = operate (load) on the entire PacketBlock or only half
+template <typename DataMapper, typename Packet, const Index accCols, int StorageOrder, bool Complex, int N, bool full>
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet, N*(Complex ? 2 : 1)>& acc, const DataMapper& res, Index row,
+                               Index col) {
+  if (StorageOrder == RowMajor) {
+    for (int M = 0; M < N; M++) {
+      acc.packet[M] = res.template loadPacket<Packet>(row + M, col);
+    }
+    if (Complex) {
+      for (int M = 0; M < N; M++) {
+        acc.packet[M + N] = res.template loadPacket<Packet>(row + M, col + accCols);
+      }
+    }
+  } else {
+    for (int M = 0; M < N; M++) {
+      acc.packet[M] = res.template loadPacket<Packet>(row, col + M);
+    }
+    if (Complex && full) {
+      for (int M = 0; M < N; M++) {
+        acc.packet[M + N] = res.template loadPacket<Packet>(row + accCols, col + M);
+      }
+    }
+  }
+}
+
+template <typename DataMapper, typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bstore(PacketBlock<Packet, N>& acc, const DataMapper& res, Index row) {
+  for (int M = 0; M < N; M++) {
+    res.template storePacket<Packet>(row, M, acc.packet[M]);
+  }
+}
+
+#ifdef USE_PARTIAL_PACKETS
+template <typename DataMapper, typename Packet, const Index accCols, bool Complex, Index N, bool full>
+EIGEN_ALWAYS_INLINE void bload_partial(PacketBlock<Packet, N*(Complex ? 2 : 1)>& acc, const DataMapper& res, Index row,
+                                       Index elements) {
+  for (Index M = 0; M < N; M++) {
+    acc.packet[M] = res.template loadPacketPartial<Packet>(row, M, elements);
+  }
+  if (Complex && full) {
+    for (Index M = 0; M < N; M++) {
+      acc.packet[M + N] = res.template loadPacketPartial<Packet>(row + accCols, M, elements);
+    }
+  }
+}
+
+template <typename DataMapper, typename Packet, Index N>
+EIGEN_ALWAYS_INLINE void bstore_partial(PacketBlock<Packet, N>& acc, const DataMapper& res, Index row, Index elements) {
+  for (Index M = 0; M < N; M++) {
+    res.template storePacketPartial<Packet>(row, M, acc.packet[M], elements);
+  }
+}
+#endif
+
+#ifdef _ARCH_PWR10
+#define USE_P10_AND_PVIPR2_0 (EIGEN_COMP_LLVM || (__GNUC__ >= 11))
+#else
+#define USE_P10_AND_PVIPR2_0 0
+#endif
+
+#if !USE_P10_AND_PVIPR2_0
+const static Packet4i mask4[4] = {{0, 0, 0, 0}, {-1, 0, 0, 0}, {-1, -1, 0, 0}, {-1, -1, -1, 0}};
+#endif
+
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows) {
+#if USE_P10_AND_PVIPR2_0
+#ifdef _BIG_ENDIAN
+  return Packet(vec_reve(vec_genwm((1 << remaining_rows) - 1)));
+#else
+  return Packet(vec_genwm((1 << remaining_rows) - 1));
+#endif
+#else
+  return Packet(mask4[remaining_rows]);
+#endif
+}
+
+template <>
+EIGEN_ALWAYS_INLINE Packet2d bmask<Packet2d>(const Index remaining_rows) {
+#if USE_P10_AND_PVIPR2_0
+  Packet2d mask2 = Packet2d(vec_gendm(remaining_rows));
+#ifdef _BIG_ENDIAN
+  return preverse(mask2);
+#else
+  return mask2;
+#endif
+#else
+  Packet2l ret = {-remaining_rows, 0};
+  return Packet2d(ret);
+#endif
+}
+
+template <typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet, N>& acc, PacketBlock<Packet, N>& accZ, const Packet& pAlpha) {
+  for (int M = 0; M < N; M++) {
+    acc.packet[M] = pmadd<Packet>(pAlpha, accZ.packet[M], acc.packet[M]);
+  }
+}
+
+// Scale the PacketBlock vectors by alpha.
+template <typename Packet, int N, bool mask>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet, N>& acc, PacketBlock<Packet, N>& accZ, const Packet& pAlpha,
+                                const Packet& pMask) {
+  if (mask) {
+    band<Packet, N>(accZ, pMask);
+  } else {
+    EIGEN_UNUSED_VARIABLE(pMask);
+  }
+
+  bscale<Packet, N>(acc, accZ, pAlpha);
+}
+
+template <typename Packet, int N, bool real>
+EIGEN_ALWAYS_INLINE void pbroadcastN(const __UNPACK_TYPE__(Packet) * ap0, const __UNPACK_TYPE__(Packet) * ap1,
+                                     const __UNPACK_TYPE__(Packet) * ap2, Packet& a0, Packet& a1, Packet& a2,
+                                     Packet& a3) {
+  a0 = pset1<Packet>(ap0[0]);
+  if (N == 4) {
+    a1 = pset1<Packet>(ap0[1]);
+    a2 = pset1<Packet>(ap0[2]);
+    a3 = pset1<Packet>(ap0[3]);
+    EIGEN_UNUSED_VARIABLE(ap1);
+    EIGEN_UNUSED_VARIABLE(ap2);
+  } else {
+    if (N > 1) {
+      a1 = pset1<Packet>(ap1[0]);
+    } else {
+      EIGEN_UNUSED_VARIABLE(a1);
+      EIGEN_UNUSED_VARIABLE(ap1);
+    }
+    if (N > 2) {
+      a2 = pset1<Packet>(ap2[0]);
+    } else {
+      EIGEN_UNUSED_VARIABLE(a2);
+      EIGEN_UNUSED_VARIABLE(ap2);
+    }
+  }
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void pbroadcastN<Packet4f, 4, true>(const float* ap0, const float*, const float*, Packet4f& a0,
+                                                        Packet4f& a1, Packet4f& a2, Packet4f& a3) {
+  pbroadcast4<Packet4f>(ap0, a0, a1, a2, a3);
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void pbroadcastN<Packet4f, 4, false>(const float* ap0, const float* ap1, const float* ap2,
+                                                         Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
+  pbroadcastN<Packet4f, 4, true>(ap0, ap1, ap2, a0, a1, a2, a3);
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void pbroadcastN<Packet2d, 4, false>(const double* ap0, const double*, const double*, Packet2d& a0,
+                                                         Packet2d& a1, Packet2d& a2, Packet2d& a3) {
+  a1 = pload<Packet2d>(ap0);
+  a3 = pload<Packet2d>(ap0 + 2);
+  a0 = vec_splat(a1, 0);
+  a1 = vec_splat(a1, 1);
+  a2 = vec_splat(a3, 0);
+  a3 = vec_splat(a3, 1);
+}
+
+// Grab two decouples real/imaginary PacketBlocks and return two coupled (real/imaginary pairs) PacketBlocks.
+template <typename Packet, typename Packetc, int N, bool full>
+EIGEN_ALWAYS_INLINE void bcouple_common(PacketBlock<Packet, N>& taccReal, PacketBlock<Packet, N>& taccImag,
+                                        PacketBlock<Packetc, N>& acc1, PacketBlock<Packetc, N>& acc2) {
+  for (int M = 0; M < N; M++) {
+    acc1.packet[M].v = vec_mergeh(taccReal.packet[M], taccImag.packet[M]);
+  }
+
+  if (full) {
+    for (int M = 0; M < N; M++) {
+      acc2.packet[M].v = vec_mergel(taccReal.packet[M], taccImag.packet[M]);
+    }
+  }
+}
+
+template <typename Packet, typename Packetc, int N, bool full>
+EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet, N>& taccReal, PacketBlock<Packet, N>& taccImag,
+                                 PacketBlock<Packetc, N * 2>& tRes, PacketBlock<Packetc, N>& acc1,
+                                 PacketBlock<Packetc, N>& acc2) {
+  bcouple_common<Packet, Packetc, N, full>(taccReal, taccImag, acc1, acc2);
+
+  for (int M = 0; M < N; M++) {
+    acc1.packet[M] = padd<Packetc>(tRes.packet[M], acc1.packet[M]);
+  }
+
+  if (full) {
+    for (int M = 0; M < N; M++) {
+      acc2.packet[M] = padd<Packetc>(tRes.packet[M + N], acc2.packet[M]);
+    }
+  }
+}
+
+// PEEL loop factor.
+#define PEEL 7
+#define PEEL_ROW 7
+
+#define MICRO_UNROLL(func) func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
+
+#define MICRO_NORMAL_ROWS accRows == quad_traits<Scalar>::rows || accRows == 1
+
+#define MICRO_NEW_ROWS ((MICRO_NORMAL_ROWS) ? accRows : 1)
+
+#define MICRO_RHS(ptr, N) rhs_##ptr##N
+
+#define MICRO_ZERO_PEEL(peel)                 \
+  if ((PEEL_ROW > peel) && (peel != 0)) {     \
+    bsetzero<Packet, accRows>(accZero##peel); \
+  } else {                                    \
+    EIGEN_UNUSED_VARIABLE(accZero##peel);     \
+  }
+
+#define MICRO_ADD(ptr, N)               \
+  if (MICRO_NORMAL_ROWS) {              \
+    MICRO_RHS(ptr, 0) += (accRows * N); \
+  } else {                              \
+    MICRO_RHS(ptr, 0) += N;             \
+    MICRO_RHS(ptr, 1) += N;             \
+    if (accRows == 3) {                 \
+      MICRO_RHS(ptr, 2) += N;           \
+    }                                   \
+  }
+
+#define MICRO_ADD_ROWS(N) MICRO_ADD(ptr, N)
+
+#define MICRO_BROADCAST1(peel, ptr, rhsV, real)                                                                      \
+  if (MICRO_NORMAL_ROWS) {                                                                                           \
+    pbroadcastN<Packet, accRows, real>(MICRO_RHS(ptr, 0) + (accRows * peel), MICRO_RHS(ptr, 0), MICRO_RHS(ptr, 0),   \
+                                       rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]);                  \
+  } else {                                                                                                           \
+    pbroadcastN<Packet, accRows, real>(MICRO_RHS(ptr, 0) + peel, MICRO_RHS(ptr, 1) + peel, MICRO_RHS(ptr, 2) + peel, \
+                                       rhsV##peel[0], rhsV##peel[1], rhsV##peel[2], rhsV##peel[3]);                  \
+  }
+
+#define MICRO_BROADCAST(peel) MICRO_BROADCAST1(peel, ptr, rhsV, true)
+
+#define MICRO_BROADCAST_EXTRA1(ptr, rhsV, real)                                                                 \
+  pbroadcastN<Packet, accRows, real>(MICRO_RHS(ptr, 0), MICRO_RHS(ptr, 1), MICRO_RHS(ptr, 2), rhsV[0], rhsV[1], \
+                                     rhsV[2], rhsV[3]);
+
+#define MICRO_BROADCAST_EXTRA             \
+  Packet rhsV[4];                         \
+  MICRO_BROADCAST_EXTRA1(ptr, rhsV, true) \
+  MICRO_ADD_ROWS(1)
+
+#define MICRO_SRC2(ptr, N, M)                   \
+  if (MICRO_NORMAL_ROWS) {                      \
+    EIGEN_UNUSED_VARIABLE(strideB);             \
+    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr, 1));   \
+    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr, 2));   \
+  } else {                                      \
+    MICRO_RHS(ptr, 1) = rhs_base + N + M;       \
+    if (accRows == 3) {                         \
+      MICRO_RHS(ptr, 2) = rhs_base + N * 2 + M; \
+    } else {                                    \
+      EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr, 2)); \
+    }                                           \
+  }
+
+#define MICRO_SRC2_PTR MICRO_SRC2(ptr, strideB, 0)
+
+#define MICRO_ZERO_PEEL_ROW MICRO_UNROLL(MICRO_ZERO_PEEL)
+
+#define MICRO_WORK_PEEL(peel)                                                                            \
+  if (PEEL_ROW > peel) {                                                                                 \
+    MICRO_BROADCAST(peel)                                                                                \
+    pger<accRows, Scalar, Packet, false>(&accZero##peel, lhs_ptr + (remaining_rows * peel), rhsV##peel); \
+  } else {                                                                                               \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel);                                                                   \
+  }
+
+#define MICRO_WORK_PEEL_ROW                                                              \
+  Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4], rhsV4[4], rhsV5[4], rhsV6[4], rhsV7[4]; \
+  MICRO_UNROLL(MICRO_WORK_PEEL)                                                          \
+  lhs_ptr += (remaining_rows * PEEL_ROW);                                                \
+  MICRO_ADD_ROWS(PEEL_ROW)
+
+#define MICRO_ADD_PEEL(peel, sum)                        \
+  if (PEEL_ROW > peel) {                                 \
+    for (Index i = 0; i < accRows; i++) {                \
+      accZero##sum.packet[i] += accZero##peel.packet[i]; \
+    }                                                    \
+  }
+
+#define MICRO_ADD_PEEL_ROW \
+  MICRO_ADD_PEEL(4, 0)     \
+  MICRO_ADD_PEEL(5, 1)     \
+  MICRO_ADD_PEEL(6, 2) MICRO_ADD_PEEL(7, 3) MICRO_ADD_PEEL(2, 0) MICRO_ADD_PEEL(3, 1) MICRO_ADD_PEEL(1, 0)
+
+#define MICRO_PREFETCHN1(ptr, N)               \
+  EIGEN_POWER_PREFETCH(MICRO_RHS(ptr, 0));     \
+  if (N == 2 || N == 3) {                      \
+    EIGEN_POWER_PREFETCH(MICRO_RHS(ptr, 1));   \
+    if (N == 3) {                              \
+      EIGEN_POWER_PREFETCH(MICRO_RHS(ptr, 2)); \
+    }                                          \
+  }
+
+#define MICRO_PREFETCHN(N) MICRO_PREFETCHN1(ptr, N)
+
+#define MICRO_COMPLEX_PREFETCHN(N) \
+  MICRO_PREFETCHN1(ptr_real, N);   \
+  if (!RhsIsReal) {                \
+    MICRO_PREFETCHN1(ptr_imag, N); \
+  }
+
+template <typename Scalar, typename Packet, const Index accRows, const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void MICRO_EXTRA_ROW(const Scalar*& lhs_ptr, const Scalar*& rhs_ptr0, const Scalar*& rhs_ptr1,
+                                         const Scalar*& rhs_ptr2, PacketBlock<Packet, accRows>& accZero) {
+  MICRO_BROADCAST_EXTRA
+  pger<accRows, Scalar, Packet, false>(&accZero, lhs_ptr, rhsV);
+  lhs_ptr += remaining_rows;
+}
+
+template <typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols,
+          const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void gemm_unrolled_row_iteration(const DataMapper& res, const Scalar* lhs_base,
+                                                     const Scalar* rhs_base, Index depth, Index strideA, Index offsetA,
+                                                     Index strideB, Index row, Index rows, const Packet& pAlpha,
+                                                     const Packet& pMask) {
+  const Scalar *rhs_ptr0 = rhs_base, *rhs_ptr1 = NULL, *rhs_ptr2 = NULL;
+  const Scalar* lhs_ptr = lhs_base + row * strideA + remaining_rows * offsetA;
+  PacketBlock<Packet, accRows> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7, acc;
+
+  MICRO_SRC2_PTR
+  bsetzero<Packet, accRows>(accZero0);
+
+  Index remaining_depth = depth & -quad_traits<Scalar>::rows;
+  Index k = 0;
+  if (remaining_depth >= PEEL_ROW) {
+    MICRO_ZERO_PEEL_ROW
+    do {
+      MICRO_PREFETCHN(accRows)
+      EIGEN_POWER_PREFETCH(lhs_ptr);
+      MICRO_WORK_PEEL_ROW
+    } while ((k += PEEL_ROW) + PEEL_ROW <= remaining_depth);
+    MICRO_ADD_PEEL_ROW
+  }
+  for (; k < depth; k++) {
+    MICRO_EXTRA_ROW<Scalar, Packet, accRows, remaining_rows>(lhs_ptr, rhs_ptr0, rhs_ptr1, rhs_ptr2, accZero0);
+  }
+
+#ifdef USE_PARTIAL_PACKETS
+  EIGEN_UNUSED_VARIABLE(rows);
+  EIGEN_UNUSED_VARIABLE(pMask);
+  bload_partial<DataMapper, Packet, 0, false, accRows>(acc, res, row, remaining_rows);
+  bscale<Packet, accRows>(acc, accZero0, pAlpha);
+  bstore_partial<DataMapper, Packet, accRows>(acc, res, row, remaining_rows);
+#else
+  bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row, 0);
+  if ((accRows == 1) || (rows >= accCols)) {
+    bscale<Packet, accRows, true>(acc, accZero0, pAlpha, pMask);
+    bstore<DataMapper, Packet, accRows>(acc, res, row);
+  } else {
+    bscale<Packet, accRows, false>(acc, accZero0, pAlpha, pMask);
+    for (Index j = 0; j < accRows; j++) {
+      for (Index i = 0; i < remaining_rows; i++) {
+        res(row + i, j) = acc.packet[j][i];
+      }
+    }
+  }
+#endif
+}
+
+#define MICRO_EXTRA(MICRO_EXTRA_UNROLL, value, is_col)   \
+  switch (value) {                                       \
+    default:                                             \
+      MICRO_EXTRA_UNROLL(1)                              \
+      break;                                             \
+    case 2:                                              \
+      if (is_col || (sizeof(Scalar) == sizeof(float))) { \
+        MICRO_EXTRA_UNROLL(2)                            \
+      }                                                  \
+      break;                                             \
+    case 3:                                              \
+      if (is_col || (sizeof(Scalar) == sizeof(float))) { \
+        MICRO_EXTRA_UNROLL(3)                            \
+      }                                                  \
+      break;                                             \
+  }
+
+#define MICRO_EXTRA_ROWS(N)                                                     \
+  gemm_unrolled_row_iteration<Scalar, Packet, DataMapper, accRows, accCols, N>( \
+      res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlpha, pMask);
+
+template <typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_extra_row(const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base,
+                                        Index depth, Index strideA, Index offsetA, Index strideB, Index row, Index rows,
+                                        Index remaining_rows, const Packet& pAlpha, const Packet& pMask) {
+  MICRO_EXTRA(MICRO_EXTRA_ROWS, remaining_rows, false)
+}
+
+#define MICRO_UNROLL_WORK(func, func2, peel) \
+  MICRO_UNROLL(func2);                       \
+  func(0, peel) func(1, peel) func(2, peel) func(3, peel) func(4, peel) func(5, peel) func(6, peel) func(7, peel)
+
+#define MICRO_WORK_ONE(iter, peel)                                               \
+  if (unroll_factor > iter) {                                                    \
+    pger_common<Packet, false, accRows>(&accZero##iter, lhsV##iter, rhsV##peel); \
+  }
+
+#define MICRO_TYPE_PEEL4(func, func2, peel)                        \
+  if (PEEL > peel) {                                               \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
+    MICRO_BROADCAST(peel)                                          \
+    MICRO_UNROLL_WORK(func, func2, peel)                           \
+  } else {                                                         \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel);                             \
+  }
+
+#define MICRO_UNROLL_TYPE_PEEL(M, func, func1, func2)                                                           \
+  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M], rhsV4[M], rhsV5[M], rhsV6[M], rhsV7[M];                        \
+  func(func1, func2, 0) func(func1, func2, 1) func(func1, func2, 2) func(func1, func2, 3) func(func1, func2, 4) \
+      func(func1, func2, 5) func(func1, func2, 6) func(func1, func2, 7)
+
+#define MICRO_UNROLL_TYPE_ONE(M, func, func1, func2) \
+  Packet rhsV0[M];                                   \
+  func(func1, func2, 0)
+
+#define MICRO_UNROLL_TYPE(MICRO_TYPE, size)                       \
+  MICRO_TYPE(4, MICRO_TYPE_PEEL4, MICRO_WORK_ONE, MICRO_LOAD_ONE) \
+  MICRO_ADD_ROWS(size)
+
+#define MICRO_ONE_PEEL4 MICRO_UNROLL_TYPE(MICRO_UNROLL_TYPE_PEEL, PEEL)
+
+#define MICRO_ONE4 MICRO_UNROLL_TYPE(MICRO_UNROLL_TYPE_ONE, 1)
+
+#define MICRO_DST_PTR_ONE(iter)               \
+  if (unroll_factor > iter) {                 \
+    bsetzero<Packet, accRows>(accZero##iter); \
+  } else {                                    \
+    EIGEN_UNUSED_VARIABLE(accZero##iter);     \
+  }
+
+#define MICRO_DST_PTR MICRO_UNROLL(MICRO_DST_PTR_ONE)
+
+#define MICRO_SRC_PTR MICRO_UNROLL(MICRO_SRC_PTR_ONE)
+
+#define MICRO_PREFETCH MICRO_UNROLL(MICRO_PREFETCH_ONE)
+
+#ifdef USE_PARTIAL_PACKETS
+#define MICRO_STORE_ONE(iter)                                                                         \
+  if (unroll_factor > iter) {                                                                         \
+    if (MICRO_NORMAL_PARTIAL(iter)) {                                                                 \
+      bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row + iter * accCols, 0);      \
+      bscale<Packet, accRows>(acc, accZero##iter, pAlpha);                                            \
+      bstore<DataMapper, Packet, accRows>(acc, res, row + iter * accCols);                            \
+    } else {                                                                                          \
+      bload_partial<DataMapper, Packet, 0, false, accRows>(acc, res, row + iter * accCols, accCols2); \
+      bscale<Packet, accRows>(acc, accZero##iter, pAlpha);                                            \
+      bstore_partial<DataMapper, Packet, accRows>(acc, res, row + iter * accCols, accCols2);          \
+    }                                                                                                 \
+  }
+#else
+#define MICRO_STORE_ONE(iter)                                                                  \
+  if (unroll_factor > iter) {                                                                  \
+    bload<DataMapper, Packet, 0, ColMajor, false, accRows>(acc, res, row + iter * accCols, 0); \
+    bscale<Packet, accRows, !(MICRO_NORMAL(iter))>(acc, accZero##iter, pAlpha, pMask);         \
+    bstore<DataMapper, Packet, accRows>(acc, res, row + iter * accCols);                       \
+  }
+#endif
+
+#define MICRO_STORE MICRO_UNROLL(MICRO_STORE_ONE)
+
+#ifdef USE_PARTIAL_PACKETS
+template <int unroll_factor, typename Scalar, typename Packet, typename DataMapper, const Index accRows,
+          const Index accCols, bool full>
+#else
+template <int unroll_factor, typename Scalar, typename Packet, typename DataMapper, const Index accRows,
+          const Index accCols, const Index accCols2>
+#endif
+EIGEN_ALWAYS_INLINE void gemm_unrolled_iteration(const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base,
+                                                 Index depth, Index strideA, Index offsetA, Index strideB, Index& row,
+                                                 const Packet& pAlpha,
+#ifdef USE_PARTIAL_PACKETS
+                                                 Index accCols2
+#else
+                                                 const Packet& pMask
+#endif
+) {
+  const Scalar *rhs_ptr0 = rhs_base, *rhs_ptr1 = NULL, *rhs_ptr2 = NULL;
+  const Scalar *lhs_ptr0 = NULL, *lhs_ptr1 = NULL, *lhs_ptr2 = NULL, *lhs_ptr3 = NULL, *lhs_ptr4 = NULL,
+               *lhs_ptr5 = NULL, *lhs_ptr6 = NULL, *lhs_ptr7 = NULL;
+  PacketBlock<Packet, accRows> accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
+  PacketBlock<Packet, accRows> acc;
+
+  MICRO_SRC2_PTR
+  MICRO_SRC_PTR
+  MICRO_DST_PTR
+
+  Index k = 0;
+  for (; k + PEEL <= depth; k += PEEL) {
+    MICRO_PREFETCHN(accRows)
+    MICRO_PREFETCH
+    MICRO_ONE_PEEL4
+  }
+  for (; k < depth; k++) {
+    MICRO_ONE4
+  }
+  MICRO_STORE
+
+  MICRO_UPDATE
+}
+
+#ifdef USE_PARTIAL_PACKETS
+#define MICRO_UNROLL_ITER2(N, M)                                                                              \
+  gemm_unrolled_iteration<N + ((M) ? 1 : 0), Scalar, Packet, DataMapper, accRows, accCols, !M>(               \
+      res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, M ? remaining_rows : accCols); \
+  if (M) return;
+#else
+#define MICRO_UNROLL_ITER2(N, M)                                                                             \
+  gemm_unrolled_iteration<N + ((M) ? 1 : 0), Scalar, Packet, DataMapper, accRows, accCols, M ? M : accCols>( \
+      res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlpha, pMask);                       \
+  if (M) return;
+#endif
+
+template <typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index depth,
+                                   Index strideA, Index offsetA, Index strideB, Index offsetB, Index col, Index rows,
+                                   Index remaining_rows, const Packet& pAlpha, const Packet& pMask) {
+  const DataMapper res3 = res.getSubMapper(0, col);
+
+  const Scalar* rhs_base = blockB + col * strideB + MICRO_NEW_ROWS * offsetB;
+  const Scalar* lhs_base = blockA + accCols * offsetA;
+  Index row = 0;
+
+#define MAX_UNROLL 7
+  while (row + MAX_UNROLL * accCols <= rows) {
+    MICRO_UNROLL_ITER2(MAX_UNROLL, 0);
+  }
+  switch ((rows - row) / accCols) {
+#if MAX_UNROLL > 7
+    case 7:
+      MICRO_UNROLL_ITER(MICRO_UNROLL_ITER2, 7)
+      break;
+#endif
+#if MAX_UNROLL > 6
+    case 6:
+      MICRO_UNROLL_ITER(MICRO_UNROLL_ITER2, 6)
+      break;
+#endif
+#if MAX_UNROLL > 5
+    case 5:
+      MICRO_UNROLL_ITER(MICRO_UNROLL_ITER2, 5)
+      break;
+#endif
+#if MAX_UNROLL > 4
+    case 4:
+      MICRO_UNROLL_ITER(MICRO_UNROLL_ITER2, 4)
+      break;
+#endif
+#if MAX_UNROLL > 3
+    case 3:
+      MICRO_UNROLL_ITER(MICRO_UNROLL_ITER2, 3)
+      break;
+#endif
+#if MAX_UNROLL > 2
+    case 2:
+      MICRO_UNROLL_ITER(MICRO_UNROLL_ITER2, 2)
+      break;
+#endif
+#if MAX_UNROLL > 1
+    case 1:
+      MICRO_UNROLL_ITER(MICRO_UNROLL_ITER2, 1)
+      break;
+#endif
+    default:
+      break;
+  }
+#undef MAX_UNROLL
+
+  if (remaining_rows > 0) {
+    gemm_extra_row<Scalar, Packet, DataMapper, accRows, accCols>(res3, blockA, rhs_base, depth, strideA, offsetA,
+                                                                 strideB, row, rows, remaining_rows, pAlpha, pMask);
+  }
+}
+
+#define MICRO_EXTRA_COLS(N)                                                                                         \
+  gemm_cols<Scalar, Packet, DataMapper, N, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, \
+                                                    col, rows, remaining_rows, pAlpha, pMask);
+
+template <typename Scalar, typename Packet, typename DataMapper, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_extra_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index depth,
+                                         Index strideA, Index offsetA, Index strideB, Index offsetB, Index col,
+                                         Index rows, Index cols, Index remaining_rows, const Packet& pAlpha,
+                                         const Packet& pMask) {
+  MICRO_EXTRA(MICRO_EXTRA_COLS, cols - col, true)
+}
+
+/****************
+ * GEMM kernels *
+ * **************/
+template <typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows,
+          const Index accCols>
+EIGEN_STRONG_INLINE void gemm(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows,
+                              Index depth, Index cols, Scalar alpha, Index strideA, Index strideB, Index offsetA,
+                              Index offsetB) {
+  const Index remaining_rows = rows % accCols;
+
+  if (strideA == -1) strideA = depth;
+  if (strideB == -1) strideB = depth;
+
+  const Packet pAlpha = pset1<Packet>(alpha);
+  const Packet pMask = bmask<Packet>(remaining_rows);
+
+  Index col = 0;
+  for (; col + accRows <= cols; col += accRows) {
+    gemm_cols<Scalar, Packet, DataMapper, accRows, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB,
+                                                            offsetB, col, rows, remaining_rows, pAlpha, pMask);
+  }
+
+  if (col != cols) {
+    gemm_extra_cols<Scalar, Packet, DataMapper, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB,
+                                                         col, rows, cols, remaining_rows, pAlpha, pMask);
+  }
+}
+
+#define accColsC (accCols / 2)
+#define advanceRows ((LhsIsReal) ? 1 : 2)
+#define advanceCols ((RhsIsReal) ? 1 : 2)
+
+// PEEL_COMPLEX loop factor.
+#define PEEL_COMPLEX 3
+#define PEEL_COMPLEX_ROW 3
+
+#define MICRO_COMPLEX_UNROLL(func) func(0) func(1) func(2) func(3)
+
+#define MICRO_COMPLEX_ZERO_PEEL(peel)             \
+  if ((PEEL_COMPLEX_ROW > peel) && (peel != 0)) { \
+    bsetzero<Packet, accRows>(accReal##peel);     \
+    bsetzero<Packet, accRows>(accImag##peel);     \
+  } else {                                        \
+    EIGEN_UNUSED_VARIABLE(accReal##peel);         \
+    EIGEN_UNUSED_VARIABLE(accImag##peel);         \
+  }
+
+#define MICRO_COMPLEX_ADD_ROWS(N, used)            \
+  MICRO_ADD(ptr_real, N)                           \
+  if (!RhsIsReal) {                                \
+    MICRO_ADD(ptr_imag, N)                         \
+  } else if (used) {                               \
+    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag, 0)); \
+    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag, 1)); \
+    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag, 2)); \
+  }
+
+#define MICRO_COMPLEX_BROADCAST(peel)              \
+  MICRO_BROADCAST1(peel, ptr_real, rhsV, false)    \
+  if (!RhsIsReal) {                                \
+    MICRO_BROADCAST1(peel, ptr_imag, rhsVi, false) \
+  } else {                                         \
+    EIGEN_UNUSED_VARIABLE(rhsVi##peel);            \
+  }
+
+#define MICRO_COMPLEX_BROADCAST_EXTRA              \
+  Packet rhsV[4], rhsVi[4];                        \
+  MICRO_BROADCAST_EXTRA1(ptr_real, rhsV, false)    \
+  if (!RhsIsReal) {                                \
+    MICRO_BROADCAST_EXTRA1(ptr_imag, rhsVi, false) \
+  } else {                                         \
+    EIGEN_UNUSED_VARIABLE(rhsVi);                  \
+  }                                                \
+  MICRO_COMPLEX_ADD_ROWS(1, true)
+
+#define MICRO_COMPLEX_SRC2_PTR                                    \
+  MICRO_SRC2(ptr_real, strideB* advanceCols, 0)                   \
+  if (!RhsIsReal) {                                               \
+    MICRO_RHS(ptr_imag, 0) = rhs_base + MICRO_NEW_ROWS * strideB; \
+    MICRO_SRC2(ptr_imag, strideB* advanceCols, strideB)           \
+  } else {                                                        \
+    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag, 0));                \
+    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag, 1));                \
+    EIGEN_UNUSED_VARIABLE(MICRO_RHS(ptr_imag, 2));                \
+  }
+
+#define MICRO_COMPLEX_ZERO_PEEL_ROW MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_ZERO_PEEL)
+
+#define MICRO_COMPLEX_WORK_PEEL(peel)                                                 \
+  if (PEEL_COMPLEX_ROW > peel) {                                                      \
+    MICRO_COMPLEX_BROADCAST(peel)                                                     \
+    pgerc<accRows, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>( \
+        &accReal##peel, &accImag##peel, lhs_ptr_real + (remaining_rows * peel),       \
+        lhs_ptr_imag + (remaining_rows * peel), rhsV##peel, rhsVi##peel);             \
+  } else {                                                                            \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel);                                                \
+    EIGEN_UNUSED_VARIABLE(rhsVi##peel);                                               \
+  }
+
+#define MICRO_COMPLEX_ADD_COLS(size)         \
+  lhs_ptr_real += (remaining_rows * size);   \
+  if (!LhsIsReal)                            \
+    lhs_ptr_imag += (remaining_rows * size); \
+  else                                       \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+
+#define MICRO_COMPLEX_WORK_PEEL_ROW                  \
+  Packet rhsV0[4], rhsV1[4], rhsV2[4], rhsV3[4];     \
+  Packet rhsVi0[4], rhsVi1[4], rhsVi2[4], rhsVi3[4]; \
+  MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_WORK_PEEL)      \
+  MICRO_COMPLEX_ADD_COLS(PEEL_COMPLEX_ROW)           \
+  MICRO_COMPLEX_ADD_ROWS(PEEL_COMPLEX_ROW, false)
+
+#define MICRO_COMPLEX_ADD_PEEL(peel, sum)                \
+  if (PEEL_COMPLEX_ROW > peel) {                         \
+    for (Index i = 0; i < accRows; i++) {                \
+      accReal##sum.packet[i] += accReal##peel.packet[i]; \
+      accImag##sum.packet[i] += accImag##peel.packet[i]; \
+    }                                                    \
+  }
+
+#define MICRO_COMPLEX_ADD_PEEL_ROW \
+  MICRO_COMPLEX_ADD_PEEL(2, 0) MICRO_COMPLEX_ADD_PEEL(3, 1) MICRO_COMPLEX_ADD_PEEL(1, 0)
+
+template <typename Scalar, typename Packet, const Index accRows, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal,
+          bool RhsIsReal, const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void MICRO_COMPLEX_EXTRA_ROW(const Scalar*& lhs_ptr_real, const Scalar*& lhs_ptr_imag,
+                                                 const Scalar*& rhs_ptr_real0, const Scalar*& rhs_ptr_real1,
+                                                 const Scalar*& rhs_ptr_real2, const Scalar*& rhs_ptr_imag0,
+                                                 const Scalar*& rhs_ptr_imag1, const Scalar*& rhs_ptr_imag2,
+                                                 PacketBlock<Packet, accRows>& accReal,
+                                                 PacketBlock<Packet, accRows>& accImag) {
+  MICRO_COMPLEX_BROADCAST_EXTRA
+  pgerc<accRows, Scalar, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(&accReal, &accImag, lhs_ptr_real,
+                                                                                   lhs_ptr_imag, rhsV, rhsVi);
+  MICRO_COMPLEX_ADD_COLS(1)
+}
+
+template <typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows,
+          const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal,
+          const Index remaining_rows>
+EIGEN_ALWAYS_INLINE void gemm_unrolled_complex_row_iteration(const DataMapper& res, const Scalar* lhs_base,
+                                                             const Scalar* rhs_base, Index depth, Index strideA,
+                                                             Index offsetA, Index strideB, Index row, Index rows,
+                                                             const Packet& pAlphaReal, const Packet& pAlphaImag,
+                                                             const Packet& pMask) {
+  const Scalar *rhs_ptr_real0 = rhs_base, *rhs_ptr_real1 = NULL, *rhs_ptr_real2 = NULL;
+  const Scalar *rhs_ptr_imag0 = NULL, *rhs_ptr_imag1 = NULL, *rhs_ptr_imag2 = NULL;
+  const Scalar* lhs_ptr_real = lhs_base + advanceRows * row * strideA + remaining_rows * offsetA;
+  const Scalar* lhs_ptr_imag = NULL;
+  if (!LhsIsReal)
+    lhs_ptr_imag = lhs_ptr_real + remaining_rows * strideA;
+  else
+    EIGEN_UNUSED_VARIABLE(lhs_ptr_imag);
+  PacketBlock<Packet, accRows> accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3;
+  PacketBlock<Packet, accRows> taccReal, taccImag;
+  PacketBlock<Packetc, accRows> acc0, acc1;
+  PacketBlock<Packetc, accRows * 2> tRes;
+
+  MICRO_COMPLEX_SRC2_PTR
+
+  bsetzero<Packet, accRows>(accReal0);
+  bsetzero<Packet, accRows>(accImag0);
+
+  Index remaining_depth = depth & -quad_traits<Scalar>::rows;
+  Index k = 0;
+  if (remaining_depth >= PEEL_COMPLEX_ROW) {
+    MICRO_COMPLEX_ZERO_PEEL_ROW
+    do {
+      MICRO_COMPLEX_PREFETCHN(accRows)
+      EIGEN_POWER_PREFETCH(lhs_ptr_real);
+      if (!LhsIsReal) {
+        EIGEN_POWER_PREFETCH(lhs_ptr_imag);
+      }
+      MICRO_COMPLEX_WORK_PEEL_ROW
+    } while ((k += PEEL_COMPLEX_ROW) + PEEL_COMPLEX_ROW <= remaining_depth);
+    MICRO_COMPLEX_ADD_PEEL_ROW
+  }
+  for (; k < depth; k++) {
+    MICRO_COMPLEX_EXTRA_ROW<Scalar, Packet, accRows, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal, remaining_rows>(
+        lhs_ptr_real, lhs_ptr_imag, rhs_ptr_real0, rhs_ptr_real1, rhs_ptr_real2, rhs_ptr_imag0, rhs_ptr_imag1,
+        rhs_ptr_imag2, accReal0, accImag0);
+  }
+
+  constexpr bool full = (remaining_rows > accColsC);
+  bload<DataMapper, Packetc, accColsC, ColMajor, true, accRows, full>(tRes, res, row, 0);
+  if ((accRows == 1) || (rows >= accCols)) {
+    bscalec<Packet, accRows, true>(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
+    bcouple<Packet, Packetc, accRows, full>(taccReal, taccImag, tRes, acc0, acc1);
+    bstore<DataMapper, Packetc, accRows>(acc0, res, row + 0);
+    if (full) {
+      bstore<DataMapper, Packetc, accRows>(acc1, res, row + accColsC);
+    }
+  } else {
+    bscalec<Packet, accRows, false>(accReal0, accImag0, pAlphaReal, pAlphaImag, taccReal, taccImag, pMask);
+    bcouple<Packet, Packetc, accRows, full>(taccReal, taccImag, tRes, acc0, acc1);
+
+    if ((sizeof(Scalar) == sizeof(float)) && (remaining_rows == 1)) {
+      for (Index j = 0; j < accRows; j++) {
+        res(row + 0, j) = pfirst<Packetc>(acc0.packet[j]);
+      }
+    } else {
+      bstore<DataMapper, Packetc, accRows>(acc0, res, row + 0);
+      if (full) {
+        for (Index j = 0; j < accRows; j++) {
+          res(row + accColsC, j) = pfirst<Packetc>(acc1.packet[j]);
+        }
+      }
+    }
+  }
+}
+
+#define MICRO_COMPLEX_EXTRA_ROWS(N)                                                                        \
+  gemm_unrolled_complex_row_iteration<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, \
+                                      ConjugateRhs, LhsIsReal, RhsIsReal, N>(                              \
+      res, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, rows, pAlphaReal, pAlphaImag, pMask);
+
+template <typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows,
+          const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base,
+                                                Index depth, Index strideA, Index offsetA, Index strideB, Index row,
+                                                Index rows, Index remaining_rows, const Packet& pAlphaReal,
+                                                const Packet& pAlphaImag, const Packet& pMask) {
+  MICRO_EXTRA(MICRO_COMPLEX_EXTRA_ROWS, remaining_rows, false)
+}
+
+#define MICRO_COMPLEX_UNROLL_WORK(func, func2, peel) \
+  MICRO_COMPLEX_UNROLL(func2);                       \
+  func(0, peel) func(1, peel) func(2, peel) func(3, peel)
+
+#define MICRO_COMPLEX_WORK_ONE4(iter, peel)                                                \
+  if (unroll_factor > iter) {                                                              \
+    pgerc_common<accRows, Packet, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(       \
+        &accReal##iter, &accImag##iter, lhsV##iter, lhsVi##iter, rhsV##peel, rhsVi##peel); \
+  }
+
+#define MICRO_COMPLEX_TYPE_PEEL4(func, func2, peel) \
+  if (PEEL_COMPLEX > peel) {                        \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3;              \
+    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3;          \
+    MICRO_COMPLEX_BROADCAST(peel)                   \
+    MICRO_COMPLEX_UNROLL_WORK(func, func2, peel)    \
+  } else {                                          \
+    EIGEN_UNUSED_VARIABLE(rhsV##peel);              \
+    EIGEN_UNUSED_VARIABLE(rhsVi##peel);             \
+  }
+
+#define MICRO_COMPLEX_UNROLL_TYPE_PEEL(M, func, func1, func2) \
+  Packet rhsV0[M], rhsV1[M], rhsV2[M], rhsV3[M];              \
+  Packet rhsVi0[M], rhsVi1[M], rhsVi2[M], rhsVi3[M];          \
+  func(func1, func2, 0) func(func1, func2, 1) func(func1, func2, 2) func(func1, func2, 3)
+
+#define MICRO_COMPLEX_UNROLL_TYPE_ONE(M, func, func1, func2) \
+  Packet rhsV0[M], rhsVi0[M];                                \
+  func(func1, func2, 0)
+
+#define MICRO_COMPLEX_UNROLL_TYPE(MICRO_COMPLEX_TYPE, size)                                        \
+  MICRO_COMPLEX_TYPE(4, MICRO_COMPLEX_TYPE_PEEL4, MICRO_COMPLEX_WORK_ONE4, MICRO_COMPLEX_LOAD_ONE) \
+  MICRO_COMPLEX_ADD_ROWS(size, false)
+
+#define MICRO_COMPLEX_ONE_PEEL4 MICRO_COMPLEX_UNROLL_TYPE(MICRO_COMPLEX_UNROLL_TYPE_PEEL, PEEL_COMPLEX)
+
+#define MICRO_COMPLEX_ONE4 MICRO_COMPLEX_UNROLL_TYPE(MICRO_COMPLEX_UNROLL_TYPE_ONE, 1)
+
+#define MICRO_COMPLEX_DST_PTR_ONE(iter)       \
+  if (unroll_factor > iter) {                 \
+    bsetzero<Packet, accRows>(accReal##iter); \
+    bsetzero<Packet, accRows>(accImag##iter); \
+  } else {                                    \
+    EIGEN_UNUSED_VARIABLE(accReal##iter);     \
+    EIGEN_UNUSED_VARIABLE(accImag##iter);     \
+  }
+
+#define MICRO_COMPLEX_DST_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_DST_PTR_ONE)
+
+#define MICRO_COMPLEX_SRC_PTR MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE)
+
+#define MICRO_COMPLEX_PREFETCH MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)
+
+#define MICRO_COMPLEX_STORE_ONE(iter)                                                                               \
+  if (unroll_factor > iter) {                                                                                       \
+    constexpr bool full = ((MICRO_NORMAL(iter)) || (accCols2 > accColsC));                                          \
+    bload<DataMapper, Packetc, accColsC, ColMajor, true, accRows, full>(tRes, res, row + iter * accCols, 0);        \
+    bscalec<Packet, accRows, !(MICRO_NORMAL(iter))>(accReal##iter, accImag##iter, pAlphaReal, pAlphaImag, taccReal, \
+                                                    taccImag, pMask);                                               \
+    bcouple<Packet, Packetc, accRows, full>(taccReal, taccImag, tRes, acc0, acc1);                                  \
+    bstore<DataMapper, Packetc, accRows>(acc0, res, row + iter * accCols + 0);                                      \
+    if (full) {                                                                                                     \
+      bstore<DataMapper, Packetc, accRows>(acc1, res, row + iter * accCols + accColsC);                             \
+    }                                                                                                               \
+  }
+
+#define MICRO_COMPLEX_STORE MICRO_COMPLEX_UNROLL(MICRO_COMPLEX_STORE_ONE)
+
+template <int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename DataMapper,
+          const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs, bool ConjugateRhs,
+          bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_iteration(const DataMapper& res, const Scalar* lhs_base,
+                                                         const Scalar* rhs_base, Index depth, Index strideA,
+                                                         Index offsetA, Index strideB, Index& row,
+                                                         const Packet& pAlphaReal, const Packet& pAlphaImag,
+                                                         const Packet& pMask) {
+  const Scalar *rhs_ptr_real0 = rhs_base, *rhs_ptr_real1 = NULL, *rhs_ptr_real2 = NULL;
+  const Scalar *rhs_ptr_imag0 = NULL, *rhs_ptr_imag1 = NULL, *rhs_ptr_imag2 = NULL;
+  const Index imag_delta = accCols * strideA;
+  const Index imag_delta2 = accCols2 * strideA;
+  const Scalar *lhs_ptr_real0 = NULL, *lhs_ptr_real1 = NULL;
+  const Scalar *lhs_ptr_real2 = NULL, *lhs_ptr_real3 = NULL;
+  PacketBlock<Packet, accRows> accReal0, accImag0, accReal1, accImag1;
+  PacketBlock<Packet, accRows> accReal2, accImag2, accReal3, accImag3;
+  PacketBlock<Packet, accRows> taccReal, taccImag;
+  PacketBlock<Packetc, accRows> acc0, acc1;
+  PacketBlock<Packetc, accRows * 2> tRes;
+
+  MICRO_COMPLEX_SRC2_PTR
+  MICRO_COMPLEX_SRC_PTR
+  MICRO_COMPLEX_DST_PTR
+
+  Index k = 0;
+  for (; k + PEEL_COMPLEX <= depth; k += PEEL_COMPLEX) {
+    MICRO_COMPLEX_PREFETCHN(accRows)
+    MICRO_COMPLEX_PREFETCH
+    MICRO_COMPLEX_ONE_PEEL4
+  }
+  for (; k < depth; k++) {
+    MICRO_COMPLEX_ONE4
+  }
+  MICRO_COMPLEX_STORE
+
+  MICRO_COMPLEX_UPDATE
+}
+
+#define MICRO_COMPLEX_UNROLL_ITER2(N, M)                                                                  \
+  gemm_complex_unrolled_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, DataMapper, accRows, accCols, \
+                                  M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(     \
+      res3, lhs_base, rhs_base, depth, strideA, offsetA, strideB, row, pAlphaReal, pAlphaImag, pMask);    \
+  if (M) return;
+
+template <typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows,
+          const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
+                                           Index depth, Index strideA, Index offsetA, Index strideB, Index offsetB,
+                                           Index col, Index rows, Index remaining_rows, const Packet& pAlphaReal,
+                                           const Packet& pAlphaImag, const Packet& pMask) {
+  const DataMapper res3 = res.getSubMapper(0, col);
+
+  const Scalar* rhs_base = blockB + advanceCols * col * strideB + MICRO_NEW_ROWS * offsetB;
+  const Scalar* lhs_base = blockA + accCols * offsetA;
+  Index row = 0;
+
+#define MAX_COMPLEX_UNROLL 4
+  while (row + MAX_COMPLEX_UNROLL * accCols <= rows) {
+    MICRO_COMPLEX_UNROLL_ITER2(MAX_COMPLEX_UNROLL, 0);
+  }
+  switch ((rows - row) / accCols) {
+#if MAX_COMPLEX_UNROLL > 4
+    case 4:
+      MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 4)
+      break;
+#endif
+#if MAX_COMPLEX_UNROLL > 3
+    case 3:
+      MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 3)
+      break;
+#endif
+#if MAX_COMPLEX_UNROLL > 2
+    case 2:
+      MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 2)
+      break;
+#endif
+#if MAX_COMPLEX_UNROLL > 1
+    case 1:
+      MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_UNROLL_ITER2, 1)
+      break;
+#endif
+    default:
+      break;
+  }
+#undef MAX_COMPLEX_UNROLL
+
+  if (remaining_rows > 0) {
+    gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal,
+                           RhsIsReal>(res3, blockA, rhs_base, depth, strideA, offsetA, strideB, row, rows,
+                                      remaining_rows, pAlphaReal, pAlphaImag, pMask);
+  }
+}
+
+#define MICRO_COMPLEX_EXTRA_COLS(N)                                                                         \
+  gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, N, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, \
+                    RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows,   \
+                               remaining_rows, pAlphaReal, pAlphaImag, pMask);
+
+template <typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accCols,
+          bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_extra_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
+                                                 Index depth, Index strideA, Index offsetA, Index strideB,
+                                                 Index offsetB, Index col, Index rows, Index cols, Index remaining_rows,
+                                                 const Packet& pAlphaReal, const Packet& pAlphaImag,
+                                                 const Packet& pMask) {
+  MICRO_EXTRA(MICRO_COMPLEX_EXTRA_COLS, cols - col, true)
+}
+
+template <typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Packet, typename Packetc,
+          typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs,
+          bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_STRONG_INLINE void gemm_complex(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc,
+                                      Index rows, Index depth, Index cols, Scalarc alpha, Index strideA, Index strideB,
+                                      Index offsetA, Index offsetB) {
+  const Index remaining_rows = rows % accCols;
+
+  if (strideA == -1) strideA = depth;
+  if (strideB == -1) strideB = depth;
+
+  const Packet pAlphaReal = pset1<Packet>(alpha.real());
+  const Packet pAlphaImag = pset1<Packet>(alpha.imag());
+  const Packet pMask = bmask<Packet>(remaining_rows);
+
+  const Scalar* blockA = (Scalar*)blockAc;
+  const Scalar* blockB = (Scalar*)blockBc;
+
+  Index col = 0;
+  for (; col + accRows <= cols; col += accRows) {
+    gemm_complex_cols<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal,
+                      RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows,
+                                 remaining_rows, pAlphaReal, pAlphaImag, pMask);
+  }
+
+  if (col != cols) {
+    gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal,
+                            RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols,
+                                       remaining_rows, pAlphaReal, pAlphaImag, pMask);
+  }
+}
+
+#undef accColsC
+#undef advanceCols
+#undef advanceRows
+
+EIGEN_ALWAYS_INLINE bool supportsMMA() {
+#if defined(EIGEN_ALTIVEC_MMA_ONLY)
+  return true;
+#elif defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH) && defined(__BUILTIN_CPU_SUPPORTS__)
+  return __builtin_cpu_supports("arch_3_1") && __builtin_cpu_supports("mma");
+#else
+  return false;  // No dynamic dispatch for LLVM or older GCC
+#endif
+}
+
+EIGEN_ALWAYS_INLINE Packet4f loadAndMultiplyF32(Packet4f acc, const Packet4f pAlpha, float* result) {
+  Packet4f result_block = ploadu<Packet4f>(result);
+  return pmadd(acc, pAlpha, result_block);
+}
+
+template <bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void storeF32(float*& result, Packet4f result_block, Index rows, Index extra_rows) {
+  if (lhsExtraRows) {
+    pstoreu_partial(result, result_block, extra_rows);
+  } else {
+    pstoreu(result, result_block);
+  }
+  result += rows;
+}
+
+template <bool rhsExtraCols, bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void storeResults(Packet4f (&acc)[4], Index rows, const Packet4f pAlpha, float* result,
+                                      Index extra_cols, Index extra_rows) {
+  Index x = 0;
+  if (rhsExtraCols) {
+    do {
+      Packet4f result_block = loadAndMultiplyF32(acc[x], pAlpha, result);
+      storeF32<lhsExtraRows>(result, result_block, rows, extra_rows);
+    } while (++x < extra_cols);
+  } else {
+    Packet4f result_block[4];
+    float* result2 = result;
+    do {
+      result_block[x] = loadAndMultiplyF32(acc[x], pAlpha, result);
+      result += rows;
+    } while (++x < 4);
+    x = 0;
+    do {
+      storeF32<lhsExtraRows>(result2, result_block[x], rows, extra_rows);
+    } while (++x < 4);
+  }
+}
+
+EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Hi(Packet8us data) {
+  Packet8us z = pset1<Packet8us>(0);
+#ifdef _BIG_ENDIAN
+  return reinterpret_cast<Packet4f>(vec_mergeh(data, z));
+#else
+  return reinterpret_cast<Packet4f>(vec_mergeh(z, data));
+#endif
+}
+
+EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Lo(Packet8us data) {
+  Packet8us z = pset1<Packet8us>(0);
+#ifdef _BIG_ENDIAN
+  return reinterpret_cast<Packet4f>(vec_mergel(data, z));
+#else
+  return reinterpret_cast<Packet4f>(vec_mergel(z, data));
+#endif
+}
+
+template <Index N, Index M>
+EIGEN_ALWAYS_INLINE void storeConvertTwoBF16(float* to, PacketBlock<Packet8bf, (N + 7) / 8>& block, Index extra = 0) {
+  if (N < 4) {
+    pstoreu_partial(to + 0, oneConvertBF16Hi(block.packet[0].m_val), extra);
+  } else if (N >= (M * 8 + 4)) {
+    pstoreu(to + 0, oneConvertBF16Hi(block.packet[M].m_val));
+    if (N >= 8) {
+      pstoreu(to + 4, oneConvertBF16Lo(block.packet[M].m_val));
+    }
+  }
+}
+
+template <Index N>
+EIGEN_ALWAYS_INLINE void storeConvertBlockBF16(float* to, PacketBlock<Packet8bf, (N + 7) / 8>& block, Index extra) {
+  storeConvertTwoBF16<N, 0>(to + 0, block, extra);
+  if (N >= 16) {
+    storeConvertTwoBF16<N, 1>(to + 8, block);
+  }
+  if (N >= 32) {
+    storeConvertTwoBF16<N, 2>(to + 16, block);
+    storeConvertTwoBF16<N, 3>(to + 24, block);
+  }
+}
+
+template <bool non_unit_stride, Index delta>
+EIGEN_ALWAYS_INLINE Packet8bf loadBF16fromResult(bfloat16* src, Index resInc) {
+  if (non_unit_stride) {
+    return pgather<bfloat16, Packet8bf>(src + delta * resInc, resInc);
+  } else {
+    return ploadu<Packet8bf>(src + delta);
+  }
+}
+
+static Packet16uc p16uc_MERGE16_32_1 = {0, 1, 16, 17, 2, 3, 18, 19, 0, 1, 16, 17, 2, 3, 18, 19};
+static Packet16uc p16uc_MERGE16_32_2 = {4, 5, 20, 21, 6, 7, 22, 23, 4, 5, 20, 21, 6, 7, 22, 23};
+static Packet16uc p16uc_MERGE16_32_3 = {8, 9, 24, 25, 10, 11, 26, 27, 8, 9, 24, 25, 10, 11, 26, 27};
+static Packet16uc p16uc_MERGE16_32_4 = {12, 13, 28, 29, 14, 15, 30, 31, 12, 13, 28, 29, 14, 15, 30, 31};
+
+static Packet16uc p16uc_MERGE16_32_5 = {0, 1, 16, 17, 16, 17, 16, 17, 0, 1, 16, 17, 16, 17, 16, 17};
+static Packet16uc p16uc_MERGE16_32_6 = {2, 3, 18, 19, 18, 19, 18, 19, 2, 3, 18, 19, 18, 19, 18, 19};
+static Packet16uc p16uc_MERGE16_32_7 = {4, 5, 20, 21, 20, 21, 20, 21, 4, 5, 20, 21, 20, 21, 20, 21};
+static Packet16uc p16uc_MERGE16_32_8 = {6, 7, 22, 23, 22, 23, 22, 23, 6, 7, 22, 23, 22, 23, 22, 23};
+
+EIGEN_ALWAYS_INLINE Packet4f oneConvertBF16Perm(Packet8us data, Packet16uc mask) {
+  Packet8us z = pset1<Packet8us>(0);
+#ifdef _BIG_ENDIAN
+  return reinterpret_cast<Packet4f>(vec_perm(data, z, mask));
+#else
+  return reinterpret_cast<Packet4f>(vec_perm(z, data, mask));
+#endif
+}
+
+template <bool lhsExtraRows, bool odd, Index size>
+EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32DupOne(float* result, Index rows, const bfloat16* src,
+                                                            Index extra_rows) {
+  Packet4f dup[4 * 4];
+  Packet8bf data[4];
+
+  for (Index i = 0; i < size; i++) {
+    data[i] = ploadu<Packet8bf>(src + rows * i);
+  }
+
+  for (Index i = 0, j = 0; i < size; i++, j += 4) {
+    dup[j + 0] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_5 : p16uc_MERGE16_32_1);
+    dup[j + 1] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_6 : p16uc_MERGE16_32_2);
+    dup[j + 2] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_7 : p16uc_MERGE16_32_3);
+    dup[j + 3] = oneConvertBF16Perm(data[i].m_val, odd ? p16uc_MERGE16_32_8 : p16uc_MERGE16_32_4);
+  }
+
+  for (Index j = 0; j < 4 * size; j += 4) {
+    if (lhsExtraRows) {
+      Packet4f z = pset1<Packet4f>(float(0));
+      Index i = 0;
+      do {
+        pstoreu(result + (j + i) * 4, dup[j + i]);
+      } while (++i < extra_rows);
+      do {
+        pstoreu(result + (j + i) * 4, z);
+      } while (++i < 4);
+    } else {
+      for (Index i = 0; i < 4; i++) {
+        pstoreu(result + (j + i) * 4, dup[j + i]);
+      }
+    }
+  }
+}
+
+template <bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32Dup(float* result, Index cols, Index rows, const bfloat16* src,
+                                                         Index delta, Index extra_rows) {
+  Index col = 0;
+  src += delta * 2;
+  for (; col + 4 * 2 <= cols; col += 4 * 2, result += 4 * 4 * 4, src += 4 * rows) {
+    convertArrayPointerBF16toF32DupOne<lhsExtraRows, false, 4>(result, rows, src, extra_rows);
+  }
+  for (; col + 2 <= cols; col += 2, result += 4 * 4, src += rows) {
+    convertArrayPointerBF16toF32DupOne<lhsExtraRows, false, 1>(result, rows, src, extra_rows);
+  }
+  if (cols & 1) {
+    convertArrayPointerBF16toF32DupOne<lhsExtraRows, true, 1>(result, rows, src - delta, extra_rows);
+  }
+}
+
+template <const Index size, bool non_unit_stride>
+EIGEN_ALWAYS_INLINE void convertPointerBF16toF32(Index& i, float* result, Index rows, bfloat16*& src, Index resInc) {
+  constexpr Index extra = ((size < 4) ? 4 : size);
+  while (i + size <= rows) {
+    PacketBlock<Packet8bf, (size + 7) / 8> r32;
+    r32.packet[0] = loadBF16fromResult<non_unit_stride, 0>(src, resInc);
+    if (size >= 16) {
+      r32.packet[1] = loadBF16fromResult<non_unit_stride, 8>(src, resInc);
+    }
+    if (size >= 32) {
+      r32.packet[2] = loadBF16fromResult<non_unit_stride, 16>(src, resInc);
+      r32.packet[3] = loadBF16fromResult<non_unit_stride, 24>(src, resInc);
+    }
+    storeConvertBlockBF16<size>(result + i, r32, rows & 3);
+    i += extra;
+    src += extra * resInc;
+    if (size != 32) break;
+  }
+}
+
+template <bool non_unit_stride>
+EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32(float* result, Index cols, Index rows, bfloat16* src,
+                                                      Index resInc) {
+  for (Index col = 0; col < cols; col++, src += (rows * resInc), result += rows) {
+    Index i = 0;
+    bfloat16* src2 = src;
+    convertPointerBF16toF32<32, non_unit_stride>(i, result, rows, src2, resInc);
+    convertPointerBF16toF32<16, non_unit_stride>(i, result, rows, src2, resInc);
+    convertPointerBF16toF32<8, non_unit_stride>(i, result, rows, src2, resInc);
+    convertPointerBF16toF32<4, non_unit_stride>(i, result, rows, src2, resInc);
+    convertPointerBF16toF32<1, non_unit_stride>(i, result, rows, src2, resInc);
+  }
+}
+
+template <Index num_acc, Index size = 4>
+EIGEN_ALWAYS_INLINE void zeroAccumulators(Packet4f (&acc)[num_acc][size]) {
+  Packet4f z = pset1<Packet4f>(float(0));
+
+  for (Index k = 0; k < num_acc; k++) {
+    for (Index j = 0; j < size; j++) {
+      acc[k][j] = z;
+    }
+  }
+}
+
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void tranposeResults(Packet4f (&acc)[num_acc][4]) {
+  for (Index i = 0; i < num_acc; i++) {
+    Packet4ui t0, t1, t2, t3;
+    t0 = vec_mergeh(reinterpret_cast<Packet4ui>(acc[i][0]), reinterpret_cast<Packet4ui>(acc[i][2]));
+    t1 = vec_mergel(reinterpret_cast<Packet4ui>(acc[i][0]), reinterpret_cast<Packet4ui>(acc[i][2]));
+    t2 = vec_mergeh(reinterpret_cast<Packet4ui>(acc[i][1]), reinterpret_cast<Packet4ui>(acc[i][3]));
+    t3 = vec_mergel(reinterpret_cast<Packet4ui>(acc[i][1]), reinterpret_cast<Packet4ui>(acc[i][3]));
+    acc[i][0] = reinterpret_cast<Packet4f>(vec_mergeh(t0, t2));
+    acc[i][1] = reinterpret_cast<Packet4f>(vec_mergel(t0, t2));
+    acc[i][2] = reinterpret_cast<Packet4f>(vec_mergeh(t1, t3));
+    acc[i][3] = reinterpret_cast<Packet4f>(vec_mergel(t1, t3));
+  }
+}
+
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void addResults(Packet4f (&acc)[num_acc][4]) {
+  for (Index i = 0, j = 0; j < num_acc; i++, j += 2) {
+    for (Index x = 0, y = 0; x < 2; x++, y += 2) {
+      for (Index w = 0, z = 0; w < 2; w++, z += 2) {
+        acc[i][y + w] = acc[j + x][z + 0] + acc[j + x][z + 1];
+      }
+    }
+  }
+}
+
+template <Index num_acc, bool rhsExtraCols, bool lhsExtraRows, Index num_rhs>
+EIGEN_ALWAYS_INLINE void outputResultsVSX(Packet4f (&acc)[num_acc][4], Index rows, const Packet4f pAlpha, float* result,
+                                          const Index extra_cols, Index extra_rows) {
+  tranposeResults<num_acc>(acc);
+  addResults<num_acc>(acc);
+
+  constexpr Index real_rhs = ((num_rhs / 2) - (rhsExtraCols ? 1 : 0));
+  Index k = 0;
+  for (Index i = 0; i < real_rhs; i++, result += 4 * rows, k++) {
+    storeResults<false, lhsExtraRows>(acc[k], rows, pAlpha, result, extra_cols, extra_rows);
+  }
+  if (rhsExtraCols) {
+    storeResults<rhsExtraCols, lhsExtraRows>(acc[k], rows, pAlpha, result, extra_cols, extra_rows);
+  }
+}
+
+template <bool zero>
+EIGEN_ALWAYS_INLINE void loadTwoRhsFloat32(const float* block, Index strideB, Index i, Packet4f& dhs0, Packet4f& dhs1) {
+  dhs0 = ploadu<Packet4f>(block + strideB * i + 0);
+  if (zero) {
+    Packet4f dhs2 = pset1<Packet4f>(float(0));
+    dhs1 = vec_mergel(dhs0, dhs2);
+    dhs0 = vec_mergeh(dhs0, dhs2);
+  } else {
+    dhs1 = ploadu<Packet4f>(block + strideB * i + 4);
+  }
+}
+
+template <Index num_acc, bool zero, bool rhsExtraCols, Index num_rhs>
+EIGEN_ALWAYS_INLINE void KLoop(const float* indexA, const float* indexB, Packet4f (&acc)[num_acc][4], Index strideB,
+                               Index k, Index offsetB, Index extra_cols) {
+  constexpr Index num_lhs = 4;
+  Packet4f lhs[num_lhs], rhs[num_rhs];
+
+  constexpr Index real_rhs = (num_rhs - (rhsExtraCols ? 2 : 0));
+  for (Index i = 0; i < real_rhs; i += 2) {
+    loadTwoRhsFloat32<zero>(indexB + k * 4, strideB, i, rhs[i + 0], rhs[i + 1]);
+  }
+  if (rhsExtraCols) {
+    loadTwoRhsFloat32<zero>(indexB + k * extra_cols - offsetB, strideB, real_rhs, rhs[real_rhs + 0], rhs[real_rhs + 1]);
+  }
+
+  indexA += 2 * k * 4;
+  for (Index j = 0; j < num_lhs; j++) {
+    lhs[j] = ploadu<Packet4f>(indexA + j * 4);
+  }
+
+  for (Index j = 0; j < num_rhs; j++) {
+    for (Index i = 0; i < num_lhs; i++) {
+      acc[j][i] = pmadd(rhs[j], lhs[i], acc[j][i]);
+    }
+  }
+}
+
+template <const Index num_acc, bool rhsExtraCols, bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void colVSXLoopBodyIter(Index depth, Index rows, const Packet4f pAlpha, const float* indexA,
+                                            const float* indexB, Index strideB, Index offsetB, float* result,
+                                            const Index extra_cols, const Index extra_rows) {
+  constexpr Index num_rhs = num_acc;
+
+  Packet4f acc[num_acc][4];
+
+  zeroAccumulators<num_acc>(acc);
+
+  Index k;
+  for (k = 0; k + 2 <= depth; k += 2) {
+    KLoop<num_acc, false, rhsExtraCols, num_rhs>(indexA, indexB, acc, strideB, k, offsetB, extra_cols);
+  }
+  if (depth & 1) {
+    KLoop<num_acc, true, rhsExtraCols, num_rhs>(indexA, indexB, acc, strideB, k, offsetB, extra_cols);
+  }
+
+  outputResultsVSX<num_acc, rhsExtraCols, lhsExtraRows, num_rhs>(acc, rows, pAlpha, result, extra_cols, extra_rows);
+}
+
+// No more than 4 (uses 2X the accumulators or 8X the number of VSX registers)
+#define MAX_BFLOAT16_ACC_VSX 4
+
+template <const Index num_acc, bool rhsExtraCols, bool lhsExtraRows>
+void colVSXLoopBody(Index& col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const float* indexA,
+                    const float* indexB, Index strideB, Index offsetB, float* result) {
+  constexpr Index step = (num_acc * 4);  // each accumulator has 4 elements
+  const Index extra_cols = (rhsExtraCols) ? (cols & 3) : 0;
+  const Index extra_rows = (lhsExtraRows) ? (rows & 3) : 0;
+  constexpr bool multiIters = !rhsExtraCols && (num_acc == MAX_BFLOAT16_ACC_VSX);
+
+  do {
+    colVSXLoopBodyIter<num_acc * 2, rhsExtraCols, lhsExtraRows>(depth, rows, pAlpha, indexA, indexB, strideB, offsetB,
+                                                                result, extra_cols, extra_rows);
+
+    indexB += strideB * (num_acc * 2);
+    result += rows * step;
+  } while (multiIters && (step <= cols - (col += step)));
+}
+
+template <const Index num_acc, bool rhsExtraCols, bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void colVSXLoopBodyExtraN(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha,
+                                              const float* indexA, const float* blockB, Index strideB, Index offsetB,
+                                              float* result) {
+  if (MAX_BFLOAT16_ACC_VSX > num_acc) {
+    colVSXLoopBody<num_acc + (rhsExtraCols ? 1 : 0), rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA,
+                                                                                 blockB, strideB, offsetB, result);
+  }
+}
+
+template <bool rhsExtraCols, bool lhsExtraRows>
+void colVSXLoopBodyExtra(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const float* indexA,
+                         const float* blockB, Index strideB, Index offsetB, float* result) {
+  switch ((cols - col) >> 2) {
+    case 3:
+      colVSXLoopBodyExtraN<3, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB,
+                                                          offsetB, result);
+      break;
+    case 2:
+      colVSXLoopBodyExtraN<2, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB,
+                                                          offsetB, result);
+      break;
+    case 1:
+      colVSXLoopBodyExtraN<1, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB,
+                                                          offsetB, result);
+      break;
+    default:
+      if (rhsExtraCols) {
+        colVSXLoopBody<1, true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
+      }
+      break;
+  }
+}
+
+template <Index size, bool lhsExtraRows = false>
+EIGEN_ALWAYS_INLINE void colVSXLoops(Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA,
+                                     const float* indexA2, const float* blockB2, Index strideA, Index strideB,
+                                     Index offsetB, float* result2) {
+  Index delta_rows = 2 * (lhsExtraRows ? (rows & 3) : size);
+  for (Index row = 0; row < size; row += 4) {
+    convertArrayPointerBF16toF32Dup<lhsExtraRows>(const_cast<float*>(indexA2), strideA, delta_rows, indexA, row,
+                                                  rows & 3);
+
+    const float* blockB = blockB2;
+    float* result = result2 + row;
+
+    Index col = 0;
+    if (cols >= (MAX_BFLOAT16_ACC_VSX * 4)) {
+      colVSXLoopBody<MAX_BFLOAT16_ACC_VSX, false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA2, blockB,
+                                                                strideB, 0, result);
+      blockB += (strideB >> 1) * col;
+      result += rows * col;
+    }
+    if (cols & 3) {
+      colVSXLoopBodyExtra<true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA2, blockB, strideB, offsetB,
+                                              result);
+    } else {
+      colVSXLoopBodyExtra<false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA2, blockB, strideB, 0, result);
+    }
+  }
+}
+
+template <Index size>
+EIGEN_ALWAYS_INLINE void calcVSXColLoops(const bfloat16*& indexA, const float* indexA2, Index& row, Index depth,
+                                         Index cols, Index rows, const Packet4f pAlpha, const float* indexB,
+                                         Index strideA, Index strideB, Index offsetA, Index offsetB, Index bigSuffix,
+                                         float* result) {
+  if ((size == 16) || (rows & size)) {
+    indexA += size * offsetA;
+    colVSXLoops<size>(depth, cols, rows, pAlpha, indexA, indexA2, indexB, strideA, strideB, offsetB, result + row);
+    row += size;
+    indexA += bigSuffix * size / 16;
+  }
+}
+
+template <const Index size, typename DataMapper>
+EIGEN_ALWAYS_INLINE void convertBF16toF32(Index& i, float* result, Index rows, const DataMapper& src) {
+  constexpr Index extra = ((size < 4) ? 4 : size);
+  while (i + size <= rows) {
+    PacketBlock<Packet8bf, (size + 7) / 8> r32;
+    r32.packet[0] = src.template loadPacket<Packet8bf>(i + 0);
+    if (size >= 16) {
+      r32.packet[1] = src.template loadPacket<Packet8bf>(i + 8);
+    }
+    if (size >= 32) {
+      r32.packet[2] = src.template loadPacket<Packet8bf>(i + 16);
+      r32.packet[3] = src.template loadPacket<Packet8bf>(i + 24);
+    }
+    storeConvertBlockBF16<size>(result + i, r32, rows & 3);
+    i += extra;
+    if (size != 32) break;
+  }
+}
+
+template <typename DataMapper>
+EIGEN_ALWAYS_INLINE void convertArrayBF16toF32(float* result, Index cols, Index rows, const DataMapper& src) {
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  for (Index j = 0; j < cols; j++, result += rows) {
+    const LinearMapper src2 = src.getLinearMapper(0, j);
+    Index i = 0;
+    convertBF16toF32<32, LinearMapper>(i, result, rows, src2);
+    convertBF16toF32<16, LinearMapper>(i, result, rows, src2);
+    convertBF16toF32<8, LinearMapper>(i, result, rows, src2);
+    convertBF16toF32<4, LinearMapper>(i, result, rows, src2);
+    convertBF16toF32<1, LinearMapper>(i, result, rows, src2);
+  }
+}
+
+EIGEN_ALWAYS_INLINE Packet8bf convertF32toBF16VSX(const float* res) {
+  return F32ToBf16Both(ploadu<Packet4f>(res + 0), ploadu<Packet4f>(res + 4));
+}
+
+template <typename DataMapper, const Index size>
+EIGEN_ALWAYS_INLINE void convertArrayF32toBF16ColVSX(float* result, Index col, Index rows, const DataMapper& res) {
+  const DataMapper res2 = res.getSubMapper(0, col);
+  Index row;
+  float* result2 = result + col * rows;
+  for (row = 0; row + 8 <= rows; row += 8, result2 += 8) {
+    // get and save block
+    PacketBlock<Packet8bf, size> block;
+    for (Index j = 0; j < size; j++) {
+      block.packet[j] = convertF32toBF16VSX(result2 + j * rows);
+    }
+    res2.template storePacketBlock<Packet8bf, size>(row, 0, block);
+  }
+  // extra rows
+  if (row < rows) {
+    for (Index j = 0; j < size; j++) {
+      Packet8bf fp16 = convertF32toBF16VSX(result2 + j * rows);
+      res2.template storePacketPartial<Packet8bf>(row, j, fp16, rows & 7);
+    }
+  }
+}
+
+template <typename DataMapper>
+EIGEN_ALWAYS_INLINE void convertArrayF32toBF16VSX(float* result, Index cols, Index rows, const DataMapper& res) {
+  Index col;
+  for (col = 0; col + 4 <= cols; col += 4) {
+    convertArrayF32toBF16ColVSX<DataMapper, 4>(result, col, rows, res);
+  }
+  // extra cols
+  switch (cols - col) {
+    case 1:
+      convertArrayF32toBF16ColVSX<DataMapper, 1>(result, col, rows, res);
+      break;
+    case 2:
+      convertArrayF32toBF16ColVSX<DataMapper, 2>(result, col, rows, res);
+      break;
+    case 3:
+      convertArrayF32toBF16ColVSX<DataMapper, 3>(result, col, rows, res);
+      break;
+  }
+}
+
+template <typename DataMapper>
+void gemmbfloat16(const DataMapper& res, const bfloat16* indexA, const bfloat16* indexB, Index rows, Index depth,
+                  Index cols, bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  float falpha = Eigen::bfloat16_impl::bfloat16_to_float(alpha);
+  const Packet4f pAlpha = pset1<Packet4f>(falpha);
+
+  if (strideA == -1) strideA = depth;
+  if (strideB == -1) strideB = depth;
+
+  ei_declare_aligned_stack_constructed_variable(float, result, cols* rows, 0);
+  ei_declare_aligned_stack_constructed_variable(float, indexB2, strideB* cols, 0);
+  ei_declare_aligned_stack_constructed_variable(float, indexA2, ((strideA + 1) & -2) * 4 * 2, 0);
+
+  convertArrayBF16toF32<DataMapper>(result, cols, rows, res);
+  convertArrayPointerBF16toF32(indexB2, cols, strideB, const_cast<bfloat16*>(indexB));
+
+  Index bigSuffix = 2 * 8 * (strideA - offsetA);
+  float* indexBF32 = indexB2 + 4 * offsetB;
+  offsetB *= 3;
+  strideB *= 2;
+
+  Index row = 0;
+  // LHS (8x16) block
+  while (row + 16 <= rows) {
+    calcVSXColLoops<16>(indexA, indexA2, row, depth, cols, rows, pAlpha, indexBF32, strideA, strideB, offsetA, offsetB,
+                        bigSuffix, result);
+  }
+  // LHS (8x8) block
+  calcVSXColLoops<8>(indexA, indexA2, row, depth, cols, rows, pAlpha, indexBF32, strideA, strideB, offsetA, offsetB,
+                     bigSuffix, result);
+  // LHS (8x4) block
+  calcVSXColLoops<4>(indexA, indexA2, row, depth, cols, rows, pAlpha, indexBF32, strideA, strideB, offsetA, offsetB,
+                     bigSuffix, result);
+  // extra rows
+  if (rows & 3) {
+    // This index is the beginning of remaining block.
+    colVSXLoops<4, true>(depth, cols, rows, pAlpha, indexA, indexA2, indexBF32, strideA, strideB, offsetB,
+                         result + row);
+  }
+
+  // Convert back to bfloat16
+  convertArrayF32toBF16VSX<DataMapper>(result, cols, rows, res);
+}
+
+#undef MAX_BFLOAT16_ACC_VSX
+
+#include "MatrixVectorProduct.inc"
+
+/************************************
+ * ppc64le template specializations *
+ * **********************************/
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
+  void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>::operator()(
+    double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
+  dhs_pack<double, DataMapper, Packet2d, ColMajor, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
+  void operator()(double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<double, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>::operator()(
+    double* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
+  dhs_pack<double, DataMapper, Packet2d, RowMajor, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+#if EIGEN_ALTIVEC_USE_CUSTOM_PACK
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
+  void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<double, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
+    double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
+  dhs_pack<double, DataMapper, Packet2d, ColMajor, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
+  void operator()(double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<double, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>::operator()(
+    double* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
+  dhs_pack<double, DataMapper, Packet2d, RowMajor, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<bfloat16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
+  void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<bfloat16, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
+    bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
+  dhs_pack<bfloat16, DataMapper, Packet8bf, ColMajor, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<bfloat16, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
+  void operator()(bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<bfloat16, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>::operator()(
+    bfloat16* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
+  dhs_pack<bfloat16, DataMapper, Packet8bf, RowMajor, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+#endif
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
+  void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>::operator()(
+    bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
+  dhs_pack<bfloat16, DataMapper, Packet8bf, ColMajor, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
+  void operator()(bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<bfloat16, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>::operator()(
+    bfloat16* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
+  dhs_pack<bfloat16, DataMapper, Packet8bf, RowMajor, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
+  void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode>::operator()(
+    float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
+  dhs_pack<float, DataMapper, Packet4f, RowMajor, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
+  void operator()(float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0, Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<float, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode>::operator()(
+    float* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride, Index offset) {
+  dhs_pack<float, DataMapper, Packet4f, ColMajor, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
+  void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
+                  Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate,
+                   PanelMode>::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows,
+                                          Index stride, Index offset) {
+  dhs_cpack<float, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
+  void operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
+                  Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<float>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate,
+                   PanelMode>::operator()(std::complex<float>* blockA, const DataMapper& lhs, Index depth, Index rows,
+                                          Index stride, Index offset) {
+  dhs_cpack<float, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+#if EIGEN_ALTIVEC_USE_CUSTOM_PACK
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
+  void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<float, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
+    float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
+  dhs_pack<float, DataMapper, Packet4f, ColMajor, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
+  void operator()(float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0, Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<float, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>::operator()(
+    float* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
+  dhs_pack<float, DataMapper, Packet4f, RowMajor, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+#endif
+
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
+  void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
+                  Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
+    std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
+  dhs_cpack<float, DataMapper, Packet4f, Packet2cf, ColMajor, Conjugate, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
+  void operator()(std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
+                  Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<float>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>::operator()(
+    std::complex<float>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
+  dhs_cpack<float, DataMapper, Packet4f, Packet2cf, RowMajor, Conjugate, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
+  void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
+                  Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate,
+                   PanelMode>::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows,
+                                          Index stride, Index offset) {
+  dhs_cpack<double, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+struct gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
+  void operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
+                  Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate, bool PanelMode>
+void gemm_pack_lhs<std::complex<double>, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate,
+                   PanelMode>::operator()(std::complex<double>* blockA, const DataMapper& lhs, Index depth, Index rows,
+                                          Index stride, Index offset) {
+  dhs_cpack<double, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, true> pack;
+  pack(blockA, lhs, depth, rows, stride, offset);
+}
+
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
+  void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
+                  Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
+    std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
+  dhs_cpack<double, DataMapper, Packet2d, Packet1cd, ColMajor, Conjugate, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
+  void operator()(std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
+                  Index offset = 0);
+};
+
+template <typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+void gemm_pack_rhs<std::complex<double>, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode>::operator()(
+    std::complex<double>* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
+  dhs_cpack<double, DataMapper, Packet2d, Packet1cd, RowMajor, Conjugate, PanelMode, false> pack;
+  pack(blockB, rhs, depth, cols, stride, offset);
+}
+
+// ********* gebp specializations *********
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+  typedef typename quad_traits<float>::vectortype Packet;
+  typedef typename quad_traits<float>::rhstype RhsPacket;
+
+  void operator()(const DataMapper& res, const float* blockA, const float* blockB, Index rows, Index depth, Index cols,
+                  float alpha, Index strideA = -1, Index strideB = -1, Index offsetA = 0, Index offsetB = 0);
+};
+
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<float, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+    const DataMapper& res, const float* blockA, const float* blockB, Index rows, Index depth, Index cols, float alpha,
+    Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  const Index accRows = quad_traits<float>::rows;
+  const Index accCols = quad_traits<float>::size;
+  static void (*gemm_function)(const DataMapper&, const float*, const float*, Index, Index, Index, float, Index, Index,
+                               Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+      (supportsMMA()) ? &Eigen::internal::gemmMMA<float, Packet, RhsPacket, DataMapper, accRows, accCols> :
+#endif
+                      &Eigen::internal::gemm<float, Packet, RhsPacket, DataMapper, accRows, accCols>;
+  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
+
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+  typedef Packet4f Packet;
+  typedef Packet2cf Packetc;
+  typedef Packet4f RhsPacket;
+
+  void operator()(const DataMapper& res, const std::complex<float>* blockA, const std::complex<float>* blockB,
+                  Index rows, Index depth, Index cols, std::complex<float> alpha, Index strideA = -1,
+                  Index strideB = -1, Index offsetA = 0, Index offsetB = 0);
+};
+
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<float>, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs,
+                 ConjugateRhs>::operator()(const DataMapper& res, const std::complex<float>* blockA,
+                                           const std::complex<float>* blockB, Index rows, Index depth, Index cols,
+                                           std::complex<float> alpha, Index strideA, Index strideB, Index offsetA,
+                                           Index offsetB) {
+  const Index accRows = quad_traits<float>::rows;
+  const Index accCols = quad_traits<float>::size;
+  static void (*gemm_function)(const DataMapper&, const std::complex<float>*, const std::complex<float>*, Index, Index,
+                               Index, std::complex<float>, Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+      (supportsMMA()) ? &Eigen::internal::gemm_complexMMA<std::complex<float>, std::complex<float>, std::complex<float>,
+                                                          float, Packet, Packetc, RhsPacket, DataMapper, accRows,
+                                                          accCols, ConjugateLhs, ConjugateRhs, false, false>
+                      :
+#endif
+                      &Eigen::internal::gemm_complex<std::complex<float>, std::complex<float>, std::complex<float>,
+                                                     float, Packet, Packetc, RhsPacket, DataMapper, accRows, accCols,
+                                                     ConjugateLhs, ConjugateRhs, false, false>;
+  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
+
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+  typedef Packet4f Packet;
+  typedef Packet2cf Packetc;
+  typedef Packet4f RhsPacket;
+
+  void operator()(const DataMapper& res, const float* blockA, const std::complex<float>* blockB, Index rows,
+                  Index depth, Index cols, std::complex<float> alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
+};
+
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<float, std::complex<float>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+    const DataMapper& res, const float* blockA, const std::complex<float>* blockB, Index rows, Index depth, Index cols,
+    std::complex<float> alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  const Index accRows = quad_traits<float>::rows;
+  const Index accCols = quad_traits<float>::size;
+  static void (*gemm_function)(const DataMapper&, const float*, const std::complex<float>*, Index, Index, Index,
+                               std::complex<float>, Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+      (supportsMMA()) ? &Eigen::internal::gemm_complexMMA<float, std::complex<float>, std::complex<float>, float,
+                                                          Packet, Packetc, RhsPacket, DataMapper, accRows, accCols,
+                                                          ConjugateLhs, ConjugateRhs, true, false>
+                      :
+#endif
+                      &Eigen::internal::gemm_complex<float, std::complex<float>, std::complex<float>, float, Packet,
+                                                     Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs,
+                                                     ConjugateRhs, true, false>;
+  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
+
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+  typedef Packet4f Packet;
+  typedef Packet2cf Packetc;
+  typedef Packet4f RhsPacket;
+
+  void operator()(const DataMapper& res, const std::complex<float>* blockA, const float* blockB, Index rows,
+                  Index depth, Index cols, std::complex<float> alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
+};
+
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<float>, float, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+    const DataMapper& res, const std::complex<float>* blockA, const float* blockB, Index rows, Index depth, Index cols,
+    std::complex<float> alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  const Index accRows = quad_traits<float>::rows;
+  const Index accCols = quad_traits<float>::size;
+  static void (*gemm_function)(const DataMapper&, const std::complex<float>*, const float*, Index, Index, Index,
+                               std::complex<float>, Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+      (supportsMMA()) ? &Eigen::internal::gemm_complexMMA<std::complex<float>, float, std::complex<float>, float,
+                                                          Packet, Packetc, RhsPacket, DataMapper, accRows, accCols,
+                                                          ConjugateLhs, ConjugateRhs, false, true>
+                      :
+#endif
+                      &Eigen::internal::gemm_complex<std::complex<float>, float, std::complex<float>, float, Packet,
+                                                     Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs,
+                                                     ConjugateRhs, false, true>;
+  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
+
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+  typedef typename quad_traits<double>::vectortype Packet;
+  typedef typename quad_traits<double>::rhstype RhsPacket;
+
+  void operator()(const DataMapper& res, const double* blockA, const double* blockB, Index rows, Index depth,
+                  Index cols, double alpha, Index strideA = -1, Index strideB = -1, Index offsetA = 0,
+                  Index offsetB = 0);
+};
+
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<double, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+    const DataMapper& res, const double* blockA, const double* blockB, Index rows, Index depth, Index cols,
+    double alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  const Index accRows = quad_traits<double>::rows;
+  const Index accCols = quad_traits<double>::size;
+  static void (*gemm_function)(const DataMapper&, const double*, const double*, Index, Index, Index, double, Index,
+                               Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+      (supportsMMA()) ? &Eigen::internal::gemmMMA<double, Packet, RhsPacket, DataMapper, accRows, accCols> :
+#endif
+                      &Eigen::internal::gemm<double, Packet, RhsPacket, DataMapper, accRows, accCols>;
+  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
+
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+  typedef quad_traits<double>::vectortype Packet;
+  typedef Packet1cd Packetc;
+  typedef quad_traits<double>::rhstype RhsPacket;
+
+  void operator()(const DataMapper& res, const std::complex<double>* blockA, const std::complex<double>* blockB,
+                  Index rows, Index depth, Index cols, std::complex<double> alpha, Index strideA = -1,
+                  Index strideB = -1, Index offsetA = 0, Index offsetB = 0);
+};
+
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<double>, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs,
+                 ConjugateRhs>::operator()(const DataMapper& res, const std::complex<double>* blockA,
+                                           const std::complex<double>* blockB, Index rows, Index depth, Index cols,
+                                           std::complex<double> alpha, Index strideA, Index strideB, Index offsetA,
+                                           Index offsetB) {
+  const Index accRows = quad_traits<double>::rows;
+  const Index accCols = quad_traits<double>::size;
+  static void (*gemm_function)(const DataMapper&, const std::complex<double>*, const std::complex<double>*, Index,
+                               Index, Index, std::complex<double>, Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+      (supportsMMA())
+          ? &Eigen::internal::gemm_complexMMA<std::complex<double>, std::complex<double>, std::complex<double>, double,
+                                              Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs,
+                                              ConjugateRhs, false, false>
+          :
+#endif
+          &Eigen::internal::gemm_complex<std::complex<double>, std::complex<double>, std::complex<double>, double,
+                                         Packet, Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs,
+                                         ConjugateRhs, false, false>;
+  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
+
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+  typedef quad_traits<double>::vectortype Packet;
+  typedef Packet1cd Packetc;
+  typedef quad_traits<double>::rhstype RhsPacket;
+
+  void operator()(const DataMapper& res, const std::complex<double>* blockA, const double* blockB, Index rows,
+                  Index depth, Index cols, std::complex<double> alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
+};
+
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<std::complex<double>, double, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+    const DataMapper& res, const std::complex<double>* blockA, const double* blockB, Index rows, Index depth,
+    Index cols, std::complex<double> alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  const Index accRows = quad_traits<double>::rows;
+  const Index accCols = quad_traits<double>::size;
+  static void (*gemm_function)(const DataMapper&, const std::complex<double>*, const double*, Index, Index, Index,
+                               std::complex<double>, Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+      (supportsMMA()) ? &Eigen::internal::gemm_complexMMA<std::complex<double>, double, std::complex<double>, double,
+                                                          Packet, Packetc, RhsPacket, DataMapper, accRows, accCols,
+                                                          ConjugateLhs, ConjugateRhs, false, true>
+                      :
+#endif
+                      &Eigen::internal::gemm_complex<std::complex<double>, double, std::complex<double>, double, Packet,
+                                                     Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs,
+                                                     ConjugateRhs, false, true>;
+  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
+
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+  typedef quad_traits<double>::vectortype Packet;
+  typedef Packet1cd Packetc;
+  typedef quad_traits<double>::rhstype RhsPacket;
+
+  void operator()(const DataMapper& res, const double* blockA, const std::complex<double>* blockB, Index rows,
+                  Index depth, Index cols, std::complex<double> alpha, Index strideA = -1, Index strideB = -1,
+                  Index offsetA = 0, Index offsetB = 0);
+};
+
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<double, std::complex<double>, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+    const DataMapper& res, const double* blockA, const std::complex<double>* blockB, Index rows, Index depth,
+    Index cols, std::complex<double> alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  const Index accRows = quad_traits<double>::rows;
+  const Index accCols = quad_traits<double>::size;
+  static void (*gemm_function)(const DataMapper&, const double*, const std::complex<double>*, Index, Index, Index,
+                               std::complex<double>, Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+      (supportsMMA()) ? &Eigen::internal::gemm_complexMMA<double, std::complex<double>, std::complex<double>, double,
+                                                          Packet, Packetc, RhsPacket, DataMapper, accRows, accCols,
+                                                          ConjugateLhs, ConjugateRhs, true, false>
+                      :
+#endif
+                      &Eigen::internal::gemm_complex<double, std::complex<double>, std::complex<double>, double, Packet,
+                                                     Packetc, RhsPacket, DataMapper, accRows, accCols, ConjugateLhs,
+                                                     ConjugateRhs, true, false>;
+  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
+
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<bfloat16, bfloat16, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> {
+  typedef typename quad_traits<bfloat16>::vectortype Packet;
+  typedef typename quad_traits<bfloat16>::rhstype RhsPacket;
+
+  void operator()(const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB, Index rows, Index depth,
+                  Index cols, bfloat16 alpha, Index strideA = -1, Index strideB = -1, Index offsetA = 0,
+                  Index offsetB = 0);
+};
+
+template <typename Index, typename DataMapper, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
+void gebp_kernel<bfloat16, bfloat16, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs>::operator()(
+    const DataMapper& res, const bfloat16* blockA, const bfloat16* blockB, Index rows, Index depth, Index cols,
+    bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  static void (*gemm_function)(const DataMapper&, const bfloat16*, const bfloat16*, Index, Index, Index, bfloat16,
+                               Index, Index, Index, Index) =
+#ifdef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+      (supportsMMA()) ? &Eigen::internal::gemmMMAbfloat16<DataMapper> :
+#endif
+                      &Eigen::internal::gemmbfloat16<DataMapper>;
+  gemm_function(res, blockA, blockB, rows, depth, cols, alpha, strideA, strideB, offsetA, offsetB);
+}
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATRIX_PRODUCT_ALTIVEC_H
diff --git a/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h b/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
new file mode 100644
index 00000000..e78ca5af
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProductCommon.h
@@ -0,0 +1,205 @@
+// #define EIGEN_POWER_USE_PREFETCH  // Use prefetching in gemm routines
+#ifdef EIGEN_POWER_USE_PREFETCH
+#define EIGEN_POWER_PREFETCH(p) prefetch(p)
+#else
+#define EIGEN_POWER_PREFETCH(p)
+#endif
+
+#if defined(_ARCH_PWR9) || defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
+#define USE_PARTIAL_PACKETS
+#endif
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Scalar, typename Packet, typename DataMapper, const Index accRows, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_extra_row(const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base,
+                                        Index depth, Index strideA, Index offsetA, Index strideB, Index row, Index rows,
+                                        Index remaining_rows, const Packet& pAlpha, const Packet& pMask);
+
+template <typename Scalar, typename Packet, typename DataMapper, const Index accCols>
+EIGEN_ALWAYS_INLINE void gemm_extra_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index depth,
+                                         Index strideA, Index offsetA, Index strideB, Index offsetB, Index col,
+                                         Index rows, Index cols, Index remaining_rows, const Packet& pAlpha,
+                                         const Packet& pMask);
+
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet bmask(const Index remaining_rows);
+
+template <typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accRows,
+          const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_extra_row(const DataMapper& res, const Scalar* lhs_base, const Scalar* rhs_base,
+                                                Index depth, Index strideA, Index offsetA, Index strideB, Index row,
+                                                Index rows, Index remaining_rows, const Packet& pAlphaReal,
+                                                const Packet& pAlphaImag, const Packet& pMask);
+
+template <typename Scalar, typename Packet, typename Packetc, typename DataMapper, const Index accCols,
+          bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void gemm_complex_extra_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
+                                                 Index depth, Index strideA, Index offsetA, Index strideB,
+                                                 Index offsetB, Index col, Index rows, Index cols, Index remaining_rows,
+                                                 const Packet& pAlphaReal, const Packet& pAlphaImag,
+                                                 const Packet& pMask);
+
+template <typename DataMapper>
+EIGEN_ALWAYS_INLINE void convertArrayBF16toF32(float* result, Index cols, Index rows, const DataMapper& src);
+
+template <const Index size, bool non_unit_stride, Index delta>
+EIGEN_ALWAYS_INLINE void storeBF16fromResult(bfloat16* dst, Packet8bf data, Index resInc, Index extra = 0);
+
+template <bool non_unit_stride = false>
+EIGEN_ALWAYS_INLINE void convertArrayPointerBF16toF32(float* result, Index cols, Index rows, bfloat16* src,
+                                                      Index resInc = 1);
+
+template <bool rhsExtraCols, bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void storeResults(Packet4f (&acc)[4], Index rows, const Packet4f pAlpha, float* result,
+                                      Index extra_cols, Index extra_rows);
+
+template <Index num_acc, bool extraRows, Index size = 4>
+EIGEN_ALWAYS_INLINE void outputVecColResults(Packet4f (&acc)[num_acc][size], float* result, Packet4f pAlpha,
+                                             Index extra_rows);
+
+template <Index num_acc, Index size = 4>
+EIGEN_ALWAYS_INLINE void outputVecResults(Packet4f (&acc)[num_acc][size], float* result, Packet4f pAlpha);
+
+template <typename RhsMapper, bool linear>
+EIGEN_ALWAYS_INLINE Packet8bf loadColData(RhsMapper& rhs, Index j);
+
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet ploadLhs(const __UNPACK_TYPE__(Packet) * lhs);
+
+template <typename DataMapper, typename Packet, const Index accCols, int StorageOrder, bool Complex, int N,
+          bool full = true>
+EIGEN_ALWAYS_INLINE void bload(PacketBlock<Packet, N*(Complex ? 2 : 1)>& acc, const DataMapper& res, Index row,
+                               Index col);
+
+template <typename DataMapper, typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bstore(PacketBlock<Packet, N>& acc, const DataMapper& res, Index row);
+
+#ifdef USE_PARTIAL_PACKETS
+template <typename DataMapper, typename Packet, const Index accCols, bool Complex, Index N, bool full = true>
+EIGEN_ALWAYS_INLINE void bload_partial(PacketBlock<Packet, N*(Complex ? 2 : 1)>& acc, const DataMapper& res, Index row,
+                                       Index elements);
+
+template <typename DataMapper, typename Packet, Index N>
+EIGEN_ALWAYS_INLINE void bstore_partial(PacketBlock<Packet, N>& acc, const DataMapper& res, Index row, Index elements);
+#endif
+
+template <typename Packet, int N>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet, N>& acc, PacketBlock<Packet, N>& accZ, const Packet& pAlpha);
+
+template <typename Packet, int N, bool mask>
+EIGEN_ALWAYS_INLINE void bscale(PacketBlock<Packet, N>& acc, PacketBlock<Packet, N>& accZ, const Packet& pAlpha,
+                                const Packet& pMask);
+
+template <typename Packet, int N, bool mask>
+EIGEN_ALWAYS_INLINE void bscalec(PacketBlock<Packet, N>& aReal, PacketBlock<Packet, N>& aImag, const Packet& bReal,
+                                 const Packet& bImag, PacketBlock<Packet, N>& cReal, PacketBlock<Packet, N>& cImag,
+                                 const Packet& pMask);
+
+template <typename Packet, typename Packetc, int N, bool full>
+EIGEN_ALWAYS_INLINE void bcouple(PacketBlock<Packet, N>& taccReal, PacketBlock<Packet, N>& taccImag,
+                                 PacketBlock<Packetc, N * 2>& tRes, PacketBlock<Packetc, N>& acc1,
+                                 PacketBlock<Packetc, N>& acc2);
+
+#define MICRO_NORMAL(iter) (accCols == accCols2) || (unroll_factor != (iter + 1))
+
+#define MICRO_UNROLL_ITER1(func, N)          \
+  switch (remaining_rows) {                  \
+    default:                                 \
+      func(N, 0) break;                      \
+    case 1:                                  \
+      func(N, 1) break;                      \
+    case 2:                                  \
+      if (sizeof(Scalar) == sizeof(float)) { \
+        func(N, 2)                           \
+      }                                      \
+      break;                                 \
+    case 3:                                  \
+      if (sizeof(Scalar) == sizeof(float)) { \
+        func(N, 3)                           \
+      }                                      \
+      break;                                 \
+  }
+
+#ifdef USE_PARTIAL_PACKETS
+#define MICRO_UNROLL_ITER(func, N) \
+  if (remaining_rows) {            \
+    func(N, true);                 \
+  } else {                         \
+    func(N, false);                \
+  }
+
+#define MICRO_NORMAL_PARTIAL(iter) full || (unroll_factor != (iter + 1))
+#else
+#define MICRO_UNROLL_ITER(func, N) MICRO_UNROLL_ITER1(func, N)
+#endif
+
+#define MICRO_COMPLEX_UNROLL_ITER(func, N) MICRO_UNROLL_ITER1(func, N)
+
+#define MICRO_NORMAL_COLS(iter, a, b) ((MICRO_NORMAL(iter)) ? a : b)
+
+#define MICRO_LOAD1(lhs_ptr, iter)                               \
+  if (unroll_factor > iter) {                                    \
+    lhsV##iter = ploadLhs<Packet>(lhs_ptr##iter);                \
+    lhs_ptr##iter += MICRO_NORMAL_COLS(iter, accCols, accCols2); \
+  } else {                                                       \
+    EIGEN_UNUSED_VARIABLE(lhsV##iter);                           \
+  }
+
+#define MICRO_LOAD_ONE(iter) MICRO_LOAD1(lhs_ptr, iter)
+
+#define MICRO_COMPLEX_LOAD_ONE(iter)                                                                       \
+  if (!LhsIsReal && (unroll_factor > iter)) {                                                              \
+    lhsVi##iter = ploadLhs<Packet>(lhs_ptr_real##iter + MICRO_NORMAL_COLS(iter, imag_delta, imag_delta2)); \
+  } else {                                                                                                 \
+    EIGEN_UNUSED_VARIABLE(lhsVi##iter);                                                                    \
+  }                                                                                                        \
+  MICRO_LOAD1(lhs_ptr_real, iter)
+
+#define MICRO_SRC_PTR1(lhs_ptr, advRows, iter)                                  \
+  if (unroll_factor > iter) {                                                   \
+    lhs_ptr##iter = lhs_base + (row + (iter * accCols)) * strideA * advRows -   \
+                    MICRO_NORMAL_COLS(iter, 0, (accCols - accCols2) * offsetA); \
+  } else {                                                                      \
+    EIGEN_UNUSED_VARIABLE(lhs_ptr##iter);                                       \
+  }
+
+#define MICRO_SRC_PTR_ONE(iter) MICRO_SRC_PTR1(lhs_ptr, 1, iter)
+
+#define MICRO_COMPLEX_SRC_PTR_ONE(iter) MICRO_SRC_PTR1(lhs_ptr_real, advanceRows, iter)
+
+#define MICRO_PREFETCH1(lhs_ptr, iter)   \
+  if (unroll_factor > iter) {            \
+    EIGEN_POWER_PREFETCH(lhs_ptr##iter); \
+  }
+
+#define MICRO_PREFETCH_ONE(iter) MICRO_PREFETCH1(lhs_ptr, iter)
+
+#define MICRO_COMPLEX_PREFETCH_ONE(iter) MICRO_PREFETCH1(lhs_ptr_real, iter)
+
+#ifdef USE_PARTIAL_PACKETS
+#define MICRO_UPDATE_MASK
+#else
+#define MICRO_UPDATE_MASK EIGEN_UNUSED_VARIABLE(pMask);
+#endif
+
+#define MICRO_UPDATE                \
+  if (accCols == accCols2) {        \
+    MICRO_UPDATE_MASK               \
+    EIGEN_UNUSED_VARIABLE(offsetA); \
+    row += unroll_factor * accCols; \
+  }
+
+#define MICRO_COMPLEX_UPDATE                \
+  MICRO_UPDATE                              \
+  if (LhsIsReal || (accCols == accCols2)) { \
+    EIGEN_UNUSED_VARIABLE(imag_delta2);     \
+  }
+
+}  // end namespace internal
+}  // end namespace Eigen
diff --git a/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h b/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
new file mode 100644
index 00000000..94c5dd27
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProductMMA.h
@@ -0,0 +1,901 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Everton Constantino (everton.constantino@ibm.com)
+// Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+#define EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
+
+// If using dynamic dispatch, set the CPU target.
+#if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
+#pragma GCC push_options
+#pragma GCC target("cpu=power10,htm")
+#endif
+
+#ifdef __has_builtin
+#if !__has_builtin(__builtin_vsx_assemble_pair)
+#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
+#endif
+#if !__has_builtin(__builtin_vsx_disassemble_pair)
+#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
+#endif
+#endif
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+#include "MatrixProductMMAbfloat16.h"
+
+namespace Eigen {
+
+namespace internal {
+
+#define accColsC (accCols / 2)
+
+EIGEN_ALWAYS_INLINE void bsetzeroMMA(__vector_quad* acc) { __builtin_mma_xxsetaccz(acc); }
+
+template <typename DataMapper, typename Packet, bool full>
+EIGEN_ALWAYS_INLINE void storeAccumulator(Index i, const DataMapper& data, const Packet& alpha, const Index elements,
+                                          __vector_quad* acc) {
+  PacketBlock<Packet, 4> result;
+  __builtin_mma_disassemble_acc(&result.packet, acc);
+
+  PacketBlock<Packet, 4> tRes;
+  if (full) {
+    EIGEN_UNUSED_VARIABLE(elements);
+    bload<DataMapper, Packet, 0, ColMajor, false, 4>(tRes, data, i, 0);
+    bscale<Packet, 4>(tRes, result, alpha);
+    bstore<DataMapper, Packet, 4>(tRes, data, i);
+  } else {
+    bload_partial<DataMapper, Packet, 0, false, 4>(tRes, data, i, elements);
+    bscale<Packet, 4>(tRes, result, alpha);
+    bstore_partial<DataMapper, Packet, 4>(tRes, data, i, elements);
+  }
+}
+
+template <typename DataMapper, typename Packet, typename Packetc, const Index accCols, const Index accCols2>
+EIGEN_ALWAYS_INLINE void storeComplexAccumulator(Index i, const DataMapper& data, const Packet& alphaReal,
+                                                 const Packet& alphaImag, const Packet& pMask, __vector_quad* accReal,
+                                                 __vector_quad* accImag) {
+  constexpr bool full = (accCols2 > accColsC);
+  PacketBlock<Packet, 4> resultReal, resultImag;
+  __builtin_mma_disassemble_acc(&resultReal.packet, accReal);
+  __builtin_mma_disassemble_acc(&resultImag.packet, accImag);
+
+  PacketBlock<Packetc, 8> tRes;
+  bload<DataMapper, Packetc, accColsC, ColMajor, true, 4, full>(tRes, data, i, 0);
+
+  PacketBlock<Packet, 4> taccReal, taccImag;
+  bscalec<Packet, 4, (accCols != accCols2)>(resultReal, resultImag, alphaReal, alphaImag, taccReal, taccImag, pMask);
+
+  PacketBlock<Packetc, 4> acc1, acc2;
+  bcouple<Packet, Packetc, 4, full>(taccReal, taccImag, tRes, acc1, acc2);
+
+  bstore<DataMapper, Packetc, 4>(acc1, data, i);
+  if (full) {
+    bstore<DataMapper, Packetc, 4>(acc2, data, i + accColsC);
+  }
+}
+
+// Defaults to float32, since Eigen still supports C++03 we can't use default template arguments
+template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) {
+  if (NegativeAccumulate) {
+    __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+  } else {
+    __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+  }
+}
+
+template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pgerMMA(__vector_quad* acc, const __vector_pair& a, const Packet2d& b) {
+  if (NegativeAccumulate) {
+    __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b);
+  } else {
+    __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b);
+  }
+}
+
+template <typename Packet, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+EIGEN_ALWAYS_INLINE void pgercMMA(__vector_quad* accReal, __vector_quad* accImag, const Packet& lhsV, Packet& lhsVi,
+                                  const RhsPacket& rhsV, RhsPacket& rhsVi) {
+  pgerMMA<Packet, RhsPacket, false>(accReal, rhsV, lhsV);
+  if (LhsIsReal) {
+    pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi, lhsV);
+    EIGEN_UNUSED_VARIABLE(lhsVi);
+  } else {
+    if (!RhsIsReal) {
+      pgerMMA<Packet, RhsPacket, ConjugateLhs == ConjugateRhs>(accReal, rhsVi, lhsVi);
+      pgerMMA<Packet, RhsPacket, ConjugateRhs>(accImag, rhsVi, lhsV);
+    } else {
+      EIGEN_UNUSED_VARIABLE(rhsVi);
+    }
+    pgerMMA<Packet, RhsPacket, ConjugateLhs>(accImag, rhsV, lhsVi);
+  }
+}
+
+// This is necessary because ploadRhs for double returns a pair of vectors when MMA is enabled.
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet ploadRhs(const __UNPACK_TYPE__(Packet) * rhs) {
+  return ploadu<Packet>(rhs);
+}
+
+template <typename Scalar, typename Packet>
+EIGEN_ALWAYS_INLINE void ploadRhsMMA(const Scalar* rhs, Packet& rhsV) {
+  rhsV = ploadRhs<Packet>(rhs);
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void ploadRhsMMA(const double* rhs, __vector_pair& rhsV) {
+#if EIGEN_COMP_LLVM
+  __builtin_vsx_assemble_pair(
+      &rhsV, reinterpret_cast<__vector unsigned char>(ploadRhs<Packet2d>(rhs + (sizeof(Packet2d) / sizeof(double)))),
+      reinterpret_cast<__vector unsigned char>(ploadRhs<Packet2d>(rhs)));
+#else
+  rhsV = *reinterpret_cast<__vector_pair*>(const_cast<double*>(rhs));
+#endif
+}
+
+EIGEN_ALWAYS_INLINE void ploadLhsMMA(const double* lhs, __vector_pair& lhsV) { ploadRhsMMA(lhs, lhsV); }
+
+#define GEMM_MULTIPLE_COLS
+
+// Disable in GCC until unnecessary register moves are fixed
+// #if (EIGEN_COMP_LLVM || (__GNUC__ >= 11))
+#if EIGEN_COMP_LLVM
+#define VECTOR_PAIR_LOADS_LHS
+#endif
+
+// PEEL_MMA loop factor.
+#ifdef GEMM_MULTIPLE_COLS
+#define PEEL_MMA 8
+#else
+// Register spillage with GCC12+
+#if EIGEN_COMP_LLVM || (__GNUC__ < 12) || defined(VECTOR_PAIR_LOADS_LHS)
+#define PEEL_MMA 7
+#else
+#define PEEL_MMA 6
+#endif
+#endif
+
+#define MICRO_MMA_UNROLL(func) func(0) func(1) func(2) func(3) func(4) func(5) func(6) func(7)
+
+#define MICRO_MMA_WORK(func, type, peel)                                                                        \
+  if (accItr == 1) {                                                                                            \
+    func(0, type, peel, 0, 0) func(1, type, peel, 1, 0) func(2, type, peel, 2, 0) func(3, type, peel, 3, 0)     \
+        func(4, type, peel, 4, 0) func(5, type, peel, 5, 0) func(6, type, peel, 6, 0) func(7, type, peel, 7, 0) \
+  } else if (accItr == 2) {                                                                                     \
+    func(0, type, peel, 0, 0) func(1, type, peel, 0, 1) func(2, type, peel, 1, 0) func(3, type, peel, 1, 1)     \
+        func(4, type, peel, 2, 0) func(5, type, peel, 2, 1) func(6, type, peel, 3, 0) func(7, type, peel, 3, 1) \
+  } else {                                                                                                      \
+    func(0, type, peel, 0, 0) func(1, type, peel, 0, 1) func(2, type, peel, 0, 2) func(3, type, peel, 0, 3)     \
+        func(4, type, peel, 1, 0) func(5, type, peel, 1, 1) func(6, type, peel, 1, 2) func(7, type, peel, 1, 3) \
+  }
+
+#define MICRO_MMA_WORK_ONE(iter, type, peel, left, right)                        \
+  if (unroll_factor > left) {                                                    \
+    pgerMMA<Packet, type, false>(&accZero##iter, rhsV##right[peel], lhsV##left); \
+  }
+
+#ifdef VECTOR_PAIR_LOADS_LHS
+#define MICRO_MMA_WORK_TWO(iter, type, peel, left, right)                                          \
+  if (unroll_factor > left) {                                                                      \
+    pgerMMA<Packet, type, false>(&accZero##iter, rhsV##right[peel], lhsV2##left.packet[peel & 1]); \
+  }
+
+#define MICRO_MMA_LOAD1_TWO(lhs_ptr, left)                                                        \
+  if (unroll_factor > left) {                                                                     \
+    if (MICRO_NORMAL(left)) {                                                                     \
+      ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr##left), plhsV##left);                   \
+      __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsV2##left.packet), &plhsV##left); \
+      lhs_ptr##left += accCols * 2;                                                               \
+    } else {                                                                                      \
+      lhsV2##left.packet[0] = ploadLhs<Packet>(lhs_ptr##left);                                    \
+      lhsV2##left.packet[1] = ploadLhs<Packet>(lhs_ptr##left + accCols2);                         \
+      lhs_ptr##left += accCols2 * 2;                                                              \
+      EIGEN_UNUSED_VARIABLE(plhsV##left);                                                         \
+    }                                                                                             \
+  } else {                                                                                        \
+    EIGEN_UNUSED_VARIABLE(lhsV2##left);                                                           \
+    EIGEN_UNUSED_VARIABLE(plhsV##left);                                                           \
+  }
+
+#define MICRO_MMA_LOAD_TWO(left) MICRO_MMA_LOAD1_TWO(lhs_ptr, left)
+#endif
+
+#define MICRO_MMA_UNROLL_ITER(func, val)                       \
+  func(val, 0) if (accItr > 1) {                               \
+    func(val, 1) if (accItr > 2) { func(val, 2) func(val, 3) } \
+  }
+
+#define MICRO_MMA_LOAD_ONE_RHS1(peel, right) ploadRhsMMA(rhs_ptr##right + (accRows * peel), rhsV##right[peel]);
+
+#define MICRO_MMA_LOAD_ONE_RHS(peel) MICRO_MMA_UNROLL_ITER(MICRO_MMA_LOAD_ONE_RHS1, peel)
+
+#define MICRO_MMA_TYPE_PEEL(funcw, funcl, type, peel)              \
+  if (PEEL_MMA > peel) {                                           \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3, lhsV4, lhsV5, lhsV6, lhsV7; \
+    MICRO_MMA_LOAD_ONE_RHS(peel)                                   \
+    MICRO_MMA_UNROLL(funcl)                                        \
+    MICRO_MMA_WORK(funcw, type, peel)                              \
+  }
+
+#ifndef VECTOR_PAIR_LOADS_LHS
+#define MICRO_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type)                                                  \
+  type rhsV0[8], rhsV1[(accItr > 1) ? 8 : 1], rhsV2[(accItr > 2) ? 8 : 1], rhsV3[(accItr > 2) ? 8 : 1]; \
+  MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 0)                                                            \
+  MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 1)                                                            \
+  MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 2)                                                            \
+  MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 3)                                                            \
+  MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 4)                                                            \
+  MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 5)                                                            \
+  MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 6) MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 7)
+#else
+#define MICRO_MMA_LOAD_TWO_RHS(peel1, right)                                                      \
+  ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr##right + (accRows * peel1)), prhsV##peel1); \
+  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV##right[peel1]), &prhsV##peel1);
+
+#define MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2)           \
+  if (PEEL_MMA > peel2) {                                                                  \
+    PacketBlock<Packet, 2> lhsV20, lhsV21, lhsV22, lhsV23, lhsV24, lhsV25, lhsV26, lhsV27; \
+    __vector_pair plhsV0, plhsV1, plhsV2, plhsV3, plhsV4, plhsV5, plhsV6, plhsV7;          \
+    if (sizeof(type) == 16) {                                                              \
+      MICRO_MMA_UNROLL_ITER(MICRO_MMA_LOAD_TWO_RHS, peel1)                                 \
+    } else {                                                                               \
+      EIGEN_UNUSED_VARIABLE(prhsV##peel1);                                                 \
+      MICRO_MMA_LOAD_ONE_RHS(peel1)                                                        \
+      MICRO_MMA_LOAD_ONE_RHS(peel2)                                                        \
+    }                                                                                      \
+    MICRO_MMA_UNROLL(funcl2)                                                               \
+    MICRO_MMA_WORK(funcw2, type, peel1)                                                    \
+    MICRO_MMA_WORK(funcw2, type, peel2)                                                    \
+  } else {                                                                                 \
+    EIGEN_UNUSED_VARIABLE(prhsV##peel1);                                                   \
+    MICRO_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1)                                       \
+  }
+
+#define MICRO_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type)                               \
+  type rhsV0[8], rhsV1[(accItr > 1) ? 8 : 1], rhsV2[(accItr > 2) ? 8 : 1], rhsV3[(accItr > 2) ? 8 : 1]; \
+  __vector_pair prhsV0, prhsV2, prhsV4, prhsV6;                                                         \
+  MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 0, 1)                                      \
+  MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 2, 3)                                      \
+  MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 4, 5)                                      \
+  MICRO_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 6, 7)
+#endif
+
+#define MICRO_MMA_UNROLL_TYPE_ONE(funcw, funcl, type) \
+  type rhsV0[1], rhsV1[1], rhsV2[1], rhsV3[1];        \
+  MICRO_MMA_TYPE_PEEL(funcw, funcl, type, 0)
+
+#define MICRO_MMA_UPDATE_RHS1(size, right) rhs_ptr##right += (accRows * size);
+
+#define MICRO_MMA_UPDATE_RHS(size) MICRO_MMA_UNROLL_ITER(MICRO_MMA_UPDATE_RHS1, size)
+
+#define MICRO_MMA_UNROLL_TYPE(MICRO_MMA_TYPE, size)             \
+  MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, RhsPacket) \
+  MICRO_MMA_UPDATE_RHS(size)
+
+#ifndef VECTOR_PAIR_LOADS_LHS
+#define MICRO_MMA_ONE_PEEL MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_PEEL, PEEL_MMA)
+#else
+#define MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_TYPE, size)                                                    \
+  MICRO_MMA_TYPE(MICRO_MMA_WORK_ONE, MICRO_LOAD_ONE, MICRO_MMA_WORK_TWO, MICRO_MMA_LOAD_TWO, RhsPacket) \
+  MICRO_MMA_UPDATE_RHS(size)
+
+#define MICRO_MMA_ONE_PEEL MICRO_MMA_UNROLL_TYPE2(MICRO_MMA_UNROLL_TYPE_PEEL2, PEEL_MMA)
+#endif
+
+#define MICRO_MMA_ONE MICRO_MMA_UNROLL_TYPE(MICRO_MMA_UNROLL_TYPE_ONE, 1)
+
+#define MICRO_MMA_DST_PTR_ONE(iter)       \
+  if (unroll_factor * accItr > iter) {    \
+    bsetzeroMMA(&accZero##iter);          \
+  } else {                                \
+    EIGEN_UNUSED_VARIABLE(accZero##iter); \
+  }
+
+#define MICRO_MMA_DST_PTR MICRO_MMA_UNROLL(MICRO_MMA_DST_PTR_ONE)
+
+#define MICRO_MMA_SRC_PTR MICRO_MMA_UNROLL(MICRO_SRC_PTR_ONE)
+
+#define MICRO_MMA_PREFETCH MICRO_MMA_UNROLL(MICRO_PREFETCH_ONE)
+
+#define MICRO_MMA_STORE_ONE(iter, left, right)                                                                 \
+  if (unroll_factor > left) {                                                                                  \
+    storeAccumulator<DataMapper, Packet, MICRO_NORMAL_PARTIAL(left)>(row + left * accCols, res##right, pAlpha, \
+                                                                     accCols2, &accZero##iter);                \
+  }
+
+#define MICRO_MMA_ITER_UNROLL(func)                                                                                 \
+  if (accItr == 1) {                                                                                                \
+    func(0, 0, 0) func(1, 1, 0) func(2, 2, 0) func(3, 3, 0) func(4, 4, 0) func(5, 5, 0) func(6, 6, 0) func(7, 7, 0) \
+  } else if (accItr == 2) {                                                                                         \
+    func(0, 0, 0) func(1, 0, 1) func(2, 1, 0) func(3, 1, 1) func(4, 2, 0) func(5, 2, 1) func(6, 3, 0) func(7, 3, 1) \
+  } else {                                                                                                          \
+    func(0, 0, 0) func(1, 0, 1) func(2, 0, 2) func(3, 0, 3) func(4, 1, 0) func(5, 1, 1) func(6, 1, 2) func(7, 1, 3) \
+  }
+
+#define MICRO_MMA_STORE MICRO_MMA_ITER_UNROLL(MICRO_MMA_STORE_ONE)
+
+#define MICRO_MMA_EXTRA_ROWS(right)                                                                           \
+  gemm_extra_row<Scalar, Packet, DataMapper, accRows, accCols>(                                               \
+      res3##right, blockA, rhs_base + right * accRows * strideB, depth, strideA, offsetA, strideB, row, rows, \
+      remaining_rows, pAlpha, pMask);
+
+#define MICRO_MMA_EXTRA_ROWS1(val, right) MICRO_MMA_EXTRA_ROWS(right);
+
+template <int unroll_factor, typename Scalar, typename Packet, typename RhsPacket, typename DataMapper,
+          const Index accRows, const Index accCols, bool full, const Index accItr>
+EIGEN_ALWAYS_INLINE void gemm_unrolled_MMA_iteration(const DataMapper& res0, const DataMapper& res1,
+                                                     const DataMapper& res2, const DataMapper& res3,
+                                                     const Scalar* lhs_base, const Scalar* rhs_base, Index depth,
+                                                     Index strideA, Index strideB, Index offsetA, Index& row,
+                                                     const Packet& pAlpha, Index accCols2) {
+  const Scalar *rhs_ptr0 = rhs_base, *rhs_ptr1 = NULL, *rhs_ptr2 = NULL, *rhs_ptr3 = NULL;
+  const Scalar *lhs_ptr0 = NULL, *lhs_ptr1 = NULL, *lhs_ptr2 = NULL, *lhs_ptr3 = NULL, *lhs_ptr4 = NULL,
+               *lhs_ptr5 = NULL, *lhs_ptr6 = NULL, *lhs_ptr7 = NULL;
+  __vector_quad accZero0, accZero1, accZero2, accZero3, accZero4, accZero5, accZero6, accZero7;
+
+  if (accItr > 1) {
+    rhs_ptr1 = rhs_base + (accRows * strideB);
+  } else {
+    EIGEN_UNUSED_VARIABLE(strideB);
+    EIGEN_UNUSED_VARIABLE(rhs_ptr1);
+    EIGEN_UNUSED_VARIABLE(res1);
+  }
+  if (accItr > 2) {
+    rhs_ptr2 = rhs_base + (2 * accRows * strideB);
+    rhs_ptr3 = rhs_base + (3 * accRows * strideB);
+  } else {
+    EIGEN_UNUSED_VARIABLE(rhs_ptr2);
+    EIGEN_UNUSED_VARIABLE(rhs_ptr3);
+    EIGEN_UNUSED_VARIABLE(res2);
+    EIGEN_UNUSED_VARIABLE(res3);
+  }
+
+  MICRO_MMA_SRC_PTR
+  MICRO_MMA_DST_PTR
+
+  Index k = 0, depth2 = depth - PEEL_MMA;
+  for (; k <= depth2; k += PEEL_MMA) {
+    EIGEN_POWER_PREFETCH(rhs_ptr);
+    MICRO_MMA_PREFETCH
+    MICRO_MMA_ONE_PEEL
+  }
+  for (; k < depth; k++) {
+    MICRO_MMA_ONE
+  }
+  MICRO_MMA_STORE
+
+  MICRO_UPDATE
+}
+
+#define MICRO_MMA_UNROLL_ITER2(N, M)                                                                                 \
+  gemm_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, RhsPacket, DataMapper, accRows, accCols, !M, accItr>( \
+      res30, res31, res32, res33, lhs_base, rhs_base, depth, strideA, strideB, offsetA, row, pAlpha,                 \
+      M ? remaining_rows : accCols);                                                                                 \
+  if (M) return;
+
+#define MICRO_MMA_ROWS(n)             \
+  while (row + n * accCols <= rows) { \
+    MICRO_MMA_UNROLL_ITER2(n, 0);     \
+  }
+
+template <typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows,
+          const Index accCols, const Index accItr>
+EIGEN_ALWAYS_INLINE void gemmMMA_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index depth,
+                                      Index strideA, Index offsetA, Index strideB, Index offsetB, Index col, Index rows,
+                                      Index remaining_rows, const Packet& pAlpha, const Packet& pMask) {
+  const DataMapper res30 = res.getSubMapper(0, col);
+  const DataMapper res31 = (accItr > 1) ? res30.getSubMapper(0, accRows * 1) : res30;
+  const DataMapper res32 = (accItr > 2) ? res30.getSubMapper(0, accRows * 2) : res30;
+  const DataMapper res33 = (accItr > 2) ? res30.getSubMapper(0, accRows * 3) : res30;
+
+  const Scalar* rhs_base = blockB + col * strideB + accRows * offsetB;
+  const Scalar* lhs_base = blockA + accCols * offsetA;
+  Index row = 0;
+
+#define MAX_MMA_UNROLL 7
+
+#if MAX_MMA_UNROLL < 2
+  if (1) {
+#elif MAX_MMA_UNROLL < 4
+  if (accItr <= 2) {
+#else
+  if (accItr == 1) {
+#endif
+    MICRO_MMA_ROWS(MAX_MMA_UNROLL);
+  } else if (accItr == 2) {
+    MICRO_MMA_ROWS(4);
+  } else {
+    MICRO_MMA_ROWS(2);
+  }
+  switch ((rows - row) / accCols) {
+#if MAX_MMA_UNROLL > 7
+    case 7:
+      if (accItr == 1) {
+        MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 7)
+      }
+      break;
+#endif
+#if MAX_MMA_UNROLL > 6
+    case 6:
+      if (accItr == 1) {
+        MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 6)
+      }
+      break;
+#endif
+#if MAX_MMA_UNROLL > 5
+    case 5:
+      if (accItr == 1) {
+        MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 5)
+      }
+      break;
+#endif
+#if MAX_MMA_UNROLL > 4
+    case 4:
+      if (accItr == 1) {
+        MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 4)
+      }
+      break;
+#endif
+#if MAX_MMA_UNROLL > 3
+    case 3:
+      if (accItr <= 2) {
+        MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 3)
+      }
+      break;
+#endif
+#if MAX_MMA_UNROLL > 2
+    case 2:
+      if (accItr <= 2) {
+        MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 2)
+      }
+      break;
+#endif
+#if MAX_MMA_UNROLL > 1
+    case 1:
+      MICRO_UNROLL_ITER(MICRO_MMA_UNROLL_ITER2, 1)
+      break;
+#endif
+    default:
+      break;
+  }
+#undef MAX_MMA_UNROLL
+
+  if (remaining_rows > 0) {
+    MICRO_MMA_UNROLL_ITER(MICRO_MMA_EXTRA_ROWS1, 0)
+  }
+}
+
+#define MICRO_MMA_COLS(n)                                                                                          \
+  for (; col + n * accRows <= cols; col += n * accRows) {                                                          \
+    gemmMMA_cols<Scalar, Packet, RhsPacket2, DataMapper, accRows, accCols, n>(                                     \
+        res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, remaining_rows, pAlpha, pMask); \
+  }
+
+template <typename Scalar, typename Packet, typename RhsPacket, typename DataMapper, const Index accRows,
+          const Index accCols>
+void gemmMMA(const DataMapper& res, const Scalar* blockA, const Scalar* blockB, Index rows, Index depth, Index cols,
+             Scalar alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  const Index remaining_rows = rows % accCols;
+
+  if (strideA == -1) strideA = depth;
+  if (strideB == -1) strideB = depth;
+
+  const Packet pAlpha = pset1<Packet>(alpha);
+  const Packet pMask = bmask<Packet>(remaining_rows);
+
+  typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
+
+  Index col = 0;
+#ifdef GEMM_MULTIPLE_COLS
+  MICRO_MMA_COLS(4);
+  MICRO_MMA_COLS(2);
+#endif
+  MICRO_MMA_COLS(1);
+
+  if (col != cols) {
+    gemm_extra_cols<Scalar, Packet, DataMapper, accCols>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB,
+                                                         col, rows, cols, remaining_rows, pAlpha, pMask);
+  }
+}
+
+#define advanceRows ((LhsIsReal) ? 1 : 2)
+#define advanceCols ((RhsIsReal) ? 1 : 2)
+
+// PEEL_COMPLEX_MMA loop factor.
+#ifdef GEMM_MULTIPLE_COLS
+#define PEEL_COMPLEX_MMA 4
+#else
+#define PEEL_COMPLEX_MMA 3
+#endif
+
+#define MICRO_COMPLEX_MMA_UNROLL(func) func(0) func(1) func(2) func(3)
+
+#define MICRO_COMPLEX_MMA_WORK(func, type, peel)                                                            \
+  if (accItr == 1) {                                                                                        \
+    func(0, type, peel, 0, 0) func(1, type, peel, 1, 0) func(2, type, peel, 2, 0) func(3, type, peel, 3, 0) \
+  } else if (accItr == 2) {                                                                                 \
+    func(0, type, peel, 0, 0) func(1, type, peel, 0, 1) func(2, type, peel, 1, 0) func(3, type, peel, 1, 1) \
+  } else {                                                                                                  \
+    func(0, type, peel, 0, 0) func(1, type, peel, 0, 1) func(2, type, peel, 0, 2) func(3, type, peel, 0, 3) \
+  }
+
+#define MICRO_COMPLEX_MMA_WORK_ONE(iter, type, peel, left, right)                                        \
+  if (unroll_factor > left) {                                                                            \
+    pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(                            \
+        &accReal##iter, &accImag##iter, lhsV##left, lhsVi##left, rhsV##right[peel], rhsVi##right[peel]); \
+  }
+
+#ifdef VECTOR_PAIR_LOADS_LHS
+#define MICRO_COMPLEX_MMA_WORK_TWO(iter, type, peel, left, right)                                    \
+  if (unroll_factor > left) {                                                                        \
+    pgercMMA<Packet, type, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal>(                        \
+        &accReal##iter, &accImag##iter, lhsV2##left.packet[peel & 1], lhsVi2##left.packet[peel & 1], \
+        rhsV##right[peel], rhsVi##right[peel]);                                                      \
+  }
+
+#define MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, left)                                                  \
+  if (!LhsIsReal && (unroll_factor > left)) {                                                       \
+    if (MICRO_NORMAL(left)) {                                                                       \
+      ploadLhsMMA(reinterpret_cast<const double*>(lhs_ptr_real##left + imag_delta), plhsVi##left);  \
+      __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&lhsVi2##left.packet), &plhsVi##left); \
+    } else {                                                                                        \
+      lhsVi2##left.packet[0] = ploadLhs<Packet>(lhs_ptr_real##left + imag_delta2);                  \
+      lhsVi2##left.packet[1] = ploadLhs<Packet>(lhs_ptr_real##left + imag_delta2 + accCols2);       \
+      EIGEN_UNUSED_VARIABLE(plhsVi##left);                                                          \
+    }                                                                                               \
+  } else {                                                                                          \
+    EIGEN_UNUSED_VARIABLE(lhsVi2##left);                                                            \
+    EIGEN_UNUSED_VARIABLE(plhsVi##left);                                                            \
+  }                                                                                                 \
+  MICRO_MMA_LOAD1_TWO(lhs_ptr_real, left)
+
+#define MICRO_COMPLEX_MMA_LOAD_TWO(left) MICRO_COMPLEX_MMA_LOAD1_TWO(lhs_ptr, left)
+#endif
+
+#define MICRO_COMPLEX_MMA_LOAD_RHS1(peel, right)                             \
+  ploadRhsMMA(rhs_ptr_real##right + (accRows * peel), rhsV##right[peel]);    \
+  if (!RhsIsReal) {                                                          \
+    ploadRhsMMA(rhs_ptr_imag##right + (accRows * peel), rhsVi##right[peel]); \
+  }
+
+#define MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel) MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_LOAD_RHS1, peel)
+
+#define MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, peel) \
+  if (PEEL_COMPLEX_MMA > peel) {                              \
+    Packet lhsV0, lhsV1, lhsV2, lhsV3;                        \
+    Packet lhsVi0, lhsVi1, lhsVi2, lhsVi3;                    \
+    MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel)                      \
+    MICRO_COMPLEX_MMA_UNROLL(funcl)                           \
+    MICRO_COMPLEX_MMA_WORK(funcw, type, peel)                 \
+  }
+
+#ifndef VECTOR_PAIR_LOADS_LHS
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL(funcw, funcl, type)                                                      \
+  type rhsV0[4], rhsVi0[4], rhsV1[(accItr > 1) ? 4 : 1], rhsVi1[(accItr > 1) ? 4 : 1], rhsV2[(accItr > 2) ? 4 : 1], \
+      rhsVi2[(accItr > 2) ? 4 : 1], rhsV3[(accItr > 2) ? 4 : 1], rhsVi3[(accItr > 2) ? 4 : 1];                      \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 0)                                                                \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 1)                                                                \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 2) MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 3)
+#else
+#define MICRO_COMPLEX_MMA_LOAD_TWO_RHS(peel1, right)                                                      \
+  ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_real##right + (accRows * peel1)), prhsV##peel1);    \
+  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsV##right[peel1]), &prhsV##peel1);            \
+  if (!RhsIsReal) {                                                                                       \
+    ploadRhsMMA(reinterpret_cast<const double*>(rhs_ptr_imag##right + (accRows * peel1)), prhsVi##peel1); \
+    __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(&rhsVi##right[peel1]), &prhsVi##peel1);        \
+  } else {                                                                                                \
+    EIGEN_UNUSED_VARIABLE(prhsVi##peel1);                                                                 \
+  }
+
+#define MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, peel1, peel2) \
+  if (PEEL_COMPLEX_MMA > peel2) {                                                        \
+    PacketBlock<Packet, 2> lhsV20, lhsV21, lhsV22, lhsV23;                               \
+    PacketBlock<Packet, 2> lhsVi20, lhsVi21, lhsVi22, lhsVi23;                           \
+    __vector_pair plhsV0, plhsV1, plhsV2, plhsV3;                                        \
+    __vector_pair plhsVi0, plhsVi1, plhsVi2, plhsVi3;                                    \
+    if (sizeof(type) == 16) {                                                            \
+      MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_LOAD_TWO_RHS, peel1)                       \
+    } else {                                                                             \
+      EIGEN_UNUSED_VARIABLE(prhsV##peel1);                                               \
+      EIGEN_UNUSED_VARIABLE(prhsVi##peel1);                                              \
+      MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel1);                                             \
+      MICRO_COMPLEX_MMA_LOAD_ONE_RHS(peel2);                                             \
+    }                                                                                    \
+    MICRO_COMPLEX_MMA_UNROLL(funcl2)                                                     \
+    MICRO_COMPLEX_MMA_WORK(funcw2, type, peel1)                                          \
+    MICRO_COMPLEX_MMA_WORK(funcw2, type, peel2)                                          \
+  } else {                                                                               \
+    EIGEN_UNUSED_VARIABLE(prhsV##peel1);                                                 \
+    EIGEN_UNUSED_VARIABLE(prhsVi##peel1);                                                \
+    MICRO_COMPLEX_MMA_TYPE_PEEL(funcw1, funcl1, type, peel1)                             \
+  }
+
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type)                                   \
+  type rhsV0[4], rhsVi0[4], rhsV1[(accItr > 1) ? 4 : 1], rhsVi1[(accItr > 1) ? 4 : 1], rhsV2[(accItr > 2) ? 4 : 1], \
+      rhsVi2[(accItr > 2) ? 4 : 1], rhsV3[(accItr > 2) ? 4 : 1], rhsVi3[(accItr > 2) ? 4 : 1];                      \
+  __vector_pair prhsV0, prhsV2;                                                                                     \
+  __vector_pair prhsVi0, prhsVi2;                                                                                   \
+  MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 0, 1)                                          \
+  MICRO_COMPLEX_MMA_TYPE_PEEL2(funcw1, funcl1, funcw2, funcl2, type, 2, 3)
+#endif
+
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE(funcw, funcl, type)                              \
+  type rhsV0[1], rhsVi0[1], rhsV1[1], rhsVi1[1], rhsV2[1], rhsVi2[1], rhsV3[1], rhsVi3[1]; \
+  MICRO_COMPLEX_MMA_TYPE_PEEL(funcw, funcl, type, 0)
+
+#define MICRO_COMPLEX_MMA_UPDATE_RHS1(size, right) \
+  rhs_ptr_real##right += (accRows * size);         \
+  if (!RhsIsReal) rhs_ptr_imag##right += (accRows * size);
+
+#define MICRO_COMPLEX_MMA_UPDATE_RHS(size) MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_UPDATE_RHS1, size)
+
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_TYPE, size)                     \
+  MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, RhsPacket) \
+  MICRO_COMPLEX_MMA_UPDATE_RHS(size);
+
+#ifndef VECTOR_PAIR_LOADS_LHS
+#define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL, PEEL_COMPLEX_MMA)
+#else
+#define MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_TYPE, size)                                     \
+  MICRO_COMPLEX_MMA_TYPE(MICRO_COMPLEX_MMA_WORK_ONE, MICRO_COMPLEX_LOAD_ONE, MICRO_COMPLEX_MMA_WORK_TWO, \
+                         MICRO_COMPLEX_MMA_LOAD_TWO, RhsPacket)                                          \
+  MICRO_COMPLEX_MMA_UPDATE_RHS(size);
+
+#define MICRO_COMPLEX_MMA_ONE_PEEL MICRO_COMPLEX_MMA_UNROLL_TYPE2(MICRO_COMPLEX_MMA_UNROLL_TYPE_PEEL2, PEEL_COMPLEX_MMA)
+#endif
+
+#define MICRO_COMPLEX_MMA_ONE MICRO_COMPLEX_MMA_UNROLL_TYPE(MICRO_COMPLEX_MMA_UNROLL_TYPE_ONE, 1)
+
+#define MICRO_COMPLEX_MMA_DST_PTR_ONE(iter) \
+  if (unroll_factor * accItr > iter) {      \
+    bsetzeroMMA(&accReal##iter);            \
+    bsetzeroMMA(&accImag##iter);            \
+  } else {                                  \
+    EIGEN_UNUSED_VARIABLE(accReal##iter);   \
+    EIGEN_UNUSED_VARIABLE(accImag##iter);   \
+  }
+
+#define MICRO_COMPLEX_MMA_DST_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_MMA_DST_PTR_ONE)
+
+#define MICRO_COMPLEX_MMA_SRC_PTR MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_SRC_PTR_ONE)
+
+#define MICRO_COMPLEX_MMA_PREFETCH MICRO_COMPLEX_MMA_UNROLL(MICRO_COMPLEX_PREFETCH_ONE)
+
+#define MICRO_COMPLEX_MMA_STORE_ONE(iter, left, right)                                                                 \
+  if (unroll_factor > left) {                                                                                          \
+    storeComplexAccumulator<DataMapper, Packet, Packetc, accCols, (unroll_factor != (left + 1)) ? accCols : accCols2>( \
+        row + left * accCols, res##right, pAlphaReal, pAlphaImag, pMask, &accReal##iter, &accImag##iter);              \
+  }
+
+#define MICRO_COMPLEX_MMA_ITER_UNROLL(func)                 \
+  if (accItr == 1) {                                        \
+    func(0, 0, 0) func(1, 1, 0) func(2, 2, 0) func(3, 3, 0) \
+  } else if (accItr == 2) {                                 \
+    func(0, 0, 0) func(1, 0, 1) func(2, 1, 0) func(3, 1, 1) \
+  } else {                                                  \
+    func(0, 0, 0) func(1, 0, 1) func(2, 0, 2) func(3, 0, 3) \
+  }
+
+#define MICRO_COMPLEX_MMA_STORE MICRO_COMPLEX_MMA_ITER_UNROLL(MICRO_COMPLEX_MMA_STORE_ONE)
+
+#define MICRO_COMPLEX_MMA_EXTRA_ROWS(right)                                                                            \
+  gemm_complex_extra_row<Scalar, Packet, Packetc, DataMapper, accRows, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, \
+                         RhsIsReal>(res3##right, blockA, rhs_base + right * accRows * (RhsIsReal ? 1 : 2) * strideB,   \
+                                    depth, strideA, offsetA, strideB, row, rows, remaining_rows, pAlphaReal,           \
+                                    pAlphaImag, pMask);
+
+#define MICRO_COMPLEX_MMA_EXTRA_ROWS1(val, right) MICRO_COMPLEX_MMA_EXTRA_ROWS(right);
+
+template <int unroll_factor, typename Scalar, typename Packet, typename Packetc, typename RhsPacket,
+          typename DataMapper, const Index accRows, const Index accCols, const Index accCols2, bool ConjugateLhs,
+          bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal, const Index accItr>
+EIGEN_ALWAYS_INLINE void gemm_complex_unrolled_MMA_iteration(const DataMapper& res0, const DataMapper& res1,
+                                                             const DataMapper& res2, const DataMapper& res3,
+                                                             const Scalar* lhs_base, const Scalar* rhs_base,
+                                                             Index depth, Index strideA, Index offsetA, Index strideB,
+                                                             Index& row, const Packet& pAlphaReal,
+                                                             const Packet& pAlphaImag, const Packet& pMask) {
+  const Scalar *rhs_ptr_real0 = rhs_base, *rhs_ptr_real1 = NULL, *rhs_ptr_real2 = NULL, *rhs_ptr_real3 = NULL;
+  const Scalar *rhs_ptr_imag0 = NULL, *rhs_ptr_imag1 = NULL, *rhs_ptr_imag2 = NULL, *rhs_ptr_imag3 = NULL;
+  const Index imag_delta = accCols * strideA;
+  const Index imag_delta2 = accCols2 * strideA;
+
+  if (!RhsIsReal) {
+    rhs_ptr_imag0 = rhs_base + accRows * strideB;
+  } else {
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag0);
+  }
+  if (accItr > 1) {
+    if (!RhsIsReal) {
+      rhs_ptr_real1 = rhs_base + (2 * accRows * strideB);
+      rhs_ptr_imag1 = rhs_base + (3 * accRows * strideB);
+    } else {
+      rhs_ptr_real1 = rhs_base + accRows * strideB;
+      EIGEN_UNUSED_VARIABLE(rhs_ptr_imag1);
+    }
+  } else {
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_real1);
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag1);
+    EIGEN_UNUSED_VARIABLE(res1);
+  }
+  if (accItr > 2) {
+    if (!RhsIsReal) {
+      rhs_ptr_real2 = rhs_base + (4 * accRows * strideB);
+      rhs_ptr_imag2 = rhs_base + (5 * accRows * strideB);
+      rhs_ptr_real3 = rhs_base + (6 * accRows * strideB);
+      rhs_ptr_imag3 = rhs_base + (7 * accRows * strideB);
+    } else {
+      rhs_ptr_real2 = rhs_base + (2 * accRows * strideB);
+      rhs_ptr_real3 = rhs_base + (3 * accRows * strideB);
+      EIGEN_UNUSED_VARIABLE(rhs_ptr_imag2);
+      EIGEN_UNUSED_VARIABLE(rhs_ptr_imag3);
+    }
+  } else {
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_real2);
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_real3);
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag2);
+    EIGEN_UNUSED_VARIABLE(rhs_ptr_imag3);
+    EIGEN_UNUSED_VARIABLE(res2);
+    EIGEN_UNUSED_VARIABLE(res3);
+  }
+  const Scalar *lhs_ptr_real0 = NULL, *lhs_ptr_real1 = NULL;
+  const Scalar *lhs_ptr_real2 = NULL, *lhs_ptr_real3 = NULL;
+  __vector_quad accReal0, accImag0, accReal1, accImag1, accReal2, accImag2, accReal3, accImag3;
+
+  MICRO_COMPLEX_MMA_SRC_PTR
+  MICRO_COMPLEX_MMA_DST_PTR
+
+  Index k = 0, depth2 = depth - PEEL_COMPLEX_MMA;
+  for (; k <= depth2; k += PEEL_COMPLEX_MMA) {
+    EIGEN_POWER_PREFETCH(rhs_ptr_real);
+    if (!RhsIsReal) {
+      EIGEN_POWER_PREFETCH(rhs_ptr_imag);
+    }
+    MICRO_COMPLEX_MMA_PREFETCH
+    MICRO_COMPLEX_MMA_ONE_PEEL
+  }
+  for (; k < depth; k++) {
+    MICRO_COMPLEX_MMA_ONE
+  }
+  MICRO_COMPLEX_MMA_STORE
+
+  MICRO_COMPLEX_UPDATE
+}
+
+#define MICRO_COMPLEX_MMA_UNROLL_ITER2(N, M)                                                                           \
+  gemm_complex_unrolled_MMA_iteration<N + (M ? 1 : 0), Scalar, Packet, Packetc, RhsPacket, DataMapper, accRows,        \
+                                      accCols, M ? M : accCols, ConjugateLhs, ConjugateRhs, LhsIsReal, RhsIsReal,      \
+                                      accItr>(res30, res31, res32, res33, lhs_base, rhs_base, depth, strideA, offsetA, \
+                                              strideB, row, pAlphaReal, pAlphaImag, pMask);                            \
+  if (M) return;
+
+#define MICRO_COMPLEX_MMA_ROWS(n)         \
+  while (row + n * accCols <= rows) {     \
+    MICRO_COMPLEX_MMA_UNROLL_ITER2(n, 0); \
+  }
+
+template <typename Scalar, typename Packet, typename Packetc, typename RhsPacket, typename DataMapper,
+          const Index accRows, const Index accCols, bool ConjugateLhs, bool ConjugateRhs, bool LhsIsReal,
+          bool RhsIsReal, const Index accItr>
+EIGEN_ALWAYS_INLINE void gemmMMA_complex_cols(const DataMapper& res, const Scalar* blockA, const Scalar* blockB,
+                                              Index depth, Index strideA, Index offsetA, Index strideB, Index offsetB,
+                                              Index col, Index rows, Index remaining_rows, const Packet& pAlphaReal,
+                                              const Packet& pAlphaImag, const Packet& pMask) {
+  const DataMapper res30 = res.getSubMapper(0, col);
+  const DataMapper res31 = (accItr > 1) ? res30.getSubMapper(0, accRows * 1) : res30;
+  const DataMapper res32 = (accItr > 2) ? res30.getSubMapper(0, accRows * 2) : res30;
+  const DataMapper res33 = (accItr > 2) ? res30.getSubMapper(0, accRows * 3) : res30;
+
+  const Scalar* rhs_base = blockB + advanceCols * col * strideB + accRows * offsetB;
+  const Scalar* lhs_base = blockA + accCols * offsetA;
+  Index row = 0;
+
+#define MAX_COMPLEX_MMA_UNROLL 4
+
+#if MAX_COMPLEX_MMA_UNROLL < 2
+  if (1) {
+#elif MAX_COMPLEX_MMA_UNROLL < 4
+  if (accItr <= 2) {
+#else
+  if (accItr == 1) {
+#endif
+    MICRO_COMPLEX_MMA_ROWS(MAX_COMPLEX_MMA_UNROLL);
+  } else if (accItr == 2) {
+    MICRO_COMPLEX_MMA_ROWS(2);
+  } else {
+    MICRO_COMPLEX_MMA_ROWS(1);
+  }
+  switch ((rows - row) / accCols) {
+#if MAX_COMPLEX_MMA_UNROLL > 3
+    case 3:
+      if (accItr == 1) {
+        MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 3)
+      }
+      break;
+#endif
+#if MAX_COMPLEX_MMA_UNROLL > 2
+    case 2:
+      if (accItr == 1) {
+        MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 2)
+      }
+      break;
+#endif
+#if MAX_COMPLEX_MMA_UNROLL > 1
+    case 1:
+      if (accItr <= 2) {
+        MICRO_COMPLEX_UNROLL_ITER(MICRO_COMPLEX_MMA_UNROLL_ITER2, 1)
+      }
+      break;
+#endif
+    default:
+      break;
+  }
+#undef MAX_COMPLEX_MMA_UNROLL
+
+  if (remaining_rows > 0) {
+    MICRO_MMA_UNROLL_ITER(MICRO_COMPLEX_MMA_EXTRA_ROWS1, 0)
+  }
+}
+
+#define MICRO_COMPLEX_MMA_COLS(n)                                                                                      \
+  for (; col + n * accRows <= cols; col += n * accRows) {                                                              \
+    gemmMMA_complex_cols<Scalar, Packet, Packetc, RhsPacket2, DataMapper, accRows, accCols, ConjugateLhs,              \
+                         ConjugateRhs, LhsIsReal, RhsIsReal, n>(res, blockA, blockB, depth, strideA, offsetA, strideB, \
+                                                                offsetB, col, rows, remaining_rows, pAlphaReal,        \
+                                                                pAlphaImag, pMask);                                    \
+  }
+
+template <typename LhsScalar, typename RhsScalar, typename Scalarc, typename Scalar, typename Packet, typename Packetc,
+          typename RhsPacket, typename DataMapper, const Index accRows, const Index accCols, bool ConjugateLhs,
+          bool ConjugateRhs, bool LhsIsReal, bool RhsIsReal>
+void gemm_complexMMA(const DataMapper& res, const LhsScalar* blockAc, const RhsScalar* blockBc, Index rows, Index depth,
+                     Index cols, Scalarc alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  const Index remaining_rows = rows % accCols;
+
+  if (strideA == -1) strideA = depth;
+  if (strideB == -1) strideB = depth;
+
+  const Packet pAlphaReal = pset1<Packet>(alpha.real());
+  const Packet pAlphaImag = pset1<Packet>(alpha.imag());
+  const Packet pMask = bmask<Packet>(remaining_rows);
+
+  const Scalar* blockA = (Scalar*)blockAc;
+  const Scalar* blockB = (Scalar*)blockBc;
+
+  typedef typename std::conditional_t<(sizeof(Scalar) == sizeof(float)), RhsPacket, __vector_pair> RhsPacket2;
+
+  Index col = 0;
+#ifdef GEMM_MULTIPLE_COLS
+  MICRO_COMPLEX_MMA_COLS(4);
+  MICRO_COMPLEX_MMA_COLS(2);
+#endif
+  MICRO_COMPLEX_MMA_COLS(1);
+
+  if (col != cols) {
+    gemm_complex_extra_cols<Scalar, Packet, Packetc, DataMapper, accCols, ConjugateLhs, ConjugateRhs, LhsIsReal,
+                            RhsIsReal>(res, blockA, blockB, depth, strideA, offsetA, strideB, offsetB, col, rows, cols,
+                                       remaining_rows, pAlphaReal, pAlphaImag, pMask);
+  }
+}
+
+#undef accColsC
+#undef advanceRows
+#undef advanceCols
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#if defined(EIGEN_ALTIVEC_MMA_DYNAMIC_DISPATCH)
+#pragma GCC pop_options
+#endif
+
+#endif  // EIGEN_MATRIX_PRODUCT_MMA_ALTIVEC_H
diff --git a/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h b/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
new file mode 100644
index 00000000..6ecec0e6
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AltiVec/MatrixProductMMAbfloat16.h
@@ -0,0 +1,742 @@
+#ifndef EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
+#define EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
+
+#if EIGEN_COMP_LLVM
+#define BFLOAT16_UNROLL _Pragma("unroll 8")
+#else
+#define BFLOAT16_UNROLL _Pragma("GCC unroll(8)")
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+template <bool zero>
+EIGEN_ALWAYS_INLINE Packet8bf loadBfloat16(const bfloat16* indexA) {
+  Packet8bf lhs1 = ploadu<Packet8bf>(indexA);
+  if (zero) {
+    Packet8bf lhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
+    return vec_mergeh(lhs1.m_val, lhs2.m_val);
+  } else {
+    return lhs1;
+  }
+}
+
+template <bool zero>
+EIGEN_ALWAYS_INLINE Packet8bf loadRhsBfloat16(const bfloat16* blockB, Index strideB, Index i) {
+  return loadBfloat16<zero>(blockB + strideB * i);
+}
+
+template <Index num_acc, Index num_packets, bool zero, bool rhsExtraCols, bool lhsExtraRows, Index num_rhs,
+          Index num_lhs>
+EIGEN_ALWAYS_INLINE void KLoop(const bfloat16* indexA, const bfloat16* indexB, __vector_quad (&quad_acc)[num_acc],
+                               Index strideB, Index k, Index offsetB, Index extra_cols, Index extra_rows) {
+  Packet8bf lhs[num_lhs], rhs[num_rhs];
+
+  BFLOAT16_UNROLL
+  for (Index i = 0; i < (num_rhs - (rhsExtraCols ? 1 : 0)); i++) {
+    rhs[i] = loadRhsBfloat16<zero>(indexB + k * 4, strideB, i);
+  }
+  if (rhsExtraCols) {
+    rhs[num_rhs - 1] = loadRhsBfloat16<zero>(indexB + k * extra_cols - offsetB, strideB, num_rhs - 1);
+  }
+
+  indexA += k * (lhsExtraRows ? extra_rows : num_packets);
+  if (num_lhs == 1) {
+    lhs[0] = loadBfloat16<zero>(indexA);
+  } else {
+    BFLOAT16_UNROLL
+    for (Index j = 0; j < num_lhs; j += 2) {
+      Packet8bf lhs1 = ploadu<Packet8bf>(indexA + (j + 0) * (zero ? 4 : 8));
+      if (zero) {
+        Packet8bf lhs2 = pset1<Packet8bf>(Eigen::bfloat16(0));
+        lhs[j + 0] = vec_mergeh(lhs1.m_val, lhs2.m_val);
+        lhs[j + 1] = vec_mergel(lhs1.m_val, lhs2.m_val);
+      } else {
+        lhs[j + 0] = lhs1;
+        lhs[j + 1] = ploadu<Packet8bf>(indexA + (j + 1) * 8);
+      }
+    }
+  }
+
+  BFLOAT16_UNROLL
+  for (Index i = 0, x = 0; i < num_rhs; i++) {
+    BFLOAT16_UNROLL
+    for (Index j = 0; j < num_lhs; j++, x++) {
+      __builtin_mma_xvbf16ger2pp(&(quad_acc[x]), reinterpret_cast<Packet16uc>(rhs[i].m_val),
+                                 reinterpret_cast<Packet16uc>(lhs[j].m_val));
+    }
+  }
+}
+
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void zeroAccumulators(__vector_quad (&quad_acc)[num_acc]) {
+  BFLOAT16_UNROLL
+  for (Index k = 0; k < num_acc; k++) __builtin_mma_xxsetaccz(&(quad_acc[k]));
+}
+
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void disassembleAccumulators(__vector_quad (&quad_acc)[num_acc], Packet4f (&acc)[num_acc][4]) {
+  BFLOAT16_UNROLL
+  for (Index k = 0; k < num_acc; k++) __builtin_mma_disassemble_acc((void*)acc[k], &(quad_acc[k]));
+}
+
+template <Index num_acc, bool rhsExtraCols, bool lhsExtraRows, Index num_rhs, Index num_lhs>
+EIGEN_ALWAYS_INLINE void outputResults(Packet4f (&acc)[num_acc][4], Index rows, const Packet4f pAlpha, float* result,
+                                       const Index extra_cols, Index extra_rows) {
+  BFLOAT16_UNROLL
+  for (Index i = 0, k = 0; i < num_rhs - (rhsExtraCols ? 1 : 0); i++, result += 4 * rows) {
+    BFLOAT16_UNROLL
+    for (Index j = 0; j < num_lhs; j++, k++) {
+      storeResults<false, lhsExtraRows>(acc[k], rows, pAlpha, result + j * 4, extra_cols, extra_rows);
+    }
+  }
+  if (rhsExtraCols) {
+    storeResults<rhsExtraCols, lhsExtraRows>(acc[num_acc - 1], rows, pAlpha, result, extra_cols, extra_rows);
+  }
+}
+
+template <const Index num_acc, const Index num_packets, bool rhsExtraCols, bool lhsExtraRows, bool multiIter = false>
+EIGEN_ALWAYS_INLINE void colLoopBodyIter(Index depth, Index rows, const Packet4f pAlpha, const bfloat16* indexA,
+                                         const bfloat16* indexB, Index strideB, Index offsetB, float* result,
+                                         const Index extra_cols, const Index extra_rows) {
+  constexpr Index num_lhs = multiIter ? (num_packets / 4) : 1;
+  constexpr Index num_rhs = (num_acc + num_lhs - 1) / num_lhs;
+
+  for (Index offset_row = 0; offset_row < num_packets; offset_row += 4, indexA += (multiIter ? 0 : 8),
+             indexB += (multiIter ? (num_rhs * strideB) : 0), result += (multiIter ? (4 * rows * num_rhs) : 4)) {
+    Packet4f acc[num_acc][4];
+    __vector_quad quad_acc[num_acc];
+
+    zeroAccumulators<num_acc>(quad_acc);
+
+    Index k;
+    for (k = 0; k + 2 <= depth; k += 2) {
+      KLoop<num_acc, num_packets, false, rhsExtraCols, lhsExtraRows, num_rhs, num_lhs>(
+          indexA, indexB, quad_acc, strideB, k, offsetB, extra_cols, extra_rows);
+    }
+    if (depth & 1) {
+      KLoop<num_acc, num_packets, true, rhsExtraCols, lhsExtraRows, num_rhs, num_lhs>(
+          indexA - (multiIter ? 0 : offset_row), indexB, quad_acc, strideB, k, offsetB, extra_cols, extra_rows);
+    }
+
+    disassembleAccumulators<num_acc>(quad_acc, acc);
+
+    outputResults<num_acc, rhsExtraCols, lhsExtraRows, num_rhs, num_lhs>(acc, rows, pAlpha, result, extra_cols,
+                                                                         extra_rows);
+  }
+}
+
+#define MAX_BFLOAT16_ACC 8
+
+template <const Index num_acc, const Index num_packets, bool rhsExtraCols, bool lhsExtraRows>
+void colLoopBody(Index& col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA,
+                 const bfloat16* indexB, Index strideB, Index offsetB, float* result) {
+  constexpr Index step = (num_acc * 4);  // each accumulator has 4 elements
+  const Index extra_cols = (rhsExtraCols) ? (cols & 3) : 0;
+  const Index extra_rows = (lhsExtraRows) ? (rows & 3) : 0;
+  constexpr bool multiIters = !rhsExtraCols && (num_acc == MAX_BFLOAT16_ACC);
+  constexpr bool normIters = multiIters && ((num_acc % (num_packets / 4)) == 0);
+
+  do {
+    colLoopBodyIter<num_acc, num_packets, rhsExtraCols, lhsExtraRows, normIters>(
+        depth, rows, pAlpha, indexA, indexB, strideB, offsetB, result, extra_cols, extra_rows);
+
+    indexB += strideB * num_acc;
+    result += rows * step;
+  } while (multiIters && (step <= cols - (col += step)));
+}
+
+template <const Index num_acc, const Index num_packets, bool rhsExtraCols, bool lhsExtraRows>
+EIGEN_ALWAYS_INLINE void colLoopBodyExtraN(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha,
+                                           const bfloat16* indexA, const bfloat16* blockB, Index strideB, Index offsetB,
+                                           float* result) {
+  if (MAX_BFLOAT16_ACC > num_acc) {
+    colLoopBody<num_acc + (rhsExtraCols ? 1 : 0), num_packets, rhsExtraCols, lhsExtraRows>(
+        col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB, result);
+  }
+}
+
+template <const Index num_packets, bool rhsExtraCols, bool lhsExtraRows>
+void colLoopBodyExtra(Index col, Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA,
+                      const bfloat16* blockB, Index strideB, Index offsetB, float* result) {
+  switch ((cols - col) >> 2) {
+    case 7:
+      colLoopBodyExtraN<7, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+                                                                    strideB, offsetB, result);
+      break;
+    case 6:
+      colLoopBodyExtraN<6, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+                                                                    strideB, offsetB, result);
+      break;
+    case 5:
+      colLoopBodyExtraN<5, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+                                                                    strideB, offsetB, result);
+      break;
+    case 4:
+      colLoopBodyExtraN<4, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+                                                                    strideB, offsetB, result);
+      break;
+    case 3:
+      colLoopBodyExtraN<3, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+                                                                    strideB, offsetB, result);
+      break;
+    case 2:
+      colLoopBodyExtraN<2, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+                                                                    strideB, offsetB, result);
+      break;
+    case 1:
+      colLoopBodyExtraN<1, num_packets, rhsExtraCols, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+                                                                    strideB, offsetB, result);
+      break;
+    default:
+      if (rhsExtraCols) {
+        colLoopBody<1, num_packets, true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB,
+                                                        offsetB, result);
+      }
+      break;
+  }
+}
+
+template <const Index num_packets, bool lhsExtraRows = false>
+EIGEN_ALWAYS_INLINE void colLoops(Index depth, Index cols, Index rows, const Packet4f pAlpha, const bfloat16* indexA,
+                                  const bfloat16* blockB, Index strideB, Index offsetB, float* result) {
+  Index col = 0;
+  if (cols >= (MAX_BFLOAT16_ACC * 4)) {
+    colLoopBody<MAX_BFLOAT16_ACC, num_packets, false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB,
+                                                                    strideB, 0, result);
+    blockB += (strideB >> 2) * col;
+    result += rows * col;
+  }
+  if (cols & 3) {
+    colLoopBodyExtra<num_packets, true, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, offsetB,
+                                                      result);
+  } else {
+    colLoopBodyExtra<num_packets, false, lhsExtraRows>(col, depth, cols, rows, pAlpha, indexA, blockB, strideB, 0,
+                                                       result);
+  }
+}
+
+EIGEN_ALWAYS_INLINE Packet8bf convertF32toBF16(const float* res) {
+  Packet16uc fp16[2];
+  __vector_pair fp16_vp = *reinterpret_cast<__vector_pair*>(const_cast<float*>(res));
+  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(fp16), &fp16_vp);
+  fp16[0] = __builtin_vsx_xvcvspbf16(fp16[0]);
+  fp16[1] = __builtin_vsx_xvcvspbf16(fp16[1]);
+  return vec_pack(reinterpret_cast<Packet4ui>(fp16[0]), reinterpret_cast<Packet4ui>(fp16[1]));
+}
+
+template <typename DataMapper, const Index size>
+EIGEN_ALWAYS_INLINE void convertArrayF32toBF16Col(float* result, Index col, Index rows, const DataMapper& res) {
+  const DataMapper res2 = res.getSubMapper(0, col);
+  Index row;
+  float* result2 = result + col * rows;
+  for (row = 0; row + 8 <= rows; row += 8, result2 += 8) {
+    // get and save block
+    PacketBlock<Packet8bf, size> block;
+    BFLOAT16_UNROLL
+    for (Index j = 0; j < size; j++) {
+      block.packet[j] = convertF32toBF16(result2 + j * rows);
+    }
+    res2.template storePacketBlock<Packet8bf, size>(row, 0, block);
+  }
+  // extra rows
+  if (row < rows) {
+    BFLOAT16_UNROLL
+    for (Index j = 0; j < size; j++) {
+      Packet8bf fp16 = convertF32toBF16(result2 + j * rows);
+      res2.template storePacketPartial<Packet8bf>(row, j, fp16, rows & 7);
+    }
+  }
+}
+
+template <const Index size, bool non_unit_stride = false>
+EIGEN_ALWAYS_INLINE void convertPointerF32toBF16(Index& i, float* result, Index rows, bfloat16*& dst,
+                                                 Index resInc = 1) {
+  constexpr Index extra = ((size < 8) ? 8 : size);
+  while (i + size <= rows) {
+    PacketBlock<Packet8bf, (size + 7) / 8> r32;
+    r32.packet[0] = convertF32toBF16(result + i + 0);
+    if (size >= 16) {
+      r32.packet[1] = convertF32toBF16(result + i + 8);
+    }
+    if (size >= 32) {
+      r32.packet[2] = convertF32toBF16(result + i + 16);
+      r32.packet[3] = convertF32toBF16(result + i + 24);
+    }
+    storeBF16fromResult<size, non_unit_stride, 0>(dst, r32.packet[0], resInc, rows & 7);
+    if (size >= 16) {
+      storeBF16fromResult<size, non_unit_stride, 8>(dst, r32.packet[1], resInc);
+    }
+    if (size >= 32) {
+      storeBF16fromResult<size, non_unit_stride, 16>(dst, r32.packet[2], resInc);
+      storeBF16fromResult<size, non_unit_stride, 24>(dst, r32.packet[3], resInc);
+    }
+    i += extra;
+    dst += extra * resInc;
+    if (size != 32) break;
+  }
+}
+
+template <bool non_unit_stride = false>
+EIGEN_ALWAYS_INLINE void convertArrayPointerF32toBF16(float* result, Index rows, bfloat16* dst, Index resInc = 1) {
+  Index i = 0;
+  convertPointerF32toBF16<32, non_unit_stride>(i, result, rows, dst, resInc);
+  convertPointerF32toBF16<16, non_unit_stride>(i, result, rows, dst, resInc);
+  convertPointerF32toBF16<8, non_unit_stride>(i, result, rows, dst, resInc);
+  convertPointerF32toBF16<1, non_unit_stride>(i, result, rows, dst, resInc);
+}
+
+template <typename DataMapper>
+EIGEN_ALWAYS_INLINE void convertArrayF32toBF16(float* result, Index cols, Index rows, const DataMapper& res) {
+  Index col;
+  for (col = 0; col + 4 <= cols; col += 4) {
+    convertArrayF32toBF16Col<DataMapper, 4>(result, col, rows, res);
+  }
+  // extra cols
+  switch (cols - col) {
+    case 1:
+      convertArrayF32toBF16Col<DataMapper, 1>(result, col, rows, res);
+      break;
+    case 2:
+      convertArrayF32toBF16Col<DataMapper, 2>(result, col, rows, res);
+      break;
+    case 3:
+      convertArrayF32toBF16Col<DataMapper, 3>(result, col, rows, res);
+      break;
+  }
+}
+
+template <Index size>
+EIGEN_ALWAYS_INLINE void calcColLoops(const bfloat16*& indexA, Index& row, Index depth, Index cols, Index rows,
+                                      const Packet4f pAlpha, const bfloat16* indexB, Index strideB, Index offsetA,
+                                      Index offsetB, Index bigSuffix, float* result) {
+  if ((size == 16) || (rows & size)) {
+    indexA += size * offsetA;
+    colLoops<size>(depth, cols, rows, pAlpha, indexA, indexB, strideB, offsetB, result + row);
+    row += size;
+    indexA += bigSuffix * size / 16;
+  }
+}
+
+template <typename DataMapper>
+void gemmMMAbfloat16(const DataMapper& res, const bfloat16* indexA, const bfloat16* indexB, Index rows, Index depth,
+                     Index cols, bfloat16 alpha, Index strideA, Index strideB, Index offsetA, Index offsetB) {
+  float falpha = Eigen::bfloat16_impl::bfloat16_to_float(alpha);
+  const Packet4f pAlpha = pset1<Packet4f>(falpha);
+  ei_declare_aligned_stack_constructed_variable(float, result, cols* rows, 0);
+
+  convertArrayBF16toF32<DataMapper>(result, cols, rows, res);
+
+  if (strideA == -1) strideA = depth;
+  if (strideB == -1) strideB = depth;
+  // Packing is done in blocks.
+  // There's 4 possible sizes of blocks
+  // Blocks of 8 columns with 16 elements (8x16)
+  // Blocks of 8 columns with 8 elements (8x8). This happens when there's 16 > rows >= 8
+  // Blocks of 8 columns with 4 elements (8x4). This happens when there's 8 > rows >= 4
+  // Blocks of 8 columns with < 4 elements. This happens when there's less than 4 remaining rows
+
+  // Loop for LHS standard block (8x16)
+  Index bigSuffix = (2 * 8) * (strideA - offsetA);
+  indexB += 4 * offsetB;
+  strideB *= 4;
+  offsetB *= 3;
+
+  Index row = 0;
+  while (row + 16 <= rows) {
+    calcColLoops<16>(indexA, row, depth, cols, rows, pAlpha, indexB, strideB, offsetA, offsetB, bigSuffix, result);
+  }
+  // LHS (8x8) block
+  calcColLoops<8>(indexA, row, depth, cols, rows, pAlpha, indexB, strideB, offsetA, offsetB, bigSuffix, result);
+  // LHS (8x4) block
+  calcColLoops<4>(indexA, row, depth, cols, rows, pAlpha, indexB, strideB, offsetA, offsetB, bigSuffix, result);
+  // extra rows
+  if (rows & 3) {
+    // This index is the beginning of remaining block.
+    colLoops<4, true>(depth, cols, rows, pAlpha, indexA, indexB, strideB, offsetB, result + row);
+  }
+
+  // Convert back to bfloat16
+  convertArrayF32toBF16<DataMapper>(result, cols, rows, res);
+}
+
+#undef MAX_BFLOAT16_ACC
+
+#if !EIGEN_ALTIVEC_DISABLE_MMA
+template <Index num_acc, typename LhsMapper, bool zero>
+EIGEN_ALWAYS_INLINE void loadVecLoop(Index k, LhsMapper& lhs, Packet8bf (&a0)[num_acc], Packet8bf b1) {
+  a0[k + 0] = lhs.template loadPacket<Packet8bf>(k * 4, 0);
+  if (!zero) {
+    b1 = lhs.template loadPacket<Packet8bf>(k * 4, 1);
+  }
+  if (num_acc > (k + 1)) {
+    a0[k + 1] = vec_mergel(a0[k + 0].m_val, b1.m_val);
+  }
+  a0[k + 0] = vec_mergeh(a0[k + 0].m_val, b1.m_val);
+}
+
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void multVec(__vector_quad (&quad_acc)[num_acc], Packet8bf (&a0)[num_acc], Packet8bf b0) {
+  BFLOAT16_UNROLL
+  for (Index k = 0; k < num_acc; k++) {
+    __builtin_mma_xvbf16ger2pp(&(quad_acc[k]), reinterpret_cast<Packet16uc>(b0.m_val),
+                               reinterpret_cast<Packet16uc>(a0[k].m_val));
+  }
+}
+
+template <Index num_acc, typename LhsMapper, typename RhsMapper, bool zero, bool linear>
+EIGEN_ALWAYS_INLINE void vecColLoop(Index j, LhsMapper& lhs, RhsMapper& rhs, __vector_quad (&quad_acc)[num_acc]) {
+  Packet8bf a0[num_acc];
+  Packet8bf b1 = pset1<Packet8bf>(Eigen::bfloat16(0));
+  Packet8bf b0 = loadColData<RhsMapper, linear>(rhs, j);
+
+  if (zero) {
+    b0 = vec_mergeh(b0.m_val, b1.m_val);
+  }
+
+  using LhsSubMapper = typename LhsMapper::SubMapper;
+
+  LhsSubMapper lhs2 = lhs.getSubMapper(0, j);
+  BFLOAT16_UNROLL
+  for (Index k = 0; k < num_acc; k += 2) {
+    loadVecLoop<num_acc, LhsSubMapper, zero>(k, lhs2, a0, b1);
+  }
+
+  multVec<num_acc>(quad_acc, a0, b0);
+}
+
+#define MAX_BFLOAT16_VEC_ACC 8
+
+template <const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
+void colVecColLoopBody(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+                       float* result) {
+  constexpr Index step = (num_acc * 4);
+  const Index extra_rows = (extraRows) ? (rows & 3) : 0;
+  constexpr bool multiIters = !extraRows && (num_acc == MAX_BFLOAT16_VEC_ACC);
+
+  do {
+    Packet4f acc[num_acc][4];
+    __vector_quad quad_acc[num_acc];
+
+    zeroAccumulators<num_acc>(quad_acc);
+
+    using LhsSubMapper = typename LhsMapper::SubMapper;
+
+    LhsSubMapper lhs2 = lhs.getSubMapper(row, 0);
+    for (Index j = 0; j + 2 <= cend; j += 2) {
+      vecColLoop<num_acc, LhsSubMapper, RhsMapper, false, linear>(j, lhs2, rhs, quad_acc);
+    }
+    if (cend & 1) {
+      vecColLoop<num_acc, LhsSubMapper, RhsMapper, true, linear>(cend - 1, lhs2, rhs, quad_acc);
+    }
+
+    disassembleAccumulators<num_acc>(quad_acc, acc);
+
+    outputVecColResults<num_acc, extraRows>(acc, result, pAlpha, extra_rows);
+
+    result += step;
+  } while (multiIters && (step <= rows - (row += step)));
+}
+
+template <const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
+EIGEN_ALWAYS_INLINE void colVecColLoopBodyExtraN(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+                                                 const Packet4f pAlpha, float* result) {
+  if (MAX_BFLOAT16_VEC_ACC > num_acc) {
+    colVecColLoopBody<num_acc + (extraRows ? 1 : 0), LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs,
+                                                                                              pAlpha, result);
+  }
+}
+
+template <typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
+EIGEN_ALWAYS_INLINE void colVecColLoopBodyExtra(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+                                                const Packet4f pAlpha, float* result) {
+  switch ((rows - row) >> 2) {
+    case 7:
+      colVecColLoopBodyExtraN<7, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 6:
+      colVecColLoopBodyExtraN<6, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 5:
+      colVecColLoopBodyExtraN<5, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 4:
+      colVecColLoopBodyExtraN<4, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 3:
+      colVecColLoopBodyExtraN<3, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 2:
+      colVecColLoopBodyExtraN<2, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 1:
+      colVecColLoopBodyExtraN<1, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    default:
+      if (extraRows) {
+        colVecColLoopBody<1, LhsMapper, RhsMapper, true, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      }
+      break;
+  }
+}
+
+template <typename LhsMapper, typename RhsMapper, bool linear>
+EIGEN_ALWAYS_INLINE void calcVecColLoops(Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+                                         float* result) {
+  Index row = 0;
+  if (rows >= (MAX_BFLOAT16_VEC_ACC * 4)) {
+    colVecColLoopBody<MAX_BFLOAT16_VEC_ACC, LhsMapper, RhsMapper, false, linear>(row, cend, rows, lhs, rhs, pAlpha,
+                                                                                 result);
+    result += row;
+  }
+  if (rows & 3) {
+    colVecColLoopBodyExtra<LhsMapper, RhsMapper, true, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+  } else {
+    colVecColLoopBodyExtra<LhsMapper, RhsMapper, false, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+  }
+}
+
+template <typename RhsMapper, typename LhsMapper, typename = void>
+struct UseMMAStride : std::false_type {
+  static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha,
+                                      float* result) {
+    using RhsSubMapper = typename RhsMapper::SubMapper;
+
+    RhsSubMapper rhs2 = rhs.getSubMapper(j2, 0);
+    calcVecColLoops<LhsMapper, RhsSubMapper, false>(jend - j2, rows, lhs, rhs2, pAlpha, result);
+  }
+};
+
+template <typename RhsMapper, typename LhsMapper>
+struct UseMMAStride<RhsMapper, LhsMapper,
+                    std::enable_if_t<std::is_member_function_pointer<decltype(&RhsMapper::stride)>::value>>
+    : std::true_type {
+  static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha,
+                                      float* result) {
+    using RhsSubMapper = typename RhsMapper::SubMapper;
+
+    RhsSubMapper rhs2 = rhs.getSubMapper(j2, 0);
+    if (rhs.stride() == 1) {
+      calcVecColLoops<LhsMapper, RhsSubMapper, true>(jend - j2, rows, lhs, rhs2, pAlpha, result);
+    } else {
+      calcVecColLoops<LhsMapper, RhsSubMapper, false>(jend - j2, rows, lhs, rhs2, pAlpha, result);
+    }
+  }
+};
+
+template <typename LhsMapper, typename RhsMapper>
+void gemvMMA_bfloat16_col(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs, bfloat16* res,
+                          Index resIncr, bfloat16 alpha) {
+  EIGEN_UNUSED_VARIABLE(resIncr);
+  eigen_internal_assert(resIncr == 1);
+
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate proper code.
+  LhsMapper lhs(alhs);
+  RhsMapper rhs2(rhs);
+
+  const Index lhsStride = lhs.stride();
+
+  // TODO: improve the following heuristic:
+  const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(bfloat16) < 16000 ? 16 : 8);
+  float falpha = Eigen::bfloat16_impl::bfloat16_to_float(alpha);
+  Packet4f pAlpha = pset1<Packet4f>(falpha);
+
+  ei_declare_aligned_stack_constructed_variable(float, result, rows, 0);
+
+  convertArrayPointerBF16toF32(result, 1, rows, res);
+
+  for (Index j2 = 0; j2 < cols; j2 += block_cols) {
+    Index jend = numext::mini(j2 + block_cols, cols);
+
+    using LhsSubMapper = typename LhsMapper::SubMapper;
+
+    LhsSubMapper lhs2 = lhs.getSubMapper(0, j2);
+    UseMMAStride<RhsMapper, LhsSubMapper>::run(j2, jend, rows, lhs2, rhs2, pAlpha, result);
+  }
+
+  convertArrayPointerF32toBF16(result, rows, res);
+}
+
+static Packet16uc p16uc_ELEMENT_VEC3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
+                                        0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f};
+
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void preduxVecResults2(Packet4f (&acc)[num_acc][4], Index k) {
+  if (num_acc > (k + 1)) {
+    acc[k][0] = vec_mergeh(acc[k][0], acc[k + 1][0]);
+    acc[k][1] = vec_mergeo(acc[k][1], acc[k + 1][1]);
+    acc[k][2] = vec_mergel(acc[k][2], acc[k + 1][2]);
+    acc[k][3] = vec_perm(acc[k][3], acc[k + 1][3], p16uc_ELEMENT_VEC3);
+
+    acc[k][0] = (acc[k][0] + acc[k][2]) + (acc[k][1] + acc[k][3]);
+  } else {
+    acc[k][0] = vec_mergeh(acc[k][0], acc[k][1]);
+    acc[k][0] += vec_mergel(acc[k][2], acc[k][3]);
+#ifdef _BIG_ENDIAN
+    acc[k][0] += vec_sld(acc[k][0], acc[k][0], 12);
+#else
+    acc[k][0] += vec_sld(acc[k][0], acc[k][0], 4);
+#endif
+  }
+}
+
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void preduxVecResults(Packet4f (&acc)[num_acc][4]) {
+  BFLOAT16_UNROLL
+  for (Index k = 0; k < num_acc; k += 4) {
+    preduxVecResults2<num_acc>(acc, k + 0);
+    if (num_acc > (k + 2)) {
+      preduxVecResults2<num_acc>(acc, k + 2);
+      acc[k + 0][0] = reinterpret_cast<Packet4f>(
+          vec_mergeh(reinterpret_cast<Packet2ul>(acc[k + 0][0]), reinterpret_cast<Packet2ul>(acc[k + 2][0])));
+    }
+  }
+}
+
+template <Index num_acc, typename LhsMapper, typename RhsMapper, bool extra>
+EIGEN_ALWAYS_INLINE void multVecLoop(__vector_quad (&quad_acc)[num_acc], const LhsMapper& lhs, RhsMapper& rhs, Index j,
+                                     Index extra_cols) {
+  Packet8bf a0[num_acc], b0;
+
+  if (extra) {
+    b0 = rhs.template loadPacketPartial<Packet8bf>(j, extra_cols);
+  } else {
+    b0 = rhs.template loadPacket<Packet8bf>(j);
+  }
+
+  const LhsMapper lhs2 = lhs.getSubMapper(0, j);
+  BFLOAT16_UNROLL
+  for (Index k = 0; k < num_acc; k++) {
+    if (extra) {
+      a0[k] = lhs2.template loadPacketPartial<Packet8bf>(k, 0, extra_cols);
+    } else {
+      a0[k] = lhs2.template loadPacket<Packet8bf>(k, 0);
+    }
+  }
+
+  multVec<num_acc>(quad_acc, a0, b0);
+}
+
+template <Index num_acc, typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void vecLoop(Index cols, const LhsMapper& lhs, RhsMapper& rhs, __vector_quad (&quad_acc)[num_acc],
+                                 Index extra_cols) {
+  Index j = 0;
+  for (; j + 8 <= cols; j += 8) {
+    multVecLoop<num_acc, LhsMapper, RhsMapper, false>(quad_acc, lhs, rhs, j, extra_cols);
+  }
+
+  if (extra_cols) {
+    multVecLoop<num_acc, LhsMapper, RhsMapper, true>(quad_acc, lhs, rhs, j, extra_cols);
+  }
+}
+
+template <const Index num_acc, typename LhsMapper, typename RhsMapper>
+void colVecLoopBody(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+                    float* result) {
+  constexpr bool multiIters = (num_acc == MAX_BFLOAT16_VEC_ACC);
+  const Index extra_cols = (cols & 7);
+
+  do {
+    Packet4f acc[num_acc][4];
+    __vector_quad quad_acc[num_acc];
+
+    zeroAccumulators<num_acc>(quad_acc);
+
+    const LhsMapper lhs2 = lhs.getSubMapper(row, 0);
+    vecLoop<num_acc, LhsMapper, RhsMapper>(cols, lhs2, rhs, quad_acc, extra_cols);
+
+    disassembleAccumulators<num_acc>(quad_acc, acc);
+
+    preduxVecResults<num_acc>(acc);
+
+    outputVecResults<num_acc>(acc, result, pAlpha);
+
+    result += num_acc;
+  } while (multiIters && (num_acc <= rows - (row += num_acc)));
+}
+
+template <const Index num_acc, typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void colVecLoopBodyExtraN(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+                                              const Packet4f pAlpha, float* result) {
+  if (MAX_BFLOAT16_VEC_ACC > num_acc) {
+    colVecLoopBody<num_acc, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+  }
+}
+
+template <typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void colVecLoopBodyExtra(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+                                             const Packet4f pAlpha, float* result) {
+  switch (rows - row) {
+    case 7:
+      colVecLoopBodyExtraN<7, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 6:
+      colVecLoopBodyExtraN<6, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 5:
+      colVecLoopBodyExtraN<5, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 4:
+      colVecLoopBodyExtraN<4, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 3:
+      colVecLoopBodyExtraN<3, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 2:
+      colVecLoopBodyExtraN<2, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 1:
+      colVecLoopBodyExtraN<1, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+  }
+}
+
+template <typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void calcVecLoops(Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+                                      float* result) {
+  Index row = 0;
+  if (rows >= MAX_BFLOAT16_VEC_ACC) {
+    colVecLoopBody<MAX_BFLOAT16_VEC_ACC, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+    result += row;
+  }
+  colVecLoopBodyExtra<LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+}
+
+template <typename LhsMapper, typename RhsMapper>
+EIGEN_STRONG_INLINE void gemvMMA_bfloat16_row(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
+                                              bfloat16* res, Index resIncr, bfloat16 alpha) {
+  typedef typename RhsMapper::LinearMapper LinearMapper;
+
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate proper code.
+  LhsMapper lhs(alhs);
+  LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
+
+  eigen_internal_assert(rhs.stride() == 1);
+
+  float falpha = Eigen::bfloat16_impl::bfloat16_to_float(alpha);
+  const Packet4f pAlpha = pset1<Packet4f>(falpha);
+
+  ei_declare_aligned_stack_constructed_variable(float, result, rows, 0);
+  if (resIncr == 1) {
+    convertArrayPointerBF16toF32(result, 1, rows, res);
+  } else {
+    convertArrayPointerBF16toF32<true>(result, 1, rows, res, resIncr);
+  }
+  calcVecLoops<LhsMapper, LinearMapper>(cols, rows, lhs, rhs2, pAlpha, result);
+  if (resIncr == 1) {
+    convertArrayPointerF32toBF16(result, rows, res);
+  } else {
+    convertArrayPointerF32toBF16<true>(result, rows, res, resIncr);
+  }
+}
+#endif
+
+#undef MAX_BFLOAT16_VEC_ACC
+#undef BFLOAT16_UNROLL
+
+}  // namespace internal
+}  // namespace Eigen
+#endif  // EIGEN_MATRIX_PRODUCT_MMA_BFLOAT16_ALTIVEC_H
diff --git a/inst/include/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc b/inst/include/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc
new file mode 100644
index 00000000..90c0d392
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AltiVec/MatrixVectorProduct.inc
@@ -0,0 +1,2818 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2021 Chip Kerchner (chip.kerchner@ibm.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H
+#define EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+#if defined(__MMA__) && !EIGEN_ALTIVEC_DISABLE_MMA
+#if EIGEN_COMP_LLVM || (__GNUC__ > 10 || __GNUC_MINOR__ >= 3)
+#define USE_GEMV_MMA
+#endif
+
+#if !EIGEN_COMP_LLVM && (__GNUC__ < 11)
+// Only allow one vector_pair in buggy gcc - gcc 10.x has a bug
+#define GCC_ONE_VECTORPAIR_BUG
+#endif
+#endif
+
+// #define USE_SLOWER_GEMV_MMA   // MMA is currently not as fast as VSX in complex double GEMV (revisit when gcc is
+// improved)
+
+// #define EIGEN_POWER_USE_GEMV_PREFETCH
+#ifdef EIGEN_POWER_USE_GEMV_PREFETCH
+#define EIGEN_POWER_GEMV_PREFETCH(p) prefetch(p)
+#else
+#define EIGEN_POWER_GEMV_PREFETCH(p)
+#endif
+
+#ifdef __has_builtin
+#if !__has_builtin(__builtin_vsx_assemble_pair)
+#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
+#endif
+#if !__has_builtin(__builtin_vsx_disassemble_pair)
+#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
+#endif
+#endif
+
+#if EIGEN_COMP_LLVM
+#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \
+  __builtin_vsx_assemble_pair(&dst, (__vector unsigned char)src2, (__vector unsigned char)src1)
+#else
+#if (__GNUC__ <= 10)
+#if (__GNUC_MINOR__ > 3)
+#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \
+  __builtin_vsx_assemble_pair(&dst, (__vector unsigned char)src2, (__vector unsigned char)src1)
+#else
+#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \
+  __builtin_vsx_assemble_pair(&dst, (__vector unsigned char)src1, (__vector unsigned char)src2)
+#endif
+#else
+#define GEMV_BUILDPAIR_MMA(dst, src1, src2) \
+  __builtin_vsx_build_pair(&dst, (__vector unsigned char)src1, (__vector unsigned char)src2)
+#endif
+#endif
+
+#define GEMV_IS_COMPLEX_COMPLEX ((sizeof(LhsPacket) == 16) && (sizeof(RhsPacket) == 16))
+#define GEMV_IS_FLOAT (ResPacketSize == (16 / sizeof(float)))
+#define GEMV_IS_SCALAR (sizeof(ResPacket) != 16)
+#define GEMV_IS_COMPLEX_FLOAT (ResPacketSize == (16 / sizeof(std::complex<float>)))
+
+/** \internal multiply and add and store results */
+template <typename ResPacket, typename ResScalar>
+EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResPacket& palpha, ResPacket& data) {
+  pstoreu(res, pmadd(data, palpha, ploadu<ResPacket>(res)));
+}
+
+template <typename ResScalar>
+EIGEN_ALWAYS_INLINE void storeMaddData(ResScalar* res, ResScalar& alpha, ResScalar& data) {
+  *res += (alpha * data);
+}
+
+#define GEMV_UNROLL(func, N) func(0, N) func(1, N) func(2, N) func(3, N) func(4, N) func(5, N) func(6, N) func(7, N)
+
+#define GEMV_UNROLL_HALF(func, N) func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) func(3, 6, 7, N)
+
+#define GEMV_GETN(N) (((N) * ResPacketSize) >> 2)
+
+#define GEMV_LOADPACKET_COL(iter) lhs.template load<LhsPacket, LhsAlignment>(i + ((iter) * LhsPacketSize), j)
+
+#ifdef USE_GEMV_MMA
+#define GEMV_UNROLL3(func, N, which)                                                                          \
+  func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) func(4, N, which) func(5, N, which) \
+      func(6, N, which) func(7, N, which)
+
+#define GEMV_UNUSED_VAR(iter, N, which) \
+  if (GEMV_GETN(N) <= iter) {           \
+    EIGEN_UNUSED_VARIABLE(which##iter); \
+  }
+
+#define GEMV_UNUSED_EXTRA_VAR(iter, N, which) \
+  if (N <= iter) {                            \
+    EIGEN_UNUSED_VARIABLE(which##iter);       \
+  }
+
+#define GEMV_UNUSED_EXTRA(N, which) GEMV_UNROLL3(GEMV_UNUSED_EXTRA_VAR, N, which)
+
+#define GEMV_UNUSED(N, which) GEMV_UNROLL3(GEMV_UNUSED_VAR, N, which)
+
+#define GEMV_INIT_MMA(iter, N)         \
+  if (GEMV_GETN(N) > iter) {           \
+    __builtin_mma_xxsetaccz(&e##iter); \
+  }
+
+#if EIGEN_COMP_LLVM
+#define GEMV_LOADPAIR_COL_MMA(iter1, iter2) \
+  GEMV_BUILDPAIR_MMA(b##iter1, GEMV_LOADPACKET_COL(iter2), GEMV_LOADPACKET_COL((iter2) + 1));
+#else
+#define GEMV_LOADPAIR_COL_MMA(iter1, iter2)                                     \
+  const LhsScalar& src##iter1 = lhs(i + ((iter1 * 32) / sizeof(LhsScalar)), j); \
+  b##iter1 = *reinterpret_cast<__vector_pair*>(const_cast<LhsScalar*>(&src##iter1));
+#endif
+
+#define GEMV_LOAD1A_COL_MMA(iter, N)         \
+  if (GEMV_GETN(N) > iter) {                 \
+    if (GEMV_IS_FLOAT) {                     \
+      g##iter = GEMV_LOADPACKET_COL(iter);   \
+      EIGEN_UNUSED_VARIABLE(b##iter);        \
+    } else {                                 \
+      GEMV_LOADPAIR_COL_MMA(iter, iter << 1) \
+      EIGEN_UNUSED_VARIABLE(g##iter);        \
+    }                                        \
+  } else {                                   \
+    EIGEN_UNUSED_VARIABLE(b##iter);          \
+    EIGEN_UNUSED_VARIABLE(g##iter);          \
+  }
+
+#define GEMV_WORK1A_COL_MMA(iter, N)                                      \
+  if (GEMV_GETN(N) > iter) {                                              \
+    if (GEMV_IS_FLOAT) {                                                  \
+      pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter, a0, g##iter); \
+    } else {                                                              \
+      pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter, b##iter, a0); \
+    }                                                                     \
+  }
+
+#define GEMV_LOAD1B_COL_MMA(iter1, iter2, iter3, N) \
+  if (GEMV_GETN(N) > iter1) {                       \
+    if (GEMV_IS_FLOAT) {                            \
+      GEMV_LOADPAIR_COL_MMA(iter2, iter2)           \
+      EIGEN_UNUSED_VARIABLE(b##iter3);              \
+    } else {                                        \
+      GEMV_LOADPAIR_COL_MMA(iter2, iter2 << 1)      \
+      GEMV_LOADPAIR_COL_MMA(iter3, iter3 << 1)      \
+    }                                               \
+  } else {                                          \
+    EIGEN_UNUSED_VARIABLE(b##iter2);                \
+    EIGEN_UNUSED_VARIABLE(b##iter3);                \
+  }                                                 \
+  EIGEN_UNUSED_VARIABLE(g##iter2);                  \
+  EIGEN_UNUSED_VARIABLE(g##iter3);
+
+#define GEMV_WORK1B_COL_MMA(iter1, iter2, iter3, N)                          \
+  if (GEMV_GETN(N) > iter1) {                                                \
+    if (GEMV_IS_FLOAT) {                                                     \
+      LhsPacket h[2];                                                        \
+      __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(h), &b##iter2); \
+      pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter2, a0, h[0]);      \
+      pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter3, a0, h[1]);      \
+    } else {                                                                 \
+      pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter2, b##iter2, a0);  \
+      pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&e##iter3, b##iter3, a0);  \
+    }                                                                        \
+  }
+
+#if EIGEN_COMP_LLVM
+#define GEMV_LOAD_COL_MMA(N)                        \
+  if (GEMV_GETN(N) > 1) {                           \
+    GEMV_UNROLL_HALF(GEMV_LOAD1B_COL_MMA, (N >> 1)) \
+  } else {                                          \
+    GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N)             \
+  }
+
+#define GEMV_WORK_COL_MMA(N)                        \
+  if (GEMV_GETN(N) > 1) {                           \
+    GEMV_UNROLL_HALF(GEMV_WORK1B_COL_MMA, (N >> 1)) \
+  } else {                                          \
+    GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N)             \
+  }
+#else
+#define GEMV_LOAD_COL_MMA(N) GEMV_UNROLL(GEMV_LOAD1A_COL_MMA, N)
+
+#define GEMV_WORK_COL_MMA(N) GEMV_UNROLL(GEMV_WORK1A_COL_MMA, N)
+#endif
+
+#define GEMV_DISASSEMBLE_MMA(iter, N)                              \
+  if (GEMV_GETN(N) > iter) {                                       \
+    __builtin_mma_disassemble_acc(&result##iter.packet, &e##iter); \
+    if (!GEMV_IS_FLOAT) {                                          \
+      result##iter.packet[0][1] = result##iter.packet[1][0];       \
+      result##iter.packet[2][1] = result##iter.packet[3][0];       \
+    }                                                              \
+  }
+
+#define GEMV_LOADPAIR2_COL_MMA(iter1, iter2) \
+  b##iter1 = *reinterpret_cast<__vector_pair*>(res + i + ((iter2) * ResPacketSize));
+
+#define GEMV_LOAD2_COL_MMA(iter1, iter2, iter3, N) \
+  if (GEMV_GETN(N) > iter1) {                      \
+    if (GEMV_IS_FLOAT) {                           \
+      GEMV_LOADPAIR2_COL_MMA(iter2, iter2);        \
+      EIGEN_UNUSED_VARIABLE(b##iter3);             \
+    } else {                                       \
+      GEMV_LOADPAIR2_COL_MMA(iter2, iter2 << 1);   \
+      GEMV_LOADPAIR2_COL_MMA(iter3, iter3 << 1);   \
+    }                                              \
+  } else {                                         \
+    EIGEN_UNUSED_VARIABLE(b##iter2);               \
+    EIGEN_UNUSED_VARIABLE(b##iter3);               \
+  }
+
+#if EIGEN_COMP_LLVM
+#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4)                                         \
+  ResPacket f##iter2[2];                                                                    \
+  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(f##iter2), &b##iter2);             \
+  f##iter2[0] = pmadd(result##iter2.packet[0], palpha, f##iter2[0]);                        \
+  f##iter2[1] = pmadd(result##iter3.packet[(iter2 == iter3) ? 2 : 0], palpha, f##iter2[1]); \
+  GEMV_BUILDPAIR_MMA(b##iter2, f##iter2[0], f##iter2[1]);
+#else
+#define GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter4)                                        \
+  if (GEMV_IS_FLOAT) {                                                                     \
+    __asm__("xvmaddasp %0,%x1,%x3\n\txvmaddasp %L0,%x2,%x3"                                \
+            : "+&d"(b##iter2)                                                              \
+            : "wa"(result##iter3.packet[0]), "wa"(result##iter2.packet[0]), "wa"(palpha)); \
+  } else {                                                                                 \
+    __asm__("xvmaddadp %0,%x1,%x3\n\txvmaddadp %L0,%x2,%x3"                                \
+            : "+&d"(b##iter2)                                                              \
+            : "wa"(result##iter2.packet[2]), "wa"(result##iter2.packet[0]), "wa"(palpha)); \
+  }
+#endif
+
+#define GEMV_WORK2_COL_MMA(iter1, iter2, iter3, N)      \
+  if (GEMV_GETN(N) > iter1) {                           \
+    if (GEMV_IS_FLOAT) {                                \
+      GEMV_WORKPAIR2_COL_MMA(iter2, iter3, iter2);      \
+    } else {                                            \
+      GEMV_WORKPAIR2_COL_MMA(iter2, iter2, iter2 << 1); \
+      GEMV_WORKPAIR2_COL_MMA(iter3, iter3, iter3 << 1); \
+    }                                                   \
+  }
+
+#define GEMV_STOREPAIR2_COL_MMA(iter1, iter2) \
+  *reinterpret_cast<__vector_pair*>(res + i + ((iter2) * ResPacketSize)) = b##iter1;
+
+#define GEMV_STORE_COL_MMA(iter, N)                                                                          \
+  if (GEMV_GETN(N) > iter) {                                                                                 \
+    if (GEMV_IS_FLOAT) {                                                                                     \
+      storeMaddData<ResPacket, ResScalar>(res + i + (iter * ResPacketSize), palpha, result##iter.packet[0]); \
+    } else {                                                                                                 \
+      GEMV_LOADPAIR2_COL_MMA(iter, iter << 1)                                                                \
+      GEMV_WORKPAIR2_COL_MMA(iter, iter, iter << 1)                                                          \
+      GEMV_STOREPAIR2_COL_MMA(iter, iter << 1)                                                               \
+    }                                                                                                        \
+  }
+
+#define GEMV_STORE2_COL_MMA(iter1, iter2, iter3, N) \
+  if (GEMV_GETN(N) > iter1) {                       \
+    if (GEMV_IS_FLOAT) {                            \
+      GEMV_STOREPAIR2_COL_MMA(iter2, iter2);        \
+    } else {                                        \
+      GEMV_STOREPAIR2_COL_MMA(iter2, iter2 << 1)    \
+      GEMV_STOREPAIR2_COL_MMA(iter3, iter3 << 1)    \
+    }                                               \
+  }
+
+#define GEMV_PROCESS_COL_ONE_MMA(N)                 \
+  GEMV_UNROLL(GEMV_INIT_MMA, N)                     \
+  Index j = j2;                                     \
+  __vector_pair b0, b1, b2, b3, b4, b5, b6, b7;     \
+  do {                                              \
+    LhsPacket g0, g1, g2, g3, g4, g5, g6, g7;       \
+    RhsPacket a0 = pset1<RhsPacket>(rhs2(j, 0));    \
+    GEMV_UNROLL(GEMV_PREFETCH, N)                   \
+    GEMV_LOAD_COL_MMA(N)                            \
+    GEMV_WORK_COL_MMA(N)                            \
+  } while (++j < jend);                             \
+  GEMV_UNROLL(GEMV_DISASSEMBLE_MMA, N)              \
+  if (GEMV_GETN(N) <= 1) {                          \
+    GEMV_UNROLL(GEMV_STORE_COL_MMA, N)              \
+  } else {                                          \
+    GEMV_UNROLL_HALF(GEMV_LOAD2_COL_MMA, (N >> 1))  \
+    GEMV_UNROLL_HALF(GEMV_WORK2_COL_MMA, (N >> 1))  \
+    GEMV_UNROLL_HALF(GEMV_STORE2_COL_MMA, (N >> 1)) \
+  }                                                 \
+  i += (ResPacketSize * N);
+#endif
+
+#define GEMV_INIT(iter, N)                    \
+  if (N > iter) {                             \
+    c##iter = pset1<ResPacket>(ResScalar(0)); \
+  } else {                                    \
+    EIGEN_UNUSED_VARIABLE(c##iter);           \
+  }
+
+#ifdef EIGEN_POWER_USE_GEMV_PREFETCH
+#define GEMV_PREFETCH(iter, N)                                   \
+  if (GEMV_GETN(N) > ((iter >> 1) + ((N >> 1) * (iter & 1)))) {  \
+    lhs.prefetch(i + (iter * LhsPacketSize) + prefetch_dist, j); \
+  }
+#else
+#define GEMV_PREFETCH(iter, N)
+#endif
+
+#define GEMV_WORK_COL(iter, N)                                   \
+  if (N > iter) {                                                \
+    c##iter = pcj.pmadd(GEMV_LOADPACKET_COL(iter), a0, c##iter); \
+  }
+
+#define GEMV_STORE_COL(iter, N)                                                           \
+  if (N > iter) {                                                                         \
+    pstoreu(res + i + (iter * ResPacketSize),                                             \
+            pmadd(c##iter, palpha, ploadu<ResPacket>(res + i + (iter * ResPacketSize)))); \
+  }
+
+/** \internal main macro for gemv_col - initialize accumulators, multiply and add inputs, and store results */
+#define GEMV_PROCESS_COL_ONE(N)                  \
+  GEMV_UNROLL(GEMV_INIT, N)                      \
+  Index j = j2;                                  \
+  do {                                           \
+    RhsPacket a0 = pset1<RhsPacket>(rhs2(j, 0)); \
+    GEMV_UNROLL(GEMV_PREFETCH, N)                \
+    GEMV_UNROLL(GEMV_WORK_COL, N)                \
+  } while (++j < jend);                          \
+  GEMV_UNROLL(GEMV_STORE_COL, N)                 \
+  i += (ResPacketSize * N);
+
+#ifdef USE_GEMV_MMA
+#define GEMV_PROCESS_COL(N) GEMV_PROCESS_COL_ONE_MMA(N)
+#else
+#define GEMV_PROCESS_COL(N) GEMV_PROCESS_COL_ONE(N)
+#endif
+
+/** \internal perform a matrix multiply and accumulate of packet a and packet b */
+#ifdef USE_GEMV_MMA
+template <typename LhsPacket, typename RhsPacket, bool accumulate>
+EIGEN_ALWAYS_INLINE void pger_vecMMA_acc(__vector_quad* acc, const RhsPacket& a, const LhsPacket& b) {
+  if (accumulate) {
+    __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+  } else {
+    __builtin_mma_xvf32ger(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+  }
+}
+
+/** \internal perform a matrix multiply and accumulate of vector_pair a and packet b */
+template <typename LhsPacket, typename RhsPacket, bool accumulate>
+EIGEN_ALWAYS_INLINE void pger_vecMMA_acc(__vector_quad* acc, __vector_pair& a, const LhsPacket& b) {
+  if (accumulate) {
+    __builtin_mma_xvf64gerpp(acc, a, (__vector unsigned char)b);
+  } else {
+    __builtin_mma_xvf64ger(acc, a, (__vector unsigned char)b);
+  }
+}
+#endif
+
+template <typename LhsScalar, typename LhsMapper, typename RhsScalar, typename RhsMapper, typename ResScalar>
+EIGEN_STRONG_INLINE void gemv_col(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs, ResScalar* res,
+                                  Index resIncr, ResScalar alpha) {
+  typedef gemv_traits<LhsScalar, RhsScalar> Traits;
+
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
+
+  EIGEN_UNUSED_VARIABLE(resIncr);
+  eigen_internal_assert(resIncr == 1);
+
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate proper code.
+  LhsMapper lhs(alhs);
+  RhsMapper rhs2(rhs);
+
+  conj_helper<LhsScalar, RhsScalar, false, false> cj;
+  conj_helper<LhsPacket, RhsPacket, false, false> pcj;
+
+  const Index lhsStride = lhs.stride();
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum {
+    LhsAlignment = Unaligned,
+    ResPacketSize = Traits::ResPacketSize,
+    LhsPacketSize = Traits::LhsPacketSize,
+    RhsPacketSize = Traits::RhsPacketSize,
+  };
+
+#ifndef GCC_ONE_VECTORPAIR_BUG
+  const Index n8 = rows - 8 * ResPacketSize + 1;
+  const Index n4 = rows - 4 * ResPacketSize + 1;
+  const Index n2 = rows - 2 * ResPacketSize + 1;
+#endif
+  const Index n1 = rows - 1 * ResPacketSize + 1;
+#ifdef EIGEN_POWER_USE_GEMV_PREFETCH
+  const Index prefetch_dist = 64 * LhsPacketSize;
+#endif
+
+  // TODO: improve the following heuristic:
+  const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 16000 ? 16 : 8);
+  ResPacket palpha = pset1<ResPacket>(alpha);
+
+  for (Index j2 = 0; j2 < cols; j2 += block_cols) {
+    Index jend = numext::mini(j2 + block_cols, cols);
+    Index i = 0;
+    ResPacket c0, c1, c2, c3, c4, c5, c6, c7;
+#ifdef USE_GEMV_MMA
+    __vector_quad e0, e1, e2, e3, e4, e5, e6, e7;
+    PacketBlock<ResPacket, 4> result0, result1, result2, result3, result4, result5, result6, result7;
+    GEMV_UNUSED(8, e)
+    GEMV_UNUSED(8, result)
+    GEMV_UNUSED_EXTRA(1, c)
+#endif
+#ifndef GCC_ONE_VECTORPAIR_BUG
+    while (i < n8) {
+      GEMV_PROCESS_COL(8)
+    }
+    if (i < n4) {
+      GEMV_PROCESS_COL(4)
+    }
+    if (i < n2) {
+      GEMV_PROCESS_COL(2)
+    }
+    if (i < n1)
+#else
+    while (i < n1)
+#endif
+    {
+      GEMV_PROCESS_COL_ONE(1)
+    }
+    for (; i < rows; ++i) {
+      ResScalar d0(0);
+      Index j = j2;
+      do {
+        d0 += cj.pmul(lhs(i, j), rhs2(j, 0));
+      } while (++j < jend);
+      res[i] += alpha * d0;
+    }
+  }
+}
+
+template <bool extraRows>
+EIGEN_ALWAYS_INLINE void outputVecCol(Packet4f acc, float* result, Packet4f pAlpha, Index extra_rows) {
+  Packet4f d0 = ploadu<Packet4f>(result);
+  d0 = pmadd(acc, pAlpha, d0);
+  if (extraRows) {
+    pstoreu_partial(result, d0, extra_rows);
+  } else {
+    pstoreu(result, d0);
+  }
+}
+
+template <Index num_acc, bool extraRows, Index size>
+EIGEN_ALWAYS_INLINE void outputVecColResults(Packet4f (&acc)[num_acc][size], float* result, Packet4f pAlpha,
+                                             Index extra_rows) {
+  constexpr Index real_acc = (num_acc - (extraRows ? 1 : 0));
+  for (Index k = 0; k < real_acc; k++) {
+    outputVecCol<false>(acc[k][0], result + k * 4, pAlpha, extra_rows);
+  }
+  if (extraRows) {
+    outputVecCol<true>(acc[real_acc][0], result + real_acc * 4, pAlpha, extra_rows);
+  }
+}
+
+static Packet16uc p16uc_MERGE16_32_V1 = {0, 1, 16, 17, 0, 1, 16, 17, 0, 1, 16, 17, 0, 1, 16, 17};
+static Packet16uc p16uc_MERGE16_32_V2 = {2, 3, 18, 19, 2, 3, 18, 19, 2, 3, 18, 19, 2, 3, 18, 19};
+
+template <Index num_acc, typename LhsMapper, bool zero>
+EIGEN_ALWAYS_INLINE void loadVecLoopVSX(Index k, LhsMapper& lhs, Packet4f (&a0)[num_acc][2]) {
+  Packet8bf c0 = lhs.template loadPacket<Packet8bf>(k * 4, 0);
+  Packet8bf b1;
+  if (!zero) {
+    b1 = lhs.template loadPacket<Packet8bf>(k * 4, 1);
+
+    a0[k + 0][1] = oneConvertBF16Hi(b1.m_val);
+  }
+  a0[k + 0][0] = oneConvertBF16Hi(c0.m_val);
+
+  if (num_acc > (k + 1)) {
+    a0[k + 1][0] = oneConvertBF16Lo(c0.m_val);
+    if (!zero) {
+      a0[k + 1][1] = oneConvertBF16Lo(b1.m_val);
+    }
+  }
+}
+
+template <Index num_acc, bool zero>
+EIGEN_ALWAYS_INLINE void multVecVSX(Packet4f (&acc)[num_acc][2], Packet4f (&a0)[num_acc][2], Packet4f (&b0)[2]) {
+  for (Index k = 0; k < num_acc; k++) {
+    for (Index i = 0; i < (zero ? 1 : 2); i++) {
+      acc[k][i] = pmadd(b0[i], a0[k][i], acc[k][i]);
+    }
+  }
+}
+
+template <typename RhsMapper, bool linear>
+struct loadColData_impl {
+  // linear == false
+  static EIGEN_ALWAYS_INLINE Packet8bf run(RhsMapper& rhs, Index j) {
+    const Index n = unpacket_traits<Packet8bf>::size;
+    EIGEN_ALIGN16 bfloat16 to[n];
+    LOAD_STORE_UNROLL_16
+    for (Index i = 0; i < n; i++) {
+      to[i] = rhs(j + i, 0);
+    }
+    return pload<Packet8bf>(to);
+  }
+};
+
+template <typename RhsMapper>
+struct loadColData_impl<RhsMapper, true> {
+  // linear == true
+  static EIGEN_ALWAYS_INLINE Packet8bf run(RhsMapper& rhs, Index j) {
+    return rhs.template loadPacket<Packet8bf>(j + 0, 0);
+  }
+};
+
+template <typename RhsMapper, bool linear>
+EIGEN_ALWAYS_INLINE Packet8bf loadColData(RhsMapper& rhs, Index j) {
+  return loadColData_impl<RhsMapper, linear>::run(rhs, j);
+}
+
+template <Index num_acc, typename LhsMapper, typename RhsMapper, bool zero, bool linear>
+EIGEN_ALWAYS_INLINE void vecColLoopVSX(Index j, LhsMapper& lhs, RhsMapper& rhs, Packet4f (&acc)[num_acc][2]) {
+  Packet4f a0[num_acc][2], b0[2];
+  Packet8bf b2 = loadColData<RhsMapper, linear>(rhs, j);
+
+  b0[0] = oneConvertBF16Perm(b2.m_val, p16uc_MERGE16_32_V1);
+  if (!zero) {
+    b0[1] = oneConvertBF16Perm(b2.m_val, p16uc_MERGE16_32_V2);
+  }
+
+  using LhsSubMapper = typename LhsMapper::SubMapper;
+
+  LhsSubMapper lhs2 = lhs.getSubMapper(0, j);
+  for (Index k = 0; k < num_acc; k += 2) {
+    loadVecLoopVSX<num_acc, LhsSubMapper, zero>(k, lhs2, a0);
+  }
+
+  multVecVSX<num_acc, zero>(acc, a0, b0);
+}
+
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void addResultsVSX(Packet4f (&acc)[num_acc][2]) {
+  for (Index i = 0; i < num_acc; i++) {
+    acc[i][0] = acc[i][0] + acc[i][1];
+  }
+}
+
+// Uses 2X the accumulators or 4X the number of VSX registers
+#define MAX_BFLOAT16_VEC_ACC_VSX 8
+
+template <const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
+void colVSXVecColLoopBody(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+                          float* result) {
+  constexpr Index step = (num_acc * 4);
+  const Index extra_rows = (extraRows) ? (rows & 3) : 0;
+  constexpr bool multiIters = !extraRows && (num_acc == MAX_BFLOAT16_VEC_ACC_VSX);
+
+  do {
+    Packet4f acc[num_acc][2];
+
+    zeroAccumulators<num_acc, 2>(acc);
+
+    using LhsSubMapper = typename LhsMapper::SubMapper;
+
+    LhsSubMapper lhs2 = lhs.getSubMapper(row, 0);
+    for (Index j = 0; j + 2 <= cend; j += 2) {
+      vecColLoopVSX<num_acc, LhsSubMapper, RhsMapper, false, linear>(j, lhs2, rhs, acc);
+    }
+    if (cend & 1) {
+      vecColLoopVSX<num_acc, LhsSubMapper, RhsMapper, true, linear>(cend - 1, lhs2, rhs, acc);
+    }
+
+    addResultsVSX<num_acc>(acc);
+
+    outputVecColResults<num_acc, extraRows, 2>(acc, result, pAlpha, extra_rows);
+
+    result += step;
+  } while (multiIters && (step <= rows - (row += step)));
+}
+
+template <const Index num_acc, typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
+EIGEN_ALWAYS_INLINE void colVSXVecColLoopBodyExtraN(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+                                                    const Packet4f pAlpha, float* result) {
+  if (MAX_BFLOAT16_VEC_ACC_VSX > num_acc) {
+    colVSXVecColLoopBody<num_acc + (extraRows ? 1 : 0), LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs,
+                                                                                                 rhs, pAlpha, result);
+  }
+}
+
+template <typename LhsMapper, typename RhsMapper, bool extraRows, bool linear>
+EIGEN_ALWAYS_INLINE void colVSXVecColLoopBodyExtra(Index& row, Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+                                                   const Packet4f pAlpha, float* result) {
+  switch ((rows - row) >> 2) {
+    case 7:
+      colVSXVecColLoopBodyExtraN<7, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 6:
+      colVSXVecColLoopBodyExtraN<6, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 5:
+      colVSXVecColLoopBodyExtraN<5, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 4:
+      colVSXVecColLoopBodyExtraN<4, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 3:
+      colVSXVecColLoopBodyExtraN<3, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 2:
+      colVSXVecColLoopBodyExtraN<2, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 1:
+      colVSXVecColLoopBodyExtraN<1, LhsMapper, RhsMapper, extraRows, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      break;
+    default:
+      if (extraRows) {
+        colVSXVecColLoopBody<1, LhsMapper, RhsMapper, true, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+      }
+      break;
+  }
+}
+
+template <typename LhsMapper, typename RhsMapper, bool linear>
+EIGEN_ALWAYS_INLINE void calcVSXVecColLoops(Index cend, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+                                            const Packet4f pAlpha, float* result) {
+  Index row = 0;
+  if (rows >= (MAX_BFLOAT16_VEC_ACC_VSX * 4)) {
+    colVSXVecColLoopBody<MAX_BFLOAT16_VEC_ACC_VSX, LhsMapper, RhsMapper, false, linear>(row, cend, rows, lhs, rhs,
+                                                                                        pAlpha, result);
+    result += row;
+  }
+  if (rows & 3) {
+    colVSXVecColLoopBodyExtra<LhsMapper, RhsMapper, true, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+  } else {
+    colVSXVecColLoopBodyExtra<LhsMapper, RhsMapper, false, linear>(row, cend, rows, lhs, rhs, pAlpha, result);
+  }
+}
+
+template <const Index size, bool inc, Index delta>
+EIGEN_ALWAYS_INLINE void storeBF16fromResult(bfloat16* dst, Packet8bf data, Index resInc, Index extra) {
+  if (inc) {
+    if (size < 8) {
+      pscatter_partial(dst + delta * resInc, data, resInc, extra);
+    } else {
+      pscatter(dst + delta * resInc, data, resInc);
+    }
+  } else {
+    if (size < 8) {
+      pstoreu_partial(dst + delta, data, extra);
+    } else {
+      pstoreu(dst + delta, data);
+    }
+  }
+}
+
+template <const Index size, bool inc = false>
+EIGEN_ALWAYS_INLINE void convertPointerF32toBF16VSX(Index& i, float* result, Index rows, bfloat16*& dst,
+                                                    Index resInc = 1) {
+  constexpr Index extra = ((size < 8) ? 8 : size);
+  while (i + size <= rows) {
+    PacketBlock<Packet8bf, (size + 7) / 8> r32;
+    r32.packet[0] = convertF32toBF16VSX(result + i + 0);
+    if (size >= 16) {
+      r32.packet[1] = convertF32toBF16VSX(result + i + 8);
+    }
+    if (size >= 32) {
+      r32.packet[2] = convertF32toBF16VSX(result + i + 16);
+      r32.packet[3] = convertF32toBF16VSX(result + i + 24);
+    }
+    storeBF16fromResult<size, inc, 0>(dst, r32.packet[0], resInc, rows & 7);
+    if (size >= 16) {
+      storeBF16fromResult<size, inc, 8>(dst, r32.packet[1], resInc);
+    }
+    if (size >= 32) {
+      storeBF16fromResult<size, inc, 16>(dst, r32.packet[2], resInc);
+      storeBF16fromResult<size, inc, 24>(dst, r32.packet[3], resInc);
+    }
+    i += extra;
+    dst += extra * resInc;
+    if (size != 32) break;
+  }
+}
+
+template <bool inc = false>
+EIGEN_ALWAYS_INLINE void convertArrayPointerF32toBF16VSX(float* result, Index rows, bfloat16* dst, Index resInc = 1) {
+  Index i = 0;
+  convertPointerF32toBF16VSX<32, inc>(i, result, rows, dst, resInc);
+  convertPointerF32toBF16VSX<16, inc>(i, result, rows, dst, resInc);
+  convertPointerF32toBF16VSX<8, inc>(i, result, rows, dst, resInc);
+  convertPointerF32toBF16VSX<1, inc>(i, result, rows, dst, resInc);
+}
+
+template <typename RhsMapper, typename LhsMapper, typename = void>
+struct UseStride : std::false_type {
+  static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha,
+                                      float* result) {
+    using RhsSubMapper = typename RhsMapper::SubMapper;
+
+    RhsSubMapper rhs2 = rhs.getSubMapper(j2, 0);
+    calcVSXVecColLoops<LhsMapper, RhsSubMapper, false>(jend - j2, rows, lhs, rhs2, pAlpha, result);
+  }
+};
+
+template <typename RhsMapper, typename LhsMapper>
+struct UseStride<RhsMapper, LhsMapper,
+                 std::enable_if_t<std::is_member_function_pointer<decltype(&RhsMapper::stride)>::value>>
+    : std::true_type {
+  static EIGEN_ALWAYS_INLINE void run(Index j2, Index jend, Index rows, LhsMapper& lhs, RhsMapper& rhs, Packet4f pAlpha,
+                                      float* result) {
+    using RhsSubMapper = typename RhsMapper::SubMapper;
+
+    RhsSubMapper rhs2 = rhs.getSubMapper(j2, 0);
+    if (rhs.stride() == 1) {
+      calcVSXVecColLoops<LhsMapper, RhsSubMapper, true>(jend - j2, rows, lhs, rhs2, pAlpha, result);
+    } else {
+      calcVSXVecColLoops<LhsMapper, RhsSubMapper, false>(jend - j2, rows, lhs, rhs2, pAlpha, result);
+    }
+  }
+};
+
+template <typename LhsMapper, typename RhsMapper>
+void gemv_bfloat16_col(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs, bfloat16* res,
+                       Index resIncr, bfloat16 alpha) {
+  EIGEN_UNUSED_VARIABLE(resIncr);
+  eigen_internal_assert(resIncr == 1);
+
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate proper code.
+  LhsMapper lhs(alhs);
+  RhsMapper rhs2(rhs);
+
+  const Index lhsStride = lhs.stride();
+
+  // TODO: improve the following heuristic:
+  const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(bfloat16) < 16000 ? 16 : 8);
+  float falpha = Eigen::bfloat16_impl::bfloat16_to_float(alpha);
+  Packet4f pAlpha = pset1<Packet4f>(falpha);
+
+  ei_declare_aligned_stack_constructed_variable(float, result, rows, 0);
+
+  convertArrayPointerBF16toF32(result, 1, rows, res);
+
+  for (Index j2 = 0; j2 < cols; j2 += block_cols) {
+    Index jend = numext::mini(j2 + block_cols, cols);
+
+    using LhsSubMapper = typename LhsMapper::SubMapper;
+
+    LhsSubMapper lhs2 = lhs.getSubMapper(0, j2);
+    UseStride<RhsMapper, LhsSubMapper>::run(j2, jend, rows, lhs2, rhs2, pAlpha, result);
+  }
+
+  convertArrayPointerF32toBF16VSX(result, rows, res);
+}
+
+template <Index num_acc, Index size>
+EIGEN_ALWAYS_INLINE void outputVecResults(Packet4f (&acc)[num_acc][size], float* result, Packet4f pAlpha) {
+  constexpr Index extra = num_acc & 3;
+
+  for (Index k = 0; k < num_acc; k += 4) {
+    Packet4f d0 = ploadu<Packet4f>(result + k);
+    d0 = pmadd(acc[k + 0][0], pAlpha, d0);
+
+    if (num_acc > (k + 3)) {
+      pstoreu(result + k, d0);
+    } else {
+      if (extra == 3) {
+        pstoreu_partial(result + k, d0, extra);
+      } else {
+        memcpy((void*)(result + k), (void*)(&d0), sizeof(float) * extra);
+      }
+    }
+  }
+}
+
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void preduxVecResults2VSX(Packet4f (&acc)[num_acc][2], Index k) {
+  if (num_acc > (k + 1)) {
+    acc[k][1] = vec_mergel(acc[k + 0][0], acc[k + 1][0]);
+    acc[k][0] = vec_mergeh(acc[k + 0][0], acc[k + 1][0]);
+    acc[k][0] = acc[k][0] + acc[k][1];
+    acc[k][0] += vec_sld(acc[k][0], acc[k][0], 8);
+  } else {
+    acc[k][0] += vec_sld(acc[k][0], acc[k][0], 8);
+#ifdef _BIG_ENDIAN
+    acc[k][0] += vec_sld(acc[k][0], acc[k][0], 12);
+#else
+    acc[k][0] += vec_sld(acc[k][0], acc[k][0], 4);
+#endif
+  }
+}
+
+template <Index num_acc>
+EIGEN_ALWAYS_INLINE void preduxVecResultsVSX(Packet4f (&acc)[num_acc][2]) {
+  for (Index k = 0; k < num_acc; k += 4) {
+    preduxVecResults2VSX<num_acc>(acc, k + 0);
+    if (num_acc > (k + 2)) {
+      preduxVecResults2VSX<num_acc>(acc, k + 2);
+#ifdef EIGEN_VECTORIZE_VSX
+      acc[k + 0][0] = reinterpret_cast<Packet4f>(
+          vec_mergeh(reinterpret_cast<Packet2ul>(acc[k + 0][0]), reinterpret_cast<Packet2ul>(acc[k + 2][0])));
+#else
+      acc[k + 0][0] = reinterpret_cast<Packet4f>(vec_perm(acc[k + 0][0], acc[k + 2][0], p16uc_TRANSPOSE64_HI));
+#endif
+    }
+  }
+}
+
+#ifndef _ARCH_PWR9
+EIGEN_ALWAYS_INLINE Packet8us loadPacketPartialZero(Packet8us data, Index extra_cols) {
+  Packet16uc shift = pset1<Packet16uc>(8 * 2 * (8 - extra_cols));
+#ifdef _BIG_ENDIAN
+  return reinterpret_cast<Packet8us>(vec_slo(vec_sro(reinterpret_cast<Packet16uc>(data), shift), shift));
+#else
+  return reinterpret_cast<Packet8us>(vec_sro(vec_slo(reinterpret_cast<Packet16uc>(data), shift), shift));
+#endif
+}
+#endif
+
+template <Index num_acc, typename LhsMapper, typename RhsMapper, bool extra>
+EIGEN_ALWAYS_INLINE void multVSXVecLoop(Packet4f (&acc)[num_acc][2], const LhsMapper& lhs, RhsMapper& rhs, Index j,
+                                        Index extra_cols) {
+  Packet4f a0[num_acc][2], b0[2];
+  Packet8bf a1, b1;
+
+  if (extra) {
+    b1 = rhs.template loadPacketPartial<Packet8bf>(j, extra_cols);
+#ifndef _ARCH_PWR9
+    b1 = loadPacketPartialZero(b1.m_val, extra_cols);
+#endif
+  } else {
+    b1 = rhs.template loadPacket<Packet8bf>(j);
+  }
+  b0[0] = oneConvertBF16Hi(b1.m_val);
+  b0[1] = oneConvertBF16Lo(b1.m_val);
+
+  const LhsMapper lhs2 = lhs.getSubMapper(0, j);
+  for (Index k = 0; k < num_acc; k++) {
+    if (extra) {
+      a1 = lhs2.template loadPacketPartial<Packet8bf>(k, 0, extra_cols);
+#ifndef _ARCH_PWR9
+      a1 = loadPacketPartialZero(a1.m_val, extra_cols);
+#endif
+    } else {
+      a1 = lhs2.template loadPacket<Packet8bf>(k, 0);
+    }
+    a0[k][0] = oneConvertBF16Hi(a1.m_val);
+    a0[k][1] = oneConvertBF16Lo(a1.m_val);
+  }
+
+  multVecVSX<num_acc, false>(acc, a0, b0);
+}
+
+template <Index num_acc, typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void vecVSXLoop(Index cols, const LhsMapper& lhs, RhsMapper& rhs, Packet4f (&acc)[num_acc][2],
+                                    Index extra_cols) {
+  Index j = 0;
+  for (; j + 8 <= cols; j += 8) {
+    multVSXVecLoop<num_acc, LhsMapper, RhsMapper, false>(acc, lhs, rhs, j, extra_cols);
+  }
+
+  if (extra_cols) {
+    multVSXVecLoop<num_acc, LhsMapper, RhsMapper, true>(acc, lhs, rhs, j, extra_cols);
+  }
+}
+
+template <const Index num_acc, typename LhsMapper, typename RhsMapper>
+void colVSXVecLoopBody(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+                       float* result) {
+  constexpr bool multiIters = (num_acc == MAX_BFLOAT16_VEC_ACC_VSX);
+  const Index extra_cols = (cols & 7);
+
+  do {
+    Packet4f acc[num_acc][2];
+
+    zeroAccumulators<num_acc, 2>(acc);
+
+    const LhsMapper lhs2 = lhs.getSubMapper(row, 0);
+    vecVSXLoop<num_acc, LhsMapper, RhsMapper>(cols, lhs2, rhs, acc, extra_cols);
+
+    addResultsVSX<num_acc>(acc);
+
+    preduxVecResultsVSX<num_acc>(acc);
+
+    outputVecResults<num_acc, 2>(acc, result, pAlpha);
+
+    result += num_acc;
+  } while (multiIters && (num_acc <= rows - (row += num_acc)));
+}
+
+template <const Index num_acc, typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void colVSXVecLoopBodyExtraN(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+                                                 const Packet4f pAlpha, float* result) {
+  if (MAX_BFLOAT16_VEC_ACC_VSX > num_acc) {
+    colVSXVecLoopBody<num_acc, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+  }
+}
+
+template <typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void colVSXVecLoopBodyExtra(Index& row, Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs,
+                                                const Packet4f pAlpha, float* result) {
+  switch (rows - row) {
+    case 7:
+      colVSXVecLoopBodyExtraN<7, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 6:
+      colVSXVecLoopBodyExtraN<6, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 5:
+      colVSXVecLoopBodyExtraN<5, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 4:
+      colVSXVecLoopBodyExtraN<4, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 3:
+      colVSXVecLoopBodyExtraN<3, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 2:
+      colVSXVecLoopBodyExtraN<2, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+    case 1:
+      colVSXVecLoopBodyExtraN<1, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+      break;
+  }
+}
+
+template <typename LhsMapper, typename RhsMapper>
+EIGEN_ALWAYS_INLINE void calcVSXVecLoops(Index cols, Index rows, LhsMapper& lhs, RhsMapper& rhs, const Packet4f pAlpha,
+                                         float* result) {
+  Index row = 0;
+  if (rows >= MAX_BFLOAT16_VEC_ACC_VSX) {
+    colVSXVecLoopBody<MAX_BFLOAT16_VEC_ACC_VSX, LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+    result += row;
+  }
+  colVSXVecLoopBodyExtra<LhsMapper, RhsMapper>(row, cols, rows, lhs, rhs, pAlpha, result);
+}
+
+template <typename LhsMapper, typename RhsMapper>
+EIGEN_STRONG_INLINE void gemv_bfloat16_row(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
+                                           bfloat16* res, Index resIncr, bfloat16 alpha) {
+  typedef typename RhsMapper::LinearMapper LinearMapper;
+
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate proper code.
+  LhsMapper lhs(alhs);
+  LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
+
+  eigen_internal_assert(rhs.stride() == 1);
+
+  float falpha = Eigen::bfloat16_impl::bfloat16_to_float(alpha);
+  const Packet4f pAlpha = pset1<Packet4f>(falpha);
+
+  ei_declare_aligned_stack_constructed_variable(float, result, rows, 0);
+  if (resIncr == 1) {
+    convertArrayPointerBF16toF32(result, 1, rows, res);
+  } else {
+    convertArrayPointerBF16toF32<true>(result, 1, rows, res, resIncr);
+  }
+  calcVSXVecLoops<LhsMapper, LinearMapper>(cols, rows, lhs, rhs2, pAlpha, result);
+  if (resIncr == 1) {
+    convertArrayPointerF32toBF16VSX(result, rows, res);
+  } else {
+    convertArrayPointerF32toBF16VSX<true>(result, rows, res, resIncr);
+  }
+}
+
+#undef MAX_BFLOAT16_VEC_ACC_VSX
+
+const Packet16uc p16uc_COMPLEX32_XORFLIP = {0x44, 0x55, 0x66, 0x77, 0x00, 0x11, 0x22, 0x33,
+                                            0xcc, 0xdd, 0xee, 0xff, 0x88, 0x99, 0xaa, 0xbb};
+const Packet16uc p16uc_COMPLEX64_XORFLIP = {0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
+                                            0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77};
+
+#ifdef _BIG_ENDIAN
+const Packet16uc p16uc_COMPLEX32_CONJ_XOR = {0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
+                                             0x00, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX64_CONJ_XOR = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                             0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = {0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                              0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = {0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                              0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX32_NEGATE = {0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00,
+                                           0x80, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX64_NEGATE = {0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                           0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+#else
+const Packet16uc p16uc_COMPLEX32_CONJ_XOR = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
+                                             0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80};
+const Packet16uc p16uc_COMPLEX64_CONJ_XOR = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+                                             0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80};
+const Packet16uc p16uc_COMPLEX32_CONJ_XOR2 = {0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00,
+                                              0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX64_CONJ_XOR2 = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
+                                              0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
+const Packet16uc p16uc_COMPLEX32_NEGATE = {0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80,
+                                           0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x80};
+const Packet16uc p16uc_COMPLEX64_NEGATE = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
+                                           0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80};
+#endif
+
+#ifdef _BIG_ENDIAN
+#define COMPLEX_DELTA 0
+#else
+#define COMPLEX_DELTA 2
+#endif
+
+/** \internal packet conjugate (same as pconj but uses the constants in pcplxflipconj for better code generation) */
+EIGEN_ALWAYS_INLINE Packet2cf pconj2(const Packet2cf& a) {
+  return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_CONJ_XOR)));
+}
+
+EIGEN_ALWAYS_INLINE Packet1cd pconj2(const Packet1cd& a) {
+  return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p16uc_COMPLEX64_CONJ_XOR)));
+}
+
+/** \internal packet conjugate with real & imaginary operation inverted */
+EIGEN_ALWAYS_INLINE Packet2cf pconjinv(const Packet2cf& a) {
+#ifdef __POWER8_VECTOR__
+  return Packet2cf(Packet4f(vec_neg(Packet2d(a.v))));
+#else
+  return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_CONJ_XOR2)));
+#endif
+}
+
+EIGEN_ALWAYS_INLINE Packet1cd pconjinv(const Packet1cd& a) {
+  return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p16uc_COMPLEX64_CONJ_XOR2)));
+}
+
+#if defined(_ARCH_PWR8) && (!EIGEN_COMP_LLVM || __clang_major__ >= 12)
+#define PERMXOR_GOOD  // Clang had a bug with vec_permxor and endianness prior to version 12
+#endif
+
+/** \internal flip the real & imaginary results and packet conjugate */
+EIGEN_ALWAYS_INLINE Packet2cf pcplxflipconj(Packet2cf a) {
+#ifdef PERMXOR_GOOD
+  return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR, p16uc_COMPLEX32_XORFLIP)));
+#else
+  return pcplxflip(pconj2(a));
+#endif
+}
+
+EIGEN_ALWAYS_INLINE Packet1cd pcplxflipconj(Packet1cd a) {
+#ifdef PERMXOR_GOOD
+  return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR, p16uc_COMPLEX64_XORFLIP)));
+#else
+  return pcplxflip(pconj2(a));
+#endif
+}
+
+/** \internal packet conjugate and flip the real & imaginary results */
+EIGEN_ALWAYS_INLINE Packet2cf pcplxconjflip(Packet2cf a) {
+#ifdef PERMXOR_GOOD
+  return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_CONJ_XOR2, p16uc_COMPLEX32_XORFLIP)));
+#else
+  return pconj2(pcplxflip(a));
+#endif
+}
+
+EIGEN_ALWAYS_INLINE Packet1cd pcplxconjflip(Packet1cd a) {
+#ifdef PERMXOR_GOOD
+  return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_CONJ_XOR2, p16uc_COMPLEX64_XORFLIP)));
+#else
+  return pconj2(pcplxflip(a));
+#endif
+}
+
+/** \internal packet negate */
+EIGEN_ALWAYS_INLINE Packet2cf pnegate2(Packet2cf a) {
+#ifdef __POWER8_VECTOR__
+  return Packet2cf(vec_neg(a.v));
+#else
+  return Packet2cf(pxor(a.v, reinterpret_cast<Packet4f>(p16uc_COMPLEX32_NEGATE)));
+#endif
+}
+
+EIGEN_ALWAYS_INLINE Packet1cd pnegate2(Packet1cd a) {
+#ifdef __POWER8_VECTOR__
+  return Packet1cd(vec_neg(a.v));
+#else
+  return Packet1cd(pxor(a.v, reinterpret_cast<Packet2d>(p16uc_COMPLEX64_NEGATE)));
+#endif
+}
+
+/** \internal flip the real & imaginary results and negate */
+EIGEN_ALWAYS_INLINE Packet2cf pcplxflipnegate(Packet2cf a) {
+#ifdef PERMXOR_GOOD
+  return Packet2cf(Packet4f(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX32_NEGATE, p16uc_COMPLEX32_XORFLIP)));
+#else
+  return pcplxflip(pnegate2(a));
+#endif
+}
+
+EIGEN_ALWAYS_INLINE Packet1cd pcplxflipnegate(Packet1cd a) {
+#ifdef PERMXOR_GOOD
+  return Packet1cd(Packet2d(vec_permxor(Packet16uc(a.v), p16uc_COMPLEX64_NEGATE, p16uc_COMPLEX64_XORFLIP)));
+#else
+  return pcplxflip(pnegate2(a));
+#endif
+}
+
+/** \internal flip the real & imaginary results */
+EIGEN_ALWAYS_INLINE Packet2cf pcplxflip2(Packet2cf a) {
+  return Packet2cf(Packet4f(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX32_XORFLIP)));
+}
+
+EIGEN_ALWAYS_INLINE Packet1cd pcplxflip2(Packet1cd a) {
+#ifdef EIGEN_VECTORIZE_VSX
+  return Packet1cd(__builtin_vsx_xxpermdi(a.v, a.v, 2));
+#else
+  return Packet1cd(Packet2d(vec_perm(Packet16uc(a.v), Packet16uc(a.v), p16uc_COMPLEX64_XORFLIP)));
+#endif
+}
+
+/** \internal load half a vector with one complex value */
+EIGEN_ALWAYS_INLINE Packet4f pload_complex_half(std::complex<float>* src) {
+  Packet4f t;
+#ifdef EIGEN_VECTORIZE_VSX
+  // Load float64/two float32 (doubleword alignment)
+  __asm__("lxsdx %x0,%y1" : "=wa"(t) : "Z"(*src));
+#else
+  *reinterpret_cast<std::complex<float>*>(reinterpret_cast<float*>(&t) + COMPLEX_DELTA) = *src;
+#endif
+  return t;
+}
+
+/** \internal load two vectors from the real and imaginary portions of a complex value */
+template <typename RhsScalar>
+EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet4f& r, Packet4f& i) {
+#ifdef _ARCH_PWR9
+  __asm__("lxvwsx %x0,%y1" : "=wa"(r) : "Z"(*(reinterpret_cast<float*>(src) + 0)));
+  __asm__("lxvwsx %x0,%y1" : "=wa"(i) : "Z"(*(reinterpret_cast<float*>(src) + 1)));
+#else
+  Packet4f t = pload_complex_half(src);
+  r = vec_splat(t, COMPLEX_DELTA + 0);
+  i = vec_splat(t, COMPLEX_DELTA + 1);
+#endif
+}
+
+template <typename RhsScalar>
+EIGEN_ALWAYS_INLINE void pload_realimag(RhsScalar* src, Packet2d& r, Packet2d& i) {
+#ifdef EIGEN_VECTORIZE_VSX
+  __asm__("lxvdsx %x0,%y1" : "=wa"(r) : "Z"(*(reinterpret_cast<double*>(src) + 0)));
+  __asm__("lxvdsx %x0,%y1" : "=wa"(i) : "Z"(*(reinterpret_cast<double*>(src) + 1)));
+#else
+  Packet2d t = ploadu<Packet2d>(reinterpret_cast<double*>(src));
+  r = vec_splat(t, 0);
+  i = vec_splat(t, 1);
+#endif
+}
+
+#ifndef __POWER8_VECTOR__
+const Packet16uc p16uc_MERGEE = {0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13,
+                                 0x08, 0x09, 0x0A, 0x0B, 0x18, 0x19, 0x1A, 0x1B};
+
+const Packet16uc p16uc_MERGEO = {0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17,
+                                 0x0C, 0x0D, 0x0E, 0x0F, 0x1C, 0x1D, 0x1E, 0x1F};
+#endif
+
+/** \internal load two vectors from the interleaved real & imaginary values of src */
+template <typename RhsScalar>
+EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar* src, Packet4f& r, Packet4f& i) {
+  Packet4f t = ploadu<Packet4f>(reinterpret_cast<float*>(src));
+#ifdef __POWER8_VECTOR__
+  r = vec_mergee(t, t);
+  i = vec_mergeo(t, t);
+#else
+  r = vec_perm(t, t, p16uc_MERGEE);
+  i = vec_perm(t, t, p16uc_MERGEO);
+#endif
+}
+
+template <typename RhsScalar>
+EIGEN_ALWAYS_INLINE void pload_realimag_row(RhsScalar* src, Packet2d& r, Packet2d& i) {
+  return pload_realimag(src, r, i);
+}
+
+/** \internal load and splat a complex value into a vector - column-wise */
+EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine(std::complex<float>* src) {
+#ifdef EIGEN_VECTORIZE_VSX
+  Packet4f ret;
+  __asm__("lxvdsx %x0,%y1" : "=wa"(ret) : "Z"(*(reinterpret_cast<double*>(src) + 0)));
+  return ret;
+#else
+  return Packet4f(ploaddup<Packet2d>(reinterpret_cast<double*>(src)));
+#endif
+}
+
+EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine(std::complex<double>* src) { return ploadu<Packet1cd>(src).v; }
+
+/** \internal load a complex value into a vector - row-wise */
+EIGEN_ALWAYS_INLINE Packet4f pload_realimag_combine_row(std::complex<float>* src) { return ploadu<Packet2cf>(src).v; }
+
+EIGEN_ALWAYS_INLINE Packet2d pload_realimag_combine_row(std::complex<double>* src) { return ploadu<Packet1cd>(src).v; }
+
+/** \internal load a scalar or a vector from complex location */
+template <typename ResPacket>
+EIGEN_ALWAYS_INLINE Packet4f pload_complex(std::complex<float>* src) {
+  if (GEMV_IS_SCALAR) {
+    return pload_complex_half(src);
+  } else {
+    return ploadu<Packet4f>(reinterpret_cast<float*>(src));
+  }
+}
+
+template <typename ResPacket>
+EIGEN_ALWAYS_INLINE Packet2d pload_complex(std::complex<double>* src) {
+  return ploadu<Packet2d>(reinterpret_cast<double*>(src));
+}
+
+/** \internal load from a complex vector and convert to a real vector */
+template <typename ResPacket>
+EIGEN_ALWAYS_INLINE Packet4f pload_complex(Packet2cf* src) {
+  return src->v;
+}
+
+template <typename ResPacket>
+EIGEN_ALWAYS_INLINE Packet2d pload_complex(Packet1cd* src) {
+  return src->v;
+}
+
+/** \internal load a full vector from complex location - column-wise */
+EIGEN_ALWAYS_INLINE Packet4f pload_complex_full(std::complex<float>* src) {
+  return Packet4f(ploaddup<Packet2d>(reinterpret_cast<double*>(src)));
+}
+
+EIGEN_ALWAYS_INLINE Packet2d pload_complex_full(std::complex<double>* src) { return ploadu<Packet1cd>(src).v; }
+
+/** \internal load a full vector from complex location - row-wise */
+EIGEN_ALWAYS_INLINE Packet4f pload_complex_full_row(std::complex<float>* src) { return ploadu<Packet2cf>(src).v; }
+
+EIGEN_ALWAYS_INLINE Packet2d pload_complex_full_row(std::complex<double>* src) { return pload_complex_full(src); }
+
+/** \internal load a vector from a real-only scalar location - column-wise */
+EIGEN_ALWAYS_INLINE Packet4f pload_real(float* src) { return pset1<Packet4f>(*src); }
+
+EIGEN_ALWAYS_INLINE Packet2d pload_real(double* src) { return pset1<Packet2d>(*src); }
+
+EIGEN_ALWAYS_INLINE Packet4f pload_real(Packet4f& src) { return src; }
+
+EIGEN_ALWAYS_INLINE Packet2d pload_real(Packet2d& src) { return src; }
+
+/** \internal load a vector from a real-only vector location */
+EIGEN_ALWAYS_INLINE Packet4f pload_real_full(float* src) {
+  Packet4f ret = ploadu<Packet4f>(src);
+  return vec_mergeh(ret, ret);
+}
+
+EIGEN_ALWAYS_INLINE Packet2d pload_real_full(double* src) { return pload_real(src); }
+
+EIGEN_ALWAYS_INLINE Packet4f pload_real_full(std::complex<float>* src) {
+  return pload_complex_full(src);  // Just for compilation
+}
+
+EIGEN_ALWAYS_INLINE Packet2d pload_real_full(std::complex<double>* src) {
+  return pload_complex_full(src);  // Just for compilation
+}
+
+/** \internal load a vector from a real-only scalar location - row-wise */
+template <typename ResPacket>
+EIGEN_ALWAYS_INLINE Packet4f pload_real_row(float* src) {
+  if (GEMV_IS_SCALAR) {
+    return pload_real_full(src);
+  } else {
+    return ploadu<Packet4f>(src);
+  }
+}
+
+template <typename ResPacket>
+EIGEN_ALWAYS_INLINE Packet2d pload_real_row(double* src) {
+  return pload_real(src);
+}
+
+EIGEN_ALWAYS_INLINE Packet2cf padd(Packet2cf& a, std::complex<float>& b) {
+  EIGEN_UNUSED_VARIABLE(b);
+  return a;  // Just for compilation
+}
+
+EIGEN_ALWAYS_INLINE Packet1cd padd(Packet1cd& a, std::complex<double>& b) {
+  EIGEN_UNUSED_VARIABLE(b);
+  return a;  // Just for compilation
+}
+
+/** \internal set a scalar from complex location */
+template <typename Scalar, typename ResScalar>
+EIGEN_ALWAYS_INLINE Scalar pset1_realimag(ResScalar& alpha, int which, int conj) {
+  return (which) ? ((conj) ? -alpha.real() : alpha.real()) : ((conj) ? -alpha.imag() : alpha.imag());
+}
+
+/** \internal set a vector from complex location */
+template <typename Scalar, typename ResScalar, typename ResPacket, int which>
+EIGEN_ALWAYS_INLINE Packet2cf pset1_complex(std::complex<float>& alpha) {
+  Packet2cf ret;
+  ret.v[COMPLEX_DELTA + 0] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x01), (which & 0x04));
+  ret.v[COMPLEX_DELTA + 1] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x02), (which & 0x08));
+  ret.v[2 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 0];
+  ret.v[3 - COMPLEX_DELTA] = ret.v[COMPLEX_DELTA + 1];
+  return ret;
+}
+
+template <typename Scalar, typename ResScalar, typename ResPacket, int which>
+EIGEN_ALWAYS_INLINE Packet1cd pset1_complex(std::complex<double>& alpha) {
+  Packet1cd ret;
+  ret.v[0] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x01), (which & 0x04));
+  ret.v[1] = pset1_realimag<Scalar, ResScalar>(alpha, (which & 0x02), (which & 0x08));
+  return ret;
+}
+
+/** \internal zero out a vector for real or complex forms */
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet pset_zero() {
+  return pset1<Packet>(__UNPACK_TYPE__(Packet)(0));
+}
+
+template <>
+EIGEN_ALWAYS_INLINE Packet2cf pset_zero<Packet2cf>() {
+  return Packet2cf(pset1<Packet4f>(float(0)));
+}
+
+template <>
+EIGEN_ALWAYS_INLINE Packet1cd pset_zero<Packet1cd>() {
+  return Packet1cd(pset1<Packet2d>(double(0)));
+}
+
+/** \internal initialize a vector from another vector */
+template <typename Packet, typename LhsPacket, typename RhsPacket>
+EIGEN_ALWAYS_INLINE Packet pset_init(Packet& c1) {
+  if (GEMV_IS_COMPLEX_COMPLEX) {
+    EIGEN_UNUSED_VARIABLE(c1);
+    return pset_zero<Packet>();
+  } else {
+    return c1;  // Intentionally left uninitialized
+  }
+}
+
+template <typename PResPacket, typename ResPacket, typename ResScalar, typename Scalar>
+struct alpha_store {
+  alpha_store(ResScalar& alpha) {
+    separate.r = pset1_complex<Scalar, ResScalar, ResPacket, 0x3>(alpha);
+    separate.i = pset1_complex<Scalar, ResScalar, ResPacket, 0x0>(alpha);
+  }
+  struct ri {
+    PResPacket r;
+    PResPacket i;
+  } separate;
+};
+
+/** \internal multiply and add for complex math */
+template <typename ScalarPacket, typename AlphaData>
+EIGEN_ALWAYS_INLINE ScalarPacket pmadd_complex(ScalarPacket& c0, ScalarPacket& c2, ScalarPacket& c4, AlphaData& b0) {
+  return pmadd(c2, b0.separate.i.v, pmadd(c0, b0.separate.r.v, c4));
+}
+
+/** \internal store and madd for complex math */
+template <typename Scalar, typename ScalarPacket, typename PResPacket, typename ResPacket, typename ResScalar,
+          typename AlphaData>
+EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, AlphaData& b0, ResScalar* res) {
+  PResPacket c2 = pcplxflipconj(c0);
+  if (GEMV_IS_SCALAR) {
+    ScalarPacket c4 = ploadu<ScalarPacket>(reinterpret_cast<Scalar*>(res));
+    ScalarPacket c3 = pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0);
+    pstoreu(reinterpret_cast<Scalar*>(res), c3);
+  } else {
+    ScalarPacket c4 = pload_complex<ResPacket>(res);
+    PResPacket c3 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0));
+    pstoreu(res, c3);
+  }
+}
+
+template <typename ScalarPacket, typename PResPacket, typename ResPacket, typename ResScalar, typename AlphaData,
+          Index ResPacketSize, Index iter2>
+EIGEN_ALWAYS_INLINE void pstoreu_pmadd_complex(PResPacket& c0, PResPacket& c1, AlphaData& b0, ResScalar* res) {
+  PResPacket c2 = pcplxflipconj(c0);
+  PResPacket c3 = pcplxflipconj(c1);
+#if !defined(_ARCH_PWR10)
+  ScalarPacket c4 = pload_complex<ResPacket>(res + (iter2 * ResPacketSize));
+  ScalarPacket c5 = pload_complex<ResPacket>(res + ((iter2 + 1) * ResPacketSize));
+  PResPacket c6 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c4, b0));
+  PResPacket c7 = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c1.v, c3.v, c5, b0));
+  pstoreu(res + (iter2 * ResPacketSize), c6);
+  pstoreu(res + ((iter2 + 1) * ResPacketSize), c7);
+#else
+  __vector_pair a = *reinterpret_cast<__vector_pair*>(res + (iter2 * ResPacketSize));
+#if EIGEN_COMP_LLVM
+  PResPacket c6[2];
+  __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(c6), &a);
+  c6[0] = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c0.v, c2.v, c6[0].v, b0));
+  c6[1] = PResPacket(pmadd_complex<ScalarPacket, AlphaData>(c1.v, c3.v, c6[1].v, b0));
+  GEMV_BUILDPAIR_MMA(a, c6[0].v, c6[1].v);
+#else
+  if (GEMV_IS_COMPLEX_FLOAT) {
+    __asm__("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d"(a) : "wa"(b0.separate.r.v), "wa"(c0.v), "wa"(c1.v));
+    __asm__("xvmaddasp %L0,%x1,%x2\n\txvmaddasp %0,%x1,%x3" : "+&d"(a) : "wa"(b0.separate.i.v), "wa"(c2.v), "wa"(c3.v));
+  } else {
+    __asm__("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d"(a) : "wa"(b0.separate.r.v), "wa"(c0.v), "wa"(c1.v));
+    __asm__("xvmaddadp %L0,%x1,%x2\n\txvmaddadp %0,%x1,%x3" : "+&d"(a) : "wa"(b0.separate.i.v), "wa"(c2.v), "wa"(c3.v));
+  }
+#endif
+  *reinterpret_cast<__vector_pair*>(res + (iter2 * ResPacketSize)) = a;
+#endif
+}
+
+/** \internal load lhs packet */
+template <typename Scalar, typename LhsScalar, typename LhsMapper, typename LhsPacket>
+EIGEN_ALWAYS_INLINE LhsPacket loadLhsPacket(LhsMapper& lhs, Index i, Index j) {
+  if (sizeof(Scalar) == sizeof(LhsScalar)) {
+    const LhsScalar& src = lhs(i + 0, j);
+    return LhsPacket(pload_real_full(const_cast<LhsScalar*>(&src)));
+  }
+  return lhs.template load<LhsPacket, Unaligned>(i + 0, j);
+}
+
+/** \internal madd for complex times complex */
+template <typename ComplexPacket, typename RealPacket, bool ConjugateLhs, bool ConjugateRhs, bool Negate>
+EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_complex(RealPacket& a, RealPacket& b, RealPacket& c) {
+  if (ConjugateLhs && ConjugateRhs) {
+    return vec_madd(a, pconj2(ComplexPacket(b)).v, c);
+  } else if (Negate && !ConjugateLhs && ConjugateRhs) {
+    return vec_nmsub(a, b, c);
+  } else {
+    return vec_madd(a, b, c);
+  }
+}
+
+/** \internal madd for complex times real */
+template <typename ComplexPacket, typename RealPacket, bool Conjugate>
+EIGEN_ALWAYS_INLINE RealPacket pmadd_complex_real(RealPacket& a, RealPacket& b, RealPacket& c) {
+  if (Conjugate) {
+    return vec_madd(a, pconj2(ComplexPacket(b)).v, c);
+  } else {
+    return vec_madd(a, b, c);
+  }
+}
+
+template <typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, bool ConjugateLhs,
+          bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_generic(LhsPacket& a0, RhsScalar* b, PResPacket& c0) {
+  conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
+  RhsPacket b0;
+  if (StorageOrder == ColMajor) {
+    b0 = pset1<RhsPacket>(*b);
+  } else {
+    b0 = ploadu<RhsPacket>(b);
+  }
+  c0 = pcj.pmadd(a0, b0, c0);
+}
+
+/** \internal core multiply operation for vectors - complex times complex */
+template <typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket,
+          typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_complex_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0, ResPacket& c1) {
+  ScalarPacket br, bi;
+  if (StorageOrder == ColMajor) {
+    pload_realimag<RhsScalar>(b, br, bi);
+  } else {
+    pload_realimag_row<RhsScalar>(b, br, bi);
+  }
+  if (ConjugateLhs && !ConjugateRhs) a0 = pconj2(a0);
+  LhsPacket a1 = pcplxflipconj(a0);
+  ScalarPacket cr = pmadd_complex_complex<LhsPacket, ScalarPacket, ConjugateLhs, ConjugateRhs, false>(a0.v, br, c0.v);
+  ScalarPacket ci = pmadd_complex_complex<LhsPacket, ScalarPacket, ConjugateLhs, ConjugateRhs, true>(a1.v, bi, c1.v);
+  c1 = ResPacket(ci);
+  c0 = PResPacket(cr);
+}
+
+/** \internal core multiply operation for vectors - real times complex */
+template <typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket,
+          typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_real_complex(LhsPacket& a0, RhsScalar* b, PResPacket& c0) {
+  ScalarPacket b0;
+  if (StorageOrder == ColMajor) {
+    b0 = pload_complex_full(b);
+  } else {
+    b0 = pload_complex_full_row(b);
+  }
+  ScalarPacket cri = pmadd_complex_real<PResPacket, ScalarPacket, ConjugateRhs>(a0, b0, c0.v);
+  c0 = PResPacket(cri);
+}
+
+/** \internal core multiply operation for vectors - complex times real */
+template <typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket,
+          typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_complex_real(LhsPacket& a0, RhsScalar* b, PResPacket& c0) {
+  ScalarPacket a1 = pload_complex<ResPacket>(&a0);
+  ScalarPacket b0;
+  if (StorageOrder == ColMajor) {
+    b0 = pload_real(b);
+  } else {
+    b0 = pload_real_row<ResPacket>(b);
+  }
+  ScalarPacket cri = pmadd_complex_real<PResPacket, ScalarPacket, ConjugateLhs>(a1, b0, c0.v);
+  c0 = PResPacket(cri);
+}
+
+#define GEMV_MULT_COMPLEX_COMPLEX(LhsType, RhsType, ResType)                                                        \
+  template <typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, \
+            typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>                             \
+  EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, ResType& c1) {                   \
+    gemv_mult_complex_complex<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs,   \
+                              ConjugateRhs, StorageOrder>(a0, b, c0, c1);                                           \
+  }
+
+GEMV_MULT_COMPLEX_COMPLEX(Packet2cf, std::complex<float>, Packet2cf)
+GEMV_MULT_COMPLEX_COMPLEX(Packet1cd, std::complex<double>, Packet1cd)
+
+#define GEMV_MULT_REAL_COMPLEX(LhsType, RhsType, ResType)                                                           \
+  template <typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, \
+            typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>                             \
+  EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType& c0, RhsType&) {                      \
+    gemv_mult_real_complex<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs,      \
+                           ConjugateRhs, StorageOrder>(a0, b, c0);                                                  \
+  }
+
+GEMV_MULT_REAL_COMPLEX(float, std::complex<float>, Packet2cf)
+GEMV_MULT_REAL_COMPLEX(double, std::complex<double>, Packet1cd)
+GEMV_MULT_REAL_COMPLEX(Packet4f, std::complex<float>, Packet2cf)
+GEMV_MULT_REAL_COMPLEX(Packet2d, std::complex<double>, Packet1cd)
+
+#define GEMV_MULT_COMPLEX_REAL(LhsType, RhsType, ResType1, ResType2)                                                \
+  template <typename ScalarPacket, typename LhsPacket, typename RhsScalar, typename RhsPacket, typename PResPacket, \
+            typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>                             \
+  EIGEN_ALWAYS_INLINE void gemv_mult_complex(LhsType& a0, RhsType* b, ResType1& c0, ResType2&) {                    \
+    gemv_mult_complex_real<ScalarPacket, LhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs,      \
+                           ConjugateRhs, StorageOrder>(a0, b, c0);                                                  \
+  }
+
+GEMV_MULT_COMPLEX_REAL(Packet2cf, float, Packet2cf, std::complex<float>)
+GEMV_MULT_COMPLEX_REAL(Packet1cd, double, Packet1cd, std::complex<double>)
+GEMV_MULT_COMPLEX_REAL(std::complex<float>, float, Packet2cf, std::complex<float>)
+GEMV_MULT_COMPLEX_REAL(std::complex<double>, double, Packet1cd, std::complex<double>)
+
+#ifdef USE_GEMV_MMA
+/** \internal convert packet to real form */
+template <typename T>
+EIGEN_ALWAYS_INLINE T convertReal(T a) {
+  return a;
+}
+
+EIGEN_ALWAYS_INLINE Packet4f convertReal(Packet2cf a) { return a.v; }
+
+EIGEN_ALWAYS_INLINE Packet2d convertReal(Packet1cd a) { return a.v; }
+
+/** \internal convert packet to complex form */
+template <typename T>
+EIGEN_ALWAYS_INLINE T convertComplex(T a) {
+  return a;
+}
+
+EIGEN_ALWAYS_INLINE Packet2cf convertComplex(Packet4f a) { return Packet2cf(a); }
+
+EIGEN_ALWAYS_INLINE Packet1cd convertComplex(Packet2d a) { return Packet1cd(a); }
+
+/** \internal load a vector from a complex location (for MMA version) */
+template <typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename ResPacket>
+EIGEN_ALWAYS_INLINE void pload_complex_MMA(SLhsPacket& a) {
+  a = SLhsPacket(pload_complex<ResPacket>(&a));
+}
+
+template <typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename ResPacket>
+EIGEN_ALWAYS_INLINE void pload_complex_MMA(__vector_pair&) {
+  // Pass thru
+}
+
+/** \internal perform a matrix multiply and accumulate (positive and negative) of packet a and packet b */
+template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad* acc, RhsPacket& a, LhsPacket& b) {
+  if (NegativeAccumulate) {
+    __builtin_mma_xvf32gernp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+  } else {
+    __builtin_mma_xvf32gerpp(acc, (__vector unsigned char)a, (__vector unsigned char)b);
+  }
+}
+
+/** \internal perform a matrix multiply and accumulate (positive and negative) of vector_pair a and packet b */
+template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad* acc, __vector_pair& a, Packet2d& b) {
+  if (NegativeAccumulate) {
+    __builtin_mma_xvf64gernp(acc, (__vector_pair)a, (__vector unsigned char)b);
+  } else {
+    __builtin_mma_xvf64gerpp(acc, (__vector_pair)a, (__vector unsigned char)b);
+  }
+}
+
+template <typename LhsPacket, typename RhsPacket, bool NegativeAccumulate>
+EIGEN_ALWAYS_INLINE void pger_vecMMA(__vector_quad*, __vector_pair&, Packet4f&) {
+  // Just for compilation
+}
+
+/** \internal madd for complex times complex (MMA version) */
+template <typename RealPacket, typename LhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool Negate>
+EIGEN_ALWAYS_INLINE void pmadd_complex_complex_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c) {
+  if (ConjugateLhs && ConjugateRhs) {
+    RealPacket b2 = pconj2(convertComplex(b)).v;
+    return pger_vecMMA<RealPacket, RealPacket, false>(c, b2, a.v);
+  } else if (Negate && !ConjugateLhs && ConjugateRhs) {
+    return pger_vecMMA<RealPacket, RealPacket, true>(c, b, a.v);
+  } else {
+    return pger_vecMMA<RealPacket, RealPacket, false>(c, b, a.v);
+  }
+}
+
+template <typename RealPacket, typename LhsPacket, bool ConjugateLhs, bool ConjugateRhs, bool Negate>
+EIGEN_ALWAYS_INLINE void pmadd_complex_complex_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c) {
+  if (ConjugateLhs && ConjugateRhs) {
+    RealPacket b2 = pconj2(convertComplex(b)).v;
+    return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b2);
+  } else if (Negate && !ConjugateLhs && ConjugateRhs) {
+    return pger_vecMMA<RealPacket, __vector_pair, true>(c, a, b);
+  } else {
+    return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b);
+  }
+}
+
+/** \internal madd for complex times real (MMA version) */
+template <typename RealPacket, typename LhsPacket, bool Conjugate, int StorageOrder>
+EIGEN_ALWAYS_INLINE void pmadd_complex_real_MMA(LhsPacket& a, RealPacket& b, __vector_quad* c) {
+  RealPacket a2 = convertReal(a);
+  if (Conjugate) {
+    RealPacket b2 = pconj2(convertComplex(b)).v;
+    if (StorageOrder == ColMajor) {
+      return pger_vecMMA<RealPacket, RealPacket, false>(c, b2, a2);
+    } else {
+      return pger_vecMMA<RealPacket, RealPacket, false>(c, a2, b2);
+    }
+  } else {
+    if (StorageOrder == ColMajor) {
+      return pger_vecMMA<RealPacket, RealPacket, false>(c, b, a2);
+    } else {
+      return pger_vecMMA<RealPacket, RealPacket, false>(c, a2, b);
+    }
+  }
+}
+
+/** \internal madd for real times complex (MMA version) */
+template <typename RealPacket, typename LhsPacket, bool Conjugate, int StorageOrder>
+EIGEN_ALWAYS_INLINE void pmadd_complex_real_MMA(__vector_pair& a, RealPacket& b, __vector_quad* c) {
+  if (Conjugate) {
+    RealPacket b2 = pconj2(convertComplex(b)).v;
+    return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b2);
+  } else {
+    return pger_vecMMA<RealPacket, __vector_pair, false>(c, a, b);
+  }
+}
+
+/** \internal core multiply operation for vectors (MMA version) - complex times complex */
+template <typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename ResPacket,
+          bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_complex_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0) {
+  ScalarPacket b0;
+  if (StorageOrder == ColMajor) {
+    b0 = pload_realimag_combine(b);
+  } else {
+    b0 = pload_realimag_combine_row(b);
+  }
+  pmadd_complex_complex_MMA<ScalarPacket, LhsPacket, ConjugateLhs, ConjugateRhs, false>(a0, b0, c0);
+}
+
+/** \internal core multiply operation for vectors (MMA version) - complex times real */
+template <typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename ResPacket,
+          bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_complex_real_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0) {
+  pload_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, ResPacket>(a0);
+  ScalarPacket b0;
+  if (StorageOrder == ColMajor) {
+    b0 = pload_real(b);
+  } else {
+    b0 = pload_real_row<ResPacket>(b);
+  }
+  pmadd_complex_real_MMA<ScalarPacket, LhsPacket, ConjugateLhs, ColMajor>(a0, b0, c0);
+}
+
+/** \internal core multiply operation for vectors (MMA version) - real times complex */
+template <typename ScalarPacket, typename LhsPacket, typename SLhsPacket, typename RhsScalar, typename ResPacket,
+          bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_real_complex_MMA(SLhsPacket& a0, RhsScalar* b, __vector_quad* c0) {
+  ScalarPacket b0;
+  if (StorageOrder == ColMajor) {
+    b0 = pload_complex_full(b);
+  } else {
+    b0 = pload_complex_full_row(b);
+  }
+  pmadd_complex_real_MMA<ScalarPacket, LhsPacket, ConjugateRhs,
+                         (sizeof(RhsScalar) == sizeof(std::complex<float>)) ? StorageOrder : ColMajor>(a0, b0, c0);
+}
+
+#define GEMV_MULT_COMPLEX_COMPLEX_MMA(LhsType, RhsType)                                                             \
+  template <typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar, \
+            typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>         \
+  EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) {                      \
+    gemv_mult_complex_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs,          \
+                                  ConjugateRhs, StorageOrder>(a0, b, c0);                                           \
+  }
+
+GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet2cf, std::complex<float>)
+GEMV_MULT_COMPLEX_COMPLEX_MMA(__vector_pair, std::complex<float>)
+GEMV_MULT_COMPLEX_COMPLEX_MMA(Packet1cd, std::complex<double>)
+
+/** \internal core multiply operation for vectors (MMA version) - complex times complex */
+template <typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar,
+          typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>
+EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(__vector_pair& a0, std::complex<double>* b, __vector_quad* c0) {
+  if (sizeof(LhsScalar) == 16) {
+    gemv_mult_complex_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs,
+                                  StorageOrder>(a0, b, c0);
+  } else {
+    gemv_mult_real_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs,
+                               StorageOrder>(a0, b, c0);
+  }
+}
+
+#define GEMV_MULT_REAL_COMPLEX_MMA(LhsType, RhsType)                                                                  \
+  template <typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar,   \
+            typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>           \
+  EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) {                        \
+    gemv_mult_real_complex_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, \
+                               StorageOrder>(a0, b, c0);                                                              \
+  }
+
+GEMV_MULT_REAL_COMPLEX_MMA(Packet4f, std::complex<float>)
+GEMV_MULT_REAL_COMPLEX_MMA(Packet2d, std::complex<double>)
+
+#define GEMV_MULT_COMPLEX_REAL_MMA(LhsType, RhsType)                                                                  \
+  template <typename ScalarPacket, typename LhsScalar, typename LhsPacket, typename SLhsPacket, typename RhsScalar,   \
+            typename RhsPacket, typename ResPacket, bool ConjugateLhs, bool ConjugateRhs, int StorageOrder>           \
+  EIGEN_ALWAYS_INLINE void gemv_mult_complex_MMA(LhsType& a0, RhsType* b, __vector_quad* c0) {                        \
+    gemv_mult_complex_real_MMA<ScalarPacket, LhsPacket, SLhsPacket, RhsScalar, ResPacket, ConjugateLhs, ConjugateRhs, \
+                               StorageOrder>(a0, b, c0);                                                              \
+  }
+
+GEMV_MULT_COMPLEX_REAL_MMA(Packet2cf, float)
+GEMV_MULT_COMPLEX_REAL_MMA(Packet1cd, double)
+GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair, float)
+GEMV_MULT_COMPLEX_REAL_MMA(__vector_pair, double)
+
+/** \internal disassemble MMA accumulator results into packets */
+template <typename Scalar, typename ScalarPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs,
+          bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE void disassembleResults2(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0) {
+  __builtin_mma_disassemble_acc(&result0.packet, c0);
+  if (sizeof(LhsPacket) == 16) {
+    if (sizeof(RhsPacket) == 16) {
+      ScalarPacket tmp0, tmp2;
+      tmp2 = vec_mergeh(result0.packet[2], result0.packet[3]);
+      tmp0 = vec_mergeh(result0.packet[0], result0.packet[1]);
+      result0.packet[3] = vec_mergel(result0.packet[3], result0.packet[2]);
+      result0.packet[1] = vec_mergel(result0.packet[1], result0.packet[0]);
+      result0.packet[2] = tmp2;
+      result0.packet[0] = tmp0;
+
+      if (ConjugateLhs) {
+        result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+        result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v;
+      } else if (ConjugateRhs) {
+        result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v;
+        result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v;
+      } else {
+        result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v;
+        result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v;
+      }
+      result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
+      result0.packet[2] = vec_add(result0.packet[2], result0.packet[3]);
+    } else {
+      result0.packet[0][1] = result0.packet[1][1];
+      result0.packet[2][1] = result0.packet[3][1];
+    }
+  }
+}
+
+template <typename Scalar, typename ScalarPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs,
+          bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE void disassembleResults4(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0) {
+  __builtin_mma_disassemble_acc(&result0.packet, c0);
+  if (GEMV_IS_COMPLEX_COMPLEX) {
+    if (ConjugateLhs) {
+      result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+      result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v;
+    } else {
+      if (ConjugateRhs) {
+        result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v;
+      } else {
+        result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v;
+      }
+    }
+    result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
+  } else if (sizeof(LhsPacket) == sizeof(std::complex<float>)) {
+    if (ConjugateLhs) {
+      result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+    }
+  } else {
+    result0.packet[0] = vec_mergee(result0.packet[0], result0.packet[1]);
+  }
+}
+
+template <typename Scalar, typename ScalarPacket, int ResPacketSize, typename LhsPacket, typename RhsPacket,
+          bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE void disassembleResults(__vector_quad* c0, PacketBlock<ScalarPacket, 4>& result0) {
+  if (!GEMV_IS_COMPLEX_FLOAT) {
+    disassembleResults2<Scalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(c0, result0);
+  } else {
+    disassembleResults4<Scalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(c0, result0);
+  }
+}
+#endif
+
+#define GEMV_GETN_COMPLEX(N) (((N) * ResPacketSize) >> 1)
+
+#define GEMV_LOADPACKET_COL_COMPLEX(iter) \
+  loadLhsPacket<Scalar, LhsScalar, LhsMapper, PLhsPacket>(lhs, i + ((iter) * ResPacketSize), j)
+
+#define GEMV_LOADPACKET_COL_COMPLEX_DATA(iter) convertReal(GEMV_LOADPACKET_COL_COMPLEX(iter))
+
+#ifdef USE_GEMV_MMA
+#define GEMV_INIT_COL_COMPLEX_MMA(iter, N) \
+  if (GEMV_GETN_COMPLEX(N) > iter) {       \
+    __builtin_mma_xxsetaccz(&e0##iter);    \
+  }
+
+#if EIGEN_COMP_LLVM
+#define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2)                     \
+  GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), \
+                     GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1));    \
+  EIGEN_UNUSED_VARIABLE(f##iter1);
+#else
+#define GEMV_LOADPAIR_COL_COMPLEX_MMA(iter1, iter2)                                                         \
+  if (sizeof(LhsPacket) == 16) {                                                                            \
+    const LhsScalar& src = lhs(i + ((32 * iter1) / sizeof(LhsScalar)), j);                                  \
+    a##iter1 = *reinterpret_cast<__vector_pair*>(const_cast<LhsScalar*>(&src));                             \
+    EIGEN_UNUSED_VARIABLE(f##iter1);                                                                        \
+  } else {                                                                                                  \
+    f##iter1 = lhs.template load<PLhsPacket, Unaligned>(i + ((iter2) * ResPacketSize), j);                  \
+    GEMV_BUILDPAIR_MMA(a##iter1, vec_splat(convertReal(f##iter1), 0), vec_splat(convertReal(f##iter1), 1)); \
+  }
+#endif
+
+#define GEMV_LOAD1_COL_COMPLEX_MMA(iter, N)          \
+  if (GEMV_GETN_COMPLEX(N) > iter) {                 \
+    if (GEMV_IS_COMPLEX_FLOAT) {                     \
+      f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter);   \
+      EIGEN_UNUSED_VARIABLE(a##iter);                \
+    } else {                                         \
+      GEMV_LOADPAIR_COL_COMPLEX_MMA(iter, iter << 1) \
+    }                                                \
+  } else {                                           \
+    EIGEN_UNUSED_VARIABLE(a##iter);                  \
+    EIGEN_UNUSED_VARIABLE(f##iter);                  \
+  }
+
+#define GEMV_WORK1_COL_COMPLEX_MMA(iter, N)                                                                      \
+  if (GEMV_GETN_COMPLEX(N) > iter) {                                                                             \
+    if (GEMV_IS_COMPLEX_FLOAT) {                                                                                 \
+      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket,    \
+                            ConjugateLhs, ConjugateRhs, ColMajor>(f##iter, b, &e0##iter);                        \
+    } else {                                                                                                     \
+      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, \
+                            ConjugateLhs, ConjugateRhs, ColMajor>(a##iter, b, &e0##iter);                        \
+    }                                                                                                            \
+  }
+
+#define GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter1, iter2) \
+  GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_COL_COMPLEX_DATA(iter2), GEMV_LOADPACKET_COL_COMPLEX_DATA((iter2) + 1));
+
+#define GEMV_LOAD2_COL_COMPLEX_MMA(iter1, iter2, iter3, N) \
+  if (GEMV_GETN_COMPLEX(N) > iter1) {                      \
+    if (GEMV_IS_COMPLEX_FLOAT) {                           \
+      GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2);        \
+      EIGEN_UNUSED_VARIABLE(a##iter3)                      \
+    } else {                                               \
+      GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter2, iter2 << 1);   \
+      GEMV_LOADPAIR2_COL_COMPLEX_MMA(iter3, iter3 << 1);   \
+    }                                                      \
+  } else {                                                 \
+    EIGEN_UNUSED_VARIABLE(a##iter2);                       \
+    EIGEN_UNUSED_VARIABLE(a##iter3);                       \
+  }                                                        \
+  EIGEN_UNUSED_VARIABLE(f##iter2);                         \
+  EIGEN_UNUSED_VARIABLE(f##iter3);
+
+#define GEMV_WORK2_COL_COMPLEX_MMA(iter1, iter2, iter3, N)                                                       \
+  if (GEMV_GETN_COMPLEX(N) > iter1) {                                                                            \
+    if (GEMV_IS_COMPLEX_FLOAT) {                                                                                 \
+      PLhsPacket g[2];                                                                                           \
+      __builtin_vsx_disassemble_pair(reinterpret_cast<void*>(g), &a##iter2);                                     \
+      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket,    \
+                            ConjugateLhs, ConjugateRhs, ColMajor>(g[0], b, &e0##iter2);                          \
+      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket,    \
+                            ConjugateLhs, ConjugateRhs, ColMajor>(g[1], b, &e0##iter3);                          \
+    } else {                                                                                                     \
+      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, \
+                            ConjugateLhs, ConjugateRhs, ColMajor>(a##iter2, b, &e0##iter2);                      \
+      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, \
+                            ConjugateLhs, ConjugateRhs, ColMajor>(a##iter3, b, &e0##iter3);                      \
+    }                                                                                                            \
+  }
+
+#if EIGEN_COMP_LLVM
+#define GEMV_LOAD_COL_COMPLEX_MMA(N)                       \
+  if (GEMV_GETN_COMPLEX(N) > 1) {                          \
+    GEMV_UNROLL_HALF(GEMV_LOAD2_COL_COMPLEX_MMA, (N >> 1)) \
+  } else {                                                 \
+    GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, N)             \
+  }
+
+#define GEMV_WORK_COL_COMPLEX_MMA(N)                       \
+  if (GEMV_GETN_COMPLEX(N) > 1) {                          \
+    GEMV_UNROLL_HALF(GEMV_WORK2_COL_COMPLEX_MMA, (N >> 1)) \
+  } else {                                                 \
+    GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N)             \
+  }
+#else
+#define GEMV_LOAD_COL_COMPLEX_MMA(N) GEMV_UNROLL(GEMV_LOAD1_COL_COMPLEX_MMA, N)
+
+#define GEMV_WORK_COL_COMPLEX_MMA(N) GEMV_UNROLL(GEMV_WORK1_COL_COMPLEX_MMA, N)
+#endif
+
+#define GEMV_DISASSEMBLE_COMPLEX_MMA(iter)                                                                   \
+  disassembleResults<Scalar, ScalarPacket, ResPacketSize, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>( \
+      &e0##iter, result0##iter);
+
+#define GEMV_STORE_COL_COMPLEX_MMA(iter, N)                                                     \
+  if (GEMV_GETN_COMPLEX(N) > iter) {                                                            \
+    GEMV_DISASSEMBLE_COMPLEX_MMA(iter);                                                         \
+    c0##iter = PResPacket(result0##iter.packet[0]);                                             \
+    if (GEMV_IS_COMPLEX_FLOAT) {                                                                \
+      pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>( \
+          c0##iter, alpha_data, res + i + (iter * ResPacketSize));                              \
+    } else {                                                                                    \
+      pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>( \
+          c0##iter, alpha_data, res + i + ((iter << 1) * ResPacketSize));                       \
+      c0##iter = PResPacket(result0##iter.packet[2]);                                           \
+      pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>( \
+          c0##iter, alpha_data, res + i + (((iter << 1) + 1) * ResPacketSize));                 \
+    }                                                                                           \
+  }
+
+#define GEMV_STORE2_COL_COMPLEX_MMA(iter1, iter2, iter3, N)                                                        \
+  if (GEMV_GETN_COMPLEX(N) > iter1) {                                                                              \
+    GEMV_DISASSEMBLE_COMPLEX_MMA(iter2);                                                                           \
+    GEMV_DISASSEMBLE_COMPLEX_MMA(iter3);                                                                           \
+    c0##iter2 = PResPacket(result0##iter2.packet[0]);                                                              \
+    if (GEMV_IS_COMPLEX_FLOAT) {                                                                                   \
+      c0##iter3 = PResPacket(result0##iter3.packet[0]);                                                            \
+      pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter2>(      \
+          c0##iter2, c0##iter3, alpha_data, res + i);                                                              \
+    } else {                                                                                                       \
+      c0##iter3 = PResPacket(result0##iter2.packet[2]);                                                            \
+      pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter2 << 1>( \
+          c0##iter2, c0##iter3, alpha_data, res + i);                                                              \
+      c0##iter2 = PResPacket(result0##iter3.packet[0]);                                                            \
+      c0##iter3 = PResPacket(result0##iter3.packet[2]);                                                            \
+      pstoreu_pmadd_complex<ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData, ResPacketSize, iter3 << 1>( \
+          c0##iter2, c0##iter3, alpha_data, res + i);                                                              \
+    }                                                                                                              \
+  }
+
+#define GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N)                 \
+  GEMV_UNROLL(GEMV_INIT_COL_COMPLEX_MMA, N)                 \
+  Index j = j2;                                             \
+  do {                                                      \
+    const RhsScalar& b1 = rhs2(j, 0);                       \
+    RhsScalar* b = const_cast<RhsScalar*>(&b1);             \
+    GEMV_UNROLL(GEMV_PREFETCH, N)                           \
+    GEMV_LOAD_COL_COMPLEX_MMA(N)                            \
+    GEMV_WORK_COL_COMPLEX_MMA(N)                            \
+  } while (++j < jend);                                     \
+  if (GEMV_GETN(N) <= 2) {                                  \
+    GEMV_UNROLL(GEMV_STORE_COL_COMPLEX_MMA, N)              \
+  } else {                                                  \
+    GEMV_UNROLL_HALF(GEMV_STORE2_COL_COMPLEX_MMA, (N >> 1)) \
+  }                                                         \
+  i += (ResPacketSize * N);
+#endif
+
+#define GEMV_INIT_COMPLEX(iter, N)                                   \
+  if (N > iter) {                                                    \
+    c0##iter = pset_zero<PResPacket>();                              \
+    c1##iter = pset_init<ResPacket, LhsPacket, RhsPacket>(c1##iter); \
+  } else {                                                           \
+    EIGEN_UNUSED_VARIABLE(c0##iter);                                 \
+    EIGEN_UNUSED_VARIABLE(c1##iter);                                 \
+  }
+
+#define GEMV_WORK_COL_COMPLEX(iter, N)                                                                     \
+  if (N > iter) {                                                                                          \
+    f##iter = GEMV_LOADPACKET_COL_COMPLEX(iter);                                                           \
+    gemv_mult_complex<ScalarPacket, PLhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, \
+                      ConjugateRhs, ColMajor>(f##iter, b, c0##iter, c1##iter);                             \
+  } else {                                                                                                 \
+    EIGEN_UNUSED_VARIABLE(f##iter);                                                                        \
+  }
+
+#define GEMV_STORE_COL_COMPLEX(iter, N)                                                       \
+  if (N > iter) {                                                                             \
+    if (GEMV_IS_COMPLEX_COMPLEX) {                                                            \
+      c0##iter = padd(c0##iter, c1##iter);                                                    \
+    }                                                                                         \
+    pstoreu_pmadd_complex<Scalar, ScalarPacket, PResPacket, ResPacket, ResScalar, AlphaData>( \
+        c0##iter, alpha_data, res + i + (iter * ResPacketSize));                              \
+  }
+
+/** \internal main macro for gemv_complex_col - initialize accumulators, multiply and add inputs, and store results */
+#define GEMV_PROCESS_COL_COMPLEX_ONE(N)         \
+  GEMV_UNROLL(GEMV_INIT_COMPLEX, N)             \
+  Index j = j2;                                 \
+  do {                                          \
+    const RhsScalar& b1 = rhs2(j, 0);           \
+    RhsScalar* b = const_cast<RhsScalar*>(&b1); \
+    GEMV_UNROLL(GEMV_PREFETCH, N)               \
+    GEMV_UNROLL(GEMV_WORK_COL_COMPLEX, N)       \
+  } while (++j < jend);                         \
+  GEMV_UNROLL(GEMV_STORE_COL_COMPLEX, N)        \
+  i += (ResPacketSize * N);
+
+#if defined(USE_GEMV_MMA) && (EIGEN_COMP_LLVM || defined(USE_SLOWER_GEMV_MMA))
+#define USE_GEMV_COL_COMPLEX_MMA
+#endif
+
+#ifdef USE_GEMV_COL_COMPLEX_MMA
+#define GEMV_PROCESS_COL_COMPLEX(N) GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N)
+#else
+#if defined(USE_GEMV_MMA) && (__GNUC__ > 10)
+#define GEMV_PROCESS_COL_COMPLEX(N)          \
+  if (sizeof(Scalar) != sizeof(LhsPacket)) { \
+    GEMV_PROCESS_COL_COMPLEX_ONE_MMA(N)      \
+  } else {                                   \
+    GEMV_PROCESS_COL_COMPLEX_ONE(N)          \
+  }
+#else
+#define GEMV_PROCESS_COL_COMPLEX(N) GEMV_PROCESS_COL_COMPLEX_ONE(N)
+#endif
+#endif
+
+template <typename Scalar, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, bool LhsIsReal,
+          typename RhsScalar, typename RhsMapper, bool ConjugateRhs, bool RhsIsReal, typename ResScalar>
+EIGEN_STRONG_INLINE void gemv_complex_col(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
+                                          ResScalar* res, Index resIncr, ResScalar alpha) {
+  typedef gemv_traits<LhsScalar, RhsScalar> Traits;
+
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
+
+  typedef typename packet_traits<Scalar>::type ScalarPacket;
+  typedef typename packet_traits<LhsScalar>::type PLhsPacket;
+  typedef typename packet_traits<ResScalar>::type PResPacket;
+  typedef gemv_traits<ResPacket, ResPacket> PTraits;
+
+  EIGEN_UNUSED_VARIABLE(resIncr);
+  eigen_internal_assert(resIncr == 1);
+
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate proper code.
+  LhsMapper lhs(alhs);
+  RhsMapper rhs2(rhs);
+
+  conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
+
+  const Index lhsStride = lhs.stride();
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum {
+    LhsAlignment = Unaligned,
+    ResPacketSize = PTraits::ResPacketSize,
+    LhsPacketSize = PTraits::LhsPacketSize,
+    RhsPacketSize = PTraits::RhsPacketSize,
+  };
+#ifdef EIGEN_POWER_USE_GEMV_PREFETCH
+  const Index prefetch_dist = 64 * LhsPacketSize;
+#endif
+
+#ifndef GCC_ONE_VECTORPAIR_BUG
+  const Index n8 = rows - 8 * ResPacketSize + 1;
+  const Index n4 = rows - 4 * ResPacketSize + 1;
+  const Index n2 = rows - 2 * ResPacketSize + 1;
+#endif
+  const Index n1 = rows - 1 * ResPacketSize + 1;
+
+  // TODO: improve the following heuristic:
+  const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 16000 ? 16 : 8);
+
+  typedef alpha_store<PResPacket, ResPacket, ResScalar, Scalar> AlphaData;
+  AlphaData alpha_data(alpha);
+
+  for (Index j2 = 0; j2 < cols; j2 += block_cols) {
+    Index jend = numext::mini(j2 + block_cols, cols);
+    Index i = 0;
+    PResPacket c00, c01, c02, c03, c04, c05, c06, c07;
+    ResPacket c10, c11, c12, c13, c14, c15, c16, c17;
+    PLhsPacket f0, f1, f2, f3, f4, f5, f6, f7;
+#ifdef USE_GEMV_MMA
+    __vector_quad e00, e01, e02, e03, e04, e05, e06, e07;
+    __vector_pair a0, a1, a2, a3, a4, a5, a6, a7;
+    PacketBlock<ScalarPacket, 4> result00, result01, result02, result03, result04, result05, result06, result07;
+    GEMV_UNUSED(8, e0)
+    GEMV_UNUSED(8, result0)
+    GEMV_UNUSED(8, a)
+    GEMV_UNUSED(8, f)
+#if !defined(GCC_ONE_VECTORPAIR_BUG) && defined(USE_GEMV_COL_COMPLEX_MMA)
+    if (GEMV_IS_COMPLEX_COMPLEX || !GEMV_IS_COMPLEX_FLOAT)
+#endif
+#endif
+#ifndef GCC_ONE_VECTORPAIR_BUG
+    {
+      while (i < n8) {
+        GEMV_PROCESS_COL_COMPLEX(8)
+      }
+    }
+    while (i < n4) {
+      GEMV_PROCESS_COL_COMPLEX(4)
+    }
+    if (i < n2) {
+      GEMV_PROCESS_COL_COMPLEX(2)
+    }
+    if (i < n1)
+#else
+    while (i < n1)
+#endif
+    {
+      GEMV_PROCESS_COL_COMPLEX_ONE(1)
+    }
+    for (; i < rows; ++i) {
+      ResScalar d0(0);
+      Index j = j2;
+      do {
+        d0 += cj.pmul(lhs(i, j), rhs2(j, 0));
+      } while (++j < jend);
+      res[i] += alpha * d0;
+    }
+  }
+}
+
+template <typename Scalar, int N>
+struct ScalarBlock {
+  Scalar scalar[N];
+};
+
+#ifdef USE_GEMV_MMA
+static Packet16uc p16uc_ELEMENT_3 = {0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f,
+                                     0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f};
+
+/** \internal predux (add elements of a vector) from a MMA accumulator - real results */
+template <typename ResScalar, typename ResPacket>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(__vector_quad* acc0, __vector_quad* acc1) {
+  PacketBlock<ResPacket, 4> result0, result1;
+  __builtin_mma_disassemble_acc(&result0.packet, acc0);
+  __builtin_mma_disassemble_acc(&result1.packet, acc1);
+  result0.packet[0] = vec_mergeh(result0.packet[0], result1.packet[0]);
+  result0.packet[1] = vec_mergeo(result0.packet[1], result1.packet[1]);
+  result0.packet[2] = vec_mergel(result0.packet[2], result1.packet[2]);
+  result0.packet[3] = vec_perm(result0.packet[3], result1.packet[3], p16uc_ELEMENT_3);
+  result0.packet[0] =
+      vec_add(vec_add(result0.packet[0], result0.packet[2]), vec_add(result0.packet[1], result0.packet[3]));
+  return *reinterpret_cast<ScalarBlock<ResScalar, 2>*>(&result0.packet[0]);
+}
+
+template <>
+EIGEN_ALWAYS_INLINE ScalarBlock<double, 2> predux_real<double, Packet2d>(__vector_quad* acc0, __vector_quad* acc1) {
+  PacketBlock<Packet2d, 4> result0, result1;
+  __builtin_mma_disassemble_acc(&result0.packet, acc0);
+  __builtin_mma_disassemble_acc(&result1.packet, acc1);
+  result0.packet[0] =
+      vec_add(vec_mergeh(result0.packet[0], result1.packet[0]), vec_mergel(result0.packet[1], result1.packet[1]));
+  return *reinterpret_cast<ScalarBlock<double, 2>*>(&result0.packet[0]);
+}
+
+/** \internal add complex results together */
+template <typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE ScalarBlock<std::complex<float>, 2> addComplexResults(PacketBlock<Packet4f, 4>& result0,
+                                                                          PacketBlock<Packet4f, 4>& result1) {
+  ScalarBlock<std::complex<float>, 2> cc0;
+  result0.packet[0] = reinterpret_cast<Packet4f>(
+      vec_mergeh(reinterpret_cast<Packet2d>(result0.packet[0]), reinterpret_cast<Packet2d>(result1.packet[0])));
+  result0.packet[2] = reinterpret_cast<Packet4f>(
+      vec_mergel(reinterpret_cast<Packet2d>(result0.packet[2]), reinterpret_cast<Packet2d>(result1.packet[2])));
+  result0.packet[0] = vec_add(result0.packet[0], result0.packet[2]);
+  if (GEMV_IS_COMPLEX_COMPLEX) {
+    result0.packet[1] = reinterpret_cast<Packet4f>(
+        vec_mergeh(reinterpret_cast<Packet2d>(result0.packet[1]), reinterpret_cast<Packet2d>(result1.packet[1])));
+    result0.packet[3] = reinterpret_cast<Packet4f>(
+        vec_mergel(reinterpret_cast<Packet2d>(result0.packet[3]), reinterpret_cast<Packet2d>(result1.packet[3])));
+    result0.packet[1] = vec_add(result0.packet[1], result0.packet[3]);
+    if (ConjugateLhs) {
+      result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+      result0.packet[1] = pcplxflip2(convertComplex(result0.packet[1])).v;
+    } else if (ConjugateRhs) {
+      result0.packet[1] = pcplxconjflip(convertComplex(result0.packet[1])).v;
+    } else {
+      result0.packet[1] = pcplxflipconj(convertComplex(result0.packet[1])).v;
+    }
+    result0.packet[0] = vec_add(result0.packet[0], result0.packet[1]);
+  } else {
+    if (ConjugateLhs && (sizeof(LhsPacket) == sizeof(std::complex<float>))) {
+      result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+    }
+  }
+  cc0.scalar[0].real(result0.packet[0][0]);
+  cc0.scalar[0].imag(result0.packet[0][1]);
+  cc0.scalar[1].real(result0.packet[0][2]);
+  cc0.scalar[1].imag(result0.packet[0][3]);
+  return cc0;
+}
+
+template <typename LhsPacket, typename RhsPacket, bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE ScalarBlock<std::complex<double>, 2> addComplexResults(PacketBlock<Packet2d, 4>&,
+                                                                           PacketBlock<Packet2d, 4>&) {
+  ScalarBlock<std::complex<double>, 2> cc0;
+  EIGEN_UNUSED_VARIABLE(cc0);
+  return cc0;  // Just for compilation
+}
+
+/** \internal predux (add elements of a vector) from a MMA accumulator - complex results */
+template <typename ResScalar, typename ResPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs,
+          bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(__vector_quad* acc0, __vector_quad* acc1) {
+  PacketBlock<ResPacket, 4> result0, result1;
+  __builtin_mma_disassemble_acc(&result0.packet, acc0);
+  __builtin_mma_disassemble_acc(&result1.packet, acc1);
+  return addComplexResults<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(result0, result1);
+}
+
+template <typename ResScalar, typename ResPacket>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(__vector_quad* acc0) {
+  PacketBlock<ResPacket, 4> result0;
+  __builtin_mma_disassemble_acc(&result0.packet, acc0);
+  result0.packet[0] =
+      vec_add(vec_mergeh(result0.packet[0], result0.packet[2]), vec_mergel(result0.packet[1], result0.packet[3]));
+  return *reinterpret_cast<ScalarBlock<ResScalar, 2>*>(&result0.packet[0]);
+}
+
+template <typename ResScalar, typename ResPacket, typename LhsPacket, typename RhsPacket, bool ConjugateLhs,
+          bool ConjugateRhs>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(__vector_quad* acc0) {
+  ScalarBlock<ResScalar, 2> cc0;
+  PacketBlock<ResPacket, 4> result0;
+  __builtin_mma_disassemble_acc(&result0.packet, acc0);
+  if (GEMV_IS_COMPLEX_COMPLEX) {
+    if (ConjugateLhs) {
+      result0.packet[1] = pconjinv(convertComplex(result0.packet[1])).v;
+      result0.packet[3] = pconjinv(convertComplex(result0.packet[3])).v;
+    } else if (ConjugateRhs) {
+      result0.packet[0] = pconj2(convertComplex(result0.packet[0])).v;
+      result0.packet[2] = pconj2(convertComplex(result0.packet[2])).v;
+    } else {
+      result0.packet[1] = pconj2(convertComplex(result0.packet[1])).v;
+      result0.packet[3] = pconj2(convertComplex(result0.packet[3])).v;
+    }
+    result0.packet[0] = vec_add(result0.packet[0], __builtin_vsx_xxpermdi(result0.packet[1], result0.packet[1], 2));
+    result0.packet[2] = vec_add(result0.packet[2], __builtin_vsx_xxpermdi(result0.packet[3], result0.packet[3], 2));
+  } else {
+    result0.packet[0] = __builtin_vsx_xxpermdi(result0.packet[0], result0.packet[1], 1);
+    result0.packet[2] = __builtin_vsx_xxpermdi(result0.packet[2], result0.packet[3], 1);
+  }
+  cc0.scalar[0].real(result0.packet[0][0]);
+  cc0.scalar[0].imag(result0.packet[0][1]);
+  cc0.scalar[1].real(result0.packet[2][0]);
+  cc0.scalar[1].imag(result0.packet[2][1]);
+  return cc0;
+}
+#endif
+
+template <typename ResScalar, typename ResPacket>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_real(ResPacket& a, ResPacket& b) {
+  ScalarBlock<ResScalar, 2> cc0;
+  cc0.scalar[0] = predux(a);
+  cc0.scalar[1] = predux(b);
+  return cc0;
+}
+
+template <typename ResScalar, typename ResPacket>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(ResPacket& a, ResPacket& b) {
+  return predux_real<ResScalar, ResPacket>(a, b);
+}
+
+#define GEMV_UNROLL_ROW(func, N) func(0, N) func(1, N) func(2, N) func(3, N) func(4, N) func(5, N) func(6, N) func(7, N)
+
+#define GEMV_UNROLL_ROW_HALF(func, N) func(0, 0, 1, N) func(1, 2, 3, N) func(2, 4, 5, N) func(3, 6, 7, N)
+
+#define GEMV_LOADPACKET_ROW(iter) lhs.template load<LhsPacket, Unaligned>(i + (iter), j)
+
+#ifdef USE_GEMV_MMA
+#define GEMV_UNROLL3_ROW(func, N, which)                                                                      \
+  func(0, N, which) func(1, N, which) func(2, N, which) func(3, N, which) func(4, N, which) func(5, N, which) \
+      func(6, N, which) func(7, N, which)
+
+#define GEMV_UNUSED_ROW(N, which) GEMV_UNROLL3_ROW(GEMV_UNUSED_VAR, N, which)
+
+#define GEMV_INIT_ROW(iter, N)         \
+  if (GEMV_GETN(N) > iter) {           \
+    __builtin_mma_xxsetaccz(&c##iter); \
+  }
+
+#define GEMV_LOADPAIR_ROW(iter1, iter2) \
+  GEMV_BUILDPAIR_MMA(b##iter1, GEMV_LOADPACKET_ROW(iter2), GEMV_LOADPACKET_ROW((iter2) + 1));
+
+#define GEMV_WORK_ROW(iter, N)                                                              \
+  if (GEMV_GETN(N) > iter) {                                                                \
+    if (GEMV_IS_FLOAT) {                                                                    \
+      pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&c##iter, a0, GEMV_LOADPACKET_ROW(iter)); \
+    } else {                                                                                \
+      __vector_pair b##iter;                                                                \
+      GEMV_LOADPAIR_ROW(iter, iter << 1)                                                    \
+      pger_vecMMA_acc<LhsPacket, RhsPacket, true>(&c##iter, b##iter, a0);                   \
+    }                                                                                       \
+  }
+
+#define GEMV_PREDUX2(iter1, iter2, iter3, N)                               \
+  if (N > iter1) {                                                         \
+    if (GEMV_IS_FLOAT) {                                                   \
+      cc##iter1 = predux_real<ResScalar, ResPacket>(&c##iter2, &c##iter3); \
+    } else {                                                               \
+      cc##iter1 = predux_real<ResScalar, ResPacket>(&c##iter1);            \
+    }                                                                      \
+  } else {                                                                 \
+    EIGEN_UNUSED_VARIABLE(cc##iter1);                                      \
+  }
+#else
+#define GEMV_INIT_ROW(iter, N)                \
+  if (N > iter) {                             \
+    c##iter = pset1<ResPacket>(ResScalar(0)); \
+  } else {                                    \
+    EIGEN_UNUSED_VARIABLE(c##iter);           \
+  }
+
+#define GEMV_WORK_ROW(iter, N)                                   \
+  if (N > iter) {                                                \
+    c##iter = pcj.pmadd(GEMV_LOADPACKET_ROW(iter), a0, c##iter); \
+  }
+
+#define GEMV_PREDUX2(iter1, iter2, iter3, N)                           \
+  if (N > iter1) {                                                     \
+    cc##iter1 = predux_real<ResScalar, ResPacket>(c##iter2, c##iter3); \
+  } else {                                                             \
+    EIGEN_UNUSED_VARIABLE(cc##iter1);                                  \
+  }
+#endif
+
+#define GEMV_MULT(iter1, iter2, iter3, N)                  \
+  if (N > iter1) {                                         \
+    cc##iter1.scalar[0] += cj.pmul(lhs(i + iter2, j), a0); \
+    cc##iter1.scalar[1] += cj.pmul(lhs(i + iter3, j), a0); \
+  }
+
+#define GEMV_STORE_ROW(iter1, iter2, iter3, N)                                           \
+  if (N > iter1) {                                                                       \
+    storeMaddData<ResScalar>(res + ((i + iter2) * resIncr), alpha, cc##iter1.scalar[0]); \
+    storeMaddData<ResScalar>(res + ((i + iter3) * resIncr), alpha, cc##iter1.scalar[1]); \
+  }
+
+/** \internal main macro for gemv_row - initialize accumulators, multiply and add inputs, predux and store results */
+#define GEMV_PROCESS_ROW(N)                                       \
+  for (; i < n##N; i += N) {                                      \
+    GEMV_UNROLL_ROW(GEMV_INIT_ROW, N)                             \
+    Index j = 0;                                                  \
+    for (; j + LhsPacketSize <= cols; j += LhsPacketSize) {       \
+      RhsPacket a0 = rhs2.template load<RhsPacket, Unaligned>(j); \
+      GEMV_UNROLL_ROW(GEMV_WORK_ROW, N)                           \
+    }                                                             \
+    GEMV_UNROLL_ROW_HALF(GEMV_PREDUX2, (N >> 1))                  \
+    for (; j < cols; ++j) {                                       \
+      RhsScalar a0 = rhs2(j);                                     \
+      GEMV_UNROLL_ROW_HALF(GEMV_MULT, (N >> 1))                   \
+    }                                                             \
+    GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW, (N >> 1))                \
+  }
+
+template <typename LhsScalar, typename LhsMapper, typename RhsScalar, typename RhsMapper, typename ResScalar>
+EIGEN_STRONG_INLINE void gemv_row(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs, ResScalar* res,
+                                  Index resIncr, ResScalar alpha) {
+  typedef gemv_traits<LhsScalar, RhsScalar> Traits;
+
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
+
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate proper code.
+  LhsMapper lhs(alhs);
+  typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
+
+  eigen_internal_assert(rhs.stride() == 1);
+  conj_helper<LhsScalar, RhsScalar, false, false> cj;
+  conj_helper<LhsPacket, RhsPacket, false, false> pcj;
+
+  // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
+  //       processing 8 rows at once might be counter productive wrt cache.
+#ifndef GCC_ONE_VECTORPAIR_BUG
+  const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? (rows - 7) : (rows - 7);
+  const Index n4 = rows - 3;
+  const Index n2 = rows - 1;
+#endif
+
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum {
+    LhsAlignment = Unaligned,
+    ResPacketSize = Traits::ResPacketSize,
+    LhsPacketSize = Traits::LhsPacketSize,
+    RhsPacketSize = Traits::RhsPacketSize,
+  };
+
+  Index i = 0;
+#ifdef USE_GEMV_MMA
+  __vector_quad c0, c1, c2, c3, c4, c5, c6, c7;
+  GEMV_UNUSED_ROW(8, c)
+#else
+  ResPacket c0, c1, c2, c3, c4, c5, c6, c7;
+#endif
+#ifndef GCC_ONE_VECTORPAIR_BUG
+  ScalarBlock<ResScalar, 2> cc0, cc1, cc2, cc3;
+  GEMV_PROCESS_ROW(8)
+  GEMV_PROCESS_ROW(4)
+  GEMV_PROCESS_ROW(2)
+#endif
+  for (; i < rows; ++i) {
+    ResPacket d0 = pset1<ResPacket>(ResScalar(0));
+    Index j = 0;
+    for (; j + LhsPacketSize <= cols; j += LhsPacketSize) {
+      RhsPacket b0 = rhs2.template load<RhsPacket, Unaligned>(j);
+
+      d0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, d0);
+    }
+    ResScalar dd0 = predux(d0);
+    for (; j < cols; ++j) {
+      dd0 += cj.pmul(lhs(i, j), rhs2(j));
+    }
+    res[i * resIncr] += alpha * dd0;
+  }
+}
+
+#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(Scalar)                                                                   \
+  template <typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
+  struct general_matrix_vector_product<Index, Scalar, LhsMapper, ColMajor, ConjugateLhs, Scalar, RhsMapper,            \
+                                       ConjugateRhs, Version> {                                                        \
+    typedef typename ScalarBinaryOpTraits<Scalar, Scalar>::ReturnType ResScalar;                                       \
+                                                                                                                       \
+    EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,                  \
+                                                        const RhsMapper& rhs, ResScalar* res, Index resIncr,           \
+                                                        ResScalar alpha) {                                             \
+      gemv_col<Scalar, LhsMapper, Scalar, RhsMapper, ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha);            \
+    }                                                                                                                  \
+  };
+
+#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(Scalar)                                                                   \
+  template <typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
+  struct general_matrix_vector_product<Index, Scalar, LhsMapper, RowMajor, ConjugateLhs, Scalar, RhsMapper,            \
+                                       ConjugateRhs, Version> {                                                        \
+    typedef typename ScalarBinaryOpTraits<Scalar, Scalar>::ReturnType ResScalar;                                       \
+                                                                                                                       \
+    EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,                  \
+                                                        const RhsMapper& rhs, ResScalar* res, Index resIncr,           \
+                                                        ResScalar alpha) {                                             \
+      gemv_row<Scalar, LhsMapper, Scalar, RhsMapper, ResScalar>(rows, cols, lhs, rhs, res, resIncr, alpha);            \
+    }                                                                                                                  \
+  };
+
+EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(float)
+EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL(double)
+EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(float)
+EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW(double)
+
+#ifdef USE_GEMV_MMA
+#define gemv_bf16_col gemvMMA_bfloat16_col
+#define gemv_bf16_row gemvMMA_bfloat16_row
+#else
+#define gemv_bf16_col gemv_bfloat16_col
+#define gemv_bf16_row gemv_bfloat16_row
+#endif
+
+#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL_BFLOAT16()                                                                \
+  template <typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
+  struct general_matrix_vector_product<Index, bfloat16, LhsMapper, ColMajor, ConjugateLhs, bfloat16, RhsMapper,        \
+                                       ConjugateRhs, Version> {                                                        \
+    EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,                  \
+                                                        const RhsMapper& rhs, bfloat16* res, Index resIncr,            \
+                                                        bfloat16 alpha) {                                              \
+      gemv_bf16_col<LhsMapper, RhsMapper>(rows, cols, lhs, rhs, res, resIncr, alpha);                                  \
+    }                                                                                                                  \
+  };
+
+#define EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW_BFLOAT16()                                                                \
+  template <typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
+  struct general_matrix_vector_product<Index, bfloat16, LhsMapper, RowMajor, ConjugateLhs, bfloat16, RhsMapper,        \
+                                       ConjugateRhs, Version> {                                                        \
+    EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,                  \
+                                                        const RhsMapper& rhs, bfloat16* res, Index resIncr,            \
+                                                        bfloat16 alpha) {                                              \
+      gemv_bf16_row<LhsMapper, RhsMapper>(rows, cols, lhs, rhs, res, resIncr, alpha);                                  \
+    }                                                                                                                  \
+  };
+
+EIGEN_POWER_GEMV_REAL_SPECIALIZE_COL_BFLOAT16()
+EIGEN_POWER_GEMV_REAL_SPECIALIZE_ROW_BFLOAT16()
+
+template <typename ResScalar, typename PResPacket, typename ResPacket, typename LhsPacket, typename RhsPacket>
+EIGEN_ALWAYS_INLINE ScalarBlock<ResScalar, 2> predux_complex(PResPacket& a0, PResPacket& b0, ResPacket& a1,
+                                                             ResPacket& b1) {
+  if (GEMV_IS_COMPLEX_COMPLEX) {
+    a0 = padd(a0, a1);
+    b0 = padd(b0, b1);
+  }
+  return predux_complex<ResScalar, PResPacket>(a0, b0);
+}
+
+#define GEMV_LOADPACKET_ROW_COMPLEX(iter) loadLhsPacket<Scalar, LhsScalar, LhsMapper, PLhsPacket>(lhs, i + (iter), j)
+
+#define GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter) convertReal(GEMV_LOADPACKET_ROW_COMPLEX(iter))
+
+#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(which, N)    \
+  j = 0;                                                  \
+  for (; j + LhsPacketSize <= cols; j += LhsPacketSize) { \
+    const RhsScalar& b1 = rhs2(j);                        \
+    RhsScalar* b = const_cast<RhsScalar*>(&b1);           \
+    GEMV_UNROLL_ROW(which, N)                             \
+  }
+
+#define GEMV_PROCESS_END_ROW_COMPLEX(N)               \
+  for (; j < cols; ++j) {                             \
+    RhsScalar b0 = rhs2(j);                           \
+    GEMV_UNROLL_ROW_HALF(GEMV_MULT_COMPLEX, (N >> 1)) \
+  }                                                   \
+  GEMV_UNROLL_ROW_HALF(GEMV_STORE_ROW_COMPLEX, (N >> 1))
+
+#ifdef USE_GEMV_MMA
+#define GEMV_INIT_ROW_COMPLEX_MMA(iter, N) \
+  if (GEMV_GETN_COMPLEX(N) > iter) {       \
+    __builtin_mma_xxsetaccz(&e0##iter);    \
+  }
+
+#define GEMV_LOADPAIR_ROW_COMPLEX_MMA(iter1, iter2) \
+  GEMV_BUILDPAIR_MMA(a##iter1, GEMV_LOADPACKET_ROW_COMPLEX_DATA(iter2), GEMV_LOADPACKET_ROW_COMPLEX_DATA((iter2) + 1));
+
+#define GEMV_WORK_ROW_COMPLEX_MMA(iter, N)                                                                       \
+  if (GEMV_GETN_COMPLEX(N) > iter) {                                                                             \
+    if (GEMV_IS_COMPLEX_FLOAT) {                                                                                 \
+      PLhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX(iter);                                                    \
+      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, PLhsPacket, RhsScalar, RhsPacket, ResPacket,    \
+                            ConjugateLhs, ConjugateRhs, RowMajor>(a##iter, b, &e0##iter);                        \
+    } else {                                                                                                     \
+      __vector_pair a##iter;                                                                                     \
+      GEMV_LOADPAIR_ROW_COMPLEX_MMA(iter, iter << 1)                                                             \
+      gemv_mult_complex_MMA<ScalarPacket, LhsScalar, PLhsPacket, __vector_pair, RhsScalar, RhsPacket, ResPacket, \
+                            ConjugateLhs, ConjugateRhs, RowMajor>(a##iter, b, &e0##iter);                        \
+    }                                                                                                            \
+  }
+
+#define GEMV_PREDUX4_COMPLEX_MMA(iter1, iter2, iter3, N)                                                         \
+  if (N > iter1) {                                                                                               \
+    if (GEMV_IS_COMPLEX_FLOAT) {                                                                                 \
+      cc##iter1 = predux_complex<ResScalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(     \
+          &e0##iter2, &e0##iter3);                                                                               \
+    } else {                                                                                                     \
+      cc##iter1 =                                                                                                \
+          predux_complex<ResScalar, ScalarPacket, LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs>(&e0##iter1); \
+    }                                                                                                            \
+  } else {                                                                                                       \
+    EIGEN_UNUSED_VARIABLE(cc##iter1);                                                                            \
+  }
+
+#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N)  \
+  GEMV_UNROLL_ROW(GEMV_INIT_ROW_COMPLEX_MMA, N) \
+  GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(GEMV_WORK_ROW_COMPLEX_MMA, N)
+
+#define GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N)                  \
+  for (; i < n##N; i += N) {                                 \
+    GEMV_PROCESS_ROW_COMPLEX_SINGLE_MMA(N)                   \
+    GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX_MMA, (N >> 1)) \
+    GEMV_PROCESS_END_ROW_COMPLEX(N);                         \
+  }
+#endif
+
+#define GEMV_WORK_ROW_COMPLEX(iter, N)                                                                     \
+  if (N > iter) {                                                                                          \
+    PLhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX(iter);                                                \
+    gemv_mult_complex<ScalarPacket, PLhsPacket, RhsScalar, RhsPacket, PResPacket, ResPacket, ConjugateLhs, \
+                      ConjugateRhs, RowMajor>(a##iter, b, c0##iter, c1##iter);                             \
+  }
+
+#define GEMV_PREDUX4_COMPLEX(iter1, iter2, iter3, N)                                                          \
+  if (N > iter1) {                                                                                            \
+    cc##iter1 = predux_complex<ResScalar, PResPacket, ResPacket, LhsPacket, RhsPacket>(c0##iter2, c0##iter3,  \
+                                                                                       c1##iter2, c1##iter3); \
+  } else {                                                                                                    \
+    EIGEN_UNUSED_VARIABLE(cc##iter1);                                                                         \
+  }
+
+#define GEMV_MULT_COMPLEX(iter1, iter2, iter3, N)          \
+  if (N > iter1) {                                         \
+    cc##iter1.scalar[0] += cj.pmul(lhs(i + iter2, j), b0); \
+    cc##iter1.scalar[1] += cj.pmul(lhs(i + iter3, j), b0); \
+  }
+
+#define GEMV_STORE_ROW_COMPLEX(iter1, iter2, iter3, N)                                   \
+  if (N > iter1) {                                                                       \
+    storeMaddData<ResScalar>(res + ((i + iter2) * resIncr), alpha, cc##iter1.scalar[0]); \
+    storeMaddData<ResScalar>(res + ((i + iter3) * resIncr), alpha, cc##iter1.scalar[1]); \
+  }
+
+#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \
+  GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX, N)        \
+  GEMV_PROCESS_ROW_COMPLEX_SINGLE_WORK(GEMV_WORK_ROW_COMPLEX, N)
+
+/** \internal main macro for gemv_complex_row - initialize accumulators, multiply and add inputs, predux and store
+ * results */
+#define GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N)              \
+  for (; i < n##N; i += N) {                             \
+    GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N)               \
+    GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX, (N >> 1)) \
+    GEMV_PROCESS_END_ROW_COMPLEX(N);                     \
+  }
+
+#define GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter) \
+  if (GEMV_IS_COMPLEX_COMPLEX) {                  \
+    c0##iter = padd(c0##iter, c1##iter);          \
+  }                                               \
+  dd0 = predux(c0##iter);
+
+#if EIGEN_COMP_LLVM
+#define GEMV_PROCESS_ROW_COMPLEX_SINGLE(N) GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N)
+
+#define GEMV_PROCESS_ROW_COMPLEX_ONE(N) GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N)
+
+#define GEMV_PROCESS_ROW_COMPLEX_PREDUX(iter) GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter)
+#else
+// gcc seems to be reading and writing registers unnecessarily to memory.
+// Use the old way for complex double until it is fixed.
+
+#define GEMV_LOADPACKET_ROW_COMPLEX_OLD(iter) lhs.template load<LhsPacket, LhsAlignment>(i + (iter), j)
+
+#define GEMV_INIT_COMPLEX_OLD(iter, N) \
+  EIGEN_UNUSED_VARIABLE(c0##iter);     \
+  if (N > iter) {                      \
+    c1##iter = pset_zero<ResPacket>(); \
+  } else {                             \
+    EIGEN_UNUSED_VARIABLE(c1##iter);   \
+  }
+
+#define GEMV_WORK_ROW_COMPLEX_OLD(iter, N)                     \
+  if (N > iter) {                                              \
+    LhsPacket a##iter = GEMV_LOADPACKET_ROW_COMPLEX_OLD(iter); \
+    c1##iter = pcj.pmadd(a##iter, b0, c1##iter);               \
+  }
+
+#define GEMV_PREDUX4_COMPLEX_OLD(iter1, iter2, iter3, N) \
+  if (N > iter1) {                                       \
+    cc##iter1.scalar[0] = predux(c1##iter2);             \
+    cc##iter1.scalar[1] = predux(c1##iter3);             \
+  } else {                                               \
+    EIGEN_UNUSED_VARIABLE(cc##iter1);                    \
+  }
+
+#define GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N)                  \
+  GEMV_UNROLL_ROW(GEMV_INIT_COMPLEX_OLD, N)                     \
+  j = 0;                                                        \
+  for (; j + LhsPacketSize <= cols; j += LhsPacketSize) {       \
+    RhsPacket b0 = rhs2.template load<RhsPacket, Unaligned>(j); \
+    GEMV_UNROLL_ROW(GEMV_WORK_ROW_COMPLEX_OLD, N)               \
+  }
+
+#define GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N)                  \
+  for (; i < n##N; i += N) {                                 \
+    GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N)                   \
+    GEMV_UNROLL_ROW_HALF(GEMV_PREDUX4_COMPLEX_OLD, (N >> 1)) \
+    GEMV_PROCESS_END_ROW_COMPLEX(N)                          \
+  }
+
+#define GEMV_PROCESS_ROW_COMPLEX_PREDUX_OLD(iter) dd0 = predux(c1##iter);
+
+#if (__GNUC__ > 10)
+#define GEMV_PROCESS_ROW_COMPLEX_IS_NEW 1
+#else
+#define GEMV_PROCESS_ROW_COMPLEX_IS_NEW (sizeof(Scalar) == sizeof(float)) || GEMV_IS_COMPLEX_COMPLEX
+#endif
+
+#define GEMV_PROCESS_ROW_COMPLEX_SINGLE(N) \
+  if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) {   \
+    GEMV_PROCESS_ROW_COMPLEX_SINGLE_NEW(N) \
+  } else {                                 \
+    GEMV_PROCESS_ROW_COMPLEX_SINGLE_OLD(N) \
+  }
+
+#define GEMV_PROCESS_ROW_COMPLEX_ONE(N)  \
+  if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) { \
+    GEMV_PROCESS_ROW_COMPLEX_ONE_NEW(N)  \
+  } else {                               \
+    GEMV_PROCESS_ROW_COMPLEX_ONE_OLD(N)  \
+  }
+
+#define GEMV_PROCESS_ROW_COMPLEX_PREDUX(iter) \
+  if (GEMV_PROCESS_ROW_COMPLEX_IS_NEW) {      \
+    GEMV_PROCESS_ROW_COMPLEX_PREDUX_NEW(iter) \
+  } else {                                    \
+    GEMV_PROCESS_ROW_COMPLEX_PREDUX_OLD(iter) \
+  }
+#endif
+
+#ifdef USE_GEMV_MMA
+#define GEMV_PROCESS_ROW_COMPLEX(N) GEMV_PROCESS_ROW_COMPLEX_ONE_MMA(N)
+#else
+#define GEMV_PROCESS_ROW_COMPLEX(N) GEMV_PROCESS_ROW_COMPLEX_ONE(N)
+#endif
+
+template <typename Scalar, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, bool LhsIsReal,
+          typename RhsScalar, typename RhsMapper, bool ConjugateRhs, bool RhsIsReal, typename ResScalar>
+EIGEN_STRONG_INLINE void gemv_complex_row(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
+                                          ResScalar* res, Index resIncr, ResScalar alpha) {
+  typedef gemv_traits<LhsScalar, RhsScalar> Traits;
+
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
+
+  typedef typename packet_traits<Scalar>::type ScalarPacket;
+  typedef typename packet_traits<LhsScalar>::type PLhsPacket;
+  typedef typename packet_traits<ResScalar>::type PResPacket;
+  typedef gemv_traits<ResPacket, ResPacket> PTraits;
+
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate proper code.
+  LhsMapper lhs(alhs);
+  typename RhsMapper::LinearMapper rhs2 = rhs.getLinearMapper(0, 0);
+
+  eigen_internal_assert(rhs.stride() == 1);
+  conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
+#if !EIGEN_COMP_LLVM
+  conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
+#endif
+
+  // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
+  //       processing 8 rows at once might be counter productive wrt cache.
+#ifndef GCC_ONE_VECTORPAIR_BUG
+  const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? (rows - 7) : (rows - 7);
+  const Index n4 = rows - 3;
+  const Index n2 = rows - 1;
+#endif
+
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum {
+    LhsAlignment = Unaligned,
+    ResPacketSize = PTraits::ResPacketSize,
+    LhsPacketSize = PTraits::LhsPacketSize,
+    RhsPacketSize = PTraits::RhsPacketSize,
+  };
+
+  Index i = 0, j;
+  PResPacket c00, c01, c02, c03, c04, c05, c06, c07;
+  ResPacket c10, c11, c12, c13, c14, c15, c16, c17;
+#ifdef USE_GEMV_MMA
+  __vector_quad e00, e01, e02, e03, e04, e05, e06, e07;
+  GEMV_UNUSED_ROW(8, e0)
+  GEMV_UNUSED_EXTRA(1, c0)
+  GEMV_UNUSED_EXTRA(1, c1)
+#endif
+  ResScalar dd0;
+#ifndef GCC_ONE_VECTORPAIR_BUG
+  ScalarBlock<ResScalar, 2> cc0, cc1, cc2, cc3;
+#ifdef USE_GEMV_MMA
+  if (!GEMV_IS_COMPLEX_COMPLEX)
+#endif
+  {
+    GEMV_PROCESS_ROW_COMPLEX(8)
+  }
+  GEMV_PROCESS_ROW_COMPLEX(4)
+  GEMV_PROCESS_ROW_COMPLEX(2)
+#endif
+  for (; i < rows; ++i) {
+    GEMV_PROCESS_ROW_COMPLEX_SINGLE(1)
+    GEMV_PROCESS_ROW_COMPLEX_PREDUX(0)
+    for (; j < cols; ++j) {
+      dd0 += cj.pmul(lhs(i, j), rhs2(j));
+    }
+    res[i * resIncr] += alpha * dd0;
+  }
+}
+
+#define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(Scalar, LhsScalar, RhsScalar)                                          \
+  template <typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
+  struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper,      \
+                                       ConjugateRhs, Version> {                                                        \
+    typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;                                 \
+                                                                                                                       \
+    EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,                  \
+                                                        const RhsMapper& rhs, ResScalar* res, Index resIncr,           \
+                                                        ResScalar alpha) {                                             \
+      gemv_complex_col<Scalar, LhsScalar, LhsMapper, ConjugateLhs, sizeof(Scalar) == sizeof(LhsScalar), RhsScalar,     \
+                       RhsMapper, ConjugateRhs, sizeof(Scalar) == sizeof(RhsScalar), ResScalar>(rows, cols, lhs, rhs,  \
+                                                                                                res, resIncr, alpha);  \
+    }                                                                                                                  \
+  };
+
+#define EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(Scalar, LhsScalar, RhsScalar)                                          \
+  template <typename Index, typename LhsMapper, bool ConjugateLhs, typename RhsMapper, bool ConjugateRhs, int Version> \
+  struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper,      \
+                                       ConjugateRhs, Version> {                                                        \
+    typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;                                 \
+                                                                                                                       \
+    EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,                  \
+                                                        const RhsMapper& rhs, ResScalar* res, Index resIncr,           \
+                                                        ResScalar alpha) {                                             \
+      gemv_complex_row<Scalar, LhsScalar, LhsMapper, ConjugateLhs, sizeof(Scalar) == sizeof(LhsScalar), RhsScalar,     \
+                       RhsMapper, ConjugateRhs, sizeof(Scalar) == sizeof(RhsScalar), ResScalar>(rows, cols, lhs, rhs,  \
+                                                                                                res, resIncr, alpha);  \
+    }                                                                                                                  \
+  };
+
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, float, std::complex<float>)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, std::complex<float>, float)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(float, std::complex<float>, std::complex<float>)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, double, std::complex<double>)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, std::complex<double>, double)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_COL(double, std::complex<double>, std::complex<double>)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, float, std::complex<float>)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, std::complex<float>, float)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(float, std::complex<float>, std::complex<float>)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, double, std::complex<double>)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, std::complex<double>, double)
+EIGEN_POWER_GEMV_COMPLEX_SPECIALIZE_ROW(double, std::complex<double>, std::complex<double>)
+
+#endif  // EIGEN_MATRIX_VECTOR_PRODUCT_ALTIVEC_H
diff --git a/inst/include/Eigen/src/Core/arch/AltiVec/PacketMath.h b/inst/include/Eigen/src/Core/arch/AltiVec/PacketMath.h
index e4089962..bb59caeb 100644
--- a/inst/include/Eigen/src/Core/arch/AltiVec/PacketMath.h
+++ b/inst/include/Eigen/src/Core/arch/AltiVec/PacketMath.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Konstantinos Margaritis <markos@codex.gr>
+// Copyright (C) 2008-2016 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,6 +10,9 @@
 #ifndef EIGEN_PACKET_MATH_ALTIVEC_H
 #define EIGEN_PACKET_MATH_ALTIVEC_H
 
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
@@ -18,484 +21,3709 @@ namespace internal {
 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4
 #endif
 
-#ifndef EIGEN_HAS_FUSE_CJMADD
-#define EIGEN_HAS_FUSE_CJMADD 1
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
 #endif
 
 // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
 #endif
 
-typedef __vector float          Packet4f;
-typedef __vector int            Packet4i;
-typedef __vector unsigned int   Packet4ui;
-typedef __vector __bool int     Packet4bi;
-typedef __vector short int      Packet8i;
-typedef __vector unsigned char  Packet16uc;
+typedef __vector float Packet4f;
+typedef __vector int Packet4i;
+typedef __vector unsigned int Packet4ui;
+typedef __vector __bool int Packet4bi;
+typedef __vector short int Packet8s;
+typedef __vector unsigned short int Packet8us;
+typedef __vector __bool short Packet8bi;
+typedef __vector signed char Packet16c;
+typedef __vector unsigned char Packet16uc;
+typedef eigen_packet_wrapper<__vector unsigned short int, 0> Packet8bf;
 
 // We don't want to write the same code all the time, but we need to reuse the constants
 // and it doesn't really work to declare them global, so we define macros instead
+#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = {X, X, X, X}
+
+#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, X) Packet4i p4i_##NAME = vec_splat_s32(X)
+
+#define EIGEN_DECLARE_CONST_FAST_Packet4ui(NAME, X) Packet4ui p4ui_##NAME = {X, X, X, X}
+
+#define EIGEN_DECLARE_CONST_FAST_Packet8us(NAME, X) Packet8us p8us_##NAME = {X, X, X, X, X, X, X, X}
 
-#define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \
-  Packet4f p4f_##NAME = (Packet4f) vec_splat_s32(X)
+#define EIGEN_DECLARE_CONST_FAST_Packet16uc(NAME, X) \
+  Packet16uc p16uc_##NAME = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}
 
-#define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \
-  Packet4i p4i_##NAME = vec_splat_s32(X)
+#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) Packet4f p4f_##NAME = pset1<Packet4f>(X)
 
-#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
-  Packet4f p4f_##NAME = pset1<Packet4f>(X)
+#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) Packet4i p4i_##NAME = pset1<Packet4i>(X)
 
-#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
-  Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
+#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) Packet2d p2d_##NAME = pset1<Packet2d>(X)
 
-#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
-  Packet4i p4i_##NAME = pset1<Packet4i>(X)
+#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) Packet2l p2l_##NAME = pset1<Packet2l>(X)
+
+#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
+  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
 
 #define DST_CHAN 1
 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride))
+#define __UNPACK_TYPE__(PACKETNAME) typename unpacket_traits<PACKETNAME>::type
+
+// These constants are endian-agnostic
+static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);       //{ 0.0, 0.0, 0.0, 0.0}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);       //{ 0, 0, 0, 0,}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1);        //{ 1, 1, 1, 1}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16, -16);  //{ -16, -16, -16, -16}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1);    //{ -1, -1, -1, -1}
+static EIGEN_DECLARE_CONST_FAST_Packet4ui(SIGN, 0x80000000u);
+static EIGEN_DECLARE_CONST_FAST_Packet4ui(PREV0DOT5, 0x3EFFFFFFu);
+static EIGEN_DECLARE_CONST_FAST_Packet8us(ONE, 1);  //{ 1, 1, 1, 1, 1, 1, 1, 1}
+static Packet4f p4f_MZERO =
+    (Packet4f)vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);  //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000}
+#ifndef __VSX__
+static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);  //{ 1.0, 1.0, 1.0, 1.0}
+#endif
+
+static Packet4f p4f_COUNTDOWN = {0.0, 1.0, 2.0, 3.0};
+static Packet4i p4i_COUNTDOWN = {0, 1, 2, 3};
+static Packet8s p8s_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
+static Packet8us p8us_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7};
+
+static Packet16c p16c_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+static Packet16uc p16uc_COUNTDOWN = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
 
+static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
+static Packet16uc p16uc_REVERSE16 = {14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1};
+static Packet16uc p16uc_REVERSE8 = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+
+#ifdef _BIG_ENDIAN
+static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};
+#endif
+static const Packet16uc p16uc_DUPLICATE16_EVEN = {0, 1, 0, 1, 4, 5, 4, 5, 8, 9, 8, 9, 12, 13, 12, 13};
+static const Packet16uc p16uc_DUPLICATE16_ODD = {2, 3, 2, 3, 6, 7, 6, 7, 10, 11, 10, 11, 14, 15, 14, 15};
+
+static Packet16uc p16uc_QUADRUPLICATE16_HI = {0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 2, 3, 2, 3, 2, 3};
+static Packet16uc p16uc_QUADRUPLICATE16 = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3};
+
+static Packet16uc p16uc_MERGEE16 = {0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29};
+static Packet16uc p16uc_MERGEO16 = {2, 3, 18, 19, 6, 7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31};
+#ifdef _BIG_ENDIAN
+static Packet16uc p16uc_MERGEH16 = {0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29};
+#else
+static Packet16uc p16uc_MERGEL16 = {2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31};
+#endif
+
+// Handle endianness properly while loading constants
 // Define global static constants:
-static Packet4f p4f_COUNTDOWN = { 3.0, 2.0, 1.0, 0.0 };
-static Packet4i p4i_COUNTDOWN = { 3, 2, 1, 0 };
-static Packet16uc p16uc_REVERSE = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3};
+#ifdef _BIG_ENDIAN
 static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0);
-static Packet16uc p16uc_DUPLICATE = {0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7};
-
-static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16);
-static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1);
-static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0);
-static Packet4f p4f_ZERO_ = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1);
-
-template<> struct packet_traits<float>  : default_packet_traits
-{
+static Packet16uc p16uc_PSET32_WODD =
+    vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
+            8);  //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
+                                               8);  //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 3),
+                                              8);  //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
+#else
+static Packet16uc p16uc_FORWARD = p16uc_REVERSE32;
+static Packet16uc p16uc_PSET32_WODD =
+    vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
+            8);  //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc p16uc_PSET32_WEVEN =
+    vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
+            8);  //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc)vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO,
+                                              8);  //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
+#endif  // _BIG_ENDIAN
+
+static Packet16uc p16uc_PSET64_HI = (Packet16uc)vec_mergeh(
+    (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);  //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };
+static Packet16uc p16uc_PSET64_LO = (Packet16uc)vec_mergel(
+    (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);  //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+static Packet16uc p16uc_TRANSPOSE64_HI =
+    p16uc_PSET64_HI + p16uc_HALF64_0_16;  //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23};
+static Packet16uc p16uc_TRANSPOSE64_LO =
+    p16uc_PSET64_LO + p16uc_HALF64_0_16;  //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};
+
+static Packet16uc p16uc_COMPLEX32_REV =
+    vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);  //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
+
+#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+#define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#else
+#define EIGEN_PPC_PREFETCH(ADDR) asm("   dcbt [%[addr]]\n" ::[addr] "r"(ADDR) : "cc");
+#endif
+
+#if EIGEN_COMP_LLVM
+#define LOAD_STORE_UNROLL_16 _Pragma("unroll 16")
+#else
+#define LOAD_STORE_UNROLL_16 _Pragma("GCC unroll(16)")
+#endif
+
+template <>
+struct packet_traits<float> : default_packet_traits {
   typedef Packet4f type;
+  typedef Packet4f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4,
-
-    // FIXME check the Has*
-    HasSin  = 0,
-    HasCos  = 0,
-    HasLog  = 0,
-    HasExp  = 0,
-    HasSqrt = 0
+    size = 4,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasAbs = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasACos = 1,
+    HasASin = 1,
+    HasATan = 1,
+    HasATanh = 1,
+    HasLog = 1,
+    HasExp = 1,
+#ifdef EIGEN_VECTORIZE_VSX
+    HasCmp = 1,
+    HasPow = 1,
+    HasSqrt = 1,
+    HasCbrt = 1,
+#if !EIGEN_COMP_CLANG
+    HasRsqrt = 1,
+#else
+    HasRsqrt = 0,
+#endif
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
+#else
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasTanh = 0,
+    HasErf = 0,
+#endif
+    HasNegate = 1,
+    HasBlend = 1
+  };
+};
+template <>
+struct packet_traits<bfloat16> : default_packet_traits {
+  typedef Packet8bf type;
+  typedef Packet8bf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasAbs = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+#ifdef EIGEN_VECTORIZE_VSX
+    HasSqrt = 1,
+#if !EIGEN_COMP_CLANG
+    HasRsqrt = 1,
+#else
+    HasRsqrt = 0,
+#endif
+#else
+    HasSqrt = 0,
+    HasRsqrt = 0,
+#endif
+    HasTanh = 0,
+    HasErf = 0,
+    HasNegate = 1,
+    HasBlend = 1
   };
 };
-template<> struct packet_traits<int>    : default_packet_traits
-{
+
+template <>
+struct packet_traits<int> : default_packet_traits {
   typedef Packet4i type;
+  typedef Packet4i half;
   enum {
-    // FIXME check the Has*
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4
+    size = 4,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
+    HasDiv = 1,
+#else
+    HasDiv = 0,
+#endif
+    HasBlend = 1,
+    HasCmp = 1
   };
 };
 
-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
-/*
-inline std::ostream & operator <<(std::ostream & s, const Packet4f & v)
-{
-  union {
-    Packet4f   v;
-    float n[4];
-  } vt;
-  vt.v = v;
-  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
-  return s;
+template <>
+struct packet_traits<short int> : default_packet_traits {
+  typedef Packet8s type;
+  typedef Packet8s half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 0,
+    HasBlend = 1,
+    HasCmp = 1
+  };
+};
+
+template <>
+struct packet_traits<unsigned short int> : default_packet_traits {
+  typedef Packet8us type;
+  typedef Packet8us half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 0,
+    HasBlend = 1,
+    HasCmp = 1
+  };
+};
+
+template <>
+struct packet_traits<signed char> : default_packet_traits {
+  typedef Packet16c type;
+  typedef Packet16c half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 0,
+    HasBlend = 1,
+    HasCmp = 1
+  };
+};
+
+template <>
+struct packet_traits<unsigned char> : default_packet_traits {
+  typedef Packet16uc type;
+  typedef Packet16uc half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 0,
+    HasBlend = 1,
+    HasCmp = 1
+  };
+};
+
+template <>
+struct unpacket_traits<Packet4f> {
+  typedef float type;
+  typedef Packet4f half;
+  typedef Packet4i integer_packet;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet4i> {
+  typedef int type;
+  typedef Packet4i half;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet8s> {
+  typedef short int type;
+  typedef Packet8s half;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet8us> {
+  typedef unsigned short int type;
+  typedef Packet8us half;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet16c> {
+  typedef signed char type;
+  typedef Packet16c half;
+  enum {
+    size = 16,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet16uc> {
+  typedef unsigned char type;
+  typedef Packet16uc half;
+  enum {
+    size = 16,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet8bf> {
+  typedef bfloat16 type;
+  typedef Packet8bf half;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pload_common(const __UNPACK_TYPE__(Packet) * from) {
+  // some versions of GCC throw "unused-but-set-parameter".
+  // ignoring these warnings for now.
+  EIGEN_UNUSED_VARIABLE(from);
+  EIGEN_DEBUG_ALIGNED_LOAD
+#ifdef EIGEN_VECTORIZE_VSX
+  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
+#else
+  return vec_ld(0, from);
+#endif
 }
 
-inline std::ostream & operator <<(std::ostream & s, const Packet4i & v)
-{
-  union {
-    Packet4i   v;
-    int n[4];
-  } vt;
-  vt.v = v;
-  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
-  return s;
+// Need to define them first or we get specialization after instantiation errors
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
+  return pload_common<Packet4f>(from);
 }
 
-inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v)
-{
-  union {
-    Packet4ui   v;
-    unsigned int n[4];
-  } vt;
-  vt.v = v;
-  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
-  return s;
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
+  return pload_common<Packet4i>(from);
 }
 
-inline std::ostream & operator <<(std::ostream & s, const Packetbi & v)
-{
-  union {
-    Packet4bi v;
-    unsigned int n[4];
-  } vt;
-  vt.v = v;
-  s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3];
-  return s;
+template <>
+EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const short int* from) {
+  return pload_common<Packet8s>(from);
 }
-*/
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) {
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  float EIGEN_ALIGN16 af[4];
-  af[0] = from;
-  Packet4f vc = vec_ld(0, af);
-  vc = vec_splat(vc, 0);
-  return vc;
+
+template <>
+EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const unsigned short int* from) {
+  return pload_common<Packet8us>(from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   {
-  int EIGEN_ALIGN16 ai[4];
-  ai[0] = from;
-  Packet4i vc = vec_ld(0, ai);
-  vc = vec_splat(vc, 0);
-  return vc;
+template <>
+EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const signed char* from) {
+  return pload_common<Packet16c>(from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return vec_add(pset1<Packet4f>(a), p4f_COUNTDOWN); }
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)     { return vec_add(pset1<Packet4i>(a), p4i_COUNTDOWN); }
+template <>
+EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const unsigned char* from) {
+  return pload_common<Packet16uc>(from);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_add(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_add(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet8bf pload<Packet8bf>(const bfloat16* from) {
+  return pload_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_sub(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_sub(a,b); }
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet pload_ignore(const __UNPACK_TYPE__(Packet) * from) {
+  // some versions of GCC throw "unused-but-set-parameter".
+  // ignoring these warnings for now.
+  EIGEN_UNUSED_VARIABLE(from);
+  EIGEN_DEBUG_ALIGNED_LOAD
+  // Ignore partial input memory initialized
+#if !EIGEN_COMP_LLVM
+#pragma GCC diagnostic push
+//#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+#ifdef EIGEN_VECTORIZE_VSX
+  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
+#else
+  return vec_ld(0, from);
+#endif
+#if !EIGEN_COMP_LLVM
+#pragma GCC diagnostic pop
+#endif
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return psub<Packet4f>(p4f_ZERO, a); }
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return psub<Packet4i>(p4i_ZERO, a); }
+template <>
+EIGEN_ALWAYS_INLINE Packet8bf pload_ignore<Packet8bf>(const bfloat16* from) {
+  return pload_ignore<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet pload_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
+                                                const Index offset) {
+  // some versions of GCC throw "unused-but-set-parameter".
+  // ignoring these warnings for now.
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
+  const Index size = sizeof(__UNPACK_TYPE__(Packet));
+#ifdef _ARCH_PWR9
+  EIGEN_UNUSED_VARIABLE(packet_size);
+  EIGEN_DEBUG_ALIGNED_LOAD
+  EIGEN_UNUSED_VARIABLE(from);
+  Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
+  if (offset) {
+    Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
+#ifdef _BIG_ENDIAN
+    load = Packet(vec_sro(Packet16uc(load), shift));
+#else
+    load = Packet(vec_slo(Packet16uc(load), shift));
+#endif
+  }
+  return load;
+#else
+  if (n) {
+    EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
+    unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
+    unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
+    Index n2 = n * size;
+    if (16 <= n2) {
+      pstoreu(load2, ploadu<Packet16uc>(from2));
+    } else {
+      memcpy((void*)load2, (void*)from2, n2);
+    }
+    return pload_ignore<Packet>(load);
+  } else {
+    return Packet(pset1<Packet16uc>(0));
+  }
+#endif
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b,p4f_ZERO); }
-/* Commented out: it's actually slower than processing it scalar
- *
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
-  // Detailed in: http://freevec.org/content/32bit_signed_integer_multiplication_altivec
-  //Set up constants, variables
-  Packet4i a1, b1, bswap, low_prod, high_prod, prod, prod_, v1sel;
+template <>
+EIGEN_ALWAYS_INLINE Packet4f pload_partial<Packet4f>(const float* from, const Index n, const Index offset) {
+  return pload_partial_common<Packet4f>(from, n, offset);
+}
 
-  // Get the absolute values
-  a1  = vec_abs(a);
-  b1  = vec_abs(b);
+template <>
+EIGEN_ALWAYS_INLINE Packet4i pload_partial<Packet4i>(const int* from, const Index n, const Index offset) {
+  return pload_partial_common<Packet4i>(from, n, offset);
+}
 
-  // Get the signs using xor
-  Packet4bi sgn = (Packet4bi) vec_cmplt(vec_xor(a, b), p4i_ZERO);
+template <>
+EIGEN_ALWAYS_INLINE Packet8s pload_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
+  return pload_partial_common<Packet8s>(from, n, offset);
+}
 
-  // Do the multiplication for the asbolute values.
-  bswap = (Packet4i) vec_rl((Packet4ui) b1, (Packet4ui) p4i_MINUS16 );
-  low_prod = vec_mulo((Packet8i) a1, (Packet8i)b1);
-  high_prod = vec_msum((Packet8i) a1, (Packet8i) bswap, p4i_ZERO);
-  high_prod = (Packet4i) vec_sl((Packet4ui) high_prod, (Packet4ui) p4i_MINUS16);
-  prod = vec_add( low_prod, high_prod );
+template <>
+EIGEN_ALWAYS_INLINE Packet8us pload_partial<Packet8us>(const unsigned short int* from, const Index n,
+                                                       const Index offset) {
+  return pload_partial_common<Packet8us>(from, n, offset);
+}
 
-  // NOR the product and select only the negative elements according to the sign mask
-  prod_ = vec_nor(prod, prod);
-  prod_ = vec_sel(p4i_ZERO, prod_, sgn);
+template <>
+EIGEN_ALWAYS_INLINE Packet8bf pload_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
+  return pload_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
+}
 
-  // Add 1 to the result to get the negative numbers
-  v1sel = vec_sel(p4i_ZERO, p4i_ONE, sgn);
-  prod_ = vec_add(prod_, v1sel);
+template <>
+EIGEN_ALWAYS_INLINE Packet16c pload_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
+  return pload_partial_common<Packet16c>(from, n, offset);
+}
 
-  // Merge the results back to the final vector.
-  prod = vec_sel(prod, prod_, sgn);
+template <>
+EIGEN_ALWAYS_INLINE Packet16uc pload_partial<Packet16uc>(const unsigned char* from, const Index n, const Index offset) {
+  return pload_partial_common<Packet16uc>(from, n, offset);
+}
 
-  return prod;
+template <typename Packet>
+EIGEN_STRONG_INLINE void pstore_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
+  // some versions of GCC throw "unused-but-set-parameter" (float *to).
+  // ignoring these warnings for now.
+  EIGEN_UNUSED_VARIABLE(to);
+  EIGEN_DEBUG_ALIGNED_STORE
+#ifdef EIGEN_VECTORIZE_VSX
+  vec_xst(from, 0, to);
+#else
+  vec_st(from, 0, to);
+#endif
 }
-*/
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  Packet4f t, y_0, y_1, res;
 
-  // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
-  y_0 = vec_re(b);
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
+  pstore_common<Packet4f>(to, from);
+}
 
-  // Do one Newton-Raphson iteration to get the needed accuracy
-  t   = vec_nmsub(y_0, b, p4f_ONE);
-  y_1 = vec_madd(y_0, t, y_0);
+template <>
+EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
+  pstore_common<Packet4i>(to, from);
+}
 
-  res = vec_madd(a, y_1, p4f_ZERO);
-  return res;
+template <>
+EIGEN_STRONG_INLINE void pstore<short int>(short int* to, const Packet8s& from) {
+  pstore_common<Packet8s>(to, from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
-{ eigen_assert(false && "packet integer division are not supported by AltiVec");
-  return pset1<Packet4i>(0);
+template <>
+EIGEN_STRONG_INLINE void pstore<unsigned short int>(unsigned short int* to, const Packet8us& from) {
+  pstore_common<Packet8us>(to, from);
 }
 
-// for some weird raisons, it has to be overloaded for packet of integers
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a, b, c); }
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
+template <>
+EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet8bf& from) {
+  pstore_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); }
+template <>
+EIGEN_STRONG_INLINE void pstore<signed char>(signed char* to, const Packet16c& from) {
+  pstore_common<Packet16c>(to, from);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); }
+template <>
+EIGEN_STRONG_INLINE void pstore<unsigned char>(unsigned char* to, const Packet16uc& from) {
+  pstore_common<Packet16uc>(to, from);
+}
 
-// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); }
+template <typename Packet>
+EIGEN_ALWAYS_INLINE void pstore_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
+                                               const Index offset) {
+  // some versions of GCC throw "unused-but-set-parameter" (float *to).
+  // ignoring these warnings for now.
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
+  const Index size = sizeof(__UNPACK_TYPE__(Packet));
+#ifdef _ARCH_PWR9
+  EIGEN_UNUSED_VARIABLE(packet_size);
+  EIGEN_UNUSED_VARIABLE(to);
+  EIGEN_DEBUG_ALIGNED_STORE
+  Packet store = from;
+  if (offset) {
+    Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
+#ifdef _BIG_ENDIAN
+    store = Packet(vec_slo(Packet16uc(store), shift));
+#else
+    store = Packet(vec_sro(Packet16uc(store), shift));
+#endif
+  }
+  vec_xst_len(store, to, n * size);
+#else
+  if (n) {
+    EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
+    pstore(store, from);
+    unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
+    unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
+    Index n2 = n * size;
+    if (16 <= n2) {
+      pstore(to2, ploadu<Packet16uc>(store2));
+    } else {
+      memcpy((void*)to2, (void*)store2, n2);
+    }
+  }
+#endif
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); }
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
+  pstore_partial_common<Packet4f>(to, from, n, offset);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); }
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); }
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
+  pstore_partial_common<Packet4i>(to, from, n, offset);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); }
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<short int>(short int* to, const Packet8s& from, const Index n,
+                                                   const Index offset) {
+  pstore_partial_common<Packet8s>(to, from, n, offset);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return vec_ld(0, from); }
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
+                                                            const Index n, const Index offset) {
+  pstore_partial_common<Packet8us>(to, from, n, offset);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
-{
-  EIGEN_DEBUG_ALIGNED_LOAD
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  Packet16uc MSQ, LSQ;
-  Packet16uc mask;
-  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
-  mask = vec_lvsl(0, from);                        // create the permute mask
-  return (Packet4f) vec_perm(MSQ, LSQ, mask);           // align the data
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
+                                                  const Index offset) {
+  pstore_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val, n, offset);
+}
 
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
+                                                     const Index offset) {
+  pstore_partial_common<Packet16c>(to, from, n, offset);
 }
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
-{
-  EIGEN_DEBUG_ALIGNED_LOAD
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  Packet16uc MSQ, LSQ;
-  Packet16uc mask;
-  MSQ = vec_ld(0, (unsigned char *)from);          // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)from);         // least significant quadword
-  mask = vec_lvsl(0, from);                        // create the permute mask
-  return (Packet4i) vec_perm(MSQ, LSQ, mask);    // align the data
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
-{
-  Packet4f p;
-  if((ptrdiff_t(&from) % 16) == 0)  p = pload<Packet4f>(from);
-  else                              p = ploadu<Packet4f>(from);
-  return vec_perm(p, p, p16uc_DUPLICATE);
-}
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
-{
-  Packet4i p;
-  if((ptrdiff_t(&from) % 16) == 0)  p = pload<Packet4i>(from);
-  else                              p = ploadu<Packet4i>(from);
-  return vec_perm(p, p, p16uc_DUPLICATE);
-}
-
-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vec_st(from, 0, to); }
-
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from)
-{
-  EIGEN_DEBUG_UNALIGNED_STORE
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  // Warning: not thread safe!
-  Packet16uc MSQ, LSQ, edges;
-  Packet16uc edgeAlign, align;
 
-  MSQ = vec_ld(0, (unsigned char *)to);                     // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)to);                    // least significant quadword
-  edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
-  edges=vec_perm(LSQ,MSQ,edgeAlign);                        // extract the edges
-  align = vec_lvsr( 0, to );                                // permute map to misalign data
-  MSQ = vec_perm(edges,(Packet16uc)from,align);             // misalign the data (MSQ)
-  LSQ = vec_perm((Packet16uc)from,edges,align);             // misalign the data (LSQ)
-  vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
-  vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
-}
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from)
-{
-  EIGEN_DEBUG_UNALIGNED_STORE
-  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
-  // Warning: not thread safe!
-  Packet16uc MSQ, LSQ, edges;
-  Packet16uc edgeAlign, align;
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
+                                                       const Index offset) {
+  pstore_partial_common<Packet16uc>(to, from, n, offset);
+}
 
-  MSQ = vec_ld(0, (unsigned char *)to);                     // most significant quadword
-  LSQ = vec_ld(15, (unsigned char *)to);                    // least significant quadword
-  edgeAlign = vec_lvsl(0, to);                              // permute map to extract edges
-  edges=vec_perm(LSQ, MSQ, edgeAlign);                      // extract the edges
-  align = vec_lvsr( 0, to );                                // permute map to misalign data
-  MSQ = vec_perm(edges, (Packet16uc) from, align);          // misalign the data (MSQ)
-  LSQ = vec_perm((Packet16uc) from, edges, align);          // misalign the data (LSQ)
-  vec_st( LSQ, 15, (unsigned char *)to );                   // Store the LSQ part first
-  vec_st( MSQ, 0, (unsigned char *)to );                    // Store the MSQ part
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size4(const __UNPACK_TYPE__(Packet) & from) {
+  Packet v = {from, from, from, from};
+  return v;
 }
 
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { vec_dstt(addr, DST_CTRL(2,2,32), DST_CHAN); }
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size8(const __UNPACK_TYPE__(Packet) & from) {
+  Packet v = {from, from, from, from, from, from, from, from};
+  return v;
+}
 
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x[4]; vec_st(a, 0, x); return x[0]; }
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pset1_size16(const __UNPACK_TYPE__(Packet) & from) {
+  Packet v = {from, from, from, from, from, from, from, from, from, from, from, from, from, from, from, from};
+  return v;
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { return (Packet4f)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) { return (Packet4i)vec_perm((Packet16uc)a,(Packet16uc)a, p16uc_REVERSE); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
+  return pset1_size4<Packet4f>(from);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); }
-template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); }
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
+  return pset1_size4<Packet4i>(from);
+}
 
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
-  Packet4f b, sum;
-  b   = (Packet4f) vec_sld(a, a, 8);
-  sum = vec_add(a, b);
-  b   = (Packet4f) vec_sld(sum, sum, 4);
-  sum = vec_add(sum, b);
-  return pfirst(sum);
+template <>
+EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const short int& from) {
+  return pset1_size8<Packet8s>(from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
-  Packet4f v[4], sum[4];
-
-  // It's easier and faster to transpose then add as columns
-  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
-  // Do the transpose, first set of moves
-  v[0] = vec_mergeh(vecs[0], vecs[2]);
-  v[1] = vec_mergel(vecs[0], vecs[2]);
-  v[2] = vec_mergeh(vecs[1], vecs[3]);
-  v[3] = vec_mergel(vecs[1], vecs[3]);
-  // Get the resulting vectors
-  sum[0] = vec_mergeh(v[0], v[2]);
-  sum[1] = vec_mergel(v[0], v[2]);
-  sum[2] = vec_mergeh(v[1], v[3]);
-  sum[3] = vec_mergel(v[1], v[3]);
-
-  // Now do the summation:
-  // Lines 0+1
-  sum[0] = vec_add(sum[0], sum[1]);
-  // Lines 2+3
-  sum[1] = vec_add(sum[2], sum[3]);
-  // Add the results
-  sum[0] = vec_add(sum[0], sum[1]);
-
-  return sum[0];
-}
-
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
-{
-  Packet4i sum;
-  sum = vec_sums(a, p4i_ZERO);
-  sum = vec_sld(sum, p4i_ZERO, 12);
-  return pfirst(sum);
+template <>
+EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const unsigned short int& from) {
+  return pset1_size8<Packet8us>(from);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
-{
-  Packet4i v[4], sum[4];
+template <>
+EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const signed char& from) {
+  return pset1_size16<Packet16c>(from);
+}
 
-  // It's easier and faster to transpose then add as columns
-  // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation
-  // Do the transpose, first set of moves
-  v[0] = vec_mergeh(vecs[0], vecs[2]);
-  v[1] = vec_mergel(vecs[0], vecs[2]);
-  v[2] = vec_mergeh(vecs[1], vecs[3]);
-  v[3] = vec_mergel(vecs[1], vecs[3]);
-  // Get the resulting vectors
-  sum[0] = vec_mergeh(v[0], v[2]);
-  sum[1] = vec_mergel(v[0], v[2]);
-  sum[2] = vec_mergeh(v[1], v[3]);
-  sum[3] = vec_mergel(v[1], v[3]);
+template <>
+EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const unsigned char& from) {
+  return pset1_size16<Packet16uc>(from);
+}
 
-  // Now do the summation:
-  // Lines 0+1
-  sum[0] = vec_add(sum[0], sum[1]);
-  // Lines 2+3
-  sum[1] = vec_add(sum[2], sum[3]);
-  // Add the results
-  sum[0] = vec_add(sum[0], sum[1]);
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
+  return reinterpret_cast<Packet4f>(pset1<Packet4i>(from));
+}
 
-  return sum[0];
+template <>
+EIGEN_STRONG_INLINE Packet8bf pset1<Packet8bf>(const bfloat16& from) {
+  return pset1_size8<Packet8us>(reinterpret_cast<const unsigned short int&>(from));
 }
 
-// Other reduction functions:
-// mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
-  Packet4f prod;
-  prod = pmul(a, (Packet4f)vec_sld(a, a, 8));
-  return pfirst(pmul(prod, (Packet4f)vec_sld(prod, prod, 4)));
+template <typename Packet>
+EIGEN_STRONG_INLINE void pbroadcast4_common(const __UNPACK_TYPE__(Packet) * a, Packet& a0, Packet& a1, Packet& a2,
+                                            Packet& a3) {
+  a3 = pload<Packet>(a);
+  a0 = vec_splat(a3, 0);
+  a1 = vec_splat(a3, 1);
+  a2 = vec_splat(a3, 2);
+  a3 = vec_splat(a3, 3);
 }
 
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
-{
-  EIGEN_ALIGN16 int aux[4];
-  pstore(aux, a);
-  return aux[0] * aux[1] * aux[2] * aux[3];
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
+  pbroadcast4_common<Packet4f>(a, a0, a1, a2, a3);
+}
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4i>(const int* a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) {
+  pbroadcast4_common<Packet4i>(a, a0, a1, a2, a3);
 }
 
-// min
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
-  Packet4f b, res;
-  b = vec_min(a, vec_sld(a, a, 8));
-  res = vec_min(b, vec_sld(b, b, 4));
-  return pfirst(res);
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet pgather_common(const __UNPACK_TYPE__(Packet) * from, Index stride,
+                                                            const Index n = unpacket_traits<Packet>::size) {
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
+  eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will gather past end of packet");
+  if (stride == 1) {
+    if (n == unpacket_traits<Packet>::size) {
+      return ploadu<Packet>(from);
+    } else {
+      return ploadu_partial<Packet>(from, n);
+    }
+  } else {
+    LOAD_STORE_UNROLL_16
+    for (Index i = 0; i < n; i++) {
+      a[i] = from[i * stride];
+    }
+    // Leave rest of the array uninitialized
+    return pload_ignore<Packet>(a);
+  }
 }
 
-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
-{
-  Packet4i b, res;
-  b = vec_min(a, vec_sld(a, a, 8));
-  res = vec_min(b, vec_sld(b, b, 4));
-  return pfirst(res);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
+  return pgather_common<Packet4f>(from, stride);
 }
 
-// max
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
-  Packet4f b, res;
-  b = vec_max(a, vec_sld(a, a, 8));
-  res = vec_max(b, vec_sld(b, b, 4));
-  return pfirst(res);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
+  return pgather_common<Packet4i>(from, stride);
 }
 
-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
-{
-  Packet4i b, res;
-  b = vec_max(a, vec_sld(a, a, 8));
-  res = vec_max(b, vec_sld(b, b, 4));
-  return pfirst(res);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather<short int, Packet8s>(const short int* from, Index stride) {
+  return pgather_common<Packet8s>(from, stride);
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet4f>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
-  {
-    if (Offset!=0)
-      first = vec_sld(first, second, Offset*4);
-  }
-};
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us pgather<unsigned short int, Packet8us>(const unsigned short int* from,
+                                                                                       Index stride) {
+  return pgather_common<Packet8us>(from, stride);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather<bfloat16, Packet8bf>(const bfloat16* from, Index stride) {
+  return pgather_common<Packet8bf>(from, stride);
+}
 
-template<int Offset>
-struct palign_impl<Offset,Packet4i>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
-  {
-    if (Offset!=0)
-      first = vec_sld(first, second, Offset*4);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather<signed char, Packet16c>(const signed char* from, Index stride) {
+  return pgather_common<Packet16c>(from, stride);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather<unsigned char, Packet16uc>(const unsigned char* from,
+                                                                                    Index stride) {
+  return pgather_common<Packet16uc>(from, stride);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4f pgather_partial<float, Packet4f>(const float* from, Index stride,
+                                                                                const Index n) {
+  return pgather_common<Packet4f>(from, stride, n);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4i pgather_partial<int, Packet4i>(const int* from, Index stride,
+                                                                              const Index n) {
+  return pgather_common<Packet4i>(from, stride, n);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8s pgather_partial<short int, Packet8s>(const short int* from, Index stride,
+                                                                                    const Index n) {
+  return pgather_common<Packet8s>(from, stride, n);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8us
+pgather_partial<unsigned short int, Packet8us>(const unsigned short int* from, Index stride, const Index n) {
+  return pgather_common<Packet8us>(from, stride, n);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet8bf pgather_partial<bfloat16, Packet8bf>(const bfloat16* from, Index stride,
+                                                                                     const Index n) {
+  return pgather_common<Packet8bf>(from, stride, n);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16c pgather_partial<signed char, Packet16c>(const signed char* from,
+                                                                                        Index stride, const Index n) {
+  return pgather_common<Packet16c>(from, stride, n);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet16uc pgather_partial<unsigned char, Packet16uc>(const unsigned char* from,
+                                                                                            Index stride,
+                                                                                            const Index n) {
+  return pgather_common<Packet16uc>(from, stride, n);
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_common(__UNPACK_TYPE__(Packet) * to, const Packet& from,
+                                                           Index stride,
+                                                           const Index n = unpacket_traits<Packet>::size) {
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) a[unpacket_traits<Packet>::size];
+  eigen_internal_assert(n <= unpacket_traits<Packet>::size && "number of elements will scatter past end of packet");
+  if (stride == 1) {
+    if (n == unpacket_traits<Packet>::size) {
+      return pstoreu(to, from);
+    } else {
+      return pstoreu_partial(to, from, n);
+    }
+  } else {
+    pstore<__UNPACK_TYPE__(Packet)>(a, from);
+    LOAD_STORE_UNROLL_16
+    for (Index i = 0; i < n; i++) {
+      to[i * stride] = a[i];
+    }
   }
-};
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
+  pscatter_common<Packet4f>(to, from, stride);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
+  pscatter_common<Packet4i>(to, from, stride);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<short int, Packet8s>(short int* to, const Packet8s& from,
+                                                                         Index stride) {
+  pscatter_common<Packet8s>(to, from, stride);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned short int, Packet8us>(unsigned short int* to,
+                                                                                   const Packet8us& from,
+                                                                                   Index stride) {
+  pscatter_common<Packet8us>(to, from, stride);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
+                                                                         Index stride) {
+  pscatter_common<Packet8bf>(to, from, stride);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<signed char, Packet16c>(signed char* to, const Packet16c& from,
+                                                                            Index stride) {
+  pscatter_common<Packet16c>(to, from, stride);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<unsigned char, Packet16uc>(unsigned char* to,
+                                                                               const Packet16uc& from, Index stride) {
+  pscatter_common<Packet16uc>(to, from, stride);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<float, Packet4f>(float* to, const Packet4f& from,
+                                                                             Index stride, const Index n) {
+  pscatter_common<Packet4f>(to, from, stride, n);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<int, Packet4i>(int* to, const Packet4i& from, Index stride,
+                                                                           const Index n) {
+  pscatter_common<Packet4i>(to, from, stride, n);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<short int, Packet8s>(short int* to, const Packet8s& from,
+                                                                                 Index stride, const Index n) {
+  pscatter_common<Packet8s>(to, from, stride, n);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned short int, Packet8us>(unsigned short int* to,
+                                                                                           const Packet8us& from,
+                                                                                           Index stride,
+                                                                                           const Index n) {
+  pscatter_common<Packet8us>(to, from, stride, n);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<bfloat16, Packet8bf>(bfloat16* to, const Packet8bf& from,
+                                                                                 Index stride, const Index n) {
+  pscatter_common<Packet8bf>(to, from, stride, n);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<signed char, Packet16c>(signed char* to,
+                                                                                    const Packet16c& from, Index stride,
+                                                                                    const Index n) {
+  pscatter_common<Packet16c>(to, from, stride, n);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<unsigned char, Packet16uc>(unsigned char* to,
+                                                                                       const Packet16uc& from,
+                                                                                       Index stride, const Index n) {
+  pscatter_common<Packet16uc>(to, from, stride, n);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+  return pset1<Packet4f>(a) + p4f_COUNTDOWN;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
+  return pset1<Packet4i>(a) + p4i_COUNTDOWN;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const short int& a) {
+  return pset1<Packet8s>(a) + p8s_COUNTDOWN;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const unsigned short int& a) {
+  return pset1<Packet8us>(a) + p8us_COUNTDOWN;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const signed char& a) {
+  return pset1<Packet16c>(a) + p16c_COUNTDOWN;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const unsigned char& a) {
+  return pset1<Packet16uc>(a) + p16uc_COUNTDOWN;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return a + b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return a + b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return a + b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return a + b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return a + b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return a + b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return a + b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return a - b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return a - b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return a - b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return a - b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return a - b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return a - b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
+#ifdef __POWER8_VECTOR__
+  return vec_neg(a);
+#else
+  return vec_xor(a, p4f_MZERO);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
+#ifdef __POWER8_VECTOR__
+  return vec_neg(a);
+#else
+  return reinterpret_cast<Packet16c>(p4i_ZERO) - a;
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
+#ifdef __POWER8_VECTOR__
+  return vec_neg(a);
+#else
+  return reinterpret_cast<Packet8s>(p4i_ZERO) - a;
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
+#ifdef __POWER8_VECTOR__
+  return vec_neg(a);
+#else
+  return p4i_ZERO - a;
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_madd(a, b, p4f_MZERO);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return a * b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vec_mul(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vec_mul(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vec_mul(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vec_mul(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#ifndef __VSX__  // VSX actually provides a div instruction
+  Packet4f t, y_0, y_1;
+
+  // Altivec does not offer a divide instruction, we have to do a reciprocal approximation
+  y_0 = vec_re(b);
+
+  // Do one Newton-Raphson iteration to get the needed accuracy
+  t = vec_nmsub(y_0, b, p4f_ONE);
+  y_1 = vec_madd(y_0, t, y_0);
+
+  return vec_madd(a, y_1, p4f_MZERO);
+#else
+  return vec_div(a, b);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
+#if defined(_ARCH_PWR10) && (EIGEN_COMP_LLVM || EIGEN_GNUC_STRICT_AT_LEAST(11, 0, 0))
+  return vec_div(a, b);
+#else
+  EIGEN_UNUSED_VARIABLE(a);
+  EIGEN_UNUSED_VARIABLE(b);
+  eigen_assert(false && "packet integer division are not supported by AltiVec");
+  return pset1<Packet4i>(0);
+#endif
+}
+
+// for some weird raisons, it has to be overloaded for packet of integers
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vec_madd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+  return a * b + c;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
+  return vec_madd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
+  return vec_madd(a, b, c);
+}
+
+#ifdef EIGEN_VECTORIZE_VSX
+template <>
+EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vec_msub(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vec_nmsub(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vec_nmadd(a, b, c);
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#ifdef EIGEN_VECTORIZE_VSX
+  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
+  Packet4f ret;
+  __asm__("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
+  return ret;
+#else
+  return vec_min(a, b);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_min(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vec_min(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vec_min(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vec_min(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vec_min(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#ifdef EIGEN_VECTORIZE_VSX
+  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
+  Packet4f ret;
+  __asm__("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
+  return ret;
+#else
+  return vec_max(a, b);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_max(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vec_max(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vec_max(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vec_max(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vec_max(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
+  return reinterpret_cast<Packet4f>(vec_cmple(a, b));
+}
+// To fix bug with vec_cmplt on older versions
+#ifdef EIGEN_VECTORIZE_VSX
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) {
+  return reinterpret_cast<Packet4f>(vec_cmplt(a, b));
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) {
+  return reinterpret_cast<Packet4f>(vec_cmpeq(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
+  Packet4f c = reinterpret_cast<Packet4f>(vec_cmpge(a, b));
+  return vec_nor(c, c);
+}
+
+#ifdef EIGEN_VECTORIZE_VSX
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
+  return reinterpret_cast<Packet4i>(vec_cmple(a, b));
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) {
+  return reinterpret_cast<Packet4i>(vec_cmplt(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
+  return reinterpret_cast<Packet4i>(vec_cmpeq(a, b));
+}
+#ifdef EIGEN_VECTORIZE_VSX
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_le(const Packet8s& a, const Packet8s& b) {
+  return reinterpret_cast<Packet8s>(vec_cmple(a, b));
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_lt(const Packet8s& a, const Packet8s& b) {
+  return reinterpret_cast<Packet8s>(vec_cmplt(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_eq(const Packet8s& a, const Packet8s& b) {
+  return reinterpret_cast<Packet8s>(vec_cmpeq(a, b));
+}
+#ifdef EIGEN_VECTORIZE_VSX
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_le(const Packet8us& a, const Packet8us& b) {
+  return reinterpret_cast<Packet8us>(vec_cmple(a, b));
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_lt(const Packet8us& a, const Packet8us& b) {
+  return reinterpret_cast<Packet8us>(vec_cmplt(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_eq(const Packet8us& a, const Packet8us& b) {
+  return reinterpret_cast<Packet8us>(vec_cmpeq(a, b));
+}
+#ifdef EIGEN_VECTORIZE_VSX
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_le(const Packet16c& a, const Packet16c& b) {
+  return reinterpret_cast<Packet16c>(vec_cmple(a, b));
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_lt(const Packet16c& a, const Packet16c& b) {
+  return reinterpret_cast<Packet16c>(vec_cmplt(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_eq(const Packet16c& a, const Packet16c& b) {
+  return reinterpret_cast<Packet16c>(vec_cmpeq(a, b));
+}
+#ifdef EIGEN_VECTORIZE_VSX
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_le(const Packet16uc& a, const Packet16uc& b) {
+  return reinterpret_cast<Packet16uc>(vec_cmple(a, b));
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_lt(const Packet16uc& a, const Packet16uc& b) {
+  return reinterpret_cast<Packet16uc>(vec_cmplt(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_eq(const Packet16uc& a, const Packet16uc& b) {
+  return reinterpret_cast<Packet16uc>(vec_cmpeq(a, b));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_and(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_and(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vec_and(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vec_and(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pand<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return pand<Packet8us>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_or(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_or(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vec_or(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vec_or(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf por<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return por<Packet8us>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_xor(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_xor(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vec_xor(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pxor<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  return pxor<Packet8us>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_andc(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_andc(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+  return vec_sel(b, a, reinterpret_cast<Packet4ui>(mask));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
+  Packet4f t = vec_add(
+      reinterpret_cast<Packet4f>(vec_or(vec_and(reinterpret_cast<Packet4ui>(a), p4ui_SIGN), p4ui_PREV0DOT5)), a);
+  Packet4f res;
+
+#ifdef EIGEN_VECTORIZE_VSX
+  __asm__("xvrspiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));
+#else
+  __asm__("vrfiz %0, %1\n\t" : "=v"(res) : "v"(t));
+#endif
+
+  return res;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
+  return vec_ceil(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
+  return vec_floor(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
+  return vec_trunc(a);
+}
+#ifdef EIGEN_VECTORIZE_VSX
+template <>
+EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
+  Packet4f res;
+
+  __asm__("xvrspic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));
+
+  return res;
+}
+#endif
+
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet ploadu_common(const __UNPACK_TYPE__(Packet) * from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD
+#if defined(EIGEN_VECTORIZE_VSX)
+  return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
+#else
+  Packet16uc MSQ = vec_ld(0, (unsigned char*)from);   // most significant quadword
+  Packet16uc LSQ = vec_ld(15, (unsigned char*)from);  // least significant quadword
+  Packet16uc mask = vec_lvsl(0, from);                // create the permute mask
+  // TODO: Add static_cast here
+  return (Packet)vec_perm(MSQ, LSQ, mask);  // align the data
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
+  return ploadu_common<Packet4f>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
+  return ploadu_common<Packet4i>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const short int* from) {
+  return ploadu_common<Packet8s>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const unsigned short int* from) {
+  return ploadu_common<Packet8us>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf ploadu<Packet8bf>(const bfloat16* from) {
+  return ploadu_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const signed char* from) {
+  return ploadu_common<Packet16c>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const unsigned char* from) {
+  return ploadu_common<Packet16uc>(from);
+}
+
+template <typename Packet>
+EIGEN_ALWAYS_INLINE Packet ploadu_partial_common(const __UNPACK_TYPE__(Packet) * from, const Index n,
+                                                 const Index offset) {
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will read past end of packet");
+  const Index size = sizeof(__UNPACK_TYPE__(Packet));
+#ifdef _ARCH_PWR9
+  EIGEN_UNUSED_VARIABLE(packet_size);
+  EIGEN_DEBUG_ALIGNED_LOAD
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  Packet load = vec_xl_len(const_cast<__UNPACK_TYPE__(Packet)*>(from), n * size);
+  if (offset) {
+    Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
+#ifdef _BIG_ENDIAN
+    load = Packet(vec_sro(Packet16uc(load), shift));
+#else
+    load = Packet(vec_slo(Packet16uc(load), shift));
+#endif
+  }
+  return load;
+#else
+  if (n) {
+    EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) load[packet_size];
+    unsigned char* load2 = reinterpret_cast<unsigned char*>(load + offset);
+    unsigned char* from2 = reinterpret_cast<unsigned char*>(const_cast<__UNPACK_TYPE__(Packet)*>(from));
+    Index n2 = n * size;
+    if (16 <= n2) {
+      pstoreu(load2, ploadu<Packet16uc>(from2));
+    } else {
+      memcpy((void*)load2, (void*)from2, n2);
+    }
+    return pload_ignore<Packet>(load);
+  } else {
+    return Packet(pset1<Packet16uc>(0));
+  }
+#endif
+}
+
+template <>
+EIGEN_ALWAYS_INLINE Packet4f ploadu_partial<Packet4f>(const float* from, const Index n, const Index offset) {
+  return ploadu_partial_common<Packet4f>(from, n, offset);
+}
+template <>
+EIGEN_ALWAYS_INLINE Packet4i ploadu_partial<Packet4i>(const int* from, const Index n, const Index offset) {
+  return ploadu_partial_common<Packet4i>(from, n, offset);
+}
+template <>
+EIGEN_ALWAYS_INLINE Packet8s ploadu_partial<Packet8s>(const short int* from, const Index n, const Index offset) {
+  return ploadu_partial_common<Packet8s>(from, n, offset);
+}
+template <>
+EIGEN_ALWAYS_INLINE Packet8us ploadu_partial<Packet8us>(const unsigned short int* from, const Index n,
+                                                        const Index offset) {
+  return ploadu_partial_common<Packet8us>(from, n, offset);
+}
+template <>
+EIGEN_ALWAYS_INLINE Packet8bf ploadu_partial<Packet8bf>(const bfloat16* from, const Index n, const Index offset) {
+  return ploadu_partial_common<Packet8us>(reinterpret_cast<const unsigned short int*>(from), n, offset);
+}
+template <>
+EIGEN_ALWAYS_INLINE Packet16c ploadu_partial<Packet16c>(const signed char* from, const Index n, const Index offset) {
+  return ploadu_partial_common<Packet16c>(from, n, offset);
+}
+template <>
+EIGEN_ALWAYS_INLINE Packet16uc ploadu_partial<Packet16uc>(const unsigned char* from, const Index n,
+                                                          const Index offset) {
+  return ploadu_partial_common<Packet16uc>(from, n, offset);
+}
+
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet ploaddup_common(const __UNPACK_TYPE__(Packet) * from) {
+  Packet p;
+  if ((std::ptrdiff_t(from) % 16) == 0)
+    p = pload<Packet>(from);
+  else
+    p = ploadu<Packet>(from);
+  return vec_mergeh(p, p);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
+  return ploaddup_common<Packet4f>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
+  return ploaddup_common<Packet4i>(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const short int* from) {
+  Packet8s p;
+  if ((std::ptrdiff_t(from) % 16) == 0)
+    p = pload<Packet8s>(from);
+  else
+    p = ploadu<Packet8s>(from);
+  return vec_mergeh(p, p);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const unsigned short int* from) {
+  Packet8us p;
+  if ((std::ptrdiff_t(from) % 16) == 0)
+    p = pload<Packet8us>(from);
+  else
+    p = ploadu<Packet8us>(from);
+  return vec_mergeh(p, p);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const short int* from) {
+  Packet8s p;
+  if ((std::ptrdiff_t(from) % 16) == 0)
+    p = pload<Packet8s>(from);
+  else
+    p = ploadu<Packet8s>(from);
+  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const unsigned short int* from) {
+  Packet8us p;
+  if ((std::ptrdiff_t(from) % 16) == 0)
+    p = pload<Packet8us>(from);
+  else
+    p = ploadu<Packet8us>(from);
+  return vec_perm(p, p, p16uc_QUADRUPLICATE16_HI);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf ploadquad<Packet8bf>(const bfloat16* from) {
+  return ploadquad<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const signed char* from) {
+  Packet16c p;
+  if ((std::ptrdiff_t(from) % 16) == 0)
+    p = pload<Packet16c>(from);
+  else
+    p = ploadu<Packet16c>(from);
+  return vec_mergeh(p, p);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const unsigned char* from) {
+  Packet16uc p;
+  if ((std::ptrdiff_t(from) % 16) == 0)
+    p = pload<Packet16uc>(from);
+  else
+    p = ploadu<Packet16uc>(from);
+  return vec_mergeh(p, p);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const signed char* from) {
+  Packet16c p;
+  if ((std::ptrdiff_t(from) % 16) == 0)
+    p = pload<Packet16c>(from);
+  else
+    p = ploadu<Packet16c>(from);
+  return vec_perm(p, p, p16uc_QUADRUPLICATE16);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const unsigned char* from) {
+  Packet16uc p;
+  if ((std::ptrdiff_t(from) % 16) == 0)
+    p = pload<Packet16uc>(from);
+  else
+    p = ploadu<Packet16uc>(from);
+  return vec_perm(p, p, p16uc_QUADRUPLICATE16);
+}
+
+template <typename Packet>
+EIGEN_STRONG_INLINE void pstoreu_common(__UNPACK_TYPE__(Packet) * to, const Packet& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE
+#if defined(EIGEN_VECTORIZE_VSX)
+  vec_xst(from, 0, to);
+#else
+  // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html
+  // Warning: not thread safe!
+  Packet16uc MSQ, LSQ, edges;
+  Packet16uc edgeAlign, align;
+
+  MSQ = vec_ld(0, (unsigned char*)to);             // most significant quadword
+  LSQ = vec_ld(15, (unsigned char*)to);            // least significant quadword
+  edgeAlign = vec_lvsl(0, to);                     // permute map to extract edges
+  edges = vec_perm(LSQ, MSQ, edgeAlign);           // extract the edges
+  align = vec_lvsr(0, to);                         // permute map to misalign data
+  MSQ = vec_perm(edges, (Packet16uc)from, align);  // misalign the data (MSQ)
+  LSQ = vec_perm((Packet16uc)from, edges, align);  // misalign the data (LSQ)
+  vec_st(LSQ, 15, (unsigned char*)to);             // Store the LSQ part first
+  vec_st(MSQ, 0, (unsigned char*)to);              // Store the MSQ part second
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
+  pstoreu_common<Packet4f>(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
+  pstoreu_common<Packet4i>(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<short int>(short int* to, const Packet8s& from) {
+  pstoreu_common<Packet8s>(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<unsigned short int>(unsigned short int* to, const Packet8us& from) {
+  pstoreu_common<Packet8us>(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet8bf& from) {
+  pstoreu_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from.m_val);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<signed char>(signed char* to, const Packet16c& from) {
+  pstoreu_common<Packet16c>(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<unsigned char>(unsigned char* to, const Packet16uc& from) {
+  pstoreu_common<Packet16uc>(to, from);
+}
+
+template <typename Packet>
+EIGEN_ALWAYS_INLINE void pstoreu_partial_common(__UNPACK_TYPE__(Packet) * to, const Packet& from, const Index n,
+                                                const Index offset) {
+  const Index packet_size = unpacket_traits<Packet>::size;
+  eigen_internal_assert(n + offset <= packet_size && "number of elements plus offset will write past end of packet");
+  const Index size = sizeof(__UNPACK_TYPE__(Packet));
+#ifdef _ARCH_PWR9
+  EIGEN_UNUSED_VARIABLE(packet_size);
+  EIGEN_DEBUG_UNALIGNED_STORE
+  Packet store = from;
+  if (offset) {
+    Packet16uc shift = pset1<Packet16uc>(offset * 8 * size);
+#ifdef _BIG_ENDIAN
+    store = Packet(vec_slo(Packet16uc(store), shift));
+#else
+    store = Packet(vec_sro(Packet16uc(store), shift));
+#endif
+  }
+  vec_xst_len(store, to, n * size);
+#else
+  if (n) {
+    EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) store[packet_size];
+    pstore(store, from);
+    unsigned char* store2 = reinterpret_cast<unsigned char*>(store + offset);
+    unsigned char* to2 = reinterpret_cast<unsigned char*>(to);
+    Index n2 = n * size;
+    if (16 <= n2) {
+      pstoreu(to2, ploadu<Packet16uc>(store2));
+    } else {
+      memcpy((void*)to2, (void*)store2, n2);
+    }
+  }
+#endif
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<float>(float* to, const Packet4f& from, const Index n, const Index offset) {
+  pstoreu_partial_common<Packet4f>(to, from, n, offset);
+}
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<int>(int* to, const Packet4i& from, const Index n, const Index offset) {
+  pstoreu_partial_common<Packet4i>(to, from, n, offset);
+}
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<short int>(short int* to, const Packet8s& from, const Index n,
+                                                    const Index offset) {
+  pstoreu_partial_common<Packet8s>(to, from, n, offset);
+}
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned short int>(unsigned short int* to, const Packet8us& from,
+                                                             const Index n, const Index offset) {
+  pstoreu_partial_common<Packet8us>(to, from, n, offset);
+}
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<bfloat16>(bfloat16* to, const Packet8bf& from, const Index n,
+                                                   const Index offset) {
+  pstoreu_partial_common<Packet8us>(reinterpret_cast<unsigned short int*>(to), from, n, offset);
+}
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<signed char>(signed char* to, const Packet16c& from, const Index n,
+                                                      const Index offset) {
+  pstoreu_partial_common<Packet16c>(to, from, n, offset);
+}
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<unsigned char>(unsigned char* to, const Packet16uc& from, const Index n,
+                                                        const Index offset) {
+  pstoreu_partial_common<Packet16uc>(to, from, n, offset);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+  EIGEN_PPC_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
+  EIGEN_PPC_PREFETCH(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  EIGEN_ALIGN16 float x;
+  vec_ste(a, 0, &x);
+  return x;
+}
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
+  EIGEN_ALIGN16 int x;
+  vec_ste(a, 0, &x);
+  return x;
+}
+
+template <typename Packet>
+EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) pfirst_common(const Packet& a) {
+  EIGEN_ALIGN16 __UNPACK_TYPE__(Packet) x;
+  vec_ste(a, 0, &x);
+  return x;
+}
+
+template <>
+EIGEN_STRONG_INLINE short int pfirst<Packet8s>(const Packet8s& a) {
+  return pfirst_common<Packet8s>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE unsigned short int pfirst<Packet8us>(const Packet8us& a) {
+  return pfirst_common<Packet8us>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE signed char pfirst<Packet16c>(const Packet16c& a) {
+  return pfirst_common<Packet16c>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE unsigned char pfirst<Packet16uc>(const Packet16uc& a) {
+  return pfirst_common<Packet16uc>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+  return reinterpret_cast<Packet4f>(
+      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+  return reinterpret_cast<Packet4i>(
+      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
+  return reinterpret_cast<Packet8s>(
+      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
+  return reinterpret_cast<Packet8us>(
+      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
+  return vec_perm(a, a, p16uc_REVERSE8);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
+  return vec_perm(a, a, p16uc_REVERSE8);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf preverse(const Packet8bf& a) {
+  return preverse<Packet8us>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
+  return vec_abs(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
+  return vec_abs(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
+  return vec_abs(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
+  return vec_abs(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pabs(const Packet8bf& a) {
+  EIGEN_DECLARE_CONST_FAST_Packet8us(abs_mask, 0x7FFF);
+  return pand<Packet8us>(p8us_abs_mask, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf psignbit(const Packet8bf& a) {
+  return vec_sra(a.m_val, vec_splat_u16(15));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
+  return (Packet4f)vec_sra((Packet4i)a, vec_splats((unsigned int)(31)));
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
+  return vec_sra(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
+  return vec_sr(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
+  return vec_sl(a, reinterpret_cast<Packet4ui>(pset1<Packet4i>(N)));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4f plogical_shift_left(const Packet4f& a) {
+  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
+  Packet4ui r = vec_sl(reinterpret_cast<Packet4ui>(a), p4ui_mask);
+  return reinterpret_cast<Packet4f>(r);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet4f plogical_shift_right(const Packet4f& a) {
+  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
+  Packet4ui r = vec_sr(reinterpret_cast<Packet4ui>(a), p4ui_mask);
+  return reinterpret_cast<Packet4f>(r);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
+  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
+  return vec_sr(a, p4ui_mask);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
+  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mask, N);
+  return vec_sl(a, p4ui_mask);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) {
+  const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
+  return vec_sl(a, p8us_mask);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) {
+  const EIGEN_DECLARE_CONST_FAST_Packet8us(mask, N);
+  return vec_sr(a, p8us_mask);
+}
+
+EIGEN_STRONG_INLINE Packet4f Bf16ToF32Even(const Packet8bf& bf) {
+  return plogical_shift_left<16>(reinterpret_cast<Packet4f>(bf.m_val));
+}
+
+EIGEN_STRONG_INLINE Packet4f Bf16ToF32Odd(const Packet8bf& bf) {
+  const EIGEN_DECLARE_CONST_FAST_Packet4ui(high_mask, 0xFFFF0000);
+  return pand<Packet4f>(reinterpret_cast<Packet4f>(bf.m_val), reinterpret_cast<Packet4f>(p4ui_high_mask));
+}
+
+EIGEN_ALWAYS_INLINE Packet8us pmerge(Packet4ui even, Packet4ui odd) {
+#ifdef _BIG_ENDIAN
+  return vec_perm(reinterpret_cast<Packet8us>(odd), reinterpret_cast<Packet8us>(even), p16uc_MERGEO16);
+#else
+  return vec_perm(reinterpret_cast<Packet8us>(even), reinterpret_cast<Packet8us>(odd), p16uc_MERGEE16);
+#endif
+}
+
+// Simple interleaving of bool masks, prevents true values from being
+// converted to NaNs.
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16Bool(Packet4f even, Packet4f odd) {
+  return pmerge(reinterpret_cast<Packet4ui>(even), reinterpret_cast<Packet4ui>(odd));
+}
+
+// #define SUPPORT_BF16_SUBNORMALS
+
+#ifndef __VEC_CLASS_FP_NAN
+#define __VEC_CLASS_FP_NAN (1 << 6)
+#endif
+
+#if defined(SUPPORT_BF16_SUBNORMALS) && !defined(__VEC_CLASS_FP_SUBNORMAL)
+#define __VEC_CLASS_FP_SUBNORMAL_P (1 << 1)
+#define __VEC_CLASS_FP_SUBNORMAL_N (1 << 0)
+
+#define __VEC_CLASS_FP_SUBNORMAL (__VEC_CLASS_FP_SUBNORMAL_P | __VEC_CLASS_FP_SUBNORMAL_N)
+#endif
+
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f p4f) {
+#ifdef _ARCH_PWR10
+  return reinterpret_cast<Packet8us>(__builtin_vsx_xvcvspbf16(reinterpret_cast<Packet16uc>(p4f)));
+#else
+  Packet4ui input = reinterpret_cast<Packet4ui>(p4f);
+  Packet4ui lsb = plogical_shift_right<16>(input);
+  lsb = pand<Packet4ui>(lsb, reinterpret_cast<Packet4ui>(p4i_ONE));
+
+  EIGEN_DECLARE_CONST_FAST_Packet4ui(BIAS, 0x7FFFu);
+  Packet4ui rounding_bias = padd<Packet4ui>(lsb, p4ui_BIAS);
+  input = padd<Packet4ui>(input, rounding_bias);
+
+  const EIGEN_DECLARE_CONST_FAST_Packet4ui(nan, 0x7FC00000);
+#if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
+  Packet4bi nan_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_NAN);
+  input = vec_sel(input, p4ui_nan, nan_selector);
+
+#ifdef SUPPORT_BF16_SUBNORMALS
+  Packet4bi subnormal_selector = vec_test_data_class(p4f, __VEC_CLASS_FP_SUBNORMAL);
+  input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
+#endif
+#else
+#ifdef SUPPORT_BF16_SUBNORMALS
+  // Test NaN and Subnormal
+  const EIGEN_DECLARE_CONST_FAST_Packet4ui(exp_mask, 0x7F800000);
+  Packet4ui exp = pand<Packet4ui>(p4ui_exp_mask, reinterpret_cast<Packet4ui>(p4f));
+
+  const EIGEN_DECLARE_CONST_FAST_Packet4ui(mantissa_mask, 0x7FFFFF);
+  Packet4ui mantissa = pand<Packet4ui>(p4ui_mantissa_mask, reinterpret_cast<Packet4ui>(p4f));
+
+  Packet4bi is_max_exp = vec_cmpeq(exp, p4ui_exp_mask);
+  Packet4bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet4ui>(p4i_ZERO));
+
+  Packet4ui nan_selector =
+      pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_max_exp), reinterpret_cast<Packet4ui>(is_mant_zero));
+
+  Packet4bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet4ui>(p4i_ZERO));
+
+  Packet4ui subnormal_selector =
+      pandnot<Packet4ui>(reinterpret_cast<Packet4ui>(is_zero_exp), reinterpret_cast<Packet4ui>(is_mant_zero));
+
+  input = vec_sel(input, p4ui_nan, nan_selector);
+  input = vec_sel(input, reinterpret_cast<Packet4ui>(p4f), subnormal_selector);
+#else
+  // Test only NaN
+  Packet4bi nan_selector = vec_cmpeq(p4f, p4f);
+
+  input = vec_sel(p4ui_nan, input, nan_selector);
+#endif
+#endif
+
+  input = plogical_shift_right<16>(input);
+  return reinterpret_cast<Packet8us>(input);
+#endif
+}
+
+#ifdef _BIG_ENDIAN
+/**
+ * Pack the high portion of two float Packets into one bfloat16 Packet
+ *
+ * @tparam lohi to expect either a low & high OR odd & even order
+ */
+template <bool lohi>
+EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f lo, Packet4f hi) {
+  if (lohi) {
+    return vec_perm(reinterpret_cast<Packet8us>(lo), reinterpret_cast<Packet8us>(hi), p16uc_MERGEH16);
+  } else {
+    return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEE16);
+  }
+}
+
+/**
+ * Pack the low portion of two float Packets into one bfloat16 Packet
+ *
+ * @param lohi to expect either a low & high OR odd & even order
+ */
+template <bool lohi>
+EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f lo, Packet4f hi) {
+  if (lohi) {
+    return vec_pack(reinterpret_cast<Packet4ui>(lo), reinterpret_cast<Packet4ui>(hi));
+  } else {
+    return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEO16);
+  }
+}
+#else
+template <bool lohi>
+EIGEN_ALWAYS_INLINE Packet8bf Bf16PackLow(Packet4f hi, Packet4f lo) {
+  if (lohi) {
+    return vec_pack(reinterpret_cast<Packet4ui>(hi), reinterpret_cast<Packet4ui>(lo));
+  } else {
+    return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEE16);
+  }
+}
+
+template <bool lohi>
+EIGEN_ALWAYS_INLINE Packet8bf Bf16PackHigh(Packet4f hi, Packet4f lo) {
+  if (lohi) {
+    return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEL16);
+  } else {
+    return vec_perm(reinterpret_cast<Packet8us>(hi), reinterpret_cast<Packet8us>(lo), p16uc_MERGEO16);
+  }
+}
+#endif
+
+/**
+ * Convert and pack two float Packets into one bfloat16 Packet
+ *
+ * @tparam lohi to expect either a low & high OR odd & even order
+ */
+template <bool lohi = true>
+EIGEN_ALWAYS_INLINE Packet8bf F32ToBf16Two(Packet4f lo, Packet4f hi) {
+  Packet8us p4f = Bf16PackHigh<lohi>(lo, hi);
+  Packet8us p4f2 = Bf16PackLow<lohi>(lo, hi);
+
+  Packet8us lsb = pand<Packet8us>(p4f, p8us_ONE);
+  EIGEN_DECLARE_CONST_FAST_Packet8us(BIAS, 0x7FFFu);
+  lsb = padd<Packet8us>(lsb, p8us_BIAS);
+  lsb = padd<Packet8us>(lsb, p4f2);
+
+  Packet8bi rounding_bias = vec_cmplt(lsb, p4f2);
+  Packet8us input = psub<Packet8us>(p4f, reinterpret_cast<Packet8us>(rounding_bias));
+
+#if defined(_ARCH_PWR9) && defined(EIGEN_VECTORIZE_VSX)
+  Packet4bi nan_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_NAN);
+  Packet4bi nan_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_NAN);
+  Packet8us nan_selector =
+      Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
+
+  input = vec_sel(input, p8us_BIAS, nan_selector);
+
+#ifdef SUPPORT_BF16_SUBNORMALS
+  Packet4bi subnormal_selector_lo = vec_test_data_class(lo, __VEC_CLASS_FP_SUBNORMAL);
+  Packet4bi subnormal_selector_hi = vec_test_data_class(hi, __VEC_CLASS_FP_SUBNORMAL);
+  Packet8us subnormal_selector = Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(subnormal_selector_lo),
+                                                   reinterpret_cast<Packet4f>(subnormal_selector_hi));
+
+  input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
+#endif
+#else
+#ifdef SUPPORT_BF16_SUBNORMALS
+  // Test NaN and Subnormal
+  const EIGEN_DECLARE_CONST_FAST_Packet8us(exp_mask, 0x7F80);
+  Packet8us exp = pand<Packet8us>(p8us_exp_mask, p4f);
+
+  const EIGEN_DECLARE_CONST_FAST_Packet8us(mantissa_mask, 0x7Fu);
+  Packet8us mantissa = pand<Packet8us>(p8us_mantissa_mask, p4f);
+
+  Packet8bi is_max_exp = vec_cmpeq(exp, p8us_exp_mask);
+  Packet8bi is_mant_zero = vec_cmpeq(mantissa, reinterpret_cast<Packet8us>(p4i_ZERO));
+
+  Packet8us nan_selector =
+      pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_max_exp), reinterpret_cast<Packet8us>(is_mant_zero));
+
+  Packet8bi is_zero_exp = vec_cmpeq(exp, reinterpret_cast<Packet8us>(p4i_ZERO));
+
+  Packet8us subnormal_selector =
+      pandnot<Packet8us>(reinterpret_cast<Packet8us>(is_zero_exp), reinterpret_cast<Packet8us>(is_mant_zero));
+
+  // Using BIAS as NaN (since any or all of the last 7 bits can be set)
+  input = vec_sel(input, p8us_BIAS, nan_selector);
+  input = vec_sel(input, reinterpret_cast<Packet8us>(p4f), subnormal_selector);
+#else
+  // Test only NaN
+  Packet4bi nan_selector_lo = vec_cmpeq(lo, lo);
+  Packet4bi nan_selector_hi = vec_cmpeq(hi, hi);
+  Packet8us nan_selector =
+      Bf16PackLow<lohi>(reinterpret_cast<Packet4f>(nan_selector_lo), reinterpret_cast<Packet4f>(nan_selector_hi));
+
+  input = vec_sel(p8us_BIAS, input, nan_selector);
+#endif
+#endif
+
+  return input;
+}
+
+/**
+ * Convert and pack two float Packets into one bfloat16 Packet - low & high order
+ */
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16Both(Packet4f lo, Packet4f hi) {
+#ifdef _ARCH_PWR10
+  Packet8bf fp16_0 = F32ToBf16(lo);
+  Packet8bf fp16_1 = F32ToBf16(hi);
+  return vec_pack(reinterpret_cast<Packet4ui>(fp16_0.m_val), reinterpret_cast<Packet4ui>(fp16_1.m_val));
+#else
+  return F32ToBf16Two(lo, hi);
+#endif
+}
+
+/**
+ * Convert and pack two float Packets into one bfloat16 Packet - odd & even order
+ */
+EIGEN_STRONG_INLINE Packet8bf F32ToBf16(Packet4f even, Packet4f odd) {
+#ifdef _ARCH_PWR10
+  return pmerge(reinterpret_cast<Packet4ui>(F32ToBf16(even).m_val), reinterpret_cast<Packet4ui>(F32ToBf16(odd).m_val));
+#else
+  return F32ToBf16Two<false>(even, odd);
+#endif
+}
+#define BF16_TO_F32_UNARY_OP_WRAPPER(OP, A) \
+  Packet4f a_even = Bf16ToF32Even(A);       \
+  Packet4f a_odd = Bf16ToF32Odd(A);         \
+  Packet4f op_even = OP(a_even);            \
+  Packet4f op_odd = OP(a_odd);              \
+  return F32ToBf16(op_even, op_odd);
+
+#define BF16_TO_F32_BINARY_OP_WRAPPER(OP, A, B) \
+  Packet4f a_even = Bf16ToF32Even(A);           \
+  Packet4f a_odd = Bf16ToF32Odd(A);             \
+  Packet4f b_even = Bf16ToF32Even(B);           \
+  Packet4f b_odd = Bf16ToF32Odd(B);             \
+  Packet4f op_even = OP(a_even, b_even);        \
+  Packet4f op_odd = OP(a_odd, b_odd);           \
+  return F32ToBf16(op_even, op_odd);
+
+#define BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(OP, A, B) \
+  Packet4f a_even = Bf16ToF32Even(A);                \
+  Packet4f a_odd = Bf16ToF32Odd(A);                  \
+  Packet4f b_even = Bf16ToF32Even(B);                \
+  Packet4f b_odd = Bf16ToF32Odd(B);                  \
+  Packet4f op_even = OP(a_even, b_even);             \
+  Packet4f op_odd = OP(a_odd, b_odd);                \
+  return F32ToBf16Bool(op_even, op_odd);
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf padd<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(padd<Packet4f>, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmul<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(pmul<Packet4f>, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pdiv<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(pdiv<Packet4f>, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnegate<Packet8bf>(const Packet8bf& a) {
+  EIGEN_DECLARE_CONST_FAST_Packet8us(neg_mask, 0x8000);
+  return pxor<Packet8us>(p8us_neg_mask, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf psub<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(psub<Packet4f>, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pexp<Packet8bf>(const Packet8bf& a) {
+  BF16_TO_F32_UNARY_OP_WRAPPER(pexp_float, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pexp2<Packet8bf>(const Packet8bf& a) {
+  BF16_TO_F32_UNARY_OP_WRAPPER(generic_exp2, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+  return pldexp_generic(a, exponent);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pldexp<Packet8bf>(const Packet8bf& a, const Packet8bf& exponent) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(pldexp<Packet4f>, a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pfrexp<Packet8bf>(const Packet8bf& a, Packet8bf& e) {
+  Packet4f a_even = Bf16ToF32Even(a);
+  Packet4f a_odd = Bf16ToF32Odd(a);
+  Packet4f e_even;
+  Packet4f e_odd;
+  Packet4f op_even = pfrexp<Packet4f>(a_even, e_even);
+  Packet4f op_odd = pfrexp<Packet4f>(a_odd, e_odd);
+  e = F32ToBf16(e_even, e_odd);
+  return F32ToBf16(op_even, op_odd);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf psin<Packet8bf>(const Packet8bf& a) {
+  BF16_TO_F32_UNARY_OP_WRAPPER(psin_float, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcos<Packet8bf>(const Packet8bf& a) {
+  BF16_TO_F32_UNARY_OP_WRAPPER(pcos_float, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf plog<Packet8bf>(const Packet8bf& a) {
+  BF16_TO_F32_UNARY_OP_WRAPPER(plog_float, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pfloor<Packet8bf>(const Packet8bf& a) {
+  BF16_TO_F32_UNARY_OP_WRAPPER(pfloor<Packet4f>, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pceil<Packet8bf>(const Packet8bf& a) {
+  BF16_TO_F32_UNARY_OP_WRAPPER(pceil<Packet4f>, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pround<Packet8bf>(const Packet8bf& a) {
+  BF16_TO_F32_UNARY_OP_WRAPPER(pround<Packet4f>, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf ptrunc<Packet8bf>(const Packet8bf& a) {
+  BF16_TO_F32_UNARY_OP_WRAPPER(ptrunc<Packet4f>, a);
+}
+#ifdef EIGEN_VECTORIZE_VSX
+template <>
+EIGEN_STRONG_INLINE Packet8bf print<Packet8bf>(const Packet8bf& a) {
+  BF16_TO_F32_UNARY_OP_WRAPPER(print<Packet4f>, a);
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  Packet4f a_even = Bf16ToF32Even(a);
+  Packet4f a_odd = Bf16ToF32Odd(a);
+  Packet4f b_even = Bf16ToF32Even(b);
+  Packet4f b_odd = Bf16ToF32Odd(b);
+  Packet4f c_even = Bf16ToF32Even(c);
+  Packet4f c_odd = Bf16ToF32Odd(c);
+  Packet4f pmadd_even = pmadd<Packet4f>(a_even, b_even, c_even);
+  Packet4f pmadd_odd = pmadd<Packet4f>(a_odd, b_odd, c_odd);
+  return F32ToBf16(pmadd_even, pmadd_odd);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  Packet4f a_even = Bf16ToF32Even(a);
+  Packet4f a_odd = Bf16ToF32Odd(a);
+  Packet4f b_even = Bf16ToF32Even(b);
+  Packet4f b_odd = Bf16ToF32Odd(b);
+  Packet4f c_even = Bf16ToF32Even(c);
+  Packet4f c_odd = Bf16ToF32Odd(c);
+  Packet4f pmadd_even = pmsub<Packet4f>(a_even, b_even, c_even);
+  Packet4f pmadd_odd = pmsub<Packet4f>(a_odd, b_odd, c_odd);
+  return F32ToBf16(pmadd_even, pmadd_odd);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnmadd(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  Packet4f a_even = Bf16ToF32Even(a);
+  Packet4f a_odd = Bf16ToF32Odd(a);
+  Packet4f b_even = Bf16ToF32Even(b);
+  Packet4f b_odd = Bf16ToF32Odd(b);
+  Packet4f c_even = Bf16ToF32Even(c);
+  Packet4f c_odd = Bf16ToF32Odd(c);
+  Packet4f pmadd_even = pnmadd<Packet4f>(a_even, b_even, c_even);
+  Packet4f pmadd_odd = pnmadd<Packet4f>(a_odd, b_odd, c_odd);
+  return F32ToBf16(pmadd_even, pmadd_odd);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pnmsub(const Packet8bf& a, const Packet8bf& b, const Packet8bf& c) {
+  Packet4f a_even = Bf16ToF32Even(a);
+  Packet4f a_odd = Bf16ToF32Odd(a);
+  Packet4f b_even = Bf16ToF32Even(b);
+  Packet4f b_odd = Bf16ToF32Odd(b);
+  Packet4f c_even = Bf16ToF32Even(c);
+  Packet4f c_odd = Bf16ToF32Odd(c);
+  Packet4f pmadd_even = pnmsub<Packet4f>(a_even, b_even, c_even);
+  Packet4f pmadd_odd = pnmsub<Packet4f>(a_odd, b_odd, c_odd);
+  return F32ToBf16(pmadd_even, pmadd_odd);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmin<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(pmin<Packet4f>, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pmax<Packet8bf>(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER(pmax<Packet4f>, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_lt(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt<Packet4f>, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_lt_or_nan(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_lt_or_nan<Packet4f>, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_le(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_le<Packet4f>, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcmp_eq(const Packet8bf& a, const Packet8bf& b) {
+  BF16_TO_F32_BINARY_OP_WRAPPER_BOOL(pcmp_eq<Packet4f>, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 pfirst(const Packet8bf& a) {
+  return Eigen::bfloat16_impl::raw_uint16_to_bfloat16((pfirst<Packet8us>(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf ploaddup<Packet8bf>(const bfloat16* from) {
+  return ploaddup<Packet8us>(reinterpret_cast<const unsigned short int*>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf plset<Packet8bf>(const bfloat16& a) {
+  bfloat16 countdown[8] = {bfloat16(0), bfloat16(1), bfloat16(2), bfloat16(3),
+                           bfloat16(4), bfloat16(5), bfloat16(6), bfloat16(7)};
+  return padd<Packet8bf>(pset1<Packet8bf>(a), pload<Packet8bf>(countdown));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
+  Packet4f b, sum;
+  b = vec_sld(a, a, 8);
+  sum = a + b;
+  b = vec_sld(sum, sum, 4);
+  sum += b;
+  return pfirst(sum);
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
+  Packet4i b, sum;
+  b = vec_sld(a, a, 8);
+  sum = a + b;
+  b = vec_sld(sum, sum, 4);
+  sum += b;
+  return pfirst(sum);
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux<Packet8bf>(const Packet8bf& a) {
+  float redux_even = predux<Packet4f>(Bf16ToF32Even(a));
+  float redux_odd = predux<Packet4f>(Bf16ToF32Odd(a));
+  float f32_result = redux_even + redux_odd;
+  return bfloat16(f32_result);
+}
+template <typename Packet>
+EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size8(const Packet& a) {
+  union {
+    Packet v;
+    __UNPACK_TYPE__(Packet) n[8];
+  } vt;
+  vt.v = a;
+
+  EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
+  EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
+  Packet4i first_half = pload<Packet4i>(first_loader);
+  Packet4i second_half = pload<Packet4i>(second_loader);
+
+  return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_half) + predux(second_half));
+}
+
+template <>
+EIGEN_STRONG_INLINE short int predux<Packet8s>(const Packet8s& a) {
+  return predux_size8<Packet8s>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE unsigned short int predux<Packet8us>(const Packet8us& a) {
+  return predux_size8<Packet8us>(a);
+}
+
+template <typename Packet>
+EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_size16(const Packet& a) {
+  union {
+    Packet v;
+    __UNPACK_TYPE__(Packet) n[16];
+  } vt;
+  vt.v = a;
+
+  EIGEN_ALIGN16 int first_loader[4] = {vt.n[0], vt.n[1], vt.n[2], vt.n[3]};
+  EIGEN_ALIGN16 int second_loader[4] = {vt.n[4], vt.n[5], vt.n[6], vt.n[7]};
+  EIGEN_ALIGN16 int third_loader[4] = {vt.n[8], vt.n[9], vt.n[10], vt.n[11]};
+  EIGEN_ALIGN16 int fourth_loader[4] = {vt.n[12], vt.n[13], vt.n[14], vt.n[15]};
+
+  Packet4i first_quarter = pload<Packet4i>(first_loader);
+  Packet4i second_quarter = pload<Packet4i>(second_loader);
+  Packet4i third_quarter = pload<Packet4i>(third_loader);
+  Packet4i fourth_quarter = pload<Packet4i>(fourth_loader);
+
+  return static_cast<__UNPACK_TYPE__(Packet)>(predux(first_quarter) + predux(second_quarter) + predux(third_quarter) +
+                                              predux(fourth_quarter));
+}
+
+template <>
+EIGEN_STRONG_INLINE signed char predux<Packet16c>(const Packet16c& a) {
+  return predux_size16<Packet16c>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE unsigned char predux<Packet16uc>(const Packet16uc& a) {
+  return predux_size16<Packet16uc>(a);
+}
+
+// Other reduction functions:
+// mul
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
+  Packet4f prod;
+  prod = pmul(a, vec_sld(a, a, 8));
+  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  return aux[0] * aux[1] * aux[2] * aux[3];
+}
+
+template <>
+EIGEN_STRONG_INLINE short int predux_mul<Packet8s>(const Packet8s& a) {
+  Packet8s pair, quad, octo;
+
+  pair = vec_mul(a, vec_sld(a, a, 8));
+  quad = vec_mul(pair, vec_sld(pair, pair, 4));
+  octo = vec_mul(quad, vec_sld(quad, quad, 2));
+
+  return pfirst(octo);
+}
+
+template <>
+EIGEN_STRONG_INLINE unsigned short int predux_mul<Packet8us>(const Packet8us& a) {
+  Packet8us pair, quad, octo;
+
+  pair = vec_mul(a, vec_sld(a, a, 8));
+  quad = vec_mul(pair, vec_sld(pair, pair, 4));
+  octo = vec_mul(quad, vec_sld(quad, quad, 2));
+
+  return pfirst(octo);
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet8bf>(const Packet8bf& a) {
+  float redux_even = predux_mul<Packet4f>(Bf16ToF32Even(a));
+  float redux_odd = predux_mul<Packet4f>(Bf16ToF32Odd(a));
+  float f32_result = redux_even * redux_odd;
+  return bfloat16(f32_result);
+}
+
+template <>
+EIGEN_STRONG_INLINE signed char predux_mul<Packet16c>(const Packet16c& a) {
+  Packet16c pair, quad, octo, result;
+
+  pair = vec_mul(a, vec_sld(a, a, 8));
+  quad = vec_mul(pair, vec_sld(pair, pair, 4));
+  octo = vec_mul(quad, vec_sld(quad, quad, 2));
+  result = vec_mul(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+
+template <>
+EIGEN_STRONG_INLINE unsigned char predux_mul<Packet16uc>(const Packet16uc& a) {
+  Packet16uc pair, quad, octo, result;
+
+  pair = vec_mul(a, vec_sld(a, a, 8));
+  quad = vec_mul(pair, vec_sld(pair, pair, 4));
+  octo = vec_mul(quad, vec_sld(quad, quad, 2));
+  result = vec_mul(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+
+// min
+template <typename Packet>
+EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_min4(const Packet& a) {
+  Packet b, res;
+  b = vec_min(a, vec_sld(a, a, 8));
+  res = vec_min(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
+  return predux_min4<Packet4f>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
+  return predux_min4<Packet4i>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<Packet8bf>(const Packet8bf& a) {
+  float redux_even = predux_min<Packet4f>(Bf16ToF32Even(a));
+  float redux_odd = predux_min<Packet4f>(Bf16ToF32Odd(a));
+  float f32_result = (std::min)(redux_even, redux_odd);
+  return bfloat16(f32_result);
+}
+
+template <>
+EIGEN_STRONG_INLINE short int predux_min<Packet8s>(const Packet8s& a) {
+  Packet8s pair, quad, octo;
+
+  // pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
+  pair = vec_min(a, vec_sld(a, a, 8));
+
+  // quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
+  quad = vec_min(pair, vec_sld(pair, pair, 4));
+
+  // octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
+  octo = vec_min(quad, vec_sld(quad, quad, 2));
+  return pfirst(octo);
+}
+
+template <>
+EIGEN_STRONG_INLINE unsigned short int predux_min<Packet8us>(const Packet8us& a) {
+  Packet8us pair, quad, octo;
+
+  // pair = { Min(a0,a4), Min(a1,a5), Min(a2,a6), Min(a3,a7) }
+  pair = vec_min(a, vec_sld(a, a, 8));
+
+  // quad = { Min(a0, a4, a2, a6), Min(a1, a5, a3, a7) }
+  quad = vec_min(pair, vec_sld(pair, pair, 4));
+
+  // octo = { Min(a0, a4, a2, a6, a1, a5, a3, a7) }
+  octo = vec_min(quad, vec_sld(quad, quad, 2));
+  return pfirst(octo);
+}
+
+template <>
+EIGEN_STRONG_INLINE signed char predux_min<Packet16c>(const Packet16c& a) {
+  Packet16c pair, quad, octo, result;
+
+  pair = vec_min(a, vec_sld(a, a, 8));
+  quad = vec_min(pair, vec_sld(pair, pair, 4));
+  octo = vec_min(quad, vec_sld(quad, quad, 2));
+  result = vec_min(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+
+template <>
+EIGEN_STRONG_INLINE unsigned char predux_min<Packet16uc>(const Packet16uc& a) {
+  Packet16uc pair, quad, octo, result;
+
+  pair = vec_min(a, vec_sld(a, a, 8));
+  quad = vec_min(pair, vec_sld(pair, pair, 4));
+  octo = vec_min(quad, vec_sld(quad, quad, 2));
+  result = vec_min(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+// max
+template <typename Packet>
+EIGEN_STRONG_INLINE __UNPACK_TYPE__(Packet) predux_max4(const Packet& a) {
+  Packet b, res;
+  b = vec_max(a, vec_sld(a, a, 8));
+  res = vec_max(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
+  return predux_max4<Packet4f>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
+  return predux_max4<Packet4i>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max<Packet8bf>(const Packet8bf& a) {
+  float redux_even = predux_max<Packet4f>(Bf16ToF32Even(a));
+  float redux_odd = predux_max<Packet4f>(Bf16ToF32Odd(a));
+  float f32_result = (std::max)(redux_even, redux_odd);
+  return bfloat16(f32_result);
+}
+
+template <>
+EIGEN_STRONG_INLINE short int predux_max<Packet8s>(const Packet8s& a) {
+  Packet8s pair, quad, octo;
+
+  // pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
+  pair = vec_max(a, vec_sld(a, a, 8));
+
+  // quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
+  quad = vec_max(pair, vec_sld(pair, pair, 4));
+
+  // octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
+  octo = vec_max(quad, vec_sld(quad, quad, 2));
+  return pfirst(octo);
+}
+
+template <>
+EIGEN_STRONG_INLINE unsigned short int predux_max<Packet8us>(const Packet8us& a) {
+  Packet8us pair, quad, octo;
+
+  // pair = { Max(a0,a4), Max(a1,a5), Max(a2,a6), Max(a3,a7) }
+  pair = vec_max(a, vec_sld(a, a, 8));
+
+  // quad = { Max(a0, a4, a2, a6), Max(a1, a5, a3, a7) }
+  quad = vec_max(pair, vec_sld(pair, pair, 4));
+
+  // octo = { Max(a0, a4, a2, a6, a1, a5, a3, a7) }
+  octo = vec_max(quad, vec_sld(quad, quad, 2));
+  return pfirst(octo);
+}
+
+template <>
+EIGEN_STRONG_INLINE signed char predux_max<Packet16c>(const Packet16c& a) {
+  Packet16c pair, quad, octo, result;
+
+  pair = vec_max(a, vec_sld(a, a, 8));
+  quad = vec_max(pair, vec_sld(pair, pair, 4));
+  octo = vec_max(quad, vec_sld(quad, quad, 2));
+  result = vec_max(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+
+template <>
+EIGEN_STRONG_INLINE unsigned char predux_max<Packet16uc>(const Packet16uc& a) {
+  Packet16uc pair, quad, octo, result;
+
+  pair = vec_max(a, vec_sld(a, a, 8));
+  quad = vec_max(pair, vec_sld(pair, pair, 4));
+  octo = vec_max(quad, vec_sld(quad, quad, 2));
+  result = vec_max(octo, vec_sld(octo, octo, 1));
+
+  return pfirst(result);
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
+  return vec_any_ne(x, pzero(x));
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC inline void ptranpose_common(PacketBlock<T, 4>& kernel) {
+  T t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) { ptranpose_common<Packet4f>(kernel); }
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) { ptranpose_common<Packet4i>(kernel); }
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
+  Packet8s t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
+  Packet8us t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 4>& kernel) {
+  Packet8us t0, t1, t2, t3;
+
+  t0 = vec_mergeh(kernel.packet[0].m_val, kernel.packet[2].m_val);
+  t1 = vec_mergel(kernel.packet[0].m_val, kernel.packet[2].m_val);
+  t2 = vec_mergeh(kernel.packet[1].m_val, kernel.packet[3].m_val);
+  t3 = vec_mergel(kernel.packet[1].m_val, kernel.packet[3].m_val);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
+  Packet16c t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
+  Packet16uc t0, t1, t2, t3;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
+  Packet8s v[8], sum[8];
+
+  v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
+  v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
+  v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
+  v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
+  v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
+  v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
+  v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
+  v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
+  sum[0] = vec_mergeh(v[0], v[4]);
+  sum[1] = vec_mergel(v[0], v[4]);
+  sum[2] = vec_mergeh(v[1], v[5]);
+  sum[3] = vec_mergel(v[1], v[5]);
+  sum[4] = vec_mergeh(v[2], v[6]);
+  sum[5] = vec_mergel(v[2], v[6]);
+  sum[6] = vec_mergeh(v[3], v[7]);
+  sum[7] = vec_mergel(v[3], v[7]);
+
+  kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
+  kernel.packet[1] = vec_mergel(sum[0], sum[4]);
+  kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
+  kernel.packet[3] = vec_mergel(sum[1], sum[5]);
+  kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
+  kernel.packet[5] = vec_mergel(sum[2], sum[6]);
+  kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
+  kernel.packet[7] = vec_mergel(sum[3], sum[7]);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
+  Packet8us v[8], sum[8];
+
+  v[0] = vec_mergeh(kernel.packet[0], kernel.packet[4]);
+  v[1] = vec_mergel(kernel.packet[0], kernel.packet[4]);
+  v[2] = vec_mergeh(kernel.packet[1], kernel.packet[5]);
+  v[3] = vec_mergel(kernel.packet[1], kernel.packet[5]);
+  v[4] = vec_mergeh(kernel.packet[2], kernel.packet[6]);
+  v[5] = vec_mergel(kernel.packet[2], kernel.packet[6]);
+  v[6] = vec_mergeh(kernel.packet[3], kernel.packet[7]);
+  v[7] = vec_mergel(kernel.packet[3], kernel.packet[7]);
+  sum[0] = vec_mergeh(v[0], v[4]);
+  sum[1] = vec_mergel(v[0], v[4]);
+  sum[2] = vec_mergeh(v[1], v[5]);
+  sum[3] = vec_mergel(v[1], v[5]);
+  sum[4] = vec_mergeh(v[2], v[6]);
+  sum[5] = vec_mergel(v[2], v[6]);
+  sum[6] = vec_mergeh(v[3], v[7]);
+  sum[7] = vec_mergel(v[3], v[7]);
+
+  kernel.packet[0] = vec_mergeh(sum[0], sum[4]);
+  kernel.packet[1] = vec_mergel(sum[0], sum[4]);
+  kernel.packet[2] = vec_mergeh(sum[1], sum[5]);
+  kernel.packet[3] = vec_mergel(sum[1], sum[5]);
+  kernel.packet[4] = vec_mergeh(sum[2], sum[6]);
+  kernel.packet[5] = vec_mergel(sum[2], sum[6]);
+  kernel.packet[6] = vec_mergeh(sum[3], sum[7]);
+  kernel.packet[7] = vec_mergel(sum[3], sum[7]);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet8bf, 8>& kernel) {
+  Packet8bf v[8], sum[8];
+
+  v[0] = vec_mergeh(kernel.packet[0].m_val, kernel.packet[4].m_val);
+  v[1] = vec_mergel(kernel.packet[0].m_val, kernel.packet[4].m_val);
+  v[2] = vec_mergeh(kernel.packet[1].m_val, kernel.packet[5].m_val);
+  v[3] = vec_mergel(kernel.packet[1].m_val, kernel.packet[5].m_val);
+  v[4] = vec_mergeh(kernel.packet[2].m_val, kernel.packet[6].m_val);
+  v[5] = vec_mergel(kernel.packet[2].m_val, kernel.packet[6].m_val);
+  v[6] = vec_mergeh(kernel.packet[3].m_val, kernel.packet[7].m_val);
+  v[7] = vec_mergel(kernel.packet[3].m_val, kernel.packet[7].m_val);
+  sum[0] = vec_mergeh(v[0].m_val, v[4].m_val);
+  sum[1] = vec_mergel(v[0].m_val, v[4].m_val);
+  sum[2] = vec_mergeh(v[1].m_val, v[5].m_val);
+  sum[3] = vec_mergel(v[1].m_val, v[5].m_val);
+  sum[4] = vec_mergeh(v[2].m_val, v[6].m_val);
+  sum[5] = vec_mergel(v[2].m_val, v[6].m_val);
+  sum[6] = vec_mergeh(v[3].m_val, v[7].m_val);
+  sum[7] = vec_mergel(v[3].m_val, v[7].m_val);
+
+  kernel.packet[0] = vec_mergeh(sum[0].m_val, sum[4].m_val);
+  kernel.packet[1] = vec_mergel(sum[0].m_val, sum[4].m_val);
+  kernel.packet[2] = vec_mergeh(sum[1].m_val, sum[5].m_val);
+  kernel.packet[3] = vec_mergel(sum[1].m_val, sum[5].m_val);
+  kernel.packet[4] = vec_mergeh(sum[2].m_val, sum[6].m_val);
+  kernel.packet[5] = vec_mergel(sum[2].m_val, sum[6].m_val);
+  kernel.packet[6] = vec_mergeh(sum[3].m_val, sum[7].m_val);
+  kernel.packet[7] = vec_mergel(sum[3].m_val, sum[7].m_val);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
+  Packet16c step1[16], step2[16], step3[16];
+
+  step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
+  step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
+  step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
+  step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
+  step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
+  step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
+  step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
+  step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
+  step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
+  step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
+  step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
+  step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
+  step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
+  step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
+  step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
+  step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
+
+  step2[0] = vec_mergeh(step1[0], step1[8]);
+  step2[1] = vec_mergel(step1[0], step1[8]);
+  step2[2] = vec_mergeh(step1[1], step1[9]);
+  step2[3] = vec_mergel(step1[1], step1[9]);
+  step2[4] = vec_mergeh(step1[2], step1[10]);
+  step2[5] = vec_mergel(step1[2], step1[10]);
+  step2[6] = vec_mergeh(step1[3], step1[11]);
+  step2[7] = vec_mergel(step1[3], step1[11]);
+  step2[8] = vec_mergeh(step1[4], step1[12]);
+  step2[9] = vec_mergel(step1[4], step1[12]);
+  step2[10] = vec_mergeh(step1[5], step1[13]);
+  step2[11] = vec_mergel(step1[5], step1[13]);
+  step2[12] = vec_mergeh(step1[6], step1[14]);
+  step2[13] = vec_mergel(step1[6], step1[14]);
+  step2[14] = vec_mergeh(step1[7], step1[15]);
+  step2[15] = vec_mergel(step1[7], step1[15]);
+
+  step3[0] = vec_mergeh(step2[0], step2[8]);
+  step3[1] = vec_mergel(step2[0], step2[8]);
+  step3[2] = vec_mergeh(step2[1], step2[9]);
+  step3[3] = vec_mergel(step2[1], step2[9]);
+  step3[4] = vec_mergeh(step2[2], step2[10]);
+  step3[5] = vec_mergel(step2[2], step2[10]);
+  step3[6] = vec_mergeh(step2[3], step2[11]);
+  step3[7] = vec_mergel(step2[3], step2[11]);
+  step3[8] = vec_mergeh(step2[4], step2[12]);
+  step3[9] = vec_mergel(step2[4], step2[12]);
+  step3[10] = vec_mergeh(step2[5], step2[13]);
+  step3[11] = vec_mergel(step2[5], step2[13]);
+  step3[12] = vec_mergeh(step2[6], step2[14]);
+  step3[13] = vec_mergel(step2[6], step2[14]);
+  step3[14] = vec_mergeh(step2[7], step2[15]);
+  step3[15] = vec_mergel(step2[7], step2[15]);
+
+  kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
+  kernel.packet[1] = vec_mergel(step3[0], step3[8]);
+  kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
+  kernel.packet[3] = vec_mergel(step3[1], step3[9]);
+  kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
+  kernel.packet[5] = vec_mergel(step3[2], step3[10]);
+  kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
+  kernel.packet[7] = vec_mergel(step3[3], step3[11]);
+  kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
+  kernel.packet[9] = vec_mergel(step3[4], step3[12]);
+  kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
+  kernel.packet[11] = vec_mergel(step3[5], step3[13]);
+  kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
+  kernel.packet[13] = vec_mergel(step3[6], step3[14]);
+  kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
+  kernel.packet[15] = vec_mergel(step3[7], step3[15]);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
+  Packet16uc step1[16], step2[16], step3[16];
+
+  step1[0] = vec_mergeh(kernel.packet[0], kernel.packet[8]);
+  step1[1] = vec_mergel(kernel.packet[0], kernel.packet[8]);
+  step1[2] = vec_mergeh(kernel.packet[1], kernel.packet[9]);
+  step1[3] = vec_mergel(kernel.packet[1], kernel.packet[9]);
+  step1[4] = vec_mergeh(kernel.packet[2], kernel.packet[10]);
+  step1[5] = vec_mergel(kernel.packet[2], kernel.packet[10]);
+  step1[6] = vec_mergeh(kernel.packet[3], kernel.packet[11]);
+  step1[7] = vec_mergel(kernel.packet[3], kernel.packet[11]);
+  step1[8] = vec_mergeh(kernel.packet[4], kernel.packet[12]);
+  step1[9] = vec_mergel(kernel.packet[4], kernel.packet[12]);
+  step1[10] = vec_mergeh(kernel.packet[5], kernel.packet[13]);
+  step1[11] = vec_mergel(kernel.packet[5], kernel.packet[13]);
+  step1[12] = vec_mergeh(kernel.packet[6], kernel.packet[14]);
+  step1[13] = vec_mergel(kernel.packet[6], kernel.packet[14]);
+  step1[14] = vec_mergeh(kernel.packet[7], kernel.packet[15]);
+  step1[15] = vec_mergel(kernel.packet[7], kernel.packet[15]);
+
+  step2[0] = vec_mergeh(step1[0], step1[8]);
+  step2[1] = vec_mergel(step1[0], step1[8]);
+  step2[2] = vec_mergeh(step1[1], step1[9]);
+  step2[3] = vec_mergel(step1[1], step1[9]);
+  step2[4] = vec_mergeh(step1[2], step1[10]);
+  step2[5] = vec_mergel(step1[2], step1[10]);
+  step2[6] = vec_mergeh(step1[3], step1[11]);
+  step2[7] = vec_mergel(step1[3], step1[11]);
+  step2[8] = vec_mergeh(step1[4], step1[12]);
+  step2[9] = vec_mergel(step1[4], step1[12]);
+  step2[10] = vec_mergeh(step1[5], step1[13]);
+  step2[11] = vec_mergel(step1[5], step1[13]);
+  step2[12] = vec_mergeh(step1[6], step1[14]);
+  step2[13] = vec_mergel(step1[6], step1[14]);
+  step2[14] = vec_mergeh(step1[7], step1[15]);
+  step2[15] = vec_mergel(step1[7], step1[15]);
+
+  step3[0] = vec_mergeh(step2[0], step2[8]);
+  step3[1] = vec_mergel(step2[0], step2[8]);
+  step3[2] = vec_mergeh(step2[1], step2[9]);
+  step3[3] = vec_mergel(step2[1], step2[9]);
+  step3[4] = vec_mergeh(step2[2], step2[10]);
+  step3[5] = vec_mergel(step2[2], step2[10]);
+  step3[6] = vec_mergeh(step2[3], step2[11]);
+  step3[7] = vec_mergel(step2[3], step2[11]);
+  step3[8] = vec_mergeh(step2[4], step2[12]);
+  step3[9] = vec_mergel(step2[4], step2[12]);
+  step3[10] = vec_mergeh(step2[5], step2[13]);
+  step3[11] = vec_mergel(step2[5], step2[13]);
+  step3[12] = vec_mergeh(step2[6], step2[14]);
+  step3[13] = vec_mergel(step2[6], step2[14]);
+  step3[14] = vec_mergeh(step2[7], step2[15]);
+  step3[15] = vec_mergel(step2[7], step2[15]);
+
+  kernel.packet[0] = vec_mergeh(step3[0], step3[8]);
+  kernel.packet[1] = vec_mergel(step3[0], step3[8]);
+  kernel.packet[2] = vec_mergeh(step3[1], step3[9]);
+  kernel.packet[3] = vec_mergel(step3[1], step3[9]);
+  kernel.packet[4] = vec_mergeh(step3[2], step3[10]);
+  kernel.packet[5] = vec_mergel(step3[2], step3[10]);
+  kernel.packet[6] = vec_mergeh(step3[3], step3[11]);
+  kernel.packet[7] = vec_mergel(step3[3], step3[11]);
+  kernel.packet[8] = vec_mergeh(step3[4], step3[12]);
+  kernel.packet[9] = vec_mergel(step3[4], step3[12]);
+  kernel.packet[10] = vec_mergeh(step3[5], step3[13]);
+  kernel.packet[11] = vec_mergel(step3[5], step3[13]);
+  kernel.packet[12] = vec_mergeh(step3[6], step3[14]);
+  kernel.packet[13] = vec_mergel(step3[6], step3[14]);
+  kernel.packet[14] = vec_mergeh(step3[7], step3[15]);
+  kernel.packet[15] = vec_mergel(step3[7], step3[15]);
+}
+
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pblend4(const Selector<4>& ifPacket, const Packet& thenPacket, const Packet& elsePacket) {
+  Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
+  Packet4ui mask = reinterpret_cast<Packet4ui>(pnegate(reinterpret_cast<Packet4i>(select)));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
+                                    const Packet4i& elsePacket) {
+  return pblend4<Packet4i>(ifPacket, thenPacket, elsePacket);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
+                                    const Packet4f& elsePacket) {
+  return pblend4<Packet4f>(ifPacket, thenPacket, elsePacket);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s pblend(const Selector<8>& ifPacket, const Packet8s& thenPacket,
+                                    const Packet8s& elsePacket) {
+  Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+                      ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
+  Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
+  Packet8s result = vec_sel(elsePacket, thenPacket, mask);
+  return result;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8us pblend(const Selector<8>& ifPacket, const Packet8us& thenPacket,
+                                     const Packet8us& elsePacket) {
+  Packet8us select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3],
+                      ifPacket.select[4], ifPacket.select[5], ifPacket.select[6], ifPacket.select[7]};
+  Packet8us mask = reinterpret_cast<Packet8us>(pnegate(reinterpret_cast<Packet8s>(select)));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pblend(const Selector<8>& ifPacket, const Packet8bf& thenPacket,
+                                     const Packet8bf& elsePacket) {
+  return pblend<Packet8us>(ifPacket, thenPacket, elsePacket);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16c pblend(const Selector<16>& ifPacket, const Packet16c& thenPacket,
+                                     const Packet16c& elsePacket) {
+  Packet16uc select = {ifPacket.select[0],  ifPacket.select[1],  ifPacket.select[2],  ifPacket.select[3],
+                       ifPacket.select[4],  ifPacket.select[5],  ifPacket.select[6],  ifPacket.select[7],
+                       ifPacket.select[8],  ifPacket.select[9],  ifPacket.select[10], ifPacket.select[11],
+                       ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};
+
+  Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16uc pblend(const Selector<16>& ifPacket, const Packet16uc& thenPacket,
+                                      const Packet16uc& elsePacket) {
+  Packet16uc select = {ifPacket.select[0],  ifPacket.select[1],  ifPacket.select[2],  ifPacket.select[3],
+                       ifPacket.select[4],  ifPacket.select[5],  ifPacket.select[6],  ifPacket.select[7],
+                       ifPacket.select[8],  ifPacket.select[9],  ifPacket.select[10], ifPacket.select[11],
+                       ifPacket.select[12], ifPacket.select[13], ifPacket.select[14], ifPacket.select[15]};
+
+  Packet16uc mask = reinterpret_cast<Packet16uc>(pnegate(reinterpret_cast<Packet16c>(select)));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+//---------- double ----------
+#ifdef EIGEN_VECTORIZE_VSX
+typedef __vector double Packet2d;
+typedef __vector unsigned long long Packet2ul;
+typedef __vector long long Packet2l;
+#if EIGEN_COMP_CLANG
+typedef Packet2ul Packet2bl;
+#else
+typedef __vector __bool long Packet2bl;
+#endif
+
+static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO);
+static Packet2ul p2ul_SIGN = {0x8000000000000000ull, 0x8000000000000000ull};
+static Packet2ul p2ul_PREV0DOT5 = {0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull};
+static Packet2d p2d_ONE = {1.0, 1.0};
+static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO);
+static Packet2d p2d_MZERO = {numext::bit_cast<double>(0x8000000000000000ull),
+                             numext::bit_cast<double>(0x8000000000000000ull)};
+
+#ifdef _BIG_ENDIAN
+static Packet2d p2d_COUNTDOWN =
+    reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8));
+#else
+static Packet2d p2d_COUNTDOWN =
+    reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8));
+#endif
+
+template <int index>
+Packet2d vec_splat_dbl(Packet2d& a) {
+  return vec_splat(a, index);
+}
+
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasAbs = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
+    HasATanh = 1,
+    HasATan = 0,
+    HasLog = 0,
+    HasCmp = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasCbrt = 1,
+#if !EIGEN_COMP_CLANG
+    HasRsqrt = 1,
+#else
+    HasRsqrt = 0,
+#endif
+    HasNegate = 1,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct unpacket_traits<Packet2d> {
+  typedef double type;
+  typedef Packet2l integer_packet;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet2d half;
+};
+template <>
+struct unpacket_traits<Packet2l> {
+  typedef int64_t type;
+  typedef Packet2l half;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = false,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+inline std::ostream& operator<<(std::ostream& s, const Packet2l& v) {
+  union {
+    Packet2l v;
+    int64_t n[2];
+  } vt;
+  vt.v = v;
+  s << vt.n[0] << ", " << vt.n[1];
+  return s;
+}
+
+inline std::ostream& operator<<(std::ostream& s, const Packet2d& v) {
+  union {
+    Packet2d v;
+    double n[2];
+  } vt;
+  vt.v = v;
+  s << vt.n[0] << ", " << vt.n[1];
+  return s;
+}
+
+// Need to define them first or we get specialization after instantiation errors
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD
+  return vec_xl(0, const_cast<double*>(from));  // cast needed by Clang
+}
+
+template <>
+EIGEN_ALWAYS_INLINE Packet2d pload_partial<Packet2d>(const double* from, const Index n, const Index offset) {
+  return pload_partial_common<Packet2d>(from, n, offset);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_ALIGNED_STORE
+  vec_xst(from, 0, to);
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void pstore_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset) {
+  pstore_partial_common<Packet2d>(to, from, n, offset);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+  Packet2d v = {from, from};
+  return v;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
+  Packet2l v = {from, from};
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(unsigned long from) {
+  Packet2l v = {static_cast<long long>(from), static_cast<long long>(from)};
+  return reinterpret_cast<Packet2d>(v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
+                                               Packet2d& a3) {
+  // This way is faster than vec_splat (at least for doubles in Power 9)
+  a0 = pset1<Packet2d>(a[0]);
+  a1 = pset1<Packet2d>(a[1]);
+  a2 = pset1<Packet2d>(a[2]);
+  a3 = pset1<Packet2d>(a[3]);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
+  return pgather_common<Packet2d>(from, stride);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet2d pgather_partial<double, Packet2d>(const double* from, Index stride,
+                                                                                 const Index n) {
+  return pgather_common<Packet2d>(from, stride, n);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
+  pscatter_common<Packet2d>(to, from, stride);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pscatter_partial<double, Packet2d>(double* to, const Packet2d& from,
+                                                                              Index stride, const Index n) {
+  pscatter_common<Packet2d>(to, from, stride, n);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+  return pset1<Packet2d>(a) + p2d_COUNTDOWN;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return a + b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return a - b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
+#ifdef __POWER8_VECTOR__
+  return vec_neg(a);
+#else
+  return vec_xor(a, p2d_MZERO);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_madd(a, b, p2d_MZERO);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_div(a, b);
+}
+
+// for some weird raisons, it has to be overloaded for packet of integers
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vec_madd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vec_msub(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vec_nmsub(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vec_nmadd(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  // NOTE: about 10% slower than vec_min, but consistent with std::min and SSE regarding NaN
+  Packet2d ret;
+  __asm__("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
+  return ret;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  // NOTE: about 10% slower than vec_max, but consistent with std::max and SSE regarding NaN
+  Packet2d ret;
+  __asm__("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa"(ret) : "wa"(a), "wa"(b));
+  return ret;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
+  return reinterpret_cast<Packet2d>(vec_cmple(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
+  return reinterpret_cast<Packet2d>(vec_cmplt(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
+  return reinterpret_cast<Packet2d>(vec_cmpeq(a, b));
+}
+template <>
+#ifdef __POWER8_VECTOR__
+EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
+  return reinterpret_cast<Packet2l>(vec_cmpeq(a, b));
+}
+#else
+EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
+  Packet4i halves = reinterpret_cast<Packet4i>(vec_cmpeq(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(b)));
+  Packet4i flipped = vec_perm(halves, halves, p16uc_COMPLEX32_REV);
+  return reinterpret_cast<Packet2l>(pand(halves, flipped));
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
+  Packet2d c = reinterpret_cast<Packet2d>(vec_cmpge(a, b));
+  return vec_nor(c, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_and(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_or(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_xor(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_and(a, vec_nor(b, b));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
+  Packet2d t = vec_add(
+      reinterpret_cast<Packet2d>(vec_or(vec_and(reinterpret_cast<Packet2ul>(a), p2ul_SIGN), p2ul_PREV0DOT5)), a);
+  Packet2d res;
+
+  __asm__("xvrdpiz %x0, %x1\n\t" : "=&wa"(res) : "wa"(t));
+
+  return res;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
+  return vec_ceil(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
+  return vec_floor(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
+  return vec_trunc(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
+  Packet2d res;
+
+  __asm__("xvrdpic %x0, %x1\n\t" : "=&wa"(res) : "wa"(a));
+
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return vec_xl(0, const_cast<double*>(from));
+}
+
+template <>
+EIGEN_ALWAYS_INLINE Packet2d ploadu_partial<Packet2d>(const double* from, const Index n, const Index offset) {
+  return ploadu_partial_common<Packet2d>(from, n, offset);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
+  Packet2d p;
+  if ((std::ptrdiff_t(from) % 16) == 0)
+    p = pload<Packet2d>(from);
+  else
+    p = ploadu<Packet2d>(from);
+  return vec_splat_dbl<0>(p);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE
+  vec_xst(from, 0, to);
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void pstoreu_partial<double>(double* to, const Packet2d& from, const Index n, const Index offset) {
+  pstoreu_partial_common<Packet2d>(to, from, n, offset);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+  EIGEN_PPC_PREFETCH(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+  EIGEN_ALIGN16 double x[2];
+  pstore<double>(x, a);
+  return x[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
+  return vec_sld(a, a, 8);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
+  return vec_abs(a);
+}
+#ifdef __POWER8_VECTOR__
+template <>
+EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
+  return (Packet2d)vec_sra((Packet2l)a, vec_splats((unsigned long long)(63)));
+}
+#else
+#ifdef _BIG_ENDIAN
+static Packet16uc p16uc_DUPSIGN = {0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
+#else
+static Packet16uc p16uc_DUPSIGN = {7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15};
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
+  Packet16c tmp = vec_sra(reinterpret_cast<Packet16c>(a), vec_splats((unsigned char)(7)));
+  return reinterpret_cast<Packet2d>(vec_perm(tmp, tmp, p16uc_DUPSIGN));
+}
+#endif
+
+template <>
+inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x);
+
+template <>
+inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x);
+
+// Packet2l shifts.
+// For POWER8 we simply use vec_sr/l.
+//
+// Things are more complicated for POWER7. There is actually a
+// vec_xxsxdi intrinsic but it is not supported by some gcc versions.
+// So we need to shift by N % 32 and rearrage bytes.
+#ifdef __POWER8_VECTOR__
+
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
+  const Packet2ul shift = {N, N};
+  return vec_sl(a, shift);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
+  const Packet2ul shift = {N, N};
+  return vec_sr(a, shift);
+}
+
+#else
+
+// Shifts [A, B, C, D] to [B, 0, D, 0].
+// Used to implement left shifts for Packet2l.
+EIGEN_ALWAYS_INLINE Packet4i shift_even_left(const Packet4i& a) {
+  static const Packet16uc perm = {0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
+                                  0x1c, 0x1d, 0x1e, 0x1f, 0x08, 0x09, 0x0a, 0x0b};
+#ifdef _BIG_ENDIAN
+  return vec_perm(p4i_ZERO, a, perm);
+#else
+  return vec_perm(a, p4i_ZERO, perm);
+#endif
+}
+
+// Shifts [A, B, C, D] to [0, A, 0, C].
+// Used to implement right shifts for Packet2l.
+EIGEN_ALWAYS_INLINE Packet4i shift_odd_right(const Packet4i& a) {
+  static const Packet16uc perm = {0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13,
+                                  0x0c, 0x0d, 0x0e, 0x0f, 0x18, 0x19, 0x1a, 0x1b};
+#ifdef _BIG_ENDIAN
+  return vec_perm(p4i_ZERO, a, perm);
+#else
+  return vec_perm(a, p4i_ZERO, perm);
+#endif
+}
+
+template <int N, typename EnableIf = void>
+struct plogical_shift_left_impl;
+
+template <int N>
+struct plogical_shift_left_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned n = static_cast<unsigned>(N);
+    const Packet4ui shift = {n, n, n, n};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    static const unsigned m = static_cast<unsigned>(32 - N);
+    const Packet4ui shift_right = {m, m, m, m};
+    const Packet4i out_hi = vec_sl(ai, shift);
+    const Packet4i out_lo = shift_even_left(vec_sr(ai, shift_right));
+    return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
+  }
+};
+
+template <int N>
+struct plogical_shift_left_impl<N, std::enable_if_t<(N >= 32)> > {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned m = static_cast<unsigned>(N - 32);
+    const Packet4ui shift = {m, m, m, m};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    return reinterpret_cast<Packet2l>(shift_even_left(vec_sl(ai, shift)));
+  }
+};
+
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
+  return plogical_shift_left_impl<N>::run(a);
+}
+
+template <int N, typename EnableIf = void>
+struct plogical_shift_right_impl;
+
+template <int N>
+struct plogical_shift_right_impl<N, std::enable_if_t<(N < 32) && (N >= 0)> > {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned n = static_cast<unsigned>(N);
+    const Packet4ui shift = {n, n, n, n};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    static const unsigned m = static_cast<unsigned>(32 - N);
+    const Packet4ui shift_left = {m, m, m, m};
+    const Packet4i out_lo = vec_sr(ai, shift);
+    const Packet4i out_hi = shift_odd_right(vec_sl(ai, shift_left));
+    return reinterpret_cast<Packet2l>(por<Packet4i>(out_hi, out_lo));
+  }
+};
+
+template <int N>
+struct plogical_shift_right_impl<N, std::enable_if_t<(N >= 32)> > {
+  static EIGEN_STRONG_INLINE Packet2l run(const Packet2l& a) {
+    static const unsigned m = static_cast<unsigned>(N - 32);
+    const Packet4ui shift = {m, m, m, m};
+    const Packet4i ai = reinterpret_cast<Packet4i>(a);
+    return reinterpret_cast<Packet2l>(shift_odd_right(vec_sr(ai, shift)));
+  }
+};
+
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
+  return plogical_shift_right_impl<N>::run(a);
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+  // Clamp exponent to [-2099, 2099]
+  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
+  const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+
+  // Split 2^e into four factors and multiply:
+  const Packet2l bias = {1023, 1023};
+  Packet2l b = plogical_shift_right<2>(e);  // floor(e/4)
+  Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
+  Packet2d out = pmul(pmul(pmul(a, c), c), c);                        // a * 2^(3b)
+  b = psub(psub(psub(e, b), b), b);                                   // e - 3b
+  c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));  // 2^(e - 3b)
+  out = pmul(out, c);                                                 // a * 2^e
+  return out;
+}
+
+// Extract exponent without existence of Packet2l.
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
+  return pcast<Packet2l, Packet2d>(plogical_shift_right<52>(reinterpret_cast<Packet2l>(pabs(a))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
+  Packet2d b, sum;
+  b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8));
+  sum = a + b;
+  return pfirst<Packet2d>(sum);
+}
+
+// Other reduction functions:
+// mul
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+  return pfirst(
+      pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
+}
+
+// min
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
+  return pfirst(
+      pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
+}
+
+// max
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
+  return pfirst(
+      pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8))));
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
+  Packet2d t0, t1;
+  t0 = vec_mergeh(kernel.packet[0], kernel.packet[1]);
+  t1 = vec_mergel(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[0] = t0;
+  kernel.packet[1] = t1;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
+                                    const Packet2d& elsePacket) {
+  Packet2l select = {ifPacket.select[0], ifPacket.select[1]};
+  Packet2ul mask = reinterpret_cast<Packet2ul>(pnegate(reinterpret_cast<Packet2l>(select)));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
 
-} // end namespace internal
+#endif  // __VSX__
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_PACKET_MATH_ALTIVEC_H
+#endif  // EIGEN_PACKET_MATH_ALTIVEC_H
diff --git a/inst/include/Eigen/src/Core/arch/AltiVec/TypeCasting.h b/inst/include/Eigen/src/Core/arch/AltiVec/TypeCasting.h
new file mode 100644
index 00000000..439339ee
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/AltiVec/TypeCasting.h
@@ -0,0 +1,153 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2019 Rasmus Munk Larsen <rmlarsen@google.com>
+// Copyright (C) 2023 Chip Kerchner (chip.kerchner@ibm.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_ALTIVEC_H
+#define EIGEN_TYPE_CASTING_ALTIVEC_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <>
+struct type_casting_traits<float, int> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+struct type_casting_traits<int, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+struct type_casting_traits<bfloat16, unsigned short int> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+struct type_casting_traits<unsigned short int, bfloat16> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+  return vec_cts(a, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {
+  return vec_ctu(a, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+  return vec_ctf(a, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) {
+  return vec_ctf(a, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet8bf, Packet8us>(const Packet8bf& a) {
+  Packet4f float_even = Bf16ToF32Even(a);
+  Packet4f float_odd = Bf16ToF32Odd(a);
+  Packet4ui int_even = pcast<Packet4f, Packet4ui>(float_even);
+  Packet4ui int_odd = pcast<Packet4f, Packet4ui>(float_odd);
+  const EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
+  Packet4ui low_even = pand<Packet4ui>(int_even, p4ui_low_mask);
+  Packet4ui low_odd = pand<Packet4ui>(int_odd, p4ui_low_mask);
+
+  // Check values that are bigger than USHRT_MAX (0xFFFF)
+  Packet4bi overflow_selector;
+  if (vec_any_gt(int_even, p4ui_low_mask)) {
+    overflow_selector = vec_cmpgt(int_even, p4ui_low_mask);
+    low_even = vec_sel(low_even, p4ui_low_mask, overflow_selector);
+  }
+  if (vec_any_gt(int_odd, p4ui_low_mask)) {
+    overflow_selector = vec_cmpgt(int_odd, p4ui_low_mask);
+    low_odd = vec_sel(low_even, p4ui_low_mask, overflow_selector);
+  }
+
+  return pmerge(low_even, low_odd);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcast<Packet8us, Packet8bf>(const Packet8us& a) {
+  // short -> int -> float -> bfloat16
+  const EIGEN_DECLARE_CONST_FAST_Packet4ui(low_mask, 0x0000FFFF);
+  Packet4ui int_cast = reinterpret_cast<Packet4ui>(a);
+  Packet4ui int_even = pand<Packet4ui>(int_cast, p4ui_low_mask);
+  Packet4ui int_odd = plogical_shift_right<16>(int_cast);
+  Packet4f float_even = pcast<Packet4ui, Packet4f>(int_even);
+  Packet4f float_odd = pcast<Packet4ui, Packet4f>(int_odd);
+  return F32ToBf16(float_even, float_odd);
+}
+
+template <>
+struct type_casting_traits<bfloat16, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet8bf, Packet4f>(const Packet8bf& a) {
+  Packet8us z = pset1<Packet8us>(0);
+#ifdef _BIG_ENDIAN
+  return reinterpret_cast<Packet4f>(vec_mergeh(a.m_val, z));
+#else
+  return reinterpret_cast<Packet4f>(vec_mergeh(z, a.m_val));
+#endif
+}
+
+template <>
+struct type_casting_traits<float, bfloat16> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet8bf pcast<Packet4f, Packet8bf>(const Packet4f& a, const Packet4f& b) {
+  return F32ToBf16Both(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4f>(const Packet4f& a) {
+  return reinterpret_cast<Packet4i>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a) {
+  return reinterpret_cast<Packet4f>(a);
+}
+
+#ifdef EIGEN_VECTORIZE_VSX
+template <>
+inline Packet2l pcast<Packet2d, Packet2l>(const Packet2d& x) {
+  EIGEN_ALIGN_MAX double dtmp[2];
+  pstore(dtmp, x);
+  EIGEN_ALIGN_MAX long long itmp[2] = {static_cast<long long>(dtmp[0]), static_cast<long long>(dtmp[1])};
+  return vec_xl(0, itmp);
+}
+
+template <>
+inline Packet2d pcast<Packet2l, Packet2d>(const Packet2l& x) {
+  EIGEN_ALIGN_MAX long long itmp[2];
+  vec_xst(x, 0, itmp);
+  EIGEN_ALIGN_MAX double dtmp[2] = {static_cast<double>(itmp[0]), static_cast<double>(itmp[1])};
+  return pload<Packet2d>(dtmp);
+}
+#endif
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_TYPE_CASTING_ALTIVEC_H
diff --git a/inst/include/Eigen/src/Core/arch/Default/BFloat16.h b/inst/include/Eigen/src/Core/arch/Default/BFloat16.h
new file mode 100644
index 00000000..b93c4bc2
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/Default/BFloat16.h
@@ -0,0 +1,866 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef EIGEN_BFLOAT16_H
+#define EIGEN_BFLOAT16_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+#if defined(EIGEN_HAS_HIP_BF16)
+// When compiling with GPU support, the "hip_bfloat16" base class as well as
+// some other routines are defined in the GPU compiler header files
+// (hip_bfloat16.h), and they are not tagged constexpr
+// As a consequence, we get compile failures when compiling Eigen with
+// GPU support. Hence the need to disable EIGEN_CONSTEXPR when building
+// Eigen with GPU support
+#pragma push_macro("EIGEN_CONSTEXPR")
+#undef EIGEN_CONSTEXPR
+#define EIGEN_CONSTEXPR
+#endif
+
+#define BF16_PACKET_FUNCTION(PACKET_F, PACKET_BF16, METHOD)                                         \
+  template <>                                                                                       \
+  EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED PACKET_BF16 METHOD<PACKET_BF16>( \
+      const PACKET_BF16& _x) {                                                                      \
+    return F32ToBf16(METHOD<PACKET_F>(Bf16ToF32(_x)));                                              \
+  }
+
+// Only use HIP GPU bf16 in kernels
+#if defined(EIGEN_HAS_HIP_BF16) && defined(EIGEN_GPU_COMPILE_PHASE)
+#define EIGEN_USE_HIP_BF16
+#endif
+
+namespace Eigen {
+
+struct bfloat16;
+
+namespace numext {
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bit_cast<Eigen::bfloat16, uint16_t>(const uint16_t& src);
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast<uint16_t, Eigen::bfloat16>(const Eigen::bfloat16& src);
+}  // namespace numext
+namespace bfloat16_impl {
+
+#if defined(EIGEN_USE_HIP_BF16)
+
+struct __bfloat16_raw : public hip_bfloat16 {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw() {}
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw(hip_bfloat16 hb) : hip_bfloat16(hb) {}
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw(unsigned short raw) : hip_bfloat16(raw) {}
+};
+
+#else
+
+// Make our own __bfloat16_raw definition.
+struct __bfloat16_raw {
+#if defined(EIGEN_HAS_HIP_BF16) && !defined(EIGEN_GPU_COMPILE_PHASE)
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw() {}
+#else
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw() : value(0) {}
+#endif
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw(unsigned short raw) : value(raw) {}
+  unsigned short value;
+};
+
+#endif  // defined(EIGEN_USE_HIP_BF16)
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(unsigned short value);
+template <bool AssumeArgumentIsNormalOrInfinityOrZero>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne(float ff);
+// Forward declarations of template specializations, to avoid Visual C++ 2019 errors, saying:
+// > error C2908: explicit specialization; 'float_to_bfloat16_rtne' has already been instantiated
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<false>(float ff);
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<true>(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h);
+
+struct bfloat16_base : public __bfloat16_raw {
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16_base() {}
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16_base(const __bfloat16_raw& h) : __bfloat16_raw(h) {}
+};
+
+}  // namespace bfloat16_impl
+
+// Class definition.
+struct bfloat16 : public bfloat16_impl::bfloat16_base {
+  typedef bfloat16_impl::__bfloat16_raw __bfloat16_raw;
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const __bfloat16_raw& h) : bfloat16_impl::bfloat16_base(h) {}
+
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(bool b)
+      : bfloat16_impl::bfloat16_base(bfloat16_impl::raw_uint16_to_bfloat16(b ? 0x3f80 : 0)) {}
+
+  template <class T>
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(T val)
+      : bfloat16_impl::bfloat16_base(
+            bfloat16_impl::float_to_bfloat16_rtne<internal::is_integral<T>::value>(static_cast<float>(val))) {}
+
+  explicit EIGEN_DEVICE_FUNC bfloat16(float f)
+      : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<false>(f)) {}
+
+  // Following the convention of numpy, converting between complex and
+  // float will lead to loss of imag value.
+  template <typename RealScalar>
+  explicit EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bfloat16(const std::complex<RealScalar>& val)
+      : bfloat16_impl::bfloat16_base(bfloat16_impl::float_to_bfloat16_rtne<false>(static_cast<float>(val.real()))) {}
+
+  EIGEN_DEVICE_FUNC operator float() const {  // NOLINT: Allow implicit conversion to float, because it is lossless.
+    return bfloat16_impl::bfloat16_to_float(*this);
+  }
+};
+
+// TODO(majnemer): Get rid of this once we can rely on C++17 inline variables do
+// solve the ODR issue.
+namespace bfloat16_impl {
+template <typename = void>
+struct numeric_limits_bfloat16_impl {
+  static EIGEN_CONSTEXPR const bool is_specialized = true;
+  static EIGEN_CONSTEXPR const bool is_signed = true;
+  static EIGEN_CONSTEXPR const bool is_integer = false;
+  static EIGEN_CONSTEXPR const bool is_exact = false;
+  static EIGEN_CONSTEXPR const bool has_infinity = true;
+  static EIGEN_CONSTEXPR const bool has_quiet_NaN = true;
+  static EIGEN_CONSTEXPR const bool has_signaling_NaN = true;
+  EIGEN_DIAGNOSTICS(push)
+  EIGEN_DISABLE_DEPRECATED_WARNING
+  static EIGEN_CONSTEXPR const std::float_denorm_style has_denorm = std::denorm_present;
+  static EIGEN_CONSTEXPR const bool has_denorm_loss = false;
+  EIGEN_DIAGNOSTICS(pop)
+  static EIGEN_CONSTEXPR const std::float_round_style round_style = std::numeric_limits<float>::round_style;
+  static EIGEN_CONSTEXPR const bool is_iec559 = true;
+  // The C++ standard defines this as "true if the set of values representable
+  // by the type is finite." BFloat16 has finite precision.
+  static EIGEN_CONSTEXPR const bool is_bounded = true;
+  static EIGEN_CONSTEXPR const bool is_modulo = false;
+  static EIGEN_CONSTEXPR const int digits = 8;
+  static EIGEN_CONSTEXPR const int digits10 = 2;
+  static EIGEN_CONSTEXPR const int max_digits10 = 4;
+  static EIGEN_CONSTEXPR const int radix = std::numeric_limits<float>::radix;
+  static EIGEN_CONSTEXPR const int min_exponent = std::numeric_limits<float>::min_exponent;
+  static EIGEN_CONSTEXPR const int min_exponent10 = std::numeric_limits<float>::min_exponent10;
+  static EIGEN_CONSTEXPR const int max_exponent = std::numeric_limits<float>::max_exponent;
+  static EIGEN_CONSTEXPR const int max_exponent10 = std::numeric_limits<float>::max_exponent10;
+  static EIGEN_CONSTEXPR const bool traps = std::numeric_limits<float>::traps;
+  // IEEE754: "The implementer shall choose how tininess is detected, but shall
+  // detect tininess in the same way for all operations in radix two"
+  static EIGEN_CONSTEXPR const bool tinyness_before = std::numeric_limits<float>::tinyness_before;
+
+  static EIGEN_CONSTEXPR Eigen::bfloat16(min)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0080); }
+  static EIGEN_CONSTEXPR Eigen::bfloat16 lowest() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0xff7f); }
+  static EIGEN_CONSTEXPR Eigen::bfloat16(max)() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f7f); }
+  static EIGEN_CONSTEXPR Eigen::bfloat16 epsilon() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3c00); }
+  static EIGEN_CONSTEXPR Eigen::bfloat16 round_error() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x3f00); }
+  static EIGEN_CONSTEXPR Eigen::bfloat16 infinity() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7f80); }
+  static EIGEN_CONSTEXPR Eigen::bfloat16 quiet_NaN() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0); }
+  static EIGEN_CONSTEXPR Eigen::bfloat16 signaling_NaN() {
+    return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x7fa0);
+  }
+  static EIGEN_CONSTEXPR Eigen::bfloat16 denorm_min() { return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(0x0001); }
+};
+
+template <typename T>
+EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_specialized;
+template <typename T>
+EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_signed;
+template <typename T>
+EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_integer;
+template <typename T>
+EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_exact;
+template <typename T>
+EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::has_infinity;
+template <typename T>
+EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::has_quiet_NaN;
+template <typename T>
+EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::has_signaling_NaN;
+EIGEN_DIAGNOSTICS(push)
+EIGEN_DISABLE_DEPRECATED_WARNING
+template <typename T>
+EIGEN_CONSTEXPR const std::float_denorm_style numeric_limits_bfloat16_impl<T>::has_denorm;
+template <typename T>
+EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::has_denorm_loss;
+EIGEN_DIAGNOSTICS(pop)
+template <typename T>
+EIGEN_CONSTEXPR const std::float_round_style numeric_limits_bfloat16_impl<T>::round_style;
+template <typename T>
+EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_iec559;
+template <typename T>
+EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_bounded;
+template <typename T>
+EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::is_modulo;
+template <typename T>
+EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::digits;
+template <typename T>
+EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::digits10;
+template <typename T>
+EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::max_digits10;
+template <typename T>
+EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::radix;
+template <typename T>
+EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::min_exponent;
+template <typename T>
+EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::min_exponent10;
+template <typename T>
+EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::max_exponent;
+template <typename T>
+EIGEN_CONSTEXPR const int numeric_limits_bfloat16_impl<T>::max_exponent10;
+template <typename T>
+EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::traps;
+template <typename T>
+EIGEN_CONSTEXPR const bool numeric_limits_bfloat16_impl<T>::tinyness_before;
+}  // end namespace bfloat16_impl
+}  // end namespace Eigen
+
+namespace std {
+// If std::numeric_limits<T> is specialized, should also specialize
+// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
+// std::numeric_limits<const volatile T>
+// https://stackoverflow.com/a/16519653/
+template <>
+class numeric_limits<Eigen::bfloat16> : public Eigen::bfloat16_impl::numeric_limits_bfloat16_impl<> {};
+template <>
+class numeric_limits<const Eigen::bfloat16> : public numeric_limits<Eigen::bfloat16> {};
+template <>
+class numeric_limits<volatile Eigen::bfloat16> : public numeric_limits<Eigen::bfloat16> {};
+template <>
+class numeric_limits<const volatile Eigen::bfloat16> : public numeric_limits<Eigen::bfloat16> {};
+}  // end namespace std
+
+namespace Eigen {
+
+namespace bfloat16_impl {
+
+// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
+// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
+// of the functions, while the latter can only deal with one of them.
+#if !defined(EIGEN_HAS_NATIVE_BF16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)  // Emulate support for bfloat16 floats
+
+#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
+// We need to provide emulated *host-side* BF16 operators for clang.
+#pragma push_macro("EIGEN_DEVICE_FUNC")
+#undef EIGEN_DEVICE_FUNC
+#if (defined(EIGEN_HAS_GPU_BF16) && defined(EIGEN_HAS_NATIVE_BF16))
+#define EIGEN_DEVICE_FUNC __host__
+#else  // both host and device need emulated ops.
+#define EIGEN_DEVICE_FUNC __host__ __device__
+#endif
+#endif
+
+// Definitions for CPUs, mostly working through conversion
+// to/from fp32.
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator+(const bfloat16& a, const bfloat16& b) {
+  return bfloat16(float(a) + float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator+(const bfloat16& a, const int& b) {
+  return bfloat16(float(a) + static_cast<float>(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator+(const int& a, const bfloat16& b) {
+  return bfloat16(static_cast<float>(a) + float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator*(const bfloat16& a, const bfloat16& b) {
+  return bfloat16(float(a) * float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator-(const bfloat16& a, const bfloat16& b) {
+  return bfloat16(float(a) - float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator/(const bfloat16& a, const bfloat16& b) {
+  return bfloat16(float(a) / float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator-(const bfloat16& a) {
+  numext::uint16_t x = numext::bit_cast<uint16_t>(a) ^ 0x8000;
+  return numext::bit_cast<bfloat16>(x);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator+=(bfloat16& a, const bfloat16& b) {
+  a = bfloat16(float(a) + float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator*=(bfloat16& a, const bfloat16& b) {
+  a = bfloat16(float(a) * float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator-=(bfloat16& a, const bfloat16& b) {
+  a = bfloat16(float(a) - float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16& operator/=(bfloat16& a, const bfloat16& b) {
+  a = bfloat16(float(a) / float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator++(bfloat16& a) {
+  a += bfloat16(1);
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator--(bfloat16& a) {
+  a -= bfloat16(1);
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator++(bfloat16& a, int) {
+  bfloat16 original_value = a;
+  ++a;
+  return original_value;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator--(bfloat16& a, int) {
+  bfloat16 original_value = a;
+  --a;
+  return original_value;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const bfloat16& a, const bfloat16& b) {
+  return numext::equal_strict(float(a), float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const bfloat16& a, const bfloat16& b) {
+  return numext::not_equal_strict(float(a), float(b));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const bfloat16& a, const bfloat16& b) {
+  return float(a) < float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const bfloat16& a, const bfloat16& b) {
+  return float(a) <= float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const bfloat16& a, const bfloat16& b) {
+  return float(a) > float(b);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const bfloat16& a, const bfloat16& b) {
+  return float(a) >= float(b);
+}
+
+#if EIGEN_COMP_CLANG && defined(EIGEN_CUDACC)
+#pragma pop_macro("EIGEN_DEVICE_FUNC")
+#endif
+#endif  // Emulate support for bfloat16 floats
+
+// Division by an index. Do it in full float precision to avoid accuracy
+// issues in converting the denominator to bfloat16.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 operator/(const bfloat16& a, Index b) {
+  return bfloat16(static_cast<float>(a) / static_cast<float>(b));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw truncate_to_bfloat16(const float v) {
+#if defined(EIGEN_USE_HIP_BF16)
+  return __bfloat16_raw(__bfloat16_raw::round_to_bfloat16(v, __bfloat16_raw::truncate));
+#else
+  __bfloat16_raw output;
+  if (numext::isnan EIGEN_NOT_A_MACRO(v)) {
+    output.value = std::signbit(v) ? 0xFFC0 : 0x7FC0;
+    return output;
+  }
+  output.value = static_cast<numext::uint16_t>(numext::bit_cast<numext::uint32_t>(v) >> 16);
+  return output;
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR __bfloat16_raw raw_uint16_to_bfloat16(numext::uint16_t value) {
+#if defined(EIGEN_USE_HIP_BF16)
+  __bfloat16_raw bf;
+  bf.data = value;
+  return bf;
+#else
+  return __bfloat16_raw(value);
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR numext::uint16_t raw_bfloat16_as_uint16(
+    const __bfloat16_raw& bf) {
+#if defined(EIGEN_USE_HIP_BF16)
+  return bf.data;
+#else
+  return bf.value;
+#endif
+}
+
+// float_to_bfloat16_rtne template specialization that does not make any
+// assumption about the value of its function argument (ff).
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<false>(float ff) {
+#if defined(EIGEN_USE_HIP_BF16)
+  return __bfloat16_raw(__bfloat16_raw::round_to_bfloat16(ff));
+#else
+  __bfloat16_raw output;
+
+  if (numext::isnan EIGEN_NOT_A_MACRO(ff)) {
+    // If the value is a NaN, squash it to a qNaN with msb of fraction set,
+    // this makes sure after truncation we don't end up with an inf.
+    //
+    // qNaN magic: All exponent bits set + most significant bit of fraction
+    // set.
+    output.value = std::signbit(ff) ? 0xFFC0 : 0x7FC0;
+  } else {
+    // Fast rounding algorithm that rounds a half value to nearest even. This
+    // reduces expected error when we convert a large number of floats. Here
+    // is how it works:
+    //
+    // Definitions:
+    // To convert a float 32 to bfloat16, a float 32 can be viewed as 32 bits
+    // with the following tags:
+    //
+    // Sign |  Exp (8 bits) | Frac (23 bits)
+    //  S     EEEEEEEE         FFFFFFLRTTTTTTTTTTTTTTT
+    //
+    //  S: Sign bit.
+    //  E: Exponent bits.
+    //  F: First 6 bits of fraction.
+    //  L: Least significant bit of resulting bfloat16 if we truncate away the
+    //  rest of the float32. This is also the 7th bit of fraction
+    //  R: Rounding bit, 8th bit of fraction.
+    //  T: Sticky bits, rest of fraction, 15 bits.
+    //
+    // To round half to nearest even, there are 3 cases where we want to round
+    // down (simply truncate the result of the bits away, which consists of
+    // rounding bit and sticky bits) and two cases where we want to round up
+    // (truncate then add one to the result).
+    //
+    // The fast converting algorithm simply adds lsb (L) to 0x7fff (15 bits of
+    // 1s) as the rounding bias, adds the rounding bias to the input, then
+    // truncates the last 16 bits away.
+    //
+    // To understand how it works, we can analyze this algorithm case by case:
+    //
+    // 1. L = 0, R = 0:
+    //   Expect: round down, this is less than half value.
+    //
+    //   Algorithm:
+    //   - Rounding bias: 0x7fff + 0 = 0x7fff
+    //   - Adding rounding bias to input may create any carry, depending on
+    //   whether there is any value set to 1 in T bits.
+    //   - R may be set to 1 if there is a carry.
+    //   - L remains 0.
+    //   - Note that this case also handles Inf and -Inf, where all fraction
+    //   bits, including L, R and Ts are all 0. The output remains Inf after
+    //   this algorithm.
+    //
+    // 2. L = 1, R = 0:
+    //   Expect: round down, this is less than half value.
+    //
+    //   Algorithm:
+    //   - Rounding bias: 0x7fff + 1 = 0x8000
+    //   - Adding rounding bias to input doesn't change sticky bits but
+    //   adds 1 to rounding bit.
+    //   - L remains 1.
+    //
+    // 3. L = 0, R = 1, all of T are 0:
+    //   Expect: round down, this is exactly at half, the result is already
+    //   even (L=0).
+    //
+    //   Algorithm:
+    //   - Rounding bias: 0x7fff + 0 = 0x7fff
+    //   - Adding rounding bias to input sets all sticky bits to 1, but
+    //   doesn't create a carry.
+    //   - R remains 1.
+    //   - L remains 0.
+    //
+    // 4. L = 1, R = 1:
+    //   Expect: round up, this is exactly at half, the result needs to be
+    //   round to the next even number.
+    //
+    //   Algorithm:
+    //   - Rounding bias: 0x7fff + 1 = 0x8000
+    //   - Adding rounding bias to input doesn't change sticky bits, but
+    //   creates a carry from rounding bit.
+    //   - The carry sets L to 0, creates another carry bit and propagate
+    //   forward to F bits.
+    //   - If all the F bits are 1, a carry then propagates to the exponent
+    //   bits, which then creates the minimum value with the next exponent
+    //   value. Note that we won't have the case where exponents are all 1,
+    //   since that's either a NaN (handled in the other if condition) or inf
+    //   (handled in case 1).
+    //
+    // 5. L = 0, R = 1, any of T is 1:
+    //   Expect: round up, this is greater than half.
+    //
+    //   Algorithm:
+    //   - Rounding bias: 0x7fff + 0 = 0x7fff
+    //   - Adding rounding bias to input creates a carry from sticky bits,
+    //   sets rounding bit to 0, then create another carry.
+    //   - The second carry sets L to 1.
+    //
+    // Examples:
+    //
+    //  Exact half value that is already even:
+    //    Input:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+    //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+    //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 1 0     1000000000000000
+    //
+    //     This falls into case 3. We truncate the rest of 16 bits and no
+    //     carry is created into F and L:
+    //
+    //    Output:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+    //     S     E E E E E E E E      F F F F F F L
+    //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 1 0
+    //
+    //  Exact half value, round to next even number:
+    //    Input:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+    //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+    //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 0 1     1000000000000000
+    //
+    //     This falls into case 4. We create a carry from R and T,
+    //     which then propagates into L and F:
+    //
+    //    Output:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+    //     S     E E E E E E E E      F F F F F F L
+    //     0     0 0 0 0 0 0 0 0      0 0 0 0 0 1 0
+    //
+    //
+    //  Max denormal value round to min normal value:
+    //    Input:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+    //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+    //     0     0 0 0 0 0 0 0 0      1 1 1 1 1 1 1     1111111111111111
+    //
+    //     This falls into case 4. We create a carry from R and T,
+    //     propagate into L and F, which then propagates into exponent
+    //     bits:
+    //
+    //    Output:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+    //     S     E E E E E E E E      F F F F F F L
+    //     0     0 0 0 0 0 0 0 1      0 0 0 0 0 0 0
+    //
+    //  Max normal value round to Inf:
+    //    Input:
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit) | Frac (last 16 bit)
+    //     S     E E E E E E E E      F F F F F F L     RTTTTTTTTTTTTTTT
+    //     0     1 1 1 1 1 1 1 0      1 1 1 1 1 1 1     1111111111111111
+    //
+    //     This falls into case 4. We create a carry from R and T,
+    //     propagate into L and F, which then propagates into exponent
+    //     bits:
+    //
+    //    Sign |  Exp (8 bit)     | Frac (first 7 bit)
+    //     S     E E E E E E E E      F F F F F F L
+    //     0     1 1 1 1 1 1 1 1      0 0 0 0 0 0 0
+
+    // At this point, ff must be either a normal float, or +/-infinity.
+    output = float_to_bfloat16_rtne<true>(ff);
+  }
+  return output;
+#endif
+}
+
+// float_to_bfloat16_rtne template specialization that assumes that its function
+// argument (ff) is either a normal floating point number, or +/-infinity, or
+// zero. Used to improve the runtime performance of conversion from an integer
+// type to bfloat16.
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __bfloat16_raw float_to_bfloat16_rtne<true>(float ff) {
+#if defined(EIGEN_USE_HIP_BF16)
+  return __bfloat16_raw(__bfloat16_raw::round_to_bfloat16(ff));
+#else
+  numext::uint32_t input = numext::bit_cast<numext::uint32_t>(ff);
+  __bfloat16_raw output;
+
+  // Least significant bit of resulting bfloat.
+  numext::uint32_t lsb = (input >> 16) & 1;
+  numext::uint32_t rounding_bias = 0x7fff + lsb;
+  input += rounding_bias;
+  output.value = static_cast<numext::uint16_t>(input >> 16);
+  return output;
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float bfloat16_to_float(__bfloat16_raw h) {
+#if defined(EIGEN_USE_HIP_BF16)
+  return static_cast<float>(h);
+#else
+  return numext::bit_cast<float>(static_cast<numext::uint32_t>(h.value) << 16);
+#endif
+}
+
+// --- standard functions ---
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const bfloat16& a) {
+  EIGEN_USING_STD(isinf);
+#if defined(EIGEN_USE_HIP_BF16)
+  return (isinf)(a);  // Uses HIP hip_bfloat16 isinf operator
+#else
+  return (isinf)(float(a));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const bfloat16& a) {
+  EIGEN_USING_STD(isnan);
+#if defined(EIGEN_USE_HIP_BF16)
+  return (isnan)(a);  // Uses HIP hip_bfloat16 isnan operator
+#else
+  return (isnan)(float(a));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isfinite)(const bfloat16& a) {
+  return !(isinf EIGEN_NOT_A_MACRO(a)) && !(isnan EIGEN_NOT_A_MACRO(a));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 abs(const bfloat16& a) {
+  numext::uint16_t x = numext::bit_cast<numext::uint16_t>(a) & 0x7FFF;
+  return numext::bit_cast<bfloat16>(x);
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 exp(const bfloat16& a) { return bfloat16(::expf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 exp2(const bfloat16& a) { return bfloat16(::exp2f(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 expm1(const bfloat16& a) { return bfloat16(numext::expm1(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log(const bfloat16& a) { return bfloat16(::logf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log1p(const bfloat16& a) { return bfloat16(numext::log1p(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log10(const bfloat16& a) { return bfloat16(::log10f(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 log2(const bfloat16& a) {
+  return bfloat16(static_cast<float>(EIGEN_LOG2E) * ::logf(float(a)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sqrt(const bfloat16& a) { return bfloat16(::sqrtf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 pow(const bfloat16& a, const bfloat16& b) {
+  return bfloat16(::powf(float(a), float(b)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atan2(const bfloat16& a, const bfloat16& b) {
+  return bfloat16(::atan2f(float(a), float(b)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sin(const bfloat16& a) { return bfloat16(::sinf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cos(const bfloat16& a) { return bfloat16(::cosf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tan(const bfloat16& a) { return bfloat16(::tanf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asin(const bfloat16& a) { return bfloat16(::asinf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acos(const bfloat16& a) { return bfloat16(::acosf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atan(const bfloat16& a) { return bfloat16(::atanf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 sinh(const bfloat16& a) { return bfloat16(::sinhf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 cosh(const bfloat16& a) { return bfloat16(::coshf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 tanh(const bfloat16& a) { return bfloat16(::tanhf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 asinh(const bfloat16& a) { return bfloat16(::asinhf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 acosh(const bfloat16& a) { return bfloat16(::acoshf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 atanh(const bfloat16& a) { return bfloat16(::atanhf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 floor(const bfloat16& a) { return bfloat16(::floorf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 ceil(const bfloat16& a) { return bfloat16(::ceilf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 rint(const bfloat16& a) { return bfloat16(::rintf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 round(const bfloat16& a) { return bfloat16(::roundf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 trunc(const bfloat16& a) { return bfloat16(::truncf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmod(const bfloat16& a, const bfloat16& b) {
+  return bfloat16(::fmodf(float(a), float(b)));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16(min)(const bfloat16& a, const bfloat16& b) {
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return f2 < f1 ? b : a;
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16(max)(const bfloat16& a, const bfloat16& b) {
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return f1 < f2 ? b : a;
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmin(const bfloat16& a, const bfloat16& b) {
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return bfloat16(::fminf(f1, f2));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 fmax(const bfloat16& a, const bfloat16& b) {
+  const float f1 = static_cast<float>(a);
+  const float f2 = static_cast<float>(b);
+  return bfloat16(::fmaxf(f1, f2));
+}
+
+EIGEN_DEVICE_FUNC inline bfloat16 fma(const bfloat16& a, const bfloat16& b, const bfloat16& c) {
+  // Emulate FMA via float.
+  return bfloat16(numext::fma(static_cast<float>(a), static_cast<float>(b), static_cast<float>(c)));
+}
+
+#ifndef EIGEN_NO_IO
+EIGEN_ALWAYS_INLINE std::ostream& operator<<(std::ostream& os, const bfloat16& v) {
+  os << static_cast<float>(v);
+  return os;
+}
+#endif
+
+}  // namespace bfloat16_impl
+
+namespace internal {
+
+template <>
+struct is_arithmetic<bfloat16> {
+  enum { value = true };
+};
+
+template <>
+struct random_impl<bfloat16> {
+  enum : int { MantissaBits = 7 };
+  using Impl = random_impl<float>;
+  static EIGEN_DEVICE_FUNC inline bfloat16 run(const bfloat16& x, const bfloat16& y) {
+    float result = Impl::run(x, y, MantissaBits);
+    return bfloat16(result);
+  }
+  static EIGEN_DEVICE_FUNC inline bfloat16 run() {
+    float result = Impl::run(MantissaBits);
+    return bfloat16(result);
+  }
+};
+
+}  // namespace internal
+
+template <>
+struct NumTraits<Eigen::bfloat16> : GenericNumTraits<Eigen::bfloat16> {
+  enum { IsSigned = true, IsInteger = false, IsComplex = false, RequireInitialization = false };
+
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 epsilon() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x3c00);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 dummy_precision() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x3D4D);  // bfloat16(5e-2f);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 highest() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x7F7F);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 lowest() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0xFF7F);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 infinity() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x7f80);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::bfloat16 quiet_NaN() {
+    return bfloat16_impl::raw_uint16_to_bfloat16(0x7fc0);
+  }
+};
+
+}  // namespace Eigen
+
+#if defined(EIGEN_HAS_HIP_BF16)
+#pragma pop_macro("EIGEN_CONSTEXPR")
+#endif
+
+namespace Eigen {
+namespace numext {
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(const Eigen::bfloat16& h) {
+  return (bfloat16_impl::isnan)(h);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(const Eigen::bfloat16& h) {
+  return (bfloat16_impl::isinf)(h);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(const Eigen::bfloat16& h) {
+  return (bfloat16_impl::isfinite)(h);
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bit_cast<Eigen::bfloat16, uint16_t>(const uint16_t& src) {
+  return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(src);
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast<uint16_t, Eigen::bfloat16>(const Eigen::bfloat16& src) {
+  return Eigen::bfloat16_impl::raw_bfloat16_as_uint16(src);
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bfloat16 nextafter(const bfloat16& from, const bfloat16& to) {
+  if (numext::isnan EIGEN_NOT_A_MACRO(from)) {
+    return from;
+  }
+  if (numext::isnan EIGEN_NOT_A_MACRO(to)) {
+    return to;
+  }
+  if (from == to) {
+    return to;
+  }
+  uint16_t from_bits = numext::bit_cast<uint16_t>(from);
+  bool from_sign = from_bits >> 15;
+  // Whether we are adjusting toward the infinity with the same sign as from.
+  bool toward_inf = (to > from) == !from_sign;
+  if (toward_inf) {
+    ++from_bits;
+  } else if ((from_bits & 0x7fff) == 0) {
+    // Adjusting away from inf, but from is zero, so just toggle the sign.
+    from_bits ^= 0x8000;
+  } else {
+    --from_bits;
+  }
+  return numext::bit_cast<bfloat16>(from_bits);
+}
+
+// Specialize multiply-add to match packet operations and reduce conversions to/from float.
+template<>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 madd<Eigen::bfloat16>(const Eigen::bfloat16& x, const Eigen::bfloat16& y, const Eigen::bfloat16& z) {
+  return Eigen::bfloat16(static_cast<float>(x) * static_cast<float>(y) + static_cast<float>(z));
+}
+
+}  // namespace numext
+}  // namespace Eigen
+
+#if EIGEN_HAS_STD_HASH
+namespace std {
+template <>
+struct hash<Eigen::bfloat16> {
+  EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::bfloat16& a) const {
+    return static_cast<std::size_t>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(a));
+  }
+};
+}  // namespace std
+#endif
+
+// Add the missing shfl* intrinsics.
+// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
+//   CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
+//
+// HIP and CUDA prior to SDK 9.0 define
+//    __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
+// CUDA since 9.0 deprecates those and instead defines
+//    __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
+//    with native support for __half and __nv_bfloat16
+//
+// Note that the following are __device__ - only functions.
+#if defined(EIGEN_HIPCC)
+
+#if defined(EIGEN_HAS_HIP_BF16)
+
+__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl(Eigen::bfloat16 var, int srcLane, int width = warpSize) {
+  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+  return Eigen::numext::bit_cast<Eigen::bfloat16>(static_cast<Eigen::numext::uint16_t>(__shfl(ivar, srcLane, width)));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_up(Eigen::bfloat16 var, unsigned int delta,
+                                                         int width = warpSize) {
+  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+  return Eigen::numext::bit_cast<Eigen::bfloat16>(static_cast<Eigen::numext::uint16_t>(__shfl_up(ivar, delta, width)));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_down(Eigen::bfloat16 var, unsigned int delta,
+                                                           int width = warpSize) {
+  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+  return Eigen::numext::bit_cast<Eigen::bfloat16>(
+      static_cast<Eigen::numext::uint16_t>(__shfl_down(ivar, delta, width)));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::bfloat16 __shfl_xor(Eigen::bfloat16 var, int laneMask, int width = warpSize) {
+  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+  return Eigen::numext::bit_cast<Eigen::bfloat16>(
+      static_cast<Eigen::numext::uint16_t>(__shfl_xor(ivar, laneMask, width)));
+}
+
+#endif  // HIP
+
+#endif  // __shfl*
+
+#if defined(EIGEN_HIPCC)
+EIGEN_STRONG_INLINE __device__ Eigen::bfloat16 __ldg(const Eigen::bfloat16* ptr) {
+  return Eigen::bfloat16_impl::raw_uint16_to_bfloat16(
+      __ldg(Eigen::numext::bit_cast<const Eigen::numext::uint16_t*>(ptr)));
+}
+#endif  // __ldg
+
+#endif  // EIGEN_BFLOAT16_H
diff --git a/inst/include/Eigen/src/Core/arch/Default/ConjHelper.h b/inst/include/Eigen/src/Core/arch/Default/ConjHelper.h
new file mode 100644
index 00000000..fd7923e1
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/Default/ConjHelper.h
@@ -0,0 +1,128 @@
+
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARCH_CONJ_HELPER_H
+#define EIGEN_ARCH_CONJ_HELPER_H
+
+#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL)                                                  \
+  template <>                                                                                                       \
+  struct conj_helper<PACKET_REAL, PACKET_CPLX, false, false> {                                                      \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const { \
+      return padd(c, this->pmul(x, y));                                                                             \
+    }                                                                                                               \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const {                        \
+      return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x, y.v));                                               \
+    }                                                                                                               \
+  };                                                                                                                \
+                                                                                                                    \
+  template <>                                                                                                       \
+  struct conj_helper<PACKET_CPLX, PACKET_REAL, false, false> {                                                      \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const { \
+      return padd(c, this->pmul(x, y));                                                                             \
+    }                                                                                                               \
+    EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const {                        \
+      return PACKET_CPLX(Eigen::internal::pmul<PACKET_REAL>(x.v, y));                                               \
+    }                                                                                                               \
+  };
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+template <bool Conjugate>
+struct conj_if;
+
+template <>
+struct conj_if<true> {
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
+    return numext::conj(x);
+  }
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T pconj(const T& x) const {
+    return internal::pconj(x);
+  }
+};
+
+template <>
+struct conj_if<false> {
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator()(const T& x) const {
+    return x;
+  }
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& pconj(const T& x) const {
+    return x;
+  }
+};
+
+// Generic Implementation, assume scalars since the packet-version is
+// specialized below.
+template <typename LhsType, typename RhsType, bool ConjLhs, bool ConjRhs>
+struct conj_helper {
+  typedef typename ScalarBinaryOpTraits<LhsType, RhsType>::ReturnType ResultType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmadd(const LhsType& x, const RhsType& y,
+                                                         const ResultType& c) const {
+    return this->pmul(x, y) + c;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmul(const LhsType& x, const RhsType& y) const {
+    return conj_if<ConjLhs>()(x) * conj_if<ConjRhs>()(y);
+  }
+};
+
+template <typename LhsScalar, typename RhsScalar>
+struct conj_helper<LhsScalar, RhsScalar, true, true> {
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResultType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmadd(const LhsScalar& x, const RhsScalar& y,
+                                                         const ResultType& c) const {
+    return this->pmul(x, y) + c;
+  }
+
+  // We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b).
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType pmul(const LhsScalar& x, const RhsScalar& y) const {
+    return numext::conj(x * y);
+  }
+};
+
+// Implementation with equal type, use packet operations.
+template <typename Packet, bool ConjLhs, bool ConjRhs>
+struct conj_helper<Packet, Packet, ConjLhs, ConjRhs> {
+  typedef Packet ResultType;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const {
+    return Eigen::internal::pmadd(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y), c);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const {
+    return Eigen::internal::pmul(conj_if<ConjLhs>().pconj(x), conj_if<ConjRhs>().pconj(y));
+  }
+};
+
+template <typename Packet>
+struct conj_helper<Packet, Packet, true, true> {
+  typedef Packet ResultType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmadd(const Packet& x, const Packet& y, const Packet& c) const {
+    return Eigen::internal::pmadd(pconj(x), pconj(y), c);
+  }
+  // We save a conjuation by using the identity conj(a)*conj(b) = conj(a*b).
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pmul(const Packet& x, const Packet& y) const {
+    return pconj(Eigen::internal::pmul(x, y));
+  }
+};
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_ARCH_CONJ_HELPER_H
diff --git a/inst/include/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/inst/include/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
new file mode 100644
index 00000000..a46a8eff
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -0,0 +1,2634 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+// Copyright (C) 2009-2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The exp and log functions of this file initially come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
+#define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+// Creates a Scalar integer type with same bit-width.
+template <typename T>
+struct make_integer;
+template <>
+struct make_integer<float> {
+  typedef numext::int32_t type;
+};
+template <>
+struct make_integer<double> {
+  typedef numext::int64_t type;
+};
+template <>
+struct make_integer<half> {
+  typedef numext::int16_t type;
+};
+template <>
+struct make_integer<bfloat16> {
+  typedef numext::int16_t type;
+};
+
+/* polevl (modified for Eigen)
+ *
+ *      Evaluate polynomial
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * Scalar x, y, coef[N+1];
+ *
+ * y = polevl<decltype(x), N>( x, coef);
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates polynomial of degree N:
+ *
+ *                     2          N
+ * y  =  C  + C x + C x  +...+ C x
+ *        0    1     2          N
+ *
+ * Coefficients are stored in reverse order:
+ *
+ * coef[0] = C  , ..., coef[N] = C  .
+ *            N                   0
+ *
+ *  The function p1evl() assumes that coef[N] = 1.0 and is
+ * omitted from the array.  Its calling arguments are
+ * otherwise the same as polevl().
+ *
+ *
+ * The Eigen implementation is templatized.  For best speed, store
+ * coef as a const array (constexpr), e.g.
+ *
+ * const double coef[] = {1.0, 2.0, 3.0, ...};
+ *
+ */
+template <typename Packet, int N>
+struct ppolevl {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
+                                                          const typename unpacket_traits<Packet>::type coeff[]) {
+    EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return pmadd(ppolevl<Packet, N - 1>::run(x, coeff), x, pset1<Packet>(coeff[N]));
+  }
+};
+
+template <typename Packet>
+struct ppolevl<Packet, 0> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x,
+                                                          const typename unpacket_traits<Packet>::type coeff[]) {
+    EIGEN_UNUSED_VARIABLE(x);
+    return pset1<Packet>(coeff[0]);
+  }
+};
+
+/* chbevl (modified for Eigen)
+ *
+ *     Evaluate Chebyshev series
+ *
+ *
+ *
+ * SYNOPSIS:
+ *
+ * int N;
+ * Scalar x, y, coef[N], chebevl();
+ *
+ * y = chbevl( x, coef, N );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Evaluates the series
+ *
+ *        N-1
+ *         - '
+ *  y  =   >   coef[i] T (x/2)
+ *         -            i
+ *        i=0
+ *
+ * of Chebyshev polynomials Ti at argument x/2.
+ *
+ * Coefficients are stored in reverse order, i.e. the zero
+ * order term is last in the array.  Note N is the number of
+ * coefficients, not the order.
+ *
+ * If coefficients are for the interval a to b, x must
+ * have been transformed to x -> 2(2x - b - a)/(b-a) before
+ * entering the routine.  This maps x from (a, b) to (-1, 1),
+ * over which the Chebyshev polynomials are defined.
+ *
+ * If the coefficients are for the inverted interval, in
+ * which (a, b) is mapped to (1/b, 1/a), the transformation
+ * required is x -> 2(2ab/x - b - a)/(b-a).  If b is infinity,
+ * this becomes x -> 4a/x - 1.
+ *
+ *
+ *
+ * SPEED:
+ *
+ * Taking advantage of the recurrence properties of the
+ * Chebyshev polynomials, the routine requires one more
+ * addition per loop than evaluating a nested polynomial of
+ * the same degree.
+ *
+ */
+
+template <typename Packet, int N>
+struct pchebevl {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Packet run(Packet x,
+                                                          const typename unpacket_traits<Packet>::type coef[]) {
+    typedef typename unpacket_traits<Packet>::type Scalar;
+    Packet b0 = pset1<Packet>(coef[0]);
+    Packet b1 = pset1<Packet>(static_cast<Scalar>(0.f));
+    Packet b2;
+
+    for (int i = 1; i < N; i++) {
+      b2 = b1;
+      b1 = b0;
+      b0 = psub(pmadd(x, b1, pset1<Packet>(coef[i])), b2);
+    }
+
+    return pmul(pset1<Packet>(static_cast<Scalar>(0.5f)), psub(b0, b2));
+  }
+};
+
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic_get_biased_exponent(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+  static constexpr int mantissa_bits = numext::numeric_limits<Scalar>::digits - 1;
+  return pcast<PacketI, Packet>(plogical_shift_right<mantissa_bits>(preinterpret<PacketI>(pabs(a))));
+}
+
+// Safely applies frexp, correctly handles denormals.
+// Assumes IEEE floating point format.
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet& a, Packet& exponent) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename make_unsigned<typename make_integer<Scalar>::type>::type ScalarUI;
+  static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
+                       ExponentBits = TotalBits - MantissaBits - 1;
+
+  constexpr ScalarUI scalar_sign_mantissa_mask =
+      ~(((ScalarUI(1) << ExponentBits) - ScalarUI(1)) << MantissaBits);  // ~0x7f800000
+  const Packet sign_mantissa_mask = pset1frombits<Packet>(static_cast<ScalarUI>(scalar_sign_mantissa_mask));
+  const Packet half = pset1<Packet>(Scalar(0.5));
+  const Packet zero = pzero(a);
+  const Packet normal_min = pset1<Packet>((numext::numeric_limits<Scalar>::min)());  // Minimum normal value, 2^-126
+
+  // To handle denormals, normalize by multiplying by 2^(int(MantissaBits)+1).
+  const Packet is_denormal = pcmp_lt(pabs(a), normal_min);
+  constexpr ScalarUI scalar_normalization_offset = ScalarUI(MantissaBits + 1);  // 24
+  // The following cannot be constexpr because bfloat16(uint16_t) is not constexpr.
+  const Scalar scalar_normalization_factor = Scalar(ScalarUI(1) << int(scalar_normalization_offset));  // 2^24
+  const Packet normalization_factor = pset1<Packet>(scalar_normalization_factor);
+  const Packet normalized_a = pselect(is_denormal, pmul(a, normalization_factor), a);
+
+  // Determine exponent offset: -126 if normal, -126-24 if denormal
+  const Scalar scalar_exponent_offset = -Scalar((ScalarUI(1) << (ExponentBits - 1)) - ScalarUI(2));  // -126
+  Packet exponent_offset = pset1<Packet>(scalar_exponent_offset);
+  const Packet normalization_offset = pset1<Packet>(-Scalar(scalar_normalization_offset));  // -24
+  exponent_offset = pselect(is_denormal, padd(exponent_offset, normalization_offset), exponent_offset);
+
+  // Determine exponent and mantissa from normalized_a.
+  exponent = pfrexp_generic_get_biased_exponent(normalized_a);
+  // Zero, Inf and NaN return 'a' unmodified, exponent is zero
+  // (technically the exponent is unspecified for inf/NaN, but GCC/Clang set it to zero)
+  const Scalar scalar_non_finite_exponent = Scalar((ScalarUI(1) << ExponentBits) - ScalarUI(1));  // 255
+  const Packet non_finite_exponent = pset1<Packet>(scalar_non_finite_exponent);
+  const Packet is_zero_or_not_finite = por(pcmp_eq(a, zero), pcmp_eq(exponent, non_finite_exponent));
+  const Packet m = pselect(is_zero_or_not_finite, a, por(pand(normalized_a, sign_mantissa_mask), half));
+  exponent = pselect(is_zero_or_not_finite, zero, padd(exponent, exponent_offset));
+  return m;
+}
+
+// Safely applies ldexp, correctly handles overflows, underflows and denormals.
+// Assumes IEEE floating point format.
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, const Packet& exponent) {
+  // We want to return a * 2^exponent, allowing for all possible integer
+  // exponents without overflowing or underflowing in intermediate
+  // computations.
+  //
+  // Since 'a' and the output can be denormal, the maximum range of 'exponent'
+  // to consider for a float is:
+  //   -255-23 -> 255+23
+  // Below -278 any finite float 'a' will become zero, and above +278 any
+  // finite float will become inf, including when 'a' is the smallest possible
+  // denormal.
+  //
+  // Unfortunately, 2^(278) cannot be represented using either one or two
+  // finite normal floats, so we must split the scale factor into at least
+  // three parts. It turns out to be faster to split 'exponent' into four
+  // factors, since [exponent>>2] is much faster to compute that [exponent/3].
+  //
+  // Set e = min(max(exponent, -278), 278);
+  //     b = floor(e/4);
+  //   out = ((((a * 2^(b)) * 2^(b)) * 2^(b)) * 2^(e-3*b))
+  //
+  // This will avoid any intermediate overflows and correctly handle 0, inf,
+  // NaN cases.
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename unpacket_traits<PacketI>::type ScalarI;
+  static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
+                       ExponentBits = TotalBits - MantissaBits - 1;
+
+  const Packet max_exponent = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) + ScalarI(MantissaBits - 1)));  // 278
+  const PacketI bias = pset1<PacketI>((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1));                         // 127
+  const PacketI e = pcast<Packet, PacketI>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+  PacketI b = parithmetic_shift_right<2>(e);                                          // floor(e/4);
+  Packet c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias)));  // 2^b
+  Packet out = pmul(pmul(pmul(a, c), c), c);                                          // a * 2^(3b)
+  b = pnmadd(pset1<PacketI>(3), b, e);                                                // e - 3b
+  c = preinterpret<Packet>(plogical_shift_left<MantissaBits>(padd(b, bias)));         // 2^(e-3*b)
+  out = pmul(out, c);
+  return out;
+}
+
+// Explicitly multiplies
+//    a * (2^e)
+// clamping e to the range
+// [NumTraits<Scalar>::min_exponent()-2, NumTraits<Scalar>::max_exponent()]
+//
+// This is approx 7x faster than pldexp_impl, but will prematurely over/underflow
+// if 2^e doesn't fit into a normal floating-point Scalar.
+//
+// Assumes IEEE floating point format
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent) {
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename unpacket_traits<PacketI>::type ScalarI;
+  static constexpr int TotalBits = sizeof(Scalar) * CHAR_BIT, MantissaBits = numext::numeric_limits<Scalar>::digits - 1,
+                       ExponentBits = TotalBits - MantissaBits - 1;
+
+  const Packet bias = pset1<Packet>(Scalar((ScalarI(1) << (ExponentBits - 1)) - ScalarI(1)));  // 127
+  const Packet limit = pset1<Packet>(Scalar((ScalarI(1) << ExponentBits) - ScalarI(1)));       // 255
+  // restrict biased exponent between 0 and 255 for float.
+  const PacketI e = pcast<Packet, PacketI>(pmin(pmax(padd(exponent, bias), pzero(limit)), limit));  // exponent + 127
+  // return a * (2^e)
+  return pmul(a, preinterpret<Packet>(plogical_shift_left<MantissaBits>(e)));
+}
+
+// This function implements a single step of Halley's iteration for
+// computing x = y^(1/3):
+//   x_{k+1} = x_k - (x_k^3 - y) x_k / (2x_k^3 + y)
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_halley_iteration_step(const Packet& x_k,
+                                                                                      const Packet& y) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  Packet x_k_cb = pmul(x_k, pmul(x_k, x_k));
+  Packet denom = pmadd(pset1<Packet>(Scalar(2)), x_k_cb, y);
+  Packet num = psub(x_k_cb, y);
+  Packet r = pdiv(num, denom);
+  return pnmadd(x_k, r, x_k);
+}
+
+// Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
+// interval [0.125,1].
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_decompose(const Packet& x, Packet& e_div3) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  // Extract the significant s in the range [0.5,1) and exponent e, such that
+  // x = 2^e * s.
+  Packet e, s;
+  s = pfrexp(x, e);
+
+  // Split the exponent into a part divisible by 3 and the remainder.
+  // e = 3*e_div3 + e_mod3.
+  constexpr Scalar kOneThird = Scalar(1) / 3;
+  e_div3 = pceil(pmul(e, pset1<Packet>(kOneThird)));
+  Packet e_mod3 = pnmadd(pset1<Packet>(Scalar(3)), e_div3, e);
+
+  // Replace s by y = (s * 2^e_mod3).
+  return pldexp_fast(s, e_mod3);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet cbrt_special_cases_and_sign(const Packet& x,
+                                                                                       const Packet& abs_root) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+
+  // Set sign.
+  const Packet sign_mask = pset1<Packet>(Scalar(-0.0));
+  const Packet x_sign = pand(sign_mask, x);
+  Packet root = por(x_sign, abs_root);
+
+  // Pass non-finite and zero values of x straight through.
+  const Packet is_not_finite = por(pisinf(x), pisnan(x));
+  const Packet is_zero = pcmp_eq(pzero(x), x);
+  const Packet use_x = por(is_not_finite, is_zero);
+  return pselect(use_x, x, root);
+}
+
+// Generic implementation of cbrt(x) for float.
+//
+// The algorithm computes the cubic root of the input by first
+// decomposing it into a exponent and significant
+//   x = s * 2^e.
+//
+// We can then write the cube root as
+//
+//   x^(1/3) = 2^(e/3) * s^(1/3)
+//           = 2^((3*e_div3 + e_mod3)/3) * s^(1/3)
+//           = 2^(e_div3) * 2^(e_mod3/3) * s^(1/3)
+//           = 2^(e_div3) * (s * 2^e_mod3)^(1/3)
+//
+// where e_div3 = ceil(e/3) and e_mod3 = e - 3*e_div3.
+//
+// The cube root of the second term y = (s * 2^e_mod3)^(1/3) is coarsely
+// approximated using a cubic polynomial and subsequently refined using a
+// single step of Halley's iteration, and finally the two terms are combined
+// using pldexp_fast.
+//
+// Note: Many alternatives exist for implementing cbrt. See, for example,
+// the excellent discussion in Kahan's note:
+//   https://csclub.uwaterloo.ca/~pbarfuss/qbrt.pdf
+// This particular implementation was found to be very fast and accurate
+// among several alternatives tried, but is probably not "optimal" on all
+// platforms.
+//
+// This is accurate to 2 ULP.
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_float(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
+
+  // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
+  // interval [0.125,1].
+  Packet e_div3;
+  const Packet y = cbrt_decompose(pabs(x), e_div3);
+
+  // Compute initial approximation accurate to 5.22e-3.
+  // The polynomial was computed using Rminimax.
+  constexpr float alpha[] = {5.9220016002655029296875e-01f, -1.3859539031982421875e+00f, 1.4581282138824462890625e+00f,
+                             3.408401906490325927734375e-01f};
+  Packet r = ppolevl<Packet, 3>::run(y, alpha);
+
+  // Take one step of Halley's iteration.
+  r = cbrt_halley_iteration_step(r, y);
+
+  // Finally multiply by 2^(e_div3)
+  r = pldexp_fast(r, e_div3);
+
+  return cbrt_special_cases_and_sign(x, r);
+}
+
+// Generic implementation of cbrt(x) for double.
+//
+// The algorithm is identical to the one for float except that a different initial
+// approximation is used for y^(1/3) and two Halley iteration steps are peformed.
+//
+// This is accurate to 1 ULP.
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_double(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
+
+  // Decompose the input such that x^(1/3) = y^(1/3) * 2^e_div3, and y is in the
+  // interval [0.125,1].
+  Packet e_div3;
+  const Packet y = cbrt_decompose(pabs(x), e_div3);
+
+  // Compute initial approximation accurate to 0.016.
+  // The polynomial was computed using Rminimax.
+  constexpr double alpha[] = {-4.69470621553356115551736138513660989701747894287109375e-01,
+                              1.072314636518546304699839311069808900356292724609375e+00,
+                              3.81249427609571867048288140722434036433696746826171875e-01};
+  Packet r = ppolevl<Packet, 2>::run(y, alpha);
+
+  // Take two steps of Halley's iteration.
+  r = cbrt_halley_iteration_step(r, y);
+  r = cbrt_halley_iteration_step(r, y);
+
+  // Finally multiply by 2^(e_div3).
+  r = pldexp_fast(r, e_div3);
+  return cbrt_special_cases_and_sign(x, r);
+}
+
+// Natural or base 2 logarithm.
+// Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2)
+// and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can
+// be easily approximated by a polynomial centered on m=1 for stability.
+// TODO(gonnet): Further reduce the interval allowing for lower-degree
+//               polynomial interpolants -> ... -> profit!
+template <typename Packet, bool base2>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_float(const Packet _x) {
+  const Packet cst_1 = pset1<Packet>(1.0f);
+  const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0xff800000u));
+  const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x7f800000u));
+
+  const Packet cst_cephes_SQRTHF = pset1<Packet>(0.707106781186547524f);
+  Packet e, x;
+  // extract significant in the range [0.5,1) and exponent
+  x = pfrexp(_x, e);
+
+  // part2: Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
+  // and shift by -1. The values are then centered around 0, which improves
+  // the stability of the polynomial evaluation.
+  //   if( x < SQRTHF ) {
+  //     e -= 1;
+  //     x = x + x - 1.0;
+  //   } else { x = x - 1.0; }
+  Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
+  Packet tmp = pand(x, mask);
+  x = psub(x, cst_1);
+  e = psub(e, pand(cst_1, mask));
+  x = padd(x, tmp);
+
+  // Polynomial coefficients for rational r(x) = p(x)/q(x)
+  // approximating log(1+x) on [sqrt(0.5)-1;sqrt(2)-1].
+  constexpr float alpha[] = {0.18256296349849254f, 1.0000000190281063f, 1.0000000190281136f};
+  constexpr float beta[] = {0.049616247954120038f, 0.59923249590823520f, 1.4999999999999927f, 1.0f};
+
+  Packet p = ppolevl<Packet, 2>::run(x, alpha);
+  p = pmul(x, p);
+  Packet q = ppolevl<Packet, 3>::run(x, beta);
+  x = pdiv(p, q);
+
+  // Add the logarithm of the exponent back to the result of the interpolation.
+  if (base2) {
+    const Packet cst_log2e = pset1<Packet>(static_cast<float>(EIGEN_LOG2E));
+    x = pmadd(x, cst_log2e, e);
+  } else {
+    const Packet cst_ln2 = pset1<Packet>(static_cast<float>(EIGEN_LN2));
+    x = pmadd(e, cst_ln2, x);
+  }
+
+  Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
+  Packet iszero_mask = pcmp_eq(_x, pzero(_x));
+  Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
+  // Filter out invalid inputs, i.e.:
+  //  - negative arg will be NAN
+  //  - 0 will be -INF
+  //  - +INF will be +INF
+  return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Packet _x) {
+  return plog_impl_float<Packet, /* base2 */ false>(_x);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Packet _x) {
+  return plog_impl_float<Packet, /* base2 */ true>(_x);
+}
+
+/* Returns the base e (2.718...) or base 2 logarithm of x.
+ * The argument is separated into its exponent and fractional parts.
+ * The logarithm of the fraction in the interval [sqrt(1/2), sqrt(2)],
+ * is approximated by
+ *
+ *     log(1+x) = x - 0.5 x**2 + x**3 P(x)/Q(x).
+ *
+ * for more detail see: http://www.netlib.org/cephes/
+ */
+template <typename Packet, bool base2>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_impl_double(const Packet _x) {
+  Packet x = _x;
+
+  const Packet cst_1 = pset1<Packet>(1.0);
+  const Packet cst_neg_half = pset1<Packet>(-0.5);
+  const Packet cst_minus_inf = pset1frombits<Packet>(static_cast<uint64_t>(0xfff0000000000000ull));
+  const Packet cst_pos_inf = pset1frombits<Packet>(static_cast<uint64_t>(0x7ff0000000000000ull));
+
+  // Polynomial Coefficients for log(1+x) = x - x**2/2 + x**3 P(x)/Q(x)
+  //                             1/sqrt(2) <= x < sqrt(2)
+  const Packet cst_cephes_SQRTHF = pset1<Packet>(0.70710678118654752440E0);
+  const Packet cst_cephes_log_p0 = pset1<Packet>(1.01875663804580931796E-4);
+  const Packet cst_cephes_log_p1 = pset1<Packet>(4.97494994976747001425E-1);
+  const Packet cst_cephes_log_p2 = pset1<Packet>(4.70579119878881725854E0);
+  const Packet cst_cephes_log_p3 = pset1<Packet>(1.44989225341610930846E1);
+  const Packet cst_cephes_log_p4 = pset1<Packet>(1.79368678507819816313E1);
+  const Packet cst_cephes_log_p5 = pset1<Packet>(7.70838733755885391666E0);
+
+  const Packet cst_cephes_log_q0 = pset1<Packet>(1.0);
+  const Packet cst_cephes_log_q1 = pset1<Packet>(1.12873587189167450590E1);
+  const Packet cst_cephes_log_q2 = pset1<Packet>(4.52279145837532221105E1);
+  const Packet cst_cephes_log_q3 = pset1<Packet>(8.29875266912776603211E1);
+  const Packet cst_cephes_log_q4 = pset1<Packet>(7.11544750618563894466E1);
+  const Packet cst_cephes_log_q5 = pset1<Packet>(2.31251620126765340583E1);
+
+  Packet e;
+  // extract significant in the range [0.5,1) and exponent
+  x = pfrexp(x, e);
+
+  // Shift the inputs from the range [0.5,1) to [sqrt(1/2),sqrt(2))
+  // and shift by -1. The values are then centered around 0, which improves
+  // the stability of the polynomial evaluation.
+  //   if( x < SQRTHF ) {
+  //     e -= 1;
+  //     x = x + x - 1.0;
+  //   } else { x = x - 1.0; }
+  Packet mask = pcmp_lt(x, cst_cephes_SQRTHF);
+  Packet tmp = pand(x, mask);
+  x = psub(x, cst_1);
+  e = psub(e, pand(cst_1, mask));
+  x = padd(x, tmp);
+
+  Packet x2 = pmul(x, x);
+  Packet x3 = pmul(x2, x);
+
+  // Evaluate the polynomial approximant , probably to improve instruction-level parallelism.
+  // y = x - 0.5*x^2 + x^3 * polevl( x, P, 5 ) / p1evl( x, Q, 5 ) );
+  Packet y, y1, y_;
+  y = pmadd(cst_cephes_log_p0, x, cst_cephes_log_p1);
+  y1 = pmadd(cst_cephes_log_p3, x, cst_cephes_log_p4);
+  y = pmadd(y, x, cst_cephes_log_p2);
+  y1 = pmadd(y1, x, cst_cephes_log_p5);
+  y_ = pmadd(y, x3, y1);
+
+  y = pmadd(cst_cephes_log_q0, x, cst_cephes_log_q1);
+  y1 = pmadd(cst_cephes_log_q3, x, cst_cephes_log_q4);
+  y = pmadd(y, x, cst_cephes_log_q2);
+  y1 = pmadd(y1, x, cst_cephes_log_q5);
+  y = pmadd(y, x3, y1);
+
+  y_ = pmul(y_, x3);
+  y = pdiv(y_, y);
+
+  y = pmadd(cst_neg_half, x2, y);
+  x = padd(x, y);
+
+  // Add the logarithm of the exponent back to the result of the interpolation.
+  if (base2) {
+    const Packet cst_log2e = pset1<Packet>(static_cast<double>(EIGEN_LOG2E));
+    x = pmadd(x, cst_log2e, e);
+  } else {
+    const Packet cst_ln2 = pset1<Packet>(static_cast<double>(EIGEN_LN2));
+    x = pmadd(e, cst_ln2, x);
+  }
+
+  Packet invalid_mask = pcmp_lt_or_nan(_x, pzero(_x));
+  Packet iszero_mask = pcmp_eq(_x, pzero(_x));
+  Packet pos_inf_mask = pcmp_eq(_x, cst_pos_inf);
+  // Filter out invalid inputs, i.e.:
+  //  - negative arg will be NAN
+  //  - 0 will be -INF
+  //  - +INF will be +INF
+  return pselect(iszero_mask, cst_minus_inf, por(pselect(pos_inf_mask, cst_pos_inf, x), invalid_mask));
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_double(const Packet _x) {
+  return plog_impl_double<Packet, /* base2 */ false>(_x);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_double(const Packet _x) {
+  return plog_impl_double<Packet, /* base2 */ true>(_x);
+}
+
+/** \internal \returns log(1 + x) computed using W. Kahan's formula.
+    See: http://www.plunk.org/~hatch/rightway.php
+ */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type ScalarType;
+  const Packet one = pset1<Packet>(ScalarType(1));
+  Packet xp1 = padd(x, one);
+  Packet small_mask = pcmp_eq(xp1, one);
+  Packet log1 = plog(xp1);
+  Packet inf_mask = pcmp_eq(xp1, log1);
+  Packet log_large = pmul(x, pdiv(log1, psub(xp1, one)));
+  return pselect(por(small_mask, inf_mask), x, log_large);
+}
+
+/** \internal \returns exp(x)-1 computed using W. Kahan's formula.
+    See: http://www.plunk.org/~hatch/rightway.php
+ */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_expm1(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type ScalarType;
+  const Packet one = pset1<Packet>(ScalarType(1));
+  const Packet neg_one = pset1<Packet>(ScalarType(-1));
+  Packet u = pexp(x);
+  Packet one_mask = pcmp_eq(u, one);
+  Packet u_minus_one = psub(u, one);
+  Packet neg_one_mask = pcmp_eq(u_minus_one, neg_one);
+  Packet logu = plog(u);
+  // The following comparison is to catch the case where
+  // exp(x) = +inf. It is written in this way to avoid having
+  // to form the constant +inf, which depends on the packet
+  // type.
+  Packet pos_inf_mask = pcmp_eq(logu, u);
+  Packet expm1 = pmul(u_minus_one, pdiv(x, logu));
+  expm1 = pselect(pos_inf_mask, u, expm1);
+  return pselect(one_mask, x, pselect(neg_one_mask, neg_one, expm1));
+}
+
+// Exponential function. Works by writing "x = m*log(2) + r" where
+// "m = floor(x/log(2)+1/2)" and "r" is the remainder. The result is then
+// "exp(x) = 2^m*exp(r)" where exp(r) is in the range [-1,1).
+// exp(r) is computed using a 6th order minimax polynomial approximation.
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x) {
+  const Packet cst_zero = pset1<Packet>(0.0f);
+  const Packet cst_one = pset1<Packet>(1.0f);
+  const Packet cst_half = pset1<Packet>(0.5f);
+  const Packet cst_exp_hi = pset1<Packet>(88.723f);
+  const Packet cst_exp_lo = pset1<Packet>(-104.f);
+  const Packet cst_pldexp_threshold = pset1<Packet>(87.0);
+
+  const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
+  const Packet cst_p2 = pset1<Packet>(0.49999988079071044921875f);
+  const Packet cst_p3 = pset1<Packet>(0.16666518151760101318359375f);
+  const Packet cst_p4 = pset1<Packet>(4.166965186595916748046875e-2f);
+  const Packet cst_p5 = pset1<Packet>(8.36894474923610687255859375e-3f);
+  const Packet cst_p6 = pset1<Packet>(1.37449637986719608306884765625e-3f);
+
+  // Clamp x.
+  Packet zero_mask = pcmp_lt(_x, cst_exp_lo);
+  Packet x = pmin(_x, cst_exp_hi);
+
+  // Express exp(x) as exp(m*ln(2) + r), start by extracting
+  // m = floor(x/ln(2) + 0.5).
+  Packet m = pfloor(pmadd(x, cst_cephes_LOG2EF, cst_half));
+
+  // Get r = x - m*ln(2). If no FMA instructions are available, m*ln(2) is
+  // subtracted out in two parts, m*C1+m*C2 = m*ln(2), to avoid accumulating
+  // truncation errors.
+  const Packet cst_cephes_exp_C1 = pset1<Packet>(-0.693359375f);
+  const Packet cst_cephes_exp_C2 = pset1<Packet>(2.12194440e-4f);
+  Packet r = pmadd(m, cst_cephes_exp_C1, x);
+  r = pmadd(m, cst_cephes_exp_C2, r);
+
+  // Evaluate the 6th order polynomial approximation to exp(r)
+  // with r in the interval [-ln(2)/2;ln(2)/2].
+  const Packet r2 = pmul(r, r);
+  Packet p_even = pmadd(r2, cst_p6, cst_p4);
+  const Packet p_odd = pmadd(r2, cst_p5, cst_p3);
+  p_even = pmadd(r2, p_even, cst_p2);
+  const Packet p_low = padd(r, cst_one);
+  Packet y = pmadd(r, p_odd, p_even);
+  y = pmadd(r2, y, p_low);
+
+  // Return 2^m * exp(r).
+  const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(x));
+  if (!predux_any(fast_pldexp_unsafe)) {
+    // For |x| <= 87, we know the result is not zero or inf, and we can safely use
+    // the fast version of pldexp.
+    return pmax(pldexp_fast(y, m), _x);
+  }
+  return pselect(zero_mask, cst_zero, pmax(pldexp(y, m), _x));
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Packet _x) {
+  Packet x = _x;
+  const Packet cst_zero = pset1<Packet>(0.0);
+  const Packet cst_1 = pset1<Packet>(1.0);
+  const Packet cst_2 = pset1<Packet>(2.0);
+  const Packet cst_half = pset1<Packet>(0.5);
+
+  const Packet cst_exp_hi = pset1<Packet>(709.784);
+  const Packet cst_exp_lo = pset1<Packet>(-745.519);
+  const Packet cst_pldexp_threshold = pset1<Packet>(708.0);
+  const Packet cst_cephes_LOG2EF = pset1<Packet>(1.4426950408889634073599);
+  const Packet cst_cephes_exp_p0 = pset1<Packet>(1.26177193074810590878e-4);
+  const Packet cst_cephes_exp_p1 = pset1<Packet>(3.02994407707441961300e-2);
+  const Packet cst_cephes_exp_p2 = pset1<Packet>(9.99999999999999999910e-1);
+  const Packet cst_cephes_exp_q0 = pset1<Packet>(3.00198505138664455042e-6);
+  const Packet cst_cephes_exp_q1 = pset1<Packet>(2.52448340349684104192e-3);
+  const Packet cst_cephes_exp_q2 = pset1<Packet>(2.27265548208155028766e-1);
+  const Packet cst_cephes_exp_q3 = pset1<Packet>(2.00000000000000000009e0);
+  const Packet cst_cephes_exp_C1 = pset1<Packet>(0.693145751953125);
+  const Packet cst_cephes_exp_C2 = pset1<Packet>(1.42860682030941723212e-6);
+
+  Packet tmp, fx;
+
+  // clamp x
+  Packet zero_mask = pcmp_lt(_x, cst_exp_lo);
+  x = pmin(x, cst_exp_hi);
+  // Express exp(x) as exp(g + n*log(2)).
+  fx = pmadd(cst_cephes_LOG2EF, x, cst_half);
+
+  // Get the integer modulus of log(2), i.e. the "n" described above.
+  fx = pfloor(fx);
+
+  // Get the remainder modulo log(2), i.e. the "g" described above. Subtract
+  // n*log(2) out in two steps, i.e. n*C1 + n*C2, C1+C2=log2 to get the last
+  // digits right.
+  tmp = pmul(fx, cst_cephes_exp_C1);
+  Packet z = pmul(fx, cst_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  Packet x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial of the rational interpolant.
+  Packet px = cst_cephes_exp_p0;
+  px = pmadd(px, x2, cst_cephes_exp_p1);
+  px = pmadd(px, x2, cst_cephes_exp_p2);
+  px = pmul(px, x);
+
+  // Evaluate the denominator polynomial of the rational interpolant.
+  Packet qx = cst_cephes_exp_q0;
+  qx = pmadd(qx, x2, cst_cephes_exp_q1);
+  qx = pmadd(qx, x2, cst_cephes_exp_q2);
+  qx = pmadd(qx, x2, cst_cephes_exp_q3);
+
+  // I don't really get this bit, copied from the SSE2 routines, so...
+  // TODO(gonnet): Figure out what is going on here, perhaps find a better
+  // rational interpolant?
+  x = pdiv(px, psub(qx, px));
+  x = pmadd(cst_2, x, cst_1);
+
+  // Construct the result 2^n * exp(g) = e * x. The max is used to catch
+  // non-finite values in the input.
+  const Packet fast_pldexp_unsafe = pcmp_lt(cst_pldexp_threshold, pabs(_x));
+  if (!predux_any(fast_pldexp_unsafe)) {
+    // For |x| <= 708, we know the result is not zero or inf, and we can safely use
+    // the fast version of pldexp.
+    return pmax(pldexp_fast(x, fx), _x);
+  }
+  return pselect(zero_mask, cst_zero, pmax(pldexp(x, fx), _x));
+}
+
+// The following code is inspired by the following stack-overflow answer:
+//   https://stackoverflow.com/questions/30463616/payne-hanek-algorithm-implementation-in-c/30465751#30465751
+// It has been largely optimized:
+//  - By-pass calls to frexp.
+//  - Aligned loads of required 96 bits of 2/pi. This is accomplished by
+//    (1) balancing the mantissa and exponent to the required bits of 2/pi are
+//    aligned on 8-bits, and (2) replicating the storage of the bits of 2/pi.
+//  - Avoid a branch in rounding and extraction of the remaining fractional part.
+// Overall, I measured a speed up higher than x2 on x86-64.
+inline float trig_reduce_huge(float xf, Eigen::numext::int32_t* quadrant) {
+  using Eigen::numext::int32_t;
+  using Eigen::numext::int64_t;
+  using Eigen::numext::uint32_t;
+  using Eigen::numext::uint64_t;
+
+  const double pio2_62 = 3.4061215800865545e-19;     // pi/2 * 2^-62
+  const uint64_t zero_dot_five = uint64_t(1) << 61;  // 0.5 in 2.62-bit fixed-point format
+
+  // 192 bits of 2/pi for Payne-Hanek reduction
+  // Bits are introduced by packet of 8 to enable aligned reads.
+  static const uint32_t two_over_pi[] = {
+      0x00000028, 0x000028be, 0x0028be60, 0x28be60db, 0xbe60db93, 0x60db9391, 0xdb939105, 0x9391054a, 0x91054a7f,
+      0x054a7f09, 0x4a7f09d5, 0x7f09d5f4, 0x09d5f47d, 0xd5f47d4d, 0xf47d4d37, 0x7d4d3770, 0x4d377036, 0x377036d8,
+      0x7036d8a5, 0x36d8a566, 0xd8a5664f, 0xa5664f10, 0x664f10e4, 0x4f10e410, 0x10e41000, 0xe4100000};
+
+  uint32_t xi = numext::bit_cast<uint32_t>(xf);
+  // Below, -118 = -126 + 8.
+  //   -126 is to get the exponent,
+  //   +8 is to enable alignment of 2/pi's bits on 8 bits.
+  // This is possible because the fractional part of x as only 24 meaningful bits.
+  uint32_t e = (xi >> 23) - 118;
+  // Extract the mantissa and shift it to align it wrt the exponent
+  xi = ((xi & 0x007fffffu) | 0x00800000u) << (e & 0x7);
+
+  uint32_t i = e >> 3;
+  uint32_t twoopi_1 = two_over_pi[i - 1];
+  uint32_t twoopi_2 = two_over_pi[i + 3];
+  uint32_t twoopi_3 = two_over_pi[i + 7];
+
+  // Compute x * 2/pi in 2.62-bit fixed-point format.
+  uint64_t p;
+  p = uint64_t(xi) * twoopi_3;
+  p = uint64_t(xi) * twoopi_2 + (p >> 32);
+  p = (uint64_t(xi * twoopi_1) << 32) + p;
+
+  // Round to nearest: add 0.5 and extract integral part.
+  uint64_t q = (p + zero_dot_five) >> 62;
+  *quadrant = int(q);
+  // Now it remains to compute "r = x - q*pi/2" with high accuracy,
+  // since we have p=x/(pi/2) with high accuracy, we can more efficiently compute r as:
+  //   r = (p-q)*pi/2,
+  // where the product can be be carried out with sufficient accuracy using double precision.
+  p -= q << 62;
+  return float(double(int64_t(p)) * pio2_62);
+}
+
+template <bool ComputeSine, typename Packet, bool ComputeBoth = false>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+#if EIGEN_COMP_GNUC_STRICT
+    __attribute__((optimize("-fno-unsafe-math-optimizations")))
+#endif
+    Packet
+    psincos_float(const Packet& _x) {
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+
+  const Packet cst_2oPI = pset1<Packet>(0.636619746685028076171875f);  // 2/PI
+  const Packet cst_rounding_magic = pset1<Packet>(12582912);           // 2^23 for rounding
+  const PacketI csti_1 = pset1<PacketI>(1);
+  const Packet cst_sign_mask = pset1frombits<Packet>(static_cast<Eigen::numext::uint32_t>(0x80000000u));
+
+  Packet x = pabs(_x);
+
+  // Scale x by 2/Pi to find x's octant.
+  Packet y = pmul(x, cst_2oPI);
+
+  // Rounding trick to find nearest integer:
+  Packet y_round = padd(y, cst_rounding_magic);
+  EIGEN_OPTIMIZATION_BARRIER(y_round)
+  PacketI y_int = preinterpret<PacketI>(y_round);  // last 23 digits represent integer (if abs(x)<2^24)
+  y = psub(y_round, cst_rounding_magic);           // nearest integer to x * (2/pi)
+
+// Subtract y * Pi/2 to reduce x to the interval -Pi/4 <= x <= +Pi/4
+// using "Extended precision modular arithmetic"
+#if defined(EIGEN_VECTORIZE_FMA)
+  // This version requires true FMA for high accuracy.
+  // It provides a max error of 1ULP up to (with absolute_error < 5.9605e-08):
+  const float huge_th = ComputeSine ? 117435.992f : 71476.0625f;
+  x = pmadd(y, pset1<Packet>(-1.57079601287841796875f), x);
+  x = pmadd(y, pset1<Packet>(-3.1391647326017846353352069854736328125e-07f), x);
+  x = pmadd(y, pset1<Packet>(-5.390302529957764765544681040410068817436695098876953125e-15f), x);
+#else
+  // Without true FMA, the previous set of coefficients maintain 1ULP accuracy
+  // up to x<15.7 (for sin), but accuracy is immediately lost for x>15.7.
+  // We thus use one more iteration to maintain 2ULPs up to reasonably large inputs.
+
+  // The following set of coefficients maintain 1ULP up to 9.43 and 14.16 for sin and cos respectively.
+  // and 2 ULP up to:
+  const float huge_th = ComputeSine ? 25966.f : 18838.f;
+  x = pmadd(y, pset1<Packet>(-1.5703125), x);  // = 0xbfc90000
+  EIGEN_OPTIMIZATION_BARRIER(x)
+  x = pmadd(y, pset1<Packet>(-0.000483989715576171875), x);  // = 0xb9fdc000
+  EIGEN_OPTIMIZATION_BARRIER(x)
+  x = pmadd(y, pset1<Packet>(1.62865035235881805419921875e-07), x);                      // = 0x342ee000
+  x = pmadd(y, pset1<Packet>(5.5644315544167710640977020375430583953857421875e-11), x);  // = 0x2e74b9ee
+
+// For the record, the following set of coefficients maintain 2ULP up
+// to a slightly larger range:
+// const float huge_th = ComputeSine ? 51981.f : 39086.125f;
+// but it slightly fails to maintain 1ULP for two values of sin below pi.
+// x = pmadd(y, pset1<Packet>(-3.140625/2.), x);
+// x = pmadd(y, pset1<Packet>(-0.00048351287841796875), x);
+// x = pmadd(y, pset1<Packet>(-3.13855707645416259765625e-07), x);
+// x = pmadd(y, pset1<Packet>(-6.0771006282767103812147979624569416046142578125e-11), x);
+
+// For the record, with only 3 iterations it is possible to maintain
+// 1 ULP up to 3PI (maybe more) and 2ULP up to 255.
+// The coefficients are: 0xbfc90f80, 0xb7354480, 0x2e74b9ee
+#endif
+
+  if (predux_any(pcmp_le(pset1<Packet>(huge_th), pabs(_x)))) {
+    const int PacketSize = unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float vals[PacketSize];
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) float x_cpy[PacketSize];
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) Eigen::numext::int32_t y_int2[PacketSize];
+    pstoreu(vals, pabs(_x));
+    pstoreu(x_cpy, x);
+    pstoreu(y_int2, y_int);
+    for (int k = 0; k < PacketSize; ++k) {
+      float val = vals[k];
+      if (val >= huge_th && (numext::isfinite)(val)) x_cpy[k] = trig_reduce_huge(val, &y_int2[k]);
+    }
+    x = ploadu<Packet>(x_cpy);
+    y_int = ploadu<PacketI>(y_int2);
+  }
+
+  // Compute the sign to apply to the polynomial.
+  // sin: sign = second_bit(y_int) xor signbit(_x)
+  // cos: sign = second_bit(y_int+1)
+  Packet sign_bit = ComputeSine ? pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)))
+                                : preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
+  sign_bit = pand(sign_bit, cst_sign_mask);  // clear all but left most bit
+
+  // Get the polynomial selection mask from the second bit of y_int
+  // We'll calculate both (sin and cos) polynomials and then select from the two.
+  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(y_int, csti_1), pzero(y_int)));
+
+  Packet x2 = pmul(x, x);
+
+  // Evaluate the cos(x) polynomial. (-Pi/4 <= x <= Pi/4)
+  Packet y1 = pset1<Packet>(2.4372266125283204019069671630859375e-05f);
+  y1 = pmadd(y1, x2, pset1<Packet>(-0.00138865201734006404876708984375f));
+  y1 = pmadd(y1, x2, pset1<Packet>(0.041666619479656219482421875f));
+  y1 = pmadd(y1, x2, pset1<Packet>(-0.5f));
+  y1 = pmadd(y1, x2, pset1<Packet>(1.f));
+
+  // Evaluate the sin(x) polynomial. (Pi/4 <= x <= Pi/4)
+  // octave/matlab code to compute those coefficients:
+  //    x = (0:0.0001:pi/4)';
+  //    A = [x.^3 x.^5 x.^7];
+  //    w = ((1.-(x/(pi/4)).^2).^5)*2000+1;         # weights trading relative accuracy
+  //    c = (A'*diag(w)*A)\(A'*diag(w)*(sin(x)-x)); # weighted LS, linear coeff forced to 1
+  //    printf('%.64f\n %.64f\n%.64f\n', c(3), c(2), c(1))
+  //
+  Packet y2 = pset1<Packet>(-0.0001959234114083702898469196984621021329076029360294342041015625f);
+  y2 = pmadd(y2, x2, pset1<Packet>(0.0083326873655616851693794799871284340042620897293090820312500000f));
+  y2 = pmadd(y2, x2, pset1<Packet>(-0.1666666203982298255503735617821803316473960876464843750000000000f));
+  y2 = pmul(y2, x2);
+  y2 = pmadd(y2, x, x);
+
+  // Select the correct result from the two polynomials.
+  if (ComputeBoth) {
+    Packet peven = peven_mask(x);
+    Packet ysin = pselect(poly_mask, y2, y1);
+    Packet ycos = pselect(poly_mask, y1, y2);
+    Packet sign_bit_sin = pxor(_x, preinterpret<Packet>(plogical_shift_left<30>(y_int)));
+    Packet sign_bit_cos = preinterpret<Packet>(plogical_shift_left<30>(padd(y_int, csti_1)));
+    sign_bit_sin = pand(sign_bit_sin, cst_sign_mask);  // clear all but left most bit
+    sign_bit_cos = pand(sign_bit_cos, cst_sign_mask);  // clear all but left most bit
+    y = pselect(peven, pxor(ysin, sign_bit_sin), pxor(ycos, sign_bit_cos));
+  } else {
+    y = ComputeSine ? pselect(poly_mask, y2, y1) : pselect(poly_mask, y1, y2);
+    y = pxor(y, sign_bit);
+  }
+  // Update the sign and filter huge inputs
+  return y;
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Packet& x) {
+  return psincos_float<true>(x);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x) {
+  return psincos_float<false>(x);
+}
+
+// Trigonometric argument reduction for double for inputs smaller than 15.
+// Reduces trigonometric arguments for double inputs where x < 15. Given an argument x and its corresponding quadrant
+// count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t.
+template <typename Packet>
+Packet trig_reduce_small_double(const Packet& x, const Packet& q) {
+  // Pi/2 split into 2 values
+  const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
+  const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
+
+  Packet t;
+  t = pmadd(cst_pio2_a, q, x);
+  t = pmadd(cst_pio2_b, q, t);
+  return t;
+}
+
+// Trigonometric argument reduction for double for inputs smaller than 1e14.
+// Reduces trigonometric arguments for double inputs where x < 1e14. Given an argument x and its corresponding quadrant
+// count n, the function computes and returns the reduced argument t such that x = n * pi/2 + t.
+template <typename Packet>
+Packet trig_reduce_medium_double(const Packet& x, const Packet& q_high, const Packet& q_low) {
+  // Pi/2 split into 4 values
+  const Packet cst_pio2_a = pset1<Packet>(-1.570796325802803);
+  const Packet cst_pio2_b = pset1<Packet>(-9.920935184482005e-10);
+  const Packet cst_pio2_c = pset1<Packet>(-6.123234014771656e-17);
+  const Packet cst_pio2_d = pset1<Packet>(1.903488962019325e-25);
+
+  Packet t;
+  t = pmadd(cst_pio2_a, q_high, x);
+  t = pmadd(cst_pio2_a, q_low, t);
+  t = pmadd(cst_pio2_b, q_high, t);
+  t = pmadd(cst_pio2_b, q_low, t);
+  t = pmadd(cst_pio2_c, q_high, t);
+  t = pmadd(cst_pio2_c, q_low, t);
+  t = pmadd(cst_pio2_d, padd(q_low, q_high), t);
+  return t;
+}
+
+template <bool ComputeSine, typename Packet, bool ComputeBoth = false>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+#if EIGEN_COMP_GNUC_STRICT
+    __attribute__((optimize("-fno-unsafe-math-optimizations")))
+#endif
+    Packet
+    psincos_double(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::integer_packet PacketI;
+  typedef typename unpacket_traits<PacketI>::type ScalarI;
+
+  const Packet cst_sign_mask = pset1frombits<Packet>(static_cast<Eigen::numext::uint64_t>(0x8000000000000000u));
+
+  // If the argument is smaller than this value, use a simpler argument reduction
+  const double small_th = 15;
+  // If the argument is bigger than this value, use the non-vectorized std version
+  const double huge_th = 1e14;
+
+  const Packet cst_2oPI = pset1<Packet>(0.63661977236758134307553505349006);  // 2/PI
+  // Integer Packet constants
+  const PacketI cst_one = pset1<PacketI>(ScalarI(1));
+  // Constant for splitting
+  const Packet cst_split = pset1<Packet>(1 << 24);
+
+  Packet x_abs = pabs(x);
+
+  // Scale x by 2/Pi
+  PacketI q_int;
+  Packet s;
+
+  // TODO Implement huge angle argument reduction
+  if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(small_th), x_abs)))) {
+    Packet q_high = pmul(pfloor(pmul(x_abs, pdiv(cst_2oPI, cst_split))), cst_split);
+    Packet q_low_noround = psub(pmul(x_abs, cst_2oPI), q_high);
+    q_int = pcast<Packet, PacketI>(padd(q_low_noround, pset1<Packet>(0.5)));
+    Packet q_low = pcast<PacketI, Packet>(q_int);
+    s = trig_reduce_medium_double(x_abs, q_high, q_low);
+  } else {
+    Packet qval_noround = pmul(x_abs, cst_2oPI);
+    q_int = pcast<Packet, PacketI>(padd(qval_noround, pset1<Packet>(0.5)));
+    Packet q = pcast<PacketI, Packet>(q_int);
+    s = trig_reduce_small_double(x_abs, q);
+  }
+
+  // All the upcoming approximating polynomials have even exponents
+  Packet ss = pmul(s, s);
+
+  // Padé approximant of cos(x)
+  // Assuring < 1 ULP error on the interval [-pi/4, pi/4]
+  // cos(x) ~= (80737373*x^8 - 13853547000*x^6 + 727718024880*x^4 - 11275015752000*x^2 + 23594700729600)/(147173*x^8 +
+  // 39328920*x^6 + 5772800880*x^4 + 522334612800*x^2 + 23594700729600)
+  // MATLAB code to compute those coefficients:
+  //    syms x;
+  //    cosf = @(x) cos(x);
+  //    pade_cosf = pade(cosf(x), x, 0, 'Order', 8)
+  Packet sc1_num = pmadd(ss, pset1<Packet>(80737373), pset1<Packet>(-13853547000));
+  Packet sc2_num = pmadd(sc1_num, ss, pset1<Packet>(727718024880));
+  Packet sc3_num = pmadd(sc2_num, ss, pset1<Packet>(-11275015752000));
+  Packet sc4_num = pmadd(sc3_num, ss, pset1<Packet>(23594700729600));
+  Packet sc1_denum = pmadd(ss, pset1<Packet>(147173), pset1<Packet>(39328920));
+  Packet sc2_denum = pmadd(sc1_denum, ss, pset1<Packet>(5772800880));
+  Packet sc3_denum = pmadd(sc2_denum, ss, pset1<Packet>(522334612800));
+  Packet sc4_denum = pmadd(sc3_denum, ss, pset1<Packet>(23594700729600));
+  Packet scos = pdiv(sc4_num, sc4_denum);
+
+  // Padé approximant of sin(x)
+  // Assuring < 1 ULP error on the interval [-pi/4, pi/4]
+  // sin(x) ~= (x*(4585922449*x^8 - 1066023933480*x^6 + 83284044283440*x^4 - 2303682236856000*x^2 +
+  // 15605159573203200))/(45*(1029037*x^8 + 345207016*x^6 + 61570292784*x^4 + 6603948711360*x^2 + 346781323848960))
+  // MATLAB code to compute those coefficients:
+  //    syms x;
+  //    sinf = @(x) sin(x);
+  //    pade_sinf = pade(sinf(x), x, 0, 'Order', 8, 'OrderMode', 'relative')
+  Packet ss1_num = pmadd(ss, pset1<Packet>(4585922449), pset1<Packet>(-1066023933480));
+  Packet ss2_num = pmadd(ss1_num, ss, pset1<Packet>(83284044283440));
+  Packet ss3_num = pmadd(ss2_num, ss, pset1<Packet>(-2303682236856000));
+  Packet ss4_num = pmadd(ss3_num, ss, pset1<Packet>(15605159573203200));
+  Packet ss1_denum = pmadd(ss, pset1<Packet>(1029037), pset1<Packet>(345207016));
+  Packet ss2_denum = pmadd(ss1_denum, ss, pset1<Packet>(61570292784));
+  Packet ss3_denum = pmadd(ss2_denum, ss, pset1<Packet>(6603948711360));
+  Packet ss4_denum = pmadd(ss3_denum, ss, pset1<Packet>(346781323848960));
+  Packet ssin = pdiv(pmul(s, ss4_num), pmul(pset1<Packet>(45), ss4_denum));
+
+  Packet poly_mask = preinterpret<Packet>(pcmp_eq(pand(q_int, cst_one), pzero(q_int)));
+
+  Packet sign_sin = pxor(x, preinterpret<Packet>(plogical_shift_left<62>(q_int)));
+  Packet sign_cos = preinterpret<Packet>(plogical_shift_left<62>(padd(q_int, cst_one)));
+  Packet sign_bit, sFinalRes;
+  if (ComputeBoth) {
+    Packet peven = peven_mask(x);
+    sign_bit = pselect((s), sign_sin, sign_cos);
+    sFinalRes = pselect(pxor(peven, poly_mask), ssin, scos);
+  } else {
+    sign_bit = ComputeSine ? sign_sin : sign_cos;
+    sFinalRes = ComputeSine ? pselect(poly_mask, ssin, scos) : pselect(poly_mask, scos, ssin);
+  }
+  sign_bit = pand(sign_bit, cst_sign_mask);  // clear all but left most bit
+  sFinalRes = pxor(sFinalRes, sign_bit);
+
+  // If the inputs values are higher than that a value that the argument reduction can currently address, compute them
+  // using std::sin and std::cos
+  // TODO Remove it when huge angle argument reduction is implemented
+  if (EIGEN_PREDICT_FALSE(predux_any(pcmp_le(pset1<Packet>(huge_th), x_abs)))) {
+    const int PacketSize = unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) double sincos_vals[PacketSize];
+    EIGEN_ALIGN_TO_BOUNDARY(sizeof(Packet)) double x_cpy[PacketSize];
+    pstoreu(x_cpy, x);
+    pstoreu(sincos_vals, sFinalRes);
+    for (int k = 0; k < PacketSize; ++k) {
+      double val = x_cpy[k];
+      if (std::abs(val) > huge_th && (numext::isfinite)(val)) {
+        if (ComputeBoth)
+          sincos_vals[k] = k % 2 == 0 ? std::sin(val) : std::cos(val);
+        else
+          sincos_vals[k] = ComputeSine ? std::sin(val) : std::cos(val);
+      }
+    }
+    sFinalRes = ploadu<Packet>(sincos_vals);
+  }
+  return sFinalRes;
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x) {
+  return psincos_double<true>(x);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x) {
+  return psincos_double<false>(x);
+}
+
+// Generic implementation of acos(x).
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet& x_in) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
+
+  const Packet cst_one = pset1<Packet>(Scalar(1));
+  const Packet cst_pi = pset1<Packet>(Scalar(EIGEN_PI));
+  const Packet p6 = pset1<Packet>(Scalar(2.36423197202384471893310546875e-3));
+  const Packet p5 = pset1<Packet>(Scalar(-1.1368644423782825469970703125e-2));
+  const Packet p4 = pset1<Packet>(Scalar(2.717843465507030487060546875e-2));
+  const Packet p3 = pset1<Packet>(Scalar(-4.8969544470310211181640625e-2));
+  const Packet p2 = pset1<Packet>(Scalar(8.8804088532924652099609375e-2));
+  const Packet p1 = pset1<Packet>(Scalar(-0.214591205120086669921875));
+  const Packet p0 = pset1<Packet>(Scalar(1.57079637050628662109375));
+
+  // For x in [0:1], we approximate acos(x)/sqrt(1-x), which is a smooth
+  // function, by a 6'th order polynomial.
+  // For x in [-1:0) we use that acos(-x) = pi - acos(x).
+  const Packet neg_mask = psignbit(x_in);
+  const Packet abs_x = pabs(x_in);
+
+  // Evaluate the polynomial using Horner's rule:
+  //   P(x) = p0 + x * (p1 +  x * (p2 + ... (p5 + x * p6)) ... ) .
+  // We evaluate even and odd terms independently to increase
+  // instruction level parallelism.
+  Packet x2 = pmul(x_in, x_in);
+  Packet p_even = pmadd(p6, x2, p4);
+  Packet p_odd = pmadd(p5, x2, p3);
+  p_even = pmadd(p_even, x2, p2);
+  p_odd = pmadd(p_odd, x2, p1);
+  p_even = pmadd(p_even, x2, p0);
+  Packet p = pmadd(p_odd, abs_x, p_even);
+
+  // The polynomial approximates acos(x)/sqrt(1-x), so
+  // multiply by sqrt(1-x) to get acos(x).
+  // Conveniently returns NaN for arguments outside [-1:1].
+  Packet denom = psqrt(psub(cst_one, abs_x));
+  Packet result = pmul(denom, p);
+  // Undo mapping for negative arguments.
+  return pselect(neg_mask, psub(cst_pi, result), result);
+}
+
+// Generic implementation of asin(x).
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x_in) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
+
+  constexpr float kPiOverTwo = static_cast<float>(EIGEN_PI / 2);
+
+  const Packet cst_half = pset1<Packet>(0.5f);
+  const Packet cst_one = pset1<Packet>(1.0f);
+  const Packet cst_two = pset1<Packet>(2.0f);
+  const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
+
+  const Packet abs_x = pabs(x_in);
+  const Packet sign_mask = pandnot(x_in, abs_x);
+  const Packet invalid_mask = pcmp_lt(cst_one, abs_x);
+
+  // For arguments |x| > 0.5, we map x back to [0:0.5] using
+  // the transformation x_large = sqrt(0.5*(1-x)), and use the
+  // identity
+  //   asin(x) = pi/2 - 2 * asin( sqrt( 0.5 * (1 - x)))
+
+  const Packet x_large = psqrt(pnmadd(cst_half, abs_x, cst_half));
+  const Packet large_mask = pcmp_lt(cst_half, abs_x);
+  const Packet x = pselect(large_mask, x_large, abs_x);
+  const Packet x2 = pmul(x, x);
+
+  // For |x| < 0.5 approximate asin(x)/x by an 8th order polynomial with
+  // even terms only.
+  constexpr float alpha[] = {5.08838854730129241943359375e-2f, 3.95139865577220916748046875e-2f,
+                             7.550220191478729248046875e-2f, 0.16664917767047882080078125f, 1.00000011920928955078125f};
+  Packet p = ppolevl<Packet, 4>::run(x2, alpha);
+  p = pmul(p, x);
+
+  const Packet p_large = pnmadd(cst_two, p, cst_pi_over_two);
+  p = pselect(large_mask, p_large, p);
+  // Flip the sign for negative arguments.
+  p = pxor(p, sign_mask);
+  // Return NaN for arguments outside [-1:1].
+  return por(invalid_mask, p);
+}
+
+template <typename Scalar>
+struct patan_reduced {
+  template <typename Packet>
+  static EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet run(const Packet& x);
+};
+
+template <>
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<double>::run(const Packet& x) {
+  constexpr double alpha[] = {2.6667153866462208e-05, 3.0917513112462781e-03, 5.2574296781008604e-02,
+                              3.0409318473444424e-01, 7.5365702534987022e-01, 8.2704055405494614e-01,
+                              3.3004361289279920e-01};
+
+  constexpr double beta[] = {
+      2.7311202462436667e-04, 1.0899150928962708e-02, 1.1548932646420353e-01, 4.9716458728465573e-01, 1.0,
+      9.3705509168587852e-01, 3.3004361289279920e-01};
+
+  Packet x2 = pmul(x, x);
+  Packet p = ppolevl<Packet, 6>::run(x2, alpha);
+  Packet q = ppolevl<Packet, 6>::run(x2, beta);
+  return pmul(x, pdiv(p, q));
+}
+
+// Computes elementwise atan(x) for x in [-1:1] with 2 ulp accuracy.
+template <>
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patan_reduced<float>::run(const Packet& x) {
+  constexpr float alpha[] = {1.12026982009410858154296875e-01f, 7.296695709228515625e-01f, 8.109951019287109375e-01f};
+
+  constexpr float beta[] = {1.00917108356952667236328125e-02f, 2.8318560123443603515625e-01f, 1.0f,
+                            8.109951019287109375e-01f};
+
+  Packet x2 = pmul(x, x);
+  Packet p = ppolevl<Packet, 2>::run(x2, alpha);
+  Packet q = ppolevl<Packet, 3>::run(x2, beta);
+  return pmul(x, pdiv(p, q));
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_atan(const Packet& x_in) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+
+  constexpr Scalar kPiOverTwo = static_cast<Scalar>(EIGEN_PI / 2);
+
+  const Packet cst_signmask = pset1<Packet>(Scalar(-0.0));
+  const Packet cst_one = pset1<Packet>(Scalar(1));
+  const Packet cst_pi_over_two = pset1<Packet>(kPiOverTwo);
+
+  //   "Large": For |x| > 1, use atan(1/x) = sign(x)*pi/2 - atan(x).
+  //   "Small": For |x| <= 1, approximate atan(x) directly by a polynomial
+  //            calculated using Rminimax.
+
+  const Packet abs_x = pabs(x_in);
+  const Packet x_signmask = pand(x_in, cst_signmask);
+  const Packet large_mask = pcmp_lt(cst_one, abs_x);
+  const Packet x = pselect(large_mask, preciprocal(abs_x), abs_x);
+  const Packet p = patan_reduced<Scalar>::run(x);
+  // Apply transformations according to the range reduction masks.
+  Packet result = pselect(large_mask, psub(cst_pi_over_two, p), p);
+  // Return correct sign
+  return pxor(result, x_signmask);
+}
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise)
+    Doesn't do anything fancy, just a 9/8-degree rational interpolant which
+    is accurate up to a couple of ulps in the (approximate) range [-8, 8],
+    outside of which tanh(x) = +/-1 in single precision. The input is clamped
+    to the range [-c, c]. The value c is chosen as the smallest value where
+    the approximation evaluates to exactly 1.
+
+    This implementation works on both scalars and packets.
+*/
+template <typename T>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_float(const T& a_x) {
+  // Clamp the inputs to the range [-c, c] and set everything
+  // outside that range to 1.0. The value c is chosen as the smallest
+  // floating point argument such that the approximation is exactly 1.
+  // This saves clamping the value at the end.
+#ifdef EIGEN_VECTORIZE_FMA
+  const T plus_clamp = pset1<T>(8.01773357391357422f);
+  const T minus_clamp = pset1<T>(-8.01773357391357422f);
+#else
+  const T plus_clamp = pset1<T>(7.90738964080810547f);
+  const T minus_clamp = pset1<T>(-7.90738964080810547f);
+#endif
+  const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
+
+  // The following rational approximation was generated by rminimax
+  // (https://gitlab.inria.fr/sfilip/rminimax) using the following
+  // command:
+  // $ ratapprox --function="tanh(x)" --dom='[-8.67,8.67]' --num="odd"
+  //   --den="even" --type="[9,8]" --numF="[SG]" --denF="[SG]" --log
+  //   --output=tanhf.sollya --dispCoeff="dec"
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  constexpr float alpha[] = {1.394553628e-8f, 2.102733560e-5f, 3.520756727e-3f, 1.340216100e-1f};
+
+  // The monomial coefficients of the denominator polynomial (even).
+  constexpr float beta[] = {8.015776984e-7f, 3.326951409e-4f, 2.597254514e-2f, 4.673548340e-1f, 1.0f};
+
+  // Since the polynomials are odd/even, we need x^2.
+  const T x2 = pmul(x, x);
+  const T x3 = pmul(x2, x);
+
+  T p = ppolevl<T, 3>::run(x2, alpha);
+  T q = ppolevl<T, 4>::run(x2, beta);
+  // Take advantage of the fact that the constant term in p is 1 to compute
+  // x*(x^2*p + 1) = x^3 * p + x.
+  p = pmadd(x3, p, x);
+
+  // Divide the numerator by the denominator.
+  return pdiv(p, q);
+}
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise)
+    This uses a 19/18-degree rational interpolant which
+    is accurate up to a couple of ulps in the (approximate) range [-18.7, 18.7],
+    outside of which tanh(x) = +/-1 in single precision. The input is clamped
+    to the range [-c, c]. The value c is chosen as the smallest value where
+    the approximation evaluates to exactly 1.
+
+    This implementation works on both scalars and packets.
+*/
+template <typename T>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS T ptanh_double(const T& a_x) {
+  // Clamp the inputs to the range [-c, c] and set everything
+  // outside that range to 1.0. The value c is chosen as the smallest
+  // floating point argument such that the approximation is exactly 1.
+  // This saves clamping the value at the end.
+#ifdef EIGEN_VECTORIZE_FMA
+  const T plus_clamp = pset1<T>(17.6610191624600077);
+  const T minus_clamp = pset1<T>(-17.6610191624600077);
+#else
+  const T plus_clamp = pset1<T>(17.714196154005176);
+  const T minus_clamp = pset1<T>(-17.714196154005176);
+#endif
+  const T x = pmax(pmin(a_x, plus_clamp), minus_clamp);
+
+  // The following rational approximation was generated by rminimax
+  // (https://gitlab.inria.fr/sfilip/rminimax) using the following
+  // command:
+  // $ ./ratapprox --function="tanh(x)" --dom='[-18.72,18.72]'
+  //   --num="odd" --den="even" --type="[19,18]" --numF="[D]"
+  //   --denF="[D]" --log --output=tanh.sollya --dispCoeff="dec"
+
+  // The monomial coefficients of the numerator polynomial (odd).
+  constexpr double alpha[] = {2.6158007860482230e-23, 7.6534862268749319e-19, 3.1309488231386680e-15,
+                              4.2303918148209176e-12, 2.4618379131293676e-09, 6.8644367682497074e-07,
+                              9.3839087674268880e-05, 5.9809711724441161e-03, 1.5184719640284322e-01};
+
+  // The monomial coefficients of the denominator polynomial (even).
+  constexpr double beta[] = {6.463747022670968018e-21, 5.782506856739003571e-17,
+                             1.293019623712687916e-13, 1.123643448069621992e-10,
+                             4.492975677839633985e-08, 8.785185266237658698e-06,
+                             8.295161192716231542e-04, 3.437448108450402717e-02,
+                             4.851805297361760360e-01, 1.0};
+
+  // Since the polynomials are odd/even, we need x^2.
+  const T x2 = pmul(x, x);
+  const T x3 = pmul(x2, x);
+
+  // Interleave the evaluation of the numerator polynomial p and
+  // denominator polynomial q.
+  T p = ppolevl<T, 8>::run(x2, alpha);
+  T q = ppolevl<T, 9>::run(x2, beta);
+  // Take advantage of the fact that the constant term in p is 1 to compute
+  // x*(x^2*p + 1) = x^3 * p + x.
+  p = pmadd(x3, p, x);
+
+  // Divide the numerator by the denominator.
+  return pdiv(p, q);
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_float(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, float>::value, "Scalar type must be float");
+
+  // For |x| in [0:0.5] we use a polynomial approximation of the form
+  // P(x) = x + x^3*(alpha[4] + x^2 * (alpha[3] + x^2 * (... x^2 * alpha[0]) ... )).
+  constexpr float alpha[] = {0.1819281280040740966796875f, 8.2311116158962249755859375e-2f,
+                             0.14672131836414337158203125f, 0.1997792422771453857421875f, 0.3333373963832855224609375f};
+  const Packet x2 = pmul(x, x);
+  const Packet x3 = pmul(x, x2);
+  Packet p = ppolevl<Packet, 4>::run(x2, alpha);
+  p = pmadd(x3, p, x);
+
+  // For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
+  const Packet half = pset1<Packet>(0.5f);
+  const Packet one = pset1<Packet>(1.0f);
+  Packet r = pdiv(padd(one, x), psub(one, x));
+  r = pmul(half, plog(r));
+
+  const Packet x_gt_half = pcmp_le(half, pabs(x));
+  const Packet x_eq_one = pcmp_eq(one, pabs(x));
+  const Packet x_gt_one = pcmp_lt(one, pabs(x));
+  const Packet sign_mask = pset1<Packet>(-0.0f);
+  const Packet x_sign = pand(sign_mask, x);
+  const Packet inf = pset1<Packet>(std::numeric_limits<float>::infinity());
+  return por(x_gt_one, pselect(x_eq_one, por(x_sign, inf), pselect(x_gt_half, r, p)));
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_double(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static_assert(std::is_same<Scalar, double>::value, "Scalar type must be double");
+  // For x in [-0.5:0.5] we use a rational approximation of the form
+  // R(x) = x + x^3*P(x^2)/Q(x^2), where P is or order 4 and Q is of order 5.
+  constexpr double alpha[] = {3.3071338469301391e-03, -4.7129526768798737e-02, 1.8185306179826699e-01,
+                              -2.5949536095445679e-01, 1.2306328729812676e-01};
+
+  constexpr double beta[] = {-3.8679974580640881e-03, 7.6391885763341910e-02,  -4.2828141436397615e-01,
+                             9.8733495886883648e-01,  -1.0000000000000000e+00, 3.6918986189438030e-01};
+
+  const Packet x2 = pmul(x, x);
+  const Packet x3 = pmul(x, x2);
+  Packet p = ppolevl<Packet, 4>::run(x2, alpha);
+  Packet q = ppolevl<Packet, 5>::run(x2, beta);
+  Packet y_small = pmadd(x3, pdiv(p, q), x);
+
+  // For |x| in ]0.5:1.0] we use atanh = 0.5*ln((1+x)/(1-x));
+  const Packet half = pset1<Packet>(0.5);
+  const Packet one = pset1<Packet>(1.0);
+  Packet y_large = pdiv(padd(one, x), psub(one, x));
+  y_large = pmul(half, plog(y_large));
+
+  const Packet x_gt_half = pcmp_le(half, pabs(x));
+  const Packet x_eq_one = pcmp_eq(one, pabs(x));
+  const Packet x_gt_one = pcmp_lt(one, pabs(x));
+  const Packet sign_mask = pset1<Packet>(-0.0);
+  const Packet x_sign = pand(sign_mask, x);
+  const Packet inf = pset1<Packet>(std::numeric_limits<double>::infinity());
+  return por(x_gt_one, pselect(x_eq_one, por(x_sign, inf), pselect(x_gt_half, y_large, y_small)));
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Packet& x, const Packet& y) {
+  typedef typename unpacket_traits<Packet>::as_real RealPacket;
+  // In the following we annotate the code for the case where the inputs
+  // are a pair length-2 SIMD vectors representing a single pair of complex
+  // numbers x = a + i*b, y = c + i*d.
+  const RealPacket y_abs = pabs(y.v);                        // |c|, |d|
+  const RealPacket y_abs_flip = pcplxflip(Packet(y_abs)).v;  // |d|, |c|
+  const RealPacket y_max = pmax(y_abs, y_abs_flip);          // max(|c|, |d|), max(|c|, |d|)
+  const RealPacket y_scaled = pdiv(y.v, y_max);              // c / max(|c|, |d|), d / max(|c|, |d|)
+  // Compute scaled denominator.
+  const RealPacket y_scaled_sq = pmul(y_scaled, y_scaled);  // c'**2, d'**2
+  const RealPacket denom = padd(y_scaled_sq, pcplxflip(Packet(y_scaled_sq)).v);
+  Packet result_scaled = pmul(x, pconj(Packet(y_scaled)));  // a * c' + b * d', -a * d + b * c
+  // Divide elementwise by denom.
+  result_scaled = Packet(pdiv(result_scaled.v, denom));
+  // Rescale result
+  return Packet(pdiv(result_scaled.v, y_max));
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_complex(const Packet& x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename Scalar::value_type RealScalar;
+  typedef typename unpacket_traits<Packet>::as_real RealPacket;
+
+  RealPacket real_mask_rp = peven_mask(x.v);
+  Packet real_mask(real_mask_rp);
+
+  // Real part
+  RealPacket x_flip = pcplxflip(x).v;  // b, a
+  Packet x_norm = phypot_complex(x);   // sqrt(a^2 + b^2), sqrt(a^2 + b^2)
+  RealPacket xlogr = plog(x_norm.v);   // log(sqrt(a^2 + b^2)), log(sqrt(a^2 + b^2))
+
+  // Imag part
+  RealPacket ximg = patan2(x.v, x_flip);  // atan2(a, b), atan2(b, a)
+
+  const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
+  RealPacket x_abs = pabs(x.v);
+  RealPacket is_x_pos_inf = pcmp_eq(x_abs, cst_pos_inf);
+  RealPacket is_y_pos_inf = pcplxflip(Packet(is_x_pos_inf)).v;
+  RealPacket is_any_inf = por(is_x_pos_inf, is_y_pos_inf);
+  RealPacket xreal = pselect(is_any_inf, cst_pos_inf, xlogr);
+
+  Packet xres = pselect(real_mask, Packet(xreal), Packet(ximg));  // log(sqrt(a^2 + b^2)), atan2(b, a)
+  return xres;
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_complex(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::as_real RealPacket;
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename Scalar::value_type RealScalar;
+  const RealPacket even_mask = peven_mask(a.v);
+  const RealPacket odd_mask = pcplxflip(Packet(even_mask)).v;
+
+  // Let a = x + iy.
+  // exp(a) = exp(x) * cis(y), plus some special edge-case handling.
+
+  // exp(x):
+  RealPacket x = pand(a.v, even_mask);
+  x = por(x, pcplxflip(Packet(x)).v);
+  RealPacket expx = pexp(x);  // exp(x);
+
+  // cis(y):
+  RealPacket y = pand(odd_mask, a.v);
+  y = por(y, pcplxflip(Packet(y)).v);
+  RealPacket cisy = psincos_float<false, RealPacket, true>(y);
+  cisy = pcplxflip(Packet(cisy)).v;  // cos(y) + i * sin(y)
+
+  const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
+  const RealPacket cst_neg_inf = pset1<RealPacket>(-NumTraits<RealScalar>::infinity());
+
+  // If x is -inf, we know that cossin(y) is bounded,
+  //   so the result is (0, +/-0), where the sign of the imaginary part comes
+  //   from the sign of cossin(y).
+  RealPacket cisy_sign = por(pandnot(cisy, pabs(cisy)), pset1<RealPacket>(RealScalar(1)));
+  cisy = pselect(pcmp_eq(x, cst_neg_inf), cisy_sign, cisy);
+
+  // If x is inf, and cos(y) has unknown sign (y is inf or NaN), the result
+  // is (+/-inf, NaN), where the signs are undetermined (take the sign of y).
+  RealPacket y_sign = por(pandnot(y, pabs(y)), pset1<RealPacket>(RealScalar(1)));
+  cisy = pselect(pand(pcmp_eq(x, cst_pos_inf), pisnan(cisy)), pand(y_sign, even_mask), cisy);
+  Packet result = Packet(pmul(expx, cisy));
+
+  // If y is +/- 0, the input is real, so take the real result for consistency.
+  result = pselect(Packet(pcmp_eq(y, pzero(y))), Packet(por(pand(expx, even_mask), pand(y, odd_mask))), result);
+
+  return result;
+}
+
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt_complex(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename Scalar::value_type RealScalar;
+  typedef typename unpacket_traits<Packet>::as_real RealPacket;
+
+  // Computes the principal sqrt of the complex numbers in the input.
+  //
+  // For example, for packets containing 2 complex numbers stored in interleaved format
+  //    a = [a0, a1] = [x0, y0, x1, y1],
+  // where x0 = real(a0), y0 = imag(a0) etc., this function returns
+  //    b = [b0, b1] = [u0, v0, u1, v1],
+  // such that b0^2 = a0, b1^2 = a1.
+  //
+  // To derive the formula for the complex square roots, let's consider the equation for
+  // a single complex square root of the number x + i*y. We want to find real numbers
+  // u and v such that
+  //    (u + i*v)^2 = x + i*y  <=>
+  //    u^2 - v^2 + i*2*u*v = x + i*v.
+  // By equating the real and imaginary parts we get:
+  //    u^2 - v^2 = x
+  //    2*u*v = y.
+  //
+  // For x >= 0, this has the numerically stable solution
+  //    u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
+  //    v = 0.5 * (y / u)
+  // and for x < 0,
+  //    v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
+  //    u = 0.5 * (y / v)
+  //
+  //  To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as
+  //     l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2) ,
+
+  // In the following, without lack of generality, we have annotated the code, assuming
+  // that the input is a packet of 2 complex numbers.
+  //
+  // Step 1. Compute l = [l0, l0, l1, l1], where
+  //    l0 = sqrt(x0^2 + y0^2),  l1 = sqrt(x1^2 + y1^2)
+  // To avoid over- and underflow, we use the stable formula for each hypotenuse
+  //    l0 = (min0 == 0 ? max0 : max0 * sqrt(1 + (min0/max0)**2)),
+  // where max0 = max(|x0|, |y0|), min0 = min(|x0|, |y0|), and similarly for l1.
+
+  RealPacket a_abs = pabs(a.v);                        // [|x0|, |y0|, |x1|, |y1|]
+  RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v;  // [|y0|, |x0|, |y1|, |x1|]
+  RealPacket a_max = pmax(a_abs, a_abs_flip);
+  RealPacket a_min = pmin(a_abs, a_abs_flip);
+  RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min));
+  RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max));
+  RealPacket r = pdiv(a_min, a_max);
+  const RealPacket cst_one = pset1<RealPacket>(RealScalar(1));
+  RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r))));  // [l0, l0, l1, l1]
+  // Set l to a_max if a_min is zero.
+  l = pselect(a_min_zero_mask, a_max, l);
+
+  // Step 2. Compute [rho0, *, rho1, *], where
+  // rho0 = sqrt(0.5 * (l0 + |x0|)), rho1 =  sqrt(0.5 * (l1 + |x1|))
+  // We don't care about the imaginary parts computed here. They will be overwritten later.
+  const RealPacket cst_half = pset1<RealPacket>(RealScalar(0.5));
+  Packet rho;
+  rho.v = psqrt(pmul(cst_half, padd(a_abs, l)));
+
+  // Step 3. Compute [rho0, eta0, rho1, eta1], where
+  // eta0 = (y0 / l0) / 2, and eta1 = (y1 / l1) / 2.
+  // set eta = 0 of input is 0 + i0.
+  RealPacket eta = pandnot(pmul(cst_half, pdiv(a.v, pcplxflip(rho).v)), a_max_zero_mask);
+  RealPacket real_mask = peven_mask(a.v);
+  Packet positive_real_result;
+  // Compute result for inputs with positive real part.
+  positive_real_result.v = pselect(real_mask, rho.v, eta);
+
+  // Step 4. Compute solution for inputs with negative real part:
+  //         [|eta0|, sign(y0)*rho0, |eta1|, sign(y1)*rho1]
+  const RealPacket cst_imag_sign_mask = pset1<Packet>(Scalar(RealScalar(0.0), RealScalar(-0.0))).v;
+  RealPacket imag_signs = pand(a.v, cst_imag_sign_mask);
+  Packet negative_real_result;
+  // Notice that rho is positive, so taking it's absolute value is a noop.
+  negative_real_result.v = por(pabs(pcplxflip(positive_real_result).v), imag_signs);
+
+  // Step 5. Select solution branch based on the sign of the real parts.
+  Packet negative_real_mask;
+  negative_real_mask.v = pcmp_lt(pand(real_mask, a.v), pzero(a.v));
+  negative_real_mask.v = por(negative_real_mask.v, pcplxflip(negative_real_mask).v);
+  Packet result = pselect(negative_real_mask, negative_real_result, positive_real_result);
+
+  // Step 6. Handle special cases for infinities:
+  // * If z is (x,+∞), the result is (+∞,+∞) even if x is NaN
+  // * If z is (x,-∞), the result is (+∞,-∞) even if x is NaN
+  // * If z is (-∞,y), the result is (0*|y|,+∞) for finite or NaN y
+  // * If z is (+∞,y), the result is (+∞,0*|y|) for finite or NaN y
+  const RealPacket cst_pos_inf = pset1<RealPacket>(NumTraits<RealScalar>::infinity());
+  Packet is_inf;
+  is_inf.v = pcmp_eq(a_abs, cst_pos_inf);
+  Packet is_real_inf;
+  is_real_inf.v = pand(is_inf.v, real_mask);
+  is_real_inf = por(is_real_inf, pcplxflip(is_real_inf));
+  // prepare packet of (+∞,0*|y|) or (0*|y|,+∞), depending on the sign of the infinite real part.
+  Packet real_inf_result;
+  real_inf_result.v = pmul(a_abs, pset1<Packet>(Scalar(RealScalar(1.0), RealScalar(0.0))).v);
+  real_inf_result.v = pselect(negative_real_mask.v, pcplxflip(real_inf_result).v, real_inf_result.v);
+  // prepare packet of (+∞,+∞) or (+∞,-∞), depending on the sign of the infinite imaginary part.
+  Packet is_imag_inf;
+  is_imag_inf.v = pandnot(is_inf.v, real_mask);
+  is_imag_inf = por(is_imag_inf, pcplxflip(is_imag_inf));
+  Packet imag_inf_result;
+  imag_inf_result.v = por(pand(cst_pos_inf, real_mask), pandnot(a.v, real_mask));
+  // unless otherwise specified, if either the real or imaginary component is nan, the entire result is nan
+  Packet result_is_nan = pisnan(result);
+  result = por(result_is_nan, result);
+
+  return pselect(is_imag_inf, imag_inf_result, pselect(is_real_inf, real_inf_result, result));
+}
+
+// \internal \returns the norm of a complex number z = x + i*y, defined as sqrt(x^2 + y^2).
+// Implemented using the hypot(a,b) algorithm from https://doi.org/10.48550/arXiv.1904.09481
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet phypot_complex(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  typedef typename Scalar::value_type RealScalar;
+  typedef typename unpacket_traits<Packet>::as_real RealPacket;
+
+  const RealPacket cst_zero_rp = pset1<RealPacket>(static_cast<RealScalar>(0.0));
+  const RealPacket cst_minus_one_rp = pset1<RealPacket>(static_cast<RealScalar>(-1.0));
+  const RealPacket cst_two_rp = pset1<RealPacket>(static_cast<RealScalar>(2.0));
+  const RealPacket evenmask = peven_mask(a.v);
+
+  RealPacket a_abs = pabs(a.v);
+  RealPacket a_flip = pcplxflip(Packet(a_abs)).v;       // |b|, |a|
+  RealPacket a_all = pselect(evenmask, a_abs, a_flip);  // |a|, |a|
+  RealPacket b_all = pselect(evenmask, a_flip, a_abs);  // |b|, |b|
+
+  RealPacket a2 = pmul(a.v, a.v);                    // |a^2, b^2|
+  RealPacket a2_flip = pcplxflip(Packet(a2)).v;      // |b^2, a^2|
+  RealPacket h = psqrt(padd(a2, a2_flip));           // |sqrt(a^2 + b^2), sqrt(a^2 + b^2)|
+  RealPacket h_sq = pmul(h, h);                      // |a^2 + b^2, a^2 + b^2|
+  RealPacket a_sq = pselect(evenmask, a2, a2_flip);  // |a^2, a^2|
+  RealPacket m_h_sq = pmul(h_sq, cst_minus_one_rp);
+  RealPacket m_a_sq = pmul(a_sq, cst_minus_one_rp);
+  RealPacket x = psub(psub(pmadd(h, h, m_h_sq), pmadd(b_all, b_all, psub(a_sq, h_sq))), pmadd(a_all, a_all, m_a_sq));
+  h = psub(h, pdiv(x, pmul(cst_two_rp, h)));  // |h - x/(2*h), h - x/(2*h)|
+
+  // handle zero-case
+  RealPacket iszero = pcmp_eq(por(a_abs, a_flip), cst_zero_rp);
+
+  h = pandnot(h, iszero);  // |sqrt(a^2+b^2), sqrt(a^2+b^2)|
+  return Packet(h);        // |sqrt(a^2+b^2), sqrt(a^2+b^2)|
+}
+
+template <typename Packet>
+struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
+                                           !NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
+                                           !NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
+    using Scalar = typename unpacket_traits<Packet>::type;
+    const Packet cst_one = pset1<Packet>(Scalar(1));
+    const Packet cst_zero = pzero(a);
+
+    const Packet abs_a = pabs(a);
+    const Packet sign_mask = pandnot(a, abs_a);
+    const Packet nonzero_mask = pcmp_lt(cst_zero, abs_a);
+
+    return pselect(nonzero_mask, por(sign_mask, cst_one), abs_a);
+  }
+};
+
+template <typename Packet>
+struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
+                                           !NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
+                                           NumTraits<typename unpacket_traits<Packet>::type>::IsSigned &&
+                                           NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
+    using Scalar = typename unpacket_traits<Packet>::type;
+    const Packet cst_one = pset1<Packet>(Scalar(1));
+    const Packet cst_minus_one = pset1<Packet>(Scalar(-1));
+    const Packet cst_zero = pzero(a);
+
+    const Packet positive_mask = pcmp_lt(cst_zero, a);
+    const Packet positive = pand(positive_mask, cst_one);
+    const Packet negative_mask = pcmp_lt(a, cst_zero);
+    const Packet negative = pand(negative_mask, cst_minus_one);
+
+    return por(positive, negative);
+  }
+};
+
+template <typename Packet>
+struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
+                                           !NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
+                                           !NumTraits<typename unpacket_traits<Packet>::type>::IsSigned &&
+                                           NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>> {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
+    using Scalar = typename unpacket_traits<Packet>::type;
+    const Packet cst_one = pset1<Packet>(Scalar(1));
+    const Packet cst_zero = pzero(a);
+
+    const Packet zero_mask = pcmp_eq(cst_zero, a);
+    return pandnot(cst_one, zero_mask);
+  }
+};
+
+// \internal \returns the the sign of a complex number z, defined as z / abs(z).
+template <typename Packet>
+struct psign_impl<Packet, std::enable_if_t<!is_scalar<Packet>::value &&
+                                           NumTraits<typename unpacket_traits<Packet>::type>::IsComplex &&
+                                           unpacket_traits<Packet>::vectorizable>> {
+  static EIGEN_DEVICE_FUNC inline Packet run(const Packet& a) {
+    typedef typename unpacket_traits<Packet>::type Scalar;
+    typedef typename Scalar::value_type RealScalar;
+    typedef typename unpacket_traits<Packet>::as_real RealPacket;
+
+    // Step 1. Compute (for each element z = x + i*y in a)
+    //     l = abs(z) = sqrt(x^2 + y^2).
+    // To avoid over- and underflow, we use the stable formula for each hypotenuse
+    //    l = (zmin == 0 ? zmax : zmax * sqrt(1 + (zmin/zmax)**2)),
+    // where zmax = max(|x|, |y|), zmin = min(|x|, |y|),
+    RealPacket a_abs = pabs(a.v);
+    RealPacket a_abs_flip = pcplxflip(Packet(a_abs)).v;
+    RealPacket a_max = pmax(a_abs, a_abs_flip);
+    RealPacket a_min = pmin(a_abs, a_abs_flip);
+    RealPacket a_min_zero_mask = pcmp_eq(a_min, pzero(a_min));
+    RealPacket a_max_zero_mask = pcmp_eq(a_max, pzero(a_max));
+    RealPacket r = pdiv(a_min, a_max);
+    const RealPacket cst_one = pset1<RealPacket>(RealScalar(1));
+    RealPacket l = pmul(a_max, psqrt(padd(cst_one, pmul(r, r))));  // [l0, l0, l1, l1]
+    // Set l to a_max if a_min is zero, since the roundtrip sqrt(a_max^2) may be
+    // lossy.
+    l = pselect(a_min_zero_mask, a_max, l);
+    // Step 2 compute a / abs(a).
+    RealPacket sign_as_real = pandnot(pdiv(a.v, l), a_max_zero_mask);
+    Packet sign;
+    sign.v = sign_as_real;
+    return sign;
+  }
+};
+
+// TODO(rmlarsen): The following set of utilities for double word arithmetic
+// should perhaps be refactored as a separate file, since it would be generally
+// useful for special function implementation etc. Writing the algorithms in
+// terms if a double word type would also make the code more readable.
+
+// This function splits x into the nearest integer n and fractional part r,
+// such that x = n + r holds exactly.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void absolute_split(const Packet& x, Packet& n, Packet& r) {
+  n = pround(x);
+  r = psub(x, n);
+}
+
+// This function computes the sum {s, r}, such that x + y = s_hi + s_lo
+// holds exactly, and s_hi = fl(x+y), if |x| >= |y|.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y, Packet& s_hi, Packet& s_lo) {
+  s_hi = padd(x, y);
+  const Packet t = psub(s_hi, x);
+  s_lo = psub(y, t);
+}
+
+#ifdef EIGEN_VECTORIZE_FMA
+// This function implements the extended precision product of
+// a pair of floating point numbers. Given {x, y}, it computes the pair
+// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
+// p_hi = fl(x * y).
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
+  p_hi = pmul(x, y);
+  p_lo = pmsub(x, y, p_hi);
+}
+
+// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
+// x * y = xy + p_lo holds exactly.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
+  return pmsub(x, y, xy);
+}
+
+#else
+
+// This function implements the Veltkamp splitting. Given a floating point
+// number x it returns the pair {x_hi, x_lo} such that x_hi + x_lo = x holds
+// exactly and that half of the significant of x fits in x_hi.
+// This is Algorithm 3 from Jean-Michel Muller, "Elementary Functions",
+// 3rd edition, Birkh\"auser, 2016.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void veltkamp_splitting(const Packet& x, Packet& x_hi, Packet& x_lo) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  constexpr int shift = (NumTraits<Scalar>::digits() + 1) / 2;
+  const Scalar shift_scale = Scalar(uint64_t(1) << shift);  // Scalar constructor not necessarily constexpr.
+  const Packet gamma = pmul(pset1<Packet>(shift_scale + Scalar(1)), x);
+  Packet rho = psub(x, gamma);
+  x_hi = padd(rho, gamma);
+  x_lo = psub(x, x_hi);
+}
+
+// This function implements Dekker's algorithm for products x * y.
+// Given floating point numbers {x, y} computes the pair
+// {p_hi, p_lo} such that x * y = p_hi + p_lo holds exactly and
+// p_hi = fl(x * y).
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x, const Packet& y, Packet& p_hi, Packet& p_lo) {
+  Packet x_hi, x_lo, y_hi, y_lo;
+  veltkamp_splitting(x, x_hi, x_lo);
+  veltkamp_splitting(y, y_hi, y_lo);
+
+  p_hi = pmul(x, y);
+  p_lo = pmadd(x_hi, y_hi, pnegate(p_hi));
+  p_lo = pmadd(x_hi, y_lo, p_lo);
+  p_lo = pmadd(x_lo, y_hi, p_lo);
+  p_lo = pmadd(x_lo, y_lo, p_lo);
+}
+
+// A version of twoprod that takes x, y, and fl(x*y) as input and returns the p_lo such that
+// x * y = xy + p_lo holds exactly.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet twoprod_low(const Packet& x, const Packet& y, const Packet& xy) {
+  Packet x_hi, x_lo, y_hi, y_lo;
+  veltkamp_splitting(x, x_hi, x_lo);
+  veltkamp_splitting(y, y_hi, y_lo);
+
+  Packet p_lo = pmadd(x_hi, y_hi, pnegate(xy));
+  p_lo = pmadd(x_hi, y_lo, p_lo);
+  p_lo = pmadd(x_lo, y_hi, p_lo);
+  p_lo = pmadd(x_lo, y_lo, p_lo);
+  return p_lo;
+}
+
+#endif  // EIGEN_VECTORIZE_FMA
+
+// This function implements Dekker's algorithm for the addition
+// of two double word numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
+// It returns the result as a pair {s_hi, s_lo} such that
+// x_hi + x_lo + y_hi + y_lo = s_hi + s_lo holds exactly.
+// This is Algorithm 5 from Jean-Michel Muller, "Elementary Functions",
+// 3rd edition, Birkh\"auser, 2016.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
+                                                  const Packet& y_lo, Packet& s_hi, Packet& s_lo) {
+  const Packet x_greater_mask = pcmp_lt(pabs(y_hi), pabs(x_hi));
+  Packet r_hi_1, r_lo_1;
+  fast_twosum(x_hi, y_hi, r_hi_1, r_lo_1);
+  Packet r_hi_2, r_lo_2;
+  fast_twosum(y_hi, x_hi, r_hi_2, r_lo_2);
+  const Packet r_hi = pselect(x_greater_mask, r_hi_1, r_hi_2);
+
+  const Packet s1 = padd(padd(y_lo, r_lo_1), x_lo);
+  const Packet s2 = padd(padd(x_lo, r_lo_2), y_lo);
+  const Packet s = pselect(x_greater_mask, s1, s2);
+
+  fast_twosum(r_hi, s, s_hi, s_lo);
+}
+
+// This is a version of twosum for double word numbers,
+// which assumes that |x_hi| >= |y_hi|.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
+                                                       const Packet& y_lo, Packet& s_hi, Packet& s_lo) {
+  Packet r_hi, r_lo;
+  fast_twosum(x_hi, y_hi, r_hi, r_lo);
+  const Packet s = padd(padd(y_lo, r_lo), x_lo);
+  fast_twosum(r_hi, s, s_hi, s_lo);
+}
+
+// This is a version of twosum for adding a floating point number x to
+// double word number {y_hi, y_lo} number, with the assumption
+// that |x| >= |y_hi|.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fast_twosum(const Packet& x, const Packet& y_hi, const Packet& y_lo,
+                                                       Packet& s_hi, Packet& s_lo) {
+  Packet r_hi, r_lo;
+  fast_twosum(x, y_hi, r_hi, r_lo);
+  const Packet s = padd(y_lo, r_lo);
+  fast_twosum(r_hi, s, s_hi, s_lo);
+}
+
+// This function implements the multiplication of a double word
+// number represented by {x_hi, x_lo} by a floating point number y.
+// It returns the result as a pair {p_hi, p_lo} such that
+// (x_hi + x_lo) * y = p_hi + p_lo hold with a relative error
+// of less than 2*2^{-2p}, where p is the number of significand bit
+// in the floating point type.
+// This is Algorithm 7 from Jean-Michel Muller, "Elementary Functions",
+// 3rd edition, Birkh\"auser, 2016.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y,
+                                                   Packet& p_hi, Packet& p_lo) {
+  Packet c_hi, c_lo1;
+  twoprod(x_hi, y, c_hi, c_lo1);
+  const Packet c_lo2 = pmul(x_lo, y);
+  Packet t_hi, t_lo1;
+  fast_twosum(c_hi, c_lo2, t_hi, t_lo1);
+  const Packet t_lo2 = padd(t_lo1, c_lo1);
+  fast_twosum(t_hi, t_lo2, p_hi, p_lo);
+}
+
+// This function implements the multiplication of two double word
+// numbers represented by {x_hi, x_lo} and {y_hi, y_lo}.
+// It returns the result as a pair {p_hi, p_lo} such that
+// (x_hi + x_lo) * (y_hi + y_lo) = p_hi + p_lo holds with a relative error
+// of less than 2*2^{-2p}, where p is the number of significand bit
+// in the floating point type.
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void twoprod(const Packet& x_hi, const Packet& x_lo, const Packet& y_hi,
+                                                   const Packet& y_lo, Packet& p_hi, Packet& p_lo) {
+  Packet p_hi_hi, p_hi_lo;
+  twoprod(x_hi, x_lo, y_hi, p_hi_hi, p_hi_lo);
+  Packet p_lo_hi, p_lo_lo;
+  twoprod(x_hi, x_lo, y_lo, p_lo_hi, p_lo_lo);
+  fast_twosum(p_hi_hi, p_hi_lo, p_lo_hi, p_lo_lo, p_hi, p_lo);
+}
+
+// This function implements the division of double word {x_hi, x_lo}
+// by float y. This is Algorithm 15 from "Tight and rigorous error bounds
+// for basic building blocks of double-word arithmetic", Joldes, Muller, & Popescu,
+// 2017. https://hal.archives-ouvertes.fr/hal-01351529
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void doubleword_div_fp(const Packet& x_hi, const Packet& x_lo, const Packet& y,
+                                                             Packet& z_hi, Packet& z_lo) {
+  const Packet t_hi = pdiv(x_hi, y);
+  Packet pi_hi, pi_lo;
+  twoprod(t_hi, y, pi_hi, pi_lo);
+  const Packet delta_hi = psub(x_hi, pi_hi);
+  const Packet delta_t = psub(delta_hi, pi_lo);
+  const Packet delta = padd(delta_t, x_lo);
+  const Packet t_lo = pdiv(delta, y);
+  fast_twosum(t_hi, t_lo, z_hi, z_lo);
+}
+
+// This function computes log2(x) and returns the result as a double word.
+template <typename Scalar>
+struct accurate_log2 {
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
+    log2_x_hi = plog2(x);
+    log2_x_lo = pzero(x);
+  }
+};
+
+// This specialization uses a more accurate algorithm to compute log2(x) for
+// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~6.56508e-10.
+// This additional accuracy is needed to counter the error-magnification
+// inherent in multiplying by a potentially large exponent in pow(x,y).
+// The minimax polynomial used was calculated using the Rminimax tool,
+// see https://gitlab.inria.fr/sfilip/rminimax.
+// Command line:
+//   $ ratapprox --function="log2(1+x)/x"  --dom='[-0.2929,0.41422]'
+//   --type=[10,0]
+//       --numF="[D,D,SG]" --denF="[SG]" --log --dispCoeff="dec"
+//
+// The resulting implementation of pow(x,y) is accurate to 3 ulps.
+template <>
+struct accurate_log2<float> {
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& z, Packet& log2_x_hi, Packet& log2_x_lo) {
+    // Split the two lowest order constant coefficient into double-word representation.
+    constexpr double kC0 = 1.442695041742110273474963832995854318141937255859375e+00;
+    constexpr float kC0_hi = static_cast<float>(kC0);
+    constexpr float kC0_lo = static_cast<float>(kC0 - static_cast<double>(kC0_hi));
+    const Packet c0_hi = pset1<Packet>(kC0_hi);
+    const Packet c0_lo = pset1<Packet>(kC0_lo);
+
+    constexpr double kC1 = -7.2134751588268664068692714863573201000690460205078125e-01;
+    constexpr float kC1_hi = static_cast<float>(kC1);
+    constexpr float kC1_lo = static_cast<float>(kC1 - static_cast<double>(kC1_hi));
+    const Packet c1_hi = pset1<Packet>(kC1_hi);
+    const Packet c1_lo = pset1<Packet>(kC1_lo);
+
+    constexpr float c[] = {
+        9.7010828554630279541015625e-02,  -1.6896486282348632812500000e-01, 1.7200836539268493652343750e-01,
+        -1.7892272770404815673828125e-01, 2.0505344867706298828125000e-01,  -2.4046677350997924804687500e-01,
+        2.8857553005218505859375000e-01,  -3.6067414283752441406250000e-01, 4.8089790344238281250000000e-01};
+
+    // Evaluate the higher order terms in the polynomial using
+    // standard arithmetic.
+    const Packet one = pset1<Packet>(1.0f);
+    const Packet x = psub(z, one);
+    Packet p = ppolevl<Packet, 8>::run(x, c);
+    // Evaluate the final two step in Horner's rule using double-word
+    // arithmetic.
+    Packet p_hi, p_lo;
+    twoprod(x, p, p_hi, p_lo);
+    fast_twosum(c1_hi, c1_lo, p_hi, p_lo, p_hi, p_lo);
+    twoprod(p_hi, p_lo, x, p_hi, p_lo);
+    fast_twosum(c0_hi, c0_lo, p_hi, p_lo, p_hi, p_lo);
+    // Multiply by x to recover log2(z).
+    twoprod(p_hi, p_lo, x, log2_x_hi, log2_x_lo);
+  }
+};
+
+// This specialization uses a more accurate algorithm to compute log2(x) for
+// floats in [1/sqrt(2);sqrt(2)] with a relative accuracy of ~1.27e-18.
+// This additional accuracy is needed to counter the error-magnification
+// inherent in multiplying by a potentially large exponent in pow(x,y).
+// The minimax polynomial used was calculated using the Sollya tool.
+// See sollya.org.
+
+template <>
+struct accurate_log2<double> {
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(const Packet& x, Packet& log2_x_hi, Packet& log2_x_lo) {
+    // We use a transformation of variables:
+    //    r = c * (x-1) / (x+1),
+    // such that
+    //    log2(x) = log2((1 + r/c) / (1 - r/c)) = f(r).
+    // The function f(r) can be approximated well using an odd polynomial
+    // of the form
+    //   P(r) = ((Q(r^2) * r^2 + C) * r^2 + 1) * r,
+    // For the implementation of log2<double> here, Q is of degree 6 with
+    // coefficient represented in working precision (double), while C is a
+    // constant represented in extra precision as a double word to achieve
+    // full accuracy.
+    //
+    // The polynomial coefficients were computed by the Sollya script:
+    //
+    // c = 2 / log(2);
+    // trans = c * (x-1)/(x+1);
+    // itrans = (1+x/c)/(1-x/c);
+    // interval=[trans(sqrt(0.5)); trans(sqrt(2))];
+    // print(interval);
+    // f = log2(itrans(x));
+    // p=fpminimax(f,[|1,3,5,7,9,11,13,15,17|],[|1,DD,double...|],interval,relative,floating);
+    const Packet q12 = pset1<Packet>(2.87074255468000586e-9);
+    const Packet q10 = pset1<Packet>(2.38957980901884082e-8);
+    const Packet q8 = pset1<Packet>(2.31032094540014656e-7);
+    const Packet q6 = pset1<Packet>(2.27279857398537278e-6);
+    const Packet q4 = pset1<Packet>(2.31271023278625638e-5);
+    const Packet q2 = pset1<Packet>(2.47556738444535513e-4);
+    const Packet q0 = pset1<Packet>(2.88543873228900172e-3);
+    const Packet C_hi = pset1<Packet>(0.0400377511598501157);
+    const Packet C_lo = pset1<Packet>(-4.77726582251425391e-19);
+    const Packet one = pset1<Packet>(1.0);
+
+    const Packet cst_2_log2e_hi = pset1<Packet>(2.88539008177792677);
+    const Packet cst_2_log2e_lo = pset1<Packet>(4.07660016854549667e-17);
+    // c * (x - 1)
+    Packet t_hi, t_lo;
+    // t = c * (x-1)
+    twoprod(cst_2_log2e_hi, cst_2_log2e_lo, psub(x, one), t_hi, t_lo);
+    // r = c * (x-1) / (x+1),
+    Packet r_hi, r_lo;
+    doubleword_div_fp(t_hi, t_lo, padd(x, one), r_hi, r_lo);
+
+    // r2 = r * r
+    Packet r2_hi, r2_lo;
+    twoprod(r_hi, r_lo, r_hi, r_lo, r2_hi, r2_lo);
+    // r4 = r2 * r2
+    Packet r4_hi, r4_lo;
+    twoprod(r2_hi, r2_lo, r2_hi, r2_lo, r4_hi, r4_lo);
+
+    // Evaluate Q(r^2) in working precision. We evaluate it in two parts
+    // (even and odd in r^2) to improve instruction level parallelism.
+    Packet q_even = pmadd(q12, r4_hi, q8);
+    Packet q_odd = pmadd(q10, r4_hi, q6);
+    q_even = pmadd(q_even, r4_hi, q4);
+    q_odd = pmadd(q_odd, r4_hi, q2);
+    q_even = pmadd(q_even, r4_hi, q0);
+    Packet q = pmadd(q_odd, r2_hi, q_even);
+
+    // Now evaluate the low order terms of P(x) in double word precision.
+    // In the following, due to the increasing magnitude of the coefficients
+    // and r being constrained to [-0.5, 0.5] we can use fast_twosum instead
+    // of the slower twosum.
+    // Q(r^2) * r^2
+    Packet p_hi, p_lo;
+    twoprod(r2_hi, r2_lo, q, p_hi, p_lo);
+    // Q(r^2) * r^2 + C
+    Packet p1_hi, p1_lo;
+    fast_twosum(C_hi, C_lo, p_hi, p_lo, p1_hi, p1_lo);
+    // (Q(r^2) * r^2 + C) * r^2
+    Packet p2_hi, p2_lo;
+    twoprod(r2_hi, r2_lo, p1_hi, p1_lo, p2_hi, p2_lo);
+    // ((Q(r^2) * r^2 + C) * r^2 + 1)
+    Packet p3_hi, p3_lo;
+    fast_twosum(one, p2_hi, p2_lo, p3_hi, p3_lo);
+
+    // log(z) ~= ((Q(r^2) * r^2 + C) * r^2 + 1) * r
+    twoprod(p3_hi, p3_lo, r_hi, r_lo, log2_x_hi, log2_x_lo);
+  }
+};
+
+// This function implements the non-trivial case of pow(x,y) where x is
+// positive and y is (possibly) non-integer.
+// Formally, pow(x,y) = exp2(y * log2(x)), where exp2(x) is shorthand for 2^x.
+// TODO(rmlarsen): We should probably add this as a packet up 'ppow', to make it
+// easier to specialize or turn off for specific types and/or backends.x
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_pow_impl(const Packet& x, const Packet& y) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  // Split x into exponent e_x and mantissa m_x.
+  Packet e_x;
+  Packet m_x = pfrexp(x, e_x);
+
+  // Adjust m_x to lie in [1/sqrt(2):sqrt(2)] to minimize absolute error in log2(m_x).
+  constexpr Scalar sqrt_half = Scalar(0.70710678118654752440);
+  const Packet m_x_scale_mask = pcmp_lt(m_x, pset1<Packet>(sqrt_half));
+  m_x = pselect(m_x_scale_mask, pmul(pset1<Packet>(Scalar(2)), m_x), m_x);
+  e_x = pselect(m_x_scale_mask, psub(e_x, pset1<Packet>(Scalar(1))), e_x);
+
+  // Compute log2(m_x) with 6 extra bits of accuracy.
+  Packet rx_hi, rx_lo;
+  accurate_log2<Scalar>()(m_x, rx_hi, rx_lo);
+
+  // Compute the two terms {y * e_x, y * r_x} in f = y * log2(x) with doubled
+  // precision using double word arithmetic.
+  Packet f1_hi, f1_lo, f2_hi, f2_lo;
+  twoprod(e_x, y, f1_hi, f1_lo);
+  twoprod(rx_hi, rx_lo, y, f2_hi, f2_lo);
+  // Sum the two terms in f using double word arithmetic. We know
+  // that |e_x| > |log2(m_x)|, except for the case where e_x==0.
+  // This means that we can use fast_twosum(f1,f2).
+  // In the case e_x == 0, e_x * y = f1 = 0, so we don't lose any
+  // accuracy by violating the assumption of fast_twosum, because
+  // it's a no-op.
+  Packet f_hi, f_lo;
+  fast_twosum(f1_hi, f1_lo, f2_hi, f2_lo, f_hi, f_lo);
+
+  // Split f into integer and fractional parts.
+  Packet n_z, r_z;
+  absolute_split(f_hi, n_z, r_z);
+  r_z = padd(r_z, f_lo);
+  Packet n_r;
+  absolute_split(r_z, n_r, r_z);
+  n_z = padd(n_z, n_r);
+
+  // We now have an accurate split of f = n_z + r_z and can compute
+  //   x^y = 2**{n_z + r_z) = exp2(r_z) * 2**{n_z}.
+  // Multiplication by the second factor can be done exactly using pldexp(), since
+  // it is an integer power of 2.
+  const Packet e_r = generic_exp2(r_z);
+
+  // Since we know that e_r is in [1/sqrt(2); sqrt(2)], we can use the fast version
+  // of pldexp to multiply by 2**{n_z} when |n_z| is sufficiently small.
+  constexpr Scalar kPldExpThresh = std::numeric_limits<Scalar>::max_exponent - 2;
+  const Packet pldexp_fast_unsafe = pcmp_lt(pset1<Packet>(kPldExpThresh), pabs(n_z));
+  if (predux_any(pldexp_fast_unsafe)) {
+    return pldexp(e_r, n_z);
+  }
+  return pldexp_fast(e_r, n_z);
+}
+
+// Generic implementation of pow(x,y).
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS std::enable_if_t<!is_scalar<Packet>::value, Packet> generic_pow(
+    const Packet& x, const Packet& y) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+
+  const Packet cst_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
+  const Packet cst_zero = pset1<Packet>(Scalar(0));
+  const Packet cst_one = pset1<Packet>(Scalar(1));
+  const Packet cst_nan = pset1<Packet>(NumTraits<Scalar>::quiet_NaN());
+
+  const Packet x_abs = pabs(x);
+  Packet pow = generic_pow_impl(x_abs, y);
+
+  // In the following we enforce the special case handling prescribed in
+  // https://en.cppreference.com/w/cpp/numeric/math/pow.
+
+  // Predicates for sign and magnitude of x.
+  const Packet x_is_negative = pcmp_lt(x, cst_zero);
+  const Packet x_is_zero = pcmp_eq(x, cst_zero);
+  const Packet x_is_one = pcmp_eq(x, cst_one);
+  const Packet x_has_signbit = psignbit(x);
+  const Packet x_abs_gt_one = pcmp_lt(cst_one, x_abs);
+  const Packet x_abs_is_inf = pcmp_eq(x_abs, cst_inf);
+
+  // Predicates for sign and magnitude of y.
+  const Packet y_abs = pabs(y);
+  const Packet y_abs_is_inf = pcmp_eq(y_abs, cst_inf);
+  const Packet y_is_negative = pcmp_lt(y, cst_zero);
+  const Packet y_is_zero = pcmp_eq(y, cst_zero);
+  const Packet y_is_one = pcmp_eq(y, cst_one);
+  // Predicates for whether y is integer and odd/even.
+  const Packet y_is_int = pandnot(pcmp_eq(pfloor(y), y), y_abs_is_inf);
+  const Packet y_div_2 = pmul(y, pset1<Packet>(Scalar(0.5)));
+  const Packet y_is_even = pcmp_eq(pround(y_div_2), y_div_2);
+  const Packet y_is_odd_int = pandnot(y_is_int, y_is_even);
+  // Smallest exponent for which (1 + epsilon) overflows to infinity.
+  constexpr Scalar huge_exponent =
+      (NumTraits<Scalar>::max_exponent() * Scalar(EIGEN_LN2)) / NumTraits<Scalar>::epsilon();
+  const Packet y_abs_is_huge = pcmp_le(pset1<Packet>(huge_exponent), y_abs);
+
+  // *  pow(base, exp) returns NaN if base is finite and negative
+  //    and exp is finite and non-integer.
+  pow = pselect(pandnot(x_is_negative, y_is_int), cst_nan, pow);
+
+  // * pow(±0, exp), where exp is negative, finite, and is an even integer or
+  // a non-integer, returns +∞
+  // * pow(±0, exp), where exp is positive non-integer or a positive even
+  // integer, returns +0
+  // * pow(+0, exp), where exp is a negative odd integer, returns +∞
+  // * pow(-0, exp), where exp is a negative odd integer, returns -∞
+  // * pow(+0, exp), where exp is a positive odd integer, returns +0
+  // * pow(-0, exp), where exp is a positive odd integer, returns -0
+  // Sign is flipped by the rule below.
+  pow = pselect(x_is_zero, pselect(y_is_negative, cst_inf, cst_zero), pow);
+
+  // pow(base, exp) returns -pow(abs(base), exp) if base has the sign bit set,
+  // and exp is an odd integer exponent.
+  pow = pselect(pand(x_has_signbit, y_is_odd_int), pnegate(pow), pow);
+
+  // * pow(base, -∞) returns +∞ for any |base|<1
+  // * pow(base, -∞) returns +0 for any |base|>1
+  // * pow(base, +∞) returns +0 for any |base|<1
+  // * pow(base, +∞) returns +∞ for any |base|>1
+  // * pow(±0, -∞) returns +∞
+  // * pow(-1, +-∞) = 1
+  Packet inf_y_val = pselect(por(pand(y_is_negative, x_is_zero), pxor(y_is_negative, x_abs_gt_one)), cst_inf, cst_zero);
+  inf_y_val = pselect(pcmp_eq(x, pset1<Packet>(Scalar(-1.0))), cst_one, inf_y_val);
+  pow = pselect(y_abs_is_huge, inf_y_val, pow);
+
+  // * pow(+∞, exp) returns +0 for any negative exp
+  // * pow(+∞, exp) returns +∞ for any positive exp
+  // * pow(-∞, exp) returns -0 if exp is a negative odd integer.
+  // * pow(-∞, exp) returns +0 if exp is a negative non-integer or negative
+  //     even integer.
+  // * pow(-∞, exp) returns -∞ if exp is a positive odd integer.
+  // * pow(-∞, exp) returns +∞ if exp is a positive non-integer or positive
+  //     even integer.
+  auto x_pos_inf_value = pselect(y_is_negative, cst_zero, cst_inf);
+  auto x_neg_inf_value = pselect(y_is_odd_int, pnegate(x_pos_inf_value), x_pos_inf_value);
+  pow = pselect(x_abs_is_inf, pselect(x_is_negative, x_neg_inf_value, x_pos_inf_value), pow);
+
+  // All cases of NaN inputs return NaN, except the two below.
+  pow = pselect(por(pisnan(x), pisnan(y)), cst_nan, pow);
+
+  // * pow(base, 1) returns base.
+  // * pow(base, +/-0) returns 1, regardless of base, even NaN.
+  // * pow(+1, exp) returns 1, regardless of exponent, even NaN.
+  pow = pselect(y_is_one, x, pselect(por(x_is_one, y_is_zero), cst_one, pow));
+
+  return pow;
+}
+
+template <typename Scalar>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS std::enable_if_t<is_scalar<Scalar>::value, Scalar> generic_pow(
+    const Scalar& x, const Scalar& y) {
+  return numext::pow(x, y);
+}
+
+namespace unary_pow {
+
+template <typename ScalarExponent, bool IsInteger = NumTraits<ScalarExponent>::IsInteger>
+struct exponent_helper {
+  using safe_abs_type = ScalarExponent;
+  static constexpr ScalarExponent one_half = ScalarExponent(0.5);
+  // these routines assume that exp is an integer stored as a floating point type
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarExponent safe_abs(const ScalarExponent& exp) {
+    return numext::abs(exp);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool is_odd(const ScalarExponent& exp) {
+    eigen_assert(((numext::isfinite)(exp) && exp == numext::floor(exp)) && "exp must be an integer");
+    ScalarExponent exp_div_2 = exp * one_half;
+    ScalarExponent floor_exp_div_2 = numext::floor(exp_div_2);
+    return exp_div_2 != floor_exp_div_2;
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarExponent floor_div_two(const ScalarExponent& exp) {
+    ScalarExponent exp_div_2 = exp * one_half;
+    return numext::floor(exp_div_2);
+  }
+};
+
+template <typename ScalarExponent>
+struct exponent_helper<ScalarExponent, true> {
+  // if `exp` is a signed integer type, cast it to its unsigned counterpart to safely store its absolute value
+  // consider the (rare) case where `exp` is an int32_t: abs(-2147483648) != 2147483648
+  using safe_abs_type = typename numext::get_integer_by_size<sizeof(ScalarExponent)>::unsigned_type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE safe_abs_type safe_abs(const ScalarExponent& exp) {
+    ScalarExponent mask = numext::signbit(exp);
+    safe_abs_type result = safe_abs_type(exp ^ mask);
+    return result + safe_abs_type(ScalarExponent(1) & mask);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool is_odd(const safe_abs_type& exp) {
+    return exp % safe_abs_type(2) != safe_abs_type(0);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE safe_abs_type floor_div_two(const safe_abs_type& exp) {
+    return exp >> safe_abs_type(1);
+  }
+};
+
+template <typename Packet, typename ScalarExponent,
+          bool ReciprocateIfExponentIsNegative =
+              !NumTraits<typename unpacket_traits<Packet>::type>::IsInteger && NumTraits<ScalarExponent>::IsSigned>
+struct reciprocate {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
+    using Scalar = typename unpacket_traits<Packet>::type;
+    const Packet cst_pos_one = pset1<Packet>(Scalar(1));
+    return exponent < 0 ? pdiv(cst_pos_one, x) : x;
+  }
+};
+
+template <typename Packet, typename ScalarExponent>
+struct reciprocate<Packet, ScalarExponent, false> {
+  // pdiv not defined, nor necessary for integer base types
+  // if the exponent is unsigned, then the exponent cannot be negative
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent&) { return x; }
+};
+
+template <typename Packet, typename ScalarExponent>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet int_pow(const Packet& x, const ScalarExponent& exponent) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  using ExponentHelper = exponent_helper<ScalarExponent>;
+  using AbsExponentType = typename ExponentHelper::safe_abs_type;
+  const Packet cst_pos_one = pset1<Packet>(Scalar(1));
+  if (exponent == ScalarExponent(0)) return cst_pos_one;
+
+  Packet result = reciprocate<Packet, ScalarExponent>::run(x, exponent);
+  Packet y = cst_pos_one;
+  AbsExponentType m = ExponentHelper::safe_abs(exponent);
+
+  while (m > 1) {
+    bool odd = ExponentHelper::is_odd(m);
+    if (odd) y = pmul(y, result);
+    result = pmul(result, result);
+    m = ExponentHelper::floor_div_two(m);
+  }
+
+  return pmul(y, result);
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!is_scalar<Packet>::value, Packet> gen_pow(
+    const Packet& x, const typename unpacket_traits<Packet>::type& exponent) {
+  const Packet exponent_packet = pset1<Packet>(exponent);
+  return generic_pow_impl(x, exponent_packet);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<is_scalar<Scalar>::value, Scalar> gen_pow(
+    const Scalar& x, const Scalar& exponent) {
+  return numext::pow(x, exponent);
+}
+
+template <typename Packet, typename ScalarExponent>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_nonint_nonint_errors(const Packet& x, const Packet& powx,
+                                                                         const ScalarExponent& exponent) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+
+  // non-integer base and exponent case
+  const Packet cst_pos_zero = pzero(x);
+  const Packet cst_pos_one = pset1<Packet>(Scalar(1));
+  const Packet cst_pos_inf = pset1<Packet>(NumTraits<Scalar>::infinity());
+  const Packet cst_true = ptrue<Packet>(x);
+
+  const bool exponent_is_not_fin = !(numext::isfinite)(exponent);
+  const bool exponent_is_neg = exponent < ScalarExponent(0);
+  const bool exponent_is_pos = exponent > ScalarExponent(0);
+
+  const Packet exp_is_not_fin = exponent_is_not_fin ? cst_true : cst_pos_zero;
+  const Packet exp_is_neg = exponent_is_neg ? cst_true : cst_pos_zero;
+  const Packet exp_is_pos = exponent_is_pos ? cst_true : cst_pos_zero;
+  const Packet exp_is_inf = pand(exp_is_not_fin, por(exp_is_neg, exp_is_pos));
+  const Packet exp_is_nan = pandnot(exp_is_not_fin, por(exp_is_neg, exp_is_pos));
+
+  const Packet x_is_le_zero = pcmp_le(x, cst_pos_zero);
+  const Packet x_is_ge_zero = pcmp_le(cst_pos_zero, x);
+  const Packet x_is_zero = pand(x_is_le_zero, x_is_ge_zero);
+
+  const Packet abs_x = pabs(x);
+  const Packet abs_x_is_le_one = pcmp_le(abs_x, cst_pos_one);
+  const Packet abs_x_is_ge_one = pcmp_le(cst_pos_one, abs_x);
+  const Packet abs_x_is_inf = pcmp_eq(abs_x, cst_pos_inf);
+  const Packet abs_x_is_one = pand(abs_x_is_le_one, abs_x_is_ge_one);
+
+  Packet pow_is_inf_if_exp_is_neg = por(x_is_zero, pand(abs_x_is_le_one, exp_is_inf));
+  Packet pow_is_inf_if_exp_is_pos = por(abs_x_is_inf, pand(abs_x_is_ge_one, exp_is_inf));
+  Packet pow_is_one = pand(abs_x_is_one, por(exp_is_inf, x_is_ge_zero));
+
+  Packet result = powx;
+  result = por(x_is_le_zero, result);
+  result = pselect(pow_is_inf_if_exp_is_neg, pand(cst_pos_inf, exp_is_neg), result);
+  result = pselect(pow_is_inf_if_exp_is_pos, pand(cst_pos_inf, exp_is_pos), result);
+  result = por(exp_is_nan, result);
+  result = pselect(pow_is_one, cst_pos_one, result);
+  return result;
+}
+
+template <typename Packet, typename ScalarExponent,
+          std::enable_if_t<NumTraits<typename unpacket_traits<Packet>::type>::IsSigned, bool> = true>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_negative_exponent(const Packet& x, const ScalarExponent& exponent) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+
+  // signed integer base, signed integer exponent case
+
+  // This routine handles negative exponents.
+  // The return value is either 0, 1, or -1.
+  const Packet cst_pos_one = pset1<Packet>(Scalar(1));
+  const bool exponent_is_odd = exponent % ScalarExponent(2) != ScalarExponent(0);
+  const Packet exp_is_odd = exponent_is_odd ? ptrue<Packet>(x) : pzero<Packet>(x);
+
+  const Packet abs_x = pabs(x);
+  const Packet abs_x_is_one = pcmp_eq(abs_x, cst_pos_one);
+
+  Packet result = pselect(exp_is_odd, x, abs_x);
+  result = pselect(abs_x_is_one, result, pzero<Packet>(x));
+  return result;
+}
+
+template <typename Packet, typename ScalarExponent,
+          std::enable_if_t<!NumTraits<typename unpacket_traits<Packet>::type>::IsSigned, bool> = true>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet handle_negative_exponent(const Packet& x, const ScalarExponent&) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+
+  // unsigned integer base, signed integer exponent case
+
+  // This routine handles negative exponents.
+  // The return value is either 0 or 1
+
+  const Scalar pos_one = Scalar(1);
+
+  const Packet cst_pos_one = pset1<Packet>(pos_one);
+
+  const Packet x_is_one = pcmp_eq(x, cst_pos_one);
+
+  return pand(x_is_one, x);
+}
+
+}  // end namespace unary_pow
+
+template <typename Packet, typename ScalarExponent,
+          bool BaseIsIntegerType = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger,
+          bool ExponentIsIntegerType = NumTraits<ScalarExponent>::IsInteger,
+          bool ExponentIsSigned = NumTraits<ScalarExponent>::IsSigned>
+struct unary_pow_impl;
+
+template <typename Packet, typename ScalarExponent, bool ExponentIsSigned>
+struct unary_pow_impl<Packet, ScalarExponent, false, false, ExponentIsSigned> {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
+    const bool exponent_is_integer = (numext::isfinite)(exponent) && numext::round(exponent) == exponent;
+    if (exponent_is_integer) {
+      // The simple recursive doubling implementation is only accurate to 3 ulps
+      // for integer exponents in [-3:7]. Since this is a common case, we
+      // specialize it here.
+      bool use_repeated_squaring =
+          (exponent <= ScalarExponent(7) && (!ExponentIsSigned || exponent >= ScalarExponent(-3)));
+      return use_repeated_squaring ? unary_pow::int_pow(x, exponent) : generic_pow(x, pset1<Packet>(exponent));
+    } else {
+      Packet result = unary_pow::gen_pow(x, exponent);
+      result = unary_pow::handle_nonint_nonint_errors(x, result, exponent);
+      return result;
+    }
+  }
+};
+
+template <typename Packet, typename ScalarExponent, bool ExponentIsSigned>
+struct unary_pow_impl<Packet, ScalarExponent, false, true, ExponentIsSigned> {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
+    return unary_pow::int_pow(x, exponent);
+  }
+};
+
+template <typename Packet, typename ScalarExponent>
+struct unary_pow_impl<Packet, ScalarExponent, true, true, true> {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
+    if (exponent < ScalarExponent(0)) {
+      return unary_pow::handle_negative_exponent(x, exponent);
+    } else {
+      return unary_pow::int_pow(x, exponent);
+    }
+  }
+};
+
+template <typename Packet, typename ScalarExponent>
+struct unary_pow_impl<Packet, ScalarExponent, true, true, false> {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run(const Packet& x, const ScalarExponent& exponent) {
+    return unary_pow::int_pow(x, exponent);
+  }
+};
+
+// This function computes exp2(x) = exp(ln(2) * x).
+// To improve accuracy, the product ln(2)*x is computed using the twoprod
+// algorithm, such that ln(2) * x = p_hi + p_lo holds exactly. Then exp2(x) is
+// computed as exp2(x) = exp(p_hi) * exp(p_lo) ~= exp(p_hi) * (1 + p_lo). This
+// correction step this reduces the maximum absolute error as follows:
+//
+// type   | max error (simple product) | max error (twoprod) |
+// -----------------------------------------------------------
+// float  |       35 ulps              |       4 ulps        |
+// double |      363 ulps              |     110 ulps        |
+//
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_exp2(const Packet& _x) {
+  typedef typename unpacket_traits<Packet>::type Scalar;
+  constexpr int max_exponent = std::numeric_limits<Scalar>::max_exponent;
+  constexpr int digits = std::numeric_limits<Scalar>::digits;
+  constexpr Scalar max_cap = Scalar(max_exponent + 1);
+  constexpr Scalar min_cap = -Scalar(max_exponent + digits - 1);
+  Packet x = pmax(pmin(_x, pset1<Packet>(max_cap)), pset1<Packet>(min_cap));
+  Packet p_hi, p_lo;
+  twoprod(pset1<Packet>(Scalar(EIGEN_LN2)), x, p_hi, p_lo);
+  Packet exp2_hi = pexp(p_hi);
+  Packet exp2_lo = padd(pset1<Packet>(Scalar(1)), p_lo);
+  return pmul(exp2_hi, exp2_lo);
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_rint(const Packet& a) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  using IntType = typename numext::get_integer_by_size<sizeof(Scalar)>::signed_type;
+  // Adds and subtracts signum(a) * 2^kMantissaBits to force rounding.
+  const IntType kLimit = IntType(1) << (NumTraits<Scalar>::digits() - 1);
+  const Packet cst_limit = pset1<Packet>(static_cast<Scalar>(kLimit));
+  Packet abs_a = pabs(a);
+  Packet sign_a = pandnot(a, abs_a);
+  Packet rint_a = padd(abs_a, cst_limit);
+  // Don't compile-away addition and subtraction.
+  EIGEN_OPTIMIZATION_BARRIER(rint_a);
+  rint_a = psub(rint_a, cst_limit);
+  rint_a = por(rint_a, sign_a);
+  // If greater than limit (or NaN), simply return a.
+  Packet mask = pcmp_lt(abs_a, cst_limit);
+  Packet result = pselect(mask, rint_a, a);
+  return result;
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_floor(const Packet& a) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  const Packet cst_1 = pset1<Packet>(Scalar(1));
+  Packet rint_a = generic_rint(a);
+  // if a < rint(a), then rint(a) == ceil(a)
+  Packet mask = pcmp_lt(a, rint_a);
+  Packet offset = pand(cst_1, mask);
+  Packet result = psub(rint_a, offset);
+  return result;
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_ceil(const Packet& a) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  const Packet cst_1 = pset1<Packet>(Scalar(1));
+  const Packet sign_mask = pset1<Packet>(static_cast<Scalar>(-0.0));
+  Packet rint_a = generic_rint(a);
+  // if rint(a) < a, then rint(a) == floor(a)
+  Packet mask = pcmp_lt(rint_a, a);
+  Packet offset = pand(cst_1, mask);
+  Packet result = padd(rint_a, offset);
+  // Signed zero must remain signed (e.g. ceil(-0.02) == -0).
+  result = por(result, pand(sign_mask, a));
+  return result;
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_trunc(const Packet& a) {
+  Packet abs_a = pabs(a);
+  Packet sign_a = pandnot(a, abs_a);
+  Packet floor_abs_a = generic_floor(abs_a);
+  Packet result = por(floor_abs_a, sign_a);
+  return result;
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_round(const Packet& a) {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  const Packet cst_half = pset1<Packet>(Scalar(0.5));
+  const Packet cst_1 = pset1<Packet>(Scalar(1));
+  Packet abs_a = pabs(a);
+  Packet sign_a = pandnot(a, abs_a);
+  Packet floor_abs_a = generic_floor(abs_a);
+  Packet diff = psub(abs_a, floor_abs_a);
+  Packet mask = pcmp_le(cst_half, diff);
+  Packet offset = pand(cst_1, mask);
+  Packet result = padd(floor_abs_a, offset);
+  result = por(result, sign_a);
+  return result;
+}
+
+template <typename Packet>
+struct nearest_integer_packetop_impl<Packet, /*IsScalar*/ false, /*IsInteger*/ false> {
+  using Scalar = typename unpacket_traits<Packet>::type;
+  static_assert(packet_traits<Scalar>::HasRound, "Generic nearest integer functions are disabled for this type.");
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_floor(const Packet& x) { return generic_floor(x); }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_ceil(const Packet& x) { return generic_ceil(x); }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_rint(const Packet& x) { return generic_rint(x); }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_round(const Packet& x) { return generic_round(x); }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_trunc(const Packet& x) { return generic_trunc(x); }
+};
+
+template <typename Packet>
+struct nearest_integer_packetop_impl<Packet, /*IsScalar*/ false, /*IsInteger*/ true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_floor(const Packet& x) { return x; }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_ceil(const Packet& x) { return x; }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_rint(const Packet& x) { return x; }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_round(const Packet& x) { return x; }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet run_trunc(const Packet& x) { return x; }
+};
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_H
diff --git a/inst/include/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h b/inst/include/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
new file mode 100644
index 00000000..673954e9
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/Default/GenericPacketMathFunctionsFwd.h
@@ -0,0 +1,227 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2019 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H
+#define EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+// Forward declarations of the generic math functions
+// implemented in GenericPacketMathFunctions.h
+// This is needed to workaround a circular dependency.
+
+/***************************************************************************
+ * Some generic implementations to be used by implementers
+ ***************************************************************************/
+
+/** Default implementation of pfrexp.
+ * It is expected to be called by implementers of template<> pfrexp.
+ */
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic(const Packet& a, Packet& exponent);
+
+// Extracts the biased exponent value from Packet p, and casts the results to
+// a floating-point Packet type. Used by pfrexp_generic. Override this if
+// there is no unpacket_traits<Packet>::integer_packet.
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pfrexp_generic_get_biased_exponent(const Packet& p);
+
+/** Default implementation of pldexp.
+ * It is expected to be called by implementers of template<> pldexp.
+ */
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_generic(const Packet& a, const Packet& exponent);
+
+// Explicitly multiplies
+//    a * (2^e)
+// clamping e to the range
+// [NumTraits<Scalar>::min_exponent()-2, NumTraits<Scalar>::max_exponent()]
+//
+// This is approx 7x faster than pldexp_impl, but will prematurely over/underflow
+// if 2^e doesn't fit into a normal floating-point Scalar.
+//
+// Assumes IEEE floating point format
+template <typename Packet>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet pldexp_fast(const Packet& a, const Packet& exponent);
+
+/** \internal \returns cbrt(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_float(const Packet& x_in);
+
+/** \internal \returns cbrt(x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcbrt_double(const Packet& x_in);
+
+/** \internal \returns log(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_float(const Packet _x);
+
+/** \internal \returns log2(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_float(const Packet _x);
+
+/** \internal \returns log(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_double(const Packet _x);
+
+/** \internal \returns log2(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog2_double(const Packet _x);
+
+/** \internal \returns log(1 + x) */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_log1p(const Packet& x);
+
+/** \internal \returns exp(x)-1 */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_expm1(const Packet& x);
+
+/** \internal \returns atan(x) */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_atan(const Packet& x);
+
+/** \internal \returns exp2(x) */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet generic_exp2(const Packet& x);
+
+/** \internal \returns exp(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_float(const Packet _x);
+
+/** \internal \returns exp(x) for double precision real numbers */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_double(const Packet _x);
+
+/** \internal \returns sin(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_float(const Packet& x);
+
+/** \internal \returns cos(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_float(const Packet& x);
+
+/** \internal \returns sin(x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psin_double(const Packet& x);
+
+/** \internal \returns cos(x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pcos_double(const Packet& x);
+
+/** \internal \returns asin(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pasin_float(const Packet& x);
+
+/** \internal \returns acos(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pacos_float(const Packet& x);
+
+/** \internal \returns tanh(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptanh_float(const Packet& x);
+
+/** \internal \returns tanh(x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ptanh_double(const Packet& x);
+
+/** \internal \returns atanh(x) for single precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_float(const Packet& x);
+
+/** \internal \returns atanh(x) for double precision float */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet patanh_double(const Packet& x);
+
+/** \internal \returns sqrt(x) for complex types */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet psqrt_complex(const Packet& a);
+
+/** \internal \returns x / y for complex types */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdiv_complex(const Packet& x, const Packet& y);
+
+template <typename Packet, int N>
+struct ppolevl;
+
+/** \internal \returns log(x) for complex types */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plog_complex(const Packet& x);
+
+/** \internal \returns exp(x) for complex types */
+template <typename Packet>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pexp_complex(const Packet& x);
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_rint(const Packet& a);
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_floor(const Packet& a);
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_ceil(const Packet& a);
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_trunc(const Packet& a);
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet generic_round(const Packet& a);
+
+// Macros for instantiating these generic functions for different backends.
+#define EIGEN_PACKET_FUNCTION(METHOD, SCALAR, PACKET)                                             \
+  template <>                                                                                     \
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED PACKET p##METHOD<PACKET>(const PACKET& _x) { \
+    return p##METHOD##_##SCALAR(_x);                                                              \
+  }
+
+// Macros for instantiating these generic functions for different backends.
+#define EIGEN_GENERIC_PACKET_FUNCTION(METHOD, PACKET)                                             \
+  template <>                                                                                     \
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC EIGEN_UNUSED PACKET p##METHOD<PACKET>(const PACKET& _x) { \
+    return generic_##METHOD(_x);                                                                  \
+  }
+
+#define EIGEN_FLOAT_PACKET_FUNCTION(METHOD, PACKET) EIGEN_PACKET_FUNCTION(METHOD, float, PACKET)
+#define EIGEN_DOUBLE_PACKET_FUNCTION(METHOD, PACKET) EIGEN_PACKET_FUNCTION(METHOD, double, PACKET)
+
+#define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(PACKET) \
+  EIGEN_FLOAT_PACKET_FUNCTION(sin, PACKET)                 \
+  EIGEN_FLOAT_PACKET_FUNCTION(cos, PACKET)                 \
+  EIGEN_FLOAT_PACKET_FUNCTION(asin, PACKET)                \
+  EIGEN_FLOAT_PACKET_FUNCTION(acos, PACKET)                \
+  EIGEN_FLOAT_PACKET_FUNCTION(tanh, PACKET)                \
+  EIGEN_FLOAT_PACKET_FUNCTION(atanh, PACKET)               \
+  EIGEN_FLOAT_PACKET_FUNCTION(log, PACKET)                 \
+  EIGEN_FLOAT_PACKET_FUNCTION(log2, PACKET)                \
+  EIGEN_FLOAT_PACKET_FUNCTION(exp, PACKET)                 \
+  EIGEN_FLOAT_PACKET_FUNCTION(cbrt, PACKET)                \
+  EIGEN_GENERIC_PACKET_FUNCTION(expm1, PACKET)             \
+  EIGEN_GENERIC_PACKET_FUNCTION(exp2, PACKET)              \
+  EIGEN_GENERIC_PACKET_FUNCTION(log1p, PACKET)             \
+  EIGEN_GENERIC_PACKET_FUNCTION(atan, PACKET)
+
+#define EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(PACKET) \
+  EIGEN_DOUBLE_PACKET_FUNCTION(atanh, PACKET)               \
+  EIGEN_DOUBLE_PACKET_FUNCTION(log, PACKET)                 \
+  EIGEN_DOUBLE_PACKET_FUNCTION(sin, PACKET)                 \
+  EIGEN_DOUBLE_PACKET_FUNCTION(cos, PACKET)                 \
+  EIGEN_DOUBLE_PACKET_FUNCTION(log2, PACKET)                \
+  EIGEN_DOUBLE_PACKET_FUNCTION(exp, PACKET)                 \
+  EIGEN_DOUBLE_PACKET_FUNCTION(tanh, PACKET)                \
+  EIGEN_DOUBLE_PACKET_FUNCTION(cbrt, PACKET)                \
+  EIGEN_GENERIC_PACKET_FUNCTION(atan, PACKET)               \
+  EIGEN_GENERIC_PACKET_FUNCTION(exp2, PACKET)
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_ARCH_GENERIC_PACKET_MATH_FUNCTIONS_FWD_H
diff --git a/inst/include/Eigen/src/Core/arch/Default/Half.h b/inst/include/Eigen/src/Core/arch/Default/Half.h
new file mode 100644
index 00000000..210dfff1
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/Default/Half.h
@@ -0,0 +1,1091 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+// The conversion routines are Copyright (c) Fabian Giesen, 2016.
+// The original license follows:
+//
+// Copyright (c) Fabian Giesen, 2016
+// All rights reserved.
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted.
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Standard 16-bit float type, mostly useful for GPUs. Defines a new
+// type Eigen::half (inheriting either from CUDA's or HIP's __half struct) with
+// operator overloads such that it behaves basically as an arithmetic
+// type. It will be quite slow on CPUs (so it is recommended to stay
+// in fp32 for CPUs, except for simple parameter conversions, I/O
+// to disk and the likes), but fast on GPUs.
+
+#ifndef EIGEN_HALF_H
+#define EIGEN_HALF_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+// When compiling with GPU support, the "__half_raw" base class as well as
+// some other routines are defined in the GPU compiler header files
+// (cuda_fp16.h, hip_fp16.h), and they are not tagged constexpr
+// As a consequence, we get compile failures when compiling Eigen with
+// GPU support. Hence the need to disable EIGEN_CONSTEXPR when building
+// Eigen with GPU support.
+// Any functions that require `numext::bit_cast` may also not be constexpr,
+// including any native types when setting via raw bit values.
+#if defined(EIGEN_HAS_GPU_FP16) || defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
+#define _EIGEN_MAYBE_CONSTEXPR
+#else
+#define _EIGEN_MAYBE_CONSTEXPR constexpr
+#endif
+
+#define F16_PACKET_FUNCTION(PACKET_F, PACKET_F16, METHOD)                                                  \
+  template <>                                                                                              \
+  EIGEN_UNUSED EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC PACKET_F16 METHOD<PACKET_F16>(const PACKET_F16& _x) { \
+    return float2half(METHOD<PACKET_F>(half2float(_x)));                                                   \
+  }
+
+namespace Eigen {
+
+struct half;
+
+namespace half_impl {
+
+// We want to use the __half_raw struct from the HIP header file only during the device compile phase.
+// This is required because of a quirk in the way TensorFlow GPU builds are done.
+// When compiling TensorFlow source code with GPU support, files that
+//  * contain GPU kernels (i.e. *.cu.cc files) are compiled via hipcc
+//  * do not contain GPU kernels ( i.e. *.cc files) are compiled via gcc (typically)
+//
+// Tensorflow uses the Eigen::half type as its FP16 type, and there are functions that
+//  * are defined in a file that gets compiled via hipcc AND
+//  * have Eigen::half as a pass-by-value argument AND
+//  * are called in a file that gets compiled via gcc
+//
+// In the scenario described above the caller and callee will see different versions
+// of the Eigen::half base class __half_raw, and they will be compiled by different compilers
+//
+// There appears to be an ABI mismatch between gcc and clang (which is called by hipcc) that results in
+// the callee getting corrupted values for the Eigen::half argument.
+//
+// Making the host side compile phase of hipcc use the same Eigen::half impl, as the gcc compile, resolves
+// this error, and hence the following convoluted #if condition
+#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
+
+// Make our own __half_raw definition that is similar to CUDA's.
+struct __half_raw {
+  struct construct_from_rep_tag {};
+#if (defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE))
+  // Eigen::half can be used as the datatype for shared memory declarations (in Eigen and TF)
+  // The element type for shared memory cannot have non-trivial constructors
+  // and hence the following special casing (which skips the zero-initilization).
+  // Note that this check gets done even in the host compilation phase, and
+  // hence the need for this
+  EIGEN_DEVICE_FUNC __half_raw() {}
+#else
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw() : x(0) {}
+#endif
+
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  explicit EIGEN_DEVICE_FUNC __half_raw(numext::uint16_t raw) : x(numext::bit_cast<__fp16>(raw)) {}
+  EIGEN_DEVICE_FUNC constexpr __half_raw(construct_from_rep_tag, __fp16 rep) : x{rep} {}
+  __fp16 x;
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  explicit EIGEN_DEVICE_FUNC __half_raw(numext::uint16_t raw) : x(numext::bit_cast<_Float16>(raw)) {}
+  EIGEN_DEVICE_FUNC constexpr __half_raw(construct_from_rep_tag, _Float16 rep) : x{rep} {}
+  _Float16 x;
+#else
+  explicit EIGEN_DEVICE_FUNC constexpr __half_raw(numext::uint16_t raw) : x(raw) {}
+  EIGEN_DEVICE_FUNC constexpr __half_raw(construct_from_rep_tag, numext::uint16_t rep) : x{rep} {}
+  numext::uint16_t x;
+#endif
+};
+
+#elif defined(EIGEN_HAS_HIP_FP16)
+// HIP GPU compile phase: nothing to do here.
+// HIP fp16 header file has a definition for __half_raw
+#elif defined(EIGEN_HAS_CUDA_FP16)
+
+// CUDA GPU compile phase.
+#if EIGEN_CUDA_SDK_VER < 90000
+// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw
+typedef __half __half_raw;
+#endif  // defined(EIGEN_HAS_CUDA_FP16)
+
+#elif defined(SYCL_DEVICE_ONLY)
+typedef cl::sycl::half __half_raw;
+#endif
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff);
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h);
+
+struct half_base : public __half_raw {
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base() {}
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half_raw& h) : __half_raw(h) {}
+
+#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_HAS_HIP_FP16)
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) { x = __half_as_ushort(h); }
+#elif defined(EIGEN_HAS_CUDA_FP16)
+#if EIGEN_CUDA_SDK_VER >= 90000
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {}
+#endif
+#endif
+#endif
+};
+
+}  // namespace half_impl
+
+// Class definition.
+struct half : public half_impl::half_base {
+  // Writing this out as separate #if-else blocks to make the code easier to follow
+  // The same applies to most #if-else blocks in this file
+#if !defined(EIGEN_HAS_GPU_FP16) || !defined(EIGEN_GPU_COMPILE_PHASE)
+  // Use the same base class for the following two scenarios
+  // * when compiling without GPU support enabled
+  // * during host compile phase when compiling with GPU support enabled
+  typedef half_impl::__half_raw __half_raw;
+#elif defined(EIGEN_HAS_HIP_FP16)
+  // Nothing to do here
+  // HIP fp16 header file has a definition for __half_raw
+#elif defined(EIGEN_HAS_CUDA_FP16)
+// Note that EIGEN_CUDA_SDK_VER is set to 0 even when compiling with HIP, so
+// (EIGEN_CUDA_SDK_VER < 90000) is true even for HIP!  So keeping this within
+// #if defined(EIGEN_HAS_CUDA_FP16) is needed
+#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+  typedef half_impl::__half_raw __half_raw;
+#endif
+#endif
+
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half() {}
+
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half_raw& h) : half_impl::half_base(h) {}
+
+#if defined(EIGEN_HAS_GPU_FP16)
+#if defined(EIGEN_HAS_HIP_FP16)
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
+#elif defined(EIGEN_HAS_CUDA_FP16)
+#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(const __half& h) : half_impl::half_base(h) {}
+#endif
+#endif
+#endif
+
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(__fp16 b)
+      : half(__half_raw(__half_raw::construct_from_rep_tag(), b)) {}
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(_Float16 b)
+      : half(__half_raw(__half_raw::construct_from_rep_tag(), b)) {}
+#endif
+
+  explicit EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR half(bool b)
+      : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 0x3c00 : 0)) {}
+  template <class T>
+  explicit EIGEN_DEVICE_FUNC half(T val)
+      : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(val))) {}
+  explicit EIGEN_DEVICE_FUNC half(float f) : half_impl::half_base(half_impl::float_to_half_rtne(f)) {}
+
+  // Following the convention of numpy, converting between complex and
+  // float will lead to loss of imag value.
+  template <typename RealScalar>
+  explicit EIGEN_DEVICE_FUNC half(std::complex<RealScalar> c)
+      : half_impl::half_base(half_impl::float_to_half_rtne(static_cast<float>(c.real()))) {}
+
+  EIGEN_DEVICE_FUNC operator float() const {  // NOLINT: Allow implicit conversion to float, because it is lossless.
+    return half_impl::half_to_float(*this);
+  }
+
+#if defined(EIGEN_HAS_GPU_FP16) && !defined(EIGEN_GPU_COMPILE_PHASE)
+  EIGEN_DEVICE_FUNC operator __half() const {
+    ::__half_raw hr;
+    hr.x = x;
+    return __half(hr);
+  }
+#endif
+};
+
+// TODO(majnemer): Get rid of this once we can rely on C++17 inline variables do
+// solve the ODR issue.
+namespace half_impl {
+template <typename = void>
+struct numeric_limits_half_impl {
+  static constexpr const bool is_specialized = true;
+  static constexpr const bool is_signed = true;
+  static constexpr const bool is_integer = false;
+  static constexpr const bool is_exact = false;
+  static constexpr const bool has_infinity = true;
+  static constexpr const bool has_quiet_NaN = true;
+  static constexpr const bool has_signaling_NaN = true;
+  EIGEN_DIAGNOSTICS(push)
+  EIGEN_DISABLE_DEPRECATED_WARNING
+  static constexpr const std::float_denorm_style has_denorm = std::denorm_present;
+  static constexpr const bool has_denorm_loss = false;
+  EIGEN_DIAGNOSTICS(pop)
+  static constexpr const std::float_round_style round_style = std::round_to_nearest;
+  static constexpr const bool is_iec559 = true;
+  // The C++ standard defines this as "true if the set of values representable
+  // by the type is finite." Half has finite precision.
+  static constexpr const bool is_bounded = true;
+  static constexpr const bool is_modulo = false;
+  static constexpr const int digits = 11;
+  static constexpr const int digits10 =
+      3;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+  static constexpr const int max_digits10 =
+      5;  // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html
+  static constexpr const int radix = std::numeric_limits<float>::radix;
+  static constexpr const int min_exponent = -13;
+  static constexpr const int min_exponent10 = -4;
+  static constexpr const int max_exponent = 16;
+  static constexpr const int max_exponent10 = 4;
+  static constexpr const bool traps = std::numeric_limits<float>::traps;
+  // IEEE754: "The implementer shall choose how tininess is detected, but shall
+  // detect tininess in the same way for all operations in radix two"
+  static constexpr const bool tinyness_before = std::numeric_limits<float>::tinyness_before;
+
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half(min)() { return Eigen::half_impl::raw_uint16_to_half(0x0400); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half(max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x1400); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half round_error() { return Eigen::half_impl::raw_uint16_to_half(0x3800); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7d00); }
+  static _EIGEN_MAYBE_CONSTEXPR Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x0001); }
+};
+
+template <typename T>
+constexpr const bool numeric_limits_half_impl<T>::is_specialized;
+template <typename T>
+constexpr const bool numeric_limits_half_impl<T>::is_signed;
+template <typename T>
+constexpr const bool numeric_limits_half_impl<T>::is_integer;
+template <typename T>
+constexpr const bool numeric_limits_half_impl<T>::is_exact;
+template <typename T>
+constexpr const bool numeric_limits_half_impl<T>::has_infinity;
+template <typename T>
+constexpr const bool numeric_limits_half_impl<T>::has_quiet_NaN;
+template <typename T>
+constexpr const bool numeric_limits_half_impl<T>::has_signaling_NaN;
+EIGEN_DIAGNOSTICS(push)
+EIGEN_DISABLE_DEPRECATED_WARNING
+template <typename T>
+constexpr const std::float_denorm_style numeric_limits_half_impl<T>::has_denorm;
+template <typename T>
+constexpr const bool numeric_limits_half_impl<T>::has_denorm_loss;
+EIGEN_DIAGNOSTICS(pop)
+template <typename T>
+constexpr const std::float_round_style numeric_limits_half_impl<T>::round_style;
+template <typename T>
+constexpr const bool numeric_limits_half_impl<T>::is_iec559;
+template <typename T>
+constexpr const bool numeric_limits_half_impl<T>::is_bounded;
+template <typename T>
+constexpr const bool numeric_limits_half_impl<T>::is_modulo;
+template <typename T>
+constexpr const int numeric_limits_half_impl<T>::digits;
+template <typename T>
+constexpr const int numeric_limits_half_impl<T>::digits10;
+template <typename T>
+constexpr const int numeric_limits_half_impl<T>::max_digits10;
+template <typename T>
+constexpr const int numeric_limits_half_impl<T>::radix;
+template <typename T>
+constexpr const int numeric_limits_half_impl<T>::min_exponent;
+template <typename T>
+constexpr const int numeric_limits_half_impl<T>::min_exponent10;
+template <typename T>
+constexpr const int numeric_limits_half_impl<T>::max_exponent;
+template <typename T>
+constexpr const int numeric_limits_half_impl<T>::max_exponent10;
+template <typename T>
+constexpr const bool numeric_limits_half_impl<T>::traps;
+template <typename T>
+constexpr const bool numeric_limits_half_impl<T>::tinyness_before;
+}  // end namespace half_impl
+}  // end namespace Eigen
+
+namespace std {
+// If std::numeric_limits<T> is specialized, should also specialize
+// std::numeric_limits<const T>, std::numeric_limits<volatile T>, and
+// std::numeric_limits<const volatile T>
+// https://stackoverflow.com/a/16519653/
+template <>
+class numeric_limits<Eigen::half> : public Eigen::half_impl::numeric_limits_half_impl<> {};
+template <>
+class numeric_limits<const Eigen::half> : public numeric_limits<Eigen::half> {};
+template <>
+class numeric_limits<volatile Eigen::half> : public numeric_limits<Eigen::half> {};
+template <>
+class numeric_limits<const volatile Eigen::half> : public numeric_limits<Eigen::half> {};
+}  // end namespace std
+
+namespace Eigen {
+
+namespace half_impl {
+
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+    (defined(EIGEN_HAS_HIP_FP16) && defined(HIP_DEVICE_COMPILE))
+// Note: We deliberately do *not* define this to 1 even if we have Arm's native
+// fp16 type since GPU half types are rather different from native CPU half types.
+#define EIGEN_HAS_NATIVE_GPU_FP16
+#endif
+
+// Intrinsics for native fp16 support. Note that on current hardware,
+// these are no faster than fp32 arithmetic (you need to use the half2
+// versions to get the ALU speed increased), but you do save the
+// conversion steps back and forth.
+
+#if defined(EIGEN_HAS_NATIVE_GPU_FP16)
+EIGEN_STRONG_INLINE __device__ half operator+(const half& a, const half& b) {
+#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
+  return __hadd(::__half(a), ::__half(b));
+#else
+  return __hadd(a, b);
+#endif
+}
+EIGEN_STRONG_INLINE __device__ half operator*(const half& a, const half& b) { return __hmul(a, b); }
+EIGEN_STRONG_INLINE __device__ half operator-(const half& a, const half& b) { return __hsub(a, b); }
+EIGEN_STRONG_INLINE __device__ half operator/(const half& a, const half& b) {
+#if defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER >= 90000
+  return __hdiv(a, b);
+#else
+  float num = __half2float(a);
+  float denom = __half2float(b);
+  return __float2half(num / denom);
+#endif
+}
+EIGEN_STRONG_INLINE __device__ half operator-(const half& a) { return __hneg(a); }
+EIGEN_STRONG_INLINE __device__ half& operator+=(half& a, const half& b) {
+  a = a + b;
+  return a;
+}
+EIGEN_STRONG_INLINE __device__ half& operator*=(half& a, const half& b) {
+  a = a * b;
+  return a;
+}
+EIGEN_STRONG_INLINE __device__ half& operator-=(half& a, const half& b) {
+  a = a - b;
+  return a;
+}
+EIGEN_STRONG_INLINE __device__ half& operator/=(half& a, const half& b) {
+  a = a / b;
+  return a;
+}
+EIGEN_STRONG_INLINE __device__ bool operator==(const half& a, const half& b) { return __heq(a, b); }
+EIGEN_STRONG_INLINE __device__ bool operator!=(const half& a, const half& b) { return __hne(a, b); }
+EIGEN_STRONG_INLINE __device__ bool operator<(const half& a, const half& b) { return __hlt(a, b); }
+EIGEN_STRONG_INLINE __device__ bool operator<=(const half& a, const half& b) { return __hle(a, b); }
+EIGEN_STRONG_INLINE __device__ bool operator>(const half& a, const half& b) { return __hgt(a, b); }
+EIGEN_STRONG_INLINE __device__ bool operator>=(const half& a, const half& b) { return __hge(a, b); }
+
+#endif  // EIGEN_HAS_NATIVE_GPU_FP16
+
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE)
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, const half& b) { return half(vaddh_f16(a.x, b.x)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator*(const half& a, const half& b) { return half(vmulh_f16(a.x, b.x)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a, const half& b) { return half(vsubh_f16(a.x, b.x)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, const half& b) { return half(vdivh_f16(a.x, b.x)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a) { return half(vnegh_f16(a.x)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator+=(half& a, const half& b) {
+  a = half(vaddh_f16(a.x, b.x));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator*=(half& a, const half& b) {
+  a = half(vmulh_f16(a.x, b.x));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator-=(half& a, const half& b) {
+  a = half(vsubh_f16(a.x, b.x));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) {
+  a = half(vdivh_f16(a.x, b.x));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, const half& b) { return vceqh_f16(a.x, b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) { return !vceqh_f16(a.x, b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) { return vclth_f16(a.x, b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return vcleh_f16(a.x, b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return vcgth_f16(a.x, b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return vcgeh_f16(a.x, b.x); }
+
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16) && !defined(EIGEN_GPU_COMPILE_PHASE)
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, const half& b) { return half(a.x + b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator*(const half& a, const half& b) { return half(a.x * b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a, const half& b) { return half(a.x - b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, const half& b) { return half(a.x / b.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a) { return half(-a.x); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator+=(half& a, const half& b) {
+  a = a + b;
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator*=(half& a, const half& b) {
+  a = a * b;
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator-=(half& a, const half& b) {
+  a = a - b;
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) {
+  a = a / b;
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, const half& b) { return a.x == b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) { return a.x != b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) { return a.x < b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) { return a.x <= b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) { return a.x > b.x; }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) { return a.x >= b.x; }
+
+// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
+// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
+// of the functions, while the latter can only deal with one of them.
+#elif !defined(EIGEN_HAS_NATIVE_GPU_FP16) || (EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)  // Emulate support for half floats
+
+#if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
+// We need to provide emulated *host-side* FP16 operators for clang.
+#pragma push_macro("EIGEN_DEVICE_FUNC")
+#undef EIGEN_DEVICE_FUNC
+#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_HAS_NATIVE_GPU_FP16)
+#define EIGEN_DEVICE_FUNC __host__
+#else  // both host and device need emulated ops.
+#define EIGEN_DEVICE_FUNC __host__ __device__
+#endif
+#endif
+
+// Definitions for CPUs and older HIP+CUDA, mostly working through conversion
+// to/from fp32.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator+(const half& a, const half& b) { return half(float(a) + float(b)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator*(const half& a, const half& b) { return half(float(a) * float(b)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a, const half& b) { return half(float(a) - float(b)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, const half& b) { return half(float(a) / float(b)); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator-(const half& a) {
+  half result;
+  result.x = a.x ^ 0x8000;
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator+=(half& a, const half& b) {
+  a = half(float(a) + float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator*=(half& a, const half& b) {
+  a = half(float(a) * float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator-=(half& a, const half& b) {
+  a = half(float(a) - float(b));
+  return a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator/=(half& a, const half& b) {
+  a = half(float(a) / float(b));
+  return a;
+}
+
+// Non-negative floating point numbers have a monotonic mapping to non-negative integers.
+// This property allows floating point numbers to be reinterpreted as integers for comparisons, which is useful if there
+// is no native floating point comparison operator. Floating point signedness is handled by the sign-magnitude
+// representation, whereas integers typically use two's complement. Converting the bit pattern from sign-magnitude to
+// two's complement allows the transformed bit patterns be compared as signed integers. All edge cases (+/-0 and +/-
+// infinity) are handled automatically, except NaN.
+//
+// fp16 uses 1 sign bit, 5 exponent bits, and 10 mantissa bits. The bit pattern conveys NaN when all the exponent
+// bits (5) are set, and at least one mantissa bit is set. The sign bit is irrelevant for determining NaN. To check for
+// NaN, clear the sign bit and check if the integral representation is greater than 01111100000000. To test
+// for non-NaN, clear the sign bit and check if the integeral representation is less than or equal to 01111100000000.
+
+// convert sign-magnitude representation to two's complement
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC int16_t mapToSigned(uint16_t a) {
+  constexpr uint16_t kAbsMask = (1 << 15) - 1;
+  // If the sign bit is set, clear the sign bit and return the (integer) negation. Otherwise, return the input.
+  return (a >> 15) ? -(a & kAbsMask) : a;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool isOrdered(const half& a, const half& b) {
+  constexpr uint16_t kInf = ((1 << 5) - 1) << 10;
+  constexpr uint16_t kAbsMask = (1 << 15) - 1;
+  return numext::maxi(a.x & kAbsMask, b.x & kAbsMask) <= kInf;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator==(const half& a, const half& b) {
+  bool result = mapToSigned(a.x) == mapToSigned(b.x);
+  result &= isOrdered(a, b);
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator!=(const half& a, const half& b) { return !(a == b); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<(const half& a, const half& b) {
+  bool result = mapToSigned(a.x) < mapToSigned(b.x);
+  result &= isOrdered(a, b);
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator<=(const half& a, const half& b) {
+  bool result = mapToSigned(a.x) <= mapToSigned(b.x);
+  result &= isOrdered(a, b);
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>(const half& a, const half& b) {
+  bool result = mapToSigned(a.x) > mapToSigned(b.x);
+  result &= isOrdered(a, b);
+  return result;
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator>=(const half& a, const half& b) {
+  bool result = mapToSigned(a.x) >= mapToSigned(b.x);
+  result &= isOrdered(a, b);
+  return result;
+}
+
+#if EIGEN_COMP_CLANG && defined(EIGEN_GPUCC)
+#pragma pop_macro("EIGEN_DEVICE_FUNC")
+#endif
+
+#endif  // Emulate support for half floats
+
+// Division by an index. Do it in full float precision to avoid accuracy
+// issues in converting the denominator to half.
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator/(const half& a, Index b) {
+  return half(static_cast<float>(a) / static_cast<float>(b));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator++(half& a) {
+  a += half(1);
+  return a;
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator--(half& a) {
+  a -= half(1);
+  return a;
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator++(half& a, int) {
+  half original_value = a;
+  ++a;
+  return original_value;
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator--(half& a, int) {
+  half original_value = a;
+  --a;
+  return original_value;
+}
+
+// Conversion routines, including fallbacks for the host or older CUDA.
+// Note that newer Intel CPUs (Haswell or newer) have vectorized versions of
+// these in hardware. If we need more performance on older/other CPUs, they are
+// also possible to vectorize directly.
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR __half_raw raw_uint16_to_half(numext::uint16_t x) {
+  // We cannot simply do a "return __half_raw(x)" here, because __half_raw is union type
+  // in the hip_fp16 header file, and that will trigger a compile error
+  // On the other hand, having anything but a return statement also triggers a compile error
+  // because this is constexpr function.
+  // Fortunately, since we need to disable EIGEN_CONSTEXPR for GPU anyway, we can get out
+  // of this catch22 by having separate bodies for GPU / non GPU
+#if defined(EIGEN_HAS_GPU_FP16)
+  __half_raw h;
+  h.x = x;
+  return h;
+#else
+  return __half_raw(x);
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC numext::uint16_t raw_half_as_uint16(const __half_raw& h) {
+  // HIP/CUDA/Default have a member 'x' of type uint16_t.
+  // For ARM64 native half, the member 'x' is of type __fp16, so we need to bit-cast.
+  // For SYCL, cl::sycl::half is _Float16, so cast directly.
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  return numext::bit_cast<numext::uint16_t>(h.x);
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  return numext::bit_cast<numext::uint16_t>(h.x);
+#elif defined(SYCL_DEVICE_ONLY)
+  return numext::bit_cast<numext::uint16_t>(h);
+#else
+  return h.x;
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+  __half tmp_ff = __float2half(ff);
+  return *(__half_raw*)&tmp_ff;
+
+#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  __half_raw h;
+  h.x = static_cast<__fp16>(ff);
+  return h;
+
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  __half_raw h;
+  h.x = static_cast<_Float16>(ff);
+  return h;
+
+#elif defined(EIGEN_HAS_FP16_C)
+  __half_raw h;
+#if EIGEN_COMP_MSVC
+  // MSVC does not have scalar instructions.
+  h.x = _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(ff), 0), 0);
+#else
+  h.x = _cvtss_sh(ff, 0);
+#endif
+  return h;
+
+#else
+  uint32_t f_bits = Eigen::numext::bit_cast<uint32_t>(ff);
+  const uint32_t f32infty_bits = {255 << 23};
+  const uint32_t f16max_bits = {(127 + 16) << 23};
+  const uint32_t denorm_magic_bits = {((127 - 15) + (23 - 10) + 1) << 23};
+  const uint32_t sign_mask = 0x80000000u;
+  __half_raw o;
+  o.x = static_cast<uint16_t>(0x0u);
+
+  const uint32_t sign = f_bits & sign_mask;
+  f_bits ^= sign;
+
+  // NOTE all the integer compares in this function can be safely
+  // compiled into signed compares since all operands are below
+  // 0x80000000. Important if you want fast straight SSE2 code
+  // (since there's no unsigned PCMPGTD).
+
+  if (f_bits >= f16max_bits) {                         // result is Inf or NaN (all exponent bits set)
+    o.x = (f_bits > f32infty_bits) ? 0x7e00 : 0x7c00;  // NaN->qNaN and Inf->Inf
+  } else {                                             // (De)normalized number or zero
+    if (f_bits < (113 << 23)) {                        // resulting FP16 is subnormal or zero
+      // use a magic value to align our 10 mantissa bits at the bottom of
+      // the float. as long as FP addition is round-to-nearest-even this
+      // just works.
+      f_bits = Eigen::numext::bit_cast<uint32_t>(Eigen::numext::bit_cast<float>(f_bits) +
+                                                 Eigen::numext::bit_cast<float>(denorm_magic_bits));
+
+      // and one integer subtract of the bias later, we have our final float!
+      o.x = static_cast<numext::uint16_t>(f_bits - denorm_magic_bits);
+    } else {
+      const uint32_t mant_odd = (f_bits >> 13) & 1;  // resulting mantissa is odd
+
+      // update exponent, rounding bias part 1
+      // Equivalent to `f.u += ((unsigned int)(15 - 127) << 23) + 0xfff`, but
+      // without arithmetic overflow.
+      f_bits += 0xc8000fffU;
+      // rounding bias part 2
+      f_bits += mant_odd;
+      // take the bits!
+      o.x = static_cast<numext::uint16_t>(f_bits >> 13);
+    }
+  }
+
+  o.x |= static_cast<numext::uint16_t>(sign >> 16);
+  return o;
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+  return __half2float(h);
+#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  return static_cast<float>(h.x);
+#elif defined(EIGEN_HAS_FP16_C)
+#if EIGEN_COMP_MSVC
+  // MSVC does not have scalar instructions.
+  return _mm_cvtss_f32(_mm_cvtph_ps(_mm_set1_epi16(h.x)));
+#else
+  return _cvtsh_ss(h.x);
+#endif
+#else
+  const float magic = Eigen::numext::bit_cast<float>(static_cast<uint32_t>(113 << 23));
+  const uint32_t shifted_exp = 0x7c00 << 13;  // exponent mask after shift
+  uint32_t o_bits = (h.x & 0x7fff) << 13;     // exponent/mantissa bits
+  const uint32_t exp = shifted_exp & o_bits;  // just the exponent
+  o_bits += (127 - 15) << 23;                 // exponent adjust
+
+  // handle exponent special cases
+  if (exp == shifted_exp) {      // Inf/NaN?
+    o_bits += (128 - 16) << 23;  // extra exp adjust
+  } else if (exp == 0) {         // Zero/Denormal?
+    o_bits += 1 << 23;           // extra exp adjust
+    // renormalize
+    o_bits = Eigen::numext::bit_cast<uint32_t>(Eigen::numext::bit_cast<float>(o_bits) - magic);
+  }
+
+  o_bits |= (h.x & 0x8000) << 16;  // sign bit
+  return Eigen::numext::bit_cast<float>(o_bits);
+#endif
+}
+
+// --- standard functions ---
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isinf)(const half& a) {
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) == 0x7c00;
+#else
+  return (a.x & 0x7fff) == 0x7c00;
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isnan)(const half& a) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530) || \
+    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+  return __hisnan(a);
+#elif defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) > 0x7c00;
+#else
+  return (a.x & 0x7fff) > 0x7c00;
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool(isfinite)(const half& a) {
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC) || defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  return (numext::bit_cast<numext::uint16_t>(a.x) & 0x7fff) < 0x7c00;
+#else
+  return (a.x & 0x7fff) < 0x7c00;
+#endif
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) {
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  return half(vabsh_f16(a.x));
+#elif defined(EIGEN_HAS_BUILTIN_FLOAT16)
+  half result;
+  result.x =
+      numext::bit_cast<_Float16>(static_cast<numext::uint16_t>(numext::bit_cast<numext::uint16_t>(a.x) & 0x7FFF));
+  return result;
+#else
+  half result;
+  result.x = a.x & 0x7FFF;
+  return result;
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) {
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+    defined(EIGEN_HIP_DEVICE_COMPILE)
+  return half(hexp(a));
+#else
+  return half(::expf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp2(const half& a) {
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+    defined(EIGEN_HIP_DEVICE_COMPILE)
+  return half(hexp2(a));
+#else
+  return half(::exp2f(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half expm1(const half& a) { return half(numext::expm1(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && \
+     EIGEN_CUDA_ARCH >= 530) ||                                                                 \
+    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+  return half(hlog(a));
+#else
+  return half(::logf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log1p(const half& a) { return half(numext::log1p(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { return half(::log10f(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log2(const half& a) {
+  return half(static_cast<float>(EIGEN_LOG2E) * ::logf(float(a)));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) {
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530) || \
+    defined(EIGEN_HIP_DEVICE_COMPILE)
+  return half(hsqrt(a));
+#else
+  return half(::sqrtf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) {
+  return half(::powf(float(a), float(b)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan2(const half& a, const half& b) {
+  return half(::atan2f(float(a), float(b)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sin(const half& a) { return half(::sinf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half cos(const half& a) { return half(::cosf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tan(const half& a) { return half(::tanf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { return half(::tanhf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half asin(const half& a) { return half(::asinf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half acos(const half& a) { return half(::acosf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atan(const half& a) { return half(::atanf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half atanh(const half& a) { return half(::atanhf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) {
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
+    defined(EIGEN_HIP_DEVICE_COMPILE)
+  return half(hfloor(a));
+#else
+  return half(::floorf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) {
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300) || \
+    defined(EIGEN_HIP_DEVICE_COMPILE)
+  return half(hceil(a));
+#else
+  return half(::ceilf(float(a)));
+#endif
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half rint(const half& a) { return half(::rintf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half round(const half& a) { return half(::roundf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half trunc(const half& a) { return half(::truncf(float(a))); }
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half fmod(const half& a, const half& b) {
+  return half(::fmodf(float(a), float(b)));
+}
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(min)(const half& a, const half& b) { return b < a ? b : a; }
+
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half(max)(const half& a, const half& b) { return a < b ? b : a; }
+
+EIGEN_DEVICE_FUNC inline half fma(const half& a, const half& b, const half& c) {
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+  return half(vfmah_f16(c.x, a.x, b.x));
+#elif defined(EIGEN_VECTORIZE_AVX512FP16)
+  // Reduces to vfmadd213sh.
+  return half(_mm_cvtsh_h(_mm_fmadd_ph(_mm_set_sh(a.x), _mm_set_sh(b.x), _mm_set_sh(c.x))));
+#else
+  // Emulate FMA via float.
+  return half(numext::fma(static_cast<float>(a), static_cast<float>(b), static_cast<float>(c)));
+#endif
+}
+
+#ifndef EIGEN_NO_IO
+EIGEN_ALWAYS_INLINE std::ostream& operator<<(std::ostream& os, const half& v) {
+  os << static_cast<float>(v);
+  return os;
+}
+#endif
+
+}  // end namespace half_impl
+
+// import Eigen::half_impl::half into Eigen namespace
+// using half_impl::half;
+
+namespace internal {
+
+template <>
+struct is_arithmetic<half> {
+  enum { value = true };
+};
+
+template <>
+struct random_impl<half> {
+  enum : int { MantissaBits = 10 };
+  using Impl = random_impl<float>;
+  static EIGEN_DEVICE_FUNC inline half run(const half& x, const half& y) {
+    float result = Impl::run(x, y, MantissaBits);
+    return half(result);
+  }
+  static EIGEN_DEVICE_FUNC inline half run() {
+    float result = Impl::run(MantissaBits);
+    return half(result);
+  }
+};
+
+}  // end namespace internal
+
+template <>
+struct NumTraits<Eigen::half> : GenericNumTraits<Eigen::half> {
+  enum { IsSigned = true, IsInteger = false, IsComplex = false, RequireInitialization = false };
+
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half epsilon() {
+    return half_impl::raw_uint16_to_half(0x0800);
+  }
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half dummy_precision() {
+    return half_impl::raw_uint16_to_half(0x211f);  //  Eigen::half(1e-2f);
+  }
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half highest() {
+    return half_impl::raw_uint16_to_half(0x7bff);
+  }
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half lowest() {
+    return half_impl::raw_uint16_to_half(0xfbff);
+  }
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half infinity() {
+    return half_impl::raw_uint16_to_half(0x7c00);
+  }
+  EIGEN_DEVICE_FUNC _EIGEN_MAYBE_CONSTEXPR static EIGEN_STRONG_INLINE Eigen::half quiet_NaN() {
+    return half_impl::raw_uint16_to_half(0x7e00);
+  }
+};
+
+}  // end namespace Eigen
+
+#undef _EIGEN_MAYBE_CONSTEXPR
+
+namespace Eigen {
+namespace numext {
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(const Eigen::half& h) {
+  return (half_impl::isnan)(h);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(const Eigen::half& h) {
+  return (half_impl::isinf)(h);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(const Eigen::half& h) {
+  return (half_impl::isfinite)(h);
+}
+
+#endif
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bit_cast<Eigen::half, uint16_t>(const uint16_t& src) {
+  return Eigen::half(Eigen::half_impl::raw_uint16_to_half(src));
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC uint16_t bit_cast<uint16_t, Eigen::half>(const Eigen::half& src) {
+  return Eigen::half_impl::raw_half_as_uint16(src);
+}
+
+// Specialize multiply-add to match packet operations and reduce conversions to/from float.
+template<>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half madd<Eigen::half>(const Eigen::half& x, const Eigen::half& y, const Eigen::half& z) {
+  return Eigen::half(static_cast<float>(x) * static_cast<float>(y) + static_cast<float>(z));
+}
+
+}  // namespace numext
+}  // namespace Eigen
+
+// Add the missing shfl* intrinsics.
+// The __shfl* functions are only valid on HIP or _CUDA_ARCH_ >= 300.
+//   CUDA defines them for (__CUDA_ARCH__ >= 300 || !defined(__CUDA_ARCH__))
+//
+// HIP and CUDA prior to SDK 9.0 define
+//    __shfl, __shfl_up, __shfl_down, __shfl_xor for int and float
+// CUDA since 9.0 deprecates those and instead defines
+//    __shfl_sync, __shfl_up_sync, __shfl_down_sync, __shfl_xor_sync,
+//    with native support for __half and __nv_bfloat16
+//
+// Note that the following are __device__ - only functions.
+#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 300)) || defined(EIGEN_HIPCC)
+
+#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDA_SDK_VER >= 90000
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_sync(unsigned mask, Eigen::half var, int srcLane,
+                                                       int width = warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_sync(mask, h, srcLane, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up_sync(unsigned mask, Eigen::half var, unsigned int delta,
+                                                          int width = warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_up_sync(mask, h, delta, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down_sync(unsigned mask, Eigen::half var, unsigned int delta,
+                                                            int width = warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_down_sync(mask, h, delta, width));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor_sync(unsigned mask, Eigen::half var, int laneMask,
+                                                           int width = warpSize) {
+  const __half h = var;
+  return static_cast<Eigen::half>(__shfl_xor_sync(mask, h, laneMask, width));
+}
+
+#else  // HIP or CUDA SDK < 9.0
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl(Eigen::half var, int srcLane, int width = warpSize) {
+  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl(ivar, srcLane, width)));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_up(Eigen::half var, unsigned int delta, int width = warpSize) {
+  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_up(ivar, delta, width)));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_down(Eigen::half var, unsigned int delta, int width = warpSize) {
+  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_down(ivar, delta, width)));
+}
+
+__device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width = warpSize) {
+  const int ivar = static_cast<int>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(var));
+  return Eigen::numext::bit_cast<Eigen::half>(static_cast<Eigen::numext::uint16_t>(__shfl_xor(ivar, laneMask, width)));
+}
+
+#endif  // HIP vs CUDA
+#endif  // __shfl*
+
+// ldg() has an overload for __half_raw, but we also need one for Eigen::half.
+#if (defined(EIGEN_CUDACC) && (!defined(EIGEN_CUDA_ARCH) || EIGEN_CUDA_ARCH >= 350)) || defined(EIGEN_HIPCC)
+EIGEN_STRONG_INLINE __device__ Eigen::half __ldg(const Eigen::half* ptr) {
+  return Eigen::half_impl::raw_uint16_to_half(__ldg(reinterpret_cast<const Eigen::numext::uint16_t*>(ptr)));
+}
+#endif  // __ldg
+
+#if EIGEN_HAS_STD_HASH
+namespace std {
+template <>
+struct hash<Eigen::half> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::size_t operator()(const Eigen::half& a) const {
+    return static_cast<std::size_t>(Eigen::numext::bit_cast<Eigen::numext::uint16_t>(a));
+  }
+};
+}  // end namespace std
+#endif
+
+namespace Eigen {
+namespace internal {
+
+template <>
+struct cast_impl<float, half> {
+  EIGEN_DEVICE_FUNC static inline half run(const float& a) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+    return __float2half(a);
+#else
+    return half(a);
+#endif
+  }
+};
+
+template <>
+struct cast_impl<int, half> {
+  EIGEN_DEVICE_FUNC static inline half run(const int& a) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+    return __float2half(static_cast<float>(a));
+#else
+    return half(static_cast<float>(a));
+#endif
+  }
+};
+
+template <>
+struct cast_impl<half, float> {
+  EIGEN_DEVICE_FUNC static inline float run(const half& a) {
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+    return __half2float(a);
+#else
+    return static_cast<float>(a);
+#endif
+  }
+};
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_HALF_H
diff --git a/inst/include/Eigen/src/Core/arch/Default/Settings.h b/inst/include/Eigen/src/Core/arch/Default/Settings.h
index 097373c8..7e3a970a 100644
--- a/inst/include/Eigen/src/Core/arch/Default/Settings.h
+++ b/inst/include/Eigen/src/Core/arch/Default/Settings.h
@@ -8,7 +8,6 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-
 /* All the parameters defined in this file can be specialized in the
  * architecture specific files, and/or by the user.
  * More to come... */
@@ -17,33 +16,32 @@
 #define EIGEN_DEFAULT_SETTINGS_H
 
 /** Defines the maximal loop size to enable meta unrolling of loops.
-  * Note that the value here is expressed in Eigen's own notion of "number of FLOPS",
-  * it does not correspond to the number of iterations or the number of instructions
-  */
+ * Note that the value here is expressed in Eigen's own notion of "number of FLOPS",
+ * it does not correspond to the number of iterations or the number of instructions
+ */
 #ifndef EIGEN_UNROLLING_LIMIT
-#define EIGEN_UNROLLING_LIMIT 100
+#define EIGEN_UNROLLING_LIMIT 110
 #endif
 
 /** Defines the threshold between a "small" and a "large" matrix.
-  * This threshold is mainly used to select the proper product implementation.
-  */
+ * This threshold is mainly used to select the proper product implementation.
+ */
 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
 #endif
 
 /** Defines the maximal width of the blocks used in the triangular product and solver
-  * for vectors (level 2 blas xTRMV and xTRSV). The default is 8.
-  */
+ * for vectors (level 2 blas xTRMV and xTRSV). The default is 8.
+ */
 #ifndef EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH
 #define EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH 8
 #endif
 
-
 /** Defines the default number of registers available for that architecture.
-  * Currently it must be 8 or 16. Other values will fail.
-  */
+ * Currently it must be 8 or 16. Other values will fail.
+ */
 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
 #endif
 
-#endif // EIGEN_DEFAULT_SETTINGS_H
+#endif  // EIGEN_DEFAULT_SETTINGS_H
diff --git a/inst/include/Eigen/src/Core/arch/GPU/Complex.h b/inst/include/Eigen/src/Core/arch/GPU/Complex.h
new file mode 100644
index 00000000..fa46aec7
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/GPU/Complex.h
@@ -0,0 +1,244 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2021 C. Antonio Sanchez <cantonios@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_GPU_H
+#define EIGEN_COMPLEX_GPU_H
+
+// Many std::complex methods such as operator+, operator-, operator* and
+// operator/ are not constexpr. Due to this, GCC and older versions of clang do
+// not treat them as device functions and thus Eigen functors making use of
+// these operators fail to compile. Here, we manually specialize these
+// operators and functors for complex types when building for CUDA to enable
+// their use on-device.
+//
+// NOTES:
+//  - Compound assignment operators +=,-=,*=,/=(Scalar) will not work on device,
+//    since they are already specialized in the standard. Using them will result
+//    in silent kernel failures.
+//  - Compiling with MSVC and using +=,-=,*=,/=(std::complex<Scalar>) will lead
+//    to duplicate definition errors, since these are already specialized in
+//    Visual Studio's <complex> header (contrary to the standard).  This is
+//    preferable to removing such definitions, which will lead to silent kernel
+//    failures.
+//  - Compiling with ICC requires defining _USE_COMPLEX_SPECIALIZATION_ prior
+//    to the first inclusion of <complex>.
+
+#if defined(EIGEN_GPUCC) && defined(EIGEN_GPU_COMPILE_PHASE)
+
+// ICC already specializes std::complex<float> and std::complex<double>
+// operators, preventing us from making them device functions here.
+// This will lead to silent runtime errors if the operators are used on device.
+//
+// To allow std::complex operator use on device, define _OVERRIDE_COMPLEX_SPECIALIZATION_
+// prior to first inclusion of <complex>.  This prevents ICC from adding
+// its own specializations, so our custom ones below can be used instead.
+#if !(EIGEN_COMP_ICC && defined(_USE_COMPLEX_SPECIALIZATION_))
+
+// Import Eigen's internal operator specializations.
+#define EIGEN_USING_STD_COMPLEX_OPERATORS           \
+  using Eigen::complex_operator_detail::operator+;  \
+  using Eigen::complex_operator_detail::operator-;  \
+  using Eigen::complex_operator_detail::operator*;  \
+  using Eigen::complex_operator_detail::operator/;  \
+  using Eigen::complex_operator_detail::operator+=; \
+  using Eigen::complex_operator_detail::operator-=; \
+  using Eigen::complex_operator_detail::operator*=; \
+  using Eigen::complex_operator_detail::operator/=; \
+  using Eigen::complex_operator_detail::operator==; \
+  using Eigen::complex_operator_detail::operator!=;
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// Specialized std::complex overloads.
+namespace complex_operator_detail {
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_multiply(const std::complex<T>& a,
+                                                                       const std::complex<T>& b) {
+  const T a_real = numext::real(a);
+  const T a_imag = numext::imag(a);
+  const T b_real = numext::real(b);
+  const T b_imag = numext::imag(b);
+  return std::complex<T>(a_real * b_real - a_imag * b_imag, a_imag * b_real + a_real * b_imag);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_divide_fast(const std::complex<T>& a,
+                                                                          const std::complex<T>& b) {
+  const T a_real = numext::real(a);
+  const T a_imag = numext::imag(a);
+  const T b_real = numext::real(b);
+  const T b_imag = numext::imag(b);
+  const T norm = (b_real * b_real + b_imag * b_imag);
+  return std::complex<T>((a_real * b_real + a_imag * b_imag) / norm, (a_imag * b_real - a_real * b_imag) / norm);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_divide_stable(const std::complex<T>& a,
+                                                                            const std::complex<T>& b) {
+  const T a_real = numext::real(a);
+  const T a_imag = numext::imag(a);
+  const T b_real = numext::real(b);
+  const T b_imag = numext::imag(b);
+  // Smith's complex division (https://arxiv.org/pdf/1210.4539.pdf),
+  // guards against over/under-flow.
+  const bool scale_imag = numext::abs(b_imag) <= numext::abs(b_real);
+  const T rscale = scale_imag ? T(1) : b_real / b_imag;
+  const T iscale = scale_imag ? b_imag / b_real : T(1);
+  const T denominator = b_real * rscale + b_imag * iscale;
+  return std::complex<T>((a_real * rscale + a_imag * iscale) / denominator,
+                         (a_imag * rscale - a_real * iscale) / denominator);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> complex_divide(const std::complex<T>& a,
+                                                                     const std::complex<T>& b) {
+#if EIGEN_FAST_MATH
+  return complex_divide_fast(a, b);
+#else
+  return complex_divide_stable(a, b);
+#endif
+}
+
+// NOTE: We cannot specialize compound assignment operators with Scalar T,
+//         (i.e.  operator@=(const T&), for @=+,-,*,/)
+//       since they are already specialized for float/double/long double within
+//       the standard <complex> header. We also do not specialize the stream
+//       operators.
+#define EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(T)                                                        \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator+(const std::complex<T>& a) { return a; }           \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator-(const std::complex<T>& a) {                       \
+    return std::complex<T>(-numext::real(a), -numext::imag(a));                                                     \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator+(const std::complex<T>& a,                         \
+                                                                  const std::complex<T>& b) {                       \
+    return std::complex<T>(numext::real(a) + numext::real(b), numext::imag(a) + numext::imag(b));                   \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator+(const std::complex<T>& a, const T& b) {           \
+    return std::complex<T>(numext::real(a) + b, numext::imag(a));                                                   \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator+(const T& a, const std::complex<T>& b) {           \
+    return std::complex<T>(a + numext::real(b), numext::imag(b));                                                   \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator-(const std::complex<T>& a,                         \
+                                                                  const std::complex<T>& b) {                       \
+    return std::complex<T>(numext::real(a) - numext::real(b), numext::imag(a) - numext::imag(b));                   \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator-(const std::complex<T>& a, const T& b) {           \
+    return std::complex<T>(numext::real(a) - b, numext::imag(a));                                                   \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator-(const T& a, const std::complex<T>& b) {           \
+    return std::complex<T>(a - numext::real(b), -numext::imag(b));                                                  \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator*(const std::complex<T>& a,                         \
+                                                                  const std::complex<T>& b) {                       \
+    return complex_multiply(a, b);                                                                                  \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator*(const std::complex<T>& a, const T& b) {           \
+    return std::complex<T>(numext::real(a) * b, numext::imag(a) * b);                                               \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator*(const T& a, const std::complex<T>& b) {           \
+    return std::complex<T>(a * numext::real(b), a * numext::imag(b));                                               \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator/(const std::complex<T>& a,                         \
+                                                                  const std::complex<T>& b) {                       \
+    return complex_divide(a, b);                                                                                    \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator/(const std::complex<T>& a, const T& b) {           \
+    return std::complex<T>(numext::real(a) / b, numext::imag(a) / b);                                               \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T> operator/(const T& a, const std::complex<T>& b) {           \
+    return complex_divide(std::complex<T>(a, 0), b);                                                                \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T>& operator+=(std::complex<T>& a, const std::complex<T>& b) { \
+    numext::real_ref(a) += numext::real(b);                                                                         \
+    numext::imag_ref(a) += numext::imag(b);                                                                         \
+    return a;                                                                                                       \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T>& operator-=(std::complex<T>& a, const std::complex<T>& b) { \
+    numext::real_ref(a) -= numext::real(b);                                                                         \
+    numext::imag_ref(a) -= numext::imag(b);                                                                         \
+    return a;                                                                                                       \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T>& operator*=(std::complex<T>& a, const std::complex<T>& b) { \
+    a = complex_multiply(a, b);                                                                                     \
+    return a;                                                                                                       \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<T>& operator/=(std::complex<T>& a, const std::complex<T>& b) { \
+    a = complex_divide(a, b);                                                                                       \
+    return a;                                                                                                       \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator==(const std::complex<T>& a, const std::complex<T>& b) {       \
+    return numext::real(a) == numext::real(b) && numext::imag(a) == numext::imag(b);                                \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator==(const std::complex<T>& a, const T& b) {                     \
+    return numext::real(a) == b && numext::imag(a) == 0;                                                            \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator==(const T& a, const std::complex<T>& b) {                     \
+    return a == numext::real(b) && 0 == numext::imag(b);                                                            \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator!=(const std::complex<T>& a, const std::complex<T>& b) {       \
+    return !(a == b);                                                                                               \
+  }                                                                                                                 \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator!=(const std::complex<T>& a, const T& b) { return !(a == b); } \
+                                                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator!=(const T& a, const std::complex<T>& b) { return !(a == b); }
+
+// Do not specialize for long double, since that reduces to double on device.
+EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(float)
+EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS(double)
+
+#undef EIGEN_CREATE_STD_COMPLEX_OPERATOR_SPECIALIZATIONS
+
+}  // namespace complex_operator_detail
+
+EIGEN_USING_STD_COMPLEX_OPERATORS
+
+namespace numext {
+EIGEN_USING_STD_COMPLEX_OPERATORS
+}  // namespace numext
+
+namespace internal {
+EIGEN_USING_STD_COMPLEX_OPERATORS
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // !(EIGEN_COMP_ICC && _USE_COMPLEX_SPECIALIZATION_)
+
+#endif  // EIGEN_GPUCC && EIGEN_GPU_COMPILE_PHASE
+
+#endif  // EIGEN_COMPLEX_GPU_H
diff --git a/inst/include/Eigen/src/Core/arch/GPU/MathFunctions.h b/inst/include/Eigen/src/Core/arch/GPU/MathFunctions.h
new file mode 100644
index 00000000..81bc8bb5
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/GPU/MathFunctions.h
@@ -0,0 +1,104 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_GPU_H
+#define EIGEN_MATH_FUNCTIONS_GPU_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plog<float4>(const float4& a) {
+  return make_float4(logf(a.x), logf(a.y), logf(a.z), logf(a.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plog<double2>(const double2& a) {
+  using ::log;
+  return make_double2(log(a.x), log(a.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plog1p<float4>(const float4& a) {
+  return make_float4(log1pf(a.x), log1pf(a.y), log1pf(a.z), log1pf(a.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plog1p<double2>(const double2& a) {
+  return make_double2(log1p(a.x), log1p(a.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pexp<float4>(const float4& a) {
+  return make_float4(expf(a.x), expf(a.y), expf(a.z), expf(a.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pexp<double2>(const double2& a) {
+  using ::exp;
+  return make_double2(exp(a.x), exp(a.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pexp2<float4>(const float4& a) {
+  return make_float4(exp2f(a.x), exp2f(a.y), exp2f(a.z), exp2f(a.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pexp2<double2>(const double2& a) {
+  using ::exp;
+  return make_double2(exp2(a.x), exp2(a.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pexpm1<float4>(const float4& a) {
+  return make_float4(expm1f(a.x), expm1f(a.y), expm1f(a.z), expm1f(a.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pexpm1<double2>(const double2& a) {
+  return make_double2(expm1(a.x), expm1(a.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psqrt<float4>(const float4& a) {
+  return make_float4(sqrtf(a.x), sqrtf(a.y), sqrtf(a.z), sqrtf(a.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psqrt<double2>(const double2& a) {
+  using ::sqrt;
+  return make_double2(sqrt(a.x), sqrt(a.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 prsqrt<float4>(const float4& a) {
+  return make_float4(rsqrtf(a.x), rsqrtf(a.y), rsqrtf(a.z), rsqrtf(a.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 prsqrt<double2>(const double2& a) {
+  return make_double2(rsqrt(a.x), rsqrt(a.y));
+}
+
+#endif
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_GPU_H
diff --git a/inst/include/Eigen/src/Core/arch/GPU/PacketMath.h b/inst/include/Eigen/src/Core/arch/GPU/PacketMath.h
new file mode 100644
index 00000000..328b1b93
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/GPU/PacketMath.h
@@ -0,0 +1,1712 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_GPU_H
+#define EIGEN_PACKET_MATH_GPU_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+// Read-only data cached load available.
+#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
+#define EIGEN_GPU_HAS_LDG 1
+#endif
+
+// FP16 math available.
+#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
+#define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
+#endif
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
+#endif
+
+// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
+// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
+// of the functions, while the latter can only deal with one of them.
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
+#define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 1
+#else
+#define EIGEN_HAS_GPU_DEVICE_FUNCTIONS 0
+#endif
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+
+template <>
+struct is_arithmetic<float4> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<double2> {
+  enum { value = true };
+};
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef float4 type;
+  typedef float4 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+
+    HasDiv = 1,
+    HasSin = 0,
+    HasCos = 0,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLGamma = 1,
+    HasDiGamma = 1,
+    HasZeta = 1,
+    HasPolygamma = 1,
+    HasErf = 1,
+    HasErfc = 1,
+    HasNdtri = 1,
+    HasBessel = 1,
+    HasIGamma = 1,
+    HasIGammaDerA = 1,
+    HasGammaSampleDerAlpha = 1,
+    HasIGammac = 1,
+    HasBetaInc = 1,
+
+    HasBlend = 0,
+    HasFloor = 1,
+    HasCmp = EIGEN_HAS_GPU_DEVICE_FUNCTIONS
+  };
+};
+
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef double2 type;
+  typedef double2 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasDiv = 1,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasLGamma = 1,
+    HasDiGamma = 1,
+    HasZeta = 1,
+    HasPolygamma = 1,
+    HasErf = 1,
+    HasErfc = 1,
+    HasNdtri = 1,
+    HasBessel = 1,
+    HasIGamma = 1,
+    HasIGammaDerA = 1,
+    HasGammaSampleDerAlpha = 1,
+    HasIGammac = 1,
+    HasBetaInc = 1,
+    HasBlend = 0,
+  };
+};
+
+template <>
+struct unpacket_traits<float4> {
+  typedef float type;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef float4 half;
+};
+template <>
+struct unpacket_traits<double2> {
+  typedef double type;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef double2 half;
+};
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
+  return make_float4(from, from, from, from);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
+  return make_double2(from, from);
+}
+
+#if EIGEN_HAS_GPU_DEVICE_FUNCTIONS
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a, const float& b) {
+  return __int_as_float(__float_as_int(a) & __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a, const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) & __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a, const float& b) {
+  return __int_as_float(__float_as_int(a) | __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a, const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) | __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a, const float& b) {
+  return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a, const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) ^ __double_as_longlong(b));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a, const float& b) {
+  return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a, const double& b) {
+  return __longlong_as_double(__double_as_longlong(a) & ~__double_as_longlong(b));
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a, const float& b) {
+  return __int_as_float(a == b ? 0xffffffffu : 0u);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a, const double& b) {
+  return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a, const float& b) {
+  return __int_as_float(a < b ? 0xffffffffu : 0u);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a, const double& b) {
+  return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float le_mask(const float& a, const float& b) {
+  return __int_as_float(a <= b ? 0xffffffffu : 0u);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double le_mask(const double& a, const double& b) {
+  return __longlong_as_double(a <= b ? 0xffffffffffffffffull : 0ull);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a, const float4& b) {
+  return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y), bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a, const double2& b) {
+  return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a, const float4& b) {
+  return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y), bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a, const double2& b) {
+  return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a, const float4& b) {
+  return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y), bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a, const double2& b) {
+  return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a, const float4& b) {
+  return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y), bitwise_andnot(a.z, b.z),
+                     bitwise_andnot(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pandnot<double2>(const double2& a, const double2& b) {
+  return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a, const float4& b) {
+  return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z), eq_mask(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(const float4& a, const float4& b) {
+  return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z), lt_mask(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_le<float4>(const float4& a, const float4& b) {
+  return make_float4(le_mask(a.x, b.x), le_mask(a.y, b.y), le_mask(a.z, b.z), le_mask(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_eq<double2>(const double2& a, const double2& b) {
+  return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_lt<double2>(const double2& a, const double2& b) {
+  return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pcmp_le<double2>(const double2& a, const double2& b) {
+  return make_double2(le_mask(a.x, b.x), le_mask(a.y, b.y));
+}
+#endif  // EIGEN_HAS_GPU_DEVICE_FUNCTIONS
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
+  return make_float4(a, a + 1, a + 2, a + 3);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
+  return make_double2(a, a + 1);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x + b.x, a.y + b.y);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x - b.x, a.y - b.y);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
+  return make_float4(-a.x, -a.y, -a.z, -a.w);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
+  return make_double2(-a.x, -a.y);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) {
+  return a;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) {
+  return a;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x * b.x, a.y * b.y);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
+  return make_float4(a.x / b.x, a.y / b.y, a.z / b.z, a.w / b.w);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
+  return make_double2(a.x / b.x, a.y / b.y);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
+  return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
+  return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
+  return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
+  return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
+  return *reinterpret_cast<const float4*>(from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
+  return *reinterpret_cast<const double2*>(from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
+  return make_float4(from[0], from[1], from[2], from[3]);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
+  return make_double2(from[0], from[1]);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
+  return make_float4(from[0], from[0], from[1], from[1]);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
+  return make_double2(from[0], from[0]);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
+  *reinterpret_cast<float4*>(to) = from;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
+  *reinterpret_cast<double2*>(to) = from;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
+  to[0] = from.x;
+  to[1] = from.y;
+  to[2] = from.z;
+  to[3] = from.w;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
+  to[0] = from.x;
+  to[1] = from.y;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return __ldg(reinterpret_cast<const float4*>(from));
+#else
+  return make_float4(from[0], from[1], from[2], from[3]);
+#endif
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return __ldg(reinterpret_cast<const double2*>(from));
+#else
+  return make_double2(from[0], from[1]);
+#endif
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return make_float4(__ldg(from + 0), __ldg(from + 1), __ldg(from + 2), __ldg(from + 3));
+#else
+  return make_float4(from[0], from[1], from[2], from[3]);
+#endif
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return make_double2(__ldg(from + 0), __ldg(from + 1));
+#else
+  return make_double2(from[0], from[1]);
+#endif
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
+  return make_float4(from[0 * stride], from[1 * stride], from[2 * stride], from[3 * stride]);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
+  return make_double2(from[0 * stride], from[1 * stride]);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
+  to[stride * 0] = from.x;
+  to[stride * 1] = from.y;
+  to[stride * 2] = from.z;
+  to[stride * 3] = from.w;
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
+  to[stride * 0] = from.x;
+  to[stride * 1] = from.y;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
+  return a.x;
+}
+template <>
+EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
+  return a.x;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
+  return a.x + a.y + a.z + a.w;
+}
+template <>
+EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
+  return a.x + a.y;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
+  return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
+  return fmax(a.x, a.y);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
+  return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
+  return fmin(a.x, a.y);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline float predux_mul<float4>(const float4& a) {
+  return a.x * a.y * a.z * a.w;
+}
+template <>
+EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
+  return a.x * a.y;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
+  return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
+  return make_double2(fabs(a.x), fabs(a.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline float4 pfloor<float4>(const float4& a) {
+  return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
+  return make_double2(floor(a.x), floor(a.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline float4 pceil<float4>(const float4& a) {
+  return make_float4(ceilf(a.x), ceilf(a.y), ceilf(a.z), ceilf(a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double2 pceil<double2>(const double2& a) {
+  return make_double2(ceil(a.x), ceil(a.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline float4 print<float4>(const float4& a) {
+  return make_float4(rintf(a.x), rintf(a.y), rintf(a.z), rintf(a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double2 print<double2>(const double2& a) {
+  return make_double2(rint(a.x), rint(a.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline float4 ptrunc<float4>(const float4& a) {
+  return make_float4(truncf(a.x), truncf(a.y), truncf(a.z), truncf(a.w));
+}
+template <>
+EIGEN_DEVICE_FUNC inline double2 ptrunc<double2>(const double2& a) {
+  return make_double2(trunc(a.x), trunc(a.y));
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<float4, 4>& kernel) {
+  float tmp = kernel.packet[0].y;
+  kernel.packet[0].y = kernel.packet[1].x;
+  kernel.packet[1].x = tmp;
+
+  tmp = kernel.packet[0].z;
+  kernel.packet[0].z = kernel.packet[2].x;
+  kernel.packet[2].x = tmp;
+
+  tmp = kernel.packet[0].w;
+  kernel.packet[0].w = kernel.packet[3].x;
+  kernel.packet[3].x = tmp;
+
+  tmp = kernel.packet[1].z;
+  kernel.packet[1].z = kernel.packet[2].y;
+  kernel.packet[2].y = tmp;
+
+  tmp = kernel.packet[1].w;
+  kernel.packet[1].w = kernel.packet[3].y;
+  kernel.packet[3].y = tmp;
+
+  tmp = kernel.packet[2].w;
+  kernel.packet[2].w = kernel.packet[3].z;
+  kernel.packet[3].z = tmp;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<double2, 2>& kernel) {
+  double tmp = kernel.packet[0].y;
+  kernel.packet[0].y = kernel.packet[1].x;
+  kernel.packet[1].x = tmp;
+}
+
+#endif  // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+
+// Half-packet functions are not available on the host for CUDA 9.0-9.2, only
+// on device. There is no benefit to using them on the host anyways, since they are
+// emulated.
+#if (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
+
+typedef ulonglong2 Packet4h2;
+template <>
+struct unpacket_traits<Packet4h2> {
+  typedef Eigen::half type;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet4h2 half;
+};
+template <>
+struct is_arithmetic<Packet4h2> {
+  enum { value = true };
+};
+
+template <>
+struct unpacket_traits<half2> {
+  typedef Eigen::half type;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef half2 half;
+};
+template <>
+struct is_arithmetic<half2> {
+  enum { value = true };
+};
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+  typedef Packet4h2 type;
+  typedef Packet4h2 half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasExp = 1,
+    HasExpm1 = 1,
+    HasLog = 1,
+    HasLog1p = 1
+  };
+};
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
+  return __half2half2(from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pset1<Packet4h2>(const Eigen::half& from) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = pset1<half2>(from);
+  p_alias[1] = pset1<half2>(from);
+  p_alias[2] = pset1<half2>(from);
+  p_alias[3] = pset1<half2>(from);
+  return r;
+}
+
+namespace {
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
+  return *reinterpret_cast<const half2*>(from);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) { return __halves2half2(from[0], from[1]); }
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) {
+  return __halves2half2(from[0], from[0]);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to, const half2& from) {
+  *reinterpret_cast<half2*>(to) = from;
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to, const half2& from) {
+  to[0] = __low2half(from);
+  to[1] = __high2half(from);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(const Eigen::half* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  // Input is guaranteed to be properly aligned.
+  return __ldg(reinterpret_cast<const half2*>(from));
+#else
+  return __halves2half2(*(from + 0), *(from + 1));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(const Eigen::half* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  return __halves2half2(__ldg(from + 0), __ldg(from + 1));
+#else
+  return __halves2half2(*(from + 0), *(from + 1));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from, Index stride) {
+  return __halves2half2(from[0 * stride], from[1 * stride]);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(Eigen::half* to, const half2& from, Index stride) {
+  to[stride * 0] = __low2half(from);
+  to[stride * 1] = __high2half(from);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) { return __low2half(a); }
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
+  half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
+  return __halves2half2(result1, result2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  return pset1<half2>(true_half);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) {
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  return pset1<half2>(false_half);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<half2, 2>& kernel) {
+  __half a1 = __low2half(kernel.packet[0]);
+  __half a2 = __high2half(kernel.packet[0]);
+  __half b1 = __low2half(kernel.packet[1]);
+  __half b2 = __high2half(kernel.packet[1]);
+  kernel.packet[0] = __halves2half2(a1, b1);
+  kernel.packet[1] = __halves2half2(a2, b2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __halves2half2(a, __hadd(a, __float2half(1.0f)));
+#else
+  float f = __half2float(a) + 1.0f;
+  return __halves2half2(a, __float2half(f));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask, const half2& a, const half2& b) {
+  half mask_low = __low2half(mask);
+  half mask_high = __high2half(mask);
+  half result_low = mask_low == half(0) ? __low2half(b) : __low2half(a);
+  half result_high = mask_high == half(0) ? __high2half(b) : __high2half(a);
+  return __halves2half2(result_low, result_high);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a, const half2& b) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
+  half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
+  return __halves2half2(eq1, eq2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a, const half2& b) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
+  half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
+  return __halves2half2(eq1, eq2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_le(const half2& a, const half2& b) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half eq1 = __half2float(a1) <= __half2float(b1) ? true_half : false_half;
+  half eq2 = __half2float(a2) <= __half2float(b2) ? true_half : false_half;
+  return __halves2half2(eq1, eq2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a, const half2& b) {
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
+  return __halves2half2(result1, result2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a, const half2& b) {
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
+  return __halves2half2(result1, result2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a, const half2& b) {
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
+  return __halves2half2(result1, result2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a, const half2& b) {
+  half a1 = __low2half(a);
+  half a2 = __high2half(a);
+  half b1 = __low2half(b);
+  half b2 = __high2half(b);
+  half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
+  half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
+  return __halves2half2(result1, result2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a, const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hadd2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 + b1;
+  float r2 = a2 + b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a, const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hsub2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 - b1;
+  float r2 = a2 - b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hneg2(a);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return __floats2half2_rn(-a1, -a2);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a, const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hmul2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 * b1;
+  float r2 = a2 * b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a, const half2& b, const half2& c) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hfma2(a, b, c);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float c1 = __low2float(c);
+  float c2 = __high2float(c);
+  float r1 = a1 * b1 + c1;
+  float r2 = a2 * b2 + c2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a, const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __h2div(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 / b1;
+  float r2 = a2 / b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hadd(__low2half(a), __high2half(a));
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return Eigen::half(__float2half(a1 + a2));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  __half first = __low2half(a);
+  __half second = __high2half(a);
+  return __hgt(first, second) ? first : second;
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return a1 > a2 ? __low2half(a) : __high2half(a);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  __half first = __low2half(a);
+  __half second = __high2half(a);
+  return __hlt(first, second) ? first : second;
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return a1 < a2 ? __low2half(a) : __high2half(a);
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hmul(__low2half(a), __high2half(a));
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  return Eigen::half(__float2half(a1 * a2));
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = log1pf(a1);
+  float r2 = log1pf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = expm1f(a1);
+  float r2 = expm1f(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || defined(EIGEN_HIP_DEVICE_COMPILE)
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) { return h2log(a); }
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) { return h2exp(a); }
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) { return h2sqrt(a); }
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) { return h2rsqrt(a); }
+
+#else
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = logf(a1);
+  float r2 = logf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = expf(a1);
+  float r2 = expf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = sqrtf(a1);
+  float r2 = sqrtf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float r1 = rsqrtf(a1);
+  float r2 = rsqrtf(a2);
+  return __floats2half2_rn(r1, r2);
+}
+#endif
+}  // namespace
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pload<Packet4h2>(const Eigen::half* from) {
+  return *reinterpret_cast<const Packet4h2*>(from);
+}
+
+// unaligned load;
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ploadu<Packet4h2>(const Eigen::half* from) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = ploadu(from + 0);
+  p_alias[1] = ploadu(from + 2);
+  p_alias[2] = ploadu(from + 4);
+  p_alias[3] = ploadu(from + 6);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ploaddup<Packet4h2>(const Eigen::half* from) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = ploaddup(from + 0);
+  p_alias[1] = ploaddup(from + 1);
+  p_alias[2] = ploaddup(from + 2);
+  p_alias[3] = ploaddup(from + 3);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h2& from) {
+  *reinterpret_cast<Packet4h2*>(to) = from;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h2& from) {
+  const half2* from_alias = reinterpret_cast<const half2*>(&from);
+  pstoreu(to + 0, from_alias[0]);
+  pstoreu(to + 2, from_alias[1]);
+  pstoreu(to + 4, from_alias[2]);
+  pstoreu(to + 6, from_alias[3]);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
+#if defined(EIGEN_GPU_HAS_LDG)
+  Packet4h2 r;
+  r = __ldg(reinterpret_cast<const Packet4h2*>(from));
+  return r;
+#else
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  r_alias[0] = ploadt_ro_aligned(from + 0);
+  r_alias[1] = ploadt_ro_aligned(from + 2);
+  r_alias[2] = ploadt_ro_aligned(from + 4);
+  r_alias[3] = ploadt_ro_aligned(from + 6);
+  return r;
+#endif
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2 ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  r_alias[0] = ploadt_ro_unaligned(from + 0);
+  r_alias[1] = ploadt_ro_unaligned(from + 2);
+  r_alias[2] = ploadt_ro_unaligned(from + 4);
+  r_alias[3] = ploadt_ro_unaligned(from + 6);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = __halves2half2(from[0 * stride], from[1 * stride]);
+  p_alias[1] = __halves2half2(from[2 * stride], from[3 * stride]);
+  p_alias[2] = __halves2half2(from[4 * stride], from[5 * stride]);
+  p_alias[3] = __halves2half2(from[6 * stride], from[7 * stride]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(Eigen::half* to, const Packet4h2& from,
+                                                                            Index stride) {
+  const half2* from_alias = reinterpret_cast<const half2*>(&from);
+  pscatter(to + stride * 0, from_alias[0], stride);
+  pscatter(to + stride * 2, from_alias[1], stride);
+  pscatter(to + stride * 4, from_alias[2], stride);
+  pscatter(to + stride * 6, from_alias[3], stride);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(const Packet4h2& a) {
+  return pfirst(*(reinterpret_cast<const half2*>(&a)));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  p_alias[0] = pabs(a_alias[0]);
+  p_alias[1] = pabs(a_alias[1]);
+  p_alias[2] = pabs(a_alias[2]);
+  p_alias[3] = pabs(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(const Packet4h2& /*a*/) {
+  half true_half = half_impl::raw_uint16_to_half(0xffffu);
+  return pset1<Packet4h2>(true_half);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(const Packet4h2& /*a*/) {
+  half false_half = half_impl::raw_uint16_to_half(0x0000u);
+  return pset1<Packet4h2>(false_half);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(double* d_row0, double* d_row1, double* d_row2,
+                                                             double* d_row3, double* d_row4, double* d_row5,
+                                                             double* d_row6, double* d_row7) {
+  double d_tmp;
+  d_tmp = d_row0[1];
+  d_row0[1] = d_row4[0];
+  d_row4[0] = d_tmp;
+
+  d_tmp = d_row1[1];
+  d_row1[1] = d_row5[0];
+  d_row5[0] = d_tmp;
+
+  d_tmp = d_row2[1];
+  d_row2[1] = d_row6[0];
+  d_row6[0] = d_tmp;
+
+  d_tmp = d_row3[1];
+  d_row3[1] = d_row7[0];
+  d_row7[0] = d_tmp;
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(half2* f_row0, half2* f_row1, half2* f_row2,
+                                                            half2* f_row3) {
+  half2 f_tmp;
+  f_tmp = f_row0[1];
+  f_row0[1] = f_row2[0];
+  f_row2[0] = f_tmp;
+
+  f_tmp = f_row1[1];
+  f_row1[1] = f_row3[0];
+  f_row3[0] = f_tmp;
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half(half2& f0, half2& f1) {
+  __half a1 = __low2half(f0);
+  __half a2 = __high2half(f0);
+  __half b1 = __low2half(f1);
+  __half b2 = __high2half(f1);
+  f0 = __halves2half2(a1, b1);
+  f1 = __halves2half2(a2, b2);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4h2, 8>& kernel) {
+  double* d_row0 = reinterpret_cast<double*>(&kernel.packet[0]);
+  double* d_row1 = reinterpret_cast<double*>(&kernel.packet[1]);
+  double* d_row2 = reinterpret_cast<double*>(&kernel.packet[2]);
+  double* d_row3 = reinterpret_cast<double*>(&kernel.packet[3]);
+  double* d_row4 = reinterpret_cast<double*>(&kernel.packet[4]);
+  double* d_row5 = reinterpret_cast<double*>(&kernel.packet[5]);
+  double* d_row6 = reinterpret_cast<double*>(&kernel.packet[6]);
+  double* d_row7 = reinterpret_cast<double*>(&kernel.packet[7]);
+  ptranspose_double(d_row0, d_row1, d_row2, d_row3, d_row4, d_row5, d_row6, d_row7);
+
+  half2* f_row0 = reinterpret_cast<half2*>(d_row0);
+  half2* f_row1 = reinterpret_cast<half2*>(d_row1);
+  half2* f_row2 = reinterpret_cast<half2*>(d_row2);
+  half2* f_row3 = reinterpret_cast<half2*>(d_row3);
+  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
+  ptranspose_half(f_row0[0], f_row1[0]);
+  ptranspose_half(f_row0[1], f_row1[1]);
+  ptranspose_half(f_row2[0], f_row3[0]);
+  ptranspose_half(f_row2[1], f_row3[1]);
+
+  f_row0 = reinterpret_cast<half2*>(d_row0 + 1);
+  f_row1 = reinterpret_cast<half2*>(d_row1 + 1);
+  f_row2 = reinterpret_cast<half2*>(d_row2 + 1);
+  f_row3 = reinterpret_cast<half2*>(d_row3 + 1);
+  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
+  ptranspose_half(f_row0[0], f_row1[0]);
+  ptranspose_half(f_row0[1], f_row1[1]);
+  ptranspose_half(f_row2[0], f_row3[0]);
+  ptranspose_half(f_row2[1], f_row3[1]);
+
+  f_row0 = reinterpret_cast<half2*>(d_row4);
+  f_row1 = reinterpret_cast<half2*>(d_row5);
+  f_row2 = reinterpret_cast<half2*>(d_row6);
+  f_row3 = reinterpret_cast<half2*>(d_row7);
+  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
+  ptranspose_half(f_row0[0], f_row1[0]);
+  ptranspose_half(f_row0[1], f_row1[1]);
+  ptranspose_half(f_row2[0], f_row3[0]);
+  ptranspose_half(f_row2[1], f_row3[1]);
+
+  f_row0 = reinterpret_cast<half2*>(d_row4 + 1);
+  f_row1 = reinterpret_cast<half2*>(d_row5 + 1);
+  f_row2 = reinterpret_cast<half2*>(d_row6 + 1);
+  f_row3 = reinterpret_cast<half2*>(d_row7 + 1);
+  ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
+  ptranspose_half(f_row0[0], f_row1[0]);
+  ptranspose_half(f_row0[1], f_row1[1]);
+  ptranspose_half(f_row2[0], f_row3[0]);
+  ptranspose_half(f_row2[1], f_row3[1]);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plset<Packet4h2>(const Eigen::half& a) {
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f)));
+  p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)), __hadd(a, __float2half(3.0f)));
+  p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)), __hadd(a, __float2half(5.0f)));
+  p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)), __hadd(a, __float2half(7.0f)));
+  return r;
+#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+
+  half2 b = pset1<half2>(a);
+  half2 c;
+  half2 half_offset0 = __halves2half2(__float2half(0.0f), __float2half(2.0f));
+  half2 half_offset1 = __halves2half2(__float2half(4.0f), __float2half(6.0f));
+
+  c = __hadd2(b, half_offset0);
+  r_alias[0] = plset(__low2half(c));
+  r_alias[1] = plset(__high2half(c));
+
+  c = __hadd2(b, half_offset1);
+  r_alias[2] = plset(__low2half(c));
+  r_alias[3] = plset(__high2half(c));
+
+  return r;
+
+#else
+  float f = __half2float(a);
+  Packet4h2 r;
+  half2* p_alias = reinterpret_cast<half2*>(&r);
+  p_alias[0] = __halves2half2(a, __float2half(f + 1.0f));
+  p_alias[1] = __halves2half2(__float2half(f + 2.0f), __float2half(f + 3.0f));
+  p_alias[2] = __halves2half2(__float2half(f + 4.0f), __float2half(f + 5.0f));
+  p_alias[3] = __halves2half2(__float2half(f + 6.0f), __float2half(f + 7.0f));
+  return r;
+#endif
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a,
+                                                                   const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* mask_alias = reinterpret_cast<const half2*>(&mask);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pselect(mask_alias[0], a_alias[0], b_alias[0]);
+  r_alias[1] = pselect(mask_alias[1], a_alias[1], b_alias[1]);
+  r_alias[2] = pselect(mask_alias[2], a_alias[2], b_alias[2]);
+  r_alias[3] = pselect(mask_alias[3], a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pcmp_eq(a_alias[0], b_alias[0]);
+  r_alias[1] = pcmp_eq(a_alias[1], b_alias[1]);
+  r_alias[2] = pcmp_eq(a_alias[2], b_alias[2]);
+  r_alias[3] = pcmp_eq(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_lt<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pcmp_lt(a_alias[0], b_alias[0]);
+  r_alias[1] = pcmp_lt(a_alias[1], b_alias[1]);
+  r_alias[2] = pcmp_lt(a_alias[2], b_alias[2]);
+  r_alias[3] = pcmp_lt(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcmp_le<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pcmp_le(a_alias[0], b_alias[0]);
+  r_alias[1] = pcmp_le(a_alias[1], b_alias[1]);
+  r_alias[2] = pcmp_le(a_alias[2], b_alias[2]);
+  r_alias[3] = pcmp_le(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pand(a_alias[0], b_alias[0]);
+  r_alias[1] = pand(a_alias[1], b_alias[1]);
+  r_alias[2] = pand(a_alias[2], b_alias[2]);
+  r_alias[3] = pand(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = por(a_alias[0], b_alias[0]);
+  r_alias[1] = por(a_alias[1], b_alias[1]);
+  r_alias[2] = por(a_alias[2], b_alias[2]);
+  r_alias[3] = por(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pxor(a_alias[0], b_alias[0]);
+  r_alias[1] = pxor(a_alias[1], b_alias[1]);
+  r_alias[2] = pxor(a_alias[2], b_alias[2]);
+  r_alias[3] = pxor(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pandnot(a_alias[0], b_alias[0]);
+  r_alias[1] = pandnot(a_alias[1], b_alias[1]);
+  r_alias[2] = pandnot(a_alias[2], b_alias[2]);
+  r_alias[3] = pandnot(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = padd(a_alias[0], b_alias[0]);
+  r_alias[1] = padd(a_alias[1], b_alias[1]);
+  r_alias[2] = padd(a_alias[2], b_alias[2]);
+  r_alias[3] = padd(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = psub(a_alias[0], b_alias[0]);
+  r_alias[1] = psub(a_alias[1], b_alias[1]);
+  r_alias[2] = psub(a_alias[2], b_alias[2]);
+  r_alias[3] = psub(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pnegate(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = pnegate(a_alias[0]);
+  r_alias[1] = pnegate(a_alias[1]);
+  r_alias[2] = pnegate(a_alias[2]);
+  r_alias[3] = pnegate(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pconj(const Packet4h2& a) {
+  return a;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pmul(a_alias[0], b_alias[0]);
+  r_alias[1] = pmul(a_alias[1], b_alias[1]);
+  r_alias[2] = pmul(a_alias[2], b_alias[2]);
+  r_alias[3] = pmul(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(const Packet4h2& a, const Packet4h2& b,
+                                                                 const Packet4h2& c) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  const half2* c_alias = reinterpret_cast<const half2*>(&c);
+  r_alias[0] = pmadd(a_alias[0], b_alias[0], c_alias[0]);
+  r_alias[1] = pmadd(a_alias[1], b_alias[1], c_alias[1]);
+  r_alias[2] = pmadd(a_alias[2], b_alias[2], c_alias[2]);
+  r_alias[3] = pmadd(a_alias[3], b_alias[3], c_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pdiv(a_alias[0], b_alias[0]);
+  r_alias[1] = pdiv(a_alias[1], b_alias[1]);
+  r_alias[2] = pdiv(a_alias[2], b_alias[2]);
+  r_alias[3] = pdiv(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pmin(a_alias[0], b_alias[0]);
+  r_alias[1] = pmin(a_alias[1], b_alias[1]);
+  r_alias[2] = pmin(a_alias[2], b_alias[2]);
+  r_alias[3] = pmin(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  const half2* b_alias = reinterpret_cast<const half2*>(&b);
+  r_alias[0] = pmax(a_alias[0], b_alias[0]);
+  r_alias[1] = pmax(a_alias[1], b_alias[1]);
+  r_alias[2] = pmax(a_alias[2], b_alias[2]);
+  r_alias[3] = pmax(a_alias[3], b_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(const Packet4h2& a) {
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+
+  return predux(a_alias[0]) + predux(a_alias[1]) + predux(a_alias[2]) + predux(a_alias[3]);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(const Packet4h2& a) {
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  half2 m0 = __halves2half2(predux_max(a_alias[0]), predux_max(a_alias[1]));
+  half2 m1 = __halves2half2(predux_max(a_alias[2]), predux_max(a_alias[3]));
+  __half first = predux_max(m0);
+  __half second = predux_max(m1);
+#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+  return (__hgt(first, second) ? first : second);
+#else
+  float ffirst = __half2float(first);
+  float fsecond = __half2float(second);
+  return (ffirst > fsecond) ? first : second;
+#endif
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(const Packet4h2& a) {
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  half2 m0 = __halves2half2(predux_min(a_alias[0]), predux_min(a_alias[1]));
+  half2 m1 = __halves2half2(predux_min(a_alias[2]), predux_min(a_alias[3]));
+  __half first = predux_min(m0);
+  __half second = predux_min(m1);
+#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
+  return (__hlt(first, second) ? first : second);
+#else
+  float ffirst = __half2float(first);
+  float fsecond = __half2float(second);
+  return (ffirst < fsecond) ? first : second;
+#endif
+}
+
+// likely overflow/underflow
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(const Packet4h2& a) {
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  return predux_mul(pmul(pmul(a_alias[0], a_alias[1]), pmul(a_alias[2], a_alias[3])));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog1p<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = plog1p(a_alias[0]);
+  r_alias[1] = plog1p(a_alias[1]);
+  r_alias[2] = plog1p(a_alias[2]);
+  r_alias[3] = plog1p(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexpm1<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = pexpm1(a_alias[0]);
+  r_alias[1] = pexpm1(a_alias[1]);
+  r_alias[2] = pexpm1(a_alias[2]);
+  r_alias[3] = pexpm1(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = plog(a_alias[0]);
+  r_alias[1] = plog(a_alias[1]);
+  r_alias[2] = plog(a_alias[2]);
+  r_alias[3] = plog(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexp<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = pexp(a_alias[0]);
+  r_alias[1] = pexp(a_alias[1]);
+  r_alias[2] = pexp(a_alias[2]);
+  r_alias[3] = pexp(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psqrt<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = psqrt(a_alias[0]);
+  r_alias[1] = psqrt(a_alias[1]);
+  r_alias[2] = psqrt(a_alias[2]);
+  r_alias[3] = psqrt(a_alias[3]);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 prsqrt<Packet4h2>(const Packet4h2& a) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  r_alias[0] = prsqrt(a_alias[0]);
+  r_alias[1] = prsqrt(a_alias[1]);
+  r_alias[2] = prsqrt(a_alias[2]);
+  r_alias[3] = prsqrt(a_alias[3]);
+  return r;
+}
+
+// The following specialized padd, pmul, pdiv, pmin, pmax, pset1 are needed for
+// the implementation of GPU half reduction.
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hadd2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 + b1;
+  float r2 = a2 + b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __hmul2(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 * b1;
+  float r2 = a2 * b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
+#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
+  return __h2div(a, b);
+#else
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  float r1 = a1 / b1;
+  float r2 = a2 / b2;
+  return __floats2half2_rn(r1, r2);
+#endif
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
+  float a1 = __low2float(a);
+  float a2 = __high2float(a);
+  float b1 = __low2float(b);
+  float b2 = __high2float(b);
+  __half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
+  __half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
+  return __halves2half2(r1, r2);
+}
+
+#endif  // (defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)) && defined(EIGEN_GPU_COMPILE_PHASE)
+
+#undef EIGEN_GPU_HAS_LDG
+#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
+#undef EIGEN_GPU_HAS_FP16_ARITHMETIC
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_GPU_H
diff --git a/inst/include/Eigen/src/Core/arch/GPU/Tuple.h b/inst/include/Eigen/src/Core/arch/GPU/Tuple.h
new file mode 100644
index 00000000..402d92f7
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/GPU/Tuple.h
@@ -0,0 +1,268 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2021 The Eigen Team
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TUPLE_GPU
+#define EIGEN_TUPLE_GPU
+
+#include <type_traits>
+#include <utility>
+
+// This is a replacement of std::tuple that can be used in device code.
+
+namespace Eigen {
+namespace internal {
+namespace tuple_impl {
+
+// Internal tuple implementation.
+template <size_t N, typename... Types>
+class TupleImpl;
+
+// Generic recursive tuple.
+template <size_t N, typename T1, typename... Ts>
+class TupleImpl<N, T1, Ts...> {
+ public:
+  // Tuple may contain Eigen types.
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+
+  // Default constructor, enable if all types are default-constructible.
+  template <typename U1 = T1,
+            typename EnableIf = std::enable_if_t<std::is_default_constructible<U1>::value &&
+                                                 reduce_all<std::is_default_constructible<Ts>::value...>::value>>
+  constexpr EIGEN_DEVICE_FUNC TupleImpl() : head_{}, tail_{} {}
+
+  // Element constructor.
+  template <typename U1, typename... Us,
+            // Only enable if...
+            typename EnableIf = std::enable_if_t<
+                // the number of input arguments match, and ...
+                sizeof...(Us) == sizeof...(Ts) && (
+                                                      // this does not look like a copy/move constructor.
+                                                      N > 1 || std::is_convertible<U1, T1>::value)>>
+  constexpr EIGEN_DEVICE_FUNC TupleImpl(U1&& arg1, Us&&... args)
+      : head_(std::forward<U1>(arg1)), tail_(std::forward<Us>(args)...) {}
+
+  // The first stored value.
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T1& head() { return head_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& head() const { return head_; }
+
+  // The tail values.
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TupleImpl<N - 1, Ts...>& tail() { return tail_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const TupleImpl<N - 1, Ts...>& tail() const { return tail_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(TupleImpl& other) {
+    using numext::swap;
+    swap(head_, other.head_);
+    swap(tail_, other.tail_);
+  }
+
+  template <typename... UTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleImpl& operator=(const TupleImpl<N, UTypes...>& other) {
+    head_ = other.head_;
+    tail_ = other.tail_;
+    return *this;
+  }
+
+  template <typename... UTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleImpl& operator=(TupleImpl<N, UTypes...>&& other) {
+    head_ = std::move(other.head_);
+    tail_ = std::move(other.tail_);
+    return *this;
+  }
+
+ private:
+  // Allow related tuples to reference head_/tail_.
+  template <size_t M, typename... UTypes>
+  friend class TupleImpl;
+
+  T1 head_;
+  TupleImpl<N - 1, Ts...> tail_;
+};
+
+// Empty tuple specialization.
+template <>
+class TupleImpl<size_t(0)> {};
+
+template <typename TupleType>
+struct is_tuple : std::false_type {};
+
+template <typename... Types>
+struct is_tuple<TupleImpl<sizeof...(Types), Types...>> : std::true_type {};
+
+// Gets an element from a tuple.
+template <size_t Idx, typename T1, typename... Ts>
+struct tuple_get_impl {
+  using TupleType = TupleImpl<sizeof...(Ts) + 1, T1, Ts...>;
+  using ReturnType = typename tuple_get_impl<Idx - 1, Ts...>::ReturnType;
+
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ReturnType& run(TupleType& tuple) {
+    return tuple_get_impl<Idx - 1, Ts...>::run(tuple.tail());
+  }
+
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const ReturnType& run(const TupleType& tuple) {
+    return tuple_get_impl<Idx - 1, Ts...>::run(tuple.tail());
+  }
+};
+
+// Base case, getting the head element.
+template <typename T1, typename... Ts>
+struct tuple_get_impl<0, T1, Ts...> {
+  using TupleType = TupleImpl<sizeof...(Ts) + 1, T1, Ts...>;
+  using ReturnType = T1;
+
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T1& run(TupleType& tuple) { return tuple.head(); }
+
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& run(const TupleType& tuple) { return tuple.head(); }
+};
+
+// Concatenates N Tuples.
+template <size_t NTuples, typename... Tuples>
+struct tuple_cat_impl;
+
+template <size_t NTuples, size_t N1, typename... Args1, size_t N2, typename... Args2, typename... Tuples>
+struct tuple_cat_impl<NTuples, TupleImpl<N1, Args1...>, TupleImpl<N2, Args2...>, Tuples...> {
+  using TupleType1 = TupleImpl<N1, Args1...>;
+  using TupleType2 = TupleImpl<N2, Args2...>;
+  using MergedTupleType = TupleImpl<N1 + N2, Args1..., Args2...>;
+
+  using ReturnType = typename tuple_cat_impl<NTuples - 1, MergedTupleType, Tuples...>::ReturnType;
+
+  // Uses the index sequences to extract and merge elements from tuple1 and tuple2,
+  // then recursively calls again.
+  template <typename Tuple1, size_t... I1s, typename Tuple2, size_t... I2s, typename... MoreTuples>
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1, std::index_sequence<I1s...>,
+                                                                        Tuple2&& tuple2, std::index_sequence<I2s...>,
+                                                                        MoreTuples&&... tuples) {
+    return tuple_cat_impl<NTuples - 1, MergedTupleType, Tuples...>::run(
+        MergedTupleType(tuple_get_impl<I1s, Args1...>::run(std::forward<Tuple1>(tuple1))...,
+                        tuple_get_impl<I2s, Args2...>::run(std::forward<Tuple2>(tuple2))...),
+        std::forward<MoreTuples>(tuples)...);
+  }
+
+  // Concatenates the first two tuples.
+  template <typename Tuple1, typename Tuple2, typename... MoreTuples>
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1, Tuple2&& tuple2,
+                                                                        MoreTuples&&... tuples) {
+    return run(std::forward<Tuple1>(tuple1), std::make_index_sequence<N1>{}, std::forward<Tuple2>(tuple2),
+               std::make_index_sequence<N2>{}, std::forward<MoreTuples>(tuples)...);
+  }
+};
+
+// Base case with a single tuple.
+template <size_t N, typename... Args>
+struct tuple_cat_impl<1, TupleImpl<N, Args...>> {
+  using ReturnType = TupleImpl<N, Args...>;
+
+  template <typename Tuple1>
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run(Tuple1&& tuple1) {
+    return tuple1;
+  }
+};
+
+// Special case of no tuples.
+template <>
+struct tuple_cat_impl<0> {
+  using ReturnType = TupleImpl<0>;
+  static constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType run() { return ReturnType{}; }
+};
+
+// For use in make_tuple, unwraps a reference_wrapper.
+template <typename T>
+struct unwrap_reference_wrapper {
+  using type = T;
+};
+
+template <typename T>
+struct unwrap_reference_wrapper<std::reference_wrapper<T>> {
+  using type = T&;
+};
+
+// For use in make_tuple, decays a type and unwraps a reference_wrapper.
+template <typename T>
+struct unwrap_decay {
+  using type = typename unwrap_reference_wrapper<typename std::decay<T>::type>::type;
+};
+
+/**
+ * Utility for determining a tuple's size.
+ */
+template <typename Tuple>
+struct tuple_size;
+
+template <typename... Types>
+struct tuple_size<TupleImpl<sizeof...(Types), Types...>> : std::integral_constant<size_t, sizeof...(Types)> {};
+
+/**
+ * Gets an element of a tuple.
+ * \tparam Idx index of the element.
+ * \tparam Types ... tuple element types.
+ * \param tuple the tuple.
+ * \return a reference to the desired element.
+ */
+template <size_t Idx, typename... Types>
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename tuple_get_impl<Idx, Types...>::ReturnType& get(
+    const TupleImpl<sizeof...(Types), Types...>& tuple) {
+  return tuple_get_impl<Idx, Types...>::run(tuple);
+}
+
+template <size_t Idx, typename... Types>
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename tuple_get_impl<Idx, Types...>::ReturnType& get(
+    TupleImpl<sizeof...(Types), Types...>& tuple) {
+  return tuple_get_impl<Idx, Types...>::run(tuple);
+}
+
+/**
+ * Concatenate multiple tuples.
+ * \param tuples ... list of tuples.
+ * \return concatenated tuple.
+ */
+template <typename... Tuples, typename EnableIf = std::enable_if_t<
+                                  internal::reduce_all<is_tuple<typename std::decay<Tuples>::type>::value...>::value>>
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    typename tuple_cat_impl<sizeof...(Tuples), typename std::decay<Tuples>::type...>::ReturnType
+    tuple_cat(Tuples&&... tuples) {
+  return tuple_cat_impl<sizeof...(Tuples), typename std::decay<Tuples>::type...>::run(std::forward<Tuples>(tuples)...);
+}
+
+/**
+ * Tie arguments together into a tuple.
+ */
+template <typename... Args, typename ReturnType = TupleImpl<sizeof...(Args), Args&...>>
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType tie(Args&... args) noexcept {
+  return ReturnType{args...};
+}
+
+/**
+ * Create a tuple of l-values with the supplied arguments.
+ */
+template <typename... Args, typename ReturnType = TupleImpl<sizeof...(Args), typename unwrap_decay<Args>::type...>>
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ReturnType make_tuple(Args&&... args) {
+  return ReturnType{std::forward<Args>(args)...};
+}
+
+/**
+ * Forward a set of arguments as a tuple.
+ */
+template <typename... Args>
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TupleImpl<sizeof...(Args), Args...> forward_as_tuple(Args&&... args) {
+  return TupleImpl<sizeof...(Args), Args...>(std::forward<Args>(args)...);
+}
+
+/**
+ * Alternative to std::tuple that can be used on device.
+ */
+template <typename... Types>
+using tuple = TupleImpl<sizeof...(Types), Types...>;
+
+}  // namespace tuple_impl
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_TUPLE_GPU
diff --git a/inst/include/Eigen/src/Core/arch/GPU/TypeCasting.h b/inst/include/Eigen/src/Core/arch/GPU/TypeCasting.h
new file mode 100644
index 00000000..ae43f8ea
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/GPU/TypeCasting.h
@@ -0,0 +1,77 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_GPU_H
+#define EIGEN_TYPE_CASTING_GPU_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+#if (defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300) || \
+    (defined(EIGEN_HAS_HIP_FP16) && defined(EIGEN_HIP_DEVICE_COMPILE))
+
+template <>
+struct type_casting_traits<Eigen::half, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<half2, float4>(const half2& a, const half2& b) {
+  float2 r1 = __half22float2(a);
+  float2 r2 = __half22float2(b);
+  return make_float4(r1.x, r1.y, r2.x, r2.y);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pcast<float4, Packet4h2>(const float4& a, const float4& b) {
+  Packet4h2 r;
+  half2* r_alias = reinterpret_cast<half2*>(&r);
+  r_alias[0] = __floats2half2_rn(a.x, a.y);
+  r_alias[1] = __floats2half2_rn(a.z, a.w);
+  r_alias[2] = __floats2half2_rn(b.x, b.y);
+  r_alias[3] = __floats2half2_rn(b.z, b.w);
+  return r;
+}
+
+template <>
+struct type_casting_traits<float, Eigen::half> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcast<Packet4h2, float4>(const Packet4h2& a) {
+  // Simply discard the second half of the input
+  float4 r;
+  const half2* a_alias = reinterpret_cast<const half2*>(&a);
+  float2 r1 = __half22float2(a_alias[0]);
+  float2 r2 = __half22float2(a_alias[1]);
+  r.x = static_cast<float>(r1.x);
+  r.y = static_cast<float>(r1.y);
+  r.z = static_cast<float>(r2.x);
+  r.w = static_cast<float>(r2.y);
+  return r;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcast<float4, half2>(const float4& a) {
+  // Simply discard the second half of the input
+  return __floats2half2_rn(a.x, a.y);
+}
+
+#endif
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_TYPE_CASTING_GPU_H
diff --git a/inst/include/Eigen/src/Core/arch/HIP/hcc/math_constants.h b/inst/include/Eigen/src/Core/arch/HIP/hcc/math_constants.h
new file mode 100644
index 00000000..99dd3ae0
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/HIP/hcc/math_constants.h
@@ -0,0 +1,23 @@
+/*
+ * math_constants.h -
+ *  HIP equivalent of the CUDA header of the same name
+ */
+
+#ifndef __MATH_CONSTANTS_H__
+#define __MATH_CONSTANTS_H__
+
+/* single precision constants */
+
+#define HIPRT_INF_F __int_as_float(0x7f800000)
+#define HIPRT_NAN_F __int_as_float(0x7fffffff)
+#define HIPRT_MIN_DENORM_F __int_as_float(0x00000001)
+#define HIPRT_MAX_NORMAL_F __int_as_float(0x7f7fffff)
+#define HIPRT_NEG_ZERO_F __int_as_float(0x80000000)
+#define HIPRT_ZERO_F 0.0f
+#define HIPRT_ONE_F 1.0f
+
+/* double precision constants */
+#define HIPRT_INF __hiloint2double(0x7ff00000, 0x00000000)
+#define HIPRT_NAN __hiloint2double(0xfff80000, 0x00000000)
+
+#endif
diff --git a/inst/include/Eigen/src/Core/arch/HVX/PacketMath.h b/inst/include/Eigen/src/Core/arch/HVX/PacketMath.h
new file mode 100644
index 00000000..9b6ceb32
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/HVX/PacketMath.h
@@ -0,0 +1,1088 @@
+
+#ifndef EIGEN_HVX_PACKET_MATH_H
+#define EIGEN_HVX_PACKET_MATH_H
+
+// Only support 128B HVX now.
+// Floating-point operations are supported only since V68.
+#if defined __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68
+
+// All the floating-point operations do not support IEEE standard.
+// From HVX document:
+//   There is no concept of infinity or NaN. QFloat saturates to maximum
+//   exponent with maximum positive or minimum negative significand.
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#endif
+
+namespace Eigen {
+namespace internal {
+
+// HVX utilities.
+
+template <int D>
+EIGEN_STRONG_INLINE HVX_Vector HVX_vmem(const void* m) {
+  HVX_Vector v;
+#if EIGEN_COMP_CLANG
+  // Use inlined assembly for aligned vmem load on unaligned memory.
+  // Use type cast to HVX_Vector* may mess up with compiler data alignment.
+  __asm__("%0 = vmem(%1+#%2)" : "=v"(v) : "r"(m), "i"(D) : "memory");
+#else
+  void* aligned_mem =
+      reinterpret_cast<void*>((reinterpret_cast<uintptr_t>(m) & ~(__HVX_LENGTH__ - 1)) + D * __HVX_LENGTH__);
+  memcpy(&v, aligned_mem, __HVX_LENGTH__);
+#endif
+  return v;
+}
+
+template <typename T>
+EIGEN_STRONG_INLINE HVX_Vector HVX_load(const T* mem) {
+  HVX_Vector v;
+  memcpy(&v, reinterpret_cast<const HVX_Vector*>(mem), __HVX_LENGTH__);
+  return v;
+}
+
+template <typename T>
+EIGEN_STRONG_INLINE HVX_Vector HVX_loadu(const T* mem) {
+  HVX_Vector v;
+  memcpy(&v, mem, __HVX_LENGTH__);
+  return v;
+}
+
+template <size_t Size, size_t Alignment, typename T>
+EIGEN_STRONG_INLINE HVX_Vector HVX_load_partial(const T* mem) {
+#if defined(EIGEN_HVX_FAST_PARTIAL_VECTOR_LOAD)
+  // Fast partial vector load through aligned vmem load.
+  // The load may past end of array but is aligned to prevent memory fault.
+  HVX_Vector v0 = HVX_vmem<0>(mem);
+  HVX_Vector v1 = v0;
+  uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
+  EIGEN_IF_CONSTEXPR(Size * sizeof(T) <= Alignment) {
+    // Data size less than alignment will never cross multiple aligned vectors.
+    v1 = v0;
+  }
+  else {
+    uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
+    if (left_off + Size * sizeof(T) > __HVX_LENGTH__) {
+      v1 = HVX_vmem<1>(mem);
+    } else {
+      v1 = v0;
+    }
+  }
+  return Q6_V_valign_VVR(v1, v0, mem_addr);
+#else
+  HVX_Vector v;
+  memcpy(&v, mem, Size * sizeof(T));
+  return v;
+#endif
+}
+
+template <typename T>
+EIGEN_STRONG_INLINE void HVX_store(T* mem, HVX_Vector v) {
+  memcpy(reinterpret_cast<HVX_Vector*>(mem), &v, __HVX_LENGTH__);
+}
+
+template <typename T>
+EIGEN_STRONG_INLINE void HVX_storeu(T* mem, HVX_Vector v) {
+  memcpy(mem, &v, __HVX_LENGTH__);
+}
+
+template <size_t Size, size_t Alignment, typename T>
+EIGEN_STRONG_INLINE void HVX_store_partial(T* mem, HVX_Vector v) {
+  uintptr_t mem_addr = reinterpret_cast<uintptr_t>(mem);
+  HVX_Vector value = Q6_V_vlalign_VVR(v, v, mem_addr);
+  uintptr_t left_off = mem_addr & (__HVX_LENGTH__ - 1);
+  uintptr_t right_off = left_off + Size * sizeof(T);
+
+  HVX_VectorPred ql_not = Q6_Q_vsetq_R(mem_addr);
+  HVX_VectorPred qr = Q6_Q_vsetq2_R(right_off);
+
+  EIGEN_IF_CONSTEXPR(Size * sizeof(T) > Alignment) {
+    if (right_off > __HVX_LENGTH__) {
+      Q6_vmem_QRIV(qr, mem + __HVX_LENGTH__ / sizeof(T), value);
+      qr = Q6_Q_vcmp_eq_VbVb(value, value);
+    }
+  }
+
+  ql_not = Q6_Q_or_QQn(ql_not, qr);
+  Q6_vmem_QnRIV(ql_not, mem, value);
+}
+
+// Packet definitions.
+enum class HVXPacketSize {
+  Full,
+  Half,
+  Quarter,
+};
+
+// Hexagon compiler uses same HVX_Vector to represent all HVX vector types.
+// Wrap different vector type (float32, int32, etc) to different class with
+// explicit constructor and casting back-and-force to HVX_Vector.
+template <HVXPacketSize T>
+class HVXPacket {
+ public:
+  HVXPacket() = default;
+  static HVXPacket Create(HVX_Vector v) { return HVXPacket(v); }
+  HVX_Vector Get() const { return m_val; }
+
+ private:
+  explicit HVXPacket(HVX_Vector v) : m_val(v) {}
+  HVX_Vector m_val = Q6_V_vzero();
+};
+
+typedef HVXPacket<HVXPacketSize::Full> Packet32f;
+typedef HVXPacket<HVXPacketSize::Half> Packet16f;
+typedef HVXPacket<HVXPacketSize::Quarter> Packet8f;
+
+// Packet traits.
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet32f type;
+  typedef Packet16f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 32,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 0,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 0,
+    HasAbsDiff = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 0,
+    HasSetLinear = 0,
+    HasBlend = 0,
+
+    HasDiv = 0,
+
+    HasSin = 0,
+    HasCos = 0,
+    HasACos = 0,
+    HasASin = 0,
+    HasATan = 0,
+    HasATanh = 0,
+    HasLog = 0,
+    HasExp = 0,
+    HasSqrt = 0,
+    HasRsqrt = 0,
+    HasTanh = 0,
+    HasErf = 0,
+    HasBessel = 0,
+    HasNdtri = 0
+  };
+};
+
+template <>
+struct unpacket_traits<Packet32f> {
+  typedef float type;
+  typedef Packet16f half;
+  enum {
+    size = 32,
+    alignment = Aligned128,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet16f> {
+  typedef float type;
+  typedef Packet8f half;
+  enum {
+    size = 16,
+    // Many code assume alignment on packet size instead of following trait
+    // So we do not use Aligned128 to optimize aligned load/store,
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+struct unpacket_traits<Packet8f> {
+  typedef float type;
+  typedef Packet8f half;
+  enum {
+    size = 8,
+    // Many code assume alignment on packet size instead of following trait
+    // So we do not use Aligned128 to optimize aligned load/store,
+    alignment = Aligned32,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+// float32 operations.
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pzero_hvx(const HVXPacket<T>&) {
+  return HVXPacket<T>::Create(Q6_V_vzero());
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pzero<Packet32f>(const Packet32f&) {
+  return pzero_hvx(Packet32f());
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pzero<Packet16f>(const Packet16f&) {
+  return pzero_hvx(Packet16f());
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pzero<Packet8f>(const Packet8f&) {
+  return pzero_hvx(Packet8f());
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE typename unpacket_traits<HVXPacket<T>>::half predux_half_dowto4_hvx(const HVXPacket<T>& a) {
+  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+  return unpacket_traits<HVXPacket<T>>::half::Create(
+      Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(Q6_V_vror_VR(a.Get(), sizeof(float) * packet_size / 2), a.Get())));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f predux_half_dowto4(const Packet32f& a) {
+  return predux_half_dowto4_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f predux_half_dowto4(const Packet16f& a) {
+  return predux_half_dowto4_hvx(a);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pset1_hvx(const float& from) {
+  union {
+    float f;
+    int32_t i;
+  } u;
+  u.f = from;
+  return HVXPacket<T>::Create(Q6_V_vsplat_R(u.i));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pset1<Packet32f>(const float& from) {
+  return pset1_hvx<HVXPacketSize::Full>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pset1<Packet16f>(const float& from) {
+  return pset1_hvx<HVXPacketSize::Half>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pset1<Packet8f>(const float& from) {
+  return pset1_hvx<HVXPacketSize::Quarter>(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32f pload<Packet32f>(const float* from) {
+  return Packet32f::Create(HVX_load(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pload<Packet16f>(const float* from) {
+  return Packet16f::Create(
+      HVX_load_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pload<Packet8f>(const float* from) {
+  return Packet8f::Create(
+      HVX_load_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32f ploadu<Packet32f>(const float* from) {
+  return Packet32f::Create(HVX_loadu(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f ploadu<Packet16f>(const float* from) {
+  return Packet16f::Create(HVX_load_partial<unpacket_traits<Packet16f>::size, 0>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f ploadu<Packet8f>(const float* from) {
+  return Packet8f::Create(HVX_load_partial<unpacket_traits<Packet8f>::size, 0>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet32f& from) {
+  HVX_store(to, from.Get());
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet16f& from) {
+  HVX_store_partial<unpacket_traits<Packet16f>::size, unpacket_traits<Packet16f>::alignment>(to, from.Get());
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet8f& from) {
+  HVX_store_partial<unpacket_traits<Packet8f>::size, unpacket_traits<Packet8f>::alignment>(to, from.Get());
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet32f& from) {
+  HVX_storeu(to, from.Get());
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet16f& from) {
+  HVX_store_partial<unpacket_traits<Packet16f>::size, 0>(to, from.Get());
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet8f& from) {
+  HVX_store_partial<unpacket_traits<Packet8f>::size, 0>(to, from.Get());
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pmul_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vmpy_VsfVsf(a.Get(), b.Get())));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pmul<Packet32f>(const Packet32f& a, const Packet32f& b) {
+  return pmul_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pmul<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return pmul_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pmul<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return pmul_hvx(a, b);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> padd_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vadd_VsfVsf(a.Get(), b.Get())));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f padd<Packet32f>(const Packet32f& a, const Packet32f& b) {
+  return padd_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f padd<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return padd_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f padd<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return padd_hvx(a, b);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> psub_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(Q6_Vqf32_vsub_VsfVsf(a.Get(), b.Get())));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f psub<Packet32f>(const Packet32f& a, const Packet32f& b) {
+  return psub_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f psub<Packet16f>(const Packet16f& a, const Packet16f& b) {
+  return psub_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f psub<Packet8f>(const Packet8f& a, const Packet8f& b) {
+  return psub_hvx(a, b);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pnegate_hvx(const HVXPacket<T>& a) {
+  return HVXPacket<T>::Create(a.Get() ^ Q6_V_vsplat_R(0x80000000));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pnegate(const Packet32f& a) {
+  return pnegate_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) {
+  return pnegate_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pnegate(const Packet8f& a) {
+  return pnegate_hvx(a);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> ptrue_hvx(const HVXPacket<T>& a) {
+  return HVXPacket<T>::Create(Q6_V_vsplat_R(0x3f800000));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f ptrue(const Packet32f& a) {
+  return ptrue_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f ptrue(const Packet16f& a) {
+  return ptrue_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f ptrue(const Packet8f& a) {
+  return ptrue_hvx(a);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pcmp_le_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  HVX_Vector v_true = ptrue(a).Get();
+  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(a.Get(), b.Get());
+  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, Q6_V_vzero(), v_true));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pcmp_le(const Packet32f& a, const Packet32f& b) {
+  return pcmp_le_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_le(const Packet16f& a, const Packet16f& b) {
+  return pcmp_le_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_le(const Packet8f& a, const Packet8f& b) {
+  return pcmp_le_hvx(a, b);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pcmp_eq_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  HVX_Vector v_true = ptrue(a).Get();
+  HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(a.Get(), b.Get());
+  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pcmp_eq(const Packet32f& a, const Packet32f& b) {
+  return pcmp_eq_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_eq(const Packet16f& a, const Packet16f& b) {
+  return pcmp_eq_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_eq(const Packet8f& a, const Packet8f& b) {
+  return pcmp_eq_hvx(a, b);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  HVX_Vector v_true = ptrue(a).Get();
+  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
+  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pcmp_lt(const Packet32f& a, const Packet32f& b) {
+  return pcmp_lt_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_lt(const Packet16f& a, const Packet16f& b) {
+  return pcmp_lt_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_lt(const Packet8f& a, const Packet8f& b) {
+  return pcmp_lt_hvx(a, b);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pcmp_lt_or_nan_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  HVX_Vector v_true = ptrue(a).Get();
+  HVX_VectorPred pred = Q6_Q_vcmp_gt_VsfVsf(b.Get(), a.Get());
+  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, v_true, Q6_V_vzero()));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pcmp_lt_or_nan(const Packet32f& a, const Packet32f& b) {
+  return pcmp_lt_or_nan_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pcmp_lt_or_nan(const Packet16f& a, const Packet16f& b) {
+  return pcmp_lt_or_nan_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pcmp_lt_or_nan(const Packet8f& a, const Packet8f& b) {
+  return pcmp_lt_or_nan_hvx(a, b);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pabs_hvx(const HVXPacket<T>& a) {
+  return HVXPacket<T>::Create(a.Get() & Q6_V_vsplat_R(0x7FFFFFFF));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pabs(const Packet32f& a) {
+  return pabs_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) {
+  return pabs_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pabs(const Packet8f& a) {
+  return pabs_hvx(a);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE float pfirst_hvx(const HVXPacket<T>& a) {
+  union {
+    float array[1];
+    HVX_Vector vector;
+  } HVX_and_array;
+  HVX_and_array.vector = a.Get();
+  return HVX_and_array.array[0];
+}
+template <>
+EIGEN_STRONG_INLINE float pfirst(const Packet32f& a) {
+  return pfirst_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE float pfirst(const Packet16f& a) {
+  return pfirst_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) {
+  return pfirst_hvx(a);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 4>& kernel) {
+  // Shuffle the 32-bit lanes.
+  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+
+  // Shuffle the 64-bit lanes.
+  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
+  kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
+  kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
+  kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
+  kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
+}
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 4>& kernel) {
+  // Shuffle the 32-bit lanes.
+  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+
+  // Shuffle the 64-bit lanes.
+  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+
+  kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
+  kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
+  kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
+  kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
+}
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 4>& kernel) {
+  // Shuffle the 32-bit lanes.
+  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+
+  // Shuffle the 64-bit lanes.
+  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+
+  kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
+  kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 32));
+  kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
+  kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 96));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8f, 8>& kernel) {
+  // Shuffle the 32-bit lanes.
+  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
+  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
+
+  // Shuffle the 64-bit lanes.
+  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
+
+  // Shuffle the 128-bit lanes.
+  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
+
+  kernel.packet[0] = Packet8f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
+  kernel.packet[1] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 32));
+  kernel.packet[2] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 64));
+  kernel.packet[3] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_0_1_0), HEXAGON_HVX_GET_V0(v_0_1_0), 96));
+  kernel.packet[4] = Packet8f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
+  kernel.packet[5] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 32));
+  kernel.packet[6] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 64));
+  kernel.packet[7] = Packet8f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_0_1_0), HEXAGON_HVX_GET_V1(v_0_1_0), 96));
+}
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16f, 16>& kernel) {
+  // Shuffle the 32-bit lanes.
+  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
+  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
+  HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
+  HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
+  HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
+  HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
+
+  // Shuffle the 64-bit lanes.
+  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
+  HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
+  HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);
+
+  // Shuffle the 128-bit lanes.
+  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
+  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
+  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_5_4), -16);
+  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_5_4), -16);
+
+  // Shuffle the 256-bit lanes.
+  v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
+  v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
+  v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
+  v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);
+
+  kernel.packet[0] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_1_0));
+  kernel.packet[1] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_1_0), HEXAGON_HVX_GET_V0(v_1_1_0), 64));
+  kernel.packet[2] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_1_0));
+  kernel.packet[3] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_1_0), HEXAGON_HVX_GET_V1(v_1_1_0), 64));
+  kernel.packet[4] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_3_2));
+  kernel.packet[5] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_3_2), HEXAGON_HVX_GET_V0(v_1_3_2), 64));
+  kernel.packet[6] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_3_2));
+  kernel.packet[7] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_3_2), HEXAGON_HVX_GET_V1(v_1_3_2), 64));
+  kernel.packet[8] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_5_4));
+  kernel.packet[9] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_5_4), 64));
+  kernel.packet[10] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_5_4));
+  kernel.packet[11] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_5_4), 64));
+  kernel.packet[12] = Packet16f::Create(HEXAGON_HVX_GET_V0(v_1_7_6));
+  kernel.packet[13] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_7_6), 64));
+  kernel.packet[14] = Packet16f::Create(HEXAGON_HVX_GET_V1(v_1_7_6));
+  kernel.packet[15] = Packet16f::Create(Q6_V_valign_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_7_6), 64));
+}
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet32f, 32>& kernel) {
+  // Shuffle the 32-bit lanes.
+  HVX_VectorPair v_0_1_0 = Q6_W_vshuff_VVR(kernel.packet[1].Get(), kernel.packet[0].Get(), -4);
+  HVX_VectorPair v_0_3_2 = Q6_W_vshuff_VVR(kernel.packet[3].Get(), kernel.packet[2].Get(), -4);
+  HVX_VectorPair v_0_5_4 = Q6_W_vshuff_VVR(kernel.packet[5].Get(), kernel.packet[4].Get(), -4);
+  HVX_VectorPair v_0_7_6 = Q6_W_vshuff_VVR(kernel.packet[7].Get(), kernel.packet[6].Get(), -4);
+  HVX_VectorPair v_0_9_8 = Q6_W_vshuff_VVR(kernel.packet[9].Get(), kernel.packet[8].Get(), -4);
+  HVX_VectorPair v_0_11_10 = Q6_W_vshuff_VVR(kernel.packet[11].Get(), kernel.packet[10].Get(), -4);
+  HVX_VectorPair v_0_13_12 = Q6_W_vshuff_VVR(kernel.packet[13].Get(), kernel.packet[12].Get(), -4);
+  HVX_VectorPair v_0_15_14 = Q6_W_vshuff_VVR(kernel.packet[15].Get(), kernel.packet[14].Get(), -4);
+  HVX_VectorPair v_0_17_16 = Q6_W_vshuff_VVR(kernel.packet[17].Get(), kernel.packet[16].Get(), -4);
+  HVX_VectorPair v_0_19_18 = Q6_W_vshuff_VVR(kernel.packet[19].Get(), kernel.packet[18].Get(), -4);
+  HVX_VectorPair v_0_21_20 = Q6_W_vshuff_VVR(kernel.packet[21].Get(), kernel.packet[20].Get(), -4);
+  HVX_VectorPair v_0_23_22 = Q6_W_vshuff_VVR(kernel.packet[23].Get(), kernel.packet[22].Get(), -4);
+  HVX_VectorPair v_0_25_24 = Q6_W_vshuff_VVR(kernel.packet[25].Get(), kernel.packet[24].Get(), -4);
+  HVX_VectorPair v_0_27_26 = Q6_W_vshuff_VVR(kernel.packet[27].Get(), kernel.packet[26].Get(), -4);
+  HVX_VectorPair v_0_29_28 = Q6_W_vshuff_VVR(kernel.packet[29].Get(), kernel.packet[28].Get(), -4);
+  HVX_VectorPair v_0_31_30 = Q6_W_vshuff_VVR(kernel.packet[31].Get(), kernel.packet[30].Get(), -4);
+
+  // Shuffle the 64-bit lanes.
+  HVX_VectorPair v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_3_2), HEXAGON_HVX_GET_V0(v_0_1_0), -8);
+  HVX_VectorPair v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_3_2), HEXAGON_HVX_GET_V1(v_0_1_0), -8);
+  HVX_VectorPair v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_7_6), HEXAGON_HVX_GET_V0(v_0_5_4), -8);
+  HVX_VectorPair v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_7_6), HEXAGON_HVX_GET_V1(v_0_5_4), -8);
+  HVX_VectorPair v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_9_8), -8);
+  HVX_VectorPair v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_9_8), -8);
+  HVX_VectorPair v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_13_12), -8);
+  HVX_VectorPair v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_13_12), -8);
+  HVX_VectorPair v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_19_18), HEXAGON_HVX_GET_V0(v_0_17_16), -8);
+  HVX_VectorPair v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_19_18), HEXAGON_HVX_GET_V1(v_0_17_16), -8);
+  HVX_VectorPair v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_23_22), HEXAGON_HVX_GET_V0(v_0_21_20), -8);
+  HVX_VectorPair v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_23_22), HEXAGON_HVX_GET_V1(v_0_21_20), -8);
+  HVX_VectorPair v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_25_24), -8);
+  HVX_VectorPair v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_25_24), -8);
+  HVX_VectorPair v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_29_28), -8);
+  HVX_VectorPair v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_29_28), -8);
+
+  // Shuffle the 128-bit lanes.
+  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_5_4), HEXAGON_HVX_GET_V0(v_1_1_0), -16);
+  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_5_4), HEXAGON_HVX_GET_V1(v_1_1_0), -16);
+  v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_7_6), HEXAGON_HVX_GET_V0(v_1_3_2), -16);
+  v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_7_6), HEXAGON_HVX_GET_V1(v_1_3_2), -16);
+  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_13_12), HEXAGON_HVX_GET_V0(v_1_9_8), -16);
+  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_13_12), HEXAGON_HVX_GET_V1(v_1_9_8), -16);
+  v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_15_14), HEXAGON_HVX_GET_V0(v_1_11_10), -16);
+  v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_15_14), HEXAGON_HVX_GET_V1(v_1_11_10), -16);
+  v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_17_16), -16);
+  v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_17_16), -16);
+  v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_19_18), -16);
+  v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_19_18), -16);
+  v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_25_24), -16);
+  v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_25_24), -16);
+  v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_27_26), -16);
+  v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_27_26), -16);
+
+  // Shuffle the 256-bit lanes.
+  v_1_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_9_8), HEXAGON_HVX_GET_V0(v_0_1_0), -32);
+  v_1_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_9_8), HEXAGON_HVX_GET_V1(v_0_1_0), -32);
+  v_1_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_11_10), HEXAGON_HVX_GET_V0(v_0_3_2), -32);
+  v_1_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_11_10), HEXAGON_HVX_GET_V1(v_0_3_2), -32);
+  v_1_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_13_12), HEXAGON_HVX_GET_V0(v_0_5_4), -32);
+  v_1_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_13_12), HEXAGON_HVX_GET_V1(v_0_5_4), -32);
+  v_1_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_15_14), HEXAGON_HVX_GET_V0(v_0_7_6), -32);
+  v_1_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_15_14), HEXAGON_HVX_GET_V1(v_0_7_6), -32);
+  v_1_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_25_24), HEXAGON_HVX_GET_V0(v_0_17_16), -32);
+  v_1_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_25_24), HEXAGON_HVX_GET_V1(v_0_17_16), -32);
+  v_1_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_27_26), HEXAGON_HVX_GET_V0(v_0_19_18), -32);
+  v_1_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_27_26), HEXAGON_HVX_GET_V1(v_0_19_18), -32);
+  v_1_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_29_28), HEXAGON_HVX_GET_V0(v_0_21_20), -32);
+  v_1_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_29_28), HEXAGON_HVX_GET_V1(v_0_21_20), -32);
+  v_1_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_0_31_30), HEXAGON_HVX_GET_V0(v_0_23_22), -32);
+  v_1_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_0_31_30), HEXAGON_HVX_GET_V1(v_0_23_22), -32);
+
+  // Shuffle the 512-bit lanes.
+  v_0_1_0 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_17_16), HEXAGON_HVX_GET_V0(v_1_1_0), -64);
+  v_0_3_2 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_17_16), HEXAGON_HVX_GET_V1(v_1_1_0), -64);
+  v_0_5_4 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_19_18), HEXAGON_HVX_GET_V0(v_1_3_2), -64);
+  v_0_7_6 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_19_18), HEXAGON_HVX_GET_V1(v_1_3_2), -64);
+  v_0_9_8 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_21_20), HEXAGON_HVX_GET_V0(v_1_5_4), -64);
+  v_0_11_10 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_21_20), HEXAGON_HVX_GET_V1(v_1_5_4), -64);
+  v_0_13_12 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_23_22), HEXAGON_HVX_GET_V0(v_1_7_6), -64);
+  v_0_15_14 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_23_22), HEXAGON_HVX_GET_V1(v_1_7_6), -64);
+  v_0_17_16 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_25_24), HEXAGON_HVX_GET_V0(v_1_9_8), -64);
+  v_0_19_18 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_25_24), HEXAGON_HVX_GET_V1(v_1_9_8), -64);
+  v_0_21_20 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_27_26), HEXAGON_HVX_GET_V0(v_1_11_10), -64);
+  v_0_23_22 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_27_26), HEXAGON_HVX_GET_V1(v_1_11_10), -64);
+  v_0_25_24 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_29_28), HEXAGON_HVX_GET_V0(v_1_13_12), -64);
+  v_0_27_26 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_29_28), HEXAGON_HVX_GET_V1(v_1_13_12), -64);
+  v_0_29_28 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(v_1_31_30), HEXAGON_HVX_GET_V0(v_1_15_14), -64);
+  v_0_31_30 = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V1(v_1_31_30), HEXAGON_HVX_GET_V1(v_1_15_14), -64);
+
+  kernel.packet[0] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_1_0));
+  kernel.packet[1] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_1_0));
+  kernel.packet[2] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_3_2));
+  kernel.packet[3] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_3_2));
+  kernel.packet[4] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_5_4));
+  kernel.packet[5] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_5_4));
+  kernel.packet[6] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_7_6));
+  kernel.packet[7] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_7_6));
+  kernel.packet[8] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_9_8));
+  kernel.packet[9] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_9_8));
+  kernel.packet[10] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_11_10));
+  kernel.packet[11] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_11_10));
+  kernel.packet[12] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_13_12));
+  kernel.packet[13] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_13_12));
+  kernel.packet[14] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_15_14));
+  kernel.packet[15] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_15_14));
+  kernel.packet[16] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_17_16));
+  kernel.packet[17] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_17_16));
+  kernel.packet[18] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_19_18));
+  kernel.packet[19] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_19_18));
+  kernel.packet[20] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_21_20));
+  kernel.packet[21] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_21_20));
+  kernel.packet[22] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_23_22));
+  kernel.packet[23] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_23_22));
+  kernel.packet[24] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_25_24));
+  kernel.packet[25] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_25_24));
+  kernel.packet[26] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_27_26));
+  kernel.packet[27] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_27_26));
+  kernel.packet[28] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_29_28));
+  kernel.packet[29] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_29_28));
+  kernel.packet[30] = Packet32f::Create(HEXAGON_HVX_GET_V0(v_0_31_30));
+  kernel.packet[31] = Packet32f::Create(HEXAGON_HVX_GET_V1(v_0_31_30));
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE float predux_hvx(const HVXPacket<T>& a) {
+  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+  HVX_Vector vsum = Q6_Vqf32_vadd_VsfVsf(a.Get(), Q6_V_vror_VR(a.Get(), sizeof(float)));
+  for (int i = 2; i < packet_size; i <<= 1) {
+    vsum = Q6_Vqf32_vadd_Vqf32Vqf32(vsum, Q6_V_vror_VR(vsum, i * sizeof(float)));
+  }
+  return pfirst(HVXPacket<T>::Create(Q6_Vsf_equals_Vqf32(vsum)));
+}
+template <>
+EIGEN_STRONG_INLINE float predux<Packet32f>(const Packet32f& a) {
+  return predux_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE float predux<Packet16f>(const Packet16f& a) {
+  return predux_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE float predux<Packet8f>(const Packet8f& a) {
+  return predux_hvx(a);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> ploaddup_hvx(const float* from) {
+  constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 2;
+  HVX_Vector load = HVX_load_partial<size, 0>(from);
+  HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
+  return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(dup));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f ploaddup(const float* from) {
+  return ploaddup_hvx<HVXPacketSize::Full>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) {
+  return ploaddup_hvx<HVXPacketSize::Half>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f ploaddup(const float* from) {
+  return ploaddup_hvx<HVXPacketSize::Quarter>(from);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> ploadquad_hvx(const float* from) {
+  constexpr Index size = unpacket_traits<HVXPacket<T>>::size / 4;
+  HVX_Vector load = HVX_load_partial<size, 0>(from);
+  HVX_VectorPair dup = Q6_W_vshuff_VVR(load, load, -4);
+  HVX_VectorPair quad = Q6_W_vshuff_VVR(HEXAGON_HVX_GET_V0(dup), HEXAGON_HVX_GET_V0(dup), -8);
+  return HVXPacket<T>::Create(HEXAGON_HVX_GET_V0(quad));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f ploadquad(const float* from) {
+  return ploadquad_hvx<HVXPacketSize::Full>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f ploadquad(const float* from) {
+  return ploadquad_hvx<HVXPacketSize::Half>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f ploadquad(const float* from) {
+  return ploadquad_hvx<HVXPacketSize::Quarter>(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet32f preverse(const Packet32f& a) {
+  HVX_Vector delta = Q6_Vb_vsplat_R(0x7c);
+  return Packet32f::Create(Q6_V_vdelta_VV(a.Get(), delta));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16f preverse(const Packet16f& a) {
+  HVX_Vector delta = Q6_Vb_vsplat_R(0x3c);
+  return Packet16f::Create(Q6_V_vdelta_VV(a.Get(), delta));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8f preverse(const Packet8f& a) {
+  HVX_Vector delta = Q6_Vb_vsplat_R(0x1c);
+  return Packet8f::Create(Q6_V_vdelta_VV(a.Get(), delta));
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pmin_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(Q6_Vsf_vmin_VsfVsf(a.Get(), b.Get()));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pmin(const Packet32f& a, const Packet32f& b) {
+  return pmin_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) {
+  return pmin_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pmin(const Packet8f& a, const Packet8f& b) {
+  return pmin_hvx(a, b);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pmax_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(Q6_Vsf_vmax_VsfVsf(a.Get(), b.Get()));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pmax(const Packet32f& a, const Packet32f& b) {
+  return pmax_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, const Packet16f& b) {
+  return pmax_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pmax(const Packet8f& a, const Packet8f& b) {
+  return pmax_hvx(a, b);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pand_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(a.Get() & b.Get());
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pand(const Packet32f& a, const Packet32f& b) {
+  return pand_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pand(const Packet16f& a, const Packet16f& b) {
+  return pand_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pand(const Packet8f& a, const Packet8f& b) {
+  return pand_hvx(a, b);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> por_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(a.Get() | b.Get());
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f por(const Packet32f& a, const Packet32f& b) {
+  return por_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, const Packet16f& b) {
+  return por_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f por(const Packet8f& a, const Packet8f& b) {
+  return por_hvx(a, b);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pxor_hvx(const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  return HVXPacket<T>::Create(a.Get() ^ b.Get());
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pxor(const Packet32f& a, const Packet32f& b) {
+  return pxor_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, const Packet16f& b) {
+  return pxor_hvx(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pxor(const Packet8f& a, const Packet8f& b) {
+  return pxor_hvx(a, b);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pnot_hvx(const HVXPacket<T>& a) {
+  return HVXPacket<T>::Create(~a.Get());
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pnot(const Packet32f& a) {
+  return pnot_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pnot(const Packet16f& a) {
+  return pnot_hvx(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pnot(const Packet8f& a) {
+  return pnot_hvx(a);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pselect_hvx(const HVXPacket<T>& mask, const HVXPacket<T>& a, const HVXPacket<T>& b) {
+  HVX_VectorPred pred = Q6_Q_vcmp_eq_VwVw(mask.Get(), Q6_V_vzero());
+  return HVXPacket<T>::Create(Q6_V_vmux_QVV(pred, b.Get(), a.Get()));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pselect(const Packet32f& mask, const Packet32f& a, const Packet32f& b) {
+  return pselect_hvx(mask, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pselect(const Packet16f& mask, const Packet16f& a, const Packet16f& b) {
+  return pselect_hvx(mask, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pselect(const Packet8f& mask, const Packet8f& a, const Packet8f& b) {
+  return pselect_hvx(mask, a, b);
+}
+
+template <HVXPacketSize T, typename Op>
+EIGEN_STRONG_INLINE float predux_generic(const HVXPacket<T>& a, Op op) {
+  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+  HVXPacket<T> vredux = a;
+  for (int i = 1; i < packet_size; i <<= 1) {
+    vredux = op(vredux, HVXPacket<T>::Create(Q6_V_vror_VR(vredux.Get(), i * sizeof(float))));
+  }
+  return pfirst(vredux);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max(const Packet32f& a) {
+  return predux_generic(a, pmax<Packet32f>);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) {
+  return predux_generic(a, pmax<Packet16f>);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_max(const Packet8f& a) {
+  return predux_generic(a, pmax<Packet8f>);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min(const Packet32f& a) {
+  return predux_generic(a, pmin<Packet32f>);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) {
+  return predux_generic(a, pmin<Packet16f>);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_min(const Packet8f& a) {
+  return predux_generic(a, pmin<Packet8f>);
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet32f& a) {
+  return predux_generic(a, por<Packet32f>) != 0.0f;
+}
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet16f& a) {
+  return predux_generic(a, por<Packet16f>) != 0.0f;
+}
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet8f& a) {
+  return predux_generic(a, por<Packet8f>) != 0.0f;
+}
+
+static const float index_vsf[32]
+    __attribute__((aligned(__HVX_LENGTH__))) = {0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
+                                                16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31};
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> plset_hvx(const float& a) {
+  return padd(pload<HVXPacket<T>>(index_vsf), pset1<HVXPacket<T>>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f plset(const float& a) {
+  return plset_hvx<HVXPacketSize::Full>(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f plset(const float& a) {
+  return plset_hvx<HVXPacketSize::Half>(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f plset(const float& a) {
+  return plset_hvx<HVXPacketSize::Quarter>(a);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE void pscatter_hvx(float* to, const HVXPacket<T>& from, Index stride) {
+  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+  float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
+  pstore<float>(elements, from);
+  for (Index i = 0; i < packet_size; ++i) {
+    to[i * stride] = elements[i];
+  }
+}
+template <>
+EIGEN_STRONG_INLINE void pscatter<float, Packet32f>(float* to, const Packet32f& from, Index stride) {
+  pscatter_hvx(to, from, stride);
+}
+template <>
+EIGEN_STRONG_INLINE void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) {
+  pscatter_hvx(to, from, stride);
+}
+template <>
+EIGEN_STRONG_INLINE void pscatter<float, Packet8f>(float* to, const Packet8f& from, Index stride) {
+  pscatter_hvx(to, from, stride);
+}
+
+template <HVXPacketSize T>
+EIGEN_STRONG_INLINE HVXPacket<T> pgather_hvx(const float* from, Index stride) {
+  const Index packet_size = unpacket_traits<HVXPacket<T>>::size;
+  float elements[packet_size] __attribute__((aligned(__HVX_LENGTH__)));
+  for (Index i = 0; i < packet_size; i++) {
+    elements[i] = from[i * stride];
+  }
+  return pload<HVXPacket<T>>(elements);
+}
+template <>
+EIGEN_STRONG_INLINE Packet32f pgather<float, Packet32f>(const float* from, Index stride) {
+  return pgather_hvx<HVXPacketSize::Full>(from, stride);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16f pgather<float, Packet16f>(const float* from, Index stride) {
+  return pgather_hvx<HVXPacketSize::Half>(from, stride);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8f pgather<float, Packet8f>(const float* from, Index stride) {
+  return pgather_hvx<HVXPacketSize::Quarter>(from, stride);
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // __HVX__ && (__HVX_LENGTH__ == 128) && __HVX_ARCH__ >= 68
+
+#endif  // EIGEN_HVX_PACKET_MATH_H
diff --git a/inst/include/Eigen/src/Core/arch/LSX/Complex.h b/inst/include/Eigen/src/Core/arch/LSX/Complex.h
new file mode 100644
index 00000000..0b60a831
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/LSX/Complex.h
@@ -0,0 +1,520 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// copyright (c) 2023 zang ruochen <zangruochen@loongson.cn>
+// copyright (c) 2024 XiWei Gu <guxiwei-hf@loongson.cn>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_LSX_H
+#define EIGEN_COMPLEX_LSX_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet2cf {
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {}
+  Packet4f v;
+};
+
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
+  typedef Packet2cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasSqrt = 1,
+    HasExp = 1,
+    HasAbs = 0,
+    HasLog = 1,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0
+  };
+};
+
+template <>
+struct unpacket_traits<Packet2cf> {
+  typedef std::complex<float> type;
+  typedef Packet2cf half;
+  typedef Packet4f as_real;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(__lsx_vfadd_s(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(__lsx_vfsub_s(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+  const uint32_t b[4] = {0x80000000u, 0x80000000u, 0x80000000u, 0x80000000u};
+  Packet4i mask = (Packet4i)__lsx_vld(b, 0);
+  Packet2cf res;
+  res.v = (Packet4f)__lsx_vxor_v((__m128i)a.v, mask);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
+  const uint32_t b[4] = {0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u};
+  Packet4i mask = (__m128i)__lsx_vld(b, 0);
+  Packet2cf res;
+  res.v = (Packet4f)__lsx_vxor_v((__m128i)a.v, mask);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  Packet4f part0_tmp = (Packet4f)__lsx_vfmul_s(a.v, b.v);
+  Packet4f part0 = __lsx_vfsub_s(part0_tmp, (__m128)__lsx_vshuf4i_w(part0_tmp, 0x31));
+  Packet4f part1_tmp = __lsx_vfmul_s((__m128)__lsx_vshuf4i_w(a.v, 0xb1), b.v);
+  Packet4f part1 = __lsx_vfadd_s(part1_tmp, (__m128)__lsx_vshuf4i_w(part1_tmp, 0x31));
+  Packet2cf res;
+  res.v = (Packet4f)__lsx_vpackev_w((__m128i)part1, (__m128i)part0);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf ptrue<Packet2cf>(const Packet2cf& a) {
+  return Packet2cf(ptrue(Packet4f(a.v)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  Packet2cf res;
+  res.v = (Packet4f)__lsx_vand_v((__m128i)a.v, (__m128i)b.v);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  Packet2cf res;
+  res.v = (Packet4f)__lsx_vor_v((__m128i)a.v, (__m128i)b.v);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  Packet2cf res;
+  res.v = (Packet4f)__lsx_vxor_v((__m128i)a.v, (__m128i)b.v);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  Packet2cf res;
+  res.v = (Packet4f)__lsx_vandn_v((__m128i)b.v, (__m128i)a.v);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
+  float f0 = from.real(), f1 = from.imag();
+  Packet4f re = {f0, f0, f0, f0};
+  Packet4f im = {f1, f1, f1, f1};
+  return Packet2cf((Packet4f)__lsx_vilvl_w((__m128i)im, (__m128i)re));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
+  return pset1<Packet2cf>(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), Packet4f(from.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), Packet4f(from.v));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from,
+                                                                           Index stride) {
+  Packet2cf res;
+  __m128i tmp = __lsx_vldrepl_d(from, 0);
+  __m128i tmp1 = __lsx_vldrepl_d(from + stride, 0);
+  tmp = __lsx_vilvl_d(tmp1, tmp);
+  res.v = (__m128)tmp;
+  return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from,
+                                                                       Index stride) {
+  __lsx_vstelm_d((__m128i)from.v, to, 0, 0);
+  __lsx_vstelm_d((__m128i)from.v, to + stride, 0, 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+  __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
+  EIGEN_ALIGN16 std::complex<float> res[2];
+  __lsx_vst(a.v, res, 0);
+  return res[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
+  Packet2cf res;
+  res.v = (Packet4f)__lsx_vshuf4i_w(a.v, 0x4e);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
+  return pfirst(Packet2cf(__lsx_vfadd_s(a.v, vec4f_movehl(a.v, a.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
+  return pfirst(pmul(a, Packet2cf(vec4f_movehl(a.v, a.v))));
+}
+
+EIGEN_STRONG_INLINE Packet2cf pcplxflip /* <Packet2cf> */ (const Packet2cf& x) {
+  return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return pdiv_complex(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf plog<Packet2cf>(const Packet2cf& a) {
+  return plog_complex(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pzero(const Packet2cf& /* a */) {
+  __m128 v = {0.0f, 0.0f, 0.0f, 0.0f};
+  return (Packet2cf)v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmadd<Packet2cf>(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) {
+  Packet2cf result, t0, t1, t2;
+  t1 = pzero(t1);
+  t0.v = (__m128)__lsx_vpackev_w((__m128i)a.v, (__m128i)a.v);
+  t2.v = __lsx_vfmadd_s(t0.v, b.v, c.v);
+  result.v = __lsx_vfadd_s(t2.v, t1.v);
+  t1.v = __lsx_vfsub_s(t1.v, a.v);
+  t1.v = (__m128)__lsx_vpackod_w((__m128i)a.v, (__m128i)t1.v);
+  t2.v = (__m128)__lsx_vshuf4i_w((__m128i)b.v, 0xb1);
+  result.v = __lsx_vfmadd_s(t1.v, t2.v, result.v);
+  return result;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pexp<Packet2cf>(const Packet2cf& a) {
+  return pexp_complex(a);
+}
+
+//---------- double ----------
+struct Packet1cd {
+  EIGEN_STRONG_INLINE Packet1cd() {}
+  EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {}
+  Packet2d v;
+};
+
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
+  typedef Packet1cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 1,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasSqrt = 1,
+    HasAbs = 0,
+    HasLog = 1,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0
+  };
+};
+
+template <>
+struct unpacket_traits<Packet1cd> {
+  typedef std::complex<double> type;
+  typedef Packet1cd half;
+  typedef Packet2d as_real;
+  enum {
+    size = 1,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(__lsx_vfadd_d(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(__lsx_vfsub_d(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+  return Packet1cd(pnegate(Packet2d(a.v)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+  const uint64_t tmp[2] = {0x0000000000000000u, 0x8000000000000000u};
+  __m128i mask = __lsx_vld(tmp, 0);
+  Packet1cd res;
+  res.v = (Packet2d)__lsx_vxor_v((__m128i)a.v, mask);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  Packet2d tmp_real = __lsx_vfmul_d(a.v, b.v);
+  Packet2d real = __lsx_vfsub_d(tmp_real, preverse(tmp_real));
+
+  Packet2d tmp_imag = __lsx_vfmul_d(preverse(a.v), b.v);
+  Packet2d imag = (__m128d)__lsx_vfadd_d((__m128d)tmp_imag, preverse(tmp_imag));
+  Packet1cd res;
+  res.v = (__m128d)__lsx_vilvl_d((__m128i)imag, (__m128i)real);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd ptrue<Packet1cd>(const Packet1cd& a) {
+  return Packet1cd(ptrue(Packet2d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  Packet1cd res;
+  res.v = (Packet2d)__lsx_vand_v((__m128i)a.v, (__m128i)b.v);
+  return res;
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  Packet1cd res;
+  res.v = (Packet2d)__lsx_vor_v((__m128i)a.v, (__m128i)b.v);
+  return res;
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  Packet1cd res;
+  res.v = (Packet2d)__lsx_vxor_v((__m128i)a.v, (__m128i)b.v);
+  return res;
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  Packet1cd res;
+  res.v = (Packet2d)__lsx_vandn_v((__m128i)b.v, (__m128i)a.v);
+  return res;
+}
+
+// FIXME force unaligned load, this is a temporary fix
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd
+pset1<Packet1cd>(const std::complex<double>& from) { /* here we really have to use unaligned loads :( */
+  return ploadu<Packet1cd>(&from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+  return pset1<Packet1cd>(*from);
+}
+
+// FIXME force unaligned store, this is a temporary fix
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v));
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+  __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
+  EIGEN_ALIGN16 double res[2];
+  __lsx_vst(a.v, res, 0);
+  return std::complex<double>(res[0], res[1]);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
+  return pfirst(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
+  return pfirst(a);
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return pdiv_complex(a, b);
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /* <Packet1cd> */ (const Packet1cd& x) {
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
+  Packet4f tmp1 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[1].v, (__m128i)kernel.packet[0].v);
+  Packet4f tmp2 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[1].v, (__m128i)kernel.packet[0].v);
+  kernel.packet[0].v = (Packet4f)__lsx_vshuf4i_w(tmp1, 0xd8);
+  kernel.packet[1].v = (Packet4f)__lsx_vshuf4i_w(tmp2, 0xd8);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+  Packet4f eq = (Packet4f)__lsx_vfcmp_ceq_s(a.v, b.v);
+  return Packet2cf(pand<Packet4f>(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
+  Packet2d eq = (Packet2d)__lsx_vfcmp_ceq_d(a.v, b.v);
+  return Packet1cd(pand<Packet2d>(eq, preverse(eq)));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cf pselect(const Packet2cf& mask, const Packet2cf& a, const Packet2cf& b) {
+  Packet2cf res;
+  res.v = (Packet4f)__lsx_vbitsel_v((__m128i)b.v, (__m128i)a.v, (__m128i)mask.v);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
+  return psqrt_complex<Packet1cd>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
+  return psqrt_complex<Packet2cf>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd plog<Packet1cd>(const Packet1cd& a) {
+  return plog_complex(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pzero<Packet1cd>(const Packet1cd& /* a */) {
+  __m128d v = {0.0, 0.0};
+  return (Packet1cd)v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmadd<Packet1cd>(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) {
+  Packet1cd result, t0, t1, t2;
+  t1 = pzero(t1);
+  t0.v = (__m128d)__lsx_vpackev_d((__m128i)a.v, (__m128i)a.v);
+  t2.v = __lsx_vfmadd_d(t0.v, b.v, c.v);
+  result.v = __lsx_vfadd_d(t2.v, t1.v);
+  t1.v = __lsx_vfsub_d(t1.v, a.v);
+  t1.v = (__m128d)__lsx_vpackod_d((__m128i)a.v, (__m128i)t1.v);
+  t2.v = (__m128d)__lsx_vshuf4i_d((__m128i)t2.v, (__m128i)b.v, 0xb);
+  result.v = __lsx_vfmadd_d(t1.v, t2.v, result.v);
+  return result;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from,
+                                                                            Index /* stride */) {
+  Packet1cd res;
+  __m128i tmp = __lsx_vld((void*)from, 0);
+  res.v = (__m128d)tmp;
+  return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from,
+                                                                        Index /* stride */) {
+  __lsx_vst((__m128i)from.v, (void*)to, 0);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
+  Packet2d tmp = (__m128d)__lsx_vilvl_d((__m128i)kernel.packet[1].v, (__m128i)kernel.packet[0].v);
+  kernel.packet[1].v = (__m128d)__lsx_vilvh_d((__m128i)kernel.packet[1].v, (__m128i)kernel.packet[0].v);
+  kernel.packet[0].v = tmp;
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_COMPLEX_LSX_H
diff --git a/inst/include/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h b/inst/include/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h
new file mode 100644
index 00000000..4b070620
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/LSX/GeneralBlockPanelKernel.h
@@ -0,0 +1,23 @@
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+#ifndef EIGEN_LSX_GEBP_NR
+#define EIGEN_LSX_GEBP_NR 8
+#endif
+
+template <>
+struct gebp_traits<float, float, false, false, Architecture::LSX, GEBPPacketFull>
+    : gebp_traits<float, float, false, false, Architecture::Generic, GEBPPacketFull> {
+  enum { nr = EIGEN_LSX_GEBP_NR };
+};
+
+template <>
+struct gebp_traits<double, double, false, false, Architecture::LSX, GEBPPacketFull>
+    : gebp_traits<double, double, false, false, Architecture::Generic, GEBPPacketFull> {
+  enum { nr = EIGEN_LSX_GEBP_NR };
+};
+}  // namespace internal
+}  // namespace Eigen
diff --git a/inst/include/Eigen/src/Core/arch/LSX/MathFunctions.h b/inst/include/Eigen/src/Core/arch/LSX/MathFunctions.h
new file mode 100644
index 00000000..cead4636
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/LSX/MathFunctions.h
@@ -0,0 +1,43 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 XiWei Gu (guxiwei-hf@loongson.cn)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_LSX_H
+#define EIGEN_MATH_FUNCTIONS_LSX_H
+
+/* The sin and cos functions of this file are loosely derived from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+EIGEN_DOUBLE_PACKET_FUNCTION(atanh, Packet2d)
+EIGEN_DOUBLE_PACKET_FUNCTION(log, Packet2d)
+EIGEN_DOUBLE_PACKET_FUNCTION(log2, Packet2d)
+EIGEN_DOUBLE_PACKET_FUNCTION(tanh, Packet2d)
+
+EIGEN_FLOAT_PACKET_FUNCTION(atanh, Packet4f)
+EIGEN_FLOAT_PACKET_FUNCTION(log, Packet4f)
+EIGEN_FLOAT_PACKET_FUNCTION(log2, Packet4f)
+EIGEN_FLOAT_PACKET_FUNCTION(tanh, Packet4f)
+
+EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet2d)
+EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet4f)
+EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet2d)
+EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet4f)
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_LSX_H
diff --git a/inst/include/Eigen/src/Core/arch/LSX/PacketMath.h b/inst/include/Eigen/src/Core/arch/LSX/PacketMath.h
new file mode 100644
index 00000000..87232aa2
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/LSX/PacketMath.h
@@ -0,0 +1,2866 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023 Zang Ruochen <zangruochen@loongson.cn>
+// Copyright (C) 2024 XiWei Gu <guxiwei-hf@loongson.cn>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_LSX_H
+#define EIGEN_PACKET_MATH_LSX_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#if EIGEN_ARCH_LOONGARCH64
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#endif
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+typedef __m128 Packet4f;
+typedef __m128d Packet2d;
+
+typedef eigen_packet_wrapper<__m128i, 0> Packet16c;
+typedef eigen_packet_wrapper<__m128i, 1> Packet8s;
+typedef eigen_packet_wrapper<__m128i, 2> Packet4i;
+typedef eigen_packet_wrapper<__m128i, 3> Packet2l;
+typedef eigen_packet_wrapper<__m128i, 4> Packet16uc;
+typedef eigen_packet_wrapper<__m128i, 5> Packet8us;
+typedef eigen_packet_wrapper<__m128i, 6> Packet4ui;
+typedef eigen_packet_wrapper<__m128i, 7> Packet2ul;
+
+template <>
+struct is_arithmetic<__m128> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<__m128i> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<__m128d> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<Packet16c> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<Packet8s> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<Packet4i> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<Packet2l> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<Packet16uc> {
+  enum { value = false };
+};
+template <>
+struct is_arithmetic<Packet8us> {
+  enum { value = false };
+};
+template <>
+struct is_arithmetic<Packet4ui> {
+  enum { value = false };
+};
+template <>
+struct is_arithmetic<Packet2ul> {
+  enum { value = false };
+};
+
+EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
+  float from[4] = {a, b, c, d};
+  return (Packet4f)__lsx_vld(from, 0);
+}
+
+EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask) {
+  const float* a = reinterpret_cast<const float*>(&m);
+  Packet4f res =
+      make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(a + ((mask >> 6) & 3)));
+  return res;
+}
+
+template <bool interleave>
+EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f& m, const Packet4f& n, int mask) {
+  const float* a = reinterpret_cast<const float*>(&m);
+  const float* b = reinterpret_cast<const float*>(&n);
+  Packet4f res =
+      make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f& m, const Packet4f& n, int mask) {
+  const float* a = reinterpret_cast<const float*>(&m);
+  const float* b = reinterpret_cast<const float*>(&n);
+  Packet4f res =
+      make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
+  return res;
+}
+
+EIGEN_STRONG_INLINE static int eigen_lsx_shuffle_mask(int p, int q, int r, int s) {
+  return ((s) << 6 | (r) << 4 | (q) << 2 | (p));
+}
+
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) {
+  return shuffle1(a, eigen_lsx_shuffle_mask(p, q, r, s));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) {
+  return shuffle2<false>(a, b, eigen_lsx_shuffle_mask(p, q, r, s));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
+  return shuffle2<false>(a, b, eigen_lsx_shuffle_mask(0, 1, 0, 1));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
+  return shuffle2<false>(b, a, eigen_lsx_shuffle_mask(2, 3, 2, 3));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
+  return shuffle2<true>(a, b, eigen_lsx_shuffle_mask(0, 0, 1, 1));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
+  return shuffle2<true>(a, b, eigen_lsx_shuffle_mask(2, 2, 3, 3));
+}
+
+EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
+  double from[2] = {a, b};
+  return (Packet2d)__lsx_vld(from, 0);
+}
+
+EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask) {
+  const double* a = reinterpret_cast<const double*>(&m);
+  const double* b = reinterpret_cast<const double*>(&n);
+  Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
+  return res;
+}
+
+EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask) {
+  return shuffle(a, b, mask);
+}
+EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 0); }
+EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 3); }
+
+template <>
+struct packet_traits<int8_t> : default_packet_traits {
+  typedef Packet16c type;
+  typedef Packet16c half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasCmp = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<int16_t> : default_packet_traits {
+  typedef Packet8s type;
+  typedef Packet8s half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasCmp = 1,
+    HasDiv = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<int32_t> : default_packet_traits {
+  typedef Packet4i type;
+  typedef Packet4i half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasCmp = 1,
+    HasDiv = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<int64_t> : default_packet_traits {
+  typedef Packet2l type;
+  typedef Packet2l half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasCmp = 1,
+    HasDiv = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<uint8_t> : default_packet_traits {
+  typedef Packet16uc type;
+  typedef Packet16uc half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasNegate = 0,
+    HasCmp = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<uint16_t> : default_packet_traits {
+  typedef Packet8us type;
+  typedef Packet8us half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasNegate = 0,
+    HasCmp = 1,
+    HasDiv = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<uint32_t> : default_packet_traits {
+  typedef Packet4ui type;
+  typedef Packet4ui half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasNegate = 0,
+    HasCmp = 1,
+    HasDiv = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<uint64_t> : default_packet_traits {
+  typedef Packet2ul type;
+  typedef Packet2ul half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasNegate = 0,
+    HasCmp = 1,
+    HasDiv = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet4f type;
+  typedef Packet4f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasSign = 0,
+    HasDiv = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasLog = 1,
+    HasRsqrt = 1
+  };
+};
+
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasAbs2 = 0,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasSign = 0,
+    HasDiv = 1,
+    HasSqrt = 1,
+    HasLog = 1,
+    HasRsqrt = 1
+  };
+};
+
+template <>
+struct unpacket_traits<Packet16c> {
+  typedef int8_t type;
+  typedef Packet16c half;
+  enum {
+    size = 16,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet8s> {
+  typedef int16_t type;
+  typedef Packet8s half;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet4i> {
+  typedef int32_t type;
+  typedef Packet4i half;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet2l> {
+  typedef int64_t type;
+  typedef Packet2l half;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet16uc> {
+  typedef uint8_t type;
+  typedef Packet16uc half;
+  enum {
+    size = 16,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet8us> {
+  typedef uint16_t type;
+  typedef Packet8us half;
+  enum {
+    size = 8,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet4ui> {
+  typedef uint32_t type;
+  typedef Packet4ui half;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet2ul> {
+  typedef uint64_t type;
+  typedef Packet2ul half;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet4f> {
+  typedef float type;
+  typedef Packet4f half;
+  typedef Packet4i integer_packet;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet2d> {
+  typedef double type;
+  typedef Packet2d half;
+  typedef Packet2l integer_packet;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) {
+  return __lsx_vreplgr2vr_b(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) {
+  return __lsx_vreplgr2vr_h(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
+  return __lsx_vreplgr2vr_w(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
+  return __lsx_vreplgr2vr_d(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) {
+  return __lsx_vreplgr2vr_b(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) {
+  return __lsx_vreplgr2vr_h(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) {
+  return __lsx_vreplgr2vr_w(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) {
+  return __lsx_vreplgr2vr_d(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
+  Packet4f v = {from, from, from, from};
+  return v;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+  Packet2d v = {from, from};
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
+  return reinterpret_cast<__m128>((__m128i)pset1<Packet4ui>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
+  return reinterpret_cast<__m128d>((__m128i)pset1<Packet2ul>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a) {
+  const int8_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  return __lsx_vadd_b(pset1<Packet16c>(a), __lsx_vld(countdown, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a) {
+  const int16_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  return __lsx_vadd_h(pset1<Packet8s>(a), __lsx_vld(countdown, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
+  const int32_t countdown[] = {0, 1, 2, 3};
+  return __lsx_vadd_w(pset1<Packet4i>(a), __lsx_vld(countdown, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) {
+  const int64_t countdown[] = {0, 1};
+  return __lsx_vadd_d(pset1<Packet2l>(a), __lsx_vld(countdown, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a) {
+  const uint8_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  return __lsx_vadd_b(pset1<Packet16uc>(a), __lsx_vld(countdown, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a) {
+  const uint16_t countdown[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  return __lsx_vadd_h(pset1<Packet8us>(a), __lsx_vld(countdown, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) {
+  const uint32_t countdown[] = {0, 1, 2, 3};
+  return __lsx_vadd_w(pset1<Packet4ui>(a), __lsx_vld(countdown, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a) {
+  const uint64_t countdown[] = {0, 1};
+  return __lsx_vadd_d(pset1<Packet2ul>(a), __lsx_vld(countdown, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+  static const Packet4f countdown = {0.0f, 1.0f, 2.0f, 3.0f};
+  return __lsx_vfadd_s(pset1<Packet4f>(a), countdown);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+  static const Packet2d countdown = {0.0f, 1.0f};
+  return __lsx_vfadd_d(pset1<Packet2d>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vadd_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vadd_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vadd_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vadd_d(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vadd_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vadd_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vadd_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vadd_d(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return __lsx_vfadd_s(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return __lsx_vfadd_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vsub_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vsub_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vsub_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vsub_d(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vsub_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vsub_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vsub_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vsub_d(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return __lsx_vfsub_s(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return __lsx_vfsub_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
+template <>
+EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  const Packet4f mask =
+      make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
+  return padd(a, pxor(mask, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b);
+template <>
+EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
+  return padd(a, pxor(mask, b));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
+  Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000), numext::bit_cast<float>(0x80000000),
+                                numext::bit_cast<float>(0x80000000), numext::bit_cast<float>(0x80000000));
+  return (Packet4f)__lsx_vxor_v(numext::bit_cast<__m128i>(mask), numext::bit_cast<__m128i>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
+  Packet2d mask =
+      make_packet2d(numext::bit_cast<double>(0x8000000000000000), numext::bit_cast<double>(0x8000000000000000));
+  return (Packet2d)__lsx_vxor_v(numext::bit_cast<__m128i>(mask), numext::bit_cast<__m128i>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
+  return __lsx_vneg_b(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
+  return __lsx_vneg_h(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
+  return __lsx_vneg_w(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
+  return __lsx_vneg_d(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return __lsx_vfmul_s(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return __lsx_vfmul_d(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vmul_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vmul_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vmul_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vmul_d(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vmul_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vmul_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vmul_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vmul_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return __lsx_vfdiv_s(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return __lsx_vfdiv_d(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vdiv_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vdiv_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vdiv_d(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vdiv_hu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vdiv_wu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vdiv_du(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return __lsx_vfmadd_s(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return __lsx_vfmadd_d(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return __lsx_vfmsub_s(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return __lsx_vfmsub_d(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return __lsx_vfnmsub_s(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return __lsx_vfnmsub_d(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return __lsx_vfnmadd_s(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return __lsx_vfnmadd_d(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
+  return __lsx_vmadd_b(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
+  return __lsx_vmadd_h(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+  return __lsx_vmadd_w(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmadd(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
+  return __lsx_vmadd_d(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c) {
+  return __lsx_vmadd_b(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
+  return __lsx_vmadd_h(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) {
+  return __lsx_vmadd_w(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pmadd(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c) {
+  return __lsx_vmadd_d(c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (Packet4f)__lsx_vand_v((__m128i)a, (__m128i)b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (Packet2d)__lsx_vand_v((__m128i)a, (__m128i)b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vand_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vand_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vand_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vand_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vand_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vand_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vand_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vand_v(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (Packet4f)__lsx_vor_v((__m128i)a, (__m128i)b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (Packet2d)__lsx_vor_v((__m128i)a, (__m128i)b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vor_v(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (Packet4f)__lsx_vxor_v((__m128i)a, (__m128i)b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (Packet2d)__lsx_vxor_v((__m128i)a, (__m128i)b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vxor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vxor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vxor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vxor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vxor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vxor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vxor_v(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vxor_v(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (Packet4f)__lsx_vandn_v((__m128i)b, (__m128i)a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (Packet2d)__lsx_vandn_v((__m128i)b, (__m128i)a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vandn_v(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vandn_v(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vandn_v(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vandn_v(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vandn_v(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vandn_v(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vandn_v(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vandn_v(b, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (Packet4f)__lsx_vfcmp_cle_s(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_le<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (Packet2d)__lsx_vfcmp_cle_d(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vsle_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vsle_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vsle_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vsle_d(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vsle_bu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vsle_hu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vsle_wu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vsle_du(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (Packet4f)__lsx_vfcmp_clt_s(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (Packet2d)__lsx_vfcmp_clt_d(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vslt_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vslt_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vslt_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vslt_d(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vslt_bu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vslt_hu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vslt_wu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vslt_du(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (Packet4f)__lsx_vfcmp_sult_s(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (Packet2d)__lsx_vfcmp_sult_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (Packet4f)__lsx_vfcmp_seq_s(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_eq<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (Packet2d)__lsx_vfcmp_seq_d(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vseq_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vseq_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vseq_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vseq_d(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vseq_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vseq_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vseq_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vseq_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vmin_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vmin_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vmin_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vmin_d(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vmin_bu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vmin_hu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vmin_wu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vmin_du(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return __lsx_vmax_b(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vmax_h(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vmax_w(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vmax_d(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vmax_bu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vmax_hu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vmax_wu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vmax_du(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4i aNaN = __lsx_vfcmp_cun_s(a, a);
+  Packet4i aMinOrNaN = por<Packet4i>(__lsx_vfcmp_clt_s(a, b), aNaN);
+  return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMinOrNaN);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  Packet2l aNaN = __lsx_vfcmp_cun_d(a, a);
+  Packet2l aMinOrNaN = por<Packet2l>(__lsx_vfcmp_clt_d(a, b), aNaN);
+  return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMinOrNaN);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4i aNaN = __lsx_vfcmp_cun_s(a, a);
+  Packet4i aMaxOrNaN = por<Packet4i>(__lsx_vfcmp_clt_s(b, a), aNaN);
+  return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMaxOrNaN);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  Packet2l aNaN = __lsx_vfcmp_cun_d(a, a);
+  Packet2l aMaxOrNaN = por<Packet2l>(__lsx_vfcmp_clt_d(b, a), aNaN);
+  return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, aMaxOrNaN);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(const Packet16c& a) {
+  return __lsx_vsrai_b((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(const Packet8s& a) {
+  return __lsx_vsrai_h((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
+  return __lsx_vsrai_w((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
+  return __lsx_vsrai_d((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(const Packet16uc& a) {
+  return __lsx_vsrli_b((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(const Packet8us& a) {
+  return __lsx_vsrli_h((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) {
+  return __lsx_vsrli_w((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(const Packet2ul& a) {
+  return __lsx_vsrli_d((__m128i)a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet16c plogical_shift_right(const Packet16c& a) {
+  return __lsx_vsrli_b((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8s plogical_shift_right(const Packet8s& a) {
+  return __lsx_vsrli_h((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
+  return __lsx_vsrli_w((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
+  return __lsx_vsrli_d((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(const Packet16uc& a) {
+  return __lsx_vsrli_b((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8us plogical_shift_right(const Packet8us& a) {
+  return __lsx_vsrli_h((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
+  return __lsx_vsrli_w((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(const Packet2ul& a) {
+  return __lsx_vsrli_d((__m128i)a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet16c plogical_shift_left(const Packet16c& a) {
+  return __lsx_vslli_b((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8s plogical_shift_left(const Packet8s& a) {
+  return __lsx_vslli_h((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
+  return __lsx_vslli_w((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
+  return __lsx_vslli_d((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(const Packet16uc& a) {
+  return __lsx_vslli_b((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8us plogical_shift_left(const Packet8us& a) {
+  return __lsx_vslli_h((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
+  return __lsx_vslli_w((__m128i)a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(const Packet2ul& a) {
+  return __lsx_vslli_d((__m128i)a, N);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
+  return (Packet4f)__lsx_vbitclri_w((__m128i)a, 31);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
+  return (Packet2d)__lsx_vbitclri_d((__m128i)a, 63);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
+  return __lsx_vabsd_b(a, pzero(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
+  return __lsx_vabsd_h(a, pzero(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
+  return __lsx_vabsd_w(a, pzero(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
+  return __lsx_vabsd_d(a, pzero(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return (Packet2d)__lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return __lsx_vld(from, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return __lsx_vld(from, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
+  float f0 = from[0], f1 = from[1];
+  return make_packet4f(f0, f0, f1, f1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
+  return pset1<Packet2d>(from[0]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from) {
+  Packet16c tmp = pload<Packet16c>(from);
+  return __lsx_vilvl_b(tmp, tmp);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from) {
+  Packet8s tmp = pload<Packet8s>(from);
+  return __lsx_vilvl_h(tmp, tmp);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
+  Packet4i tmp = pload<Packet4i>(from);
+  return __lsx_vilvl_w(tmp, tmp);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
+  return pset1<Packet2l>(from[0]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from) {
+  Packet16uc tmp = pload<Packet16uc>(from);
+  return __lsx_vilvl_b(tmp, tmp);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from) {
+  Packet8us tmp = pload<Packet8us>(from);
+  return __lsx_vilvl_h(tmp, tmp);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
+  Packet4ui tmp = pload<Packet4ui>(from);
+  return __lsx_vilvl_w(tmp, tmp);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from) {
+  return pset1<Packet2ul>(from[0]);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst(from, to, 0);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from) {
+  EIGEN_DEBUG_ALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst(from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE __lsx_vst((__m128i)from, to, 0);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
+  Packet4f v = {from[0], from[stride], from[2 * stride], from[3 * stride]};
+  return v;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
+  Packet2d v = {from[0], from[stride]};
+  return v;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride) {
+  int8_t v[16] __attribute__((aligned(16)));
+  v[0] = from[0];
+  v[1] = from[stride];
+  v[2] = from[2 * stride];
+  v[3] = from[3 * stride];
+  v[4] = from[4 * stride];
+  v[5] = from[5 * stride];
+  v[6] = from[6 * stride];
+  v[7] = from[7 * stride];
+  v[8] = from[8 * stride];
+  v[9] = from[9 * stride];
+  v[10] = from[10 * stride];
+  v[11] = from[11 * stride];
+  v[12] = from[12 * stride];
+  v[13] = from[13 * stride];
+  v[14] = from[14 * stride];
+  v[15] = from[15 * stride];
+  return __lsx_vld(v, 0);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride) {
+  int16_t v[8] __attribute__((aligned(16)));
+  v[0] = from[0];
+  v[1] = from[stride];
+  v[2] = from[2 * stride];
+  v[3] = from[3 * stride];
+  v[4] = from[4 * stride];
+  v[5] = from[5 * stride];
+  v[6] = from[6 * stride];
+  v[7] = from[7 * stride];
+  return __lsx_vld(v, 0);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
+  int32_t v[4] __attribute__((aligned(16)));
+  v[0] = from[0];
+  v[1] = from[stride];
+  v[2] = from[2 * stride];
+  v[3] = from[3 * stride];
+  return __lsx_vld(v, 0);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
+  int64_t v[2] __attribute__((aligned(16)));
+  v[0] = from[0];
+  v[1] = from[stride];
+  return __lsx_vld(v, 0);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride) {
+  uint8_t v[16] __attribute__((aligned(16)));
+  v[0] = from[0];
+  v[1] = from[stride];
+  v[2] = from[2 * stride];
+  v[3] = from[3 * stride];
+  v[4] = from[4 * stride];
+  v[5] = from[5 * stride];
+  v[6] = from[6 * stride];
+  v[7] = from[7 * stride];
+  v[8] = from[8 * stride];
+  v[9] = from[9 * stride];
+  v[10] = from[10 * stride];
+  v[11] = from[11 * stride];
+  v[12] = from[12 * stride];
+  v[13] = from[13 * stride];
+  v[14] = from[14 * stride];
+  v[15] = from[15 * stride];
+  return __lsx_vld(v, 0);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride) {
+  uint16_t v[8] __attribute__((aligned(16)));
+  v[0] = from[0];
+  v[1] = from[stride];
+  v[2] = from[2 * stride];
+  v[3] = from[3 * stride];
+  v[4] = from[4 * stride];
+  v[5] = from[5 * stride];
+  v[6] = from[6 * stride];
+  v[7] = from[7 * stride];
+  return __lsx_vld(v, 0);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
+  uint32_t v[4] __attribute__((aligned(16)));
+  v[0] = from[0];
+  v[1] = from[stride];
+  v[2] = from[2 * stride];
+  v[3] = from[3 * stride];
+  return __lsx_vld(v, 0);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride) {
+  uint64_t v[2] __attribute__((aligned(16)));
+  v[0] = from[0];
+  v[1] = from[stride];
+  return __lsx_vld(v, 0);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
+  __lsx_vstelm_w(from, to, 0, 0);
+  __lsx_vstelm_w(from, to + stride * 1, 0, 1);
+  __lsx_vstelm_w(from, to + stride * 2, 0, 2);
+  __lsx_vstelm_w(from, to + stride * 3, 0, 3);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
+  __lsx_vstelm_d(from, to, 0, 0);
+  __lsx_vstelm_d(from, to + stride, 0, 1);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from,
+                                                                       Index stride) {
+  __lsx_vstelm_b((__m128i)from, to, 0, 0);
+  __lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1);
+  __lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2);
+  __lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3);
+  __lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4);
+  __lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5);
+  __lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6);
+  __lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7);
+  __lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8);
+  __lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9);
+  __lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10);
+  __lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11);
+  __lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12);
+  __lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13);
+  __lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14);
+  __lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from,
+                                                                       Index stride) {
+  __lsx_vstelm_h((__m128i)from, to, 0, 0);
+  __lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1);
+  __lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2);
+  __lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3);
+  __lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4);
+  __lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5);
+  __lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6);
+  __lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
+                                                                       Index stride) {
+  __lsx_vstelm_w((__m128i)from, to, 0, 0);
+  __lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1);
+  __lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2);
+  __lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from,
+                                                                       Index stride) {
+  __lsx_vstelm_d((__m128i)from, to, 0, 0);
+  __lsx_vstelm_d((__m128i)from, to + stride * 1, 0, 1);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from,
+                                                                         Index stride) {
+  __lsx_vstelm_b((__m128i)from, to, 0, 0);
+  __lsx_vstelm_b((__m128i)from, to + stride * 1, 0, 1);
+  __lsx_vstelm_b((__m128i)from, to + stride * 2, 0, 2);
+  __lsx_vstelm_b((__m128i)from, to + stride * 3, 0, 3);
+  __lsx_vstelm_b((__m128i)from, to + stride * 4, 0, 4);
+  __lsx_vstelm_b((__m128i)from, to + stride * 5, 0, 5);
+  __lsx_vstelm_b((__m128i)from, to + stride * 6, 0, 6);
+  __lsx_vstelm_b((__m128i)from, to + stride * 7, 0, 7);
+  __lsx_vstelm_b((__m128i)from, to + stride * 8, 0, 8);
+  __lsx_vstelm_b((__m128i)from, to + stride * 9, 0, 9);
+  __lsx_vstelm_b((__m128i)from, to + stride * 10, 0, 10);
+  __lsx_vstelm_b((__m128i)from, to + stride * 11, 0, 11);
+  __lsx_vstelm_b((__m128i)from, to + stride * 12, 0, 12);
+  __lsx_vstelm_b((__m128i)from, to + stride * 13, 0, 13);
+  __lsx_vstelm_b((__m128i)from, to + stride * 14, 0, 14);
+  __lsx_vstelm_b((__m128i)from, to + stride * 15, 0, 15);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from,
+                                                                         Index stride) {
+  __lsx_vstelm_h((__m128i)from, to, 0, 0);
+  __lsx_vstelm_h((__m128i)from, to + stride * 1, 0, 1);
+  __lsx_vstelm_h((__m128i)from, to + stride * 2, 0, 2);
+  __lsx_vstelm_h((__m128i)from, to + stride * 3, 0, 3);
+  __lsx_vstelm_h((__m128i)from, to + stride * 4, 0, 4);
+  __lsx_vstelm_h((__m128i)from, to + stride * 5, 0, 5);
+  __lsx_vstelm_h((__m128i)from, to + stride * 6, 0, 6);
+  __lsx_vstelm_h((__m128i)from, to + stride * 7, 0, 7);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from,
+                                                                         Index stride) {
+  __lsx_vstelm_w((__m128i)from, to, 0, 0);
+  __lsx_vstelm_w((__m128i)from, to + stride * 1, 0, 1);
+  __lsx_vstelm_w((__m128i)from, to + stride * 2, 0, 2);
+  __lsx_vstelm_w((__m128i)from, to + stride * 3, 0, 3);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from,
+                                                                         Index stride) {
+  __lsx_vstelm_d((__m128i)from, to, 0, 0);
+  __lsx_vstelm_d((__m128i)from, to + stride * 1, 0, 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+  __builtin_prefetch(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+  __builtin_prefetch(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) {
+  __builtin_prefetch(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) {
+  __builtin_prefetch(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
+  __builtin_prefetch(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
+  __builtin_prefetch(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) {
+  __builtin_prefetch(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) {
+  __builtin_prefetch(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
+  __builtin_prefetch(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) {
+  __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  float v;
+  __lsx_vstelm_w(a, &v, 0, 0);
+  return v;
+}
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+  double v;
+  __lsx_vstelm_d(a, &v, 0, 0);
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) {
+  return (int8_t)__lsx_vpickve2gr_b((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) {
+  return (int16_t)__lsx_vpickve2gr_h((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
+  return __lsx_vpickve2gr_w((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
+  return __lsx_vpickve2gr_d((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) {
+  return (uint8_t)__lsx_vpickve2gr_bu((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) {
+  return (uint16_t)__lsx_vpickve2gr_hu((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
+  return __lsx_vpickve2gr_wu((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) {
+  return __lsx_vpickve2gr_du((__m128i)a, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+  return (Packet4f)__lsx_vshuf4i_w(a, 0x1B);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
+  return (Packet2d)__lsx_vshuf4i_d(a, a, 0x1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
+  return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
+  return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+  return __lsx_vshuf4i_w((__m128i)a, 0x1B);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
+  return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
+  return __lsx_vshuf4i_b(__lsx_vshuf4i_w((__m128i)a, 0x1B), 0x1B);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
+  return __lsx_vshuf4i_h(__lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1), 0x1B);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
+  return __lsx_vshuf4i_w((__m128i)a, 0x1B);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a) {
+  return __lsx_vshuf4i_d((__m128i)a, (__m128i)a, 0x1);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
+  Packet4f tmp = __lsx_vfadd_s(a, vec4f_swizzle1(a, 2, 3, 2, 3));
+  return pfirst<Packet4f>(__lsx_vfadd_s(tmp, vec4f_swizzle1(tmp, 1, 1, 1, 1)));
+}
+template <>
+EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
+  return pfirst<Packet2d>(__lsx_vfadd_d(a, preverse(a)));
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) {
+  Packet8s tmp1 = __lsx_vhaddw_h_b(a, a);
+  Packet4i tmp2 = __lsx_vhaddw_w_h(tmp1, tmp1);
+  Packet2l tmp3 = __lsx_vhaddw_d_w(tmp2, tmp2);
+  return (int8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp3, tmp3), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) {
+  Packet4i tmp1 = __lsx_vhaddw_w_h(a, a);
+  Packet2l tmp2 = __lsx_vhaddw_d_w(tmp1, tmp1);
+  return (int16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp2, tmp2), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
+  Packet2l tmp = __lsx_vhaddw_d_w(a, a);
+  return (int32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(tmp, tmp), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
+  return (int64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_q_d(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) {
+  Packet8us tmp1 = __lsx_vhaddw_hu_bu(a, a);
+  Packet4ui tmp2 = __lsx_vhaddw_wu_hu(tmp1, tmp1);
+  Packet2ul tmp3 = __lsx_vhaddw_du_wu(tmp2, tmp2);
+  return (uint8_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp3, tmp3), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) {
+  Packet4ui tmp1 = __lsx_vhaddw_wu_hu(a, a);
+  Packet2ul tmp2 = __lsx_vhaddw_du_wu(tmp1, tmp1);
+  return (uint16_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp2, tmp2), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
+  Packet2ul tmp = __lsx_vhaddw_du_wu(a, a);
+  return (uint32_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(tmp, tmp), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
+  return (uint64_t)__lsx_vpickve2gr_d(__lsx_vhaddw_qu_du(a, a), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
+  Packet4f tmp = __lsx_vfmul_s(a, vec4f_swizzle1(a, 2, 3, 2, 3));
+  return pfirst<Packet4f>(__lsx_vfmul_s(tmp, vec4f_swizzle1(tmp, 1, 1, 1, 1)));
+}
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+  return pfirst<Packet2d>(__lsx_vfmul_d(a, preverse(a)));
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a) {
+  Packet8s tmp1 = __lsx_vmulwev_h_b(a, preverse(a));
+  Packet4i tmp2 = __lsx_vmulwev_w_h(tmp1, preverse(tmp1));
+  Packet2l tmp3 = __lsx_vmulwev_d_w(tmp2, preverse(tmp2));
+  return (int8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp3, preverse(tmp3)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a) {
+  Packet4i tmp1 = __lsx_vmulwev_w_h(a, preverse(a));
+  Packet2l tmp2 = __lsx_vmulwev_d_w(tmp1, preverse(tmp1));
+  return (int16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp2, preverse(tmp2)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
+  Packet2l tmp = __lsx_vmulwev_d_w(a, preverse(a));
+  return (int32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp, preverse(tmp)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a) {
+  return (int64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(a, preverse(a)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a) {
+  Packet8us tmp1 = __lsx_vmulwev_h_bu(a, preverse(a));
+  Packet4ui tmp2 = __lsx_vmulwev_w_h(tmp1, preverse(tmp1));
+  Packet2ul tmp3 = __lsx_vmulwev_d_w(tmp2, preverse(tmp2));
+  return (uint8_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp3, preverse(tmp3)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a) {
+  Packet4ui tmp1 = __lsx_vmulwev_w_hu(a, preverse(a));
+  Packet2ul tmp2 = __lsx_vmulwev_d_w(tmp1, preverse(tmp1));
+  return (uint16_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp2, preverse(tmp2)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) {
+  Packet2ul tmp = __lsx_vmulwev_d_wu(a, preverse(a));
+  return (uint32_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_d(tmp, preverse(tmp)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a) {
+  return (uint64_t)__lsx_vpickve2gr_d(__lsx_vmulwev_q_du(a, preverse(a)), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
+  Packet4f tmp = __lsx_vfmin_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E));
+  return pfirst(__lsx_vfmin_s(tmp, (Packet4f)__lsx_vshuf4i_w(tmp, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
+  return pfirst(__lsx_vfmin_d(a, preverse(a)));
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) {
+  Packet16c tmp1 = __lsx_vmin_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
+  Packet16c tmp2 = __lsx_vmin_b(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
+  Packet16c tmp3 = __lsx_vmin_b(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
+  return pfirst((Packet16c)__lsx_vmin_b(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) {
+  Packet8s tmp1 = __lsx_vmin_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
+  Packet8s tmp2 = __lsx_vmin_h(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
+  return pfirst((Packet8s)__lsx_vmin_h(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
+  Packet4i tmp = __lsx_vmin_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
+  return pfirst((Packet4i)__lsx_vmin_w(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a) {
+  return pfirst((Packet2l)__lsx_vmin_d(a, preverse(a)));
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) {
+  Packet16uc tmp1 = __lsx_vmin_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
+  Packet16uc tmp2 = __lsx_vmin_bu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
+  Packet16uc tmp3 = __lsx_vmin_bu(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
+  return pfirst((Packet16uc)__lsx_vmin_bu(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) {
+  Packet8us tmp1 = __lsx_vmin_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
+  Packet8us tmp2 = __lsx_vmin_hu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
+  return pfirst((Packet8us)__lsx_vmin_hu(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
+  Packet4ui tmp = __lsx_vmin_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
+  return pfirst((Packet4ui)__lsx_vmin_wu(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a) {
+  return pfirst((Packet2ul)__lsx_vmin_du(a, preverse(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
+  Packet4f tmp = __lsx_vfmax_s(a, (Packet4f)__lsx_vshuf4i_w(a, 0x4E));
+  return pfirst(__lsx_vfmax_s(tmp, (Packet4f)__lsx_vshuf4i_w(tmp, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
+  return pfirst(__lsx_vfmax_d(a, preverse(a)));
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) {
+  Packet16c tmp1 = __lsx_vmax_b(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
+  Packet16c tmp2 = __lsx_vmax_b(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
+  Packet16c tmp3 = __lsx_vmax_b(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
+  return pfirst((Packet16c)__lsx_vmax_b(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) {
+  Packet8s tmp1 = __lsx_vmax_h(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
+  Packet8s tmp2 = __lsx_vmax_h(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
+  return pfirst((Packet8s)__lsx_vmax_h(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
+  Packet4i tmp = __lsx_vmax_w(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
+  return pfirst((Packet4i)__lsx_vmax_w(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a) {
+  return pfirst((Packet2l)__lsx_vmax_d(a, preverse(a)));
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) {
+  Packet16uc tmp1 = __lsx_vmax_bu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
+  Packet16uc tmp2 = __lsx_vmax_bu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
+  Packet16uc tmp3 = __lsx_vmax_bu(tmp2, __lsx_vshuf4i_b((__m128i)tmp2, 0x4E));
+  return pfirst((Packet16uc)__lsx_vmax_bu(tmp3, __lsx_vshuf4i_b((__m128i)tmp3, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) {
+  Packet8us tmp1 = __lsx_vmax_hu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
+  Packet8us tmp2 = __lsx_vmax_hu(tmp1, __lsx_vshuf4i_h((__m128i)tmp1, 0x4E));
+  return pfirst((Packet8us)__lsx_vmax_hu(tmp2, __lsx_vshuf4i_h((__m128i)tmp2, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
+  Packet4ui tmp = __lsx_vmax_wu(a, __lsx_vshuf4i_w((__m128i)a, 0x4E));
+  return pfirst((Packet4ui)__lsx_vmax_wu(tmp, __lsx_vshuf4i_w((__m128i)tmp, 0xB1)));
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a) {
+  return pfirst((Packet2ul)__lsx_vmax_du(a, preverse(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
+  return __lsx_vfsqrt_s(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) {
+  return __lsx_vfsqrt_d(a);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
+  Packet4f T0 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
+  Packet4f T1 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
+  Packet4f T2 = (Packet4f)__lsx_vilvl_w((__m128i)kernel.packet[3], (__m128i)kernel.packet[2]);
+  Packet4f T3 = (Packet4f)__lsx_vilvh_w((__m128i)kernel.packet[3], (__m128i)kernel.packet[2]);
+
+  kernel.packet[0] = (Packet4f)__lsx_vilvl_d((__m128i)T2, (__m128i)T0);
+  kernel.packet[1] = (Packet4f)__lsx_vilvh_d((__m128i)T2, (__m128i)T0);
+  kernel.packet[2] = (Packet4f)__lsx_vilvl_d((__m128i)T3, (__m128i)T1);
+  kernel.packet[3] = (Packet4f)__lsx_vilvh_d((__m128i)T3, (__m128i)T1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
+  Packet2d tmp = (Packet2d)__lsx_vilvh_d((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
+  kernel.packet[0] = (Packet2d)__lsx_vilvl_d((__m128i)kernel.packet[1], (__m128i)kernel.packet[0]);
+  kernel.packet[1] = tmp;
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
+  __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
+  __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
+  __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
+  __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
+  __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
+  __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
+  __m128i t8 = __lsx_vilvl_b(kernel.packet[9], kernel.packet[8]);
+  __m128i t9 = __lsx_vilvh_b(kernel.packet[9], kernel.packet[8]);
+  __m128i ta = __lsx_vilvl_b(kernel.packet[11], kernel.packet[10]);
+  __m128i tb = __lsx_vilvh_b(kernel.packet[11], kernel.packet[10]);
+  __m128i tc = __lsx_vilvl_b(kernel.packet[13], kernel.packet[12]);
+  __m128i td = __lsx_vilvh_b(kernel.packet[13], kernel.packet[12]);
+  __m128i te = __lsx_vilvl_b(kernel.packet[15], kernel.packet[14]);
+  __m128i tf = __lsx_vilvh_b(kernel.packet[15], kernel.packet[14]);
+
+  __m128i s0 = __lsx_vilvl_h(t2, t0);
+  __m128i s1 = __lsx_vilvh_h(t2, t0);
+  __m128i s2 = __lsx_vilvl_h(t3, t1);
+  __m128i s3 = __lsx_vilvh_h(t3, t1);
+  __m128i s4 = __lsx_vilvl_h(t6, t4);
+  __m128i s5 = __lsx_vilvh_h(t6, t4);
+  __m128i s6 = __lsx_vilvl_h(t7, t5);
+  __m128i s7 = __lsx_vilvh_h(t7, t5);
+  __m128i s8 = __lsx_vilvl_h(ta, t8);
+  __m128i s9 = __lsx_vilvh_h(ta, t8);
+  __m128i sa = __lsx_vilvl_h(tb, t9);
+  __m128i sb = __lsx_vilvh_h(tb, t9);
+  __m128i sc = __lsx_vilvl_h(te, tc);
+  __m128i sd = __lsx_vilvh_h(te, tc);
+  __m128i se = __lsx_vilvl_h(tf, td);
+  __m128i sf = __lsx_vilvh_h(tf, td);
+
+  __m128i u0 = __lsx_vilvl_w(s4, s0);
+  __m128i u1 = __lsx_vilvh_w(s4, s0);
+  __m128i u2 = __lsx_vilvl_w(s5, s1);
+  __m128i u3 = __lsx_vilvh_w(s5, s1);
+  __m128i u4 = __lsx_vilvl_w(s6, s2);
+  __m128i u5 = __lsx_vilvh_w(s6, s2);
+  __m128i u6 = __lsx_vilvl_w(s7, s3);
+  __m128i u7 = __lsx_vilvh_w(s7, s3);
+  __m128i u8 = __lsx_vilvl_w(sc, s8);
+  __m128i u9 = __lsx_vilvh_w(sc, s8);
+  __m128i ua = __lsx_vilvl_w(sd, s9);
+  __m128i ub = __lsx_vilvh_w(sd, s9);
+  __m128i uc = __lsx_vilvl_w(se, sa);
+  __m128i ud = __lsx_vilvh_w(se, sa);
+  __m128i ue = __lsx_vilvl_w(sf, sb);
+  __m128i uf = __lsx_vilvh_w(sf, sb);
+
+  kernel.packet[0] = __lsx_vilvl_d(u8, u0);
+  kernel.packet[1] = __lsx_vilvh_d(u8, u0);
+  kernel.packet[2] = __lsx_vilvl_d(u9, u1);
+  kernel.packet[3] = __lsx_vilvh_d(u9, u1);
+  kernel.packet[4] = __lsx_vilvl_d(ua, u2);
+  kernel.packet[5] = __lsx_vilvh_d(ua, u2);
+  kernel.packet[6] = __lsx_vilvl_d(ub, u3);
+  kernel.packet[7] = __lsx_vilvh_d(ub, u3);
+  kernel.packet[8] = __lsx_vilvl_d(uc, u4);
+  kernel.packet[9] = __lsx_vilvh_d(uc, u4);
+  kernel.packet[10] = __lsx_vilvl_d(ud, u5);
+  kernel.packet[11] = __lsx_vilvh_d(ud, u5);
+  kernel.packet[12] = __lsx_vilvl_d(ue, u6);
+  kernel.packet[13] = __lsx_vilvh_d(ue, u6);
+  kernel.packet[14] = __lsx_vilvl_d(uf, u7);
+  kernel.packet[15] = __lsx_vilvh_d(uf, u7);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
+  __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
+  __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
+  __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
+  __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
+  __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
+  __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
+
+  __m128i s0 = __lsx_vilvl_h(t2, t0);
+  __m128i s1 = __lsx_vilvh_h(t2, t0);
+  __m128i s2 = __lsx_vilvl_h(t3, t1);
+  __m128i s3 = __lsx_vilvh_h(t3, t1);
+  __m128i s4 = __lsx_vilvl_h(t6, t4);
+  __m128i s5 = __lsx_vilvh_h(t6, t4);
+  __m128i s6 = __lsx_vilvl_h(t7, t5);
+  __m128i s7 = __lsx_vilvh_h(t7, t5);
+
+  kernel.packet[0] = __lsx_vilvl_w(s4, s0);
+  kernel.packet[1] = __lsx_vilvh_w(s4, s0);
+  kernel.packet[2] = __lsx_vilvl_w(s5, s1);
+  kernel.packet[3] = __lsx_vilvh_w(s5, s1);
+  kernel.packet[4] = __lsx_vilvl_w(s6, s2);
+  kernel.packet[5] = __lsx_vilvh_w(s6, s2);
+  kernel.packet[6] = __lsx_vilvl_w(s7, s3);
+  kernel.packet[7] = __lsx_vilvh_w(s7, s3);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
+  __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
+  __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
+  __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
+
+  kernel.packet[0] = __lsx_vilvl_h(t2, t0);
+  kernel.packet[1] = __lsx_vilvh_h(t2, t0);
+  kernel.packet[2] = __lsx_vilvl_h(t3, t1);
+  kernel.packet[3] = __lsx_vilvh_h(t3, t1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
+  __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
+  __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
+  __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
+  __m128i t4 = __lsx_vilvl_h(kernel.packet[5], kernel.packet[4]);
+  __m128i t5 = __lsx_vilvh_h(kernel.packet[5], kernel.packet[4]);
+  __m128i t6 = __lsx_vilvl_h(kernel.packet[7], kernel.packet[6]);
+  __m128i t7 = __lsx_vilvh_h(kernel.packet[7], kernel.packet[6]);
+
+  __m128i s0 = __lsx_vilvl_w(t2, t0);
+  __m128i s1 = __lsx_vilvh_w(t2, t0);
+  __m128i s2 = __lsx_vilvl_w(t3, t1);
+  __m128i s3 = __lsx_vilvh_w(t3, t1);
+  __m128i s4 = __lsx_vilvl_w(t6, t4);
+  __m128i s5 = __lsx_vilvh_w(t6, t4);
+  __m128i s6 = __lsx_vilvl_w(t7, t5);
+  __m128i s7 = __lsx_vilvh_w(t7, t5);
+
+  kernel.packet[0] = __lsx_vilvl_d(s4, s0);
+  kernel.packet[1] = __lsx_vilvh_d(s4, s0);
+  kernel.packet[2] = __lsx_vilvl_d(s5, s1);
+  kernel.packet[3] = __lsx_vilvh_d(s5, s1);
+  kernel.packet[4] = __lsx_vilvl_d(s6, s2);
+  kernel.packet[5] = __lsx_vilvh_d(s6, s2);
+  kernel.packet[6] = __lsx_vilvl_d(s7, s3);
+  kernel.packet[7] = __lsx_vilvh_d(s7, s3);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
+  __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
+  __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
+  __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
+
+  kernel.packet[0] = __lsx_vilvl_w(t2, t0);
+  kernel.packet[1] = __lsx_vilvh_w(t2, t0);
+  kernel.packet[2] = __lsx_vilvl_w(t3, t1);
+  kernel.packet[3] = __lsx_vilvh_w(t3, t1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
+  __m128i T0 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]);
+  __m128i T1 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]);
+  __m128i T2 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]);
+  __m128i T3 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]);
+
+  kernel.packet[0] = __lsx_vilvl_d(T2, T0);
+  kernel.packet[1] = __lsx_vilvh_d(T2, T0);
+  kernel.packet[2] = __lsx_vilvl_d(T3, T1);
+  kernel.packet[3] = __lsx_vilvh_d(T3, T1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
+  __m128i tmp = __lsx_vilvh_d(kernel.packet[1], kernel.packet[0]);
+  kernel.packet[0] = __lsx_vilvl_d(kernel.packet[1], kernel.packet[0]);
+  kernel.packet[1] = tmp;
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
+  __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
+  __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
+  __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
+  __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
+  __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
+  __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
+  __m128i t8 = __lsx_vilvl_b(kernel.packet[9], kernel.packet[8]);
+  __m128i t9 = __lsx_vilvh_b(kernel.packet[9], kernel.packet[8]);
+  __m128i ta = __lsx_vilvl_b(kernel.packet[11], kernel.packet[10]);
+  __m128i tb = __lsx_vilvh_b(kernel.packet[11], kernel.packet[10]);
+  __m128i tc = __lsx_vilvl_b(kernel.packet[13], kernel.packet[12]);
+  __m128i td = __lsx_vilvh_b(kernel.packet[13], kernel.packet[12]);
+  __m128i te = __lsx_vilvl_b(kernel.packet[15], kernel.packet[14]);
+  __m128i tf = __lsx_vilvh_b(kernel.packet[15], kernel.packet[14]);
+
+  __m128i s0 = __lsx_vilvl_h(t2, t0);
+  __m128i s1 = __lsx_vilvh_h(t2, t0);
+  __m128i s2 = __lsx_vilvl_h(t3, t1);
+  __m128i s3 = __lsx_vilvh_h(t3, t1);
+  __m128i s4 = __lsx_vilvl_h(t6, t4);
+  __m128i s5 = __lsx_vilvh_h(t6, t4);
+  __m128i s6 = __lsx_vilvl_h(t7, t5);
+  __m128i s7 = __lsx_vilvh_h(t7, t5);
+  __m128i s8 = __lsx_vilvl_h(ta, t8);
+  __m128i s9 = __lsx_vilvh_h(ta, t8);
+  __m128i sa = __lsx_vilvl_h(tb, t9);
+  __m128i sb = __lsx_vilvh_h(tb, t9);
+  __m128i sc = __lsx_vilvl_h(te, tc);
+  __m128i sd = __lsx_vilvh_h(te, tc);
+  __m128i se = __lsx_vilvl_h(tf, td);
+  __m128i sf = __lsx_vilvh_h(tf, td);
+
+  __m128i u0 = __lsx_vilvl_w(s4, s0);
+  __m128i u1 = __lsx_vilvh_w(s4, s0);
+  __m128i u2 = __lsx_vilvl_w(s5, s1);
+  __m128i u3 = __lsx_vilvh_w(s5, s1);
+  __m128i u4 = __lsx_vilvl_w(s6, s2);
+  __m128i u5 = __lsx_vilvh_w(s6, s2);
+  __m128i u6 = __lsx_vilvl_w(s7, s3);
+  __m128i u7 = __lsx_vilvh_w(s7, s3);
+  __m128i u8 = __lsx_vilvl_w(sc, s8);
+  __m128i u9 = __lsx_vilvh_w(sc, s8);
+  __m128i ua = __lsx_vilvl_w(sd, s9);
+  __m128i ub = __lsx_vilvh_w(sd, s9);
+  __m128i uc = __lsx_vilvl_w(se, sa);
+  __m128i ud = __lsx_vilvh_w(se, sa);
+  __m128i ue = __lsx_vilvl_w(sf, sb);
+  __m128i uf = __lsx_vilvh_w(sf, sb);
+
+  kernel.packet[0] = __lsx_vilvl_d(u8, u0);
+  kernel.packet[1] = __lsx_vilvh_d(u8, u0);
+  kernel.packet[2] = __lsx_vilvl_d(u9, u1);
+  kernel.packet[3] = __lsx_vilvh_d(u9, u1);
+  kernel.packet[4] = __lsx_vilvl_d(ua, u2);
+  kernel.packet[5] = __lsx_vilvh_d(ua, u2);
+  kernel.packet[6] = __lsx_vilvl_d(ub, u3);
+  kernel.packet[7] = __lsx_vilvh_d(ub, u3);
+  kernel.packet[8] = __lsx_vilvl_d(uc, u4);
+  kernel.packet[9] = __lsx_vilvh_d(uc, u4);
+  kernel.packet[10] = __lsx_vilvl_d(ud, u5);
+  kernel.packet[11] = __lsx_vilvh_d(ud, u5);
+  kernel.packet[12] = __lsx_vilvl_d(ue, u6);
+  kernel.packet[13] = __lsx_vilvh_d(ue, u6);
+  kernel.packet[14] = __lsx_vilvl_d(uf, u7);
+  kernel.packet[15] = __lsx_vilvh_d(uf, u7);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
+  __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
+  __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
+  __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t4 = __lsx_vilvl_b(kernel.packet[5], kernel.packet[4]);
+  __m128i t5 = __lsx_vilvh_b(kernel.packet[5], kernel.packet[4]);
+  __m128i t6 = __lsx_vilvl_b(kernel.packet[7], kernel.packet[6]);
+  __m128i t7 = __lsx_vilvh_b(kernel.packet[7], kernel.packet[6]);
+
+  __m128i s0 = __lsx_vilvl_h(t2, t0);
+  __m128i s1 = __lsx_vilvh_h(t2, t0);
+  __m128i s2 = __lsx_vilvl_h(t3, t1);
+  __m128i s3 = __lsx_vilvh_h(t3, t1);
+  __m128i s4 = __lsx_vilvl_h(t6, t4);
+  __m128i s5 = __lsx_vilvh_h(t6, t4);
+  __m128i s6 = __lsx_vilvl_h(t7, t5);
+  __m128i s7 = __lsx_vilvh_h(t7, t5);
+
+  kernel.packet[0] = __lsx_vilvl_w(s4, s0);
+  kernel.packet[1] = __lsx_vilvh_w(s4, s0);
+  kernel.packet[2] = __lsx_vilvl_w(s5, s1);
+  kernel.packet[3] = __lsx_vilvh_w(s5, s1);
+  kernel.packet[4] = __lsx_vilvl_w(s6, s2);
+  kernel.packet[5] = __lsx_vilvh_w(s6, s2);
+  kernel.packet[6] = __lsx_vilvl_w(s7, s3);
+  kernel.packet[7] = __lsx_vilvh_w(s7, s3);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
+  __m128i t0 = __lsx_vilvl_b(kernel.packet[1], kernel.packet[0]);
+  __m128i t1 = __lsx_vilvh_b(kernel.packet[1], kernel.packet[0]);
+  __m128i t2 = __lsx_vilvl_b(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_b(kernel.packet[3], kernel.packet[2]);
+
+  kernel.packet[0] = __lsx_vilvl_h(t2, t0);
+  kernel.packet[1] = __lsx_vilvh_h(t2, t0);
+  kernel.packet[2] = __lsx_vilvl_h(t3, t1);
+  kernel.packet[3] = __lsx_vilvh_h(t3, t1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
+  __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
+  __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
+  __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
+  __m128i t4 = __lsx_vilvl_h(kernel.packet[5], kernel.packet[4]);
+  __m128i t5 = __lsx_vilvh_h(kernel.packet[5], kernel.packet[4]);
+  __m128i t6 = __lsx_vilvl_h(kernel.packet[7], kernel.packet[6]);
+  __m128i t7 = __lsx_vilvh_h(kernel.packet[7], kernel.packet[6]);
+
+  __m128i s0 = __lsx_vilvl_w(t2, t0);
+  __m128i s1 = __lsx_vilvh_w(t2, t0);
+  __m128i s2 = __lsx_vilvl_w(t3, t1);
+  __m128i s3 = __lsx_vilvh_w(t3, t1);
+  __m128i s4 = __lsx_vilvl_w(t6, t4);
+  __m128i s5 = __lsx_vilvh_w(t6, t4);
+  __m128i s6 = __lsx_vilvl_w(t7, t5);
+  __m128i s7 = __lsx_vilvh_w(t7, t5);
+
+  kernel.packet[0] = __lsx_vilvl_d(s4, s0);
+  kernel.packet[1] = __lsx_vilvh_d(s4, s0);
+  kernel.packet[2] = __lsx_vilvl_d(s5, s1);
+  kernel.packet[3] = __lsx_vilvh_d(s5, s1);
+  kernel.packet[4] = __lsx_vilvl_d(s6, s2);
+  kernel.packet[5] = __lsx_vilvh_d(s6, s2);
+  kernel.packet[6] = __lsx_vilvl_d(s7, s3);
+  kernel.packet[7] = __lsx_vilvh_d(s7, s3);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
+  __m128i t0 = __lsx_vilvl_h(kernel.packet[1], kernel.packet[0]);
+  __m128i t1 = __lsx_vilvh_h(kernel.packet[1], kernel.packet[0]);
+  __m128i t2 = __lsx_vilvl_h(kernel.packet[3], kernel.packet[2]);
+  __m128i t3 = __lsx_vilvh_h(kernel.packet[3], kernel.packet[2]);
+
+  kernel.packet[0] = __lsx_vilvl_w(t2, t0);
+  kernel.packet[1] = __lsx_vilvh_w(t2, t0);
+  kernel.packet[2] = __lsx_vilvl_w(t3, t1);
+  kernel.packet[3] = __lsx_vilvh_w(t3, t1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
+  __m128i T0 = __lsx_vilvl_w(kernel.packet[1], kernel.packet[0]);
+  __m128i T1 = __lsx_vilvh_w(kernel.packet[1], kernel.packet[0]);
+  __m128i T2 = __lsx_vilvl_w(kernel.packet[3], kernel.packet[2]);
+  __m128i T3 = __lsx_vilvh_w(kernel.packet[3], kernel.packet[2]);
+
+  kernel.packet[0] = __lsx_vilvl_d(T2, T0);
+  kernel.packet[1] = __lsx_vilvh_d(T2, T0);
+  kernel.packet[2] = __lsx_vilvl_d(T3, T1);
+  kernel.packet[3] = __lsx_vilvh_d(T3, T1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ul, 2>& kernel) {
+  __m128i tmp = __lsx_vilvh_d(kernel.packet[1], kernel.packet[0]);
+  kernel.packet[0] = __lsx_vilvl_d(kernel.packet[1], kernel.packet[0]);
+  kernel.packet[1] = tmp;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
+  return __lsx_vfrsqrt_s(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
+  return __lsx_vfrsqrt_d(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor(const Packet4f& a) {
+  return __lsx_vfrintrm_s(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor(const Packet2d& a) {
+  return __lsx_vfrintrm_d(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil(const Packet4f& a) {
+  return __lsx_vfrintrp_s(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil(const Packet2d& a) {
+  return __lsx_vfrintrp_d(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pround(const Packet4f& a) {
+  const Packet4f mask = pset1frombits<Packet4f>(static_cast<numext::uint32_t>(0x80000000u));
+  const Packet4f prev0dot5 = pset1frombits<Packet4f>(static_cast<numext::uint32_t>(0x3EFFFFFFu));
+  return __lsx_vfrintrz_s(padd(pxor(pand(a, mask), prev0dot5), a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pround(const Packet2d& a) {
+  const Packet2d mask = pset1frombits<Packet2d>(static_cast<numext::uint64_t>(0x8000000000000000ull));
+  const Packet2d prev0dot5 = pset1frombits<Packet2d>(static_cast<numext::uint64_t>(0x3FDFFFFFFFFFFFFFull));
+  return __lsx_vfrintrz_d(padd(por(pand(a, mask), prev0dot5), a));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+  return (Packet4f)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b) {
+  return (Packet16c)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from) {
+  int8_t tmp[16] = {*from,       *from,       *from,       *from,       *(from + 1), *(from + 1),
+                    *(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2),
+                    *(from + 3), *(from + 3), *(from + 3), *(from + 3)};
+  return __lsx_vld(tmp, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from) {
+  uint8_t tmp[16] = {*from,       *from,       *from,       *from,       *(from + 1), *(from + 1),
+                     *(from + 1), *(from + 1), *(from + 2), *(from + 2), *(from + 2), *(from + 2),
+                     *(from + 3), *(from + 3), *(from + 3), *(from + 3)};
+  return __lsx_vld(tmp, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from) {
+  int16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)};
+  return __lsx_vld(tmp, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from) {
+  uint16_t tmp[8] = {*from, *from, *from, *from, *(from + 1), *(from + 1), *(from + 1), *(from + 1)};
+  return __lsx_vld(tmp, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) {
+  int32_t tmp[4] = {*from, *from, *from, *from};
+  return __lsx_vld(tmp, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) {
+  uint32_t tmp[4] = {*from, *from, *from, *from};
+  return __lsx_vld(tmp, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16c pnmsub(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
+  return __lsx_vmsub_b(pnegate(c), a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pnmsub(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
+  return __lsx_vmsub_h(pnegate(c), a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pnmsub(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+  return __lsx_vmsub_w(pnegate(c), a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pnmsub(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
+  return __lsx_vmsub_d(pnegate(c), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16c pmsub(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
+  return __lsx_vmadd_b(pnegate(c), a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmsub(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
+  return __lsx_vmadd_h(pnegate(c), a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmsub(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+  return __lsx_vmadd_w(pnegate(c), a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmsub(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
+  return __lsx_vmadd_d(pnegate(c), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16c pnmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
+  return __lsx_vmsub_b(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pnmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
+  return __lsx_vmsub_h(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pnmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+  return __lsx_vmsub_w(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pnmadd(const Packet2l& a, const Packet2l& b, const Packet2l& c) {
+  return __lsx_vmsub_d(c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pexp(const Packet4f& _x) {
+  return pexp_float(_x);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pexp(const Packet2d& _x) {
+  return pexp_double(_x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /* a */) {
+  Packet4f v = {0.0f, 0.0f, 0.0f, 0.0f};
+  return v;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f v = psub(a, b);
+  return pabs(v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pmin<Packet4f>(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pmax<Packet4f>(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) {
+  return (__m128)__lsx_vldrepl_w(from, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
+  return (__m128)__lsx_vsrai_w((__m128i)a, 31);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
+  return __lsx_vfrintrne_s(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
+  return __lsx_vfrintrz_s(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a) {
+  return __lsx_vfrecip_s(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /* a */) {
+  Packet2d v = {0.0, 0.0};
+  return v;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pmin<Packet2d>(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pmax<Packet2d>(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
+  return (__m128d)(__lsx_vsrai_d((__m128i)a, 63));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
+  return (Packet2d)__lsx_vbitsel_v((__m128i)b, (__m128i)a, (__m128i)mask);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
+  return __lsx_vfrintrne_d(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
+  return __lsx_vfrintrz_d(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  Packet16c v = psub(a, b);
+  return pabs(v);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  Packet8s v = psub(a, b);
+  return pabs(v);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) {
+  return __lsx_vbitsel_v(b, a, mask);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  Packet4i v = psub(a, b);
+  return pabs(v);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
+  return __lsx_vbitsel_v(b, a, mask);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
+  return __lsx_vbitsel_v(b, a, mask);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return __lsx_vdiv_bu(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  Packet16uc v = psub(a, b);
+  return pabs(v);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a,
+                                                         const Packet16uc& b) {
+  return __lsx_vbitsel_v(b, a, mask);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
+  __m128i res = {0, 0};
+  __m128i add = {0x0808080808080808, 0x0808080808080808};
+  for (int i = 0; i < 4; i++) {
+    const __m128i temp = __lsx_vor_v(res, add);
+    const __m128i tmul = __lsx_vpackev_b(__lsx_vmulwod_h_bu(temp, temp), __lsx_vmulwev_h_bu(temp, temp));
+    res = __lsx_vbitsel_v(res, temp, __lsx_vsle_bu(tmul, a));
+    add = __lsx_vsrli_b(add, 1);
+  }
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  Packet8us v = psub(a, b);
+  return pabs(v);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) {
+  return __lsx_vbitsel_v(b, a, mask);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
+  __m128i res = {0, 0};
+  __m128i add = {0x0080008000800080, 0x0080008000800080};
+  for (int i = 0; i < 4; i++) {
+    const __m128i temp = __lsx_vor_v(res, add);
+    const __m128i tmul = __lsx_vpackev_h(__lsx_vmulwod_w_hu(temp, temp), __lsx_vmulwev_w_hu(temp, temp));
+    res = __lsx_vbitsel_v(res, temp, __lsx_vsle_hu(tmul, a));
+    add = __lsx_vsrli_h(add, 1);
+  }
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  Packet4ui v = psub(a, b);
+  return pabs(v);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vbitsel_v(b, a, mask);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
+  __m128i res = {0, 0};
+  __m128i add = {0x0000800000008000, 0x0000800000008000};
+  for (int i = 0; i < 4; i++) {
+    const __m128i temp = __lsx_vor_v(res, add);
+    const __m128i tmul = __lsx_vpackev_w(__lsx_vmulwod_d_wu(temp, temp), __lsx_vmulwev_d_wu(temp, temp));
+    res = __lsx_vbitsel_v(res, temp, __lsx_vsle_wu(tmul, a));
+    add = __lsx_vsrli_w(add, 1);
+  }
+  return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vbitsel_v(b, a, mask);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+#endif
diff --git a/inst/include/Eigen/src/Core/arch/LSX/TypeCasting.h b/inst/include/Eigen/src/Core/arch/LSX/TypeCasting.h
new file mode 100644
index 00000000..cda86806
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/LSX/TypeCasting.h
@@ -0,0 +1,526 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023 Zang Ruochen <zangruochen@loongson.cn>
+// Copyright (C) 2024 XiWei Gu <guxiwei-hf@loongson.cn>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_LSX_H
+#define EIGEN_TYPE_CASTING_LSX_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+//==============================================================================
+// preinterpret
+//==============================================================================
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a) {
+  return (__m128)((__m128i)a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4ui>(const Packet4ui& a) {
+  return (__m128)((__m128i)a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const Packet2l& a) {
+  return (__m128d)((__m128i)a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2ul>(const Packet2ul& a) {
+  return (__m128d)((__m128i)a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a) {
+  return (__m128d)((__m128i)a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c preinterpret<Packet16c, Packet16uc>(const Packet16uc& a) {
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s preinterpret<Packet8s, Packet8us>(const Packet8us& a) {
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4f>(const Packet4f& a) {
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2d>(const Packet2d& a) {
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc preinterpret<Packet16uc, Packet16c>(const Packet16c& a) {
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us preinterpret<Packet8us, Packet8s>(const Packet8s& a) {
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4f>(const Packet4f& a) {
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4i>(const Packet4i& a) {
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2d>(const Packet2d& a) {
+  return (__m128i)a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2l>(const Packet2l& a) {
+  return (__m128i)a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet4f, Packet2l>(const Packet4f& a) {
+  Packet2d tmp = __lsx_vfcvtl_d_s(a);
+  return __lsx_vftint_l_d(tmp);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet4f, Packet2ul>(const Packet4f& a) {
+  Packet2d tmp = __lsx_vfcvtl_d_s(a);
+  return __lsx_vftint_lu_d(tmp);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+  return __lsx_vftint_w_s(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {
+  return __lsx_vftint_wu_s(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet4f, Packet8s>(const Packet4f& a, const Packet4f& b) {
+  return __lsx_vssrlni_h_w(__lsx_vftint_w_s(a), __lsx_vftint_w_s(b), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet4f, Packet8us>(const Packet4f& a, const Packet4f& b) {
+  return __lsx_vssrlni_hu_w(__lsx_vftint_wu_s(a), __lsx_vftint_wu_s(b), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet4f, Packet16c>(const Packet4f& a, const Packet4f& b, const Packet4f& c,
+                                                         const Packet4f& d) {
+  Packet8s tmp1 = __lsx_vssrlni_h_w(__lsx_vftint_w_s(a), __lsx_vftint_w_s(b), 0);
+  Packet8s tmp2 = __lsx_vssrlni_h_w(__lsx_vftint_w_s(c), __lsx_vftint_w_s(d), 0);
+  return __lsx_vssrlni_b_h((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet4f, Packet16uc>(const Packet4f& a, const Packet4f& b, const Packet4f& c,
+                                                           const Packet4f& d) {
+  Packet8us tmp1 = __lsx_vssrlni_hu_w(__lsx_vftint_wu_s(a), __lsx_vftint_wu_s(b), 0);
+  Packet8us tmp2 = __lsx_vssrlni_hu_w(__lsx_vftint_wu_s(c), __lsx_vftint_wu_s(d), 0);
+  return __lsx_vssrlni_bu_h((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet16c, Packet4f>(const Packet16c& a) {
+  Packet8s tmp1 = __lsx_vsllwil_h_b((__m128i)a, 0);
+  Packet4i tmp2 = __lsx_vsllwil_w_h((__m128i)tmp1, 0);
+  return __lsx_vffint_s_w(tmp2);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet16c, Packet2l>(const Packet16c& a) {
+  Packet8s tmp1 = __lsx_vsllwil_h_b((__m128i)a, 0);
+  Packet4i tmp2 = __lsx_vsllwil_w_h((__m128i)tmp1, 0);
+  return __lsx_vsllwil_d_w((__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet16c, Packet2ul>(const Packet16c& a) {
+  Packet8s tmp1 = __lsx_vsllwil_h_b((__m128i)a, 0);
+  Packet4i tmp2 = __lsx_vsllwil_w_h((__m128i)tmp1, 0);
+  return (Packet2ul)__lsx_vsllwil_d_w((__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet16c, Packet4i>(const Packet16c& a) {
+  Packet8s tmp1 = __lsx_vsllwil_h_b((__m128i)a, 0);
+  return __lsx_vsllwil_w_h((__m128i)tmp1, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet16c, Packet4ui>(const Packet16c& a) {
+  Packet8s tmp1 = __lsx_vsllwil_h_b((__m128i)a, 0);
+  return (Packet4ui)__lsx_vsllwil_w_h((__m128i)tmp1, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet16c, Packet8s>(const Packet16c& a) {
+  return __lsx_vsllwil_h_b((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet16c, Packet8us>(const Packet16c& a) {
+  return (Packet8us)__lsx_vsllwil_h_b((__m128i)a, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet16uc, Packet4f>(const Packet16uc& a) {
+  Packet8us tmp1 = __lsx_vsllwil_hu_bu((__m128i)a, 0);
+  Packet4ui tmp2 = __lsx_vsllwil_wu_hu((__m128i)tmp1, 0);
+  return __lsx_vffint_s_wu(tmp2);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet16uc, Packet2ul>(const Packet16uc& a) {
+  Packet8us tmp1 = __lsx_vsllwil_hu_bu((__m128i)a, 0);
+  Packet4ui tmp2 = __lsx_vsllwil_wu_hu((__m128i)tmp1, 0);
+  return __lsx_vsllwil_du_wu((__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet16uc, Packet2l>(const Packet16uc& a) {
+  Packet8us tmp1 = __lsx_vsllwil_hu_bu((__m128i)a, 0);
+  Packet4ui tmp2 = __lsx_vsllwil_wu_hu((__m128i)tmp1, 0);
+  return (Packet2l)__lsx_vsllwil_du_wu((__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet16uc, Packet4ui>(const Packet16uc& a) {
+  Packet8us tmp1 = __lsx_vsllwil_hu_bu((__m128i)a, 0);
+  return __lsx_vsllwil_wu_hu((__m128i)tmp1, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet16uc, Packet4i>(const Packet16uc& a) {
+  Packet8us tmp1 = __lsx_vsllwil_hu_bu((__m128i)a, 0);
+  return (Packet4i)__lsx_vsllwil_wu_hu((__m128i)tmp1, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet16uc, Packet8us>(const Packet16uc& a) {
+  return __lsx_vsllwil_hu_bu((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet16uc, Packet8s>(const Packet16uc& a) {
+  return (Packet8s)__lsx_vsllwil_hu_bu((__m128i)a, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet8s, Packet4f>(const Packet8s& a) {
+  Packet4i tmp1 = __lsx_vsllwil_w_h((__m128i)a, 0);
+  return __lsx_vffint_s_w(tmp1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet8s, Packet2l>(const Packet8s& a) {
+  Packet4i tmp1 = __lsx_vsllwil_w_h((__m128i)a, 0);
+  return __lsx_vsllwil_d_w((__m128i)tmp1, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet8s, Packet2ul>(const Packet8s& a) {
+  Packet4i tmp1 = __lsx_vsllwil_w_h((__m128i)a, 0);
+  return (Packet2ul)__lsx_vsllwil_d_w((__m128i)tmp1, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet8s, Packet4i>(const Packet8s& a) {
+  return __lsx_vsllwil_w_h((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet8s, Packet4ui>(const Packet8s& a) {
+  return (Packet4ui)__lsx_vsllwil_w_h((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet8s, Packet16c>(const Packet8s& a, const Packet8s& b) {
+  return __lsx_vssrlni_b_h((__m128i)a, (__m128i)b, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet8s, Packet16uc>(const Packet8s& a, const Packet8s& b) {
+  return (Packet16uc)__lsx_vssrlni_b_h((__m128i)a, (__m128i)b, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet8us, Packet4f>(const Packet8us& a) {
+  Packet4ui tmp1 = __lsx_vsllwil_wu_hu((__m128i)a, 0);
+  return __lsx_vffint_s_wu(tmp1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet8us, Packet2ul>(const Packet8us& a) {
+  Packet4ui tmp1 = __lsx_vsllwil_wu_hu((__m128i)a, 0);
+  return __lsx_vsllwil_du_wu((__m128i)tmp1, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet8us, Packet2l>(const Packet8us& a) {
+  Packet4ui tmp1 = __lsx_vsllwil_wu_hu((__m128i)a, 0);
+  return (Packet2l)__lsx_vsllwil_du_wu((__m128i)tmp1, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet8us, Packet4ui>(const Packet8us& a) {
+  return __lsx_vsllwil_wu_hu((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet8us, Packet4i>(const Packet8us& a) {
+  return (Packet4i)__lsx_vsllwil_wu_hu((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet8us, Packet16uc>(const Packet8us& a, const Packet8us& b) {
+  return __lsx_vssrlni_bu_h((__m128i)a, (__m128i)b, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet8us, Packet16c>(const Packet8us& a, const Packet8us& b) {
+  return (Packet16c)__lsx_vssrlni_bu_h((__m128i)a, (__m128i)b, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+  return __lsx_vffint_s_w(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet4i, Packet2l>(const Packet4i& a) {
+  return __lsx_vsllwil_d_w((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet4i, Packet2ul>(const Packet4i& a) {
+  return (Packet2ul)__lsx_vsllwil_d_w((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet4i, Packet8s>(const Packet4i& a, const Packet4i& b) {
+  return __lsx_vssrlni_h_w((__m128i)a, (__m128i)b, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet4i, Packet8us>(const Packet4i& a, const Packet4i& b) {
+  return (Packet8us)__lsx_vssrlni_h_w((__m128i)a, (__m128i)b, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet4i, Packet16c>(const Packet4i& a, const Packet4i& b, const Packet4i& c,
+                                                         const Packet4i& d) {
+  Packet8s tmp1 = __lsx_vssrlni_h_w((__m128i)a, (__m128i)b, 0);
+  Packet8s tmp2 = __lsx_vssrlni_h_w((__m128i)c, (__m128i)d, 0);
+  return __lsx_vssrlni_b_h((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet4i, Packet16uc>(const Packet4i& a, const Packet4i& b, const Packet4i& c,
+                                                           const Packet4i& d) {
+  Packet8s tmp1 = __lsx_vssrlni_h_w((__m128i)a, (__m128i)b, 0);
+  Packet8s tmp2 = __lsx_vssrlni_h_w((__m128i)c, (__m128i)d, 0);
+  return (Packet16uc)__lsx_vssrlni_b_h((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) {
+  return __lsx_vffint_s_wu(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet4ui, Packet2ul>(const Packet4ui& a) {
+  return __lsx_vsllwil_du_wu((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet4ui, Packet2l>(const Packet4ui& a) {
+  return (Packet2l)__lsx_vsllwil_du_wu((__m128i)a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet4ui, Packet8us>(const Packet4ui& a, const Packet4ui& b) {
+  return __lsx_vssrlni_hu_w((__m128i)a, (__m128i)b, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet4ui, Packet8s>(const Packet4ui& a, const Packet4ui& b) {
+  return (Packet8s)__lsx_vssrlni_hu_w((__m128i)a, (__m128i)b, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet4ui, Packet16uc>(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c,
+                                                            const Packet4ui& d) {
+  Packet8us tmp1 = __lsx_vssrlni_hu_w((__m128i)a, (__m128i)b, 0);
+  Packet8us tmp2 = __lsx_vssrlni_hu_w((__m128i)c, (__m128i)d, 0);
+  return __lsx_vssrlni_bu_h((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet4ui, Packet16c>(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c,
+                                                          const Packet4ui& d) {
+  Packet8us tmp1 = __lsx_vssrlni_hu_w((__m128i)a, (__m128i)b, 0);
+  Packet8us tmp2 = __lsx_vssrlni_hu_w((__m128i)c, (__m128i)d, 0);
+  return (Packet16c)__lsx_vssrlni_bu_h((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet2l, Packet4f>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vffint_s_w(__lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet2l, Packet4i>(const Packet2l& a, const Packet2l& b) {
+  return __lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet2l, Packet4ui>(const Packet2l& a, const Packet2l& b) {
+  return (Packet4ui)__lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet2l, Packet8s>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                       const Packet2l& d) {
+  Packet4i tmp1 = __lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0);
+  Packet4i tmp2 = __lsx_vssrlni_w_d((__m128i)c, (__m128i)d, 0);
+  return __lsx_vssrlni_h_w((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet2l, Packet8us>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                         const Packet2l& d) {
+  Packet4i tmp1 = __lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0);
+  Packet4i tmp2 = __lsx_vssrlni_w_d((__m128i)c, (__m128i)d, 0);
+  return (Packet8us)__lsx_vssrlni_h_w((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet2l, Packet16c>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                         const Packet2l& d, const Packet2l& e, const Packet2l& f,
+                                                         const Packet2l& g, const Packet2l& h) {
+  const Packet8s abcd = pcast<Packet2l, Packet8s>(a, b, c, d);
+  const Packet8s efgh = pcast<Packet2l, Packet8s>(e, f, g, h);
+  return __lsx_vssrlni_b_h((__m128i)abcd, (__m128i)efgh, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet2l, Packet16uc>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                           const Packet2l& d, const Packet2l& e, const Packet2l& f,
+                                                           const Packet2l& g, const Packet2l& h) {
+  const Packet8us abcd = pcast<Packet2l, Packet8us>(a, b, c, d);
+  const Packet8us efgh = pcast<Packet2l, Packet8us>(e, f, g, h);
+  return __lsx_vssrlni_bu_h((__m128i)abcd, (__m128i)efgh, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet2ul, Packet4f>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vffint_s_wu(__lsx_vssrlni_w_d((__m128i)a, (__m128i)b, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet2ul, Packet4ui>(const Packet2ul& a, const Packet2ul& b) {
+  return __lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet2ul, Packet4i>(const Packet2ul& a, const Packet2ul& b) {
+  return (Packet4i)__lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet2ul, Packet8us>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                          const Packet2ul& d) {
+  Packet4ui tmp1 = __lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0);
+  Packet4ui tmp2 = __lsx_vssrlni_wu_d((__m128i)c, (__m128i)d, 0);
+  return __lsx_vssrlni_hu_w((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet2ul, Packet8s>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                        const Packet2ul& d) {
+  Packet4ui tmp1 = __lsx_vssrlni_wu_d((__m128i)a, (__m128i)b, 0);
+  Packet4ui tmp2 = __lsx_vssrlni_wu_d((__m128i)c, (__m128i)d, 0);
+  return (Packet8s)__lsx_vssrlni_hu_w((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet2ul, Packet16uc>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                            const Packet2ul& d, const Packet2ul& e, const Packet2ul& f,
+                                                            const Packet2ul& g, const Packet2ul& h) {
+  const Packet8s abcd = pcast<Packet2ul, Packet8s>(a, b, c, d);
+  const Packet8s efgh = pcast<Packet2ul, Packet8s>(e, f, g, h);
+  return __lsx_vssrlni_b_h((__m128i)abcd, (__m128i)efgh, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet2ul, Packet16c>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                          const Packet2ul& d, const Packet2ul& e, const Packet2ul& f,
+                                                          const Packet2ul& g, const Packet2ul& h) {
+  const Packet8us abcd = pcast<Packet2ul, Packet8us>(a, b, c, d);
+  const Packet8us efgh = pcast<Packet2ul, Packet8us>(e, f, g, h);
+  return __lsx_vssrlni_bu_h((__m128i)abcd, (__m128i)efgh, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
+  return __lsx_vfcvt_s_d(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet2d, Packet2l>(const Packet2d& a) {
+  return __lsx_vftint_l_d(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet2d, Packet2ul>(const Packet2d& a) {
+  return __lsx_vftint_lu_d(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet2d, Packet4i>(const Packet2d& a, const Packet2d& b) {
+  return __lsx_vssrlni_w_d(__lsx_vftint_l_d(a), __lsx_vftint_l_d(b), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet2d, Packet4ui>(const Packet2d& a, const Packet2d& b) {
+  return __lsx_vssrlni_wu_d(__lsx_vftint_lu_d(a), __lsx_vftint_lu_d(b), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet2d, Packet8s>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                       const Packet2d& d) {
+  Packet4i tmp1 = __lsx_vssrlni_w_d(__lsx_vftint_l_d(a), __lsx_vftint_l_d(b), 0);
+  Packet4i tmp2 = __lsx_vssrlni_w_d(__lsx_vftint_l_d(c), __lsx_vftint_l_d(d), 0);
+  return __lsx_vssrlni_h_w((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet2d, Packet8us>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                         const Packet2d& d) {
+  Packet4ui tmp1 = __lsx_vssrlni_wu_d(__lsx_vftint_lu_d(a), __lsx_vftint_lu_d(b), 0);
+  Packet4ui tmp2 = __lsx_vssrlni_wu_d(__lsx_vftint_lu_d(c), __lsx_vftint_lu_d(d), 0);
+  return __lsx_vssrlni_hu_w((__m128i)tmp1, (__m128i)tmp2, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet2d, Packet16c>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                         const Packet2d& d, const Packet2d& e, const Packet2d& f,
+                                                         const Packet2d& g, const Packet2d& h) {
+  const Packet8s abcd = pcast<Packet2d, Packet8s>(a, b, c, d);
+  const Packet8s efgh = pcast<Packet2d, Packet8s>(e, f, g, h);
+  return __lsx_vssrlni_b_h((__m128i)abcd, (__m128i)efgh, 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet2d, Packet16uc>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                           const Packet2d& d, const Packet2d& e, const Packet2d& f,
+                                                           const Packet2d& g, const Packet2d& h) {
+  const Packet8us abcd = pcast<Packet2d, Packet8us>(a, b, c, d);
+  const Packet8us efgh = pcast<Packet2d, Packet8us>(e, f, g, h);
+  return __lsx_vssrlni_bu_h((__m128i)abcd, (__m128i)efgh, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
+  return __lsx_vfcvtl_d_s(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet16c, Packet2d>(const Packet16c& a) {
+  Packet8s tmp1 = __lsx_vsllwil_h_b((__m128i)a, 0);
+  Packet4i tmp2 = __lsx_vsllwil_w_h((__m128i)tmp1, 0);
+  return __lsx_vffint_d_l(__lsx_vsllwil_d_w((__m128i)tmp2, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet16uc, Packet2d>(const Packet16uc& a) {
+  Packet8us tmp1 = __lsx_vsllwil_hu_bu((__m128i)a, 0);
+  Packet4ui tmp2 = __lsx_vsllwil_wu_hu((__m128i)tmp1, 0);
+  return __lsx_vffint_d_lu(__lsx_vsllwil_du_wu((__m128i)tmp2, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet8s, Packet2d>(const Packet8s& a) {
+  Packet4i tmp = __lsx_vsllwil_w_h((__m128i)a, 0);
+  return __lsx_vffint_d_l(__lsx_vsllwil_d_w((__m128i)tmp, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet8us, Packet2d>(const Packet8us& a) {
+  Packet4ui tmp = __lsx_vsllwil_wu_hu((__m128i)a, 0);
+  return __lsx_vffint_d_lu(__lsx_vsllwil_du_wu((__m128i)tmp, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4i, Packet2d>(const Packet4i& a) {
+  return __lsx_vffint_d_l(__lsx_vsllwil_d_w((__m128i)a, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4ui, Packet2d>(const Packet4ui& a) {
+  return __lsx_vffint_d_lu(__lsx_vsllwil_du_wu((__m128i)a, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet2l, Packet2d>(const Packet2l& a) {
+  return __lsx_vffint_d_l(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet2ul, Packet2d>(const Packet2ul& a) {
+  return __lsx_vffint_d_lu(a);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_TYPE_CASTING_LSX_H
diff --git a/inst/include/Eigen/src/Core/arch/MSA/Complex.h b/inst/include/Eigen/src/Core/arch/MSA/Complex.h
new file mode 100644
index 00000000..2d2fbbca
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/MSA/Complex.h
@@ -0,0 +1,620 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Wave Computing, Inc.
+// Written by:
+//   Chris Larsen
+//   Alexey Frunze (afrunze@wavecomp.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX_MSA_H
+#define EIGEN_COMPLEX_MSA_H
+
+#include <iostream>
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- float ----------
+struct Packet2cf {
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const std::complex<float>& a, const std::complex<float>& b) {
+    Packet4f t = {std::real(a), std::imag(a), std::real(b), std::imag(b)};
+    v = t;
+  }
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+  EIGEN_STRONG_INLINE Packet2cf(const Packet2cf& a) : v(a.v) {}
+  EIGEN_STRONG_INLINE Packet2cf& operator=(const Packet2cf& b) {
+    v = b.v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf conjugate(void) const {
+    return Packet2cf((Packet4f)__builtin_msa_bnegi_d((v2u64)v, 63));
+  }
+  EIGEN_STRONG_INLINE Packet2cf& operator*=(const Packet2cf& b) {
+    Packet4f v1, v2;
+
+    // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
+    v1 = (Packet4f)__builtin_msa_ilvev_w((v4i32)v, (v4i32)v);
+    // Get the imag values of a | a1_im | a1_im | a2_im | a2_im |
+    v2 = (Packet4f)__builtin_msa_ilvod_w((v4i32)v, (v4i32)v);
+    // Multiply the real a with b
+    v1 = pmul(v1, b.v);
+    // Multiply the imag a with b
+    v2 = pmul(v2, b.v);
+    // Conjugate v2
+    v2 = Packet2cf(v2).conjugate().v;
+    // Swap real/imag elements in v2.
+    v2 = (Packet4f)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(1, 0, 3, 2));
+    // Add and return the result
+    v = padd(v1, v2);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator*(const Packet2cf& b) const { return Packet2cf(*this) *= b; }
+  EIGEN_STRONG_INLINE Packet2cf& operator+=(const Packet2cf& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator+(const Packet2cf& b) const { return Packet2cf(*this) += b; }
+  EIGEN_STRONG_INLINE Packet2cf& operator-=(const Packet2cf& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(const Packet2cf& b) const { return Packet2cf(*this) -= b; }
+  EIGEN_STRONG_INLINE Packet2cf operator/(const Packet2cf& b) const { return pdiv_complex(Packet2cf(*this), b); }
+  EIGEN_STRONG_INLINE Packet2cf& operator/=(const Packet2cf& b) {
+    *this = Packet2cf(*this) / b;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet2cf operator-(void) const { return Packet2cf(pnegate(v)); }
+
+  Packet4f v;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2cf& value) {
+  os << "[ (" << value.v[0] << ", " << value.v[1]
+     << "i),"
+        "  ("
+     << value.v[2] << ", " << value.v[3] << "i) ]";
+  return os;
+}
+
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
+  typedef Packet2cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct unpacket_traits<Packet2cf> {
+  typedef std::complex<float> type;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet2cf half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
+  EIGEN_MSA_DEBUG;
+
+  float f0 = from.real(), f1 = from.imag();
+  Packet4f v0 = {f0, f0, f0, f0};
+  Packet4f v1 = {f1, f1, f1, f1};
+  return Packet2cf((Packet4f)__builtin_msa_ilvr_w((Packet4i)v1, (Packet4i)v0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a + b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a - b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  return -a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a.conjugate();
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a * b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf(pand(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf(por(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf(pxor(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf(pandnot(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_MSA_DEBUG;
+
+  return pset1<Packet2cf>(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_STORE pstore<float>((float*)to, from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu<float>((float*)to, from.v);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from,
+                                                                           Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf(from[0 * stride], from[1 * stride]);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from,
+                                                                       Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  *to = std::complex<float>(from.v[0], from.v[1]);
+  to += stride;
+  *to = std::complex<float>(from.v[2], from.v[3]);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+  EIGEN_MSA_DEBUG;
+
+  prefetch(reinterpret_cast<const float*>(addr));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  return std::complex<float>(a.v[0], a.v[1]);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet2cf((Packet4f)__builtin_msa_shf_w((v4i32)a.v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4f value = (Packet4f)preverse((Packet2d)a.v);
+  value += a.v;
+  return std::complex<float>(value[0], value[1]);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
+  EIGEN_MSA_DEBUG;
+
+  return std::complex<float>((a.v[0] * a.v[2]) - (a.v[1] * a.v[3]), (a.v[0] * a.v[3]) + (a.v[1] * a.v[2]));
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a / b;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2cf, 2>& value) {
+  os << "[ " << value.packet[0] << ", " << std::endl << "  " << value.packet[1] << " ]";
+  return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4f tmp = (Packet4f)__builtin_msa_ilvl_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
+  kernel.packet[0].v = (Packet4f)__builtin_msa_ilvr_d((v2i64)kernel.packet[1].v, (v2i64)kernel.packet[0].v);
+  kernel.packet[1].v = tmp;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
+                                     const Packet2cf& elsePacket) {
+  return (Packet2cf)(Packet4f)pblend<Packet2d>(ifPacket, (Packet2d)thenPacket.v, (Packet2d)elsePacket.v);
+}
+
+//---------- double ----------
+
+struct Packet1cd {
+  EIGEN_STRONG_INLINE Packet1cd() {}
+  EIGEN_STRONG_INLINE explicit Packet1cd(const std::complex<double>& a) {
+    v[0] = std::real(a);
+    v[1] = std::imag(a);
+  }
+  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+  EIGEN_STRONG_INLINE Packet1cd(const Packet1cd& a) : v(a.v) {}
+  EIGEN_STRONG_INLINE Packet1cd& operator=(const Packet1cd& b) {
+    v = b.v;
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd conjugate(void) const {
+    static const v2u64 p2ul_CONJ_XOR = {0x0, 0x8000000000000000};
+    return (Packet1cd)pxor(v, (Packet2d)p2ul_CONJ_XOR);
+  }
+  EIGEN_STRONG_INLINE Packet1cd& operator*=(const Packet1cd& b) {
+    Packet2d v1, v2;
+
+    // Get the real values of a | a1_re | a1_re
+    v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)v, (v2i64)v);
+    // Get the imag values of a | a1_im | a1_im
+    v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)v, (v2i64)v);
+    // Multiply the real a with b
+    v1 = pmul(v1, b.v);
+    // Multiply the imag a with b
+    v2 = pmul(v2, b.v);
+    // Conjugate v2
+    v2 = Packet1cd(v2).conjugate().v;
+    // Swap real/imag elements in v2.
+    v2 = (Packet2d)__builtin_msa_shf_w((v4i32)v2, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+    // Add and return the result
+    v = padd(v1, v2);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator*(const Packet1cd& b) const { return Packet1cd(*this) *= b; }
+  EIGEN_STRONG_INLINE Packet1cd& operator+=(const Packet1cd& b) {
+    v = padd(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator+(const Packet1cd& b) const { return Packet1cd(*this) += b; }
+  EIGEN_STRONG_INLINE Packet1cd& operator-=(const Packet1cd& b) {
+    v = psub(v, b.v);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator-(const Packet1cd& b) const { return Packet1cd(*this) -= b; }
+  EIGEN_STRONG_INLINE Packet1cd& operator/=(const Packet1cd& b) {
+    *this *= b.conjugate();
+    Packet2d s = pmul<Packet2d>(b.v, b.v);
+    s = padd(s, preverse<Packet2d>(s));
+    v = pdiv(v, s);
+    return *this;
+  }
+  EIGEN_STRONG_INLINE Packet1cd operator/(const Packet1cd& b) const { return Packet1cd(*this) /= b; }
+  EIGEN_STRONG_INLINE Packet1cd operator-(void) const { return Packet1cd(pnegate(v)); }
+
+  Packet2d v;
+};
+
+inline std::ostream& operator<<(std::ostream& os, const Packet1cd& value) {
+  os << "[ (" << value.v[0] << ", " << value.v[1] << "i) ]";
+  return os;
+}
+
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
+  typedef Packet1cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 1,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0
+  };
+};
+
+template <>
+struct unpacket_traits<Packet1cd> {
+  typedef std::complex<double> type;
+  enum {
+    size = 1,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet1cd half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet1cd(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a + b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a - b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  return -a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a.conjugate();
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a * b;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet1cd(pand(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet1cd(por(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet1cd(pxor(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet1cd(pandnot(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_MSA_DEBUG;
+
+  return pset1<Packet1cd>(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_STORE pstore<double>((double*)to, from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu<double>((double*)to, from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+  EIGEN_MSA_DEBUG;
+
+  prefetch(reinterpret_cast<const double*>(addr));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from,
+                                                                            Index stride __attribute__((unused))) {
+  EIGEN_MSA_DEBUG;
+
+  Packet1cd res;
+  res.v[0] = std::real(from[0]);
+  res.v[1] = std::imag(from[0]);
+  return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from,
+                                                                        Index stride __attribute__((unused))) {
+  EIGEN_MSA_DEBUG;
+
+  pstore(to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  return std::complex<double>(a.v[0], a.v[1]);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  return pfirst(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
+  EIGEN_MSA_DEBUG;
+
+  return pfirst(a);
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  EIGEN_MSA_DEBUG;
+
+  return a / b;
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
+  EIGEN_MSA_DEBUG;
+
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet1cd, 2>& value) {
+  os << "[ " << value.packet[0] << ", " << std::endl << "  " << value.packet[1] << " ]";
+  return os;
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d v1, v2;
+
+  v1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
+  // Get the imag values of a
+  v2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[0].v, (v2i64)kernel.packet[1].v);
+
+  kernel.packet[0].v = v1;
+  kernel.packet[1].v = v2;
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_COMPLEX_MSA_H
diff --git a/inst/include/Eigen/src/Core/arch/MSA/MathFunctions.h b/inst/include/Eigen/src/Core/arch/MSA/MathFunctions.h
new file mode 100644
index 00000000..f68d254f
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/MSA/MathFunctions.h
@@ -0,0 +1,379 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2014 Pedro Gonnet (pedro.gonnet@gmail.com)
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// Copyright (C) 2018 Wave Computing, Inc.
+// Written by:
+//   Chris Larsen
+//   Alexey Frunze (afrunze@wavecomp.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+/* The tanh function of this file is an adaptation of
+ * template<typename T> T generic_fast_tanh_float(const T&)
+ * from MathFunctionsImpl.h.
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_MSA_H
+#define EIGEN_MATH_FUNCTIONS_MSA_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f plog<Packet4f>(const Packet4f& _x) {
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292e-2f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310e-1f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740e-1f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846e-1f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787e-1f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665e-1f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765e-1f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993e-1f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174e-1f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+  static EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  static EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+
+  // Convert negative argument into NAN (quiet negative, to be specific).
+  Packet4f zero = (Packet4f)__builtin_msa_ldi_w(0);
+  Packet4i neg_mask = __builtin_msa_fclt_w(_x, zero);
+  Packet4i zero_mask = __builtin_msa_fceq_w(_x, zero);
+  Packet4f non_neg_x_or_nan = padd(_x, (Packet4f)neg_mask);  // Add 0.0 or NAN.
+  Packet4f x = non_neg_x_or_nan;
+
+  // Extract exponent from x = mantissa * 2**exponent, where 1.0 <= mantissa < 2.0.
+  // N.B. the exponent is one less of what frexpf() would return.
+  Packet4i e_int = __builtin_msa_ftint_s_w(__builtin_msa_flog2_w(x));
+  // Multiply x by 2**(-exponent-1) to get 0.5 <= x < 1.0 as from frexpf().
+  x = __builtin_msa_fexp2_w(x, (Packet4i)__builtin_msa_nori_b((v16u8)e_int, 0));
+
+  /*
+     if (x < SQRTHF) {
+       x = x + x - 1.0;
+     } else {
+       e += 1;
+       x = x - 1.0;
+     }
+  */
+  Packet4f xx = padd(x, x);
+  Packet4i ge_mask = __builtin_msa_fcle_w(p4f_cephes_SQRTHF, x);
+  e_int = psub(e_int, ge_mask);
+  x = (Packet4f)__builtin_msa_bsel_v((v16u8)ge_mask, (v16u8)xx, (v16u8)x);
+  x = psub(x, p4f_1);
+  Packet4f e = __builtin_msa_ffint_s_w(e_int);
+
+  Packet4f x2 = pmul(x, x);
+  Packet4f x3 = pmul(x2, x);
+
+  Packet4f y, y1, y2;
+  y = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
+  y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
+  y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
+  y = pmadd(y, x, p4f_cephes_log_p2);
+  y1 = pmadd(y1, x, p4f_cephes_log_p5);
+  y2 = pmadd(y2, x, p4f_cephes_log_p8);
+  y = pmadd(y, x3, y1);
+  y = pmadd(y, x3, y2);
+  y = pmul(y, x3);
+
+  y = pmadd(e, p4f_cephes_log_q1, y);
+  x = __builtin_msa_fmsub_w(x, x2, p4f_half);
+  x = padd(x, y);
+  x = pmadd(e, p4f_cephes_log_q2, x);
+
+  // x is now the logarithm result candidate. We still need to handle the
+  // extreme arguments of zero and positive infinity, though.
+  // N.B. if the argument is +INFINITY, x is NAN because the polynomial terms
+  // contain infinities of both signs (see the coefficients and code above).
+  // INFINITY - INFINITY is NAN.
+
+  // If the argument is +INFINITY, make it the new result candidate.
+  // To achieve that we choose the smaller of the result candidate and the
+  // argument.
+  // This is correct for all finite pairs of values (the logarithm is smaller
+  // than the argument).
+  // This is also correct in the special case when the argument is +INFINITY
+  // and the result candidate is NAN. This is because the fmin.df instruction
+  // prefers non-NANs to NANs.
+  x = __builtin_msa_fmin_w(x, non_neg_x_or_nan);
+
+  // If the argument is zero (including -0.0), the result becomes -INFINITY.
+  Packet4i neg_infs = __builtin_msa_slli_w(zero_mask, 23);
+  x = (Packet4f)__builtin_msa_bsel_v((v16u8)zero_mask, (v16u8)x, (v16u8)neg_infs);
+
+  return x;
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pexp<Packet4f>(const Packet4f& _x) {
+  // Limiting single-precision pexp's argument to [-128, +128] lets pexp
+  // reach 0 and INFINITY naturally.
+  static EIGEN_DECLARE_CONST_Packet4f(exp_lo, -128.0f);
+  static EIGEN_DECLARE_CONST_Packet4f(exp_hi, +128.0f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500e-4f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507e-3f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073e-3f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894e-2f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459e-1f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201e-1f);
+  static EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  static EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+
+  Packet4f x = _x;
+
+  // Clamp x.
+  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(x, p4f_exp_lo), (v16u8)x, (v16u8)p4f_exp_lo);
+  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_exp_hi, x), (v16u8)x, (v16u8)p4f_exp_hi);
+
+  // Round to nearest integer by adding 0.5 (with x's sign) and truncating.
+  Packet4f x2_add = (Packet4f)__builtin_msa_binsli_w((v4u32)p4f_half, (v4u32)x, 0);
+  Packet4f x2 = pmadd(x, p4f_cephes_LOG2EF, x2_add);
+  Packet4i x2_int = __builtin_msa_ftrunc_s_w(x2);
+  Packet4f x2_int_f = __builtin_msa_ffint_s_w(x2_int);
+
+  x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C1);
+  x = __builtin_msa_fmsub_w(x, x2_int_f, p4f_cephes_exp_C2);
+
+  Packet4f z = pmul(x, x);
+
+  Packet4f y = p4f_cephes_exp_p0;
+  y = pmadd(y, x, p4f_cephes_exp_p1);
+  y = pmadd(y, x, p4f_cephes_exp_p2);
+  y = pmadd(y, x, p4f_cephes_exp_p3);
+  y = pmadd(y, x, p4f_cephes_exp_p4);
+  y = pmadd(y, x, p4f_cephes_exp_p5);
+  y = pmadd(y, z, x);
+  y = padd(y, p4f_1);
+
+  // y *= 2**exponent.
+  y = __builtin_msa_fexp2_w(y, x2_int);
+
+  return y;
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f ptanh<Packet4f>(const Packet4f& _x) {
+  static EIGEN_DECLARE_CONST_Packet4f(tanh_tiny, 1e-4f);
+  static EIGEN_DECLARE_CONST_Packet4f(tanh_hi, 9.0f);
+  // The monomial coefficients of the numerator polynomial (odd).
+  static EIGEN_DECLARE_CONST_Packet4f(alpha_1, 4.89352455891786e-3f);
+  static EIGEN_DECLARE_CONST_Packet4f(alpha_3, 6.37261928875436e-4f);
+  static EIGEN_DECLARE_CONST_Packet4f(alpha_5, 1.48572235717979e-5f);
+  static EIGEN_DECLARE_CONST_Packet4f(alpha_7, 5.12229709037114e-8f);
+  static EIGEN_DECLARE_CONST_Packet4f(alpha_9, -8.60467152213735e-11f);
+  static EIGEN_DECLARE_CONST_Packet4f(alpha_11, 2.00018790482477e-13f);
+  static EIGEN_DECLARE_CONST_Packet4f(alpha_13, -2.76076847742355e-16f);
+  // The monomial coefficients of the denominator polynomial (even).
+  static EIGEN_DECLARE_CONST_Packet4f(beta_0, 4.89352518554385e-3f);
+  static EIGEN_DECLARE_CONST_Packet4f(beta_2, 2.26843463243900e-3f);
+  static EIGEN_DECLARE_CONST_Packet4f(beta_4, 1.18534705686654e-4f);
+  static EIGEN_DECLARE_CONST_Packet4f(beta_6, 1.19825839466702e-6f);
+
+  Packet4f x = pabs(_x);
+  Packet4i tiny_mask = __builtin_msa_fclt_w(x, p4f_tanh_tiny);
+
+  // Clamp the inputs to the range [-9, 9] since anything outside
+  // this range is -/+1.0f in single-precision.
+  x = (Packet4f)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_w(p4f_tanh_hi, x), (v16u8)x, (v16u8)p4f_tanh_hi);
+
+  // Since the polynomials are odd/even, we need x**2.
+  Packet4f x2 = pmul(x, x);
+
+  // Evaluate the numerator polynomial p.
+  Packet4f p = pmadd(x2, p4f_alpha_13, p4f_alpha_11);
+  p = pmadd(x2, p, p4f_alpha_9);
+  p = pmadd(x2, p, p4f_alpha_7);
+  p = pmadd(x2, p, p4f_alpha_5);
+  p = pmadd(x2, p, p4f_alpha_3);
+  p = pmadd(x2, p, p4f_alpha_1);
+  p = pmul(x, p);
+
+  // Evaluate the denominator polynomial q.
+  Packet4f q = pmadd(x2, p4f_beta_6, p4f_beta_4);
+  q = pmadd(x2, q, p4f_beta_2);
+  q = pmadd(x2, q, p4f_beta_0);
+
+  // Divide the numerator by the denominator.
+  p = pdiv(p, q);
+
+  // Reinstate the sign.
+  p = (Packet4f)__builtin_msa_binsli_w((v4u32)p, (v4u32)_x, 0);
+
+  // When the argument is very small in magnitude it's more accurate to just return it.
+  p = (Packet4f)__builtin_msa_bsel_v((v16u8)tiny_mask, (v16u8)p, (v16u8)_x);
+
+  return p;
+}
+
+template <bool sine>
+Packet4f psincos_inner_msa_float(const Packet4f& _x) {
+  static EIGEN_DECLARE_CONST_Packet4f(sincos_max_arg, 13176795.0f);  // Approx. (2**24) / (4/Pi).
+  static EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1, -0.78515625f);
+  static EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
+  static EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
+  static EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891e-4f);
+  static EIGEN_DECLARE_CONST_Packet4f(sincof_p1, 8.3321608736e-3f);
+  static EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611e-1f);
+  static EIGEN_DECLARE_CONST_Packet4f(coscof_p0, 2.443315711809948e-5f);
+  static EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765e-3f);
+  static EIGEN_DECLARE_CONST_Packet4f(coscof_p2, 4.166664568298827e-2f);
+  static EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f);  // 4/Pi.
+  static EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+  static EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+
+  Packet4f x = pabs(_x);
+
+  // Translate infinite arguments into NANs.
+  Packet4f zero_or_nan_if_inf = psub(_x, _x);
+  x = padd(x, zero_or_nan_if_inf);
+  // Prevent sin/cos from generating values larger than 1.0 in magnitude
+  // for very large arguments by setting x to 0.0.
+  Packet4i small_or_nan_mask = __builtin_msa_fcult_w(x, p4f_sincos_max_arg);
+  x = pand(x, (Packet4f)small_or_nan_mask);
+
+  // Scale x by 4/Pi to find x's octant.
+  Packet4f y = pmul(x, p4f_cephes_FOPI);
+  // Get the octant. We'll reduce x by this number of octants or by one more than it.
+  Packet4i y_int = __builtin_msa_ftrunc_s_w(y);
+  // x's from even-numbered octants will translate to octant 0: [0, +Pi/4].
+  // x's from odd-numbered octants will translate to octant -1: [-Pi/4, 0].
+  // Adjustment for odd-numbered octants: octant = (octant + 1) & (~1).
+  Packet4i y_int1 = __builtin_msa_addvi_w(y_int, 1);
+  Packet4i y_int2 = (Packet4i)__builtin_msa_bclri_w((Packet4ui)y_int1, 0);  // bclri = bit-clear
+  y = __builtin_msa_ffint_s_w(y_int2);
+
+  // Compute the sign to apply to the polynomial.
+  Packet4i sign_mask = sine ? pxor(__builtin_msa_slli_w(y_int1, 29), (Packet4i)_x)
+                            : __builtin_msa_slli_w(__builtin_msa_addvi_w(y_int, 3), 29);
+
+  // Get the polynomial selection mask.
+  // We'll calculate both (sin and cos) polynomials and then select from the two.
+  Packet4i poly_mask = __builtin_msa_ceqi_w(__builtin_msa_slli_w(y_int2, 30), 0);
+
+  // Reduce x by y octants to get: -Pi/4 <= x <= +Pi/4.
+  // The magic pass: "Extended precision modular arithmetic"
+  // x = ((x - y * DP1) - y * DP2) - y * DP3
+  Packet4f tmp1 = pmul(y, p4f_minus_cephes_DP1);
+  Packet4f tmp2 = pmul(y, p4f_minus_cephes_DP2);
+  Packet4f tmp3 = pmul(y, p4f_minus_cephes_DP3);
+  x = padd(x, tmp1);
+  x = padd(x, tmp2);
+  x = padd(x, tmp3);
+
+  // Evaluate the cos(x) polynomial.
+  y = p4f_coscof_p0;
+  Packet4f z = pmul(x, x);
+  y = pmadd(y, z, p4f_coscof_p1);
+  y = pmadd(y, z, p4f_coscof_p2);
+  y = pmul(y, z);
+  y = pmul(y, z);
+  y = __builtin_msa_fmsub_w(y, z, p4f_half);
+  y = padd(y, p4f_1);
+
+  // Evaluate the sin(x) polynomial.
+  Packet4f y2 = p4f_sincof_p0;
+  y2 = pmadd(y2, z, p4f_sincof_p1);
+  y2 = pmadd(y2, z, p4f_sincof_p2);
+  y2 = pmul(y2, z);
+  y2 = pmadd(y2, x, x);
+
+  // Select the correct result from the two polynomials.
+  y = sine ? (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y, (v16u8)y2)
+           : (Packet4f)__builtin_msa_bsel_v((v16u8)poly_mask, (v16u8)y2, (v16u8)y);
+
+  // Update the sign.
+  sign_mask = pxor(sign_mask, (Packet4i)y);
+  y = (Packet4f)__builtin_msa_binsli_w((v4u32)y, (v4u32)sign_mask, 0);  // binsli = bit-insert-left
+  return y;
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psin<Packet4f>(const Packet4f& x) {
+  return psincos_inner_msa_float</* sine */ true>(x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pcos<Packet4f>(const Packet4f& x) {
+  return psincos_inner_msa_float</* sine */ false>(x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d pexp<Packet2d>(const Packet2d& _x) {
+  // Limiting double-precision pexp's argument to [-1024, +1024] lets pexp
+  // reach 0 and INFINITY naturally.
+  static EIGEN_DECLARE_CONST_Packet2d(exp_lo, -1024.0);
+  static EIGEN_DECLARE_CONST_Packet2d(exp_hi, +1024.0);
+  static EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+  static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+  static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+  static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+  static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+  static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+  static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+  static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+  static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+  static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+  static EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+  static EIGEN_DECLARE_CONST_Packet2d(1, 1.0);
+  static EIGEN_DECLARE_CONST_Packet2d(2, 2.0);
+
+  Packet2d x = _x;
+
+  // Clamp x.
+  x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(x, p2d_exp_lo), (v16u8)x, (v16u8)p2d_exp_lo);
+  x = (Packet2d)__builtin_msa_bsel_v((v16u8)__builtin_msa_fclt_d(p2d_exp_hi, x), (v16u8)x, (v16u8)p2d_exp_hi);
+
+  // Round to nearest integer by adding 0.5 (with x's sign) and truncating.
+  Packet2d x2_add = (Packet2d)__builtin_msa_binsli_d((v2u64)p2d_half, (v2u64)x, 0);
+  Packet2d x2 = pmadd(x, p2d_cephes_LOG2EF, x2_add);
+  Packet2l x2_long = __builtin_msa_ftrunc_s_d(x2);
+  Packet2d x2_long_d = __builtin_msa_ffint_s_d(x2_long);
+
+  x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C1);
+  x = __builtin_msa_fmsub_d(x, x2_long_d, p2d_cephes_exp_C2);
+
+  x2 = pmul(x, x);
+
+  Packet2d px = p2d_cephes_exp_p0;
+  px = pmadd(px, x2, p2d_cephes_exp_p1);
+  px = pmadd(px, x2, p2d_cephes_exp_p2);
+  px = pmul(px, x);
+
+  Packet2d qx = p2d_cephes_exp_q0;
+  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
+
+  x = pdiv(px, psub(qx, px));
+  x = pmadd(p2d_2, x, p2d_1);
+
+  // x *= 2**exponent.
+  x = __builtin_msa_fexp2_d(x, x2_long);
+
+  return x;
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_MSA_H
diff --git a/inst/include/Eigen/src/Core/arch/MSA/PacketMath.h b/inst/include/Eigen/src/Core/arch/MSA/PacketMath.h
new file mode 100644
index 00000000..81da24f8
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/MSA/PacketMath.h
@@ -0,0 +1,1237 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Wave Computing, Inc.
+// Written by:
+//   Chris Larsen
+//   Alexey Frunze (afrunze@wavecomp.com)
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_MSA_H
+#define EIGEN_PACKET_MATH_MSA_H
+
+#include <iostream>
+#include <string>
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#endif
+
+#if 0
+#define EIGEN_MSA_DEBUG                                                             \
+  static bool firstTime = true;                                                     \
+  do {                                                                              \
+    if (firstTime) {                                                                \
+      std::cout << __FILE__ << ':' << __LINE__ << ':' << __FUNCTION__ << std::endl; \
+      firstTime = false;                                                            \
+    }                                                                               \
+  } while (0)
+#else
+#define EIGEN_MSA_DEBUG
+#endif
+
+#define EIGEN_MSA_SHF_I8(a, b, c, d) (((d) << 6) | ((c) << 4) | ((b) << 2) | (a))
+
+typedef v4f32 Packet4f;
+typedef v4i32 Packet4i;
+typedef v4u32 Packet4ui;
+
+#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = {X, X, X, X}
+#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = {X, X, X, X}
+#define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = {X, X, X, X}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet4f& value) {
+  os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet4i& value) {
+  os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet4ui& value) {
+  os << "[ " << value[0] << ", " << value[1] << ", " << value[2] << ", " << value[3] << " ]";
+  return os;
+}
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet4f type;
+  typedef Packet4f half;  // Packet2f intrinsics not implemented yet
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    // FIXME check the Has*
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<int32_t> : default_packet_traits {
+  typedef Packet4i type;
+  typedef Packet4i half;  // Packet2i intrinsics not implemented yet
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    // FIXME check the Has*
+    HasDiv = 1,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct unpacket_traits<Packet4f> {
+  typedef float type;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet4f half;
+};
+
+template <>
+struct unpacket_traits<Packet4i> {
+  typedef int32_t type;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet4i half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4f v = {from, from, from, from};
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fill_w(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
+  EIGEN_MSA_DEBUG;
+
+  float f = *from;
+  Packet4f v = {f, f, f, f};
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pload1<Packet4i>(const int32_t* from) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fill_w(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fadd_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_addv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+  EIGEN_MSA_DEBUG;
+
+  static const Packet4f countdown = {0.0f, 1.0f, 2.0f, 3.0f};
+  return padd(pset1<Packet4f>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
+  EIGEN_MSA_DEBUG;
+
+  static const Packet4i countdown = {0, 1, 2, 3};
+  return padd(pset1<Packet4i>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fsub_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_subv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4f)__builtin_msa_bnegi_w((v4u32)a, 31);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_addvi_w((v4i32)__builtin_msa_nori_b((v16u8)a, 0), 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fmul_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_mulv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fdiv_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_div_s_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fmadd_w(c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+  EIGEN_MSA_DEBUG;
+
+  // Use "asm" construct to avoid __builtin_msa_maddv_w GNU C bug.
+  Packet4i value = c;
+  __asm__("maddv.w %w[value], %w[a], %w[b]\n"
+          // Outputs
+          : [value] "+f"(value)
+          // Inputs
+          : [a] "f"(a), [b] "f"(b));
+  return value;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4f)__builtin_msa_and_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4i)__builtin_msa_and_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4f)__builtin_msa_or_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4i)__builtin_msa_or_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4f)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4i)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+  return pand(a, (Packet4f)__builtin_msa_xori_b((v16u8)b, 255));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return pand(a, (Packet4i)__builtin_msa_xori_b((v16u8)b, 255));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // This prefers numbers to NaNs.
+  return __builtin_msa_fmin_w(a, b);
+#else
+  // This prefers NaNs to numbers.
+  Packet4i aNaN = __builtin_msa_fcun_w(a, a);
+  Packet4i aMinOrNaN = por(__builtin_msa_fclt_w(a, b), aNaN);
+  return (Packet4f)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_min_s_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // This prefers numbers to NaNs.
+  return __builtin_msa_fmax_w(a, b);
+#else
+  // This prefers NaNs to numbers.
+  Packet4i aNaN = __builtin_msa_fcun_w(a, a);
+  Packet4i aMaxOrNaN = por(__builtin_msa_fclt_w(b, a), aNaN);
+  return (Packet4f)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_max_s_w(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_LOAD return __builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4f)__builtin_msa_ld_w(const_cast<float*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet4i)__builtin_msa_ld_w(const_cast<int32_t*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
+  EIGEN_MSA_DEBUG;
+
+  float f0 = from[0], f1 = from[1];
+  Packet4f v0 = {f0, f0, f0, f0};
+  Packet4f v1 = {f1, f1, f1, f1};
+  return (Packet4f)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
+  EIGEN_MSA_DEBUG;
+
+  int32_t i0 = from[0], i1 = from[1];
+  Packet4i v0 = {i0, i0, i0, i0};
+  Packet4i v1 = {i1, i1, i1, i1};
+  return (Packet4i)__builtin_msa_ilvr_d((v2i64)v1, (v2i64)v0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_w(from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w((Packet4i)from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_w(from, to, 0);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  float f = *from;
+  Packet4f v = {f, f, f, f};
+  v[1] = from[stride];
+  v[2] = from[2 * stride];
+  v[3] = from[3 * stride];
+  return v;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  int32_t i = *from;
+  Packet4i v = {i, i, i, i};
+  v[1] = from[stride];
+  v[2] = from[2 * stride];
+  v[3] = from[3 * stride];
+  return v;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  *to = from[0];
+  to += stride;
+  *to = from[1];
+  to += stride;
+  *to = from[2];
+  to += stride;
+  *to = from[3];
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from, Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  *to = from[0];
+  to += stride;
+  *to = from[1];
+  to += stride;
+  *to = from[2];
+  to += stride;
+  *to = from[3];
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+  EIGEN_MSA_DEBUG;
+
+  __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
+  EIGEN_MSA_DEBUG;
+
+  __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(3, 2, 1, 0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet4f)__builtin_msa_bclri_w((v4u32)a, 31);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4i zero = __builtin_msa_ldi_w(0);
+  return __builtin_msa_add_a_w(zero, a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4f s = padd(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+  s = padd(s, (Packet4f)__builtin_msa_shf_w((v4i32)s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  return s[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4i s = padd(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+  s = padd(s, __builtin_msa_shf_w(s, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  return s[0];
+}
+
+// Other reduction functions:
+// mul
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4f p = pmul(a, (Packet4f)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+  p = pmul(p, (Packet4f)__builtin_msa_shf_w((v4i32)p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  return p[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4i p = pmul(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+  p = pmul(p, __builtin_msa_shf_w(p, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  return p[0];
+}
+
+// min
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Swap 64-bit halves of a.
+  Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+#if !EIGEN_FAST_MATH
+  // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
+  // masks of all zeroes/ones in low 64 bits.
+  v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
+  // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
+  unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
+#endif
+  // Continue with min computation.
+  Packet4f v = __builtin_msa_fmin_w(a, swapped);
+  v = __builtin_msa_fmin_w(v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+#if !EIGEN_FAST_MATH
+  // Based on the mask select between v and 4 qNaNs.
+  v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
+  v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
+#endif
+  return v[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4i m = pmin(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+  m = pmin(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  return m[0];
+}
+
+// max
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  // Swap 64-bit halves of a.
+  Packet4f swapped = (Packet4f)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+#if !EIGEN_FAST_MATH
+  // Detect presence of NaNs from pairs a[0]-a[2] and a[1]-a[3] as two 32-bit
+  // masks of all zeroes/ones in low 64 bits.
+  v16u8 unord = (v16u8)__builtin_msa_fcun_w(a, swapped);
+  // Combine the two masks into one: 64 ones if no NaNs, otherwise 64 zeroes.
+  unord = (v16u8)__builtin_msa_ceqi_d((v2i64)unord, 0);
+#endif
+  // Continue with max computation.
+  Packet4f v = __builtin_msa_fmax_w(a, swapped);
+  v = __builtin_msa_fmax_w(v, (Packet4f)__builtin_msa_shf_w((Packet4i)v, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+#if !EIGEN_FAST_MATH
+  // Based on the mask select between v and 4 qNaNs.
+  v16u8 qnans = (v16u8)__builtin_msa_fill_w(0x7FC00000);
+  v = (Packet4f)__builtin_msa_bsel_v(unord, qnans, (v16u8)v);
+#endif
+  return v[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet4i m = pmax(a, __builtin_msa_shf_w(a, EIGEN_MSA_SHF_I8(2, 3, 0, 1)));
+  m = pmax(m, __builtin_msa_shf_w(m, EIGEN_MSA_SHF_I8(1, 0, 3, 2)));
+  return m[0];
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4f, 4>& value) {
+  os << "[ " << value.packet[0] << "," << std::endl
+     << "  " << value.packet[1] << "," << std::endl
+     << "  " << value.packet[2] << "," << std::endl
+     << "  " << value.packet[3] << " ]";
+  return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  v4i32 tmp1, tmp2, tmp3, tmp4;
+
+  tmp1 = __builtin_msa_ilvr_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
+  tmp2 = __builtin_msa_ilvr_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
+  tmp3 = __builtin_msa_ilvl_w((v4i32)kernel.packet[1], (v4i32)kernel.packet[0]);
+  tmp4 = __builtin_msa_ilvl_w((v4i32)kernel.packet[3], (v4i32)kernel.packet[2]);
+
+  kernel.packet[0] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
+  kernel.packet[1] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
+  kernel.packet[2] = (Packet4f)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
+  kernel.packet[3] = (Packet4f)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet4i, 4>& value) {
+  os << "[ " << value.packet[0] << "," << std::endl
+     << "  " << value.packet[1] << "," << std::endl
+     << "  " << value.packet[2] << "," << std::endl
+     << "  " << value.packet[3] << " ]";
+  return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  v4i32 tmp1, tmp2, tmp3, tmp4;
+
+  tmp1 = __builtin_msa_ilvr_w(kernel.packet[1], kernel.packet[0]);
+  tmp2 = __builtin_msa_ilvr_w(kernel.packet[3], kernel.packet[2]);
+  tmp3 = __builtin_msa_ilvl_w(kernel.packet[1], kernel.packet[0]);
+  tmp4 = __builtin_msa_ilvl_w(kernel.packet[3], kernel.packet[2]);
+
+  kernel.packet[0] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp2, (v2i64)tmp1);
+  kernel.packet[1] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp2, (v2i64)tmp1);
+  kernel.packet[2] = (Packet4i)__builtin_msa_ilvr_d((v2i64)tmp4, (v2i64)tmp3);
+  kernel.packet[3] = (Packet4i)__builtin_msa_ilvod_d((v2i64)tmp4, (v2i64)tmp3);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fsqrt_w(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  return __builtin_msa_frsqrt_w(a);
+#else
+  Packet4f ones = __builtin_msa_ffint_s_w(__builtin_msa_ldi_w(1));
+  return pdiv(ones, psqrt(a));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
+  Packet4f v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"  // 3 = round towards -INFINITY.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.w %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
+  Packet4f v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"
+      "xori    %[new_mode], %[new_mode], 1\n"  // 2 = round towards +INFINITY.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.w %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
+  Packet4f v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"
+      "xori    %[new_mode], %[new_mode], 3\n"  // 0 = round to nearest, ties to even.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.w %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
+                                    const Packet4f& elsePacket) {
+  Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
+  Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
+  return (Packet4f)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
+                                    const Packet4i& elsePacket) {
+  Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
+  Packet4i mask = __builtin_msa_ceqi_w((Packet4i)select, 0);
+  return (Packet4i)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
+}
+
+//---------- double ----------
+
+typedef v2f64 Packet2d;
+typedef v2i64 Packet2l;
+typedef v2u64 Packet2ul;
+
+#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = {X, X}
+#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) const Packet2l p2l_##NAME = {X, X}
+#define EIGEN_DECLARE_CONST_Packet2ul(NAME, X) const Packet2ul p2ul_##NAME = {X, X}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2d& value) {
+  os << "[ " << value[0] << ", " << value[1] << " ]";
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2l& value) {
+  os << "[ " << value[0] << ", " << value[1] << " ]";
+  return os;
+}
+
+inline std::ostream& operator<<(std::ostream& os, const Packet2ul& value) {
+  os << "[ " << value[0] << ", " << value[1] << " ]";
+  return os;
+}
+
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+    // FIXME check the Has*
+    HasDiv = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct unpacket_traits<Packet2d> {
+  typedef double type;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet2d half;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d value = {from, from};
+  return value;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fadd_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+  EIGEN_MSA_DEBUG;
+
+  static const Packet2d countdown = {0.0, 1.0};
+  return padd(pset1<Packet2d>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fsub_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet2d)__builtin_msa_bnegi_d((v2u64)a, 63);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fmul_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fdiv_d(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fmadd_d(c, a, b);
+}
+
+// Logical Operations are not supported for float, so we have to reinterpret casts using MSA
+// intrinsics
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet2d)__builtin_msa_and_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet2d)__builtin_msa_or_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet2d)__builtin_msa_xor_v((v16u8)a, (v16u8)b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+  return pand(a, (Packet2d)__builtin_msa_xori_b((v16u8)b, 255));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // This prefers numbers to NaNs.
+  return __builtin_msa_fmin_d(a, b);
+#else
+  // This prefers NaNs to numbers.
+  v2i64 aNaN = __builtin_msa_fcun_d(a, a);
+  v2i64 aMinOrNaN = por(__builtin_msa_fclt_d(a, b), aNaN);
+  return (Packet2d)__builtin_msa_bsel_v((v16u8)aMinOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  // This prefers numbers to NaNs.
+  return __builtin_msa_fmax_d(a, b);
+#else
+  // This prefers NaNs to numbers.
+  v2i64 aNaN = __builtin_msa_fcun_d(a, a);
+  v2i64 aMaxOrNaN = por(__builtin_msa_fclt_d(b, a), aNaN);
+  return (Packet2d)__builtin_msa_bsel_v((v16u8)aMaxOrNaN, (v16u8)b, (v16u8)a);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_LOAD return (Packet2d)__builtin_msa_ld_d(const_cast<double*>(from), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d value = {*from, *from};
+  return value;
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_ALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
+  EIGEN_MSA_DEBUG;
+
+  EIGEN_DEBUG_UNALIGNED_STORE __builtin_msa_st_d((v2i64)from, to, 0);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d value;
+  value[0] = *from;
+  from += stride;
+  value[1] = *from;
+  return value;
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
+  EIGEN_MSA_DEBUG;
+
+  *to = from[0];
+  to += stride;
+  *to = from[1];
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+  EIGEN_MSA_DEBUG;
+
+  __builtin_prefetch(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  return a[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet2d)__builtin_msa_shf_w((v4i32)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  return (Packet2d)__builtin_msa_bclri_d((v2u64)a, 63);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d s = padd(a, preverse(a));
+  return s[0];
+}
+
+// Other reduction functions:
+// mul
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d p = pmul(a, preverse(a));
+  return p[0];
+}
+
+// min
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+  Packet2d v = __builtin_msa_fmin_d(a, swapped);
+  return v[0];
+#else
+  double a0 = a[0], a1 = a[1];
+  return ((numext::isnan)(a0) || a0 < a1) ? a0 : a1;
+#endif
+}
+
+// max
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  Packet2d swapped = (Packet2d)__builtin_msa_shf_w((Packet4i)a, EIGEN_MSA_SHF_I8(2, 3, 0, 1));
+  Packet2d v = __builtin_msa_fmax_d(a, swapped);
+  return v[0];
+#else
+  double a0 = a[0], a1 = a[1];
+  return ((numext::isnan)(a0) || a0 > a1) ? a0 : a1;
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+  return __builtin_msa_fsqrt_d(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
+  EIGEN_MSA_DEBUG;
+
+#if EIGEN_FAST_MATH
+  return __builtin_msa_frsqrt_d(a);
+#else
+  Packet2d ones = __builtin_msa_ffint_s_d(__builtin_msa_ldi_d(1));
+  return pdiv(ones, psqrt(a));
+#endif
+}
+
+inline std::ostream& operator<<(std::ostream& os, const PacketBlock<Packet2d, 2>& value) {
+  os << "[ " << value.packet[0] << "," << std::endl << "  " << value.packet[1] << " ]";
+  return os;
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
+  EIGEN_MSA_DEBUG;
+
+  Packet2d trn1 = (Packet2d)__builtin_msa_ilvev_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
+  Packet2d trn2 = (Packet2d)__builtin_msa_ilvod_d((v2i64)kernel.packet[1], (v2i64)kernel.packet[0]);
+  kernel.packet[0] = trn1;
+  kernel.packet[1] = trn2;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
+  Packet2d v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"  // 3 = round towards -INFINITY.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.d %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
+  Packet2d v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"
+      "xori    %[new_mode], %[new_mode], 1\n"  // 2 = round towards +INFINITY.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.d %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
+  Packet2d v = a;
+  int32_t old_mode, new_mode;
+  asm volatile(
+      "cfcmsa  %[old_mode], $1\n"
+      "ori     %[new_mode], %[old_mode], 3\n"
+      "xori    %[new_mode], %[new_mode], 3\n"  // 0 = round to nearest, ties to even.
+      "ctcmsa  $1, %[new_mode]\n"
+      "frint.d %w[v], %w[v]\n"
+      "ctcmsa  $1, %[old_mode]\n"
+      :  // outputs
+      [old_mode] "=r"(old_mode), [new_mode] "=r"(new_mode),
+      [v] "+f"(v)
+      :  // inputs
+      :  // clobbers
+  );
+  return v;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
+                                    const Packet2d& elsePacket) {
+  Packet2ul select = {ifPacket.select[0], ifPacket.select[1]};
+  Packet2l mask = __builtin_msa_ceqi_d((Packet2l)select, 0);
+  return (Packet2d)__builtin_msa_bsel_v((v16u8)mask, (v16u8)thenPacket, (v16u8)elsePacket);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_MSA_H
diff --git a/inst/include/Eigen/src/Core/arch/NEON/Complex.h b/inst/include/Eigen/src/Core/arch/NEON/Complex.h
index 8d9255ee..b8655c80 100644
--- a/inst/include/Eigen/src/Core/arch/NEON/Complex.h
+++ b/inst/include/Eigen/src/Core/arch/NEON/Complex.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,165 +11,413 @@
 #ifndef EIGEN_COMPLEX_NEON_H
 #define EIGEN_COMPLEX_NEON_H
 
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
 
-static uint32x4_t p4ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET4(0x00000000, 0x80000000, 0x00000000, 0x80000000);
-static uint32x2_t p2ui_CONJ_XOR = EIGEN_INIT_NEON_PACKET2(0x00000000, 0x80000000);
+inline uint32x4_t p4ui_CONJ_XOR() {
+// See bug 1325, clang fails to call vld1q_u64.
+#if EIGEN_COMP_CLANG || EIGEN_COMP_CASTXML
+  uint32x4_t ret = {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+  return ret;
+#else
+  static const uint32_t conj_XOR_DATA[] = {0x00000000, 0x80000000, 0x00000000, 0x80000000};
+  return vld1q_u32(conj_XOR_DATA);
+#endif
+}
+
+inline uint32x2_t p2ui_CONJ_XOR() {
+  static const uint32_t conj_XOR_DATA[] = {0x00000000, 0x80000000};
+  return vld1_u32(conj_XOR_DATA);
+}
 
 //---------- float ----------
-struct Packet2cf
-{
+
+struct Packet1cf {
+  EIGEN_STRONG_INLINE Packet1cf() {}
+  EIGEN_STRONG_INLINE explicit Packet1cf(const Packet2f& a) : v(a) {}
+  Packet2f v;
+};
+struct Packet2cf {
   EIGEN_STRONG_INLINE Packet2cf() {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
-  Packet4f  v;
+  Packet4f v;
 };
 
-template<> struct packet_traits<std::complex<float> >  : default_packet_traits
-{
+template <>
+struct packet_traits<std::complex<float>> : default_packet_traits {
   typedef Packet2cf type;
+  typedef Packet1cf half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
 
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
     HasNegate = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
+    HasSqrt = 1,
+    HasLog = 1,
+    HasExp = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
     HasSetLinear = 0
   };
 };
 
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
+template <>
+struct unpacket_traits<Packet1cf> : neon_unpacket_default<Packet1cf, std::complex<float>> {
+  using as_real = Packet2f;
+};
+template <>
+struct unpacket_traits<Packet2cf> : neon_unpacket_default<Packet2cf, std::complex<float>> {
+  using half = Packet1cf;
+  using as_real = Packet4f;
+};
 
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
-{
-  float32x2_t r64;
-  r64 = vld1_f32((float *)&from);
+template <>
+EIGEN_STRONG_INLINE Packet1cf pcast<float, Packet1cf>(const float& a) {
+  return Packet1cf(vset_lane_f32(a, vdup_n_f32(0.f), 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcast<Packet2f, Packet2cf>(const Packet2f& a) {
+  return Packet2cf(vreinterpretq_f32_u64(vmovl_u32(vreinterpret_u32_f32(a))));
+}
 
+template <>
+EIGEN_STRONG_INLINE Packet1cf pzero(const Packet1cf& /*a*/) {
+  return Packet1cf(vdup_n_f32(0.0f));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pzero(const Packet2cf& /*a*/) {
+  return Packet2cf(vdupq_n_f32(0.0f));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf pset1<Packet1cf>(const std::complex<float>& from) {
+  return Packet1cf(vld1_f32(reinterpret_cast<const float*>(&from)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
+  const float32x2_t r64 = vld1_f32(reinterpret_cast<const float*>(&from));
   return Packet2cf(vcombine_f32(r64, r64));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(padd<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(psub<Packet4f>(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) { return Packet2cf(pnegate<Packet4f>(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
-{
-  Packet4ui b = vreinterpretq_u32_f32(a.v);
-  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR)));
+template <>
+EIGEN_STRONG_INLINE Packet1cf padd<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+  return Packet1cf(padd<Packet2f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(padd<Packet4f>(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf psub<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+  return Packet1cf(psub<Packet2f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(psub<Packet4f>(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf pnegate(const Packet1cf& a) {
+  return Packet1cf(pnegate<Packet2f>(a.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+  return Packet2cf(pnegate<Packet4f>(a.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf pconj(const Packet1cf& a) {
+  const Packet2ui b = Packet2ui(vreinterpret_u32_f32(a.v));
+  return Packet1cf(vreinterpret_f32_u32(veor_u32(b, p2ui_CONJ_XOR())));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
+  const Packet4ui b = Packet4ui(vreinterpretq_u32_f32(a.v));
+  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(b, p4ui_CONJ_XOR())));
+}
+
+#ifdef __ARM_FEATURE_COMPLEX
+template <>
+EIGEN_STRONG_INLINE Packet1cf pmadd<Packet1cf>(const Packet1cf& a, const Packet1cf& b, const Packet1cf& c) {
+  Packet1cf result;
+  result.v = vcmla_f32(c.v, a.v, b.v);
+  result.v = vcmla_rot90_f32(result.v, a.v, b.v);
+  return result;
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
+template <>
+EIGEN_STRONG_INLINE Packet1cf pmul<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+  return pmadd(a, b, pzero(a));
+}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet1cf pmul<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+  Packet2f v1, v2;
+
+  // Get the real values of a | a1_re | a1_re |
+  v1 = vdup_lane_f32(a.v, 0);
+  // Get the imag values of a | a1_im | a1_im |
+  v2 = vdup_lane_f32(a.v, 1);
+  // Multiply the real a with b
+  v1 = vmul_f32(v1, b.v);
+  // Multiply the imag a with b
+  v2 = vmul_f32(v2, b.v);
+  // Conjugate v2
+  v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR()));
+  // Swap real/imag elements in v2.
+  v2 = vrev64_f32(v2);
+  // Add and return the result
+  return Packet1cf(vadd_f32(v1, v2));
+}
+#endif
+
+#ifdef __ARM_FEATURE_COMPLEX
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmadd<Packet2cf>(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) {
+  Packet2cf result;
+  result.v = vcmlaq_f32(c.v, a.v, b.v);
+  result.v = vcmlaq_rot90_f32(result.v, a.v, b.v);
+  return result;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return pmadd(a, b, pzero(a));
+}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
   Packet4f v1, v2;
 
   // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
   v1 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 0), vdup_lane_f32(vget_high_f32(a.v), 0));
-  // Get the real values of a | a1_im | a1_im | a2_im | a2_im |
+  // Get the imag values of a | a1_im | a1_im | a2_im | a2_im |
   v2 = vcombine_f32(vdup_lane_f32(vget_low_f32(a.v), 1), vdup_lane_f32(vget_high_f32(a.v), 1));
   // Multiply the real a with b
   v1 = vmulq_f32(v1, b.v);
   // Multiply the imag a with b
   v2 = vmulq_f32(v2, b.v);
-  // Conjugate v2 
-  v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR));
+  // Conjugate v2
+  v2 = vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(v2), p4ui_CONJ_XOR()));
   // Swap real/imag elements in v2.
   v2 = vrev64q_f32(v2);
   // Add and return the result
   return Packet2cf(vaddq_f32(v1, v2));
 }
-
-template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf pcmp_eq(const Packet1cf& a, const Packet1cf& b) {
+  // Compare real and imaginary parts of a and b to get the mask vector:
+  // [re(a[0])==re(b[0]), im(a[0])==im(b[0])]
+  Packet2f eq = pcmp_eq<Packet2f>(a.v, b.v);
+  // Swap real/imag elements in the mask in to get:
+  // [im(a[0])==im(b[0]), re(a[0])==re(b[0])]
+  Packet2f eq_swapped = vrev64_f32(eq);
+  // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped
+  return Packet1cf(pand<Packet2f>(eq, eq_swapped));
 }
-template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+  // Compare real and imaginary parts of a and b to get the mask vector:
+  // [re(a[0])==re(b[0]), im(a[0])==im(b[0]), re(a[1])==re(b[1]), im(a[1])==im(b[1])]
+  Packet4f eq = pcmp_eq<Packet4f>(a.v, b.v);
+  // Swap real/imag elements in the mask in to get:
+  // [im(a[0])==im(b[0]), re(a[0])==re(b[0]), im(a[1])==im(b[1]), re(a[1])==re(b[1])]
+  Packet4f eq_swapped = vrev64q_f32(eq);
+  // Return re(a)==re(b) && im(a)==im(b) by computing bitwise AND of eq and eq_swapped
+  return Packet2cf(pand<Packet4f>(eq, eq_swapped));
 }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf pand<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+  return Packet1cf(vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v))));
 }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v),vreinterpretq_u32_f32(b.v))));
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v))));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from)); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cf por<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+  return Packet1cf(vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v))));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
+template <>
+EIGEN_STRONG_INLINE Packet1cf pxor<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+  return Packet1cf(vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v))));
+}
 
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v); }
+template <>
+EIGEN_STRONG_INLINE Packet1cf pandnot<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+  return Packet1cf(vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a.v), vreinterpret_u32_f32(b.v))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a.v), vreinterpretq_u32_f32(b.v))));
+}
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { EIGEN_ARM_PREFETCH((float *)addr); }
+template <>
+EIGEN_STRONG_INLINE Packet1cf pload<Packet1cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cf(
+      pload<Packet2f>(assume_aligned<unpacket_traits<Packet1cf>::alignment>(reinterpret_cast<const float*>(from))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(
+      pload<Packet4f>(assume_aligned<unpacket_traits<Packet2cf>::alignment>(reinterpret_cast<const float*>(from))));
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
-{
-  std::complex<float> EIGEN_ALIGN16 x[2];
-  vst1q_f32((float *)x, a.v);
-  return x[0];
+template <>
+EIGEN_STRONG_INLINE Packet1cf ploadu<Packet1cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cf(ploadu<Packet2f>((const float*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(reinterpret_cast<const float*>(from)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a)
-{
-  float32x2_t a_lo, a_hi;
-  Packet4f a_r128;
+template <>
+EIGEN_STRONG_INLINE Packet1cf ploaddup<Packet1cf>(const std::complex<float>* from) {
+  return pset1<Packet1cf>(*from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
+  return pset1<Packet2cf>(*from);
+}
 
-  a_lo = vget_low_f32(a.v);
-  a_hi = vget_high_f32(a.v);
-  a_r128 = vcombine_f32(a_hi, a_lo);
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float>>(std::complex<float>* to, const Packet1cf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore(assume_aligned<unpacket_traits<Packet1cf>::alignment>(reinterpret_cast<float*>(to)),
+                                   from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float>>(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore(assume_aligned<unpacket_traits<Packet2cf>::alignment>(reinterpret_cast<float*>(to)),
+                                   from.v);
+}
 
-  return Packet2cf(a_r128);
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float>>(std::complex<float>* to, const Packet1cf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float>>(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<float*>(to), from.v);
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a)
-{
-  return Packet2cf(vrev64q_f32(a.v));
+template <>
+EIGEN_DEVICE_FUNC inline Packet1cf pgather<std::complex<float>, Packet1cf>(const std::complex<float>* from,
+                                                                           Index stride) {
+  const Packet2f tmp = vdup_n_f32(std::real(from[0 * stride]));
+  return Packet1cf(vset_lane_f32(std::imag(from[0 * stride]), tmp, 1));
+}
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from,
+                                                                           Index stride) {
+  Packet4f res = vdupq_n_f32(std::real(from[0 * stride]));
+  res = vsetq_lane_f32(std::imag(from[0 * stride]), res, 1);
+  res = vsetq_lane_f32(std::real(from[1 * stride]), res, 2);
+  res = vsetq_lane_f32(std::imag(from[1 * stride]), res, 3);
+  return Packet2cf(res);
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
-{
-  float32x2_t a1, a2;
-  std::complex<float> s;
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet1cf>(std::complex<float>* to, const Packet1cf& from,
+                                                                       Index stride) {
+  to[stride * 0] = std::complex<float>(vget_lane_f32(from.v, 0), vget_lane_f32(from.v, 1));
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from,
+                                                                       Index stride) {
+  to[stride * 0] = std::complex<float>(vgetq_lane_f32(from.v, 0), vgetq_lane_f32(from.v, 1));
+  to[stride * 1] = std::complex<float>(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3));
+}
 
-  a1 = vget_low_f32(a.v);
-  a2 = vget_high_f32(a.v);
-  a2 = vadd_f32(a1, a2);
-  vst1_f32((float *)&s, a2);
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float>>(const std::complex<float>* addr) {
+  EIGEN_ARM_PREFETCH(reinterpret_cast<const float*>(addr));
+}
 
-  return s;
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet1cf>(const Packet1cf& a) {
+  EIGEN_ALIGN16 std::complex<float> x;
+  vst1_f32(reinterpret_cast<float*>(&x), a.v);
+  return x;
+}
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
+  EIGEN_ALIGN16 std::complex<float> x[2];
+  vst1q_f32(reinterpret_cast<float*>(x), a.v);
+  return x[0];
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
-{
-  Packet4f sum1, sum2, sum;
+template <>
+EIGEN_STRONG_INLINE Packet1cf preverse(const Packet1cf& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
+  return Packet2cf(vcombine_f32(vget_high_f32(a.v), vget_low_f32(a.v)));
+}
 
-  // Add the first two 64-bit float32x2_t of vecs[0]
-  sum1 = vcombine_f32(vget_low_f32(vecs[0].v), vget_low_f32(vecs[1].v));
-  sum2 = vcombine_f32(vget_high_f32(vecs[0].v), vget_high_f32(vecs[1].v));
-  sum = vaddq_f32(sum1, sum2);
+template <>
+EIGEN_STRONG_INLINE Packet1cf pcplxflip<Packet1cf>(const Packet1cf& a) {
+  return Packet1cf(vrev64_f32(a.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& a) {
+  return Packet2cf(vrev64q_f32(a.v));
+}
 
-  return Packet2cf(sum);
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet1cf>(const Packet1cf& a) {
+  std::complex<float> s;
+  vst1_f32((float*)&s, a.v);
+  return s;
+}
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
+  std::complex<float> s;
+  vst1_f32(reinterpret_cast<float*>(&s), vadd_f32(vget_low_f32(a.v), vget_high_f32(a.v)));
+  return s;
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet1cf>(const Packet1cf& a) {
+  std::complex<float> s;
+  vst1_f32((float*)&s, a.v);
+  return s;
+}
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
   float32x2_t a1, a2, v1, v2, prod;
   std::complex<float> s;
 
   a1 = vget_low_f32(a.v);
   a2 = vget_high_f32(a.v);
-   // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
+  // Get the real values of a | a1_re | a1_re | a2_re | a2_re |
   v1 = vdup_lane_f32(a1, 0);
   // Get the real values of a | a1_im | a1_im | a2_im | a2_im |
   v2 = vdup_lane_f32(a1, 1);
@@ -176,78 +425,308 @@ template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const P
   v1 = vmul_f32(v1, a2);
   // Multiply the imag a with b
   v2 = vmul_f32(v2, a2);
-  // Conjugate v2 
-  v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR));
+  // Conjugate v2
+  v2 = vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(v2), p2ui_CONJ_XOR()));
   // Swap real/imag elements in v2.
   v2 = vrev64_f32(v2);
   // Add v1, v2
   prod = vadd_f32(v1, v2);
 
-  vst1_f32((float *)&s, prod);
+  vst1_f32(reinterpret_cast<float*>(&s), prod);
 
   return s;
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet2cf>
-{
-  EIGEN_STRONG_INLINE static void run(Packet2cf& first, const Packet2cf& second)
-  {
-    if (Offset==1)
-    {
-      first.v = vextq_f32(first.v, second.v, 2);
-    }
-  }
-};
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cf, Packet2f)
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
 
-template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
+template <>
+EIGEN_STRONG_INLINE Packet1cf pdiv<Packet1cf>(const Packet1cf& a, const Packet1cf& b) {
+  return pdiv_complex(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return pdiv_complex(a, b);
+}
 
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return internal::pmul(a, pconj(b));
-  }
-};
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet1cf, 1>& /*kernel*/) {}
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
+  Packet4f tmp = vcombine_f32(vget_high_f32(kernel.packet[0].v), vget_high_f32(kernel.packet[1].v));
+  kernel.packet[0].v = vcombine_f32(vget_low_f32(kernel.packet[0].v), vget_low_f32(kernel.packet[1].v));
+  kernel.packet[1].v = tmp;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf psqrt<Packet1cf>(const Packet1cf& a) {
+  return psqrt_complex<Packet1cf>(a);
+}
 
-template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
+  return psqrt_complex<Packet2cf>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf plog<Packet1cf>(const Packet1cf& a) {
+  return plog_complex(a);
+}
 
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return internal::pmul(pconj(a), b);
-  }
+template <>
+EIGEN_STRONG_INLINE Packet2cf plog<Packet2cf>(const Packet2cf& a) {
+  return plog_complex(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cf pexp<Packet1cf>(const Packet1cf& a) {
+  return pexp_complex(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pexp<Packet2cf>(const Packet2cf& a) {
+  return pexp_complex(a);
+}
+
+//---------- double ----------
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+
+inline uint64x2_t p2ul_CONJ_XOR() {
+  static const uint64_t p2ul_conj_XOR_DATA[] = {0x0, 0x8000000000000000};
+  return vld1q_u64(p2ul_conj_XOR_DATA);
+}
+
+struct Packet1cd {
+  EIGEN_STRONG_INLINE Packet1cd() {}
+  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+  Packet2d v;
 };
 
-template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
+template <>
+struct packet_traits<std::complex<double>> : default_packet_traits {
+  typedef Packet1cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 0,
+    size = 1,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasNegate = 1,
+    HasSqrt = 1,
+    HasLog = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0
+  };
+};
 
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    return pconj(internal::pmul(a, b));
-  }
+template <>
+struct unpacket_traits<Packet1cd> : neon_unpacket_default<Packet1cd, std::complex<double>> {
+  using as_real = Packet2d;
 };
 
-template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  // TODO optimize it for AltiVec
-  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
-  Packet4f s, rev_s;
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(
+      pload<Packet2d>(assume_aligned<unpacket_traits<Packet1cd>::alignment>(reinterpret_cast<const double*>(from))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>(reinterpret_cast<const double*>(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pzero<Packet1cd>(const Packet1cd& /*a*/) {
+  return Packet1cd(vdupq_n_f64(0.0));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>& from) {
+  /* here we really have to use unaligned loads :( */
+  return ploadu<Packet1cd>(&from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(padd<Packet2d>(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(psub<Packet2d>(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+  return Packet1cd(pnegate<Packet2d>(a.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+  return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), p2ul_CONJ_XOR())));
+}
+
+#ifdef __ARM_FEATURE_COMPLEX
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmadd<Packet1cd>(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) {
+  Packet1cd result;
+  result.v = vcmlaq_f64(c.v, a.v, b.v);
+  result.v = vcmlaq_rot90_f64(result.v, a.v, b.v);
+  return result;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return pmadd(a, b, pzero(a));
+}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  Packet2d v1, v2;
+
+  // Get the real values of a
+  v1 = vdupq_lane_f64(vget_low_f64(a.v), 0);
+  // Get the imag values of a
+  v2 = vdupq_lane_f64(vget_high_f64(a.v), 0);
+  // Multiply the real a with b
+  v1 = vmulq_f64(v1, b.v);
+  // Multiply the imag a with b
+  v2 = vmulq_f64(v2, b.v);
+  // Conjugate v2
+  v2 = vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(v2), p2ul_CONJ_XOR()));
+  // Swap real/imag elements in v2.
+  v2 = preverse<Packet2d>(v2);
+  // Add and return the result
+  return Packet1cd(vaddq_f64(v1, v2));
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
+  // Compare real and imaginary parts of a and b to get the mask vector:
+  // [re(a)==re(b), im(a)==im(b)]
+  Packet2d eq = pcmp_eq<Packet2d>(a.v, b.v);
+  // Swap real/imag elements in the mask in to get:
+  // [im(a)==im(b), re(a)==re(b)]
+  Packet2d eq_swapped = vreinterpretq_f64_u32(vrev64q_u32(vreinterpretq_u32_f64(eq)));
+  // Return re(a)==re(b) & im(a)==im(b) by computing bitwise AND of eq and eq_swapped
+  return Packet1cd(pand<Packet2d>(eq, eq_swapped));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a.v), vreinterpretq_u64_f64(b.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a.v), vreinterpretq_u64_f64(b.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a.v), vreinterpretq_u64_f64(b.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a.v), vreinterpretq_u64_f64(b.v))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+  return pset1<Packet1cd>(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double>>(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore(assume_aligned<unpacket_traits<Packet1cd>::alignment>(reinterpret_cast<double*>(to)),
+                                   from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double>>(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double>>(const std::complex<double>* addr) {
+  EIGEN_ARM_PREFETCH(reinterpret_cast<const double*>(addr));
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from,
+                                                                            Index stride) {
+  Packet2d res = pset1<Packet2d>(0.0);
+  res = vsetq_lane_f64(std::real(from[0 * stride]), res, 0);
+  res = vsetq_lane_f64(std::imag(from[0 * stride]), res, 1);
+  return Packet1cd(res);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from,
+                                                                        Index stride) {
+  to[stride * 0] = std::complex<double>(vgetq_lane_f64(from.v, 0), vgetq_lane_f64(from.v, 1));
+}
 
-  // this computes the norm
-  s = vmulq_f32(b.v, b.v);
-  rev_s = vrev64q_f32(s);
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
+  EIGEN_ALIGN16 std::complex<double> res;
+  pstore<std::complex<double>>(&res, a);
+  return res;
+}
 
-  return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s)));
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+  return a;
 }
 
-} // end namespace internal
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
+  return pfirst(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
+  return pfirst(a);
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return pdiv_complex(a, b);
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
+  Packet2d tmp = vcombine_f64(vget_high_f64(kernel.packet[0].v), vget_high_f64(kernel.packet[1].v));
+  kernel.packet[0].v = vcombine_f64(vget_low_f64(kernel.packet[0].v), vget_low_f64(kernel.packet[1].v));
+  kernel.packet[1].v = tmp;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
+  return psqrt_complex<Packet1cd>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd plog<Packet1cd>(const Packet1cd& a) {
+  return plog_complex(a);
+}
+
+#endif  // EIGEN_ARCH_ARM64
+
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_COMPLEX_NEON_H
+#endif  // EIGEN_COMPLEX_NEON_H
diff --git a/inst/include/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h b/inst/include/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
new file mode 100644
index 00000000..4ecf7d15
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/NEON/GeneralBlockPanelKernel.h
@@ -0,0 +1,243 @@
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+#if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
+
+// Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
+// Here we specialize gebp_traits to eliminate these register spills.
+// See #2138.
+template <>
+struct gebp_traits<float, float, false, false, Architecture::NEON, GEBPPacketFull>
+    : gebp_traits<float, float, false, false, Architecture::Generic, GEBPPacketFull> {
+  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const {
+    // This volatile inline ASM both acts as a barrier to prevent reordering,
+    // as well as enforces strict register use.
+    asm volatile("vmla.f32 %q[r], %q[c], %q[alpha]" : [r] "+w"(r) : [c] "w"(c), [alpha] "w"(alpha) :);
+  }
+
+  template <typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b, Packet4f& c, Packet4f&, const LaneIdType&) const {
+    acc(a, b, c);
+  }
+
+  template <typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b, Packet4f& c, Packet4f& tmp,
+                                const LaneIdType& lane) const {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+};
+
+#endif  // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
+
+#if EIGEN_ARCH_ARM64
+
+#ifndef EIGEN_NEON_GEBP_NR
+#define EIGEN_NEON_GEBP_NR 8
+#endif
+
+template <>
+struct gebp_traits<float, float, false, false, Architecture::NEON, GEBPPacketFull>
+    : gebp_traits<float, float, false, false, Architecture::Generic, GEBPPacketFull> {
+  typedef float RhsPacket;
+  typedef float32x4_t RhsPacketx4;
+  enum { nr = EIGEN_NEON_GEBP_NR };
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { dest = vld1q_f32(b); }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
+
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+    c = vfmaq_n_f32(c, a, b);
+  }
+  // NOTE: Template parameter inference failed when compiled with Android NDK:
+  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+    madd_helper<0>(a, b, c);
+  }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<1>&) const {
+    madd_helper<1>(a, b, c);
+  }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<2>&) const {
+    madd_helper<2>(a, b, c);
+  }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<3>&) const {
+    madd_helper<3>(a, b, c);
+  }
+
+ private:
+  template <int LaneID>
+  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const {
+#if EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
+    // 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
+    //    vfmaq_laneq_f32 is implemented through a costly dup, which was fixed in gcc9
+    // 2. workaround the gcc register split problem on arm64-neon
+    if (LaneID == 0)
+      asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w"(c) : "w"(a), "w"(b) :);
+    else if (LaneID == 1)
+      asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w"(c) : "w"(a), "w"(b) :);
+    else if (LaneID == 2)
+      asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w"(c) : "w"(a), "w"(b) :);
+    else if (LaneID == 3)
+      asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w"(c) : "w"(a), "w"(b) :);
+#else
+    c = vfmaq_laneq_f32(c, a, b, LaneID);
+#endif
+  }
+};
+
+template <>
+struct gebp_traits<double, double, false, false, Architecture::NEON>
+    : gebp_traits<double, double, false, false, Architecture::Generic> {
+  typedef double RhsPacket;
+  enum { nr = EIGEN_NEON_GEBP_NR };
+  struct RhsPacketx4 {
+    float64x2_t B_0, B_1;
+  };
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
+    dest.B_0 = vld1q_f64(b);
+    dest.B_1 = vld1q_f64(b + 2);
+  }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
+
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { loadRhs(b, dest); }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+    c = vfmaq_n_f64(c, a, b);
+  }
+
+  // NOTE: Template parameter inference failed when compiled with Android NDK:
+  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+    madd_helper<0>(a, b, c);
+  }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<1>&) const {
+    madd_helper<1>(a, b, c);
+  }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<2>&) const {
+    madd_helper<2>(a, b, c);
+  }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<3>&) const {
+    madd_helper<3>(a, b, c);
+  }
+
+ private:
+  template <int LaneID>
+  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const {
+#if EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
+    // 1. workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
+    //    vfmaq_laneq_f64 is implemented through a costly dup, which was fixed in gcc9
+    // 2. workaround the gcc register split problem on arm64-neon
+    if (LaneID == 0)
+      asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w"(c) : "w"(a), "w"(b.B_0) :);
+    else if (LaneID == 1)
+      asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w"(c) : "w"(a), "w"(b.B_0) :);
+    else if (LaneID == 2)
+      asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w"(c) : "w"(a), "w"(b.B_1) :);
+    else if (LaneID == 3)
+      asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w"(c) : "w"(a), "w"(b.B_1) :);
+#else
+    if (LaneID == 0)
+      c = vfmaq_laneq_f64(c, a, b.B_0, 0);
+    else if (LaneID == 1)
+      c = vfmaq_laneq_f64(c, a, b.B_0, 1);
+    else if (LaneID == 2)
+      c = vfmaq_laneq_f64(c, a, b.B_1, 0);
+    else if (LaneID == 3)
+      c = vfmaq_laneq_f64(c, a, b.B_1, 1);
+#endif
+  }
+};
+
+// The register at operand 3 of fmla for data type half must be v0~v15, the compiler may not
+// allocate a required register for the '%2' of inline asm 'fmla %0.8h, %1.8h, %2.h[id]',
+// so inline assembly can't be used here to advoid the bug that vfmaq_lane_f16 is implemented
+// through a costly dup in gcc compiler.
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG
+
+template <>
+struct gebp_traits<half, half, false, false, Architecture::NEON>
+    : gebp_traits<half, half, false, false, Architecture::Generic> {
+  typedef half RhsPacket;
+  typedef float16x4_t RhsPacketx4;
+  typedef float16x4_t PacketHalf;
+  enum { nr = EIGEN_NEON_GEBP_NR };
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const { dest = vld1_f16((const __fp16*)b); }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const { dest = *b; }
+
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
+
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar*, RhsPacket&) const {
+    // If LHS is a Packet8h, we cannot correctly mimic a ploadquad of the RHS
+    // using a single scalar value.
+    eigen_assert(false && "Cannot loadRhsQuad for a scalar RHS.");
+  }
+
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+    c = vfmaq_n_f16(c, a, b);
+  }
+  EIGEN_STRONG_INLINE void madd(const PacketHalf& a, const RhsPacket& b, PacketHalf& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+    c = vfma_n_f16(c, a, b);
+  }
+
+  // NOTE: Template parameter inference failed when compiled with Android NDK:
+  // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<0>&) const {
+    madd_helper<0>(a, b, c);
+  }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<1>&) const {
+    madd_helper<1>(a, b, c);
+  }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<2>&) const {
+    madd_helper<2>(a, b, c);
+  }
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/,
+                                const FixedInt<3>&) const {
+    madd_helper<3>(a, b, c);
+  }
+
+ private:
+  template <int LaneID>
+  EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const {
+    c = vfmaq_lane_f16(c, a, b, LaneID);
+  }
+};
+#endif  // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC && EIGEN_COMP_CLANG
+#endif  // EIGEN_ARCH_ARM64
+
+}  // namespace internal
+}  // namespace Eigen
diff --git a/inst/include/Eigen/src/Core/arch/NEON/MathFunctions.h b/inst/include/Eigen/src/Core/arch/NEON/MathFunctions.h
new file mode 100644
index 00000000..0046e01e
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/NEON/MathFunctions.h
@@ -0,0 +1,68 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_NEON_H
+#define EIGEN_MATH_FUNCTIONS_NEON_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet2f)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet4f)
+
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet4hf ptanh<Packet4hf>(const Packet4hf& x) {
+  // Convert to float, call the float ptanh, and then convert back.
+  return vcvt_f16_f32(ptanh<Packet4f>(vcvt_f32_f16(x)));
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Packet8hf ptanh<Packet8hf>(const Packet8hf& x) {
+  // Convert each 4 half types to float, call the float ptanh, and then convert back.
+  return vcombine_f16(vcvt_f16_f32(ptanh<Packet4f>(vcvt_f32_f16(vget_low_f16(x)))),
+                      vcvt_f16_f32(ptanh<Packet4f>(vcvt_high_f32_f16(x))));
+}
+#endif  // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, psin)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pcos)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, plog)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pexp)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pexp2)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, ptanh)
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pfrexp(const Packet4bf& a, Packet4bf& exponent) {
+  Packet4f fexponent;
+  const Packet4bf out = F32ToBf16(pfrexp<Packet4f>(Bf16ToF32(a), fexponent));
+  exponent = F32ToBf16(fexponent);
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pldexp(const Packet4bf& a, const Packet4bf& exponent) {
+  return F32ToBf16(pldexp<Packet4f>(Bf16ToF32(a), Bf16ToF32(exponent)));
+}
+
+//---------- double ----------
+
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet2d)
+
+#endif
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_NEON_H
diff --git a/inst/include/Eigen/src/Core/arch/NEON/PacketMath.h b/inst/include/Eigen/src/Core/arch/NEON/PacketMath.h
index d49670e0..a66af83c 100644
--- a/inst/include/Eigen/src/Core/arch/NEON/PacketMath.h
+++ b/inst/include/Eigen/src/Core/arch/NEON/PacketMath.h
@@ -2,7 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2010 Konstantinos Margaritis <markos@codex.gr>
+// Copyright (C) 2010 Konstantinos Margaritis <markos@freevec.org>
 // Heavily based on Gael's SSE version.
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -12,409 +12,6085 @@
 #ifndef EIGEN_PACKET_MATH_NEON_H
 #define EIGEN_PACKET_MATH_NEON_H
 
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
 namespace Eigen {
 
-namespace internal {
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#if EIGEN_ARCH_ARM64
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#else
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 16
+#endif
+#endif
+
+#if EIGEN_COMP_MSVC_STRICT
+
+// In MSVC's arm_neon.h header file, all NEON vector types
+// are aliases to the same underlying type __n128.
+// We thus have to wrap them to make them different C++ types.
+// (See also bug 1428)
+typedef eigen_packet_wrapper<float32x2_t, 0> Packet2f;
+typedef eigen_packet_wrapper<float32x4_t, 1> Packet4f;
+typedef eigen_packet_wrapper<int32_t, 2> Packet4c;
+typedef eigen_packet_wrapper<int8x8_t, 3> Packet8c;
+typedef eigen_packet_wrapper<int8x16_t, 4> Packet16c;
+typedef eigen_packet_wrapper<uint32_t, 5> Packet4uc;
+typedef eigen_packet_wrapper<uint8x8_t, 6> Packet8uc;
+typedef eigen_packet_wrapper<uint8x16_t, 7> Packet16uc;
+typedef eigen_packet_wrapper<int16x4_t, 8> Packet4s;
+typedef eigen_packet_wrapper<int16x8_t, 9> Packet8s;
+typedef eigen_packet_wrapper<uint16x4_t, 10> Packet4us;
+typedef eigen_packet_wrapper<uint16x8_t, 11> Packet8us;
+typedef eigen_packet_wrapper<int32x2_t, 12> Packet2i;
+typedef eigen_packet_wrapper<int32x4_t, 13> Packet4i;
+typedef eigen_packet_wrapper<uint32x2_t, 14> Packet2ui;
+typedef eigen_packet_wrapper<uint32x4_t, 15> Packet4ui;
+typedef eigen_packet_wrapper<int64x2_t, 16> Packet2l;
+typedef eigen_packet_wrapper<uint64x2_t, 17> Packet2ul;
+
+EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) {
+  float from[4] = {a, b, c, d};
+  return vld1q_f32(from);
+}
+
+EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) {
+  float from[2] = {a, b};
+  return vld1_f32(from);
+}
+
+#else
+
+typedef float32x2_t Packet2f;
+typedef float32x4_t Packet4f;
+typedef eigen_packet_wrapper<int32_t, 2> Packet4c;
+typedef int8x8_t Packet8c;
+typedef int8x16_t Packet16c;
+typedef eigen_packet_wrapper<uint32_t, 5> Packet4uc;
+typedef uint8x8_t Packet8uc;
+typedef uint8x16_t Packet16uc;
+typedef int16x4_t Packet4s;
+typedef int16x8_t Packet8s;
+typedef uint16x4_t Packet4us;
+typedef uint16x8_t Packet8us;
+typedef int32x2_t Packet2i;
+typedef int32x4_t Packet4i;
+typedef uint32x2_t Packet2ui;
+typedef uint32x4_t Packet4ui;
+typedef int64x2_t Packet2l;
+typedef uint64x2_t Packet2ul;
+
+EIGEN_ALWAYS_INLINE Packet4f make_packet4f(float a, float b, float c, float d) { return Packet4f{a, b, c, d}; }
+EIGEN_ALWAYS_INLINE Packet2f make_packet2f(float a, float b) { return Packet2f{a, b}; }
+
+#endif  // EIGEN_COMP_MSVC_STRICT
+
+EIGEN_STRONG_INLINE Packet4f shuffle1(const Packet4f& m, int mask) {
+  const float* a = reinterpret_cast<const float*>(&m);
+  Packet4f res =
+      make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(a + ((mask >> 6) & 3)));
+  return res;
+}
+
+// functionally equivalent to _mm_shuffle_ps in SSE when interleave
+// == false (i.e. shuffle<false>(m, n, mask) equals _mm_shuffle_ps(m, n, mask)),
+// interleave m and n when interleave == true. Currently used in LU/arch/InverseSize4.h
+// to enable a shared implementation for fast inversion of matrices of size 4.
+template <bool interleave>
+EIGEN_STRONG_INLINE Packet4f shuffle2(const Packet4f& m, const Packet4f& n, int mask) {
+  const float* a = reinterpret_cast<const float*>(&m);
+  const float* b = reinterpret_cast<const float*>(&n);
+  Packet4f res =
+      make_packet4f(*(a + (mask & 3)), *(a + ((mask >> 2) & 3)), *(b + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f shuffle2<true>(const Packet4f& m, const Packet4f& n, int mask) {
+  const float* a = reinterpret_cast<const float*>(&m);
+  const float* b = reinterpret_cast<const float*>(&n);
+  Packet4f res =
+      make_packet4f(*(a + (mask & 3)), *(b + ((mask >> 2) & 3)), *(a + ((mask >> 4) & 3)), *(b + ((mask >> 6) & 3)));
+  return res;
+}
+
+EIGEN_STRONG_INLINE static int eigen_neon_shuffle_mask(int p, int q, int r, int s) {
+  return ((s) << 6 | (r) << 4 | (q) << 2 | (p));
+}
+
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle1(const Packet4f& a, int p, int q, int r, int s) {
+  return shuffle1(a, eigen_neon_shuffle_mask(p, q, r, s));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_swizzle2(const Packet4f& a, const Packet4f& b, int p, int q, int r, int s) {
+  return shuffle2<false>(a, b, eigen_neon_shuffle_mask(p, q, r, s));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
+  return shuffle2<false>(a, b, eigen_neon_shuffle_mask(0, 1, 0, 1));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
+  return shuffle2<false>(b, a, eigen_neon_shuffle_mask(2, 3, 2, 3));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
+  return shuffle2<true>(a, b, eigen_neon_shuffle_mask(0, 0, 1, 1));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
+  return shuffle2<true>(a, b, eigen_neon_shuffle_mask(2, 2, 3, 3));
+}
+#define vec4f_duplane(a, p) Packet4f(vdupq_lane_f32(vget_low_f32(a), p))
+
+#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = pset1<Packet4f>(X)
+
+#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
+  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int32_t>(X))
+
+#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = pset1<Packet4i>(X)
+
+#if EIGEN_ARCH_ARM64 && EIGEN_COMP_GNUC
+// __builtin_prefetch tends to do nothing on ARM64 compilers because the
+// prefetch instructions there are too detailed for __builtin_prefetch to map
+// meaningfully to them.
+#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) :);
+#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+#define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#elif defined __pld
+#define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
+#elif EIGEN_ARCH_ARM
+#define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("pld [%[addr]]\n" ::[addr] "r"(ADDR) :);
+#else
+// by default no explicit prefetching
+#define EIGEN_ARM_PREFETCH(ADDR)
+#endif
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet4f type;
+  typedef Packet2f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 1,
+    HasBlend = 0,
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasACos = 1,
+    HasASin = 1,
+    HasATan = 1,
+    HasATanh = 1,
+    HasLog = 1,
+    HasExp = 1,
+    HasPow = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasCbrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
+    HasBessel = 0,  // Issues with accuracy.
+    HasNdtri = 0
+  };
+};
+
+template <>
+struct packet_traits<int8_t> : default_packet_traits {
+  typedef Packet16c type;
+  typedef Packet8c half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasAbsDiff = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<uint8_t> : default_packet_traits {
+  typedef Packet16uc type;
+  typedef Packet8uc half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 0,
+    HasAbs = 1,
+    HasAbsDiff = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 1,
+    HasBlend = 0,
+
+    HasSqrt = 1
+  };
+};
+
+template <>
+struct packet_traits<int16_t> : default_packet_traits {
+  typedef Packet8s type;
+  typedef Packet4s half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasAbsDiff = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<uint16_t> : default_packet_traits {
+  typedef Packet8us type;
+  typedef Packet4us half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 8,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 0,
+    HasAbs = 1,
+    HasAbsDiff = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 1,
+    HasBlend = 0,
+    HasSqrt = 1
+  };
+};
+
+template <>
+struct packet_traits<int32_t> : default_packet_traits {
+  typedef Packet4i type;
+  typedef Packet2i half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<uint32_t> : default_packet_traits {
+  typedef Packet4ui type;
+  typedef Packet2ui half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 0,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 1,
+    HasBlend = 0,
+
+    HasSqrt = 1
+  };
+};
+
+template <>
+struct packet_traits<int64_t> : default_packet_traits {
+  typedef Packet2l type;
+  typedef Packet2l half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 1,
+    HasBlend = 0
+  };
+};
+
+template <>
+struct packet_traits<uint64_t> : default_packet_traits {
+  typedef Packet2ul type;
+  typedef Packet2ul half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 0,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 1,
+    HasBlend = 0
+  };
+};
+
+template <typename Packet, typename Scalar>
+struct neon_unpacket_default {
+  using type = Scalar;
+  using half = Packet;
+  static constexpr int size = sizeof(Packet) / sizeof(Scalar);
+  static constexpr int alignment = sizeof(Packet);
+  static constexpr bool vectorizable = true;
+  static constexpr bool masked_load_available = false;
+  static constexpr bool masked_store_available = false;
+};
+
+template <>
+struct unpacket_traits<Packet2f> : neon_unpacket_default<Packet2f, float> {
+  using integer_packet = Packet2i;
+};
+template <>
+struct unpacket_traits<Packet4f> : neon_unpacket_default<Packet4f, float> {
+  using half = Packet2f;
+  using integer_packet = Packet4i;
+};
+template <>
+struct unpacket_traits<Packet4c> : neon_unpacket_default<Packet4c, int8_t> {};
+template <>
+struct unpacket_traits<Packet8c> : neon_unpacket_default<Packet8c, int8_t> {
+  using half = Packet4c;
+};
+template <>
+struct unpacket_traits<Packet16c> : neon_unpacket_default<Packet16c, int8_t> {
+  using half = Packet8c;
+};
+template <>
+struct unpacket_traits<Packet4uc> : neon_unpacket_default<Packet4uc, uint8_t> {};
+template <>
+struct unpacket_traits<Packet8uc> : neon_unpacket_default<Packet8uc, uint8_t> {
+  using half = Packet4uc;
+};
+template <>
+struct unpacket_traits<Packet16uc> : neon_unpacket_default<Packet16uc, uint8_t> {
+  using half = Packet8uc;
+};
+template <>
+struct unpacket_traits<Packet4s> : neon_unpacket_default<Packet4s, int16_t> {};
+template <>
+struct unpacket_traits<Packet8s> : neon_unpacket_default<Packet8s, int16_t> {
+  using half = Packet4s;
+};
+template <>
+struct unpacket_traits<Packet4us> : neon_unpacket_default<Packet4us, uint16_t> {};
+template <>
+struct unpacket_traits<Packet8us> : neon_unpacket_default<Packet8us, uint16_t> {
+  using half = Packet4us;
+};
+template <>
+struct unpacket_traits<Packet2i> : neon_unpacket_default<Packet2i, int32_t> {};
+template <>
+struct unpacket_traits<Packet4i> : neon_unpacket_default<Packet4i, int32_t> {
+  using half = Packet2i;
+};
+template <>
+struct unpacket_traits<Packet2ui> : neon_unpacket_default<Packet2ui, uint32_t> {};
+template <>
+struct unpacket_traits<Packet4ui> : neon_unpacket_default<Packet4ui, uint32_t> {
+  using half = Packet2ui;
+};
+template <>
+struct unpacket_traits<Packet2l> : neon_unpacket_default<Packet2l, int64_t> {};
+template <>
+struct unpacket_traits<Packet2ul> : neon_unpacket_default<Packet2ul, uint64_t> {};
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pzero(const Packet2f& /*a*/) {
+  return vdup_n_f32(0.0f);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) {
+  return vdupq_n_f32(0.0f);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pset1<Packet2f>(const float& from) {
+  return vdup_n_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
+  return vdupq_n_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pset1<Packet4c>(const int8_t& from) {
+  return vget_lane_s32(vreinterpret_s32_s8(vdup_n_s8(from)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pset1<Packet8c>(const int8_t& from) {
+  return vdup_n_s8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pset1<Packet16c>(const int8_t& from) {
+  return vdupq_n_s8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pset1<Packet4uc>(const uint8_t& from) {
+  return vget_lane_u32(vreinterpret_u32_u8(vdup_n_u8(from)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pset1<Packet8uc>(const uint8_t& from) {
+  return vdup_n_u8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pset1<Packet16uc>(const uint8_t& from) {
+  return vdupq_n_u8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pset1<Packet4s>(const int16_t& from) {
+  return vdup_n_s16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pset1<Packet8s>(const int16_t& from) {
+  return vdupq_n_s16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pset1<Packet4us>(const uint16_t& from) {
+  return vdup_n_u16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pset1<Packet8us>(const uint16_t& from) {
+  return vdupq_n_u16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pset1<Packet2i>(const int32_t& from) {
+  return vdup_n_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int32_t& from) {
+  return vdupq_n_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pset1<Packet2ui>(const uint32_t& from) {
+  return vdup_n_u32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) {
+  return vdupq_n_u32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
+  return vdupq_n_s64(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pset1<Packet2ul>(const uint64_t& from) {
+  return vdupq_n_u64(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pset1frombits<Packet2f>(uint32_t from) {
+  return vreinterpret_f32_u32(vdup_n_u32(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
+  return vreinterpretq_f32_u32(vdupq_n_u32(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f plset<Packet2f>(const float& a) {
+  const float c[] = {0.0f, 1.0f};
+  return vadd_f32(pset1<Packet2f>(a), vld1_f32(c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+  const float c[] = {0.0f, 1.0f, 2.0f, 3.0f};
+  return vaddq_f32(pset1<Packet4f>(a), vld1q_f32(c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c plset<Packet4c>(const int8_t& a) {
+  return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_u32(vdup_n_u32(0x03020100)), vdup_n_s8(a))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c plset<Packet8c>(const int8_t& a) {
+  const int8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  return vadd_s8(pset1<Packet8c>(a), vld1_s8(c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c plset<Packet16c>(const int8_t& a) {
+  const int8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  return vaddq_s8(pset1<Packet16c>(a), vld1q_s8(c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc plset<Packet4uc>(const uint8_t& a) {
+  return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(0x03020100)), vdup_n_u8(a))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc plset<Packet8uc>(const uint8_t& a) {
+  const uint8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  return vadd_u8(pset1<Packet8uc>(a), vld1_u8(c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc plset<Packet16uc>(const uint8_t& a) {
+  const uint8_t c[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+  return vaddq_u8(pset1<Packet16uc>(a), vld1q_u8(c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s plset<Packet4s>(const int16_t& a) {
+  const int16_t c[] = {0, 1, 2, 3};
+  return vadd_s16(pset1<Packet4s>(a), vld1_s16(c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us plset<Packet4us>(const uint16_t& a) {
+  const uint16_t c[] = {0, 1, 2, 3};
+  return vadd_u16(pset1<Packet4us>(a), vld1_u16(c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s plset<Packet8s>(const int16_t& a) {
+  const int16_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  return vaddq_s16(pset1<Packet8s>(a), vld1q_s16(c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us plset<Packet8us>(const uint16_t& a) {
+  const uint16_t c[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  return vaddq_u16(pset1<Packet8us>(a), vld1q_u16(c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i plset<Packet2i>(const int32_t& a) {
+  const int32_t c[] = {0, 1};
+  return vadd_s32(pset1<Packet2i>(a), vld1_s32(c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int32_t& a) {
+  const int32_t c[] = {0, 1, 2, 3};
+  return vaddq_s32(pset1<Packet4i>(a), vld1q_s32(c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui plset<Packet2ui>(const uint32_t& a) {
+  const uint32_t c[] = {0, 1};
+  return vadd_u32(pset1<Packet2ui>(a), vld1_u32(c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) {
+  const uint32_t c[] = {0, 1, 2, 3};
+  return vaddq_u32(pset1<Packet4ui>(a), vld1q_u32(c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) {
+  const int64_t c[] = {0, 1};
+  return vaddq_s64(pset1<Packet2l>(a), vld1q_s64(c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul plset<Packet2ul>(const uint64_t& a) {
+  const uint64_t c[] = {0, 1};
+  return vaddq_u64(pset1<Packet2ul>(a), vld1q_u64(c));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f padd<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vadd_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vaddq_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c padd<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return vget_lane_s32(
+      vreinterpret_s32_s8(vadd_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c padd<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vadd_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c padd<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vaddq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc padd<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vadd_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc padd<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vadd_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc padd<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vaddq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s padd<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vadd_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s padd<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vaddq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us padd<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vadd_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us padd<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vaddq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i padd<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vadd_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vaddq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui padd<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vadd_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vaddq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vaddq_s64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul padd<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vaddq_u64(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f psub<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vsub_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vsubq_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c psub<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return vget_lane_s32(
+      vreinterpret_s32_s8(vsub_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c psub<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vsub_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c psub<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vsubq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc psub<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vsub_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc psub<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vsub_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc psub<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vsubq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s psub<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vsub_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s psub<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vsubq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us psub<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vsub_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us psub<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vsubq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i psub<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vsub_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vsubq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui psub<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vsub_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vsubq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vsubq_s64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul psub<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vsubq_u64(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b);
+template <>
+EIGEN_STRONG_INLINE Packet2f paddsub<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  Packet2f mask = make_packet2f(numext::bit_cast<float>(0x80000000u), 0.0f);
+  return padd(a, pxor(mask, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
+template <>
+EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f mask = make_packet4f(numext::bit_cast<float>(0x80000000u), 0.0f, numext::bit_cast<float>(0x80000000u), 0.0f);
+  return padd(a, pxor(mask, b));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pnegate(const Packet2f& a) {
+  return vneg_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
+  return vnegq_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pnegate(const Packet4c& a) {
+  return vget_lane_s32(vreinterpret_s32_s8(vneg_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pnegate(const Packet8c& a) {
+  return vneg_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pnegate(const Packet16c& a) {
+  return vnegq_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pnegate(const Packet4s& a) {
+  return vneg_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pnegate(const Packet8s& a) {
+  return vnegq_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pnegate(const Packet2i& a) {
+  return vneg_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
+  return vnegq_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
+#if EIGEN_ARCH_ARM64
+  return vnegq_s64(a);
+#else
+  return vcombine_s64(vdup_n_s64(-vgetq_lane_s64(a, 0)), vdup_n_s64(-vgetq_lane_s64(a, 1)));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pconj(const Packet2f& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pconj(const Packet4c& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pconj(const Packet8c& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pconj(const Packet16c& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pconj(const Packet4uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pconj(const Packet8uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pconj(const Packet16uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pconj(const Packet4s& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pconj(const Packet8s& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pconj(const Packet4us& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pconj(const Packet8us& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pconj(const Packet2i& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pconj(const Packet2ui& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pconj(const Packet4ui& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pconj(const Packet2ul& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pmul<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vmul_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vmulq_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pmul<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return vget_lane_s32(
+      vreinterpret_s32_s8(vmul_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pmul<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vmul_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmul<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vmulq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pmul<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vmul_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pmul<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vmul_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmul<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vmulq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pmul<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vmul_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmul<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vmulq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pmul<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vmul_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmul<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vmulq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pmul<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vmul_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vmulq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pmul<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vmul_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vmulq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) * vgetq_lane_s64(b, 0)),
+                      vdup_n_s64(vgetq_lane_s64(a, 1) * vgetq_lane_s64(b, 1)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pmul<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) * vgetq_lane_u64(b, 0)),
+                      vdup_n_u64(vgetq_lane_u64(a, 1) * vgetq_lane_u64(b, 1)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4c pdiv<Packet4c>(const Packet4c& /*a*/, const Packet4c& /*b*/) {
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4c>(0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pdiv<Packet8c>(const Packet8c& /*a*/, const Packet8c& /*b*/) {
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8c>(0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pdiv<Packet16c>(const Packet16c& /*a*/, const Packet16c& /*b*/) {
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet16c>(0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pdiv<Packet4uc>(const Packet4uc& /*a*/, const Packet4uc& /*b*/) {
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4uc>(0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pdiv<Packet8uc>(const Packet8uc& /*a*/, const Packet8uc& /*b*/) {
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8uc>(0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pdiv<Packet16uc>(const Packet16uc& /*a*/, const Packet16uc& /*b*/) {
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet16uc>(0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pdiv<Packet4s>(const Packet4s& /*a*/, const Packet4s& /*b*/) {
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4s>(0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pdiv<Packet8s>(const Packet8s& /*a*/, const Packet8s& /*b*/) {
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8s>(0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pdiv<Packet4us>(const Packet4us& /*a*/, const Packet4us& /*b*/) {
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4us>(0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pdiv<Packet8us>(const Packet8us& /*a*/, const Packet8us& /*b*/) {
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet8us>(0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pdiv<Packet2i>(const Packet2i& /*a*/, const Packet2i& /*b*/) {
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2i>(0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) {
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4i>(0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pdiv<Packet2ui>(const Packet2ui& /*a*/, const Packet2ui& /*b*/) {
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2ui>(0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pdiv<Packet4ui>(const Packet4ui& /*a*/, const Packet4ui& /*b*/) {
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet4ui>(0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pdiv<Packet2l>(const Packet2l& /*a*/, const Packet2l& /*b*/) {
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2l>(0LL);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pdiv<Packet2ul>(const Packet2ul& /*a*/, const Packet2ul& /*b*/) {
+  eigen_assert(false && "packet integer division are not supported by NEON");
+  return pset1<Packet2ul>(0ULL);
+}
+
+#ifdef EIGEN_VECTORIZE_FMA
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vfmaq_f32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
+  return vfma_f32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vfmsq_f32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pnmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
+  return vfms_f32(c, a, b);
+}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vmlaq_f32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
+  return vmla_f32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vmlsq_f32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pnmadd(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
+  return vmls_f32(c, a, b);
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return pnegate(pnmadd(a, b, c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pmsub(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
+  return pnegate(pnmadd(a, b, c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return pnegate(pmadd(a, b, c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pnmsub(const Packet2f& a, const Packet2f& b, const Packet2f& c) {
+  return pnegate(pmadd(a, b, c));
+}
+
+// No FMA instruction for int, so use MLA unconditionally.
+template <>
+EIGEN_STRONG_INLINE Packet4c pmadd(const Packet4c& a, const Packet4c& b, const Packet4c& c) {
+  return vget_lane_s32(
+      vreinterpret_s32_s8(vmla_s8(vreinterpret_s8_s32(vdup_n_s32(c)), vreinterpret_s8_s32(vdup_n_s32(a)),
+                                  vreinterpret_s8_s32(vdup_n_s32(b)))),
+      0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pmadd(const Packet8c& a, const Packet8c& b, const Packet8c& c) {
+  return vmla_s8(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmadd(const Packet16c& a, const Packet16c& b, const Packet16c& c) {
+  return vmlaq_s8(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pmadd(const Packet4uc& a, const Packet4uc& b, const Packet4uc& c) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vmla_u8(vreinterpret_u8_u32(vdup_n_u32(c)), vreinterpret_u8_u32(vdup_n_u32(a)),
+                                  vreinterpret_u8_u32(vdup_n_u32(b)))),
+      0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pmadd(const Packet8uc& a, const Packet8uc& b, const Packet8uc& c) {
+  return vmla_u8(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmadd(const Packet16uc& a, const Packet16uc& b, const Packet16uc& c) {
+  return vmlaq_u8(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pmadd(const Packet4s& a, const Packet4s& b, const Packet4s& c) {
+  return vmla_s16(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmadd(const Packet8s& a, const Packet8s& b, const Packet8s& c) {
+  return vmlaq_s16(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pmadd(const Packet4us& a, const Packet4us& b, const Packet4us& c) {
+  return vmla_u16(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmadd(const Packet8us& a, const Packet8us& b, const Packet8us& c) {
+  return vmlaq_u16(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pmadd(const Packet2i& a, const Packet2i& b, const Packet2i& c) {
+  return vmla_s32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+  return vmlaq_s32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pmadd(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c) {
+  return vmla_u32(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmadd(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c) {
+  return vmlaq_u32(c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pabsdiff<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vabd_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pabsdiff<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vabdq_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pabsdiff<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return vget_lane_s32(
+      vreinterpret_s32_s8(vabd_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pabsdiff<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vabd_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pabsdiff<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vabdq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pabsdiff<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vabd_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pabsdiff<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vabd_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pabsdiff<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vabdq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pabsdiff<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vabd_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pabsdiff<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vabdq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pabsdiff<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vabd_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pabsdiff<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vabdq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pabsdiff<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vabd_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pabsdiff<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vabdq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pabsdiff<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vabd_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pabsdiff<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vabdq_u32(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pmin<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vmin_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vminq_f32(a, b);
+}
+
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
+// systems).
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vminnmq_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pmin<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vminnm_f32(a, b);
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pmin<Packet4f>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pmin<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return pmin<Packet2f>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4c pmin<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return vget_lane_s32(
+      vreinterpret_s32_s8(vmin_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pmin<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vmin_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmin<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vminq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pmin<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vmin_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pmin<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vmin_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmin<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vminq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pmin<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vmin_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmin<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vminq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pmin<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vmin_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmin<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vminq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pmin<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vmin_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vminq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pmin<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vmin_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vminq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s64(vdup_n_s64((std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
+                      vdup_n_s64((std::min)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pmin<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u64(vdup_n_u64((std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
+                      vdup_n_u64((std::min)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pmax<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vmax_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vmaxq_f32(a, b);
+}
+
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
+// systems).
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vmaxnmq_f32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pmax<PropagateNumbers, Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vmaxnm_f32(a, b);
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pmax<Packet4f>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pmax<PropagateNaN, Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return pmax<Packet2f>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4c pmax<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return vget_lane_s32(
+      vreinterpret_s32_s8(vmax_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pmax<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vmax_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pmax<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vmaxq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pmax<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vmax_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pmax<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vmax_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pmax<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vmaxq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pmax<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vmax_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pmax<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vmaxq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pmax<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vmax_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pmax<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vmaxq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pmax<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vmax_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vmaxq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pmax<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vmax_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vmaxq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s64(vdup_n_s64((std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(b, 0))),
+                      vdup_n_s64((std::max)(vgetq_lane_s64(a, 1), vgetq_lane_s64(b, 1))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pmax<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u64(vdup_n_u64((std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(b, 0))),
+                      vdup_n_u64((std::max)(vgetq_lane_u64(a, 1), vgetq_lane_u64(b, 1))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pcmp_le<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vreinterpret_f32_u32(vcle_f32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vreinterpretq_f32_u32(vcleq_f32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pcmp_le<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return vget_lane_s32(
+      vreinterpret_s32_u8(vcle_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcmp_le<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vreinterpret_s8_u8(vcle_s8(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_le<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vreinterpretq_s8_u8(vcleq_s8(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcmp_le<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vcle_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcmp_le<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vcle_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_le<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vcleq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcmp_le<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vreinterpret_s16_u16(vcle_s16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_le<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vreinterpretq_s16_u16(vcleq_s16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcmp_le<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vcle_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_le<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vcleq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcmp_le<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vreinterpret_s32_u32(vcle_s32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_le<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vreinterpretq_s32_u32(vcleq_s32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcmp_le<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vcle_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_le<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vcleq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcmp_le<Packet2l>(const Packet2l& a, const Packet2l& b) {
+#if EIGEN_ARCH_ARM64
+  return vreinterpretq_s64_u64(vcleq_s64(a, b));
+#else
+  return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) <= vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
+                      vdup_n_s64(vgetq_lane_s64(a, 1) <= vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcmp_le<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+#if EIGEN_ARCH_ARM64
+  return vcleq_u64(a, b);
+#else
+  return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) <= vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
+                      vdup_n_u64(vgetq_lane_u64(a, 1) <= vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pcmp_lt<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vreinterpret_f32_u32(vclt_f32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vreinterpretq_f32_u32(vcltq_f32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pcmp_lt<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return vget_lane_s32(
+      vreinterpret_s32_u8(vclt_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcmp_lt<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vreinterpret_s8_u8(vclt_s8(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_lt<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vreinterpretq_s8_u8(vcltq_s8(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcmp_lt<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vclt_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcmp_lt<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vclt_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_lt<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vcltq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcmp_lt<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vreinterpret_s16_u16(vclt_s16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_lt<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vreinterpretq_s16_u16(vcltq_s16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcmp_lt<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vclt_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_lt<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vcltq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcmp_lt<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vreinterpret_s32_u32(vclt_s32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_lt<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vreinterpretq_s32_u32(vcltq_s32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcmp_lt<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vclt_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_lt<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vcltq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcmp_lt<Packet2l>(const Packet2l& a, const Packet2l& b) {
+#if EIGEN_ARCH_ARM64
+  return vreinterpretq_s64_u64(vcltq_s64(a, b));
+#else
+  return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) < vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
+                      vdup_n_s64(vgetq_lane_s64(a, 1) < vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcmp_lt<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+#if EIGEN_ARCH_ARM64
+  return vcltq_u64(a, b);
+#else
+  return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) < vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
+                      vdup_n_u64(vgetq_lane_u64(a, 1) < vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pcmp_eq<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vreinterpret_f32_u32(vceq_f32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vreinterpretq_f32_u32(vceqq_f32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pcmp_eq<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return vget_lane_s32(
+      vreinterpret_s32_u8(vceq_s8(vreinterpret_s8_s32(vdup_n_s32(a)), vreinterpret_s8_s32(vdup_n_s32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcmp_eq<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vreinterpret_s8_u8(vceq_s8(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pcmp_eq<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vreinterpretq_s8_u8(vceqq_s8(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcmp_eq<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return vget_lane_u32(
+      vreinterpret_u32_u8(vceq_u8(vreinterpret_u8_u32(vdup_n_u32(a)), vreinterpret_u8_u32(vdup_n_u32(b)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcmp_eq<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vceq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcmp_eq<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vceqq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcmp_eq<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vreinterpret_s16_u16(vceq_s16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcmp_eq<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vreinterpretq_s16_u16(vceqq_s16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcmp_eq<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vceq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcmp_eq<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vceqq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcmp_eq<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vreinterpret_s32_u32(vceq_s32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_eq<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vreinterpretq_s32_u32(vceqq_s32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcmp_eq<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vceq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_eq<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vceqq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcmp_eq<Packet2l>(const Packet2l& a, const Packet2l& b) {
+#if EIGEN_ARCH_ARM64
+  return vreinterpretq_s64_u64(vceqq_s64(a, b));
+#else
+  return vcombine_s64(vdup_n_s64(vgetq_lane_s64(a, 0) == vgetq_lane_s64(b, 0) ? numext::int64_t(-1) : 0),
+                      vdup_n_s64(vgetq_lane_s64(a, 1) == vgetq_lane_s64(b, 1) ? numext::int64_t(-1) : 0));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcmp_eq<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+#if EIGEN_ARCH_ARM64
+  return vceqq_u64(a, b);
+#else
+  return vcombine_u64(vdup_n_u64(vgetq_lane_u64(a, 0) == vgetq_lane_u64(b, 0) ? numext::uint64_t(-1) : 0),
+                      vdup_n_u64(vgetq_lane_u64(a, 1) == vgetq_lane_u64(b, 1) ? numext::uint64_t(-1) : 0));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pcmp_lt_or_nan<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vreinterpret_f32_u32(vmvn_u32(vcge_f32(a, b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vreinterpretq_f32_u32(vmvnq_u32(vcgeq_f32(a, b)));
+}
+
+// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
+template <>
+EIGEN_STRONG_INLINE Packet2f pand<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vreinterpret_f32_u32(vand_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pand<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return a & b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pand<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vand_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pand<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vandq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pand<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return a & b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pand<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vand_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pand<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vandq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pand<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vand_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pand<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vandq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pand<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vand_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pand<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vandq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pand<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vand_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vandq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pand<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vand_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vandq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vandq_s64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pand<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vandq_u64(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f por<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vreinterpret_f32_u32(vorr_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c por<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return a | b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c por<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vorr_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c por<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vorrq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc por<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return a | b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc por<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vorr_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc por<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vorrq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s por<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vorr_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s por<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vorrq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us por<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vorr_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us por<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vorrq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i por<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vorr_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vorrq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui por<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vorr_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vorrq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vorrq_s64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul por<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vorrq_u64(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pxor<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vreinterpret_f32_u32(veor_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pxor<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return a ^ b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pxor<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return veor_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pxor<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return veorq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pxor<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return a ^ b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pxor<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return veor_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pxor<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return veorq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pxor<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return veor_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pxor<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return veorq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pxor<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return veor_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pxor<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return veorq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pxor<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return veor_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return veorq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pxor<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return veor_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return veorq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return veorq_s64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pxor<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return veorq_u64(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pandnot<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return vreinterpret_f32_u32(vbic_u32(vreinterpret_u32_f32(a), vreinterpret_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a), vreinterpretq_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pandnot<Packet4c>(const Packet4c& a, const Packet4c& b) {
+  return a & ~b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pandnot<Packet8c>(const Packet8c& a, const Packet8c& b) {
+  return vbic_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pandnot<Packet16c>(const Packet16c& a, const Packet16c& b) {
+  return vbicq_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pandnot<Packet4uc>(const Packet4uc& a, const Packet4uc& b) {
+  return a & ~b;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pandnot<Packet8uc>(const Packet8uc& a, const Packet8uc& b) {
+  return vbic_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pandnot<Packet16uc>(const Packet16uc& a, const Packet16uc& b) {
+  return vbicq_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pandnot<Packet4s>(const Packet4s& a, const Packet4s& b) {
+  return vbic_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pandnot<Packet8s>(const Packet8s& a, const Packet8s& b) {
+  return vbicq_s16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pandnot<Packet4us>(const Packet4us& a, const Packet4us& b) {
+  return vbic_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pandnot<Packet8us>(const Packet8us& a, const Packet8us& b) {
+  return vbicq_u16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pandnot<Packet2i>(const Packet2i& a, const Packet2i& b) {
+  return vbic_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vbicq_s32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pandnot<Packet2ui>(const Packet2ui& a, const Packet2ui& b) {
+  return vbic_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return vbicq_u32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return vbicq_s64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pandnot<Packet2ul>(const Packet2ul& a, const Packet2ul& b) {
+  return vbicq_u64(a, b);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet4c parithmetic_shift_right(Packet4c& a) {
+  return vget_lane_s32(vreinterpret_s32_s8(vshr_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8c parithmetic_shift_right(Packet8c a) {
+  return vshr_n_s8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16c parithmetic_shift_right(Packet16c a) {
+  return vshrq_n_s8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4uc parithmetic_shift_right(Packet4uc& a) {
+  return vget_lane_u32(vreinterpret_u32_u8(vshr_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8uc parithmetic_shift_right(Packet8uc a) {
+  return vshr_n_u8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16uc parithmetic_shift_right(Packet16uc a) {
+  return vshrq_n_u8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4s parithmetic_shift_right(Packet4s a) {
+  return vshr_n_s16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8s parithmetic_shift_right(Packet8s a) {
+  return vshrq_n_s16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4us parithmetic_shift_right(Packet4us a) {
+  return vshr_n_u16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8us parithmetic_shift_right(Packet8us a) {
+  return vshrq_n_u16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2i parithmetic_shift_right(Packet2i a) {
+  return vshr_n_s32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(Packet4i a) {
+  return vshrq_n_s32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ui parithmetic_shift_right(Packet2ui a) {
+  return vshr_n_u32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(Packet4ui a) {
+  return vshrq_n_u32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(Packet2l a) {
+  return vshrq_n_s64(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ul parithmetic_shift_right(Packet2ul a) {
+  return vshrq_n_u64(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet4c plogical_shift_right(Packet4c& a) {
+  return vget_lane_s32(vreinterpret_s32_u8(vshr_n_u8(vreinterpret_u8_s32(vdup_n_s32(a)), N)), 0);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8c plogical_shift_right(Packet8c a) {
+  return vreinterpret_s8_u8(vshr_n_u8(vreinterpret_u8_s8(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16c plogical_shift_right(Packet16c a) {
+  return vreinterpretq_s8_u8(vshrq_n_u8(vreinterpretq_u8_s8(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4uc plogical_shift_right(Packet4uc& a) {
+  return vget_lane_u32(vreinterpret_u32_s8(vshr_n_s8(vreinterpret_s8_u32(vdup_n_u32(a)), N)), 0);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8uc plogical_shift_right(Packet8uc a) {
+  return vshr_n_u8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16uc plogical_shift_right(Packet16uc a) {
+  return vshrq_n_u8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4s plogical_shift_right(Packet4s a) {
+  return vreinterpret_s16_u16(vshr_n_u16(vreinterpret_u16_s16(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8s plogical_shift_right(Packet8s a) {
+  return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4us plogical_shift_right(Packet4us a) {
+  return vshr_n_u16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8us plogical_shift_right(Packet8us a) {
+  return vshrq_n_u16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2i plogical_shift_right(Packet2i a) {
+  return vreinterpret_s32_u32(vshr_n_u32(vreinterpret_u32_s32(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_right(Packet4i a) {
+  return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ui plogical_shift_right(Packet2ui a) {
+  return vshr_n_u32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(Packet4ui a) {
+  return vshrq_n_u32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_right(Packet2l a) {
+  return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ul plogical_shift_right(Packet2ul a) {
+  return vshrq_n_u64(a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet4c plogical_shift_left(Packet4c& a) {
+  return vget_lane_s32(vreinterpret_s32_s8(vshl_n_s8(vreinterpret_s8_s32(vdup_n_s32(a)), N)), 0);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8c plogical_shift_left(Packet8c a) {
+  return vshl_n_s8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16c plogical_shift_left(Packet16c a) {
+  return vshlq_n_s8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4uc plogical_shift_left(Packet4uc& a) {
+  return vget_lane_u32(vreinterpret_u32_u8(vshl_n_u8(vreinterpret_u8_u32(vdup_n_u32(a)), N)), 0);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8uc plogical_shift_left(Packet8uc a) {
+  return vshl_n_u8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet16uc plogical_shift_left(Packet16uc a) {
+  return vshlq_n_u8(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4s plogical_shift_left(Packet4s a) {
+  return vshl_n_s16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8s plogical_shift_left(Packet8s a) {
+  return vshlq_n_s16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4us plogical_shift_left(Packet4us a) {
+  return vshl_n_u16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet8us plogical_shift_left(Packet8us a) {
+  return vshlq_n_u16(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2i plogical_shift_left(Packet2i a) {
+  return vshl_n_s32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_left(Packet4i a) {
+  return vshlq_n_s32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ui plogical_shift_left(Packet2ui a) {
+  return vshl_n_u32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(Packet4ui a) {
+  return vshlq_n_u32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_left(Packet2l a) {
+  return vshlq_n_s64(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2ul plogical_shift_left(Packet2ul a) {
+  return vshlq_n_u64(a, N);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pload<Packet2f>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_f32(assume_aligned<unpacket_traits<Packet2f>::alignment>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(assume_aligned<unpacket_traits<Packet4f>::alignment>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pload<Packet4c>(const int8_t* from) {
+  Packet4c res;
+  memcpy(&res, from, sizeof(Packet4c));
+  return res;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pload<Packet8c>(const int8_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_s8(assume_aligned<unpacket_traits<Packet8c>::alignment>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pload<Packet16c>(const int8_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s8(assume_aligned<unpacket_traits<Packet16c>::alignment>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pload<Packet4uc>(const uint8_t* from) {
+  Packet4uc res;
+  memcpy(&res, from, sizeof(Packet4uc));
+  return res;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pload<Packet8uc>(const uint8_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_u8(assume_aligned<unpacket_traits<Packet8uc>::alignment>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pload<Packet16uc>(const uint8_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u8(assume_aligned<unpacket_traits<Packet16uc>::alignment>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pload<Packet4s>(const int16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_s16(assume_aligned<unpacket_traits<Packet4s>::alignment>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pload<Packet8s>(const int16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s16(assume_aligned<unpacket_traits<Packet8s>::alignment>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pload<Packet4us>(const uint16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_u16(assume_aligned<unpacket_traits<Packet4us>::alignment>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pload<Packet8us>(const uint16_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u16(assume_aligned<unpacket_traits<Packet8us>::alignment>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pload<Packet2i>(const int32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_s32(assume_aligned<unpacket_traits<Packet2i>::alignment>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(assume_aligned<unpacket_traits<Packet4i>::alignment>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pload<Packet2ui>(const uint32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_u32(assume_aligned<unpacket_traits<Packet2ui>::alignment>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u32(assume_aligned<unpacket_traits<Packet4ui>::alignment>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s64(assume_aligned<unpacket_traits<Packet2l>::alignment>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pload<Packet2ul>(const uint64_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_u64(assume_aligned<unpacket_traits<Packet2ul>::alignment>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f ploadu<Packet2f>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c ploadu<Packet4c>(const int8_t* from) {
+  Packet4c res;
+  memcpy(&res, from, sizeof(Packet4c));
+  return res;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c ploadu<Packet8c>(const int8_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c ploadu<Packet16c>(const int8_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc ploadu<Packet4uc>(const uint8_t* from) {
+  Packet4uc res;
+  memcpy(&res, from, sizeof(Packet4uc));
+  return res;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc ploadu<Packet8uc>(const uint8_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploadu<Packet16uc>(const uint8_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u8(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s ploadu<Packet4s>(const int16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s ploadu<Packet8s>(const int16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us ploadu<Packet4us>(const uint16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us ploadu<Packet8us>(const uint16_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u16(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i ploadu<Packet2i>(const int32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui ploadu<Packet2ui>(const uint32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_u32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s64(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul ploadu<Packet2ul>(const uint64_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_u64(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f ploaddup<Packet2f>(const float* from) {
+  return vld1_dup_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
+  return vcombine_f32(vld1_dup_f32(from), vld1_dup_f32(from + 1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c ploaddup<Packet4c>(const int8_t* from) {
+  const int8x8_t a = vreinterpret_s8_s32(vdup_n_s32(pload<Packet4c>(from)));
+  return vget_lane_s32(vreinterpret_s32_s8(vzip_s8(a, a).val[0]), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c ploaddup<Packet8c>(const int8_t* from) {
+  const int8x8_t a = vld1_s8(from);
+  return vzip_s8(a, a).val[0];
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c ploaddup<Packet16c>(const int8_t* from) {
+  const int8x8_t a = vld1_s8(from);
+  const int8x8x2_t b = vzip_s8(a, a);
+  return vcombine_s8(b.val[0], b.val[1]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc ploaddup<Packet4uc>(const uint8_t* from) {
+  const uint8x8_t a = vreinterpret_u8_u32(vdup_n_u32(pload<Packet4uc>(from)));
+  return vget_lane_u32(vreinterpret_u32_u8(vzip_u8(a, a).val[0]), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc ploaddup<Packet8uc>(const uint8_t* from) {
+  const uint8x8_t a = vld1_u8(from);
+  return vzip_u8(a, a).val[0];
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploaddup<Packet16uc>(const uint8_t* from) {
+  const uint8x8_t a = vld1_u8(from);
+  const uint8x8x2_t b = vzip_u8(a, a);
+  return vcombine_u8(b.val[0], b.val[1]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s ploaddup<Packet4s>(const int16_t* from) {
+  return vreinterpret_s16_u32(
+      vzip_u32(vreinterpret_u32_s16(vld1_dup_s16(from)), vreinterpret_u32_s16(vld1_dup_s16(from + 1))).val[0]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s ploaddup<Packet8s>(const int16_t* from) {
+  const int16x4_t a = vld1_s16(from);
+  const int16x4x2_t b = vzip_s16(a, a);
+  return vcombine_s16(b.val[0], b.val[1]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us ploaddup<Packet4us>(const uint16_t* from) {
+  return vreinterpret_u16_u32(
+      vzip_u32(vreinterpret_u32_u16(vld1_dup_u16(from)), vreinterpret_u32_u16(vld1_dup_u16(from + 1))).val[0]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us ploaddup<Packet8us>(const uint16_t* from) {
+  const uint16x4_t a = vld1_u16(from);
+  const uint16x4x2_t b = vzip_u16(a, a);
+  return vcombine_u16(b.val[0], b.val[1]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i ploaddup<Packet2i>(const int32_t* from) {
+  return vld1_dup_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int32_t* from) {
+  return vcombine_s32(vld1_dup_s32(from), vld1_dup_s32(from + 1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui ploaddup<Packet2ui>(const uint32_t* from) {
+  return vld1_dup_u32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
+  return vcombine_u32(vld1_dup_u32(from), vld1_dup_u32(from + 1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
+  return vld1q_dup_s64(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul ploaddup<Packet2ul>(const uint64_t* from) {
+  return vld1q_dup_u64(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadquad<Packet4f>(const float* from) {
+  return vld1q_dup_f32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c ploadquad<Packet4c>(const int8_t* from) {
+  return vget_lane_s32(vreinterpret_s32_s8(vld1_dup_s8(from)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c ploadquad<Packet8c>(const int8_t* from) {
+  return vreinterpret_s8_u32(
+      vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c ploadquad<Packet16c>(const int8_t* from) {
+  const int8x8_t a = vreinterpret_s8_u32(
+      vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from)), vreinterpret_u32_s8(vld1_dup_s8(from + 1))).val[0]);
+  const int8x8_t b = vreinterpret_s8_u32(
+      vzip_u32(vreinterpret_u32_s8(vld1_dup_s8(from + 2)), vreinterpret_u32_s8(vld1_dup_s8(from + 3))).val[0]);
+  return vcombine_s8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc ploadquad<Packet4uc>(const uint8_t* from) {
+  return vget_lane_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc ploadquad<Packet8uc>(const uint8_t* from) {
+  return vreinterpret_u8_u32(
+      vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc ploadquad<Packet16uc>(const uint8_t* from) {
+  const uint8x8_t a = vreinterpret_u8_u32(
+      vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from)), vreinterpret_u32_u8(vld1_dup_u8(from + 1))).val[0]);
+  const uint8x8_t b = vreinterpret_u8_u32(
+      vzip_u32(vreinterpret_u32_u8(vld1_dup_u8(from + 2)), vreinterpret_u32_u8(vld1_dup_u8(from + 3))).val[0]);
+  return vcombine_u8(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s ploadquad<Packet8s>(const int16_t* from) {
+  return vcombine_s16(vld1_dup_s16(from), vld1_dup_s16(from + 1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us ploadquad<Packet8us>(const uint16_t* from) {
+  return vcombine_u16(vld1_dup_u16(from), vld1_dup_u16(from + 1));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadquad<Packet4i>(const int32_t* from) {
+  return vld1q_dup_s32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploadquad<Packet4ui>(const uint32_t* from) {
+  return vld1q_dup_u32(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet2f& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1_f32(assume_aligned<unpacket_traits<Packet2f>::alignment>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(assume_aligned<unpacket_traits<Packet4f>::alignment>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet4c& from) {
+  memcpy(to, &from, sizeof(from));
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet8c& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1_s8(assume_aligned<unpacket_traits<Packet8c>::alignment>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int8_t>(int8_t* to, const Packet16c& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_s8(assume_aligned<unpacket_traits<Packet16c>::alignment>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet4uc& from) {
+  memcpy(to, &from, sizeof(from));
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet8uc& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1_u8(assume_aligned<unpacket_traits<Packet8uc>::alignment>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint8_t>(uint8_t* to, const Packet16uc& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_u8(assume_aligned<unpacket_traits<Packet16uc>::alignment>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet4s& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1_s16(assume_aligned<unpacket_traits<Packet4s>::alignment>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int16_t>(int16_t* to, const Packet8s& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_s16(assume_aligned<unpacket_traits<Packet8s>::alignment>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet4us& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1_u16(assume_aligned<unpacket_traits<Packet4us>::alignment>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint16_t>(uint16_t* to, const Packet8us& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_u16(assume_aligned<unpacket_traits<Packet8us>::alignment>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet2i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1_s32(assume_aligned<unpacket_traits<Packet2i>::alignment>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int32_t>(int32_t* to, const Packet4i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(assume_aligned<unpacket_traits<Packet4i>::alignment>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet2ui& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1_u32(assume_aligned<unpacket_traits<Packet2ui>::alignment>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_u32(assume_aligned<unpacket_traits<Packet4ui>::alignment>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_s64(assume_aligned<unpacket_traits<Packet2l>::alignment>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint64_t>(uint64_t* to, const Packet2ul& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_u64(assume_aligned<unpacket_traits<Packet2ul>::alignment>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet2f& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_f32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet4c& from) {
+  memcpy(to, &from, sizeof(from));
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet8c& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_s8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int8_t>(int8_t* to, const Packet16c& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_s8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet4uc& from) {
+  memcpy(to, &from, sizeof(from));
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet8uc& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_u8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint8_t>(uint8_t* to, const Packet16uc& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_u8(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet4s& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_s16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int16_t>(int16_t* to, const Packet8s& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_s16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet4us& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint16_t>(uint16_t* to, const Packet8us& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_u16(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet2i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_s32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int32_t>(int32_t* to, const Packet4i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet2ui& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_u32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_u32(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_s64(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint64_t>(uint64_t* to, const Packet2ul& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_u64(to, from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pgather<float, Packet2f>(const float* from, Index stride) {
+  Packet2f res = vld1_dup_f32(from);
+  res = vld1_lane_f32(from + 1 * stride, res, 1);
+  return res;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
+  Packet4f res = vld1q_dup_f32(from);
+  res = vld1q_lane_f32(from + 1 * stride, res, 1);
+  res = vld1q_lane_f32(from + 2 * stride, res, 2);
+  res = vld1q_lane_f32(from + 3 * stride, res, 3);
+  return res;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c pgather<int8_t, Packet4c>(const int8_t* from, Index stride) {
+  Packet4c res;
+  for (int i = 0; i != 4; i++) reinterpret_cast<int8_t*>(&res)[i] = *(from + i * stride);
+  return res;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pgather<int8_t, Packet8c>(const int8_t* from, Index stride) {
+  Packet8c res = vld1_dup_s8(from);
+  res = vld1_lane_s8(from + 1 * stride, res, 1);
+  res = vld1_lane_s8(from + 2 * stride, res, 2);
+  res = vld1_lane_s8(from + 3 * stride, res, 3);
+  res = vld1_lane_s8(from + 4 * stride, res, 4);
+  res = vld1_lane_s8(from + 5 * stride, res, 5);
+  res = vld1_lane_s8(from + 6 * stride, res, 6);
+  res = vld1_lane_s8(from + 7 * stride, res, 7);
+  return res;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pgather<int8_t, Packet16c>(const int8_t* from, Index stride) {
+  Packet16c res = vld1q_dup_s8(from);
+  res = vld1q_lane_s8(from + 1 * stride, res, 1);
+  res = vld1q_lane_s8(from + 2 * stride, res, 2);
+  res = vld1q_lane_s8(from + 3 * stride, res, 3);
+  res = vld1q_lane_s8(from + 4 * stride, res, 4);
+  res = vld1q_lane_s8(from + 5 * stride, res, 5);
+  res = vld1q_lane_s8(from + 6 * stride, res, 6);
+  res = vld1q_lane_s8(from + 7 * stride, res, 7);
+  res = vld1q_lane_s8(from + 8 * stride, res, 8);
+  res = vld1q_lane_s8(from + 9 * stride, res, 9);
+  res = vld1q_lane_s8(from + 10 * stride, res, 10);
+  res = vld1q_lane_s8(from + 11 * stride, res, 11);
+  res = vld1q_lane_s8(from + 12 * stride, res, 12);
+  res = vld1q_lane_s8(from + 13 * stride, res, 13);
+  res = vld1q_lane_s8(from + 14 * stride, res, 14);
+  res = vld1q_lane_s8(from + 15 * stride, res, 15);
+  return res;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc pgather<uint8_t, Packet4uc>(const uint8_t* from, Index stride) {
+  Packet4uc res;
+  for (int i = 0; i != 4; i++) reinterpret_cast<uint8_t*>(&res)[i] = *(from + i * stride);
+  return res;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pgather<uint8_t, Packet8uc>(const uint8_t* from, Index stride) {
+  Packet8uc res = vld1_dup_u8(from);
+  res = vld1_lane_u8(from + 1 * stride, res, 1);
+  res = vld1_lane_u8(from + 2 * stride, res, 2);
+  res = vld1_lane_u8(from + 3 * stride, res, 3);
+  res = vld1_lane_u8(from + 4 * stride, res, 4);
+  res = vld1_lane_u8(from + 5 * stride, res, 5);
+  res = vld1_lane_u8(from + 6 * stride, res, 6);
+  res = vld1_lane_u8(from + 7 * stride, res, 7);
+  return res;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pgather<uint8_t, Packet16uc>(const uint8_t* from, Index stride) {
+  Packet16uc res = vld1q_dup_u8(from);
+  res = vld1q_lane_u8(from + 1 * stride, res, 1);
+  res = vld1q_lane_u8(from + 2 * stride, res, 2);
+  res = vld1q_lane_u8(from + 3 * stride, res, 3);
+  res = vld1q_lane_u8(from + 4 * stride, res, 4);
+  res = vld1q_lane_u8(from + 5 * stride, res, 5);
+  res = vld1q_lane_u8(from + 6 * stride, res, 6);
+  res = vld1q_lane_u8(from + 7 * stride, res, 7);
+  res = vld1q_lane_u8(from + 8 * stride, res, 8);
+  res = vld1q_lane_u8(from + 9 * stride, res, 9);
+  res = vld1q_lane_u8(from + 10 * stride, res, 10);
+  res = vld1q_lane_u8(from + 11 * stride, res, 11);
+  res = vld1q_lane_u8(from + 12 * stride, res, 12);
+  res = vld1q_lane_u8(from + 13 * stride, res, 13);
+  res = vld1q_lane_u8(from + 14 * stride, res, 14);
+  res = vld1q_lane_u8(from + 15 * stride, res, 15);
+  return res;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pgather<int16_t, Packet4s>(const int16_t* from, Index stride) {
+  Packet4s res = vld1_dup_s16(from);
+  res = vld1_lane_s16(from + 1 * stride, res, 1);
+  res = vld1_lane_s16(from + 2 * stride, res, 2);
+  res = vld1_lane_s16(from + 3 * stride, res, 3);
+  return res;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pgather<int16_t, Packet8s>(const int16_t* from, Index stride) {
+  Packet8s res = vld1q_dup_s16(from);
+  res = vld1q_lane_s16(from + 1 * stride, res, 1);
+  res = vld1q_lane_s16(from + 2 * stride, res, 2);
+  res = vld1q_lane_s16(from + 3 * stride, res, 3);
+  res = vld1q_lane_s16(from + 4 * stride, res, 4);
+  res = vld1q_lane_s16(from + 5 * stride, res, 5);
+  res = vld1q_lane_s16(from + 6 * stride, res, 6);
+  res = vld1q_lane_s16(from + 7 * stride, res, 7);
+  return res;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pgather<uint16_t, Packet4us>(const uint16_t* from, Index stride) {
+  Packet4us res = vld1_dup_u16(from);
+  res = vld1_lane_u16(from + 1 * stride, res, 1);
+  res = vld1_lane_u16(from + 2 * stride, res, 2);
+  res = vld1_lane_u16(from + 3 * stride, res, 3);
+  return res;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pgather<uint16_t, Packet8us>(const uint16_t* from, Index stride) {
+  Packet8us res = vld1q_dup_u16(from);
+  res = vld1q_lane_u16(from + 1 * stride, res, 1);
+  res = vld1q_lane_u16(from + 2 * stride, res, 2);
+  res = vld1q_lane_u16(from + 3 * stride, res, 3);
+  res = vld1q_lane_u16(from + 4 * stride, res, 4);
+  res = vld1q_lane_u16(from + 5 * stride, res, 5);
+  res = vld1q_lane_u16(from + 6 * stride, res, 6);
+  res = vld1q_lane_u16(from + 7 * stride, res, 7);
+  return res;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pgather<int32_t, Packet2i>(const int32_t* from, Index stride) {
+  Packet2i res = vld1_dup_s32(from);
+  res = vld1_lane_s32(from + 1 * stride, res, 1);
+  return res;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pgather<int32_t, Packet4i>(const int32_t* from, Index stride) {
+  Packet4i res = vld1q_dup_s32(from);
+  res = vld1q_lane_s32(from + 1 * stride, res, 1);
+  res = vld1q_lane_s32(from + 2 * stride, res, 2);
+  res = vld1q_lane_s32(from + 3 * stride, res, 3);
+  return res;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pgather<uint32_t, Packet2ui>(const uint32_t* from, Index stride) {
+  Packet2ui res = vld1_dup_u32(from);
+  res = vld1_lane_u32(from + 1 * stride, res, 1);
+  return res;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
+  Packet4ui res = vld1q_dup_u32(from);
+  res = vld1q_lane_u32(from + 1 * stride, res, 1);
+  res = vld1q_lane_u32(from + 2 * stride, res, 2);
+  res = vld1q_lane_u32(from + 3 * stride, res, 3);
+  return res;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
+  Packet2l res = vld1q_dup_s64(from);
+  res = vld1q_lane_s64(from + 1 * stride, res, 1);
+  return res;
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pgather<uint64_t, Packet2ul>(const uint64_t* from, Index stride) {
+  Packet2ul res = vld1q_dup_u64(from);
+  res = vld1q_lane_u64(from + 1 * stride, res, 1);
+  return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet2f>(float* to, const Packet2f& from, Index stride) {
+  vst1_lane_f32(to + stride * 0, from, 0);
+  vst1_lane_f32(to + stride * 1, from, 1);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
+  vst1q_lane_f32(to + stride * 0, from, 0);
+  vst1q_lane_f32(to + stride * 1, from, 1);
+  vst1q_lane_f32(to + stride * 2, from, 2);
+  vst1q_lane_f32(to + stride * 3, from, 3);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet4c>(int8_t* to, const Packet4c& from, Index stride) {
+  for (int i = 0; i != 4; i++) *(to + i * stride) = reinterpret_cast<const int8_t*>(&from)[i];
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet8c>(int8_t* to, const Packet8c& from, Index stride) {
+  vst1_lane_s8(to + stride * 0, from, 0);
+  vst1_lane_s8(to + stride * 1, from, 1);
+  vst1_lane_s8(to + stride * 2, from, 2);
+  vst1_lane_s8(to + stride * 3, from, 3);
+  vst1_lane_s8(to + stride * 4, from, 4);
+  vst1_lane_s8(to + stride * 5, from, 5);
+  vst1_lane_s8(to + stride * 6, from, 6);
+  vst1_lane_s8(to + stride * 7, from, 7);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int8_t, Packet16c>(int8_t* to, const Packet16c& from,
+                                                                       Index stride) {
+  vst1q_lane_s8(to + stride * 0, from, 0);
+  vst1q_lane_s8(to + stride * 1, from, 1);
+  vst1q_lane_s8(to + stride * 2, from, 2);
+  vst1q_lane_s8(to + stride * 3, from, 3);
+  vst1q_lane_s8(to + stride * 4, from, 4);
+  vst1q_lane_s8(to + stride * 5, from, 5);
+  vst1q_lane_s8(to + stride * 6, from, 6);
+  vst1q_lane_s8(to + stride * 7, from, 7);
+  vst1q_lane_s8(to + stride * 8, from, 8);
+  vst1q_lane_s8(to + stride * 9, from, 9);
+  vst1q_lane_s8(to + stride * 10, from, 10);
+  vst1q_lane_s8(to + stride * 11, from, 11);
+  vst1q_lane_s8(to + stride * 12, from, 12);
+  vst1q_lane_s8(to + stride * 13, from, 13);
+  vst1q_lane_s8(to + stride * 14, from, 14);
+  vst1q_lane_s8(to + stride * 15, from, 15);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet4uc>(uint8_t* to, const Packet4uc& from,
+                                                                        Index stride) {
+  for (int i = 0; i != 4; i++) *(to + i * stride) = reinterpret_cast<const uint8_t*>(&from)[i];
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet8uc>(uint8_t* to, const Packet8uc& from,
+                                                                        Index stride) {
+  vst1_lane_u8(to + stride * 0, from, 0);
+  vst1_lane_u8(to + stride * 1, from, 1);
+  vst1_lane_u8(to + stride * 2, from, 2);
+  vst1_lane_u8(to + stride * 3, from, 3);
+  vst1_lane_u8(to + stride * 4, from, 4);
+  vst1_lane_u8(to + stride * 5, from, 5);
+  vst1_lane_u8(to + stride * 6, from, 6);
+  vst1_lane_u8(to + stride * 7, from, 7);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint8_t, Packet16uc>(uint8_t* to, const Packet16uc& from,
+                                                                         Index stride) {
+  vst1q_lane_u8(to + stride * 0, from, 0);
+  vst1q_lane_u8(to + stride * 1, from, 1);
+  vst1q_lane_u8(to + stride * 2, from, 2);
+  vst1q_lane_u8(to + stride * 3, from, 3);
+  vst1q_lane_u8(to + stride * 4, from, 4);
+  vst1q_lane_u8(to + stride * 5, from, 5);
+  vst1q_lane_u8(to + stride * 6, from, 6);
+  vst1q_lane_u8(to + stride * 7, from, 7);
+  vst1q_lane_u8(to + stride * 8, from, 8);
+  vst1q_lane_u8(to + stride * 9, from, 9);
+  vst1q_lane_u8(to + stride * 10, from, 10);
+  vst1q_lane_u8(to + stride * 11, from, 11);
+  vst1q_lane_u8(to + stride * 12, from, 12);
+  vst1q_lane_u8(to + stride * 13, from, 13);
+  vst1q_lane_u8(to + stride * 14, from, 14);
+  vst1q_lane_u8(to + stride * 15, from, 15);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet4s>(int16_t* to, const Packet4s& from,
+                                                                       Index stride) {
+  vst1_lane_s16(to + stride * 0, from, 0);
+  vst1_lane_s16(to + stride * 1, from, 1);
+  vst1_lane_s16(to + stride * 2, from, 2);
+  vst1_lane_s16(to + stride * 3, from, 3);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int16_t, Packet8s>(int16_t* to, const Packet8s& from,
+                                                                       Index stride) {
+  vst1q_lane_s16(to + stride * 0, from, 0);
+  vst1q_lane_s16(to + stride * 1, from, 1);
+  vst1q_lane_s16(to + stride * 2, from, 2);
+  vst1q_lane_s16(to + stride * 3, from, 3);
+  vst1q_lane_s16(to + stride * 4, from, 4);
+  vst1q_lane_s16(to + stride * 5, from, 5);
+  vst1q_lane_s16(to + stride * 6, from, 6);
+  vst1q_lane_s16(to + stride * 7, from, 7);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet4us>(uint16_t* to, const Packet4us& from,
+                                                                         Index stride) {
+  vst1_lane_u16(to + stride * 0, from, 0);
+  vst1_lane_u16(to + stride * 1, from, 1);
+  vst1_lane_u16(to + stride * 2, from, 2);
+  vst1_lane_u16(to + stride * 3, from, 3);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint16_t, Packet8us>(uint16_t* to, const Packet8us& from,
+                                                                         Index stride) {
+  vst1q_lane_u16(to + stride * 0, from, 0);
+  vst1q_lane_u16(to + stride * 1, from, 1);
+  vst1q_lane_u16(to + stride * 2, from, 2);
+  vst1q_lane_u16(to + stride * 3, from, 3);
+  vst1q_lane_u16(to + stride * 4, from, 4);
+  vst1q_lane_u16(to + stride * 5, from, 5);
+  vst1q_lane_u16(to + stride * 6, from, 6);
+  vst1q_lane_u16(to + stride * 7, from, 7);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet2i>(int32_t* to, const Packet2i& from,
+                                                                       Index stride) {
+  vst1_lane_s32(to + stride * 0, from, 0);
+  vst1_lane_s32(to + stride * 1, from, 1);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int32_t, Packet4i>(int32_t* to, const Packet4i& from,
+                                                                       Index stride) {
+  vst1q_lane_s32(to + stride * 0, from, 0);
+  vst1q_lane_s32(to + stride * 1, from, 1);
+  vst1q_lane_s32(to + stride * 2, from, 2);
+  vst1q_lane_s32(to + stride * 3, from, 3);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet2ui>(uint32_t* to, const Packet2ui& from,
+                                                                         Index stride) {
+  vst1_lane_u32(to + stride * 0, from, 0);
+  vst1_lane_u32(to + stride * 1, from, 1);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from,
+                                                                         Index stride) {
+  vst1q_lane_u32(to + stride * 0, from, 0);
+  vst1q_lane_u32(to + stride * 1, from, 1);
+  vst1q_lane_u32(to + stride * 2, from, 2);
+  vst1q_lane_u32(to + stride * 3, from, 3);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from,
+                                                                       Index stride) {
+  vst1q_lane_s64(to + stride * 0, from, 0);
+  vst1q_lane_s64(to + stride * 1, from, 1);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<uint64_t, Packet2ul>(uint64_t* to, const Packet2ul& from,
+                                                                         Index stride) {
+  vst1q_lane_u64(to + stride * 0, from, 0);
+  vst1q_lane_u64(to + stride * 1, from, 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int8_t>(const int8_t* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint8_t>(const uint8_t* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int16_t>(const int16_t* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint16_t>(const uint16_t* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int32_t>(const int32_t* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint64_t>(const uint64_t* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet2f>(const Packet2f& a) {
+  return vget_lane_f32(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  return vgetq_lane_f32(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t pfirst<Packet4c>(const Packet4c& a) {
+  return static_cast<int8_t>(a & 0xff);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t pfirst<Packet8c>(const Packet8c& a) {
+  return vget_lane_s8(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t pfirst<Packet16c>(const Packet16c& a) {
+  return vgetq_lane_s8(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t pfirst<Packet4uc>(const Packet4uc& a) {
+  return static_cast<uint8_t>(a & 0xff);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t pfirst<Packet8uc>(const Packet8uc& a) {
+  return vget_lane_u8(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t pfirst<Packet16uc>(const Packet16uc& a) {
+  return vgetq_lane_u8(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t pfirst<Packet4s>(const Packet4s& a) {
+  return vget_lane_s16(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t pfirst<Packet8s>(const Packet8s& a) {
+  return vgetq_lane_s16(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t pfirst<Packet4us>(const Packet4us& a) {
+  return vget_lane_u16(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t pfirst<Packet8us>(const Packet8us& a) {
+  return vgetq_lane_u16(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t pfirst<Packet2i>(const Packet2i& a) {
+  return vget_lane_s32(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t pfirst<Packet4i>(const Packet4i& a) {
+  return vgetq_lane_s32(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet2ui>(const Packet2ui& a) {
+  return vget_lane_u32(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
+  return vgetq_lane_u32(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
+  return vgetq_lane_s64(a, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t pfirst<Packet2ul>(const Packet2ul& a) {
+  return vgetq_lane_u64(a, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f preverse(const Packet2f& a) {
+  return vrev64_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+  const float32x4_t a_r64 = vrev64q_f32(a);
+  return vcombine_f32(vget_high_f32(a_r64), vget_low_f32(a_r64));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c preverse(const Packet4c& a) {
+  return vget_lane_s32(vreinterpret_s32_s8(vrev64_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c preverse(const Packet8c& a) {
+  return vrev64_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c preverse(const Packet16c& a) {
+  const int8x16_t a_r64 = vrev64q_s8(a);
+  return vcombine_s8(vget_high_s8(a_r64), vget_low_s8(a_r64));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc preverse(const Packet4uc& a) {
+  return vget_lane_u32(vreinterpret_u32_u8(vrev64_u8(vreinterpret_u8_u32(vdup_n_u32(a)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc preverse(const Packet8uc& a) {
+  return vrev64_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc preverse(const Packet16uc& a) {
+  const uint8x16_t a_r64 = vrev64q_u8(a);
+  return vcombine_u8(vget_high_u8(a_r64), vget_low_u8(a_r64));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s preverse(const Packet4s& a) {
+  return vrev64_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s preverse(const Packet8s& a) {
+  const int16x8_t a_r64 = vrev64q_s16(a);
+  return vcombine_s16(vget_high_s16(a_r64), vget_low_s16(a_r64));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us preverse(const Packet4us& a) {
+  return vrev64_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us preverse(const Packet8us& a) {
+  const uint16x8_t a_r64 = vrev64q_u16(a);
+  return vcombine_u16(vget_high_u16(a_r64), vget_low_u16(a_r64));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i preverse(const Packet2i& a) {
+  return vrev64_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+  const int32x4_t a_r64 = vrev64q_s32(a);
+  return vcombine_s32(vget_high_s32(a_r64), vget_low_s32(a_r64));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui preverse(const Packet2ui& a) {
+  return vrev64_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
+  const uint32x4_t a_r64 = vrev64q_u32(a);
+  return vcombine_u32(vget_high_u32(a_r64), vget_low_u32(a_r64));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
+  return vcombine_s64(vget_high_s64(a), vget_low_s64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul preverse(const Packet2ul& a) {
+  return vcombine_u64(vget_high_u64(a), vget_low_u64(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pabs(const Packet2f& a) {
+  return vabs_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
+  return vabsq_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pabs<Packet4c>(const Packet4c& a) {
+  return vget_lane_s32(vreinterpret_s32_s8(vabs_s8(vreinterpret_s8_s32(vdup_n_s32(a)))), 0);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pabs(const Packet8c& a) {
+  return vabs_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c pabs(const Packet16c& a) {
+  return vabsq_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pabs(const Packet4uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pabs(const Packet8uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc pabs(const Packet16uc& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pabs(const Packet4s& a) {
+  return vabs_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pabs(const Packet8s& a) {
+  return vabsq_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pabs(const Packet4us& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pabs(const Packet8us& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pabs(const Packet2i& a) {
+  return vabs_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
+  return vabsq_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pabs(const Packet2ui& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
+#if EIGEN_ARCH_ARM64
+  return vabsq_s64(a);
+#else
+  return vcombine_s64(vdup_n_s64((std::abs)(vgetq_lane_s64(a, 0))), vdup_n_s64((std::abs)(vgetq_lane_s64(a, 1))));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pabs(const Packet2ul& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f psignbit(const Packet2f& a) {
+  return vreinterpret_f32_s32(vshr_n_s32(vreinterpret_s32_f32(a), 31));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
+  return vreinterpretq_f32_s32(vshrq_n_s32(vreinterpretq_s32_f32(a), 31));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pfrexp<Packet2f>(const Packet2f& a, Packet2f& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pldexp<Packet2f>(const Packet2f& a, const Packet2f& exponent) {
+  return pldexp_generic(a, exponent);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+#if EIGEN_ARCH_ARM64
+template <>
+EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) {
+  return vaddv_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
+  return vaddvq_f32(a);
+}
+#else
+template <>
+EIGEN_STRONG_INLINE float predux<Packet2f>(const Packet2f& a) {
+  return vget_lane_f32(vpadd_f32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
+  const float32x2_t sum = vadd_f32(vget_low_f32(a), vget_high_f32(a));
+  return vget_lane_f32(vpadd_f32(sum, sum), 0);
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE int8_t predux<Packet4c>(const Packet4c& a) {
+  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
+  int8x8_t sum = vpadd_s8(a_dup, a_dup);
+  sum = vpadd_s8(sum, sum);
+  return vget_lane_s8(sum, 0);
+}
+#if EIGEN_ARCH_ARM64
+template <>
+EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a) {
+  return vaddv_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) {
+  return vaddvq_s8(a);
+}
+#else
+template <>
+EIGEN_STRONG_INLINE int8_t predux<Packet8c>(const Packet8c& a) {
+  int8x8_t sum = vpadd_s8(a, a);
+  sum = vpadd_s8(sum, sum);
+  sum = vpadd_s8(sum, sum);
+  return vget_lane_s8(sum, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux<Packet16c>(const Packet16c& a) {
+  int8x8_t sum = vadd_s8(vget_low_s8(a), vget_high_s8(a));
+  sum = vpadd_s8(sum, sum);
+  sum = vpadd_s8(sum, sum);
+  sum = vpadd_s8(sum, sum);
+  return vget_lane_s8(sum, 0);
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE uint8_t predux<Packet4uc>(const Packet4uc& a) {
+  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
+  uint8x8_t sum = vpadd_u8(a_dup, a_dup);
+  sum = vpadd_u8(sum, sum);
+  return vget_lane_u8(sum, 0);
+}
+#if EIGEN_ARCH_ARM64
+template <>
+EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a) {
+  return vaddv_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) {
+  return vaddvq_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a) {
+  return vaddv_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) {
+  return vaddvq_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a) {
+  return vaddv_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) {
+  return vaddvq_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) {
+  return vaddv_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
+  return vaddvq_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) {
+  return vaddv_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
+  return vaddvq_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
+  return vaddvq_s64(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
+  return vaddvq_u64(a);
+}
+#else
+template <>
+EIGEN_STRONG_INLINE uint8_t predux<Packet8uc>(const Packet8uc& a) {
+  uint8x8_t sum = vpadd_u8(a, a);
+  sum = vpadd_u8(sum, sum);
+  sum = vpadd_u8(sum, sum);
+  return vget_lane_u8(sum, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux<Packet16uc>(const Packet16uc& a) {
+  uint8x8_t sum = vadd_u8(vget_low_u8(a), vget_high_u8(a));
+  sum = vpadd_u8(sum, sum);
+  sum = vpadd_u8(sum, sum);
+  sum = vpadd_u8(sum, sum);
+  return vget_lane_u8(sum, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux<Packet4s>(const Packet4s& a) {
+  const int16x4_t sum = vpadd_s16(a, a);
+  return vget_lane_s16(vpadd_s16(sum, sum), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux<Packet8s>(const Packet8s& a) {
+  int16x4_t sum = vadd_s16(vget_low_s16(a), vget_high_s16(a));
+  sum = vpadd_s16(sum, sum);
+  sum = vpadd_s16(sum, sum);
+  return vget_lane_s16(sum, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux<Packet4us>(const Packet4us& a) {
+  const uint16x4_t sum = vpadd_u16(a, a);
+  return vget_lane_u16(vpadd_u16(sum, sum), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux<Packet8us>(const Packet8us& a) {
+  uint16x4_t sum = vadd_u16(vget_low_u16(a), vget_high_u16(a));
+  sum = vpadd_u16(sum, sum);
+  sum = vpadd_u16(sum, sum);
+  return vget_lane_u16(sum, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet2i>(const Packet2i& a) {
+  return vget_lane_s32(vpadd_s32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux<Packet4i>(const Packet4i& a) {
+  const int32x2_t sum = vadd_s32(vget_low_s32(a), vget_high_s32(a));
+  return vget_lane_s32(vpadd_s32(sum, sum), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet2ui>(const Packet2ui& a) {
+  return vget_lane_u32(vpadd_u32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux<Packet4ui>(const Packet4ui& a) {
+  const uint32x2_t sum = vadd_u32(vget_low_u32(a), vget_high_u32(a));
+  return vget_lane_u32(vpadd_u32(sum, sum), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int64_t predux<Packet2l>(const Packet2l& a) {
+  return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux<Packet2ul>(const Packet2ul& a) {
+  return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
+}
+#endif
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4c predux_half_dowto4(const Packet8c& a) {
+  return vget_lane_s32(vreinterpret_s32_s8(vadd_s8(a, vreinterpret_s8_s32(vrev64_s32(vreinterpret_s32_s8(a))))), 0);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c predux_half_dowto4(const Packet16c& a) {
+  return vadd_s8(vget_high_s8(a), vget_low_s8(a));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4uc predux_half_dowto4(const Packet8uc& a) {
+  return vget_lane_u32(vreinterpret_u32_u8(vadd_u8(a, vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(a))))), 0);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc predux_half_dowto4(const Packet16uc& a) {
+  return vadd_u8(vget_high_u8(a), vget_low_u8(a));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s predux_half_dowto4(const Packet8s& a) {
+  return vadd_s16(vget_high_s16(a), vget_low_s16(a));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us predux_half_dowto4(const Packet8us& a) {
+  return vadd_u16(vget_high_u16(a), vget_low_u16(a));
+}
+
+// Other reduction functions:
+// mul
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet2f>(const Packet2f& a) {
+  return vget_lane_f32(a, 0) * vget_lane_f32(a, 1);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
+  return predux_mul<Packet2f>(vmul_f32(vget_low_f32(a), vget_high_f32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux_mul<Packet4c>(const Packet4c& a) {
+  int8x8_t prod = vreinterpret_s8_s32(vdup_n_s32(a));
+  prod = vmul_s8(prod, vrev16_s8(prod));
+  return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 2);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux_mul<Packet8c>(const Packet8c& a) {
+  int8x8_t prod = vmul_s8(a, vrev16_s8(a));
+  prod = vmul_s8(prod, vrev32_s8(prod));
+  return vget_lane_s8(prod, 0) * vget_lane_s8(prod, 4);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux_mul<Packet16c>(const Packet16c& a) {
+  return predux_mul<Packet8c>(vmul_s8(vget_low_s8(a), vget_high_s8(a)));
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_mul<Packet4uc>(const Packet4uc& a) {
+  uint8x8_t prod = vreinterpret_u8_u32(vdup_n_u32(a));
+  prod = vmul_u8(prod, vrev16_u8(prod));
+  return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 2);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_mul<Packet8uc>(const Packet8uc& a) {
+  uint8x8_t prod = vmul_u8(a, vrev16_u8(a));
+  prod = vmul_u8(prod, vrev32_u8(prod));
+  return vget_lane_u8(prod, 0) * vget_lane_u8(prod, 4);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_mul<Packet16uc>(const Packet16uc& a) {
+  return predux_mul<Packet8uc>(vmul_u8(vget_low_u8(a), vget_high_u8(a)));
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_mul<Packet4s>(const Packet4s& a) {
+  const int16x4_t prod = vmul_s16(a, vrev32_s16(a));
+  return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_mul<Packet8s>(const Packet8s& a) {
+  int16x4_t prod;
+
+  // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
+  prod = vmul_s16(vget_low_s16(a), vget_high_s16(a));
+  // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
+  prod = vmul_s16(prod, vrev32_s16(prod));
+  // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
+  return vget_lane_s16(prod, 0) * vget_lane_s16(prod, 2);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_mul<Packet4us>(const Packet4us& a) {
+  const uint16x4_t prod = vmul_u16(a, vrev32_u16(a));
+  return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_mul<Packet8us>(const Packet8us& a) {
+  uint16x4_t prod;
+
+  // Get the product of a_lo * a_hi -> |a1*a5|a2*a6|a3*a7|a4*a8|
+  prod = vmul_u16(vget_low_u16(a), vget_high_u16(a));
+  // Swap and multiply |a1*a5*a2*a6|a3*a7*a4*a8|
+  prod = vmul_u16(prod, vrev32_u16(prod));
+  // Multiply |a1*a5*a2*a6*a3*a7*a4*a8|
+  return vget_lane_u16(prod, 0) * vget_lane_u16(prod, 2);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_mul<Packet2i>(const Packet2i& a) {
+  return vget_lane_s32(a, 0) * vget_lane_s32(a, 1);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_mul<Packet4i>(const Packet4i& a) {
+  return predux_mul<Packet2i>(vmul_s32(vget_low_s32(a), vget_high_s32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_mul<Packet2ui>(const Packet2ui& a) {
+  return vget_lane_u32(a, 0) * vget_lane_u32(a, 1);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_mul<Packet4ui>(const Packet4ui& a) {
+  return predux_mul<Packet2ui>(vmul_u32(vget_low_u32(a), vget_high_u32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE int64_t predux_mul<Packet2l>(const Packet2l& a) {
+  return vgetq_lane_s64(a, 0) * vgetq_lane_s64(a, 1);
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux_mul<Packet2ul>(const Packet2ul& a) {
+  return vgetq_lane_u64(a, 0) * vgetq_lane_u64(a, 1);
+}
+
+// min
+#if EIGEN_ARCH_ARM64
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a) {
+  return vminv_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
+  return vminvq_f32(a);
+}
+#else
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet2f>(const Packet2f& a) {
+  return vget_lane_f32(vpmin_f32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
+  const float32x2_t min = vmin_f32(vget_low_f32(a), vget_high_f32(a));
+  return vget_lane_f32(vpmin_f32(min, min), 0);
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE int8_t predux_min<Packet4c>(const Packet4c& a) {
+  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
+  int8x8_t min = vpmin_s8(a_dup, a_dup);
+  min = vpmin_s8(min, min);
+  return vget_lane_s8(min, 0);
+}
+#if EIGEN_ARCH_ARM64
+template <>
+EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a) {
+  return vminv_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) {
+  return vminvq_s8(a);
+}
+#else
+template <>
+EIGEN_STRONG_INLINE int8_t predux_min<Packet8c>(const Packet8c& a) {
+  int8x8_t min = vpmin_s8(a, a);
+  min = vpmin_s8(min, min);
+  min = vpmin_s8(min, min);
+  return vget_lane_s8(min, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux_min<Packet16c>(const Packet16c& a) {
+  int8x8_t min = vmin_s8(vget_low_s8(a), vget_high_s8(a));
+  min = vpmin_s8(min, min);
+  min = vpmin_s8(min, min);
+  min = vpmin_s8(min, min);
+  return vget_lane_s8(min, 0);
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_min<Packet4uc>(const Packet4uc& a) {
+  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
+  uint8x8_t min = vpmin_u8(a_dup, a_dup);
+  min = vpmin_u8(min, min);
+  return vget_lane_u8(min, 0);
+}
+#if EIGEN_ARCH_ARM64
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a) {
+  return vminv_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) {
+  return vminvq_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a) {
+  return vminv_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) {
+  return vminvq_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a) {
+  return vminv_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) {
+  return vminvq_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a) {
+  return vminv_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
+  return vminvq_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a) {
+  return vminv_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
+  return vminvq_u32(a);
+}
+#else
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_min<Packet8uc>(const Packet8uc& a) {
+  uint8x8_t min = vpmin_u8(a, a);
+  min = vpmin_u8(min, min);
+  min = vpmin_u8(min, min);
+  return vget_lane_u8(min, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_min<Packet16uc>(const Packet16uc& a) {
+  uint8x8_t min = vmin_u8(vget_low_u8(a), vget_high_u8(a));
+  min = vpmin_u8(min, min);
+  min = vpmin_u8(min, min);
+  min = vpmin_u8(min, min);
+  return vget_lane_u8(min, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_min<Packet4s>(const Packet4s& a) {
+  const int16x4_t min = vpmin_s16(a, a);
+  return vget_lane_s16(vpmin_s16(min, min), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_min<Packet8s>(const Packet8s& a) {
+  int16x4_t min = vmin_s16(vget_low_s16(a), vget_high_s16(a));
+  min = vpmin_s16(min, min);
+  min = vpmin_s16(min, min);
+  return vget_lane_s16(min, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_min<Packet4us>(const Packet4us& a) {
+  const uint16x4_t min = vpmin_u16(a, a);
+  return vget_lane_u16(vpmin_u16(min, min), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_min<Packet8us>(const Packet8us& a) {
+  uint16x4_t min = vmin_u16(vget_low_u16(a), vget_high_u16(a));
+  min = vpmin_u16(min, min);
+  min = vpmin_u16(min, min);
+  return vget_lane_u16(min, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet2i>(const Packet2i& a) {
+  return vget_lane_s32(vpmin_s32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_min<Packet4i>(const Packet4i& a) {
+  const int32x2_t min = vmin_s32(vget_low_s32(a), vget_high_s32(a));
+  return vget_lane_s32(vpmin_s32(min, min), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min<Packet2ui>(const Packet2ui& a) {
+  return vget_lane_u32(vpmin_u32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min<Packet4ui>(const Packet4ui& a) {
+  const uint32x2_t min = vmin_u32(vget_low_u32(a), vget_high_u32(a));
+  return vget_lane_u32(vpmin_u32(min, min), 0);
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE int64_t predux_min<Packet2l>(const Packet2l& a) {
+  return (std::min)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1));
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux_min<Packet2ul>(const Packet2ul& a) {
+  return (std::min)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1));
+}
+
+// max
+#if EIGEN_ARCH_ARM64
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a) {
+  return vmaxv_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
+  return vmaxvq_f32(a);
+}
+#else
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet2f>(const Packet2f& a) {
+  return vget_lane_f32(vpmax_f32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
+  const float32x2_t max = vmax_f32(vget_low_f32(a), vget_high_f32(a));
+  return vget_lane_f32(vpmax_f32(max, max), 0);
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE int8_t predux_max<Packet4c>(const Packet4c& a) {
+  const int8x8_t a_dup = vreinterpret_s8_s32(vdup_n_s32(a));
+  int8x8_t max = vpmax_s8(a_dup, a_dup);
+  max = vpmax_s8(max, max);
+  return vget_lane_s8(max, 0);
+}
+#if EIGEN_ARCH_ARM64
+template <>
+EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a) {
+  return vmaxv_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) {
+  return vmaxvq_s8(a);
+}
+#else
+template <>
+EIGEN_STRONG_INLINE int8_t predux_max<Packet8c>(const Packet8c& a) {
+  int8x8_t max = vpmax_s8(a, a);
+  max = vpmax_s8(max, max);
+  max = vpmax_s8(max, max);
+  return vget_lane_s8(max, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int8_t predux_max<Packet16c>(const Packet16c& a) {
+  int8x8_t max = vmax_s8(vget_low_s8(a), vget_high_s8(a));
+  max = vpmax_s8(max, max);
+  max = vpmax_s8(max, max);
+  max = vpmax_s8(max, max);
+  return vget_lane_s8(max, 0);
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_max<Packet4uc>(const Packet4uc& a) {
+  const uint8x8_t a_dup = vreinterpret_u8_u32(vdup_n_u32(a));
+  uint8x8_t max = vpmax_u8(a_dup, a_dup);
+  max = vpmax_u8(max, max);
+  return vget_lane_u8(max, 0);
+}
+#if EIGEN_ARCH_ARM64
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a) {
+  return vmaxv_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) {
+  return vmaxvq_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a) {
+  return vmaxv_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) {
+  return vmaxvq_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a) {
+  return vmaxv_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) {
+  return vmaxvq_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a) {
+  return vmaxv_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
+  return vmaxvq_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a) {
+  return vmaxv_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
+  return vmaxvq_u32(a);
+}
+#else
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_max<Packet8uc>(const Packet8uc& a) {
+  uint8x8_t max = vpmax_u8(a, a);
+  max = vpmax_u8(max, max);
+  max = vpmax_u8(max, max);
+  return vget_lane_u8(max, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint8_t predux_max<Packet16uc>(const Packet16uc& a) {
+  uint8x8_t max = vmax_u8(vget_low_u8(a), vget_high_u8(a));
+  max = vpmax_u8(max, max);
+  max = vpmax_u8(max, max);
+  max = vpmax_u8(max, max);
+  return vget_lane_u8(max, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_max<Packet4s>(const Packet4s& a) {
+  const int16x4_t max = vpmax_s16(a, a);
+  return vget_lane_s16(vpmax_s16(max, max), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int16_t predux_max<Packet8s>(const Packet8s& a) {
+  int16x4_t max = vmax_s16(vget_low_s16(a), vget_high_s16(a));
+  max = vpmax_s16(max, max);
+  max = vpmax_s16(max, max);
+  return vget_lane_s16(max, 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_max<Packet4us>(const Packet4us& a) {
+  const uint16x4_t max = vpmax_u16(a, a);
+  return vget_lane_u16(vpmax_u16(max, max), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint16_t predux_max<Packet8us>(const Packet8us& a) {
+  uint16x4_t max = vmax_u16(vget_low_u16(a), vget_high_u16(a));
+  max = vpmax_u16(max, max);
+  max = vpmax_u16(max, max);
+  return vget_lane_u16(max, 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet2i>(const Packet2i& a) {
+  return vget_lane_s32(vpmax_s32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE int32_t predux_max<Packet4i>(const Packet4i& a) {
+  const int32x2_t max = vmax_s32(vget_low_s32(a), vget_high_s32(a));
+  return vget_lane_s32(vpmax_s32(max, max), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max<Packet2ui>(const Packet2ui& a) {
+  return vget_lane_u32(vpmax_u32(a, a), 0);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max<Packet4ui>(const Packet4ui& a) {
+  const uint32x2_t max = vmax_u32(vget_low_u32(a), vget_high_u32(a));
+  return vget_lane_u32(vpmax_u32(max, max), 0);
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE int64_t predux_max<Packet2l>(const Packet2l& a) {
+  return (std::max)(vgetq_lane_s64(a, 0), vgetq_lane_s64(a, 1));
+}
+template <>
+EIGEN_STRONG_INLINE uint64_t predux_max<Packet2ul>(const Packet2ul& a) {
+  return (std::max)(vgetq_lane_u64(a, 0), vgetq_lane_u64(a, 1));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4f& x) {
+  uint32x2_t tmp = vorr_u32(vget_low_u32(vreinterpretq_u32_f32(x)), vget_high_u32(vreinterpretq_u32_f32(x)));
+  return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
+}
+
+// Helpers for ptranspose.
+namespace detail {
+
+template <typename Packet>
+void zip_in_place(Packet& p1, Packet& p2);
+
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet2f>(Packet2f& p1, Packet2f& p2) {
+  const float32x2x2_t tmp = vzip_f32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4f>(Packet4f& p1, Packet4f& p2) {
+  const float32x4x2_t tmp = vzipq_f32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet8c>(Packet8c& p1, Packet8c& p2) {
+  const int8x8x2_t tmp = vzip_s8(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet16c>(Packet16c& p1, Packet16c& p2) {
+  const int8x16x2_t tmp = vzipq_s8(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet8uc>(Packet8uc& p1, Packet8uc& p2) {
+  const uint8x8x2_t tmp = vzip_u8(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet16uc>(Packet16uc& p1, Packet16uc& p2) {
+  const uint8x16x2_t tmp = vzipq_u8(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet2i>(Packet2i& p1, Packet2i& p2) {
+  const int32x2x2_t tmp = vzip_s32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4i>(Packet4i& p1, Packet4i& p2) {
+  const int32x4x2_t tmp = vzipq_s32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet2ui>(Packet2ui& p1, Packet2ui& p2) {
+  const uint32x2x2_t tmp = vzip_u32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4ui>(Packet4ui& p1, Packet4ui& p2) {
+  const uint32x4x2_t tmp = vzipq_u32(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4s>(Packet4s& p1, Packet4s& p2) {
+  const int16x4x2_t tmp = vzip_s16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet8s>(Packet8s& p1, Packet8s& p2) {
+  const int16x8x2_t tmp = vzipq_s16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4us>(Packet4us& p1, Packet4us& p2) {
+  const uint16x4x2_t tmp = vzip_u16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet8us>(Packet8us& p1, Packet8us& p2) {
+  const uint16x8x2_t tmp = vzipq_u16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+
+template <typename Packet>
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 2>& kernel) {
+  zip_in_place(kernel.packet[0], kernel.packet[1]);
+}
+
+template <typename Packet>
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 4>& kernel) {
+  zip_in_place(kernel.packet[0], kernel.packet[2]);
+  zip_in_place(kernel.packet[1], kernel.packet[3]);
+  zip_in_place(kernel.packet[0], kernel.packet[1]);
+  zip_in_place(kernel.packet[2], kernel.packet[3]);
+}
+
+template <typename Packet>
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 8>& kernel) {
+  zip_in_place(kernel.packet[0], kernel.packet[4]);
+  zip_in_place(kernel.packet[1], kernel.packet[5]);
+  zip_in_place(kernel.packet[2], kernel.packet[6]);
+  zip_in_place(kernel.packet[3], kernel.packet[7]);
+
+  zip_in_place(kernel.packet[0], kernel.packet[2]);
+  zip_in_place(kernel.packet[1], kernel.packet[3]);
+  zip_in_place(kernel.packet[4], kernel.packet[6]);
+  zip_in_place(kernel.packet[5], kernel.packet[7]);
+
+  zip_in_place(kernel.packet[0], kernel.packet[1]);
+  zip_in_place(kernel.packet[2], kernel.packet[3]);
+  zip_in_place(kernel.packet[4], kernel.packet[5]);
+  zip_in_place(kernel.packet[6], kernel.packet[7]);
+}
+
+template <typename Packet>
+EIGEN_ALWAYS_INLINE void ptranspose_impl(PacketBlock<Packet, 16>& kernel) {
+  EIGEN_UNROLL_LOOP
+  for (int i = 0; i < 4; ++i) {
+    const int m = (1 << i);
+    EIGEN_UNROLL_LOOP
+    for (int j = 0; j < m; ++j) {
+      const int n = (1 << (3 - i));
+      EIGEN_UNROLL_LOOP
+      for (int k = 0; k < n; ++k) {
+        const int idx = 2 * j * n + k;
+        zip_in_place(kernel.packet[idx], kernel.packet[idx + n]);
+      }
+    }
+  }
+}
+
+}  // namespace detail
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2f, 2>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4c, 4>& kernel) {
+  const int8x8_t a = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[2], vdup_n_s32(kernel.packet[0]), 1));
+  const int8x8_t b = vreinterpret_s8_s32(vset_lane_s32(kernel.packet[3], vdup_n_s32(kernel.packet[1]), 1));
+
+  const int8x8x2_t zip8 = vzip_s8(a, b);
+  const int16x4x2_t zip16 = vzip_s16(vreinterpret_s16_s8(zip8.val[0]), vreinterpret_s16_s8(zip8.val[1]));
+
+  kernel.packet[0] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 0);
+  kernel.packet[1] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[0]), 1);
+  kernel.packet[2] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 0);
+  kernel.packet[3] = vget_lane_s32(vreinterpret_s32_s16(zip16.val[1]), 1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8c, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 16>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16c, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4uc, 4>& kernel) {
+  const uint8x8_t a = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[2], vdup_n_u32(kernel.packet[0]), 1));
+  const uint8x8_t b = vreinterpret_u8_u32(vset_lane_u32(kernel.packet[3], vdup_n_u32(kernel.packet[1]), 1));
+
+  const uint8x8x2_t zip8 = vzip_u8(a, b);
+  const uint16x4x2_t zip16 = vzip_u16(vreinterpret_u16_u8(zip8.val[0]), vreinterpret_u16_u8(zip8.val[1]));
+
+  kernel.packet[0] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 0);
+  kernel.packet[1] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[0]), 1);
+  kernel.packet[2] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 0);
+  kernel.packet[3] = vget_lane_u32(vreinterpret_u32_u16(zip16.val[1]), 1);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8uc, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 16>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16uc, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4s, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8s, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4us, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 8>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8us, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2i, 2>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ui, 2>& kernel) {
+  detail::zip_in_place(kernel.packet[0], kernel.packet[1]);
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
+#if EIGEN_ARCH_ARM64
+  const int64x2_t tmp1 = vzip1q_s64(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[1] = vzip2q_s64(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[0] = tmp1;
+#else
+  const int64x1_t tmp[2][2] = {{vget_low_s64(kernel.packet[0]), vget_high_s64(kernel.packet[0])},
+                               {vget_low_s64(kernel.packet[1]), vget_high_s64(kernel.packet[1])}};
+
+  kernel.packet[0] = vcombine_s64(tmp[0][0], tmp[1][0]);
+  kernel.packet[1] = vcombine_s64(tmp[0][1], tmp[1][1]);
+#endif
+}
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2ul, 2>& kernel) {
+#if EIGEN_ARCH_ARM64
+  const uint64x2_t tmp1 = vzip1q_u64(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[1] = vzip2q_u64(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[0] = tmp1;
+#else
+  const uint64x1_t tmp[2][2] = {{vget_low_u64(kernel.packet[0]), vget_high_u64(kernel.packet[0])},
+                                {vget_low_u64(kernel.packet[1]), vget_high_u64(kernel.packet[1])}};
+
+  kernel.packet[0] = vcombine_u64(tmp[0][0], tmp[1][0]);
+  kernel.packet[1] = vcombine_u64(tmp[0][1], tmp[1][1]);
+#endif
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2f pselect(const Packet2f& mask, const Packet2f& a, const Packet2f& b) {
+  return vbsl_f32(vreinterpret_u32_f32(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+  return vbslq_f32(vreinterpretq_u32_f32(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8c pselect(const Packet8c& mask, const Packet8c& a, const Packet8c& b) {
+  return vbsl_s8(vreinterpret_u8_s8(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16c pselect(const Packet16c& mask, const Packet16c& a, const Packet16c& b) {
+  return vbslq_s8(vreinterpretq_u8_s8(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8uc pselect(const Packet8uc& mask, const Packet8uc& a, const Packet8uc& b) {
+  return vbsl_u8(mask, a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet16uc pselect(const Packet16uc& mask, const Packet16uc& a,
+                                                         const Packet16uc& b) {
+  return vbslq_u8(mask, a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4s pselect(const Packet4s& mask, const Packet4s& a, const Packet4s& b) {
+  return vbsl_s16(vreinterpret_u16_s16(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8s pselect(const Packet8s& mask, const Packet8s& a, const Packet8s& b) {
+  return vbslq_s16(vreinterpretq_u16_s16(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4us pselect(const Packet4us& mask, const Packet4us& a, const Packet4us& b) {
+  return vbsl_u16(mask, a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8us pselect(const Packet8us& mask, const Packet8us& a, const Packet8us& b) {
+  return vbslq_u16(mask, a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2i pselect(const Packet2i& mask, const Packet2i& a, const Packet2i& b) {
+  return vbsl_s32(vreinterpret_u32_s32(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
+  return vbslq_s32(vreinterpretq_u32_s32(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ui pselect(const Packet2ui& mask, const Packet2ui& a, const Packet2ui& b) {
+  return vbsl_u32(mask, a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
+  return vbslq_u32(mask, a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
+  return vbslq_s64(vreinterpretq_u64_s64(mask), a, b);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2ul pselect(const Packet2ul& mask, const Packet2ul& a, const Packet2ul& b) {
+  return vbslq_u64(mask, a, b);
+}
+
+// Use armv8 rounding intinsics if available.
+#if EIGEN_ARCH_ARMV8
+template <>
+EIGEN_STRONG_INLINE Packet2f print<Packet2f>(const Packet2f& a) {
+  return vrndn_f32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
+  return vrndnq_f32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pfloor<Packet2f>(const Packet2f& a) {
+  return vrndm_f32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
+  return vrndmq_f32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pceil<Packet2f>(const Packet2f& a) {
+  return vrndp_f32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
+  return vrndpq_f32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pround<Packet2f>(const Packet2f& a) {
+  return vrnda_f32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
+  return vrndaq_f32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f ptrunc<Packet2f>(const Packet2f& a) {
+  return vrnd_f32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
+  return vrndq_f32(a);
+}
+#endif
+
+/**
+ * Computes the integer square root
+ * @remarks The calculation is performed using an algorithm which iterates through each binary digit of the result
+ *   and tests whether setting that digit to 1 would cause the square of the value to be greater than the argument
+ *   value. The algorithm is described in detail here: http://ww1.microchip.com/downloads/en/AppNotes/91040a.pdf .
+ */
+template <>
+EIGEN_STRONG_INLINE Packet4uc psqrt(const Packet4uc& a) {
+  uint8x8_t x = vreinterpret_u8_u32(vdup_n_u32(a));
+  uint8x8_t res = vdup_n_u8(0);
+  uint8x8_t add = vdup_n_u8(0x8);
+  for (int i = 0; i < 4; i++) {
+    const uint8x8_t temp = vorr_u8(res, add);
+    res = vbsl_u8(vcge_u8(x, vmul_u8(temp, temp)), temp, res);
+    add = vshr_n_u8(add, 1);
+  }
+  return vget_lane_u32(vreinterpret_u32_u8(res), 0);
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template <>
+EIGEN_STRONG_INLINE Packet8uc psqrt(const Packet8uc& a) {
+  uint8x8_t res = vdup_n_u8(0);
+  uint8x8_t add = vdup_n_u8(0x8);
+  for (int i = 0; i < 4; i++) {
+    const uint8x8_t temp = vorr_u8(res, add);
+    res = vbsl_u8(vcge_u8(a, vmul_u8(temp, temp)), temp, res);
+    add = vshr_n_u8(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template <>
+EIGEN_STRONG_INLINE Packet16uc psqrt(const Packet16uc& a) {
+  uint8x16_t res = vdupq_n_u8(0);
+  uint8x16_t add = vdupq_n_u8(0x8);
+  for (int i = 0; i < 4; i++) {
+    const uint8x16_t temp = vorrq_u8(res, add);
+    res = vbslq_u8(vcgeq_u8(a, vmulq_u8(temp, temp)), temp, res);
+    add = vshrq_n_u8(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template <>
+EIGEN_STRONG_INLINE Packet4us psqrt(const Packet4us& a) {
+  uint16x4_t res = vdup_n_u16(0);
+  uint16x4_t add = vdup_n_u16(0x80);
+  for (int i = 0; i < 8; i++) {
+    const uint16x4_t temp = vorr_u16(res, add);
+    res = vbsl_u16(vcge_u16(a, vmul_u16(temp, temp)), temp, res);
+    add = vshr_n_u16(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template <>
+EIGEN_STRONG_INLINE Packet8us psqrt(const Packet8us& a) {
+  uint16x8_t res = vdupq_n_u16(0);
+  uint16x8_t add = vdupq_n_u16(0x80);
+  for (int i = 0; i < 8; i++) {
+    const uint16x8_t temp = vorrq_u16(res, add);
+    res = vbslq_u16(vcgeq_u16(a, vmulq_u16(temp, temp)), temp, res);
+    add = vshrq_n_u16(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template <>
+EIGEN_STRONG_INLINE Packet2ui psqrt(const Packet2ui& a) {
+  uint32x2_t res = vdup_n_u32(0);
+  uint32x2_t add = vdup_n_u32(0x8000);
+  for (int i = 0; i < 16; i++) {
+    const uint32x2_t temp = vorr_u32(res, add);
+    res = vbsl_u32(vcge_u32(a, vmul_u32(temp, temp)), temp, res);
+    add = vshr_n_u32(add, 1);
+  }
+  return res;
+}
+/// @copydoc Eigen::internal::psqrt(const Packet4uc& a)
+template <>
+EIGEN_STRONG_INLINE Packet4ui psqrt(const Packet4ui& a) {
+  uint32x4_t res = vdupq_n_u32(0);
+  uint32x4_t add = vdupq_n_u32(0x8000);
+  for (int i = 0; i < 16; i++) {
+    const uint32x4_t temp = vorrq_u32(res, add);
+    res = vbslq_u32(vcgeq_u32(a, vmulq_u32(temp, temp)), temp, res);
+    add = vshrq_n_u32(add, 1);
+  }
+  return res;
+}
+
+EIGEN_STRONG_INLINE Packet4f prsqrt_float_unsafe(const Packet4f& a) {
+  // Compute approximate reciprocal sqrt.
+  // Does not correctly handle +/- 0 or +inf
+  float32x4_t result = vrsqrteq_f32(a);
+  result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
+  result = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a, result), result), result);
+  return result;
+}
+
+EIGEN_STRONG_INLINE Packet2f prsqrt_float_unsafe(const Packet2f& a) {
+  // Compute approximate reciprocal sqrt.
+  // Does not correctly handle +/- 0 or +inf
+  float32x2_t result = vrsqrte_f32(a);
+  result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
+  result = vmul_f32(vrsqrts_f32(vmul_f32(a, result), result), result);
+  return result;
+}
+
+template <typename Packet>
+Packet prsqrt_float_common(const Packet& a) {
+  const Packet cst_zero = pzero(a);
+  const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
+  Packet return_zero = pcmp_eq(a, cst_inf);
+  Packet return_inf = pcmp_eq(a, cst_zero);
+  Packet result = prsqrt_float_unsafe(a);
+  result = pselect(return_inf, por(cst_inf, a), result);
+  result = pandnot(result, return_zero);
+  return result;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f prsqrt(const Packet4f& a) {
+  return prsqrt_float_common(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f prsqrt(const Packet2f& a) {
+  return prsqrt_float_common(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& a) {
+  // Compute approximate reciprocal.
+  float32x4_t result = vrecpeq_f32(a);
+  result = vmulq_f32(vrecpsq_f32(a, result), result);
+  result = vmulq_f32(vrecpsq_f32(a, result), result);
+  return result;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f preciprocal<Packet2f>(const Packet2f& a) {
+  // Compute approximate reciprocal.
+  float32x2_t result = vrecpe_f32(a);
+  result = vmul_f32(vrecps_f32(a, result), result);
+  result = vmul_f32(vrecps_f32(a, result), result);
+  return result;
+}
+
+// Unfortunately vsqrt_f32 is only available for A64.
+#if EIGEN_ARCH_ARM64
+template <>
+EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
+  return vsqrtq_f32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
+  return vsqrt_f32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv(const Packet4f& a, const Packet4f& b) {
+  return vdivq_f32(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pdiv(const Packet2f& a, const Packet2f& b) {
+  return vdiv_f32(a, b);
+}
+#else
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet psqrt_float_common(const Packet& a) {
+  const Packet cst_zero = pzero(a);
+  const Packet cst_inf = pset1<Packet>(NumTraits<float>::infinity());
+
+  Packet result = pmul(a, prsqrt_float_unsafe(a));
+  Packet a_is_zero = pcmp_eq(a, cst_zero);
+  Packet a_is_inf = pcmp_eq(a, cst_inf);
+  Packet return_a = por(a_is_zero, a_is_inf);
+
+  result = pselect(return_a, a, result);
+  return result;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f psqrt(const Packet4f& a) {
+  return psqrt_float_common(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f psqrt(const Packet2f& a) {
+  return psqrt_float_common(a);
+}
+
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pdiv_float_common(const Packet& a, const Packet& b) {
+  // if b is large, NEON intrinsics will flush preciprocal(b) to zero
+  // avoid underflow with the following manipulation:
+  // a / b = f * (a * reciprocal(f * b))
+
+  const Packet cst_one = pset1<Packet>(1.0f);
+  const Packet cst_quarter = pset1<Packet>(0.25f);
+  const Packet cst_thresh = pset1<Packet>(NumTraits<float>::highest() / 4.0f);
+
+  Packet b_will_underflow = pcmp_le(cst_thresh, pabs(b));
+  Packet f = pselect(b_will_underflow, cst_quarter, cst_one);
+  Packet result = pmul(f, pmul(a, preciprocal(pmul(b, f))));
+  return result;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pdiv_float_common(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pdiv<Packet2f>(const Packet2f& a, const Packet2f& b) {
+  return pdiv_float_common(a, b);
+}
+#endif
+
+//---------- bfloat16 ----------
+// TODO: Add support for native armv8.6-a bfloat16_t
+
+// TODO: Guard if we have native bfloat16 support
+typedef eigen_packet_wrapper<uint16x4_t, 19> Packet4bf;
+
+template <>
+struct is_arithmetic<Packet4bf> {
+  enum { value = true };
+};
+
+template <>
+struct packet_traits<bfloat16> : default_packet_traits {
+  typedef Packet4bf type;
+  typedef Packet4bf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 1,
+    HasBlend = 0,
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 0,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasBessel = 0,  // Issues with accuracy.
+    HasNdtri = 0
+  };
+};
+
+template <>
+struct unpacket_traits<Packet4bf> : neon_unpacket_default<Packet4bf, bfloat16> {};
+
+namespace detail {
+template <>
+EIGEN_ALWAYS_INLINE void zip_in_place<Packet4bf>(Packet4bf& p1, Packet4bf& p2) {
+  const uint16x4x2_t tmp = vzip_u16(p1, p2);
+  p1 = tmp.val[0];
+  p2 = tmp.val[1];
+}
+}  // namespace detail
+
+EIGEN_STRONG_INLINE Packet4bf F32ToBf16(const Packet4f& p) {
+  // See the scalar implementation in BFloat16.h for a comprehensible explanation
+  // of this fast rounding algorithm
+  Packet4ui input = Packet4ui(vreinterpretq_u32_f32(p));
+
+  // lsb = (input >> 16) & 1
+  Packet4ui lsb = vandq_u32(vshrq_n_u32(input, 16), vdupq_n_u32(1));
+
+  // rounding_bias = 0x7fff + lsb
+  Packet4ui rounding_bias = vaddq_u32(lsb, vdupq_n_u32(0x7fff));
+
+  // input += rounding_bias
+  input = vaddq_u32(input, rounding_bias);
+
+  // input = input >> 16
+  input = vshrq_n_u32(input, 16);
+
+  // Replace float-nans by bfloat16-nans, that is 0x7fc0
+  const Packet4ui bf16_nan = vdupq_n_u32(0x7fc0);
+  const Packet4ui mask = vceqq_f32(p, p);
+  input = vbslq_u32(mask, input, bf16_nan);
+
+  // output = static_cast<uint16_t>(input)
+  return vmovn_u32(input);
+}
+
+EIGEN_STRONG_INLINE Packet4f Bf16ToF32(const Packet4bf& p) {
+  return Packet4f(vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(p), 16)));
+}
+
+EIGEN_STRONG_INLINE Packet4bf F32MaskToBf16Mask(const Packet4f& p) { return vmovn_u32(vreinterpretq_u32_f32(p)); }
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pset1<Packet4bf>(const bfloat16& from) {
+  return Packet4bf(pset1<Packet4us>(from.value));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 pfirst<Packet4bf>(const Packet4bf& from) {
+  return bfloat16_impl::raw_uint16_to_bfloat16(static_cast<uint16_t>(pfirst<Packet4us>(Packet4us(from))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pload<Packet4bf>(const bfloat16* from) {
+  return Packet4bf(
+      pload<Packet4us>(reinterpret_cast<const uint16_t*>(assume_aligned<unpacket_traits<Packet4bf>::alignment>(from))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf ploadu<Packet4bf>(const bfloat16* from) {
+  return Packet4bf(ploadu<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<bfloat16>(bfloat16* to, const Packet4bf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1_u16(
+      reinterpret_cast<uint16_t*>(assume_aligned<unpacket_traits<Packet4bf>::alignment>(to)), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<bfloat16>(bfloat16* to, const Packet4bf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_u16(reinterpret_cast<uint16_t*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf ploaddup<Packet4bf>(const bfloat16* from) {
+  return Packet4bf(ploaddup<Packet4us>(reinterpret_cast<const uint16_t*>(from)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pabs(const Packet4bf& a) {
+  return F32ToBf16(pabs<Packet4f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNumbers, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(pmin<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmin<PropagateNaN, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(pmin<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmin<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(pmin<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNumbers, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(pmax<PropagateNumbers, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmax<PropagateNaN, Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(pmax<PropagateNaN, Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmax<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(pmax<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf plset<Packet4bf>(const bfloat16& a) {
+  return F32ToBf16(plset<Packet4f>(static_cast<float>(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf por(const Packet4bf& a, const Packet4bf& b) {
+  return Packet4bf(por<Packet4us>(Packet4us(a), Packet4us(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pxor(const Packet4bf& a, const Packet4bf& b) {
+  return Packet4bf(pxor<Packet4us>(Packet4us(a), Packet4us(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pand(const Packet4bf& a, const Packet4bf& b) {
+  return Packet4bf(pand<Packet4us>(Packet4us(a), Packet4us(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pandnot(const Packet4bf& a, const Packet4bf& b) {
+  return Packet4bf(pandnot<Packet4us>(Packet4us(a), Packet4us(b)));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4bf pselect(const Packet4bf& mask, const Packet4bf& a, const Packet4bf& b) {
+  return Packet4bf(pselect<Packet4us>(Packet4us(mask), Packet4us(a), Packet4us(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf print<Packet4bf>(const Packet4bf& a) {
+  return F32ToBf16(print<Packet4f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pfloor<Packet4bf>(const Packet4bf& a) {
+  return F32ToBf16(pfloor<Packet4f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pceil<Packet4bf>(const Packet4bf& a) {
+  return F32ToBf16(pceil<Packet4f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pround<Packet4bf>(const Packet4bf& a) {
+  return F32ToBf16(pround<Packet4f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf ptrunc<Packet4bf>(const Packet4bf& a) {
+  return F32ToBf16(ptrunc<Packet4f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pconj(const Packet4bf& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf padd<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(padd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf psub<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(psub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmul<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(pmul<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmadd<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
+  return F32ToBf16(pmadd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pmsub<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
+  return F32ToBf16(pmsub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pnmadd<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
+  return F32ToBf16(pnmadd<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pnmsub<Packet4bf>(const Packet4bf& a, const Packet4bf& b, const Packet4bf& c) {
+  return F32ToBf16(pnmsub<Packet4f>(Bf16ToF32(a), Bf16ToF32(b), Bf16ToF32(c)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pdiv<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(pdiv<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pgather<bfloat16, Packet4bf>(const bfloat16* from, Index stride) {
+  return Packet4bf(pgather<uint16_t, Packet4us>(reinterpret_cast<const uint16_t*>(from), stride));
+}
+
+template <>
+EIGEN_STRONG_INLINE void pscatter<bfloat16, Packet4bf>(bfloat16* to, const Packet4bf& from, Index stride) {
+  pscatter<uint16_t, Packet4us>(reinterpret_cast<uint16_t*>(to), Packet4us(from), stride);
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux<Packet4bf>(const Packet4bf& a) {
+  return static_cast<bfloat16>(predux<Packet4f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_max<Packet4bf>(const Packet4bf& a) {
+  return static_cast<bfloat16>(predux_max<Packet4f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_min<Packet4bf>(const Packet4bf& a) {
+  return static_cast<bfloat16>(predux_min<Packet4f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE bfloat16 predux_mul<Packet4bf>(const Packet4bf& a) {
+  return static_cast<bfloat16>(predux_mul<Packet4f>(Bf16ToF32(a)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf preverse<Packet4bf>(const Packet4bf& a) {
+  return Packet4bf(preverse<Packet4us>(Packet4us(a)));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4bf, 4>& kernel) {
+  detail::ptranspose_impl(kernel);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pabsdiff<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32ToBf16(pabsdiff<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pcmp_eq<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32MaskToBf16Mask(pcmp_eq<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pcmp_lt<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32MaskToBf16Mask(pcmp_lt<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pcmp_lt_or_nan<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32MaskToBf16Mask(pcmp_lt_or_nan<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pcmp_le<Packet4bf>(const Packet4bf& a, const Packet4bf& b) {
+  return F32MaskToBf16Mask(pcmp_le<Packet4f>(Bf16ToF32(a), Bf16ToF32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4bf pnegate<Packet4bf>(const Packet4bf& a) {
+  return Packet4bf(pxor<Packet4us>(Packet4us(a), pset1<Packet4us>(static_cast<uint16_t>(0x8000))));
+}
+
+//---------- double ----------
+
+// Clang 3.5 in the iOS toolchain has an ICE triggered by NEON intrinsics for double.
+// Confirmed at least with __apple_build_version__ = 6000054.
+#if EIGEN_COMP_CLANGAPPLE
+// Let's hope that by the time __apple_build_version__ hits the 601* range, the bug will be fixed.
+// https://gist.github.com/yamaya/2924292 suggests that the 3 first digits are only updated with
+// major toolchain updates.
+#define EIGEN_APPLE_DOUBLE_NEON_BUG (EIGEN_COMP_CLANGAPPLE < 6010000)
+#else
+#define EIGEN_APPLE_DOUBLE_NEON_BUG 0
+#endif
+
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+
+#if EIGEN_COMP_GNUC
+// Bug 907: workaround missing declarations of the following two functions in the ADK
+// Defining these functions as templates ensures that if these intrinsics are
+// already defined in arm_neon.h, then our workaround doesn't cause a conflict
+// and has lower priority in overload resolution.
+// This doesn't work with MSVC though, since the function names are macros.
+template <typename T>
+uint64x2_t vreinterpretq_u64_f64(T a) {
+  return (uint64x2_t)a;
+}
+
+template <typename T>
+float64x2_t vreinterpretq_f64_u64(T a) {
+  return (float64x2_t)a;
+}
+#endif
+
+#if EIGEN_COMP_MSVC_STRICT
+typedef eigen_packet_wrapper<float64x2_t, 18> Packet2d;
+typedef eigen_packet_wrapper<float64x1_t, 19> Packet1d;
+
+EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) {
+  double from[2] = {a, b};
+  return vld1q_f64(from);
+}
+
+#else
+typedef float64x2_t Packet2d;
+typedef float64x1_t Packet1d;
+
+EIGEN_ALWAYS_INLINE Packet2d make_packet2d(double a, double b) { return Packet2d{a, b}; }
+#endif
+
+// functionally equivalent to _mm_shuffle_pd in SSE (i.e. shuffle(m, n, mask) equals _mm_shuffle_pd(m,n,mask))
+// Currently used in LU/arch/InverseSize4.h to enable a shared implementation
+// for fast inversion of matrices of size 4.
+EIGEN_STRONG_INLINE Packet2d shuffle(const Packet2d& m, const Packet2d& n, int mask) {
+  const double* a = reinterpret_cast<const double*>(&m);
+  const double* b = reinterpret_cast<const double*>(&n);
+  Packet2d res = make_packet2d(*(a + (mask & 1)), *(b + ((mask >> 1) & 1)));
+  return res;
+}
+
+EIGEN_STRONG_INLINE Packet2d vec2d_swizzle2(const Packet2d& a, const Packet2d& b, int mask) {
+  return shuffle(a, b, mask);
+}
+EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 0); }
+EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) { return shuffle(a, b, 3); }
+#define vec2d_duplane(a, p) Packet2d(vdupq_laneq_f64(a, p))
+
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 1,
+    HasBlend = 0,
+
+    HasDiv = 1,
+
+#if EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+    HasExp = 1,
+    HasLog = 1,
+    HasPow = 1,
+    HasATan = 1,
+    HasATanh = 1,
+#endif
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasCbrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH
+  };
+};
+
+template <>
+struct unpacket_traits<Packet2d> : neon_unpacket_default<Packet2d, double> {
+  using integer_packet = Packet2l;
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pzero<Packet2d>(const Packet2d& /*a*/) {
+  return vdupq_n_f64(0.0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+  return vdupq_n_f64(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+  const double c[] = {0.0, 1.0};
+  return vaddq_f64(pset1<Packet2d>(a), vld1q_f64(c));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vaddq_f64(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vsubq_f64(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d&, const Packet2d&);
+template <>
+EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  const Packet2d mask = make_packet2d(numext::bit_cast<double>(0x8000000000000000ull), 0.0);
+  return padd(a, pxor(mask, b));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
+  return vnegq_f64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vmulq_f64(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vdivq_f64(a, b);
+}
+
+#ifdef EIGEN_VECTORIZE_FMA
+// See bug 936. See above comment about FMA for float.
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vfmaq_f64(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vfmsq_f64(c, a, b);
+}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vmlaq_f64(c, a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vmlsq_f64(c, a, b);
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return pnegate(pnmadd(a, b, c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return pnegate(pmadd(a, b, c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vminq_f64(a, b);
+}
+
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
+// systems).
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vminnmq_f64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vmaxnmq_f64(a, b);
+}
 
-#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
-#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
 #endif
 
-// FIXME NEON has 16 quad registers, but since the current register allocator
-// is so bad, it is much better to reduce it to 8
-#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 8
-#endif
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pmin<Packet2d>(a, b);
+}
 
-typedef float32x4_t Packet4f;
-typedef int32x4_t   Packet4i;
-typedef uint32x4_t  Packet4ui;
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vmaxq_f64(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pmax<Packet2d>(a, b);
+}
+
+// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
+}
 
-#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
-  const Packet4f p4f_##NAME = pset1<Packet4f>(X)
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
+}
 
-#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
-  const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1<int>(X))
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
+}
 
-#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
-  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(a), vreinterpretq_u64_f64(b)));
+}
 
-#if defined(__llvm__) && !defined(__clang__)
-  //Special treatment for Apple's llvm-gcc, its NEON packet types are unions
-  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {{X, Y}}
-  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {{X, Y, Z, W}}
-#else
-  //Default initializer for packets
-  #define EIGEN_INIT_NEON_PACKET2(X, Y)       {X, Y}
-  #define EIGEN_INIT_NEON_PACKET4(X, Y, Z, W) {X, Y, Z, W}
-#endif
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
+  return vreinterpretq_f64_u64(vcleq_f64(a, b));
+}
 
-// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function
-// which available on LLVM and GCC (at least)
-#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || defined(__GNUC__)
-  #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR);
-#elif defined __pld
-  #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR)
-#elif !defined(__aarch64__)
-  #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( "   pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" );
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
+  return vreinterpretq_f64_u64(vcltq_f64(a, b));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
+  return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_u64(vcgeq_f64(a, b))));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
+  return vreinterpretq_f64_u64(vceqq_f64(a, b));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f64(assume_aligned<unpacket_traits<Packet2d>::alignment>(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f64(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
+  return vld1q_dup_f64(from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_f64(assume_aligned<unpacket_traits<Packet2d>::alignment>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_f64(to, from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
+  Packet2d res = pset1<Packet2d>(0.0);
+  res = vld1q_lane_f64(from + 0 * stride, res, 0);
+  res = vld1q_lane_f64(from + 1 * stride, res, 1);
+  return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
+  vst1q_lane_f64(to + stride * 0, from, 0);
+  vst1q_lane_f64(to + stride * 1, from, 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+  EIGEN_ARM_PREFETCH(addr);
+}
+
+// FIXME only store the 2 first elements ?
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+  return vgetq_lane_f64(a, 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
+  return vcombine_f64(vget_high_f64(a), vget_low_f64(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
+  return vabsq_f64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
+  return vreinterpretq_f64_s64(vshrq_n_s64(vreinterpretq_s64_f64(a), 63));
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
+  return vaddvq_f64(a);
+}
+
+// Other reduction functions:
+// mul
+#if EIGEN_COMP_CLANGAPPLE
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+  return (vget_low_f64(a) * vget_high_f64(a))[0];
+}
 #else
-  // by default no explicit prefetching
-  #define EIGEN_ARM_PREFETCH(ADDR)
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+  return vget_lane_f64(vmul_f64(vget_low_f64(a), vget_high_f64(a)), 0);
+}
 #endif
 
-template<> struct packet_traits<float>  : default_packet_traits
-{
-  typedef Packet4f type;
+// min
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
+  return vminvq_f64(a);
+}
+
+// max
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
+  return vmaxvq_f64(a);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
+  const float64x2_t tmp1 = vzip1q_f64(kernel.packet[0], kernel.packet[1]);
+  const float64x2_t tmp2 = vzip2q_f64(kernel.packet[0], kernel.packet[1]);
+
+  kernel.packet[0] = tmp1;
+  kernel.packet[1] = tmp2;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
+  return vbslq_f64(vreinterpretq_u64_f64(mask), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
+  return vrndnq_f64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
+  return vrndmq_f64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
+  return vrndpq_f64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
+  return vrndaq_f64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
+  return vrndq_f64(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
+  return vreinterpretq_f64_u64(vdupq_n_u64(from));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d prsqrt(const Packet2d& a) {
+  // Do Newton iterations for 1/sqrt(x).
+  return generic_rsqrt_newton_step<Packet2d, /*Steps=*/3>::run(a, vrsqrteq_f64(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d psqrt(const Packet2d& _x) {
+  return vsqrtq_f64(_x);
+}
+
+#endif  // EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG
+
+// Do we have an fp16 types and supporting Neon intrinsics?
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+typedef float16x4_t Packet4hf;
+typedef float16x8_t Packet8hf;
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+  typedef Packet8hf type;
+  typedef Packet4hf half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size = 4,
-   
-    HasDiv  = 1,
-    // FIXME check the Has*
-    HasSin  = 0,
-    HasCos  = 0,
-    HasLog  = 0,
-    HasExp  = 0,
-    HasSqrt = 0
+    size = 8,
+
+    HasCmp = 1,
+    HasCast = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasAbsDiff = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 1,
+    HasBlend = 0,
+    HasInsert = 1,
+    HasReduxp = 1,
+    HasDiv = 1,
+    HasSin = 0,
+    HasCos = 0,
+    HasLog = 0,
+    HasExp = 0,
+    HasTanh = packet_traits<float>::HasTanh,  // tanh<half> calls tanh<float>
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasErf = EIGEN_FAST_MATH,
+    HasBessel = 0,  // Issues with accuracy.
+    HasNdtri = 0
   };
 };
-template<> struct packet_traits<int>    : default_packet_traits
-{
-  typedef Packet4i type;
-  enum {
-    Vectorizable = 1,
-    AlignedOnScalar = 1,
-    size=4
-    // FIXME check the Has*
-  };
+
+template <>
+struct unpacket_traits<Packet4hf> : neon_unpacket_default<Packet4hf, half> {};
+template <>
+struct unpacket_traits<Packet8hf> : neon_unpacket_default<Packet8hf, half> {
+  using half = Packet4hf;
 };
 
-#if EIGEN_GNUC_AT_MOST(4,4) && !defined(__llvm__)
-// workaround gcc 4.2, 4.3 and 4.4 compilatin issue
-EIGEN_STRONG_INLINE float32x4_t vld1q_f32(const float* x) { return ::vld1q_f32((const float32_t*)x); }
-EIGEN_STRONG_INLINE float32x2_t vld1_f32 (const float* x) { return ::vld1_f32 ((const float32_t*)x); }
-EIGEN_STRONG_INLINE void        vst1q_f32(float* to, float32x4_t from) { ::vst1q_f32((float32_t*)to,from); }
-EIGEN_STRONG_INLINE void        vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); }
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf predux_half_dowto4<Packet8hf>(const Packet8hf& a) {
+  return vadd_f16(vget_low_f16(a), vget_high_f16(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pset1<Packet8hf>(const Eigen::half& from) {
+  return vdupq_n_f16(from.x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pset1<Packet4hf>(const Eigen::half& from) {
+  return vdup_n_f16(from.x);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf plset<Packet8hf>(const Eigen::half& a) {
+  const float16_t f[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  Packet8hf countdown = vld1q_f16(f);
+  return vaddq_f16(pset1<Packet8hf>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf plset<Packet4hf>(const Eigen::half& a) {
+  const float16_t f[] = {0, 1, 2, 3};
+  Packet4hf countdown = vld1_f16(f);
+  return vadd_f16(pset1<Packet4hf>(a), countdown);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf padd<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vaddq_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf padd<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vadd_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf psub<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vsubq_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf psub<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vsub_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pnegate(const Packet8hf& a) {
+  return vnegq_f16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pnegate(const Packet4hf& a) {
+  return vneg_f16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pconj(const Packet8hf& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pconj(const Packet4hf& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmul<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vmulq_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmul<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vmul_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pdiv<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vdivq_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pdiv<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vdiv_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
+  return vfmaq_f16(c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
+  return vfma_f16(c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pnmadd(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
+  return vfmsq_f16(c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pnmadd(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
+  return vfms_f16(c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
+  return pnegate(pnmadd(a, b, c));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmsub(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
+  return pnegate(pnmadd(a, b, c));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pnmsub(const Packet8hf& a, const Packet8hf& b, const Packet8hf& c) {
+  return pnegate(pmadd(a, b, c));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pnmsub(const Packet4hf& a, const Packet4hf& b, const Packet4hf& c) {
+  return pnegate(pmadd(a, b, c));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmin<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vminq_f16(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmin<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vmin_f16(a, b);
+}
+
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
+// systems).
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vminnm_f16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vminnmq_f16(a, b);
+}
 #endif
 
-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmin<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return pmin<Packet4hf>(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return vdupq_n_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from)   { return vdupq_n_s32(from); }
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmin<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return pmin<Packet8hf>(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a)
-{
-  Packet4f countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
-  return vaddq_f32(pset1<Packet4f>(a), countdown);
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmax<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vmaxq_f16(a, b);
 }
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a)
-{
-  Packet4i countdown = EIGEN_INIT_NEON_PACKET4(0, 1, 2, 3);
-  return vaddq_s32(pset1<Packet4i>(a), countdown);
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmax<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vmax_f16(a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return vaddq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return vaddq_s32(a,b); }
+#ifdef __ARM_FEATURE_NUMERIC_MAXMIN
+// numeric max and min are only available if ARM_FEATURE_NUMERIC_MAXMIN is defined (which can only be the case for Armv8
+// systems).
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNumbers, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vmaxnm_f16(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNumbers, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vmaxnmq_f16(a, b);
+}
+#endif
 
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return vsubq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return vsubq_s32(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4hf pmax<PropagateNaN, Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return pmax<Packet4hf>(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return vnegq_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return vnegq_s32(a); }
+template <>
+EIGEN_STRONG_INLINE Packet8hf pmax<PropagateNaN, Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return pmax<Packet8hf>(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+#define EIGEN_MAKE_ARM_FP16_CMP_8(name)                                               \
+  template <>                                                                         \
+  EIGEN_STRONG_INLINE Packet8hf pcmp_##name(const Packet8hf& a, const Packet8hf& b) { \
+    return vreinterpretq_f16_u16(vc##name##q_f16(a, b));                              \
+  }
 
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmulq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmulq_s32(a,b); }
+#define EIGEN_MAKE_ARM_FP16_CMP_4(name)                                               \
+  template <>                                                                         \
+  EIGEN_STRONG_INLINE Packet4hf pcmp_##name(const Packet4hf& a, const Packet4hf& b) { \
+    return vreinterpret_f16_u16(vc##name##_f16(a, b));                                \
+  }
 
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  Packet4f inv, restep, div;
+EIGEN_MAKE_ARM_FP16_CMP_8(eq)
+EIGEN_MAKE_ARM_FP16_CMP_8(lt)
+EIGEN_MAKE_ARM_FP16_CMP_8(le)
 
-  // NEON does not offer a divide instruction, we have to do a reciprocal approximation
-  // However NEON in contrast to other SIMD engines (AltiVec/SSE), offers
-  // a reciprocal estimate AND a reciprocal step -which saves a few instructions
-  // vrecpeq_f32() returns an estimate to 1/b, which we will finetune with
-  // Newton-Raphson and vrecpsq_f32()
-  inv = vrecpeq_f32(b);
+EIGEN_MAKE_ARM_FP16_CMP_4(eq)
+EIGEN_MAKE_ARM_FP16_CMP_4(lt)
+EIGEN_MAKE_ARM_FP16_CMP_4(le)
 
-  // This returns a differential, by which we will have to multiply inv to get a better
-  // approximation of 1/b.
-  restep = vrecpsq_f32(b, inv);
-  inv = vmulq_f32(restep, inv);
+#undef EIGEN_MAKE_ARM_FP16_CMP_8
+#undef EIGEN_MAKE_ARM_FP16_CMP_4
 
-  // Finally, multiply a by 1/b and get the wanted result of the division.
-  div = vmulq_f32(a, inv);
+template <>
+EIGEN_STRONG_INLINE Packet8hf pcmp_lt_or_nan<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(vmvnq_u16(vcgeq_f16(a, b)));
+}
 
-  return div;
+template <>
+EIGEN_STRONG_INLINE Packet4hf pcmp_lt_or_nan<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(vmvn_u16(vcge_f16(a, b)));
 }
-template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
-{ eigen_assert(false && "packet integer division are not supported by NEON");
-  return pset1<Packet4i>(0);
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf print<Packet8hf>(const Packet8hf& a) {
+  return vrndnq_f16(a);
 }
 
-// for some weird raisons, it has to be overloaded for packet of integers
-template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vmlaq_f32(c,a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return vmlaq_s32(c,a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4hf print<Packet4hf>(const Packet4hf& a) {
+  return vrndn_f16(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vminq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vminq_s32(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet8hf pfloor<Packet8hf>(const Packet8hf& a) {
+  return vrndmq_f16(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vmaxq_f32(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vmaxq_s32(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4hf pfloor<Packet4hf>(const Packet4hf& a) {
+  return vrndm_f16(a);
+}
 
-// Logical Operations are not supported for float, so we have to reinterpret casts using NEON intrinsics
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+template <>
+EIGEN_STRONG_INLINE Packet8hf pceil<Packet8hf>(const Packet8hf& a) {
+  return vrndpq_f16(a);
 }
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vandq_s32(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+template <>
+EIGEN_STRONG_INLINE Packet4hf pceil<Packet4hf>(const Packet4hf& a) {
+  return vrndp_f16(a);
 }
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vorrq_s32(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+template <>
+EIGEN_STRONG_INLINE Packet8hf pround<Packet8hf>(const Packet8hf& a) {
+  return vrndaq_f16(a);
 }
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return veorq_s32(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b)
-{
-  return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(a),vreinterpretq_u32_f32(b)));
+template <>
+EIGEN_STRONG_INLINE Packet4hf pround<Packet4hf>(const Packet4hf& a) {
+  return vrnda_f16(a);
 }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); }
 
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*   from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); }
+template <>
+EIGEN_STRONG_INLINE Packet8hf ptrunc<Packet8hf>(const Packet8hf& a) {
+  return vrndq_f16(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); }
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)   { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); }
+template <>
+EIGEN_STRONG_INLINE Packet4hf ptrunc<Packet4hf>(const Packet4hf& a) {
+  return vrnd_f16(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
-{
-  float32x2_t lo, hi;
-  lo = vld1_dup_f32(from);
-  hi = vld1_dup_f32(from+1);
-  return vcombine_f32(lo, hi);
+template <>
+EIGEN_STRONG_INLINE Packet8hf psqrt<Packet8hf>(const Packet8hf& a) {
+  return vsqrtq_f16(a);
 }
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
-{
-  int32x2_t lo, hi;
-  lo = vld1_dup_s32(from);
-  hi = vld1_dup_s32(from+1);
-  return vcombine_s32(lo, hi);
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf psqrt<Packet4hf>(const Packet4hf& a) {
+  return vsqrt_f16(a);
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); }
+template <>
+EIGEN_STRONG_INLINE Packet8hf pand<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(vandq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
+}
 
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); }
+template <>
+EIGEN_STRONG_INLINE Packet4hf pand<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(vand_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
+}
 
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_ARM_PREFETCH(addr); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*     addr) { EIGEN_ARM_PREFETCH(addr); }
+template <>
+EIGEN_STRONG_INLINE Packet8hf por<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(vorrq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
+}
 
-// FIXME only store the 2 first elements ?
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int   EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; }
+template <>
+EIGEN_STRONG_INLINE Packet4hf por<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(vorr_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
-  float32x2_t a_lo, a_hi;
-  Packet4f a_r64;
+template <>
+EIGEN_STRONG_INLINE Packet8hf pxor<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(veorq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
+}
 
-  a_r64 = vrev64q_f32(a);
-  a_lo = vget_low_f32(a_r64);
-  a_hi = vget_high_f32(a_r64);
-  return vcombine_f32(a_hi, a_lo);
+template <>
+EIGEN_STRONG_INLINE Packet4hf pxor<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(veor_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
 }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
-  int32x2_t a_lo, a_hi;
-  Packet4i a_r64;
 
-  a_r64 = vrev64q_s32(a);
-  a_lo = vget_low_s32(a_r64);
-  a_hi = vget_high_s32(a_r64);
-  return vcombine_s32(a_hi, a_lo);
+template <>
+EIGEN_STRONG_INLINE Packet8hf pandnot<Packet8hf>(const Packet8hf& a, const Packet8hf& b) {
+  return vreinterpretq_f16_u16(vbicq_u16(vreinterpretq_u16_f16(a), vreinterpretq_u16_f16(b)));
 }
-template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vabsq_f32(a); }
-template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vabsq_s32(a); }
 
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
-  float32x2_t a_lo, a_hi, sum;
+template <>
+EIGEN_STRONG_INLINE Packet4hf pandnot<Packet4hf>(const Packet4hf& a, const Packet4hf& b) {
+  return vreinterpret_f16_u16(vbic_u16(vreinterpret_u16_f16(a), vreinterpret_u16_f16(b)));
+}
 
-  a_lo = vget_low_f32(a);
-  a_hi = vget_high_f32(a);
-  sum = vpadd_f32(a_lo, a_hi);
-  sum = vpadd_f32(sum, sum);
-  return vget_lane_f32(sum, 0);
+template <>
+EIGEN_STRONG_INLINE Packet8hf pload<Packet8hf>(const Eigen::half* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f16(
+      reinterpret_cast<const float16_t*>(assume_aligned<unpacket_traits<Packet8hf>::alignment>(from)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
-  float32x4x2_t vtrn1, vtrn2, res1, res2;
-  Packet4f sum1, sum2, sum;
+template <>
+EIGEN_STRONG_INLINE Packet4hf pload<Packet4hf>(const Eigen::half* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return vld1_f16(
+      reinterpret_cast<const float16_t*>(assume_aligned<unpacket_traits<Packet4hf>::alignment>(from)));
+}
 
-  // NEON zip performs interleaving of the supplied vectors.
-  // We perform two interleaves in a row to acquire the transposed vector
-  vtrn1 = vzipq_f32(vecs[0], vecs[2]);
-  vtrn2 = vzipq_f32(vecs[1], vecs[3]);
-  res1 = vzipq_f32(vtrn1.val[0], vtrn2.val[0]);
-  res2 = vzipq_f32(vtrn1.val[1], vtrn2.val[1]);
+template <>
+EIGEN_STRONG_INLINE Packet8hf ploadu<Packet8hf>(const Eigen::half* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f16(reinterpret_cast<const float16_t*>(from));
+}
 
-  // Do the addition of the resulting vectors
-  sum1 = vaddq_f32(res1.val[0], res1.val[1]);
-  sum2 = vaddq_f32(res2.val[0], res2.val[1]);
-  sum = vaddq_f32(sum1, sum2);
+template <>
+EIGEN_STRONG_INLINE Packet4hf ploadu<Packet4hf>(const Eigen::half* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return vld1_f16(reinterpret_cast<const float16_t*>(from));
+}
 
-  return sum;
+template <>
+EIGEN_STRONG_INLINE Packet8hf ploaddup<Packet8hf>(const Eigen::half* from) {
+  Packet8hf packet;
+  packet[0] = from[0].x;
+  packet[1] = from[0].x;
+  packet[2] = from[1].x;
+  packet[3] = from[1].x;
+  packet[4] = from[2].x;
+  packet[5] = from[2].x;
+  packet[6] = from[3].x;
+  packet[7] = from[3].x;
+  return packet;
 }
 
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
-{
-  int32x2_t a_lo, a_hi, sum;
+template <>
+EIGEN_STRONG_INLINE Packet4hf ploaddup<Packet4hf>(const Eigen::half* from) {
+  float16x4_t packet;
+  float16_t* tmp;
+  tmp = (float16_t*)&packet;
+  tmp[0] = from[0].x;
+  tmp[1] = from[0].x;
+  tmp[2] = from[1].x;
+  tmp[3] = from[1].x;
+  return packet;
+}
 
-  a_lo = vget_low_s32(a);
-  a_hi = vget_high_s32(a);
-  sum = vpadd_s32(a_lo, a_hi);
-  sum = vpadd_s32(sum, sum);
-  return vget_lane_s32(sum, 0);
+template <>
+EIGEN_STRONG_INLINE Packet8hf ploadquad<Packet8hf>(const Eigen::half* from) {
+  Packet4hf lo, hi;
+  lo = vld1_dup_f16(reinterpret_cast<const float16_t*>(from));
+  hi = vld1_dup_f16(reinterpret_cast<const float16_t*>(from + 1));
+  return vcombine_f16(lo, hi);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
-{
-  int32x4x2_t vtrn1, vtrn2, res1, res2;
-  Packet4i sum1, sum2, sum;
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertfirst(const Packet8hf& a, Eigen::half b) {
+  return vsetq_lane_f16(b.x, a, 0);
+}
 
-  // NEON zip performs interleaving of the supplied vectors.
-  // We perform two interleaves in a row to acquire the transposed vector
-  vtrn1 = vzipq_s32(vecs[0], vecs[2]);
-  vtrn2 = vzipq_s32(vecs[1], vecs[3]);
-  res1 = vzipq_s32(vtrn1.val[0], vtrn2.val[0]);
-  res2 = vzipq_s32(vtrn1.val[1], vtrn2.val[1]);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertfirst(const Packet4hf& a, Eigen::half b) {
+  return vset_lane_f16(b.x, a, 0);
+}
 
-  // Do the addition of the resulting vectors
-  sum1 = vaddq_s32(res1.val[0], res1.val[1]);
-  sum2 = vaddq_s32(res2.val[0], res2.val[1]);
-  sum = vaddq_s32(sum1, sum2);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pselect(const Packet8hf& mask, const Packet8hf& a, const Packet8hf& b) {
+  return vbslq_f16(vreinterpretq_u16_f16(mask), a, b);
+}
 
-  return sum;
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pselect(const Packet4hf& mask, const Packet4hf& a, const Packet4hf& b) {
+  return vbsl_f16(vreinterpret_u16_f16(mask), a, b);
 }
 
-// Other reduction functions:
-// mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
-  float32x2_t a_lo, a_hi, prod;
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pinsertlast(const Packet8hf& a, Eigen::half b) {
+  return vsetq_lane_f16(b.x, a, 7);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pinsertlast(const Packet4hf& a, Eigen::half b) {
+  return vset_lane_f16(b.x, a, 3);
+}
 
-  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
-  a_lo = vget_low_f32(a);
-  a_hi = vget_high_f32(a);
-  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
-  prod = vmul_f32(a_lo, a_hi);
-  // Multiply prod with its swapped value |a2*a4|a1*a3|
-  prod = vmul_f32(prod, vrev64_f32(prod));
+template <>
+EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1q_f16(
+      reinterpret_cast<float16_t*>(assume_aligned<unpacket_traits<Packet8hf>::alignment>(to)), from);
+}
 
-  return vget_lane_f32(prod, 0);
+template <>
+EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE vst1_f16(
+      reinterpret_cast<float16_t*>(assume_aligned<unpacket_traits<Packet4hf>::alignment>(to)), from);
 }
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
-{
-  int32x2_t a_lo, a_hi, prod;
 
-  // Get a_lo = |a1|a2| and a_hi = |a3|a4|
-  a_lo = vget_low_s32(a);
-  a_hi = vget_high_s32(a);
-  // Get the product of a_lo * a_hi -> |a1*a3|a2*a4|
-  prod = vmul_s32(a_lo, a_hi);
-  // Multiply prod with its swapped value |a2*a4|a1*a3|
-  prod = vmul_s32(prod, vrev64_s32(prod));
+template <>
+EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8hf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1q_f16(reinterpret_cast<float16_t*>(to), from);
+}
 
-  return vget_lane_s32(prod, 0);
+template <>
+EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4hf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE vst1_f16(reinterpret_cast<float16_t*>(to), from);
 }
 
-// min
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
-  float32x2_t a_lo, a_hi, min;
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf pgather<Eigen::half, Packet8hf>(const Eigen::half* from, Index stride) {
+  Packet8hf res = pset1<Packet8hf>(Eigen::half(0.f));
+  res = vsetq_lane_f16(from[0 * stride].x, res, 0);
+  res = vsetq_lane_f16(from[1 * stride].x, res, 1);
+  res = vsetq_lane_f16(from[2 * stride].x, res, 2);
+  res = vsetq_lane_f16(from[3 * stride].x, res, 3);
+  res = vsetq_lane_f16(from[4 * stride].x, res, 4);
+  res = vsetq_lane_f16(from[5 * stride].x, res, 5);
+  res = vsetq_lane_f16(from[6 * stride].x, res, 6);
+  res = vsetq_lane_f16(from[7 * stride].x, res, 7);
+  return res;
+}
 
-  a_lo = vget_low_f32(a);
-  a_hi = vget_high_f32(a);
-  min = vpmin_f32(a_lo, a_hi);
-  min = vpmin_f32(min, min);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf pgather<Eigen::half, Packet4hf>(const Eigen::half* from, Index stride) {
+  Packet4hf res = pset1<Packet4hf>(Eigen::half(0.f));
+  res = vset_lane_f16(from[0 * stride].x, res, 0);
+  res = vset_lane_f16(from[1 * stride].x, res, 1);
+  res = vset_lane_f16(from[2 * stride].x, res, 2);
+  res = vset_lane_f16(from[3 * stride].x, res, 3);
+  return res;
+}
 
-  return vget_lane_f32(min, 0);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8hf>(Eigen::half* to, const Packet8hf& from,
+                                                                            Index stride) {
+  to[stride * 0].x = vgetq_lane_f16(from, 0);
+  to[stride * 1].x = vgetq_lane_f16(from, 1);
+  to[stride * 2].x = vgetq_lane_f16(from, 2);
+  to[stride * 3].x = vgetq_lane_f16(from, 3);
+  to[stride * 4].x = vgetq_lane_f16(from, 4);
+  to[stride * 5].x = vgetq_lane_f16(from, 5);
+  to[stride * 6].x = vgetq_lane_f16(from, 6);
+  to[stride * 7].x = vgetq_lane_f16(from, 7);
 }
 
-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
-{
-  int32x2_t a_lo, a_hi, min;
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4hf>(Eigen::half* to, const Packet4hf& from,
+                                                                            Index stride) {
+  to[stride * 0].x = vget_lane_f16(from, 0);
+  to[stride * 1].x = vget_lane_f16(from, 1);
+  to[stride * 2].x = vget_lane_f16(from, 2);
+  to[stride * 3].x = vget_lane_f16(from, 3);
+}
 
-  a_lo = vget_low_s32(a);
-  a_hi = vget_high_s32(a);
-  min = vpmin_s32(a_lo, a_hi);
-  min = vpmin_s32(min, min);
-  
-  return vget_lane_s32(min, 0);
+template <>
+EIGEN_STRONG_INLINE void prefetch<Eigen::half>(const Eigen::half* addr) {
+  EIGEN_ARM_PREFETCH(addr);
 }
 
-// max
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
-  float32x2_t a_lo, a_hi, max;
-
-  a_lo = vget_low_f32(a);
-  a_hi = vget_high_f32(a);
-  max = vpmax_f32(a_lo, a_hi);
-  max = vpmax_f32(max, max);
-
-  return vget_lane_f32(max, 0);
-}
-
-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
-{
-  int32x2_t a_lo, a_hi, max;
-
-  a_lo = vget_low_s32(a);
-  a_hi = vget_high_s32(a);
-  max = vpmax_s32(a_lo, a_hi);
-  max = vpmax_s32(max, max);
-
-  return vget_lane_s32(max, 0);
-}
-
-// this PALIGN_NEON business is to work around a bug in LLVM Clang 3.0 causing incorrect compilation errors,
-// see bug 347 and this LLVM bug: http://llvm.org/bugs/show_bug.cgi?id=11074
-#define PALIGN_NEON(Offset,Type,Command) \
-template<>\
-struct palign_impl<Offset,Type>\
-{\
-    EIGEN_STRONG_INLINE static void run(Type& first, const Type& second)\
-    {\
-        if (Offset!=0)\
-            first = Command(first, second, Offset);\
-    }\
-};\
-
-PALIGN_NEON(0,Packet4f,vextq_f32)
-PALIGN_NEON(1,Packet4f,vextq_f32)
-PALIGN_NEON(2,Packet4f,vextq_f32)
-PALIGN_NEON(3,Packet4f,vextq_f32)
-PALIGN_NEON(0,Packet4i,vextq_s32)
-PALIGN_NEON(1,Packet4i,vextq_s32)
-PALIGN_NEON(2,Packet4i,vextq_s32)
-PALIGN_NEON(3,Packet4i,vextq_s32)
-    
-#undef PALIGN_NEON
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_PACKET_MATH_NEON_H
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8hf>(const Packet8hf& a) {
+  float16_t x[8];
+  vst1q_f16(x, a);
+  Eigen::half h;
+  h.x = x[0];
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4hf>(const Packet4hf& a) {
+  float16_t x[4];
+  vst1_f16(x, a);
+  Eigen::half h;
+  h.x = x[0];
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf preverse(const Packet8hf& a) {
+  float16x4_t a_lo, a_hi;
+  Packet8hf a_r64;
+
+  a_r64 = vrev64q_f16(a);
+  a_lo = vget_low_f16(a_r64);
+  a_hi = vget_high_f16(a_r64);
+  return vcombine_f16(a_hi, a_lo);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf preverse<Packet4hf>(const Packet4hf& a) {
+  return vrev64_f16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf pabs<Packet8hf>(const Packet8hf& a) {
+  return vabsq_f16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8hf psignbit(const Packet8hf& a) {
+  return vreinterpretq_f16_s16(vshrq_n_s16(vreinterpretq_s16_f16(a), 15));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf pabs<Packet4hf>(const Packet4hf& a) {
+  return vabs_f16(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4hf psignbit(const Packet4hf& a) {
+  return vreinterpret_f16_s16(vshr_n_s16(vreinterpret_s16_f16(a), 15));
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux<Packet8hf>(const Packet8hf& a) {
+  float16x4_t a_lo, a_hi, sum;
+
+  a_lo = vget_low_f16(a);
+  a_hi = vget_high_f16(a);
+  sum = vpadd_f16(a_lo, a_hi);
+  sum = vpadd_f16(sum, sum);
+  sum = vpadd_f16(sum, sum);
+
+  Eigen::half h;
+  h.x = vget_lane_f16(sum, 0);
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux<Packet4hf>(const Packet4hf& a) {
+  float16x4_t sum;
+
+  sum = vpadd_f16(a, a);
+  sum = vpadd_f16(sum, sum);
+  Eigen::half h;
+  h.x = vget_lane_f16(sum, 0);
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8hf>(const Packet8hf& a) {
+  float16x4_t a_lo, a_hi, prod;
+
+  a_lo = vget_low_f16(a);
+  a_hi = vget_high_f16(a);
+  prod = vmul_f16(a_lo, a_hi);
+  prod = vmul_f16(prod, vrev64_f16(prod));
+
+  Eigen::half h;
+  h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4hf>(const Packet4hf& a) {
+  float16x4_t prod;
+  prod = vmul_f16(a, vrev64_f16(a));
+  Eigen::half h;
+  h.x = vmulh_f16(vget_lane_f16(prod, 0), vget_lane_f16(prod, 1));
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8hf>(const Packet8hf& a) {
+  Eigen::half h;
+  h.x = vminvq_f16(a);
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4hf>(const Packet4hf& a) {
+  Eigen::half h;
+  h.x = vminv_f16(a);
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8hf>(const Packet8hf& a) {
+  Eigen::half h;
+  h.x = vmaxvq_f16(a);
+  return h;
+}
+
+template <>
+EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4hf>(const Packet4hf& a) {
+  Eigen::half h;
+  h.x = vmaxv_f16(a);
+  return h;
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 4>& kernel) {
+  const float16x8x2_t zip16_1 = vzipq_f16(kernel.packet[0], kernel.packet[1]);
+  const float16x8x2_t zip16_2 = vzipq_f16(kernel.packet[2], kernel.packet[3]);
+
+  const float32x4x2_t zip32_1 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[0]), vreinterpretq_f32_f16(zip16_2.val[0]));
+  const float32x4x2_t zip32_2 = vzipq_f32(vreinterpretq_f32_f16(zip16_1.val[1]), vreinterpretq_f32_f16(zip16_2.val[1]));
+
+  kernel.packet[0] = vreinterpretq_f16_f32(zip32_1.val[0]);
+  kernel.packet[1] = vreinterpretq_f16_f32(zip32_1.val[1]);
+  kernel.packet[2] = vreinterpretq_f16_f32(zip32_2.val[0]);
+  kernel.packet[3] = vreinterpretq_f16_f32(zip32_2.val[1]);
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4hf, 4>& kernel) {
+  EIGEN_ALIGN16 float16x4x4_t tmp_x4;
+  float16_t* tmp = (float16_t*)&kernel;
+  tmp_x4 = vld4_f16(tmp);
+
+  kernel.packet[0] = tmp_x4.val[0];
+  kernel.packet[1] = tmp_x4.val[1];
+  kernel.packet[2] = tmp_x4.val[2];
+  kernel.packet[3] = tmp_x4.val[3];
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet8hf, 8>& kernel) {
+  float16x8x2_t T_1[4];
+
+  T_1[0] = vuzpq_f16(kernel.packet[0], kernel.packet[1]);
+  T_1[1] = vuzpq_f16(kernel.packet[2], kernel.packet[3]);
+  T_1[2] = vuzpq_f16(kernel.packet[4], kernel.packet[5]);
+  T_1[3] = vuzpq_f16(kernel.packet[6], kernel.packet[7]);
+
+  float16x8x2_t T_2[4];
+  T_2[0] = vuzpq_f16(T_1[0].val[0], T_1[1].val[0]);
+  T_2[1] = vuzpq_f16(T_1[0].val[1], T_1[1].val[1]);
+  T_2[2] = vuzpq_f16(T_1[2].val[0], T_1[3].val[0]);
+  T_2[3] = vuzpq_f16(T_1[2].val[1], T_1[3].val[1]);
+
+  float16x8x2_t T_3[4];
+  T_3[0] = vuzpq_f16(T_2[0].val[0], T_2[2].val[0]);
+  T_3[1] = vuzpq_f16(T_2[0].val[1], T_2[2].val[1]);
+  T_3[2] = vuzpq_f16(T_2[1].val[0], T_2[3].val[0]);
+  T_3[3] = vuzpq_f16(T_2[1].val[1], T_2[3].val[1]);
+
+  kernel.packet[0] = T_3[0].val[0];
+  kernel.packet[1] = T_3[2].val[0];
+  kernel.packet[2] = T_3[1].val[0];
+  kernel.packet[3] = T_3[3].val[0];
+  kernel.packet[4] = T_3[0].val[1];
+  kernel.packet[5] = T_3[2].val[1];
+  kernel.packet[6] = T_3[1].val[1];
+  kernel.packet[7] = T_3[3].val[1];
+}
+#endif  // end EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_NEON_H
diff --git a/inst/include/Eigen/src/Core/arch/NEON/TypeCasting.h b/inst/include/Eigen/src/Core/arch/NEON/TypeCasting.h
new file mode 100644
index 00000000..58d7b8cc
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/NEON/TypeCasting.h
@@ -0,0 +1,1642 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
+// Copyright (C) 2020 Antonio Sanchez <cantonios@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_NEON_H
+#define EIGEN_TYPE_CASTING_NEON_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+//==============================================================================
+// preinterpret (truncation operations)
+//==============================================================================
+
+template <>
+EIGEN_STRONG_INLINE Packet8c preinterpret<Packet8c, Packet16c>(const Packet16c& a) {
+  return Packet8c(vget_low_s8(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c preinterpret<Packet4c, Packet8c>(const Packet8c& a) {
+  return Packet4c(vget_lane_s32(vreinterpret_s32_s8(a), 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c preinterpret<Packet4c, Packet16c>(const Packet16c& a) {
+  return preinterpret<Packet4c>(preinterpret<Packet8c>(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet8uc preinterpret<Packet8uc, Packet16uc>(const Packet16uc& a) {
+  return Packet8uc(vget_low_u8(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc preinterpret<Packet4uc, Packet8uc>(const Packet8uc& a) {
+  return Packet4uc(vget_lane_u32(vreinterpret_u32_u8(a), 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc preinterpret<Packet4uc, Packet16uc>(const Packet16uc& a) {
+  return preinterpret<Packet4uc>(preinterpret<Packet8uc>(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4s preinterpret<Packet4s, Packet8s>(const Packet8s& a) {
+  return Packet4s(vget_low_s16(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4us preinterpret<Packet4us, Packet8us>(const Packet8us& a) {
+  return Packet4us(vget_low_u16(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2i preinterpret<Packet2i, Packet4i>(const Packet4i& a) {
+  return Packet2i(vget_low_s32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui preinterpret<Packet2ui, Packet4ui>(const Packet4ui& a) {
+  return Packet2ui(vget_low_u32(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f preinterpret<Packet2f, Packet4f>(const Packet4f& a) {
+  return Packet2f(vget_low_f32(a));
+}
+
+//==============================================================================
+// preinterpret
+//==============================================================================
+template <>
+EIGEN_STRONG_INLINE Packet2f preinterpret<Packet2f, Packet2i>(const Packet2i& a) {
+  return Packet2f(vreinterpret_f32_s32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f preinterpret<Packet2f, Packet2ui>(const Packet2ui& a) {
+  return Packet2f(vreinterpret_f32_u32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a) {
+  return Packet4f(vreinterpretq_f32_s32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4ui>(const Packet4ui& a) {
+  return Packet4f(vreinterpretq_f32_u32(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4c preinterpret<Packet4c, Packet4uc>(const Packet4uc& a) {
+  return static_cast<Packet4c>(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c preinterpret<Packet8c, Packet8uc>(const Packet8uc& a) {
+  return Packet8c(vreinterpret_s8_u8(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16c preinterpret<Packet16c, Packet16uc>(const Packet16uc& a) {
+  return Packet16c(vreinterpretq_s8_u8(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4uc preinterpret<Packet4uc, Packet4c>(const Packet4c& a) {
+  return static_cast<Packet4uc>(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc preinterpret<Packet8uc, Packet8c>(const Packet8c& a) {
+  return Packet8uc(vreinterpret_u8_s8(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16uc preinterpret<Packet16uc, Packet16c>(const Packet16c& a) {
+  return Packet16uc(vreinterpretq_u8_s8(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4s preinterpret<Packet4s, Packet4us>(const Packet4us& a) {
+  return Packet4s(vreinterpret_s16_u16(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s preinterpret<Packet8s, Packet8us>(const Packet8us& a) {
+  return Packet8s(vreinterpretq_s16_u16(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us preinterpret<Packet4us, Packet4s>(const Packet4s& a) {
+  return Packet4us(vreinterpret_u16_s16(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us preinterpret<Packet8us, Packet8s>(const Packet8s& a) {
+  return Packet8us(vreinterpretq_u16_s16(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2i preinterpret<Packet2i, Packet2f>(const Packet2f& a) {
+  return Packet2i(vreinterpret_s32_f32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i preinterpret<Packet2i, Packet2ui>(const Packet2ui& a) {
+  return Packet2i(vreinterpret_s32_u32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4f>(const Packet4f& a) {
+  return Packet4i(vreinterpretq_s32_f32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
+  return Packet4i(vreinterpretq_s32_u32(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2ui preinterpret<Packet2ui, Packet2f>(const Packet2f& a) {
+  return Packet2ui(vreinterpret_u32_f32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui preinterpret<Packet2ui, Packet2i>(const Packet2i& a) {
+  return Packet2ui(vreinterpret_u32_s32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4f>(const Packet4f& a) {
+  return Packet4ui(vreinterpretq_u32_f32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4i>(const Packet4i& a) {
+  return Packet4ui(vreinterpretq_u32_s32(a));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2ul>(const Packet2ul& a) {
+  return Packet2l(vreinterpretq_s64_u64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2l>(const Packet2l& a) {
+  return Packet2ul(vreinterpretq_u64_s64(a));
+}
+
+//==============================================================================
+// pcast, SrcType = float
+//==============================================================================
+
+template <>
+struct type_casting_traits<float, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+struct type_casting_traits<float, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+// If float64 exists, first convert to that to keep as much precision as possible.
+#if EIGEN_ARCH_ARM64
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet4f, Packet2l>(const Packet4f& a) {
+  // Discard second half of input.
+  return vcvtq_s64_f64(vcvt_f64_f32(vget_low_f32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet2f, Packet2l>(const Packet2f& a) {
+  return vcvtq_s64_f64(vcvt_f64_f32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet4f, Packet2ul>(const Packet4f& a) {
+  // Discard second half of input.
+  return vcvtq_u64_f64(vcvt_f64_f32(vget_low_f32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet2f, Packet2ul>(const Packet2f& a) {
+  return vcvtq_u64_f64(vcvt_f64_f32(a));
+}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet4f, Packet2l>(const Packet4f& a) {
+  // Discard second half of input.
+  return vmovl_s32(vget_low_s32(vcvtq_s32_f32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet2f, Packet2l>(const Packet2f& a) {
+  return vmovl_s32(vcvt_s32_f32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet4f, Packet2ul>(const Packet4f& a) {
+  // Discard second half of input.
+  return vmovl_u32(vget_low_u32(vcvtq_u32_f32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet2f, Packet2ul>(const Packet2f& a) {
+  // Discard second half of input.
+  return vmovl_u32(vcvt_u32_f32(a));
+}
+#endif  // EIGEN_ARCH_ARM64
+
+template <>
+struct type_casting_traits<float, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+  return vcvtq_s32_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet2f, Packet2i>(const Packet2f& a) {
+  return vcvt_s32_f32(a);
+}
+
+template <>
+struct type_casting_traits<float, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet4f, Packet4ui>(const Packet4f& a) {
+  return vcvtq_u32_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet2f, Packet2ui>(const Packet2f& a) {
+  return vcvt_u32_f32(a);
+}
+
+template <>
+struct type_casting_traits<float, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet4f, Packet8s>(const Packet4f& a, const Packet4f& b) {
+  return vcombine_s16(vmovn_s32(vcvtq_s32_f32(a)), vmovn_s32(vcvtq_s32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet4f, Packet4s>(const Packet4f& a) {
+  return vmovn_s32(vcvtq_s32_f32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet2f, Packet4s>(const Packet2f& a, const Packet2f& b) {
+  return vmovn_s32(vcombine_s32(vcvt_s32_f32(a), vcvt_s32_f32(b)));
+}
+
+template <>
+struct type_casting_traits<float, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet4f, Packet8us>(const Packet4f& a, const Packet4f& b) {
+  return vcombine_u16(vmovn_u32(vcvtq_u32_f32(a)), vmovn_u32(vcvtq_u32_f32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet4f, Packet4us>(const Packet4f& a) {
+  return vmovn_u32(vcvtq_u32_f32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet2f, Packet4us>(const Packet2f& a, const Packet2f& b) {
+  return vmovn_u32(vcombine_u32(vcvt_u32_f32(a), vcvt_u32_f32(b)));
+}
+
+template <>
+struct type_casting_traits<float, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet4f, Packet16c>(const Packet4f& a, const Packet4f& b, const Packet4f& c,
+                                                         const Packet4f& d) {
+  const int16x8_t ab_s16 = pcast<Packet4f, Packet8s>(a, b);
+  const int16x8_t cd_s16 = pcast<Packet4f, Packet8s>(c, d);
+  return vcombine_s8(vmovn_s16(ab_s16), vmovn_s16(cd_s16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet4f, Packet8c>(const Packet4f& a, const Packet4f& b) {
+  const int16x8_t ab_s16 = pcast<Packet4f, Packet8s>(a, b);
+  return vmovn_s16(ab_s16);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet2f, Packet8c>(const Packet2f& a, const Packet2f& b, const Packet2f& c,
+                                                       const Packet2f& d) {
+  const int16x4_t ab_s16 = pcast<Packet2f, Packet4s>(a, b);
+  const int16x4_t cd_s16 = pcast<Packet2f, Packet4s>(c, d);
+  return vmovn_s16(vcombine_s16(ab_s16, cd_s16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pcast<Packet4f, Packet4c>(const Packet4f& a) {
+  const int32x4_t a_s32x4 = vcvtq_s32_f32(a);
+  const int16x4_t a_s16x4 = vmovn_s32(a_s32x4);
+  const int16x8_t aa_s16x8 = vcombine_s16(a_s16x4, a_s16x4);
+  const int8x8_t aa_s8x8 = vmovn_s16(aa_s16x8);
+  return vget_lane_s32(vreinterpret_s32_s8(aa_s8x8), 0);
+}
+
+template <>
+struct type_casting_traits<float, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet4f, Packet16uc>(const Packet4f& a, const Packet4f& b, const Packet4f& c,
+                                                           const Packet4f& d) {
+  return preinterpret<Packet16uc>(pcast<Packet4f, Packet16c>(a, b, c, d));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet4f, Packet8uc>(const Packet4f& a, const Packet4f& b) {
+  return preinterpret<Packet8uc>(pcast<Packet4f, Packet8c>(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet2f, Packet8uc>(const Packet2f& a, const Packet2f& b, const Packet2f& c,
+                                                         const Packet2f& d) {
+  return preinterpret<Packet8uc>(pcast<Packet2f, Packet8c>(a, b, c, d));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcast<Packet4f, Packet4uc>(const Packet4f& a) {
+  return static_cast<Packet4uc>(pcast<Packet4f, Packet4c>(a));
+}
+
+//==============================================================================
+// pcast, SrcType = int8_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::int8_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet16c, Packet4f>(const Packet16c& a) {
+  // Discard all but first 4 bytes.
+  return vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a)))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4c, Packet4f>(const Packet4c& a) {
+  return vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s32(vdup_n_s32(a))))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet8c, Packet2f>(const Packet8c& a) {
+  // Discard all but first 2 bytes.
+  return vcvt_f32_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(a)))));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet16c, Packet2l>(const Packet16c& a) {
+  // Discard all but first two bytes.
+  return vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a))))));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet16c, Packet2ul>(const Packet16c& a) {
+  return preinterpret<Packet2ul>(pcast<Packet16c, Packet2l>(a));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet16c, Packet4i>(const Packet16c& a) {
+  // Discard all but first 4 bytes.
+  return vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(a))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet8c, Packet4i>(const Packet8c& a) {
+  return vmovl_s16(vget_low_s16(vmovl_s8(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4c, Packet4i>(const Packet4c& a) {
+  return pcast<Packet8c, Packet4i>(vreinterpret_s8_s32(vdup_n_s32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet8c, Packet2i>(const Packet8c& a) {
+  // Discard all but first 2 bytes.
+  return vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(a))));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet16c, Packet4ui>(const Packet16c& a) {
+  return preinterpret<Packet4ui>(pcast<Packet16c, Packet4i>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet8c, Packet2ui>(const Packet8c& a) {
+  return preinterpret<Packet2ui>(pcast<Packet8c, Packet2i>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet4c, Packet4ui>(const Packet4c& a) {
+  return preinterpret<Packet4ui>(pcast<Packet4c, Packet4i>(a));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet16c, Packet8s>(const Packet16c& a) {
+  // Discard second half of input.
+  return vmovl_s8(vget_low_s8(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet8c, Packet8s>(const Packet8c& a) {
+  return vmovl_s8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet8c, Packet4s>(const Packet8c& a) {
+  // Discard second half of input.
+  return vget_low_s16(vmovl_s8(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet4c, Packet4s>(const Packet4c& a) {
+  return pcast<Packet8c, Packet4s>(vreinterpret_s8_s32(vdup_n_s32(a)));
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet16c, Packet8us>(const Packet16c& a) {
+  return preinterpret<Packet8us>(pcast<Packet16c, Packet8s>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet8c, Packet8us>(const Packet8c& a) {
+  return preinterpret<Packet8us>(pcast<Packet8c, Packet8s>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet8c, Packet4us>(const Packet8c& a) {
+  return preinterpret<Packet4us>(pcast<Packet8c, Packet4s>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet4c, Packet4us>(const Packet4c& a) {
+  return preinterpret<Packet4us>(pcast<Packet4c, Packet4s>(a));
+}
+
+//==============================================================================
+// pcast, SrcType = uint8_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::uint8_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet16uc, Packet4f>(const Packet16uc& a) {
+  // Discard all but first 4 bytes.
+  return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a)))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4uc, Packet4f>(const Packet4uc& a) {
+  return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(a))))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet8uc, Packet2f>(const Packet8uc& a) {
+  // Discard all but first 2 bytes.
+  return vcvt_f32_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(a)))));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet16uc, Packet2ul>(const Packet16uc& a) {
+  // Discard all but first two bytes.
+  return vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))))));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet16uc, Packet2l>(const Packet16uc& a) {
+  return preinterpret<Packet2l>(pcast<Packet16uc, Packet2ul>(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet16uc, Packet4ui>(const Packet16uc& a) {
+  // Discard all but first 4 bytes.
+  return vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(a))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet8uc, Packet4ui>(const Packet8uc& a) {
+  return vmovl_u16(vget_low_u16(vmovl_u8(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet8uc, Packet2ui>(const Packet8uc& a) {
+  // Discard all but first 2 bytes.
+  return vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(a))));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet4uc, Packet4ui>(const Packet4uc& a) {
+  return pcast<Packet8uc, Packet4ui>(vreinterpret_u8_u32(vdup_n_u32(a)));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet16uc, Packet4i>(const Packet16uc& a) {
+  return preinterpret<Packet4i>(pcast<Packet16uc, Packet4ui>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet8uc, Packet2i>(const Packet8uc& a) {
+  return preinterpret<Packet2i>(pcast<Packet8uc, Packet2ui>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4uc, Packet4i>(const Packet4uc& a) {
+  return preinterpret<Packet4i>(pcast<Packet4uc, Packet4ui>(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet16uc, Packet8us>(const Packet16uc& a) {
+  // Discard second half of input.
+  return vmovl_u8(vget_low_u8(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet8uc, Packet8us>(const Packet8uc& a) {
+  return vmovl_u8(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet4uc, Packet4us>(const Packet4uc& a) {
+  return vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vdup_n_u32(a))));
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet16uc, Packet8s>(const Packet16uc& a) {
+  return preinterpret<Packet8s>(pcast<Packet16uc, Packet8us>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet8uc, Packet8s>(const Packet8uc& a) {
+  return preinterpret<Packet8s>(pcast<Packet8uc, Packet8us>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet4uc, Packet4s>(const Packet4uc& a) {
+  return preinterpret<Packet4s>(pcast<Packet4uc, Packet4us>(a));
+}
+
+//==============================================================================
+// pcast, SrcType = int16_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::int16_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet8s, Packet4f>(const Packet8s& a) {
+  // Discard second half of input.
+  return vcvtq_f32_s32(vmovl_s16(vget_low_s16(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4s, Packet4f>(const Packet4s& a) {
+  return vcvtq_f32_s32(vmovl_s16(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet4s, Packet2f>(const Packet4s& a) {
+  // Discard second half of input.
+  return vcvt_f32_s32(vget_low_s32(vmovl_s16(a)));
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet8s, Packet2l>(const Packet8s& a) {
+  // Discard all but first two values.
+  return vmovl_s32(vget_low_s32(vmovl_s16(vget_low_s16(a))));
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet8s, Packet2ul>(const Packet8s& a) {
+  return preinterpret<Packet2ul>(pcast<Packet8s, Packet2l>(a));
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet8s, Packet4i>(const Packet8s& a) {
+  // Discard second half of input.
+  return vmovl_s16(vget_low_s16(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4s, Packet4i>(const Packet4s& a) {
+  return vmovl_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet4s, Packet2i>(const Packet4s& a) {
+  // Discard second half of input.
+  return vget_low_s32(vmovl_s16(a));
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet8s, Packet4ui>(const Packet8s& a) {
+  return preinterpret<Packet4ui>(pcast<Packet8s, Packet4i>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet4s, Packet4ui>(const Packet4s& a) {
+  return preinterpret<Packet4ui>(pcast<Packet4s, Packet4i>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet4s, Packet2ui>(const Packet4s& a) {
+  return preinterpret<Packet2ui>(pcast<Packet4s, Packet2i>(a));
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet8s, Packet16c>(const Packet8s& a, const Packet8s& b) {
+  return vcombine_s8(vmovn_s16(a), vmovn_s16(b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet8s, Packet8c>(const Packet8s& a) {
+  return vmovn_s16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet4s, Packet8c>(const Packet4s& a, const Packet4s& b) {
+  return vmovn_s16(vcombine_s16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pcast<Packet4s, Packet4c>(const Packet4s& a) {
+  const int8x8_t aa_s8x8 = pcast<Packet4s, Packet8c>(a, a);
+  return vget_lane_s32(vreinterpret_s32_s8(aa_s8x8), 0);
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet8s, Packet16uc>(const Packet8s& a, const Packet8s& b) {
+  return preinterpret<Packet16uc>(pcast<Packet8s, Packet16c>(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet8s, Packet8uc>(const Packet8s& a) {
+  return preinterpret<Packet8uc>(pcast<Packet8s, Packet8c>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet4s, Packet8uc>(const Packet4s& a, const Packet4s& b) {
+  return preinterpret<Packet8uc>(pcast<Packet4s, Packet8c>(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcast<Packet4s, Packet4uc>(const Packet4s& a) {
+  return static_cast<Packet4uc>(pcast<Packet4s, Packet4c>(a));
+}
+
+//==============================================================================
+// pcast, SrcType = uint16_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::uint16_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet8us, Packet4f>(const Packet8us& a) {
+  // Discard second half of input.
+  return vcvtq_f32_u32(vmovl_u16(vget_low_u16(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4us, Packet4f>(const Packet4us& a) {
+  return vcvtq_f32_u32(vmovl_u16(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet4us, Packet2f>(const Packet4us& a) {
+  // Discard second half of input.
+  return vcvt_f32_u32(vget_low_u32(vmovl_u16(a)));
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet8us, Packet2ul>(const Packet8us& a) {
+  // Discard all but first two values.
+  return vmovl_u32(vget_low_u32(vmovl_u16(vget_low_u16(a))));
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet8us, Packet2l>(const Packet8us& a) {
+  return preinterpret<Packet2l>(pcast<Packet8us, Packet2ul>(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet8us, Packet4ui>(const Packet8us& a) {
+  // Discard second half of input.
+  return vmovl_u16(vget_low_u16(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet4us, Packet4ui>(const Packet4us& a) {
+  return vmovl_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet4us, Packet2ui>(const Packet4us& a) {
+  // Discard second half of input.
+  return vget_low_u32(vmovl_u16(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet8us, Packet4i>(const Packet8us& a) {
+  return preinterpret<Packet4i>(pcast<Packet8us, Packet4ui>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4us, Packet4i>(const Packet4us& a) {
+  return preinterpret<Packet4i>(pcast<Packet4us, Packet4ui>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet4us, Packet2i>(const Packet4us& a) {
+  return preinterpret<Packet2i>(pcast<Packet4us, Packet2ui>(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet8us, Packet16uc>(const Packet8us& a, const Packet8us& b) {
+  return vcombine_u8(vmovn_u16(a), vmovn_u16(b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet8us, Packet8uc>(const Packet8us& a) {
+  return vmovn_u16(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet4us, Packet8uc>(const Packet4us& a, const Packet4us& b) {
+  return vmovn_u16(vcombine_u16(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcast<Packet4us, Packet4uc>(const Packet4us& a) {
+  uint8x8_t aa_u8x8 = pcast<Packet4us, Packet8uc>(a, a);
+  return vget_lane_u32(vreinterpret_u32_u8(aa_u8x8), 0);
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet8us, Packet16c>(const Packet8us& a, const Packet8us& b) {
+  return preinterpret<Packet16c>(pcast<Packet8us, Packet16uc>(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet8us, Packet8c>(const Packet8us& a) {
+  return preinterpret<Packet8c>(pcast<Packet8us, Packet8uc>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet4us, Packet8c>(const Packet4us& a, const Packet4us& b) {
+  return preinterpret<Packet8c>(pcast<Packet4us, Packet8uc>(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pcast<Packet4us, Packet4c>(const Packet4us& a) {
+  return static_cast<Packet4c>(pcast<Packet4us, Packet4uc>(a));
+}
+
+//==============================================================================
+// pcast, SrcType = int32_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::int32_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+  return vcvtq_f32_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet2i, Packet2f>(const Packet2i& a) {
+  return vcvt_f32_s32(a);
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet4i, Packet2l>(const Packet4i& a) {
+  // Discard second half of input.
+  return vmovl_s32(vget_low_s32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet2i, Packet2l>(const Packet2i& a) {
+  return vmovl_s32(a);
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet4i, Packet2ul>(const Packet4i& a) {
+  return preinterpret<Packet2ul>(pcast<Packet4i, Packet2l>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet2i, Packet2ul>(const Packet2i& a) {
+  return preinterpret<Packet2ul>(pcast<Packet2i, Packet2l>(a));
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet4i, Packet8s>(const Packet4i& a, const Packet4i& b) {
+  return vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet4i, Packet4s>(const Packet4i& a) {
+  return vmovn_s32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet2i, Packet4s>(const Packet2i& a, const Packet2i& b) {
+  return vmovn_s32(vcombine_s32(a, b));
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet4i, Packet8us>(const Packet4i& a, const Packet4i& b) {
+  return vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(a)), vmovn_u32(vreinterpretq_u32_s32(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet4i, Packet4us>(const Packet4i& a) {
+  return vmovn_u32(vreinterpretq_u32_s32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet2i, Packet4us>(const Packet2i& a, const Packet2i& b) {
+  return vmovn_u32(vreinterpretq_u32_s32(vcombine_s32(a, b)));
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet4i, Packet16c>(const Packet4i& a, const Packet4i& b, const Packet4i& c,
+                                                         const Packet4i& d) {
+  const int16x8_t ab_s16 = pcast<Packet4i, Packet8s>(a, b);
+  const int16x8_t cd_s16 = pcast<Packet4i, Packet8s>(c, d);
+  return vcombine_s8(vmovn_s16(ab_s16), vmovn_s16(cd_s16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet4i, Packet8c>(const Packet4i& a, const Packet4i& b) {
+  const int16x8_t ab_s16 = pcast<Packet4i, Packet8s>(a, b);
+  return vmovn_s16(ab_s16);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet2i, Packet8c>(const Packet2i& a, const Packet2i& b, const Packet2i& c,
+                                                       const Packet2i& d) {
+  const int16x4_t ab_s16 = vmovn_s32(vcombine_s32(a, b));
+  const int16x4_t cd_s16 = vmovn_s32(vcombine_s32(c, d));
+  return vmovn_s16(vcombine_s16(ab_s16, cd_s16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pcast<Packet4i, Packet4c>(const Packet4i& a) {
+  const int16x4_t a_s16x4 = vmovn_s32(a);
+  const int16x8_t aa_s16x8 = vcombine_s16(a_s16x4, a_s16x4);
+  const int8x8_t aa_s8x8 = vmovn_s16(aa_s16x8);
+  return vget_lane_s32(vreinterpret_s32_s8(aa_s8x8), 0);
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet4i, Packet16uc>(const Packet4i& a, const Packet4i& b, const Packet4i& c,
+                                                           const Packet4i& d) {
+  return preinterpret<Packet16uc>(pcast<Packet4i, Packet16c>(a, b, c, d));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet4i, Packet8uc>(const Packet4i& a, const Packet4i& b) {
+  return preinterpret<Packet8uc>(pcast<Packet4i, Packet8c>(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet2i, Packet8uc>(const Packet2i& a, const Packet2i& b, const Packet2i& c,
+                                                         const Packet2i& d) {
+  return preinterpret<Packet8uc>(pcast<Packet2i, Packet8c>(a, b, c, d));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcast<Packet4i, Packet4uc>(const Packet4i& a) {
+  return static_cast<Packet4uc>(pcast<Packet4i, Packet4c>(a));
+}
+
+//==============================================================================
+// pcast, SrcType = uint32_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::uint32_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4ui, Packet4f>(const Packet4ui& a) {
+  return vcvtq_f32_u32(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet2ui, Packet2f>(const Packet2ui& a) {
+  return vcvt_f32_u32(a);
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet4ui, Packet2ul>(const Packet4ui& a) {
+  // Discard second half of input.
+  return vmovl_u32(vget_low_u32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet2ui, Packet2ul>(const Packet2ui& a) {
+  return vmovl_u32(a);
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet4ui, Packet2l>(const Packet4ui& a) {
+  return preinterpret<Packet2l>(pcast<Packet4ui, Packet2ul>(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet2ui, Packet2l>(const Packet2ui& a) {
+  return preinterpret<Packet2l>(pcast<Packet2ui, Packet2ul>(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet4ui, Packet8us>(const Packet4ui& a, const Packet4ui& b) {
+  return vcombine_u16(vmovn_u32(a), vmovn_u32(b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet2ui, Packet4us>(const Packet2ui& a, const Packet2ui& b) {
+  return vmovn_u32(vcombine_u32(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet4ui, Packet4us>(const Packet4ui& a) {
+  return vmovn_u32(a);
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet4ui, Packet8s>(const Packet4ui& a, const Packet4ui& b) {
+  return preinterpret<Packet8s>(pcast<Packet4ui, Packet8us>(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet2ui, Packet4s>(const Packet2ui& a, const Packet2ui& b) {
+  return preinterpret<Packet4s>(pcast<Packet2ui, Packet4us>(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet4ui, Packet4s>(const Packet4ui& a) {
+  return preinterpret<Packet4s>(pcast<Packet4ui, Packet4us>(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet4ui, Packet16uc>(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c,
+                                                            const Packet4ui& d) {
+  const uint16x8_t ab_u16 = vcombine_u16(vmovn_u32(a), vmovn_u32(b));
+  const uint16x8_t cd_u16 = vcombine_u16(vmovn_u32(c), vmovn_u32(d));
+  return vcombine_u8(vmovn_u16(ab_u16), vmovn_u16(cd_u16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet4ui, Packet8uc>(const Packet4ui& a, const Packet4ui& b) {
+  const uint16x8_t ab_u16 = vcombine_u16(vmovn_u32(a), vmovn_u32(b));
+  return vmovn_u16(ab_u16);
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet2ui, Packet8uc>(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c,
+                                                          const Packet2ui& d) {
+  const uint16x4_t ab_u16 = vmovn_u32(vcombine_u32(a, b));
+  const uint16x4_t cd_u16 = vmovn_u32(vcombine_u32(c, d));
+  return vmovn_u16(vcombine_u16(ab_u16, cd_u16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcast<Packet4ui, Packet4uc>(const Packet4ui& a) {
+  const uint16x4_t a_u16x4 = vmovn_u32(a);
+  const uint16x8_t aa_u16x8 = vcombine_u16(a_u16x4, a_u16x4);
+  const uint8x8_t aa_u8x8 = vmovn_u16(aa_u16x8);
+  return vget_lane_u32(vreinterpret_u32_u8(aa_u8x8), 0);
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet4ui, Packet16c>(const Packet4ui& a, const Packet4ui& b, const Packet4ui& c,
+                                                          const Packet4ui& d) {
+  return preinterpret<Packet16c>(pcast<Packet4ui, Packet16uc>(a, b, c, d));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet4ui, Packet8c>(const Packet4ui& a, const Packet4ui& b) {
+  return preinterpret<Packet8c>(pcast<Packet4ui, Packet8uc>(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet2ui, Packet8c>(const Packet2ui& a, const Packet2ui& b, const Packet2ui& c,
+                                                        const Packet2ui& d) {
+  return preinterpret<Packet8c>(pcast<Packet2ui, Packet8uc>(a, b, c, d));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pcast<Packet4ui, Packet4c>(const Packet4ui& a) {
+  return static_cast<Packet4c>(pcast<Packet4ui, Packet4uc>(a));
+}
+
+//==============================================================================
+// pcast, SrcType = int64_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::int64_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet2l, Packet4f>(const Packet2l& a, const Packet2l& b) {
+#if EIGEN_ARCH_ARM64
+  return vcombine_f32(vcvt_f32_f64(vcvtq_f64_s64(a)), vcvt_f32_f64(vcvtq_f64_s64(b)));
+#else
+  EIGEN_ALIGN_MAX int64_t lvals[4];
+  pstore(lvals, a);
+  pstore(lvals + 2, b);
+  EIGEN_ALIGN_MAX float fvals[4] = {static_cast<float>(lvals[0]), static_cast<float>(lvals[1]),
+                                    static_cast<float>(lvals[2]), static_cast<float>(lvals[3])};
+  return pload<Packet4f>(fvals);
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet2l, Packet2f>(const Packet2l& a) {
+#if EIGEN_ARCH_ARM64
+  return vcvt_f32_f64(vcvtq_f64_s64(a));
+#else
+  EIGEN_ALIGN_MAX int64_t lvals[2];
+  pstore(lvals, a);
+  EIGEN_ALIGN_MAX float fvals[2] = {static_cast<float>(lvals[0]), static_cast<float>(lvals[1])};
+  return pload<Packet2f>(fvals);
+#endif
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet2l, Packet4i>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_s32(vmovn_s64(a), vmovn_s64(b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet2l, Packet2i>(const Packet2l& a) {
+  return vmovn_s64(a);
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet2l, Packet4ui>(const Packet2l& a, const Packet2l& b) {
+  return vcombine_u32(vmovn_u64(vreinterpretq_u64_s64(a)), vmovn_u64(vreinterpretq_u64_s64(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet2l, Packet2ui>(const Packet2l& a) {
+  return vmovn_u64(vreinterpretq_u64_s64(a));
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet2l, Packet8s>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                       const Packet2l& d) {
+  const int32x4_t ab_s32 = pcast<Packet2l, Packet4i>(a, b);
+  const int32x4_t cd_s32 = pcast<Packet2l, Packet4i>(c, d);
+  return vcombine_s16(vmovn_s32(ab_s32), vmovn_s32(cd_s32));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet2l, Packet4s>(const Packet2l& a, const Packet2l& b) {
+  const int32x4_t ab_s32 = pcast<Packet2l, Packet4i>(a, b);
+  return vmovn_s32(ab_s32);
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet2l, Packet8us>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                         const Packet2l& d) {
+  return preinterpret<Packet8us>(pcast<Packet2l, Packet8s>(a, b, c, d));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet2l, Packet4us>(const Packet2l& a, const Packet2l& b) {
+  return preinterpret<Packet4us>(pcast<Packet2l, Packet4s>(a, b));
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet2l, Packet16c>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                         const Packet2l& d, const Packet2l& e, const Packet2l& f,
+                                                         const Packet2l& g, const Packet2l& h) {
+  const int16x8_t abcd_s16 = pcast<Packet2l, Packet8s>(a, b, c, d);
+  const int16x8_t efgh_s16 = pcast<Packet2l, Packet8s>(e, f, g, h);
+  return vcombine_s8(vmovn_s16(abcd_s16), vmovn_s16(efgh_s16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet2l, Packet8c>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                       const Packet2l& d) {
+  const int16x8_t abcd_s16 = pcast<Packet2l, Packet8s>(a, b, c, d);
+  return vmovn_s16(abcd_s16);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pcast<Packet2l, Packet4c>(const Packet2l& a, const Packet2l& b) {
+  const int16x4_t ab_s16 = pcast<Packet2l, Packet4s>(a, b);
+  const int16x8_t abab_s16 = vcombine_s16(ab_s16, ab_s16);
+  const int8x8_t abab_s8 = vmovn_s16(abab_s16);
+  return vget_lane_s32(vreinterpret_s32_s8(abab_s8), 0);
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet2l, Packet16uc>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                           const Packet2l& d, const Packet2l& e, const Packet2l& f,
+                                                           const Packet2l& g, const Packet2l& h) {
+  const uint16x8_t abcd_u16 = pcast<Packet2l, Packet8us>(a, b, c, d);
+  const uint16x8_t efgh_u16 = pcast<Packet2l, Packet8us>(e, f, g, h);
+  return vcombine_u8(vmovn_u16(abcd_u16), vmovn_u16(efgh_u16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet2l, Packet8uc>(const Packet2l& a, const Packet2l& b, const Packet2l& c,
+                                                         const Packet2l& d) {
+  return preinterpret<Packet8uc>(pcast<Packet2l, Packet8c>(a, b, c, d));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcast<Packet2l, Packet4uc>(const Packet2l& a, const Packet2l& b) {
+  return static_cast<Packet4uc>(pcast<Packet2l, Packet4c>(a, b));
+}
+
+//==============================================================================
+// pcast, SrcType = uint64_t
+//==============================================================================
+template <>
+struct type_casting_traits<numext::uint64_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet2ul, Packet4f>(const Packet2ul& a, const Packet2ul& b) {
+#if EIGEN_ARCH_ARM64
+  return vcombine_f32(vcvt_f32_f64(vcvtq_f64_u64(a)), vcvt_f32_f64(vcvtq_f64_u64(b)));
+#else
+  EIGEN_ALIGN_MAX uint64_t uvals[4];
+  pstore(uvals, a);
+  pstore(uvals + 2, b);
+  EIGEN_ALIGN_MAX float fvals[4] = {static_cast<float>(uvals[0]), static_cast<float>(uvals[1]),
+                                    static_cast<float>(uvals[2]), static_cast<float>(uvals[3])};
+  return pload<Packet4f>(fvals);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet2ul, Packet2f>(const Packet2ul& a) {
+#if EIGEN_ARCH_ARM64
+  return vcvt_f32_f64(vcvtq_f64_u64(a));
+#else
+  EIGEN_ALIGN_MAX uint64_t uvals[2];
+  pstore(uvals, a);
+  EIGEN_ALIGN_MAX float fvals[2] = {static_cast<float>(uvals[0]), static_cast<float>(uvals[1])};
+  return pload<Packet2f>(fvals);
+#endif
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet2ul, Packet4ui>(const Packet2ul& a, const Packet2ul& b) {
+  return vcombine_u32(vmovn_u64(a), vmovn_u64(b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet2ul, Packet2ui>(const Packet2ul& a) {
+  return vmovn_u64(a);
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet2ul, Packet4i>(const Packet2ul& a, const Packet2ul& b) {
+  return preinterpret<Packet4i>(pcast<Packet2ul, Packet4ui>(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet2ul, Packet2i>(const Packet2ul& a) {
+  return preinterpret<Packet2i>(pcast<Packet2ul, Packet2ui>(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet2ul, Packet8us>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                          const Packet2ul& d) {
+  const uint16x4_t ab_u16 = vmovn_u32(vcombine_u32(vmovn_u64(a), vmovn_u64(b)));
+  const uint16x4_t cd_u16 = vmovn_u32(vcombine_u32(vmovn_u64(c), vmovn_u64(d)));
+  return vcombine_u16(ab_u16, cd_u16);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet2ul, Packet4us>(const Packet2ul& a, const Packet2ul& b) {
+  return vmovn_u32(vcombine_u32(vmovn_u64(a), vmovn_u64(b)));
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet2ul, Packet8s>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                        const Packet2ul& d) {
+  return preinterpret<Packet8s>(pcast<Packet2ul, Packet8us>(a, b, c, d));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet2ul, Packet4s>(const Packet2ul& a, const Packet2ul& b) {
+  return preinterpret<Packet4s>(pcast<Packet2ul, Packet4us>(a, b));
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet2ul, Packet16uc>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                            const Packet2ul& d, const Packet2ul& e, const Packet2ul& f,
+                                                            const Packet2ul& g, const Packet2ul& h) {
+  const uint16x8_t abcd_u16 = pcast<Packet2ul, Packet8us>(a, b, c, d);
+  const uint16x8_t efgh_u16 = pcast<Packet2ul, Packet8us>(e, f, g, h);
+  return vcombine_u8(vmovn_u16(abcd_u16), vmovn_u16(efgh_u16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet2ul, Packet8uc>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                          const Packet2ul& d) {
+  const uint16x8_t abcd_u16 = pcast<Packet2ul, Packet8us>(a, b, c, d);
+  return vmovn_u16(abcd_u16);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcast<Packet2ul, Packet4uc>(const Packet2ul& a, const Packet2ul& b) {
+  const uint16x4_t ab_u16 = pcast<Packet2ul, Packet4us>(a, b);
+  const uint16x8_t abab_u16 = vcombine_u16(ab_u16, ab_u16);
+  const uint8x8_t abab_u8 = vmovn_u16(abab_u16);
+  return vget_lane_u32(vreinterpret_u32_u8(abab_u8), 0);
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet2ul, Packet16c>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                          const Packet2ul& d, const Packet2ul& e, const Packet2ul& f,
+                                                          const Packet2ul& g, const Packet2ul& h) {
+  return preinterpret<Packet16c>(pcast<Packet2ul, Packet16uc>(a, b, c, d, e, f, g, h));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet2ul, Packet8c>(const Packet2ul& a, const Packet2ul& b, const Packet2ul& c,
+                                                        const Packet2ul& d) {
+  return preinterpret<Packet8c>(pcast<Packet2ul, Packet8uc>(a, b, c, d));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pcast<Packet2ul, Packet4c>(const Packet2ul& a, const Packet2ul& b) {
+  return static_cast<Packet4c>(pcast<Packet2ul, Packet4uc>(a, b));
+}
+
+#if EIGEN_ARCH_ARM64
+
+//==============================================================================
+// pcast/preinterpret, Double
+//==============================================================================
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const Packet2l& a) {
+  return Packet2d(vreinterpretq_f64_s64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2ul>(const Packet2ul& a) {
+  return Packet2d(vreinterpretq_f64_u64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2d>(const Packet2d& a) {
+  return Packet2l(vreinterpretq_s64_f64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ul preinterpret<Packet2ul, Packet2d>(const Packet2d& a) {
+  return Packet2ul(vreinterpretq_u64_f64(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a) {
+  return Packet2d(vreinterpretq_f64_s32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
+  return Packet4i(vreinterpretq_s32_f64(a));
+}
+
+template <>
+struct type_casting_traits<double, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
+  return vcombine_f32(vcvt_f32_f64(a), vcvt_f32_f64(b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2f pcast<Packet2d, Packet2f>(const Packet2d& a) {
+  return vcvt_f32_f64(a);
+}
+
+template <>
+struct type_casting_traits<double, numext::int64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet2d, Packet2l>(const Packet2d& a) {
+  return vcvtq_s64_f64(a);
+}
+
+template <>
+struct type_casting_traits<double, numext::uint64_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2ul pcast<Packet2d, Packet2ul>(const Packet2d& a) {
+  return vcvtq_u64_f64(a);
+}
+
+template <>
+struct type_casting_traits<double, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet2d, Packet4i>(const Packet2d& a, const Packet2d& b) {
+  return vcombine_s32(vmovn_s64(vcvtq_s64_f64(a)), vmovn_s64(vcvtq_s64_f64(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2i pcast<Packet2d, Packet2i>(const Packet2d& a) {
+  return vmovn_s64(vcvtq_s64_f64(a));
+}
+
+template <>
+struct type_casting_traits<double, numext::uint32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcast<Packet2d, Packet4ui>(const Packet2d& a, const Packet2d& b) {
+  return vcombine_u32(vmovn_u64(vcvtq_u64_f64(a)), vmovn_u64(vcvtq_u64_f64(b)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2ui pcast<Packet2d, Packet2ui>(const Packet2d& a) {
+  return vmovn_u64(vcvtq_u64_f64(a));
+}
+
+template <>
+struct type_casting_traits<double, numext::int16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8s pcast<Packet2d, Packet8s>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                       const Packet2d& d) {
+  const int32x4_t ab_s32 = pcast<Packet2d, Packet4i>(a, b);
+  const int32x4_t cd_s32 = pcast<Packet2d, Packet4i>(c, d);
+  return vcombine_s16(vmovn_s32(ab_s32), vmovn_s32(cd_s32));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4s pcast<Packet2d, Packet4s>(const Packet2d& a, const Packet2d& b) {
+  const int32x4_t ab_s32 = pcast<Packet2d, Packet4i>(a, b);
+  return vmovn_s32(ab_s32);
+}
+
+template <>
+struct type_casting_traits<double, numext::uint16_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 4, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet8us pcast<Packet2d, Packet8us>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                         const Packet2d& d) {
+  return preinterpret<Packet8us>(pcast<Packet2d, Packet8s>(a, b, c, d));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4us pcast<Packet2d, Packet4us>(const Packet2d& a, const Packet2d& b) {
+  return preinterpret<Packet4us>(pcast<Packet2d, Packet4s>(a, b));
+}
+
+template <>
+struct type_casting_traits<double, numext::int8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16c pcast<Packet2d, Packet16c>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                         const Packet2d& d, const Packet2d& e, const Packet2d& f,
+                                                         const Packet2d& g, const Packet2d& h) {
+  const int16x8_t abcd_s16 = pcast<Packet2d, Packet8s>(a, b, c, d);
+  const int16x8_t efgh_s16 = pcast<Packet2d, Packet8s>(e, f, g, h);
+  return vcombine_s8(vmovn_s16(abcd_s16), vmovn_s16(efgh_s16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8c pcast<Packet2d, Packet8c>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                       const Packet2d& d) {
+  const int16x8_t abcd_s16 = pcast<Packet2d, Packet8s>(a, b, c, d);
+  return vmovn_s16(abcd_s16);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4c pcast<Packet2d, Packet4c>(const Packet2d& a, const Packet2d& b) {
+  const int32x4_t ab_s32 = pcast<Packet2d, Packet4i>(a, b);
+  return pcast<Packet4i, Packet4c>(ab_s32);
+}
+
+template <>
+struct type_casting_traits<double, numext::uint8_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 8, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet16uc pcast<Packet2d, Packet16uc>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                           const Packet2d& d, const Packet2d& e, const Packet2d& f,
+                                                           const Packet2d& g, const Packet2d& h) {
+  const uint16x8_t abcd_u16 = pcast<Packet2d, Packet8us>(a, b, c, d);
+  const uint16x8_t efgh_u16 = pcast<Packet2d, Packet8us>(e, f, g, h);
+  return vcombine_u8(vmovn_u16(abcd_u16), vmovn_u16(efgh_u16));
+}
+template <>
+EIGEN_STRONG_INLINE Packet8uc pcast<Packet2d, Packet8uc>(const Packet2d& a, const Packet2d& b, const Packet2d& c,
+                                                         const Packet2d& d) {
+  return preinterpret<Packet8uc>(pcast<Packet2d, Packet8c>(a, b, c, d));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4uc pcast<Packet2d, Packet4uc>(const Packet2d& a, const Packet2d& b) {
+  return static_cast<Packet4uc>(pcast<Packet2d, Packet4c>(a, b));
+}
+
+template <>
+struct type_casting_traits<float, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
+  // Discard second-half of input.
+  return vcvt_f64_f32(vget_low_f32(a));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet2f, Packet2d>(const Packet2f& a) {
+  return vcvt_f64_f32(a);
+}
+
+template <>
+struct type_casting_traits<numext::int8_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet16c, Packet2d>(const Packet16c& a) {
+  // Discard all but first two values.
+  // MSVC defines most intrinsics as macros, so we need to do this in two lines for portability.
+  Packet2f tmp = pcast<Packet8c, Packet2f>(vget_low_s8(a));
+  return vcvt_f64_f32(tmp);
+}
+
+template <>
+struct type_casting_traits<numext::uint8_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 8 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet16uc, Packet2d>(const Packet16uc& a) {
+  // Discard all but first two values.
+  Packet2f tmp = pcast<Packet8uc, Packet2f>(vget_low_u8(a));
+  return vcvt_f64_f32(tmp);
+}
+
+template <>
+struct type_casting_traits<numext::int16_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet8s, Packet2d>(const Packet8s& a) {
+  // Discard all but first two values.
+  Packet2f tmp = pcast<Packet4s, Packet2f>(vget_low_s16(a));
+  return vcvt_f64_f32(tmp);
+}
+
+template <>
+struct type_casting_traits<numext::uint16_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 4 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet8us, Packet2d>(const Packet8us& a) {
+  // Discard all but first two values.
+  Packet2f tmp = pcast<Packet4us, Packet2f>(vget_low_u16(a));
+  return vcvt_f64_f32(tmp);
+}
+
+template <>
+struct type_casting_traits<numext::int32_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4i, Packet2d>(const Packet4i& a) {
+  // Discard second half of input.
+  return vcvtq_f64_s64(vmovl_s32(vget_low_s32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet2i, Packet2d>(const Packet2i& a) {
+  return vcvtq_f64_s64(vmovl_s32(a));
+}
+
+template <>
+struct type_casting_traits<numext::uint32_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4ui, Packet2d>(const Packet4ui& a) {
+  // Discard second half of input.
+  return vcvtq_f64_u64(vmovl_u32(vget_low_u32(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet2ui, Packet2d>(const Packet2ui& a) {
+  return vcvtq_f64_u64(vmovl_u32(a));
+}
+
+template <>
+struct type_casting_traits<numext::int64_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet2l, Packet2d>(const Packet2l& a) {
+  return vcvtq_f64_s64(a);
+}
+
+template <>
+struct type_casting_traits<numext::uint64_t, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet2ul, Packet2d>(const Packet2ul& a) {
+  return vcvtq_f64_u64(a);
+}
+
+#endif  // EIGEN_ARCH_ARM64
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_TYPE_CASTING_NEON_H
diff --git a/inst/include/Eigen/src/Core/arch/NEON/UnaryFunctors.h b/inst/include/Eigen/src/Core/arch/NEON/UnaryFunctors.h
new file mode 100644
index 00000000..8be5bb09
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/NEON/UnaryFunctors.h
@@ -0,0 +1,57 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NEON_UNARY_FUNCTORS_H
+#define EIGEN_NEON_UNARY_FUNCTORS_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+/** \internal
+ * \brief Template specialization of the logistic function for Eigen::half.
+ */
+template <>
+struct scalar_logistic_op<Eigen::half> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half operator()(const Eigen::half& x) const {
+    // Convert to float and call scalar_logistic_op<float>.
+    const scalar_logistic_op<float> float_op;
+    return Eigen::half(float_op(float(x)));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half packetOp(const Eigen::half& x) const { return this->operator()(x); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf packetOp(const Packet4hf& x) const {
+    const scalar_logistic_op<float> float_op;
+    return vcvt_f16_f32(float_op.packetOp(vcvt_f32_f16(x)));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf packetOp(const Packet8hf& x) const {
+    const scalar_logistic_op<float> float_op;
+    return vcombine_f16(vcvt_f16_f32(float_op.packetOp(vcvt_f32_f16(vget_low_f16(x)))),
+                        vcvt_f16_f32(float_op.packetOp(vcvt_high_f32_f16(x))));
+  }
+};
+
+template <>
+struct functor_traits<scalar_logistic_op<Eigen::half>> {
+  enum {
+    Cost = functor_traits<scalar_logistic_op<float>>::Cost,
+    PacketAccess = functor_traits<scalar_logistic_op<float>>::PacketAccess,
+  };
+};
+#endif  // EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_NEON_UNARY_FUNCTORS_H
diff --git a/inst/include/Eigen/src/Core/arch/SSE/Complex.h b/inst/include/Eigen/src/Core/arch/SSE/Complex.h
index 91bba5e3..f79da7b8 100644
--- a/inst/include/Eigen/src/Core/arch/SSE/Complex.h
+++ b/inst/include/Eigen/src/Core/arch/SSE/Complex.h
@@ -10,433 +10,494 @@
 #ifndef EIGEN_COMPLEX_SSE_H
 #define EIGEN_COMPLEX_SSE_H
 
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
 
 //---------- float ----------
-struct Packet2cf
-{
+struct Packet2cf {
   EIGEN_STRONG_INLINE Packet2cf() {}
   EIGEN_STRONG_INLINE explicit Packet2cf(const __m128& a) : v(a) {}
-  __m128  v;
+  Packet4f v;
 };
 
-template<> struct packet_traits<std::complex<float> >  : default_packet_traits
-{
+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
   typedef Packet2cf type;
+  typedef Packet2cf half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
     size = 2,
 
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
     HasNegate = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
-    HasSetLinear = 0
+    HasSqrt = 1,
+    HasLog = 1,
+    HasExp = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0,
+    HasBlend = 1
   };
 };
-
-template<> struct unpacket_traits<Packet2cf> { typedef std::complex<float> type; enum {size=2}; };
-
-template<> EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_add_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_sub_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a)
-{
-  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
-  return Packet2cf(_mm_xor_ps(a.v,mask));
-}
-template<> EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a)
-{
-  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
-  return Packet2cf(_mm_xor_ps(a.v,mask));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  // TODO optimize it for SSE3 and 4
-  #ifdef EIGEN_VECTORIZE_SSE3
-  return Packet2cf(_mm_addsub_ps(_mm_mul_ps(_mm_moveldup_ps(a.v), b.v),
-                                 _mm_mul_ps(_mm_movehdup_ps(a.v),
-                                            vec4f_swizzle1(b.v, 1, 0, 3, 2))));
-//   return Packet2cf(_mm_addsub_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
-//                                  _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
-//                                             vec4f_swizzle1(b.v, 1, 0, 3, 2))));
-  #else
-  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x00000000,0x80000000,0x00000000));
-  return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
-                              _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
-                                                    vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
-  #endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf pand   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_and_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf por    <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_or_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pxor   <Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_xor_ps(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) { return Packet2cf(_mm_andnot_ps(a.v,b.v)); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf pload <Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>(&numext::real_ref(*from))); }
-template<> EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) { EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>(&numext::real_ref(*from))); }
-
-template<> EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>&  from)
-{
-  Packet2cf res;
-#if EIGEN_GNUC_AT_MOST(4,2)
-  // Workaround annoying "may be used uninitialized in this function" warning with gcc 4.2
-  res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast<const __m64*>(&from));
-#elif EIGEN_GNUC_AT_LEAST(4,6)
-  // Suppress annoying "may be used uninitialized in this function" warning with gcc >= 4.6
-  #pragma GCC diagnostic push
-  #pragma GCC diagnostic ignored "-Wuninitialized"
-  res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
-  #pragma GCC diagnostic pop
-#else
-  res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
 #endif
-  return Packet2cf(_mm_movelh_ps(res.v,res.v));
-}
-
-template<> EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) { return pset1<Packet2cf>(*from); }
 
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_ALIGNED_STORE pstore(&numext::real_ref(*to), from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float> *   to, const Packet2cf& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(&numext::real_ref(*to), from.v); }
+template <>
+struct unpacket_traits<Packet2cf> {
+  typedef std::complex<float> type;
+  typedef Packet2cf half;
+  typedef Packet4f as_real;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(_mm_add_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(_mm_sub_ps(a.v, b.v));
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<float>  pfirst<Packet2cf>(const Packet2cf& a)
-{
-  #if EIGEN_GNUC_AT_MOST(4,3)
-  // Workaround gcc 4.2 ICE - this is not performance wise ideal, but who cares...
-  // This workaround also fix invalid code generation with gcc 4.3
-  EIGEN_ALIGN16 std::complex<float> res[2];
-  _mm_store_ps((float*)res, a.v);
-  return res[0];
-  #else
-  std::complex<float> res;
-  _mm_storel_pi((__m64*)&res, a.v);
-  return res;
-  #endif
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
+  return Packet2cf(_mm_xor_ps(a.v, mask));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
+  const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000, 0x80000000, 0x00000000, 0x80000000));
+  return Packet2cf(_mm_xor_ps(a.v, mask));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) { return Packet2cf(_mm_castpd_ps(preverse(_mm_castps_pd(a.v)))); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) {
+#ifdef EIGEN_VECTORIZE_SSE3
+  __m128 tmp1 = _mm_mul_ps(_mm_movehdup_ps(a.v), vec4f_swizzle1(b.v, 1, 0, 3, 2));
+  __m128 tmp2 = _mm_moveldup_ps(a.v);
+#else
+  __m128 tmp1 = _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3), vec4f_swizzle1(b.v, 1, 0, 3, 2));
+  __m128 tmp2 = vec4f_swizzle1(a.v, 0, 0, 2, 2);
+#endif
+#ifdef EIGEN_VECTORIZE_FMA
+  __m128 result = _mm_fmaddsub_ps(tmp2, b.v, tmp1);
+#else
+#ifdef EIGEN_VECTORIZE_SSE3
+  __m128 result = _mm_addsub_ps(_mm_mul_ps(tmp2, b.v), tmp1);
+#else
+  const __m128 mask = _mm_setr_ps(-0.0f, 0.0f, -0.0f, 0.0f);
+  __m128 result = _mm_add_ps(_mm_mul_ps(tmp2, b.v), _mm_xor_ps(tmp1, mask));
+#endif
+#endif
+  return Packet2cf(result);
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a)
-{
-  return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v,a.v))));
+template <>
+EIGEN_STRONG_INLINE Packet2cf ptrue<Packet2cf>(const Packet2cf& a) {
+  return Packet2cf(ptrue(Packet4f(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(_mm_and_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(_mm_or_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(_mm_xor_ps(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(_mm_andnot_ps(b.v, a.v));
 }
 
-template<> EIGEN_STRONG_INLINE Packet2cf preduxp<Packet2cf>(const Packet2cf* vecs)
-{
-  return Packet2cf(_mm_add_ps(_mm_movelh_ps(vecs[0].v,vecs[1].v), _mm_movehl_ps(vecs[1].v,vecs[0].v)));
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(_mm_load_ps(&numext::real_ref(*from)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(_mm_loadu_ps(&numext::real_ref(*from)));
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a)
-{
-  return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v,a.v))));
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
+  const float re = std::real(from);
+  const float im = std::imag(from);
+  return Packet2cf(_mm_set_ps(im, re, im, re));
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet2cf>
-{
-  static EIGEN_STRONG_INLINE void run(Packet2cf& first, const Packet2cf& second)
-  {
-    if (Offset==1)
-    {
-      first.v = _mm_movehl_ps(first.v, first.v);
-      first.v = _mm_movelh_ps(first.v, second.v);
-    }
-  }
-};
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
+  return pset1<Packet2cf>(*from);
+}
 
-template<> struct conj_helper<Packet2cf, Packet2cf, false,true>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    #ifdef EIGEN_VECTORIZE_SSE3
-    return internal::pmul(a, pconj(b));
-    #else
-    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
-    return Packet2cf(_mm_add_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
-                                _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
-                                           vec4f_swizzle1(b.v, 1, 0, 3, 2))));
-    #endif
-  }
-};
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(&numext::real_ref(*to), from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(&numext::real_ref(*to), from.v);
+}
 
-template<> struct conj_helper<Packet2cf, Packet2cf, true,false>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    #ifdef EIGEN_VECTORIZE_SSE3
-    return internal::pmul(pconj(a), b);
-    #else
-    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
-    return Packet2cf(_mm_add_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v),
-                                _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
-                                                      vec4f_swizzle1(b.v, 1, 0, 3, 2)), mask)));
-    #endif
-  }
-};
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from,
+                                                                           Index stride) {
+  return Packet2cf(_mm_set_ps(std::imag(from[1 * stride]), std::real(from[1 * stride]), std::imag(from[0 * stride]),
+                              std::real(from[0 * stride])));
+}
 
-template<> struct conj_helper<Packet2cf, Packet2cf, true,true>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& a, const Packet2cf& b) const
-  {
-    #ifdef EIGEN_VECTORIZE_SSE3
-    return pconj(internal::pmul(a, b));
-    #else
-    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0x00000000,0x80000000,0x00000000,0x80000000));
-    return Packet2cf(_mm_sub_ps(_mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a.v, 0, 0, 2, 2), b.v), mask),
-                                _mm_mul_ps(vec4f_swizzle1(a.v, 1, 1, 3, 3),
-                                           vec4f_swizzle1(b.v, 1, 0, 3, 2))));
-    #endif
-  }
-};
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from,
+                                                                       Index stride) {
+  to[stride * 0] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 0)),
+                                       _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 1)));
+  to[stride * 1] = std::complex<float>(_mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 2)),
+                                       _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3)));
+}
 
-template<> struct conj_helper<Packet4f, Packet2cf, false,false>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const
-  { return padd(c, pmul(x,y)); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
 
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const
-  { return Packet2cf(Eigen::internal::pmul(x, y.v)); }
-};
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
+  alignas(alignof(__m64)) std::complex<float> res;
+  _mm_storel_pi((__m64*)&res, a.v);
+  return res;
+}
 
-template<> struct conj_helper<Packet2cf, Packet4f, false,false>
-{
-  EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const
-  { return padd(c, pmul(x,y)); }
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
+  return Packet2cf(_mm_castpd_ps(preverse(Packet2d(_mm_castps_pd(a.v)))));
+}
 
-  EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const
-  { return Packet2cf(Eigen::internal::pmul(x.v, y)); }
-};
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
+  return pfirst(Packet2cf(_mm_add_ps(a.v, _mm_movehl_ps(a.v, a.v))));
+}
 
-template<> EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b)
-{
-  // TODO optimize it for SSE3 and 4
-  Packet2cf res = conj_helper<Packet2cf,Packet2cf,false,true>().pmul(a,b);
-  __m128 s = _mm_mul_ps(b.v,b.v);
-  return Packet2cf(_mm_div_ps(res.v,_mm_add_ps(s,_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(s), 0xb1)))));
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
+  return pfirst(pmul(a, Packet2cf(_mm_movehl_ps(a.v, a.v))));
 }
 
-EIGEN_STRONG_INLINE Packet2cf pcplxflip/*<Packet2cf>*/(const Packet2cf& x)
-{
+EIGEN_STRONG_INLINE Packet2cf pcplxflip /* <Packet2cf> */ (const Packet2cf& x) {
   return Packet2cf(vec4f_swizzle1(x.v, 1, 0, 3, 2));
 }
 
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return pdiv_complex(a, b);
+}
 
 //---------- double ----------
-struct Packet1cd
-{
+struct Packet1cd {
   EIGEN_STRONG_INLINE Packet1cd() {}
   EIGEN_STRONG_INLINE explicit Packet1cd(const __m128d& a) : v(a) {}
-  __m128d  v;
+  Packet2d v;
 };
 
-template<> struct packet_traits<std::complex<double> >  : default_packet_traits
-{
+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
   typedef Packet1cd type;
+  typedef Packet1cd half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 0,
     size = 1,
 
-    HasAdd    = 1,
-    HasSub    = 1,
-    HasMul    = 1,
-    HasDiv    = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
     HasNegate = 1,
-    HasAbs    = 0,
-    HasAbs2   = 0,
-    HasMin    = 0,
-    HasMax    = 0,
+    HasSqrt = 1,
+    HasLog = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
     HasSetLinear = 0
   };
 };
+#endif
+
+template <>
+struct unpacket_traits<Packet1cd> {
+  typedef std::complex<double> type;
+  typedef Packet1cd half;
+  typedef Packet2d as_real;
+  enum {
+    size = 1,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
 
-template<> struct unpacket_traits<Packet1cd> { typedef std::complex<double> type; enum {size=1}; };
-
-template<> EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_add_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_sub_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) { return Packet1cd(pnegate(a.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a)
-{
-  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
-  return Packet1cd(_mm_xor_pd(a.v,mask));
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
-  // TODO optimize it for SSE3 and 4
-  #ifdef EIGEN_VECTORIZE_SSE3
-  return Packet1cd(_mm_addsub_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
-                                 _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
-                                            vec2d_swizzle1(b.v, 1, 0))));
-  #else
-  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
-  return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
-                              _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
-                                                    vec2d_swizzle1(b.v, 1, 0)), mask)));
-  #endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet1cd pand   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_and_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd por    <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_or_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pxor   <Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_xor_pd(a.v,b.v)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) { return Packet1cd(_mm_andnot_pd(a.v,b.v)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(_mm_add_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(_mm_sub_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+  return Packet1cd(pnegate(Packet2d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+  const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000, 0x0, 0x0, 0x0));
+  return Packet1cd(_mm_xor_pd(a.v, mask));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) {
+  __m128d tmp1 = _mm_mul_pd(_mm_unpackhi_pd(a.v, a.v), vec2d_swizzle1(b.v, 1, 0));
+#ifdef EIGEN_VECTORIZE_SSE3
+  __m128d tmp2 = _mm_movedup_pd(a.v);
+#else
+  __m128d tmp2 = _mm_unpacklo_pd(a.v, a.v);
+#endif
+#ifdef EIGEN_VECTORIZE_FMA
+  __m128d result = _mm_fmaddsub_pd(tmp2, b.v, tmp1);
+#else
+#ifdef EIGEN_VECTORIZE_SSE3
+  __m128d result = _mm_addsub_pd(_mm_mul_pd(tmp2, b.v), tmp1);
+#else
+  const __m128d mask = _mm_setr_pd(-0.0, 0.0);
+  __m128d result = _mm_add_pd(_mm_mul_pd(tmp2, b.v), _mm_xor_pd(tmp1, mask));
+#endif
+#endif
+  return Packet1cd(result);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd ptrue<Packet1cd>(const Packet1cd& a) {
+  return Packet1cd(ptrue(Packet2d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(_mm_and_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(_mm_or_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(_mm_xor_pd(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(_mm_andnot_pd(b.v, a.v));
+}
 
 // FIXME force unaligned load, this is a temporary fix
-template<> EIGEN_STRONG_INLINE Packet1cd pload <Packet1cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from)
-{ EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from)); }
-template<> EIGEN_STRONG_INLINE Packet1cd pset1<Packet1cd>(const std::complex<double>&  from)
-{ /* here we really have to use unaligned loads :( */ return ploadu<Packet1cd>(&from); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(_mm_load_pd((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(_mm_loadu_pd((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd
+pset1<Packet1cd>(const std::complex<double>& from) { /* here we really have to use unaligned loads :( */
+  return ploadu<Packet1cd>(&from);
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) { return pset1<Packet1cd>(*from); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+  return pset1<Packet1cd>(*from);
+}
 
 // FIXME force unaligned store, this is a temporary fix
-template<> EIGEN_STRONG_INLINE void pstore <std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); }
-template<> EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double> *   to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); }
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd((double*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd((double*)to, from.v);
+}
 
-template<> EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double> *   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<double>  pfirst<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
   EIGEN_ALIGN16 double res[2];
   _mm_store_pd(res, a.v);
-  return std::complex<double>(res[0],res[1]);
+  return std::complex<double>(res[0], res[1]);
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+  return a;
+}
 
-template<> EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a)
-{
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
   return pfirst(a);
 }
 
-template<> EIGEN_STRONG_INLINE Packet1cd preduxp<Packet1cd>(const Packet1cd* vecs)
-{
-  return vecs[0];
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
+  return pfirst(a);
 }
 
-template<> EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a)
-{
-  return pfirst(a);
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return pdiv_complex(a, b);
 }
 
-template<int Offset>
-struct palign_impl<Offset,Packet1cd>
-{
-  static EIGEN_STRONG_INLINE void run(Packet1cd& /*first*/, const Packet1cd& /*second*/)
-  {
-    // FIXME is it sure we never have to align a Packet1cd?
-    // Even though a std::complex<double> has 16 bytes, it is not necessarily aligned on a 16 bytes boundary...
-  }
-};
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /* <Packet1cd> */ (const Packet1cd& x) {
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
 
-template<> struct conj_helper<Packet1cd, Packet1cd, false,true>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    #ifdef EIGEN_VECTORIZE_SSE3
-    return internal::pmul(a, pconj(b));
-    #else
-    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
-    return Packet1cd(_mm_add_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
-                                _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
-                                           vec2d_swizzle1(b.v, 1, 0))));
-    #endif
-  }
-};
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
+  __m128d w1 = _mm_castps_pd(kernel.packet[0].v);
+  __m128d w2 = _mm_castps_pd(kernel.packet[1].v);
 
-template<> struct conj_helper<Packet1cd, Packet1cd, true,false>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    #ifdef EIGEN_VECTORIZE_SSE3
-    return internal::pmul(pconj(a), b);
-    #else
-    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
-    return Packet1cd(_mm_add_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v),
-                                _mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
-                                                      vec2d_swizzle1(b.v, 1, 0)), mask)));
-    #endif
-  }
-};
+  __m128 tmp = _mm_castpd_ps(_mm_unpackhi_pd(w1, w2));
+  kernel.packet[0].v = _mm_castpd_ps(_mm_unpacklo_pd(w1, w2));
+  kernel.packet[1].v = tmp;
+}
 
-template<> struct conj_helper<Packet1cd, Packet1cd, true,true>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(pmul(x,y),c); }
-
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& a, const Packet1cd& b) const
-  {
-    #ifdef EIGEN_VECTORIZE_SSE3
-    return pconj(internal::pmul(a, b));
-    #else
-    const __m128d mask = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
-    return Packet1cd(_mm_sub_pd(_mm_xor_pd(_mm_mul_pd(vec2d_swizzle1(a.v, 0, 0), b.v), mask),
-                                _mm_mul_pd(vec2d_swizzle1(a.v, 1, 1),
-                                           vec2d_swizzle1(b.v, 1, 0))));
-    #endif
-  }
-};
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+  __m128 eq = _mm_cmpeq_ps(a.v, b.v);
+  return Packet2cf(pand<Packet4f>(eq, vec4f_swizzle1(eq, 1, 0, 3, 2)));
+}
 
-template<> struct conj_helper<Packet2d, Packet1cd, false,false>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const
-  { return padd(c, pmul(x,y)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
+  __m128d eq = _mm_cmpeq_pd(a.v, b.v);
+  return Packet1cd(pand<Packet2d>(eq, vec2d_swizzle1(eq, 1, 0)));
+}
 
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const
-  { return Packet1cd(Eigen::internal::pmul(x, y.v)); }
-};
+template <>
+EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
+                                     const Packet2cf& elsePacket) {
+  __m128d result = pblend<Packet2d>(ifPacket, _mm_castps_pd(thenPacket.v), _mm_castps_pd(elsePacket.v));
+  return Packet2cf(_mm_castpd_ps(result));
+}
 
-template<> struct conj_helper<Packet1cd, Packet2d, false,false>
-{
-  EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const
-  { return padd(c, pmul(x,y)); }
+template <>
+EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
+  return psqrt_complex<Packet1cd>(a);
+}
 
-  EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const
-  { return Packet1cd(Eigen::internal::pmul(x.v, y)); }
-};
+template <>
+EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
+  return psqrt_complex<Packet2cf>(a);
+}
 
-template<> EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b)
-{
-  // TODO optimize it for SSE3 and 4
-  Packet1cd res = conj_helper<Packet1cd,Packet1cd,false,true>().pmul(a,b);
-  __m128d s = _mm_mul_pd(b.v,b.v);
-  return Packet1cd(_mm_div_pd(res.v, _mm_add_pd(s,_mm_shuffle_pd(s, s, 0x1))));
+template <>
+EIGEN_STRONG_INLINE Packet1cd plog<Packet1cd>(const Packet1cd& a) {
+  return plog_complex<Packet1cd>(a);
 }
 
-EIGEN_STRONG_INLINE Packet1cd pcplxflip/*<Packet1cd>*/(const Packet1cd& x)
-{
-  return Packet1cd(preverse(x.v));
+template <>
+EIGEN_STRONG_INLINE Packet2cf plog<Packet2cf>(const Packet2cf& a) {
+  return plog_complex<Packet2cf>(a);
 }
 
-} // end namespace internal
+template <>
+EIGEN_STRONG_INLINE Packet2cf pexp<Packet2cf>(const Packet2cf& a) {
+  return pexp_complex<Packet2cf>(a);
+}
 
-} // end namespace Eigen
+#ifdef EIGEN_VECTORIZE_FMA
+// std::complex<float>
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) {
+  __m128 a_odd = _mm_movehdup_ps(a.v);
+  __m128 a_even = _mm_moveldup_ps(a.v);
+  __m128 b_swap = _mm_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
+  __m128 result = _mm_fmaddsub_ps(a_even, b.v, _mm_fmaddsub_ps(a_odd, b_swap, c.v));
+  return Packet2cf(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmsub(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) {
+  __m128 a_odd = _mm_movehdup_ps(a.v);
+  __m128 a_even = _mm_moveldup_ps(a.v);
+  __m128 b_swap = _mm_permute_ps(b.v, _MM_SHUFFLE(2, 3, 0, 1));
+  __m128 result = _mm_fmaddsub_ps(a_even, b.v, _mm_fmsubadd_ps(a_odd, b_swap, c.v));
+  return Packet2cf(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnmadd(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) {
+  return pnegate(pmsub(a, b, c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnmsub(const Packet2cf& a, const Packet2cf& b, const Packet2cf& c) {
+  return pnegate(pmadd(a, b, c));
+}
+// std::complex<double>
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) {
+  __m128d a_odd = _mm_permute_pd(a.v, 0x3);
+  __m128d a_even = _mm_movedup_pd(a.v);
+  __m128d b_swap = _mm_permute_pd(b.v, 0x1);
+  __m128d result = _mm_fmaddsub_pd(a_even, b.v, _mm_fmaddsub_pd(a_odd, b_swap, c.v));
+  return Packet1cd(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmsub(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) {
+  __m128d a_odd = _mm_permute_pd(a.v, 0x3);
+  __m128d a_even = _mm_movedup_pd(a.v);
+  __m128d b_swap = _mm_permute_pd(b.v, 0x1);
+  __m128d result = _mm_fmaddsub_pd(a_even, b.v, _mm_fmsubadd_pd(a_odd, b_swap, c.v));
+  return Packet1cd(result);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnmadd(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) {
+  return pnegate(pmsub(a, b, c));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnmsub(const Packet1cd& a, const Packet1cd& b, const Packet1cd& c) {
+  return pnegate(pmadd(a, b, c));
+}
+#endif
+}  // end namespace internal
+}  // end namespace Eigen
 
-#endif // EIGEN_COMPLEX_SSE_H
+#endif  // EIGEN_COMPLEX_SSE_H
diff --git a/inst/include/Eigen/src/Core/arch/SSE/MathFunctions.h b/inst/include/Eigen/src/Core/arch/SSE/MathFunctions.h
index 2b07168a..30c1f07a 100644
--- a/inst/include/Eigen/src/Core/arch/SSE/MathFunctions.h
+++ b/inst/include/Eigen/src/Core/arch/SSE/MathFunctions.h
@@ -8,468 +8,81 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-/* The sin, cos, exp, and log functions of this file come from
+/* The sin and cos and functions of this file come from
  * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
  */
 
 #ifndef EIGEN_MATH_FUNCTIONS_SSE_H
 #define EIGEN_MATH_FUNCTIONS_SSE_H
 
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
 
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f plog<Packet4f>(const Packet4f& _x)
-{
-  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
-
-  /* the smallest non denormalized float number */
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos,  0x00800000);
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf,     0xff800000);//-1.f/0.f);
-  
-  /* natural logarithm computed for 4 simultaneous float
-    return NaN for x <= 0
-  */
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, - 1.1514610310E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, - 1.2420140846E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, + 1.4249322787E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, - 1.6668057665E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, + 2.0000714765E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, - 2.4999993993E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, + 3.3333331174E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
-
-
-  Packet4i emm0;
-
-  Packet4f invalid_mask = _mm_cmpnge_ps(x, _mm_setzero_ps()); // not greater equal is true if x is NaN
-  Packet4f iszero_mask = _mm_cmpeq_ps(x, _mm_setzero_ps());
-
-  x = pmax(x, p4f_min_norm_pos);  /* cut off denormalized stuff */
-  emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
-
-  /* keep only the fractional part */
-  x = _mm_and_ps(x, p4f_inv_mant_mask);
-  x = _mm_or_ps(x, p4f_half);
-
-  emm0 = _mm_sub_epi32(emm0, p4i_0x7f);
-  Packet4f e = padd(_mm_cvtepi32_ps(emm0), p4f_1);
-
-  /* part2:
-     if( x < SQRTHF ) {
-       e -= 1;
-       x = x + x - 1.0;
-     } else { x = x - 1.0; }
-  */
-  Packet4f mask = _mm_cmplt_ps(x, p4f_cephes_SQRTHF);
-  Packet4f tmp = _mm_and_ps(x, mask);
-  x = psub(x, p4f_1);
-  e = psub(e, _mm_and_ps(p4f_1, mask));
-  x = padd(x, tmp);
-
-  Packet4f x2 = pmul(x,x);
-  Packet4f x3 = pmul(x2,x);
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_FLOAT(Packet4f)
+EIGEN_INSTANTIATE_GENERIC_MATH_FUNCS_DOUBLE(Packet2d)
 
-  Packet4f y, y1, y2;
-  y  = pmadd(p4f_cephes_log_p0, x, p4f_cephes_log_p1);
-  y1 = pmadd(p4f_cephes_log_p3, x, p4f_cephes_log_p4);
-  y2 = pmadd(p4f_cephes_log_p6, x, p4f_cephes_log_p7);
-  y  = pmadd(y , x, p4f_cephes_log_p2);
-  y1 = pmadd(y1, x, p4f_cephes_log_p5);
-  y2 = pmadd(y2, x, p4f_cephes_log_p8);
-  y = pmadd(y, x3, y1);
-  y = pmadd(y, x3, y2);
-  y = pmul(y, x3);
-
-  y1 = pmul(e, p4f_cephes_log_q1);
-  tmp = pmul(x2, p4f_half);
-  y = padd(y, y1);
-  x = psub(x, tmp);
-  y2 = pmul(e, p4f_cephes_log_q2);
-  x = padd(x, y);
-  x = padd(x, y2);
-  // negative arg will be NAN, 0 will be -INF
-  return _mm_or_ps(_mm_andnot_ps(iszero_mask, _mm_or_ps(x, invalid_mask)),
-                   _mm_and_ps(iszero_mask, p4f_minus_inf));
+// Notice that for newer processors, it is counterproductive to use Newton
+// iteration for square root. In particular, Skylake and Zen2 processors
+// have approximately doubled throughput of the _mm_sqrt_ps instruction
+// compared to their predecessors.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt<Packet4f>(const Packet4f& x) {
+  return _mm_sqrt_ps(x);
 }
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pexp<Packet4f>(const Packet4f& _x)
-{
-  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-  _EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
-
-
-  _EIGEN_DECLARE_CONST_Packet4f(exp_hi,  88.3762626647950f);
-  _EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
-
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
-
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
-
-  Packet4f tmp, fx;
-  Packet4i emm0;
-
-  // clamp x
-  x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
-
-  /* express exp(x) as exp(g + n*log(2)) */
-  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
-
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  fx = _mm_floor_ps(fx);
-#else
-  emm0 = _mm_cvttps_epi32(fx);
-  tmp  = _mm_cvtepi32_ps(emm0);
-  /* if greater, substract 1 */
-  Packet4f mask = _mm_cmpgt_ps(tmp, fx);
-  mask = _mm_and_ps(mask, p4f_1);
-  fx = psub(tmp, mask);
-#endif
-
-  tmp = pmul(fx, p4f_cephes_exp_C1);
-  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
-  x = psub(x, tmp);
-  x = psub(x, z);
-
-  z = pmul(x,x);
-
-  Packet4f y = p4f_cephes_exp_p0;
-  y = pmadd(y, x, p4f_cephes_exp_p1);
-  y = pmadd(y, x, p4f_cephes_exp_p2);
-  y = pmadd(y, x, p4f_cephes_exp_p3);
-  y = pmadd(y, x, p4f_cephes_exp_p4);
-  y = pmadd(y, x, p4f_cephes_exp_p5);
-  y = pmadd(y, z, x);
-  y = padd(y, p4f_1);
-
-  // build 2^n
-  emm0 = _mm_cvttps_epi32(fx);
-  emm0 = _mm_add_epi32(emm0, p4i_0x7f);
-  emm0 = _mm_slli_epi32(emm0, 23);
-  return pmax(pmul(y, Packet4f(_mm_castsi128_ps(emm0))), _x);
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d psqrt<Packet2d>(const Packet2d& x) {
+  return _mm_sqrt_pd(x);
 }
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet2d pexp<Packet2d>(const Packet2d& _x)
-{
-  Packet2d x = _x;
-
-  _EIGEN_DECLARE_CONST_Packet2d(1 , 1.0);
-  _EIGEN_DECLARE_CONST_Packet2d(2 , 2.0);
-  _EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
-
-  _EIGEN_DECLARE_CONST_Packet2d(exp_hi,  709.437);
-  _EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
-
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
-  _EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
-  static const __m128i p4i_1023_0 = _mm_setr_epi32(1023, 1023, 0, 0);
-
-  Packet2d tmp, fx;
-  Packet4i emm0;
-
-  // clamp x
-  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
-  /* express exp(x) as exp(g + n*log(2)) */
-  fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
-
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  fx = _mm_floor_pd(fx);
-#else
-  emm0 = _mm_cvttpd_epi32(fx);
-  tmp  = _mm_cvtepi32_pd(emm0);
-  /* if greater, substract 1 */
-  Packet2d mask = _mm_cmpgt_pd(tmp, fx);
-  mask = _mm_and_pd(mask, p2d_1);
-  fx = psub(tmp, mask);
-#endif
-
-  tmp = pmul(fx, p2d_cephes_exp_C1);
-  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
-  x = psub(x, tmp);
-  x = psub(x, z);
-
-  Packet2d x2 = pmul(x,x);
-
-  Packet2d px = p2d_cephes_exp_p0;
-  px = pmadd(px, x2, p2d_cephes_exp_p1);
-  px = pmadd(px, x2, p2d_cephes_exp_p2);
-  px = pmul (px, x);
-
-  Packet2d qx = p2d_cephes_exp_q0;
-  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
-  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
-  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
-
-  x = pdiv(px,psub(qx,px));
-  x = pmadd(p2d_2,x,p2d_1);
-
-  // build 2^n
-  emm0 = _mm_cvttpd_epi32(fx);
-  emm0 = _mm_add_epi32(emm0, p4i_1023_0);
-  emm0 = _mm_slli_epi32(emm0, 20);
-  emm0 = _mm_shuffle_epi32(emm0, _MM_SHUFFLE(1,2,0,3));
-  return pmax(pmul(x, Packet2d(_mm_castsi128_pd(emm0))), _x);
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet16b psqrt<Packet16b>(const Packet16b& x) {
+  return x;
 }
 
-/* evaluation of 4 sines at onces, using SSE2 intrinsics.
-
-   The code is the exact rewriting of the cephes sinf function.
-   Precision is excellent as long as x < 8192 (I did not bother to
-   take into account the special handling they have for greater values
-   -- it does not return garbage for arguments over 8192, though, but
-   the extra precision is missing).
-
-   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
-   surprising but correct result.
-*/
-
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f psin<Packet4f>(const Packet4f& _x)
-{
-  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-
-  _EIGEN_DECLARE_CONST_Packet4i(1, 1);
-  _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
-  _EIGEN_DECLARE_CONST_Packet4i(2, 2);
-  _EIGEN_DECLARE_CONST_Packet4i(4, 4);
-
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
-
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p1,  8.3321608736E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p0,  2.443315711809948E-005f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
-
-  Packet4f xmm1, xmm2, xmm3, sign_bit, y;
-
-  Packet4i emm0, emm2;
-  sign_bit = x;
-  /* take the absolute value */
-  x = pabs(x);
-
-  /* take the modulo */
-
-  /* extract the sign bit (upper one) */
-  sign_bit = _mm_and_ps(sign_bit, p4f_sign_mask);
-
-  /* scale by 4/Pi */
-  y = pmul(x, p4f_cephes_FOPI);
-
-  /* store the integer part of y in mm0 */
-  emm2 = _mm_cvttps_epi32(y);
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  emm2 = _mm_add_epi32(emm2, p4i_1);
-  emm2 = _mm_and_si128(emm2, p4i_not1);
-  y = _mm_cvtepi32_ps(emm2);
-  /* get the swap sign flag */
-  emm0 = _mm_and_si128(emm2, p4i_4);
-  emm0 = _mm_slli_epi32(emm0, 29);
-  /* get the polynom selection mask
-     there is one polynom for 0 <= x <= Pi/4
-     and another one for Pi/4<x<=Pi/2
-
-     Both branches will be computed.
-  */
-  emm2 = _mm_and_si128(emm2, p4i_2);
-  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-
-  Packet4f swap_sign_bit = _mm_castsi128_ps(emm0);
-  Packet4f poly_mask = _mm_castsi128_ps(emm2);
-  sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = pmul(y, p4f_minus_cephes_DP1);
-  xmm2 = pmul(y, p4f_minus_cephes_DP2);
-  xmm3 = pmul(y, p4f_minus_cephes_DP3);
-  x = padd(x, xmm1);
-  x = padd(x, xmm2);
-  x = padd(x, xmm3);
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = p4f_coscof_p0;
-  Packet4f z = _mm_mul_ps(x,x);
-
-  y = pmadd(y, z, p4f_coscof_p1);
-  y = pmadd(y, z, p4f_coscof_p2);
-  y = pmul(y, z);
-  y = pmul(y, z);
-  Packet4f tmp = pmul(z, p4f_half);
-  y = psub(y, tmp);
-  y = padd(y, p4f_1);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-
-  Packet4f y2 = p4f_sincof_p0;
-  y2 = pmadd(y2, z, p4f_sincof_p1);
-  y2 = pmadd(y2, z, p4f_sincof_p2);
-  y2 = pmul(y2, z);
-  y2 = pmul(y2, x);
-  y2 = padd(y2, x);
-
-  /* select the correct result from the two polynoms */
-  y2 = _mm_and_ps(poly_mask, y2);
-  y = _mm_andnot_ps(poly_mask, y);
-  y = _mm_or_ps(y,y2);
-  /* update the sign */
-  return _mm_xor_ps(y, sign_bit);
+#if EIGEN_FAST_MATH
+// Even on Skylake, using Newton iteration is a win for reciprocal square root.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+  return generic_rsqrt_newton_step<Packet4f, /*Steps=*/1>::run(x, _mm_rsqrt_ps(x));
 }
 
-/* almost the same as psin */
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f pcos<Packet4f>(const Packet4f& _x)
-{
-  Packet4f x = _x;
-  _EIGEN_DECLARE_CONST_Packet4f(1 , 1.0f);
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
-
-  _EIGEN_DECLARE_CONST_Packet4i(1, 1);
-  _EIGEN_DECLARE_CONST_Packet4i(not1, ~1);
-  _EIGEN_DECLARE_CONST_Packet4i(2, 2);
-  _EIGEN_DECLARE_CONST_Packet4i(4, 4);
-
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP1,-0.78515625f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP2, -2.4187564849853515625e-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_cephes_DP3, -3.77489497744594108e-8f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p0, -1.9515295891E-4f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p1,  8.3321608736E-3f);
-  _EIGEN_DECLARE_CONST_Packet4f(sincof_p2, -1.6666654611E-1f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p0,  2.443315711809948E-005f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p1, -1.388731625493765E-003f);
-  _EIGEN_DECLARE_CONST_Packet4f(coscof_p2,  4.166664568298827E-002f);
-  _EIGEN_DECLARE_CONST_Packet4f(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
-
-  Packet4f xmm1, xmm2, xmm3, y;
-  Packet4i emm0, emm2;
-
-  x = pabs(x);
-
-  /* scale by 4/Pi */
-  y = pmul(x, p4f_cephes_FOPI);
-
-  /* get the integer part of y */
-  emm2 = _mm_cvttps_epi32(y);
-  /* j=(j+1) & (~1) (see the cephes sources) */
-  emm2 = _mm_add_epi32(emm2, p4i_1);
-  emm2 = _mm_and_si128(emm2, p4i_not1);
-  y = _mm_cvtepi32_ps(emm2);
-
-  emm2 = _mm_sub_epi32(emm2, p4i_2);
-
-  /* get the swap sign flag */
-  emm0 = _mm_andnot_si128(emm2, p4i_4);
-  emm0 = _mm_slli_epi32(emm0, 29);
-  /* get the polynom selection mask */
-  emm2 = _mm_and_si128(emm2, p4i_2);
-  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
-
-  Packet4f sign_bit = _mm_castsi128_ps(emm0);
-  Packet4f poly_mask = _mm_castsi128_ps(emm2);
-
-  /* The magic pass: "Extended precision modular arithmetic"
-     x = ((x - y * DP1) - y * DP2) - y * DP3; */
-  xmm1 = pmul(y, p4f_minus_cephes_DP1);
-  xmm2 = pmul(y, p4f_minus_cephes_DP2);
-  xmm3 = pmul(y, p4f_minus_cephes_DP3);
-  x = padd(x, xmm1);
-  x = padd(x, xmm2);
-  x = padd(x, xmm3);
-
-  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
-  y = p4f_coscof_p0;
-  Packet4f z = pmul(x,x);
-
-  y = pmadd(y,z,p4f_coscof_p1);
-  y = pmadd(y,z,p4f_coscof_p2);
-  y = pmul(y, z);
-  y = pmul(y, z);
-  Packet4f tmp = _mm_mul_ps(z, p4f_half);
-  y = psub(y, tmp);
-  y = padd(y, p4f_1);
-
-  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */
-  Packet4f y2 = p4f_sincof_p0;
-  y2 = pmadd(y2, z, p4f_sincof_p1);
-  y2 = pmadd(y2, z, p4f_sincof_p2);
-  y2 = pmul(y2, z);
-  y2 = pmadd(y2, x, x);
-
-  /* select the correct result from the two polynoms */
-  y2 = _mm_and_ps(poly_mask, y2);
-  y  = _mm_andnot_ps(poly_mask, y);
-  y  = _mm_or_ps(y,y2);
-
-  /* update the sign */
-  return _mm_xor_ps(y, sign_bit);
+#ifdef EIGEN_VECTORIZE_FMA
+// Trying to speed up reciprocal using Newton-Raphson is counterproductive
+// unless FMA is available. Without FMA pdiv(pset1<Packet>(Scalar(1),a)) is
+// 30% faster.
+template <>
+EIGEN_STRONG_INLINE Packet4f preciprocal<Packet4f>(const Packet4f& x) {
+  return generic_reciprocal_newton_step<Packet4f, /*Steps=*/1>::run(x, _mm_rcp_ps(x));
 }
+#endif
 
-#if EIGEN_FAST_MATH
+#endif
 
-// This is based on Quake3's fast inverse square root.
-// For detail see here: http://www.beyond3d.com/content/articles/8/
-// It lacks 1 (or 2 bits in some rare cases) of precision, and does not handle negative, +inf, or denormalized numbers correctly.
-template<> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED
-Packet4f psqrt<Packet4f>(const Packet4f& _x)
-{
-  Packet4f half = pmul(_x, pset1<Packet4f>(.5f));
+}  // end namespace internal
 
-  /* select only the inverse sqrt of non-zero inputs */
-  Packet4f non_zero_mask = _mm_cmpge_ps(_x, pset1<Packet4f>((std::numeric_limits<float>::min)()));
-  Packet4f x = _mm_and_ps(non_zero_mask, _mm_rsqrt_ps(_x));
+namespace numext {
 
-  x = pmul(x, psub(pset1<Packet4f>(1.5f), pmul(half, pmul(x,x))));
-  return pmul(_x,x);
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float sqrt(const float& x) {
+  return internal::pfirst(internal::Packet4f(_mm_sqrt_ss(_mm_set_ss(x))));
 }
 
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double sqrt(const double& x) {
+#if EIGEN_COMP_GNUC_STRICT
+  // This works around a GCC bug generating poor code for _mm_sqrt_pd
+  // See https://gitlab.com/libeigen/eigen/commit/8dca9f97e38970
+  return internal::pfirst(internal::Packet2d(__builtin_ia32_sqrtsd(_mm_set_sd(x))));
 #else
-
-template<> EIGEN_STRONG_INLINE Packet4f psqrt<Packet4f>(const Packet4f& x) { return _mm_sqrt_ps(x); }
-
+  return internal::pfirst(internal::Packet2d(_mm_sqrt_pd(_mm_set_sd(x))));
 #endif
+}
 
-template<> EIGEN_STRONG_INLINE Packet2d psqrt<Packet2d>(const Packet2d& x) { return _mm_sqrt_pd(x); }
-
-} // end namespace internal
+}  // namespace numext
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_MATH_FUNCTIONS_SSE_H
+#endif  // EIGEN_MATH_FUNCTIONS_SSE_H
diff --git a/inst/include/Eigen/src/Core/arch/SSE/PacketMath.h b/inst/include/Eigen/src/Core/arch/SSE/PacketMath.h
index fc8ae50f..a5e4902c 100644
--- a/inst/include/Eigen/src/Core/arch/SSE/PacketMath.h
+++ b/inst/include/Eigen/src/Core/arch/SSE/PacketMath.h
@@ -10,6 +10,10 @@
 #ifndef EIGEN_PACKET_MATH_SSE_H
 #define EIGEN_PACKET_MATH_SSE_H
 
+#include <cstdint>
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
@@ -18,632 +22,2363 @@ namespace internal {
 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
 #endif
 
-#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
-#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*))
+#if !defined(EIGEN_VECTORIZE_AVX) && !defined(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS)
+// 32 bits =>  8 registers
+// 64 bits => 16 registers
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2 * sizeof(void*))
+#endif
+
+#ifdef EIGEN_VECTORIZE_FMA
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
 #endif
 
-typedef __m128  Packet4f;
-typedef __m128i Packet4i;
+#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW || EIGEN_COMP_LCC) && \
+     (__GXX_ABI_VERSION < 1004)) ||                                                                     \
+    EIGEN_OS_QNX
+// With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot
+// have overloads for both types without linking error.
+// One solution is to increase ABI version using -fabi-version=4 (or greater).
+// Otherwise, we workaround this inconvenience by wrapping 128bit types into the following helper
+// structure:
+typedef eigen_packet_wrapper<__m128> Packet4f;
+typedef eigen_packet_wrapper<__m128d> Packet2d;
+#else
+typedef __m128 Packet4f;
 typedef __m128d Packet2d;
+#endif
 
-template<> struct is_arithmetic<__m128>  { enum { value = true }; };
-template<> struct is_arithmetic<__m128i> { enum { value = true }; };
-template<> struct is_arithmetic<__m128d> { enum { value = true }; };
+typedef eigen_packet_wrapper<__m128i, 0> Packet4i;
+typedef eigen_packet_wrapper<__m128i, 1> Packet16b;
+typedef eigen_packet_wrapper<__m128i, 4> Packet4ui;
+typedef eigen_packet_wrapper<__m128i, 5> Packet2l;
 
-#define vec4f_swizzle1(v,p,q,r,s) \
-  (_mm_castsi128_ps(_mm_shuffle_epi32( _mm_castps_si128(v), ((s)<<6|(r)<<4|(q)<<2|(p)))))
+template <>
+struct is_arithmetic<__m128> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<__m128i> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<__m128d> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<Packet4i> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<Packet2l> {
+  enum { value = true };
+};
+// Note that `Packet4ui` uses the underlying type `__m128i`, which is
+// interpreted as a vector of _signed_ `int32`s, which breaks some arithmetic
+// operations used in `GenericPacketMath.h`.
+template <>
+struct is_arithmetic<Packet4ui> {
+  enum { value = false };
+};
+template <>
+struct is_arithmetic<Packet16b> {
+  enum { value = true };
+};
 
-#define vec4i_swizzle1(v,p,q,r,s) \
-  (_mm_shuffle_epi32( v, ((s)<<6|(r)<<4|(q)<<2|(p))))
+template <int p, int q, int r, int s>
+struct shuffle_mask {
+  enum { mask = (s) << 6 | (r) << 4 | (q) << 2 | (p) };
+};
 
-#define vec2d_swizzle1(v,p,q) \
-  (_mm_castsi128_pd(_mm_shuffle_epi32( _mm_castpd_si128(v), ((q*2+1)<<6|(q*2)<<4|(p*2+1)<<2|(p*2)))))
-  
-#define vec4f_swizzle2(a,b,p,q,r,s) \
-  (_mm_shuffle_ps( (a), (b), ((s)<<6|(r)<<4|(q)<<2|(p))))
+// TODO: change the implementation of all swizzle* ops from macro to template,
+#define vec4f_swizzle1(v, p, q, r, s) \
+  Packet4f(_mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v), (shuffle_mask<p, q, r, s>::mask))))
 
-#define vec4i_swizzle2(a,b,p,q,r,s) \
-  (_mm_castps_si128( (_mm_shuffle_ps( _mm_castsi128_ps(a), _mm_castsi128_ps(b), ((s)<<6|(r)<<4|(q)<<2|(p))))))
+#define vec4i_swizzle1(v, p, q, r, s) Packet4i(_mm_shuffle_epi32(v, (shuffle_mask<p, q, r, s>::mask)))
 
-#define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \
-  const Packet4f p4f_##NAME = pset1<Packet4f>(X)
+#define vec4ui_swizzle1(v, p, q, r, s) Packet4ui(vec4i_swizzle1(v, p, q, r, s))
 
-#define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \
-  const Packet2d p2d_##NAME = pset1<Packet2d>(X)
+#define vec2d_swizzle1(v, p, q) \
+  Packet2d(_mm_castsi128_pd(    \
+      _mm_shuffle_epi32(_mm_castpd_si128(v), (shuffle_mask<2 * p, 2 * p + 1, 2 * q, 2 * q + 1>::mask))))
 
-#define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \
-  const Packet4f p4f_##NAME = _mm_castsi128_ps(pset1<Packet4i>(X))
+#define vec4f_swizzle2(a, b, p, q, r, s) Packet4f(_mm_shuffle_ps((a), (b), (shuffle_mask<p, q, r, s>::mask)))
 
-#define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \
-  const Packet4i p4i_##NAME = pset1<Packet4i>(X)
+#define vec4i_swizzle2(a, b, p, q, r, s) \
+  Packet4i(                              \
+      _mm_castps_si128((_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (shuffle_mask<p, q, r, s>::mask)))))
 
+#define vec4ui_swizzle2(a, b, p, q, r, s) Packet4i(vec4i_swizzle2(a, b, p, q, r, s))
 
-template<> struct packet_traits<float>  : default_packet_traits
-{
+EIGEN_STRONG_INLINE Packet4f vec4f_movelh(const Packet4f& a, const Packet4f& b) {
+  return Packet4f(_mm_movelh_ps(a, b));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_movehl(const Packet4f& a, const Packet4f& b) {
+  return Packet4f(_mm_movehl_ps(a, b));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpacklo(const Packet4f& a, const Packet4f& b) {
+  return Packet4f(_mm_unpacklo_ps(a, b));
+}
+EIGEN_STRONG_INLINE Packet4f vec4f_unpackhi(const Packet4f& a, const Packet4f& b) {
+  return Packet4f(_mm_unpackhi_ps(a, b));
+}
+#define vec4f_duplane(a, p) vec4f_swizzle2(a, a, p, p, p, p)
+
+#define vec2d_swizzle2(a, b, mask) Packet2d(_mm_shuffle_pd(a, b, mask))
+
+EIGEN_STRONG_INLINE Packet2d vec2d_unpacklo(const Packet2d& a, const Packet2d& b) {
+  return Packet2d(_mm_unpacklo_pd(a, b));
+}
+EIGEN_STRONG_INLINE Packet2d vec2d_unpackhi(const Packet2d& a, const Packet2d& b) {
+  return Packet2d(_mm_unpackhi_pd(a, b));
+}
+#define vec2d_duplane(a, p) vec2d_swizzle2(a, a, (p << 1) | p)
+
+#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) const Packet4f p4f_##NAME = pset1<Packet4f>(X)
+
+#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) const Packet2d p2d_##NAME = pset1<Packet2d>(X)
+
+#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) const Packet4f p4f_##NAME = pset1frombits<Packet4f>(X)
+
+#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) const Packet4i p4i_##NAME = pset1<Packet4i>(X)
+
+#define EIGEN_DECLARE_CONST_Packet4ui(NAME, X) const Packet4ui p4ui_##NAME = pset1<Packet4ui>(X)
+
+// Work around lack of extract/cvt for epi64 when compiling for 32-bit.
+#if EIGEN_ARCH_x86_64
+EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_0(const __m128i& a) { return _mm_cvtsi128_si64(a); }
+#ifdef EIGEN_VECTORIZE_SSE4_1
+EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(const __m128i& a) { return _mm_extract_epi64(a, 1); }
+#else
+EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(const __m128i& a) {
+  return _mm_cvtsi128_si64(_mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(a), 0x1)));
+}
+#endif
+#else
+// epi64 instructions are not available.  The following seems to generate the same instructions
+// with -O2 in GCC/Clang.
+EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_0(const __m128i& a) {
+  return numext::bit_cast<int64_t>(_mm_cvtsd_f64(_mm_castsi128_pd(a)));
+}
+EIGEN_ALWAYS_INLINE int64_t _mm_extract_epi64_1(const __m128i& a) {
+  return numext::bit_cast<int64_t>(_mm_cvtsd_f64(_mm_shuffle_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(a), 0x1)));
+}
+#endif
+
+// Use the packet_traits defined in AVX/PacketMath.h instead if we're going
+// to leverage AVX instructions.
+#ifndef EIGEN_VECTORIZE_AVX
+template <>
+struct packet_traits<float> : default_packet_traits {
   typedef Packet4f type;
+  typedef Packet4f half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4,
-
-    HasDiv  = 1,
-    HasSin  = EIGEN_FAST_MATH,
-    HasCos  = EIGEN_FAST_MATH,
-    HasLog  = 1,
-    HasExp  = 1,
-    HasSqrt = 1
+    size = 4,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasReciprocal = EIGEN_FAST_MATH,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasACos = 1,
+    HasASin = 1,
+    HasATan = 1,
+    HasATanh = 1,
+    HasLog = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasNdtri = 1,
+    HasExp = 1,
+    HasPow = 1,
+    HasBessel = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasCbrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
+    HasBlend = 1,
+    HasSign = 0  // The manually vectorized version is slightly slower for SSE.
   };
 };
-template<> struct packet_traits<double> : default_packet_traits
-{
+template <>
+struct packet_traits<double> : default_packet_traits {
   typedef Packet2d type;
+  typedef Packet2d half;
   enum {
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=2,
-
-    HasDiv  = 1,
-    HasExp  = 1,
-    HasSqrt = 1
+    size = 2,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasTanh = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH,
+    HasExp = 1,
+    HasPow = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasCbrt = 1,
+    HasATan = 1,
+    HasATanh = 1,
+    HasBlend = 1
   };
 };
-template<> struct packet_traits<int>    : default_packet_traits
-{
+template <>
+struct packet_traits<int> : default_packet_traits {
   typedef Packet4i type;
+  typedef Packet4i half;
   enum {
-    // FIXME check the Has*
     Vectorizable = 1,
     AlignedOnScalar = 1,
-    size=4
+    size = 4,
+
+    HasCmp = 1,
+    HasDiv = 1,
+    HasShift = 1,
+    HasBlend = 1
   };
 };
+template <>
+struct packet_traits<uint32_t> : default_packet_traits {
+  typedef Packet4ui type;
+  typedef Packet4ui half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
 
-template<> struct unpacket_traits<Packet4f> { typedef float  type; enum {size=4}; };
-template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2}; };
-template<> struct unpacket_traits<Packet4i> { typedef int    type; enum {size=4}; };
+    HasDiv = 0,
+    HasNegate = 0,
+    HasCmp = 1,
+    HasShift = 1,
+    HasBlend = 1
+  };
+};
+template <>
+struct packet_traits<int64_t> : default_packet_traits {
+  typedef Packet2l type;
+  typedef Packet2l half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
 
-#if defined(_MSC_VER) && (_MSC_VER==1500)
-// Workaround MSVC 9 internal compiler error.
-// TODO: It has been detected with win64 builds (amd64), so let's check whether it also happens in 32bits+SSE mode
-// TODO: let's check whether there does not exist a better fix, like adding a pset0() function. (it crashed on pset1(0)).
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set_ps(from,from,from,from); }
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set_pd(from,from); }
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set_epi32(from,from,from,from); }
-#else
-template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float&  from) { return _mm_set1_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { return _mm_set1_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int&    from) { return _mm_set1_epi32(from); }
+    HasDiv = 0,
+    HasCmp = 1,
+    HasShift = 1,
+    HasBlend = 1
+  };
+};
 #endif
+template <>
+struct packet_traits<bool> : default_packet_traits {
+  typedef Packet16b type;
+  typedef Packet16b half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 16,
+
+    HasCmp = 1,
+    HasShift = 0,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasConj = 0,
+    HasSqrt = 1,
+    HasNegate = 0,
+    HasSign = 0  // Don't try to vectorize psign<bool> = identity.
+  };
+};
+
+template <>
+struct unpacket_traits<Packet4f> {
+  typedef float type;
+  typedef Packet4f half;
+  typedef Packet4i integer_packet;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet2d> {
+  typedef double type;
+  typedef Packet2d half;
+  typedef Packet2l integer_packet;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet2l> {
+  typedef int64_t type;
+  typedef Packet2l half;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet4i> {
+  typedef int type;
+  typedef Packet4i half;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet4ui> {
+  typedef uint32_t type;
+  typedef Packet4ui half;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+template <>
+struct unpacket_traits<Packet16b> {
+  typedef bool type;
+  typedef Packet16b half;
+  enum {
+    size = 16,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
 
-template<> EIGEN_STRONG_INLINE Packet4f plset<float>(const float& a) { return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3,2,1,0)); }
-template<> EIGEN_STRONG_INLINE Packet2d plset<double>(const double& a) { return _mm_add_pd(pset1<Packet2d>(a),_mm_set_pd(1,0)); }
-template<> EIGEN_STRONG_INLINE Packet4i plset<int>(const int& a) { return _mm_add_epi32(pset1<Packet4i>(a),_mm_set_epi32(3,2,1,0)); }
+#ifndef EIGEN_VECTORIZE_AVX
+template <>
+struct scalar_div_cost<float, true> {
+  enum { value = 7 };
+};
+template <>
+struct scalar_div_cost<double, true> {
+  enum { value = 8 };
+};
+#endif
 
-template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_add_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_add_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_add_epi32(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
+  return _mm_set_ps1(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+  return _mm_set1_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pset1<Packet2l>(const int64_t& from) {
+  return _mm_set1_epi64x(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
+  return _mm_set1_epi32(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pset1<Packet4ui>(const uint32_t& from) {
+  return _mm_set1_epi32(numext::bit_cast<int32_t>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b pset1<Packet16b>(const bool& from) {
+  return _mm_set1_epi8(static_cast<char>(from));
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_sub_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_sub_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_sub_epi32(a,b); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(unsigned int from) {
+  return _mm_castsi128_ps(pset1<Packet4i>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
+  return _mm_castsi128_pd(_mm_set1_epi64x(from));
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a)
-{
-  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000,0x80000000,0x80000000,0x80000000));
-  return _mm_xor_ps(a,mask);
+template <>
+EIGEN_STRONG_INLINE Packet4f peven_mask(const Packet4f& /*a*/) {
+  return _mm_castsi128_ps(_mm_set_epi32(0, -1, 0, -1));
 }
-template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a)
-{
-  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0,0x80000000,0x0,0x80000000));
-  return _mm_xor_pd(a,mask);
+template <>
+EIGEN_STRONG_INLINE Packet2l peven_mask(const Packet2l& /*a*/) {
+  return _mm_set_epi32(0, 0, -1, -1);
 }
-template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a)
-{
-  return psub(_mm_setr_epi32(0,0,0,0), a);
+template <>
+EIGEN_STRONG_INLINE Packet4i peven_mask(const Packet4i& /*a*/) {
+  return _mm_set_epi32(0, -1, 0, -1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui peven_mask(const Packet4ui& /*a*/) {
+  return _mm_set_epi32(0, -1, 0, -1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d peven_mask(const Packet2d& /*a*/) {
+  return _mm_castsi128_pd(_mm_set_epi32(0, 0, -1, -1));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; }
-template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; }
+template <>
+EIGEN_STRONG_INLINE Packet4f pzero(const Packet4f& /*a*/) {
+  return _mm_setzero_ps();
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pzero(const Packet2d& /*a*/) {
+  return _mm_setzero_pd();
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pzero(const Packet2l& /*a*/) {
+  return _mm_setzero_si128();
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pzero(const Packet4i& /*a*/) {
+  return _mm_setzero_si128();
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pzero(const Packet4ui& /*a*/) {
+  return _mm_setzero_si128();
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_mul_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_mul_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  return _mm_mullo_epi32(a,b);
-#else
-  // this version is slightly faster than 4 scalar products
-  return vec4i_swizzle1(
-            vec4i_swizzle2(
-              _mm_mul_epu32(a,b),
-              _mm_mul_epu32(vec4i_swizzle1(a,1,0,3,2),
-                            vec4i_swizzle1(b,1,0,3,2)),
-              0,2,0,2),
-            0,2,1,3);
+// GCC generates a shufps instruction for _mm_set1_ps/_mm_load1_ps instead of the more efficient pshufd instruction.
+// However, using inrinsics for pset1 makes gcc to generate crappy code in some cases (see bug 203)
+// Using inline assembly is also not an option because then gcc fails to reorder properly the instructions.
+// Therefore, we introduced the pload1 functions to be used in product kernels for which bug 203 does not apply.
+// Also note that with AVX, we want it to generate a vbroadcastss.
+#if EIGEN_COMP_GNUC_STRICT && (!defined __AVX__)
+template <>
+EIGEN_STRONG_INLINE Packet4f pload1<Packet4f>(const float* from) {
+  return vec4f_swizzle1(_mm_load_ss(from), 0, 0, 0, 0);
+}
 #endif
+
+template <>
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+  return _mm_add_ps(pset1<Packet4f>(a), _mm_set_ps(3, 2, 1, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+  return _mm_add_pd(pset1<Packet2d>(a), _mm_set_pd(1, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l plset<Packet2l>(const int64_t& a) {
+  return _mm_add_epi32(pset1<Packet2l>(a), _mm_set_epi64x(1, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
+  return _mm_add_epi32(pset1<Packet4i>(a), _mm_set_epi32(3, 2, 1, 0));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui plset<Packet4ui>(const uint32_t& a) {
+  return _mm_add_epi32(pset1<Packet4ui>(a), _mm_set_epi32(3, 2, 1, 0));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_div_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_div_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/)
-{ eigen_assert(false && "packet integer division are not supported by SSE");
-  return pset1<Packet4i>(0);
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return _mm_add_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return _mm_add_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l padd<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return _mm_add_epi64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return _mm_add_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui padd<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return _mm_add_epi32(a, b);
 }
 
-// for some weird raisons, it has to be overloaded for packet of integers
-template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return padd(pmul(a,b), c); }
+template <>
+EIGEN_STRONG_INLINE Packet16b padd<Packet16b>(const Packet16b& a, const Packet16b& b) {
+  return _mm_or_si128(a, b);
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_min_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_min_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  return _mm_min_epi32(a,b);
-#else
-  // after some bench, this version *is* faster than a scalar implementation
-  Packet4i mask = _mm_cmplt_epi32(a,b);
-  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
-#endif
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet padds(const Packet& a, const Packet& b);
+template <>
+EIGEN_STRONG_INLINE Packet4f padds<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return _mm_add_ss(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d padds<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return _mm_add_sd(a, b);
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_max_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_max_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b)
-{
-#ifdef EIGEN_VECTORIZE_SSE4_1
-  return _mm_max_epi32(a,b);
-#else
-  // after some bench, this version *is* faster than a scalar implementation
-  Packet4i mask = _mm_cmpgt_epi32(a,b);
-  return _mm_or_si128(_mm_and_si128(mask,a),_mm_andnot_si128(mask,b));
-#endif
-}
-
-template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_and_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_and_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_and_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_or_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_or_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_or_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_xor_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_xor_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_xor_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return _mm_andnot_ps(a,b); }
-template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return _mm_andnot_pd(a,b); }
-template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return _mm_andnot_si128(a,b); }
-
-template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float*   from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from); }
-template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double*  from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from); }
-template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int*     from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const Packet4i*>(from)); }
-
-#if defined(_MSC_VER)
-  template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float*  from) {
-    EIGEN_DEBUG_UNALIGNED_LOAD
-    #if (_MSC_VER==1600)
-    // NOTE Some version of MSVC10 generates bad code when using _mm_loadu_ps
-    // (i.e., it does not generate an unaligned load!!
-    // TODO On most architectures this version should also be faster than a single _mm_loadu_ps
-    // so we could also enable it for MSVC08 but first we have to make this later does not generate crap when doing so...
-    __m128 res = _mm_loadl_pi(_mm_set1_ps(0.0f), (const __m64*)(from));
-    res = _mm_loadh_pi(res, (const __m64*)(from+2));
-    return res;
-    #else
-    return _mm_loadu_ps(from);
-    #endif
-  }
-  template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_pd(from); }
-  template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int*    from) { EIGEN_DEBUG_UNALIGNED_LOAD return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from)); }
-#else
-// Fast unaligned loads. Note that here we cannot directly use intrinsics: this would
-// require pointer casting to incompatible pointer types and leads to invalid code
-// because of the strict aliasing rule. The "dummy" stuff are required to enforce
-// a correct instruction dependency.
-// TODO: do the same for MSVC (ICC is compatible)
-// NOTE: with the code below, MSVC's compiler crashes!
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return _mm_sub_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return _mm_sub_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l psub<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return _mm_sub_epi64(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return _mm_sub_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui psub<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return _mm_sub_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b psub<Packet16b>(const Packet16b& a, const Packet16b& b) {
+  return _mm_xor_si128(a, b);
+}
 
-#if defined(__GNUC__) && defined(__i386__)
-  // bug 195: gcc/i386 emits weird x87 fldl/fstpl instructions for _mm_load_sd
-  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
-#elif defined(__clang__)
-  // bug 201: Segfaults in __mm_loadh_pd with clang 2.8
-  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 1
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b);
+template <>
+EIGEN_STRONG_INLINE Packet4f paddsub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#ifdef EIGEN_VECTORIZE_SSE3
+  return _mm_addsub_ps(a, b);
 #else
-  #define EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS 0
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x0, 0x80000000, 0x0));
+  return padd(a, pxor(mask, b));
 #endif
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from)
-{
-  EIGEN_DEBUG_UNALIGNED_LOAD
-#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
-  return _mm_loadu_ps(from);
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d&, const Packet2d&);
+template <>
+EIGEN_STRONG_INLINE Packet2d paddsub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+#ifdef EIGEN_VECTORIZE_SSE3
+  return _mm_addsub_pd(a, b);
 #else
-  __m128d res;
-  res =  _mm_load_sd((const double*)(from)) ;
-  res =  _mm_loadh_pd(res, (const double*)(from+2)) ;
-  return _mm_castpd_ps(res);
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x0));
+  return padd(a, pxor(mask, b));
 #endif
 }
-template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from)
-{
-  EIGEN_DEBUG_UNALIGNED_LOAD
-#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
-  return _mm_loadu_pd(from);
-#else
-  __m128d res;
-  res = _mm_load_sd(from) ;
-  res = _mm_loadh_pd(res,from+1);
-  return res;
-#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
+  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x80000000, 0x80000000, 0x80000000, 0x80000000));
+  return _mm_xor_ps(a, mask);
 }
-template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from)
-{
-  EIGEN_DEBUG_UNALIGNED_LOAD
-#if EIGEN_AVOID_CUSTOM_UNALIGNED_LOADS
-  return _mm_loadu_si128(reinterpret_cast<const Packet4i*>(from));
-#else
-  __m128d res;
-  res =  _mm_load_sd((const double*)(from)) ;
-  res =  _mm_loadh_pd(res, (const double*)(from+2)) ;
-  return _mm_castpd_si128(res);
-#endif
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
+  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0x0, 0x80000000, 0x0, 0x80000000));
+  return _mm_xor_pd(a, mask);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pnegate(const Packet2l& a) {
+  return psub(pzero(a), a);
 }
-#endif
 
-template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float*   from)
-{
-  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
+  return psub(pzero(a), a);
 }
-template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double*  from)
-{ return pset1<Packet2d>(from[0]); }
-template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int*     from)
-{
-  Packet4i tmp;
-  tmp = _mm_loadl_epi64(reinterpret_cast<const Packet4i*>(from));
-  return vec4i_swizzle1(tmp, 0, 0, 1, 1);
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pconj(const Packet2l& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+  return a;
 }
 
-template<> EIGEN_STRONG_INLINE void pstore<float>(float*   to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from); }
-template<> EIGEN_STRONG_INLINE void pstore<int>(int*       to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<Packet4i*>(to), from); }
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return _mm_mul_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return _mm_mul_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmul<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  // 64-bit mul requires avx512, so do this with 32-bit multiplication
+  __m128i upper32_a = _mm_srli_epi64(a, 32);
+  __m128i upper32_b = _mm_srli_epi64(b, 32);
+
+  // upper * lower
+  __m128i mul1 = _mm_mul_epu32(upper32_a, b);
+  __m128i mul2 = _mm_mul_epu32(upper32_b, a);
+  // Gives us both upper*upper and lower*lower
+  __m128i mul3 = _mm_mul_epu32(a, b);
+
+  __m128i high = _mm_slli_epi64(_mm_add_epi64(mul1, mul2), 32);
+  return _mm_add_epi64(high, mul3);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_mullo_epi32(a, b);
+#else
+  // this version is slightly faster than 4 scalar products
+  return vec4i_swizzle1(
+      vec4i_swizzle2(_mm_mul_epu32(a, b), _mm_mul_epu32(vec4i_swizzle1(a, 1, 0, 3, 2), vec4i_swizzle1(b, 1, 0, 3, 2)),
+                     0, 2, 0, 2),
+      0, 2, 1, 3);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmul<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_mullo_epi32(a, b);
+#else
+  // this version is slightly faster than 4 scalar products
+  return vec4ui_swizzle1(
+      vec4ui_swizzle2(_mm_mul_epu32(a, b),
+                      _mm_mul_epu32(vec4ui_swizzle1(a, 1, 0, 3, 2), vec4ui_swizzle1(b, 1, 0, 3, 2)), 0, 2, 0, 2),
+      0, 2, 1, 3);
+#endif
+}
 
-template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
-  EIGEN_DEBUG_UNALIGNED_STORE
-  _mm_storel_pd((to), from);
-  _mm_storeh_pd((to+1), from);
+template <>
+EIGEN_STRONG_INLINE Packet16b pmul<Packet16b>(const Packet16b& a, const Packet16b& b) {
+  return _mm_and_si128(a, b);
 }
-template<> EIGEN_STRONG_INLINE void pstoreu<float>(float*  to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castps_pd(from)); }
-template<> EIGEN_STRONG_INLINE void pstoreu<int>(int*      to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu(reinterpret_cast<double*>(to), _mm_castsi128_pd(from)); }
 
-// some compilers might be tempted to perform multiple moves instead of using a vector path.
-template<> EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a)
-{
-  Packet4f pa = _mm_set_ss(a);
-  pstore(to, vec4f_swizzle1(pa,0,0,0,0));
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return _mm_div_ps(a, b);
 }
-// some compilers might be tempted to perform multiple moves instead of using a vector path.
-template<> EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a)
-{
-  Packet2d pa = _mm_set_sd(a);
-  pstore(to, vec2d_swizzle1(pa,0,0));
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return _mm_div_pd(a, b);
 }
 
-template<> EIGEN_STRONG_INLINE void prefetch<float>(const float*   addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-template<> EIGEN_STRONG_INLINE void prefetch<int>(const int*       addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); }
-
-#if defined(_MSC_VER) && defined(_WIN64) && !defined(__INTEL_COMPILER)
-// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
-// Direct of the struct members fixed bug #62.
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return a.m128_f32[0]; }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return a.m128d_f64[0]; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
-#elif defined(_MSC_VER) && !defined(__INTEL_COMPILER)
-// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { float x = _mm_cvtss_f32(a); return x; }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double x = _mm_cvtsd_f64(a); return x; }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { int x = _mm_cvtsi128_si32(a); return x; }
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
+#ifdef EIGEN_VECTORIZE_AVX
+  return _mm256_cvttpd_epi32(_mm256_div_pd(_mm256_cvtepi32_pd(a), _mm256_cvtepi32_pd(b)));
 #else
-template<> EIGEN_STRONG_INLINE float  pfirst<Packet4f>(const Packet4f& a) { return _mm_cvtss_f32(a); }
-template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { return _mm_cvtsd_f64(a); }
-template<> EIGEN_STRONG_INLINE int    pfirst<Packet4i>(const Packet4i& a) { return _mm_cvtsi128_si32(a); }
+  __m128i q_lo = _mm_cvttpd_epi32(_mm_div_pd(_mm_cvtepi32_pd(a), _mm_cvtepi32_pd(b)));
+  __m128i q_hi = _mm_cvttpd_epi32(
+      _mm_div_pd(_mm_cvtepi32_pd(vec4i_swizzle1(a, 2, 3, 0, 1)), _mm_cvtepi32_pd(vec4i_swizzle1(b, 2, 3, 0, 1))));
+  return vec4i_swizzle1(_mm_unpacklo_epi32(q_lo, q_hi), 0, 2, 1, 3);
 #endif
+}
 
-template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a)
-{ return _mm_shuffle_ps(a,a,0x1B); }
-template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a)
-{ return _mm_shuffle_pd(a,a,0x1); }
-template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a)
-{ return _mm_shuffle_epi32(a,0x1B); }
-
-
-template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a)
-{
-  const Packet4f mask = _mm_castsi128_ps(_mm_setr_epi32(0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF));
-  return _mm_and_ps(a,mask);
+#ifdef EIGEN_VECTORIZE_FMA
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return _mm_fmadd_ps(a, b, c);
 }
-template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a)
-{
-  const Packet2d mask = _mm_castsi128_pd(_mm_setr_epi32(0xFFFFFFFF,0x7FFFFFFF,0xFFFFFFFF,0x7FFFFFFF));
-  return _mm_and_pd(a,mask);
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return _mm_fmadd_pd(a, b, c);
 }
-template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a)
-{
-  #ifdef EIGEN_VECTORIZE_SSSE3
-  return _mm_abs_epi32(a);
-  #else
-  Packet4i aux = _mm_srai_epi32(a,31);
-  return _mm_sub_epi32(_mm_xor_si128(a,aux),aux);
-  #endif
+template <>
+EIGEN_STRONG_INLINE Packet4f pmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return _mm_fmsub_ps(a, b, c);
 }
-
-EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs)
-{
-  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
-  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
-  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
-  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
+template <>
+EIGEN_STRONG_INLINE Packet2d pmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return _mm_fmsub_pd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return _mm_fnmadd_ps(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return _mm_fnmadd_pd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnmsub(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return _mm_fnmsub_ps(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnmsub(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return _mm_fnmsub_pd(a, b, c);
 }
 
-#ifdef EIGEN_VECTORIZE_SSE3
-// TODO implement SSE2 versions as well as integer versions
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
-  return _mm_hadd_ps(_mm_hadd_ps(vecs[0], vecs[1]),_mm_hadd_ps(vecs[2], vecs[3]));
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet pmadds(const Packet& a, const Packet& b, const Packet& c);
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadds<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return _mm_fmadd_ss(a, b, c);
 }
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
-{
-  return _mm_hadd_pd(vecs[0], vecs[1]);
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadds<Packet2d>(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return _mm_fmadd_sd(a, b, c);
 }
-// SSSE3 version:
-// EIGEN_STRONG_INLINE Packet4i preduxp(const Packet4i* vecs)
-// {
-//   return _mm_hadd_epi32(_mm_hadd_epi32(vecs[0], vecs[1]),_mm_hadd_epi32(vecs[2], vecs[3]));
-// }
+#endif
 
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
-  Packet4f tmp0 = _mm_hadd_ps(a,a);
-  return pfirst(_mm_hadd_ps(tmp0, tmp0));
+#ifdef EIGEN_VECTORIZE_SSE4_1
+template <>
+EIGEN_STRONG_INLINE Packet4f pselect(const Packet4f& mask, const Packet4f& a, const Packet4f& b) {
+  return _mm_blendv_ps(b, a, mask);
 }
 
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) { return pfirst(_mm_hadd_pd(a, a)); }
+template <>
+EIGEN_STRONG_INLINE Packet2l pselect(const Packet2l& mask, const Packet2l& a, const Packet2l& b) {
+  return _mm_castpd_si128(_mm_blendv_pd(_mm_castsi128_pd(b), _mm_castsi128_pd(a), _mm_castsi128_pd(mask)));
+}
 
-// SSSE3 version:
-// EIGEN_STRONG_INLINE float predux(const Packet4i& a)
-// {
-//   Packet4i tmp0 = _mm_hadd_epi32(a,a);
-//   return pfirst(_mm_hadd_epi32(tmp0, tmp0));
-// }
-#else
-// SSE2 versions
-template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a)
-{
-  Packet4f tmp = _mm_add_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_add_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+template <>
+EIGEN_STRONG_INLINE Packet4i pselect(const Packet4i& mask, const Packet4i& a, const Packet4i& b) {
+  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
 }
-template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a)
-{
-  return pfirst(_mm_add_sd(a, _mm_unpackhi_pd(a,a)));
+
+template <>
+EIGEN_STRONG_INLINE Packet4ui pselect(const Packet4ui& mask, const Packet4ui& a, const Packet4ui& b) {
+  return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(a), _mm_castsi128_ps(mask)));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs)
-{
-  Packet4f tmp0, tmp1, tmp2;
-  tmp0 = _mm_unpacklo_ps(vecs[0], vecs[1]);
-  tmp1 = _mm_unpackhi_ps(vecs[0], vecs[1]);
-  tmp2 = _mm_unpackhi_ps(vecs[2], vecs[3]);
-  tmp0 = _mm_add_ps(tmp0, tmp1);
-  tmp1 = _mm_unpacklo_ps(vecs[2], vecs[3]);
-  tmp1 = _mm_add_ps(tmp1, tmp2);
-  tmp2 = _mm_movehl_ps(tmp1, tmp0);
-  tmp0 = _mm_movelh_ps(tmp0, tmp1);
-  return _mm_add_ps(tmp0, tmp2);
-}
-
-template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs)
-{
-  return _mm_add_pd(_mm_unpacklo_pd(vecs[0], vecs[1]), _mm_unpackhi_pd(vecs[0], vecs[1]));
+template <>
+EIGEN_STRONG_INLINE Packet2d pselect(const Packet2d& mask, const Packet2d& a, const Packet2d& b) {
+  return _mm_blendv_pd(b, a, mask);
 }
-#endif  // SSE3
+#endif
 
-template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a)
-{
-  Packet4i tmp = _mm_add_epi32(a, _mm_unpackhi_epi64(a,a));
-  return pfirst(tmp) + pfirst(_mm_shuffle_epi32(tmp, 1));
+template <>
+EIGEN_STRONG_INLINE Packet2l ptrue<Packet2l>(const Packet2l& a) {
+  return _mm_cmpeq_epi32(a, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ptrue<Packet4i>(const Packet4i& a) {
+  return _mm_cmpeq_epi32(a, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b ptrue<Packet16b>(const Packet16b& /*a*/) {
+  return pset1<Packet16b>(true);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ptrue<Packet4f>(const Packet4f& a) {
+  Packet4i b = _mm_castps_si128(a);
+  return _mm_castsi128_ps(_mm_cmpeq_epi32(b, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ptrue<Packet2d>(const Packet2d& a) {
+  Packet4i b = _mm_castpd_si128(a);
+  return _mm_castsi128_pd(_mm_cmpeq_epi32(b, b));
 }
 
-template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs)
-{
-  Packet4i tmp0, tmp1, tmp2;
-  tmp0 = _mm_unpacklo_epi32(vecs[0], vecs[1]);
-  tmp1 = _mm_unpackhi_epi32(vecs[0], vecs[1]);
-  tmp2 = _mm_unpackhi_epi32(vecs[2], vecs[3]);
-  tmp0 = _mm_add_epi32(tmp0, tmp1);
-  tmp1 = _mm_unpacklo_epi32(vecs[2], vecs[3]);
-  tmp1 = _mm_add_epi32(tmp1, tmp2);
-  tmp2 = _mm_unpacklo_epi64(tmp0, tmp1);
-  tmp0 = _mm_unpackhi_epi64(tmp0, tmp1);
-  return _mm_add_epi32(tmp0, tmp2);
-}
-
-// Other reduction functions:
-
-// mul
-template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a)
-{
-  Packet4f tmp = _mm_mul_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_mul_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return _mm_and_ps(a, b);
 }
-template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a)
-{
-  return pfirst(_mm_mul_sd(a, _mm_unpackhi_pd(a,a)));
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return _mm_and_pd(a, b);
 }
-template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a)
-{
-  // after some experiments, it is seems this is the fastest way to implement it
-  // for GCC (eg., reusing pmul is very slow !)
-  // TODO try to call _mm_mul_epu32 directly
-  EIGEN_ALIGN16 int aux[4];
-  pstore(aux, a);
-  return  (aux[0] * aux[1]) * (aux[2] * aux[3]);;
+template <>
+EIGEN_STRONG_INLINE Packet2l pand<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return _mm_and_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return _mm_and_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pand<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return _mm_and_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b pand<Packet16b>(const Packet16b& a, const Packet16b& b) {
+  return _mm_and_si128(a, b);
 }
 
-// min
-template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a)
-{
-  Packet4f tmp = _mm_min_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_min_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return _mm_or_ps(a, b);
 }
-template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a)
-{
-  return pfirst(_mm_min_sd(a, _mm_unpackhi_pd(a,a)));
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return _mm_or_pd(a, b);
 }
-template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a)
-{
-  // after some experiments, it is seems this is the fastest way to implement it
-  // for GCC (eg., it does not like using std::min after the pstore !!)
-  EIGEN_ALIGN16 int aux[4];
-  pstore(aux, a);
-  int aux0 = aux[0]<aux[1] ? aux[0] : aux[1];
-  int aux2 = aux[2]<aux[3] ? aux[2] : aux[3];
-  return aux0<aux2 ? aux0 : aux2;
+template <>
+EIGEN_STRONG_INLINE Packet2l por<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return _mm_or_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return _mm_or_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui por<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return _mm_or_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b por<Packet16b>(const Packet16b& a, const Packet16b& b) {
+  return _mm_or_si128(a, b);
 }
 
-// max
-template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a)
-{
-  Packet4f tmp = _mm_max_ps(a, _mm_movehl_ps(a,a));
-  return pfirst(_mm_max_ss(tmp, _mm_shuffle_ps(tmp,tmp, 1)));
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return _mm_xor_ps(a, b);
 }
-template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a)
-{
-  return pfirst(_mm_max_sd(a, _mm_unpackhi_pd(a,a)));
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return _mm_xor_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pxor<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return _mm_xor_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return _mm_xor_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pxor<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return _mm_xor_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b pxor<Packet16b>(const Packet16b& a, const Packet16b& b) {
+  return _mm_xor_si128(a, b);
 }
-template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a)
-{
-  // after some experiments, it is seems this is the fastest way to implement it
-  // for GCC (eg., it does not like using std::min after the pstore !!)
-  EIGEN_ALIGN16 int aux[4];
-  pstore(aux, a);
-  int aux0 = aux[0]>aux[1] ? aux[0] : aux[1];
-  int aux2 = aux[2]>aux[3] ? aux[2] : aux[3];
-  return aux0>aux2 ? aux0 : aux2;
-}
-
-#if (defined __GNUC__)
-// template <> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f&  a, const Packet4f&  b, const Packet4f&  c)
-// {
-//   Packet4f res = b;
-//   asm("mulps %[a], %[b] \n\taddps %[c], %[b]" : [b] "+x" (res) : [a] "x" (a), [c] "x" (c));
-//   return res;
-// }
-// EIGEN_STRONG_INLINE Packet4i _mm_alignr_epi8(const Packet4i&  a, const Packet4i&  b, const int i)
-// {
-//   Packet4i res = a;
-//   asm("palignr %[i], %[a], %[b] " : [b] "+x" (res) : [a] "x" (a), [i] "i" (i));
-//   return res;
-// }
-#endif
 
-#ifdef EIGEN_VECTORIZE_SSSE3
-// SSSE3 versions
-template<int Offset>
-struct palign_impl<Offset,Packet4f>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
-  {
-    if (Offset!=0)
-      first = _mm_castsi128_ps(_mm_alignr_epi8(_mm_castps_si128(second), _mm_castps_si128(first), Offset*4));
-  }
-};
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return _mm_andnot_ps(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return _mm_andnot_pd(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pandnot<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  return _mm_andnot_si128(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return _mm_andnot_si128(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pandnot<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+  return _mm_andnot_si128(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b pandnot<Packet16b>(const Packet16b& a, const Packet16b& b) {
+  return _mm_andnot_si128(b, a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b pcmp_lt(const Packet16b& a, const Packet16b& b) {
+  return _mm_andnot_si128(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_le(const Packet4f& a, const Packet4f& b) {
+  return _mm_cmple_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt(const Packet4f& a, const Packet4f& b) {
+  return _mm_cmplt_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_lt_or_nan(const Packet4f& a, const Packet4f& b) {
+  return _mm_cmpnge_ps(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pcmp_eq(const Packet4f& a, const Packet4f& b) {
+  return _mm_cmpeq_ps(a, b);
+}
 
-template<int Offset>
-struct palign_impl<Offset,Packet4i>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
-  {
-    if (Offset!=0)
-      first = _mm_alignr_epi8(second,first, Offset*4);
-  }
-};
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_le(const Packet2d& a, const Packet2d& b) {
+  return _mm_cmple_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt(const Packet2d& a, const Packet2d& b) {
+  return _mm_cmplt_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_lt_or_nan(const Packet2d& a, const Packet2d& b) {
+  return _mm_cmpnge_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pcmp_eq(const Packet2d& a, const Packet2d& b) {
+  return _mm_cmpeq_pd(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_lt(const Packet4i& a, const Packet4i& b) {
+  return _mm_cmplt_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_eq(const Packet4i& a, const Packet4i& b) {
+  return _mm_cmpeq_epi32(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pcmp_le(const Packet4i& a, const Packet4i& b) {
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_cmpeq_epi32(a, _mm_min_epi32(a, b));
+#else
+  return por(pcmp_lt(a, b), pcmp_eq(a, b));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcmp_lt(const Packet2l& a, const Packet2l& b) {
+#ifdef EIGEN_VECTORIZE_SSE4_2
+  return _mm_cmpgt_epi64(b, a);
+#else
+  Packet4i eq = pcmp_eq<Packet4i>(Packet4i(a), Packet4i(b));
+  Packet2l hi_eq = Packet2l(_mm_shuffle_epi32(eq, (shuffle_mask<1, 1, 3, 3>::mask)));
+  Packet4i lt = pcmp_lt<Packet4i>(Packet4i(a), Packet4i(b));
+  Packet2l hi_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<1, 1, 3, 3>::mask)));
+  Packet2l lo_lt = Packet2l(_mm_shuffle_epi32(lt, (shuffle_mask<0, 0, 2, 2>::mask)));
+  // return hi(a) < hi(b) || (hi(a) == hi(b) && lo(a) < lo(b))
+  return por(hi_lt, pand(hi_eq, lo_lt));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcmp_eq(const Packet2l& a, const Packet2l& b) {
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_cmpeq_epi64(a, b);
+#else
+  Packet4i tmp = pcmp_eq<Packet4i>(Packet4i(a), Packet4i(b));
+  return Packet2l(pand<Packet4i>(tmp, _mm_shuffle_epi32(tmp, (shuffle_mask<1, 0, 3, 2>::mask))));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pcmp_le(const Packet2l& a, const Packet2l& b) {
+  return por(pcmp_lt(a, b), pcmp_eq(a, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b pcmp_eq(const Packet16b& a, const Packet16b& b) {
+  // Mask out invalid bool bits to avoid UB.
+  const Packet16b kBoolMask = pset1<Packet16b>(true);
+  return _mm_and_si128(_mm_cmpeq_epi8(a, b), kBoolMask);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_eq(const Packet4ui& a, const Packet4ui& b) {
+  return _mm_cmpeq_epi32(a, b);
+}
 
-template<int Offset>
-struct palign_impl<Offset,Packet2d>
-{
-  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
-  {
-    if (Offset==1)
-      first = _mm_castsi128_pd(_mm_alignr_epi8(_mm_castpd_si128(second), _mm_castpd_si128(first), 8));
-  }
-};
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
+// There appears to be a bug in GCC, by which the optimizer may
+// flip the argument order in calls to _mm_min_ps, so we have to
+// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+#ifdef EIGEN_VECTORIZE_AVX
+  Packet4f res;
+  asm("vminps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
 #else
-// SSE2 versions
-template<int Offset>
-struct palign_impl<Offset,Packet4f>
-{
-  static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second)
-  {
-    if (Offset==1)
-    {
-      first = _mm_move_ss(first,second);
-      first = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(first),0x39));
-    }
-    else if (Offset==2)
-    {
-      first = _mm_movehl_ps(first,first);
-      first = _mm_movelh_ps(first,second);
-    }
-    else if (Offset==3)
-    {
-      first = _mm_move_ss(first,second);
-      first = _mm_shuffle_ps(first,second,0x93);
-    }
-  }
+  Packet4f res = b;
+  asm("minps %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
+#endif
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm_min_ps(b, a);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
+// There appears to be a bug in GCC, by which the optimizer may
+// flip the argument order in calls to _mm_min_pd, so we have to
+// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+#ifdef EIGEN_VECTORIZE_AVX
+  Packet2d res;
+  asm("vminpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
+#else
+  Packet2d res = b;
+  asm("minpd %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
+#endif
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::min.
+  return _mm_min_pd(b, a);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmin<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  Packet2l a_lt_mask = pcmp_lt(a, b);
+  return por(pandnot(b, a_lt_mask), pand(a, a_lt_mask));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_min_epi32(a, b);
+#else
+  // after some bench, this version *is* faster than a scalar implementation
+  Packet4i mask = _mm_cmplt_epi32(a, b);
+  return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmin<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_min_epu32(a, b);
+#else
+  return padd((Packet4ui)pmin((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
+                              (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL))),
+              pset1<Packet4ui>(0x80000000UL));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
+// There appears to be a bug in GCC, by which the optimizer may
+// flip the argument order in calls to _mm_max_ps, so we have to
+// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+#ifdef EIGEN_VECTORIZE_AVX
+  Packet4f res;
+  asm("vmaxps %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
+#else
+  Packet4f res = b;
+  asm("maxps %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
+#endif
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm_max_ps(b, a);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+#if EIGEN_GNUC_STRICT_LESS_THAN(6, 3, 0)
+// There appears to be a bug in GCC, by which the optimizer may
+// flip the argument order in calls to _mm_max_pd, so we have to
+// resort to inline ASM here. This is supposed to be fixed in gcc6.3,
+// see also: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=72867
+#ifdef EIGEN_VECTORIZE_AVX
+  Packet2d res;
+  asm("vmaxpd %[a], %[b], %[res]" : [res] "=x"(res) : [a] "x"(a), [b] "x"(b));
+#else
+  Packet2d res = b;
+  asm("maxpd %[a], %[res]" : [res] "+x"(res) : [a] "x"(a));
+#endif
+  return res;
+#else
+  // Arguments are reversed to match NaN propagation behavior of std::max.
+  return _mm_max_pd(b, a);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pmax<Packet2l>(const Packet2l& a, const Packet2l& b) {
+  Packet2l a_lt_mask = pcmp_lt(a, b);
+  return por(pandnot(a, a_lt_mask), pand(b, a_lt_mask));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_max_epi32(a, b);
+#else
+  // after some bench, this version *is* faster than a scalar implementation
+  Packet4i mask = _mm_cmpgt_epi32(a, b);
+  return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pmax<Packet4ui>(const Packet4ui& a, const Packet4ui& b) {
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return _mm_max_epu32(a, b);
+#else
+  return padd((Packet4ui)pmax((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
+                              (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL))),
+              pset1<Packet4ui>(0x80000000UL));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_lt(const Packet4ui& a, const Packet4ui& b) {
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return pxor(pcmp_eq(a, pmax(a, b)), ptrue(a));
+#else
+  return (Packet4ui)pcmp_lt((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
+                            (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL)));
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pcmp_le(const Packet4ui& a, const Packet4ui& b) {
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  return pcmp_eq(a, pmin(a, b));
+#else
+  return (Packet4ui)pcmp_le((Packet4i)psub(a, pset1<Packet4ui>(0x80000000UL)),
+                            (Packet4i)psub(b, pset1<Packet4ui>(0x80000000UL)));
+#endif
+}
+
+template <typename Packet, typename Op>
+EIGEN_STRONG_INLINE Packet pminmax_propagate_numbers(const Packet& a, const Packet& b, Op op) {
+  // In this implementation, we take advantage of the fact that pmin/pmax for SSE
+  // always return a if either a or b is NaN.
+  Packet not_nan_mask_a = pcmp_eq(a, a);
+  Packet m = op(a, b);
+  return pselect<Packet>(not_nan_mask_a, m, b);
+}
+
+template <typename Packet, typename Op>
+EIGEN_STRONG_INLINE Packet pminmax_propagate_nan(const Packet& a, const Packet& b, Op op) {
+  // In this implementation, we take advantage of the fact that pmin/pmax for SSE
+  // always return a if either a or b is NaN.
+  Packet not_nan_mask_a = pcmp_eq(a, a);
+  Packet m = op(b, a);
+  return pselect<Packet>(not_nan_mask_a, m, a);
+}
+
+// Add specializations for min/max with prescribed NaN propagation.
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet4f>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pminmax_propagate_numbers(a, b, pmin<Packet2d>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<PropagateNumbers, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet4f>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<PropagateNumbers, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pminmax_propagate_numbers(a, b, pmax<Packet2d>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet4f>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pminmax_propagate_nan(a, b, pmin<Packet2d>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<PropagateNaN, Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet4f>);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<PropagateNaN, Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return pminmax_propagate_nan(a, b, pmax<Packet2d>);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f psignbit(const Packet4f& a) {
+  return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(a), 31));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d psignbit(const Packet2d& a) {
+  Packet4f tmp = psignbit<Packet4f>(_mm_castpd_ps(a));
+#ifdef EIGEN_VECTORIZE_AVX
+  return _mm_castps_pd(_mm_permute_ps(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
+#else
+  return _mm_castps_pd(_mm_shuffle_ps(tmp, tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
+#endif  // EIGEN_VECTORIZE_AVX
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i psignbit(const Packet4i& a) {
+  return _mm_srai_epi32(a, 31);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui psignbit(const Packet4ui& a) {
+  return pzero(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l psignbit(const Packet2l& a) {
+  Packet4i tmp = psignbit<Packet4i>(Packet4i(a));
+  return Packet2l(_mm_shuffle_epi32(tmp, (shuffle_mask<1, 1, 3, 3>::mask)));
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
+  Packet2l signbit = psignbit(a);
+  return por(_mm_slli_epi64(signbit, 64 - N), _mm_srli_epi64(a, N));
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
+  return _mm_srli_epi64(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
+  return _mm_slli_epi64(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
+  return _mm_srai_epi32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
+  return _mm_srli_epi32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
+  return _mm_slli_epi32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui parithmetic_shift_right(const Packet4ui& a) {
+  return _mm_srli_epi32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_right(const Packet4ui& a) {
+  return _mm_srli_epi32(a, N);
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4ui plogical_shift_left(const Packet4ui& a) {
+  return _mm_slli_epi32(a, N);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) {
+  const __m128i mask = _mm_setr_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF);
+  return _mm_castsi128_ps(_mm_and_si128(mask, _mm_castps_si128(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) {
+  const __m128i mask = _mm_setr_epi32(0xFFFFFFFF, 0x7FFFFFFF, 0xFFFFFFFF, 0x7FFFFFFF);
+  return _mm_castsi128_pd(_mm_and_si128(mask, _mm_castpd_si128(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pabs(const Packet2l& a) {
+  Packet2l signbit = psignbit(a);
+  return _mm_sub_epi64(_mm_xor_si128(a, signbit), signbit);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) {
+#ifdef EIGEN_VECTORIZE_SSSE3
+  return _mm_abs_epi32(a);
+#else
+  Packet4i signbit = psignbit(a);
+  return _mm_sub_epi32(_mm_xor_si128(a, signbit), signbit);
+#endif
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pabs(const Packet4ui& a) {
+  return a;
+}
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+template <>
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
+  // Unfortunately _mm_round_ps doesn't have a rounding mode to implement numext::round.
+  const Packet4f mask = pset1frombits<Packet4f>(0x80000000u);
+  const Packet4f prev0dot5 = pset1frombits<Packet4f>(0x3EFFFFFFu);
+  return _mm_round_ps(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
+  const Packet2d mask = _mm_castsi128_pd(_mm_set_epi64x(0x8000000000000000ull, 0x8000000000000000ull));
+  const Packet2d prev0dot5 = _mm_castsi128_pd(_mm_set_epi64x(0x3FDFFFFFFFFFFFFFull, 0x3FDFFFFFFFFFFFFFull));
+  return _mm_round_pd(padd(por(pand(a, mask), prev0dot5), a), _MM_FROUND_TO_ZERO);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f print<Packet4f>(const Packet4f& a) {
+  return _mm_round_ps(a, _MM_FROUND_CUR_DIRECTION);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d print<Packet2d>(const Packet2d& a) {
+  return _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
+  return _mm_ceil_ps(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
+  return _mm_ceil_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
+  return _mm_floor_ps(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
+  return _mm_floor_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ptrunc<Packet4f>(const Packet4f& a) {
+  return _mm_round_ps(a, _MM_FROUND_TRUNC);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ptrunc<Packet2d>(const Packet2d& a) {
+  return _mm_round_pd(a, _MM_FROUND_TRUNC);
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_ps(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pload<Packet2l>(const int64_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pload<Packet4ui>(const uint32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b pload<Packet16b>(const bool* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return _mm_load_si128(reinterpret_cast<const __m128i*>(from));
+}
+
+#if EIGEN_COMP_MSVC
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return _mm_loadu_ps(from);
+}
+#else
+// NOTE: with the code below, MSVC's compiler crashes!
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return _mm_loadu_ps(from);
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return _mm_loadu_pd(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l ploadu<Packet2l>(const int64_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploadu<Packet4ui>(const uint32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b ploadu<Packet16b>(const bool* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD
+  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
+}
+
+// Load lower part of packet zero extending.
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet ploadl(const typename unpacket_traits<Packet>::type* from);
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadl<Packet4f>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadl<Packet2d>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from);
+}
+
+// Load scalar
+template <typename Packet>
+EIGEN_STRONG_INLINE Packet ploads(const typename unpacket_traits<Packet>::type* from);
+template <>
+EIGEN_STRONG_INLINE Packet4f ploads<Packet4f>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_ss(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ploads<Packet2d>(const double* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return _mm_load_sd(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
+  return vec4f_swizzle1(_mm_castpd_ps(_mm_load_sd(reinterpret_cast<const double*>(from))), 0, 0, 1, 1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
+  return pset1<Packet2d>(from[0]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l ploaddup<Packet2l>(const int64_t* from) {
+  return pset1<Packet2l>(from[0]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
+  Packet4i tmp;
+  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
+  return vec4i_swizzle1(tmp, 0, 0, 1, 1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui ploaddup<Packet4ui>(const uint32_t* from) {
+  Packet4ui tmp;
+  tmp = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(from));
+  return vec4ui_swizzle1(tmp, 0, 0, 1, 1);
+}
+
+// Loads 8 bools from memory and returns the packet
+// {b0, b0, b1, b1, b2, b2, b3, b3, b4, b4, b5, b5, b6, b6, b7, b7}
+template <>
+EIGEN_STRONG_INLINE Packet16b ploaddup<Packet16b>(const bool* from) {
+  __m128i tmp = _mm_castpd_si128(pload1<Packet2d>(reinterpret_cast<const double*>(from)));
+  return _mm_unpacklo_epi8(tmp, tmp);
+}
+
+// Loads 4 bools from memory and returns the packet
+// {b0, b0  b0, b0, b1, b1, b1, b1, b2, b2, b2, b2, b3, b3, b3, b3}
+template <>
+EIGEN_STRONG_INLINE Packet16b ploadquad<Packet16b>(const bool* from) {
+  __m128i tmp = _mm_castps_si128(pload1<Packet4f>(reinterpret_cast<const float*>(from)));
+  tmp = _mm_unpacklo_epi8(tmp, tmp);
+  return _mm_unpacklo_epi16(tmp, tmp);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_ps(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_pd(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int64_t>(int64_t* to, const Packet2l& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<uint32_t>(uint32_t* to, const Packet4ui& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<bool>(bool* to, const Packet16b& from) {
+  EIGEN_DEBUG_ALIGNED_STORE _mm_store_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_pd(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_ps(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int64_t>(int64_t* to, const Packet2l& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<uint32_t>(uint32_t* to, const Packet4ui& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<bool>(bool* to, const Packet16b& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storeu_si128(reinterpret_cast<__m128i*>(to), from);
+}
+
+template <typename Scalar, typename Packet>
+EIGEN_STRONG_INLINE void pstorel(Scalar* to, const Packet& from);
+template <>
+EIGEN_STRONG_INLINE void pstorel(float* to, const Packet4f& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pi(reinterpret_cast<__m64*>(to), from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstorel(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_storel_pd(to, from);
+}
+
+template <typename Scalar, typename Packet>
+EIGEN_STRONG_INLINE void pstores(Scalar* to, const Packet& from);
+template <>
+EIGEN_STRONG_INLINE void pstores(float* to, const Packet4f& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_store_ss(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstores(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE _mm_store_sd(to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+  return _mm_shuffle_ps(a, a, 0x1B);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
+  return _mm_shuffle_pd(a, a, 0x1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l preverse(const Packet2l& a) {
+  return _mm_castpd_si128(preverse(_mm_castsi128_pd(a)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+  return _mm_shuffle_epi32(a, 0x1B);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui preverse(const Packet4ui& a) {
+  return _mm_shuffle_epi32(a, 0x1B);
+}
+template <>
+EIGEN_STRONG_INLINE Packet16b preverse(const Packet16b& a) {
+#ifdef EIGEN_VECTORIZE_SSSE3
+  __m128i mask = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+  return _mm_shuffle_epi8(a, mask);
+#else
+  Packet16b tmp = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3));
+  tmp = _mm_shufflehi_epi16(_mm_shufflelo_epi16(tmp, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
+  return _mm_or_si128(_mm_slli_epi16(tmp, 8), _mm_srli_epi16(tmp, 8));
+#endif
+}
+
+#if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64
+// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
+// Direct of the struct members fixed bug #62.
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  return a.m128_f32[0];
+}
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+  return a.m128d_f64[0];
+}
+template <>
+EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
+  int64_t x = _mm_extract_epi64_0(a);
+  return x;
+}
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
+  int x = _mm_cvtsi128_si32(a);
+  return x;
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
+  uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
+  return x;
+}
+#elif EIGEN_COMP_MSVC_STRICT
+// The temporary variable fixes an internal compilation error in vs <= 2008 and a wrong-result bug in vs 2010
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  float x = _mm_cvtss_f32(a);
+  return x;
+}
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+  double x = _mm_cvtsd_f64(a);
+  return x;
+}
+template <>
+EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
+  int64_t x = _mm_extract_epi64_0(a);
+  return x;
+}
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
+  int x = _mm_cvtsi128_si32(a);
+  return x;
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
+  uint32_t x = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
+  return x;
+}
+#else
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  return _mm_cvtss_f32(a);
+}
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+  return _mm_cvtsd_f64(a);
+}
+template <>
+EIGEN_STRONG_INLINE int64_t pfirst<Packet2l>(const Packet2l& a) {
+  return _mm_extract_epi64_0(a);
+}
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
+  return _mm_cvtsi128_si32(a);
+}
+template <>
+EIGEN_STRONG_INLINE uint32_t pfirst<Packet4ui>(const Packet4ui& a) {
+  return numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(a));
+}
+#endif
+template <>
+EIGEN_STRONG_INLINE bool pfirst<Packet16b>(const Packet16b& a) {
+  int x = _mm_cvtsi128_si32(a);
+  return static_cast<bool>(x & 1);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
+  return _mm_set_ps(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
+  return _mm_set_pd(from[1 * stride], from[0 * stride]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l pgather<int64_t, Packet2l>(const int64_t* from, Index stride) {
+  return _mm_set_epi64x(from[1 * stride], from[0 * stride]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
+  return _mm_set_epi32(from[3 * stride], from[2 * stride], from[1 * stride], from[0 * stride]);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pgather<uint32_t, Packet4ui>(const uint32_t* from, Index stride) {
+  return _mm_set_epi32(numext::bit_cast<int32_t>(from[3 * stride]), numext::bit_cast<int32_t>(from[2 * stride]),
+                       numext::bit_cast<int32_t>(from[1 * stride]), numext::bit_cast<int32_t>(from[0 * stride]));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet16b pgather<bool, Packet16b>(const bool* from, Index stride) {
+  return _mm_set_epi8(from[15 * stride], from[14 * stride], from[13 * stride], from[12 * stride], from[11 * stride],
+                      from[10 * stride], from[9 * stride], from[8 * stride], from[7 * stride], from[6 * stride],
+                      from[5 * stride], from[4 * stride], from[3 * stride], from[2 * stride], from[1 * stride],
+                      from[0 * stride]);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
+  to[stride * 0] = pfirst(from);
+  to[stride * 1] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 1)));
+  to[stride * 2] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 2)));
+  to[stride * 3] = pfirst(Packet4f(_mm_shuffle_ps(from, from, 3)));
+}
+template <>
+EIGEN_STRONG_INLINE void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
+  to[stride * 0] = pfirst(from);
+  to[stride * 1] = pfirst(preverse(from));
+}
+template <>
+EIGEN_STRONG_INLINE void pscatter<int64_t, Packet2l>(int64_t* to, const Packet2l& from, Index stride) {
+  to[stride * 0] = pfirst(from);
+  to[stride * 1] = pfirst(preverse(from));
+}
+template <>
+EIGEN_STRONG_INLINE void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
+  to[stride * 0] = _mm_cvtsi128_si32(from);
+  to[stride * 1] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1));
+  to[stride * 2] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2));
+  to[stride * 3] = _mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3));
+}
+template <>
+EIGEN_STRONG_INLINE void pscatter<uint32_t, Packet4ui>(uint32_t* to, const Packet4ui& from, Index stride) {
+  to[stride * 0] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(from));
+  to[stride * 1] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 1)));
+  to[stride * 2] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 2)));
+  to[stride * 3] = numext::bit_cast<uint32_t>(_mm_cvtsi128_si32(_mm_shuffle_epi32(from, 3)));
+}
+template <>
+EIGEN_STRONG_INLINE void pscatter<bool, Packet16b>(bool* to, const Packet16b& from, Index stride) {
+  EIGEN_ALIGN16 bool tmp[16];
+  pstore(tmp, from);
+  to[stride * 0] = tmp[0];
+  to[stride * 1] = tmp[1];
+  to[stride * 2] = tmp[2];
+  to[stride * 3] = tmp[3];
+  to[stride * 4] = tmp[4];
+  to[stride * 5] = tmp[5];
+  to[stride * 6] = tmp[6];
+  to[stride * 7] = tmp[7];
+  to[stride * 8] = tmp[8];
+  to[stride * 9] = tmp[9];
+  to[stride * 10] = tmp[10];
+  to[stride * 11] = tmp[11];
+  to[stride * 12] = tmp[12];
+  to[stride * 13] = tmp[13];
+  to[stride * 14] = tmp[14];
+  to[stride * 15] = tmp[15];
+}
+
+// some compilers might be tempted to perform multiple moves instead of using a vector path.
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet4f>(float* to, const float& a) {
+  Packet4f pa = _mm_set_ss(a);
+  pstore(to, Packet4f(vec4f_swizzle1(pa, 0, 0, 0, 0)));
+}
+// some compilers might be tempted to perform multiple moves instead of using a vector path.
+template <>
+EIGEN_STRONG_INLINE void pstore1<Packet2d>(double* to, const double& a) {
+  Packet2d pa = _mm_set_sd(a);
+  pstore(to, Packet2d(vec2d_swizzle1(pa, 0, 0)));
+}
+
+#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
+typedef const void* SsePrefetchPtrType;
+#else
+typedef const char* SsePrefetchPtrType;
+#endif
+
+#ifndef EIGEN_VECTORIZE_AVX
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<int64_t>(const int64_t* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<uint32_t>(const uint32_t* addr) {
+  _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0);
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+// Extract exponent without existence of Packet2l.
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp_generic_get_biased_exponent(const Packet2d& a) {
+  const Packet2d cst_exp_mask = pset1frombits<Packet2d>(static_cast<uint64_t>(0x7ff0000000000000ull));
+  __m128i a_expo = _mm_srli_epi64(_mm_castpd_si128(pand(a, cst_exp_mask)), 52);
+  return _mm_cvtepi32_pd(vec4i_swizzle1(a_expo, 0, 2, 1, 3));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+// We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
+// supported by SSE, and has more range than is needed for exponents.
+template <>
+EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+  // Clamp exponent to [-2099, 2099]
+  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
+  const Packet2d e = pmin(pmax(exponent, pnegate(max_exponent)), max_exponent);
+
+  // Convert e to integer and swizzle to low-order bits.
+  const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
+
+  // Split 2^e into four factors and multiply:
+  const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
+  Packet4i b = parithmetic_shift_right<2>(ei);                       // floor(e/4)
+  Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));  // 2^b
+  Packet2d out = pmul(pmul(pmul(a, c), c), c);                       // a * 2^(3b)
+  b = psub(psub(psub(ei, b), b), b);                                 // e - 3b
+  c = _mm_castsi128_pd(_mm_slli_epi64(padd(b, bias), 52));           // 2^(e - 3b)
+  out = pmul(out, c);                                                // a * 2^e
+  return out;
+}
+
+// We specialize pldexp here, since the generic implementation uses Packet2l, which is not well
+// supported by SSE, and has more range than is needed for exponents.
+template <>
+EIGEN_STRONG_INLINE Packet2d pldexp_fast<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+  // Clamp exponent to [-1023, 1024]
+  const Packet2d min_exponent = pset1<Packet2d>(-1023.0);
+  const Packet2d max_exponent = pset1<Packet2d>(1024.0);
+  const Packet2d e = pmin(pmax(exponent, min_exponent), max_exponent);
+
+  // Convert e to integer and swizzle to low-order bits.
+  const Packet4i ei = vec4i_swizzle1(_mm_cvtpd_epi32(e), 0, 3, 1, 3);
+
+  // Compute 2^e multiply:
+  const Packet4i bias = _mm_set_epi32(0, 1023, 0, 1023);
+  const Packet2d c = _mm_castsi128_pd(_mm_slli_epi64(padd(ei, bias), 52));  // 2^e
+  return pmul(a, c);
+}
+
+// with AVX, the default implementations based on pload1 are faster
+#ifndef __AVX__
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
+  a3 = pload<Packet4f>(a);
+  a0 = vec4f_swizzle1(a3, 0, 0, 0, 0);
+  a1 = vec4f_swizzle1(a3, 1, 1, 1, 1);
+  a2 = vec4f_swizzle1(a3, 2, 2, 2, 2);
+  a3 = vec4f_swizzle1(a3, 3, 3, 3, 3);
+}
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
+                                               Packet2d& a3) {
+#ifdef EIGEN_VECTORIZE_SSE3
+  a0 = _mm_loaddup_pd(a + 0);
+  a1 = _mm_loaddup_pd(a + 1);
+  a2 = _mm_loaddup_pd(a + 2);
+  a3 = _mm_loaddup_pd(a + 3);
+#else
+  a1 = pload<Packet2d>(a);
+  a0 = vec2d_swizzle1(a1, 0, 0);
+  a1 = vec2d_swizzle1(a1, 1, 1);
+  a3 = pload<Packet2d>(a + 2);
+  a2 = vec2d_swizzle1(a3, 0, 0);
+  a3 = vec2d_swizzle1(a3, 1, 1);
+#endif
+}
+#endif
+
+EIGEN_STRONG_INLINE void punpackp(Packet4f* vecs) {
+  vecs[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x55));
+  vecs[2] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xAA));
+  vecs[3] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0xFF));
+  vecs[0] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(vecs[0]), 0x00));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
+  _MM_TRANSPOSE4_PS(kernel.packet[0], kernel.packet[1], kernel.packet[2], kernel.packet[3]);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
+  __m128d tmp = _mm_unpackhi_pd(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[0] = _mm_unpacklo_pd(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[1] = tmp;
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2l, 2>& kernel) {
+  __m128i tmp = _mm_unpackhi_epi64(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[0] = _mm_unpacklo_epi64(kernel.packet[0], kernel.packet[1]);
+  kernel.packet[1] = tmp;
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
+  __m128i T0 = _mm_unpacklo_epi32(kernel.packet[0], kernel.packet[1]);
+  __m128i T1 = _mm_unpacklo_epi32(kernel.packet[2], kernel.packet[3]);
+  __m128i T2 = _mm_unpackhi_epi32(kernel.packet[0], kernel.packet[1]);
+  __m128i T3 = _mm_unpackhi_epi32(kernel.packet[2], kernel.packet[3]);
+
+  kernel.packet[0] = _mm_unpacklo_epi64(T0, T1);
+  kernel.packet[1] = _mm_unpackhi_epi64(T0, T1);
+  kernel.packet[2] = _mm_unpacklo_epi64(T2, T3);
+  kernel.packet[3] = _mm_unpackhi_epi64(T2, T3);
+}
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet4ui, 4>& kernel) {
+  ptranspose((PacketBlock<Packet4i, 4>&)kernel);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 4>& kernel) {
+  __m128i T0 = _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);
+  __m128i T1 = _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);
+  __m128i T2 = _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);
+  __m128i T3 = _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);
+  kernel.packet[0] = _mm_unpacklo_epi16(T0, T2);
+  kernel.packet[1] = _mm_unpackhi_epi16(T0, T2);
+  kernel.packet[2] = _mm_unpacklo_epi16(T1, T3);
+  kernel.packet[3] = _mm_unpackhi_epi16(T1, T3);
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet16b, 16>& kernel) {
+  // If we number the elements in the input thus:
+  // kernel.packet[ 0] = {00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 0a, 0b, 0c, 0d, 0e, 0f}
+  // kernel.packet[ 1] = {10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 1a, 1b, 1c, 1d, 1e, 1f}
+  // ...
+  // kernel.packet[15] = {f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, fa, fb, fc, fd, fe, ff},
+  //
+  // the desired output is:
+  // kernel.packet[ 0] = {00, 10, 20, 30, 40, 50, 60, 70, 80, 90, a0, b0, c0, d0, e0, f0}
+  // kernel.packet[ 1] = {01, 11, 21, 31, 41, 51, 61, 71, 81, 91, a1, b1, c1, d1, e1, f1}
+  // ...
+  // kernel.packet[15] = {0f, 1f, 2f, 3f, 4f, 5f, 6f, 7f, 8f, 9f, af, bf, cf, df, ef, ff},
+  __m128i t0 =
+      _mm_unpacklo_epi8(kernel.packet[0], kernel.packet[1]);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+  __m128i t1 =
+      _mm_unpackhi_epi8(kernel.packet[0], kernel.packet[1]);  // 08 18 09 19 0a 1a 0b 1b 0c 1c 0d 1d 0e 1e 0f 1f
+  __m128i t2 =
+      _mm_unpacklo_epi8(kernel.packet[2], kernel.packet[3]);  // 20 30 21 31 22 32 ...                     27 37
+  __m128i t3 =
+      _mm_unpackhi_epi8(kernel.packet[2], kernel.packet[3]);  // 28 38 29 39 2a 3a ...                     2f 3f
+  __m128i t4 =
+      _mm_unpacklo_epi8(kernel.packet[4], kernel.packet[5]);  // 40 50 41 51 42 52                         47 57
+  __m128i t5 = _mm_unpackhi_epi8(kernel.packet[4], kernel.packet[5]);  // 48 58 49 59 4a 5a
+  __m128i t6 = _mm_unpacklo_epi8(kernel.packet[6], kernel.packet[7]);
+  __m128i t7 = _mm_unpackhi_epi8(kernel.packet[6], kernel.packet[7]);
+  __m128i t8 = _mm_unpacklo_epi8(kernel.packet[8], kernel.packet[9]);
+  __m128i t9 = _mm_unpackhi_epi8(kernel.packet[8], kernel.packet[9]);
+  __m128i ta = _mm_unpacklo_epi8(kernel.packet[10], kernel.packet[11]);
+  __m128i tb = _mm_unpackhi_epi8(kernel.packet[10], kernel.packet[11]);
+  __m128i tc = _mm_unpacklo_epi8(kernel.packet[12], kernel.packet[13]);
+  __m128i td = _mm_unpackhi_epi8(kernel.packet[12], kernel.packet[13]);
+  __m128i te = _mm_unpacklo_epi8(kernel.packet[14], kernel.packet[15]);
+  __m128i tf = _mm_unpackhi_epi8(kernel.packet[14], kernel.packet[15]);
+
+  __m128i s0 = _mm_unpacklo_epi16(t0, t2);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+  __m128i s1 = _mm_unpackhi_epi16(t0, t2);  // 04 14 24 34
+  __m128i s2 = _mm_unpacklo_epi16(t1, t3);  // 08 18 28 38 ...
+  __m128i s3 = _mm_unpackhi_epi16(t1, t3);  // 0c 1c 2c 3c ...
+  __m128i s4 = _mm_unpacklo_epi16(t4, t6);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+  __m128i s5 = _mm_unpackhi_epi16(t4, t6);  // 44 54 64 74 ...
+  __m128i s6 = _mm_unpacklo_epi16(t5, t7);
+  __m128i s7 = _mm_unpackhi_epi16(t5, t7);
+  __m128i s8 = _mm_unpacklo_epi16(t8, ta);
+  __m128i s9 = _mm_unpackhi_epi16(t8, ta);
+  __m128i sa = _mm_unpacklo_epi16(t9, tb);
+  __m128i sb = _mm_unpackhi_epi16(t9, tb);
+  __m128i sc = _mm_unpacklo_epi16(tc, te);
+  __m128i sd = _mm_unpackhi_epi16(tc, te);
+  __m128i se = _mm_unpacklo_epi16(td, tf);
+  __m128i sf = _mm_unpackhi_epi16(td, tf);
+
+  __m128i u0 = _mm_unpacklo_epi32(s0, s4);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+  __m128i u1 = _mm_unpackhi_epi32(s0, s4);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+  __m128i u2 = _mm_unpacklo_epi32(s1, s5);
+  __m128i u3 = _mm_unpackhi_epi32(s1, s5);
+  __m128i u4 = _mm_unpacklo_epi32(s2, s6);
+  __m128i u5 = _mm_unpackhi_epi32(s2, s6);
+  __m128i u6 = _mm_unpacklo_epi32(s3, s7);
+  __m128i u7 = _mm_unpackhi_epi32(s3, s7);
+  __m128i u8 = _mm_unpacklo_epi32(s8, sc);
+  __m128i u9 = _mm_unpackhi_epi32(s8, sc);
+  __m128i ua = _mm_unpacklo_epi32(s9, sd);
+  __m128i ub = _mm_unpackhi_epi32(s9, sd);
+  __m128i uc = _mm_unpacklo_epi32(sa, se);
+  __m128i ud = _mm_unpackhi_epi32(sa, se);
+  __m128i ue = _mm_unpacklo_epi32(sb, sf);
+  __m128i uf = _mm_unpackhi_epi32(sb, sf);
+
+  kernel.packet[0] = _mm_unpacklo_epi64(u0, u8);
+  kernel.packet[1] = _mm_unpackhi_epi64(u0, u8);
+  kernel.packet[2] = _mm_unpacklo_epi64(u1, u9);
+  kernel.packet[3] = _mm_unpackhi_epi64(u1, u9);
+  kernel.packet[4] = _mm_unpacklo_epi64(u2, ua);
+  kernel.packet[5] = _mm_unpackhi_epi64(u2, ua);
+  kernel.packet[6] = _mm_unpacklo_epi64(u3, ub);
+  kernel.packet[7] = _mm_unpackhi_epi64(u3, ub);
+  kernel.packet[8] = _mm_unpacklo_epi64(u4, uc);
+  kernel.packet[9] = _mm_unpackhi_epi64(u4, uc);
+  kernel.packet[10] = _mm_unpacklo_epi64(u5, ud);
+  kernel.packet[11] = _mm_unpackhi_epi64(u5, ud);
+  kernel.packet[12] = _mm_unpacklo_epi64(u6, ue);
+  kernel.packet[13] = _mm_unpackhi_epi64(u6, ue);
+  kernel.packet[14] = _mm_unpacklo_epi64(u7, uf);
+  kernel.packet[15] = _mm_unpackhi_epi64(u7, uf);
+}
+
+EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<2>& ifPacket) {
+  return _mm_set_epi64x(0 - ifPacket.select[1], 0 - ifPacket.select[0]);
+}
+
+EIGEN_STRONG_INLINE __m128i sse_blend_mask(const Selector<4>& ifPacket) {
+  return _mm_set_epi32(0 - ifPacket.select[3], 0 - ifPacket.select[2], 0 - ifPacket.select[1], 0 - ifPacket.select[0]);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2l pblend(const Selector<2>& ifPacket, const Packet2l& thenPacket,
+                                    const Packet2l& elsePacket) {
+  const __m128i true_mask = sse_blend_mask(ifPacket);
+  return pselect<Packet2l>(true_mask, thenPacket, elsePacket);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
+                                    const Packet4i& elsePacket) {
+  const __m128i true_mask = sse_blend_mask(ifPacket);
+  return pselect<Packet4i>(true_mask, thenPacket, elsePacket);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4ui pblend(const Selector<4>& ifPacket, const Packet4ui& thenPacket,
+                                     const Packet4ui& elsePacket) {
+  return (Packet4ui)pblend(ifPacket, (Packet4i)thenPacket, (Packet4i)elsePacket);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
+                                    const Packet4f& elsePacket) {
+  const __m128i true_mask = sse_blend_mask(ifPacket);
+  return pselect<Packet4f>(_mm_castsi128_ps(true_mask), thenPacket, elsePacket);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
+                                    const Packet2d& elsePacket) {
+  const __m128i true_mask = sse_blend_mask(ifPacket);
+  return pselect<Packet2d>(_mm_castsi128_pd(true_mask), thenPacket, elsePacket);
+}
+
+// Scalar path for pmadd with FMA to ensure consistency with vectorized path.
+#if defined(EIGEN_VECTORIZE_FMA)
+template <>
+EIGEN_STRONG_INLINE float pmadd(const float& a, const float& b, const float& c) {
+  return std::fmaf(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, const double& c) {
+  return std::fma(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE float pmsub(const float& a, const float& b, const float& c) {
+  return std::fmaf(a, b, -c);
+}
+template <>
+EIGEN_STRONG_INLINE double pmsub(const double& a, const double& b, const double& c) {
+  return std::fma(a, b, -c);
+}
+template <>
+EIGEN_STRONG_INLINE float pnmadd(const float& a, const float& b, const float& c) {
+  return std::fmaf(-a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE double pnmadd(const double& a, const double& b, const double& c) {
+  return std::fma(-a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE float pnmsub(const float& a, const float& b, const float& c) {
+  return std::fmaf(-a, b, -c);
+}
+template <>
+EIGEN_STRONG_INLINE double pnmsub(const double& a, const double& b, const double& c) {
+  return std::fma(-a, b, -c);
+}
+#endif
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+// Helpers for half->float and float->half conversions.
+// Currently only used by the AVX code.
+EIGEN_STRONG_INLINE __m128i half2floatsse(__m128i h) {
+  __m128i input = _mm_cvtepu16_epi32(h);
+
+  // Direct vectorization of half_to_float, C parts in the comments.
+  __m128i shifted_exp = _mm_set1_epi32(0x7c00 << 13);
+  // o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits
+  __m128i ou = _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x7fff)), 13);
+  // exp = shifted_exp & o.u;   // just the exponent
+  __m128i exp = _mm_and_si128(ou, shifted_exp);
+  // o.u += (127 - 15) << 23;
+  ou = _mm_add_epi32(ou, _mm_set1_epi32((127 - 15) << 23));
+
+  // Inf/NaN?
+  __m128i naninf_mask = _mm_cmpeq_epi32(exp, shifted_exp);
+  // Inf/NaN adjust
+  __m128i naninf_adj = _mm_and_si128(_mm_set1_epi32((128 - 16) << 23), naninf_mask);
+  // extra exp adjust for  Inf/NaN
+  ou = _mm_add_epi32(ou, naninf_adj);
+
+  // Zero/Denormal?
+  __m128i zeroden_mask = _mm_cmpeq_epi32(exp, _mm_setzero_si128());
+  __m128i zeroden_adj = _mm_and_si128(zeroden_mask, _mm_set1_epi32(1 << 23));
+  // o.u += 1 << 23;
+  ou = _mm_add_epi32(ou, zeroden_adj);
+  // magic.u = 113 << 23
+  __m128i magic = _mm_and_si128(zeroden_mask, _mm_set1_epi32(113 << 23));
+  // o.f -= magic.f
+  ou = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(ou), _mm_castsi128_ps(magic)));
+
+  __m128i sign = _mm_slli_epi32(_mm_and_si128(input, _mm_set1_epi32(0x8000)), 16);
+  // o.u |= (h.x & 0x8000) << 16;    // sign bit
+  ou = _mm_or_si128(ou, sign);
+  // return o.f;
+  // We are actually returning uint version, to make
+  // _mm256_insertf128_si256 work.
+  return ou;
+}
+
+EIGEN_STRONG_INLINE __m128i float2half(__m128 f) {
+  // unsigned int sign_mask = 0x80000000u;
+  __m128i sign = _mm_set1_epi32(0x80000000u);
+  // unsigned int sign = f.u & sign_mask;
+  sign = _mm_and_si128(sign, _mm_castps_si128(f));
+  // f.u ^= sign;
+  f = _mm_xor_ps(f, _mm_castsi128_ps(sign));
+
+  __m128i fu = _mm_castps_si128(f);
+
+  __m128i f16max = _mm_set1_epi32((127 + 16) << 23);
+  __m128i f32infty = _mm_set1_epi32(255 << 23);
+  // if (f.u >= f16max.u) // result is Inf or NaN (all exponent bits set)
+  // there is no _mm_cmpge_epi32, so use lt and swap operands
+  __m128i infnan_mask = _mm_cmplt_epi32(f16max, _mm_castps_si128(f));
+  __m128i inf_mask = _mm_cmpgt_epi32(_mm_castps_si128(f), f32infty);
+  __m128i nan_mask = _mm_andnot_si128(inf_mask, infnan_mask);
+  __m128i inf_value = _mm_and_si128(inf_mask, _mm_set1_epi32(0x7e00));
+  __m128i nan_value = _mm_and_si128(nan_mask, _mm_set1_epi32(0x7c00));
+  // o.x = (f.u > f32infty.u) ? 0x7e00 : 0x7c00; // NaN->qNaN and Inf->Inf
+  __m128i naninf_value = _mm_or_si128(inf_value, nan_value);
+
+  __m128i denorm_magic = _mm_set1_epi32(((127 - 15) + (23 - 10) + 1) << 23);
+  __m128i subnorm_mask = _mm_cmplt_epi32(_mm_castps_si128(f), _mm_set1_epi32(113 << 23));
+  //  f.f += denorm_magic.f;
+  f = _mm_add_ps(f, _mm_castsi128_ps(denorm_magic));
+  // f.u - denorm_magic.u
+  __m128i o = _mm_sub_epi32(_mm_castps_si128(f), denorm_magic);
+  o = _mm_and_si128(o, subnorm_mask);
+  // Correct result for inf/nan/zero/subnormal, 0 otherwise
+  o = _mm_or_si128(o, naninf_value);
+
+  __m128i mask = _mm_or_si128(infnan_mask, subnorm_mask);
+  o = _mm_and_si128(o, mask);
+
+  // mant_odd = (f.u >> 13) & 1;
+  __m128i mand_odd = _mm_and_si128(_mm_srli_epi32(fu, 13), _mm_set1_epi32(0x1));
+  // f.u += 0xc8000fffU;
+  fu = _mm_add_epi32(fu, _mm_set1_epi32(0xc8000fffU));
+  // f.u += mant_odd;
+  fu = _mm_add_epi32(fu, mand_odd);
+  fu = _mm_andnot_si128(mask, fu);
+  // f.u >> 13
+  fu = _mm_srli_epi32(fu, 13);
+  o = _mm_or_si128(fu, o);
+
+  // o.x |= static_cast<numext::uint16_t>(sign >> 16);
+  o = _mm_or_si128(o, _mm_srli_epi32(sign, 16));
+
+  // 16 bit values
+  return _mm_and_si128(o, _mm_set1_epi32(0xffff));
+}
+#endif
+
+// Packet math for Eigen::half
+// Disable the following code since it's broken on too many platforms / compilers.
+// #elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+#if 0
+
+typedef struct {
+  __m64 x;
+} Packet4h;
+
+
+template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
+
+template <>
+struct packet_traits<Eigen::half> : default_packet_traits {
+  typedef Packet4h type;
+  // There is no half-size packet for Packet4h.
+  typedef Packet4h half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+    HasAdd    = 1,
+    HasSub    = 1,
+    HasMul    = 1,
+    HasDiv    = 1,
+    HasNegate = 0,
+    HasAbs    = 0,
+    HasAbs2   = 0,
+    HasMin    = 0,
+    HasMax    = 0,
+    HasConj   = 0,
+    HasSetLinear = 0,
+  };
 };
 
-template<int Offset>
-struct palign_impl<Offset,Packet4i>
+
+template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h half; };
+
+template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
+  Packet4h result;
+  result.x = _mm_set1_pi16(from.x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
+  return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
+
+template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha + hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha + hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h psub<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha - hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha - hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha - hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha - hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha * hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha * hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pdiv<Packet4h>(const Packet4h& a, const Packet4h& b) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  __int64_t b64 = _mm_cvtm64_si64(b.x);
+
+  Eigen::half h[4];
+
+  Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
+  Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
+  h[0] = ha / hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 16));
+  h[1] = ha / hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 32));
+  h[2] = ha / hb;
+  ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> 48));
+  h[3] = ha / hb;
+  Packet4h result;
+  result.x = _mm_set_pi16(h[3].x, h[2].x, h[1].x, h[0].x);
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
+  Packet4h result;
+  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
+  Packet4h result;
+  result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
+  return result;
+}
+
+template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
+  __int64_t r = _mm_cvtm64_si64(from.x);
+  *(reinterpret_cast<__int64_t*>(to)) = r;
+}
+
+template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
+  __int64_t r = _mm_cvtm64_si64(from.x);
+  *(reinterpret_cast<__int64_t*>(to)) = r;
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h
+ploadquad<Packet4h>(const Eigen::half* from) {
+  return pset1<Packet4h>(*from);
+}
+
+template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
 {
-  static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second)
-  {
-    if (Offset==1)
-    {
-      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
-      first = _mm_shuffle_epi32(first,0x39);
-    }
-    else if (Offset==2)
-    {
-      first = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(first)));
-      first = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
-    }
-    else if (Offset==3)
-    {
-      first = _mm_castps_si128(_mm_move_ss(_mm_castsi128_ps(first),_mm_castsi128_ps(second)));
-      first = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(first),_mm_castsi128_ps(second),0x93));
-    }
-  }
-};
+  Packet4h result;
+  result.x = _mm_set_pi16(from[3*stride].x, from[2*stride].x, from[1*stride].x, from[0*stride].x);
+  return result;
+}
 
-template<int Offset>
-struct palign_impl<Offset,Packet2d>
+template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
 {
-  static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second)
-  {
-    if (Offset==1)
-    {
-      first = _mm_castps_pd(_mm_movehl_ps(_mm_castpd_ps(first),_mm_castpd_ps(first)));
-      first = _mm_castps_pd(_mm_movelh_ps(_mm_castpd_ps(first),_mm_castpd_ps(second)));
-    }
-  }
-};
+  __int64_t a = _mm_cvtm64_si64(from.x);
+  to[stride*0].x = static_cast<unsigned short>(a);
+  to[stride*1].x = static_cast<unsigned short>(a >> 16);
+  to[stride*2].x = static_cast<unsigned short>(a >> 32);
+  to[stride*3].x = static_cast<unsigned short>(a >> 48);
+}
+
+EIGEN_STRONG_INLINE void
+ptranspose(PacketBlock<Packet4h,4>& kernel) {
+  __m64 T0 = _mm_unpacklo_pi16(kernel.packet[0].x, kernel.packet[1].x);
+  __m64 T1 = _mm_unpacklo_pi16(kernel.packet[2].x, kernel.packet[3].x);
+  __m64 T2 = _mm_unpackhi_pi16(kernel.packet[0].x, kernel.packet[1].x);
+  __m64 T3 = _mm_unpackhi_pi16(kernel.packet[2].x, kernel.packet[3].x);
+
+  kernel.packet[0].x = _mm_unpacklo_pi32(T0, T1);
+  kernel.packet[1].x = _mm_unpackhi_pi32(T0, T1);
+  kernel.packet[2].x = _mm_unpacklo_pi32(T2, T3);
+  kernel.packet[3].x = _mm_unpackhi_pi32(T2, T3);
+}
+
 #endif
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
+
+#if EIGEN_COMP_PGI && EIGEN_COMP_PGI < 1900
+// PGI++ does not define the following intrinsics in C++ mode.
+static inline __m128 _mm_castpd_ps(__m128d x) { return reinterpret_cast<__m128&>(x); }
+static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); }
+static inline __m128d _mm_castps_pd(__m128 x) { return reinterpret_cast<__m128d&>(x); }
+static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); }
+static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); }
+static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); }
+#endif
 
-#endif // EIGEN_PACKET_MATH_SSE_H
+#endif  // EIGEN_PACKET_MATH_SSE_H
diff --git a/inst/include/Eigen/src/Core/arch/SSE/Reductions.h b/inst/include/Eigen/src/Core/arch/SSE/Reductions.h
new file mode 100644
index 00000000..f38df4e4
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/SSE/Reductions.h
@@ -0,0 +1,324 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REDUCTIONS_SSE_H
+#define EIGEN_REDUCTIONS_SSE_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Packet>
+struct sse_add_wrapper {
+  static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return padd<Packet>(a, b); }
+};
+
+template <typename Packet>
+struct sse_mul_wrapper {
+  static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return pmul<Packet>(a, b); }
+};
+
+template <typename Packet>
+struct sse_min_wrapper {
+  static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return pmin<Packet>(a, b); }
+};
+
+template <int NaNPropagation, typename Packet>
+struct sse_min_prop_wrapper {
+  static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) {
+    return pmin<NaNPropagation, Packet>(a, b);
+  }
+};
+
+template <typename Packet>
+struct sse_max_wrapper {
+  static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) { return pmax<Packet>(a, b); }
+};
+
+template <int NaNPropagation, typename Packet>
+struct sse_max_prop_wrapper {
+  static EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) {
+    return pmax<NaNPropagation, Packet>(a, b);
+  }
+};
+
+template <typename Packet, typename Op>
+struct sse_predux_common;
+
+template <typename Packet>
+struct sse_predux_impl : sse_predux_common<Packet, sse_add_wrapper<Packet>> {};
+
+template <typename Packet>
+struct sse_predux_mul_impl : sse_predux_common<Packet, sse_mul_wrapper<Packet>> {};
+
+template <typename Packet>
+struct sse_predux_min_impl : sse_predux_common<Packet, sse_min_wrapper<Packet>> {};
+
+template <int NaNPropagation, typename Packet>
+struct sse_predux_min_prop_impl : sse_predux_common<Packet, sse_min_prop_wrapper<NaNPropagation, Packet>> {};
+
+template <typename Packet>
+struct sse_predux_max_impl : sse_predux_common<Packet, sse_max_wrapper<Packet>> {};
+
+template <int NaNPropagation, typename Packet>
+struct sse_predux_max_prop_impl : sse_predux_common<Packet, sse_max_prop_wrapper<NaNPropagation, Packet>> {};
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet16b -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <>
+EIGEN_STRONG_INLINE bool predux(const Packet16b& a) {
+  Packet4i tmp = _mm_or_si128(a, _mm_unpackhi_epi64(a, a));
+  return (pfirst(tmp) != 0) || (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) != 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_mul(const Packet16b& a) {
+  Packet4i tmp = _mm_and_si128(a, _mm_unpackhi_epi64(a, a));
+  return ((pfirst<Packet4i>(tmp) == 0x01010101) && (pfirst<Packet4i>(_mm_shuffle_epi32(tmp, 1)) == 0x01010101));
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_min(const Packet16b& a) {
+  return predux_mul(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_max(const Packet16b& a) {
+  return predux(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet16b& a) {
+  return predux(a);
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4i -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <typename Op>
+struct sse_predux_common<Packet4i, Op> {
+  static EIGEN_STRONG_INLINE int run(const Packet4i& a) {
+    Packet4i tmp;
+    tmp = Op::packetOp(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)));
+    tmp = Op::packetOp(tmp, _mm_unpackhi_epi32(tmp, tmp));
+    return _mm_cvtsi128_si32(tmp);
+  }
+};
+
+template <>
+EIGEN_STRONG_INLINE int predux(const Packet4i& a) {
+  return sse_predux_impl<Packet4i>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) {
+  return sse_predux_mul_impl<Packet4i>::run(a);
+}
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+template <>
+EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) {
+  return sse_predux_min_impl<Packet4i>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) {
+  return sse_predux_max_impl<Packet4i>::run(a);
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4i& a) {
+  return _mm_movemask_ps(_mm_castsi128_ps(a)) != 0x0;
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4ui -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <typename Op>
+struct sse_predux_common<Packet4ui, Op> {
+  static EIGEN_STRONG_INLINE uint32_t run(const Packet4ui& a) {
+    Packet4ui tmp;
+    tmp = Op::packetOp(a, _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 1, 2, 3)));
+    tmp = Op::packetOp(tmp, _mm_unpackhi_epi32(tmp, tmp));
+    return static_cast<uint32_t>(_mm_cvtsi128_si32(tmp));
+  }
+};
+
+template <>
+EIGEN_STRONG_INLINE uint32_t predux(const Packet4ui& a) {
+  return sse_predux_impl<Packet4ui>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_mul(const Packet4ui& a) {
+  return sse_predux_mul_impl<Packet4ui>::run(a);
+}
+
+#ifdef EIGEN_VECTORIZE_SSE4_1
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_min(const Packet4ui& a) {
+  return sse_predux_min_impl<Packet4ui>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE uint32_t predux_max(const Packet4ui& a) {
+  return sse_predux_max_impl<Packet4ui>::run(a);
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4ui& a) {
+  return _mm_movemask_ps(_mm_castsi128_ps(a)) != 0x0;
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet2l -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <typename Op>
+struct sse_predux_common<Packet2l, Op> {
+  static EIGEN_STRONG_INLINE int64_t run(const Packet2l& a) {
+    Packet2l tmp;
+    tmp = Op::packetOp(a, _mm_unpackhi_epi64(a, a));
+    return pfirst(tmp);
+  }
+};
+
+template <>
+EIGEN_STRONG_INLINE int64_t predux(const Packet2l& a) {
+  return sse_predux_impl<Packet2l>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet2l& a) {
+  return _mm_movemask_pd(_mm_castsi128_pd(a)) != 0x0;
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet4f -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <typename Op>
+struct sse_predux_common<Packet4f, Op> {
+  static EIGEN_STRONG_INLINE float run(const Packet4f& a) {
+    Packet4f tmp;
+    tmp = Op::packetOp(a, _mm_movehl_ps(a, a));
+#ifdef EIGEN_VECTORIZE_SSE3
+    tmp = Op::packetOp(tmp, _mm_movehdup_ps(tmp));
+#else
+    tmp = Op::packetOp(tmp, _mm_shuffle_ps(tmp, tmp, 1));
+#endif
+    return _mm_cvtss_f32(tmp);
+  }
+};
+
+template <>
+EIGEN_STRONG_INLINE float predux(const Packet4f& a) {
+  return sse_predux_impl<Packet4f>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) {
+  return sse_predux_mul_impl<Packet4f>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) {
+  return sse_predux_min_impl<Packet4f>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PropagateNumbers>(const Packet4f& a) {
+  return sse_predux_min_prop_impl<PropagateNumbers, Packet4f>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PropagateNaN>(const Packet4f& a) {
+  return sse_predux_min_prop_impl<PropagateNaN, Packet4f>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) {
+  return sse_predux_max_impl<Packet4f>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PropagateNumbers>(const Packet4f& a) {
+  return sse_predux_max_prop_impl<PropagateNumbers, Packet4f>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PropagateNaN>(const Packet4f& a) {
+  return sse_predux_max_prop_impl<PropagateNaN, Packet4f>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet4f& a) {
+  return _mm_movemask_ps(a) != 0x0;
+}
+
+/* -- -- -- -- -- -- -- -- -- -- -- -- Packet2d -- -- -- -- -- -- -- -- -- -- -- -- */
+
+template <typename Op>
+struct sse_predux_common<Packet2d, Op> {
+  static EIGEN_STRONG_INLINE double run(const Packet2d& a) {
+    Packet2d tmp;
+    tmp = Op::packetOp(a, _mm_unpackhi_pd(a, a));
+    return _mm_cvtsd_f64(tmp);
+  }
+};
+
+template <>
+EIGEN_STRONG_INLINE double predux(const Packet2d& a) {
+  return sse_predux_impl<Packet2d>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_mul(const Packet2d& a) {
+  return sse_predux_mul_impl<Packet2d>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min(const Packet2d& a) {
+  return sse_predux_min_impl<Packet2d>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min<PropagateNumbers>(const Packet2d& a) {
+  return sse_predux_min_prop_impl<PropagateNumbers, Packet2d>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min<PropagateNaN>(const Packet2d& a) {
+  return sse_predux_min_prop_impl<PropagateNaN, Packet2d>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max(const Packet2d& a) {
+  return sse_predux_max_impl<Packet2d>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max<PropagateNumbers>(const Packet2d& a) {
+  return sse_predux_max_prop_impl<PropagateNumbers, Packet2d>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_max<PropagateNaN>(const Packet2d& a) {
+  return sse_predux_max_prop_impl<PropagateNaN, Packet2d>::run(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE bool predux_any(const Packet2d& a) {
+  return _mm_movemask_pd(a) != 0x0;
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_REDUCTIONS_SSE_H
diff --git a/inst/include/Eigen/src/Core/arch/SSE/TypeCasting.h b/inst/include/Eigen/src/Core/arch/SSE/TypeCasting.h
new file mode 100644
index 00000000..9a7732a6
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/SSE/TypeCasting.h
@@ -0,0 +1,230 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_SSE_H
+#define EIGEN_TYPE_CASTING_SSE_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_VECTORIZE_AVX
+template <>
+struct type_casting_traits<float, bool> : vectorized_type_casting_traits<float, bool> {};
+template <>
+struct type_casting_traits<bool, float> : vectorized_type_casting_traits<bool, float> {};
+
+template <>
+struct type_casting_traits<float, int> : vectorized_type_casting_traits<float, int> {};
+template <>
+struct type_casting_traits<int, float> : vectorized_type_casting_traits<int, float> {};
+
+template <>
+struct type_casting_traits<float, double> : vectorized_type_casting_traits<float, double> {};
+template <>
+struct type_casting_traits<double, float> : vectorized_type_casting_traits<double, float> {};
+
+template <>
+struct type_casting_traits<double, int> : vectorized_type_casting_traits<double, int> {};
+template <>
+struct type_casting_traits<int, double> : vectorized_type_casting_traits<int, double> {};
+
+#ifndef EIGEN_VECTORIZE_AVX2
+template <>
+struct type_casting_traits<double, int64_t> : vectorized_type_casting_traits<double, int64_t> {};
+template <>
+struct type_casting_traits<int64_t, double> : vectorized_type_casting_traits<int64_t, double> {};
+#endif
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet16b pcast<Packet4f, Packet16b>(const Packet4f& a, const Packet4f& b, const Packet4f& c,
+                                                         const Packet4f& d) {
+  __m128 zero = pzero(a);
+  __m128 nonzero_a = _mm_cmpneq_ps(a, zero);
+  __m128 nonzero_b = _mm_cmpneq_ps(b, zero);
+  __m128 nonzero_c = _mm_cmpneq_ps(c, zero);
+  __m128 nonzero_d = _mm_cmpneq_ps(d, zero);
+  __m128i ab_bytes = _mm_packs_epi32(_mm_castps_si128(nonzero_a), _mm_castps_si128(nonzero_b));
+  __m128i cd_bytes = _mm_packs_epi32(_mm_castps_si128(nonzero_c), _mm_castps_si128(nonzero_d));
+  __m128i merged = _mm_packs_epi16(ab_bytes, cd_bytes);
+  return _mm_and_si128(merged, _mm_set1_epi8(1));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet16b, Packet4f>(const Packet16b& a) {
+  const __m128 cst_one = _mm_set_ps1(1.0f);
+#ifdef EIGEN_VECTORIZE_SSE4_1
+  __m128i a_extended = _mm_cvtepi8_epi32(a);
+  __m128i abcd = _mm_cmpeq_epi32(a_extended, _mm_setzero_si128());
+#else
+  __m128i abcd_efhg_ijkl_mnop = _mm_cmpeq_epi8(a, _mm_setzero_si128());
+  __m128i aabb_ccdd_eeff_gghh = _mm_unpacklo_epi8(abcd_efhg_ijkl_mnop, abcd_efhg_ijkl_mnop);
+  __m128i abcd = _mm_unpacklo_epi8(aabb_ccdd_eeff_gghh, aabb_ccdd_eeff_gghh);
+#endif
+  __m128 result = _mm_andnot_ps(_mm_castsi128_ps(abcd), cst_one);
+  return result;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet4f, Packet4i>(const Packet4f& a) {
+  return _mm_cvttps_epi32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pcast<Packet2d, Packet4i>(const Packet2d& a, const Packet2d& b) {
+  return _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_mm_cvttpd_epi32(a)), _mm_castsi128_ps(_mm_cvttpd_epi32(b)),
+                                         (1 << 2) | (1 << 6)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2l pcast<Packet2d, Packet2l>(const Packet2d& a) {
+#if EIGEN_ARCH_x86_64
+  return _mm_set_epi64x(_mm_cvttsd_si64(preverse(a)), _mm_cvttsd_si64(a));
+#else
+  return _mm_set_epi64x(static_cast<int64_t>(pfirst(preverse(a))), static_cast<int64_t>(pfirst(a)));
+#endif
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet2l, Packet2d>(const Packet2l& a) {
+  EIGEN_ALIGN16 int64_t aux[2];
+  pstore(aux, a);
+  return _mm_set_pd(static_cast<double>(aux[1]), static_cast<double>(aux[0]));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet4i, Packet4f>(const Packet4i& a) {
+  return _mm_cvtepi32_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pcast<Packet2d, Packet4f>(const Packet2d& a, const Packet2d& b) {
+  return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4i, Packet2d>(const Packet4i& a) {
+  // Simply discard the second half of the input
+  return _mm_cvtepi32_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pcast<Packet4f, Packet2d>(const Packet4f& a) {
+  // Simply discard the second half of the input
+  return _mm_cvtps_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4f>(const Packet4f& a) {
+  return _mm_castps_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet2d>(const Packet2d& a) {
+  return _mm_castpd_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4f>(const Packet4f& a) {
+  return _mm_castps_si128(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f preinterpret<Packet4f, Packet4i>(const Packet4i& a) {
+  return _mm_castsi128_ps(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet4i>(const Packet4i& a) {
+  return _mm_castsi128_pd(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preinterpret<Packet2d, Packet2l>(const Packet2l& a) {
+  return _mm_castsi128_pd(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2l preinterpret<Packet2l, Packet2d>(const Packet2d& a) {
+  return _mm_castpd_si128(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet2d>(const Packet2d& a) {
+  return _mm_castpd_si128(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4ui preinterpret<Packet4ui, Packet4i>(const Packet4i& a) {
+  return Packet4ui(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i preinterpret<Packet4i, Packet4ui>(const Packet4ui& a) {
+  return Packet4i(a);
+}
+
+// Disable the following code since it's broken on too many platforms / compilers.
+// #elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
+#if 0
+
+template <>
+struct type_casting_traits<Eigen::half, float> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4f pcast<Packet4h, Packet4f>(const Packet4h& a) {
+  __int64_t a64 = _mm_cvtm64_si64(a.x);
+  Eigen::half h = raw_uint16_to_half(static_cast<unsigned short>(a64));
+  float f1 = static_cast<float>(h);
+  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 16));
+  float f2 = static_cast<float>(h);
+  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 32));
+  float f3 = static_cast<float>(h);
+  h = raw_uint16_to_half(static_cast<unsigned short>(a64 >> 48));
+  float f4 = static_cast<float>(h);
+  return _mm_set_ps(f4, f3, f2, f1);
+}
+
+template <>
+struct type_casting_traits<float, Eigen::half> {
+  enum {
+    VectorizedCast = 1,
+    SrcCoeffRatio = 1,
+    TgtCoeffRatio = 1
+  };
+};
+
+template<> EIGEN_STRONG_INLINE Packet4h pcast<Packet4f, Packet4h>(const Packet4f& a) {
+  EIGEN_ALIGN16 float aux[4];
+  pstore(aux, a);
+  Eigen::half h0(aux[0]);
+  Eigen::half h1(aux[1]);
+  Eigen::half h2(aux[2]);
+  Eigen::half h3(aux[3]);
+
+  Packet4h result;
+  result.x = _mm_set_pi16(h3.x, h2.x, h1.x, h0.x);
+  return result;
+}
+
+#endif
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_TYPE_CASTING_SSE_H
diff --git a/inst/include/Eigen/src/Core/arch/SVE/MathFunctions.h b/inst/include/Eigen/src/Core/arch/SVE/MathFunctions.h
new file mode 100644
index 00000000..8c8ed84c
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/SVE/MathFunctions.h
@@ -0,0 +1,48 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MATH_FUNCTIONS_SVE_H
+#define EIGEN_MATH_FUNCTIONS_SVE_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pexp<PacketXf>(const PacketXf& x) {
+  return pexp_float(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf plog<PacketXf>(const PacketXf& x) {
+  return plog_float(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf psin<PacketXf>(const PacketXf& x) {
+  return psin_float(x);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcos<PacketXf>(const PacketXf& x) {
+  return pcos_float(x);
+}
+
+// Hyperbolic Tangent function.
+template <>
+EIGEN_STRONG_INLINE PacketXf ptanh<PacketXf>(const PacketXf& x) {
+  return ptanh_float(x);
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_SVE_H
diff --git a/inst/include/Eigen/src/Core/arch/SVE/PacketMath.h b/inst/include/Eigen/src/Core/arch/SVE/PacketMath.h
new file mode 100644
index 00000000..6115d1d3
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/SVE/PacketMath.h
@@ -0,0 +1,674 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_SVE_H
+#define EIGEN_PACKET_MATH_SVE_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 8
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+
+template <typename Scalar, int SVEVectorLength>
+struct sve_packet_size_selector {
+  enum { size = SVEVectorLength / (sizeof(Scalar) * CHAR_BIT) };
+};
+
+/********************************* int32 **************************************/
+typedef svint32_t PacketXi __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));
+
+template <>
+struct packet_traits<numext::int32_t> : default_packet_traits {
+  typedef PacketXi type;
+  typedef PacketXi half;  // Half not implemented yet
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = sve_packet_size_selector<numext::int32_t, EIGEN_ARM64_SVE_VL>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0  // Not implemented in SVE
+  };
+};
+
+template <>
+struct unpacket_traits<PacketXi> {
+  typedef numext::int32_t type;
+  typedef PacketXi half;  // Half not yet implemented
+  enum {
+    size = sve_packet_size_selector<numext::int32_t, EIGEN_ARM64_SVE_VL>::size,
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<numext::int32_t>(const numext::int32_t* addr) {
+  svprfw(svptrue_b32(), addr, SV_PLDL1KEEP);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pset1<PacketXi>(const numext::int32_t& from) {
+  return svdup_n_s32(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi plset<PacketXi>(const numext::int32_t& a) {
+  numext::int32_t c[packet_traits<numext::int32_t>::size];
+  for (int i = 0; i < packet_traits<numext::int32_t>::size; i++) c[i] = i;
+  return svadd_s32_x(svptrue_b32(), pset1<PacketXi>(a), svld1_s32(svptrue_b32(), c));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi padd<PacketXi>(const PacketXi& a, const PacketXi& b) {
+  return svadd_s32_x(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi psub<PacketXi>(const PacketXi& a, const PacketXi& b) {
+  return svsub_s32_x(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pnegate(const PacketXi& a) {
+  return svneg_s32_x(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pconj(const PacketXi& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmul<PacketXi>(const PacketXi& a, const PacketXi& b) {
+  return svmul_s32_x(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pdiv<PacketXi>(const PacketXi& a, const PacketXi& b) {
+  return svdiv_s32_x(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmadd(const PacketXi& a, const PacketXi& b, const PacketXi& c) {
+  return svmla_s32_x(svptrue_b32(), c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmin<PacketXi>(const PacketXi& a, const PacketXi& b) {
+  return svmin_s32_x(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pmax<PacketXi>(const PacketXi& a, const PacketXi& b) {
+  return svmax_s32_x(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcmp_le<PacketXi>(const PacketXi& a, const PacketXi& b) {
+  return svdup_n_s32_z(svcmple_s32(svptrue_b32(), a, b), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcmp_lt<PacketXi>(const PacketXi& a, const PacketXi& b) {
+  return svdup_n_s32_z(svcmplt_s32(svptrue_b32(), a, b), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcmp_eq<PacketXi>(const PacketXi& a, const PacketXi& b) {
+  return svdup_n_s32_z(svcmpeq_s32(svptrue_b32(), a, b), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ptrue<PacketXi>(const PacketXi& /*a*/) {
+  return svdup_n_s32_x(svptrue_b32(), 0xffffffffu);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pzero<PacketXi>(const PacketXi& /*a*/) {
+  return svdup_n_s32_x(svptrue_b32(), 0);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pand<PacketXi>(const PacketXi& a, const PacketXi& b) {
+  return svand_s32_x(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi por<PacketXi>(const PacketXi& a, const PacketXi& b) {
+  return svorr_s32_x(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pxor<PacketXi>(const PacketXi& a, const PacketXi& b) {
+  return sveor_s32_x(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pandnot<PacketXi>(const PacketXi& a, const PacketXi& b) {
+  return svbic_s32_x(svptrue_b32(), a, b);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE PacketXi parithmetic_shift_right(PacketXi a) {
+  return svasrd_n_s32_x(svptrue_b32(), a, N);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE PacketXi plogical_shift_right(PacketXi a) {
+  return svreinterpret_s32_u32(svlsr_n_u32_x(svptrue_b32(), svreinterpret_u32_s32(a), N));
+}
+
+template <int N>
+EIGEN_STRONG_INLINE PacketXi plogical_shift_left(PacketXi a) {
+  return svlsl_n_s32_x(svptrue_b32(), a, N);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pload<PacketXi>(const numext::int32_t* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return svld1_s32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ploadu<PacketXi>(const numext::int32_t* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return svld1_s32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ploaddup<PacketXi>(const numext::int32_t* from) {
+  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
+  return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi ploadquad<PacketXi>(const numext::int32_t* from) {
+  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...}
+  return svld1_gather_u32index_s32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<numext::int32_t>(numext::int32_t* to, const PacketXi& from) {
+  EIGEN_DEBUG_ALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<numext::int32_t>(numext::int32_t* to, const PacketXi& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE svst1_s32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline PacketXi pgather<numext::int32_t, PacketXi>(const numext::int32_t* from, Index stride) {
+  // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+  svint32_t indices = svindex_s32(0, stride);
+  return svld1_gather_s32index_s32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<numext::int32_t, PacketXi>(numext::int32_t* to, const PacketXi& from,
+                                                                  Index stride) {
+  // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+  svint32_t indices = svindex_s32(0, stride);
+  svst1_scatter_s32index_s32(svptrue_b32(), to, indices, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t pfirst<PacketXi>(const PacketXi& a) {
+  // svlasta returns the first element if all predicate bits are 0
+  return svlasta_s32(svpfalse_b(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi preverse(const PacketXi& a) {
+  return svrev_s32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pabs(const PacketXi& a) {
+  return svabs_s32_x(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux<PacketXi>(const PacketXi& a) {
+  return static_cast<numext::int32_t>(svaddv_s32(svptrue_b32(), a));
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_mul<PacketXi>(const PacketXi& a) {
+  EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0), EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
+
+  // Multiply the vector by its reverse
+  svint32_t prod = svmul_s32_x(svptrue_b32(), a, svrev_s32(a));
+  svint32_t half_prod;
+
+  // Extract the high half of the vector. Depending on the VL more reductions need to be done
+  if (EIGEN_ARM64_SVE_VL >= 2048) {
+    half_prod = svtbl_s32(prod, svindex_u32(32, 1));
+    prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 1024) {
+    half_prod = svtbl_s32(prod, svindex_u32(16, 1));
+    prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 512) {
+    half_prod = svtbl_s32(prod, svindex_u32(8, 1));
+    prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 256) {
+    half_prod = svtbl_s32(prod, svindex_u32(4, 1));
+    prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
+  }
+  // Last reduction
+  half_prod = svtbl_s32(prod, svindex_u32(2, 1));
+  prod = svmul_s32_x(svptrue_b32(), prod, half_prod);
+
+  // The reduction is done to the first element.
+  return pfirst<PacketXi>(prod);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_min<PacketXi>(const PacketXi& a) {
+  return svminv_s32(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE numext::int32_t predux_max<PacketXi>(const PacketXi& a) {
+  return svmaxv_s32(svptrue_b32(), a);
+}
+
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXi, N>& kernel) {
+  int buffer[packet_traits<numext::int32_t>::size * N] = {0};
+  int i = 0;
+
+  PacketXi stride_index = svindex_s32(0, N);
+
+  for (i = 0; i < N; i++) {
+    svst1_scatter_s32index_s32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
+  }
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] = svld1_s32(svptrue_b32(), buffer + i * packet_traits<numext::int32_t>::size);
+  }
+}
+
+/********************************* float32 ************************************/
+
+typedef svfloat32_t PacketXf __attribute__((arm_sve_vector_bits(EIGEN_ARM64_SVE_VL)));
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef PacketXf type;
+  typedef PacketXf half;
+
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = sve_packet_size_selector<float, EIGEN_ARM64_SVE_VL>::size,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasShift = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 1,
+    HasSetLinear = 0,
+    HasBlend = 0,
+    HasReduxp = 0,  // Not implemented in SVE
+
+    HasDiv = 1,
+
+    HasCmp = 1,
+    HasSin = EIGEN_FAST_MATH,
+    HasCos = EIGEN_FAST_MATH,
+    HasLog = 1,
+    HasExp = 1,
+    HasPow = 1,
+    HasSqrt = 1,
+    HasTanh = EIGEN_FAST_MATH,
+    HasErf = EIGEN_FAST_MATH,
+    HasErfc = EIGEN_FAST_MATH
+  };
+};
+
+template <>
+struct unpacket_traits<PacketXf> {
+  typedef float type;
+  typedef PacketXf half;  // Half not yet implemented
+  typedef PacketXi integer_packet;
+
+  enum {
+    size = sve_packet_size_selector<float, EIGEN_ARM64_SVE_VL>::size,
+    alignment = Aligned64,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+};
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pset1<PacketXf>(const float& from) {
+  return svdup_n_f32(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pset1frombits<PacketXf>(numext::uint32_t from) {
+  return svreinterpret_f32_u32(svdup_n_u32_x(svptrue_b32(), from));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf plset<PacketXf>(const float& a) {
+  float c[packet_traits<float>::size];
+  for (int i = 0; i < packet_traits<float>::size; i++) c[i] = i;
+  return svadd_f32_x(svptrue_b32(), pset1<PacketXf>(a), svld1_f32(svptrue_b32(), c));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf padd<PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return svadd_f32_x(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf psub<PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return svsub_f32_x(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pnegate(const PacketXf& a) {
+  return svneg_f32_x(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pconj(const PacketXf& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmul<PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return svmul_f32_x(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pdiv<PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return svdiv_f32_x(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmadd(const PacketXf& a, const PacketXf& b, const PacketXf& c) {
+  return svmla_f32_x(svptrue_b32(), c, a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmin<PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return svmin_f32_x(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmin<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return pmin<PacketXf>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmin<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return svminnm_f32_x(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmax<PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return svmax_f32_x(svptrue_b32(), a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmax<PropagateNaN, PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return pmax<PacketXf>(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pmax<PropagateNumbers, PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return svmaxnm_f32_x(svptrue_b32(), a, b);
+}
+
+// Float comparisons in SVE return svbool (predicate). Use svdup to set active
+// lanes to 1 (0xffffffffu) and inactive lanes to 0.
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_le<PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return svreinterpret_f32_u32(svdup_n_u32_z(svcmple_f32(svptrue_b32(), a, b), 0xffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_lt<PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return svreinterpret_f32_u32(svdup_n_u32_z(svcmplt_f32(svptrue_b32(), a, b), 0xffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_eq<PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return svreinterpret_f32_u32(svdup_n_u32_z(svcmpeq_f32(svptrue_b32(), a, b), 0xffffffffu));
+}
+
+// Do a predicate inverse (svnot_b_z) on the predicate resulted from the
+// greater/equal comparison (svcmpge_f32). Then fill a float vector with the
+// active elements.
+template <>
+EIGEN_STRONG_INLINE PacketXf pcmp_lt_or_nan<PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return svreinterpret_f32_u32(svdup_n_u32_z(svnot_b_z(svptrue_b32(), svcmpge_f32(svptrue_b32(), a, b)), 0xffffffffu));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pfloor<PacketXf>(const PacketXf& a) {
+  return svrintm_f32_x(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ptrue<PacketXf>(const PacketXf& /*a*/) {
+  return svreinterpret_f32_u32(svdup_n_u32_x(svptrue_b32(), 0xffffffffu));
+}
+
+// Logical Operations are not supported for float, so reinterpret casts
+template <>
+EIGEN_STRONG_INLINE PacketXf pand<PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return svreinterpret_f32_u32(svand_u32_x(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf por<PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return svreinterpret_f32_u32(svorr_u32_x(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pxor<PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return svreinterpret_f32_u32(sveor_u32_x(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pandnot<PacketXf>(const PacketXf& a, const PacketXf& b) {
+  return svreinterpret_f32_u32(svbic_u32_x(svptrue_b32(), svreinterpret_u32_f32(a), svreinterpret_u32_f32(b)));
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pload<PacketXf>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return svld1_f32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploadu<PacketXf>(const float* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return svld1_f32(svptrue_b32(), from);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploaddup<PacketXf>(const float* from) {
+  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
+  return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf ploadquad<PacketXf>(const float* from) {
+  svuint32_t indices = svindex_u32(0, 1);  // index {base=0, base+step=1, base+step*2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a1, a1, a2, a2, ...}
+  indices = svzip1_u32(indices, indices);  // index in the format {a0, a0, a0, a0, a1, a1, a1, a1, ...}
+  return svld1_gather_u32index_f32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const PacketXf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const PacketXf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE svst1_f32(svptrue_b32(), to, from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline PacketXf pgather<float, PacketXf>(const float* from, Index stride) {
+  // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+  svint32_t indices = svindex_s32(0, stride);
+  return svld1_gather_s32index_f32(svptrue_b32(), from, indices);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, PacketXf>(float* to, const PacketXf& from, Index stride) {
+  // Indice format: {base=0, base+stride, base+stride*2, base+stride*3, ...}
+  svint32_t indices = svindex_s32(0, stride);
+  svst1_scatter_s32index_f32(svptrue_b32(), to, indices, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<PacketXf>(const PacketXf& a) {
+  // svlasta returns the first element if all predicate bits are 0
+  return svlasta_f32(svpfalse_b(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf preverse(const PacketXf& a) {
+  return svrev_f32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pabs(const PacketXf& a) {
+  return svabs_f32_x(svptrue_b32(), a);
+}
+
+// TODO(tellenbach): Should this go into MathFunctions.h? If so, change for
+// all vector extensions and the generic version.
+template <>
+EIGEN_STRONG_INLINE PacketXf pfrexp<PacketXf>(const PacketXf& a, PacketXf& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux<PacketXf>(const PacketXf& a) {
+  return svaddv_f32(svptrue_b32(), a);
+}
+
+// Other reduction functions:
+// mul
+// Only works for SVE Vls multiple of 128
+template <>
+EIGEN_STRONG_INLINE float predux_mul<PacketXf>(const PacketXf& a) {
+  EIGEN_STATIC_ASSERT((EIGEN_ARM64_SVE_VL % 128 == 0), EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
+  // Multiply the vector by its reverse
+  svfloat32_t prod = svmul_f32_x(svptrue_b32(), a, svrev_f32(a));
+  svfloat32_t half_prod;
+
+  // Extract the high half of the vector. Depending on the VL more reductions need to be done
+  if (EIGEN_ARM64_SVE_VL >= 2048) {
+    half_prod = svtbl_f32(prod, svindex_u32(32, 1));
+    prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 1024) {
+    half_prod = svtbl_f32(prod, svindex_u32(16, 1));
+    prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 512) {
+    half_prod = svtbl_f32(prod, svindex_u32(8, 1));
+    prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
+  }
+  if (EIGEN_ARM64_SVE_VL >= 256) {
+    half_prod = svtbl_f32(prod, svindex_u32(4, 1));
+    prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
+  }
+  // Last reduction
+  half_prod = svtbl_f32(prod, svindex_u32(2, 1));
+  prod = svmul_f32_x(svptrue_b32(), prod, half_prod);
+
+  // The reduction is done to the first element.
+  return pfirst<PacketXf>(prod);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<PacketXf>(const PacketXf& a) {
+  return svminv_f32(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<PacketXf>(const PacketXf& a) {
+  return svmaxv_f32(svptrue_b32(), a);
+}
+
+template <int N>
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<PacketXf, N>& kernel) {
+  float buffer[packet_traits<float>::size * N] = {0};
+  int i = 0;
+
+  PacketXi stride_index = svindex_s32(0, N);
+
+  for (i = 0; i < N; i++) {
+    svst1_scatter_s32index_f32(svptrue_b32(), buffer + i, stride_index, kernel.packet[i]);
+  }
+
+  for (i = 0; i < N; i++) {
+    kernel.packet[i] = svld1_f32(svptrue_b32(), buffer + i * packet_traits<float>::size);
+  }
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pldexp<PacketXf>(const PacketXf& a, const PacketXf& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf psqrt<PacketXf>(const PacketXf& a) {
+  return svsqrt_f32_x(svptrue_b32(), a);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_SVE_H
diff --git a/inst/include/Eigen/src/Core/arch/SVE/TypeCasting.h b/inst/include/Eigen/src/Core/arch/SVE/TypeCasting.h
new file mode 100644
index 00000000..c7027b30
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/SVE/TypeCasting.h
@@ -0,0 +1,52 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TYPE_CASTING_SVE_H
+#define EIGEN_TYPE_CASTING_SVE_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+template <>
+struct type_casting_traits<float, numext::int32_t> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+struct type_casting_traits<numext::int32_t, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_STRONG_INLINE PacketXf pcast<PacketXi, PacketXf>(const PacketXi& a) {
+  return svcvt_f32_s32_x(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi pcast<PacketXf, PacketXi>(const PacketXf& a) {
+  return svcvt_s32_f32_x(svptrue_b32(), a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXf preinterpret<PacketXf, PacketXi>(const PacketXi& a) {
+  return svreinterpret_f32_s32(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE PacketXi preinterpret<PacketXi, PacketXf>(const PacketXf& a) {
+  return svreinterpret_s32_f32(a);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_TYPE_CASTING_SVE_H
diff --git a/inst/include/Eigen/src/Core/arch/SYCL/InteropHeaders.h b/inst/include/Eigen/src/Core/arch/SYCL/InteropHeaders.h
new file mode 100644
index 00000000..578e0f3a
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/SYCL/InteropHeaders.h
@@ -0,0 +1,227 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * InteropHeaders.h
+ *
+ * \brief:
+ *  InteropHeaders
+ *
+ *****************************************************************/
+
+#ifndef EIGEN_INTEROP_HEADERS_SYCL_H
+#define EIGEN_INTEROP_HEADERS_SYCL_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+#if !defined(EIGEN_DONT_VECTORIZE_SYCL)
+
+namespace internal {
+
+template <int has_blend, int lengths>
+struct sycl_packet_traits : default_packet_traits {
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = lengths,
+    HasDiv = 1,
+    HasLog = 1,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasSin = 1,
+    HasCos = 1,
+    HasTan = 1,
+    HasASin = 1,
+    HasACos = 1,
+    HasATan = 1,
+    HasSinh = 1,
+    HasCosh = 1,
+    HasTanh = 1,
+    HasLGamma = 0,
+    HasDiGamma = 0,
+    HasZeta = 0,
+    HasPolygamma = 0,
+    HasErf = 0,
+    HasErfc = 0,
+    HasNdtri = 0,
+    HasIGamma = 0,
+    HasIGammac = 0,
+    HasBetaInc = 0,
+    HasBlend = has_blend,
+    // This flag is used to indicate whether packet comparison is supported.
+    // pcmp_eq, pcmp_lt and pcmp_le should be defined for it to be true.
+    HasCmp = 1,
+    HasMax = 1,
+    HasMin = 1,
+    HasMul = 1,
+    HasAdd = 1,
+    HasFloor = 1,
+    HasRound = 1,
+    HasRint = 1,
+    HasLog1p = 1,
+    HasExpm1 = 1,
+    HasCeil = 1,
+  };
+};
+
+#ifdef SYCL_DEVICE_ONLY
+#define SYCL_PACKET_TRAITS(packet_type, has_blend, unpacket_type, lengths)       \
+  template <>                                                                    \
+  struct packet_traits<unpacket_type> : sycl_packet_traits<has_blend, lengths> { \
+    typedef packet_type type;                                                    \
+    typedef packet_type half;                                                    \
+  };
+
+SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, Eigen::half, 8)
+SYCL_PACKET_TRAITS(cl::sycl::cl_half8, 1, const Eigen::half, 8)
+SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, float, 4)
+SYCL_PACKET_TRAITS(cl::sycl::cl_float4, 1, const float, 4)
+SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, double, 2)
+SYCL_PACKET_TRAITS(cl::sycl::cl_double2, 0, const double, 2)
+#undef SYCL_PACKET_TRAITS
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#define SYCL_ARITHMETIC(packet_type)  \
+  template <>                         \
+  struct is_arithmetic<packet_type> { \
+    enum { value = true };            \
+  };
+SYCL_ARITHMETIC(cl::sycl::cl_half8)
+SYCL_ARITHMETIC(cl::sycl::cl_float4)
+SYCL_ARITHMETIC(cl::sycl::cl_double2)
+#undef SYCL_ARITHMETIC
+
+#define SYCL_UNPACKET_TRAITS(packet_type, unpacket_type, lengths)        \
+  template <>                                                            \
+  struct unpacket_traits<packet_type> {                                  \
+    typedef unpacket_type type;                                          \
+    enum { size = lengths, vectorizable = true, alignment = Aligned16 }; \
+    typedef packet_type half;                                            \
+  };
+SYCL_UNPACKET_TRAITS(cl::sycl::cl_half8, Eigen::half, 8)
+SYCL_UNPACKET_TRAITS(cl::sycl::cl_float4, float, 4)
+SYCL_UNPACKET_TRAITS(cl::sycl::cl_double2, double, 2)
+
+#undef SYCL_UNPACKET_TRAITS
+#endif
+
+}  // end namespace internal
+
+#endif
+
+namespace TensorSycl {
+namespace internal {
+
+template <typename PacketReturnType, int PacketSize>
+struct PacketWrapper;
+// This function should never get called on the device
+#ifndef SYCL_DEVICE_ONLY
+template <typename PacketReturnType, int PacketSize>
+struct PacketWrapper {
+  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
+  template <typename Index>
+  EIGEN_DEVICE_FUNC static Scalar scalarize(Index, PacketReturnType &) {
+    eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR  THE CHOSEN TYPE");
+    abort();
+  }
+  EIGEN_DEVICE_FUNC static PacketReturnType convert_to_packet_type(Scalar in, Scalar) {
+    return ::Eigen::internal::template plset<PacketReturnType>(in);
+  }
+  EIGEN_DEVICE_FUNC static void set_packet(PacketReturnType, Scalar *) {
+    eigen_assert(false && "THERE IS NO PACKETIZE VERSION FOR  THE CHOSEN TYPE");
+    abort();
+  }
+};
+
+#elif defined(SYCL_DEVICE_ONLY)
+template <typename PacketReturnType>
+struct PacketWrapper<PacketReturnType, 4> {
+  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) {
+    switch (index) {
+      case 0:
+        return in.x();
+      case 1:
+        return in.y();
+      case 2:
+        return in.z();
+      case 3:
+        return in.w();
+      default:
+        // INDEX MUST BE BETWEEN 0 and 3.There is no abort function in SYCL kernel. so we cannot use abort here.
+        //  The code will never reach here
+        __builtin_unreachable();
+    }
+    __builtin_unreachable();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in, Scalar other) {
+    return PacketReturnType(in, other, other, other);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
+    lhs = PacketReturnType(rhs[0], rhs[1], rhs[2], rhs[3]);
+  }
+};
+
+template <typename PacketReturnType>
+struct PacketWrapper<PacketReturnType, 1> {
+  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index, PacketReturnType &in) {
+    return in;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in, Scalar) {
+    return PacketReturnType(in);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) { lhs = rhs[0]; }
+};
+
+template <typename PacketReturnType>
+struct PacketWrapper<PacketReturnType, 2> {
+  typedef typename ::Eigen::internal::unpacket_traits<PacketReturnType>::type Scalar;
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Scalar scalarize(Index index, PacketReturnType &in) {
+    switch (index) {
+      case 0:
+        return in.x();
+      case 1:
+        return in.y();
+      default:
+        // INDEX MUST BE BETWEEN 0 and 1.There is no abort function in SYCL kernel. so we cannot use abort here.
+        // The code will never reach here
+        __builtin_unreachable();
+    }
+    __builtin_unreachable();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType convert_to_packet_type(Scalar in, Scalar other) {
+    return PacketReturnType(in, other);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void set_packet(PacketReturnType &lhs, Scalar *rhs) {
+    lhs = PacketReturnType(rhs[0], rhs[1]);
+  }
+};
+
+#endif
+
+}  // end namespace internal
+}  // end namespace TensorSycl
+}  // end namespace Eigen
+
+#endif  // EIGEN_INTEROP_HEADERS_SYCL_H
diff --git a/inst/include/Eigen/src/Core/arch/SYCL/MathFunctions.h b/inst/include/Eigen/src/Core/arch/SYCL/MathFunctions.h
new file mode 100644
index 00000000..b20c32b3
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/SYCL/MathFunctions.h
@@ -0,0 +1,303 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * MathFunctions.h
+ *
+ * \brief:
+ *  MathFunctions
+ *
+ *****************************************************************/
+
+#ifndef EIGEN_MATH_FUNCTIONS_SYCL_H
+#define EIGEN_MATH_FUNCTIONS_SYCL_H
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(SYCL_DEVICE_ONLY)
+#define SYCL_PLOG(packet_type)                                                                \
+  template <>                                                                                 \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog<packet_type>(const packet_type& a) { \
+    return cl::sycl::log(a);                                                                  \
+  }
+
+SYCL_PLOG(cl::sycl::cl_half8)
+SYCL_PLOG(cl::sycl::cl_float4)
+SYCL_PLOG(cl::sycl::cl_double2)
+#undef SYCL_PLOG
+
+#define SYCL_PLOG1P(packet_type)                                                                \
+  template <>                                                                                   \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog1p<packet_type>(const packet_type& a) { \
+    return cl::sycl::log1p(a);                                                                  \
+  }
+
+SYCL_PLOG1P(cl::sycl::cl_half8)
+SYCL_PLOG1P(cl::sycl::cl_float4)
+SYCL_PLOG1P(cl::sycl::cl_double2)
+#undef SYCL_PLOG1P
+
+#define SYCL_PLOG10(packet_type)                                                                \
+  template <>                                                                                   \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type plog10<packet_type>(const packet_type& a) { \
+    return cl::sycl::log10(a);                                                                  \
+  }
+
+SYCL_PLOG10(cl::sycl::cl_half8)
+SYCL_PLOG10(cl::sycl::cl_float4)
+SYCL_PLOG10(cl::sycl::cl_double2)
+#undef SYCL_PLOG10
+
+#define SYCL_PEXP(packet_type)                                                                \
+  template <>                                                                                 \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexp<packet_type>(const packet_type& a) { \
+    return cl::sycl::exp(a);                                                                  \
+  }
+
+SYCL_PEXP(cl::sycl::cl_half8)
+SYCL_PEXP(cl::sycl::cl_half)
+SYCL_PEXP(cl::sycl::cl_float4)
+SYCL_PEXP(cl::sycl::cl_float)
+SYCL_PEXP(cl::sycl::cl_double2)
+#undef SYCL_PEXP
+
+#define SYCL_PEXPM1(packet_type)                                                                \
+  template <>                                                                                   \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pexpm1<packet_type>(const packet_type& a) { \
+    return cl::sycl::expm1(a);                                                                  \
+  }
+
+SYCL_PEXPM1(cl::sycl::cl_half8)
+SYCL_PEXPM1(cl::sycl::cl_float4)
+SYCL_PEXPM1(cl::sycl::cl_double2)
+#undef SYCL_PEXPM1
+
+#define SYCL_PSQRT(packet_type)                                                                \
+  template <>                                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psqrt<packet_type>(const packet_type& a) { \
+    return cl::sycl::sqrt(a);                                                                  \
+  }
+
+SYCL_PSQRT(cl::sycl::cl_half8)
+SYCL_PSQRT(cl::sycl::cl_float4)
+SYCL_PSQRT(cl::sycl::cl_double2)
+#undef SYCL_PSQRT
+
+#define SYCL_PRSQRT(packet_type)                                                                \
+  template <>                                                                                   \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type prsqrt<packet_type>(const packet_type& a) { \
+    return cl::sycl::rsqrt(a);                                                                  \
+  }
+
+SYCL_PRSQRT(cl::sycl::cl_half8)
+SYCL_PRSQRT(cl::sycl::cl_float4)
+SYCL_PRSQRT(cl::sycl::cl_double2)
+#undef SYCL_PRSQRT
+
+/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
+#define SYCL_PSIN(packet_type)                                                                \
+  template <>                                                                                 \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psin<packet_type>(const packet_type& a) { \
+    return cl::sycl::sin(a);                                                                  \
+  }
+
+SYCL_PSIN(cl::sycl::cl_half8)
+SYCL_PSIN(cl::sycl::cl_float4)
+SYCL_PSIN(cl::sycl::cl_double2)
+#undef SYCL_PSIN
+
+/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
+#define SYCL_PCOS(packet_type)                                                                \
+  template <>                                                                                 \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcos<packet_type>(const packet_type& a) { \
+    return cl::sycl::cos(a);                                                                  \
+  }
+
+SYCL_PCOS(cl::sycl::cl_half8)
+SYCL_PCOS(cl::sycl::cl_float4)
+SYCL_PCOS(cl::sycl::cl_double2)
+#undef SYCL_PCOS
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
+#define SYCL_PTAN(packet_type)                                                                \
+  template <>                                                                                 \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptan<packet_type>(const packet_type& a) { \
+    return cl::sycl::tan(a);                                                                  \
+  }
+
+SYCL_PTAN(cl::sycl::cl_half8)
+SYCL_PTAN(cl::sycl::cl_float4)
+SYCL_PTAN(cl::sycl::cl_double2)
+#undef SYCL_PTAN
+
+/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
+#define SYCL_PASIN(packet_type)                                                                \
+  template <>                                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pasin<packet_type>(const packet_type& a) { \
+    return cl::sycl::asin(a);                                                                  \
+  }
+
+SYCL_PASIN(cl::sycl::cl_half8)
+SYCL_PASIN(cl::sycl::cl_float4)
+SYCL_PASIN(cl::sycl::cl_double2)
+#undef SYCL_PASIN
+
+/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
+#define SYCL_PACOS(packet_type)                                                                \
+  template <>                                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pacos<packet_type>(const packet_type& a) { \
+    return cl::sycl::acos(a);                                                                  \
+  }
+
+SYCL_PACOS(cl::sycl::cl_half8)
+SYCL_PACOS(cl::sycl::cl_float4)
+SYCL_PACOS(cl::sycl::cl_double2)
+#undef SYCL_PACOS
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
+#define SYCL_PATAN(packet_type)                                                                \
+  template <>                                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type patan<packet_type>(const packet_type& a) { \
+    return cl::sycl::atan(a);                                                                  \
+  }
+
+SYCL_PATAN(cl::sycl::cl_half8)
+SYCL_PATAN(cl::sycl::cl_float4)
+SYCL_PATAN(cl::sycl::cl_double2)
+#undef SYCL_PATAN
+
+/** \internal \returns the hyperbolic sine of \a a (coeff-wise) */
+#define SYCL_PSINH(packet_type)                                                                \
+  template <>                                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type psinh<packet_type>(const packet_type& a) { \
+    return cl::sycl::sinh(a);                                                                  \
+  }
+
+SYCL_PSINH(cl::sycl::cl_half8)
+SYCL_PSINH(cl::sycl::cl_float4)
+SYCL_PSINH(cl::sycl::cl_double2)
+#undef SYCL_PSINH
+
+/** \internal \returns the hyperbolic cosine of \a a (coeff-wise) */
+#define SYCL_PCOSH(packet_type)                                                                \
+  template <>                                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pcosh<packet_type>(const packet_type& a) { \
+    return cl::sycl::cosh(a);                                                                  \
+  }
+
+SYCL_PCOSH(cl::sycl::cl_half8)
+SYCL_PCOSH(cl::sycl::cl_float4)
+SYCL_PCOSH(cl::sycl::cl_double2)
+#undef SYCL_PCOSH
+
+/** \internal \returns the hyperbolic tan of \a a (coeff-wise) */
+#define SYCL_PTANH(packet_type)                                                                \
+  template <>                                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ptanh<packet_type>(const packet_type& a) { \
+    return cl::sycl::tanh(a);                                                                  \
+  }
+
+SYCL_PTANH(cl::sycl::cl_half8)
+SYCL_PTANH(cl::sycl::cl_float4)
+SYCL_PTANH(cl::sycl::cl_double2)
+#undef SYCL_PTANH
+
+#define SYCL_PCEIL(packet_type)                                                                \
+  template <>                                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pceil<packet_type>(const packet_type& a) { \
+    return cl::sycl::ceil(a);                                                                  \
+  }
+
+SYCL_PCEIL(cl::sycl::cl_half)
+SYCL_PCEIL(cl::sycl::cl_float4)
+SYCL_PCEIL(cl::sycl::cl_double2)
+#undef SYCL_PCEIL
+
+#define SYCL_PROUND(packet_type)                                                                \
+  template <>                                                                                   \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pround<packet_type>(const packet_type& a) { \
+    return cl::sycl::round(a);                                                                  \
+  }
+
+SYCL_PROUND(cl::sycl::cl_half8)
+SYCL_PROUND(cl::sycl::cl_float4)
+SYCL_PROUND(cl::sycl::cl_double2)
+#undef SYCL_PROUND
+
+#define SYCL_PRINT(packet_type)                                                                \
+  template <>                                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type print<packet_type>(const packet_type& a) { \
+    return cl::sycl::rint(a);                                                                  \
+  }
+
+SYCL_PRINT(cl::sycl::cl_half8)
+SYCL_PRINT(cl::sycl::cl_float4)
+SYCL_PRINT(cl::sycl::cl_double2)
+#undef SYCL_PRINT
+
+#define SYCL_FLOOR(packet_type)                                                                 \
+  template <>                                                                                   \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pfloor<packet_type>(const packet_type& a) { \
+    return cl::sycl::floor(a);                                                                  \
+  }
+
+SYCL_FLOOR(cl::sycl::cl_half8)
+SYCL_FLOOR(cl::sycl::cl_float4)
+SYCL_FLOOR(cl::sycl::cl_double2)
+#undef SYCL_FLOOR
+
+#define SYCL_PMIN(packet_type, expr)                                                                                \
+  template <>                                                                                                       \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmin<packet_type>(const packet_type& a, const packet_type& b) { \
+    return expr;                                                                                                    \
+  }
+
+SYCL_PMIN(cl::sycl::cl_half8, cl::sycl::fmin(a, b))
+SYCL_PMIN(cl::sycl::cl_float4, cl::sycl::fmin(a, b))
+SYCL_PMIN(cl::sycl::cl_double2, cl::sycl::fmin(a, b))
+#undef SYCL_PMIN
+
+#define SYCL_PMAX(packet_type, expr)                                                                                \
+  template <>                                                                                                       \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pmax<packet_type>(const packet_type& a, const packet_type& b) { \
+    return expr;                                                                                                    \
+  }
+
+SYCL_PMAX(cl::sycl::cl_half8, cl::sycl::fmax(a, b))
+SYCL_PMAX(cl::sycl::cl_float4, cl::sycl::fmax(a, b))
+SYCL_PMAX(cl::sycl::cl_double2, cl::sycl::fmax(a, b))
+#undef SYCL_PMAX
+
+#define SYCL_PLDEXP(packet_type)                                                                                  \
+  template <>                                                                                                     \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pldexp(const packet_type& a, const packet_type& exponent) {   \
+    return cl::sycl::ldexp(a, exponent.template convert<cl::sycl::cl_int, cl::sycl::rounding_mode::automatic>()); \
+  }
+
+SYCL_PLDEXP(cl::sycl::cl_half8)
+SYCL_PLDEXP(cl::sycl::cl_float4)
+SYCL_PLDEXP(cl::sycl::cl_double2)
+#undef SYCL_PLDEXP
+
+#endif
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_SYCL_H
diff --git a/inst/include/Eigen/src/Core/arch/SYCL/PacketMath.h b/inst/include/Eigen/src/Core/arch/SYCL/PacketMath.h
new file mode 100644
index 00000000..6b6bfe43
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/SYCL/PacketMath.h
@@ -0,0 +1,576 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * PacketMath.h
+ *
+ * \brief:
+ *  PacketMath
+ *
+ *****************************************************************/
+
+#ifndef EIGEN_PACKET_MATH_SYCL_H
+#define EIGEN_PACKET_MATH_SYCL_H
+#include <type_traits>
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+#ifdef SYCL_DEVICE_ONLY
+#define SYCL_PLOAD(packet_type, AlignedType)                                                                           \
+  template <>                                                                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pload##AlignedType<packet_type>(                                   \
+      const typename unpacket_traits<packet_type>::type* from) {                                                       \
+    auto ptr =                                                                                                         \
+        cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>( \
+            from);                                                                                                     \
+    packet_type res{};                                                                                                 \
+    res.load(0, ptr);                                                                                                  \
+    return res;                                                                                                        \
+  }
+
+SYCL_PLOAD(cl::sycl::cl_float4, u)
+SYCL_PLOAD(cl::sycl::cl_float4, )
+SYCL_PLOAD(cl::sycl::cl_double2, u)
+SYCL_PLOAD(cl::sycl::cl_double2, )
+#undef SYCL_PLOAD
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 pload<cl::sycl::cl_half8>(
+    const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
+  auto ptr =
+      cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(
+          reinterpret_cast<const cl::sycl::cl_half*>(from));
+  cl::sycl::cl_half8 res{};
+  res.load(0, ptr);
+  return res;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 ploadu<cl::sycl::cl_half8>(
+    const typename unpacket_traits<cl::sycl::cl_half8>::type* from) {
+  auto ptr =
+      cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(
+          reinterpret_cast<const cl::sycl::cl_half*>(from));
+  cl::sycl::cl_half8 res{};
+  res.load(0, ptr);
+  return res;
+}
+
+#define SYCL_PSTORE(scalar, packet_type, alignment)                                                                    \
+  template <>                                                                                                          \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore##alignment(scalar* to, const packet_type& from) {                  \
+    auto ptr =                                                                                                         \
+        cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>( \
+            to);                                                                                                       \
+    from.store(0, ptr);                                                                                                \
+  }
+
+SYCL_PSTORE(float, cl::sycl::cl_float4, )
+SYCL_PSTORE(float, cl::sycl::cl_float4, u)
+SYCL_PSTORE(double, cl::sycl::cl_double2, )
+SYCL_PSTORE(double, cl::sycl::cl_double2, u)
+#undef SYCL_PSTORE
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoreu(Eigen::half* to, const cl::sycl::cl_half8& from) {
+  auto ptr =
+      cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(
+          reinterpret_cast<cl::sycl::cl_half*>(to));
+  from.store(0, ptr);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstore(Eigen::half* to, const cl::sycl::cl_half8& from) {
+  auto ptr =
+      cl::sycl::address_space_cast<cl::sycl::access::address_space::generic_space, cl::sycl::access::decorated::no>(
+          reinterpret_cast<cl::sycl::cl_half*>(to));
+  from.store(0, ptr);
+}
+
+#define SYCL_PSET1(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pset1<packet_type>( \
+      const typename unpacket_traits<packet_type>::type& from) {        \
+    return packet_type(from);                                           \
+  }
+
+// global space
+SYCL_PSET1(cl::sycl::cl_half8)
+SYCL_PSET1(cl::sycl::cl_float4)
+SYCL_PSET1(cl::sycl::cl_double2)
+
+#undef SYCL_PSET1
+
+template <typename packet_type>
+struct get_base_packet {
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_ploaddup(sycl_multi_pointer) {}
+
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type get_pgather(sycl_multi_pointer, Index) {}
+};
+
+template <>
+struct get_base_packet<cl::sycl::cl_half8> {
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_ploaddup(sycl_multi_pointer from) {
+    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(from[0]), static_cast<cl::sycl::half>(from[0]),
+                              static_cast<cl::sycl::half>(from[1]), static_cast<cl::sycl::half>(from[1]),
+                              static_cast<cl::sycl::half>(from[2]), static_cast<cl::sycl::half>(from[2]),
+                              static_cast<cl::sycl::half>(from[3]), static_cast<cl::sycl::half>(from[3]));
+  }
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 get_pgather(sycl_multi_pointer from, Index stride) {
+    return cl::sycl::cl_half8(
+        static_cast<cl::sycl::half>(from[0 * stride]), static_cast<cl::sycl::half>(from[1 * stride]),
+        static_cast<cl::sycl::half>(from[2 * stride]), static_cast<cl::sycl::half>(from[3 * stride]),
+        static_cast<cl::sycl::half>(from[4 * stride]), static_cast<cl::sycl::half>(from[5 * stride]),
+        static_cast<cl::sycl::half>(from[6 * stride]), static_cast<cl::sycl::half>(from[7 * stride]));
+  }
+
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to, const cl::sycl::cl_half8& from,
+                                                                 Index stride) {
+    auto tmp = stride;
+    to[0] = Eigen::half(from.s0());
+    to[tmp] = Eigen::half(from.s1());
+    to[tmp += stride] = Eigen::half(from.s2());
+    to[tmp += stride] = Eigen::half(from.s3());
+    to[tmp += stride] = Eigen::half(from.s4());
+    to[tmp += stride] = Eigen::half(from.s5());
+    to[tmp += stride] = Eigen::half(from.s6());
+    to[tmp += stride] = Eigen::half(from.s7());
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_half8 set_plset(const cl::sycl::half& a) {
+    return cl::sycl::cl_half8(static_cast<cl::sycl::half>(a), static_cast<cl::sycl::half>(a + 1),
+                              static_cast<cl::sycl::half>(a + 2), static_cast<cl::sycl::half>(a + 3),
+                              static_cast<cl::sycl::half>(a + 4), static_cast<cl::sycl::half>(a + 5),
+                              static_cast<cl::sycl::half>(a + 6), static_cast<cl::sycl::half>(a + 7));
+  }
+};
+
+template <>
+struct get_base_packet<cl::sycl::cl_float4> {
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_ploaddup(sycl_multi_pointer from) {
+    return cl::sycl::cl_float4(from[0], from[0], from[1], from[1]);
+  }
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 get_pgather(sycl_multi_pointer from, Index stride) {
+    return cl::sycl::cl_float4(from[0 * stride], from[1 * stride], from[2 * stride], from[3 * stride]);
+  }
+
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to, const cl::sycl::cl_float4& from,
+                                                                 Index stride) {
+    auto tmp = stride;
+    to[0] = from.x();
+    to[tmp] = from.y();
+    to[tmp += stride] = from.z();
+    to[tmp += stride] = from.w();
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_float4 set_plset(const float& a) {
+    return cl::sycl::cl_float4(static_cast<float>(a), static_cast<float>(a + 1), static_cast<float>(a + 2),
+                               static_cast<float>(a + 3));
+  }
+};
+
+template <>
+struct get_base_packet<cl::sycl::cl_double2> {
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_ploaddup(const sycl_multi_pointer from) {
+    return cl::sycl::cl_double2(from[0], from[0]);
+  }
+
+  template <typename sycl_multi_pointer, typename Index>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 get_pgather(const sycl_multi_pointer from,
+                                                                                Index stride) {
+    return cl::sycl::cl_double2(from[0 * stride], from[1 * stride]);
+  }
+
+  template <typename sycl_multi_pointer>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void set_pscatter(sycl_multi_pointer to,
+                                                                 const cl::sycl::cl_double2& from, Index stride) {
+    to[0] = from.x();
+    to[stride] = from.y();
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE cl::sycl::cl_double2 set_plset(const double& a) {
+    return cl::sycl::cl_double2(static_cast<double>(a), static_cast<double>(a + 1));
+  }
+};
+
+#define SYCL_PLOAD_DUP_SPECILIZE(packet_type)                              \
+  template <>                                                              \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type ploaddup<packet_type>( \
+      const typename unpacket_traits<packet_type>::type* from) {           \
+    return get_base_packet<packet_type>::get_ploaddup(from);               \
+  }
+
+SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_half8)
+SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_float4)
+SYCL_PLOAD_DUP_SPECILIZE(cl::sycl::cl_double2)
+
+#undef SYCL_PLOAD_DUP_SPECILIZE
+
+#define SYCL_PLSET(packet_type)                                         \
+  template <>                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type plset<packet_type>( \
+      const typename unpacket_traits<packet_type>::type& a) {           \
+    return get_base_packet<packet_type>::set_plset(a);                  \
+  }
+SYCL_PLSET(cl::sycl::cl_float4)
+SYCL_PLSET(cl::sycl::cl_double2)
+#undef SYCL_PLSET
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 plset<cl::sycl::cl_half8>(
+    const typename unpacket_traits<cl::sycl::cl_half8>::type& a) {
+  return get_base_packet<cl::sycl::cl_half8>::set_plset((const cl::sycl::half&)a);
+}
+
+#define SYCL_PGATHER_SPECILIZE(scalar, packet_type)                               \
+  template <>                                                                     \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE packet_type pgather<scalar, packet_type>( \
+      const typename unpacket_traits<packet_type>::type* from, Index stride) {    \
+    return get_base_packet<packet_type>::get_pgather(from, stride);               \
+  }
+
+SYCL_PGATHER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
+SYCL_PGATHER_SPECILIZE(float, cl::sycl::cl_float4)
+SYCL_PGATHER_SPECILIZE(double, cl::sycl::cl_double2)
+#undef SYCL_PGATHER_SPECILIZE
+
+#define SYCL_PSCATTER_SPECILIZE(scalar, packet_type)                                             \
+  template <>                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<scalar, packet_type>(                      \
+      typename unpacket_traits<packet_type>::type * to, const packet_type& from, Index stride) { \
+    get_base_packet<packet_type>::set_pscatter(to, from, stride);                                \
+  }
+
+SYCL_PSCATTER_SPECILIZE(Eigen::half, cl::sycl::cl_half8)
+SYCL_PSCATTER_SPECILIZE(float, cl::sycl::cl_float4)
+SYCL_PSCATTER_SPECILIZE(double, cl::sycl::cl_double2)
+
+#undef SYCL_PSCATTER_SPECILIZE
+
+#define SYCL_PMAD(packet_type)                                                                        \
+  template <>                                                                                         \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE packet_type pmadd(const packet_type& a, const packet_type& b, \
+                                                          const packet_type& c) {                     \
+    return cl::sycl::mad(a, b, c);                                                                    \
+  }
+
+SYCL_PMAD(cl::sycl::cl_half8)
+SYCL_PMAD(cl::sycl::cl_float4)
+SYCL_PMAD(cl::sycl::cl_double2)
+#undef SYCL_PMAD
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half pfirst<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0());
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float pfirst<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+  return a.x();
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double pfirst<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
+  return a.x();
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0() + a.s1() + a.s2() + a.s3() + a.s4() + a.s5() + a.s6() + a.s7());
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+  return a.x() + a.y() + a.z() + a.w();
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
+  return a.x() + a.y();
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_max<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+  return Eigen::half(cl::sycl::fmax(cl::sycl::fmax(cl::sycl::fmax(a.s0(), a.s1()), cl::sycl::fmax(a.s2(), a.s3())),
+                                    cl::sycl::fmax(cl::sycl::fmax(a.s4(), a.s5()), cl::sycl::fmax(a.s6(), a.s7()))));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_max<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+  return cl::sycl::fmax(cl::sycl::fmax(a.x(), a.y()), cl::sycl::fmax(a.z(), a.w()));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_max<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
+  return cl::sycl::fmax(a.x(), a.y());
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_min<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+  return Eigen::half(cl::sycl::fmin(cl::sycl::fmin(cl::sycl::fmin(a.s0(), a.s1()), cl::sycl::fmin(a.s2(), a.s3())),
+                                    cl::sycl::fmin(cl::sycl::fmin(a.s4(), a.s5()), cl::sycl::fmin(a.s6(), a.s7()))));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_min<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+  return cl::sycl::fmin(cl::sycl::fmin(a.x(), a.y()), cl::sycl::fmin(a.z(), a.w()));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_min<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
+  return cl::sycl::fmin(a.x(), a.y());
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half predux_mul<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+  return Eigen::half(a.s0() * a.s1() * a.s2() * a.s3() * a.s4() * a.s5() * a.s6() * a.s7());
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float predux_mul<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+  return a.x() * a.y() * a.z() * a.w();
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double predux_mul<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
+  return a.x() * a.y();
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 pabs<cl::sycl::cl_half8>(const cl::sycl::cl_half8& a) {
+  return cl::sycl::cl_half8(cl::sycl::fabs(a.s0()), cl::sycl::fabs(a.s1()), cl::sycl::fabs(a.s2()),
+                            cl::sycl::fabs(a.s3()), cl::sycl::fabs(a.s4()), cl::sycl::fabs(a.s5()),
+                            cl::sycl::fabs(a.s6()), cl::sycl::fabs(a.s7()));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pabs<cl::sycl::cl_float4>(const cl::sycl::cl_float4& a) {
+  return cl::sycl::cl_float4(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()), cl::sycl::fabs(a.z()),
+                             cl::sycl::fabs(a.w()));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 pabs<cl::sycl::cl_double2>(const cl::sycl::cl_double2& a) {
+  return cl::sycl::cl_double2(cl::sycl::fabs(a.x()), cl::sycl::fabs(a.y()));
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_le(const Packet& a, const Packet& b) {
+  return (a <= b).template as<Packet>();
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_lt(const Packet& a, const Packet& b) {
+  return (a < b).template as<Packet>();
+}
+
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet sycl_pcmp_eq(const Packet& a, const Packet& b) {
+  return (a == b).template as<Packet>();
+}
+
+#define SYCL_PCMP(OP, TYPE)                                                                  \
+  template <>                                                                                \
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TYPE pcmp_##OP<TYPE>(const TYPE& a, const TYPE& b) { \
+    return sycl_pcmp_##OP<TYPE>(a, b);                                                       \
+  }
+
+SYCL_PCMP(le, cl::sycl::cl_half8)
+SYCL_PCMP(lt, cl::sycl::cl_half8)
+SYCL_PCMP(eq, cl::sycl::cl_half8)
+SYCL_PCMP(le, cl::sycl::cl_float4)
+SYCL_PCMP(lt, cl::sycl::cl_float4)
+SYCL_PCMP(eq, cl::sycl::cl_float4)
+SYCL_PCMP(le, cl::sycl::cl_double2)
+SYCL_PCMP(lt, cl::sycl::cl_double2)
+SYCL_PCMP(eq, cl::sycl::cl_double2)
+#undef SYCL_PCMP
+
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(PacketBlock<cl::sycl::cl_half8, 8>& kernel) {
+  cl::sycl::cl_half tmp = kernel.packet[0].s1();
+  kernel.packet[0].s1() = kernel.packet[1].s0();
+  kernel.packet[1].s0() = tmp;
+
+  tmp = kernel.packet[0].s2();
+  kernel.packet[0].s2() = kernel.packet[2].s0();
+  kernel.packet[2].s0() = tmp;
+
+  tmp = kernel.packet[0].s3();
+  kernel.packet[0].s3() = kernel.packet[3].s0();
+  kernel.packet[3].s0() = tmp;
+
+  tmp = kernel.packet[0].s4();
+  kernel.packet[0].s4() = kernel.packet[4].s0();
+  kernel.packet[4].s0() = tmp;
+
+  tmp = kernel.packet[0].s5();
+  kernel.packet[0].s5() = kernel.packet[5].s0();
+  kernel.packet[5].s0() = tmp;
+
+  tmp = kernel.packet[0].s6();
+  kernel.packet[0].s6() = kernel.packet[6].s0();
+  kernel.packet[6].s0() = tmp;
+
+  tmp = kernel.packet[0].s7();
+  kernel.packet[0].s7() = kernel.packet[7].s0();
+  kernel.packet[7].s0() = tmp;
+
+  tmp = kernel.packet[1].s2();
+  kernel.packet[1].s2() = kernel.packet[2].s1();
+  kernel.packet[2].s1() = tmp;
+
+  tmp = kernel.packet[1].s3();
+  kernel.packet[1].s3() = kernel.packet[3].s1();
+  kernel.packet[3].s1() = tmp;
+
+  tmp = kernel.packet[1].s4();
+  kernel.packet[1].s4() = kernel.packet[4].s1();
+  kernel.packet[4].s1() = tmp;
+
+  tmp = kernel.packet[1].s5();
+  kernel.packet[1].s5() = kernel.packet[5].s1();
+  kernel.packet[5].s1() = tmp;
+
+  tmp = kernel.packet[1].s6();
+  kernel.packet[1].s6() = kernel.packet[6].s1();
+  kernel.packet[6].s1() = tmp;
+
+  tmp = kernel.packet[1].s7();
+  kernel.packet[1].s7() = kernel.packet[7].s1();
+  kernel.packet[7].s1() = tmp;
+
+  tmp = kernel.packet[2].s3();
+  kernel.packet[2].s3() = kernel.packet[3].s2();
+  kernel.packet[3].s2() = tmp;
+
+  tmp = kernel.packet[2].s4();
+  kernel.packet[2].s4() = kernel.packet[4].s2();
+  kernel.packet[4].s2() = tmp;
+
+  tmp = kernel.packet[2].s5();
+  kernel.packet[2].s5() = kernel.packet[5].s2();
+  kernel.packet[5].s2() = tmp;
+
+  tmp = kernel.packet[2].s6();
+  kernel.packet[2].s6() = kernel.packet[6].s2();
+  kernel.packet[6].s2() = tmp;
+
+  tmp = kernel.packet[2].s7();
+  kernel.packet[2].s7() = kernel.packet[7].s2();
+  kernel.packet[7].s2() = tmp;
+
+  tmp = kernel.packet[3].s4();
+  kernel.packet[3].s4() = kernel.packet[4].s3();
+  kernel.packet[4].s3() = tmp;
+
+  tmp = kernel.packet[3].s5();
+  kernel.packet[3].s5() = kernel.packet[5].s3();
+  kernel.packet[5].s3() = tmp;
+
+  tmp = kernel.packet[3].s6();
+  kernel.packet[3].s6() = kernel.packet[6].s3();
+  kernel.packet[6].s3() = tmp;
+
+  tmp = kernel.packet[3].s7();
+  kernel.packet[3].s7() = kernel.packet[7].s3();
+  kernel.packet[7].s3() = tmp;
+
+  tmp = kernel.packet[4].s5();
+  kernel.packet[4].s5() = kernel.packet[5].s4();
+  kernel.packet[5].s4() = tmp;
+
+  tmp = kernel.packet[4].s6();
+  kernel.packet[4].s6() = kernel.packet[6].s4();
+  kernel.packet[6].s4() = tmp;
+
+  tmp = kernel.packet[4].s7();
+  kernel.packet[4].s7() = kernel.packet[7].s4();
+  kernel.packet[7].s4() = tmp;
+
+  tmp = kernel.packet[5].s6();
+  kernel.packet[5].s6() = kernel.packet[6].s5();
+  kernel.packet[6].s5() = tmp;
+
+  tmp = kernel.packet[5].s7();
+  kernel.packet[5].s7() = kernel.packet[7].s5();
+  kernel.packet[7].s5() = tmp;
+
+  tmp = kernel.packet[6].s7();
+  kernel.packet[6].s7() = kernel.packet[7].s6();
+  kernel.packet[7].s6() = tmp;
+}
+
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(PacketBlock<cl::sycl::cl_float4, 4>& kernel) {
+  float tmp = kernel.packet[0].y();
+  kernel.packet[0].y() = kernel.packet[1].x();
+  kernel.packet[1].x() = tmp;
+
+  tmp = kernel.packet[0].z();
+  kernel.packet[0].z() = kernel.packet[2].x();
+  kernel.packet[2].x() = tmp;
+
+  tmp = kernel.packet[0].w();
+  kernel.packet[0].w() = kernel.packet[3].x();
+  kernel.packet[3].x() = tmp;
+
+  tmp = kernel.packet[1].z();
+  kernel.packet[1].z() = kernel.packet[2].y();
+  kernel.packet[2].y() = tmp;
+
+  tmp = kernel.packet[1].w();
+  kernel.packet[1].w() = kernel.packet[3].y();
+  kernel.packet[3].y() = tmp;
+
+  tmp = kernel.packet[2].w();
+  kernel.packet[2].w() = kernel.packet[3].z();
+  kernel.packet[3].z() = tmp;
+}
+
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void ptranspose(PacketBlock<cl::sycl::cl_double2, 2>& kernel) {
+  double tmp = kernel.packet[0].y();
+  kernel.packet[0].y() = kernel.packet[1].x();
+  kernel.packet[1].x() = tmp;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_half8 pblend(
+    const Selector<unpacket_traits<cl::sycl::cl_half8>::size>& ifPacket, const cl::sycl::cl_half8& thenPacket,
+    const cl::sycl::cl_half8& elsePacket) {
+  cl::sycl::cl_short8 condition(ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1, ifPacket.select[2] ? 0 : -1,
+                                ifPacket.select[3] ? 0 : -1, ifPacket.select[4] ? 0 : -1, ifPacket.select[5] ? 0 : -1,
+                                ifPacket.select[6] ? 0 : -1, ifPacket.select[7] ? 0 : -1);
+  return cl::sycl::select(thenPacket, elsePacket, condition);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pblend(
+    const Selector<unpacket_traits<cl::sycl::cl_float4>::size>& ifPacket, const cl::sycl::cl_float4& thenPacket,
+    const cl::sycl::cl_float4& elsePacket) {
+  cl::sycl::cl_int4 condition(ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1, ifPacket.select[2] ? 0 : -1,
+                              ifPacket.select[3] ? 0 : -1);
+  return cl::sycl::select(thenPacket, elsePacket, condition);
+}
+
+template <>
+inline cl::sycl::cl_double2 pblend(const Selector<unpacket_traits<cl::sycl::cl_double2>::size>& ifPacket,
+                                   const cl::sycl::cl_double2& thenPacket, const cl::sycl::cl_double2& elsePacket) {
+  cl::sycl::cl_long2 condition(ifPacket.select[0] ? 0 : -1, ifPacket.select[1] ? 0 : -1);
+  return cl::sycl::select(thenPacket, elsePacket, condition);
+}
+#endif  // SYCL_DEVICE_ONLY
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_SYCL_H
diff --git a/inst/include/Eigen/src/Core/arch/SYCL/TypeCasting.h b/inst/include/Eigen/src/Core/arch/SYCL/TypeCasting.h
new file mode 100644
index 00000000..6e3fa4fc
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/SYCL/TypeCasting.h
@@ -0,0 +1,83 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TypeCasting.h
+ *
+ * \brief:
+ *  TypeCasting
+ *
+ *****************************************************************/
+
+#ifndef EIGEN_TYPE_CASTING_SYCL_H
+#define EIGEN_TYPE_CASTING_SYCL_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+#ifdef SYCL_DEVICE_ONLY
+template <>
+struct type_casting_traits<float, int> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_int4 pcast<cl::sycl::cl_float4, cl::sycl::cl_int4>(
+    const cl::sycl::cl_float4& a) {
+  return a.template convert<cl::sycl::cl_int, cl::sycl::rounding_mode::automatic>();
+}
+
+template <>
+struct type_casting_traits<int, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pcast<cl::sycl::cl_int4, cl::sycl::cl_float4>(
+    const cl::sycl::cl_int4& a) {
+  return a.template convert<cl::sycl::cl_float, cl::sycl::rounding_mode::automatic>();
+}
+
+template <>
+struct type_casting_traits<double, float> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 2, TgtCoeffRatio = 1 };
+};
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_float4 pcast<cl::sycl::cl_double2, cl::sycl::cl_float4>(
+    const cl::sycl::cl_double2& a, const cl::sycl::cl_double2& b) {
+  auto a1 = a.template convert<cl::sycl::cl_float, cl::sycl::rounding_mode::automatic>();
+  auto b1 = b.template convert<cl::sycl::cl_float, cl::sycl::rounding_mode::automatic>();
+  return cl::sycl::cl_float4(a1.x(), a1.y(), b1.x(), b1.y());
+}
+
+template <>
+struct type_casting_traits<float, double> {
+  enum { VectorizedCast = 1, SrcCoeffRatio = 1, TgtCoeffRatio = 2 };
+};
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE cl::sycl::cl_double2 pcast<cl::sycl::cl_float4, cl::sycl::cl_double2>(
+    const cl::sycl::cl_float4& a) {
+  // Simply discard the second half of the input
+  return cl::sycl::cl_double2(a.x(), a.y());
+}
+
+#endif
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_TYPE_CASTING_SYCL_H
diff --git a/inst/include/Eigen/src/Core/arch/ZVector/Complex.h b/inst/include/Eigen/src/Core/arch/ZVector/Complex.h
new file mode 100644
index 00000000..692f90f3
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/ZVector/Complex.h
@@ -0,0 +1,570 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLEX32_ZVECTOR_H
+#define EIGEN_COMPLEX32_ZVECTOR_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+inline Packet4ui p4ui_CONJ_XOR() {
+  return Packet4ui {0x00000000, 0x80000000, 0x00000000, 0x80000000};  // vec_mergeh((Packet4ui)p4i_ZERO, (Packet4ui)p4f_MZERO);
+}
+#endif
+
+inline Packet2ul p2ul_CONJ_XOR1() {
+  return (Packet2ul)vec_sld((Packet4ui)p2d_ZERO_, (Packet4ui)p2l_ZERO,
+                            8);  //{ 0x8000000000000000, 0x0000000000000000 };
+}
+inline Packet2ul p2ul_CONJ_XOR2() {
+  return (Packet2ul)vec_sld((Packet4ui)p2l_ZERO, (Packet4ui)p2d_ZERO_,
+                            8);  //{ 0x8000000000000000, 0x0000000000000000 };
+}
+
+struct Packet1cd {
+  EIGEN_STRONG_INLINE Packet1cd() {}
+  EIGEN_STRONG_INLINE explicit Packet1cd(const Packet2d& a) : v(a) {}
+  Packet2d v;
+};
+
+struct Packet2cf {
+  EIGEN_STRONG_INLINE Packet2cf() {}
+  EIGEN_STRONG_INLINE explicit Packet2cf(const Packet4f& a) : v(a) {}
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+  union {
+    Packet4f v;
+    Packet1cd cd[2];
+  };
+#else
+  Packet4f v;
+#endif
+};
+
+template <>
+struct packet_traits<std::complex<float> > : default_packet_traits {
+  typedef Packet2cf type;
+  typedef Packet2cf half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasLog = 1,
+    HasExp = 1,
+    HasNegate = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasBlend = 1,
+    HasSetLinear = 0
+  };
+};
+
+template <>
+struct packet_traits<std::complex<double> > : default_packet_traits {
+  typedef Packet1cd type;
+  typedef Packet1cd half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 1,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasLog = 1,
+    HasNegate = 1,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasSetLinear = 0
+  };
+};
+
+template <>
+struct unpacket_traits<Packet2cf> {
+  typedef std::complex<float> type;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet2cf half;
+  typedef Packet4f as_real;
+};
+template <>
+struct unpacket_traits<Packet1cd> {
+  typedef std::complex<double> type;
+  enum {
+    size = 1,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet1cd half;
+  typedef Packet2d as_real;
+};
+
+/* Forward declaration */
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf, 2>& kernel);
+
+/* complex<double> first */
+template <>
+EIGEN_STRONG_INLINE Packet1cd pload<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet1cd(pload<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploadu<Packet1cd>(const std::complex<double>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet1cd(ploadu<Packet2d>((const double*)from));
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<double> >(std::complex<double>* to, const Packet1cd& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd
+pset1<Packet1cd>(const std::complex<double>& from) { /* here we really have to use unaligned loads :( */
+  return ploadu<Packet1cd>(&from);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet1cd pgather<std::complex<double>, Packet1cd>(const std::complex<double>* from,
+                                                                            Index stride EIGEN_UNUSED) {
+  return pload<Packet1cd>(from);
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<double>, Packet1cd>(std::complex<double>* to, const Packet1cd& from,
+                                                                        Index stride EIGEN_UNUSED) {
+  pstore<std::complex<double> >(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd padd<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(a.v + b.v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd psub<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(a.v - b.v);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pnegate(const Packet1cd& a) {
+  return Packet1cd(pnegate(Packet2d(a.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pconj(const Packet1cd& a) {
+  return Packet1cd((Packet2d)vec_xor((Packet2d)a.v, (Packet2d)p2ul_CONJ_XOR2()));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pmul<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  Packet2d a_re, a_im, v1, v2;
+
+  // Permute and multiply the real parts of a and b
+  a_re = vec_perm(a.v, a.v, p16uc_PSET64_HI);
+  // Get the imaginary parts of a
+  a_im = vec_perm(a.v, a.v, p16uc_PSET64_LO);
+  // multiply a_re * b
+  v1 = vec_madd(a_re, b.v, p2d_ZERO);
+  // multiply a_im * b and get the conjugate result
+  v2 = vec_madd(a_im, b.v, p2d_ZERO);
+  v2 = (Packet2d)vec_sld((Packet4ui)v2, (Packet4ui)v2, 8);
+  v2 = (Packet2d)vec_xor((Packet2d)v2, (Packet2d)p2ul_CONJ_XOR1());
+
+  return Packet1cd(v1 + v2);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pand<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(vec_and(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd por<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(vec_or(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pxor<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(vec_xor(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pandnot<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return Packet1cd(vec_and(a.v, vec_nor(b.v, b.v)));
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd ploaddup<Packet1cd>(const std::complex<double>* from) {
+  return pset1<Packet1cd>(*from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet1cd pcmp_eq(const Packet1cd& a, const Packet1cd& b) {
+  Packet2d eq = vec_cmpeq(a.v, b.v);
+  Packet2d tmp = {eq[1], eq[0]};
+  return (Packet1cd)pand<Packet2d>(eq, tmp);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<double> >(const std::complex<double>* addr) {
+  EIGEN_ZVECTOR_PREFETCH(addr);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<double> pfirst<Packet1cd>(const Packet1cd& a) {
+  EIGEN_ALIGN16 std::complex<double> res;
+  pstore<std::complex<double> >(&res, a);
+
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd preverse(const Packet1cd& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux<Packet1cd>(const Packet1cd& a) {
+  return pfirst(a);
+}
+template <>
+EIGEN_STRONG_INLINE std::complex<double> predux_mul<Packet1cd>(const Packet1cd& a) {
+  return pfirst(a);
+}
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd, Packet2d)
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd pdiv<Packet1cd>(const Packet1cd& a, const Packet1cd& b) {
+  return pdiv_complex(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd psqrt<Packet1cd>(const Packet1cd& a) {
+  return psqrt_complex<Packet1cd>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf psqrt<Packet2cf>(const Packet2cf& a) {
+  return psqrt_complex<Packet2cf>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet1cd plog<Packet1cd>(const Packet1cd& a) {
+  return plog_complex<Packet1cd>(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf plog<Packet2cf>(const Packet2cf& a) {
+  return plog_complex<Packet2cf>(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pexp<Packet2cf>(const Packet2cf& a) {
+  return pexp_complex(a);
+}
+
+EIGEN_STRONG_INLINE Packet1cd pcplxflip /*<Packet1cd>*/ (const Packet1cd& x) {
+  return Packet1cd(preverse(Packet2d(x.v)));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet1cd, 2>& kernel) {
+  Packet2d tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
+  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
+  kernel.packet[0].v = tmp;
+}
+
+/* complex<float> follows */
+template <>
+EIGEN_STRONG_INLINE Packet2cf pload<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD return Packet2cf(pload<Packet4f>((const float*)from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploadu<Packet2cf>(const std::complex<float>* from) {
+  EIGEN_DEBUG_UNALIGNED_LOAD return Packet2cf(ploadu<Packet4f>((const float*)from));
+}
+template <>
+EIGEN_STRONG_INLINE void pstore<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_DEBUG_ALIGNED_STORE pstore((float*)to, from.v);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<std::complex<float> >(std::complex<float>* to, const Packet2cf& from) {
+  EIGEN_DEBUG_UNALIGNED_STORE pstoreu((float*)to, from.v);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> pfirst<Packet2cf>(const Packet2cf& a) {
+  EIGEN_ALIGN16 std::complex<float> res[2];
+  pstore<std::complex<float> >(res, a);
+
+  return res[0];
+}
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
+  Packet2cf res;
+  res.cd[0] = Packet1cd(vec_ld2f((const float*)&from));
+  res.cd[1] = res.cd[0];
+  return res;
+}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet2cf pset1<Packet2cf>(const std::complex<float>& from) {
+  Packet2cf res;
+  if ((std::ptrdiff_t(&from) % 16) == 0)
+    res.v = pload<Packet4f>((const float*)&from);
+  else
+    res.v = ploadu<Packet4f>((const float*)&from);
+  res.v = vec_perm(res.v, res.v, p16uc_PSET64_HI);
+  return res;
+}
+#endif
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet2cf pgather<std::complex<float>, Packet2cf>(const std::complex<float>* from,
+                                                                           Index stride) {
+  EIGEN_ALIGN16 std::complex<float> af[2];
+  af[0] = from[0 * stride];
+  af[1] = from[1 * stride];
+  return pload<Packet2cf>(af);
+}
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<std::complex<float>, Packet2cf>(std::complex<float>* to, const Packet2cf& from,
+                                                                       Index stride) {
+  EIGEN_ALIGN16 std::complex<float> af[2];
+  pstore<std::complex<float> >((std::complex<float>*)af, from);
+  to[0 * stride] = af[0];
+  to[1 * stride] = af[1];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf padd<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(padd<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf psub<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(psub<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pnegate(const Packet2cf& a) {
+  return Packet2cf(pnegate(Packet4f(a.v)));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pand<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(pand<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf por<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(por<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pxor<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(pxor<Packet4f>(a.v, b.v));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pandnot<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return Packet2cf(pandnot<Packet4f>(a.v, b.v));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf ploaddup<Packet2cf>(const std::complex<float>* from) {
+  return pset1<Packet2cf>(*from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<std::complex<float> >(const std::complex<float>* addr) {
+  EIGEN_ZVECTOR_PREFETCH(addr);
+}
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+  Packet4f eq = pcmp_eq<Packet4f>(a.v, b.v);
+  Packet2cf res;
+  Packet2d tmp1 = {eq.v4f[0][1], eq.v4f[0][0]};
+  Packet2d tmp2 = {eq.v4f[1][1], eq.v4f[1][0]};
+  res.v.v4f[0] = pand<Packet2d>(eq.v4f[0], tmp1);
+  res.v.v4f[1] = pand<Packet2d>(eq.v4f[1], tmp2);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
+  Packet2cf res;
+  res.v.v4f[0] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0]))).v;
+  res.v.v4f[1] = pconj(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1]))).v;
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  Packet2cf res;
+  res.v.v4f[0] =
+      pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[0])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[0]))).v;
+  res.v.v4f[1] =
+      pmul(Packet1cd(reinterpret_cast<Packet2d>(a.v.v4f[1])), Packet1cd(reinterpret_cast<Packet2d>(b.v.v4f[1]))).v;
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
+  Packet2cf res;
+  res.cd[0] = a.cd[1];
+  res.cd[1] = a.cd[0];
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
+  std::complex<float> res;
+  Packet1cd b = padd<Packet1cd>(a.cd[0], a.cd[1]);
+  vec_st2f(b.v, (float*)&res);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
+  std::complex<float> res;
+  Packet1cd b = pmul<Packet1cd>(a.cd[0], a.cd[1]);
+  vec_st2f(b.v, (float*)&res);
+  return res;
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return pdiv_complex(a, b);
+}
+
+EIGEN_STRONG_INLINE Packet2cf pcplxflip /*<Packet2cf>*/ (const Packet2cf& x) {
+  Packet2cf res;
+  res.cd[0] = pcplxflip(x.cd[0]);
+  res.cd[1] = pcplxflip(x.cd[1]);
+  return res;
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
+  Packet1cd tmp = kernel.packet[0].cd[1];
+  kernel.packet[0].cd[1] = kernel.packet[1].cd[0];
+  kernel.packet[1].cd[0] = tmp;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
+                                     const Packet2cf& elsePacket) {
+  Packet2cf result;
+  const Selector<4> ifPacket4 = {ifPacket.select[0], ifPacket.select[0], ifPacket.select[1], ifPacket.select[1]};
+  result.v = pblend<Packet4f>(ifPacket4, thenPacket.v, elsePacket.v);
+  return result;
+}
+#else
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcmp_eq(const Packet2cf& a, const Packet2cf& b) {
+  Packet4f eq = vec_cmpeq(a.v, b.v);
+  Packet4f tmp = {eq[1], eq[0], eq[3], eq[2]};
+  return (Packet2cf)pand<Packet4f>(eq, tmp);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pconj(const Packet2cf& a) {
+  return Packet2cf(pxor<Packet4f>(a.v, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR())));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2cf pmul<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  Packet4f a_re, a_im, prod, prod_im;
+
+  // Permute and multiply the real parts of a and b
+  a_re = vec_perm(a.v, a.v, p16uc_PSET32_WODD);
+
+  // Get the imaginary parts of a
+  a_im = vec_perm(a.v, a.v, p16uc_PSET32_WEVEN);
+
+  // multiply a_im * b and get the conjugate result
+  prod_im = a_im * b.v;
+  prod_im = pxor<Packet4f>(prod_im, reinterpret_cast<Packet4f>(p4ui_CONJ_XOR()));
+  // permute back to a proper order
+  prod_im = vec_perm(prod_im, prod_im, p16uc_COMPLEX32_REV);
+
+  // multiply a_re * b, add prod_im
+  prod = pmadd<Packet4f>(a_re, b.v, prod_im);
+
+  return Packet2cf(prod);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf preverse(const Packet2cf& a) {
+  Packet4f rev_a;
+  rev_a = vec_perm(a.v, a.v, p16uc_COMPLEX32_REV2);
+  return Packet2cf(rev_a);
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux<Packet2cf>(const Packet2cf& a) {
+  Packet4f b;
+  b = vec_sld(a.v, a.v, 8);
+  b = padd<Packet4f>(a.v, b);
+  return pfirst<Packet2cf>(Packet2cf(b));
+}
+
+template <>
+EIGEN_STRONG_INLINE std::complex<float> predux_mul<Packet2cf>(const Packet2cf& a) {
+  Packet4f b;
+  Packet2cf prod;
+  b = vec_sld(a.v, a.v, 8);
+  prod = pmul<Packet2cf>(a, Packet2cf(b));
+
+  return pfirst<Packet2cf>(prod);
+}
+
+EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf, Packet4f)
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pdiv<Packet2cf>(const Packet2cf& a, const Packet2cf& b) {
+  return pdiv_complex(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pcplxflip<Packet2cf>(const Packet2cf& x) {
+  return Packet2cf(vec_perm(x.v, x.v, p16uc_COMPLEX32_REV));
+}
+
+EIGEN_STRONG_INLINE void ptranspose(PacketBlock<Packet2cf, 2>& kernel) {
+  Packet4f tmp = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_HI);
+  kernel.packet[1].v = vec_perm(kernel.packet[0].v, kernel.packet[1].v, p16uc_TRANSPOSE64_LO);
+  kernel.packet[0].v = tmp;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2cf pblend(const Selector<2>& ifPacket, const Packet2cf& thenPacket,
+                                     const Packet2cf& elsePacket) {
+  Packet2cf result;
+  result.v = reinterpret_cast<Packet4f>(
+      pblend<Packet2d>(ifPacket, reinterpret_cast<Packet2d>(thenPacket.v), reinterpret_cast<Packet2d>(elsePacket.v)));
+  return result;
+}
+#endif
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_COMPLEX32_ZVECTOR_H
diff --git a/inst/include/Eigen/src/Core/arch/ZVector/MathFunctions.h b/inst/include/Eigen/src/Core/arch/ZVector/MathFunctions.h
new file mode 100644
index 00000000..348d643a
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/ZVector/MathFunctions.h
@@ -0,0 +1,244 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2007 Julien Pommier
+// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/* The sin, cos, exp, and log functions of this file come from
+ * Julien Pommier's sse math library: http://gruntthepeon.free.fr/ssemath/
+ */
+
+#ifndef EIGEN_MATH_FUNCTIONS_ZVECTOR_H
+#define EIGEN_MATH_FUNCTIONS_ZVECTOR_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+EIGEN_DOUBLE_PACKET_FUNCTION(atanh, Packet2d)
+EIGEN_DOUBLE_PACKET_FUNCTION(log, Packet2d)
+EIGEN_DOUBLE_PACKET_FUNCTION(log2, Packet2d)
+EIGEN_DOUBLE_PACKET_FUNCTION(tanh, Packet2d)
+
+EIGEN_FLOAT_PACKET_FUNCTION(atanh, Packet4f)
+EIGEN_FLOAT_PACKET_FUNCTION(log, Packet4f)
+EIGEN_FLOAT_PACKET_FUNCTION(log2, Packet4f)
+
+EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet2d)
+EIGEN_GENERIC_PACKET_FUNCTION(atan, Packet4f)
+EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet2d)
+EIGEN_GENERIC_PACKET_FUNCTION(exp2, Packet4f)
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+static EIGEN_DECLARE_CONST_Packet4f(1, 1.0f);
+static EIGEN_DECLARE_CONST_Packet4f(half, 0.5f);
+static EIGEN_DECLARE_CONST_Packet4i(0x7f, 0x7f);
+static EIGEN_DECLARE_CONST_Packet4i(23, 23);
+
+static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(inv_mant_mask, ~0x7f800000);
+
+/* the smallest non denormalized float number */
+static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(min_norm_pos, 0x00800000);
+static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_inf, 0xff800000);  // -1.f/0.f
+static EIGEN_DECLARE_CONST_Packet4f_FROM_INT(minus_nan, 0xffffffff);
+
+/* natural logarithm computed for 4 simultaneous float
+  return NaN for x <= 0
+*/
+static EIGEN_DECLARE_CONST_Packet4f(cephes_SQRTHF, 0.707106781186547524f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p0, 7.0376836292E-2f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p1, -1.1514610310E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p2, 1.1676998740E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p3, -1.2420140846E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p4, +1.4249322787E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p5, -1.6668057665E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p6, +2.0000714765E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p7, -2.4999993993E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_p8, +3.3333331174E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_q1, -2.12194440e-4f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_log_q2, 0.693359375f);
+
+static EIGEN_DECLARE_CONST_Packet4f(exp_hi, 88.3762626647950f);
+static EIGEN_DECLARE_CONST_Packet4f(exp_lo, -88.3762626647949f);
+
+static EIGEN_DECLARE_CONST_Packet4f(cephes_LOG2EF, 1.44269504088896341f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C1, 0.693359375f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_C2, -2.12194440e-4f);
+
+static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p0, 1.9875691500E-4f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p1, 1.3981999507E-3f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p2, 8.3334519073E-3f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p3, 4.1665795894E-2f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p4, 1.6666665459E-1f);
+static EIGEN_DECLARE_CONST_Packet4f(cephes_exp_p5, 5.0000001201E-1f);
+#endif
+
+static EIGEN_DECLARE_CONST_Packet2d(1, 1.0);
+static EIGEN_DECLARE_CONST_Packet2d(2, 2.0);
+static EIGEN_DECLARE_CONST_Packet2d(half, 0.5);
+
+static EIGEN_DECLARE_CONST_Packet2d(exp_hi, 709.437);
+static EIGEN_DECLARE_CONST_Packet2d(exp_lo, -709.436139303);
+
+static EIGEN_DECLARE_CONST_Packet2d(cephes_LOG2EF, 1.4426950408889634073599);
+
+static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p0, 1.26177193074810590878e-4);
+static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p1, 3.02994407707441961300e-2);
+static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_p2, 9.99999999999999999910e-1);
+
+static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q0, 3.00198505138664455042e-6);
+static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q1, 2.52448340349684104192e-3);
+static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q2, 2.27265548208155028766e-1);
+static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_q3, 2.00000000000000000009e0);
+
+static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C1, 0.693145751953125);
+static EIGEN_DECLARE_CONST_Packet2d(cephes_exp_C2, 1.42860682030941723212e-6);
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d pexp<Packet2d>(const Packet2d& _x) {
+  Packet2d x = _x;
+
+  Packet2d tmp, fx;
+  Packet2l emm0;
+
+  // clamp x
+  x = pmax(pmin(x, p2d_exp_hi), p2d_exp_lo);
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = pmadd(p2d_cephes_LOG2EF, x, p2d_half);
+
+  fx = vec_floor(fx);
+
+  tmp = pmul(fx, p2d_cephes_exp_C1);
+  Packet2d z = pmul(fx, p2d_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  Packet2d x2 = pmul(x, x);
+
+  Packet2d px = p2d_cephes_exp_p0;
+  px = pmadd(px, x2, p2d_cephes_exp_p1);
+  px = pmadd(px, x2, p2d_cephes_exp_p2);
+  px = pmul(px, x);
+
+  Packet2d qx = p2d_cephes_exp_q0;
+  qx = pmadd(qx, x2, p2d_cephes_exp_q1);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q2);
+  qx = pmadd(qx, x2, p2d_cephes_exp_q3);
+
+  x = pdiv(px, psub(qx, px));
+  x = pmadd(p2d_2, x, p2d_1);
+
+  // build 2^n
+  emm0 = vec_ctsl(fx, 0);
+
+  static const Packet2l p2l_1023 = {1023, 1023};
+  static const Packet2ul p2ul_52 = {52, 52};
+
+  emm0 = emm0 + p2l_1023;
+  emm0 = emm0 << reinterpret_cast<Packet2l>(p2ul_52);
+
+  // Altivec's max & min operators just drop silent NaNs. Check NaNs in
+  // inputs and return them unmodified.
+  Packet2ul isnumber_mask = reinterpret_cast<Packet2ul>(vec_cmpeq(_x, _x));
+  return vec_sel(_x, pmax(pmul(x, reinterpret_cast<Packet2d>(emm0)), _x), isnumber_mask);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f pexp<Packet4f>(const Packet4f& _x) {
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+  Packet4f x = _x;
+
+  Packet4f tmp, fx;
+  Packet4i emm0;
+
+  // clamp x
+  x = pmax(pmin(x, p4f_exp_hi), p4f_exp_lo);
+
+  // express exp(x) as exp(g + n*log(2))
+  fx = pmadd(x, p4f_cephes_LOG2EF, p4f_half);
+
+  fx = pfloor(fx);
+
+  tmp = pmul(fx, p4f_cephes_exp_C1);
+  Packet4f z = pmul(fx, p4f_cephes_exp_C2);
+  x = psub(x, tmp);
+  x = psub(x, z);
+
+  z = pmul(x, x);
+
+  Packet4f y = p4f_cephes_exp_p0;
+  y = pmadd(y, x, p4f_cephes_exp_p1);
+  y = pmadd(y, x, p4f_cephes_exp_p2);
+  y = pmadd(y, x, p4f_cephes_exp_p3);
+  y = pmadd(y, x, p4f_cephes_exp_p4);
+  y = pmadd(y, x, p4f_cephes_exp_p5);
+  y = pmadd(y, z, x);
+  y = padd(y, p4f_1);
+
+  // build 2^n
+  emm0 = Packet4i{(int)fx[0], (int)fx[1], (int)fx[2], (int)fx[3]};
+  emm0 = emm0 + p4i_0x7f;
+  emm0 = emm0 << reinterpret_cast<Packet4i>(p4i_23);
+
+  return pmax(pmul(y, reinterpret_cast<Packet4f>(emm0)), _x);
+#else
+  Packet4f res;
+  res.v4f[0] = pexp<Packet2d>(_x.v4f[0]);
+  res.v4f[1] = pexp<Packet2d>(_x.v4f[1]);
+  return res;
+#endif
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d psqrt<Packet2d>(const Packet2d& x) {
+  return vec_sqrt(x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f psqrt<Packet4f>(const Packet4f& x) {
+  Packet4f res;
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+  res = vec_sqrt(x);
+#else
+  res.v4f[0] = psqrt<Packet2d>(x.v4f[0]);
+  res.v4f[1] = psqrt<Packet2d>(x.v4f[1]);
+#endif
+  return res;
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet2d prsqrt<Packet2d>(const Packet2d& x) {
+  return pset1<Packet2d>(1.0) / psqrt<Packet2d>(x);
+}
+
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f prsqrt<Packet4f>(const Packet4f& x) {
+  Packet4f res;
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+  res = pset1<Packet4f>(1.0) / psqrt<Packet4f>(x);
+#else
+  res.v4f[0] = prsqrt<Packet2d>(x.v4f[0]);
+  res.v4f[1] = prsqrt<Packet2d>(x.v4f[1]);
+#endif
+  return res;
+}
+
+// Hyperbolic Tangent function.
+template <>
+EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet4f ptanh<Packet4f>(const Packet4f& x) {
+  return ptanh_float(x);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MATH_FUNCTIONS_ZVECTOR_H
diff --git a/inst/include/Eigen/src/Core/arch/ZVector/PacketMath.h b/inst/include/Eigen/src/Core/arch/ZVector/PacketMath.h
new file mode 100644
index 00000000..39073ed8
--- /dev/null
+++ b/inst/include/Eigen/src/Core/arch/ZVector/PacketMath.h
@@ -0,0 +1,1413 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Konstantinos Margaritis <markos@freevec.org>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PACKET_MATH_ZVECTOR_H
+#define EIGEN_PACKET_MATH_ZVECTOR_H
+
+// IWYU pragma: private
+#include "../../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+#ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD
+#define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 16
+#endif
+
+#ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#define EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+#endif
+
+#ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS
+#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32
+#endif
+
+typedef __vector int Packet4i;
+typedef __vector unsigned int Packet4ui;
+typedef __vector __bool int Packet4bi;
+typedef __vector short int Packet8i;
+typedef __vector unsigned char Packet16uc;
+typedef __vector double Packet2d;
+typedef __vector unsigned long long Packet2ul;
+typedef __vector long long Packet2l;
+
+// Z14 has builtin support for float vectors
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+typedef __vector float Packet4f;
+#else
+typedef struct {
+  Packet2d v4f[2];
+} Packet4f;
+#endif
+
+typedef union {
+  numext::int32_t i[4];
+  numext::uint32_t ui[4];
+  numext::int64_t l[2];
+  numext::uint64_t ul[2];
+  double d[2];
+  float f[4];
+  Packet4i v4i;
+  Packet4ui v4ui;
+  Packet2l v2l;
+  Packet2ul v2ul;
+  Packet2d v2d;
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+  Packet4f v4f;
+#endif
+} Packet;
+
+// We don't want to write the same code all the time, but we need to reuse the constants
+// and it doesn't really work to declare them global, so we define macros instead
+
+#define EIGEN_DECLARE_CONST_FAST_Packet4i(NAME, X) Packet4i p4i_##NAME = reinterpret_cast<Packet4i>(vec_splat_s32(X))
+
+#define EIGEN_DECLARE_CONST_FAST_Packet2d(NAME, X) Packet2d p2d_##NAME = reinterpret_cast<Packet2d>(vec_splat_s64(X))
+
+#define EIGEN_DECLARE_CONST_FAST_Packet2l(NAME, X) Packet2l p2l_##NAME = reinterpret_cast<Packet2l>(vec_splat_s64(X))
+
+#define EIGEN_DECLARE_CONST_Packet4i(NAME, X) Packet4i p4i_##NAME = pset1<Packet4i>(X)
+
+#define EIGEN_DECLARE_CONST_Packet2d(NAME, X) Packet2d p2d_##NAME = pset1<Packet2d>(X)
+
+#define EIGEN_DECLARE_CONST_Packet2l(NAME, X) Packet2l p2l_##NAME = pset1<Packet2l>(X)
+
+// These constants are endian-agnostic
+static EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0);  //{ 0, 0, 0, 0,}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(ONE, 1);   //{ 1, 1, 1, 1}
+
+static EIGEN_DECLARE_CONST_FAST_Packet2d(ZERO, 0);
+static EIGEN_DECLARE_CONST_FAST_Packet2l(ZERO, 0);
+static EIGEN_DECLARE_CONST_FAST_Packet2l(ONE, 1);
+
+static Packet2d p2d_ONE = {1.0, 1.0};
+static Packet2d p2d_ZERO_ = {numext::bit_cast<double>(0x8000000000000000ull),
+                             numext::bit_cast<double>(0x8000000000000000ull)};
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+#define EIGEN_DECLARE_CONST_FAST_Packet4f(NAME, X) Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X))
+
+#define EIGEN_DECLARE_CONST_Packet4f(NAME, X) Packet4f p4f_##NAME = pset1<Packet4f>(X)
+
+#define EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME, X) \
+  const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X))
+
+static EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0);     //{ 0.0, 0.0, 0.0, 0.0}
+static EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1, -1);  //{ -1, -1, -1, -1}
+static Packet4f p4f_MZERO = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
+#endif
+
+static Packet4i p4i_COUNTDOWN = {0, 1, 2, 3};
+static Packet4f p4f_COUNTDOWN = {0.0, 1.0, 2.0, 3.0};
+static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(
+    vec_sld(reinterpret_cast<Packet16uc>(p2d_ZERO), reinterpret_cast<Packet16uc>(p2d_ONE), 8));
+
+static Packet16uc p16uc_PSET64_HI = {0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
+static Packet16uc p16uc_DUPLICATE32_HI = {0, 1, 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7};
+
+// Mask alignment
+#define EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0
+
+#define EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & EIGEN_MASK_ALIGNMENT)
+
+// Handle endianness properly while loading constants
+// Define global static constants:
+
+static Packet16uc p16uc_FORWARD = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+static Packet16uc p16uc_REVERSE32 = {12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3};
+static Packet16uc p16uc_REVERSE64 = {8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7};
+
+static Packet16uc p16uc_PSET32_WODD =
+    vec_sld((Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 2),
+            8);  //{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 };
+static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc)vec_splat((Packet4ui)p16uc_FORWARD, 3),
+                                               8);  //{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 };
+/*static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3),
+8);      //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16};
+
+static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD,
+(Packet4ui)p16uc_PSET32_WEVEN);     //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 };*/
+static Packet16uc p16uc_PSET64_LO = (Packet16uc)vec_mergel(
+    (Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN);  //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 };
+/*static Packet16uc p16uc_TRANSPOSE64_HI = vec_add(p16uc_PSET64_HI, p16uc_HALF64_0_16); //{ 0,1,2,3, 4,5,6,7,
+16,17,18,19, 20,21,22,23}; static Packet16uc p16uc_TRANSPOSE64_LO = vec_add(p16uc_PSET64_LO, p16uc_HALF64_0_16); //{
+8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31};*/
+static Packet16uc p16uc_TRANSPOSE64_HI = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
+static Packet16uc p16uc_TRANSPOSE64_LO = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
+
+static Packet16uc p16uc_COMPLEX32_REV =
+    vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8);  //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 };
+
+static Packet16uc p16uc_COMPLEX32_REV2 =
+    vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8);  //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 };
+
+#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC
+#define EIGEN_ZVECTOR_PREFETCH(ADDR) __builtin_prefetch(ADDR);
+#else
+#define EIGEN_ZVECTOR_PREFETCH(ADDR) asm("   pfd [%[addr]]\n" ::[addr] "r"(ADDR) : "cc");
+#endif
+
+template <>
+struct packet_traits<int> : default_packet_traits {
+  typedef Packet4i type;
+  typedef Packet4i half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<float> : default_packet_traits {
+  typedef Packet4f type;
+  typedef Packet4f half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 4,
+
+    HasCmp = 1,
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasAbs = 1,
+    HasSin = 0,
+    HasCos = 0,
+    HasLog = 0,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasTanh = 1,
+    HasErf = 1,
+    HasNegate = 1,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct packet_traits<double> : default_packet_traits {
+  typedef Packet2d type;
+  typedef Packet2d half;
+  enum {
+    Vectorizable = 1,
+    AlignedOnScalar = 1,
+    size = 2,
+
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasDiv = 1,
+    HasMin = 1,
+    HasMax = 1,
+    HasAbs = 1,
+    HasSin = 0,
+    HasCos = 0,
+    HasLog = 0,
+    HasExp = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasNegate = 1,
+    HasBlend = 1
+  };
+};
+
+template <>
+struct unpacket_traits<Packet4i> {
+  typedef int type;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet4i half;
+};
+template <>
+struct unpacket_traits<Packet4f> {
+  typedef float type;
+  enum {
+    size = 4,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet4f half;
+  typedef Packet4i integer_packet;
+};
+template <>
+struct unpacket_traits<Packet2d> {
+  typedef double type;
+  enum {
+    size = 2,
+    alignment = Aligned16,
+    vectorizable = true,
+    masked_load_available = false,
+    masked_store_available = false
+  };
+  typedef Packet2d half;
+  typedef Packet2l integer_packet;
+};
+
+/* Forward declaration */
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel);
+
+inline std::ostream& operator<<(std::ostream& s, const Packet4i& v) {
+  Packet vt;
+  vt.v4i = v;
+  s << vt.i[0] << ", " << vt.i[1] << ", " << vt.i[2] << ", " << vt.i[3];
+  return s;
+}
+
+inline std::ostream& operator<<(std::ostream& s, const Packet4ui& v) {
+  Packet vt;
+  vt.v4ui = v;
+  s << vt.ui[0] << ", " << vt.ui[1] << ", " << vt.ui[2] << ", " << vt.ui[3];
+  return s;
+}
+
+inline std::ostream& operator<<(std::ostream& s, const Packet2l& v) {
+  Packet vt;
+  vt.v2l = v;
+  s << vt.l[0] << ", " << vt.l[1];
+  return s;
+}
+
+inline std::ostream& operator<<(std::ostream& s, const Packet2ul& v) {
+  Packet vt;
+  vt.v2ul = v;
+  s << vt.ul[0] << ", " << vt.ul[1];
+  return s;
+}
+
+inline std::ostream& operator<<(std::ostream& s, const Packet2d& v) {
+  Packet vt;
+  vt.v2d = v;
+  s << vt.d[0] << ", " << vt.d[1];
+  return s;
+}
+
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ >= 12)
+inline std::ostream& operator<<(std::ostream& s, const Packet4f& v) {
+  Packet vt;
+  vt.v4f = v;
+  s << vt.f[0] << ", " << vt.f[1] << ", " << vt.f[2] << ", " << vt.f[3];
+  return s;
+}
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD
+  return vec_xl(0, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD
+  return vec_xl(0, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) {
+  EIGEN_DEBUG_ALIGNED_STORE
+  vec_xst(from, 0, to);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) {
+  EIGEN_DEBUG_ALIGNED_STORE
+  vec_xst(from, 0, to);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pfrexp<Packet4f>(const Packet4f& a, Packet4f& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pfrexp<Packet2d>(const Packet2d& a, Packet2d& exponent) {
+  return pfrexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) {
+  return vec_splats(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) {
+  return vec_splats(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4i>(const int* a, Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) {
+  a3 = pload<Packet4i>(a);
+  a0 = vec_splat(a3, 0);
+  a1 = vec_splat(a3, 1);
+  a2 = vec_splat(a3, 2);
+  a3 = vec_splat(a3, 3);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet2d>(const double* a, Packet2d& a0, Packet2d& a1, Packet2d& a2,
+                                               Packet2d& a3) {
+  a1 = pload<Packet2d>(a);
+  a0 = vec_splat(a1, 0);
+  a1 = vec_splat(a1, 1);
+  a3 = pload<Packet2d>(a + 2);
+  a2 = vec_splat(a3, 0);
+  a3 = vec_splat(a3, 1);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) {
+  EIGEN_ALIGN16 int ai[4];
+  ai[0] = from[0 * stride];
+  ai[1] = from[1 * stride];
+  ai[2] = from[2 * stride];
+  ai[3] = from[3 * stride];
+  return pload<Packet4i>(ai);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) {
+  EIGEN_ALIGN16 double af[2];
+  af[0] = from[0 * stride];
+  af[1] = from[1 * stride];
+  return pload<Packet2d>(af);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) {
+  EIGEN_ALIGN16 int ai[4];
+  pstore<int>((int*)ai, from);
+  to[0 * stride] = ai[0];
+  to[1 * stride] = ai[1];
+  to[2 * stride] = ai[2];
+  to[3 * stride] = ai[3];
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) {
+  EIGEN_ALIGN16 double af[2];
+  pstore<double>(af, from);
+  to[0 * stride] = af[0];
+  to[1 * stride] = af[1];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return (a + b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (a + b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return (a - b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (a - b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return (a * b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (a * b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return (a / b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return (a / b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) {
+  return (-a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) {
+  return (-a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) {
+  return a;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) {
+  return padd<Packet4i>(pmul<Packet4i>(a, b), c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) {
+  return vec_madd(a, b, c);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) {
+  return padd<Packet4i>(pset1<Packet4i>(a), p4i_COUNTDOWN);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) {
+  return padd<Packet2d>(pset1<Packet2d>(a), p2d_COUNTDOWN);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_min(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_min(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_max(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_max(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_and(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_and(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_or(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_or(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return vec_xor(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_xor(a, b);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) {
+  return pand<Packet4i>(a, vec_nor(b, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) {
+  return vec_and(a, vec_nor(b, b));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) {
+  /* Uses non-default rounding for vec_round */
+  return __builtin_s390_vfidb(a, 0, 1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) {
+  return vec_ceil(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) {
+  return vec_floor(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) {
+  return pload<Packet4i>(from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) {
+  return pload<Packet2d>(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) {
+  Packet4i p = pload<Packet4i>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) {
+  Packet2d p = pload<Packet2d>(from);
+  return vec_perm(p, p, p16uc_PSET64_HI);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) {
+  pstore<int>(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) {
+  pstore<double>(to, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) {
+  EIGEN_ZVECTOR_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) {
+  EIGEN_ZVECTOR_PREFETCH(addr);
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet2l parithmetic_shift_right(const Packet2l& a) {
+  return Packet2l { parithmetic_shift_right<N>(a[0]), parithmetic_shift_right<N>(a[1]) };
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i parithmetic_shift_right(const Packet4i& a) {
+  return Packet4i {
+    parithmetic_shift_right<N>(a[0]),
+    parithmetic_shift_right<N>(a[1]),
+    parithmetic_shift_right<N>(a[2]),
+    parithmetic_shift_right<N>(a[3]) };
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_right(const Packet2l& a) {
+  return Packet2l { plogical_shift_right<N>(a[0]), plogical_shift_right<N>(a[1]) };
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_right(const Packet4i& a) {
+  return Packet4i {
+    plogical_shift_right<N>(a[0]),
+    plogical_shift_right<N>(a[1]),
+    plogical_shift_right<N>(a[2]),
+    plogical_shift_right<N>(a[3]) };
+}
+
+template <int N>
+EIGEN_STRONG_INLINE Packet2l plogical_shift_left(const Packet2l& a) {
+  return Packet2l { plogical_shift_left<N>(a[0]), plogical_shift_left<N>(a[1]) };
+}
+template <int N>
+EIGEN_STRONG_INLINE Packet4i plogical_shift_left(const Packet4i& a) {
+  return Packet4i {
+    plogical_shift_left<N>(a[0]),
+    plogical_shift_left<N>(a[1]),
+    plogical_shift_left<N>(a[2]),
+    plogical_shift_left<N>(a[3]) };
+}
+
+template <>
+EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) {
+  EIGEN_ALIGN16 int x[4];
+  pstore(x, a);
+  return x[0];
+}
+template <>
+EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) {
+  EIGEN_ALIGN16 double x[2];
+  pstore(x, a);
+  return x[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) {
+  return reinterpret_cast<Packet4i>(
+      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) {
+  return reinterpret_cast<Packet2d>(
+      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64));
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pabs<Packet4i>(const Packet4i& a) {
+  return vec_abs(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pabs<Packet2d>(const Packet2d& a) {
+  return vec_abs(a);
+}
+
+template <>
+EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) {
+  Packet4i b, sum;
+  b = vec_sld(a, a, 8);
+  sum = padd<Packet4i>(a, b);
+  b = vec_sld(sum, sum, 4);
+  sum = padd<Packet4i>(sum, b);
+  return pfirst(sum);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) {
+  Packet2d b, sum;
+  b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8));
+  sum = padd<Packet2d>(a, b);
+  return pfirst(sum);
+}
+
+// Other reduction functions:
+// mul
+template <>
+EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) {
+  EIGEN_ALIGN16 int aux[4];
+  pstore(aux, a);
+  return aux[0] * aux[1] * aux[2] * aux[3];
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) {
+  return pfirst(
+      pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+}
+
+// min
+template <>
+EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) {
+  Packet4i b, res;
+  b = pmin<Packet4i>(a, vec_sld(a, a, 8));
+  res = pmin<Packet4i>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+template <>
+EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) {
+  return pfirst(pmin<Packet2d>(
+      a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+}
+
+// max
+template <>
+EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) {
+  Packet4i b, res;
+  b = pmax<Packet4i>(a, vec_sld(a, a, 8));
+  res = pmax<Packet4i>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+// max
+template <>
+EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) {
+  return pfirst(pmax<Packet2d>(
+      a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(a), reinterpret_cast<Packet4i>(a), 8))));
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4i, 4>& kernel) {
+  Packet4i t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  Packet4i t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  Packet4i t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  Packet4i t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet2d, 2>& kernel) {
+  Packet2d t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI);
+  Packet2d t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO);
+  kernel.packet[0] = t0;
+  kernel.packet[1] = t1;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket,
+                                    const Packet4i& elsePacket) {
+  Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
+  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket,
+                                    const Packet2d& elsePacket) {
+  Packet2ul select = {ifPacket.select[0], ifPacket.select[1]};
+  Packet2ul mask = vec_cmpeq(select, reinterpret_cast<Packet2ul>(p2l_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+/* z13 has no vector float support so we emulate that with double
+   z14 has proper vector float support.
+*/
+#if !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 12)
+/* Helper function to simulate a vec_splat_packet4f
+ */
+template <int element>
+EIGEN_STRONG_INLINE Packet4f vec_splat_packet4f(const Packet4f& from) {
+  Packet4f splat;
+  switch (element) {
+    case 0:
+      splat.v4f[0] = vec_splat(from.v4f[0], 0);
+      splat.v4f[1] = splat.v4f[0];
+      break;
+    case 1:
+      splat.v4f[0] = vec_splat(from.v4f[0], 1);
+      splat.v4f[1] = splat.v4f[0];
+      break;
+    case 2:
+      splat.v4f[0] = vec_splat(from.v4f[1], 0);
+      splat.v4f[1] = splat.v4f[0];
+      break;
+    case 3:
+      splat.v4f[0] = vec_splat(from.v4f[1], 1);
+      splat.v4f[1] = splat.v4f[0];
+      break;
+  }
+  return splat;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_LOAD
+  Packet4f vfrom;
+  vfrom.v4f[0] = vec_ld2f(&from[0]);
+  vfrom.v4f[1] = vec_ld2f(&from[2]);
+  return vfrom;
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
+  // FIXME: No intrinsic yet
+  EIGEN_DEBUG_ALIGNED_STORE
+  vec_st2f(from.v4f[0], &to[0]);
+  vec_st2f(from.v4f[1], &to[2]);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
+  Packet4f to;
+  to.v4f[0] = pset1<Packet2d>(static_cast<const double&>(from));
+  to.v4f[1] = to.v4f[0];
+  return to;
+}
+
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
+  a3 = pload<Packet4f>(a);
+  a0 = vec_splat_packet4f<0>(a3);
+  a1 = vec_splat_packet4f<1>(a3);
+  a2 = vec_splat_packet4f<2>(a3);
+  a3 = vec_splat_packet4f<3>(a3);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
+  EIGEN_ALIGN16 float ai[4];
+  ai[0] = from[0 * stride];
+  ai[1] = from[1 * stride];
+  ai[2] = from[2 * stride];
+  ai[3] = from[3 * stride];
+  return pload<Packet4f>(ai);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
+  EIGEN_ALIGN16 float ai[4];
+  pstore<float>((float*)ai, from);
+  to[0 * stride] = ai[0];
+  to[1 * stride] = ai[1];
+  to[2 * stride] = ai[2];
+  to[3 * stride] = ai[3];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] + b.v4f[0];
+  c.v4f[1] = a.v4f[1] + b.v4f[1];
+  return c;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] - b.v4f[0];
+  c.v4f[1] = a.v4f[1] - b.v4f[1];
+  return c;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] * b.v4f[0];
+  c.v4f[1] = a.v4f[1] * b.v4f[1];
+  return c;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f c;
+  c.v4f[0] = a.v4f[0] / b.v4f[0];
+  c.v4f[1] = a.v4f[1] / b.v4f[1];
+  return c;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) {
+  Packet4f c;
+  c.v4f[0] = -a.v4f[0];
+  c.v4f[1] = -a.v4f[1];
+  return c;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  Packet4f res;
+  res.v4f[0] = vec_madd(a.v4f[0], b.v4f[0], c.v4f[0]);
+  res.v4f[1] = vec_madd(a.v4f[1], b.v4f[1], c.v4f[1]);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f res;
+  res.v4f[0] = pmin(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmin(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f res;
+  res.v4f[0] = pmax(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pmax(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f res;
+  res.v4f[0] = pand(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pand(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f res;
+  res.v4f[0] = por(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = por(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f res;
+  res.v4f[0] = pxor(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pxor(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f res;
+  res.v4f[0] = pandnot(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pandnot(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
+  Packet4f res;
+  res.v4f[0] = generic_round(a.v4f[0]);
+  res.v4f[1] = generic_round(a.v4f[1]);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
+  Packet4f res;
+  res.v4f[0] = vec_ceil(a.v4f[0]);
+  res.v4f[1] = vec_ceil(a.v4f[1]);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
+  Packet4f res;
+  res.v4f[0] = vec_floor(a.v4f[0]);
+  res.v4f[1] = vec_floor(a.v4f[1]);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
+  Packet4f p = pload<Packet4f>(from);
+  p.v4f[1] = vec_splat(p.v4f[0], 1);
+  p.v4f[0] = vec_splat(p.v4f[0], 0);
+  return p;
+}
+
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  EIGEN_ALIGN16 float x[2];
+  vec_st2f(a.v4f[0], &x[0]);
+  return x[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+  Packet4f rev;
+  rev.v4f[0] = preverse<Packet2d>(a.v4f[1]);
+  rev.v4f[1] = preverse<Packet2d>(a.v4f[0]);
+  return rev;
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a) {
+  Packet4f res;
+  res.v4f[0] = pabs(a.v4f[0]);
+  res.v4f[1] = pabs(a.v4f[1]);
+  return res;
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
+  Packet2d sum;
+  sum = padd<Packet2d>(a.v4f[0], a.v4f[1]);
+  double first = predux<Packet2d>(sum);
+  return static_cast<float>(first);
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
+  // Return predux_mul<Packet2d> of the subvectors product
+  return static_cast<float>(pfirst(predux_mul(pmul(a.v4f[0], a.v4f[1]))));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
+  Packet2d b, res;
+  b = pmin<Packet2d>(a.v4f[0], a.v4f[1]);
+  res = pmin<Packet2d>(
+      b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+  return static_cast<float>(pfirst(res));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
+  Packet2d b, res;
+  b = pmax<Packet2d>(a.v4f[0], a.v4f[1]);
+  res = pmax<Packet2d>(
+      b, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4i>(b), reinterpret_cast<Packet4i>(b), 8)));
+  return static_cast<float>(pfirst(res));
+}
+
+/* Split the Packet4f PacketBlock into 4 Packet2d PacketBlocks and transpose each one
+ */
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
+  PacketBlock<Packet2d, 2> t0, t1, t2, t3;
+  // copy top-left 2x2 Packet2d block
+  t0.packet[0] = kernel.packet[0].v4f[0];
+  t0.packet[1] = kernel.packet[1].v4f[0];
+
+  // copy top-right 2x2 Packet2d block
+  t1.packet[0] = kernel.packet[0].v4f[1];
+  t1.packet[1] = kernel.packet[1].v4f[1];
+
+  // copy bottom-left 2x2 Packet2d block
+  t2.packet[0] = kernel.packet[2].v4f[0];
+  t2.packet[1] = kernel.packet[3].v4f[0];
+
+  // copy bottom-right 2x2 Packet2d block
+  t3.packet[0] = kernel.packet[2].v4f[1];
+  t3.packet[1] = kernel.packet[3].v4f[1];
+
+  // Transpose all 2x2 blocks
+  ptranspose(t0);
+  ptranspose(t1);
+  ptranspose(t2);
+  ptranspose(t3);
+
+  // Copy back transposed blocks, but exchange t1 and t2 due to transposition
+  kernel.packet[0].v4f[0] = t0.packet[0];
+  kernel.packet[0].v4f[1] = t2.packet[0];
+  kernel.packet[1].v4f[0] = t0.packet[1];
+  kernel.packet[1].v4f[1] = t2.packet[1];
+  kernel.packet[2].v4f[0] = t1.packet[0];
+  kernel.packet[2].v4f[1] = t3.packet[0];
+  kernel.packet[3].v4f[0] = t1.packet[1];
+  kernel.packet[3].v4f[1] = t3.packet[1];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
+                                    const Packet4f& elsePacket) {
+  Packet2ul select_hi = {ifPacket.select[0], ifPacket.select[1]};
+  Packet2ul select_lo = {ifPacket.select[2], ifPacket.select[3]};
+  Packet2ul mask_hi = vec_cmpeq(select_hi, reinterpret_cast<Packet2ul>(p2l_ONE));
+  Packet2ul mask_lo = vec_cmpeq(select_lo, reinterpret_cast<Packet2ul>(p2l_ONE));
+  Packet4f result;
+  result.v4f[0] = vec_sel(elsePacket.v4f[0], thenPacket.v4f[0], mask_hi);
+  result.v4f[1] = vec_sel(elsePacket.v4f[1], thenPacket.v4f[1], mask_lo);
+  return result;
+}
+
+template <>
+Packet4f EIGEN_STRONG_INLINE pcmp_le<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f res;
+  res.v4f[0] = pcmp_le(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pcmp_le(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template <>
+Packet4f EIGEN_STRONG_INLINE pcmp_lt<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f res;
+  res.v4f[0] = pcmp_lt(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pcmp_lt(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+template <>
+Packet4f EIGEN_STRONG_INLINE pcmp_eq<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  Packet4f res;
+  res.v4f[0] = pcmp_eq(a.v4f[0], b.v4f[0]);
+  res.v4f[1] = pcmp_eq(a.v4f[1], b.v4f[1]);
+  return res;
+}
+
+#else
+template <>
+EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) {
+  EIGEN_DEBUG_ALIGNED_LOAD
+  return vec_xl(0, from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) {
+  EIGEN_DEBUG_ALIGNED_STORE
+  vec_xst(from, 0, to);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) {
+  return vec_splats(from);
+}
+
+template <>
+EIGEN_STRONG_INLINE void pbroadcast4<Packet4f>(const float* a, Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) {
+  a3 = pload<Packet4f>(a);
+  a0 = vec_splat(a3, 0);
+  a1 = vec_splat(a3, 1);
+  a2 = vec_splat(a3, 2);
+  a3 = vec_splat(a3, 3);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) {
+  EIGEN_ALIGN16 float af[4];
+  af[0] = from[0 * stride];
+  af[1] = from[1 * stride];
+  af[2] = from[2 * stride];
+  af[3] = from[3 * stride];
+  return pload<Packet4f>(af);
+}
+
+template <>
+EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) {
+  EIGEN_ALIGN16 float af[4];
+  pstore<float>((float*)af, from);
+  to[0 * stride] = af[0];
+  to[1 * stride] = af[1];
+  to[2 * stride] = af[2];
+  to[3 * stride] = af[3];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (a + b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (a - b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (a * b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return (a / b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pnegate<Packet4f>(const Packet4f& a) {
+  return (-a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pconj<Packet4f>(const Packet4f& a) {
+  return a;
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmadd<Packet4f>(const Packet4f& a, const Packet4f& b, const Packet4f& c) {
+  return vec_madd(a, b, c);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_min(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_max(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_and(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_or(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_xor(a, b);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) {
+  return vec_and(a, vec_nor(b, b));
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) {
+  /* Uses non-default rounding for vec_round */
+  return __builtin_s390_vfisb(a, 0, 1);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) {
+  return vec_ceil(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) {
+  return vec_floor(a);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f pabs<Packet4f>(const Packet4f& a) {
+  return vec_abs(a);
+}
+template <>
+EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) {
+  EIGEN_ALIGN16 float x[4];
+  pstore(x, a);
+  return x[0];
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) {
+  Packet4f p = pload<Packet4f>(from);
+  return vec_perm(p, p, p16uc_DUPLICATE32_HI);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) {
+  return reinterpret_cast<Packet4f>(
+      vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32));
+}
+
+template <>
+EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) {
+  Packet4f b, sum;
+  b = vec_sld(a, a, 8);
+  sum = padd<Packet4f>(a, b);
+  b = vec_sld(sum, sum, 4);
+  sum = padd<Packet4f>(sum, b);
+  return pfirst(sum);
+}
+
+// Other reduction functions:
+// mul
+template <>
+EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) {
+  Packet4f prod;
+  prod = pmul(a, vec_sld(a, a, 8));
+  return pfirst(pmul(prod, vec_sld(prod, prod, 4)));
+}
+
+// min
+template <>
+EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) {
+  Packet4f b, res;
+  b = pmin<Packet4f>(a, vec_sld(a, a, 8));
+  res = pmin<Packet4f>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+// max
+template <>
+EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) {
+  Packet4f b, res;
+  b = pmax<Packet4f>(a, vec_sld(a, a, 8));
+  res = pmax<Packet4f>(b, vec_sld(b, b, 4));
+  return pfirst(res);
+}
+
+EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock<Packet4f, 4>& kernel) {
+  Packet4f t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]);
+  Packet4f t1 = vec_mergel(kernel.packet[0], kernel.packet[2]);
+  Packet4f t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]);
+  Packet4f t3 = vec_mergel(kernel.packet[1], kernel.packet[3]);
+  kernel.packet[0] = vec_mergeh(t0, t2);
+  kernel.packet[1] = vec_mergel(t0, t2);
+  kernel.packet[2] = vec_mergeh(t1, t3);
+  kernel.packet[3] = vec_mergel(t1, t3);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket,
+                                    const Packet4f& elsePacket) {
+  Packet4ui select = {ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3]};
+  Packet4ui mask = vec_cmpeq(select, reinterpret_cast<Packet4ui>(p4i_ONE));
+  return vec_sel(elsePacket, thenPacket, mask);
+}
+
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pldexp<Packet4f>(const Packet4f& a, const Packet4f& exponent) {
+  return pldexp_generic(a, exponent);
+}
+
+template <>
+EIGEN_STRONG_INLINE Packet2d pldexp<Packet2d>(const Packet2d& a, const Packet2d& exponent) {
+  // Clamp exponent to [-2099, 2099]
+  const Packet2d max_exponent = pset1<Packet2d>(2099.0);
+  const Packet2l e = pcast<Packet2d, Packet2l>(pmin(pmax(exponent, pnegate(max_exponent)), max_exponent));
+
+  // Split 2^e into four factors and multiply:
+  const Packet2l bias = {1023, 1023};
+  Packet2l b = plogical_shift_right<2>(e);  // floor(e/4)
+  Packet2d c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));
+  Packet2d out = pmul(pmul(pmul(a, c), c), c);                        // a * 2^(3b)
+  b = psub(psub(psub(e, b), b), b);                                   // e - 3b
+  c = reinterpret_cast<Packet2d>(plogical_shift_left<52>(b + bias));  // 2^(e - 3b)
+  out = pmul(out, c);                                                 // a * 2^e
+  return out;
+}
+
+template <>
+EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) {
+  EIGEN_ZVECTOR_PREFETCH(addr);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) {
+  return pload<Packet4f>(from);
+}
+template <>
+EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) {
+  pstore<float>(to, from);
+}
+template <>
+EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) {
+  return padd<Packet4f>(pset1<Packet4f>(a), p4f_COUNTDOWN);
+}
+
+#if !defined(vec_float) || !defined(__ARCH__) || (defined(__ARCH__) && __ARCH__ < 13)
+#pragma GCC warning \
+    "float->int and int->float conversion is simulated. compile for z15 for improved performance"
+template <>
+struct cast_impl<Packet4i, Packet4f> {
+  EIGEN_DEVICE_FUNC static inline Packet4f run(const Packet4i& a) {
+    return Packet4f{float(a[0]), float(a[1]), float(a[2]), float(a[3]) };
+  }
+};
+
+template <>
+struct cast_impl<Packet4f, Packet4i> {
+  EIGEN_DEVICE_FUNC static inline Packet4i run(const Packet4f& a) {
+    return Packet4i{int(a[0]), int(a[1]), int(a[2]), int(a[3]) };
+  }
+};
+
+template <>
+struct cast_impl<Packet2l, Packet2d> {
+  EIGEN_DEVICE_FUNC static inline Packet2d run(const Packet2l& a) {
+    return Packet2d{double(a[0]), double(a[1]) };
+  }
+};
+
+template <>
+struct cast_impl<Packet2d, Packet2l> {
+  EIGEN_DEVICE_FUNC static inline Packet2l run(const Packet2d& a) {
+    return Packet2l{(long long)(a[0]), (long long)(a[1]) };
+  }
+};
+#else
+template <>
+struct cast_impl<Packet4i, Packet4f> {
+  EIGEN_DEVICE_FUNC static inline Packet4f run(const Packet4i& a) {
+    return vec_float(a);
+  }
+};
+
+template <>
+struct cast_impl<Packet4f, Packet4i> {
+  EIGEN_DEVICE_FUNC static inline Packet4i run(const Packet4f& a) {
+    return vec_signed(a);
+  }
+};
+
+template <>
+struct cast_impl<Packet2l, Packet2d> {
+  EIGEN_DEVICE_FUNC static inline Packet2d run(const Packet2l& a) {
+    return vec_double(a);
+  }
+};
+
+template <>
+struct cast_impl<Packet2d, Packet2l> {
+  EIGEN_DEVICE_FUNC static inline Packet2l run(const Packet2d& a) {
+    return vec_signed(a);
+  }
+};
+#endif
+
+template <>
+EIGEN_STRONG_INLINE Packet4f pset1frombits<Packet4f>(uint32_t from) {
+  return pset1<Packet4f>(Eigen::numext::bit_cast<float>(from));
+}
+template <>
+EIGEN_STRONG_INLINE Packet2d pset1frombits<Packet2d>(uint64_t from) {
+  return pset1<Packet2d>(Eigen::numext::bit_cast<double>(from));
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_PACKET_MATH_ZVECTOR_H
diff --git a/inst/include/Eigen/src/Core/functors/AssignmentFunctors.h b/inst/include/Eigen/src/Core/functors/AssignmentFunctors.h
new file mode 100644
index 00000000..0239262a
--- /dev/null
+++ b/inst/include/Eigen/src/Core/functors/AssignmentFunctors.h
@@ -0,0 +1,174 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_ASSIGNMENT_FUNCTORS_H
+#define EIGEN_ASSIGNMENT_FUNCTORS_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+ * \brief Template functor for scalar/packet assignment
+ *
+ */
+template <typename DstScalar, typename SrcScalar>
+struct assign_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeff(DstScalar& a, const SrcScalar& b) const { a = b; }
+
+  template <int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
+    pstoret<DstScalar, Packet, Alignment>(a, b);
+  }
+
+  template <int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacketSegment(DstScalar* a, const Packet& b, Index begin, Index count) const {
+    pstoretSegment<DstScalar, Packet, Alignment>(a, b, begin, count);
+  }
+};
+
+// Empty overload for void type (used by PermutationMatrix)
+template <typename DstScalar>
+struct assign_op<DstScalar, void> {};
+
+template <typename DstScalar, typename SrcScalar>
+struct functor_traits<assign_op<DstScalar, SrcScalar>> {
+  enum {
+    Cost = NumTraits<DstScalar>::ReadCost,
+    PacketAccess = is_same<DstScalar, SrcScalar>::value && packet_traits<DstScalar>::Vectorizable &&
+                   packet_traits<SrcScalar>::Vectorizable
+  };
+};
+
+/** \internal
+ * \brief Template functor for scalar/packet compound assignment
+ *
+ */
+template <typename DstScalar, typename SrcScalar, typename Func>
+struct compound_assign_op {
+  using traits = functor_traits<compound_assign_op<DstScalar, SrcScalar, Func>>;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void assignCoeff(DstScalar& a, const SrcScalar& b) const {
+    assign_op<DstScalar, DstScalar>().assignCoeff(a, Func().operator()(a, b));
+  }
+
+  template <int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacket(DstScalar* a, const Packet& b) const {
+    assign_op<DstScalar, DstScalar>().template assignPacket<Alignment, Packet>(
+        a, Func().packetOp(ploadt<Packet, Alignment>(a), b));
+  }
+
+  template <int Alignment, typename Packet>
+  EIGEN_STRONG_INLINE void assignPacketSegment(DstScalar* a, const Packet& b, Index begin, Index count) const {
+    assign_op<DstScalar, DstScalar>().template assignPacketSegment<Alignment, Packet>(
+        a, Func().packetOp(ploadtSegment<Packet, Alignment>(a, begin, count), b), begin, count);
+  }
+};
+
+template <typename DstScalar, typename SrcScalar, typename Func>
+struct functor_traits<compound_assign_op<DstScalar, SrcScalar, Func>> {
+  enum {
+    Cost = int(functor_traits<assign_op<DstScalar, DstScalar>>::Cost) + int(functor_traits<Func>::Cost),
+    PacketAccess = functor_traits<assign_op<DstScalar, DstScalar>>::PacketAccess && functor_traits<Func>::PacketAccess
+  };
+};
+
+/** \internal
+ * \brief Template functor for scalar/packet assignment with addition
+ *
+ */
+template <typename DstScalar, typename SrcScalar = DstScalar>
+struct add_assign_op : compound_assign_op<DstScalar, SrcScalar, scalar_sum_op<DstScalar, SrcScalar>> {};
+
+template <typename DstScalar, typename SrcScalar>
+struct functor_traits<add_assign_op<DstScalar, SrcScalar>> : add_assign_op<DstScalar, SrcScalar>::traits {};
+
+/** \internal
+ * \brief Template functor for scalar/packet assignment with subtraction
+ *
+ */
+template <typename DstScalar, typename SrcScalar = DstScalar>
+struct sub_assign_op : compound_assign_op<DstScalar, SrcScalar, scalar_difference_op<DstScalar, SrcScalar>> {};
+
+template <typename DstScalar, typename SrcScalar>
+struct functor_traits<sub_assign_op<DstScalar, SrcScalar>> : sub_assign_op<DstScalar, SrcScalar>::traits {};
+
+/** \internal
+ * \brief Template functor for scalar/packet assignment with multiplication
+ *
+ */
+template <typename DstScalar, typename SrcScalar = DstScalar>
+struct mul_assign_op : compound_assign_op<DstScalar, SrcScalar, scalar_product_op<DstScalar, SrcScalar>> {};
+
+template <typename DstScalar, typename SrcScalar>
+struct functor_traits<mul_assign_op<DstScalar, SrcScalar>> : mul_assign_op<DstScalar, SrcScalar>::traits {};
+
+/** \internal
+ * \brief Template functor for scalar/packet assignment with dividing
+ *
+ */
+template <typename DstScalar, typename SrcScalar = DstScalar>
+struct div_assign_op : compound_assign_op<DstScalar, SrcScalar, scalar_quotient_op<DstScalar, SrcScalar>> {};
+
+template <typename DstScalar, typename SrcScalar>
+struct functor_traits<div_assign_op<DstScalar, SrcScalar>> : div_assign_op<DstScalar, SrcScalar>::traits {};
+
+/** \internal
+ * \brief Template functor for scalar/packet assignment with swapping
+ *
+ * It works as follow. For a non-vectorized evaluation loop, we have:
+ *   for(i) func(A.coeffRef(i), B.coeff(i));
+ * where B is a SwapWrapper expression. The trick is to make SwapWrapper::coeff behaves like a non-const coeffRef.
+ * Actually, SwapWrapper might not even be needed since even if B is a plain expression, since it has to be writable
+ * B.coeff already returns a const reference to the underlying scalar value.
+ *
+ * The case of a vectorized loop is more tricky:
+ *   for(i,j) func.assignPacket<A_Align>(&A.coeffRef(i,j), B.packet<B_Align>(i,j));
+ * Here, B must be a SwapWrapper whose packet function actually returns a proxy object holding a Scalar*,
+ * the actual alignment and Packet type.
+ *
+ */
+template <typename Scalar>
+struct swap_assign_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Scalar& a, const Scalar& b) const {
+#ifdef EIGEN_GPUCC
+    // FIXME is there some kind of cuda::swap?
+    Scalar t = b;
+    const_cast<Scalar&>(b) = a;
+    a = t;
+#else
+    using std::swap;
+    swap(a, const_cast<Scalar&>(b));
+#endif
+  }
+};
+template <typename Scalar>
+struct functor_traits<swap_assign_op<Scalar>> {
+  enum {
+    Cost = 3 * NumTraits<Scalar>::ReadCost,
+    PacketAccess =
+#if defined(EIGEN_VECTORIZE_AVX) && (EIGEN_CLANG_STRICT_LESS_THAN(8, 0, 0) || EIGEN_COMP_CLANGAPPLE)
+        // This is a partial workaround for a bug in clang generating bad code
+        // when mixing 256/512 bits loads and 128 bits moves.
+        // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1684
+        //     https://bugs.llvm.org/show_bug.cgi?id=40815
+    0
+#else
+        packet_traits<Scalar>::Vectorizable
+#endif
+  };
+};
+
+}  // namespace internal
+
+}  // namespace Eigen
+
+#endif  // EIGEN_ASSIGNMENT_FUNCTORS_H
diff --git a/inst/include/Eigen/src/Core/functors/BinaryFunctors.h b/inst/include/Eigen/src/Core/functors/BinaryFunctors.h
new file mode 100644
index 00000000..85e1584e
--- /dev/null
+++ b/inst/include/Eigen/src/Core/functors/BinaryFunctors.h
@@ -0,0 +1,747 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BINARY_FUNCTORS_H
+#define EIGEN_BINARY_FUNCTORS_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- associative binary functors ----------
+
+template <typename Arg1, typename Arg2>
+struct binary_op_base {
+  typedef Arg1 first_argument_type;
+  typedef Arg2 second_argument_type;
+};
+
+/** \internal
+ * \brief Template functor to compute the sum of two scalars
+ *
+ * \sa class CwiseBinaryOp, MatrixBase::operator+, class VectorwiseOp, DenseBase::sum()
+ */
+template <typename LhsScalar, typename RhsScalar>
+struct scalar_sum_op : binary_op_base<LhsScalar, RhsScalar> {
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar, scalar_sum_op>::ReturnType result_type;
+#ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  scalar_sum_op(){EIGEN_SCALAR_BINARY_OP_PLUGIN}
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type
+  operator()(const LhsScalar& a, const RhsScalar& b) const {
+    return a + b;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    return internal::padd(a, b);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const {
+    return internal::predux(a);
+  }
+};
+template <typename LhsScalar, typename RhsScalar>
+struct functor_traits<scalar_sum_op<LhsScalar, RhsScalar>> {
+  enum {
+    Cost = (int(NumTraits<LhsScalar>::AddCost) + int(NumTraits<RhsScalar>::AddCost)) / 2,  // rough estimate!
+    PacketAccess =
+        is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasAdd && packet_traits<RhsScalar>::HasAdd
+    // TODO vectorize mixed sum
+  };
+};
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_sum_op<bool, bool>::operator()(const bool& a, const bool& b) const {
+  return a || b;
+}
+
+/** \internal
+ * \brief Template functor to compute the product of two scalars
+ *
+ * \sa class CwiseBinaryOp, Cwise::operator*(), class VectorwiseOp, MatrixBase::redux()
+ */
+template <typename LhsScalar, typename RhsScalar>
+struct scalar_product_op : binary_op_base<LhsScalar, RhsScalar> {
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar, scalar_product_op>::ReturnType result_type;
+#ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  scalar_product_op(){EIGEN_SCALAR_BINARY_OP_PLUGIN}
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type
+  operator()(const LhsScalar& a, const RhsScalar& b) const {
+    return a * b;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    return internal::pmul(a, b);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const {
+    return internal::predux_mul(a);
+  }
+};
+template <typename LhsScalar, typename RhsScalar>
+struct functor_traits<scalar_product_op<LhsScalar, RhsScalar>> {
+  enum {
+    Cost = (int(NumTraits<LhsScalar>::MulCost) + int(NumTraits<RhsScalar>::MulCost)) / 2,  // rough estimate!
+    PacketAccess =
+        is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMul && packet_traits<RhsScalar>::HasMul
+    // TODO vectorize mixed product
+  };
+};
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool scalar_product_op<bool, bool>::operator()(const bool& a,
+                                                                                     const bool& b) const {
+  return a && b;
+}
+
+/** \internal
+ * \brief Template functor to compute the conjugate product of two scalars
+ *
+ * This is a short cut for conj(x) * y which is needed for optimization purpose; in Eigen2 support mode, this becomes x
+ * * conj(y)
+ */
+template <typename LhsScalar, typename RhsScalar>
+struct scalar_conj_product_op : binary_op_base<LhsScalar, RhsScalar> {
+  enum { Conj = NumTraits<LhsScalar>::IsComplex };
+
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar, scalar_conj_product_op>::ReturnType result_type;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+    return conj_helper<LhsScalar, RhsScalar, Conj, false>().pmul(a, b);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    return conj_helper<Packet, Packet, Conj, false>().pmul(a, b);
+  }
+};
+template <typename LhsScalar, typename RhsScalar>
+struct functor_traits<scalar_conj_product_op<LhsScalar, RhsScalar>> {
+  enum {
+    Cost = NumTraits<LhsScalar>::MulCost,
+    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMul
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the min of two scalars
+ *
+ * \sa class CwiseBinaryOp, MatrixBase::cwiseMin, class VectorwiseOp, MatrixBase::minCoeff()
+ */
+template <typename LhsScalar, typename RhsScalar, int NaNPropagation>
+struct scalar_min_op : binary_op_base<LhsScalar, RhsScalar> {
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar, scalar_min_op>::ReturnType result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+    return internal::pmin<NaNPropagation>(a, b);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    return internal::pmin<NaNPropagation>(a, b);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const {
+    return internal::predux_min<NaNPropagation>(a);
+  }
+};
+
+template <typename LhsScalar, typename RhsScalar, int NaNPropagation>
+struct functor_traits<scalar_min_op<LhsScalar, RhsScalar, NaNPropagation>> {
+  enum {
+    Cost = (NumTraits<LhsScalar>::AddCost + NumTraits<RhsScalar>::AddCost) / 2,
+    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMin
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the max of two scalars
+ *
+ * \sa class CwiseBinaryOp, MatrixBase::cwiseMax, class VectorwiseOp, MatrixBase::maxCoeff()
+ */
+template <typename LhsScalar, typename RhsScalar, int NaNPropagation>
+struct scalar_max_op : binary_op_base<LhsScalar, RhsScalar> {
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar, scalar_max_op>::ReturnType result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+    return internal::pmax<NaNPropagation>(a, b);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    return internal::pmax<NaNPropagation>(a, b);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type predux(const Packet& a) const {
+    return internal::predux_max<NaNPropagation>(a);
+  }
+};
+
+template <typename LhsScalar, typename RhsScalar, int NaNPropagation>
+struct functor_traits<scalar_max_op<LhsScalar, RhsScalar, NaNPropagation>> {
+  enum {
+    Cost = (NumTraits<LhsScalar>::AddCost + NumTraits<RhsScalar>::AddCost) / 2,
+    PacketAccess = internal::is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasMax
+  };
+};
+
+/** \internal
+ * \brief Template functors for comparison of two scalars
+ * \todo Implement packet-comparisons
+ */
+template <typename LhsScalar, typename RhsScalar, ComparisonName cmp, bool UseTypedComparators = false>
+struct scalar_cmp_op;
+
+template <typename LhsScalar, typename RhsScalar, ComparisonName cmp, bool UseTypedComparators>
+struct functor_traits<scalar_cmp_op<LhsScalar, RhsScalar, cmp, UseTypedComparators>> {
+  enum {
+    Cost = (NumTraits<LhsScalar>::AddCost + NumTraits<RhsScalar>::AddCost) / 2,
+    PacketAccess = (UseTypedComparators || is_same<LhsScalar, bool>::value) && is_same<LhsScalar, RhsScalar>::value &&
+                   packet_traits<LhsScalar>::HasCmp
+  };
+};
+
+template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
+struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_EQ, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+    return a == b ? result_type(1) : result_type(0);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    const Packet cst_one = pset1<Packet>(result_type(1));
+    return pand(pcmp_eq(a, b), cst_one);
+  }
+};
+
+template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
+struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_LT, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+    return a < b ? result_type(1) : result_type(0);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    const Packet cst_one = pset1<Packet>(result_type(1));
+    return pand(pcmp_lt(a, b), cst_one);
+  }
+};
+
+template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
+struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_LE, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+    return a <= b ? result_type(1) : result_type(0);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    const Packet cst_one = pset1<Packet>(result_type(1));
+    return pand(cst_one, pcmp_le(a, b));
+  }
+};
+
+template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
+struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_GT, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+    return a > b ? result_type(1) : result_type(0);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    const Packet cst_one = pset1<Packet>(result_type(1));
+    return pand(cst_one, pcmp_lt(b, a));
+  }
+};
+
+template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
+struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_GE, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+    return a >= b ? result_type(1) : result_type(0);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    const Packet cst_one = pset1<Packet>(result_type(1));
+    return pand(cst_one, pcmp_le(b, a));
+  }
+};
+
+template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
+struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_UNORD, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+    return !(a <= b || b <= a) ? result_type(1) : result_type(0);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    const Packet cst_one = pset1<Packet>(result_type(1));
+    return pandnot(cst_one, por(pcmp_le(a, b), pcmp_le(b, a)));
+  }
+};
+
+template <typename LhsScalar, typename RhsScalar, bool UseTypedComparators>
+struct scalar_cmp_op<LhsScalar, RhsScalar, cmp_NEQ, UseTypedComparators> : binary_op_base<LhsScalar, RhsScalar> {
+  using result_type = std::conditional_t<UseTypedComparators, LhsScalar, bool>;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+    return a != b ? result_type(1) : result_type(0);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    const Packet cst_one = pset1<Packet>(result_type(1));
+    return pandnot(cst_one, pcmp_eq(a, b));
+  }
+};
+
+/** \internal
+ * \brief Template functor to compute the hypot of two \b positive \b and \b real scalars
+ *
+ * \sa MatrixBase::stableNorm(), class Redux
+ */
+template <typename Scalar>
+struct scalar_hypot_op<Scalar, Scalar> : binary_op_base<Scalar, Scalar> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x, const Scalar& y) const {
+    // This functor is used by hypotNorm only for which it is faster to first apply abs
+    // on all coefficients prior to reduction through hypot.
+    // This way we avoid calling abs on positive and real entries, and this also permits
+    // to seamlessly handle complexes. Otherwise we would have to handle both real and complexes
+    // through the same functor...
+    return internal::positive_real_hypot(x, y);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_hypot_op<Scalar, Scalar>> {
+  enum {
+    Cost = 3 * NumTraits<Scalar>::AddCost + 2 * NumTraits<Scalar>::MulCost + 2 * scalar_div_cost<Scalar, false>::value,
+    PacketAccess = false
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the pow of two scalars
+ * See the specification of pow in https://en.cppreference.com/w/cpp/numeric/math/pow
+ */
+template <typename Scalar, typename Exponent>
+struct scalar_pow_op : binary_op_base<Scalar, Exponent> {
+  typedef typename ScalarBinaryOpTraits<Scalar, Exponent, scalar_pow_op>::ReturnType result_type;
+#ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  scalar_pow_op() {
+    typedef Scalar LhsScalar;
+    typedef Exponent RhsScalar;
+    EIGEN_SCALAR_BINARY_OP_PLUGIN
+  }
+#endif
+
+  EIGEN_DEVICE_FUNC inline result_type operator()(const Scalar& a, const Exponent& b) const {
+    return numext::pow(a, b);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const {
+    return generic_pow(a, b);
+  }
+};
+
+template <typename Scalar, typename Exponent>
+struct functor_traits<scalar_pow_op<Scalar, Exponent>> {
+  enum {
+    Cost = 5 * NumTraits<Scalar>::MulCost,
+    PacketAccess = (!NumTraits<Scalar>::IsComplex && !NumTraits<Scalar>::IsInteger && packet_traits<Scalar>::HasPow)
+  };
+};
+
+//---------- non associative binary functors ----------
+
+/** \internal
+ * \brief Template functor to compute the difference of two scalars
+ *
+ * \sa class CwiseBinaryOp, MatrixBase::operator-
+ */
+template <typename LhsScalar, typename RhsScalar>
+struct scalar_difference_op : binary_op_base<LhsScalar, RhsScalar> {
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar, scalar_difference_op>::ReturnType result_type;
+#ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  scalar_difference_op(){EIGEN_SCALAR_BINARY_OP_PLUGIN}
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type
+  operator()(const LhsScalar& a, const RhsScalar& b) const {
+    return a - b;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const {
+    return internal::psub(a, b);
+  }
+};
+template <typename LhsScalar, typename RhsScalar>
+struct functor_traits<scalar_difference_op<LhsScalar, RhsScalar>> {
+  enum {
+    Cost = (int(NumTraits<LhsScalar>::AddCost) + int(NumTraits<RhsScalar>::AddCost)) / 2,
+    PacketAccess =
+        is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasSub && packet_traits<RhsScalar>::HasSub
+  };
+};
+
+template <typename Packet, bool IsInteger = NumTraits<typename unpacket_traits<Packet>::type>::IsInteger>
+struct maybe_raise_div_by_zero {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Packet x) { EIGEN_UNUSED_VARIABLE(x); }
+};
+
+#ifndef EIGEN_GPU_COMPILE_PHASE
+template <typename Packet>
+struct maybe_raise_div_by_zero<Packet, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Packet x) {
+    if (EIGEN_PREDICT_FALSE(predux_any(pcmp_eq(x, pzero(x))))) {
+      // Use volatile variables to force a division by zero, which will
+      // result in the default platform behaviour (usually SIGFPE).
+      volatile typename unpacket_traits<Packet>::type zero = 0;
+      volatile typename unpacket_traits<Packet>::type val = 1;
+      val = val / zero;
+    }
+  }
+};
+#endif
+
+/** \internal
+ * \brief Template functor to compute the quotient of two scalars
+ *
+ * \sa class CwiseBinaryOp, Cwise::operator/()
+ */
+template <typename LhsScalar, typename RhsScalar>
+struct scalar_quotient_op : binary_op_base<LhsScalar, RhsScalar> {
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar, scalar_quotient_op>::ReturnType result_type;
+#ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  scalar_quotient_op(){EIGEN_SCALAR_BINARY_OP_PLUGIN}
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type
+  operator()(const LhsScalar& a, const RhsScalar& b) const {
+    return a / b;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const {
+    return internal::pdiv(a, b);
+  }
+};
+template <typename LhsScalar, typename RhsScalar>
+struct functor_traits<scalar_quotient_op<LhsScalar, RhsScalar>> {
+  typedef typename scalar_quotient_op<LhsScalar, RhsScalar>::result_type result_type;
+  enum {
+    PacketAccess =
+        is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasDiv && packet_traits<RhsScalar>::HasDiv,
+    Cost = scalar_div_cost<result_type, PacketAccess>::value
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the and of two scalars as if they were booleans
+ *
+ * \sa class CwiseBinaryOp, ArrayBase::operator&&
+ */
+template <typename Scalar>
+struct scalar_boolean_and_op {
+  using result_type = Scalar;
+  // `false` any value `a` that satisfies `a == Scalar(0)`
+  // `true` is the complement of `false`
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const {
+    return (a != Scalar(0)) && (b != Scalar(0)) ? Scalar(1) : Scalar(0);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    const Packet cst_one = pset1<Packet>(Scalar(1));
+    // and(a,b) == !or(!a,!b)
+    Packet not_a = pcmp_eq(a, pzero(a));
+    Packet not_b = pcmp_eq(b, pzero(b));
+    Packet a_nand_b = por(not_a, not_b);
+    return pandnot(cst_one, a_nand_b);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_boolean_and_op<Scalar>> {
+  enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasCmp };
+};
+
+/** \internal
+ * \brief Template functor to compute the or of two scalars as if they were booleans
+ *
+ * \sa class CwiseBinaryOp, ArrayBase::operator||
+ */
+template <typename Scalar>
+struct scalar_boolean_or_op {
+  using result_type = Scalar;
+  // `false` any value `a` that satisfies `a == Scalar(0)`
+  // `true` is the complement of `false`
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const {
+    return (a != Scalar(0)) || (b != Scalar(0)) ? Scalar(1) : Scalar(0);
+  }
+  template <typename Packet>
+  EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    const Packet cst_one = pset1<Packet>(Scalar(1));
+    // if or(a,b) == 0, then a == 0 and b == 0
+    // or(a,b) == !nor(a,b)
+    Packet a_nor_b = pcmp_eq(por(a, b), pzero(a));
+    return pandnot(cst_one, a_nor_b);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_boolean_or_op<Scalar>> {
+  enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasCmp };
+};
+
+/** \internal
+ * \brief Template functor to compute the xor of two scalars as if they were booleans
+ *
+ * \sa class CwiseBinaryOp, ArrayBase::operator^
+ */
+template <typename Scalar>
+struct scalar_boolean_xor_op {
+  using result_type = Scalar;
+  // `false` any value `a` that satisfies `a == Scalar(0)`
+  // `true` is the complement of `false`
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const {
+    return (a != Scalar(0)) != (b != Scalar(0)) ? Scalar(1) : Scalar(0);
+  }
+  template <typename Packet>
+  EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    const Packet cst_one = pset1<Packet>(Scalar(1));
+    // xor(a,b) == xor(!a,!b)
+    Packet not_a = pcmp_eq(a, pzero(a));
+    Packet not_b = pcmp_eq(b, pzero(b));
+    Packet a_xor_b = pxor(not_a, not_b);
+    return pand(cst_one, a_xor_b);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_boolean_xor_op<Scalar>> {
+  enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasCmp };
+};
+
+template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct bitwise_binary_impl {
+  static constexpr size_t Size = sizeof(Scalar);
+  using uint_t = typename numext::get_integer_by_size<Size>::unsigned_type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_and(const Scalar& a, const Scalar& b) {
+    uint_t a_as_uint = numext::bit_cast<uint_t, Scalar>(a);
+    uint_t b_as_uint = numext::bit_cast<uint_t, Scalar>(b);
+    uint_t result = a_as_uint & b_as_uint;
+    return numext::bit_cast<Scalar, uint_t>(result);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_or(const Scalar& a, const Scalar& b) {
+    uint_t a_as_uint = numext::bit_cast<uint_t, Scalar>(a);
+    uint_t b_as_uint = numext::bit_cast<uint_t, Scalar>(b);
+    uint_t result = a_as_uint | b_as_uint;
+    return numext::bit_cast<Scalar, uint_t>(result);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_xor(const Scalar& a, const Scalar& b) {
+    uint_t a_as_uint = numext::bit_cast<uint_t, Scalar>(a);
+    uint_t b_as_uint = numext::bit_cast<uint_t, Scalar>(b);
+    uint_t result = a_as_uint ^ b_as_uint;
+    return numext::bit_cast<Scalar, uint_t>(result);
+  }
+};
+
+template <typename Scalar>
+struct bitwise_binary_impl<Scalar, true> {
+  using Real = typename NumTraits<Scalar>::Real;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_and(const Scalar& a, const Scalar& b) {
+    Real real_result = bitwise_binary_impl<Real>::run_and(numext::real(a), numext::real(b));
+    Real imag_result = bitwise_binary_impl<Real>::run_and(numext::imag(a), numext::imag(b));
+    return Scalar(real_result, imag_result);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_or(const Scalar& a, const Scalar& b) {
+    Real real_result = bitwise_binary_impl<Real>::run_or(numext::real(a), numext::real(b));
+    Real imag_result = bitwise_binary_impl<Real>::run_or(numext::imag(a), numext::imag(b));
+    return Scalar(real_result, imag_result);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_xor(const Scalar& a, const Scalar& b) {
+    Real real_result = bitwise_binary_impl<Real>::run_xor(numext::real(a), numext::real(b));
+    Real imag_result = bitwise_binary_impl<Real>::run_xor(numext::imag(a), numext::imag(b));
+    return Scalar(real_result, imag_result);
+  }
+};
+
+/** \internal
+ * \brief Template functor to compute the bitwise and of two scalars
+ *
+ * \sa class CwiseBinaryOp, ArrayBase::operator&
+ */
+template <typename Scalar>
+struct scalar_bitwise_and_op {
+  EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::RequireInitialization,
+                      BITWISE OPERATIONS MAY ONLY BE PERFORMED ON PLAIN DATA TYPES)
+  EIGEN_STATIC_ASSERT((!internal::is_same<Scalar, bool>::value), DONT USE BITWISE OPS ON BOOLEAN TYPES)
+  using result_type = Scalar;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const {
+    return bitwise_binary_impl<Scalar>::run_and(a, b);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    return pand(a, b);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bitwise_and_op<Scalar>> {
+  enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = true };
+};
+
+/** \internal
+ * \brief Template functor to compute the bitwise or of two scalars
+ *
+ * \sa class CwiseBinaryOp, ArrayBase::operator|
+ */
+template <typename Scalar>
+struct scalar_bitwise_or_op {
+  EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::RequireInitialization,
+                      BITWISE OPERATIONS MAY ONLY BE PERFORMED ON PLAIN DATA TYPES)
+  EIGEN_STATIC_ASSERT((!internal::is_same<Scalar, bool>::value), DONT USE BITWISE OPS ON BOOLEAN TYPES)
+  using result_type = Scalar;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const {
+    return bitwise_binary_impl<Scalar>::run_or(a, b);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    return por(a, b);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bitwise_or_op<Scalar>> {
+  enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = true };
+};
+
+/** \internal
+ * \brief Template functor to compute the bitwise xor of two scalars
+ *
+ * \sa class CwiseBinaryOp, ArrayBase::operator^
+ */
+template <typename Scalar>
+struct scalar_bitwise_xor_op {
+  EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::RequireInitialization,
+                      BITWISE OPERATIONS MAY ONLY BE PERFORMED ON PLAIN DATA TYPES)
+  EIGEN_STATIC_ASSERT((!internal::is_same<Scalar, bool>::value), DONT USE BITWISE OPS ON BOOLEAN TYPES)
+  using result_type = Scalar;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const {
+    return bitwise_binary_impl<Scalar>::run_xor(a, b);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b) const {
+    return pxor(a, b);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bitwise_xor_op<Scalar>> {
+  enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = true };
+};
+
+/** \internal
+ * \brief Template functor to compute the absolute difference of two scalars
+ *
+ * \sa class CwiseBinaryOp, MatrixBase::absolute_difference
+ */
+template <typename LhsScalar, typename RhsScalar>
+struct scalar_absolute_difference_op : binary_op_base<LhsScalar, RhsScalar> {
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar, scalar_absolute_difference_op>::ReturnType result_type;
+#ifdef EIGEN_SCALAR_BINARY_OP_PLUGIN
+  scalar_absolute_difference_op(){EIGEN_SCALAR_BINARY_OP_PLUGIN}
+#endif
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type
+  operator()(const LhsScalar& a, const RhsScalar& b) const {
+    return numext::absdiff(a, b);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& b) const {
+    return internal::pabsdiff(a, b);
+  }
+};
+template <typename LhsScalar, typename RhsScalar>
+struct functor_traits<scalar_absolute_difference_op<LhsScalar, RhsScalar>> {
+  enum {
+    Cost = (NumTraits<LhsScalar>::AddCost + NumTraits<RhsScalar>::AddCost) / 2,
+    PacketAccess = is_same<LhsScalar, RhsScalar>::value && packet_traits<LhsScalar>::HasAbsDiff
+  };
+};
+
+template <typename LhsScalar, typename RhsScalar>
+struct scalar_atan2_op {
+  using Scalar = LhsScalar;
+
+  static constexpr bool Enable =
+      is_same<LhsScalar, RhsScalar>::value && !NumTraits<Scalar>::IsInteger && !NumTraits<Scalar>::IsComplex;
+  EIGEN_STATIC_ASSERT(Enable, "LhsScalar and RhsScalar must be the same non-integer, non-complex type")
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& y, const Scalar& x) const {
+    return numext::atan2(y, x);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& y, const Packet& x) const {
+    return internal::patan2(y, x);
+  }
+};
+
+template <typename LhsScalar, typename RhsScalar>
+struct functor_traits<scalar_atan2_op<LhsScalar, RhsScalar>> {
+  using Scalar = LhsScalar;
+  enum {
+    PacketAccess = is_same<LhsScalar, RhsScalar>::value && packet_traits<Scalar>::HasATan &&
+                   packet_traits<Scalar>::HasDiv && !NumTraits<Scalar>::IsInteger && !NumTraits<Scalar>::IsComplex,
+    Cost = int(scalar_div_cost<Scalar, PacketAccess>::value) + int(functor_traits<scalar_atan_op<Scalar>>::Cost)
+  };
+};
+
+//---------- binary functors bound to a constant, thus appearing as a unary functor ----------
+
+// The following two classes permits to turn any binary functor into a unary one with one argument bound to a constant
+// value. They are analogues to std::binder1st/binder2nd but with the following differences:
+//  - they are compatible with packetOp
+//  - they are portable across C++ versions (the std::binder* are deprecated in C++11)
+template <typename BinaryOp>
+struct bind1st_op : BinaryOp {
+  typedef typename BinaryOp::first_argument_type first_argument_type;
+  typedef typename BinaryOp::second_argument_type second_argument_type;
+  typedef typename BinaryOp::result_type result_type;
+
+  EIGEN_DEVICE_FUNC explicit bind1st_op(const first_argument_type& val) : m_value(val) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator()(const second_argument_type& b) const {
+    return BinaryOp::operator()(m_value, b);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& b) const {
+    return BinaryOp::packetOp(internal::pset1<Packet>(m_value), b);
+  }
+
+  first_argument_type m_value;
+};
+template <typename BinaryOp>
+struct functor_traits<bind1st_op<BinaryOp>> : functor_traits<BinaryOp> {};
+
+template <typename BinaryOp>
+struct bind2nd_op : BinaryOp {
+  typedef typename BinaryOp::first_argument_type first_argument_type;
+  typedef typename BinaryOp::second_argument_type second_argument_type;
+  typedef typename BinaryOp::result_type result_type;
+
+  EIGEN_DEVICE_FUNC explicit bind2nd_op(const second_argument_type& val) : m_value(val) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator()(const first_argument_type& a) const {
+    return BinaryOp::operator()(a, m_value);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+    return BinaryOp::packetOp(a, internal::pset1<Packet>(m_value));
+  }
+
+  second_argument_type m_value;
+};
+template <typename BinaryOp>
+struct functor_traits<bind2nd_op<BinaryOp>> : functor_traits<BinaryOp> {};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_BINARY_FUNCTORS_H
diff --git a/inst/include/Eigen/src/Core/functors/NullaryFunctors.h b/inst/include/Eigen/src/Core/functors/NullaryFunctors.h
new file mode 100644
index 00000000..35dc7386
--- /dev/null
+++ b/inst/include/Eigen/src/Core/functors/NullaryFunctors.h
@@ -0,0 +1,274 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_NULLARY_FUNCTORS_H
+#define EIGEN_NULLARY_FUNCTORS_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Scalar>
+struct scalar_constant_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_constant_op(const Scalar& other) : m_other(other) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()() const { return m_other; }
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp() const {
+    return internal::pset1<PacketType>(m_other);
+  }
+  const Scalar m_other;
+};
+template <typename Scalar>
+struct functor_traits<scalar_constant_op<Scalar>> {
+  enum {
+    Cost = 0 /* as the constant value should be loaded in register only once for the whole expression */,
+    PacketAccess = packet_traits<Scalar>::Vectorizable,
+    IsRepeatable = true
+  };
+};
+
+template <typename Scalar>
+struct scalar_zero_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_zero_op() = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()() const { return Scalar(0); }
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const PacketType packetOp() const {
+    return internal::pzero<PacketType>(PacketType());
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_zero_op<Scalar>> : functor_traits<scalar_constant_op<Scalar>> {};
+
+template <typename Scalar>
+struct scalar_identity_op {
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(IndexType row, IndexType col) const {
+    return row == col ? Scalar(1) : Scalar(0);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_identity_op<Scalar>> {
+  enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = false, IsRepeatable = true };
+};
+
+template <typename Scalar, bool IsInteger>
+struct linspaced_op_impl;
+
+template <typename Scalar>
+struct linspaced_op_impl<Scalar, /*IsInteger*/ false> {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  EIGEN_DEVICE_FUNC linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps)
+      : m_low(low),
+        m_high(high),
+        m_size1(num_steps == 1 ? 1 : num_steps - 1),
+        m_step(num_steps == 1 ? Scalar() : Scalar((high - low) / RealScalar(num_steps - 1))),
+        m_flip(numext::abs(high) < numext::abs(low)) {}
+
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(IndexType i) const {
+    if (m_flip)
+      return (i == 0) ? m_low : Scalar(m_high - RealScalar(m_size1 - i) * m_step);
+    else
+      return (i == m_size1) ? m_high : Scalar(m_low + RealScalar(i) * m_step);
+  }
+
+  template <typename Packet, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const {
+    // Principle:
+    // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) )
+    Packet low = pset1<Packet>(m_low);
+    Packet high = pset1<Packet>(m_high);
+    Packet step = pset1<Packet>(m_step);
+    if (m_flip) {
+      Packet pi = plset<Packet>(Scalar(i - m_size1));
+      Packet res = pmadd(step, pi, high);
+      Packet mask = pcmp_lt(pzero(res), plset<Packet>(Scalar(i)));
+      return pselect<Packet>(mask, res, low);
+    } else {
+      Packet pi = plset<Packet>(Scalar(i));
+      Packet res = pmadd(step, pi, low);
+      Packet mask = pcmp_lt(pi, pset1<Packet>(Scalar(m_size1)));
+      return pselect<Packet>(mask, res, high);
+    }
+  }
+
+  const Scalar m_low;
+  const Scalar m_high;
+  const Index m_size1;
+  const Scalar m_step;
+  const bool m_flip;
+};
+
+template <typename Scalar>
+struct linspaced_op_impl<Scalar, /*IsInteger*/ true> {
+  EIGEN_DEVICE_FUNC linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps)
+      : m_low(low),
+        m_multiplier((high - low) / convert_index<Scalar>(num_steps <= 1 ? 1 : num_steps - 1)),
+        m_divisor(convert_index<Scalar>((high >= low ? num_steps : -num_steps) + (high - low)) /
+                  ((numext::abs(high - low) + 1) == 0 ? 1 : (numext::abs(high - low) + 1))),
+        m_use_divisor(num_steps > 1 && (numext::abs(high - low) + 1) < num_steps) {}
+
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(IndexType i) const {
+    if (m_use_divisor)
+      return m_low + convert_index<Scalar>(i) / m_divisor;
+    else
+      return m_low + convert_index<Scalar>(i) * m_multiplier;
+  }
+
+  const Scalar m_low;
+  const Scalar m_multiplier;
+  const Scalar m_divisor;
+  const bool m_use_divisor;
+};
+
+// ----- Linspace functor ----------------------------------------------------------------
+
+// Forward declaration (we default to random access which does not really give
+// us a speed gain when using packet access but it allows to use the functor in
+// nested expressions).
+template <typename Scalar>
+struct linspaced_op;
+template <typename Scalar>
+struct functor_traits<linspaced_op<Scalar>> {
+  enum {
+    Cost = 1,
+    PacketAccess = (!NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasSetLinear,
+    /*&& ((!NumTraits<Scalar>::IsInteger) || packet_traits<Scalar>::HasDiv),*/  // <- vectorization for integer is
+                                                                                // currently disabled
+    IsRepeatable = true
+  };
+};
+template <typename Scalar>
+struct linspaced_op {
+  EIGEN_DEVICE_FUNC linspaced_op(const Scalar& low, const Scalar& high, Index num_steps)
+      : impl((num_steps == 1 ? high : low), high, num_steps) {}
+
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(IndexType i) const {
+    return impl(i);
+  }
+
+  template <typename Packet, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(IndexType i) const {
+    return impl.template packetOp<Packet>(i);
+  }
+
+  // This proxy object handles the actual required temporaries and the different
+  // implementations (integer vs. floating point).
+  const linspaced_op_impl<Scalar, NumTraits<Scalar>::IsInteger> impl;
+};
+
+template <typename Scalar>
+struct equalspaced_op {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  EIGEN_DEVICE_FUNC equalspaced_op(const Scalar& start, const Scalar& step) : m_start(start), m_step(step) {}
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(IndexType i) const {
+    return m_start + m_step * static_cast<Scalar>(i);
+  }
+  template <typename Packet, typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(IndexType i) const {
+    const Packet cst_start = pset1<Packet>(m_start);
+    const Packet cst_step = pset1<Packet>(m_step);
+    const Packet cst_lin0 = plset<Packet>(Scalar(0));
+    const Packet cst_offset = pmadd(cst_lin0, cst_step, cst_start);
+
+    Packet i_packet = pset1<Packet>(static_cast<Scalar>(i));
+    return pmadd(i_packet, cst_step, cst_offset);
+  }
+  const Scalar m_start;
+  const Scalar m_step;
+};
+
+template <typename Scalar>
+struct functor_traits<equalspaced_op<Scalar>> {
+  enum {
+    Cost = NumTraits<Scalar>::AddCost + NumTraits<Scalar>::MulCost,
+    PacketAccess =
+        packet_traits<Scalar>::HasSetLinear && packet_traits<Scalar>::HasMul && packet_traits<Scalar>::HasAdd,
+    IsRepeatable = true
+  };
+};
+
+// Linear access is automatically determined from the operator() prototypes available for the given functor.
+// If it exposes an operator()(i,j), then we assume the i and j coefficients are required independently
+// and linear access is not possible. In all other cases, linear access is enabled.
+// Users should not have to deal with this structure.
+template <typename Functor>
+struct functor_has_linear_access {
+  enum { ret = !has_binary_operator<Functor>::value };
+};
+
+// For unreliable compilers, let's specialize the has_*ary_operator
+// helpers so that at least built-in nullary functors work fine.
+#if !(EIGEN_COMP_MSVC || EIGEN_COMP_GNUC || (EIGEN_COMP_ICC >= 1600))
+template <typename Scalar, typename IndexType>
+struct has_nullary_operator<scalar_constant_op<Scalar>, IndexType> {
+  enum { value = 1 };
+};
+template <typename Scalar, typename IndexType>
+struct has_unary_operator<scalar_constant_op<Scalar>, IndexType> {
+  enum { value = 0 };
+};
+template <typename Scalar, typename IndexType>
+struct has_binary_operator<scalar_constant_op<Scalar>, IndexType> {
+  enum { value = 0 };
+};
+
+template <typename Scalar, typename IndexType>
+struct has_nullary_operator<scalar_identity_op<Scalar>, IndexType> {
+  enum { value = 0 };
+};
+template <typename Scalar, typename IndexType>
+struct has_unary_operator<scalar_identity_op<Scalar>, IndexType> {
+  enum { value = 0 };
+};
+template <typename Scalar, typename IndexType>
+struct has_binary_operator<scalar_identity_op<Scalar>, IndexType> {
+  enum { value = 1 };
+};
+
+template <typename Scalar, typename IndexType>
+struct has_nullary_operator<linspaced_op<Scalar>, IndexType> {
+  enum { value = 0 };
+};
+template <typename Scalar, typename IndexType>
+struct has_unary_operator<linspaced_op<Scalar>, IndexType> {
+  enum { value = 1 };
+};
+template <typename Scalar, typename IndexType>
+struct has_binary_operator<linspaced_op<Scalar>, IndexType> {
+  enum { value = 0 };
+};
+
+template <typename Scalar, typename IndexType>
+struct has_nullary_operator<scalar_random_op<Scalar>, IndexType> {
+  enum { value = 1 };
+};
+template <typename Scalar, typename IndexType>
+struct has_unary_operator<scalar_random_op<Scalar>, IndexType> {
+  enum { value = 0 };
+};
+template <typename Scalar, typename IndexType>
+struct has_binary_operator<scalar_random_op<Scalar>, IndexType> {
+  enum { value = 0 };
+};
+#endif
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_NULLARY_FUNCTORS_H
diff --git a/inst/include/Eigen/src/Core/functors/StlFunctors.h b/inst/include/Eigen/src/Core/functors/StlFunctors.h
new file mode 100644
index 00000000..0599ce3f
--- /dev/null
+++ b/inst/include/Eigen/src/Core/functors/StlFunctors.h
@@ -0,0 +1,149 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_STL_FUNCTORS_H
+#define EIGEN_STL_FUNCTORS_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+// default functor traits for STL functors:
+
+template <typename T>
+struct functor_traits<std::multiplies<T> > {
+  enum { Cost = NumTraits<T>::MulCost, PacketAccess = false };
+};
+
+template <typename T>
+struct functor_traits<std::divides<T> > {
+  enum { Cost = NumTraits<T>::MulCost, PacketAccess = false };
+};
+
+template <typename T>
+struct functor_traits<std::plus<T> > {
+  enum { Cost = NumTraits<T>::AddCost, PacketAccess = false };
+};
+
+template <typename T>
+struct functor_traits<std::minus<T> > {
+  enum { Cost = NumTraits<T>::AddCost, PacketAccess = false };
+};
+
+template <typename T>
+struct functor_traits<std::negate<T> > {
+  enum { Cost = NumTraits<T>::AddCost, PacketAccess = false };
+};
+
+template <typename T>
+struct functor_traits<std::logical_or<T> > {
+  enum { Cost = 1, PacketAccess = false };
+};
+
+template <typename T>
+struct functor_traits<std::logical_and<T> > {
+  enum { Cost = 1, PacketAccess = false };
+};
+
+template <typename T>
+struct functor_traits<std::logical_not<T> > {
+  enum { Cost = 1, PacketAccess = false };
+};
+
+template <typename T>
+struct functor_traits<std::greater<T> > {
+  enum { Cost = 1, PacketAccess = false };
+};
+
+template <typename T>
+struct functor_traits<std::less<T> > {
+  enum { Cost = 1, PacketAccess = false };
+};
+
+template <typename T>
+struct functor_traits<std::greater_equal<T> > {
+  enum { Cost = 1, PacketAccess = false };
+};
+
+template <typename T>
+struct functor_traits<std::less_equal<T> > {
+  enum { Cost = 1, PacketAccess = false };
+};
+
+template <typename T>
+struct functor_traits<std::equal_to<T> > {
+  enum { Cost = 1, PacketAccess = false };
+};
+
+template <typename T>
+struct functor_traits<std::not_equal_to<T> > {
+  enum { Cost = 1, PacketAccess = false };
+};
+
+#if (EIGEN_COMP_CXXVER < 17)
+// std::unary_negate is deprecated since c++17 and will be removed in c++20
+template <typename T>
+struct functor_traits<std::unary_negate<T> > {
+  enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false };
+};
+
+// std::binary_negate is deprecated since c++17 and will be removed in c++20
+template <typename T>
+struct functor_traits<std::binary_negate<T> > {
+  enum { Cost = 1 + functor_traits<T>::Cost, PacketAccess = false };
+};
+#endif
+
+#ifdef EIGEN_STDEXT_SUPPORT
+
+template <typename T0, typename T1>
+struct functor_traits<std::project1st<T0, T1> > {
+  enum { Cost = 0, PacketAccess = false };
+};
+
+template <typename T0, typename T1>
+struct functor_traits<std::project2nd<T0, T1> > {
+  enum { Cost = 0, PacketAccess = false };
+};
+
+template <typename T0, typename T1>
+struct functor_traits<std::select2nd<std::pair<T0, T1> > > {
+  enum { Cost = 0, PacketAccess = false };
+};
+
+template <typename T0, typename T1>
+struct functor_traits<std::select1st<std::pair<T0, T1> > > {
+  enum { Cost = 0, PacketAccess = false };
+};
+
+template <typename T0, typename T1>
+struct functor_traits<std::unary_compose<T0, T1> > {
+  enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost, PacketAccess = false };
+};
+
+template <typename T0, typename T1, typename T2>
+struct functor_traits<std::binary_compose<T0, T1, T2> > {
+  enum { Cost = functor_traits<T0>::Cost + functor_traits<T1>::Cost + functor_traits<T2>::Cost, PacketAccess = false };
+};
+
+#endif  // EIGEN_STDEXT_SUPPORT
+
+// allow to add new functors and specializations of functor_traits from outside Eigen.
+// this macro is really needed because functor_traits must be specialized after it is declared but before it is used...
+#ifdef EIGEN_FUNCTORS_PLUGIN
+#include EIGEN_FUNCTORS_PLUGIN
+#endif
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_STL_FUNCTORS_H
diff --git a/inst/include/Eigen/src/Core/functors/TernaryFunctors.h b/inst/include/Eigen/src/Core/functors/TernaryFunctors.h
new file mode 100644
index 00000000..745779a1
--- /dev/null
+++ b/inst/include/Eigen/src/Core/functors/TernaryFunctors.h
@@ -0,0 +1,52 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_TERNARY_FUNCTORS_H
+#define EIGEN_TERNARY_FUNCTORS_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+//---------- associative ternary functors ----------
+
+template <typename ThenScalar, typename ElseScalar, typename ConditionScalar>
+struct scalar_boolean_select_op {
+  static constexpr bool ThenElseAreSame = is_same<ThenScalar, ElseScalar>::value;
+  EIGEN_STATIC_ASSERT(ThenElseAreSame, THEN AND ELSE MUST BE SAME TYPE)
+  using Scalar = ThenScalar;
+  using result_type = Scalar;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const ThenScalar& a, const ElseScalar& b,
+                                                          const ConditionScalar& cond) const {
+    return cond == ConditionScalar(0) ? b : a;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a, const Packet& b, const Packet& cond) const {
+    return pselect(pcmp_eq(cond, pzero(cond)), b, a);
+  }
+};
+
+template <typename ThenScalar, typename ElseScalar, typename ConditionScalar>
+struct functor_traits<scalar_boolean_select_op<ThenScalar, ElseScalar, ConditionScalar>> {
+  using Scalar = ThenScalar;
+  enum {
+    Cost = 1,
+    PacketAccess = is_same<ThenScalar, ElseScalar>::value && is_same<ConditionScalar, Scalar>::value &&
+                   packet_traits<Scalar>::HasCmp
+  };
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_TERNARY_FUNCTORS_H
diff --git a/inst/include/Eigen/src/Core/functors/UnaryFunctors.h b/inst/include/Eigen/src/Core/functors/UnaryFunctors.h
new file mode 100644
index 00000000..ba7d97a0
--- /dev/null
+++ b/inst/include/Eigen/src/Core/functors/UnaryFunctors.h
@@ -0,0 +1,1420 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_UNARY_FUNCTORS_H
+#define EIGEN_UNARY_FUNCTORS_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+ * \brief Template functor to compute the opposite of a scalar
+ *
+ * \sa class CwiseUnaryOp, MatrixBase::operator-
+ */
+template <typename Scalar>
+struct scalar_opposite_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return numext::negate(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+    return internal::pnegate(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_opposite_op<Scalar>> {
+  enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasNegate };
+};
+
+/** \internal
+ * \brief Template functor to compute the absolute value of a scalar
+ *
+ * \sa class CwiseUnaryOp, Cwise::abs
+ */
+template <typename Scalar>
+struct scalar_abs_op {
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator()(const Scalar& a) const { return numext::abs(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+    return internal::pabs(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_abs_op<Scalar>> {
+  enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasAbs };
+};
+
+/** \internal
+ * \brief Template functor to compute the score of a scalar, to chose a pivot
+ *
+ * \sa class CwiseUnaryOp
+ */
+template <typename Scalar>
+struct scalar_score_coeff_op : scalar_abs_op<Scalar> {
+  typedef void Score_is_abs;
+};
+template <typename Scalar>
+struct functor_traits<scalar_score_coeff_op<Scalar>> : functor_traits<scalar_abs_op<Scalar>> {};
+
+/* Avoid recomputing abs when we know the score and they are the same. Not a true Eigen functor.  */
+template <typename Scalar, typename = void>
+struct abs_knowing_score {
+  typedef typename NumTraits<Scalar>::Real result_type;
+  template <typename Score>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator()(const Scalar& a, const Score&) const {
+    return numext::abs(a);
+  }
+};
+template <typename Scalar>
+struct abs_knowing_score<Scalar, typename scalar_score_coeff_op<Scalar>::Score_is_abs> {
+  typedef typename NumTraits<Scalar>::Real result_type;
+  template <typename Scal>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator()(const Scal&, const result_type& a) const {
+    return a;
+  }
+};
+
+/** \internal
+ * \brief Template functor to compute the squared absolute value of a scalar
+ *
+ * \sa class CwiseUnaryOp, Cwise::abs2
+ */
+template <typename Scalar>
+struct scalar_abs2_op {
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator()(const Scalar& a) const { return numext::abs2(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+    return internal::pmul(a, a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_abs2_op<Scalar>> {
+  enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasAbs2 };
+};
+
+template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct squared_norm_functor {
+  typedef Scalar result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const {
+    return Scalar(numext::real(a) * numext::real(a), numext::imag(a) * numext::imag(a));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+    return Packet(pmul(a.v, a.v));
+  }
+};
+template <typename Scalar>
+struct squared_norm_functor<Scalar, false> : scalar_abs2_op<Scalar> {};
+
+template <typename Scalar>
+struct functor_traits<squared_norm_functor<Scalar>> {
+  using Real = typename NumTraits<Scalar>::Real;
+  enum { Cost = NumTraits<Real>::MulCost, PacketAccess = packet_traits<Real>::HasMul };
+};
+
+/** \internal
+ * \brief Template functor to compute the conjugate of a complex value
+ *
+ * \sa class CwiseUnaryOp, MatrixBase::conjugate()
+ */
+template <typename Scalar>
+struct scalar_conjugate_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return numext::conj(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+    return internal::pconj(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_conjugate_op<Scalar>> {
+  enum {
+    Cost = 0,
+    // Yes the cost is zero even for complexes because in most cases for which
+    // the cost is used, conjugation turns to be a no-op. Some examples:
+    //   cost(a*conj(b)) == cost(a*b)
+    //   cost(a+conj(b)) == cost(a+b)
+    //   <etc.
+    // If we don't set it to zero, then:
+    //   A.conjugate().lazyProduct(B.conjugate())
+    // will bake its operands. We definitely don't want that!
+    PacketAccess = packet_traits<Scalar>::HasConj
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the phase angle of a complex
+ *
+ * \sa class CwiseUnaryOp, Cwise::arg
+ */
+template <typename Scalar>
+struct scalar_arg_op {
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type operator()(const Scalar& a) const { return numext::arg(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+    return internal::parg(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_arg_op<Scalar>> {
+  enum {
+    Cost = NumTraits<Scalar>::IsComplex ? 5 * NumTraits<Scalar>::MulCost : NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasArg
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the complex argument, returned as a complex type
+ *
+ * \sa class CwiseUnaryOp, Cwise::carg
+ */
+template <typename Scalar>
+struct scalar_carg_op {
+  using result_type = Scalar;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const {
+    return Scalar(numext::arg(a));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+    return pcarg(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_carg_op<Scalar>> {
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  enum { Cost = functor_traits<scalar_atan2_op<RealScalar>>::Cost, PacketAccess = packet_traits<RealScalar>::HasATan };
+};
+
+/** \internal
+ * \brief Template functor to cast a scalar to another type
+ *
+ * \sa class CwiseUnaryOp, MatrixBase::cast()
+ */
+template <typename Scalar, typename NewType>
+struct scalar_cast_op {
+  typedef NewType result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const NewType operator()(const Scalar& a) const {
+    return cast<Scalar, NewType>(a);
+  }
+};
+
+template <typename Scalar, typename NewType>
+struct functor_traits<scalar_cast_op<Scalar, NewType>> {
+  enum { Cost = is_same<Scalar, NewType>::value ? 0 : NumTraits<NewType>::AddCost, PacketAccess = false };
+};
+
+/** \internal
+ * `core_cast_op` serves to distinguish the vectorized implementation from that of the legacy `scalar_cast_op` for
+ * backwards compatibility. The manner in which packet ops are handled is defined by the specialized unary_evaluator:
+ * `unary_evaluator<CwiseUnaryOp<core_cast_op<SrcType, DstType>, ArgType>, IndexBased>` in CoreEvaluators.h
+ * Otherwise, the non-vectorized behavior is identical to that of `scalar_cast_op`
+ */
+template <typename SrcType, typename DstType>
+struct core_cast_op : scalar_cast_op<SrcType, DstType> {};
+
+template <typename SrcType, typename DstType>
+struct functor_traits<core_cast_op<SrcType, DstType>> {
+  using CastingTraits = type_casting_traits<SrcType, DstType>;
+  enum {
+    Cost = is_same<SrcType, DstType>::value ? 0 : NumTraits<DstType>::AddCost,
+    PacketAccess = CastingTraits::VectorizedCast && (CastingTraits::SrcCoeffRatio <= 8)
+  };
+};
+
+/** \internal
+ * \brief Template functor to arithmetically shift a scalar right by a number of bits
+ *
+ * \sa class CwiseUnaryOp, MatrixBase::shift_right()
+ */
+template <typename Scalar, int N>
+struct scalar_shift_right_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const {
+    return numext::arithmetic_shift_right(a);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+    return internal::parithmetic_shift_right<N>(a);
+  }
+};
+template <typename Scalar, int N>
+struct functor_traits<scalar_shift_right_op<Scalar, N>> {
+  enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasShift };
+};
+
+/** \internal
+ * \brief Template functor to logically shift a scalar left by a number of bits
+ *
+ * \sa class CwiseUnaryOp, MatrixBase::shift_left()
+ */
+template <typename Scalar, int N>
+struct scalar_shift_left_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const {
+    return numext::logical_shift_left(a);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a) const {
+    return internal::plogical_shift_left<N>(a);
+  }
+};
+template <typename Scalar, int N>
+struct functor_traits<scalar_shift_left_op<Scalar, N>> {
+  enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasShift };
+};
+
+/** \internal
+ * \brief Template functor to extract the real part of a complex
+ *
+ * \sa class CwiseUnaryOp, MatrixBase::real()
+ */
+template <typename Scalar>
+struct scalar_real_op {
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const Scalar& a) const { return numext::real(a); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_real_op<Scalar>> {
+  enum { Cost = 0, PacketAccess = false };
+};
+
+/** \internal
+ * \brief Template functor to extract the imaginary part of a complex
+ *
+ * \sa class CwiseUnaryOp, MatrixBase::imag()
+ */
+template <typename Scalar>
+struct scalar_imag_op {
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const Scalar& a) const { return numext::imag(a); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_imag_op<Scalar>> {
+  enum { Cost = 0, PacketAccess = false };
+};
+
+/** \internal
+ * \brief Template functor to extract the real part of a complex as a reference
+ *
+ * \sa class CwiseUnaryOp, MatrixBase::real()
+ */
+template <typename Scalar>
+struct scalar_real_ref_op {
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type& operator()(const Scalar& a) const {
+    return numext::real_ref(a);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type& operator()(Scalar& a) const { return numext::real_ref(a); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_real_ref_op<Scalar>> {
+  enum { Cost = 0, PacketAccess = false };
+};
+
+/** \internal
+ * \brief Template functor to extract the imaginary part of a complex as a reference
+ *
+ * \sa class CwiseUnaryOp, MatrixBase::imag()
+ */
+template <typename Scalar>
+struct scalar_imag_ref_op {
+  typedef typename NumTraits<Scalar>::Real result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type& operator()(Scalar& a) const { return numext::imag_ref(a); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const result_type& operator()(const Scalar& a) const {
+    return numext::imag_ref(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_imag_ref_op<Scalar>> {
+  enum { Cost = 0, PacketAccess = false };
+};
+
+/** \internal
+ *
+ * \brief Template functor to compute the exponential of a scalar
+ *
+ * \sa class CwiseUnaryOp, Cwise::exp()
+ */
+template <typename Scalar>
+struct scalar_exp_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return internal::pexp(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::pexp(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_exp_op<Scalar>> {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasExp,
+  // The following numbers are based on the AVX implementation.
+#ifdef EIGEN_VECTORIZE_FMA
+    // Haswell can issue 2 add/mul/madd per cycle.
+    Cost = (sizeof(Scalar) == 4
+                // float: 8 pmadd, 4 pmul, 2 padd/psub, 6 other
+                ? (8 * NumTraits<Scalar>::AddCost + 6 * NumTraits<Scalar>::MulCost)
+                // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div,  13 other
+                : (14 * NumTraits<Scalar>::AddCost + 6 * NumTraits<Scalar>::MulCost +
+                   scalar_div_cost<Scalar, packet_traits<Scalar>::HasDiv>::value))
+#else
+    Cost = (sizeof(Scalar) == 4
+                // float: 7 pmadd, 6 pmul, 4 padd/psub, 10 other
+                ? (21 * NumTraits<Scalar>::AddCost + 13 * NumTraits<Scalar>::MulCost)
+                // double: 7 pmadd, 5 pmul, 3 padd/psub, 1 div,  13 other
+                : (23 * NumTraits<Scalar>::AddCost + 12 * NumTraits<Scalar>::MulCost +
+                   scalar_div_cost<Scalar, packet_traits<Scalar>::HasDiv>::value))
+#endif
+  };
+};
+
+template <typename Scalar>
+struct scalar_exp2_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return internal::pexp2(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::pexp2(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_exp2_op<Scalar>> {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasExp,
+    Cost = functor_traits<scalar_exp_op<Scalar>>::Cost  // TODO measure cost of exp2
+  };
+};
+
+/** \internal
+ *
+ * \brief Template functor to compute the exponential of a scalar - 1.
+ *
+ * \sa class CwiseUnaryOp, ArrayBase::expm1()
+ */
+template <typename Scalar>
+struct scalar_expm1_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::expm1(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::pexpm1(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_expm1_op<Scalar>> {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasExpm1,
+    Cost = functor_traits<scalar_exp_op<Scalar>>::Cost  // TODO measure cost of expm1
+  };
+};
+
+/** \internal
+ *
+ * \brief Template functor to compute the logarithm of a scalar
+ *
+ * \sa class CwiseUnaryOp, ArrayBase::log()
+ */
+template <typename Scalar>
+struct scalar_log_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::log(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::plog(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_log_op<Scalar>> {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasLog,
+    Cost = (PacketAccess
+  // The following numbers are based on the AVX implementation.
+#ifdef EIGEN_VECTORIZE_FMA
+                // 8 pmadd, 6 pmul, 8 padd/psub, 16 other, can issue 2 add/mul/madd per cycle.
+                ? (20 * NumTraits<Scalar>::AddCost + 7 * NumTraits<Scalar>::MulCost)
+#else
+                // 8 pmadd, 6 pmul, 8 padd/psub, 20 other
+                ? (36 * NumTraits<Scalar>::AddCost + 14 * NumTraits<Scalar>::MulCost)
+#endif
+                // Measured cost of std::log.
+                : sizeof(Scalar) == 4 ? 40 : 85)
+  };
+};
+
+/** \internal
+ *
+ * \brief Template functor to compute the logarithm of 1 plus a scalar value
+ *
+ * \sa class CwiseUnaryOp, ArrayBase::log1p()
+ */
+template <typename Scalar>
+struct scalar_log1p_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::log1p(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::plog1p(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_log1p_op<Scalar>> {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasLog1p,
+    Cost = functor_traits<scalar_log_op<Scalar>>::Cost  // TODO measure cost of log1p
+  };
+};
+
+/** \internal
+ *
+ * \brief Template functor to compute the base-10 logarithm of a scalar
+ *
+ * \sa class CwiseUnaryOp, Cwise::log10()
+ */
+template <typename Scalar>
+struct scalar_log10_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { EIGEN_USING_STD(log10) return log10(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::plog10(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_log10_op<Scalar>> {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog10 };
+};
+
+/** \internal
+ *
+ * \brief Template functor to compute the base-2 logarithm of a scalar
+ *
+ * \sa class CwiseUnaryOp, Cwise::log2()
+ */
+template <typename Scalar>
+struct scalar_log2_op {
+  using RealScalar = typename NumTraits<Scalar>::Real;
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const {
+    return Scalar(RealScalar(EIGEN_LOG2E)) * numext::log(a);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::plog2(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_log2_op<Scalar>> {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasLog };
+};
+
+/** \internal
+ * \brief Template functor to compute the square root of a scalar
+ * \sa class CwiseUnaryOp, Cwise::sqrt()
+ */
+template <typename Scalar>
+struct scalar_sqrt_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::sqrt(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::psqrt(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_sqrt_op<Scalar>> {
+  enum {
+#if EIGEN_FAST_MATH
+    // The following numbers are based on the AVX implementation.
+    Cost = (sizeof(Scalar) == 8 ? 28
+                                // 4 pmul, 1 pmadd, 3 other
+                                : (3 * NumTraits<Scalar>::AddCost + 5 * NumTraits<Scalar>::MulCost)),
+#else
+    // The following numbers are based on min VSQRT throughput on Haswell.
+    Cost = (sizeof(Scalar) == 8 ? 28 : 14),
+#endif
+    PacketAccess = packet_traits<Scalar>::HasSqrt
+  };
+};
+
+// Boolean specialization to eliminate -Wimplicit-conversion-floating-point-to-bool warnings.
+template <>
+struct scalar_sqrt_op<bool> {
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator()(const bool& a) const { return a; }
+  template <typename Packet>
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return a;
+  }
+};
+template <>
+struct functor_traits<scalar_sqrt_op<bool>> {
+  enum { Cost = 1, PacketAccess = packet_traits<bool>::Vectorizable };
+};
+
+/** \internal
+ * \brief Template functor to compute the cube root of a scalar
+ * \sa class CwiseUnaryOp, Cwise::sqrt()
+ */
+template <typename Scalar>
+struct scalar_cbrt_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::cbrt(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::pcbrt(a);
+  }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_cbrt_op<Scalar>> {
+  enum { Cost = 20 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasCbrt };
+};
+
+/** \internal
+ * \brief Template functor to compute the reciprocal square root of a scalar
+ * \sa class CwiseUnaryOp, Cwise::rsqrt()
+ */
+template <typename Scalar>
+struct scalar_rsqrt_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::rsqrt(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::prsqrt(a);
+  }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_rsqrt_op<Scalar>> {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasRsqrt };
+};
+
+/** \internal
+ * \brief Template functor to compute the cosine of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::cos()
+ */
+template <typename Scalar>
+struct scalar_cos_op {
+  EIGEN_DEVICE_FUNC inline Scalar operator()(const Scalar& a) const { return numext::cos(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::pcos(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_cos_op<Scalar>> {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasCos };
+};
+
+/** \internal
+ * \brief Template functor to compute the sine of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::sin()
+ */
+template <typename Scalar>
+struct scalar_sin_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::sin(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::psin(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_sin_op<Scalar>> {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasSin };
+};
+
+/** \internal
+ * \brief Template functor to compute the tan of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::tan()
+ */
+template <typename Scalar>
+struct scalar_tan_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::tan(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::ptan(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_tan_op<Scalar>> {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasTan };
+};
+
+/** \internal
+ * \brief Template functor to compute the arc cosine of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::acos()
+ */
+template <typename Scalar>
+struct scalar_acos_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::acos(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::pacos(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_acos_op<Scalar>> {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasACos };
+};
+
+/** \internal
+ * \brief Template functor to compute the arc sine of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::asin()
+ */
+template <typename Scalar>
+struct scalar_asin_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::asin(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::pasin(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_asin_op<Scalar>> {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasASin };
+};
+
+/** \internal
+ * \brief Template functor to compute the atan of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::atan()
+ */
+template <typename Scalar>
+struct scalar_atan_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atan(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::patan(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_atan_op<Scalar>> {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasATan };
+};
+
+/** \internal
+ * \brief Template functor to compute the tanh of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::tanh()
+ */
+template <typename Scalar>
+struct scalar_tanh_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::tanh(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const {
+    return ptanh(x);
+  }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_tanh_op<Scalar>> {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasTanh,
+    Cost = ((EIGEN_FAST_MATH && is_same<Scalar, float>::value)
+// The following numbers are based on the AVX implementation,
+#ifdef EIGEN_VECTORIZE_FMA
+                // Haswell can issue 2 add/mul/madd per cycle.
+                // 9 pmadd, 2 pmul, 1 div, 2 other
+                ? (2 * NumTraits<Scalar>::AddCost + 6 * NumTraits<Scalar>::MulCost +
+                   scalar_div_cost<Scalar, packet_traits<Scalar>::HasDiv>::value)
+#else
+                ? (11 * NumTraits<Scalar>::AddCost + 11 * NumTraits<Scalar>::MulCost +
+                   scalar_div_cost<Scalar, packet_traits<Scalar>::HasDiv>::value)
+#endif
+                // This number assumes a naive implementation of tanh
+                : (6 * NumTraits<Scalar>::AddCost + 3 * NumTraits<Scalar>::MulCost +
+                   2 * scalar_div_cost<Scalar, packet_traits<Scalar>::HasDiv>::value +
+                   functor_traits<scalar_exp_op<Scalar>>::Cost))
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the atanh of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::atanh()
+ */
+template <typename Scalar>
+struct scalar_atanh_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::atanh(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x) const {
+    return patanh(x);
+  }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_atanh_op<Scalar>> {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasATanh };
+};
+
+/** \internal
+ * \brief Template functor to compute the sinh of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::sinh()
+ */
+template <typename Scalar>
+struct scalar_sinh_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::sinh(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::psinh(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_sinh_op<Scalar>> {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasSinh };
+};
+
+/** \internal
+ * \brief Template functor to compute the asinh of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::asinh()
+ */
+template <typename Scalar>
+struct scalar_asinh_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::asinh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_asinh_op<Scalar>> {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+
+/** \internal
+ * \brief Template functor to compute the cosh of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::cosh()
+ */
+template <typename Scalar>
+struct scalar_cosh_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::cosh(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::pcosh(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_cosh_op<Scalar>> {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasCosh };
+};
+
+/** \internal
+ * \brief Template functor to compute the acosh of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::acosh()
+ */
+template <typename Scalar>
+struct scalar_acosh_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::acosh(a); }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_acosh_op<Scalar>> {
+  enum { Cost = 5 * NumTraits<Scalar>::MulCost, PacketAccess = false };
+};
+
+/** \internal
+ * \brief Template functor to compute the inverse of a scalar
+ * \sa class CwiseUnaryOp, Cwise::inverse()
+ */
+template <typename Scalar>
+struct scalar_inverse_op {
+  EIGEN_DEVICE_FUNC inline Scalar operator()(const Scalar& a) const { return Scalar(1) / a; }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const {
+    return internal::preciprocal(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_inverse_op<Scalar>> {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasDiv,
+    // If packet_traits<Scalar>::HasReciprocal then the Estimated cost is that
+    // of computing an approximation plus a single Newton-Raphson step, which
+    // consists of 1 pmul + 1 pmadd.
+    Cost = (packet_traits<Scalar>::HasReciprocal ? 4 * NumTraits<Scalar>::MulCost
+                                                 : scalar_div_cost<Scalar, PacketAccess>::value)
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the square of a scalar
+ * \sa class CwiseUnaryOp, Cwise::square()
+ */
+template <typename Scalar>
+struct scalar_square_op {
+  EIGEN_DEVICE_FUNC inline Scalar operator()(const Scalar& a) const { return a * a; }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const {
+    return internal::pmul(a, a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_square_op<Scalar>> {
+  enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul };
+};
+
+// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC.
+template <>
+struct scalar_square_op<bool> {
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator()(const bool& a) const { return a; }
+  template <typename Packet>
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const {
+    return a;
+  }
+};
+template <>
+struct functor_traits<scalar_square_op<bool>> {
+  enum { Cost = 0, PacketAccess = packet_traits<bool>::Vectorizable };
+};
+
+/** \internal
+ * \brief Template functor to compute the cube of a scalar
+ * \sa class CwiseUnaryOp, Cwise::cube()
+ */
+template <typename Scalar>
+struct scalar_cube_op {
+  EIGEN_DEVICE_FUNC inline Scalar operator()(const Scalar& a) const { return a * a * a; }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const {
+    return internal::pmul(a, pmul(a, a));
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_cube_op<Scalar>> {
+  enum { Cost = 2 * NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul };
+};
+
+// Boolean specialization to avoid -Wint-in-bool-context warnings on GCC.
+template <>
+struct scalar_cube_op<bool> {
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline bool operator()(const bool& a) const { return a; }
+  template <typename Packet>
+  EIGEN_DEPRECATED EIGEN_DEVICE_FUNC inline const Packet packetOp(const Packet& a) const {
+    return a;
+  }
+};
+template <>
+struct functor_traits<scalar_cube_op<bool>> {
+  enum { Cost = 0, PacketAccess = packet_traits<bool>::Vectorizable };
+};
+
+/** \internal
+ * \brief Template functor to compute the rounded value of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::round()
+ */
+template <typename Scalar>
+struct scalar_round_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return numext::round(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::pround(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_round_op<Scalar>> {
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasRound || NumTraits<Scalar>::IsInteger
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the floor of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::floor()
+ */
+template <typename Scalar>
+struct scalar_floor_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return numext::floor(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::pfloor(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_floor_op<Scalar>> {
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasRound || NumTraits<Scalar>::IsInteger
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the rounded (with current rounding mode)  value of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::rint()
+ */
+template <typename Scalar>
+struct scalar_rint_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return numext::rint(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::print(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_rint_op<Scalar>> {
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasRound || NumTraits<Scalar>::IsInteger
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the ceil of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::ceil()
+ */
+template <typename Scalar>
+struct scalar_ceil_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return numext::ceil(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::pceil(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_ceil_op<Scalar>> {
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasRound || NumTraits<Scalar>::IsInteger
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the truncation of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::floor()
+ */
+template <typename Scalar>
+struct scalar_trunc_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return numext::trunc(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::ptrunc(a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_trunc_op<Scalar>> {
+  enum {
+    Cost = NumTraits<Scalar>::MulCost,
+    PacketAccess = packet_traits<Scalar>::HasRound || NumTraits<Scalar>::IsInteger
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute whether a scalar is NaN
+ * \sa class CwiseUnaryOp, ArrayBase::isnan()
+ */
+template <typename Scalar, bool UseTypedPredicate = false>
+struct scalar_isnan_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a) const {
+#if defined(SYCL_DEVICE_ONLY)
+    return numext::isnan(a);
+#else
+    return numext::isnan EIGEN_NOT_A_MACRO(a);
+#endif
+  }
+};
+
+template <typename Scalar>
+struct scalar_isnan_op<Scalar, true> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
+#if defined(SYCL_DEVICE_ONLY)
+    return (numext::isnan(a) ? ptrue(a) : pzero(a));
+#else
+    return (numext::isnan EIGEN_NOT_A_MACRO(a) ? ptrue(a) : pzero(a));
+#endif
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return pisnan(a);
+  }
+};
+
+template <typename Scalar, bool UseTypedPredicate>
+struct functor_traits<scalar_isnan_op<Scalar, UseTypedPredicate>> {
+  enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasCmp && UseTypedPredicate };
+};
+
+/** \internal
+ * \brief Template functor to check whether a scalar is +/-inf
+ * \sa class CwiseUnaryOp, ArrayBase::isinf()
+ */
+template <typename Scalar, bool UseTypedPredicate = false>
+struct scalar_isinf_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a) const {
+#if defined(SYCL_DEVICE_ONLY)
+    return numext::isinf(a);
+#else
+    return (numext::isinf)(a);
+#endif
+  }
+};
+
+template <typename Scalar>
+struct scalar_isinf_op<Scalar, true> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
+#if defined(SYCL_DEVICE_ONLY)
+    return (numext::isinf(a) ? ptrue(a) : pzero(a));
+#else
+    return (numext::isinf EIGEN_NOT_A_MACRO(a) ? ptrue(a) : pzero(a));
+#endif
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return pisinf(a);
+  }
+};
+template <typename Scalar, bool UseTypedPredicate>
+struct functor_traits<scalar_isinf_op<Scalar, UseTypedPredicate>> {
+  enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasCmp && UseTypedPredicate };
+};
+
+/** \internal
+ * \brief Template functor to check whether a scalar has a finite value
+ * \sa class CwiseUnaryOp, ArrayBase::isfinite()
+ */
+template <typename Scalar, bool UseTypedPredicate = false>
+struct scalar_isfinite_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator()(const Scalar& a) const {
+#if defined(SYCL_DEVICE_ONLY)
+    return numext::isfinite(a);
+#else
+    return (numext::isfinite)(a);
+#endif
+  }
+};
+
+template <typename Scalar>
+struct scalar_isfinite_op<Scalar, true> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
+#if defined(SYCL_DEVICE_ONLY)
+    return (numext::isfinite(a) ? ptrue(a) : pzero(a));
+#else
+    return (numext::isfinite EIGEN_NOT_A_MACRO(a) ? ptrue(a) : pzero(a));
+#endif
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    constexpr Scalar inf = NumTraits<Scalar>::infinity();
+    return pcmp_lt(pabs(a), pset1<Packet>(inf));
+  }
+};
+template <typename Scalar, bool UseTypedPredicate>
+struct functor_traits<scalar_isfinite_op<Scalar, UseTypedPredicate>> {
+  enum { Cost = NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasCmp && UseTypedPredicate };
+};
+
+/** \internal
+ * \brief Template functor to compute the logical not of a scalar as if it were a boolean
+ *
+ * \sa class CwiseUnaryOp, ArrayBase::operator!
+ */
+template <typename Scalar>
+struct scalar_boolean_not_op {
+  using result_type = Scalar;
+  // `false` any value `a` that satisfies `a == Scalar(0)`
+  // `true` is the complement of `false`
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
+    return a == Scalar(0) ? Scalar(1) : Scalar(0);
+  }
+  template <typename Packet>
+  EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const {
+    const Packet cst_one = pset1<Packet>(Scalar(1));
+    Packet not_a = pcmp_eq(a, pzero(a));
+    return pand(not_a, cst_one);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_boolean_not_op<Scalar>> {
+  enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = packet_traits<Scalar>::HasCmp };
+};
+
+template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct bitwise_unary_impl {
+  static constexpr size_t Size = sizeof(Scalar);
+  using uint_t = typename numext::get_integer_by_size<Size>::unsigned_type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_not(const Scalar& a) {
+    uint_t a_as_uint = numext::bit_cast<uint_t, Scalar>(a);
+    uint_t result = ~a_as_uint;
+    return numext::bit_cast<Scalar, uint_t>(result);
+  }
+};
+
+template <typename Scalar>
+struct bitwise_unary_impl<Scalar, true> {
+  using Real = typename NumTraits<Scalar>::Real;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar run_not(const Scalar& a) {
+    Real real_result = bitwise_unary_impl<Real>::run_not(numext::real(a));
+    Real imag_result = bitwise_unary_impl<Real>::run_not(numext::imag(a));
+    return Scalar(real_result, imag_result);
+  }
+};
+
+/** \internal
+ * \brief Template functor to compute the bitwise not of a scalar
+ *
+ * \sa class CwiseUnaryOp, ArrayBase::operator~
+ */
+template <typename Scalar>
+struct scalar_bitwise_not_op {
+  EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::RequireInitialization,
+                      BITWISE OPERATIONS MAY ONLY BE PERFORMED ON PLAIN DATA TYPES)
+  EIGEN_STATIC_ASSERT((!internal::is_same<Scalar, bool>::value), DONT USE BITWISE OPS ON BOOLEAN TYPES)
+  using result_type = Scalar;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
+    return bitwise_unary_impl<Scalar>::run_not(a);
+  }
+  template <typename Packet>
+  EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const {
+    return pandnot(ptrue(a), a);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bitwise_not_op<Scalar>> {
+  enum { Cost = NumTraits<Scalar>::AddCost, PacketAccess = true };
+};
+
+/** \internal
+ * \brief Template functor to compute the signum of a scalar
+ * \sa class CwiseUnaryOp, Cwise::sign()
+ */
+template <typename Scalar>
+struct scalar_sign_op {
+  EIGEN_DEVICE_FUNC inline const Scalar operator()(const Scalar& a) const { return numext::sign(a); }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const {
+    return internal::psign(a);
+  }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_sign_op<Scalar>> {
+  enum {
+    Cost = NumTraits<Scalar>::IsComplex ? (8 * NumTraits<Scalar>::MulCost)  // roughly
+                                        : (3 * NumTraits<Scalar>::AddCost),
+    PacketAccess = packet_traits<Scalar>::HasSign && packet_traits<Scalar>::Vectorizable
+  };
+};
+
+// Real-valued implementation.
+template <typename T, typename EnableIf = void>
+struct scalar_logistic_op_impl {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const { return packetOp(x); }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    const Packet one = pset1<Packet>(T(1));
+    const Packet inf = pset1<Packet>(NumTraits<T>::infinity());
+    const Packet e = pexp(x);
+    const Packet inf_mask = pcmp_eq(e, inf);
+    return pselect(inf_mask, one, pdiv(e, padd(one, e)));
+  }
+};
+
+// Complex-valud implementation.
+template <typename T>
+struct scalar_logistic_op_impl<T, std::enable_if_t<NumTraits<T>::IsComplex>> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(const T& x) const {
+    const T e = numext::exp(x);
+    return (numext::isinf)(numext::real(e)) ? T(1) : e / (e + T(1));
+  }
+};
+
+/** \internal
+ * \brief Template functor to compute the logistic function of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::logistic()
+ */
+template <typename T>
+struct scalar_logistic_op : scalar_logistic_op_impl<T> {};
+
+// TODO(rmlarsen): Enable the following on host when integer_packet is defined
+// for the relevant packet types.
+#ifndef EIGEN_GPUCC
+
+/** \internal
+ * \brief Template specialization of the logistic function for float.
+ * Computes S(x) = exp(x) / (1 + exp(x)), where exp(x) is implemented
+ * using an algorithm partly adopted from the implementation of
+ * pexp_float. See the individual steps described in the code below.
+ * Note that compared to pexp, we use an additional outer multiplicative
+ * range reduction step using the identity exp(x) = exp(x/2)^2.
+ * This prevert us from having to call ldexp on values that could produce
+ * a denormal result, which allows us to call the faster implementation in
+ * pldexp_fast_impl<Packet>::run(p, m).
+ * The final squaring, however, doubles the error bound on the final
+ * approximation. Exhaustive testing shows that we have a worst case error
+ * of 4.5 ulps (compared to computing S(x) in double precision), which is
+ * acceptable.
+ */
+template <>
+struct scalar_logistic_op<float> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()(const float& x) const {
+    // Truncate at the first point where the interpolant is exactly one.
+    const float cst_exp_hi = 16.6355324f;
+    const float e = numext::exp(numext::mini(x, cst_exp_hi));
+    return e / (1.0f + e);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& _x) const {
+    const Packet cst_zero = pset1<Packet>(0.0f);
+    const Packet cst_one = pset1<Packet>(1.0f);
+    const Packet cst_half = pset1<Packet>(0.5f);
+    // Truncate at the first point where the interpolant is exactly one.
+    const Packet cst_exp_hi = pset1<Packet>(16.6355324f);
+    const Packet cst_exp_lo = pset1<Packet>(-104.f);
+
+    // Clamp x to the non-trivial range where S(x). Outside this
+    // interval the correctly rounded value of S(x) is either zero
+    // or one.
+    Packet zero_mask = pcmp_lt(_x, cst_exp_lo);
+    Packet x = pmin(_x, cst_exp_hi);
+
+    // 1. Multiplicative range reduction:
+    // Reduce the range of x by a factor of 2. This avoids having
+    // to compute exp(x) accurately where the result is a denormalized
+    // value.
+    x = pmul(x, cst_half);
+
+    // 2. Subtractive range reduction:
+    // Express exp(x) as exp(m*ln(2) + r) = 2^m*exp(r), start by extracting
+    // m = floor(x/ln(2) + 0.5), such that x = m*ln(2) + r.
+    const Packet cst_cephes_LOG2EF = pset1<Packet>(1.44269504088896341f);
+    Packet m = pfloor(pmadd(x, cst_cephes_LOG2EF, cst_half));
+    // Get r = x - m*ln(2). We use a trick from Cephes where the term
+    // m*ln(2) is subtracted out in two parts, m*C1+m*C2 = m*ln(2),
+    // to avoid accumulating truncation errors.
+    const Packet cst_cephes_exp_C1 = pset1<Packet>(-0.693359375f);
+    const Packet cst_cephes_exp_C2 = pset1<Packet>(2.12194440e-4f);
+    Packet r = pmadd(m, cst_cephes_exp_C1, x);
+    r = pmadd(m, cst_cephes_exp_C2, r);
+
+    // 3. Compute an approximation to exp(r) using a degree 5 minimax polynomial.
+    // We compute even and odd terms separately to increase instruction level
+    // parallelism.
+    Packet r2 = pmul(r, r);
+    const Packet cst_p2 = pset1<Packet>(0.49999141693115234375f);
+    const Packet cst_p3 = pset1<Packet>(0.16666877269744873046875f);
+    const Packet cst_p4 = pset1<Packet>(4.1898667812347412109375e-2f);
+    const Packet cst_p5 = pset1<Packet>(8.33471305668354034423828125e-3f);
+
+    const Packet p_even = pmadd(r2, cst_p4, cst_p2);
+    const Packet p_odd = pmadd(r2, cst_p5, cst_p3);
+    const Packet p_low = padd(r, cst_one);
+    Packet p = pmadd(r, p_odd, p_even);
+    p = pmadd(r2, p, p_low);
+
+    // 4. Undo subtractive range reduction exp(m*ln(2) + r) = 2^m * exp(r).
+    Packet e = pldexp_fast(p, m);
+
+    // 5. Undo multiplicative range reduction by using exp(r) = exp(r/2)^2.
+    e = pmul(e, e);
+
+    // Return exp(x) / (1 + exp(x))
+    return pselect(zero_mask, cst_zero, pdiv(e, padd(cst_one, e)));
+  }
+};
+#endif  // #ifndef EIGEN_GPU_COMPILE_PHASE
+
+template <typename T>
+struct functor_traits<scalar_logistic_op<T>> {
+  enum {
+    // The cost estimate for float here here is for the common(?) case where
+    // all arguments are greater than -9.
+    Cost = scalar_div_cost<T, packet_traits<T>::HasDiv>::value +
+           (internal::is_same<T, float>::value ? NumTraits<T>::AddCost * 15 + NumTraits<T>::MulCost * 11
+                                               : NumTraits<T>::AddCost * 2 + functor_traits<scalar_exp_op<T>>::Cost),
+    PacketAccess = !NumTraits<T>::IsComplex && packet_traits<T>::HasAdd && packet_traits<T>::HasDiv &&
+                   (internal::is_same<T, float>::value
+                        ? packet_traits<T>::HasMul && packet_traits<T>::HasMax && packet_traits<T>::HasMin
+                        : packet_traits<T>::HasNegate && packet_traits<T>::HasExp)
+  };
+};
+
+template <typename Scalar, typename ExponentScalar, bool IsBaseInteger = NumTraits<Scalar>::IsInteger,
+          bool IsExponentInteger = NumTraits<ExponentScalar>::IsInteger,
+          bool IsBaseComplex = NumTraits<Scalar>::IsComplex,
+          bool IsExponentComplex = NumTraits<ExponentScalar>::IsComplex>
+struct scalar_unary_pow_op {
+  typedef typename internal::promote_scalar_arg<
+      Scalar, ExponentScalar,
+      internal::has_ReturnType<ScalarBinaryOpTraits<Scalar, ExponentScalar, scalar_unary_pow_op>>::value>::type
+      PromotedExponent;
+  typedef typename ScalarBinaryOpTraits<Scalar, PromotedExponent, scalar_unary_pow_op>::ReturnType result_type;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_unary_pow_op(const ExponentScalar& exponent) : m_exponent(exponent) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const Scalar& a) const {
+    EIGEN_USING_STD(pow);
+    return static_cast<result_type>(pow(a, m_exponent));
+  }
+
+ private:
+  const ExponentScalar m_exponent;
+  scalar_unary_pow_op() {}
+};
+
+template <typename T>
+constexpr int exponent_digits() {
+  return CHAR_BIT * sizeof(T) - NumTraits<T>::digits() - NumTraits<T>::IsSigned;
+}
+
+template <typename From, typename To>
+struct is_floating_exactly_representable {
+  // TODO(rmlarsen): Add radix to NumTraits and enable this check.
+  // (NumTraits<To>::radix == NumTraits<From>::radix) &&
+  static constexpr bool value =
+      (exponent_digits<To>() >= exponent_digits<From>() && NumTraits<To>::digits() >= NumTraits<From>::digits());
+};
+
+// Specialization for real, non-integer types, non-complex types.
+template <typename Scalar, typename ExponentScalar>
+struct scalar_unary_pow_op<Scalar, ExponentScalar, false, false, false, false> {
+  template <bool IsExactlyRepresentable = is_floating_exactly_representable<ExponentScalar, Scalar>::value>
+  std::enable_if_t<IsExactlyRepresentable, void> check_is_representable() const {}
+
+  // Issue a deprecation warning if we do a narrowing conversion on the exponent.
+  template <bool IsExactlyRepresentable = is_floating_exactly_representable<ExponentScalar, Scalar>::value>
+  EIGEN_DEPRECATED std::enable_if_t<!IsExactlyRepresentable, void> check_is_representable() const {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_unary_pow_op(const ExponentScalar& exponent)
+      : m_exponent(static_cast<Scalar>(exponent)) {
+    check_is_representable();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
+    EIGEN_USING_STD(pow);
+    return static_cast<Scalar>(pow(a, m_exponent));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const {
+    return unary_pow_impl<Packet, Scalar>::run(a, m_exponent);
+  }
+
+ private:
+  const Scalar m_exponent;
+  scalar_unary_pow_op() {}
+};
+
+template <typename Scalar, typename ExponentScalar, bool BaseIsInteger>
+struct scalar_unary_pow_op<Scalar, ExponentScalar, BaseIsInteger, true, false, false> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE scalar_unary_pow_op(const ExponentScalar& exponent) : m_exponent(exponent) {}
+  // TODO: error handling logic for complex^real_integer
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const {
+    return unary_pow_impl<Scalar, ExponentScalar>::run(a, m_exponent);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const {
+    return unary_pow_impl<Packet, ExponentScalar>::run(a, m_exponent);
+  }
+
+ private:
+  const ExponentScalar m_exponent;
+  scalar_unary_pow_op() {}
+};
+
+template <typename Scalar, typename ExponentScalar>
+struct functor_traits<scalar_unary_pow_op<Scalar, ExponentScalar>> {
+  enum {
+    GenPacketAccess = functor_traits<scalar_pow_op<Scalar, ExponentScalar>>::PacketAccess,
+    IntPacketAccess = !NumTraits<Scalar>::IsComplex && packet_traits<Scalar>::HasMul &&
+                      (packet_traits<Scalar>::HasDiv || NumTraits<Scalar>::IsInteger) && packet_traits<Scalar>::HasCmp,
+    PacketAccess = NumTraits<ExponentScalar>::IsInteger ? IntPacketAccess : (IntPacketAccess && GenPacketAccess),
+    Cost = functor_traits<scalar_pow_op<Scalar, ExponentScalar>>::Cost
+  };
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_FUNCTORS_H
diff --git a/inst/include/Eigen/src/Core/products/CoeffBasedProduct.h b/inst/include/Eigen/src/Core/products/CoeffBasedProduct.h
deleted file mode 100644
index 2a9d65b9..00000000
--- a/inst/include/Eigen/src/Core/products/CoeffBasedProduct.h
+++ /dev/null
@@ -1,476 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_COEFFBASED_PRODUCT_H
-#define EIGEN_COEFFBASED_PRODUCT_H
-
-namespace Eigen { 
-
-namespace internal {
-
-/*********************************************************************************
-*  Coefficient based product implementation.
-*  It is designed for the following use cases:
-*  - small fixed sizes
-*  - lazy products
-*********************************************************************************/
-
-/* Since the all the dimensions of the product are small, here we can rely
- * on the generic Assign mechanism to evaluate the product per coeff (or packet).
- *
- * Note that here the inner-loops should always be unrolled.
- */
-
-template<int Traversal, int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl;
-
-template<int StorageOrder, int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl;
-
-template<typename LhsNested, typename RhsNested, int NestingFlags>
-struct traits<CoeffBasedProduct<LhsNested,RhsNested,NestingFlags> >
-{
-  typedef MatrixXpr XprKind;
-  typedef typename remove_all<LhsNested>::type _LhsNested;
-  typedef typename remove_all<RhsNested>::type _RhsNested;
-  typedef typename scalar_product_traits<typename _LhsNested::Scalar, typename _RhsNested::Scalar>::ReturnType Scalar;
-  typedef typename promote_storage_type<typename traits<_LhsNested>::StorageKind,
-                                           typename traits<_RhsNested>::StorageKind>::ret StorageKind;
-  typedef typename promote_index_type<typename traits<_LhsNested>::Index,
-                                         typename traits<_RhsNested>::Index>::type Index;
-
-  enum {
-      LhsCoeffReadCost = _LhsNested::CoeffReadCost,
-      RhsCoeffReadCost = _RhsNested::CoeffReadCost,
-      LhsFlags = _LhsNested::Flags,
-      RhsFlags = _RhsNested::Flags,
-
-      RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
-      ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
-      InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),
-
-      MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
-
-      LhsRowMajor = LhsFlags & RowMajorBit,
-      RhsRowMajor = RhsFlags & RowMajorBit,
-
-      SameType = is_same<typename _LhsNested::Scalar,typename _RhsNested::Scalar>::value,
-
-      CanVectorizeRhs = RhsRowMajor && (RhsFlags & PacketAccessBit)
-                      && (ColsAtCompileTime == Dynamic
-                          || ( (ColsAtCompileTime % packet_traits<Scalar>::size) == 0
-                              && (RhsFlags&AlignedBit)
-                             )
-                         ),
-
-      CanVectorizeLhs = (!LhsRowMajor) && (LhsFlags & PacketAccessBit)
-                      && (RowsAtCompileTime == Dynamic
-                          || ( (RowsAtCompileTime % packet_traits<Scalar>::size) == 0
-                              && (LhsFlags&AlignedBit)
-                             )
-                         ),
-
-      EvalToRowMajor = (MaxRowsAtCompileTime==1&&MaxColsAtCompileTime!=1) ? 1
-                     : (MaxColsAtCompileTime==1&&MaxRowsAtCompileTime!=1) ? 0
-                     : (RhsRowMajor && !CanVectorizeLhs),
-
-      Flags = ((unsigned int)(LhsFlags | RhsFlags) & HereditaryBits & ~RowMajorBit)
-            | (EvalToRowMajor ? RowMajorBit : 0)
-            | NestingFlags
-            | (LhsFlags & RhsFlags & AlignedBit)
-            // TODO enable vectorization for mixed types
-            | (SameType && (CanVectorizeLhs || CanVectorizeRhs) ? PacketAccessBit : 0),
-
-      CoeffReadCost = InnerSize == Dynamic ? Dynamic
-                    : InnerSize == 0 ? 0
-                    : InnerSize * (NumTraits<Scalar>::MulCost + LhsCoeffReadCost + RhsCoeffReadCost)
-                      + (InnerSize - 1) * NumTraits<Scalar>::AddCost,
-
-      /* CanVectorizeInner deserves special explanation. It does not affect the product flags. It is not used outside
-      * of Product. If the Product itself is not a packet-access expression, there is still a chance that the inner
-      * loop of the product might be vectorized. This is the meaning of CanVectorizeInner. Since it doesn't affect
-      * the Flags, it is safe to make this value depend on ActualPacketAccessBit, that doesn't affect the ABI.
-      */
-      CanVectorizeInner =    SameType
-                          && LhsRowMajor
-                          && (!RhsRowMajor)
-                          && (LhsFlags & RhsFlags & ActualPacketAccessBit)
-                          && (LhsFlags & RhsFlags & AlignedBit)
-                          && (InnerSize % packet_traits<Scalar>::size == 0)
-    };
-};
-
-} // end namespace internal
-
-template<typename LhsNested, typename RhsNested, int NestingFlags>
-class CoeffBasedProduct
-  : internal::no_assignment_operator,
-    public MatrixBase<CoeffBasedProduct<LhsNested, RhsNested, NestingFlags> >
-{
-  public:
-
-    typedef MatrixBase<CoeffBasedProduct> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(CoeffBasedProduct)
-    typedef typename Base::PlainObject PlainObject;
-
-  private:
-
-    typedef typename internal::traits<CoeffBasedProduct>::_LhsNested _LhsNested;
-    typedef typename internal::traits<CoeffBasedProduct>::_RhsNested _RhsNested;
-
-    enum {
-      PacketSize = internal::packet_traits<Scalar>::size,
-      InnerSize  = internal::traits<CoeffBasedProduct>::InnerSize,
-      Unroll = CoeffReadCost != Dynamic && CoeffReadCost <= EIGEN_UNROLLING_LIMIT,
-      CanVectorizeInner = internal::traits<CoeffBasedProduct>::CanVectorizeInner
-    };
-
-    typedef internal::product_coeff_impl<CanVectorizeInner ? InnerVectorizedTraversal : DefaultTraversal,
-                                   Unroll ? InnerSize : Dynamic,
-                                   _LhsNested, _RhsNested, Scalar> ScalarCoeffImpl;
-
-    typedef CoeffBasedProduct<LhsNested,RhsNested,NestByRefBit> LazyCoeffBasedProductType;
-
-  public:
-
-    inline CoeffBasedProduct(const CoeffBasedProduct& other)
-      : Base(), m_lhs(other.m_lhs), m_rhs(other.m_rhs)
-    {}
-
-    template<typename Lhs, typename Rhs>
-    inline CoeffBasedProduct(const Lhs& lhs, const Rhs& rhs)
-      : m_lhs(lhs), m_rhs(rhs)
-    {
-      // we don't allow taking products of matrices of different real types, as that wouldn't be vectorizable.
-      // We still allow to mix T and complex<T>.
-      EIGEN_STATIC_ASSERT((internal::scalar_product_traits<typename Lhs::RealScalar, typename Rhs::RealScalar>::Defined),
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-      eigen_assert(lhs.cols() == rhs.rows()
-        && "invalid matrix product"
-        && "if you wanted a coeff-wise or a dot product use the respective explicit functions");
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
-
-    EIGEN_STRONG_INLINE const Scalar coeff(Index row, Index col) const
-    {
-      Scalar res;
-      ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
-      return res;
-    }
-
-    /* Allow index-based non-packet access. It is impossible though to allow index-based packed access,
-     * which is why we don't set the LinearAccessBit.
-     */
-    EIGEN_STRONG_INLINE const Scalar coeff(Index index) const
-    {
-      Scalar res;
-      const Index row = RowsAtCompileTime == 1 ? 0 : index;
-      const Index col = RowsAtCompileTime == 1 ? index : 0;
-      ScalarCoeffImpl::run(row, col, m_lhs, m_rhs, res);
-      return res;
-    }
-
-    template<int LoadMode>
-    EIGEN_STRONG_INLINE const PacketScalar packet(Index row, Index col) const
-    {
-      PacketScalar res;
-      internal::product_packet_impl<Flags&RowMajorBit ? RowMajor : ColMajor,
-                              Unroll ? InnerSize : Dynamic,
-                              _LhsNested, _RhsNested, PacketScalar, LoadMode>
-        ::run(row, col, m_lhs, m_rhs, res);
-      return res;
-    }
-
-    // Implicit conversion to the nested type (trigger the evaluation of the product)
-    EIGEN_STRONG_INLINE operator const PlainObject& () const
-    {
-      m_result.lazyAssign(*this);
-      return m_result;
-    }
-
-    const _LhsNested& lhs() const { return m_lhs; }
-    const _RhsNested& rhs() const { return m_rhs; }
-
-    const Diagonal<const LazyCoeffBasedProductType,0> diagonal() const
-    { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }
-
-    template<int DiagonalIndex>
-    const Diagonal<const LazyCoeffBasedProductType,DiagonalIndex> diagonal() const
-    { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this); }
-
-    const Diagonal<const LazyCoeffBasedProductType,Dynamic> diagonal(Index index) const
-    { return reinterpret_cast<const LazyCoeffBasedProductType&>(*this).diagonal(index); }
-
-  protected:
-    typename internal::add_const_on_value_type<LhsNested>::type m_lhs;
-    typename internal::add_const_on_value_type<RhsNested>::type m_rhs;
-
-    mutable PlainObject m_result;
-};
-
-namespace internal {
-
-// here we need to overload the nested rule for products
-// such that the nested type is a const reference to a plain matrix
-template<typename Lhs, typename Rhs, int N, typename PlainObject>
-struct nested<CoeffBasedProduct<Lhs,Rhs,EvalBeforeNestingBit|EvalBeforeAssigningBit>, N, PlainObject>
-{
-  typedef PlainObject const& type;
-};
-
-/***************************************************************************
-* Normal product .coeff() implementation (with meta-unrolling)
-***************************************************************************/
-
-/**************************************
-*** Scalar path  - no vectorization ***
-**************************************/
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl<DefaultTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
-  {
-    product_coeff_impl<DefaultTraversal, UnrollingIndex-1, Lhs, Rhs, RetScalar>::run(row, col, lhs, rhs, res);
-    res += lhs.coeff(row, UnrollingIndex-1) * rhs.coeff(UnrollingIndex-1, col);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl<DefaultTraversal, 1, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
-  {
-    res = lhs.coeff(row, 0) * rhs.coeff(0, col);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl<DefaultTraversal, 0, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, RetScalar &res)
-  {
-    res = RetScalar(0);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl<DefaultTraversal, Dynamic, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar& res)
-  {
-    res = (lhs.row(row).transpose().cwiseProduct( rhs.col(col) )).sum();
-  }
-};
-
-/*******************************************
-*** Scalar path with inner vectorization ***
-*******************************************/
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet>
-struct product_coeff_vectorized_unroller
-{
-  typedef typename Lhs::Index Index;
-  enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
-  {
-    product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
-    pres = padd(pres, pmul( lhs.template packet<Aligned>(row, UnrollingIndex) , rhs.template packet<Aligned>(UnrollingIndex, col) ));
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet>
-struct product_coeff_vectorized_unroller<0, Lhs, Rhs, Packet>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::PacketScalar &pres)
-  {
-    pres = pmul(lhs.template packet<Aligned>(row, 0) , rhs.template packet<Aligned>(0, col));
-  }
-};
-
-template<typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl<InnerVectorizedTraversal, 0, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, RetScalar &res)
-  {
-    res = 0;
-  }
-};
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl<InnerVectorizedTraversal, UnrollingIndex, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::PacketScalar Packet;
-  typedef typename Lhs::Index Index;
-  enum { PacketSize = packet_traits<typename Lhs::Scalar>::size };
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, RetScalar &res)
-  {
-    Packet pres;
-    product_coeff_vectorized_unroller<UnrollingIndex-PacketSize, Lhs, Rhs, Packet>::run(row, col, lhs, rhs, pres);
-    res = predux(pres);
-  }
-};
-
-template<typename Lhs, typename Rhs, int LhsRows = Lhs::RowsAtCompileTime, int RhsCols = Rhs::ColsAtCompileTime>
-struct product_coeff_vectorized_dyn_selector
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
-  {
-    res = lhs.row(row).transpose().cwiseProduct(rhs.col(col)).sum();
-  }
-};
-
-// NOTE the 3 following specializations are because taking .col(0) on a vector is a bit slower
-// NOTE maybe they are now useless since we have a specialization for Block<Matrix>
-template<typename Lhs, typename Rhs, int RhsCols>
-struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,RhsCols>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
-  {
-    res = lhs.transpose().cwiseProduct(rhs.col(col)).sum();
-  }
-};
-
-template<typename Lhs, typename Rhs, int LhsRows>
-struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,LhsRows,1>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
-  {
-    res = lhs.row(row).transpose().cwiseProduct(rhs).sum();
-  }
-};
-
-template<typename Lhs, typename Rhs>
-struct product_coeff_vectorized_dyn_selector<Lhs,Rhs,1,1>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
-  {
-    res = lhs.transpose().cwiseProduct(rhs).sum();
-  }
-};
-
-template<typename Lhs, typename Rhs, typename RetScalar>
-struct product_coeff_impl<InnerVectorizedTraversal, Dynamic, Lhs, Rhs, RetScalar>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, typename Lhs::Scalar &res)
-  {
-    product_coeff_vectorized_dyn_selector<Lhs,Rhs>::run(row, col, lhs, rhs, res);
-  }
-};
-
-/*******************
-*** Packet path  ***
-*******************/
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<RowMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
-  {
-    product_packet_impl<RowMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
-    res =  pmadd(pset1<Packet>(lhs.coeff(row, UnrollingIndex-1)), rhs.template packet<LoadMode>(UnrollingIndex-1, col), res);
-  }
-};
-
-template<int UnrollingIndex, typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<ColMajor, UnrollingIndex, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
-  {
-    product_packet_impl<ColMajor, UnrollingIndex-1, Lhs, Rhs, Packet, LoadMode>::run(row, col, lhs, rhs, res);
-    res =  pmadd(lhs.template packet<LoadMode>(row, UnrollingIndex-1), pset1<Packet>(rhs.coeff(UnrollingIndex-1, col)), res);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<RowMajor, 1, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
-  {
-    res = pmul(pset1<Packet>(lhs.coeff(row, 0)),rhs.template packet<LoadMode>(0, col));
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<ColMajor, 1, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet &res)
-  {
-    res = pmul(lhs.template packet<LoadMode>(row, 0), pset1<Packet>(rhs.coeff(0, col)));
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<RowMajor, 0, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Packet &res)
-  {
-    res = pset1<Packet>(0);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<ColMajor, 0, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index /*row*/, Index /*col*/, const Lhs& /*lhs*/, const Rhs& /*rhs*/, Packet &res)
-  {
-    res = pset1<Packet>(0);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<RowMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
-  {
-    res = pset1<Packet>(0);
-    for(Index i = 0; i < lhs.cols(); ++i)
-      res =  pmadd(pset1<Packet>(lhs.coeff(row, i)), rhs.template packet<LoadMode>(i, col), res);
-  }
-};
-
-template<typename Lhs, typename Rhs, typename Packet, int LoadMode>
-struct product_packet_impl<ColMajor, Dynamic, Lhs, Rhs, Packet, LoadMode>
-{
-  typedef typename Lhs::Index Index;
-  static EIGEN_STRONG_INLINE void run(Index row, Index col, const Lhs& lhs, const Rhs& rhs, Packet& res)
-  {
-    res = pset1<Packet>(0);
-    for(Index i = 0; i < lhs.cols(); ++i)
-      res =  pmadd(lhs.template packet<LoadMode>(row, i), pset1<Packet>(rhs.coeff(i, col)), res);
-  }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_COEFFBASED_PRODUCT_H
diff --git a/inst/include/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/inst/include/Eigen/src/Core/products/GeneralBlockPanelKernel.h
index bcdca5b0..e72c6b48 100644
--- a/inst/include/Eigen/src/Core/products/GeneralBlockPanelKernel.h
+++ b/inst/include/Eigen/src/Core/products/GeneralBlockPanelKernel.h
@@ -10,1095 +10,2538 @@
 #ifndef EIGEN_GENERAL_BLOCK_PANEL_H
 #define EIGEN_GENERAL_BLOCK_PANEL_H
 
-namespace Eigen { 
-  
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
 namespace internal {
 
-template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs=false, bool _ConjRhs=false>
-class gebp_traits;
+enum GEBPPacketSizeType { GEBPPacketFull = 0, GEBPPacketHalf, GEBPPacketQuarter };
 
+template <typename LhsScalar_, typename RhsScalar_, bool ConjLhs_ = false, bool ConjRhs_ = false,
+          int Arch = Architecture::Target, int PacketSize_ = GEBPPacketFull>
+class gebp_traits;
 
 /** \internal \returns b if a<=0, and returns a otherwise. */
-inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b)
-{
-  return a<=0 ? b : a;
-}
+inline std::ptrdiff_t manage_caching_sizes_helper(std::ptrdiff_t a, std::ptrdiff_t b) { return a <= 0 ? b : a; }
+
+#if defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) EIGEN_DEFAULT_L1_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L1_CACHE_SIZE(val) val
+#endif  // defined(EIGEN_DEFAULT_L1_CACHE_SIZE)
+
+#if defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) EIGEN_DEFAULT_L2_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L2_CACHE_SIZE(val) val
+#endif  // defined(EIGEN_DEFAULT_L2_CACHE_SIZE)
+
+#if defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
+#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) EIGEN_DEFAULT_L3_CACHE_SIZE
+#else
+#define EIGEN_SET_DEFAULT_L3_CACHE_SIZE(val) val
+#endif  // defined(EIGEN_DEFAULT_L3_CACHE_SIZE)
+
+#if EIGEN_ARCH_i386_OR_x86_64
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(32 * 1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(256 * 1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(2 * 1024 * 1024);
+#elif EIGEN_ARCH_PPC
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(64 * 1024);
+#ifdef _ARCH_PWR10
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(2 * 1024 * 1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(8 * 1024 * 1024);
+#else
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(4 * 1024 * 1024);
+#endif
+#else
+const std::ptrdiff_t defaultL1CacheSize = EIGEN_SET_DEFAULT_L1_CACHE_SIZE(16 * 1024);
+const std::ptrdiff_t defaultL2CacheSize = EIGEN_SET_DEFAULT_L2_CACHE_SIZE(512 * 1024);
+const std::ptrdiff_t defaultL3CacheSize = EIGEN_SET_DEFAULT_L3_CACHE_SIZE(512 * 1024);
+#endif
+
+#undef EIGEN_SET_DEFAULT_L1_CACHE_SIZE
+#undef EIGEN_SET_DEFAULT_L2_CACHE_SIZE
+#undef EIGEN_SET_DEFAULT_L3_CACHE_SIZE
+
+/** \internal */
+struct CacheSizes {
+  CacheSizes() : m_l1(-1), m_l2(-1), m_l3(-1) {
+    int l1CacheSize, l2CacheSize, l3CacheSize;
+    queryCacheSizes(l1CacheSize, l2CacheSize, l3CacheSize);
+    m_l1 = manage_caching_sizes_helper(l1CacheSize, defaultL1CacheSize);
+    m_l2 = manage_caching_sizes_helper(l2CacheSize, defaultL2CacheSize);
+    m_l3 = manage_caching_sizes_helper(l3CacheSize, defaultL3CacheSize);
+  }
+
+  std::ptrdiff_t m_l1;
+  std::ptrdiff_t m_l2;
+  std::ptrdiff_t m_l3;
+};
 
 /** \internal */
-inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1=0, std::ptrdiff_t* l2=0)
-{
-  static std::ptrdiff_t m_l1CacheSize = 0;
-  static std::ptrdiff_t m_l2CacheSize = 0;
-  if(m_l2CacheSize==0)
-  {
-    m_l1CacheSize = manage_caching_sizes_helper(queryL1CacheSize(),8 * 1024);
-    m_l2CacheSize = manage_caching_sizes_helper(queryTopLevelCacheSize(),1*1024*1024);
-  }
-  
-  if(action==SetAction)
-  {
+inline void manage_caching_sizes(Action action, std::ptrdiff_t* l1, std::ptrdiff_t* l2, std::ptrdiff_t* l3) {
+  static CacheSizes m_cacheSizes;
+
+  if (action == SetAction) {
     // set the cpu cache size and cache all block sizes from a global cache size in byte
-    eigen_internal_assert(l1!=0 && l2!=0);
-    m_l1CacheSize = *l1;
-    m_l2CacheSize = *l2;
-  }
-  else if(action==GetAction)
-  {
-    eigen_internal_assert(l1!=0 && l2!=0);
-    *l1 = m_l1CacheSize;
-    *l2 = m_l2CacheSize;
-  }
-  else
-  {
+    eigen_internal_assert(l1 != 0 && l2 != 0);
+    m_cacheSizes.m_l1 = *l1;
+    m_cacheSizes.m_l2 = *l2;
+    m_cacheSizes.m_l3 = *l3;
+  } else if (action == GetAction) {
+    eigen_internal_assert(l1 != 0 && l2 != 0);
+    *l1 = m_cacheSizes.m_l1;
+    *l2 = m_cacheSizes.m_l2;
+    *l3 = m_cacheSizes.m_l3;
+  } else {
     eigen_internal_assert(false);
   }
 }
 
-/** \brief Computes the blocking parameters for a m x k times k x n matrix product
-  *
-  * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
-  * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
-  * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same dimension.
-  *
-  * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
-  * this function computes the blocking size parameters along the respective dimensions
-  * for matrix products and related algorithms. The blocking sizes depends on various
-  * parameters:
-  * - the L1 and L2 cache sizes,
-  * - the register level blocking sizes defined by gebp_traits,
-  * - the number of scalars that fit into a packet (when vectorization is enabled).
-  *
-  * \sa setCpuCacheSizes */
-template<typename LhsScalar, typename RhsScalar, int KcFactor, typename SizeType>
-void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
-{
-  EIGEN_UNUSED_VARIABLE(n);
-  // Explanations:
-  // Let's recall the product algorithms form kc x nc horizontal panels B' on the rhs and
-  // mc x kc blocks A' on the lhs. A' has to fit into L2 cache. Moreover, B' is processed
-  // per kc x nr vertical small panels where nr is the blocking size along the n dimension
-  // at the register level. For vectorization purpose, these small vertical panels are unpacked,
-  // e.g., each coefficient is replicated to fit a packet. This small vertical panel has to
-  // stay in L1 cache.
-  std::ptrdiff_t l1, l2;
-
-  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
-  enum {
-    kdiv = KcFactor * 2 * Traits::nr
-         * Traits::RhsProgress * sizeof(RhsScalar),
-    mr = gebp_traits<LhsScalar,RhsScalar>::mr,
-    mr_mask = (0xffffffff/mr)*mr
-  };
+/* Helper for computeProductBlockingSizes.
+ *
+ * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+ * this function computes the blocking size parameters along the respective dimensions
+ * for matrix products and related algorithms. The blocking sizes depends on various
+ * parameters:
+ * - the L1 and L2 cache sizes,
+ * - the register level blocking sizes defined by gebp_traits,
+ * - the number of scalars that fit into a packet (when vectorization is enabled).
+ *
+ * \sa setCpuCacheSizes */
 
-  manage_caching_sizes(GetAction, &l1, &l2);
-  k = std::min<SizeType>(k, l1/kdiv);
-  SizeType _m = k>0 ? l2/(4 * sizeof(LhsScalar) * k) : 0;
-  if(_m<m) m = _m & mr_mask;
-}
+template <typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
+void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index num_threads = 1) {
+  typedef gebp_traits<LhsScalar, RhsScalar> Traits;
 
-template<typename LhsScalar, typename RhsScalar, typename SizeType>
-inline void computeProductBlockingSizes(SizeType& k, SizeType& m, SizeType& n)
-{
-  computeProductBlockingSizes<LhsScalar,RhsScalar,1>(k, m, n);
-}
+  // Explanations:
+  // Let's recall that the product algorithms form mc x kc vertical panels A' on the lhs and
+  // kc x nc blocks B' on the rhs. B' has to fit into L2/L3 cache. Moreover, A' is processed
+  // per mr x kc horizontal small panels where mr is the blocking size along the m dimension
+  // at the register level. This small horizontal panel has to stay within L1 cache.
+  std::ptrdiff_t l1, l2, l3;
+  manage_caching_sizes(GetAction, &l1, &l2, &l3);
+#ifdef EIGEN_VECTORIZE_AVX512
+  // We need to find a rationale for that, but without this adjustment,
+  // performance with AVX512 is pretty bad, like -20% slower.
+  // One reason is that with increasing packet-size, the blocking size k
+  // has to become pretty small if we want that 1 lhs panel fit within L1.
+  // For instance, with the 3pX4 kernel and double, the size of the lhs+rhs panels are:
+  //   k*(3*64 + 4*8) Bytes, with l1=32kBytes, and k%8=0, we have k=144.
+  // This is quite small for a good reuse of the accumulation registers.
+  l1 *= 4;
+#endif
 
-#ifdef EIGEN_HAS_FUSE_CJMADD
-  #define MADD(CJ,A,B,C,T)  C = CJ.pmadd(A,B,C);
-#else
+  if (num_threads > 1) {
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      kdiv = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      ksub = Traits::mr * (Traits::nr * sizeof(ResScalar)),
+      kr = 8,
+      mr = Traits::mr,
+      nr = Traits::nr
+    };
+    // Increasing k gives us more time to prefetch the content of the "C"
+    // registers. However once the latency is hidden there is no point in
+    // increasing the value of k, so we'll cap it at 320 (value determined
+    // experimentally).
+    // To avoid that k vanishes, we make k_cache at least as big as kr
+    const Index k_cache = numext::maxi<Index>(kr, (numext::mini<Index>)((l1 - ksub) / kdiv, 320));
+    if (k_cache < k) {
+      k = k_cache - (k_cache % kr);
+      eigen_internal_assert(k > 0);
+    }
 
-  // FIXME (a bit overkill maybe ?)
+    const Index n_cache = (l2 - l1) / (nr * sizeof(RhsScalar) * k);
+    const Index n_per_thread = numext::div_ceil(n, num_threads);
+    if (n_cache <= n_per_thread) {
+      // Don't exceed the capacity of the l2 cache.
+      eigen_internal_assert(n_cache >= static_cast<Index>(nr));
+      n = n_cache - (n_cache % nr);
+      eigen_internal_assert(n > 0);
+    } else {
+      n = (numext::mini<Index>)(n, (n_per_thread + nr - 1) - ((n_per_thread + nr - 1) % nr));
+    }
 
-  template<typename CJ, typename A, typename B, typename C, typename T> struct gebp_madd_selector {
-    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, A& a, B& b, C& c, T& /*t*/)
-    {
-      c = cj.pmadd(a,b,c);
+    if (l3 > l2) {
+      // l3 is shared between all cores, so we'll give each thread its own chunk of l3.
+      const Index m_cache = (l3 - l2) / (sizeof(LhsScalar) * k * num_threads);
+      const Index m_per_thread = numext::div_ceil(m, num_threads);
+      if (m_cache < m_per_thread && m_cache >= static_cast<Index>(mr)) {
+        m = m_cache - (m_cache % mr);
+        eigen_internal_assert(m > 0);
+      } else {
+        m = (numext::mini<Index>)(m, (m_per_thread + mr - 1) - ((m_per_thread + mr - 1) % mr));
+      }
     }
-  };
+  } else {
+    // In unit tests we do not want to use extra large matrices,
+    // so we reduce the cache size to check the blocking strategy is not flawed
+#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    l1 = 9 * 1024;
+    l2 = 32 * 1024;
+    l3 = 512 * 1024;
+#endif
 
-  template<typename CJ, typename T> struct gebp_madd_selector<CJ,T,T,T,T> {
-    EIGEN_ALWAYS_INLINE static void run(const CJ& cj, T& a, T& b, T& c, T& t)
-    {
-      t = b; t = cj.pmul(a,t); c = padd(c,t);
+    // Early return for small problems because the computation below are time consuming for small problems.
+    // Perhaps it would make more sense to consider k*n*m??
+    // Note that for very tiny problem, this function should be bypassed anyway
+    // because we use the coefficient-based implementation for them.
+    if ((numext::maxi)(k, (numext::maxi)(m, n)) < 48) return;
+
+    typedef typename Traits::ResScalar ResScalar;
+    enum {
+      k_peeling = 8,
+      k_div = KcFactor * (Traits::mr * sizeof(LhsScalar) + Traits::nr * sizeof(RhsScalar)),
+      k_sub = Traits::mr * (Traits::nr * sizeof(ResScalar))
+    };
+
+    // ---- 1st level of blocking on L1, yields kc ----
+
+    // Blocking on the third dimension (i.e., k) is chosen so that an horizontal panel
+    // of size mr x kc of the lhs plus a vertical panel of kc x nr of the rhs both fits within L1 cache.
+    // We also include a register-level block of the result (mx x nr).
+    // (In an ideal world only the lhs panel would stay in L1)
+    // Moreover, kc has to be a multiple of 8 to be compatible with loop peeling, leading to a maximum blocking size of:
+    const Index max_kc = numext::maxi<Index>(((l1 - k_sub) / k_div) & (~(k_peeling - 1)), 1);
+    const Index old_k = k;
+    if (k > max_kc) {
+      // We are really blocking on the third dimension:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the result.
+      k = (k % max_kc) == 0 ? max_kc
+                            : max_kc - k_peeling * ((max_kc - 1 - (k % max_kc)) / (k_peeling * (k / max_kc + 1)));
+
+      eigen_internal_assert(((old_k / k) == (old_k / max_kc)) && "the number of sweeps has to remain the same");
     }
-  };
 
-  template<typename CJ, typename A, typename B, typename C, typename T>
-  EIGEN_STRONG_INLINE void gebp_madd(const CJ& cj, A& a, B& b, C& c, T& t)
-  {
-    gebp_madd_selector<CJ,A,B,C,T>::run(cj,a,b,c,t);
+// ---- 2nd level of blocking on max(L2,L3), yields nc ----
+
+// TODO find a reliable way to get the actual amount of cache per core to use for 2nd level blocking, that is:
+//      actual_l2 = max(l2, l3/nb_core_sharing_l3)
+// The number below is quite conservative: it is better to underestimate the cache size rather than overestimating it)
+// For instance, it corresponds to 6MB of L3 shared among 4 cores.
+#ifdef EIGEN_DEBUG_SMALL_PRODUCT_BLOCKS
+    const Index actual_l2 = l3;
+#else
+    const Index actual_l2 = 1572864;  // == 1.5 MB
+#endif
+
+    // Here, nc is chosen such that a block of kc x nc of the rhs fit within half of L2.
+    // The second half is implicitly reserved to access the result and lhs coefficients.
+    // When k<max_kc, then nc can arbitrarily growth. In practice, it seems to be fruitful
+    // to limit this growth: we bound nc to growth by a factor x1.5.
+    // However, if the entire lhs block fit within L1, then we are not going to block on the rows at all,
+    // and it becomes fruitful to keep the packed rhs blocks in L1 if there is enough remaining space.
+    Index max_nc;
+    const Index lhs_bytes = m * k * sizeof(LhsScalar);
+    const Index remaining_l1 = l1 - k_sub - lhs_bytes;
+    if (remaining_l1 >= Index(Traits::nr * sizeof(RhsScalar)) * k) {
+      // L1 blocking
+      max_nc = remaining_l1 / (k * sizeof(RhsScalar));
+    } else {
+      // L2 blocking
+      max_nc = (3 * actual_l2) / (2 * 2 * max_kc * sizeof(RhsScalar));
+    }
+    // WARNING Below, we assume that Traits::nr is a power of two.
+    Index nc = numext::mini<Index>(actual_l2 / (2 * k * sizeof(RhsScalar)), max_nc) & (~(Traits::nr - 1));
+    if (n > nc) {
+      // We are really blocking over the columns:
+      // -> reduce blocking size to make sure the last block is as large as possible
+      //    while keeping the same number of sweeps over the packed lhs.
+      //    Here we allow one more sweep if this gives us a perfect match, thus the commented "-1"
+      n = (n % nc) == 0 ? nc : (nc - Traits::nr * ((nc /*-1*/ - (n % nc)) / (Traits::nr * (n / nc + 1))));
+    } else if (old_k == k) {
+      // So far, no blocking at all, i.e., kc==k, and nc==n.
+      // In this case, let's perform a blocking over the rows such that the packed lhs data is kept in cache L1/L2
+      // TODO: part of this blocking strategy is now implemented within the kernel itself, so the L1-based heuristic
+      // here should be obsolete.
+      Index problem_size = k * n * sizeof(LhsScalar);
+      Index actual_lm = actual_l2;
+      Index max_mc = m;
+      if (problem_size <= 1024) {
+        // problem is small enough to keep in L1
+        // Let's choose m such that lhs's block fit in 1/3 of L1
+        actual_lm = l1;
+      } else if (l3 != 0 && problem_size <= 32768) {
+        // we have both L2 and L3, and problem is small enough to be kept in L2
+        // Let's choose m such that lhs's block fit in 1/3 of L2
+        actual_lm = l2;
+        max_mc = (numext::mini<Index>)(576, max_mc);
+      }
+      Index mc = (numext::mini<Index>)(actual_lm / (3 * k * sizeof(LhsScalar)), max_mc);
+      if (mc > Traits::mr)
+        mc -= mc % Traits::mr;
+      else if (mc == 0)
+        return;
+      m = (m % mc) == 0 ? mc : (mc - Traits::mr * ((mc /*-1*/ - (m % mc)) / (Traits::mr * (m / mc + 1))));
+    }
   }
+}
 
-  #define MADD(CJ,A,B,C,T)  gebp_madd(CJ,A,B,C,T);
-//   #define MADD(CJ,A,B,C,T)  T = B; T = CJ.pmul(A,T); C = padd(C,T);
+template <typename Index>
+inline bool useSpecificBlockingSizes(Index& k, Index& m, Index& n) {
+#ifdef EIGEN_TEST_SPECIFIC_BLOCKING_SIZES
+  if (EIGEN_TEST_SPECIFIC_BLOCKING_SIZES) {
+    k = numext::mini<Index>(k, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_K);
+    m = numext::mini<Index>(m, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_M);
+    n = numext::mini<Index>(n, EIGEN_TEST_SPECIFIC_BLOCKING_SIZE_N);
+    return true;
+  }
+#else
+  EIGEN_UNUSED_VARIABLE(k)
+  EIGEN_UNUSED_VARIABLE(m)
+  EIGEN_UNUSED_VARIABLE(n)
 #endif
+  return false;
+}
+
+/** \brief Computes the blocking parameters for a m x k times k x n matrix product
+ *
+ * \param[in,out] k Input: the third dimension of the product. Output: the blocking size along the same dimension.
+ * \param[in,out] m Input: the number of rows of the left hand side. Output: the blocking size along the same dimension.
+ * \param[in,out] n Input: the number of columns of the right hand side. Output: the blocking size along the same
+ *                         dimension.
+ * \param[in] num_threads Input: the number of threads used for the computation.
+ *
+ * Given a m x k times k x n matrix product of scalar types \c LhsScalar and \c RhsScalar,
+ * this function computes the blocking size parameters along the respective dimensions
+ * for matrix products and related algorithms.
+ *
+ * The blocking size parameters may be evaluated:
+ *   - either by a heuristic based on cache sizes;
+ *   - or using fixed prescribed values (for testing purposes).
+ *
+ * \sa setCpuCacheSizes */
+
+template <typename LhsScalar, typename RhsScalar, int KcFactor, typename Index>
+void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) {
+  if (!useSpecificBlockingSizes(k, m, n)) {
+    evaluateProductBlockingSizesHeuristic<LhsScalar, RhsScalar, KcFactor, Index>(k, m, n, num_threads);
+  }
+}
+
+template <typename LhsScalar, typename RhsScalar, typename Index>
+inline void computeProductBlockingSizes(Index& k, Index& m, Index& n, Index num_threads = 1) {
+  computeProductBlockingSizes<LhsScalar, RhsScalar, 1, Index>(k, m, n, num_threads);
+}
+
+template <typename RhsPacket, typename RhsPacketx4, int registers_taken>
+struct RhsPanelHelper {
+ private:
+  static constexpr int remaining_registers =
+      (std::max)(int(EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS) - registers_taken, 0);
+
+ public:
+  typedef std::conditional_t<remaining_registers >= 4, RhsPacketx4, RhsPacket> type;
+};
+
+template <typename Packet>
+struct QuadPacket {
+  Packet B_0, B1, B2, B3;
+  const Packet& get(const FixedInt<0>&) const { return B_0; }
+  const Packet& get(const FixedInt<1>&) const { return B1; }
+  const Packet& get(const FixedInt<2>&) const { return B2; }
+  const Packet& get(const FixedInt<3>&) const { return B3; }
+};
+
+template <int N, typename T1, typename T2, typename T3>
+struct packet_conditional {
+  typedef T3 type;
+};
+
+template <typename T1, typename T2, typename T3>
+struct packet_conditional<GEBPPacketFull, T1, T2, T3> {
+  typedef T1 type;
+};
+
+template <typename T1, typename T2, typename T3>
+struct packet_conditional<GEBPPacketHalf, T1, T2, T3> {
+  typedef T2 type;
+};
+
+#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size)                                               \
+  typedef typename packet_conditional<                                                                     \
+      packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half, \
+      typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type name##Packet##postfix
+
+#define PACKET_DECL_COND(name, packet_size)                                                                \
+  typedef typename packet_conditional<                                                                     \
+      packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half, \
+      typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type name##Packet
+
+#define PACKET_DECL_COND_SCALAR_POSTFIX(postfix, packet_size)                                  \
+  typedef typename packet_conditional<                                                         \
+      packet_size, typename packet_traits<Scalar>::type, typename packet_traits<Scalar>::half, \
+      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type ScalarPacket##postfix
+
+#define PACKET_DECL_COND_SCALAR(packet_size)                                                   \
+  typedef typename packet_conditional<                                                         \
+      packet_size, typename packet_traits<Scalar>::type, typename packet_traits<Scalar>::half, \
+      typename unpacket_traits<typename packet_traits<Scalar>::half>::half>::type ScalarPacket
 
 /* Vectorization logic
  *  real*real: unpack rhs to constant packets, ...
- * 
+ *
  *  cd*cd : unpack rhs to (b_r,b_r), (b_i,b_i), mul to get (a_r b_r,a_i b_r) (a_r b_i,a_i b_i),
  *          storing each res packet into two packets (2x2),
- *          at the end combine them: swap the second and addsub them 
+ *          at the end combine them: swap the second and addsub them
  *  cf*cf : same but with 2x4 blocks
  *  cplx*real : unpack rhs to constant packets, ...
  *  real*cplx : load lhs as (a0,a0,a1,a1), and mul as usual
  */
-template<typename _LhsScalar, typename _RhsScalar, bool _ConjLhs, bool _ConjRhs>
-class gebp_traits
-{
-public:
-  typedef _LhsScalar LhsScalar;
-  typedef _RhsScalar RhsScalar;
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+template <typename LhsScalar_, typename RhsScalar_, bool ConjLhs_, bool ConjRhs_, int Arch, int PacketSize_>
+class gebp_traits {
+ public:
+  typedef LhsScalar_ LhsScalar;
+  typedef RhsScalar_ RhsScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+  PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
+  PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
+  PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
 
   enum {
-    ConjLhs = _ConjLhs,
-    ConjRhs = _ConjRhs,
-    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
-    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
-    
+    ConjLhs = ConjLhs_,
+    ConjRhs = ConjRhs_,
+    Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable,
+    LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
+    ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
+
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
 
-    // register block size along the N direction (must be either 2 or 4)
-    nr = NumberOfRegisters/4,
+    // register block size along the N direction must be 1 or 4
+    nr = 4,
 
     // register block size along the M direction (currently, this one cannot be modified)
-    mr = 2 * LhsPacketSize,
-    
-    WorkSpaceFactor = nr * RhsPacketSize,
+    default_mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * LhsPacketSize,
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && \
+    !defined(EIGEN_VECTORIZE_VSX) && ((!EIGEN_COMP_MSVC) || (EIGEN_COMP_MSVC >= 1914))
+    // we assume 16 registers or more
+    // See bug 992, if the scalar type is not vectorizable but that EIGEN_HAS_SINGLE_INSTRUCTION_MADD is defined,
+    // then using 3*LhsPacketSize triggers non-implemented paths in syrk.
+    // Bug 1515: MSVC prior to v19.14 yields to register spilling.
+    mr = Vectorizable ? 3 * LhsPacketSize : default_mr,
+#else
+    mr = default_mr,
+#endif
 
     LhsProgress = LhsPacketSize,
-    RhsProgress = RhsPacketSize
+    RhsProgress = 1
   };
 
-  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-  typedef typename packet_traits<ResScalar>::type  _ResPacket;
-
-  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
-  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
-  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
+  typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
+  typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
+  typedef LhsPacket LhsPacket4Packing;
 
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
   typedef ResPacket AccPacket;
-  
-  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
-  {
-    p = pset1<ResPacket>(ResScalar(0));
+
+  EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }
+
+  template <typename RhsPacketType>
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const {
+    dest = pset1<RhsPacketType>(*b);
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
+  }
+
+  template <typename RhsPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const {
+    loadRhs(b, dest);
   }
 
-  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
-  {
-    for(DenseIndex k=0; k<n; k++)
-      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
+
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad<RhsPacket>(b); }
+
+  template <typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacketType& dest) const {
+    dest = pload<LhsPacketType>(a);
   }
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    dest = pload<RhsPacket>(b);
+  template <typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
+    dest = ploadu<LhsPacketType>(a);
   }
 
-  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
-  {
-    dest = pload<LhsPacket>(a);
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
+                                const LaneIdType&) const {
+    conj_helper<LhsPacketType, RhsPacketType, ConjLhs, ConjRhs> cj;
+    // It would be a lot cleaner to call pmadd all the time. Unfortunately if we
+    // let gcc allocate the register in which to store the result of the pmul
+    // (in the case where there is no FMA) gcc fails to figure out how to avoid
+    // spilling register.
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+    EIGEN_UNUSED_VARIABLE(tmp);
+    c = cj.pmadd(a, b, c);
+#else
+    tmp = b;
+    tmp = cj.pmul(a, tmp);
+    c = padd(c, tmp);
+#endif
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, AccPacket& tmp) const
-  {
-    tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);
+  template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
+                                const LaneIdType& lane) const {
+    madd(a, b.get(lane), c, tmp, lane);
   }
 
-  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
-  {
-    r = pmadd(c,alpha,r);
+  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const {
+    r = pmadd(c, alpha, r);
   }
 
-protected:
-//   conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
-//   conj_helper<LhsPacket,RhsPacket,ConjLhs,ConjRhs> pcj;
+  template <typename ResPacketHalf>
+  EIGEN_STRONG_INLINE void acc(const ResPacketHalf& c, const ResPacketHalf& alpha, ResPacketHalf& r) const {
+    r = pmadd(c, alpha, r);
+  }
 };
 
-template<typename RealScalar, bool _ConjLhs>
-class gebp_traits<std::complex<RealScalar>, RealScalar, _ConjLhs, false>
-{
-public:
+template <typename RealScalar, bool ConjLhs_, int Arch, int PacketSize_>
+class gebp_traits<std::complex<RealScalar>, RealScalar, ConjLhs_, false, Arch, PacketSize_> {
+ public:
   typedef std::complex<RealScalar> LhsScalar;
   typedef RealScalar RhsScalar;
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+  PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
+  PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
+  PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
 
   enum {
-    ConjLhs = _ConjLhs,
+    ConjLhs = ConjLhs_,
     ConjRhs = false,
-    Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable,
-    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
-    
+    Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable,
+    LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
+    ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
+
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
-    nr = NumberOfRegisters/4,
-    mr = 2 * LhsPacketSize,
-    WorkSpaceFactor = nr*RhsPacketSize,
+    nr = 4,
+#if defined(EIGEN_HAS_SINGLE_INSTRUCTION_MADD) && !defined(EIGEN_VECTORIZE_ALTIVEC) && !defined(EIGEN_VECTORIZE_VSX)
+    // we assume 16 registers
+    mr = 3 * LhsPacketSize,
+#else
+    mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * LhsPacketSize,
+#endif
 
     LhsProgress = LhsPacketSize,
-    RhsProgress = RhsPacketSize
+    RhsProgress = 1
   };
 
-  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-  typedef typename packet_traits<ResScalar>::type  _ResPacket;
+  typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
+  typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
+  typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
+  typedef LhsPacket LhsPacket4Packing;
 
-  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
-  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
-  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
 
   typedef ResPacket AccPacket;
 
-  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
-  {
-    p = pset1<ResPacket>(ResScalar(0));
+  EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }
+
+  template <typename RhsPacketType>
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const {
+    dest = pset1<RhsPacketType>(*b);
   }
 
-  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
-  {
-    for(DenseIndex k=0; k<n; k++)
-      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
   }
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    dest = pload<RhsPacket>(b);
+  template <typename RhsPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const {
+    loadRhs(b, dest);
   }
 
-  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
-  {
-    dest = pload<LhsPacket>(a);
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
+
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const {
+    loadRhsQuad_impl(b, dest, std::conditional_t<RhsPacketSize == 16, true_type, false_type>());
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
-  {
-    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
+  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const true_type&) const {
+    // FIXME we can do better!
+    // what we want here is a ploadheight
+    RhsScalar tmp[4] = {b[0], b[0], b[1], b[1]};
+    dest = ploadquad<RhsPacket>(tmp);
   }
 
-  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
-  {
-    tmp = b; tmp = pmul(a.v,tmp); c.v = padd(c.v,tmp);
+  EIGEN_STRONG_INLINE void loadRhsQuad_impl(const RhsScalar* b, RhsPacket& dest, const false_type&) const {
+    eigen_internal_assert(RhsPacketSize <= 8);
+    dest = pset1<RhsPacket>(*b);
   }
 
-  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
-  {
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = pload<LhsPacket>(a); }
+
+  template <typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
+    dest = ploadu<LhsPacketType>(a);
+  }
+
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
+                                const LaneIdType&) const {
+    madd_impl(a, b, c, tmp, std::conditional_t<Vectorizable, true_type, false_type>());
+  }
+
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c,
+                                     RhsPacketType& tmp, const true_type&) const {
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+    EIGEN_UNUSED_VARIABLE(tmp);
+    c.v = pmadd(a.v, b, c.v);
+#else
+    tmp = b;
+    tmp = pmul(a.v, tmp);
+    c.v = padd(c.v, tmp);
+#endif
+  }
+
+  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/,
+                                     const false_type&) const {
     c += a * b;
   }
 
-  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
-  {
-    r = cj.pmadd(c,alpha,r);
+  template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
+                                const LaneIdType& lane) const {
+    madd(a, b.get(lane), c, tmp, lane);
   }
 
-protected:
-  conj_helper<ResPacket,ResPacket,ConjLhs,false> cj;
+  template <typename ResPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const {
+    conj_helper<ResPacketType, ResPacketType, ConjLhs, false> cj;
+    r = cj.pmadd(c, alpha, r);
+  }
+
+ protected:
+};
+
+template <typename Packet>
+struct DoublePacket {
+  Packet first;
+  Packet second;
 };
 
-template<typename RealScalar, bool _ConjLhs, bool _ConjRhs>
-class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, _ConjLhs, _ConjRhs >
-{
-public:
-  typedef std::complex<RealScalar>  Scalar;
-  typedef std::complex<RealScalar>  LhsScalar;
-  typedef std::complex<RealScalar>  RhsScalar;
-  typedef std::complex<RealScalar>  ResScalar;
-  
+template <typename Packet>
+DoublePacket<Packet> padd(const DoublePacket<Packet>& a, const DoublePacket<Packet>& b) {
+  DoublePacket<Packet> res;
+  res.first = padd(a.first, b.first);
+  res.second = padd(a.second, b.second);
+  return res;
+}
+
+// note that for DoublePacket<RealPacket> the "4" in "downto4"
+// corresponds to the number of complexes, so it means "8"
+// it terms of real coefficients.
+
+template <typename Packet>
+const DoublePacket<Packet>& predux_half_dowto4(const DoublePacket<Packet>& a,
+                                               std::enable_if_t<unpacket_traits<Packet>::size <= 8>* = 0) {
+  return a;
+}
+
+template <typename Packet>
+DoublePacket<typename unpacket_traits<Packet>::half> predux_half_dowto4(
+    const DoublePacket<Packet>& a, std::enable_if_t<unpacket_traits<Packet>::size == 16>* = 0) {
+  // yes, that's pretty hackish :(
+  DoublePacket<typename unpacket_traits<Packet>::half> res;
+  typedef std::complex<typename unpacket_traits<Packet>::type> Cplx;
+  typedef typename packet_traits<Cplx>::type CplxPacket;
+  res.first = predux_half_dowto4(CplxPacket(a.first)).v;
+  res.second = predux_half_dowto4(CplxPacket(a.second)).v;
+  return res;
+}
+
+// same here, "quad" actually means "8" in terms of real coefficients
+template <typename Scalar, typename RealPacket>
+void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
+                            std::enable_if_t<unpacket_traits<RealPacket>::size <= 8>* = 0) {
+  dest.first = pset1<RealPacket>(numext::real(*b));
+  dest.second = pset1<RealPacket>(numext::imag(*b));
+}
+
+template <typename Scalar, typename RealPacket>
+void loadQuadToDoublePacket(const Scalar* b, DoublePacket<RealPacket>& dest,
+                            std::enable_if_t<unpacket_traits<RealPacket>::size == 16>* = 0) {
+  // yes, that's pretty hackish too :(
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  RealScalar r[4] = {numext::real(b[0]), numext::real(b[0]), numext::real(b[1]), numext::real(b[1])};
+  RealScalar i[4] = {numext::imag(b[0]), numext::imag(b[0]), numext::imag(b[1]), numext::imag(b[1])};
+  dest.first = ploadquad<RealPacket>(r);
+  dest.second = ploadquad<RealPacket>(i);
+}
+
+template <typename Packet>
+struct unpacket_traits<DoublePacket<Packet> > {
+  typedef DoublePacket<typename unpacket_traits<Packet>::half> half;
+  enum { size = 2 * unpacket_traits<Packet>::size };
+};
+// template<typename Packet>
+// DoublePacket<Packet> pmadd(const DoublePacket<Packet> &a, const DoublePacket<Packet> &b)
+// {
+//   DoublePacket<Packet> res;
+//   res.first  = padd(a.first, b.first);
+//   res.second = padd(a.second,b.second);
+//   return res;
+// }
+
+template <typename RealScalar, bool ConjLhs_, bool ConjRhs_, int Arch, int PacketSize_>
+class gebp_traits<std::complex<RealScalar>, std::complex<RealScalar>, ConjLhs_, ConjRhs_, Arch, PacketSize_> {
+ public:
+  typedef std::complex<RealScalar> Scalar;
+  typedef std::complex<RealScalar> LhsScalar;
+  typedef std::complex<RealScalar> RhsScalar;
+  typedef std::complex<RealScalar> ResScalar;
+
+  PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
+  PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
+  PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
+  PACKET_DECL_COND(Real, PacketSize_);
+  PACKET_DECL_COND_SCALAR(PacketSize_);
+
   enum {
-    ConjLhs = _ConjLhs,
-    ConjRhs = _ConjRhs,
-    Vectorizable = packet_traits<RealScalar>::Vectorizable
-                && packet_traits<Scalar>::Vectorizable,
-    RealPacketSize  = Vectorizable ? packet_traits<RealScalar>::size : 1,
-    ResPacketSize   = Vectorizable ? packet_traits<ResScalar>::size : 1,
-    
-    nr = 2,
-    mr = 2 * ResPacketSize,
-    WorkSpaceFactor = Vectorizable ? 2*nr*RealPacketSize : nr,
+    ConjLhs = ConjLhs_,
+    ConjRhs = ConjRhs_,
+    Vectorizable = unpacket_traits<RealPacket>::vectorizable && unpacket_traits<ScalarPacket>::vectorizable,
+    ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
+    LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<RhsScalar>::size : 1,
+    RealPacketSize = Vectorizable ? unpacket_traits<RealPacket>::size : 1,
+    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+
+    nr = 4,
+    mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * ResPacketSize,
 
     LhsProgress = ResPacketSize,
-    RhsProgress = Vectorizable ? 2*ResPacketSize : 1
-  };
-  
-  typedef typename packet_traits<RealScalar>::type RealPacket;
-  typedef typename packet_traits<Scalar>::type     ScalarPacket;
-  struct DoublePacket
-  {
-    RealPacket first;
-    RealPacket second;
+    RhsProgress = 1
   };
 
-  typedef typename conditional<Vectorizable,RealPacket,  Scalar>::type LhsPacket;
-  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type RhsPacket;
-  typedef typename conditional<Vectorizable,ScalarPacket,Scalar>::type ResPacket;
-  typedef typename conditional<Vectorizable,DoublePacket,Scalar>::type AccPacket;
-  
+  typedef DoublePacket<RealPacket> DoublePacketType;
+
+  typedef std::conditional_t<Vectorizable, ScalarPacket, Scalar> LhsPacket4Packing;
+  typedef std::conditional_t<Vectorizable, RealPacket, Scalar> LhsPacket;
+  typedef std::conditional_t<Vectorizable, DoublePacketType, Scalar> RhsPacket;
+  typedef std::conditional_t<Vectorizable, ScalarPacket, Scalar> ResPacket;
+  typedef std::conditional_t<Vectorizable, DoublePacketType, Scalar> AccPacket;
+
+  // this actually holds 8 packets!
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
+
   EIGEN_STRONG_INLINE void initAcc(Scalar& p) { p = Scalar(0); }
 
-  EIGEN_STRONG_INLINE void initAcc(DoublePacket& p)
-  {
-    p.first   = pset1<RealPacket>(RealScalar(0));
-    p.second  = pset1<RealPacket>(RealScalar(0));
-  }
-
-  /* Unpack the rhs coeff such that each complex coefficient is spread into
-   * two packects containing respectively the real and imaginary coefficient
-   * duplicated as many time as needed: (x+iy) => [x, ..., x] [y, ..., y]
-   */
-  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const Scalar* rhs, Scalar* b)
-  {
-    for(DenseIndex k=0; k<n; k++)
-    {
-      if(Vectorizable)
-      {
-        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+0],             real(rhs[k]));
-        pstore1<RealPacket>((RealScalar*)&b[k*ResPacketSize*2+ResPacketSize], imag(rhs[k]));
-      }
-      else
-        b[k] = rhs[k];
-    }
+  EIGEN_STRONG_INLINE void initAcc(DoublePacketType& p) {
+    p.first = pset1<RealPacket>(RealScalar(0));
+    p.second = pset1<RealPacket>(RealScalar(0));
+  }
+
+  // Scalar path
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ScalarPacket& dest) const { dest = pset1<ScalarPacket>(*b); }
+
+  // Vectorized path
+  template <typename RealPacketType>
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const {
+    dest.first = pset1<RealPacketType>(numext::real(*b));
+    dest.second = pset1<RealPacketType>(numext::imag(*b));
+  }
+
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
+    loadRhs(b, dest.B_0);
+    loadRhs(b + 1, dest.B1);
+    loadRhs(b + 2, dest.B2);
+    loadRhs(b + 3, dest.B3);
+  }
+
+  // Scalar path
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, ScalarPacket& dest) const { loadRhs(b, dest); }
+
+  // Vectorized path
+  template <typename RealPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, DoublePacket<RealPacketType>& dest) const {
+    loadRhs(b, dest);
   }
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, ResPacket& dest) const { dest = *b; }
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacket& dest) const
-  {
-    dest.first  = pload<RealPacket>((const RealScalar*)b);
-    dest.second = pload<RealPacket>((const RealScalar*)(b+ResPacketSize));
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const { loadRhs(b, dest); }
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, DoublePacketType& dest) const {
+    loadQuadToDoublePacket(b, dest);
   }
 
   // nothing special here
-  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
-  {
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const {
     dest = pload<LhsPacket>((const typename unpacket_traits<LhsPacket>::type*)(a));
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, DoublePacket& c, RhsPacket& /*tmp*/) const
-  {
-    c.first   = padd(pmul(a,b.first), c.first);
-    c.second  = padd(pmul(a,b.second),c.second);
+  template <typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
+    dest = ploadu<LhsPacketType>((const typename unpacket_traits<LhsPacketType>::type*)(a));
+  }
+
+  template <typename LhsPacketType, typename RhsPacketType, typename ResPacketType, typename TmpType,
+            typename LaneIdType>
+  EIGEN_STRONG_INLINE std::enable_if_t<!is_same<RhsPacketType, RhsPacketx4>::value> madd(const LhsPacketType& a,
+                                                                                         const RhsPacketType& b,
+                                                                                         DoublePacket<ResPacketType>& c,
+                                                                                         TmpType& /*tmp*/,
+                                                                                         const LaneIdType&) const {
+    c.first = pmadd(a, b.first, c.first);
+    c.second = pmadd(a, b.second, c.second);
+  }
+
+  template <typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/,
+                                const LaneIdType&) const {
+    c = cj.pmadd(a, b, c);
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, ResPacket& c, RhsPacket& /*tmp*/) const
-  {
-    c = cj.pmadd(a,b,c);
+  template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
+                                const LaneIdType& lane) const {
+    madd(a, b.get(lane), c, tmp, lane);
   }
-  
+
   EIGEN_STRONG_INLINE void acc(const Scalar& c, const Scalar& alpha, Scalar& r) const { r += alpha * c; }
-  
-  EIGEN_STRONG_INLINE void acc(const DoublePacket& c, const ResPacket& alpha, ResPacket& r) const
-  {
+
+  template <typename RealPacketType, typename ResPacketType>
+  EIGEN_STRONG_INLINE void acc(const DoublePacket<RealPacketType>& c, const ResPacketType& alpha,
+                               ResPacketType& r) const {
     // assemble c
-    ResPacket tmp;
-    if((!ConjLhs)&&(!ConjRhs))
-    {
-      tmp = pcplxflip(pconj(ResPacket(c.second)));
-      tmp = padd(ResPacket(c.first),tmp);
-    }
-    else if((!ConjLhs)&&(ConjRhs))
-    {
-      tmp = pconj(pcplxflip(ResPacket(c.second)));
-      tmp = padd(ResPacket(c.first),tmp);
+    ResPacketType tmp;
+    if ((!ConjLhs) && (!ConjRhs)) {
+      tmp = pcplxflip(pconj(ResPacketType(c.second)));
+      tmp = padd(ResPacketType(c.first), tmp);
+    } else if ((!ConjLhs) && (ConjRhs)) {
+      tmp = pconj(pcplxflip(ResPacketType(c.second)));
+      tmp = padd(ResPacketType(c.first), tmp);
+    } else if ((ConjLhs) && (!ConjRhs)) {
+      tmp = pcplxflip(ResPacketType(c.second));
+      tmp = padd(pconj(ResPacketType(c.first)), tmp);
+    } else if ((ConjLhs) && (ConjRhs)) {
+      tmp = pcplxflip(ResPacketType(c.second));
+      tmp = psub(pconj(ResPacketType(c.first)), tmp);
     }
-    else if((ConjLhs)&&(!ConjRhs))
-    {
-      tmp = pcplxflip(ResPacket(c.second));
-      tmp = padd(pconj(ResPacket(c.first)),tmp);
-    }
-    else if((ConjLhs)&&(ConjRhs))
-    {
-      tmp = pcplxflip(ResPacket(c.second));
-      tmp = psub(pconj(ResPacket(c.first)),tmp);
-    }
-    
-    r = pmadd(tmp,alpha,r);
+
+    r = pmadd(tmp, alpha, r);
   }
 
-protected:
-  conj_helper<LhsScalar,RhsScalar,ConjLhs,ConjRhs> cj;
+ protected:
+  conj_helper<LhsScalar, RhsScalar, ConjLhs, ConjRhs> cj;
 };
 
-template<typename RealScalar, bool _ConjRhs>
-class gebp_traits<RealScalar, std::complex<RealScalar>, false, _ConjRhs >
-{
-public:
-  typedef std::complex<RealScalar>  Scalar;
-  typedef RealScalar  LhsScalar;
-  typedef Scalar      RhsScalar;
-  typedef Scalar      ResScalar;
+template <typename RealScalar, bool ConjRhs_, int Arch, int PacketSize_>
+class gebp_traits<RealScalar, std::complex<RealScalar>, false, ConjRhs_, Arch, PacketSize_> {
+ public:
+  typedef std::complex<RealScalar> Scalar;
+  typedef RealScalar LhsScalar;
+  typedef Scalar RhsScalar;
+  typedef Scalar ResScalar;
+
+  PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
+  PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
+  PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
+  PACKET_DECL_COND_POSTFIX(_, Real, PacketSize_);
+  PACKET_DECL_COND_SCALAR_POSTFIX(_, PacketSize_);
+
+#undef PACKET_DECL_COND_SCALAR_POSTFIX
+#undef PACKET_DECL_COND_POSTFIX
+#undef PACKET_DECL_COND_SCALAR
+#undef PACKET_DECL_COND
 
   enum {
     ConjLhs = false,
-    ConjRhs = _ConjRhs,
-    Vectorizable = packet_traits<RealScalar>::Vectorizable
-                && packet_traits<Scalar>::Vectorizable,
-    LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-    RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-    ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1,
-    
+    ConjRhs = ConjRhs_,
+    Vectorizable = unpacket_traits<RealPacket_>::vectorizable && unpacket_traits<ScalarPacket_>::vectorizable,
+    LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
+    ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1,
+
     NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
+    // FIXME: should depend on NumberOfRegisters
     nr = 4,
-    mr = 2*ResPacketSize,
-    WorkSpaceFactor = nr*RhsPacketSize,
+    mr = (plain_enum_min(16, NumberOfRegisters) / 2 / nr) * ResPacketSize,
 
     LhsProgress = ResPacketSize,
-    RhsProgress = ResPacketSize
+    RhsProgress = 1
   };
 
-  typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-  typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-  typedef typename packet_traits<ResScalar>::type  _ResPacket;
-
-  typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
-  typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
-  typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
-
+  typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
+  typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
+  typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
+  typedef LhsPacket LhsPacket4Packing;
+  typedef QuadPacket<RhsPacket> RhsPacketx4;
   typedef ResPacket AccPacket;
 
-  EIGEN_STRONG_INLINE void initAcc(AccPacket& p)
-  {
-    p = pset1<ResPacket>(ResScalar(0));
+  EIGEN_STRONG_INLINE void initAcc(AccPacket& p) { p = pset1<ResPacket>(ResScalar(0)); }
+
+  template <typename RhsPacketType>
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketType& dest) const {
+    dest = pset1<RhsPacketType>(*b);
   }
 
-  EIGEN_STRONG_INLINE void unpackRhs(DenseIndex n, const RhsScalar* rhs, RhsScalar* b)
-  {
-    for(DenseIndex k=0; k<n; k++)
-      pstore1<RhsPacket>(&b[k*RhsPacketSize], rhs[k]);
+  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const {
+    pbroadcast4(b, dest.B_0, dest.B1, dest.B2, dest.B3);
   }
 
-  EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
-  {
-    dest = pload<RhsPacket>(b);
+  template <typename RhsPacketType>
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacketType& dest) const {
+    loadRhs(b, dest);
   }
 
-  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const
-  {
-    dest = ploaddup<LhsPacket>(a);
+  EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const {}
+
+  EIGEN_STRONG_INLINE void loadLhs(const LhsScalar* a, LhsPacket& dest) const { dest = ploaddup<LhsPacket>(a); }
+
+  EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const { dest = ploadquad<RhsPacket>(b); }
+
+  template <typename LhsPacketType>
+  EIGEN_STRONG_INLINE void loadLhsUnaligned(const LhsScalar* a, LhsPacketType& dest) const {
+    dest = ploaddup<LhsPacketType>(a);
   }
 
-  EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp) const
-  {
-    madd_impl(a, b, c, tmp, typename conditional<Vectorizable,true_type,false_type>::type());
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c, RhsPacketType& tmp,
+                                const LaneIdType&) const {
+    madd_impl(a, b, c, tmp, std::conditional_t<Vectorizable, true_type, false_type>());
   }
 
-  EIGEN_STRONG_INLINE void madd_impl(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& tmp, const true_type&) const
-  {
-    tmp = b; tmp.v = pmul(a,tmp.v); c = padd(c,tmp);
+  template <typename LhsPacketType, typename RhsPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void madd_impl(const LhsPacketType& a, const RhsPacketType& b, AccPacketType& c,
+                                     RhsPacketType& tmp, const true_type&) const {
+#ifdef EIGEN_HAS_SINGLE_INSTRUCTION_MADD
+    EIGEN_UNUSED_VARIABLE(tmp);
+    c.v = pmadd(a, b.v, c.v);
+#else
+    tmp = b;
+    tmp.v = pmul(a, tmp.v);
+    c = padd(c, tmp);
+#endif
   }
 
-  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/, const false_type&) const
-  {
+  EIGEN_STRONG_INLINE void madd_impl(const LhsScalar& a, const RhsScalar& b, ResScalar& c, RhsScalar& /*tmp*/,
+                                     const false_type&) const {
     c += a * b;
   }
 
-  EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
-  {
-    r = cj.pmadd(alpha,c,r);
+  template <typename LhsPacketType, typename AccPacketType, typename LaneIdType>
+  EIGEN_STRONG_INLINE void madd(const LhsPacketType& a, const RhsPacketx4& b, AccPacketType& c, RhsPacket& tmp,
+                                const LaneIdType& lane) const {
+    madd(a, b.get(lane), c, tmp, lane);
+  }
+
+  template <typename ResPacketType, typename AccPacketType>
+  EIGEN_STRONG_INLINE void acc(const AccPacketType& c, const ResPacketType& alpha, ResPacketType& r) const {
+    conj_helper<ResPacketType, ResPacketType, false, ConjRhs> cj;
+    r = cj.pmadd(alpha, c, r);
   }
 
-protected:
-  conj_helper<ResPacket,ResPacket,false,ConjRhs> cj;
+ protected:
 };
 
-/* optimized GEneral packed Block * packed Panel product kernel
+/* optimized General packed Block * packed Panel product kernel
  *
  * Mixing type logic: C += A * B
  *  |  A  |  B  | comments
  *  |real |cplx | no vectorization yet, would require to pack A with duplication
  *  |cplx |real | easy vectorization
  */
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-struct gebp_kernel
-{
-  typedef gebp_traits<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> Traits;
+template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel {
+  typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target> Traits;
+  typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketHalf>
+      HalfTraits;
+  typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target, GEBPPacketQuarter>
+      QuarterTraits;
+
   typedef typename Traits::ResScalar ResScalar;
   typedef typename Traits::LhsPacket LhsPacket;
   typedef typename Traits::RhsPacket RhsPacket;
   typedef typename Traits::ResPacket ResPacket;
   typedef typename Traits::AccPacket AccPacket;
+  typedef typename Traits::RhsPacketx4 RhsPacketx4;
+
+  typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 15>::type RhsPanel15;
+  typedef typename RhsPanelHelper<RhsPacket, RhsPacketx4, 27>::type RhsPanel27;
+
+  typedef gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target> SwappedTraits;
+
+  typedef typename SwappedTraits::ResScalar SResScalar;
+  typedef typename SwappedTraits::LhsPacket SLhsPacket;
+  typedef typename SwappedTraits::RhsPacket SRhsPacket;
+  typedef typename SwappedTraits::ResPacket SResPacket;
+  typedef typename SwappedTraits::AccPacket SAccPacket;
+
+  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+  typedef typename HalfTraits::ResPacket ResPacketHalf;
+  typedef typename HalfTraits::AccPacket AccPacketHalf;
+
+  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
+  typedef typename QuarterTraits::AccPacket AccPacketQuarter;
+
+  typedef typename DataMapper::LinearMapper LinearMapper;
 
   enum {
-    Vectorizable  = Traits::Vectorizable,
-    LhsProgress   = Traits::LhsProgress,
-    RhsProgress   = Traits::RhsProgress,
+    Vectorizable = Traits::Vectorizable,
+    LhsProgress = Traits::LhsProgress,
+    LhsProgressHalf = HalfTraits::LhsProgress,
+    LhsProgressQuarter = QuarterTraits::LhsProgress,
+    RhsProgress = Traits::RhsProgress,
+    RhsProgressHalf = HalfTraits::RhsProgress,
+    RhsProgressQuarter = QuarterTraits::RhsProgress,
     ResPacketSize = Traits::ResPacketSize
   };
 
-  EIGEN_DONT_INLINE
-  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
-                  Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, RhsScalar* unpackedB=0);
+  EIGEN_DONT_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB, Index rows,
+                                    Index depth, Index cols, ResScalar alpha, Index strideA = -1, Index strideB = -1,
+                                    Index offsetA = 0, Index offsetB = 0);
 };
 
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-EIGEN_DONT_INLINE
-void gebp_kernel<LhsScalar,RhsScalar,Index,mr,nr,ConjugateLhs,ConjugateRhs>
-  ::operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index rows, Index depth, Index cols, ResScalar alpha,
-               Index strideA, Index strideB, Index offsetA, Index offsetB, RhsScalar* unpackedB)
-  {
-    Traits traits;
-    
-    if(strideA==-1) strideA = depth;
-    if(strideB==-1) strideB = depth;
-    conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
-//     conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
-    Index packet_cols = (cols/nr) * nr;
-    const Index peeled_mc = (rows/mr)*mr;
-    // FIXME:
-    const Index peeled_mc2 = peeled_mc + (rows-peeled_mc >= LhsProgress ? LhsProgress : 0);
-    const Index peeled_kc = (depth/4)*4;
-
-    if(unpackedB==0)
-      unpackedB = const_cast<RhsScalar*>(blockB - strideB * nr * RhsProgress);
-
-    // loops on each micro vertical panel of rhs (depth x nr)
-    for(Index j2=0; j2<packet_cols; j2+=nr)
-    {
-      traits.unpackRhs(depth*nr,&blockB[j2*strideB+offsetB*nr],unpackedB); 
-
-      // loops on each largest micro horizontal panel of lhs (mr x depth)
-      // => we select a mr x nr micro block of res which is entirely
-      //    stored into mr/packet_size x nr registers.
-      for(Index i=0; i<peeled_mc; i+=mr)
-      {
-        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
-        prefetch(&blA[0]);
+template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs,
+          int SwappedLhsProgress =
+              gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target>::LhsProgress>
+struct last_row_process_16_packets {
+  typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target> Traits;
+  typedef gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target> SwappedTraits;
 
-        // gets res block as register
-        AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
-                  traits.initAcc(C0);
-                  traits.initAcc(C1);
-        if(nr==4) traits.initAcc(C2);
-        if(nr==4) traits.initAcc(C3);
-                  traits.initAcc(C4);
-                  traits.initAcc(C5);
-        if(nr==4) traits.initAcc(C6);
-        if(nr==4) traits.initAcc(C7);
-
-        ResScalar* r0 = &res[(j2+0)*resStride + i];
-        ResScalar* r1 = r0 + resStride;
-        ResScalar* r2 = r1 + resStride;
-        ResScalar* r3 = r2 + resStride;
-
-        prefetch(r0+16);
-        prefetch(r1+16);
-        prefetch(r2+16);
-        prefetch(r3+16);
-
-        // performs "inner" product
-        // TODO let's check wether the folowing peeled loop could not be
-        //      optimized via optimal prefetching from one loop to the other
-        const RhsScalar* blB = unpackedB;
-        for(Index k=0; k<peeled_kc; k+=4)
-        {
-          if(nr==2)
-          {
-            LhsPacket A0, A1;
-            RhsPacket B_0;
+  typedef typename Traits::ResScalar ResScalar;
+  typedef typename SwappedTraits::LhsPacket SLhsPacket;
+  typedef typename SwappedTraits::RhsPacket SRhsPacket;
+  typedef typename SwappedTraits::ResPacket SResPacket;
+  typedef typename SwappedTraits::AccPacket SAccPacket;
+
+  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits& straits, const LhsScalar* blA,
+                                      const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
+                                      ResScalar alpha, SAccPacket& C0) {
+    EIGEN_UNUSED_VARIABLE(res);
+    EIGEN_UNUSED_VARIABLE(straits);
+    EIGEN_UNUSED_VARIABLE(blA);
+    EIGEN_UNUSED_VARIABLE(blB);
+    EIGEN_UNUSED_VARIABLE(depth);
+    EIGEN_UNUSED_VARIABLE(endk);
+    EIGEN_UNUSED_VARIABLE(i);
+    EIGEN_UNUSED_VARIABLE(j2);
+    EIGEN_UNUSED_VARIABLE(alpha);
+    EIGEN_UNUSED_VARIABLE(C0);
+  }
+};
+
+template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+struct last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs, 16> {
+  typedef gebp_traits<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs, Architecture::Target> Traits;
+  typedef gebp_traits<RhsScalar, LhsScalar, ConjugateRhs, ConjugateLhs, Architecture::Target> SwappedTraits;
+
+  typedef typename Traits::ResScalar ResScalar;
+  typedef typename SwappedTraits::LhsPacket SLhsPacket;
+  typedef typename SwappedTraits::RhsPacket SRhsPacket;
+  typedef typename SwappedTraits::ResPacket SResPacket;
+  typedef typename SwappedTraits::AccPacket SAccPacket;
+
+  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, SwappedTraits& straits, const LhsScalar* blA,
+                                      const RhsScalar* blB, Index depth, const Index endk, Index i, Index j2,
+                                      ResScalar alpha, SAccPacket& C0) {
+    typedef typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half SResPacketQuarter;
+    typedef typename unpacket_traits<typename unpacket_traits<SLhsPacket>::half>::half SLhsPacketQuarter;
+    typedef typename unpacket_traits<typename unpacket_traits<SRhsPacket>::half>::half SRhsPacketQuarter;
+    typedef typename unpacket_traits<typename unpacket_traits<SAccPacket>::half>::half SAccPacketQuarter;
+
+    SResPacketQuarter R = res.template gatherPacket<SResPacketQuarter>(i, j2);
+    SResPacketQuarter alphav = pset1<SResPacketQuarter>(alpha);
+
+    if (depth - endk > 0) {
+      // We have to handle the last row(s) of the rhs, which
+      // correspond to a half-packet
+      SAccPacketQuarter c0 = predux_half_dowto4(predux_half_dowto4(C0));
+
+      for (Index kk = endk; kk < depth; kk++) {
+        SLhsPacketQuarter a0;
+        SRhsPacketQuarter b0;
+        straits.loadLhsUnaligned(blB, a0);
+        straits.loadRhs(blA, b0);
+        straits.madd(a0, b0, c0, b0, fix<0>);
+        blB += SwappedTraits::LhsProgress / 4;
+        blA += 1;
+      }
+      straits.acc(c0, alphav, R);
+    } else {
+      straits.acc(predux_half_dowto4(predux_half_dowto4(C0)), alphav, R);
+    }
+    res.scatterPacket(i, j2, R);
+  }
+};
+
+template <int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
+          typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits,
+          typename LinearMapper, typename DataMapper>
+struct lhs_process_one_packet {
+  typedef typename GEBPTraits::RhsPacketx4 RhsPacketx4;
+
+  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits,
+                                             LhsPacket* A0, RhsPacketx4* rhs_panel, RhsPacket* T0, AccPacket* C0,
+                                             AccPacket* C1, AccPacket* C2, AccPacket* C3) {
+    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
+    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
+    traits.loadLhs(&blA[(0 + 1 * K) * LhsProgress], *A0);
+    traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], *rhs_panel);
+    traits.madd(*A0, *rhs_panel, *C0, *T0, fix<0>);
+    traits.madd(*A0, *rhs_panel, *C1, *T0, fix<1>);
+    traits.madd(*A0, *rhs_panel, *C2, *T0, fix<2>);
+    traits.madd(*A0, *rhs_panel, *C3, *T0, fix<3>);
+#if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
+    __asm__("" : "+x,m"(*A0));
+#endif
+    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
+  }
+
+  EIGEN_STRONG_INLINE void operator()(const DataMapper& res, const LhsScalar* blockA, const RhsScalar* blockB,
+                                      ResScalar alpha, Index peelStart, Index peelEnd, Index strideA, Index strideB,
+                                      Index offsetA, Index offsetB, int prefetch_res_offset, Index peeled_kc, Index pk,
+                                      Index cols, Index depth, Index packet_cols4) {
+    GEBPTraits traits;
+    Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
+    // loops on each largest micro horizontal panel of lhs
+    // (LhsProgress x depth)
+    for (Index i = peelStart; i < peelEnd; i += LhsProgress) {
+#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
+      EIGEN_IF_CONSTEXPR(nr >= 8) {
+        for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
+          const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
+          prefetch(&blA[0]);
+
+          // gets res block as register
+          AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
+          traits.initAcc(C0);
+          traits.initAcc(C1);
+          traits.initAcc(C2);
+          traits.initAcc(C3);
+          traits.initAcc(C4);
+          traits.initAcc(C5);
+          traits.initAcc(C6);
+          traits.initAcc(C7);
+
+          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+          LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
+          LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
+          LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
+          LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
+          r0.prefetch(prefetch_res_offset);
+          r1.prefetch(prefetch_res_offset);
+          r2.prefetch(prefetch_res_offset);
+          r3.prefetch(prefetch_res_offset);
+          r4.prefetch(prefetch_res_offset);
+          r5.prefetch(prefetch_res_offset);
+          r6.prefetch(prefetch_res_offset);
+          r7.prefetch(prefetch_res_offset);
+          const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
+          prefetch(&blB[0]);
+
+          LhsPacket A0;
+          for (Index k = 0; k < peeled_kc; k += pk) {
+            RhsPacketx4 rhs_panel;
             RhsPacket T0;
-            
-EIGEN_ASM_COMMENT("mybegin2");
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadLhs(&blA[1*LhsProgress], A1);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B_0);
-            traits.madd(A0,B_0,C1,T0);
-            traits.madd(A1,B_0,C5,B_0);
-
-            traits.loadLhs(&blA[2*LhsProgress], A0);
-            traits.loadLhs(&blA[3*LhsProgress], A1);
-            traits.loadRhs(&blB[2*RhsProgress], B_0);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[3*RhsProgress], B_0);
-            traits.madd(A0,B_0,C1,T0);
-            traits.madd(A1,B_0,C5,B_0);
-
-            traits.loadLhs(&blA[4*LhsProgress], A0);
-            traits.loadLhs(&blA[5*LhsProgress], A1);
-            traits.loadRhs(&blB[4*RhsProgress], B_0);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[5*RhsProgress], B_0);
-            traits.madd(A0,B_0,C1,T0);
-            traits.madd(A1,B_0,C5,B_0);
-
-            traits.loadLhs(&blA[6*LhsProgress], A0);
-            traits.loadLhs(&blA[7*LhsProgress], A1);
-            traits.loadRhs(&blB[6*RhsProgress], B_0);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[7*RhsProgress], B_0);
-            traits.madd(A0,B_0,C1,T0);
-            traits.madd(A1,B_0,C5,B_0);
-EIGEN_ASM_COMMENT("myend");
+#define EIGEN_GEBGP_ONESTEP(K)                                    \
+  do {                                                            \
+    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1pX8");    \
+    traits.loadLhs(&blA[(0 + 1 * K) * LhsProgress], A0);          \
+    traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel);   \
+    traits.madd(A0, rhs_panel, C0, T0, fix<0>);                   \
+    traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel); \
+    traits.madd(A0, rhs_panel, C1, T0, fix<1>);                   \
+    traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel); \
+    traits.madd(A0, rhs_panel, C2, T0, fix<2>);                   \
+    traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel); \
+    traits.madd(A0, rhs_panel, C3, T0, fix<3>);                   \
+    traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel);   \
+    traits.madd(A0, rhs_panel, C4, T0, fix<0>);                   \
+    traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel); \
+    traits.madd(A0, rhs_panel, C5, T0, fix<1>);                   \
+    traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel); \
+    traits.madd(A0, rhs_panel, C6, T0, fix<2>);                   \
+    traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel); \
+    traits.madd(A0, rhs_panel, C7, T0, fix<3>);                   \
+    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1pX8");      \
+  } while (false)
+
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 1pX8");
+
+            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBGP_ONESTEP(1);
+            EIGEN_GEBGP_ONESTEP(2);
+            EIGEN_GEBGP_ONESTEP(3);
+            EIGEN_GEBGP_ONESTEP(4);
+            EIGEN_GEBGP_ONESTEP(5);
+            EIGEN_GEBGP_ONESTEP(6);
+            EIGEN_GEBGP_ONESTEP(7);
+
+            blB += pk * 8 * RhsProgress;
+            blA += pk * (1 * LhsProgress);
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 1pX8");
           }
-          else
-          {
-EIGEN_ASM_COMMENT("mybegin4");
-            LhsPacket A0, A1;
-            RhsPacket B_0, B1, B2, B3;
+          // process remaining peeled loop
+          for (Index k = peeled_kc; k < depth; k++) {
+            RhsPacketx4 rhs_panel;
             RhsPacket T0;
-            
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadLhs(&blA[1*LhsProgress], A1);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B1);
-
-            traits.madd(A0,B_0,C0,T0);
-            traits.loadRhs(&blB[2*RhsProgress], B2);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[3*RhsProgress], B3);
-            traits.loadRhs(&blB[4*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,T0);
-            traits.madd(A1,B1,C5,B1);
-            traits.loadRhs(&blB[5*RhsProgress], B1);
-            traits.madd(A0,B2,C2,T0);
-            traits.madd(A1,B2,C6,B2);
-            traits.loadRhs(&blB[6*RhsProgress], B2);
-            traits.madd(A0,B3,C3,T0);
-            traits.loadLhs(&blA[2*LhsProgress], A0);
-            traits.madd(A1,B3,C7,B3);
-            traits.loadLhs(&blA[3*LhsProgress], A1);
-            traits.loadRhs(&blB[7*RhsProgress], B3);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[8*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,T0);
-            traits.madd(A1,B1,C5,B1);
-            traits.loadRhs(&blB[9*RhsProgress], B1);
-            traits.madd(A0,B2,C2,T0);
-            traits.madd(A1,B2,C6,B2);
-            traits.loadRhs(&blB[10*RhsProgress], B2);
-            traits.madd(A0,B3,C3,T0);
-            traits.loadLhs(&blA[4*LhsProgress], A0);
-            traits.madd(A1,B3,C7,B3);
-            traits.loadLhs(&blA[5*LhsProgress], A1);
-            traits.loadRhs(&blB[11*RhsProgress], B3);
-
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[12*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,T0);
-            traits.madd(A1,B1,C5,B1);
-            traits.loadRhs(&blB[13*RhsProgress], B1);
-            traits.madd(A0,B2,C2,T0);
-            traits.madd(A1,B2,C6,B2);
-            traits.loadRhs(&blB[14*RhsProgress], B2);
-            traits.madd(A0,B3,C3,T0);
-            traits.loadLhs(&blA[6*LhsProgress], A0);
-            traits.madd(A1,B3,C7,B3);
-            traits.loadLhs(&blA[7*LhsProgress], A1);
-            traits.loadRhs(&blB[15*RhsProgress], B3);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.madd(A0,B1,C1,T0);
-            traits.madd(A1,B1,C5,B1);
-            traits.madd(A0,B2,C2,T0);
-            traits.madd(A1,B2,C6,B2);
-            traits.madd(A0,B3,C3,T0);
-            traits.madd(A1,B3,C7,B3);
+            EIGEN_GEBGP_ONESTEP(0);
+            blB += 8 * RhsProgress;
+            blA += 1 * LhsProgress;
           }
 
-          blB += 4*nr*RhsProgress;
-          blA += 4*mr;
+#undef EIGEN_GEBGP_ONESTEP
+
+          ResPacket R0, R1;
+          ResPacket alphav = pset1<ResPacket>(alpha);
+
+          R0 = r0.template loadPacket<ResPacket>(0);
+          R1 = r1.template loadPacket<ResPacket>(0);
+          traits.acc(C0, alphav, R0);
+          traits.acc(C1, alphav, R1);
+          r0.storePacket(0, R0);
+          r1.storePacket(0, R1);
+
+          R0 = r2.template loadPacket<ResPacket>(0);
+          R1 = r3.template loadPacket<ResPacket>(0);
+          traits.acc(C2, alphav, R0);
+          traits.acc(C3, alphav, R1);
+          r2.storePacket(0, R0);
+          r3.storePacket(0, R1);
+
+          R0 = r4.template loadPacket<ResPacket>(0);
+          R1 = r5.template loadPacket<ResPacket>(0);
+          traits.acc(C4, alphav, R0);
+          traits.acc(C5, alphav, R1);
+          r4.storePacket(0, R0);
+          r5.storePacket(0, R1);
+
+          R0 = r6.template loadPacket<ResPacket>(0);
+          R1 = r7.template loadPacket<ResPacket>(0);
+          traits.acc(C6, alphav, R0);
+          traits.acc(C7, alphav, R1);
+          r6.storePacket(0, R0);
+          r7.storePacket(0, R1);
+        }
+      }
+#endif
+
+      // loops on each largest micro vertical panel of rhs (depth * nr)
+      for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
+        // We select a LhsProgress x nr micro block of res
+        // which is entirely stored into 1 x nr registers.
+
+        const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
+        prefetch(&blA[0]);
+
+        // gets res block as register
+        AccPacket C0, C1, C2, C3;
+        traits.initAcc(C0);
+        traits.initAcc(C1);
+        traits.initAcc(C2);
+        traits.initAcc(C3);
+        // To improve instruction pipelining, let's double the accumulation registers:
+        //  even k will accumulate in C*, while odd k will accumulate in D*.
+        // This trick is crucial to get good performance with FMA, otherwise it is
+        // actually faster to perform separated MUL+ADD because of a naturally
+        // better instruction-level parallelism.
+        AccPacket D0, D1, D2, D3;
+        traits.initAcc(D0);
+        traits.initAcc(D1);
+        traits.initAcc(D2);
+        traits.initAcc(D3);
+
+        LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+        LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+        LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+        LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+        r0.prefetch(prefetch_res_offset);
+        r1.prefetch(prefetch_res_offset);
+        r2.prefetch(prefetch_res_offset);
+        r3.prefetch(prefetch_res_offset);
+
+        // performs "inner" products
+        const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
+        prefetch(&blB[0]);
+        LhsPacket A0, A1;
+
+        for (Index k = 0; k < peeled_kc; k += pk) {
+          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX4");
+          RhsPacketx4 rhs_panel;
+          RhsPacket T0;
+
+          internal::prefetch(blB + (48 + 0));
+          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(1, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+          peeled_kc_onestep(2, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(3, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+          internal::prefetch(blB + (48 + 16));
+          peeled_kc_onestep(4, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(5, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+          peeled_kc_onestep(6, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          peeled_kc_onestep(7, blA, blB, traits, &A1, &rhs_panel, &T0, &D0, &D1, &D2, &D3);
+
+          blB += pk * 4 * RhsProgress;
+          blA += pk * LhsProgress;
+
+          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX4");
         }
+        C0 = padd(C0, D0);
+        C1 = padd(C1, D1);
+        C2 = padd(C2, D2);
+        C3 = padd(C3, D3);
+
         // process remaining peeled loop
-        for(Index k=peeled_kc; k<depth; k++)
-        {
-          if(nr==2)
-          {
-            LhsPacket A0, A1;
-            RhsPacket B_0;
-            RhsPacket T0;
+        for (Index k = peeled_kc; k < depth; k++) {
+          RhsPacketx4 rhs_panel;
+          RhsPacket T0;
+          peeled_kc_onestep(0, blA, blB, traits, &A0, &rhs_panel, &T0, &C0, &C1, &C2, &C3);
+          blB += 4 * RhsProgress;
+          blA += LhsProgress;
+        }
 
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadLhs(&blA[1*LhsProgress], A1);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.madd(A0,B_0,C0,T0);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B_0);
-            traits.madd(A0,B_0,C1,T0);
-            traits.madd(A1,B_0,C5,B_0);
-          }
-          else
-          {
-            LhsPacket A0, A1;
-            RhsPacket B_0, B1, B2, B3;
-            RhsPacket T0;
+        ResPacket R0, R1;
+        ResPacket alphav = pset1<ResPacket>(alpha);
 
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadLhs(&blA[1*LhsProgress], A1);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B1);
-
-            traits.madd(A0,B_0,C0,T0);
-            traits.loadRhs(&blB[2*RhsProgress], B2);
-            traits.madd(A1,B_0,C4,B_0);
-            traits.loadRhs(&blB[3*RhsProgress], B3);
-            traits.madd(A0,B1,C1,T0);
-            traits.madd(A1,B1,C5,B1);
-            traits.madd(A0,B2,C2,T0);
-            traits.madd(A1,B2,C6,B2);
-            traits.madd(A0,B3,C3,T0);
-            traits.madd(A1,B3,C7,B3);
-          }
+        R0 = r0.template loadPacket<ResPacket>(0);
+        R1 = r1.template loadPacket<ResPacket>(0);
+        traits.acc(C0, alphav, R0);
+        traits.acc(C1, alphav, R1);
+        r0.storePacket(0, R0);
+        r1.storePacket(0, R1);
+
+        R0 = r2.template loadPacket<ResPacket>(0);
+        R1 = r3.template loadPacket<ResPacket>(0);
+        traits.acc(C2, alphav, R0);
+        traits.acc(C3, alphav, R1);
+        r2.storePacket(0, R0);
+        r3.storePacket(0, R1);
+      }
+
+      // Deal with remaining columns of the rhs
+      for (Index j2 = packet_cols4; j2 < cols; j2++) {
+        // One column at a time
+        const LhsScalar* blA = &blockA[i * strideA + offsetA * (LhsProgress)];
+        prefetch(&blA[0]);
+
+        // gets res block as register
+        AccPacket C0;
+        traits.initAcc(C0);
+
+        LinearMapper r0 = res.getLinearMapper(i, j2);
 
-          blB += nr*RhsProgress;
-          blA += mr;
+        // performs "inner" products
+        const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
+        LhsPacket A0;
+
+        for (Index k = 0; k < peeled_kc; k += pk) {
+          EIGEN_ASM_COMMENT("begin gebp micro kernel 1/half/quarterX1");
+          RhsPacket B_0;
+
+#define EIGEN_GEBGP_ONESTEP(K)                                             \
+  do {                                                                     \
+    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1/half/quarterX1"); \
+    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");    \
+    /* FIXME: why unaligned???? */                                         \
+    traits.loadLhsUnaligned(&blA[(0 + 1 * K) * LhsProgress], A0);          \
+    traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0);                      \
+    traits.madd(A0, B_0, C0, B_0, fix<0>);                                 \
+    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1/half/quarterX1");   \
+  } while (false);
+
+          EIGEN_GEBGP_ONESTEP(0);
+          EIGEN_GEBGP_ONESTEP(1);
+          EIGEN_GEBGP_ONESTEP(2);
+          EIGEN_GEBGP_ONESTEP(3);
+          EIGEN_GEBGP_ONESTEP(4);
+          EIGEN_GEBGP_ONESTEP(5);
+          EIGEN_GEBGP_ONESTEP(6);
+          EIGEN_GEBGP_ONESTEP(7);
+
+          blB += pk * RhsProgress;
+          blA += pk * LhsProgress;
+
+          EIGEN_ASM_COMMENT("end gebp micro kernel 1/half/quarterX1");
         }
 
-        if(nr==4)
-        {
-          ResPacket R0, R1, R2, R3, R4, R5, R6;
-          ResPacket alphav = pset1<ResPacket>(alpha);
+        // process remaining peeled loop
+        for (Index k = peeled_kc; k < depth; k++) {
+          RhsPacket B_0;
+          EIGEN_GEBGP_ONESTEP(0);
+          blB += RhsProgress;
+          blA += LhsProgress;
+        }
+#undef EIGEN_GEBGP_ONESTEP
+        ResPacket R0;
+        ResPacket alphav = pset1<ResPacket>(alpha);
+        R0 = r0.template loadPacket<ResPacket>(0);
+        traits.acc(C0, alphav, R0);
+        r0.storePacket(0, R0);
+      }
+    }
+  }
+};
 
-          R0 = ploadu<ResPacket>(r0);
-          R1 = ploadu<ResPacket>(r1);
-          R2 = ploadu<ResPacket>(r2);
-          R3 = ploadu<ResPacket>(r3);
-          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
-          R5 = ploadu<ResPacket>(r1 + ResPacketSize);
-          R6 = ploadu<ResPacket>(r2 + ResPacketSize);
-          traits.acc(C0, alphav, R0);
-          pstoreu(r0, R0);
-          R0 = ploadu<ResPacket>(r3 + ResPacketSize);
+template <int nr, Index LhsProgress, Index RhsProgress, typename LhsScalar, typename RhsScalar, typename ResScalar,
+          typename AccPacket, typename LhsPacket, typename RhsPacket, typename ResPacket, typename GEBPTraits,
+          typename LinearMapper, typename DataMapper>
+struct lhs_process_fraction_of_packet
+    : lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket,
+                             RhsPacket, ResPacket, GEBPTraits, LinearMapper, DataMapper> {
+  EIGEN_STRONG_INLINE void peeled_kc_onestep(Index K, const LhsScalar* blA, const RhsScalar* blB, GEBPTraits traits,
+                                             LhsPacket* A0, RhsPacket* B_0, RhsPacket* B1, RhsPacket* B2, RhsPacket* B3,
+                                             AccPacket* C0, AccPacket* C1, AccPacket* C2, AccPacket* C3) {
+    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 1X4");
+    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");
+    traits.loadLhsUnaligned(&blA[(0 + 1 * K) * (LhsProgress)], *A0);
+    traits.broadcastRhs(&blB[(0 + 4 * K) * RhsProgress], *B_0, *B1, *B2, *B3);
+    traits.madd(*A0, *B_0, *C0, *B_0);
+    traits.madd(*A0, *B1, *C1, *B1);
+    traits.madd(*A0, *B2, *C2, *B2);
+    traits.madd(*A0, *B3, *C3, *B3);
+    EIGEN_ASM_COMMENT("end step of gebp micro kernel 1X4");
+  }
+};
 
-          traits.acc(C1, alphav, R1);
-          traits.acc(C2, alphav, R2);
-          traits.acc(C3, alphav, R3);
-          traits.acc(C4, alphav, R4);
-          traits.acc(C5, alphav, R5);
-          traits.acc(C6, alphav, R6);
-          traits.acc(C7, alphav, R0);
-          
-          pstoreu(r1, R1);
-          pstoreu(r2, R2);
-          pstoreu(r3, R3);
-          pstoreu(r0 + ResPacketSize, R4);
-          pstoreu(r1 + ResPacketSize, R5);
-          pstoreu(r2 + ResPacketSize, R6);
-          pstoreu(r3 + ResPacketSize, R0);
+template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs, bool ConjugateRhs>
+EIGEN_DONT_INLINE void gebp_kernel<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs,
+                                   ConjugateRhs>::operator()(const DataMapper& res, const LhsScalar* blockA,
+                                                             const RhsScalar* blockB, Index rows, Index depth,
+                                                             Index cols, ResScalar alpha, Index strideA, Index strideB,
+                                                             Index offsetA, Index offsetB) {
+  Traits traits;
+  SwappedTraits straits;
+
+  if (strideA == -1) strideA = depth;
+  if (strideB == -1) strideB = depth;
+  conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
+  Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
+  Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
+  const Index peeled_mc3 = mr >= 3 * Traits::LhsProgress ? (rows / (3 * LhsProgress)) * (3 * LhsProgress) : 0;
+  const Index peeled_mc2 =
+      mr >= 2 * Traits::LhsProgress ? peeled_mc3 + ((rows - peeled_mc3) / (2 * LhsProgress)) * (2 * LhsProgress) : 0;
+  const Index peeled_mc1 =
+      mr >= 1 * Traits::LhsProgress ? peeled_mc2 + ((rows - peeled_mc2) / (1 * LhsProgress)) * (1 * LhsProgress) : 0;
+  const Index peeled_mc_half =
+      mr >= LhsProgressHalf ? peeled_mc1 + ((rows - peeled_mc1) / (LhsProgressHalf)) * (LhsProgressHalf) : 0;
+  const Index peeled_mc_quarter =
+      mr >= LhsProgressQuarter
+          ? peeled_mc_half + ((rows - peeled_mc_half) / (LhsProgressQuarter)) * (LhsProgressQuarter)
+          : 0;
+  enum { pk = 8 };  // NOTE Such a large peeling factor is important for large matrices (~ +5% when >1000 on Haswell)
+  const Index peeled_kc = depth & ~(pk - 1);
+  const int prefetch_res_offset = 32 / sizeof(ResScalar);
+  //     const Index depth2     = depth & ~1;
+
+  //---------- Process 3 * LhsProgress rows at once ----------
+  // This corresponds to 3*LhsProgress x nr register blocks.
+  // Usually, make sense only with FMA
+  if (mr >= 3 * Traits::LhsProgress) {
+    // Here, the general idea is to loop on each largest micro horizontal panel of the lhs (3*Traits::LhsProgress x
+    // depth) and on each largest micro vertical panel of the rhs (depth * nr). Blocking sizes, i.e., 'depth' has been
+    // computed so that the micro horizontal panel of the lhs fit in L1. However, if depth is too small, we can extend
+    // the number of rows of these horizontal panels. This actual number of rows is computed as follow:
+    const Index l1 = defaultL1CacheSize;  // in Bytes, TODO, l1 should be passed to this function.
+    // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
+    // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only
+    // guess), or because we are testing specific blocking sizes.
+    const Index actual_panel_rows =
+        (3 * LhsProgress) * std::max<Index>(1, ((l1 - sizeof(ResScalar) * mr * nr - depth * nr * sizeof(RhsScalar)) /
+                                                (depth * sizeof(LhsScalar) * 3 * LhsProgress)));
+    for (Index i1 = 0; i1 < peeled_mc3; i1 += actual_panel_rows) {
+      const Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc3);
+#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
+      EIGEN_IF_CONSTEXPR(nr >= 8) {
+        for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
+          for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
+            const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * LhsProgress)];
+            prefetch(&blA[0]);
+            // gets res block as register
+            AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15, C16, C17, C18, C19, C20,
+                C21, C22, C23;
+            traits.initAcc(C0);
+            traits.initAcc(C1);
+            traits.initAcc(C2);
+            traits.initAcc(C3);
+            traits.initAcc(C4);
+            traits.initAcc(C5);
+            traits.initAcc(C6);
+            traits.initAcc(C7);
+            traits.initAcc(C8);
+            traits.initAcc(C9);
+            traits.initAcc(C10);
+            traits.initAcc(C11);
+            traits.initAcc(C12);
+            traits.initAcc(C13);
+            traits.initAcc(C14);
+            traits.initAcc(C15);
+            traits.initAcc(C16);
+            traits.initAcc(C17);
+            traits.initAcc(C18);
+            traits.initAcc(C19);
+            traits.initAcc(C20);
+            traits.initAcc(C21);
+            traits.initAcc(C22);
+            traits.initAcc(C23);
+
+            LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+            LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+            LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+            LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+            LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
+            LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
+            LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
+            LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
+
+            r0.prefetch(0);
+            r1.prefetch(0);
+            r2.prefetch(0);
+            r3.prefetch(0);
+            r4.prefetch(0);
+            r5.prefetch(0);
+            r6.prefetch(0);
+            r7.prefetch(0);
+
+            // performs "inner" products
+            const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
+            prefetch(&blB[0]);
+            LhsPacket A0, A1;
+            for (Index k = 0; k < peeled_kc; k += pk) {
+              EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX8");
+              // 27 registers are taken (24 for acc, 3 for lhs).
+              RhsPanel27 rhs_panel;
+              RhsPacket T0;
+              LhsPacket A2;
+#if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
+// see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
+// without this workaround A0, A1, and A2 are loaded in the same register,
+// which is not good for pipelining
+#define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND __asm__("" : "+w,m"(A0), "+w,m"(A1), "+w,m"(A2));
+#else
+#define EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND
+#endif
+
+#define EIGEN_GEBP_ONESTEP(K)                                                                                     \
+  do {                                                                                                            \
+    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX8");                                                    \
+    traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                                                          \
+    traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                                                          \
+    traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                                                          \
+    EIGEN_GEBP_3Px8_REGISTER_ALLOC_WORKAROUND traits.loadRhs(blB + (0 + 8 * K) * Traits::RhsProgress, rhs_panel); \
+    traits.madd(A0, rhs_panel, C0, T0, fix<0>);                                                                   \
+    traits.madd(A1, rhs_panel, C8, T0, fix<0>);                                                                   \
+    traits.madd(A2, rhs_panel, C16, T0, fix<0>);                                                                  \
+    traits.updateRhs(blB + (1 + 8 * K) * Traits::RhsProgress, rhs_panel);                                         \
+    traits.madd(A0, rhs_panel, C1, T0, fix<1>);                                                                   \
+    traits.madd(A1, rhs_panel, C9, T0, fix<1>);                                                                   \
+    traits.madd(A2, rhs_panel, C17, T0, fix<1>);                                                                  \
+    traits.updateRhs(blB + (2 + 8 * K) * Traits::RhsProgress, rhs_panel);                                         \
+    traits.madd(A0, rhs_panel, C2, T0, fix<2>);                                                                   \
+    traits.madd(A1, rhs_panel, C10, T0, fix<2>);                                                                  \
+    traits.madd(A2, rhs_panel, C18, T0, fix<2>);                                                                  \
+    traits.updateRhs(blB + (3 + 8 * K) * Traits::RhsProgress, rhs_panel);                                         \
+    traits.madd(A0, rhs_panel, C3, T0, fix<3>);                                                                   \
+    traits.madd(A1, rhs_panel, C11, T0, fix<3>);                                                                  \
+    traits.madd(A2, rhs_panel, C19, T0, fix<3>);                                                                  \
+    traits.loadRhs(blB + (4 + 8 * K) * Traits::RhsProgress, rhs_panel);                                           \
+    traits.madd(A0, rhs_panel, C4, T0, fix<0>);                                                                   \
+    traits.madd(A1, rhs_panel, C12, T0, fix<0>);                                                                  \
+    traits.madd(A2, rhs_panel, C20, T0, fix<0>);                                                                  \
+    traits.updateRhs(blB + (5 + 8 * K) * Traits::RhsProgress, rhs_panel);                                         \
+    traits.madd(A0, rhs_panel, C5, T0, fix<1>);                                                                   \
+    traits.madd(A1, rhs_panel, C13, T0, fix<1>);                                                                  \
+    traits.madd(A2, rhs_panel, C21, T0, fix<1>);                                                                  \
+    traits.updateRhs(blB + (6 + 8 * K) * Traits::RhsProgress, rhs_panel);                                         \
+    traits.madd(A0, rhs_panel, C6, T0, fix<2>);                                                                   \
+    traits.madd(A1, rhs_panel, C14, T0, fix<2>);                                                                  \
+    traits.madd(A2, rhs_panel, C22, T0, fix<2>);                                                                  \
+    traits.updateRhs(blB + (7 + 8 * K) * Traits::RhsProgress, rhs_panel);                                         \
+    traits.madd(A0, rhs_panel, C7, T0, fix<3>);                                                                   \
+    traits.madd(A1, rhs_panel, C15, T0, fix<3>);                                                                  \
+    traits.madd(A2, rhs_panel, C23, T0, fix<3>);                                                                  \
+    EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX8");                                                      \
+  } while (false)
+
+              EIGEN_GEBP_ONESTEP(0);
+              EIGEN_GEBP_ONESTEP(1);
+              EIGEN_GEBP_ONESTEP(2);
+              EIGEN_GEBP_ONESTEP(3);
+              EIGEN_GEBP_ONESTEP(4);
+              EIGEN_GEBP_ONESTEP(5);
+              EIGEN_GEBP_ONESTEP(6);
+              EIGEN_GEBP_ONESTEP(7);
+
+              blB += pk * 8 * RhsProgress;
+              blA += pk * 3 * Traits::LhsProgress;
+              EIGEN_ASM_COMMENT("end gebp micro kernel 3pX8");
+            }
+
+            // process remaining peeled loop
+            for (Index k = peeled_kc; k < depth; k++) {
+              RhsPanel27 rhs_panel;
+              RhsPacket T0;
+              LhsPacket A2;
+              EIGEN_GEBP_ONESTEP(0);
+              blB += 8 * RhsProgress;
+              blA += 3 * Traits::LhsProgress;
+            }
+
+#undef EIGEN_GEBP_ONESTEP
+
+            ResPacket R0, R1, R2;
+            ResPacket alphav = pset1<ResPacket>(alpha);
+
+            R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+            R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+            R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+            traits.acc(C0, alphav, R0);
+            traits.acc(C8, alphav, R1);
+            traits.acc(C16, alphav, R2);
+            r0.storePacket(0 * Traits::ResPacketSize, R0);
+            r0.storePacket(1 * Traits::ResPacketSize, R1);
+            r0.storePacket(2 * Traits::ResPacketSize, R2);
+
+            R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+            R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+            R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+            traits.acc(C1, alphav, R0);
+            traits.acc(C9, alphav, R1);
+            traits.acc(C17, alphav, R2);
+            r1.storePacket(0 * Traits::ResPacketSize, R0);
+            r1.storePacket(1 * Traits::ResPacketSize, R1);
+            r1.storePacket(2 * Traits::ResPacketSize, R2);
+
+            R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+            R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+            R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+            traits.acc(C2, alphav, R0);
+            traits.acc(C10, alphav, R1);
+            traits.acc(C18, alphav, R2);
+            r2.storePacket(0 * Traits::ResPacketSize, R0);
+            r2.storePacket(1 * Traits::ResPacketSize, R1);
+            r2.storePacket(2 * Traits::ResPacketSize, R2);
+
+            R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+            R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+            R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+            traits.acc(C3, alphav, R0);
+            traits.acc(C11, alphav, R1);
+            traits.acc(C19, alphav, R2);
+            r3.storePacket(0 * Traits::ResPacketSize, R0);
+            r3.storePacket(1 * Traits::ResPacketSize, R1);
+            r3.storePacket(2 * Traits::ResPacketSize, R2);
+
+            R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+            R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+            R2 = r4.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+            traits.acc(C4, alphav, R0);
+            traits.acc(C12, alphav, R1);
+            traits.acc(C20, alphav, R2);
+            r4.storePacket(0 * Traits::ResPacketSize, R0);
+            r4.storePacket(1 * Traits::ResPacketSize, R1);
+            r4.storePacket(2 * Traits::ResPacketSize, R2);
+
+            R0 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+            R1 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+            R2 = r5.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+            traits.acc(C5, alphav, R0);
+            traits.acc(C13, alphav, R1);
+            traits.acc(C21, alphav, R2);
+            r5.storePacket(0 * Traits::ResPacketSize, R0);
+            r5.storePacket(1 * Traits::ResPacketSize, R1);
+            r5.storePacket(2 * Traits::ResPacketSize, R2);
+
+            R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+            R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+            R2 = r6.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+            traits.acc(C6, alphav, R0);
+            traits.acc(C14, alphav, R1);
+            traits.acc(C22, alphav, R2);
+            r6.storePacket(0 * Traits::ResPacketSize, R0);
+            r6.storePacket(1 * Traits::ResPacketSize, R1);
+            r6.storePacket(2 * Traits::ResPacketSize, R2);
+
+            R0 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+            R1 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+            R2 = r7.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+            traits.acc(C7, alphav, R0);
+            traits.acc(C15, alphav, R1);
+            traits.acc(C23, alphav, R2);
+            r7.storePacket(0 * Traits::ResPacketSize, R0);
+            r7.storePacket(1 * Traits::ResPacketSize, R1);
+            r7.storePacket(2 * Traits::ResPacketSize, R2);
+          }
         }
-        else
-        {
-          ResPacket R0, R1, R4;
+      }
+#endif
+      for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
+        for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
+          // We selected a 3*Traits::LhsProgress x nr micro block of res which is entirely
+          // stored into 3 x nr registers.
+
+          const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * LhsProgress)];
+          prefetch(&blA[0]);
+
+          // gets res block as register
+          AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11;
+          traits.initAcc(C0);
+          traits.initAcc(C1);
+          traits.initAcc(C2);
+          traits.initAcc(C3);
+          traits.initAcc(C4);
+          traits.initAcc(C5);
+          traits.initAcc(C6);
+          traits.initAcc(C7);
+          traits.initAcc(C8);
+          traits.initAcc(C9);
+          traits.initAcc(C10);
+          traits.initAcc(C11);
+
+          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+          r0.prefetch(0);
+          r1.prefetch(0);
+          r2.prefetch(0);
+          r3.prefetch(0);
+
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
+          prefetch(&blB[0]);
+          LhsPacket A0, A1;
+
+          for (Index k = 0; k < peeled_kc; k += pk) {
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX4");
+            // 15 registers are taken (12 for acc, 3 for lhs).
+            RhsPanel15 rhs_panel;
+            RhsPacket T0;
+            LhsPacket A2;
+#if EIGEN_ARCH_ARM64 && defined(EIGEN_VECTORIZE_NEON) && EIGEN_GNUC_STRICT_LESS_THAN(9, 0, 0)
+// see http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1633
+// without this workaround A0, A1, and A2 are loaded in the same register,
+// which is not good for pipelining
+#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND __asm__("" : "+w,m"(A0), "+w,m"(A1), "+w,m"(A2));
+#else
+#define EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND
+#endif
+#define EIGEN_GEBP_ONESTEP(K)                                             \
+  do {                                                                    \
+    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX4");            \
+    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!");   \
+    internal::prefetch(blA + (3 * K + 16) * LhsProgress);                 \
+    if (EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) {                              \
+      internal::prefetch(blB + (4 * K + 16) * RhsProgress);               \
+    } /* Bug 953 */                                                       \
+    traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                  \
+    traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                  \
+    traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                  \
+    EIGEN_GEBP_3PX4_REGISTER_ALLOC_WORKAROUND                             \
+    traits.loadRhs(blB + (0 + 4 * K) * Traits::RhsProgress, rhs_panel);   \
+    traits.madd(A0, rhs_panel, C0, T0, fix<0>);                           \
+    traits.madd(A1, rhs_panel, C4, T0, fix<0>);                           \
+    traits.madd(A2, rhs_panel, C8, T0, fix<0>);                           \
+    traits.updateRhs(blB + (1 + 4 * K) * Traits::RhsProgress, rhs_panel); \
+    traits.madd(A0, rhs_panel, C1, T0, fix<1>);                           \
+    traits.madd(A1, rhs_panel, C5, T0, fix<1>);                           \
+    traits.madd(A2, rhs_panel, C9, T0, fix<1>);                           \
+    traits.updateRhs(blB + (2 + 4 * K) * Traits::RhsProgress, rhs_panel); \
+    traits.madd(A0, rhs_panel, C2, T0, fix<2>);                           \
+    traits.madd(A1, rhs_panel, C6, T0, fix<2>);                           \
+    traits.madd(A2, rhs_panel, C10, T0, fix<2>);                          \
+    traits.updateRhs(blB + (3 + 4 * K) * Traits::RhsProgress, rhs_panel); \
+    traits.madd(A0, rhs_panel, C3, T0, fix<3>);                           \
+    traits.madd(A1, rhs_panel, C7, T0, fix<3>);                           \
+    traits.madd(A2, rhs_panel, C11, T0, fix<3>);                          \
+    EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX4");              \
+  } while (false)
+
+            internal::prefetch(blB);
+            EIGEN_GEBP_ONESTEP(0);
+            EIGEN_GEBP_ONESTEP(1);
+            EIGEN_GEBP_ONESTEP(2);
+            EIGEN_GEBP_ONESTEP(3);
+            EIGEN_GEBP_ONESTEP(4);
+            EIGEN_GEBP_ONESTEP(5);
+            EIGEN_GEBP_ONESTEP(6);
+            EIGEN_GEBP_ONESTEP(7);
+
+            blB += pk * 4 * RhsProgress;
+            blA += pk * 3 * Traits::LhsProgress;
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX4");
+          }
+          // process remaining peeled loop
+          for (Index k = peeled_kc; k < depth; k++) {
+            RhsPanel15 rhs_panel;
+            RhsPacket T0;
+            LhsPacket A2;
+            EIGEN_GEBP_ONESTEP(0);
+            blB += 4 * RhsProgress;
+            blA += 3 * Traits::LhsProgress;
+          }
+
+#undef EIGEN_GEBP_ONESTEP
+
+          ResPacket R0, R1, R2;
           ResPacket alphav = pset1<ResPacket>(alpha);
 
-          R0 = ploadu<ResPacket>(r0);
-          R1 = ploadu<ResPacket>(r1);
-          R4 = ploadu<ResPacket>(r0 + ResPacketSize);
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
           traits.acc(C0, alphav, R0);
-          pstoreu(r0, R0);
-          R0 = ploadu<ResPacket>(r1 + ResPacketSize);
-          traits.acc(C1, alphav, R1);
-          traits.acc(C4, alphav, R4);
-          traits.acc(C5, alphav, R0);
-          pstoreu(r1, R1);
-          pstoreu(r0 + ResPacketSize, R4);
-          pstoreu(r1 + ResPacketSize, R0);
+          traits.acc(C4, alphav, R1);
+          traits.acc(C8, alphav, R2);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          r0.storePacket(2 * Traits::ResPacketSize, R2);
+
+          R0 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r1.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+          traits.acc(C1, alphav, R0);
+          traits.acc(C5, alphav, R1);
+          traits.acc(C9, alphav, R2);
+          r1.storePacket(0 * Traits::ResPacketSize, R0);
+          r1.storePacket(1 * Traits::ResPacketSize, R1);
+          r1.storePacket(2 * Traits::ResPacketSize, R2);
+
+          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r2.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+          traits.acc(C2, alphav, R0);
+          traits.acc(C6, alphav, R1);
+          traits.acc(C10, alphav, R2);
+          r2.storePacket(0 * Traits::ResPacketSize, R0);
+          r2.storePacket(1 * Traits::ResPacketSize, R1);
+          r2.storePacket(2 * Traits::ResPacketSize, R2);
+
+          R0 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r3.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+          traits.acc(C3, alphav, R0);
+          traits.acc(C7, alphav, R1);
+          traits.acc(C11, alphav, R2);
+          r3.storePacket(0 * Traits::ResPacketSize, R0);
+          r3.storePacket(1 * Traits::ResPacketSize, R1);
+          r3.storePacket(2 * Traits::ResPacketSize, R2);
         }
-        
       }
-      
-      if(rows-peeled_mc>=LhsProgress)
-      {
-        Index i = peeled_mc;
-        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
-        prefetch(&blA[0]);
 
-        // gets res block as register
-        AccPacket C0, C1, C2, C3;
-                  traits.initAcc(C0);
-                  traits.initAcc(C1);
-        if(nr==4) traits.initAcc(C2);
-        if(nr==4) traits.initAcc(C3);
-
-        // performs "inner" product
-        const RhsScalar* blB = unpackedB;
-        for(Index k=0; k<peeled_kc; k+=4)
-        {
-          if(nr==2)
-          {
-            LhsPacket A0;
-            RhsPacket B_0, B1;
+      // Deal with remaining columns of the rhs
+      for (Index j2 = packet_cols4; j2 < cols; j2++) {
+        for (Index i = i1; i < actual_panel_end; i += 3 * LhsProgress) {
+          // One column at a time
+          const LhsScalar* blA = &blockA[i * strideA + offsetA * (3 * Traits::LhsProgress)];
+          prefetch(&blA[0]);
+
+          // gets res block as register
+          AccPacket C0, C4, C8;
+          traits.initAcc(C0);
+          traits.initAcc(C4);
+          traits.initAcc(C8);
+
+          LinearMapper r0 = res.getLinearMapper(i, j2);
+          r0.prefetch(0);
+
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
+          LhsPacket A0, A1, A2;
 
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B1);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.loadRhs(&blB[2*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.loadLhs(&blA[1*LhsProgress], A0);
-            traits.loadRhs(&blB[3*RhsProgress], B1);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.loadRhs(&blB[4*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.loadLhs(&blA[2*LhsProgress], A0);
-            traits.loadRhs(&blB[5*RhsProgress], B1);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.loadRhs(&blB[6*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.loadLhs(&blA[3*LhsProgress], A0);
-            traits.loadRhs(&blB[7*RhsProgress], B1);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.madd(A0,B1,C1,B1);
+          for (Index k = 0; k < peeled_kc; k += pk) {
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 3pX1");
+            RhsPacket B_0;
+#define EIGEN_GEBGP_ONESTEP(K)                                          \
+  do {                                                                  \
+    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 3pX1");          \
+    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+    traits.loadLhs(&blA[(0 + 3 * K) * LhsProgress], A0);                \
+    traits.loadLhs(&blA[(1 + 3 * K) * LhsProgress], A1);                \
+    traits.loadLhs(&blA[(2 + 3 * K) * LhsProgress], A2);                \
+    traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0);                   \
+    traits.madd(A0, B_0, C0, B_0, fix<0>);                              \
+    traits.madd(A1, B_0, C4, B_0, fix<0>);                              \
+    traits.madd(A2, B_0, C8, B_0, fix<0>);                              \
+    EIGEN_ASM_COMMENT("end step of gebp micro kernel 3pX1");            \
+  } while (false)
+
+            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBGP_ONESTEP(1);
+            EIGEN_GEBGP_ONESTEP(2);
+            EIGEN_GEBGP_ONESTEP(3);
+            EIGEN_GEBGP_ONESTEP(4);
+            EIGEN_GEBGP_ONESTEP(5);
+            EIGEN_GEBGP_ONESTEP(6);
+            EIGEN_GEBGP_ONESTEP(7);
+
+            blB += int(pk) * int(RhsProgress);
+            blA += int(pk) * 3 * int(Traits::LhsProgress);
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 3pX1");
           }
-          else
-          {
-            LhsPacket A0;
-            RhsPacket B_0, B1, B2, B3;
-
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B1);
-
-            traits.madd(A0,B_0,C0,B_0);
-            traits.loadRhs(&blB[2*RhsProgress], B2);
-            traits.loadRhs(&blB[3*RhsProgress], B3);
-            traits.loadRhs(&blB[4*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.loadRhs(&blB[5*RhsProgress], B1);
-            traits.madd(A0,B2,C2,B2);
-            traits.loadRhs(&blB[6*RhsProgress], B2);
-            traits.madd(A0,B3,C3,B3);
-            traits.loadLhs(&blA[1*LhsProgress], A0);
-            traits.loadRhs(&blB[7*RhsProgress], B3);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.loadRhs(&blB[8*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.loadRhs(&blB[9*RhsProgress], B1);
-            traits.madd(A0,B2,C2,B2);
-            traits.loadRhs(&blB[10*RhsProgress], B2);
-            traits.madd(A0,B3,C3,B3);
-            traits.loadLhs(&blA[2*LhsProgress], A0);
-            traits.loadRhs(&blB[11*RhsProgress], B3);
-
-            traits.madd(A0,B_0,C0,B_0);
-            traits.loadRhs(&blB[12*RhsProgress], B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.loadRhs(&blB[13*RhsProgress], B1);
-            traits.madd(A0,B2,C2,B2);
-            traits.loadRhs(&blB[14*RhsProgress], B2);
-            traits.madd(A0,B3,C3,B3);
-
-            traits.loadLhs(&blA[3*LhsProgress], A0);
-            traits.loadRhs(&blB[15*RhsProgress], B3);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.madd(A0,B2,C2,B2);
-            traits.madd(A0,B3,C3,B3);
+
+          // process remaining peeled loop
+          for (Index k = peeled_kc; k < depth; k++) {
+            RhsPacket B_0;
+            EIGEN_GEBGP_ONESTEP(0);
+            blB += RhsProgress;
+            blA += 3 * Traits::LhsProgress;
           }
+#undef EIGEN_GEBGP_ONESTEP
+          ResPacket R0, R1, R2;
+          ResPacket alphav = pset1<ResPacket>(alpha);
 
-          blB += nr*4*RhsProgress;
-          blA += 4*LhsProgress;
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r0.template loadPacket<ResPacket>(2 * Traits::ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          traits.acc(C4, alphav, R1);
+          traits.acc(C8, alphav, R2);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          r0.storePacket(2 * Traits::ResPacketSize, R2);
         }
-        // process remaining peeled loop
-        for(Index k=peeled_kc; k<depth; k++)
-        {
-          if(nr==2)
-          {
-            LhsPacket A0;
-            RhsPacket B_0, B1;
+      }
+    }
+  }
 
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B1);
-            traits.madd(A0,B_0,C0,B_0);
-            traits.madd(A0,B1,C1,B1);
+  //---------- Process 2 * LhsProgress rows at once ----------
+  if (mr >= 2 * Traits::LhsProgress) {
+    const Index l1 = defaultL1CacheSize;  // in Bytes, TODO, l1 should be passed to this function.
+    // The max(1, ...) here is needed because we may be using blocking params larger than what our known l1 cache size
+    // suggests we should be using: either because our known l1 cache size is inaccurate (e.g. on Android, we can only
+    // guess), or because we are testing specific blocking sizes.
+    Index actual_panel_rows =
+        (2 * LhsProgress) * std::max<Index>(1, ((l1 - sizeof(ResScalar) * mr * nr - depth * nr * sizeof(RhsScalar)) /
+                                                (depth * sizeof(LhsScalar) * 2 * LhsProgress)));
+
+    for (Index i1 = peeled_mc3; i1 < peeled_mc2; i1 += actual_panel_rows) {
+      Index actual_panel_end = (std::min)(i1 + actual_panel_rows, peeled_mc2);
+#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
+      EIGEN_IF_CONSTEXPR(nr >= 8) {
+        for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
+          for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
+            const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
+            prefetch(&blA[0]);
+
+            AccPacket C0, C1, C2, C3, C4, C5, C6, C7, C8, C9, C10, C11, C12, C13, C14, C15;
+            traits.initAcc(C0);
+            traits.initAcc(C1);
+            traits.initAcc(C2);
+            traits.initAcc(C3);
+            traits.initAcc(C4);
+            traits.initAcc(C5);
+            traits.initAcc(C6);
+            traits.initAcc(C7);
+            traits.initAcc(C8);
+            traits.initAcc(C9);
+            traits.initAcc(C10);
+            traits.initAcc(C11);
+            traits.initAcc(C12);
+            traits.initAcc(C13);
+            traits.initAcc(C14);
+            traits.initAcc(C15);
+
+            LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+            LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+            LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+            LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+            LinearMapper r4 = res.getLinearMapper(i, j2 + 4);
+            LinearMapper r5 = res.getLinearMapper(i, j2 + 5);
+            LinearMapper r6 = res.getLinearMapper(i, j2 + 6);
+            LinearMapper r7 = res.getLinearMapper(i, j2 + 7);
+            r0.prefetch(prefetch_res_offset);
+            r1.prefetch(prefetch_res_offset);
+            r2.prefetch(prefetch_res_offset);
+            r3.prefetch(prefetch_res_offset);
+            r4.prefetch(prefetch_res_offset);
+            r5.prefetch(prefetch_res_offset);
+            r6.prefetch(prefetch_res_offset);
+            r7.prefetch(prefetch_res_offset);
+
+            const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
+            prefetch(&blB[0]);
+            LhsPacket A0, A1;
+            for (Index k = 0; k < peeled_kc; k += pk) {
+              RhsPacketx4 rhs_panel;
+              RhsPacket T0;
+// NOTE: the begin/end asm comments below work around bug 935!
+// but they are not enough for gcc>=6 without FMA (bug 1637)
+#if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE)
+#define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND __asm__("" : [a0] "+x,m"(A0), [a1] "+x,m"(A1));
+#else
+#define EIGEN_GEBP_2Px8_SPILLING_WORKAROUND
+#endif
+#define EIGEN_GEBGP_ONESTEP(K)                                                                   \
+  do {                                                                                           \
+    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX8");                                   \
+    traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0);                                         \
+    traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1);                                         \
+    traits.loadRhs(&blB[(0 + 8 * K) * RhsProgress], rhs_panel);                                  \
+    traits.madd(A0, rhs_panel, C0, T0, fix<0>);                                                  \
+    traits.madd(A1, rhs_panel, C8, T0, fix<0>);                                                  \
+    traits.updateRhs(&blB[(1 + 8 * K) * RhsProgress], rhs_panel);                                \
+    traits.madd(A0, rhs_panel, C1, T0, fix<1>);                                                  \
+    traits.madd(A1, rhs_panel, C9, T0, fix<1>);                                                  \
+    traits.updateRhs(&blB[(2 + 8 * K) * RhsProgress], rhs_panel);                                \
+    traits.madd(A0, rhs_panel, C2, T0, fix<2>);                                                  \
+    traits.madd(A1, rhs_panel, C10, T0, fix<2>);                                                 \
+    traits.updateRhs(&blB[(3 + 8 * K) * RhsProgress], rhs_panel);                                \
+    traits.madd(A0, rhs_panel, C3, T0, fix<3>);                                                  \
+    traits.madd(A1, rhs_panel, C11, T0, fix<3>);                                                 \
+    traits.loadRhs(&blB[(4 + 8 * K) * RhsProgress], rhs_panel);                                  \
+    traits.madd(A0, rhs_panel, C4, T0, fix<0>);                                                  \
+    traits.madd(A1, rhs_panel, C12, T0, fix<0>);                                                 \
+    traits.updateRhs(&blB[(5 + 8 * K) * RhsProgress], rhs_panel);                                \
+    traits.madd(A0, rhs_panel, C5, T0, fix<1>);                                                  \
+    traits.madd(A1, rhs_panel, C13, T0, fix<1>);                                                 \
+    traits.updateRhs(&blB[(6 + 8 * K) * RhsProgress], rhs_panel);                                \
+    traits.madd(A0, rhs_panel, C6, T0, fix<2>);                                                  \
+    traits.madd(A1, rhs_panel, C14, T0, fix<2>);                                                 \
+    traits.updateRhs(&blB[(7 + 8 * K) * RhsProgress], rhs_panel);                                \
+    traits.madd(A0, rhs_panel, C7, T0, fix<3>);                                                  \
+    traits.madd(A1, rhs_panel, C15, T0, fix<3>);                                                 \
+    EIGEN_GEBP_2Px8_SPILLING_WORKAROUND EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX8"); \
+  } while (false)
+
+              EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX8");
+
+              EIGEN_GEBGP_ONESTEP(0);
+              EIGEN_GEBGP_ONESTEP(1);
+              EIGEN_GEBGP_ONESTEP(2);
+              EIGEN_GEBGP_ONESTEP(3);
+              EIGEN_GEBGP_ONESTEP(4);
+              EIGEN_GEBGP_ONESTEP(5);
+              EIGEN_GEBGP_ONESTEP(6);
+              EIGEN_GEBGP_ONESTEP(7);
+
+              blB += pk * 8 * RhsProgress;
+              blA += pk * (2 * Traits::LhsProgress);
+
+              EIGEN_ASM_COMMENT("end gebp micro kernel 2pX8");
+            }
+            // process remaining peeled loop
+            for (Index k = peeled_kc; k < depth; k++) {
+              RhsPacketx4 rhs_panel;
+              RhsPacket T0;
+              EIGEN_GEBGP_ONESTEP(0);
+              blB += 8 * RhsProgress;
+              blA += 2 * Traits::LhsProgress;
+            }
+
+#undef EIGEN_GEBGP_ONESTEP
+
+            ResPacket R0, R1, R2, R3;
+            ResPacket alphav = pset1<ResPacket>(alpha);
+
+            R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+            R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+            R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+            R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+            traits.acc(C0, alphav, R0);
+            traits.acc(C8, alphav, R1);
+            traits.acc(C1, alphav, R2);
+            traits.acc(C9, alphav, R3);
+            r0.storePacket(0 * Traits::ResPacketSize, R0);
+            r0.storePacket(1 * Traits::ResPacketSize, R1);
+            r1.storePacket(0 * Traits::ResPacketSize, R2);
+            r1.storePacket(1 * Traits::ResPacketSize, R3);
+
+            R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+            R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+            R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+            R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+            traits.acc(C2, alphav, R0);
+            traits.acc(C10, alphav, R1);
+            traits.acc(C3, alphav, R2);
+            traits.acc(C11, alphav, R3);
+            r2.storePacket(0 * Traits::ResPacketSize, R0);
+            r2.storePacket(1 * Traits::ResPacketSize, R1);
+            r3.storePacket(0 * Traits::ResPacketSize, R2);
+            r3.storePacket(1 * Traits::ResPacketSize, R3);
+
+            R0 = r4.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+            R1 = r4.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+            R2 = r5.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+            R3 = r5.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+            traits.acc(C4, alphav, R0);
+            traits.acc(C12, alphav, R1);
+            traits.acc(C5, alphav, R2);
+            traits.acc(C13, alphav, R3);
+            r4.storePacket(0 * Traits::ResPacketSize, R0);
+            r4.storePacket(1 * Traits::ResPacketSize, R1);
+            r5.storePacket(0 * Traits::ResPacketSize, R2);
+            r5.storePacket(1 * Traits::ResPacketSize, R3);
+
+            R0 = r6.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+            R1 = r6.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+            R2 = r7.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+            R3 = r7.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+            traits.acc(C6, alphav, R0);
+            traits.acc(C14, alphav, R1);
+            traits.acc(C7, alphav, R2);
+            traits.acc(C15, alphav, R3);
+            r6.storePacket(0 * Traits::ResPacketSize, R0);
+            r6.storePacket(1 * Traits::ResPacketSize, R1);
+            r7.storePacket(0 * Traits::ResPacketSize, R2);
+            r7.storePacket(1 * Traits::ResPacketSize, R3);
           }
-          else
-          {
-            LhsPacket A0;
-            RhsPacket B_0, B1, B2, B3;
-
-            traits.loadLhs(&blA[0*LhsProgress], A0);
-            traits.loadRhs(&blB[0*RhsProgress], B_0);
-            traits.loadRhs(&blB[1*RhsProgress], B1);
-            traits.loadRhs(&blB[2*RhsProgress], B2);
-            traits.loadRhs(&blB[3*RhsProgress], B3);
-
-            traits.madd(A0,B_0,C0,B_0);
-            traits.madd(A0,B1,C1,B1);
-            traits.madd(A0,B2,C2,B2);
-            traits.madd(A0,B3,C3,B3);
+        }
+      }
+#endif
+      for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
+        for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
+          // We selected a 2*Traits::LhsProgress x nr micro block of res which is entirely
+          // stored into 2 x nr registers.
+
+          const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
+          prefetch(&blA[0]);
+
+          // gets res block as register
+          AccPacket C0, C1, C2, C3, C4, C5, C6, C7;
+          traits.initAcc(C0);
+          traits.initAcc(C1);
+          traits.initAcc(C2);
+          traits.initAcc(C3);
+          traits.initAcc(C4);
+          traits.initAcc(C5);
+          traits.initAcc(C6);
+          traits.initAcc(C7);
+
+          LinearMapper r0 = res.getLinearMapper(i, j2 + 0);
+          LinearMapper r1 = res.getLinearMapper(i, j2 + 1);
+          LinearMapper r2 = res.getLinearMapper(i, j2 + 2);
+          LinearMapper r3 = res.getLinearMapper(i, j2 + 3);
+
+          r0.prefetch(prefetch_res_offset);
+          r1.prefetch(prefetch_res_offset);
+          r2.prefetch(prefetch_res_offset);
+          r3.prefetch(prefetch_res_offset);
+
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
+          prefetch(&blB[0]);
+          LhsPacket A0, A1;
+
+          for (Index k = 0; k < peeled_kc; k += pk) {
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX4");
+            RhsPacketx4 rhs_panel;
+            RhsPacket T0;
+
+// NOTE: the begin/end asm comments below work around bug 935!
+// but they are not enough for gcc>=6 without FMA (bug 1637)
+#if EIGEN_GNUC_STRICT_AT_LEAST(6, 0, 0) && defined(EIGEN_VECTORIZE_SSE) && !(EIGEN_COMP_LCC)
+#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__("" : [a0] "+x,m"(A0), [a1] "+x,m"(A1));
+#else
+#define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND
+#endif
+#define EIGEN_GEBGP_ONESTEP(K)                                  \
+  do {                                                          \
+    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4");  \
+    traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0);        \
+    traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1);        \
+    traits.loadRhs(&blB[(0 + 4 * K) * RhsProgress], rhs_panel); \
+    traits.madd(A0, rhs_panel, C0, T0, fix<0>);                 \
+    traits.madd(A1, rhs_panel, C4, T0, fix<0>);                 \
+    traits.madd(A0, rhs_panel, C1, T0, fix<1>);                 \
+    traits.madd(A1, rhs_panel, C5, T0, fix<1>);                 \
+    traits.madd(A0, rhs_panel, C2, T0, fix<2>);                 \
+    traits.madd(A1, rhs_panel, C6, T0, fix<2>);                 \
+    traits.madd(A0, rhs_panel, C3, T0, fix<3>);                 \
+    traits.madd(A1, rhs_panel, C7, T0, fix<3>);                 \
+    EIGEN_GEBP_2PX4_SPILLING_WORKAROUND                         \
+    EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX4");    \
+  } while (false)
+
+            internal::prefetch(blB + (48 + 0));
+            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBGP_ONESTEP(1);
+            EIGEN_GEBGP_ONESTEP(2);
+            EIGEN_GEBGP_ONESTEP(3);
+            internal::prefetch(blB + (48 + 16));
+            EIGEN_GEBGP_ONESTEP(4);
+            EIGEN_GEBGP_ONESTEP(5);
+            EIGEN_GEBGP_ONESTEP(6);
+            EIGEN_GEBGP_ONESTEP(7);
+
+            blB += pk * 4 * RhsProgress;
+            blA += pk * (2 * Traits::LhsProgress);
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX4");
+          }
+          // process remaining peeled loop
+          for (Index k = peeled_kc; k < depth; k++) {
+            RhsPacketx4 rhs_panel;
+            RhsPacket T0;
+            EIGEN_GEBGP_ONESTEP(0);
+            blB += 4 * RhsProgress;
+            blA += 2 * Traits::LhsProgress;
           }
+#undef EIGEN_GEBGP_ONESTEP
 
-          blB += nr*RhsProgress;
-          blA += LhsProgress;
+          ResPacket R0, R1, R2, R3;
+          ResPacket alphav = pset1<ResPacket>(alpha);
+
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r1.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R3 = r1.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          traits.acc(C4, alphav, R1);
+          traits.acc(C1, alphav, R2);
+          traits.acc(C5, alphav, R3);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
+          r1.storePacket(0 * Traits::ResPacketSize, R2);
+          r1.storePacket(1 * Traits::ResPacketSize, R3);
+
+          R0 = r2.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r2.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          R2 = r3.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R3 = r3.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          traits.acc(C2, alphav, R0);
+          traits.acc(C6, alphav, R1);
+          traits.acc(C3, alphav, R2);
+          traits.acc(C7, alphav, R3);
+          r2.storePacket(0 * Traits::ResPacketSize, R0);
+          r2.storePacket(1 * Traits::ResPacketSize, R1);
+          r3.storePacket(0 * Traits::ResPacketSize, R2);
+          r3.storePacket(1 * Traits::ResPacketSize, R3);
         }
+      }
 
-        ResPacket R0, R1, R2, R3;
-        ResPacket alphav = pset1<ResPacket>(alpha);
+      // Deal with remaining columns of the rhs
+      for (Index j2 = packet_cols4; j2 < cols; j2++) {
+        for (Index i = i1; i < actual_panel_end; i += 2 * LhsProgress) {
+          // One column at a time
+          const LhsScalar* blA = &blockA[i * strideA + offsetA * (2 * Traits::LhsProgress)];
+          prefetch(&blA[0]);
 
-        ResScalar* r0 = &res[(j2+0)*resStride + i];
-        ResScalar* r1 = r0 + resStride;
-        ResScalar* r2 = r1 + resStride;
-        ResScalar* r3 = r2 + resStride;
-
-                  R0 = ploadu<ResPacket>(r0);
-                  R1 = ploadu<ResPacket>(r1);
-        if(nr==4) R2 = ploadu<ResPacket>(r2);
-        if(nr==4) R3 = ploadu<ResPacket>(r3);
-
-                  traits.acc(C0, alphav, R0);
-                  traits.acc(C1, alphav, R1);
-        if(nr==4) traits.acc(C2, alphav, R2);
-        if(nr==4) traits.acc(C3, alphav, R3);
-
-                  pstoreu(r0, R0);
-                  pstoreu(r1, R1);
-        if(nr==4) pstoreu(r2, R2);
-        if(nr==4) pstoreu(r3, R3);
-      }
-      for(Index i=peeled_mc2; i<rows; i++)
-      {
-        const LhsScalar* blA = &blockA[i*strideA+offsetA];
-        prefetch(&blA[0]);
+          // gets res block as register
+          AccPacket C0, C4;
+          traits.initAcc(C0);
+          traits.initAcc(C4);
 
-        // gets a 1 x nr res block as registers
-        ResScalar C0(0), C1(0), C2(0), C3(0);
-        // TODO directly use blockB ???
-        const RhsScalar* blB = &blockB[j2*strideB+offsetB*nr];
-        for(Index k=0; k<depth; k++)
-        {
-          if(nr==2)
-          {
-            LhsScalar A0;
-            RhsScalar B_0, B1;
+          LinearMapper r0 = res.getLinearMapper(i, j2);
+          r0.prefetch(prefetch_res_offset);
 
-            A0 = blA[k];
-            B_0 = blB[0];
-            B1 = blB[1];
-            MADD(cj,A0,B_0,C0,B_0);
-            MADD(cj,A0,B1,C1,B1);
+          // performs "inner" products
+          const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
+          LhsPacket A0, A1;
+
+          for (Index k = 0; k < peeled_kc; k += pk) {
+            EIGEN_ASM_COMMENT("begin gebp micro kernel 2pX1");
+            RhsPacket B_0, B1;
+
+#define EIGEN_GEBGP_ONESTEP(K)                                          \
+  do {                                                                  \
+    EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX1");          \
+    EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \
+    traits.loadLhs(&blA[(0 + 2 * K) * LhsProgress], A0);                \
+    traits.loadLhs(&blA[(1 + 2 * K) * LhsProgress], A1);                \
+    traits.loadRhs(&blB[(0 + K) * RhsProgress], B_0);                   \
+    traits.madd(A0, B_0, C0, B1, fix<0>);                               \
+    traits.madd(A1, B_0, C4, B_0, fix<0>);                              \
+    EIGEN_ASM_COMMENT("end step of gebp micro kernel 2pX1");            \
+  } while (false)
+
+            EIGEN_GEBGP_ONESTEP(0);
+            EIGEN_GEBGP_ONESTEP(1);
+            EIGEN_GEBGP_ONESTEP(2);
+            EIGEN_GEBGP_ONESTEP(3);
+            EIGEN_GEBGP_ONESTEP(4);
+            EIGEN_GEBGP_ONESTEP(5);
+            EIGEN_GEBGP_ONESTEP(6);
+            EIGEN_GEBGP_ONESTEP(7);
+
+            blB += int(pk) * int(RhsProgress);
+            blA += int(pk) * 2 * int(Traits::LhsProgress);
+
+            EIGEN_ASM_COMMENT("end gebp micro kernel 2pX1");
           }
-          else
-          {
-            LhsScalar A0;
-            RhsScalar B_0, B1, B2, B3;
 
-            A0 = blA[k];
-            B_0 = blB[0];
-            B1 = blB[1];
-            B2 = blB[2];
-            B3 = blB[3];
-
-            MADD(cj,A0,B_0,C0,B_0);
-            MADD(cj,A0,B1,C1,B1);
-            MADD(cj,A0,B2,C2,B2);
-            MADD(cj,A0,B3,C3,B3);
+          // process remaining peeled loop
+          for (Index k = peeled_kc; k < depth; k++) {
+            RhsPacket B_0, B1;
+            EIGEN_GEBGP_ONESTEP(0);
+            blB += RhsProgress;
+            blA += 2 * Traits::LhsProgress;
           }
+#undef EIGEN_GEBGP_ONESTEP
+          ResPacket R0, R1;
+          ResPacket alphav = pset1<ResPacket>(alpha);
 
-          blB += nr;
+          R0 = r0.template loadPacket<ResPacket>(0 * Traits::ResPacketSize);
+          R1 = r0.template loadPacket<ResPacket>(1 * Traits::ResPacketSize);
+          traits.acc(C0, alphav, R0);
+          traits.acc(C4, alphav, R1);
+          r0.storePacket(0 * Traits::ResPacketSize, R0);
+          r0.storePacket(1 * Traits::ResPacketSize, R1);
         }
-                  res[(j2+0)*resStride + i] += alpha*C0;
-                  res[(j2+1)*resStride + i] += alpha*C1;
-        if(nr==4) res[(j2+2)*resStride + i] += alpha*C2;
-        if(nr==4) res[(j2+3)*resStride + i] += alpha*C3;
       }
     }
-    // process remaining rhs/res columns one at a time
-    // => do the same but with nr==1
-    for(Index j2=packet_cols; j2<cols; j2++)
-    {
-      // unpack B
-      traits.unpackRhs(depth, &blockB[j2*strideB+offsetB], unpackedB);
-
-      for(Index i=0; i<peeled_mc; i+=mr)
-      {
-        const LhsScalar* blA = &blockA[i*strideA+offsetA*mr];
-        prefetch(&blA[0]);
+  }
+  //---------- Process 1 * LhsProgress rows at once ----------
+  if (mr >= 1 * Traits::LhsProgress) {
+    lhs_process_one_packet<nr, LhsProgress, RhsProgress, LhsScalar, RhsScalar, ResScalar, AccPacket, LhsPacket,
+                           RhsPacket, ResPacket, Traits, LinearMapper, DataMapper>
+        p;
+    p(res, blockA, blockB, alpha, peeled_mc2, peeled_mc1, strideA, strideB, offsetA, offsetB, prefetch_res_offset,
+      peeled_kc, pk, cols, depth, packet_cols4);
+  }
+  //---------- Process LhsProgressHalf rows at once ----------
+  if ((LhsProgressHalf < LhsProgress) && mr >= LhsProgressHalf) {
+    lhs_process_fraction_of_packet<nr, LhsProgressHalf, RhsProgressHalf, LhsScalar, RhsScalar, ResScalar, AccPacketHalf,
+                                   LhsPacketHalf, RhsPacketHalf, ResPacketHalf, HalfTraits, LinearMapper, DataMapper>
+        p;
+    p(res, blockA, blockB, alpha, peeled_mc1, peeled_mc_half, strideA, strideB, offsetA, offsetB, prefetch_res_offset,
+      peeled_kc, pk, cols, depth, packet_cols4);
+  }
+  //---------- Process LhsProgressQuarter rows at once ----------
+  if ((LhsProgressQuarter < LhsProgressHalf) && mr >= LhsProgressQuarter) {
+    lhs_process_fraction_of_packet<nr, LhsProgressQuarter, RhsProgressQuarter, LhsScalar, RhsScalar, ResScalar,
+                                   AccPacketQuarter, LhsPacketQuarter, RhsPacketQuarter, ResPacketQuarter,
+                                   QuarterTraits, LinearMapper, DataMapper>
+        p;
+    p(res, blockA, blockB, alpha, peeled_mc_half, peeled_mc_quarter, strideA, strideB, offsetA, offsetB,
+      prefetch_res_offset, peeled_kc, pk, cols, depth, packet_cols4);
+  }
+  //---------- Process remaining rows, 1 at once ----------
+  if (peeled_mc_quarter < rows) {
+#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
+    EIGEN_IF_CONSTEXPR(nr >= 8) {
+      // loop on each panel of the rhs
+      for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
+        // loop on each row of the lhs (1*LhsProgress x depth)
+        for (Index i = peeled_mc_quarter; i < rows; i += 1) {
+          const LhsScalar* blA = &blockA[i * strideA + offsetA];
+          prefetch(&blA[0]);
+          // gets a 1 x 1 res block as registers
+          ResScalar C0(0), C1(0), C2(0), C3(0), C4(0), C5(0), C6(0), C7(0);
+          const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 8];
+          for (Index k = 0; k < depth; k++) {
+            LhsScalar A0 = blA[k];
+            RhsScalar B_0;
 
-        // TODO move the res loads to the stores
+            B_0 = blB[0];
+            C0 = cj.pmadd(A0, B_0, C0);
 
-        // get res block as registers
-        AccPacket C0, C4;
-        traits.initAcc(C0);
-        traits.initAcc(C4);
+            B_0 = blB[1];
+            C1 = cj.pmadd(A0, B_0, C1);
 
-        const RhsScalar* blB = unpackedB;
-        for(Index k=0; k<depth; k++)
-        {
-          LhsPacket A0, A1;
-          RhsPacket B_0;
-          RhsPacket T0;
+            B_0 = blB[2];
+            C2 = cj.pmadd(A0, B_0, C2);
 
-          traits.loadLhs(&blA[0*LhsProgress], A0);
-          traits.loadLhs(&blA[1*LhsProgress], A1);
-          traits.loadRhs(&blB[0*RhsProgress], B_0);
-          traits.madd(A0,B_0,C0,T0);
-          traits.madd(A1,B_0,C4,B_0);
+            B_0 = blB[3];
+            C3 = cj.pmadd(A0, B_0, C3);
 
-          blB += RhsProgress;
-          blA += 2*LhsProgress;
-        }
-        ResPacket R0, R4;
-        ResPacket alphav = pset1<ResPacket>(alpha);
+            B_0 = blB[4];
+            C4 = cj.pmadd(A0, B_0, C4);
 
-        ResScalar* r0 = &res[(j2+0)*resStride + i];
+            B_0 = blB[5];
+            C5 = cj.pmadd(A0, B_0, C5);
 
-        R0 = ploadu<ResPacket>(r0);
-        R4 = ploadu<ResPacket>(r0+ResPacketSize);
+            B_0 = blB[6];
+            C6 = cj.pmadd(A0, B_0, C6);
 
-        traits.acc(C0, alphav, R0);
-        traits.acc(C4, alphav, R4);
+            B_0 = blB[7];
+            C7 = cj.pmadd(A0, B_0, C7);
 
-        pstoreu(r0,               R0);
-        pstoreu(r0+ResPacketSize, R4);
+            blB += 8;
+          }
+          res(i, j2 + 0) += alpha * C0;
+          res(i, j2 + 1) += alpha * C1;
+          res(i, j2 + 2) += alpha * C2;
+          res(i, j2 + 3) += alpha * C3;
+          res(i, j2 + 4) += alpha * C4;
+          res(i, j2 + 5) += alpha * C5;
+          res(i, j2 + 6) += alpha * C6;
+          res(i, j2 + 7) += alpha * C7;
+        }
       }
-      if(rows-peeled_mc>=LhsProgress)
-      {
-        Index i = peeled_mc;
-        const LhsScalar* blA = &blockA[i*strideA+offsetA*LhsProgress];
+    }
+#endif
+
+    for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
+      // loop on each row of the lhs (1*LhsProgress x depth)
+      for (Index i = peeled_mc_quarter; i < rows; i += 1) {
+        const LhsScalar* blA = &blockA[i * strideA + offsetA];
         prefetch(&blA[0]);
+        const RhsScalar* blB = &blockB[j2 * strideB + offsetB * 4];
+
+        // If LhsProgress is 8 or 16, it assumes that there is a
+        // half or quarter packet, respectively, of the same size as
+        // nr (which is currently 4) for the return type.
+        const int SResPacketHalfSize = unpacket_traits<typename unpacket_traits<SResPacket>::half>::size;
+        const int SResPacketQuarterSize =
+            unpacket_traits<typename unpacket_traits<typename unpacket_traits<SResPacket>::half>::half>::size;
+        // The following code assumes we can load SRhsPacket in such a way that
+        // it multiplies blocks of 4 elements in SLhsPacket.  This is not the
+        // case for some customized kernels (i.e. NEON fp16).  If the assumption
+        // fails, drop down to the scalar path.
+        constexpr bool kCanLoadSRhsQuad =
+            (unpacket_traits<SLhsPacket>::size < 4) ||
+            (unpacket_traits<SRhsPacket>::size % ((std::max<int>)(unpacket_traits<SLhsPacket>::size, 4) / 4)) == 0;
+        if (kCanLoadSRhsQuad && (SwappedTraits::LhsProgress % 4) == 0 && (SwappedTraits::LhsProgress <= 16) &&
+            (SwappedTraits::LhsProgress != 8 || SResPacketHalfSize == nr) &&
+            (SwappedTraits::LhsProgress != 16 || SResPacketQuarterSize == nr)) {
+          SAccPacket C0, C1, C2, C3;
+          straits.initAcc(C0);
+          straits.initAcc(C1);
+          straits.initAcc(C2);
+          straits.initAcc(C3);
+
+          const Index spk = (std::max)(1, SwappedTraits::LhsProgress / 4);
+          const Index endk = (depth / spk) * spk;
+          const Index endk4 = (depth / (spk * 4)) * (spk * 4);
+
+          Index k = 0;
+          for (; k < endk4; k += 4 * spk) {
+            SLhsPacket A0, A1;
+            SRhsPacket B_0, B_1;
+
+            straits.loadLhsUnaligned(blB + 0 * SwappedTraits::LhsProgress, A0);
+            straits.loadLhsUnaligned(blB + 1 * SwappedTraits::LhsProgress, A1);
+
+            straits.loadRhsQuad(blA + 0 * spk, B_0);
+            straits.loadRhsQuad(blA + 1 * spk, B_1);
+            straits.madd(A0, B_0, C0, B_0, fix<0>);
+            straits.madd(A1, B_1, C1, B_1, fix<0>);
+
+            straits.loadLhsUnaligned(blB + 2 * SwappedTraits::LhsProgress, A0);
+            straits.loadLhsUnaligned(blB + 3 * SwappedTraits::LhsProgress, A1);
+            straits.loadRhsQuad(blA + 2 * spk, B_0);
+            straits.loadRhsQuad(blA + 3 * spk, B_1);
+            straits.madd(A0, B_0, C2, B_0, fix<0>);
+            straits.madd(A1, B_1, C3, B_1, fix<0>);
+
+            blB += 4 * SwappedTraits::LhsProgress;
+            blA += 4 * spk;
+          }
+          C0 = padd(padd(C0, C1), padd(C2, C3));
+          for (; k < endk; k += spk) {
+            SLhsPacket A0;
+            SRhsPacket B_0;
 
-        AccPacket C0;
-        traits.initAcc(C0);
+            straits.loadLhsUnaligned(blB, A0);
+            straits.loadRhsQuad(blA, B_0);
+            straits.madd(A0, B_0, C0, B_0, fix<0>);
 
-        const RhsScalar* blB = unpackedB;
-        for(Index k=0; k<depth; k++)
+            blB += SwappedTraits::LhsProgress;
+            blA += spk;
+          }
+          if (SwappedTraits::LhsProgress == 8) {
+            // Special case where we have to first reduce the accumulation register C0
+            typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SResPacket>::half,
+                                       SResPacket>
+                SResPacketHalf;
+            typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SLhsPacket>::half,
+                                       SLhsPacket>
+                SLhsPacketHalf;
+            typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SRhsPacket>::half,
+                                       SRhsPacket>
+                SRhsPacketHalf;
+            typedef std::conditional_t<SwappedTraits::LhsProgress >= 8, typename unpacket_traits<SAccPacket>::half,
+                                       SAccPacket>
+                SAccPacketHalf;
+
+            SResPacketHalf R = res.template gatherPacket<SResPacketHalf>(i, j2);
+            SResPacketHalf alphav = pset1<SResPacketHalf>(alpha);
+
+            if (depth - endk > 0) {
+              // We have to handle the last row of the rhs which corresponds to a half-packet
+              SLhsPacketHalf a0;
+              SRhsPacketHalf b0;
+              straits.loadLhsUnaligned(blB, a0);
+              straits.loadRhs(blA, b0);
+              SAccPacketHalf c0 = predux_half_dowto4(C0);
+              straits.madd(a0, b0, c0, b0, fix<0>);
+              straits.acc(c0, alphav, R);
+            } else {
+              straits.acc(predux_half_dowto4(C0), alphav, R);
+            }
+            res.scatterPacket(i, j2, R);
+          } else if (SwappedTraits::LhsProgress == 16) {
+            // Special case where we have to first reduce the
+            // accumulation register C0. We specialize the block in
+            // template form, so that LhsProgress < 16 paths don't
+            // fail to compile
+            last_row_process_16_packets<LhsScalar, RhsScalar, Index, DataMapper, mr, nr, ConjugateLhs, ConjugateRhs> p;
+            p(res, straits, blA, blB, depth, endk, i, j2, alpha, C0);
+          } else {
+            SResPacket R = res.template gatherPacket<SResPacket>(i, j2);
+            SResPacket alphav = pset1<SResPacket>(alpha);
+            straits.acc(C0, alphav, R);
+            res.scatterPacket(i, j2, R);
+          }
+        } else  // scalar path
         {
-          LhsPacket A0;
-          RhsPacket B_0;
-          traits.loadLhs(blA, A0);
-          traits.loadRhs(blB, B_0);
-          traits.madd(A0, B_0, C0, B_0);
-          blB += RhsProgress;
-          blA += LhsProgress;
-        }
+          // get a 1 x 4 res block as registers
+          ResScalar C0(0), C1(0), C2(0), C3(0);
 
-        ResPacket alphav = pset1<ResPacket>(alpha);
-        ResPacket R0 = ploadu<ResPacket>(&res[(j2+0)*resStride + i]);
-        traits.acc(C0, alphav, R0);
-        pstoreu(&res[(j2+0)*resStride + i], R0);
+          for (Index k = 0; k < depth; k++) {
+            LhsScalar A0;
+            RhsScalar B_0, B_1;
+
+            A0 = blA[k];
+
+            B_0 = blB[0];
+            B_1 = blB[1];
+            C0 = cj.pmadd(A0, B_0, C0);
+            C1 = cj.pmadd(A0, B_1, C1);
+
+            B_0 = blB[2];
+            B_1 = blB[3];
+            C2 = cj.pmadd(A0, B_0, C2);
+            C3 = cj.pmadd(A0, B_1, C3);
+
+            blB += 4;
+          }
+          res(i, j2 + 0) += alpha * C0;
+          res(i, j2 + 1) += alpha * C1;
+          res(i, j2 + 2) += alpha * C2;
+          res(i, j2 + 3) += alpha * C3;
+        }
       }
-      for(Index i=peeled_mc2; i<rows; i++)
-      {
-        const LhsScalar* blA = &blockA[i*strideA+offsetA];
+    }
+    // remaining columns
+    for (Index j2 = packet_cols4; j2 < cols; j2++) {
+      // loop on each row of the lhs (1*LhsProgress x depth)
+      for (Index i = peeled_mc_quarter; i < rows; i += 1) {
+        const LhsScalar* blA = &blockA[i * strideA + offsetA];
         prefetch(&blA[0]);
-
         // gets a 1 x 1 res block as registers
         ResScalar C0(0);
-        // FIXME directly use blockB ??
-        const RhsScalar* blB = &blockB[j2*strideB+offsetB];
-        for(Index k=0; k<depth; k++)
-        {
+        const RhsScalar* blB = &blockB[j2 * strideB + offsetB];
+        for (Index k = 0; k < depth; k++) {
           LhsScalar A0 = blA[k];
           RhsScalar B_0 = blB[k];
-          MADD(cj, A0, B_0, C0, B_0);
+          C0 = cj.pmadd(A0, B_0, C0);
         }
-        res[(j2+0)*resStride + i] += alpha*C0;
+        res(i, j2) += alpha * C0;
       }
     }
   }
-
-
-#undef CJMADD
+}
 
 // pack a block of the lhs
 // The traversal is as follow (mr==4):
@@ -1114,86 +2557,270 @@ EIGEN_ASM_COMMENT("mybegin4");
 //
 //  32 33 34 35 ...
 //  36 36 38 39 ...
-template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
-struct gemm_pack_lhs
-{
-  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride=0, Index offset=0);
+template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
+          bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate, PanelMode> {
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
+                                    Index offset = 0);
 };
 
-template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder, Conjugate, PanelMode>
-  ::operator()(Scalar* blockA, const Scalar* EIGEN_RESTRICT _lhs, Index lhsStride, Index depth, Index rows, Index stride, Index offset)
-{
-  typedef typename packet_traits<Scalar>::type Packet;
-  enum { PacketSize = packet_traits<Scalar>::size };
+template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
+          bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, ColMajor, Conjugate,
+                                     PanelMode>::operator()(Scalar* blockA, const DataMapper& lhs, Index depth,
+                                                            Index rows, Index stride, Index offset) {
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
+  enum {
+    PacketSize = unpacket_traits<Packet>::size,
+    HalfPacketSize = unpacket_traits<HalfPacket>::size,
+    QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+    HasHalf = (int)HalfPacketSize < (int)PacketSize,
+    HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize
+  };
 
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
-  EIGEN_UNUSED_VARIABLE(stride)
-  EIGEN_UNUSED_VARIABLE(offset)
-  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
-  eigen_assert( (StorageOrder==RowMajor) || ((Pack1%PacketSize)==0 && Pack1<=4*PacketSize) );
+  EIGEN_UNUSED_VARIABLE(stride);
+  EIGEN_UNUSED_VARIABLE(offset);
+  eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
+  eigen_assert(((Pack1 % PacketSize) == 0 && Pack1 <= 4 * PacketSize) || (Pack1 <= 4));
   conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
-  const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(_lhs,lhsStride);
   Index count = 0;
-  Index peeled_mc = (rows/Pack1)*Pack1;
-  for(Index i=0; i<peeled_mc; i+=Pack1)
-  {
-    if(PanelMode) count += Pack1 * offset;
-
-    if(StorageOrder==ColMajor)
-    {
-      for(Index k=0; k<depth; k++)
-      {
-        Packet A, B, C, D;
-        if(Pack1>=1*PacketSize) A = ploadu<Packet>(&lhs(i+0*PacketSize, k));
-        if(Pack1>=2*PacketSize) B = ploadu<Packet>(&lhs(i+1*PacketSize, k));
-        if(Pack1>=3*PacketSize) C = ploadu<Packet>(&lhs(i+2*PacketSize, k));
-        if(Pack1>=4*PacketSize) D = ploadu<Packet>(&lhs(i+3*PacketSize, k));
-        if(Pack1>=1*PacketSize) { pstore(blockA+count, cj.pconj(A)); count+=PacketSize; }
-        if(Pack1>=2*PacketSize) { pstore(blockA+count, cj.pconj(B)); count+=PacketSize; }
-        if(Pack1>=3*PacketSize) { pstore(blockA+count, cj.pconj(C)); count+=PacketSize; }
-        if(Pack1>=4*PacketSize) { pstore(blockA+count, cj.pconj(D)); count+=PacketSize; }
+
+  const Index peeled_mc3 = Pack1 >= 3 * PacketSize ? (rows / (3 * PacketSize)) * (3 * PacketSize) : 0;
+  const Index peeled_mc2 =
+      Pack1 >= 2 * PacketSize ? peeled_mc3 + ((rows - peeled_mc3) / (2 * PacketSize)) * (2 * PacketSize) : 0;
+  const Index peeled_mc1 =
+      Pack1 >= 1 * PacketSize ? peeled_mc2 + ((rows - peeled_mc2) / (1 * PacketSize)) * (1 * PacketSize) : 0;
+  const Index peeled_mc_half =
+      Pack1 >= HalfPacketSize ? peeled_mc1 + ((rows - peeled_mc1) / (HalfPacketSize)) * (HalfPacketSize) : 0;
+  const Index peeled_mc_quarter = Pack1 >= QuarterPacketSize ? (rows / (QuarterPacketSize)) * (QuarterPacketSize) : 0;
+  const Index last_lhs_progress = rows > peeled_mc_quarter ? (rows - peeled_mc_quarter) & ~1 : 0;
+  const Index peeled_mc0 = Pack2 >= PacketSize              ? peeled_mc_quarter
+                           : Pack2 > 1 && last_lhs_progress ? (rows / last_lhs_progress) * last_lhs_progress
+                                                            : 0;
+
+  Index i = 0;
+
+  // Pack 3 packets
+  if (Pack1 >= 3 * PacketSize) {
+    for (; i < peeled_mc3; i += 3 * PacketSize) {
+      if (PanelMode) count += (3 * PacketSize) * offset;
+
+      for (Index k = 0; k < depth; k++) {
+        Packet A, B, C;
+        A = lhs.template loadPacket<Packet>(i + 0 * PacketSize, k);
+        B = lhs.template loadPacket<Packet>(i + 1 * PacketSize, k);
+        C = lhs.template loadPacket<Packet>(i + 2 * PacketSize, k);
+        pstore(blockA + count, cj.pconj(A));
+        count += PacketSize;
+        pstore(blockA + count, cj.pconj(B));
+        count += PacketSize;
+        pstore(blockA + count, cj.pconj(C));
+        count += PacketSize;
       }
+      if (PanelMode) count += (3 * PacketSize) * (stride - offset - depth);
     }
-    else
-    {
-      for(Index k=0; k<depth; k++)
-      {
-        // TODO add a vectorized transpose here
-        Index w=0;
-        for(; w<Pack1-3; w+=4)
-        {
-          Scalar a(cj(lhs(i+w+0, k))),
-                  b(cj(lhs(i+w+1, k))),
-                  c(cj(lhs(i+w+2, k))),
-                  d(cj(lhs(i+w+3, k)));
+  }
+  // Pack 2 packets
+  if (Pack1 >= 2 * PacketSize) {
+    for (; i < peeled_mc2; i += 2 * PacketSize) {
+      if (PanelMode) count += (2 * PacketSize) * offset;
+
+      for (Index k = 0; k < depth; k++) {
+        Packet A, B;
+        A = lhs.template loadPacket<Packet>(i + 0 * PacketSize, k);
+        B = lhs.template loadPacket<Packet>(i + 1 * PacketSize, k);
+        pstore(blockA + count, cj.pconj(A));
+        count += PacketSize;
+        pstore(blockA + count, cj.pconj(B));
+        count += PacketSize;
+      }
+      if (PanelMode) count += (2 * PacketSize) * (stride - offset - depth);
+    }
+  }
+  // Pack 1 packets
+  if (Pack1 >= 1 * PacketSize) {
+    for (; i < peeled_mc1; i += 1 * PacketSize) {
+      if (PanelMode) count += (1 * PacketSize) * offset;
+
+      for (Index k = 0; k < depth; k++) {
+        Packet A;
+        A = lhs.template loadPacket<Packet>(i + 0 * PacketSize, k);
+        pstore(blockA + count, cj.pconj(A));
+        count += PacketSize;
+      }
+      if (PanelMode) count += (1 * PacketSize) * (stride - offset - depth);
+    }
+  }
+  // Pack half packets
+  if (HasHalf && Pack1 >= HalfPacketSize) {
+    for (; i < peeled_mc_half; i += HalfPacketSize) {
+      if (PanelMode) count += (HalfPacketSize)*offset;
+
+      for (Index k = 0; k < depth; k++) {
+        HalfPacket A;
+        A = lhs.template loadPacket<HalfPacket>(i + 0 * (HalfPacketSize), k);
+        pstoreu(blockA + count, cj.pconj(A));
+        count += HalfPacketSize;
+      }
+      if (PanelMode) count += (HalfPacketSize) * (stride - offset - depth);
+    }
+  }
+  // Pack quarter packets
+  if (HasQuarter && Pack1 >= QuarterPacketSize) {
+    for (; i < peeled_mc_quarter; i += QuarterPacketSize) {
+      if (PanelMode) count += (QuarterPacketSize)*offset;
+
+      for (Index k = 0; k < depth; k++) {
+        QuarterPacket A;
+        A = lhs.template loadPacket<QuarterPacket>(i + 0 * (QuarterPacketSize), k);
+        pstoreu(blockA + count, cj.pconj(A));
+        count += QuarterPacketSize;
+      }
+      if (PanelMode) count += (QuarterPacketSize) * (stride - offset - depth);
+    }
+  }
+  // Pack2 may be *smaller* than PacketSize—that happens for
+  // products like real * complex, where we have to go half the
+  // progress on the lhs in order to duplicate those operands to
+  // address both real & imaginary parts on the rhs. This portion will
+  // pack those half ones until they match the number expected on the
+  // last peeling loop at this point (for the rhs).
+  if (Pack2 < PacketSize && Pack2 > 1) {
+    for (; i < peeled_mc0; i += last_lhs_progress) {
+      if (PanelMode) count += last_lhs_progress * offset;
+
+      for (Index k = 0; k < depth; k++)
+        for (Index w = 0; w < last_lhs_progress; w++) blockA[count++] = cj(lhs(i + w, k));
+
+      if (PanelMode) count += last_lhs_progress * (stride - offset - depth);
+    }
+  }
+  // Pack scalars
+  for (; i < rows; i++) {
+    if (PanelMode) count += offset;
+    for (Index k = 0; k < depth; k++) blockA[count++] = cj(lhs(i, k));
+    if (PanelMode) count += (stride - offset - depth);
+  }
+}
+
+template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
+          bool PanelMode>
+struct gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate, PanelMode> {
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  EIGEN_DONT_INLINE void operator()(Scalar* blockA, const DataMapper& lhs, Index depth, Index rows, Index stride = 0,
+                                    Index offset = 0);
+};
+
+template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, bool Conjugate,
+          bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, DataMapper, Pack1, Pack2, Packet, RowMajor, Conjugate,
+                                     PanelMode>::operator()(Scalar* blockA, const DataMapper& lhs, Index depth,
+                                                            Index rows, Index stride, Index offset) {
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
+  enum {
+    PacketSize = unpacket_traits<Packet>::size,
+    HalfPacketSize = unpacket_traits<HalfPacket>::size,
+    QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+    HasHalf = (int)HalfPacketSize < (int)PacketSize,
+    HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize
+  };
+
+  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK LHS");
+  EIGEN_UNUSED_VARIABLE(stride);
+  EIGEN_UNUSED_VARIABLE(offset);
+  eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
+  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+  Index count = 0;
+  bool gone_half = false, gone_quarter = false, gone_last = false;
+
+  Index i = 0;
+  Index pack = Pack1;
+  Index psize = PacketSize;
+  while (pack > 0) {
+    Index remaining_rows = rows - i;
+    Index peeled_mc = gone_last ? Pack2 > 1 ? (rows / pack) * pack : 0 : i + (remaining_rows / pack) * pack;
+    Index starting_pos = i;
+    for (; i < peeled_mc; i += pack) {
+      if (PanelMode) count += pack * offset;
+
+      Index k = 0;
+      if (pack >= psize && psize >= QuarterPacketSize) {
+        const Index peeled_k = (depth / psize) * psize;
+        for (; k < peeled_k; k += psize) {
+          for (Index m = 0; m < pack; m += psize) {
+            if (psize == PacketSize) {
+              PacketBlock<Packet> kernel;
+              for (Index p = 0; p < psize; ++p) kernel.packet[p] = lhs.template loadPacket<Packet>(i + p + m, k);
+              ptranspose(kernel);
+              for (Index p = 0; p < psize; ++p) pstore(blockA + count + m + (pack)*p, cj.pconj(kernel.packet[p]));
+            } else if (HasHalf && psize == HalfPacketSize) {
+              gone_half = true;
+              PacketBlock<HalfPacket> kernel_half;
+              for (Index p = 0; p < psize; ++p)
+                kernel_half.packet[p] = lhs.template loadPacket<HalfPacket>(i + p + m, k);
+              ptranspose(kernel_half);
+              for (Index p = 0; p < psize; ++p) pstore(blockA + count + m + (pack)*p, cj.pconj(kernel_half.packet[p]));
+            } else if (HasQuarter && psize == QuarterPacketSize) {
+              gone_quarter = true;
+              PacketBlock<QuarterPacket> kernel_quarter;
+              for (Index p = 0; p < psize; ++p)
+                kernel_quarter.packet[p] = lhs.template loadPacket<QuarterPacket>(i + p + m, k);
+              ptranspose(kernel_quarter);
+              for (Index p = 0; p < psize; ++p)
+                pstore(blockA + count + m + (pack)*p, cj.pconj(kernel_quarter.packet[p]));
+            }
+          }
+          count += psize * pack;
+        }
+      }
+
+      for (; k < depth; k++) {
+        Index w = 0;
+        for (; w < pack - 3; w += 4) {
+          Scalar a(cj(lhs(i + w + 0, k))), b(cj(lhs(i + w + 1, k))), c(cj(lhs(i + w + 2, k))), d(cj(lhs(i + w + 3, k)));
           blockA[count++] = a;
           blockA[count++] = b;
           blockA[count++] = c;
           blockA[count++] = d;
         }
-        if(Pack1%4)
-          for(;w<Pack1;++w)
-            blockA[count++] = cj(lhs(i+w, k));
+        if (pack % 4)
+          for (; w < pack; ++w) blockA[count++] = cj(lhs(i + w, k));
+      }
+
+      if (PanelMode) count += pack * (stride - offset - depth);
+    }
+
+    pack -= psize;
+    Index left = rows - i;
+    if (pack <= 0) {
+      if (!gone_last && (starting_pos == i || left >= psize / 2 || left >= psize / 4) &&
+          ((psize / 2 == HalfPacketSize && HasHalf && !gone_half) ||
+           (psize / 2 == QuarterPacketSize && HasQuarter && !gone_quarter))) {
+        psize /= 2;
+        pack = psize;
+        continue;
+      }
+      // Pack2 may be *smaller* than PacketSize—that happens for
+      // products like real * complex, where we have to go half the
+      // progress on the lhs in order to duplicate those operands to
+      // address both real & imaginary parts on the rhs. This portion will
+      // pack those half ones until they match the number expected on the
+      // last peeling loop at this point (for the rhs).
+      if (Pack2 < PacketSize && !gone_last) {
+        gone_last = true;
+        psize = pack = left & ~1;
       }
     }
-    if(PanelMode) count += Pack1 * (stride-offset-depth);
-  }
-  if(rows-peeled_mc>=Pack2)
-  {
-    if(PanelMode) count += Pack2*offset;
-    for(Index k=0; k<depth; k++)
-      for(Index w=0; w<Pack2; w++)
-        blockA[count++] = cj(lhs(peeled_mc+w, k));
-    if(PanelMode) count += Pack2 * (stride-offset-depth);
-    peeled_mc += Pack2;
-  }
-  for(Index i=peeled_mc; i<rows; i++)
-  {
-    if(PanelMode) count += offset;
-    for(Index k=0; k<depth; k++)
-      blockA[count++] = cj(lhs(i, k));
-    if(PanelMode) count += (stride-offset-depth);
+  }
+
+  for (; i < rows; i++) {
+    if (PanelMode) count += offset;
+    for (Index k = 0; k < depth; k++) blockA[count++] = cj(lhs(i, k));
+    if (PanelMode) count += (stride - offset - depth);
   }
 }
 
@@ -1204,138 +2831,323 @@ EIGEN_DONT_INLINE void gemm_pack_lhs<Scalar, Index, Pack1, Pack2, StorageOrder,
 //  4  5  6  7   16 17 18 19   25 28
 //  8  9 10 11   20 21 22 23   26 29
 //  .  .  .  .    .  .  .  .    .  .
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
-{
+template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode> {
   typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename DataMapper::LinearMapper LinearMapper;
   enum { PacketSize = packet_traits<Scalar>::size };
-  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
+  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0);
 };
 
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, ColMajor, Conjugate, PanelMode>
-  ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
-{
+template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, DataMapper, nr, ColMajor, Conjugate, PanelMode>::operator()(
+    Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride, Index offset) {
   EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS COLMAJOR");
-  EIGEN_UNUSED_VARIABLE(stride)
-  EIGEN_UNUSED_VARIABLE(offset)
-  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
+  EIGEN_UNUSED_VARIABLE(stride);
+  EIGEN_UNUSED_VARIABLE(offset);
+  eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
   conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
-  Index packet_cols = (cols/nr) * nr;
+  Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
+  Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
   Index count = 0;
-  for(Index j2=0; j2<packet_cols; j2+=nr)
-  {
-    // skip what we have before
-    if(PanelMode) count += nr * offset;
-    const Scalar* b0 = &rhs[(j2+0)*rhsStride];
-    const Scalar* b1 = &rhs[(j2+1)*rhsStride];
-    const Scalar* b2 = &rhs[(j2+2)*rhsStride];
-    const Scalar* b3 = &rhs[(j2+3)*rhsStride];
-    for(Index k=0; k<depth; k++)
-    {
-                blockB[count+0] = cj(b0[k]);
-                blockB[count+1] = cj(b1[k]);
-      if(nr==4) blockB[count+2] = cj(b2[k]);
-      if(nr==4) blockB[count+3] = cj(b3[k]);
-      count += nr;
+  const Index peeled_k = (depth / PacketSize) * PacketSize;
+
+#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
+  EIGEN_IF_CONSTEXPR(nr >= 8) {
+    for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
+      // skip what we have before
+      if (PanelMode) count += 8 * offset;
+      const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+      const LinearMapper dm4 = rhs.getLinearMapper(0, j2 + 4);
+      const LinearMapper dm5 = rhs.getLinearMapper(0, j2 + 5);
+      const LinearMapper dm6 = rhs.getLinearMapper(0, j2 + 6);
+      const LinearMapper dm7 = rhs.getLinearMapper(0, j2 + 7);
+      Index k = 0;
+      if (PacketSize % 2 == 0 && PacketSize <= 8)  // 2 4 8
+      {
+        for (; k < peeled_k; k += PacketSize) {
+          if (PacketSize == 2) {
+            PacketBlock<Packet, PacketSize == 2 ? 2 : PacketSize> kernel0, kernel1, kernel2, kernel3;
+            kernel0.packet[0 % PacketSize] = dm0.template loadPacket<Packet>(k);
+            kernel0.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
+            kernel1.packet[0 % PacketSize] = dm2.template loadPacket<Packet>(k);
+            kernel1.packet[1 % PacketSize] = dm3.template loadPacket<Packet>(k);
+            kernel2.packet[0 % PacketSize] = dm4.template loadPacket<Packet>(k);
+            kernel2.packet[1 % PacketSize] = dm5.template loadPacket<Packet>(k);
+            kernel3.packet[0 % PacketSize] = dm6.template loadPacket<Packet>(k);
+            kernel3.packet[1 % PacketSize] = dm7.template loadPacket<Packet>(k);
+            ptranspose(kernel0);
+            ptranspose(kernel1);
+            ptranspose(kernel2);
+            ptranspose(kernel3);
+
+            pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize]));
+            pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel1.packet[0 % PacketSize]));
+            pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel2.packet[0 % PacketSize]));
+            pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel3.packet[0 % PacketSize]));
+
+            pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize]));
+            pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel1.packet[1 % PacketSize]));
+            pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel2.packet[1 % PacketSize]));
+            pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel3.packet[1 % PacketSize]));
+            count += 8 * PacketSize;
+          } else if (PacketSize == 4) {
+            PacketBlock<Packet, PacketSize == 4 ? 4 : PacketSize> kernel0, kernel1;
+
+            kernel0.packet[0 % PacketSize] = dm0.template loadPacket<Packet>(k);
+            kernel0.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
+            kernel0.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
+            kernel0.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
+            kernel1.packet[0 % PacketSize] = dm4.template loadPacket<Packet>(k);
+            kernel1.packet[1 % PacketSize] = dm5.template loadPacket<Packet>(k);
+            kernel1.packet[2 % PacketSize] = dm6.template loadPacket<Packet>(k);
+            kernel1.packet[3 % PacketSize] = dm7.template loadPacket<Packet>(k);
+            ptranspose(kernel0);
+            ptranspose(kernel1);
+
+            pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize]));
+            pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel1.packet[0 % PacketSize]));
+            pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize]));
+            pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel1.packet[1 % PacketSize]));
+            pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[2 % PacketSize]));
+            pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel1.packet[2 % PacketSize]));
+            pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel0.packet[3 % PacketSize]));
+            pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel1.packet[3 % PacketSize]));
+            count += 8 * PacketSize;
+          } else if (PacketSize == 8) {
+            PacketBlock<Packet, PacketSize == 8 ? 8 : PacketSize> kernel0;
+
+            kernel0.packet[0 % PacketSize] = dm0.template loadPacket<Packet>(k);
+            kernel0.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
+            kernel0.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
+            kernel0.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
+            kernel0.packet[4 % PacketSize] = dm4.template loadPacket<Packet>(k);
+            kernel0.packet[5 % PacketSize] = dm5.template loadPacket<Packet>(k);
+            kernel0.packet[6 % PacketSize] = dm6.template loadPacket<Packet>(k);
+            kernel0.packet[7 % PacketSize] = dm7.template loadPacket<Packet>(k);
+            ptranspose(kernel0);
+
+            pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel0.packet[0 % PacketSize]));
+            pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel0.packet[1 % PacketSize]));
+            pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel0.packet[2 % PacketSize]));
+            pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel0.packet[3 % PacketSize]));
+            pstoreu(blockB + count + 4 * PacketSize, cj.pconj(kernel0.packet[4 % PacketSize]));
+            pstoreu(blockB + count + 5 * PacketSize, cj.pconj(kernel0.packet[5 % PacketSize]));
+            pstoreu(blockB + count + 6 * PacketSize, cj.pconj(kernel0.packet[6 % PacketSize]));
+            pstoreu(blockB + count + 7 * PacketSize, cj.pconj(kernel0.packet[7 % PacketSize]));
+            count += 8 * PacketSize;
+          }
+        }
+      }
+
+      for (; k < depth; k++) {
+        blockB[count + 0] = cj(dm0(k));
+        blockB[count + 1] = cj(dm1(k));
+        blockB[count + 2] = cj(dm2(k));
+        blockB[count + 3] = cj(dm3(k));
+        blockB[count + 4] = cj(dm4(k));
+        blockB[count + 5] = cj(dm5(k));
+        blockB[count + 6] = cj(dm6(k));
+        blockB[count + 7] = cj(dm7(k));
+        count += 8;
+      }
+      // skip what we have after
+      if (PanelMode) count += 8 * (stride - offset - depth);
+    }
+  }
+#endif
+
+  EIGEN_IF_CONSTEXPR(nr >= 4) {
+    for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
+      // skip what we have before
+      if (PanelMode) count += 4 * offset;
+      const LinearMapper dm0 = rhs.getLinearMapper(0, j2 + 0);
+      const LinearMapper dm1 = rhs.getLinearMapper(0, j2 + 1);
+      const LinearMapper dm2 = rhs.getLinearMapper(0, j2 + 2);
+      const LinearMapper dm3 = rhs.getLinearMapper(0, j2 + 3);
+
+      Index k = 0;
+      if ((PacketSize % 4) == 0)  // TODO enable vectorized transposition for PacketSize==2 ??
+      {
+        for (; k < peeled_k; k += PacketSize) {
+          PacketBlock<Packet, (PacketSize % 4) == 0 ? 4 : PacketSize> kernel;
+          kernel.packet[0] = dm0.template loadPacket<Packet>(k);
+          kernel.packet[1 % PacketSize] = dm1.template loadPacket<Packet>(k);
+          kernel.packet[2 % PacketSize] = dm2.template loadPacket<Packet>(k);
+          kernel.packet[3 % PacketSize] = dm3.template loadPacket<Packet>(k);
+          ptranspose(kernel);
+          pstoreu(blockB + count + 0 * PacketSize, cj.pconj(kernel.packet[0]));
+          pstoreu(blockB + count + 1 * PacketSize, cj.pconj(kernel.packet[1 % PacketSize]));
+          pstoreu(blockB + count + 2 * PacketSize, cj.pconj(kernel.packet[2 % PacketSize]));
+          pstoreu(blockB + count + 3 * PacketSize, cj.pconj(kernel.packet[3 % PacketSize]));
+          count += 4 * PacketSize;
+        }
+      }
+      for (; k < depth; k++) {
+        blockB[count + 0] = cj(dm0(k));
+        blockB[count + 1] = cj(dm1(k));
+        blockB[count + 2] = cj(dm2(k));
+        blockB[count + 3] = cj(dm3(k));
+        count += 4;
+      }
+      // skip what we have after
+      if (PanelMode) count += 4 * (stride - offset - depth);
     }
-    // skip what we have after
-    if(PanelMode) count += nr * (stride-offset-depth);
   }
 
   // copy the remaining columns one at a time (nr==1)
-  for(Index j2=packet_cols; j2<cols; ++j2)
-  {
-    if(PanelMode) count += offset;
-    const Scalar* b0 = &rhs[(j2+0)*rhsStride];
-    for(Index k=0; k<depth; k++)
-    {
-      blockB[count] = cj(b0[k]);
+  for (Index j2 = packet_cols4; j2 < cols; ++j2) {
+    if (PanelMode) count += offset;
+    const LinearMapper dm0 = rhs.getLinearMapper(0, j2);
+    for (Index k = 0; k < depth; k++) {
+      blockB[count] = cj(dm0(k));
       count += 1;
     }
-    if(PanelMode) count += (stride-offset-depth);
+    if (PanelMode) count += (stride - offset - depth);
   }
 }
 
 // this version is optimized for row major matrices
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-struct gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
-{
-  enum { PacketSize = packet_traits<Scalar>::size };
-  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride=0, Index offset=0);
-};
+template <typename Scalar, typename Index, typename DataMapper, int nr, bool Conjugate, bool PanelMode>
+struct gemm_pack_rhs<Scalar, Index, DataMapper, nr, RowMajor, Conjugate, PanelMode> {
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+  typedef typename unpacket_traits<typename unpacket_traits<Packet>::half>::half QuarterPacket;
+  typedef typename DataMapper::LinearMapper LinearMapper;
+  enum {
+    PacketSize = packet_traits<Scalar>::size,
+    HalfPacketSize = unpacket_traits<HalfPacket>::size,
+    QuarterPacketSize = unpacket_traits<QuarterPacket>::size
+  };
+  EIGEN_DONT_INLINE void operator()(Scalar* blockB, const DataMapper& rhs, Index depth, Index cols, Index stride = 0,
+                                    Index offset = 0) {
+    EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
+    EIGEN_UNUSED_VARIABLE(stride);
+    EIGEN_UNUSED_VARIABLE(offset);
+    eigen_assert(((!PanelMode) && stride == 0 && offset == 0) || (PanelMode && stride >= depth && offset <= stride));
+    const bool HasHalf = (int)HalfPacketSize < (int)PacketSize;
+    const bool HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize;
+    conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
+    Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
+    Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
+    Index count = 0;
+
+#if EIGEN_ARCH_ARM64 || EIGEN_ARCH_LOONGARCH64
+    EIGEN_IF_CONSTEXPR(nr >= 8) {
+      for (Index j2 = 0; j2 < packet_cols8; j2 += 8) {
+        // skip what we have before
+        if (PanelMode) count += 8 * offset;
+        for (Index k = 0; k < depth; k++) {
+          if (PacketSize == 8) {
+            Packet A = rhs.template loadPacket<Packet>(k, j2);
+            pstoreu(blockB + count, cj.pconj(A));
+            count += PacketSize;
+          } else if (PacketSize == 4) {
+            Packet A = rhs.template loadPacket<Packet>(k, j2);
+            Packet B = rhs.template loadPacket<Packet>(k, j2 + 4);
+            pstoreu(blockB + count, cj.pconj(A));
+            pstoreu(blockB + count + PacketSize, cj.pconj(B));
+            count += 2 * PacketSize;
+          } else {
+            const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
+            blockB[count + 0] = cj(dm0(0));
+            blockB[count + 1] = cj(dm0(1));
+            blockB[count + 2] = cj(dm0(2));
+            blockB[count + 3] = cj(dm0(3));
+            blockB[count + 4] = cj(dm0(4));
+            blockB[count + 5] = cj(dm0(5));
+            blockB[count + 6] = cj(dm0(6));
+            blockB[count + 7] = cj(dm0(7));
+            count += 8;
+          }
+        }
+        // skip what we have after
+        if (PanelMode) count += 8 * (stride - offset - depth);
+      }
+    }
+#endif
 
-template<typename Scalar, typename Index, int nr, bool Conjugate, bool PanelMode>
-EIGEN_DONT_INLINE void gemm_pack_rhs<Scalar, Index, nr, RowMajor, Conjugate, PanelMode>
-  ::operator()(Scalar* blockB, const Scalar* rhs, Index rhsStride, Index depth, Index cols, Index stride, Index offset)
-{
-  EIGEN_ASM_COMMENT("EIGEN PRODUCT PACK RHS ROWMAJOR");
-  EIGEN_UNUSED_VARIABLE(stride)
-  EIGEN_UNUSED_VARIABLE(offset)
-  eigen_assert(((!PanelMode) && stride==0 && offset==0) || (PanelMode && stride>=depth && offset<=stride));
-  conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
-  Index packet_cols = (cols/nr) * nr;
-  Index count = 0;
-  for(Index j2=0; j2<packet_cols; j2+=nr)
-  {
-    // skip what we have before
-    if(PanelMode) count += nr * offset;
-    for(Index k=0; k<depth; k++)
-    {
-      const Scalar* b0 = &rhs[k*rhsStride + j2];
-                blockB[count+0] = cj(b0[0]);
-                blockB[count+1] = cj(b0[1]);
-      if(nr==4) blockB[count+2] = cj(b0[2]);
-      if(nr==4) blockB[count+3] = cj(b0[3]);
-      count += nr;
+    if (nr >= 4) {
+      for (Index j2 = packet_cols8; j2 < packet_cols4; j2 += 4) {
+        // skip what we have before
+        if (PanelMode) count += 4 * offset;
+        for (Index k = 0; k < depth; k++) {
+          if (PacketSize == 4) {
+            Packet A = rhs.template loadPacket<Packet>(k, j2);
+            pstoreu(blockB + count, cj.pconj(A));
+            count += PacketSize;
+          } else if (HasHalf && HalfPacketSize == 4) {
+            HalfPacket A = rhs.template loadPacket<HalfPacket>(k, j2);
+            pstoreu(blockB + count, cj.pconj(A));
+            count += HalfPacketSize;
+          } else if (HasQuarter && QuarterPacketSize == 4) {
+            QuarterPacket A = rhs.template loadPacket<QuarterPacket>(k, j2);
+            pstoreu(blockB + count, cj.pconj(A));
+            count += QuarterPacketSize;
+          } else {
+            const LinearMapper dm0 = rhs.getLinearMapper(k, j2);
+            blockB[count + 0] = cj(dm0(0));
+            blockB[count + 1] = cj(dm0(1));
+            blockB[count + 2] = cj(dm0(2));
+            blockB[count + 3] = cj(dm0(3));
+            count += 4;
+          }
+        }
+        // skip what we have after
+        if (PanelMode) count += 4 * (stride - offset - depth);
+      }
     }
-    // skip what we have after
-    if(PanelMode) count += nr * (stride-offset-depth);
-  }
-  // copy the remaining columns one at a time (nr==1)
-  for(Index j2=packet_cols; j2<cols; ++j2)
-  {
-    if(PanelMode) count += offset;
-    const Scalar* b0 = &rhs[j2];
-    for(Index k=0; k<depth; k++)
-    {
-      blockB[count] = cj(b0[k*rhsStride]);
-      count += 1;
+    // copy the remaining columns one at a time (nr==1)
+    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
+      if (PanelMode) count += offset;
+      for (Index k = 0; k < depth; k++) {
+        blockB[count] = cj(rhs(k, j2));
+        count += 1;
+      }
+      if (PanelMode) count += stride - offset - depth;
     }
-    if(PanelMode) count += stride-offset-depth;
   }
-}
+};
 
-} // end namespace internal
+}  // end namespace internal
 
 /** \returns the currently set level 1 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
-  * \sa setCpuCacheSize */
-inline std::ptrdiff_t l1CacheSize()
-{
-  std::ptrdiff_t l1, l2;
-  internal::manage_caching_sizes(GetAction, &l1, &l2);
+ * \sa setCpuCacheSize */
+inline std::ptrdiff_t l1CacheSize() {
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
   return l1;
 }
 
 /** \returns the currently set level 2 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
-  * \sa setCpuCacheSize */
-inline std::ptrdiff_t l2CacheSize()
-{
-  std::ptrdiff_t l1, l2;
-  internal::manage_caching_sizes(GetAction, &l1, &l2);
+ * \sa setCpuCacheSize */
+inline std::ptrdiff_t l2CacheSize() {
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
   return l2;
 }
 
+/** \returns the currently set level 3 cpu cache size (in bytes) used to estimate the ideal blocking size parameters.
+ * \sa setCpuCacheSize */
+inline std::ptrdiff_t l3CacheSize() {
+  std::ptrdiff_t l1, l2, l3;
+  internal::manage_caching_sizes(GetAction, &l1, &l2, &l3);
+  return l3;
+}
+
 /** Set the cpu L1 and L2 cache sizes (in bytes).
-  * These values are use to adjust the size of the blocks
-  * for the algorithms working per blocks.
-  *
-  * \sa computeProductBlockingSizes */
-inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2)
-{
-  internal::manage_caching_sizes(SetAction, &l1, &l2);
+ * These values are use to adjust the size of the blocks
+ * for the algorithms working per blocks.
+ *
+ * \sa computeProductBlockingSizes */
+inline void setCpuCacheSizes(std::ptrdiff_t l1, std::ptrdiff_t l2, std::ptrdiff_t l3) {
+  internal::manage_caching_sizes(SetAction, &l1, &l2, &l3);
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_GENERAL_BLOCK_PANEL_H
+#endif  // EIGEN_GENERAL_BLOCK_PANEL_H
diff --git a/inst/include/Eigen/src/Core/products/GeneralMatrixMatrix.h b/inst/include/Eigen/src/Core/products/GeneralMatrixMatrix.h
index 3f5ffcf5..ebfac014 100644
--- a/inst/include/Eigen/src/Core/products/GeneralMatrixMatrix.h
+++ b/inst/include/Eigen/src/Core/products/GeneralMatrixMatrix.h
@@ -10,418 +10,450 @@
 #ifndef EIGEN_GENERAL_MATRIX_MATRIX_H
 #define EIGEN_GENERAL_MATRIX_MATRIX_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
-template<typename _LhsScalar, typename _RhsScalar> class level3_blocking;
+template <typename LhsScalar_, typename RhsScalar_>
+class level3_blocking;
 
 /* Specialization for a row-major destination matrix => simple transposition of the product */
-template<
-  typename Index,
-  typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
-  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
-struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor>
-{
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
-  static EIGEN_STRONG_INLINE void run(
-    Index rows, Index cols, Index depth,
-    const LhsScalar* lhs, Index lhsStride,
-    const RhsScalar* rhs, Index rhsStride,
-    ResScalar* res, Index resStride,
-    ResScalar alpha,
-    level3_blocking<RhsScalar,LhsScalar>& blocking,
-    GemmParallelInfo<Index>* info = 0)
-  {
+template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar,
+          int RhsStorageOrder, bool ConjugateRhs, int ResInnerStride>
+struct general_matrix_matrix_product<Index, LhsScalar, LhsStorageOrder, ConjugateLhs, RhsScalar, RhsStorageOrder,
+                                     ConjugateRhs, RowMajor, ResInnerStride> {
+  typedef gebp_traits<RhsScalar, LhsScalar> Traits;
+
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  static EIGEN_STRONG_INLINE void run(Index rows, Index cols, Index depth, const LhsScalar* lhs, Index lhsStride,
+                                      const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resIncr,
+                                      Index resStride, ResScalar alpha, level3_blocking<RhsScalar, LhsScalar>& blocking,
+                                      GemmParallelInfo<Index>* info = 0) {
     // transpose the product such that the result is column major
-    general_matrix_matrix_product<Index,
-      RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
-      LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
-      ColMajor>
-    ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info);
+    general_matrix_matrix_product<Index, RhsScalar, RhsStorageOrder == RowMajor ? ColMajor : RowMajor, ConjugateRhs,
+                                  LhsScalar, LhsStorageOrder == RowMajor ? ColMajor : RowMajor, ConjugateLhs, ColMajor,
+                                  ResInnerStride>::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resIncr,
+                                                       resStride, alpha, blocking, info);
   }
 };
 
 /*  Specialization for a col-major destination matrix
  *    => Blocking algorithm following Goto's paper */
-template<
-  typename Index,
-  typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
-  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs>
-struct general_matrix_matrix_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor>
-{
-
-typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
-static void run(Index rows, Index cols, Index depth,
-  const LhsScalar* _lhs, Index lhsStride,
-  const RhsScalar* _rhs, Index rhsStride,
-  ResScalar* res, Index resStride,
-  ResScalar alpha,
-  level3_blocking<LhsScalar,RhsScalar>& blocking,
-  GemmParallelInfo<Index>* info = 0)
-{
-  const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-  const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
-
-  typedef gebp_traits<LhsScalar,RhsScalar> Traits;
-
-  Index kc = blocking.kc();                   // cache block size along the K direction
-  Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
-  //Index nc = blocking.nc(); // cache block size along the N direction
-
-  gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
-  gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
-  gebp_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
-
-#ifdef EIGEN_HAS_OPENMP
-  if(info)
-  {
-    // this is the parallel version!
-    Index tid = omp_get_thread_num();
-    Index threads = omp_get_num_threads();
-    
-    std::size_t sizeA = kc*mc;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, 0);
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, w, sizeW, 0);
-    
-    RhsScalar* blockB = blocking.blockB();
-    eigen_internal_assert(blockB!=0);
-
-    // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
-    for(Index k=0; k<depth; k+=kc)
+template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar,
+          int RhsStorageOrder, bool ConjugateRhs, int ResInnerStride>
+struct general_matrix_matrix_product<Index, LhsScalar, LhsStorageOrder, ConjugateLhs, RhsScalar, RhsStorageOrder,
+                                     ConjugateRhs, ColMajor, ResInnerStride> {
+  typedef gebp_traits<LhsScalar, RhsScalar> Traits;
+
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  static void run(Index rows, Index cols, Index depth, const LhsScalar* lhs_, Index lhsStride, const RhsScalar* rhs_,
+                  Index rhsStride, ResScalar* res_, Index resIncr, Index resStride, ResScalar alpha,
+                  level3_blocking<LhsScalar, RhsScalar>& blocking, GemmParallelInfo<Index>* info = 0) {
+    typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+    LhsMapper lhs(lhs_, lhsStride);
+    RhsMapper rhs(rhs_, rhsStride);
+    ResMapper res(res_, resStride, resIncr);
+
+    Index kc = blocking.kc();                    // cache block size along the K direction
+    Index mc = (std::min)(rows, blocking.mc());  // cache block size along the M direction
+    Index nc = (std::min)(cols, blocking.nc());  // cache block size along the N direction
+
+    gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing,
+                  LhsStorageOrder>
+        pack_lhs;
+    gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+    gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+
+#if !defined(EIGEN_USE_BLAS) && (defined(EIGEN_HAS_OPENMP) || defined(EIGEN_GEMM_THREADPOOL))
+    if (info) {
+      // this is the parallel version!
+      int tid = info->logical_thread_id;
+      int threads = info->num_threads;
+
+      LhsScalar* blockA = blocking.blockA();
+      eigen_internal_assert(blockA != 0);
+
+      std::size_t sizeB = kc * nc;
+      ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, 0);
+
+      // For each horizontal panel of the rhs, and corresponding vertical panel of the lhs...
+      for (Index k = 0; k < depth; k += kc) {
+        const Index actual_kc = (std::min)(k + kc, depth) - k;  // => rows of B', and cols of the A'
+
+        // In order to reduce the chance that a thread has to wait for the other,
+        // let's start by packing B'.
+        pack_rhs(blockB, rhs.getSubMapper(k, 0), actual_kc, nc);
+
+        // Pack A_k to A' in a parallel fashion:
+        // each thread packs the sub block A_k,i to A'_i where i is the thread id.
+
+        // However, before copying to A'_i, we have to make sure that no other thread is still using it,
+        // i.e., we test that info->task_info[tid].users equals 0.
+        // Then, we set info->task_info[tid].users to the number of threads to mark that all other threads are going to
+        // use it.
+        while (info->task_info[tid].users != 0) {
+          std::this_thread::yield();
+        }
+        info->task_info[tid].users = threads;
+
+        pack_lhs(blockA + info->task_info[tid].lhs_start * actual_kc,
+                 lhs.getSubMapper(info->task_info[tid].lhs_start, k), actual_kc, info->task_info[tid].lhs_length);
+
+        // Notify the other threads that the part A'_i is ready to go.
+        info->task_info[tid].sync = k;
+
+        // Computes C_i += A' * B' per A'_i
+        for (int shift = 0; shift < threads; ++shift) {
+          int i = (tid + shift) % threads;
+
+          // At this point we have to make sure that A'_i has been updated by the thread i,
+          // we use testAndSetOrdered to mimic a volatile access.
+          // However, no need to wait for the B' part which has been updated by the current thread!
+          if (shift > 0) {
+            while (info->task_info[i].sync != k) {
+              std::this_thread::yield();
+            }
+          }
+
+          gebp(res.getSubMapper(info->task_info[i].lhs_start, 0), blockA + info->task_info[i].lhs_start * actual_kc,
+               blockB, info->task_info[i].lhs_length, actual_kc, nc, alpha);
+        }
+
+        // Then keep going as usual with the remaining B'
+        for (Index j = nc; j < cols; j += nc) {
+          const Index actual_nc = (std::min)(j + nc, cols) - j;
+
+          // pack B_k,j to B'
+          pack_rhs(blockB, rhs.getSubMapper(k, j), actual_kc, actual_nc);
+
+          // C_j += A' * B'
+          gebp(res.getSubMapper(0, j), blockA, blockB, rows, actual_kc, actual_nc, alpha);
+        }
+
+        // Release all the sub blocks A'_i of A' for the current thread,
+        // i.e., we simply decrement the number of users by 1
+        for (Index i = 0; i < threads; ++i) info->task_info[i].users -= 1;
+      }
+    } else
+#endif  // defined(EIGEN_HAS_OPENMP) || defined(EIGEN_GEMM_THREADPOOL)
     {
-      const Index actual_kc = (std::min)(k+kc,depth)-k; // => rows of B', and cols of the A'
+      EIGEN_UNUSED_VARIABLE(info);
 
-      // In order to reduce the chance that a thread has to wait for the other,
-      // let's start by packing A'.
-      pack_lhs(blockA, &lhs(0,k), lhsStride, actual_kc, mc);
+      // this is the sequential version!
+      std::size_t sizeA = kc * mc;
+      std::size_t sizeB = kc * nc;
 
-      // Pack B_k to B' in a parallel fashion:
-      // each thread packs the sub block B_k,j to B'_j where j is the thread id.
+      ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
+      ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
 
-      // However, before copying to B'_j, we have to make sure that no other thread is still using it,
-      // i.e., we test that info[tid].users equals 0.
-      // Then, we set info[tid].users to the number of threads to mark that all other threads are going to use it.
-      while(info[tid].users!=0) {}
-      info[tid].users += threads;
+      const bool pack_rhs_once = mc != rows && kc == depth && nc == cols;
 
-      pack_rhs(blockB+info[tid].rhs_start*actual_kc, &rhs(k,info[tid].rhs_start), rhsStride, actual_kc, info[tid].rhs_length);
+      // For each horizontal panel of the rhs, and corresponding panel of the lhs...
+      for (Index i2 = 0; i2 < rows; i2 += mc) {
+        const Index actual_mc = (std::min)(i2 + mc, rows) - i2;
 
-      // Notify the other threads that the part B'_j is ready to go.
-      info[tid].sync = k;
+        for (Index k2 = 0; k2 < depth; k2 += kc) {
+          const Index actual_kc = (std::min)(k2 + kc, depth) - k2;
 
-      // Computes C_i += A' * B' per B'_j
-      for(Index shift=0; shift<threads; ++shift)
-      {
-        Index j = (tid+shift)%threads;
+          // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
+          // => Pack lhs's panel into a sequential chunk of memory (L2/L3 caching)
+          // Note that this panel will be read as many times as the number of blocks in the rhs's
+          // horizontal panel which is, in practice, a very low number.
+          pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
 
-        // At this point we have to make sure that B'_j has been updated by the thread j,
-        // we use testAndSetOrdered to mimic a volatile access.
-        // However, no need to wait for the B' part which has been updated by the current thread!
-        if(shift>0)
-          while(info[j].sync!=k) {}
-
-        gebp(res+info[j].rhs_start*resStride, resStride, blockA, blockB+info[j].rhs_start*actual_kc, mc, actual_kc, info[j].rhs_length, alpha, -1,-1,0,0, w);
-      }
+          // For each kc x nc block of the rhs's horizontal panel...
+          for (Index j2 = 0; j2 < cols; j2 += nc) {
+            const Index actual_nc = (std::min)(j2 + nc, cols) - j2;
 
-      // Then keep going as usual with the remaining A'
-      for(Index i=mc; i<rows; i+=mc)
-      {
-        const Index actual_mc = (std::min)(i+mc,rows)-i;
+            // We pack the rhs's block into a sequential chunk of memory (L2 caching)
+            // Note that this block will be read a very high number of times, which is equal to the number of
+            // micro horizontal panel of the large rhs's panel (e.g., rows/12 times).
+            if ((!pack_rhs_once) || i2 == 0) pack_rhs(blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc);
 
-        // pack A_i,k to A'
-        pack_lhs(blockA, &lhs(i,k), lhsStride, actual_kc, actual_mc);
-
-        // C_i += A' * B'
-        gebp(res+i, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1,-1,0,0, w);
+            // Everything is packed, we can now call the panel * block kernel:
+            gebp(res.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, alpha);
+          }
+        }
       }
-
-      // Release all the sub blocks B'_j of B' for the current thread,
-      // i.e., we simply decrement the number of users by 1
-      for(Index j=0; j<threads; ++j)
-        #pragma omp atomic
-        --(info[j].users);
     }
   }
-  else
-#endif // EIGEN_HAS_OPENMP
-  {
-    EIGEN_UNUSED_VARIABLE(info);
-
-    // this is the sequential version!
-    std::size_t sizeA = kc*mc;
-    std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-
-    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockW, sizeW, blocking.blockW());
-
-    // For each horizontal panel of the rhs, and corresponding panel of the lhs...
-    // (==GEMM_VAR1)
-    for(Index k2=0; k2<depth; k2+=kc)
-    {
-      const Index actual_kc = (std::min)(k2+kc,depth)-k2;
-
-      // OK, here we have selected one horizontal panel of rhs and one vertical panel of lhs.
-      // => Pack rhs's panel into a sequential chunk of memory (L2 caching)
-      // Note that this panel will be read as many times as the number of blocks in the lhs's
-      // vertical panel which is, in practice, a very low number.
-      pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
-
-      // For each mc x kc block of the lhs's vertical panel...
-      // (==GEPP_VAR1)
-      for(Index i2=0; i2<rows; i2+=mc)
-      {
-        const Index actual_mc = (std::min)(i2+mc,rows)-i2;
-
-        // We pack the lhs's block into a sequential chunk of memory (L1 caching)
-        // Note that this block will be read a very high number of times, which is equal to the number of
-        // micro vertical panel of the large rhs's panel (e.g., cols/4 times).
-        pack_lhs(blockA, &lhs(i2,k2), lhsStride, actual_kc, actual_mc);
-
-        // Everything is packed, we can now call the block * panel kernel:
-        gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
-      }
-    }
-  }
-}
-
 };
 
 /*********************************************************************************
-*  Specialization of GeneralProduct<> for "large" GEMM, i.e.,
-*  implementation of the high level wrapper to general_matrix_matrix_product
-**********************************************************************************/
-
-template<typename Lhs, typename Rhs>
-struct traits<GeneralProduct<Lhs,Rhs,GemmProduct> >
- : traits<ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs> >
-{};
-
-template<typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest, typename BlockingType>
-struct gemm_functor
-{
-  gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha,
-                  BlockingType& blocking)
-    : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking)
-  {}
-
-  void initParallelSession() const
-  {
-    m_blocking.allocateB();
+ *  Specialization of generic_product_impl for "large" GEMM, i.e.,
+ *  implementation of the high level wrapper to general_matrix_matrix_product
+ **********************************************************************************/
+
+template <typename Scalar, typename Index, typename Gemm, typename Lhs, typename Rhs, typename Dest,
+          typename BlockingType>
+struct gemm_functor {
+  gemm_functor(const Lhs& lhs, const Rhs& rhs, Dest& dest, const Scalar& actualAlpha, BlockingType& blocking)
+      : m_lhs(lhs), m_rhs(rhs), m_dest(dest), m_actualAlpha(actualAlpha), m_blocking(blocking) {}
+
+  void initParallelSession(Index num_threads) const {
+    m_blocking.initParallel(m_lhs.rows(), m_rhs.cols(), m_lhs.cols(), num_threads);
+    m_blocking.allocateA();
   }
 
-  void operator() (Index row, Index rows, Index col=0, Index cols=-1, GemmParallelInfo<Index>* info=0) const
-  {
-    if(cols==-1)
-      cols = m_rhs.cols();
+  void operator()(Index row, Index rows, Index col = 0, Index cols = -1, GemmParallelInfo<Index>* info = 0) const {
+    if (cols == -1) cols = m_rhs.cols();
 
-    Gemm::run(rows, cols, m_lhs.cols(),
-              /*(const Scalar*)*/&m_lhs.coeffRef(row,0), m_lhs.outerStride(),
-              /*(const Scalar*)*/&m_rhs.coeffRef(0,col), m_rhs.outerStride(),
-              (Scalar*)&(m_dest.coeffRef(row,col)), m_dest.outerStride(),
+    Gemm::run(rows, cols, m_lhs.cols(), &m_lhs.coeffRef(row, 0), m_lhs.outerStride(), &m_rhs.coeffRef(0, col),
+              m_rhs.outerStride(), (Scalar*)&(m_dest.coeffRef(row, col)), m_dest.innerStride(), m_dest.outerStride(),
               m_actualAlpha, m_blocking, info);
   }
 
-  protected:
-    const Lhs& m_lhs;
-    const Rhs& m_rhs;
-    Dest& m_dest;
-    Scalar m_actualAlpha;
-    BlockingType& m_blocking;
-};
+  typedef typename Gemm::Traits Traits;
 
-template<int StorageOrder, typename LhsScalar, typename RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor=1,
-bool FiniteAtCompileTime = MaxRows!=Dynamic && MaxCols!=Dynamic && MaxDepth != Dynamic> class gemm_blocking_space;
+ protected:
+  const Lhs& m_lhs;
+  const Rhs& m_rhs;
+  Dest& m_dest;
+  Scalar m_actualAlpha;
+  BlockingType& m_blocking;
+};
 
-template<typename _LhsScalar, typename _RhsScalar>
-class level3_blocking
-{
-    typedef _LhsScalar LhsScalar;
-    typedef _RhsScalar RhsScalar;
+template <int StorageOrder, typename LhsScalar, typename RhsScalar, int MaxRows, int MaxCols, int MaxDepth,
+          int KcFactor = 1, bool FiniteAtCompileTime = MaxRows != Dynamic && MaxCols != Dynamic && MaxDepth != Dynamic>
+class gemm_blocking_space;
 
-  protected:
-    LhsScalar* m_blockA;
-    RhsScalar* m_blockB;
-    RhsScalar* m_blockW;
+template <typename LhsScalar_, typename RhsScalar_>
+class level3_blocking {
+  typedef LhsScalar_ LhsScalar;
+  typedef RhsScalar_ RhsScalar;
 
-    DenseIndex m_mc;
-    DenseIndex m_nc;
-    DenseIndex m_kc;
+ protected:
+  LhsScalar* m_blockA;
+  RhsScalar* m_blockB;
 
-  public:
+  Index m_mc;
+  Index m_nc;
+  Index m_kc;
 
-    level3_blocking()
-      : m_blockA(0), m_blockB(0), m_blockW(0), m_mc(0), m_nc(0), m_kc(0)
-    {}
+ public:
+  level3_blocking() : m_blockA(0), m_blockB(0), m_mc(0), m_nc(0), m_kc(0) {}
 
-    inline DenseIndex mc() const { return m_mc; }
-    inline DenseIndex nc() const { return m_nc; }
-    inline DenseIndex kc() const { return m_kc; }
+  inline Index mc() const { return m_mc; }
+  inline Index nc() const { return m_nc; }
+  inline Index kc() const { return m_kc; }
 
-    inline LhsScalar* blockA() { return m_blockA; }
-    inline RhsScalar* blockB() { return m_blockB; }
-    inline RhsScalar* blockW() { return m_blockW; }
+  inline LhsScalar* blockA() { return m_blockA; }
+  inline RhsScalar* blockB() { return m_blockB; }
 };
 
-template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
-class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, true>
-  : public level3_blocking<
-      typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
-      typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
-{
-    enum {
-      Transpose = StorageOrder==RowMajor,
-      ActualRows = Transpose ? MaxCols : MaxRows,
-      ActualCols = Transpose ? MaxRows : MaxCols
-    };
-    typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar;
-    typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar;
-    typedef gebp_traits<LhsScalar,RhsScalar> Traits;
-    enum {
-      SizeA = ActualRows * MaxDepth,
-      SizeB = ActualCols * MaxDepth,
-      SizeW = MaxDepth * Traits::WorkSpaceFactor
-    };
-
-    EIGEN_ALIGN16 LhsScalar m_staticA[SizeA];
-    EIGEN_ALIGN16 RhsScalar m_staticB[SizeB];
-    EIGEN_ALIGN16 RhsScalar m_staticW[SizeW];
-
-  public:
-
-    gemm_blocking_space(DenseIndex /*rows*/, DenseIndex /*cols*/, DenseIndex /*depth*/)
-    {
-      this->m_mc = ActualRows;
-      this->m_nc = ActualCols;
-      this->m_kc = MaxDepth;
-      this->m_blockA = m_staticA;
-      this->m_blockB = m_staticB;
-      this->m_blockW = m_staticW;
-    }
+template <int StorageOrder, typename LhsScalar_, typename RhsScalar_, int MaxRows, int MaxCols, int MaxDepth,
+          int KcFactor>
+class gemm_blocking_space<StorageOrder, LhsScalar_, RhsScalar_, MaxRows, MaxCols, MaxDepth, KcFactor,
+                          true /* == FiniteAtCompileTime */>
+    : public level3_blocking<std::conditional_t<StorageOrder == RowMajor, RhsScalar_, LhsScalar_>,
+                             std::conditional_t<StorageOrder == RowMajor, LhsScalar_, RhsScalar_>> {
+  enum {
+    Transpose = StorageOrder == RowMajor,
+    ActualRows = Transpose ? MaxCols : MaxRows,
+    ActualCols = Transpose ? MaxRows : MaxCols
+  };
+  typedef std::conditional_t<Transpose, RhsScalar_, LhsScalar_> LhsScalar;
+  typedef std::conditional_t<Transpose, LhsScalar_, RhsScalar_> RhsScalar;
+  enum { SizeA = ActualRows * MaxDepth, SizeB = ActualCols * MaxDepth };
+
+#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
+  EIGEN_ALIGN_MAX LhsScalar m_staticA[SizeA];
+  EIGEN_ALIGN_MAX RhsScalar m_staticB[SizeB];
+#else
+  EIGEN_ALIGN_MAX char m_staticA[SizeA * sizeof(LhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES - 1];
+  EIGEN_ALIGN_MAX char m_staticB[SizeB * sizeof(RhsScalar) + EIGEN_DEFAULT_ALIGN_BYTES - 1];
+#endif
+
+ public:
+  gemm_blocking_space(Index /*rows*/, Index /*cols*/, Index /*depth*/, Index /*num_threads*/,
+                      bool /*full_rows = false*/) {
+    this->m_mc = ActualRows;
+    this->m_nc = ActualCols;
+    this->m_kc = MaxDepth;
+#if EIGEN_MAX_STATIC_ALIGN_BYTES >= EIGEN_DEFAULT_ALIGN_BYTES
+    this->m_blockA = m_staticA;
+    this->m_blockB = m_staticB;
+#else
+    this->m_blockA = reinterpret_cast<LhsScalar*>((std::uintptr_t(m_staticA) + (EIGEN_DEFAULT_ALIGN_BYTES - 1)) &
+                                                  ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES - 1));
+    this->m_blockB = reinterpret_cast<RhsScalar*>((std::uintptr_t(m_staticB) + (EIGEN_DEFAULT_ALIGN_BYTES - 1)) &
+                                                  ~std::size_t(EIGEN_DEFAULT_ALIGN_BYTES - 1));
+#endif
+  }
+
+  void initParallel(Index, Index, Index, Index) {}
 
-    inline void allocateA() {}
-    inline void allocateB() {}
-    inline void allocateW() {}
-    inline void allocateAll() {}
+  inline void allocateA() {}
+  inline void allocateB() {}
+  inline void allocateAll() {}
 };
 
-template<int StorageOrder, typename _LhsScalar, typename _RhsScalar, int MaxRows, int MaxCols, int MaxDepth, int KcFactor>
-class gemm_blocking_space<StorageOrder,_LhsScalar,_RhsScalar,MaxRows, MaxCols, MaxDepth, KcFactor, false>
-  : public level3_blocking<
-      typename conditional<StorageOrder==RowMajor,_RhsScalar,_LhsScalar>::type,
-      typename conditional<StorageOrder==RowMajor,_LhsScalar,_RhsScalar>::type>
-{
-    enum {
-      Transpose = StorageOrder==RowMajor
-    };
-    typedef typename conditional<Transpose,_RhsScalar,_LhsScalar>::type LhsScalar;
-    typedef typename conditional<Transpose,_LhsScalar,_RhsScalar>::type RhsScalar;
-    typedef gebp_traits<LhsScalar,RhsScalar> Traits;
-
-    DenseIndex m_sizeA;
-    DenseIndex m_sizeB;
-    DenseIndex m_sizeW;
-
-  public:
-
-    gemm_blocking_space(DenseIndex rows, DenseIndex cols, DenseIndex depth)
+template <int StorageOrder, typename LhsScalar_, typename RhsScalar_, int MaxRows, int MaxCols, int MaxDepth,
+          int KcFactor>
+class gemm_blocking_space<StorageOrder, LhsScalar_, RhsScalar_, MaxRows, MaxCols, MaxDepth, KcFactor, false>
+    : public level3_blocking<std::conditional_t<StorageOrder == RowMajor, RhsScalar_, LhsScalar_>,
+                             std::conditional_t<StorageOrder == RowMajor, LhsScalar_, RhsScalar_>> {
+  enum { Transpose = StorageOrder == RowMajor };
+  typedef std::conditional_t<Transpose, RhsScalar_, LhsScalar_> LhsScalar;
+  typedef std::conditional_t<Transpose, LhsScalar_, RhsScalar_> RhsScalar;
+
+  Index m_sizeA;
+  Index m_sizeB;
+
+ public:
+  gemm_blocking_space(Index rows, Index cols, Index depth, Index num_threads, bool l3_blocking) {
+    this->m_mc = Transpose ? cols : rows;
+    this->m_nc = Transpose ? rows : cols;
+    this->m_kc = depth;
+
+    if (l3_blocking) {
+      computeProductBlockingSizes<LhsScalar, RhsScalar, KcFactor>(this->m_kc, this->m_mc, this->m_nc, num_threads);
+    } else  // no l3 blocking
     {
-      this->m_mc = Transpose ? cols : rows;
-      this->m_nc = Transpose ? rows : cols;
-      this->m_kc = depth;
-
-      computeProductBlockingSizes<LhsScalar,RhsScalar,KcFactor>(this->m_kc, this->m_mc, this->m_nc);
-      m_sizeA = this->m_mc * this->m_kc;
-      m_sizeB = this->m_kc * this->m_nc;
-      m_sizeW = this->m_kc*Traits::WorkSpaceFactor;
+      Index n = this->m_nc;
+      computeProductBlockingSizes<LhsScalar, RhsScalar, KcFactor>(this->m_kc, this->m_mc, n, num_threads);
     }
 
-    void allocateA()
-    {
-      if(this->m_blockA==0)
-        this->m_blockA = aligned_new<LhsScalar>(m_sizeA);
-    }
+    m_sizeA = this->m_mc * this->m_kc;
+    m_sizeB = this->m_kc * this->m_nc;
+  }
 
-    void allocateB()
-    {
-      if(this->m_blockB==0)
-        this->m_blockB = aligned_new<RhsScalar>(m_sizeB);
-    }
+  void initParallel(Index rows, Index cols, Index depth, Index num_threads) {
+    this->m_mc = Transpose ? cols : rows;
+    this->m_nc = Transpose ? rows : cols;
+    this->m_kc = depth;
 
-    void allocateW()
-    {
-      if(this->m_blockW==0)
-        this->m_blockW = aligned_new<RhsScalar>(m_sizeW);
-    }
+    eigen_internal_assert(this->m_blockA == 0 && this->m_blockB == 0);
+    Index m = this->m_mc;
+    computeProductBlockingSizes<LhsScalar, RhsScalar, KcFactor>(this->m_kc, m, this->m_nc, num_threads);
+    m_sizeA = this->m_mc * this->m_kc;
+    m_sizeB = this->m_kc * this->m_nc;
+  }
 
-    void allocateAll()
-    {
-      allocateA();
-      allocateB();
-      allocateW();
-    }
+  void allocateA() {
+    if (this->m_blockA == 0) this->m_blockA = aligned_new<LhsScalar>(m_sizeA);
+  }
 
-    ~gemm_blocking_space()
-    {
-      aligned_delete(this->m_blockA, m_sizeA);
-      aligned_delete(this->m_blockB, m_sizeB);
-      aligned_delete(this->m_blockW, m_sizeW);
-    }
+  void allocateB() {
+    if (this->m_blockB == 0) this->m_blockB = aligned_new<RhsScalar>(m_sizeB);
+  }
+
+  void allocateAll() {
+    allocateA();
+    allocateB();
+  }
+
+  ~gemm_blocking_space() {
+    aligned_delete(this->m_blockA, m_sizeA);
+    aligned_delete(this->m_blockB, m_sizeB);
+  }
 };
 
-} // end namespace internal
-
-template<typename Lhs, typename Rhs>
-class GeneralProduct<Lhs, Rhs, GemmProduct>
-  : public ProductBase<GeneralProduct<Lhs,Rhs,GemmProduct>, Lhs, Rhs>
-{
-    enum {
-      MaxDepthAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(Lhs::MaxColsAtCompileTime,Rhs::MaxRowsAtCompileTime)
-    };
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(GeneralProduct)
-    
-    typedef typename  Lhs::Scalar LhsScalar;
-    typedef typename  Rhs::Scalar RhsScalar;
-    typedef           Scalar      ResScalar;
-
-    GeneralProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {
-      typedef internal::scalar_product_op<LhsScalar,RhsScalar> BinOp;
-      EIGEN_CHECK_BINARY_COMPATIBILIY(BinOp,LhsScalar,RhsScalar);
+}  // end namespace internal
+
+namespace internal {
+
+template <typename Lhs, typename Rhs>
+struct generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, GemmProduct>
+    : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, GemmProduct>> {
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+  typedef typename Lhs::Scalar LhsScalar;
+  typedef typename Rhs::Scalar RhsScalar;
+
+  typedef internal::blas_traits<Lhs> LhsBlasTraits;
+  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+  typedef internal::remove_all_t<ActualLhsType> ActualLhsTypeCleaned;
+
+  typedef internal::blas_traits<Rhs> RhsBlasTraits;
+  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  typedef internal::remove_all_t<ActualRhsType> ActualRhsTypeCleaned;
+
+  enum { MaxDepthAtCompileTime = min_size_prefer_fixed(Lhs::MaxColsAtCompileTime, Rhs::MaxRowsAtCompileTime) };
+
+  typedef generic_product_impl<Lhs, Rhs, DenseShape, DenseShape, CoeffBasedProductMode> lazyproduct;
+
+  template <typename Dst>
+  static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    // See http://eigen.tuxfamily.org/bz/show_bug.cgi?id=404 for a discussion and helper program
+    // to determine the following heuristic.
+    // EIGEN_GEMM_TO_COEFFBASED_THRESHOLD is typically defined to 20 in GeneralProduct.h,
+    // unless it has been specialized by the user or for a given architecture.
+    // Note that the condition rhs.rows()>0 was required because lazy product is (was?) not happy with empty inputs.
+    // I'm not sure it is still required.
+    if ((rhs.rows() + dst.rows() + dst.cols()) < EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows() > 0)
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op<typename Dst::Scalar, Scalar>());
+    else {
+      dst.setZero();
+      scaleAndAddTo(dst, lhs, rhs, Scalar(1));
     }
+  }
 
-    template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
-    {
-      eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
+  template <typename Dst>
+  static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    if ((rhs.rows() + dst.rows() + dst.cols()) < EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows() > 0)
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op<typename Dst::Scalar, Scalar>());
+    else
+      scaleAndAddTo(dst, lhs, rhs, Scalar(1));
+  }
 
-      typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
-      typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+  template <typename Dst>
+  static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) {
+    if ((rhs.rows() + dst.rows() + dst.cols()) < EIGEN_GEMM_TO_COEFFBASED_THRESHOLD && rhs.rows() > 0)
+      lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op<typename Dst::Scalar, Scalar>());
+    else
+      scaleAndAddTo(dst, lhs, rhs, Scalar(-1));
+  }
 
-      Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
-                                 * RhsBlasTraits::extractScalarFactor(m_rhs);
+  template <typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) {
+    eigen_assert(dst.rows() == a_lhs.rows() && dst.cols() == a_rhs.cols());
+    if (a_lhs.cols() == 0 || a_lhs.rows() == 0 || a_rhs.cols() == 0) return;
+
+    if (dst.cols() == 1) {
+      // Fallback to GEMV if either the lhs or rhs is a runtime vector
+      typename Dest::ColXpr dst_vec(dst.col(0));
+      return internal::generic_product_impl<Lhs, typename Rhs::ConstColXpr, DenseShape, DenseShape,
+                                            GemvProduct>::scaleAndAddTo(dst_vec, a_lhs, a_rhs.col(0), alpha);
+    } else if (dst.rows() == 1) {
+      // Fallback to GEMV if either the lhs or rhs is a runtime vector
+      typename Dest::RowXpr dst_vec(dst.row(0));
+      return internal::generic_product_impl<typename Lhs::ConstRowXpr, Rhs, DenseShape, DenseShape,
+                                            GemvProduct>::scaleAndAddTo(dst_vec, a_lhs.row(0), a_rhs, alpha);
+    }
 
-      typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,LhsScalar,RhsScalar,
-              Dest::MaxRowsAtCompileTime,Dest::MaxColsAtCompileTime,MaxDepthAtCompileTime> BlockingType;
+    add_const_on_value_type_t<ActualLhsType> lhs = LhsBlasTraits::extract(a_lhs);
+    add_const_on_value_type_t<ActualRhsType> rhs = RhsBlasTraits::extract(a_rhs);
 
-      typedef internal::gemm_functor<
-        Scalar, Index,
-        internal::general_matrix_matrix_product<
-          Index,
-          LhsScalar, (_ActualLhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate),
-          RhsScalar, (_ActualRhsType::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
-          (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>,
-        _ActualLhsType, _ActualRhsType, Dest, BlockingType> GemmFunctor;
+    Scalar actualAlpha = combine_scalar_factors(alpha, a_lhs, a_rhs);
 
-      BlockingType blocking(dst.rows(), dst.cols(), lhs.cols());
+    typedef internal::gemm_blocking_space<(Dest::Flags & RowMajorBit) ? RowMajor : ColMajor, LhsScalar, RhsScalar,
+                                          Dest::MaxRowsAtCompileTime, Dest::MaxColsAtCompileTime, MaxDepthAtCompileTime>
+        BlockingType;
 
-      internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime>32 || Dest::MaxRowsAtCompileTime==Dynamic)>(GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), this->rows(), this->cols(), Dest::Flags&RowMajorBit);
-    }
+    typedef internal::gemm_functor<
+        Scalar, Index,
+        internal::general_matrix_matrix_product<
+            Index, LhsScalar, (ActualLhsTypeCleaned::Flags & RowMajorBit) ? RowMajor : ColMajor,
+            bool(LhsBlasTraits::NeedToConjugate), RhsScalar,
+            (ActualRhsTypeCleaned::Flags & RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate),
+            (Dest::Flags & RowMajorBit) ? RowMajor : ColMajor, Dest::InnerStrideAtCompileTime>,
+        ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType>
+        GemmFunctor;
+
+    BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true);
+    internal::parallelize_gemm<(Dest::MaxRowsAtCompileTime > 32 || Dest::MaxRowsAtCompileTime == Dynamic)>(
+        GemmFunctor(lhs, rhs, dst, actualAlpha, blocking), a_lhs.rows(), a_rhs.cols(), a_lhs.cols(),
+        Dest::Flags & RowMajorBit);
+  }
 };
 
-} // end namespace Eigen
+}  // end namespace internal
+
+}  // end namespace Eigen
 
-#endif // EIGEN_GENERAL_MATRIX_MATRIX_H
+#endif  // EIGEN_GENERAL_MATRIX_MATRIX_H
diff --git a/inst/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/inst/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
index 5c376390..bf275675 100644
--- a/inst/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
+++ b/inst/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h
@@ -10,108 +10,127 @@
 #ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
 #define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
 
-template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjLhs, bool ConjRhs>
+namespace Eigen {
+
+template <typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjLhs, bool ConjRhs>
 struct selfadjoint_rank1_update;
 
 namespace internal {
 
 /**********************************************************************
-* This file implements a general A * B product while
-* evaluating only one triangular part of the product.
-* This is more general version of self adjoint product (C += A A^T)
-* as the level 3 SYRK Blas routine.
-**********************************************************************/
+ * This file implements a general A * B product while
+ * evaluating only one triangular part of the product.
+ * This is a more general version of self adjoint product (C += A A^T)
+ * as the level 3 SYRK Blas routine.
+ **********************************************************************/
 
 // forward declarations (defined at the end of this file)
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
+template <typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs,
+          int ResInnerStride, int UpLo>
 struct tribb_kernel;
-  
+
 /* Optimized matrix-matrix product evaluating only one triangular half */
-template <typename Index,
-          typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
-          typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
-                              int ResStorageOrder, int  UpLo, int Version = Specialized>
+template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar,
+          int RhsStorageOrder, bool ConjugateRhs, int ResStorageOrder, int ResInnerStride, int UpLo,
+          int Version = Specialized>
 struct general_matrix_matrix_triangular_product;
 
 // as usual if the result is row major => we transpose the product
-template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
-                          typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int  UpLo, int Version>
-struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,RowMajor,UpLo,Version>
-{
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
-  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride,
-                                      const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
-  {
-    general_matrix_matrix_triangular_product<Index,
-        RhsScalar, RhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateRhs,
-        LhsScalar, LhsStorageOrder==RowMajor ? ColMajor : RowMajor, ConjugateLhs,
-        ColMajor, UpLo==Lower?Upper:Lower>
-      ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha);
+template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar,
+          int RhsStorageOrder, bool ConjugateRhs, int ResInnerStride, int UpLo, int Version>
+struct general_matrix_matrix_triangular_product<Index, LhsScalar, LhsStorageOrder, ConjugateLhs, RhsScalar,
+                                                RhsStorageOrder, ConjugateRhs, RowMajor, ResInnerStride, UpLo,
+                                                Version> {
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  static EIGEN_STRONG_INLINE void run(Index size, Index depth, const LhsScalar* lhs, Index lhsStride,
+                                      const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resIncr,
+                                      Index resStride, const ResScalar& alpha,
+                                      level3_blocking<RhsScalar, LhsScalar>& blocking) {
+    general_matrix_matrix_triangular_product<Index, RhsScalar, RhsStorageOrder == RowMajor ? ColMajor : RowMajor,
+                                             ConjugateRhs, LhsScalar, LhsStorageOrder == RowMajor ? ColMajor : RowMajor,
+                                             ConjugateLhs, ColMajor, ResInnerStride,
+                                             UpLo == Lower ? Upper : Lower>::run(size, depth, rhs, rhsStride, lhs,
+                                                                                 lhsStride, res, resIncr, resStride,
+                                                                                 alpha, blocking);
   }
 };
 
-template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
-                          typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, int  UpLo, int Version>
-struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Version>
-{
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
-  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride,
-                                      const RhsScalar* _rhs, Index rhsStride, ResScalar* res, Index resStride, const ResScalar& alpha)
-  {
-    const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-    const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
-
-    typedef gebp_traits<LhsScalar,RhsScalar> Traits;
-
-    Index kc = depth; // cache block size along the K direction
-    Index mc = size;  // cache block size along the M direction
-    Index nc = size;  // cache block size along the N direction
-    computeProductBlockingSizes<LhsScalar,RhsScalar>(kc, mc, nc);
-    // !!! mc must be a multiple of nr:
-    if(mc > Traits::nr)
-      mc = (mc/Traits::nr)*Traits::nr;
-
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*size;
-    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, kc*mc, 0);
-    ei_declare_aligned_stack_constructed_variable(RhsScalar, allocatedBlockB, sizeB, 0);
-    RhsScalar* blockB = allocatedBlockB + sizeW;
-    
-    gemm_pack_lhs<LhsScalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
-    gemm_pack_rhs<RhsScalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
-    gebp_kernel <LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
-    tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, UpLo> sybb;
-
-    for(Index k2=0; k2<depth; k2+=kc)
-    {
-      const Index actual_kc = (std::min)(k2+kc,depth)-k2;
+template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar,
+          int RhsStorageOrder, bool ConjugateRhs, int ResInnerStride, int UpLo, int Version>
+struct general_matrix_matrix_triangular_product<Index, LhsScalar, LhsStorageOrder, ConjugateLhs, RhsScalar,
+                                                RhsStorageOrder, ConjugateRhs, ColMajor, ResInnerStride, UpLo,
+                                                Version> {
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  static EIGEN_STRONG_INLINE void run(Index size, Index depth, const LhsScalar* lhs_, Index lhsStride,
+                                      const RhsScalar* rhs_, Index rhsStride, ResScalar* res_, Index resIncr,
+                                      Index resStride, const ResScalar& alpha,
+                                      level3_blocking<LhsScalar, RhsScalar>& blocking) {
+    if (size == 0) {
+      return;
+    }
+
+    typedef gebp_traits<LhsScalar, RhsScalar> Traits;
+
+    typedef const_blas_data_mapper<LhsScalar, Index, LhsStorageOrder> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar, Index, RhsStorageOrder> RhsMapper;
+    typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+    LhsMapper lhs(lhs_, lhsStride);
+    RhsMapper rhs(rhs_, rhsStride);
+    ResMapper res(res_, resStride, resIncr);
+
+    Index kc = blocking.kc();
+    // Ensure that mc >= nr and <= size
+    Index mc = (std::min)(size, (std::max)(static_cast<decltype(blocking.mc())>(Traits::nr), blocking.mc()));
+
+    // !!! mc must be a multiple of nr
+    if (mc > Traits::nr) {
+      using UnsignedIndex = typename make_unsigned<Index>::type;
+      mc = (UnsignedIndex(mc) / Traits::nr) * Traits::nr;
+    }
+
+    std::size_t sizeA = kc * mc;
+    std::size_t sizeB = kc * size;
+
+    ei_declare_aligned_stack_constructed_variable(LhsScalar, blockA, sizeA, blocking.blockA());
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, blockB, sizeB, blocking.blockB());
+
+    gemm_pack_lhs<LhsScalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing,
+                  LhsStorageOrder>
+        pack_lhs;
+    gemm_pack_rhs<RhsScalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+    gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp;
+    tribb_kernel<LhsScalar, RhsScalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs, ResInnerStride, UpLo>
+        sybb;
+
+    for (Index k2 = 0; k2 < depth; k2 += kc) {
+      const Index actual_kc = (std::min)(k2 + kc, depth) - k2;
 
       // note that the actual rhs is the transpose/adjoint of mat
-      pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, size);
+      pack_rhs(blockB, rhs.getSubMapper(k2, 0), actual_kc, size);
 
-      for(Index i2=0; i2<size; i2+=mc)
-      {
-        const Index actual_mc = (std::min)(i2+mc,size)-i2;
+      for (Index i2 = 0; i2 < size; i2 += mc) {
+        const Index actual_mc = (std::min)(i2 + mc, size) - i2;
 
-        pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
+        pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
 
         // the selected actual_mc * size panel of res is split into three different part:
         //  1 - before the diagonal => processed with gebp or skipped
         //  2 - the actual_mc x actual_mc symmetric block => processed with a special kernel
         //  3 - after the diagonal => processed with gebp or skipped
-        if (UpLo==Lower)
-          gebp(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, (std::min)(size,i2), alpha,
-               -1, -1, 0, 0, allocatedBlockB);
+        if (UpLo == Lower)
+          gebp(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, (std::min)(size, i2), alpha, -1, -1, 0,
+               0);
 
-        sybb(res+resStride*i2 + i2, resStride, blockA, blockB + actual_kc*i2, actual_mc, actual_kc, alpha, allocatedBlockB);
+        sybb(res_ + resStride * i2 + resIncr * i2, resIncr, resStride, blockA, blockB + actual_kc * i2, actual_mc,
+             actual_kc, alpha);
 
-        if (UpLo==Upper)
-        {
-          Index j2 = i2+actual_mc;
-          gebp(res+resStride*j2+i2, resStride, blockA, blockB+actual_kc*j2, actual_mc, actual_kc, (std::max)(Index(0), size-j2), alpha,
-               -1, -1, 0, 0, allocatedBlockB);
+        if (UpLo == Upper) {
+          Index j2 = i2 + actual_mc;
+          gebp(res.getSubMapper(i2, j2), blockA, blockB + actual_kc * j2, actual_mc, actual_kc,
+               (std::max)(Index(0), size - j2), alpha, -1, -1, 0, 0);
         }
       }
     }
@@ -127,152 +146,185 @@ struct general_matrix_matrix_triangular_product<Index,LhsScalar,LhsStorageOrder,
 //   while the triangular block overlapping the diagonal is evaluated into a
 //   small temporary buffer which is then accumulated into the result using a
 //   triangular traversal.
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs, int UpLo>
-struct tribb_kernel
-{
-  typedef gebp_traits<LhsScalar,RhsScalar,ConjLhs,ConjRhs> Traits;
+template <typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjLhs, bool ConjRhs,
+          int ResInnerStride, int UpLo>
+struct tribb_kernel {
+  typedef gebp_traits<LhsScalar, RhsScalar, ConjLhs, ConjRhs> Traits;
   typedef typename Traits::ResScalar ResScalar;
-  
-  enum {
-    BlockSize  = EIGEN_PLAIN_ENUM_MAX(mr,nr)
-  };
-  void operator()(ResScalar* res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha, RhsScalar* workspace)
-  {
-    gebp_kernel<LhsScalar, RhsScalar, Index, mr, nr, ConjLhs, ConjRhs> gebp_kernel;
-    Matrix<ResScalar,BlockSize,BlockSize,ColMajor> buffer;
+
+  enum { BlockSize = meta_least_common_multiple<plain_enum_max(mr, nr), plain_enum_min(mr, nr)>::ret };
+  void operator()(ResScalar* res_, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB,
+                  Index size, Index depth, const ResScalar& alpha) {
+    typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+    typedef blas_data_mapper<ResScalar, Index, ColMajor, Unaligned> BufferMapper;
+    ResMapper res(res_, resStride, resIncr);
+    gebp_kernel<LhsScalar, RhsScalar, Index, ResMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel1;
+    gebp_kernel<LhsScalar, RhsScalar, Index, BufferMapper, mr, nr, ConjLhs, ConjRhs> gebp_kernel2;
+
+    Matrix<ResScalar, BlockSize, BlockSize, ColMajor> buffer;
 
     // let's process the block per panel of actual_mc x BlockSize,
     // again, each is split into three parts, etc.
-    for (Index j=0; j<size; j+=BlockSize)
-    {
-      Index actualBlockSize = std::min<Index>(BlockSize,size - j);
-      const RhsScalar* actual_b = blockB+j*depth;
+    for (Index j = 0; j < size; j += BlockSize) {
+      Index actualBlockSize = std::min<Index>(BlockSize, size - j);
+      const RhsScalar* actual_b = blockB + j * depth;
 
-      if(UpLo==Upper)
-        gebp_kernel(res+j*resStride, resStride, blockA, actual_b, j, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+      if (UpLo == Upper)
+        gebp_kernel1(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha, -1, -1, 0, 0);
 
       // selfadjoint micro block
       {
         Index i = j;
         buffer.setZero();
         // 1 - apply the kernel on the temporary buffer
-        gebp_kernel(buffer.data(), BlockSize, blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+        gebp_kernel2(BufferMapper(buffer.data(), BlockSize), blockA + depth * i, actual_b, actualBlockSize, depth,
+                     actualBlockSize, alpha, -1, -1, 0, 0);
+
         // 2 - triangular accumulation
-        for(Index j1=0; j1<actualBlockSize; ++j1)
-        {
-          ResScalar* r = res + (j+j1)*resStride + i;
-          for(Index i1=UpLo==Lower ? j1 : 0;
-              UpLo==Lower ? i1<actualBlockSize : i1<=j1; ++i1)
-            r[i1] += buffer(i1,j1);
+        for (Index j1 = 0; j1 < actualBlockSize; ++j1) {
+          typename ResMapper::LinearMapper r = res.getLinearMapper(i, j + j1);
+          for (Index i1 = UpLo == Lower ? j1 : 0; UpLo == Lower ? i1 < actualBlockSize : i1 <= j1; ++i1)
+            r(i1) += buffer(i1, j1);
         }
       }
 
-      if(UpLo==Lower)
-      {
-        Index i = j+actualBlockSize;
-        gebp_kernel(res+j*resStride+i, resStride, blockA+depth*i, actual_b, size-i, depth, actualBlockSize, alpha,
-                    -1, -1, 0, 0, workspace);
+      if (UpLo == Lower) {
+        Index i = j + actualBlockSize;
+        gebp_kernel1(res.getSubMapper(i, j), blockA + depth * i, actual_b, size - i, depth, actualBlockSize, alpha, -1,
+                     -1, 0, 0);
       }
     }
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
 // high level API
 
-template<typename MatrixType, typename ProductType, int UpLo, bool IsOuterProduct>
+template <typename MatrixType, typename ProductType, int UpLo, bool IsOuterProduct>
 struct general_product_to_triangular_selector;
 
-
-template<typename MatrixType, typename ProductType, int UpLo>
-struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,true>
-{
-  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
-  {
+template <typename MatrixType, typename ProductType, int UpLo>
+struct general_product_to_triangular_selector<MatrixType, ProductType, UpLo, true> {
+  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta) {
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
-    
-    typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
+
+    typedef internal::remove_all_t<typename ProductType::LhsNested> Lhs;
     typedef internal::blas_traits<Lhs> LhsBlasTraits;
     typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
-    typedef typename internal::remove_all<ActualLhs>::type _ActualLhs;
-    typename internal::add_const_on_value_type<ActualLhs>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
-    
-    typedef typename internal::remove_all<typename ProductType::RhsNested>::type Rhs;
+    typedef internal::remove_all_t<ActualLhs> ActualLhs_;
+    internal::add_const_on_value_type_t<ActualLhs> actualLhs = LhsBlasTraits::extract(prod.lhs());
+
+    typedef internal::remove_all_t<typename ProductType::RhsNested> Rhs;
     typedef internal::blas_traits<Rhs> RhsBlasTraits;
     typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs;
-    typedef typename internal::remove_all<ActualRhs>::type _ActualRhs;
-    typename internal::add_const_on_value_type<ActualRhs>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
+    typedef internal::remove_all_t<ActualRhs> ActualRhs_;
+    internal::add_const_on_value_type_t<ActualRhs> actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) *
+                         RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
 
-    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
+    if (!beta) mat.template triangularView<UpLo>().setZero();
 
     enum {
-      StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor,
-      UseLhsDirectly = _ActualLhs::InnerStrideAtCompileTime==1,
-      UseRhsDirectly = _ActualRhs::InnerStrideAtCompileTime==1
+      StorageOrder = (internal::traits<MatrixType>::Flags & RowMajorBit) ? RowMajor : ColMajor,
+      UseLhsDirectly = ActualLhs_::InnerStrideAtCompileTime == 1,
+      UseRhsDirectly = ActualRhs_::InnerStrideAtCompileTime == 1
     };
-    
-    internal::gemv_static_vector_if<Scalar,Lhs::SizeAtCompileTime,Lhs::MaxSizeAtCompileTime,!UseLhsDirectly> static_lhs;
-    ei_declare_aligned_stack_constructed_variable(Scalar, actualLhsPtr, actualLhs.size(),
-      (UseLhsDirectly ? const_cast<Scalar*>(actualLhs.data()) : static_lhs.data()));
-    if(!UseLhsDirectly) Map<typename _ActualLhs::PlainObject>(actualLhsPtr, actualLhs.size()) = actualLhs;
-    
-    internal::gemv_static_vector_if<Scalar,Rhs::SizeAtCompileTime,Rhs::MaxSizeAtCompileTime,!UseRhsDirectly> static_rhs;
-    ei_declare_aligned_stack_constructed_variable(Scalar, actualRhsPtr, actualRhs.size(),
-      (UseRhsDirectly ? const_cast<Scalar*>(actualRhs.data()) : static_rhs.data()));
-    if(!UseRhsDirectly) Map<typename _ActualRhs::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
-    
-    
-    selfadjoint_rank1_update<Scalar,Index,StorageOrder,UpLo,
-                              LhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
-                              RhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex>
-          ::run(actualLhs.size(), mat.data(), mat.outerStride(), actualLhsPtr, actualRhsPtr, actualAlpha);
+
+    internal::gemv_static_vector_if<Scalar, Lhs::SizeAtCompileTime, Lhs::MaxSizeAtCompileTime, !UseLhsDirectly>
+        static_lhs;
+    ei_declare_aligned_stack_constructed_variable(
+        Scalar, actualLhsPtr, actualLhs.size(),
+        (UseLhsDirectly ? const_cast<Scalar*>(actualLhs.data()) : static_lhs.data()));
+    if (!UseLhsDirectly) Map<typename ActualLhs_::PlainObject>(actualLhsPtr, actualLhs.size()) = actualLhs;
+
+    internal::gemv_static_vector_if<Scalar, Rhs::SizeAtCompileTime, Rhs::MaxSizeAtCompileTime, !UseRhsDirectly>
+        static_rhs;
+    ei_declare_aligned_stack_constructed_variable(
+        Scalar, actualRhsPtr, actualRhs.size(),
+        (UseRhsDirectly ? const_cast<Scalar*>(actualRhs.data()) : static_rhs.data()));
+    if (!UseRhsDirectly) Map<typename ActualRhs_::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+
+    selfadjoint_rank1_update<
+        Scalar, Index, StorageOrder, UpLo, LhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
+        RhsBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex>::run(actualLhs.size(), mat.data(),
+                                                                             mat.outerStride(), actualLhsPtr,
+                                                                             actualRhsPtr, actualAlpha);
   }
 };
 
-template<typename MatrixType, typename ProductType, int UpLo>
-struct general_product_to_triangular_selector<MatrixType,ProductType,UpLo,false>
-{
-  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha)
-  {
-    typedef typename MatrixType::Index Index;
-    
-    typedef typename internal::remove_all<typename ProductType::LhsNested>::type Lhs;
+template <typename MatrixType, typename ProductType, int UpLo>
+struct general_product_to_triangular_selector<MatrixType, ProductType, UpLo, false> {
+  static void run(MatrixType& mat, const ProductType& prod, const typename MatrixType::Scalar& alpha, bool beta) {
+    typedef internal::remove_all_t<typename ProductType::LhsNested> Lhs;
     typedef internal::blas_traits<Lhs> LhsBlasTraits;
     typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhs;
-    typedef typename internal::remove_all<ActualLhs>::type _ActualLhs;
-    typename internal::add_const_on_value_type<ActualLhs>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
-    
-    typedef typename internal::remove_all<typename ProductType::RhsNested>::type Rhs;
+    typedef internal::remove_all_t<ActualLhs> ActualLhs_;
+    internal::add_const_on_value_type_t<ActualLhs> actualLhs = LhsBlasTraits::extract(prod.lhs());
+
+    typedef internal::remove_all_t<typename ProductType::RhsNested> Rhs;
     typedef internal::blas_traits<Rhs> RhsBlasTraits;
     typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhs;
-    typedef typename internal::remove_all<ActualRhs>::type _ActualRhs;
-    typename internal::add_const_on_value_type<ActualRhs>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
-
-    typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) * RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
-
-    internal::general_matrix_matrix_triangular_product<Index,
-      typename Lhs::Scalar, _ActualLhs::Flags&RowMajorBit ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
-      typename Rhs::Scalar, _ActualRhs::Flags&RowMajorBit ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
-      MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo>
-      ::run(mat.cols(), actualLhs.cols(),
-            &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(),
-            mat.data(), mat.outerStride(), actualAlpha);
+    typedef internal::remove_all_t<ActualRhs> ActualRhs_;
+    internal::add_const_on_value_type_t<ActualRhs> actualRhs = RhsBlasTraits::extract(prod.rhs());
+
+    typename ProductType::Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs().derived()) *
+                                               RhsBlasTraits::extractScalarFactor(prod.rhs().derived());
+
+    if (!beta) mat.template triangularView<UpLo>().setZero();
+
+    enum {
+      IsRowMajor = (internal::traits<MatrixType>::Flags & RowMajorBit) ? 1 : 0,
+      LhsIsRowMajor = ActualLhs_::Flags & RowMajorBit ? 1 : 0,
+      RhsIsRowMajor = ActualRhs_::Flags & RowMajorBit ? 1 : 0,
+      SkipDiag = (UpLo & (UnitDiag | ZeroDiag)) != 0
+    };
+
+    Index size = mat.cols();
+    if (SkipDiag) size--;
+    Index depth = actualLhs.cols();
+
+    typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor, typename Lhs::Scalar, typename Rhs::Scalar,
+                                          MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime,
+                                          ActualRhs_::MaxColsAtCompileTime>
+        BlockingType;
+
+    BlockingType blocking(size, size, depth, 1, false);
+
+    internal::general_matrix_matrix_triangular_product<
+        Index, typename Lhs::Scalar, LhsIsRowMajor ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
+        typename Rhs::Scalar, RhsIsRowMajor ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
+        IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime,
+        UpLo&(Lower | Upper)>::run(size, depth, &actualLhs.coeffRef(SkipDiag && (UpLo & Lower) == Lower ? 1 : 0, 0),
+                                   actualLhs.outerStride(),
+                                   &actualRhs.coeffRef(0, SkipDiag && (UpLo & Upper) == Upper ? 1 : 0),
+                                   actualRhs.outerStride(),
+                                   mat.data() +
+                                       (SkipDiag ? (bool(IsRowMajor) != ((UpLo & Lower) == Lower) ? mat.innerStride()
+                                                                                                  : mat.outerStride())
+                                                 : 0),
+                                   mat.innerStride(), mat.outerStride(), actualAlpha, blocking);
   }
 };
 
-template<typename MatrixType, unsigned int UpLo>
-template<typename ProductDerived, typename _Lhs, typename _Rhs>
-TriangularView<MatrixType,UpLo>& TriangularView<MatrixType,UpLo>::assignProduct(const ProductBase<ProductDerived, _Lhs,_Rhs>& prod, const Scalar& alpha)
-{
-  general_product_to_triangular_selector<MatrixType, ProductDerived, UpLo, (_Lhs::ColsAtCompileTime==1) || (_Rhs::RowsAtCompileTime==1)>::run(m_matrix.const_cast_derived(), prod.derived(), alpha);
-  
-  return *this;
+template <typename MatrixType_, unsigned int Mode_>
+template <typename ProductType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename TriangularViewImpl<MatrixType_, Mode_, Dense>::TriangularViewType&
+TriangularViewImpl<MatrixType_, Mode_, Dense>::_assignProduct(
+    const ProductType& prod, const typename TriangularViewImpl<MatrixType_, Mode_, Dense>::Scalar& alpha, bool beta) {
+  EIGEN_STATIC_ASSERT((Mode_ & UnitDiag) == 0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED);
+  eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols());
+
+  general_product_to_triangular_selector<MatrixType_, ProductType, Mode_,
+                                         internal::traits<ProductType>::InnerSize == 1>::run(derived()
+                                                                                                 .nestedExpression()
+                                                                                                 .const_cast_derived(),
+                                                                                             prod, alpha, beta);
+
+  return derived();
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
+#endif  // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_H
diff --git a/inst/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/inst/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
new file mode 100644
index 00000000..6817cc09
--- /dev/null
+++ b/inst/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h
@@ -0,0 +1,148 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   Level 3 BLAS SYRK/HERK implementation.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int UpLo>
+struct general_matrix_matrix_rankupdate
+    : general_matrix_matrix_triangular_product<Index, Scalar, AStorageOrder, ConjugateA, Scalar, AStorageOrder,
+                                               ConjugateA, ResStorageOrder, 1, UpLo, BuiltIn> {};
+
+// try to go to BLAS specialization
+#define EIGEN_BLAS_RANKUPDATE_SPECIALIZE(Scalar)                                                                      \
+  template <typename Index, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder, bool ConjugateRhs, int UpLo> \
+  struct general_matrix_matrix_triangular_product<Index, Scalar, LhsStorageOrder, ConjugateLhs, Scalar,               \
+                                                  RhsStorageOrder, ConjugateRhs, ColMajor, 1, UpLo, Specialized> {    \
+    static EIGEN_STRONG_INLINE void run(Index size, Index depth, const Scalar* lhs, Index lhsStride,                  \
+                                        const Scalar* rhs, Index rhsStride, Scalar* res, Index resIncr,               \
+                                        Index resStride, Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) {   \
+      if (lhs == rhs && ((UpLo & (Lower | Upper)) == UpLo)) {                                                         \
+        general_matrix_matrix_rankupdate<Index, Scalar, LhsStorageOrder, ConjugateLhs, ColMajor, UpLo>::run(          \
+            size, depth, lhs, lhsStride, rhs, rhsStride, res, resStride, alpha, blocking);                            \
+      } else {                                                                                                        \
+        general_matrix_matrix_triangular_product<Index, Scalar, LhsStorageOrder, ConjugateLhs, Scalar,                \
+                                                 RhsStorageOrder, ConjugateRhs, ColMajor, 1, UpLo,                    \
+                                                 BuiltIn>::run(size, depth, lhs, lhsStride, rhs, rhsStride, res,      \
+                                                               resIncr, resStride, alpha, blocking);                  \
+      }                                                                                                               \
+    }                                                                                                                 \
+  };
+
+EIGEN_BLAS_RANKUPDATE_SPECIALIZE(double)
+EIGEN_BLAS_RANKUPDATE_SPECIALIZE(float)
+// TODO handle complex cases
+// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(dcomplex)
+// EIGEN_BLAS_RANKUPDATE_SPECIALIZE(scomplex)
+
+// SYRK for float/double
+#define EIGEN_BLAS_RANKUPDATE_R(EIGTYPE, BLASTYPE, BLASFUNC)                                                        \
+  template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo>                                           \
+  struct general_matrix_matrix_rankupdate<Index, EIGTYPE, AStorageOrder, ConjugateA, ColMajor, UpLo> {              \
+    enum {                                                                                                          \
+      IsLower = (UpLo & Lower) == Lower,                                                                            \
+      LowUp = IsLower ? Lower : Upper,                                                                              \
+      conjA = ((AStorageOrder == ColMajor) && ConjugateA) ? 1 : 0                                                   \
+    };                                                                                                              \
+    static EIGEN_STRONG_INLINE void run(Index size, Index depth, const EIGTYPE* lhs, Index lhsStride,               \
+                                        const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, \
+                                        EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) {           \
+      /* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/                                    \
+      if (size == 0 || depth == 0) return;                                                                          \
+      BlasIndex lda = convert_index<BlasIndex>(lhsStride), ldc = convert_index<BlasIndex>(resStride),               \
+                n = convert_index<BlasIndex>(size), k = convert_index<BlasIndex>(depth);                            \
+      char uplo = ((IsLower) ? 'L' : 'U'), trans = ((AStorageOrder == RowMajor) ? 'T' : 'N');                       \
+      EIGTYPE beta(1);                                                                                              \
+      BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda,                         \
+               (const BLASTYPE*)&numext::real_ref(beta), res, &ldc);                                                \
+    }                                                                                                               \
+  };
+
+// HERK for complex data
+#define EIGEN_BLAS_RANKUPDATE_C(EIGTYPE, BLASTYPE, RTYPE, BLASFUNC)                                                 \
+  template <typename Index, int AStorageOrder, bool ConjugateA, int UpLo>                                           \
+  struct general_matrix_matrix_rankupdate<Index, EIGTYPE, AStorageOrder, ConjugateA, ColMajor, UpLo> {              \
+    enum {                                                                                                          \
+      IsLower = (UpLo & Lower) == Lower,                                                                            \
+      LowUp = IsLower ? Lower : Upper,                                                                              \
+      conjA = (((AStorageOrder == ColMajor) && ConjugateA) || ((AStorageOrder == RowMajor) && !ConjugateA)) ? 1 : 0 \
+    };                                                                                                              \
+    static EIGEN_STRONG_INLINE void run(Index size, Index depth, const EIGTYPE* lhs, Index lhsStride,               \
+                                        const EIGTYPE* /*rhs*/, Index /*rhsStride*/, EIGTYPE* res, Index resStride, \
+                                        EIGTYPE alpha, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) {           \
+      typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType;                                          \
+      if (size == 0 || depth == 0) return;                                                                          \
+      BlasIndex lda = convert_index<BlasIndex>(lhsStride), ldc = convert_index<BlasIndex>(resStride),               \
+                n = convert_index<BlasIndex>(size), k = convert_index<BlasIndex>(depth);                            \
+      char uplo = ((IsLower) ? 'L' : 'U'), trans = ((AStorageOrder == RowMajor) ? 'C' : 'N');                       \
+      RTYPE alpha_, beta_;                                                                                          \
+      const EIGTYPE* a_ptr;                                                                                         \
+                                                                                                                    \
+      alpha_ = alpha.real();                                                                                        \
+      beta_ = 1.0;                                                                                                  \
+      /* Copy with conjugation in some cases*/                                                                      \
+      MatrixType a;                                                                                                 \
+      if (conjA) {                                                                                                  \
+        Map<const MatrixType, 0, OuterStride<> > mapA(lhs, n, k, OuterStride<>(lhsStride));                         \
+        a = mapA.conjugate();                                                                                       \
+        lda = a.outerStride();                                                                                      \
+        a_ptr = a.data();                                                                                           \
+      } else                                                                                                        \
+        a_ptr = lhs;                                                                                                \
+      BLASFUNC(&uplo, &trans, &n, &k, &alpha_, (BLASTYPE*)a_ptr, &lda, &beta_, (BLASTYPE*)res, &ldc);               \
+    }                                                                                                               \
+  };
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk)
+EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk)
+#else
+EIGEN_BLAS_RANKUPDATE_R(double, double, dsyrk_)
+EIGEN_BLAS_RANKUPDATE_R(float, float, ssyrk_)
+#endif
+
+// TODO handle complex cases
+// EIGEN_BLAS_RANKUPDATE_C(dcomplex, double, double, zherk_)
+// EIGEN_BLAS_RANKUPDATE_C(scomplex, float,  float, cherk_)
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_BLAS_H
diff --git a/inst/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h b/inst/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h
deleted file mode 100644
index 3deed068..00000000
--- a/inst/include/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_MKL.h
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- Copyright (c) 2011, Intel Corporation. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
- * Neither the name of Intel Corporation nor the names of its contributors may
-   be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
- *   Level 3 BLAS SYRK/HERK implementation.
- ********************************************************************************
-*/
-
-#ifndef EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
-#define EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
-
-namespace Eigen { 
-
-namespace internal {
-
-template <typename Index, typename Scalar, int AStorageOrder, bool ConjugateA, int ResStorageOrder, int  UpLo>
-struct general_matrix_matrix_rankupdate :
-       general_matrix_matrix_triangular_product<
-         Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {};
-
-
-// try to go to BLAS specialization
-#define EIGEN_MKL_RANKUPDATE_SPECIALIZE(Scalar) \
-template <typename Index, int LhsStorageOrder, bool ConjugateLhs, \
-                          int RhsStorageOrder, bool ConjugateRhs, int  UpLo> \
-struct general_matrix_matrix_triangular_product<Index,Scalar,LhsStorageOrder,ConjugateLhs, \
-               Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,UpLo,Specialized> { \
-  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \
-                          const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha) \
-  { \
-    if (lhs==rhs) { \
-      general_matrix_matrix_rankupdate<Index,Scalar,LhsStorageOrder,ConjugateLhs,ColMajor,UpLo> \
-      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
-    } else { \
-      general_matrix_matrix_triangular_product<Index, \
-        Scalar, LhsStorageOrder, ConjugateLhs, \
-        Scalar, RhsStorageOrder, ConjugateRhs, \
-        ColMajor, UpLo, BuiltIn> \
-      ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha); \
-    } \
-  } \
-};
-
-EIGEN_MKL_RANKUPDATE_SPECIALIZE(double)
-//EIGEN_MKL_RANKUPDATE_SPECIALIZE(dcomplex)
-EIGEN_MKL_RANKUPDATE_SPECIALIZE(float)
-//EIGEN_MKL_RANKUPDATE_SPECIALIZE(scomplex)
-
-// SYRK for float/double
-#define EIGEN_MKL_RANKUPDATE_R(EIGTYPE, MKLTYPE, MKLFUNC) \
-template <typename Index, int AStorageOrder, bool ConjugateA, int  UpLo> \
-struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
-  enum { \
-    IsLower = (UpLo&Lower) == Lower, \
-    LowUp = IsLower ? Lower : Upper, \
-    conjA = ((AStorageOrder==ColMajor) && ConjugateA) ? 1 : 0 \
-  }; \
-  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
-                          const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
-  { \
-  /* typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;*/ \
-\
-   MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
-   char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'T':'N'; \
-   MKLTYPE alpha_, beta_; \
-\
-/* Set alpha_ & beta_ */ \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
-   MKLFUNC(&uplo, &trans, &n, &k, &alpha_, lhs, &lda, &beta_, res, &ldc); \
-  } \
-};
-
-// HERK for complex data
-#define EIGEN_MKL_RANKUPDATE_C(EIGTYPE, MKLTYPE, RTYPE, MKLFUNC) \
-template <typename Index, int AStorageOrder, bool ConjugateA, int  UpLo> \
-struct general_matrix_matrix_rankupdate<Index,EIGTYPE,AStorageOrder,ConjugateA,ColMajor,UpLo> { \
-  enum { \
-    IsLower = (UpLo&Lower) == Lower, \
-    LowUp = IsLower ? Lower : Upper, \
-    conjA = (((AStorageOrder==ColMajor) && ConjugateA) || ((AStorageOrder==RowMajor) && !ConjugateA)) ? 1 : 0 \
-  }; \
-  static EIGEN_STRONG_INLINE void run(Index size, Index depth,const EIGTYPE* lhs, Index lhsStride, \
-                          const EIGTYPE* rhs, Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha) \
-  { \
-   typedef Matrix<EIGTYPE, Dynamic, Dynamic, AStorageOrder> MatrixType; \
-\
-   MKL_INT lda=lhsStride, ldc=resStride, n=size, k=depth; \
-   char uplo=(IsLower) ? 'L' : 'U', trans=(AStorageOrder==RowMajor) ? 'C':'N'; \
-   RTYPE alpha_, beta_; \
-   const EIGTYPE* a_ptr; \
-\
-/* Set alpha_ & beta_ */ \
-/*   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); */\
-/*   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1));*/ \
-   alpha_ = alpha.real(); \
-   beta_ = 1.0; \
-/* Copy with conjugation in some cases*/ \
-   MatrixType a; \
-   if (conjA) { \
-     Map<const MatrixType, 0, OuterStride<> > mapA(lhs,n,k,OuterStride<>(lhsStride)); \
-     a = mapA.conjugate(); \
-     lda = a.outerStride(); \
-     a_ptr = a.data(); \
-   } else a_ptr=lhs; \
-   MKLFUNC(&uplo, &trans, &n, &k, &alpha_, (MKLTYPE*)a_ptr, &lda, &beta_, (MKLTYPE*)res, &ldc); \
-  } \
-};
-
-
-EIGEN_MKL_RANKUPDATE_R(double, double, dsyrk)
-EIGEN_MKL_RANKUPDATE_R(float,  float,  ssyrk)
-
-//EIGEN_MKL_RANKUPDATE_C(dcomplex, MKL_Complex16, double, zherk)
-//EIGEN_MKL_RANKUPDATE_C(scomplex, MKL_Complex8,  double, cherk)
-
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_GENERAL_MATRIX_MATRIX_TRIANGULAR_MKL_H
diff --git a/inst/include/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h b/inst/include/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
new file mode 100644
index 00000000..913beb69
--- /dev/null
+++ b/inst/include/Eigen/src/Core/products/GeneralMatrixMatrix_BLAS.h
@@ -0,0 +1,205 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   General matrix-matrix product functionality based on ?GEMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
+#define EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/**********************************************************************
+ * This file implements general matrix-matrix multiplication using BLAS
+ * gemm function via partial specialization of
+ * general_matrix_matrix_product::run(..) method for float, double,
+ * std::complex<float> and std::complex<double> types
+ **********************************************************************/
+
+// gemm specialization
+
+#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, BLASTYPE, BLASFUNC)                                                 \
+  template <typename Index, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder, bool ConjugateRhs>         \
+  struct general_matrix_matrix_product<Index, EIGTYPE, LhsStorageOrder, ConjugateLhs, EIGTYPE, RhsStorageOrder,     \
+                                       ConjugateRhs, ColMajor, 1> {                                                 \
+    typedef gebp_traits<EIGTYPE, EIGTYPE> Traits;                                                                   \
+                                                                                                                    \
+    static void run(Index rows, Index cols, Index depth, const EIGTYPE* lhs_, Index lhsStride, const EIGTYPE* rhs_, \
+                    Index rhsStride, EIGTYPE* res, Index resIncr, Index resStride, EIGTYPE alpha,                   \
+                    level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, GemmParallelInfo<Index>* /*info = 0*/) {       \
+      using std::conj;                                                                                              \
+      if (rows == 0 || cols == 0 || depth == 0) return;                                                             \
+      EIGEN_ONLY_USED_FOR_DEBUG(resIncr);                                                                           \
+      eigen_assert(resIncr == 1);                                                                                   \
+      char transa, transb;                                                                                          \
+      BlasIndex m, n, k, lda, ldb, ldc;                                                                             \
+      const EIGTYPE *a, *b;                                                                                         \
+      EIGTYPE beta(1);                                                                                              \
+      MatrixX##EIGPREFIX a_tmp, b_tmp;                                                                              \
+                                                                                                                    \
+      /* Set transpose options */                                                                                   \
+      transa = (LhsStorageOrder == RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N';                                  \
+      transb = (RhsStorageOrder == RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N';                                  \
+                                                                                                                    \
+      /* Set m, n, k */                                                                                             \
+      m = convert_index<BlasIndex>(rows);                                                                           \
+      n = convert_index<BlasIndex>(cols);                                                                           \
+      k = convert_index<BlasIndex>(depth);                                                                          \
+                                                                                                                    \
+      /* Set lda, ldb, ldc */                                                                                       \
+      lda = convert_index<BlasIndex>(lhsStride);                                                                    \
+      ldb = convert_index<BlasIndex>(rhsStride);                                                                    \
+      ldc = convert_index<BlasIndex>(resStride);                                                                    \
+                                                                                                                    \
+      /* Set a, b, c */                                                                                             \
+      if ((LhsStorageOrder == ColMajor) && (ConjugateLhs)) {                                                        \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(lhs_, m, k, OuterStride<>(lhsStride));                 \
+        a_tmp = lhs.conjugate();                                                                                    \
+        a = a_tmp.data();                                                                                           \
+        lda = convert_index<BlasIndex>(a_tmp.outerStride());                                                        \
+      } else                                                                                                        \
+        a = lhs_;                                                                                                   \
+                                                                                                                    \
+      if ((RhsStorageOrder == ColMajor) && (ConjugateRhs)) {                                                        \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(rhs_, k, n, OuterStride<>(rhsStride));                 \
+        b_tmp = rhs.conjugate();                                                                                    \
+        b = b_tmp.data();                                                                                           \
+        ldb = convert_index<BlasIndex>(b_tmp.outerStride());                                                        \
+      } else                                                                                                        \
+        b = rhs_;                                                                                                   \
+                                                                                                                    \
+      BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda,   \
+               (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc);           \
+    }                                                                                                               \
+  };
+
+#ifdef EIGEN_USE_MKL
+GEMM_SPECIALIZATION(double, d, double, dgemm)
+GEMM_SPECIALIZATION(float, f, float, sgemm)
+GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm)
+GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm)
+#else
+GEMM_SPECIALIZATION(double, d, double, dgemm_)
+GEMM_SPECIALIZATION(float, f, float, sgemm_)
+GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_)
+GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_)
+#endif
+
+// If OpenBLAS with BUILD_BFLOAT16=1 support is available,
+// use sbgemm for bfloat16.
+#if EIGEN_USE_OPENBLAS_BFLOAT16
+
+extern "C" {
+// OpenBLAS prototype.
+void sbgemm_(const char* trans_a, const char* trans_b, const int* M, const int* N, const int* K, const float* alpha,
+             const Eigen::bfloat16* A, const int* lda, const Eigen::bfloat16* B, const int* ldb, const float* beta,
+             float* C, const int* ldc);
+}  // extern "C"
+
+template <typename Index, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder, bool ConjugateRhs>
+struct general_matrix_matrix_product<Index, Eigen::bfloat16, LhsStorageOrder, ConjugateLhs, Eigen::bfloat16,
+                                     RhsStorageOrder, ConjugateRhs, ColMajor, 1> {
+  typedef gebp_traits<Eigen::bfloat16, Eigen::bfloat16> Traits;
+
+  static void run(Index rows, Index cols, Index depth, const Eigen::bfloat16* lhs_, Index lhsStride,
+                  const Eigen::bfloat16* rhs_, Index rhsStride, Eigen::bfloat16* res, Index resIncr, Index resStride,
+                  Eigen::bfloat16 alpha, level3_blocking<Eigen::bfloat16, Eigen::bfloat16>& /*blocking*/,
+                  GemmParallelInfo<Index>* /*info = 0*/) {
+    using std::conj;
+    if (rows == 0 || cols == 0 || depth == 0) return;
+    EIGEN_ONLY_USED_FOR_DEBUG(resIncr);
+    eigen_assert(resIncr == 1);
+    char transa, transb;
+    BlasIndex m, n, k, lda, ldb, ldc;
+    const Eigen::bfloat16 *a, *b;
+
+    float falpha = static_cast<float>(alpha);
+    float fbeta = float(1.0);
+
+    using MatrixXbf16 = Matrix<Eigen::bfloat16, Dynamic, Dynamic>;
+    MatrixXbf16 a_tmp, b_tmp;
+    MatrixXf r_tmp;
+
+    /* Set transpose options */
+    transa = (LhsStorageOrder == RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N';
+    transb = (RhsStorageOrder == RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N';
+
+    /* Set m, n, k */
+    m = convert_index<BlasIndex>(rows);
+    n = convert_index<BlasIndex>(cols);
+    k = convert_index<BlasIndex>(depth);
+
+    /* Set lda, ldb, ldc */
+    lda = convert_index<BlasIndex>(lhsStride);
+    ldb = convert_index<BlasIndex>(rhsStride);
+    ldc = convert_index<BlasIndex>(m);
+
+    /* Set a, b, c */
+    if ((LhsStorageOrder == ColMajor) && (ConjugateLhs)) {
+      Map<const MatrixXbf16, 0, OuterStride<> > lhs(lhs_, m, k, OuterStride<>(lhsStride));
+      a_tmp = lhs.conjugate();
+      a = a_tmp.data();
+      lda = convert_index<BlasIndex>(a_tmp.outerStride());
+    } else {
+      a = lhs_;
+    }
+
+    if ((RhsStorageOrder == ColMajor) && (ConjugateRhs)) {
+      Map<const MatrixXbf16, 0, OuterStride<> > rhs(rhs_, k, n, OuterStride<>(rhsStride));
+      b_tmp = rhs.conjugate();
+      b = b_tmp.data();
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride());
+    } else {
+      b = rhs_;
+    }
+
+    // Evaluate to a temporary intermediate array.
+    r_tmp.resize(m, n);
+
+    sbgemm_(&transa, &transb, &m, &n, &k, (const float*)&numext::real_ref(falpha), a, &lda, b, &ldb,
+            (const float*)&numext::real_ref(fbeta), r_tmp.data(), &ldc);
+
+    // Cast to the output.
+    Map<MatrixXbf16, 0, OuterStride<> > result(res, m, n, OuterStride<>(resStride));
+    result = r_tmp.cast<Eigen::bfloat16>();
+  }
+};
+
+#endif  // EIGEN_USE_OPENBLAS_SBGEMM
+
+}  // namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_GENERAL_MATRIX_MATRIX_BLAS_H
diff --git a/inst/include/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h b/inst/include/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
deleted file mode 100644
index 060af328..00000000
--- a/inst/include/Eigen/src/Core/products/GeneralMatrixMatrix_MKL.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- Copyright (c) 2011, Intel Corporation. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
- * Neither the name of Intel Corporation nor the names of its contributors may
-   be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
- *   General matrix-matrix product functionality based on ?GEMM.
- ********************************************************************************
-*/
-
-#ifndef EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
-#define EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
-
-namespace Eigen { 
-
-namespace internal {
-
-/**********************************************************************
-* This file implements general matrix-matrix multiplication using BLAS
-* gemm function via partial specialization of
-* general_matrix_matrix_product::run(..) method for float, double,
-* std::complex<float> and std::complex<double> types
-**********************************************************************/
-
-// gemm specialization
-
-#define GEMM_SPECIALIZATION(EIGTYPE, EIGPREFIX, MKLTYPE, MKLPREFIX) \
-template< \
-  typename Index, \
-  int LhsStorageOrder, bool ConjugateLhs, \
-  int RhsStorageOrder, bool ConjugateRhs> \
-struct general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor> \
-{ \
-static void run(Index rows, Index cols, Index depth, \
-  const EIGTYPE* _lhs, Index lhsStride, \
-  const EIGTYPE* _rhs, Index rhsStride, \
-  EIGTYPE* res, Index resStride, \
-  EIGTYPE alpha, \
-  level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/, \
-  GemmParallelInfo<Index>* /*info = 0*/) \
-{ \
-  using std::conj; \
-\
-  char transa, transb; \
-  MKL_INT m, n, k, lda, ldb, ldc; \
-  const EIGTYPE *a, *b; \
-  MKLTYPE alpha_, beta_; \
-  MatrixX##EIGPREFIX a_tmp, b_tmp; \
-  EIGTYPE myone(1);\
-\
-/* Set transpose options */ \
-  transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
-  transb = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
-\
-/* Set m, n, k */ \
-  m = (MKL_INT)rows;  \
-  n = (MKL_INT)cols;  \
-  k = (MKL_INT)depth; \
-\
-/* Set alpha_ & beta_ */ \
-  assign_scalar_eig2mkl(alpha_, alpha); \
-  assign_scalar_eig2mkl(beta_, myone); \
-\
-/* Set lda, ldb, ldc */ \
-  lda = (MKL_INT)lhsStride; \
-  ldb = (MKL_INT)rhsStride; \
-  ldc = (MKL_INT)resStride; \
-\
-/* Set a, b, c */ \
-  if ((LhsStorageOrder==ColMajor) && (ConjugateLhs)) { \
-    Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,k,OuterStride<>(lhsStride)); \
-    a_tmp = lhs.conjugate(); \
-    a = a_tmp.data(); \
-    lda = a_tmp.outerStride(); \
-  } else a = _lhs; \
-\
-  if ((RhsStorageOrder==ColMajor) && (ConjugateRhs)) { \
-    Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,k,n,OuterStride<>(rhsStride)); \
-    b_tmp = rhs.conjugate(); \
-    b = b_tmp.data(); \
-    ldb = b_tmp.outerStride(); \
-  } else b = _rhs; \
-\
-  MKLPREFIX##gemm(&transa, &transb, &m, &n, &k, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
-}};
-
-GEMM_SPECIALIZATION(double,   d,  double,        d)
-GEMM_SPECIALIZATION(float,    f,  float,         s)
-GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, z)
-GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8,  c)
-
-} // end namespase internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_GENERAL_MATRIX_MATRIX_MKL_H
diff --git a/inst/include/Eigen/src/Core/products/GeneralMatrixVector.h b/inst/include/Eigen/src/Core/products/GeneralMatrixVector.h
index 09387703..ba72a8a4 100644
--- a/inst/include/Eigen/src/Core/products/GeneralMatrixVector.h
+++ b/inst/include/Eigen/src/Core/products/GeneralMatrixVector.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,15 +10,61 @@
 #ifndef EIGEN_GENERAL_MATRIX_VECTOR_H
 #define EIGEN_GENERAL_MATRIX_VECTOR_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
+enum GEMVPacketSizeType { GEMVPacketFull = 0, GEMVPacketHalf, GEMVPacketQuarter };
+
+template <int N, typename T1, typename T2, typename T3>
+struct gemv_packet_cond {
+  typedef T3 type;
+};
+
+template <typename T1, typename T2, typename T3>
+struct gemv_packet_cond<GEMVPacketFull, T1, T2, T3> {
+  typedef T1 type;
+};
+
+template <typename T1, typename T2, typename T3>
+struct gemv_packet_cond<GEMVPacketHalf, T1, T2, T3> {
+  typedef T2 type;
+};
+
+template <typename LhsScalar, typename RhsScalar, int PacketSize_ = GEMVPacketFull>
+class gemv_traits {
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+#define PACKET_DECL_COND_POSTFIX(postfix, name, packet_size)                                               \
+  typedef typename gemv_packet_cond<                                                                       \
+      packet_size, typename packet_traits<name##Scalar>::type, typename packet_traits<name##Scalar>::half, \
+      typename unpacket_traits<typename packet_traits<name##Scalar>::half>::half>::type name##Packet##postfix
+
+  PACKET_DECL_COND_POSTFIX(_, Lhs, PacketSize_);
+  PACKET_DECL_COND_POSTFIX(_, Rhs, PacketSize_);
+  PACKET_DECL_COND_POSTFIX(_, Res, PacketSize_);
+#undef PACKET_DECL_COND_POSTFIX
+
+ public:
+  enum {
+    Vectorizable = unpacket_traits<LhsPacket_>::vectorizable && unpacket_traits<RhsPacket_>::vectorizable &&
+                   int(unpacket_traits<LhsPacket_>::size) == int(unpacket_traits<RhsPacket_>::size),
+    LhsPacketSize = Vectorizable ? unpacket_traits<LhsPacket_>::size : 1,
+    RhsPacketSize = Vectorizable ? unpacket_traits<RhsPacket_>::size : 1,
+    ResPacketSize = Vectorizable ? unpacket_traits<ResPacket_>::size : 1
+  };
+
+  typedef std::conditional_t<Vectorizable, LhsPacket_, LhsScalar> LhsPacket;
+  typedef std::conditional_t<Vectorizable, RhsPacket_, RhsScalar> RhsPacket;
+  typedef std::conditional_t<Vectorizable, ResPacket_, ResScalar> ResPacket;
+};
+
 /* Optimized col-major matrix * vector product:
- * This algorithm processes 4 columns at onces that allows to both reduce
- * the number of load/stores of the result by a factor 4 and to reduce
- * the instruction dependency. Moreover, we know that all bands have the
- * same alignment pattern.
+ * This algorithm processes the matrix per vertical panels,
+ * which are then processed horizontally per chunk of 8*PacketSize x 1 vertical segments.
  *
  * Mixing type logic: C += alpha * A * B
  *  |  A  |  B  |alpha| comments
@@ -26,262 +72,194 @@ namespace internal {
  *  |real |cplx |real | alpha is converted to a cplx when calling the run function, no vectorization
  *  |cplx |real |cplx | invalid, the caller has to do tmp: = A * B; C += alpha*tmp
  *  |cplx |real |real | optimal case, vectorization possible via real-cplx mul
+ *
+ * The same reasoning apply for the transposed case.
  */
-template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
-{
-typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
-
-enum {
-  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
-              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
-  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
+template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper,
+                                     ConjugateRhs, Version> {
+  typedef gemv_traits<LhsScalar, RhsScalar> Traits;
+  typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketHalf> HalfTraits;
+  typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketQuarter> QuarterTraits;
+
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
+
+  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+  typedef typename HalfTraits::ResPacket ResPacketHalf;
+
+  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,
+                                                      const RhsMapper& rhs, ResScalar* res, Index resIncr,
+                                                      RhsScalar alpha);
 };
 
-typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-typedef typename packet_traits<ResScalar>::type  _ResPacket;
-
-typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
-typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
-typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
-
-EIGEN_DONT_INLINE static void run(
-  Index rows, Index cols,
-  const LhsScalar* lhs, Index lhsStride,
-  const RhsScalar* rhs, Index rhsIncr,
-  ResScalar* res, Index resIncr, RhsScalar alpha);
-};
-
-template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
-  Index rows, Index cols,
-  const LhsScalar* lhs, Index lhsStride,
-  const RhsScalar* rhs, Index rhsIncr,
-  ResScalar* res, Index resIncr, RhsScalar alpha)
-{
-  EIGEN_UNUSED_VARIABLE(resIncr)
-  eigen_internal_assert(resIncr==1);
-  #ifdef _EIGEN_ACCUMULATE_PACKETS
-  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
-  #endif
-  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) \
-    pstore(&res[j], \
-      padd(pload<ResPacket>(&res[j]), \
-        padd( \
-          padd(pcj.pmul(EIGEN_CAT(ploa , A0)<LhsPacket>(&lhs0[j]),    ptmp0), \
-                  pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs1[j]),   ptmp1)), \
-          padd(pcj.pmul(EIGEN_CAT(ploa , A2)<LhsPacket>(&lhs2[j]),    ptmp2), \
-                  pcj.pmul(EIGEN_CAT(ploa , A13)<LhsPacket>(&lhs3[j]),   ptmp3)) )))
-
-  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
-  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
-  if(ConjugateRhs)
-    alpha = numext::conj(alpha);
-
-  enum { AllAligned = 0, EvenAligned, FirstAligned, NoneAligned };
-  const Index columnsAtOnce = 4;
-  const Index peels = 2;
-  const Index LhsPacketAlignedMask = LhsPacketSize-1;
-  const Index ResPacketAlignedMask = ResPacketSize-1;
-//  const Index PeelAlignedMask = ResPacketSize*peels-1;
-  const Index size = rows;
-  
-  // How many coeffs of the result do we have to skip to be aligned.
-  // Here we assume data are at least aligned on the base scalar type.
-  Index alignedStart = internal::first_aligned(res,size);
-  Index alignedSize = ResPacketSize>1 ? alignedStart + ((size-alignedStart) & ~ResPacketAlignedMask) : 0;
-  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
-
-  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
-  Index alignmentPattern = alignmentStep==0 ? AllAligned
-                       : alignmentStep==(LhsPacketSize/2) ? EvenAligned
-                       : FirstAligned;
-
-  // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = internal::first_aligned(lhs,size);
-
-  // find how many columns do we have to skip to be aligned with the result (if possible)
-  Index skipColumns = 0;
-  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (size_t(lhs)%sizeof(LhsScalar)) || (size_t(res)%sizeof(ResScalar)) )
-  {
-    alignedSize = 0;
-    alignedStart = 0;
-  }
-  else if (LhsPacketSize>1)
-  {
-    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0 || size<LhsPacketSize);
-
-    while (skipColumns<LhsPacketSize &&
-          alignedStart != ((lhsAlignmentOffset + alignmentStep*skipColumns)%LhsPacketSize))
-      ++skipColumns;
-    if (skipColumns==LhsPacketSize)
-    {
-      // nothing can be aligned, no need to skip any column
-      alignmentPattern = NoneAligned;
-      skipColumns = 0;
+template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void
+general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs,
+                              Version>::run(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
+                                            ResScalar* res, Index resIncr, RhsScalar alpha) {
+  EIGEN_UNUSED_VARIABLE(resIncr);
+  eigen_internal_assert(resIncr == 1);
+
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate proper code.
+  LhsMapper lhs(alhs);
+
+  conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
+  conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
+  conj_helper<LhsPacketHalf, RhsPacketHalf, ConjugateLhs, ConjugateRhs> pcj_half;
+  conj_helper<LhsPacketQuarter, RhsPacketQuarter, ConjugateLhs, ConjugateRhs> pcj_quarter;
+
+  const Index lhsStride = lhs.stride();
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum {
+    LhsAlignment = Unaligned,
+    ResPacketSize = Traits::ResPacketSize,
+    ResPacketSizeHalf = HalfTraits::ResPacketSize,
+    ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
+    LhsPacketSize = Traits::LhsPacketSize,
+    HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
+    HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
+  };
+
+  const Index n8 = rows - 8 * ResPacketSize + 1;
+  const Index n4 = rows - 4 * ResPacketSize + 1;
+  const Index n3 = rows - 3 * ResPacketSize + 1;
+  const Index n2 = rows - 2 * ResPacketSize + 1;
+  const Index n1 = rows - 1 * ResPacketSize + 1;
+  const Index n_half = rows - 1 * ResPacketSizeHalf + 1;
+  const Index n_quarter = rows - 1 * ResPacketSizeQuarter + 1;
+
+  // TODO: improve the following heuristic:
+  const Index block_cols = cols < 128 ? cols : (lhsStride * sizeof(LhsScalar) < 32000 ? 16 : 4);
+  ResPacket palpha = pset1<ResPacket>(alpha);
+  ResPacketHalf palpha_half = pset1<ResPacketHalf>(alpha);
+  ResPacketQuarter palpha_quarter = pset1<ResPacketQuarter>(alpha);
+
+  for (Index j2 = 0; j2 < cols; j2 += block_cols) {
+    Index jend = numext::mini(j2 + block_cols, cols);
+    Index i = 0;
+    for (; i < n8; i += ResPacketSize * 8) {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0)),
+                c4 = pset1<ResPacket>(ResScalar(0)), c5 = pset1<ResPacket>(ResScalar(0)),
+                c6 = pset1<ResPacket>(ResScalar(0)), c7 = pset1<ResPacket>(ResScalar(0));
+
+      for (Index j = j2; j < jend; j += 1) {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
+        c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 3, j), b0, c3);
+        c4 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 4, j), b0, c4);
+        c5 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 5, j), b0, c5);
+        c6 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 6, j), b0, c6);
+        c7 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 7, j), b0, c7);
+      }
+      pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
+      pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
+      pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
+      pstoreu(res + i + ResPacketSize * 3, pmadd(c3, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 3)));
+      pstoreu(res + i + ResPacketSize * 4, pmadd(c4, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 4)));
+      pstoreu(res + i + ResPacketSize * 5, pmadd(c5, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 5)));
+      pstoreu(res + i + ResPacketSize * 6, pmadd(c6, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 6)));
+      pstoreu(res + i + ResPacketSize * 7, pmadd(c7, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 7)));
     }
-    else
-    {
-      skipColumns = (std::min)(skipColumns,cols);
-      // note that the skiped columns are processed later.
+    if (i < n4) {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0));
+
+      for (Index j = j2; j < jend; j += 1) {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
+        c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 3, j), b0, c3);
+      }
+      pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
+      pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
+      pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
+      pstoreu(res + i + ResPacketSize * 3, pmadd(c3, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 3)));
+
+      i += ResPacketSize * 4;
     }
+    if (i < n3) {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
+                c2 = pset1<ResPacket>(ResScalar(0));
+
+      for (Index j = j2; j < jend; j += 1) {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
+        c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 2, j), b0, c2);
+      }
+      pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
+      pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
+      pstoreu(res + i + ResPacketSize * 2, pmadd(c2, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 2)));
 
-    eigen_internal_assert(  (alignmentPattern==NoneAligned)
-                      || (skipColumns + columnsAtOnce >= cols)
-                      || LhsPacketSize > size
-                      || (size_t(lhs+alignedStart+lhsStride*skipColumns)%sizeof(LhsPacket))==0);
-  }
-  else if(Vectorizable)
-  {
-    alignedStart = 0;
-    alignedSize = size;
-    alignmentPattern = AllAligned;
-  }
+      i += ResPacketSize * 3;
+    }
+    if (i < n2) {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0));
 
-  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
-  Index offset3 = (FirstAligned && alignmentStep==1?1:3);
-
-  Index columnBound = ((cols-skipColumns)/columnsAtOnce)*columnsAtOnce + skipColumns;
-  for (Index i=skipColumns; i<columnBound; i+=columnsAtOnce)
-  {
-    RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[i*rhsIncr]),
-              ptmp1 = pset1<RhsPacket>(alpha*rhs[(i+offset1)*rhsIncr]),
-              ptmp2 = pset1<RhsPacket>(alpha*rhs[(i+2)*rhsIncr]),
-              ptmp3 = pset1<RhsPacket>(alpha*rhs[(i+offset3)*rhsIncr]);
-
-    // this helps a lot generating better binary code
-    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
-                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
-
-    if (Vectorizable)
-    {
-      /* explicit vectorization */
-      // process initial unaligned coeffs
-      for (Index j=0; j<alignedStart; ++j)
-      {
-        res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
-        res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
-        res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
-        res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
+      for (Index j = j2; j < jend; j += 1) {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 0, j), b0, c0);
+        c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + LhsPacketSize * 1, j), b0, c1);
       }
-
-      if (alignedSize>alignedStart)
-      {
-        switch(alignmentPattern)
-        {
-          case AllAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
-            break;
-          case EvenAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
-            break;
-          case FirstAligned:
-          {
-            Index j = alignedStart;
-            if(peels>1)
-            {
-              LhsPacket A00, A01, A02, A03, A10, A11, A12, A13;
-              ResPacket T0, T1;
-
-              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
-              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
-              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
-
-              for (; j<peeledSize; j+=peels*ResPacketSize)
-              {
-                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
-                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
-                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);
-
-                A00 = pload<LhsPacket>(&lhs0[j]);
-                A10 = pload<LhsPacket>(&lhs0[j+LhsPacketSize]);
-                T0  = pcj.pmadd(A00, ptmp0, pload<ResPacket>(&res[j]));
-                T1  = pcj.pmadd(A10, ptmp0, pload<ResPacket>(&res[j+ResPacketSize]));
-
-                T0  = pcj.pmadd(A01, ptmp1, T0);
-                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
-                T0  = pcj.pmadd(A02, ptmp2, T0);
-                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
-                T0  = pcj.pmadd(A03, ptmp3, T0);
-                pstore(&res[j],T0);
-                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
-                T1  = pcj.pmadd(A11, ptmp1, T1);
-                T1  = pcj.pmadd(A12, ptmp2, T1);
-                T1  = pcj.pmadd(A13, ptmp3, T1);
-                pstore(&res[j+ResPacketSize],T1);
-              }
-            }
-            for (; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
-            break;
-          }
-          default:
-            for (Index j = alignedStart; j<alignedSize; j+=ResPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
-            break;
-        }
+      pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
+      pstoreu(res + i + ResPacketSize * 1, pmadd(c1, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 1)));
+      i += ResPacketSize * 2;
+    }
+    if (i < n1) {
+      ResPacket c0 = pset1<ResPacket>(ResScalar(0));
+      for (Index j = j2; j < jend; j += 1) {
+        RhsPacket b0 = pset1<RhsPacket>(rhs(j, 0));
+        c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
       }
-    } // end explicit vectorization
-
-    /* process remaining coeffs (or all if there is no explicit vectorization) */
-    for (Index j=alignedSize; j<size; ++j)
-    {
-      res[j] = cj.pmadd(lhs0[j], pfirst(ptmp0), res[j]);
-      res[j] = cj.pmadd(lhs1[j], pfirst(ptmp1), res[j]);
-      res[j] = cj.pmadd(lhs2[j], pfirst(ptmp2), res[j]);
-      res[j] = cj.pmadd(lhs3[j], pfirst(ptmp3), res[j]);
+      pstoreu(res + i + ResPacketSize * 0, pmadd(c0, palpha, ploadu<ResPacket>(res + i + ResPacketSize * 0)));
+      i += ResPacketSize;
     }
-  }
-
-  // process remaining first and last columns (at most columnsAtOnce-1)
-  Index end = cols;
-  Index start = columnBound;
-  do
-  {
-    for (Index k=start; k<end; ++k)
-    {
-      RhsPacket ptmp0 = pset1<RhsPacket>(alpha*rhs[k*rhsIncr]);
-      const LhsScalar* lhs0 = lhs + k*lhsStride;
-
-      if (Vectorizable)
-      {
-        /* explicit vectorization */
-        // process first unaligned result's coeffs
-        for (Index j=0; j<alignedStart; ++j)
-          res[j] += cj.pmul(lhs0[j], pfirst(ptmp0));
-        // process aligned result's coeffs
-        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
-          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
-            pstore(&res[i], pcj.pmadd(pload<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
-        else
-          for (Index i = alignedStart;i<alignedSize;i+=ResPacketSize)
-            pstore(&res[i], pcj.pmadd(ploadu<LhsPacket>(&lhs0[i]), ptmp0, pload<ResPacket>(&res[i])));
+    if (HasHalf && i < n_half) {
+      ResPacketHalf c0 = pset1<ResPacketHalf>(ResScalar(0));
+      for (Index j = j2; j < jend; j += 1) {
+        RhsPacketHalf b0 = pset1<RhsPacketHalf>(rhs(j, 0));
+        c0 = pcj_half.pmadd(lhs.template load<LhsPacketHalf, LhsAlignment>(i + 0, j), b0, c0);
       }
-
-      // process remaining scalars (or all if no explicit vectorization)
-      for (Index i=alignedSize; i<size; ++i)
-        res[i] += cj.pmul(lhs0[i], pfirst(ptmp0));
+      pstoreu(res + i + ResPacketSizeHalf * 0,
+              pmadd(c0, palpha_half, ploadu<ResPacketHalf>(res + i + ResPacketSizeHalf * 0)));
+      i += ResPacketSizeHalf;
     }
-    if (skipColumns)
-    {
-      start = 0;
-      end = skipColumns;
-      skipColumns = 0;
+    if (HasQuarter && i < n_quarter) {
+      ResPacketQuarter c0 = pset1<ResPacketQuarter>(ResScalar(0));
+      for (Index j = j2; j < jend; j += 1) {
+        RhsPacketQuarter b0 = pset1<RhsPacketQuarter>(rhs(j, 0));
+        c0 = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter, LhsAlignment>(i + 0, j), b0, c0);
+      }
+      pstoreu(res + i + ResPacketSizeQuarter * 0,
+              pmadd(c0, palpha_quarter, ploadu<ResPacketQuarter>(res + i + ResPacketSizeQuarter * 0)));
+      i += ResPacketSizeQuarter;
+    }
+    for (; i < rows; ++i) {
+      ResScalar c0(0);
+      for (Index j = j2; j < jend; j += 1) c0 += cj.pmul(lhs(i, j), rhs(j, 0));
+      res[i] += alpha * c0;
     }
-    else
-      break;
-  } while(Vectorizable);
-  #undef _EIGEN_ACCUMULATE_PACKETS
+  }
 }
 
 /* Optimized row-major matrix * vector product:
- * This algorithm processes 4 rows at onces that allows to both reduce
+ * This algorithm processes 4 rows at once that allows to both reduce
  * the number of load/stores of the result by a factor 4 and to reduce
  * the instruction dependency. Moreover, we know that all bands have the
  * same alignment pattern.
@@ -290,277 +268,206 @@ EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,ColMajor,Co
  *  - alpha is always a complex (or converted to a complex)
  *  - no vectorization
  */
-template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
-struct general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>
-{
-typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
-
-enum {
-  Vectorizable = packet_traits<LhsScalar>::Vectorizable && packet_traits<RhsScalar>::Vectorizable
-              && int(packet_traits<LhsScalar>::size)==int(packet_traits<RhsScalar>::size),
-  LhsPacketSize = Vectorizable ? packet_traits<LhsScalar>::size : 1,
-  RhsPacketSize = Vectorizable ? packet_traits<RhsScalar>::size : 1,
-  ResPacketSize = Vectorizable ? packet_traits<ResScalar>::size : 1
+template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+struct general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper,
+                                     ConjugateRhs, Version> {
+  typedef gemv_traits<LhsScalar, RhsScalar> Traits;
+  typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketHalf> HalfTraits;
+  typedef gemv_traits<LhsScalar, RhsScalar, GEMVPacketQuarter> QuarterTraits;
+
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+
+  typedef typename Traits::LhsPacket LhsPacket;
+  typedef typename Traits::RhsPacket RhsPacket;
+  typedef typename Traits::ResPacket ResPacket;
+
+  typedef typename HalfTraits::LhsPacket LhsPacketHalf;
+  typedef typename HalfTraits::RhsPacket RhsPacketHalf;
+  typedef typename HalfTraits::ResPacket ResPacketHalf;
+
+  typedef typename QuarterTraits::LhsPacket LhsPacketQuarter;
+  typedef typename QuarterTraits::RhsPacket RhsPacketQuarter;
+  typedef typename QuarterTraits::ResPacket ResPacketQuarter;
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static void run(Index rows, Index cols, const LhsMapper& lhs,
+                                                      const RhsMapper& rhs, ResScalar* res, Index resIncr,
+                                                      ResScalar alpha);
 };
 
-typedef typename packet_traits<LhsScalar>::type  _LhsPacket;
-typedef typename packet_traits<RhsScalar>::type  _RhsPacket;
-typedef typename packet_traits<ResScalar>::type  _ResPacket;
-
-typedef typename conditional<Vectorizable,_LhsPacket,LhsScalar>::type LhsPacket;
-typedef typename conditional<Vectorizable,_RhsPacket,RhsScalar>::type RhsPacket;
-typedef typename conditional<Vectorizable,_ResPacket,ResScalar>::type ResPacket;
-  
-EIGEN_DONT_INLINE static void run(
-  Index rows, Index cols,
-  const LhsScalar* lhs, Index lhsStride,
-  const RhsScalar* rhs, Index rhsIncr,
-  ResScalar* res, Index resIncr,
-  ResScalar alpha);
-};
-
-template<typename Index, typename LhsScalar, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjugateLhs,RhsScalar,ConjugateRhs,Version>::run(
-  Index rows, Index cols,
-  const LhsScalar* lhs, Index lhsStride,
-  const RhsScalar* rhs, Index rhsIncr,
-  ResScalar* res, Index resIncr,
-  ResScalar alpha)
-{
-  EIGEN_UNUSED_VARIABLE(rhsIncr);
-  eigen_internal_assert(rhsIncr==1);
-  #ifdef _EIGEN_ACCUMULATE_PACKETS
-  #error _EIGEN_ACCUMULATE_PACKETS has already been defined
-  #endif
-
-  #define _EIGEN_ACCUMULATE_PACKETS(A0,A13,A2) {\
-    RhsPacket b = pload<RhsPacket>(&rhs[j]); \
-    ptmp0 = pcj.pmadd(EIGEN_CAT(ploa,A0) <LhsPacket>(&lhs0[j]), b, ptmp0); \
-    ptmp1 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs1[j]), b, ptmp1); \
-    ptmp2 = pcj.pmadd(EIGEN_CAT(ploa,A2) <LhsPacket>(&lhs2[j]), b, ptmp2); \
-    ptmp3 = pcj.pmadd(EIGEN_CAT(ploa,A13)<LhsPacket>(&lhs3[j]), b, ptmp3); }
-
-  conj_helper<LhsScalar,RhsScalar,ConjugateLhs,ConjugateRhs> cj;
-  conj_helper<LhsPacket,RhsPacket,ConjugateLhs,ConjugateRhs> pcj;
-
-  enum { AllAligned=0, EvenAligned=1, FirstAligned=2, NoneAligned=3 };
-  const Index rowsAtOnce = 4;
-  const Index peels = 2;
-  const Index RhsPacketAlignedMask = RhsPacketSize-1;
-  const Index LhsPacketAlignedMask = LhsPacketSize-1;
-//   const Index PeelAlignedMask = RhsPacketSize*peels-1;
-  const Index depth = cols;
-
-  // How many coeffs of the result do we have to skip to be aligned.
-  // Here we assume data are at least aligned on the base scalar type
-  // if that's not the case then vectorization is discarded, see below.
-  Index alignedStart = internal::first_aligned(rhs, depth);
-  Index alignedSize = RhsPacketSize>1 ? alignedStart + ((depth-alignedStart) & ~RhsPacketAlignedMask) : 0;
-  const Index peeledSize = alignedSize - RhsPacketSize*peels - RhsPacketSize + 1;
-
-  const Index alignmentStep = LhsPacketSize>1 ? (LhsPacketSize - lhsStride % LhsPacketSize) & LhsPacketAlignedMask : 0;
-  Index alignmentPattern = alignmentStep==0 ? AllAligned
-                         : alignmentStep==(LhsPacketSize/2) ? EvenAligned
-                         : FirstAligned;
-
-  // we cannot assume the first element is aligned because of sub-matrices
-  const Index lhsAlignmentOffset = internal::first_aligned(lhs,depth);
-
-  // find how many rows do we have to skip to be aligned with rhs (if possible)
-  Index skipRows = 0;
-  // if the data cannot be aligned (TODO add some compile time tests when possible, e.g. for floats)
-  if( (sizeof(LhsScalar)!=sizeof(RhsScalar)) || (size_t(lhs)%sizeof(LhsScalar)) || (size_t(rhs)%sizeof(RhsScalar)) )
-  {
-    alignedSize = 0;
-    alignedStart = 0;
-  }
-  else if (LhsPacketSize>1)
-  {
-    eigen_internal_assert(size_t(lhs+lhsAlignmentOffset)%sizeof(LhsPacket)==0  || depth<LhsPacketSize);
-
-    while (skipRows<LhsPacketSize &&
-           alignedStart != ((lhsAlignmentOffset + alignmentStep*skipRows)%LhsPacketSize))
-      ++skipRows;
-    if (skipRows==LhsPacketSize)
-    {
-      // nothing can be aligned, no need to skip any column
-      alignmentPattern = NoneAligned;
-      skipRows = 0;
+template <typename Index, typename LhsScalar, typename LhsMapper, bool ConjugateLhs, typename RhsScalar,
+          typename RhsMapper, bool ConjugateRhs, int Version>
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void
+general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjugateLhs, RhsScalar, RhsMapper, ConjugateRhs,
+                              Version>::run(Index rows, Index cols, const LhsMapper& alhs, const RhsMapper& rhs,
+                                            ResScalar* res, Index resIncr, ResScalar alpha) {
+  // The following copy tells the compiler that lhs's attributes are not modified outside this function
+  // This helps GCC to generate proper code.
+  LhsMapper lhs(alhs);
+
+  eigen_internal_assert(rhs.stride() == 1);
+  conj_helper<LhsScalar, RhsScalar, ConjugateLhs, ConjugateRhs> cj;
+  conj_helper<LhsPacket, RhsPacket, ConjugateLhs, ConjugateRhs> pcj;
+  conj_helper<LhsPacketHalf, RhsPacketHalf, ConjugateLhs, ConjugateRhs> pcj_half;
+  conj_helper<LhsPacketQuarter, RhsPacketQuarter, ConjugateLhs, ConjugateRhs> pcj_quarter;
+
+  // TODO: fine tune the following heuristic. The rationale is that if the matrix is very large,
+  //       processing 8 rows at once might be counter productive wrt cache.
+  const Index n8 = lhs.stride() * sizeof(LhsScalar) > 32000 ? 0 : rows - 7;
+  const Index n4 = rows - 3;
+  const Index n2 = rows - 1;
+
+  // TODO: for padded aligned inputs, we could enable aligned reads
+  enum {
+    LhsAlignment = Unaligned,
+    ResPacketSize = Traits::ResPacketSize,
+    ResPacketSizeHalf = HalfTraits::ResPacketSize,
+    ResPacketSizeQuarter = QuarterTraits::ResPacketSize,
+    LhsPacketSize = Traits::LhsPacketSize,
+    LhsPacketSizeHalf = HalfTraits::LhsPacketSize,
+    LhsPacketSizeQuarter = QuarterTraits::LhsPacketSize,
+    HasHalf = (int)ResPacketSizeHalf < (int)ResPacketSize,
+    HasQuarter = (int)ResPacketSizeQuarter < (int)ResPacketSizeHalf
+  };
+
+  using UnsignedIndex = typename make_unsigned<Index>::type;
+  const Index fullColBlockEnd = LhsPacketSize * (UnsignedIndex(cols) / LhsPacketSize);
+  const Index halfColBlockEnd = LhsPacketSizeHalf * (UnsignedIndex(cols) / LhsPacketSizeHalf);
+  const Index quarterColBlockEnd = LhsPacketSizeQuarter * (UnsignedIndex(cols) / LhsPacketSizeQuarter);
+
+  Index i = 0;
+  for (; i < n8; i += 8) {
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
+              c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0)),
+              c4 = pset1<ResPacket>(ResScalar(0)), c5 = pset1<ResPacket>(ResScalar(0)),
+              c6 = pset1<ResPacket>(ResScalar(0)), c7 = pset1<ResPacket>(ResScalar(0));
+
+    for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
+      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
+
+      c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
+      c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 1, j), b0, c1);
+      c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 2, j), b0, c2);
+      c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 3, j), b0, c3);
+      c4 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 4, j), b0, c4);
+      c5 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 5, j), b0, c5);
+      c6 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 6, j), b0, c6);
+      c7 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 7, j), b0, c7);
     }
-    else
-    {
-      skipRows = (std::min)(skipRows,Index(rows));
-      // note that the skiped columns are processed later.
+    ResScalar cc0 = predux(c0);
+    ResScalar cc1 = predux(c1);
+    ResScalar cc2 = predux(c2);
+    ResScalar cc3 = predux(c3);
+    ResScalar cc4 = predux(c4);
+    ResScalar cc5 = predux(c5);
+    ResScalar cc6 = predux(c6);
+    ResScalar cc7 = predux(c7);
+
+    for (Index j = fullColBlockEnd; j < cols; ++j) {
+      RhsScalar b0 = rhs(j, 0);
+
+      cc0 += cj.pmul(lhs(i + 0, j), b0);
+      cc1 += cj.pmul(lhs(i + 1, j), b0);
+      cc2 += cj.pmul(lhs(i + 2, j), b0);
+      cc3 += cj.pmul(lhs(i + 3, j), b0);
+      cc4 += cj.pmul(lhs(i + 4, j), b0);
+      cc5 += cj.pmul(lhs(i + 5, j), b0);
+      cc6 += cj.pmul(lhs(i + 6, j), b0);
+      cc7 += cj.pmul(lhs(i + 7, j), b0);
     }
-    eigen_internal_assert(  alignmentPattern==NoneAligned
-                      || LhsPacketSize==1
-                      || (skipRows + rowsAtOnce >= rows)
-                      || LhsPacketSize > depth
-                      || (size_t(lhs+alignedStart+lhsStride*skipRows)%sizeof(LhsPacket))==0);
-  }
-  else if(Vectorizable)
-  {
-    alignedStart = 0;
-    alignedSize = depth;
-    alignmentPattern = AllAligned;
+    res[(i + 0) * resIncr] += alpha * cc0;
+    res[(i + 1) * resIncr] += alpha * cc1;
+    res[(i + 2) * resIncr] += alpha * cc2;
+    res[(i + 3) * resIncr] += alpha * cc3;
+    res[(i + 4) * resIncr] += alpha * cc4;
+    res[(i + 5) * resIncr] += alpha * cc5;
+    res[(i + 6) * resIncr] += alpha * cc6;
+    res[(i + 7) * resIncr] += alpha * cc7;
   }
+  for (; i < n4; i += 4) {
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0)),
+              c2 = pset1<ResPacket>(ResScalar(0)), c3 = pset1<ResPacket>(ResScalar(0));
 
-  Index offset1 = (FirstAligned && alignmentStep==1?3:1);
-  Index offset3 = (FirstAligned && alignmentStep==1?1:3);
-
-  Index rowBound = ((rows-skipRows)/rowsAtOnce)*rowsAtOnce + skipRows;
-  for (Index i=skipRows; i<rowBound; i+=rowsAtOnce)
-  {
-    EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
-    ResScalar tmp1 = ResScalar(0), tmp2 = ResScalar(0), tmp3 = ResScalar(0);
-
-    // this helps the compiler generating good binary code
-    const LhsScalar *lhs0 = lhs + i*lhsStride,     *lhs1 = lhs + (i+offset1)*lhsStride,
-                    *lhs2 = lhs + (i+2)*lhsStride, *lhs3 = lhs + (i+offset3)*lhsStride;
-
-    if (Vectorizable)
-    {
-      /* explicit vectorization */
-      ResPacket ptmp0 = pset1<ResPacket>(ResScalar(0)), ptmp1 = pset1<ResPacket>(ResScalar(0)),
-                ptmp2 = pset1<ResPacket>(ResScalar(0)), ptmp3 = pset1<ResPacket>(ResScalar(0));
-
-      // process initial unaligned coeffs
-      // FIXME this loop get vectorized by the compiler !
-      for (Index j=0; j<alignedStart; ++j)
-      {
-        RhsScalar b = rhs[j];
-        tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
-        tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
-      }
+    for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
+      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
 
-      if (alignedSize>alignedStart)
-      {
-        switch(alignmentPattern)
-        {
-          case AllAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,d,d);
-            break;
-          case EvenAligned:
-            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,du,d);
-            break;
-          case FirstAligned:
-          {
-            Index j = alignedStart;
-            if (peels>1)
-            {
-              /* Here we proccess 4 rows with with two peeled iterations to hide
-               * the overhead of unaligned loads. Moreover unaligned loads are handled
-               * using special shift/move operations between the two aligned packets
-               * overlaping the desired unaligned packet. This is *much* more efficient
-               * than basic unaligned loads.
-               */
-              LhsPacket A01, A02, A03, A11, A12, A13;
-              A01 = pload<LhsPacket>(&lhs1[alignedStart-1]);
-              A02 = pload<LhsPacket>(&lhs2[alignedStart-2]);
-              A03 = pload<LhsPacket>(&lhs3[alignedStart-3]);
-
-              for (; j<peeledSize; j+=peels*RhsPacketSize)
-              {
-                RhsPacket b = pload<RhsPacket>(&rhs[j]);
-                A11 = pload<LhsPacket>(&lhs1[j-1+LhsPacketSize]);  palign<1>(A01,A11);
-                A12 = pload<LhsPacket>(&lhs2[j-2+LhsPacketSize]);  palign<2>(A02,A12);
-                A13 = pload<LhsPacket>(&lhs3[j-3+LhsPacketSize]);  palign<3>(A03,A13);
-
-                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), b, ptmp0);
-                ptmp1 = pcj.pmadd(A01, b, ptmp1);
-                A01 = pload<LhsPacket>(&lhs1[j-1+2*LhsPacketSize]);  palign<1>(A11,A01);
-                ptmp2 = pcj.pmadd(A02, b, ptmp2);
-                A02 = pload<LhsPacket>(&lhs2[j-2+2*LhsPacketSize]);  palign<2>(A12,A02);
-                ptmp3 = pcj.pmadd(A03, b, ptmp3);
-                A03 = pload<LhsPacket>(&lhs3[j-3+2*LhsPacketSize]);  palign<3>(A13,A03);
-
-                b = pload<RhsPacket>(&rhs[j+RhsPacketSize]);
-                ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j+LhsPacketSize]), b, ptmp0);
-                ptmp1 = pcj.pmadd(A11, b, ptmp1);
-                ptmp2 = pcj.pmadd(A12, b, ptmp2);
-                ptmp3 = pcj.pmadd(A13, b, ptmp3);
-              }
-            }
-            for (; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(d,du,du);
-            break;
-          }
-          default:
-            for (Index j = alignedStart; j<alignedSize; j+=RhsPacketSize)
-              _EIGEN_ACCUMULATE_PACKETS(du,du,du);
-            break;
-        }
-        tmp0 += predux(ptmp0);
-        tmp1 += predux(ptmp1);
-        tmp2 += predux(ptmp2);
-        tmp3 += predux(ptmp3);
-      }
-    } // end explicit vectorization
-
-    // process remaining coeffs (or all if no explicit vectorization)
-    // FIXME this loop get vectorized by the compiler !
-    for (Index j=alignedSize; j<depth; ++j)
-    {
-      RhsScalar b = rhs[j];
-      tmp0 += cj.pmul(lhs0[j],b); tmp1 += cj.pmul(lhs1[j],b);
-      tmp2 += cj.pmul(lhs2[j],b); tmp3 += cj.pmul(lhs3[j],b);
+      c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
+      c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 1, j), b0, c1);
+      c2 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 2, j), b0, c2);
+      c3 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 3, j), b0, c3);
     }
-    res[i*resIncr]            += alpha*tmp0;
-    res[(i+offset1)*resIncr]  += alpha*tmp1;
-    res[(i+2)*resIncr]        += alpha*tmp2;
-    res[(i+offset3)*resIncr]  += alpha*tmp3;
+    ResScalar cc0 = predux(c0);
+    ResScalar cc1 = predux(c1);
+    ResScalar cc2 = predux(c2);
+    ResScalar cc3 = predux(c3);
+
+    for (Index j = fullColBlockEnd; j < cols; ++j) {
+      RhsScalar b0 = rhs(j, 0);
+
+      cc0 += cj.pmul(lhs(i + 0, j), b0);
+      cc1 += cj.pmul(lhs(i + 1, j), b0);
+      cc2 += cj.pmul(lhs(i + 2, j), b0);
+      cc3 += cj.pmul(lhs(i + 3, j), b0);
+    }
+    res[(i + 0) * resIncr] += alpha * cc0;
+    res[(i + 1) * resIncr] += alpha * cc1;
+    res[(i + 2) * resIncr] += alpha * cc2;
+    res[(i + 3) * resIncr] += alpha * cc3;
   }
+  for (; i < n2; i += 2) {
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0)), c1 = pset1<ResPacket>(ResScalar(0));
 
-  // process remaining first and last rows (at most columnsAtOnce-1)
-  Index end = rows;
-  Index start = rowBound;
-  do
-  {
-    for (Index i=start; i<end; ++i)
-    {
-      EIGEN_ALIGN16 ResScalar tmp0 = ResScalar(0);
-      ResPacket ptmp0 = pset1<ResPacket>(tmp0);
-      const LhsScalar* lhs0 = lhs + i*lhsStride;
-      // process first unaligned result's coeffs
-      // FIXME this loop get vectorized by the compiler !
-      for (Index j=0; j<alignedStart; ++j)
-        tmp0 += cj.pmul(lhs0[j], rhs[j]);
-
-      if (alignedSize>alignedStart)
-      {
-        // process aligned rhs coeffs
-        if ((size_t(lhs0+alignedStart)%sizeof(LhsPacket))==0)
-          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
-            ptmp0 = pcj.pmadd(pload<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
-        else
-          for (Index j = alignedStart;j<alignedSize;j+=RhsPacketSize)
-            ptmp0 = pcj.pmadd(ploadu<LhsPacket>(&lhs0[j]), pload<RhsPacket>(&rhs[j]), ptmp0);
-        tmp0 += predux(ptmp0);
-      }
+    for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
+      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
 
-      // process remaining scalars
-      // FIXME this loop get vectorized by the compiler !
-      for (Index j=alignedSize; j<depth; ++j)
-        tmp0 += cj.pmul(lhs0[j], rhs[j]);
-      res[i*resIncr] += alpha*tmp0;
+      c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 0, j), b0, c0);
+      c1 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i + 1, j), b0, c1);
     }
-    if (skipRows)
-    {
-      start = 0;
-      end = skipRows;
-      skipRows = 0;
-    }
-    else
-      break;
-  } while(Vectorizable);
+    ResScalar cc0 = predux(c0);
+    ResScalar cc1 = predux(c1);
+
+    for (Index j = fullColBlockEnd; j < cols; ++j) {
+      RhsScalar b0 = rhs(j, 0);
 
-  #undef _EIGEN_ACCUMULATE_PACKETS
+      cc0 += cj.pmul(lhs(i + 0, j), b0);
+      cc1 += cj.pmul(lhs(i + 1, j), b0);
+    }
+    res[(i + 0) * resIncr] += alpha * cc0;
+    res[(i + 1) * resIncr] += alpha * cc1;
+  }
+  for (; i < rows; ++i) {
+    ResPacket c0 = pset1<ResPacket>(ResScalar(0));
+    ResPacketHalf c0_h = pset1<ResPacketHalf>(ResScalar(0));
+    ResPacketQuarter c0_q = pset1<ResPacketQuarter>(ResScalar(0));
+
+    for (Index j = 0; j < fullColBlockEnd; j += LhsPacketSize) {
+      RhsPacket b0 = rhs.template load<RhsPacket, Unaligned>(j, 0);
+      c0 = pcj.pmadd(lhs.template load<LhsPacket, LhsAlignment>(i, j), b0, c0);
+    }
+    ResScalar cc0 = predux(c0);
+    if (HasHalf) {
+      for (Index j = fullColBlockEnd; j < halfColBlockEnd; j += LhsPacketSizeHalf) {
+        RhsPacketHalf b0 = rhs.template load<RhsPacketHalf, Unaligned>(j, 0);
+        c0_h = pcj_half.pmadd(lhs.template load<LhsPacketHalf, LhsAlignment>(i, j), b0, c0_h);
+      }
+      cc0 += predux(c0_h);
+    }
+    if (HasQuarter) {
+      for (Index j = halfColBlockEnd; j < quarterColBlockEnd; j += LhsPacketSizeQuarter) {
+        RhsPacketQuarter b0 = rhs.template load<RhsPacketQuarter, Unaligned>(j, 0);
+        c0_q = pcj_quarter.pmadd(lhs.template load<LhsPacketQuarter, LhsAlignment>(i, j), b0, c0_q);
+      }
+      cc0 += predux(c0_q);
+    }
+    for (Index j = quarterColBlockEnd; j < cols; ++j) {
+      cc0 += cj.pmul(lhs(i, j), rhs(j, 0));
+    }
+    res[i * resIncr] += alpha * cc0;
+  }
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_GENERAL_MATRIX_VECTOR_H
+#endif  // EIGEN_GENERAL_MATRIX_VECTOR_H
diff --git a/inst/include/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h b/inst/include/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
new file mode 100644
index 00000000..4010a0a6
--- /dev/null
+++ b/inst/include/Eigen/src/Core/products/GeneralMatrixVector_BLAS.h
@@ -0,0 +1,139 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   General matrix-vector product functionality based on ?GEMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
+#define EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/**********************************************************************
+ * This file implements general matrix-vector multiplication using BLAS
+ * gemv function via partial specialization of
+ * general_matrix_vector_product::run(..) method for float, double,
+ * std::complex<float> and std::complex<double> types
+ **********************************************************************/
+
+// gemv specialization
+
+template <typename Index, typename LhsScalar, int StorageOrder, bool ConjugateLhs, typename RhsScalar,
+          bool ConjugateRhs>
+struct general_matrix_vector_product_gemv;
+
+#define EIGEN_BLAS_GEMV_SPECIALIZE(Scalar)                                                                       \
+  template <typename Index, bool ConjugateLhs, bool ConjugateRhs>                                                \
+  struct general_matrix_vector_product<Index, Scalar, const_blas_data_mapper<Scalar, Index, ColMajor>, ColMajor, \
+                                       ConjugateLhs, Scalar, const_blas_data_mapper<Scalar, Index, RowMajor>,    \
+                                       ConjugateRhs, Specialized> {                                              \
+    static void run(Index rows, Index cols, const const_blas_data_mapper<Scalar, Index, ColMajor>& lhs,          \
+                    const const_blas_data_mapper<Scalar, Index, RowMajor>& rhs, Scalar* res, Index resIncr,      \
+                    Scalar alpha) {                                                                              \
+      if (ConjugateLhs) {                                                                                        \
+        general_matrix_vector_product<Index, Scalar, const_blas_data_mapper<Scalar, Index, ColMajor>, ColMajor,  \
+                                      ConjugateLhs, Scalar, const_blas_data_mapper<Scalar, Index, RowMajor>,     \
+                                      ConjugateRhs, BuiltIn>::run(rows, cols, lhs, rhs, res, resIncr, alpha);    \
+      } else {                                                                                                   \
+        general_matrix_vector_product_gemv<Index, Scalar, ColMajor, ConjugateLhs, Scalar, ConjugateRhs>::run(    \
+            rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha);                \
+      }                                                                                                          \
+    }                                                                                                            \
+  };                                                                                                             \
+  template <typename Index, bool ConjugateLhs, bool ConjugateRhs>                                                \
+  struct general_matrix_vector_product<Index, Scalar, const_blas_data_mapper<Scalar, Index, RowMajor>, RowMajor, \
+                                       ConjugateLhs, Scalar, const_blas_data_mapper<Scalar, Index, ColMajor>,    \
+                                       ConjugateRhs, Specialized> {                                              \
+    static void run(Index rows, Index cols, const const_blas_data_mapper<Scalar, Index, RowMajor>& lhs,          \
+                    const const_blas_data_mapper<Scalar, Index, ColMajor>& rhs, Scalar* res, Index resIncr,      \
+                    Scalar alpha) {                                                                              \
+      general_matrix_vector_product_gemv<Index, Scalar, RowMajor, ConjugateLhs, Scalar, ConjugateRhs>::run(      \
+          rows, cols, lhs.data(), lhs.stride(), rhs.data(), rhs.stride(), res, resIncr, alpha);                  \
+    }                                                                                                            \
+  };
+
+EIGEN_BLAS_GEMV_SPECIALIZE(double)
+EIGEN_BLAS_GEMV_SPECIALIZE(float)
+EIGEN_BLAS_GEMV_SPECIALIZE(dcomplex)
+EIGEN_BLAS_GEMV_SPECIALIZE(scomplex)
+
+#define EIGEN_BLAS_GEMV_SPECIALIZATION(EIGTYPE, BLASTYPE, BLASFUNC)                                                 \
+  template <typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs>                              \
+  struct general_matrix_vector_product_gemv<Index, EIGTYPE, LhsStorageOrder, ConjugateLhs, EIGTYPE, ConjugateRhs> { \
+    typedef Matrix<EIGTYPE, Dynamic, 1, ColMajor> GEMVVector;                                                       \
+                                                                                                                    \
+    static void run(Index rows, Index cols, const EIGTYPE* lhs, Index lhsStride, const EIGTYPE* rhs, Index rhsIncr, \
+                    EIGTYPE* res, Index resIncr, EIGTYPE alpha) {                                                   \
+      if (rows == 0 || cols == 0) return;                                                                           \
+      BlasIndex m = convert_index<BlasIndex>(rows), n = convert_index<BlasIndex>(cols),                             \
+                lda = convert_index<BlasIndex>(lhsStride), incx = convert_index<BlasIndex>(rhsIncr),                \
+                incy = convert_index<BlasIndex>(resIncr);                                                           \
+      const EIGTYPE beta(1);                                                                                        \
+      const EIGTYPE* x_ptr;                                                                                         \
+      char trans = (LhsStorageOrder == ColMajor) ? 'N' : (ConjugateLhs) ? 'C' : 'T';                                \
+      if (LhsStorageOrder == RowMajor) {                                                                            \
+        m = convert_index<BlasIndex>(cols);                                                                         \
+        n = convert_index<BlasIndex>(rows);                                                                         \
+      }                                                                                                             \
+      GEMVVector x_tmp;                                                                                             \
+      if (ConjugateRhs) {                                                                                           \
+        Map<const GEMVVector, 0, InnerStride<> > map_x(rhs, cols, 1, InnerStride<>(incx));                          \
+        x_tmp = map_x.conjugate();                                                                                  \
+        x_ptr = x_tmp.data();                                                                                       \
+        incx = 1;                                                                                                   \
+      } else {                                                                                                      \
+        x_ptr = rhs;                                                                                                \
+      }                                                                                                             \
+      BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda,               \
+               (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy);     \
+    }                                                                                                               \
+  };
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv)
+EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8, cgemv)
+#else
+EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_)
+EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, cgemv_)
+#endif
+
+}  // namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_GENERAL_MATRIX_VECTOR_BLAS_H
diff --git a/inst/include/Eigen/src/Core/products/GeneralMatrixVector_MKL.h b/inst/include/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
deleted file mode 100644
index 1cb9fe6b..00000000
--- a/inst/include/Eigen/src/Core/products/GeneralMatrixVector_MKL.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- Copyright (c) 2011, Intel Corporation. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
- * Neither the name of Intel Corporation nor the names of its contributors may
-   be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
- *   General matrix-vector product functionality based on ?GEMV.
- ********************************************************************************
-*/
-
-#ifndef EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
-#define EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
-
-namespace Eigen { 
-
-namespace internal {
-
-/**********************************************************************
-* This file implements general matrix-vector multiplication using BLAS
-* gemv function via partial specialization of
-* general_matrix_vector_product::run(..) method for float, double,
-* std::complex<float> and std::complex<double> types
-**********************************************************************/
-
-// gemv specialization
-
-template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs>
-struct general_matrix_vector_product_gemv :
-  general_matrix_vector_product<Index,LhsScalar,LhsStorageOrder,ConjugateLhs,RhsScalar,ConjugateRhs,BuiltIn> {};
-
-#define EIGEN_MKL_GEMV_SPECIALIZE(Scalar) \
-template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
-struct general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
-static void run( \
-  Index rows, Index cols, \
-  const Scalar* lhs, Index lhsStride, \
-  const Scalar* rhs, Index rhsIncr, \
-  Scalar* res, Index resIncr, Scalar alpha) \
-{ \
-  if (ConjugateLhs) { \
-    general_matrix_vector_product<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs,BuiltIn>::run( \
-      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
-  } else { \
-    general_matrix_vector_product_gemv<Index,Scalar,ColMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
-      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
-  } \
-} \
-}; \
-template<typename Index, bool ConjugateLhs, bool ConjugateRhs> \
-struct general_matrix_vector_product<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs,Specialized> { \
-static void run( \
-  Index rows, Index cols, \
-  const Scalar* lhs, Index lhsStride, \
-  const Scalar* rhs, Index rhsIncr, \
-  Scalar* res, Index resIncr, Scalar alpha) \
-{ \
-    general_matrix_vector_product_gemv<Index,Scalar,RowMajor,ConjugateLhs,Scalar,ConjugateRhs>::run( \
-      rows, cols, lhs, lhsStride, rhs, rhsIncr, res, resIncr, alpha); \
-} \
-}; \
-
-EIGEN_MKL_GEMV_SPECIALIZE(double)
-EIGEN_MKL_GEMV_SPECIALIZE(float)
-EIGEN_MKL_GEMV_SPECIALIZE(dcomplex)
-EIGEN_MKL_GEMV_SPECIALIZE(scomplex)
-
-#define EIGEN_MKL_GEMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLPREFIX) \
-template<typename Index, int LhsStorageOrder, bool ConjugateLhs, bool ConjugateRhs> \
-struct general_matrix_vector_product_gemv<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,ConjugateRhs> \
-{ \
-typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> GEMVVector;\
-\
-static void run( \
-  Index rows, Index cols, \
-  const EIGTYPE* lhs, Index lhsStride, \
-  const EIGTYPE* rhs, Index rhsIncr, \
-  EIGTYPE* res, Index resIncr, EIGTYPE alpha) \
-{ \
-  MKL_INT m=rows, n=cols, lda=lhsStride, incx=rhsIncr, incy=resIncr; \
-  MKLTYPE alpha_, beta_; \
-  const EIGTYPE *x_ptr, myone(1); \
-  char trans=(LhsStorageOrder==ColMajor) ? 'N' : (ConjugateLhs) ? 'C' : 'T'; \
-  if (LhsStorageOrder==RowMajor) { \
-    m=cols; \
-    n=rows; \
-  }\
-  assign_scalar_eig2mkl(alpha_, alpha); \
-  assign_scalar_eig2mkl(beta_, myone); \
-  GEMVVector x_tmp; \
-  if (ConjugateRhs) { \
-    Map<const GEMVVector, 0, InnerStride<> > map_x(rhs,cols,1,InnerStride<>(incx)); \
-    x_tmp=map_x.conjugate(); \
-    x_ptr=x_tmp.data(); \
-    incx=1; \
-  } else x_ptr=rhs; \
-  MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \
-}\
-};
-
-EIGEN_MKL_GEMV_SPECIALIZATION(double,   double,        d)
-EIGEN_MKL_GEMV_SPECIALIZATION(float,    float,         s)
-EIGEN_MKL_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_GEMV_SPECIALIZATION(scomplex, MKL_Complex8,  c)
-
-} // end namespase internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_GENERAL_MATRIX_VECTOR_MKL_H
diff --git a/inst/include/Eigen/src/Core/products/Parallelizer.h b/inst/include/Eigen/src/Core/products/Parallelizer.h
index 6937ee33..b1b89ef9 100644
--- a/inst/include/Eigen/src/Core/products/Parallelizer.h
+++ b/inst/include/Eigen/src/Core/products/Parallelizer.h
@@ -10,153 +10,273 @@
 #ifndef EIGEN_PARALLELIZER_H
 #define EIGEN_PARALLELIZER_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
 
-namespace internal {
+// Note that in the following, there are 3 different uses of the concept
+// "number of threads":
+//  1. Max number of threads used by OpenMP or ThreadPool.
+//     * For OpenMP this is typically the value set by the OMP_NUM_THREADS
+//       environment variable, or by a call to omp_set_num_threads() prior to
+//       calling Eigen.
+//     * For ThreadPool, this is the number of threads in the ThreadPool.
+//  2. Max number of threads currently allowed to be used by parallel Eigen
+//     operations. This is set by setNbThreads(), and cannot exceed the value
+//     in 1.
+//  3. The actual number of threads used for a given parallel Eigen operation.
+//     This is typically computed on the fly using a cost model and cannot exceed
+//     the value in 2.
+//     * For OpenMP, this is typically the number of threads specified in individual
+//       "omp parallel" pragmas associated with an Eigen operation.
+//     * For ThreadPool, it is the number of concurrent tasks scheduled in the
+//       threadpool for a given Eigen operation. Notice that since the threadpool
+//       uses task stealing, there is no way to limit the number of concurrently
+//       executing tasks to below the number in 1. except by limiting the total
+//       number of tasks in flight.
 
-/** \internal */
-inline void manage_multi_threading(Action action, int* v)
-{
-  static EIGEN_UNUSED int m_maxThreads = -1;
+#if defined(EIGEN_HAS_OPENMP) && defined(EIGEN_GEMM_THREADPOOL)
+#error "EIGEN_HAS_OPENMP and EIGEN_GEMM_THREADPOOL may not both be defined."
+#endif
 
-  if(action==SetAction)
-  {
-    eigen_internal_assert(v!=0);
-    m_maxThreads = *v;
-  }
-  else if(action==GetAction)
-  {
-    eigen_internal_assert(v!=0);
-    #ifdef EIGEN_HAS_OPENMP
-    if(m_maxThreads>0)
-      *v = m_maxThreads;
-    else
-      *v = omp_get_max_threads();
-    #else
-    *v = 1;
-    #endif
-  }
-  else
-  {
-    eigen_internal_assert(false);
-  }
-}
+namespace Eigen {
 
+namespace internal {
+inline void manage_multi_threading(Action action, int* v);
 }
 
+// Public APIs.
+
 /** Must be call first when calling Eigen from multiple threads */
-inline void initParallel()
-{
-  int nbt;
-  internal::manage_multi_threading(GetAction, &nbt);
-  std::ptrdiff_t l1, l2;
-  internal::manage_caching_sizes(GetAction, &l1, &l2);
-}
+EIGEN_DEPRECATED_WITH_REASON("Initialization is no longer needed.") inline void initParallel() {}
 
 /** \returns the max number of threads reserved for Eigen
-  * \sa setNbThreads */
-inline int nbThreads()
-{
+ * \sa setNbThreads */
+inline int nbThreads() {
   int ret;
   internal::manage_multi_threading(GetAction, &ret);
   return ret;
 }
 
 /** Sets the max number of threads reserved for Eigen
-  * \sa nbThreads */
-inline void setNbThreads(int v)
-{
-  internal::manage_multi_threading(SetAction, &v);
+ * \sa nbThreads */
+inline void setNbThreads(int v) { internal::manage_multi_threading(SetAction, &v); }
+
+#ifdef EIGEN_GEMM_THREADPOOL
+// Sets the ThreadPool used by Eigen parallel Gemm.
+//
+// NOTICE: This function has a known race condition with
+// parallelize_gemm below, and should not be called while
+// an instance of that function is running.
+//
+// TODO(rmlarsen): Make the device API available instead of
+// storing a local static pointer variable to avoid this issue.
+inline ThreadPool* setGemmThreadPool(ThreadPool* new_pool) {
+  static ThreadPool* pool = nullptr;
+  if (new_pool != nullptr) {
+    // This will wait for work in all threads in *pool to finish,
+    // then destroy the old ThreadPool, and then replace it with new_pool.
+    pool = new_pool;
+    // Reset the number of threads to the number of threads on the new pool.
+    setNbThreads(pool->NumThreads());
+  }
+  return pool;
 }
 
+// Gets the ThreadPool used by Eigen parallel Gemm.
+inline ThreadPool* getGemmThreadPool() { return setGemmThreadPool(nullptr); }
+#endif
+
 namespace internal {
 
-template<typename Index> struct GemmParallelInfo
-{
-  GemmParallelInfo() : sync(-1), users(0), rhs_start(0), rhs_length(0) {}
+// Implementation.
 
-  int volatile sync;
-  int volatile users;
+#if defined(EIGEN_USE_BLAS) || (!defined(EIGEN_HAS_OPENMP) && !defined(EIGEN_GEMM_THREADPOOL))
 
-  Index rhs_start;
-  Index rhs_length;
+inline void manage_multi_threading(Action action, int* v) {
+  if (action == SetAction) {
+    eigen_internal_assert(v != nullptr);
+  } else if (action == GetAction) {
+    eigen_internal_assert(v != nullptr);
+    *v = 1;
+  } else {
+    eigen_internal_assert(false);
+  }
+}
+template <typename Index>
+struct GemmParallelInfo {};
+template <bool Condition, typename Functor, typename Index>
+EIGEN_STRONG_INLINE void parallelize_gemm(const Functor& func, Index rows, Index cols, Index /*unused*/,
+                                          bool /*unused*/) {
+  func(0, rows, 0, cols);
+}
+
+#else
+
+template <typename Index>
+struct GemmParallelTaskInfo {
+  GemmParallelTaskInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {}
+  std::atomic<Index> sync;
+  std::atomic<int> users;
+  Index lhs_start;
+  Index lhs_length;
 };
 
-template<bool Condition, typename Functor, typename Index>
-void parallelize_gemm(const Functor& func, Index rows, Index cols, bool transpose)
-{
-  // TODO when EIGEN_USE_BLAS is defined,
-  // we should still enable OMP for other scalar types
-#if !(defined (EIGEN_HAS_OPENMP)) || defined (EIGEN_USE_BLAS)
-  // FIXME the transpose variable is only needed to properly split
-  // the matrix product when multithreading is enabled. This is a temporary
-  // fix to support row-major destination matrices. This whole
-  // parallelizer mechanism has to be redisigned anyway.
-  EIGEN_UNUSED_VARIABLE(transpose);
-  func(0,rows, 0,cols);
+template <typename Index>
+struct GemmParallelInfo {
+  const int logical_thread_id;
+  const int num_threads;
+  GemmParallelTaskInfo<Index>* task_info;
+
+  GemmParallelInfo(int logical_thread_id_, int num_threads_, GemmParallelTaskInfo<Index>* task_info_)
+      : logical_thread_id(logical_thread_id_), num_threads(num_threads_), task_info(task_info_) {}
+};
+
+inline void manage_multi_threading(Action action, int* v) {
+  static int m_maxThreads = -1;
+  if (action == SetAction) {
+    eigen_internal_assert(v != nullptr);
+#if defined(EIGEN_HAS_OPENMP)
+    // Calling action == SetAction and *v = 0 means
+    // restoring m_maxThreads to the maximum number of threads specified
+    // for OpenMP.
+    eigen_internal_assert(*v >= 0);
+    int omp_threads = omp_get_max_threads();
+    m_maxThreads = (*v == 0 ? omp_threads : std::min(*v, omp_threads));
+#elif defined(EIGEN_GEMM_THREADPOOL)
+    // Calling action == SetAction and *v = 0 means
+    // restoring m_maxThreads to the number of threads in the ThreadPool,
+    // which defaults to 1 if no pool was provided.
+    eigen_internal_assert(*v >= 0);
+    ThreadPool* pool = getGemmThreadPool();
+    int pool_threads = pool != nullptr ? pool->NumThreads() : 1;
+    m_maxThreads = (*v == 0 ? pool_threads : numext::mini(pool_threads, *v));
+#endif
+  } else if (action == GetAction) {
+    eigen_internal_assert(v != nullptr);
+#if defined(EIGEN_HAS_OPENMP)
+    if (m_maxThreads > 0)
+      *v = m_maxThreads;
+    else
+      *v = omp_get_max_threads();
 #else
+    *v = m_maxThreads;
+#endif
+  } else {
+    eigen_internal_assert(false);
+  }
+}
 
-  // Dynamically check whether we should enable or disable OpenMP.
+template <bool Condition, typename Functor, typename Index>
+EIGEN_STRONG_INLINE void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, bool transpose) {
+  // Dynamically check whether we should even try to execute in parallel.
   // The conditions are:
   // - the max number of threads we can create is greater than 1
   // - we are not already in a parallel code
   // - the sizes are large enough
 
-  // 1- are we already in a parallel session?
-  // FIXME omp_get_num_threads()>1 only works for openmp, what if the user does not use openmp?
-  if((!Condition) || (omp_get_num_threads()>1))
-    return func(0,rows, 0,cols);
+  // compute the maximal number of threads from the size of the product:
+  // This first heuristic takes into account that the product kernel is fully optimized when working with nr columns at
+  // once.
+  Index size = transpose ? rows : cols;
+  Index pb_max_threads = std::max<Index>(1, size / Functor::Traits::nr);
 
-  Index size = transpose ? cols : rows;
+  // compute the maximal number of threads from the total amount of work:
+  double work = static_cast<double>(rows) * static_cast<double>(cols) * static_cast<double>(depth);
+  double kMinTaskSize = 50000;  // FIXME improve this heuristic.
+  pb_max_threads = std::max<Index>(1, std::min<Index>(pb_max_threads, static_cast<Index>(work / kMinTaskSize)));
 
-  // 2- compute the maximal number of threads from the size of the product:
-  // FIXME this has to be fine tuned
-  Index max_threads = std::max<Index>(1,size / 32);
+  // compute the number of threads we are going to use
+  int threads = std::min<int>(nbThreads(), static_cast<int>(pb_max_threads));
 
-  // 3 - compute the number of threads we are going to use
-  Index threads = std::min<Index>(nbThreads(), max_threads);
-
-  if(threads==1)
-    return func(0,rows, 0,cols);
+  // if multi-threading is explicitly disabled, not useful, or if we already are
+  // inside a parallel session, then abort multi-threading
+  bool dont_parallelize = (!Condition) || (threads <= 1);
+#if defined(EIGEN_HAS_OPENMP)
+  // don't parallelize if we are executing in a parallel context already.
+  dont_parallelize |= omp_get_num_threads() > 1;
+#elif defined(EIGEN_GEMM_THREADPOOL)
+  // don't parallelize if we have a trivial threadpool or the current thread id
+  // is != -1, indicating that we are already executing on a thread inside the pool.
+  // In other words, we do not allow nested parallelism, since this would lead to
+  // deadlocks due to the workstealing nature of the threadpool.
+  ThreadPool* pool = getGemmThreadPool();
+  dont_parallelize |= (pool == nullptr || pool->CurrentThreadId() != -1);
+#endif
+  if (dont_parallelize) return func(0, rows, 0, cols);
 
-  Eigen::initParallel();
-  func.initParallelSession();
+  func.initParallelSession(threads);
 
-  if(transpose)
-    std::swap(rows,cols);
+  if (transpose) std::swap(rows, cols);
 
-  GemmParallelInfo<Index>* info = new GemmParallelInfo<Index>[threads];
+  ei_declare_aligned_stack_constructed_variable(GemmParallelTaskInfo<Index>, task_info, threads, 0);
 
-  #pragma omp parallel num_threads(threads)
+#if defined(EIGEN_HAS_OPENMP)
+#pragma omp parallel num_threads(threads)
   {
     Index i = omp_get_thread_num();
-    // Note that the actual number of threads might be lower than the number of request ones.
+    // Note that the actual number of threads might be lower than the number of
+    // requested ones
     Index actual_threads = omp_get_num_threads();
-    
+    GemmParallelInfo<Index> info(static_cast<int>(i), static_cast<int>(actual_threads), task_info);
+
     Index blockCols = (cols / actual_threads) & ~Index(0x3);
-    Index blockRows = (rows / actual_threads) & ~Index(0x7);
-    
-    Index r0 = i*blockRows;
-    Index actualBlockRows = (i+1==actual_threads) ? rows-r0 : blockRows;
+    Index blockRows = (rows / actual_threads);
+    blockRows = (blockRows / Functor::Traits::mr) * Functor::Traits::mr;
+
+    Index r0 = i * blockRows;
+    Index actualBlockRows = (i + 1 == actual_threads) ? rows - r0 : blockRows;
 
-    Index c0 = i*blockCols;
-    Index actualBlockCols = (i+1==actual_threads) ? cols-c0 : blockCols;
+    Index c0 = i * blockCols;
+    Index actualBlockCols = (i + 1 == actual_threads) ? cols - c0 : blockCols;
 
-    info[i].rhs_start = c0;
-    info[i].rhs_length = actualBlockCols;
+    info.task_info[i].lhs_start = r0;
+    info.task_info[i].lhs_length = actualBlockRows;
 
-    if(transpose)
-      func(0, cols, r0, actualBlockRows, info);
+    if (transpose)
+      func(c0, actualBlockCols, 0, rows, &info);
     else
-      func(r0, actualBlockRows, 0,cols, info);
+      func(0, rows, c0, actualBlockCols, &info);
   }
 
-  delete[] info;
+#elif defined(EIGEN_GEMM_THREADPOOL)
+  Barrier barrier(threads);
+  auto task = [=, &func, &barrier, &task_info](int i) {
+    Index actual_threads = threads;
+    GemmParallelInfo<Index> info(i, static_cast<int>(actual_threads), task_info);
+    Index blockCols = (cols / actual_threads) & ~Index(0x3);
+    Index blockRows = (rows / actual_threads);
+    blockRows = (blockRows / Functor::Traits::mr) * Functor::Traits::mr;
+
+    Index r0 = i * blockRows;
+    Index actualBlockRows = (i + 1 == actual_threads) ? rows - r0 : blockRows;
+
+    Index c0 = i * blockCols;
+    Index actualBlockCols = (i + 1 == actual_threads) ? cols - c0 : blockCols;
+
+    info.task_info[i].lhs_start = r0;
+    info.task_info[i].lhs_length = actualBlockRows;
+
+    if (transpose)
+      func(c0, actualBlockCols, 0, rows, &info);
+    else
+      func(0, rows, c0, actualBlockCols, &info);
+
+    barrier.Notify();
+  };
+  // Notice that we do not schedule more than "threads" tasks, which allows us to
+  // limit number of running threads, even if the threadpool itself was constructed
+  // with a larger number of threads.
+  for (int i = 0; i < threads - 1; ++i) {
+    pool->Schedule([=, task = std::move(task)] { task(i); });
+  }
+  task(threads - 1);
+  barrier.Wait();
 #endif
 }
 
-} // end namespace internal
+#endif
 
-} // end namespace Eigen
+}  // end namespace internal
+}  // end namespace Eigen
 
-#endif // EIGEN_PARALLELIZER_H
+#endif  // EIGEN_PARALLELIZER_H
diff --git a/inst/include/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/inst/include/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
index 99cf9e0a..899283dc 100644
--- a/inst/include/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
+++ b/inst/include/Eigen/src/Core/products/SelfadjointMatrixMatrix.h
@@ -10,181 +10,246 @@
 #ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_H
 #define EIGEN_SELFADJOINT_MATRIX_MATRIX_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 // pack a selfadjoint block diagonal for use with the gebp_kernel
-template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder>
-struct symm_pack_lhs
-{
-  template<int BlockRows> inline
-  void pack(Scalar* blockA, const const_blas_data_mapper<Scalar,Index,StorageOrder>& lhs, Index cols, Index i, Index& count)
-  {
+template <typename Scalar, typename Index, int Pack1, int Pack2_dummy, int StorageOrder>
+struct symm_pack_lhs {
+  template <int BlockRows>
+  inline void pack(Scalar* blockA, const const_blas_data_mapper<Scalar, Index, StorageOrder>& lhs, Index cols, Index i,
+                   Index& count) {
     // normal copy
-    for(Index k=0; k<i; k++)
-      for(Index w=0; w<BlockRows; w++)
-        blockA[count++] = lhs(i+w,k);           // normal
+    for (Index k = 0; k < i; k++)
+      for (Index w = 0; w < BlockRows; w++) blockA[count++] = lhs(i + w, k);  // normal
     // symmetric copy
     Index h = 0;
-    for(Index k=i; k<i+BlockRows; k++)
-    {
-      for(Index w=0; w<h; w++)
-        blockA[count++] = numext::conj(lhs(k, i+w)); // transposed
+    for (Index k = i; k < i + BlockRows; k++) {
+      for (Index w = 0; w < h; w++) blockA[count++] = numext::conj(lhs(k, i + w));  // transposed
 
-      blockA[count++] = numext::real(lhs(k,k));   // real (diagonal)
+      blockA[count++] = numext::real(lhs(k, k));  // real (diagonal)
 
-      for(Index w=h+1; w<BlockRows; w++)
-        blockA[count++] = lhs(i+w, k);          // normal
+      for (Index w = h + 1; w < BlockRows; w++) blockA[count++] = lhs(i + w, k);  // normal
       ++h;
     }
     // transposed copy
-    for(Index k=i+BlockRows; k<cols; k++)
-      for(Index w=0; w<BlockRows; w++)
-        blockA[count++] = numext::conj(lhs(k, i+w)); // transposed
+    for (Index k = i + BlockRows; k < cols; k++)
+      for (Index w = 0; w < BlockRows; w++) blockA[count++] = numext::conj(lhs(k, i + w));  // transposed
   }
-  void operator()(Scalar* blockA, const Scalar* _lhs, Index lhsStride, Index cols, Index rows)
-  {
-    const_blas_data_mapper<Scalar,Index,StorageOrder> lhs(_lhs,lhsStride);
+  void operator()(Scalar* blockA, const Scalar* lhs_, Index lhsStride, Index cols, Index rows) {
+    typedef typename unpacket_traits<typename packet_traits<Scalar>::type>::half HalfPacket;
+    typedef typename unpacket_traits<typename unpacket_traits<typename packet_traits<Scalar>::type>::half>::half
+        QuarterPacket;
+    enum {
+      PacketSize = packet_traits<Scalar>::size,
+      HalfPacketSize = unpacket_traits<HalfPacket>::size,
+      QuarterPacketSize = unpacket_traits<QuarterPacket>::size,
+      HasHalf = (int)HalfPacketSize < (int)PacketSize,
+      HasQuarter = (int)QuarterPacketSize < (int)HalfPacketSize
+    };
+
+    const_blas_data_mapper<Scalar, Index, StorageOrder> lhs(lhs_, lhsStride);
     Index count = 0;
-    Index peeled_mc = (rows/Pack1)*Pack1;
-    for(Index i=0; i<peeled_mc; i+=Pack1)
-    {
-      pack<Pack1>(blockA, lhs, cols, i, count);
-    }
+    // Index peeled_mc3 = (rows/Pack1)*Pack1;
 
-    if(rows-peeled_mc>=Pack2)
-    {
-      pack<Pack2>(blockA, lhs, cols, peeled_mc, count);
-      peeled_mc += Pack2;
-    }
+    const Index peeled_mc3 = Pack1 >= 3 * PacketSize ? (rows / (3 * PacketSize)) * (3 * PacketSize) : 0;
+    const Index peeled_mc2 =
+        Pack1 >= 2 * PacketSize ? peeled_mc3 + ((rows - peeled_mc3) / (2 * PacketSize)) * (2 * PacketSize) : 0;
+    const Index peeled_mc1 =
+        Pack1 >= 1 * PacketSize ? peeled_mc2 + ((rows - peeled_mc2) / (1 * PacketSize)) * (1 * PacketSize) : 0;
+    const Index peeled_mc_half =
+        Pack1 >= HalfPacketSize ? peeled_mc1 + ((rows - peeled_mc1) / (HalfPacketSize)) * (HalfPacketSize) : 0;
+    const Index peeled_mc_quarter =
+        Pack1 >= QuarterPacketSize
+            ? peeled_mc_half + ((rows - peeled_mc_half) / (QuarterPacketSize)) * (QuarterPacketSize)
+            : 0;
+
+    if (Pack1 >= 3 * PacketSize)
+      for (Index i = 0; i < peeled_mc3; i += 3 * PacketSize) pack<3 * PacketSize>(blockA, lhs, cols, i, count);
+
+    if (Pack1 >= 2 * PacketSize)
+      for (Index i = peeled_mc3; i < peeled_mc2; i += 2 * PacketSize) pack<2 * PacketSize>(blockA, lhs, cols, i, count);
+
+    if (Pack1 >= 1 * PacketSize)
+      for (Index i = peeled_mc2; i < peeled_mc1; i += 1 * PacketSize) pack<1 * PacketSize>(blockA, lhs, cols, i, count);
+
+    if (HasHalf && Pack1 >= HalfPacketSize)
+      for (Index i = peeled_mc1; i < peeled_mc_half; i += HalfPacketSize)
+        pack<HalfPacketSize>(blockA, lhs, cols, i, count);
+
+    if (HasQuarter && Pack1 >= QuarterPacketSize)
+      for (Index i = peeled_mc_half; i < peeled_mc_quarter; i += QuarterPacketSize)
+        pack<QuarterPacketSize>(blockA, lhs, cols, i, count);
 
     // do the same with mr==1
-    for(Index i=peeled_mc; i<rows; i++)
-    {
-      for(Index k=0; k<i; k++)
-        blockA[count++] = lhs(i, k);              // normal
+    for (Index i = peeled_mc_quarter; i < rows; i++) {
+      for (Index k = 0; k < i; k++) blockA[count++] = lhs(i, k);  // normal
 
-      blockA[count++] = numext::real(lhs(i, i));       // real (diagonal)
+      blockA[count++] = numext::real(lhs(i, i));  // real (diagonal)
 
-      for(Index k=i+1; k<cols; k++)
-        blockA[count++] = numext::conj(lhs(k, i));     // transposed
+      for (Index k = i + 1; k < cols; k++) blockA[count++] = numext::conj(lhs(k, i));  // transposed
     }
   }
 };
 
-template<typename Scalar, typename Index, int nr, int StorageOrder>
-struct symm_pack_rhs
-{
+template <typename Scalar, typename Index, int nr, int StorageOrder>
+struct symm_pack_rhs {
   enum { PacketSize = packet_traits<Scalar>::size };
-  void operator()(Scalar* blockB, const Scalar* _rhs, Index rhsStride, Index rows, Index cols, Index k2)
-  {
+  void operator()(Scalar* blockB, const Scalar* rhs_, Index rhsStride, Index rows, Index cols, Index k2) {
     Index end_k = k2 + rows;
     Index count = 0;
-    const_blas_data_mapper<Scalar,Index,StorageOrder> rhs(_rhs,rhsStride);
-    Index packet_cols = (cols/nr)*nr;
+    const_blas_data_mapper<Scalar, Index, StorageOrder> rhs(rhs_, rhsStride);
+    Index packet_cols8 = nr >= 8 ? (cols / 8) * 8 : 0;
+    Index packet_cols4 = nr >= 4 ? (cols / 4) * 4 : 0;
 
     // first part: normal case
-    for(Index j2=0; j2<k2; j2+=nr)
-    {
-      for(Index k=k2; k<end_k; k++)
-      {
-        blockB[count+0] = rhs(k,j2+0);
-        blockB[count+1] = rhs(k,j2+1);
-        if (nr==4)
-        {
-          blockB[count+2] = rhs(k,j2+2);
-          blockB[count+3] = rhs(k,j2+3);
+    for (Index j2 = 0; j2 < k2; j2 += nr) {
+      for (Index k = k2; k < end_k; k++) {
+        blockB[count + 0] = rhs(k, j2 + 0);
+        blockB[count + 1] = rhs(k, j2 + 1);
+        if (nr >= 4) {
+          blockB[count + 2] = rhs(k, j2 + 2);
+          blockB[count + 3] = rhs(k, j2 + 3);
+        }
+        if (nr >= 8) {
+          blockB[count + 4] = rhs(k, j2 + 4);
+          blockB[count + 5] = rhs(k, j2 + 5);
+          blockB[count + 6] = rhs(k, j2 + 6);
+          blockB[count + 7] = rhs(k, j2 + 7);
         }
         count += nr;
       }
     }
 
     // second part: diagonal block
-    for(Index j2=k2; j2<(std::min)(k2+rows,packet_cols); j2+=nr)
-    {
-      // again we can split vertically in three different parts (transpose, symmetric, normal)
-      // transpose
-      for(Index k=k2; k<j2; k++)
-      {
-        blockB[count+0] = numext::conj(rhs(j2+0,k));
-        blockB[count+1] = numext::conj(rhs(j2+1,k));
-        if (nr==4)
-        {
-          blockB[count+2] = numext::conj(rhs(j2+2,k));
-          blockB[count+3] = numext::conj(rhs(j2+3,k));
+    Index end8 = nr >= 8 ? (std::min)(k2 + rows, packet_cols8) : k2;
+    if (nr >= 8) {
+      for (Index j2 = k2; j2 < end8; j2 += 8) {
+        // again we can split vertically in three different parts (transpose, symmetric, normal)
+        // transpose
+        for (Index k = k2; k < j2; k++) {
+          blockB[count + 0] = numext::conj(rhs(j2 + 0, k));
+          blockB[count + 1] = numext::conj(rhs(j2 + 1, k));
+          blockB[count + 2] = numext::conj(rhs(j2 + 2, k));
+          blockB[count + 3] = numext::conj(rhs(j2 + 3, k));
+          blockB[count + 4] = numext::conj(rhs(j2 + 4, k));
+          blockB[count + 5] = numext::conj(rhs(j2 + 5, k));
+          blockB[count + 6] = numext::conj(rhs(j2 + 6, k));
+          blockB[count + 7] = numext::conj(rhs(j2 + 7, k));
+          count += 8;
+        }
+        // symmetric
+        Index h = 0;
+        for (Index k = j2; k < j2 + 8; k++) {
+          // normal
+          for (Index w = 0; w < h; ++w) blockB[count + w] = rhs(k, j2 + w);
+
+          blockB[count + h] = numext::real(rhs(k, k));
+
+          // transpose
+          for (Index w = h + 1; w < 8; ++w) blockB[count + w] = numext::conj(rhs(j2 + w, k));
+          count += 8;
+          ++h;
         }
-        count += nr;
-      }
-      // symmetric
-      Index h = 0;
-      for(Index k=j2; k<j2+nr; k++)
-      {
         // normal
-        for (Index w=0 ; w<h; ++w)
-          blockB[count+w] = rhs(k,j2+w);
-
-        blockB[count+h] = numext::real(rhs(k,k));
-
-        // transpose
-        for (Index w=h+1 ; w<nr; ++w)
-          blockB[count+w] = numext::conj(rhs(j2+w,k));
-        count += nr;
-        ++h;
+        for (Index k = j2 + 8; k < end_k; k++) {
+          blockB[count + 0] = rhs(k, j2 + 0);
+          blockB[count + 1] = rhs(k, j2 + 1);
+          blockB[count + 2] = rhs(k, j2 + 2);
+          blockB[count + 3] = rhs(k, j2 + 3);
+          blockB[count + 4] = rhs(k, j2 + 4);
+          blockB[count + 5] = rhs(k, j2 + 5);
+          blockB[count + 6] = rhs(k, j2 + 6);
+          blockB[count + 7] = rhs(k, j2 + 7);
+          count += 8;
+        }
       }
-      // normal
-      for(Index k=j2+nr; k<end_k; k++)
-      {
-        blockB[count+0] = rhs(k,j2+0);
-        blockB[count+1] = rhs(k,j2+1);
-        if (nr==4)
-        {
-          blockB[count+2] = rhs(k,j2+2);
-          blockB[count+3] = rhs(k,j2+3);
+    }
+    if (nr >= 4) {
+      for (Index j2 = end8; j2 < (std::min)(k2 + rows, packet_cols4); j2 += 4) {
+        // again we can split vertically in three different parts (transpose, symmetric, normal)
+        // transpose
+        for (Index k = k2; k < j2; k++) {
+          blockB[count + 0] = numext::conj(rhs(j2 + 0, k));
+          blockB[count + 1] = numext::conj(rhs(j2 + 1, k));
+          blockB[count + 2] = numext::conj(rhs(j2 + 2, k));
+          blockB[count + 3] = numext::conj(rhs(j2 + 3, k));
+          count += 4;
+        }
+        // symmetric
+        Index h = 0;
+        for (Index k = j2; k < j2 + 4; k++) {
+          // normal
+          for (Index w = 0; w < h; ++w) blockB[count + w] = rhs(k, j2 + w);
+
+          blockB[count + h] = numext::real(rhs(k, k));
+
+          // transpose
+          for (Index w = h + 1; w < 4; ++w) blockB[count + w] = numext::conj(rhs(j2 + w, k));
+          count += 4;
+          ++h;
+        }
+        // normal
+        for (Index k = j2 + 4; k < end_k; k++) {
+          blockB[count + 0] = rhs(k, j2 + 0);
+          blockB[count + 1] = rhs(k, j2 + 1);
+          blockB[count + 2] = rhs(k, j2 + 2);
+          blockB[count + 3] = rhs(k, j2 + 3);
+          count += 4;
         }
-        count += nr;
       }
     }
 
     // third part: transposed
-    for(Index j2=k2+rows; j2<packet_cols; j2+=nr)
-    {
-      for(Index k=k2; k<end_k; k++)
-      {
-        blockB[count+0] = numext::conj(rhs(j2+0,k));
-        blockB[count+1] = numext::conj(rhs(j2+1,k));
-        if (nr==4)
-        {
-          blockB[count+2] = numext::conj(rhs(j2+2,k));
-          blockB[count+3] = numext::conj(rhs(j2+3,k));
+    if (nr >= 8) {
+      for (Index j2 = k2 + rows; j2 < packet_cols8; j2 += 8) {
+        for (Index k = k2; k < end_k; k++) {
+          blockB[count + 0] = numext::conj(rhs(j2 + 0, k));
+          blockB[count + 1] = numext::conj(rhs(j2 + 1, k));
+          blockB[count + 2] = numext::conj(rhs(j2 + 2, k));
+          blockB[count + 3] = numext::conj(rhs(j2 + 3, k));
+          blockB[count + 4] = numext::conj(rhs(j2 + 4, k));
+          blockB[count + 5] = numext::conj(rhs(j2 + 5, k));
+          blockB[count + 6] = numext::conj(rhs(j2 + 6, k));
+          blockB[count + 7] = numext::conj(rhs(j2 + 7, k));
+          count += 8;
+        }
+      }
+    }
+    if (nr >= 4) {
+      for (Index j2 = (std::max)(packet_cols8, k2 + rows); j2 < packet_cols4; j2 += 4) {
+        for (Index k = k2; k < end_k; k++) {
+          blockB[count + 0] = numext::conj(rhs(j2 + 0, k));
+          blockB[count + 1] = numext::conj(rhs(j2 + 1, k));
+          blockB[count + 2] = numext::conj(rhs(j2 + 2, k));
+          blockB[count + 3] = numext::conj(rhs(j2 + 3, k));
+          count += 4;
         }
-        count += nr;
       }
     }
 
     // copy the remaining columns one at a time (=> the same with nr==1)
-    for(Index j2=packet_cols; j2<cols; ++j2)
-    {
+    for (Index j2 = packet_cols4; j2 < cols; ++j2) {
       // transpose
-      Index half = (std::min)(end_k,j2);
-      for(Index k=k2; k<half; k++)
-      {
-        blockB[count] = numext::conj(rhs(j2,k));
+      Index half = (std::min)(end_k, j2);
+      for (Index k = k2; k < half; k++) {
+        blockB[count] = numext::conj(rhs(j2, k));
         count += 1;
       }
 
-      if(half==j2 && half<k2+rows)
-      {
-        blockB[count] = numext::real(rhs(j2,j2));
+      if (half == j2 && half < k2 + rows) {
+        blockB[count] = numext::real(rhs(j2, j2));
         count += 1;
-      }
-      else
+      } else
         half--;
 
       // normal
-      for(Index k=half+1; k<k2+rows; k++)
-      {
-        blockB[count] = rhs(k,j2);
+      for (Index k = half + 1; k < k2 + rows; k++) {
+        blockB[count] = rhs(k, j2);
         count += 1;
       }
     }
@@ -194,243 +259,225 @@ struct symm_pack_rhs
 /* Optimized selfadjoint matrix * matrix (_SYMM) product built on top of
  * the general matrix matrix product.
  */
-template <typename Scalar, typename Index,
-          int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
-          int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs,
-          int ResStorageOrder>
+template <typename Scalar, typename Index, int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
+          int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs, int ResStorageOrder, int ResInnerStride>
 struct product_selfadjoint_matrix;
 
-template <typename Scalar, typename Index,
-          int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
-          int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs>
-struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,LhsSelfAdjoint,ConjugateLhs, RhsStorageOrder,RhsSelfAdjoint,ConjugateRhs,RowMajor>
-{
-
-  static EIGEN_STRONG_INLINE void run(
-    Index rows, Index cols,
-    const Scalar* lhs, Index lhsStride,
-    const Scalar* rhs, Index rhsStride,
-    Scalar* res,       Index resStride,
-    const Scalar& alpha)
-  {
-    product_selfadjoint_matrix<Scalar, Index,
-      EIGEN_LOGICAL_XOR(RhsSelfAdjoint,RhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
-      RhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs),
-      EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? ColMajor : RowMajor,
-      LhsSelfAdjoint, NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs),
-      ColMajor>
-      ::run(cols, rows,  rhs, rhsStride,  lhs, lhsStride,  res, resStride,  alpha);
+template <typename Scalar, typename Index, int LhsStorageOrder, bool LhsSelfAdjoint, bool ConjugateLhs,
+          int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs, int ResInnerStride>
+struct product_selfadjoint_matrix<Scalar, Index, LhsStorageOrder, LhsSelfAdjoint, ConjugateLhs, RhsStorageOrder,
+                                  RhsSelfAdjoint, ConjugateRhs, RowMajor, ResInnerStride> {
+  static EIGEN_STRONG_INLINE void run(Index rows, Index cols, const Scalar* lhs, Index lhsStride, const Scalar* rhs,
+                                      Index rhsStride, Scalar* res, Index resIncr, Index resStride, const Scalar& alpha,
+                                      level3_blocking<Scalar, Scalar>& blocking) {
+    product_selfadjoint_matrix<
+        Scalar, Index, logical_xor(RhsSelfAdjoint, RhsStorageOrder == RowMajor) ? ColMajor : RowMajor, RhsSelfAdjoint,
+        NumTraits<Scalar>::IsComplex && logical_xor(RhsSelfAdjoint, ConjugateRhs),
+        logical_xor(LhsSelfAdjoint, LhsStorageOrder == RowMajor) ? ColMajor : RowMajor, LhsSelfAdjoint,
+        NumTraits<Scalar>::IsComplex && logical_xor(LhsSelfAdjoint, ConjugateLhs), ColMajor,
+        ResInnerStride>::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking);
   }
 };
 
-template <typename Scalar, typename Index,
-          int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs>
-struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>
-{
-
-  static EIGEN_DONT_INLINE void run(
-    Index rows, Index cols,
-    const Scalar* _lhs, Index lhsStride,
-    const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
-    const Scalar& alpha);
+template <typename Scalar, typename Index, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder,
+          bool ConjugateRhs, int ResInnerStride>
+struct product_selfadjoint_matrix<Scalar, Index, LhsStorageOrder, true, ConjugateLhs, RhsStorageOrder, false,
+                                  ConjugateRhs, ColMajor, ResInnerStride> {
+  static EIGEN_DONT_INLINE void run(Index rows, Index cols, const Scalar* lhs_, Index lhsStride, const Scalar* rhs_,
+                                    Index rhsStride, Scalar* res, Index resIncr, Index resStride, const Scalar& alpha,
+                                    level3_blocking<Scalar, Scalar>& blocking);
 };
 
-template <typename Scalar, typename Index,
-          int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs>
-EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,true,ConjugateLhs, RhsStorageOrder,false,ConjugateRhs,ColMajor>::run(
-    Index rows, Index cols,
-    const Scalar* _lhs, Index lhsStride,
-    const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
-    const Scalar& alpha)
-  {
-    Index size = rows;
-
-    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-    const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
-
-    typedef gebp_traits<Scalar,Scalar> Traits;
-
-    Index kc = size;  // cache block size along the K direction
-    Index mc = rows;  // cache block size along the M direction
-    Index nc = cols;  // cache block size along the N direction
-    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
-    // kc must smaller than mc
-    kc = (std::min)(kc,mc);
-
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*cols;
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
-    ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
-    Scalar* blockB = allocatedBlockB + sizeW;
-
-    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder==RowMajor?ColMajor:RowMajor, true> pack_lhs_transposed;
-
-    for(Index k2=0; k2<size; k2+=kc)
+template <typename Scalar, typename Index, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder,
+          bool ConjugateRhs, int ResInnerStride>
+EIGEN_DONT_INLINE void
+product_selfadjoint_matrix<Scalar, Index, LhsStorageOrder, true, ConjugateLhs, RhsStorageOrder, false, ConjugateRhs,
+                           ColMajor, ResInnerStride>::run(Index rows, Index cols, const Scalar* lhs_, Index lhsStride,
+                                                          const Scalar* rhs_, Index rhsStride, Scalar* res_,
+                                                          Index resIncr, Index resStride, const Scalar& alpha,
+                                                          level3_blocking<Scalar, Scalar>& blocking) {
+  Index size = rows;
+
+  typedef gebp_traits<Scalar, Scalar> Traits;
+
+  typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+  typedef const_blas_data_mapper<Scalar, Index, (LhsStorageOrder == RowMajor) ? ColMajor : RowMajor> LhsTransposeMapper;
+  typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+  LhsMapper lhs(lhs_, lhsStride);
+  LhsTransposeMapper lhs_transpose(lhs_, lhsStride);
+  RhsMapper rhs(rhs_, rhsStride);
+  ResMapper res(res_, resStride, resIncr);
+
+  Index kc = blocking.kc();                    // cache block size along the K direction
+  Index mc = (std::min)(rows, blocking.mc());  // cache block size along the M direction
+  // kc must be smaller than mc
+  kc = (std::min)(kc, mc);
+  std::size_t sizeA = kc * mc;
+  std::size_t sizeB = kc * cols;
+  ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+  ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+  gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+  symm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
+  gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+  gemm_pack_lhs<Scalar, Index, LhsTransposeMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing,
+                LhsStorageOrder == RowMajor ? ColMajor : RowMajor, true>
+      pack_lhs_transposed;
+
+  for (Index k2 = 0; k2 < size; k2 += kc) {
+    const Index actual_kc = (std::min)(k2 + kc, size) - k2;
+
+    // we have selected one row panel of rhs and one column panel of lhs
+    // pack rhs's panel into a sequential chunk of memory
+    // and expand each coeff to a constant packet for further reuse
+    pack_rhs(blockB, rhs.getSubMapper(k2, 0), actual_kc, cols);
+
+    // the select lhs's panel has to be split in three different parts:
+    //  1 - the transposed panel above the diagonal block => transposed packed copy
+    //  2 - the diagonal block => special packed copy
+    //  3 - the panel below the diagonal block => generic packed copy
+    for (Index i2 = 0; i2 < k2; i2 += mc) {
+      const Index actual_mc = (std::min)(i2 + mc, k2) - i2;
+      // transposed packed copy
+      pack_lhs_transposed(blockA, lhs_transpose.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+      gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
+    }
+    // the block diagonal
     {
-      const Index actual_kc = (std::min)(k2+kc,size)-k2;
-
-      // we have selected one row panel of rhs and one column panel of lhs
-      // pack rhs's panel into a sequential chunk of memory
-      // and expand each coeff to a constant packet for further reuse
-      pack_rhs(blockB, &rhs(k2,0), rhsStride, actual_kc, cols);
-
-      // the select lhs's panel has to be split in three different parts:
-      //  1 - the transposed panel above the diagonal block => transposed packed copy
-      //  2 - the diagonal block => special packed copy
-      //  3 - the panel below the diagonal block => generic packed copy
-      for(Index i2=0; i2<k2; i2+=mc)
-      {
-        const Index actual_mc = (std::min)(i2+mc,k2)-i2;
-        // transposed packed copy
-        pack_lhs_transposed(blockA, &lhs(k2, i2), lhsStride, actual_kc, actual_mc);
-
-        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
-      }
-      // the block diagonal
-      {
-        const Index actual_mc = (std::min)(k2+kc,size)-k2;
-        // symmetric packed copy
-        pack_lhs(blockA, &lhs(k2,k2), lhsStride, actual_kc, actual_mc);
+      const Index actual_mc = (std::min)(k2 + kc, size) - k2;
+      // symmetric packed copy
+      pack_lhs(blockA, &lhs(k2, k2), lhsStride, actual_kc, actual_mc);
 
-        gebp_kernel(res+k2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
-      }
+      gebp_kernel(res.getSubMapper(k2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
+    }
 
-      for(Index i2=k2+kc; i2<size; i2+=mc)
-      {
-        const Index actual_mc = (std::min)(i2+mc,size)-i2;
-        gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder,false>()
-          (blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
+    for (Index i2 = k2 + kc; i2 < size; i2 += mc) {
+      const Index actual_mc = (std::min)(i2 + mc, size) - i2;
+      gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing,
+                    LhsStorageOrder, false>()(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
 
-        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
-      }
+      gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
     }
   }
+}
 
 // matrix * selfadjoint product
-template <typename Scalar, typename Index,
-          int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs>
-struct product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>
-{
-
-  static EIGEN_DONT_INLINE void run(
-    Index rows, Index cols,
-    const Scalar* _lhs, Index lhsStride,
-    const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
-    const Scalar& alpha);
+template <typename Scalar, typename Index, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder,
+          bool ConjugateRhs, int ResInnerStride>
+struct product_selfadjoint_matrix<Scalar, Index, LhsStorageOrder, false, ConjugateLhs, RhsStorageOrder, true,
+                                  ConjugateRhs, ColMajor, ResInnerStride> {
+  static EIGEN_DONT_INLINE void run(Index rows, Index cols, const Scalar* lhs_, Index lhsStride, const Scalar* rhs_,
+                                    Index rhsStride, Scalar* res, Index resIncr, Index resStride, const Scalar& alpha,
+                                    level3_blocking<Scalar, Scalar>& blocking);
 };
 
-template <typename Scalar, typename Index,
-          int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs>
-EIGEN_DONT_INLINE void product_selfadjoint_matrix<Scalar,Index,LhsStorageOrder,false,ConjugateLhs, RhsStorageOrder,true,ConjugateRhs,ColMajor>::run(
-    Index rows, Index cols,
-    const Scalar* _lhs, Index lhsStride,
-    const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
-    const Scalar& alpha)
-  {
-    Index size = cols;
-
-    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-
-    typedef gebp_traits<Scalar,Scalar> Traits;
-
-    Index kc = size; // cache block size along the K direction
-    Index mc = rows;  // cache block size along the M direction
-    Index nc = cols;  // cache block size along the N direction
-    computeProductBlockingSizes<Scalar,Scalar>(kc, mc, nc);
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-    std::size_t sizeB = sizeW + kc*cols;
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, kc*mc, 0);
-    ei_declare_aligned_stack_constructed_variable(Scalar, allocatedBlockB, sizeB, 0);
-    Scalar* blockB = allocatedBlockB + sizeW;
-
-    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
-    symm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
-
-    for(Index k2=0; k2<size; k2+=kc)
-    {
-      const Index actual_kc = (std::min)(k2+kc,size)-k2;
-
-      pack_rhs(blockB, _rhs, rhsStride, actual_kc, cols, k2);
-
-      // => GEPP
-      for(Index i2=0; i2<rows; i2+=mc)
-      {
-        const Index actual_mc = (std::min)(i2+mc,rows)-i2;
-        pack_lhs(blockA, &lhs(i2, k2), lhsStride, actual_kc, actual_mc);
-
-        gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha);
-      }
+template <typename Scalar, typename Index, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder,
+          bool ConjugateRhs, int ResInnerStride>
+EIGEN_DONT_INLINE void
+product_selfadjoint_matrix<Scalar, Index, LhsStorageOrder, false, ConjugateLhs, RhsStorageOrder, true, ConjugateRhs,
+                           ColMajor, ResInnerStride>::run(Index rows, Index cols, const Scalar* lhs_, Index lhsStride,
+                                                          const Scalar* rhs_, Index rhsStride, Scalar* res_,
+                                                          Index resIncr, Index resStride, const Scalar& alpha,
+                                                          level3_blocking<Scalar, Scalar>& blocking) {
+  Index size = cols;
+
+  typedef gebp_traits<Scalar, Scalar> Traits;
+
+  typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+  LhsMapper lhs(lhs_, lhsStride);
+  ResMapper res(res_, resStride, resIncr);
+
+  Index kc = blocking.kc();                    // cache block size along the K direction
+  Index mc = (std::min)(rows, blocking.mc());  // cache block size along the M direction
+  std::size_t sizeA = kc * mc;
+  std::size_t sizeB = kc * cols;
+  ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+  ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+  gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+  gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing,
+                LhsStorageOrder>
+      pack_lhs;
+  symm_pack_rhs<Scalar, Index, Traits::nr, RhsStorageOrder> pack_rhs;
+
+  for (Index k2 = 0; k2 < size; k2 += kc) {
+    const Index actual_kc = (std::min)(k2 + kc, size) - k2;
+
+    pack_rhs(blockB, rhs_, rhsStride, actual_kc, cols, k2);
+
+    // => GEPP
+    for (Index i2 = 0; i2 < rows; i2 += mc) {
+      const Index actual_mc = (std::min)(i2 + mc, rows) - i2;
+      pack_lhs(blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+      gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha);
     }
   }
+}
 
-} // end namespace internal
+}  // end namespace internal
 
 /***************************************************************************
-* Wrapper to product_selfadjoint_matrix
-***************************************************************************/
+ * Wrapper to product_selfadjoint_matrix
+ ***************************************************************************/
 
 namespace internal {
-template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
-struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false> >
-  : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs> >
-{};
-}
 
-template<typename Lhs, int LhsMode, typename Rhs, int RhsMode>
-struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>
-  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,RhsMode,false>, Lhs, Rhs >
-{
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
+template <typename Lhs, int LhsMode, typename Rhs, int RhsMode>
+struct selfadjoint_product_impl<Lhs, LhsMode, false, Rhs, RhsMode, false> {
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
 
-  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+  typedef internal::blas_traits<Lhs> LhsBlasTraits;
+  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+  typedef internal::blas_traits<Rhs> RhsBlasTraits;
+  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
 
   enum {
-    LhsIsUpper = (LhsMode&(Upper|Lower))==Upper,
-    LhsIsSelfAdjoint = (LhsMode&SelfAdjoint)==SelfAdjoint,
-    RhsIsUpper = (RhsMode&(Upper|Lower))==Upper,
-    RhsIsSelfAdjoint = (RhsMode&SelfAdjoint)==SelfAdjoint
+    LhsIsUpper = (LhsMode & (Upper | Lower)) == Upper,
+    LhsIsSelfAdjoint = (LhsMode & SelfAdjoint) == SelfAdjoint,
+    RhsIsUpper = (RhsMode & (Upper | Lower)) == Upper,
+    RhsIsSelfAdjoint = (RhsMode & SelfAdjoint) == SelfAdjoint
   };
 
-  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
-  {
-    eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
-
-    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
-    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
-
-    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
-                               * RhsBlasTraits::extractScalarFactor(m_rhs);
-
-    internal::product_selfadjoint_matrix<Scalar, Index,
-      EIGEN_LOGICAL_XOR(LhsIsUpper,
-                        internal::traits<Lhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, LhsIsSelfAdjoint,
-      NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)),
-      EIGEN_LOGICAL_XOR(RhsIsUpper,
-                        internal::traits<Rhs>::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint,
-      NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)),
-      internal::traits<Dest>::Flags&RowMajorBit  ? RowMajor : ColMajor>
-      ::run(
-        lhs.rows(), rhs.cols(),                 // sizes
-        &lhs.coeffRef(0,0),    lhs.outerStride(),  // lhs info
-        &rhs.coeffRef(0,0),    rhs.outerStride(),  // rhs info
-        &dst.coeffRef(0,0), dst.outerStride(),  // result info
-        actualAlpha                             // alpha
-      );
+  template <typename Dest>
+  static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) {
+    eigen_assert(dst.rows() == a_lhs.rows() && dst.cols() == a_rhs.cols());
+
+    add_const_on_value_type_t<ActualLhsType> lhs = LhsBlasTraits::extract(a_lhs);
+    add_const_on_value_type_t<ActualRhsType> rhs = RhsBlasTraits::extract(a_rhs);
+
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) * RhsBlasTraits::extractScalarFactor(a_rhs);
+
+    typedef internal::gemm_blocking_space<(Dest::Flags & RowMajorBit) ? RowMajor : ColMajor, Scalar, Scalar,
+                                          Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime,
+                                          Lhs::MaxColsAtCompileTime, 1>
+        BlockingType;
+
+    BlockingType blocking(lhs.rows(), rhs.cols(), lhs.cols(), 1, false);
+
+    internal::product_selfadjoint_matrix<
+        Scalar, Index,
+        internal::logical_xor(LhsIsUpper, internal::traits<Lhs>::Flags & RowMajorBit) ? RowMajor : ColMajor,
+        LhsIsSelfAdjoint,
+        NumTraits<Scalar>::IsComplex && internal::logical_xor(LhsIsUpper, bool(LhsBlasTraits::NeedToConjugate)),
+        internal::logical_xor(RhsIsUpper, internal::traits<Rhs>::Flags & RowMajorBit) ? RowMajor : ColMajor,
+        RhsIsSelfAdjoint,
+        NumTraits<Scalar>::IsComplex && internal::logical_xor(RhsIsUpper, bool(RhsBlasTraits::NeedToConjugate)),
+        internal::traits<Dest>::Flags & RowMajorBit ? RowMajor : ColMajor,
+        Dest::InnerStrideAtCompileTime>::run(lhs.rows(), rhs.cols(),                                     // sizes
+                                             &lhs.coeffRef(0, 0), lhs.outerStride(),                     // lhs info
+                                             &rhs.coeffRef(0, 0), rhs.outerStride(),                     // rhs info
+                                             &dst.coeffRef(0, 0), dst.innerStride(), dst.outerStride(),  // result info
+                                             actualAlpha, blocking                                       // alpha
+    );
   }
 };
 
-} // end namespace Eigen
+}  // end namespace internal
+
+}  // end namespace Eigen
 
-#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_H
+#endif  // EIGEN_SELFADJOINT_MATRIX_MATRIX_H
diff --git a/inst/include/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h b/inst/include/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
new file mode 100644
index 00000000..c0dbfd18
--- /dev/null
+++ b/inst/include/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h
@@ -0,0 +1,277 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   Self adjoint matrix * matrix product functionality based on ?SYMM/?HEMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H
+#define EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
+
+#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC)                                                \
+  template <typename Index, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder, bool ConjugateRhs>      \
+  struct product_selfadjoint_matrix<EIGTYPE, Index, LhsStorageOrder, true, ConjugateLhs, RhsStorageOrder, false, \
+                                    ConjugateRhs, ColMajor, 1> {                                                 \
+    static void run(Index rows, Index cols, const EIGTYPE* _lhs, Index lhsStride, const EIGTYPE* _rhs,           \
+                    Index rhsStride, EIGTYPE* res, Index resIncr, Index resStride, EIGTYPE alpha,                \
+                    level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) {                                           \
+      if (rows == 0 || cols == 0) return;                                                                        \
+      EIGEN_ONLY_USED_FOR_DEBUG(resIncr);                                                                        \
+      eigen_assert(resIncr == 1);                                                                                \
+      char side = 'L', uplo = 'L';                                                                               \
+      BlasIndex m, n, lda, ldb, ldc;                                                                             \
+      const EIGTYPE *a, *b;                                                                                      \
+      EIGTYPE beta(1);                                                                                           \
+      MatrixX##EIGPREFIX b_tmp;                                                                                  \
+                                                                                                                 \
+      /* Set transpose options */                                                                                \
+      /* Set m, n, k */                                                                                          \
+      m = convert_index<BlasIndex>(rows);                                                                        \
+      n = convert_index<BlasIndex>(cols);                                                                        \
+                                                                                                                 \
+      /* Set lda, ldb, ldc */                                                                                    \
+      lda = convert_index<BlasIndex>(lhsStride);                                                                 \
+      ldb = convert_index<BlasIndex>(rhsStride);                                                                 \
+      ldc = convert_index<BlasIndex>(resStride);                                                                 \
+                                                                                                                 \
+      /* Set a, b, c */                                                                                          \
+      if (LhsStorageOrder == RowMajor) uplo = 'U';                                                               \
+      a = _lhs;                                                                                                  \
+                                                                                                                 \
+      if (RhsStorageOrder == RowMajor) {                                                                         \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs, n, m, OuterStride<>(rhsStride));              \
+        b_tmp = rhs.adjoint();                                                                                   \
+        b = b_tmp.data();                                                                                        \
+        ldb = convert_index<BlasIndex>(b_tmp.outerStride());                                                     \
+      } else                                                                                                     \
+        b = _rhs;                                                                                                \
+                                                                                                                 \
+      BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda,        \
+               (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc);        \
+    }                                                                                                            \
+  };
+
+#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC)                                                  \
+  template <typename Index, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder, bool ConjugateRhs>        \
+  struct product_selfadjoint_matrix<EIGTYPE, Index, LhsStorageOrder, true, ConjugateLhs, RhsStorageOrder, false,   \
+                                    ConjugateRhs, ColMajor, 1> {                                                   \
+    static void run(Index rows, Index cols, const EIGTYPE* _lhs, Index lhsStride, const EIGTYPE* _rhs,             \
+                    Index rhsStride, EIGTYPE* res, Index resIncr, Index resStride, EIGTYPE alpha,                  \
+                    level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) {                                             \
+      if (rows == 0 || cols == 0) return;                                                                          \
+      EIGEN_ONLY_USED_FOR_DEBUG(resIncr);                                                                          \
+      eigen_assert(resIncr == 1);                                                                                  \
+      char side = 'L', uplo = 'L';                                                                                 \
+      BlasIndex m, n, lda, ldb, ldc;                                                                               \
+      const EIGTYPE *a, *b;                                                                                        \
+      EIGTYPE beta(1);                                                                                             \
+      MatrixX##EIGPREFIX b_tmp;                                                                                    \
+      Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> a_tmp;                                                    \
+                                                                                                                   \
+      /* Set transpose options */                                                                                  \
+      /* Set m, n, k */                                                                                            \
+      m = convert_index<BlasIndex>(rows);                                                                          \
+      n = convert_index<BlasIndex>(cols);                                                                          \
+                                                                                                                   \
+      /* Set lda, ldb, ldc */                                                                                      \
+      lda = convert_index<BlasIndex>(lhsStride);                                                                   \
+      ldb = convert_index<BlasIndex>(rhsStride);                                                                   \
+      ldc = convert_index<BlasIndex>(resStride);                                                                   \
+                                                                                                                   \
+      /* Set a, b, c */                                                                                            \
+      if (((LhsStorageOrder == ColMajor) && ConjugateLhs) || ((LhsStorageOrder == RowMajor) && (!ConjugateLhs))) { \
+        Map<const Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder>, 0, OuterStride<> > lhs(                      \
+            _lhs, m, m, OuterStride<>(lhsStride));                                                                 \
+        a_tmp = lhs.conjugate();                                                                                   \
+        a = a_tmp.data();                                                                                          \
+        lda = convert_index<BlasIndex>(a_tmp.outerStride());                                                       \
+      } else                                                                                                       \
+        a = _lhs;                                                                                                  \
+      if (LhsStorageOrder == RowMajor) uplo = 'U';                                                                 \
+                                                                                                                   \
+      if (RhsStorageOrder == ColMajor && (!ConjugateRhs)) {                                                        \
+        b = _rhs;                                                                                                  \
+      } else {                                                                                                     \
+        if (RhsStorageOrder == ColMajor && ConjugateRhs) {                                                         \
+          Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs, m, n, OuterStride<>(rhsStride));              \
+          b_tmp = rhs.conjugate();                                                                                 \
+        } else if (ConjugateRhs) {                                                                                 \
+          Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs, n, m, OuterStride<>(rhsStride));              \
+          b_tmp = rhs.adjoint();                                                                                   \
+        } else {                                                                                                   \
+          Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs, n, m, OuterStride<>(rhsStride));              \
+          b_tmp = rhs.transpose();                                                                                 \
+        }                                                                                                          \
+        b = b_tmp.data();                                                                                          \
+        ldb = convert_index<BlasIndex>(b_tmp.outerStride());                                                       \
+      }                                                                                                            \
+                                                                                                                   \
+      BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda,          \
+               (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc);          \
+    }                                                                                                              \
+  };
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_SYMM_L(double, double, d, dsymm)
+EIGEN_BLAS_SYMM_L(float, float, f, ssymm)
+EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm)
+EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm)
+#else
+EIGEN_BLAS_SYMM_L(double, double, d, dsymm_)
+EIGEN_BLAS_SYMM_L(float, float, f, ssymm_)
+EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_)
+EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_)
+#endif
+
+/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
+
+#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC)                                                \
+  template <typename Index, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder, bool ConjugateRhs>      \
+  struct product_selfadjoint_matrix<EIGTYPE, Index, LhsStorageOrder, false, ConjugateLhs, RhsStorageOrder, true, \
+                                    ConjugateRhs, ColMajor, 1> {                                                 \
+    static void run(Index rows, Index cols, const EIGTYPE* _lhs, Index lhsStride, const EIGTYPE* _rhs,           \
+                    Index rhsStride, EIGTYPE* res, Index resIncr, Index resStride, EIGTYPE alpha,                \
+                    level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) {                                           \
+      if (rows == 0 || cols == 0) return;                                                                        \
+      EIGEN_ONLY_USED_FOR_DEBUG(resIncr);                                                                        \
+      eigen_assert(resIncr == 1);                                                                                \
+      char side = 'R', uplo = 'L';                                                                               \
+      BlasIndex m, n, lda, ldb, ldc;                                                                             \
+      const EIGTYPE *a, *b;                                                                                      \
+      EIGTYPE beta(1);                                                                                           \
+      MatrixX##EIGPREFIX b_tmp;                                                                                  \
+                                                                                                                 \
+      /* Set m, n, k */                                                                                          \
+      m = convert_index<BlasIndex>(rows);                                                                        \
+      n = convert_index<BlasIndex>(cols);                                                                        \
+                                                                                                                 \
+      /* Set lda, ldb, ldc */                                                                                    \
+      lda = convert_index<BlasIndex>(rhsStride);                                                                 \
+      ldb = convert_index<BlasIndex>(lhsStride);                                                                 \
+      ldc = convert_index<BlasIndex>(resStride);                                                                 \
+                                                                                                                 \
+      /* Set a, b, c */                                                                                          \
+      if (RhsStorageOrder == RowMajor) uplo = 'U';                                                               \
+      a = _rhs;                                                                                                  \
+                                                                                                                 \
+      if (LhsStorageOrder == RowMajor) {                                                                         \
+        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs, n, m, OuterStride<>(rhsStride));              \
+        b_tmp = lhs.adjoint();                                                                                   \
+        b = b_tmp.data();                                                                                        \
+        ldb = convert_index<BlasIndex>(b_tmp.outerStride());                                                     \
+      } else                                                                                                     \
+        b = _lhs;                                                                                                \
+                                                                                                                 \
+      BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda,        \
+               (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc);        \
+    }                                                                                                            \
+  };
+
+#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC)                                                  \
+  template <typename Index, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder, bool ConjugateRhs>        \
+  struct product_selfadjoint_matrix<EIGTYPE, Index, LhsStorageOrder, false, ConjugateLhs, RhsStorageOrder, true,   \
+                                    ConjugateRhs, ColMajor, 1> {                                                   \
+    static void run(Index rows, Index cols, const EIGTYPE* _lhs, Index lhsStride, const EIGTYPE* _rhs,             \
+                    Index rhsStride, EIGTYPE* res, Index resIncr, Index resStride, EIGTYPE alpha,                  \
+                    level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) {                                             \
+      EIGEN_ONLY_USED_FOR_DEBUG(resIncr);                                                                          \
+      eigen_assert(resIncr == 1);                                                                                  \
+      char side = 'R', uplo = 'L';                                                                                 \
+      BlasIndex m, n, lda, ldb, ldc;                                                                               \
+      const EIGTYPE *a, *b;                                                                                        \
+      EIGTYPE beta(1);                                                                                             \
+      MatrixX##EIGPREFIX b_tmp;                                                                                    \
+      Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> a_tmp;                                                    \
+                                                                                                                   \
+      /* Set m, n, k */                                                                                            \
+      m = convert_index<BlasIndex>(rows);                                                                          \
+      n = convert_index<BlasIndex>(cols);                                                                          \
+                                                                                                                   \
+      /* Set lda, ldb, ldc */                                                                                      \
+      lda = convert_index<BlasIndex>(rhsStride);                                                                   \
+      ldb = convert_index<BlasIndex>(lhsStride);                                                                   \
+      ldc = convert_index<BlasIndex>(resStride);                                                                   \
+                                                                                                                   \
+      /* Set a, b, c */                                                                                            \
+      if (((RhsStorageOrder == ColMajor) && ConjugateRhs) || ((RhsStorageOrder == RowMajor) && (!ConjugateRhs))) { \
+        Map<const Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder>, 0, OuterStride<> > rhs(                      \
+            _rhs, n, n, OuterStride<>(rhsStride));                                                                 \
+        a_tmp = rhs.conjugate();                                                                                   \
+        a = a_tmp.data();                                                                                          \
+        lda = convert_index<BlasIndex>(a_tmp.outerStride());                                                       \
+      } else                                                                                                       \
+        a = _rhs;                                                                                                  \
+      if (RhsStorageOrder == RowMajor) uplo = 'U';                                                                 \
+                                                                                                                   \
+      if (LhsStorageOrder == ColMajor && (!ConjugateLhs)) {                                                        \
+        b = _lhs;                                                                                                  \
+      } else {                                                                                                     \
+        if (LhsStorageOrder == ColMajor && ConjugateLhs) {                                                         \
+          Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs, m, n, OuterStride<>(lhsStride));              \
+          b_tmp = lhs.conjugate();                                                                                 \
+        } else if (ConjugateLhs) {                                                                                 \
+          Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs, n, m, OuterStride<>(lhsStride));              \
+          b_tmp = lhs.adjoint();                                                                                   \
+        } else {                                                                                                   \
+          Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs, n, m, OuterStride<>(lhsStride));              \
+          b_tmp = lhs.transpose();                                                                                 \
+        }                                                                                                          \
+        b = b_tmp.data();                                                                                          \
+        ldb = convert_index<BlasIndex>(b_tmp.outerStride());                                                       \
+      }                                                                                                            \
+                                                                                                                   \
+      BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda,          \
+               (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc);          \
+    }                                                                                                              \
+  };
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_SYMM_R(double, double, d, dsymm)
+EIGEN_BLAS_SYMM_R(float, float, f, ssymm)
+EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm)
+EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm)
+#else
+EIGEN_BLAS_SYMM_R(double, double, d, dsymm_)
+EIGEN_BLAS_SYMM_R(float, float, f, ssymm_)
+EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_)
+EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_)
+#endif
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_SELFADJOINT_MATRIX_MATRIX_BLAS_H
diff --git a/inst/include/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h b/inst/include/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h
deleted file mode 100644
index dfa687fe..00000000
--- a/inst/include/Eigen/src/Core/products/SelfadjointMatrixMatrix_MKL.h
+++ /dev/null
@@ -1,295 +0,0 @@
-/*
- Copyright (c) 2011, Intel Corporation. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
- * Neither the name of Intel Corporation nor the names of its contributors may
-   be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
- ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
- *   Self adjoint matrix * matrix product functionality based on ?SYMM/?HEMM.
- ********************************************************************************
-*/
-
-#ifndef EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
-#define EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
-
-namespace Eigen { 
-
-namespace internal {
-
-
-/* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */
-
-#define EIGEN_MKL_SYMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
-template <typename Index, \
-          int LhsStorageOrder, bool ConjugateLhs, \
-          int RhsStorageOrder, bool ConjugateRhs> \
-struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
-{\
-\
-  static void run( \
-    Index rows, Index cols, \
-    const EIGTYPE* _lhs, Index lhsStride, \
-    const EIGTYPE* _rhs, Index rhsStride, \
-    EIGTYPE* res,        Index resStride, \
-    EIGTYPE alpha) \
-  { \
-    char side='L', uplo='L'; \
-    MKL_INT m, n, lda, ldb, ldc; \
-    const EIGTYPE *a, *b; \
-    MKLTYPE alpha_, beta_; \
-    MatrixX##EIGPREFIX b_tmp; \
-    EIGTYPE myone(1);\
-\
-/* Set transpose options */ \
-/* Set m, n, k */ \
-    m = (MKL_INT)rows;  \
-    n = (MKL_INT)cols;  \
-\
-/* Set alpha_ & beta_ */ \
-    assign_scalar_eig2mkl(alpha_, alpha); \
-    assign_scalar_eig2mkl(beta_, myone); \
-\
-/* Set lda, ldb, ldc */ \
-    lda = (MKL_INT)lhsStride; \
-    ldb = (MKL_INT)rhsStride; \
-    ldc = (MKL_INT)resStride; \
-\
-/* Set a, b, c */ \
-    if (LhsStorageOrder==RowMajor) uplo='U'; \
-    a = _lhs; \
-\
-    if (RhsStorageOrder==RowMajor) { \
-      Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
-      b_tmp = rhs.adjoint(); \
-      b = b_tmp.data(); \
-      ldb = b_tmp.outerStride(); \
-    } else b = _rhs; \
-\
-    MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
-\
-  } \
-};
-
-
-#define EIGEN_MKL_HEMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
-template <typename Index, \
-          int LhsStorageOrder, bool ConjugateLhs, \
-          int RhsStorageOrder, bool ConjugateRhs> \
-struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,true,ConjugateLhs,RhsStorageOrder,false,ConjugateRhs,ColMajor> \
-{\
-  static void run( \
-    Index rows, Index cols, \
-    const EIGTYPE* _lhs, Index lhsStride, \
-    const EIGTYPE* _rhs, Index rhsStride, \
-    EIGTYPE* res,        Index resStride, \
-    EIGTYPE alpha) \
-  { \
-    char side='L', uplo='L'; \
-    MKL_INT m, n, lda, ldb, ldc; \
-    const EIGTYPE *a, *b; \
-    MKLTYPE alpha_, beta_; \
-    MatrixX##EIGPREFIX b_tmp; \
-    Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> a_tmp; \
-    EIGTYPE myone(1); \
-\
-/* Set transpose options */ \
-/* Set m, n, k */ \
-    m = (MKL_INT)rows; \
-    n = (MKL_INT)cols; \
-\
-/* Set alpha_ & beta_ */ \
-    assign_scalar_eig2mkl(alpha_, alpha); \
-    assign_scalar_eig2mkl(beta_, myone); \
-\
-/* Set lda, ldb, ldc */ \
-    lda = (MKL_INT)lhsStride; \
-    ldb = (MKL_INT)rhsStride; \
-    ldc = (MKL_INT)resStride; \
-\
-/* Set a, b, c */ \
-    if (((LhsStorageOrder==ColMajor) && ConjugateLhs) || ((LhsStorageOrder==RowMajor) && (!ConjugateLhs))) { \
-      Map<const Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder>, 0, OuterStride<> > lhs(_lhs,m,m,OuterStride<>(lhsStride)); \
-      a_tmp = lhs.conjugate(); \
-      a = a_tmp.data(); \
-      lda = a_tmp.outerStride(); \
-    } else a = _lhs; \
-    if (LhsStorageOrder==RowMajor) uplo='U'; \
-\
-    if (RhsStorageOrder==ColMajor && (!ConjugateRhs)) { \
-       b = _rhs; } \
-    else { \
-      if (RhsStorageOrder==ColMajor && ConjugateRhs) { \
-        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,m,n,OuterStride<>(rhsStride)); \
-        b_tmp = rhs.conjugate(); \
-      } else \
-      if (ConjugateRhs) { \
-        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
-        b_tmp = rhs.adjoint(); \
-      } else { \
-        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > rhs(_rhs,n,m,OuterStride<>(rhsStride)); \
-        b_tmp = rhs.transpose(); \
-      } \
-      b = b_tmp.data(); \
-      ldb = b_tmp.outerStride(); \
-    } \
-\
-    MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
-\
-  } \
-};
-
-EIGEN_MKL_SYMM_L(double, double, d, d)
-EIGEN_MKL_SYMM_L(float, float, f, s)
-EIGEN_MKL_HEMM_L(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_HEMM_L(scomplex, MKL_Complex8, cf, c)
-
-
-/* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */
-
-#define EIGEN_MKL_SYMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
-template <typename Index, \
-          int LhsStorageOrder, bool ConjugateLhs, \
-          int RhsStorageOrder, bool ConjugateRhs> \
-struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \
-{\
-\
-  static void run( \
-    Index rows, Index cols, \
-    const EIGTYPE* _lhs, Index lhsStride, \
-    const EIGTYPE* _rhs, Index rhsStride, \
-    EIGTYPE* res,        Index resStride, \
-    EIGTYPE alpha) \
-  { \
-    char side='R', uplo='L'; \
-    MKL_INT m, n, lda, ldb, ldc; \
-    const EIGTYPE *a, *b; \
-    MKLTYPE alpha_, beta_; \
-    MatrixX##EIGPREFIX b_tmp; \
-    EIGTYPE myone(1);\
-\
-/* Set m, n, k */ \
-    m = (MKL_INT)rows;  \
-    n = (MKL_INT)cols;  \
-\
-/* Set alpha_ & beta_ */ \
-    assign_scalar_eig2mkl(alpha_, alpha); \
-    assign_scalar_eig2mkl(beta_, myone); \
-\
-/* Set lda, ldb, ldc */ \
-    lda = (MKL_INT)rhsStride; \
-    ldb = (MKL_INT)lhsStride; \
-    ldc = (MKL_INT)resStride; \
-\
-/* Set a, b, c */ \
-    if (RhsStorageOrder==RowMajor) uplo='U'; \
-    a = _rhs; \
-\
-    if (LhsStorageOrder==RowMajor) { \
-      Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(rhsStride)); \
-      b_tmp = lhs.adjoint(); \
-      b = b_tmp.data(); \
-      ldb = b_tmp.outerStride(); \
-    } else b = _lhs; \
-\
-    MKLPREFIX##symm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
-\
-  } \
-};
-
-
-#define EIGEN_MKL_HEMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
-template <typename Index, \
-          int LhsStorageOrder, bool ConjugateLhs, \
-          int RhsStorageOrder, bool ConjugateRhs> \
-struct product_selfadjoint_matrix<EIGTYPE,Index,LhsStorageOrder,false,ConjugateLhs,RhsStorageOrder,true,ConjugateRhs,ColMajor> \
-{\
-  static void run( \
-    Index rows, Index cols, \
-    const EIGTYPE* _lhs, Index lhsStride, \
-    const EIGTYPE* _rhs, Index rhsStride, \
-    EIGTYPE* res,        Index resStride, \
-    EIGTYPE alpha) \
-  { \
-    char side='R', uplo='L'; \
-    MKL_INT m, n, lda, ldb, ldc; \
-    const EIGTYPE *a, *b; \
-    MKLTYPE alpha_, beta_; \
-    MatrixX##EIGPREFIX b_tmp; \
-    Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> a_tmp; \
-    EIGTYPE myone(1); \
-\
-/* Set m, n, k */ \
-    m = (MKL_INT)rows; \
-    n = (MKL_INT)cols; \
-\
-/* Set alpha_ & beta_ */ \
-    assign_scalar_eig2mkl(alpha_, alpha); \
-    assign_scalar_eig2mkl(beta_, myone); \
-\
-/* Set lda, ldb, ldc */ \
-    lda = (MKL_INT)rhsStride; \
-    ldb = (MKL_INT)lhsStride; \
-    ldc = (MKL_INT)resStride; \
-\
-/* Set a, b, c */ \
-    if (((RhsStorageOrder==ColMajor) && ConjugateRhs) || ((RhsStorageOrder==RowMajor) && (!ConjugateRhs))) { \
-      Map<const Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder>, 0, OuterStride<> > rhs(_rhs,n,n,OuterStride<>(rhsStride)); \
-      a_tmp = rhs.conjugate(); \
-      a = a_tmp.data(); \
-      lda = a_tmp.outerStride(); \
-    } else a = _rhs; \
-    if (RhsStorageOrder==RowMajor) uplo='U'; \
-\
-    if (LhsStorageOrder==ColMajor && (!ConjugateLhs)) { \
-       b = _lhs; } \
-    else { \
-      if (LhsStorageOrder==ColMajor && ConjugateLhs) { \
-        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,m,n,OuterStride<>(lhsStride)); \
-        b_tmp = lhs.conjugate(); \
-      } else \
-      if (ConjugateLhs) { \
-        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
-        b_tmp = lhs.adjoint(); \
-      } else { \
-        Map<const MatrixX##EIGPREFIX, 0, OuterStride<> > lhs(_lhs,n,m,OuterStride<>(lhsStride)); \
-        b_tmp = lhs.transpose(); \
-      } \
-      b = b_tmp.data(); \
-      ldb = b_tmp.outerStride(); \
-    } \
-\
-    MKLPREFIX##hemm(&side, &uplo, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)b, &ldb, &beta_, (MKLTYPE*)res, &ldc); \
-  } \
-};
-
-EIGEN_MKL_SYMM_R(double, double, d, d)
-EIGEN_MKL_SYMM_R(float, float, f, s)
-EIGEN_MKL_HEMM_R(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_HEMM_R(scomplex, MKL_Complex8, cf, c)
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_SELFADJOINT_MATRIX_MATRIX_MKL_H
diff --git a/inst/include/Eigen/src/Core/products/SelfadjointMatrixVector.h b/inst/include/Eigen/src/Core/products/SelfadjointMatrixVector.h
index f698f67f..580f6a85 100644
--- a/inst/include/Eigen/src/Core/products/SelfadjointMatrixVector.h
+++ b/inst/include/Eigen/src/Core/products/SelfadjointMatrixVector.h
@@ -10,81 +10,67 @@
 #ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_H
 #define EIGEN_SELFADJOINT_MATRIX_VECTOR_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 /* Optimized selfadjoint matrix * vector product:
- * This algorithm processes 2 columns at onces that allows to both reduce
+ * This algorithm processes 2 columns at once that allows to both reduce
  * the number of load/stores of the result by a factor 2 and to reduce
  * the instruction dependency.
  */
 
-template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version=Specialized>
+template <typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs,
+          int Version = Specialized>
 struct selfadjoint_matrix_vector_product;
 
-template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
+template <typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs,
+          int Version>
 struct selfadjoint_matrix_vector_product
 
 {
-static EIGEN_DONT_INLINE void run(
-  Index size,
-  const Scalar*  lhs, Index lhsStride,
-  const Scalar* _rhs, Index rhsIncr,
-  Scalar* res,
-  Scalar alpha);
+  static EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC void run(Index size, const Scalar* lhs, Index lhsStride, const Scalar* rhs,
+                                                      Scalar* res, Scalar alpha);
 };
 
-template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Version>::run(
-  Index size,
-  const Scalar*  lhs, Index lhsStride,
-  const Scalar* _rhs, Index rhsIncr,
-  Scalar* res,
-  Scalar alpha)
-{
+template <typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs,
+          int Version>
+EIGEN_DONT_INLINE EIGEN_DEVICE_FUNC void
+selfadjoint_matrix_vector_product<Scalar, Index, StorageOrder, UpLo, ConjugateLhs, ConjugateRhs, Version>::run(
+    Index size, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Scalar* res, Scalar alpha) {
   typedef typename packet_traits<Scalar>::type Packet;
-  const Index PacketSize = sizeof(Packet)/sizeof(Scalar);
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  const Index PacketSize = sizeof(Packet) / sizeof(Scalar);
 
   enum {
-    IsRowMajor = StorageOrder==RowMajor ? 1 : 0,
+    IsRowMajor = StorageOrder == RowMajor ? 1 : 0,
     IsLower = UpLo == Lower ? 1 : 0,
     FirstTriangular = IsRowMajor == IsLower
   };
 
-  conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs,  IsRowMajor), ConjugateRhs> cj0;
-  conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1;
-  conj_helper<Scalar,Scalar,NumTraits<Scalar>::IsComplex, ConjugateRhs> cjd;
+  conj_helper<Scalar, Scalar, NumTraits<Scalar>::IsComplex && logical_xor(ConjugateLhs, IsRowMajor), ConjugateRhs> cj0;
+  conj_helper<Scalar, Scalar, NumTraits<Scalar>::IsComplex && logical_xor(ConjugateLhs, !IsRowMajor), ConjugateRhs> cj1;
+  conj_helper<RealScalar, Scalar, false, ConjugateRhs> cjd;
 
-  conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs,  IsRowMajor), ConjugateRhs> pcj0;
-  conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex && EIGEN_LOGICAL_XOR(ConjugateLhs, !IsRowMajor), ConjugateRhs> pcj1;
+  conj_helper<Packet, Packet, NumTraits<Scalar>::IsComplex && logical_xor(ConjugateLhs, IsRowMajor), ConjugateRhs> pcj0;
+  conj_helper<Packet, Packet, NumTraits<Scalar>::IsComplex && logical_xor(ConjugateLhs, !IsRowMajor), ConjugateRhs>
+      pcj1;
 
   Scalar cjAlpha = ConjugateRhs ? numext::conj(alpha) : alpha;
 
-  // FIXME this copy is now handled outside product_selfadjoint_vector, so it could probably be removed.
-  // if the rhs is not sequentially stored in memory we copy it to a temporary buffer,
-  // this is because we need to extract packets
-  ei_declare_aligned_stack_constructed_variable(Scalar,rhs,size,rhsIncr==1 ? const_cast<Scalar*>(_rhs) : 0);  
-  if (rhsIncr!=1)
-  {
-    const Scalar* it = _rhs;
-    for (Index i=0; i<size; ++i, it+=rhsIncr)
-      rhs[i] = *it;
-  }
-
-  Index bound = (std::max)(Index(0),size-8) & 0xfffffffe;
-  if (FirstTriangular)
-    bound = size - bound;
+  Index bound = numext::maxi(Index(0), size - 8) & 0xfffffffe;
+  if (FirstTriangular) bound = size - bound;
 
-  for (Index j=FirstTriangular ? bound : 0;
-       j<(FirstTriangular ? size : bound);j+=2)
-  {
-    const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
-    const Scalar* EIGEN_RESTRICT A1 = lhs + (j+1)*lhsStride;
+  for (Index j = FirstTriangular ? bound : 0; j < (FirstTriangular ? size : bound); j += 2) {
+    const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
+    const Scalar* EIGEN_RESTRICT A1 = lhs + (j + 1) * lhsStride;
 
     Scalar t0 = cjAlpha * rhs[j];
     Packet ptmp0 = pset1<Packet>(t0);
-    Scalar t1 = cjAlpha * rhs[j+1];
+    Scalar t1 = cjAlpha * rhs[j + 1];
     Packet ptmp1 = pset1<Packet>(t1);
 
     Scalar t2(0);
@@ -92,69 +78,63 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
     Scalar t3(0);
     Packet ptmp3 = pset1<Packet>(t3);
 
-    size_t starti = FirstTriangular ? 0 : j+2;
-    size_t endi   = FirstTriangular ? j : size;
-    size_t alignedStart = (starti) + internal::first_aligned(&res[starti], endi-starti);
-    size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize);
-
-    // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
-    res[j]   += cjd.pmul(numext::real(A0[j]), t0);
-    res[j+1] += cjd.pmul(numext::real(A1[j+1]), t1);
-    if(FirstTriangular)
-    {
-      res[j]   += cj0.pmul(A1[j],   t1);
-      t3       += cj1.pmul(A1[j],   rhs[j]);
-    }
-    else
-    {
-      res[j+1] += cj0.pmul(A0[j+1],t0);
-      t2 += cj1.pmul(A0[j+1], rhs[j+1]);
+    Index starti = FirstTriangular ? 0 : j + 2;
+    Index endi = FirstTriangular ? j : size;
+    Index alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi - starti);
+    Index alignedEnd = alignedStart + ((endi - alignedStart) / (PacketSize)) * (PacketSize);
+
+    res[j] += cjd.pmul(numext::real(A0[j]), t0);
+    res[j + 1] += cjd.pmul(numext::real(A1[j + 1]), t1);
+    if (FirstTriangular) {
+      res[j] += cj0.pmul(A1[j], t1);
+      t3 += cj1.pmul(A1[j], rhs[j]);
+    } else {
+      res[j + 1] += cj0.pmul(A0[j + 1], t0);
+      t2 += cj1.pmul(A0[j + 1], rhs[j + 1]);
     }
 
-    for (size_t i=starti; i<alignedStart; ++i)
-    {
-      res[i] += t0 * A0[i] + t1 * A1[i];
-      t2 += numext::conj(A0[i]) * rhs[i];
-      t3 += numext::conj(A1[i]) * rhs[i];
+    for (Index i = starti; i < alignedStart; ++i) {
+      res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
+      t2 += cj1.pmul(A0[i], rhs[i]);
+      t3 += cj1.pmul(A1[i], rhs[i]);
     }
     // Yes this an optimization for gcc 4.3 and 4.4 (=> huge speed up)
     // gcc 4.2 does this optimization automatically.
-    const Scalar* EIGEN_RESTRICT a0It  = A0  + alignedStart;
-    const Scalar* EIGEN_RESTRICT a1It  = A1  + alignedStart;
+    const Scalar* EIGEN_RESTRICT a0It = A0 + alignedStart;
+    const Scalar* EIGEN_RESTRICT a1It = A1 + alignedStart;
     const Scalar* EIGEN_RESTRICT rhsIt = rhs + alignedStart;
-          Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
-    for (size_t i=alignedStart; i<alignedEnd; i+=PacketSize)
-    {
-      Packet A0i = ploadu<Packet>(a0It);  a0It  += PacketSize;
-      Packet A1i = ploadu<Packet>(a1It);  a1It  += PacketSize;
-      Packet Bi  = ploadu<Packet>(rhsIt); rhsIt += PacketSize; // FIXME should be aligned in most cases
-      Packet Xi  = pload <Packet>(resIt);
-
-      Xi    = pcj0.pmadd(A0i,ptmp0, pcj0.pmadd(A1i,ptmp1,Xi));
-      ptmp2 = pcj1.pmadd(A0i,  Bi, ptmp2);
-      ptmp3 = pcj1.pmadd(A1i,  Bi, ptmp3);
-      pstore(resIt,Xi); resIt += PacketSize;
+    Scalar* EIGEN_RESTRICT resIt = res + alignedStart;
+    for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
+      Packet A0i = ploadu<Packet>(a0It);
+      a0It += PacketSize;
+      Packet A1i = ploadu<Packet>(a1It);
+      a1It += PacketSize;
+      Packet Bi = ploadu<Packet>(rhsIt);
+      rhsIt += PacketSize;  // FIXME should be aligned in most cases
+      Packet Xi = pload<Packet>(resIt);
+
+      Xi = pcj0.pmadd(A0i, ptmp0, pcj0.pmadd(A1i, ptmp1, Xi));
+      ptmp2 = pcj1.pmadd(A0i, Bi, ptmp2);
+      ptmp3 = pcj1.pmadd(A1i, Bi, ptmp3);
+      pstore(resIt, Xi);
+      resIt += PacketSize;
     }
-    for (size_t i=alignedEnd; i<endi; i++)
-    {
-      res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i],t1);
+    for (Index i = alignedEnd; i < endi; i++) {
+      res[i] += cj0.pmul(A0[i], t0) + cj0.pmul(A1[i], t1);
       t2 += cj1.pmul(A0[i], rhs[i]);
       t3 += cj1.pmul(A1[i], rhs[i]);
     }
 
-    res[j]   += alpha * (t2 + predux(ptmp2));
-    res[j+1] += alpha * (t3 + predux(ptmp3));
+    res[j] += alpha * (t2 + predux(ptmp2));
+    res[j + 1] += alpha * (t3 + predux(ptmp3));
   }
-  for (Index j=FirstTriangular ? 0 : bound;j<(FirstTriangular ? bound : size);j++)
-  {
-    const Scalar* EIGEN_RESTRICT A0 = lhs + j*lhsStride;
+  for (Index j = FirstTriangular ? 0 : bound; j < (FirstTriangular ? bound : size); j++) {
+    const Scalar* EIGEN_RESTRICT A0 = lhs + j * lhsStride;
 
     Scalar t1 = cjAlpha * rhs[j];
     Scalar t2(0);
-    // TODO make sure this product is a real * complex and that the rhs is properly conjugated if needed
     res[j] += cjd.pmul(numext::real(A0[j]), t1);
-    for (Index i=FirstTriangular ? 0 : j+1; i<(FirstTriangular ? j : size); i++)
-    {
+    for (Index i = FirstTriangular ? 0 : j + 1; i < (FirstTriangular ? j : size); i++) {
       res[i] += cj0.pmul(A0[i], t1);
       t2 += cj1.pmul(A0[i], rhs[i]);
     }
@@ -162,120 +142,112 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product<Scalar,Index,StorageOrd
   }
 }
 
-} // end namespace internal 
+}  // end namespace internal
 
 /***************************************************************************
-* Wrapper to product_selfadjoint_vector
-***************************************************************************/
+ * Wrapper to product_selfadjoint_vector
+ ***************************************************************************/
 
 namespace internal {
-template<typename Lhs, int LhsMode, typename Rhs>
-struct traits<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true> >
-  : traits<ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs> >
-{};
-}
 
-template<typename Lhs, int LhsMode, typename Rhs>
-struct SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>
-  : public ProductBase<SelfadjointProductMatrix<Lhs,LhsMode,false,Rhs,0,true>, Lhs, Rhs >
-{
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
+template <typename Lhs, int LhsMode, typename Rhs>
+struct selfadjoint_product_impl<Lhs, LhsMode, false, Rhs, 0, true> {
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
 
-  enum {
-    LhsUpLo = LhsMode&(Upper|Lower)
-  };
+  typedef internal::blas_traits<Lhs> LhsBlasTraits;
+  typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+  typedef internal::remove_all_t<ActualLhsType> ActualLhsTypeCleaned;
 
-  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+  typedef internal::blas_traits<Rhs> RhsBlasTraits;
+  typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+  typedef internal::remove_all_t<ActualRhsType> ActualRhsTypeCleaned;
 
-  template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
-  {
+  enum { LhsUpLo = LhsMode & (Upper | Lower) };
+
+  // Verify that the Rhs is a vector in the correct orientation.
+  // Otherwise, we break the assumption that we are multiplying
+  // MxN * Nx1.
+  static_assert(Rhs::ColsAtCompileTime == 1, "The RHS must be a column vector.");
+
+  template <typename Dest>
+  static EIGEN_DEVICE_FUNC void run(Dest& dest, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) {
     typedef typename Dest::Scalar ResScalar;
-    typedef typename Base::RhsScalar RhsScalar;
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
-    
-    eigen_assert(dest.rows()==m_lhs.rows() && dest.cols()==m_rhs.cols());
+    typedef typename Rhs::Scalar RhsScalar;
+    typedef Map<Matrix<ResScalar, Dynamic, 1>, plain_enum_min(AlignedMax, internal::packet_traits<ResScalar>::size)>
+        MappedDest;
+
+    eigen_assert(dest.rows() == a_lhs.rows() && dest.cols() == a_rhs.cols());
 
-    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
-    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
+    add_const_on_value_type_t<ActualLhsType> lhs = LhsBlasTraits::extract(a_lhs);
+    add_const_on_value_type_t<ActualRhsType> rhs = RhsBlasTraits::extract(a_rhs);
 
-    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
-                               * RhsBlasTraits::extractScalarFactor(m_rhs);
+    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) * RhsBlasTraits::extractScalarFactor(a_rhs);
 
     enum {
-      EvalToDest = (Dest::InnerStrideAtCompileTime==1),
-      UseRhs = (_ActualRhsType::InnerStrideAtCompileTime==1)
+      EvalToDest = (Dest::InnerStrideAtCompileTime == 1),
+      UseRhs = (ActualRhsTypeCleaned::InnerStrideAtCompileTime == 1)
     };
-    
-    internal::gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,!EvalToDest> static_dest;
-    internal::gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!UseRhs> static_rhs;
 
-    ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
+    internal::gemv_static_vector_if<ResScalar, Dest::SizeAtCompileTime, Dest::MaxSizeAtCompileTime, !EvalToDest>
+        static_dest;
+    internal::gemv_static_vector_if<RhsScalar, ActualRhsTypeCleaned::SizeAtCompileTime,
+                                    ActualRhsTypeCleaned::MaxSizeAtCompileTime, !UseRhs>
+        static_rhs;
+
+    ei_declare_aligned_stack_constructed_variable(ResScalar, actualDestPtr, dest.size(),
                                                   EvalToDest ? dest.data() : static_dest.data());
-                                                  
-    ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,rhs.size(),
-        UseRhs ? const_cast<RhsScalar*>(rhs.data()) : static_rhs.data());
-    
-    if(!EvalToDest)
-    {
-      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = dest.size();
+
+    ei_declare_aligned_stack_constructed_variable(RhsScalar, actualRhsPtr, rhs.size(),
+                                                  UseRhs ? const_cast<RhsScalar*>(rhs.data()) : static_rhs.data());
+
+    if (!EvalToDest) {
+#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      constexpr int Size = Dest::SizeAtCompileTime;
+      Index size = dest.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      #endif
+#endif
       MappedDest(actualDestPtr, dest.size()) = dest;
     }
-      
-    if(!UseRhs)
-    {
-      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = rhs.size();
+
+    if (!UseRhs) {
+#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      constexpr int Size = ActualRhsTypeCleaned::SizeAtCompileTime;
+      Index size = rhs.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      #endif
-      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
+#endif
+      Map<typename ActualRhsTypeCleaned::PlainObject>(actualRhsPtr, rhs.size()) = rhs;
     }
-      
-      
-    internal::selfadjoint_matrix_vector_product<Scalar, Index, (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate), bool(RhsBlasTraits::NeedToConjugate)>::run
-      (
-        lhs.rows(),                             // size
-        &lhs.coeffRef(0,0),  lhs.outerStride(), // lhs info
-        actualRhsPtr, 1,                        // rhs info
-        actualDestPtr,                          // result info
-        actualAlpha                             // scale factor
-      );
-    
-    if(!EvalToDest)
-      dest = MappedDest(actualDestPtr, dest.size());
+
+    internal::selfadjoint_matrix_vector_product<
+        Scalar, Index, (internal::traits<ActualLhsTypeCleaned>::Flags & RowMajorBit) ? RowMajor : ColMajor,
+        int(LhsUpLo), bool(LhsBlasTraits::NeedToConjugate),
+        bool(RhsBlasTraits::NeedToConjugate)>::run(lhs.rows(),                              // size
+                                                   &lhs.coeffRef(0, 0), lhs.outerStride(),  // lhs info
+                                                   actualRhsPtr,                            // rhs info
+                                                   actualDestPtr,                           // result info
+                                                   actualAlpha                              // scale factor
+    );
+
+    if (!EvalToDest) dest = MappedDest(actualDestPtr, dest.size());
   }
 };
 
-namespace internal {
-template<typename Lhs, typename Rhs, int RhsMode>
-struct traits<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false> >
-  : traits<ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs> >
-{};
-}
-
-template<typename Lhs, typename Rhs, int RhsMode>
-struct SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>
-  : public ProductBase<SelfadjointProductMatrix<Lhs,0,true,Rhs,RhsMode,false>, Lhs, Rhs >
-{
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(SelfadjointProductMatrix)
+template <typename Lhs, typename Rhs, int RhsMode>
+struct selfadjoint_product_impl<Lhs, 0, true, Rhs, RhsMode, false> {
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+  enum { RhsUpLo = RhsMode & (Upper | Lower) };
 
-  enum {
-    RhsUpLo = RhsMode&(Upper|Lower)
-  };
-
-  SelfadjointProductMatrix(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
-  {
+  template <typename Dest>
+  static void run(Dest& dest, const Lhs& a_lhs, const Rhs& a_rhs, const Scalar& alpha) {
     // let's simply transpose the product
     Transpose<Dest> destT(dest);
-    SelfadjointProductMatrix<Transpose<const Rhs>, int(RhsUpLo)==Upper ? Lower : Upper, false,
-                             Transpose<const Lhs>, 0, true>(m_rhs.transpose(), m_lhs.transpose()).scaleAndAddTo(destT, alpha);
+    selfadjoint_product_impl<Transpose<const Rhs>, int(RhsUpLo) == Upper ? Lower : Upper, false, Transpose<const Lhs>,
+                             0, true>::run(destT, a_rhs.transpose(), a_lhs.transpose(), alpha);
   }
 };
 
-} // end namespace Eigen
+}  // end namespace internal
+
+}  // end namespace Eigen
 
-#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_H
+#endif  // EIGEN_SELFADJOINT_MATRIX_VECTOR_H
diff --git a/inst/include/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h b/inst/include/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
new file mode 100644
index 00000000..187c9115
--- /dev/null
+++ b/inst/include/Eigen/src/Core/products/SelfadjointMatrixVector_BLAS.h
@@ -0,0 +1,115 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   Selfadjoint matrix-vector product functionality based on ?SYMV/HEMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H
+#define EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/**********************************************************************
+ * This file implements selfadjoint matrix-vector multiplication using BLAS
+ **********************************************************************/
+
+// symv/hemv specialization
+
+template <typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs>
+struct selfadjoint_matrix_vector_product_symv
+    : selfadjoint_matrix_vector_product<Scalar, Index, StorageOrder, UpLo, ConjugateLhs, ConjugateRhs, BuiltIn> {};
+
+#define EIGEN_BLAS_SYMV_SPECIALIZE(Scalar)                                                                           \
+  template <typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs>                        \
+  struct selfadjoint_matrix_vector_product<Scalar, Index, StorageOrder, UpLo, ConjugateLhs, ConjugateRhs,            \
+                                           Specialized> {                                                            \
+    static void run(Index size, const Scalar* lhs, Index lhsStride, const Scalar* _rhs, Scalar* res, Scalar alpha) { \
+      enum { IsColMajor = StorageOrder == ColMajor };                                                                \
+      if (IsColMajor == ConjugateLhs) {                                                                              \
+        selfadjoint_matrix_vector_product<Scalar, Index, StorageOrder, UpLo, ConjugateLhs, ConjugateRhs,             \
+                                          BuiltIn>::run(size, lhs, lhsStride, _rhs, res, alpha);                     \
+      } else {                                                                                                       \
+        selfadjoint_matrix_vector_product_symv<Scalar, Index, StorageOrder, UpLo, ConjugateLhs, ConjugateRhs>::run(  \
+            size, lhs, lhsStride, _rhs, res, alpha);                                                                 \
+      }                                                                                                              \
+    }                                                                                                                \
+  };
+
+EIGEN_BLAS_SYMV_SPECIALIZE(double)
+EIGEN_BLAS_SYMV_SPECIALIZE(float)
+EIGEN_BLAS_SYMV_SPECIALIZE(dcomplex)
+EIGEN_BLAS_SYMV_SPECIALIZE(scomplex)
+
+#define EIGEN_BLAS_SYMV_SPECIALIZATION(EIGTYPE, BLASTYPE, BLASFUNC)                                                \
+  template <typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs>                      \
+  struct selfadjoint_matrix_vector_product_symv<EIGTYPE, Index, StorageOrder, UpLo, ConjugateLhs, ConjugateRhs> {  \
+    typedef Matrix<EIGTYPE, Dynamic, 1, ColMajor> SYMVVector;                                                      \
+                                                                                                                   \
+    static void run(Index size, const EIGTYPE* lhs, Index lhsStride, const EIGTYPE* _rhs, EIGTYPE* res,            \
+                    EIGTYPE alpha) {                                                                               \
+      if (size == 0) return;                                                                                       \
+      enum { IsRowMajor = StorageOrder == RowMajor ? 1 : 0, IsLower = UpLo == Lower ? 1 : 0 };                     \
+      BlasIndex n = convert_index<BlasIndex>(size), lda = convert_index<BlasIndex>(lhsStride), incx = 1, incy = 1; \
+      EIGTYPE beta(1);                                                                                             \
+      const EIGTYPE* x_ptr;                                                                                        \
+      char uplo = (IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 'L' : 'U');                                    \
+      SYMVVector x_tmp;                                                                                            \
+      if (ConjugateRhs) {                                                                                          \
+        Map<const SYMVVector, 0> map_x(_rhs, size, 1);                                                             \
+        x_tmp = map_x.conjugate();                                                                                 \
+        x_ptr = x_tmp.data();                                                                                      \
+      } else                                                                                                       \
+        x_ptr = _rhs;                                                                                              \
+      BLASFUNC(&uplo, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda,                   \
+               (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy);    \
+    }                                                                                                              \
+  };
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv)
+EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, MKL_Complex8, chemv)
+#else
+EIGEN_BLAS_SYMV_SPECIALIZATION(double, double, dsymv_)
+EIGEN_BLAS_SYMV_SPECIALIZATION(float, float, ssymv_)
+EIGEN_BLAS_SYMV_SPECIALIZATION(dcomplex, double, zhemv_)
+EIGEN_BLAS_SYMV_SPECIALIZATION(scomplex, float, chemv_)
+#endif
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_SELFADJOINT_MATRIX_VECTOR_BLAS_H
diff --git a/inst/include/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h b/inst/include/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
deleted file mode 100644
index 86684b66..00000000
--- a/inst/include/Eigen/src/Core/products/SelfadjointMatrixVector_MKL.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- Copyright (c) 2011, Intel Corporation. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
- * Neither the name of Intel Corporation nor the names of its contributors may
-   be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
- *   Selfadjoint matrix-vector product functionality based on ?SYMV/HEMV.
- ********************************************************************************
-*/
-
-#ifndef EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H
-#define EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H
-
-namespace Eigen { 
-
-namespace internal {
-
-/**********************************************************************
-* This file implements selfadjoint matrix-vector multiplication using BLAS
-**********************************************************************/
-
-// symv/hemv specialization
-
-template<typename Scalar, typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs>
-struct selfadjoint_matrix_vector_product_symv :
-  selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn> {};
-
-#define EIGEN_MKL_SYMV_SPECIALIZE(Scalar) \
-template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \
-struct selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,Specialized> { \
-static void run( \
-  Index size, const Scalar*  lhs, Index lhsStride, \
-  const Scalar* _rhs, Index rhsIncr, Scalar* res, Scalar alpha) { \
-    enum {\
-      IsColMajor = StorageOrder==ColMajor \
-    }; \
-    if (IsColMajor == ConjugateLhs) {\
-      selfadjoint_matrix_vector_product<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs,BuiltIn>::run( \
-        size, lhs, lhsStride, _rhs, rhsIncr, res, alpha);  \
-    } else {\
-      selfadjoint_matrix_vector_product_symv<Scalar,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs>::run( \
-        size, lhs, lhsStride, _rhs, rhsIncr, res, alpha);  \
-    }\
-  } \
-}; \
-
-EIGEN_MKL_SYMV_SPECIALIZE(double)
-EIGEN_MKL_SYMV_SPECIALIZE(float)
-EIGEN_MKL_SYMV_SPECIALIZE(dcomplex)
-EIGEN_MKL_SYMV_SPECIALIZE(scomplex)
-
-#define EIGEN_MKL_SYMV_SPECIALIZATION(EIGTYPE,MKLTYPE,MKLFUNC) \
-template<typename Index, int StorageOrder, int UpLo, bool ConjugateLhs, bool ConjugateRhs> \
-struct selfadjoint_matrix_vector_product_symv<EIGTYPE,Index,StorageOrder,UpLo,ConjugateLhs,ConjugateRhs> \
-{ \
-typedef Matrix<EIGTYPE,Dynamic,1,ColMajor> SYMVVector;\
-\
-static void run( \
-Index size, const EIGTYPE*  lhs, Index lhsStride, \
-const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* res, EIGTYPE alpha) \
-{ \
-  enum {\
-    IsRowMajor = StorageOrder==RowMajor ? 1 : 0, \
-    IsLower = UpLo == Lower ? 1 : 0 \
-  }; \
-  MKL_INT n=size, lda=lhsStride, incx=rhsIncr, incy=1; \
-  MKLTYPE alpha_, beta_; \
-  const EIGTYPE *x_ptr, myone(1); \
-  char uplo=(IsRowMajor) ? (IsLower ? 'U' : 'L') : (IsLower ? 'L' : 'U'); \
-  assign_scalar_eig2mkl(alpha_, alpha); \
-  assign_scalar_eig2mkl(beta_, myone); \
-  SYMVVector x_tmp; \
-  if (ConjugateRhs) { \
-    Map<const SYMVVector, 0, InnerStride<> > map_x(_rhs,size,1,InnerStride<>(incx)); \
-    x_tmp=map_x.conjugate(); \
-    x_ptr=x_tmp.data(); \
-    incx=1; \
-  } else x_ptr=_rhs; \
-  MKLFUNC(&uplo, &n, &alpha_, (const MKLTYPE*)lhs, &lda, (const MKLTYPE*)x_ptr, &incx, &beta_, (MKLTYPE*)res, &incy); \
-}\
-};
-
-EIGEN_MKL_SYMV_SPECIALIZATION(double,   double,        dsymv)
-EIGEN_MKL_SYMV_SPECIALIZATION(float,    float,         ssymv)
-EIGEN_MKL_SYMV_SPECIALIZATION(dcomplex, MKL_Complex16, zhemv)
-EIGEN_MKL_SYMV_SPECIALIZATION(scomplex, MKL_Complex8,  chemv)
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_SELFADJOINT_MATRIX_VECTOR_MKL_H
diff --git a/inst/include/Eigen/src/Core/products/SelfadjointProduct.h b/inst/include/Eigen/src/Core/products/SelfadjointProduct.h
index 6ca4ae6c..f1034655 100644
--- a/inst/include/Eigen/src/Core/products/SelfadjointProduct.h
+++ b/inst/include/Eigen/src/Core/products/SelfadjointProduct.h
@@ -11,113 +11,123 @@
 #define EIGEN_SELFADJOINT_PRODUCT_H
 
 /**********************************************************************
-* This file implements a self adjoint product: C += A A^T updating only
-* half of the selfadjoint matrix C.
-* It corresponds to the level 3 SYRK and level 2 SYR Blas routines.
-**********************************************************************/
+ * This file implements a self adjoint product: C += A A^T updating only
+ * half of the selfadjoint matrix C.
+ * It corresponds to the level 3 SYRK and level 2 SYR Blas routines.
+ **********************************************************************/
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
 
+namespace Eigen {
 
-template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
-struct selfadjoint_rank1_update<Scalar,Index,ColMajor,UpLo,ConjLhs,ConjRhs>
-{
-  static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha)
-  {
+template <typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_rank1_update<Scalar, Index, ColMajor, UpLo, ConjLhs, ConjRhs> {
+  static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha) {
     internal::conj_if<ConjRhs> cj;
-    typedef Map<const Matrix<Scalar,Dynamic,1> > OtherMap;
-    typedef typename internal::conditional<ConjLhs,typename OtherMap::ConjugateReturnType,const OtherMap&>::type ConjLhsType;
-    for (Index i=0; i<size; ++i)
-    {
-      Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i+(UpLo==Lower ? i : 0), (UpLo==Lower ? size-i : (i+1)))
-          += (alpha * cj(vecY[i])) * ConjLhsType(OtherMap(vecX+(UpLo==Lower ? i : 0),UpLo==Lower ? size-i : (i+1)));
+    typedef Map<const Matrix<Scalar, Dynamic, 1> > OtherMap;
+    typedef std::conditional_t<ConjLhs, typename OtherMap::ConjugateReturnType, const OtherMap&> ConjLhsType;
+    for (Index i = 0; i < size; ++i) {
+      Map<Matrix<Scalar, Dynamic, 1> >(mat + stride * i + (UpLo == Lower ? i : 0),
+                                       (UpLo == Lower ? size - i : (i + 1))) +=
+          (alpha * cj(vecY[i])) *
+          ConjLhsType(OtherMap(vecX + (UpLo == Lower ? i : 0), UpLo == Lower ? size - i : (i + 1)));
     }
   }
 };
 
-template<typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
-struct selfadjoint_rank1_update<Scalar,Index,RowMajor,UpLo,ConjLhs,ConjRhs>
-{
-  static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha)
-  {
-    selfadjoint_rank1_update<Scalar,Index,ColMajor,UpLo==Lower?Upper:Lower,ConjRhs,ConjLhs>::run(size,mat,stride,vecY,vecX,alpha);
+template <typename Scalar, typename Index, int UpLo, bool ConjLhs, bool ConjRhs>
+struct selfadjoint_rank1_update<Scalar, Index, RowMajor, UpLo, ConjLhs, ConjRhs> {
+  static void run(Index size, Scalar* mat, Index stride, const Scalar* vecX, const Scalar* vecY, const Scalar& alpha) {
+    selfadjoint_rank1_update<Scalar, Index, ColMajor, UpLo == Lower ? Upper : Lower, ConjRhs, ConjLhs>::run(
+        size, mat, stride, vecY, vecX, alpha);
   }
 };
 
-template<typename MatrixType, typename OtherType, int UpLo, bool OtherIsVector = OtherType::IsVectorAtCompileTime>
+template <typename MatrixType, typename OtherType, int UpLo, bool OtherIsVector = OtherType::IsVectorAtCompileTime>
 struct selfadjoint_product_selector;
 
-template<typename MatrixType, typename OtherType, int UpLo>
-struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,true>
-{
-  static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha)
-  {
+template <typename MatrixType, typename OtherType, int UpLo>
+struct selfadjoint_product_selector<MatrixType, OtherType, UpLo, true> {
+  static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha) {
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
     typedef internal::blas_traits<OtherType> OtherBlasTraits;
     typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType;
-    typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType;
-    typename internal::add_const_on_value_type<ActualOtherType>::type actualOther = OtherBlasTraits::extract(other.derived());
+    typedef internal::remove_all_t<ActualOtherType> ActualOtherType_;
+    internal::add_const_on_value_type_t<ActualOtherType> actualOther = OtherBlasTraits::extract(other.derived());
 
     Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived());
 
     enum {
-      StorageOrder = (internal::traits<MatrixType>::Flags&RowMajorBit) ? RowMajor : ColMajor,
-      UseOtherDirectly = _ActualOtherType::InnerStrideAtCompileTime==1
+      StorageOrder = (internal::traits<MatrixType>::Flags & RowMajorBit) ? RowMajor : ColMajor,
+      UseOtherDirectly = ActualOtherType_::InnerStrideAtCompileTime == 1
     };
-    internal::gemv_static_vector_if<Scalar,OtherType::SizeAtCompileTime,OtherType::MaxSizeAtCompileTime,!UseOtherDirectly> static_other;
-
-    ei_declare_aligned_stack_constructed_variable(Scalar, actualOtherPtr, other.size(),
-      (UseOtherDirectly ? const_cast<Scalar*>(actualOther.data()) : static_other.data()));
-      
-    if(!UseOtherDirectly)
-      Map<typename _ActualOtherType::PlainObject>(actualOtherPtr, actualOther.size()) = actualOther;
-    
-    selfadjoint_rank1_update<Scalar,Index,StorageOrder,UpLo,
-                              OtherBlasTraits::NeedToConjugate  && NumTraits<Scalar>::IsComplex,
-                            (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex>
-          ::run(other.size(), mat.data(), mat.outerStride(), actualOtherPtr, actualOtherPtr, actualAlpha);
+    internal::gemv_static_vector_if<Scalar, OtherType::SizeAtCompileTime, OtherType::MaxSizeAtCompileTime,
+                                    !UseOtherDirectly>
+        static_other;
+
+    ei_declare_aligned_stack_constructed_variable(
+        Scalar, actualOtherPtr, other.size(),
+        (UseOtherDirectly ? const_cast<Scalar*>(actualOther.data()) : static_other.data()));
+
+    if (!UseOtherDirectly)
+      Map<typename ActualOtherType_::PlainObject>(actualOtherPtr, actualOther.size()) = actualOther;
+
+    selfadjoint_rank1_update<
+        Scalar, Index, StorageOrder, UpLo, OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex,
+        (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex>::run(other.size(), mat.data(),
+                                                                                  mat.outerStride(), actualOtherPtr,
+                                                                                  actualOtherPtr, actualAlpha);
   }
 };
 
-template<typename MatrixType, typename OtherType, int UpLo>
-struct selfadjoint_product_selector<MatrixType,OtherType,UpLo,false>
-{
-  static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha)
-  {
+template <typename MatrixType, typename OtherType, int UpLo>
+struct selfadjoint_product_selector<MatrixType, OtherType, UpLo, false> {
+  static void run(MatrixType& mat, const OtherType& other, const typename MatrixType::Scalar& alpha) {
     typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
     typedef internal::blas_traits<OtherType> OtherBlasTraits;
     typedef typename OtherBlasTraits::DirectLinearAccessType ActualOtherType;
-    typedef typename internal::remove_all<ActualOtherType>::type _ActualOtherType;
-    typename internal::add_const_on_value_type<ActualOtherType>::type actualOther = OtherBlasTraits::extract(other.derived());
+    typedef internal::remove_all_t<ActualOtherType> ActualOtherType_;
+    internal::add_const_on_value_type_t<ActualOtherType> actualOther = OtherBlasTraits::extract(other.derived());
 
     Scalar actualAlpha = alpha * OtherBlasTraits::extractScalarFactor(other.derived());
 
-    enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 };
+    enum {
+      IsRowMajor = (internal::traits<MatrixType>::Flags & RowMajorBit) ? 1 : 0,
+      OtherIsRowMajor = ActualOtherType_::Flags & RowMajorBit ? 1 : 0
+    };
+
+    Index size = mat.cols();
+    Index depth = actualOther.cols();
+
+    typedef internal::gemm_blocking_space<IsRowMajor ? RowMajor : ColMajor, Scalar, Scalar,
+                                          MatrixType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime,
+                                          ActualOtherType_::MaxColsAtCompileTime>
+        BlockingType;
+
+    BlockingType blocking(size, size, depth, 1, false);
 
-    internal::general_matrix_matrix_triangular_product<Index,
-      Scalar, _ActualOtherType::Flags&RowMajorBit ? RowMajor : ColMajor,   OtherBlasTraits::NeedToConjugate  && NumTraits<Scalar>::IsComplex,
-      Scalar, _ActualOtherType::Flags&RowMajorBit ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex,
-      MatrixType::Flags&RowMajorBit ? RowMajor : ColMajor, UpLo>
-      ::run(mat.cols(), actualOther.cols(),
-            &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(),
-            mat.data(), mat.outerStride(), actualAlpha);
+    internal::general_matrix_matrix_triangular_product<
+        Index, Scalar, OtherIsRowMajor ? RowMajor : ColMajor,
+        OtherBlasTraits::NeedToConjugate && NumTraits<Scalar>::IsComplex, Scalar, OtherIsRowMajor ? ColMajor : RowMajor,
+        (!OtherBlasTraits::NeedToConjugate) && NumTraits<Scalar>::IsComplex, IsRowMajor ? RowMajor : ColMajor,
+        MatrixType::InnerStrideAtCompileTime, UpLo>::run(size, depth, actualOther.data(), actualOther.outerStride(),
+                                                         actualOther.data(), actualOther.outerStride(), mat.data(),
+                                                         mat.innerStride(), mat.outerStride(), actualAlpha, blocking);
   }
 };
 
 // high level API
 
-template<typename MatrixType, unsigned int UpLo>
-template<typename DerivedU>
-SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
-::rankUpdate(const MatrixBase<DerivedU>& u, const Scalar& alpha)
-{
-  selfadjoint_product_selector<MatrixType,DerivedU,UpLo>::run(_expression().const_cast_derived(), u.derived(), alpha);
+template <typename MatrixType, unsigned int UpLo>
+template <typename DerivedU>
+EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType, UpLo>& SelfAdjointView<MatrixType, UpLo>::rankUpdate(
+    const MatrixBase<DerivedU>& u, const Scalar& alpha) {
+  selfadjoint_product_selector<MatrixType, DerivedU, UpLo>::run(_expression().const_cast_derived(), u.derived(), alpha);
 
   return *this;
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SELFADJOINT_PRODUCT_H
+#endif  // EIGEN_SELFADJOINT_PRODUCT_H
diff --git a/inst/include/Eigen/src/Core/products/SelfadjointRank2Update.h b/inst/include/Eigen/src/Core/products/SelfadjointRank2Update.h
index 8594a97c..9c234ec2 100644
--- a/inst/include/Eigen/src/Core/products/SelfadjointRank2Update.h
+++ b/inst/include/Eigen/src/Core/products/SelfadjointRank2Update.h
@@ -10,7 +10,10 @@
 #ifndef EIGEN_SELFADJOINTRANK2UPTADE_H
 #define EIGEN_SELFADJOINTRANK2UPTADE_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
@@ -18,76 +21,75 @@ namespace internal {
  * It corresponds to the Level2 syr2 BLAS routine
  */
 
-template<typename Scalar, typename Index, typename UType, typename VType, int UpLo>
+template <typename Scalar, typename Index, typename UType, typename VType, int UpLo>
 struct selfadjoint_rank2_update_selector;
 
-template<typename Scalar, typename Index, typename UType, typename VType>
-struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Lower>
-{
-  static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
-  {
+template <typename Scalar, typename Index, typename UType, typename VType>
+struct selfadjoint_rank2_update_selector<Scalar, Index, UType, VType, Lower> {
+  static EIGEN_DEVICE_FUNC void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) {
     const Index size = u.size();
-    for (Index i=0; i<size; ++i)
-    {
-      Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i+i, size-i) +=
-                        (numext::conj(alpha) * numext::conj(u.coeff(i))) * v.tail(size-i)
-                      + (alpha * numext::conj(v.coeff(i))) * u.tail(size-i);
+    for (Index i = 0; i < size; ++i) {
+      Map<Matrix<Scalar, Dynamic, 1>>(mat + stride * i + i, size - i) +=
+          (numext::conj(alpha) * numext::conj(u.coeff(i))) * v.tail(size - i) +
+          (alpha * numext::conj(v.coeff(i))) * u.tail(size - i);
     }
   }
 };
 
-template<typename Scalar, typename Index, typename UType, typename VType>
-struct selfadjoint_rank2_update_selector<Scalar,Index,UType,VType,Upper>
-{
-  static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha)
-  {
+template <typename Scalar, typename Index, typename UType, typename VType>
+struct selfadjoint_rank2_update_selector<Scalar, Index, UType, VType, Upper> {
+  static void run(Scalar* mat, Index stride, const UType& u, const VType& v, const Scalar& alpha) {
     const Index size = u.size();
-    for (Index i=0; i<size; ++i)
-      Map<Matrix<Scalar,Dynamic,1> >(mat+stride*i, i+1) +=
-                        (numext::conj(alpha)  * numext::conj(u.coeff(i))) * v.head(i+1)
-                      + (alpha * numext::conj(v.coeff(i))) * u.head(i+1);
+    for (Index i = 0; i < size; ++i)
+      Map<Matrix<Scalar, Dynamic, 1>>(mat + stride * i, i + 1) +=
+          (numext::conj(alpha) * numext::conj(u.coeff(i))) * v.head(i + 1) +
+          (alpha * numext::conj(v.coeff(i))) * u.head(i + 1);
   }
 };
 
-template<bool Cond, typename T> struct conj_expr_if
-  : conditional<!Cond, const T&,
-      CwiseUnaryOp<scalar_conjugate_op<typename traits<T>::Scalar>,T> > {};
+template <bool Cond, typename T>
+using conj_expr_if =
+    std::conditional<!Cond, const T&, CwiseUnaryOp<scalar_conjugate_op<typename traits<T>::Scalar>, T>>;
 
-} // end namespace internal
+}  // end namespace internal
 
-template<typename MatrixType, unsigned int UpLo>
-template<typename DerivedU, typename DerivedV>
-SelfAdjointView<MatrixType,UpLo>& SelfAdjointView<MatrixType,UpLo>
-::rankUpdate(const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha)
-{
+template <typename MatrixType, unsigned int UpLo>
+template <typename DerivedU, typename DerivedV>
+EIGEN_DEVICE_FUNC SelfAdjointView<MatrixType, UpLo>& SelfAdjointView<MatrixType, UpLo>::rankUpdate(
+    const MatrixBase<DerivedU>& u, const MatrixBase<DerivedV>& v, const Scalar& alpha) {
   typedef internal::blas_traits<DerivedU> UBlasTraits;
   typedef typename UBlasTraits::DirectLinearAccessType ActualUType;
-  typedef typename internal::remove_all<ActualUType>::type _ActualUType;
-  typename internal::add_const_on_value_type<ActualUType>::type actualU = UBlasTraits::extract(u.derived());
+  typedef internal::remove_all_t<ActualUType> ActualUType_;
+  internal::add_const_on_value_type_t<ActualUType> actualU = UBlasTraits::extract(u.derived());
 
   typedef internal::blas_traits<DerivedV> VBlasTraits;
   typedef typename VBlasTraits::DirectLinearAccessType ActualVType;
-  typedef typename internal::remove_all<ActualVType>::type _ActualVType;
-  typename internal::add_const_on_value_type<ActualVType>::type actualV = VBlasTraits::extract(v.derived());
+  typedef internal::remove_all_t<ActualVType> ActualVType_;
+  internal::add_const_on_value_type_t<ActualVType> actualV = VBlasTraits::extract(v.derived());
 
   // If MatrixType is row major, then we use the routine for lower triangular in the upper triangular case and
   // vice versa, and take the complex conjugate of all coefficients and vector entries.
 
-  enum { IsRowMajor = (internal::traits<MatrixType>::Flags&RowMajorBit) ? 1 : 0 };
-  Scalar actualAlpha = alpha * UBlasTraits::extractScalarFactor(u.derived())
-                             * numext::conj(VBlasTraits::extractScalarFactor(v.derived()));
-  if (IsRowMajor)
-    actualAlpha = numext::conj(actualAlpha);
-
-  internal::selfadjoint_rank2_update_selector<Scalar, Index,
-    typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ UBlasTraits::NeedToConjugate,_ActualUType>::type>::type,
-    typename internal::remove_all<typename internal::conj_expr_if<IsRowMajor ^ VBlasTraits::NeedToConjugate,_ActualVType>::type>::type,
-    (IsRowMajor ? int(UpLo==Upper ? Lower : Upper) : UpLo)>
-    ::run(_expression().const_cast_derived().data(),_expression().outerStride(),actualU,actualV,actualAlpha);
+  enum { IsRowMajor = (internal::traits<MatrixType>::Flags & RowMajorBit) ? 1 : 0 };
+  Scalar actualAlpha = alpha * UBlasTraits::extractScalarFactor(u.derived()) *
+                       numext::conj(VBlasTraits::extractScalarFactor(v.derived()));
+  if (IsRowMajor) actualAlpha = numext::conj(actualAlpha);
+
+  typedef internal::remove_all_t<
+      typename internal::conj_expr_if<int(IsRowMajor) ^ int(UBlasTraits::NeedToConjugate), ActualUType_>::type>
+      UType;
+  typedef internal::remove_all_t<
+      typename internal::conj_expr_if<int(IsRowMajor) ^ int(VBlasTraits::NeedToConjugate), ActualVType_>::type>
+      VType;
+  internal::selfadjoint_rank2_update_selector<Scalar, Index, UType, VType,
+                                              (IsRowMajor ? int(UpLo == Upper ? Lower : Upper)
+                                                          : UpLo)>::run(_expression().const_cast_derived().data(),
+                                                                        _expression().outerStride(), UType(actualU),
+                                                                        VType(actualV), actualAlpha);
 
   return *this;
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SELFADJOINTRANK2UPTADE_H
+#endif  // EIGEN_SELFADJOINTRANK2UPTADE_H
diff --git a/inst/include/Eigen/src/Core/products/TriangularMatrixMatrix.h b/inst/include/Eigen/src/Core/products/TriangularMatrixMatrix.h
index 8110507b..a0d05ef8 100644
--- a/inst/include/Eigen/src/Core/products/TriangularMatrixMatrix.h
+++ b/inst/include/Eigen/src/Core/products/TriangularMatrixMatrix.h
@@ -10,7 +10,10 @@
 #ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_H
 #define EIGEN_TRIANGULAR_MATRIX_MATRIX_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
@@ -18,10 +21,10 @@ namespace internal {
 // struct gemm_pack_lhs_triangular
 // {
 //   Matrix<Scalar,mr,mr,
-//   void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* _lhs, int lhsStride, int depth, int rows)
+//   void operator()(Scalar* blockA, const EIGEN_RESTRICT Scalar* lhs_, int lhsStride, int depth, int rows)
 //   {
 //     conj_if<NumTraits<Scalar>::IsComplex && Conjugate> cj;
-//     const_blas_data_mapper<Scalar, StorageOrder> lhs(_lhs,lhsStride);
+//     const_blas_data_mapper<Scalar, StorageOrder> lhs(lhs_,lhsStride);
 //     int count = 0;
 //     const int peeled_mc = (rows/mr)*mr;
 //     for(int i=0; i<peeled_mc; i+=mr)
@@ -41,387 +44,354 @@ namespace internal {
 /* Optimized triangular matrix * matrix (_TRMM++) product built on top of
  * the general matrix matrix product.
  */
-template <typename Scalar, typename Index,
-          int Mode, bool LhsIsTriangular,
-          int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs,
-          int ResStorageOrder, int Version = Specialized>
+template <typename Scalar, typename Index, int Mode, bool LhsIsTriangular, int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs, int ResStorageOrder, int ResInnerStride, int Version = Specialized>
 struct product_triangular_matrix_matrix;
 
-template <typename Scalar, typename Index,
-          int Mode, bool LhsIsTriangular,
-          int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs, int Version>
-struct product_triangular_matrix_matrix<Scalar,Index,Mode,LhsIsTriangular,
-                                           LhsStorageOrder,ConjugateLhs,
-                                           RhsStorageOrder,ConjugateRhs,RowMajor,Version>
-{
-  static EIGEN_STRONG_INLINE void run(
-    Index rows, Index cols, Index depth,
-    const Scalar* lhs, Index lhsStride,
-    const Scalar* rhs, Index rhsStride,
-    Scalar* res,       Index resStride,
-    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
-  {
-    product_triangular_matrix_matrix<Scalar, Index,
-      (Mode&(UnitDiag|ZeroDiag)) | ((Mode&Upper) ? Lower : Upper),
-      (!LhsIsTriangular),
-      RhsStorageOrder==RowMajor ? ColMajor : RowMajor,
-      ConjugateRhs,
-      LhsStorageOrder==RowMajor ? ColMajor : RowMajor,
-      ConjugateLhs,
-      ColMajor>
-      ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking);
+template <typename Scalar, typename Index, int Mode, bool LhsIsTriangular, int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs, int ResInnerStride, int Version>
+struct product_triangular_matrix_matrix<Scalar, Index, Mode, LhsIsTriangular, LhsStorageOrder, ConjugateLhs,
+                                        RhsStorageOrder, ConjugateRhs, RowMajor, ResInnerStride, Version> {
+  static EIGEN_STRONG_INLINE void run(Index rows, Index cols, Index depth, const Scalar* lhs, Index lhsStride,
+                                      const Scalar* rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride,
+                                      const Scalar& alpha, level3_blocking<Scalar, Scalar>& blocking) {
+    product_triangular_matrix_matrix<Scalar, Index, (Mode & (UnitDiag | ZeroDiag)) | ((Mode & Upper) ? Lower : Upper),
+                                     (!LhsIsTriangular), RhsStorageOrder == RowMajor ? ColMajor : RowMajor,
+                                     ConjugateRhs, LhsStorageOrder == RowMajor ? ColMajor : RowMajor, ConjugateLhs,
+                                     ColMajor, ResInnerStride>::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride,
+                                                                    res, resIncr, resStride, alpha, blocking);
   }
 };
 
 // implements col-major += alpha * op(triangular) * op(general)
-template <typename Scalar, typename Index, int Mode,
-          int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs, int Version>
-struct product_triangular_matrix_matrix<Scalar,Index,Mode,true,
-                                           LhsStorageOrder,ConjugateLhs,
-                                           RhsStorageOrder,ConjugateRhs,ColMajor,Version>
-{
-  
-  typedef gebp_traits<Scalar,Scalar> Traits;
+template <typename Scalar, typename Index, int Mode, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder,
+          bool ConjugateRhs, int ResInnerStride, int Version>
+struct product_triangular_matrix_matrix<Scalar, Index, Mode, true, LhsStorageOrder, ConjugateLhs, RhsStorageOrder,
+                                        ConjugateRhs, ColMajor, ResInnerStride, Version> {
+  typedef gebp_traits<Scalar, Scalar> Traits;
   enum {
-    SmallPanelWidth   = 2 * EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
-    IsLower = (Mode&Lower) == Lower,
-    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
+    SmallPanelWidth = 2 * plain_enum_max(Traits::mr, Traits::nr),
+    IsLower = (Mode & Lower) == Lower,
+    SetDiag = (Mode & (ZeroDiag | UnitDiag)) ? 0 : 1
   };
 
-  static EIGEN_DONT_INLINE void run(
-    Index _rows, Index _cols, Index _depth,
-    const Scalar* _lhs, Index lhsStride,
-    const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
-    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
+  static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, Index _depth, const Scalar* lhs_, Index lhsStride,
+                                    const Scalar* rhs_, Index rhsStride, Scalar* res, Index resIncr, Index resStride,
+                                    const Scalar& alpha, level3_blocking<Scalar, Scalar>& blocking);
 };
 
-template <typename Scalar, typename Index, int Mode,
-          int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,true,
-                                                        LhsStorageOrder,ConjugateLhs,
-                                                        RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
-    Index _rows, Index _cols, Index _depth,
-    const Scalar* _lhs, Index lhsStride,
-    const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
-    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
-  {
-    // strip zeros
-    Index diagSize  = (std::min)(_rows,_depth);
-    Index rows      = IsLower ? _rows : diagSize;
-    Index depth     = IsLower ? diagSize : _depth;
-    Index cols      = _cols;
-    
-    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-    const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
-
-    Index kc = blocking.kc();                   // cache block size along the K direction
-    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
-
-    std::size_t sizeA = kc*mc;
-    std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
-
-    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,LhsStorageOrder> triangularBuffer;
-    triangularBuffer.setZero();
-    if((Mode&ZeroDiag)==ZeroDiag)
-      triangularBuffer.diagonal().setZero();
-    else
-      triangularBuffer.diagonal().setOnes();
-
-    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
-
-    for(Index k2=IsLower ? depth : 0;
-        IsLower ? k2>0 : k2<depth;
-        IsLower ? k2-=kc : k2+=kc)
-    {
-      Index actual_kc = (std::min)(IsLower ? k2 : depth-k2, kc);
-      Index actual_k2 = IsLower ? k2-actual_kc : k2;
-
-      // align blocks with the end of the triangular part for trapezoidal lhs
-      if((!IsLower)&&(k2<rows)&&(k2+actual_kc>rows))
-      {
-        actual_kc = rows-k2;
-        k2 = k2+actual_kc-kc;
-      }
+template <typename Scalar, typename Index, int Mode, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder,
+          bool ConjugateRhs, int ResInnerStride, int Version>
+EIGEN_DONT_INLINE void product_triangular_matrix_matrix<
+    Scalar, Index, Mode, true, LhsStorageOrder, ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, ResInnerStride,
+    Version>::run(Index _rows, Index _cols, Index _depth, const Scalar* lhs_, Index lhsStride, const Scalar* rhs_,
+                  Index rhsStride, Scalar* res_, Index resIncr, Index resStride, const Scalar& alpha,
+                  level3_blocking<Scalar, Scalar>& blocking) {
+  // strip zeros
+  Index diagSize = (std::min)(_rows, _depth);
+  Index rows = IsLower ? _rows : diagSize;
+  Index depth = IsLower ? diagSize : _depth;
+  Index cols = _cols;
+
+  typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+  typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+  LhsMapper lhs(lhs_, lhsStride);
+  RhsMapper rhs(rhs_, rhsStride);
+  ResMapper res(res_, resStride, resIncr);
+
+  Index kc = blocking.kc();                    // cache block size along the K direction
+  Index mc = (std::min)(rows, blocking.mc());  // cache block size along the M direction
+  // The small panel size must not be larger than blocking size.
+  // Usually this should never be the case because SmallPanelWidth^2 is very small
+  // compared to L2 cache size, but let's be safe:
+  Index panelWidth = (std::min)(Index(SmallPanelWidth), (std::min)(kc, mc));
+
+  std::size_t sizeA = kc * mc;
+  std::size_t sizeB = kc * cols;
+
+  ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+  ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+  Matrix<Scalar, SmallPanelWidth, SmallPanelWidth, LhsStorageOrder> triangularBuffer;
+  triangularBuffer.setZero();
+  if ((Mode & ZeroDiag) == ZeroDiag)
+    triangularBuffer.diagonal().setZero();
+  else
+    triangularBuffer.diagonal().setOnes();
+
+  gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+  gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing,
+                LhsStorageOrder>
+      pack_lhs;
+  gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+
+  for (Index k2 = IsLower ? depth : 0; IsLower ? k2 > 0 : k2 < depth; IsLower ? k2 -= kc : k2 += kc) {
+    Index actual_kc = (std::min)(IsLower ? k2 : depth - k2, kc);
+    Index actual_k2 = IsLower ? k2 - actual_kc : k2;
+
+    // align blocks with the end of the triangular part for trapezoidal lhs
+    if ((!IsLower) && (k2 < rows) && (k2 + actual_kc > rows)) {
+      actual_kc = rows - k2;
+      k2 = k2 + actual_kc - kc;
+    }
 
-      pack_rhs(blockB, &rhs(actual_k2,0), rhsStride, actual_kc, cols);
-
-      // the selected lhs's panel has to be split in three different parts:
-      //  1 - the part which is zero => skip it
-      //  2 - the diagonal block => special kernel
-      //  3 - the dense panel below (lower case) or above (upper case) the diagonal block => GEPP
-
-      // the block diagonal, if any:
-      if(IsLower || actual_k2<rows)
-      {
-        // for each small vertical panels of lhs
-        for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
-        {
-          Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
-          Index lengthTarget = IsLower ? actual_kc-k1-actualPanelWidth : k1;
-          Index startBlock   = actual_k2+k1;
-          Index blockBOffset = k1;
-
-          // => GEBP with the micro triangular block
-          // The trick is to pack this micro block while filling the opposite triangular part with zeros.
-          // To this end we do an extra triangular copy to a small temporary buffer
-          for (Index k=0;k<actualPanelWidth;++k)
-          {
-            if (SetDiag)
-              triangularBuffer.coeffRef(k,k) = lhs(startBlock+k,startBlock+k);
-            for (Index i=IsLower ? k+1 : 0; IsLower ? i<actualPanelWidth : i<k; ++i)
-              triangularBuffer.coeffRef(i,k) = lhs(startBlock+i,startBlock+k);
-          }
-          pack_lhs(blockA, triangularBuffer.data(), triangularBuffer.outerStride(), actualPanelWidth, actualPanelWidth);
-
-          gebp_kernel(res+startBlock, resStride, blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
-                      actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
-
-          // GEBP with remaining micro panel
-          if (lengthTarget>0)
-          {
-            Index startTarget  = IsLower ? actual_k2+k1+actualPanelWidth : actual_k2;
-
-            pack_lhs(blockA, &lhs(startTarget,startBlock), lhsStride, actualPanelWidth, lengthTarget);
-
-            gebp_kernel(res+startTarget, resStride, blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
-                        actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
-          }
+    pack_rhs(blockB, rhs.getSubMapper(actual_k2, 0), actual_kc, cols);
+
+    // the selected lhs's panel has to be split in three different parts:
+    //  1 - the part which is zero => skip it
+    //  2 - the diagonal block => special kernel
+    //  3 - the dense panel below (lower case) or above (upper case) the diagonal block => GEPP
+
+    // the block diagonal, if any:
+    if (IsLower || actual_k2 < rows) {
+      // for each small vertical panels of lhs
+      for (Index k1 = 0; k1 < actual_kc; k1 += panelWidth) {
+        Index actualPanelWidth = std::min<Index>(actual_kc - k1, panelWidth);
+        Index lengthTarget = IsLower ? actual_kc - k1 - actualPanelWidth : k1;
+        Index startBlock = actual_k2 + k1;
+        Index blockBOffset = k1;
+
+        // => GEBP with the micro triangular block
+        // The trick is to pack this micro block while filling the opposite triangular part with zeros.
+        // To this end we do an extra triangular copy to a small temporary buffer
+        for (Index k = 0; k < actualPanelWidth; ++k) {
+          if (SetDiag) triangularBuffer.coeffRef(k, k) = lhs(startBlock + k, startBlock + k);
+          for (Index i = IsLower ? k + 1 : 0; IsLower ? i < actualPanelWidth : i < k; ++i)
+            triangularBuffer.coeffRef(i, k) = lhs(startBlock + i, startBlock + k);
         }
-      }
-      // the part below (lower case) or above (upper case) the diagonal => GEPP
-      {
-        Index start = IsLower ? k2 : 0;
-        Index end   = IsLower ? rows : (std::min)(actual_k2,rows);
-        for(Index i2=start; i2<end; i2+=mc)
-        {
-          const Index actual_mc = (std::min)(i2+mc,end)-i2;
-          gemm_pack_lhs<Scalar, Index, Traits::mr,Traits::LhsProgress, LhsStorageOrder,false>()
-            (blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
-
-          gebp_kernel(res+i2, resStride, blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0, blockW);
+        pack_lhs(blockA, LhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()), actualPanelWidth,
+                 actualPanelWidth);
+
+        gebp_kernel(res.getSubMapper(startBlock, 0), blockA, blockB, actualPanelWidth, actualPanelWidth, cols, alpha,
+                    actualPanelWidth, actual_kc, 0, blockBOffset);
+
+        // GEBP with remaining micro panel
+        if (lengthTarget > 0) {
+          Index startTarget = IsLower ? actual_k2 + k1 + actualPanelWidth : actual_k2;
+
+          pack_lhs(blockA, lhs.getSubMapper(startTarget, startBlock), actualPanelWidth, lengthTarget);
+
+          gebp_kernel(res.getSubMapper(startTarget, 0), blockA, blockB, lengthTarget, actualPanelWidth, cols, alpha,
+                      actualPanelWidth, actual_kc, 0, blockBOffset);
         }
       }
     }
+    // the part below (lower case) or above (upper case) the diagonal => GEPP
+    {
+      Index start = IsLower ? k2 : 0;
+      Index end = IsLower ? rows : (std::min)(actual_k2, rows);
+      for (Index i2 = start; i2 < end; i2 += mc) {
+        const Index actual_mc = (std::min)(i2 + mc, end) - i2;
+        gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing,
+                      LhsStorageOrder, false>()(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
+
+        gebp_kernel(res.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, alpha, -1, -1, 0, 0);
+      }
+    }
   }
+}
 
 // implements col-major += alpha * op(general) * op(triangular)
-template <typename Scalar, typename Index, int Mode,
-          int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs, int Version>
-struct product_triangular_matrix_matrix<Scalar,Index,Mode,false,
-                                        LhsStorageOrder,ConjugateLhs,
-                                        RhsStorageOrder,ConjugateRhs,ColMajor,Version>
-{
-  typedef gebp_traits<Scalar,Scalar> Traits;
+template <typename Scalar, typename Index, int Mode, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder,
+          bool ConjugateRhs, int ResInnerStride, int Version>
+struct product_triangular_matrix_matrix<Scalar, Index, Mode, false, LhsStorageOrder, ConjugateLhs, RhsStorageOrder,
+                                        ConjugateRhs, ColMajor, ResInnerStride, Version> {
+  typedef gebp_traits<Scalar, Scalar> Traits;
   enum {
-    SmallPanelWidth   = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
-    IsLower = (Mode&Lower) == Lower,
-    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1
+    SmallPanelWidth = plain_enum_max(Traits::mr, Traits::nr),
+    IsLower = (Mode & Lower) == Lower,
+    SetDiag = (Mode & (ZeroDiag | UnitDiag)) ? 0 : 1
   };
 
-  static EIGEN_DONT_INLINE void run(
-    Index _rows, Index _cols, Index _depth,
-    const Scalar* _lhs, Index lhsStride,
-    const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
-    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking);
+  static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, Index _depth, const Scalar* lhs_, Index lhsStride,
+                                    const Scalar* rhs_, Index rhsStride, Scalar* res, Index resIncr, Index resStride,
+                                    const Scalar& alpha, level3_blocking<Scalar, Scalar>& blocking);
 };
 
-template <typename Scalar, typename Index, int Mode,
-          int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs, int Version>
-EIGEN_DONT_INLINE void product_triangular_matrix_matrix<Scalar,Index,Mode,false,
-                                                        LhsStorageOrder,ConjugateLhs,
-                                                        RhsStorageOrder,ConjugateRhs,ColMajor,Version>::run(
-    Index _rows, Index _cols, Index _depth,
-    const Scalar* _lhs, Index lhsStride,
-    const Scalar* _rhs, Index rhsStride,
-    Scalar* res,        Index resStride,
-    const Scalar& alpha, level3_blocking<Scalar,Scalar>& blocking)
-  {
-    // strip zeros
-    Index diagSize  = (std::min)(_cols,_depth);
-    Index rows      = _rows;
-    Index depth     = IsLower ? _depth : diagSize;
-    Index cols      = IsLower ? diagSize : _cols;
-    
-    const_blas_data_mapper<Scalar, Index, LhsStorageOrder> lhs(_lhs,lhsStride);
-    const_blas_data_mapper<Scalar, Index, RhsStorageOrder> rhs(_rhs,rhsStride);
-
-    Index kc = blocking.kc();                   // cache block size along the K direction
-    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
-
-    std::size_t sizeA = kc*mc;
-    std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
-
-    Matrix<Scalar,SmallPanelWidth,SmallPanelWidth,RhsStorageOrder> triangularBuffer;
-    triangularBuffer.setZero();
-    if((Mode&ZeroDiag)==ZeroDiag)
-      triangularBuffer.diagonal().setZero();
-    else
-      triangularBuffer.diagonal().setOnes();
-
-    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, LhsStorageOrder> pack_lhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
-
-    for(Index k2=IsLower ? 0 : depth;
-        IsLower ? k2<depth  : k2>0;
-        IsLower ? k2+=kc   : k2-=kc)
-    {
-      Index actual_kc = (std::min)(IsLower ? depth-k2 : k2, kc);
-      Index actual_k2 = IsLower ? k2 : k2-actual_kc;
-
-      // align blocks with the end of the triangular part for trapezoidal rhs
-      if(IsLower && (k2<cols) && (actual_k2+actual_kc>cols))
-      {
-        actual_kc = cols-k2;
-        k2 = actual_k2 + actual_kc - kc;
-      }
+template <typename Scalar, typename Index, int Mode, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder,
+          bool ConjugateRhs, int ResInnerStride, int Version>
+EIGEN_DONT_INLINE void product_triangular_matrix_matrix<
+    Scalar, Index, Mode, false, LhsStorageOrder, ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, ResInnerStride,
+    Version>::run(Index _rows, Index _cols, Index _depth, const Scalar* lhs_, Index lhsStride, const Scalar* rhs_,
+                  Index rhsStride, Scalar* res_, Index resIncr, Index resStride, const Scalar& alpha,
+                  level3_blocking<Scalar, Scalar>& blocking) {
+  const Index PacketBytes = packet_traits<Scalar>::size * sizeof(Scalar);
+  // strip zeros
+  Index diagSize = (std::min)(_cols, _depth);
+  Index rows = _rows;
+  Index depth = IsLower ? _depth : diagSize;
+  Index cols = IsLower ? diagSize : _cols;
+
+  typedef const_blas_data_mapper<Scalar, Index, LhsStorageOrder> LhsMapper;
+  typedef const_blas_data_mapper<Scalar, Index, RhsStorageOrder> RhsMapper;
+  typedef blas_data_mapper<typename Traits::ResScalar, Index, ColMajor, Unaligned, ResInnerStride> ResMapper;
+  LhsMapper lhs(lhs_, lhsStride);
+  RhsMapper rhs(rhs_, rhsStride);
+  ResMapper res(res_, resStride, resIncr);
+
+  Index kc = blocking.kc();                    // cache block size along the K direction
+  Index mc = (std::min)(rows, blocking.mc());  // cache block size along the M direction
+
+  std::size_t sizeA = kc * mc;
+  std::size_t sizeB = kc * cols + EIGEN_MAX_ALIGN_BYTES / sizeof(Scalar);
+
+  ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+  ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+  Matrix<Scalar, SmallPanelWidth, SmallPanelWidth, RhsStorageOrder> triangularBuffer;
+  triangularBuffer.setZero();
+  if ((Mode & ZeroDiag) == ZeroDiag)
+    triangularBuffer.diagonal().setZero();
+  else
+    triangularBuffer.diagonal().setOnes();
+
+  gebp_kernel<Scalar, Scalar, Index, ResMapper, Traits::mr, Traits::nr, ConjugateLhs, ConjugateRhs> gebp_kernel;
+  gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing,
+                LhsStorageOrder>
+      pack_lhs;
+  gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+  gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder, false, true> pack_rhs_panel;
+
+  for (Index k2 = IsLower ? 0 : depth; IsLower ? k2 < depth : k2 > 0; IsLower ? k2 += kc : k2 -= kc) {
+    Index actual_kc = (std::min)(IsLower ? depth - k2 : k2, kc);
+    Index actual_k2 = IsLower ? k2 : k2 - actual_kc;
+
+    // align blocks with the end of the triangular part for trapezoidal rhs
+    if (IsLower && (k2 < cols) && (actual_k2 + actual_kc > cols)) {
+      actual_kc = cols - k2;
+      k2 = actual_k2 + actual_kc - kc;
+    }
 
-      // remaining size
-      Index rs = IsLower ? (std::min)(cols,actual_k2) : cols - k2;
-      // size of the triangular part
-      Index ts = (IsLower && actual_k2>=cols) ? 0 : actual_kc;
-
-      Scalar* geb = blockB+ts*ts;
-
-      pack_rhs(geb, &rhs(actual_k2,IsLower ? 0 : k2), rhsStride, actual_kc, rs);
-
-      // pack the triangular part of the rhs padding the unrolled blocks with zeros
-      if(ts>0)
-      {
-        for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
-        {
-          Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
-          Index actual_j2 = actual_k2 + j2;
-          Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
-          Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
-          // general part
-          pack_rhs_panel(blockB+j2*actual_kc,
-                         &rhs(actual_k2+panelOffset, actual_j2), rhsStride,
-                         panelLength, actualPanelWidth,
-                         actual_kc, panelOffset);
-
-          // append the triangular part via a temporary buffer
-          for (Index j=0;j<actualPanelWidth;++j)
-          {
-            if (SetDiag)
-              triangularBuffer.coeffRef(j,j) = rhs(actual_j2+j,actual_j2+j);
-            for (Index k=IsLower ? j+1 : 0; IsLower ? k<actualPanelWidth : k<j; ++k)
-              triangularBuffer.coeffRef(k,j) = rhs(actual_j2+k,actual_j2+j);
-          }
-
-          pack_rhs_panel(blockB+j2*actual_kc,
-                         triangularBuffer.data(), triangularBuffer.outerStride(),
-                         actualPanelWidth, actualPanelWidth,
-                         actual_kc, j2);
+    // remaining size
+    Index rs = IsLower ? (std::min)(cols, actual_k2) : cols - k2;
+    // size of the triangular part
+    Index ts = (IsLower && actual_k2 >= cols) ? 0 : actual_kc;
+
+    Scalar* geb = blockB + ts * ts;
+    geb = geb + internal::first_aligned<PacketBytes>(geb, PacketBytes / sizeof(Scalar));
+
+    pack_rhs(geb, rhs.getSubMapper(actual_k2, IsLower ? 0 : k2), actual_kc, rs);
+
+    // pack the triangular part of the rhs padding the unrolled blocks with zeros
+    if (ts > 0) {
+      for (Index j2 = 0; j2 < actual_kc; j2 += SmallPanelWidth) {
+        Index actualPanelWidth = std::min<Index>(actual_kc - j2, SmallPanelWidth);
+        Index actual_j2 = actual_k2 + j2;
+        Index panelOffset = IsLower ? j2 + actualPanelWidth : 0;
+        Index panelLength = IsLower ? actual_kc - j2 - actualPanelWidth : j2;
+        // general part
+        pack_rhs_panel(blockB + j2 * actual_kc, rhs.getSubMapper(actual_k2 + panelOffset, actual_j2), panelLength,
+                       actualPanelWidth, actual_kc, panelOffset);
+
+        // append the triangular part via a temporary buffer
+        for (Index j = 0; j < actualPanelWidth; ++j) {
+          if (SetDiag) triangularBuffer.coeffRef(j, j) = rhs(actual_j2 + j, actual_j2 + j);
+          for (Index k = IsLower ? j + 1 : 0; IsLower ? k < actualPanelWidth : k < j; ++k)
+            triangularBuffer.coeffRef(k, j) = rhs(actual_j2 + k, actual_j2 + j);
         }
+
+        pack_rhs_panel(blockB + j2 * actual_kc, RhsMapper(triangularBuffer.data(), triangularBuffer.outerStride()),
+                       actualPanelWidth, actualPanelWidth, actual_kc, j2);
       }
+    }
+
+    for (Index i2 = 0; i2 < rows; i2 += mc) {
+      const Index actual_mc = (std::min)(mc, rows - i2);
+      pack_lhs(blockA, lhs.getSubMapper(i2, actual_k2), actual_kc, actual_mc);
 
-      for (Index i2=0; i2<rows; i2+=mc)
-      {
-        const Index actual_mc = (std::min)(mc,rows-i2);
-        pack_lhs(blockA, &lhs(i2, actual_k2), lhsStride, actual_kc, actual_mc);
-
-        // triangular kernel
-        if(ts>0)
-        {
-          for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
-          {
-            Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
-            Index panelLength = IsLower ? actual_kc-j2 : j2+actualPanelWidth;
-            Index blockOffset = IsLower ? j2 : 0;
-
-            gebp_kernel(res+i2+(actual_k2+j2)*resStride, resStride,
-                        blockA, blockB+j2*actual_kc,
-                        actual_mc, panelLength, actualPanelWidth,
-                        alpha,
-                        actual_kc, actual_kc,  // strides
-                        blockOffset, blockOffset,// offsets
-                        blockW); // workspace
-          }
+      // triangular kernel
+      if (ts > 0) {
+        for (Index j2 = 0; j2 < actual_kc; j2 += SmallPanelWidth) {
+          Index actualPanelWidth = std::min<Index>(actual_kc - j2, SmallPanelWidth);
+          Index panelLength = IsLower ? actual_kc - j2 : j2 + actualPanelWidth;
+          Index blockOffset = IsLower ? j2 : 0;
+
+          gebp_kernel(res.getSubMapper(i2, actual_k2 + j2), blockA, blockB + j2 * actual_kc, actual_mc, panelLength,
+                      actualPanelWidth, alpha, actual_kc, actual_kc,  // strides
+                      blockOffset, blockOffset);                      // offsets
         }
-        gebp_kernel(res+i2+(IsLower ? 0 : k2)*resStride, resStride,
-                    blockA, geb, actual_mc, actual_kc, rs,
-                    alpha,
-                    -1, -1, 0, 0, blockW);
       }
+      gebp_kernel(res.getSubMapper(i2, IsLower ? 0 : k2), blockA, geb, actual_mc, actual_kc, rs, alpha, -1, -1, 0, 0);
     }
   }
+}
 
 /***************************************************************************
-* Wrapper to product_triangular_matrix_matrix
-***************************************************************************/
-
-template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
-struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false> >
-  : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs> >
-{};
-
-} // end namespace internal
-
-template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
-struct TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>
-  : public ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,false>, Lhs, Rhs >
-{
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
-
-  TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
-  {
-    typename internal::add_const_on_value_type<ActualLhsType>::type lhs = LhsBlasTraits::extract(m_lhs);
-    typename internal::add_const_on_value_type<ActualRhsType>::type rhs = RhsBlasTraits::extract(m_rhs);
-
-    Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(m_lhs)
-                               * RhsBlasTraits::extractScalarFactor(m_rhs);
-
-    typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar,
-              Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType;
-
-    enum { IsLower = (Mode&Lower) == Lower };
-    Index stripedRows  = ((!LhsIsTriangular) || (IsLower))  ? lhs.rows() : (std::min)(lhs.rows(),lhs.cols());
-    Index stripedCols  = ((LhsIsTriangular)  || (!IsLower)) ? rhs.cols() : (std::min)(rhs.cols(),rhs.rows());
-    Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(),lhs.rows()))
-                                         : ((IsLower)  ? rhs.rows() : (std::min)(rhs.rows(),rhs.cols()));
-
-    BlockingType blocking(stripedRows, stripedCols, stripedDepth);
-
-    internal::product_triangular_matrix_matrix<Scalar, Index,
-      Mode, LhsIsTriangular,
-      (internal::traits<_ActualLhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate,
-      (internal::traits<_ActualRhsType>::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate,
-      (internal::traits<Dest          >::Flags&RowMajorBit) ? RowMajor : ColMajor>
-      ::run(
-        stripedRows, stripedCols, stripedDepth,   // sizes
-        &lhs.coeffRef(0,0),    lhs.outerStride(), // lhs info
-        &rhs.coeffRef(0,0),    rhs.outerStride(), // rhs info
-        &dst.coeffRef(0,0), dst.outerStride(),    // result info
-        actualAlpha, blocking
-      );
+ * Wrapper to product_triangular_matrix_matrix
+ ***************************************************************************/
+
+}  // end namespace internal
+
+namespace internal {
+template <int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
+struct triangular_product_impl<Mode, LhsIsTriangular, Lhs, false, Rhs, false> {
+  template <typename Dest>
+  static void run(Dest& dst, const Lhs& a_lhs, const Rhs& a_rhs, const typename Dest::Scalar& alpha) {
+    typedef typename Lhs::Scalar LhsScalar;
+    typedef typename Rhs::Scalar RhsScalar;
+    typedef typename Dest::Scalar Scalar;
+
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::remove_all_t<ActualLhsType> ActualLhsTypeCleaned;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    typedef internal::remove_all_t<ActualRhsType> ActualRhsTypeCleaned;
+
+    internal::add_const_on_value_type_t<ActualLhsType> lhs = LhsBlasTraits::extract(a_lhs);
+    internal::add_const_on_value_type_t<ActualRhsType> rhs = RhsBlasTraits::extract(a_rhs);
+
+    // Empty product, return early.  Otherwise, we get `nullptr` use errors below when we try to access
+    // coeffRef(0,0).
+    if (lhs.size() == 0 || rhs.size() == 0) {
+      return;
+    }
+
+    LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(a_lhs);
+    RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(a_rhs);
+    Scalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
+
+    typedef internal::gemm_blocking_space<(Dest::Flags & RowMajorBit) ? RowMajor : ColMajor, Scalar, Scalar,
+                                          Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime,
+                                          Lhs::MaxColsAtCompileTime, 4>
+        BlockingType;
+
+    enum { IsLower = (Mode & Lower) == Lower };
+    Index stripedRows = ((!LhsIsTriangular) || (IsLower)) ? lhs.rows() : (std::min)(lhs.rows(), lhs.cols());
+    Index stripedCols = ((LhsIsTriangular) || (!IsLower)) ? rhs.cols() : (std::min)(rhs.cols(), rhs.rows());
+    Index stripedDepth = LhsIsTriangular ? ((!IsLower) ? lhs.cols() : (std::min)(lhs.cols(), lhs.rows()))
+                                         : ((IsLower) ? rhs.rows() : (std::min)(rhs.rows(), rhs.cols()));
+
+    BlockingType blocking(stripedRows, stripedCols, stripedDepth, 1, false);
+
+    internal::product_triangular_matrix_matrix<
+        Scalar, Index, Mode, LhsIsTriangular,
+        (internal::traits<ActualLhsTypeCleaned>::Flags & RowMajorBit) ? RowMajor : ColMajor,
+        LhsBlasTraits::NeedToConjugate,
+        (internal::traits<ActualRhsTypeCleaned>::Flags & RowMajorBit) ? RowMajor : ColMajor,
+        RhsBlasTraits::NeedToConjugate, (internal::traits<Dest>::Flags & RowMajorBit) ? RowMajor : ColMajor,
+        Dest::InnerStrideAtCompileTime>::run(stripedRows, stripedCols, stripedDepth,                     // sizes
+                                             &lhs.coeffRef(0, 0), lhs.outerStride(),                     // lhs info
+                                             &rhs.coeffRef(0, 0), rhs.outerStride(),                     // rhs info
+                                             &dst.coeffRef(0, 0), dst.innerStride(), dst.outerStride(),  // result info
+                                             actualAlpha, blocking);
+
+    // Apply correction if the diagonal is unit and a scalar factor was nested:
+    if ((Mode & UnitDiag) == UnitDiag) {
+      if (LhsIsTriangular && !numext::is_exactly_one(lhs_alpha)) {
+        Index diagSize = (std::min)(lhs.rows(), lhs.cols());
+        dst.topRows(diagSize) -= ((lhs_alpha - LhsScalar(1)) * a_rhs).topRows(diagSize);
+      } else if ((!LhsIsTriangular) && !numext::is_exactly_one(rhs_alpha)) {
+        Index diagSize = (std::min)(rhs.rows(), rhs.cols());
+        dst.leftCols(diagSize) -= (rhs_alpha - RhsScalar(1)) * a_lhs.leftCols(diagSize);
+      }
+    }
   }
 };
 
-} // end namespace Eigen
+}  // end namespace internal
+
+}  // end namespace Eigen
 
-#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_H
+#endif  // EIGEN_TRIANGULAR_MATRIX_MATRIX_H
diff --git a/inst/include/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h b/inst/include/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
new file mode 100644
index 00000000..3d612b04
--- /dev/null
+++ b/inst/include/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h
@@ -0,0 +1,325 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   Triangular matrix * matrix product functionality based on ?TRMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H
+#define EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Scalar, typename Index, int Mode, bool LhsIsTriangular, int LhsStorageOrder, bool ConjugateLhs,
+          int RhsStorageOrder, bool ConjugateRhs, int ResStorageOrder>
+struct product_triangular_matrix_matrix_trmm
+    : product_triangular_matrix_matrix<Scalar, Index, Mode, LhsIsTriangular, LhsStorageOrder, ConjugateLhs,
+                                       RhsStorageOrder, ConjugateRhs, ResStorageOrder, 1, BuiltIn> {};
+
+// try to go to BLAS specialization
+#define EIGEN_BLAS_TRMM_SPECIALIZE(Scalar, LhsIsTriangular)                                                           \
+  template <typename Index, int Mode, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder, bool ConjugateRhs> \
+  struct product_triangular_matrix_matrix<Scalar, Index, Mode, LhsIsTriangular, LhsStorageOrder, ConjugateLhs,        \
+                                          RhsStorageOrder, ConjugateRhs, ColMajor, 1, Specialized> {                  \
+    static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,               \
+                           const Scalar* _rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride,          \
+                           Scalar alpha, level3_blocking<Scalar, Scalar>& blocking) {                                 \
+      EIGEN_ONLY_USED_FOR_DEBUG(resIncr);                                                                             \
+      eigen_assert(resIncr == 1);                                                                                     \
+      product_triangular_matrix_matrix_trmm<Scalar, Index, Mode, LhsIsTriangular, LhsStorageOrder, ConjugateLhs,      \
+                                            RhsStorageOrder, ConjugateRhs, ColMajor>::run(_rows, _cols, _depth, _lhs, \
+                                                                                          lhsStride, _rhs, rhsStride, \
+                                                                                          res, resStride, alpha,      \
+                                                                                          blocking);                  \
+    }                                                                                                                 \
+  };
+
+EIGEN_BLAS_TRMM_SPECIALIZE(double, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(double, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(dcomplex, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(float, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(float, false)
+EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true)
+EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false)
+
+// implements col-major += alpha * op(triangular) * op(general)
+#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC)                                                      \
+  template <typename Index, int Mode, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder, bool ConjugateRhs>  \
+  struct product_triangular_matrix_matrix_trmm<EIGTYPE, Index, Mode, true, LhsStorageOrder, ConjugateLhs,              \
+                                               RhsStorageOrder, ConjugateRhs, ColMajor> {                              \
+    enum {                                                                                                             \
+      IsLower = (Mode & Lower) == Lower,                                                                               \
+      SetDiag = (Mode & (ZeroDiag | UnitDiag)) ? 0 : 1,                                                                \
+      IsUnitDiag = (Mode & UnitDiag) ? 1 : 0,                                                                          \
+      IsZeroDiag = (Mode & ZeroDiag) ? 1 : 0,                                                                          \
+      LowUp = IsLower ? Lower : Upper,                                                                                 \
+      conjA = ((LhsStorageOrder == ColMajor) && ConjugateLhs) ? 1 : 0                                                  \
+    };                                                                                                                 \
+                                                                                                                       \
+    static void run(Index _rows, Index _cols, Index _depth, const EIGTYPE* _lhs, Index lhsStride, const EIGTYPE* _rhs, \
+                    Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha,                                     \
+                    level3_blocking<EIGTYPE, EIGTYPE>& blocking) {                                                     \
+      if (_rows == 0 || _cols == 0 || _depth == 0) return;                                                             \
+      Index diagSize = (std::min)(_rows, _depth);                                                                      \
+      Index rows = IsLower ? _rows : diagSize;                                                                         \
+      Index depth = IsLower ? diagSize : _depth;                                                                       \
+      Index cols = _cols;                                                                                              \
+                                                                                                                       \
+      typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs;                                            \
+      typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;                                            \
+                                                                                                                       \
+      /* Non-square case - doesn't fit to BLAS ?TRMM. Fall to default triangular product or call BLAS ?GEMM*/          \
+      if (rows != depth) {                                                                                             \
+        /* FIXME handle mkl_domain_get_max_threads */                                                                  \
+        /*int nthr = mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS);*/ int nthr = 1;                               \
+                                                                                                                       \
+        if (((nthr == 1) && (((std::max)(rows, depth) - diagSize) / (double)diagSize < 0.5))) {                        \
+          /* Most likely no benefit to call TRMM or GEMM from BLAS */                                                  \
+          product_triangular_matrix_matrix<EIGTYPE, Index, Mode, true, LhsStorageOrder, ConjugateLhs, RhsStorageOrder, \
+                                           ConjugateRhs, ColMajor, 1, BuiltIn>::run(_rows, _cols, _depth, _lhs,        \
+                                                                                    lhsStride, _rhs, rhsStride, res,   \
+                                                                                    1, resStride, alpha, blocking);    \
+          /*std::cout << "TRMM_L: A is not square! Go to Eigen TRMM implementation!\n";*/                              \
+        } else {                                                                                                       \
+          /* Make sense to call GEMM */                                                                                \
+          Map<const MatrixLhs, 0, OuterStride<> > lhsMap(_lhs, rows, depth, OuterStride<>(lhsStride));                 \
+          MatrixLhs aa_tmp = lhsMap.template triangularView<Mode>();                                                   \
+          BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride());                                          \
+          gemm_blocking_space<ColMajor, EIGTYPE, EIGTYPE, Dynamic, Dynamic, Dynamic> gemm_blocking(_rows, _cols,       \
+                                                                                                   _depth, 1, true);   \
+          general_matrix_matrix_product<Index, EIGTYPE, LhsStorageOrder, ConjugateLhs, EIGTYPE, RhsStorageOrder,       \
+                                        ConjugateRhs, ColMajor, 1>::run(rows, cols, depth, aa_tmp.data(), aStride,     \
+                                                                        _rhs, rhsStride, res, 1, resStride, alpha,     \
+                                                                        gemm_blocking, 0);                             \
+                                                                                                                       \
+          /*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/                 \
+        }                                                                                                              \
+        return;                                                                                                        \
+      }                                                                                                                \
+      char side = 'L', transa, uplo, diag = 'N';                                                                       \
+      EIGTYPE* b;                                                                                                      \
+      const EIGTYPE* a;                                                                                                \
+      BlasIndex m, n, lda, ldb;                                                                                        \
+                                                                                                                       \
+      /* Set m, n */                                                                                                   \
+      m = convert_index<BlasIndex>(diagSize);                                                                          \
+      n = convert_index<BlasIndex>(cols);                                                                              \
+                                                                                                                       \
+      /* Set trans */                                                                                                  \
+      transa = (LhsStorageOrder == RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N';                                     \
+                                                                                                                       \
+      /* Set b, ldb */                                                                                                 \
+      Map<const MatrixRhs, 0, OuterStride<> > rhs(_rhs, depth, cols, OuterStride<>(rhsStride));                        \
+      MatrixX##EIGPREFIX b_tmp;                                                                                        \
+                                                                                                                       \
+      if (ConjugateRhs)                                                                                                \
+        b_tmp = rhs.conjugate();                                                                                       \
+      else                                                                                                             \
+        b_tmp = rhs;                                                                                                   \
+      b = b_tmp.data();                                                                                                \
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride());                                                             \
+                                                                                                                       \
+      /* Set uplo */                                                                                                   \
+      uplo = IsLower ? 'L' : 'U';                                                                                      \
+      if (LhsStorageOrder == RowMajor) uplo = (uplo == 'L') ? 'U' : 'L';                                               \
+      /* Set a, lda */                                                                                                 \
+      Map<const MatrixLhs, 0, OuterStride<> > lhs(_lhs, rows, depth, OuterStride<>(lhsStride));                        \
+      MatrixLhs a_tmp;                                                                                                 \
+                                                                                                                       \
+      if ((conjA != 0) || (SetDiag == 0)) {                                                                            \
+        if (conjA)                                                                                                     \
+          a_tmp = lhs.conjugate();                                                                                     \
+        else                                                                                                           \
+          a_tmp = lhs;                                                                                                 \
+        if (IsZeroDiag)                                                                                                \
+          a_tmp.diagonal().setZero();                                                                                  \
+        else if (IsUnitDiag)                                                                                           \
+          a_tmp.diagonal().setOnes();                                                                                  \
+        a = a_tmp.data();                                                                                              \
+        lda = convert_index<BlasIndex>(a_tmp.outerStride());                                                           \
+      } else {                                                                                                         \
+        a = _lhs;                                                                                                      \
+        lda = convert_index<BlasIndex>(lhsStride);                                                                     \
+      }                                                                                                                \
+      /*std::cout << "TRMM_L: A is square! Go to BLAS TRMM implementation! \n";*/                                      \
+      /* call ?trmm*/                                                                                                  \
+      BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a,    \
+               &lda, (BLASTYPE*)b, &ldb);                                                                              \
+                                                                                                                       \
+      /* Add op(a_triangular)*b into res*/                                                                             \
+      Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res, rows, cols, OuterStride<>(resStride));                   \
+      res_tmp = res_tmp + b_tmp;                                                                                       \
+    }                                                                                                                  \
+  };
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMM_L(double, double, d, dtrmm)
+EIGEN_BLAS_TRMM_L(dcomplex, MKL_Complex16, cd, ztrmm)
+EIGEN_BLAS_TRMM_L(float, float, f, strmm)
+EIGEN_BLAS_TRMM_L(scomplex, MKL_Complex8, cf, ctrmm)
+#else
+EIGEN_BLAS_TRMM_L(double, double, d, dtrmm_)
+EIGEN_BLAS_TRMM_L(dcomplex, double, cd, ztrmm_)
+EIGEN_BLAS_TRMM_L(float, float, f, strmm_)
+EIGEN_BLAS_TRMM_L(scomplex, float, cf, ctrmm_)
+#endif
+
+// implements col-major += alpha * op(general) * op(triangular)
+#define EIGEN_BLAS_TRMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC)                                                      \
+  template <typename Index, int Mode, int LhsStorageOrder, bool ConjugateLhs, int RhsStorageOrder, bool ConjugateRhs>  \
+  struct product_triangular_matrix_matrix_trmm<EIGTYPE, Index, Mode, false, LhsStorageOrder, ConjugateLhs,             \
+                                               RhsStorageOrder, ConjugateRhs, ColMajor> {                              \
+    enum {                                                                                                             \
+      IsLower = (Mode & Lower) == Lower,                                                                               \
+      SetDiag = (Mode & (ZeroDiag | UnitDiag)) ? 0 : 1,                                                                \
+      IsUnitDiag = (Mode & UnitDiag) ? 1 : 0,                                                                          \
+      IsZeroDiag = (Mode & ZeroDiag) ? 1 : 0,                                                                          \
+      LowUp = IsLower ? Lower : Upper,                                                                                 \
+      conjA = ((RhsStorageOrder == ColMajor) && ConjugateRhs) ? 1 : 0                                                  \
+    };                                                                                                                 \
+                                                                                                                       \
+    static void run(Index _rows, Index _cols, Index _depth, const EIGTYPE* _lhs, Index lhsStride, const EIGTYPE* _rhs, \
+                    Index rhsStride, EIGTYPE* res, Index resStride, EIGTYPE alpha,                                     \
+                    level3_blocking<EIGTYPE, EIGTYPE>& blocking) {                                                     \
+      if (_rows == 0 || _cols == 0 || _depth == 0) return;                                                             \
+      Index diagSize = (std::min)(_cols, _depth);                                                                      \
+      Index rows = _rows;                                                                                              \
+      Index depth = IsLower ? _depth : diagSize;                                                                       \
+      Index cols = IsLower ? diagSize : _cols;                                                                         \
+                                                                                                                       \
+      typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs;                                            \
+      typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs;                                            \
+                                                                                                                       \
+      /* Non-square case - doesn't fit to BLAS ?TRMM. Fall to default triangular product or call BLAS ?GEMM*/          \
+      if (cols != depth) {                                                                                             \
+        int nthr = 1 /*mkl_domain_get_max_threads(EIGEN_BLAS_DOMAIN_BLAS)*/;                                           \
+                                                                                                                       \
+        if ((nthr == 1) && (((std::max)(cols, depth) - diagSize) / (double)diagSize < 0.5)) {                          \
+          /* Most likely no benefit to call TRMM or GEMM from BLAS*/                                                   \
+          product_triangular_matrix_matrix<EIGTYPE, Index, Mode, false, LhsStorageOrder, ConjugateLhs,                 \
+                                           RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run(_rows, _cols,     \
+                                                                                                     _depth, _lhs,     \
+                                                                                                     lhsStride, _rhs,  \
+                                                                                                     rhsStride, res,   \
+                                                                                                     1, resStride,     \
+                                                                                                     alpha, blocking); \
+          /*std::cout << "TRMM_R: A is not square! Go to Eigen TRMM implementation!\n";*/                              \
+        } else {                                                                                                       \
+          /* Make sense to call GEMM */                                                                                \
+          Map<const MatrixRhs, 0, OuterStride<> > rhsMap(_rhs, depth, cols, OuterStride<>(rhsStride));                 \
+          MatrixRhs aa_tmp = rhsMap.template triangularView<Mode>();                                                   \
+          BlasIndex aStride = convert_index<BlasIndex>(aa_tmp.outerStride());                                          \
+          gemm_blocking_space<ColMajor, EIGTYPE, EIGTYPE, Dynamic, Dynamic, Dynamic> gemm_blocking(_rows, _cols,       \
+                                                                                                   _depth, 1, true);   \
+          general_matrix_matrix_product<Index, EIGTYPE, LhsStorageOrder, ConjugateLhs, EIGTYPE, RhsStorageOrder,       \
+                                        ConjugateRhs, ColMajor, 1>::run(rows, cols, depth, _lhs, lhsStride,            \
+                                                                        aa_tmp.data(), aStride, res, 1, resStride,     \
+                                                                        alpha, gemm_blocking, 0);                      \
+                                                                                                                       \
+          /*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/                 \
+        }                                                                                                              \
+        return;                                                                                                        \
+      }                                                                                                                \
+      char side = 'R', transa, uplo, diag = 'N';                                                                       \
+      EIGTYPE* b;                                                                                                      \
+      const EIGTYPE* a;                                                                                                \
+      BlasIndex m, n, lda, ldb;                                                                                        \
+                                                                                                                       \
+      /* Set m, n */                                                                                                   \
+      m = convert_index<BlasIndex>(rows);                                                                              \
+      n = convert_index<BlasIndex>(diagSize);                                                                          \
+                                                                                                                       \
+      /* Set trans */                                                                                                  \
+      transa = (RhsStorageOrder == RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N';                                     \
+                                                                                                                       \
+      /* Set b, ldb */                                                                                                 \
+      Map<const MatrixLhs, 0, OuterStride<> > lhs(_lhs, rows, depth, OuterStride<>(lhsStride));                        \
+      MatrixX##EIGPREFIX b_tmp;                                                                                        \
+                                                                                                                       \
+      if (ConjugateLhs)                                                                                                \
+        b_tmp = lhs.conjugate();                                                                                       \
+      else                                                                                                             \
+        b_tmp = lhs;                                                                                                   \
+      b = b_tmp.data();                                                                                                \
+      ldb = convert_index<BlasIndex>(b_tmp.outerStride());                                                             \
+                                                                                                                       \
+      /* Set uplo */                                                                                                   \
+      uplo = IsLower ? 'L' : 'U';                                                                                      \
+      if (RhsStorageOrder == RowMajor) uplo = (uplo == 'L') ? 'U' : 'L';                                               \
+      /* Set a, lda */                                                                                                 \
+      Map<const MatrixRhs, 0, OuterStride<> > rhs(_rhs, depth, cols, OuterStride<>(rhsStride));                        \
+      MatrixRhs a_tmp;                                                                                                 \
+                                                                                                                       \
+      if ((conjA != 0) || (SetDiag == 0)) {                                                                            \
+        if (conjA)                                                                                                     \
+          a_tmp = rhs.conjugate();                                                                                     \
+        else                                                                                                           \
+          a_tmp = rhs;                                                                                                 \
+        if (IsZeroDiag)                                                                                                \
+          a_tmp.diagonal().setZero();                                                                                  \
+        else if (IsUnitDiag)                                                                                           \
+          a_tmp.diagonal().setOnes();                                                                                  \
+        a = a_tmp.data();                                                                                              \
+        lda = convert_index<BlasIndex>(a_tmp.outerStride());                                                           \
+      } else {                                                                                                         \
+        a = _rhs;                                                                                                      \
+        lda = convert_index<BlasIndex>(rhsStride);                                                                     \
+      }                                                                                                                \
+      /*std::cout << "TRMM_R: A is square! Go to BLAS TRMM implementation! \n";*/                                      \
+      /* call ?trmm*/                                                                                                  \
+      BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a,    \
+               &lda, (BLASTYPE*)b, &ldb);                                                                              \
+                                                                                                                       \
+      /* Add op(a_triangular)*b into res*/                                                                             \
+      Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res, rows, cols, OuterStride<>(resStride));                   \
+      res_tmp = res_tmp + b_tmp;                                                                                       \
+    }                                                                                                                  \
+  };
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMM_R(double, double, d, dtrmm)
+EIGEN_BLAS_TRMM_R(dcomplex, MKL_Complex16, cd, ztrmm)
+EIGEN_BLAS_TRMM_R(float, float, f, strmm)
+EIGEN_BLAS_TRMM_R(scomplex, MKL_Complex8, cf, ctrmm)
+#else
+EIGEN_BLAS_TRMM_R(double, double, d, dtrmm_)
+EIGEN_BLAS_TRMM_R(dcomplex, double, cd, ztrmm_)
+EIGEN_BLAS_TRMM_R(float, float, f, strmm_)
+EIGEN_BLAS_TRMM_R(scomplex, float, cf, ctrmm_)
+#endif
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_TRIANGULAR_MATRIX_MATRIX_BLAS_H
diff --git a/inst/include/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h b/inst/include/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
deleted file mode 100644
index 4cc56a42..00000000
--- a/inst/include/Eigen/src/Core/products/TriangularMatrixMatrix_MKL.h
+++ /dev/null
@@ -1,309 +0,0 @@
-/*
- Copyright (c) 2011, Intel Corporation. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
- * Neither the name of Intel Corporation nor the names of its contributors may
-   be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
- *   Triangular matrix * matrix product functionality based on ?TRMM.
- ********************************************************************************
-*/
-
-#ifndef EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H
-#define EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H
-
-namespace Eigen { 
-
-namespace internal {
-
-
-template <typename Scalar, typename Index,
-          int Mode, bool LhsIsTriangular,
-          int LhsStorageOrder, bool ConjugateLhs,
-          int RhsStorageOrder, bool ConjugateRhs,
-          int ResStorageOrder>
-struct product_triangular_matrix_matrix_trmm :
-       product_triangular_matrix_matrix<Scalar,Index,Mode,
-          LhsIsTriangular,LhsStorageOrder,ConjugateLhs,
-          RhsStorageOrder, ConjugateRhs, ResStorageOrder, BuiltIn> {};
-
-
-// try to go to BLAS specialization
-#define EIGEN_MKL_TRMM_SPECIALIZE(Scalar, LhsIsTriangular) \
-template <typename Index, int Mode, \
-          int LhsStorageOrder, bool ConjugateLhs, \
-          int RhsStorageOrder, bool ConjugateRhs> \
-struct product_triangular_matrix_matrix<Scalar,Index, Mode, LhsIsTriangular, \
-           LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,Specialized> { \
-  static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,\
-    const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking<Scalar,Scalar>& blocking) { \
-      product_triangular_matrix_matrix_trmm<Scalar,Index,Mode, \
-        LhsIsTriangular,LhsStorageOrder,ConjugateLhs, \
-        RhsStorageOrder, ConjugateRhs, ColMajor>::run( \
-        _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
-  } \
-};
-
-EIGEN_MKL_TRMM_SPECIALIZE(double, true)
-EIGEN_MKL_TRMM_SPECIALIZE(double, false)
-EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, true)
-EIGEN_MKL_TRMM_SPECIALIZE(dcomplex, false)
-EIGEN_MKL_TRMM_SPECIALIZE(float, true)
-EIGEN_MKL_TRMM_SPECIALIZE(float, false)
-EIGEN_MKL_TRMM_SPECIALIZE(scomplex, true)
-EIGEN_MKL_TRMM_SPECIALIZE(scomplex, false)
-
-// implements col-major += alpha * op(triangular) * op(general)
-#define EIGEN_MKL_TRMM_L(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
-template <typename Index, int Mode, \
-          int LhsStorageOrder, bool ConjugateLhs, \
-          int RhsStorageOrder, bool ConjugateRhs> \
-struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,true, \
-         LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor> \
-{ \
-  enum { \
-    IsLower = (Mode&Lower) == Lower, \
-    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
-    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
-    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
-    LowUp = IsLower ? Lower : Upper, \
-    conjA = ((LhsStorageOrder==ColMajor) && ConjugateLhs) ? 1 : 0 \
-  }; \
-\
-  static void run( \
-    Index _rows, Index _cols, Index _depth, \
-    const EIGTYPE* _lhs, Index lhsStride, \
-    const EIGTYPE* _rhs, Index rhsStride, \
-    EIGTYPE* res,        Index resStride, \
-    EIGTYPE alpha, level3_blocking<EIGTYPE,EIGTYPE>& blocking) \
-  { \
-   Index diagSize  = (std::min)(_rows,_depth); \
-   Index rows      = IsLower ? _rows : diagSize; \
-   Index depth     = IsLower ? diagSize : _depth; \
-   Index cols      = _cols; \
-\
-   typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \
-   typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \
-\
-/* Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \
-   if (rows != depth) { \
-\
-     int nthr = mkl_domain_get_max_threads(EIGEN_MKL_DOMAIN_BLAS); \
-\
-     if (((nthr==1) && (((std::max)(rows,depth)-diagSize)/(double)diagSize < 0.5))) { \
-     /* Most likely no benefit to call TRMM or GEMM from MKL*/ \
-       product_triangular_matrix_matrix<EIGTYPE,Index,Mode,true, \
-       LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \
-           _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
-     /*std::cout << "TRMM_L: A is not square! Go to Eigen TRMM implementation!\n";*/ \
-     } else { \
-     /* Make sense to call GEMM */ \
-       Map<const MatrixLhs, 0, OuterStride<> > lhsMap(_lhs,rows,depth,OuterStride<>(lhsStride)); \
-       MatrixLhs aa_tmp=lhsMap.template triangularView<Mode>(); \
-       MKL_INT aStride = aa_tmp.outerStride(); \
-       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth); \
-       general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
-       rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \
-\
-     /*std::cout << "TRMM_L: A is not square! Go to MKL GEMM implementation! " << nthr<<" \n";*/ \
-     } \
-     return; \
-   } \
-   char side = 'L', transa, uplo, diag = 'N'; \
-   EIGTYPE *b; \
-   const EIGTYPE *a; \
-   MKL_INT m, n, lda, ldb; \
-   MKLTYPE alpha_; \
-\
-/* Set alpha_*/ \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
-\
-/* Set m, n */ \
-   m = (MKL_INT)diagSize; \
-   n = (MKL_INT)cols; \
-\
-/* Set trans */ \
-   transa = (LhsStorageOrder==RowMajor) ? ((ConjugateLhs) ? 'C' : 'T') : 'N'; \
-\
-/* Set b, ldb */ \
-   Map<const MatrixRhs, 0, OuterStride<> > rhs(_rhs,depth,cols,OuterStride<>(rhsStride)); \
-   MatrixX##EIGPREFIX b_tmp; \
-\
-   if (ConjugateRhs) b_tmp = rhs.conjugate(); else b_tmp = rhs; \
-   b = b_tmp.data(); \
-   ldb = b_tmp.outerStride(); \
-\
-/* Set uplo */ \
-   uplo = IsLower ? 'L' : 'U'; \
-   if (LhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
-/* Set a, lda */ \
-   Map<const MatrixLhs, 0, OuterStride<> > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \
-   MatrixLhs a_tmp; \
-\
-   if ((conjA!=0) || (SetDiag==0)) { \
-     if (conjA) a_tmp = lhs.conjugate(); else a_tmp = lhs; \
-     if (IsZeroDiag) \
-       a_tmp.diagonal().setZero(); \
-     else if (IsUnitDiag) \
-       a_tmp.diagonal().setOnes();\
-     a = a_tmp.data(); \
-     lda = a_tmp.outerStride(); \
-   } else { \
-     a = _lhs; \
-     lda = lhsStride; \
-   } \
-   /*std::cout << "TRMM_L: A is square! Go to MKL TRMM implementation! \n";*/ \
-/* call ?trmm*/ \
-   MKLPREFIX##trmm(&side, &uplo, &transa, &diag, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \
-\
-/* Add op(a_triangular)*b into res*/ \
-   Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
-   res_tmp=res_tmp+b_tmp; \
-  } \
-};
-
-EIGEN_MKL_TRMM_L(double, double, d, d)
-EIGEN_MKL_TRMM_L(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_TRMM_L(float, float, f, s)
-EIGEN_MKL_TRMM_L(scomplex, MKL_Complex8, cf, c)
-
-// implements col-major += alpha * op(general) * op(triangular)
-#define EIGEN_MKL_TRMM_R(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
-template <typename Index, int Mode, \
-          int LhsStorageOrder, bool ConjugateLhs, \
-          int RhsStorageOrder, bool ConjugateRhs> \
-struct product_triangular_matrix_matrix_trmm<EIGTYPE,Index,Mode,false, \
-         LhsStorageOrder,ConjugateLhs,RhsStorageOrder,ConjugateRhs,ColMajor> \
-{ \
-  enum { \
-    IsLower = (Mode&Lower) == Lower, \
-    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
-    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
-    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
-    LowUp = IsLower ? Lower : Upper, \
-    conjA = ((RhsStorageOrder==ColMajor) && ConjugateRhs) ? 1 : 0 \
-  }; \
-\
-  static void run( \
-    Index _rows, Index _cols, Index _depth, \
-    const EIGTYPE* _lhs, Index lhsStride, \
-    const EIGTYPE* _rhs, Index rhsStride, \
-    EIGTYPE* res,        Index resStride, \
-    EIGTYPE alpha, level3_blocking<EIGTYPE,EIGTYPE>& blocking) \
-  { \
-   Index diagSize  = (std::min)(_cols,_depth); \
-   Index rows      = _rows; \
-   Index depth     = IsLower ? _depth : diagSize; \
-   Index cols      = IsLower ? diagSize : _cols; \
-\
-   typedef Matrix<EIGTYPE, Dynamic, Dynamic, LhsStorageOrder> MatrixLhs; \
-   typedef Matrix<EIGTYPE, Dynamic, Dynamic, RhsStorageOrder> MatrixRhs; \
-\
-/* Non-square case - doesn't fit to MKL ?TRMM. Fall to default triangular product or call MKL ?GEMM*/ \
-   if (cols != depth) { \
-\
-     int nthr = mkl_domain_get_max_threads(EIGEN_MKL_DOMAIN_BLAS); \
-\
-     if ((nthr==1) && (((std::max)(cols,depth)-diagSize)/(double)diagSize < 0.5)) { \
-     /* Most likely no benefit to call TRMM or GEMM from MKL*/ \
-       product_triangular_matrix_matrix<EIGTYPE,Index,Mode,false, \
-       LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, BuiltIn>::run( \
-           _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \
-       /*std::cout << "TRMM_R: A is not square! Go to Eigen TRMM implementation!\n";*/ \
-     } else { \
-     /* Make sense to call GEMM */ \
-       Map<const MatrixRhs, 0, OuterStride<> > rhsMap(_rhs,depth,cols, OuterStride<>(rhsStride)); \
-       MatrixRhs aa_tmp=rhsMap.template triangularView<Mode>(); \
-       MKL_INT aStride = aa_tmp.outerStride(); \
-       gemm_blocking_space<ColMajor,EIGTYPE,EIGTYPE,Dynamic,Dynamic,Dynamic> gemm_blocking(_rows,_cols,_depth); \
-       general_matrix_matrix_product<Index,EIGTYPE,LhsStorageOrder,ConjugateLhs,EIGTYPE,RhsStorageOrder,ConjugateRhs,ColMajor>::run( \
-       rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \
-\
-     /*std::cout << "TRMM_R: A is not square! Go to MKL GEMM implementation! " << nthr<<" \n";*/ \
-     } \
-     return; \
-   } \
-   char side = 'R', transa, uplo, diag = 'N'; \
-   EIGTYPE *b; \
-   const EIGTYPE *a; \
-   MKL_INT m, n, lda, ldb; \
-   MKLTYPE alpha_; \
-\
-/* Set alpha_*/ \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
-\
-/* Set m, n */ \
-   m = (MKL_INT)rows; \
-   n = (MKL_INT)diagSize; \
-\
-/* Set trans */ \
-   transa = (RhsStorageOrder==RowMajor) ? ((ConjugateRhs) ? 'C' : 'T') : 'N'; \
-\
-/* Set b, ldb */ \
-   Map<const MatrixLhs, 0, OuterStride<> > lhs(_lhs,rows,depth,OuterStride<>(lhsStride)); \
-   MatrixX##EIGPREFIX b_tmp; \
-\
-   if (ConjugateLhs) b_tmp = lhs.conjugate(); else b_tmp = lhs; \
-   b = b_tmp.data(); \
-   ldb = b_tmp.outerStride(); \
-\
-/* Set uplo */ \
-   uplo = IsLower ? 'L' : 'U'; \
-   if (RhsStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
-/* Set a, lda */ \
-   Map<const MatrixRhs, 0, OuterStride<> > rhs(_rhs,depth,cols, OuterStride<>(rhsStride)); \
-   MatrixRhs a_tmp; \
-\
-   if ((conjA!=0) || (SetDiag==0)) { \
-     if (conjA) a_tmp = rhs.conjugate(); else a_tmp = rhs; \
-     if (IsZeroDiag) \
-       a_tmp.diagonal().setZero(); \
-     else if (IsUnitDiag) \
-       a_tmp.diagonal().setOnes();\
-     a = a_tmp.data(); \
-     lda = a_tmp.outerStride(); \
-   } else { \
-     a = _rhs; \
-     lda = rhsStride; \
-   } \
-   /*std::cout << "TRMM_R: A is square! Go to MKL TRMM implementation! \n";*/ \
-/* call ?trmm*/ \
-   MKLPREFIX##trmm(&side, &uplo, &transa, &diag, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (MKLTYPE*)b, &ldb); \
-\
-/* Add op(a_triangular)*b into res*/ \
-   Map<MatrixX##EIGPREFIX, 0, OuterStride<> > res_tmp(res,rows,cols,OuterStride<>(resStride)); \
-   res_tmp=res_tmp+b_tmp; \
-  } \
-};
-
-EIGEN_MKL_TRMM_R(double, double, d, d)
-EIGEN_MKL_TRMM_R(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_TRMM_R(float, float, f, s)
-EIGEN_MKL_TRMM_R(scomplex, MKL_Complex8, cf, c)
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_TRIANGULAR_MATRIX_MATRIX_MKL_H
diff --git a/inst/include/Eigen/src/Core/products/TriangularMatrixVector.h b/inst/include/Eigen/src/Core/products/TriangularMatrixVector.h
index 6117d5a8..bef4cbaf 100644
--- a/inst/include/Eigen/src/Core/products/TriangularMatrixVector.h
+++ b/inst/include/Eigen/src/Core/products/TriangularMatrixVector.h
@@ -10,339 +10,338 @@
 #ifndef EIGEN_TRIANGULARMATRIXVECTOR_H
 #define EIGEN_TRIANGULARMATRIXVECTOR_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
-template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int StorageOrder, int Version=Specialized>
+template <typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,
+          int StorageOrder, int Version = Specialized>
 struct triangular_matrix_vector_product;
 
-template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
-struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
-{
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
-  enum {
-    IsLower = ((Mode&Lower)==Lower),
-    HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
-    HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
-  };
-  static EIGEN_DONT_INLINE  void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
-                                     const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha);
+template <typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
+struct triangular_matrix_vector_product<Index, Mode, LhsScalar, ConjLhs, RhsScalar, ConjRhs, ColMajor, Version> {
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  static constexpr bool IsLower = ((Mode & Lower) == Lower);
+  static constexpr bool HasUnitDiag = (Mode & UnitDiag) == UnitDiag;
+  static constexpr bool HasZeroDiag = (Mode & ZeroDiag) == ZeroDiag;
+  static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* lhs_, Index lhsStride,
+                                    const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr,
+                                    const RhsScalar& alpha);
 };
 
-template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
-EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,ColMajor,Version>
-  ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
-        const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha)
-  {
-    static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
-    Index size = (std::min)(_rows,_cols);
-    Index rows = IsLower ? _rows : (std::min)(_rows,_cols);
-    Index cols = IsLower ? (std::min)(_rows,_cols) : _cols;
-
-    typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
-    const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
-    typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
-    
-    typedef Map<const Matrix<RhsScalar,Dynamic,1>, 0, InnerStride<> > RhsMap;
-    const RhsMap rhs(_rhs,cols,InnerStride<>(rhsIncr));
-    typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
-
-    typedef Map<Matrix<ResScalar,Dynamic,1> > ResMap;
-    ResMap res(_res,rows);
-
-    for (Index pi=0; pi<size; pi+=PanelWidth)
-    {
-      Index actualPanelWidth = (std::min)(PanelWidth, size-pi);
-      for (Index k=0; k<actualPanelWidth; ++k)
-      {
-        Index i = pi + k;
-        Index s = IsLower ? ((HasUnitDiag||HasZeroDiag) ? i+1 : i ) : pi;
-        Index r = IsLower ? actualPanelWidth-k : k+1;
-        if ((!(HasUnitDiag||HasZeroDiag)) || (--r)>0)
-          res.segment(s,r) += (alpha * cjRhs.coeff(i)) * cjLhs.col(i).segment(s,r);
-        if (HasUnitDiag)
-          res.coeffRef(i) += alpha * cjRhs.coeff(i);
-      }
-      Index r = IsLower ? rows - pi - actualPanelWidth : pi;
-      if (r>0)
-      {
-        Index s = IsLower ? pi+actualPanelWidth : 0;
-        general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run(
-            r, actualPanelWidth,
-            &lhs.coeffRef(s,pi), lhsStride,
-            &rhs.coeffRef(pi), rhsIncr,
-            &res.coeffRef(s), resIncr, alpha);
-      }
+template <typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
+EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index, Mode, LhsScalar, ConjLhs, RhsScalar, ConjRhs, ColMajor,
+                                                        Version>::run(Index _rows, Index _cols, const LhsScalar* lhs_,
+                                                                      Index lhsStride, const RhsScalar* rhs_,
+                                                                      Index rhsIncr, ResScalar* res_, Index resIncr,
+                                                                      const RhsScalar& alpha) {
+  static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+  Index size = (std::min)(_rows, _cols);
+  Index rows = IsLower ? _rows : (std::min)(_rows, _cols);
+  Index cols = IsLower ? (std::min)(_rows, _cols) : _cols;
+
+  typedef Map<const Matrix<LhsScalar, Dynamic, Dynamic, ColMajor>, 0, OuterStride<> > LhsMap;
+  const LhsMap lhs(lhs_, rows, cols, OuterStride<>(lhsStride));
+  typename conj_expr_if<ConjLhs, LhsMap>::type cjLhs(lhs);
+
+  typedef Map<const Matrix<RhsScalar, Dynamic, 1>, 0, InnerStride<> > RhsMap;
+  const RhsMap rhs(rhs_, cols, InnerStride<>(rhsIncr));
+  typename conj_expr_if<ConjRhs, RhsMap>::type cjRhs(rhs);
+
+  typedef Map<Matrix<ResScalar, Dynamic, 1> > ResMap;
+  ResMap res(res_, rows);
+
+  typedef const_blas_data_mapper<LhsScalar, Index, ColMajor> LhsMapper;
+  typedef const_blas_data_mapper<RhsScalar, Index, RowMajor> RhsMapper;
+
+  for (Index pi = 0; pi < size; pi += PanelWidth) {
+    Index actualPanelWidth = (std::min)(PanelWidth, size - pi);
+    for (Index k = 0; k < actualPanelWidth; ++k) {
+      Index i = pi + k;
+      Index s = IsLower ? ((HasUnitDiag || HasZeroDiag) ? i + 1 : i) : pi;
+      Index r = IsLower ? actualPanelWidth - k : k + 1;
+      if ((!(HasUnitDiag || HasZeroDiag)) || (--r) > 0)
+        res.segment(s, r) += (alpha * cjRhs.coeff(i)) * cjLhs.col(i).segment(s, r);
+      if (HasUnitDiag) res.coeffRef(i) += alpha * cjRhs.coeff(i);
     }
-    if((!IsLower) && cols>size)
-    {
-      general_matrix_vector_product<Index,LhsScalar,ColMajor,ConjLhs,RhsScalar,ConjRhs>::run(
-          rows, cols-size,
-          &lhs.coeffRef(0,size), lhsStride,
-          &rhs.coeffRef(size), rhsIncr,
-          _res, resIncr, alpha);
+    Index r = IsLower ? rows - pi - actualPanelWidth : pi;
+    if (r > 0) {
+      Index s = IsLower ? pi + actualPanelWidth : 0;
+      general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs,
+                                    BuiltIn>::run(r, actualPanelWidth, LhsMapper(&lhs.coeffRef(s, pi), lhsStride),
+                                                  RhsMapper(&rhs.coeffRef(pi), rhsIncr), &res.coeffRef(s), resIncr,
+                                                  alpha);
     }
   }
-
-template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version>
-struct triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version>
-{
-  typedef typename scalar_product_traits<LhsScalar, RhsScalar>::ReturnType ResScalar;
-  enum {
-    IsLower = ((Mode&Lower)==Lower),
-    HasUnitDiag = (Mode & UnitDiag)==UnitDiag,
-    HasZeroDiag = (Mode & ZeroDiag)==ZeroDiag
-  };
-  static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
-                                    const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha);
+  if ((!IsLower) && cols > size) {
+    general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs>::run(
+        rows, cols - size, LhsMapper(&lhs.coeffRef(0, size), lhsStride), RhsMapper(&rhs.coeffRef(size), rhsIncr), res_,
+        resIncr, alpha);
+  }
+}
+
+template <typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
+struct triangular_matrix_vector_product<Index, Mode, LhsScalar, ConjLhs, RhsScalar, ConjRhs, RowMajor, Version> {
+  typedef typename ScalarBinaryOpTraits<LhsScalar, RhsScalar>::ReturnType ResScalar;
+  static constexpr bool IsLower = ((Mode & Lower) == Lower);
+  static constexpr bool HasUnitDiag = (Mode & UnitDiag) == UnitDiag;
+  static constexpr bool HasZeroDiag = (Mode & ZeroDiag) == ZeroDiag;
+  static EIGEN_DONT_INLINE void run(Index _rows, Index _cols, const LhsScalar* lhs_, Index lhsStride,
+                                    const RhsScalar* rhs_, Index rhsIncr, ResScalar* res_, Index resIncr,
+                                    const ResScalar& alpha);
 };
 
-template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,int Version>
-EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,RowMajor,Version>
-  ::run(Index _rows, Index _cols, const LhsScalar* _lhs, Index lhsStride,
-        const RhsScalar* _rhs, Index rhsIncr, ResScalar* _res, Index resIncr, const ResScalar& alpha)
-  {
-    static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
-    Index diagSize = (std::min)(_rows,_cols);
-    Index rows = IsLower ? _rows : diagSize;
-    Index cols = IsLower ? diagSize : _cols;
-
-    typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
-    const LhsMap lhs(_lhs,rows,cols,OuterStride<>(lhsStride));
-    typename conj_expr_if<ConjLhs,LhsMap>::type cjLhs(lhs);
-
-    typedef Map<const Matrix<RhsScalar,Dynamic,1> > RhsMap;
-    const RhsMap rhs(_rhs,cols);
-    typename conj_expr_if<ConjRhs,RhsMap>::type cjRhs(rhs);
-
-    typedef Map<Matrix<ResScalar,Dynamic,1>, 0, InnerStride<> > ResMap;
-    ResMap res(_res,rows,InnerStride<>(resIncr));
-    
-    for (Index pi=0; pi<diagSize; pi+=PanelWidth)
-    {
-      Index actualPanelWidth = (std::min)(PanelWidth, diagSize-pi);
-      for (Index k=0; k<actualPanelWidth; ++k)
-      {
-        Index i = pi + k;
-        Index s = IsLower ? pi  : ((HasUnitDiag||HasZeroDiag) ? i+1 : i);
-        Index r = IsLower ? k+1 : actualPanelWidth-k;
-        if ((!(HasUnitDiag||HasZeroDiag)) || (--r)>0)
-          res.coeffRef(i) += alpha * (cjLhs.row(i).segment(s,r).cwiseProduct(cjRhs.segment(s,r).transpose())).sum();
-        if (HasUnitDiag)
-          res.coeffRef(i) += alpha * cjRhs.coeff(i);
-      }
-      Index r = IsLower ? pi : cols - pi - actualPanelWidth;
-      if (r>0)
-      {
-        Index s = IsLower ? 0 : pi + actualPanelWidth;
-        general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs,BuiltIn>::run(
-            actualPanelWidth, r,
-            &lhs.coeffRef(pi,s), lhsStride,
-            &rhs.coeffRef(s), rhsIncr,
-            &res.coeffRef(pi), resIncr, alpha);
-      }
+template <typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int Version>
+EIGEN_DONT_INLINE void triangular_matrix_vector_product<Index, Mode, LhsScalar, ConjLhs, RhsScalar, ConjRhs, RowMajor,
+                                                        Version>::run(Index _rows, Index _cols, const LhsScalar* lhs_,
+                                                                      Index lhsStride, const RhsScalar* rhs_,
+                                                                      Index rhsIncr, ResScalar* res_, Index resIncr,
+                                                                      const ResScalar& alpha) {
+  static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
+  Index diagSize = (std::min)(_rows, _cols);
+  Index rows = IsLower ? _rows : diagSize;
+  Index cols = IsLower ? diagSize : _cols;
+
+  typedef Map<const Matrix<LhsScalar, Dynamic, Dynamic, RowMajor>, 0, OuterStride<> > LhsMap;
+  const LhsMap lhs(lhs_, rows, cols, OuterStride<>(lhsStride));
+  typename conj_expr_if<ConjLhs, LhsMap>::type cjLhs(lhs);
+
+  typedef Map<const Matrix<RhsScalar, Dynamic, 1> > RhsMap;
+  const RhsMap rhs(rhs_, cols);
+  typename conj_expr_if<ConjRhs, RhsMap>::type cjRhs(rhs);
+
+  typedef Map<Matrix<ResScalar, Dynamic, 1>, 0, InnerStride<> > ResMap;
+  ResMap res(res_, rows, InnerStride<>(resIncr));
+
+  typedef const_blas_data_mapper<LhsScalar, Index, RowMajor> LhsMapper;
+  typedef const_blas_data_mapper<RhsScalar, Index, RowMajor> RhsMapper;
+
+  for (Index pi = 0; pi < diagSize; pi += PanelWidth) {
+    Index actualPanelWidth = (std::min)(PanelWidth, diagSize - pi);
+    for (Index k = 0; k < actualPanelWidth; ++k) {
+      Index i = pi + k;
+      Index s = IsLower ? pi : ((HasUnitDiag || HasZeroDiag) ? i + 1 : i);
+      Index r = IsLower ? k + 1 : actualPanelWidth - k;
+      if ((!(HasUnitDiag || HasZeroDiag)) || (--r) > 0)
+        res.coeffRef(i) += alpha * (cjLhs.row(i).segment(s, r).cwiseProduct(cjRhs.segment(s, r).transpose())).sum();
+      if (HasUnitDiag) res.coeffRef(i) += alpha * cjRhs.coeff(i);
     }
-    if(IsLower && rows>diagSize)
-    {
-      general_matrix_vector_product<Index,LhsScalar,RowMajor,ConjLhs,RhsScalar,ConjRhs>::run(
-            rows-diagSize, cols,
-            &lhs.coeffRef(diagSize,0), lhsStride,
-            &rhs.coeffRef(0), rhsIncr,
-            &res.coeffRef(diagSize), resIncr, alpha);
+    Index r = IsLower ? pi : cols - pi - actualPanelWidth;
+    if (r > 0) {
+      Index s = IsLower ? 0 : pi + actualPanelWidth;
+      general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs,
+                                    BuiltIn>::run(actualPanelWidth, r, LhsMapper(&lhs.coeffRef(pi, s), lhsStride),
+                                                  RhsMapper(&rhs.coeffRef(s), rhsIncr), &res.coeffRef(pi), resIncr,
+                                                  alpha);
     }
   }
+  if (IsLower && rows > diagSize) {
+    general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, ConjLhs, RhsScalar, RhsMapper, ConjRhs>::run(
+        rows - diagSize, cols, LhsMapper(&lhs.coeffRef(diagSize, 0), lhsStride), RhsMapper(&rhs.coeffRef(0), rhsIncr),
+        &res.coeffRef(diagSize), resIncr, alpha);
+  }
+}
 
 /***************************************************************************
-* Wrapper to product_triangular_vector
-***************************************************************************/
+ * Wrapper to product_triangular_vector
+ ***************************************************************************/
 
-template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
-struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,true> >
- : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,false,Rhs,true>, Lhs, Rhs> >
-{};
-
-template<int Mode, bool LhsIsTriangular, typename Lhs, typename Rhs>
-struct traits<TriangularProduct<Mode,LhsIsTriangular,Lhs,true,Rhs,false> >
- : traits<ProductBase<TriangularProduct<Mode,LhsIsTriangular,Lhs,true,Rhs,false>, Lhs, Rhs> >
-{};
-
-
-template<int StorageOrder>
+template <int Mode, int StorageOrder>
 struct trmv_selector;
 
-} // end namespace internal
+}  // end namespace internal
 
-template<int Mode, typename Lhs, typename Rhs>
-struct TriangularProduct<Mode,true,Lhs,false,Rhs,true>
-  : public ProductBase<TriangularProduct<Mode,true,Lhs,false,Rhs,true>, Lhs, Rhs >
-{
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
+namespace internal {
 
-  TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
+template <int Mode, typename Lhs, typename Rhs>
+struct triangular_product_impl<Mode, true, Lhs, false, Rhs, true> {
+  template <typename Dest>
+  static void run(Dest& dst, const Lhs& lhs, const Rhs& rhs, const typename Dest::Scalar& alpha) {
+    eigen_assert(dst.rows() == lhs.rows() && dst.cols() == rhs.cols());
 
-  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
-  {
-    eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
-  
-    internal::trmv_selector<(int(internal::traits<Lhs>::Flags)&RowMajorBit) ? RowMajor : ColMajor>::run(*this, dst, alpha);
+    internal::trmv_selector<Mode, (int(internal::traits<Lhs>::Flags) & RowMajorBit) ? RowMajor : ColMajor>::run(
+        lhs, rhs, dst, alpha);
   }
 };
 
-template<int Mode, typename Lhs, typename Rhs>
-struct TriangularProduct<Mode,false,Lhs,true,Rhs,false>
-  : public ProductBase<TriangularProduct<Mode,false,Lhs,true,Rhs,false>, Lhs, Rhs >
-{
-  EIGEN_PRODUCT_PUBLIC_INTERFACE(TriangularProduct)
+template <int Mode, typename Lhs, typename Rhs>
+struct triangular_product_impl<Mode, false, Lhs, true, Rhs, false> {
+  template <typename Dest>
+  static void run(Dest& dst, const Lhs& lhs, const Rhs& rhs, const typename Dest::Scalar& alpha) {
+    eigen_assert(dst.rows() == lhs.rows() && dst.cols() == rhs.cols());
 
-  TriangularProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs) {}
-
-  template<typename Dest> void scaleAndAddTo(Dest& dst, const Scalar& alpha) const
-  {
-    eigen_assert(dst.rows()==m_lhs.rows() && dst.cols()==m_rhs.cols());
-
-    typedef TriangularProduct<(Mode & (UnitDiag|ZeroDiag)) | ((Mode & Lower) ? Upper : Lower),true,Transpose<const Rhs>,false,Transpose<const Lhs>,true> TriangularProductTranspose;
     Transpose<Dest> dstT(dst);
-    internal::trmv_selector<(int(internal::traits<Rhs>::Flags)&RowMajorBit) ? ColMajor : RowMajor>::run(
-      TriangularProductTranspose(m_rhs.transpose(),m_lhs.transpose()), dstT, alpha);
+    internal::trmv_selector<(Mode & (UnitDiag | ZeroDiag)) | ((Mode & Lower) ? Upper : Lower),
+                            (int(internal::traits<Rhs>::Flags) & RowMajorBit) ? ColMajor
+                                                                              : RowMajor>::run(rhs.transpose(),
+                                                                                               lhs.transpose(), dstT,
+                                                                                               alpha);
   }
 };
 
+}  // end namespace internal
+
 namespace internal {
 
 // TODO: find a way to factorize this piece of code with gemv_selector since the logic is exactly the same.
-  
-template<> struct trmv_selector<ColMajor>
-{
-  template<int Mode, typename Lhs, typename Rhs, typename Dest>
-  static void run(const TriangularProduct<Mode,true,Lhs,false,Rhs,true>& prod, Dest& dest, const typename TriangularProduct<Mode,true,Lhs,false,Rhs,true>::Scalar& alpha)
-  {
-    typedef TriangularProduct<Mode,true,Lhs,false,Rhs,true> ProductType;
-    typedef typename ProductType::Index Index;
-    typedef typename ProductType::LhsScalar   LhsScalar;
-    typedef typename ProductType::RhsScalar   RhsScalar;
-    typedef typename ProductType::Scalar      ResScalar;
-    typedef typename ProductType::RealScalar  RealScalar;
-    typedef typename ProductType::ActualLhsType ActualLhsType;
-    typedef typename ProductType::ActualRhsType ActualRhsType;
-    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
-    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
-    typedef Map<Matrix<ResScalar,Dynamic,1>, Aligned> MappedDest;
-
-    typename internal::add_const_on_value_type<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
-    typename internal::add_const_on_value_type<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
-
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
-                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
-
-    enum {
-      // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
-      // on, the other hand it is good for the cache to pack the vector anyways...
-      EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime==1,
-      ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex),
-      MightCannotUseDest = (Dest::InnerStrideAtCompileTime!=1) || ComplexByReal
-    };
-
-    gemv_static_vector_if<ResScalar,Dest::SizeAtCompileTime,Dest::MaxSizeAtCompileTime,MightCannotUseDest> static_dest;
-
-    bool alphaIsCompatible = (!ComplexByReal) || (numext::imag(actualAlpha)==RealScalar(0));
+
+template <int Mode>
+struct trmv_selector<Mode, ColMajor> {
+  template <typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs& lhs, const Rhs& rhs, Dest& dest, const typename Dest::Scalar& alpha) {
+    typedef typename Lhs::Scalar LhsScalar;
+    typedef typename Rhs::Scalar RhsScalar;
+    typedef typename Dest::Scalar ResScalar;
+
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    constexpr int Alignment = (std::min)(int(AlignedMax), int(internal::packet_traits<ResScalar>::size));
+
+    typedef Map<Matrix<ResScalar, Dynamic, 1>, Alignment> MappedDest;
+
+    add_const_on_value_type_t<ActualLhsType> actualLhs = LhsBlasTraits::extract(lhs);
+    add_const_on_value_type_t<ActualRhsType> actualRhs = RhsBlasTraits::extract(rhs);
+
+    LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs);
+    RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs);
+    ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
+
+    // FIXME find a way to allow an inner stride on the result if packet_traits<Scalar>::size==1
+    // on, the other hand it is good for the cache to pack the vector anyways...
+    constexpr bool EvalToDestAtCompileTime = Dest::InnerStrideAtCompileTime == 1;
+    constexpr bool ComplexByReal = (NumTraits<LhsScalar>::IsComplex) && (!NumTraits<RhsScalar>::IsComplex);
+    constexpr bool MightCannotUseDest = (Dest::InnerStrideAtCompileTime != 1) || ComplexByReal;
+
+    gemv_static_vector_if<ResScalar, Dest::SizeAtCompileTime, Dest::MaxSizeAtCompileTime, MightCannotUseDest>
+        static_dest;
+
+    bool alphaIsCompatible = (!ComplexByReal) || numext::is_exactly_zero(numext::imag(actualAlpha));
     bool evalToDest = EvalToDestAtCompileTime && alphaIsCompatible;
-    
-    RhsScalar compatibleAlpha = get_factor<ResScalar,RhsScalar>::run(actualAlpha);
 
-    ei_declare_aligned_stack_constructed_variable(ResScalar,actualDestPtr,dest.size(),
+    RhsScalar compatibleAlpha = get_factor<ResScalar, RhsScalar>::run(actualAlpha);
+
+    ei_declare_aligned_stack_constructed_variable(ResScalar, actualDestPtr, dest.size(),
                                                   evalToDest ? dest.data() : static_dest.data());
 
-    if(!evalToDest)
-    {
-      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+    if (!evalToDest) {
+#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      constexpr int Size = Dest::SizeAtCompileTime;
       Index size = dest.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      #endif
-      if(!alphaIsCompatible)
-      {
+#endif
+      if (!alphaIsCompatible) {
         MappedDest(actualDestPtr, dest.size()).setZero();
         compatibleAlpha = RhsScalar(1);
-      }
-      else
+      } else
         MappedDest(actualDestPtr, dest.size()) = dest;
     }
-    
-    internal::triangular_matrix_vector_product
-      <Index,Mode,
-       LhsScalar, LhsBlasTraits::NeedToConjugate,
-       RhsScalar, RhsBlasTraits::NeedToConjugate,
-       ColMajor>
-      ::run(actualLhs.rows(),actualLhs.cols(),
-            actualLhs.data(),actualLhs.outerStride(),
-            actualRhs.data(),actualRhs.innerStride(),
-            actualDestPtr,1,compatibleAlpha);
-
-    if (!evalToDest)
-    {
-      if(!alphaIsCompatible)
+
+    internal::triangular_matrix_vector_product<Index, Mode, LhsScalar, LhsBlasTraits::NeedToConjugate, RhsScalar,
+                                               RhsBlasTraits::NeedToConjugate, ColMajor>::run(actualLhs.rows(),
+                                                                                              actualLhs.cols(),
+                                                                                              actualLhs.data(),
+                                                                                              actualLhs.outerStride(),
+                                                                                              actualRhs.data(),
+                                                                                              actualRhs.innerStride(),
+                                                                                              actualDestPtr, 1,
+                                                                                              compatibleAlpha);
+
+    if (!evalToDest) {
+      if (!alphaIsCompatible)
         dest += actualAlpha * MappedDest(actualDestPtr, dest.size());
       else
         dest = MappedDest(actualDestPtr, dest.size());
     }
+
+    if (((Mode & UnitDiag) == UnitDiag) && !numext::is_exactly_one(lhs_alpha)) {
+      Index diagSize = (std::min)(lhs.rows(), lhs.cols());
+      dest.head(diagSize) -= (lhs_alpha - LhsScalar(1)) * rhs.head(diagSize);
+    }
   }
 };
 
-template<> struct trmv_selector<RowMajor>
-{
-  template<int Mode, typename Lhs, typename Rhs, typename Dest>
-  static void run(const TriangularProduct<Mode,true,Lhs,false,Rhs,true>& prod, Dest& dest, const typename TriangularProduct<Mode,true,Lhs,false,Rhs,true>::Scalar& alpha)
-  {
-    typedef TriangularProduct<Mode,true,Lhs,false,Rhs,true> ProductType;
-    typedef typename ProductType::LhsScalar LhsScalar;
-    typedef typename ProductType::RhsScalar RhsScalar;
-    typedef typename ProductType::Scalar    ResScalar;
-    typedef typename ProductType::Index Index;
-    typedef typename ProductType::ActualLhsType ActualLhsType;
-    typedef typename ProductType::ActualRhsType ActualRhsType;
-    typedef typename ProductType::_ActualRhsType _ActualRhsType;
-    typedef typename ProductType::LhsBlasTraits LhsBlasTraits;
-    typedef typename ProductType::RhsBlasTraits RhsBlasTraits;
-
-    typename add_const<ActualLhsType>::type actualLhs = LhsBlasTraits::extract(prod.lhs());
-    typename add_const<ActualRhsType>::type actualRhs = RhsBlasTraits::extract(prod.rhs());
-
-    ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(prod.lhs())
-                                  * RhsBlasTraits::extractScalarFactor(prod.rhs());
-
-    enum {
-      DirectlyUseRhs = _ActualRhsType::InnerStrideAtCompileTime==1
-    };
-
-    gemv_static_vector_if<RhsScalar,_ActualRhsType::SizeAtCompileTime,_ActualRhsType::MaxSizeAtCompileTime,!DirectlyUseRhs> static_rhs;
-
-    ei_declare_aligned_stack_constructed_variable(RhsScalar,actualRhsPtr,actualRhs.size(),
-        DirectlyUseRhs ? const_cast<RhsScalar*>(actualRhs.data()) : static_rhs.data());
-
-    if(!DirectlyUseRhs)
-    {
-      #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      int size = actualRhs.size();
+template <int Mode>
+struct trmv_selector<Mode, RowMajor> {
+  template <typename Lhs, typename Rhs, typename Dest>
+  static void run(const Lhs& lhs, const Rhs& rhs, Dest& dest, const typename Dest::Scalar& alpha) {
+    typedef typename Lhs::Scalar LhsScalar;
+    typedef typename Rhs::Scalar RhsScalar;
+    typedef typename Dest::Scalar ResScalar;
+
+    typedef internal::blas_traits<Lhs> LhsBlasTraits;
+    typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType;
+    typedef internal::blas_traits<Rhs> RhsBlasTraits;
+    typedef typename RhsBlasTraits::DirectLinearAccessType ActualRhsType;
+    typedef internal::remove_all_t<ActualRhsType> ActualRhsTypeCleaned;
+
+    std::add_const_t<ActualLhsType> actualLhs = LhsBlasTraits::extract(lhs);
+    std::add_const_t<ActualRhsType> actualRhs = RhsBlasTraits::extract(rhs);
+
+    LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs);
+    RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs);
+    ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha;
+
+    constexpr bool DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime == 1;
+
+    const RhsScalar* actualRhsPtr = actualRhs.data();
+
+    // Potentially create a temporary buffer to copy RHS to contiguous memory.
+    gemv_static_vector_if<RhsScalar, ActualRhsTypeCleaned::SizeAtCompileTime,
+                          ActualRhsTypeCleaned::MaxSizeAtCompileTime, !DirectlyUseRhs>
+        static_rhs;  // Fixed-sized array.
+    RhsScalar* buffer = nullptr;
+    if (!DirectlyUseRhs) {
+      // Maybe used fixed-sized buffer, otherwise allocate.
+      if (static_rhs.data() != nullptr) {
+        buffer = static_rhs.data();
+      } else {
+        // Allocate either with alloca or malloc.
+        Eigen::internal::check_size_for_overflow<RhsScalar>(actualRhs.size());
+#ifdef EIGEN_ALLOCA
+        buffer = static_cast<RhsScalar*>((sizeof(RhsScalar) * actualRhs.size() <= EIGEN_STACK_ALLOCATION_LIMIT)
+                                             ? EIGEN_ALIGNED_ALLOCA(sizeof(RhsScalar) * actualRhs.size())
+                                             : Eigen::internal::aligned_malloc(sizeof(RhsScalar) * actualRhs.size()));
+#else
+        buffer = static_cast<RhsScalar*>(Eigen::internal::aligned_malloc(sizeof(RhsScalar) * actualRhs.size()));
+#endif
+      }
+#ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN
+      constexpr int Size = ActualRhsTypeCleaned::SizeAtCompileTime;
+      Index size = actualRhs.size();
       EIGEN_DENSE_STORAGE_CTOR_PLUGIN
-      #endif
-      Map<typename _ActualRhsType::PlainObject>(actualRhsPtr, actualRhs.size()) = actualRhs;
+#endif
+      Map<typename ActualRhsTypeCleaned::PlainObject, Eigen::AlignedMax>(buffer, actualRhs.size()) = actualRhs;
+      actualRhsPtr = buffer;
+    }
+    // Deallocate only if malloced.
+    Eigen::internal::aligned_stack_memory_handler<RhsScalar> buffer_stack_memory_destructor(
+        buffer, actualRhs.size(),
+        !DirectlyUseRhs && static_rhs.data() == nullptr && actualRhs.size() > EIGEN_STACK_ALLOCATION_LIMIT);
+
+    internal::triangular_matrix_vector_product<Index, Mode, LhsScalar, LhsBlasTraits::NeedToConjugate, RhsScalar,
+                                               RhsBlasTraits::NeedToConjugate, RowMajor>::run(actualLhs.rows(),
+                                                                                              actualLhs.cols(),
+                                                                                              actualLhs.data(),
+                                                                                              actualLhs.outerStride(),
+                                                                                              actualRhsPtr, 1,
+                                                                                              dest.data(),
+                                                                                              dest.innerStride(),
+                                                                                              actualAlpha);
+
+    if (((Mode & UnitDiag) == UnitDiag) && !numext::is_exactly_one(lhs_alpha)) {
+      Index diagSize = (std::min)(lhs.rows(), lhs.cols());
+      dest.head(diagSize) -= (lhs_alpha - LhsScalar(1)) * rhs.head(diagSize);
     }
-    
-    internal::triangular_matrix_vector_product
-      <Index,Mode,
-       LhsScalar, LhsBlasTraits::NeedToConjugate,
-       RhsScalar, RhsBlasTraits::NeedToConjugate,
-       RowMajor>
-      ::run(actualLhs.rows(),actualLhs.cols(),
-            actualLhs.data(),actualLhs.outerStride(),
-            actualRhsPtr,1,
-            dest.data(),dest.innerStride(),
-            actualAlpha);
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_TRIANGULARMATRIXVECTOR_H
+#endif  // EIGEN_TRIANGULARMATRIXVECTOR_H
diff --git a/inst/include/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/inst/include/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
new file mode 100644
index 00000000..1de68803
--- /dev/null
+++ b/inst/include/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h
@@ -0,0 +1,275 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   Triangular matrix-vector product functionality based on ?TRMV.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
+#define EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/**********************************************************************
+ * This file implements triangular matrix-vector multiplication using BLAS
+ **********************************************************************/
+
+// trmv/hemv specialization
+
+template <typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs,
+          int StorageOrder>
+struct triangular_matrix_vector_product_trmv
+    : triangular_matrix_vector_product<Index, Mode, LhsScalar, ConjLhs, RhsScalar, ConjRhs, StorageOrder, BuiltIn> {};
+
+#define EIGEN_BLAS_TRMV_SPECIALIZE(Scalar)                                                                            \
+  template <typename Index, int Mode, bool ConjLhs, bool ConjRhs>                                                     \
+  struct triangular_matrix_vector_product<Index, Mode, Scalar, ConjLhs, Scalar, ConjRhs, ColMajor, Specialized> {     \
+    static void run(Index rows_, Index cols_, const Scalar* lhs_, Index lhsStride, const Scalar* rhs_, Index rhsIncr, \
+                    Scalar* res_, Index resIncr, Scalar alpha) {                                                      \
+      triangular_matrix_vector_product_trmv<Index, Mode, Scalar, ConjLhs, Scalar, ConjRhs, ColMajor>::run(            \
+          rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha);                                        \
+    }                                                                                                                 \
+  };                                                                                                                  \
+  template <typename Index, int Mode, bool ConjLhs, bool ConjRhs>                                                     \
+  struct triangular_matrix_vector_product<Index, Mode, Scalar, ConjLhs, Scalar, ConjRhs, RowMajor, Specialized> {     \
+    static void run(Index rows_, Index cols_, const Scalar* lhs_, Index lhsStride, const Scalar* rhs_, Index rhsIncr, \
+                    Scalar* res_, Index resIncr, Scalar alpha) {                                                      \
+      triangular_matrix_vector_product_trmv<Index, Mode, Scalar, ConjLhs, Scalar, ConjRhs, RowMajor>::run(            \
+          rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha);                                        \
+    }                                                                                                                 \
+  };
+
+EIGEN_BLAS_TRMV_SPECIALIZE(double)
+EIGEN_BLAS_TRMV_SPECIALIZE(float)
+EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex)
+EIGEN_BLAS_TRMV_SPECIALIZE(scomplex)
+
+// implements col-major: res += alpha * op(triangular) * vector
+#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX)                                    \
+  template <typename Index, int Mode, bool ConjLhs, bool ConjRhs>                                                    \
+  struct triangular_matrix_vector_product_trmv<Index, Mode, EIGTYPE, ConjLhs, EIGTYPE, ConjRhs, ColMajor> {          \
+    enum {                                                                                                           \
+      IsLower = (Mode & Lower) == Lower,                                                                             \
+      SetDiag = (Mode & (ZeroDiag | UnitDiag)) ? 0 : 1,                                                              \
+      IsUnitDiag = (Mode & UnitDiag) ? 1 : 0,                                                                        \
+      IsZeroDiag = (Mode & ZeroDiag) ? 1 : 0,                                                                        \
+      LowUp = IsLower ? Lower : Upper                                                                                \
+    };                                                                                                               \
+    static void run(Index rows_, Index cols_, const EIGTYPE* lhs_, Index lhsStride, const EIGTYPE* rhs_,             \
+                    Index rhsIncr, EIGTYPE* res_, Index resIncr, EIGTYPE alpha) {                                    \
+      if (rows_ == 0 || cols_ == 0) return;                                                                          \
+      if (ConjLhs || IsZeroDiag) {                                                                                   \
+        triangular_matrix_vector_product<Index, Mode, EIGTYPE, ConjLhs, EIGTYPE, ConjRhs, ColMajor, BuiltIn>::run(   \
+            rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha);                                     \
+        return;                                                                                                      \
+      }                                                                                                              \
+      Index size = (std::min)(rows_, cols_);                                                                         \
+      Index rows = IsLower ? rows_ : size;                                                                           \
+      Index cols = IsLower ? size : cols_;                                                                           \
+                                                                                                                     \
+      typedef VectorX##EIGPREFIX VectorRhs;                                                                          \
+      EIGTYPE *x, *y;                                                                                                \
+                                                                                                                     \
+      /* Set x*/                                                                                                     \
+      Map<const VectorRhs, 0, InnerStride<> > rhs(rhs_, cols, InnerStride<>(rhsIncr));                               \
+      VectorRhs x_tmp;                                                                                               \
+      if (ConjRhs)                                                                                                   \
+        x_tmp = rhs.conjugate();                                                                                     \
+      else                                                                                                           \
+        x_tmp = rhs;                                                                                                 \
+      x = x_tmp.data();                                                                                              \
+                                                                                                                     \
+      /* Square part handling */                                                                                     \
+                                                                                                                     \
+      char trans, uplo, diag;                                                                                        \
+      BlasIndex m, n, lda, incx, incy;                                                                               \
+      EIGTYPE const* a;                                                                                              \
+      EIGTYPE beta(1);                                                                                               \
+                                                                                                                     \
+      /* Set m, n */                                                                                                 \
+      n = convert_index<BlasIndex>(size);                                                                            \
+      lda = convert_index<BlasIndex>(lhsStride);                                                                     \
+      incx = 1;                                                                                                      \
+      incy = convert_index<BlasIndex>(resIncr);                                                                      \
+                                                                                                                     \
+      /* Set uplo, trans and diag*/                                                                                  \
+      trans = 'N';                                                                                                   \
+      uplo = IsLower ? 'L' : 'U';                                                                                    \
+      diag = IsUnitDiag ? 'U' : 'N';                                                                                 \
+                                                                                                                     \
+      /* call ?TRMV*/                                                                                                \
+      BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)lhs_, &lda, (BLASTYPE*)x, &incx);     \
+                                                                                                                     \
+      /* Add op(a_tr)rhs into res*/                                                                                  \
+      BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)x, &incx,        \
+                                    (BLASTYPE*)res_, &incy);                                                         \
+      /* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/                           \
+      if (size < (std::max)(rows, cols)) {                                                                           \
+        if (ConjRhs)                                                                                                 \
+          x_tmp = rhs.conjugate();                                                                                   \
+        else                                                                                                         \
+          x_tmp = rhs;                                                                                               \
+        x = x_tmp.data();                                                                                            \
+        if (size < rows) {                                                                                           \
+          y = res_ + size * resIncr;                                                                                 \
+          a = lhs_ + size;                                                                                           \
+          m = convert_index<BlasIndex>(rows - size);                                                                 \
+          n = convert_index<BlasIndex>(size);                                                                        \
+        } else {                                                                                                     \
+          x += size;                                                                                                 \
+          y = res_;                                                                                                  \
+          a = lhs_ + size * lda;                                                                                     \
+          m = convert_index<BlasIndex>(size);                                                                        \
+          n = convert_index<BlasIndex>(cols - size);                                                                 \
+        }                                                                                                            \
+        BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, \
+                                      &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta),     \
+                                      (BLASTYPE*)y, &incy);                                                          \
+      }                                                                                                              \
+    }                                                                                                                \
+  };
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMV_CM(double, double, d, d, )
+EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z, )
+EIGEN_BLAS_TRMV_CM(float, float, f, s, )
+EIGEN_BLAS_TRMV_CM(scomplex, MKL_Complex8, cf, c, )
+#else
+EIGEN_BLAS_TRMV_CM(double, double, d, d, _)
+EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _)
+EIGEN_BLAS_TRMV_CM(float, float, f, s, _)
+EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c, _)
+#endif
+
+// implements row-major: res += alpha * op(triangular) * vector
+#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX)                                    \
+  template <typename Index, int Mode, bool ConjLhs, bool ConjRhs>                                                    \
+  struct triangular_matrix_vector_product_trmv<Index, Mode, EIGTYPE, ConjLhs, EIGTYPE, ConjRhs, RowMajor> {          \
+    enum {                                                                                                           \
+      IsLower = (Mode & Lower) == Lower,                                                                             \
+      SetDiag = (Mode & (ZeroDiag | UnitDiag)) ? 0 : 1,                                                              \
+      IsUnitDiag = (Mode & UnitDiag) ? 1 : 0,                                                                        \
+      IsZeroDiag = (Mode & ZeroDiag) ? 1 : 0,                                                                        \
+      LowUp = IsLower ? Lower : Upper                                                                                \
+    };                                                                                                               \
+    static void run(Index rows_, Index cols_, const EIGTYPE* lhs_, Index lhsStride, const EIGTYPE* rhs_,             \
+                    Index rhsIncr, EIGTYPE* res_, Index resIncr, EIGTYPE alpha) {                                    \
+      if (rows_ == 0 || cols_ == 0) return;                                                                          \
+      if (IsZeroDiag) {                                                                                              \
+        triangular_matrix_vector_product<Index, Mode, EIGTYPE, ConjLhs, EIGTYPE, ConjRhs, RowMajor, BuiltIn>::run(   \
+            rows_, cols_, lhs_, lhsStride, rhs_, rhsIncr, res_, resIncr, alpha);                                     \
+        return;                                                                                                      \
+      }                                                                                                              \
+      Index size = (std::min)(rows_, cols_);                                                                         \
+      Index rows = IsLower ? rows_ : size;                                                                           \
+      Index cols = IsLower ? size : cols_;                                                                           \
+                                                                                                                     \
+      typedef VectorX##EIGPREFIX VectorRhs;                                                                          \
+      EIGTYPE *x, *y;                                                                                                \
+                                                                                                                     \
+      /* Set x*/                                                                                                     \
+      Map<const VectorRhs, 0, InnerStride<> > rhs(rhs_, cols, InnerStride<>(rhsIncr));                               \
+      VectorRhs x_tmp;                                                                                               \
+      if (ConjRhs)                                                                                                   \
+        x_tmp = rhs.conjugate();                                                                                     \
+      else                                                                                                           \
+        x_tmp = rhs;                                                                                                 \
+      x = x_tmp.data();                                                                                              \
+                                                                                                                     \
+      /* Square part handling */                                                                                     \
+                                                                                                                     \
+      char trans, uplo, diag;                                                                                        \
+      BlasIndex m, n, lda, incx, incy;                                                                               \
+      EIGTYPE const* a;                                                                                              \
+      EIGTYPE beta(1);                                                                                               \
+                                                                                                                     \
+      /* Set m, n */                                                                                                 \
+      n = convert_index<BlasIndex>(size);                                                                            \
+      lda = convert_index<BlasIndex>(lhsStride);                                                                     \
+      incx = 1;                                                                                                      \
+      incy = convert_index<BlasIndex>(resIncr);                                                                      \
+                                                                                                                     \
+      /* Set uplo, trans and diag*/                                                                                  \
+      trans = ConjLhs ? 'C' : 'T';                                                                                   \
+      uplo = IsLower ? 'U' : 'L';                                                                                    \
+      diag = IsUnitDiag ? 'U' : 'N';                                                                                 \
+                                                                                                                     \
+      /* call ?TRMV*/                                                                                                \
+      BLASPREFIX##trmv##BLASPOSTFIX(&uplo, &trans, &diag, &n, (const BLASTYPE*)lhs_, &lda, (BLASTYPE*)x, &incx);     \
+                                                                                                                     \
+      /* Add op(a_tr)rhs into res*/                                                                                  \
+      BLASPREFIX##axpy##BLASPOSTFIX(&n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)x, &incx,        \
+                                    (BLASTYPE*)res_, &incy);                                                         \
+      /* Non-square case - doesn't fit to BLAS ?TRMV. Fall to default triangular product*/                           \
+      if (size < (std::max)(rows, cols)) {                                                                           \
+        if (ConjRhs)                                                                                                 \
+          x_tmp = rhs.conjugate();                                                                                   \
+        else                                                                                                         \
+          x_tmp = rhs;                                                                                               \
+        x = x_tmp.data();                                                                                            \
+        if (size < rows) {                                                                                           \
+          y = res_ + size * resIncr;                                                                                 \
+          a = lhs_ + size * lda;                                                                                     \
+          m = convert_index<BlasIndex>(rows - size);                                                                 \
+          n = convert_index<BlasIndex>(size);                                                                        \
+        } else {                                                                                                     \
+          x += size;                                                                                                 \
+          y = res_;                                                                                                  \
+          a = lhs_ + size;                                                                                           \
+          m = convert_index<BlasIndex>(size);                                                                        \
+          n = convert_index<BlasIndex>(cols - size);                                                                 \
+        }                                                                                                            \
+        BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, \
+                                      &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta),     \
+                                      (BLASTYPE*)y, &incy);                                                          \
+      }                                                                                                              \
+    }                                                                                                                \
+  };
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRMV_RM(double, double, d, d, )
+EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z, )
+EIGEN_BLAS_TRMV_RM(float, float, f, s, )
+EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8, cf, c, )
+#else
+EIGEN_BLAS_TRMV_RM(double, double, d, d, _)
+EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z, _)
+EIGEN_BLAS_TRMV_RM(float, float, f, s, _)
+EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c, _)
+#endif
+
+}  // namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_TRIANGULAR_MATRIX_VECTOR_BLAS_H
diff --git a/inst/include/Eigen/src/Core/products/TriangularMatrixVector_MKL.h b/inst/include/Eigen/src/Core/products/TriangularMatrixVector_MKL.h
deleted file mode 100644
index 09f110da..00000000
--- a/inst/include/Eigen/src/Core/products/TriangularMatrixVector_MKL.h
+++ /dev/null
@@ -1,247 +0,0 @@
-/*
- Copyright (c) 2011, Intel Corporation. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
- * Neither the name of Intel Corporation nor the names of its contributors may
-   be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
- *   Triangular matrix-vector product functionality based on ?TRMV.
- ********************************************************************************
-*/
-
-#ifndef EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
-#define EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
-
-namespace Eigen { 
-
-namespace internal {
-
-/**********************************************************************
-* This file implements triangular matrix-vector multiplication using BLAS
-**********************************************************************/
-
-// trmv/hemv specialization
-
-template<typename Index, int Mode, typename LhsScalar, bool ConjLhs, typename RhsScalar, bool ConjRhs, int StorageOrder>
-struct triangular_matrix_vector_product_trmv :
-  triangular_matrix_vector_product<Index,Mode,LhsScalar,ConjLhs,RhsScalar,ConjRhs,StorageOrder,BuiltIn> {};
-
-#define EIGEN_MKL_TRMV_SPECIALIZE(Scalar) \
-template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
-struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor,Specialized> { \
- static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
-                                     const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \
-      triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,ColMajor>::run( \
-        _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
-  } \
-}; \
-template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
-struct triangular_matrix_vector_product<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor,Specialized> { \
- static void run(Index _rows, Index _cols, const Scalar* _lhs, Index lhsStride, \
-                                     const Scalar* _rhs, Index rhsIncr, Scalar* _res, Index resIncr, Scalar alpha) { \
-      triangular_matrix_vector_product_trmv<Index,Mode,Scalar,ConjLhs,Scalar,ConjRhs,RowMajor>::run( \
-        _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
-  } \
-};
-
-EIGEN_MKL_TRMV_SPECIALIZE(double)
-EIGEN_MKL_TRMV_SPECIALIZE(float)
-EIGEN_MKL_TRMV_SPECIALIZE(dcomplex)
-EIGEN_MKL_TRMV_SPECIALIZE(scomplex)
-
-// implements col-major: res += alpha * op(triangular) * vector
-#define EIGEN_MKL_TRMV_CM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
-template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
-struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor> { \
-  enum { \
-    IsLower = (Mode&Lower) == Lower, \
-    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
-    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
-    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
-    LowUp = IsLower ? Lower : Upper \
-  }; \
- static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \
-                 const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \
- { \
-   if (ConjLhs || IsZeroDiag) { \
-     triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,ColMajor,BuiltIn>::run( \
-       _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
-     return; \
-   }\
-   Index size = (std::min)(_rows,_cols); \
-   Index rows = IsLower ? _rows : size; \
-   Index cols = IsLower ? size : _cols; \
-\
-   typedef VectorX##EIGPREFIX VectorRhs; \
-   EIGTYPE *x, *y;\
-\
-/* Set x*/ \
-   Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \
-   VectorRhs x_tmp; \
-   if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
-   x = x_tmp.data(); \
-\
-/* Square part handling */\
-\
-   char trans, uplo, diag; \
-   MKL_INT m, n, lda, incx, incy; \
-   EIGTYPE const *a; \
-   MKLTYPE alpha_, beta_; \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
-\
-/* Set m, n */ \
-   n = (MKL_INT)size; \
-   lda = lhsStride; \
-   incx = 1; \
-   incy = resIncr; \
-\
-/* Set uplo, trans and diag*/ \
-   trans = 'N'; \
-   uplo = IsLower ? 'L' : 'U'; \
-   diag = IsUnitDiag ? 'U' : 'N'; \
-\
-/* call ?TRMV*/ \
-   MKLPREFIX##trmv(&uplo, &trans, &diag, &n, (const MKLTYPE*)_lhs, &lda, (MKLTYPE*)x, &incx); \
-\
-/* Add op(a_tr)rhs into res*/ \
-   MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
-/* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \
-   if (size<(std::max)(rows,cols)) { \
-     typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
-     if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
-     x = x_tmp.data(); \
-     if (size<rows) { \
-       y = _res + size*resIncr; \
-       a = _lhs + size; \
-       m = rows-size; \
-       n = size; \
-     } \
-     else { \
-       x += size; \
-       y = _res; \
-       a = _lhs + size*lda; \
-       m = size; \
-       n = cols-size; \
-     } \
-     MKLPREFIX##gemv(&trans, &m, &n, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &beta_, (MKLTYPE*)y, &incy); \
-   } \
-  } \
-};
-
-EIGEN_MKL_TRMV_CM(double, double, d, d)
-EIGEN_MKL_TRMV_CM(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_TRMV_CM(float, float, f, s)
-EIGEN_MKL_TRMV_CM(scomplex, MKL_Complex8, cf, c)
-
-// implements row-major: res += alpha * op(triangular) * vector
-#define EIGEN_MKL_TRMV_RM(EIGTYPE, MKLTYPE, EIGPREFIX, MKLPREFIX) \
-template<typename Index, int Mode, bool ConjLhs, bool ConjRhs> \
-struct triangular_matrix_vector_product_trmv<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor> { \
-  enum { \
-    IsLower = (Mode&Lower) == Lower, \
-    SetDiag = (Mode&(ZeroDiag|UnitDiag)) ? 0 : 1, \
-    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
-    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
-    LowUp = IsLower ? Lower : Upper \
-  }; \
- static void run(Index _rows, Index _cols, const EIGTYPE* _lhs, Index lhsStride, \
-                 const EIGTYPE* _rhs, Index rhsIncr, EIGTYPE* _res, Index resIncr, EIGTYPE alpha) \
- { \
-   if (IsZeroDiag) { \
-     triangular_matrix_vector_product<Index,Mode,EIGTYPE,ConjLhs,EIGTYPE,ConjRhs,RowMajor,BuiltIn>::run( \
-       _rows, _cols, _lhs, lhsStride, _rhs, rhsIncr, _res, resIncr, alpha); \
-     return; \
-   }\
-   Index size = (std::min)(_rows,_cols); \
-   Index rows = IsLower ? _rows : size; \
-   Index cols = IsLower ? size : _cols; \
-\
-   typedef VectorX##EIGPREFIX VectorRhs; \
-   EIGTYPE *x, *y;\
-\
-/* Set x*/ \
-   Map<const VectorRhs, 0, InnerStride<> > rhs(_rhs,cols,InnerStride<>(rhsIncr)); \
-   VectorRhs x_tmp; \
-   if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
-   x = x_tmp.data(); \
-\
-/* Square part handling */\
-\
-   char trans, uplo, diag; \
-   MKL_INT m, n, lda, incx, incy; \
-   EIGTYPE const *a; \
-   MKLTYPE alpha_, beta_; \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(alpha_, alpha); \
-   assign_scalar_eig2mkl<MKLTYPE, EIGTYPE>(beta_, EIGTYPE(1)); \
-\
-/* Set m, n */ \
-   n = (MKL_INT)size; \
-   lda = lhsStride; \
-   incx = 1; \
-   incy = resIncr; \
-\
-/* Set uplo, trans and diag*/ \
-   trans = ConjLhs ? 'C' : 'T'; \
-   uplo = IsLower ? 'U' : 'L'; \
-   diag = IsUnitDiag ? 'U' : 'N'; \
-\
-/* call ?TRMV*/ \
-   MKLPREFIX##trmv(&uplo, &trans, &diag, &n, (const MKLTYPE*)_lhs, &lda, (MKLTYPE*)x, &incx); \
-\
-/* Add op(a_tr)rhs into res*/ \
-   MKLPREFIX##axpy(&n, &alpha_,(const MKLTYPE*)x, &incx, (MKLTYPE*)_res, &incy); \
-/* Non-square case - doesn't fit to MKL ?TRMV. Fall to default triangular product*/ \
-   if (size<(std::max)(rows,cols)) { \
-     typedef Matrix<EIGTYPE, Dynamic, Dynamic> MatrixLhs; \
-     if (ConjRhs) x_tmp = rhs.conjugate(); else x_tmp = rhs; \
-     x = x_tmp.data(); \
-     if (size<rows) { \
-       y = _res + size*resIncr; \
-       a = _lhs + size*lda; \
-       m = rows-size; \
-       n = size; \
-     } \
-     else { \
-       x += size; \
-       y = _res; \
-       a = _lhs + size; \
-       m = size; \
-       n = cols-size; \
-     } \
-     MKLPREFIX##gemv(&trans, &n, &m, &alpha_, (const MKLTYPE*)a, &lda, (const MKLTYPE*)x, &incx, &beta_, (MKLTYPE*)y, &incy); \
-   } \
-  } \
-};
-
-EIGEN_MKL_TRMV_RM(double, double, d, d)
-EIGEN_MKL_TRMV_RM(dcomplex, MKL_Complex16, cd, z)
-EIGEN_MKL_TRMV_RM(float, float, f, s)
-EIGEN_MKL_TRMV_RM(scomplex, MKL_Complex8, cf, c)
-
-} // end namespase internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_TRIANGULAR_MATRIX_VECTOR_MKL_H
diff --git a/inst/include/Eigen/src/Core/products/TriangularSolverMatrix.h b/inst/include/Eigen/src/Core/products/TriangularSolverMatrix.h
index 04240ab5..8244758b 100644
--- a/inst/include/Eigen/src/Core/products/TriangularSolverMatrix.h
+++ b/inst/include/Eigen/src/Core/products/TriangularSolverMatrix.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Modifications Copyright (C) 2022 Intel Corporation
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,323 +11,378 @@
 #ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_H
 #define EIGEN_TRIANGULAR_SOLVER_MATRIX_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride,
+          bool Specialized>
+struct trsmKernelL {
+  // Generic Implementation of triangular solve for triangular matrix on left and multiple rhs.
+  // Handles non-packed matrices.
+  static void kernel(Index size, Index otherSize, const Scalar* _tri, Index triStride, Scalar* _other, Index otherIncr,
+                     Index otherStride);
+};
+
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride,
+          bool Specialized>
+struct trsmKernelR {
+  // Generic Implementation of triangular solve for triangular matrix on right and multiple lhs.
+  // Handles non-packed matrices.
+  static void kernel(Index size, Index otherSize, const Scalar* _tri, Index triStride, Scalar* _other, Index otherIncr,
+                     Index otherStride);
+};
+
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride,
+          bool Specialized>
+EIGEN_STRONG_INLINE void trsmKernelL<Scalar, Index, Mode, Conjugate, TriStorageOrder, OtherInnerStride,
+                                     Specialized>::kernel(Index size, Index otherSize, const Scalar* _tri,
+                                                          Index triStride, Scalar* _other, Index otherIncr,
+                                                          Index otherStride) {
+  typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
+  typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> OtherMapper;
+  TriMapper tri(_tri, triStride);
+  OtherMapper other(_other, otherStride, otherIncr);
+
+  enum { IsLower = (Mode & Lower) == Lower };
+  conj_if<Conjugate> conj;
+
+  // tr solve
+  for (Index k = 0; k < size; ++k) {
+    // TODO write a small kernel handling this (can be shared with trsv)
+    Index i = IsLower ? k : -k - 1;
+    Index rs = size - k - 1;  // remaining size
+    Index s = TriStorageOrder == RowMajor ? (IsLower ? 0 : i + 1) : IsLower ? i + 1 : i - rs;
+
+    Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(Scalar(1) / conj(tri(i, i)));
+    for (Index j = 0; j < otherSize; ++j) {
+      if (TriStorageOrder == RowMajor) {
+        Scalar b(0);
+        const Scalar* l = &tri(i, s);
+        typename OtherMapper::LinearMapper r = other.getLinearMapper(s, j);
+        for (Index i3 = 0; i3 < k; ++i3) b += conj(l[i3]) * r(i3);
+
+        other(i, j) = (other(i, j) - b) * a;
+      } else {
+        Scalar& otherij = other(i, j);
+        otherij *= a;
+        Scalar b = otherij;
+        typename OtherMapper::LinearMapper r = other.getLinearMapper(s, j);
+        typename TriMapper::LinearMapper l = tri.getLinearMapper(s, i);
+        for (Index i3 = 0; i3 < rs; ++i3) r(i3) -= b * conj(l(i3));
+      }
+    }
+  }
+}
+
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride,
+          bool Specialized>
+EIGEN_STRONG_INLINE void trsmKernelR<Scalar, Index, Mode, Conjugate, TriStorageOrder, OtherInnerStride,
+                                     Specialized>::kernel(Index size, Index otherSize, const Scalar* _tri,
+                                                          Index triStride, Scalar* _other, Index otherIncr,
+                                                          Index otherStride) {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> LhsMapper;
+  typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
+  LhsMapper lhs(_other, otherStride, otherIncr);
+  RhsMapper rhs(_tri, triStride);
+
+  enum { RhsStorageOrder = TriStorageOrder, IsLower = (Mode & Lower) == Lower };
+  conj_if<Conjugate> conj;
+
+  for (Index k = 0; k < size; ++k) {
+    Index j = IsLower ? size - k - 1 : k;
+
+    typename LhsMapper::LinearMapper r = lhs.getLinearMapper(0, j);
+    for (Index k3 = 0; k3 < k; ++k3) {
+      Scalar b = conj(rhs(IsLower ? j + 1 + k3 : k3, j));
+      typename LhsMapper::LinearMapper a = lhs.getLinearMapper(0, IsLower ? j + 1 + k3 : k3);
+      for (Index i = 0; i < otherSize; ++i) r(i) -= a(i) * b;
+    }
+    if ((Mode & UnitDiag) == 0) {
+      Scalar inv_rjj = RealScalar(1) / conj(rhs(j, j));
+      for (Index i = 0; i < otherSize; ++i) r(i) *= inv_rjj;
+    }
+  }
+}
+
 // if the rhs is row major, let's transpose the product
-template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder>
-struct triangular_solve_matrix<Scalar,Index,Side,Mode,Conjugate,TriStorageOrder,RowMajor>
-{
-  static void run(
-    Index size, Index cols,
-    const Scalar*  tri, Index triStride,
-    Scalar* _other, Index otherStride,
-    level3_blocking<Scalar,Scalar>& blocking)
-  {
+template <typename Scalar, typename Index, int Side, int Mode, bool Conjugate, int TriStorageOrder,
+          int OtherInnerStride>
+struct triangular_solve_matrix<Scalar, Index, Side, Mode, Conjugate, TriStorageOrder, RowMajor, OtherInnerStride> {
+  static void run(Index size, Index cols, const Scalar* tri, Index triStride, Scalar* _other, Index otherIncr,
+                  Index otherStride, level3_blocking<Scalar, Scalar>& blocking) {
     triangular_solve_matrix<
-      Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft,
-      (Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper),
-      NumTraits<Scalar>::IsComplex && Conjugate,
-      TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor>
-      ::run(size, cols, tri, triStride, _other, otherStride, blocking);
+        Scalar, Index, Side == OnTheLeft ? OnTheRight : OnTheLeft, (Mode & UnitDiag) | ((Mode & Upper) ? Lower : Upper),
+        NumTraits<Scalar>::IsComplex && Conjugate, TriStorageOrder == RowMajor ? ColMajor : RowMajor, ColMajor,
+        OtherInnerStride>::run(size, cols, tri, triStride, _other, otherIncr, otherStride, blocking);
   }
 };
 
 /* Optimized triangular solver with multiple right hand side and the triangular matrix on the left
  */
-template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
-struct triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>
-{
-  static EIGEN_DONT_INLINE void run(
-    Index size, Index otherSize,
-    const Scalar* _tri, Index triStride,
-    Scalar* _other, Index otherStride,
-    level3_blocking<Scalar,Scalar>& blocking);
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+struct triangular_solve_matrix<Scalar, Index, OnTheLeft, Mode, Conjugate, TriStorageOrder, ColMajor, OtherInnerStride> {
+  static EIGEN_DONT_INLINE void run(Index size, Index otherSize, const Scalar* _tri, Index triStride, Scalar* _other,
+                                    Index otherIncr, Index otherStride, level3_blocking<Scalar, Scalar>& blocking);
 };
-template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
-EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor>::run(
-    Index size, Index otherSize,
-    const Scalar* _tri, Index triStride,
-    Scalar* _other, Index otherStride,
-    level3_blocking<Scalar,Scalar>& blocking)
-  {
-    Index cols = otherSize;
-    const_blas_data_mapper<Scalar, Index, TriStorageOrder> tri(_tri,triStride);
-    blas_data_mapper<Scalar, Index, ColMajor> other(_other,otherStride);
-
-    typedef gebp_traits<Scalar,Scalar> Traits;
-    enum {
-      SmallPanelWidth   = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
-      IsLower = (Mode&Lower) == Lower
-    };
-
-    Index kc = blocking.kc();                   // cache block size along the K direction
-    Index mc = (std::min)(size,blocking.mc());  // cache block size along the M direction
-
-    std::size_t sizeA = kc*mc;
-    std::size_t sizeB = kc*cols;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
-
-    conj_if<Conjugate> conj;
-    gebp_kernel<Scalar, Scalar, Index, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, TriStorageOrder> pack_lhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr, ColMajor, false, true> pack_rhs;
-
-    // the goal here is to subdivise the Rhs panels such that we keep some cache
-    // coherence when accessing the rhs elements
-    std::ptrdiff_t l1, l2;
-    manage_caching_sizes(GetAction, &l1, &l2);
-    Index subcols = cols>0 ? l2/(4 * sizeof(Scalar) * otherStride) : 0;
-    subcols = std::max<Index>((subcols/Traits::nr)*Traits::nr, Traits::nr);
-
-    for(Index k2=IsLower ? 0 : size;
-        IsLower ? k2<size : k2>0;
-        IsLower ? k2+=kc : k2-=kc)
-    {
-      const Index actual_kc = (std::min)(IsLower ? size-k2 : k2, kc);
-
-      // We have selected and packed a big horizontal panel R1 of rhs. Let B be the packed copy of this panel,
-      // and R2 the remaining part of rhs. The corresponding vertical panel of lhs is split into
-      // A11 (the triangular part) and A21 the remaining rectangular part.
-      // Then the high level algorithm is:
-      //  - B = R1                    => general block copy (done during the next step)
-      //  - R1 = A11^-1 B             => tricky part
-      //  - update B from the new R1  => actually this has to be performed continuously during the above step
-      //  - R2 -= A21 * B             => GEPP
-
-      // The tricky part: compute R1 = A11^-1 B while updating B from R1
-      // The idea is to split A11 into multiple small vertical panels.
-      // Each panel can be split into a small triangular part T1k which is processed without optimization,
-      // and the remaining small part T2k which is processed using gebp with appropriate block strides
-      for(Index j2=0; j2<cols; j2+=subcols)
-      {
-        Index actual_cols = (std::min)(cols-j2,subcols);
-        // for each small vertical panels [T1k^T, T2k^T]^T of lhs
-        for (Index k1=0; k1<actual_kc; k1+=SmallPanelWidth)
+
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar, Index, OnTheLeft, Mode, Conjugate, TriStorageOrder, ColMajor,
+                                               OtherInnerStride>::run(Index size, Index otherSize, const Scalar* _tri,
+                                                                      Index triStride, Scalar* _other, Index otherIncr,
+                                                                      Index otherStride,
+                                                                      level3_blocking<Scalar, Scalar>& blocking) {
+  Index cols = otherSize;
+
+  std::ptrdiff_t l1, l2, l3;
+  manage_caching_sizes(GetAction, &l1, &l2, &l3);
+
+#if defined(EIGEN_VECTORIZE_AVX512) && EIGEN_USE_AVX512_TRSM_L_KERNELS && EIGEN_ENABLE_AVX512_NOCOPY_TRSM_L_CUTOFFS
+  EIGEN_IF_CONSTEXPR(
+      (OtherInnerStride == 1 && (std::is_same<Scalar, float>::value || std::is_same<Scalar, double>::value))) {
+    // Very rough cutoffs to determine when to call trsm w/o packing
+    // For small problem sizes trsmKernel compiled with clang is generally faster.
+    // TODO: Investigate better heuristics for cutoffs.
+    double L2Cap = 0.5;  // 50% of L2 size
+    if (size < avx512_trsm_cutoff<Scalar>(l2, cols, L2Cap)) {
+      trsmKernelL<Scalar, Index, Mode, Conjugate, TriStorageOrder, 1, /*Specialized=*/true>::kernel(
+          size, cols, _tri, triStride, _other, 1, otherStride);
+      return;
+    }
+  }
+#endif
+
+  typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> TriMapper;
+  typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> OtherMapper;
+  TriMapper tri(_tri, triStride);
+  OtherMapper other(_other, otherStride, otherIncr);
+
+  typedef gebp_traits<Scalar, Scalar> Traits;
+
+  enum { SmallPanelWidth = plain_enum_max(Traits::mr, Traits::nr), IsLower = (Mode & Lower) == Lower };
+
+  Index kc = blocking.kc();                    // cache block size along the K direction
+  Index mc = (std::min)(size, blocking.mc());  // cache block size along the M direction
+
+  std::size_t sizeA = kc * mc;
+  std::size_t sizeB = kc * cols;
+
+  ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+  ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+  gebp_kernel<Scalar, Scalar, Index, OtherMapper, Traits::mr, Traits::nr, Conjugate, false> gebp_kernel;
+  gemm_pack_lhs<Scalar, Index, TriMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing,
+                TriStorageOrder>
+      pack_lhs;
+  gemm_pack_rhs<Scalar, Index, OtherMapper, Traits::nr, ColMajor, false, true> pack_rhs;
+
+  // the goal here is to subdivise the Rhs panels such that we keep some cache
+  // coherence when accessing the rhs elements
+  Index subcols = cols > 0 ? l2 / (4 * sizeof(Scalar) * std::max<Index>(otherStride, size)) : 0;
+  subcols = std::max<Index>((subcols / Traits::nr) * Traits::nr, Traits::nr);
+
+  for (Index k2 = IsLower ? 0 : size; IsLower ? k2 < size : k2 > 0; IsLower ? k2 += kc : k2 -= kc) {
+    const Index actual_kc = (std::min)(IsLower ? size - k2 : k2, kc);
+
+    // We have selected and packed a big horizontal panel R1 of rhs. Let B be the packed copy of this panel,
+    // and R2 the remaining part of rhs. The corresponding vertical panel of lhs is split into
+    // A11 (the triangular part) and A21 the remaining rectangular part.
+    // Then the high level algorithm is:
+    //  - B = R1                    => general block copy (done during the next step)
+    //  - R1 = A11^-1 B             => tricky part
+    //  - update B from the new R1  => actually this has to be performed continuously during the above step
+    //  - R2 -= A21 * B             => GEPP
+
+    // The tricky part: compute R1 = A11^-1 B while updating B from R1
+    // The idea is to split A11 into multiple small vertical panels.
+    // Each panel can be split into a small triangular part T1k which is processed without optimization,
+    // and the remaining small part T2k which is processed using gebp with appropriate block strides
+    for (Index j2 = 0; j2 < cols; j2 += subcols) {
+      Index actual_cols = (std::min)(cols - j2, subcols);
+      // for each small vertical panels [T1k^T, T2k^T]^T of lhs
+      for (Index k1 = 0; k1 < actual_kc; k1 += SmallPanelWidth) {
+        Index actualPanelWidth = std::min<Index>(actual_kc - k1, SmallPanelWidth);
+        // tr solve
         {
-          Index actualPanelWidth = std::min<Index>(actual_kc-k1, SmallPanelWidth);
-          // tr solve
-          for (Index k=0; k<actualPanelWidth; ++k)
-          {
-            // TODO write a small kernel handling this (can be shared with trsv)
-            Index i  = IsLower ? k2+k1+k : k2-k1-k-1;
-            Index s  = IsLower ? k2+k1 : i+1;
-            Index rs = actualPanelWidth - k - 1; // remaining size
-
-            Scalar a = (Mode & UnitDiag) ? Scalar(1) : Scalar(1)/conj(tri(i,i));
-            for (Index j=j2; j<j2+actual_cols; ++j)
-            {
-              if (TriStorageOrder==RowMajor)
-              {
-                Scalar b(0);
-                const Scalar* l = &tri(i,s);
-                Scalar* r = &other(s,j);
-                for (Index i3=0; i3<k; ++i3)
-                  b += conj(l[i3]) * r[i3];
-
-                other(i,j) = (other(i,j) - b)*a;
-              }
-              else
-              {
-                Index s = IsLower ? i+1 : i-rs;
-                Scalar b = (other(i,j) *= a);
-                Scalar* r = &other(s,j);
-                const Scalar* l = &tri(s,i);
-                for (Index i3=0;i3<rs;++i3)
-                  r[i3] -= b * conj(l[i3]);
-              }
-            }
+          Index i = IsLower ? k2 + k1 : k2 - k1;
+#if defined(EIGEN_VECTORIZE_AVX512) && EIGEN_USE_AVX512_TRSM_L_KERNELS
+          EIGEN_IF_CONSTEXPR(
+              (OtherInnerStride == 1 && (std::is_same<Scalar, float>::value || std::is_same<Scalar, double>::value))) {
+            i = IsLower ? k2 + k1 : k2 - k1 - actualPanelWidth;
           }
+#endif
+          trsmKernelL<Scalar, Index, Mode, Conjugate, TriStorageOrder, OtherInnerStride, /*Specialized=*/true>::kernel(
+              actualPanelWidth, actual_cols, _tri + i + (i)*triStride, triStride,
+              _other + i * OtherInnerStride + j2 * otherStride, otherIncr, otherStride);
+        }
 
-          Index lengthTarget = actual_kc-k1-actualPanelWidth;
-          Index startBlock   = IsLower ? k2+k1 : k2-k1-actualPanelWidth;
-          Index blockBOffset = IsLower ? k1 : lengthTarget;
+        Index lengthTarget = actual_kc - k1 - actualPanelWidth;
+        Index startBlock = IsLower ? k2 + k1 : k2 - k1 - actualPanelWidth;
+        Index blockBOffset = IsLower ? k1 : lengthTarget;
 
-          // update the respective rows of B from other
-          pack_rhs(blockB+actual_kc*j2, &other(startBlock,j2), otherStride, actualPanelWidth, actual_cols, actual_kc, blockBOffset);
+        // update the respective rows of B from other
+        pack_rhs(blockB + actual_kc * j2, other.getSubMapper(startBlock, j2), actualPanelWidth, actual_cols, actual_kc,
+                 blockBOffset);
 
-          // GEBP
-          if (lengthTarget>0)
-          {
-            Index startTarget  = IsLower ? k2+k1+actualPanelWidth : k2-actual_kc;
+        // GEBP
+        if (lengthTarget > 0) {
+          Index startTarget = IsLower ? k2 + k1 + actualPanelWidth : k2 - actual_kc;
 
-            pack_lhs(blockA, &tri(startTarget,startBlock), triStride, actualPanelWidth, lengthTarget);
+          pack_lhs(blockA, tri.getSubMapper(startTarget, startBlock), actualPanelWidth, lengthTarget);
 
-            gebp_kernel(&other(startTarget,j2), otherStride, blockA, blockB+actual_kc*j2, lengthTarget, actualPanelWidth, actual_cols, Scalar(-1),
-                        actualPanelWidth, actual_kc, 0, blockBOffset, blockW);
-          }
+          gebp_kernel(other.getSubMapper(startTarget, j2), blockA, blockB + actual_kc * j2, lengthTarget,
+                      actualPanelWidth, actual_cols, Scalar(-1), actualPanelWidth, actual_kc, 0, blockBOffset);
         }
       }
-      
-      // R2 -= A21 * B => GEPP
-      {
-        Index start = IsLower ? k2+kc : 0;
-        Index end   = IsLower ? size : k2-kc;
-        for(Index i2=start; i2<end; i2+=mc)
-        {
-          const Index actual_mc = (std::min)(mc,end-i2);
-          if (actual_mc>0)
-          {
-            pack_lhs(blockA, &tri(i2, IsLower ? k2 : k2-kc), triStride, actual_kc, actual_mc);
+    }
 
-            gebp_kernel(_other+i2, otherStride, blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0, blockW);
-          }
+    // R2 -= A21 * B => GEPP
+    {
+      Index start = IsLower ? k2 + kc : 0;
+      Index end = IsLower ? size : k2 - kc;
+      for (Index i2 = start; i2 < end; i2 += mc) {
+        const Index actual_mc = (std::min)(mc, end - i2);
+        if (actual_mc > 0) {
+          pack_lhs(blockA, tri.getSubMapper(i2, IsLower ? k2 : k2 - kc), actual_kc, actual_mc);
+
+          gebp_kernel(other.getSubMapper(i2, 0), blockA, blockB, actual_mc, actual_kc, cols, Scalar(-1), -1, -1, 0, 0);
         }
       }
     }
   }
+}
 
-/* Optimized triangular solver with multiple left hand sides and the trinagular matrix on the right
+/* Optimized triangular solver with multiple left hand sides and the triangular matrix on the right
  */
-template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
-struct triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>
-{
-  static EIGEN_DONT_INLINE void run(
-    Index size, Index otherSize,
-    const Scalar* _tri, Index triStride,
-    Scalar* _other, Index otherStride,
-    level3_blocking<Scalar,Scalar>& blocking);
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+struct triangular_solve_matrix<Scalar, Index, OnTheRight, Mode, Conjugate, TriStorageOrder, ColMajor,
+                               OtherInnerStride> {
+  static EIGEN_DONT_INLINE void run(Index size, Index otherSize, const Scalar* _tri, Index triStride, Scalar* _other,
+                                    Index otherIncr, Index otherStride, level3_blocking<Scalar, Scalar>& blocking);
 };
-template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder>
-EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor>::run(
-    Index size, Index otherSize,
-    const Scalar* _tri, Index triStride,
-    Scalar* _other, Index otherStride,
-    level3_blocking<Scalar,Scalar>& blocking)
-  {
-    Index rows = otherSize;
-    const_blas_data_mapper<Scalar, Index, TriStorageOrder> rhs(_tri,triStride);
-    blas_data_mapper<Scalar, Index, ColMajor> lhs(_other,otherStride);
-
-    typedef gebp_traits<Scalar,Scalar> Traits;
-    enum {
-      RhsStorageOrder   = TriStorageOrder,
-      SmallPanelWidth   = EIGEN_PLAIN_ENUM_MAX(Traits::mr,Traits::nr),
-      IsLower = (Mode&Lower) == Lower
-    };
-
-    Index kc = blocking.kc();                   // cache block size along the K direction
-    Index mc = (std::min)(rows,blocking.mc());  // cache block size along the M direction
-
-    std::size_t sizeA = kc*mc;
-    std::size_t sizeB = kc*size;
-    std::size_t sizeW = kc*Traits::WorkSpaceFactor;
-
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
-    ei_declare_aligned_stack_constructed_variable(Scalar, blockW, sizeW, blocking.blockW());
-
-    conj_if<Conjugate> conj;
-    gebp_kernel<Scalar,Scalar, Index, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder> pack_rhs;
-    gemm_pack_rhs<Scalar, Index, Traits::nr,RhsStorageOrder,false,true> pack_rhs_panel;
-    gemm_pack_lhs<Scalar, Index, Traits::mr, Traits::LhsProgress, ColMajor, false, true> pack_lhs_panel;
-
-    for(Index k2=IsLower ? size : 0;
-        IsLower ? k2>0 : k2<size;
-        IsLower ? k2-=kc : k2+=kc)
-    {
-      const Index actual_kc = (std::min)(IsLower ? k2 : size-k2, kc);
-      Index actual_k2 = IsLower ? k2-actual_kc : k2 ;
 
-      Index startPanel = IsLower ? 0 : k2+actual_kc;
-      Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
-      Scalar* geb = blockB+actual_kc*actual_kc;
+template <typename Scalar, typename Index, int Mode, bool Conjugate, int TriStorageOrder, int OtherInnerStride>
+EIGEN_DONT_INLINE void triangular_solve_matrix<Scalar, Index, OnTheRight, Mode, Conjugate, TriStorageOrder, ColMajor,
+                                               OtherInnerStride>::run(Index size, Index otherSize, const Scalar* _tri,
+                                                                      Index triStride, Scalar* _other, Index otherIncr,
+                                                                      Index otherStride,
+                                                                      level3_blocking<Scalar, Scalar>& blocking) {
+  Index rows = otherSize;
+
+#if defined(EIGEN_VECTORIZE_AVX512) && EIGEN_USE_AVX512_TRSM_R_KERNELS && EIGEN_ENABLE_AVX512_NOCOPY_TRSM_R_CUTOFFS
+  EIGEN_IF_CONSTEXPR(
+      (OtherInnerStride == 1 && (std::is_same<Scalar, float>::value || std::is_same<Scalar, double>::value))) {
+    // TODO: Investigate better heuristics for cutoffs.
+    std::ptrdiff_t l1, l2, l3;
+    manage_caching_sizes(GetAction, &l1, &l2, &l3);
+    double L2Cap = 0.5;  // 50% of L2 size
+    if (size < avx512_trsm_cutoff<Scalar>(l2, rows, L2Cap)) {
+      trsmKernelR<Scalar, Index, Mode, Conjugate, TriStorageOrder, OtherInnerStride, /*Specialized=*/true>::kernel(
+          size, rows, _tri, triStride, _other, 1, otherStride);
+      return;
+    }
+  }
+#endif
 
-      if (rs>0) pack_rhs(geb, &rhs(actual_k2,startPanel), triStride, actual_kc, rs);
+  typedef blas_data_mapper<Scalar, Index, ColMajor, Unaligned, OtherInnerStride> LhsMapper;
+  typedef const_blas_data_mapper<Scalar, Index, TriStorageOrder> RhsMapper;
+  LhsMapper lhs(_other, otherStride, otherIncr);
+  RhsMapper rhs(_tri, triStride);
 
-      // triangular packing (we only pack the panels off the diagonal,
-      // neglecting the blocks overlapping the diagonal
-      {
-        for (Index j2=0; j2<actual_kc; j2+=SmallPanelWidth)
-        {
-          Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
-          Index actual_j2 = actual_k2 + j2;
-          Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
-          Index panelLength = IsLower ? actual_kc-j2-actualPanelWidth : j2;
-
-          if (panelLength>0)
-          pack_rhs_panel(blockB+j2*actual_kc,
-                         &rhs(actual_k2+panelOffset, actual_j2), triStride,
-                         panelLength, actualPanelWidth,
-                         actual_kc, panelOffset);
-        }
+  typedef gebp_traits<Scalar, Scalar> Traits;
+  enum {
+    RhsStorageOrder = TriStorageOrder,
+    SmallPanelWidth = plain_enum_max(Traits::mr, Traits::nr),
+    IsLower = (Mode & Lower) == Lower
+  };
+
+  Index kc = blocking.kc();                    // cache block size along the K direction
+  Index mc = (std::min)(rows, blocking.mc());  // cache block size along the M direction
+
+  std::size_t sizeA = kc * mc;
+  std::size_t sizeB = kc * size;
+
+  ei_declare_aligned_stack_constructed_variable(Scalar, blockA, sizeA, blocking.blockA());
+  ei_declare_aligned_stack_constructed_variable(Scalar, blockB, sizeB, blocking.blockB());
+
+  gebp_kernel<Scalar, Scalar, Index, LhsMapper, Traits::mr, Traits::nr, false, Conjugate> gebp_kernel;
+  gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder> pack_rhs;
+  gemm_pack_rhs<Scalar, Index, RhsMapper, Traits::nr, RhsStorageOrder, false, true> pack_rhs_panel;
+  gemm_pack_lhs<Scalar, Index, LhsMapper, Traits::mr, Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor,
+                false, true>
+      pack_lhs_panel;
+
+  for (Index k2 = IsLower ? size : 0; IsLower ? k2 > 0 : k2 < size; IsLower ? k2 -= kc : k2 += kc) {
+    const Index actual_kc = (std::min)(IsLower ? k2 : size - k2, kc);
+    Index actual_k2 = IsLower ? k2 - actual_kc : k2;
+
+    Index startPanel = IsLower ? 0 : k2 + actual_kc;
+    Index rs = IsLower ? actual_k2 : size - actual_k2 - actual_kc;
+    Scalar* geb = blockB + actual_kc * actual_kc;
+
+    if (rs > 0) pack_rhs(geb, rhs.getSubMapper(actual_k2, startPanel), actual_kc, rs);
+
+    // triangular packing (we only pack the panels off the diagonal,
+    // neglecting the blocks overlapping the diagonal
+    {
+      for (Index j2 = 0; j2 < actual_kc; j2 += SmallPanelWidth) {
+        Index actualPanelWidth = std::min<Index>(actual_kc - j2, SmallPanelWidth);
+        Index actual_j2 = actual_k2 + j2;
+        Index panelOffset = IsLower ? j2 + actualPanelWidth : 0;
+        Index panelLength = IsLower ? actual_kc - j2 - actualPanelWidth : j2;
+
+        if (panelLength > 0)
+          pack_rhs_panel(blockB + j2 * actual_kc, rhs.getSubMapper(actual_k2 + panelOffset, actual_j2), panelLength,
+                         actualPanelWidth, actual_kc, panelOffset);
       }
+    }
 
-      for(Index i2=0; i2<rows; i2+=mc)
+    for (Index i2 = 0; i2 < rows; i2 += mc) {
+      const Index actual_mc = (std::min)(mc, rows - i2);
+
+      // triangular solver kernel
       {
-        const Index actual_mc = (std::min)(mc,rows-i2);
+        // for each small block of the diagonal (=> vertical panels of rhs)
+        for (Index j2 = IsLower ? (actual_kc - ((actual_kc % SmallPanelWidth) ? Index(actual_kc % SmallPanelWidth)
+                                                                              : Index(SmallPanelWidth)))
+                                : 0;
+             IsLower ? j2 >= 0 : j2 < actual_kc; IsLower ? j2 -= SmallPanelWidth : j2 += SmallPanelWidth) {
+          Index actualPanelWidth = std::min<Index>(actual_kc - j2, SmallPanelWidth);
+          Index absolute_j2 = actual_k2 + j2;
+          Index panelOffset = IsLower ? j2 + actualPanelWidth : 0;
+          Index panelLength = IsLower ? actual_kc - j2 - actualPanelWidth : j2;
 
-        // triangular solver kernel
-        {
-          // for each small block of the diagonal (=> vertical panels of rhs)
-          for (Index j2 = IsLower
-                      ? (actual_kc - ((actual_kc%SmallPanelWidth) ? Index(actual_kc%SmallPanelWidth)
-                                                                  : Index(SmallPanelWidth)))
-                      : 0;
-               IsLower ? j2>=0 : j2<actual_kc;
-               IsLower ? j2-=SmallPanelWidth : j2+=SmallPanelWidth)
-          {
-            Index actualPanelWidth = std::min<Index>(actual_kc-j2, SmallPanelWidth);
-            Index absolute_j2 = actual_k2 + j2;
-            Index panelOffset = IsLower ? j2+actualPanelWidth : 0;
-            Index panelLength = IsLower ? actual_kc - j2 - actualPanelWidth : j2;
-
-            // GEBP
-            if(panelLength>0)
-            {
-              gebp_kernel(&lhs(i2,absolute_j2), otherStride,
-                          blockA, blockB+j2*actual_kc,
-                          actual_mc, panelLength, actualPanelWidth,
-                          Scalar(-1),
-                          actual_kc, actual_kc, // strides
-                          panelOffset, panelOffset, // offsets
-                          blockW);  // workspace
-            }
+          // GEBP
+          if (panelLength > 0) {
+            gebp_kernel(lhs.getSubMapper(i2, absolute_j2), blockA, blockB + j2 * actual_kc, actual_mc, panelLength,
+                        actualPanelWidth, Scalar(-1), actual_kc, actual_kc,  // strides
+                        panelOffset, panelOffset);                           // offsets
+          }
 
+          {
             // unblocked triangular solve
-            for (Index k=0; k<actualPanelWidth; ++k)
-            {
-              Index j = IsLower ? absolute_j2+actualPanelWidth-k-1 : absolute_j2+k;
-
-              Scalar* r = &lhs(i2,j);
-              for (Index k3=0; k3<k; ++k3)
-              {
-                Scalar b = conj(rhs(IsLower ? j+1+k3 : absolute_j2+k3,j));
-                Scalar* a = &lhs(i2,IsLower ? j+1+k3 : absolute_j2+k3);
-                for (Index i=0; i<actual_mc; ++i)
-                  r[i] -= a[i] * b;
-              }
-              if((Mode & UnitDiag)==0)
-              {
-                Scalar b = conj(rhs(j,j));
-                for (Index i=0; i<actual_mc; ++i)
-                  r[i] /= b;
-              }
-            }
-
-            // pack the just computed part of lhs to A
-            pack_lhs_panel(blockA, _other+absolute_j2*otherStride+i2, otherStride,
-                           actualPanelWidth, actual_mc,
-                           actual_kc, j2);
+            trsmKernelR<Scalar, Index, Mode, Conjugate, TriStorageOrder, OtherInnerStride,
+                        /*Specialized=*/true>::kernel(actualPanelWidth, actual_mc,
+                                                      _tri + absolute_j2 + absolute_j2 * triStride, triStride,
+                                                      _other + i2 * OtherInnerStride + absolute_j2 * otherStride,
+                                                      otherIncr, otherStride);
           }
+          // pack the just computed part of lhs to A
+          pack_lhs_panel(blockA, lhs.getSubMapper(i2, absolute_j2), actualPanelWidth, actual_mc, actual_kc, j2);
         }
-
-        if (rs>0)
-          gebp_kernel(_other+i2+startPanel*otherStride, otherStride, blockA, geb,
-                      actual_mc, actual_kc, rs, Scalar(-1),
-                      -1, -1, 0, 0, blockW);
       }
+
+      if (rs > 0)
+        gebp_kernel(lhs.getSubMapper(i2, startPanel), blockA, geb, actual_mc, actual_kc, rs, Scalar(-1), -1, -1, 0, 0);
     }
   }
+}
+}  // end namespace internal
 
-} // end namespace internal
-
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_H
+#endif  // EIGEN_TRIANGULAR_SOLVER_MATRIX_H
diff --git a/inst/include/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h b/inst/include/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
new file mode 100644
index 00000000..9cc15fbd
--- /dev/null
+++ b/inst/include/Eigen/src/Core/products/TriangularSolverMatrix_BLAS.h
@@ -0,0 +1,166 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to BLAS F77
+ *   Triangular matrix * matrix product functionality based on ?TRMM.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
+#define EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+// implements LeftSide op(triangular)^-1 * general
+#define EIGEN_BLAS_TRSM_L(EIGTYPE, BLASTYPE, BLASFUNC)                                                              \
+  template <typename Index, int Mode, bool Conjugate, int TriStorageOrder>                                          \
+  struct triangular_solve_matrix<EIGTYPE, Index, OnTheLeft, Mode, Conjugate, TriStorageOrder, ColMajor, 1> {        \
+    enum {                                                                                                          \
+      IsLower = (Mode & Lower) == Lower,                                                                            \
+      IsUnitDiag = (Mode & UnitDiag) ? 1 : 0,                                                                       \
+      IsZeroDiag = (Mode & ZeroDiag) ? 1 : 0,                                                                       \
+      conjA = ((TriStorageOrder == ColMajor) && Conjugate) ? 1 : 0                                                  \
+    };                                                                                                              \
+    static void run(Index size, Index otherSize, const EIGTYPE* _tri, Index triStride, EIGTYPE* _other,             \
+                    Index otherIncr, Index otherStride, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) {          \
+      if (size == 0 || otherSize == 0) return;                                                                      \
+      EIGEN_ONLY_USED_FOR_DEBUG(otherIncr);                                                                         \
+      eigen_assert(otherIncr == 1);                                                                                 \
+      BlasIndex m = convert_index<BlasIndex>(size), n = convert_index<BlasIndex>(otherSize), lda, ldb;              \
+      char side = 'L', uplo, diag = 'N', transa;                                                                    \
+      /* Set alpha_ */                                                                                              \
+      EIGTYPE alpha(1);                                                                                             \
+      ldb = convert_index<BlasIndex>(otherStride);                                                                  \
+                                                                                                                    \
+      const EIGTYPE* a;                                                                                             \
+      /* Set trans */                                                                                               \
+      transa = (TriStorageOrder == RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N';                                     \
+      /* Set uplo */                                                                                                \
+      uplo = IsLower ? 'L' : 'U';                                                                                   \
+      if (TriStorageOrder == RowMajor) uplo = (uplo == 'L') ? 'U' : 'L';                                            \
+      /* Set a, lda */                                                                                              \
+      typedef Matrix<EIGTYPE, Dynamic, Dynamic, TriStorageOrder> MatrixTri;                                         \
+      Map<const MatrixTri, 0, OuterStride<> > tri(_tri, size, size, OuterStride<>(triStride));                      \
+      MatrixTri a_tmp;                                                                                              \
+                                                                                                                    \
+      if (conjA) {                                                                                                  \
+        a_tmp = tri.conjugate();                                                                                    \
+        a = a_tmp.data();                                                                                           \
+        lda = convert_index<BlasIndex>(a_tmp.outerStride());                                                        \
+      } else {                                                                                                      \
+        a = _tri;                                                                                                   \
+        lda = convert_index<BlasIndex>(triStride);                                                                  \
+      }                                                                                                             \
+      if (IsUnitDiag) diag = 'U';                                                                                   \
+      /* call ?trsm*/                                                                                               \
+      BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, \
+               &lda, (BLASTYPE*)_other, &ldb);                                                                      \
+    }                                                                                                               \
+  };
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRSM_L(double, double, dtrsm)
+EIGEN_BLAS_TRSM_L(dcomplex, MKL_Complex16, ztrsm)
+EIGEN_BLAS_TRSM_L(float, float, strsm)
+EIGEN_BLAS_TRSM_L(scomplex, MKL_Complex8, ctrsm)
+#else
+EIGEN_BLAS_TRSM_L(double, double, dtrsm_)
+EIGEN_BLAS_TRSM_L(dcomplex, double, ztrsm_)
+EIGEN_BLAS_TRSM_L(float, float, strsm_)
+EIGEN_BLAS_TRSM_L(scomplex, float, ctrsm_)
+#endif
+
+// implements RightSide general * op(triangular)^-1
+#define EIGEN_BLAS_TRSM_R(EIGTYPE, BLASTYPE, BLASFUNC)                                                              \
+  template <typename Index, int Mode, bool Conjugate, int TriStorageOrder>                                          \
+  struct triangular_solve_matrix<EIGTYPE, Index, OnTheRight, Mode, Conjugate, TriStorageOrder, ColMajor, 1> {       \
+    enum {                                                                                                          \
+      IsLower = (Mode & Lower) == Lower,                                                                            \
+      IsUnitDiag = (Mode & UnitDiag) ? 1 : 0,                                                                       \
+      IsZeroDiag = (Mode & ZeroDiag) ? 1 : 0,                                                                       \
+      conjA = ((TriStorageOrder == ColMajor) && Conjugate) ? 1 : 0                                                  \
+    };                                                                                                              \
+    static void run(Index size, Index otherSize, const EIGTYPE* _tri, Index triStride, EIGTYPE* _other,             \
+                    Index otherIncr, Index otherStride, level3_blocking<EIGTYPE, EIGTYPE>& /*blocking*/) {          \
+      if (size == 0 || otherSize == 0) return;                                                                      \
+      EIGEN_ONLY_USED_FOR_DEBUG(otherIncr);                                                                         \
+      eigen_assert(otherIncr == 1);                                                                                 \
+      BlasIndex m = convert_index<BlasIndex>(otherSize), n = convert_index<BlasIndex>(size), lda, ldb;              \
+      char side = 'R', uplo, diag = 'N', transa;                                                                    \
+      /* Set alpha_ */                                                                                              \
+      EIGTYPE alpha(1);                                                                                             \
+      ldb = convert_index<BlasIndex>(otherStride);                                                                  \
+                                                                                                                    \
+      const EIGTYPE* a;                                                                                             \
+      /* Set trans */                                                                                               \
+      transa = (TriStorageOrder == RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N';                                     \
+      /* Set uplo */                                                                                                \
+      uplo = IsLower ? 'L' : 'U';                                                                                   \
+      if (TriStorageOrder == RowMajor) uplo = (uplo == 'L') ? 'U' : 'L';                                            \
+      /* Set a, lda */                                                                                              \
+      typedef Matrix<EIGTYPE, Dynamic, Dynamic, TriStorageOrder> MatrixTri;                                         \
+      Map<const MatrixTri, 0, OuterStride<> > tri(_tri, size, size, OuterStride<>(triStride));                      \
+      MatrixTri a_tmp;                                                                                              \
+                                                                                                                    \
+      if (conjA) {                                                                                                  \
+        a_tmp = tri.conjugate();                                                                                    \
+        a = a_tmp.data();                                                                                           \
+        lda = convert_index<BlasIndex>(a_tmp.outerStride());                                                        \
+      } else {                                                                                                      \
+        a = _tri;                                                                                                   \
+        lda = convert_index<BlasIndex>(triStride);                                                                  \
+      }                                                                                                             \
+      if (IsUnitDiag) diag = 'U';                                                                                   \
+      /* call ?trsm*/                                                                                               \
+      BLASFUNC(&side, &uplo, &transa, &diag, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, \
+               &lda, (BLASTYPE*)_other, &ldb);                                                                      \
+      /*std::cout << "TRMS_L specialization!\n";*/                                                                  \
+    }                                                                                                               \
+  };
+
+#ifdef EIGEN_USE_MKL
+EIGEN_BLAS_TRSM_R(double, double, dtrsm)
+EIGEN_BLAS_TRSM_R(dcomplex, MKL_Complex16, ztrsm)
+EIGEN_BLAS_TRSM_R(float, float, strsm)
+EIGEN_BLAS_TRSM_R(scomplex, MKL_Complex8, ctrsm)
+#else
+EIGEN_BLAS_TRSM_R(double, double, dtrsm_)
+EIGEN_BLAS_TRSM_R(dcomplex, double, ztrsm_)
+EIGEN_BLAS_TRSM_R(float, float, strsm_)
+EIGEN_BLAS_TRSM_R(scomplex, float, ctrsm_)
+#endif
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_TRIANGULAR_SOLVER_MATRIX_BLAS_H
diff --git a/inst/include/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h b/inst/include/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h
deleted file mode 100644
index 6a0bb833..00000000
--- a/inst/include/Eigen/src/Core/products/TriangularSolverMatrix_MKL.h
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- Copyright (c) 2011, Intel Corporation. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
- * Neither the name of Intel Corporation nor the names of its contributors may
-   be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
- *   Triangular matrix * matrix product functionality based on ?TRMM.
- ********************************************************************************
-*/
-
-#ifndef EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
-#define EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
-
-namespace Eigen {
-
-namespace internal {
-
-// implements LeftSide op(triangular)^-1 * general
-#define EIGEN_MKL_TRSM_L(EIGTYPE, MKLTYPE, MKLPREFIX) \
-template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
-struct triangular_solve_matrix<EIGTYPE,Index,OnTheLeft,Mode,Conjugate,TriStorageOrder,ColMajor> \
-{ \
-  enum { \
-    IsLower = (Mode&Lower) == Lower, \
-    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
-    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
-    conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 1 : 0 \
-  }; \
-  static void run( \
-      Index size, Index otherSize, \
-      const EIGTYPE* _tri, Index triStride, \
-      EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
-  { \
-   MKL_INT m = size, n = otherSize, lda, ldb; \
-   char side = 'L', uplo, diag='N', transa; \
-   /* Set alpha_ */ \
-   MKLTYPE alpha; \
-   EIGTYPE myone(1); \
-   assign_scalar_eig2mkl(alpha, myone); \
-   ldb = otherStride;\
-\
-   const EIGTYPE *a; \
-/* Set trans */ \
-   transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \
-/* Set uplo */ \
-   uplo = IsLower ? 'L' : 'U'; \
-   if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
-/* Set a, lda */ \
-   typedef Matrix<EIGTYPE, Dynamic, Dynamic, TriStorageOrder> MatrixTri; \
-   Map<const MatrixTri, 0, OuterStride<> > tri(_tri,size,size,OuterStride<>(triStride)); \
-   MatrixTri a_tmp; \
-\
-   if (conjA) { \
-     a_tmp = tri.conjugate(); \
-     a = a_tmp.data(); \
-     lda = a_tmp.outerStride(); \
-   } else { \
-     a = _tri; \
-     lda = triStride; \
-   } \
-   if (IsUnitDiag) diag='U'; \
-/* call ?trsm*/ \
-   MKLPREFIX##trsm(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \
- } \
-};
-
-EIGEN_MKL_TRSM_L(double, double, d)
-EIGEN_MKL_TRSM_L(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_TRSM_L(float, float, s)
-EIGEN_MKL_TRSM_L(scomplex, MKL_Complex8, c)
-
-
-// implements RightSide general * op(triangular)^-1
-#define EIGEN_MKL_TRSM_R(EIGTYPE, MKLTYPE, MKLPREFIX) \
-template <typename Index, int Mode, bool Conjugate, int TriStorageOrder> \
-struct triangular_solve_matrix<EIGTYPE,Index,OnTheRight,Mode,Conjugate,TriStorageOrder,ColMajor> \
-{ \
-  enum { \
-    IsLower = (Mode&Lower) == Lower, \
-    IsUnitDiag  = (Mode&UnitDiag) ? 1 : 0, \
-    IsZeroDiag  = (Mode&ZeroDiag) ? 1 : 0, \
-    conjA = ((TriStorageOrder==ColMajor) && Conjugate) ? 1 : 0 \
-  }; \
-  static void run( \
-      Index size, Index otherSize, \
-      const EIGTYPE* _tri, Index triStride, \
-      EIGTYPE* _other, Index otherStride, level3_blocking<EIGTYPE,EIGTYPE>& /*blocking*/) \
-  { \
-   MKL_INT m = otherSize, n = size, lda, ldb; \
-   char side = 'R', uplo, diag='N', transa; \
-   /* Set alpha_ */ \
-   MKLTYPE alpha; \
-   EIGTYPE myone(1); \
-   assign_scalar_eig2mkl(alpha, myone); \
-   ldb = otherStride;\
-\
-   const EIGTYPE *a; \
-/* Set trans */ \
-   transa = (TriStorageOrder==RowMajor) ? ((Conjugate) ? 'C' : 'T') : 'N'; \
-/* Set uplo */ \
-   uplo = IsLower ? 'L' : 'U'; \
-   if (TriStorageOrder==RowMajor) uplo = (uplo == 'L') ? 'U' : 'L'; \
-/* Set a, lda */ \
-   typedef Matrix<EIGTYPE, Dynamic, Dynamic, TriStorageOrder> MatrixTri; \
-   Map<const MatrixTri, 0, OuterStride<> > tri(_tri,size,size,OuterStride<>(triStride)); \
-   MatrixTri a_tmp; \
-\
-   if (conjA) { \
-     a_tmp = tri.conjugate(); \
-     a = a_tmp.data(); \
-     lda = a_tmp.outerStride(); \
-   } else { \
-     a = _tri; \
-     lda = triStride; \
-   } \
-   if (IsUnitDiag) diag='U'; \
-/* call ?trsm*/ \
-   MKLPREFIX##trsm(&side, &uplo, &transa, &diag, &m, &n, &alpha, (const MKLTYPE*)a, &lda, (MKLTYPE*)_other, &ldb); \
-   /*std::cout << "TRMS_L specialization!\n";*/ \
- } \
-};
-
-EIGEN_MKL_TRSM_R(double, double, d)
-EIGEN_MKL_TRSM_R(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_TRSM_R(float, float, s)
-EIGEN_MKL_TRSM_R(scomplex, MKL_Complex8, c)
-
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_TRIANGULAR_SOLVER_MATRIX_MKL_H
diff --git a/inst/include/Eigen/src/Core/products/TriangularSolverVector.h b/inst/include/Eigen/src/Core/products/TriangularSolverVector.h
index ce4d1008..ff7c43f7 100644
--- a/inst/include/Eigen/src/Core/products/TriangularSolverVector.h
+++ b/inst/include/Eigen/src/Core/products/TriangularSolverVector.h
@@ -10,130 +10,113 @@
 #ifndef EIGEN_TRIANGULAR_SOLVER_VECTOR_H
 #define EIGEN_TRIANGULAR_SOLVER_VECTOR_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
-template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate, int StorageOrder>
-struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Conjugate, StorageOrder>
-{
-  static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs)
-  {
-    triangular_solve_vector<LhsScalar,RhsScalar,Index,OnTheLeft,
-        ((Mode&Upper)==Upper ? Lower : Upper) | (Mode&UnitDiag),
-        Conjugate,StorageOrder==RowMajor?ColMajor:RowMajor
-      >::run(size, _lhs, lhsStride, rhs);
+template <typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate, int StorageOrder>
+struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheRight, Mode, Conjugate, StorageOrder> {
+  static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs) {
+    triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft,
+                            ((Mode & Upper) == Upper ? Lower : Upper) | (Mode & UnitDiag), Conjugate,
+                            StorageOrder == RowMajor ? ColMajor : RowMajor>::run(size, _lhs, lhsStride, rhs);
   }
 };
-    
+
 // forward and backward substitution, row-major, rhs is a vector
-template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
-struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor>
-{
-  enum {
-    IsLower = ((Mode&Lower)==Lower)
-  };
-  static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs)
-  {
-    typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,RowMajor>, 0, OuterStride<> > LhsMap;
-    const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
-    typename internal::conditional<
-                          Conjugate,
-                          const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
-                          const LhsMap&>
-                        ::type cjLhs(lhs);
+template <typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
+struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, RowMajor> {
+  enum { IsLower = ((Mode & Lower) == Lower) };
+  static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs) {
+    typedef Map<const Matrix<LhsScalar, Dynamic, Dynamic, RowMajor>, 0, OuterStride<> > LhsMap;
+    const LhsMap lhs(_lhs, size, size, OuterStride<>(lhsStride));
+
+    typedef const_blas_data_mapper<LhsScalar, Index, RowMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar, Index, ColMajor> RhsMapper;
+
+    std::conditional_t<Conjugate, const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>, LhsMap>,
+                       const LhsMap&>
+        cjLhs(lhs);
     static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
-    for(Index pi=IsLower ? 0 : size;
-        IsLower ? pi<size : pi>0;
-        IsLower ? pi+=PanelWidth : pi-=PanelWidth)
-    {
+    for (Index pi = IsLower ? 0 : size; IsLower ? pi < size : pi > 0; IsLower ? pi += PanelWidth : pi -= PanelWidth) {
       Index actualPanelWidth = (std::min)(IsLower ? size - pi : pi, PanelWidth);
 
-      Index r = IsLower ? pi : size - pi; // remaining size
-      if (r > 0)
-      {
+      Index r = IsLower ? pi : size - pi;  // remaining size
+      if (r > 0) {
         // let's directly call the low level product function because:
         // 1 - it is faster to compile
-        // 2 - it is slighlty faster at runtime
-        Index startRow = IsLower ? pi : pi-actualPanelWidth;
+        // 2 - it is slightly faster at runtime
+        Index startRow = IsLower ? pi : pi - actualPanelWidth;
         Index startCol = IsLower ? 0 : pi;
 
-        general_matrix_vector_product<Index,LhsScalar,RowMajor,Conjugate,RhsScalar,false>::run(
-          actualPanelWidth, r,
-          &lhs.coeffRef(startRow,startCol), lhsStride,
-          rhs + startCol, 1,
-          rhs + startRow, 1,
-          RhsScalar(-1));
+        general_matrix_vector_product<Index, LhsScalar, LhsMapper, RowMajor, Conjugate, RhsScalar, RhsMapper,
+                                      false>::run(actualPanelWidth, r,
+                                                  LhsMapper(&lhs.coeffRef(startRow, startCol), lhsStride),
+                                                  RhsMapper(rhs + startCol, 1), rhs + startRow, 1, RhsScalar(-1));
       }
 
-      for(Index k=0; k<actualPanelWidth; ++k)
-      {
-        Index i = IsLower ? pi+k : pi-k-1;
-        Index s = IsLower ? pi   : i+1;
-        if (k>0)
-          rhs[i] -= (cjLhs.row(i).segment(s,k).transpose().cwiseProduct(Map<const Matrix<RhsScalar,Dynamic,1> >(rhs+s,k))).sum();
-        
-        if(!(Mode & UnitDiag))
-          rhs[i] /= cjLhs(i,i);
+      for (Index k = 0; k < actualPanelWidth; ++k) {
+        Index i = IsLower ? pi + k : pi - k - 1;
+        Index s = IsLower ? pi : i + 1;
+        if (k > 0)
+          rhs[i] -= (cjLhs.row(i).segment(s, k).transpose().cwiseProduct(
+                         Map<const Matrix<RhsScalar, Dynamic, 1> >(rhs + s, k)))
+                        .sum();
+
+        if ((!(Mode & UnitDiag)) && !is_identically_zero(rhs[i])) rhs[i] /= cjLhs(i, i);
       }
     }
   }
 };
 
 // forward and backward substitution, column-major, rhs is a vector
-template<typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
-struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, ColMajor>
-{
-  enum {
-    IsLower = ((Mode&Lower)==Lower)
-  };
-  static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs)
-  {
-    typedef Map<const Matrix<LhsScalar,Dynamic,Dynamic,ColMajor>, 0, OuterStride<> > LhsMap;
-    const LhsMap lhs(_lhs,size,size,OuterStride<>(lhsStride));
-    typename internal::conditional<Conjugate,
-                                   const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>,LhsMap>,
-                                   const LhsMap&
-                                  >::type cjLhs(lhs);
+template <typename LhsScalar, typename RhsScalar, typename Index, int Mode, bool Conjugate>
+struct triangular_solve_vector<LhsScalar, RhsScalar, Index, OnTheLeft, Mode, Conjugate, ColMajor> {
+  enum { IsLower = ((Mode & Lower) == Lower) };
+  static void run(Index size, const LhsScalar* _lhs, Index lhsStride, RhsScalar* rhs) {
+    typedef Map<const Matrix<LhsScalar, Dynamic, Dynamic, ColMajor>, 0, OuterStride<> > LhsMap;
+    const LhsMap lhs(_lhs, size, size, OuterStride<>(lhsStride));
+    typedef const_blas_data_mapper<LhsScalar, Index, ColMajor> LhsMapper;
+    typedef const_blas_data_mapper<RhsScalar, Index, ColMajor> RhsMapper;
+    std::conditional_t<Conjugate, const CwiseUnaryOp<typename internal::scalar_conjugate_op<LhsScalar>, LhsMap>,
+                       const LhsMap&>
+        cjLhs(lhs);
     static const Index PanelWidth = EIGEN_TUNE_TRIANGULAR_PANEL_WIDTH;
 
-    for(Index pi=IsLower ? 0 : size;
-        IsLower ? pi<size : pi>0;
-        IsLower ? pi+=PanelWidth : pi-=PanelWidth)
-    {
+    for (Index pi = IsLower ? 0 : size; IsLower ? pi < size : pi > 0; IsLower ? pi += PanelWidth : pi -= PanelWidth) {
       Index actualPanelWidth = (std::min)(IsLower ? size - pi : pi, PanelWidth);
-      Index startBlock = IsLower ? pi : pi-actualPanelWidth;
+      Index startBlock = IsLower ? pi : pi - actualPanelWidth;
       Index endBlock = IsLower ? pi + actualPanelWidth : 0;
 
-      for(Index k=0; k<actualPanelWidth; ++k)
-      {
-        Index i = IsLower ? pi+k : pi-k-1;
-        if(!(Mode & UnitDiag))
-          rhs[i] /= cjLhs.coeff(i,i);
+      for (Index k = 0; k < actualPanelWidth; ++k) {
+        Index i = IsLower ? pi + k : pi - k - 1;
+        if (!is_identically_zero(rhs[i])) {
+          if (!(Mode & UnitDiag)) rhs[i] /= cjLhs.coeff(i, i);
 
-        Index r = actualPanelWidth - k - 1; // remaining size
-        Index s = IsLower ? i+1 : i-r;
-        if (r>0)
-          Map<Matrix<RhsScalar,Dynamic,1> >(rhs+s,r) -= rhs[i] * cjLhs.col(i).segment(s,r);
+          Index r = actualPanelWidth - k - 1;  // remaining size
+          Index s = IsLower ? i + 1 : i - r;
+          if (r > 0) Map<Matrix<RhsScalar, Dynamic, 1> >(rhs + s, r) -= rhs[i] * cjLhs.col(i).segment(s, r);
+        }
       }
-      Index r = IsLower ? size - endBlock : startBlock; // remaining size
-      if (r > 0)
-      {
+      Index r = IsLower ? size - endBlock : startBlock;  // remaining size
+      if (r > 0) {
         // let's directly call the low level product function because:
         // 1 - it is faster to compile
-        // 2 - it is slighlty faster at runtime
-        general_matrix_vector_product<Index,LhsScalar,ColMajor,Conjugate,RhsScalar,false>::run(
-            r, actualPanelWidth,
-            &lhs.coeffRef(endBlock,startBlock), lhsStride,
-            rhs+startBlock, 1,
-            rhs+endBlock, 1, RhsScalar(-1));
+        // 2 - it is slightly faster at runtime
+        general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, Conjugate, RhsScalar, RhsMapper,
+                                      false>::run(r, actualPanelWidth,
+                                                  LhsMapper(&lhs.coeffRef(endBlock, startBlock), lhsStride),
+                                                  RhsMapper(rhs + startBlock, 1), rhs + endBlock, 1, RhsScalar(-1));
       }
     }
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_TRIANGULAR_SOLVER_VECTOR_H
+#endif  // EIGEN_TRIANGULAR_SOLVER_VECTOR_H
diff --git a/inst/include/Eigen/src/Core/util/Assert.h b/inst/include/Eigen/src/Core/util/Assert.h
new file mode 100644
index 00000000..09a411a0
--- /dev/null
+++ b/inst/include/Eigen/src/Core/util/Assert.h
@@ -0,0 +1,158 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2022, The Eigen authors.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CORE_UTIL_ASSERT_H
+#define EIGEN_CORE_UTIL_ASSERT_H
+
+// Eigen custom assert function.
+//
+// The combination of Eigen's relative includes and cassert's `assert` function
+// (or any usage of the __FILE__ macro) can lead to ODR issues:
+// a header included using different relative paths in two different TUs will
+// have two different token-for-token definitions, since __FILE__ is expanded
+// as an in-line string with different values.  Normally this would be
+// harmless - the linker would just choose one definition. However, it breaks
+// with C++20 modules when functions in different modules have different
+// definitions.
+//
+// To get around this, we need to use __builtin_FILE() when available, which is
+// considered a single token, and thus satisfies the ODR.
+
+// Only define eigen_plain_assert if we are debugging, and either
+//  - we are not compiling for GPU, or
+//  - gpu debugging is enabled.
+#if !defined(EIGEN_NO_DEBUG) && (!defined(EIGEN_GPU_COMPILE_PHASE) || !defined(EIGEN_NO_DEBUG_GPU))
+
+#include <cassert>
+
+#ifndef EIGEN_USE_CUSTOM_PLAIN_ASSERT
+// Disable new custom asserts by default for now.
+#define EIGEN_USE_CUSTOM_PLAIN_ASSERT 0
+#endif
+
+#if EIGEN_USE_CUSTOM_PLAIN_ASSERT
+
+#ifndef EIGEN_HAS_BUILTIN_FILE
+// Clang can check if __builtin_FILE() is supported.
+// GCC > 5, MSVC 2019 14.26 (1926) all have __builtin_FILE().
+//
+// For NVCC, it's more complicated.  Through trial-and-error:
+//   - nvcc+gcc supports __builtin_FILE() on host, and on device after CUDA 11.
+//   - nvcc+msvc supports __builtin_FILE() only after CUDA 11.
+#if (EIGEN_HAS_BUILTIN(__builtin_FILE) && (EIGEN_COMP_CLANG || !defined(EIGEN_CUDA_ARCH))) ||            \
+    (EIGEN_GNUC_STRICT_AT_LEAST(5, 0, 0) && (EIGEN_COMP_NVCC >= 110000 || !defined(EIGEN_CUDA_ARCH))) || \
+    (EIGEN_COMP_MSVC >= 1926 && (!EIGEN_COMP_NVCC || EIGEN_COMP_NVCC >= 110000))
+#define EIGEN_HAS_BUILTIN_FILE 1
+#else
+#define EIGEN_HAS_BUILTIN_FILE 0
+#endif
+#endif  // EIGEN_HAS_BUILTIN_FILE
+
+#if EIGEN_HAS_BUILTIN_FILE
+#define EIGEN_BUILTIN_FILE __builtin_FILE()
+#define EIGEN_BUILTIN_LINE __builtin_LINE()
+#else
+// Default (potentially unsafe) values.
+#define EIGEN_BUILTIN_FILE __FILE__
+#define EIGEN_BUILTIN_LINE __LINE__
+#endif
+
+// Use __PRETTY_FUNCTION__ when available, since it is more descriptive, as
+// __builtin_FUNCTION() only returns the undecorated function name.
+// This should still be okay ODR-wise since it is a compiler-specific fixed
+// value.  Mixing compilers will likely lead to ODR violations anyways.
+#if EIGEN_COMP_MSVC
+#define EIGEN_BUILTIN_FUNCTION __FUNCSIG__
+#elif EIGEN_COMP_GNUC
+#define EIGEN_BUILTIN_FUNCTION __PRETTY_FUNCTION__
+#else
+#define EIGEN_BUILTIN_FUNCTION __func__
+#endif
+
+namespace Eigen {
+namespace internal {
+
+// Generic default assert handler.
+template <typename EnableIf = void, typename... EmptyArgs>
+struct assert_handler_impl {
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static inline void run(const char* expression, const char* file, unsigned line,
+                                                             const char* function) {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+    // GPU device code doesn't allow stderr or abort, so use printf and raise an
+    // illegal instruction exception to trigger a kernel failure.
+#ifndef EIGEN_NO_IO
+    printf("Assertion failed at %s:%u in %s: %s\n", file == nullptr ? "<file>" : file, line,
+           function == nullptr ? "<function>" : function, expression);
+#endif
+    __trap();
+
+#else  // EIGEN_GPU_COMPILE_PHASE
+
+    // Print to stderr and abort, as specified in <cassert>.
+#ifndef EIGEN_NO_IO
+    fprintf(stderr, "Assertion failed at %s:%u in %s: %s\n", file == nullptr ? "<file>" : file, line,
+            function == nullptr ? "<function>" : function, expression);
+#endif
+    std::abort();
+
+#endif  // EIGEN_GPU_COMPILE_PHASE
+  }
+};
+
+// Use POSIX __assert_fail handler when available.
+//
+// This allows us to integrate with systems that have custom handlers.
+//
+// NOTE: this handler is not always available on all POSIX systems (otherwise
+// we could simply test for __unix__ or similar).  The handler function name
+// seems to depend on the specific toolchain implementation, and differs between
+// compilers, platforms, OSes, etc.  Hence, we detect support via SFINAE.
+template <typename... EmptyArgs>
+struct assert_handler_impl<void_t<decltype(__assert_fail((const char*)nullptr,         // expression
+                                                         (const char*)nullptr,         // file
+                                                         0,                            // line
+                                                         (const char*)nullptr,         // function
+                                                         std::declval<EmptyArgs>()...  // Empty substitution required
+                                                                                       // for SFINAE.
+                                                         ))>,
+                           EmptyArgs...> {
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE static inline void run(const char* expression, const char* file, unsigned line,
+                                                             const char* function) {
+    // GCC requires this call to be dependent on the template parameters.
+    __assert_fail(expression, file, line, function, std::declval<EmptyArgs>()...);
+  }
+};
+
+EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE inline void __assert_handler(const char* expression, const char* file,
+                                                                 unsigned line, const char* function) {
+  assert_handler_impl<>::run(expression, file, line, function);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#define eigen_plain_assert(expression)                                                                                \
+  (EIGEN_PREDICT_FALSE(!(expression)) ? Eigen::internal::__assert_handler(#expression, EIGEN_BUILTIN_FILE,            \
+                                                                          EIGEN_BUILTIN_LINE, EIGEN_BUILTIN_FUNCTION) \
+                                      : (void)0)
+
+#else  // EIGEN_USE_CUSTOM_PLAIN_ASSERT
+
+// Use regular assert.
+#define eigen_plain_assert(condition) assert(condition)
+
+#endif  // EIGEN_USE_CUSTOM_PLAIN_ASSERT
+
+#else  // EIGEN_NO_DEBUG
+
+#define eigen_plain_assert(condition) ((void)0)
+
+#endif  // EIGEN_NO_DEBUG
+
+#endif  // EIGEN_CORE_UTIL_ASSERT_H
diff --git a/inst/include/Eigen/src/Core/util/BlasUtil.h b/inst/include/Eigen/src/Core/util/BlasUtil.h
index a28f16fa..19d9917d 100644
--- a/inst/include/Eigen/src/Core/util/BlasUtil.h
+++ b/inst/include/Eigen/src/Core/util/BlasUtil.h
@@ -13,252 +13,610 @@
 // This file contains many lightweight helper classes used to
 // implement and control fast level 2 and level 3 BLAS-like routines.
 
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
 
 // forward declarations
-template<typename LhsScalar, typename RhsScalar, typename Index, int mr, int nr, bool ConjugateLhs=false, bool ConjugateRhs=false>
+template <typename LhsScalar, typename RhsScalar, typename Index, typename DataMapper, int mr, int nr,
+          bool ConjugateLhs = false, bool ConjugateRhs = false>
 struct gebp_kernel;
 
-template<typename Scalar, typename Index, int nr, int StorageOrder, bool Conjugate = false, bool PanelMode=false>
+template <typename Scalar, typename Index, typename DataMapper, int nr, int StorageOrder, bool Conjugate = false,
+          bool PanelMode = false>
 struct gemm_pack_rhs;
 
-template<typename Scalar, typename Index, int Pack1, int Pack2, int StorageOrder, bool Conjugate = false, bool PanelMode = false>
+template <typename Scalar, typename Index, typename DataMapper, int Pack1, int Pack2, typename Packet, int StorageOrder,
+          bool Conjugate = false, bool PanelMode = false>
 struct gemm_pack_lhs;
 
-template<
-  typename Index,
-  typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs,
-  typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs,
-  int ResStorageOrder>
+template <typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar,
+          int RhsStorageOrder, bool ConjugateRhs, int ResStorageOrder, int ResInnerStride>
 struct general_matrix_matrix_product;
 
-template<typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, typename RhsScalar, bool ConjugateRhs, int Version=Specialized>
+template <typename Index, typename LhsScalar, typename LhsMapper, int LhsStorageOrder, bool ConjugateLhs,
+          typename RhsScalar, typename RhsMapper, bool ConjugateRhs, int Version = Specialized>
 struct general_matrix_vector_product;
 
-
-template<bool Conjugate> struct conj_if;
-
-template<> struct conj_if<true> {
-  template<typename T>
-  inline T operator()(const T& x) { return numext::conj(x); }
-  template<typename T>
-  inline T pconj(const T& x) { return internal::pconj(x); }
+template <typename From, typename To>
+struct get_factor {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE To run(const From& x) { return To(x); }
 };
 
-template<> struct conj_if<false> {
-  template<typename T>
-  inline const T& operator()(const T& x) { return x; }
-  template<typename T>
-  inline const T& pconj(const T& x) { return x; }
+template <typename Scalar>
+struct get_factor<Scalar, typename NumTraits<Scalar>::Real> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE typename NumTraits<Scalar>::Real run(const Scalar& x) {
+    return numext::real(x);
+  }
 };
 
-template<typename Scalar> struct conj_helper<Scalar,Scalar,false,false>
-{
-  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const { return internal::pmadd(x,y,c); }
-  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const { return internal::pmul(x,y); }
-};
+template <typename Scalar, typename Index>
+class BlasVectorMapper {
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasVectorMapper(Scalar* data) : m_data(data) {}
 
-template<typename RealScalar> struct conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, false,true>
-{
-  typedef std::complex<RealScalar> Scalar;
-  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const
-  { return c + pmul(x,y); }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const { return m_data[i]; }
+  template <typename Packet, int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet load(Index i) const {
+    return ploadt<Packet, AlignmentType>(m_data + i);
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC bool aligned(Index i) const {
+    return (std::uintptr_t(m_data + i) % sizeof(Packet)) == 0;
+  }
 
-  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const
-  { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::imag(x)*numext::real(y) - numext::real(x)*numext::imag(y)); }
+ protected:
+  Scalar* m_data;
 };
 
-template<typename RealScalar> struct conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, true,false>
-{
-  typedef std::complex<RealScalar> Scalar;
-  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const
-  { return c + pmul(x,y); }
+template <typename Scalar, typename Index, int AlignmentType, int Incr = 1>
+class BlasLinearMapper;
+
+template <typename Scalar, typename Index, int AlignmentType>
+class BlasLinearMapper<Scalar, Index, AlignmentType> {
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar* data, Index incr = 1) : m_data(data) {
+    EIGEN_ONLY_USED_FOR_DEBUG(incr);
+    eigen_assert(incr == 1);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(Index i) const { internal::prefetch(&operator()(i)); }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { return m_data[i]; }
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const {
+    return ploadt<PacketType, AlignmentType>(m_data + i);
+  }
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index n, Index offset = 0) const {
+    return ploadt_partial<PacketType, AlignmentType>(m_data + i, n, offset);
+  }
+
+  template <typename PacketType, int AlignmentT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType load(Index i) const {
+    return ploadt<PacketType, AlignmentT>(m_data + i);
+  }
 
-  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const
-  { return Scalar(numext::real(x)*numext::real(y) + numext::imag(x)*numext::imag(y), numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); }
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType& p) const {
+    pstoret<Scalar, PacketType, AlignmentType>(m_data + i, p);
+  }
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, const PacketType& p, Index n,
+                                                                Index offset = 0) const {
+    pstoret_partial<Scalar, PacketType, AlignmentType>(m_data + i, p, n, offset);
+  }
+
+ protected:
+  Scalar* m_data;
 };
 
-template<typename RealScalar> struct conj_helper<std::complex<RealScalar>, std::complex<RealScalar>, true,true>
-{
-  typedef std::complex<RealScalar> Scalar;
-  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const Scalar& y, const Scalar& c) const
-  { return c + pmul(x,y); }
+// Lightweight helper class to access matrix coefficients.
+template <typename Scalar, typename Index, int StorageOrder, int AlignmentType = Unaligned, int Incr = 1>
+class blas_data_mapper;
+
+// TMP to help PacketBlock store implementation.
+// There's currently no known use case for PacketBlock load.
+// The default implementation assumes ColMajor order.
+// It always store each packet sequentially one `stride` apart.
+template <typename Index, typename Scalar, typename Packet, int n, int idx, int StorageOrder>
+struct PacketBlockManagement {
+  PacketBlockManagement<Index, Scalar, Packet, n, idx - 1, StorageOrder> pbm;
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar* to, const Index stride, Index i, Index j,
+                                                   const PacketBlock<Packet, n>& block) const {
+    pbm.store(to, stride, i, j, block);
+    pstoreu<Scalar>(to + i + (j + idx) * stride, block.packet[idx]);
+  }
+};
 
-  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const Scalar& y) const
-  { return Scalar(numext::real(x)*numext::real(y) - numext::imag(x)*numext::imag(y), - numext::real(x)*numext::imag(y) - numext::imag(x)*numext::real(y)); }
+// PacketBlockManagement specialization to take care of RowMajor order without ifs.
+template <typename Index, typename Scalar, typename Packet, int n, int idx>
+struct PacketBlockManagement<Index, Scalar, Packet, n, idx, RowMajor> {
+  PacketBlockManagement<Index, Scalar, Packet, n, idx - 1, RowMajor> pbm;
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar* to, const Index stride, Index i, Index j,
+                                                   const PacketBlock<Packet, n>& block) const {
+    pbm.store(to, stride, i, j, block);
+    pstoreu<Scalar>(to + j + (i + idx) * stride, block.packet[idx]);
+  }
 };
 
-template<typename RealScalar,bool Conj> struct conj_helper<std::complex<RealScalar>, RealScalar, Conj,false>
-{
-  typedef std::complex<RealScalar> Scalar;
-  EIGEN_STRONG_INLINE Scalar pmadd(const Scalar& x, const RealScalar& y, const Scalar& c) const
-  { return padd(c, pmul(x,y)); }
-  EIGEN_STRONG_INLINE Scalar pmul(const Scalar& x, const RealScalar& y) const
-  { return conj_if<Conj>()(x)*y; }
+template <typename Index, typename Scalar, typename Packet, int n, int StorageOrder>
+struct PacketBlockManagement<Index, Scalar, Packet, n, -1, StorageOrder> {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar* to, const Index stride, Index i, Index j,
+                                                   const PacketBlock<Packet, n>& block) const {
+    EIGEN_UNUSED_VARIABLE(to);
+    EIGEN_UNUSED_VARIABLE(stride);
+    EIGEN_UNUSED_VARIABLE(i);
+    EIGEN_UNUSED_VARIABLE(j);
+    EIGEN_UNUSED_VARIABLE(block);
+  }
 };
 
-template<typename RealScalar,bool Conj> struct conj_helper<RealScalar, std::complex<RealScalar>, false,Conj>
-{
-  typedef std::complex<RealScalar> Scalar;
-  EIGEN_STRONG_INLINE Scalar pmadd(const RealScalar& x, const Scalar& y, const Scalar& c) const
-  { return padd(c, pmul(x,y)); }
-  EIGEN_STRONG_INLINE Scalar pmul(const RealScalar& x, const Scalar& y) const
-  { return x*conj_if<Conj>()(y); }
+template <typename Index, typename Scalar, typename Packet, int n>
+struct PacketBlockManagement<Index, Scalar, Packet, n, -1, RowMajor> {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(Scalar* to, const Index stride, Index i, Index j,
+                                                   const PacketBlock<Packet, n>& block) const {
+    EIGEN_UNUSED_VARIABLE(to);
+    EIGEN_UNUSED_VARIABLE(stride);
+    EIGEN_UNUSED_VARIABLE(i);
+    EIGEN_UNUSED_VARIABLE(j);
+    EIGEN_UNUSED_VARIABLE(block);
+  }
 };
 
-template<typename From,typename To> struct get_factor {
-  static EIGEN_STRONG_INLINE To run(const From& x) { return x; }
+template <typename Scalar, typename Index, int StorageOrder, int AlignmentType>
+class blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, 1> {
+ public:
+  typedef BlasLinearMapper<Scalar, Index, AlignmentType> LinearMapper;
+  typedef blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType> SubMapper;
+  typedef BlasVectorMapper<Scalar, Index> VectorMapper;
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr = 1)
+      : m_data(data), m_stride(stride) {
+    EIGEN_ONLY_USED_FOR_DEBUG(incr);
+    eigen_assert(incr == 1);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubMapper getSubMapper(Index i, Index j) const {
+    return SubMapper(&operator()(i, j), m_stride);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+    return LinearMapper(&operator()(i, j));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
+    return VectorMapper(&operator()(i, j));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(Index i, Index j) const { internal::prefetch(&operator()(i, j)); }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
+    return m_data[StorageOrder == RowMajor ? j + i * m_stride : i + j * m_stride];
+  }
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const {
+    return ploadt<PacketType, AlignmentType>(&operator()(i, j));
+  }
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index j, Index n,
+                                                                     Index offset = 0) const {
+    return ploadt_partial<PacketType, AlignmentType>(&operator()(i, j), n, offset);
+  }
+
+  template <typename PacketT, int AlignmentT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
+    return ploadt<PacketT, AlignmentT>(&operator()(i, j));
+  }
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Index j, const PacketType& p) const {
+    pstoret<Scalar, PacketType, AlignmentType>(&operator()(i, j), p);
+  }
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, Index j, const PacketType& p, Index n,
+                                                                Index offset = 0) const {
+    pstoret_partial<Scalar, PacketType, AlignmentType>(&operator()(i, j), p, n, offset);
+  }
+
+  template <typename SubPacket>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket& p) const {
+    pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
+  }
+
+  template <typename SubPacket>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
+    return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
+  }
+
+  EIGEN_DEVICE_FUNC const Index stride() const { return m_stride; }
+  EIGEN_DEVICE_FUNC const Index incr() const { return 1; }
+  EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_data; }
+
+  EIGEN_DEVICE_FUNC Index firstAligned(Index size) const {
+    if (std::uintptr_t(m_data) % sizeof(Scalar)) {
+      return -1;
+    }
+    return internal::first_default_aligned(m_data, size);
+  }
+
+  template <typename SubPacket, int n>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketBlock(Index i, Index j,
+                                                              const PacketBlock<SubPacket, n>& block) const {
+    PacketBlockManagement<Index, Scalar, SubPacket, n, n - 1, StorageOrder> pbm;
+    pbm.store(m_data, m_stride, i, j, block);
+  }
+
+ protected:
+  Scalar* EIGEN_RESTRICT m_data;
+  const Index m_stride;
 };
 
-template<typename Scalar> struct get_factor<Scalar,typename NumTraits<Scalar>::Real> {
-  static EIGEN_STRONG_INLINE typename NumTraits<Scalar>::Real run(const Scalar& x) { return numext::real(x); }
+// Implementation of non-natural increment (i.e. inner-stride != 1)
+// The exposed API is not complete yet compared to the Incr==1 case
+// because some features makes less sense in this case.
+template <typename Scalar, typename Index, int AlignmentType, int Incr>
+class BlasLinearMapper {
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar* data, Index incr) : m_data(data), m_incr(incr) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { internal::prefetch(&operator()(i)); }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { return m_data[i * m_incr.value()]; }
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i) const {
+    return pgather<Scalar, PacketType>(m_data + i * m_incr.value(), m_incr.value());
+  }
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index n, Index /*offset*/ = 0) const {
+    return pgather_partial<Scalar, PacketType>(m_data + i * m_incr.value(), m_incr.value(), n);
+  }
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType& p) const {
+    pscatter<Scalar, PacketType>(m_data + i * m_incr.value(), p, m_incr.value());
+  }
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, const PacketType& p, Index n,
+                                                                Index /*offset*/ = 0) const {
+    pscatter_partial<Scalar, PacketType>(m_data + i * m_incr.value(), p, m_incr.value(), n);
+  }
+
+ protected:
+  Scalar* m_data;
+  const internal::variable_if_dynamic<Index, Incr> m_incr;
 };
 
-// Lightweight helper class to access matrix coefficients.
-// Yes, this is somehow redundant with Map<>, but this version is much much lighter,
-// and so I hope better compilation performance (time and code quality).
-template<typename Scalar, typename Index, int StorageOrder>
-class blas_data_mapper
-{
-  public:
-    blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
-    EIGEN_STRONG_INLINE Scalar& operator()(Index i, Index j)
-    { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
-  protected:
-    Scalar* EIGEN_RESTRICT m_data;
-    Index m_stride;
+template <typename Scalar, typename Index, int StorageOrder, int AlignmentType, int Incr>
+class blas_data_mapper {
+ public:
+  typedef BlasLinearMapper<Scalar, Index, AlignmentType, Incr> LinearMapper;
+  typedef blas_data_mapper SubMapper;
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr)
+      : m_data(data), m_stride(stride), m_incr(incr) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubMapper getSubMapper(Index i, Index j) const {
+    return SubMapper(&operator()(i, j), m_stride, m_incr.value());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+    return LinearMapper(&operator()(i, j), m_incr.value());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(Index i, Index j) const { internal::prefetch(&operator()(i, j)); }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const {
+    return m_data[StorageOrder == RowMajor ? j * m_incr.value() + i * m_stride : i * m_incr.value() + j * m_stride];
+  }
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacket(Index i, Index j) const {
+    return pgather<Scalar, PacketType>(&operator()(i, j), m_incr.value());
+  }
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketType loadPacketPartial(Index i, Index j, Index n,
+                                                                     Index /*offset*/ = 0) const {
+    return pgather_partial<Scalar, PacketType>(&operator()(i, j), m_incr.value(), n);
+  }
+
+  template <typename PacketT, int AlignmentT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const {
+    return pgather<Scalar, PacketT>(&operator()(i, j), m_incr.value());
+  }
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, Index j, const PacketType& p) const {
+    pscatter<Scalar, PacketType>(&operator()(i, j), p, m_incr.value());
+  }
+
+  template <typename PacketType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketPartial(Index i, Index j, const PacketType& p, Index n,
+                                                                Index /*offset*/ = 0) const {
+    pscatter_partial<Scalar, PacketType>(&operator()(i, j), p, m_incr.value(), n);
+  }
+
+  template <typename SubPacket>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket& p) const {
+    pscatter<Scalar, SubPacket>(&operator()(i, j), p, m_stride);
+  }
+
+  template <typename SubPacket>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const {
+    return pgather<Scalar, SubPacket>(&operator()(i, j), m_stride);
+  }
+
+  // storePacketBlock_helper defines a way to access values inside the PacketBlock, this is essentially required by the
+  // Complex types.
+  template <typename SubPacket, typename Scalar_, int n, int idx>
+  struct storePacketBlock_helper {
+    storePacketBlock_helper<SubPacket, Scalar_, n, idx - 1> spbh;
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(
+        const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>* sup, Index i, Index j,
+        const PacketBlock<SubPacket, n>& block) const {
+      spbh.store(sup, i, j, block);
+      sup->template storePacket<SubPacket>(i, j + idx, block.packet[idx]);
+    }
+  };
+
+  template <typename SubPacket, int n, int idx>
+  struct storePacketBlock_helper<SubPacket, std::complex<float>, n, idx> {
+    storePacketBlock_helper<SubPacket, std::complex<float>, n, idx - 1> spbh;
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(
+        const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>* sup, Index i, Index j,
+        const PacketBlock<SubPacket, n>& block) const {
+      spbh.store(sup, i, j, block);
+      sup->template storePacket<SubPacket>(i, j + idx, block.packet[idx]);
+    }
+  };
+
+  template <typename SubPacket, int n, int idx>
+  struct storePacketBlock_helper<SubPacket, std::complex<double>, n, idx> {
+    storePacketBlock_helper<SubPacket, std::complex<double>, n, idx - 1> spbh;
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(
+        const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>* sup, Index i, Index j,
+        const PacketBlock<SubPacket, n>& block) const {
+      spbh.store(sup, i, j, block);
+      for (int l = 0; l < unpacket_traits<SubPacket>::size; l++) {
+        std::complex<double>* v = &sup->operator()(i + l, j + idx);
+        v->real(block.packet[idx].v[2 * l + 0]);
+        v->imag(block.packet[idx].v[2 * l + 1]);
+      }
+    }
+  };
+
+  template <typename SubPacket, typename Scalar_, int n>
+  struct storePacketBlock_helper<SubPacket, Scalar_, n, -1> {
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(
+        const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>*, Index, Index,
+        const PacketBlock<SubPacket, n>&) const {}
+  };
+
+  template <typename SubPacket, int n>
+  struct storePacketBlock_helper<SubPacket, std::complex<float>, n, -1> {
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(
+        const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>*, Index, Index,
+        const PacketBlock<SubPacket, n>&) const {}
+  };
+
+  template <typename SubPacket, int n>
+  struct storePacketBlock_helper<SubPacket, std::complex<double>, n, -1> {
+    EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void store(
+        const blas_data_mapper<Scalar, Index, StorageOrder, AlignmentType, Incr>*, Index, Index,
+        const PacketBlock<SubPacket, n>&) const {}
+  };
+  // This function stores a PacketBlock on m_data, this approach is really quite slow compare to Incr=1 and should be
+  // avoided when possible.
+  template <typename SubPacket, int n>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacketBlock(Index i, Index j,
+                                                              const PacketBlock<SubPacket, n>& block) const {
+    storePacketBlock_helper<SubPacket, Scalar, n, n - 1> spb;
+    spb.store(this, i, j, block);
+  }
+
+  EIGEN_DEVICE_FUNC const Index stride() const { return m_stride; }
+  EIGEN_DEVICE_FUNC const Index incr() const { return m_incr.value(); }
+  EIGEN_DEVICE_FUNC constexpr Scalar* data() const { return m_data; }
+
+ protected:
+  Scalar* EIGEN_RESTRICT m_data;
+  const Index m_stride;
+  const internal::variable_if_dynamic<Index, Incr> m_incr;
 };
 
 // lightweight helper class to access matrix coefficients (const version)
-template<typename Scalar, typename Index, int StorageOrder>
-class const_blas_data_mapper
-{
-  public:
-    const_blas_data_mapper(const Scalar* data, Index stride) : m_data(data), m_stride(stride) {}
-    EIGEN_STRONG_INLINE const Scalar& operator()(Index i, Index j) const
-    { return m_data[StorageOrder==RowMajor ? j + i*m_stride : i + j*m_stride]; }
-  protected:
-    const Scalar* EIGEN_RESTRICT m_data;
-    Index m_stride;
-};
+template <typename Scalar, typename Index, int StorageOrder>
+class const_blas_data_mapper : public blas_data_mapper<const Scalar, Index, StorageOrder> {
+ public:
+  typedef const_blas_data_mapper<Scalar, Index, StorageOrder> SubMapper;
 
+  EIGEN_ALWAYS_INLINE const_blas_data_mapper(const Scalar* data, Index stride)
+      : blas_data_mapper<const Scalar, Index, StorageOrder>(data, stride) {}
+
+  EIGEN_ALWAYS_INLINE SubMapper getSubMapper(Index i, Index j) const {
+    return SubMapper(&(this->operator()(i, j)), this->m_stride);
+  }
+};
 
 /* Helper class to analyze the factors of a Product expression.
  * In particular it allows to pop out operator-, scalar multiples,
  * and conjugate */
-template<typename XprType> struct blas_traits
-{
+template <typename XprType>
+struct blas_traits {
   typedef typename traits<XprType>::Scalar Scalar;
   typedef const XprType& ExtractType;
-  typedef XprType _ExtractType;
+  typedef XprType ExtractType_;
   enum {
     IsComplex = NumTraits<Scalar>::IsComplex,
     IsTransposed = false,
     NeedToConjugate = false,
-    HasUsableDirectAccess = (    (int(XprType::Flags)&DirectAccessBit)
-                              && (   bool(XprType::IsVectorAtCompileTime)
-                                  || int(inner_stride_at_compile_time<XprType>::ret) == 1)
-                             ) ?  1 : 0
+    HasUsableDirectAccess =
+        ((int(XprType::Flags) & DirectAccessBit) &&
+         (bool(XprType::IsVectorAtCompileTime) || int(inner_stride_at_compile_time<XprType>::ret) == 1))
+            ? 1
+            : 0,
+    HasScalarFactor = false
   };
-  typedef typename conditional<bool(HasUsableDirectAccess),
-    ExtractType,
-    typename _ExtractType::PlainObject
-    >::type DirectLinearAccessType;
-  static inline ExtractType extract(const XprType& x) { return x; }
-  static inline const Scalar extractScalarFactor(const XprType&) { return Scalar(1); }
+  typedef std::conditional_t<bool(HasUsableDirectAccess), ExtractType, typename ExtractType_::PlainObject>
+      DirectLinearAccessType;
+  EIGEN_DEVICE_FUNC static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) { return x; }
+  EIGEN_DEVICE_FUNC static inline EIGEN_DEVICE_FUNC const Scalar extractScalarFactor(const XprType&) {
+    return Scalar(1);
+  }
 };
 
 // pop conjugate
-template<typename Scalar, typename NestedXpr>
-struct blas_traits<CwiseUnaryOp<scalar_conjugate_op<Scalar>, NestedXpr> >
- : blas_traits<NestedXpr>
-{
+template <typename Scalar, typename NestedXpr>
+struct blas_traits<CwiseUnaryOp<scalar_conjugate_op<Scalar>, NestedXpr> > : blas_traits<NestedXpr> {
   typedef blas_traits<NestedXpr> Base;
   typedef CwiseUnaryOp<scalar_conjugate_op<Scalar>, NestedXpr> XprType;
   typedef typename Base::ExtractType ExtractType;
 
-  enum {
-    IsComplex = NumTraits<Scalar>::IsComplex,
-    NeedToConjugate = Base::NeedToConjugate ? 0 : IsComplex
-  };
-  static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
-  static inline Scalar extractScalarFactor(const XprType& x) { return conj(Base::extractScalarFactor(x.nestedExpression())); }
+  enum { IsComplex = NumTraits<Scalar>::IsComplex, NeedToConjugate = Base::NeedToConjugate ? 0 : IsComplex };
+  EIGEN_DEVICE_FUNC static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
+  EIGEN_DEVICE_FUNC static inline Scalar extractScalarFactor(const XprType& x) {
+    return conj(Base::extractScalarFactor(x.nestedExpression()));
+  }
 };
 
 // pop scalar multiple
-template<typename Scalar, typename NestedXpr>
-struct blas_traits<CwiseUnaryOp<scalar_multiple_op<Scalar>, NestedXpr> >
- : blas_traits<NestedXpr>
-{
+template <typename Scalar, typename NestedXpr, typename Plain>
+struct blas_traits<
+    CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>, Plain>, NestedXpr> >
+    : blas_traits<NestedXpr> {
+  enum { HasScalarFactor = true };
+  typedef blas_traits<NestedXpr> Base;
+  typedef CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>, Plain>, NestedXpr>
+      XprType;
+  typedef typename Base::ExtractType ExtractType;
+  EIGEN_DEVICE_FUNC static inline EIGEN_DEVICE_FUNC ExtractType extract(const XprType& x) {
+    return Base::extract(x.rhs());
+  }
+  EIGEN_DEVICE_FUNC static inline EIGEN_DEVICE_FUNC Scalar extractScalarFactor(const XprType& x) {
+    return x.lhs().functor().m_other * Base::extractScalarFactor(x.rhs());
+  }
+};
+template <typename Scalar, typename NestedXpr, typename Plain>
+struct blas_traits<
+    CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>, Plain> > >
+    : blas_traits<NestedXpr> {
+  enum { HasScalarFactor = true };
   typedef blas_traits<NestedXpr> Base;
-  typedef CwiseUnaryOp<scalar_multiple_op<Scalar>, NestedXpr> XprType;
+  typedef CwiseBinaryOp<scalar_product_op<Scalar>, NestedXpr, const CwiseNullaryOp<scalar_constant_op<Scalar>, Plain> >
+      XprType;
   typedef typename Base::ExtractType ExtractType;
-  static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
-  static inline Scalar extractScalarFactor(const XprType& x)
-  { return x.functor().m_other * Base::extractScalarFactor(x.nestedExpression()); }
+  EIGEN_DEVICE_FUNC static inline ExtractType extract(const XprType& x) { return Base::extract(x.lhs()); }
+  EIGEN_DEVICE_FUNC static inline Scalar extractScalarFactor(const XprType& x) {
+    return Base::extractScalarFactor(x.lhs()) * x.rhs().functor().m_other;
+  }
 };
+template <typename Scalar, typename Plain1, typename Plain2>
+struct blas_traits<CwiseBinaryOp<scalar_product_op<Scalar>, const CwiseNullaryOp<scalar_constant_op<Scalar>, Plain1>,
+                                 const CwiseNullaryOp<scalar_constant_op<Scalar>, Plain2> > >
+    : blas_traits<CwiseNullaryOp<scalar_constant_op<Scalar>, Plain1> > {};
 
 // pop opposite
-template<typename Scalar, typename NestedXpr>
-struct blas_traits<CwiseUnaryOp<scalar_opposite_op<Scalar>, NestedXpr> >
- : blas_traits<NestedXpr>
-{
+template <typename Scalar, typename NestedXpr>
+struct blas_traits<CwiseUnaryOp<scalar_opposite_op<Scalar>, NestedXpr> > : blas_traits<NestedXpr> {
+  enum { HasScalarFactor = true };
   typedef blas_traits<NestedXpr> Base;
   typedef CwiseUnaryOp<scalar_opposite_op<Scalar>, NestedXpr> XprType;
   typedef typename Base::ExtractType ExtractType;
-  static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
-  static inline Scalar extractScalarFactor(const XprType& x)
-  { return - Base::extractScalarFactor(x.nestedExpression()); }
+  EIGEN_DEVICE_FUNC static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
+  EIGEN_DEVICE_FUNC static inline Scalar extractScalarFactor(const XprType& x) {
+    return -Base::extractScalarFactor(x.nestedExpression());
+  }
 };
 
 // pop/push transpose
-template<typename NestedXpr>
-struct blas_traits<Transpose<NestedXpr> >
- : blas_traits<NestedXpr>
-{
+template <typename NestedXpr>
+struct blas_traits<Transpose<NestedXpr> > : blas_traits<NestedXpr> {
   typedef typename NestedXpr::Scalar Scalar;
   typedef blas_traits<NestedXpr> Base;
   typedef Transpose<NestedXpr> XprType;
-  typedef Transpose<const typename Base::_ExtractType>  ExtractType; // const to get rid of a compile error; anyway blas traits are only used on the RHS
-  typedef Transpose<const typename Base::_ExtractType> _ExtractType;
-  typedef typename conditional<bool(Base::HasUsableDirectAccess),
-    ExtractType,
-    typename ExtractType::PlainObject
-    >::type DirectLinearAccessType;
-  enum {
-    IsTransposed = Base::IsTransposed ? 0 : 1
-  };
-  static inline ExtractType extract(const XprType& x) { return Base::extract(x.nestedExpression()); }
-  static inline Scalar extractScalarFactor(const XprType& x) { return Base::extractScalarFactor(x.nestedExpression()); }
+  typedef Transpose<const typename Base::ExtractType_>
+      ExtractType;  // const to get rid of a compile error; anyway blas traits are only used on the RHS
+  typedef Transpose<const typename Base::ExtractType_> ExtractType_;
+  typedef std::conditional_t<bool(Base::HasUsableDirectAccess), ExtractType, typename ExtractType::PlainObject>
+      DirectLinearAccessType;
+  enum { IsTransposed = Base::IsTransposed ? 0 : 1 };
+  EIGEN_DEVICE_FUNC static inline ExtractType extract(const XprType& x) {
+    return ExtractType(Base::extract(x.nestedExpression()));
+  }
+  EIGEN_DEVICE_FUNC static inline Scalar extractScalarFactor(const XprType& x) {
+    return Base::extractScalarFactor(x.nestedExpression());
+  }
 };
 
-template<typename T>
-struct blas_traits<const T>
-     : blas_traits<T>
-{};
+template <typename T>
+struct blas_traits<const T> : blas_traits<T> {};
 
-template<typename T, bool HasUsableDirectAccess=blas_traits<T>::HasUsableDirectAccess>
+template <typename T, bool HasUsableDirectAccess = blas_traits<T>::HasUsableDirectAccess>
 struct extract_data_selector {
-  static const typename T::Scalar* run(const T& m)
-  {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static const typename T::Scalar* run(const T& m) {
     return blas_traits<T>::extract(m).data();
   }
 };
 
-template<typename T>
-struct extract_data_selector<T,false> {
-  static typename T::Scalar* run(const T&) { return 0; }
+template <typename T>
+struct extract_data_selector<T, false> {
+  EIGEN_DEVICE_FUNC static typename T::Scalar* run(const T&) { return 0; }
 };
 
-template<typename T> const typename T::Scalar* extract_data(const T& m)
-{
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename T::Scalar* extract_data(const T& m) {
   return extract_data_selector<T>::run(m);
 }
 
-} // end namespace internal
+/**
+ * \c combine_scalar_factors extracts and multiplies factors from GEMM and GEMV products.
+ * There is a specialization for booleans
+ */
+template <typename ResScalar, typename Lhs, typename Rhs>
+struct combine_scalar_factors_impl {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static ResScalar run(const Lhs& lhs, const Rhs& rhs) {
+    return blas_traits<Lhs>::extractScalarFactor(lhs) * blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static ResScalar run(const ResScalar& alpha, const Lhs& lhs, const Rhs& rhs) {
+    return alpha * blas_traits<Lhs>::extractScalarFactor(lhs) * blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+};
+template <typename Lhs, typename Rhs>
+struct combine_scalar_factors_impl<bool, Lhs, Rhs> {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const Lhs& lhs, const Rhs& rhs) {
+    return blas_traits<Lhs>::extractScalarFactor(lhs) && blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const bool& alpha, const Lhs& lhs, const Rhs& rhs) {
+    return alpha && blas_traits<Lhs>::extractScalarFactor(lhs) && blas_traits<Rhs>::extractScalarFactor(rhs);
+  }
+};
+
+template <typename ResScalar, typename Lhs, typename Rhs>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const ResScalar& alpha, const Lhs& lhs,
+                                                                       const Rhs& rhs) {
+  return combine_scalar_factors_impl<ResScalar, Lhs, Rhs>::run(alpha, lhs, rhs);
+}
+template <typename ResScalar, typename Lhs, typename Rhs>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ResScalar combine_scalar_factors(const Lhs& lhs, const Rhs& rhs) {
+  return combine_scalar_factors_impl<ResScalar, Lhs, Rhs>::run(lhs, rhs);
+}
+
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_BLASUTIL_H
+#endif  // EIGEN_BLASUTIL_H
diff --git a/inst/include/Eigen/src/Core/util/ConfigureVectorization.h b/inst/include/Eigen/src/Core/util/ConfigureVectorization.h
new file mode 100644
index 00000000..c2546a08
--- /dev/null
+++ b/inst/include/Eigen/src/Core/util/ConfigureVectorization.h
@@ -0,0 +1,543 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2018 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2020, Arm Limited and Contributors
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CONFIGURE_VECTORIZATION_H
+#define EIGEN_CONFIGURE_VECTORIZATION_H
+
+//------------------------------------------------------------------------------------------
+// Static and dynamic alignment control
+//
+// The main purpose of this section is to define EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES
+// as the maximal boundary in bytes on which dynamically and statically allocated data may be alignment respectively.
+// The values of EIGEN_MAX_ALIGN_BYTES and EIGEN_MAX_STATIC_ALIGN_BYTES can be specified by the user. If not,
+// a default value is automatically computed based on architecture, compiler, and OS.
+//
+// This section also defines macros EIGEN_ALIGN_TO_BOUNDARY(N) and the shortcuts EIGEN_ALIGN{8,16,32,_MAX}
+// to be used to declare statically aligned buffers.
+//------------------------------------------------------------------------------------------
+
+/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
+ * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
+ * so that vectorization doesn't affect binary compatibility.
+ *
+ * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
+ * vectorized and non-vectorized code.
+ */
+#if (defined EIGEN_CUDACC)
+#define EIGEN_ALIGN_TO_BOUNDARY(n) __align__(n)
+#define EIGEN_ALIGNOF(x) __alignof(x)
+#else
+#define EIGEN_ALIGN_TO_BOUNDARY(n) alignas(n)
+#define EIGEN_ALIGNOF(x) alignof(x)
+#endif
+
+// Align to the boundary that avoids false sharing.
+//   https://en.cppreference.com/w/cpp/thread/hardware_destructive_interference_size
+// There is a bug in android NDK < r26 where the macro is defined but std::hardware_destructive_interference_size
+// still does not exist.
+#if defined(__cpp_lib_hardware_interference_size) && __cpp_lib_hardware_interference_size >= 201603 && \
+    (!EIGEN_OS_ANDROID || __NDK_MAJOR__ + 0 >= 26)
+#include <new>
+#define EIGEN_ALIGN_TO_AVOID_FALSE_SHARING EIGEN_ALIGN_TO_BOUNDARY(std::hardware_destructive_interference_size)
+#else
+// Overalign for the cache line size of 128 bytes (Apple M1)
+#define EIGEN_ALIGN_TO_AVOID_FALSE_SHARING EIGEN_ALIGN_TO_BOUNDARY(128)
+#endif
+
+// If the user explicitly disable vectorization, then we also disable alignment
+#if defined(EIGEN_DONT_VECTORIZE)
+#if defined(EIGEN_GPUCC)
+// GPU code is always vectorized and requires memory alignment for
+// statically allocated buffers.
+#define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
+#else
+#define EIGEN_IDEAL_MAX_ALIGN_BYTES 0
+#endif
+#elif defined(__AVX512F__)
+// 64 bytes static alignment is preferred only if really required
+#define EIGEN_IDEAL_MAX_ALIGN_BYTES 64
+#elif defined(__AVX__)
+// 32 bytes static alignment is preferred only if really required
+#define EIGEN_IDEAL_MAX_ALIGN_BYTES 32
+#elif defined __HVX__ && (__HVX_LENGTH__ == 128)
+#define EIGEN_IDEAL_MAX_ALIGN_BYTES 128
+#else
+#define EIGEN_IDEAL_MAX_ALIGN_BYTES 16
+#endif
+
+// EIGEN_MIN_ALIGN_BYTES defines the minimal value for which the notion of explicit alignment makes sense
+#define EIGEN_MIN_ALIGN_BYTES 16
+
+// Defined the boundary (in bytes) on which the data needs to be aligned. Note
+// that unless EIGEN_ALIGN is defined and not equal to 0, the data may not be
+// aligned at all regardless of the value of this #define.
+
+#if (defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)) && defined(EIGEN_MAX_STATIC_ALIGN_BYTES) && \
+    EIGEN_MAX_STATIC_ALIGN_BYTES > 0
+#error EIGEN_MAX_STATIC_ALIGN_BYTES and EIGEN_DONT_ALIGN[_STATICALLY] are both defined with EIGEN_MAX_STATIC_ALIGN_BYTES!=0. Use EIGEN_MAX_STATIC_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN_STATICALLY.
+#endif
+
+// EIGEN_DONT_ALIGN_STATICALLY and EIGEN_DONT_ALIGN are deprecated
+// They imply EIGEN_MAX_STATIC_ALIGN_BYTES=0
+#if defined(EIGEN_DONT_ALIGN_STATICALLY) || defined(EIGEN_DONT_ALIGN)
+#ifdef EIGEN_MAX_STATIC_ALIGN_BYTES
+#undef EIGEN_MAX_STATIC_ALIGN_BYTES
+#endif
+#define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+#endif
+
+#ifndef EIGEN_MAX_STATIC_ALIGN_BYTES
+
+// Try to automatically guess what is the best default value for EIGEN_MAX_STATIC_ALIGN_BYTES
+
+// 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
+// 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
+// enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
+// certain common platform (compiler+architecture combinations) to avoid these problems.
+// Only static alignment is really problematic (relies on nonstandard compiler extensions),
+// try to keep heap alignment even when we have to disable static alignment.
+#if EIGEN_COMP_GNUC && !(EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64 || EIGEN_ARCH_PPC || EIGEN_ARCH_IA64 || \
+                         EIGEN_ARCH_MIPS || EIGEN_ARCH_LOONGARCH64)
+#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+#else
+#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
+#endif
+
+// static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
+#if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT && !EIGEN_COMP_SUNCC && !EIGEN_OS_QNX
+#define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
+#else
+#define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
+#endif
+
+#if EIGEN_ARCH_WANTS_STACK_ALIGNMENT
+#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+#else
+#define EIGEN_MAX_STATIC_ALIGN_BYTES 0
+#endif
+
+#endif
+
+// If EIGEN_MAX_ALIGN_BYTES is defined, then it is considered as an upper bound for EIGEN_MAX_STATIC_ALIGN_BYTES
+#if defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES < EIGEN_MAX_STATIC_ALIGN_BYTES
+#undef EIGEN_MAX_STATIC_ALIGN_BYTES
+#define EIGEN_MAX_STATIC_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_MAX_STATIC_ALIGN_BYTES == 0 && !defined(EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT)
+#define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
+#endif
+
+// At this stage, EIGEN_MAX_STATIC_ALIGN_BYTES>0 is the true test whether we want to align arrays on the stack or not.
+// It takes into account both the user choice to explicitly enable/disable alignment (by setting
+// EIGEN_MAX_STATIC_ALIGN_BYTES) and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). Henceforth, only
+// EIGEN_MAX_STATIC_ALIGN_BYTES should be used.
+
+// Shortcuts to EIGEN_ALIGN_TO_BOUNDARY
+#define EIGEN_ALIGN8 EIGEN_ALIGN_TO_BOUNDARY(8)
+#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
+#define EIGEN_ALIGN32 EIGEN_ALIGN_TO_BOUNDARY(32)
+#define EIGEN_ALIGN64 EIGEN_ALIGN_TO_BOUNDARY(64)
+#if EIGEN_MAX_STATIC_ALIGN_BYTES > 0
+#define EIGEN_ALIGN_MAX EIGEN_ALIGN_TO_BOUNDARY(EIGEN_MAX_STATIC_ALIGN_BYTES)
+#else
+#define EIGEN_ALIGN_MAX
+#endif
+
+// Dynamic alignment control
+
+#if defined(EIGEN_DONT_ALIGN) && defined(EIGEN_MAX_ALIGN_BYTES) && EIGEN_MAX_ALIGN_BYTES > 0
+#error EIGEN_MAX_ALIGN_BYTES and EIGEN_DONT_ALIGN are both defined with EIGEN_MAX_ALIGN_BYTES!=0. Use EIGEN_MAX_ALIGN_BYTES=0 as a synonym of EIGEN_DONT_ALIGN.
+#endif
+
+#ifdef EIGEN_DONT_ALIGN
+#ifdef EIGEN_MAX_ALIGN_BYTES
+#undef EIGEN_MAX_ALIGN_BYTES
+#endif
+#define EIGEN_MAX_ALIGN_BYTES 0
+#elif !defined(EIGEN_MAX_ALIGN_BYTES)
+#define EIGEN_MAX_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+#endif
+
+#if EIGEN_IDEAL_MAX_ALIGN_BYTES > EIGEN_MAX_ALIGN_BYTES
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_IDEAL_MAX_ALIGN_BYTES
+#else
+#define EIGEN_DEFAULT_ALIGN_BYTES EIGEN_MAX_ALIGN_BYTES
+#endif
+
+#ifndef EIGEN_UNALIGNED_VECTORIZE
+#define EIGEN_UNALIGNED_VECTORIZE 1
+#endif
+
+//----------------------------------------------------------------------
+
+// if alignment is disabled, then disable vectorization. Note: EIGEN_MAX_ALIGN_BYTES is the proper check, it takes into
+// account both the user's will (EIGEN_MAX_ALIGN_BYTES,EIGEN_DONT_ALIGN) and our own platform checks
+#if EIGEN_MAX_ALIGN_BYTES == 0
+#ifndef EIGEN_DONT_VECTORIZE
+#define EIGEN_DONT_VECTORIZE
+#endif
+#endif
+
+// The following (except #include <malloc.h> and _M_IX86_FP ??) can likely be
+// removed as gcc 4.1 and msvc 2008 are not supported anyways.
+#if EIGEN_COMP_MSVC
+#include <malloc.h>  // for _aligned_malloc -- need it regardless of whether vectorization is enabled
+// a user reported that in 64-bit mode, MSVC doesn't care to define _M_IX86_FP.
+#if (defined(_M_IX86_FP) && (_M_IX86_FP >= 2)) || EIGEN_ARCH_x86_64
+#define EIGEN_SSE2_ON_MSVC_2008_OR_LATER
+#endif
+#else
+#if defined(__SSE2__)
+#define EIGEN_SSE2_ON_NON_MSVC
+#endif
+#endif
+
+#if !(defined(EIGEN_DONT_VECTORIZE) || defined(EIGEN_GPUCC))
+
+#if defined(EIGEN_SSE2_ON_NON_MSVC) || defined(EIGEN_SSE2_ON_MSVC_2008_OR_LATER)
+
+// Defines symbols for compile-time detection of which instructions are
+// used.
+// EIGEN_VECTORIZE_YY is defined if and only if the instruction set YY is used
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_SSE
+#define EIGEN_VECTORIZE_SSE2
+
+// Detect sse3/ssse3/sse4:
+// gcc and icc defines __SSE3__, ...
+// there is no way to know about this on msvc. You can define EIGEN_VECTORIZE_SSE* if you
+// want to force the use of those instructions with msvc.
+#ifdef __SSE3__
+#define EIGEN_VECTORIZE_SSE3
+#endif
+#ifdef __SSSE3__
+#define EIGEN_VECTORIZE_SSSE3
+#endif
+#ifdef __SSE4_1__
+#define EIGEN_VECTORIZE_SSE4_1
+#endif
+#ifdef __SSE4_2__
+#define EIGEN_VECTORIZE_SSE4_2
+#endif
+#ifdef __AVX__
+#if !defined(EIGEN_USE_SYCL) && !EIGEN_COMP_EMSCRIPTEN
+#define EIGEN_VECTORIZE_AVX
+#endif
+#define EIGEN_VECTORIZE_SSE3
+#define EIGEN_VECTORIZE_SSSE3
+#define EIGEN_VECTORIZE_SSE4_1
+#define EIGEN_VECTORIZE_SSE4_2
+#endif
+#ifdef __AVX2__
+#ifndef EIGEN_USE_SYCL
+#define EIGEN_VECTORIZE_AVX2
+#define EIGEN_VECTORIZE_AVX
+#endif
+#define EIGEN_VECTORIZE_SSE3
+#define EIGEN_VECTORIZE_SSSE3
+#define EIGEN_VECTORIZE_SSE4_1
+#define EIGEN_VECTORIZE_SSE4_2
+#endif
+#if defined(__FMA__) || (EIGEN_COMP_MSVC && defined(__AVX2__))
+// MSVC does not expose a switch dedicated for FMA
+// For MSVC, AVX2 => FMA
+#define EIGEN_VECTORIZE_FMA
+#endif
+#if defined(__AVX512F__)
+#ifndef EIGEN_VECTORIZE_FMA
+#if EIGEN_COMP_GNUC
+#error Please add -mfma to your compiler flags: compiling with -mavx512f alone without SSE/AVX FMA is not supported (bug 1638).
+#else
+#error Please enable FMA in your compiler flags (e.g. -mfma): compiling with AVX512 alone without SSE/AVX FMA is not supported (bug 1638).
+#endif
+#endif
+#ifndef EIGEN_USE_SYCL
+#define EIGEN_VECTORIZE_AVX512
+#define EIGEN_VECTORIZE_AVX2
+#define EIGEN_VECTORIZE_AVX
+#endif
+#define EIGEN_VECTORIZE_FMA
+#define EIGEN_VECTORIZE_SSE3
+#define EIGEN_VECTORIZE_SSSE3
+#define EIGEN_VECTORIZE_SSE4_1
+#define EIGEN_VECTORIZE_SSE4_2
+#ifndef EIGEN_USE_SYCL
+#ifdef __AVX512DQ__
+#define EIGEN_VECTORIZE_AVX512DQ
+#endif
+#ifdef __AVX512ER__
+#define EIGEN_VECTORIZE_AVX512ER
+#endif
+#ifdef __AVX512BF16__
+#define EIGEN_VECTORIZE_AVX512BF16
+#endif
+#ifdef __AVX512VL__
+#define EIGEN_VECTORIZE_AVX512VL
+#endif
+#ifdef __AVX512FP16__
+#ifdef __AVX512VL__
+#define EIGEN_VECTORIZE_AVX512FP16
+// Built-in _Float16.
+#define EIGEN_HAS_BUILTIN_FLOAT16 1
+#else
+#if EIGEN_COMP_GNUC
+#error Please add -mavx512vl to your compiler flags: compiling with -mavx512fp16 alone without AVX512-VL is not supported.
+#else
+#error Please enable AVX512-VL in your compiler flags (e.g. -mavx512vl): compiling with AVX512-FP16 alone without AVX512-VL is not supported.
+#endif
+#endif
+#endif
+#endif
+#endif
+
+// Disable AVX support on broken xcode versions
+#if (EIGEN_COMP_CLANGAPPLE == 11000033) && (__MAC_OS_X_VERSION_MIN_REQUIRED == 101500)
+// A nasty bug in the clang compiler shipped with xcode in a common compilation situation
+// when XCode 11.0 and Mac deployment target macOS 10.15 is https://trac.macports.org/ticket/58776#no1
+#ifdef EIGEN_VECTORIZE_AVX
+#undef EIGEN_VECTORIZE_AVX
+#warning \
+    "Disabling AVX support: clang compiler shipped with XCode 11.[012] generates broken assembly with -macosx-version-min=10.15 and AVX enabled. "
+#ifdef EIGEN_VECTORIZE_AVX2
+#undef EIGEN_VECTORIZE_AVX2
+#endif
+#ifdef EIGEN_VECTORIZE_FMA
+#undef EIGEN_VECTORIZE_FMA
+#endif
+#ifdef EIGEN_VECTORIZE_AVX512
+#undef EIGEN_VECTORIZE_AVX512
+#endif
+#ifdef EIGEN_VECTORIZE_AVX512DQ
+#undef EIGEN_VECTORIZE_AVX512DQ
+#endif
+#ifdef EIGEN_VECTORIZE_AVX512ER
+#undef EIGEN_VECTORIZE_AVX512ER
+#endif
+#endif
+// NOTE: Confirmed test failures in XCode 11.0, and XCode 11.2 with  -macosx-version-min=10.15 and AVX
+// NOTE using -macosx-version-min=10.15 with Xcode 11.0 results in runtime segmentation faults in many tests, 11.2
+// produce core dumps in 3 tests NOTE using -macosx-version-min=10.14 produces functioning and passing tests in all
+// cases NOTE __clang_version__ "11.0.0 (clang-1100.0.33.8)"  XCode 11.0 <- Produces many segfault and core dumping
+// tests
+//                                                                    with  -macosx-version-min=10.15 and AVX
+// NOTE __clang_version__ "11.0.0 (clang-1100.0.33.12)" XCode 11.2 <- Produces 3 core dumping tests with
+//                                                                    -macosx-version-min=10.15 and AVX
+#endif
+
+// include files
+
+// This extern "C" works around a MINGW-w64 compilation issue
+// https://sourceforge.net/tracker/index.php?func=detail&aid=3018394&group_id=202880&atid=983354
+// In essence, intrin.h is included by windows.h and also declares intrinsics (just as emmintrin.h etc. below do).
+// However, intrin.h uses an extern "C" declaration, and g++ thus complains of duplicate declarations
+// with conflicting linkage.  The linkage for intrinsics doesn't matter, but at that stage the compiler doesn't know;
+// so, to avoid compile errors when windows.h is included after Eigen/Core, ensure intrinsics are extern "C" here too.
+// notice that since these are C headers, the extern "C" is theoretically needed anyways.
+extern "C" {
+// In theory we should only include immintrin.h and not the other *mmintrin.h header files directly.
+// Doing so triggers some issues with ICC. However old gcc versions seems to not have this file, thus:
+#if EIGEN_COMP_ICC >= 1110 || EIGEN_COMP_EMSCRIPTEN
+#include <immintrin.h>
+#else
+#include <mmintrin.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+#ifdef EIGEN_VECTORIZE_SSE3
+#include <pmmintrin.h>
+#endif
+#ifdef EIGEN_VECTORIZE_SSSE3
+#include <tmmintrin.h>
+#endif
+#ifdef EIGEN_VECTORIZE_SSE4_1
+#include <smmintrin.h>
+#endif
+#ifdef EIGEN_VECTORIZE_SSE4_2
+#include <nmmintrin.h>
+#endif
+#if defined(EIGEN_VECTORIZE_AVX) || defined(EIGEN_VECTORIZE_AVX512)
+#include <immintrin.h>
+#endif
+#endif
+}  // end extern "C"
+
+#elif defined(__VSX__) && !defined(__APPLE__)
+
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_VSX 1
+#define EIGEN_VECTORIZE_FMA
+#include <altivec.h>
+// We need to #undef all these ugly tokens defined in <altivec.h>
+// => use __vector instead of vector
+#undef bool
+#undef vector
+#undef pixel
+
+#elif defined __ALTIVEC__
+
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_ALTIVEC
+#define EIGEN_VECTORIZE_FMA
+#include <altivec.h>
+// We need to #undef all these ugly tokens defined in <altivec.h>
+// => use __vector instead of vector
+#undef bool
+#undef vector
+#undef pixel
+
+#elif ((defined __ARM_NEON) || (defined __ARM_NEON__)) && !(defined EIGEN_ARM64_USE_SVE)
+
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_NEON
+#include <arm_neon.h>
+
+// We currently require SVE to be enabled explicitly via EIGEN_ARM64_USE_SVE and
+// will not select the backend automatically
+#elif (defined __ARM_FEATURE_SVE) && (defined EIGEN_ARM64_USE_SVE)
+
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_SVE
+#include <arm_sve.h>
+
+// Since we depend on knowing SVE vector lengths at compile-time, we need
+// to ensure a fixed lengths is set
+#if defined __ARM_FEATURE_SVE_BITS
+#define EIGEN_ARM64_SVE_VL __ARM_FEATURE_SVE_BITS
+#else
+#error "Eigen requires a fixed SVE lector length but EIGEN_ARM64_SVE_VL is not set."
+#endif
+
+#elif (defined __s390x__ && defined __VEC__)
+
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_ZVECTOR
+#include <vecintrin.h>
+
+#elif defined __mips_msa
+
+// Limit MSA optimizations to little-endian CPUs for now.
+// TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs?
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#if defined(__LP64__)
+#define EIGEN_MIPS_64
+#else
+#define EIGEN_MIPS_32
+#endif
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_MSA
+#include <msa.h>
+#endif
+
+#elif (defined __loongarch64 && defined __loongarch_sx)
+
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_LSX
+#include <lsxintrin.h>
+
+#elif defined __HVX__ && (__HVX_LENGTH__ == 128)
+
+#define EIGEN_VECTORIZE
+#define EIGEN_VECTORIZE_HVX
+#include <hexagon_types.h>
+
+#endif
+#endif
+
+// Following the Arm ACLE arm_neon.h should also include arm_fp16.h but not all
+// compilers seem to follow this. We therefore include it explicitly.
+// See also: https://bugs.llvm.org/show_bug.cgi?id=47955
+#if defined(EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC)
+#include <arm_fp16.h>
+#endif
+
+// Enable FMA for ARM.
+#if defined(__ARM_FEATURE_FMA)
+#define EIGEN_VECTORIZE_FMA
+#endif
+
+#if defined(__F16C__) && !defined(EIGEN_GPUCC) && (!EIGEN_COMP_CLANG_STRICT || EIGEN_CLANG_STRICT_AT_LEAST(3, 8, 0))
+// We can use the optimized fp16 to float and float to fp16 conversion routines
+#define EIGEN_HAS_FP16_C
+
+#if EIGEN_COMP_GNUC
+// Make sure immintrin.h is included, even if e.g. vectorization is
+// explicitly disabled (see also issue #2395).
+// Note that FP16C intrinsics for gcc and clang are included by immintrin.h,
+// as opposed to emmintrin.h as suggested by Intel:
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#othertechs=FP16C&expand=1711
+#include <immintrin.h>
+#endif
+#endif
+
+#if defined EIGEN_CUDACC
+#define EIGEN_VECTORIZE_GPU
+#include <vector_types.h>
+#if EIGEN_CUDA_SDK_VER >= 70500
+#define EIGEN_HAS_CUDA_FP16
+#endif
+#endif
+
+#if defined(EIGEN_HAS_CUDA_FP16)
+#include <cuda_runtime_api.h>
+#include <cuda_fp16.h>
+#endif
+
+#if defined(EIGEN_HIPCC)
+#define EIGEN_VECTORIZE_GPU
+#include <hip/hip_vector_types.h>
+#define EIGEN_HAS_HIP_FP16
+#include <hip/hip_fp16.h>
+#define EIGEN_HAS_HIP_BF16
+#include <hip/hip_bfloat16.h>
+#endif
+
+/** \brief Namespace containing all symbols from the %Eigen library. */
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+inline static const char *SimdInstructionSetsInUse(void) {
+#if defined(EIGEN_VECTORIZE_AVX512)
+  return "AVX512, FMA, AVX2, AVX, SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_AVX)
+  return "AVX SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_2)
+  return "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2";
+#elif defined(EIGEN_VECTORIZE_SSE4_1)
+  return "SSE, SSE2, SSE3, SSSE3, SSE4.1";
+#elif defined(EIGEN_VECTORIZE_SSSE3)
+  return "SSE, SSE2, SSE3, SSSE3";
+#elif defined(EIGEN_VECTORIZE_SSE3)
+  return "SSE, SSE2, SSE3";
+#elif defined(EIGEN_VECTORIZE_SSE2)
+  return "SSE, SSE2";
+#elif defined(EIGEN_VECTORIZE_ALTIVEC)
+  return "AltiVec";
+#elif defined(EIGEN_VECTORIZE_VSX)
+  return "VSX";
+#elif defined(EIGEN_VECTORIZE_NEON)
+  return "ARM NEON";
+#elif defined(EIGEN_VECTORIZE_SVE)
+  return "ARM SVE";
+#elif defined(EIGEN_VECTORIZE_ZVECTOR)
+  return "S390X ZVECTOR";
+#elif defined(EIGEN_VECTORIZE_MSA)
+  return "MIPS MSA";
+#elif defined(EIGEN_VECTORIZE_LSX)
+  return "LOONGARCH64 LSX";
+#else
+  return "None";
+#endif
+}
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CONFIGURE_VECTORIZATION_H
diff --git a/inst/include/Eigen/src/Core/util/Constants.h b/inst/include/Eigen/src/Core/util/Constants.h
index 1e6277c4..fcc2db82 100644
--- a/inst/include/Eigen/src/Core/util/Constants.h
+++ b/inst/include/Eigen/src/Core/util/Constants.h
@@ -1,8 +1,9 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2007-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2020, Arm Limited and Contributors
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -11,225 +12,281 @@
 #ifndef EIGEN_CONSTANTS_H
 #define EIGEN_CONSTANTS_H
 
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
 namespace Eigen {
 
 /** This value means that a positive quantity (e.g., a size) is not known at compile-time, and that instead the value is
-  * stored in some runtime variable.
-  *
-  * Changing the value of Dynamic breaks the ABI, as Dynamic is often used as a template parameter for Matrix.
-  */
+ * stored in some runtime variable.
+ *
+ * Changing the value of Dynamic breaks the ABI, as Dynamic is often used as a template parameter for Matrix.
+ */
 const int Dynamic = -1;
 
-/** This value means that a signed quantity (e.g., a signed index) is not known at compile-time, and that instead its value
-  * has to be specified at runtime.
-  */
+/** This value means that a signed quantity (e.g., a signed index) is not known at compile-time, and that instead its
+ * value has to be specified at runtime.
+ */
 const int DynamicIndex = 0xffffff;
 
+/** This value means that the requested value is not defined.
+ */
+const int Undefined = 0xfffffe;
+
 /** This value means +Infinity; it is currently used only as the p parameter to MatrixBase::lpNorm<int>().
-  * The value Infinity there means the L-infinity norm.
-  */
+ * The value Infinity there means the L-infinity norm.
+ */
 const int Infinity = -1;
 
+/** This value means that the cost to evaluate an expression coefficient is either very expensive or
+ * cannot be known at compile time.
+ *
+ * This value has to be positive to (1) simplify cost computation, and (2) allow to distinguish between a very expensive
+ * and very very expensive expressions. It thus must also be large enough to make sure unrolling won't happen and that
+ * sub expressions will be evaluated, but not too large to avoid overflow.
+ */
+const int HugeCost = 10000;
+
 /** \defgroup flags Flags
-  * \ingroup Core_Module
-  *
-  * These are the possible bits which can be OR'ed to constitute the flags of a matrix or
-  * expression.
-  *
-  * It is important to note that these flags are a purely compile-time notion. They are a compile-time property of
-  * an expression type, implemented as enum's. They are not stored in memory at runtime, and they do not incur any
-  * runtime overhead.
-  *
-  * \sa MatrixBase::Flags
-  */
+ * \ingroup Core_Module
+ *
+ * These are the possible bits which can be OR'ed to constitute the flags of a matrix or
+ * expression.
+ *
+ * It is important to note that these flags are a purely compile-time notion. They are a compile-time property of
+ * an expression type, implemented as enum's. They are not stored in memory at runtime, and they do not incur any
+ * runtime overhead.
+ *
+ * \sa MatrixBase::Flags
+ */
 
 /** \ingroup flags
-  *
-  * for a matrix, this means that the storage order is row-major.
-  * If this bit is not set, the storage order is column-major.
-  * For an expression, this determines the storage order of
-  * the matrix created by evaluation of that expression. 
-  * \sa \ref TopicStorageOrders */
+ *
+ * for a matrix, this means that the storage order is row-major.
+ * If this bit is not set, the storage order is column-major.
+ * For an expression, this determines the storage order of
+ * the matrix created by evaluation of that expression.
+ * \sa \blank  \ref TopicStorageOrders */
 const unsigned int RowMajorBit = 0x1;
 
 /** \ingroup flags
-  *
-  * means the expression should be evaluated by the calling expression */
+ * means the expression should be evaluated by the calling expression */
 const unsigned int EvalBeforeNestingBit = 0x2;
 
 /** \ingroup flags
-  *
-  * means the expression should be evaluated before any assignment */
-const unsigned int EvalBeforeAssigningBit = 0x4;
+ * \deprecated
+ * means the expression should be evaluated before any assignment */
+EIGEN_DEPRECATED const unsigned int EvalBeforeAssigningBit = 0x4;  // FIXME deprecated
 
 /** \ingroup flags
-  *
-  * Short version: means the expression might be vectorized
-  *
-  * Long version: means that the coefficients can be handled by packets
-  * and start at a memory location whose alignment meets the requirements
-  * of the present CPU architecture for optimized packet access. In the fixed-size
-  * case, there is the additional condition that it be possible to access all the
-  * coefficients by packets (this implies the requirement that the size be a multiple of 16 bytes,
-  * and that any nontrivial strides don't break the alignment). In the dynamic-size case,
-  * there is no such condition on the total size and strides, so it might not be possible to access
-  * all coeffs by packets.
-  *
-  * \note This bit can be set regardless of whether vectorization is actually enabled.
-  *       To check for actual vectorizability, see \a ActualPacketAccessBit.
-  */
+ *
+ * Short version: means the expression might be vectorized
+ *
+ * Long version: means that the coefficients can be handled by packets
+ * and start at a memory location whose alignment meets the requirements
+ * of the present CPU architecture for optimized packet access. In the fixed-size
+ * case, there is the additional condition that it be possible to access all the
+ * coefficients by packets (this implies the requirement that the size be a multiple of 16 bytes,
+ * and that any nontrivial strides don't break the alignment). In the dynamic-size case,
+ * there is no such condition on the total size and strides, so it might not be possible to access
+ * all coeffs by packets.
+ *
+ * \note This bit can be set regardless of whether vectorization is actually enabled.
+ *       To check for actual vectorizability, see \a ActualPacketAccessBit.
+ */
 const unsigned int PacketAccessBit = 0x8;
 
 #ifdef EIGEN_VECTORIZE
 /** \ingroup flags
-  *
-  * If vectorization is enabled (EIGEN_VECTORIZE is defined) this constant
-  * is set to the value \a PacketAccessBit.
-  *
-  * If vectorization is not enabled (EIGEN_VECTORIZE is not defined) this constant
-  * is set to the value 0.
-  */
+ *
+ * If vectorization is enabled (EIGEN_VECTORIZE is defined) this constant
+ * is set to the value \a PacketAccessBit.
+ *
+ * If vectorization is not enabled (EIGEN_VECTORIZE is not defined) this constant
+ * is set to the value 0.
+ */
 const unsigned int ActualPacketAccessBit = PacketAccessBit;
 #else
 const unsigned int ActualPacketAccessBit = 0x0;
 #endif
 
 /** \ingroup flags
-  *
-  * Short version: means the expression can be seen as 1D vector.
-  *
-  * Long version: means that one can access the coefficients
-  * of this expression by coeff(int), and coeffRef(int) in the case of a lvalue expression. These
-  * index-based access methods are guaranteed
-  * to not have to do any runtime computation of a (row, col)-pair from the index, so that it
-  * is guaranteed that whenever it is available, index-based access is at least as fast as
-  * (row,col)-based access. Expressions for which that isn't possible don't have the LinearAccessBit.
-  *
-  * If both PacketAccessBit and LinearAccessBit are set, then the
-  * packets of this expression can be accessed by packet(int), and writePacket(int) in the case of a
-  * lvalue expression.
-  *
-  * Typically, all vector expressions have the LinearAccessBit, but there is one exception:
-  * Product expressions don't have it, because it would be troublesome for vectorization, even when the
-  * Product is a vector expression. Thus, vector Product expressions allow index-based coefficient access but
-  * not index-based packet access, so they don't have the LinearAccessBit.
-  */
+ *
+ * Short version: means the expression can be seen as 1D vector.
+ *
+ * Long version: means that one can access the coefficients
+ * of this expression by coeff(int), and coeffRef(int) in the case of a lvalue expression. These
+ * index-based access methods are guaranteed
+ * to not have to do any runtime computation of a (row, col)-pair from the index, so that it
+ * is guaranteed that whenever it is available, index-based access is at least as fast as
+ * (row,col)-based access. Expressions for which that isn't possible don't have the LinearAccessBit.
+ *
+ * If both PacketAccessBit and LinearAccessBit are set, then the
+ * packets of this expression can be accessed by packet(int), and writePacket(int) in the case of a
+ * lvalue expression.
+ *
+ * Typically, all vector expressions have the LinearAccessBit, but there is one exception:
+ * Product expressions don't have it, because it would be troublesome for vectorization, even when the
+ * Product is a vector expression. Thus, vector Product expressions allow index-based coefficient access but
+ * not index-based packet access, so they don't have the LinearAccessBit.
+ */
 const unsigned int LinearAccessBit = 0x10;
 
 /** \ingroup flags
-  *
-  * Means the expression has a coeffRef() method, i.e. is writable as its individual coefficients are directly addressable.
-  * This rules out read-only expressions.
-  *
-  * Note that DirectAccessBit and LvalueBit are mutually orthogonal, as there are examples of expression having one but note
-  * the other:
-  *   \li writable expressions that don't have a very simple memory layout as a strided array, have LvalueBit but not DirectAccessBit
-  *   \li Map-to-const expressions, for example Map<const Matrix>, have DirectAccessBit but not LvalueBit
-  *
-  * Expressions having LvalueBit also have their coeff() method returning a const reference instead of returning a new value.
-  */
+ *
+ * Means the expression has a coeffRef() method, i.e. is writable as its individual coefficients are directly
+ * addressable. This rules out read-only expressions.
+ *
+ * Note that DirectAccessBit and LvalueBit are mutually orthogonal, as there are examples of expression having one but
+ * not the other: \li writable expressions that don't have a very simple memory layout as a strided array, have
+ * LvalueBit but not DirectAccessBit \li Map-to-const expressions, for example Map<const Matrix>, have DirectAccessBit
+ * but not LvalueBit
+ *
+ * Expressions having LvalueBit also have their coeff() method returning a const reference instead of returning a new
+ * value.
+ */
 const unsigned int LvalueBit = 0x20;
 
 /** \ingroup flags
-  *
-  * Means that the underlying array of coefficients can be directly accessed as a plain strided array. The memory layout
-  * of the array of coefficients must be exactly the natural one suggested by rows(), cols(),
-  * outerStride(), innerStride(), and the RowMajorBit. This rules out expressions such as Diagonal, whose coefficients,
-  * though referencable, do not have such a regular memory layout.
-  *
-  * See the comment on LvalueBit for an explanation of how LvalueBit and DirectAccessBit are mutually orthogonal.
-  */
+ *
+ * Means that the underlying array of coefficients can be directly accessed as a plain strided array. The memory layout
+ * of the array of coefficients must be exactly the natural one suggested by rows(), cols(),
+ * outerStride(), innerStride(), and the RowMajorBit. This rules out expressions such as Diagonal, whose coefficients,
+ * though referenceable, do not have such a regular memory layout.
+ *
+ * See the comment on LvalueBit for an explanation of how LvalueBit and DirectAccessBit are mutually orthogonal.
+ */
 const unsigned int DirectAccessBit = 0x40;
 
-/** \ingroup flags
-  *
-  * means the first coefficient packet is guaranteed to be aligned */
-const unsigned int AlignedBit = 0x80;
+/** \deprecated \ingroup flags
+ *
+ * means the first coefficient packet is guaranteed to be aligned.
+ * An expression cannot have the AlignedBit without the PacketAccessBit flag.
+ * In other words, this means we are allow to perform an aligned packet access to the first element regardless
+ * of the expression kind:
+ * \code
+ * expression.packet<Aligned>(0);
+ * \endcode
+ */
+EIGEN_DEPRECATED const unsigned int AlignedBit = 0x80;
 
 const unsigned int NestByRefBit = 0x100;
 
-// list of flags that are inherited by default
-const unsigned int HereditaryBits = RowMajorBit
-                                  | EvalBeforeNestingBit
-                                  | EvalBeforeAssigningBit;
+/** \ingroup flags
+ *
+ * for an expression, this means that the storage order
+ * can be either row-major or column-major.
+ * The precise choice will be decided at evaluation time or when
+ * combined with other expressions.
+ * \sa \blank  \ref RowMajorBit, \ref TopicStorageOrders */
+const unsigned int NoPreferredStorageOrderBit = 0x200;
 
-/** \defgroup enums Enumerations
-  * \ingroup Core_Module
+/** \ingroup flags
   *
-  * Various enumerations used in %Eigen. Many of these are used as template parameters.
+  * Means that the underlying coefficients can be accessed through pointers to the sparse (un)compressed storage format,
+  * that is, the expression provides:
+  * \code
+    inline const Scalar* valuePtr() const;
+    inline const Index* innerIndexPtr() const;
+    inline const Index* outerIndexPtr() const;
+    inline const Index* innerNonZeroPtr() const;
+    \endcode
   */
+const unsigned int CompressedAccessBit = 0x400;
+
+// list of flags that are inherited by default
+const unsigned int HereditaryBits = RowMajorBit | EvalBeforeNestingBit;
+
+/** \defgroup enums Enumerations
+ * \ingroup Core_Module
+ *
+ * Various enumerations used in %Eigen. Many of these are used as template parameters.
+ */
 
 /** \ingroup enums
-  * Enum containing possible values for the \p Mode parameter of 
-  * MatrixBase::selfadjointView() and MatrixBase::triangularView(). */
-enum {
+ * Enum containing possible values for the \c Mode or \c UpLo parameter of
+ * MatrixBase::selfadjointView() and MatrixBase::triangularView(), and selfadjoint solvers. */
+enum UpLoType {
   /** View matrix as a lower triangular matrix. */
-  Lower=0x1,                      
+  Lower = 0x1,
   /** View matrix as an upper triangular matrix. */
-  Upper=0x2,                      
+  Upper = 0x2,
   /** %Matrix has ones on the diagonal; to be used in combination with #Lower or #Upper. */
-  UnitDiag=0x4, 
+  UnitDiag = 0x4,
   /** %Matrix has zeros on the diagonal; to be used in combination with #Lower or #Upper. */
-  ZeroDiag=0x8,
+  ZeroDiag = 0x8,
   /** View matrix as a lower triangular matrix with ones on the diagonal. */
-  UnitLower=UnitDiag|Lower, 
+  UnitLower = UnitDiag | Lower,
   /** View matrix as an upper triangular matrix with ones on the diagonal. */
-  UnitUpper=UnitDiag|Upper,
+  UnitUpper = UnitDiag | Upper,
   /** View matrix as a lower triangular matrix with zeros on the diagonal. */
-  StrictlyLower=ZeroDiag|Lower, 
+  StrictlyLower = ZeroDiag | Lower,
   /** View matrix as an upper triangular matrix with zeros on the diagonal. */
-  StrictlyUpper=ZeroDiag|Upper,
+  StrictlyUpper = ZeroDiag | Upper,
   /** Used in BandMatrix and SelfAdjointView to indicate that the matrix is self-adjoint. */
-  SelfAdjoint=0x10,
+  SelfAdjoint = 0x10,
   /** Used to support symmetric, non-selfadjoint, complex matrices. */
-  Symmetric=0x20
+  Symmetric = 0x20
 };
 
 /** \ingroup enums
-  * Enum for indicating whether an object is aligned or not. */
-enum { 
-  /** Object is not correctly aligned for vectorization. */
-  Unaligned=0, 
-  /** Object is aligned for vectorization. */
-  Aligned=1 
+ * Enum for indicating whether a buffer is aligned or not. */
+enum AlignmentType {
+  Unaligned = 0,    /**< Data pointer has no specific alignment. */
+  Aligned8 = 8,     /**< Data pointer is aligned on a 8 bytes boundary. */
+  Aligned16 = 16,   /**< Data pointer is aligned on a 16 bytes boundary. */
+  Aligned32 = 32,   /**< Data pointer is aligned on a 32 bytes boundary. */
+  Aligned64 = 64,   /**< Data pointer is aligned on a 64 bytes boundary. */
+  Aligned128 = 128, /**< Data pointer is aligned on a 128 bytes boundary. */
+  AlignedMask = 255,
+  Aligned = 16, /**< \deprecated Synonym for Aligned16. */
+#if EIGEN_MAX_ALIGN_BYTES == 128
+  AlignedMax = Aligned128
+#elif EIGEN_MAX_ALIGN_BYTES == 64
+  AlignedMax = Aligned64
+#elif EIGEN_MAX_ALIGN_BYTES == 32
+  AlignedMax = Aligned32
+#elif EIGEN_MAX_ALIGN_BYTES == 16
+  AlignedMax = Aligned16
+#elif EIGEN_MAX_ALIGN_BYTES == 8
+  AlignedMax = Aligned8
+#elif EIGEN_MAX_ALIGN_BYTES == 0
+  AlignedMax = Unaligned
+#else
+#error Invalid value for EIGEN_MAX_ALIGN_BYTES
+#endif
 };
 
 /** \ingroup enums
- * Enum used by DenseBase::corner() in Eigen2 compatibility mode. */
-// FIXME after the corner() API change, this was not needed anymore, except by AlignedBox
-// TODO: find out what to do with that. Adapt the AlignedBox API ?
-enum CornerType { TopLeft, TopRight, BottomLeft, BottomRight };
-
-/** \ingroup enums
-  * Enum containing possible values for the \p Direction parameter of
-  * Reverse, PartialReduxExpr and VectorwiseOp. */
-enum DirectionType { 
-  /** For Reverse, all columns are reversed; 
-    * for PartialReduxExpr and VectorwiseOp, act on columns. */
-  Vertical, 
-  /** For Reverse, all rows are reversed; 
-    * for PartialReduxExpr and VectorwiseOp, act on rows. */
-  Horizontal, 
-  /** For Reverse, both rows and columns are reversed; 
-    * not used for PartialReduxExpr and VectorwiseOp. */
-  BothDirections 
+ * Enum containing possible values for the \p Direction parameter of
+ * Reverse, PartialReduxExpr and VectorwiseOp. */
+enum DirectionType {
+  /** For Reverse, all columns are reversed;
+   * for PartialReduxExpr and VectorwiseOp, act on columns. */
+  Vertical,
+  /** For Reverse, all rows are reversed;
+   * for PartialReduxExpr and VectorwiseOp, act on rows. */
+  Horizontal,
+  /** For Reverse, both rows and columns are reversed;
+   * not used for PartialReduxExpr and VectorwiseOp. */
+  BothDirections
 };
 
 /** \internal \ingroup enums
-  * Enum to specify how to traverse the entries of a matrix. */
-enum {
+ * Enum to specify how to traverse the entries of a matrix. */
+enum TraversalType {
   /** \internal Default traversal, no vectorization, no index-based access */
   DefaultTraversal,
   /** \internal No vectorization, use index-based access to have only one for loop instead of 2 nested loops */
   LinearTraversal,
   /** \internal Equivalent to a slice vectorization for fixed-size matrices having good alignment
-    * and good size */
+   * and good size */
   InnerVectorizedTraversal,
   /** \internal Vectorization path using a single loop plus scalar loops for the
-    * unaligned boundaries */
+   * unaligned boundaries */
   LinearVectorizedTraversal,
   /** \internal Generic vectorization path using one vectorized loop per row/column with some
-    * scalar loops to handle the unaligned boundaries */
+   * scalar loops to handle the unaligned boundaries */
   SliceVectorizedTraversal,
   /** \internal Special case to properly handle incompatible scalar types or other defecting cases*/
   InvalidTraversal,
@@ -238,28 +295,25 @@ enum {
 };
 
 /** \internal \ingroup enums
-  * Enum to specify whether to unroll loops when traversing over the entries of a matrix. */
-enum {
+ * Enum to specify whether to unroll loops when traversing over the entries of a matrix. */
+enum UnrollingType {
   /** \internal Do not unroll loops. */
   NoUnrolling,
   /** \internal Unroll only the inner loop, but not the outer loop. */
   InnerUnrolling,
-  /** \internal Unroll both the inner and the outer loop. If there is only one loop, 
-    * because linear traversal is used, then unroll that loop. */
+  /** \internal Unroll both the inner and the outer loop. If there is only one loop,
+   * because linear traversal is used, then unroll that loop. */
   CompleteUnrolling
 };
 
 /** \internal \ingroup enums
-  * Enum to specify whether to use the default (built-in) implementation or the specialization. */
-enum {
-  Specialized,
-  BuiltIn
-};
+ * Enum to specify whether to use the default (built-in) implementation or the specialization. */
+enum SpecializedType { Specialized, BuiltIn };
 
 /** \ingroup enums
-  * Enum containing possible values for the \p _Options template parameter of
-  * Matrix, Array and BandMatrix. */
-enum {
+ * Enum containing possible values for the \p Options_ template parameter of
+ * Matrix, Array and BandMatrix. */
+enum StorageOptions {
   /** Storage order is column major (see \ref TopicStorageOrders). */
   ColMajor = 0,
   /** Storage order is row major (see \ref TopicStorageOrders). */
@@ -271,12 +325,23 @@ enum {
 };
 
 /** \ingroup enums
-  * Enum for specifying whether to apply or solve on the left or right. */
-enum {
+ * Enum for specifying whether to apply or solve on the left or right. */
+enum SideType {
   /** Apply transformation on the left. */
-  OnTheLeft = 1,  
+  OnTheLeft = 1,
   /** Apply transformation on the right. */
-  OnTheRight = 2  
+  OnTheRight = 2
+};
+
+/** \ingroup enums
+ * Enum for specifying NaN-propagation behavior, e.g. for coeff-wise min/max. */
+enum NaNPropagationOptions {
+  /**  Implementation defined behavior if NaNs are present. */
+  PropagateFast = 0,
+  /**  Always propagate NaNs. */
+  PropagateNaN,
+  /**  Always propagate not-NaNs. */
+  PropagateNumbers
 };
 
 /* the following used to be written as:
@@ -286,83 +351,82 @@ enum {
  *     EIGEN_UNUSED NoChange_t NoChange;
  *   }
  *
- * on the ground that it feels dangerous to disambiguate overloaded functions on enum/integer types.  
+ * on the ground that it feels dangerous to disambiguate overloaded functions on enum/integer types.
  * However, this leads to "variable declared but never referenced" warnings on Intel Composer XE,
  * and we do not know how to get rid of them (bug 450).
  */
 
-enum NoChange_t   { NoChange };
+enum NoChange_t { NoChange };
 enum Sequential_t { Sequential };
-enum Default_t    { Default };
+enum Default_t { Default };
 
 /** \internal \ingroup enums
-  * Used in AmbiVector. */
-enum {
-  IsDense         = 0,
-  IsSparse
-};
+ * Used in AmbiVector. */
+enum AmbiVectorMode { IsDense = 0, IsSparse };
 
 /** \ingroup enums
-  * Used as template parameter in DenseCoeffBase and MapBase to indicate 
-  * which accessors should be provided. */
+ * Used as template parameter in DenseCoeffBase and MapBase to indicate
+ * which accessors should be provided. */
 enum AccessorLevels {
   /** Read-only access via a member function. */
-  ReadOnlyAccessors, 
+  ReadOnlyAccessors,
   /** Read/write access via member functions. */
-  WriteAccessors, 
+  WriteAccessors,
   /** Direct read-only access to the coefficients. */
-  DirectAccessors, 
+  DirectAccessors,
   /** Direct read/write access to the coefficients. */
   DirectWriteAccessors
 };
 
 /** \ingroup enums
-  * Enum with options to give to various decompositions. */
+ * Enum with options to give to various decompositions. */
 enum DecompositionOptions {
   /** \internal Not used (meant for LDLT?). */
-  Pivoting            = 0x01, 
+  Pivoting = 0x01,
   /** \internal Not used (meant for LDLT?). */
-  NoPivoting          = 0x02, 
+  NoPivoting = 0x02,
   /** Used in JacobiSVD to indicate that the square matrix U is to be computed. */
-  ComputeFullU        = 0x04,
+  ComputeFullU = 0x04,
   /** Used in JacobiSVD to indicate that the thin matrix U is to be computed. */
-  ComputeThinU        = 0x08,
+  ComputeThinU = 0x08,
   /** Used in JacobiSVD to indicate that the square matrix V is to be computed. */
-  ComputeFullV        = 0x10,
+  ComputeFullV = 0x10,
   /** Used in JacobiSVD to indicate that the thin matrix V is to be computed. */
-  ComputeThinV        = 0x20,
+  ComputeThinV = 0x20,
   /** Used in SelfAdjointEigenSolver and GeneralizedSelfAdjointEigenSolver to specify
-    * that only the eigenvalues are to be computed and not the eigenvectors. */
-  EigenvaluesOnly     = 0x40,
+   * that only the eigenvalues are to be computed and not the eigenvectors. */
+  EigenvaluesOnly = 0x40,
   /** Used in SelfAdjointEigenSolver and GeneralizedSelfAdjointEigenSolver to specify
-    * that both the eigenvalues and the eigenvectors are to be computed. */
+   * that both the eigenvalues and the eigenvectors are to be computed. */
   ComputeEigenvectors = 0x80,
   /** \internal */
   EigVecMask = EigenvaluesOnly | ComputeEigenvectors,
   /** Used in GeneralizedSelfAdjointEigenSolver to indicate that it should
-    * solve the generalized eigenproblem \f$ Ax = \lambda B x \f$. */
-  Ax_lBx              = 0x100,
+   * solve the generalized eigenproblem \f$ Ax = \lambda B x \f$. */
+  Ax_lBx = 0x100,
   /** Used in GeneralizedSelfAdjointEigenSolver to indicate that it should
-    * solve the generalized eigenproblem \f$ ABx = \lambda x \f$. */
-  ABx_lx              = 0x200,
+   * solve the generalized eigenproblem \f$ ABx = \lambda x \f$. */
+  ABx_lx = 0x200,
   /** Used in GeneralizedSelfAdjointEigenSolver to indicate that it should
-    * solve the generalized eigenproblem \f$ BAx = \lambda x \f$. */
-  BAx_lx              = 0x400,
+   * solve the generalized eigenproblem \f$ BAx = \lambda x \f$. */
+  BAx_lx = 0x400,
   /** \internal */
   GenEigMask = Ax_lBx | ABx_lx | BAx_lx
 };
 
 /** \ingroup enums
-  * Possible values for the \p QRPreconditioner template parameter of JacobiSVD. */
+ * Possible values for the \p QRPreconditioner template parameter of JacobiSVD. */
 enum QRPreconditioners {
+  /** Use a QR decomposition with column pivoting as the first step. */
+  ColPivHouseholderQRPreconditioner = 0x0,
   /** Do not specify what is to be done if the SVD of a non-square matrix is asked for. */
-  NoQRPreconditioner,
+  NoQRPreconditioner = 0x40,
   /** Use a QR decomposition without pivoting as the first step. */
-  HouseholderQRPreconditioner,
-  /** Use a QR decomposition with column pivoting as the first step. */
-  ColPivHouseholderQRPreconditioner,
+  HouseholderQRPreconditioner = 0x80,
   /** Use a QR decomposition with full pivoting as the first step. */
-  FullPivHouseholderQRPreconditioner
+  FullPivHouseholderQRPreconditioner = 0xC0,
+  /** Used to disable the QR Preconditioner in BDCSVD. */
+  DisableQRDecomposition = NoQRPreconditioner
 };
 
 #ifdef Success
@@ -370,82 +434,165 @@ enum QRPreconditioners {
 #endif
 
 /** \ingroup enums
-  * Enum for reporting the status of a computation. */
+ * Enum for reporting the status of a computation. */
 enum ComputationInfo {
   /** Computation was successful. */
-  Success = 0,        
+  Success = 0,
   /** The provided data did not satisfy the prerequisites. */
-  NumericalIssue = 1, 
+  NumericalIssue = 1,
   /** Iterative procedure did not converge. */
   NoConvergence = 2,
   /** The inputs are invalid, or the algorithm has been improperly called.
-    * When assertions are enabled, such errors trigger an assert. */
+   * When assertions are enabled, such errors trigger an assert. */
   InvalidInput = 3
 };
 
 /** \ingroup enums
-  * Enum used to specify how a particular transformation is stored in a matrix.
-  * \sa Transform, Hyperplane::transform(). */
+ * Enum used to specify how a particular transformation is stored in a matrix.
+ * \sa Transform, Hyperplane::transform(). */
 enum TransformTraits {
   /** Transformation is an isometry. */
-  Isometry      = 0x1,
-  /** Transformation is an affine transformation stored as a (Dim+1)^2 matrix whose last row is 
-    * assumed to be [0 ... 0 1]. */
-  Affine        = 0x2,
+  Isometry = 0x1,
+  /** Transformation is an affine transformation stored as a (Dim+1)^2 matrix whose last row is
+   * assumed to be [0 ... 0 1]. */
+  Affine = 0x2,
   /** Transformation is an affine transformation stored as a (Dim) x (Dim+1) matrix. */
   AffineCompact = 0x10 | Affine,
   /** Transformation is a general projective transformation stored as a (Dim+1)^2 matrix. */
-  Projective    = 0x20
+  Projective = 0x20
 };
 
 /** \internal \ingroup enums
-  * Enum used to choose between implementation depending on the computer architecture. */
-namespace Architecture
-{
-  enum Type {
-    Generic = 0x0,
-    SSE = 0x1,
-    AltiVec = 0x2,
+ * Enum used to choose between implementation depending on the computer architecture. */
+namespace Architecture {
+enum Type {
+  Generic = 0x0,
+  SSE = 0x1,
+  AltiVec = 0x2,
+  VSX = 0x3,
+  NEON = 0x4,
+  MSA = 0x5,
+  SVE = 0x6,
+  HVX = 0x7,
+  LSX = 0x8,
 #if defined EIGEN_VECTORIZE_SSE
-    Target = SSE
+  Target = SSE
 #elif defined EIGEN_VECTORIZE_ALTIVEC
-    Target = AltiVec
+  Target = AltiVec
+#elif defined EIGEN_VECTORIZE_VSX
+  Target = VSX
+#elif defined EIGEN_VECTORIZE_NEON
+  Target = NEON
+#elif defined EIGEN_VECTORIZE_SVE
+  Target = SVE
+#elif defined EIGEN_VECTORIZE_MSA
+  Target = MSA
+#elif defined EIGEN_VECTORIZE_HVX
+  Target = HVX
+#elif defined EIGEN_VECTORIZE_LSX
+  Target = LSX
 #else
-    Target = Generic
+  Target = Generic
 #endif
-  };
-}
+};
+}  // namespace Architecture
 
 /** \internal \ingroup enums
-  * Enum used as template parameter in GeneralProduct. */
-enum { CoeffBasedProductMode, LazyCoeffBasedProductMode, OuterProduct, InnerProduct, GemvProduct, GemmProduct };
+ * Enum used as template parameter in Product and product evaluators. */
+enum ProductImplType {
+  DefaultProduct = 0,
+  LazyProduct,
+  AliasFreeProduct,
+  CoeffBasedProductMode,
+  LazyCoeffBasedProductMode,
+  OuterProduct,
+  InnerProduct,
+  GemvProduct,
+  GemmProduct
+};
 
 /** \internal \ingroup enums
-  * Enum used in experimental parallel implementation. */
-enum Action {GetAction, SetAction};
+ * Enum used in experimental parallel implementation. */
+enum Action { GetAction, SetAction };
 
 /** The type used to identify a dense storage. */
 struct Dense {};
 
+/** The type used to identify a general sparse storage. */
+struct Sparse {};
+
+/** The type used to identify a general solver (factored) storage. */
+struct SolverStorage {};
+
+/** The type used to identify a permutation storage. */
+struct PermutationStorage {};
+
+/** The type used to identify a permutation storage. */
+struct TranspositionsStorage {};
+
 /** The type used to identify a matrix expression */
 struct MatrixXpr {};
 
 /** The type used to identify an array expression */
 struct ArrayXpr {};
 
+// An evaluator must define its shape. By default, it can be one of the following:
+struct DenseShape {
+  static std::string debugName() { return "DenseShape"; }
+};
+struct SolverShape {
+  static std::string debugName() { return "SolverShape"; }
+};
+struct HomogeneousShape {
+  static std::string debugName() { return "HomogeneousShape"; }
+};
+struct DiagonalShape {
+  static std::string debugName() { return "DiagonalShape"; }
+};
+struct SkewSymmetricShape {
+  static std::string debugName() { return "SkewSymmetricShape"; }
+};
+struct BandShape {
+  static std::string debugName() { return "BandShape"; }
+};
+struct TriangularShape {
+  static std::string debugName() { return "TriangularShape"; }
+};
+struct SelfAdjointShape {
+  static std::string debugName() { return "SelfAdjointShape"; }
+};
+struct PermutationShape {
+  static std::string debugName() { return "PermutationShape"; }
+};
+struct TranspositionsShape {
+  static std::string debugName() { return "TranspositionsShape"; }
+};
+struct SparseShape {
+  static std::string debugName() { return "SparseShape"; }
+};
+
 namespace internal {
-  /** \internal
-  * Constants for comparison functors
-  */
-  enum ComparisonName {
-    cmp_EQ = 0,
-    cmp_LT = 1,
-    cmp_LE = 2,
-    cmp_UNORD = 3,
-    cmp_NEQ = 4
-  };
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_CONSTANTS_H
+
+// random access iterators based on coeff*() accessors.
+struct IndexBased {};
+
+// evaluator based on iterators to access coefficients.
+struct IteratorBased {};
+
+/** \internal
+ * Constants for comparison functors
+ */
+enum ComparisonName : unsigned int {
+  cmp_EQ = 0,
+  cmp_LT = 1,
+  cmp_LE = 2,
+  cmp_UNORD = 3,
+  cmp_NEQ = 4,
+  cmp_GT = 5,
+  cmp_GE = 6
+};
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CONSTANTS_H
diff --git a/inst/include/Eigen/src/Core/util/DisableStupidWarnings.h b/inst/include/Eigen/src/Core/util/DisableStupidWarnings.h
index 6a0bf062..8c27b144 100644
--- a/inst/include/Eigen/src/Core/util/DisableStupidWarnings.h
+++ b/inst/include/Eigen/src/Core/util/DisableStupidWarnings.h
@@ -1,40 +1,146 @@
 #ifndef EIGEN_WARNINGS_DISABLED
 #define EIGEN_WARNINGS_DISABLED
 
-#ifdef _MSC_VER
-  // 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p))
-  // 4101 - unreferenced local variable
-  // 4127 - conditional expression is constant
-  // 4181 - qualifier applied to reference type ignored
-  // 4211 - nonstandard extension used : redefined extern to static
-  // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data
-  // 4273 - QtAlignedMalloc, inconsistent DLL linkage
-  // 4324 - structure was padded due to declspec(align())
-  // 4512 - assignment operator could not be generated
-  // 4522 - 'class' : multiple assignment operators specified
-  // 4700 - uninitialized local variable 'xyz' used
-  // 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow
-  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
-    #pragma warning( push )
-  #endif
-  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4512 4522 4700 4717 )
+#if defined(_MSC_VER)
+// 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p))
+// 4101 - unreferenced local variable
+// 4127 - conditional expression is constant
+// 4181 - qualifier applied to reference type ignored
+// 4211 - nonstandard extension used : redefined extern to static
+// 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data
+// 4273 - QtAlignedMalloc, inconsistent DLL linkage
+// 4324 - structure was padded due to declspec(align())
+// 4503 - decorated name length exceeded, name was truncated
+// 4512 - assignment operator could not be generated
+// 4522 - 'class' : multiple assignment operators specified
+// 4700 - uninitialized local variable 'xyz' used
+// 4714 - function marked as __forceinline not inlined
+// 4717 - 'function' : recursive on all control paths, function will cause runtime stack overflow
+// 4800 - 'type' : forcing value to bool 'true' or 'false' (performance warning)
+#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+#pragma warning(push)
+#endif
+#pragma warning(disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
+// We currently rely on has_denorm in tests, and need it defined correctly for half/bfloat16.
+#ifndef _SILENCE_CXX23_DENORM_DEPRECATION_WARNING
+#define EIGEN_REENABLE_CXX23_DENORM_DEPRECATION_WARNING 1
+#define _SILENCE_CXX23_DENORM_DEPRECATION_WARNING
+#endif
+
 #elif defined __INTEL_COMPILER
-  // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
-  //        ICC 12 generates this warning even without any inline keyword, when defining class methods 'inline' i.e. inside of class body
-  //        typedef that may be a reference type.
-  // 279  - controlling expression is constant
-  //        ICC 12 generates this warning on assert(constant_expression_depending_on_template_params) and frankly this is a legitimate use case.
-  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
-    #pragma warning push
-  #endif
-  #pragma warning disable 2196 279
+// 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
+//        ICC 12 generates this warning even without any inline keyword, when defining class methods 'inline' i.e.
+//        inside of class body typedef that may be a reference type.
+// 279  - controlling expression is constant
+//        ICC 12 generates this warning on assert(constant_expression_depending_on_template_params) and frankly this is
+//        a legitimate use case.
+// 1684 - conversion from pointer to same-sized integral type (potential portability problem)
+// 2259 - non-pointer conversion from "Eigen::Index={ptrdiff_t={long}}" to "int" may lose significant bits
+#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+#pragma warning push
+#endif
+#pragma warning disable 2196 279 1684 2259
+
 #elif defined __clang__
-  // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
-  //     this is really a stupid warning as it warns on compile-time expressions involving enums
-  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
-    #pragma clang diagnostic push
-  #endif
-  #pragma clang diagnostic ignored "-Wconstant-logical-operand"
+// #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+// #pragma clang diagnostic push
+// #endif
+// #if defined(__has_warning)
+// // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
+// //     this is really a stupid warning as it warns on compile-time expressions involving enums
+// #if __has_warning("-Wconstant-logical-operand")
+// #pragma clang diagnostic ignored "-Wconstant-logical-operand"
+// #endif
+// #if __has_warning("-Wimplicit-int-float-conversion")
+// #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion"
+// #endif
+// #if (defined(__ALTIVEC__) || defined(__VSX__)) && (!defined(__STDC_VERSION__) || (__STDC_VERSION__ < 201112L))
+// // warning: generic selections are a C11-specific feature
+// // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h
+// #if __has_warning("-Wc11-extensions")
+// #pragma clang diagnostic ignored "-Wc11-extensions"
+// #endif
+// #endif
+// #endif
+
+#elif defined __GNUC__ && !defined(__FUJITSU)
+
+// #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
+// #pragma GCC diagnostic push
+// #endif
+// // g++ warns about local variables shadowing member functions, which is too strict
+// #pragma GCC diagnostic ignored "-Wshadow"
+// #if __GNUC__ == 4 && __GNUC_MINOR__ < 8
+// // Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
+// #pragma GCC diagnostic ignored "-Wtype-limits"
+// #endif
+// #if __GNUC__ >= 6
+// #pragma GCC diagnostic ignored "-Wignored-attributes"
+// #endif
+// #if __GNUC__ == 7
+// // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325
+// #pragma GCC diagnostic ignored "-Wattributes"
+// #endif
+#endif
+
+#if defined __NVCC__ && defined __CUDACC__
+// MSVC 14.16 (required by CUDA 9.*) does not support the _Pragma keyword, so
+// we instead use Microsoft's __pragma extension.
+#if defined _MSC_VER
+#define EIGEN_MAKE_PRAGMA(X) __pragma(#X)
+#else
+#define EIGEN_MAKE_PRAGMA(X) _Pragma(#X)
+#endif
+#if defined __NVCC_DIAG_PRAGMA_SUPPORT__
+#define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(nv_diag_suppress X)
+#else
+#define EIGEN_NV_DIAG_SUPPRESS(X) EIGEN_MAKE_PRAGMA(diag_suppress X)
+#endif
+
+EIGEN_NV_DIAG_SUPPRESS(boolean_controlling_expr_is_constant)
+// Disable the "statement is unreachable" message
+EIGEN_NV_DIAG_SUPPRESS(code_is_unreachable)
+// Disable the "dynamic initialization in unreachable code" message
+EIGEN_NV_DIAG_SUPPRESS(initialization_not_reachable)
+// Disable the "invalid error number" message that we get with older versions of nvcc
+EIGEN_NV_DIAG_SUPPRESS(1222)
+// Disable the "calling a __host__ function from a __host__ __device__ function is not allowed" messages (yes, there are
+// many of them and they seem to change with every version of the compiler)
+EIGEN_NV_DIAG_SUPPRESS(2527)
+EIGEN_NV_DIAG_SUPPRESS(2529)
+EIGEN_NV_DIAG_SUPPRESS(2651)
+EIGEN_NV_DIAG_SUPPRESS(2653)
+EIGEN_NV_DIAG_SUPPRESS(2668)
+EIGEN_NV_DIAG_SUPPRESS(2669)
+EIGEN_NV_DIAG_SUPPRESS(2670)
+EIGEN_NV_DIAG_SUPPRESS(2671)
+EIGEN_NV_DIAG_SUPPRESS(2735)
+EIGEN_NV_DIAG_SUPPRESS(2737)
+EIGEN_NV_DIAG_SUPPRESS(2739)
+EIGEN_NV_DIAG_SUPPRESS(2885)
+EIGEN_NV_DIAG_SUPPRESS(2888)
+EIGEN_NV_DIAG_SUPPRESS(2976)
+EIGEN_NV_DIAG_SUPPRESS(2979)
+EIGEN_NV_DIAG_SUPPRESS(20011)
+EIGEN_NV_DIAG_SUPPRESS(20014)
+// Disable the "// __device__ annotation is ignored on a function(...) that is
+//              explicitly defaulted on its first declaration" message.
+// The __device__ annotation seems to actually be needed in some cases,
+// otherwise resulting in kernel runtime errors.
+EIGEN_NV_DIAG_SUPPRESS(2886)
+EIGEN_NV_DIAG_SUPPRESS(2929)
+EIGEN_NV_DIAG_SUPPRESS(2977)
+EIGEN_NV_DIAG_SUPPRESS(20012)
+#undef EIGEN_NV_DIAG_SUPPRESS
+#undef EIGEN_MAKE_PRAGMA
+#endif
+
+#else
+// warnings already disabled:
+#ifndef EIGEN_WARNINGS_DISABLED_2
+#define EIGEN_WARNINGS_DISABLED_2
+#elif defined(EIGEN_INTERNAL_DEBUGGING)
+#error "Do not include \"DisableStupidWarnings.h\" recursively more than twice!"
 #endif
 
-#endif // not EIGEN_WARNINGS_DISABLED
+#endif  // not EIGEN_WARNINGS_DISABLED
diff --git a/inst/include/Eigen/src/Core/util/EmulateArray.h b/inst/include/Eigen/src/Core/util/EmulateArray.h
new file mode 100644
index 00000000..6c4c22d4
--- /dev/null
+++ b/inst/include/Eigen/src/Core/util/EmulateArray.h
@@ -0,0 +1,270 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EMULATE_ARRAY_H
+#define EIGEN_EMULATE_ARRAY_H
+
+// CUDA doesn't support the STL containers, so we use our own instead.
+#if defined(EIGEN_GPUCC) || defined(EIGEN_AVOID_STL_ARRAY)
+
+namespace Eigen {
+template <typename T, size_t n>
+class array {
+ public:
+  typedef T value_type;
+  typedef T* iterator;
+  typedef const T* const_iterator;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE iterator begin() { return values; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const_iterator begin() const { return values; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE iterator end() { return values + n; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const_iterator end() const { return values + n; }
+
+  typedef std::reverse_iterator<iterator> reverse_iterator;
+  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
+
+  EIGEN_STRONG_INLINE reverse_iterator rbegin() { return reverse_iterator(end()); }
+  EIGEN_STRONG_INLINE const_reverse_iterator rbegin() const { return const_reverse_iterator(end()); }
+
+  EIGEN_STRONG_INLINE reverse_iterator rend() { return reverse_iterator(begin()); }
+  EIGEN_STRONG_INLINE const_reverse_iterator rend() const { return const_reverse_iterator(begin()); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& operator[](size_t index) {
+    eigen_internal_assert(index < size());
+    return values[index];
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator[](size_t index) const {
+    eigen_internal_assert(index < size());
+    return values[index];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& at(size_t index) {
+    eigen_assert(index < size());
+    return values[index];
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& at(size_t index) const {
+    eigen_assert(index < size());
+    return values[index];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& front() { return values[0]; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& front() const { return values[0]; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& back() { return values[n - 1]; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& back() const { return values[n - 1]; }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static std::size_t size() { return n; }
+
+  T values[n];
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array() {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array(const T& v) {
+    EIGEN_STATIC_ASSERT(n == 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array(const T& v1, const T& v2) {
+    EIGEN_STATIC_ASSERT(n == 2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3) {
+    EIGEN_STATIC_ASSERT(n == 3, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4) {
+    EIGEN_STATIC_ASSERT(n == 4, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, const T& v5) {
+    EIGEN_STATIC_ASSERT(n == 5, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, const T& v5,
+                                              const T& v6) {
+    EIGEN_STATIC_ASSERT(n == 6, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, const T& v5,
+                                              const T& v6, const T& v7) {
+    EIGEN_STATIC_ASSERT(n == 7, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+    values[6] = v7;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array(const T& v1, const T& v2, const T& v3, const T& v4, const T& v5,
+                                              const T& v6, const T& v7, const T& v8) {
+    EIGEN_STATIC_ASSERT(n == 8, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    values[0] = v1;
+    values[1] = v2;
+    values[2] = v3;
+    values[3] = v4;
+    values[4] = v5;
+    values[5] = v6;
+    values[6] = v7;
+    values[7] = v8;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array(std::initializer_list<T> l) {
+    eigen_assert(l.size() == n);
+    internal::smart_copy(l.begin(), l.end(), values);
+  }
+};
+
+// Specialize array for zero size
+template <typename T>
+class array<T, 0> {
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& operator[](size_t) {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator[](size_t) const {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& front() {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& front() const {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& back() {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& back() const {
+    eigen_assert(false && "Can't index a zero size array");
+    return dummy;
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::size_t size() { return 0; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array() : dummy() {}
+
+  EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() {
+    EIGEN_UNUSED_VARIABLE(l);
+    eigen_assert(l.size() == 0);
+  }
+
+ private:
+  T dummy;
+};
+
+// Comparison operator
+// Todo: implement !=, <, <=, >,  and >=
+template <class T, std::size_t N>
+EIGEN_DEVICE_FUNC bool operator==(const array<T, N>& lhs, const array<T, N>& rhs) {
+  for (std::size_t i = 0; i < N; ++i) {
+    if (lhs[i] != rhs[i]) {
+      return false;
+    }
+  }
+  return true;
+}
+
+namespace internal {
+template <std::size_t I_, class T, std::size_t N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& array_get(array<T, N>& a) {
+  return a[I_];
+}
+template <std::size_t I_, class T, std::size_t N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& array_get(const array<T, N>& a) {
+  return a[I_];
+}
+
+template <class T, std::size_t N>
+struct array_size<array<T, N> > {
+  static constexpr Index value = N;
+};
+template <class T, std::size_t N>
+struct array_size<array<T, N>&> {
+  static constexpr Index value = N;
+};
+template <class T, std::size_t N>
+struct array_size<const array<T, N> > {
+  static constexpr Index value = N;
+};
+template <class T, std::size_t N>
+struct array_size<const array<T, N>&> {
+  static constexpr Index value = N;
+};
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#else
+
+// The compiler supports c++11, and we're not targeting cuda: use std::array as Eigen::array
+#include <array>
+
+namespace Eigen {
+
+template <typename T, std::size_t N>
+using array = std::array<T, N>;
+
+namespace internal {
+/* std::get is only constexpr in C++14, not yet in C++11
+ *     - libstdc++ from version 4.7 onwards has it nevertheless,
+ *                                          so use that
+ *     - libstdc++ older versions: use _M_instance directly
+ *     - libc++ all versions so far: use __elems_ directly
+ *     - all other libs: use std::get to be portable, but
+ *                       this may not be constexpr
+ */
+#if defined(__GLIBCXX__) && __GLIBCXX__ < 20120322
+#define STD_GET_ARR_HACK a._M_instance[I_]
+#elif defined(_LIBCPP_VERSION)
+#define STD_GET_ARR_HACK a.__elems_[I_]
+#else
+#define STD_GET_ARR_HACK std::template get<I_, T, N>(a)
+#endif
+
+template <std::size_t I_, class T, std::size_t N>
+constexpr T& array_get(std::array<T, N>& a) {
+  return (T&)STD_GET_ARR_HACK;
+}
+template <std::size_t I_, class T, std::size_t N>
+constexpr T&& array_get(std::array<T, N>&& a) {
+  return (T&&)STD_GET_ARR_HACK;
+}
+template <std::size_t I_, class T, std::size_t N>
+constexpr T const& array_get(std::array<T, N> const& a) {
+  return (T const&)STD_GET_ARR_HACK;
+}
+
+#undef STD_GET_ARR_HACK
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif
+
+#endif  // EIGEN_EMULATE_ARRAY_H
diff --git a/inst/include/Eigen/src/Core/util/ForwardDeclarations.h b/inst/include/Eigen/src/Core/util/ForwardDeclarations.h
index f2777200..e0bc57ea 100644
--- a/inst/include/Eigen/src/Core/util/ForwardDeclarations.h
+++ b/inst/include/Eigen/src/Core/util/ForwardDeclarations.h
@@ -11,292 +11,514 @@
 #ifndef EIGEN_FORWARDDECLARATIONS_H
 #define EIGEN_FORWARDDECLARATIONS_H
 
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
 namespace Eigen {
 namespace internal {
 
-template<typename T> struct traits;
+template <typename T>
+struct traits;
 
 // here we say once and for all that traits<const T> == traits<T>
 // When constness must affect traits, it has to be constness on template parameters on which T itself depends.
 // For example, traits<Map<const T> > != traits<Map<T> >, but
 //              traits<const Map<T> > == traits<Map<T> >
-template<typename T> struct traits<const T> : traits<T> {};
+template <typename T>
+struct traits<const T> : traits<T> {};
 
-template<typename Derived> struct has_direct_access
-{
+template <typename Derived>
+struct has_direct_access {
   enum { ret = (traits<Derived>::Flags & DirectAccessBit) ? 1 : 0 };
 };
 
-template<typename Derived> struct accessors_level
-{
-  enum { has_direct_access = (traits<Derived>::Flags & DirectAccessBit) ? 1 : 0,
-         has_write_access = (traits<Derived>::Flags & LvalueBit) ? 1 : 0,
-         value = has_direct_access ? (has_write_access ? DirectWriteAccessors : DirectAccessors)
-                                   : (has_write_access ? WriteAccessors       : ReadOnlyAccessors)
+template <typename Derived>
+struct accessors_level {
+  enum {
+    has_direct_access = (traits<Derived>::Flags & DirectAccessBit) ? 1 : 0,
+    has_write_access = (traits<Derived>::Flags & LvalueBit) ? 1 : 0,
+    value = has_direct_access ? (has_write_access ? DirectWriteAccessors : DirectAccessors)
+                              : (has_write_access ? WriteAccessors : ReadOnlyAccessors)
   };
 };
 
-} // end namespace internal
+template <typename T>
+struct evaluator_traits;
 
-template<typename T> struct NumTraits;
+template <typename T>
+struct evaluator;
 
-template<typename Derived> struct EigenBase;
-template<typename Derived> class DenseBase;
-template<typename Derived> class PlainObjectBase;
+}  // end namespace internal
 
+template <typename T>
+struct NumTraits;
 
-template<typename Derived,
-         int Level = internal::accessors_level<Derived>::value >
+template <typename Derived>
+struct EigenBase;
+template <typename Derived>
+class DenseBase;
+template <typename Derived>
+class PlainObjectBase;
+template <typename Derived, int Level>
 class DenseCoeffsBase;
 
-template<typename _Scalar, int _Rows, int _Cols,
-         int _Options = AutoAlign |
-#if defined(__GNUC__) && __GNUC__==3 && __GNUC_MINOR__==4
-    // workaround a bug in at least gcc 3.4.6
-    // the innermost ?: ternary operator is misparsed. We write it slightly
-    // differently and this makes gcc 3.4.6 happy, but it's ugly.
-    // The error would only show up with EIGEN_DEFAULT_TO_ROW_MAJOR is defined
-    // (when EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION is RowMajor)
-                          ( (_Rows==1 && _Cols!=1) ? RowMajor
-                          : !(_Cols==1 && _Rows!=1) ?  EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION
-                          : ColMajor ),
-#else
-                          ( (_Rows==1 && _Cols!=1) ? RowMajor
-                          : (_Cols==1 && _Rows!=1) ? ColMajor
-                          : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ),
-#endif
-         int _MaxRows = _Rows,
-         int _MaxCols = _Cols
-> class Matrix;
-
-template<typename Derived> class MatrixBase;
-template<typename Derived> class ArrayBase;
-
-template<typename ExpressionType, unsigned int Added, unsigned int Removed> class Flagged;
-template<typename ExpressionType, template <typename> class StorageBase > class NoAlias;
-template<typename ExpressionType> class NestByValue;
-template<typename ExpressionType> class ForceAlignedAccess;
-template<typename ExpressionType> class SwapWrapper;
-
-template<typename XprType, int BlockRows=Dynamic, int BlockCols=Dynamic, bool InnerPanel = false> class Block;
-
-template<typename MatrixType, int Size=Dynamic> class VectorBlock;
-template<typename MatrixType> class Transpose;
-template<typename MatrixType> class Conjugate;
-template<typename NullaryOp, typename MatrixType>         class CwiseNullaryOp;
-template<typename UnaryOp,   typename MatrixType>         class CwiseUnaryOp;
-template<typename ViewOp,    typename MatrixType>         class CwiseUnaryView;
-template<typename BinaryOp,  typename Lhs, typename Rhs>  class CwiseBinaryOp;
-template<typename BinOp,     typename Lhs, typename Rhs>  class SelfCwiseBinaryOp;
-template<typename Derived,   typename Lhs, typename Rhs>  class ProductBase;
-template<typename Lhs, typename Rhs, int Mode>            class GeneralProduct;
-template<typename Lhs, typename Rhs, int NestingFlags>    class CoeffBasedProduct;
-
-template<typename Derived> class DiagonalBase;
-template<typename _DiagonalVectorType> class DiagonalWrapper;
-template<typename _Scalar, int SizeAtCompileTime, int MaxSizeAtCompileTime=SizeAtCompileTime> class DiagonalMatrix;
-template<typename MatrixType, typename DiagonalType, int ProductOrder> class DiagonalProduct;
-template<typename MatrixType, int Index = 0> class Diagonal;
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime = SizeAtCompileTime, typename IndexType=int> class PermutationMatrix;
-template<int SizeAtCompileTime, int MaxSizeAtCompileTime = SizeAtCompileTime, typename IndexType=int> class Transpositions;
-template<typename Derived> class PermutationBase;
-template<typename Derived> class TranspositionsBase;
-template<typename _IndicesType> class PermutationWrapper;
-template<typename _IndicesType> class TranspositionsWrapper;
-
-template<typename Derived,
-         int Level = internal::accessors_level<Derived>::has_write_access ? WriteAccessors : ReadOnlyAccessors
-> class MapBase;
-template<int InnerStrideAtCompileTime, int OuterStrideAtCompileTime> class Stride;
-template<typename MatrixType, int MapOptions=Unaligned, typename StrideType = Stride<0,0> > class Map;
-
-template<typename Derived> class TriangularBase;
-template<typename MatrixType, unsigned int Mode> class TriangularView;
-template<typename MatrixType, unsigned int Mode> class SelfAdjointView;
-template<typename MatrixType> class SparseView;
-template<typename ExpressionType> class WithFormat;
-template<typename MatrixType> struct CommaInitializer;
-template<typename Derived> class ReturnByValue;
-template<typename ExpressionType> class ArrayWrapper;
-template<typename ExpressionType> class MatrixWrapper;
+template <typename Scalar_, int Rows_, int Cols_,
+          int Options_ = AutoAlign | ((Rows_ == 1 && Cols_ != 1)   ? Eigen::RowMajor
+                                      : (Cols_ == 1 && Rows_ != 1) ? Eigen::ColMajor
+                                                                   : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION),
+          int MaxRows_ = Rows_, int MaxCols_ = Cols_>
+class Matrix;
+
+template <typename Derived>
+class MatrixBase;
+template <typename Derived>
+class ArrayBase;
+
+template <typename ExpressionType, unsigned int Added, unsigned int Removed>
+class Flagged;
+template <typename ExpressionType, template <typename> class StorageBase>
+class NoAlias;
+template <typename ExpressionType>
+class NestByValue;
+template <typename ExpressionType>
+class ForceAlignedAccess;
+template <typename ExpressionType>
+class SwapWrapper;
+
+template <typename XprType, int BlockRows = Dynamic, int BlockCols = Dynamic, bool InnerPanel = false>
+class Block;
+template <typename XprType, typename RowIndices, typename ColIndices>
+class IndexedView;
+template <typename XprType, int Rows = Dynamic, int Cols = Dynamic, int Order = 0>
+class Reshaped;
+template <typename FirstType, typename SizeType, typename IncrType>
+class ArithmeticSequence;
+
+template <typename MatrixType, int Size = Dynamic>
+class VectorBlock;
+template <typename MatrixType>
+class Transpose;
+template <typename MatrixType>
+class Conjugate;
+template <typename NullaryOp, typename MatrixType>
+class CwiseNullaryOp;
+template <typename UnaryOp, typename MatrixType>
+class CwiseUnaryOp;
+template <typename BinaryOp, typename Lhs, typename Rhs>
+class CwiseBinaryOp;
+template <typename TernaryOp, typename Arg1, typename Arg2, typename Arg3>
+class CwiseTernaryOp;
+template <typename Decomposition, typename Rhstype>
+class Solve;
+template <typename XprType>
+class Inverse;
+
+template <typename Lhs, typename Rhs, int Option = DefaultProduct>
+class Product;
+
+template <typename Derived>
+class DiagonalBase;
+template <typename DiagonalVectorType_>
+class DiagonalWrapper;
+template <typename Scalar_, int SizeAtCompileTime, int MaxSizeAtCompileTime = SizeAtCompileTime>
+class DiagonalMatrix;
+template <typename MatrixType, typename DiagonalType, int ProductOrder>
+class DiagonalProduct;
+template <typename MatrixType, int Index = 0>
+class Diagonal;
+template <typename Derived>
+class SkewSymmetricBase;
+template <typename VectorType_>
+class SkewSymmetricWrapper;
+template <typename Scalar_>
+class SkewSymmetricMatrix3;
+template <int SizeAtCompileTime, int MaxSizeAtCompileTime = SizeAtCompileTime, typename IndexType = int>
+class PermutationMatrix;
+template <int SizeAtCompileTime, int MaxSizeAtCompileTime = SizeAtCompileTime, typename IndexType = int>
+class Transpositions;
+template <typename Derived>
+class PermutationBase;
+template <typename Derived>
+class TranspositionsBase;
+template <typename IndicesType_>
+class PermutationWrapper;
+template <typename IndicesType_>
+class TranspositionsWrapper;
+
+template <typename Derived,
+          int Level = internal::accessors_level<Derived>::has_write_access ? WriteAccessors : ReadOnlyAccessors>
+class MapBase;
+template <int OuterStrideAtCompileTime, int InnerStrideAtCompileTime>
+class Stride;
+template <int Value = Dynamic>
+class InnerStride;
+template <int Value = Dynamic>
+class OuterStride;
+template <typename MatrixType, int MapOptions = Unaligned, typename StrideType = Stride<0, 0>>
+class Map;
+template <typename Derived>
+class RefBase;
+template <typename PlainObjectType, int Options = 0,
+          typename StrideType =
+              typename std::conditional_t<PlainObjectType::IsVectorAtCompileTime, InnerStride<1>, OuterStride<>>>
+class Ref;
+template <typename ViewOp, typename MatrixType, typename StrideType = Stride<0, 0>>
+class CwiseUnaryView;
+
+template <typename Derived>
+class TriangularBase;
+template <typename MatrixType, unsigned int Mode>
+class TriangularView;
+template <typename MatrixType, unsigned int Mode>
+class SelfAdjointView;
+template <typename Derived>
+class RealView;
+template <typename MatrixType>
+class SparseView;
+template <typename ExpressionType>
+class WithFormat;
+template <typename MatrixType>
+struct CommaInitializer;
+template <typename Derived>
+class ReturnByValue;
+template <typename ExpressionType>
+class ArrayWrapper;
+template <typename ExpressionType>
+class MatrixWrapper;
+template <typename Derived>
+class SolverBase;
+template <typename XprType>
+class InnerIterator;
 
 namespace internal {
-template<typename DecompositionType, typename Rhs> struct solve_retval_base;
-template<typename DecompositionType, typename Rhs> struct solve_retval;
-template<typename DecompositionType> struct kernel_retval_base;
-template<typename DecompositionType> struct kernel_retval;
-template<typename DecompositionType> struct image_retval_base;
-template<typename DecompositionType> struct image_retval;
-} // end namespace internal
+template <typename XprType>
+class generic_randaccess_stl_iterator;
+template <typename XprType>
+class pointer_based_stl_iterator;
+template <typename XprType, DirectionType Direction>
+class subvector_stl_iterator;
+template <typename XprType, DirectionType Direction>
+class subvector_stl_reverse_iterator;
+template <typename DecompositionType>
+struct kernel_retval_base;
+template <typename DecompositionType>
+struct kernel_retval;
+template <typename DecompositionType>
+struct image_retval_base;
+template <typename DecompositionType>
+struct image_retval;
+}  // end namespace internal
 
 namespace internal {
-template<typename _Scalar, int Rows=Dynamic, int Cols=Dynamic, int Supers=Dynamic, int Subs=Dynamic, int Options=0> class BandMatrix;
+template <typename Scalar_, int Rows = Dynamic, int Cols = Dynamic, int Supers = Dynamic, int Subs = Dynamic,
+          int Options = 0>
+class BandMatrix;
 }
 
 namespace internal {
-template<typename Lhs, typename Rhs> struct product_type;
-}
-
-template<typename Lhs, typename Rhs,
-         int ProductType = internal::product_type<Lhs,Rhs>::value>
+template <typename Lhs, typename Rhs>
+struct product_type;
+
+template <bool>
+struct EnableIf;
+
+/** \internal
+ * \class product_evaluator
+ * Products need their own evaluator with more template arguments allowing for
+ * easier partial template specializations.
+ */
+template <typename T, int ProductTag = internal::product_type<typename T::Lhs, typename T::Rhs>::ret,
+          typename LhsShape = typename evaluator_traits<typename T::Lhs>::Shape,
+          typename RhsShape = typename evaluator_traits<typename T::Rhs>::Shape,
+          typename LhsScalar = typename traits<typename T::Lhs>::Scalar,
+          typename RhsScalar = typename traits<typename T::Rhs>::Scalar>
+struct product_evaluator;
+}  // namespace internal
+
+template <typename Lhs, typename Rhs, int ProductType = internal::product_type<Lhs, Rhs>::value>
 struct ProductReturnType;
 
 // this is a workaround for sun CC
-template<typename Lhs, typename Rhs> struct LazyProductReturnType;
+template <typename Lhs, typename Rhs>
+struct LazyProductReturnType;
 
 namespace internal {
 
 // Provides scalar/packet-wise product and product with accumulation
 // with optional conjugation of the arguments.
-template<typename LhsScalar, typename RhsScalar, bool ConjLhs=false, bool ConjRhs=false> struct conj_helper;
-
-template<typename Scalar> struct scalar_sum_op;
-template<typename Scalar> struct scalar_difference_op;
-template<typename LhsScalar,typename RhsScalar> struct scalar_conj_product_op;
-template<typename Scalar> struct scalar_opposite_op;
-template<typename Scalar> struct scalar_conjugate_op;
-template<typename Scalar> struct scalar_real_op;
-template<typename Scalar> struct scalar_imag_op;
-template<typename Scalar> struct scalar_abs_op;
-template<typename Scalar> struct scalar_abs2_op;
-template<typename Scalar> struct scalar_sqrt_op;
-template<typename Scalar> struct scalar_exp_op;
-template<typename Scalar> struct scalar_log_op;
-template<typename Scalar> struct scalar_cos_op;
-template<typename Scalar> struct scalar_sin_op;
-template<typename Scalar> struct scalar_acos_op;
-template<typename Scalar> struct scalar_asin_op;
-template<typename Scalar> struct scalar_tan_op;
-template<typename Scalar> struct scalar_pow_op;
-template<typename Scalar> struct scalar_inverse_op;
-template<typename Scalar> struct scalar_square_op;
-template<typename Scalar> struct scalar_cube_op;
-template<typename Scalar, typename NewType> struct scalar_cast_op;
-template<typename Scalar> struct scalar_multiple_op;
-template<typename Scalar> struct scalar_quotient1_op;
-template<typename Scalar> struct scalar_min_op;
-template<typename Scalar> struct scalar_max_op;
-template<typename Scalar> struct scalar_random_op;
-template<typename Scalar> struct scalar_add_op;
-template<typename Scalar> struct scalar_constant_op;
-template<typename Scalar> struct scalar_identity_op;
-
-template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_product_op;
-template<typename LhsScalar,typename RhsScalar> struct scalar_multiple2_op;
-template<typename LhsScalar,typename RhsScalar=LhsScalar> struct scalar_quotient_op;
-
-} // end namespace internal
+template <typename LhsScalar, typename RhsScalar, bool ConjLhs = false, bool ConjRhs = false>
+struct conj_helper;
+
+template <typename LhsScalar, typename RhsScalar = LhsScalar>
+struct scalar_sum_op;
+template <typename LhsScalar, typename RhsScalar = LhsScalar>
+struct scalar_difference_op;
+template <typename LhsScalar, typename RhsScalar = LhsScalar>
+struct scalar_conj_product_op;
+template <typename LhsScalar, typename RhsScalar = LhsScalar, int NaNPropagation = PropagateFast>
+struct scalar_min_op;
+template <typename LhsScalar, typename RhsScalar = LhsScalar, int NaNPropagation = PropagateFast>
+struct scalar_max_op;
+template <typename Scalar>
+struct scalar_opposite_op;
+template <typename Scalar>
+struct scalar_conjugate_op;
+template <typename Scalar>
+struct scalar_real_op;
+template <typename Scalar>
+struct scalar_imag_op;
+template <typename Scalar>
+struct scalar_abs_op;
+template <typename Scalar>
+struct scalar_abs2_op;
+template <typename LhsScalar, typename RhsScalar = LhsScalar>
+struct scalar_absolute_difference_op;
+template <typename Scalar>
+struct scalar_sqrt_op;
+template <typename Scalar>
+struct scalar_cbrt_op;
+template <typename Scalar>
+struct scalar_rsqrt_op;
+template <typename Scalar>
+struct scalar_exp_op;
+template <typename Scalar>
+struct scalar_log_op;
+template <typename Scalar>
+struct scalar_cos_op;
+template <typename Scalar>
+struct scalar_sin_op;
+template <typename Scalar>
+struct scalar_acos_op;
+template <typename Scalar>
+struct scalar_asin_op;
+template <typename Scalar>
+struct scalar_tan_op;
+template <typename Scalar>
+struct scalar_atan_op;
+template <typename LhsScalar, typename RhsScalar = LhsScalar>
+struct scalar_atan2_op;
+template <typename Scalar>
+struct scalar_inverse_op;
+template <typename Scalar>
+struct scalar_square_op;
+template <typename Scalar>
+struct scalar_cube_op;
+template <typename Scalar, typename NewType>
+struct scalar_cast_op;
+template <typename Scalar>
+struct scalar_random_op;
+template <typename Scalar>
+struct scalar_constant_op;
+template <typename Scalar>
+struct scalar_identity_op;
+template <typename Scalar>
+struct scalar_sign_op;
+template <typename Scalar, typename ScalarExponent>
+struct scalar_pow_op;
+template <typename Scalar, typename ScalarExponent, bool BaseIsInteger, bool ExponentIsInteger, bool BaseIsComplex,
+          bool ExponentIsComplex>
+struct scalar_unary_pow_op;
+template <typename LhsScalar, typename RhsScalar = LhsScalar>
+struct scalar_hypot_op;
+template <typename LhsScalar, typename RhsScalar = LhsScalar>
+struct scalar_product_op;
+template <typename LhsScalar, typename RhsScalar = LhsScalar>
+struct scalar_quotient_op;
+// logical and bitwise operations
+template <typename Scalar>
+struct scalar_boolean_and_op;
+template <typename Scalar>
+struct scalar_boolean_or_op;
+template <typename Scalar>
+struct scalar_boolean_xor_op;
+template <typename Scalar>
+struct scalar_boolean_not_op;
+template <typename Scalar>
+struct scalar_bitwise_and_op;
+template <typename Scalar>
+struct scalar_bitwise_or_op;
+template <typename Scalar>
+struct scalar_bitwise_xor_op;
+template <typename Scalar>
+struct scalar_bitwise_not_op;
+
+// SpecialFunctions module
+template <typename Scalar>
+struct scalar_lgamma_op;
+template <typename Scalar>
+struct scalar_digamma_op;
+template <typename Scalar>
+struct scalar_erf_op;
+template <typename Scalar>
+struct scalar_erfc_op;
+template <typename Scalar>
+struct scalar_ndtri_op;
+template <typename Scalar>
+struct scalar_igamma_op;
+template <typename Scalar>
+struct scalar_igammac_op;
+template <typename Scalar>
+struct scalar_zeta_op;
+template <typename Scalar>
+struct scalar_betainc_op;
+
+// Bessel functions in SpecialFunctions module
+template <typename Scalar>
+struct scalar_bessel_i0_op;
+template <typename Scalar>
+struct scalar_bessel_i0e_op;
+template <typename Scalar>
+struct scalar_bessel_i1_op;
+template <typename Scalar>
+struct scalar_bessel_i1e_op;
+template <typename Scalar>
+struct scalar_bessel_j0_op;
+template <typename Scalar>
+struct scalar_bessel_y0_op;
+template <typename Scalar>
+struct scalar_bessel_j1_op;
+template <typename Scalar>
+struct scalar_bessel_y1_op;
+template <typename Scalar>
+struct scalar_bessel_k0_op;
+template <typename Scalar>
+struct scalar_bessel_k0e_op;
+template <typename Scalar>
+struct scalar_bessel_k1_op;
+template <typename Scalar>
+struct scalar_bessel_k1e_op;
+
+}  // end namespace internal
 
 struct IOFormat;
 
 // Array module
-template<typename _Scalar, int _Rows, int _Cols,
-         int _Options = AutoAlign |
-#if defined(__GNUC__) && __GNUC__==3 && __GNUC_MINOR__==4
-    // workaround a bug in at least gcc 3.4.6
-    // the innermost ?: ternary operator is misparsed. We write it slightly
-    // differently and this makes gcc 3.4.6 happy, but it's ugly.
-    // The error would only show up with EIGEN_DEFAULT_TO_ROW_MAJOR is defined
-    // (when EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION is RowMajor)
-                          ( (_Rows==1 && _Cols!=1) ? RowMajor
-                          : !(_Cols==1 && _Rows!=1) ?  EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION
-                          : ColMajor ),
+template <typename Scalar_, int Rows_, int Cols_,
+          int Options_ = AutoAlign | ((Rows_ == 1 && Cols_ != 1)   ? Eigen::RowMajor
+                                      : (Cols_ == 1 && Rows_ != 1) ? Eigen::ColMajor
+                                                                   : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION),
+          int MaxRows_ = Rows_, int MaxCols_ = Cols_>
+class Array;
+template <typename MatrixType, typename BinaryOp, int Direction>
+class PartialReduxExpr;
+template <typename ExpressionType, int Direction>
+class VectorwiseOp;
+template <typename MatrixType, int RowFactor, int ColFactor>
+class Replicate;
+template <typename MatrixType, int Direction = BothDirections>
+class Reverse;
+
+#if defined(EIGEN_USE_LAPACKE) && defined(lapack_int)
+// Lapacke interface requires StorageIndex to be lapack_int
+typedef lapack_int DefaultPermutationIndex;
 #else
-                          ( (_Rows==1 && _Cols!=1) ? RowMajor
-                          : (_Cols==1 && _Rows!=1) ? ColMajor
-                          : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ),
+typedef int DefaultPermutationIndex;
 #endif
-         int _MaxRows = _Rows, int _MaxCols = _Cols> class Array;
-template<typename ConditionMatrixType, typename ThenMatrixType, typename ElseMatrixType> class Select;
-template<typename MatrixType, typename BinaryOp, int Direction> class PartialReduxExpr;
-template<typename ExpressionType, int Direction> class VectorwiseOp;
-template<typename MatrixType,int RowFactor,int ColFactor> class Replicate;
-template<typename MatrixType, int Direction = BothDirections> class Reverse;
-
-template<typename MatrixType> class FullPivLU;
-template<typename MatrixType> class PartialPivLU;
+
+template <typename MatrixType, typename PermutationIndex = DefaultPermutationIndex>
+class FullPivLU;
+template <typename MatrixType, typename PermutationIndex = DefaultPermutationIndex>
+class PartialPivLU;
 namespace internal {
-template<typename MatrixType> struct inverse_impl;
+template <typename MatrixType>
+struct inverse_impl;
 }
-template<typename MatrixType> class HouseholderQR;
-template<typename MatrixType> class ColPivHouseholderQR;
-template<typename MatrixType> class FullPivHouseholderQR;
-template<typename MatrixType, int QRPreconditioner = ColPivHouseholderQRPreconditioner> class JacobiSVD;
-template<typename MatrixType, int UpLo = Lower> class LLT;
-template<typename MatrixType, int UpLo = Lower> class LDLT;
-template<typename VectorsType, typename CoeffsType, int Side=OnTheLeft> class HouseholderSequence;
-template<typename Scalar>     class JacobiRotation;
+template <typename MatrixType>
+class HouseholderQR;
+template <typename MatrixType, typename PermutationIndex = DefaultPermutationIndex>
+class ColPivHouseholderQR;
+template <typename MatrixType, typename PermutationIndex = DefaultPermutationIndex>
+class FullPivHouseholderQR;
+template <typename MatrixType, typename PermutationIndex = DefaultPermutationIndex>
+class CompleteOrthogonalDecomposition;
+template <typename MatrixType>
+class SVDBase;
+template <typename MatrixType, int Options = 0>
+class JacobiSVD;
+template <typename MatrixType, int Options = 0>
+class BDCSVD;
+template <typename MatrixType, int UpLo = Lower>
+class LLT;
+template <typename MatrixType, int UpLo = Lower>
+class LDLT;
+template <typename VectorsType, typename CoeffsType, int Side = OnTheLeft>
+class HouseholderSequence;
+template <typename Scalar>
+class JacobiRotation;
 
 // Geometry module:
-template<typename Derived, int _Dim> class RotationBase;
-template<typename Lhs, typename Rhs> class Cross;
-template<typename Derived> class QuaternionBase;
-template<typename Scalar> class Rotation2D;
-template<typename Scalar> class AngleAxis;
-template<typename Scalar,int Dim> class Translation;
+namespace internal {
+template <typename Derived, typename OtherDerived, int Size = MatrixBase<Derived>::SizeAtCompileTime>
+struct cross_impl;
+}
+template <typename Derived, int Dim_>
+class RotationBase;
+template <typename Derived>
+class QuaternionBase;
+template <typename Scalar>
+class Rotation2D;
+template <typename Scalar>
+class AngleAxis;
+template <typename Scalar, int Dim>
+class Translation;
+template <typename Scalar, int Dim>
+class AlignedBox;
+template <typename Scalar, int Options = AutoAlign>
+class Quaternion;
+template <typename Scalar, int Dim, int Mode, int Options_ = AutoAlign>
+class Transform;
+template <typename Scalar_, int AmbientDim_, int Options = AutoAlign>
+class ParametrizedLine;
+template <typename Scalar_, int AmbientDim_, int Options = AutoAlign>
+class Hyperplane;
+template <typename Scalar>
+class UniformScaling;
+template <typename MatrixType, int Direction>
+class Homogeneous;
 
 // Sparse module:
-template<typename Derived> class SparseMatrixBase;
-
-#ifdef EIGEN2_SUPPORT
-template<typename Derived, int _Dim> class eigen2_RotationBase;
-template<typename Lhs, typename Rhs> class eigen2_Cross;
-template<typename Scalar> class eigen2_Quaternion;
-template<typename Scalar> class eigen2_Rotation2D;
-template<typename Scalar> class eigen2_AngleAxis;
-template<typename Scalar,int Dim> class eigen2_Transform;
-template <typename _Scalar, int _AmbientDim> class eigen2_ParametrizedLine;
-template <typename _Scalar, int _AmbientDim> class eigen2_Hyperplane;
-template<typename Scalar,int Dim> class eigen2_Translation;
-template<typename Scalar,int Dim> class eigen2_Scaling;
-#endif
-
-#if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
-template<typename Scalar> class Quaternion;
-template<typename Scalar,int Dim> class Transform;
-template <typename _Scalar, int _AmbientDim> class ParametrizedLine;
-template <typename _Scalar, int _AmbientDim> class Hyperplane;
-template<typename Scalar,int Dim> class Scaling;
-#endif
-
-#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
-template<typename Scalar, int Options = AutoAlign> class Quaternion;
-template<typename Scalar,int Dim,int Mode,int _Options=AutoAlign> class Transform;
-template <typename _Scalar, int _AmbientDim, int Options=AutoAlign> class ParametrizedLine;
-template <typename _Scalar, int _AmbientDim, int Options=AutoAlign> class Hyperplane;
-template<typename Scalar> class UniformScaling;
-template<typename MatrixType,int Direction> class Homogeneous;
-#endif
+template <typename Derived>
+class SparseMatrixBase;
 
 // MatrixFunctions module
-template<typename Derived> struct MatrixExponentialReturnValue;
-template<typename Derived> class MatrixFunctionReturnValue;
-template<typename Derived> class MatrixSquareRootReturnValue;
-template<typename Derived> class MatrixLogarithmReturnValue;
-template<typename Derived> class MatrixPowerReturnValue;
-template<typename Derived, typename Lhs, typename Rhs> class MatrixPowerProduct;
+template <typename Derived>
+struct MatrixExponentialReturnValue;
+template <typename Derived>
+class MatrixFunctionReturnValue;
+template <typename Derived>
+class MatrixSquareRootReturnValue;
+template <typename Derived>
+class MatrixLogarithmReturnValue;
+template <typename Derived>
+class MatrixPowerReturnValue;
+template <typename Derived>
+class MatrixComplexPowerReturnValue;
 
 namespace internal {
 template <typename Scalar>
-struct stem_function
-{
-  typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
+struct stem_function {
+  typedef internal::make_complex_t<Scalar> ComplexScalar;
   typedef ComplexScalar type(ComplexScalar, int);
 };
-}
+}  // namespace internal
 
+template <typename XprType, typename Device>
+struct DeviceWrapper;
 
-#ifdef EIGEN2_SUPPORT
-template<typename ExpressionType> class Cwise;
-template<typename MatrixType> class Minor;
-template<typename MatrixType> class LU;
-template<typename MatrixType> class QR;
-template<typename MatrixType> class SVD;
 namespace internal {
-template<typename MatrixType, unsigned int Mode> struct eigen2_part_return_type;
-}
-#endif
-
-} // end namespace Eigen
-
-#endif // EIGEN_FORWARDDECLARATIONS_H
+template <typename Xpr>
+struct eigen_fill_helper;
+template <typename Xpr, bool use_fill = eigen_fill_helper<Xpr>::value>
+struct eigen_fill_impl;
+template <typename Xpr>
+struct eigen_memset_helper;
+template <typename Xpr, bool use_memset = eigen_memset_helper<Xpr>::value>
+struct eigen_zero_impl;
+
+template <typename Packet>
+struct has_packet_segment : std::false_type {};
+}  // namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_FORWARDDECLARATIONS_H
diff --git a/inst/include/Eigen/src/Core/util/GpuHipCudaDefines.inc b/inst/include/Eigen/src/Core/util/GpuHipCudaDefines.inc
new file mode 100644
index 00000000..4e105005
--- /dev/null
+++ b/inst/include/Eigen/src/Core/util/GpuHipCudaDefines.inc
@@ -0,0 +1,101 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H)
+#define EIGEN_CXX11_TENSOR_GPU_HIP_CUDA_DEFINES_H
+
+// Note that we are using EIGEN_USE_HIP here instead of EIGEN_HIPCC...this is by design
+// There is code in the Tensorflow codebase that will define EIGEN_USE_GPU,  but
+// for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler
+// When compiling such files, gcc will end up trying to pick up the CUDA headers by
+// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU)
+// This will obviously not work when trying to compile tensorflow on a system with no CUDA
+// To work around this issue for HIP systems (and leave the default behaviour intact), the
+// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and
+// "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is
+// defined. In continuation of that requirement, the guard here needs to be EIGEN_USE_HIP as well
+
+#if defined(EIGEN_USE_HIP)
+
+#define gpuStream_t hipStream_t
+#define gpuDeviceProp_t hipDeviceProp_t
+#define gpuError_t hipError_t
+#define gpuSuccess hipSuccess
+#define gpuErrorNotReady hipErrorNotReady
+#define gpuGetDeviceCount hipGetDeviceCount
+#define gpuGetLastError hipGetLastError
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorName hipGetErrorName
+#define gpuGetErrorString hipGetErrorString
+#define gpuGetDeviceProperties hipGetDeviceProperties
+#define gpuStreamDefault hipStreamDefault
+#define gpuGetDevice hipGetDevice
+#define gpuSetDevice hipSetDevice
+#define gpuMalloc hipMalloc
+#define gpuFree hipFree
+#define gpuMemsetAsync hipMemsetAsync
+#define gpuMemset2DAsync hipMemset2DAsync
+#define gpuMemcpyAsync hipMemcpyAsync
+#define gpuMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuStreamQuery hipStreamQuery
+#define gpuSharedMemConfig hipSharedMemConfig
+#define gpuDeviceSetSharedMemConfig hipDeviceSetSharedMemConfig
+#define gpuStreamSynchronize hipStreamSynchronize
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuMemcpy hipMemcpy
+
+#else
+
+#define gpuStream_t cudaStream_t
+#define gpuDeviceProp_t cudaDeviceProp
+#define gpuError_t cudaError_t
+#define gpuSuccess cudaSuccess
+#define gpuErrorNotReady cudaErrorNotReady
+#define gpuGetDeviceCount cudaGetDeviceCount
+#define gpuGetLastError cudaGetLastError
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorName cudaGetErrorName
+#define gpuGetErrorString cudaGetErrorString
+#define gpuGetDeviceProperties cudaGetDeviceProperties
+#define gpuStreamDefault cudaStreamDefault
+#define gpuGetDevice cudaGetDevice
+#define gpuSetDevice cudaSetDevice
+#define gpuMalloc cudaMalloc
+#define gpuFree cudaFree
+#define gpuMemsetAsync cudaMemsetAsync
+#define gpuMemset2DAsync cudaMemset2DAsync
+#define gpuMemcpyAsync cudaMemcpyAsync
+#define gpuMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuStreamQuery cudaStreamQuery
+#define gpuSharedMemConfig cudaSharedMemConfig
+#define gpuDeviceSetSharedMemConfig cudaDeviceSetSharedMemConfig
+#define gpuStreamSynchronize cudaStreamSynchronize
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuMemcpy cudaMemcpy
+
+#endif
+
+// gpu_assert can be overridden
+#ifndef gpu_assert
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+// HIPCC do not support the use of assert on the GPU side.
+#define gpu_assert(COND)
+#else
+#define gpu_assert(COND) eigen_assert(COND)
+#endif
+
+#endif  // gpu_assert
+
+#endif  // EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H
diff --git a/inst/include/Eigen/src/Core/util/GpuHipCudaUndefines.inc b/inst/include/Eigen/src/Core/util/GpuHipCudaUndefines.inc
new file mode 100644
index 00000000..342a323a
--- /dev/null
+++ b/inst/include/Eigen/src/Core/util/GpuHipCudaUndefines.inc
@@ -0,0 +1,45 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Deven Desai <deven.desai.amd@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H)
+
+#ifndef EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
+
+#undef gpuStream_t
+#undef gpuDeviceProp_t
+#undef gpuError_t
+#undef gpuSuccess
+#undef gpuErrorNotReady
+#undef gpuGetDeviceCount
+#undef gpuGetErrorString
+#undef gpuGetDeviceProperties
+#undef gpuStreamDefault
+#undef gpuGetDevice
+#undef gpuSetDevice
+#undef gpuMalloc
+#undef gpuFree
+#undef gpuMemsetAsync
+#undef gpuMemset2DAsync
+#undef gpuMemcpyAsync
+#undef gpuMemcpyDeviceToDevice
+#undef gpuMemcpyDeviceToHost
+#undef gpuMemcpyHostToDevice
+#undef gpuStreamQuery
+#undef gpuSharedMemConfig
+#undef gpuDeviceSetSharedMemConfig
+#undef gpuStreamSynchronize
+#undef gpuDeviceSynchronize
+#undef gpuMemcpy
+
+#endif  // EIGEN_PERMANENTLY_ENABLE_GPU_HIP_CUDA_DEFINES
+
+#undef EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H
+
+#endif  // EIGEN_CORE_GPU_HIP_CUDA_DEFINES_H
diff --git a/inst/include/Eigen/src/Core/util/IndexedViewHelper.h b/inst/include/Eigen/src/Core/util/IndexedViewHelper.h
new file mode 100644
index 00000000..abf4b195
--- /dev/null
+++ b/inst/include/Eigen/src/Core/util/IndexedViewHelper.h
@@ -0,0 +1,487 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INDEXED_VIEW_HELPER_H
+#define EIGEN_INDEXED_VIEW_HELPER_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+struct symbolic_last_tag {};
+
+struct all_t {};
+
+}  // namespace internal
+
+namespace placeholders {
+
+typedef symbolic::SymbolExpr<internal::symbolic_last_tag> last_t;
+
+/** \var last
+ * \ingroup Core_Module
+ *
+ * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically reference the last
+ * element/row/columns of the underlying vector or matrix once passed to DenseBase::operator()(const RowIndices&, const
+ * ColIndices&).
+ *
+ * This symbolic placeholder supports standard arithmetic operations.
+ *
+ * A typical usage example would be:
+ * \code
+ * using namespace Eigen;
+ * using Eigen::placeholders::last;
+ * VectorXd v(n);
+ * v(seq(2,last-2)).setOnes();
+ * \endcode
+ *
+ * \sa end
+ */
+static constexpr const last_t last;
+
+typedef symbolic::AddExpr<symbolic::SymbolExpr<internal::symbolic_last_tag>,
+                          symbolic::ValueExpr<Eigen::internal::FixedInt<1>>>
+    lastp1_t;
+typedef Eigen::internal::all_t all_t;
+
+/** \var lastp1
+ * \ingroup Core_Module
+ *
+ * Can be used as a parameter to Eigen::seq and Eigen::seqN functions to symbolically
+ * reference the last+1 element/row/columns of the underlying vector or matrix once
+ * passed to DenseBase::operator()(const RowIndices&, const ColIndices&).
+ *
+ * This symbolic placeholder supports standard arithmetic operations.
+ * It is essentially an alias to last+fix<1>.
+ *
+ * \sa last
+ */
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+static constexpr auto lastp1 = last + fix<1>;
+#else
+// Using a FixedExpr<1> expression is important here to make sure the compiler
+// can fully optimize the computation starting indices with zero overhead.
+static constexpr lastp1_t lastp1 = lastp1_t{};
+#endif
+
+/** \var end
+ * \ingroup Core_Module
+ * \sa lastp1
+ */
+static constexpr lastp1_t end = lastp1;
+
+/** \var all
+ * \ingroup Core_Module
+ * Can be used as a parameter to DenseBase::operator()(const RowIndices&, const ColIndices&) to index all rows or
+ * columns
+ */
+static constexpr Eigen::internal::all_t all;
+
+}  // namespace placeholders
+
+namespace internal {
+
+// Evaluate a symbolic expression or constant given the "size" of an object, allowing
+// any symbols like `last` to be evaluated.  The default here assumes a dynamic constant.
+template <typename Expr, int SizeAtCompileTime, typename EnableIf = void>
+struct SymbolicExpressionEvaluator {
+  static constexpr Index ValueAtCompileTime = Undefined;
+  static Index eval(const Expr& expr, Index /*size*/) { return static_cast<Index>(expr); }
+};
+
+// Symbolic expression with size known at compile-time.
+template <typename Expr, int SizeAtCompileTime>
+struct SymbolicExpressionEvaluator<Expr, SizeAtCompileTime, std::enable_if_t<symbolic::is_symbolic<Expr>::value>> {
+  static constexpr Index ValueAtCompileTime =
+      Expr::Derived::eval_at_compile_time(Eigen::placeholders::last = fix<SizeAtCompileTime - 1>);
+  static Index eval(const Expr& expr, Index /*size*/) {
+    return expr.eval(Eigen::placeholders::last = fix<SizeAtCompileTime - 1>);
+  }
+};
+
+// Symbolic expression with dynamic size.
+template <typename Expr>
+struct SymbolicExpressionEvaluator<Expr, Dynamic, std::enable_if_t<symbolic::is_symbolic<Expr>::value>> {
+  static constexpr Index ValueAtCompileTime = Undefined;
+  static Index eval(const Expr& expr, Index size) { return expr.eval(Eigen::placeholders::last = size - 1); }
+};
+
+// Fixed int.
+template <int N, int SizeAtCompileTime>
+struct SymbolicExpressionEvaluator<FixedInt<N>, SizeAtCompileTime, void> {
+  static constexpr Index ValueAtCompileTime = static_cast<Index>(N);
+  static Index eval(const FixedInt<N>& /*expr*/, Index /*size*/) { return ValueAtCompileTime; }
+};
+
+//--------------------------------------------------------------------------------
+// Handling of generic indices (e.g. array)
+//--------------------------------------------------------------------------------
+
+// Potentially wrap indices in a type that is better-suited for IndexedView evaluation.
+template <typename Indices, int NestedSizeAtCompileTime, typename EnableIf = void>
+struct IndexedViewHelperIndicesWrapper {
+  using type = Indices;
+  static const type& CreateIndexSequence(const Indices& indices, Index /*nested_size*/) { return indices; }
+};
+
+// Extract compile-time and runtime first, size, increments.
+template <typename Indices, typename EnableIf = void>
+struct IndexedViewHelper {
+  static constexpr Index FirstAtCompileTime = Undefined;
+  static constexpr Index SizeAtCompileTime = array_size<Indices>::value;
+  static constexpr Index IncrAtCompileTime = Undefined;
+
+  static constexpr Index first(const Indices& indices) { return static_cast<Index>(indices[0]); }
+  static constexpr Index size(const Indices& indices) { return index_list_size(indices); }
+  static constexpr Index incr(const Indices& /*indices*/) { return Undefined; }
+};
+
+//--------------------------------------------------------------------------------
+// Handling of ArithmeticSequence
+//--------------------------------------------------------------------------------
+
+template <Index FirstAtCompileTime_, Index SizeAtCompileTime_, Index IncrAtCompileTime_>
+class ArithmeticSequenceRange {
+ public:
+  static constexpr Index FirstAtCompileTime = FirstAtCompileTime_;
+  static constexpr Index SizeAtCompileTime = SizeAtCompileTime_;
+  static constexpr Index IncrAtCompileTime = IncrAtCompileTime_;
+
+  constexpr ArithmeticSequenceRange(Index first, Index size, Index incr) : first_{first}, size_{size}, incr_{incr} {}
+  constexpr Index operator[](Index i) const { return first() + i * incr(); }
+  constexpr Index first() const noexcept { return first_.value(); }
+  constexpr Index size() const noexcept { return size_.value(); }
+  constexpr Index incr() const noexcept { return incr_.value(); }
+
+ private:
+  variable_if_dynamicindex<Index, int(FirstAtCompileTime)> first_;
+  variable_if_dynamic<Index, int(SizeAtCompileTime)> size_;
+  variable_if_dynamicindex<Index, int(IncrAtCompileTime)> incr_;
+};
+
+template <typename FirstType, typename SizeType, typename IncrType, int NestedSizeAtCompileTime>
+struct IndexedViewHelperIndicesWrapper<ArithmeticSequence<FirstType, SizeType, IncrType>, NestedSizeAtCompileTime,
+                                       void> {
+  static constexpr Index EvalFirstAtCompileTime =
+      SymbolicExpressionEvaluator<FirstType, NestedSizeAtCompileTime>::ValueAtCompileTime;
+  static constexpr Index EvalSizeAtCompileTime =
+      SymbolicExpressionEvaluator<SizeType, NestedSizeAtCompileTime>::ValueAtCompileTime;
+  static constexpr Index EvalIncrAtCompileTime =
+      SymbolicExpressionEvaluator<IncrType, NestedSizeAtCompileTime>::ValueAtCompileTime;
+
+  static constexpr Index FirstAtCompileTime =
+      (int(EvalFirstAtCompileTime) == Undefined) ? Index(DynamicIndex) : EvalFirstAtCompileTime;
+  static constexpr Index SizeAtCompileTime =
+      (int(EvalSizeAtCompileTime) == Undefined) ? Index(Dynamic) : EvalSizeAtCompileTime;
+  static constexpr Index IncrAtCompileTime =
+      (int(EvalIncrAtCompileTime) == Undefined) ? Index(DynamicIndex) : EvalIncrAtCompileTime;
+
+  using Indices = ArithmeticSequence<FirstType, SizeType, IncrType>;
+  using type = ArithmeticSequenceRange<FirstAtCompileTime, SizeAtCompileTime, IncrAtCompileTime>;
+
+  static type CreateIndexSequence(const Indices& indices, Index nested_size) {
+    Index first =
+        SymbolicExpressionEvaluator<FirstType, NestedSizeAtCompileTime>::eval(indices.firstObject(), nested_size);
+    Index size =
+        SymbolicExpressionEvaluator<SizeType, NestedSizeAtCompileTime>::eval(indices.sizeObject(), nested_size);
+    Index incr =
+        SymbolicExpressionEvaluator<IncrType, NestedSizeAtCompileTime>::eval(indices.incrObject(), nested_size);
+    return type(first, size, incr);
+  }
+};
+
+template <Index FirstAtCompileTime_, Index SizeAtCompileTime_, Index IncrAtCompileTime_>
+struct IndexedViewHelper<ArithmeticSequenceRange<FirstAtCompileTime_, SizeAtCompileTime_, IncrAtCompileTime_>, void> {
+ public:
+  using Indices = ArithmeticSequenceRange<FirstAtCompileTime_, SizeAtCompileTime_, IncrAtCompileTime_>;
+  static constexpr Index FirstAtCompileTime = Indices::FirstAtCompileTime;
+  static constexpr Index SizeAtCompileTime = Indices::SizeAtCompileTime;
+  static constexpr Index IncrAtCompileTime = Indices::IncrAtCompileTime;
+  static Index first(const Indices& indices) { return indices.first(); }
+  static Index size(const Indices& indices) { return indices.size(); }
+  static Index incr(const Indices& indices) { return indices.incr(); }
+};
+
+//--------------------------------------------------------------------------------
+// Handling of a single index.
+//--------------------------------------------------------------------------------
+
+template <Index ValueAtCompileTime>
+class SingleRange {
+ public:
+  static constexpr Index FirstAtCompileTime = ValueAtCompileTime;
+  static constexpr Index SizeAtCompileTime = Index(1);
+  static constexpr Index IncrAtCompileTime = Index(1);  // Needs to be 1 to be treated as block-like.
+
+  constexpr SingleRange(Index v) noexcept : value_(v) {}
+  constexpr Index operator[](Index) const noexcept { return first(); }
+  constexpr Index first() const noexcept { return value_.value(); }
+  constexpr Index size() const noexcept { return SizeAtCompileTime; }
+  constexpr Index incr() const noexcept { return IncrAtCompileTime; }
+
+ private:
+  variable_if_dynamicindex<Index, int(ValueAtCompileTime)> value_;
+};
+
+template <typename T>
+struct is_single_range : public std::false_type {};
+
+template <Index ValueAtCompileTime>
+struct is_single_range<SingleRange<ValueAtCompileTime>> : public std::true_type {};
+
+template <typename SingleIndex, int NestedSizeAtCompileTime>
+struct IndexedViewHelperIndicesWrapper<
+    SingleIndex, NestedSizeAtCompileTime,
+    std::enable_if_t<std::is_integral<SingleIndex>::value || symbolic::is_symbolic<SingleIndex>::value>> {
+  static constexpr Index EvalValueAtCompileTime =
+      SymbolicExpressionEvaluator<SingleIndex, NestedSizeAtCompileTime>::ValueAtCompileTime;
+  static constexpr Index ValueAtCompileTime =
+      (int(EvalValueAtCompileTime) == Undefined) ? Index(DynamicIndex) : EvalValueAtCompileTime;
+  using type = SingleRange<ValueAtCompileTime>;
+  static type CreateIndexSequence(const SingleIndex& index, Index nested_size) {
+    return type(SymbolicExpressionEvaluator<SingleIndex, NestedSizeAtCompileTime>::eval(index, nested_size));
+  }
+};
+
+template <int N, int NestedSizeAtCompileTime>
+struct IndexedViewHelperIndicesWrapper<FixedInt<N>, NestedSizeAtCompileTime, void> {
+  using type = SingleRange<Index(N)>;
+  static type CreateIndexSequence(const FixedInt<N>& /*index*/) { return type(Index(N)); }
+};
+
+template <Index ValueAtCompileTime>
+struct IndexedViewHelper<SingleRange<ValueAtCompileTime>, void> {
+  using Indices = SingleRange<ValueAtCompileTime>;
+  static constexpr Index FirstAtCompileTime = Indices::FirstAtCompileTime;
+  static constexpr Index SizeAtCompileTime = Indices::SizeAtCompileTime;
+  static constexpr Index IncrAtCompileTime = Indices::IncrAtCompileTime;
+
+  static constexpr Index first(const Indices& indices) { return indices.first(); }
+  static constexpr Index size(const Indices& /*indices*/) { return SizeAtCompileTime; }
+  static constexpr Index incr(const Indices& /*indices*/) { return IncrAtCompileTime; }
+};
+
+//--------------------------------------------------------------------------------
+// Handling of all
+//--------------------------------------------------------------------------------
+
+// Convert a symbolic 'all' into a usable range type
+template <Index SizeAtCompileTime_>
+class AllRange {
+ public:
+  static constexpr Index FirstAtCompileTime = Index(0);
+  static constexpr Index SizeAtCompileTime = SizeAtCompileTime_;
+  static constexpr Index IncrAtCompileTime = Index(1);
+  constexpr AllRange(Index size) : size_(size) {}
+  constexpr Index operator[](Index i) const noexcept { return i; }
+  constexpr Index first() const noexcept { return FirstAtCompileTime; }
+  constexpr Index size() const noexcept { return size_.value(); }
+  constexpr Index incr() const noexcept { return IncrAtCompileTime; }
+
+ private:
+  variable_if_dynamic<Index, int(SizeAtCompileTime)> size_;
+};
+
+template <int NestedSizeAtCompileTime>
+struct IndexedViewHelperIndicesWrapper<all_t, NestedSizeAtCompileTime, void> {
+  using type = AllRange<Index(NestedSizeAtCompileTime)>;
+  static type CreateIndexSequence(const all_t& /*indices*/, Index nested_size) { return type(nested_size); }
+};
+
+template <Index SizeAtCompileTime_>
+struct IndexedViewHelper<AllRange<SizeAtCompileTime_>, void> {
+  using Indices = AllRange<SizeAtCompileTime_>;
+  static constexpr Index FirstAtCompileTime = Indices::FirstAtCompileTime;
+  static constexpr Index SizeAtCompileTime = Indices::SizeAtCompileTime;
+  static constexpr Index IncrAtCompileTime = Indices::IncrAtCompileTime;
+
+  static Index first(const Indices& indices) { return indices.first(); }
+  static Index size(const Indices& indices) { return indices.size(); }
+  static Index incr(const Indices& indices) { return indices.incr(); }
+};
+
+// this helper class assumes internal::valid_indexed_view_overload<RowIndices, ColIndices>::value == true
+template <typename Derived, typename RowIndices, typename ColIndices, typename EnableIf = void>
+struct IndexedViewSelector;
+
+template <typename Indices, int SizeAtCompileTime>
+using IvcType = typename internal::IndexedViewHelperIndicesWrapper<Indices, SizeAtCompileTime>::type;
+
+template <int SizeAtCompileTime, typename Indices>
+inline IvcType<Indices, SizeAtCompileTime> CreateIndexSequence(size_t size, const Indices& indices) {
+  return internal::IndexedViewHelperIndicesWrapper<Indices, SizeAtCompileTime>::CreateIndexSequence(indices, size);
+}
+
+// Generic
+template <typename Derived, typename RowIndices, typename ColIndices>
+struct IndexedViewSelector<Derived, RowIndices, ColIndices,
+                           std::enable_if_t<internal::traits<
+                               IndexedView<Derived, IvcType<RowIndices, Derived::RowsAtCompileTime>,
+                                           IvcType<ColIndices, Derived::ColsAtCompileTime>>>::ReturnAsIndexedView>> {
+  using ReturnType = IndexedView<Derived, IvcType<RowIndices, Derived::RowsAtCompileTime>,
+                                 IvcType<ColIndices, Derived::ColsAtCompileTime>>;
+  using ConstReturnType = IndexedView<const Derived, IvcType<RowIndices, Derived::RowsAtCompileTime>,
+                                      IvcType<ColIndices, Derived::ColsAtCompileTime>>;
+
+  static inline ReturnType run(Derived& derived, const RowIndices& rowIndices, const ColIndices& colIndices) {
+    return ReturnType(derived, CreateIndexSequence<Derived::RowsAtCompileTime>(derived.rows(), rowIndices),
+                      CreateIndexSequence<Derived::ColsAtCompileTime>(derived.cols(), colIndices));
+  }
+  static inline ConstReturnType run(const Derived& derived, const RowIndices& rowIndices,
+                                    const ColIndices& colIndices) {
+    return ConstReturnType(derived, CreateIndexSequence<Derived::RowsAtCompileTime>(derived.rows(), rowIndices),
+                           CreateIndexSequence<Derived::ColsAtCompileTime>(derived.cols(), colIndices));
+  }
+};
+
+// Block
+template <typename Derived, typename RowIndices, typename ColIndices>
+struct IndexedViewSelector<
+    Derived, RowIndices, ColIndices,
+    std::enable_if_t<internal::traits<IndexedView<Derived, IvcType<RowIndices, Derived::RowsAtCompileTime>,
+                                                  IvcType<ColIndices, Derived::ColsAtCompileTime>>>::ReturnAsBlock>> {
+  using ActualRowIndices = IvcType<RowIndices, Derived::RowsAtCompileTime>;
+  using ActualColIndices = IvcType<ColIndices, Derived::ColsAtCompileTime>;
+  using IndexedViewType = IndexedView<Derived, ActualRowIndices, ActualColIndices>;
+  using ConstIndexedViewType = IndexedView<const Derived, ActualRowIndices, ActualColIndices>;
+  using ReturnType = typename internal::traits<IndexedViewType>::BlockType;
+  using ConstReturnType = typename internal::traits<ConstIndexedViewType>::BlockType;
+  using RowHelper = internal::IndexedViewHelper<ActualRowIndices>;
+  using ColHelper = internal::IndexedViewHelper<ActualColIndices>;
+
+  static inline ReturnType run(Derived& derived, const RowIndices& rowIndices, const ColIndices& colIndices) {
+    auto actualRowIndices = CreateIndexSequence<Derived::RowsAtCompileTime>(derived.rows(), rowIndices);
+    auto actualColIndices = CreateIndexSequence<Derived::ColsAtCompileTime>(derived.cols(), colIndices);
+    return ReturnType(derived, RowHelper::first(actualRowIndices), ColHelper::first(actualColIndices),
+                      RowHelper::size(actualRowIndices), ColHelper::size(actualColIndices));
+  }
+  static inline ConstReturnType run(const Derived& derived, const RowIndices& rowIndices,
+                                    const ColIndices& colIndices) {
+    auto actualRowIndices = CreateIndexSequence<Derived::RowsAtCompileTime>(derived.rows(), rowIndices);
+    auto actualColIndices = CreateIndexSequence<Derived::ColsAtCompileTime>(derived.cols(), colIndices);
+    return ConstReturnType(derived, RowHelper::first(actualRowIndices), ColHelper::first(actualColIndices),
+                           RowHelper::size(actualRowIndices), ColHelper::size(actualColIndices));
+  }
+};
+
+// Scalar
+template <typename Derived, typename RowIndices, typename ColIndices>
+struct IndexedViewSelector<
+    Derived, RowIndices, ColIndices,
+    std::enable_if_t<internal::traits<IndexedView<Derived, IvcType<RowIndices, Derived::RowsAtCompileTime>,
+                                                  IvcType<ColIndices, Derived::ColsAtCompileTime>>>::ReturnAsScalar>> {
+  using ReturnType = typename DenseBase<Derived>::Scalar&;
+  using ConstReturnType = typename DenseBase<Derived>::CoeffReturnType;
+  using ActualRowIndices = IvcType<RowIndices, Derived::RowsAtCompileTime>;
+  using ActualColIndices = IvcType<ColIndices, Derived::ColsAtCompileTime>;
+  using RowHelper = internal::IndexedViewHelper<ActualRowIndices>;
+  using ColHelper = internal::IndexedViewHelper<ActualColIndices>;
+  static inline ReturnType run(Derived& derived, const RowIndices& rowIndices, const ColIndices& colIndices) {
+    auto actualRowIndices = CreateIndexSequence<Derived::RowsAtCompileTime>(derived.rows(), rowIndices);
+    auto actualColIndices = CreateIndexSequence<Derived::ColsAtCompileTime>(derived.cols(), colIndices);
+    return derived(RowHelper::first(actualRowIndices), ColHelper::first(actualColIndices));
+  }
+  static inline ConstReturnType run(const Derived& derived, const RowIndices& rowIndices,
+                                    const ColIndices& colIndices) {
+    auto actualRowIndices = CreateIndexSequence<Derived::RowsAtCompileTime>(derived.rows(), rowIndices);
+    auto actualColIndices = CreateIndexSequence<Derived::ColsAtCompileTime>(derived.cols(), colIndices);
+    return derived(RowHelper::first(actualRowIndices), ColHelper::first(actualColIndices));
+  }
+};
+
+// this helper class assumes internal::is_valid_index_type<Indices>::value == false
+template <typename Derived, typename Indices, typename EnableIf = void>
+struct VectorIndexedViewSelector;
+
+// Generic
+template <typename Derived, typename Indices>
+struct VectorIndexedViewSelector<
+    Derived, Indices,
+    std::enable_if_t<!internal::is_single_range<IvcType<Indices, Derived::SizeAtCompileTime>>::value &&
+                     internal::IndexedViewHelper<IvcType<Indices, Derived::SizeAtCompileTime>>::IncrAtCompileTime !=
+                         1>> {
+  static constexpr bool IsRowMajor = DenseBase<Derived>::IsRowMajor;
+  using ZeroIndex = internal::SingleRange<Index(0)>;
+  using RowMajorReturnType = IndexedView<Derived, ZeroIndex, IvcType<Indices, Derived::SizeAtCompileTime>>;
+  using ConstRowMajorReturnType = IndexedView<const Derived, ZeroIndex, IvcType<Indices, Derived::SizeAtCompileTime>>;
+
+  using ColMajorReturnType = IndexedView<Derived, IvcType<Indices, Derived::SizeAtCompileTime>, ZeroIndex>;
+  using ConstColMajorReturnType = IndexedView<const Derived, IvcType<Indices, Derived::SizeAtCompileTime>, ZeroIndex>;
+
+  using ReturnType = typename internal::conditional<IsRowMajor, RowMajorReturnType, ColMajorReturnType>::type;
+  using ConstReturnType =
+      typename internal::conditional<IsRowMajor, ConstRowMajorReturnType, ConstColMajorReturnType>::type;
+
+  template <bool UseRowMajor = IsRowMajor, std::enable_if_t<UseRowMajor, bool> = true>
+  static inline RowMajorReturnType run(Derived& derived, const Indices& indices) {
+    return RowMajorReturnType(derived, ZeroIndex(0),
+                              CreateIndexSequence<Derived::ColsAtCompileTime>(derived.cols(), indices));
+  }
+  template <bool UseRowMajor = IsRowMajor, std::enable_if_t<UseRowMajor, bool> = true>
+  static inline ConstRowMajorReturnType run(const Derived& derived, const Indices& indices) {
+    return ConstRowMajorReturnType(derived, ZeroIndex(0),
+                                   CreateIndexSequence<Derived::ColsAtCompileTime>(derived.cols(), indices));
+  }
+  template <bool UseRowMajor = IsRowMajor, std::enable_if_t<!UseRowMajor, bool> = true>
+  static inline ColMajorReturnType run(Derived& derived, const Indices& indices) {
+    return ColMajorReturnType(derived, CreateIndexSequence<Derived::RowsAtCompileTime>(derived.rows(), indices),
+                              ZeroIndex(0));
+  }
+  template <bool UseRowMajor = IsRowMajor, std::enable_if_t<!UseRowMajor, bool> = true>
+  static inline ConstColMajorReturnType run(const Derived& derived, const Indices& indices) {
+    return ConstColMajorReturnType(derived, CreateIndexSequence<Derived::RowsAtCompileTime>(derived.rows(), indices),
+                                   ZeroIndex(0));
+  }
+};
+
+// Block
+template <typename Derived, typename Indices>
+struct VectorIndexedViewSelector<
+    Derived, Indices,
+    std::enable_if_t<!internal::is_single_range<IvcType<Indices, Derived::SizeAtCompileTime>>::value &&
+                     internal::IndexedViewHelper<IvcType<Indices, Derived::SizeAtCompileTime>>::IncrAtCompileTime ==
+                         1>> {
+  using Helper = internal::IndexedViewHelper<IvcType<Indices, Derived::SizeAtCompileTime>>;
+  using ReturnType = VectorBlock<Derived, Helper::SizeAtCompileTime>;
+  using ConstReturnType = VectorBlock<const Derived, Helper::SizeAtCompileTime>;
+  static inline ReturnType run(Derived& derived, const Indices& indices) {
+    auto actualIndices = CreateIndexSequence<Derived::SizeAtCompileTime>(derived.size(), indices);
+    return ReturnType(derived, Helper::first(actualIndices), Helper::size(actualIndices));
+  }
+  static inline ConstReturnType run(const Derived& derived, const Indices& indices) {
+    auto actualIndices = CreateIndexSequence<Derived::SizeAtCompileTime>(derived.size(), indices);
+    return ConstReturnType(derived, Helper::first(actualIndices), Helper::size(actualIndices));
+  }
+};
+
+// Symbolic
+template <typename Derived, typename Indices>
+struct VectorIndexedViewSelector<
+    Derived, Indices,
+    std::enable_if_t<internal::is_single_range<IvcType<Indices, Derived::SizeAtCompileTime>>::value>> {
+  using ReturnType = typename DenseBase<Derived>::Scalar&;
+  using ConstReturnType = typename DenseBase<Derived>::CoeffReturnType;
+  using Helper = internal::IndexedViewHelper<IvcType<Indices, Derived::SizeAtCompileTime>>;
+  static inline ReturnType run(Derived& derived, const Indices& indices) {
+    auto actualIndices = CreateIndexSequence<Derived::SizeAtCompileTime>(derived.size(), indices);
+    return derived(Helper::first(actualIndices));
+  }
+  static inline ConstReturnType run(const Derived& derived, const Indices& indices) {
+    auto actualIndices = CreateIndexSequence<Derived::SizeAtCompileTime>(derived.size(), indices);
+    return derived(Helper::first(actualIndices));
+  }
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_INDEXED_VIEW_HELPER_H
diff --git a/inst/include/Eigen/src/Core/util/IntegralConstant.h b/inst/include/Eigen/src/Core/util/IntegralConstant.h
new file mode 100644
index 00000000..53fabd59
--- /dev/null
+++ b/inst/include/Eigen/src/Core/util/IntegralConstant.h
@@ -0,0 +1,279 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INTEGRAL_CONSTANT_H
+#define EIGEN_INTEGRAL_CONSTANT_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <int N>
+class FixedInt;
+template <int N>
+class VariableAndFixedInt;
+
+/** \internal
+ * \class FixedInt
+ *
+ * This class embeds a compile-time integer \c N.
+ *
+ * It is similar to c++11 std::integral_constant<int,N> but with some additional features
+ * such as:
+ *  - implicit conversion to int
+ *  - arithmetic and some bitwise operators: -, +, *, /, %, &, |
+ *  - c++98/14 compatibility with fix<N> and fix<N>() syntax to define integral constants.
+ *
+ * It is strongly discouraged to directly deal with this class FixedInt. Instances are expected to
+ * be created by the user using Eigen::fix<N> or Eigen::fix<N>().
+ * \code
+ * internal::cleanup_index_type<T>::type
+ * internal::cleanup_index_type<T,DynamicKey>::type
+ * \endcode
+ * where T can a FixedInt<N>, a pointer to function FixedInt<N> (*)(), or numerous other integer-like representations.
+ * \c DynamicKey is either Dynamic (default) or DynamicIndex and used to identify true compile-time values.
+ *
+ * For convenience, you can extract the compile-time value \c N in a generic way using the following helper:
+ * \code
+ * internal::get_fixed_value<T,DefaultVal>::value
+ * \endcode
+ * that will give you \c N if T equals FixedInt<N> or FixedInt<N> (*)(), and \c DefaultVal if T does not embed any
+ * compile-time value (e.g., T==int).
+ *
+ * \sa fix<N>, class VariableAndFixedInt
+ */
+template <int N>
+class FixedInt {
+ public:
+  static constexpr int value = N;
+  constexpr operator int() const { return N; }
+
+  constexpr FixedInt() = default;
+  constexpr FixedInt(std::integral_constant<int, N>) {}
+
+  constexpr FixedInt(VariableAndFixedInt<N> other) {
+#ifndef EIGEN_INTERNAL_DEBUGGING
+    EIGEN_UNUSED_VARIABLE(other);
+#endif
+    eigen_internal_assert(int(other) == N);
+  }
+
+  constexpr FixedInt<-N> operator-() const { return FixedInt<-N>(); }
+
+  template <int M>
+  constexpr FixedInt<N + M> operator+(FixedInt<M>) const {
+    return FixedInt<N + M>();
+  }
+
+  template <int M>
+  constexpr FixedInt<N - M> operator-(FixedInt<M>) const {
+    return FixedInt<N - M>();
+  }
+
+  template <int M>
+  constexpr FixedInt<N * M> operator*(FixedInt<M>) const {
+    return FixedInt<N * M>();
+  }
+
+  template <int M>
+  constexpr FixedInt<N / M> operator/(FixedInt<M>) const {
+    return FixedInt<N / M>();
+  }
+
+  template <int M>
+  constexpr FixedInt<N % M> operator%(FixedInt<M>) const {
+    return FixedInt<N % M>();
+  }
+
+  template <int M>
+  constexpr FixedInt<N | M> operator|(FixedInt<M>) const {
+    return FixedInt<N | M>();
+  }
+
+  template <int M>
+  constexpr FixedInt<N & M> operator&(FixedInt<M>) const {
+    return FixedInt<N & M>();
+  }
+
+  // Needed in C++14 to allow fix<N>():
+  constexpr FixedInt operator()() const { return *this; }
+
+  constexpr VariableAndFixedInt<N> operator()(int val) const { return VariableAndFixedInt<N>(val); }
+};
+
+/** \internal
+ * \class VariableAndFixedInt
+ *
+ * This class embeds both a compile-time integer \c N and a runtime integer.
+ * Both values are supposed to be equal unless the compile-time value \c N has a special
+ * value meaning that the runtime-value should be used. Depending on the context, this special
+ * value can be either Eigen::Dynamic (for positive quantities) or Eigen::DynamicIndex (for
+ * quantities that can be negative).
+ *
+ * It is the return-type of the function Eigen::fix<N>(int), and most of the time this is the only
+ * way it is used. It is strongly discouraged to directly deal with instances of VariableAndFixedInt.
+ * Indeed, in order to write generic code, it is the responsibility of the callee to properly convert
+ * it to either a true compile-time quantity (i.e. a FixedInt<N>), or to a runtime quantity (e.g., an Index)
+ * using the following generic helper:
+ * \code
+ * internal::cleanup_index_type<T>::type
+ * internal::cleanup_index_type<T,DynamicKey>::type
+ * \endcode
+ * where T can be a template instantiation of VariableAndFixedInt or numerous other integer-like representations.
+ * \c DynamicKey is either Dynamic (default) or DynamicIndex and used to identify true compile-time values.
+ *
+ * For convenience, you can also extract the compile-time value \c N using the following helper:
+ * \code
+ * internal::get_fixed_value<T,DefaultVal>::value
+ * \endcode
+ * that will give you \c N if T equals VariableAndFixedInt<N>, and \c DefaultVal if T does not embed any compile-time
+ * value (e.g., T==int).
+ *
+ * \sa fix<N>(int), class FixedInt
+ */
+template <int N>
+class VariableAndFixedInt {
+ public:
+  static const int value = N;
+  operator int() const { return m_value; }
+  VariableAndFixedInt(int val) { m_value = val; }
+
+ protected:
+  int m_value;
+};
+
+template <typename T, int Default = Dynamic>
+struct get_fixed_value {
+  static const int value = Default;
+};
+
+template <int N, int Default>
+struct get_fixed_value<FixedInt<N>, Default> {
+  static const int value = N;
+};
+
+template <int N, int Default>
+struct get_fixed_value<VariableAndFixedInt<N>, Default> {
+  static const int value = N;
+};
+
+template <typename T, int N, int Default>
+struct get_fixed_value<variable_if_dynamic<T, N>, Default> {
+  static const int value = N;
+};
+
+template <typename T>
+EIGEN_DEVICE_FUNC Index get_runtime_value(const T &x) {
+  return x;
+}
+
+// Cleanup integer/FixedInt/VariableAndFixedInt/etc types:
+
+// By default, no cleanup:
+template <typename T, int DynamicKey = Dynamic, typename EnableIf = void>
+struct cleanup_index_type {
+  typedef T type;
+};
+
+// Convert any integral type (e.g., short, int, unsigned int, etc.) to Eigen::Index
+template <typename T, int DynamicKey>
+struct cleanup_index_type<T, DynamicKey, std::enable_if_t<internal::is_integral<T>::value>> {
+  typedef Index type;
+};
+
+// If VariableAndFixedInt does not match DynamicKey, then we turn it to a pure compile-time value:
+template <int N, int DynamicKey>
+struct cleanup_index_type<VariableAndFixedInt<N>, DynamicKey> {
+  typedef FixedInt<N> type;
+};
+// If VariableAndFixedInt matches DynamicKey, then we turn it to a pure runtime-value (aka Index):
+template <int DynamicKey>
+struct cleanup_index_type<VariableAndFixedInt<DynamicKey>, DynamicKey> {
+  typedef Index type;
+};
+
+template <int N, int DynamicKey>
+struct cleanup_index_type<std::integral_constant<int, N>, DynamicKey> {
+  typedef FixedInt<N> type;
+};
+
+}  // end namespace internal
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+template <int N>
+constexpr internal::FixedInt<N> fix{};
+
+#else  // EIGEN_PARSED_BY_DOXYGEN
+
+/** \var fix<N>()
+ * \ingroup Core_Module
+ *
+ * This \em identifier permits to construct an object embedding a compile-time integer \c N.
+ *
+ * \tparam N the compile-time integer value
+ *
+ * It is typically used in conjunction with the Eigen::seq and Eigen::seqN functions to pass compile-time values to
+ * them: \code seqN(10,fix<4>,fix<-3>)   // <=> [10 7 4 1] \endcode
+ *
+ * See also the function fix(int) to pass both a compile-time and runtime value.
+ *
+ * In c++14, it is implemented as:
+ * \code
+ * template<int N> static const internal::FixedInt<N> fix{};
+ * \endcode
+ * where internal::FixedInt<N> is an internal template class similar to
+ * <a href="http://en.cppreference.com/w/cpp/types/integral_constant">\c std::integral_constant </a><tt> <int,N> </tt>
+ * Here, \c fix<N> is thus an object of type \c internal::FixedInt<N>.
+ *
+ * \sa fix<N>(int), seq, seqN
+ */
+template <int N>
+static const auto fix();
+
+/** \fn fix<N>(int)
+ * \ingroup Core_Module
+ *
+ * This function returns an object embedding both a compile-time integer \c N, and a fallback runtime value \a val.
+ *
+ * \tparam N the compile-time integer value
+ * \param  val the fallback runtime integer value
+ *
+ * This function is a more general version of the \ref fix identifier/function that can be used in template code
+ * where the compile-time value could turn out to actually mean "undefined at compile-time". For positive integers
+ * such as a size or a dimension, this case is identified by Eigen::Dynamic, whereas runtime signed integers
+ * (e.g., an increment/stride) are identified as Eigen::DynamicIndex. In such a case, the runtime value \a val
+ * will be used as a fallback.
+ *
+ * A typical use case would be:
+ * \code
+ * template<typename Derived> void foo(const MatrixBase<Derived> &mat) {
+ *   const int N = Derived::RowsAtCompileTime==Dynamic ? Dynamic : Derived::RowsAtCompileTime/2;
+ *   const int n = mat.rows()/2;
+ *   ... mat( seqN(0,fix<N>(n) ) ...;
+ * }
+ * \endcode
+ * In this example, the function Eigen::seqN knows that the second argument is expected to be a size.
+ * If the passed compile-time value N equals Eigen::Dynamic, then the proxy object returned by fix will be dismissed,
+ * and converted to an Eigen::Index of value \c n. Otherwise, the runtime-value \c n will be dismissed, and the
+ * returned ArithmeticSequence will be of the exact same type as <tt> seqN(0,fix<N>) </tt>.
+ *
+ * \sa fix, seqN, class ArithmeticSequence
+ */
+template <int N>
+static const auto fix(int val);
+
+#endif  // EIGEN_PARSED_BY_DOXYGEN
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_INTEGRAL_CONSTANT_H
diff --git a/inst/include/Eigen/src/Core/util/MKL_support.h b/inst/include/Eigen/src/Core/util/MKL_support.h
index 1ef3b61d..3e44a265 100644
--- a/inst/include/Eigen/src/Core/util/MKL_support.h
+++ b/inst/include/Eigen/src/Core/util/MKL_support.h
@@ -34,46 +34,53 @@
 #define EIGEN_MKL_SUPPORT_H
 
 #ifdef EIGEN_USE_MKL_ALL
-  #ifndef EIGEN_USE_BLAS
-    #define EIGEN_USE_BLAS
-  #endif
-  #ifndef EIGEN_USE_LAPACKE
-    #define EIGEN_USE_LAPACKE
-  #endif
-  #ifndef EIGEN_USE_MKL_VML
-    #define EIGEN_USE_MKL_VML
-  #endif
+#ifndef EIGEN_USE_BLAS
+#define EIGEN_USE_BLAS
+#endif
+#ifndef EIGEN_USE_LAPACKE
+#define EIGEN_USE_LAPACKE
+#endif
+#ifndef EIGEN_USE_MKL_VML
+#define EIGEN_USE_MKL_VML
+#endif
 #endif
 
 #ifdef EIGEN_USE_LAPACKE_STRICT
-  #define EIGEN_USE_LAPACKE
+#define EIGEN_USE_LAPACKE
 #endif
 
-#if defined(EIGEN_USE_BLAS) || defined(EIGEN_USE_LAPACKE) || defined(EIGEN_USE_MKL_VML)
-  #define EIGEN_USE_MKL
+#if defined(EIGEN_USE_MKL_VML) && !defined(EIGEN_USE_MKL)
+#define EIGEN_USE_MKL
 #endif
 
 #if defined EIGEN_USE_MKL
-#   include <mkl.h> 
+#if (!defined MKL_DIRECT_CALL) && (!defined EIGEN_MKL_NO_DIRECT_CALL)
+#define MKL_DIRECT_CALL
+#define MKL_DIRECT_CALL_JUST_SET
+#endif
+#include <mkl.h>
 /*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/
-#   ifndef INTEL_MKL_VERSION
-#       undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */
-#   elif INTEL_MKL_VERSION < 100305    /* the intel-mkl-103-release-notes say this was when the lapacke.h interface was added*/
-#       undef EIGEN_USE_MKL
-#   endif
-#   ifndef EIGEN_USE_MKL
-    /*If the MKL version is too old, undef everything*/
-#       undef   EIGEN_USE_MKL_ALL
-#       undef   EIGEN_USE_BLAS
-#       undef   EIGEN_USE_LAPACKE
-#       undef   EIGEN_USE_MKL_VML
-#       undef   EIGEN_USE_LAPACKE_STRICT
-#       undef   EIGEN_USE_LAPACKE
-#   endif
+#ifndef INTEL_MKL_VERSION
+#undef EIGEN_USE_MKL /* INTEL_MKL_VERSION is not even defined on older versions */
+#elif INTEL_MKL_VERSION < \
+    100305 /* the intel-mkl-103-release-notes say this was when the lapacke.h interface was added*/
+#undef EIGEN_USE_MKL
+#endif
+#ifndef EIGEN_USE_MKL
+/*If the MKL version is too old, undef everything*/
+#undef EIGEN_USE_MKL_ALL
+#undef EIGEN_USE_LAPACKE
+#undef EIGEN_USE_MKL_VML
+#undef EIGEN_USE_LAPACKE_STRICT
+#undef EIGEN_USE_LAPACKE
+#ifdef MKL_DIRECT_CALL_JUST_SET
+#undef MKL_DIRECT_CALL
+#endif
+#endif
 #endif
 
 #if defined EIGEN_USE_MKL
-#include <mkl_lapacke.h>
+
 #define EIGEN_MKL_VML_THRESHOLD 128
 
 /* MKL_DOMAIN_BLAS, etc are defined only in 10.3 update 7 */
@@ -107,52 +114,26 @@
 #else
 #define EIGEN_MKL_DOMAIN_PARDISO MKL_PARDISO
 #endif
+#endif
 
-namespace Eigen {
-
-typedef std::complex<double> dcomplex;
-typedef std::complex<float>  scomplex;
-
-namespace internal {
-
-template<typename MKLType, typename EigenType>
-static inline void assign_scalar_eig2mkl(MKLType& mklScalar, const EigenType& eigenScalar) {
-  mklScalar=eigenScalar;
-}
-
-template<typename MKLType, typename EigenType>
-static inline void assign_conj_scalar_eig2mkl(MKLType& mklScalar, const EigenType& eigenScalar) {
-  mklScalar=eigenScalar;
-}
-
-template <>
-inline void assign_scalar_eig2mkl<MKL_Complex16,dcomplex>(MKL_Complex16& mklScalar, const dcomplex& eigenScalar) {
-  mklScalar.real=eigenScalar.real();
-  mklScalar.imag=eigenScalar.imag();
-}
-
-template <>
-inline void assign_scalar_eig2mkl<MKL_Complex8,scomplex>(MKL_Complex8& mklScalar, const scomplex& eigenScalar) {
-  mklScalar.real=eigenScalar.real();
-  mklScalar.imag=eigenScalar.imag();
-}
-
-template <>
-inline void assign_conj_scalar_eig2mkl<MKL_Complex16,dcomplex>(MKL_Complex16& mklScalar, const dcomplex& eigenScalar) {
-  mklScalar.real=eigenScalar.real();
-  mklScalar.imag=-eigenScalar.imag();
-}
+#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL)
+#include "../../misc/blas.h"
+#endif
 
-template <>
-inline void assign_conj_scalar_eig2mkl<MKL_Complex8,scomplex>(MKL_Complex8& mklScalar, const scomplex& eigenScalar) {
-  mklScalar.real=eigenScalar.real();
-  mklScalar.imag=-eigenScalar.imag();
-}
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
 
-} // end namespace internal
+namespace Eigen {
 
-} // end namespace Eigen
+typedef std::complex<double> dcomplex;
+typedef std::complex<float> scomplex;
 
+#if defined(EIGEN_USE_MKL)
+typedef MKL_INT BlasIndex;
+#else
+typedef int BlasIndex;
 #endif
 
-#endif // EIGEN_MKL_SUPPORT_H
+}  // end namespace Eigen
+
+#endif  // EIGEN_MKL_SUPPORT_H
diff --git a/inst/include/Eigen/src/Core/util/Macros.h b/inst/include/Eigen/src/Core/util/Macros.h
index 53fb5fae..dad36716 100644
--- a/inst/include/Eigen/src/Core/util/Macros.h
+++ b/inst/include/Eigen/src/Core/util/Macros.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -10,127 +10,841 @@
 
 #ifndef EIGEN_MACROS_H
 #define EIGEN_MACROS_H
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
 
-#define EIGEN_WORLD_VERSION 3
-#define EIGEN_MAJOR_VERSION 2
-#define EIGEN_MINOR_VERSION 7
+//------------------------------------------------------------------------------------------
+// Eigen version and basic defaults
+//------------------------------------------------------------------------------------------
 
-#define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \
-                                      (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \
-                                                                 EIGEN_MINOR_VERSION>=z))))
-#ifdef __GNUC__
-  #define EIGEN_GNUC_AT_LEAST(x,y) ((__GNUC__==x && __GNUC_MINOR__>=y) || __GNUC__>x)
+#define EIGEN_VERSION_AT_LEAST(x, y, z) \
+  (EIGEN_MAJOR_VERSION > x ||           \
+   (EIGEN_MAJOR_VERSION >= x && (EIGEN_MINOR_VERSION > y || (EIGEN_MINOR_VERSION >= y && EIGEN_PATCH_VERSION >= z))))
+
+#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::RowMajor
 #else
-  #define EIGEN_GNUC_AT_LEAST(x,y) 0
+#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION Eigen::ColMajor
+#endif
+
+#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t
+#endif
+
+// Upperbound on the C++ version to use.
+// Expected values are 03, 11, 14, 17, etc.
+// By default, let's use an arbitrarily large C++ version.
+#ifndef EIGEN_MAX_CPP_VER
+#define EIGEN_MAX_CPP_VER 99
+#endif
+
+/** Allows to disable some optimizations which might affect the accuracy of the result.
+ * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them.
+ * They currently include:
+ *   - single precision ArrayBase::sin() and ArrayBase::cos() for SSE and AVX vectorization.
+ */
+#ifndef EIGEN_FAST_MATH
+#define EIGEN_FAST_MATH 1
+#endif
+
+#ifndef EIGEN_STACK_ALLOCATION_LIMIT
+// 131072 == 128 KB
+#define EIGEN_STACK_ALLOCATION_LIMIT 131072
 #endif
- 
+
+/* Specify whether to use std::fma for scalar multiply-add instructions.
+ *
+ * On machines that have FMA as a single instruction, this will generally
+ * improve precision without significant performance implications.
+ *
+ * Without a single instruction, performance has been found to be reduced 2-3x
+ * on Intel CPUs, and up to 30x for WASM.
+ *
+ * If unspecified, defaults to using FMA if hardware support is available.
+ * The default should be used in most cases to ensure consistency between
+ * vectorized and non-vectorized paths.
+ */
+#ifndef EIGEN_SCALAR_MADD_USE_FMA
+#ifdef EIGEN_VECTORIZE_FMA
+#define EIGEN_SCALAR_MADD_USE_FMA 1
+#else
+#define EIGEN_SCALAR_MADD_USE_FMA 0
+#endif
+#endif
+
+//------------------------------------------------------------------------------------------
+// Compiler identification, EIGEN_COMP_*
+//------------------------------------------------------------------------------------------
+
+/// \internal EIGEN_COMP_GNUC set to version (e.g., 951 for GCC 9.5.1) for all compilers compatible with GCC
 #ifdef __GNUC__
-  #define EIGEN_GNUC_AT_MOST(x,y) ((__GNUC__==x && __GNUC_MINOR__<=y) || __GNUC__<x)
+#define EIGEN_COMP_GNUC (__GNUC__ * 100 + __GNUC_MINOR__ * 10 + __GNUC_PATCHLEVEL__)
 #else
-  #define EIGEN_GNUC_AT_MOST(x,y) 0
+#define EIGEN_COMP_GNUC 0
 #endif
 
-#if EIGEN_GNUC_AT_MOST(4,3) && !defined(__clang__)
-  // see bug 89
-  #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 0
+/// \internal EIGEN_COMP_CLANG set to version (e.g., 372 for clang 3.7.2) if the compiler is clang
+#if defined(__clang__)
+#define EIGEN_COMP_CLANG (__clang_major__ * 100 + __clang_minor__ * 10 + __clang_patchlevel__)
 #else
-  #define EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO 1
+#define EIGEN_COMP_CLANG 0
 #endif
 
-#if defined(__GNUC__) && (__GNUC__ <= 3)
-#define EIGEN_GCC3_OR_OLDER 1
+/// \internal EIGEN_COMP_CLANGAPPLE set to the version number (e.g. 9000000 for AppleClang 9.0) if the compiler is
+/// AppleClang
+#if defined(__clang__) && defined(__apple_build_version__)
+#define EIGEN_COMP_CLANGAPPLE __apple_build_version__
 #else
-#define EIGEN_GCC3_OR_OLDER 0
+#define EIGEN_COMP_CLANGAPPLE 0
 #endif
 
-// 16 byte alignment is only useful for vectorization. Since it affects the ABI, we need to enable
-// 16 byte alignment on all platforms where vectorization might be enabled. In theory we could always
-// enable alignment, but it can be a cause of problems on some platforms, so we just disable it in
-// certain common platform (compiler+architecture combinations) to avoid these problems.
-// Only static alignment is really problematic (relies on nonstandard compiler extensions that don't
-// work everywhere, for example don't work on GCC/ARM), try to keep heap alignment even
-// when we have to disable static alignment.
-#if defined(__GNUC__) && !(defined(__i386__) || defined(__x86_64__) || defined(__powerpc__) || defined(__ppc__) || defined(__ia64__))
-#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 1
+/// \internal EIGEN_COMP_CASTXML set to 1 if being preprocessed by CastXML
+#if defined(__castxml__)
+#define EIGEN_COMP_CASTXML 1
 #else
-#define EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT 0
+#define EIGEN_COMP_CASTXML 0
 #endif
 
-// static alignment is completely disabled with GCC 3, Sun Studio, and QCC/QNX
-#if !EIGEN_GCC_AND_ARCH_DOESNT_WANT_STACK_ALIGNMENT \
- && !EIGEN_GCC3_OR_OLDER \
- && !defined(__SUNPRO_CC) \
- && !defined(__QNXNTO__)
-  #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 1
+/// \internal EIGEN_COMP_LLVM set to 1 if the compiler backend is llvm
+#if defined(__llvm__)
+#define EIGEN_COMP_LLVM 1
 #else
-  #define EIGEN_ARCH_WANTS_STACK_ALIGNMENT 0
+#define EIGEN_COMP_LLVM 0
 #endif
 
-#ifdef EIGEN_DONT_ALIGN
-  #ifndef EIGEN_DONT_ALIGN_STATICALLY
-    #define EIGEN_DONT_ALIGN_STATICALLY
-  #endif
-  #define EIGEN_ALIGN 0
+/// \internal EIGEN_COMP_ICC set to __INTEL_COMPILER if the compiler is Intel icc compiler, 0 otherwise
+#if defined(__INTEL_COMPILER)
+#define EIGEN_COMP_ICC __INTEL_COMPILER
 #else
-  #define EIGEN_ALIGN 1
+#define EIGEN_COMP_ICC 0
 #endif
 
-// EIGEN_ALIGN_STATICALLY is the true test whether we want to align arrays on the stack or not. It takes into account both the user choice to explicitly disable
-// alignment (EIGEN_DONT_ALIGN_STATICALLY) and the architecture config (EIGEN_ARCH_WANTS_STACK_ALIGNMENT). Henceforth, only EIGEN_ALIGN_STATICALLY should be used.
-#if EIGEN_ARCH_WANTS_STACK_ALIGNMENT && !defined(EIGEN_DONT_ALIGN_STATICALLY)
-  #define EIGEN_ALIGN_STATICALLY 1
+/// \internal EIGEN_COMP_CLANGICC set to __INTEL_CLANG_COMPILER if the compiler is Intel icx compiler, 0 otherwise
+#if defined(__INTEL_CLANG_COMPILER)
+#define EIGEN_COMP_CLANGICC __INTEL_CLANG_COMPILER
 #else
-  #define EIGEN_ALIGN_STATICALLY 0
-  #ifndef EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
-    #define EIGEN_DISABLE_UNALIGNED_ARRAY_ASSERT
-  #endif
+#define EIGEN_COMP_CLANGICC 0
 #endif
 
-#ifdef EIGEN_DEFAULT_TO_ROW_MAJOR
-#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION RowMajor
+/// \internal EIGEN_COMP_MINGW set to 1 if the compiler is mingw
+#if defined(__MINGW32__)
+#define EIGEN_COMP_MINGW 1
 #else
-#define EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ColMajor
+#define EIGEN_COMP_MINGW 0
 #endif
 
-#ifndef EIGEN_DEFAULT_DENSE_INDEX_TYPE
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE std::ptrdiff_t
+/// \internal EIGEN_COMP_SUNCC set to 1 if the compiler is Solaris Studio
+#if defined(__SUNPRO_CC)
+#define EIGEN_COMP_SUNCC 1
+#else
+#define EIGEN_COMP_SUNCC 0
+#endif
+
+/// \internal EIGEN_COMP_MSVC set to _MSC_VER if the compiler is Microsoft Visual C++, 0 otherwise.
+#if defined(_MSC_VER)
+#define EIGEN_COMP_MSVC _MSC_VER
+#else
+#define EIGEN_COMP_MSVC 0
+#endif
+
+#if defined(__NVCC__)
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+#define EIGEN_COMP_NVCC ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
+#elif defined(__CUDACC_VER__)
+#define EIGEN_COMP_NVCC __CUDACC_VER__
+#else
+#error "NVCC did not define compiler version."
+#endif
+#else
+#define EIGEN_COMP_NVCC 0
+#endif
+
+// For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC:
+//  name        ver   MSC_VER
+//  2015        14      1900
+//  "15"        15      1900
+//  2017-14.1   15.0    1910
+//  2017-14.11  15.3    1911
+//  2017-14.12  15.5    1912
+//  2017-14.13  15.6    1913
+//  2017-14.14  15.7    1914
+//  2017        15.8    1915
+//  2017        15.9    1916
+//  2019 RTW    16.0    1920
+
+/// \internal EIGEN_COMP_MSVC_LANG set to _MSVC_LANG if the compiler is Microsoft Visual C++, 0 otherwise.
+#if defined(_MSVC_LANG)
+#define EIGEN_COMP_MSVC_LANG _MSVC_LANG
+#else
+#define EIGEN_COMP_MSVC_LANG 0
+#endif
+
+// For the record, here is a table summarizing the possible values for EIGEN_COMP_MSVC_LANG:
+// MSVC option                          Standard  MSVC_LANG
+// /std:c++14 (default as of VS 2019)   C++14     201402L
+// /std:c++17                           C++17     201703L
+// /std:c++latest                       >C++17    >201703L
+
+/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC or
+/// clang-cl
+#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG)
+#define EIGEN_COMP_MSVC_STRICT _MSC_VER
+#else
+#define EIGEN_COMP_MSVC_STRICT 0
+#endif
+
+/// \internal EIGEN_COMP_IBM set to xlc version if the compiler is IBM XL C++
+// XLC   version
+// 3.1   0x0301
+// 4.5   0x0405
+// 5.0   0x0500
+// 12.1  0x0C01
+#if defined(__IBMCPP__) || defined(__xlc__) || defined(__ibmxl__)
+#define EIGEN_COMP_IBM __xlC__
+#else
+#define EIGEN_COMP_IBM 0
+#endif
+
+/// \internal EIGEN_COMP_PGI set to PGI version if the compiler is Portland Group Compiler
+#if defined(__PGI)
+#define EIGEN_COMP_PGI (__PGIC__ * 100 + __PGIC_MINOR__)
+#else
+#define EIGEN_COMP_PGI 0
+#endif
+
+/// \internal EIGEN_COMP_NVHPC set to NVHPC version if the compiler is nvc++
+#if defined(__NVCOMPILER)
+#define EIGEN_COMP_NVHPC (__NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__)
+#else
+#define EIGEN_COMP_NVHPC 0
+#endif
+
+/// \internal EIGEN_COMP_ARM set to 1 if the compiler is ARM Compiler
+#if defined(__CC_ARM) || defined(__ARMCC_VERSION)
+#define EIGEN_COMP_ARM 1
+#else
+#define EIGEN_COMP_ARM 0
+#endif
+
+/// \internal EIGEN_COMP_EMSCRIPTEN set to 1 if the compiler is Emscripten Compiler
+#if defined(__EMSCRIPTEN__)
+#define EIGEN_COMP_EMSCRIPTEN 1
+#else
+#define EIGEN_COMP_EMSCRIPTEN 0
+#endif
+
+/// \internal EIGEN_COMP_FCC set to FCC version if the compiler is Fujitsu Compiler (traditional mode)
+/// \note The Fujitsu C/C++ compiler uses the traditional mode based
+/// on EDG g++ 6.1 by default or if invoked with the -Nnoclang flag
+#if defined(__FUJITSU)
+#define EIGEN_COMP_FCC (__FCC_major__ * 100 + __FCC_minor__ * 10 + __FCC_patchlevel__)
+#else
+#define EIGEN_COMP_FCC 0
+#endif
+
+/// \internal EIGEN_COMP_CLANGFCC set to FCC version if the compiler is Fujitsu Compiler (Clang mode)
+/// \note The Fujitsu C/C++ compiler uses the non-traditional mode
+/// based on Clang 7.1.0 if invoked with the -Nclang flag
+#if defined(__CLANG_FUJITSU)
+#define EIGEN_COMP_CLANGFCC (__FCC_major__ * 100 + __FCC_minor__ * 10 + __FCC_patchlevel__)
+#else
+#define EIGEN_COMP_CLANGFCC 0
+#endif
+
+/// \internal EIGEN_COMP_CPE set to CPE version if the compiler is HPE Cray Compiler (GCC based)
+/// \note This is the SVE-enabled C/C++ compiler from the HPE Cray
+/// Programming Environment (CPE) based on Cray GCC 8.1
+#if defined(_CRAYC) && !defined(__clang__)
+#define EIGEN_COMP_CPE (_RELEASE_MAJOR * 100 + _RELEASE_MINOR * 10 + _RELEASE_PATCHLEVEL)
+#else
+#define EIGEN_COMP_CPE 0
+#endif
+
+/// \internal EIGEN_COMP_CLANGCPE set to CPE version if the compiler is HPE Cray Compiler (Clang based)
+/// \note This is the C/C++ compiler from the HPE Cray Programming
+/// Environment (CPE) based on Cray Clang 11.0 without SVE-support
+#if defined(_CRAYC) && defined(__clang__)
+#define EIGEN_COMP_CLANGCPE (_RELEASE_MAJOR * 100 + _RELEASE_MINOR * 10 + _RELEASE_PATCHLEVEL)
+#else
+#define EIGEN_COMP_CLANGCPE 0
+#endif
+
+/// \internal EIGEN_COMP_LCC set to 1 if the compiler is MCST-LCC (MCST eLbrus Compiler Collection)
+#if defined(__LCC__) && defined(__MCST__)
+#define EIGEN_COMP_LCC (__LCC__ * 100 + __LCC_MINOR__)
+#else
+#define EIGEN_COMP_LCC 0
+#endif
+
+/// \internal EIGEN_COMP_GNUC_STRICT set to 1 if the compiler is really GCC and not a compatible compiler (e.g., ICC,
+/// clang, mingw, etc.)
+#if EIGEN_COMP_GNUC &&                                                                                      \
+    !(EIGEN_COMP_CLANG || EIGEN_COMP_ICC || EIGEN_COMP_CLANGICC || EIGEN_COMP_MINGW || EIGEN_COMP_PGI ||    \
+      EIGEN_COMP_IBM || EIGEN_COMP_ARM || EIGEN_COMP_EMSCRIPTEN || EIGEN_COMP_FCC || EIGEN_COMP_CLANGFCC || \
+      EIGEN_COMP_CPE || EIGEN_COMP_CLANGCPE || EIGEN_COMP_LCC)
+#define EIGEN_COMP_GNUC_STRICT 1
+#else
+#define EIGEN_COMP_GNUC_STRICT 0
+#endif
+
+// GCC, and compilers that pretend to be it, have different version schemes, so this only makes sense to use with the
+// real GCC.
+#if EIGEN_COMP_GNUC_STRICT
+#define EIGEN_GNUC_STRICT_AT_LEAST(x, y, z)                   \
+  ((__GNUC__ > x) || (__GNUC__ == x && __GNUC_MINOR__ > y) || \
+   (__GNUC__ == x && __GNUC_MINOR__ == y && __GNUC_PATCHLEVEL__ >= z))
+#define EIGEN_GNUC_STRICT_LESS_THAN(x, y, z)                  \
+  ((__GNUC__ < x) || (__GNUC__ == x && __GNUC_MINOR__ < y) || \
+   (__GNUC__ == x && __GNUC_MINOR__ == y && __GNUC_PATCHLEVEL__ < z))
+#else
+#define EIGEN_GNUC_STRICT_AT_LEAST(x, y, z) 0
+#define EIGEN_GNUC_STRICT_LESS_THAN(x, y, z) 0
+#endif
+
+/// \internal EIGEN_COMP_CLANG_STRICT set to 1 if the compiler is really Clang and not a compatible compiler (e.g.,
+/// AppleClang, etc.)
+#if EIGEN_COMP_CLANG && !(EIGEN_COMP_CLANGAPPLE || EIGEN_COMP_CLANGICC || EIGEN_COMP_CLANGFCC || EIGEN_COMP_CLANGCPE)
+#define EIGEN_COMP_CLANG_STRICT 1
+#else
+#define EIGEN_COMP_CLANG_STRICT 0
+#endif
+
+// Clang, and compilers forked from it, have different version schemes, so this only makes sense to use with the real
+// Clang.
+#if EIGEN_COMP_CLANG_STRICT
+#define EIGEN_CLANG_STRICT_AT_LEAST(x, y, z)                                 \
+  ((__clang_major__ > x) || (__clang_major__ == x && __clang_minor__ > y) || \
+   (__clang_major__ == x && __clang_minor__ == y && __clang_patchlevel__ >= z))
+#define EIGEN_CLANG_STRICT_LESS_THAN(x, y, z)                                \
+  ((__clang_major__ < x) || (__clang_major__ == x && __clang_minor__ < y) || \
+   (__clang_major__ == x && __clang_minor__ == y && __clang_patchlevel__ < z))
+#else
+#define EIGEN_CLANG_STRICT_AT_LEAST(x, y, z) 0
+#define EIGEN_CLANG_STRICT_LESS_THAN(x, y, z) 0
+#endif
+
+//------------------------------------------------------------------------------------------
+// Architecture identification, EIGEN_ARCH_*
+//------------------------------------------------------------------------------------------
+
+#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)) || defined(__amd64)
+#define EIGEN_ARCH_x86_64 1
+#else
+#define EIGEN_ARCH_x86_64 0
+#endif
+
+#if defined(__i386__) || defined(_M_IX86) || defined(_X86_) || defined(__i386)
+#define EIGEN_ARCH_i386 1
+#else
+#define EIGEN_ARCH_i386 0
+#endif
+
+#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_i386
+#define EIGEN_ARCH_i386_OR_x86_64 1
+#else
+#define EIGEN_ARCH_i386_OR_x86_64 0
+#endif
+
+/// \internal EIGEN_ARCH_ARM set to 1 if the architecture is ARM
+#if defined(__arm__)
+#define EIGEN_ARCH_ARM 1
+#else
+#define EIGEN_ARCH_ARM 0
+#endif
+
+/// \internal EIGEN_ARCH_ARM64 set to 1 if the architecture is ARM64
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+#define EIGEN_ARCH_ARM64 1
+#else
+#define EIGEN_ARCH_ARM64 0
+#endif
+
+/// \internal EIGEN_ARCH_ARM_OR_ARM64 set to 1 if the architecture is ARM or ARM64
+#if EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64
+#define EIGEN_ARCH_ARM_OR_ARM64 1
+#else
+#define EIGEN_ARCH_ARM_OR_ARM64 0
+#endif
+
+/// \internal EIGEN_ARCH_ARMV8 set to 1 if the architecture is armv8 or greater.
+#if EIGEN_ARCH_ARM_OR_ARM64 && defined(__ARM_ARCH) && __ARM_ARCH >= 8
+#define EIGEN_ARCH_ARMV8 1
+#else
+#define EIGEN_ARCH_ARMV8 0
+#endif
+
+/// \internal EIGEN_HAS_ARM64_FP16 set to 1 if the architecture provides an IEEE
+/// compliant Arm fp16 type
+#if EIGEN_ARCH_ARM_OR_ARM64
+#ifndef EIGEN_HAS_ARM64_FP16
+#if defined(__ARM_FP16_FORMAT_IEEE)
+#define EIGEN_HAS_ARM64_FP16 1
+#else
+#define EIGEN_HAS_ARM64_FP16 0
+#endif
+#endif
+#endif
+
+/// \internal EIGEN_ARCH_MIPS set to 1 if the architecture is MIPS
+#if defined(__mips__) || defined(__mips)
+#define EIGEN_ARCH_MIPS 1
+#else
+#define EIGEN_ARCH_MIPS 0
+#endif
+
+/// \internal EIGEN_ARCH_LOONGARCH64 set to 1 if the architecture is LOONGARCH64
+#if defined(__loongarch64)
+#define EIGEN_ARCH_LOONGARCH64 1
+#else
+#define EIGEN_ARCH_LOONGARCH64 0
+#endif
+
+/// \internal EIGEN_ARCH_SPARC set to 1 if the architecture is SPARC
+#if defined(__sparc__) || defined(__sparc)
+#define EIGEN_ARCH_SPARC 1
+#else
+#define EIGEN_ARCH_SPARC 0
+#endif
+
+/// \internal EIGEN_ARCH_IA64 set to 1 if the architecture is Intel Itanium
+#if defined(__ia64__)
+#define EIGEN_ARCH_IA64 1
+#else
+#define EIGEN_ARCH_IA64 0
+#endif
+
+/// \internal EIGEN_ARCH_PPC set to 1 if the architecture is PowerPC
+#if defined(__powerpc__) || defined(__ppc__) || defined(_M_PPC) || defined(__POWERPC__)
+#define EIGEN_ARCH_PPC 1
+#else
+#define EIGEN_ARCH_PPC 0
+#endif
+
+//------------------------------------------------------------------------------------------
+// Operating system identification, EIGEN_OS_*
+//------------------------------------------------------------------------------------------
+
+/// \internal EIGEN_OS_UNIX set to 1 if the OS is a unix variant
+#if defined(__unix__) || defined(__unix)
+#define EIGEN_OS_UNIX 1
+#else
+#define EIGEN_OS_UNIX 0
+#endif
+
+/// \internal EIGEN_OS_LINUX set to 1 if the OS is based on Linux kernel
+#if defined(__linux__)
+#define EIGEN_OS_LINUX 1
+#else
+#define EIGEN_OS_LINUX 0
+#endif
+
+/// \internal EIGEN_OS_ANDROID set to 1 if the OS is Android
+// note: ANDROID is defined when using ndk_build, __ANDROID__ is defined when using a standalone toolchain.
+#if defined(__ANDROID__) || defined(ANDROID)
+#define EIGEN_OS_ANDROID 1
+
+// Since NDK r16, `__NDK_MAJOR__` and `__NDK_MINOR__` are defined in
+// <android/ndk-version.h>. For NDK < r16, users should define these macros,
+// e.g. `-D__NDK_MAJOR__=11 -D__NKD_MINOR__=0` for NDK r11.
+#if defined __has_include
+#if __has_include(<android/ndk-version.h>)
+#include <android/ndk-version.h>
+#endif
+#endif
+
+#else
+#define EIGEN_OS_ANDROID 0
+#endif
+
+/// \internal EIGEN_OS_GNULINUX set to 1 if the OS is GNU Linux and not Linux-based OS (e.g., not android)
+#if defined(__gnu_linux__) && !(EIGEN_OS_ANDROID)
+#define EIGEN_OS_GNULINUX 1
+#else
+#define EIGEN_OS_GNULINUX 0
+#endif
+
+/// \internal EIGEN_OS_BSD set to 1 if the OS is a BSD variant
+#if defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__bsdi__) || defined(__DragonFly__)
+#define EIGEN_OS_BSD 1
+#else
+#define EIGEN_OS_BSD 0
+#endif
+
+/// \internal EIGEN_OS_MAC set to 1 if the OS is MacOS
+#if defined(__APPLE__)
+#define EIGEN_OS_MAC 1
+#else
+#define EIGEN_OS_MAC 0
+#endif
+
+/// \internal EIGEN_OS_QNX set to 1 if the OS is QNX
+#if defined(__QNX__)
+#define EIGEN_OS_QNX 1
+#else
+#define EIGEN_OS_QNX 0
+#endif
+
+/// \internal EIGEN_OS_WIN set to 1 if the OS is Windows based
+#if defined(_WIN32)
+#define EIGEN_OS_WIN 1
+#else
+#define EIGEN_OS_WIN 0
+#endif
+
+/// \internal EIGEN_OS_WIN64 set to 1 if the OS is Windows 64bits
+#if defined(_WIN64)
+#define EIGEN_OS_WIN64 1
+#else
+#define EIGEN_OS_WIN64 0
+#endif
+
+/// \internal EIGEN_OS_WINCE set to 1 if the OS is Windows CE
+#if defined(_WIN32_WCE)
+#define EIGEN_OS_WINCE 1
+#else
+#define EIGEN_OS_WINCE 0
+#endif
+
+/// \internal EIGEN_OS_CYGWIN set to 1 if the OS is Windows/Cygwin
+#if defined(__CYGWIN__)
+#define EIGEN_OS_CYGWIN 1
+#else
+#define EIGEN_OS_CYGWIN 0
+#endif
+
+/// \internal EIGEN_OS_WIN_STRICT set to 1 if the OS is really Windows and not some variants
+#if EIGEN_OS_WIN && !(EIGEN_OS_WINCE || EIGEN_OS_CYGWIN)
+#define EIGEN_OS_WIN_STRICT 1
+#else
+#define EIGEN_OS_WIN_STRICT 0
+#endif
+
+/// \internal EIGEN_OS_SUN set to __SUNPRO_C if the OS is SUN
+// compiler  solaris   __SUNPRO_C
+// version   studio
+// 5.7       10        0x570
+// 5.8       11        0x580
+// 5.9       12        0x590
+// 5.10	     12.1      0x5100
+// 5.11	     12.2      0x5110
+// 5.12	     12.3      0x5120
+#if (defined(sun) || defined(__sun)) && !(defined(__SVR4) || defined(__svr4__))
+#define EIGEN_OS_SUN __SUNPRO_C
+#else
+#define EIGEN_OS_SUN 0
+#endif
+
+/// \internal EIGEN_OS_SOLARIS set to 1 if the OS is Solaris
+#if (defined(sun) || defined(__sun)) && (defined(__SVR4) || defined(__svr4__))
+#define EIGEN_OS_SOLARIS 1
+#else
+#define EIGEN_OS_SOLARIS 0
+#endif
+
+//------------------------------------------------------------------------------------------
+// Detect GPU compilers and architectures
+//------------------------------------------------------------------------------------------
+
+// NVCC is not supported as the target platform for HIPCC
+// Note that this also makes EIGEN_CUDACC and EIGEN_HIPCC mutually exclusive
+#if defined(__NVCC__) && defined(__HIPCC__)
+#error "NVCC as the target platform for HIPCC is currently not supported."
+#endif
+
+#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA) && !defined(__SYCL_DEVICE_ONLY__)
+// Means the compiler is either nvcc or clang with CUDA enabled
+#define EIGEN_CUDACC __CUDACC__
+#endif
+
+#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA) && !defined(__SYCL_DEVICE_ONLY__)
+// Means we are generating code for the device
+#define EIGEN_CUDA_ARCH __CUDA_ARCH__
+#endif
+
+#if defined(EIGEN_CUDACC)
+#include <cuda.h>
+#define EIGEN_CUDA_SDK_VER (CUDA_VERSION * 10)
+#else
+#define EIGEN_CUDA_SDK_VER 0
+#endif
+
+#if defined(__HIPCC__) && !defined(EIGEN_NO_HIP) && !defined(__SYCL_DEVICE_ONLY__)
+// Means the compiler is HIPCC (analogous to EIGEN_CUDACC, but for HIP)
+#define EIGEN_HIPCC __HIPCC__
+
+// We need to include hip_runtime.h here because it pulls in
+// ++ hip_common.h which contains the define for  __HIP_DEVICE_COMPILE__
+// ++ host_defines.h which contains the defines for the __host__ and __device__ macros
+#include <hip/hip_runtime.h>
+
+#if defined(__HIP_DEVICE_COMPILE__) && !defined(__SYCL_DEVICE_ONLY__)
+// analogous to EIGEN_CUDA_ARCH, but for HIP
+#define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
+#endif
+
+// For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute
+// value to 1024. The compiler assigns a default value of 256 when the attribute is not
+// specified. This results in failures on the HIP platform, for cases when a GPU kernel
+// without an explicit launch_bounds attribute is called with a threads_per_block value
+// greater than 256.
+//
+// This is a regression in functioanlity and is expected to be fixed within the next
+// couple of ROCm releases (compiler will go back to using 1024 value as the default)
+//
+// In the meantime, we will use a "only enabled for HIP" macro to set the launch_bounds
+// attribute.
+
+#define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
+
+#endif
+
+#if !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)
+#define EIGEN_HIP_LAUNCH_BOUNDS_1024
+#endif  // !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)
+
+// Unify CUDA/HIPCC
+
+#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
+//
+// If either EIGEN_CUDACC or EIGEN_HIPCC is defined, then define EIGEN_GPUCC
+//
+#define EIGEN_GPUCC
+//
+// EIGEN_HIPCC implies the HIP compiler and is used to tweak Eigen code for use in HIP kernels
+// EIGEN_CUDACC implies the CUDA compiler and is used to tweak Eigen code for use in CUDA kernels
+//
+// In most cases the same tweaks are required to the Eigen code to enable in both the HIP and CUDA kernels.
+// For those cases, the corresponding code should be guarded with
+//      #if defined(EIGEN_GPUCC)
+// instead of
+//      #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
+//
+// For cases where the tweak is specific to HIP, the code should be guarded with
+//      #if defined(EIGEN_HIPCC)
+//
+// For cases where the tweak is specific to CUDA, the code should be guarded with
+//      #if defined(EIGEN_CUDACC)
+//
+#endif
+
+#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// If either EIGEN_CUDA_ARCH or EIGEN_HIP_DEVICE_COMPILE is defined, then define EIGEN_GPU_COMPILE_PHASE
+//
+#define EIGEN_GPU_COMPILE_PHASE
+//
+// GPU compilers (HIPCC, NVCC) typically do two passes over the source code,
+//   + one to compile the source for the "host" (ie CPU)
+//   + another to compile the source for the "device" (ie. GPU)
+//
+// Code that needs to enabled only during the either the "host" or "device" compilation phase
+// needs to be guarded with a macro that indicates the current compilation phase
+//
+// EIGEN_HIP_DEVICE_COMPILE implies the device compilation phase in HIP
+// EIGEN_CUDA_ARCH implies the device compilation phase in CUDA
+//
+// In most cases, the "host" / "device" specific code is the same for both HIP and CUDA
+// For those cases, the code should be guarded with
+//       #if defined(EIGEN_GPU_COMPILE_PHASE)
+// instead of
+//       #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// For cases where the tweak is specific to HIP, the code should be guarded with
+//      #if defined(EIGEN_HIP_DEVICE_COMPILE)
+//
+// For cases where the tweak is specific to CUDA, the code should be guarded with
+//      #if defined(EIGEN_CUDA_ARCH)
+//
+#endif
+
+/// \internal EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC set to 1 if the architecture
+/// supports Neon vector intrinsics for fp16.
+#if EIGEN_ARCH_ARM_OR_ARM64
+#ifndef EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+// Clang only supports FP16 on aarch64, and not all intrinsics are available
+// on A32 anyways even in GCC (e.g. vdiv_f16, vsqrt_f16).
+#if EIGEN_ARCH_ARM64 && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE)
+#define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 1
+#else
+#define EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC 0
+#endif
+#endif
+#endif
+
+/// \internal EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC set to 1 if the architecture
+/// supports Neon scalar intrinsics for fp16.
+#if EIGEN_ARCH_ARM_OR_ARM64
+#ifndef EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC
+// Clang only supports FP16 on aarch64, and not all intrinsics are available
+// on A32 anyways, even in GCC (e.g. vceqh_f16).
+#if EIGEN_ARCH_ARM64 && defined(__ARM_FEATURE_FP16_SCALAR_ARITHMETIC) && !defined(EIGEN_GPU_COMPILE_PHASE)
+#define EIGEN_HAS_ARM64_FP16_SCALAR_ARITHMETIC 1
+#endif
+#endif
+#endif
+
+#if defined(EIGEN_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__)
+// EIGEN_USE_SYCL is a user-defined macro while __SYCL_DEVICE_ONLY__ is a compiler-defined macro.
+// In most cases we want to check if both macros are defined which can be done using the define below.
+#define SYCL_DEVICE_ONLY
+#endif
+
+//------------------------------------------------------------------------------------------
+// Detect Compiler/Architecture/OS specific features
+//------------------------------------------------------------------------------------------
+
+// Cross compiler wrapper around LLVM's __has_builtin
+#ifdef __has_builtin
+#define EIGEN_HAS_BUILTIN(x) __has_builtin(x)
+#else
+#define EIGEN_HAS_BUILTIN(x) 0
 #endif
 
 // A Clang feature extension to determine compiler features.
 // We use it to determine 'cxx_rvalue_references'
 #ifndef __has_feature
-# define __has_feature(x) 0
+#define __has_feature(x) 0
 #endif
 
-// Do we support r-value references?
-#if (__has_feature(cxx_rvalue_references) || \
-     defined(__GXX_EXPERIMENTAL_CXX0X__) || \
-     (defined(_MSC_VER) && _MSC_VER >= 1600))
-  #define EIGEN_HAVE_RVALUE_REFERENCES
+// The macro EIGEN_CPLUSPLUS is a replacement for __cplusplus/_MSVC_LANG that
+// works for both platforms, indicating the C++ standard version number.
+//
+// With MSVC, without defining /Zc:__cplusplus, the __cplusplus macro will
+// report 199711L regardless of the language standard specified via /std.
+// We need to rely on _MSVC_LANG instead, which is only available after
+// VS2015.3.
+#if EIGEN_COMP_MSVC_LANG > 0
+#define EIGEN_CPLUSPLUS EIGEN_COMP_MSVC_LANG
+#elif EIGEN_COMP_MSVC >= 1900
+#define EIGEN_CPLUSPLUS 201103L
+#elif defined(__cplusplus)
+#define EIGEN_CPLUSPLUS __cplusplus
+#else
+#define EIGEN_CPLUSPLUS 0
 #endif
 
+// The macro EIGEN_COMP_CXXVER defines the c++ version expected by the compiler.
+// For instance, if compiling with gcc and -std=c++17, then EIGEN_COMP_CXXVER
+// is defined to 17.
+#if EIGEN_CPLUSPLUS >= 202002L
+#define EIGEN_COMP_CXXVER 20
+#elif EIGEN_CPLUSPLUS >= 201703L
+#define EIGEN_COMP_CXXVER 17
+#elif EIGEN_CPLUSPLUS >= 201402L
+#define EIGEN_COMP_CXXVER 14
+#elif EIGEN_CPLUSPLUS >= 201103L
+#define EIGEN_COMP_CXXVER 11
+#else
+#define EIGEN_COMP_CXXVER 03
+#endif
 
-// Cross compiler wrapper around LLVM's __has_builtin
-#ifdef __has_builtin
-#  define EIGEN_HAS_BUILTIN(x) __has_builtin(x)
+// The macros EIGEN_HAS_CXX?? defines a rough estimate of available c++ features
+// but in practice we should not rely on them but rather on the availability of
+// individual features as defined later.
+// This is why there is no EIGEN_HAS_CXX17.
+#if EIGEN_MAX_CPP_VER < 14 || EIGEN_COMP_CXXVER < 14 || (EIGEN_COMP_MSVC && EIGEN_COMP_MSVC < 1900) || \
+    (EIGEN_COMP_ICC && EIGEN_COMP_ICC < 1500) || (EIGEN_COMP_NVCC && EIGEN_COMP_NVCC < 80000) ||       \
+    (EIGEN_COMP_CLANG_STRICT && EIGEN_COMP_CLANG < 390) ||                                             \
+    (EIGEN_COMP_CLANGAPPLE && EIGEN_COMP_CLANGAPPLE < 9000000) || (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC < 510)
+#error Eigen requires at least c++14 support.
+#endif
+
+// Does the compiler support C99?
+// Need to include <cmath> to make sure _GLIBCXX_USE_C99 gets defined
+#include <cmath>
+#ifndef EIGEN_HAS_C99_MATH
+#if ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) ||                                          \
+     (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) || \
+     (EIGEN_COMP_MSVC) || defined(SYCL_DEVICE_ONLY))
+#define EIGEN_HAS_C99_MATH 1
 #else
-#  define EIGEN_HAS_BUILTIN(x) 0
+#define EIGEN_HAS_C99_MATH 0
+#endif
 #endif
 
-/** Allows to disable some optimizations which might affect the accuracy of the result.
-  * Such optimization are enabled by default, and set EIGEN_FAST_MATH to 0 to disable them.
-  * They currently include:
-  *   - single precision Cwise::sin() and Cwise::cos() when SSE vectorization is enabled.
-  */
-#ifndef EIGEN_FAST_MATH
-#define EIGEN_FAST_MATH 1
+// Does the compiler support std::hash?
+#ifndef EIGEN_HAS_STD_HASH
+// The std::hash struct is defined in C++11 but is not labelled as a __device__
+// function and is not constexpr, so cannot be used on device.
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
+#define EIGEN_HAS_STD_HASH 1
+#else
+#define EIGEN_HAS_STD_HASH 0
+#endif
+#endif  // EIGEN_HAS_STD_HASH
+
+#ifndef EIGEN_HAS_STD_INVOKE_RESULT
+#if EIGEN_MAX_CPP_VER >= 17 && EIGEN_COMP_CXXVER >= 17
+#define EIGEN_HAS_STD_INVOKE_RESULT 1
+#else
+#define EIGEN_HAS_STD_INVOKE_RESULT 0
+#endif
+#endif
+
+#define EIGEN_CONSTEXPR constexpr
+
+// NOTE: the required Apple's clang version is very conservative
+//       and it could be that XCode 9 works just fine.
+// NOTE: the MSVC version is based on https://en.cppreference.com/w/cpp/compiler_support
+//       and not tested.
+// NOTE: Intel C++ Compiler Classic (icc) Version 19.0 and later supports dynamic allocation
+//       for over-aligned data, but not in a manner that is compatible with Eigen.
+//       See https://gitlab.com/libeigen/eigen/-/issues/2575
+#ifndef EIGEN_HAS_CXX17_OVERALIGN
+#if EIGEN_MAX_CPP_VER >= 17 && EIGEN_COMP_CXXVER >= 17 &&                                                            \
+    ((EIGEN_COMP_MSVC >= 1912) || (EIGEN_GNUC_STRICT_AT_LEAST(7, 0, 0)) || (EIGEN_CLANG_STRICT_AT_LEAST(5, 0, 0)) || \
+     (EIGEN_COMP_CLANGAPPLE && EIGEN_COMP_CLANGAPPLE >= 10000000)) &&                                                \
+    !EIGEN_COMP_ICC
+#define EIGEN_HAS_CXX17_OVERALIGN 1
+#else
+#define EIGEN_HAS_CXX17_OVERALIGN 0
+#endif
 #endif
 
+#if defined(EIGEN_CUDACC)
+// While available already with c++11, this is useful mostly starting with c++14 and relaxed constexpr rules
+#if defined(__NVCC__)
+// nvcc considers constexpr functions as __host__ __device__ with the option --expt-relaxed-constexpr
+#ifdef __CUDACC_RELAXED_CONSTEXPR__
+#define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+#endif
+#elif defined(__clang__) && defined(__CUDA__) && __has_feature(cxx_relaxed_constexpr)
+// clang++ always considers constexpr functions as implicitly __host__ __device__
+#define EIGEN_CONSTEXPR_ARE_DEVICE_FUNC
+#endif
+#endif
+
+// Does the compiler support the __int128 and __uint128_t extensions for 128-bit
+// integer arithmetic?
+//
+// Clang and GCC define __SIZEOF_INT128__ when these extensions are supported,
+// but we avoid using them in certain cases:
+//
+// * Building using Clang for Windows, where the Clang runtime library has
+//   128-bit support only on LP64 architectures, but Windows is LLP64.
+#ifndef EIGEN_HAS_BUILTIN_INT128
+#if defined(__SIZEOF_INT128__) && !(EIGEN_OS_WIN && EIGEN_COMP_CLANG)
+#define EIGEN_HAS_BUILTIN_INT128 1
+#else
+#define EIGEN_HAS_BUILTIN_INT128 0
+#endif
+#endif
+
+//------------------------------------------------------------------------------------------
+// Preprocessor programming helpers
+//------------------------------------------------------------------------------------------
+
+// This macro can be used to prevent from macro expansion, e.g.:
+//   std::max EIGEN_NOT_A_MACRO(a,b)
+#define EIGEN_NOT_A_MACRO
+
 #define EIGEN_DEBUG_VAR(x) std::cerr << #x << " = " << x << std::endl;
 
 // concatenate two tokens
-#define EIGEN_CAT2(a,b) a ## b
-#define EIGEN_CAT(a,b) EIGEN_CAT2(a,b)
+#define EIGEN_CAT2(a, b) a##b
+#define EIGEN_CAT(a, b) EIGEN_CAT2(a, b)
+
+#define EIGEN_COMMA ,
 
 // convert a token to a string
 #define EIGEN_MAKESTRING2(a) #a
@@ -139,88 +853,82 @@
 // EIGEN_STRONG_INLINE is a stronger version of the inline, using __forceinline on MSVC,
 // but it still doesn't use GCC's always_inline. This is useful in (common) situations where MSVC needs forceinline
 // but GCC is still doing fine with just inline.
-#if (defined _MSC_VER) || (defined __INTEL_COMPILER)
+#ifndef EIGEN_STRONG_INLINE
+#if (EIGEN_COMP_MSVC || EIGEN_COMP_ICC) && !defined(EIGEN_GPUCC)
 #define EIGEN_STRONG_INLINE __forceinline
 #else
 #define EIGEN_STRONG_INLINE inline
 #endif
+#endif
 
-// EIGEN_ALWAYS_INLINE is the stronget, it has the effect of making the function inline and adding every possible
+// EIGEN_ALWAYS_INLINE is the strongest, it has the effect of making the function inline and adding every possible
 // attribute to maximize inlining. This should only be used when really necessary: in particular,
 // it uses __attribute__((always_inline)) on GCC, which most of the time is useless and can severely harm compile times.
 // FIXME with the always_inline attribute,
-// gcc 3.4.x reports the following compilation error:
-//   Eval.h:91: sorry, unimplemented: inlining failed in call to 'const Eigen::Eval<Derived> Eigen::MatrixBase<Scalar, Derived>::eval() const'
-//    : function body not available
-#if EIGEN_GNUC_AT_LEAST(4,0)
+#if EIGEN_COMP_GNUC && !defined(SYCL_DEVICE_ONLY)
 #define EIGEN_ALWAYS_INLINE __attribute__((always_inline)) inline
 #else
 #define EIGEN_ALWAYS_INLINE EIGEN_STRONG_INLINE
 #endif
 
-#if (defined __GNUC__)
+#if EIGEN_COMP_GNUC
 #define EIGEN_DONT_INLINE __attribute__((noinline))
-#elif (defined _MSC_VER)
+#elif EIGEN_COMP_MSVC
 #define EIGEN_DONT_INLINE __declspec(noinline)
 #else
 #define EIGEN_DONT_INLINE
 #endif
 
-#if (defined __GNUC__)
+#if EIGEN_COMP_GNUC
 #define EIGEN_PERMISSIVE_EXPR __extension__
 #else
 #define EIGEN_PERMISSIVE_EXPR
 #endif
 
+// GPU stuff
+
+// Disable some features when compiling with GPU compilers (SYCL/HIPCC)
+#if defined(SYCL_DEVICE_ONLY) || defined(EIGEN_HIP_DEVICE_COMPILE)
+// Do not try asserts on device code
+#ifndef EIGEN_NO_DEBUG
+#define EIGEN_NO_DEBUG
+#endif
+
+#ifdef EIGEN_INTERNAL_DEBUGGING
+#undef EIGEN_INTERNAL_DEBUGGING
+#endif
+#endif
+
+// No exceptions on device.
+#if defined(SYCL_DEVICE_ONLY) || defined(EIGEN_GPU_COMPILE_PHASE)
+#ifdef EIGEN_EXCEPTIONS
+#undef EIGEN_EXCEPTIONS
+#endif
+#endif
+
+#if defined(SYCL_DEVICE_ONLY)
+#ifndef EIGEN_DONT_VECTORIZE
+#define EIGEN_DONT_VECTORIZE
+#endif
+#define EIGEN_DEVICE_FUNC __attribute__((flatten)) __attribute__((always_inline))
+// All functions callable from CUDA/HIP code must be qualified with __device__
+#elif defined(EIGEN_GPUCC)
+#define EIGEN_DEVICE_FUNC __host__ __device__
+#else
+#define EIGEN_DEVICE_FUNC
+#endif
+
 // this macro allows to get rid of linking errors about multiply defined functions.
 //  - static is not very good because it prevents definitions from different object files to be merged.
 //           So static causes the resulting linked executable to be bloated with multiple copies of the same function.
 //  - inline is not perfect either as it unwantedly hints the compiler toward inlining the function.
-#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
-#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS inline
+#define EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC
+#define EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_DEVICE_FUNC inline
 
 #ifdef NDEBUG
-# ifndef EIGEN_NO_DEBUG
-#  define EIGEN_NO_DEBUG
-# endif
-#endif
-
-// eigen_plain_assert is where we implement the workaround for the assert() bug in GCC <= 4.3, see bug 89
-#ifdef EIGEN_NO_DEBUG
-  #define eigen_plain_assert(x)
-#else
-  #if EIGEN_SAFE_TO_USE_STANDARD_ASSERT_MACRO
-    namespace Eigen {
-    namespace internal {
-    inline bool copy_bool(bool b) { return b; }
-    }
-    }
-    #define eigen_plain_assert(x) assert(x)
-  #else
-    // work around bug 89
-    #include <cstdlib>   // for abort
-    #include <iostream>  // for std::cerr
-
-    namespace Eigen {
-    namespace internal {
-    // trivial function copying a bool. Must be EIGEN_DONT_INLINE, so we implement it after including Eigen headers.
-    // see bug 89.
-    namespace {
-    EIGEN_DONT_INLINE bool copy_bool(bool b) { return b; }
-    }
-    inline void assert_fail(const char *condition, const char *function, const char *file, int line)
-    {
-      std::cerr << "assertion failed: " << condition << " in function " << function << " at " << file << ":" << line << std::endl;
-      abort();
-    }
-    }
-    }
-    #define eigen_plain_assert(x) \
-      do { \
-        if(!Eigen::internal::copy_bool(x)) \
-          Eigen::internal::assert_fail(EIGEN_MAKESTRING(x), __PRETTY_FUNCTION__, __FILE__, __LINE__); \
-      } while(false)
-  #endif
+#ifndef EIGEN_NO_DEBUG
+#define EIGEN_NO_DEBUG
+#endif
 #endif
 
 // eigen_assert can be overridden
@@ -231,88 +939,171 @@
 #ifdef EIGEN_INTERNAL_DEBUGGING
 #define eigen_internal_assert(x) eigen_assert(x)
 #else
-#define eigen_internal_assert(x)
+#define eigen_internal_assert(x) ((void)0)
 #endif
 
-#ifdef EIGEN_NO_DEBUG
-#define EIGEN_ONLY_USED_FOR_DEBUG(x) (void)x
+#if defined(EIGEN_NO_DEBUG) || (defined(EIGEN_GPU_COMPILE_PHASE) && defined(EIGEN_NO_DEBUG_GPU))
+#define EIGEN_ONLY_USED_FOR_DEBUG(x) EIGEN_UNUSED_VARIABLE(x)
 #else
 #define EIGEN_ONLY_USED_FOR_DEBUG(x)
 #endif
 
 #ifndef EIGEN_NO_DEPRECATED_WARNING
-  #if (defined __GNUC__)
-    #define EIGEN_DEPRECATED __attribute__((deprecated))
-  #elif (defined _MSC_VER)
-    #define EIGEN_DEPRECATED __declspec(deprecated)
-  #else
-    #define EIGEN_DEPRECATED
-  #endif
+#if EIGEN_COMP_GNUC
+#define EIGEN_DEPRECATED __attribute__((deprecated))
+#elif EIGEN_COMP_MSVC
+#define EIGEN_DEPRECATED __declspec(deprecated)
 #else
-  #define EIGEN_DEPRECATED
+#define EIGEN_DEPRECATED
+#endif
+#else
+#define EIGEN_DEPRECATED
 #endif
 
-#if (defined __GNUC__)
+#ifndef EIGEN_NO_DEPRECATED_WARNING
+#if EIGEN_COMP_GNUC
+#define EIGEN_DEPRECATED_WITH_REASON(message) __attribute__((deprecated(message)))
+#elif EIGEN_COMP_MSVC
+#define EIGEN_DEPRECATED_WITH_REASON(message) __declspec(deprecated(message))
+#else
+#define EIGEN_DEPRECATED_WITH_REASON(message)
+#endif
+#else
+#define EIGEN_DEPRECATED_WITH_REASON(message)
+#endif
+
+#if EIGEN_COMP_GNUC
 #define EIGEN_UNUSED __attribute__((unused))
 #else
 #define EIGEN_UNUSED
 #endif
 
+#if EIGEN_COMP_GNUC
+#define EIGEN_PRAGMA(tokens) _Pragma(#tokens)
+#define EIGEN_DIAGNOSTICS(tokens) EIGEN_PRAGMA(GCC diagnostic tokens)
+#define EIGEN_DIAGNOSTICS_OFF(msc, gcc) EIGEN_DIAGNOSTICS(gcc)
+#elif EIGEN_COMP_MSVC
+#define EIGEN_PRAGMA(tokens) __pragma(tokens)
+#define EIGEN_DIAGNOSTICS(tokens) EIGEN_PRAGMA(warning(tokens))
+#define EIGEN_DIAGNOSTICS_OFF(msc, gcc) EIGEN_DIAGNOSTICS(msc)
+#else
+#define EIGEN_PRAGMA(tokens)
+#define EIGEN_DIAGNOSTICS(tokens)
+#define EIGEN_DIAGNOSTICS_OFF(msc, gcc)
+#endif
+
+#define EIGEN_DISABLE_DEPRECATED_WARNING EIGEN_DIAGNOSTICS_OFF(disable : 4996, ignored "-Wdeprecated-declarations")
+
 // Suppresses 'unused variable' warnings.
 namespace Eigen {
-  namespace internal {
-    template<typename T> void ignore_unused_variable(const T&) {}
-  }
-}
+namespace internal {
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr void ignore_unused_variable(const T&) {}
+}  // namespace internal
+}  // namespace Eigen
 #define EIGEN_UNUSED_VARIABLE(var) Eigen::internal::ignore_unused_variable(var);
 
 #if !defined(EIGEN_ASM_COMMENT)
-  #if (defined __GNUC__) && ( defined(__i386__) || defined(__x86_64__) )
-    #define EIGEN_ASM_COMMENT(X)  __asm__("#" X)
-  #else
-    #define EIGEN_ASM_COMMENT(X)
-  #endif
+#if EIGEN_COMP_GNUC && (EIGEN_ARCH_i386_OR_x86_64 || EIGEN_ARCH_ARM_OR_ARM64)
+#define EIGEN_ASM_COMMENT(X) __asm__("#" X)
+#else
+#define EIGEN_ASM_COMMENT(X)
+#endif
 #endif
 
-/* EIGEN_ALIGN_TO_BOUNDARY(n) forces data to be n-byte aligned. This is used to satisfy SIMD requirements.
- * However, we do that EVEN if vectorization (EIGEN_VECTORIZE) is disabled,
- * so that vectorization doesn't affect binary compatibility.
- *
- * If we made alignment depend on whether or not EIGEN_VECTORIZE is defined, it would be impossible to link
- * vectorized and non-vectorized code.
- */
-#if (defined __GNUC__) || (defined __PGI) || (defined __IBMCPP__) || (defined __ARMCC_VERSION)
-  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
-#elif (defined _MSC_VER)
-  #define EIGEN_ALIGN_TO_BOUNDARY(n) __declspec(align(n))
-#elif (defined __SUNPRO_CC)
-  // FIXME not sure about this one:
-  #define EIGEN_ALIGN_TO_BOUNDARY(n) __attribute__((aligned(n)))
-#else
-  #error Please tell me what is the equivalent of __attribute__((aligned(n))) for your compiler
+// Acts as a barrier preventing operations involving `X` from crossing. This
+// occurs, for example, in the fast rounding trick where a magic constant is
+// added then subtracted, which is otherwise compiled away with -ffast-math.
+//
+// See bug 1674
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+#define EIGEN_OPTIMIZATION_BARRIER(X)
 #endif
 
-#define EIGEN_ALIGN8  EIGEN_ALIGN_TO_BOUNDARY(8)
-#define EIGEN_ALIGN16 EIGEN_ALIGN_TO_BOUNDARY(16)
+#if !defined(EIGEN_OPTIMIZATION_BARRIER)
+// Implement the barrier on GNUC compilers or clang-cl.
+#if EIGEN_COMP_GNUC || (defined(__clang__) && defined(_MSC_VER))
+// According to https://gcc.gnu.org/onlinedocs/gcc/Constraints.html:
+//   X: Any operand whatsoever.
+//   r: A register operand is allowed provided that it is in a general
+//      register.
+//   g: Any register, memory or immediate integer operand is allowed, except
+//      for registers that are not general registers.
+//   w: (AArch32/AArch64) Floating point register, Advanced SIMD vector
+//      register or SVE vector register.
+//   x: (SSE) Any SSE register.
+//      (AArch64) Like w, but restricted to registers 0 to 15 inclusive.
+//   v: (PowerPC) An Altivec vector register.
+//   wa:(PowerPC) A VSX register.
+//
+// "X" (uppercase) should work for all cases, though this seems to fail for
+// some versions of GCC for arm/aarch64 with
+//   "error: inconsistent operand constraints in an 'asm'"
+// Clang x86_64/arm/aarch64 seems to require "g" to support both scalars and
+// vectors, otherwise
+//   "error: non-trivial scalar-to-vector conversion, possible invalid
+//    constraint for vector type"
+//
+// GCC for ppc64le generates an internal compiler error with x/X/g.
+// GCC for AVX generates an internal compiler error with X.
+//
+// Tested on icc/gcc/clang for sse, avx, avx2, avx512dq
+//           gcc for arm, aarch64,
+//           gcc for ppc64le,
+// both vectors and scalars.
+//
+// Note that this is restricted to plain types - this will not work
+// directly for std::complex<T>, Eigen::half, Eigen::bfloat16. For these,
+// you will need to apply to the underlying POD type.
+#if EIGEN_ARCH_PPC && EIGEN_COMP_GNUC_STRICT
+// This seems to be broken on clang. Packet4f is loaded into a single
+//   register rather than a vector, zeroing out some entries. Integer
+//   types also generate a compile error.
+#if EIGEN_OS_MAC
+// General, Altivec for Apple (VSX were added in ISA v2.06):
+#define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+r,v"(X));
+#else
+// General, Altivec, VSX otherwise:
+#define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+r,v,wa"(X));
+#endif
+#elif EIGEN_ARCH_ARM_OR_ARM64
+#ifdef __ARM_FP
+// General, VFP or NEON.
+// Clang doesn't like "r",
+//    error: non-trivial scalar-to-vector conversion, possible invalid
+//           constraint for vector typ
+#define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+g,w"(X));
+#else
+// Arm without VFP or NEON.
+// "w" constraint will not compile.
+#define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+g"(X));
+#endif
+#elif EIGEN_ARCH_i386_OR_x86_64
+// General, SSE.
+#define EIGEN_OPTIMIZATION_BARRIER(X) __asm__("" : "+g,x"(X));
+#else
+// Not implemented for other architectures.
+#define EIGEN_OPTIMIZATION_BARRIER(X)
+#endif
+#else
+// Not implemented for other compilers.
+#define EIGEN_OPTIMIZATION_BARRIER(X)
+#endif
+#endif
 
-#if EIGEN_ALIGN_STATICALLY
-#define EIGEN_USER_ALIGN_TO_BOUNDARY(n) EIGEN_ALIGN_TO_BOUNDARY(n)
-#define EIGEN_USER_ALIGN16 EIGEN_ALIGN16
+#if EIGEN_COMP_MSVC
+// NOTE MSVC often gives C4127 warnings with compiletime if statements. See bug 1362.
+// This workaround is ugly, but it does the job.
+#define EIGEN_CONST_CONDITIONAL(cond) (void)0, cond
 #else
-#define EIGEN_USER_ALIGN_TO_BOUNDARY(n)
-#define EIGEN_USER_ALIGN16
+#define EIGEN_CONST_CONDITIONAL(cond) cond
 #endif
 
 #ifdef EIGEN_DONT_USE_RESTRICT_KEYWORD
-  #define EIGEN_RESTRICT
+#define EIGEN_RESTRICT
 #endif
 #ifndef EIGEN_RESTRICT
-  #define EIGEN_RESTRICT __restrict
-#endif
-
-#ifndef EIGEN_STACK_ALLOCATION_LIMIT
-// 131072 == 128 KB
-#define EIGEN_STACK_ALLOCATION_LIMIT 131072
+#define EIGEN_RESTRICT __restrict
 #endif
 
 #ifndef EIGEN_DEFAULT_IO_FORMAT
@@ -328,124 +1119,228 @@ namespace Eigen {
 // just an empty macro !
 #define EIGEN_EMPTY
 
-#if defined(_MSC_VER) && (_MSC_VER < 1900) && (!defined(__INTEL_COMPILER))
-#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
-  using Base::operator =;
-#elif defined(__clang__) // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
-#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
-  using Base::operator =; \
-  EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { Base::operator=(other); return *this; } \
-  template <typename OtherDerived> \
-  EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other) { Base::operator=(other.derived()); return *this; }
-#else
-#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \
-  using Base::operator =; \
-  EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) \
-  { \
-    Base::operator=(other); \
-    return *this; \
+// When compiling CUDA/HIP device code with NVCC or HIPCC
+// pull in math functions from the global namespace.
+// In host mode, and when device code is compiled with clang,
+// use the std versions.
+#if (defined(EIGEN_CUDA_ARCH) && defined(__NVCC__)) || defined(EIGEN_HIP_DEVICE_COMPILE)
+#define EIGEN_USING_STD(FUNC) using ::FUNC;
+#else
+#define EIGEN_USING_STD(FUNC) using std::FUNC;
+#endif
+
+#if EIGEN_COMP_CLANG  // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
+#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)                                           \
+  using Base::operator=;                                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) {                 \
+    Base::operator=(other);                                                                        \
+    return *this;                                                                                  \
+  }                                                                                                \
+  template <typename OtherDerived>                                                                 \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const DenseBase<OtherDerived>& other) { \
+    Base::operator=(other.derived());                                                              \
+    return *this;                                                                                  \
+  }
+#else
+#define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)                           \
+  using Base::operator=;                                                           \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) { \
+    Base::operator=(other);                                                        \
+    return *this;                                                                  \
   }
 #endif
 
+/**
+ * \internal
+ * \brief Macro to explicitly define the default copy constructor.
+ * This is necessary, because the implicit definition is deprecated if the copy-assignment is overridden.
+ */
+#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) EIGEN_DEVICE_FUNC CLASS(const CLASS&) = default;
+
 /** \internal
  * \brief Macro to manually inherit assignment operators.
- * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined.
+ * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is
+ * defined. With C++11 or later this also default-implements the copy-constructor
  */
-#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)
+#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
+  EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)  \
+  EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived)
 
-/**
-* Just a side note. Commenting within defines works only by documenting
-* behind the object (via '!<'). Comments cannot be multi-line and thus
-* we have these extra long lines. What is confusing doxygen over here is
-* that we use '\' and basically have a bunch of typedefs with their
-* documentation in a single line.
-**/
-
-#define EIGEN_GENERIC_PUBLIC_INTERFACE(Derived) \
-  typedef typename Eigen::internal::traits<Derived>::Scalar Scalar; /*!< \brief Numeric type, e.g. float, double, int or std::complex<float>. */ \
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; /*!< \brief The underlying numeric type for composed scalar types. \details In cases where Scalar is e.g. std::complex<T>, T were corresponding to RealScalar. */ \
-  typedef typename Base::CoeffReturnType CoeffReturnType; /*!< \brief The return type for coefficient access. \details Depending on whether the object allows direct coefficient access (e.g. for a MatrixXd), this type is either 'const Scalar&' or simply 'Scalar' for objects that do not allow direct coefficient access. */ \
-  typedef typename Eigen::internal::nested<Derived>::type Nested; \
-  typedef typename Eigen::internal::traits<Derived>::StorageKind StorageKind; \
-  typedef typename Eigen::internal::traits<Derived>::Index Index; \
-  enum { RowsAtCompileTime = Eigen::internal::traits<Derived>::RowsAtCompileTime, \
-        ColsAtCompileTime = Eigen::internal::traits<Derived>::ColsAtCompileTime, \
-        Flags = Eigen::internal::traits<Derived>::Flags, \
-        CoeffReadCost = Eigen::internal::traits<Derived>::CoeffReadCost, \
-        SizeAtCompileTime = Base::SizeAtCompileTime, \
-        MaxSizeAtCompileTime = Base::MaxSizeAtCompileTime, \
-        IsVectorAtCompileTime = Base::IsVectorAtCompileTime };
+/** \internal
+ * \brief Macro to manually define default constructors and destructors.
+ * This is necessary when the copy constructor is re-defined.
+ * For empty helper classes this should usually be protected, to avoid accidentally creating empty objects.
+ *
+ * Hiding the default destructor lead to problems in C++03 mode together with boost::multiprecision
+ */
+#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \
+  EIGEN_DEVICE_FUNC Derived() = default;                        \
+  EIGEN_DEVICE_FUNC ~Derived() = default;
 
+/**
+ * Just a side note. Commenting within defines works only by documenting
+ * behind the object (via '!<'). Comments cannot be multi-line and thus
+ * we have these extra long lines. What is confusing doxygen over here is
+ * that we use '\' and basically have a bunch of typedefs with their
+ * documentation in a single line.
+ **/
 
-#define EIGEN_DENSE_PUBLIC_INTERFACE(Derived) \
-  typedef typename Eigen::internal::traits<Derived>::Scalar Scalar; /*!< \brief Numeric type, e.g. float, double, int or std::complex<float>. */ \
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; /*!< \brief The underlying numeric type for composed scalar types. \details In cases where Scalar is e.g. std::complex<T>, T were corresponding to RealScalar. */ \
-  typedef typename Base::PacketScalar PacketScalar; \
-  typedef typename Base::CoeffReturnType CoeffReturnType; /*!< \brief The return type for coefficient access. \details Depending on whether the object allows direct coefficient access (e.g. for a MatrixXd), this type is either 'const Scalar&' or simply 'Scalar' for objects that do not allow direct coefficient access. */ \
-  typedef typename Eigen::internal::nested<Derived>::type Nested; \
-  typedef typename Eigen::internal::traits<Derived>::StorageKind StorageKind; \
-  typedef typename Eigen::internal::traits<Derived>::Index Index; \
-  enum { RowsAtCompileTime = Eigen::internal::traits<Derived>::RowsAtCompileTime, \
-        ColsAtCompileTime = Eigen::internal::traits<Derived>::ColsAtCompileTime, \
-        MaxRowsAtCompileTime = Eigen::internal::traits<Derived>::MaxRowsAtCompileTime, \
-        MaxColsAtCompileTime = Eigen::internal::traits<Derived>::MaxColsAtCompileTime, \
-        Flags = Eigen::internal::traits<Derived>::Flags, \
-        CoeffReadCost = Eigen::internal::traits<Derived>::CoeffReadCost, \
-        SizeAtCompileTime = Base::SizeAtCompileTime, \
-        MaxSizeAtCompileTime = Base::MaxSizeAtCompileTime, \
-        IsVectorAtCompileTime = Base::IsVectorAtCompileTime }; \
-  using Base::derived; \
+#define EIGEN_GENERIC_PUBLIC_INTERFACE(Derived)                                                                        \
+  typedef typename Eigen::internal::traits<Derived>::Scalar                                                            \
+      Scalar; /*!< \brief Numeric type, e.g. float, double, int or std::complex<float>. */                             \
+  typedef typename Eigen::NumTraits<Scalar>::Real                                                                      \
+      RealScalar; /*!< \brief The underlying numeric type for composed scalar types. \details In cases where Scalar is \
+                     e.g. std::complex<T>, T were corresponding to RealScalar. */                                      \
+  typedef typename Base::CoeffReturnType                                                                               \
+      CoeffReturnType; /*!< \brief The return type for coefficient access. \details Depending on whether the object    \
+                          allows direct coefficient access (e.g. for a MatrixXd), this type is either 'const Scalar&'  \
+                          or simply 'Scalar' for objects that do not allow direct coefficient access. */               \
+  typedef typename Eigen::internal::ref_selector<Derived>::type Nested;                                                \
+  typedef typename Eigen::internal::traits<Derived>::StorageKind StorageKind;                                          \
+  typedef typename Eigen::internal::traits<Derived>::StorageIndex StorageIndex;                                        \
+  enum CompileTimeTraits {                                                                                             \
+    RowsAtCompileTime = Eigen::internal::traits<Derived>::RowsAtCompileTime,                                           \
+    ColsAtCompileTime = Eigen::internal::traits<Derived>::ColsAtCompileTime,                                           \
+    Flags = Eigen::internal::traits<Derived>::Flags,                                                                   \
+    SizeAtCompileTime = Base::SizeAtCompileTime,                                                                       \
+    MaxSizeAtCompileTime = Base::MaxSizeAtCompileTime,                                                                 \
+    IsVectorAtCompileTime = Base::IsVectorAtCompileTime                                                                \
+  };                                                                                                                   \
+  using Base::derived;                                                                                                 \
   using Base::const_cast_derived;
 
+// FIXME Maybe the EIGEN_DENSE_PUBLIC_INTERFACE could be removed as importing PacketScalar is rarely needed
+#define EIGEN_DENSE_PUBLIC_INTERFACE(Derived) \
+  EIGEN_GENERIC_PUBLIC_INTERFACE(Derived)     \
+  typedef typename Base::PacketScalar PacketScalar;
 
-#define EIGEN_PLAIN_ENUM_MIN(a,b) (((int)a <= (int)b) ? (int)a : (int)b)
-#define EIGEN_PLAIN_ENUM_MAX(a,b) (((int)a >= (int)b) ? (int)a : (int)b)
+#if EIGEN_HAS_BUILTIN(__builtin_expect) || EIGEN_COMP_GNUC
+#define EIGEN_PREDICT_FALSE(x) (__builtin_expect(x, false))
+#define EIGEN_PREDICT_TRUE(x) (__builtin_expect(false || (x), true))
+#else
+#define EIGEN_PREDICT_FALSE(x) (x)
+#define EIGEN_PREDICT_TRUE(x) (x)
+#endif
 
-// EIGEN_SIZE_MIN_PREFER_DYNAMIC gives the min between compile-time sizes. 0 has absolute priority, followed by 1,
-// followed by Dynamic, followed by other finite values. The reason for giving Dynamic the priority over
-// finite values is that min(3, Dynamic) should be Dynamic, since that could be anything between 0 and 3.
-#define EIGEN_SIZE_MIN_PREFER_DYNAMIC(a,b) (((int)a == 0 || (int)b == 0) ? 0 \
-                           : ((int)a == 1 || (int)b == 1) ? 1 \
-                           : ((int)a == Dynamic || (int)b == Dynamic) ? Dynamic \
-                           : ((int)a <= (int)b) ? (int)a : (int)b)
+// the expression type of a standard coefficient wise binary operation
+#define EIGEN_CWISE_BINARY_RETURN_TYPE(LHS, RHS, OPNAME)                                                       \
+  CwiseBinaryOp<EIGEN_CAT(EIGEN_CAT(internal::scalar_, OPNAME), _op) < typename internal::traits<LHS>::Scalar, \
+                typename internal::traits<RHS>::Scalar>,                                                       \
+      const LHS, const RHS >
 
-// EIGEN_SIZE_MIN_PREFER_FIXED is a variant of EIGEN_SIZE_MIN_PREFER_DYNAMIC comparing MaxSizes. The difference is that finite values
-// now have priority over Dynamic, so that min(3, Dynamic) gives 3. Indeed, whatever the actual value is
-// (between 0 and 3), it is not more than 3.
-#define EIGEN_SIZE_MIN_PREFER_FIXED(a,b)  (((int)a == 0 || (int)b == 0) ? 0 \
-                           : ((int)a == 1 || (int)b == 1) ? 1 \
-                           : ((int)a == Dynamic && (int)b == Dynamic) ? Dynamic \
-                           : ((int)a == Dynamic) ? (int)b \
-                           : ((int)b == Dynamic) ? (int)a \
-                           : ((int)a <= (int)b) ? (int)a : (int)b)
+#define EIGEN_MAKE_CWISE_BINARY_OP(METHOD, OPNAME)                                                                \
+  template <typename OtherDerived>                                                                                \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(                                     \
+      Derived, OtherDerived, OPNAME)(METHOD)(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const { \
+    return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived, OtherDerived, OPNAME)(derived(), other.derived());             \
+  }
 
-// see EIGEN_SIZE_MIN_PREFER_DYNAMIC. No need for a separate variant for MaxSizes here.
-#define EIGEN_SIZE_MAX(a,b) (((int)a == Dynamic || (int)b == Dynamic) ? Dynamic \
-                           : ((int)a >= (int)b) ? (int)a : (int)b)
+#define EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME, TYPEA, TYPEB)     \
+  (Eigen::internal::has_ReturnType<Eigen::ScalarBinaryOpTraits< \
+       TYPEA, TYPEB, EIGEN_CAT(EIGEN_CAT(Eigen::internal::scalar_, OPNAME), _op) < TYPEA, TYPEB> > > ::value)
 
-#define EIGEN_ADD_COST(a,b) int(a)==Dynamic || int(b)==Dynamic ? Dynamic : int(a)+int(b)
+#define EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(EXPR, SCALAR, OPNAME)                                            \
+  CwiseBinaryOp<EIGEN_CAT(EIGEN_CAT(internal::scalar_, OPNAME), _op) < typename internal::traits<EXPR>::Scalar, \
+                SCALAR>,                                                                                        \
+      const EXPR, const typename internal::plain_constant_type<EXPR, SCALAR>::type >
 
-#define EIGEN_LOGICAL_XOR(a,b) (((a) || (b)) && !((a) && (b)))
+#define EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(SCALAR, EXPR, OPNAME)           \
+  CwiseBinaryOp<EIGEN_CAT(EIGEN_CAT(internal::scalar_, OPNAME), _op) < SCALAR, \
+                typename internal::traits<EXPR>::Scalar>,                      \
+      const typename internal::plain_constant_type<EXPR, SCALAR>::type, const EXPR >
 
-#define EIGEN_IMPLIES(a,b) (!(a) || (b))
+#define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD, OPNAME)                                                       \
+  template <typename T>                                                                                              \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(                                \
+      Derived,                                                                                                       \
+      typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(          \
+          OPNAME, Scalar, T)>::type,                                                                                 \
+      OPNAME)(METHOD)(const T& scalar) const {                                                                       \
+    typedef typename internal::promote_scalar_arg<Scalar, T, EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME, Scalar, T)>::type \
+        PromotedT;                                                                                                   \
+    return EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived, PromotedT, OPNAME)(                                       \
+        derived(), typename internal::plain_constant_type<Derived, PromotedT>::type(                                 \
+                       derived().rows(), derived().cols(), internal::scalar_constant_op<PromotedT>(scalar)));        \
+  }
 
-#define EIGEN_MAKE_CWISE_BINARY_OP(METHOD,FUNCTOR) \
-  template<typename OtherDerived> \
-  EIGEN_STRONG_INLINE const CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived> \
-  (METHOD)(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
-  { \
-    return CwiseBinaryOp<FUNCTOR<Scalar>, const Derived, const OtherDerived>(derived(), other.derived()); \
+#define EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD, OPNAME)                                                        \
+  template <typename T>                                                                                              \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend const EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(                         \
+      typename internal::promote_scalar_arg<Scalar EIGEN_COMMA T EIGEN_COMMA EIGEN_SCALAR_BINARY_SUPPORTED(          \
+          OPNAME, T, Scalar)>::type,                                                                                 \
+      Derived, OPNAME)(METHOD)(const T& scalar, const StorageBaseType& matrix) {                                     \
+    typedef typename internal::promote_scalar_arg<Scalar, T, EIGEN_SCALAR_BINARY_SUPPORTED(OPNAME, T, Scalar)>::type \
+        PromotedT;                                                                                                   \
+    return EIGEN_SCALAR_BINARYOP_EXPR_RETURN_TYPE(PromotedT, Derived, OPNAME)(                                       \
+        typename internal::plain_constant_type<Derived, PromotedT>::type(                                            \
+            matrix.derived().rows(), matrix.derived().cols(), internal::scalar_constant_op<PromotedT>(scalar)),      \
+        matrix.derived());                                                                                           \
   }
 
-// the expression type of a cwise product
-#define EIGEN_CWISE_PRODUCT_RETURN_TYPE(LHS,RHS) \
-    CwiseBinaryOp< \
-      internal::scalar_product_op< \
-          typename internal::traits<LHS>::Scalar, \
-          typename internal::traits<RHS>::Scalar \
-      >, \
-      const LHS, \
-      const RHS \
-    >
-
-#endif // EIGEN_MACROS_H
+#define EIGEN_MAKE_SCALAR_BINARY_OP(METHOD, OPNAME)     \
+  EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(METHOD, OPNAME) \
+  EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(METHOD, OPNAME)
+
+#if (defined(_CPPUNWIND) || defined(__EXCEPTIONS)) && !defined(EIGEN_CUDA_ARCH) && !defined(EIGEN_EXCEPTIONS) && \
+    !defined(EIGEN_USE_SYCL) && !defined(EIGEN_HIP_DEVICE_COMPILE)
+#define EIGEN_EXCEPTIONS
+#endif
+
+#ifdef EIGEN_EXCEPTIONS
+#define EIGEN_THROW_X(X) throw X
+#define EIGEN_THROW throw
+#define EIGEN_TRY try
+#define EIGEN_CATCH(X) catch (X)
+#else
+#if defined(EIGEN_CUDA_ARCH)
+#define EIGEN_THROW_X(X) asm("trap;")
+#define EIGEN_THROW asm("trap;")
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+#define EIGEN_THROW_X(X) asm("s_trap 0")
+#define EIGEN_THROW asm("s_trap 0")
+#else
+#define EIGEN_THROW_X(X) std::abort()
+#define EIGEN_THROW std::abort()
+#endif
+#define EIGEN_TRY if (true)
+#define EIGEN_CATCH(X) else
+#endif
+
+// The all function is used to enable a variadic version of eigen_assert which can take a parameter pack as its input.
+namespace Eigen {
+namespace internal {
+
+EIGEN_DEVICE_FUNC inline bool all() { return true; }
+
+template <typename T, typename... Ts>
+EIGEN_DEVICE_FUNC bool all(T t, Ts... ts) {
+  return t && all(ts...);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+// provide override and final specifiers if they are available:
+#define EIGEN_OVERRIDE override
+#define EIGEN_FINAL final
+
+// Wrapping #pragma unroll in a macro since it is required for SYCL
+#if defined(SYCL_DEVICE_ONLY)
+#if defined(_MSC_VER)
+#define EIGEN_UNROLL_LOOP __pragma(unroll)
+#else
+#define EIGEN_UNROLL_LOOP _Pragma("unroll")
+#endif
+#else
+#define EIGEN_UNROLL_LOOP
+#endif
+
+// Notice: Use this macro with caution. The code in the if body should still
+// compile with C++14.
+#if defined(EIGEN_HAS_CXX17_IFCONSTEXPR)
+#define EIGEN_IF_CONSTEXPR(X) if constexpr (X)
+#else
+#define EIGEN_IF_CONSTEXPR(X) if (X)
+#endif
+
+#endif  // EIGEN_MACROS_H
diff --git a/inst/include/Eigen/src/Core/util/MaxSizeVector.h b/inst/include/Eigen/src/Core/util/MaxSizeVector.h
new file mode 100644
index 00000000..db5bb895
--- /dev/null
+++ b/inst/include/Eigen/src/Core/util/MaxSizeVector.h
@@ -0,0 +1,139 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_FIXEDSIZEVECTOR_H
+#define EIGEN_FIXEDSIZEVECTOR_H
+
+namespace Eigen {
+
+/** \class MaxSizeVector
+ * \ingroup Core_Module
+ *
+ * \brief The MaxSizeVector class.
+ *
+ * The %MaxSizeVector provides a subset of std::vector functionality.
+ *
+ * The goal is to provide basic std::vector operations when using
+ * std::vector is not an option (e.g. on GPU or when compiling using
+ * FMA/AVX, as this can cause either compilation failures or illegal
+ * instruction failures).
+ *
+ * Beware: The constructors are not API compatible with these of
+ * std::vector.
+ */
+template <typename T>
+class MaxSizeVector {
+  static const size_t alignment = internal::plain_enum_max(EIGEN_ALIGNOF(T), sizeof(void*));
+
+ public:
+  // Construct a new MaxSizeVector, reserve n elements.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit MaxSizeVector(size_t n)
+      : reserve_(n), size_(0), data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {}
+
+  // Construct a new MaxSizeVector, reserve and resize to n.
+  // Copy the init value to all elements.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MaxSizeVector(size_t n, const T& init)
+      : reserve_(n), size_(n), data_(static_cast<T*>(internal::handmade_aligned_malloc(n * sizeof(T), alignment))) {
+    size_t i = 0;
+    EIGEN_TRY {
+      for (; i < size_; ++i) {
+        new (&data_[i]) T(init);
+      }
+    }
+    EIGEN_CATCH(...) {
+      // Construction failed, destruct in reverse order:
+      for (; (i + 1) > 0; --i) {
+        data_[i - 1].~T();
+      }
+      internal::handmade_aligned_free(data_);
+      EIGEN_THROW;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ~MaxSizeVector() {
+    for (size_t i = size_; i > 0; --i) {
+      data_[i - 1].~T();
+    }
+    internal::handmade_aligned_free(data_);
+  }
+
+  void resize(size_t n) {
+    eigen_assert(n <= reserve_);
+    for (; size_ < n; ++size_) {
+      new (&data_[size_]) T;
+    }
+    for (; size_ > n; --size_) {
+      data_[size_ - 1].~T();
+    }
+    eigen_assert(size_ == n);
+  }
+
+  // Append new elements (up to reserved size).
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void push_back(const T& t) {
+    eigen_assert(size_ < reserve_);
+    new (&data_[size_++]) T(t);
+  }
+
+  // For C++03 compatibility this only takes one argument
+  template <class X>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void emplace_back(const X& x) {
+    eigen_assert(size_ < reserve_);
+    new (&data_[size_++]) T(x);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& operator[](size_t i) const {
+    eigen_assert(i < size_);
+    return data_[i];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& operator[](size_t i) {
+    eigen_assert(i < size_);
+    return data_[i];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T& back() {
+    eigen_assert(size_ > 0);
+    return data_[size_ - 1];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T& back() const {
+    eigen_assert(size_ > 0);
+    return data_[size_ - 1];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pop_back() {
+    eigen_assert(size_ > 0);
+    data_[--size_].~T();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t size() const { return size_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool empty() const { return size_ == 0; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* data() { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* data() const { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* begin() { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr T* end() { return data_ + size_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* begin() const { return data_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr const T* end() const { return data_ + size_; }
+
+ private:
+  size_t reserve_;
+  size_t size_;
+  T* data_;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_FIXEDSIZEVECTOR_H
diff --git a/inst/include/Eigen/src/Core/util/Memory.h b/inst/include/Eigen/src/Core/util/Memory.h
index b9af5cf8..1492f72c 100644
--- a/inst/include/Eigen/src/Core/util/Memory.h
+++ b/inst/include/Eigen/src/Core/util/Memory.h
@@ -1,17 +1,17 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2008-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
 // Copyright (C) 2009 Kenneth Riddile <kfriddile@yahoo.com>
 // Copyright (C) 2010 Hauke Heibel <hauke.heibel@gmail.com>
 // Copyright (C) 2010 Thomas Capricelli <orzel@freehackers.org>
+// Copyright (C) 2013 Pavel Holoborodko <pavel@holoborodko.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-
 /*****************************************************************************
 *** Platform checks for aligned malloc functions                           ***
 *****************************************************************************/
@@ -30,66 +30,119 @@
 //   http://gcc.fyxm.net/summit/2003/Porting%20to%2064%20bit.pdf
 // page 114, "[The] LP64 model [...] is used by all 64-bit UNIX ports" so it's indeed
 // quite safe, at least within the context of glibc, to equate 64-bit with LP64.
-#if defined(__GLIBC__) && ((__GLIBC__>=2 && __GLIBC_MINOR__ >= 8) || __GLIBC__>2) \
- && defined(__LP64__) && ! defined( __SANITIZE_ADDRESS__ )
-  #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1
+#if defined(__GLIBC__) && ((__GLIBC__ >= 2 && __GLIBC_MINOR__ >= 8) || __GLIBC__ > 2) && defined(__LP64__) && \
+    !defined(__SANITIZE_ADDRESS__) && (EIGEN_DEFAULT_ALIGN_BYTES == 16)
+#define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 1
 #else
-  #define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0
+#define EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED 0
 #endif
 
 // FreeBSD 6 seems to have 16-byte aligned malloc
 //   See http://svn.freebsd.org/viewvc/base/stable/6/lib/libc/stdlib/malloc.c?view=markup
 // FreeBSD 7 seems to have 16-byte aligned malloc except on ARM and MIPS architectures
 //   See http://svn.freebsd.org/viewvc/base/stable/7/lib/libc/stdlib/malloc.c?view=markup
-#if defined(__FreeBSD__) && !defined(__arm__) && !defined(__mips__)
-  #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1
+#if defined(__FreeBSD__) && !(EIGEN_ARCH_ARM || EIGEN_ARCH_MIPS) && (EIGEN_DEFAULT_ALIGN_BYTES == 16)
+#define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 1
 #else
-  #define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0
+#define EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED 0
 #endif
 
-#if defined(__APPLE__) \
- || defined(_WIN64) \
- || EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED \
- || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED
-  #define EIGEN_MALLOC_ALREADY_ALIGNED 1
+#if (EIGEN_OS_MAC && (EIGEN_DEFAULT_ALIGN_BYTES == 16)) || (EIGEN_OS_WIN64 && (EIGEN_DEFAULT_ALIGN_BYTES == 16)) || \
+    EIGEN_GLIBC_MALLOC_ALREADY_ALIGNED || EIGEN_FREEBSD_MALLOC_ALREADY_ALIGNED
+#define EIGEN_MALLOC_ALREADY_ALIGNED 1
 #else
-  #define EIGEN_MALLOC_ALREADY_ALIGNED 0
+#define EIGEN_MALLOC_ALREADY_ALIGNED 0
 #endif
 
 #endif
 
-// See bug 554 (http://eigen.tuxfamily.org/bz/show_bug.cgi?id=554)
-// It seems to be unsafe to check _POSIX_ADVISORY_INFO without including unistd.h first.
-// Currently, let's include it only on unix systems:
-#if defined(__unix__) || defined(__unix)
-  #include <unistd.h>
-  #if ((defined __QNXNTO__) || (defined _GNU_SOURCE) || (defined __PGI) || ((defined _XOPEN_SOURCE) && (_XOPEN_SOURCE >= 600))) && (defined _POSIX_ADVISORY_INFO) && (_POSIX_ADVISORY_INFO > 0)
-    #define EIGEN_HAS_POSIX_MEMALIGN 1
-  #endif
-#endif
+#ifndef EIGEN_MALLOC_CHECK_THREAD_LOCAL
 
-#ifndef EIGEN_HAS_POSIX_MEMALIGN
-  #define EIGEN_HAS_POSIX_MEMALIGN 0
-#endif
+// Check whether we can use the thread_local keyword to allow or disallow
+// allocating memory with per-thread granularity, by means of the
+// set_is_malloc_allowed() function.
+#ifndef EIGEN_AVOID_THREAD_LOCAL
 
-#ifdef EIGEN_VECTORIZE_SSE
-  #define EIGEN_HAS_MM_MALLOC 1
+#if ((EIGEN_COMP_GNUC) || __has_feature(cxx_thread_local) || EIGEN_COMP_MSVC >= 1900) && \
+    !defined(EIGEN_GPU_COMPILE_PHASE)
+#define EIGEN_MALLOC_CHECK_THREAD_LOCAL thread_local
 #else
-  #define EIGEN_HAS_MM_MALLOC 0
+#define EIGEN_MALLOC_CHECK_THREAD_LOCAL
+#endif
+
+#else  // EIGEN_AVOID_THREAD_LOCAL
+#define EIGEN_MALLOC_CHECK_THREAD_LOCAL
+#endif  // EIGEN_AVOID_THREAD_LOCAL
+
 #endif
 
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
 
-inline void throw_std_bad_alloc()
-{
-  #ifdef EIGEN_EXCEPTIONS
-    throw std::bad_alloc();
-  #else
-    std::size_t huge = -1;
-    new int[huge];
-  #endif
+/*****************************************************************************
+*** Implementation of portable aligned versions of malloc/free/realloc     ***
+*****************************************************************************/
+
+#ifdef EIGEN_NO_MALLOC
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() {
+  eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)");
+}
+EIGEN_DEVICE_FUNC inline void check_that_free_is_allowed() {
+  eigen_assert(false && "heap deallocation is forbidden (EIGEN_NO_MALLOC is defined)");
+}
+#elif defined EIGEN_RUNTIME_NO_MALLOC
+EIGEN_DEVICE_FUNC inline bool is_malloc_allowed_impl(bool update, bool new_value = false) {
+  EIGEN_MALLOC_CHECK_THREAD_LOCAL static bool value = true;
+  if (update == 1) value = new_value;
+  return value;
+}
+EIGEN_DEVICE_FUNC inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
+EIGEN_DEVICE_FUNC inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() {
+  eigen_assert(is_malloc_allowed() &&
+               "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and set_is_malloc_allowed is false)");
+}
+EIGEN_DEVICE_FUNC inline bool is_free_allowed_impl(bool update, bool new_value = false) {
+  EIGEN_MALLOC_CHECK_THREAD_LOCAL static bool value = true;
+  if (update == 1) value = new_value;
+  return value;
+}
+EIGEN_DEVICE_FUNC inline bool is_free_allowed() { return is_free_allowed_impl(false); }
+EIGEN_DEVICE_FUNC inline bool set_is_free_allowed(bool new_value) { return is_free_allowed_impl(true, new_value); }
+EIGEN_DEVICE_FUNC inline void check_that_free_is_allowed() {
+  eigen_assert(is_malloc_allowed() &&
+               "heap deallocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and set_is_free_allowed is false)");
+}
+#else
+EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() {}
+EIGEN_DEVICE_FUNC inline void check_that_free_is_allowed() {}
+#endif
+
+EIGEN_DEVICE_FUNC inline void throw_std_bad_alloc() {
+#ifdef EIGEN_EXCEPTIONS
+  throw std::bad_alloc();
+#else
+  std::size_t huge = static_cast<std::size_t>(-1);
+#if defined(EIGEN_HIPCC)
+  //
+  // calls to "::operator new" are to be treated as opaque function calls (i.e no inlining),
+  // and as a consequence the code in the #else block triggers the hipcc warning :
+  // "no overloaded function has restriction specifiers that are compatible with the ambient context"
+  //
+  // "throw_std_bad_alloc" has the EIGEN_DEVICE_FUNC attribute, so it seems that hipcc expects
+  // the same on "operator new"
+  // Reverting code back to the old version in this #if block for the hipcc compiler
+  //
+  new int[huge];
+#else
+  void* unused = ::operator new(huge);
+  EIGEN_UNUSED_VARIABLE(unused);
+#endif
+#endif
 }
 
 /*****************************************************************************
@@ -98,193 +151,132 @@ inline void throw_std_bad_alloc()
 
 /* ----- Hand made implementations of aligned malloc/free and realloc ----- */
 
-/** \internal Like malloc, but the returned pointer is guaranteed to be 16-byte aligned.
-  * Fast, but wastes 16 additional bytes of memory. Does not throw any exception.
-  */
-inline void* handmade_aligned_malloc(std::size_t size)
-{
-  void *original = std::malloc(size+16);
-  if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16);
-  *(reinterpret_cast<void**>(aligned) - 1) = original;
+/** \internal Like malloc, but the returned pointer is guaranteed to be aligned to `alignment`.
+ * Fast, but wastes `alignment` additional bytes of memory. Does not throw any exception.
+ */
+EIGEN_DEVICE_FUNC inline void* handmade_aligned_malloc(std::size_t size,
+                                                       std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES) {
+  eigen_assert(alignment >= sizeof(void*) && alignment <= 256 && (alignment & (alignment - 1)) == 0 &&
+               "Alignment must be at least sizeof(void*), less than or equal to 256, and a power of 2");
+
+  check_that_malloc_is_allowed();
+  EIGEN_USING_STD(malloc)
+  void* original = malloc(size + alignment);
+  if (original == nullptr) return nullptr;
+  std::size_t offset = alignment - (reinterpret_cast<std::size_t>(original) & (alignment - 1));
+  void* aligned = static_cast<void*>(static_cast<uint8_t*>(original) + offset);
+  // Store offset - 1, since it is guaranteed to be at least 1.
+  *(static_cast<uint8_t*>(aligned) - 1) = static_cast<uint8_t>(offset - 1);
   return aligned;
 }
 
 /** \internal Frees memory allocated with handmade_aligned_malloc */
-inline void handmade_aligned_free(void *ptr)
-{
-  if (ptr) std::free(*(reinterpret_cast<void**>(ptr) - 1));
+EIGEN_DEVICE_FUNC inline void handmade_aligned_free(void* ptr) {
+  if (ptr != nullptr) {
+    std::size_t offset = static_cast<std::size_t>(*(static_cast<uint8_t*>(ptr) - 1)) + 1;
+    void* original = static_cast<void*>(static_cast<uint8_t*>(ptr) - offset);
+
+    check_that_free_is_allowed();
+    EIGEN_USING_STD(free)
+    free(original);
+  }
 }
 
 /** \internal
-  * \brief Reallocates aligned memory.
-  * Since we know that our handmade version is based on std::realloc
-  * we can use std::realloc to implement efficient reallocation.
-  */
-inline void* handmade_aligned_realloc(void* ptr, std::size_t size, std::size_t = 0)
-{
-  if (ptr == 0) return handmade_aligned_malloc(size);
-  void *original = *(reinterpret_cast<void**>(ptr) - 1);
-  std::ptrdiff_t previous_offset = static_cast<char *>(ptr)-static_cast<char *>(original);
-  original = std::realloc(original,size+16);
-  if (original == 0) return 0;
-  void *aligned = reinterpret_cast<void*>((reinterpret_cast<std::size_t>(original) & ~(std::size_t(15))) + 16);
-  void *previous_aligned = static_cast<char *>(original)+previous_offset;
-  if(aligned!=previous_aligned)
-    std::memmove(aligned, previous_aligned, size);
-  
-  *(reinterpret_cast<void**>(aligned) - 1) = original;
-  return aligned;
-}
-
-/*****************************************************************************
-*** Implementation of generic aligned realloc (when no realloc can be used)***
-*****************************************************************************/
-
-void* aligned_malloc(std::size_t size);
-void  aligned_free(void *ptr);
-
-/** \internal
-  * \brief Reallocates aligned memory.
-  * Allows reallocation with aligned ptr types. This implementation will
-  * always create a new memory chunk and copy the old data.
-  */
-inline void* generic_aligned_realloc(void* ptr, size_t size, size_t old_size)
-{
-  if (ptr==0)
-    return aligned_malloc(size);
-
-  if (size==0)
-  {
-    aligned_free(ptr);
-    return 0;
-  }
-
-  void* newptr = aligned_malloc(size);
-  if (newptr == 0)
-  {
-    #ifdef EIGEN_HAS_ERRNO
-    errno = ENOMEM; // according to the standard
-    #endif
-    return 0;
-  }
+ * \brief Reallocates aligned memory.
+ * Since we know that our handmade version is based on std::malloc
+ * we can use std::realloc to implement efficient reallocation.
+ */
+EIGEN_DEVICE_FUNC inline void* handmade_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size,
+                                                        std::size_t alignment = EIGEN_DEFAULT_ALIGN_BYTES) {
+  if (ptr == nullptr) return handmade_aligned_malloc(new_size, alignment);
+  std::size_t old_offset = static_cast<std::size_t>(*(static_cast<uint8_t*>(ptr) - 1)) + 1;
+  void* old_original = static_cast<uint8_t*>(ptr) - old_offset;
 
-  if (ptr != 0)
-  {
-    std::memcpy(newptr, ptr, (std::min)(size,old_size));
-    aligned_free(ptr);
+  check_that_malloc_is_allowed();
+  EIGEN_USING_STD(realloc)
+  void* original = realloc(old_original, new_size + alignment);
+  if (original == nullptr) return nullptr;
+  if (original == old_original) return ptr;
+  std::size_t offset = alignment - (reinterpret_cast<std::size_t>(original) & (alignment - 1));
+  void* aligned = static_cast<void*>(static_cast<uint8_t*>(original) + offset);
+  if (offset != old_offset) {
+    const void* src = static_cast<const void*>(static_cast<uint8_t*>(original) + old_offset);
+    std::size_t count = (std::min)(new_size, old_size);
+    std::memmove(aligned, src, count);
   }
-
-  return newptr;
+  // Store offset - 1, since it is guaranteed to be at least 1.
+  *(static_cast<uint8_t*>(aligned) - 1) = static_cast<uint8_t>(offset - 1);
+  return aligned;
 }
 
-/*****************************************************************************
-*** Implementation of portable aligned versions of malloc/free/realloc     ***
-*****************************************************************************/
+/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on
+ * the requirements. On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
+ */
+EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) {
+  if (size == 0) return nullptr;
 
-#ifdef EIGEN_NO_MALLOC
-inline void check_that_malloc_is_allowed()
-{
-  eigen_assert(false && "heap allocation is forbidden (EIGEN_NO_MALLOC is defined)");
-}
-#elif defined EIGEN_RUNTIME_NO_MALLOC
-inline bool is_malloc_allowed_impl(bool update, bool new_value = false)
-{
-  static bool value = true;
-  if (update == 1)
-    value = new_value;
-  return value;
-}
-inline bool is_malloc_allowed() { return is_malloc_allowed_impl(false); }
-inline bool set_is_malloc_allowed(bool new_value) { return is_malloc_allowed_impl(true, new_value); }
-inline void check_that_malloc_is_allowed()
-{
-  eigen_assert(is_malloc_allowed() && "heap allocation is forbidden (EIGEN_RUNTIME_NO_MALLOC is defined and g_is_malloc_allowed is false)");
-}
-#else 
-inline void check_that_malloc_is_allowed()
-{}
-#endif
+  void* result;
+#if (EIGEN_DEFAULT_ALIGN_BYTES == 0) || EIGEN_MALLOC_ALREADY_ALIGNED
 
-/** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 bytes alignment.
-  * On allocation error, the returned pointer is null, and std::bad_alloc is thrown.
-  */
-inline void* aligned_malloc(size_t size)
-{
   check_that_malloc_is_allowed();
+  EIGEN_USING_STD(malloc)
+  result = malloc(size);
 
-  void *result;
-  #if !EIGEN_ALIGN
-    result = std::malloc(size);
-  #elif EIGEN_MALLOC_ALREADY_ALIGNED
-    result = std::malloc(size);
-  #elif EIGEN_HAS_POSIX_MEMALIGN
-    if(posix_memalign(&result, 16, size)) result = 0;
-  #elif EIGEN_HAS_MM_MALLOC
-    result = _mm_malloc(size, 16);
-  #elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
-    result = _aligned_malloc(size, 16);
-  #else
-    result = handmade_aligned_malloc(size);
-  #endif
-
-  if(!result && size)
-    throw_std_bad_alloc();
+#if EIGEN_DEFAULT_ALIGN_BYTES == 16
+  eigen_assert((size < 16 || (std::size_t(result) % 16) == 0) &&
+               "System's malloc returned an unaligned pointer. Compile with EIGEN_MALLOC_ALREADY_ALIGNED=0 to fallback "
+               "to handmade aligned memory allocator.");
+#endif
+#else
+  result = handmade_aligned_malloc(size);
+#endif
+
+  if (!result && size) throw_std_bad_alloc();
 
   return result;
 }
 
 /** \internal Frees memory allocated with aligned_malloc. */
-inline void aligned_free(void *ptr)
-{
-  #if !EIGEN_ALIGN
-    std::free(ptr);
-  #elif EIGEN_MALLOC_ALREADY_ALIGNED
-    std::free(ptr);
-  #elif EIGEN_HAS_POSIX_MEMALIGN
-    std::free(ptr);
-  #elif EIGEN_HAS_MM_MALLOC
-    _mm_free(ptr);
-  #elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
-    _aligned_free(ptr);
-  #else
-    handmade_aligned_free(ptr);
-  #endif
+EIGEN_DEVICE_FUNC inline void aligned_free(void* ptr) {
+#if (EIGEN_DEFAULT_ALIGN_BYTES == 0) || EIGEN_MALLOC_ALREADY_ALIGNED
+
+  if (ptr != nullptr) {
+    check_that_free_is_allowed();
+    EIGEN_USING_STD(free)
+    free(ptr);
+  }
+
+#else
+  handmade_aligned_free(ptr);
+#endif
 }
 
 /**
-* \internal
-* \brief Reallocates an aligned block of memory.
-* \throws std::bad_alloc on allocation failure
-**/
-inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
-{
-  EIGEN_UNUSED_VARIABLE(old_size);
-
-  void *result;
-#if !EIGEN_ALIGN
-  result = std::realloc(ptr,new_size);
-#elif EIGEN_MALLOC_ALREADY_ALIGNED
-  result = std::realloc(ptr,new_size);
-#elif EIGEN_HAS_POSIX_MEMALIGN
-  result = generic_aligned_realloc(ptr,new_size,old_size);
-#elif EIGEN_HAS_MM_MALLOC
-  // The defined(_mm_free) is just here to verify that this MSVC version
-  // implements _mm_malloc/_mm_free based on the corresponding _aligned_
-  // functions. This may not always be the case and we just try to be safe.
-  #if defined(_MSC_VER) && (!defined(_WIN32_WCE)) && defined(_mm_free)
-    result = _aligned_realloc(ptr,new_size,16);
-  #else
-    result = generic_aligned_realloc(ptr,new_size,old_size);
-  #endif
-#elif defined(_MSC_VER) && (!defined(_WIN32_WCE))
-  result = _aligned_realloc(ptr,new_size,16);
+ * \internal
+ * \brief Reallocates an aligned block of memory.
+ * \throws std::bad_alloc on allocation failure
+ */
+EIGEN_DEVICE_FUNC inline void* aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size) {
+  if (ptr == nullptr) return aligned_malloc(new_size);
+  if (old_size == new_size) return ptr;
+  if (new_size == 0) {
+    aligned_free(ptr);
+    return nullptr;
+  }
+
+  void* result;
+#if (EIGEN_DEFAULT_ALIGN_BYTES == 0) || EIGEN_MALLOC_ALREADY_ALIGNED
+  EIGEN_UNUSED_VARIABLE(old_size)
+
+  check_that_malloc_is_allowed();
+  EIGEN_USING_STD(realloc)
+  result = realloc(ptr, new_size);
 #else
-  result = handmade_aligned_realloc(ptr,new_size,old_size);
+  result = handmade_aligned_realloc(ptr, new_size, old_size);
 #endif
 
-  if (!result && new_size)
-    throw_std_bad_alloc();
+  if (!result && new_size) throw_std_bad_alloc();
 
   return result;
 }
@@ -294,227 +286,358 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size)
 *****************************************************************************/
 
 /** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned.
-  * On allocation error, the returned pointer is null, and a std::bad_alloc is thrown.
-  */
-template<bool Align> inline void* conditional_aligned_malloc(size_t size)
-{
+ * On allocation error, the returned pointer is null, and a std::bad_alloc is thrown.
+ */
+template <bool Align>
+EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(std::size_t size) {
   return aligned_malloc(size);
 }
 
-template<> inline void* conditional_aligned_malloc<false>(size_t size)
-{
+template <>
+EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc<false>(std::size_t size) {
+  if (size == 0) return nullptr;
+
   check_that_malloc_is_allowed();
+  EIGEN_USING_STD(malloc)
+  void* result = malloc(size);
 
-  void *result = std::malloc(size);
-  if(!result && size)
-    throw_std_bad_alloc();
+  if (!result && size) throw_std_bad_alloc();
   return result;
 }
 
 /** \internal Frees memory allocated with conditional_aligned_malloc */
-template<bool Align> inline void conditional_aligned_free(void *ptr)
-{
+template <bool Align>
+EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void* ptr) {
   aligned_free(ptr);
 }
 
-template<> inline void conditional_aligned_free<false>(void *ptr)
-{
-  std::free(ptr);
+template <>
+EIGEN_DEVICE_FUNC inline void conditional_aligned_free<false>(void* ptr) {
+  if (ptr != nullptr) {
+    check_that_free_is_allowed();
+    EIGEN_USING_STD(free)
+    free(ptr);
+  }
 }
 
-template<bool Align> inline void* conditional_aligned_realloc(void* ptr, size_t new_size, size_t old_size)
-{
+template <bool Align>
+EIGEN_DEVICE_FUNC inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size) {
   return aligned_realloc(ptr, new_size, old_size);
 }
 
-template<> inline void* conditional_aligned_realloc<false>(void* ptr, size_t new_size, size_t)
-{
-  return std::realloc(ptr, new_size);
+template <>
+EIGEN_DEVICE_FUNC inline void* conditional_aligned_realloc<false>(void* ptr, std::size_t new_size,
+                                                                  std::size_t old_size) {
+  if (ptr == nullptr) return conditional_aligned_malloc<false>(new_size);
+  if (old_size == new_size) return ptr;
+  if (new_size == 0) {
+    conditional_aligned_free<false>(ptr);
+    return nullptr;
+  }
+
+  check_that_malloc_is_allowed();
+  EIGEN_USING_STD(realloc)
+  return realloc(ptr, new_size);
 }
 
 /*****************************************************************************
 *** Construction/destruction of array elements                             ***
 *****************************************************************************/
 
+/** \internal Destructs the elements of an array.
+ * The \a size parameters tells on how many objects to call the destructor of T.
+ */
+template <typename T>
+EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T* ptr, std::size_t size) {
+  // always destruct an array starting from the end.
+  if (ptr)
+    while (size) ptr[--size].~T();
+}
+
 /** \internal Constructs the elements of an array.
-  * The \a size parameter tells on how many objects to call the constructor of T.
-  */
-template<typename T> inline T* construct_elements_of_array(T *ptr, size_t size)
-{
-  for (size_t i=0; i < size; ++i) ::new (ptr + i) T;
+ * The \a size parameter tells on how many objects to call the constructor of T.
+ */
+template <typename T>
+EIGEN_DEVICE_FUNC inline T* default_construct_elements_of_array(T* ptr, std::size_t size) {
+  std::size_t i = 0;
+  EIGEN_TRY {
+    for (i = 0; i < size; ++i) ::new (ptr + i) T;
+  }
+  EIGEN_CATCH(...) {
+    destruct_elements_of_array(ptr, i);
+    EIGEN_THROW;
+  }
   return ptr;
 }
 
-/** \internal Destructs the elements of an array.
-  * The \a size parameters tells on how many objects to call the destructor of T.
-  */
-template<typename T> inline void destruct_elements_of_array(T *ptr, size_t size)
-{
-  // always destruct an array starting from the end.
-  if(ptr)
-    while(size) ptr[--size].~T();
+/** \internal Copy-constructs the elements of an array.
+ * The \a size parameter tells on how many objects to copy.
+ */
+template <typename T>
+EIGEN_DEVICE_FUNC inline T* copy_construct_elements_of_array(T* ptr, const T* src, std::size_t size) {
+  std::size_t i = 0;
+  EIGEN_TRY {
+    for (i = 0; i < size; ++i) ::new (ptr + i) T(*(src + i));
+  }
+  EIGEN_CATCH(...) {
+    destruct_elements_of_array(ptr, i);
+    EIGEN_THROW;
+  }
+  return ptr;
+}
+
+/** \internal Move-constructs the elements of an array.
+ * The \a size parameter tells on how many objects to move.
+ */
+template <typename T>
+EIGEN_DEVICE_FUNC inline T* move_construct_elements_of_array(T* ptr, T* src, std::size_t size) {
+  std::size_t i = 0;
+  EIGEN_TRY {
+    for (i = 0; i < size; ++i) ::new (ptr + i) T(std::move(*(src + i)));
+  }
+  EIGEN_CATCH(...) {
+    destruct_elements_of_array(ptr, i);
+    EIGEN_THROW;
+  }
+  return ptr;
 }
 
 /*****************************************************************************
 *** Implementation of aligned new/delete-like functions                    ***
 *****************************************************************************/
 
-template<typename T>
-EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size)
-{
-  if(size > size_t(-1) / sizeof(T))
-    throw_std_bad_alloc();
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(std::size_t size) {
+  constexpr std::size_t max_elements = (std::numeric_limits<std::ptrdiff_t>::max)() / sizeof(T);
+  if (size > max_elements) throw_std_bad_alloc();
 }
 
 /** \internal Allocates \a size objects of type T. The returned pointer is guaranteed to have 16 bytes alignment.
-  * On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown.
-  * The default constructor of T is called.
-  */
-template<typename T> inline T* aligned_new(size_t size)
-{
+ * On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown.
+ * The default constructor of T is called.
+ */
+template <typename T>
+EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size) {
   check_size_for_overflow<T>(size);
-  T *result = reinterpret_cast<T*>(aligned_malloc(sizeof(T)*size));
-  return construct_elements_of_array(result, size);
+  T* result = static_cast<T*>(aligned_malloc(sizeof(T) * size));
+  EIGEN_TRY { return default_construct_elements_of_array(result, size); }
+  EIGEN_CATCH(...) {
+    aligned_free(result);
+    EIGEN_THROW;
+  }
+  return result;
 }
 
-template<typename T, bool Align> inline T* conditional_aligned_new(size_t size)
-{
+template <typename T, bool Align>
+EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(std::size_t size) {
   check_size_for_overflow<T>(size);
-  T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
-  return construct_elements_of_array(result, size);
+  T* result = static_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T) * size));
+  EIGEN_TRY { return default_construct_elements_of_array(result, size); }
+  EIGEN_CATCH(...) {
+    conditional_aligned_free<Align>(result);
+    EIGEN_THROW;
+  }
+  return result;
 }
 
 /** \internal Deletes objects constructed with aligned_new
-  * The \a size parameters tells on how many objects to call the destructor of T.
-  */
-template<typename T> inline void aligned_delete(T *ptr, size_t size)
-{
+ * The \a size parameters tells on how many objects to call the destructor of T.
+ */
+template <typename T>
+EIGEN_DEVICE_FUNC inline void aligned_delete(T* ptr, std::size_t size) {
   destruct_elements_of_array<T>(ptr, size);
   aligned_free(ptr);
 }
 
 /** \internal Deletes objects constructed with conditional_aligned_new
-  * The \a size parameters tells on how many objects to call the destructor of T.
-  */
-template<typename T, bool Align> inline void conditional_aligned_delete(T *ptr, size_t size)
-{
+ * The \a size parameters tells on how many objects to call the destructor of T.
+ */
+template <typename T, bool Align>
+EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T* ptr, std::size_t size) {
   destruct_elements_of_array<T>(ptr, size);
   conditional_aligned_free<Align>(ptr);
 }
 
-template<typename T, bool Align> inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size)
-{
+template <typename T, bool Align>
+EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, std::size_t new_size, std::size_t old_size) {
   check_size_for_overflow<T>(new_size);
   check_size_for_overflow<T>(old_size);
-  if(new_size < old_size)
-    destruct_elements_of_array(pts+new_size, old_size-new_size);
-  T *result = reinterpret_cast<T*>(conditional_aligned_realloc<Align>(reinterpret_cast<void*>(pts), sizeof(T)*new_size, sizeof(T)*old_size));
-  if(new_size > old_size)
-    construct_elements_of_array(result+old_size, new_size-old_size);
+
+  // If elements need to be explicitly initialized, we cannot simply realloc
+  // (or memcpy) the memory block - each element needs to be reconstructed.
+  // Otherwise, objects that contain internal pointers like mpfr or
+  // AnnoyingScalar can be pointing to the wrong thing.
+  T* result = static_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T) * new_size));
+  EIGEN_TRY {
+    // Move-construct initial elements.
+    std::size_t copy_size = (std::min)(old_size, new_size);
+    move_construct_elements_of_array(result, pts, copy_size);
+
+    // Default-construct remaining elements.
+    if (new_size > old_size) {
+      default_construct_elements_of_array(result + copy_size, new_size - old_size);
+    }
+
+    // Delete old elements.
+    conditional_aligned_delete<T, Align>(pts, old_size);
+  }
+  EIGEN_CATCH(...) {
+    conditional_aligned_free<Align>(result);
+    EIGEN_THROW;
+  }
+
   return result;
 }
 
-
-template<typename T, bool Align> inline T* conditional_aligned_new_auto(size_t size)
-{
-  if(size==0)
-    return 0; // short-cut. Also fixes Bug 884
+template <typename T, bool Align>
+EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(std::size_t size) {
+  if (size == 0) return nullptr;  // short-cut. Also fixes Bug 884
   check_size_for_overflow<T>(size);
-  T *result = reinterpret_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T)*size));
-  if(NumTraits<T>::RequireInitialization)
-    construct_elements_of_array(result, size);
+  T* result = static_cast<T*>(conditional_aligned_malloc<Align>(sizeof(T) * size));
+  if (NumTraits<T>::RequireInitialization) {
+    EIGEN_TRY { default_construct_elements_of_array(result, size); }
+    EIGEN_CATCH(...) {
+      conditional_aligned_free<Align>(result);
+      EIGEN_THROW;
+    }
+  }
   return result;
 }
 
-template<typename T, bool Align> inline T* conditional_aligned_realloc_new_auto(T* pts, size_t new_size, size_t old_size)
-{
+template <typename T, bool Align>
+EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new_auto(T* pts, std::size_t new_size, std::size_t old_size) {
+  if (NumTraits<T>::RequireInitialization) {
+    return conditional_aligned_realloc_new<T, Align>(pts, new_size, old_size);
+  }
+
   check_size_for_overflow<T>(new_size);
   check_size_for_overflow<T>(old_size);
-  if(NumTraits<T>::RequireInitialization && (new_size < old_size))
-    destruct_elements_of_array(pts+new_size, old_size-new_size);
-  T *result = reinterpret_cast<T*>(conditional_aligned_realloc<Align>(reinterpret_cast<void*>(pts), sizeof(T)*new_size, sizeof(T)*old_size));
-  if(NumTraits<T>::RequireInitialization && (new_size > old_size))
-    construct_elements_of_array(result+old_size, new_size-old_size);
-  return result;
+  return static_cast<T*>(
+      conditional_aligned_realloc<Align>(static_cast<void*>(pts), sizeof(T) * new_size, sizeof(T) * old_size));
 }
 
-template<typename T, bool Align> inline void conditional_aligned_delete_auto(T *ptr, size_t size)
-{
-  if(NumTraits<T>::RequireInitialization)
-    destruct_elements_of_array<T>(ptr, size);
+template <typename T, bool Align>
+EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T* ptr, std::size_t size) {
+  if (NumTraits<T>::RequireInitialization) destruct_elements_of_array<T>(ptr, size);
   conditional_aligned_free<Align>(ptr);
 }
 
 /****************************************************************************/
 
-/** \internal Returns the index of the first element of the array that is well aligned for vectorization.
-  *
-  * \param array the address of the start of the array
-  * \param size the size of the array
-  *
-  * \note If no element of the array is well aligned, the size of the array is returned. Typically,
-  * for example with SSE, "well aligned" means 16-byte-aligned. If vectorization is disabled or if the
-  * packet size for the given scalar type is 1, then everything is considered well-aligned.
-  *
-  * \note If the scalar type is vectorizable, we rely on the following assumptions: sizeof(Scalar) is a
-  * power of 2, the packet size in bytes is also a power of 2, and is a multiple of sizeof(Scalar). On the
-  * other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails for
-  * example with Scalar=double on certain 32-bit platforms, see bug #79.
-  *
-  * There is also the variant first_aligned(const MatrixBase&) defined in DenseCoeffsBase.h.
-  */
-template<typename Scalar, typename Index>
-static inline Index first_aligned(const Scalar* array, Index size)
-{
-  static const Index PacketSize = packet_traits<Scalar>::size;
-  static const Index PacketAlignedMask = PacketSize-1;
-
-  if(PacketSize==1)
-  {
-    // Either there is no vectorization, or a packet consists of exactly 1 scalar so that all elements
-    // of the array have the same alignment.
+/** \internal Returns the index of the first element of the array that is well aligned with respect to the requested \a
+ * Alignment.
+ *
+ * \tparam Alignment requested alignment in Bytes.
+ * \param array the address of the start of the array
+ * \param size the size of the array
+ *
+ * \note If no element of the array is well aligned or the requested alignment is not a multiple of a scalar,
+ * the size of the array is returned. For example with SSE, the requested alignment is typically 16-bytes. If
+ * packet size for the given scalar type is 1, then everything is considered well-aligned.
+ *
+ * \note Otherwise, if the Alignment is larger that the scalar size, we rely on the assumptions that sizeof(Scalar) is a
+ * power of 2. On the other hand, we do not assume that the array address is a multiple of sizeof(Scalar), as that fails
+ * for example with Scalar=double on certain 32-bit platforms, see bug #79.
+ *
+ * There is also the variant first_aligned(const MatrixBase&) defined in DenseCoeffsBase.h.
+ * \sa first_default_aligned()
+ */
+template <int Alignment, typename Scalar, typename Index>
+EIGEN_DEVICE_FUNC inline Index first_aligned(const Scalar* array, Index size) {
+  const Index ScalarSize = sizeof(Scalar);
+  const Index AlignmentSize = Alignment / ScalarSize;
+  const Index AlignmentMask = AlignmentSize - 1;
+
+  if (AlignmentSize <= 1) {
+    // Either the requested alignment if smaller than a scalar, or it exactly match a 1 scalar
+    // so that all elements of the array have the same alignment.
     return 0;
-  }
-  else if(size_t(array) & (sizeof(Scalar)-1))
-  {
-    // There is vectorization for this scalar type, but the array is not aligned to the size of a single scalar.
-    // Consequently, no element of the array is well aligned.
+  } else if ((std::uintptr_t(array) & (sizeof(Scalar) - 1)) || (Alignment % ScalarSize) != 0) {
+    // The array is not aligned to the size of a single scalar, or the requested alignment is not a multiple of the
+    // scalar size. Consequently, no element of the array is well aligned.
     return size;
+  } else {
+    Index first = (AlignmentSize - (Index((std::uintptr_t(array) / sizeof(Scalar))) & AlignmentMask)) & AlignmentMask;
+    return (first < size) ? first : size;
   }
-  else
-  {
-    return std::min<Index>( (PacketSize - (Index((size_t(array)/sizeof(Scalar))) & PacketAlignedMask))
-                           & PacketAlignedMask, size);
-  }
+}
+
+/** \internal Returns the index of the first element of the array that is well aligned with respect the largest packet
+ * requirement. \sa first_aligned(Scalar*,Index) and first_default_aligned(DenseBase<Derived>) */
+template <typename Scalar, typename Index>
+EIGEN_DEVICE_FUNC inline Index first_default_aligned(const Scalar* array, Index size) {
+  typedef typename packet_traits<Scalar>::type DefaultPacketType;
+  return first_aligned<unpacket_traits<DefaultPacketType>::alignment>(array, size);
 }
 
 /** \internal Returns the smallest integer multiple of \a base and greater or equal to \a size
-  */ 
-template<typename Index> 
-inline static Index first_multiple(Index size, Index base)
-{
-  return ((size+base-1)/base)*base;
+ */
+template <typename Index>
+inline Index first_multiple(Index size, Index base) {
+  return ((size + base - 1) / base) * base;
 }
 
 // std::copy is much slower than memcpy, so let's introduce a smart_copy which
 // use memcpy on trivial types, i.e., on types that does not require an initialization ctor.
-template<typename T, bool UseMemcpy> struct smart_copy_helper;
+template <typename T, bool UseMemcpy>
+struct smart_copy_helper;
 
-template<typename T> void smart_copy(const T* start, const T* end, T* target)
-{
-  smart_copy_helper<T,!NumTraits<T>::RequireInitialization>::run(start, end, target);
+template <typename T>
+EIGEN_DEVICE_FUNC void smart_copy(const T* start, const T* end, T* target) {
+  smart_copy_helper<T, !NumTraits<T>::RequireInitialization>::run(start, end, target);
 }
 
-template<typename T> struct smart_copy_helper<T,true> {
-  static inline void run(const T* start, const T* end, T* target)
-  { memcpy(target, start, std::ptrdiff_t(end)-std::ptrdiff_t(start)); }
+template <typename T>
+struct smart_copy_helper<T, true> {
+  EIGEN_DEVICE_FUNC static inline void run(const T* start, const T* end, T* target) {
+    std::intptr_t size = std::intptr_t(end) - std::intptr_t(start);
+    if (size == 0) return;
+    eigen_internal_assert(start != 0 && end != 0 && target != 0);
+    EIGEN_USING_STD(memcpy)
+    memcpy(target, start, size);
+  }
 };
 
-template<typename T> struct smart_copy_helper<T,false> {
-  static inline void run(const T* start, const T* end, T* target)
-  { std::copy(start, end, target); }
+template <typename T>
+struct smart_copy_helper<T, false> {
+  EIGEN_DEVICE_FUNC static inline void run(const T* start, const T* end, T* target) { std::copy(start, end, target); }
 };
 
+// intelligent memmove. falls back to std::memmove for POD types, uses std::copy otherwise.
+template <typename T, bool UseMemmove>
+struct smart_memmove_helper;
+
+template <typename T>
+void smart_memmove(const T* start, const T* end, T* target) {
+  smart_memmove_helper<T, !NumTraits<T>::RequireInitialization>::run(start, end, target);
+}
+
+template <typename T>
+struct smart_memmove_helper<T, true> {
+  static inline void run(const T* start, const T* end, T* target) {
+    std::intptr_t size = std::intptr_t(end) - std::intptr_t(start);
+    if (size == 0) return;
+    eigen_internal_assert(start != 0 && end != 0 && target != 0);
+    std::memmove(target, start, size);
+  }
+};
+
+template <typename T>
+struct smart_memmove_helper<T, false> {
+  static inline void run(const T* start, const T* end, T* target) {
+    if (std::uintptr_t(target) < std::uintptr_t(start)) {
+      std::copy(start, end, target);
+    } else {
+      std::ptrdiff_t count = (std::ptrdiff_t(end) - std::ptrdiff_t(start)) / sizeof(T);
+      std::copy_backward(start, end, target + count);
+    }
+  }
+};
+
+template <typename T>
+EIGEN_DEVICE_FUNC T* smart_move(T* start, T* end, T* target) {
+  return std::move(start, end, target);
+}
 
 /*****************************************************************************
 *** Implementation of runtime stack allocation (falling back to malloc)    ***
@@ -522,456 +645,741 @@ template<typename T> struct smart_copy_helper<T,false> {
 
 // you can overwrite Eigen's default behavior regarding alloca by defining EIGEN_ALLOCA
 // to the appropriate stack allocation function
-#ifndef EIGEN_ALLOCA
-  #if (defined __linux__) || (defined __APPLE__) || (defined alloca)
-    #define EIGEN_ALLOCA alloca
-  #elif defined(_MSC_VER)
-    #define EIGEN_ALLOCA _alloca
-  #endif
+#if !defined EIGEN_ALLOCA && !defined EIGEN_GPU_COMPILE_PHASE
+#if EIGEN_OS_LINUX || EIGEN_OS_MAC || (defined alloca)
+#define EIGEN_ALLOCA alloca
+#elif EIGEN_COMP_MSVC
+#define EIGEN_ALLOCA _alloca
+#endif
+#endif
+
+// With clang -Oz -mthumb, alloca changes the stack pointer in a way that is
+// not allowed in Thumb2. -DEIGEN_STACK_ALLOCATION_LIMIT=0 doesn't work because
+// the compiler still emits bad code because stack allocation checks use "<=".
+// TODO: Eliminate after https://bugs.llvm.org/show_bug.cgi?id=23772
+// is fixed.
+#if defined(__clang__) && defined(__thumb__)
+#undef EIGEN_ALLOCA
 #endif
 
 // This helper class construct the allocated memory, and takes care of destructing and freeing the handled data
 // at destruction time. In practice this helper class is mainly useful to avoid memory leak in case of exceptions.
-template<typename T> class aligned_stack_memory_handler
-{
-  public:
-    /* Creates a stack_memory_handler responsible for the buffer \a ptr of size \a size.
-     * Note that \a ptr can be 0 regardless of the other parameters.
-     * This constructor takes care of constructing/initializing the elements of the buffer if required by the scalar type T (see NumTraits<T>::RequireInitialization).
-     * In this case, the buffer elements will also be destructed when this handler will be destructed.
-     * Finally, if \a dealloc is true, then the pointer \a ptr is freed.
-     **/
-    aligned_stack_memory_handler(T* ptr, size_t size, bool dealloc)
-      : m_ptr(ptr), m_size(size), m_deallocate(dealloc)
-    {
-      if(NumTraits<T>::RequireInitialization && m_ptr)
-        Eigen::internal::construct_elements_of_array(m_ptr, size);
-    }
-    ~aligned_stack_memory_handler()
-    {
-      if(NumTraits<T>::RequireInitialization && m_ptr)
-        Eigen::internal::destruct_elements_of_array<T>(m_ptr, m_size);
-      if(m_deallocate)
-        Eigen::internal::aligned_free(m_ptr);
-    }
-  protected:
-    T* m_ptr;
-    size_t m_size;
-    bool m_deallocate;
-};
+template <typename T>
+class aligned_stack_memory_handler : noncopyable {
+ public:
+  /* Creates a stack_memory_handler responsible for the buffer \a ptr of size \a size.
+   * Note that \a ptr can be 0 regardless of the other parameters.
+   * This constructor takes care of constructing/initializing the elements of the buffer if required by the scalar type
+   *T (see NumTraits<T>::RequireInitialization). In this case, the buffer elements will also be destructed when this
+   *handler will be destructed. Finally, if \a dealloc is true, then the pointer \a ptr is freed.
+   **/
+  EIGEN_DEVICE_FUNC aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc)
+      : m_ptr(ptr), m_size(size), m_deallocate(dealloc) {
+    if (NumTraits<T>::RequireInitialization && m_ptr) Eigen::internal::default_construct_elements_of_array(m_ptr, size);
+  }
+  EIGEN_DEVICE_FUNC ~aligned_stack_memory_handler() {
+    if (NumTraits<T>::RequireInitialization && m_ptr) Eigen::internal::destruct_elements_of_array<T>(m_ptr, m_size);
+    if (m_deallocate) Eigen::internal::aligned_free(m_ptr);
+  }
 
-} // end namespace internal
+ protected:
+  T* m_ptr;
+  std::size_t m_size;
+  bool m_deallocate;
+};
 
-/** \internal
-  * Declares, allocates and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack
-  * if SIZE is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the platform
-  * (currently, this is Linux and Visual Studio only). Otherwise the memory is allocated on the heap.
-  * The allocated buffer is automatically deleted when exiting the scope of this declaration.
-  * If BUFFER is non null, then the declared variable is simply an alias for BUFFER, and no allocation/deletion occurs.
-  * Here is an example:
-  * \code
-  * {
-  *   ei_declare_aligned_stack_constructed_variable(float,data,size,0);
-  *   // use data[0] to data[size-1]
-  * }
-  * \endcode
-  * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token.
-  */
 #ifdef EIGEN_ALLOCA
 
-  #if defined(__arm__) || defined(_WIN32)
-    #define EIGEN_ALIGNED_ALLOCA(SIZE) reinterpret_cast<void*>((reinterpret_cast<size_t>(EIGEN_ALLOCA(SIZE+16)) & ~(size_t(15))) + 16)
-  #else
-    #define EIGEN_ALIGNED_ALLOCA EIGEN_ALLOCA
-  #endif
+template <typename Xpr, int NbEvaluations,
+          bool MapExternalBuffer = nested_eval<Xpr, NbEvaluations>::Evaluate && Xpr::MaxSizeAtCompileTime == Dynamic>
+struct local_nested_eval_wrapper {
+  static constexpr bool NeedExternalBuffer = false;
+  typedef typename Xpr::Scalar Scalar;
+  typedef typename nested_eval<Xpr, NbEvaluations>::type ObjectType;
+  ObjectType object;
+
+  EIGEN_DEVICE_FUNC local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr) : object(xpr) {
+    EIGEN_UNUSED_VARIABLE(ptr);
+    eigen_internal_assert(ptr == 0);
+  }
+};
 
-  #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
-    Eigen::internal::check_size_for_overflow<TYPE>(SIZE); \
-    TYPE* NAME = (BUFFER)!=0 ? (BUFFER) \
-               : reinterpret_cast<TYPE*>( \
-                      (sizeof(TYPE)*SIZE<=EIGEN_STACK_ALLOCATION_LIMIT) ? EIGEN_ALIGNED_ALLOCA(sizeof(TYPE)*SIZE) \
-                    : Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE) );  \
-    Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,sizeof(TYPE)*SIZE>EIGEN_STACK_ALLOCATION_LIMIT)
+template <typename Xpr, int NbEvaluations>
+struct local_nested_eval_wrapper<Xpr, NbEvaluations, true> {
+  static constexpr bool NeedExternalBuffer = true;
+  typedef typename Xpr::Scalar Scalar;
+  typedef typename plain_object_eval<Xpr>::type PlainObject;
+  typedef Map<PlainObject, EIGEN_DEFAULT_ALIGN_BYTES> ObjectType;
+  ObjectType object;
+
+  EIGEN_DEVICE_FUNC local_nested_eval_wrapper(const Xpr& xpr, Scalar* ptr)
+      : object(ptr == 0 ? reinterpret_cast<Scalar*>(Eigen::internal::aligned_malloc(sizeof(Scalar) * xpr.size())) : ptr,
+               xpr.rows(), xpr.cols()),
+        m_deallocate(ptr == 0) {
+    if (NumTraits<Scalar>::RequireInitialization && object.data())
+      Eigen::internal::default_construct_elements_of_array(object.data(), object.size());
+    object = xpr;
+  }
 
-#else
+  EIGEN_DEVICE_FUNC ~local_nested_eval_wrapper() {
+    if (NumTraits<Scalar>::RequireInitialization && object.data())
+      Eigen::internal::destruct_elements_of_array(object.data(), object.size());
+    if (m_deallocate) Eigen::internal::aligned_free(object.data());
+  }
 
-  #define ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) \
-    Eigen::internal::check_size_for_overflow<TYPE>(SIZE); \
-    TYPE* NAME = (BUFFER)!=0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE)*SIZE));    \
-    Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME,_stack_memory_destructor)((BUFFER)==0 ? NAME : 0,SIZE,true)
-    
-#endif
+ private:
+  bool m_deallocate;
+};
 
+#endif  // EIGEN_ALLOCA
 
-/*****************************************************************************
-*** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF]                ***
-*****************************************************************************/
+template <typename T>
+class scoped_array : noncopyable {
+  T* m_ptr;
 
-#if EIGEN_ALIGN
-  #ifdef EIGEN_EXCEPTIONS
-    #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
-      void* operator new(size_t size, const std::nothrow_t&) throw() { \
-        try { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); } \
-        catch (...) { return 0; } \
-      }
-  #else
-    #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
-      void* operator new(size_t size, const std::nothrow_t&) throw() { \
-        return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
-      }
-  #endif
-
-  #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \
-      void *operator new(size_t size) { \
-        return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
-      } \
-      void *operator new[](size_t size) { \
-        return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); \
-      } \
-      void operator delete(void * ptr) throw() { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
-      void operator delete[](void * ptr) throw() { Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); } \
-      /* in-place new and delete. since (at least afaik) there is no actual   */ \
-      /* memory allocated we can safely let the default implementation handle */ \
-      /* this particular case. */ \
-      static void *operator new(size_t size, void *ptr) { return ::operator new(size,ptr); } \
-      static void *operator new[](size_t size, void* ptr) { return ::operator new[](size,ptr); } \
-      void operator delete(void * memory, void *ptr) throw() { return ::operator delete(memory,ptr); } \
-      void operator delete[](void * memory, void *ptr) throw() { return ::operator delete[](memory,ptr); } \
-      /* nothrow-new (returns zero instead of std::bad_alloc) */ \
-      EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \
-      void operator delete(void *ptr, const std::nothrow_t&) throw() { \
-        Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr); \
-      } \
-      typedef void eigen_aligned_operator_new_marker_type;
+ public:
+  explicit scoped_array(std::ptrdiff_t size) { m_ptr = new T[size]; }
+  ~scoped_array() { delete[] m_ptr; }
+  T& operator[](std::ptrdiff_t i) { return m_ptr[i]; }
+  const T& operator[](std::ptrdiff_t i) const { return m_ptr[i]; }
+  T*& ptr() { return m_ptr; }
+  const T* ptr() const { return m_ptr; }
+  operator const T*() const { return m_ptr; }
+};
+
+template <typename T>
+void swap(scoped_array<T>& a, scoped_array<T>& b) {
+  std::swap(a.ptr(), b.ptr());
+}
+
+}  // end namespace internal
+
+/** \internal
+ *
+ * The macro ei_declare_aligned_stack_constructed_variable(TYPE,NAME,SIZE,BUFFER) declares, allocates,
+ * and construct an aligned buffer named NAME of SIZE elements of type TYPE on the stack
+ * if the size in bytes is smaller than EIGEN_STACK_ALLOCATION_LIMIT, and if stack allocation is supported by the
+ * platform (currently, this is Linux, OSX and Visual Studio only). Otherwise the memory is allocated on the heap. The
+ * allocated buffer is automatically deleted when exiting the scope of this declaration. If BUFFER is non null, then the
+ * declared variable is simply an alias for BUFFER, and no allocation/deletion occurs. Here is an example: \code
+ * {
+ *   ei_declare_aligned_stack_constructed_variable(float,data,size,0);
+ *   // use data[0] to data[size-1]
+ * }
+ * \endcode
+ * The underlying stack allocation function can controlled with the EIGEN_ALLOCA preprocessor token.
+ *
+ * The macro ei_declare_local_nested_eval(XPR_T,XPR,N,NAME) is analogue to
+ * \code
+ *   typename internal::nested_eval<XPRT_T,N>::type NAME(XPR);
+ * \endcode
+ * with the advantage of using aligned stack allocation even if the maximal size of XPR at compile time is unknown.
+ * This is accomplished through alloca if this later is supported and if the required number of bytes
+ * is below EIGEN_STACK_ALLOCATION_LIMIT.
+ */
+#if defined(EIGEN_ALLOCA) && !defined(EIGEN_NO_ALLOCA)
+
+#if EIGEN_DEFAULT_ALIGN_BYTES > 0
+// We always manually re-align the result of EIGEN_ALLOCA.
+// If alloca is already aligned, the compiler should be smart enough to optimize away the re-alignment.
+
+#if ((EIGEN_COMP_GNUC || EIGEN_COMP_CLANG) && !EIGEN_COMP_NVHPC)
+#define EIGEN_ALIGNED_ALLOCA(SIZE) __builtin_alloca_with_align(SIZE, CHAR_BIT* EIGEN_DEFAULT_ALIGN_BYTES)
 #else
-  #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* eigen_aligned_alloca_helper(void* ptr) {
+  constexpr std::uintptr_t mask = EIGEN_DEFAULT_ALIGN_BYTES - 1;
+  std::uintptr_t ptr_int = std::uintptr_t(ptr);
+  std::uintptr_t aligned_ptr_int = (ptr_int + mask) & ~mask;
+  std::uintptr_t offset = aligned_ptr_int - ptr_int;
+  return static_cast<void*>(static_cast<uint8_t*>(ptr) + offset);
+}
+#define EIGEN_ALIGNED_ALLOCA(SIZE) eigen_aligned_alloca_helper(EIGEN_ALLOCA(SIZE + EIGEN_DEFAULT_ALIGN_BYTES - 1))
 #endif
 
-#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
-#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar,Size) \
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(((Size)!=Eigen::Dynamic) && ((sizeof(Scalar)*(Size))%16==0)))
+#else
+#define EIGEN_ALIGNED_ALLOCA(SIZE) EIGEN_ALLOCA(SIZE)
+#endif
 
-/****************************************************************************/
+#define ei_declare_aligned_stack_constructed_variable(TYPE, NAME, SIZE, BUFFER)                                       \
+  Eigen::internal::check_size_for_overflow<TYPE>(SIZE);                                                               \
+  TYPE* NAME = (BUFFER) != 0 ? (BUFFER)                                                                               \
+                             : reinterpret_cast<TYPE*>((sizeof(TYPE) * (SIZE) <= EIGEN_STACK_ALLOCATION_LIMIT)        \
+                                                           ? EIGEN_ALIGNED_ALLOCA(sizeof(TYPE) * (SIZE))              \
+                                                           : Eigen::internal::aligned_malloc(sizeof(TYPE) * (SIZE))); \
+  Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME, _stack_memory_destructor)(                      \
+      (BUFFER) == 0 ? NAME : 0, SIZE, sizeof(TYPE) * (SIZE) > EIGEN_STACK_ALLOCATION_LIMIT)
+
+#define ei_declare_local_nested_eval(XPR_T, XPR, N, NAME)                                        \
+  Eigen::internal::local_nested_eval_wrapper<XPR_T, N> EIGEN_CAT(NAME, _wrapper)(                \
+      XPR, reinterpret_cast<typename XPR_T::Scalar*>(                                            \
+               ((Eigen::internal::local_nested_eval_wrapper<XPR_T, N>::NeedExternalBuffer) &&    \
+                ((sizeof(typename XPR_T::Scalar) * XPR.size()) <= EIGEN_STACK_ALLOCATION_LIMIT)) \
+                   ? EIGEN_ALIGNED_ALLOCA(sizeof(typename XPR_T::Scalar) * XPR.size())           \
+                   : 0));                                                                        \
+  typename Eigen::internal::local_nested_eval_wrapper<XPR_T, N>::ObjectType NAME(EIGEN_CAT(NAME, _wrapper).object)
 
-/** \class aligned_allocator
-* \ingroup Core_Module
-*
-* \brief STL compatible allocator to use with with 16 byte aligned types
-*
-* Example:
-* \code
-* // Matrix4f requires 16 bytes alignment:
-* std::map< int, Matrix4f, std::less<int>, 
-*           aligned_allocator<std::pair<const int, Matrix4f> > > my_map_mat4;
-* // Vector3f does not require 16 bytes alignment, no need to use Eigen's allocator:
-* std::map< int, Vector3f > my_map_vec3;
-* \endcode
-*
-* \sa \ref TopicStlContainers.
-*/
-template<class T>
-class aligned_allocator
-{
-public:
-    typedef size_t    size_type;
-    typedef std::ptrdiff_t difference_type;
-    typedef T*        pointer;
-    typedef const T*  const_pointer;
-    typedef T&        reference;
-    typedef const T&  const_reference;
-    typedef T         value_type;
-
-    template<class U>
-    struct rebind
-    {
-        typedef aligned_allocator<U> other;
-    };
+#else
 
-    pointer address( reference value ) const
-    {
-        return &value;
-    }
+#define ei_declare_aligned_stack_constructed_variable(TYPE, NAME, SIZE, BUFFER)                                 \
+  Eigen::internal::check_size_for_overflow<TYPE>(SIZE);                                                         \
+  TYPE* NAME =                                                                                                  \
+      (BUFFER) != 0 ? BUFFER : reinterpret_cast<TYPE*>(Eigen::internal::aligned_malloc(sizeof(TYPE) * (SIZE))); \
+  Eigen::internal::aligned_stack_memory_handler<TYPE> EIGEN_CAT(NAME, _stack_memory_destructor)(                \
+      (BUFFER) == 0 ? NAME : 0, SIZE, true)
 
-    const_pointer address( const_reference value ) const
-    {
-        return &value;
-    }
+#define ei_declare_local_nested_eval(XPR_T, XPR, N, NAME) \
+  typename Eigen::internal::nested_eval<XPR_T, N>::type NAME(XPR)
 
-    aligned_allocator()
-    {
-    }
+#endif
 
-    aligned_allocator( const aligned_allocator& )
-    {
-    }
+/*****************************************************************************
+*** Implementation of EIGEN_MAKE_ALIGNED_OPERATOR_NEW [_IF]                ***
+*****************************************************************************/
 
-    template<class U>
-    aligned_allocator( const aligned_allocator<U>& )
-    {
-    }
+#if EIGEN_HAS_CXX17_OVERALIGN
 
-    ~aligned_allocator()
-    {
-    }
+// C++17 -> no need to bother about alignment anymore :)
 
-    size_type max_size() const
-    {
-        return (std::numeric_limits<size_type>::max)();
-    }
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign)
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar, Size)
 
-    pointer allocate( size_type num, const void* hint = 0 )
-    {
-        EIGEN_UNUSED_VARIABLE(hint);
-        internal::check_size_for_overflow<T>(num);
-        return static_cast<pointer>( internal::aligned_malloc( num * sizeof(T) ) );
-    }
+#else
 
-    void construct( pointer p, const T& value )
-    {
-        ::new( p ) T( value );
-    }
+// HIP does not support new/delete on device.
+#if EIGEN_MAX_ALIGN_BYTES != 0 && !defined(EIGEN_HIP_DEVICE_COMPILE)
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign)                              \
+  EIGEN_DEVICE_FUNC void* operator new(std::size_t size, const std::nothrow_t&) noexcept { \
+    EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size); }  \
+    EIGEN_CATCH(...) { return 0; }                                                         \
+  }
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)                                                              \
+  EIGEN_DEVICE_FUNC void* operator new(std::size_t size) {                                                            \
+    return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size);                                           \
+  }                                                                                                                   \
+  EIGEN_DEVICE_FUNC void* operator new[](std::size_t size) {                                                          \
+    return Eigen::internal::conditional_aligned_malloc<NeedsToAlign>(size);                                           \
+  }                                                                                                                   \
+  EIGEN_DEVICE_FUNC void operator delete(void* ptr) noexcept {                                                        \
+    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                     \
+  }                                                                                                                   \
+  EIGEN_DEVICE_FUNC void operator delete[](void* ptr) noexcept {                                                      \
+    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                     \
+  }                                                                                                                   \
+  EIGEN_DEVICE_FUNC void operator delete(void* ptr, std::size_t /* sz */) noexcept {                                  \
+    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                     \
+  }                                                                                                                   \
+  EIGEN_DEVICE_FUNC void operator delete[](void* ptr, std::size_t /* sz */) noexcept {                                \
+    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                     \
+  }                                                                                                                   \
+  /* in-place new and delete. since (at least afaik) there is no actual   */                                          \
+  /* memory allocated we can safely let the default implementation handle */                                          \
+  /* this particular case. */                                                                                         \
+  EIGEN_DEVICE_FUNC static void* operator new(std::size_t size, void* ptr) { return ::operator new(size, ptr); }      \
+  EIGEN_DEVICE_FUNC static void* operator new[](std::size_t size, void* ptr) { return ::operator new[](size, ptr); }  \
+  EIGEN_DEVICE_FUNC void operator delete(void* memory, void* ptr) noexcept { return ::operator delete(memory, ptr); } \
+  EIGEN_DEVICE_FUNC void operator delete[](void* memory, void* ptr) noexcept {                                        \
+    return ::operator delete[](memory, ptr);                                                                          \
+  }                                                                                                                   \
+  /* nothrow-new (returns zero instead of std::bad_alloc) */                                                          \
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign)                                                               \
+  EIGEN_DEVICE_FUNC void operator delete(void* ptr, const std::nothrow_t&) noexcept {                                 \
+    Eigen::internal::conditional_aligned_free<NeedsToAlign>(ptr);                                                     \
+  }                                                                                                                   \
+  typedef void eigen_aligned_operator_new_marker_type;
+#else
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign)
+#endif
 
-    void destroy( pointer p )
-    {
-        p->~T();
-    }
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(true)
+#define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar, Size)                                 \
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(                                                                            \
+      bool(((Size) != Eigen::Dynamic) &&                                                                         \
+           (((EIGEN_MAX_ALIGN_BYTES >= 16) && ((sizeof(Scalar) * (Size)) % (EIGEN_MAX_ALIGN_BYTES) == 0)) ||     \
+            ((EIGEN_MAX_ALIGN_BYTES >= 32) && ((sizeof(Scalar) * (Size)) % (EIGEN_MAX_ALIGN_BYTES / 2) == 0)) || \
+            ((EIGEN_MAX_ALIGN_BYTES >= 64) && ((sizeof(Scalar) * (Size)) % (EIGEN_MAX_ALIGN_BYTES / 4) == 0)))))
 
-    void deallocate( pointer p, size_type /*num*/ )
-    {
-        internal::aligned_free( p );
-    }
+#endif
+
+/****************************************************************************/
 
-    bool operator!=(const aligned_allocator<T>& ) const
-    { return false; }
+/** \class aligned_allocator
+ * \ingroup Core_Module
+ *
+ * \brief STL compatible allocator to use with types requiring a non-standard alignment.
+ *
+ * The memory is aligned as for dynamically aligned matrix/array types such as MatrixXd.
+ * By default, it will thus provide at least 16 bytes alignment and more in following cases:
+ *  - 32 bytes alignment if AVX is enabled.
+ *  - 64 bytes alignment if AVX512 is enabled.
+ *
+ * This can be controlled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented
+ * \link TopicPreprocessorDirectivesPerformance there \endlink.
+ *
+ * Example:
+ * \code
+ * // Matrix4f requires 16 bytes alignment:
+ * std::map< int, Matrix4f, std::less<int>,
+ *           aligned_allocator<std::pair<const int, Matrix4f> > > my_map_mat4;
+ * // Vector3f does not require 16 bytes alignment, no need to use Eigen's allocator:
+ * std::map< int, Vector3f > my_map_vec3;
+ * \endcode
+ *
+ * \sa \blank \ref TopicStlContainers.
+ */
+template <class T>
+class aligned_allocator {
+ public:
+  typedef std::size_t size_type;
+  typedef std::ptrdiff_t difference_type;
+  typedef T* pointer;
+  typedef const T* const_pointer;
+  typedef T& reference;
+  typedef const T& const_reference;
+  typedef T value_type;
+
+  template <class U>
+  struct rebind {
+    typedef aligned_allocator<U> other;
+  };
+
+  aligned_allocator() = default;
+
+  aligned_allocator(const aligned_allocator&) = default;
+
+  template <class U>
+  aligned_allocator(const aligned_allocator<U>&) {}
+
+  template <class U>
+  constexpr bool operator==(const aligned_allocator<U>&) const noexcept {
+    return true;
+  }
+  template <class U>
+  constexpr bool operator!=(const aligned_allocator<U>&) const noexcept {
+    return false;
+  }
 
-    bool operator==(const aligned_allocator<T>& ) const
-    { return true; }
+#if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_STRICT_AT_LEAST(7, 0, 0)
+  // In gcc std::allocator::max_size() is bugged making gcc triggers a warning:
+  // eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object
+  // size 9223372036854775807 See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544
+  size_type max_size() const { return (std::numeric_limits<std::ptrdiff_t>::max)() / sizeof(T); }
+#endif
+
+  pointer allocate(size_type num, const void* /*hint*/ = 0) {
+    internal::check_size_for_overflow<T>(num);
+    return static_cast<pointer>(internal::aligned_malloc(num * sizeof(T)));
+  }
+
+  void deallocate(pointer p, size_type /*num*/) { internal::aligned_free(p); }
 };
 
 //---------- Cache sizes ----------
 
 #if !defined(EIGEN_NO_CPUID)
-#  if defined(__GNUC__) && ( defined(__i386__) || defined(__x86_64__) )
-#    if defined(__PIC__) && defined(__i386__)
-       // Case for x86 with PIC
-#      define EIGEN_CPUID(abcd,func,id) \
-         __asm__ __volatile__ ("xchgl %%ebx, %k1;cpuid; xchgl %%ebx,%k1": "=a" (abcd[0]), "=&r" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "a" (func), "c" (id));
-#    elif defined(__PIC__) && defined(__x86_64__)
-       // Case for x64 with PIC. In theory this is only a problem with recent gcc and with medium or large code model, not with the default small code model.
-       // However, we cannot detect which code model is used, and the xchg overhead is negligible anyway.
-#      define EIGEN_CPUID(abcd,func,id) \
-        __asm__ __volatile__ ("xchg{q}\t{%%}rbx, %q1; cpuid; xchg{q}\t{%%}rbx, %q1": "=a" (abcd[0]), "=&r" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "0" (func), "2" (id));
-#    else
-       // Case for x86_64 or x86 w/o PIC
-#      define EIGEN_CPUID(abcd,func,id) \
-         __asm__ __volatile__ ("cpuid": "=a" (abcd[0]), "=b" (abcd[1]), "=c" (abcd[2]), "=d" (abcd[3]) : "0" (func), "2" (id) );
-#    endif
-#  elif defined(_MSC_VER)
-#    if (_MSC_VER > 1500) && ( defined(_M_IX86) || defined(_M_X64) )
-#      define EIGEN_CPUID(abcd,func,id) __cpuidex((int*)abcd,func,id)
-#    endif
-#  endif
+#if EIGEN_COMP_GNUC && EIGEN_ARCH_i386_OR_x86_64
+#if defined(__PIC__) && EIGEN_ARCH_i386
+// Case for x86 with PIC
+#define EIGEN_CPUID(abcd, func, id)                                                  \
+  __asm__ __volatile__("xchgl %%ebx, %k1;cpuid; xchgl %%ebx,%k1"                     \
+                       : "=a"(abcd[0]), "=&r"(abcd[1]), "=c"(abcd[2]), "=d"(abcd[3]) \
+                       : "a"(func), "c"(id));
+#elif defined(__PIC__) && EIGEN_ARCH_x86_64
+// Case for x64 with PIC. In theory this is only a problem with recent gcc and with medium or large code model, not with
+// the default small code model. However, we cannot detect which code model is used, and the xchg overhead is negligible
+// anyway.
+#define EIGEN_CPUID(abcd, func, id)                                                  \
+  __asm__ __volatile__("xchg{q}\t{%%}rbx, %q1; cpuid; xchg{q}\t{%%}rbx, %q1"         \
+                       : "=a"(abcd[0]), "=&r"(abcd[1]), "=c"(abcd[2]), "=d"(abcd[3]) \
+                       : "0"(func), "2"(id));
+#else
+// Case for x86_64 or x86 w/o PIC
+#define EIGEN_CPUID(abcd, func, id) \
+  __asm__ __volatile__("cpuid" : "=a"(abcd[0]), "=b"(abcd[1]), "=c"(abcd[2]), "=d"(abcd[3]) : "0"(func), "2"(id));
+#endif
+#elif EIGEN_COMP_MSVC
+#if EIGEN_ARCH_i386_OR_x86_64
+#define EIGEN_CPUID(abcd, func, id) __cpuidex((int*)abcd, func, id)
+#endif
+#endif
 #endif
 
 namespace internal {
 
 #ifdef EIGEN_CPUID
 
-inline bool cpuid_is_vendor(int abcd[4], const int vendor[3])
-{
-  return abcd[1]==vendor[0] && abcd[3]==vendor[1] && abcd[2]==vendor[2];
+inline bool cpuid_is_vendor(int abcd[4], const int vendor[3]) {
+  return abcd[1] == vendor[0] && abcd[3] == vendor[1] && abcd[2] == vendor[2];
 }
 
-inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3)
-{
+inline void queryCacheSizes_intel_direct(int& l1, int& l2, int& l3) {
   int abcd[4];
   l1 = l2 = l3 = 0;
   int cache_id = 0;
   int cache_type = 0;
   do {
     abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
-    EIGEN_CPUID(abcd,0x4,cache_id);
-    cache_type  = (abcd[0] & 0x0F) >> 0;
-    if(cache_type==1||cache_type==3) // data or unified cache
+    EIGEN_CPUID(abcd, 0x4, cache_id);
+    cache_type = (abcd[0] & 0x0F) >> 0;
+    if (cache_type == 1 || cache_type == 3)  // data or unified cache
     {
-      int cache_level = (abcd[0] & 0xE0) >> 5;  // A[7:5]
-      int ways        = (abcd[1] & 0xFFC00000) >> 22; // B[31:22]
-      int partitions  = (abcd[1] & 0x003FF000) >> 12; // B[21:12]
-      int line_size   = (abcd[1] & 0x00000FFF) >>  0; // B[11:0]
-      int sets        = (abcd[2]);                    // C[31:0]
-
-      int cache_size = (ways+1) * (partitions+1) * (line_size+1) * (sets+1);
-
-      switch(cache_level)
-      {
-        case 1: l1 = cache_size; break;
-        case 2: l2 = cache_size; break;
-        case 3: l3 = cache_size; break;
-        default: break;
+      int cache_level = (abcd[0] & 0xE0) >> 5;        // A[7:5]
+      int ways = (abcd[1] & 0xFFC00000) >> 22;        // B[31:22]
+      int partitions = (abcd[1] & 0x003FF000) >> 12;  // B[21:12]
+      int line_size = (abcd[1] & 0x00000FFF) >> 0;    // B[11:0]
+      int sets = (abcd[2]);                           // C[31:0]
+
+      int cache_size = (ways + 1) * (partitions + 1) * (line_size + 1) * (sets + 1);
+
+      switch (cache_level) {
+        case 1:
+          l1 = cache_size;
+          break;
+        case 2:
+          l2 = cache_size;
+          break;
+        case 3:
+          l3 = cache_size;
+          break;
+        default:
+          break;
       }
     }
     cache_id++;
-  } while(cache_type>0 && cache_id<16);
+  } while (cache_type > 0 && cache_id < 16);
 }
 
-inline void queryCacheSizes_intel_codes(int& l1, int& l2, int& l3)
-{
+inline void queryCacheSizes_intel_codes(int& l1, int& l2, int& l3) {
   int abcd[4];
   abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
   l1 = l2 = l3 = 0;
-  EIGEN_CPUID(abcd,0x00000002,0);
-  unsigned char * bytes = reinterpret_cast<unsigned char *>(abcd)+2;
+  EIGEN_CPUID(abcd, 0x00000002, 0);
+  unsigned char* bytes = reinterpret_cast<unsigned char*>(abcd) + 2;
   bool check_for_p2_core2 = false;
-  for(int i=0; i<14; ++i)
-  {
-    switch(bytes[i])
-    {
-      case 0x0A: l1 = 8; break;   // 0Ah   data L1 cache, 8 KB, 2 ways, 32 byte lines
-      case 0x0C: l1 = 16; break;  // 0Ch   data L1 cache, 16 KB, 4 ways, 32 byte lines
-      case 0x0E: l1 = 24; break;  // 0Eh   data L1 cache, 24 KB, 6 ways, 64 byte lines
-      case 0x10: l1 = 16; break;  // 10h   data L1 cache, 16 KB, 4 ways, 32 byte lines (IA-64)
-      case 0x15: l1 = 16; break;  // 15h   code L1 cache, 16 KB, 4 ways, 32 byte lines (IA-64)
-      case 0x2C: l1 = 32; break;  // 2Ch   data L1 cache, 32 KB, 8 ways, 64 byte lines
-      case 0x30: l1 = 32; break;  // 30h   code L1 cache, 32 KB, 8 ways, 64 byte lines
-      case 0x60: l1 = 16; break;  // 60h   data L1 cache, 16 KB, 8 ways, 64 byte lines, sectored
-      case 0x66: l1 = 8; break;   // 66h   data L1 cache, 8 KB, 4 ways, 64 byte lines, sectored
-      case 0x67: l1 = 16; break;  // 67h   data L1 cache, 16 KB, 4 ways, 64 byte lines, sectored
-      case 0x68: l1 = 32; break;  // 68h   data L1 cache, 32 KB, 4 ways, 64 byte lines, sectored
-      case 0x1A: l2 = 96; break;   // code and data L2 cache, 96 KB, 6 ways, 64 byte lines (IA-64)
-      case 0x22: l3 = 512; break;   // code and data L3 cache, 512 KB, 4 ways (!), 64 byte lines, dual-sectored
-      case 0x23: l3 = 1024; break;   // code and data L3 cache, 1024 KB, 8 ways, 64 byte lines, dual-sectored
-      case 0x25: l3 = 2048; break;   // code and data L3 cache, 2048 KB, 8 ways, 64 byte lines, dual-sectored
-      case 0x29: l3 = 4096; break;   // code and data L3 cache, 4096 KB, 8 ways, 64 byte lines, dual-sectored
-      case 0x39: l2 = 128; break;   // code and data L2 cache, 128 KB, 4 ways, 64 byte lines, sectored
-      case 0x3A: l2 = 192; break;   // code and data L2 cache, 192 KB, 6 ways, 64 byte lines, sectored
-      case 0x3B: l2 = 128; break;   // code and data L2 cache, 128 KB, 2 ways, 64 byte lines, sectored
-      case 0x3C: l2 = 256; break;   // code and data L2 cache, 256 KB, 4 ways, 64 byte lines, sectored
-      case 0x3D: l2 = 384; break;   // code and data L2 cache, 384 KB, 6 ways, 64 byte lines, sectored
-      case 0x3E: l2 = 512; break;   // code and data L2 cache, 512 KB, 4 ways, 64 byte lines, sectored
-      case 0x40: l2 = 0; break;   // no integrated L2 cache (P6 core) or L3 cache (P4 core)
-      case 0x41: l2 = 128; break;   // code and data L2 cache, 128 KB, 4 ways, 32 byte lines
-      case 0x42: l2 = 256; break;   // code and data L2 cache, 256 KB, 4 ways, 32 byte lines
-      case 0x43: l2 = 512; break;   // code and data L2 cache, 512 KB, 4 ways, 32 byte lines
-      case 0x44: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 4 ways, 32 byte lines
-      case 0x45: l2 = 2048; break;   // code and data L2 cache, 2048 KB, 4 ways, 32 byte lines
-      case 0x46: l3 = 4096; break;   // code and data L3 cache, 4096 KB, 4 ways, 64 byte lines
-      case 0x47: l3 = 8192; break;   // code and data L3 cache, 8192 KB, 8 ways, 64 byte lines
-      case 0x48: l2 = 3072; break;   // code and data L2 cache, 3072 KB, 12 ways, 64 byte lines
-      case 0x49: if(l2!=0) l3 = 4096; else {check_for_p2_core2=true; l3 = l2 = 4096;} break;// code and data L3 cache, 4096 KB, 16 ways, 64 byte lines (P4) or L2 for core2
-      case 0x4A: l3 = 6144; break;   // code and data L3 cache, 6144 KB, 12 ways, 64 byte lines
-      case 0x4B: l3 = 8192; break;   // code and data L3 cache, 8192 KB, 16 ways, 64 byte lines
-      case 0x4C: l3 = 12288; break;   // code and data L3 cache, 12288 KB, 12 ways, 64 byte lines
-      case 0x4D: l3 = 16384; break;   // code and data L3 cache, 16384 KB, 16 ways, 64 byte lines
-      case 0x4E: l2 = 6144; break;   // code and data L2 cache, 6144 KB, 24 ways, 64 byte lines
-      case 0x78: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 4 ways, 64 byte lines
-      case 0x79: l2 = 128; break;   // code and data L2 cache, 128 KB, 8 ways, 64 byte lines, dual-sectored
-      case 0x7A: l2 = 256; break;   // code and data L2 cache, 256 KB, 8 ways, 64 byte lines, dual-sectored
-      case 0x7B: l2 = 512; break;   // code and data L2 cache, 512 KB, 8 ways, 64 byte lines, dual-sectored
-      case 0x7C: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 8 ways, 64 byte lines, dual-sectored
-      case 0x7D: l2 = 2048; break;   // code and data L2 cache, 2048 KB, 8 ways, 64 byte lines
-      case 0x7E: l2 = 256; break;   // code and data L2 cache, 256 KB, 8 ways, 128 byte lines, sect. (IA-64)
-      case 0x7F: l2 = 512; break;   // code and data L2 cache, 512 KB, 2 ways, 64 byte lines
-      case 0x80: l2 = 512; break;   // code and data L2 cache, 512 KB, 8 ways, 64 byte lines
-      case 0x81: l2 = 128; break;   // code and data L2 cache, 128 KB, 8 ways, 32 byte lines
-      case 0x82: l2 = 256; break;   // code and data L2 cache, 256 KB, 8 ways, 32 byte lines
-      case 0x83: l2 = 512; break;   // code and data L2 cache, 512 KB, 8 ways, 32 byte lines
-      case 0x84: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 8 ways, 32 byte lines
-      case 0x85: l2 = 2048; break;   // code and data L2 cache, 2048 KB, 8 ways, 32 byte lines
-      case 0x86: l2 = 512; break;   // code and data L2 cache, 512 KB, 4 ways, 64 byte lines
-      case 0x87: l2 = 1024; break;   // code and data L2 cache, 1024 KB, 8 ways, 64 byte lines
-      case 0x88: l3 = 2048; break;   // code and data L3 cache, 2048 KB, 4 ways, 64 byte lines (IA-64)
-      case 0x89: l3 = 4096; break;   // code and data L3 cache, 4096 KB, 4 ways, 64 byte lines (IA-64)
-      case 0x8A: l3 = 8192; break;   // code and data L3 cache, 8192 KB, 4 ways, 64 byte lines (IA-64)
-      case 0x8D: l3 = 3072; break;   // code and data L3 cache, 3072 KB, 12 ways, 128 byte lines (IA-64)
-
-      default: break;
+  for (int i = 0; i < 14; ++i) {
+    switch (bytes[i]) {
+      case 0x0A:
+        l1 = 8;
+        break;  // 0Ah   data L1 cache, 8 KB, 2 ways, 32 byte lines
+      case 0x0C:
+        l1 = 16;
+        break;  // 0Ch   data L1 cache, 16 KB, 4 ways, 32 byte lines
+      case 0x0E:
+        l1 = 24;
+        break;  // 0Eh   data L1 cache, 24 KB, 6 ways, 64 byte lines
+      case 0x10:
+        l1 = 16;
+        break;  // 10h   data L1 cache, 16 KB, 4 ways, 32 byte lines (IA-64)
+      case 0x15:
+        l1 = 16;
+        break;  // 15h   code L1 cache, 16 KB, 4 ways, 32 byte lines (IA-64)
+      case 0x2C:
+        l1 = 32;
+        break;  // 2Ch   data L1 cache, 32 KB, 8 ways, 64 byte lines
+      case 0x30:
+        l1 = 32;
+        break;  // 30h   code L1 cache, 32 KB, 8 ways, 64 byte lines
+      case 0x60:
+        l1 = 16;
+        break;  // 60h   data L1 cache, 16 KB, 8 ways, 64 byte lines, sectored
+      case 0x66:
+        l1 = 8;
+        break;  // 66h   data L1 cache, 8 KB, 4 ways, 64 byte lines, sectored
+      case 0x67:
+        l1 = 16;
+        break;  // 67h   data L1 cache, 16 KB, 4 ways, 64 byte lines, sectored
+      case 0x68:
+        l1 = 32;
+        break;  // 68h   data L1 cache, 32 KB, 4 ways, 64 byte lines, sectored
+      case 0x1A:
+        l2 = 96;
+        break;  // code and data L2 cache, 96 KB, 6 ways, 64 byte lines (IA-64)
+      case 0x22:
+        l3 = 512;
+        break;  // code and data L3 cache, 512 KB, 4 ways (!), 64 byte lines, dual-sectored
+      case 0x23:
+        l3 = 1024;
+        break;  // code and data L3 cache, 1024 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x25:
+        l3 = 2048;
+        break;  // code and data L3 cache, 2048 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x29:
+        l3 = 4096;
+        break;  // code and data L3 cache, 4096 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x39:
+        l2 = 128;
+        break;  // code and data L2 cache, 128 KB, 4 ways, 64 byte lines, sectored
+      case 0x3A:
+        l2 = 192;
+        break;  // code and data L2 cache, 192 KB, 6 ways, 64 byte lines, sectored
+      case 0x3B:
+        l2 = 128;
+        break;  // code and data L2 cache, 128 KB, 2 ways, 64 byte lines, sectored
+      case 0x3C:
+        l2 = 256;
+        break;  // code and data L2 cache, 256 KB, 4 ways, 64 byte lines, sectored
+      case 0x3D:
+        l2 = 384;
+        break;  // code and data L2 cache, 384 KB, 6 ways, 64 byte lines, sectored
+      case 0x3E:
+        l2 = 512;
+        break;  // code and data L2 cache, 512 KB, 4 ways, 64 byte lines, sectored
+      case 0x40:
+        l2 = 0;
+        break;  // no integrated L2 cache (P6 core) or L3 cache (P4 core)
+      case 0x41:
+        l2 = 128;
+        break;  // code and data L2 cache, 128 KB, 4 ways, 32 byte lines
+      case 0x42:
+        l2 = 256;
+        break;  // code and data L2 cache, 256 KB, 4 ways, 32 byte lines
+      case 0x43:
+        l2 = 512;
+        break;  // code and data L2 cache, 512 KB, 4 ways, 32 byte lines
+      case 0x44:
+        l2 = 1024;
+        break;  // code and data L2 cache, 1024 KB, 4 ways, 32 byte lines
+      case 0x45:
+        l2 = 2048;
+        break;  // code and data L2 cache, 2048 KB, 4 ways, 32 byte lines
+      case 0x46:
+        l3 = 4096;
+        break;  // code and data L3 cache, 4096 KB, 4 ways, 64 byte lines
+      case 0x47:
+        l3 = 8192;
+        break;  // code and data L3 cache, 8192 KB, 8 ways, 64 byte lines
+      case 0x48:
+        l2 = 3072;
+        break;  // code and data L2 cache, 3072 KB, 12 ways, 64 byte lines
+      case 0x49:
+        if (l2 != 0)
+          l3 = 4096;
+        else {
+          check_for_p2_core2 = true;
+          l3 = l2 = 4096;
+        }
+        break;  // code and data L3 cache, 4096 KB, 16 ways, 64 byte lines (P4) or L2 for core2
+      case 0x4A:
+        l3 = 6144;
+        break;  // code and data L3 cache, 6144 KB, 12 ways, 64 byte lines
+      case 0x4B:
+        l3 = 8192;
+        break;  // code and data L3 cache, 8192 KB, 16 ways, 64 byte lines
+      case 0x4C:
+        l3 = 12288;
+        break;  // code and data L3 cache, 12288 KB, 12 ways, 64 byte lines
+      case 0x4D:
+        l3 = 16384;
+        break;  // code and data L3 cache, 16384 KB, 16 ways, 64 byte lines
+      case 0x4E:
+        l2 = 6144;
+        break;  // code and data L2 cache, 6144 KB, 24 ways, 64 byte lines
+      case 0x78:
+        l2 = 1024;
+        break;  // code and data L2 cache, 1024 KB, 4 ways, 64 byte lines
+      case 0x79:
+        l2 = 128;
+        break;  // code and data L2 cache, 128 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x7A:
+        l2 = 256;
+        break;  // code and data L2 cache, 256 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x7B:
+        l2 = 512;
+        break;  // code and data L2 cache, 512 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x7C:
+        l2 = 1024;
+        break;  // code and data L2 cache, 1024 KB, 8 ways, 64 byte lines, dual-sectored
+      case 0x7D:
+        l2 = 2048;
+        break;  // code and data L2 cache, 2048 KB, 8 ways, 64 byte lines
+      case 0x7E:
+        l2 = 256;
+        break;  // code and data L2 cache, 256 KB, 8 ways, 128 byte lines, sect. (IA-64)
+      case 0x7F:
+        l2 = 512;
+        break;  // code and data L2 cache, 512 KB, 2 ways, 64 byte lines
+      case 0x80:
+        l2 = 512;
+        break;  // code and data L2 cache, 512 KB, 8 ways, 64 byte lines
+      case 0x81:
+        l2 = 128;
+        break;  // code and data L2 cache, 128 KB, 8 ways, 32 byte lines
+      case 0x82:
+        l2 = 256;
+        break;  // code and data L2 cache, 256 KB, 8 ways, 32 byte lines
+      case 0x83:
+        l2 = 512;
+        break;  // code and data L2 cache, 512 KB, 8 ways, 32 byte lines
+      case 0x84:
+        l2 = 1024;
+        break;  // code and data L2 cache, 1024 KB, 8 ways, 32 byte lines
+      case 0x85:
+        l2 = 2048;
+        break;  // code and data L2 cache, 2048 KB, 8 ways, 32 byte lines
+      case 0x86:
+        l2 = 512;
+        break;  // code and data L2 cache, 512 KB, 4 ways, 64 byte lines
+      case 0x87:
+        l2 = 1024;
+        break;  // code and data L2 cache, 1024 KB, 8 ways, 64 byte lines
+      case 0x88:
+        l3 = 2048;
+        break;  // code and data L3 cache, 2048 KB, 4 ways, 64 byte lines (IA-64)
+      case 0x89:
+        l3 = 4096;
+        break;  // code and data L3 cache, 4096 KB, 4 ways, 64 byte lines (IA-64)
+      case 0x8A:
+        l3 = 8192;
+        break;  // code and data L3 cache, 8192 KB, 4 ways, 64 byte lines (IA-64)
+      case 0x8D:
+        l3 = 3072;
+        break;  // code and data L3 cache, 3072 KB, 12 ways, 128 byte lines (IA-64)
+
+      default:
+        break;
     }
   }
-  if(check_for_p2_core2 && l2 == l3)
-    l3 = 0;
+  if (check_for_p2_core2 && l2 == l3) l3 = 0;
   l1 *= 1024;
   l2 *= 1024;
   l3 *= 1024;
 }
 
-inline void queryCacheSizes_intel(int& l1, int& l2, int& l3, int max_std_funcs)
-{
-  if(max_std_funcs>=4)
-    queryCacheSizes_intel_direct(l1,l2,l3);
+inline void queryCacheSizes_intel(int& l1, int& l2, int& l3, int max_std_funcs) {
+  if (max_std_funcs >= 4)
+    queryCacheSizes_intel_direct(l1, l2, l3);
+  else if (max_std_funcs >= 2)
+    queryCacheSizes_intel_codes(l1, l2, l3);
   else
-    queryCacheSizes_intel_codes(l1,l2,l3);
+    l1 = l2 = l3 = 0;
 }
 
-inline void queryCacheSizes_amd(int& l1, int& l2, int& l3)
-{
+inline void queryCacheSizes_amd(int& l1, int& l2, int& l3) {
   int abcd[4];
   abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
-  EIGEN_CPUID(abcd,0x80000005,0);
-  l1 = (abcd[2] >> 24) * 1024; // C[31:24] = L1 size in KB
-  abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
-  EIGEN_CPUID(abcd,0x80000006,0);
-  l2 = (abcd[2] >> 16) * 1024; // C[31;16] = l2 cache size in KB
-  l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024; // D[31;18] = l3 cache size in 512KB
+
+  // First query the max supported function.
+  EIGEN_CPUID(abcd, 0x80000000, 0);
+  if (static_cast<numext::uint32_t>(abcd[0]) >= static_cast<numext::uint32_t>(0x80000006)) {
+    EIGEN_CPUID(abcd, 0x80000005, 0);
+    l1 = (abcd[2] >> 24) * 1024;  // C[31:24] = L1 size in KB
+    abcd[0] = abcd[1] = abcd[2] = abcd[3] = 0;
+    EIGEN_CPUID(abcd, 0x80000006, 0);
+    l2 = (abcd[2] >> 16) * 1024;                      // C[31;16] = l2 cache size in KB
+    l3 = ((abcd[3] & 0xFFFC000) >> 18) * 512 * 1024;  // D[31;18] = l3 cache size in 512KB
+  } else {
+    l1 = l2 = l3 = 0;
+  }
 }
 #endif
 
 /** \internal
  * Queries and returns the cache sizes in Bytes of the L1, L2, and L3 data caches respectively */
-inline void queryCacheSizes(int& l1, int& l2, int& l3)
-{
-  #ifdef EIGEN_CPUID
+inline void queryCacheSizes(int& l1, int& l2, int& l3) {
+#ifdef EIGEN_CPUID
   int abcd[4];
   const int GenuineIntel[] = {0x756e6547, 0x49656e69, 0x6c65746e};
   const int AuthenticAMD[] = {0x68747541, 0x69746e65, 0x444d4163};
-  const int AMDisbetter_[] = {0x69444d41, 0x74656273, 0x21726574}; // "AMDisbetter!"
+  const int AMDisbetter_[] = {0x69444d41, 0x74656273, 0x21726574};  // "AMDisbetter!"
 
   // identify the CPU vendor
-  EIGEN_CPUID(abcd,0x0,0);
-  int max_std_funcs = abcd[1];
-  if(cpuid_is_vendor(abcd,GenuineIntel))
-    queryCacheSizes_intel(l1,l2,l3,max_std_funcs);
-  else if(cpuid_is_vendor(abcd,AuthenticAMD) || cpuid_is_vendor(abcd,AMDisbetter_))
-    queryCacheSizes_amd(l1,l2,l3);
+  EIGEN_CPUID(abcd, 0x0, 0);
+  int max_std_funcs = abcd[0];
+  if (cpuid_is_vendor(abcd, GenuineIntel))
+    queryCacheSizes_intel(l1, l2, l3, max_std_funcs);
+  else if (cpuid_is_vendor(abcd, AuthenticAMD) || cpuid_is_vendor(abcd, AMDisbetter_))
+    queryCacheSizes_amd(l1, l2, l3);
   else
     // by default let's use Intel's API
-    queryCacheSizes_intel(l1,l2,l3,max_std_funcs);
-
-  // here is the list of other vendors:
-//   ||cpuid_is_vendor(abcd,"VIA VIA VIA ")
-//   ||cpuid_is_vendor(abcd,"CyrixInstead")
-//   ||cpuid_is_vendor(abcd,"CentaurHauls")
-//   ||cpuid_is_vendor(abcd,"GenuineTMx86")
-//   ||cpuid_is_vendor(abcd,"TransmetaCPU")
-//   ||cpuid_is_vendor(abcd,"RiseRiseRise")
-//   ||cpuid_is_vendor(abcd,"Geode by NSC")
-//   ||cpuid_is_vendor(abcd,"SiS SiS SiS ")
-//   ||cpuid_is_vendor(abcd,"UMC UMC UMC ")
-//   ||cpuid_is_vendor(abcd,"NexGenDriven")
-  #else
+    queryCacheSizes_intel(l1, l2, l3, max_std_funcs);
+
+    // here is the list of other vendors:
+    //   ||cpuid_is_vendor(abcd,"VIA VIA VIA ")
+    //   ||cpuid_is_vendor(abcd,"CyrixInstead")
+    //   ||cpuid_is_vendor(abcd,"CentaurHauls")
+    //   ||cpuid_is_vendor(abcd,"GenuineTMx86")
+    //   ||cpuid_is_vendor(abcd,"TransmetaCPU")
+    //   ||cpuid_is_vendor(abcd,"RiseRiseRise")
+    //   ||cpuid_is_vendor(abcd,"Geode by NSC")
+    //   ||cpuid_is_vendor(abcd,"SiS SiS SiS ")
+    //   ||cpuid_is_vendor(abcd,"UMC UMC UMC ")
+    //   ||cpuid_is_vendor(abcd,"NexGenDriven")
+#else
   l1 = l2 = l3 = -1;
-  #endif
+#endif
 }
 
 /** \internal
  * \returns the size in Bytes of the L1 data cache */
-inline int queryL1CacheSize()
-{
+inline int queryL1CacheSize() {
   int l1(-1), l2, l3;
-  queryCacheSizes(l1,l2,l3);
+  queryCacheSizes(l1, l2, l3);
   return l1;
 }
 
 /** \internal
  * \returns the size in Bytes of the L2 or L3 cache if this later is present */
-inline int queryTopLevelCacheSize()
-{
+inline int queryTopLevelCacheSize() {
   int l1, l2(-1), l3(-1);
-  queryCacheSizes(l1,l2,l3);
-  return (std::max)(l2,l3);
+  queryCacheSizes(l1, l2, l3);
+  return (std::max)(l2, l3);
+}
+
+/** \internal
+ * This wraps C++20's std::construct_at, using placement new instead if it is not available.
+ */
+
+#if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_constexpr_dynamic_alloc) && \
+    __cpp_lib_constexpr_dynamic_alloc >= 201907L
+using std::construct_at;
+#else
+template <class T, class... Args>
+EIGEN_DEVICE_FUNC T* construct_at(T* p, Args&&... args) {
+  return ::new (const_cast<void*>(static_cast<const volatile void*>(p))) T(std::forward<Args>(args)...);
+}
+#endif
+
+/** \internal
+ * This wraps C++17's std::destroy_at.  If it's not available it calls the destructor.
+ * The wrapper is not a full replacement for C++20's std::destroy_at as it cannot
+ * be applied to std::array.
+ */
+#if EIGEN_COMP_CXXVER >= 17
+using std::destroy_at;
+#else
+template <class T>
+EIGEN_DEVICE_FUNC void destroy_at(T* p) {
+  p->~T();
 }
+#endif
+
+// FIXME(rmlarsen): Work around missing linker symbol with msan on ARM.
+#if !defined(EIGEN_DONT_ASSUME_ALIGNED) && __has_feature(memory_sanitizer) && \
+    (EIGEN_ARCH_ARM || EIGEN_ARCH_ARM64)
+#define EIGEN_DONT_ASSUME_ALIGNED
+#endif
+
+
+#if !defined(EIGEN_DONT_ASSUME_ALIGNED) && defined(__cpp_lib_assume_aligned) && (__cpp_lib_assume_aligned >= 201811L)
+template <std::size_t N, typename T>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr T* assume_aligned(T* ptr) {
+  return std::assume_aligned<N, T>(ptr);
+}
+#elif !defined(EIGEN_DONT_ASSUME_ALIGNED) && EIGEN_HAS_BUILTIN(__builtin_assume_aligned)
+template <std::size_t N, typename T>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC T* assume_aligned(T* ptr) {
+  return static_cast<T*>(__builtin_assume_aligned(ptr, N));
+}
+#else
+template <std::size_t N, typename T>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr T* assume_aligned(T* ptr) {
+  return ptr;
+}
+#endif
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_MEMORY_H
+#endif  // EIGEN_MEMORY_H
diff --git a/inst/include/Eigen/src/Core/util/Meta.h b/inst/include/Eigen/src/Core/util/Meta.h
index 71d58710..ddbc898e 100644
--- a/inst/include/Eigen/src/Core/util/Meta.h
+++ b/inst/include/Eigen/src/Core/util/Meta.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,233 +11,750 @@
 #ifndef EIGEN_META_H
 #define EIGEN_META_H
 
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+
+#include <cfloat>
+
+#if defined(EIGEN_CUDA_ARCH)
+#include <math_constants.h>
+#endif
+
+#if defined(EIGEN_HIP_DEVICE_COMPILE)
+#include "Eigen/src/Core/arch/HIP/hcc/math_constants.h"
+#endif
+
+#endif
+
+// Define portable (u)int{32,64} types
+#include <cstdint>
+
+namespace Eigen {
+namespace numext {
+typedef std::uint8_t uint8_t;
+typedef std::int8_t int8_t;
+typedef std::uint16_t uint16_t;
+typedef std::int16_t int16_t;
+typedef std::uint32_t uint32_t;
+typedef std::int32_t int32_t;
+typedef std::uint64_t uint64_t;
+typedef std::int64_t int64_t;
+
+template <size_t Size>
+struct get_integer_by_size {
+  typedef void signed_type;
+  typedef void unsigned_type;
+};
+template <>
+struct get_integer_by_size<1> {
+  typedef int8_t signed_type;
+  typedef uint8_t unsigned_type;
+};
+template <>
+struct get_integer_by_size<2> {
+  typedef int16_t signed_type;
+  typedef uint16_t unsigned_type;
+};
+template <>
+struct get_integer_by_size<4> {
+  typedef int32_t signed_type;
+  typedef uint32_t unsigned_type;
+};
+template <>
+struct get_integer_by_size<8> {
+  typedef int64_t signed_type;
+  typedef uint64_t unsigned_type;
+};
+}  // namespace numext
+}  // namespace Eigen
+
 namespace Eigen {
 
+typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex;
+
+/**
+ * \brief The Index type as used for the API.
+ * \details To change this, \c \#define the preprocessor symbol \c EIGEN_DEFAULT_DENSE_INDEX_TYPE.
+ * \sa \blank \ref TopicPreprocessorDirectives, StorageIndex.
+ */
+typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE Index;
+
 namespace internal {
 
 /** \internal
-  * \file Meta.h
-  * This file contains generic metaprogramming classes which are not specifically related to Eigen.
-  * \note In case you wonder, yes we're aware that Boost already provides all these features,
-  * we however don't want to add a dependency to Boost.
-  */
-
-struct true_type {  enum { value = 1 }; };
-struct false_type { enum { value = 0 }; };
-
-template<bool Condition, typename Then, typename Else>
-struct conditional { typedef Then type; };
-
-template<typename Then, typename Else>
-struct conditional <false, Then, Else> { typedef Else type; };
-
-template<typename T, typename U> struct is_same { enum { value = 0 }; };
-template<typename T> struct is_same<T,T> { enum { value = 1 }; };
-
-template<typename T> struct remove_reference { typedef T type; };
-template<typename T> struct remove_reference<T&> { typedef T type; };
-
-template<typename T> struct remove_pointer { typedef T type; };
-template<typename T> struct remove_pointer<T*> { typedef T type; };
-template<typename T> struct remove_pointer<T*const> { typedef T type; };
-
-template <class T> struct remove_const { typedef T type; };
-template <class T> struct remove_const<const T> { typedef T type; };
-template <class T> struct remove_const<const T[]> { typedef T type[]; };
-template <class T, unsigned int Size> struct remove_const<const T[Size]> { typedef T type[Size]; };
-
-template<typename T> struct remove_all { typedef T type; };
-template<typename T> struct remove_all<const T>   { typedef typename remove_all<T>::type type; };
-template<typename T> struct remove_all<T const&>  { typedef typename remove_all<T>::type type; };
-template<typename T> struct remove_all<T&>        { typedef typename remove_all<T>::type type; };
-template<typename T> struct remove_all<T const*>  { typedef typename remove_all<T>::type type; };
-template<typename T> struct remove_all<T*>        { typedef typename remove_all<T>::type type; };
-
-template<typename T> struct is_arithmetic      { enum { value = false }; };
-template<> struct is_arithmetic<float>         { enum { value = true }; };
-template<> struct is_arithmetic<double>        { enum { value = true }; };
-template<> struct is_arithmetic<long double>   { enum { value = true }; };
-template<> struct is_arithmetic<bool>          { enum { value = true }; };
-template<> struct is_arithmetic<char>          { enum { value = true }; };
-template<> struct is_arithmetic<signed char>   { enum { value = true }; };
-template<> struct is_arithmetic<unsigned char> { enum { value = true }; };
-template<> struct is_arithmetic<signed short>  { enum { value = true }; };
-template<> struct is_arithmetic<unsigned short>{ enum { value = true }; };
-template<> struct is_arithmetic<signed int>    { enum { value = true }; };
-template<> struct is_arithmetic<unsigned int>  { enum { value = true }; };
-template<> struct is_arithmetic<signed long>   { enum { value = true }; };
-template<> struct is_arithmetic<unsigned long> { enum { value = true }; };
-
-template <typename T> struct add_const { typedef const T type; };
-template <typename T> struct add_const<T&> { typedef T& type; };
-
-template <typename T> struct is_const { enum { value = 0 }; };
-template <typename T> struct is_const<T const> { enum { value = 1 }; };
-
-template<typename T> struct add_const_on_value_type            { typedef const T type;  };
-template<typename T> struct add_const_on_value_type<T&>        { typedef T const& type; };
-template<typename T> struct add_const_on_value_type<T*>        { typedef T const* type; };
-template<typename T> struct add_const_on_value_type<T* const>  { typedef T const* const type; };
-template<typename T> struct add_const_on_value_type<T const* const>  { typedef T const* const type; };
-
-/** \internal Allows to enable/disable an overload
-  * according to a compile time condition.
-  */
-template<bool Condition, typename T> struct enable_if;
-
-template<typename T> struct enable_if<true,T>
-{ typedef T type; };
+ * \file Meta.h
+ * This file contains generic metaprogramming classes which are not specifically related to Eigen.
+ * \note In case you wonder, yes we're aware that Boost already provides all these features,
+ * we however don't want to add a dependency to Boost.
+ */
+
+using std::false_type;
+using std::true_type;
+
+template <bool Condition>
+struct bool_constant;
+
+template <>
+struct bool_constant<true> : true_type {};
+
+template <>
+struct bool_constant<false> : false_type {};
+
+// Third-party libraries rely on these.
+using std::conditional;
+using std::remove_const;
+using std::remove_pointer;
+using std::remove_reference;
 
+template <typename T>
+struct remove_all {
+  typedef T type;
+};
+template <typename T>
+struct remove_all<const T> {
+  typedef typename remove_all<T>::type type;
+};
+template <typename T>
+struct remove_all<T const&> {
+  typedef typename remove_all<T>::type type;
+};
+template <typename T>
+struct remove_all<T&> {
+  typedef typename remove_all<T>::type type;
+};
+template <typename T>
+struct remove_all<T const*> {
+  typedef typename remove_all<T>::type type;
+};
+template <typename T>
+struct remove_all<T*> {
+  typedef typename remove_all<T>::type type;
+};
 
+template <typename T>
+using remove_all_t = typename remove_all<T>::type;
+
+template <typename T>
+struct is_arithmetic {
+  enum { value = false };
+};
+template <>
+struct is_arithmetic<float> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<double> {
+  enum { value = true };
+};
+// GPU devices treat `long double` as `double`.
+#ifndef EIGEN_GPU_COMPILE_PHASE
+template <>
+struct is_arithmetic<long double> {
+  enum { value = true };
+};
+#endif
+template <>
+struct is_arithmetic<bool> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<char> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<signed char> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<unsigned char> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<signed short> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<unsigned short> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<signed int> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<unsigned int> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<signed long> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<unsigned long> {
+  enum { value = true };
+};
+
+template <typename T, typename U>
+struct is_same {
+  enum { value = 0 };
+};
+template <typename T>
+struct is_same<T, T> {
+  enum { value = 1 };
+};
+
+template <class T>
+struct is_void : is_same<void, std::remove_const_t<T>> {};
 
 /** \internal
-  * A base class do disable default copy ctor and copy assignement operator.
-  */
-class noncopyable
-{
-  noncopyable(const noncopyable&);
-  const noncopyable& operator=(const noncopyable&);
-protected:
-  noncopyable() {}
-  ~noncopyable() {}
+ * Implementation of std::void_t for SFINAE.
+ *
+ * Pre C++17:
+ * Custom implementation.
+ *
+ * Post C++17: Uses std::void_t
+ */
+#if EIGEN_COMP_CXXVER >= 17 && defined(__cpp_lib_void_t) && __cpp_lib_void_t >= 201411L
+using std::void_t;
+#else
+template <typename...>
+using void_t = void;
+#endif
+
+template <>
+struct is_arithmetic<signed long long> {
+  enum { value = true };
+};
+template <>
+struct is_arithmetic<unsigned long long> {
+  enum { value = true };
 };
+using std::is_integral;
 
+using std::make_unsigned;
+
+template <typename T>
+struct is_const {
+  enum { value = 0 };
+};
+template <typename T>
+struct is_const<T const> {
+  enum { value = 1 };
+};
+
+template <typename T>
+struct add_const_on_value_type {
+  typedef const T type;
+};
+template <typename T>
+struct add_const_on_value_type<T&> {
+  typedef T const& type;
+};
+template <typename T>
+struct add_const_on_value_type<T*> {
+  typedef T const* type;
+};
+template <typename T>
+struct add_const_on_value_type<T* const> {
+  typedef T const* const type;
+};
+template <typename T>
+struct add_const_on_value_type<T const* const> {
+  typedef T const* const type;
+};
+
+template <typename T>
+using add_const_on_value_type_t = typename add_const_on_value_type<T>::type;
+
+using std::is_convertible;
 
 /** \internal
-  * Convenient struct to get the result type of a unary or binary functor.
-  *
-  * It supports both the current STL mechanism (using the result_type member) as well as
-  * upcoming next STL generation (using a templated result member).
-  * If none of these members is provided, then the type of the first argument is returned. FIXME, that behavior is a pretty bad hack.
-  */
-template<typename T> struct result_of {};
+ * A base class do disable default copy ctor and copy assignment operator.
+ */
+class noncopyable {
+  EIGEN_DEVICE_FUNC noncopyable(const noncopyable&);
+  EIGEN_DEVICE_FUNC const noncopyable& operator=(const noncopyable&);
+
+ protected:
+  EIGEN_DEVICE_FUNC noncopyable() {}
+  EIGEN_DEVICE_FUNC ~noncopyable() {}
+};
 
-struct has_none {int a[1];};
-struct has_std_result_type {int a[2];};
-struct has_tr1_result {int a[3];};
+/** \internal
+ * Provides access to the number of elements in the object of as a compile-time constant expression.
+ * It "returns" Eigen::Dynamic if the size cannot be resolved at compile-time (default).
+ *
+ * Similar to std::tuple_size, but more general.
+ *
+ * It currently supports:
+ *  - any types T defining T::SizeAtCompileTime
+ *  - plain C arrays as T[N]
+ *  - std::array (c++11)
+ *  - some internal types such as SingleRange and AllRange
+ *
+ * The second template parameter eases SFINAE-based specializations.
+ */
+template <typename T, typename EnableIf = void>
+struct array_size {
+  static constexpr Index value = Dynamic;
+};
 
-template<typename Func, typename ArgType, int SizeOf=sizeof(has_none)>
-struct unary_result_of_select {typedef ArgType type;};
+template <typename T>
+struct array_size<T, std::enable_if_t<((T::SizeAtCompileTime & 0) == 0)>> {
+  static constexpr Index value = T::SizeAtCompileTime;
+};
 
-template<typename Func, typename ArgType>
-struct unary_result_of_select<Func, ArgType, sizeof(has_std_result_type)> {typedef typename Func::result_type type;};
+template <typename T, int N>
+struct array_size<const T (&)[N]> {
+  static constexpr Index value = N;
+};
+template <typename T, int N>
+struct array_size<T (&)[N]> {
+  static constexpr Index value = N;
+};
 
-template<typename Func, typename ArgType>
-struct unary_result_of_select<Func, ArgType, sizeof(has_tr1_result)> {typedef typename Func::template result<Func(ArgType)>::type type;};
+template <typename T, std::size_t N>
+struct array_size<const std::array<T, N>> {
+  static constexpr Index value = N;
+};
+template <typename T, std::size_t N>
+struct array_size<std::array<T, N>> {
+  static constexpr Index value = N;
+};
 
-template<typename Func, typename ArgType>
-struct result_of<Func(ArgType)> {
-    template<typename T>
-    static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
-    template<typename T>
-    static has_tr1_result      testFunctor(T const *, typename T::template result<T(ArgType)>::type const * = 0);
-    static has_none            testFunctor(...);
+/** \internal
+ * Analogue of the std::ssize free function.
+ * It returns the signed size of the container or view \a x of type \c T
+ *
+ * It currently supports:
+ *  - any types T defining a member T::size() const
+ *  - plain C arrays as T[N]
+ *
+ * For C++20, this function just forwards to `std::ssize`, or any ADL discoverable `ssize` function.
+ */
+#if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_ssize) && __cpp_lib_ssize >= 201902L
+
+template <typename T>
+constexpr auto index_list_size(T&& x) {
+  using std::ssize;
+  return ssize(std::forward<T>(x));
+}
+
+#else
+
+template <typename T>
+constexpr auto index_list_size(const T& x) {
+  using R = std::common_type_t<std::ptrdiff_t, std::make_signed_t<decltype(x.size())>>;
+  return static_cast<R>(x.size());
+}
+
+template <typename T, std::ptrdiff_t N>
+constexpr std::ptrdiff_t index_list_size(const T (&)[N]) {
+  return N;
+}
+#endif
 
-    // note that the following indirection is needed for gcc-3.3
-    enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
-    typedef typename unary_result_of_select<Func, ArgType, FunctorType>::type type;
+/** \internal
+ * Convenient struct to get the result type of a nullary, unary, binary, or
+ * ternary functor.
+ *
+ * Pre C++17:
+ * This uses std::result_of. However, note the `type` member removes
+ * const and converts references/pointers to their corresponding value type.
+ *
+ * Post C++17: Uses std::invoke_result
+ */
+#if EIGEN_HAS_STD_INVOKE_RESULT
+template <typename T>
+struct result_of;
+
+template <typename F, typename... ArgTypes>
+struct result_of<F(ArgTypes...)> {
+  typedef typename std::invoke_result<F, ArgTypes...>::type type1;
+  typedef remove_all_t<type1> type;
 };
 
-template<typename Func, typename ArgType0, typename ArgType1, int SizeOf=sizeof(has_none)>
-struct binary_result_of_select {typedef ArgType0 type;};
+template <typename F, typename... ArgTypes>
+struct invoke_result {
+  typedef typename std::invoke_result<F, ArgTypes...>::type type1;
+  typedef remove_all_t<type1> type;
+};
+#else
+template <typename T>
+struct result_of {
+  typedef typename std::result_of<T>::type type1;
+  typedef remove_all_t<type1> type;
+};
 
-template<typename Func, typename ArgType0, typename ArgType1>
-struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_std_result_type)>
-{typedef typename Func::result_type type;};
+template <typename F, typename... ArgTypes>
+struct invoke_result {
+  typedef typename result_of<F(ArgTypes...)>::type type1;
+  typedef remove_all_t<type1> type;
+};
+#endif
 
-template<typename Func, typename ArgType0, typename ArgType1>
-struct binary_result_of_select<Func, ArgType0, ArgType1, sizeof(has_tr1_result)>
-{typedef typename Func::template result<Func(ArgType0,ArgType1)>::type type;};
+// Reduces a sequence of bools to true if all are true, false otherwise.
+template <bool... values>
+using reduce_all =
+    std::is_same<std::integer_sequence<bool, values..., true>, std::integer_sequence<bool, true, values...>>;
 
-template<typename Func, typename ArgType0, typename ArgType1>
-struct result_of<Func(ArgType0,ArgType1)> {
-    template<typename T>
-    static has_std_result_type testFunctor(T const *, typename T::result_type const * = 0);
-    template<typename T>
-    static has_tr1_result      testFunctor(T const *, typename T::template result<T(ArgType0,ArgType1)>::type const * = 0);
-    static has_none            testFunctor(...);
+// Reduces a sequence of bools to true if any are true, false if all false.
+template <bool... values>
+using reduce_any = std::integral_constant<bool, !std::is_same<std::integer_sequence<bool, values..., false>,
+                                                              std::integer_sequence<bool, false, values...>>::value>;
 
-    // note that the following indirection is needed for gcc-3.3
-    enum {FunctorType = sizeof(testFunctor(static_cast<Func*>(0)))};
-    typedef typename binary_result_of_select<Func, ArgType0, ArgType1, FunctorType>::type type;
+struct meta_yes {
+  char a[1];
+};
+struct meta_no {
+  char a[2];
 };
 
-/** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer.
-  * Usage example: \code meta_sqrt<1023>::ret \endcode
-  */
-template<int Y,
-         int InfX = 0,
-         int SupX = ((Y==1) ? 1 : Y/2),
-         bool Done = ((SupX-InfX)<=1 ? true : ((SupX*SupX <= Y) && ((SupX+1)*(SupX+1) > Y))) >
-                                // use ?: instead of || just to shut up a stupid gcc 4.3 warning
-class meta_sqrt
-{
-    enum {
-      MidX = (InfX+SupX)/2,
-      TakeInf = MidX*MidX > Y ? 1 : 0,
-      NewInf = int(TakeInf) ? InfX : int(MidX),
-      NewSup = int(TakeInf) ? int(MidX) : SupX
-    };
-  public:
-    enum { ret = meta_sqrt<Y,NewInf,NewSup>::ret };
-};
-
-template<int Y, int InfX, int SupX>
-class meta_sqrt<Y, InfX, SupX, true> { public:  enum { ret = (SupX*SupX <= Y) ? SupX : InfX }; };
+// Check whether T::ReturnType does exist
+template <typename T>
+struct has_ReturnType {
+  template <typename C>
+  static meta_yes testFunctor(C const*, typename C::ReturnType const* = 0);
+  template <typename C>
+  static meta_no testFunctor(...);
 
-/** \internal determines whether the product of two numeric types is allowed and what the return type is */
-template<typename T, typename U> struct scalar_product_traits
-{
-  enum { Defined = 0 };
+  enum { value = sizeof(testFunctor<T>(static_cast<T*>(0))) == sizeof(meta_yes) };
 };
 
-template<typename T> struct scalar_product_traits<T,T>
-{
-  enum {
-    // Cost = NumTraits<T>::MulCost,
-    Defined = 1
-  };
-  typedef T ReturnType;
+template <typename T>
+const T* return_ptr();
+
+template <typename T, typename IndexType = Index>
+struct has_nullary_operator {
+  template <typename C>
+  static meta_yes testFunctor(C const*, std::enable_if_t<(sizeof(return_ptr<C>()->operator()()) > 0)>* = 0);
+  static meta_no testFunctor(...);
+
+  enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
 };
 
-template<typename T> struct scalar_product_traits<T,std::complex<T> >
-{
-  enum {
-    // Cost = 2*NumTraits<T>::MulCost,
-    Defined = 1
-  };
-  typedef std::complex<T> ReturnType;
+template <typename T, typename IndexType = Index>
+struct has_unary_operator {
+  template <typename C>
+  static meta_yes testFunctor(C const*, std::enable_if_t<(sizeof(return_ptr<C>()->operator()(IndexType(0))) > 0)>* = 0);
+  static meta_no testFunctor(...);
+
+  enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
 };
 
-template<typename T> struct scalar_product_traits<std::complex<T>, T>
-{
+template <typename T, typename IndexType = Index>
+struct has_binary_operator {
+  template <typename C>
+  static meta_yes testFunctor(
+      C const*, std::enable_if_t<(sizeof(return_ptr<C>()->operator()(IndexType(0), IndexType(0))) > 0)>* = 0);
+  static meta_no testFunctor(...);
+
+  enum { value = sizeof(testFunctor(static_cast<T*>(0))) == sizeof(meta_yes) };
+};
+
+/** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer.
+ * Usage example: \code meta_sqrt<1023>::ret \endcode
+ */
+template <int Y, int InfX = 0, int SupX = ((Y == 1) ? 1 : Y / 2),
+          bool Done = ((SupX - InfX) <= 1 || ((SupX * SupX <= Y) && ((SupX + 1) * (SupX + 1) > Y)))>
+class meta_sqrt {
   enum {
-    // Cost = 2*NumTraits<T>::MulCost,
-    Defined = 1
+    MidX = (InfX + SupX) / 2,
+    TakeInf = MidX * MidX > Y ? 1 : 0,
+    NewInf = int(TakeInf) ? InfX : int(MidX),
+    NewSup = int(TakeInf) ? int(MidX) : SupX
   };
-  typedef std::complex<T> ReturnType;
+
+ public:
+  enum { ret = meta_sqrt<Y, NewInf, NewSup>::ret };
+};
+
+template <int Y, int InfX, int SupX>
+class meta_sqrt<Y, InfX, SupX, true> {
+ public:
+  enum { ret = (SupX * SupX <= Y) ? SupX : InfX };
+};
+
+/** \internal Computes the least common multiple of two positive integer A and B
+ * at compile-time.
+ */
+template <int A, int B, int K = 1, bool Done = ((A * K) % B) == 0, bool Big = (A >= B)>
+struct meta_least_common_multiple {
+  enum { ret = meta_least_common_multiple<A, B, K + 1>::ret };
+};
+template <int A, int B, int K, bool Done>
+struct meta_least_common_multiple<A, B, K, Done, false> {
+  enum { ret = meta_least_common_multiple<B, A, K>::ret };
+};
+template <int A, int B, int K>
+struct meta_least_common_multiple<A, B, K, true, true> {
+  enum { ret = A * K };
+};
+
+/** \internal determines whether the product of two numeric types is allowed and what the return type is */
+template <typename T, typename U>
+struct scalar_product_traits {
+  enum { Defined = 0 };
 };
 
 // FIXME quick workaround around current limitation of result_of
 // template<typename Scalar, typename ArgType0, typename ArgType1>
 // struct result_of<scalar_product_op<Scalar>(ArgType0,ArgType1)> {
-// typedef typename scalar_product_traits<typename remove_all<ArgType0>::type, typename remove_all<ArgType1>::type>::ReturnType type;
+// typedef typename scalar_product_traits<remove_all_t<ArgType0>, remove_all_t<ArgType1>>::ReturnType type;
 // };
 
-template<typename T> struct is_diagonal
-{ enum { ret = false }; };
-
-template<typename T> struct is_diagonal<DiagonalBase<T> >
-{ enum { ret = true }; };
+/** \internal Obtains a POD type suitable to use as storage for an object of a size
+ * of at most Len bytes, aligned as specified by \c Align.
+ */
+template <unsigned Len, unsigned Align>
+struct aligned_storage {
+  struct type {
+    EIGEN_ALIGN_TO_BOUNDARY(Align) unsigned char data[Len];
+  };
+};
 
-template<typename T> struct is_diagonal<DiagonalWrapper<T> >
-{ enum { ret = true }; };
+}  // end namespace internal
+
+template <typename T>
+struct NumTraits;
+
+namespace numext {
+
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+template <typename T>
+EIGEN_DEVICE_FUNC void swap(T& a, T& b) {
+  T tmp = b;
+  b = a;
+  a = tmp;
+}
+#else
+template <typename T>
+EIGEN_STRONG_INLINE void swap(T& a, T& b) {
+  std::swap(a, b);
+}
+#endif
+
+using std::numeric_limits;
+
+// Handle integer comparisons of different signedness.
+template <typename X, typename Y, bool XIsInteger = NumTraits<X>::IsInteger, bool XIsSigned = NumTraits<X>::IsSigned,
+          bool YIsInteger = NumTraits<Y>::IsInteger, bool YIsSigned = NumTraits<Y>::IsSigned>
+struct equal_strict_impl {
+  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool run(const X& x, const Y& y) { return x == y; }
+};
+template <typename X, typename Y>
+struct equal_strict_impl<X, Y, true, false, true, true> {
+  // X is an unsigned integer
+  // Y is a signed integer
+  // if Y is non-negative, it may be represented exactly as its unsigned counterpart.
+  using UnsignedY = typename internal::make_unsigned<Y>::type;
+  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool run(const X& x, const Y& y) {
+    return y < Y(0) ? false : (x == static_cast<UnsignedY>(y));
+  }
+};
+template <typename X, typename Y>
+struct equal_strict_impl<X, Y, true, true, true, false> {
+  // X is a signed integer
+  // Y is an unsigned integer
+  static EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool run(const X& x, const Y& y) {
+    return equal_strict_impl<Y, X>::run(y, x);
+  }
+};
 
-template<typename T, int S> struct is_diagonal<DiagonalMatrix<T,S> >
-{ enum { ret = true }; };
+// The aim of the following functions is to bypass -Wfloat-equal warnings
+// when we really want a strict equality comparison on floating points.
+template <typename X, typename Y>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const X& x, const Y& y) {
+  return equal_strict_impl<X, Y>::run(x, y);
+}
+
+#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const float& x, const float& y) {
+  return std::equal_to<float>()(x, y);
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool equal_strict(const double& x, const double& y) {
+  return std::equal_to<double>()(x, y);
+}
+#endif
+
+/**
+ * \internal Performs an exact comparison of x to zero, e.g. to decide whether a term can be ignored.
+ * Use this to to bypass -Wfloat-equal warnings when exact zero is what needs to be tested.
+ */
+template <typename X>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool is_exactly_zero(const X& x) {
+  return equal_strict(x, typename NumTraits<X>::Literal{0});
+}
+
+/**
+ * \internal Performs an exact comparison of x to one, e.g. to decide whether a factor needs to be multiplied.
+ * Use this to to bypass -Wfloat-equal warnings when exact one is what needs to be tested.
+ */
+template <typename X>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool is_exactly_one(const X& x) {
+  return equal_strict(x, typename NumTraits<X>::Literal{1});
+}
+
+template <typename X, typename Y>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const X& x, const Y& y) {
+  return !equal_strict_impl<X, Y>::run(x, y);
+}
+
+#if !defined(EIGEN_GPU_COMPILE_PHASE) || (!defined(EIGEN_CUDA_ARCH) && defined(EIGEN_CONSTEXPR_ARE_DEVICE_FUNC))
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const float& x, const float& y) {
+  return std::not_equal_to<float>()(x, y);
+}
+
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool not_equal_strict(const double& x, const double& y) {
+  return std::not_equal_to<double>()(x, y);
+}
+#endif
+
+}  // end namespace numext
 
-} // end namespace internal
+namespace internal {
 
-} // end namespace Eigen
+template <typename Scalar>
+struct is_identically_zero_impl {
+  static inline bool run(const Scalar& s) { return numext::is_exactly_zero(s); }
+};
 
-#endif // EIGEN_META_H
+template <typename Scalar>
+EIGEN_STRONG_INLINE bool is_identically_zero(const Scalar& s) {
+  return is_identically_zero_impl<Scalar>::run(s);
+}
+
+/// \internal Returns true if its argument is of integer or enum type.
+/// FIXME this has the same purpose as `is_valid_index_type` in XprHelper.h
+template <typename A>
+constexpr bool is_int_or_enum_v = std::is_enum<A>::value || std::is_integral<A>::value;
+
+template <typename A, typename B>
+constexpr void plain_enum_asserts(A, B) {
+  static_assert(is_int_or_enum_v<A>, "Argument a must be an integer or enum");
+  static_assert(is_int_or_enum_v<B>, "Argument b must be an integer or enum");
+}
+
+/// \internal Gets the minimum of two values which may be integers or enums
+template <typename A, typename B>
+constexpr int plain_enum_min(A a, B b) {
+  plain_enum_asserts(a, b);
+  return ((int)a <= (int)b) ? (int)a : (int)b;
+}
+
+/// \internal Gets the maximum of two values which may be integers or enums
+template <typename A, typename B>
+constexpr int plain_enum_max(A a, B b) {
+  plain_enum_asserts(a, b);
+  return ((int)a >= (int)b) ? (int)a : (int)b;
+}
+
+/**
+ * \internal
+ *  `min_size_prefer_dynamic` gives the min between compile-time sizes. 0 has absolute priority, followed by 1,
+ *  followed by Dynamic, followed by other finite values. The reason for giving Dynamic the priority over
+ *  finite values is that min(3, Dynamic) should be Dynamic, since that could be anything between 0 and 3.
+ */
+template <typename A, typename B>
+constexpr int min_size_prefer_dynamic(A a, B b) {
+  plain_enum_asserts(a, b);
+  if ((int)a == 0 || (int)b == 0) return 0;
+  if ((int)a == 1 || (int)b == 1) return 1;
+  if ((int)a == Dynamic || (int)b == Dynamic) return Dynamic;
+  return plain_enum_min(a, b);
+}
+
+/**
+ * \internal
+ *  min_size_prefer_fixed is a variant of `min_size_prefer_dynamic` comparing MaxSizes. The difference is that finite
+ * values now have priority over Dynamic, so that min(3, Dynamic) gives 3. Indeed, whatever the actual value is (between
+ * 0 and 3), it is not more than 3.
+ */
+template <typename A, typename B>
+constexpr int min_size_prefer_fixed(A a, B b) {
+  plain_enum_asserts(a, b);
+  if ((int)a == 0 || (int)b == 0) return 0;
+  if ((int)a == 1 || (int)b == 1) return 1;
+  if ((int)a == Dynamic && (int)b == Dynamic) return Dynamic;
+  if ((int)a == Dynamic) return (int)b;
+  if ((int)b == Dynamic) return (int)a;
+  return plain_enum_min(a, b);
+}
+
+/// \internal see `min_size_prefer_fixed`. No need for a separate variant for MaxSizes here.
+template <typename A, typename B>
+constexpr int max_size_prefer_dynamic(A a, B b) {
+  plain_enum_asserts(a, b);
+  if ((int)a == Dynamic || (int)b == Dynamic) return Dynamic;
+  return plain_enum_max(a, b);
+}
+
+template <typename A, typename B>
+inline constexpr int size_prefer_fixed(A a, B b) {
+  plain_enum_asserts(a, b);
+  return int(a) == Dynamic ? int(b) : int(a);
+}
+
+template <typename A, typename B>
+inline constexpr bool enum_eq_not_dynamic(A a, B b) {
+  plain_enum_asserts(a, b);
+  if ((int)a == Dynamic || (int)b == Dynamic) return false;
+  return (int)a == (int)b;
+}
+
+template <typename A, typename B>
+constexpr bool enum_lt_not_dynamic(A a, B b) {
+  plain_enum_asserts(a, b);
+  if ((int)a == Dynamic || (int)b == Dynamic) return false;
+  return (int)a < (int)b;
+}
+
+template <typename A, typename B>
+constexpr bool enum_le_not_dynamic(A a, B b) {
+  plain_enum_asserts(a, b);
+  if ((int)a == Dynamic || (int)b == Dynamic) return false;
+  return (int)a <= (int)b;
+}
+
+template <typename A, typename B>
+constexpr bool enum_gt_not_dynamic(A a, B b) {
+  plain_enum_asserts(a, b);
+  if ((int)a == Dynamic || (int)b == Dynamic) return false;
+  return (int)a > (int)b;
+}
+
+template <typename A, typename B>
+constexpr bool enum_ge_not_dynamic(A a, B b) {
+  plain_enum_asserts(a, b);
+  if ((int)a == Dynamic || (int)b == Dynamic) return false;
+  return (int)a >= (int)b;
+}
+
+/// \internal Calculate logical XOR at compile time
+constexpr bool logical_xor(bool a, bool b) { return a != b; }
+
+/// \internal Calculate logical IMPLIES at compile time
+constexpr bool check_implication(bool a, bool b) { return !a || b; }
+
+/// \internal Provide fallback for std::is_constant_evaluated for pre-C++20.
+#if EIGEN_COMP_CXXVER >= 20 && defined(__cpp_lib_is_constant_evaluated) && __cpp_lib_is_constant_evaluated >= 201811L
+using std::is_constant_evaluated;
+#else
+constexpr bool is_constant_evaluated() { return false; }
+#endif
+
+template <typename Scalar>
+using make_complex_t = std::conditional_t<NumTraits<Scalar>::IsComplex, Scalar, std::complex<Scalar>>;
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_META_H
diff --git a/inst/include/Eigen/src/Core/util/MoreMeta.h b/inst/include/Eigen/src/Core/util/MoreMeta.h
new file mode 100644
index 00000000..6823bca9
--- /dev/null
+++ b/inst/include/Eigen/src/Core/util/MoreMeta.h
@@ -0,0 +1,638 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_MOREMETA_H
+#define EIGEN_MOREMETA_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename... tt>
+struct type_list {
+  constexpr static int count = sizeof...(tt);
+};
+
+template <typename t, typename... tt>
+struct type_list<t, tt...> {
+  constexpr static int count = sizeof...(tt) + 1;
+  typedef t first_type;
+};
+
+template <typename T, T... nn>
+struct numeric_list {
+  constexpr static std::size_t count = sizeof...(nn);
+};
+
+template <typename T, T n, T... nn>
+struct numeric_list<T, n, nn...> {
+  static constexpr std::size_t count = sizeof...(nn) + 1;
+  static constexpr T first_value = n;
+};
+
+// Ddoxygen doesn't like the recursive definition of gen_numeric_list.
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+/* numeric list constructors
+ *
+ * equivalencies:
+ *     constructor                                              result
+ *     typename gen_numeric_list<int, 5>::type                  numeric_list<int, 0,1,2,3,4>
+ *     typename gen_numeric_list_reversed<int, 5>::type         numeric_list<int, 4,3,2,1,0>
+ *     typename gen_numeric_list_swapped_pair<int, 5,1,2>::type numeric_list<int, 0,2,1,3,4>
+ *     typename gen_numeric_list_repeated<int, 0, 5>::type      numeric_list<int, 0,0,0,0,0>
+ */
+
+template <typename T, std::size_t n, T start = 0, T... ii>
+struct gen_numeric_list : gen_numeric_list<T, n - 1, start, start + n - 1, ii...> {};
+
+template <typename T, T start, T... ii>
+struct gen_numeric_list<T, 0, start, ii...> {
+  typedef numeric_list<T, ii...> type;
+};
+
+template <typename T, std::size_t n, T start = 0, T... ii>
+struct gen_numeric_list_reversed : gen_numeric_list_reversed<T, n - 1, start, ii..., start + n - 1> {};
+template <typename T, T start, T... ii>
+struct gen_numeric_list_reversed<T, 0, start, ii...> {
+  typedef numeric_list<T, ii...> type;
+};
+
+template <typename T, std::size_t n, T a, T b, T start = 0, T... ii>
+struct gen_numeric_list_swapped_pair
+    : gen_numeric_list_swapped_pair<T, n - 1, a, b, start,
+                                    (start + n - 1) == a ? b : ((start + n - 1) == b ? a : (start + n - 1)), ii...> {};
+template <typename T, T a, T b, T start, T... ii>
+struct gen_numeric_list_swapped_pair<T, 0, a, b, start, ii...> {
+  typedef numeric_list<T, ii...> type;
+};
+
+template <typename T, std::size_t n, T V, T... nn>
+struct gen_numeric_list_repeated : gen_numeric_list_repeated<T, n - 1, V, V, nn...> {};
+template <typename T, T V, T... nn>
+struct gen_numeric_list_repeated<T, 0, V, nn...> {
+  typedef numeric_list<T, nn...> type;
+};
+#else
+template <typename T, std::size_t n, T start = 0, T... ii>
+struct gen_numeric_list;
+#endif  // not EIGEN_PARSED_BY_DOXYGEN
+
+/* list manipulation: concatenate */
+
+template <class a, class b>
+struct concat;
+
+template <typename... as, typename... bs>
+struct concat<type_list<as...>, type_list<bs...>> {
+  typedef type_list<as..., bs...> type;
+};
+template <typename T, T... as, T... bs>
+struct concat<numeric_list<T, as...>, numeric_list<T, bs...>> {
+  typedef numeric_list<T, as..., bs...> type;
+};
+
+template <typename... p>
+struct mconcat;
+template <typename a>
+struct mconcat<a> {
+  typedef a type;
+};
+template <typename a, typename b>
+struct mconcat<a, b> : concat<a, b> {};
+template <typename a, typename b, typename... cs>
+struct mconcat<a, b, cs...> : concat<a, typename mconcat<b, cs...>::type> {};
+
+/* list manipulation: extract slices */
+
+template <int n, typename x>
+struct take;
+
+template <int n, typename a, typename... as>
+struct take<n, type_list<a, as...>> : concat<type_list<a>, typename take<n - 1, type_list<as...>>::type> {};
+
+template <int n>
+struct take<n, type_list<>> {
+  typedef type_list<> type;
+};
+
+template <typename a, typename... as>
+struct take<0, type_list<a, as...>> {
+  typedef type_list<> type;
+};
+
+template <>
+struct take<0, type_list<>> {
+  typedef type_list<> type;
+};
+
+template <typename T, int n, T a, T... as>
+struct take<n, numeric_list<T, a, as...>>
+    : concat<numeric_list<T, a>, typename take<n - 1, numeric_list<T, as...>>::type> {};
+
+template <typename T, T a, T... as>
+struct take<0, numeric_list<T, a, as...>> {
+  typedef numeric_list<T> type;
+};
+
+template <typename T>
+struct take<0, numeric_list<T>> {
+  typedef numeric_list<T> type;
+};
+
+template <typename T, int n, T... ii>
+struct h_skip_helper_numeric;
+template <typename T, int n, T i, T... ii>
+struct h_skip_helper_numeric<T, n, i, ii...> : h_skip_helper_numeric<T, n - 1, ii...> {};
+template <typename T, T i, T... ii>
+struct h_skip_helper_numeric<T, 0, i, ii...> {
+  typedef numeric_list<T, i, ii...> type;
+};
+template <typename T, int n>
+struct h_skip_helper_numeric<T, n> {
+  typedef numeric_list<T> type;
+};
+template <typename T>
+struct h_skip_helper_numeric<T, 0> {
+  typedef numeric_list<T> type;
+};
+
+template <int n, typename... tt>
+struct h_skip_helper_type;
+template <int n, typename t, typename... tt>
+struct h_skip_helper_type<n, t, tt...> : h_skip_helper_type<n - 1, tt...> {};
+template <typename t, typename... tt>
+struct h_skip_helper_type<0, t, tt...> {
+  typedef type_list<t, tt...> type;
+};
+template <int n>
+struct h_skip_helper_type<n> {
+  typedef type_list<> type;
+};
+template <>
+struct h_skip_helper_type<0> {
+  typedef type_list<> type;
+};
+
+template <int n>
+struct h_skip {
+  template <typename T, T... ii>
+  constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_numeric<T, n, ii...>::type helper(
+      numeric_list<T, ii...>) {
+    return typename h_skip_helper_numeric<T, n, ii...>::type();
+  }
+  template <typename... tt>
+  constexpr static EIGEN_STRONG_INLINE typename h_skip_helper_type<n, tt...>::type helper(type_list<tt...>) {
+    return typename h_skip_helper_type<n, tt...>::type();
+  }
+};
+
+template <int n, typename a>
+struct skip {
+  typedef decltype(h_skip<n>::helper(a())) type;
+};
+
+template <int start, int count, typename a>
+struct slice : take<count, typename skip<start, a>::type> {};
+
+/* list manipulation: retrieve single element from list */
+
+template <int n, typename x>
+struct get;
+
+template <int n, typename a, typename... as>
+struct get<n, type_list<a, as...>> : get<n - 1, type_list<as...>> {};
+template <typename a, typename... as>
+struct get<0, type_list<a, as...>> {
+  typedef a type;
+};
+
+template <typename T, int n, T a, T... as>
+struct get<n, numeric_list<T, a, as...>> : get<n - 1, numeric_list<T, as...>> {};
+template <typename T, T a, T... as>
+struct get<0, numeric_list<T, a, as...>> {
+  constexpr static T value = a;
+};
+
+template <std::size_t n, typename T, T a, T... as>
+constexpr T array_get(const numeric_list<T, a, as...>&) {
+  return get<(int)n, numeric_list<T, a, as...>>::value;
+}
+
+/* always get type, regardless of dummy; good for parameter pack expansion */
+
+template <typename T, T dummy, typename t>
+struct id_numeric {
+  typedef t type;
+};
+template <typename dummy, typename t>
+struct id_type {
+  typedef t type;
+};
+
+/* equality checking, flagged version */
+
+template <typename a, typename b>
+struct is_same_gf : is_same<a, b> {
+  constexpr static int global_flags = 0;
+};
+
+/* apply_op to list */
+
+template <bool from_left,  // false
+          template <typename, typename> class op, typename additional_param, typename... values>
+struct h_apply_op_helper {
+  typedef type_list<typename op<values, additional_param>::type...> type;
+};
+template <template <typename, typename> class op, typename additional_param, typename... values>
+struct h_apply_op_helper<true, op, additional_param, values...> {
+  typedef type_list<typename op<additional_param, values>::type...> type;
+};
+
+template <bool from_left, template <typename, typename> class op, typename additional_param>
+struct h_apply_op {
+  template <typename... values>
+  constexpr static typename h_apply_op_helper<from_left, op, additional_param, values...>::type helper(
+      type_list<values...>) {
+    return typename h_apply_op_helper<from_left, op, additional_param, values...>::type();
+  }
+};
+
+template <template <typename, typename> class op, typename additional_param, typename a>
+struct apply_op_from_left {
+  typedef decltype(h_apply_op<true, op, additional_param>::helper(a())) type;
+};
+
+template <template <typename, typename> class op, typename additional_param, typename a>
+struct apply_op_from_right {
+  typedef decltype(h_apply_op<false, op, additional_param>::helper(a())) type;
+};
+
+/* see if an element is in a list */
+
+template <template <typename, typename> class test, typename check_against, typename h_list,
+          bool last_check_positive = false>
+struct contained_in_list;
+
+template <template <typename, typename> class test, typename check_against, typename h_list>
+struct contained_in_list<test, check_against, h_list, true> {
+  constexpr static bool value = true;
+};
+
+template <template <typename, typename> class test, typename check_against, typename a, typename... as>
+struct contained_in_list<test, check_against, type_list<a, as...>, false>
+    : contained_in_list<test, check_against, type_list<as...>, test<check_against, a>::value> {};
+
+template <template <typename, typename> class test, typename check_against, typename... empty>
+struct contained_in_list<test, check_against, type_list<empty...>, false> {
+  constexpr static bool value = false;
+};
+
+/* see if an element is in a list and check for global flags */
+
+template <template <typename, typename> class test, typename check_against, typename h_list, int default_flags = 0,
+          bool last_check_positive = false, int last_check_flags = default_flags>
+struct contained_in_list_gf;
+
+template <template <typename, typename> class test, typename check_against, typename h_list, int default_flags,
+          int last_check_flags>
+struct contained_in_list_gf<test, check_against, h_list, default_flags, true, last_check_flags> {
+  constexpr static bool value = true;
+  constexpr static int global_flags = last_check_flags;
+};
+
+template <template <typename, typename> class test, typename check_against, typename a, typename... as,
+          int default_flags, int last_check_flags>
+struct contained_in_list_gf<test, check_against, type_list<a, as...>, default_flags, false, last_check_flags>
+    : contained_in_list_gf<test, check_against, type_list<as...>, default_flags, test<check_against, a>::value,
+                           test<check_against, a>::global_flags> {};
+
+template <template <typename, typename> class test, typename check_against, typename... empty, int default_flags,
+          int last_check_flags>
+struct contained_in_list_gf<test, check_against, type_list<empty...>, default_flags, false, last_check_flags> {
+  constexpr static bool value = false;
+  constexpr static int global_flags = default_flags;
+};
+
+/* generic reductions */
+
+template <typename Reducer, typename... Ts>
+struct reduce;
+
+template <typename Reducer>
+struct reduce<Reducer> {
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE int run() { return Reducer::Identity; }
+};
+
+template <typename Reducer, typename A>
+struct reduce<Reducer, A> {
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE A run(A a) { return a; }
+};
+
+template <typename Reducer, typename A, typename... Ts>
+struct reduce<Reducer, A, Ts...> {
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, Ts... ts)
+      -> decltype(Reducer::run(a, reduce<Reducer, Ts...>::run(ts...))) {
+    return Reducer::run(a, reduce<Reducer, Ts...>::run(ts...));
+  }
+};
+
+/* generic binary operations */
+
+struct sum_op {
+  template <typename A, typename B>
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a + b) {
+    return a + b;
+  }
+  static constexpr int Identity = 0;
+};
+struct product_op {
+  template <typename A, typename B>
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a * b) {
+    return a * b;
+  }
+  static constexpr int Identity = 1;
+};
+
+struct logical_and_op {
+  template <typename A, typename B>
+  constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a && b) {
+    return a && b;
+  }
+};
+struct logical_or_op {
+  template <typename A, typename B>
+  constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a || b) {
+    return a || b;
+  }
+};
+
+struct equal_op {
+  template <typename A, typename B>
+  constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a == b) {
+    return a == b;
+  }
+};
+struct not_equal_op {
+  template <typename A, typename B>
+  constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a != b) {
+    return a != b;
+  }
+};
+struct lesser_op {
+  template <typename A, typename B>
+  constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a < b) {
+    return a < b;
+  }
+};
+struct lesser_equal_op {
+  template <typename A, typename B>
+  constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a <= b) {
+    return a <= b;
+  }
+};
+struct greater_op {
+  template <typename A, typename B>
+  constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a > b) {
+    return a > b;
+  }
+};
+struct greater_equal_op {
+  template <typename A, typename B>
+  constexpr static EIGEN_STRONG_INLINE auto run(A a, B b) -> decltype(a >= b) {
+    return a >= b;
+  }
+};
+
+/* generic unary operations */
+
+struct not_op {
+  template <typename A>
+  constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(!a) {
+    return !a;
+  }
+};
+struct negation_op {
+  template <typename A>
+  constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(-a) {
+    return -a;
+  }
+};
+struct greater_equal_zero_op {
+  template <typename A>
+  constexpr static EIGEN_STRONG_INLINE auto run(A a) -> decltype(a >= 0) {
+    return a >= 0;
+  }
+};
+
+/* reductions for lists */
+
+// using auto -> return value spec makes ICC 13.0 and 13.1 crash here, so we have to hack it
+// together in front... (13.0 doesn't work with array_prod/array_reduce/... anyway, but 13.1
+// does...
+template <typename... Ts>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE decltype(reduce<product_op, Ts...>::run((*((Ts*)0))...)) arg_prod(
+    Ts... ts) {
+  return reduce<product_op, Ts...>::run(ts...);
+}
+
+template <typename... Ts>
+constexpr EIGEN_STRONG_INLINE decltype(reduce<sum_op, Ts...>::run((*((Ts*)0))...)) arg_sum(Ts... ts) {
+  return reduce<sum_op, Ts...>::run(ts...);
+}
+
+/* reverse arrays */
+
+template <typename Array, int... n>
+constexpr EIGEN_STRONG_INLINE Array h_array_reverse(Array arr, numeric_list<int, n...>) {
+  return {{array_get<sizeof...(n) - n - 1>(arr)...}};
+}
+
+template <typename T, std::size_t N>
+constexpr EIGEN_STRONG_INLINE array<T, N> array_reverse(array<T, N> arr) {
+  return h_array_reverse(arr, typename gen_numeric_list<int, N>::type());
+}
+
+/* generic array reductions */
+
+// can't reuse standard reduce() interface above because Intel's Compiler
+// *really* doesn't like it, so we just reimplement the stuff
+// (start from N - 1 and work down to 0 because specialization for
+// n == N - 1 also doesn't work in Intel's compiler, so it goes into
+// an infinite loop)
+template <typename Reducer, typename T, std::size_t N, std::size_t n = N - 1>
+struct h_array_reduce {
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE auto run(array<T, N> arr, T identity)
+      -> decltype(Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr))) {
+    return Reducer::run(h_array_reduce<Reducer, T, N, n - 1>::run(arr, identity), array_get<n>(arr));
+  }
+};
+
+template <typename Reducer, typename T, std::size_t N>
+struct h_array_reduce<Reducer, T, N, 0> {
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, N>& arr, T) { return array_get<0>(arr); }
+};
+
+template <typename Reducer, typename T>
+struct h_array_reduce<Reducer, T, 0> {
+  EIGEN_DEVICE_FUNC constexpr static EIGEN_STRONG_INLINE T run(const array<T, 0>&, T identity) { return identity; }
+};
+
+template <typename Reducer, typename T, std::size_t N>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_reduce(const array<T, N>& arr, T identity)
+    -> decltype(h_array_reduce<Reducer, T, N>::run(arr, identity)) {
+  return h_array_reduce<Reducer, T, N>::run(arr, identity);
+}
+
+/* standard array reductions */
+
+template <typename T, std::size_t N>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_sum(const array<T, N>& arr)
+    -> decltype(array_reduce<sum_op, T, N>(arr, static_cast<T>(0))) {
+  return array_reduce<sum_op, T, N>(arr, static_cast<T>(0));
+}
+
+template <typename T, std::size_t N>
+EIGEN_DEVICE_FUNC constexpr EIGEN_STRONG_INLINE auto array_prod(const array<T, N>& arr)
+    -> decltype(array_reduce<product_op, T, N>(arr, static_cast<T>(1))) {
+  return array_reduce<product_op, T, N>(arr, static_cast<T>(1));
+}
+
+template <typename t>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE t array_prod(const std::vector<t>& a) {
+  eigen_assert(a.size() > 0);
+  t prod = 1;
+  for (size_t i = 0; i < a.size(); ++i) {
+    prod *= a[i];
+  }
+  return prod;
+}
+
+/* zip an array */
+
+template <typename Op, typename A, typename B, std::size_t N, int... n>
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())), N> h_array_zip(array<A, N> a, array<B, N> b,
+                                                                                numeric_list<int, n...>) {
+  return array<decltype(Op::run(A(), B())), N>{{Op::run(array_get<n>(a), array_get<n>(b))...}};
+}
+
+template <typename Op, typename A, typename B, std::size_t N>
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A(), B())), N> array_zip(array<A, N> a, array<B, N> b) {
+  return h_array_zip<Op>(a, b, typename gen_numeric_list<int, N>::type());
+}
+
+/* zip an array and reduce the result */
+
+template <typename Reducer, typename Op, typename A, typename B, std::size_t N, int... n>
+constexpr EIGEN_STRONG_INLINE auto h_array_zip_and_reduce(array<A, N> a, array<B, N> b, numeric_list<int, n...>)
+    -> decltype(reduce<Reducer, typename id_numeric<int, n, decltype(Op::run(A(), B()))>::type...>::run(
+        Op::run(array_get<n>(a), array_get<n>(b))...)) {
+  return reduce<Reducer, typename id_numeric<int, n, decltype(Op::run(A(), B()))>::type...>::run(
+      Op::run(array_get<n>(a), array_get<n>(b))...);
+}
+
+template <typename Reducer, typename Op, typename A, typename B, std::size_t N>
+constexpr EIGEN_STRONG_INLINE auto array_zip_and_reduce(array<A, N> a, array<B, N> b)
+    -> decltype(h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type())) {
+  return h_array_zip_and_reduce<Reducer, Op, A, B, N>(a, b, typename gen_numeric_list<int, N>::type());
+}
+
+/* apply stuff to an array */
+
+template <typename Op, typename A, std::size_t N, int... n>
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())), N> h_array_apply(array<A, N> a, numeric_list<int, n...>) {
+  return array<decltype(Op::run(A())), N>{{Op::run(array_get<n>(a))...}};
+}
+
+template <typename Op, typename A, std::size_t N>
+constexpr EIGEN_STRONG_INLINE array<decltype(Op::run(A())), N> array_apply(array<A, N> a) {
+  return h_array_apply<Op>(a, typename gen_numeric_list<int, N>::type());
+}
+
+/* apply stuff to an array and reduce */
+
+template <typename Reducer, typename Op, typename A, std::size_t N, int... n>
+constexpr EIGEN_STRONG_INLINE auto h_array_apply_and_reduce(array<A, N> arr, numeric_list<int, n...>)
+    -> decltype(reduce<Reducer, typename id_numeric<int, n, decltype(Op::run(A()))>::type...>::run(
+        Op::run(array_get<n>(arr))...)) {
+  return reduce<Reducer, typename id_numeric<int, n, decltype(Op::run(A()))>::type...>::run(
+      Op::run(array_get<n>(arr))...);
+}
+
+template <typename Reducer, typename Op, typename A, std::size_t N>
+constexpr EIGEN_STRONG_INLINE auto array_apply_and_reduce(array<A, N> a)
+    -> decltype(h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type())) {
+  return h_array_apply_and_reduce<Reducer, Op, A, N>(a, typename gen_numeric_list<int, N>::type());
+}
+
+/* repeat a value n times (and make an array out of it
+ * usage:
+ *   array<int, 16> = repeat<16>(42);
+ */
+
+template <int n>
+struct h_repeat {
+  template <typename t, int... ii>
+  constexpr static EIGEN_STRONG_INLINE array<t, n> run(t v, numeric_list<int, ii...>) {
+    return {{typename id_numeric<int, ii, t>::type(v)...}};
+  }
+};
+
+template <int n, typename t>
+constexpr array<t, n> repeat(t v) {
+  return h_repeat<n>::run(v, typename gen_numeric_list<int, n>::type());
+}
+
+/* instantiate a class by a C-style array */
+template <class InstType, typename ArrType, std::size_t N, bool Reverse, typename... Ps>
+struct h_instantiate_by_c_array;
+
+template <class InstType, typename ArrType, std::size_t N, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, N, false, Ps...> {
+  static InstType run(ArrType* arr, Ps... args) {
+    return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, Ps..., ArrType>::run(arr + 1, args..., arr[0]);
+  }
+};
+
+template <class InstType, typename ArrType, std::size_t N, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, N, true, Ps...> {
+  static InstType run(ArrType* arr, Ps... args) {
+    return h_instantiate_by_c_array<InstType, ArrType, N - 1, false, ArrType, Ps...>::run(arr + 1, arr[0], args...);
+  }
+};
+
+template <class InstType, typename ArrType, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, 0, false, Ps...> {
+  static InstType run(ArrType* arr, Ps... args) {
+    (void)arr;
+    return InstType(args...);
+  }
+};
+
+template <class InstType, typename ArrType, typename... Ps>
+struct h_instantiate_by_c_array<InstType, ArrType, 0, true, Ps...> {
+  static InstType run(ArrType* arr, Ps... args) {
+    (void)arr;
+    return InstType(args...);
+  }
+};
+
+template <class InstType, typename ArrType, std::size_t N, bool Reverse = false>
+InstType instantiate_by_c_array(ArrType* arr) {
+  return h_instantiate_by_c_array<InstType, ArrType, N, Reverse>::run(arr);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_MOREMETA_H
diff --git a/inst/include/Eigen/src/Core/util/NonMPL2.h b/inst/include/Eigen/src/Core/util/NonMPL2.h
deleted file mode 100644
index 1af67cf1..00000000
--- a/inst/include/Eigen/src/Core/util/NonMPL2.h
+++ /dev/null
@@ -1,3 +0,0 @@
-#ifdef EIGEN_MPL2_ONLY
-#error Including non-MPL2 code in EIGEN_MPL2_ONLY mode
-#endif
diff --git a/inst/include/Eigen/src/Core/util/ReenableStupidWarnings.h b/inst/include/Eigen/src/Core/util/ReenableStupidWarnings.h
index 5ddfbd4a..0af5a430 100644
--- a/inst/include/Eigen/src/Core/util/ReenableStupidWarnings.h
+++ b/inst/include/Eigen/src/Core/util/ReenableStupidWarnings.h
@@ -1,14 +1,44 @@
-#ifdef EIGEN_WARNINGS_DISABLED
+#ifdef EIGEN_WARNINGS_DISABLED_2
+// "DisableStupidWarnings.h" was included twice recursively: Do not re-enable warnings yet!
+#undef EIGEN_WARNINGS_DISABLED_2
+
+#elif defined(EIGEN_WARNINGS_DISABLED)
 #undef EIGEN_WARNINGS_DISABLED
 
 #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
-  #ifdef _MSC_VER
-    #pragma warning( pop )
-  #elif defined __INTEL_COMPILER
-    #pragma warning pop
-  #elif defined __clang__
-    #pragma clang diagnostic pop
-  #endif
+#ifdef _MSC_VER
+#pragma warning(pop)
+#ifdef EIGEN_REENABLE_CXX23_DENORM_DEPRECATION_WARNING
+#undef EIGEN_REENABLE_CXX23_DENORM_DEPRECATION_WARNING
+#undef _SILENCE_CXX23_DENORM_DEPRECATION_WARNING
+#endif
+
+#elif defined __INTEL_COMPILER
+#pragma warning pop
+#elif defined __clang__
+#pragma clang diagnostic pop
+#elif defined __GNUC__ && !defined(__FUJITSU) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
+#pragma GCC diagnostic pop
+#endif
+
+#if defined __NVCC__
+//    Don't re-enable the diagnostic messages, as it turns out these messages need
+//    to be disabled at the point of the template instantiation (i.e the user code)
+//    otherwise they'll be triggered by nvcc.
+//    #define EIGEN_MAKE_PRAGMA(X) _Pragma(#X)
+//    #if __NVCC_DIAG_PRAGMA_SUPPORT__
+//      #define EIGEN_NV_DIAG_DEFAULT(X) EIGEN_MAKE_PRAGMA(nv_diag_default X)
+//    #else
+//      #define EIGEN_NV_DIAG_DEFAULT(X) EIGEN_MAKE_PRAGMA(diag_default X)
+//    #endif
+//    EIGEN_NV_DIAG_DEFAULT(code_is_unreachable)
+//    EIGEN_NV_DIAG_DEFAULT(initialization_not_reachable)
+//    EIGEN_NV_DIAG_DEFAULT(2651)
+//    EIGEN_NV_DIAG_DEFAULT(2653)
+//    #undef EIGEN_NV_DIAG_DEFAULT
+//    #undef EIGEN_MAKE_PRAGMA
+#endif
+
 #endif
 
-#endif // EIGEN_WARNINGS_DISABLED
+#endif  // EIGEN_WARNINGS_DISABLED
diff --git a/inst/include/Eigen/src/Core/util/ReshapedHelper.h b/inst/include/Eigen/src/Core/util/ReshapedHelper.h
new file mode 100644
index 00000000..17479505
--- /dev/null
+++ b/inst/include/Eigen/src/Core/util/ReshapedHelper.h
@@ -0,0 +1,51 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_RESHAPED_HELPER_H
+#define EIGEN_RESHAPED_HELPER_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+enum AutoSize_t { AutoSize };
+const int AutoOrder = 2;
+
+namespace internal {
+
+template <typename SizeType, typename OtherSize, int TotalSize>
+struct get_compiletime_reshape_size {
+  enum { value = get_fixed_value<SizeType>::value };
+};
+
+template <typename SizeType>
+Index get_runtime_reshape_size(SizeType size, Index /*other*/, Index /*total*/) {
+  return internal::get_runtime_value(size);
+}
+
+template <typename OtherSize, int TotalSize>
+struct get_compiletime_reshape_size<AutoSize_t, OtherSize, TotalSize> {
+  enum {
+    other_size = get_fixed_value<OtherSize>::value,
+    value = (TotalSize == Dynamic || other_size == Dynamic) ? Dynamic : TotalSize / other_size
+  };
+};
+
+inline Index get_runtime_reshape_size(AutoSize_t /*size*/, Index other, Index total) { return total / other; }
+
+constexpr int get_compiletime_reshape_order(int flags, int order) {
+  return order == AutoOrder ? flags & RowMajorBit : order;
+}
+
+}  // namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_RESHAPED_HELPER_H
diff --git a/inst/include/Eigen/src/Core/util/Serializer.h b/inst/include/Eigen/src/Core/util/Serializer.h
new file mode 100644
index 00000000..dc3bd130
--- /dev/null
+++ b/inst/include/Eigen/src/Core/util/Serializer.h
@@ -0,0 +1,209 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2021 The Eigen Team
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SERIALIZER_H
+#define EIGEN_SERIALIZER_H
+
+#include <type_traits>
+
+// The Serializer class encodes data into a memory buffer so it can be later
+// reconstructed. This is mainly used to send objects back-and-forth between
+// the CPU and GPU.
+
+namespace Eigen {
+
+/**
+ * Serializes an object to a memory buffer.
+ *
+ * Useful for transferring data (e.g. back-and-forth to a device).
+ */
+template <typename T, typename EnableIf = void>
+class Serializer;
+
+// Specialization for POD types.
+template <typename T>
+class Serializer<T,
+                 typename std::enable_if_t<std::is_trivially_copyable<T>::value && std::is_standard_layout<T>::value>> {
+ public:
+  /**
+   * Determines the required size of the serialization buffer for a value.
+   *
+   * \param value the value to serialize.
+   * \return the required size.
+   */
+  EIGEN_DEVICE_FUNC size_t size(const T& value) const { return sizeof(value); }
+
+  /**
+   * Serializes a value to a byte buffer.
+   * \param dest the destination buffer; if this is nullptr, does nothing.
+   * \param end the end of the destination buffer.
+   * \param value the value to serialize.
+   * \return the next memory address past the end of the serialized data.
+   */
+  EIGEN_DEVICE_FUNC uint8_t* serialize(uint8_t* dest, uint8_t* end, const T& value) {
+    if (EIGEN_PREDICT_FALSE(dest == nullptr)) return nullptr;
+    if (EIGEN_PREDICT_FALSE(dest + sizeof(value) > end)) return nullptr;
+    EIGEN_USING_STD(memcpy)
+    memcpy(dest, &value, sizeof(value));
+    return dest + sizeof(value);
+  }
+
+  /**
+   * Deserializes a value from a byte buffer.
+   * \param src the source buffer; if this is nullptr, does nothing.
+   * \param end the end of the source buffer.
+   * \param value the value to populate.
+   * \return the next unprocessed memory address; nullptr if parsing errors are detected.
+   */
+  EIGEN_DEVICE_FUNC const uint8_t* deserialize(const uint8_t* src, const uint8_t* end, T& value) const {
+    if (EIGEN_PREDICT_FALSE(src == nullptr)) return nullptr;
+    if (EIGEN_PREDICT_FALSE(src + sizeof(value) > end)) return nullptr;
+    EIGEN_USING_STD(memcpy)
+    memcpy(&value, src, sizeof(value));
+    return src + sizeof(value);
+  }
+};
+
+// Specialization for DenseBase.
+// Serializes [rows, cols, data...].
+template <typename Derived>
+class Serializer<DenseBase<Derived>, void> {
+ public:
+  typedef typename Derived::Scalar Scalar;
+
+  struct Header {
+    typename Derived::Index rows;
+    typename Derived::Index cols;
+  };
+
+  EIGEN_DEVICE_FUNC size_t size(const Derived& value) const { return sizeof(Header) + sizeof(Scalar) * value.size(); }
+
+  EIGEN_DEVICE_FUNC uint8_t* serialize(uint8_t* dest, uint8_t* end, const Derived& value) {
+    if (EIGEN_PREDICT_FALSE(dest == nullptr)) return nullptr;
+    if (EIGEN_PREDICT_FALSE(dest + size(value) > end)) return nullptr;
+    const size_t header_bytes = sizeof(Header);
+    const size_t data_bytes = sizeof(Scalar) * value.size();
+    Header header = {value.rows(), value.cols()};
+    EIGEN_USING_STD(memcpy)
+    memcpy(dest, &header, header_bytes);
+    dest += header_bytes;
+    memcpy(dest, value.data(), data_bytes);
+    return dest + data_bytes;
+  }
+
+  EIGEN_DEVICE_FUNC const uint8_t* deserialize(const uint8_t* src, const uint8_t* end, Derived& value) const {
+    if (EIGEN_PREDICT_FALSE(src == nullptr)) return nullptr;
+    if (EIGEN_PREDICT_FALSE(src + sizeof(Header) > end)) return nullptr;
+    const size_t header_bytes = sizeof(Header);
+    Header header;
+    EIGEN_USING_STD(memcpy)
+    memcpy(&header, src, header_bytes);
+    src += header_bytes;
+    const size_t data_bytes = sizeof(Scalar) * header.rows * header.cols;
+    if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr;
+    value.resize(header.rows, header.cols);
+    memcpy(value.data(), src, data_bytes);
+    return src + data_bytes;
+  }
+};
+
+template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+class Serializer<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>
+    : public Serializer<DenseBase<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>> {};
+
+template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+class Serializer<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>
+    : public Serializer<DenseBase<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>>> {};
+
+namespace internal {
+
+// Recursive serialization implementation helper.
+template <size_t N, typename... Types>
+struct serialize_impl;
+
+template <size_t N, typename T1, typename... Ts>
+struct serialize_impl<N, T1, Ts...> {
+  using Serializer = Eigen::Serializer<typename std::decay<T1>::type>;
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t serialize_size(const T1& value, const Ts&... args) {
+    Serializer serializer;
+    size_t size = serializer.size(value);
+    return size + serialize_impl<N - 1, Ts...>::serialize_size(args...);
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint8_t* serialize(uint8_t* dest, uint8_t* end, const T1& value,
+                                                                  const Ts&... args) {
+    Serializer serializer;
+    dest = serializer.serialize(dest, end, value);
+    return serialize_impl<N - 1, Ts...>::serialize(dest, end, args...);
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const uint8_t* deserialize(const uint8_t* src, const uint8_t* end,
+                                                                          T1& value, Ts&... args) {
+    Serializer serializer;
+    src = serializer.deserialize(src, end, value);
+    return serialize_impl<N - 1, Ts...>::deserialize(src, end, args...);
+  }
+};
+
+// Base case.
+template <>
+struct serialize_impl<0> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t serialize_size() { return 0; }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint8_t* serialize(uint8_t* dest, uint8_t* /*end*/) { return dest; }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const uint8_t* deserialize(const uint8_t* src, const uint8_t* /*end*/) {
+    return src;
+  }
+};
+
+}  // namespace internal
+
+/**
+ * Determine the buffer size required to serialize a set of values.
+ *
+ * \param args ... arguments to serialize in sequence.
+ * \return the total size of the required buffer.
+ */
+template <typename... Args>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t serialize_size(const Args&... args) {
+  return internal::serialize_impl<sizeof...(args), Args...>::serialize_size(args...);
+}
+
+/**
+ * Serialize a set of values to the byte buffer.
+ *
+ * \param dest output byte buffer; if this is nullptr, does nothing.
+ * \param end the end of the output byte buffer.
+ * \param args ... arguments to serialize in sequence.
+ * \return the next address after all serialized values.
+ */
+template <typename... Args>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint8_t* serialize(uint8_t* dest, uint8_t* end, const Args&... args) {
+  return internal::serialize_impl<sizeof...(args), Args...>::serialize(dest, end, args...);
+}
+
+/**
+ * Deserialize a set of values from the byte buffer.
+ *
+ * \param src input byte buffer; if this is nullptr, does nothing.
+ * \param end the end of input byte buffer.
+ * \param args ... arguments to deserialize in sequence.
+ * \return the next address after all parsed values; nullptr if parsing errors are detected.
+ */
+template <typename... Args>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const uint8_t* deserialize(const uint8_t* src, const uint8_t* end,
+                                                                 Args&... args) {
+  return internal::serialize_impl<sizeof...(args), Args...>::deserialize(src, end, args...);
+}
+
+}  // namespace Eigen
+
+#endif  // EIGEN_SERIALIZER_H
diff --git a/inst/include/Eigen/src/Core/util/StaticAssert.h b/inst/include/Eigen/src/Core/util/StaticAssert.h
index bac5d9fe..f0623540 100644
--- a/inst/include/Eigen/src/Core/util/StaticAssert.h
+++ b/inst/include/Eigen/src/Core/util/StaticAssert.h
@@ -16,193 +16,90 @@
  *  - in EIGEN_STATIC_ASSERT(CONDITION,MSG) the parameter CONDITION must be a compile time boolean
  *    expression, and MSG an enum listed in struct internal::static_assertion<true>
  *
- *  - define EIGEN_NO_STATIC_ASSERT to disable them (and save compilation time)
- *    in that case, the static assertion is converted to the following runtime assert:
- *      eigen_assert(CONDITION && "MSG")
- *
  *  - currently EIGEN_STATIC_ASSERT can only be used in function scope
  *
  */
 
+#ifndef EIGEN_STATIC_ASSERT
 #ifndef EIGEN_NO_STATIC_ASSERT
 
-  #if defined(__GXX_EXPERIMENTAL_CXX0X__) || (defined(_MSC_VER) && (_MSC_VER >= 1600))
-
-    // if native static_assert is enabled, let's use it
-    #define EIGEN_STATIC_ASSERT(X,MSG) static_assert(X,#MSG);
-
-  #else // not CXX0X
-
-    namespace Eigen {
-
-    namespace internal {
-
-    template<bool condition>
-    struct static_assertion {};
-
-    template<>
-    struct static_assertion<true>
-    {
-      enum {
-        YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX,
-        YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES,
-        YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES,
-        THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE,
-        THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE,
-        THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE,
-        YOU_MADE_A_PROGRAMMING_MISTAKE,
-        EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT,
-        EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE,
-        YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR,
-        YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR,
-        UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC,
-        THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES,
-        FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED,
-        NUMERIC_TYPE_MUST_BE_REAL,
-        COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED,
-        WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED,
-        THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE,
-        INVALID_MATRIX_PRODUCT,
-        INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS,
-        INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION,
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY,
-        THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES,
-        THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES,
-        INVALID_MATRIX_TEMPLATE_PARAMETERS,
-        INVALID_MATRIXBASE_TEMPLATE_PARAMETERS,
-        BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER,
-        THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX,
-        THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE,
-        THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES,
-        YOU_ALREADY_SPECIFIED_THIS_STRIDE,
-        INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION,
-        THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD,
-        PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1,
-        THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS,
-        YOU_CANNOT_MIX_ARRAYS_AND_MATRICES,
-        YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION,
-        THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY,
-        YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT,
-        THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS,
-        THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL,
-        THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES,
-        YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED,
-        YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED,
-        THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE,
-        THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH,
-        OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG,
-        IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY,
-        STORAGE_LAYOUT_DOES_NOT_MATCH
-      };
-    };
-
-    } // end namespace internal
-
-    } // end namespace Eigen
-
-    // Specialized implementation for MSVC to avoid "conditional
-    // expression is constant" warnings.  This implementation doesn't
-    // appear to work under GCC, hence the multiple implementations.
-    #ifdef _MSC_VER
-
-      #define EIGEN_STATIC_ASSERT(CONDITION,MSG) \
-        {Eigen::internal::static_assertion<bool(CONDITION)>::MSG;}
-
-    #else
-
-      #define EIGEN_STATIC_ASSERT(CONDITION,MSG) \
-        if (Eigen::internal::static_assertion<bool(CONDITION)>::MSG) {}
-
-    #endif
-
-  #endif // not CXX0X
-
-#else // EIGEN_NO_STATIC_ASSERT
-
-  #define EIGEN_STATIC_ASSERT(CONDITION,MSG) eigen_assert((CONDITION) && #MSG);
-
-#endif // EIGEN_NO_STATIC_ASSERT
+#define EIGEN_STATIC_ASSERT(X, MSG) static_assert(X, #MSG);
+
+#else  // EIGEN_NO_STATIC_ASSERT
+
+#define EIGEN_STATIC_ASSERT(CONDITION, MSG)
 
+#endif  // EIGEN_NO_STATIC_ASSERT
+#endif  // EIGEN_STATIC_ASSERT
 
 // static assertion failing if the type \a TYPE is not a vector type
 #define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) \
-  EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime, \
-                      YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX)
+  EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime, YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX)
 
 // static assertion failing if the type \a TYPE is not fixed-size
-#define EIGEN_STATIC_ASSERT_FIXED_SIZE(TYPE) \
-  EIGEN_STATIC_ASSERT(TYPE::SizeAtCompileTime!=Eigen::Dynamic, \
+#define EIGEN_STATIC_ASSERT_FIXED_SIZE(TYPE)                     \
+  EIGEN_STATIC_ASSERT(TYPE::SizeAtCompileTime != Eigen::Dynamic, \
                       YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR)
 
 // static assertion failing if the type \a TYPE is not dynamic-size
-#define EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(TYPE) \
-  EIGEN_STATIC_ASSERT(TYPE::SizeAtCompileTime==Eigen::Dynamic, \
+#define EIGEN_STATIC_ASSERT_DYNAMIC_SIZE(TYPE)                   \
+  EIGEN_STATIC_ASSERT(TYPE::SizeAtCompileTime == Eigen::Dynamic, \
                       YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR)
 
 // static assertion failing if the type \a TYPE is not a vector type of the given size
-#define EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(TYPE, SIZE) \
-  EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime && TYPE::SizeAtCompileTime==SIZE, \
+#define EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(TYPE, SIZE)                         \
+  EIGEN_STATIC_ASSERT(TYPE::IsVectorAtCompileTime&& TYPE::SizeAtCompileTime == SIZE, \
                       THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE)
 
 // static assertion failing if the type \a TYPE is not a vector type of the given size
-#define EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(TYPE, ROWS, COLS) \
-  EIGEN_STATIC_ASSERT(TYPE::RowsAtCompileTime==ROWS && TYPE::ColsAtCompileTime==COLS, \
+#define EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(TYPE, ROWS, COLS)                        \
+  EIGEN_STATIC_ASSERT(TYPE::RowsAtCompileTime == ROWS && TYPE::ColsAtCompileTime == COLS, \
                       THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE)
 
 // static assertion failing if the two vector expression types are not compatible (same fixed-size or dynamic size)
-#define EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(TYPE0,TYPE1) \
-  EIGEN_STATIC_ASSERT( \
-      (int(TYPE0::SizeAtCompileTime)==Eigen::Dynamic \
-    || int(TYPE1::SizeAtCompileTime)==Eigen::Dynamic \
-    || int(TYPE0::SizeAtCompileTime)==int(TYPE1::SizeAtCompileTime)),\
-    YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES)
-
-#define EIGEN_PREDICATE_SAME_MATRIX_SIZE(TYPE0,TYPE1) \
-     ( \
-        (int(TYPE0::SizeAtCompileTime)==0 && int(TYPE1::SizeAtCompileTime)==0) \
-    || (\
-          (int(TYPE0::RowsAtCompileTime)==Eigen::Dynamic \
-        || int(TYPE1::RowsAtCompileTime)==Eigen::Dynamic \
-        || int(TYPE0::RowsAtCompileTime)==int(TYPE1::RowsAtCompileTime)) \
-      &&  (int(TYPE0::ColsAtCompileTime)==Eigen::Dynamic \
-        || int(TYPE1::ColsAtCompileTime)==Eigen::Dynamic \
-        || int(TYPE0::ColsAtCompileTime)==int(TYPE1::ColsAtCompileTime))\
-       ) \
-     )
-
-#ifdef EIGEN2_SUPPORT
-  #define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \
-    eigen_assert(!NumTraits<Scalar>::IsInteger);
-#else
-  #define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \
-    EIGEN_STATIC_ASSERT(!NumTraits<TYPE>::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
-#endif
-
-
-// static assertion failing if it is guaranteed at compile-time that the two matrix expression types have different sizes
-#define EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(TYPE0,TYPE1) \
-  EIGEN_STATIC_ASSERT( \
-     EIGEN_PREDICATE_SAME_MATRIX_SIZE(TYPE0,TYPE1),\
-    YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES)
-
-#define EIGEN_STATIC_ASSERT_SIZE_1x1(TYPE) \
-      EIGEN_STATIC_ASSERT((TYPE::RowsAtCompileTime == 1 || TYPE::RowsAtCompileTime == Dynamic) && \
-                          (TYPE::ColsAtCompileTime == 1 || TYPE::ColsAtCompileTime == Dynamic), \
-                          THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS)
+#define EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(TYPE0, TYPE1)                                                   \
+  EIGEN_STATIC_ASSERT(                                                                                       \
+      (int(TYPE0::SizeAtCompileTime) == Eigen::Dynamic || int(TYPE1::SizeAtCompileTime) == Eigen::Dynamic || \
+       int(TYPE0::SizeAtCompileTime) == int(TYPE1::SizeAtCompileTime)),                                      \
+      YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES)
+
+#define EIGEN_PREDICATE_SAME_MATRIX_SIZE(TYPE0, TYPE1)                                                     \
+  ((int(Eigen::internal::size_of_xpr_at_compile_time<TYPE0>::ret) == 0 &&                                  \
+    int(Eigen::internal::size_of_xpr_at_compile_time<TYPE1>::ret) == 0) ||                                 \
+   ((int(TYPE0::RowsAtCompileTime) == Eigen::Dynamic || int(TYPE1::RowsAtCompileTime) == Eigen::Dynamic || \
+     int(TYPE0::RowsAtCompileTime) == int(TYPE1::RowsAtCompileTime)) &&                                    \
+    (int(TYPE0::ColsAtCompileTime) == Eigen::Dynamic || int(TYPE1::ColsAtCompileTime) == Eigen::Dynamic || \
+     int(TYPE0::ColsAtCompileTime) == int(TYPE1::ColsAtCompileTime))))
+
+#define EIGEN_STATIC_ASSERT_NON_INTEGER(TYPE) \
+  EIGEN_STATIC_ASSERT(!Eigen::NumTraits<TYPE>::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
+
+// static assertion failing if it is guaranteed at compile-time that the two matrix expression types have different
+// sizes
+#define EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(TYPE0, TYPE1) \
+  EIGEN_STATIC_ASSERT(EIGEN_PREDICATE_SAME_MATRIX_SIZE(TYPE0, TYPE1), YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES)
+
+#define EIGEN_STATIC_ASSERT_SIZE_1x1(TYPE)                                                             \
+  EIGEN_STATIC_ASSERT((TYPE::RowsAtCompileTime == 1 || TYPE::RowsAtCompileTime == Eigen::Dynamic) &&   \
+                          (TYPE::ColsAtCompileTime == 1 || TYPE::ColsAtCompileTime == Eigen::Dynamic), \
+                      THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS)
 
 #define EIGEN_STATIC_ASSERT_LVALUE(Derived) \
-      EIGEN_STATIC_ASSERT(internal::is_lvalue<Derived>::value, \
-                          THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY)
+  EIGEN_STATIC_ASSERT(Eigen::internal::is_lvalue<Derived>::value, THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY)
 
-#define EIGEN_STATIC_ASSERT_ARRAYXPR(Derived) \
-      EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Derived>::XprKind, ArrayXpr>::value), \
-                          THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES)
+#define EIGEN_STATIC_ASSERT_ARRAYXPR(Derived)                                                                          \
+  EIGEN_STATIC_ASSERT((Eigen::internal::is_same<typename Eigen::internal::traits<Derived>::XprKind, ArrayXpr>::value), \
+                      THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES)
 
-#define EIGEN_STATIC_ASSERT_SAME_XPR_KIND(Derived1, Derived2) \
-      EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Derived1>::XprKind, \
-                                             typename internal::traits<Derived2>::XprKind \
-                                            >::value), \
-                          YOU_CANNOT_MIX_ARRAYS_AND_MATRICES)
+#define EIGEN_STATIC_ASSERT_SAME_XPR_KIND(Derived1, Derived2)                                                 \
+  EIGEN_STATIC_ASSERT((Eigen::internal::is_same<typename Eigen::internal::traits<Derived1>::XprKind,          \
+                                                typename Eigen::internal::traits<Derived2>::XprKind>::value), \
+                      YOU_CANNOT_MIX_ARRAYS_AND_MATRICES)
 
+// Check that a cost value is positive, and that is stay within a reasonable range
+// TODO this check could be enabled for internal debugging only
+#define EIGEN_INTERNAL_CHECK_COST_VALUE(C)                    \
+  EIGEN_STATIC_ASSERT((C) >= 0 && (C) <= HugeCost * HugeCost, \
+                      EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE);
 
-#endif // EIGEN_STATIC_ASSERT_H
+#endif  // EIGEN_STATIC_ASSERT_H
diff --git a/inst/include/Eigen/src/Core/util/SymbolicIndex.h b/inst/include/Eigen/src/Core/util/SymbolicIndex.h
new file mode 100644
index 00000000..dc204af4
--- /dev/null
+++ b/inst/include/Eigen/src/Core/util/SymbolicIndex.h
@@ -0,0 +1,445 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SYMBOLIC_INDEX_H
+#define EIGEN_SYMBOLIC_INDEX_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \namespace Eigen::symbolic
+ * \ingroup Core_Module
+ *
+ * This namespace defines a set of classes and functions to build and evaluate symbolic expressions of scalar type
+ * Index. Here is a simple example:
+ *
+ * \code
+ * // First step, defines symbols:
+ * struct x_tag {};  static const symbolic::SymbolExpr<x_tag> x;
+ * struct y_tag {};  static const symbolic::SymbolExpr<y_tag> y;
+ * struct z_tag {};  static const symbolic::SymbolExpr<z_tag> z;
+ *
+ * // Defines an expression:
+ * auto expr = (x+3)/y+z;
+ *
+ * // And evaluate it: (c++14)
+ * std::cout << expr.eval(x=6,y=3,z=-13) << "\n";
+ *
+ * \endcode
+ *
+ * It is currently only used internally to define and manipulate the
+ * Eigen::placeholders::last and Eigen::placeholders::lastp1 symbols in
+ * Eigen::seq and Eigen::seqN.
+ *
+ */
+namespace symbolic {
+
+template <typename Tag>
+class Symbol;
+template <typename Tag, typename Type>
+class SymbolValue;
+template <typename Arg0>
+class NegateExpr;
+template <typename Arg1, typename Arg2>
+class AddExpr;
+template <typename Arg1, typename Arg2>
+class ProductExpr;
+template <typename Arg1, typename Arg2>
+class QuotientExpr;
+template <typename IndexType = Index>
+class ValueExpr;
+
+/** \class BaseExpr
+ * \ingroup Core_Module
+ * Common base class of any symbolic expressions
+ */
+template <typename Derived_>
+class BaseExpr {
+ public:
+  using Derived = Derived_;
+  constexpr const Derived& derived() const { return *static_cast<const Derived*>(this); }
+
+  /** Evaluate the expression given the \a values of the symbols.
+   *
+   * \param values defines the values of the symbols, as constructed by SymbolExpr::operator= operator.
+   *
+   */
+  template <typename... Tags, typename... Types>
+  constexpr Index eval(const SymbolValue<Tags, Types>&... values) const {
+    return derived().eval_impl(values...);
+  }
+
+  /** Evaluate the expression at compile time given the \a values of the symbols.
+   *
+   * If a value is not known at compile-time, returns Eigen::Undefined.
+   *
+   */
+  template <typename... Tags, typename... Types>
+  static constexpr Index eval_at_compile_time(const SymbolValue<Tags, Types>&...) {
+    return Derived::eval_at_compile_time_impl(SymbolValue<Tags, Types>{}...);
+  }
+
+  constexpr NegateExpr<Derived> operator-() const { return NegateExpr<Derived>(derived()); }
+
+  constexpr AddExpr<Derived, ValueExpr<>> operator+(Index b) const {
+    return AddExpr<Derived, ValueExpr<>>(derived(), b);
+  }
+  constexpr AddExpr<Derived, ValueExpr<>> operator-(Index a) const {
+    return AddExpr<Derived, ValueExpr<>>(derived(), -a);
+  }
+  constexpr ProductExpr<Derived, ValueExpr<>> operator*(Index a) const {
+    return ProductExpr<Derived, ValueExpr<>>(derived(), a);
+  }
+  constexpr QuotientExpr<Derived, ValueExpr<>> operator/(Index a) const {
+    return QuotientExpr<Derived, ValueExpr<>>(derived(), a);
+  }
+
+  friend constexpr AddExpr<Derived, ValueExpr<>> operator+(Index a, const BaseExpr& b) {
+    return AddExpr<Derived, ValueExpr<>>(b.derived(), a);
+  }
+  friend constexpr AddExpr<NegateExpr<Derived>, ValueExpr<>> operator-(Index a, const BaseExpr& b) {
+    return AddExpr<NegateExpr<Derived>, ValueExpr<>>(-b.derived(), a);
+  }
+  friend constexpr ProductExpr<ValueExpr<>, Derived> operator*(Index a, const BaseExpr& b) {
+    return ProductExpr<ValueExpr<>, Derived>(a, b.derived());
+  }
+  friend constexpr QuotientExpr<ValueExpr<>, Derived> operator/(Index a, const BaseExpr& b) {
+    return QuotientExpr<ValueExpr<>, Derived>(a, b.derived());
+  }
+
+  template <int N>
+  constexpr AddExpr<Derived, ValueExpr<internal::FixedInt<N>>> operator+(internal::FixedInt<N>) const {
+    return AddExpr<Derived, ValueExpr<internal::FixedInt<N>>>(derived(), ValueExpr<internal::FixedInt<N>>());
+  }
+  template <int N>
+  constexpr AddExpr<Derived, ValueExpr<internal::FixedInt<-N>>> operator-(internal::FixedInt<N>) const {
+    return AddExpr<Derived, ValueExpr<internal::FixedInt<-N>>>(derived(), ValueExpr<internal::FixedInt<-N>>());
+  }
+  template <int N>
+  constexpr ProductExpr<Derived, ValueExpr<internal::FixedInt<N>>> operator*(internal::FixedInt<N>) const {
+    return ProductExpr<Derived, ValueExpr<internal::FixedInt<N>>>(derived(), ValueExpr<internal::FixedInt<N>>());
+  }
+  template <int N>
+  constexpr QuotientExpr<Derived, ValueExpr<internal::FixedInt<N>>> operator/(internal::FixedInt<N>) const {
+    return QuotientExpr<Derived, ValueExpr<internal::FixedInt<N>>>(derived(), ValueExpr<internal::FixedInt<N>>());
+  }
+
+  template <int N>
+  friend constexpr AddExpr<Derived, ValueExpr<internal::FixedInt<N>>> operator+(internal::FixedInt<N>,
+                                                                                const BaseExpr& b) {
+    return AddExpr<Derived, ValueExpr<internal::FixedInt<N>>>(b.derived(), ValueExpr<internal::FixedInt<N>>());
+  }
+  template <int N>
+  friend constexpr AddExpr<NegateExpr<Derived>, ValueExpr<internal::FixedInt<N>>> operator-(internal::FixedInt<N>,
+                                                                                            const BaseExpr& b) {
+    return AddExpr<NegateExpr<Derived>, ValueExpr<internal::FixedInt<N>>>(-b.derived(),
+                                                                          ValueExpr<internal::FixedInt<N>>());
+  }
+  template <int N>
+  friend constexpr ProductExpr<ValueExpr<internal::FixedInt<N>>, Derived> operator*(internal::FixedInt<N>,
+                                                                                    const BaseExpr& b) {
+    return ProductExpr<ValueExpr<internal::FixedInt<N>>, Derived>(ValueExpr<internal::FixedInt<N>>(), b.derived());
+  }
+  template <int N>
+  friend constexpr QuotientExpr<ValueExpr<internal::FixedInt<N>>, Derived> operator/(internal::FixedInt<N>,
+                                                                                     const BaseExpr& b) {
+    return QuotientExpr<ValueExpr<internal::FixedInt<N>>, Derived>(ValueExpr<internal::FixedInt<N>>(), b.derived());
+  }
+
+  template <typename OtherDerived>
+  constexpr AddExpr<Derived, OtherDerived> operator+(const BaseExpr<OtherDerived>& b) const {
+    return AddExpr<Derived, OtherDerived>(derived(), b.derived());
+  }
+
+  template <typename OtherDerived>
+  constexpr AddExpr<Derived, NegateExpr<OtherDerived>> operator-(const BaseExpr<OtherDerived>& b) const {
+    return AddExpr<Derived, NegateExpr<OtherDerived>>(derived(), -b.derived());
+  }
+
+  template <typename OtherDerived>
+  constexpr ProductExpr<Derived, OtherDerived> operator*(const BaseExpr<OtherDerived>& b) const {
+    return ProductExpr<Derived, OtherDerived>(derived(), b.derived());
+  }
+
+  template <typename OtherDerived>
+  constexpr QuotientExpr<Derived, OtherDerived> operator/(const BaseExpr<OtherDerived>& b) const {
+    return QuotientExpr<Derived, OtherDerived>(derived(), b.derived());
+  }
+};
+
+template <typename T>
+struct is_symbolic {
+  // BaseExpr has no conversion ctor, so we only have to check whether T can be statically cast to its base class
+  // BaseExpr<T>.
+  enum { value = internal::is_convertible<T, BaseExpr<T>>::value };
+};
+
+// A simple wrapper around an integral value to provide the eval method.
+// We could also use a free-function symbolic_eval...
+template <typename IndexType>
+class ValueExpr : BaseExpr<ValueExpr<IndexType>> {
+ public:
+  constexpr ValueExpr() = default;
+  constexpr ValueExpr(IndexType val) : value_(val) {}
+  template <typename... Tags, typename... Types>
+  constexpr IndexType eval_impl(const SymbolValue<Tags, Types>&...) const {
+    return value_;
+  }
+  template <typename... Tags, typename... Types>
+  static constexpr IndexType eval_at_compile_time_impl(const SymbolValue<Tags, Types>&...) {
+    return IndexType(Undefined);
+  }
+
+ protected:
+  IndexType value_;
+};
+
+// Specialization for compile-time value,
+// It is similar to ValueExpr(N) but this version helps the compiler to generate better code.
+template <int N>
+class ValueExpr<internal::FixedInt<N>> : public BaseExpr<ValueExpr<internal::FixedInt<N>>> {
+ public:
+  constexpr ValueExpr() = default;
+  constexpr ValueExpr(internal::FixedInt<N>) {}
+  template <typename... Tags, typename... Types>
+  constexpr Index eval_impl(const SymbolValue<Tags, Types>&...) const {
+    return Index(N);
+  }
+  template <typename... Tags, typename... Types>
+  static constexpr Index eval_at_compile_time_impl(const SymbolValue<Tags, Types>&...) {
+    return Index(N);
+  }
+};
+
+/** Represents the actual value of a symbol identified by its tag
+ *
+ * It is the return type of SymbolValue::operator=, and most of the time this is only way it is used.
+ */
+template <typename Tag, typename Type>
+class SymbolValue : public BaseExpr<SymbolValue<Tag, Type>> {};
+
+template <typename Tag>
+class SymbolValue<Tag, Index> : public BaseExpr<SymbolValue<Tag, Index>> {
+ public:
+  constexpr SymbolValue() = default;
+
+  /** Default constructor from the value \a val */
+  constexpr SymbolValue(Index val) : value_(val) {}
+
+  /** \returns the stored value of the symbol */
+  constexpr Index value() const { return value_; }
+
+  /** \returns the stored value of the symbol at compile time, or Undefined if not known. */
+  static constexpr Index value_at_compile_time() { return Index(Undefined); }
+
+  template <typename... Tags, typename... Types>
+  constexpr Index eval_impl(const SymbolValue<Tags, Types>&...) const {
+    return value();
+  }
+
+  template <typename... Tags, typename... Types>
+  static constexpr Index eval_at_compile_time_impl(const SymbolValue<Tags, Types>&...) {
+    return value_at_compile_time();
+  }
+
+ protected:
+  Index value_;
+};
+
+template <typename Tag, int N>
+class SymbolValue<Tag, internal::FixedInt<N>> : public BaseExpr<SymbolValue<Tag, internal::FixedInt<N>>> {
+ public:
+  constexpr SymbolValue() = default;
+
+  /** Default constructor from the value \a val */
+  constexpr SymbolValue(internal::FixedInt<N>) {}
+
+  /** \returns the stored value of the symbol */
+  constexpr Index value() const { return static_cast<Index>(N); }
+
+  /** \returns the stored value of the symbol at compile time, or Undefined if not known. */
+  static constexpr Index value_at_compile_time() { return static_cast<Index>(N); }
+
+  template <typename... Tags, typename... Types>
+  constexpr Index eval_impl(const SymbolValue<Tags, Types>&...) const {
+    return value();
+  }
+
+  template <typename... Tags, typename... Types>
+  static constexpr Index eval_at_compile_time_impl(const SymbolValue<Tags, Types>&...) {
+    return value_at_compile_time();
+  }
+};
+
+// Find and return a symbol value based on the tag.
+template <typename Tag, typename... Types>
+struct EvalSymbolValueHelper;
+
+// Empty base case, symbol not found.
+template <typename Tag>
+struct EvalSymbolValueHelper<Tag> {
+  static constexpr Index eval_impl() {
+    eigen_assert(false && "Symbol not found.");
+    return Index(Undefined);
+  }
+  static constexpr Index eval_at_compile_time_impl() { return Index(Undefined); }
+};
+
+// We found a symbol value matching the provided Tag!
+template <typename Tag, typename Type, typename... OtherTypes>
+struct EvalSymbolValueHelper<Tag, SymbolValue<Tag, Type>, OtherTypes...> {
+  static constexpr Index eval_impl(const SymbolValue<Tag, Type>& symbol, const OtherTypes&...) {
+    return symbol.value();
+  }
+  static constexpr Index eval_at_compile_time_impl(const SymbolValue<Tag, Type>& symbol, const OtherTypes&...) {
+    return symbol.value_at_compile_time();
+  }
+};
+
+// No symbol value in first value, recursive search starting with next.
+template <typename Tag, typename T1, typename... OtherTypes>
+struct EvalSymbolValueHelper<Tag, T1, OtherTypes...> {
+  static constexpr Index eval_impl(const T1&, const OtherTypes&... values) {
+    return EvalSymbolValueHelper<Tag, OtherTypes...>::eval_impl(values...);
+  }
+  static constexpr Index eval_at_compile_time_impl(const T1&, const OtherTypes&...) {
+    return EvalSymbolValueHelper<Tag, OtherTypes...>::eval_at_compile_time_impl(OtherTypes{}...);
+  }
+};
+
+/** Expression of a symbol uniquely identified by the template parameter type \c tag */
+template <typename tag>
+class SymbolExpr : public BaseExpr<SymbolExpr<tag>> {
+ public:
+  /** Alias to the template parameter \c tag */
+  typedef tag Tag;
+
+  constexpr SymbolExpr() = default;
+
+  /** Associate the value \a val to the given symbol \c *this, uniquely identified by its \c Tag.
+   *
+   * The returned object should be passed to ExprBase::eval() to evaluate a given expression with this specified
+   * runtime-time value.
+   */
+  constexpr SymbolValue<Tag, Index> operator=(Index val) const { return SymbolValue<Tag, Index>(val); }
+
+  template <int N>
+  constexpr SymbolValue<Tag, internal::FixedInt<N>> operator=(internal::FixedInt<N>) const {
+    return SymbolValue<Tag, internal::FixedInt<N>>{internal::FixedInt<N>{}};
+  }
+
+  template <typename... Tags, typename... Types>
+  constexpr Index eval_impl(const SymbolValue<Tags, Types>&... values) const {
+    return EvalSymbolValueHelper<Tag, SymbolValue<Tags, Types>...>::eval_impl(values...);
+  }
+
+  template <typename... Tags, typename... Types>
+  static constexpr Index eval_at_compile_time_impl(const SymbolValue<Tags, Types>&...) {
+    return EvalSymbolValueHelper<Tag, SymbolValue<Tags, Types>...>::eval_at_compile_time_impl(
+        SymbolValue<Tags, Types>{}...);
+  }
+};
+
+template <typename Arg0>
+class NegateExpr : public BaseExpr<NegateExpr<Arg0>> {
+ public:
+  constexpr NegateExpr() = default;
+  constexpr NegateExpr(const Arg0& arg0) : m_arg0(arg0) {}
+
+  template <typename... Tags, typename... Types>
+  constexpr Index eval_impl(const SymbolValue<Tags, Types>&... values) const {
+    return -m_arg0.eval_impl(values...);
+  }
+
+  template <typename... Tags, typename... Types>
+  static constexpr Index eval_at_compile_time_impl(const SymbolValue<Tags, Types>&...) {
+    constexpr Index v = Arg0::eval_at_compile_time_impl(SymbolValue<Tags, Types>{}...);
+    return (v == Undefined) ? Undefined : -v;
+  }
+
+ protected:
+  Arg0 m_arg0;
+};
+
+template <typename Arg0, typename Arg1>
+class AddExpr : public BaseExpr<AddExpr<Arg0, Arg1>> {
+ public:
+  constexpr AddExpr() = default;
+  constexpr AddExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {}
+
+  template <typename... Tags, typename... Types>
+  constexpr Index eval_impl(const SymbolValue<Tags, Types>&... values) const {
+    return m_arg0.eval_impl(values...) + m_arg1.eval_impl(values...);
+  }
+
+  template <typename... Tags, typename... Types>
+  static constexpr Index eval_at_compile_time_impl(const SymbolValue<Tags, Types>&...) {
+    constexpr Index v0 = Arg0::eval_at_compile_time_impl(SymbolValue<Tags, Types>{}...);
+    constexpr Index v1 = Arg1::eval_at_compile_time_impl(SymbolValue<Tags, Types>{}...);
+    return (v0 == Undefined || v1 == Undefined) ? Undefined : v0 + v1;
+  }
+
+ protected:
+  Arg0 m_arg0;
+  Arg1 m_arg1;
+};
+
+template <typename Arg0, typename Arg1>
+class ProductExpr : public BaseExpr<ProductExpr<Arg0, Arg1>> {
+ public:
+  constexpr ProductExpr() = default;
+  constexpr ProductExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {}
+
+  template <typename... Tags, typename... Types>
+  constexpr Index eval_impl(const SymbolValue<Tags, Types>&... values) const {
+    return m_arg0.eval_impl(values...) * m_arg1.eval_impl(values...);
+  }
+
+  template <typename... Tags, typename... Types>
+  static constexpr Index eval_at_compile_time_impl(const SymbolValue<Tags, Types>&...) {
+    constexpr Index v0 = Arg0::eval_at_compile_time_impl(SymbolValue<Tags, Types>{}...);
+    constexpr Index v1 = Arg1::eval_at_compile_time_impl(SymbolValue<Tags, Types>{}...);
+    return (v0 == Undefined || v1 == Undefined) ? Undefined : v0 * v1;
+  }
+
+ protected:
+  Arg0 m_arg0;
+  Arg1 m_arg1;
+};
+
+template <typename Arg0, typename Arg1>
+class QuotientExpr : public BaseExpr<QuotientExpr<Arg0, Arg1>> {
+ public:
+  constexpr QuotientExpr() = default;
+  constexpr QuotientExpr(const Arg0& arg0, const Arg1& arg1) : m_arg0(arg0), m_arg1(arg1) {}
+
+  template <typename... Tags, typename... Types>
+  constexpr Index eval_impl(const SymbolValue<Tags, Types>&... values) const {
+    return m_arg0.eval_impl(values...) / m_arg1.eval_impl(values...);
+  }
+
+  template <typename... Tags, typename... Types>
+  static constexpr Index eval_at_compile_time_impl(const SymbolValue<Tags, Types>&...) {
+    constexpr Index v0 = Arg0::eval_at_compile_time_impl(SymbolValue<Tags, Types>{}...);
+    constexpr Index v1 = Arg1::eval_at_compile_time_impl(SymbolValue<Tags, Types>{}...);
+    return (v0 == Undefined || v1 == Undefined) ? Undefined : v0 / v1;
+  }
+
+ protected:
+  Arg0 m_arg0;
+  Arg1 m_arg1;
+};
+
+}  // end namespace symbolic
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_SYMBOLIC_INDEX_H
diff --git a/inst/include/Eigen/src/Core/util/XprHelper.h b/inst/include/Eigen/src/Core/util/XprHelper.h
index d05f8e5f..a0e160eb 100644
--- a/inst/include/Eigen/src/Core/util/XprHelper.h
+++ b/inst/include/Eigen/src/Core/util/XprHelper.h
@@ -11,459 +11,1083 @@
 #ifndef EIGEN_XPRHELPER_H
 #define EIGEN_XPRHELPER_H
 
-// just a workaround because GCC seems to not really like empty structs
-// FIXME: gcc 4.3 generates bad code when strict-aliasing is enabled
-// so currently we simply disable this optimization for gcc 4.3
-#if (defined __GNUG__) && !((__GNUC__==4) && (__GNUC_MINOR__==3))
-  #define EIGEN_EMPTY_STRUCT_CTOR(X) \
-    EIGEN_STRONG_INLINE X() {} \
-    EIGEN_STRONG_INLINE X(const X& ) {}
-#else
-  #define EIGEN_EMPTY_STRUCT_CTOR(X)
-#endif
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
 
 namespace Eigen {
 
-typedef EIGEN_DEFAULT_DENSE_INDEX_TYPE DenseIndex;
-
 namespace internal {
 
-//classes inheriting no_assignment_operator don't generate a default operator=.
-class no_assignment_operator
-{
-  private:
-    no_assignment_operator& operator=(const no_assignment_operator&);
+// useful for unsigned / signed integer comparisons when idx is intended to be non-negative
+template <typename IndexType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename make_unsigned<IndexType>::type returnUnsignedIndexValue(
+    const IndexType& idx) {
+  EIGEN_STATIC_ASSERT((NumTraits<IndexType>::IsInteger), THIS FUNCTION IS FOR INTEGER TYPES)
+  eigen_internal_assert(idx >= 0 && "Index value is negative and target type is unsigned");
+  using UnsignedType = typename make_unsigned<IndexType>::type;
+  return static_cast<UnsignedType>(idx);
+}
+
+template <typename IndexDest, typename IndexSrc, bool IndexDestIsInteger = NumTraits<IndexDest>::IsInteger,
+          bool IndexDestIsSigned = NumTraits<IndexDest>::IsSigned,
+          bool IndexSrcIsInteger = NumTraits<IndexSrc>::IsInteger,
+          bool IndexSrcIsSigned = NumTraits<IndexSrc>::IsSigned>
+struct convert_index_impl {
+  static inline EIGEN_DEVICE_FUNC IndexDest run(const IndexSrc& idx) {
+    eigen_internal_assert(idx <= NumTraits<IndexDest>::highest() && "Index value is too big for target type");
+    return static_cast<IndexDest>(idx);
+  }
+};
+template <typename IndexDest, typename IndexSrc>
+struct convert_index_impl<IndexDest, IndexSrc, true, true, true, false> {
+  // IndexDest is a signed integer
+  // IndexSrc is an unsigned integer
+  static inline EIGEN_DEVICE_FUNC IndexDest run(const IndexSrc& idx) {
+    eigen_internal_assert(idx <= returnUnsignedIndexValue(NumTraits<IndexDest>::highest()) &&
+                          "Index value is too big for target type");
+    return static_cast<IndexDest>(idx);
+  }
+};
+template <typename IndexDest, typename IndexSrc>
+struct convert_index_impl<IndexDest, IndexSrc, true, false, true, true> {
+  // IndexDest is an unsigned integer
+  // IndexSrc is a signed integer
+  static inline EIGEN_DEVICE_FUNC IndexDest run(const IndexSrc& idx) {
+    eigen_internal_assert(returnUnsignedIndexValue(idx) <= NumTraits<IndexDest>::highest() &&
+                          "Index value is too big for target type");
+    return static_cast<IndexDest>(idx);
+  }
+};
+
+template <typename IndexDest, typename IndexSrc>
+EIGEN_DEVICE_FUNC inline IndexDest convert_index(const IndexSrc& idx) {
+  return convert_index_impl<IndexDest, IndexSrc>::run(idx);
+}
+
+// true if T can be considered as an integral index (i.e., and integral type or enum)
+template <typename T>
+struct is_valid_index_type {
+  enum { value = internal::is_integral<T>::value || std::is_enum<T>::value };
+};
+
+// true if both types are not valid index types
+template <typename RowIndices, typename ColIndices>
+struct valid_indexed_view_overload {
+  enum {
+    value = !(internal::is_valid_index_type<RowIndices>::value && internal::is_valid_index_type<ColIndices>::value)
+  };
+};
+
+// promote_scalar_arg is an helper used in operation between an expression and a scalar, like:
+//    expression * scalar
+// Its role is to determine how the type T of the scalar operand should be promoted given the scalar type ExprScalar of
+// the given expression. The IsSupported template parameter must be provided by the caller as:
+// internal::has_ReturnType<ScalarBinaryOpTraits<ExprScalar,T,op> >::value using the proper order for ExprScalar and T.
+// Then the logic is as follows:
+//  - if the operation is natively supported as defined by IsSupported, then the scalar type is not promoted, and T is
+//  returned.
+//  - otherwise, NumTraits<ExprScalar>::Literal is returned if T is implicitly convertible to
+//  NumTraits<ExprScalar>::Literal AND that this does not imply a float to integer conversion.
+//  - otherwise, ExprScalar is returned if T is implicitly convertible to ExprScalar AND that this does not imply a
+//  float to integer conversion.
+//  - In all other cases, the promoted type is not defined, and the respective operation is thus invalid and not
+//  available (SFINAE).
+template <typename ExprScalar, typename T, bool IsSupported>
+struct promote_scalar_arg;
+
+template <typename S, typename T>
+struct promote_scalar_arg<S, T, true> {
+  typedef T type;
+};
+
+// Recursively check safe conversion to PromotedType, and then ExprScalar if they are different.
+template <typename ExprScalar, typename T, typename PromotedType,
+          bool ConvertibleToLiteral = internal::is_convertible<T, PromotedType>::value,
+          bool IsSafe = NumTraits<T>::IsInteger || !NumTraits<PromotedType>::IsInteger>
+struct promote_scalar_arg_unsupported;
+
+// Start recursion with NumTraits<ExprScalar>::Literal
+template <typename S, typename T>
+struct promote_scalar_arg<S, T, false> : promote_scalar_arg_unsupported<S, T, typename NumTraits<S>::Literal> {};
+
+// We found a match!
+template <typename S, typename T, typename PromotedType>
+struct promote_scalar_arg_unsupported<S, T, PromotedType, true, true> {
+  typedef PromotedType type;
+};
+
+// No match, but no real-to-integer issues, and ExprScalar and current PromotedType are different,
+// so let's try to promote to ExprScalar
+template <typename ExprScalar, typename T, typename PromotedType>
+struct promote_scalar_arg_unsupported<ExprScalar, T, PromotedType, false, true>
+    : promote_scalar_arg_unsupported<ExprScalar, T, ExprScalar> {};
+
+// Unsafe real-to-integer, let's stop.
+template <typename S, typename T, typename PromotedType, bool ConvertibleToLiteral>
+struct promote_scalar_arg_unsupported<S, T, PromotedType, ConvertibleToLiteral, false> {};
+
+// T is not even convertible to ExprScalar, let's stop.
+template <typename S, typename T>
+struct promote_scalar_arg_unsupported<S, T, S, false, true> {};
+
+// classes inheriting no_assignment_operator don't generate a default operator=.
+class no_assignment_operator {
+ private:
+  no_assignment_operator& operator=(const no_assignment_operator&);
+
+ protected:
+  EIGEN_DEFAULT_COPY_CONSTRUCTOR(no_assignment_operator)
+  EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(no_assignment_operator)
 };
 
 /** \internal return the index type with the largest number of bits */
-template<typename I1, typename I2>
-struct promote_index_type
-{
-  typedef typename conditional<(sizeof(I1)<sizeof(I2)), I2, I1>::type type;
+template <typename I1, typename I2>
+struct promote_index_type {
+  typedef std::conditional_t<(sizeof(I1) < sizeof(I2)), I2, I1> type;
 };
 
 /** \internal If the template parameter Value is Dynamic, this class is just a wrapper around a T variable that
-  * can be accessed using value() and setValue().
-  * Otherwise, this class is an empty structure and value() just returns the template parameter Value.
-  */
-template<typename T, int Value> class variable_if_dynamic
-{
-  public:
-    EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamic)
-    explicit variable_if_dynamic(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); assert(v == T(Value)); }
-    static T value() { return T(Value); }
-    void setValue(T) {}
+ * can be accessed using value() and setValue().
+ * Otherwise, this class is an empty structure and value() just returns the template parameter Value.
+ */
+template <typename T, int Value>
+class variable_if_dynamic {
+ public:
+  EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(variable_if_dynamic)
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T v) {
+    EIGEN_ONLY_USED_FOR_DEBUG(v);
+    eigen_assert(v == T(Value));
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr T value() { return T(Value); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr operator T() const { return T(Value); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T v) const {
+    EIGEN_ONLY_USED_FOR_DEBUG(v);
+    eigen_assert(v == T(Value));
+  }
 };
 
-template<typename T> class variable_if_dynamic<T, Dynamic>
-{
-    T m_value;
-    variable_if_dynamic() { assert(false); }
-  public:
-    explicit variable_if_dynamic(T value) : m_value(value) {}
-    T value() const { return m_value; }
-    void setValue(T value) { m_value = value; }
+template <typename T>
+class variable_if_dynamic<T, Dynamic> {
+  T m_value;
+
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamic(T value = 0) noexcept : m_value(value) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T value() const { return m_value; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE operator T() const { return m_value; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; }
 };
 
 /** \internal like variable_if_dynamic but for DynamicIndex
-  */
-template<typename T, int Value> class variable_if_dynamicindex
-{
-  public:
-    EIGEN_EMPTY_STRUCT_CTOR(variable_if_dynamicindex)
-    explicit variable_if_dynamicindex(T v) { EIGEN_ONLY_USED_FOR_DEBUG(v); assert(v == T(Value)); }
-    static T value() { return T(Value); }
-    void setValue(T) {}
+ */
+template <typename T, int Value>
+class variable_if_dynamicindex {
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T v) {
+    EIGEN_ONLY_USED_FOR_DEBUG(v);
+    eigen_assert(v == T(Value));
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE constexpr T value() { return T(Value); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T) {}
 };
 
-template<typename T> class variable_if_dynamicindex<T, DynamicIndex>
-{
-    T m_value;
-    variable_if_dynamicindex() { assert(false); }
-  public:
-    explicit variable_if_dynamicindex(T value) : m_value(value) {}
-    T value() const { return m_value; }
-    void setValue(T value) { m_value = value; }
+template <typename T>
+class variable_if_dynamicindex<T, DynamicIndex> {
+  T m_value;
+  EIGEN_DEVICE_FUNC variable_if_dynamicindex() { eigen_assert(false); }
+
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit variable_if_dynamicindex(T value) : m_value(value) {}
+  EIGEN_DEVICE_FUNC T EIGEN_STRONG_INLINE value() const { return m_value; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void setValue(T value) { m_value = value; }
 };
 
-template<typename T> struct functor_traits
-{
-  enum
-  {
-    Cost = 10,
-    PacketAccess = false,
-    IsRepeatable = false
-  };
+template <typename T>
+struct functor_traits {
+  enum { Cost = 10, PacketAccess = false, IsRepeatable = false };
 };
 
-template<typename T> struct packet_traits;
+// estimates the cost of lazily evaluating a generic functor by unwinding the expression
+template <typename Xpr>
+struct nested_functor_cost {
+  static constexpr Index Cost = static_cast<Index>(functor_traits<Xpr>::Cost);
+};
 
-template<typename T> struct unpacket_traits
-{
-  typedef T type;
-  enum {size=1};
+template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct nested_functor_cost<Matrix<Scalar, Rows, Cols, Options, MaxRows, MaxCols>> {
+  static constexpr Index Cost = 1;
 };
 
-template<typename _Scalar, int _Rows, int _Cols,
-         int _Options = AutoAlign |
-                          ( (_Rows==1 && _Cols!=1) ? RowMajor
-                          : (_Cols==1 && _Rows!=1) ? ColMajor
-                          : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION ),
-         int _MaxRows = _Rows,
-         int _MaxCols = _Cols
-> class make_proper_matrix_type
-{
-    enum {
-      IsColVector = _Cols==1 && _Rows!=1,
-      IsRowVector = _Rows==1 && _Cols!=1,
-      Options = IsColVector ? (_Options | ColMajor) & ~RowMajor
-              : IsRowVector ? (_Options | RowMajor) & ~ColMajor
-              : _Options
-    };
-  public:
-    typedef Matrix<_Scalar, _Rows, _Cols, Options, _MaxRows, _MaxCols> type;
-};
-
-template<typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
-class compute_matrix_flags
-{
-    enum {
-      row_major_bit = Options&RowMajor ? RowMajorBit : 0,
-      is_dynamic_size_storage = MaxRows==Dynamic || MaxCols==Dynamic,
-
-      aligned_bit =
-      (
-            ((Options&DontAlign)==0)
-        && (
-#if EIGEN_ALIGN_STATICALLY
-             ((!is_dynamic_size_storage) && (((MaxCols*MaxRows*int(sizeof(Scalar))) % 16) == 0))
-#else
-             0
-#endif
+template <typename Scalar, int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+struct nested_functor_cost<Array<Scalar, Rows, Cols, Options, MaxRows, MaxCols>> {
+  static constexpr Index Cost = 1;
+};
+
+// TODO: assign a cost to the stride type?
+template <typename PlainObjectType, int MapOptions, typename StrideType>
+struct nested_functor_cost<Map<PlainObjectType, MapOptions, StrideType>> : nested_functor_cost<PlainObjectType> {};
+
+template <typename Func, typename Xpr>
+struct nested_functor_cost<CwiseUnaryOp<Func, Xpr>> {
+  using XprCleaned = remove_all_t<Xpr>;
+  using FuncCleaned = remove_all_t<Func>;
+  static constexpr Index Cost = nested_functor_cost<FuncCleaned>::Cost + nested_functor_cost<XprCleaned>::Cost;
+};
+
+template <typename Func, typename Xpr>
+struct nested_functor_cost<CwiseNullaryOp<Func, Xpr>> {
+  using XprCleaned = remove_all_t<Xpr>;
+  using FuncCleaned = remove_all_t<Func>;
+  static constexpr Index Cost = nested_functor_cost<FuncCleaned>::Cost + nested_functor_cost<XprCleaned>::Cost;
+};
 
-          ||
+template <typename Func, typename LhsXpr, typename RhsXpr>
+struct nested_functor_cost<CwiseBinaryOp<Func, LhsXpr, RhsXpr>> {
+  using LhsXprCleaned = remove_all_t<LhsXpr>;
+  using RhsXprCleaned = remove_all_t<RhsXpr>;
+  using FuncCleaned = remove_all_t<Func>;
+  static constexpr Index Cost = nested_functor_cost<FuncCleaned>::Cost + nested_functor_cost<LhsXprCleaned>::Cost +
+                                nested_functor_cost<RhsXprCleaned>::Cost;
+};
 
-#if EIGEN_ALIGN
-             is_dynamic_size_storage
+template <typename Func, typename LhsXpr, typename MidXpr, typename RhsXpr>
+struct nested_functor_cost<CwiseTernaryOp<Func, LhsXpr, MidXpr, RhsXpr>> {
+  using LhsXprCleaned = remove_all_t<LhsXpr>;
+  using MidXprCleaned = remove_all_t<MidXpr>;
+  using RhsXprCleaned = remove_all_t<RhsXpr>;
+  using FuncCleaned = remove_all_t<Func>;
+  static constexpr Index Cost = nested_functor_cost<FuncCleaned>::Cost + nested_functor_cost<LhsXprCleaned>::Cost +
+                                nested_functor_cost<MidXprCleaned>::Cost + nested_functor_cost<RhsXprCleaned>::Cost;
+};
+
+template <typename Xpr>
+struct functor_cost {
+  static constexpr Index Cost = plain_enum_max(nested_functor_cost<Xpr>::Cost, 1);
+};
+
+template <typename T>
+struct packet_traits;
+
+template <typename T>
+struct unpacket_traits;
+
+template <int Size, typename PacketType,
+          bool Stop = Size == Dynamic || (Size % unpacket_traits<PacketType>::size) == 0 ||
+                      is_same<PacketType, typename unpacket_traits<PacketType>::half>::value>
+struct find_best_packet_helper;
+
+template <int Size, typename PacketType>
+struct find_best_packet_helper<Size, PacketType, true> {
+  typedef PacketType type;
+};
+
+template <int Size, typename PacketType>
+struct find_best_packet_helper<Size, PacketType, false> {
+  typedef typename find_best_packet_helper<Size, typename unpacket_traits<PacketType>::half>::type type;
+};
+
+template <typename T, int Size>
+struct find_best_packet {
+  typedef typename find_best_packet_helper<Size, typename packet_traits<T>::type>::type type;
+};
+
+template <int Size, typename PacketType,
+          bool Stop = (Size == unpacket_traits<PacketType>::size) ||
+                      is_same<PacketType, typename unpacket_traits<PacketType>::half>::value>
+struct find_packet_by_size_helper;
+template <int Size, typename PacketType>
+struct find_packet_by_size_helper<Size, PacketType, true> {
+  using type = PacketType;
+};
+template <int Size, typename PacketType>
+struct find_packet_by_size_helper<Size, PacketType, false> {
+  using type = typename find_packet_by_size_helper<Size, typename unpacket_traits<PacketType>::half>::type;
+};
+
+template <typename T, int Size>
+struct find_packet_by_size {
+  using type = typename find_packet_by_size_helper<Size, typename packet_traits<T>::type>::type;
+  static constexpr bool value = (Size == unpacket_traits<type>::size);
+};
+template <typename T>
+struct find_packet_by_size<T, 1> {
+  using type = typename unpacket_traits<T>::type;
+  static constexpr bool value = (unpacket_traits<type>::size == 1);
+};
+
+#if EIGEN_MAX_STATIC_ALIGN_BYTES > 0
+constexpr int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) {
+  if ((ArrayBytes % AlignmentBytes) == 0) {
+    return AlignmentBytes;
+  } else if (EIGEN_MIN_ALIGN_BYTES < AlignmentBytes) {
+    return compute_default_alignment_helper(ArrayBytes, AlignmentBytes / 2);
+  } else {
+    return 0;
+  }
+}
 #else
-             0
+// If static alignment is disabled, no need to bother.
+// This also avoids a division by zero
+constexpr int compute_default_alignment_helper(int ArrayBytes, int AlignmentBytes) {
+  EIGEN_UNUSED_VARIABLE(ArrayBytes);
+  EIGEN_UNUSED_VARIABLE(AlignmentBytes);
+  return 0;
+}
 #endif
 
-          )
-      ) ? AlignedBit : 0,
-      packet_access_bit = packet_traits<Scalar>::Vectorizable && aligned_bit ? PacketAccessBit : 0
-    };
+template <typename T, int Size>
+struct compute_default_alignment {
+  enum { value = compute_default_alignment_helper(Size * sizeof(T), EIGEN_MAX_STATIC_ALIGN_BYTES) };
+};
 
-  public:
-    enum { ret = LinearAccessBit | LvalueBit | DirectAccessBit | NestByRefBit | packet_access_bit | row_major_bit | aligned_bit };
+template <typename T>
+struct compute_default_alignment<T, Dynamic> {
+  enum { value = EIGEN_MAX_ALIGN_BYTES };
 };
 
-template<int _Rows, int _Cols> struct size_at_compile_time
-{
-  enum { ret = (_Rows==Dynamic || _Cols==Dynamic) ? Dynamic : _Rows * _Cols };
+template <typename Scalar_, int Rows_, int Cols_,
+          int Options_ = AutoAlign | ((Rows_ == 1 && Cols_ != 1)   ? RowMajor
+                                      : (Cols_ == 1 && Rows_ != 1) ? ColMajor
+                                                                   : EIGEN_DEFAULT_MATRIX_STORAGE_ORDER_OPTION),
+          int MaxRows_ = Rows_, int MaxCols_ = Cols_>
+class make_proper_matrix_type {
+  enum {
+    IsColVector = Cols_ == 1 && Rows_ != 1,
+    IsRowVector = Rows_ == 1 && Cols_ != 1,
+    Options = IsColVector   ? (Options_ | ColMajor) & ~RowMajor
+              : IsRowVector ? (Options_ | RowMajor) & ~ColMajor
+                            : Options_
+  };
+
+ public:
+  typedef Matrix<Scalar_, Rows_, Cols_, Options, MaxRows_, MaxCols_> type;
+};
+
+constexpr unsigned compute_matrix_flags(int Options) {
+  unsigned row_major_bit = Options & RowMajor ? RowMajorBit : 0;
+  // FIXME currently we still have to handle DirectAccessBit at the expression level to handle DenseCoeffsBase<>
+  // and then propagate this information to the evaluator's flags.
+  // However, I (Gael) think that DirectAccessBit should only matter at the evaluation stage.
+  return DirectAccessBit | LvalueBit | NestByRefBit | row_major_bit;
+}
+
+constexpr int size_at_compile_time(int rows, int cols) {
+  if (rows == 0 || cols == 0) return 0;
+  if (rows == Dynamic || cols == Dynamic) return Dynamic;
+  return rows * cols;
+}
+
+template <typename XprType>
+struct size_of_xpr_at_compile_time {
+  enum { ret = size_at_compile_time(traits<XprType>::RowsAtCompileTime, traits<XprType>::ColsAtCompileTime) };
 };
 
 /* plain_matrix_type : the difference from eval is that plain_matrix_type is always a plain matrix type,
  * whereas eval is a const reference in the case of a matrix
  */
 
-template<typename T, typename StorageKind = typename traits<T>::StorageKind> struct plain_matrix_type;
-template<typename T, typename BaseClassType> struct plain_matrix_type_dense;
-template<typename T> struct plain_matrix_type<T,Dense>
-{
-  typedef typename plain_matrix_type_dense<T,typename traits<T>::XprKind>::type type;
+template <typename T, typename StorageKind = typename traits<T>::StorageKind>
+struct plain_matrix_type;
+template <typename T, typename BaseClassType, int Flags>
+struct plain_matrix_type_dense;
+template <typename T>
+struct plain_matrix_type<T, Dense> {
+  typedef typename plain_matrix_type_dense<T, typename traits<T>::XprKind, traits<T>::Flags>::type type;
+};
+template <typename T>
+struct plain_matrix_type<T, DiagonalShape> {
+  typedef typename T::PlainObject type;
 };
 
-template<typename T> struct plain_matrix_type_dense<T,MatrixXpr>
-{
-  typedef Matrix<typename traits<T>::Scalar,
-                traits<T>::RowsAtCompileTime,
-                traits<T>::ColsAtCompileTime,
-                AutoAlign | (traits<T>::Flags&RowMajorBit ? RowMajor : ColMajor),
-                traits<T>::MaxRowsAtCompileTime,
-                traits<T>::MaxColsAtCompileTime
-          > type;
+template <typename T>
+struct plain_matrix_type<T, SkewSymmetricShape> {
+  typedef typename T::PlainObject type;
 };
 
-template<typename T> struct plain_matrix_type_dense<T,ArrayXpr>
-{
-  typedef Array<typename traits<T>::Scalar,
-                traits<T>::RowsAtCompileTime,
-                traits<T>::ColsAtCompileTime,
-                AutoAlign | (traits<T>::Flags&RowMajorBit ? RowMajor : ColMajor),
-                traits<T>::MaxRowsAtCompileTime,
-                traits<T>::MaxColsAtCompileTime
-          > type;
+template <typename T, int Flags>
+struct plain_matrix_type_dense<T, MatrixXpr, Flags> {
+  typedef Matrix<typename traits<T>::Scalar, traits<T>::RowsAtCompileTime, traits<T>::ColsAtCompileTime,
+                 AutoAlign | (Flags & RowMajorBit ? RowMajor : ColMajor), traits<T>::MaxRowsAtCompileTime,
+                 traits<T>::MaxColsAtCompileTime>
+      type;
+};
+
+template <typename T, int Flags>
+struct plain_matrix_type_dense<T, ArrayXpr, Flags> {
+  typedef Array<typename traits<T>::Scalar, traits<T>::RowsAtCompileTime, traits<T>::ColsAtCompileTime,
+                AutoAlign | (Flags & RowMajorBit ? RowMajor : ColMajor), traits<T>::MaxRowsAtCompileTime,
+                traits<T>::MaxColsAtCompileTime>
+      type;
 };
 
 /* eval : the return type of eval(). For matrices, this is just a const reference
  * in order to avoid a useless copy
  */
 
-template<typename T, typename StorageKind = typename traits<T>::StorageKind> struct eval;
+template <typename T, typename StorageKind = typename traits<T>::StorageKind>
+struct eval;
 
-template<typename T> struct eval<T,Dense>
-{
+template <typename T>
+struct eval<T, Dense> {
+  typedef typename plain_matrix_type<T>::type type;
+  //   typedef typename T::PlainObject type;
+  //   typedef T::Matrix<typename traits<T>::Scalar,
+  //                 traits<T>::RowsAtCompileTime,
+  //                 traits<T>::ColsAtCompileTime,
+  //                 AutoAlign | (traits<T>::Flags&RowMajorBit ? RowMajor : ColMajor),
+  //                 traits<T>::MaxRowsAtCompileTime,
+  //                 traits<T>::MaxColsAtCompileTime
+  //           > type;
+};
+
+template <typename T>
+struct eval<T, DiagonalShape> {
+  typedef typename plain_matrix_type<T>::type type;
+};
+
+template <typename T>
+struct eval<T, SkewSymmetricShape> {
   typedef typename plain_matrix_type<T>::type type;
-//   typedef typename T::PlainObject type;
-//   typedef T::Matrix<typename traits<T>::Scalar,
-//                 traits<T>::RowsAtCompileTime,
-//                 traits<T>::ColsAtCompileTime,
-//                 AutoAlign | (traits<T>::Flags&RowMajorBit ? RowMajor : ColMajor),
-//                 traits<T>::MaxRowsAtCompileTime,
-//                 traits<T>::MaxColsAtCompileTime
-//           > type;
 };
 
 // for matrices, no need to evaluate, just use a const reference to avoid a useless copy
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct eval<Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>, Dense>
-{
-  typedef const Matrix<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>& type;
+template <typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
+struct eval<Matrix<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>, Dense> {
+  typedef const Matrix<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>& type;
 };
 
-template<typename _Scalar, int _Rows, int _Cols, int _Options, int _MaxRows, int _MaxCols>
-struct eval<Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>, Dense>
-{
-  typedef const Array<_Scalar, _Rows, _Cols, _Options, _MaxRows, _MaxCols>& type;
+template <typename Scalar_, int Rows_, int Cols_, int Options_, int MaxRows_, int MaxCols_>
+struct eval<Array<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>, Dense> {
+  typedef const Array<Scalar_, Rows_, Cols_, Options_, MaxRows_, MaxCols_>& type;
 };
 
+/* similar to plain_matrix_type, but using the evaluator's Flags */
+template <typename T, typename StorageKind = typename traits<T>::StorageKind>
+struct plain_object_eval;
 
+template <typename T>
+struct plain_object_eval<T, Dense> {
+  typedef typename plain_matrix_type_dense<T, typename traits<T>::XprKind, evaluator<T>::Flags>::type type;
+};
 
 /* plain_matrix_type_column_major : same as plain_matrix_type but guaranteed to be column-major
  */
-template<typename T> struct plain_matrix_type_column_major
-{
-  enum { Rows = traits<T>::RowsAtCompileTime,
-         Cols = traits<T>::ColsAtCompileTime,
-         MaxRows = traits<T>::MaxRowsAtCompileTime,
-         MaxCols = traits<T>::MaxColsAtCompileTime
+template <typename T>
+struct plain_matrix_type_column_major {
+  enum {
+    Rows = traits<T>::RowsAtCompileTime,
+    Cols = traits<T>::ColsAtCompileTime,
+    MaxRows = traits<T>::MaxRowsAtCompileTime,
+    MaxCols = traits<T>::MaxColsAtCompileTime
   };
-  typedef Matrix<typename traits<T>::Scalar,
-                Rows,
-                Cols,
-                (MaxRows==1&&MaxCols!=1) ? RowMajor : ColMajor,
-                MaxRows,
-                MaxCols
-          > type;
+  typedef Matrix<typename traits<T>::Scalar, Rows, Cols, (MaxRows == 1 && MaxCols != 1) ? RowMajor : ColMajor, MaxRows,
+                 MaxCols>
+      type;
 };
 
 /* plain_matrix_type_row_major : same as plain_matrix_type but guaranteed to be row-major
  */
-template<typename T> struct plain_matrix_type_row_major
-{
-  enum { Rows = traits<T>::RowsAtCompileTime,
-         Cols = traits<T>::ColsAtCompileTime,
-         MaxRows = traits<T>::MaxRowsAtCompileTime,
-         MaxCols = traits<T>::MaxColsAtCompileTime
+template <typename T>
+struct plain_matrix_type_row_major {
+  enum {
+    Rows = traits<T>::RowsAtCompileTime,
+    Cols = traits<T>::ColsAtCompileTime,
+    MaxRows = traits<T>::MaxRowsAtCompileTime,
+    MaxCols = traits<T>::MaxColsAtCompileTime
   };
-  typedef Matrix<typename traits<T>::Scalar,
-                Rows,
-                Cols,
-                (MaxCols==1&&MaxRows!=1) ? RowMajor : ColMajor,
-                MaxRows,
-                MaxCols
-          > type;
+  typedef Matrix<typename traits<T>::Scalar, Rows, Cols, (MaxCols == 1 && MaxRows != 1) ? ColMajor : RowMajor, MaxRows,
+                 MaxCols>
+      type;
 };
 
-// we should be able to get rid of this one too
-template<typename T> struct must_nest_by_value { enum { ret = false }; };
-
 /** \internal The reference selector for template expressions. The idea is that we don't
-  * need to use references for expressions since they are light weight proxy
-  * objects which should generate no copying overhead. */
+ * need to use references for expressions since they are light weight proxy
+ * objects which should generate no copying overhead. */
 template <typename T>
-struct ref_selector
-{
-  typedef typename conditional<
-    bool(traits<T>::Flags & NestByRefBit),
-    T const&,
-    const T
-  >::type type;
+struct ref_selector {
+  typedef std::conditional_t<bool(traits<T>::Flags& NestByRefBit), T const&, const T> type;
+
+  typedef std::conditional_t<bool(traits<T>::Flags& NestByRefBit), T&, T> non_const_type;
 };
 
 /** \internal Adds the const qualifier on the value-type of T2 if and only if T1 is a const type */
-template<typename T1, typename T2>
-struct transfer_constness
-{
-  typedef typename conditional<
-    bool(internal::is_const<T1>::value),
-    typename internal::add_const_on_value_type<T2>::type,
-    T2
-  >::type type;
-};
-
-/** \internal Determines how a given expression should be nested into another one.
-  * For example, when you do a * (b+c), Eigen will determine how the expression b+c should be
-  * nested into the bigger product expression. The choice is between nesting the expression b+c as-is, or
-  * evaluating that expression b+c into a temporary variable d, and nest d so that the resulting expression is
-  * a*d. Evaluating can be beneficial for example if every coefficient access in the resulting expression causes
-  * many coefficient accesses in the nested expressions -- as is the case with matrix product for example.
-  *
-  * \param T the type of the expression being nested
-  * \param n the number of coefficient accesses in the nested expression for each coefficient access in the bigger expression.
-  *
-  * Note that if no evaluation occur, then the constness of T is preserved.
-  *
-  * Example. Suppose that a, b, and c are of type Matrix3d. The user forms the expression a*(b+c).
-  * b+c is an expression "sum of matrices", which we will denote by S. In order to determine how to nest it,
-  * the Product expression uses: nested<S, 3>::ret, which turns out to be Matrix3d because the internal logic of
-  * nested determined that in this case it was better to evaluate the expression b+c into a temporary. On the other hand,
-  * since a is of type Matrix3d, the Product expression nests it as nested<Matrix3d, 3>::ret, which turns out to be
-  * const Matrix3d&, because the internal logic of nested determined that since a was already a matrix, there was no point
-  * in copying it into another matrix.
-  */
-template<typename T, int n=1, typename PlainObject = typename eval<T>::type> struct nested
-{
+template <typename T1, typename T2>
+struct transfer_constness {
+  typedef std::conditional_t<bool(internal::is_const<T1>::value), add_const_on_value_type_t<T2>, T2> type;
+};
+
+// However, we still need a mechanism to detect whether an expression which is evaluated multiple time
+// has to be evaluated into a temporary.
+// That's the purpose of this new nested_eval helper:
+/** \internal Determines how a given expression should be nested when evaluated multiple times.
+ * For example, when you do a * (b+c), Eigen will determine how the expression b+c should be
+ * evaluated into the bigger product expression. The choice is between nesting the expression b+c as-is, or
+ * evaluating that expression b+c into a temporary variable d, and nest d so that the resulting expression is
+ * a*d. Evaluating can be beneficial for example if every coefficient access in the resulting expression causes
+ * many coefficient accesses in the nested expressions -- as is the case with matrix product for example.
+ *
+ * \tparam T the type of the expression being nested.
+ * \tparam n the number of coefficient accesses in the nested expression for each coefficient access in the bigger
+ * expression. \tparam PlainObject the type of the temporary if needed.
+ */
+template <typename T, int n, typename PlainObject = typename plain_object_eval<T>::type>
+struct nested_eval {
   enum {
-    // for the purpose of this test, to keep it reasonably simple, we arbitrarily choose a value of Dynamic values.
-    // the choice of 10000 makes it larger than any practical fixed value and even most dynamic values.
-    // in extreme cases where these assumptions would be wrong, we would still at worst suffer performance issues
-    // (poor choice of temporaries).
-    // it's important that this value can still be squared without integer overflowing.
-    DynamicAsInteger = 10000,
     ScalarReadCost = NumTraits<typename traits<T>::Scalar>::ReadCost,
-    ScalarReadCostAsInteger = ScalarReadCost == Dynamic ? int(DynamicAsInteger) : int(ScalarReadCost),
-    CoeffReadCost = traits<T>::CoeffReadCost,
-    CoeffReadCostAsInteger = CoeffReadCost == Dynamic ? int(DynamicAsInteger) : int(CoeffReadCost),
-    NAsInteger = n == Dynamic ? int(DynamicAsInteger) : n,
-    CostEvalAsInteger   = (NAsInteger+1) * ScalarReadCostAsInteger + CoeffReadCostAsInteger,
-    CostNoEvalAsInteger = NAsInteger * CoeffReadCostAsInteger
+    CoeffReadCost =
+        evaluator<T>::CoeffReadCost,  // NOTE What if an evaluator evaluate itself into a temporary?
+                                      //      Then CoeffReadCost will be small (e.g., 1) but we still have to evaluate,
+                                      //      especially if n>1. This situation is already taken care by the
+                                      //      EvalBeforeNestingBit flag, which is turned ON for all evaluator creating a
+                                      //      temporary. This flag is then propagated by the parent evaluators. Another
+                                      //      solution could be to count the number of temps?
+    NAsInteger = n == Dynamic ? HugeCost : n,
+    CostEval = (NAsInteger + 1) * ScalarReadCost + CoeffReadCost,
+    CostNoEval = int(NAsInteger) * int(CoeffReadCost),
+    Evaluate = (int(evaluator<T>::Flags) & EvalBeforeNestingBit) || (int(CostEval) < int(CostNoEval))
   };
 
-  typedef typename conditional<
-      ( (int(traits<T>::Flags) & EvalBeforeNestingBit) ||
-        int(CostEvalAsInteger) < int(CostNoEvalAsInteger)
-      ),
-      PlainObject,
-      typename ref_selector<T>::type
-  >::type type;
+  typedef std::conditional_t<Evaluate, PlainObject, typename ref_selector<T>::type> type;
 };
 
-template<typename T>
-inline T* const_cast_ptr(const T* ptr)
-{
+template <typename T>
+EIGEN_DEVICE_FUNC inline T* const_cast_ptr(const T* ptr) {
   return const_cast<T*>(ptr);
 }
 
-template<typename Derived, typename XprKind = typename traits<Derived>::XprKind>
-struct dense_xpr_base
-{
-  /* dense_xpr_base should only ever be used on dense expressions, thus falling either into the MatrixXpr or into the ArrayXpr cases */
+template <typename Derived, typename XprKind = typename traits<Derived>::XprKind>
+struct dense_xpr_base {
+  /* dense_xpr_base should only ever be used on dense expressions, thus falling either into the MatrixXpr or into the
+   * ArrayXpr cases */
 };
 
-template<typename Derived>
-struct dense_xpr_base<Derived, MatrixXpr>
-{
+template <typename Derived>
+struct dense_xpr_base<Derived, MatrixXpr> {
   typedef MatrixBase<Derived> type;
 };
 
-template<typename Derived>
-struct dense_xpr_base<Derived, ArrayXpr>
-{
+template <typename Derived>
+struct dense_xpr_base<Derived, ArrayXpr> {
   typedef ArrayBase<Derived> type;
 };
 
-/** \internal Helper base class to add a scalar multiple operator
-  * overloads for complex types */
-template<typename Derived, typename Scalar, typename OtherScalar, typename BaseType,
-         bool EnableIt = !is_same<Scalar,OtherScalar>::value >
-struct special_scalar_op_base : public BaseType
-{
-  // dummy operator* so that the
-  // "using special_scalar_op_base::operator*" compiles
-  void operator*() const;
+template <typename Derived, typename XprKind = typename traits<Derived>::XprKind,
+          typename StorageKind = typename traits<Derived>::StorageKind>
+struct generic_xpr_base;
+
+template <typename Derived, typename XprKind>
+struct generic_xpr_base<Derived, XprKind, Dense> {
+  typedef typename dense_xpr_base<Derived, XprKind>::type type;
 };
 
-template<typename Derived,typename Scalar,typename OtherScalar, typename BaseType>
-struct special_scalar_op_base<Derived,Scalar,OtherScalar,BaseType,true>  : public BaseType
-{
-  const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
-  operator*(const OtherScalar& scalar) const
-  {
-    return CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
-      (*static_cast<const Derived*>(this), scalar_multiple2_op<Scalar,OtherScalar>(scalar));
-  }
+template <typename XprType, typename CastType>
+struct cast_return_type {
+  typedef typename XprType::Scalar CurrentScalarType;
+  typedef remove_all_t<CastType> CastType_;
+  typedef typename CastType_::Scalar NewScalarType;
+  typedef std::conditional_t<is_same<CurrentScalarType, NewScalarType>::value, const XprType&, CastType> type;
+};
+
+template <typename A, typename B>
+struct promote_storage_type;
 
-  inline friend const CwiseUnaryOp<scalar_multiple2_op<Scalar,OtherScalar>, Derived>
-  operator*(const OtherScalar& scalar, const Derived& matrix)
-  { return static_cast<const special_scalar_op_base&>(matrix).operator*(scalar); }
+template <typename A>
+struct promote_storage_type<A, A> {
+  typedef A ret;
+};
+template <typename A>
+struct promote_storage_type<A, const A> {
+  typedef A ret;
+};
+template <typename A>
+struct promote_storage_type<const A, A> {
+  typedef A ret;
 };
 
-template<typename XprType, typename CastType> struct cast_return_type
-{
-  typedef typename XprType::Scalar CurrentScalarType;
-  typedef typename remove_all<CastType>::type _CastType;
-  typedef typename _CastType::Scalar NewScalarType;
-  typedef typename conditional<is_same<CurrentScalarType,NewScalarType>::value,
-                              const XprType&,CastType>::type type;
+/** \internal Specify the "storage kind" of applying a coefficient-wise
+ * binary operations between two expressions of kinds A and B respectively.
+ * The template parameter Functor permits to specialize the resulting storage kind wrt to
+ * the functor.
+ * The default rules are as follows:
+ * \code
+ * A      op A      -> A
+ * A      op dense  -> dense
+ * dense  op B      -> dense
+ * sparse op dense  -> sparse
+ * dense  op sparse -> sparse
+ * \endcode
+ */
+template <typename A, typename B, typename Functor>
+struct cwise_promote_storage_type;
+
+template <typename A, typename Functor>
+struct cwise_promote_storage_type<A, A, Functor> {
+  typedef A ret;
+};
+template <typename Functor>
+struct cwise_promote_storage_type<Dense, Dense, Functor> {
+  typedef Dense ret;
+};
+template <typename A, typename Functor>
+struct cwise_promote_storage_type<A, Dense, Functor> {
+  typedef Dense ret;
+};
+template <typename B, typename Functor>
+struct cwise_promote_storage_type<Dense, B, Functor> {
+  typedef Dense ret;
+};
+template <typename Functor>
+struct cwise_promote_storage_type<Sparse, Dense, Functor> {
+  typedef Sparse ret;
+};
+template <typename Functor>
+struct cwise_promote_storage_type<Dense, Sparse, Functor> {
+  typedef Sparse ret;
 };
 
-template <typename A, typename B> struct promote_storage_type;
+template <typename LhsKind, typename RhsKind, int LhsOrder, int RhsOrder>
+struct cwise_promote_storage_order {
+  enum { value = LhsOrder };
+};
 
-template <typename A> struct promote_storage_type<A,A>
-{
+template <typename LhsKind, int LhsOrder, int RhsOrder>
+struct cwise_promote_storage_order<LhsKind, Sparse, LhsOrder, RhsOrder> {
+  enum { value = RhsOrder };
+};
+template <typename RhsKind, int LhsOrder, int RhsOrder>
+struct cwise_promote_storage_order<Sparse, RhsKind, LhsOrder, RhsOrder> {
+  enum { value = LhsOrder };
+};
+template <int Order>
+struct cwise_promote_storage_order<Sparse, Sparse, Order, Order> {
+  enum { value = Order };
+};
+
+/** \internal Specify the "storage kind" of multiplying an expression of kind A with kind B.
+ * The template parameter ProductTag permits to specialize the resulting storage kind wrt to
+ * some compile-time properties of the product: GemmProduct, GemvProduct, OuterProduct, InnerProduct.
+ * The default rules are as follows:
+ * \code
+ *  K * K            -> K
+ *  dense * K        -> dense
+ *  K * dense        -> dense
+ *  diag * K         -> K
+ *  K * diag         -> K
+ *  Perm * K         -> K
+ * K * Perm          -> K
+ * \endcode
+ */
+template <typename A, typename B, int ProductTag>
+struct product_promote_storage_type;
+
+template <typename A, int ProductTag>
+struct product_promote_storage_type<A, A, ProductTag> {
+  typedef A ret;
+};
+template <int ProductTag>
+struct product_promote_storage_type<Dense, Dense, ProductTag> {
+  typedef Dense ret;
+};
+template <typename A, int ProductTag>
+struct product_promote_storage_type<A, Dense, ProductTag> {
+  typedef Dense ret;
+};
+template <typename B, int ProductTag>
+struct product_promote_storage_type<Dense, B, ProductTag> {
+  typedef Dense ret;
+};
+
+template <typename A, int ProductTag>
+struct product_promote_storage_type<A, DiagonalShape, ProductTag> {
   typedef A ret;
 };
+template <typename B, int ProductTag>
+struct product_promote_storage_type<DiagonalShape, B, ProductTag> {
+  typedef B ret;
+};
+template <int ProductTag>
+struct product_promote_storage_type<Dense, DiagonalShape, ProductTag> {
+  typedef Dense ret;
+};
+template <int ProductTag>
+struct product_promote_storage_type<DiagonalShape, Dense, ProductTag> {
+  typedef Dense ret;
+};
+
+template <typename A, int ProductTag>
+struct product_promote_storage_type<A, SkewSymmetricShape, ProductTag> {
+  typedef A ret;
+};
+template <typename B, int ProductTag>
+struct product_promote_storage_type<SkewSymmetricShape, B, ProductTag> {
+  typedef B ret;
+};
+template <int ProductTag>
+struct product_promote_storage_type<Dense, SkewSymmetricShape, ProductTag> {
+  typedef Dense ret;
+};
+template <int ProductTag>
+struct product_promote_storage_type<SkewSymmetricShape, Dense, ProductTag> {
+  typedef Dense ret;
+};
+template <int ProductTag>
+struct product_promote_storage_type<SkewSymmetricShape, SkewSymmetricShape, ProductTag> {
+  typedef Dense ret;
+};
+
+template <typename A, int ProductTag>
+struct product_promote_storage_type<A, PermutationStorage, ProductTag> {
+  typedef A ret;
+};
+template <typename B, int ProductTag>
+struct product_promote_storage_type<PermutationStorage, B, ProductTag> {
+  typedef B ret;
+};
+template <int ProductTag>
+struct product_promote_storage_type<Dense, PermutationStorage, ProductTag> {
+  typedef Dense ret;
+};
+template <int ProductTag>
+struct product_promote_storage_type<PermutationStorage, Dense, ProductTag> {
+  typedef Dense ret;
+};
 
 /** \internal gives the plain matrix or array type to store a row/column/diagonal of a matrix type.
-  * \param Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType.
-  */
-template<typename ExpressionType, typename Scalar = typename ExpressionType::Scalar>
-struct plain_row_type
-{
+ * \tparam Scalar optional parameter allowing to pass a different scalar type than the one of the MatrixType.
+ */
+template <typename ExpressionType, typename Scalar = typename ExpressionType::Scalar>
+struct plain_row_type {
   typedef Matrix<Scalar, 1, ExpressionType::ColsAtCompileTime,
-                 ExpressionType::PlainObject::Options | RowMajor, 1, ExpressionType::MaxColsAtCompileTime> MatrixRowType;
-  typedef Array<Scalar, 1, ExpressionType::ColsAtCompileTime,
-                 ExpressionType::PlainObject::Options | RowMajor, 1, ExpressionType::MaxColsAtCompileTime> ArrayRowType;
+                 int(ExpressionType::PlainObject::Options) | int(RowMajor), 1, ExpressionType::MaxColsAtCompileTime>
+      MatrixRowType;
+  typedef Array<Scalar, 1, ExpressionType::ColsAtCompileTime, int(ExpressionType::PlainObject::Options) | int(RowMajor),
+                1, ExpressionType::MaxColsAtCompileTime>
+      ArrayRowType;
 
-  typedef typename conditional<
-    is_same< typename traits<ExpressionType>::XprKind, MatrixXpr >::value,
-    MatrixRowType,
-    ArrayRowType 
-  >::type type;
+  typedef std::conditional_t<is_same<typename traits<ExpressionType>::XprKind, MatrixXpr>::value, MatrixRowType,
+                             ArrayRowType>
+      type;
 };
 
-template<typename ExpressionType, typename Scalar = typename ExpressionType::Scalar>
-struct plain_col_type
-{
-  typedef Matrix<Scalar, ExpressionType::RowsAtCompileTime, 1,
-                 ExpressionType::PlainObject::Options & ~RowMajor, ExpressionType::MaxRowsAtCompileTime, 1> MatrixColType;
-  typedef Array<Scalar, ExpressionType::RowsAtCompileTime, 1,
-                 ExpressionType::PlainObject::Options & ~RowMajor, ExpressionType::MaxRowsAtCompileTime, 1> ArrayColType;
+template <typename ExpressionType, typename Scalar = typename ExpressionType::Scalar>
+struct plain_col_type {
+  typedef Matrix<Scalar, ExpressionType::RowsAtCompileTime, 1, ExpressionType::PlainObject::Options & ~RowMajor,
+                 ExpressionType::MaxRowsAtCompileTime, 1>
+      MatrixColType;
+  typedef Array<Scalar, ExpressionType::RowsAtCompileTime, 1, ExpressionType::PlainObject::Options & ~RowMajor,
+                ExpressionType::MaxRowsAtCompileTime, 1>
+      ArrayColType;
 
-  typedef typename conditional<
-    is_same< typename traits<ExpressionType>::XprKind, MatrixXpr >::value,
-    MatrixColType,
-    ArrayColType 
-  >::type type;
+  typedef std::conditional_t<is_same<typename traits<ExpressionType>::XprKind, MatrixXpr>::value, MatrixColType,
+                             ArrayColType>
+      type;
 };
 
-template<typename ExpressionType, typename Scalar = typename ExpressionType::Scalar>
-struct plain_diag_type
-{
-  enum { diag_size = EIGEN_SIZE_MIN_PREFER_DYNAMIC(ExpressionType::RowsAtCompileTime, ExpressionType::ColsAtCompileTime),
-         max_diag_size = EIGEN_SIZE_MIN_PREFER_FIXED(ExpressionType::MaxRowsAtCompileTime, ExpressionType::MaxColsAtCompileTime)
+template <typename ExpressionType, typename Scalar = typename ExpressionType::Scalar>
+struct plain_diag_type {
+  enum {
+    diag_size = internal::min_size_prefer_dynamic(ExpressionType::RowsAtCompileTime, ExpressionType::ColsAtCompileTime),
+    max_diag_size = min_size_prefer_fixed(ExpressionType::MaxRowsAtCompileTime, ExpressionType::MaxColsAtCompileTime)
   };
-  typedef Matrix<Scalar, diag_size, 1, ExpressionType::PlainObject::Options & ~RowMajor, max_diag_size, 1> MatrixDiagType;
+  typedef Matrix<Scalar, diag_size, 1, ExpressionType::PlainObject::Options & ~RowMajor, max_diag_size, 1>
+      MatrixDiagType;
   typedef Array<Scalar, diag_size, 1, ExpressionType::PlainObject::Options & ~RowMajor, max_diag_size, 1> ArrayDiagType;
 
-  typedef typename conditional<
-    is_same< typename traits<ExpressionType>::XprKind, MatrixXpr >::value,
-    MatrixDiagType,
-    ArrayDiagType 
-  >::type type;
+  typedef std::conditional_t<is_same<typename traits<ExpressionType>::XprKind, MatrixXpr>::value, MatrixDiagType,
+                             ArrayDiagType>
+      type;
+};
+
+template <typename Expr, typename Scalar = typename Expr::Scalar>
+struct plain_constant_type {
+  enum { Options = (traits<Expr>::Flags & RowMajorBit) ? RowMajor : 0 };
+
+  typedef Array<Scalar, traits<Expr>::RowsAtCompileTime, traits<Expr>::ColsAtCompileTime, Options,
+                traits<Expr>::MaxRowsAtCompileTime, traits<Expr>::MaxColsAtCompileTime>
+      array_type;
+
+  typedef Matrix<Scalar, traits<Expr>::RowsAtCompileTime, traits<Expr>::ColsAtCompileTime, Options,
+                 traits<Expr>::MaxRowsAtCompileTime, traits<Expr>::MaxColsAtCompileTime>
+      matrix_type;
+
+  typedef CwiseNullaryOp<
+      scalar_constant_op<Scalar>,
+      const std::conditional_t<is_same<typename traits<Expr>::XprKind, MatrixXpr>::value, matrix_type, array_type>>
+      type;
+};
+
+template <typename ExpressionType>
+struct is_lvalue {
+  enum { value = (!bool(is_const<ExpressionType>::value)) && bool(traits<ExpressionType>::Flags & LvalueBit) };
+};
+
+template <typename T>
+struct is_diagonal {
+  enum { ret = false };
+};
+
+template <typename T>
+struct is_diagonal<DiagonalBase<T>> {
+  enum { ret = true };
 };
 
-template<typename ExpressionType>
-struct is_lvalue
+template <typename T>
+struct is_diagonal<DiagonalWrapper<T>> {
+  enum { ret = true };
+};
+
+template <typename T, int S>
+struct is_diagonal<DiagonalMatrix<T, S>> {
+  enum { ret = true };
+};
+
+template <typename T>
+struct is_identity {
+  enum { value = false };
+};
+
+template <typename T>
+struct is_identity<CwiseNullaryOp<internal::scalar_identity_op<typename T::Scalar>, T>> {
+  enum { value = true };
+};
+
+template <typename S1, typename S2>
+struct glue_shapes;
+template <>
+struct glue_shapes<DenseShape, TriangularShape> {
+  typedef TriangularShape type;
+};
+
+template <typename T1, typename T2>
+struct possibly_same_dense {
+  enum {
+    value = has_direct_access<T1>::ret && has_direct_access<T2>::ret &&
+            is_same<typename T1::Scalar, typename T2::Scalar>::value
+  };
+};
+
+template <typename T1, typename T2>
+EIGEN_DEVICE_FUNC bool is_same_dense(const T1& mat1, const T2& mat2,
+                                     std::enable_if_t<possibly_same_dense<T1, T2>::value>* = 0) {
+  return (mat1.data() == mat2.data()) && (mat1.innerStride() == mat2.innerStride()) &&
+         (mat1.outerStride() == mat2.outerStride());
+}
+
+template <typename T1, typename T2>
+EIGEN_DEVICE_FUNC bool is_same_dense(const T1&, const T2&, std::enable_if_t<!possibly_same_dense<T1, T2>::value>* = 0) {
+  return false;
+}
+
+// Internal helper defining the cost of a scalar division for the type T.
+// The default heuristic can be specialized for each scalar type and architecture.
+template <typename T, bool Vectorized = false, typename EnableIf = void>
+struct scalar_div_cost {
+  enum { value = 8 * NumTraits<T>::MulCost };
+};
+
+template <typename T, bool Vectorized>
+struct scalar_div_cost<T, Vectorized, std::enable_if_t<NumTraits<T>::IsComplex>> {
+  using RealScalar = typename NumTraits<T>::Real;
+  enum {
+    value =
+        2 * scalar_div_cost<RealScalar>::value + 6 * NumTraits<RealScalar>::MulCost + 3 * NumTraits<RealScalar>::AddCost
+  };
+};
+
+template <bool Vectorized>
+struct scalar_div_cost<signed long, Vectorized, std::conditional_t<sizeof(long) == 8, void, false_type>> {
+  enum { value = 24 };
+};
+template <bool Vectorized>
+struct scalar_div_cost<unsigned long, Vectorized, std::conditional_t<sizeof(long) == 8, void, false_type>> {
+  enum { value = 21 };
+};
+
+#ifdef EIGEN_DEBUG_ASSIGN
+std::string demangle_traversal(int t) {
+  if (t == DefaultTraversal) return "DefaultTraversal";
+  if (t == LinearTraversal) return "LinearTraversal";
+  if (t == InnerVectorizedTraversal) return "InnerVectorizedTraversal";
+  if (t == LinearVectorizedTraversal) return "LinearVectorizedTraversal";
+  if (t == SliceVectorizedTraversal) return "SliceVectorizedTraversal";
+  return "?";
+}
+std::string demangle_unrolling(int t) {
+  if (t == NoUnrolling) return "NoUnrolling";
+  if (t == InnerUnrolling) return "InnerUnrolling";
+  if (t == CompleteUnrolling) return "CompleteUnrolling";
+  return "?";
+}
+std::string demangle_flags(int f) {
+  std::string res;
+  if (f & RowMajorBit) res += " | RowMajor";
+  if (f & PacketAccessBit) res += " | Packet";
+  if (f & LinearAccessBit) res += " | Linear";
+  if (f & LvalueBit) res += " | Lvalue";
+  if (f & DirectAccessBit) res += " | Direct";
+  if (f & NestByRefBit) res += " | NestByRef";
+  if (f & NoPreferredStorageOrderBit) res += " | NoPreferredStorageOrderBit";
+
+  return res;
+}
+#endif
+
+template <typename XprType>
+struct is_block_xpr : std::false_type {};
+
+template <typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
+struct is_block_xpr<Block<XprType, BlockRows, BlockCols, InnerPanel>> : std::true_type {};
+
+template <typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
+struct is_block_xpr<const Block<XprType, BlockRows, BlockCols, InnerPanel>> : std::true_type {};
+
+// Helper utility for constructing non-recursive block expressions.
+template <typename XprType>
+struct block_xpr_helper {
+  using BaseType = XprType;
+
+  // For regular block expressions, simply forward along the InnerPanel argument,
+  // which is set when calling row/column expressions.
+  static constexpr bool is_inner_panel(bool inner_panel) { return inner_panel; }
+
+  // Only enable non-const base function if XprType is not const (otherwise we get a duplicate definition).
+  template <typename T = XprType, typename EnableIf = std::enable_if_t<!std::is_const<T>::value>>
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BaseType& base(XprType& xpr) {
+    return xpr;
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const BaseType& base(const XprType& xpr) { return xpr; }
+  static constexpr EIGEN_ALWAYS_INLINE Index row(const XprType& /*xpr*/, Index r) { return r; }
+  static constexpr EIGEN_ALWAYS_INLINE Index col(const XprType& /*xpr*/, Index c) { return c; }
+};
+
+template <typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
+struct block_xpr_helper<Block<XprType, BlockRows, BlockCols, InnerPanel>> {
+  using BlockXprType = Block<XprType, BlockRows, BlockCols, InnerPanel>;
+  // Recursive helper in case of explicit block-of-block expression.
+  using NestedXprHelper = block_xpr_helper<XprType>;
+  using BaseType = typename NestedXprHelper::BaseType;
+
+  // For block-of-block expressions, we need to combine the InnerPannel trait
+  // with that of the block subexpression.
+  static constexpr bool is_inner_panel(bool inner_panel) { return InnerPanel && inner_panel; }
+
+  // Only enable non-const base function if XprType is not const (otherwise we get a duplicates definition).
+  template <typename T = XprType, typename EnableIf = std::enable_if_t<!std::is_const<T>::value>>
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BaseType& base(BlockXprType& xpr) {
+    return NestedXprHelper::base(xpr.nestedExpression());
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const BaseType& base(const BlockXprType& xpr) {
+    return NestedXprHelper::base(xpr.nestedExpression());
+  }
+  static constexpr EIGEN_ALWAYS_INLINE Index row(const BlockXprType& xpr, Index r) {
+    return xpr.startRow() + NestedXprHelper::row(xpr.nestedExpression(), r);
+  }
+  static constexpr EIGEN_ALWAYS_INLINE Index col(const BlockXprType& xpr, Index c) {
+    return xpr.startCol() + NestedXprHelper::col(xpr.nestedExpression(), c);
+  }
+};
+
+template <typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
+struct block_xpr_helper<const Block<XprType, BlockRows, BlockCols, InnerPanel>>
+    : block_xpr_helper<Block<XprType, BlockRows, BlockCols, InnerPanel>> {};
+
+template <typename XprType>
+struct is_matrix_base_xpr : std::is_base_of<MatrixBase<remove_all_t<XprType>>, remove_all_t<XprType>> {};
+
+template <typename XprType>
+struct is_permutation_base_xpr : std::is_base_of<PermutationBase<remove_all_t<XprType>>, remove_all_t<XprType>> {};
+
+}  // end namespace internal
+
+/** \class ScalarBinaryOpTraits
+  * \ingroup Core_Module
+  *
+  * \brief Determines whether the given binary operation of two numeric types is allowed and what the scalar return type
+  is.
+  *
+  * This class permits to control the scalar return type of any binary operation performed on two different scalar types
+  through (partial) template specializations.
+  *
+  * For instance, let \c U1, \c U2 and \c U3 be three user defined scalar types for which most operations between
+  instances of \c U1 and \c U2 returns an \c U3.
+  * You can let %Eigen knows that by defining:
+    \code
+    template<typename BinaryOp>
+    struct ScalarBinaryOpTraits<U1,U2,BinaryOp> { typedef U3 ReturnType;  };
+    template<typename BinaryOp>
+    struct ScalarBinaryOpTraits<U2,U1,BinaryOp> { typedef U3 ReturnType;  };
+    \endcode
+  * You can then explicitly disable some particular operations to get more explicit error messages:
+    \code
+    template<>
+    struct ScalarBinaryOpTraits<U1,U2,internal::scalar_max_op<U1,U2> > {};
+    \endcode
+  * Or customize the return type for individual operation:
+    \code
+    template<>
+    struct ScalarBinaryOpTraits<U1,U2,internal::scalar_sum_op<U1,U2> > { typedef U1 ReturnType; };
+    \endcode
+  *
+  * By default, the following generic combinations are supported:
+  <table class="manual">
+  <tr><th>ScalarA</th><th>ScalarB</th><th>BinaryOp</th><th>ReturnType</th><th>Note</th></tr>
+  <tr            ><td>\c T </td><td>\c T </td><td>\c * </td><td>\c T </td><td></td></tr>
+  <tr class="alt"><td>\c NumTraits<T>::Real </td><td>\c T </td><td>\c * </td><td>\c T </td><td>Only if \c
+  NumTraits<T>::IsComplex </td></tr> <tr            ><td>\c T </td><td>\c NumTraits<T>::Real </td><td>\c * </td><td>\c T
+  </td><td>Only if \c NumTraits<T>::IsComplex </td></tr>
+  </table>
+  *
+  * \sa CwiseBinaryOp
+  */
+template <typename ScalarA, typename ScalarB, typename BinaryOp = internal::scalar_product_op<ScalarA, ScalarB>>
+struct ScalarBinaryOpTraits
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    // for backward compatibility, use the hints given by the (deprecated) internal::scalar_product_traits class.
+    : internal::scalar_product_traits<ScalarA, ScalarB>
+#endif  // EIGEN_PARSED_BY_DOXYGEN
 {
-  enum { value = !bool(is_const<ExpressionType>::value) &&
-                 bool(traits<ExpressionType>::Flags & LvalueBit) };
 };
 
-} // end namespace internal
+template <typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<T, T, BinaryOp> {
+  typedef T ReturnType;
+};
+
+template <typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<T, typename NumTraits<std::enable_if_t<NumTraits<T>::IsComplex, T>>::Real, BinaryOp> {
+  typedef T ReturnType;
+};
+template <typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<typename NumTraits<std::enable_if_t<NumTraits<T>::IsComplex, T>>::Real, T, BinaryOp> {
+  typedef T ReturnType;
+};
+
+// For Matrix * Permutation
+template <typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<T, void, BinaryOp> {
+  typedef T ReturnType;
+};
+
+// For Permutation * Matrix
+template <typename T, typename BinaryOp>
+struct ScalarBinaryOpTraits<void, T, BinaryOp> {
+  typedef T ReturnType;
+};
+
+// for Permutation*Permutation
+template <typename BinaryOp>
+struct ScalarBinaryOpTraits<void, void, BinaryOp> {
+  typedef void ReturnType;
+};
+
+// We require Lhs and Rhs to have "compatible" scalar types.
+// It is tempting to always allow mixing different types but remember that this is often impossible in the vectorized
+// paths. So allowing mixing different types gives very unexpected errors when enabling vectorization, when the user
+// tries to add together a float matrix and a double matrix.
+#define EIGEN_CHECK_BINARY_COMPATIBILIY(BINOP, LHS, RHS)                               \
+  EIGEN_STATIC_ASSERT(                                                                 \
+      (Eigen::internal::has_ReturnType<ScalarBinaryOpTraits<LHS, RHS, BINOP>>::value), \
+      YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_XPRHELPER_H
+#endif  // EIGEN_XPRHELPER_H
diff --git a/inst/include/Eigen/src/Eigen2Support/Block.h b/inst/include/Eigen/src/Eigen2Support/Block.h
deleted file mode 100644
index 604456f4..00000000
--- a/inst/include/Eigen/src/Eigen2Support/Block.h
+++ /dev/null
@@ -1,126 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_BLOCK2_H
-#define EIGEN_BLOCK2_H
-
-namespace Eigen { 
-
-/** \returns a dynamic-size expression of a corner of *this.
-  *
-  * \param type the type of corner. Can be \a Eigen::TopLeft, \a Eigen::TopRight,
-  * \a Eigen::BottomLeft, \a Eigen::BottomRight.
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_corner_enum_int_int.cpp
-  * Output: \verbinclude MatrixBase_corner_enum_int_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size matrix, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<typename Derived>
-inline Block<Derived> DenseBase<Derived>
-  ::corner(CornerType type, Index cRows, Index cCols)
-{
-  switch(type)
-  {
-    default:
-      eigen_assert(false && "Bad corner type.");
-    case TopLeft:
-      return Block<Derived>(derived(), 0, 0, cRows, cCols);
-    case TopRight:
-      return Block<Derived>(derived(), 0, cols() - cCols, cRows, cCols);
-    case BottomLeft:
-      return Block<Derived>(derived(), rows() - cRows, 0, cRows, cCols);
-    case BottomRight:
-      return Block<Derived>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
-  }
-}
-
-/** This is the const version of corner(CornerType, Index, Index).*/
-template<typename Derived>
-inline const Block<Derived>
-DenseBase<Derived>::corner(CornerType type, Index cRows, Index cCols) const
-{
-  switch(type)
-  {
-    default:
-      eigen_assert(false && "Bad corner type.");
-    case TopLeft:
-      return Block<Derived>(derived(), 0, 0, cRows, cCols);
-    case TopRight:
-      return Block<Derived>(derived(), 0, cols() - cCols, cRows, cCols);
-    case BottomLeft:
-      return Block<Derived>(derived(), rows() - cRows, 0, cRows, cCols);
-    case BottomRight:
-      return Block<Derived>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
-  }
-}
-
-/** \returns a fixed-size expression of a corner of *this.
-  *
-  * \param type the type of corner. Can be \a Eigen::TopLeft, \a Eigen::TopRight,
-  * \a Eigen::BottomLeft, \a Eigen::BottomRight.
-  *
-  * The template parameters CRows and CCols arethe number of rows and columns in the corner.
-  *
-  * Example: \include MatrixBase_template_int_int_corner_enum.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_corner_enum.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<typename Derived>
-template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols>
-DenseBase<Derived>::corner(CornerType type)
-{
-  switch(type)
-  {
-    default:
-      eigen_assert(false && "Bad corner type.");
-    case TopLeft:
-      return Block<Derived, CRows, CCols>(derived(), 0, 0);
-    case TopRight:
-      return Block<Derived, CRows, CCols>(derived(), 0, cols() - CCols);
-    case BottomLeft:
-      return Block<Derived, CRows, CCols>(derived(), rows() - CRows, 0);
-    case BottomRight:
-      return Block<Derived, CRows, CCols>(derived(), rows() - CRows, cols() - CCols);
-  }
-}
-
-/** This is the const version of corner<int, int>(CornerType).*/
-template<typename Derived>
-template<int CRows, int CCols>
-inline const Block<Derived, CRows, CCols>
-DenseBase<Derived>::corner(CornerType type) const
-{
-  switch(type)
-  {
-    default:
-      eigen_assert(false && "Bad corner type.");
-    case TopLeft:
-      return Block<Derived, CRows, CCols>(derived(), 0, 0);
-    case TopRight:
-      return Block<Derived, CRows, CCols>(derived(), 0, cols() - CCols);
-    case BottomLeft:
-      return Block<Derived, CRows, CCols>(derived(), rows() - CRows, 0);
-    case BottomRight:
-      return Block<Derived, CRows, CCols>(derived(), rows() - CRows, cols() - CCols);
-  }
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_BLOCK2_H
diff --git a/inst/include/Eigen/src/Eigen2Support/Cwise.h b/inst/include/Eigen/src/Eigen2Support/Cwise.h
deleted file mode 100644
index d95009b6..00000000
--- a/inst/include/Eigen/src/Eigen2Support/Cwise.h
+++ /dev/null
@@ -1,192 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_CWISE_H
-#define EIGEN_CWISE_H
-
-namespace Eigen { 
-
-/** \internal
-  * convenient macro to defined the return type of a cwise binary operation */
-#define EIGEN_CWISE_BINOP_RETURN_TYPE(OP) \
-    CwiseBinaryOp<OP<typename internal::traits<ExpressionType>::Scalar>, ExpressionType, OtherDerived>
-
-/** \internal
-  * convenient macro to defined the return type of a cwise unary operation */
-#define EIGEN_CWISE_UNOP_RETURN_TYPE(OP) \
-    CwiseUnaryOp<OP<typename internal::traits<ExpressionType>::Scalar>, ExpressionType>
-
-/** \internal
-  * convenient macro to defined the return type of a cwise comparison to a scalar */
-#define EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(OP) \
-    CwiseBinaryOp<OP<typename internal::traits<ExpressionType>::Scalar>, ExpressionType, \
-        typename ExpressionType::ConstantReturnType >
-
-/** \class Cwise
-  *
-  * \brief Pseudo expression providing additional coefficient-wise operations
-  *
-  * \param ExpressionType the type of the object on which to do coefficient-wise operations
-  *
-  * This class represents an expression with additional coefficient-wise features.
-  * It is the return type of MatrixBase::cwise()
-  * and most of the time this is the only way it is used.
-  *
-  * Example: \include MatrixBase_cwise_const.cpp
-  * Output: \verbinclude MatrixBase_cwise_const.out
-  *
-  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_CWISE_PLUGIN.
-  *
-  * \sa MatrixBase::cwise() const, MatrixBase::cwise()
-  */
-template<typename ExpressionType> class Cwise
-{
-  public:
-
-    typedef typename internal::traits<ExpressionType>::Scalar Scalar;
-    typedef typename internal::conditional<internal::must_nest_by_value<ExpressionType>::ret,
-        ExpressionType, const ExpressionType&>::type ExpressionTypeNested;
-    typedef CwiseUnaryOp<internal::scalar_add_op<Scalar>, ExpressionType> ScalarAddReturnType;
-
-    inline Cwise(const ExpressionType& matrix) : m_matrix(matrix) {}
-
-    /** \internal */
-    inline const ExpressionType& _expression() const { return m_matrix; }
-
-    template<typename OtherDerived>
-    const EIGEN_CWISE_PRODUCT_RETURN_TYPE(ExpressionType,OtherDerived)
-    operator*(const MatrixBase<OtherDerived> &other) const;
-
-    template<typename OtherDerived>
-    const EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_quotient_op)
-    operator/(const MatrixBase<OtherDerived> &other) const;
-
-    /** \deprecated ArrayBase::min() */
-    template<typename OtherDerived>
-    const EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_min_op)
-    (min)(const MatrixBase<OtherDerived> &other) const
-    { return EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_min_op)(_expression(), other.derived()); }
-
-    /** \deprecated ArrayBase::max() */
-    template<typename OtherDerived>
-    const EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_max_op)
-    (max)(const MatrixBase<OtherDerived> &other) const
-    { return EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_max_op)(_expression(), other.derived()); }
-
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_abs_op)      abs() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_abs2_op)     abs2() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_square_op)   square() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_cube_op)     cube() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_inverse_op)  inverse() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_sqrt_op)     sqrt() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_exp_op)      exp() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_log_op)      log() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_cos_op)      cos() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_sin_op)      sin() const;
-    const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_pow_op)      pow(const Scalar& exponent) const;
-
-    const ScalarAddReturnType
-    operator+(const Scalar& scalar) const;
-
-    /** \relates Cwise */
-    friend const ScalarAddReturnType
-    operator+(const Scalar& scalar, const Cwise& mat)
-    { return mat + scalar; }
-
-    ExpressionType& operator+=(const Scalar& scalar);
-
-    const ScalarAddReturnType
-    operator-(const Scalar& scalar) const;
-
-    ExpressionType& operator-=(const Scalar& scalar);
-
-    template<typename OtherDerived>
-    inline ExpressionType& operator*=(const MatrixBase<OtherDerived> &other);
-
-    template<typename OtherDerived>
-    inline ExpressionType& operator/=(const MatrixBase<OtherDerived> &other);
-
-    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::less)
-    operator<(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::less_equal)
-    operator<=(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater)
-    operator>(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater_equal)
-    operator>=(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::equal_to)
-    operator==(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived> const EIGEN_CWISE_BINOP_RETURN_TYPE(std::not_equal_to)
-    operator!=(const MatrixBase<OtherDerived>& other) const;
-
-    // comparisons to a scalar value
-    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less)
-    operator<(Scalar s) const;
-
-    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less_equal)
-    operator<=(Scalar s) const;
-
-    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater)
-    operator>(Scalar s) const;
-
-    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater_equal)
-    operator>=(Scalar s) const;
-
-    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::equal_to)
-    operator==(Scalar s) const;
-
-    const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::not_equal_to)
-    operator!=(Scalar s) const;
-
-    // allow to extend Cwise outside Eigen
-    #ifdef EIGEN_CWISE_PLUGIN
-    #include EIGEN_CWISE_PLUGIN
-    #endif
-
-  protected:
-    ExpressionTypeNested m_matrix;
-};
-
-
-/** \returns a Cwise wrapper of *this providing additional coefficient-wise operations
-  *
-  * Example: \include MatrixBase_cwise_const.cpp
-  * Output: \verbinclude MatrixBase_cwise_const.out
-  *
-  * \sa class Cwise, cwise()
-  */
-template<typename Derived>
-inline const Cwise<Derived> MatrixBase<Derived>::cwise() const
-{
-  return derived();
-}
-
-/** \returns a Cwise wrapper of *this providing additional coefficient-wise operations
-  *
-  * Example: \include MatrixBase_cwise.cpp
-  * Output: \verbinclude MatrixBase_cwise.out
-  *
-  * \sa class Cwise, cwise() const
-  */
-template<typename Derived>
-inline Cwise<Derived> MatrixBase<Derived>::cwise()
-{
-  return derived();
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_CWISE_H
diff --git a/inst/include/Eigen/src/Eigen2Support/CwiseOperators.h b/inst/include/Eigen/src/Eigen2Support/CwiseOperators.h
deleted file mode 100644
index 482f3064..00000000
--- a/inst/include/Eigen/src/Eigen2Support/CwiseOperators.h
+++ /dev/null
@@ -1,298 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_ARRAY_CWISE_OPERATORS_H
-#define EIGEN_ARRAY_CWISE_OPERATORS_H
-
-namespace Eigen { 
-
-/***************************************************************************
-* The following functions were defined in Core
-***************************************************************************/
-
-
-/** \deprecated ArrayBase::abs() */
-template<typename ExpressionType>
-EIGEN_STRONG_INLINE const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_abs_op)
-Cwise<ExpressionType>::abs() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::abs2() */
-template<typename ExpressionType>
-EIGEN_STRONG_INLINE const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_abs2_op)
-Cwise<ExpressionType>::abs2() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::exp() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_exp_op)
-Cwise<ExpressionType>::exp() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::log() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_log_op)
-Cwise<ExpressionType>::log() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::operator*() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE const EIGEN_CWISE_PRODUCT_RETURN_TYPE(ExpressionType,OtherDerived)
-Cwise<ExpressionType>::operator*(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_PRODUCT_RETURN_TYPE(ExpressionType,OtherDerived)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::operator/() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE const EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_quotient_op)
-Cwise<ExpressionType>::operator/(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(internal::scalar_quotient_op)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::operator*=() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline ExpressionType& Cwise<ExpressionType>::operator*=(const MatrixBase<OtherDerived> &other)
-{
-  return m_matrix.const_cast_derived() = *this * other;
-}
-
-/** \deprecated ArrayBase::operator/=() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline ExpressionType& Cwise<ExpressionType>::operator/=(const MatrixBase<OtherDerived> &other)
-{
-  return m_matrix.const_cast_derived() = *this / other;
-}
-
-/***************************************************************************
-* The following functions were defined in Array
-***************************************************************************/
-
-// -- unary operators --
-
-/** \deprecated ArrayBase::sqrt() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_sqrt_op)
-Cwise<ExpressionType>::sqrt() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::cos() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_cos_op)
-Cwise<ExpressionType>::cos() const
-{
-  return _expression();
-}
-
-
-/** \deprecated ArrayBase::sin() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_sin_op)
-Cwise<ExpressionType>::sin() const
-{
-  return _expression();
-}
-
-
-/** \deprecated ArrayBase::log() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_pow_op)
-Cwise<ExpressionType>::pow(const Scalar& exponent) const
-{
-  return EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_pow_op)(_expression(), internal::scalar_pow_op<Scalar>(exponent));
-}
-
-
-/** \deprecated ArrayBase::inverse() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_inverse_op)
-Cwise<ExpressionType>::inverse() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::square() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_square_op)
-Cwise<ExpressionType>::square() const
-{
-  return _expression();
-}
-
-/** \deprecated ArrayBase::cube() */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_UNOP_RETURN_TYPE(internal::scalar_cube_op)
-Cwise<ExpressionType>::cube() const
-{
-  return _expression();
-}
-
-
-// -- binary operators --
-
-/** \deprecated ArrayBase::operator<() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::less)
-Cwise<ExpressionType>::operator<(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::less)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::<=() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::less_equal)
-Cwise<ExpressionType>::operator<=(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::less_equal)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::operator>() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater)
-Cwise<ExpressionType>::operator>(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::operator>=() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater_equal)
-Cwise<ExpressionType>::operator>=(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::greater_equal)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::operator==() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::equal_to)
-Cwise<ExpressionType>::operator==(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::equal_to)(_expression(), other.derived());
-}
-
-/** \deprecated ArrayBase::operator!=() */
-template<typename ExpressionType>
-template<typename OtherDerived>
-inline const EIGEN_CWISE_BINOP_RETURN_TYPE(std::not_equal_to)
-Cwise<ExpressionType>::operator!=(const MatrixBase<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_BINOP_RETURN_TYPE(std::not_equal_to)(_expression(), other.derived());
-}
-
-// comparisons to scalar value
-
-/** \deprecated ArrayBase::operator<(Scalar) */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less)
-Cwise<ExpressionType>::operator<(Scalar s) const
-{
-  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less)(_expression(),
-            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
-}
-
-/** \deprecated ArrayBase::operator<=(Scalar) */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less_equal)
-Cwise<ExpressionType>::operator<=(Scalar s) const
-{
-  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::less_equal)(_expression(),
-            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
-}
-
-/** \deprecated ArrayBase::operator>(Scalar) */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater)
-Cwise<ExpressionType>::operator>(Scalar s) const
-{
-  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater)(_expression(),
-            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
-}
-
-/** \deprecated ArrayBase::operator>=(Scalar) */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater_equal)
-Cwise<ExpressionType>::operator>=(Scalar s) const
-{
-  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::greater_equal)(_expression(),
-            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
-}
-
-/** \deprecated ArrayBase::operator==(Scalar) */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::equal_to)
-Cwise<ExpressionType>::operator==(Scalar s) const
-{
-  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::equal_to)(_expression(),
-            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
-}
-
-/** \deprecated ArrayBase::operator!=(Scalar) */
-template<typename ExpressionType>
-inline const EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::not_equal_to)
-Cwise<ExpressionType>::operator!=(Scalar s) const
-{
-  return EIGEN_CWISE_COMP_TO_SCALAR_RETURN_TYPE(std::not_equal_to)(_expression(),
-            typename ExpressionType::ConstantReturnType(_expression().rows(), _expression().cols(), s));
-}
-
-// scalar addition
-
-/** \deprecated ArrayBase::operator+(Scalar) */
-template<typename ExpressionType>
-inline const typename Cwise<ExpressionType>::ScalarAddReturnType
-Cwise<ExpressionType>::operator+(const Scalar& scalar) const
-{
-  return typename Cwise<ExpressionType>::ScalarAddReturnType(m_matrix, internal::scalar_add_op<Scalar>(scalar));
-}
-
-/** \deprecated ArrayBase::operator+=(Scalar) */
-template<typename ExpressionType>
-inline ExpressionType& Cwise<ExpressionType>::operator+=(const Scalar& scalar)
-{
-  return m_matrix.const_cast_derived() = *this + scalar;
-}
-
-/** \deprecated ArrayBase::operator-(Scalar) */
-template<typename ExpressionType>
-inline const typename Cwise<ExpressionType>::ScalarAddReturnType
-Cwise<ExpressionType>::operator-(const Scalar& scalar) const
-{
-  return *this + (-scalar);
-}
-
-/** \deprecated ArrayBase::operator-=(Scalar) */
-template<typename ExpressionType>
-inline ExpressionType& Cwise<ExpressionType>::operator-=(const Scalar& scalar)
-{
-  return m_matrix.const_cast_derived() = *this - scalar;
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_ARRAY_CWISE_OPERATORS_H
diff --git a/inst/include/Eigen/src/Eigen2Support/Geometry/AlignedBox.h b/inst/include/Eigen/src/Eigen2Support/Geometry/AlignedBox.h
deleted file mode 100644
index 2e4309dd..00000000
--- a/inst/include/Eigen/src/Eigen2Support/Geometry/AlignedBox.h
+++ /dev/null
@@ -1,159 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  * \nonstableyet
-  *
-  * \class AlignedBox
-  *
-  * \brief An axis aligned box
-  *
-  * \param _Scalar the type of the scalar coefficients
-  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
-  *
-  * This class represents an axis aligned box as a pair of the minimal and maximal corners.
-  */
-template <typename _Scalar, int _AmbientDim>
-class AlignedBox
-{
-public:
-EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim==Dynamic ? Dynamic : _AmbientDim+1)
-  enum { AmbientDimAtCompileTime = _AmbientDim };
-  typedef _Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar,AmbientDimAtCompileTime,1> VectorType;
-
-  /** Default constructor initializing a null box. */
-  inline AlignedBox()
-  { if (AmbientDimAtCompileTime!=Dynamic) setNull(); }
-
-  /** Constructs a null box with \a _dim the dimension of the ambient space. */
-  inline explicit AlignedBox(int _dim) : m_min(_dim), m_max(_dim)
-  { setNull(); }
-
-  /** Constructs a box with extremities \a _min and \a _max. */
-  inline AlignedBox(const VectorType& _min, const VectorType& _max) : m_min(_min), m_max(_max) {}
-
-  /** Constructs a box containing a single point \a p. */
-  inline explicit AlignedBox(const VectorType& p) : m_min(p), m_max(p) {}
-
-  ~AlignedBox() {}
-
-  /** \returns the dimension in which the box holds */
-  inline int dim() const { return AmbientDimAtCompileTime==Dynamic ? m_min.size()-1 : AmbientDimAtCompileTime; }
-
-  /** \returns true if the box is null, i.e, empty. */
-  inline bool isNull() const { return (m_min.cwise() > m_max).any(); }
-
-  /** Makes \c *this a null/empty box. */
-  inline void setNull()
-  {
-    m_min.setConstant( (std::numeric_limits<Scalar>::max)());
-    m_max.setConstant(-(std::numeric_limits<Scalar>::max)());
-  }
-
-  /** \returns the minimal corner */
-  inline const VectorType& (min)() const { return m_min; }
-  /** \returns a non const reference to the minimal corner */
-  inline VectorType& (min)() { return m_min; }
-  /** \returns the maximal corner */
-  inline const VectorType& (max)() const { return m_max; }
-  /** \returns a non const reference to the maximal corner */
-  inline VectorType& (max)() { return m_max; }
-
-  /** \returns true if the point \a p is inside the box \c *this. */
-  inline bool contains(const VectorType& p) const
-  { return (m_min.cwise()<=p).all() && (p.cwise()<=m_max).all(); }
-
-  /** \returns true if the box \a b is entirely inside the box \c *this. */
-  inline bool contains(const AlignedBox& b) const
-  { return (m_min.cwise()<=(b.min)()).all() && ((b.max)().cwise()<=m_max).all(); }
-
-  /** Extends \c *this such that it contains the point \a p and returns a reference to \c *this. */
-  inline AlignedBox& extend(const VectorType& p)
-  { m_min = (m_min.cwise().min)(p); m_max = (m_max.cwise().max)(p); return *this; }
-
-  /** Extends \c *this such that it contains the box \a b and returns a reference to \c *this. */
-  inline AlignedBox& extend(const AlignedBox& b)
-  { m_min = (m_min.cwise().min)(b.m_min); m_max = (m_max.cwise().max)(b.m_max); return *this; }
-
-  /** Clamps \c *this by the box \a b and returns a reference to \c *this. */
-  inline AlignedBox& clamp(const AlignedBox& b)
-  { m_min = (m_min.cwise().max)(b.m_min); m_max = (m_max.cwise().min)(b.m_max); return *this; }
-
-  /** Translate \c *this by the vector \a t and returns a reference to \c *this. */
-  inline AlignedBox& translate(const VectorType& t)
-  { m_min += t; m_max += t; return *this; }
-
-  /** \returns the squared distance between the point \a p and the box \c *this,
-    * and zero if \a p is inside the box.
-    * \sa exteriorDistance()
-    */
-  inline Scalar squaredExteriorDistance(const VectorType& p) const;
-
-  /** \returns the distance between the point \a p and the box \c *this,
-    * and zero if \a p is inside the box.
-    * \sa squaredExteriorDistance()
-    */
-  inline Scalar exteriorDistance(const VectorType& p) const
-  { return ei_sqrt(squaredExteriorDistance(p)); }
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<AlignedBox,
-           AlignedBox<NewScalarType,AmbientDimAtCompileTime> >::type cast() const
-  {
-    return typename internal::cast_return_type<AlignedBox,
-                    AlignedBox<NewScalarType,AmbientDimAtCompileTime> >::type(*this);
-  }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit AlignedBox(const AlignedBox<OtherScalarType,AmbientDimAtCompileTime>& other)
-  {
-    m_min = (other.min)().template cast<Scalar>();
-    m_max = (other.max)().template cast<Scalar>();
-  }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const AlignedBox& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_min.isApprox(other.m_min, prec) && m_max.isApprox(other.m_max, prec); }
-
-protected:
-
-  VectorType m_min, m_max;
-};
-
-template<typename Scalar,int AmbiantDim>
-inline Scalar AlignedBox<Scalar,AmbiantDim>::squaredExteriorDistance(const VectorType& p) const
-{
-  Scalar dist2(0);
-  Scalar aux;
-  for (int k=0; k<dim(); ++k)
-  {
-    if ((aux = (p[k]-m_min[k]))<Scalar(0))
-      dist2 += aux*aux;
-    else if ( (aux = (m_max[k]-p[k]))<Scalar(0))
-      dist2 += aux*aux;
-  }
-  return dist2;
-}
-
-} // end namespace Eigen
diff --git a/inst/include/Eigen/src/Eigen2Support/Geometry/All.h b/inst/include/Eigen/src/Eigen2Support/Geometry/All.h
deleted file mode 100644
index e0b00fcc..00000000
--- a/inst/include/Eigen/src/Eigen2Support/Geometry/All.h
+++ /dev/null
@@ -1,115 +0,0 @@
-#ifndef EIGEN2_GEOMETRY_MODULE_H
-#define EIGEN2_GEOMETRY_MODULE_H
-
-#include <limits>
-
-#ifndef M_PI
-#define M_PI 3.14159265358979323846
-#endif
-
-#if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
-#include "RotationBase.h"
-#include "Rotation2D.h"
-#include "Quaternion.h"
-#include "AngleAxis.h"
-#include "Transform.h"
-#include "Translation.h"
-#include "Scaling.h"
-#include "AlignedBox.h"
-#include "Hyperplane.h"
-#include "ParametrizedLine.h"
-#endif
-
-
-#define RotationBase eigen2_RotationBase
-#define Rotation2D eigen2_Rotation2D
-#define Rotation2Df eigen2_Rotation2Df
-#define Rotation2Dd eigen2_Rotation2Dd
-
-#define Quaternion  eigen2_Quaternion
-#define Quaternionf eigen2_Quaternionf
-#define Quaterniond eigen2_Quaterniond
-
-#define AngleAxis eigen2_AngleAxis
-#define AngleAxisf eigen2_AngleAxisf
-#define AngleAxisd eigen2_AngleAxisd
-
-#define Transform   eigen2_Transform
-#define Transform2f eigen2_Transform2f
-#define Transform2d eigen2_Transform2d
-#define Transform3f eigen2_Transform3f
-#define Transform3d eigen2_Transform3d
-
-#define Translation eigen2_Translation
-#define Translation2f eigen2_Translation2f
-#define Translation2d eigen2_Translation2d
-#define Translation3f eigen2_Translation3f
-#define Translation3d eigen2_Translation3d
-
-#define Scaling eigen2_Scaling
-#define Scaling2f eigen2_Scaling2f
-#define Scaling2d eigen2_Scaling2d
-#define Scaling3f eigen2_Scaling3f
-#define Scaling3d eigen2_Scaling3d
-
-#define AlignedBox eigen2_AlignedBox
-
-#define Hyperplane eigen2_Hyperplane
-#define ParametrizedLine eigen2_ParametrizedLine
-
-#define ei_toRotationMatrix eigen2_ei_toRotationMatrix
-#define ei_quaternion_assign_impl eigen2_ei_quaternion_assign_impl
-#define ei_transform_product_impl eigen2_ei_transform_product_impl
-
-#include "RotationBase.h"
-#include "Rotation2D.h"
-#include "Quaternion.h"
-#include "AngleAxis.h"
-#include "Transform.h"
-#include "Translation.h"
-#include "Scaling.h"
-#include "AlignedBox.h"
-#include "Hyperplane.h"
-#include "ParametrizedLine.h"
-
-#undef ei_toRotationMatrix
-#undef ei_quaternion_assign_impl
-#undef ei_transform_product_impl
-
-#undef RotationBase
-#undef Rotation2D
-#undef Rotation2Df
-#undef Rotation2Dd
-
-#undef Quaternion
-#undef Quaternionf
-#undef Quaterniond
-
-#undef AngleAxis
-#undef AngleAxisf
-#undef AngleAxisd
-
-#undef Transform
-#undef Transform2f
-#undef Transform2d
-#undef Transform3f
-#undef Transform3d
-
-#undef Translation
-#undef Translation2f
-#undef Translation2d
-#undef Translation3f
-#undef Translation3d
-
-#undef Scaling
-#undef Scaling2f
-#undef Scaling2d
-#undef Scaling3f
-#undef Scaling3d
-
-#undef AlignedBox
-
-#undef Hyperplane
-#undef ParametrizedLine
-
-#endif // EIGEN2_GEOMETRY_MODULE_H
diff --git a/inst/include/Eigen/src/Eigen2Support/Geometry/AngleAxis.h b/inst/include/Eigen/src/Eigen2Support/Geometry/AngleAxis.h
deleted file mode 100644
index af598a40..00000000
--- a/inst/include/Eigen/src/Eigen2Support/Geometry/AngleAxis.h
+++ /dev/null
@@ -1,214 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class AngleAxis
-  *
-  * \brief Represents a 3D rotation as a rotation angle around an arbitrary 3D axis
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients.
-  *
-  * The following two typedefs are provided for convenience:
-  * \li \c AngleAxisf for \c float
-  * \li \c AngleAxisd for \c double
-  *
-  * \addexample AngleAxisForEuler \label How to define a rotation from Euler-angles
-  *
-  * Combined with MatrixBase::Unit{X,Y,Z}, AngleAxis can be used to easily
-  * mimic Euler-angles. Here is an example:
-  * \include AngleAxis_mimic_euler.cpp
-  * Output: \verbinclude AngleAxis_mimic_euler.out
-  *
-  * \note This class is not aimed to be used to store a rotation transformation,
-  * but rather to make easier the creation of other rotation (Quaternion, rotation Matrix)
-  * and transformation objects.
-  *
-  * \sa class Quaternion, class Transform, MatrixBase::UnitX()
-  */
-
-template<typename _Scalar> struct ei_traits<AngleAxis<_Scalar> >
-{
-  typedef _Scalar Scalar;
-};
-
-template<typename _Scalar>
-class AngleAxis : public RotationBase<AngleAxis<_Scalar>,3>
-{
-  typedef RotationBase<AngleAxis<_Scalar>,3> Base;
-
-public:
-
-  using Base::operator*;
-
-  enum { Dim = 3 };
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-  typedef Matrix<Scalar,3,3> Matrix3;
-  typedef Matrix<Scalar,3,1> Vector3;
-  typedef Quaternion<Scalar> QuaternionType;
-
-protected:
-
-  Vector3 m_axis;
-  Scalar m_angle;
-
-public:
-
-  /** Default constructor without initialization. */
-  AngleAxis() {}
-  /** Constructs and initialize the angle-axis rotation from an \a angle in radian
-    * and an \a axis which must be normalized. */
-  template<typename Derived>
-  inline AngleAxis(Scalar angle, const MatrixBase<Derived>& axis) : m_axis(axis), m_angle(angle) {}
-  /** Constructs and initialize the angle-axis rotation from a quaternion \a q. */
-  inline AngleAxis(const QuaternionType& q) { *this = q; }
-  /** Constructs and initialize the angle-axis rotation from a 3x3 rotation matrix. */
-  template<typename Derived>
-  inline explicit AngleAxis(const MatrixBase<Derived>& m) { *this = m; }
-
-  Scalar angle() const { return m_angle; }
-  Scalar& angle() { return m_angle; }
-
-  const Vector3& axis() const { return m_axis; }
-  Vector3& axis() { return m_axis; }
-
-  /** Concatenates two rotations */
-  inline QuaternionType operator* (const AngleAxis& other) const
-  { return QuaternionType(*this) * QuaternionType(other); }
-
-  /** Concatenates two rotations */
-  inline QuaternionType operator* (const QuaternionType& other) const
-  { return QuaternionType(*this) * other; }
-
-  /** Concatenates two rotations */
-  friend inline QuaternionType operator* (const QuaternionType& a, const AngleAxis& b)
-  { return a * QuaternionType(b); }
-
-  /** Concatenates two rotations */
-  inline Matrix3 operator* (const Matrix3& other) const
-  { return toRotationMatrix() * other; }
-
-  /** Concatenates two rotations */
-  inline friend Matrix3 operator* (const Matrix3& a, const AngleAxis& b)
-  { return a * b.toRotationMatrix(); }
-
-  /** Applies rotation to vector */
-  inline Vector3 operator* (const Vector3& other) const
-  { return toRotationMatrix() * other; }
-
-  /** \returns the inverse rotation, i.e., an angle-axis with opposite rotation angle */
-  AngleAxis inverse() const
-  { return AngleAxis(-m_angle, m_axis); }
-
-  AngleAxis& operator=(const QuaternionType& q);
-  template<typename Derived>
-  AngleAxis& operator=(const MatrixBase<Derived>& m);
-
-  template<typename Derived>
-  AngleAxis& fromRotationMatrix(const MatrixBase<Derived>& m);
-  Matrix3 toRotationMatrix(void) const;
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<AngleAxis,AngleAxis<NewScalarType> >::type cast() const
-  { return typename internal::cast_return_type<AngleAxis,AngleAxis<NewScalarType> >::type(*this); }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit AngleAxis(const AngleAxis<OtherScalarType>& other)
-  {
-    m_axis = other.axis().template cast<Scalar>();
-    m_angle = Scalar(other.angle());
-  }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const AngleAxis& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_axis.isApprox(other.m_axis, prec) && ei_isApprox(m_angle,other.m_angle, prec); }
-};
-
-/** \ingroup Geometry_Module
-  * single precision angle-axis type */
-typedef AngleAxis<float> AngleAxisf;
-/** \ingroup Geometry_Module
-  * double precision angle-axis type */
-typedef AngleAxis<double> AngleAxisd;
-
-/** Set \c *this from a quaternion.
-  * The axis is normalized.
-  */
-template<typename Scalar>
-AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionType& q)
-{
-  Scalar n2 = q.vec().squaredNorm();
-  if (n2 < precision<Scalar>()*precision<Scalar>())
-  {
-    m_angle = 0;
-    m_axis << 1, 0, 0;
-  }
-  else
-  {
-    m_angle = 2*std::acos(q.w());
-    m_axis = q.vec() / ei_sqrt(n2);
-  }
-  return *this;
-}
-
-/** Set \c *this from a 3x3 rotation matrix \a mat.
-  */
-template<typename Scalar>
-template<typename Derived>
-AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const MatrixBase<Derived>& mat)
-{
-  // Since a direct conversion would not be really faster,
-  // let's use the robust Quaternion implementation:
-  return *this = QuaternionType(mat);
-}
-
-/** Constructs and \returns an equivalent 3x3 rotation matrix.
-  */
-template<typename Scalar>
-typename AngleAxis<Scalar>::Matrix3
-AngleAxis<Scalar>::toRotationMatrix(void) const
-{
-  Matrix3 res;
-  Vector3 sin_axis  = ei_sin(m_angle) * m_axis;
-  Scalar c = ei_cos(m_angle);
-  Vector3 cos1_axis = (Scalar(1)-c) * m_axis;
-
-  Scalar tmp;
-  tmp = cos1_axis.x() * m_axis.y();
-  res.coeffRef(0,1) = tmp - sin_axis.z();
-  res.coeffRef(1,0) = tmp + sin_axis.z();
-
-  tmp = cos1_axis.x() * m_axis.z();
-  res.coeffRef(0,2) = tmp + sin_axis.y();
-  res.coeffRef(2,0) = tmp - sin_axis.y();
-
-  tmp = cos1_axis.y() * m_axis.z();
-  res.coeffRef(1,2) = tmp - sin_axis.x();
-  res.coeffRef(2,1) = tmp + sin_axis.x();
-
-  res.diagonal() = (cos1_axis.cwise() * m_axis).cwise() + c;
-
-  return res;
-}
-
-} // end namespace Eigen
diff --git a/inst/include/Eigen/src/Eigen2Support/Geometry/Hyperplane.h b/inst/include/Eigen/src/Eigen2Support/Geometry/Hyperplane.h
deleted file mode 100644
index b95bf00e..00000000
--- a/inst/include/Eigen/src/Eigen2Support/Geometry/Hyperplane.h
+++ /dev/null
@@ -1,254 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Hyperplane
-  *
-  * \brief A hyperplane
-  *
-  * A hyperplane is an affine subspace of dimension n-1 in a space of dimension n.
-  * For example, a hyperplane in a plane is a line; a hyperplane in 3-space is a plane.
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
-  *             Notice that the dimension of the hyperplane is _AmbientDim-1.
-  *
-  * This class represents an hyperplane as the zero set of the implicit equation
-  * \f$ n \cdot x + d = 0 \f$ where \f$ n \f$ is a unit normal vector of the plane (linear part)
-  * and \f$ d \f$ is the distance (offset) to the origin.
-  */
-template <typename _Scalar, int _AmbientDim>
-class Hyperplane
-{
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim==Dynamic ? Dynamic : _AmbientDim+1)
-  enum { AmbientDimAtCompileTime = _AmbientDim };
-  typedef _Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar,AmbientDimAtCompileTime,1> VectorType;
-  typedef Matrix<Scalar,int(AmbientDimAtCompileTime)==Dynamic
-                        ? Dynamic
-                        : int(AmbientDimAtCompileTime)+1,1> Coefficients;
-  typedef Block<Coefficients,AmbientDimAtCompileTime,1> NormalReturnType;
-
-  /** Default constructor without initialization */
-  inline Hyperplane() {}
-
-  /** Constructs a dynamic-size hyperplane with \a _dim the dimension
-    * of the ambient space */
-  inline explicit Hyperplane(int _dim) : m_coeffs(_dim+1) {}
-
-  /** Construct a plane from its normal \a n and a point \a e onto the plane.
-    * \warning the vector normal is assumed to be normalized.
-    */
-  inline Hyperplane(const VectorType& n, const VectorType& e)
-    : m_coeffs(n.size()+1)
-  {
-    normal() = n;
-    offset() = -e.eigen2_dot(n);
-  }
-
-  /** Constructs a plane from its normal \a n and distance to the origin \a d
-    * such that the algebraic equation of the plane is \f$ n \cdot x + d = 0 \f$.
-    * \warning the vector normal is assumed to be normalized.
-    */
-  inline Hyperplane(const VectorType& n, Scalar d)
-    : m_coeffs(n.size()+1)
-  {
-    normal() = n;
-    offset() = d;
-  }
-
-  /** Constructs a hyperplane passing through the two points. If the dimension of the ambient space
-    * is greater than 2, then there isn't uniqueness, so an arbitrary choice is made.
-    */
-  static inline Hyperplane Through(const VectorType& p0, const VectorType& p1)
-  {
-    Hyperplane result(p0.size());
-    result.normal() = (p1 - p0).unitOrthogonal();
-    result.offset() = -result.normal().eigen2_dot(p0);
-    return result;
-  }
-
-  /** Constructs a hyperplane passing through the three points. The dimension of the ambient space
-    * is required to be exactly 3.
-    */
-  static inline Hyperplane Through(const VectorType& p0, const VectorType& p1, const VectorType& p2)
-  {
-    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 3)
-    Hyperplane result(p0.size());
-    result.normal() = (p2 - p0).cross(p1 - p0).normalized();
-    result.offset() = -result.normal().eigen2_dot(p0);
-    return result;
-  }
-
-  /** Constructs a hyperplane passing through the parametrized line \a parametrized.
-    * If the dimension of the ambient space is greater than 2, then there isn't uniqueness,
-    * so an arbitrary choice is made.
-    */
-  // FIXME to be consitent with the rest this could be implemented as a static Through function ??
-  explicit Hyperplane(const ParametrizedLine<Scalar, AmbientDimAtCompileTime>& parametrized)
-  {
-    normal() = parametrized.direction().unitOrthogonal();
-    offset() = -normal().eigen2_dot(parametrized.origin());
-  }
-
-  ~Hyperplane() {}
-
-  /** \returns the dimension in which the plane holds */
-  inline int dim() const { return int(AmbientDimAtCompileTime)==Dynamic ? m_coeffs.size()-1 : int(AmbientDimAtCompileTime); }
-
-  /** normalizes \c *this */
-  void normalize(void)
-  {
-    m_coeffs /= normal().norm();
-  }
-
-  /** \returns the signed distance between the plane \c *this and a point \a p.
-    * \sa absDistance()
-    */
-  inline Scalar signedDistance(const VectorType& p) const { return p.eigen2_dot(normal()) + offset(); }
-
-  /** \returns the absolute distance between the plane \c *this and a point \a p.
-    * \sa signedDistance()
-    */
-  inline Scalar absDistance(const VectorType& p) const { return ei_abs(signedDistance(p)); }
-
-  /** \returns the projection of a point \a p onto the plane \c *this.
-    */
-  inline VectorType projection(const VectorType& p) const { return p - signedDistance(p) * normal(); }
-
-  /** \returns a constant reference to the unit normal vector of the plane, which corresponds
-    * to the linear part of the implicit equation.
-    */
-  inline const NormalReturnType normal() const { return NormalReturnType(*const_cast<Coefficients*>(&m_coeffs),0,0,dim(),1); }
-
-  /** \returns a non-constant reference to the unit normal vector of the plane, which corresponds
-    * to the linear part of the implicit equation.
-    */
-  inline NormalReturnType normal() { return NormalReturnType(m_coeffs,0,0,dim(),1); }
-
-  /** \returns the distance to the origin, which is also the "constant term" of the implicit equation
-    * \warning the vector normal is assumed to be normalized.
-    */
-  inline const Scalar& offset() const { return m_coeffs.coeff(dim()); }
-
-  /** \returns a non-constant reference to the distance to the origin, which is also the constant part
-    * of the implicit equation */
-  inline Scalar& offset() { return m_coeffs(dim()); }
-
-  /** \returns a constant reference to the coefficients c_i of the plane equation:
-    * \f$ c_0*x_0 + ... + c_{d-1}*x_{d-1} + c_d = 0 \f$
-    */
-  inline const Coefficients& coeffs() const { return m_coeffs; }
-
-  /** \returns a non-constant reference to the coefficients c_i of the plane equation:
-    * \f$ c_0*x_0 + ... + c_{d-1}*x_{d-1} + c_d = 0 \f$
-    */
-  inline Coefficients& coeffs() { return m_coeffs; }
-
-  /** \returns the intersection of *this with \a other.
-    *
-    * \warning The ambient space must be a plane, i.e. have dimension 2, so that \c *this and \a other are lines.
-    *
-    * \note If \a other is approximately parallel to *this, this method will return any point on *this.
-    */
-  VectorType intersection(const Hyperplane& other)
-  {
-    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 2)
-    Scalar det = coeffs().coeff(0) * other.coeffs().coeff(1) - coeffs().coeff(1) * other.coeffs().coeff(0);
-    // since the line equations ax+by=c are normalized with a^2+b^2=1, the following tests
-    // whether the two lines are approximately parallel.
-    if(ei_isMuchSmallerThan(det, Scalar(1)))
-    {   // special case where the two lines are approximately parallel. Pick any point on the first line.
-        if(ei_abs(coeffs().coeff(1))>ei_abs(coeffs().coeff(0)))
-            return VectorType(coeffs().coeff(1), -coeffs().coeff(2)/coeffs().coeff(1)-coeffs().coeff(0));
-        else
-            return VectorType(-coeffs().coeff(2)/coeffs().coeff(0)-coeffs().coeff(1), coeffs().coeff(0));
-    }
-    else
-    {   // general case
-        Scalar invdet = Scalar(1) / det;
-        return VectorType(invdet*(coeffs().coeff(1)*other.coeffs().coeff(2)-other.coeffs().coeff(1)*coeffs().coeff(2)),
-                          invdet*(other.coeffs().coeff(0)*coeffs().coeff(2)-coeffs().coeff(0)*other.coeffs().coeff(2)));
-    }
-  }
-
-  /** Applies the transformation matrix \a mat to \c *this and returns a reference to \c *this.
-    *
-    * \param mat the Dim x Dim transformation matrix
-    * \param traits specifies whether the matrix \a mat represents an Isometry
-    *               or a more generic Affine transformation. The default is Affine.
-    */
-  template<typename XprType>
-  inline Hyperplane& transform(const MatrixBase<XprType>& mat, TransformTraits traits = Affine)
-  {
-    if (traits==Affine)
-      normal() = mat.inverse().transpose() * normal();
-    else if (traits==Isometry)
-      normal() = mat * normal();
-    else
-    {
-      ei_assert("invalid traits value in Hyperplane::transform()");
-    }
-    return *this;
-  }
-
-  /** Applies the transformation \a t to \c *this and returns a reference to \c *this.
-    *
-    * \param t the transformation of dimension Dim
-    * \param traits specifies whether the transformation \a t represents an Isometry
-    *               or a more generic Affine transformation. The default is Affine.
-    *               Other kind of transformations are not supported.
-    */
-  inline Hyperplane& transform(const Transform<Scalar,AmbientDimAtCompileTime>& t,
-                                TransformTraits traits = Affine)
-  {
-    transform(t.linear(), traits);
-    offset() -= t.translation().eigen2_dot(normal());
-    return *this;
-  }
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Hyperplane,
-           Hyperplane<NewScalarType,AmbientDimAtCompileTime> >::type cast() const
-  {
-    return typename internal::cast_return_type<Hyperplane,
-                    Hyperplane<NewScalarType,AmbientDimAtCompileTime> >::type(*this);
-  }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Hyperplane(const Hyperplane<OtherScalarType,AmbientDimAtCompileTime>& other)
-  { m_coeffs = other.coeffs().template cast<Scalar>(); }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Hyperplane& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_coeffs.isApprox(other.m_coeffs, prec); }
-
-protected:
-
-  Coefficients m_coeffs;
-};
-
-} // end namespace Eigen
diff --git a/inst/include/Eigen/src/Eigen2Support/Geometry/ParametrizedLine.h b/inst/include/Eigen/src/Eigen2Support/Geometry/ParametrizedLine.h
deleted file mode 100644
index 9b57b7e0..00000000
--- a/inst/include/Eigen/src/Eigen2Support/Geometry/ParametrizedLine.h
+++ /dev/null
@@ -1,141 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class ParametrizedLine
-  *
-  * \brief A parametrized line
-  *
-  * A parametrized line is defined by an origin point \f$ \mathbf{o} \f$ and a unit
-  * direction vector \f$ \mathbf{d} \f$ such that the line corresponds to
-  * the set \f$ l(t) = \mathbf{o} + t \mathbf{d} \f$, \f$ l \in \mathbf{R} \f$.
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
-  */
-template <typename _Scalar, int _AmbientDim>
-class ParametrizedLine
-{
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
-  enum { AmbientDimAtCompileTime = _AmbientDim };
-  typedef _Scalar Scalar;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef Matrix<Scalar,AmbientDimAtCompileTime,1> VectorType;
-
-  /** Default constructor without initialization */
-  inline ParametrizedLine() {}
-
-  /** Constructs a dynamic-size line with \a _dim the dimension
-    * of the ambient space */
-  inline explicit ParametrizedLine(int _dim) : m_origin(_dim), m_direction(_dim) {}
-
-  /** Initializes a parametrized line of direction \a direction and origin \a origin.
-    * \warning the vector direction is assumed to be normalized.
-    */
-  ParametrizedLine(const VectorType& origin, const VectorType& direction)
-    : m_origin(origin), m_direction(direction) {}
-
-  explicit ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim>& hyperplane);
-
-  /** Constructs a parametrized line going from \a p0 to \a p1. */
-  static inline ParametrizedLine Through(const VectorType& p0, const VectorType& p1)
-  { return ParametrizedLine(p0, (p1-p0).normalized()); }
-
-  ~ParametrizedLine() {}
-
-  /** \returns the dimension in which the line holds */
-  inline int dim() const { return m_direction.size(); }
-
-  const VectorType& origin() const { return m_origin; }
-  VectorType& origin() { return m_origin; }
-
-  const VectorType& direction() const { return m_direction; }
-  VectorType& direction() { return m_direction; }
-
-  /** \returns the squared distance of a point \a p to its projection onto the line \c *this.
-    * \sa distance()
-    */
-  RealScalar squaredDistance(const VectorType& p) const
-  {
-    VectorType diff = p-origin();
-    return (diff - diff.eigen2_dot(direction())* direction()).squaredNorm();
-  }
-  /** \returns the distance of a point \a p to its projection onto the line \c *this.
-    * \sa squaredDistance()
-    */
-  RealScalar distance(const VectorType& p) const { return ei_sqrt(squaredDistance(p)); }
-
-  /** \returns the projection of a point \a p onto the line \c *this. */
-  VectorType projection(const VectorType& p) const
-  { return origin() + (p-origin()).eigen2_dot(direction()) * direction(); }
-
-  Scalar intersection(const Hyperplane<_Scalar, _AmbientDim>& hyperplane);
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<ParametrizedLine,
-           ParametrizedLine<NewScalarType,AmbientDimAtCompileTime> >::type cast() const
-  {
-    return typename internal::cast_return_type<ParametrizedLine,
-                    ParametrizedLine<NewScalarType,AmbientDimAtCompileTime> >::type(*this);
-  }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit ParametrizedLine(const ParametrizedLine<OtherScalarType,AmbientDimAtCompileTime>& other)
-  {
-    m_origin = other.origin().template cast<Scalar>();
-    m_direction = other.direction().template cast<Scalar>();
-  }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const ParametrizedLine& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_origin.isApprox(other.m_origin, prec) && m_direction.isApprox(other.m_direction, prec); }
-
-protected:
-
-  VectorType m_origin, m_direction;
-};
-
-/** Constructs a parametrized line from a 2D hyperplane
-  *
-  * \warning the ambient space must have dimension 2 such that the hyperplane actually describes a line
-  */
-template <typename _Scalar, int _AmbientDim>
-inline ParametrizedLine<_Scalar, _AmbientDim>::ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim>& hyperplane)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 2)
-  direction() = hyperplane.normal().unitOrthogonal();
-  origin() = -hyperplane.normal()*hyperplane.offset();
-}
-
-/** \returns the parameter value of the intersection between \c *this and the given hyperplane
-  */
-template <typename _Scalar, int _AmbientDim>
-inline _Scalar ParametrizedLine<_Scalar, _AmbientDim>::intersection(const Hyperplane<_Scalar, _AmbientDim>& hyperplane)
-{
-  return -(hyperplane.offset()+origin().eigen2_dot(hyperplane.normal()))
-          /(direction().eigen2_dot(hyperplane.normal()));
-}
-
-} // end namespace Eigen
diff --git a/inst/include/Eigen/src/Eigen2Support/Geometry/Quaternion.h b/inst/include/Eigen/src/Eigen2Support/Geometry/Quaternion.h
deleted file mode 100644
index 4b6390cf..00000000
--- a/inst/include/Eigen/src/Eigen2Support/Geometry/Quaternion.h
+++ /dev/null
@@ -1,495 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-template<typename Other,
-         int OtherRows=Other::RowsAtCompileTime,
-         int OtherCols=Other::ColsAtCompileTime>
-struct ei_quaternion_assign_impl;
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Quaternion
-  *
-  * \brief The quaternion class used to represent 3D orientations and rotations
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  *
-  * This class represents a quaternion \f$ w+xi+yj+zk \f$ that is a convenient representation of
-  * orientations and rotations of objects in three dimensions. Compared to other representations
-  * like Euler angles or 3x3 matrices, quatertions offer the following advantages:
-  * \li \b compact storage (4 scalars)
-  * \li \b efficient to compose (28 flops),
-  * \li \b stable spherical interpolation
-  *
-  * The following two typedefs are provided for convenience:
-  * \li \c Quaternionf for \c float
-  * \li \c Quaterniond for \c double
-  *
-  * \sa  class AngleAxis, class Transform
-  */
-
-template<typename _Scalar> struct ei_traits<Quaternion<_Scalar> >
-{
-  typedef _Scalar Scalar;
-};
-
-template<typename _Scalar>
-class Quaternion : public RotationBase<Quaternion<_Scalar>,3>
-{
-  typedef RotationBase<Quaternion<_Scalar>,3> Base;
-
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,4)
-
-  using Base::operator*;
-
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-
-  /** the type of the Coefficients 4-vector */
-  typedef Matrix<Scalar, 4, 1> Coefficients;
-  /** the type of a 3D vector */
-  typedef Matrix<Scalar,3,1> Vector3;
-  /** the equivalent rotation matrix type */
-  typedef Matrix<Scalar,3,3> Matrix3;
-  /** the equivalent angle-axis type */
-  typedef AngleAxis<Scalar> AngleAxisType;
-
-  /** \returns the \c x coefficient */
-  inline Scalar x() const { return m_coeffs.coeff(0); }
-  /** \returns the \c y coefficient */
-  inline Scalar y() const { return m_coeffs.coeff(1); }
-  /** \returns the \c z coefficient */
-  inline Scalar z() const { return m_coeffs.coeff(2); }
-  /** \returns the \c w coefficient */
-  inline Scalar w() const { return m_coeffs.coeff(3); }
-
-  /** \returns a reference to the \c x coefficient */
-  inline Scalar& x() { return m_coeffs.coeffRef(0); }
-  /** \returns a reference to the \c y coefficient */
-  inline Scalar& y() { return m_coeffs.coeffRef(1); }
-  /** \returns a reference to the \c z coefficient */
-  inline Scalar& z() { return m_coeffs.coeffRef(2); }
-  /** \returns a reference to the \c w coefficient */
-  inline Scalar& w() { return m_coeffs.coeffRef(3); }
-
-  /** \returns a read-only vector expression of the imaginary part (x,y,z) */
-  inline const Block<const Coefficients,3,1> vec() const { return m_coeffs.template start<3>(); }
-
-  /** \returns a vector expression of the imaginary part (x,y,z) */
-  inline Block<Coefficients,3,1> vec() { return m_coeffs.template start<3>(); }
-
-  /** \returns a read-only vector expression of the coefficients (x,y,z,w) */
-  inline const Coefficients& coeffs() const { return m_coeffs; }
-
-  /** \returns a vector expression of the coefficients (x,y,z,w) */
-  inline Coefficients& coeffs() { return m_coeffs; }
-
-  /** Default constructor leaving the quaternion uninitialized. */
-  inline Quaternion() {}
-
-  /** Constructs and initializes the quaternion \f$ w+xi+yj+zk \f$ from
-    * its four coefficients \a w, \a x, \a y and \a z.
-    *
-    * \warning Note the order of the arguments: the real \a w coefficient first,
-    * while internally the coefficients are stored in the following order:
-    * [\c x, \c y, \c z, \c w]
-    */
-  inline Quaternion(Scalar w, Scalar x, Scalar y, Scalar z)
-  { m_coeffs << x, y, z, w; }
-
-  /** Copy constructor */
-  inline Quaternion(const Quaternion& other) { m_coeffs = other.m_coeffs; }
-
-  /** Constructs and initializes a quaternion from the angle-axis \a aa */
-  explicit inline Quaternion(const AngleAxisType& aa) { *this = aa; }
-
-  /** Constructs and initializes a quaternion from either:
-    *  - a rotation matrix expression,
-    *  - a 4D vector expression representing quaternion coefficients.
-    * \sa operator=(MatrixBase<Derived>)
-    */
-  template<typename Derived>
-  explicit inline Quaternion(const MatrixBase<Derived>& other) { *this = other; }
-
-  Quaternion& operator=(const Quaternion& other);
-  Quaternion& operator=(const AngleAxisType& aa);
-  template<typename Derived>
-  Quaternion& operator=(const MatrixBase<Derived>& m);
-
-  /** \returns a quaternion representing an identity rotation
-    * \sa MatrixBase::Identity()
-    */
-  static inline Quaternion Identity() { return Quaternion(1, 0, 0, 0); }
-
-  /** \sa Quaternion::Identity(), MatrixBase::setIdentity()
-    */
-  inline Quaternion& setIdentity() { m_coeffs << 0, 0, 0, 1; return *this; }
-
-  /** \returns the squared norm of the quaternion's coefficients
-    * \sa Quaternion::norm(), MatrixBase::squaredNorm()
-    */
-  inline Scalar squaredNorm() const { return m_coeffs.squaredNorm(); }
-
-  /** \returns the norm of the quaternion's coefficients
-    * \sa Quaternion::squaredNorm(), MatrixBase::norm()
-    */
-  inline Scalar norm() const { return m_coeffs.norm(); }
-
-  /** Normalizes the quaternion \c *this
-    * \sa normalized(), MatrixBase::normalize() */
-  inline void normalize() { m_coeffs.normalize(); }
-  /** \returns a normalized version of \c *this
-    * \sa normalize(), MatrixBase::normalized() */
-  inline Quaternion normalized() const { return Quaternion(m_coeffs.normalized()); }
-
-  /** \returns the dot product of \c *this and \a other
-    * Geometrically speaking, the dot product of two unit quaternions
-    * corresponds to the cosine of half the angle between the two rotations.
-    * \sa angularDistance()
-    */
-  inline Scalar eigen2_dot(const Quaternion& other) const { return m_coeffs.eigen2_dot(other.m_coeffs); }
-
-  inline Scalar angularDistance(const Quaternion& other) const;
-
-  Matrix3 toRotationMatrix(void) const;
-
-  template<typename Derived1, typename Derived2>
-  Quaternion& setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
-
-  inline Quaternion operator* (const Quaternion& q) const;
-  inline Quaternion& operator*= (const Quaternion& q);
-
-  Quaternion inverse(void) const;
-  Quaternion conjugate(void) const;
-
-  Quaternion slerp(Scalar t, const Quaternion& other) const;
-
-  template<typename Derived>
-  Vector3 operator* (const MatrixBase<Derived>& vec) const;
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Quaternion,Quaternion<NewScalarType> >::type cast() const
-  { return typename internal::cast_return_type<Quaternion,Quaternion<NewScalarType> >::type(*this); }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Quaternion(const Quaternion<OtherScalarType>& other)
-  { m_coeffs = other.coeffs().template cast<Scalar>(); }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Quaternion& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_coeffs.isApprox(other.m_coeffs, prec); }
-
-protected:
-  Coefficients m_coeffs;
-};
-
-/** \ingroup Geometry_Module
-  * single precision quaternion type */
-typedef Quaternion<float> Quaternionf;
-/** \ingroup Geometry_Module
-  * double precision quaternion type */
-typedef Quaternion<double> Quaterniond;
-
-// Generic Quaternion * Quaternion product
-template<typename Scalar> inline Quaternion<Scalar>
-ei_quaternion_product(const Quaternion<Scalar>& a, const Quaternion<Scalar>& b)
-{
-  return Quaternion<Scalar>
-  (
-    a.w() * b.w() - a.x() * b.x() - a.y() * b.y() - a.z() * b.z(),
-    a.w() * b.x() + a.x() * b.w() + a.y() * b.z() - a.z() * b.y(),
-    a.w() * b.y() + a.y() * b.w() + a.z() * b.x() - a.x() * b.z(),
-    a.w() * b.z() + a.z() * b.w() + a.x() * b.y() - a.y() * b.x()
-  );
-}
-
-/** \returns the concatenation of two rotations as a quaternion-quaternion product */
-template <typename Scalar>
-inline Quaternion<Scalar> Quaternion<Scalar>::operator* (const Quaternion& other) const
-{
-  return ei_quaternion_product(*this,other);
-}
-
-/** \sa operator*(Quaternion) */
-template <typename Scalar>
-inline Quaternion<Scalar>& Quaternion<Scalar>::operator*= (const Quaternion& other)
-{
-  return (*this = *this * other);
-}
-
-/** Rotation of a vector by a quaternion.
-  * \remarks If the quaternion is used to rotate several points (>1)
-  * then it is much more efficient to first convert it to a 3x3 Matrix.
-  * Comparison of the operation cost for n transformations:
-  *   - Quaternion:    30n
-  *   - Via a Matrix3: 24 + 15n
-  */
-template <typename Scalar>
-template<typename Derived>
-inline typename Quaternion<Scalar>::Vector3
-Quaternion<Scalar>::operator* (const MatrixBase<Derived>& v) const
-{
-    // Note that this algorithm comes from the optimization by hand
-    // of the conversion to a Matrix followed by a Matrix/Vector product.
-    // It appears to be much faster than the common algorithm found
-    // in the litterature (30 versus 39 flops). It also requires two
-    // Vector3 as temporaries.
-    Vector3 uv;
-    uv = 2 * this->vec().cross(v);
-    return v + this->w() * uv + this->vec().cross(uv);
-}
-
-template<typename Scalar>
-inline Quaternion<Scalar>& Quaternion<Scalar>::operator=(const Quaternion& other)
-{
-  m_coeffs = other.m_coeffs;
-  return *this;
-}
-
-/** Set \c *this from an angle-axis \a aa and returns a reference to \c *this
-  */
-template<typename Scalar>
-inline Quaternion<Scalar>& Quaternion<Scalar>::operator=(const AngleAxisType& aa)
-{
-  Scalar ha = Scalar(0.5)*aa.angle(); // Scalar(0.5) to suppress precision loss warnings
-  this->w() = ei_cos(ha);
-  this->vec() = ei_sin(ha) * aa.axis();
-  return *this;
-}
-
-/** Set \c *this from the expression \a xpr:
-  *   - if \a xpr is a 4x1 vector, then \a xpr is assumed to be a quaternion
-  *   - if \a xpr is a 3x3 matrix, then \a xpr is assumed to be rotation matrix
-  *     and \a xpr is converted to a quaternion
-  */
-template<typename Scalar>
-template<typename Derived>
-inline Quaternion<Scalar>& Quaternion<Scalar>::operator=(const MatrixBase<Derived>& xpr)
-{
-  ei_quaternion_assign_impl<Derived>::run(*this, xpr.derived());
-  return *this;
-}
-
-/** Convert the quaternion to a 3x3 rotation matrix */
-template<typename Scalar>
-inline typename Quaternion<Scalar>::Matrix3
-Quaternion<Scalar>::toRotationMatrix(void) const
-{
-  // NOTE if inlined, then gcc 4.2 and 4.4 get rid of the temporary (not gcc 4.3 !!)
-  // if not inlined then the cost of the return by value is huge ~ +35%,
-  // however, not inlining this function is an order of magnitude slower, so
-  // it has to be inlined, and so the return by value is not an issue
-  Matrix3 res;
-
-  const Scalar tx  = Scalar(2)*this->x();
-  const Scalar ty  = Scalar(2)*this->y();
-  const Scalar tz  = Scalar(2)*this->z();
-  const Scalar twx = tx*this->w();
-  const Scalar twy = ty*this->w();
-  const Scalar twz = tz*this->w();
-  const Scalar txx = tx*this->x();
-  const Scalar txy = ty*this->x();
-  const Scalar txz = tz*this->x();
-  const Scalar tyy = ty*this->y();
-  const Scalar tyz = tz*this->y();
-  const Scalar tzz = tz*this->z();
-
-  res.coeffRef(0,0) = Scalar(1)-(tyy+tzz);
-  res.coeffRef(0,1) = txy-twz;
-  res.coeffRef(0,2) = txz+twy;
-  res.coeffRef(1,0) = txy+twz;
-  res.coeffRef(1,1) = Scalar(1)-(txx+tzz);
-  res.coeffRef(1,2) = tyz-twx;
-  res.coeffRef(2,0) = txz-twy;
-  res.coeffRef(2,1) = tyz+twx;
-  res.coeffRef(2,2) = Scalar(1)-(txx+tyy);
-
-  return res;
-}
-
-/** Sets *this to be a quaternion representing a rotation sending the vector \a a to the vector \a b.
-  *
-  * \returns a reference to *this.
-  *
-  * Note that the two input vectors do \b not have to be normalized.
-  */
-template<typename Scalar>
-template<typename Derived1, typename Derived2>
-inline Quaternion<Scalar>& Quaternion<Scalar>::setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
-{
-  Vector3 v0 = a.normalized();
-  Vector3 v1 = b.normalized();
-  Scalar c = v0.eigen2_dot(v1);
-
-  // if dot == 1, vectors are the same
-  if (ei_isApprox(c,Scalar(1)))
-  {
-    // set to identity
-    this->w() = 1; this->vec().setZero();
-    return *this;
-  }
-  // if dot == -1, vectors are opposites
-  if (ei_isApprox(c,Scalar(-1)))
-  {
-    this->vec() = v0.unitOrthogonal();
-    this->w() = 0;
-    return *this;
-  }
-
-  Vector3 axis = v0.cross(v1);
-  Scalar s = ei_sqrt((Scalar(1)+c)*Scalar(2));
-  Scalar invs = Scalar(1)/s;
-  this->vec() = axis * invs;
-  this->w() = s * Scalar(0.5);
-
-  return *this;
-}
-
-/** \returns the multiplicative inverse of \c *this
-  * Note that in most cases, i.e., if you simply want the opposite rotation,
-  * and/or the quaternion is normalized, then it is enough to use the conjugate.
-  *
-  * \sa Quaternion::conjugate()
-  */
-template <typename Scalar>
-inline Quaternion<Scalar> Quaternion<Scalar>::inverse() const
-{
-  // FIXME should this function be called multiplicativeInverse and conjugate() be called inverse() or opposite()  ??
-  Scalar n2 = this->squaredNorm();
-  if (n2 > 0)
-    return Quaternion(conjugate().coeffs() / n2);
-  else
-  {
-    // return an invalid result to flag the error
-    return Quaternion(Coefficients::Zero());
-  }
-}
-
-/** \returns the conjugate of the \c *this which is equal to the multiplicative inverse
-  * if the quaternion is normalized.
-  * The conjugate of a quaternion represents the opposite rotation.
-  *
-  * \sa Quaternion::inverse()
-  */
-template <typename Scalar>
-inline Quaternion<Scalar> Quaternion<Scalar>::conjugate() const
-{
-  return Quaternion(this->w(),-this->x(),-this->y(),-this->z());
-}
-
-/** \returns the angle (in radian) between two rotations
-  * \sa eigen2_dot()
-  */
-template <typename Scalar>
-inline Scalar Quaternion<Scalar>::angularDistance(const Quaternion& other) const
-{
-  double d = ei_abs(this->eigen2_dot(other));
-  if (d>=1.0)
-    return 0;
-  return Scalar(2) * std::acos(d);
-}
-
-/** \returns the spherical linear interpolation between the two quaternions
-  * \c *this and \a other at the parameter \a t
-  */
-template <typename Scalar>
-Quaternion<Scalar> Quaternion<Scalar>::slerp(Scalar t, const Quaternion& other) const
-{
-  static const Scalar one = Scalar(1) - machine_epsilon<Scalar>();
-  Scalar d = this->eigen2_dot(other);
-  Scalar absD = ei_abs(d);
-
-  Scalar scale0;
-  Scalar scale1;
-
-  if (absD>=one)
-  {
-    scale0 = Scalar(1) - t;
-    scale1 = t;
-  }
-  else
-  {
-    // theta is the angle between the 2 quaternions
-    Scalar theta = std::acos(absD);
-    Scalar sinTheta = ei_sin(theta);
-
-    scale0 = ei_sin( ( Scalar(1) - t ) * theta) / sinTheta;
-    scale1 = ei_sin( ( t * theta) ) / sinTheta;
-    if (d<0)
-      scale1 = -scale1;
-  }
-
-  return Quaternion<Scalar>(scale0 * coeffs() + scale1 * other.coeffs());
-}
-
-// set from a rotation matrix
-template<typename Other>
-struct ei_quaternion_assign_impl<Other,3,3>
-{
-  typedef typename Other::Scalar Scalar;
-  static inline void run(Quaternion<Scalar>& q, const Other& mat)
-  {
-    // This algorithm comes from  "Quaternion Calculus and Fast Animation",
-    // Ken Shoemake, 1987 SIGGRAPH course notes
-    Scalar t = mat.trace();
-    if (t > 0)
-    {
-      t = ei_sqrt(t + Scalar(1.0));
-      q.w() = Scalar(0.5)*t;
-      t = Scalar(0.5)/t;
-      q.x() = (mat.coeff(2,1) - mat.coeff(1,2)) * t;
-      q.y() = (mat.coeff(0,2) - mat.coeff(2,0)) * t;
-      q.z() = (mat.coeff(1,0) - mat.coeff(0,1)) * t;
-    }
-    else
-    {
-      int i = 0;
-      if (mat.coeff(1,1) > mat.coeff(0,0))
-        i = 1;
-      if (mat.coeff(2,2) > mat.coeff(i,i))
-        i = 2;
-      int j = (i+1)%3;
-      int k = (j+1)%3;
-
-      t = ei_sqrt(mat.coeff(i,i)-mat.coeff(j,j)-mat.coeff(k,k) + Scalar(1.0));
-      q.coeffs().coeffRef(i) = Scalar(0.5) * t;
-      t = Scalar(0.5)/t;
-      q.w() = (mat.coeff(k,j)-mat.coeff(j,k))*t;
-      q.coeffs().coeffRef(j) = (mat.coeff(j,i)+mat.coeff(i,j))*t;
-      q.coeffs().coeffRef(k) = (mat.coeff(k,i)+mat.coeff(i,k))*t;
-    }
-  }
-};
-
-// set from a vector of coefficients assumed to be a quaternion
-template<typename Other>
-struct ei_quaternion_assign_impl<Other,4,1>
-{
-  typedef typename Other::Scalar Scalar;
-  static inline void run(Quaternion<Scalar>& q, const Other& vec)
-  {
-    q.coeffs() = vec;
-  }
-};
-
-} // end namespace Eigen
diff --git a/inst/include/Eigen/src/Eigen2Support/Geometry/Rotation2D.h b/inst/include/Eigen/src/Eigen2Support/Geometry/Rotation2D.h
deleted file mode 100644
index 19b8582a..00000000
--- a/inst/include/Eigen/src/Eigen2Support/Geometry/Rotation2D.h
+++ /dev/null
@@ -1,145 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Rotation2D
-  *
-  * \brief Represents a rotation/orientation in a 2 dimensional space.
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  *
-  * This class is equivalent to a single scalar representing a counter clock wise rotation
-  * as a single angle in radian. It provides some additional features such as the automatic
-  * conversion from/to a 2x2 rotation matrix. Moreover this class aims to provide a similar
-  * interface to Quaternion in order to facilitate the writing of generic algorithms
-  * dealing with rotations.
-  *
-  * \sa class Quaternion, class Transform
-  */
-template<typename _Scalar> struct ei_traits<Rotation2D<_Scalar> >
-{
-  typedef _Scalar Scalar;
-};
-
-template<typename _Scalar>
-class Rotation2D : public RotationBase<Rotation2D<_Scalar>,2>
-{
-  typedef RotationBase<Rotation2D<_Scalar>,2> Base;
-
-public:
-
-  using Base::operator*;
-
-  enum { Dim = 2 };
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-  typedef Matrix<Scalar,2,1> Vector2;
-  typedef Matrix<Scalar,2,2> Matrix2;
-
-protected:
-
-  Scalar m_angle;
-
-public:
-
-  /** Construct a 2D counter clock wise rotation from the angle \a a in radian. */
-  inline Rotation2D(Scalar a) : m_angle(a) {}
-
-  /** \returns the rotation angle */
-  inline Scalar angle() const { return m_angle; }
-
-  /** \returns a read-write reference to the rotation angle */
-  inline Scalar& angle() { return m_angle; }
-
-  /** \returns the inverse rotation */
-  inline Rotation2D inverse() const { return -m_angle; }
-
-  /** Concatenates two rotations */
-  inline Rotation2D operator*(const Rotation2D& other) const
-  { return m_angle + other.m_angle; }
-
-  /** Concatenates two rotations */
-  inline Rotation2D& operator*=(const Rotation2D& other)
-  { return m_angle += other.m_angle; return *this; }
-
-  /** Applies the rotation to a 2D vector */
-  Vector2 operator* (const Vector2& vec) const
-  { return toRotationMatrix() * vec; }
-
-  template<typename Derived>
-  Rotation2D& fromRotationMatrix(const MatrixBase<Derived>& m);
-  Matrix2 toRotationMatrix(void) const;
-
-  /** \returns the spherical interpolation between \c *this and \a other using
-    * parameter \a t. It is in fact equivalent to a linear interpolation.
-    */
-  inline Rotation2D slerp(Scalar t, const Rotation2D& other) const
-  { return m_angle * (1-t) + other.angle() * t; }
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Rotation2D,Rotation2D<NewScalarType> >::type cast() const
-  { return typename internal::cast_return_type<Rotation2D,Rotation2D<NewScalarType> >::type(*this); }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Rotation2D(const Rotation2D<OtherScalarType>& other)
-  {
-    m_angle = Scalar(other.angle());
-  }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Rotation2D& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return ei_isApprox(m_angle,other.m_angle, prec); }
-};
-
-/** \ingroup Geometry_Module
-  * single precision 2D rotation type */
-typedef Rotation2D<float> Rotation2Df;
-/** \ingroup Geometry_Module
-  * double precision 2D rotation type */
-typedef Rotation2D<double> Rotation2Dd;
-
-/** Set \c *this from a 2x2 rotation matrix \a mat.
-  * In other words, this function extract the rotation angle
-  * from the rotation matrix.
-  */
-template<typename Scalar>
-template<typename Derived>
-Rotation2D<Scalar>& Rotation2D<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat)
-{
-  EIGEN_STATIC_ASSERT(Derived::RowsAtCompileTime==2 && Derived::ColsAtCompileTime==2,YOU_MADE_A_PROGRAMMING_MISTAKE)
-  m_angle = ei_atan2(mat.coeff(1,0), mat.coeff(0,0));
-  return *this;
-}
-
-/** Constructs and \returns an equivalent 2x2 rotation matrix.
-  */
-template<typename Scalar>
-typename Rotation2D<Scalar>::Matrix2
-Rotation2D<Scalar>::toRotationMatrix(void) const
-{
-  Scalar sinA = ei_sin(m_angle);
-  Scalar cosA = ei_cos(m_angle);
-  return (Matrix2() << cosA, -sinA, sinA, cosA).finished();
-}
-
-} // end namespace Eigen
diff --git a/inst/include/Eigen/src/Eigen2Support/Geometry/RotationBase.h b/inst/include/Eigen/src/Eigen2Support/Geometry/RotationBase.h
deleted file mode 100644
index b1c8f38d..00000000
--- a/inst/include/Eigen/src/Eigen2Support/Geometry/RotationBase.h
+++ /dev/null
@@ -1,123 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-// this file aims to contains the various representations of rotation/orientation
-// in 2D and 3D space excepted Matrix and Quaternion.
-
-/** \class RotationBase
-  *
-  * \brief Common base class for compact rotation representations
-  *
-  * \param Derived is the derived type, i.e., a rotation type
-  * \param _Dim the dimension of the space
-  */
-template<typename Derived, int _Dim>
-class RotationBase
-{
-  public:
-    enum { Dim = _Dim };
-    /** the scalar type of the coefficients */
-    typedef typename ei_traits<Derived>::Scalar Scalar;
-    
-    /** corresponding linear transformation matrix type */
-    typedef Matrix<Scalar,Dim,Dim> RotationMatrixType;
-
-    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
-    inline Derived& derived() { return *static_cast<Derived*>(this); }
-
-    /** \returns an equivalent rotation matrix */
-    inline RotationMatrixType toRotationMatrix() const { return derived().toRotationMatrix(); }
-
-    /** \returns the inverse rotation */
-    inline Derived inverse() const { return derived().inverse(); }
-
-    /** \returns the concatenation of the rotation \c *this with a translation \a t */
-    inline Transform<Scalar,Dim> operator*(const Translation<Scalar,Dim>& t) const
-    { return toRotationMatrix() * t; }
-
-    /** \returns the concatenation of the rotation \c *this with a scaling \a s */
-    inline RotationMatrixType operator*(const Scaling<Scalar,Dim>& s) const
-    { return toRotationMatrix() * s; }
-
-    /** \returns the concatenation of the rotation \c *this with an affine transformation \a t */
-    inline Transform<Scalar,Dim> operator*(const Transform<Scalar,Dim>& t) const
-    { return toRotationMatrix() * t; }
-};
-
-/** \geometry_module
-  *
-  * Constructs a Dim x Dim rotation matrix from the rotation \a r
-  */
-template<typename _Scalar, int _Rows, int _Cols, int _Storage, int _MaxRows, int _MaxCols>
-template<typename OtherDerived>
-Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>
-::Matrix(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
-{
-  EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Matrix,int(OtherDerived::Dim),int(OtherDerived::Dim))
-  *this = r.toRotationMatrix();
-}
-
-/** \geometry_module
-  *
-  * Set a Dim x Dim rotation matrix from the rotation \a r
-  */
-template<typename _Scalar, int _Rows, int _Cols, int _Storage, int _MaxRows, int _MaxCols>
-template<typename OtherDerived>
-Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>&
-Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>
-::operator=(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
-{
-  EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Matrix,int(OtherDerived::Dim),int(OtherDerived::Dim))
-  return *this = r.toRotationMatrix();
-}
-
-/** \internal
-  *
-  * Helper function to return an arbitrary rotation object to a rotation matrix.
-  *
-  * \param Scalar the numeric type of the matrix coefficients
-  * \param Dim the dimension of the current space
-  *
-  * It returns a Dim x Dim fixed size matrix.
-  *
-  * Default specializations are provided for:
-  *   - any scalar type (2D),
-  *   - any matrix expression,
-  *   - any type based on RotationBase (e.g., Quaternion, AngleAxis, Rotation2D)
-  *
-  * Currently ei_toRotationMatrix is only used by Transform.
-  *
-  * \sa class Transform, class Rotation2D, class Quaternion, class AngleAxis
-  */
-template<typename Scalar, int Dim>
-static inline Matrix<Scalar,2,2> ei_toRotationMatrix(const Scalar& s)
-{
-  EIGEN_STATIC_ASSERT(Dim==2,YOU_MADE_A_PROGRAMMING_MISTAKE)
-  return Rotation2D<Scalar>(s).toRotationMatrix();
-}
-
-template<typename Scalar, int Dim, typename OtherDerived>
-static inline Matrix<Scalar,Dim,Dim> ei_toRotationMatrix(const RotationBase<OtherDerived,Dim>& r)
-{
-  return r.toRotationMatrix();
-}
-
-template<typename Scalar, int Dim, typename OtherDerived>
-static inline const MatrixBase<OtherDerived>& ei_toRotationMatrix(const MatrixBase<OtherDerived>& mat)
-{
-  EIGEN_STATIC_ASSERT(OtherDerived::RowsAtCompileTime==Dim && OtherDerived::ColsAtCompileTime==Dim,
-    YOU_MADE_A_PROGRAMMING_MISTAKE)
-  return mat;
-}
-
-} // end namespace Eigen
diff --git a/inst/include/Eigen/src/Eigen2Support/Geometry/Scaling.h b/inst/include/Eigen/src/Eigen2Support/Geometry/Scaling.h
deleted file mode 100644
index b8fa6cd3..00000000
--- a/inst/include/Eigen/src/Eigen2Support/Geometry/Scaling.h
+++ /dev/null
@@ -1,167 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Scaling
-  *
-  * \brief Represents a possibly non uniform scaling transformation
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients.
-  * \param _Dim the  dimension of the space, can be a compile time value or Dynamic
-  *
-  * \note This class is not aimed to be used to store a scaling transformation,
-  * but rather to make easier the constructions and updates of Transform objects.
-  *
-  * \sa class Translation, class Transform
-  */
-template<typename _Scalar, int _Dim>
-class Scaling
-{
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Dim)
-  /** dimension of the space */
-  enum { Dim = _Dim };
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-  /** corresponding vector type */
-  typedef Matrix<Scalar,Dim,1> VectorType;
-  /** corresponding linear transformation matrix type */
-  typedef Matrix<Scalar,Dim,Dim> LinearMatrixType;
-  /** corresponding translation type */
-  typedef Translation<Scalar,Dim> TranslationType;
-  /** corresponding affine transformation type */
-  typedef Transform<Scalar,Dim> TransformType;
-
-protected:
-
-  VectorType m_coeffs;
-
-public:
-
-  /** Default constructor without initialization. */
-  Scaling() {}
-  /** Constructs and initialize a uniform scaling transformation */
-  explicit inline Scaling(const Scalar& s) { m_coeffs.setConstant(s); }
-  /** 2D only */
-  inline Scaling(const Scalar& sx, const Scalar& sy)
-  {
-    ei_assert(Dim==2);
-    m_coeffs.x() = sx;
-    m_coeffs.y() = sy;
-  }
-  /** 3D only */
-  inline Scaling(const Scalar& sx, const Scalar& sy, const Scalar& sz)
-  {
-    ei_assert(Dim==3);
-    m_coeffs.x() = sx;
-    m_coeffs.y() = sy;
-    m_coeffs.z() = sz;
-  }
-  /** Constructs and initialize the scaling transformation from a vector of scaling coefficients */
-  explicit inline Scaling(const VectorType& coeffs) : m_coeffs(coeffs) {}
-
-  const VectorType& coeffs() const { return m_coeffs; }
-  VectorType& coeffs() { return m_coeffs; }
-
-  /** Concatenates two scaling */
-  inline Scaling operator* (const Scaling& other) const
-  { return Scaling(coeffs().cwise() * other.coeffs()); }
-
-  /** Concatenates a scaling and a translation */
-  inline TransformType operator* (const TranslationType& t) const;
-
-  /** Concatenates a scaling and an affine transformation */
-  inline TransformType operator* (const TransformType& t) const;
-
-  /** Concatenates a scaling and a linear transformation matrix */
-  // TODO returns an expression
-  inline LinearMatrixType operator* (const LinearMatrixType& other) const
-  { return coeffs().asDiagonal() * other; }
-
-  /** Concatenates a linear transformation matrix and a scaling */
-  // TODO returns an expression
-  friend inline LinearMatrixType operator* (const LinearMatrixType& other, const Scaling& s)
-  { return other * s.coeffs().asDiagonal(); }
-
-  template<typename Derived>
-  inline LinearMatrixType operator*(const RotationBase<Derived,Dim>& r) const
-  { return *this * r.toRotationMatrix(); }
-
-  /** Applies scaling to vector */
-  inline VectorType operator* (const VectorType& other) const
-  { return coeffs().asDiagonal() * other; }
-
-  /** \returns the inverse scaling */
-  inline Scaling inverse() const
-  { return Scaling(coeffs().cwise().inverse()); }
-
-  inline Scaling& operator=(const Scaling& other)
-  {
-    m_coeffs = other.m_coeffs;
-    return *this;
-  }
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Scaling,Scaling<NewScalarType,Dim> >::type cast() const
-  { return typename internal::cast_return_type<Scaling,Scaling<NewScalarType,Dim> >::type(*this); }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Scaling(const Scaling<OtherScalarType,Dim>& other)
-  { m_coeffs = other.coeffs().template cast<Scalar>(); }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Scaling& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_coeffs.isApprox(other.m_coeffs, prec); }
-
-};
-
-/** \addtogroup Geometry_Module */
-//@{
-typedef Scaling<float, 2> Scaling2f;
-typedef Scaling<double,2> Scaling2d;
-typedef Scaling<float, 3> Scaling3f;
-typedef Scaling<double,3> Scaling3d;
-//@}
-
-template<typename Scalar, int Dim>
-inline typename Scaling<Scalar,Dim>::TransformType
-Scaling<Scalar,Dim>::operator* (const TranslationType& t) const
-{
-  TransformType res;
-  res.matrix().setZero();
-  res.linear().diagonal() = coeffs();
-  res.translation() = m_coeffs.cwise() * t.vector();
-  res(Dim,Dim) = Scalar(1);
-  return res;
-}
-
-template<typename Scalar, int Dim>
-inline typename Scaling<Scalar,Dim>::TransformType
-Scaling<Scalar,Dim>::operator* (const TransformType& t) const
-{
-  TransformType res = t;
-  res.prescale(m_coeffs);
-  return res;
-}
-
-} // end namespace Eigen
diff --git a/inst/include/Eigen/src/Eigen2Support/Geometry/Transform.h b/inst/include/Eigen/src/Eigen2Support/Geometry/Transform.h
deleted file mode 100644
index fab60b25..00000000
--- a/inst/include/Eigen/src/Eigen2Support/Geometry/Transform.h
+++ /dev/null
@@ -1,786 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-// Note that we have to pass Dim and HDim because it is not allowed to use a template
-// parameter to define a template specialization. To be more precise, in the following
-// specializations, it is not allowed to use Dim+1 instead of HDim.
-template< typename Other,
-          int Dim,
-          int HDim,
-          int OtherRows=Other::RowsAtCompileTime,
-          int OtherCols=Other::ColsAtCompileTime>
-struct ei_transform_product_impl;
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Transform
-  *
-  * \brief Represents an homogeneous transformation in a N dimensional space
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  * \param _Dim the dimension of the space
-  *
-  * The homography is internally represented and stored as a (Dim+1)^2 matrix which
-  * is available through the matrix() method.
-  *
-  * Conversion methods from/to Qt's QMatrix and QTransform are available if the
-  * preprocessor token EIGEN_QT_SUPPORT is defined.
-  *
-  * \sa class Matrix, class Quaternion
-  */
-template<typename _Scalar, int _Dim>
-class Transform
-{
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Dim==Dynamic ? Dynamic : (_Dim+1)*(_Dim+1))
-  enum {
-    Dim = _Dim,     ///< space dimension in which the transformation holds
-    HDim = _Dim+1   ///< size of a respective homogeneous vector
-  };
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-  /** type of the matrix used to represent the transformation */
-  typedef Matrix<Scalar,HDim,HDim> MatrixType;
-  /** type of the matrix used to represent the linear part of the transformation */
-  typedef Matrix<Scalar,Dim,Dim> LinearMatrixType;
-  /** type of read/write reference to the linear part of the transformation */
-  typedef Block<MatrixType,Dim,Dim> LinearPart;
-  /** type of read/write reference to the linear part of the transformation */
-  typedef const Block<const MatrixType,Dim,Dim> ConstLinearPart;
-  /** type of a vector */
-  typedef Matrix<Scalar,Dim,1> VectorType;
-  /** type of a read/write reference to the translation part of the rotation */
-  typedef Block<MatrixType,Dim,1> TranslationPart;
-  /** type of a read/write reference to the translation part of the rotation */
-  typedef const Block<const MatrixType,Dim,1> ConstTranslationPart;
-  /** corresponding translation type */
-  typedef Translation<Scalar,Dim> TranslationType;
-  /** corresponding scaling transformation type */
-  typedef Scaling<Scalar,Dim> ScalingType;
-
-protected:
-
-  MatrixType m_matrix;
-
-public:
-
-  /** Default constructor without initialization of the coefficients. */
-  inline Transform() { }
-
-  inline Transform(const Transform& other)
-  {
-    m_matrix = other.m_matrix;
-  }
-
-  inline explicit Transform(const TranslationType& t) { *this = t; }
-  inline explicit Transform(const ScalingType& s) { *this = s; }
-  template<typename Derived>
-  inline explicit Transform(const RotationBase<Derived, Dim>& r) { *this = r; }
-
-  inline Transform& operator=(const Transform& other)
-  { m_matrix = other.m_matrix; return *this; }
-
-  template<typename OtherDerived, bool BigMatrix> // MSVC 2005 will commit suicide if BigMatrix has a default value
-  struct construct_from_matrix
-  {
-    static inline void run(Transform *transform, const MatrixBase<OtherDerived>& other)
-    {
-      transform->matrix() = other;
-    }
-  };
-
-  template<typename OtherDerived> struct construct_from_matrix<OtherDerived, true>
-  {
-    static inline void run(Transform *transform, const MatrixBase<OtherDerived>& other)
-    {
-      transform->linear() = other;
-      transform->translation().setZero();
-      transform->matrix()(Dim,Dim) = Scalar(1);
-      transform->matrix().template block<1,Dim>(Dim,0).setZero();
-    }
-  };
-
-  /** Constructs and initializes a transformation from a Dim^2 or a (Dim+1)^2 matrix. */
-  template<typename OtherDerived>
-  inline explicit Transform(const MatrixBase<OtherDerived>& other)
-  {
-    construct_from_matrix<OtherDerived, int(OtherDerived::RowsAtCompileTime) == Dim>::run(this, other);
-  }
-
-  /** Set \c *this from a (Dim+1)^2 matrix. */
-  template<typename OtherDerived>
-  inline Transform& operator=(const MatrixBase<OtherDerived>& other)
-  { m_matrix = other; return *this; }
-
-  #ifdef EIGEN_QT_SUPPORT
-  inline Transform(const QMatrix& other);
-  inline Transform& operator=(const QMatrix& other);
-  inline QMatrix toQMatrix(void) const;
-  inline Transform(const QTransform& other);
-  inline Transform& operator=(const QTransform& other);
-  inline QTransform toQTransform(void) const;
-  #endif
-
-  /** shortcut for m_matrix(row,col);
-    * \sa MatrixBase::operaror(int,int) const */
-  inline Scalar operator() (int row, int col) const { return m_matrix(row,col); }
-  /** shortcut for m_matrix(row,col);
-    * \sa MatrixBase::operaror(int,int) */
-  inline Scalar& operator() (int row, int col) { return m_matrix(row,col); }
-
-  /** \returns a read-only expression of the transformation matrix */
-  inline const MatrixType& matrix() const { return m_matrix; }
-  /** \returns a writable expression of the transformation matrix */
-  inline MatrixType& matrix() { return m_matrix; }
-
-  /** \returns a read-only expression of the linear (linear) part of the transformation */
-  inline ConstLinearPart linear() const { return m_matrix.template block<Dim,Dim>(0,0); }
-  /** \returns a writable expression of the linear (linear) part of the transformation */
-  inline LinearPart linear() { return m_matrix.template block<Dim,Dim>(0,0); }
-
-  /** \returns a read-only expression of the translation vector of the transformation */
-  inline ConstTranslationPart translation() const { return m_matrix.template block<Dim,1>(0,Dim); }
-  /** \returns a writable expression of the translation vector of the transformation */
-  inline TranslationPart translation() { return m_matrix.template block<Dim,1>(0,Dim); }
-
-  /** \returns an expression of the product between the transform \c *this and a matrix expression \a other
-  *
-  * The right hand side \a other might be either:
-  * \li a vector of size Dim,
-  * \li an homogeneous vector of size Dim+1,
-  * \li a transformation matrix of size Dim+1 x Dim+1.
-  */
-  // note: this function is defined here because some compilers cannot find the respective declaration
-  template<typename OtherDerived>
-  inline const typename ei_transform_product_impl<OtherDerived,_Dim,_Dim+1>::ResultType
-  operator * (const MatrixBase<OtherDerived> &other) const
-  { return ei_transform_product_impl<OtherDerived,Dim,HDim>::run(*this,other.derived()); }
-
-  /** \returns the product expression of a transformation matrix \a a times a transform \a b
-    * The transformation matrix \a a must have a Dim+1 x Dim+1 sizes. */
-  template<typename OtherDerived>
-  friend inline const typename ProductReturnType<OtherDerived,MatrixType>::Type
-  operator * (const MatrixBase<OtherDerived> &a, const Transform &b)
-  { return a.derived() * b.matrix(); }
-
-  /** Contatenates two transformations */
-  inline const Transform
-  operator * (const Transform& other) const
-  { return Transform(m_matrix * other.matrix()); }
-
-  /** \sa MatrixBase::setIdentity() */
-  void setIdentity() { m_matrix.setIdentity(); }
-  static const typename MatrixType::IdentityReturnType Identity()
-  {
-    return MatrixType::Identity();
-  }
-
-  template<typename OtherDerived>
-  inline Transform& scale(const MatrixBase<OtherDerived> &other);
-
-  template<typename OtherDerived>
-  inline Transform& prescale(const MatrixBase<OtherDerived> &other);
-
-  inline Transform& scale(Scalar s);
-  inline Transform& prescale(Scalar s);
-
-  template<typename OtherDerived>
-  inline Transform& translate(const MatrixBase<OtherDerived> &other);
-
-  template<typename OtherDerived>
-  inline Transform& pretranslate(const MatrixBase<OtherDerived> &other);
-
-  template<typename RotationType>
-  inline Transform& rotate(const RotationType& rotation);
-
-  template<typename RotationType>
-  inline Transform& prerotate(const RotationType& rotation);
-
-  Transform& shear(Scalar sx, Scalar sy);
-  Transform& preshear(Scalar sx, Scalar sy);
-
-  inline Transform& operator=(const TranslationType& t);
-  inline Transform& operator*=(const TranslationType& t) { return translate(t.vector()); }
-  inline Transform operator*(const TranslationType& t) const;
-
-  inline Transform& operator=(const ScalingType& t);
-  inline Transform& operator*=(const ScalingType& s) { return scale(s.coeffs()); }
-  inline Transform operator*(const ScalingType& s) const;
-  friend inline Transform operator*(const LinearMatrixType& mat, const Transform& t)
-  {
-    Transform res = t;
-    res.matrix().row(Dim) = t.matrix().row(Dim);
-    res.matrix().template block<Dim,HDim>(0,0) = (mat * t.matrix().template block<Dim,HDim>(0,0)).lazy();
-    return res;
-  }
-
-  template<typename Derived>
-  inline Transform& operator=(const RotationBase<Derived,Dim>& r);
-  template<typename Derived>
-  inline Transform& operator*=(const RotationBase<Derived,Dim>& r) { return rotate(r.toRotationMatrix()); }
-  template<typename Derived>
-  inline Transform operator*(const RotationBase<Derived,Dim>& r) const;
-
-  LinearMatrixType rotation() const;
-  template<typename RotationMatrixType, typename ScalingMatrixType>
-  void computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const;
-  template<typename ScalingMatrixType, typename RotationMatrixType>
-  void computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const;
-
-  template<typename PositionDerived, typename OrientationType, typename ScaleDerived>
-  Transform& fromPositionOrientationScale(const MatrixBase<PositionDerived> &position,
-    const OrientationType& orientation, const MatrixBase<ScaleDerived> &scale);
-
-  inline const MatrixType inverse(TransformTraits traits = Affine) const;
-
-  /** \returns a const pointer to the column major internal matrix */
-  const Scalar* data() const { return m_matrix.data(); }
-  /** \returns a non-const pointer to the column major internal matrix */
-  Scalar* data() { return m_matrix.data(); }
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Transform,Transform<NewScalarType,Dim> >::type cast() const
-  { return typename internal::cast_return_type<Transform,Transform<NewScalarType,Dim> >::type(*this); }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Transform(const Transform<OtherScalarType,Dim>& other)
-  { m_matrix = other.matrix().template cast<Scalar>(); }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Transform& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_matrix.isApprox(other.m_matrix, prec); }
-
-  #ifdef EIGEN_TRANSFORM_PLUGIN
-  #include EIGEN_TRANSFORM_PLUGIN
-  #endif
-
-protected:
-
-};
-
-/** \ingroup Geometry_Module */
-typedef Transform<float,2> Transform2f;
-/** \ingroup Geometry_Module */
-typedef Transform<float,3> Transform3f;
-/** \ingroup Geometry_Module */
-typedef Transform<double,2> Transform2d;
-/** \ingroup Geometry_Module */
-typedef Transform<double,3> Transform3d;
-
-/**************************
-*** Optional QT support ***
-**************************/
-
-#ifdef EIGEN_QT_SUPPORT
-/** Initialises \c *this from a QMatrix assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim>
-Transform<Scalar,Dim>::Transform(const QMatrix& other)
-{
-  *this = other;
-}
-
-/** Set \c *this from a QMatrix assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim>
-Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const QMatrix& other)
-{
-  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  m_matrix << other.m11(), other.m21(), other.dx(),
-              other.m12(), other.m22(), other.dy(),
-              0, 0, 1;
-   return *this;
-}
-
-/** \returns a QMatrix from \c *this assuming the dimension is 2.
-  *
-  * \warning this convertion might loss data if \c *this is not affine
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim>
-QMatrix Transform<Scalar,Dim>::toQMatrix(void) const
-{
-  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  return QMatrix(m_matrix.coeff(0,0), m_matrix.coeff(1,0),
-                 m_matrix.coeff(0,1), m_matrix.coeff(1,1),
-                 m_matrix.coeff(0,2), m_matrix.coeff(1,2));
-}
-
-/** Initialises \c *this from a QTransform assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim>
-Transform<Scalar,Dim>::Transform(const QTransform& other)
-{
-  *this = other;
-}
-
-/** Set \c *this from a QTransform assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim>
-Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const QTransform& other)
-{
-  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  m_matrix << other.m11(), other.m21(), other.dx(),
-              other.m12(), other.m22(), other.dy(),
-              other.m13(), other.m23(), other.m33();
-   return *this;
-}
-
-/** \returns a QTransform from \c *this assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim>
-QTransform Transform<Scalar,Dim>::toQTransform(void) const
-{
-  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  return QTransform(m_matrix.coeff(0,0), m_matrix.coeff(1,0), m_matrix.coeff(2,0),
-                    m_matrix.coeff(0,1), m_matrix.coeff(1,1), m_matrix.coeff(2,1),
-                    m_matrix.coeff(0,2), m_matrix.coeff(1,2), m_matrix.coeff(2,2));
-}
-#endif
-
-/*********************
-*** Procedural API ***
-*********************/
-
-/** Applies on the right the non uniform scale transformation represented
-  * by the vector \a other to \c *this and returns a reference to \c *this.
-  * \sa prescale()
-  */
-template<typename Scalar, int Dim>
-template<typename OtherDerived>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::scale(const MatrixBase<OtherDerived> &other)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
-  linear() = (linear() * other.asDiagonal()).lazy();
-  return *this;
-}
-
-/** Applies on the right a uniform scale of a factor \a c to \c *this
-  * and returns a reference to \c *this.
-  * \sa prescale(Scalar)
-  */
-template<typename Scalar, int Dim>
-inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::scale(Scalar s)
-{
-  linear() *= s;
-  return *this;
-}
-
-/** Applies on the left the non uniform scale transformation represented
-  * by the vector \a other to \c *this and returns a reference to \c *this.
-  * \sa scale()
-  */
-template<typename Scalar, int Dim>
-template<typename OtherDerived>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::prescale(const MatrixBase<OtherDerived> &other)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
-  m_matrix.template block<Dim,HDim>(0,0) = (other.asDiagonal() * m_matrix.template block<Dim,HDim>(0,0)).lazy();
-  return *this;
-}
-
-/** Applies on the left a uniform scale of a factor \a c to \c *this
-  * and returns a reference to \c *this.
-  * \sa scale(Scalar)
-  */
-template<typename Scalar, int Dim>
-inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::prescale(Scalar s)
-{
-  m_matrix.template corner<Dim,HDim>(TopLeft) *= s;
-  return *this;
-}
-
-/** Applies on the right the translation matrix represented by the vector \a other
-  * to \c *this and returns a reference to \c *this.
-  * \sa pretranslate()
-  */
-template<typename Scalar, int Dim>
-template<typename OtherDerived>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::translate(const MatrixBase<OtherDerived> &other)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
-  translation() += linear() * other;
-  return *this;
-}
-
-/** Applies on the left the translation matrix represented by the vector \a other
-  * to \c *this and returns a reference to \c *this.
-  * \sa translate()
-  */
-template<typename Scalar, int Dim>
-template<typename OtherDerived>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::pretranslate(const MatrixBase<OtherDerived> &other)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
-  translation() += other;
-  return *this;
-}
-
-/** Applies on the right the rotation represented by the rotation \a rotation
-  * to \c *this and returns a reference to \c *this.
-  *
-  * The template parameter \a RotationType is the type of the rotation which
-  * must be known by ei_toRotationMatrix<>.
-  *
-  * Natively supported types includes:
-  *   - any scalar (2D),
-  *   - a Dim x Dim matrix expression,
-  *   - a Quaternion (3D),
-  *   - a AngleAxis (3D)
-  *
-  * This mechanism is easily extendable to support user types such as Euler angles,
-  * or a pair of Quaternion for 4D rotations.
-  *
-  * \sa rotate(Scalar), class Quaternion, class AngleAxis, prerotate(RotationType)
-  */
-template<typename Scalar, int Dim>
-template<typename RotationType>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::rotate(const RotationType& rotation)
-{
-  linear() *= ei_toRotationMatrix<Scalar,Dim>(rotation);
-  return *this;
-}
-
-/** Applies on the left the rotation represented by the rotation \a rotation
-  * to \c *this and returns a reference to \c *this.
-  *
-  * See rotate() for further details.
-  *
-  * \sa rotate()
-  */
-template<typename Scalar, int Dim>
-template<typename RotationType>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::prerotate(const RotationType& rotation)
-{
-  m_matrix.template block<Dim,HDim>(0,0) = ei_toRotationMatrix<Scalar,Dim>(rotation)
-                                         * m_matrix.template block<Dim,HDim>(0,0);
-  return *this;
-}
-
-/** Applies on the right the shear transformation represented
-  * by the vector \a other to \c *this and returns a reference to \c *this.
-  * \warning 2D only.
-  * \sa preshear()
-  */
-template<typename Scalar, int Dim>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::shear(Scalar sx, Scalar sy)
-{
-  EIGEN_STATIC_ASSERT(int(Dim)==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  VectorType tmp = linear().col(0)*sy + linear().col(1);
-  linear() << linear().col(0) + linear().col(1)*sx, tmp;
-  return *this;
-}
-
-/** Applies on the left the shear transformation represented
-  * by the vector \a other to \c *this and returns a reference to \c *this.
-  * \warning 2D only.
-  * \sa shear()
-  */
-template<typename Scalar, int Dim>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::preshear(Scalar sx, Scalar sy)
-{
-  EIGEN_STATIC_ASSERT(int(Dim)==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  m_matrix.template block<Dim,HDim>(0,0) = LinearMatrixType(1, sx, sy, 1) * m_matrix.template block<Dim,HDim>(0,0);
-  return *this;
-}
-
-/******************************************************
-*** Scaling, Translation and Rotation compatibility ***
-******************************************************/
-
-template<typename Scalar, int Dim>
-inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const TranslationType& t)
-{
-  linear().setIdentity();
-  translation() = t.vector();
-  m_matrix.template block<1,Dim>(Dim,0).setZero();
-  m_matrix(Dim,Dim) = Scalar(1);
-  return *this;
-}
-
-template<typename Scalar, int Dim>
-inline Transform<Scalar,Dim> Transform<Scalar,Dim>::operator*(const TranslationType& t) const
-{
-  Transform res = *this;
-  res.translate(t.vector());
-  return res;
-}
-
-template<typename Scalar, int Dim>
-inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const ScalingType& s)
-{
-  m_matrix.setZero();
-  linear().diagonal() = s.coeffs();
-  m_matrix.coeffRef(Dim,Dim) = Scalar(1);
-  return *this;
-}
-
-template<typename Scalar, int Dim>
-inline Transform<Scalar,Dim> Transform<Scalar,Dim>::operator*(const ScalingType& s) const
-{
-  Transform res = *this;
-  res.scale(s.coeffs());
-  return res;
-}
-
-template<typename Scalar, int Dim>
-template<typename Derived>
-inline Transform<Scalar,Dim>& Transform<Scalar,Dim>::operator=(const RotationBase<Derived,Dim>& r)
-{
-  linear() = ei_toRotationMatrix<Scalar,Dim>(r);
-  translation().setZero();
-  m_matrix.template block<1,Dim>(Dim,0).setZero();
-  m_matrix.coeffRef(Dim,Dim) = Scalar(1);
-  return *this;
-}
-
-template<typename Scalar, int Dim>
-template<typename Derived>
-inline Transform<Scalar,Dim> Transform<Scalar,Dim>::operator*(const RotationBase<Derived,Dim>& r) const
-{
-  Transform res = *this;
-  res.rotate(r.derived());
-  return res;
-}
-
-/************************
-*** Special functions ***
-************************/
-
-/** \returns the rotation part of the transformation
-  * \nonstableyet
-  *
-  * \svd_module
-  *
-  * \sa computeRotationScaling(), computeScalingRotation(), class SVD
-  */
-template<typename Scalar, int Dim>
-typename Transform<Scalar,Dim>::LinearMatrixType
-Transform<Scalar,Dim>::rotation() const
-{
-  LinearMatrixType result;
-  computeRotationScaling(&result, (LinearMatrixType*)0);
-  return result;
-}
-
-
-/** decomposes the linear part of the transformation as a product rotation x scaling, the scaling being
-  * not necessarily positive.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  * \nonstableyet
-  *
-  * \svd_module
-  *
-  * \sa computeScalingRotation(), rotation(), class SVD
-  */
-template<typename Scalar, int Dim>
-template<typename RotationMatrixType, typename ScalingMatrixType>
-void Transform<Scalar,Dim>::computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const
-{
-  JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU|ComputeFullV);
-  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
-  Matrix<Scalar, Dim, 1> sv(svd.singularValues());
-  sv.coeffRef(0) *= x;
-  if(scaling)
-  {
-    scaling->noalias() = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint();
-  }
-  if(rotation)
-  {
-    LinearMatrixType m(svd.matrixU());
-    m.col(0) /= x;
-    rotation->noalias() = m * svd.matrixV().adjoint();
-  }
-}
-
-/** decomposes the linear part of the transformation as a product rotation x scaling, the scaling being
-  * not necessarily positive.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  * \nonstableyet
-  *
-  * \svd_module
-  *
-  * \sa computeRotationScaling(), rotation(), class SVD
-  */
-template<typename Scalar, int Dim>
-template<typename ScalingMatrixType, typename RotationMatrixType>
-void Transform<Scalar,Dim>::computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const
-{
-  JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU|ComputeFullV);
-  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
-  Matrix<Scalar, Dim, 1> sv(svd.singularValues());
-  sv.coeffRef(0) *= x;
-  if(scaling)
-  {
-    scaling->noalias() = svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint();
-  }
-  if(rotation)
-  {
-    LinearMatrixType m(svd.matrixU());
-    m.col(0) /= x;
-    rotation->noalias() = m * svd.matrixV().adjoint();
-  }
-}
-
-/** Convenient method to set \c *this from a position, orientation and scale
-  * of a 3D object.
-  */
-template<typename Scalar, int Dim>
-template<typename PositionDerived, typename OrientationType, typename ScaleDerived>
-Transform<Scalar,Dim>&
-Transform<Scalar,Dim>::fromPositionOrientationScale(const MatrixBase<PositionDerived> &position,
-  const OrientationType& orientation, const MatrixBase<ScaleDerived> &scale)
-{
-  linear() = ei_toRotationMatrix<Scalar,Dim>(orientation);
-  linear() *= scale.asDiagonal();
-  translation() = position;
-  m_matrix.template block<1,Dim>(Dim,0).setZero();
-  m_matrix(Dim,Dim) = Scalar(1);
-  return *this;
-}
-
-/** \nonstableyet
-  *
-  * \returns the inverse transformation matrix according to some given knowledge
-  * on \c *this.
-  *
-  * \param traits allows to optimize the inversion process when the transformion
-  * is known to be not a general transformation. The possible values are:
-  *  - Projective if the transformation is not necessarily affine, i.e., if the
-  *    last row is not guaranteed to be [0 ... 0 1]
-  *  - Affine is the default, the last row is assumed to be [0 ... 0 1]
-  *  - Isometry if the transformation is only a concatenations of translations
-  *    and rotations.
-  *
-  * \warning unless \a traits is always set to NoShear or NoScaling, this function
-  * requires the generic inverse method of MatrixBase defined in the LU module. If
-  * you forget to include this module, then you will get hard to debug linking errors.
-  *
-  * \sa MatrixBase::inverse()
-  */
-template<typename Scalar, int Dim>
-inline const typename Transform<Scalar,Dim>::MatrixType
-Transform<Scalar,Dim>::inverse(TransformTraits traits) const
-{
-  if (traits == Projective)
-  {
-    return m_matrix.inverse();
-  }
-  else
-  {
-    MatrixType res;
-    if (traits == Affine)
-    {
-      res.template corner<Dim,Dim>(TopLeft) = linear().inverse();
-    }
-    else if (traits == Isometry)
-    {
-      res.template corner<Dim,Dim>(TopLeft) = linear().transpose();
-    }
-    else
-    {
-      ei_assert("invalid traits value in Transform::inverse()");
-    }
-    // translation and remaining parts
-    res.template corner<Dim,1>(TopRight) = - res.template corner<Dim,Dim>(TopLeft) * translation();
-    res.template corner<1,Dim>(BottomLeft).setZero();
-    res.coeffRef(Dim,Dim) = Scalar(1);
-    return res;
-  }
-}
-
-/*****************************************************
-*** Specializations of operator* with a MatrixBase ***
-*****************************************************/
-
-template<typename Other, int Dim, int HDim>
-struct ei_transform_product_impl<Other,Dim,HDim, HDim,HDim>
-{
-  typedef Transform<typename Other::Scalar,Dim> TransformType;
-  typedef typename TransformType::MatrixType MatrixType;
-  typedef typename ProductReturnType<MatrixType,Other>::Type ResultType;
-  static ResultType run(const TransformType& tr, const Other& other)
-  { return tr.matrix() * other; }
-};
-
-template<typename Other, int Dim, int HDim>
-struct ei_transform_product_impl<Other,Dim,HDim, Dim,Dim>
-{
-  typedef Transform<typename Other::Scalar,Dim> TransformType;
-  typedef typename TransformType::MatrixType MatrixType;
-  typedef TransformType ResultType;
-  static ResultType run(const TransformType& tr, const Other& other)
-  {
-    TransformType res;
-    res.translation() = tr.translation();
-    res.matrix().row(Dim) = tr.matrix().row(Dim);
-    res.linear() = (tr.linear() * other).lazy();
-    return res;
-  }
-};
-
-template<typename Other, int Dim, int HDim>
-struct ei_transform_product_impl<Other,Dim,HDim, HDim,1>
-{
-  typedef Transform<typename Other::Scalar,Dim> TransformType;
-  typedef typename TransformType::MatrixType MatrixType;
-  typedef typename ProductReturnType<MatrixType,Other>::Type ResultType;
-  static ResultType run(const TransformType& tr, const Other& other)
-  { return tr.matrix() * other; }
-};
-
-template<typename Other, int Dim, int HDim>
-struct ei_transform_product_impl<Other,Dim,HDim, Dim,1>
-{
-  typedef typename Other::Scalar Scalar;
-  typedef Transform<Scalar,Dim> TransformType;
-  typedef Matrix<Scalar,Dim,1> ResultType;
-  static ResultType run(const TransformType& tr, const Other& other)
-  { return ((tr.linear() * other) + tr.translation())
-          * (Scalar(1) / ( (tr.matrix().template block<1,Dim>(Dim,0) * other).coeff(0) + tr.matrix().coeff(Dim,Dim))); }
-};
-
-} // end namespace Eigen
diff --git a/inst/include/Eigen/src/Eigen2Support/Geometry/Translation.h b/inst/include/Eigen/src/Eigen2Support/Geometry/Translation.h
deleted file mode 100644
index 2b9859f6..00000000
--- a/inst/include/Eigen/src/Eigen2Support/Geometry/Translation.h
+++ /dev/null
@@ -1,184 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// no include guard, we'll include this twice from All.h from Eigen2Support, and it's internal anyway
-
-namespace Eigen { 
-
-/** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Translation
-  *
-  * \brief Represents a translation transformation
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients.
-  * \param _Dim the  dimension of the space, can be a compile time value or Dynamic
-  *
-  * \note This class is not aimed to be used to store a translation transformation,
-  * but rather to make easier the constructions and updates of Transform objects.
-  *
-  * \sa class Scaling, class Transform
-  */
-template<typename _Scalar, int _Dim>
-class Translation
-{
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Dim)
-  /** dimension of the space */
-  enum { Dim = _Dim };
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-  /** corresponding vector type */
-  typedef Matrix<Scalar,Dim,1> VectorType;
-  /** corresponding linear transformation matrix type */
-  typedef Matrix<Scalar,Dim,Dim> LinearMatrixType;
-  /** corresponding scaling transformation type */
-  typedef Scaling<Scalar,Dim> ScalingType;
-  /** corresponding affine transformation type */
-  typedef Transform<Scalar,Dim> TransformType;
-
-protected:
-
-  VectorType m_coeffs;
-
-public:
-
-  /** Default constructor without initialization. */
-  Translation() {}
-  /**  */
-  inline Translation(const Scalar& sx, const Scalar& sy)
-  {
-    ei_assert(Dim==2);
-    m_coeffs.x() = sx;
-    m_coeffs.y() = sy;
-  }
-  /**  */
-  inline Translation(const Scalar& sx, const Scalar& sy, const Scalar& sz)
-  {
-    ei_assert(Dim==3);
-    m_coeffs.x() = sx;
-    m_coeffs.y() = sy;
-    m_coeffs.z() = sz;
-  }
-  /** Constructs and initialize the scaling transformation from a vector of scaling coefficients */
-  explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {}
-
-  const VectorType& vector() const { return m_coeffs; }
-  VectorType& vector() { return m_coeffs; }
-
-  /** Concatenates two translation */
-  inline Translation operator* (const Translation& other) const
-  { return Translation(m_coeffs + other.m_coeffs); }
-
-  /** Concatenates a translation and a scaling */
-  inline TransformType operator* (const ScalingType& other) const;
-
-  /** Concatenates a translation and a linear transformation */
-  inline TransformType operator* (const LinearMatrixType& linear) const;
-
-  template<typename Derived>
-  inline TransformType operator*(const RotationBase<Derived,Dim>& r) const
-  { return *this * r.toRotationMatrix(); }
-
-  /** Concatenates a linear transformation and a translation */
-  // its a nightmare to define a templated friend function outside its declaration
-  friend inline TransformType operator* (const LinearMatrixType& linear, const Translation& t)
-  {
-    TransformType res;
-    res.matrix().setZero();
-    res.linear() = linear;
-    res.translation() = linear * t.m_coeffs;
-    res.matrix().row(Dim).setZero();
-    res(Dim,Dim) = Scalar(1);
-    return res;
-  }
-
-  /** Concatenates a translation and an affine transformation */
-  inline TransformType operator* (const TransformType& t) const;
-
-  /** Applies translation to vector */
-  inline VectorType operator* (const VectorType& other) const
-  { return m_coeffs + other; }
-
-  /** \returns the inverse translation (opposite) */
-  Translation inverse() const { return Translation(-m_coeffs); }
-
-  Translation& operator=(const Translation& other)
-  {
-    m_coeffs = other.m_coeffs;
-    return *this;
-  }
-
-  /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Translation,Translation<NewScalarType,Dim> >::type cast() const
-  { return typename internal::cast_return_type<Translation,Translation<NewScalarType,Dim> >::type(*this); }
-
-  /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Translation(const Translation<OtherScalarType,Dim>& other)
-  { m_coeffs = other.vector().template cast<Scalar>(); }
-
-  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Translation& other, typename NumTraits<Scalar>::Real prec = precision<Scalar>()) const
-  { return m_coeffs.isApprox(other.m_coeffs, prec); }
-
-};
-
-/** \addtogroup Geometry_Module */
-//@{
-typedef Translation<float, 2> Translation2f;
-typedef Translation<double,2> Translation2d;
-typedef Translation<float, 3> Translation3f;
-typedef Translation<double,3> Translation3d;
-//@}
-
-
-template<typename Scalar, int Dim>
-inline typename Translation<Scalar,Dim>::TransformType
-Translation<Scalar,Dim>::operator* (const ScalingType& other) const
-{
-  TransformType res;
-  res.matrix().setZero();
-  res.linear().diagonal() = other.coeffs();
-  res.translation() = m_coeffs;
-  res(Dim,Dim) = Scalar(1);
-  return res;
-}
-
-template<typename Scalar, int Dim>
-inline typename Translation<Scalar,Dim>::TransformType
-Translation<Scalar,Dim>::operator* (const LinearMatrixType& linear) const
-{
-  TransformType res;
-  res.matrix().setZero();
-  res.linear() = linear;
-  res.translation() = m_coeffs;
-  res.matrix().row(Dim).setZero();
-  res(Dim,Dim) = Scalar(1);
-  return res;
-}
-
-template<typename Scalar, int Dim>
-inline typename Translation<Scalar,Dim>::TransformType
-Translation<Scalar,Dim>::operator* (const TransformType& t) const
-{
-  TransformType res = t;
-  res.pretranslate(m_coeffs);
-  return res;
-}
-
-} // end namespace Eigen
diff --git a/inst/include/Eigen/src/Eigen2Support/LU.h b/inst/include/Eigen/src/Eigen2Support/LU.h
deleted file mode 100644
index 49f19ad7..00000000
--- a/inst/include/Eigen/src/Eigen2Support/LU.h
+++ /dev/null
@@ -1,120 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_LU_H
-#define EIGEN2_LU_H
-
-namespace Eigen { 
-
-template<typename MatrixType>
-class LU : public FullPivLU<MatrixType>
-{
-  public:
-
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef Matrix<int, 1, MatrixType::ColsAtCompileTime, MatrixType::Options, 1, MatrixType::MaxColsAtCompileTime> IntRowVectorType;
-    typedef Matrix<int, MatrixType::RowsAtCompileTime, 1, MatrixType::Options, MatrixType::MaxRowsAtCompileTime, 1> IntColVectorType;
-    typedef Matrix<Scalar, 1, MatrixType::ColsAtCompileTime, MatrixType::Options, 1, MatrixType::MaxColsAtCompileTime> RowVectorType;
-    typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1, MatrixType::Options, MatrixType::MaxRowsAtCompileTime, 1> ColVectorType;
-
-    typedef Matrix<typename MatrixType::Scalar,
-                  MatrixType::ColsAtCompileTime, // the number of rows in the "kernel matrix" is the number of cols of the original matrix
-                                                 // so that the product "matrix * kernel = zero" makes sense
-                  Dynamic,                       // we don't know at compile-time the dimension of the kernel
-                  MatrixType::Options,
-                  MatrixType::MaxColsAtCompileTime, // see explanation for 2nd template parameter
-                  MatrixType::MaxColsAtCompileTime // the kernel is a subspace of the domain space, whose dimension is the number
-                                                   // of columns of the original matrix
-    > KernelResultType;
-
-    typedef Matrix<typename MatrixType::Scalar,
-                   MatrixType::RowsAtCompileTime, // the image is a subspace of the destination space, whose dimension is the number
-                                                  // of rows of the original matrix
-                   Dynamic,                       // we don't know at compile time the dimension of the image (the rank)
-                   MatrixType::Options,
-                   MatrixType::MaxRowsAtCompileTime, // the image matrix will consist of columns from the original matrix,
-                   MatrixType::MaxColsAtCompileTime  // so it has the same number of rows and at most as many columns.
-    > ImageResultType;
-
-    typedef FullPivLU<MatrixType> Base;
-
-    template<typename T>
-    explicit LU(const T& t) : Base(t), m_originalMatrix(t) {}
-
-    template<typename OtherDerived, typename ResultType>
-    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
-    {
-      *result = static_cast<const Base*>(this)->solve(b);
-      return true;
-    }
-
-    template<typename ResultType>
-    inline void computeInverse(ResultType *result) const
-    {
-      solve(MatrixType::Identity(this->rows(), this->cols()), result);
-    }
-    
-    template<typename KernelMatrixType>
-    void computeKernel(KernelMatrixType *result) const
-    {
-      *result = static_cast<const Base*>(this)->kernel();
-    }
-    
-    template<typename ImageMatrixType>
-    void computeImage(ImageMatrixType *result) const
-    {
-      *result = static_cast<const Base*>(this)->image(m_originalMatrix);
-    }
-    
-    const ImageResultType image() const
-    {
-      return static_cast<const Base*>(this)->image(m_originalMatrix);
-    }
-    
-    const MatrixType& m_originalMatrix;
-};
-
-#if EIGEN2_SUPPORT_STAGE < STAGE20_RESOLVE_API_CONFLICTS
-/** \lu_module
-  *
-  * Synonym of partialPivLu().
-  *
-  * \return the partial-pivoting LU decomposition of \c *this.
-  *
-  * \sa class PartialPivLU
-  */
-template<typename Derived>
-inline const LU<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::lu() const
-{
-  return LU<PlainObject>(eval());
-}
-#endif
-
-#ifdef EIGEN2_SUPPORT
-/** \lu_module
-  *
-  * Synonym of partialPivLu().
-  *
-  * \return the partial-pivoting LU decomposition of \c *this.
-  *
-  * \sa class PartialPivLU
-  */
-template<typename Derived>
-inline const LU<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::eigen2_lu() const
-{
-  return LU<PlainObject>(eval());
-}
-#endif
-
-} // end namespace Eigen
-
-#endif // EIGEN2_LU_H
diff --git a/inst/include/Eigen/src/Eigen2Support/Lazy.h b/inst/include/Eigen/src/Eigen2Support/Lazy.h
deleted file mode 100644
index 593fc78e..00000000
--- a/inst/include/Eigen/src/Eigen2Support/Lazy.h
+++ /dev/null
@@ -1,71 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_LAZY_H
-#define EIGEN_LAZY_H
-
-namespace Eigen { 
-
-/** \deprecated it is only used by lazy() which is deprecated
-  *
-  * \returns an expression of *this with added flags
-  *
-  * Example: \include MatrixBase_marked.cpp
-  * Output: \verbinclude MatrixBase_marked.out
-  *
-  * \sa class Flagged, extract(), part()
-  */
-template<typename Derived>
-template<unsigned int Added>
-inline const Flagged<Derived, Added, 0>
-MatrixBase<Derived>::marked() const
-{
-  return derived();
-}
-
-/** \deprecated use MatrixBase::noalias()
-  *
-  * \returns an expression of *this with the EvalBeforeAssigningBit flag removed.
-  *
-  * Example: \include MatrixBase_lazy.cpp
-  * Output: \verbinclude MatrixBase_lazy.out
-  *
-  * \sa class Flagged, marked()
-  */
-template<typename Derived>
-inline const Flagged<Derived, 0, EvalBeforeAssigningBit>
-MatrixBase<Derived>::lazy() const
-{
-  return derived();
-}
-
-
-/** \internal
-  * Overloaded to perform an efficient C += (A*B).lazy() */
-template<typename Derived>
-template<typename ProductDerived, typename Lhs, typename Rhs>
-Derived& MatrixBase<Derived>::operator+=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
-                                                       EvalBeforeAssigningBit>& other)
-{
-  other._expression().derived().addTo(derived()); return derived();
-}
-
-/** \internal
-  * Overloaded to perform an efficient C -= (A*B).lazy() */
-template<typename Derived>
-template<typename ProductDerived, typename Lhs, typename Rhs>
-Derived& MatrixBase<Derived>::operator-=(const Flagged<ProductBase<ProductDerived, Lhs,Rhs>, 0,
-                                                       EvalBeforeAssigningBit>& other)
-{
-  other._expression().derived().subTo(derived()); return derived();
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_LAZY_H
diff --git a/inst/include/Eigen/src/Eigen2Support/LeastSquares.h b/inst/include/Eigen/src/Eigen2Support/LeastSquares.h
deleted file mode 100644
index 7992d494..00000000
--- a/inst/include/Eigen/src/Eigen2Support/LeastSquares.h
+++ /dev/null
@@ -1,169 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_LEASTSQUARES_H
-#define EIGEN2_LEASTSQUARES_H
-
-namespace Eigen { 
-
-/** \ingroup LeastSquares_Module
-  *
-  * \leastsquares_module
-  *
-  * For a set of points, this function tries to express
-  * one of the coords as a linear (affine) function of the other coords.
-  *
-  * This is best explained by an example. This function works in full
-  * generality, for points in a space of arbitrary dimension, and also over
-  * the complex numbers, but for this example we will work in dimension 3
-  * over the real numbers (doubles).
-  *
-  * So let us work with the following set of 5 points given by their
-  * \f$(x,y,z)\f$ coordinates:
-  * @code
-    Vector3d points[5];
-    points[0] = Vector3d( 3.02, 6.89, -4.32 );
-    points[1] = Vector3d( 2.01, 5.39, -3.79 );
-    points[2] = Vector3d( 2.41, 6.01, -4.01 );
-    points[3] = Vector3d( 2.09, 5.55, -3.86 );
-    points[4] = Vector3d( 2.58, 6.32, -4.10 );
-  * @endcode
-  * Suppose that we want to express the second coordinate (\f$y\f$) as a linear
-  * expression in \f$x\f$ and \f$z\f$, that is,
-  * \f[ y=ax+bz+c \f]
-  * for some constants \f$a,b,c\f$. Thus, we want to find the best possible
-  * constants \f$a,b,c\f$ so that the plane of equation \f$y=ax+bz+c\f$ fits
-  * best the five above points. To do that, call this function as follows:
-  * @code
-    Vector3d coeffs; // will store the coefficients a, b, c
-    linearRegression(
-      5,
-      &points,
-      &coeffs,
-      1 // the coord to express as a function of
-        // the other ones. 0 means x, 1 means y, 2 means z.
-    );
-  * @endcode
-  * Now the vector \a coeffs is approximately
-  * \f$( 0.495 ,  -1.927 ,  -2.906 )\f$.
-  * Thus, we get \f$a=0.495, b = -1.927, c = -2.906\f$. Let us check for
-  * instance how near points[0] is from the plane of equation \f$y=ax+bz+c\f$.
-  * Looking at the coords of points[0], we see that:
-  * \f[ax+bz+c = 0.495 * 3.02 + (-1.927) * (-4.32) + (-2.906) = 6.91.\f]
-  * On the other hand, we have \f$y=6.89\f$. We see that the values
-  * \f$6.91\f$ and \f$6.89\f$
-  * are near, so points[0] is very near the plane of equation \f$y=ax+bz+c\f$.
-  *
-  * Let's now describe precisely the parameters:
-  * @param numPoints the number of points
-  * @param points the array of pointers to the points on which to perform the linear regression
-  * @param result pointer to the vector in which to store the result.
-                  This vector must be of the same type and size as the
-                  data points. The meaning of its coords is as follows.
-                  For brevity, let \f$n=Size\f$,
-                  \f$r_i=result[i]\f$,
-                  and \f$f=funcOfOthers\f$. Denote by
-                  \f$x_0,\ldots,x_{n-1}\f$
-                  the n coordinates in the n-dimensional space.
-                  Then the resulting equation is:
-                  \f[ x_f = r_0 x_0 + \cdots + r_{f-1}x_{f-1}
-                   + r_{f+1}x_{f+1} + \cdots + r_{n-1}x_{n-1} + r_n. \f]
-  * @param funcOfOthers Determines which coord to express as a function of the
-                        others. Coords are numbered starting from 0, so that a
-                        value of 0 means \f$x\f$, 1 means \f$y\f$,
-                        2 means \f$z\f$, ...
-  *
-  * \sa fitHyperplane()
-  */
-template<typename VectorType>
-void linearRegression(int numPoints,
-                      VectorType **points,
-                      VectorType *result,
-                      int funcOfOthers )
-{
-  typedef typename VectorType::Scalar Scalar;
-  typedef Hyperplane<Scalar, VectorType::SizeAtCompileTime> HyperplaneType;
-  const int size = points[0]->size();
-  result->resize(size);
-  HyperplaneType h(size);
-  fitHyperplane(numPoints, points, &h);
-  for(int i = 0; i < funcOfOthers; i++)
-    result->coeffRef(i) = - h.coeffs()[i] / h.coeffs()[funcOfOthers];
-  for(int i = funcOfOthers; i < size; i++)
-    result->coeffRef(i) = - h.coeffs()[i+1] / h.coeffs()[funcOfOthers];
-}
-
-/** \ingroup LeastSquares_Module
-  *
-  * \leastsquares_module
-  *
-  * This function is quite similar to linearRegression(), so we refer to the
-  * documentation of this function and only list here the differences.
-  *
-  * The main difference from linearRegression() is that this function doesn't
-  * take a \a funcOfOthers argument. Instead, it finds a general equation
-  * of the form
-  * \f[ r_0 x_0 + \cdots + r_{n-1}x_{n-1} + r_n = 0, \f]
-  * where \f$n=Size\f$, \f$r_i=retCoefficients[i]\f$, and we denote by
-  * \f$x_0,\ldots,x_{n-1}\f$ the n coordinates in the n-dimensional space.
-  *
-  * Thus, the vector \a retCoefficients has size \f$n+1\f$, which is another
-  * difference from linearRegression().
-  *
-  * In practice, this function performs an hyper-plane fit in a total least square sense
-  * via the following steps:
-  *  1 - center the data to the mean
-  *  2 - compute the covariance matrix
-  *  3 - pick the eigenvector corresponding to the smallest eigenvalue of the covariance matrix
-  * The ratio of the smallest eigenvalue and the second one gives us a hint about the relevance
-  * of the solution. This value is optionally returned in \a soundness.
-  *
-  * \sa linearRegression()
-  */
-template<typename VectorType, typename HyperplaneType>
-void fitHyperplane(int numPoints,
-                   VectorType **points,
-                   HyperplaneType *result,
-                   typename NumTraits<typename VectorType::Scalar>::Real* soundness = 0)
-{
-  typedef typename VectorType::Scalar Scalar;
-  typedef Matrix<Scalar,VectorType::SizeAtCompileTime,VectorType::SizeAtCompileTime> CovMatrixType;
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(VectorType)
-  ei_assert(numPoints >= 1);
-  int size = points[0]->size();
-  ei_assert(size+1 == result->coeffs().size());
-
-  // compute the mean of the data
-  VectorType mean = VectorType::Zero(size);
-  for(int i = 0; i < numPoints; ++i)
-    mean += *(points[i]);
-  mean /= numPoints;
-
-  // compute the covariance matrix
-  CovMatrixType covMat = CovMatrixType::Zero(size, size);
-  for(int i = 0; i < numPoints; ++i)
-  {
-    VectorType diff = (*(points[i]) - mean).conjugate();
-    covMat += diff * diff.adjoint();
-  }
-
-  // now we just have to pick the eigen vector with smallest eigen value
-  SelfAdjointEigenSolver<CovMatrixType> eig(covMat);
-  result->normal() = eig.eigenvectors().col(0);
-  if (soundness)
-    *soundness = eig.eigenvalues().coeff(0)/eig.eigenvalues().coeff(1);
-
-  // let's compute the constant coefficient such that the
-  // plane pass trough the mean point:
-  result->offset() = - (result->normal().cwise()* mean).sum();
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN2_LEASTSQUARES_H
diff --git a/inst/include/Eigen/src/Eigen2Support/Macros.h b/inst/include/Eigen/src/Eigen2Support/Macros.h
deleted file mode 100644
index 351c32af..00000000
--- a/inst/include/Eigen/src/Eigen2Support/Macros.h
+++ /dev/null
@@ -1,20 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_MACROS_H
-#define EIGEN2_MACROS_H
-
-#define ei_assert eigen_assert
-#define ei_internal_assert eigen_internal_assert
-
-#define EIGEN_ALIGN_128 EIGEN_ALIGN16
-
-#define EIGEN_ARCH_WANTS_ALIGNMENT EIGEN_ALIGN_STATICALLY
-
-#endif // EIGEN2_MACROS_H
diff --git a/inst/include/Eigen/src/Eigen2Support/MathFunctions.h b/inst/include/Eigen/src/Eigen2Support/MathFunctions.h
deleted file mode 100644
index 3544af25..00000000
--- a/inst/include/Eigen/src/Eigen2Support/MathFunctions.h
+++ /dev/null
@@ -1,57 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_MATH_FUNCTIONS_H
-#define EIGEN2_MATH_FUNCTIONS_H
-
-namespace Eigen { 
-
-template<typename T> inline typename NumTraits<T>::Real ei_real(const T& x) { return numext::real(x); }
-template<typename T> inline typename NumTraits<T>::Real ei_imag(const T& x) { return numext::imag(x); }
-template<typename T> inline T ei_conj(const T& x) { return numext::conj(x); }
-template<typename T> inline typename NumTraits<T>::Real ei_abs (const T& x) { using std::abs; return abs(x); }
-template<typename T> inline typename NumTraits<T>::Real ei_abs2(const T& x) { return numext::abs2(x); }
-template<typename T> inline T ei_sqrt(const T& x) { using std::sqrt; return sqrt(x); }
-template<typename T> inline T ei_exp (const T& x) { using std::exp;  return exp(x); }
-template<typename T> inline T ei_log (const T& x) { using std::log;  return log(x); }
-template<typename T> inline T ei_sin (const T& x) { using std::sin;  return sin(x); }
-template<typename T> inline T ei_cos (const T& x) { using std::cos;  return cos(x); }
-template<typename T> inline T ei_atan2(const T& x,const T& y) { using std::atan2; return atan2(x,y); }
-template<typename T> inline T ei_pow (const T& x,const T& y) { return numext::pow(x,y); }
-template<typename T> inline T ei_random () { return internal::random<T>(); }
-template<typename T> inline T ei_random (const T& x, const T& y) { return internal::random(x, y); }
-
-template<typename T> inline T precision () { return NumTraits<T>::dummy_precision(); }
-template<typename T> inline T machine_epsilon () { return NumTraits<T>::epsilon(); }
-
-
-template<typename Scalar, typename OtherScalar>
-inline bool ei_isMuchSmallerThan(const Scalar& x, const OtherScalar& y,
-                                   typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
-{
-  return internal::isMuchSmallerThan(x, y, precision);
-}
-
-template<typename Scalar>
-inline bool ei_isApprox(const Scalar& x, const Scalar& y,
-                          typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
-{
-  return internal::isApprox(x, y, precision);
-}
-
-template<typename Scalar>
-inline bool ei_isApproxOrLessThan(const Scalar& x, const Scalar& y,
-                                    typename NumTraits<Scalar>::Real precision = NumTraits<Scalar>::dummy_precision())
-{
-  return internal::isApproxOrLessThan(x, y, precision);
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN2_MATH_FUNCTIONS_H
diff --git a/inst/include/Eigen/src/Eigen2Support/Memory.h b/inst/include/Eigen/src/Eigen2Support/Memory.h
deleted file mode 100644
index f86372b6..00000000
--- a/inst/include/Eigen/src/Eigen2Support/Memory.h
+++ /dev/null
@@ -1,45 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_MEMORY_H
-#define EIGEN2_MEMORY_H
-
-namespace Eigen { 
-
-inline void* ei_aligned_malloc(size_t size) { return internal::aligned_malloc(size); }
-inline void  ei_aligned_free(void *ptr) { internal::aligned_free(ptr); }
-inline void* ei_aligned_realloc(void *ptr, size_t new_size, size_t old_size) { return internal::aligned_realloc(ptr, new_size, old_size); }
-inline void* ei_handmade_aligned_malloc(size_t size) { return internal::handmade_aligned_malloc(size); }
-inline void  ei_handmade_aligned_free(void *ptr) { internal::handmade_aligned_free(ptr); }
-
-template<bool Align> inline void* ei_conditional_aligned_malloc(size_t size)
-{
-  return internal::conditional_aligned_malloc<Align>(size);
-}
-template<bool Align> inline void ei_conditional_aligned_free(void *ptr)
-{
-  internal::conditional_aligned_free<Align>(ptr);
-}
-template<bool Align> inline void* ei_conditional_aligned_realloc(void* ptr, size_t new_size, size_t old_size)
-{
-  return internal::conditional_aligned_realloc<Align>(ptr, new_size, old_size);
-}
-
-template<typename T> inline T* ei_aligned_new(size_t size)
-{
-  return internal::aligned_new<T>(size);
-}
-template<typename T> inline void ei_aligned_delete(T *ptr, size_t size)
-{
-  return internal::aligned_delete(ptr, size);
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN2_MACROS_H
diff --git a/inst/include/Eigen/src/Eigen2Support/Meta.h b/inst/include/Eigen/src/Eigen2Support/Meta.h
deleted file mode 100644
index fa37cfc9..00000000
--- a/inst/include/Eigen/src/Eigen2Support/Meta.h
+++ /dev/null
@@ -1,75 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_META_H
-#define EIGEN2_META_H
-
-namespace Eigen { 
-
-template<typename T>
-struct ei_traits : internal::traits<T>
-{};
-
-struct ei_meta_true {  enum { ret = 1 }; };
-struct ei_meta_false { enum { ret = 0 }; };
-
-template<bool Condition, typename Then, typename Else>
-struct ei_meta_if { typedef Then ret; };
-
-template<typename Then, typename Else>
-struct ei_meta_if <false, Then, Else> { typedef Else ret; };
-
-template<typename T, typename U> struct ei_is_same_type { enum { ret = 0 }; };
-template<typename T> struct ei_is_same_type<T,T> { enum { ret = 1 }; };
-
-template<typename T> struct ei_unref { typedef T type; };
-template<typename T> struct ei_unref<T&> { typedef T type; };
-
-template<typename T> struct ei_unpointer { typedef T type; };
-template<typename T> struct ei_unpointer<T*> { typedef T type; };
-template<typename T> struct ei_unpointer<T*const> { typedef T type; };
-
-template<typename T> struct ei_unconst { typedef T type; };
-template<typename T> struct ei_unconst<const T> { typedef T type; };
-template<typename T> struct ei_unconst<T const &> { typedef T & type; };
-template<typename T> struct ei_unconst<T const *> { typedef T * type; };
-
-template<typename T> struct ei_cleantype { typedef T type; };
-template<typename T> struct ei_cleantype<const T>   { typedef typename ei_cleantype<T>::type type; };
-template<typename T> struct ei_cleantype<const T&>  { typedef typename ei_cleantype<T>::type type; };
-template<typename T> struct ei_cleantype<T&>        { typedef typename ei_cleantype<T>::type type; };
-template<typename T> struct ei_cleantype<const T*>  { typedef typename ei_cleantype<T>::type type; };
-template<typename T> struct ei_cleantype<T*>        { typedef typename ei_cleantype<T>::type type; };
-
-/** \internal In short, it computes int(sqrt(\a Y)) with \a Y an integer.
-  * Usage example: \code ei_meta_sqrt<1023>::ret \endcode
-  */
-template<int Y,
-         int InfX = 0,
-         int SupX = ((Y==1) ? 1 : Y/2),
-         bool Done = ((SupX-InfX)<=1 ? true : ((SupX*SupX <= Y) && ((SupX+1)*(SupX+1) > Y))) >
-                                // use ?: instead of || just to shut up a stupid gcc 4.3 warning
-class ei_meta_sqrt
-{
-    enum {
-      MidX = (InfX+SupX)/2,
-      TakeInf = MidX*MidX > Y ? 1 : 0,
-      NewInf = int(TakeInf) ? InfX : int(MidX),
-      NewSup = int(TakeInf) ? int(MidX) : SupX
-    };
-  public:
-    enum { ret = ei_meta_sqrt<Y,NewInf,NewSup>::ret };
-};
-
-template<int Y, int InfX, int SupX>
-class ei_meta_sqrt<Y, InfX, SupX, true> { public:  enum { ret = (SupX*SupX <= Y) ? SupX : InfX }; };
-
-} // end namespace Eigen
-
-#endif // EIGEN2_META_H
diff --git a/inst/include/Eigen/src/Eigen2Support/Minor.h b/inst/include/Eigen/src/Eigen2Support/Minor.h
deleted file mode 100644
index 4cded573..00000000
--- a/inst/include/Eigen/src/Eigen2Support/Minor.h
+++ /dev/null
@@ -1,117 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2006-2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_MINOR_H
-#define EIGEN_MINOR_H
-
-namespace Eigen { 
-
-/**
-  * \class Minor
-  *
-  * \brief Expression of a minor
-  *
-  * \param MatrixType the type of the object in which we are taking a minor
-  *
-  * This class represents an expression of a minor. It is the return
-  * type of MatrixBase::minor() and most of the time this is the only way it
-  * is used.
-  *
-  * \sa MatrixBase::minor()
-  */
-
-namespace internal {
-template<typename MatrixType>
-struct traits<Minor<MatrixType> >
- : traits<MatrixType>
-{
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
-  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
-  typedef typename MatrixType::StorageKind StorageKind;
-  enum {
-    RowsAtCompileTime = (MatrixType::RowsAtCompileTime != Dynamic) ?
-                          int(MatrixType::RowsAtCompileTime) - 1 : Dynamic,
-    ColsAtCompileTime = (MatrixType::ColsAtCompileTime != Dynamic) ?
-                          int(MatrixType::ColsAtCompileTime) - 1 : Dynamic,
-    MaxRowsAtCompileTime = (MatrixType::MaxRowsAtCompileTime != Dynamic) ?
-                             int(MatrixType::MaxRowsAtCompileTime) - 1 : Dynamic,
-    MaxColsAtCompileTime = (MatrixType::MaxColsAtCompileTime != Dynamic) ?
-                             int(MatrixType::MaxColsAtCompileTime) - 1 : Dynamic,
-    Flags = _MatrixTypeNested::Flags & (HereditaryBits | LvalueBit),
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost // minor is used typically on tiny matrices,
-      // where loops are unrolled and the 'if' evaluates at compile time
-  };
-};
-}
-
-template<typename MatrixType> class Minor
-  : public MatrixBase<Minor<MatrixType> >
-{
-  public:
-
-    typedef MatrixBase<Minor> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(Minor)
-
-    inline Minor(const MatrixType& matrix,
-                       Index row, Index col)
-      : m_matrix(matrix), m_row(row), m_col(col)
-    {
-      eigen_assert(row >= 0 && row < matrix.rows()
-          && col >= 0 && col < matrix.cols());
-    }
-
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Minor)
-
-    inline Index rows() const { return m_matrix.rows() - 1; }
-    inline Index cols() const { return m_matrix.cols() - 1; }
-
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      return m_matrix.const_cast_derived().coeffRef(row + (row >= m_row), col + (col >= m_col));
-    }
-
-    inline const Scalar coeff(Index row, Index col) const
-    {
-      return m_matrix.coeff(row + (row >= m_row), col + (col >= m_col));
-    }
-
-  protected:
-    const typename MatrixType::Nested m_matrix;
-    const Index m_row, m_col;
-};
-
-/**
-  * \return an expression of the (\a row, \a col)-minor of *this,
-  * i.e. an expression constructed from *this by removing the specified
-  * row and column.
-  *
-  * Example: \include MatrixBase_minor.cpp
-  * Output: \verbinclude MatrixBase_minor.out
-  *
-  * \sa class Minor
-  */
-template<typename Derived>
-inline Minor<Derived>
-MatrixBase<Derived>::minor(Index row, Index col)
-{
-  return Minor<Derived>(derived(), row, col);
-}
-
-/**
-  * This is the const version of minor(). */
-template<typename Derived>
-inline const Minor<Derived>
-MatrixBase<Derived>::minor(Index row, Index col) const
-{
-  return Minor<Derived>(derived(), row, col);
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_MINOR_H
diff --git a/inst/include/Eigen/src/Eigen2Support/QR.h b/inst/include/Eigen/src/Eigen2Support/QR.h
deleted file mode 100644
index 2042c985..00000000
--- a/inst/include/Eigen/src/Eigen2Support/QR.h
+++ /dev/null
@@ -1,67 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_QR_H
-#define EIGEN2_QR_H
-
-namespace Eigen { 
-
-template<typename MatrixType>
-class QR : public HouseholderQR<MatrixType>
-{
-  public:
-
-    typedef HouseholderQR<MatrixType> Base;
-    typedef Block<const MatrixType, MatrixType::ColsAtCompileTime, MatrixType::ColsAtCompileTime> MatrixRBlockType;
-
-    QR() : Base() {}
-
-    template<typename T>
-    explicit QR(const T& t) : Base(t) {}
-
-    template<typename OtherDerived, typename ResultType>
-    bool solve(const MatrixBase<OtherDerived>& b, ResultType *result) const
-    {
-      *result = static_cast<const Base*>(this)->solve(b);
-      return true;
-    }
-
-    MatrixType matrixQ(void) const {
-      MatrixType ret = MatrixType::Identity(this->rows(), this->cols());
-      ret = this->householderQ() * ret;
-      return ret;
-    }
-
-    bool isFullRank() const {
-      return true;
-    }
-    
-    const TriangularView<MatrixRBlockType, UpperTriangular>
-    matrixR(void) const
-    {
-      int cols = this->cols();
-      return MatrixRBlockType(this->matrixQR(), 0, 0, cols, cols).template triangularView<UpperTriangular>();
-    }
-};
-
-/** \return the QR decomposition of \c *this.
-  *
-  * \sa class QR
-  */
-template<typename Derived>
-const QR<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::qr() const
-{
-  return QR<PlainObject>(eval());
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN2_QR_H
diff --git a/inst/include/Eigen/src/Eigen2Support/SVD.h b/inst/include/Eigen/src/Eigen2Support/SVD.h
deleted file mode 100644
index 3d03d228..00000000
--- a/inst/include/Eigen/src/Eigen2Support/SVD.h
+++ /dev/null
@@ -1,637 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <g.gael@free.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_SVD_H
-#define EIGEN2_SVD_H
-
-namespace Eigen {
-
-/** \ingroup SVD_Module
-  * \nonstableyet
-  *
-  * \class SVD
-  *
-  * \brief Standard SVD decomposition of a matrix and associated features
-  *
-  * \param MatrixType the type of the matrix of which we are computing the SVD decomposition
-  *
-  * This class performs a standard SVD decomposition of a real matrix A of size \c M x \c N
-  * with \c M \>= \c N.
-  *
-  *
-  * \sa MatrixBase::SVD()
-  */
-template<typename MatrixType> class SVD
-{
-  private:
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-
-    enum {
-      PacketSize = internal::packet_traits<Scalar>::size,
-      AlignmentMask = int(PacketSize)-1,
-      MinSize = EIGEN_SIZE_MIN_PREFER_DYNAMIC(MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime)
-    };
-
-    typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> ColVector;
-    typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, 1> RowVector;
-
-    typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, MinSize> MatrixUType;
-    typedef Matrix<Scalar, MatrixType::ColsAtCompileTime, MatrixType::ColsAtCompileTime> MatrixVType;
-    typedef Matrix<Scalar, MinSize, 1> SingularValuesType;
-
-  public:
-
-    SVD() {} // a user who relied on compiler-generated default compiler reported problems with MSVC in 2.0.7
-    
-    SVD(const MatrixType& matrix)
-      : m_matU(matrix.rows(), (std::min)(matrix.rows(), matrix.cols())),
-        m_matV(matrix.cols(),matrix.cols()),
-        m_sigma((std::min)(matrix.rows(),matrix.cols()))
-    {
-      compute(matrix);
-    }
-
-    template<typename OtherDerived, typename ResultType>
-    bool solve(const MatrixBase<OtherDerived> &b, ResultType* result) const;
-
-    const MatrixUType& matrixU() const { return m_matU; }
-    const SingularValuesType& singularValues() const { return m_sigma; }
-    const MatrixVType& matrixV() const { return m_matV; }
-
-    void compute(const MatrixType& matrix);
-    SVD& sort();
-
-    template<typename UnitaryType, typename PositiveType>
-    void computeUnitaryPositive(UnitaryType *unitary, PositiveType *positive) const;
-    template<typename PositiveType, typename UnitaryType>
-    void computePositiveUnitary(PositiveType *positive, UnitaryType *unitary) const;
-    template<typename RotationType, typename ScalingType>
-    void computeRotationScaling(RotationType *unitary, ScalingType *positive) const;
-    template<typename ScalingType, typename RotationType>
-    void computeScalingRotation(ScalingType *positive, RotationType *unitary) const;
-
-  protected:
-    /** \internal */
-    MatrixUType m_matU;
-    /** \internal */
-    MatrixVType m_matV;
-    /** \internal */
-    SingularValuesType m_sigma;
-};
-
-/** Computes / recomputes the SVD decomposition A = U S V^* of \a matrix
-  *
-  * \note this code has been adapted from JAMA (public domain)
-  */
-template<typename MatrixType>
-void SVD<MatrixType>::compute(const MatrixType& matrix)
-{
-  const int m = matrix.rows();
-  const int n = matrix.cols();
-  const int nu = (std::min)(m,n);
-  ei_assert(m>=n && "In Eigen 2.0, SVD only works for MxN matrices with M>=N. Sorry!");
-  ei_assert(m>1 && "In Eigen 2.0, SVD doesn't work on 1x1 matrices");
-
-  m_matU.resize(m, nu);
-  m_matU.setZero();
-  m_sigma.resize((std::min)(m,n));
-  m_matV.resize(n,n);
-
-  RowVector e(n);
-  ColVector work(m);
-  MatrixType matA(matrix);
-  const bool wantu = true;
-  const bool wantv = true;
-  int i=0, j=0, k=0;
-
-  // Reduce A to bidiagonal form, storing the diagonal elements
-  // in s and the super-diagonal elements in e.
-  int nct = (std::min)(m-1,n);
-  int nrt = (std::max)(0,(std::min)(n-2,m));
-  for (k = 0; k < (std::max)(nct,nrt); ++k)
-  {
-    if (k < nct)
-    {
-      // Compute the transformation for the k-th column and
-      // place the k-th diagonal in m_sigma[k].
-      m_sigma[k] = matA.col(k).end(m-k).norm();
-      if (m_sigma[k] != 0.0) // FIXME
-      {
-        if (matA(k,k) < 0.0)
-          m_sigma[k] = -m_sigma[k];
-        matA.col(k).end(m-k) /= m_sigma[k];
-        matA(k,k) += 1.0;
-      }
-      m_sigma[k] = -m_sigma[k];
-    }
-
-    for (j = k+1; j < n; ++j)
-    {
-      if ((k < nct) && (m_sigma[k] != 0.0))
-      {
-        // Apply the transformation.
-        Scalar t = matA.col(k).end(m-k).eigen2_dot(matA.col(j).end(m-k)); // FIXME dot product or cwise prod + .sum() ??
-        t = -t/matA(k,k);
-        matA.col(j).end(m-k) += t * matA.col(k).end(m-k);
-      }
-
-      // Place the k-th row of A into e for the
-      // subsequent calculation of the row transformation.
-      e[j] = matA(k,j);
-    }
-
-    // Place the transformation in U for subsequent back multiplication.
-    if (wantu & (k < nct))
-      m_matU.col(k).end(m-k) = matA.col(k).end(m-k);
-
-    if (k < nrt)
-    {
-      // Compute the k-th row transformation and place the
-      // k-th super-diagonal in e[k].
-      e[k] = e.end(n-k-1).norm();
-      if (e[k] != 0.0)
-      {
-          if (e[k+1] < 0.0)
-            e[k] = -e[k];
-          e.end(n-k-1) /= e[k];
-          e[k+1] += 1.0;
-      }
-      e[k] = -e[k];
-      if ((k+1 < m) & (e[k] != 0.0))
-      {
-        // Apply the transformation.
-        work.end(m-k-1) = matA.corner(BottomRight,m-k-1,n-k-1) * e.end(n-k-1);
-        for (j = k+1; j < n; ++j)
-          matA.col(j).end(m-k-1) += (-e[j]/e[k+1]) * work.end(m-k-1);
-      }
-
-      // Place the transformation in V for subsequent back multiplication.
-      if (wantv)
-        m_matV.col(k).end(n-k-1) = e.end(n-k-1);
-    }
-  }
-
-
-  // Set up the final bidiagonal matrix or order p.
-  int p = (std::min)(n,m+1);
-  if (nct < n)
-    m_sigma[nct] = matA(nct,nct);
-  if (m < p)
-    m_sigma[p-1] = 0.0;
-  if (nrt+1 < p)
-    e[nrt] = matA(nrt,p-1);
-  e[p-1] = 0.0;
-
-  // If required, generate U.
-  if (wantu)
-  {
-    for (j = nct; j < nu; ++j)
-    {
-      m_matU.col(j).setZero();
-      m_matU(j,j) = 1.0;
-    }
-    for (k = nct-1; k >= 0; k--)
-    {
-      if (m_sigma[k] != 0.0)
-      {
-        for (j = k+1; j < nu; ++j)
-        {
-          Scalar t = m_matU.col(k).end(m-k).eigen2_dot(m_matU.col(j).end(m-k)); // FIXME is it really a dot product we want ?
-          t = -t/m_matU(k,k);
-          m_matU.col(j).end(m-k) += t * m_matU.col(k).end(m-k);
-        }
-        m_matU.col(k).end(m-k) = - m_matU.col(k).end(m-k);
-        m_matU(k,k) = Scalar(1) + m_matU(k,k);
-        if (k-1>0)
-          m_matU.col(k).start(k-1).setZero();
-      }
-      else
-      {
-        m_matU.col(k).setZero();
-        m_matU(k,k) = 1.0;
-      }
-    }
-  }
-
-  // If required, generate V.
-  if (wantv)
-  {
-    for (k = n-1; k >= 0; k--)
-    {
-      if ((k < nrt) & (e[k] != 0.0))
-      {
-        for (j = k+1; j < nu; ++j)
-        {
-          Scalar t = m_matV.col(k).end(n-k-1).eigen2_dot(m_matV.col(j).end(n-k-1)); // FIXME is it really a dot product we want ?
-          t = -t/m_matV(k+1,k);
-          m_matV.col(j).end(n-k-1) += t * m_matV.col(k).end(n-k-1);
-        }
-      }
-      m_matV.col(k).setZero();
-      m_matV(k,k) = 1.0;
-    }
-  }
-
-  // Main iteration loop for the singular values.
-  int pp = p-1;
-  int iter = 0;
-  Scalar eps = ei_pow(Scalar(2),ei_is_same_type<Scalar,float>::ret ? Scalar(-23) : Scalar(-52));
-  while (p > 0)
-  {
-    int k=0;
-    int kase=0;
-
-    // Here is where a test for too many iterations would go.
-
-    // This section of the program inspects for
-    // negligible elements in the s and e arrays.  On
-    // completion the variables kase and k are set as follows.
-
-    // kase = 1     if s(p) and e[k-1] are negligible and k<p
-    // kase = 2     if s(k) is negligible and k<p
-    // kase = 3     if e[k-1] is negligible, k<p, and
-    //              s(k), ..., s(p) are not negligible (qr step).
-    // kase = 4     if e(p-1) is negligible (convergence).
-
-    for (k = p-2; k >= -1; --k)
-    {
-      if (k == -1)
-          break;
-      if (ei_abs(e[k]) <= eps*(ei_abs(m_sigma[k]) + ei_abs(m_sigma[k+1])))
-      {
-          e[k] = 0.0;
-          break;
-      }
-    }
-    if (k == p-2)
-    {
-      kase = 4;
-    }
-    else
-    {
-      int ks;
-      for (ks = p-1; ks >= k; --ks)
-      {
-        if (ks == k)
-          break;
-        Scalar t = (ks != p ? ei_abs(e[ks]) : Scalar(0)) + (ks != k+1 ? ei_abs(e[ks-1]) : Scalar(0));
-        if (ei_abs(m_sigma[ks]) <= eps*t)
-        {
-          m_sigma[ks] = 0.0;
-          break;
-        }
-      }
-      if (ks == k)
-      {
-        kase = 3;
-      }
-      else if (ks == p-1)
-      {
-        kase = 1;
-      }
-      else
-      {
-        kase = 2;
-        k = ks;
-      }
-    }
-    ++k;
-
-    // Perform the task indicated by kase.
-    switch (kase)
-    {
-
-      // Deflate negligible s(p).
-      case 1:
-      {
-        Scalar f(e[p-2]);
-        e[p-2] = 0.0;
-        for (j = p-2; j >= k; --j)
-        {
-          Scalar t(numext::hypot(m_sigma[j],f));
-          Scalar cs(m_sigma[j]/t);
-          Scalar sn(f/t);
-          m_sigma[j] = t;
-          if (j != k)
-          {
-            f = -sn*e[j-1];
-            e[j-1] = cs*e[j-1];
-          }
-          if (wantv)
-          {
-            for (i = 0; i < n; ++i)
-            {
-              t = cs*m_matV(i,j) + sn*m_matV(i,p-1);
-              m_matV(i,p-1) = -sn*m_matV(i,j) + cs*m_matV(i,p-1);
-              m_matV(i,j) = t;
-            }
-          }
-        }
-      }
-      break;
-
-      // Split at negligible s(k).
-      case 2:
-      {
-        Scalar f(e[k-1]);
-        e[k-1] = 0.0;
-        for (j = k; j < p; ++j)
-        {
-          Scalar t(numext::hypot(m_sigma[j],f));
-          Scalar cs( m_sigma[j]/t);
-          Scalar sn(f/t);
-          m_sigma[j] = t;
-          f = -sn*e[j];
-          e[j] = cs*e[j];
-          if (wantu)
-          {
-            for (i = 0; i < m; ++i)
-            {
-              t = cs*m_matU(i,j) + sn*m_matU(i,k-1);
-              m_matU(i,k-1) = -sn*m_matU(i,j) + cs*m_matU(i,k-1);
-              m_matU(i,j) = t;
-            }
-          }
-        }
-      }
-      break;
-
-      // Perform one qr step.
-      case 3:
-      {
-        // Calculate the shift.
-        Scalar scale = (std::max)((std::max)((std::max)((std::max)(
-                        ei_abs(m_sigma[p-1]),ei_abs(m_sigma[p-2])),ei_abs(e[p-2])),
-                        ei_abs(m_sigma[k])),ei_abs(e[k]));
-        Scalar sp = m_sigma[p-1]/scale;
-        Scalar spm1 = m_sigma[p-2]/scale;
-        Scalar epm1 = e[p-2]/scale;
-        Scalar sk = m_sigma[k]/scale;
-        Scalar ek = e[k]/scale;
-        Scalar b = ((spm1 + sp)*(spm1 - sp) + epm1*epm1)/Scalar(2);
-        Scalar c = (sp*epm1)*(sp*epm1);
-        Scalar shift(0);
-        if ((b != 0.0) || (c != 0.0))
-        {
-          shift = ei_sqrt(b*b + c);
-          if (b < 0.0)
-            shift = -shift;
-          shift = c/(b + shift);
-        }
-        Scalar f = (sk + sp)*(sk - sp) + shift;
-        Scalar g = sk*ek;
-
-        // Chase zeros.
-
-        for (j = k; j < p-1; ++j)
-        {
-          Scalar t = numext::hypot(f,g);
-          Scalar cs = f/t;
-          Scalar sn = g/t;
-          if (j != k)
-            e[j-1] = t;
-          f = cs*m_sigma[j] + sn*e[j];
-          e[j] = cs*e[j] - sn*m_sigma[j];
-          g = sn*m_sigma[j+1];
-          m_sigma[j+1] = cs*m_sigma[j+1];
-          if (wantv)
-          {
-            for (i = 0; i < n; ++i)
-            {
-              t = cs*m_matV(i,j) + sn*m_matV(i,j+1);
-              m_matV(i,j+1) = -sn*m_matV(i,j) + cs*m_matV(i,j+1);
-              m_matV(i,j) = t;
-            }
-          }
-          t = numext::hypot(f,g);
-          cs = f/t;
-          sn = g/t;
-          m_sigma[j] = t;
-          f = cs*e[j] + sn*m_sigma[j+1];
-          m_sigma[j+1] = -sn*e[j] + cs*m_sigma[j+1];
-          g = sn*e[j+1];
-          e[j+1] = cs*e[j+1];
-          if (wantu && (j < m-1))
-          {
-            for (i = 0; i < m; ++i)
-            {
-              t = cs*m_matU(i,j) + sn*m_matU(i,j+1);
-              m_matU(i,j+1) = -sn*m_matU(i,j) + cs*m_matU(i,j+1);
-              m_matU(i,j) = t;
-            }
-          }
-        }
-        e[p-2] = f;
-        iter = iter + 1;
-      }
-      break;
-
-      // Convergence.
-      case 4:
-      {
-        // Make the singular values positive.
-        if (m_sigma[k] <= 0.0)
-        {
-          m_sigma[k] = m_sigma[k] < Scalar(0) ? -m_sigma[k] : Scalar(0);
-          if (wantv)
-            m_matV.col(k).start(pp+1) = -m_matV.col(k).start(pp+1);
-        }
-
-        // Order the singular values.
-        while (k < pp)
-        {
-          if (m_sigma[k] >= m_sigma[k+1])
-            break;
-          Scalar t = m_sigma[k];
-          m_sigma[k] = m_sigma[k+1];
-          m_sigma[k+1] = t;
-          if (wantv && (k < n-1))
-            m_matV.col(k).swap(m_matV.col(k+1));
-          if (wantu && (k < m-1))
-            m_matU.col(k).swap(m_matU.col(k+1));
-          ++k;
-        }
-        iter = 0;
-        p--;
-      }
-      break;
-    } // end big switch
-  } // end iterations
-}
-
-template<typename MatrixType>
-SVD<MatrixType>& SVD<MatrixType>::sort()
-{
-  int mu = m_matU.rows();
-  int mv = m_matV.rows();
-  int n  = m_matU.cols();
-
-  for (int i=0; i<n; ++i)
-  {
-    int  k = i;
-    Scalar p = m_sigma.coeff(i);
-
-    for (int j=i+1; j<n; ++j)
-    {
-      if (m_sigma.coeff(j) > p)
-      {
-        k = j;
-        p = m_sigma.coeff(j);
-      }
-    }
-    if (k != i)
-    {
-      m_sigma.coeffRef(k) = m_sigma.coeff(i);  // i.e.
-      m_sigma.coeffRef(i) = p;                 // swaps the i-th and the k-th elements
-
-      int j = mu;
-      for(int s=0; j!=0; ++s, --j)
-        std::swap(m_matU.coeffRef(s,i), m_matU.coeffRef(s,k));
-
-      j = mv;
-      for (int s=0; j!=0; ++s, --j)
-        std::swap(m_matV.coeffRef(s,i), m_matV.coeffRef(s,k));
-    }
-  }
-  return *this;
-}
-
-/** \returns the solution of \f$ A x = b \f$ using the current SVD decomposition of A.
-  * The parts of the solution corresponding to zero singular values are ignored.
-  *
-  * \sa MatrixBase::svd(), LU::solve(), LLT::solve()
-  */
-template<typename MatrixType>
-template<typename OtherDerived, typename ResultType>
-bool SVD<MatrixType>::solve(const MatrixBase<OtherDerived> &b, ResultType* result) const
-{
-  ei_assert(b.rows() == m_matU.rows());
-
-  Scalar maxVal = m_sigma.cwise().abs().maxCoeff();
-  for (int j=0; j<b.cols(); ++j)
-  {
-    Matrix<Scalar,MatrixUType::RowsAtCompileTime,1> aux = m_matU.transpose() * b.col(j);
-
-    for (int i = 0; i <m_matU.cols(); ++i)
-    {
-      Scalar si = m_sigma.coeff(i);
-      if (ei_isMuchSmallerThan(ei_abs(si),maxVal))
-        aux.coeffRef(i) = 0;
-      else
-        aux.coeffRef(i) /= si;
-    }
-
-    result->col(j) = m_matV * aux;
-  }
-  return true;
-}
-
-/** Computes the polar decomposition of the matrix, as a product unitary x positive.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  * Only for square matrices.
-  *
-  * \sa computePositiveUnitary(), computeRotationScaling()
-  */
-template<typename MatrixType>
-template<typename UnitaryType, typename PositiveType>
-void SVD<MatrixType>::computeUnitaryPositive(UnitaryType *unitary,
-                                             PositiveType *positive) const
-{
-  ei_assert(m_matU.cols() == m_matV.cols() && "Polar decomposition is only for square matrices");
-  if(unitary) *unitary = m_matU * m_matV.adjoint();
-  if(positive) *positive = m_matV * m_sigma.asDiagonal() * m_matV.adjoint();
-}
-
-/** Computes the polar decomposition of the matrix, as a product positive x unitary.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  * Only for square matrices.
-  *
-  * \sa computeUnitaryPositive(), computeRotationScaling()
-  */
-template<typename MatrixType>
-template<typename UnitaryType, typename PositiveType>
-void SVD<MatrixType>::computePositiveUnitary(UnitaryType *positive,
-                                             PositiveType *unitary) const
-{
-  ei_assert(m_matU.rows() == m_matV.rows() && "Polar decomposition is only for square matrices");
-  if(unitary) *unitary = m_matU * m_matV.adjoint();
-  if(positive) *positive = m_matU * m_sigma.asDiagonal() * m_matU.adjoint();
-}
-
-/** decomposes the matrix as a product rotation x scaling, the scaling being
-  * not necessarily positive.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  * This method requires the Geometry module.
-  *
-  * \sa computeScalingRotation(), computeUnitaryPositive()
-  */
-template<typename MatrixType>
-template<typename RotationType, typename ScalingType>
-void SVD<MatrixType>::computeRotationScaling(RotationType *rotation, ScalingType *scaling) const
-{
-  ei_assert(m_matU.rows() == m_matV.rows() && "Polar decomposition is only for square matrices");
-  Scalar x = (m_matU * m_matV.adjoint()).determinant(); // so x has absolute value 1
-  Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> sv(m_sigma);
-  sv.coeffRef(0) *= x;
-  if(scaling) scaling->lazyAssign(m_matV * sv.asDiagonal() * m_matV.adjoint());
-  if(rotation)
-  {
-    MatrixType m(m_matU);
-    m.col(0) /= x;
-    rotation->lazyAssign(m * m_matV.adjoint());
-  }
-}
-
-/** decomposes the matrix as a product scaling x rotation, the scaling being
-  * not necessarily positive.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  * This method requires the Geometry module.
-  *
-  * \sa computeRotationScaling(), computeUnitaryPositive()
-  */
-template<typename MatrixType>
-template<typename ScalingType, typename RotationType>
-void SVD<MatrixType>::computeScalingRotation(ScalingType *scaling, RotationType *rotation) const
-{
-  ei_assert(m_matU.rows() == m_matV.rows() && "Polar decomposition is only for square matrices");
-  Scalar x = (m_matU * m_matV.adjoint()).determinant(); // so x has absolute value 1
-  Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> sv(m_sigma);
-  sv.coeffRef(0) *= x;
-  if(scaling) scaling->lazyAssign(m_matU * sv.asDiagonal() * m_matU.adjoint());
-  if(rotation)
-  {
-    MatrixType m(m_matU);
-    m.col(0) /= x;
-    rotation->lazyAssign(m * m_matV.adjoint());
-  }
-}
-
-
-/** \svd_module
-  * \returns the SVD decomposition of \c *this
-  */
-template<typename Derived>
-inline SVD<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::svd() const
-{
-  return SVD<PlainObject>(derived());
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN2_SVD_H
diff --git a/inst/include/Eigen/src/Eigen2Support/TriangularSolver.h b/inst/include/Eigen/src/Eigen2Support/TriangularSolver.h
deleted file mode 100644
index ebbeb3b4..00000000
--- a/inst/include/Eigen/src/Eigen2Support/TriangularSolver.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_TRIANGULAR_SOLVER2_H
-#define EIGEN_TRIANGULAR_SOLVER2_H
-
-namespace Eigen { 
-
-const unsigned int UnitDiagBit = UnitDiag;
-const unsigned int SelfAdjointBit = SelfAdjoint;
-const unsigned int UpperTriangularBit = Upper;
-const unsigned int LowerTriangularBit = Lower;
-
-const unsigned int UpperTriangular = Upper;
-const unsigned int LowerTriangular = Lower;
-const unsigned int UnitUpperTriangular = UnitUpper;
-const unsigned int UnitLowerTriangular = UnitLower;
-
-template<typename ExpressionType, unsigned int Added, unsigned int Removed>
-template<typename OtherDerived>
-typename ExpressionType::PlainObject
-Flagged<ExpressionType,Added,Removed>::solveTriangular(const MatrixBase<OtherDerived>& other) const
-{
-  return m_matrix.template triangularView<Added>().solve(other.derived());
-}
-
-template<typename ExpressionType, unsigned int Added, unsigned int Removed>
-template<typename OtherDerived>
-void Flagged<ExpressionType,Added,Removed>::solveTriangularInPlace(const MatrixBase<OtherDerived>& other) const
-{
-  m_matrix.template triangularView<Added>().solveInPlace(other.derived());
-}
-
-} // end namespace Eigen
-    
-#endif // EIGEN_TRIANGULAR_SOLVER2_H
diff --git a/inst/include/Eigen/src/Eigen2Support/VectorBlock.h b/inst/include/Eigen/src/Eigen2Support/VectorBlock.h
deleted file mode 100644
index 71a8080a..00000000
--- a/inst/include/Eigen/src/Eigen2Support/VectorBlock.h
+++ /dev/null
@@ -1,94 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN2_VECTORBLOCK_H
-#define EIGEN2_VECTORBLOCK_H
-
-namespace Eigen { 
-
-/** \deprecated use DenseMase::head(Index) */
-template<typename Derived>
-inline VectorBlock<Derived>
-MatrixBase<Derived>::start(Index size)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<Derived>(derived(), 0, size);
-}
-
-/** \deprecated use DenseMase::head(Index) */
-template<typename Derived>
-inline const VectorBlock<const Derived>
-MatrixBase<Derived>::start(Index size) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<const Derived>(derived(), 0, size);
-}
-
-/** \deprecated use DenseMase::tail(Index) */
-template<typename Derived>
-inline VectorBlock<Derived>
-MatrixBase<Derived>::end(Index size)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<Derived>(derived(), this->size() - size, size);
-}
-
-/** \deprecated use DenseMase::tail(Index) */
-template<typename Derived>
-inline const VectorBlock<const Derived>
-MatrixBase<Derived>::end(Index size) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<const Derived>(derived(), this->size() - size, size);
-}
-
-/** \deprecated use DenseMase::head() */
-template<typename Derived>
-template<int Size>
-inline VectorBlock<Derived,Size>
-MatrixBase<Derived>::start()
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<Derived,Size>(derived(), 0);
-}
-
-/** \deprecated use DenseMase::head() */
-template<typename Derived>
-template<int Size>
-inline const VectorBlock<const Derived,Size>
-MatrixBase<Derived>::start() const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<const Derived,Size>(derived(), 0);
-}
-
-/** \deprecated use DenseMase::tail() */
-template<typename Derived>
-template<int Size>
-inline VectorBlock<Derived,Size>
-MatrixBase<Derived>::end()
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<Derived, Size>(derived(), size() - Size);
-}
-
-/** \deprecated use DenseMase::tail() */
-template<typename Derived>
-template<int Size>
-inline const VectorBlock<const Derived,Size>
-MatrixBase<Derived>::end() const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return VectorBlock<const Derived, Size>(derived(), size() - Size);
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN2_VECTORBLOCK_H
diff --git a/inst/include/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/inst/include/Eigen/src/Eigenvalues/ComplexEigenSolver.h
index 417c7294..50fa3b80 100644
--- a/inst/include/Eigen/src/Eigenvalues/ComplexEigenSolver.h
+++ b/inst/include/Eigen/src/Eigenvalues/ComplexEigenSolver.h
@@ -14,263 +14,245 @@
 
 #include "./ComplexSchur.h"
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \eigenvalues_module \ingroup Eigenvalues_Module
-  *
-  *
-  * \class ComplexEigenSolver
-  *
-  * \brief Computes eigenvalues and eigenvectors of general complex matrices
-  *
-  * \tparam _MatrixType the type of the matrix of which we are
-  * computing the eigendecomposition; this is expected to be an
-  * instantiation of the Matrix class template.
-  *
-  * The eigenvalues and eigenvectors of a matrix \f$ A \f$ are scalars
-  * \f$ \lambda \f$ and vectors \f$ v \f$ such that \f$ Av = \lambda v
-  * \f$.  If \f$ D \f$ is a diagonal matrix with the eigenvalues on
-  * the diagonal, and \f$ V \f$ is a matrix with the eigenvectors as
-  * its columns, then \f$ A V = V D \f$. The matrix \f$ V \f$ is
-  * almost always invertible, in which case we have \f$ A = V D V^{-1}
-  * \f$. This is called the eigendecomposition.
-  *
-  * The main function in this class is compute(), which computes the
-  * eigenvalues and eigenvectors of a given function. The
-  * documentation for that function contains an example showing the
-  * main features of the class.
-  *
-  * \sa class EigenSolver, class SelfAdjointEigenSolver
-  */
-template<typename _MatrixType> class ComplexEigenSolver
-{
-  public:
-
-    /** \brief Synonym for the template parameter \p _MatrixType. */
-    typedef _MatrixType MatrixType;
-
-    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
-      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
-    };
-
-    /** \brief Scalar type for matrices of type #MatrixType. */
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
-
-    /** \brief Complex scalar type for #MatrixType.
-      *
-      * This is \c std::complex<Scalar> if #Scalar is real (e.g.,
-      * \c float or \c double) and just \c Scalar if #Scalar is
-      * complex.
-      */
-    typedef std::complex<RealScalar> ComplexScalar;
-
-    /** \brief Type for vector of eigenvalues as returned by eigenvalues().
-      *
-      * This is a column vector with entries of type #ComplexScalar.
-      * The length of the vector is the size of #MatrixType.
-      */
-    typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options&(~RowMajor), MaxColsAtCompileTime, 1> EigenvalueType;
-
-    /** \brief Type for matrix of eigenvectors as returned by eigenvectors().
-      *
-      * This is a square matrix with entries of type #ComplexScalar.
-      * The size is the same as the size of #MatrixType.
-      */
-    typedef Matrix<ComplexScalar, RowsAtCompileTime, ColsAtCompileTime, Options, MaxRowsAtCompileTime, MaxColsAtCompileTime> EigenvectorType;
-
-    /** \brief Default constructor.
-      *
-      * The default constructor is useful in cases in which the user intends to
-      * perform decompositions via compute().
-      */
-    ComplexEigenSolver()
-            : m_eivec(),
-              m_eivalues(),
-              m_schur(),
-              m_isInitialized(false),
-              m_eigenvectorsOk(false),
-              m_matX()
-    {}
-
-    /** \brief Default Constructor with memory preallocation
-      *
-      * Like the default constructor but with preallocation of the internal data
-      * according to the specified problem \a size.
-      * \sa ComplexEigenSolver()
-      */
-    ComplexEigenSolver(Index size)
-            : m_eivec(size, size),
-              m_eivalues(size),
-              m_schur(size),
-              m_isInitialized(false),
-              m_eigenvectorsOk(false),
-              m_matX(size, size)
-    {}
-
-    /** \brief Constructor; computes eigendecomposition of given matrix.
-      *
-      * \param[in]  matrix  Square matrix whose eigendecomposition is to be computed.
-      * \param[in]  computeEigenvectors  If true, both the eigenvectors and the
-      *    eigenvalues are computed; if false, only the eigenvalues are
-      *    computed.
-      *
-      * This constructor calls compute() to compute the eigendecomposition.
-      */
-      ComplexEigenSolver(const MatrixType& matrix, bool computeEigenvectors = true)
-            : m_eivec(matrix.rows(),matrix.cols()),
-              m_eivalues(matrix.cols()),
-              m_schur(matrix.rows()),
-              m_isInitialized(false),
-              m_eigenvectorsOk(false),
-              m_matX(matrix.rows(),matrix.cols())
-    {
-      compute(matrix, computeEigenvectors);
-    }
+ *
+ *
+ * \class ComplexEigenSolver
+ *
+ * \brief Computes eigenvalues and eigenvectors of general complex matrices
+ *
+ * \tparam MatrixType_ the type of the matrix of which we are
+ * computing the eigendecomposition; this is expected to be an
+ * instantiation of the Matrix class template.
+ *
+ * The eigenvalues and eigenvectors of a matrix \f$ A \f$ are scalars
+ * \f$ \lambda \f$ and vectors \f$ v \f$ such that \f$ Av = \lambda v
+ * \f$.  If \f$ D \f$ is a diagonal matrix with the eigenvalues on
+ * the diagonal, and \f$ V \f$ is a matrix with the eigenvectors as
+ * its columns, then \f$ A V = V D \f$. The matrix \f$ V \f$ is
+ * almost always invertible, in which case we have \f$ A = V D V^{-1}
+ * \f$. This is called the eigendecomposition.
+ *
+ * The main function in this class is compute(), which computes the
+ * eigenvalues and eigenvectors of a given function. The
+ * documentation for that function contains an example showing the
+ * main features of the class.
+ *
+ * \sa class EigenSolver, class SelfAdjointEigenSolver
+ */
+template <typename MatrixType_>
+class ComplexEigenSolver {
+ public:
+  /** \brief Synonym for the template parameter \p MatrixType_. */
+  typedef MatrixType_ MatrixType;
+
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    Options = internal::traits<MatrixType>::Options,
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+
+  /** \brief Scalar type for matrices of type #MatrixType. */
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
+
+  /** \brief Complex scalar type for #MatrixType.
+   *
+   * This is \c std::complex<Scalar> if #Scalar is real (e.g.,
+   * \c float or \c double) and just \c Scalar if #Scalar is
+   * complex.
+   */
+  typedef internal::make_complex_t<Scalar> ComplexScalar;
+
+  /** \brief Type for vector of eigenvalues as returned by eigenvalues().
+   *
+   * This is a column vector with entries of type #ComplexScalar.
+   * The length of the vector is the size of #MatrixType.
+   */
+  typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & (~RowMajor), MaxColsAtCompileTime, 1> EigenvalueType;
+
+  /** \brief Type for matrix of eigenvectors as returned by eigenvectors().
+   *
+   * This is a square matrix with entries of type #ComplexScalar.
+   * The size is the same as the size of #MatrixType.
+   */
+  typedef Matrix<ComplexScalar, RowsAtCompileTime, ColsAtCompileTime, Options, MaxRowsAtCompileTime,
+                 MaxColsAtCompileTime>
+      EigenvectorType;
+
+  /** \brief Default constructor.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via compute().
+   */
+  ComplexEigenSolver()
+      : m_eivec(), m_eivalues(), m_schur(), m_isInitialized(false), m_eigenvectorsOk(false), m_matX() {}
+
+  /** \brief Default Constructor with memory preallocation
+   *
+   * Like the default constructor but with preallocation of the internal data
+   * according to the specified problem \a size.
+   * \sa ComplexEigenSolver()
+   */
+  explicit ComplexEigenSolver(Index size)
+      : m_eivec(size, size),
+        m_eivalues(size),
+        m_schur(size),
+        m_isInitialized(false),
+        m_eigenvectorsOk(false),
+        m_matX(size, size) {}
+
+  /** \brief Constructor; computes eigendecomposition of given matrix.
+   *
+   * \param[in]  matrix  Square matrix whose eigendecomposition is to be computed.
+   * \param[in]  computeEigenvectors  If true, both the eigenvectors and the
+   *    eigenvalues are computed; if false, only the eigenvalues are
+   *    computed.
+   *
+   * This constructor calls compute() to compute the eigendecomposition.
+   */
+  template <typename InputType>
+  explicit ComplexEigenSolver(const EigenBase<InputType>& matrix, bool computeEigenvectors = true)
+      : m_eivec(matrix.rows(), matrix.cols()),
+        m_eivalues(matrix.cols()),
+        m_schur(matrix.rows()),
+        m_isInitialized(false),
+        m_eigenvectorsOk(false),
+        m_matX(matrix.rows(), matrix.cols()) {
+    compute(matrix.derived(), computeEigenvectors);
+  }
 
-    /** \brief Returns the eigenvectors of given matrix.
-      *
-      * \returns  A const reference to the matrix whose columns are the eigenvectors.
-      *
-      * \pre Either the constructor
-      * ComplexEigenSolver(const MatrixType& matrix, bool) or the member
-      * function compute(const MatrixType& matrix, bool) has been called before
-      * to compute the eigendecomposition of a matrix, and
-      * \p computeEigenvectors was set to true (the default).
-      *
-      * This function returns a matrix whose columns are the eigenvectors. Column
-      * \f$ k \f$ is an eigenvector corresponding to eigenvalue number \f$ k
-      * \f$ as returned by eigenvalues().  The eigenvectors are normalized to
-      * have (Euclidean) norm equal to one. The matrix returned by this
-      * function is the matrix \f$ V \f$ in the eigendecomposition \f$ A = V D
-      * V^{-1} \f$, if it exists.
-      *
-      * Example: \include ComplexEigenSolver_eigenvectors.cpp
-      * Output: \verbinclude ComplexEigenSolver_eigenvectors.out
-      */
-    const EigenvectorType& eigenvectors() const
-    {
-      eigen_assert(m_isInitialized && "ComplexEigenSolver is not initialized.");
-      eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
-      return m_eivec;
-    }
+  /** \brief Returns the eigenvectors of given matrix.
+   *
+   * \returns  A const reference to the matrix whose columns are the eigenvectors.
+   *
+   * \pre Either the constructor
+   * ComplexEigenSolver(const MatrixType& matrix, bool) or the member
+   * function compute(const MatrixType& matrix, bool) has been called before
+   * to compute the eigendecomposition of a matrix, and
+   * \p computeEigenvectors was set to true (the default).
+   *
+   * This function returns a matrix whose columns are the eigenvectors. Column
+   * \f$ k \f$ is an eigenvector corresponding to eigenvalue number \f$ k
+   * \f$ as returned by eigenvalues().  The eigenvectors are normalized to
+   * have (Euclidean) norm equal to one. The matrix returned by this
+   * function is the matrix \f$ V \f$ in the eigendecomposition \f$ A = V D
+   * V^{-1} \f$, if it exists.
+   *
+   * Example: \include ComplexEigenSolver_eigenvectors.cpp
+   * Output: \verbinclude ComplexEigenSolver_eigenvectors.out
+   */
+  const EigenvectorType& eigenvectors() const {
+    eigen_assert(m_isInitialized && "ComplexEigenSolver is not initialized.");
+    eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+    return m_eivec;
+  }
 
-    /** \brief Returns the eigenvalues of given matrix.
-      *
-      * \returns A const reference to the column vector containing the eigenvalues.
-      *
-      * \pre Either the constructor
-      * ComplexEigenSolver(const MatrixType& matrix, bool) or the member
-      * function compute(const MatrixType& matrix, bool) has been called before
-      * to compute the eigendecomposition of a matrix.
-      *
-      * This function returns a column vector containing the
-      * eigenvalues. Eigenvalues are repeated according to their
-      * algebraic multiplicity, so there are as many eigenvalues as
-      * rows in the matrix. The eigenvalues are not sorted in any particular
-      * order.
-      *
-      * Example: \include ComplexEigenSolver_eigenvalues.cpp
-      * Output: \verbinclude ComplexEigenSolver_eigenvalues.out
-      */
-    const EigenvalueType& eigenvalues() const
-    {
-      eigen_assert(m_isInitialized && "ComplexEigenSolver is not initialized.");
-      return m_eivalues;
-    }
+  /** \brief Returns the eigenvalues of given matrix.
+   *
+   * \returns A const reference to the column vector containing the eigenvalues.
+   *
+   * \pre Either the constructor
+   * ComplexEigenSolver(const MatrixType& matrix, bool) or the member
+   * function compute(const MatrixType& matrix, bool) has been called before
+   * to compute the eigendecomposition of a matrix.
+   *
+   * This function returns a column vector containing the
+   * eigenvalues. Eigenvalues are repeated according to their
+   * algebraic multiplicity, so there are as many eigenvalues as
+   * rows in the matrix. The eigenvalues are not sorted in any particular
+   * order.
+   *
+   * Example: \include ComplexEigenSolver_eigenvalues.cpp
+   * Output: \verbinclude ComplexEigenSolver_eigenvalues.out
+   */
+  const EigenvalueType& eigenvalues() const {
+    eigen_assert(m_isInitialized && "ComplexEigenSolver is not initialized.");
+    return m_eivalues;
+  }
 
-    /** \brief Computes eigendecomposition of given matrix.
-      *
-      * \param[in]  matrix  Square matrix whose eigendecomposition is to be computed.
-      * \param[in]  computeEigenvectors  If true, both the eigenvectors and the
-      *    eigenvalues are computed; if false, only the eigenvalues are
-      *    computed.
-      * \returns    Reference to \c *this
-      *
-      * This function computes the eigenvalues of the complex matrix \p matrix.
-      * The eigenvalues() function can be used to retrieve them.  If
-      * \p computeEigenvectors is true, then the eigenvectors are also computed
-      * and can be retrieved by calling eigenvectors().
-      *
-      * The matrix is first reduced to Schur form using the
-      * ComplexSchur class. The Schur decomposition is then used to
-      * compute the eigenvalues and eigenvectors.
-      *
-      * The cost of the computation is dominated by the cost of the
-      * Schur decomposition, which is \f$ O(n^3) \f$ where \f$ n \f$
-      * is the size of the matrix.
-      *
-      * Example: \include ComplexEigenSolver_compute.cpp
-      * Output: \verbinclude ComplexEigenSolver_compute.out
-      */
-    ComplexEigenSolver& compute(const MatrixType& matrix, bool computeEigenvectors = true);
-
-    /** \brief Reports whether previous computation was successful.
-      *
-      * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "ComplexEigenSolver is not initialized.");
-      return m_schur.info();
-    }
+  /** \brief Computes eigendecomposition of given matrix.
+   *
+   * \param[in]  matrix  Square matrix whose eigendecomposition is to be computed.
+   * \param[in]  computeEigenvectors  If true, both the eigenvectors and the
+   *    eigenvalues are computed; if false, only the eigenvalues are
+   *    computed.
+   * \returns    Reference to \c *this
+   *
+   * This function computes the eigenvalues of the complex matrix \p matrix.
+   * The eigenvalues() function can be used to retrieve them.  If
+   * \p computeEigenvectors is true, then the eigenvectors are also computed
+   * and can be retrieved by calling eigenvectors().
+   *
+   * The matrix is first reduced to Schur form using the
+   * ComplexSchur class. The Schur decomposition is then used to
+   * compute the eigenvalues and eigenvectors.
+   *
+   * The cost of the computation is dominated by the cost of the
+   * Schur decomposition, which is \f$ O(n^3) \f$ where \f$ n \f$
+   * is the size of the matrix.
+   *
+   * Example: \include ComplexEigenSolver_compute.cpp
+   * Output: \verbinclude ComplexEigenSolver_compute.out
+   */
+  template <typename InputType>
+  ComplexEigenSolver& compute(const EigenBase<InputType>& matrix, bool computeEigenvectors = true);
+
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful, \c NoConvergence otherwise.
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "ComplexEigenSolver is not initialized.");
+    return m_schur.info();
+  }
 
-    /** \brief Sets the maximum number of iterations allowed. */
-    ComplexEigenSolver& setMaxIterations(Index maxIters)
-    {
-      m_schur.setMaxIterations(maxIters);
-      return *this;
-    }
+  /** \brief Sets the maximum number of iterations allowed. */
+  ComplexEigenSolver& setMaxIterations(Index maxIters) {
+    m_schur.setMaxIterations(maxIters);
+    return *this;
+  }
 
-    /** \brief Returns the maximum number of iterations. */
-    Index getMaxIterations()
-    {
-      return m_schur.getMaxIterations();
-    }
+  /** \brief Returns the maximum number of iterations. */
+  Index getMaxIterations() { return m_schur.getMaxIterations(); }
 
-  protected:
-    
-    static void check_template_parameters()
-    {
-      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
-    }
-    
-    EigenvectorType m_eivec;
-    EigenvalueType m_eivalues;
-    ComplexSchur<MatrixType> m_schur;
-    bool m_isInitialized;
-    bool m_eigenvectorsOk;
-    EigenvectorType m_matX;
-
-  private:
-    void doComputeEigenvectors(const RealScalar& matrixnorm);
-    void sortEigenvalues(bool computeEigenvectors);
-};
+ protected:
+  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+
+  EigenvectorType m_eivec;
+  EigenvalueType m_eivalues;
+  ComplexSchur<MatrixType> m_schur;
+  bool m_isInitialized;
+  bool m_eigenvectorsOk;
+  EigenvectorType m_matX;
 
+ private:
+  void doComputeEigenvectors(RealScalar matrixnorm);
+  void sortEigenvalues(bool computeEigenvectors);
+};
 
-template<typename MatrixType>
-ComplexEigenSolver<MatrixType>& 
-ComplexEigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvectors)
-{
-  check_template_parameters();
-  
+template <typename MatrixType>
+template <typename InputType>
+ComplexEigenSolver<MatrixType>& ComplexEigenSolver<MatrixType>::compute(const EigenBase<InputType>& matrix,
+                                                                        bool computeEigenvectors) {
   // this code is inspired from Jampack
   eigen_assert(matrix.cols() == matrix.rows());
 
   // Do a complex Schur decomposition, A = U T U^*
   // The eigenvalues are on the diagonal of T.
-  m_schur.compute(matrix, computeEigenvectors);
+  m_schur.compute(matrix.derived(), computeEigenvectors);
 
-  if(m_schur.info() == Success)
-  {
+  if (m_schur.info() == Success) {
     m_eivalues = m_schur.matrixT().diagonal();
-    if(computeEigenvectors)
-      doComputeEigenvectors(matrix.norm());
+    if (computeEigenvectors) doComputeEigenvectors(m_schur.matrixT().norm());
     sortEigenvalues(computeEigenvectors);
   }
 
@@ -279,63 +261,55 @@ ComplexEigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEi
   return *this;
 }
 
-
-template<typename MatrixType>
-void ComplexEigenSolver<MatrixType>::doComputeEigenvectors(const RealScalar& matrixnorm)
-{
+template <typename MatrixType>
+void ComplexEigenSolver<MatrixType>::doComputeEigenvectors(RealScalar matrixnorm) {
   const Index n = m_eivalues.size();
 
+  matrixnorm = numext::maxi(matrixnorm, (std::numeric_limits<RealScalar>::min)());
+
   // Compute X such that T = X D X^(-1), where D is the diagonal of T.
   // The matrix X is unit triangular.
   m_matX = EigenvectorType::Zero(n, n);
-  for(Index k=n-1 ; k>=0 ; k--)
-  {
-    m_matX.coeffRef(k,k) = ComplexScalar(1.0,0.0);
+  for (Index k = n - 1; k >= 0; k--) {
+    m_matX.coeffRef(k, k) = ComplexScalar(1.0, 0.0);
     // Compute X(i,k) using the (i,k) entry of the equation X T = D X
-    for(Index i=k-1 ; i>=0 ; i--)
-    {
-      m_matX.coeffRef(i,k) = -m_schur.matrixT().coeff(i,k);
-      if(k-i-1>0)
-        m_matX.coeffRef(i,k) -= (m_schur.matrixT().row(i).segment(i+1,k-i-1) * m_matX.col(k).segment(i+1,k-i-1)).value();
-      ComplexScalar z = m_schur.matrixT().coeff(i,i) - m_schur.matrixT().coeff(k,k);
-      if(z==ComplexScalar(0))
-      {
+    for (Index i = k - 1; i >= 0; i--) {
+      m_matX.coeffRef(i, k) = -m_schur.matrixT().coeff(i, k);
+      if (k - i - 1 > 0)
+        m_matX.coeffRef(i, k) -=
+            (m_schur.matrixT().row(i).segment(i + 1, k - i - 1) * m_matX.col(k).segment(i + 1, k - i - 1)).value();
+      ComplexScalar z = m_schur.matrixT().coeff(i, i) - m_schur.matrixT().coeff(k, k);
+      if (z == ComplexScalar(0)) {
         // If the i-th and k-th eigenvalue are equal, then z equals 0.
         // Use a small value instead, to prevent division by zero.
         numext::real_ref(z) = NumTraits<RealScalar>::epsilon() * matrixnorm;
       }
-      m_matX.coeffRef(i,k) = m_matX.coeff(i,k) / z;
+      m_matX.coeffRef(i, k) = m_matX.coeff(i, k) / z;
     }
   }
 
   // Compute V as V = U X; now A = U T U^* = U X D X^(-1) U^* = V D V^(-1)
   m_eivec.noalias() = m_schur.matrixU() * m_matX;
   // .. and normalize the eigenvectors
-  for(Index k=0 ; k<n ; k++)
-  {
-    m_eivec.col(k).normalize();
+  for (Index k = 0; k < n; k++) {
+    m_eivec.col(k).stableNormalize();
   }
 }
 
-
-template<typename MatrixType>
-void ComplexEigenSolver<MatrixType>::sortEigenvalues(bool computeEigenvectors)
-{
-  const Index n =  m_eivalues.size();
-  for (Index i=0; i<n; i++)
-  {
+template <typename MatrixType>
+void ComplexEigenSolver<MatrixType>::sortEigenvalues(bool computeEigenvectors) {
+  const Index n = m_eivalues.size();
+  for (Index i = 0; i < n; i++) {
     Index k;
-    m_eivalues.cwiseAbs().tail(n-i).minCoeff(&k);
-    if (k != 0)
-    {
+    m_eivalues.cwiseAbs().tail(n - i).minCoeff(&k);
+    if (k != 0) {
       k += i;
-      std::swap(m_eivalues[k],m_eivalues[i]);
-      if(computeEigenvectors)
-	m_eivec.col(i).swap(m_eivec.col(k));
+      std::swap(m_eivalues[k], m_eivalues[i]);
+      if (computeEigenvectors) m_eivec.col(i).swap(m_eivec.col(k));
     }
   }
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_COMPLEX_EIGEN_SOLVER_H
+#endif  // EIGEN_COMPLEX_EIGEN_SOLVER_H
diff --git a/inst/include/Eigen/src/Eigenvalues/ComplexSchur.h b/inst/include/Eigen/src/Eigenvalues/ComplexSchur.h
index 89e6cade..22433f2b 100644
--- a/inst/include/Eigen/src/Eigenvalues/ComplexSchur.h
+++ b/inst/include/Eigen/src/Eigenvalues/ComplexSchur.h
@@ -14,410 +14,397 @@
 
 #include "./HessenbergDecomposition.h"
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
-template<typename MatrixType, bool IsComplex> struct complex_schur_reduce_to_hessenberg;
+template <typename MatrixType, bool IsComplex>
+struct complex_schur_reduce_to_hessenberg;
 }
 
 /** \eigenvalues_module \ingroup Eigenvalues_Module
-  *
-  *
-  * \class ComplexSchur
-  *
-  * \brief Performs a complex Schur decomposition of a real or complex square matrix
-  *
-  * \tparam _MatrixType the type of the matrix of which we are
-  * computing the Schur decomposition; this is expected to be an
-  * instantiation of the Matrix class template.
-  *
-  * Given a real or complex square matrix A, this class computes the
-  * Schur decomposition: \f$ A = U T U^*\f$ where U is a unitary
-  * complex matrix, and T is a complex upper triangular matrix.  The
-  * diagonal of the matrix T corresponds to the eigenvalues of the
-  * matrix A.
-  *
-  * Call the function compute() to compute the Schur decomposition of
-  * a given matrix. Alternatively, you can use the 
-  * ComplexSchur(const MatrixType&, bool) constructor which computes
-  * the Schur decomposition at construction time. Once the
-  * decomposition is computed, you can use the matrixU() and matrixT()
-  * functions to retrieve the matrices U and V in the decomposition.
-  *
-  * \note This code is inspired from Jampack
-  *
-  * \sa class RealSchur, class EigenSolver, class ComplexEigenSolver
-  */
-template<typename _MatrixType> class ComplexSchur
-{
-  public:
-    typedef _MatrixType MatrixType;
-    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
-      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
-    };
-
-    /** \brief Scalar type for matrices of type \p _MatrixType. */
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
-
-    /** \brief Complex scalar type for \p _MatrixType. 
-      *
-      * This is \c std::complex<Scalar> if #Scalar is real (e.g.,
-      * \c float or \c double) and just \c Scalar if #Scalar is
-      * complex.
-      */
-    typedef std::complex<RealScalar> ComplexScalar;
-
-    /** \brief Type for the matrices in the Schur decomposition.
-      *
-      * This is a square matrix with entries of type #ComplexScalar. 
-      * The size is the same as the size of \p _MatrixType.
-      */
-    typedef Matrix<ComplexScalar, RowsAtCompileTime, ColsAtCompileTime, Options, MaxRowsAtCompileTime, MaxColsAtCompileTime> ComplexMatrixType;
-
-    /** \brief Default constructor.
-      *
-      * \param [in] size  Positive integer, size of the matrix whose Schur decomposition will be computed.
-      *
-      * The default constructor is useful in cases in which the user
-      * intends to perform decompositions via compute().  The \p size
-      * parameter is only used as a hint. It is not an error to give a
-      * wrong \p size, but it may impair performance.
-      *
-      * \sa compute() for an example.
-      */
-    ComplexSchur(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime)
-      : m_matT(size,size),
-        m_matU(size,size),
+ *
+ *
+ * \class ComplexSchur
+ *
+ * \brief Performs a complex Schur decomposition of a real or complex square matrix
+ *
+ * \tparam MatrixType_ the type of the matrix of which we are
+ * computing the Schur decomposition; this is expected to be an
+ * instantiation of the Matrix class template.
+ *
+ * Given a real or complex square matrix A, this class computes the
+ * Schur decomposition: \f$ A = U T U^*\f$ where U is a unitary
+ * complex matrix, and T is a complex upper triangular matrix.  The
+ * diagonal of the matrix T corresponds to the eigenvalues of the
+ * matrix A.
+ *
+ * Call the function compute() to compute the Schur decomposition of
+ * a given matrix. Alternatively, you can use the
+ * ComplexSchur(const MatrixType&, bool) constructor which computes
+ * the Schur decomposition at construction time. Once the
+ * decomposition is computed, you can use the matrixU() and matrixT()
+ * functions to retrieve the matrices U and V in the decomposition.
+ *
+ * \note This code is inspired from Jampack
+ *
+ * \sa class RealSchur, class EigenSolver, class ComplexEigenSolver
+ */
+template <typename MatrixType_>
+class ComplexSchur {
+ public:
+  typedef MatrixType_ MatrixType;
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    Options = internal::traits<MatrixType>::Options,
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+
+  /** \brief Scalar type for matrices of type \p MatrixType_. */
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
+
+  /** \brief Complex scalar type for \p MatrixType_.
+   *
+   * This is \c std::complex<Scalar> if #Scalar is real (e.g.,
+   * \c float or \c double) and just \c Scalar if #Scalar is
+   * complex.
+   */
+  typedef internal::make_complex_t<Scalar> ComplexScalar;
+
+  /** \brief Type for the matrices in the Schur decomposition.
+   *
+   * This is a square matrix with entries of type #ComplexScalar.
+   * The size is the same as the size of \p MatrixType_.
+   */
+  typedef Matrix<ComplexScalar, RowsAtCompileTime, ColsAtCompileTime, Options, MaxRowsAtCompileTime,
+                 MaxColsAtCompileTime>
+      ComplexMatrixType;
+
+  /** \brief Default constructor.
+   *
+   * \param [in] size  Positive integer, size of the matrix whose Schur decomposition will be computed.
+   *
+   * The default constructor is useful in cases in which the user
+   * intends to perform decompositions via compute().  The \p size
+   * parameter is only used as a hint. It is not an error to give a
+   * wrong \p size, but it may impair performance.
+   *
+   * \sa compute() for an example.
+   */
+  explicit ComplexSchur(Index size = RowsAtCompileTime == Dynamic ? 1 : RowsAtCompileTime)
+      : m_matT(size, size),
+        m_matU(size, size),
         m_hess(size),
         m_isInitialized(false),
         m_matUisUptodate(false),
-        m_maxIters(-1)
-    {}
-
-    /** \brief Constructor; computes Schur decomposition of given matrix. 
-      * 
-      * \param[in]  matrix    Square matrix whose Schur decomposition is to be computed.
-      * \param[in]  computeU  If true, both T and U are computed; if false, only T is computed.
-      *
-      * This constructor calls compute() to compute the Schur decomposition.
-      *
-      * \sa matrixT() and matrixU() for examples.
-      */
-    ComplexSchur(const MatrixType& matrix, bool computeU = true)
-      : m_matT(matrix.rows(),matrix.cols()),
-        m_matU(matrix.rows(),matrix.cols()),
+        m_maxIters(-1) {}
+
+  /** \brief Constructor; computes Schur decomposition of given matrix.
+   *
+   * \param[in]  matrix    Square matrix whose Schur decomposition is to be computed.
+   * \param[in]  computeU  If true, both T and U are computed; if false, only T is computed.
+   *
+   * This constructor calls compute() to compute the Schur decomposition.
+   *
+   * \sa matrixT() and matrixU() for examples.
+   */
+  template <typename InputType>
+  explicit ComplexSchur(const EigenBase<InputType>& matrix, bool computeU = true)
+      : m_matT(matrix.rows(), matrix.cols()),
+        m_matU(matrix.rows(), matrix.cols()),
         m_hess(matrix.rows()),
         m_isInitialized(false),
         m_matUisUptodate(false),
-        m_maxIters(-1)
-    {
-      compute(matrix, computeU);
-    }
-
-    /** \brief Returns the unitary matrix in the Schur decomposition. 
-      *
-      * \returns A const reference to the matrix U.
-      *
-      * It is assumed that either the constructor
-      * ComplexSchur(const MatrixType& matrix, bool computeU) or the
-      * member function compute(const MatrixType& matrix, bool computeU)
-      * has been called before to compute the Schur decomposition of a
-      * matrix, and that \p computeU was set to true (the default
-      * value).
-      *
-      * Example: \include ComplexSchur_matrixU.cpp
-      * Output: \verbinclude ComplexSchur_matrixU.out
-      */
-    const ComplexMatrixType& matrixU() const
-    {
-      eigen_assert(m_isInitialized && "ComplexSchur is not initialized.");
-      eigen_assert(m_matUisUptodate && "The matrix U has not been computed during the ComplexSchur decomposition.");
-      return m_matU;
-    }
+        m_maxIters(-1) {
+    compute(matrix.derived(), computeU);
+  }
 
-    /** \brief Returns the triangular matrix in the Schur decomposition. 
-      *
-      * \returns A const reference to the matrix T.
-      *
-      * It is assumed that either the constructor
-      * ComplexSchur(const MatrixType& matrix, bool computeU) or the
-      * member function compute(const MatrixType& matrix, bool computeU)
-      * has been called before to compute the Schur decomposition of a
-      * matrix.
-      *
-      * Note that this function returns a plain square matrix. If you want to reference
-      * only the upper triangular part, use:
-      * \code schur.matrixT().triangularView<Upper>() \endcode 
-      *
-      * Example: \include ComplexSchur_matrixT.cpp
-      * Output: \verbinclude ComplexSchur_matrixT.out
-      */
-    const ComplexMatrixType& matrixT() const
-    {
-      eigen_assert(m_isInitialized && "ComplexSchur is not initialized.");
-      return m_matT;
-    }
+  /** \brief Returns the unitary matrix in the Schur decomposition.
+   *
+   * \returns A const reference to the matrix U.
+   *
+   * It is assumed that either the constructor
+   * ComplexSchur(const MatrixType& matrix, bool computeU) or the
+   * member function compute(const MatrixType& matrix, bool computeU)
+   * has been called before to compute the Schur decomposition of a
+   * matrix, and that \p computeU was set to true (the default
+   * value).
+   *
+   * Example: \include ComplexSchur_matrixU.cpp
+   * Output: \verbinclude ComplexSchur_matrixU.out
+   */
+  const ComplexMatrixType& matrixU() const {
+    eigen_assert(m_isInitialized && "ComplexSchur is not initialized.");
+    eigen_assert(m_matUisUptodate && "The matrix U has not been computed during the ComplexSchur decomposition.");
+    return m_matU;
+  }
 
-    /** \brief Computes Schur decomposition of given matrix. 
-      * 
-      * \param[in]  matrix  Square matrix whose Schur decomposition is to be computed.
-      * \param[in]  computeU  If true, both T and U are computed; if false, only T is computed.
-
-      * \returns    Reference to \c *this
-      *
-      * The Schur decomposition is computed by first reducing the
-      * matrix to Hessenberg form using the class
-      * HessenbergDecomposition. The Hessenberg matrix is then reduced
-      * to triangular form by performing QR iterations with a single
-      * shift. The cost of computing the Schur decomposition depends
-      * on the number of iterations; as a rough guide, it may be taken
-      * on the number of iterations; as a rough guide, it may be taken
-      * to be \f$25n^3\f$ complex flops, or \f$10n^3\f$ complex flops
-      * if \a computeU is false.
-      *
-      * Example: \include ComplexSchur_compute.cpp
-      * Output: \verbinclude ComplexSchur_compute.out
-      *
-      * \sa compute(const MatrixType&, bool, Index)
-      */
-    ComplexSchur& compute(const MatrixType& matrix, bool computeU = true);
-    
-    /** \brief Compute Schur decomposition from a given Hessenberg matrix
-     *  \param[in] matrixH Matrix in Hessenberg form H
-     *  \param[in] matrixQ orthogonal matrix Q that transform a matrix A to H : A = Q H Q^T
-     *  \param computeU Computes the matriX U of the Schur vectors
-     * \return Reference to \c *this
-     * 
-     *  This routine assumes that the matrix is already reduced in Hessenberg form matrixH
-     *  using either the class HessenbergDecomposition or another mean. 
-     *  It computes the upper quasi-triangular matrix T of the Schur decomposition of H
-     *  When computeU is true, this routine computes the matrix U such that 
-     *  A = U T U^T =  (QZ) T (QZ)^T = Q H Q^T where A is the initial matrix
-     * 
-     * NOTE Q is referenced if computeU is true; so, if the initial orthogonal matrix
-     * is not available, the user should give an identity matrix (Q.setIdentity())
-     * 
-     * \sa compute(const MatrixType&, bool)
-     */
-    template<typename HessMatrixType, typename OrthMatrixType>
-    ComplexSchur& computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ,  bool computeU=true);
-
-    /** \brief Reports whether previous computation was successful.
-      *
-      * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "ComplexSchur is not initialized.");
-      return m_info;
-    }
+  /** \brief Returns the triangular matrix in the Schur decomposition.
+   *
+   * \returns A const reference to the matrix T.
+   *
+   * It is assumed that either the constructor
+   * ComplexSchur(const MatrixType& matrix, bool computeU) or the
+   * member function compute(const MatrixType& matrix, bool computeU)
+   * has been called before to compute the Schur decomposition of a
+   * matrix.
+   *
+   * Note that this function returns a plain square matrix. If you want to reference
+   * only the upper triangular part, use:
+   * \code schur.matrixT().triangularView<Upper>() \endcode
+   *
+   * Example: \include ComplexSchur_matrixT.cpp
+   * Output: \verbinclude ComplexSchur_matrixT.out
+   */
+  const ComplexMatrixType& matrixT() const {
+    eigen_assert(m_isInitialized && "ComplexSchur is not initialized.");
+    return m_matT;
+  }
 
-    /** \brief Sets the maximum number of iterations allowed. 
-      *
-      * If not specified by the user, the maximum number of iterations is m_maxIterationsPerRow times the size
-      * of the matrix.
-      */
-    ComplexSchur& setMaxIterations(Index maxIters)
-    {
-      m_maxIters = maxIters;
-      return *this;
-    }
+  /** \brief Computes Schur decomposition of given matrix.
+    *
+    * \param[in]  matrix  Square matrix whose Schur decomposition is to be computed.
+    * \param[in]  computeU  If true, both T and U are computed; if false, only T is computed.
+
+    * \returns    Reference to \c *this
+    *
+    * The Schur decomposition is computed by first reducing the
+    * matrix to Hessenberg form using the class
+    * HessenbergDecomposition. The Hessenberg matrix is then reduced
+    * to triangular form by performing QR iterations with a single
+    * shift. The cost of computing the Schur decomposition depends
+    * on the number of iterations; as a rough guide, it may be taken
+    * on the number of iterations; as a rough guide, it may be taken
+    * to be \f$25n^3\f$ complex flops, or \f$10n^3\f$ complex flops
+    * if \a computeU is false.
+    *
+    * Example: \include ComplexSchur_compute.cpp
+    * Output: \verbinclude ComplexSchur_compute.out
+    *
+    * \sa compute(const MatrixType&, bool, Index)
+    */
+  template <typename InputType>
+  ComplexSchur& compute(const EigenBase<InputType>& matrix, bool computeU = true);
+
+  /** \brief Compute Schur decomposition from a given Hessenberg matrix
+   *  \param[in] matrixH Matrix in Hessenberg form H
+   *  \param[in] matrixQ orthogonal matrix Q that transform a matrix A to H : A = Q H Q^T
+   *  \param computeU Computes the matriX U of the Schur vectors
+   * \return Reference to \c *this
+   *
+   *  This routine assumes that the matrix is already reduced in Hessenberg form matrixH
+   *  using either the class HessenbergDecomposition or another mean.
+   *  It computes the upper quasi-triangular matrix T of the Schur decomposition of H
+   *  When computeU is true, this routine computes the matrix U such that
+   *  A = U T U^T =  (QZ) T (QZ)^T = Q H Q^T where A is the initial matrix
+   *
+   * NOTE Q is referenced if computeU is true; so, if the initial orthogonal matrix
+   * is not available, the user should give an identity matrix (Q.setIdentity())
+   *
+   * \sa compute(const MatrixType&, bool)
+   */
+  template <typename HessMatrixType, typename OrthMatrixType>
+  ComplexSchur& computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ,
+                                      bool computeU = true);
+
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful, \c NoConvergence otherwise.
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "ComplexSchur is not initialized.");
+    return m_info;
+  }
 
-    /** \brief Returns the maximum number of iterations. */
-    Index getMaxIterations()
-    {
-      return m_maxIters;
-    }
+  /** \brief Sets the maximum number of iterations allowed.
+   *
+   * If not specified by the user, the maximum number of iterations is m_maxIterationsPerRow times the size
+   * of the matrix.
+   */
+  ComplexSchur& setMaxIterations(Index maxIters) {
+    m_maxIters = maxIters;
+    return *this;
+  }
 
-    /** \brief Maximum number of iterations per row.
-      *
-      * If not otherwise specified, the maximum number of iterations is this number times the size of the
-      * matrix. It is currently set to 30.
-      */
-    static const int m_maxIterationsPerRow = 30;
-
-  protected:
-    ComplexMatrixType m_matT, m_matU;
-    HessenbergDecomposition<MatrixType> m_hess;
-    ComputationInfo m_info;
-    bool m_isInitialized;
-    bool m_matUisUptodate;
-    Index m_maxIters;
-
-  private:  
-    bool subdiagonalEntryIsNeglegible(Index i);
-    ComplexScalar computeShift(Index iu, Index iter);
-    void reduceToTriangularForm(bool computeU);
-    friend struct internal::complex_schur_reduce_to_hessenberg<MatrixType, NumTraits<Scalar>::IsComplex>;
+  /** \brief Returns the maximum number of iterations. */
+  Index getMaxIterations() { return m_maxIters; }
+
+  /** \brief Maximum number of iterations per row.
+   *
+   * If not otherwise specified, the maximum number of iterations is this number times the size of the
+   * matrix. It is currently set to 30.
+   */
+  static const int m_maxIterationsPerRow = 30;
+
+ protected:
+  ComplexMatrixType m_matT, m_matU;
+  HessenbergDecomposition<MatrixType> m_hess;
+  ComputationInfo m_info;
+  bool m_isInitialized;
+  bool m_matUisUptodate;
+  Index m_maxIters;
+
+ private:
+  bool subdiagonalEntryIsNeglegible(Index i);
+  ComplexScalar computeShift(Index iu, Index iter);
+  void reduceToTriangularForm(bool computeU);
+  friend struct internal::complex_schur_reduce_to_hessenberg<MatrixType, NumTraits<Scalar>::IsComplex>;
 };
 
-/** If m_matT(i+1,i) is neglegible in floating point arithmetic
-  * compared to m_matT(i,i) and m_matT(j,j), then set it to zero and
-  * return true, else return false. */
-template<typename MatrixType>
-inline bool ComplexSchur<MatrixType>::subdiagonalEntryIsNeglegible(Index i)
-{
-  RealScalar d = numext::norm1(m_matT.coeff(i,i)) + numext::norm1(m_matT.coeff(i+1,i+1));
-  RealScalar sd = numext::norm1(m_matT.coeff(i+1,i));
-  if (internal::isMuchSmallerThan(sd, d, NumTraits<RealScalar>::epsilon()))
-  {
-    m_matT.coeffRef(i+1,i) = ComplexScalar(0);
+/** If m_matT(i+1,i) is negligible in floating point arithmetic
+ * compared to m_matT(i,i) and m_matT(j,j), then set it to zero and
+ * return true, else return false. */
+template <typename MatrixType>
+inline bool ComplexSchur<MatrixType>::subdiagonalEntryIsNeglegible(Index i) {
+  RealScalar d = numext::norm1(m_matT.coeff(i, i)) + numext::norm1(m_matT.coeff(i + 1, i + 1));
+  RealScalar sd = numext::norm1(m_matT.coeff(i + 1, i));
+  if (internal::isMuchSmallerThan(sd, d, NumTraits<RealScalar>::epsilon())) {
+    m_matT.coeffRef(i + 1, i) = ComplexScalar(0);
     return true;
   }
   return false;
 }
 
-
 /** Compute the shift in the current QR iteration. */
-template<typename MatrixType>
-typename ComplexSchur<MatrixType>::ComplexScalar ComplexSchur<MatrixType>::computeShift(Index iu, Index iter)
-{
+template <typename MatrixType>
+typename ComplexSchur<MatrixType>::ComplexScalar ComplexSchur<MatrixType>::computeShift(Index iu, Index iter) {
   using std::abs;
-  if (iter == 10 || iter == 20) 
-  {
+  if ((iter == 10 || iter == 20) && iu > 1) {
     // exceptional shift, taken from http://www.netlib.org/eispack/comqr.f
-    return abs(numext::real(m_matT.coeff(iu,iu-1))) + abs(numext::real(m_matT.coeff(iu-1,iu-2)));
+    return abs(numext::real(m_matT.coeff(iu, iu - 1))) + abs(numext::real(m_matT.coeff(iu - 1, iu - 2)));
   }
 
   // compute the shift as one of the eigenvalues of t, the 2x2
   // diagonal block on the bottom of the active submatrix
-  Matrix<ComplexScalar,2,2> t = m_matT.template block<2,2>(iu-1,iu-1);
+  Matrix<ComplexScalar, 2, 2> t = m_matT.template block<2, 2>(iu - 1, iu - 1);
   RealScalar normt = t.cwiseAbs().sum();
-  t /= normt;     // the normalization by sf is to avoid under/overflow
+  t /= normt;  // the normalization by sf is to avoid under/overflow
 
-  ComplexScalar b = t.coeff(0,1) * t.coeff(1,0);
-  ComplexScalar c = t.coeff(0,0) - t.coeff(1,1);
-  ComplexScalar disc = sqrt(c*c + RealScalar(4)*b);
-  ComplexScalar det = t.coeff(0,0) * t.coeff(1,1) - b;
-  ComplexScalar trace = t.coeff(0,0) + t.coeff(1,1);
+  ComplexScalar b = t.coeff(0, 1) * t.coeff(1, 0);
+  ComplexScalar c = t.coeff(0, 0) - t.coeff(1, 1);
+  ComplexScalar disc = sqrt(c * c + RealScalar(4) * b);
+  ComplexScalar det = t.coeff(0, 0) * t.coeff(1, 1) - b;
+  ComplexScalar trace = t.coeff(0, 0) + t.coeff(1, 1);
   ComplexScalar eival1 = (trace + disc) / RealScalar(2);
   ComplexScalar eival2 = (trace - disc) / RealScalar(2);
-
-  if(numext::norm1(eival1) > numext::norm1(eival2))
+  RealScalar eival1_norm = numext::norm1(eival1);
+  RealScalar eival2_norm = numext::norm1(eival2);
+  // A division by zero can only occur if eival1==eival2==0.
+  // In this case, det==0, and all we have to do is checking that eival2_norm!=0
+  if (eival1_norm > eival2_norm)
     eival2 = det / eival1;
-  else
+  else if (!numext::is_exactly_zero(eival2_norm))
     eival1 = det / eival2;
 
   // choose the eigenvalue closest to the bottom entry of the diagonal
-  if(numext::norm1(eival1-t.coeff(1,1)) < numext::norm1(eival2-t.coeff(1,1)))
+  if (numext::norm1(eival1 - t.coeff(1, 1)) < numext::norm1(eival2 - t.coeff(1, 1)))
     return normt * eival1;
   else
     return normt * eival2;
 }
 
-
-template<typename MatrixType>
-ComplexSchur<MatrixType>& ComplexSchur<MatrixType>::compute(const MatrixType& matrix, bool computeU)
-{
+template <typename MatrixType>
+template <typename InputType>
+ComplexSchur<MatrixType>& ComplexSchur<MatrixType>::compute(const EigenBase<InputType>& matrix, bool computeU) {
   m_matUisUptodate = false;
   eigen_assert(matrix.cols() == matrix.rows());
 
-  if(matrix.cols() == 1)
-  {
-    m_matT = matrix.template cast<ComplexScalar>();
-    if(computeU)  m_matU = ComplexMatrixType::Identity(1,1);
+  if (matrix.cols() == 1) {
+    m_matT = matrix.derived().template cast<ComplexScalar>();
+    if (computeU) m_matU = ComplexMatrixType::Identity(1, 1);
     m_info = Success;
     m_isInitialized = true;
     m_matUisUptodate = computeU;
     return *this;
   }
 
-  internal::complex_schur_reduce_to_hessenberg<MatrixType, NumTraits<Scalar>::IsComplex>::run(*this, matrix, computeU);
+  internal::complex_schur_reduce_to_hessenberg<MatrixType, NumTraits<Scalar>::IsComplex>::run(*this, matrix.derived(),
+                                                                                              computeU);
   computeFromHessenberg(m_matT, m_matU, computeU);
   return *this;
 }
 
-template<typename MatrixType>
-template<typename HessMatrixType, typename OrthMatrixType>
-ComplexSchur<MatrixType>& ComplexSchur<MatrixType>::computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ, bool computeU)
-{
+template <typename MatrixType>
+template <typename HessMatrixType, typename OrthMatrixType>
+ComplexSchur<MatrixType>& ComplexSchur<MatrixType>::computeFromHessenberg(const HessMatrixType& matrixH,
+                                                                          const OrthMatrixType& matrixQ,
+                                                                          bool computeU) {
   m_matT = matrixH;
-  if(computeU)
-    m_matU = matrixQ;
+  if (computeU) m_matU = matrixQ;
   reduceToTriangularForm(computeU);
   return *this;
 }
 namespace internal {
 
 /* Reduce given matrix to Hessenberg form */
-template<typename MatrixType, bool IsComplex>
-struct complex_schur_reduce_to_hessenberg
-{
+template <typename MatrixType, bool IsComplex>
+struct complex_schur_reduce_to_hessenberg {
   // this is the implementation for the case IsComplex = true
-  static void run(ComplexSchur<MatrixType>& _this, const MatrixType& matrix, bool computeU)
-  {
+  static void run(ComplexSchur<MatrixType>& _this, const MatrixType& matrix, bool computeU) {
     _this.m_hess.compute(matrix);
     _this.m_matT = _this.m_hess.matrixH();
-    if(computeU)  _this.m_matU = _this.m_hess.matrixQ();
+    if (computeU) _this.m_matU = _this.m_hess.matrixQ();
   }
 };
 
-template<typename MatrixType>
-struct complex_schur_reduce_to_hessenberg<MatrixType, false>
-{
-  static void run(ComplexSchur<MatrixType>& _this, const MatrixType& matrix, bool computeU)
-  {
+template <typename MatrixType>
+struct complex_schur_reduce_to_hessenberg<MatrixType, false> {
+  static void run(ComplexSchur<MatrixType>& _this, const MatrixType& matrix, bool computeU) {
     typedef typename ComplexSchur<MatrixType>::ComplexScalar ComplexScalar;
 
     // Note: m_hess is over RealScalar; m_matT and m_matU is over ComplexScalar
     _this.m_hess.compute(matrix);
     _this.m_matT = _this.m_hess.matrixH().template cast<ComplexScalar>();
-    if(computeU)  
-    {
+    if (computeU) {
       // This may cause an allocation which seems to be avoidable
-      MatrixType Q = _this.m_hess.matrixQ(); 
+      MatrixType Q = _this.m_hess.matrixQ();
       _this.m_matU = Q.template cast<ComplexScalar>();
     }
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
 // Reduce the Hessenberg matrix m_matT to triangular form by QR iteration.
-template<typename MatrixType>
-void ComplexSchur<MatrixType>::reduceToTriangularForm(bool computeU)
-{  
+template <typename MatrixType>
+void ComplexSchur<MatrixType>::reduceToTriangularForm(bool computeU) {
   Index maxIters = m_maxIters;
-  if (maxIters == -1)
-    maxIters = m_maxIterationsPerRow * m_matT.rows();
+  if (maxIters == -1) maxIters = m_maxIterationsPerRow * m_matT.rows();
 
-  // The matrix m_matT is divided in three parts. 
-  // Rows 0,...,il-1 are decoupled from the rest because m_matT(il,il-1) is zero. 
+  // The matrix m_matT is divided in three parts.
+  // Rows 0,...,il-1 are decoupled from the rest because m_matT(il,il-1) is zero.
   // Rows il,...,iu is the part we are working on (the active submatrix).
   // Rows iu+1,...,end are already brought in triangular form.
   Index iu = m_matT.cols() - 1;
   Index il;
-  Index iter = 0; // number of iterations we are working on the (iu,iu) element
-  Index totalIter = 0; // number of iterations for whole matrix
+  Index iter = 0;       // number of iterations we are working on the (iu,iu) element
+  Index totalIter = 0;  // number of iterations for whole matrix
 
-  while(true)
-  {
+  while (true) {
     // find iu, the bottom row of the active submatrix
-    while(iu > 0)
-    {
-      if(!subdiagonalEntryIsNeglegible(iu-1)) break;
+    while (iu > 0) {
+      if (!subdiagonalEntryIsNeglegible(iu - 1)) break;
       iter = 0;
       --iu;
     }
 
     // if iu is zero then we are done; the whole matrix is triangularized
-    if(iu==0) break;
+    if (iu == 0) break;
 
     // if we spent too many iterations, we give up
     iter++;
     totalIter++;
-    if(totalIter > maxIters) break;
+    if (totalIter > maxIters) break;
 
     // find il, the top row of the active submatrix
-    il = iu-1;
-    while(il > 0 && !subdiagonalEntryIsNeglegible(il-1))
-    {
+    il = iu - 1;
+    while (il > 0 && !subdiagonalEntryIsNeglegible(il - 1)) {
       --il;
     }
 
@@ -427,22 +414,21 @@ void ComplexSchur<MatrixType>::reduceToTriangularForm(bool computeU)
 
     ComplexScalar shift = computeShift(iu, iter);
     JacobiRotation<ComplexScalar> rot;
-    rot.makeGivens(m_matT.coeff(il,il) - shift, m_matT.coeff(il+1,il));
-    m_matT.rightCols(m_matT.cols()-il).applyOnTheLeft(il, il+1, rot.adjoint());
-    m_matT.topRows((std::min)(il+2,iu)+1).applyOnTheRight(il, il+1, rot);
-    if(computeU) m_matU.applyOnTheRight(il, il+1, rot);
-
-    for(Index i=il+1 ; i<iu ; i++)
-    {
-      rot.makeGivens(m_matT.coeffRef(i,i-1), m_matT.coeffRef(i+1,i-1), &m_matT.coeffRef(i,i-1));
-      m_matT.coeffRef(i+1,i-1) = ComplexScalar(0);
-      m_matT.rightCols(m_matT.cols()-i).applyOnTheLeft(i, i+1, rot.adjoint());
-      m_matT.topRows((std::min)(i+2,iu)+1).applyOnTheRight(i, i+1, rot);
-      if(computeU) m_matU.applyOnTheRight(i, i+1, rot);
+    rot.makeGivens(m_matT.coeff(il, il) - shift, m_matT.coeff(il + 1, il));
+    m_matT.rightCols(m_matT.cols() - il).applyOnTheLeft(il, il + 1, rot.adjoint());
+    m_matT.topRows((std::min)(il + 2, iu) + 1).applyOnTheRight(il, il + 1, rot);
+    if (computeU) m_matU.applyOnTheRight(il, il + 1, rot);
+
+    for (Index i = il + 1; i < iu; i++) {
+      rot.makeGivens(m_matT.coeffRef(i, i - 1), m_matT.coeffRef(i + 1, i - 1), &m_matT.coeffRef(i, i - 1));
+      m_matT.coeffRef(i + 1, i - 1) = ComplexScalar(0);
+      m_matT.rightCols(m_matT.cols() - i).applyOnTheLeft(i, i + 1, rot.adjoint());
+      m_matT.topRows((std::min)(i + 2, iu) + 1).applyOnTheRight(i, i + 1, rot);
+      if (computeU) m_matU.applyOnTheRight(i, i + 1, rot);
     }
   }
 
-  if(totalIter <= maxIters)
+  if (totalIter <= maxIters)
     m_info = Success;
   else
     m_info = NoConvergence;
@@ -451,6 +437,6 @@ void ComplexSchur<MatrixType>::reduceToTriangularForm(bool computeU)
   m_matUisUptodate = computeU;
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_COMPLEX_SCHUR_H
+#endif  // EIGEN_COMPLEX_SCHUR_H
diff --git a/inst/include/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h b/inst/include/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h
new file mode 100644
index 00000000..5f66fd92
--- /dev/null
+++ b/inst/include/Eigen/src/Eigenvalues/ComplexSchur_LAPACKE.h
@@ -0,0 +1,95 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to LAPACKe
+ *    Complex Schur needed to complex unsymmetrical eigenvalues/eigenvectors.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_COMPLEX_SCHUR_LAPACKE_H
+#define EIGEN_COMPLEX_SCHUR_LAPACKE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \internal Specialization for the data types supported by LAPACKe */
+
+#define EIGEN_LAPACKE_SCHUR_COMPLEX(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX, LAPACKE_PREFIX_U, EIGCOLROW,            \
+                                    LAPACKE_COLROW)                                                                \
+  template <>                                                                                                      \
+  template <typename InputType>                                                                                    \
+  inline ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >&                                              \
+  ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix,         \
+                                                                       bool computeU) {                            \
+    typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> MatrixType;                                               \
+    typedef MatrixType::RealScalar RealScalar;                                                                     \
+    typedef std::complex<RealScalar> ComplexScalar;                                                                \
+                                                                                                                   \
+    eigen_assert(matrix.cols() == matrix.rows());                                                                  \
+                                                                                                                   \
+    m_matUisUptodate = false;                                                                                      \
+    if (matrix.cols() == 1) {                                                                                      \
+      m_matT = matrix.derived().template cast<ComplexScalar>();                                                    \
+      if (computeU) m_matU = ComplexMatrixType::Identity(1, 1);                                                    \
+      m_info = Success;                                                                                            \
+      m_isInitialized = true;                                                                                      \
+      m_matUisUptodate = computeU;                                                                                 \
+      return *this;                                                                                                \
+    }                                                                                                              \
+    lapack_int n = internal::convert_index<lapack_int>(matrix.cols()), sdim, info;                                 \
+    lapack_int matrix_order = LAPACKE_COLROW;                                                                      \
+    char jobvs, sort = 'N';                                                                                        \
+    LAPACK_##LAPACKE_PREFIX_U##_SELECT1 select = 0;                                                                \
+    jobvs = (computeU) ? 'V' : 'N';                                                                                \
+    m_matU.resize(n, n);                                                                                           \
+    lapack_int ldvs = internal::convert_index<lapack_int>(m_matU.outerStride());                                   \
+    m_matT = matrix;                                                                                               \
+    lapack_int lda = internal::convert_index<lapack_int>(m_matT.outerStride());                                    \
+    Matrix<EIGTYPE, Dynamic, Dynamic> w;                                                                           \
+    w.resize(n, 1);                                                                                                \
+    info = LAPACKE_##LAPACKE_PREFIX##gees(matrix_order, jobvs, sort, select, n, (LAPACKE_TYPE*)m_matT.data(), lda, \
+                                          &sdim, (LAPACKE_TYPE*)w.data(), (LAPACKE_TYPE*)m_matU.data(), ldvs);     \
+    if (info == 0)                                                                                                 \
+      m_info = Success;                                                                                            \
+    else                                                                                                           \
+      m_info = NoConvergence;                                                                                      \
+                                                                                                                   \
+    m_isInitialized = true;                                                                                        \
+    m_matUisUptodate = computeU;                                                                                   \
+    return *this;                                                                                                  \
+  }
+
+EIGEN_LAPACKE_SCHUR_COMPLEX(dcomplex, lapack_complex_double, z, Z, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_COMPLEX(scomplex, lapack_complex_float, c, C, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_COMPLEX(dcomplex, lapack_complex_double, z, Z, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SCHUR_COMPLEX(scomplex, lapack_complex_float, c, C, RowMajor, LAPACK_ROW_MAJOR)
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_COMPLEX_SCHUR_LAPACKE_H
diff --git a/inst/include/Eigen/src/Eigenvalues/ComplexSchur_MKL.h b/inst/include/Eigen/src/Eigenvalues/ComplexSchur_MKL.h
deleted file mode 100644
index 91496ae5..00000000
--- a/inst/include/Eigen/src/Eigenvalues/ComplexSchur_MKL.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- Copyright (c) 2011, Intel Corporation. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
- * Neither the name of Intel Corporation nor the names of its contributors may
-   be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
- *    Complex Schur needed to complex unsymmetrical eigenvalues/eigenvectors.
- ********************************************************************************
-*/
-
-#ifndef EIGEN_COMPLEX_SCHUR_MKL_H
-#define EIGEN_COMPLEX_SCHUR_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
-
-namespace Eigen { 
-
-/** \internal Specialization for the data types supported by MKL */
-
-#define EIGEN_MKL_SCHUR_COMPLEX(EIGTYPE, MKLTYPE, MKLPREFIX, MKLPREFIX_U, EIGCOLROW, MKLCOLROW) \
-template<> inline \
-ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
-ComplexSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW>& matrix, bool computeU) \
-{ \
-  typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> MatrixType; \
-  typedef MatrixType::Scalar Scalar; \
-  typedef MatrixType::RealScalar RealScalar; \
-  typedef std::complex<RealScalar> ComplexScalar; \
-\
-  eigen_assert(matrix.cols() == matrix.rows()); \
-\
-  m_matUisUptodate = false; \
-  if(matrix.cols() == 1) \
-  { \
-    m_matT = matrix.cast<ComplexScalar>(); \
-    if(computeU)  m_matU = ComplexMatrixType::Identity(1,1); \
-      m_info = Success; \
-      m_isInitialized = true; \
-      m_matUisUptodate = computeU; \
-      return *this; \
-  } \
-  lapack_int n = matrix.cols(), sdim, info; \
-  lapack_int lda = matrix.outerStride(); \
-  lapack_int matrix_order = MKLCOLROW; \
-  char jobvs, sort='N'; \
-  LAPACK_##MKLPREFIX_U##_SELECT1 select = 0; \
-  jobvs = (computeU) ? 'V' : 'N'; \
-  m_matU.resize(n, n); \
-  lapack_int ldvs  = m_matU.outerStride(); \
-  m_matT = matrix; \
-  Matrix<EIGTYPE, Dynamic, Dynamic> w; \
-  w.resize(n, 1);\
-  info = LAPACKE_##MKLPREFIX##gees( matrix_order, jobvs, sort, select, n, (MKLTYPE*)m_matT.data(), lda, &sdim, (MKLTYPE*)w.data(), (MKLTYPE*)m_matU.data(), ldvs ); \
-  if(info == 0) \
-    m_info = Success; \
-  else \
-    m_info = NoConvergence; \
-\
-  m_isInitialized = true; \
-  m_matUisUptodate = computeU; \
-  return *this; \
-\
-}
-
-EIGEN_MKL_SCHUR_COMPLEX(dcomplex, MKL_Complex16, z, Z, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SCHUR_COMPLEX(scomplex, MKL_Complex8,  c, C, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SCHUR_COMPLEX(dcomplex, MKL_Complex16, z, Z, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SCHUR_COMPLEX(scomplex, MKL_Complex8,  c, C, RowMajor, LAPACK_ROW_MAJOR)
-
-} // end namespace Eigen
-
-#endif // EIGEN_COMPLEX_SCHUR_MKL_H
diff --git a/inst/include/Eigen/src/Eigenvalues/EigenSolver.h b/inst/include/Eigen/src/Eigenvalues/EigenSolver.h
index 20c59a7a..9dba7bd1 100644
--- a/inst/include/Eigen/src/Eigenvalues/EigenSolver.h
+++ b/inst/include/Eigen/src/Eigenvalues/EigenSolver.h
@@ -13,402 +13,422 @@
 
 #include "./RealSchur.h"
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \eigenvalues_module \ingroup Eigenvalues_Module
-  *
-  *
-  * \class EigenSolver
-  *
-  * \brief Computes eigenvalues and eigenvectors of general matrices
-  *
-  * \tparam _MatrixType the type of the matrix of which we are computing the
-  * eigendecomposition; this is expected to be an instantiation of the Matrix
-  * class template. Currently, only real matrices are supported.
-  *
-  * The eigenvalues and eigenvectors of a matrix \f$ A \f$ are scalars
-  * \f$ \lambda \f$ and vectors \f$ v \f$ such that \f$ Av = \lambda v \f$.  If
-  * \f$ D \f$ is a diagonal matrix with the eigenvalues on the diagonal, and
-  * \f$ V \f$ is a matrix with the eigenvectors as its columns, then \f$ A V =
-  * V D \f$. The matrix \f$ V \f$ is almost always invertible, in which case we
-  * have \f$ A = V D V^{-1} \f$. This is called the eigendecomposition.
-  *
-  * The eigenvalues and eigenvectors of a matrix may be complex, even when the
-  * matrix is real. However, we can choose real matrices \f$ V \f$ and \f$ D
-  * \f$ satisfying \f$ A V = V D \f$, just like the eigendecomposition, if the
-  * matrix \f$ D \f$ is not required to be diagonal, but if it is allowed to
-  * have blocks of the form
-  * \f[ \begin{bmatrix} u & v \\ -v & u \end{bmatrix} \f]
-  * (where \f$ u \f$ and \f$ v \f$ are real numbers) on the diagonal.  These
-  * blocks correspond to complex eigenvalue pairs \f$ u \pm iv \f$. We call
-  * this variant of the eigendecomposition the pseudo-eigendecomposition.
-  *
-  * Call the function compute() to compute the eigenvalues and eigenvectors of
-  * a given matrix. Alternatively, you can use the 
-  * EigenSolver(const MatrixType&, bool) constructor which computes the
-  * eigenvalues and eigenvectors at construction time. Once the eigenvalue and
-  * eigenvectors are computed, they can be retrieved with the eigenvalues() and
-  * eigenvectors() functions. The pseudoEigenvalueMatrix() and
-  * pseudoEigenvectors() methods allow the construction of the
-  * pseudo-eigendecomposition.
-  *
-  * The documentation for EigenSolver(const MatrixType&, bool) contains an
-  * example of the typical use of this class.
-  *
-  * \note The implementation is adapted from
-  * <a href="http://math.nist.gov/javanumerics/jama/">JAMA</a> (public domain).
-  * Their code is based on EISPACK.
-  *
-  * \sa MatrixBase::eigenvalues(), class ComplexEigenSolver, class SelfAdjointEigenSolver
-  */
-template<typename _MatrixType> class EigenSolver
-{
-  public:
-
-    /** \brief Synonym for the template parameter \p _MatrixType. */
-    typedef _MatrixType MatrixType;
-
-    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
-      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
-    };
-
-    /** \brief Scalar type for matrices of type #MatrixType. */
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
-
-    /** \brief Complex scalar type for #MatrixType. 
-      *
-      * This is \c std::complex<Scalar> if #Scalar is real (e.g.,
-      * \c float or \c double) and just \c Scalar if #Scalar is
-      * complex.
-      */
-    typedef std::complex<RealScalar> ComplexScalar;
-
-    /** \brief Type for vector of eigenvalues as returned by eigenvalues(). 
-      *
-      * This is a column vector with entries of type #ComplexScalar.
-      * The length of the vector is the size of #MatrixType.
-      */
-    typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> EigenvalueType;
-
-    /** \brief Type for matrix of eigenvectors as returned by eigenvectors(). 
-      *
-      * This is a square matrix with entries of type #ComplexScalar. 
-      * The size is the same as the size of #MatrixType.
-      */
-    typedef Matrix<ComplexScalar, RowsAtCompileTime, ColsAtCompileTime, Options, MaxRowsAtCompileTime, MaxColsAtCompileTime> EigenvectorsType;
-
-    /** \brief Default constructor.
-      *
-      * The default constructor is useful in cases in which the user intends to
-      * perform decompositions via EigenSolver::compute(const MatrixType&, bool).
-      *
-      * \sa compute() for an example.
-      */
- EigenSolver() : m_eivec(), m_eivalues(), m_isInitialized(false), m_realSchur(), m_matT(), m_tmp() {}
-
-    /** \brief Default constructor with memory preallocation
-      *
-      * Like the default constructor but with preallocation of the internal data
-      * according to the specified problem \a size.
-      * \sa EigenSolver()
-      */
-    EigenSolver(Index size)
+ *
+ *
+ * \class EigenSolver
+ *
+ * \brief Computes eigenvalues and eigenvectors of general matrices
+ *
+ * \tparam MatrixType_ the type of the matrix of which we are computing the
+ * eigendecomposition; this is expected to be an instantiation of the Matrix
+ * class template. Currently, only real matrices are supported.
+ *
+ * The eigenvalues and eigenvectors of a matrix \f$ A \f$ are scalars
+ * \f$ \lambda \f$ and vectors \f$ v \f$ such that \f$ Av = \lambda v \f$.  If
+ * \f$ D \f$ is a diagonal matrix with the eigenvalues on the diagonal, and
+ * \f$ V \f$ is a matrix with the eigenvectors as its columns, then \f$ A V =
+ * V D \f$. The matrix \f$ V \f$ is almost always invertible, in which case we
+ * have \f$ A = V D V^{-1} \f$. This is called the eigendecomposition.
+ *
+ * The eigenvalues and eigenvectors of a matrix may be complex, even when the
+ * matrix is real. However, we can choose real matrices \f$ V \f$ and \f$ D
+ * \f$ satisfying \f$ A V = V D \f$, just like the eigendecomposition, if the
+ * matrix \f$ D \f$ is not required to be diagonal, but if it is allowed to
+ * have blocks of the form
+ * \f[ \begin{bmatrix} u & v \\ -v & u \end{bmatrix} \f]
+ * (where \f$ u \f$ and \f$ v \f$ are real numbers) on the diagonal.  These
+ * blocks correspond to complex eigenvalue pairs \f$ u \pm iv \f$. We call
+ * this variant of the eigendecomposition the pseudo-eigendecomposition.
+ *
+ * Call the function compute() to compute the eigenvalues and eigenvectors of
+ * a given matrix. Alternatively, you can use the
+ * EigenSolver(const MatrixType&, bool) constructor which computes the
+ * eigenvalues and eigenvectors at construction time. Once the eigenvalue and
+ * eigenvectors are computed, they can be retrieved with the eigenvalues() and
+ * eigenvectors() functions. The pseudoEigenvalueMatrix() and
+ * pseudoEigenvectors() methods allow the construction of the
+ * pseudo-eigendecomposition.
+ *
+ * The documentation for EigenSolver(const MatrixType&, bool) contains an
+ * example of the typical use of this class.
+ *
+ * \note The implementation is adapted from
+ * <a href="http://math.nist.gov/javanumerics/jama/">JAMA</a> (public domain).
+ * Their code is based on EISPACK.
+ *
+ * \sa MatrixBase::eigenvalues(), class ComplexEigenSolver, class SelfAdjointEigenSolver
+ */
+template <typename MatrixType_>
+class EigenSolver {
+ public:
+  /** \brief Synonym for the template parameter \p MatrixType_. */
+  typedef MatrixType_ MatrixType;
+
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    Options = internal::traits<MatrixType>::Options,
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+
+  /** \brief Scalar type for matrices of type #MatrixType. */
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
+
+  /** \brief Complex scalar type for #MatrixType.
+   *
+   * This is \c std::complex<Scalar> if #Scalar is real (e.g.,
+   * \c float or \c double) and just \c Scalar if #Scalar is
+   * complex.
+   */
+  typedef internal::make_complex_t<Scalar> ComplexScalar;
+
+  /** \brief Type for vector of eigenvalues as returned by eigenvalues().
+   *
+   * This is a column vector with entries of type #ComplexScalar.
+   * The length of the vector is the size of #MatrixType.
+   */
+  typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> EigenvalueType;
+
+  /** \brief Type for matrix of eigenvectors as returned by eigenvectors().
+   *
+   * This is a square matrix with entries of type #ComplexScalar.
+   * The size is the same as the size of #MatrixType.
+   */
+  typedef Matrix<ComplexScalar, RowsAtCompileTime, ColsAtCompileTime, Options, MaxRowsAtCompileTime,
+                 MaxColsAtCompileTime>
+      EigenvectorsType;
+
+  /** \brief Default constructor.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via EigenSolver::compute(const MatrixType&, bool).
+   *
+   * \sa compute() for an example.
+   */
+  EigenSolver()
+      : m_eivec(), m_eivalues(), m_isInitialized(false), m_eigenvectorsOk(false), m_realSchur(), m_matT(), m_tmp() {}
+
+  /** \brief Default constructor with memory preallocation
+   *
+   * Like the default constructor but with preallocation of the internal data
+   * according to the specified problem \a size.
+   * \sa EigenSolver()
+   */
+  explicit EigenSolver(Index size)
       : m_eivec(size, size),
         m_eivalues(size),
         m_isInitialized(false),
         m_eigenvectorsOk(false),
         m_realSchur(size),
-        m_matT(size, size), 
-        m_tmp(size)
-    {}
-
-    /** \brief Constructor; computes eigendecomposition of given matrix. 
-      * 
-      * \param[in]  matrix  Square matrix whose eigendecomposition is to be computed.
-      * \param[in]  computeEigenvectors  If true, both the eigenvectors and the
-      *    eigenvalues are computed; if false, only the eigenvalues are
-      *    computed. 
-      *
-      * This constructor calls compute() to compute the eigenvalues
-      * and eigenvectors.
-      *
-      * Example: \include EigenSolver_EigenSolver_MatrixType.cpp
-      * Output: \verbinclude EigenSolver_EigenSolver_MatrixType.out
-      *
-      * \sa compute()
-      */
-    EigenSolver(const MatrixType& matrix, bool computeEigenvectors = true)
+        m_matT(size, size),
+        m_tmp(size) {}
+
+  /** \brief Constructor; computes eigendecomposition of given matrix.
+   *
+   * \param[in]  matrix  Square matrix whose eigendecomposition is to be computed.
+   * \param[in]  computeEigenvectors  If true, both the eigenvectors and the
+   *    eigenvalues are computed; if false, only the eigenvalues are
+   *    computed.
+   *
+   * This constructor calls compute() to compute the eigenvalues
+   * and eigenvectors.
+   *
+   * Example: \include EigenSolver_EigenSolver_MatrixType.cpp
+   * Output: \verbinclude EigenSolver_EigenSolver_MatrixType.out
+   *
+   * \sa compute()
+   */
+  template <typename InputType>
+  explicit EigenSolver(const EigenBase<InputType>& matrix, bool computeEigenvectors = true)
       : m_eivec(matrix.rows(), matrix.cols()),
         m_eivalues(matrix.cols()),
         m_isInitialized(false),
         m_eigenvectorsOk(false),
         m_realSchur(matrix.cols()),
-        m_matT(matrix.rows(), matrix.cols()), 
-        m_tmp(matrix.cols())
-    {
-      compute(matrix, computeEigenvectors);
-    }
+        m_matT(matrix.rows(), matrix.cols()),
+        m_tmp(matrix.cols()) {
+    compute(matrix.derived(), computeEigenvectors);
+  }
 
-    /** \brief Returns the eigenvectors of given matrix. 
-      *
-      * \returns  %Matrix whose columns are the (possibly complex) eigenvectors.
-      *
-      * \pre Either the constructor 
-      * EigenSolver(const MatrixType&,bool) or the member function
-      * compute(const MatrixType&, bool) has been called before, and
-      * \p computeEigenvectors was set to true (the default).
-      *
-      * Column \f$ k \f$ of the returned matrix is an eigenvector corresponding
-      * to eigenvalue number \f$ k \f$ as returned by eigenvalues().  The
-      * eigenvectors are normalized to have (Euclidean) norm equal to one. The
-      * matrix returned by this function is the matrix \f$ V \f$ in the
-      * eigendecomposition \f$ A = V D V^{-1} \f$, if it exists.
-      *
-      * Example: \include EigenSolver_eigenvectors.cpp
-      * Output: \verbinclude EigenSolver_eigenvectors.out
-      *
-      * \sa eigenvalues(), pseudoEigenvectors()
-      */
-    EigenvectorsType eigenvectors() const;
-
-    /** \brief Returns the pseudo-eigenvectors of given matrix. 
-      *
-      * \returns  Const reference to matrix whose columns are the pseudo-eigenvectors.
-      *
-      * \pre Either the constructor 
-      * EigenSolver(const MatrixType&,bool) or the member function
-      * compute(const MatrixType&, bool) has been called before, and
-      * \p computeEigenvectors was set to true (the default).
-      *
-      * The real matrix \f$ V \f$ returned by this function and the
-      * block-diagonal matrix \f$ D \f$ returned by pseudoEigenvalueMatrix()
-      * satisfy \f$ AV = VD \f$.
-      *
-      * Example: \include EigenSolver_pseudoEigenvectors.cpp
-      * Output: \verbinclude EigenSolver_pseudoEigenvectors.out
-      *
-      * \sa pseudoEigenvalueMatrix(), eigenvectors()
-      */
-    const MatrixType& pseudoEigenvectors() const
-    {
-      eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
-      eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
-      return m_eivec;
-    }
+  /** \brief Returns the eigenvectors of given matrix.
+   *
+   * \returns  %Matrix whose columns are the (possibly complex) eigenvectors.
+   *
+   * \pre Either the constructor
+   * EigenSolver(const MatrixType&,bool) or the member function
+   * compute(const MatrixType&, bool) has been called before, and
+   * \p computeEigenvectors was set to true (the default).
+   *
+   * Column \f$ k \f$ of the returned matrix is an eigenvector corresponding
+   * to eigenvalue number \f$ k \f$ as returned by eigenvalues().  The
+   * eigenvectors are normalized to have (Euclidean) norm equal to one. The
+   * matrix returned by this function is the matrix \f$ V \f$ in the
+   * eigendecomposition \f$ A = V D V^{-1} \f$, if it exists.
+   *
+   * Example: \include EigenSolver_eigenvectors.cpp
+   * Output: \verbinclude EigenSolver_eigenvectors.out
+   *
+   * \sa eigenvalues(), pseudoEigenvectors()
+   */
+  EigenvectorsType eigenvectors() const;
+
+  /** \brief Returns the pseudo-eigenvectors of given matrix.
+   *
+   * \returns  Const reference to matrix whose columns are the pseudo-eigenvectors.
+   *
+   * \pre Either the constructor
+   * EigenSolver(const MatrixType&,bool) or the member function
+   * compute(const MatrixType&, bool) has been called before, and
+   * \p computeEigenvectors was set to true (the default).
+   *
+   * The real matrix \f$ V \f$ returned by this function and the
+   * block-diagonal matrix \f$ D \f$ returned by pseudoEigenvalueMatrix()
+   * satisfy \f$ AV = VD \f$.
+   *
+   * Example: \include EigenSolver_pseudoEigenvectors.cpp
+   * Output: \verbinclude EigenSolver_pseudoEigenvectors.out
+   *
+   * \sa pseudoEigenvalueMatrix(), eigenvectors()
+   */
+  const MatrixType& pseudoEigenvectors() const {
+    eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
+    eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+    return m_eivec;
+  }
 
-    /** \brief Returns the block-diagonal matrix in the pseudo-eigendecomposition.
-      *
-      * \returns  A block-diagonal matrix.
-      *
-      * \pre Either the constructor 
-      * EigenSolver(const MatrixType&,bool) or the member function
-      * compute(const MatrixType&, bool) has been called before.
-      *
-      * The matrix \f$ D \f$ returned by this function is real and
-      * block-diagonal. The blocks on the diagonal are either 1-by-1 or 2-by-2
-      * blocks of the form
-      * \f$ \begin{bmatrix} u & v \\ -v & u \end{bmatrix} \f$.
-      * These blocks are not sorted in any particular order.
-      * The matrix \f$ D \f$ and the matrix \f$ V \f$ returned by
-      * pseudoEigenvectors() satisfy \f$ AV = VD \f$.
-      *
-      * \sa pseudoEigenvectors() for an example, eigenvalues()
-      */
-    MatrixType pseudoEigenvalueMatrix() const;
-
-    /** \brief Returns the eigenvalues of given matrix. 
-      *
-      * \returns A const reference to the column vector containing the eigenvalues.
-      *
-      * \pre Either the constructor 
-      * EigenSolver(const MatrixType&,bool) or the member function
-      * compute(const MatrixType&, bool) has been called before.
-      *
-      * The eigenvalues are repeated according to their algebraic multiplicity,
-      * so there are as many eigenvalues as rows in the matrix. The eigenvalues 
-      * are not sorted in any particular order.
-      *
-      * Example: \include EigenSolver_eigenvalues.cpp
-      * Output: \verbinclude EigenSolver_eigenvalues.out
-      *
-      * \sa eigenvectors(), pseudoEigenvalueMatrix(),
-      *     MatrixBase::eigenvalues()
-      */
-    const EigenvalueType& eigenvalues() const
-    {
-      eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
-      return m_eivalues;
-    }
+  /** \brief Returns the block-diagonal matrix in the pseudo-eigendecomposition.
+   *
+   * \returns  A block-diagonal matrix.
+   *
+   * \pre Either the constructor
+   * EigenSolver(const MatrixType&,bool) or the member function
+   * compute(const MatrixType&, bool) has been called before.
+   *
+   * The matrix \f$ D \f$ returned by this function is real and
+   * block-diagonal. The blocks on the diagonal are either 1-by-1 or 2-by-2
+   * blocks of the form
+   * \f$ \begin{bmatrix} u & v \\ -v & u \end{bmatrix} \f$.
+   * These blocks are not sorted in any particular order.
+   * The matrix \f$ D \f$ and the matrix \f$ V \f$ returned by
+   * pseudoEigenvectors() satisfy \f$ AV = VD \f$.
+   *
+   * \sa pseudoEigenvectors() for an example, eigenvalues()
+   */
+  MatrixType pseudoEigenvalueMatrix() const;
+
+  /** \brief Returns the eigenvalues of given matrix.
+   *
+   * \returns A const reference to the column vector containing the eigenvalues.
+   *
+   * \pre Either the constructor
+   * EigenSolver(const MatrixType&,bool) or the member function
+   * compute(const MatrixType&, bool) has been called before.
+   *
+   * The eigenvalues are repeated according to their algebraic multiplicity,
+   * so there are as many eigenvalues as rows in the matrix. The eigenvalues
+   * are not sorted in any particular order.
+   *
+   * Example: \include EigenSolver_eigenvalues.cpp
+   * Output: \verbinclude EigenSolver_eigenvalues.out
+   *
+   * \sa eigenvectors(), pseudoEigenvalueMatrix(),
+   *     MatrixBase::eigenvalues()
+   */
+  const EigenvalueType& eigenvalues() const {
+    eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
+    return m_eivalues;
+  }
 
-    /** \brief Computes eigendecomposition of given matrix. 
-      * 
-      * \param[in]  matrix  Square matrix whose eigendecomposition is to be computed.
-      * \param[in]  computeEigenvectors  If true, both the eigenvectors and the
-      *    eigenvalues are computed; if false, only the eigenvalues are
-      *    computed. 
-      * \returns    Reference to \c *this
-      *
-      * This function computes the eigenvalues of the real matrix \p matrix.
-      * The eigenvalues() function can be used to retrieve them.  If 
-      * \p computeEigenvectors is true, then the eigenvectors are also computed
-      * and can be retrieved by calling eigenvectors().
-      *
-      * The matrix is first reduced to real Schur form using the RealSchur
-      * class. The Schur decomposition is then used to compute the eigenvalues
-      * and eigenvectors.
-      *
-      * The cost of the computation is dominated by the cost of the
-      * Schur decomposition, which is very approximately \f$ 25n^3 \f$
-      * (where \f$ n \f$ is the size of the matrix) if \p computeEigenvectors 
-      * is true, and \f$ 10n^3 \f$ if \p computeEigenvectors is false.
-      *
-      * This method reuses of the allocated data in the EigenSolver object.
-      *
-      * Example: \include EigenSolver_compute.cpp
-      * Output: \verbinclude EigenSolver_compute.out
-      */
-    EigenSolver& compute(const MatrixType& matrix, bool computeEigenvectors = true);
-
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
-      return m_realSchur.info();
-    }
+  /** \brief Computes eigendecomposition of given matrix.
+   *
+   * \param[in]  matrix  Square matrix whose eigendecomposition is to be computed.
+   * \param[in]  computeEigenvectors  If true, both the eigenvectors and the
+   *    eigenvalues are computed; if false, only the eigenvalues are
+   *    computed.
+   * \returns    Reference to \c *this
+   *
+   * This function computes the eigenvalues of the real matrix \p matrix.
+   * The eigenvalues() function can be used to retrieve them.  If
+   * \p computeEigenvectors is true, then the eigenvectors are also computed
+   * and can be retrieved by calling eigenvectors().
+   *
+   * The matrix is first reduced to real Schur form using the RealSchur
+   * class. The Schur decomposition is then used to compute the eigenvalues
+   * and eigenvectors.
+   *
+   * The cost of the computation is dominated by the cost of the
+   * Schur decomposition, which is very approximately \f$ 25n^3 \f$
+   * (where \f$ n \f$ is the size of the matrix) if \p computeEigenvectors
+   * is true, and \f$ 10n^3 \f$ if \p computeEigenvectors is false.
+   *
+   * This method reuses of the allocated data in the EigenSolver object.
+   *
+   * Example: \include EigenSolver_compute.cpp
+   * Output: \verbinclude EigenSolver_compute.out
+   */
+  template <typename InputType>
+  EigenSolver& compute(const EigenBase<InputType>& matrix, bool computeEigenvectors = true);
+
+  /** \returns NumericalIssue if the input contains INF or NaN values or overflow occurred. Returns Success otherwise.
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
+    return m_info;
+  }
 
-    /** \brief Sets the maximum number of iterations allowed. */
-    EigenSolver& setMaxIterations(Index maxIters)
-    {
-      m_realSchur.setMaxIterations(maxIters);
-      return *this;
-    }
+  /** \brief Sets the maximum number of iterations allowed. */
+  EigenSolver& setMaxIterations(Index maxIters) {
+    m_realSchur.setMaxIterations(maxIters);
+    return *this;
+  }
 
-    /** \brief Returns the maximum number of iterations. */
-    Index getMaxIterations()
-    {
-      return m_realSchur.getMaxIterations();
-    }
+  /** \brief Returns the maximum number of iterations. */
+  Index getMaxIterations() { return m_realSchur.getMaxIterations(); }
 
-  private:
-    void doComputeEigenvectors();
+ private:
+  void doComputeEigenvectors();
 
-  protected:
-    
-    static void check_template_parameters()
-    {
-      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
-      EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL);
-    }
-    
-    MatrixType m_eivec;
-    EigenvalueType m_eivalues;
-    bool m_isInitialized;
-    bool m_eigenvectorsOk;
-    RealSchur<MatrixType> m_realSchur;
-    MatrixType m_matT;
-
-    typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
-    ColumnVectorType m_tmp;
+ protected:
+  static void check_template_parameters() {
+    EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+    EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL);
+  }
+
+  MatrixType m_eivec;
+  EigenvalueType m_eivalues;
+  bool m_isInitialized;
+  bool m_eigenvectorsOk;
+  ComputationInfo m_info;
+  RealSchur<MatrixType> m_realSchur;
+  MatrixType m_matT;
+
+  typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
+  ColumnVectorType m_tmp;
 };
 
-template<typename MatrixType>
-MatrixType EigenSolver<MatrixType>::pseudoEigenvalueMatrix() const
-{
+template <typename MatrixType>
+MatrixType EigenSolver<MatrixType>::pseudoEigenvalueMatrix() const {
   eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
-  Index n = m_eivalues.rows();
-  MatrixType matD = MatrixType::Zero(n,n);
-  for (Index i=0; i<n; ++i)
-  {
-    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(i)), numext::real(m_eivalues.coeff(i))))
-      matD.coeffRef(i,i) = numext::real(m_eivalues.coeff(i));
-    else
-    {
-      matD.template block<2,2>(i,i) <<  numext::real(m_eivalues.coeff(i)), numext::imag(m_eivalues.coeff(i)),
-                                       -numext::imag(m_eivalues.coeff(i)), numext::real(m_eivalues.coeff(i));
+  const RealScalar precision = RealScalar(2) * NumTraits<RealScalar>::epsilon();
+  const Index n = m_eivalues.rows();
+  MatrixType matD = MatrixType::Zero(n, n);
+  Index i = 0;
+  for (; i < n - 1; ++i) {
+    RealScalar real = numext::real(m_eivalues.coeff(i));
+    RealScalar imag = numext::imag(m_eivalues.coeff(i));
+    matD.coeffRef(i, i) = real;
+    if (!internal::isMuchSmallerThan(imag, real, precision)) {
+      matD.coeffRef(i, i + 1) = imag;
+      matD.coeffRef(i + 1, i) = -imag;
+      matD.coeffRef(i + 1, i + 1) = real;
       ++i;
     }
   }
+  if (i == n - 1) {
+    matD.coeffRef(i, i) = numext::real(m_eivalues.coeff(i));
+  }
+
   return matD;
 }
 
-template<typename MatrixType>
-typename EigenSolver<MatrixType>::EigenvectorsType EigenSolver<MatrixType>::eigenvectors() const
-{
+template <typename MatrixType>
+typename EigenSolver<MatrixType>::EigenvectorsType EigenSolver<MatrixType>::eigenvectors() const {
   eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
   eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+  const RealScalar precision = RealScalar(2) * NumTraits<RealScalar>::epsilon();
   Index n = m_eivec.cols();
-  EigenvectorsType matV(n,n);
-  for (Index j=0; j<n; ++j)
-  {
-    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(j)), numext::real(m_eivalues.coeff(j))) || j+1==n)
-    {
+  EigenvectorsType matV(n, n);
+  for (Index j = 0; j < n; ++j) {
+    if (internal::isMuchSmallerThan(numext::imag(m_eivalues.coeff(j)), numext::real(m_eivalues.coeff(j)), precision) ||
+        j + 1 == n) {
       // we have a real eigen value
       matV.col(j) = m_eivec.col(j).template cast<ComplexScalar>();
       matV.col(j).normalize();
-    }
-    else
-    {
+    } else {
       // we have a pair of complex eigen values
-      for (Index i=0; i<n; ++i)
-      {
-        matV.coeffRef(i,j)   = ComplexScalar(m_eivec.coeff(i,j),  m_eivec.coeff(i,j+1));
-        matV.coeffRef(i,j+1) = ComplexScalar(m_eivec.coeff(i,j), -m_eivec.coeff(i,j+1));
+      for (Index i = 0; i < n; ++i) {
+        matV.coeffRef(i, j) = ComplexScalar(m_eivec.coeff(i, j), m_eivec.coeff(i, j + 1));
+        matV.coeffRef(i, j + 1) = ComplexScalar(m_eivec.coeff(i, j), -m_eivec.coeff(i, j + 1));
       }
       matV.col(j).normalize();
-      matV.col(j+1).normalize();
+      matV.col(j + 1).normalize();
       ++j;
     }
   }
   return matV;
 }
 
-template<typename MatrixType>
-EigenSolver<MatrixType>& 
-EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvectors)
-{
+template <typename MatrixType>
+template <typename InputType>
+EigenSolver<MatrixType>& EigenSolver<MatrixType>::compute(const EigenBase<InputType>& matrix,
+                                                          bool computeEigenvectors) {
   check_template_parameters();
-  
-  using std::sqrt;
+
+  using numext::isfinite;
   using std::abs;
+  using std::sqrt;
   eigen_assert(matrix.cols() == matrix.rows());
 
   // Reduce to real Schur form.
-  m_realSchur.compute(matrix, computeEigenvectors);
+  m_realSchur.compute(matrix.derived(), computeEigenvectors);
+
+  m_info = m_realSchur.info();
 
-  if (m_realSchur.info() == Success)
-  {
+  if (m_info == Success) {
     m_matT = m_realSchur.matrixT();
-    if (computeEigenvectors)
-      m_eivec = m_realSchur.matrixU();
-  
+    if (computeEigenvectors) m_eivec = m_realSchur.matrixU();
+
     // Compute eigenvalues from matT
     m_eivalues.resize(matrix.cols());
     Index i = 0;
-    while (i < matrix.cols()) 
-    {
-      if (i == matrix.cols() - 1 || m_matT.coeff(i+1, i) == Scalar(0)) 
-      {
+    while (i < matrix.cols()) {
+      if (i == matrix.cols() - 1 || m_matT.coeff(i + 1, i) == Scalar(0)) {
         m_eivalues.coeffRef(i) = m_matT.coeff(i, i);
+        if (!(isfinite)(m_eivalues.coeffRef(i))) {
+          m_isInitialized = true;
+          m_eigenvectorsOk = false;
+          m_info = NumericalIssue;
+          return *this;
+        }
         ++i;
-      }
-      else
-      {
-        Scalar p = Scalar(0.5) * (m_matT.coeff(i, i) - m_matT.coeff(i+1, i+1));
-        Scalar z = sqrt(abs(p * p + m_matT.coeff(i+1, i) * m_matT.coeff(i, i+1)));
-        m_eivalues.coeffRef(i)   = ComplexScalar(m_matT.coeff(i+1, i+1) + p, z);
-        m_eivalues.coeffRef(i+1) = ComplexScalar(m_matT.coeff(i+1, i+1) + p, -z);
+      } else {
+        Scalar p = Scalar(0.5) * (m_matT.coeff(i, i) - m_matT.coeff(i + 1, i + 1));
+        Scalar z;
+        // Compute z = sqrt(abs(p * p + m_matT.coeff(i+1, i) * m_matT.coeff(i, i+1)));
+        // without overflow
+        {
+          Scalar t0 = m_matT.coeff(i + 1, i);
+          Scalar t1 = m_matT.coeff(i, i + 1);
+          Scalar maxval = numext::maxi<Scalar>(abs(p), numext::maxi<Scalar>(abs(t0), abs(t1)));
+          t0 /= maxval;
+          t1 /= maxval;
+          Scalar p0 = p / maxval;
+          z = maxval * sqrt(abs(p0 * p0 + t0 * t1));
+        }
+
+        m_eivalues.coeffRef(i) = ComplexScalar(m_matT.coeff(i + 1, i + 1) + p, z);
+        m_eivalues.coeffRef(i + 1) = ComplexScalar(m_matT.coeff(i + 1, i + 1) + p, -z);
+        if (!((isfinite)(m_eivalues.coeffRef(i)) && (isfinite)(m_eivalues.coeffRef(i + 1)))) {
+          m_isInitialized = true;
+          m_eigenvectorsOk = false;
+          m_info = NumericalIssue;
+          return *this;
+        }
         i += 2;
       }
     }
-    
+
     // Compute eigenvectors.
-    if (computeEigenvectors)
-      doComputeEigenvectors();
+    if (computeEigenvectors) doComputeEigenvectors();
   }
 
   m_isInitialized = true;
@@ -417,191 +437,143 @@ EigenSolver<MatrixType>::compute(const MatrixType& matrix, bool computeEigenvect
   return *this;
 }
 
-// Complex scalar division.
-template<typename Scalar>
-std::complex<Scalar> cdiv(const Scalar& xr, const Scalar& xi, const Scalar& yr, const Scalar& yi)
-{
-  using std::abs;
-  Scalar r,d;
-  if (abs(yr) > abs(yi))
-  {
-      r = yi/yr;
-      d = yr + r*yi;
-      return std::complex<Scalar>((xr + r*xi)/d, (xi - r*xr)/d);
-  }
-  else
-  {
-      r = yr/yi;
-      d = yi + r*yr;
-      return std::complex<Scalar>((r*xr + xi)/d, (r*xi - xr)/d);
-  }
-}
-
-
-template<typename MatrixType>
-void EigenSolver<MatrixType>::doComputeEigenvectors()
-{
+template <typename MatrixType>
+void EigenSolver<MatrixType>::doComputeEigenvectors() {
   using std::abs;
   const Index size = m_eivec.cols();
   const Scalar eps = NumTraits<Scalar>::epsilon();
 
   // inefficient! this is already computed in RealSchur
   Scalar norm(0);
-  for (Index j = 0; j < size; ++j)
-  {
-    norm += m_matT.row(j).segment((std::max)(j-1,Index(0)), size-(std::max)(j-1,Index(0))).cwiseAbs().sum();
+  for (Index j = 0; j < size; ++j) {
+    norm += m_matT.row(j).segment((std::max)(j - 1, Index(0)), size - (std::max)(j - 1, Index(0))).cwiseAbs().sum();
   }
-  
+
   // Backsubstitute to find vectors of upper triangular form
-  if (norm == 0.0)
-  {
+  if (norm == Scalar(0)) {
     return;
   }
 
-  for (Index n = size-1; n >= 0; n--)
-  {
+  for (Index n = size - 1; n >= 0; n--) {
     Scalar p = m_eivalues.coeff(n).real();
     Scalar q = m_eivalues.coeff(n).imag();
 
     // Scalar vector
-    if (q == Scalar(0))
-    {
+    if (q == Scalar(0)) {
       Scalar lastr(0), lastw(0);
       Index l = n;
 
-      m_matT.coeffRef(n,n) = 1.0;
-      for (Index i = n-1; i >= 0; i--)
-      {
-        Scalar w = m_matT.coeff(i,i) - p;
-        Scalar r = m_matT.row(i).segment(l,n-l+1).dot(m_matT.col(n).segment(l, n-l+1));
+      m_matT.coeffRef(n, n) = Scalar(1);
+      for (Index i = n - 1; i >= 0; i--) {
+        Scalar w = m_matT.coeff(i, i) - p;
+        Scalar r = m_matT.row(i).segment(l, n - l + 1).dot(m_matT.col(n).segment(l, n - l + 1));
 
-        if (m_eivalues.coeff(i).imag() < 0.0)
-        {
+        if (m_eivalues.coeff(i).imag() < Scalar(0)) {
           lastw = w;
           lastr = r;
-        }
-        else
-        {
+        } else {
           l = i;
-          if (m_eivalues.coeff(i).imag() == 0.0)
-          {
-            if (w != 0.0)
-              m_matT.coeffRef(i,n) = -r / w;
+          if (m_eivalues.coeff(i).imag() == Scalar(0)) {
+            if (w != Scalar(0))
+              m_matT.coeffRef(i, n) = -r / w;
             else
-              m_matT.coeffRef(i,n) = -r / (eps * norm);
-          }
-          else // Solve real equations
+              m_matT.coeffRef(i, n) = -r / (eps * norm);
+          } else  // Solve real equations
           {
-            Scalar x = m_matT.coeff(i,i+1);
-            Scalar y = m_matT.coeff(i+1,i);
-            Scalar denom = (m_eivalues.coeff(i).real() - p) * (m_eivalues.coeff(i).real() - p) + m_eivalues.coeff(i).imag() * m_eivalues.coeff(i).imag();
+            Scalar x = m_matT.coeff(i, i + 1);
+            Scalar y = m_matT.coeff(i + 1, i);
+            Scalar denom = (m_eivalues.coeff(i).real() - p) * (m_eivalues.coeff(i).real() - p) +
+                           m_eivalues.coeff(i).imag() * m_eivalues.coeff(i).imag();
             Scalar t = (x * lastr - lastw * r) / denom;
-            m_matT.coeffRef(i,n) = t;
+            m_matT.coeffRef(i, n) = t;
             if (abs(x) > abs(lastw))
-              m_matT.coeffRef(i+1,n) = (-r - w * t) / x;
+              m_matT.coeffRef(i + 1, n) = (-r - w * t) / x;
             else
-              m_matT.coeffRef(i+1,n) = (-lastr - y * t) / lastw;
+              m_matT.coeffRef(i + 1, n) = (-lastr - y * t) / lastw;
           }
 
           // Overflow control
-          Scalar t = abs(m_matT.coeff(i,n));
-          if ((eps * t) * t > Scalar(1))
-            m_matT.col(n).tail(size-i) /= t;
+          Scalar t = abs(m_matT.coeff(i, n));
+          if ((eps * t) * t > Scalar(1)) m_matT.col(n).tail(size - i) /= t;
         }
       }
-    }
-    else if (q < Scalar(0) && n > 0) // Complex vector
+    } else if (q < Scalar(0) && n > 0)  // Complex vector
     {
       Scalar lastra(0), lastsa(0), lastw(0);
-      Index l = n-1;
+      Index l = n - 1;
 
       // Last vector component imaginary so matrix is triangular
-      if (abs(m_matT.coeff(n,n-1)) > abs(m_matT.coeff(n-1,n)))
-      {
-        m_matT.coeffRef(n-1,n-1) = q / m_matT.coeff(n,n-1);
-        m_matT.coeffRef(n-1,n) = -(m_matT.coeff(n,n) - p) / m_matT.coeff(n,n-1);
-      }
-      else
-      {
-        std::complex<Scalar> cc = cdiv<Scalar>(0.0,-m_matT.coeff(n-1,n),m_matT.coeff(n-1,n-1)-p,q);
-        m_matT.coeffRef(n-1,n-1) = numext::real(cc);
-        m_matT.coeffRef(n-1,n) = numext::imag(cc);
+      if (abs(m_matT.coeff(n, n - 1)) > abs(m_matT.coeff(n - 1, n))) {
+        m_matT.coeffRef(n - 1, n - 1) = q / m_matT.coeff(n, n - 1);
+        m_matT.coeffRef(n - 1, n) = -(m_matT.coeff(n, n) - p) / m_matT.coeff(n, n - 1);
+      } else {
+        ComplexScalar cc =
+            ComplexScalar(Scalar(0), -m_matT.coeff(n - 1, n)) / ComplexScalar(m_matT.coeff(n - 1, n - 1) - p, q);
+        m_matT.coeffRef(n - 1, n - 1) = numext::real(cc);
+        m_matT.coeffRef(n - 1, n) = numext::imag(cc);
       }
-      m_matT.coeffRef(n,n-1) = 0.0;
-      m_matT.coeffRef(n,n) = 1.0;
-      for (Index i = n-2; i >= 0; i--)
-      {
-        Scalar ra = m_matT.row(i).segment(l, n-l+1).dot(m_matT.col(n-1).segment(l, n-l+1));
-        Scalar sa = m_matT.row(i).segment(l, n-l+1).dot(m_matT.col(n).segment(l, n-l+1));
-        Scalar w = m_matT.coeff(i,i) - p;
-
-        if (m_eivalues.coeff(i).imag() < 0.0)
-        {
+      m_matT.coeffRef(n, n - 1) = Scalar(0);
+      m_matT.coeffRef(n, n) = Scalar(1);
+      for (Index i = n - 2; i >= 0; i--) {
+        Scalar ra = m_matT.row(i).segment(l, n - l + 1).dot(m_matT.col(n - 1).segment(l, n - l + 1));
+        Scalar sa = m_matT.row(i).segment(l, n - l + 1).dot(m_matT.col(n).segment(l, n - l + 1));
+        Scalar w = m_matT.coeff(i, i) - p;
+
+        if (m_eivalues.coeff(i).imag() < Scalar(0)) {
           lastw = w;
           lastra = ra;
           lastsa = sa;
-        }
-        else
-        {
+        } else {
           l = i;
-          if (m_eivalues.coeff(i).imag() == RealScalar(0))
-          {
-            std::complex<Scalar> cc = cdiv(-ra,-sa,w,q);
-            m_matT.coeffRef(i,n-1) = numext::real(cc);
-            m_matT.coeffRef(i,n) = numext::imag(cc);
-          }
-          else
-          {
+          if (m_eivalues.coeff(i).imag() == RealScalar(0)) {
+            ComplexScalar cc = ComplexScalar(-ra, -sa) / ComplexScalar(w, q);
+            m_matT.coeffRef(i, n - 1) = numext::real(cc);
+            m_matT.coeffRef(i, n) = numext::imag(cc);
+          } else {
             // Solve complex equations
-            Scalar x = m_matT.coeff(i,i+1);
-            Scalar y = m_matT.coeff(i+1,i);
-            Scalar vr = (m_eivalues.coeff(i).real() - p) * (m_eivalues.coeff(i).real() - p) + m_eivalues.coeff(i).imag() * m_eivalues.coeff(i).imag() - q * q;
+            Scalar x = m_matT.coeff(i, i + 1);
+            Scalar y = m_matT.coeff(i + 1, i);
+            Scalar vr = (m_eivalues.coeff(i).real() - p) * (m_eivalues.coeff(i).real() - p) +
+                        m_eivalues.coeff(i).imag() * m_eivalues.coeff(i).imag() - q * q;
             Scalar vi = (m_eivalues.coeff(i).real() - p) * Scalar(2) * q;
-            if ((vr == 0.0) && (vi == 0.0))
+            if ((vr == Scalar(0)) && (vi == Scalar(0)))
               vr = eps * norm * (abs(w) + abs(q) + abs(x) + abs(y) + abs(lastw));
 
-            std::complex<Scalar> cc = cdiv(x*lastra-lastw*ra+q*sa,x*lastsa-lastw*sa-q*ra,vr,vi);
-            m_matT.coeffRef(i,n-1) = numext::real(cc);
-            m_matT.coeffRef(i,n) = numext::imag(cc);
-            if (abs(x) > (abs(lastw) + abs(q)))
-            {
-              m_matT.coeffRef(i+1,n-1) = (-ra - w * m_matT.coeff(i,n-1) + q * m_matT.coeff(i,n)) / x;
-              m_matT.coeffRef(i+1,n) = (-sa - w * m_matT.coeff(i,n) - q * m_matT.coeff(i,n-1)) / x;
-            }
-            else
-            {
-              cc = cdiv(-lastra-y*m_matT.coeff(i,n-1),-lastsa-y*m_matT.coeff(i,n),lastw,q);
-              m_matT.coeffRef(i+1,n-1) = numext::real(cc);
-              m_matT.coeffRef(i+1,n) = numext::imag(cc);
+            ComplexScalar cc = ComplexScalar(x * lastra - lastw * ra + q * sa, x * lastsa - lastw * sa - q * ra) /
+                               ComplexScalar(vr, vi);
+            m_matT.coeffRef(i, n - 1) = numext::real(cc);
+            m_matT.coeffRef(i, n) = numext::imag(cc);
+            if (abs(x) > (abs(lastw) + abs(q))) {
+              m_matT.coeffRef(i + 1, n - 1) = (-ra - w * m_matT.coeff(i, n - 1) + q * m_matT.coeff(i, n)) / x;
+              m_matT.coeffRef(i + 1, n) = (-sa - w * m_matT.coeff(i, n) - q * m_matT.coeff(i, n - 1)) / x;
+            } else {
+              cc = ComplexScalar(-lastra - y * m_matT.coeff(i, n - 1), -lastsa - y * m_matT.coeff(i, n)) /
+                   ComplexScalar(lastw, q);
+              m_matT.coeffRef(i + 1, n - 1) = numext::real(cc);
+              m_matT.coeffRef(i + 1, n) = numext::imag(cc);
             }
           }
 
           // Overflow control
-          using std::max;
-          Scalar t = (max)(abs(m_matT.coeff(i,n-1)),abs(m_matT.coeff(i,n)));
-          if ((eps * t) * t > Scalar(1))
-            m_matT.block(i, n-1, size-i, 2) /= t;
-
+          Scalar t = numext::maxi<Scalar>(abs(m_matT.coeff(i, n - 1)), abs(m_matT.coeff(i, n)));
+          if ((eps * t) * t > Scalar(1)) m_matT.block(i, n - 1, size - i, 2) /= t;
         }
       }
-      
+
       // We handled a pair of complex conjugate eigenvalues, so need to skip them both
       n--;
-    }
-    else
-    {
-      eigen_assert(0 && "Internal bug in EigenSolver"); // this should not happen
+    } else {
+      eigen_assert(0 && "Internal bug in EigenSolver (INF or NaN has not been detected)");  // this should not happen
     }
   }
 
   // Back transformation to get eigenvectors of original matrix
-  for (Index j = size-1; j >= 0; j--)
-  {
-    m_tmp.noalias() = m_eivec.leftCols(j+1) * m_matT.col(j).segment(0, j+1);
+  for (Index j = size - 1; j >= 0; j--) {
+    m_tmp.noalias() = m_eivec.leftCols(j + 1) * m_matT.col(j).segment(0, j + 1);
     m_eivec.col(j) = m_tmp;
   }
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_EIGENSOLVER_H
+#endif  // EIGEN_EIGENSOLVER_H
diff --git a/inst/include/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/inst/include/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
index 956e80d9..c0a61dcd 100644
--- a/inst/include/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
+++ b/inst/include/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h
@@ -1,8 +1,9 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2010,2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2016 Tobias Wood <tobias@spinicist.org.uk>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -13,338 +14,389 @@
 
 #include "./RealQZ.h"
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \eigenvalues_module \ingroup Eigenvalues_Module
-  *
-  *
-  * \class GeneralizedEigenSolver
-  *
-  * \brief Computes the generalized eigenvalues and eigenvectors of a pair of general matrices
-  *
-  * \tparam _MatrixType the type of the matrices of which we are computing the
-  * eigen-decomposition; this is expected to be an instantiation of the Matrix
-  * class template. Currently, only real matrices are supported.
-  *
-  * The generalized eigenvalues and eigenvectors of a matrix pair \f$ A \f$ and \f$ B \f$ are scalars
-  * \f$ \lambda \f$ and vectors \f$ v \f$ such that \f$ Av = \lambda Bv \f$.  If
-  * \f$ D \f$ is a diagonal matrix with the eigenvalues on the diagonal, and
-  * \f$ V \f$ is a matrix with the eigenvectors as its columns, then \f$ A V =
-  * B V D \f$. The matrix \f$ V \f$ is almost always invertible, in which case we
-  * have \f$ A = B V D V^{-1} \f$. This is called the generalized eigen-decomposition.
-  *
-  * The generalized eigenvalues and eigenvectors of a matrix pair may be complex, even when the
-  * matrices are real. Moreover, the generalized eigenvalue might be infinite if the matrix B is
-  * singular. To workaround this difficulty, the eigenvalues are provided as a pair of complex \f$ \alpha \f$
-  * and real \f$ \beta \f$ such that: \f$ \lambda_i = \alpha_i / \beta_i \f$. If \f$ \beta_i \f$ is (nearly) zero,
-  * then one can consider the well defined left eigenvalue \f$ \mu = \beta_i / \alpha_i\f$ such that:
-  * \f$ \mu_i A v_i = B v_i \f$, or even \f$ \mu_i u_i^T A  = u_i^T B \f$ where \f$ u_i \f$ is
-  * called the left eigenvector.
-  *
-  * Call the function compute() to compute the generalized eigenvalues and eigenvectors of
-  * a given matrix pair. Alternatively, you can use the
-  * GeneralizedEigenSolver(const MatrixType&, const MatrixType&, bool) constructor which computes the
-  * eigenvalues and eigenvectors at construction time. Once the eigenvalue and
-  * eigenvectors are computed, they can be retrieved with the eigenvalues() and
-  * eigenvectors() functions.
-  *
-  * Here is an usage example of this class:
-  * Example: \include GeneralizedEigenSolver.cpp
-  * Output: \verbinclude GeneralizedEigenSolver.out
-  *
-  * \sa MatrixBase::eigenvalues(), class ComplexEigenSolver, class SelfAdjointEigenSolver
-  */
-template<typename _MatrixType> class GeneralizedEigenSolver
-{
-  public:
-
-    /** \brief Synonym for the template parameter \p _MatrixType. */
-    typedef _MatrixType MatrixType;
-
-    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
-      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
-    };
-
-    /** \brief Scalar type for matrices of type #MatrixType. */
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
-
-    /** \brief Complex scalar type for #MatrixType. 
-      *
-      * This is \c std::complex<Scalar> if #Scalar is real (e.g.,
-      * \c float or \c double) and just \c Scalar if #Scalar is
-      * complex.
-      */
-    typedef std::complex<RealScalar> ComplexScalar;
-
-    /** \brief Type for vector of real scalar values eigenvalues as returned by betas().
-      *
-      * This is a column vector with entries of type #Scalar.
-      * The length of the vector is the size of #MatrixType.
-      */
-    typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> VectorType;
-
-    /** \brief Type for vector of complex scalar values eigenvalues as returned by betas().
-      *
-      * This is a column vector with entries of type #ComplexScalar.
-      * The length of the vector is the size of #MatrixType.
-      */
-    typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ComplexVectorType;
-
-    /** \brief Expression type for the eigenvalues as returned by eigenvalues().
-      */
-    typedef CwiseBinaryOp<internal::scalar_quotient_op<ComplexScalar,Scalar>,ComplexVectorType,VectorType> EigenvalueType;
-
-    /** \brief Type for matrix of eigenvectors as returned by eigenvectors(). 
-      *
-      * This is a square matrix with entries of type #ComplexScalar. 
-      * The size is the same as the size of #MatrixType.
-      */
-    typedef Matrix<ComplexScalar, RowsAtCompileTime, ColsAtCompileTime, Options, MaxRowsAtCompileTime, MaxColsAtCompileTime> EigenvectorsType;
-
-    /** \brief Default constructor.
-      *
-      * The default constructor is useful in cases in which the user intends to
-      * perform decompositions via EigenSolver::compute(const MatrixType&, bool).
-      *
-      * \sa compute() for an example.
-      */
-    GeneralizedEigenSolver() : m_eivec(), m_alphas(), m_betas(), m_isInitialized(false), m_realQZ(), m_matS(), m_tmp() {}
-
-    /** \brief Default constructor with memory preallocation
-      *
-      * Like the default constructor but with preallocation of the internal data
-      * according to the specified problem \a size.
-      * \sa GeneralizedEigenSolver()
-      */
-    GeneralizedEigenSolver(Index size)
+ *
+ *
+ * \class GeneralizedEigenSolver
+ *
+ * \brief Computes the generalized eigenvalues and eigenvectors of a pair of general matrices
+ *
+ * \tparam MatrixType_ the type of the matrices of which we are computing the
+ * eigen-decomposition; this is expected to be an instantiation of the Matrix
+ * class template. Currently, only real matrices are supported.
+ *
+ * The generalized eigenvalues and eigenvectors of a matrix pair \f$ A \f$ and \f$ B \f$ are scalars
+ * \f$ \lambda \f$ and vectors \f$ v \f$ such that \f$ Av = \lambda Bv \f$.  If
+ * \f$ D \f$ is a diagonal matrix with the eigenvalues on the diagonal, and
+ * \f$ V \f$ is a matrix with the eigenvectors as its columns, then \f$ A V =
+ * B V D \f$. The matrix \f$ V \f$ is almost always invertible, in which case we
+ * have \f$ A = B V D V^{-1} \f$. This is called the generalized eigen-decomposition.
+ *
+ * The generalized eigenvalues and eigenvectors of a matrix pair may be complex, even when the
+ * matrices are real. Moreover, the generalized eigenvalue might be infinite if the matrix B is
+ * singular. To workaround this difficulty, the eigenvalues are provided as a pair of complex \f$ \alpha \f$
+ * and real \f$ \beta \f$ such that: \f$ \lambda_i = \alpha_i / \beta_i \f$. If \f$ \beta_i \f$ is (nearly) zero,
+ * then one can consider the well defined left eigenvalue \f$ \mu = \beta_i / \alpha_i\f$ such that:
+ * \f$ \mu_i A v_i = B v_i \f$, or even \f$ \mu_i u_i^T A  = u_i^T B \f$ where \f$ u_i \f$ is
+ * called the left eigenvector.
+ *
+ * Call the function compute() to compute the generalized eigenvalues and eigenvectors of
+ * a given matrix pair. Alternatively, you can use the
+ * GeneralizedEigenSolver(const MatrixType&, const MatrixType&, bool) constructor which computes the
+ * eigenvalues and eigenvectors at construction time. Once the eigenvalue and
+ * eigenvectors are computed, they can be retrieved with the eigenvalues() and
+ * eigenvectors() functions.
+ *
+ * Here is an usage example of this class:
+ * Example: \include GeneralizedEigenSolver.cpp
+ * Output: \verbinclude GeneralizedEigenSolver.out
+ *
+ * \sa MatrixBase::eigenvalues(), class ComplexEigenSolver, class SelfAdjointEigenSolver
+ */
+template <typename MatrixType_>
+class GeneralizedEigenSolver {
+ public:
+  /** \brief Synonym for the template parameter \p MatrixType_. */
+  typedef MatrixType_ MatrixType;
+
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    Options = internal::traits<MatrixType>::Options,
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+
+  /** \brief Scalar type for matrices of type #MatrixType. */
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
+
+  /** \brief Complex scalar type for #MatrixType.
+   *
+   * This is \c std::complex<Scalar> if #Scalar is real (e.g.,
+   * \c float or \c double) and just \c Scalar if #Scalar is
+   * complex.
+   */
+  typedef internal::make_complex_t<Scalar> ComplexScalar;
+
+  /** \brief Type for vector of real scalar values eigenvalues as returned by betas().
+   *
+   * This is a column vector with entries of type #Scalar.
+   * The length of the vector is the size of #MatrixType.
+   */
+  typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> VectorType;
+
+  /** \brief Type for vector of complex scalar values eigenvalues as returned by alphas().
+   *
+   * This is a column vector with entries of type #ComplexScalar.
+   * The length of the vector is the size of #MatrixType.
+   */
+  typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ComplexVectorType;
+
+  /** \brief Expression type for the eigenvalues as returned by eigenvalues().
+   */
+  typedef CwiseBinaryOp<internal::scalar_quotient_op<ComplexScalar, Scalar>, ComplexVectorType, VectorType>
+      EigenvalueType;
+
+  /** \brief Type for matrix of eigenvectors as returned by eigenvectors().
+   *
+   * This is a square matrix with entries of type #ComplexScalar.
+   * The size is the same as the size of #MatrixType.
+   */
+  typedef Matrix<ComplexScalar, RowsAtCompileTime, ColsAtCompileTime, Options, MaxRowsAtCompileTime,
+                 MaxColsAtCompileTime>
+      EigenvectorsType;
+
+  /** \brief Default constructor.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via EigenSolver::compute(const MatrixType&, bool).
+   *
+   * \sa compute() for an example.
+   */
+  GeneralizedEigenSolver()
+      : m_eivec(), m_alphas(), m_betas(), m_computeEigenvectors(false), m_isInitialized(false), m_realQZ() {}
+
+  /** \brief Default constructor with memory preallocation
+   *
+   * Like the default constructor but with preallocation of the internal data
+   * according to the specified problem \a size.
+   * \sa GeneralizedEigenSolver()
+   */
+  explicit GeneralizedEigenSolver(Index size)
       : m_eivec(size, size),
         m_alphas(size),
         m_betas(size),
+        m_computeEigenvectors(false),
         m_isInitialized(false),
-        m_eigenvectorsOk(false),
         m_realQZ(size),
-        m_matS(size, size),
-        m_tmp(size)
-    {}
-
-    /** \brief Constructor; computes the generalized eigendecomposition of given matrix pair.
-      * 
-      * \param[in]  A  Square matrix whose eigendecomposition is to be computed.
-      * \param[in]  B  Square matrix whose eigendecomposition is to be computed.
-      * \param[in]  computeEigenvectors  If true, both the eigenvectors and the
-      *    eigenvalues are computed; if false, only the eigenvalues are computed.
-      *
-      * This constructor calls compute() to compute the generalized eigenvalues
-      * and eigenvectors.
-      *
-      * \sa compute()
-      */
-    GeneralizedEigenSolver(const MatrixType& A, const MatrixType& B, bool computeEigenvectors = true)
+        m_tmp(size) {}
+
+  /** \brief Constructor; computes the generalized eigendecomposition of given matrix pair.
+   *
+   * \param[in]  A  Square matrix whose eigendecomposition is to be computed.
+   * \param[in]  B  Square matrix whose eigendecomposition is to be computed.
+   * \param[in]  computeEigenvectors  If true, both the eigenvectors and the
+   *    eigenvalues are computed; if false, only the eigenvalues are computed.
+   *
+   * This constructor calls compute() to compute the generalized eigenvalues
+   * and eigenvectors.
+   *
+   * \sa compute()
+   */
+  GeneralizedEigenSolver(const MatrixType& A, const MatrixType& B, bool computeEigenvectors = true)
       : m_eivec(A.rows(), A.cols()),
         m_alphas(A.cols()),
         m_betas(A.cols()),
+        m_computeEigenvectors(false),
         m_isInitialized(false),
-        m_eigenvectorsOk(false),
         m_realQZ(A.cols()),
-        m_matS(A.rows(), A.cols()),
-        m_tmp(A.cols())
-    {
-      compute(A, B, computeEigenvectors);
-    }
+        m_tmp(A.cols()) {
+    compute(A, B, computeEigenvectors);
+  }
 
-    /* \brief Returns the computed generalized eigenvectors.
-      *
-      * \returns  %Matrix whose columns are the (possibly complex) eigenvectors.
-      *
-      * \pre Either the constructor 
-      * GeneralizedEigenSolver(const MatrixType&,const MatrixType&, bool) or the member function
-      * compute(const MatrixType&, const MatrixType& bool) has been called before, and
-      * \p computeEigenvectors was set to true (the default).
-      *
-      * Column \f$ k \f$ of the returned matrix is an eigenvector corresponding
-      * to eigenvalue number \f$ k \f$ as returned by eigenvalues().  The
-      * eigenvectors are normalized to have (Euclidean) norm equal to one. The
-      * matrix returned by this function is the matrix \f$ V \f$ in the
-      * generalized eigendecomposition \f$ A = B V D V^{-1} \f$, if it exists.
-      *
-      * \sa eigenvalues()
-      */
-//    EigenvectorsType eigenvectors() const;
-
-    /** \brief Returns an expression of the computed generalized eigenvalues.
-      *
-      * \returns An expression of the column vector containing the eigenvalues.
-      *
-      * It is a shortcut for \code this->alphas().cwiseQuotient(this->betas()); \endcode
-      * Not that betas might contain zeros. It is therefore not recommended to use this function,
-      * but rather directly deal with the alphas and betas vectors.
-      *
-      * \pre Either the constructor 
-      * GeneralizedEigenSolver(const MatrixType&,const MatrixType&,bool) or the member function
-      * compute(const MatrixType&,const MatrixType&,bool) has been called before.
-      *
-      * The eigenvalues are repeated according to their algebraic multiplicity,
-      * so there are as many eigenvalues as rows in the matrix. The eigenvalues 
-      * are not sorted in any particular order.
-      *
-      * \sa alphas(), betas(), eigenvectors()
-      */
-    EigenvalueType eigenvalues() const
-    {
-      eigen_assert(m_isInitialized && "GeneralizedEigenSolver is not initialized.");
-      return EigenvalueType(m_alphas,m_betas);
-    }
+  /** \brief Returns the computed generalized eigenvectors.
+   *
+   * \returns  %Matrix whose columns are the (possibly complex) right eigenvectors.
+   * i.e. the eigenvectors that solve (A - l*B)x = 0. The ordering matches the eigenvalues.
+   *
+   * \pre Either the constructor
+   * GeneralizedEigenSolver(const MatrixType&,const MatrixType&, bool) or the member function
+   * compute(const MatrixType&, const MatrixType& bool) has been called before, and
+   * \p computeEigenvectors was set to true (the default).
+   *
+   * \sa eigenvalues()
+   */
+  EigenvectorsType eigenvectors() const {
+    eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute eigenvectors");
+    eigen_assert(m_computeEigenvectors && "Eigenvectors for GeneralizedEigenSolver were not calculated");
+    return m_eivec;
+  }
 
-    /** \returns A const reference to the vectors containing the alpha values
-      *
-      * This vector permits to reconstruct the j-th eigenvalues as alphas(i)/betas(j).
-      *
-      * \sa betas(), eigenvalues() */
-    ComplexVectorType alphas() const
-    {
-      eigen_assert(m_isInitialized && "GeneralizedEigenSolver is not initialized.");
-      return m_alphas;
-    }
+  /** \brief Returns an expression of the computed generalized eigenvalues.
+   *
+   * \returns An expression of the column vector containing the eigenvalues.
+   *
+   * It is a shortcut for \code this->alphas().cwiseQuotient(this->betas()); \endcode
+   * Not that betas might contain zeros. It is therefore not recommended to use this function,
+   * but rather directly deal with the alphas and betas vectors.
+   *
+   * \pre Either the constructor
+   * GeneralizedEigenSolver(const MatrixType&,const MatrixType&,bool) or the member function
+   * compute(const MatrixType&,const MatrixType&,bool) has been called before.
+   *
+   * The eigenvalues are repeated according to their algebraic multiplicity,
+   * so there are as many eigenvalues as rows in the matrix. The eigenvalues
+   * are not sorted in any particular order.
+   *
+   * \sa alphas(), betas(), eigenvectors()
+   */
+  EigenvalueType eigenvalues() const {
+    eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute eigenvalues.");
+    return EigenvalueType(m_alphas, m_betas);
+  }
 
-    /** \returns A const reference to the vectors containing the beta values
-      *
-      * This vector permits to reconstruct the j-th eigenvalues as alphas(i)/betas(j).
-      *
-      * \sa alphas(), eigenvalues() */
-    VectorType betas() const
-    {
-      eigen_assert(m_isInitialized && "GeneralizedEigenSolver is not initialized.");
-      return m_betas;
-    }
+  /** \returns A const reference to the vectors containing the alpha values
+   *
+   * This vector permits to reconstruct the j-th eigenvalues as alphas(i)/betas(j).
+   *
+   * \sa betas(), eigenvalues() */
+  const ComplexVectorType& alphas() const {
+    eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute alphas.");
+    return m_alphas;
+  }
 
-    /** \brief Computes generalized eigendecomposition of given matrix.
-      * 
-      * \param[in]  A  Square matrix whose eigendecomposition is to be computed.
-      * \param[in]  B  Square matrix whose eigendecomposition is to be computed.
-      * \param[in]  computeEigenvectors  If true, both the eigenvectors and the
-      *    eigenvalues are computed; if false, only the eigenvalues are
-      *    computed. 
-      * \returns    Reference to \c *this
-      *
-      * This function computes the eigenvalues of the real matrix \p matrix.
-      * The eigenvalues() function can be used to retrieve them.  If 
-      * \p computeEigenvectors is true, then the eigenvectors are also computed
-      * and can be retrieved by calling eigenvectors().
-      *
-      * The matrix is first reduced to real generalized Schur form using the RealQZ
-      * class. The generalized Schur decomposition is then used to compute the eigenvalues
-      * and eigenvectors.
-      *
-      * The cost of the computation is dominated by the cost of the
-      * generalized Schur decomposition.
-      *
-      * This method reuses of the allocated data in the GeneralizedEigenSolver object.
-      */
-    GeneralizedEigenSolver& compute(const MatrixType& A, const MatrixType& B, bool computeEigenvectors = true);
-
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
-      return m_realQZ.info();
-    }
+  /** \returns A const reference to the vectors containing the beta values
+   *
+   * This vector permits to reconstruct the j-th eigenvalues as alphas(i)/betas(j).
+   *
+   * \sa alphas(), eigenvalues() */
+  const VectorType& betas() const {
+    eigen_assert(info() == Success && "GeneralizedEigenSolver failed to compute betas.");
+    return m_betas;
+  }
 
-    /** Sets the maximal number of iterations allowed.
-    */
-    GeneralizedEigenSolver& setMaxIterations(Index maxIters)
-    {
-      m_realQZ.setMaxIterations(maxIters);
-      return *this;
-    }
+  /** \brief Computes generalized eigendecomposition of given matrix.
+   *
+   * \param[in]  A  Square matrix whose eigendecomposition is to be computed.
+   * \param[in]  B  Square matrix whose eigendecomposition is to be computed.
+   * \param[in]  computeEigenvectors  If true, both the eigenvectors and the
+   *    eigenvalues are computed; if false, only the eigenvalues are
+   *    computed.
+   * \returns    Reference to \c *this
+   *
+   * This function computes the eigenvalues of the real matrix \p matrix.
+   * The eigenvalues() function can be used to retrieve them.  If
+   * \p computeEigenvectors is true, then the eigenvectors are also computed
+   * and can be retrieved by calling eigenvectors().
+   *
+   * The matrix is first reduced to real generalized Schur form using the RealQZ
+   * class. The generalized Schur decomposition is then used to compute the eigenvalues
+   * and eigenvectors.
+   *
+   * The cost of the computation is dominated by the cost of the
+   * generalized Schur decomposition.
+   *
+   * This method reuses of the allocated data in the GeneralizedEigenSolver object.
+   */
+  GeneralizedEigenSolver& compute(const MatrixType& A, const MatrixType& B, bool computeEigenvectors = true);
+
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
+    return m_realQZ.info();
+  }
 
-  protected:
-    
-    static void check_template_parameters()
-    {
-      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
-      EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL);
-    }
-    
-    MatrixType m_eivec;
-    ComplexVectorType m_alphas;
-    VectorType m_betas;
-    bool m_isInitialized;
-    bool m_eigenvectorsOk;
-    RealQZ<MatrixType> m_realQZ;
-    MatrixType m_matS;
-
-    typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
-    ColumnVectorType m_tmp;
+  /** Sets the maximal number of iterations allowed.
+   */
+  GeneralizedEigenSolver& setMaxIterations(Index maxIters) {
+    m_realQZ.setMaxIterations(maxIters);
+    return *this;
+  }
+
+ protected:
+  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+  EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL)
+
+  EigenvectorsType m_eivec;
+  ComplexVectorType m_alphas;
+  VectorType m_betas;
+  bool m_computeEigenvectors;
+  bool m_isInitialized;
+  RealQZ<MatrixType> m_realQZ;
+  ComplexVectorType m_tmp;
 };
 
-//template<typename MatrixType>
-//typename GeneralizedEigenSolver<MatrixType>::EigenvectorsType GeneralizedEigenSolver<MatrixType>::eigenvectors() const
-//{
-//  eigen_assert(m_isInitialized && "EigenSolver is not initialized.");
-//  eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
-//  Index n = m_eivec.cols();
-//  EigenvectorsType matV(n,n);
-//  // TODO
-//  return matV;
-//}
-
-template<typename MatrixType>
-GeneralizedEigenSolver<MatrixType>&
-GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A, const MatrixType& B, bool computeEigenvectors)
-{
-  check_template_parameters();
-  
-  using std::sqrt;
+template <typename MatrixType>
+GeneralizedEigenSolver<MatrixType>& GeneralizedEigenSolver<MatrixType>::compute(const MatrixType& A,
+                                                                                const MatrixType& B,
+                                                                                bool computeEigenvectors) {
   using std::abs;
+  using std::sqrt;
   eigen_assert(A.cols() == A.rows() && B.cols() == A.rows() && B.cols() == B.rows());
-
+  Index size = A.cols();
   // Reduce to generalized real Schur form:
   // A = Q S Z and B = Q T Z
   m_realQZ.compute(A, B, computeEigenvectors);
+  if (m_realQZ.info() == Success) {
+    // Resize storage
+    m_alphas.resize(size);
+    m_betas.resize(size);
+    if (computeEigenvectors) {
+      m_eivec.resize(size, size);
+      m_tmp.resize(size);
+    }
+
+    // Aliases:
+    Map<VectorType> v(reinterpret_cast<Scalar*>(m_tmp.data()), size);
+    ComplexVectorType& cv = m_tmp;
+    const MatrixType& mS = m_realQZ.matrixS();
+    const MatrixType& mT = m_realQZ.matrixT();
 
-  if (m_realQZ.info() == Success)
-  {
-    m_matS = m_realQZ.matrixS();
-    if (computeEigenvectors)
-      m_eivec = m_realQZ.matrixZ().transpose();
-  
-    // Compute eigenvalues from matS
-    m_alphas.resize(A.cols());
-    m_betas.resize(A.cols());
     Index i = 0;
-    while (i < A.cols())
-    {
-      if (i == A.cols() - 1 || m_matS.coeff(i+1, i) == Scalar(0))
-      {
-        m_alphas.coeffRef(i) = m_matS.coeff(i, i);
-        m_betas.coeffRef(i)  = m_realQZ.matrixT().coeff(i,i);
+    while (i < size) {
+      if (i == size - 1 || mS.coeff(i + 1, i) == Scalar(0)) {
+        // Real eigenvalue
+        m_alphas.coeffRef(i) = mS.diagonal().coeff(i);
+        m_betas.coeffRef(i) = mT.diagonal().coeff(i);
+        if (computeEigenvectors) {
+          v.setConstant(Scalar(0.0));
+          v.coeffRef(i) = Scalar(1.0);
+          // For singular eigenvalues do nothing more
+          if (abs(m_betas.coeffRef(i)) >= (std::numeric_limits<RealScalar>::min)()) {
+            // Non-singular eigenvalue
+            const Scalar alpha = real(m_alphas.coeffRef(i));
+            const Scalar beta = m_betas.coeffRef(i);
+            for (Index j = i - 1; j >= 0; j--) {
+              const Index st = j + 1;
+              const Index sz = i - j;
+              if (j > 0 && mS.coeff(j, j - 1) != Scalar(0)) {
+                // 2x2 block
+                Matrix<Scalar, 2, 1> rhs = (alpha * mT.template block<2, Dynamic>(j - 1, st, 2, sz) -
+                                            beta * mS.template block<2, Dynamic>(j - 1, st, 2, sz))
+                                               .lazyProduct(v.segment(st, sz));
+                Matrix<Scalar, 2, 2> lhs =
+                    beta * mS.template block<2, 2>(j - 1, j - 1) - alpha * mT.template block<2, 2>(j - 1, j - 1);
+                v.template segment<2>(j - 1) = lhs.partialPivLu().solve(rhs);
+                j--;
+              } else {
+                v.coeffRef(j) = -v.segment(st, sz)
+                                     .transpose()
+                                     .cwiseProduct(beta * mS.block(j, st, 1, sz) - alpha * mT.block(j, st, 1, sz))
+                                     .sum() /
+                                (beta * mS.coeffRef(j, j) - alpha * mT.coeffRef(j, j));
+              }
+            }
+          }
+          m_eivec.col(i).real().noalias() = m_realQZ.matrixZ().transpose() * v;
+          m_eivec.col(i).real().normalize();
+          m_eivec.col(i).imag().setConstant(0);
+        }
         ++i;
-      }
-      else
-      {
-        Scalar p = Scalar(0.5) * (m_matS.coeff(i, i) - m_matS.coeff(i+1, i+1));
-        Scalar z = sqrt(abs(p * p + m_matS.coeff(i+1, i) * m_matS.coeff(i, i+1)));
-        m_alphas.coeffRef(i)   = ComplexScalar(m_matS.coeff(i+1, i+1) + p, z);
-        m_alphas.coeffRef(i+1) = ComplexScalar(m_matS.coeff(i+1, i+1) + p, -z);
-
-        m_betas.coeffRef(i)   = m_realQZ.matrixT().coeff(i,i);
-        m_betas.coeffRef(i+1) = m_realQZ.matrixT().coeff(i,i);
+      } else {
+        // We need to extract the generalized eigenvalues of the pair of a general 2x2 block S and a positive diagonal
+        // 2x2 block T Then taking beta=T_00*T_11, we can avoid any division, and alpha is the eigenvalues of A = (U^-1
+        // * S * U) * diag(T_11,T_00):
+
+        // T =  [a 0]
+        //      [0 b]
+        RealScalar a = mT.diagonal().coeff(i), b = mT.diagonal().coeff(i + 1);
+        const RealScalar beta = m_betas.coeffRef(i) = m_betas.coeffRef(i + 1) = a * b;
+
+        // ^^ NOTE: using diagonal()(i) instead of coeff(i,i) workarounds a MSVC bug.
+        Matrix<RealScalar, 2, 2> S2 = mS.template block<2, 2>(i, i) * Matrix<Scalar, 2, 1>(b, a).asDiagonal();
+
+        Scalar p = Scalar(0.5) * (S2.coeff(0, 0) - S2.coeff(1, 1));
+        Scalar z = sqrt(abs(p * p + S2.coeff(1, 0) * S2.coeff(0, 1)));
+        const ComplexScalar alpha = ComplexScalar(S2.coeff(1, 1) + p, (beta > 0) ? z : -z);
+        m_alphas.coeffRef(i) = conj(alpha);
+        m_alphas.coeffRef(i + 1) = alpha;
+
+        if (computeEigenvectors) {
+          // Compute eigenvector in position (i+1) and then position (i) is just the conjugate
+          cv.setZero();
+          cv.coeffRef(i + 1) = Scalar(1.0);
+          // here, the "static_cast" workaround expression template issues.
+          cv.coeffRef(i) = -(static_cast<Scalar>(beta * mS.coeffRef(i, i + 1)) - alpha * mT.coeffRef(i, i + 1)) /
+                           (static_cast<Scalar>(beta * mS.coeffRef(i, i)) - alpha * mT.coeffRef(i, i));
+          for (Index j = i - 1; j >= 0; j--) {
+            const Index st = j + 1;
+            const Index sz = i + 1 - j;
+            if (j > 0 && mS.coeff(j, j - 1) != Scalar(0)) {
+              // 2x2 block
+              Matrix<ComplexScalar, 2, 1> rhs = (alpha * mT.template block<2, Dynamic>(j - 1, st, 2, sz) -
+                                                 beta * mS.template block<2, Dynamic>(j - 1, st, 2, sz))
+                                                    .lazyProduct(cv.segment(st, sz));
+              Matrix<ComplexScalar, 2, 2> lhs =
+                  beta * mS.template block<2, 2>(j - 1, j - 1) - alpha * mT.template block<2, 2>(j - 1, j - 1);
+              cv.template segment<2>(j - 1) = lhs.partialPivLu().solve(rhs);
+              j--;
+            } else {
+              cv.coeffRef(j) = cv.segment(st, sz)
+                                   .transpose()
+                                   .cwiseProduct(beta * mS.block(j, st, 1, sz) - alpha * mT.block(j, st, 1, sz))
+                                   .sum() /
+                               (alpha * mT.coeffRef(j, j) - static_cast<Scalar>(beta * mS.coeffRef(j, j)));
+            }
+          }
+          m_eivec.col(i + 1).noalias() = (m_realQZ.matrixZ().transpose() * cv);
+          m_eivec.col(i + 1).normalize();
+          m_eivec.col(i) = m_eivec.col(i + 1).conjugate();
+        }
         i += 2;
       }
     }
   }
-
+  m_computeEigenvectors = computeEigenvectors;
   m_isInitialized = true;
-  m_eigenvectorsOk = false;//computeEigenvectors;
-
   return *this;
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_GENERALIZEDEIGENSOLVER_H
+#endif  // EIGEN_GENERALIZEDEIGENSOLVER_H
diff --git a/inst/include/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h b/inst/include/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
index 07bf1ea0..adff3a3b 100644
--- a/inst/include/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
+++ b/inst/include/Eigen/src/Eigenvalues/GeneralizedSelfAdjointEigenSolver.h
@@ -13,187 +13,176 @@
 
 #include "./Tridiagonalization.h"
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \eigenvalues_module \ingroup Eigenvalues_Module
-  *
-  *
-  * \class GeneralizedSelfAdjointEigenSolver
-  *
-  * \brief Computes eigenvalues and eigenvectors of the generalized selfadjoint eigen problem
-  *
-  * \tparam _MatrixType the type of the matrix of which we are computing the
-  * eigendecomposition; this is expected to be an instantiation of the Matrix
-  * class template.
-  *
-  * This class solves the generalized eigenvalue problem
-  * \f$ Av = \lambda Bv \f$. In this case, the matrix \f$ A \f$ should be
-  * selfadjoint and the matrix \f$ B \f$ should be positive definite.
-  *
-  * Only the \b lower \b triangular \b part of the input matrix is referenced.
-  *
-  * Call the function compute() to compute the eigenvalues and eigenvectors of
-  * a given matrix. Alternatively, you can use the
-  * GeneralizedSelfAdjointEigenSolver(const MatrixType&, const MatrixType&, int)
-  * constructor which computes the eigenvalues and eigenvectors at construction time.
-  * Once the eigenvalue and eigenvectors are computed, they can be retrieved with the eigenvalues()
-  * and eigenvectors() functions.
-  *
-  * The documentation for GeneralizedSelfAdjointEigenSolver(const MatrixType&, const MatrixType&, int)
-  * contains an example of the typical use of this class.
-  *
-  * \sa class SelfAdjointEigenSolver, class EigenSolver, class ComplexEigenSolver
-  */
-template<typename _MatrixType>
-class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver<_MatrixType>
-{
-    typedef SelfAdjointEigenSolver<_MatrixType> Base;
-  public:
-
-    typedef typename Base::Index Index;
-    typedef _MatrixType MatrixType;
-
-    /** \brief Default constructor for fixed-size matrices.
-      *
-      * The default constructor is useful in cases in which the user intends to
-      * perform decompositions via compute(). This constructor
-      * can only be used if \p _MatrixType is a fixed-size matrix; use
-      * GeneralizedSelfAdjointEigenSolver(Index) for dynamic-size matrices.
-      */
-    GeneralizedSelfAdjointEigenSolver() : Base() {}
-
-    /** \brief Constructor, pre-allocates memory for dynamic-size matrices.
-      *
-      * \param [in]  size  Positive integer, size of the matrix whose
-      * eigenvalues and eigenvectors will be computed.
-      *
-      * This constructor is useful for dynamic-size matrices, when the user
-      * intends to perform decompositions via compute(). The \p size
-      * parameter is only used as a hint. It is not an error to give a wrong
-      * \p size, but it may impair performance.
-      *
-      * \sa compute() for an example
-      */
-    GeneralizedSelfAdjointEigenSolver(Index size)
-        : Base(size)
-    {}
-
-    /** \brief Constructor; computes generalized eigendecomposition of given matrix pencil.
-      *
-      * \param[in]  matA  Selfadjoint matrix in matrix pencil.
-      *                   Only the lower triangular part of the matrix is referenced.
-      * \param[in]  matB  Positive-definite matrix in matrix pencil.
-      *                   Only the lower triangular part of the matrix is referenced.
-      * \param[in]  options A or-ed set of flags {#ComputeEigenvectors,#EigenvaluesOnly} | {#Ax_lBx,#ABx_lx,#BAx_lx}.
-      *                     Default is #ComputeEigenvectors|#Ax_lBx.
-      *
-      * This constructor calls compute(const MatrixType&, const MatrixType&, int)
-      * to compute the eigenvalues and (if requested) the eigenvectors of the
-      * generalized eigenproblem \f$ Ax = \lambda B x \f$ with \a matA the
-      * selfadjoint matrix \f$ A \f$ and \a matB the positive definite matrix
-      * \f$ B \f$. Each eigenvector \f$ x \f$ satisfies the property
-      * \f$ x^* B x = 1 \f$. The eigenvectors are computed if
-      * \a options contains ComputeEigenvectors.
-      *
-      * In addition, the two following variants can be solved via \p options:
-      * - \c ABx_lx: \f$ ABx = \lambda x \f$
-      * - \c BAx_lx: \f$ BAx = \lambda x \f$
-      *
-      * Example: \include SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType2.cpp
-      * Output: \verbinclude SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType2.out
-      *
-      * \sa compute(const MatrixType&, const MatrixType&, int)
-      */
-    GeneralizedSelfAdjointEigenSolver(const MatrixType& matA, const MatrixType& matB,
-                                      int options = ComputeEigenvectors|Ax_lBx)
-      : Base(matA.cols())
-    {
-      compute(matA, matB, options);
-    }
-
-    /** \brief Computes generalized eigendecomposition of given matrix pencil.
-      *
-      * \param[in]  matA  Selfadjoint matrix in matrix pencil.
-      *                   Only the lower triangular part of the matrix is referenced.
-      * \param[in]  matB  Positive-definite matrix in matrix pencil.
-      *                   Only the lower triangular part of the matrix is referenced.
-      * \param[in]  options A or-ed set of flags {#ComputeEigenvectors,#EigenvaluesOnly} | {#Ax_lBx,#ABx_lx,#BAx_lx}.
-      *                     Default is #ComputeEigenvectors|#Ax_lBx.
-      *
-      * \returns    Reference to \c *this
-      *
-      * Accoring to \p options, this function computes eigenvalues and (if requested)
-      * the eigenvectors of one of the following three generalized eigenproblems:
-      * - \c Ax_lBx: \f$ Ax = \lambda B x \f$
-      * - \c ABx_lx: \f$ ABx = \lambda x \f$
-      * - \c BAx_lx: \f$ BAx = \lambda x \f$
-      * with \a matA the selfadjoint matrix \f$ A \f$ and \a matB the positive definite
-      * matrix \f$ B \f$.
-      * In addition, each eigenvector \f$ x \f$ satisfies the property \f$ x^* B x = 1 \f$.
-      *
-      * The eigenvalues() function can be used to retrieve
-      * the eigenvalues. If \p options contains ComputeEigenvectors, then the
-      * eigenvectors are also computed and can be retrieved by calling
-      * eigenvectors().
-      *
-      * The implementation uses LLT to compute the Cholesky decomposition
-      * \f$ B = LL^* \f$ and computes the classical eigendecomposition
-      * of the selfadjoint matrix \f$ L^{-1} A (L^*)^{-1} \f$ if \p options contains Ax_lBx
-      * and of \f$ L^{*} A L \f$ otherwise. This solves the
-      * generalized eigenproblem, because any solution of the generalized
-      * eigenproblem \f$ Ax = \lambda B x \f$ corresponds to a solution
-      * \f$ L^{-1} A (L^*)^{-1} (L^* x) = \lambda (L^* x) \f$ of the
-      * eigenproblem for \f$ L^{-1} A (L^*)^{-1} \f$. Similar statements
-      * can be made for the two other variants.
-      *
-      * Example: \include SelfAdjointEigenSolver_compute_MatrixType2.cpp
-      * Output: \verbinclude SelfAdjointEigenSolver_compute_MatrixType2.out
-      *
-      * \sa GeneralizedSelfAdjointEigenSolver(const MatrixType&, const MatrixType&, int)
-      */
-    GeneralizedSelfAdjointEigenSolver& compute(const MatrixType& matA, const MatrixType& matB,
-                                               int options = ComputeEigenvectors|Ax_lBx);
-
-  protected:
+ *
+ *
+ * \class GeneralizedSelfAdjointEigenSolver
+ *
+ * \brief Computes eigenvalues and eigenvectors of the generalized selfadjoint eigen problem
+ *
+ * \tparam MatrixType_ the type of the matrix of which we are computing the
+ * eigendecomposition; this is expected to be an instantiation of the Matrix
+ * class template.
+ *
+ * This class solves the generalized eigenvalue problem
+ * \f$ Av = \lambda Bv \f$. In this case, the matrix \f$ A \f$ should be
+ * selfadjoint and the matrix \f$ B \f$ should be positive definite.
+ *
+ * Only the \b lower \b triangular \b part of the input matrix is referenced.
+ *
+ * Call the function compute() to compute the eigenvalues and eigenvectors of
+ * a given matrix. Alternatively, you can use the
+ * GeneralizedSelfAdjointEigenSolver(const MatrixType&, const MatrixType&, int)
+ * constructor which computes the eigenvalues and eigenvectors at construction time.
+ * Once the eigenvalue and eigenvectors are computed, they can be retrieved with the eigenvalues()
+ * and eigenvectors() functions.
+ *
+ * The documentation for GeneralizedSelfAdjointEigenSolver(const MatrixType&, const MatrixType&, int)
+ * contains an example of the typical use of this class.
+ *
+ * \sa class SelfAdjointEigenSolver, class EigenSolver, class ComplexEigenSolver
+ */
+template <typename MatrixType_>
+class GeneralizedSelfAdjointEigenSolver : public SelfAdjointEigenSolver<MatrixType_> {
+  typedef SelfAdjointEigenSolver<MatrixType_> Base;
+
+ public:
+  typedef MatrixType_ MatrixType;
+
+  /** \brief Default constructor for fixed-size matrices.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via compute(). This constructor
+   * can only be used if \p MatrixType_ is a fixed-size matrix; use
+   * GeneralizedSelfAdjointEigenSolver(Index) for dynamic-size matrices.
+   */
+  GeneralizedSelfAdjointEigenSolver() : Base() {}
+
+  /** \brief Constructor, pre-allocates memory for dynamic-size matrices.
+   *
+   * \param [in]  size  Positive integer, size of the matrix whose
+   * eigenvalues and eigenvectors will be computed.
+   *
+   * This constructor is useful for dynamic-size matrices, when the user
+   * intends to perform decompositions via compute(). The \p size
+   * parameter is only used as a hint. It is not an error to give a wrong
+   * \p size, but it may impair performance.
+   *
+   * \sa compute() for an example
+   */
+  explicit GeneralizedSelfAdjointEigenSolver(Index size) : Base(size) {}
+
+  /** \brief Constructor; computes generalized eigendecomposition of given matrix pencil.
+   *
+   * \param[in]  matA  Selfadjoint matrix in matrix pencil.
+   *                   Only the lower triangular part of the matrix is referenced.
+   * \param[in]  matB  Positive-definite matrix in matrix pencil.
+   *                   Only the lower triangular part of the matrix is referenced.
+   * \param[in]  options A or-ed set of flags {#ComputeEigenvectors,#EigenvaluesOnly} | {#Ax_lBx,#ABx_lx,#BAx_lx}.
+   *                     Default is #ComputeEigenvectors|#Ax_lBx.
+   *
+   * This constructor calls compute(const MatrixType&, const MatrixType&, int)
+   * to compute the eigenvalues and (if requested) the eigenvectors of the
+   * generalized eigenproblem \f$ Ax = \lambda B x \f$ with \a matA the
+   * selfadjoint matrix \f$ A \f$ and \a matB the positive definite matrix
+   * \f$ B \f$. Each eigenvector \f$ x \f$ satisfies the property
+   * \f$ x^* B x = 1 \f$. The eigenvectors are computed if
+   * \a options contains ComputeEigenvectors.
+   *
+   * In addition, the two following variants can be solved via \p options:
+   * - \c ABx_lx: \f$ ABx = \lambda x \f$
+   * - \c BAx_lx: \f$ BAx = \lambda x \f$
+   *
+   * Example: \include SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType2.cpp
+   * Output: \verbinclude SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType2.out
+   *
+   * \sa compute(const MatrixType&, const MatrixType&, int)
+   */
+  GeneralizedSelfAdjointEigenSolver(const MatrixType& matA, const MatrixType& matB,
+                                    int options = ComputeEigenvectors | Ax_lBx)
+      : Base(matA.cols()) {
+    compute(matA, matB, options);
+  }
 
+  /** \brief Computes generalized eigendecomposition of given matrix pencil.
+   *
+   * \param[in]  matA  Selfadjoint matrix in matrix pencil.
+   *                   Only the lower triangular part of the matrix is referenced.
+   * \param[in]  matB  Positive-definite matrix in matrix pencil.
+   *                   Only the lower triangular part of the matrix is referenced.
+   * \param[in]  options A or-ed set of flags {#ComputeEigenvectors,#EigenvaluesOnly} | {#Ax_lBx,#ABx_lx,#BAx_lx}.
+   *                     Default is #ComputeEigenvectors|#Ax_lBx.
+   *
+   * \returns    Reference to \c *this
+   *
+   * According to \p options, this function computes eigenvalues and (if requested)
+   * the eigenvectors of one of the following three generalized eigenproblems:
+   * - \c Ax_lBx: \f$ Ax = \lambda B x \f$
+   * - \c ABx_lx: \f$ ABx = \lambda x \f$
+   * - \c BAx_lx: \f$ BAx = \lambda x \f$
+   * with \a matA the selfadjoint matrix \f$ A \f$ and \a matB the positive definite
+   * matrix \f$ B \f$.
+   * In addition, each eigenvector \f$ x \f$ satisfies the property \f$ x^* B x = 1 \f$.
+   *
+   * The eigenvalues() function can be used to retrieve
+   * the eigenvalues. If \p options contains ComputeEigenvectors, then the
+   * eigenvectors are also computed and can be retrieved by calling
+   * eigenvectors().
+   *
+   * The implementation uses LLT to compute the Cholesky decomposition
+   * \f$ B = LL^* \f$ and computes the classical eigendecomposition
+   * of the selfadjoint matrix \f$ L^{-1} A (L^*)^{-1} \f$ if \p options contains Ax_lBx
+   * and of \f$ L^{*} A L \f$ otherwise. This solves the
+   * generalized eigenproblem, because any solution of the generalized
+   * eigenproblem \f$ Ax = \lambda B x \f$ corresponds to a solution
+   * \f$ L^{-1} A (L^*)^{-1} (L^* x) = \lambda (L^* x) \f$ of the
+   * eigenproblem for \f$ L^{-1} A (L^*)^{-1} \f$. Similar statements
+   * can be made for the two other variants.
+   *
+   * Example: \include SelfAdjointEigenSolver_compute_MatrixType2.cpp
+   * Output: \verbinclude SelfAdjointEigenSolver_compute_MatrixType2.out
+   *
+   * \sa GeneralizedSelfAdjointEigenSolver(const MatrixType&, const MatrixType&, int)
+   */
+  GeneralizedSelfAdjointEigenSolver& compute(const MatrixType& matA, const MatrixType& matB,
+                                             int options = ComputeEigenvectors | Ax_lBx);
+
+ protected:
 };
 
+template <typename MatrixType>
+GeneralizedSelfAdjointEigenSolver<MatrixType>& GeneralizedSelfAdjointEigenSolver<MatrixType>::compute(
+    const MatrixType& matA, const MatrixType& matB, int options) {
+  eigen_assert(matA.cols() == matA.rows() && matB.rows() == matA.rows() && matB.cols() == matB.rows());
+  eigen_assert((options & ~(EigVecMask | GenEigMask)) == 0 && (options & EigVecMask) != EigVecMask &&
+               ((options & GenEigMask) == 0 || (options & GenEigMask) == Ax_lBx || (options & GenEigMask) == ABx_lx ||
+                (options & GenEigMask) == BAx_lx) &&
+               "invalid option parameter");
 
-template<typename MatrixType>
-GeneralizedSelfAdjointEigenSolver<MatrixType>& GeneralizedSelfAdjointEigenSolver<MatrixType>::
-compute(const MatrixType& matA, const MatrixType& matB, int options)
-{
-  eigen_assert(matA.cols()==matA.rows() && matB.rows()==matA.rows() && matB.cols()==matB.rows());
-  eigen_assert((options&~(EigVecMask|GenEigMask))==0
-          && (options&EigVecMask)!=EigVecMask
-          && ((options&GenEigMask)==0 || (options&GenEigMask)==Ax_lBx
-           || (options&GenEigMask)==ABx_lx || (options&GenEigMask)==BAx_lx)
-          && "invalid option parameter");
-
-  bool computeEigVecs = ((options&EigVecMask)==0) || ((options&EigVecMask)==ComputeEigenvectors);
+  bool computeEigVecs = ((options & EigVecMask) == 0) || ((options & EigVecMask) == ComputeEigenvectors);
 
   // Compute the cholesky decomposition of matB = L L' = U'U
   LLT<MatrixType> cholB(matB);
 
-  int type = (options&GenEigMask);
-  if(type==0)
-    type = Ax_lBx;
+  int type = (options & GenEigMask);
+  if (type == 0) type = Ax_lBx;
 
-  if(type==Ax_lBx)
-  {
+  if (type == Ax_lBx) {
     // compute C = inv(L) A inv(L')
     MatrixType matC = matA.template selfadjointView<Lower>();
     cholB.matrixL().template solveInPlace<OnTheLeft>(matC);
     cholB.matrixU().template solveInPlace<OnTheRight>(matC);
 
-    Base::compute(matC, computeEigVecs ? ComputeEigenvectors : EigenvaluesOnly );
+    Base::compute(matC, computeEigVecs ? ComputeEigenvectors : EigenvaluesOnly);
 
     // transform back the eigen vectors: evecs = inv(U) * evecs
-    if(computeEigVecs)
-      cholB.matrixU().solveInPlace(Base::m_eivec);
-  }
-  else if(type==ABx_lx)
-  {
+    if (computeEigVecs) cholB.matrixU().solveInPlace(Base::m_eivec);
+  } else if (type == ABx_lx) {
     // compute C = L' A L
     MatrixType matC = matA.template selfadjointView<Lower>();
     matC = matC * cholB.matrixL();
@@ -202,11 +191,8 @@ compute(const MatrixType& matA, const MatrixType& matB, int options)
     Base::compute(matC, computeEigVecs ? ComputeEigenvectors : EigenvaluesOnly);
 
     // transform back the eigen vectors: evecs = inv(U) * evecs
-    if(computeEigVecs)
-      cholB.matrixU().solveInPlace(Base::m_eivec);
-  }
-  else if(type==BAx_lx)
-  {
+    if (computeEigVecs) cholB.matrixU().solveInPlace(Base::m_eivec);
+  } else if (type == BAx_lx) {
     // compute C = L' A L
     MatrixType matC = matA.template selfadjointView<Lower>();
     matC = matC * cholB.matrixL();
@@ -215,13 +201,12 @@ compute(const MatrixType& matA, const MatrixType& matB, int options)
     Base::compute(matC, computeEigVecs ? ComputeEigenvectors : EigenvaluesOnly);
 
     // transform back the eigen vectors: evecs = L * evecs
-    if(computeEigVecs)
-      Base::m_eivec = cholB.matrixL() * Base::m_eivec;
+    if (computeEigVecs) Base::m_eivec = cholB.matrixL() * Base::m_eivec;
   }
 
   return *this;
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_GENERALIZEDSELFADJOINTEIGENSOLVER_H
+#endif  // EIGEN_GENERALIZEDSELFADJOINTEIGENSOLVER_H
diff --git a/inst/include/Eigen/src/Eigenvalues/HessenbergDecomposition.h b/inst/include/Eigen/src/Eigenvalues/HessenbergDecomposition.h
index 3db0c010..f79ee331 100644
--- a/inst/include/Eigen/src/Eigenvalues/HessenbergDecomposition.h
+++ b/inst/include/Eigen/src/Eigenvalues/HessenbergDecomposition.h
@@ -11,297 +11,283 @@
 #ifndef EIGEN_HESSENBERGDECOMPOSITION_H
 #define EIGEN_HESSENBERGDECOMPOSITION_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
-  
-template<typename MatrixType> struct HessenbergDecompositionMatrixHReturnType;
-template<typename MatrixType>
-struct traits<HessenbergDecompositionMatrixHReturnType<MatrixType> >
-{
+
+template <typename MatrixType>
+struct HessenbergDecompositionMatrixHReturnType;
+template <typename MatrixType>
+struct traits<HessenbergDecompositionMatrixHReturnType<MatrixType>> {
   typedef MatrixType ReturnType;
 };
 
-}
+}  // namespace internal
 
 /** \eigenvalues_module \ingroup Eigenvalues_Module
-  *
-  *
-  * \class HessenbergDecomposition
-  *
-  * \brief Reduces a square matrix to Hessenberg form by an orthogonal similarity transformation
-  *
-  * \tparam _MatrixType the type of the matrix of which we are computing the Hessenberg decomposition
-  *
-  * This class performs an Hessenberg decomposition of a matrix \f$ A \f$. In
-  * the real case, the Hessenberg decomposition consists of an orthogonal
-  * matrix \f$ Q \f$ and a Hessenberg matrix \f$ H \f$ such that \f$ A = Q H
-  * Q^T \f$. An orthogonal matrix is a matrix whose inverse equals its
-  * transpose (\f$ Q^{-1} = Q^T \f$). A Hessenberg matrix has zeros below the
-  * subdiagonal, so it is almost upper triangular. The Hessenberg decomposition
-  * of a complex matrix is \f$ A = Q H Q^* \f$ with \f$ Q \f$ unitary (that is,
-  * \f$ Q^{-1} = Q^* \f$).
-  *
-  * Call the function compute() to compute the Hessenberg decomposition of a
-  * given matrix. Alternatively, you can use the
-  * HessenbergDecomposition(const MatrixType&) constructor which computes the
-  * Hessenberg decomposition at construction time. Once the decomposition is
-  * computed, you can use the matrixH() and matrixQ() functions to construct
-  * the matrices H and Q in the decomposition.
-  *
-  * The documentation for matrixH() contains an example of the typical use of
-  * this class.
-  *
-  * \sa class ComplexSchur, class Tridiagonalization, \ref QR_Module "QR Module"
-  */
-template<typename _MatrixType> class HessenbergDecomposition
-{
-  public:
-
-    /** \brief Synonym for the template parameter \p _MatrixType. */
-    typedef _MatrixType MatrixType;
-
-    enum {
-      Size = MatrixType::RowsAtCompileTime,
-      SizeMinusOne = Size == Dynamic ? Dynamic : Size - 1,
-      Options = MatrixType::Options,
-      MaxSize = MatrixType::MaxRowsAtCompileTime,
-      MaxSizeMinusOne = MaxSize == Dynamic ? Dynamic : MaxSize - 1
-    };
-
-    /** \brief Scalar type for matrices of type #MatrixType. */
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
-
-    /** \brief Type for vector of Householder coefficients.
-      *
-      * This is column vector with entries of type #Scalar. The length of the
-      * vector is one less than the size of #MatrixType, if it is a fixed-side
-      * type.
-      */
-    typedef Matrix<Scalar, SizeMinusOne, 1, Options & ~RowMajor, MaxSizeMinusOne, 1> CoeffVectorType;
-
-    /** \brief Return type of matrixQ() */
-    typedef HouseholderSequence<MatrixType,typename internal::remove_all<typename CoeffVectorType::ConjugateReturnType>::type> HouseholderSequenceType;
-    
-    typedef internal::HessenbergDecompositionMatrixHReturnType<MatrixType> MatrixHReturnType;
-
-    /** \brief Default constructor; the decomposition will be computed later.
-      *
-      * \param [in] size  The size of the matrix whose Hessenberg decomposition will be computed.
-      *
-      * The default constructor is useful in cases in which the user intends to
-      * perform decompositions via compute().  The \p size parameter is only
-      * used as a hint. It is not an error to give a wrong \p size, but it may
-      * impair performance.
-      *
-      * \sa compute() for an example.
-      */
-    HessenbergDecomposition(Index size = Size==Dynamic ? 2 : Size)
-      : m_matrix(size,size),
-        m_temp(size),
-        m_isInitialized(false)
-    {
-      if(size>1)
-        m_hCoeffs.resize(size-1);
-    }
+ *
+ *
+ * \class HessenbergDecomposition
+ *
+ * \brief Reduces a square matrix to Hessenberg form by an orthogonal similarity transformation
+ *
+ * \tparam MatrixType_ the type of the matrix of which we are computing the Hessenberg decomposition
+ *
+ * This class performs an Hessenberg decomposition of a matrix \f$ A \f$. In
+ * the real case, the Hessenberg decomposition consists of an orthogonal
+ * matrix \f$ Q \f$ and a Hessenberg matrix \f$ H \f$ such that \f$ A = Q H
+ * Q^T \f$. An orthogonal matrix is a matrix whose inverse equals its
+ * transpose (\f$ Q^{-1} = Q^T \f$). A Hessenberg matrix has zeros below the
+ * subdiagonal, so it is almost upper triangular. The Hessenberg decomposition
+ * of a complex matrix is \f$ A = Q H Q^* \f$ with \f$ Q \f$ unitary (that is,
+ * \f$ Q^{-1} = Q^* \f$).
+ *
+ * Call the function compute() to compute the Hessenberg decomposition of a
+ * given matrix. Alternatively, you can use the
+ * HessenbergDecomposition(const MatrixType&) constructor which computes the
+ * Hessenberg decomposition at construction time. Once the decomposition is
+ * computed, you can use the matrixH() and matrixQ() functions to construct
+ * the matrices H and Q in the decomposition.
+ *
+ * The documentation for matrixH() contains an example of the typical use of
+ * this class.
+ *
+ * \sa class ComplexSchur, class Tridiagonalization, \ref QR_Module "QR Module"
+ */
+template <typename MatrixType_>
+class HessenbergDecomposition {
+ public:
+  /** \brief Synonym for the template parameter \p MatrixType_. */
+  typedef MatrixType_ MatrixType;
+
+  enum {
+    Size = MatrixType::RowsAtCompileTime,
+    SizeMinusOne = Size == Dynamic ? Dynamic : Size - 1,
+    Options = internal::traits<MatrixType>::Options,
+    MaxSize = MatrixType::MaxRowsAtCompileTime,
+    MaxSizeMinusOne = MaxSize == Dynamic ? Dynamic : MaxSize - 1
+  };
+
+  /** \brief Scalar type for matrices of type #MatrixType. */
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
+
+  /** \brief Type for vector of Householder coefficients.
+   *
+   * This is column vector with entries of type #Scalar. The length of the
+   * vector is one less than the size of #MatrixType, if it is a fixed-side
+   * type.
+   */
+  typedef Matrix<Scalar, SizeMinusOne, 1, Options & ~RowMajor, MaxSizeMinusOne, 1> CoeffVectorType;
+
+  /** \brief Return type of matrixQ() */
+  typedef HouseholderSequence<MatrixType, internal::remove_all_t<typename CoeffVectorType::ConjugateReturnType>>
+      HouseholderSequenceType;
+
+  typedef internal::HessenbergDecompositionMatrixHReturnType<MatrixType> MatrixHReturnType;
+
+  /** \brief Default constructor; the decomposition will be computed later.
+   *
+   * \param [in] size  The size of the matrix whose Hessenberg decomposition will be computed.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via compute().  The \p size parameter is only
+   * used as a hint. It is not an error to give a wrong \p size, but it may
+   * impair performance.
+   *
+   * \sa compute() for an example.
+   */
+  explicit HessenbergDecomposition(Index size = Size == Dynamic ? 2 : Size)
+      : m_matrix(size, size), m_temp(size), m_isInitialized(false) {
+    if (size > 1) m_hCoeffs.resize(size - 1);
+  }
 
-    /** \brief Constructor; computes Hessenberg decomposition of given matrix.
-      *
-      * \param[in]  matrix  Square matrix whose Hessenberg decomposition is to be computed.
-      *
-      * This constructor calls compute() to compute the Hessenberg
-      * decomposition.
-      *
-      * \sa matrixH() for an example.
-      */
-    HessenbergDecomposition(const MatrixType& matrix)
-      : m_matrix(matrix),
-        m_temp(matrix.rows()),
-        m_isInitialized(false)
-    {
-      if(matrix.rows()<2)
-      {
-        m_isInitialized = true;
-        return;
-      }
-      m_hCoeffs.resize(matrix.rows()-1,1);
-      _compute(m_matrix, m_hCoeffs, m_temp);
+  /** \brief Constructor; computes Hessenberg decomposition of given matrix.
+   *
+   * \param[in]  matrix  Square matrix whose Hessenberg decomposition is to be computed.
+   *
+   * This constructor calls compute() to compute the Hessenberg
+   * decomposition.
+   *
+   * \sa matrixH() for an example.
+   */
+  template <typename InputType>
+  explicit HessenbergDecomposition(const EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()), m_temp(matrix.rows()), m_isInitialized(false) {
+    if (matrix.rows() < 2) {
       m_isInitialized = true;
+      return;
     }
+    m_hCoeffs.resize(matrix.rows() - 1, 1);
+    _compute(m_matrix, m_hCoeffs, m_temp);
+    m_isInitialized = true;
+  }
 
-    /** \brief Computes Hessenberg decomposition of given matrix.
-      *
-      * \param[in]  matrix  Square matrix whose Hessenberg decomposition is to be computed.
-      * \returns    Reference to \c *this
-      *
-      * The Hessenberg decomposition is computed by bringing the columns of the
-      * matrix successively in the required form using Householder reflections
-      * (see, e.g., Algorithm 7.4.2 in Golub \& Van Loan, <i>%Matrix
-      * Computations</i>). The cost is \f$ 10n^3/3 \f$ flops, where \f$ n \f$
-      * denotes the size of the given matrix.
-      *
-      * This method reuses of the allocated data in the HessenbergDecomposition
-      * object.
-      *
-      * Example: \include HessenbergDecomposition_compute.cpp
-      * Output: \verbinclude HessenbergDecomposition_compute.out
-      */
-    HessenbergDecomposition& compute(const MatrixType& matrix)
-    {
-      m_matrix = matrix;
-      if(matrix.rows()<2)
-      {
-        m_isInitialized = true;
-        return *this;
-      }
-      m_hCoeffs.resize(matrix.rows()-1,1);
-      _compute(m_matrix, m_hCoeffs, m_temp);
+  /** \brief Computes Hessenberg decomposition of given matrix.
+   *
+   * \param[in]  matrix  Square matrix whose Hessenberg decomposition is to be computed.
+   * \returns    Reference to \c *this
+   *
+   * The Hessenberg decomposition is computed by bringing the columns of the
+   * matrix successively in the required form using Householder reflections
+   * (see, e.g., Algorithm 7.4.2 in Golub \& Van Loan, <i>%Matrix
+   * Computations</i>). The cost is \f$ 10n^3/3 \f$ flops, where \f$ n \f$
+   * denotes the size of the given matrix.
+   *
+   * This method reuses of the allocated data in the HessenbergDecomposition
+   * object.
+   *
+   * Example: \include HessenbergDecomposition_compute.cpp
+   * Output: \verbinclude HessenbergDecomposition_compute.out
+   */
+  template <typename InputType>
+  HessenbergDecomposition& compute(const EigenBase<InputType>& matrix) {
+    m_matrix = matrix.derived();
+    if (matrix.rows() < 2) {
       m_isInitialized = true;
       return *this;
     }
+    m_hCoeffs.resize(matrix.rows() - 1, 1);
+    _compute(m_matrix, m_hCoeffs, m_temp);
+    m_isInitialized = true;
+    return *this;
+  }
 
-    /** \brief Returns the Householder coefficients.
-      *
-      * \returns a const reference to the vector of Householder coefficients
-      *
-      * \pre Either the constructor HessenbergDecomposition(const MatrixType&)
-      * or the member function compute(const MatrixType&) has been called
-      * before to compute the Hessenberg decomposition of a matrix.
-      *
-      * The Householder coefficients allow the reconstruction of the matrix
-      * \f$ Q \f$ in the Hessenberg decomposition from the packed data.
-      *
-      * \sa packedMatrix(), \ref Householder_Module "Householder module"
-      */
-    const CoeffVectorType& householderCoefficients() const
-    {
-      eigen_assert(m_isInitialized && "HessenbergDecomposition is not initialized.");
-      return m_hCoeffs;
-    }
-
-    /** \brief Returns the internal representation of the decomposition
-      *
-      *	\returns a const reference to a matrix with the internal representation
-      *	         of the decomposition.
-      *
-      * \pre Either the constructor HessenbergDecomposition(const MatrixType&)
-      * or the member function compute(const MatrixType&) has been called
-      * before to compute the Hessenberg decomposition of a matrix.
-      *
-      * The returned matrix contains the following information:
-      *  - the upper part and lower sub-diagonal represent the Hessenberg matrix H
-      *  - the rest of the lower part contains the Householder vectors that, combined with
-      *    Householder coefficients returned by householderCoefficients(),
-      *    allows to reconstruct the matrix Q as
-      *       \f$ Q = H_{N-1} \ldots H_1 H_0 \f$.
-      *    Here, the matrices \f$ H_i \f$ are the Householder transformations
-      *       \f$ H_i = (I - h_i v_i v_i^T) \f$
-      *    where \f$ h_i \f$ is the \f$ i \f$th Householder coefficient and
-      *    \f$ v_i \f$ is the Householder vector defined by
-      *       \f$ v_i = [ 0, \ldots, 0, 1, M(i+2,i), \ldots, M(N-1,i) ]^T \f$
-      *    with M the matrix returned by this function.
-      *
-      * See LAPACK for further details on this packed storage.
-      *
-      * Example: \include HessenbergDecomposition_packedMatrix.cpp
-      * Output: \verbinclude HessenbergDecomposition_packedMatrix.out
-      *
-      * \sa householderCoefficients()
-      */
-    const MatrixType& packedMatrix() const
-    {
-      eigen_assert(m_isInitialized && "HessenbergDecomposition is not initialized.");
-      return m_matrix;
-    }
+  /** \brief Returns the Householder coefficients.
+   *
+   * \returns a const reference to the vector of Householder coefficients
+   *
+   * \pre Either the constructor HessenbergDecomposition(const MatrixType&)
+   * or the member function compute(const MatrixType&) has been called
+   * before to compute the Hessenberg decomposition of a matrix.
+   *
+   * The Householder coefficients allow the reconstruction of the matrix
+   * \f$ Q \f$ in the Hessenberg decomposition from the packed data.
+   *
+   * \sa packedMatrix(), \ref Householder_Module "Householder module"
+   */
+  const CoeffVectorType& householderCoefficients() const {
+    eigen_assert(m_isInitialized && "HessenbergDecomposition is not initialized.");
+    return m_hCoeffs;
+  }
 
-    /** \brief Reconstructs the orthogonal matrix Q in the decomposition
-      *
-      * \returns object representing the matrix Q
-      *
-      * \pre Either the constructor HessenbergDecomposition(const MatrixType&)
-      * or the member function compute(const MatrixType&) has been called
-      * before to compute the Hessenberg decomposition of a matrix.
-      *
-      * This function returns a light-weight object of template class
-      * HouseholderSequence. You can either apply it directly to a matrix or
-      * you can convert it to a matrix of type #MatrixType.
-      *
-      * \sa matrixH() for an example, class HouseholderSequence
-      */
-    HouseholderSequenceType matrixQ() const
-    {
-      eigen_assert(m_isInitialized && "HessenbergDecomposition is not initialized.");
-      return HouseholderSequenceType(m_matrix, m_hCoeffs.conjugate())
-             .setLength(m_matrix.rows() - 1)
-             .setShift(1);
-    }
+  /** \brief Returns the internal representation of the decomposition
+   *
+   *	\returns a const reference to a matrix with the internal representation
+   *	         of the decomposition.
+   *
+   * \pre Either the constructor HessenbergDecomposition(const MatrixType&)
+   * or the member function compute(const MatrixType&) has been called
+   * before to compute the Hessenberg decomposition of a matrix.
+   *
+   * The returned matrix contains the following information:
+   *  - the upper part and lower sub-diagonal represent the Hessenberg matrix H
+   *  - the rest of the lower part contains the Householder vectors that, combined with
+   *    Householder coefficients returned by householderCoefficients(),
+   *    allows to reconstruct the matrix Q as
+   *       \f$ Q = H_{N-1} \ldots H_1 H_0 \f$.
+   *    Here, the matrices \f$ H_i \f$ are the Householder transformations
+   *       \f$ H_i = (I - h_i v_i v_i^T) \f$
+   *    where \f$ h_i \f$ is the \f$ i \f$th Householder coefficient and
+   *    \f$ v_i \f$ is the Householder vector defined by
+   *       \f$ v_i = [ 0, \ldots, 0, 1, M(i+2,i), \ldots, M(N-1,i) ]^T \f$
+   *    with M the matrix returned by this function.
+   *
+   * See LAPACK for further details on this packed storage.
+   *
+   * Example: \include HessenbergDecomposition_packedMatrix.cpp
+   * Output: \verbinclude HessenbergDecomposition_packedMatrix.out
+   *
+   * \sa householderCoefficients()
+   */
+  const MatrixType& packedMatrix() const {
+    eigen_assert(m_isInitialized && "HessenbergDecomposition is not initialized.");
+    return m_matrix;
+  }
 
-    /** \brief Constructs the Hessenberg matrix H in the decomposition
-      *
-      * \returns expression object representing the matrix H
-      *
-      * \pre Either the constructor HessenbergDecomposition(const MatrixType&)
-      * or the member function compute(const MatrixType&) has been called
-      * before to compute the Hessenberg decomposition of a matrix.
-      *
-      * The object returned by this function constructs the Hessenberg matrix H
-      * when it is assigned to a matrix or otherwise evaluated. The matrix H is
-      * constructed from the packed matrix as returned by packedMatrix(): The
-      * upper part (including the subdiagonal) of the packed matrix contains
-      * the matrix H. It may sometimes be better to directly use the packed
-      * matrix instead of constructing the matrix H.
-      *
-      * Example: \include HessenbergDecomposition_matrixH.cpp
-      * Output: \verbinclude HessenbergDecomposition_matrixH.out
-      *
-      * \sa matrixQ(), packedMatrix()
-      */
-    MatrixHReturnType matrixH() const
-    {
-      eigen_assert(m_isInitialized && "HessenbergDecomposition is not initialized.");
-      return MatrixHReturnType(*this);
-    }
+  /** \brief Reconstructs the orthogonal matrix Q in the decomposition
+   *
+   * \returns object representing the matrix Q
+   *
+   * \pre Either the constructor HessenbergDecomposition(const MatrixType&)
+   * or the member function compute(const MatrixType&) has been called
+   * before to compute the Hessenberg decomposition of a matrix.
+   *
+   * This function returns a light-weight object of template class
+   * HouseholderSequence. You can either apply it directly to a matrix or
+   * you can convert it to a matrix of type #MatrixType.
+   *
+   * \sa matrixH() for an example, class HouseholderSequence
+   */
+  HouseholderSequenceType matrixQ() const {
+    eigen_assert(m_isInitialized && "HessenbergDecomposition is not initialized.");
+    return HouseholderSequenceType(m_matrix, m_hCoeffs.conjugate()).setLength(m_matrix.rows() - 1).setShift(1);
+  }
 
-  private:
+  /** \brief Constructs the Hessenberg matrix H in the decomposition
+   *
+   * \returns expression object representing the matrix H
+   *
+   * \pre Either the constructor HessenbergDecomposition(const MatrixType&)
+   * or the member function compute(const MatrixType&) has been called
+   * before to compute the Hessenberg decomposition of a matrix.
+   *
+   * The object returned by this function constructs the Hessenberg matrix H
+   * when it is assigned to a matrix or otherwise evaluated. The matrix H is
+   * constructed from the packed matrix as returned by packedMatrix(): The
+   * upper part (including the subdiagonal) of the packed matrix contains
+   * the matrix H. It may sometimes be better to directly use the packed
+   * matrix instead of constructing the matrix H.
+   *
+   * Example: \include HessenbergDecomposition_matrixH.cpp
+   * Output: \verbinclude HessenbergDecomposition_matrixH.out
+   *
+   * \sa matrixQ(), packedMatrix()
+   */
+  MatrixHReturnType matrixH() const {
+    eigen_assert(m_isInitialized && "HessenbergDecomposition is not initialized.");
+    return MatrixHReturnType(*this);
+  }
 
-    typedef Matrix<Scalar, 1, Size, Options | RowMajor, 1, MaxSize> VectorType;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    static void _compute(MatrixType& matA, CoeffVectorType& hCoeffs, VectorType& temp);
+ private:
+  typedef Matrix<Scalar, 1, Size, int(Options) | int(RowMajor), 1, MaxSize> VectorType;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static void _compute(MatrixType& matA, CoeffVectorType& hCoeffs, VectorType& temp);
 
-  protected:
-    MatrixType m_matrix;
-    CoeffVectorType m_hCoeffs;
-    VectorType m_temp;
-    bool m_isInitialized;
+ protected:
+  MatrixType m_matrix;
+  CoeffVectorType m_hCoeffs;
+  VectorType m_temp;
+  bool m_isInitialized;
 };
 
 /** \internal
-  * Performs a tridiagonal decomposition of \a matA in place.
-  *
-  * \param matA the input selfadjoint matrix
-  * \param hCoeffs returned Householder coefficients
-  *
-  * The result is written in the lower triangular part of \a matA.
-  *
-  * Implemented from Golub's "%Matrix Computations", algorithm 8.3.1.
-  *
-  * \sa packedMatrix()
-  */
-template<typename MatrixType>
-void HessenbergDecomposition<MatrixType>::_compute(MatrixType& matA, CoeffVectorType& hCoeffs, VectorType& temp)
-{
-  eigen_assert(matA.rows()==matA.cols());
+ * Performs a tridiagonal decomposition of \a matA in place.
+ *
+ * \param matA the input selfadjoint matrix
+ * \param hCoeffs returned Householder coefficients
+ *
+ * The result is written in the lower triangular part of \a matA.
+ *
+ * Implemented from Golub's "%Matrix Computations", algorithm 8.3.1.
+ *
+ * \sa packedMatrix()
+ */
+template <typename MatrixType>
+void HessenbergDecomposition<MatrixType>::_compute(MatrixType& matA, CoeffVectorType& hCoeffs, VectorType& temp) {
+  eigen_assert(matA.rows() == matA.cols());
   Index n = matA.rows();
   temp.resize(n);
-  for (Index i = 0; i<n-1; ++i)
-  {
+  for (Index i = 0; i < n - 1; ++i) {
     // let's consider the vector v = i-th column starting at position i+1
-    Index remainingSize = n-i-1;
+    Index remainingSize = n - i - 1;
     RealScalar beta;
     Scalar h;
     matA.col(i).tail(remainingSize).makeHouseholderInPlace(h, beta);
-    matA.col(i).coeffRef(i+1) = beta;
+    matA.col(i).coeffRef(i + 1) = beta;
     hCoeffs.coeffRef(i) = h;
 
     // Apply similarity transformation to remaining columns,
@@ -309,65 +295,62 @@ void HessenbergDecomposition<MatrixType>::_compute(MatrixType& matA, CoeffVector
 
     // A = H A
     matA.bottomRightCorner(remainingSize, remainingSize)
-        .applyHouseholderOnTheLeft(matA.col(i).tail(remainingSize-1), h, &temp.coeffRef(0));
+        .applyHouseholderOnTheLeft(matA.col(i).tail(remainingSize - 1), h, &temp.coeffRef(0));
 
     // A = A H'
     matA.rightCols(remainingSize)
-        .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize-1).conjugate(), numext::conj(h), &temp.coeffRef(0));
+        .applyHouseholderOnTheRight(matA.col(i).tail(remainingSize - 1), numext::conj(h), &temp.coeffRef(0));
   }
 }
 
 namespace internal {
 
 /** \eigenvalues_module \ingroup Eigenvalues_Module
-  *
-  *
-  * \brief Expression type for return value of HessenbergDecomposition::matrixH()
-  *
-  * \tparam MatrixType type of matrix in the Hessenberg decomposition
-  *
-  * Objects of this type represent the Hessenberg matrix in the Hessenberg
-  * decomposition of some matrix. The object holds a reference to the
-  * HessenbergDecomposition class until the it is assigned or evaluated for
-  * some other reason (the reference should remain valid during the life time
-  * of this object). This class is the return type of
-  * HessenbergDecomposition::matrixH(); there is probably no other use for this
-  * class.
-  */
-template<typename MatrixType> struct HessenbergDecompositionMatrixHReturnType
-: public ReturnByValue<HessenbergDecompositionMatrixHReturnType<MatrixType> >
-{
-    typedef typename MatrixType::Index Index;
-  public:
-    /** \brief Constructor.
-      *
-      * \param[in] hess  Hessenberg decomposition
-      */
-    HessenbergDecompositionMatrixHReturnType(const HessenbergDecomposition<MatrixType>& hess) : m_hess(hess) { }
-
-    /** \brief Hessenberg matrix in decomposition.
-      *
-      * \param[out] result  Hessenberg matrix in decomposition \p hess which
-      *                     was passed to the constructor
-      */
-    template <typename ResultType>
-    inline void evalTo(ResultType& result) const
-    {
-      result = m_hess.packedMatrix();
-      Index n = result.rows();
-      if (n>2)
-        result.bottomLeftCorner(n-2, n-2).template triangularView<Lower>().setZero();
-    }
+ *
+ *
+ * \brief Expression type for return value of HessenbergDecomposition::matrixH()
+ *
+ * \tparam MatrixType type of matrix in the Hessenberg decomposition
+ *
+ * Objects of this type represent the Hessenberg matrix in the Hessenberg
+ * decomposition of some matrix. The object holds a reference to the
+ * HessenbergDecomposition class until the it is assigned or evaluated for
+ * some other reason (the reference should remain valid during the life time
+ * of this object). This class is the return type of
+ * HessenbergDecomposition::matrixH(); there is probably no other use for this
+ * class.
+ */
+template <typename MatrixType>
+struct HessenbergDecompositionMatrixHReturnType
+    : public ReturnByValue<HessenbergDecompositionMatrixHReturnType<MatrixType>> {
+ public:
+  /** \brief Constructor.
+   *
+   * \param[in] hess  Hessenberg decomposition
+   */
+  HessenbergDecompositionMatrixHReturnType(const HessenbergDecomposition<MatrixType>& hess) : m_hess(hess) {}
+
+  /** \brief Hessenberg matrix in decomposition.
+   *
+   * \param[out] result  Hessenberg matrix in decomposition \p hess which
+   *                     was passed to the constructor
+   */
+  template <typename ResultType>
+  inline void evalTo(ResultType& result) const {
+    result = m_hess.packedMatrix();
+    Index n = result.rows();
+    if (n > 2) result.bottomLeftCorner(n - 2, n - 2).template triangularView<Lower>().setZero();
+  }
 
-    Index rows() const { return m_hess.packedMatrix().rows(); }
-    Index cols() const { return m_hess.packedMatrix().cols(); }
+  Index rows() const { return m_hess.packedMatrix().rows(); }
+  Index cols() const { return m_hess.packedMatrix().cols(); }
 
-  protected:
-    const HessenbergDecomposition<MatrixType>& m_hess;
+ protected:
+  const HessenbergDecomposition<MatrixType>& m_hess;
 };
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_HESSENBERGDECOMPOSITION_H
+#endif  // EIGEN_HESSENBERGDECOMPOSITION_H
diff --git a/inst/include/Eigen/src/Eigenvalues/InternalHeaderCheck.h b/inst/include/Eigen/src/Eigenvalues/InternalHeaderCheck.h
new file mode 100644
index 00000000..374cbd45
--- /dev/null
+++ b/inst/include/Eigen/src/Eigenvalues/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_EIGENVALUES_MODULE_H
+#error "Please include Eigen/Eigenvalues instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/inst/include/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
index 4fec8af0..62227bdc 100644
--- a/inst/include/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
+++ b/inst/include/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h
@@ -11,150 +11,132 @@
 #ifndef EIGEN_MATRIXBASEEIGENVALUES_H
 #define EIGEN_MATRIXBASEEIGENVALUES_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
-template<typename Derived, bool IsComplex>
-struct eigenvalues_selector
-{
+template <typename Derived, bool IsComplex>
+struct eigenvalues_selector {
   // this is the implementation for the case IsComplex = true
-  static inline typename MatrixBase<Derived>::EigenvaluesReturnType const
-  run(const MatrixBase<Derived>& m)
-  {
+  static inline typename MatrixBase<Derived>::EigenvaluesReturnType const run(const MatrixBase<Derived>& m) {
     typedef typename Derived::PlainObject PlainObject;
     PlainObject m_eval(m);
     return ComplexEigenSolver<PlainObject>(m_eval, false).eigenvalues();
   }
 };
 
-template<typename Derived>
-struct eigenvalues_selector<Derived, false>
-{
-  static inline typename MatrixBase<Derived>::EigenvaluesReturnType const
-  run(const MatrixBase<Derived>& m)
-  {
+template <typename Derived>
+struct eigenvalues_selector<Derived, false> {
+  static inline typename MatrixBase<Derived>::EigenvaluesReturnType const run(const MatrixBase<Derived>& m) {
     typedef typename Derived::PlainObject PlainObject;
     PlainObject m_eval(m);
     return EigenSolver<PlainObject>(m_eval, false).eigenvalues();
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
-/** \brief Computes the eigenvalues of a matrix 
-  * \returns Column vector containing the eigenvalues.
-  *
-  * \eigenvalues_module
-  * This function computes the eigenvalues with the help of the EigenSolver
-  * class (for real matrices) or the ComplexEigenSolver class (for complex
-  * matrices). 
-  *
-  * The eigenvalues are repeated according to their algebraic multiplicity,
-  * so there are as many eigenvalues as rows in the matrix.
-  *
-  * The SelfAdjointView class provides a better algorithm for selfadjoint
-  * matrices.
-  *
-  * Example: \include MatrixBase_eigenvalues.cpp
-  * Output: \verbinclude MatrixBase_eigenvalues.out
-  *
-  * \sa EigenSolver::eigenvalues(), ComplexEigenSolver::eigenvalues(),
-  *     SelfAdjointView::eigenvalues()
-  */
-template<typename Derived>
-inline typename MatrixBase<Derived>::EigenvaluesReturnType
-MatrixBase<Derived>::eigenvalues() const
-{
-  typedef typename internal::traits<Derived>::Scalar Scalar;
+/** \brief Computes the eigenvalues of a matrix
+ * \returns Column vector containing the eigenvalues.
+ *
+ * \eigenvalues_module
+ * This function computes the eigenvalues with the help of the EigenSolver
+ * class (for real matrices) or the ComplexEigenSolver class (for complex
+ * matrices).
+ *
+ * The eigenvalues are repeated according to their algebraic multiplicity,
+ * so there are as many eigenvalues as rows in the matrix.
+ *
+ * The SelfAdjointView class provides a better algorithm for selfadjoint
+ * matrices.
+ *
+ * Example: \include MatrixBase_eigenvalues.cpp
+ * Output: \verbinclude MatrixBase_eigenvalues.out
+ *
+ * \sa EigenSolver::eigenvalues(), ComplexEigenSolver::eigenvalues(),
+ *     SelfAdjointView::eigenvalues()
+ */
+template <typename Derived>
+inline typename MatrixBase<Derived>::EigenvaluesReturnType MatrixBase<Derived>::eigenvalues() const {
   return internal::eigenvalues_selector<Derived, NumTraits<Scalar>::IsComplex>::run(derived());
 }
 
 /** \brief Computes the eigenvalues of a matrix
-  * \returns Column vector containing the eigenvalues.
-  *
-  * \eigenvalues_module
-  * This function computes the eigenvalues with the help of the
-  * SelfAdjointEigenSolver class.  The eigenvalues are repeated according to
-  * their algebraic multiplicity, so there are as many eigenvalues as rows in
-  * the matrix.
-  *
-  * Example: \include SelfAdjointView_eigenvalues.cpp
-  * Output: \verbinclude SelfAdjointView_eigenvalues.out
-  *
-  * \sa SelfAdjointEigenSolver::eigenvalues(), MatrixBase::eigenvalues()
-  */
-template<typename MatrixType, unsigned int UpLo> 
-inline typename SelfAdjointView<MatrixType, UpLo>::EigenvaluesReturnType
-SelfAdjointView<MatrixType, UpLo>::eigenvalues() const
-{
-  typedef typename SelfAdjointView<MatrixType, UpLo>::PlainObject PlainObject;
+ * \returns Column vector containing the eigenvalues.
+ *
+ * \eigenvalues_module
+ * This function computes the eigenvalues with the help of the
+ * SelfAdjointEigenSolver class.  The eigenvalues are repeated according to
+ * their algebraic multiplicity, so there are as many eigenvalues as rows in
+ * the matrix.
+ *
+ * Example: \include SelfAdjointView_eigenvalues.cpp
+ * Output: \verbinclude SelfAdjointView_eigenvalues.out
+ *
+ * \sa SelfAdjointEigenSolver::eigenvalues(), MatrixBase::eigenvalues()
+ */
+template <typename MatrixType, unsigned int UpLo>
+EIGEN_DEVICE_FUNC inline typename SelfAdjointView<MatrixType, UpLo>::EigenvaluesReturnType
+SelfAdjointView<MatrixType, UpLo>::eigenvalues() const {
   PlainObject thisAsMatrix(*this);
   return SelfAdjointEigenSolver<PlainObject>(thisAsMatrix, false).eigenvalues();
 }
 
-
-
 /** \brief Computes the L2 operator norm
-  * \returns Operator norm of the matrix.
-  *
-  * \eigenvalues_module
-  * This function computes the L2 operator norm of a matrix, which is also
-  * known as the spectral norm. The norm of a matrix \f$ A \f$ is defined to be
-  * \f[ \|A\|_2 = \max_x \frac{\|Ax\|_2}{\|x\|_2} \f]
-  * where the maximum is over all vectors and the norm on the right is the
-  * Euclidean vector norm. The norm equals the largest singular value, which is
-  * the square root of the largest eigenvalue of the positive semi-definite
-  * matrix \f$ A^*A \f$.
-  *
-  * The current implementation uses the eigenvalues of \f$ A^*A \f$, as computed
-  * by SelfAdjointView::eigenvalues(), to compute the operator norm of a
-  * matrix.  The SelfAdjointView class provides a better algorithm for
-  * selfadjoint matrices.
-  *
-  * Example: \include MatrixBase_operatorNorm.cpp
-  * Output: \verbinclude MatrixBase_operatorNorm.out
-  *
-  * \sa SelfAdjointView::eigenvalues(), SelfAdjointView::operatorNorm()
-  */
-template<typename Derived>
-inline typename MatrixBase<Derived>::RealScalar
-MatrixBase<Derived>::operatorNorm() const
-{
+ * \returns Operator norm of the matrix.
+ *
+ * \eigenvalues_module
+ * This function computes the L2 operator norm of a matrix, which is also
+ * known as the spectral norm. The norm of a matrix \f$ A \f$ is defined to be
+ * \f[ \|A\|_2 = \max_x \frac{\|Ax\|_2}{\|x\|_2} \f]
+ * where the maximum is over all vectors and the norm on the right is the
+ * Euclidean vector norm. The norm equals the largest singular value, which is
+ * the square root of the largest eigenvalue of the positive semi-definite
+ * matrix \f$ A^*A \f$.
+ *
+ * The current implementation uses the eigenvalues of \f$ A^*A \f$, as computed
+ * by SelfAdjointView::eigenvalues(), to compute the operator norm of a
+ * matrix.  The SelfAdjointView class provides a better algorithm for
+ * selfadjoint matrices.
+ *
+ * Example: \include MatrixBase_operatorNorm.cpp
+ * Output: \verbinclude MatrixBase_operatorNorm.out
+ *
+ * \sa SelfAdjointView::eigenvalues(), SelfAdjointView::operatorNorm()
+ */
+template <typename Derived>
+inline typename MatrixBase<Derived>::RealScalar MatrixBase<Derived>::operatorNorm() const {
   using std::sqrt;
   typename Derived::PlainObject m_eval(derived());
   // FIXME if it is really guaranteed that the eigenvalues are already sorted,
   // then we don't need to compute a maxCoeff() here, comparing the 1st and last ones is enough.
-  return sqrt((m_eval*m_eval.adjoint())
-                 .eval()
-		 .template selfadjointView<Lower>()
-		 .eigenvalues()
-		 .maxCoeff()
-		 );
+  return sqrt((m_eval * m_eval.adjoint()).eval().template selfadjointView<Lower>().eigenvalues().maxCoeff());
 }
 
 /** \brief Computes the L2 operator norm
-  * \returns Operator norm of the matrix.
-  *
-  * \eigenvalues_module
-  * This function computes the L2 operator norm of a self-adjoint matrix. For a
-  * self-adjoint matrix, the operator norm is the largest eigenvalue.
-  *
-  * The current implementation uses the eigenvalues of the matrix, as computed
-  * by eigenvalues(), to compute the operator norm of the matrix.
-  *
-  * Example: \include SelfAdjointView_operatorNorm.cpp
-  * Output: \verbinclude SelfAdjointView_operatorNorm.out
-  *
-  * \sa eigenvalues(), MatrixBase::operatorNorm()
-  */
-template<typename MatrixType, unsigned int UpLo>
-inline typename SelfAdjointView<MatrixType, UpLo>::RealScalar
-SelfAdjointView<MatrixType, UpLo>::operatorNorm() const
-{
+ * \returns Operator norm of the matrix.
+ *
+ * \eigenvalues_module
+ * This function computes the L2 operator norm of a self-adjoint matrix. For a
+ * self-adjoint matrix, the operator norm is the largest eigenvalue.
+ *
+ * The current implementation uses the eigenvalues of the matrix, as computed
+ * by eigenvalues(), to compute the operator norm of the matrix.
+ *
+ * Example: \include SelfAdjointView_operatorNorm.cpp
+ * Output: \verbinclude SelfAdjointView_operatorNorm.out
+ *
+ * \sa eigenvalues(), MatrixBase::operatorNorm()
+ */
+template <typename MatrixType, unsigned int UpLo>
+EIGEN_DEVICE_FUNC inline typename SelfAdjointView<MatrixType, UpLo>::RealScalar
+SelfAdjointView<MatrixType, UpLo>::operatorNorm() const {
   return eigenvalues().cwiseAbs().maxCoeff();
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
 #endif
diff --git a/inst/include/Eigen/src/Eigenvalues/RealQZ.h b/inst/include/Eigen/src/Eigenvalues/RealQZ.h
index aa3833eb..a54d82d4 100644
--- a/inst/include/Eigen/src/Eigenvalues/RealQZ.h
+++ b/inst/include/Eigen/src/Eigenvalues/RealQZ.h
@@ -10,615 +10,578 @@
 #ifndef EIGEN_REAL_QZ_H
 #define EIGEN_REAL_QZ_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
-  /** \eigenvalues_module \ingroup Eigenvalues_Module
-   *
-   *
-   * \class RealQZ
-   *
-   * \brief Performs a real QZ decomposition of a pair of square matrices
-   *
-   * \tparam _MatrixType the type of the matrix of which we are computing the
-   * real QZ decomposition; this is expected to be an instantiation of the
-   * Matrix class template.
-   *
-   * Given a real square matrices A and B, this class computes the real QZ
-   * decomposition: \f$ A = Q S Z \f$, \f$ B = Q T Z \f$ where Q and Z are
-   * real orthogonal matrixes, T is upper-triangular matrix, and S is upper
-   * quasi-triangular matrix. An orthogonal matrix is a matrix whose
-   * inverse is equal to its transpose, \f$ U^{-1} = U^T \f$. A quasi-triangular
-   * matrix is a block-triangular matrix whose diagonal consists of 1-by-1
-   * blocks and 2-by-2 blocks where further reduction is impossible due to
-   * complex eigenvalues. 
+/** \eigenvalues_module \ingroup Eigenvalues_Module
+ *
+ *
+ * \class RealQZ
+ *
+ * \brief Performs a real QZ decomposition of a pair of square matrices
+ *
+ * \tparam MatrixType_ the type of the matrix of which we are computing the
+ * real QZ decomposition; this is expected to be an instantiation of the
+ * Matrix class template.
+ *
+ * Given a real square matrices A and B, this class computes the real QZ
+ * decomposition: \f$ A = Q S Z \f$, \f$ B = Q T Z \f$ where Q and Z are
+ * real orthogonal matrixes, T is upper-triangular matrix, and S is upper
+ * quasi-triangular matrix. An orthogonal matrix is a matrix whose
+ * inverse is equal to its transpose, \f$ U^{-1} = U^T \f$. A quasi-triangular
+ * matrix is a block-triangular matrix whose diagonal consists of 1-by-1
+ * blocks and 2-by-2 blocks where further reduction is impossible due to
+ * complex eigenvalues.
+ *
+ * The eigenvalues of the pencil \f$ A - z B \f$ can be obtained from
+ * 1x1 and 2x2 blocks on the diagonals of S and T.
+ *
+ * Call the function compute() to compute the real QZ decomposition of a
+ * given pair of matrices. Alternatively, you can use the
+ * RealQZ(const MatrixType& B, const MatrixType& B, bool computeQZ)
+ * constructor which computes the real QZ decomposition at construction
+ * time. Once the decomposition is computed, you can use the matrixS(),
+ * matrixT(), matrixQ() and matrixZ() functions to retrieve the matrices
+ * S, T, Q and Z in the decomposition. If computeQZ==false, some time
+ * is saved by not computing matrices Q and Z.
+ *
+ * Example: \include RealQZ_compute.cpp
+ * Output: \include RealQZ_compute.out
+ *
+ * \note The implementation is based on the algorithm in "Matrix Computations"
+ * by Gene H. Golub and Charles F. Van Loan, and a paper "An algorithm for
+ * generalized eigenvalue problems" by C.B.Moler and G.W.Stewart.
+ *
+ * \sa class RealSchur, class ComplexSchur, class EigenSolver, class ComplexEigenSolver
+ */
+
+template <typename MatrixType_>
+class RealQZ {
+ public:
+  typedef MatrixType_ MatrixType;
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    Options = internal::traits<MatrixType>::Options,
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+  typedef typename MatrixType::Scalar Scalar;
+  typedef internal::make_complex_t<Scalar> ComplexScalar;
+  typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
+
+  typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> EigenvalueType;
+  typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
+
+  /** \brief Default constructor.
    *
-   * The eigenvalues of the pencil \f$ A - z B \f$ can be obtained from
-   * 1x1 and 2x2 blocks on the diagonals of S and T.
+   * \param [in] size  Positive integer, size of the matrix whose QZ decomposition will be computed.
    *
-   * Call the function compute() to compute the real QZ decomposition of a
-   * given pair of matrices. Alternatively, you can use the 
-   * RealQZ(const MatrixType& B, const MatrixType& B, bool computeQZ)
-   * constructor which computes the real QZ decomposition at construction
-   * time. Once the decomposition is computed, you can use the matrixS(),
-   * matrixT(), matrixQ() and matrixZ() functions to retrieve the matrices
-   * S, T, Q and Z in the decomposition. If computeQZ==false, some time
-   * is saved by not computing matrices Q and Z.
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via compute().  The \p size parameter is only
+   * used as a hint. It is not an error to give a wrong \p size, but it may
+   * impair performance.
    *
-   * Example: \include RealQZ_compute.cpp
-   * Output: \include RealQZ_compute.out
-   *
-   * \note The implementation is based on the algorithm in "Matrix Computations"
-   * by Gene H. Golub and Charles F. Van Loan, and a paper "An algorithm for
-   * generalized eigenvalue problems" by C.B.Moler and G.W.Stewart.
-   *
-   * \sa class RealSchur, class ComplexSchur, class EigenSolver, class ComplexEigenSolver
+   * \sa compute() for an example.
    */
-
-  template<typename _MatrixType> class RealQZ
-  {
-    public:
-      typedef _MatrixType MatrixType;
-      enum {
-        RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-        ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-        Options = MatrixType::Options,
-        MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-        MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
-      };
-      typedef typename MatrixType::Scalar Scalar;
-      typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-      typedef typename MatrixType::Index Index;
-
-      typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> EigenvalueType;
-      typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
-
-      /** \brief Default constructor.
-       *
-       * \param [in] size  Positive integer, size of the matrix whose QZ decomposition will be computed.
-       *
-       * The default constructor is useful in cases in which the user intends to
-       * perform decompositions via compute().  The \p size parameter is only
-       * used as a hint. It is not an error to give a wrong \p size, but it may
-       * impair performance.
-       *
-       * \sa compute() for an example.
-       */
-      RealQZ(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime) : 
-        m_S(size, size),
+  explicit RealQZ(Index size = RowsAtCompileTime == Dynamic ? 1 : RowsAtCompileTime)
+      : m_S(size, size),
         m_T(size, size),
         m_Q(size, size),
         m_Z(size, size),
-        m_workspace(size*2),
+        m_workspace(size * 2),
         m_maxIters(400),
-        m_isInitialized(false)
-        { }
-
-      /** \brief Constructor; computes real QZ decomposition of given matrices
-       * 
-       * \param[in]  A          Matrix A.
-       * \param[in]  B          Matrix B.
-       * \param[in]  computeQZ  If false, A and Z are not computed.
-       *
-       * This constructor calls compute() to compute the QZ decomposition.
-       */
-      RealQZ(const MatrixType& A, const MatrixType& B, bool computeQZ = true) :
-        m_S(A.rows(),A.cols()),
-        m_T(A.rows(),A.cols()),
-        m_Q(A.rows(),A.cols()),
-        m_Z(A.rows(),A.cols()),
-        m_workspace(A.rows()*2),
-        m_maxIters(400),
-        m_isInitialized(false) {
-          compute(A, B, computeQZ);
-        }
+        m_isInitialized(false),
+        m_computeQZ(true) {}
 
-      /** \brief Returns matrix Q in the QZ decomposition. 
-       *
-       * \returns A const reference to the matrix Q.
-       */
-      const MatrixType& matrixQ() const {
-        eigen_assert(m_isInitialized && "RealQZ is not initialized.");
-        eigen_assert(m_computeQZ && "The matrices Q and Z have not been computed during the QZ decomposition.");
-        return m_Q;
-      }
+  /** \brief Constructor; computes real QZ decomposition of given matrices
+   *
+   * \param[in]  A          Matrix A.
+   * \param[in]  B          Matrix B.
+   * \param[in]  computeQZ  If false, A and Z are not computed.
+   *
+   * This constructor calls compute() to compute the QZ decomposition.
+   */
+  RealQZ(const MatrixType& A, const MatrixType& B, bool computeQZ = true)
+      : m_S(A.rows(), A.cols()),
+        m_T(A.rows(), A.cols()),
+        m_Q(A.rows(), A.cols()),
+        m_Z(A.rows(), A.cols()),
+        m_workspace(A.rows() * 2),
+        m_maxIters(400),
+        m_isInitialized(false),
+        m_computeQZ(true) {
+    compute(A, B, computeQZ);
+  }
 
-      /** \brief Returns matrix Z in the QZ decomposition. 
-       *
-       * \returns A const reference to the matrix Z.
-       */
-      const MatrixType& matrixZ() const {
-        eigen_assert(m_isInitialized && "RealQZ is not initialized.");
-        eigen_assert(m_computeQZ && "The matrices Q and Z have not been computed during the QZ decomposition.");
-        return m_Z;
-      }
+  /** \brief Returns matrix Q in the QZ decomposition.
+   *
+   * \returns A const reference to the matrix Q.
+   */
+  const MatrixType& matrixQ() const {
+    eigen_assert(m_isInitialized && "RealQZ is not initialized.");
+    eigen_assert(m_computeQZ && "The matrices Q and Z have not been computed during the QZ decomposition.");
+    return m_Q;
+  }
 
-      /** \brief Returns matrix S in the QZ decomposition. 
-       *
-       * \returns A const reference to the matrix S.
-       */
-      const MatrixType& matrixS() const {
-        eigen_assert(m_isInitialized && "RealQZ is not initialized.");
-        return m_S;
-      }
+  /** \brief Returns matrix Z in the QZ decomposition.
+   *
+   * \returns A const reference to the matrix Z.
+   */
+  const MatrixType& matrixZ() const {
+    eigen_assert(m_isInitialized && "RealQZ is not initialized.");
+    eigen_assert(m_computeQZ && "The matrices Q and Z have not been computed during the QZ decomposition.");
+    return m_Z;
+  }
 
-      /** \brief Returns matrix S in the QZ decomposition. 
-       *
-       * \returns A const reference to the matrix S.
-       */
-      const MatrixType& matrixT() const {
-        eigen_assert(m_isInitialized && "RealQZ is not initialized.");
-        return m_T;
-      }
+  /** \brief Returns matrix S in the QZ decomposition.
+   *
+   * \returns A const reference to the matrix S.
+   */
+  const MatrixType& matrixS() const {
+    eigen_assert(m_isInitialized && "RealQZ is not initialized.");
+    return m_S;
+  }
 
-      /** \brief Computes QZ decomposition of given matrix. 
-       * 
-       * \param[in]  A          Matrix A.
-       * \param[in]  B          Matrix B.
-       * \param[in]  computeQZ  If false, A and Z are not computed.
-       * \returns    Reference to \c *this
-       */
-      RealQZ& compute(const MatrixType& A, const MatrixType& B, bool computeQZ = true);
-
-      /** \brief Reports whether previous computation was successful.
-       *
-       * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
-       */
-      ComputationInfo info() const
-      {
-        eigen_assert(m_isInitialized && "RealQZ is not initialized.");
-        return m_info;
-      }
+  /** \brief Returns matrix S in the QZ decomposition.
+   *
+   * \returns A const reference to the matrix S.
+   */
+  const MatrixType& matrixT() const {
+    eigen_assert(m_isInitialized && "RealQZ is not initialized.");
+    return m_T;
+  }
 
-      /** \brief Returns number of performed QR-like iterations.
-      */
-      Index iterations() const
-      {
-        eigen_assert(m_isInitialized && "RealQZ is not initialized.");
-        return m_global_iter;
-      }
+  /** \brief Computes QZ decomposition of given matrix.
+   *
+   * \param[in]  A          Matrix A.
+   * \param[in]  B          Matrix B.
+   * \param[in]  computeQZ  If false, A and Z are not computed.
+   * \returns    Reference to \c *this
+   */
+  RealQZ& compute(const MatrixType& A, const MatrixType& B, bool computeQZ = true);
 
-      /** Sets the maximal number of iterations allowed to converge to one eigenvalue
-       * or decouple the problem.
-      */
-      RealQZ& setMaxIterations(Index maxIters)
-      {
-        m_maxIters = maxIters;
-        return *this;
-      }
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful, \c NoConvergence otherwise.
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "RealQZ is not initialized.");
+    return m_info;
+  }
 
-    private:
-
-      MatrixType m_S, m_T, m_Q, m_Z;
-      Matrix<Scalar,Dynamic,1> m_workspace;
-      ComputationInfo m_info;
-      Index m_maxIters;
-      bool m_isInitialized;
-      bool m_computeQZ;
-      Scalar m_normOfT, m_normOfS;
-      Index m_global_iter;
-
-      typedef Matrix<Scalar,3,1> Vector3s;
-      typedef Matrix<Scalar,2,1> Vector2s;
-      typedef Matrix<Scalar,2,2> Matrix2s;
-      typedef JacobiRotation<Scalar> JRs;
-
-      void hessenbergTriangular();
-      void computeNorms();
-      Index findSmallSubdiagEntry(Index iu);
-      Index findSmallDiagEntry(Index f, Index l);
-      void splitOffTwoRows(Index i);
-      void pushDownZero(Index z, Index f, Index l);
-      void step(Index f, Index l, Index iter);
-
-  }; // RealQZ
-
-  /** \internal Reduces S and T to upper Hessenberg - triangular form */
-  template<typename MatrixType>
-    void RealQZ<MatrixType>::hessenbergTriangular()
-    {
+  /** \brief Returns number of performed QR-like iterations.
+   */
+  Index iterations() const {
+    eigen_assert(m_isInitialized && "RealQZ is not initialized.");
+    return m_global_iter;
+  }
 
-      const Index dim = m_S.cols();
-
-      // perform QR decomposition of T, overwrite T with R, save Q
-      HouseholderQR<MatrixType> qrT(m_T);
-      m_T = qrT.matrixQR();
-      m_T.template triangularView<StrictlyLower>().setZero();
-      m_Q = qrT.householderQ();
-      // overwrite S with Q* S
-      m_S.applyOnTheLeft(m_Q.adjoint());
-      // init Z as Identity
-      if (m_computeQZ)
-        m_Z = MatrixType::Identity(dim,dim);
-      // reduce S to upper Hessenberg with Givens rotations
-      for (Index j=0; j<=dim-3; j++) {
-        for (Index i=dim-1; i>=j+2; i--) {
-          JRs G;
-          // kill S(i,j)
-          if(m_S.coeff(i,j) != 0)
-          {
-            G.makeGivens(m_S.coeff(i-1,j), m_S.coeff(i,j), &m_S.coeffRef(i-1, j));
-            m_S.coeffRef(i,j) = Scalar(0.0);
-            m_S.rightCols(dim-j-1).applyOnTheLeft(i-1,i,G.adjoint());
-            m_T.rightCols(dim-i+1).applyOnTheLeft(i-1,i,G.adjoint());
-            // update Q
-            if (m_computeQZ)
-              m_Q.applyOnTheRight(i-1,i,G);
-          }
-          // kill T(i,i-1)
-          if(m_T.coeff(i,i-1)!=Scalar(0))
-          {
-            G.makeGivens(m_T.coeff(i,i), m_T.coeff(i,i-1), &m_T.coeffRef(i,i));
-            m_T.coeffRef(i,i-1) = Scalar(0.0);
-            m_S.applyOnTheRight(i,i-1,G);
-            m_T.topRows(i).applyOnTheRight(i,i-1,G);
-            // update Z
-            if (m_computeQZ)
-              m_Z.applyOnTheLeft(i,i-1,G.adjoint());
-          }
-        }
+  /** Sets the maximal number of iterations allowed to converge to one eigenvalue
+   * or decouple the problem.
+   */
+  RealQZ& setMaxIterations(Index maxIters) {
+    m_maxIters = maxIters;
+    return *this;
+  }
+
+ private:
+  MatrixType m_S, m_T, m_Q, m_Z;
+  Matrix<Scalar, Dynamic, 1> m_workspace;
+  ComputationInfo m_info;
+  Index m_maxIters;
+  bool m_isInitialized;
+  bool m_computeQZ;
+  Scalar m_normOfT, m_normOfS;
+  Index m_global_iter;
+
+  typedef Matrix<Scalar, 3, 1> Vector3s;
+  typedef Matrix<Scalar, 2, 1> Vector2s;
+  typedef Matrix<Scalar, 2, 2> Matrix2s;
+  typedef JacobiRotation<Scalar> JRs;
+
+  void hessenbergTriangular();
+  void computeNorms();
+  Index findSmallSubdiagEntry(Index iu);
+  Index findSmallDiagEntry(Index f, Index l);
+  void splitOffTwoRows(Index i);
+  void pushDownZero(Index z, Index f, Index l);
+  void step(Index f, Index l, Index iter);
+
+};  // RealQZ
+
+/** \internal Reduces S and T to upper Hessenberg - triangular form */
+template <typename MatrixType>
+void RealQZ<MatrixType>::hessenbergTriangular() {
+  const Index dim = m_S.cols();
+
+  // perform QR decomposition of T, overwrite T with R, save Q
+  HouseholderQR<MatrixType> qrT(m_T);
+  m_T = qrT.matrixQR();
+  m_T.template triangularView<StrictlyLower>().setZero();
+  m_Q = qrT.householderQ();
+  // overwrite S with Q* S
+  m_S.applyOnTheLeft(m_Q.adjoint());
+  // init Z as Identity
+  if (m_computeQZ) m_Z = MatrixType::Identity(dim, dim);
+  // reduce S to upper Hessenberg with Givens rotations
+  for (Index j = 0; j <= dim - 3; j++) {
+    for (Index i = dim - 1; i >= j + 2; i--) {
+      JRs G;
+      // kill S(i,j)
+      if (!numext::is_exactly_zero(m_S.coeff(i, j))) {
+        G.makeGivens(m_S.coeff(i - 1, j), m_S.coeff(i, j), &m_S.coeffRef(i - 1, j));
+        m_S.coeffRef(i, j) = Scalar(0.0);
+        m_S.rightCols(dim - j - 1).applyOnTheLeft(i - 1, i, G.adjoint());
+        m_T.rightCols(dim - i + 1).applyOnTheLeft(i - 1, i, G.adjoint());
+        // update Q
+        if (m_computeQZ) m_Q.applyOnTheRight(i - 1, i, G);
       }
-    }
-
-  /** \internal Computes vector L1 norms of S and T when in Hessenberg-Triangular form already */
-  template<typename MatrixType>
-    inline void RealQZ<MatrixType>::computeNorms()
-    {
-      const Index size = m_S.cols();
-      m_normOfS = Scalar(0.0);
-      m_normOfT = Scalar(0.0);
-      for (Index j = 0; j < size; ++j)
-      {
-        m_normOfS += m_S.col(j).segment(0, (std::min)(size,j+2)).cwiseAbs().sum();
-        m_normOfT += m_T.row(j).segment(j, size - j).cwiseAbs().sum();
+      // kill T(i,i-1)
+      if (!numext::is_exactly_zero(m_T.coeff(i, i - 1))) {
+        G.makeGivens(m_T.coeff(i, i), m_T.coeff(i, i - 1), &m_T.coeffRef(i, i));
+        m_T.coeffRef(i, i - 1) = Scalar(0.0);
+        m_S.applyOnTheRight(i, i - 1, G);
+        m_T.topRows(i).applyOnTheRight(i, i - 1, G);
+        // update Z
+        if (m_computeQZ) m_Z.applyOnTheLeft(i, i - 1, G.adjoint());
       }
     }
+  }
+}
+
+/** \internal Computes vector L1 norms of S and T when in Hessenberg-Triangular form already */
+template <typename MatrixType>
+inline void RealQZ<MatrixType>::computeNorms() {
+  const Index size = m_S.cols();
+  m_normOfS = Scalar(0.0);
+  m_normOfT = Scalar(0.0);
+  for (Index j = 0; j < size; ++j) {
+    m_normOfS += m_S.col(j).segment(0, (std::min)(size, j + 2)).cwiseAbs().sum();
+    m_normOfT += m_T.row(j).segment(j, size - j).cwiseAbs().sum();
+  }
+}
+
+/** \internal Look for single small sub-diagonal element S(res, res-1) and return res (or 0) */
+template <typename MatrixType>
+inline Index RealQZ<MatrixType>::findSmallSubdiagEntry(Index iu) {
+  using std::abs;
+  Index res = iu;
+  while (res > 0) {
+    Scalar s = abs(m_S.coeff(res - 1, res - 1)) + abs(m_S.coeff(res, res));
+    if (numext::is_exactly_zero(s)) s = m_normOfS;
+    if (abs(m_S.coeff(res, res - 1)) < NumTraits<Scalar>::epsilon() * s) break;
+    res--;
+  }
+  return res;
+}
+
+/** \internal Look for single small diagonal element T(res, res) for res between f and l, and return res (or f-1)  */
+template <typename MatrixType>
+inline Index RealQZ<MatrixType>::findSmallDiagEntry(Index f, Index l) {
+  using std::abs;
+  Index res = l;
+  while (res >= f) {
+    if (abs(m_T.coeff(res, res)) <= NumTraits<Scalar>::epsilon() * m_normOfT) break;
+    res--;
+  }
+  return res;
+}
+
+/** \internal decouple 2x2 diagonal block in rows i, i+1 if eigenvalues are real */
+template <typename MatrixType>
+inline void RealQZ<MatrixType>::splitOffTwoRows(Index i) {
+  using std::abs;
+  using std::sqrt;
+  const Index dim = m_S.cols();
+  if (numext::is_exactly_zero(abs(m_S.coeff(i + 1, i)))) return;
+  Index j = findSmallDiagEntry(i, i + 1);
+  if (j == i - 1) {
+    // block of (S T^{-1})
+    Matrix2s STi = m_T.template block<2, 2>(i, i).template triangularView<Upper>().template solve<OnTheRight>(
+        m_S.template block<2, 2>(i, i));
+    Scalar p = Scalar(0.5) * (STi(0, 0) - STi(1, 1));
+    Scalar q = p * p + STi(1, 0) * STi(0, 1);
+    if (q >= 0) {
+      Scalar z = sqrt(q);
+      // one QR-like iteration for ABi - lambda I
+      // is enough - when we know exact eigenvalue in advance,
+      // convergence is immediate
+      JRs G;
+      if (p >= 0)
+        G.makeGivens(p + z, STi(1, 0));
+      else
+        G.makeGivens(p - z, STi(1, 0));
+      m_S.rightCols(dim - i).applyOnTheLeft(i, i + 1, G.adjoint());
+      m_T.rightCols(dim - i).applyOnTheLeft(i, i + 1, G.adjoint());
+      // update Q
+      if (m_computeQZ) m_Q.applyOnTheRight(i, i + 1, G);
+
+      G.makeGivens(m_T.coeff(i + 1, i + 1), m_T.coeff(i + 1, i));
+      m_S.topRows(i + 2).applyOnTheRight(i + 1, i, G);
+      m_T.topRows(i + 2).applyOnTheRight(i + 1, i, G);
+      // update Z
+      if (m_computeQZ) m_Z.applyOnTheLeft(i + 1, i, G.adjoint());
 
-
-  /** \internal Look for single small sub-diagonal element S(res, res-1) and return res (or 0) */
-  template<typename MatrixType>
-    inline typename MatrixType::Index RealQZ<MatrixType>::findSmallSubdiagEntry(Index iu)
-    {
-      using std::abs;
-      Index res = iu;
-      while (res > 0)
-      {
-        Scalar s = abs(m_S.coeff(res-1,res-1)) + abs(m_S.coeff(res,res));
-        if (s == Scalar(0.0))
-          s = m_normOfS;
-        if (abs(m_S.coeff(res,res-1)) < NumTraits<Scalar>::epsilon() * s)
-          break;
-        res--;
-      }
-      return res;
+      m_S.coeffRef(i + 1, i) = Scalar(0.0);
+      m_T.coeffRef(i + 1, i) = Scalar(0.0);
     }
-
-  /** \internal Look for single small diagonal element T(res, res) for res between f and l, and return res (or f-1)  */
-  template<typename MatrixType>
-    inline typename MatrixType::Index RealQZ<MatrixType>::findSmallDiagEntry(Index f, Index l)
-    {
-      using std::abs;
-      Index res = l;
-      while (res >= f) {
-        if (abs(m_T.coeff(res,res)) <= NumTraits<Scalar>::epsilon() * m_normOfT)
-          break;
-        res--;
-      }
-      return res;
+  } else {
+    pushDownZero(j, i, i + 1);
+  }
+}
+
+/** \internal use zero in T(z,z) to zero S(l,l-1), working in block f..l */
+template <typename MatrixType>
+inline void RealQZ<MatrixType>::pushDownZero(Index z, Index f, Index l) {
+  JRs G;
+  const Index dim = m_S.cols();
+  for (Index zz = z; zz < l; zz++) {
+    // push 0 down
+    Index firstColS = zz > f ? (zz - 1) : zz;
+    G.makeGivens(m_T.coeff(zz, zz + 1), m_T.coeff(zz + 1, zz + 1));
+    m_S.rightCols(dim - firstColS).applyOnTheLeft(zz, zz + 1, G.adjoint());
+    m_T.rightCols(dim - zz).applyOnTheLeft(zz, zz + 1, G.adjoint());
+    m_T.coeffRef(zz + 1, zz + 1) = Scalar(0.0);
+    // update Q
+    if (m_computeQZ) m_Q.applyOnTheRight(zz, zz + 1, G);
+    // kill S(zz+1, zz-1)
+    if (zz > f) {
+      G.makeGivens(m_S.coeff(zz + 1, zz), m_S.coeff(zz + 1, zz - 1));
+      m_S.topRows(zz + 2).applyOnTheRight(zz, zz - 1, G);
+      m_T.topRows(zz + 1).applyOnTheRight(zz, zz - 1, G);
+      m_S.coeffRef(zz + 1, zz - 1) = Scalar(0.0);
+      // update Z
+      if (m_computeQZ) m_Z.applyOnTheLeft(zz, zz - 1, G.adjoint());
     }
-
-  /** \internal decouple 2x2 diagonal block in rows i, i+1 if eigenvalues are real */
-  template<typename MatrixType>
-    inline void RealQZ<MatrixType>::splitOffTwoRows(Index i)
+  }
+  // finally kill S(l,l-1)
+  G.makeGivens(m_S.coeff(l, l), m_S.coeff(l, l - 1));
+  m_S.applyOnTheRight(l, l - 1, G);
+  m_T.applyOnTheRight(l, l - 1, G);
+  m_S.coeffRef(l, l - 1) = Scalar(0.0);
+  // update Z
+  if (m_computeQZ) m_Z.applyOnTheLeft(l, l - 1, G.adjoint());
+}
+
+/** \internal QR-like iterative step for block f..l */
+template <typename MatrixType>
+inline void RealQZ<MatrixType>::step(Index f, Index l, Index iter) {
+  using std::abs;
+  const Index dim = m_S.cols();
+
+  // x, y, z
+  Scalar x, y, z;
+  if (iter == 10) {
+    // Wilkinson ad hoc shift
+    const Scalar a11 = m_S.coeff(f + 0, f + 0), a12 = m_S.coeff(f + 0, f + 1), a21 = m_S.coeff(f + 1, f + 0),
+                 a22 = m_S.coeff(f + 1, f + 1), a32 = m_S.coeff(f + 2, f + 1), b12 = m_T.coeff(f + 0, f + 1),
+                 b11i = Scalar(1.0) / m_T.coeff(f + 0, f + 0), b22i = Scalar(1.0) / m_T.coeff(f + 1, f + 1),
+                 a87 = m_S.coeff(l - 1, l - 2), a98 = m_S.coeff(l - 0, l - 1),
+                 b77i = Scalar(1.0) / m_T.coeff(l - 2, l - 2), b88i = Scalar(1.0) / m_T.coeff(l - 1, l - 1);
+    Scalar ss = abs(a87 * b77i) + abs(a98 * b88i), lpl = Scalar(1.5) * ss, ll = ss * ss;
+    x = ll + a11 * a11 * b11i * b11i - lpl * a11 * b11i + a12 * a21 * b11i * b22i -
+        a11 * a21 * b12 * b11i * b11i * b22i;
+    y = a11 * a21 * b11i * b11i - lpl * a21 * b11i + a21 * a22 * b11i * b22i - a21 * a21 * b12 * b11i * b11i * b22i;
+    z = a21 * a32 * b11i * b22i;
+  } else if (iter == 16) {
+    // another exceptional shift
+    x = m_S.coeff(f, f) / m_T.coeff(f, f) - m_S.coeff(l, l) / m_T.coeff(l, l) +
+        m_S.coeff(l, l - 1) * m_T.coeff(l - 1, l) / (m_T.coeff(l - 1, l - 1) * m_T.coeff(l, l));
+    y = m_S.coeff(f + 1, f) / m_T.coeff(f, f);
+    z = 0;
+  } else if (iter > 23 && !(iter % 8)) {
+    // extremely exceptional shift
+    x = internal::random<Scalar>(-1.0, 1.0);
+    y = internal::random<Scalar>(-1.0, 1.0);
+    z = internal::random<Scalar>(-1.0, 1.0);
+  } else {
+    // Compute the shifts: (x,y,z,0...) = (AB^-1 - l1 I) (AB^-1 - l2 I) e1
+    // where l1 and l2 are the eigenvalues of the 2x2 matrix C = U V^-1 where
+    // U and V are 2x2 bottom right sub matrices of A and B. Thus:
+    //  = AB^-1AB^-1 + l1 l2 I - (l1+l2)(AB^-1)
+    //  = AB^-1AB^-1 + det(M) - tr(M)(AB^-1)
+    // Since we are only interested in having x, y, z with a correct ratio, we have:
+    const Scalar a11 = m_S.coeff(f, f), a12 = m_S.coeff(f, f + 1), a21 = m_S.coeff(f + 1, f),
+                 a22 = m_S.coeff(f + 1, f + 1), a32 = m_S.coeff(f + 2, f + 1),
+
+                 a88 = m_S.coeff(l - 1, l - 1), a89 = m_S.coeff(l - 1, l), a98 = m_S.coeff(l, l - 1),
+                 a99 = m_S.coeff(l, l),
+
+                 b11 = m_T.coeff(f, f), b12 = m_T.coeff(f, f + 1), b22 = m_T.coeff(f + 1, f + 1),
+
+                 b88 = m_T.coeff(l - 1, l - 1), b89 = m_T.coeff(l - 1, l), b99 = m_T.coeff(l, l);
+
+    x = ((a88 / b88 - a11 / b11) * (a99 / b99 - a11 / b11) - (a89 / b99) * (a98 / b88) +
+         (a98 / b88) * (b89 / b99) * (a11 / b11)) *
+            (b11 / a21) +
+        a12 / b22 - (a11 / b11) * (b12 / b22);
+    y = (a22 / b22 - a11 / b11) - (a21 / b11) * (b12 / b22) - (a88 / b88 - a11 / b11) - (a99 / b99 - a11 / b11) +
+        (a98 / b88) * (b89 / b99);
+    z = a32 / b22;
+  }
+
+  JRs G;
+
+  for (Index k = f; k <= l - 2; k++) {
+    // variables for Householder reflections
+    Vector2s essential2;
+    Scalar tau, beta;
+
+    Vector3s hr(x, y, z);
+
+    // Q_k to annihilate S(k+1,k-1) and S(k+2,k-1)
+    hr.makeHouseholderInPlace(tau, beta);
+    essential2 = hr.template bottomRows<2>();
+    Index fc = (std::max)(k - 1, Index(0));  // first col to update
+    m_S.template middleRows<3>(k).rightCols(dim - fc).applyHouseholderOnTheLeft(essential2, tau, m_workspace.data());
+    m_T.template middleRows<3>(k).rightCols(dim - fc).applyHouseholderOnTheLeft(essential2, tau, m_workspace.data());
+    if (m_computeQZ) m_Q.template middleCols<3>(k).applyHouseholderOnTheRight(essential2, tau, m_workspace.data());
+    if (k > f) m_S.coeffRef(k + 2, k - 1) = m_S.coeffRef(k + 1, k - 1) = Scalar(0.0);
+
+    // Z_{k1} to annihilate T(k+2,k+1) and T(k+2,k)
+    hr << m_T.coeff(k + 2, k + 2), m_T.coeff(k + 2, k), m_T.coeff(k + 2, k + 1);
+    hr.makeHouseholderInPlace(tau, beta);
+    essential2 = hr.template bottomRows<2>();
     {
-      using std::abs;
-      using std::sqrt;
-      const Index dim=m_S.cols();
-      if (abs(m_S.coeff(i+1,i))==Scalar(0))
-        return;
-      Index z = findSmallDiagEntry(i,i+1);
-      if (z==i-1)
-      {
-        // block of (S T^{-1})
-        Matrix2s STi = m_T.template block<2,2>(i,i).template triangularView<Upper>().
-          template solve<OnTheRight>(m_S.template block<2,2>(i,i));
-        Scalar p = Scalar(0.5)*(STi(0,0)-STi(1,1));
-        Scalar q = p*p + STi(1,0)*STi(0,1);
-        if (q>=0) {
-          Scalar z = sqrt(q);
-          // one QR-like iteration for ABi - lambda I
-          // is enough - when we know exact eigenvalue in advance,
-          // convergence is immediate
-          JRs G;
-          if (p>=0)
-            G.makeGivens(p + z, STi(1,0));
-          else
-            G.makeGivens(p - z, STi(1,0));
-          m_S.rightCols(dim-i).applyOnTheLeft(i,i+1,G.adjoint());
-          m_T.rightCols(dim-i).applyOnTheLeft(i,i+1,G.adjoint());
-          // update Q
-          if (m_computeQZ)
-            m_Q.applyOnTheRight(i,i+1,G);
-
-          G.makeGivens(m_T.coeff(i+1,i+1), m_T.coeff(i+1,i));
-          m_S.topRows(i+2).applyOnTheRight(i+1,i,G);
-          m_T.topRows(i+2).applyOnTheRight(i+1,i,G);
-          // update Z
-          if (m_computeQZ)
-            m_Z.applyOnTheLeft(i+1,i,G.adjoint());
-
-          m_S.coeffRef(i+1,i) = Scalar(0.0);
-          m_T.coeffRef(i+1,i) = Scalar(0.0);
-        }
-      }
-      else
-      {
-        pushDownZero(z,i,i+1);
-      }
+      Index lr = (std::min)(k + 4, dim);  // last row to update
+      Map<Matrix<Scalar, Dynamic, 1> > tmp(m_workspace.data(), lr);
+      // S
+      tmp.noalias() = m_S.template middleCols<2>(k).topRows(lr) * essential2;
+      tmp += m_S.col(k + 2).head(lr);
+      m_S.col(k + 2).head(lr) -= tau * tmp;
+      m_S.template middleCols<2>(k).topRows(lr).noalias() -= (tau * tmp) * essential2.adjoint();
+      // T
+      tmp = m_T.template middleCols<2>(k).topRows(lr) * essential2;
+      tmp += m_T.col(k + 2).head(lr);
+      m_T.col(k + 2).head(lr) -= tau * tmp;
+      m_T.template middleCols<2>(k).topRows(lr).noalias() -= (tau * tmp) * essential2.adjoint();
     }
-
-  /** \internal use zero in T(z,z) to zero S(l,l-1), working in block f..l */
-  template<typename MatrixType>
-    inline void RealQZ<MatrixType>::pushDownZero(Index z, Index f, Index l)
-    {
-      JRs G;
-      const Index dim = m_S.cols();
-      for (Index zz=z; zz<l; zz++)
-      {
-        // push 0 down
-        Index firstColS = zz>f ? (zz-1) : zz;
-        G.makeGivens(m_T.coeff(zz, zz+1), m_T.coeff(zz+1, zz+1));
-        m_S.rightCols(dim-firstColS).applyOnTheLeft(zz,zz+1,G.adjoint());
-        m_T.rightCols(dim-zz).applyOnTheLeft(zz,zz+1,G.adjoint());
-        m_T.coeffRef(zz+1,zz+1) = Scalar(0.0);
-        // update Q
-        if (m_computeQZ)
-          m_Q.applyOnTheRight(zz,zz+1,G);
-        // kill S(zz+1, zz-1)
-        if (zz>f)
-        {
-          G.makeGivens(m_S.coeff(zz+1, zz), m_S.coeff(zz+1,zz-1));
-          m_S.topRows(zz+2).applyOnTheRight(zz, zz-1,G);
-          m_T.topRows(zz+1).applyOnTheRight(zz, zz-1,G);
-          m_S.coeffRef(zz+1,zz-1) = Scalar(0.0);
-          // update Z
-          if (m_computeQZ)
-            m_Z.applyOnTheLeft(zz,zz-1,G.adjoint());
-        }
-      }
-      // finally kill S(l,l-1)
-      G.makeGivens(m_S.coeff(l,l), m_S.coeff(l,l-1));
-      m_S.applyOnTheRight(l,l-1,G);
-      m_T.applyOnTheRight(l,l-1,G);
-      m_S.coeffRef(l,l-1)=Scalar(0.0);
-      // update Z
-      if (m_computeQZ)
-        m_Z.applyOnTheLeft(l,l-1,G.adjoint());
+    if (m_computeQZ) {
+      // Z
+      Map<Matrix<Scalar, 1, Dynamic> > tmp(m_workspace.data(), dim);
+      tmp.noalias() = essential2.adjoint() * (m_Z.template middleRows<2>(k));
+      tmp += m_Z.row(k + 2);
+      m_Z.row(k + 2) -= tau * tmp;
+      m_Z.template middleRows<2>(k).noalias() -= essential2 * (tau * tmp);
     }
-
-  /** \internal QR-like iterative step for block f..l */
-  template<typename MatrixType>
-    inline void RealQZ<MatrixType>::step(Index f, Index l, Index iter)
+    m_T.coeffRef(k + 2, k) = m_T.coeffRef(k + 2, k + 1) = Scalar(0.0);
+
+    // Z_{k2} to annihilate T(k+1,k)
+    G.makeGivens(m_T.coeff(k + 1, k + 1), m_T.coeff(k + 1, k));
+    m_S.applyOnTheRight(k + 1, k, G);
+    m_T.applyOnTheRight(k + 1, k, G);
+    // update Z
+    if (m_computeQZ) m_Z.applyOnTheLeft(k + 1, k, G.adjoint());
+    m_T.coeffRef(k + 1, k) = Scalar(0.0);
+
+    // update x,y,z
+    x = m_S.coeff(k + 1, k);
+    y = m_S.coeff(k + 2, k);
+    if (k < l - 2) z = m_S.coeff(k + 3, k);
+  }  // loop over k
+
+  // Q_{n-1} to annihilate y = S(l,l-2)
+  G.makeGivens(x, y);
+  m_S.applyOnTheLeft(l - 1, l, G.adjoint());
+  m_T.applyOnTheLeft(l - 1, l, G.adjoint());
+  if (m_computeQZ) m_Q.applyOnTheRight(l - 1, l, G);
+  m_S.coeffRef(l, l - 2) = Scalar(0.0);
+
+  // Z_{n-1} to annihilate T(l,l-1)
+  G.makeGivens(m_T.coeff(l, l), m_T.coeff(l, l - 1));
+  m_S.applyOnTheRight(l, l - 1, G);
+  m_T.applyOnTheRight(l, l - 1, G);
+  if (m_computeQZ) m_Z.applyOnTheLeft(l, l - 1, G.adjoint());
+  m_T.coeffRef(l, l - 1) = Scalar(0.0);
+}
+
+template <typename MatrixType>
+RealQZ<MatrixType>& RealQZ<MatrixType>::compute(const MatrixType& A_in, const MatrixType& B_in, bool computeQZ) {
+  const Index dim = A_in.cols();
+
+  eigen_assert(A_in.rows() == dim && A_in.cols() == dim && B_in.rows() == dim && B_in.cols() == dim &&
+               "Need square matrices of the same dimension");
+
+  m_isInitialized = true;
+  m_computeQZ = computeQZ;
+  m_S = A_in;
+  m_T = B_in;
+  m_workspace.resize(dim * 2);
+  m_global_iter = 0;
+
+  // entrance point: hessenberg triangular decomposition
+  hessenbergTriangular();
+  // compute L1 vector norms of T, S into m_normOfS, m_normOfT
+  computeNorms();
+
+  Index l = dim - 1, f, local_iter = 0;
+
+  while (l > 0 && local_iter < m_maxIters) {
+    f = findSmallSubdiagEntry(l);
+    // now rows and columns f..l (including) decouple from the rest of the problem
+    if (f > 0) m_S.coeffRef(f, f - 1) = Scalar(0.0);
+    if (f == l)  // One root found
     {
-      using std::abs;
-      const Index dim = m_S.cols();
-
-      // x, y, z
-      Scalar x, y, z;
-      if (iter==10)
-      {
-        // Wilkinson ad hoc shift
-        const Scalar
-          a11=m_S.coeff(f+0,f+0), a12=m_S.coeff(f+0,f+1),
-          a21=m_S.coeff(f+1,f+0), a22=m_S.coeff(f+1,f+1), a32=m_S.coeff(f+2,f+1),
-          b12=m_T.coeff(f+0,f+1),
-          b11i=Scalar(1.0)/m_T.coeff(f+0,f+0),
-          b22i=Scalar(1.0)/m_T.coeff(f+1,f+1),
-          a87=m_S.coeff(l-1,l-2),
-          a98=m_S.coeff(l-0,l-1),
-          b77i=Scalar(1.0)/m_T.coeff(l-2,l-2),
-          b88i=Scalar(1.0)/m_T.coeff(l-1,l-1);
-        Scalar ss = abs(a87*b77i) + abs(a98*b88i),
-               lpl = Scalar(1.5)*ss,
-               ll = ss*ss;
-        x = ll + a11*a11*b11i*b11i - lpl*a11*b11i + a12*a21*b11i*b22i
-          - a11*a21*b12*b11i*b11i*b22i;
-        y = a11*a21*b11i*b11i - lpl*a21*b11i + a21*a22*b11i*b22i 
-          - a21*a21*b12*b11i*b11i*b22i;
-        z = a21*a32*b11i*b22i;
-      }
-      else if (iter==16)
-      {
-        // another exceptional shift
-        x = m_S.coeff(f,f)/m_T.coeff(f,f)-m_S.coeff(l,l)/m_T.coeff(l,l) + m_S.coeff(l,l-1)*m_T.coeff(l-1,l) /
-          (m_T.coeff(l-1,l-1)*m_T.coeff(l,l));
-        y = m_S.coeff(f+1,f)/m_T.coeff(f,f);
-        z = 0;
-      }
-      else if (iter>23 && !(iter%8))
-      {
-        // extremely exceptional shift
-        x = internal::random<Scalar>(-1.0,1.0);
-        y = internal::random<Scalar>(-1.0,1.0);
-        z = internal::random<Scalar>(-1.0,1.0);
-      }
-      else
-      {
-        // Compute the shifts: (x,y,z,0...) = (AB^-1 - l1 I) (AB^-1 - l2 I) e1
-        // where l1 and l2 are the eigenvalues of the 2x2 matrix C = U V^-1 where
-        // U and V are 2x2 bottom right sub matrices of A and B. Thus:
-        //  = AB^-1AB^-1 + l1 l2 I - (l1+l2)(AB^-1)
-        //  = AB^-1AB^-1 + det(M) - tr(M)(AB^-1)
-        // Since we are only interested in having x, y, z with a correct ratio, we have:
-        const Scalar
-          a11 = m_S.coeff(f,f),     a12 = m_S.coeff(f,f+1),
-          a21 = m_S.coeff(f+1,f),   a22 = m_S.coeff(f+1,f+1),
-                                    a32 = m_S.coeff(f+2,f+1),
-
-          a88 = m_S.coeff(l-1,l-1), a89 = m_S.coeff(l-1,l),
-          a98 = m_S.coeff(l,l-1),   a99 = m_S.coeff(l,l),
-
-          b11 = m_T.coeff(f,f),     b12 = m_T.coeff(f,f+1),
-                                    b22 = m_T.coeff(f+1,f+1),
-
-          b88 = m_T.coeff(l-1,l-1), b89 = m_T.coeff(l-1,l),
-                                    b99 = m_T.coeff(l,l);
-
-        x = ( (a88/b88 - a11/b11)*(a99/b99 - a11/b11) - (a89/b99)*(a98/b88) + (a98/b88)*(b89/b99)*(a11/b11) ) * (b11/a21)
-          + a12/b22 - (a11/b11)*(b12/b22);
-        y = (a22/b22-a11/b11) - (a21/b11)*(b12/b22) - (a88/b88-a11/b11) - (a99/b99-a11/b11) + (a98/b88)*(b89/b99);
-        z = a32/b22;
+      l--;
+      local_iter = 0;
+    } else if (f == l - 1)  // Two roots found
+    {
+      splitOffTwoRows(f);
+      l -= 2;
+      local_iter = 0;
+    } else  // No convergence yet
+    {
+      // if there's zero on diagonal of T, we can isolate an eigenvalue with Givens rotations
+      Index z = findSmallDiagEntry(f, l);
+      if (z >= f) {
+        // zero found
+        pushDownZero(z, f, l);
+      } else {
+        // We are sure now that S.block(f,f, l-f+1,l-f+1) is underuced upper-Hessenberg
+        // and T.block(f,f, l-f+1,l-f+1) is invertible uper-triangular, which allows to
+        // apply a QR-like iteration to rows and columns f..l.
+        step(f, l, local_iter);
+        local_iter++;
+        m_global_iter++;
       }
-
-      JRs G;
-
-      for (Index k=f; k<=l-2; k++)
-      {
-        // variables for Householder reflections
-        Vector2s essential2;
-        Scalar tau, beta;
-
-        Vector3s hr(x,y,z);
-
-        // Q_k to annihilate S(k+1,k-1) and S(k+2,k-1)
-        hr.makeHouseholderInPlace(tau, beta);
-        essential2 = hr.template bottomRows<2>();
-        Index fc=(std::max)(k-1,Index(0));  // first col to update
-        m_S.template middleRows<3>(k).rightCols(dim-fc).applyHouseholderOnTheLeft(essential2, tau, m_workspace.data());
-        m_T.template middleRows<3>(k).rightCols(dim-fc).applyHouseholderOnTheLeft(essential2, tau, m_workspace.data());
-        if (m_computeQZ)
-          m_Q.template middleCols<3>(k).applyHouseholderOnTheRight(essential2, tau, m_workspace.data());
-        if (k>f)
-          m_S.coeffRef(k+2,k-1) = m_S.coeffRef(k+1,k-1) = Scalar(0.0);
-
-        // Z_{k1} to annihilate T(k+2,k+1) and T(k+2,k)
-        hr << m_T.coeff(k+2,k+2),m_T.coeff(k+2,k),m_T.coeff(k+2,k+1);
-        hr.makeHouseholderInPlace(tau, beta);
-        essential2 = hr.template bottomRows<2>();
-        {
-          Index lr = (std::min)(k+4,dim); // last row to update
-          Map<Matrix<Scalar,Dynamic,1> > tmp(m_workspace.data(),lr);
-          // S
-          tmp = m_S.template middleCols<2>(k).topRows(lr) * essential2;
-          tmp += m_S.col(k+2).head(lr);
-          m_S.col(k+2).head(lr) -= tau*tmp;
-          m_S.template middleCols<2>(k).topRows(lr) -= (tau*tmp) * essential2.adjoint();
-          // T
-          tmp = m_T.template middleCols<2>(k).topRows(lr) * essential2;
-          tmp += m_T.col(k+2).head(lr);
-          m_T.col(k+2).head(lr) -= tau*tmp;
-          m_T.template middleCols<2>(k).topRows(lr) -= (tau*tmp) * essential2.adjoint();
-        }
-        if (m_computeQZ)
-        {
-          // Z
-          Map<Matrix<Scalar,1,Dynamic> > tmp(m_workspace.data(),dim);
-          tmp = essential2.adjoint()*(m_Z.template middleRows<2>(k));
-          tmp += m_Z.row(k+2);
-          m_Z.row(k+2) -= tau*tmp;
-          m_Z.template middleRows<2>(k) -= essential2 * (tau*tmp);
+    }
+  }
+  // check if we converged before reaching iterations limit
+  m_info = (local_iter < m_maxIters) ? Success : NoConvergence;
+
+  // For each non triangular 2x2 diagonal block of S,
+  //    reduce the respective 2x2 diagonal block of T to positive diagonal form using 2x2 SVD.
+  // This step is not mandatory for QZ, but it does help further extraction of eigenvalues/eigenvectors,
+  // and is in par with Lapack/Matlab QZ.
+  if (m_info == Success) {
+    for (Index i = 0; i < dim - 1; ++i) {
+      if (!numext::is_exactly_zero(m_S.coeff(i + 1, i))) {
+        JacobiRotation<Scalar> j_left, j_right;
+        internal::real_2x2_jacobi_svd(m_T, i, i + 1, &j_left, &j_right);
+
+        // Apply resulting Jacobi rotations
+        m_S.applyOnTheLeft(i, i + 1, j_left);
+        m_S.applyOnTheRight(i, i + 1, j_right);
+        m_T.applyOnTheLeft(i, i + 1, j_left);
+        m_T.applyOnTheRight(i, i + 1, j_right);
+        m_T(i + 1, i) = m_T(i, i + 1) = Scalar(0);
+
+        if (m_computeQZ) {
+          m_Q.applyOnTheRight(i, i + 1, j_left.transpose());
+          m_Z.applyOnTheLeft(i, i + 1, j_right.transpose());
         }
-        m_T.coeffRef(k+2,k) = m_T.coeffRef(k+2,k+1) = Scalar(0.0);
 
-        // Z_{k2} to annihilate T(k+1,k)
-        G.makeGivens(m_T.coeff(k+1,k+1), m_T.coeff(k+1,k));
-        m_S.applyOnTheRight(k+1,k,G);
-        m_T.applyOnTheRight(k+1,k,G);
-        // update Z
-        if (m_computeQZ)
-          m_Z.applyOnTheLeft(k+1,k,G.adjoint());
-        m_T.coeffRef(k+1,k) = Scalar(0.0);
-
-        // update x,y,z
-        x = m_S.coeff(k+1,k);
-        y = m_S.coeff(k+2,k);
-        if (k < l-2)
-          z = m_S.coeff(k+3,k);
-      } // loop over k
-
-      // Q_{n-1} to annihilate y = S(l,l-2)
-      G.makeGivens(x,y);
-      m_S.applyOnTheLeft(l-1,l,G.adjoint());
-      m_T.applyOnTheLeft(l-1,l,G.adjoint());
-      if (m_computeQZ)
-        m_Q.applyOnTheRight(l-1,l,G);
-      m_S.coeffRef(l,l-2) = Scalar(0.0);
-
-      // Z_{n-1} to annihilate T(l,l-1)
-      G.makeGivens(m_T.coeff(l,l),m_T.coeff(l,l-1));
-      m_S.applyOnTheRight(l,l-1,G);
-      m_T.applyOnTheRight(l,l-1,G);
-      if (m_computeQZ)
-        m_Z.applyOnTheLeft(l,l-1,G.adjoint());
-      m_T.coeffRef(l,l-1) = Scalar(0.0);
+        i++;
+      }
     }
+  }
 
+  return *this;
+}  // end compute
 
-  template<typename MatrixType>
-    RealQZ<MatrixType>& RealQZ<MatrixType>::compute(const MatrixType& A_in, const MatrixType& B_in, bool computeQZ)
-    {
-
-      const Index dim = A_in.cols();
-
-      eigen_assert (A_in.rows()==dim && A_in.cols()==dim 
-          && B_in.rows()==dim && B_in.cols()==dim 
-          && "Need square matrices of the same dimension");
-
-      m_isInitialized = true;
-      m_computeQZ = computeQZ;
-      m_S = A_in; m_T = B_in;
-      m_workspace.resize(dim*2);
-      m_global_iter = 0;
-
-      // entrance point: hessenberg triangular decomposition
-      hessenbergTriangular();
-      // compute L1 vector norms of T, S into m_normOfS, m_normOfT
-      computeNorms();
-
-      Index l = dim-1, 
-            f, 
-            local_iter = 0;
-
-      while (l>0 && local_iter<m_maxIters)
-      {
-        f = findSmallSubdiagEntry(l);
-        // now rows and columns f..l (including) decouple from the rest of the problem
-        if (f>0) m_S.coeffRef(f,f-1) = Scalar(0.0);
-        if (f == l) // One root found
-        {
-          l--;
-          local_iter = 0;
-        }
-        else if (f == l-1) // Two roots found
-        {
-          splitOffTwoRows(f);
-          l -= 2;
-          local_iter = 0;
-        }
-        else // No convergence yet
-        {
-          // if there's zero on diagonal of T, we can isolate an eigenvalue with Givens rotations
-          Index z = findSmallDiagEntry(f,l);
-          if (z>=f)
-          {
-            // zero found
-            pushDownZero(z,f,l);
-          }
-          else
-          {
-            // We are sure now that S.block(f,f, l-f+1,l-f+1) is underuced upper-Hessenberg 
-            // and T.block(f,f, l-f+1,l-f+1) is invertible uper-triangular, which allows to
-            // apply a QR-like iteration to rows and columns f..l.
-            step(f,l, local_iter);
-            local_iter++;
-            m_global_iter++;
-          }
-        }
-      }
-      // check if we converged before reaching iterations limit
-      m_info = (local_iter<m_maxIters) ? Success : NoConvergence;
-      return *this;
-    } // end compute
-
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif //EIGEN_REAL_QZ
+#endif  // EIGEN_REAL_QZ
diff --git a/inst/include/Eigen/src/Eigenvalues/RealSchur.h b/inst/include/Eigen/src/Eigenvalues/RealSchur.h
index 16d38753..94bc34dd 100644
--- a/inst/include/Eigen/src/Eigenvalues/RealSchur.h
+++ b/inst/include/Eigen/src/Eigenvalues/RealSchur.h
@@ -13,300 +13,312 @@
 
 #include "./HessenbergDecomposition.h"
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \eigenvalues_module \ingroup Eigenvalues_Module
-  *
-  *
-  * \class RealSchur
-  *
-  * \brief Performs a real Schur decomposition of a square matrix
-  *
-  * \tparam _MatrixType the type of the matrix of which we are computing the
-  * real Schur decomposition; this is expected to be an instantiation of the
-  * Matrix class template.
-  *
-  * Given a real square matrix A, this class computes the real Schur
-  * decomposition: \f$ A = U T U^T \f$ where U is a real orthogonal matrix and
-  * T is a real quasi-triangular matrix. An orthogonal matrix is a matrix whose
-  * inverse is equal to its transpose, \f$ U^{-1} = U^T \f$. A quasi-triangular
-  * matrix is a block-triangular matrix whose diagonal consists of 1-by-1
-  * blocks and 2-by-2 blocks with complex eigenvalues. The eigenvalues of the
-  * blocks on the diagonal of T are the same as the eigenvalues of the matrix
-  * A, and thus the real Schur decomposition is used in EigenSolver to compute
-  * the eigendecomposition of a matrix.
-  *
-  * Call the function compute() to compute the real Schur decomposition of a
-  * given matrix. Alternatively, you can use the RealSchur(const MatrixType&, bool)
-  * constructor which computes the real Schur decomposition at construction
-  * time. Once the decomposition is computed, you can use the matrixU() and
-  * matrixT() functions to retrieve the matrices U and T in the decomposition.
-  *
-  * The documentation of RealSchur(const MatrixType&, bool) contains an example
-  * of the typical use of this class.
-  *
-  * \note The implementation is adapted from
-  * <a href="http://math.nist.gov/javanumerics/jama/">JAMA</a> (public domain).
-  * Their code is based on EISPACK.
-  *
-  * \sa class ComplexSchur, class EigenSolver, class ComplexEigenSolver
-  */
-template<typename _MatrixType> class RealSchur
-{
-  public:
-    typedef _MatrixType MatrixType;
-    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
-      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
-    };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-    typedef typename MatrixType::Index Index;
-
-    typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> EigenvalueType;
-    typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
-
-    /** \brief Default constructor.
-      *
-      * \param [in] size  Positive integer, size of the matrix whose Schur decomposition will be computed.
-      *
-      * The default constructor is useful in cases in which the user intends to
-      * perform decompositions via compute().  The \p size parameter is only
-      * used as a hint. It is not an error to give a wrong \p size, but it may
-      * impair performance.
-      *
-      * \sa compute() for an example.
-      */
-    RealSchur(Index size = RowsAtCompileTime==Dynamic ? 1 : RowsAtCompileTime)
-            : m_matT(size, size),
-              m_matU(size, size),
-              m_workspaceVector(size),
-              m_hess(size),
-              m_isInitialized(false),
-              m_matUisUptodate(false),
-              m_maxIters(-1)
-    { }
-
-    /** \brief Constructor; computes real Schur decomposition of given matrix. 
-      * 
-      * \param[in]  matrix    Square matrix whose Schur decomposition is to be computed.
-      * \param[in]  computeU  If true, both T and U are computed; if false, only T is computed.
-      *
-      * This constructor calls compute() to compute the Schur decomposition.
-      *
-      * Example: \include RealSchur_RealSchur_MatrixType.cpp
-      * Output: \verbinclude RealSchur_RealSchur_MatrixType.out
-      */
-    RealSchur(const MatrixType& matrix, bool computeU = true)
-            : m_matT(matrix.rows(),matrix.cols()),
-              m_matU(matrix.rows(),matrix.cols()),
-              m_workspaceVector(matrix.rows()),
-              m_hess(matrix.rows()),
-              m_isInitialized(false),
-              m_matUisUptodate(false),
-              m_maxIters(-1)
-    {
-      compute(matrix, computeU);
-    }
+ *
+ *
+ * \class RealSchur
+ *
+ * \brief Performs a real Schur decomposition of a square matrix
+ *
+ * \tparam MatrixType_ the type of the matrix of which we are computing the
+ * real Schur decomposition; this is expected to be an instantiation of the
+ * Matrix class template.
+ *
+ * Given a real square matrix A, this class computes the real Schur
+ * decomposition: \f$ A = U T U^T \f$ where U is a real orthogonal matrix and
+ * T is a real quasi-triangular matrix. An orthogonal matrix is a matrix whose
+ * inverse is equal to its transpose, \f$ U^{-1} = U^T \f$. A quasi-triangular
+ * matrix is a block-triangular matrix whose diagonal consists of 1-by-1
+ * blocks and 2-by-2 blocks with complex eigenvalues. The eigenvalues of the
+ * blocks on the diagonal of T are the same as the eigenvalues of the matrix
+ * A, and thus the real Schur decomposition is used in EigenSolver to compute
+ * the eigendecomposition of a matrix.
+ *
+ * Call the function compute() to compute the real Schur decomposition of a
+ * given matrix. Alternatively, you can use the RealSchur(const MatrixType&, bool)
+ * constructor which computes the real Schur decomposition at construction
+ * time. Once the decomposition is computed, you can use the matrixU() and
+ * matrixT() functions to retrieve the matrices U and T in the decomposition.
+ *
+ * The documentation of RealSchur(const MatrixType&, bool) contains an example
+ * of the typical use of this class.
+ *
+ * \note The implementation is adapted from
+ * <a href="http://math.nist.gov/javanumerics/jama/">JAMA</a> (public domain).
+ * Their code is based on EISPACK.
+ *
+ * \sa class ComplexSchur, class EigenSolver, class ComplexEigenSolver
+ */
+template <typename MatrixType_>
+class RealSchur {
+ public:
+  typedef MatrixType_ MatrixType;
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    Options = internal::traits<MatrixType>::Options,
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+  typedef typename MatrixType::Scalar Scalar;
+  typedef internal::make_complex_t<Scalar> ComplexScalar;
+  typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
+
+  typedef Matrix<ComplexScalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> EigenvalueType;
+  typedef Matrix<Scalar, ColsAtCompileTime, 1, Options & ~RowMajor, MaxColsAtCompileTime, 1> ColumnVectorType;
+
+  /** \brief Default constructor.
+   *
+   * \param [in] size  Positive integer, size of the matrix whose Schur decomposition will be computed.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via compute().  The \p size parameter is only
+   * used as a hint. It is not an error to give a wrong \p size, but it may
+   * impair performance.
+   *
+   * \sa compute() for an example.
+   */
+  explicit RealSchur(Index size = RowsAtCompileTime == Dynamic ? 1 : RowsAtCompileTime)
+      : m_matT(size, size),
+        m_matU(size, size),
+        m_workspaceVector(size),
+        m_hess(size),
+        m_isInitialized(false),
+        m_matUisUptodate(false),
+        m_maxIters(-1) {}
+
+  /** \brief Constructor; computes real Schur decomposition of given matrix.
+   *
+   * \param[in]  matrix    Square matrix whose Schur decomposition is to be computed.
+   * \param[in]  computeU  If true, both T and U are computed; if false, only T is computed.
+   *
+   * This constructor calls compute() to compute the Schur decomposition.
+   *
+   * Example: \include RealSchur_RealSchur_MatrixType.cpp
+   * Output: \verbinclude RealSchur_RealSchur_MatrixType.out
+   */
+  template <typename InputType>
+  explicit RealSchur(const EigenBase<InputType>& matrix, bool computeU = true)
+      : m_matT(matrix.rows(), matrix.cols()),
+        m_matU(matrix.rows(), matrix.cols()),
+        m_workspaceVector(matrix.rows()),
+        m_hess(matrix.rows()),
+        m_isInitialized(false),
+        m_matUisUptodate(false),
+        m_maxIters(-1) {
+    compute(matrix.derived(), computeU);
+  }
 
-    /** \brief Returns the orthogonal matrix in the Schur decomposition. 
-      *
-      * \returns A const reference to the matrix U.
-      *
-      * \pre Either the constructor RealSchur(const MatrixType&, bool) or the
-      * member function compute(const MatrixType&, bool) has been called before
-      * to compute the Schur decomposition of a matrix, and \p computeU was set
-      * to true (the default value).
-      *
-      * \sa RealSchur(const MatrixType&, bool) for an example
-      */
-    const MatrixType& matrixU() const
-    {
-      eigen_assert(m_isInitialized && "RealSchur is not initialized.");
-      eigen_assert(m_matUisUptodate && "The matrix U has not been computed during the RealSchur decomposition.");
-      return m_matU;
-    }
+  /** \brief Returns the orthogonal matrix in the Schur decomposition.
+   *
+   * \returns A const reference to the matrix U.
+   *
+   * \pre Either the constructor RealSchur(const MatrixType&, bool) or the
+   * member function compute(const MatrixType&, bool) has been called before
+   * to compute the Schur decomposition of a matrix, and \p computeU was set
+   * to true (the default value).
+   *
+   * \sa RealSchur(const MatrixType&, bool) for an example
+   */
+  const MatrixType& matrixU() const {
+    eigen_assert(m_isInitialized && "RealSchur is not initialized.");
+    eigen_assert(m_matUisUptodate && "The matrix U has not been computed during the RealSchur decomposition.");
+    return m_matU;
+  }
 
-    /** \brief Returns the quasi-triangular matrix in the Schur decomposition. 
-      *
-      * \returns A const reference to the matrix T.
-      *
-      * \pre Either the constructor RealSchur(const MatrixType&, bool) or the
-      * member function compute(const MatrixType&, bool) has been called before
-      * to compute the Schur decomposition of a matrix.
-      *
-      * \sa RealSchur(const MatrixType&, bool) for an example
-      */
-    const MatrixType& matrixT() const
-    {
-      eigen_assert(m_isInitialized && "RealSchur is not initialized.");
-      return m_matT;
-    }
-  
-    /** \brief Computes Schur decomposition of given matrix. 
-      * 
-      * \param[in]  matrix    Square matrix whose Schur decomposition is to be computed.
-      * \param[in]  computeU  If true, both T and U are computed; if false, only T is computed.
-      * \returns    Reference to \c *this
-      *
-      * The Schur decomposition is computed by first reducing the matrix to
-      * Hessenberg form using the class HessenbergDecomposition. The Hessenberg
-      * matrix is then reduced to triangular form by performing Francis QR
-      * iterations with implicit double shift. The cost of computing the Schur
-      * decomposition depends on the number of iterations; as a rough guide, it
-      * may be taken to be \f$25n^3\f$ flops if \a computeU is true and
-      * \f$10n^3\f$ flops if \a computeU is false.
-      *
-      * Example: \include RealSchur_compute.cpp
-      * Output: \verbinclude RealSchur_compute.out
-      *
-      * \sa compute(const MatrixType&, bool, Index)
-      */
-    RealSchur& compute(const MatrixType& matrix, bool computeU = true);
-
-    /** \brief Computes Schur decomposition of a Hessenberg matrix H = Z T Z^T
-     *  \param[in] matrixH Matrix in Hessenberg form H
-     *  \param[in] matrixQ orthogonal matrix Q that transform a matrix A to H : A = Q H Q^T
-     *  \param computeU Computes the matriX U of the Schur vectors
-     * \return Reference to \c *this
-     * 
-     *  This routine assumes that the matrix is already reduced in Hessenberg form matrixH
-     *  using either the class HessenbergDecomposition or another mean. 
-     *  It computes the upper quasi-triangular matrix T of the Schur decomposition of H
-     *  When computeU is true, this routine computes the matrix U such that 
-     *  A = U T U^T =  (QZ) T (QZ)^T = Q H Q^T where A is the initial matrix
-     * 
-     * NOTE Q is referenced if computeU is true; so, if the initial orthogonal matrix
-     * is not available, the user should give an identity matrix (Q.setIdentity())
-     * 
-     * \sa compute(const MatrixType&, bool)
-     */
-    template<typename HessMatrixType, typename OrthMatrixType>
-    RealSchur& computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ,  bool computeU);
-    /** \brief Reports whether previous computation was successful.
-      *
-      * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "RealSchur is not initialized.");
-      return m_info;
-    }
+  /** \brief Returns the quasi-triangular matrix in the Schur decomposition.
+   *
+   * \returns A const reference to the matrix T.
+   *
+   * \pre Either the constructor RealSchur(const MatrixType&, bool) or the
+   * member function compute(const MatrixType&, bool) has been called before
+   * to compute the Schur decomposition of a matrix.
+   *
+   * \sa RealSchur(const MatrixType&, bool) for an example
+   */
+  const MatrixType& matrixT() const {
+    eigen_assert(m_isInitialized && "RealSchur is not initialized.");
+    return m_matT;
+  }
 
-    /** \brief Sets the maximum number of iterations allowed. 
-      *
-      * If not specified by the user, the maximum number of iterations is m_maxIterationsPerRow times the size
-      * of the matrix.
-      */
-    RealSchur& setMaxIterations(Index maxIters)
-    {
-      m_maxIters = maxIters;
-      return *this;
-    }
+  /** \brief Computes Schur decomposition of given matrix.
+   *
+   * \param[in]  matrix    Square matrix whose Schur decomposition is to be computed.
+   * \param[in]  computeU  If true, both T and U are computed; if false, only T is computed.
+   * \returns    Reference to \c *this
+   *
+   * The Schur decomposition is computed by first reducing the matrix to
+   * Hessenberg form using the class HessenbergDecomposition. The Hessenberg
+   * matrix is then reduced to triangular form by performing Francis QR
+   * iterations with implicit double shift. The cost of computing the Schur
+   * decomposition depends on the number of iterations; as a rough guide, it
+   * may be taken to be \f$25n^3\f$ flops if \a computeU is true and
+   * \f$10n^3\f$ flops if \a computeU is false.
+   *
+   * Example: \include RealSchur_compute.cpp
+   * Output: \verbinclude RealSchur_compute.out
+   *
+   * \sa compute(const MatrixType&, bool, Index)
+   */
+  template <typename InputType>
+  RealSchur& compute(const EigenBase<InputType>& matrix, bool computeU = true);
+
+  /** \brief Computes Schur decomposition of a Hessenberg matrix H = Z T Z^T
+   *  \param[in] matrixH Matrix in Hessenberg form H
+   *  \param[in] matrixQ orthogonal matrix Q that transform a matrix A to H : A = Q H Q^T
+   *  \param computeU Computes the matriX U of the Schur vectors
+   * \return Reference to \c *this
+   *
+   *  This routine assumes that the matrix is already reduced in Hessenberg form matrixH
+   *  using either the class HessenbergDecomposition or another mean.
+   *  It computes the upper quasi-triangular matrix T of the Schur decomposition of H
+   *  When computeU is true, this routine computes the matrix U such that
+   *  A = U T U^T =  (QZ) T (QZ)^T = Q H Q^T where A is the initial matrix
+   *
+   * NOTE Q is referenced if computeU is true; so, if the initial orthogonal matrix
+   * is not available, the user should give an identity matrix (Q.setIdentity())
+   *
+   * \sa compute(const MatrixType&, bool)
+   */
+  template <typename HessMatrixType, typename OrthMatrixType>
+  RealSchur& computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ, bool computeU);
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful, \c NoConvergence otherwise.
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "RealSchur is not initialized.");
+    return m_info;
+  }
 
-    /** \brief Returns the maximum number of iterations. */
-    Index getMaxIterations()
-    {
-      return m_maxIters;
-    }
+  /** \brief Sets the maximum number of iterations allowed.
+   *
+   * If not specified by the user, the maximum number of iterations is m_maxIterationsPerRow times the size
+   * of the matrix.
+   */
+  RealSchur& setMaxIterations(Index maxIters) {
+    m_maxIters = maxIters;
+    return *this;
+  }
 
-    /** \brief Maximum number of iterations per row.
-      *
-      * If not otherwise specified, the maximum number of iterations is this number times the size of the
-      * matrix. It is currently set to 40.
-      */
-    static const int m_maxIterationsPerRow = 40;
-
-  private:
-    
-    MatrixType m_matT;
-    MatrixType m_matU;
-    ColumnVectorType m_workspaceVector;
-    HessenbergDecomposition<MatrixType> m_hess;
-    ComputationInfo m_info;
-    bool m_isInitialized;
-    bool m_matUisUptodate;
-    Index m_maxIters;
-
-    typedef Matrix<Scalar,3,1> Vector3s;
-
-    Scalar computeNormOfT();
-    Index findSmallSubdiagEntry(Index iu);
-    void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift);
-    void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo);
-    void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector);
-    void performFrancisQRStep(Index il, Index im, Index iu, bool computeU, const Vector3s& firstHouseholderVector, Scalar* workspace);
+  /** \brief Returns the maximum number of iterations. */
+  Index getMaxIterations() { return m_maxIters; }
+
+  /** \brief Maximum number of iterations per row.
+   *
+   * If not otherwise specified, the maximum number of iterations is this number times the size of the
+   * matrix. It is currently set to 40.
+   */
+  static const int m_maxIterationsPerRow = 40;
+
+ private:
+  MatrixType m_matT;
+  MatrixType m_matU;
+  ColumnVectorType m_workspaceVector;
+  HessenbergDecomposition<MatrixType> m_hess;
+  ComputationInfo m_info;
+  bool m_isInitialized;
+  bool m_matUisUptodate;
+  Index m_maxIters;
+
+  typedef Matrix<Scalar, 3, 1> Vector3s;
+
+  Scalar computeNormOfT();
+  Index findSmallSubdiagEntry(Index iu, const Scalar& considerAsZero);
+  void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift);
+  void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo);
+  void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector);
+  void performFrancisQRStep(Index il, Index im, Index iu, bool computeU, const Vector3s& firstHouseholderVector,
+                            Scalar* workspace);
 };
 
+template <typename MatrixType>
+template <typename InputType>
+RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const EigenBase<InputType>& matrix, bool computeU) {
+  const Scalar considerAsZero = (std::numeric_limits<Scalar>::min)();
 
-template<typename MatrixType>
-RealSchur<MatrixType>& RealSchur<MatrixType>::compute(const MatrixType& matrix, bool computeU)
-{
   eigen_assert(matrix.cols() == matrix.rows());
   Index maxIters = m_maxIters;
-  if (maxIters == -1)
-    maxIters = m_maxIterationsPerRow * matrix.rows();
+  if (maxIters == -1) maxIters = m_maxIterationsPerRow * matrix.rows();
+
+  Scalar scale = matrix.derived().cwiseAbs().maxCoeff();
+  if (scale < considerAsZero) {
+    m_matT.setZero(matrix.rows(), matrix.cols());
+    if (computeU) m_matU.setIdentity(matrix.rows(), matrix.cols());
+    m_info = Success;
+    m_isInitialized = true;
+    m_matUisUptodate = computeU;
+    return *this;
+  }
 
   // Step 1. Reduce to Hessenberg form
-  m_hess.compute(matrix);
+  m_hess.compute(matrix.derived() / scale);
+
+  // Step 2. Reduce to real Schur form
+  // Note: we copy m_hess.matrixQ() into m_matU here and not in computeFromHessenberg
+  //       to be able to pass our working-space buffer for the Householder to Dense evaluation.
+  m_workspaceVector.resize(matrix.cols());
+  if (computeU) m_hess.matrixQ().evalTo(m_matU, m_workspaceVector);
+  computeFromHessenberg(m_hess.matrixH(), m_matU, computeU);
+
+  m_matT *= scale;
 
-  // Step 2. Reduce to real Schur form  
-  computeFromHessenberg(m_hess.matrixH(), m_hess.matrixQ(), computeU);
-  
   return *this;
 }
-template<typename MatrixType>
-template<typename HessMatrixType, typename OrthMatrixType>
-RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMatrixType& matrixH, const OrthMatrixType& matrixQ,  bool computeU)
-{  
-  m_matT = matrixH; 
-  if(computeU)
-    m_matU = matrixQ;
-  
-  Index maxIters = m_maxIters;
-  if (maxIters == -1)
-    maxIters = m_maxIterationsPerRow * matrixH.rows();
+template <typename MatrixType>
+template <typename HessMatrixType, typename OrthMatrixType>
+RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMatrixType& matrixH,
+                                                                    const OrthMatrixType& matrixQ, bool computeU) {
+  using std::abs;
+
+  m_matT = matrixH;
   m_workspaceVector.resize(m_matT.cols());
+  if (computeU && !internal::is_same_dense(m_matU, matrixQ)) m_matU = matrixQ;
+
+  Index maxIters = m_maxIters;
+  if (maxIters == -1) maxIters = m_maxIterationsPerRow * matrixH.rows();
   Scalar* workspace = &m_workspaceVector.coeffRef(0);
 
-  // The matrix m_matT is divided in three parts. 
-  // Rows 0,...,il-1 are decoupled from the rest because m_matT(il,il-1) is zero. 
+  // The matrix m_matT is divided in three parts.
+  // Rows 0,...,il-1 are decoupled from the rest because m_matT(il,il-1) is zero.
   // Rows il,...,iu is the part we are working on (the active window).
   // Rows iu+1,...,end are already brought in triangular form.
   Index iu = m_matT.cols() - 1;
-  Index iter = 0;      // iteration count for current eigenvalue
-  Index totalIter = 0; // iteration count for whole matrix
-  Scalar exshift(0);   // sum of exceptional shifts
+  Index iter = 0;       // iteration count for current eigenvalue
+  Index totalIter = 0;  // iteration count for whole matrix
+  Scalar exshift(0);    // sum of exceptional shifts
   Scalar norm = computeNormOfT();
+  // sub-diagonal entries smaller than considerAsZero will be treated as zero.
+  // We use eps^2 to enable more precision in small eigenvalues.
+  Scalar considerAsZero =
+      numext::maxi<Scalar>(norm * numext::abs2(NumTraits<Scalar>::epsilon()), (std::numeric_limits<Scalar>::min)());
 
-  if(norm!=0)
-  {
-    while (iu >= 0)
-    {
-      Index il = findSmallSubdiagEntry(iu);
+  if (!numext::is_exactly_zero(norm)) {
+    while (iu >= 0) {
+      Index il = findSmallSubdiagEntry(iu, considerAsZero);
 
       // Check for convergence
-      if (il == iu) // One root found
+      if (il == iu)  // One root found
       {
-        m_matT.coeffRef(iu,iu) = m_matT.coeff(iu,iu) + exshift;
-        if (iu > 0)
-          m_matT.coeffRef(iu, iu-1) = Scalar(0);
+        m_matT.coeffRef(iu, iu) = m_matT.coeff(iu, iu) + exshift;
+        if (iu > 0) m_matT.coeffRef(iu, iu - 1) = Scalar(0);
         iu--;
         iter = 0;
-      }
-      else if (il == iu-1) // Two roots found
+      } else if (il == iu - 1)  // Two roots found
       {
         splitOffTwoRows(iu, computeU, exshift);
         iu -= 2;
         iter = 0;
-      }
-      else // No convergence yet
+      } else  // No convergence yet
       {
-        // The firstHouseholderVector vector has to be initialized to something to get rid of a silly GCC warning (-O1 -Wall -DNDEBUG )
-        Vector3s firstHouseholderVector(0,0,0), shiftInfo;
+        // The firstHouseholderVector vector has to be initialized to something to get rid of a silly GCC warning (-O1
+        // -Wall -DNDEBUG )
+        Vector3s firstHouseholderVector = Vector3s::Zero(), shiftInfo;
         computeShift(iu, iter, exshift, shiftInfo);
         iter = iter + 1;
         totalIter = totalIter + 1;
@@ -317,7 +329,7 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
       }
     }
   }
-  if(totalIter <= maxIters)
+  if (totalIter <= maxIters)
     m_info = Success;
   else
     m_info = NoConvergence;
@@ -328,198 +340,180 @@ RealSchur<MatrixType>& RealSchur<MatrixType>::computeFromHessenberg(const HessMa
 }
 
 /** \internal Computes and returns vector L1 norm of T */
-template<typename MatrixType>
-inline typename MatrixType::Scalar RealSchur<MatrixType>::computeNormOfT()
-{
+template <typename MatrixType>
+inline typename MatrixType::Scalar RealSchur<MatrixType>::computeNormOfT() {
   const Index size = m_matT.cols();
   // FIXME to be efficient the following would requires a triangular reduxion code
-  // Scalar norm = m_matT.upper().cwiseAbs().sum() 
+  // Scalar norm = m_matT.upper().cwiseAbs().sum()
   //               + m_matT.bottomLeftCorner(size-1,size-1).diagonal().cwiseAbs().sum();
   Scalar norm(0);
-  for (Index j = 0; j < size; ++j)
-    norm += m_matT.col(j).segment(0, (std::min)(size,j+2)).cwiseAbs().sum();
+  for (Index j = 0; j < size; ++j) norm += m_matT.col(j).segment(0, (std::min)(size, j + 2)).cwiseAbs().sum();
   return norm;
 }
 
 /** \internal Look for single small sub-diagonal element and returns its index */
-template<typename MatrixType>
-inline typename MatrixType::Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu)
-{
+template <typename MatrixType>
+inline Index RealSchur<MatrixType>::findSmallSubdiagEntry(Index iu, const Scalar& considerAsZero) {
   using std::abs;
   Index res = iu;
-  while (res > 0)
-  {
-    Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res));
-    if (abs(m_matT.coeff(res,res-1)) <= NumTraits<Scalar>::epsilon() * s)
-      break;
+  while (res > 0) {
+    Scalar s = abs(m_matT.coeff(res - 1, res - 1)) + abs(m_matT.coeff(res, res));
+
+    s = numext::maxi<Scalar>(s * NumTraits<Scalar>::epsilon(), considerAsZero);
+
+    if (abs(m_matT.coeff(res, res - 1)) <= s) break;
     res--;
   }
   return res;
 }
 
 /** \internal Update T given that rows iu-1 and iu decouple from the rest. */
-template<typename MatrixType>
-inline void RealSchur<MatrixType>::splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift)
-{
-  using std::sqrt;
+template <typename MatrixType>
+inline void RealSchur<MatrixType>::splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift) {
   using std::abs;
+  using std::sqrt;
   const Index size = m_matT.cols();
 
-  // The eigenvalues of the 2x2 matrix [a b; c d] are 
+  // The eigenvalues of the 2x2 matrix [a b; c d] are
   // trace +/- sqrt(discr/4) where discr = tr^2 - 4*det, tr = a + d, det = ad - bc
-  Scalar p = Scalar(0.5) * (m_matT.coeff(iu-1,iu-1) - m_matT.coeff(iu,iu));
-  Scalar q = p * p + m_matT.coeff(iu,iu-1) * m_matT.coeff(iu-1,iu);   // q = tr^2 / 4 - det = discr/4
-  m_matT.coeffRef(iu,iu) += exshift;
-  m_matT.coeffRef(iu-1,iu-1) += exshift;
+  Scalar p = Scalar(0.5) * (m_matT.coeff(iu - 1, iu - 1) - m_matT.coeff(iu, iu));
+  Scalar q = p * p + m_matT.coeff(iu, iu - 1) * m_matT.coeff(iu - 1, iu);  // q = tr^2 / 4 - det = discr/4
+  m_matT.coeffRef(iu, iu) += exshift;
+  m_matT.coeffRef(iu - 1, iu - 1) += exshift;
 
-  if (q >= Scalar(0)) // Two real eigenvalues
+  if (q >= Scalar(0))  // Two real eigenvalues
   {
     Scalar z = sqrt(abs(q));
     JacobiRotation<Scalar> rot;
     if (p >= Scalar(0))
-      rot.makeGivens(p + z, m_matT.coeff(iu, iu-1));
+      rot.makeGivens(p + z, m_matT.coeff(iu, iu - 1));
     else
-      rot.makeGivens(p - z, m_matT.coeff(iu, iu-1));
+      rot.makeGivens(p - z, m_matT.coeff(iu, iu - 1));
 
-    m_matT.rightCols(size-iu+1).applyOnTheLeft(iu-1, iu, rot.adjoint());
-    m_matT.topRows(iu+1).applyOnTheRight(iu-1, iu, rot);
-    m_matT.coeffRef(iu, iu-1) = Scalar(0); 
-    if (computeU)
-      m_matU.applyOnTheRight(iu-1, iu, rot);
+    m_matT.rightCols(size - iu + 1).applyOnTheLeft(iu - 1, iu, rot.adjoint());
+    m_matT.topRows(iu + 1).applyOnTheRight(iu - 1, iu, rot);
+    m_matT.coeffRef(iu, iu - 1) = Scalar(0);
+    if (computeU) m_matU.applyOnTheRight(iu - 1, iu, rot);
   }
 
-  if (iu > 1) 
-    m_matT.coeffRef(iu-1, iu-2) = Scalar(0);
+  if (iu > 1) m_matT.coeffRef(iu - 1, iu - 2) = Scalar(0);
 }
 
 /** \internal Form shift in shiftInfo, and update exshift if an exceptional shift is performed. */
-template<typename MatrixType>
-inline void RealSchur<MatrixType>::computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo)
-{
-  using std::sqrt;
+template <typename MatrixType>
+inline void RealSchur<MatrixType>::computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo) {
   using std::abs;
-  shiftInfo.coeffRef(0) = m_matT.coeff(iu,iu);
-  shiftInfo.coeffRef(1) = m_matT.coeff(iu-1,iu-1);
-  shiftInfo.coeffRef(2) = m_matT.coeff(iu,iu-1) * m_matT.coeff(iu-1,iu);
-
-  // Wilkinson's original ad hoc shift
-  if (iter == 10)
-  {
-    exshift += shiftInfo.coeff(0);
-    for (Index i = 0; i <= iu; ++i)
-      m_matT.coeffRef(i,i) -= shiftInfo.coeff(0);
-    Scalar s = abs(m_matT.coeff(iu,iu-1)) + abs(m_matT.coeff(iu-1,iu-2));
-    shiftInfo.coeffRef(0) = Scalar(0.75) * s;
-    shiftInfo.coeffRef(1) = Scalar(0.75) * s;
-    shiftInfo.coeffRef(2) = Scalar(-0.4375) * s * s;
-  }
-
-  // MATLAB's new ad hoc shift
-  if (iter == 30)
-  {
-    Scalar s = (shiftInfo.coeff(1) - shiftInfo.coeff(0)) / Scalar(2.0);
-    s = s * s + shiftInfo.coeff(2);
-    if (s > Scalar(0))
-    {
-      s = sqrt(s);
-      if (shiftInfo.coeff(1) < shiftInfo.coeff(0))
-        s = -s;
-      s = s + (shiftInfo.coeff(1) - shiftInfo.coeff(0)) / Scalar(2.0);
-      s = shiftInfo.coeff(0) - shiftInfo.coeff(2) / s;
-      exshift += s;
-      for (Index i = 0; i <= iu; ++i)
-        m_matT.coeffRef(i,i) -= s;
-      shiftInfo.setConstant(Scalar(0.964));
+  using std::sqrt;
+  shiftInfo.coeffRef(0) = m_matT.coeff(iu, iu);
+  shiftInfo.coeffRef(1) = m_matT.coeff(iu - 1, iu - 1);
+  shiftInfo.coeffRef(2) = m_matT.coeff(iu, iu - 1) * m_matT.coeff(iu - 1, iu);
+
+  // Alternate exceptional shifting strategy every 16 iterations.
+  if (iter > 0 && iter % 16 == 0) {
+    // Wilkinson's original ad hoc shift
+    if (iter % 32 != 0) {
+      exshift += shiftInfo.coeff(0);
+      for (Index i = 0; i <= iu; ++i) m_matT.coeffRef(i, i) -= shiftInfo.coeff(0);
+      Scalar s = abs(m_matT.coeff(iu, iu - 1)) + abs(m_matT.coeff(iu - 1, iu - 2));
+      shiftInfo.coeffRef(0) = Scalar(0.75) * s;
+      shiftInfo.coeffRef(1) = Scalar(0.75) * s;
+      shiftInfo.coeffRef(2) = Scalar(-0.4375) * s * s;
+    } else {
+      // MATLAB's new ad hoc shift
+      Scalar s = (shiftInfo.coeff(1) - shiftInfo.coeff(0)) / Scalar(2.0);
+      s = s * s + shiftInfo.coeff(2);
+      if (s > Scalar(0)) {
+        s = sqrt(s);
+        if (shiftInfo.coeff(1) < shiftInfo.coeff(0)) s = -s;
+        s = s + (shiftInfo.coeff(1) - shiftInfo.coeff(0)) / Scalar(2.0);
+        s = shiftInfo.coeff(0) - shiftInfo.coeff(2) / s;
+        exshift += s;
+        for (Index i = 0; i <= iu; ++i) m_matT.coeffRef(i, i) -= s;
+        shiftInfo.setConstant(Scalar(0.964));
+      }
     }
   }
 }
 
 /** \internal Compute index im at which Francis QR step starts and the first Householder vector. */
-template<typename MatrixType>
-inline void RealSchur<MatrixType>::initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector)
-{
+template <typename MatrixType>
+inline void RealSchur<MatrixType>::initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im,
+                                                     Vector3s& firstHouseholderVector) {
   using std::abs;
-  Vector3s& v = firstHouseholderVector; // alias to save typing
+  Vector3s& v = firstHouseholderVector;  // alias to save typing
 
-  for (im = iu-2; im >= il; --im)
-  {
-    const Scalar Tmm = m_matT.coeff(im,im);
+  for (im = iu - 2; im >= il; --im) {
+    const Scalar Tmm = m_matT.coeff(im, im);
     const Scalar r = shiftInfo.coeff(0) - Tmm;
     const Scalar s = shiftInfo.coeff(1) - Tmm;
-    v.coeffRef(0) = (r * s - shiftInfo.coeff(2)) / m_matT.coeff(im+1,im) + m_matT.coeff(im,im+1);
-    v.coeffRef(1) = m_matT.coeff(im+1,im+1) - Tmm - r - s;
-    v.coeffRef(2) = m_matT.coeff(im+2,im+1);
+    v.coeffRef(0) = (r * s - shiftInfo.coeff(2)) / m_matT.coeff(im + 1, im) + m_matT.coeff(im, im + 1);
+    v.coeffRef(1) = m_matT.coeff(im + 1, im + 1) - Tmm - r - s;
+    v.coeffRef(2) = m_matT.coeff(im + 2, im + 1);
     if (im == il) {
       break;
     }
-    const Scalar lhs = m_matT.coeff(im,im-1) * (abs(v.coeff(1)) + abs(v.coeff(2)));
-    const Scalar rhs = v.coeff(0) * (abs(m_matT.coeff(im-1,im-1)) + abs(Tmm) + abs(m_matT.coeff(im+1,im+1)));
-    if (abs(lhs) < NumTraits<Scalar>::epsilon() * rhs)
-      break;
+    const Scalar lhs = m_matT.coeff(im, im - 1) * (abs(v.coeff(1)) + abs(v.coeff(2)));
+    const Scalar rhs = v.coeff(0) * (abs(m_matT.coeff(im - 1, im - 1)) + abs(Tmm) + abs(m_matT.coeff(im + 1, im + 1)));
+    if (abs(lhs) < NumTraits<Scalar>::epsilon() * rhs) break;
   }
 }
 
 /** \internal Perform a Francis QR step involving rows il:iu and columns im:iu. */
-template<typename MatrixType>
-inline void RealSchur<MatrixType>::performFrancisQRStep(Index il, Index im, Index iu, bool computeU, const Vector3s& firstHouseholderVector, Scalar* workspace)
-{
+template <typename MatrixType>
+inline void RealSchur<MatrixType>::performFrancisQRStep(Index il, Index im, Index iu, bool computeU,
+                                                        const Vector3s& firstHouseholderVector, Scalar* workspace) {
   eigen_assert(im >= il);
-  eigen_assert(im <= iu-2);
+  eigen_assert(im <= iu - 2);
 
   const Index size = m_matT.cols();
 
-  for (Index k = im; k <= iu-2; ++k)
-  {
+  for (Index k = im; k <= iu - 2; ++k) {
     bool firstIteration = (k == im);
 
     Vector3s v;
     if (firstIteration)
       v = firstHouseholderVector;
     else
-      v = m_matT.template block<3,1>(k,k-1);
+      v = m_matT.template block<3, 1>(k, k - 1);
 
     Scalar tau, beta;
     Matrix<Scalar, 2, 1> ess;
     v.makeHouseholder(ess, tau, beta);
-    
-    if (beta != Scalar(0)) // if v is not zero
+
+    if (!numext::is_exactly_zero(beta))  // if v is not zero
     {
       if (firstIteration && k > il)
-        m_matT.coeffRef(k,k-1) = -m_matT.coeff(k,k-1);
+        m_matT.coeffRef(k, k - 1) = -m_matT.coeff(k, k - 1);
       else if (!firstIteration)
-        m_matT.coeffRef(k,k-1) = beta;
+        m_matT.coeffRef(k, k - 1) = beta;
 
       // These Householder transformations form the O(n^3) part of the algorithm
-      m_matT.block(k, k, 3, size-k).applyHouseholderOnTheLeft(ess, tau, workspace);
-      m_matT.block(0, k, (std::min)(iu,k+3) + 1, 3).applyHouseholderOnTheRight(ess, tau, workspace);
-      if (computeU)
-        m_matU.block(0, k, size, 3).applyHouseholderOnTheRight(ess, tau, workspace);
+      m_matT.block(k, k, 3, size - k).applyHouseholderOnTheLeft(ess, tau, workspace);
+      m_matT.block(0, k, (std::min)(iu, k + 3) + 1, 3).applyHouseholderOnTheRight(ess, tau, workspace);
+      if (computeU) m_matU.block(0, k, size, 3).applyHouseholderOnTheRight(ess, tau, workspace);
     }
   }
 
-  Matrix<Scalar, 2, 1> v = m_matT.template block<2,1>(iu-1, iu-2);
+  Matrix<Scalar, 2, 1> v = m_matT.template block<2, 1>(iu - 1, iu - 2);
   Scalar tau, beta;
   Matrix<Scalar, 1, 1> ess;
   v.makeHouseholder(ess, tau, beta);
 
-  if (beta != Scalar(0)) // if v is not zero
+  if (!numext::is_exactly_zero(beta))  // if v is not zero
   {
-    m_matT.coeffRef(iu-1, iu-2) = beta;
-    m_matT.block(iu-1, iu-1, 2, size-iu+1).applyHouseholderOnTheLeft(ess, tau, workspace);
-    m_matT.block(0, iu-1, iu+1, 2).applyHouseholderOnTheRight(ess, tau, workspace);
-    if (computeU)
-      m_matU.block(0, iu-1, size, 2).applyHouseholderOnTheRight(ess, tau, workspace);
+    m_matT.coeffRef(iu - 1, iu - 2) = beta;
+    m_matT.block(iu - 1, iu - 1, 2, size - iu + 1).applyHouseholderOnTheLeft(ess, tau, workspace);
+    m_matT.block(0, iu - 1, iu + 1, 2).applyHouseholderOnTheRight(ess, tau, workspace);
+    if (computeU) m_matU.block(0, iu - 1, size, 2).applyHouseholderOnTheRight(ess, tau, workspace);
   }
 
   // clean up pollution due to round-off errors
-  for (Index i = im+2; i <= iu; ++i)
-  {
-    m_matT.coeffRef(i,i-2) = Scalar(0);
-    if (i > im+2)
-      m_matT.coeffRef(i,i-3) = Scalar(0);
+  for (Index i = im + 2; i <= iu; ++i) {
+    m_matT.coeffRef(i, i - 2) = Scalar(0);
+    if (i > im + 2) m_matT.coeffRef(i, i - 3) = Scalar(0);
   }
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_REAL_SCHUR_H
+#endif  // EIGEN_REAL_SCHUR_H
diff --git a/inst/include/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h b/inst/include/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h
new file mode 100644
index 00000000..05a516d1
--- /dev/null
+++ b/inst/include/Eigen/src/Eigenvalues/RealSchur_LAPACKE.h
@@ -0,0 +1,83 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to LAPACKe
+ *    Real Schur needed to real unsymmetrical eigenvalues/eigenvectors.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_REAL_SCHUR_LAPACKE_H
+#define EIGEN_REAL_SCHUR_LAPACKE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \internal Specialization for the data types supported by LAPACKe */
+
+#define EIGEN_LAPACKE_SCHUR_REAL(EIGTYPE, LAPACKE_TYPE, LAPACKE_PREFIX, LAPACKE_PREFIX_U, EIGCOLROW, LAPACKE_COLROW) \
+  template <>                                                                                                        \
+  template <typename InputType>                                                                                      \
+  inline RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >&                                                   \
+  RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix,              \
+                                                                    bool computeU) {                                 \
+    eigen_assert(matrix.cols() == matrix.rows());                                                                    \
+                                                                                                                     \
+    lapack_int n = internal::convert_index<lapack_int>(matrix.cols()), sdim, info;                                   \
+    lapack_int matrix_order = LAPACKE_COLROW;                                                                        \
+    char jobvs, sort = 'N';                                                                                          \
+    LAPACK_##LAPACKE_PREFIX_U##_SELECT2 select = 0;                                                                  \
+    jobvs = (computeU) ? 'V' : 'N';                                                                                  \
+    m_matU.resize(n, n);                                                                                             \
+    lapack_int ldvs = internal::convert_index<lapack_int>(m_matU.outerStride());                                     \
+    m_matT = matrix;                                                                                                 \
+    lapack_int lda = internal::convert_index<lapack_int>(m_matT.outerStride());                                      \
+    Matrix<EIGTYPE, Dynamic, Dynamic> wr, wi;                                                                        \
+    wr.resize(n, 1);                                                                                                 \
+    wi.resize(n, 1);                                                                                                 \
+    info = LAPACKE_##LAPACKE_PREFIX##gees(matrix_order, jobvs, sort, select, n, (LAPACKE_TYPE*)m_matT.data(), lda,   \
+                                          &sdim, (LAPACKE_TYPE*)wr.data(), (LAPACKE_TYPE*)wi.data(),                 \
+                                          (LAPACKE_TYPE*)m_matU.data(), ldvs);                                       \
+    if (info == 0)                                                                                                   \
+      m_info = Success;                                                                                              \
+    else                                                                                                             \
+      m_info = NoConvergence;                                                                                        \
+                                                                                                                     \
+    m_isInitialized = true;                                                                                          \
+    m_matUisUptodate = computeU;                                                                                     \
+    return *this;                                                                                                    \
+  }
+
+EIGEN_LAPACKE_SCHUR_REAL(double, double, d, D, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_REAL(float, float, s, S, ColMajor, LAPACK_COL_MAJOR)
+EIGEN_LAPACKE_SCHUR_REAL(double, double, d, D, RowMajor, LAPACK_ROW_MAJOR)
+EIGEN_LAPACKE_SCHUR_REAL(float, float, s, S, RowMajor, LAPACK_ROW_MAJOR)
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_REAL_SCHUR_LAPACKE_H
diff --git a/inst/include/Eigen/src/Eigenvalues/RealSchur_MKL.h b/inst/include/Eigen/src/Eigenvalues/RealSchur_MKL.h
deleted file mode 100644
index ad973646..00000000
--- a/inst/include/Eigen/src/Eigenvalues/RealSchur_MKL.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- Copyright (c) 2011, Intel Corporation. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
- * Neither the name of Intel Corporation nor the names of its contributors may
-   be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
- *    Real Schur needed to real unsymmetrical eigenvalues/eigenvectors.
- ********************************************************************************
-*/
-
-#ifndef EIGEN_REAL_SCHUR_MKL_H
-#define EIGEN_REAL_SCHUR_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
-
-namespace Eigen { 
-
-/** \internal Specialization for the data types supported by MKL */
-
-#define EIGEN_MKL_SCHUR_REAL(EIGTYPE, MKLTYPE, MKLPREFIX, MKLPREFIX_U, EIGCOLROW, MKLCOLROW) \
-template<> inline \
-RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
-RealSchur<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW>& matrix, bool computeU) \
-{ \
-  typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> MatrixType; \
-  typedef MatrixType::Scalar Scalar; \
-  typedef MatrixType::RealScalar RealScalar; \
-\
-  eigen_assert(matrix.cols() == matrix.rows()); \
-\
-  lapack_int n = matrix.cols(), sdim, info; \
-  lapack_int lda = matrix.outerStride(); \
-  lapack_int matrix_order = MKLCOLROW; \
-  char jobvs, sort='N'; \
-  LAPACK_##MKLPREFIX_U##_SELECT2 select = 0; \
-  jobvs = (computeU) ? 'V' : 'N'; \
-  m_matU.resize(n, n); \
-  lapack_int ldvs  = m_matU.outerStride(); \
-  m_matT = matrix; \
-  Matrix<EIGTYPE, Dynamic, Dynamic> wr, wi; \
-  wr.resize(n, 1); wi.resize(n, 1); \
-  info = LAPACKE_##MKLPREFIX##gees( matrix_order, jobvs, sort, select, n, (MKLTYPE*)m_matT.data(), lda, &sdim, (MKLTYPE*)wr.data(), (MKLTYPE*)wi.data(), (MKLTYPE*)m_matU.data(), ldvs ); \
-  if(info == 0) \
-    m_info = Success; \
-  else \
-    m_info = NoConvergence; \
-\
-  m_isInitialized = true; \
-  m_matUisUptodate = computeU; \
-  return *this; \
-\
-}
-
-EIGEN_MKL_SCHUR_REAL(double,   double, d, D, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SCHUR_REAL(float,    float,  s, S, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SCHUR_REAL(double,   double, d, D, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SCHUR_REAL(float,    float,  s, S, RowMajor, LAPACK_ROW_MAJOR)
-
-} // end namespace Eigen
-
-#endif // EIGEN_REAL_SCHUR_MKL_H
diff --git a/inst/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/inst/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
index 1131c8af..f84da913 100644
--- a/inst/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
+++ b/inst/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h
@@ -13,400 +13,415 @@
 
 #include "./Tridiagonalization.h"
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-template<typename _MatrixType>
+namespace Eigen {
+
+template <typename MatrixType_>
 class GeneralizedSelfAdjointEigenSolver;
 
 namespace internal {
-template<typename SolverType,int Size,bool IsComplex> struct direct_selfadjoint_eigenvalues;
-}
+template <typename SolverType, int Size, bool IsComplex>
+struct direct_selfadjoint_eigenvalues;
+
+template <typename MatrixType, typename DiagType, typename SubDiagType>
+EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag,
+                                                              const Index maxIterations, bool computeEigenvectors,
+                                                              MatrixType& eivec);
+}  // namespace internal
 
 /** \eigenvalues_module \ingroup Eigenvalues_Module
-  *
-  *
-  * \class SelfAdjointEigenSolver
-  *
-  * \brief Computes eigenvalues and eigenvectors of selfadjoint matrices
-  *
-  * \tparam _MatrixType the type of the matrix of which we are computing the
-  * eigendecomposition; this is expected to be an instantiation of the Matrix
-  * class template.
-  *
-  * A matrix \f$ A \f$ is selfadjoint if it equals its adjoint. For real
-  * matrices, this means that the matrix is symmetric: it equals its
-  * transpose. This class computes the eigenvalues and eigenvectors of a
-  * selfadjoint matrix. These are the scalars \f$ \lambda \f$ and vectors
-  * \f$ v \f$ such that \f$ Av = \lambda v \f$.  The eigenvalues of a
-  * selfadjoint matrix are always real. If \f$ D \f$ is a diagonal matrix with
-  * the eigenvalues on the diagonal, and \f$ V \f$ is a matrix with the
-  * eigenvectors as its columns, then \f$ A = V D V^{-1} \f$ (for selfadjoint
-  * matrices, the matrix \f$ V \f$ is always invertible). This is called the
-  * eigendecomposition.
-  *
-  * The algorithm exploits the fact that the matrix is selfadjoint, making it
-  * faster and more accurate than the general purpose eigenvalue algorithms
-  * implemented in EigenSolver and ComplexEigenSolver.
-  *
-  * Only the \b lower \b triangular \b part of the input matrix is referenced.
-  *
-  * Call the function compute() to compute the eigenvalues and eigenvectors of
-  * a given matrix. Alternatively, you can use the
-  * SelfAdjointEigenSolver(const MatrixType&, int) constructor which computes
-  * the eigenvalues and eigenvectors at construction time. Once the eigenvalue
-  * and eigenvectors are computed, they can be retrieved with the eigenvalues()
-  * and eigenvectors() functions.
-  *
-  * The documentation for SelfAdjointEigenSolver(const MatrixType&, int)
-  * contains an example of the typical use of this class.
-  *
-  * To solve the \em generalized eigenvalue problem \f$ Av = \lambda Bv \f$ and
-  * the likes, see the class GeneralizedSelfAdjointEigenSolver.
-  *
-  * \sa MatrixBase::eigenvalues(), class EigenSolver, class ComplexEigenSolver
-  */
-template<typename _MatrixType> class SelfAdjointEigenSolver
-{
-  public:
-
-    typedef _MatrixType MatrixType;
-    enum {
-      Size = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
-      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
-    };
-    
-    /** \brief Scalar type for matrices of type \p _MatrixType. */
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
-    
-    typedef Matrix<Scalar,Size,Size,ColMajor,MaxColsAtCompileTime,MaxColsAtCompileTime> EigenvectorsType;
-
-    /** \brief Real scalar type for \p _MatrixType.
-      *
-      * This is just \c Scalar if #Scalar is real (e.g., \c float or
-      * \c double), and the type of the real part of \c Scalar if #Scalar is
-      * complex.
-      */
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    
-    friend struct internal::direct_selfadjoint_eigenvalues<SelfAdjointEigenSolver,Size,NumTraits<Scalar>::IsComplex>;
-
-    /** \brief Type for vector of eigenvalues as returned by eigenvalues().
-      *
-      * This is a column vector with entries of type #RealScalar.
-      * The length of the vector is the size of \p _MatrixType.
-      */
-    typedef typename internal::plain_col_type<MatrixType, RealScalar>::type RealVectorType;
-    typedef Tridiagonalization<MatrixType> TridiagonalizationType;
-
-    /** \brief Default constructor for fixed-size matrices.
-      *
-      * The default constructor is useful in cases in which the user intends to
-      * perform decompositions via compute(). This constructor
-      * can only be used if \p _MatrixType is a fixed-size matrix; use
-      * SelfAdjointEigenSolver(Index) for dynamic-size matrices.
-      *
-      * Example: \include SelfAdjointEigenSolver_SelfAdjointEigenSolver.cpp
-      * Output: \verbinclude SelfAdjointEigenSolver_SelfAdjointEigenSolver.out
-      */
-    SelfAdjointEigenSolver()
-        : m_eivec(),
-          m_eivalues(),
-          m_subdiag(),
-          m_isInitialized(false)
-    { }
-
-    /** \brief Constructor, pre-allocates memory for dynamic-size matrices.
-      *
-      * \param [in]  size  Positive integer, size of the matrix whose
-      * eigenvalues and eigenvectors will be computed.
-      *
-      * This constructor is useful for dynamic-size matrices, when the user
-      * intends to perform decompositions via compute(). The \p size
-      * parameter is only used as a hint. It is not an error to give a wrong
-      * \p size, but it may impair performance.
-      *
-      * \sa compute() for an example
-      */
-    SelfAdjointEigenSolver(Index size)
-        : m_eivec(size, size),
-          m_eivalues(size),
-          m_subdiag(size > 1 ? size - 1 : 1),
-          m_isInitialized(false)
-    {}
-
-    /** \brief Constructor; computes eigendecomposition of given matrix.
-      *
-      * \param[in]  matrix  Selfadjoint matrix whose eigendecomposition is to
-      *    be computed. Only the lower triangular part of the matrix is referenced.
-      * \param[in]  options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly.
-      *
-      * This constructor calls compute(const MatrixType&, int) to compute the
-      * eigenvalues of the matrix \p matrix. The eigenvectors are computed if
-      * \p options equals #ComputeEigenvectors.
-      *
-      * Example: \include SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType.cpp
-      * Output: \verbinclude SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType.out
-      *
-      * \sa compute(const MatrixType&, int)
-      */
-    SelfAdjointEigenSolver(const MatrixType& matrix, int options = ComputeEigenvectors)
+ *
+ *
+ * \class SelfAdjointEigenSolver
+ *
+ * \brief Computes eigenvalues and eigenvectors of selfadjoint matrices
+ *
+ * \tparam MatrixType_ the type of the matrix of which we are computing the
+ * eigendecomposition; this is expected to be an instantiation of the Matrix
+ * class template.
+ *
+ * A matrix \f$ A \f$ is selfadjoint if it equals its adjoint. For real
+ * matrices, this means that the matrix is symmetric: it equals its
+ * transpose. This class computes the eigenvalues and eigenvectors of a
+ * selfadjoint matrix. These are the scalars \f$ \lambda \f$ and vectors
+ * \f$ v \f$ such that \f$ Av = \lambda v \f$.  The eigenvalues of a
+ * selfadjoint matrix are always real. If \f$ D \f$ is a diagonal matrix with
+ * the eigenvalues on the diagonal, and \f$ V \f$ is a matrix with the
+ * eigenvectors as its columns, then \f$ A = V D V^{-1} \f$. This is called the
+ * eigendecomposition.
+ *
+ * For a selfadjoint matrix, \f$ V \f$ is unitary, meaning its inverse is equal
+ * to its adjoint, \f$ V^{-1} = V^{\dagger} \f$. If \f$ A \f$ is real, then
+ * \f$ V \f$ is also real and therefore orthogonal, meaning its inverse is
+ * equal to its transpose, \f$ V^{-1} = V^T \f$.
+ *
+ * The algorithm exploits the fact that the matrix is selfadjoint, making it
+ * faster and more accurate than the general purpose eigenvalue algorithms
+ * implemented in EigenSolver and ComplexEigenSolver.
+ *
+ * Only the \b lower \b triangular \b part of the input matrix is referenced.
+ *
+ * Call the function compute() to compute the eigenvalues and eigenvectors of
+ * a given matrix. Alternatively, you can use the
+ * SelfAdjointEigenSolver(const MatrixType&, int) constructor which computes
+ * the eigenvalues and eigenvectors at construction time. Once the eigenvalue
+ * and eigenvectors are computed, they can be retrieved with the eigenvalues()
+ * and eigenvectors() functions.
+ *
+ * The documentation for SelfAdjointEigenSolver(const MatrixType&, int)
+ * contains an example of the typical use of this class.
+ *
+ * To solve the \em generalized eigenvalue problem \f$ Av = \lambda Bv \f$ and
+ * the likes, see the class GeneralizedSelfAdjointEigenSolver.
+ *
+ * \sa MatrixBase::eigenvalues(), class EigenSolver, class ComplexEigenSolver
+ */
+template <typename MatrixType_>
+class SelfAdjointEigenSolver {
+ public:
+  typedef MatrixType_ MatrixType;
+  enum {
+    Size = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    Options = internal::traits<MatrixType>::Options,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+
+  /** \brief Scalar type for matrices of type \p MatrixType_. */
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
+
+  typedef Matrix<Scalar, Size, Size, ColMajor, MaxColsAtCompileTime, MaxColsAtCompileTime> EigenvectorsType;
+
+  /** \brief Real scalar type for \p MatrixType_.
+   *
+   * This is just \c Scalar if #Scalar is real (e.g., \c float or
+   * \c double), and the type of the real part of \c Scalar if #Scalar is
+   * complex.
+   */
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  friend struct internal::direct_selfadjoint_eigenvalues<SelfAdjointEigenSolver, Size, NumTraits<Scalar>::IsComplex>;
+
+  /** \brief Type for vector of eigenvalues as returned by eigenvalues().
+   *
+   * This is a column vector with entries of type #RealScalar.
+   * The length of the vector is the size of \p MatrixType_.
+   */
+  typedef typename internal::plain_col_type<MatrixType, Scalar>::type VectorType;
+  typedef typename internal::plain_col_type<MatrixType, RealScalar>::type RealVectorType;
+  typedef Tridiagonalization<MatrixType> TridiagonalizationType;
+  typedef typename TridiagonalizationType::SubDiagonalType SubDiagonalType;
+
+  /** \brief Default constructor for fixed-size matrices.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via compute(). This constructor
+   * can only be used if \p MatrixType_ is a fixed-size matrix; use
+   * SelfAdjointEigenSolver(Index) for dynamic-size matrices.
+   *
+   * Example: \include SelfAdjointEigenSolver_SelfAdjointEigenSolver.cpp
+   * Output: \verbinclude SelfAdjointEigenSolver_SelfAdjointEigenSolver.out
+   */
+  EIGEN_DEVICE_FUNC SelfAdjointEigenSolver()
+      : m_eivec(),
+        m_workspace(),
+        m_eivalues(),
+        m_subdiag(),
+        m_hcoeffs(),
+        m_info(InvalidInput),
+        m_isInitialized(false),
+        m_eigenvectorsOk(false) {}
+
+  /** \brief Constructor, pre-allocates memory for dynamic-size matrices.
+   *
+   * \param [in]  size  Positive integer, size of the matrix whose
+   * eigenvalues and eigenvectors will be computed.
+   *
+   * This constructor is useful for dynamic-size matrices, when the user
+   * intends to perform decompositions via compute(). The \p size
+   * parameter is only used as a hint. It is not an error to give a wrong
+   * \p size, but it may impair performance.
+   *
+   * \sa compute() for an example
+   */
+  EIGEN_DEVICE_FUNC explicit SelfAdjointEigenSolver(Index size)
+      : m_eivec(size, size),
+        m_workspace(size),
+        m_eivalues(size),
+        m_subdiag(size > 1 ? size - 1 : 1),
+        m_hcoeffs(size > 1 ? size - 1 : 1),
+        m_isInitialized(false),
+        m_eigenvectorsOk(false) {}
+
+  /** \brief Constructor; computes eigendecomposition of given matrix.
+   *
+   * \param[in]  matrix  Selfadjoint matrix whose eigendecomposition is to
+   *    be computed. Only the lower triangular part of the matrix is referenced.
+   * \param[in]  options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly.
+   *
+   * This constructor calls compute(const MatrixType&, int) to compute the
+   * eigenvalues of the matrix \p matrix. The eigenvectors are computed if
+   * \p options equals #ComputeEigenvectors.
+   *
+   * Example: \include SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType.cpp
+   * Output: \verbinclude SelfAdjointEigenSolver_SelfAdjointEigenSolver_MatrixType.out
+   *
+   * \sa compute(const MatrixType&, int)
+   */
+  template <typename InputType>
+  EIGEN_DEVICE_FUNC explicit SelfAdjointEigenSolver(const EigenBase<InputType>& matrix,
+                                                    int options = ComputeEigenvectors)
       : m_eivec(matrix.rows(), matrix.cols()),
+        m_workspace(matrix.cols()),
         m_eivalues(matrix.cols()),
         m_subdiag(matrix.rows() > 1 ? matrix.rows() - 1 : 1),
-        m_isInitialized(false)
-    {
-      compute(matrix, options);
-    }
-
-    /** \brief Computes eigendecomposition of given matrix.
-      *
-      * \param[in]  matrix  Selfadjoint matrix whose eigendecomposition is to
-      *    be computed. Only the lower triangular part of the matrix is referenced.
-      * \param[in]  options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly.
-      * \returns    Reference to \c *this
-      *
-      * This function computes the eigenvalues of \p matrix.  The eigenvalues()
-      * function can be used to retrieve them.  If \p options equals #ComputeEigenvectors,
-      * then the eigenvectors are also computed and can be retrieved by
-      * calling eigenvectors().
-      *
-      * This implementation uses a symmetric QR algorithm. The matrix is first
-      * reduced to tridiagonal form using the Tridiagonalization class. The
-      * tridiagonal matrix is then brought to diagonal form with implicit
-      * symmetric QR steps with Wilkinson shift. Details can be found in
-      * Section 8.3 of Golub \& Van Loan, <i>%Matrix Computations</i>.
-      *
-      * The cost of the computation is about \f$ 9n^3 \f$ if the eigenvectors
-      * are required and \f$ 4n^3/3 \f$ if they are not required.
-      *
-      * This method reuses the memory in the SelfAdjointEigenSolver object that
-      * was allocated when the object was constructed, if the size of the
-      * matrix does not change.
-      *
-      * Example: \include SelfAdjointEigenSolver_compute_MatrixType.cpp
-      * Output: \verbinclude SelfAdjointEigenSolver_compute_MatrixType.out
-      *
-      * \sa SelfAdjointEigenSolver(const MatrixType&, int)
-      */
-    SelfAdjointEigenSolver& compute(const MatrixType& matrix, int options = ComputeEigenvectors);
-    
-    /** \brief Computes eigendecomposition of given matrix using a direct algorithm
-      *
-      * This is a variant of compute(const MatrixType&, int options) which
-      * directly solves the underlying polynomial equation.
-      * 
-      * Currently only 3x3 matrices for which the sizes are known at compile time are supported (e.g., Matrix3d).
-      * 
-      * This method is usually significantly faster than the QR algorithm
-      * but it might also be less accurate. It is also worth noting that
-      * for 3x3 matrices it involves trigonometric operations which are
-      * not necessarily available for all scalar types.
-      *
-      * \sa compute(const MatrixType&, int options)
-      */
-    SelfAdjointEigenSolver& computeDirect(const MatrixType& matrix, int options = ComputeEigenvectors);
-
-    /** \brief Returns the eigenvectors of given matrix.
-      *
-      * \returns  A const reference to the matrix whose columns are the eigenvectors.
-      *
-      * \pre The eigenvectors have been computed before.
-      *
-      * Column \f$ k \f$ of the returned matrix is an eigenvector corresponding
-      * to eigenvalue number \f$ k \f$ as returned by eigenvalues().  The
-      * eigenvectors are normalized to have (Euclidean) norm equal to one. If
-      * this object was used to solve the eigenproblem for the selfadjoint
-      * matrix \f$ A \f$, then the matrix returned by this function is the
-      * matrix \f$ V \f$ in the eigendecomposition \f$ A = V D V^{-1} \f$.
-      *
-      * Example: \include SelfAdjointEigenSolver_eigenvectors.cpp
-      * Output: \verbinclude SelfAdjointEigenSolver_eigenvectors.out
-      *
-      * \sa eigenvalues()
-      */
-    const EigenvectorsType& eigenvectors() const
-    {
-      eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
-      eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
-      return m_eivec;
-    }
-
-    /** \brief Returns the eigenvalues of given matrix.
-      *
-      * \returns A const reference to the column vector containing the eigenvalues.
-      *
-      * \pre The eigenvalues have been computed before.
-      *
-      * The eigenvalues are repeated according to their algebraic multiplicity,
-      * so there are as many eigenvalues as rows in the matrix. The eigenvalues
-      * are sorted in increasing order.
-      *
-      * Example: \include SelfAdjointEigenSolver_eigenvalues.cpp
-      * Output: \verbinclude SelfAdjointEigenSolver_eigenvalues.out
-      *
-      * \sa eigenvectors(), MatrixBase::eigenvalues()
-      */
-    const RealVectorType& eigenvalues() const
-    {
-      eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
-      return m_eivalues;
-    }
-
-    /** \brief Computes the positive-definite square root of the matrix.
-      *
-      * \returns the positive-definite square root of the matrix
-      *
-      * \pre The eigenvalues and eigenvectors of a positive-definite matrix
-      * have been computed before.
-      *
-      * The square root of a positive-definite matrix \f$ A \f$ is the
-      * positive-definite matrix whose square equals \f$ A \f$. This function
-      * uses the eigendecomposition \f$ A = V D V^{-1} \f$ to compute the
-      * square root as \f$ A^{1/2} = V D^{1/2} V^{-1} \f$.
-      *
-      * Example: \include SelfAdjointEigenSolver_operatorSqrt.cpp
-      * Output: \verbinclude SelfAdjointEigenSolver_operatorSqrt.out
-      *
-      * \sa operatorInverseSqrt(),
-      *     \ref MatrixFunctions_Module "MatrixFunctions Module"
-      */
-    MatrixType operatorSqrt() const
-    {
-      eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
-      eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
-      return m_eivec * m_eivalues.cwiseSqrt().asDiagonal() * m_eivec.adjoint();
-    }
+        m_hcoeffs(matrix.cols() > 1 ? matrix.cols() - 1 : 1),
+        m_isInitialized(false),
+        m_eigenvectorsOk(false) {
+    compute(matrix.derived(), options);
+  }
 
-    /** \brief Computes the inverse square root of the matrix.
-      *
-      * \returns the inverse positive-definite square root of the matrix
-      *
-      * \pre The eigenvalues and eigenvectors of a positive-definite matrix
-      * have been computed before.
-      *
-      * This function uses the eigendecomposition \f$ A = V D V^{-1} \f$ to
-      * compute the inverse square root as \f$ V D^{-1/2} V^{-1} \f$. This is
-      * cheaper than first computing the square root with operatorSqrt() and
-      * then its inverse with MatrixBase::inverse().
-      *
-      * Example: \include SelfAdjointEigenSolver_operatorInverseSqrt.cpp
-      * Output: \verbinclude SelfAdjointEigenSolver_operatorInverseSqrt.out
-      *
-      * \sa operatorSqrt(), MatrixBase::inverse(),
-      *     \ref MatrixFunctions_Module "MatrixFunctions Module"
-      */
-    MatrixType operatorInverseSqrt() const
-    {
-      eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
-      eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
-      return m_eivec * m_eivalues.cwiseInverse().cwiseSqrt().asDiagonal() * m_eivec.adjoint();
-    }
+  /** \brief Computes eigendecomposition of given matrix.
+   *
+   * \param[in]  matrix  Selfadjoint matrix whose eigendecomposition is to
+   *    be computed. Only the lower triangular part of the matrix is referenced.
+   * \param[in]  options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly.
+   * \returns    Reference to \c *this
+   *
+   * This function computes the eigenvalues of \p matrix.  The eigenvalues()
+   * function can be used to retrieve them.  If \p options equals #ComputeEigenvectors,
+   * then the eigenvectors are also computed and can be retrieved by
+   * calling eigenvectors().
+   *
+   * This implementation uses a symmetric QR algorithm. The matrix is first
+   * reduced to tridiagonal form using the Tridiagonalization class. The
+   * tridiagonal matrix is then brought to diagonal form with implicit
+   * symmetric QR steps with Wilkinson shift. Details can be found in
+   * Section 8.3 of Golub \& Van Loan, <i>%Matrix Computations</i>.
+   *
+   * The cost of the computation is about \f$ 9n^3 \f$ if the eigenvectors
+   * are required and \f$ 4n^3/3 \f$ if they are not required.
+   *
+   * This method reuses the memory in the SelfAdjointEigenSolver object that
+   * was allocated when the object was constructed, if the size of the
+   * matrix does not change.
+   *
+   * Example: \include SelfAdjointEigenSolver_compute_MatrixType.cpp
+   * Output: \verbinclude SelfAdjointEigenSolver_compute_MatrixType.out
+   *
+   * \sa SelfAdjointEigenSolver(const MatrixType&, int)
+   */
+  template <typename InputType>
+  EIGEN_DEVICE_FUNC SelfAdjointEigenSolver& compute(const EigenBase<InputType>& matrix,
+                                                    int options = ComputeEigenvectors);
+
+  /** \brief Computes eigendecomposition of given matrix using a closed-form algorithm
+   *
+   * This is a variant of compute(const MatrixType&, int options) which
+   * directly solves the underlying polynomial equation.
+   *
+   * Currently only 2x2 and 3x3 matrices for which the sizes are known at compile time are supported (e.g., Matrix3d).
+   *
+   * This method is usually significantly faster than the QR iterative algorithm
+   * but it might also be less accurate. It is also worth noting that
+   * for 3x3 matrices it involves trigonometric operations which are
+   * not necessarily available for all scalar types.
+   *
+   * For the 3x3 case, we observed the following worst case relative error regarding the eigenvalues:
+   *   - double: 1e-8
+   *   - float:  1e-3
+   *
+   * \sa compute(const MatrixType&, int options)
+   */
+  EIGEN_DEVICE_FUNC SelfAdjointEigenSolver& computeDirect(const MatrixType& matrix, int options = ComputeEigenvectors);
+
+  /**
+   *\brief Computes the eigen decomposition from a tridiagonal symmetric matrix
+   *
+   * \param[in] diag The vector containing the diagonal of the matrix.
+   * \param[in] subdiag The subdiagonal of the matrix.
+   * \param[in] options Can be #ComputeEigenvectors (default) or #EigenvaluesOnly.
+   * \returns Reference to \c *this
+   *
+   * This function assumes that the matrix has been reduced to tridiagonal form.
+   *
+   * \sa compute(const MatrixType&, int) for more information
+   */
+  SelfAdjointEigenSolver& computeFromTridiagonal(const RealVectorType& diag, const SubDiagonalType& subdiag,
+                                                 int options = ComputeEigenvectors);
+
+  /** \brief Returns the eigenvectors of given matrix.
+   *
+   * \returns  A const reference to the matrix whose columns are the eigenvectors.
+   *
+   * \pre The eigenvectors have been computed before.
+   *
+   * Column \f$ k \f$ of the returned matrix is an eigenvector corresponding
+   * to eigenvalue number \f$ k \f$ as returned by eigenvalues().  The
+   * eigenvectors are normalized to have (Euclidean) norm equal to one. If
+   * this object was used to solve the eigenproblem for the selfadjoint
+   * matrix \f$ A \f$, then the matrix returned by this function is the
+   * matrix \f$ V \f$ in the eigendecomposition \f$ A = V D V^{-1} \f$.
+   *
+   * For a selfadjoint matrix, \f$ V \f$ is unitary, meaning its inverse is equal
+   * to its adjoint, \f$ V^{-1} = V^{\dagger} \f$. If \f$ A \f$ is real, then
+   * \f$ V \f$ is also real and therefore orthogonal, meaning its inverse is
+   * equal to its transpose, \f$ V^{-1} = V^T \f$.
+   *
+   * Example: \include SelfAdjointEigenSolver_eigenvectors.cpp
+   * Output: \verbinclude SelfAdjointEigenSolver_eigenvectors.out
+   *
+   * \sa eigenvalues()
+   */
+  EIGEN_DEVICE_FUNC const EigenvectorsType& eigenvectors() const {
+    eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
+    eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+    return m_eivec;
+  }
 
-    /** \brief Reports whether previous computation was successful.
-      *
-      * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
-      return m_info;
-    }
+  /** \brief Returns the eigenvalues of given matrix.
+   *
+   * \returns A const reference to the column vector containing the eigenvalues.
+   *
+   * \pre The eigenvalues have been computed before.
+   *
+   * The eigenvalues are repeated according to their algebraic multiplicity,
+   * so there are as many eigenvalues as rows in the matrix. The eigenvalues
+   * are sorted in increasing order.
+   *
+   * Example: \include SelfAdjointEigenSolver_eigenvalues.cpp
+   * Output: \verbinclude SelfAdjointEigenSolver_eigenvalues.out
+   *
+   * \sa eigenvectors(), MatrixBase::eigenvalues()
+   */
+  EIGEN_DEVICE_FUNC const RealVectorType& eigenvalues() const {
+    eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
+    return m_eivalues;
+  }
 
-    /** \brief Maximum number of iterations.
-      *
-      * The algorithm terminates if it does not converge within m_maxIterations * n iterations, where n
-      * denotes the size of the matrix. This value is currently set to 30 (copied from LAPACK).
-      */
-    static const int m_maxIterations = 30;
+  /** \brief Computes the positive-definite square root of the matrix.
+   *
+   * \returns the positive-definite square root of the matrix
+   *
+   * \pre The eigenvalues and eigenvectors of a positive-definite matrix
+   * have been computed before.
+   *
+   * The square root of a positive-definite matrix \f$ A \f$ is the
+   * positive-definite matrix whose square equals \f$ A \f$. This function
+   * uses the eigendecomposition \f$ A = V D V^{-1} \f$ to compute the
+   * square root as \f$ A^{1/2} = V D^{1/2} V^{-1} \f$.
+   *
+   * Example: \include SelfAdjointEigenSolver_operatorSqrt.cpp
+   * Output: \verbinclude SelfAdjointEigenSolver_operatorSqrt.out
+   *
+   * \sa operatorInverseSqrt(), <a href="unsupported/group__MatrixFunctions__Module.html">MatrixFunctions Module</a>
+   */
+  EIGEN_DEVICE_FUNC MatrixType operatorSqrt() const {
+    eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
+    eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+    return m_eivec * m_eivalues.cwiseSqrt().asDiagonal() * m_eivec.adjoint();
+  }
 
-    #ifdef EIGEN2_SUPPORT
-    SelfAdjointEigenSolver(const MatrixType& matrix, bool computeEigenvectors)
-      : m_eivec(matrix.rows(), matrix.cols()),
-        m_eivalues(matrix.cols()),
-        m_subdiag(matrix.rows() > 1 ? matrix.rows() - 1 : 1),
-        m_isInitialized(false)
-    {
-      compute(matrix, computeEigenvectors);
-    }
-    
-    SelfAdjointEigenSolver(const MatrixType& matA, const MatrixType& matB, bool computeEigenvectors = true)
-        : m_eivec(matA.cols(), matA.cols()),
-          m_eivalues(matA.cols()),
-          m_subdiag(matA.cols() > 1 ? matA.cols() - 1 : 1),
-          m_isInitialized(false)
-    {
-      static_cast<GeneralizedSelfAdjointEigenSolver<MatrixType>*>(this)->compute(matA, matB, computeEigenvectors ? ComputeEigenvectors : EigenvaluesOnly);
-    }
-    
-    void compute(const MatrixType& matrix, bool computeEigenvectors)
-    {
-      compute(matrix, computeEigenvectors ? ComputeEigenvectors : EigenvaluesOnly);
-    }
+  /** \brief Computes the inverse square root of the matrix.
+   *
+   * \returns the inverse positive-definite square root of the matrix
+   *
+   * \pre The eigenvalues and eigenvectors of a positive-definite matrix
+   * have been computed before.
+   *
+   * This function uses the eigendecomposition \f$ A = V D V^{-1} \f$ to
+   * compute the inverse square root as \f$ V D^{-1/2} V^{-1} \f$. This is
+   * cheaper than first computing the square root with operatorSqrt() and
+   * then its inverse with MatrixBase::inverse().
+   *
+   * Example: \include SelfAdjointEigenSolver_operatorInverseSqrt.cpp
+   * Output: \verbinclude SelfAdjointEigenSolver_operatorInverseSqrt.out
+   *
+   * \sa operatorSqrt(), MatrixBase::inverse(), <a
+   * href="unsupported/group__MatrixFunctions__Module.html">MatrixFunctions Module</a>
+   */
+  EIGEN_DEVICE_FUNC MatrixType operatorInverseSqrt() const {
+    eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
+    eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
+    return m_eivec * m_eivalues.cwiseInverse().cwiseSqrt().asDiagonal() * m_eivec.adjoint();
+  }
 
-    void compute(const MatrixType& matA, const MatrixType& matB, bool computeEigenvectors = true)
-    {
-      compute(matA, matB, computeEigenvectors ? ComputeEigenvectors : EigenvaluesOnly);
-    }
-    #endif // EIGEN2_SUPPORT
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful, \c NoConvergence otherwise.
+   */
+  EIGEN_DEVICE_FUNC ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
+    return m_info;
+  }
 
-  protected:
-    static void check_template_parameters()
-    {
-      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
-    }
-    
-    EigenvectorsType m_eivec;
-    RealVectorType m_eivalues;
-    typename TridiagonalizationType::SubDiagonalType m_subdiag;
-    ComputationInfo m_info;
-    bool m_isInitialized;
-    bool m_eigenvectorsOk;
+  /** \brief Maximum number of iterations.
+   *
+   * The algorithm terminates if it does not converge within m_maxIterations * n iterations, where n
+   * denotes the size of the matrix. This value is currently set to 30 (copied from LAPACK).
+   */
+  static const int m_maxIterations = 30;
+
+ protected:
+  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+
+  EigenvectorsType m_eivec;
+  VectorType m_workspace;
+  RealVectorType m_eivalues;
+  typename TridiagonalizationType::SubDiagonalType m_subdiag;
+  typename TridiagonalizationType::CoeffVectorType m_hcoeffs;
+  ComputationInfo m_info;
+  bool m_isInitialized;
+  bool m_eigenvectorsOk;
 };
 
-/** \internal
-  *
-  * \eigenvalues_module \ingroup Eigenvalues_Module
-  *
-  * Performs a QR step on a tridiagonal symmetric matrix represented as a
-  * pair of two vectors \a diag and \a subdiag.
-  *
-  * \param matA the input selfadjoint matrix
-  * \param hCoeffs returned Householder coefficients
-  *
-  * For compilation efficiency reasons, this procedure does not use eigen expression
-  * for its arguments.
-  *
-  * Implemented from Golub's "Matrix Computations", algorithm 8.3.2:
-  * "implicit symmetric QR step with Wilkinson shift"
-  */
 namespace internal {
-template<typename RealScalar, typename Scalar, typename Index>
-static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n);
-}
-
-template<typename MatrixType>
-SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
-::compute(const MatrixType& matrix, int options)
-{
-  check_template_parameters();
-  
-  using std::abs;
+/** \internal
+ *
+ * \eigenvalues_module \ingroup Eigenvalues_Module
+ *
+ * Performs a QR step on a tridiagonal symmetric matrix represented as a
+ * pair of two vectors \a diag and \a subdiag.
+ *
+ * \param diag the diagonal part of the input selfadjoint tridiagonal matrix
+ * \param subdiag the sub-diagonal part of the input selfadjoint tridiagonal matrix
+ * \param start starting index of the submatrix to work on
+ * \param end last+1 index of the submatrix to work on
+ * \param matrixQ pointer to the column-major matrix holding the eigenvectors, can be 0
+ * \param n size of the input matrix
+ *
+ * For compilation efficiency reasons, this procedure does not use eigen expression
+ * for its arguments.
+ *
+ * Implemented from Golub's "Matrix Computations", algorithm 8.3.2:
+ * "implicit symmetric QR step with Wilkinson shift"
+ */
+template <int StorageOrder, typename RealScalar, typename Scalar, typename Index>
+EIGEN_DEVICE_FUNC static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end,
+                                                  Scalar* matrixQ, Index n);
+}  // namespace internal
+
+template <typename MatrixType>
+template <typename InputType>
+EIGEN_DEVICE_FUNC SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>::compute(
+    const EigenBase<InputType>& a_matrix, int options) {
+  const InputType& matrix(a_matrix.derived());
+
+  EIGEN_USING_STD(abs);
   eigen_assert(matrix.cols() == matrix.rows());
-  eigen_assert((options&~(EigVecMask|GenEigMask))==0
-          && (options&EigVecMask)!=EigVecMask
-          && "invalid option parameter");
-  bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors;
+  eigen_assert((options & ~(EigVecMask | GenEigMask)) == 0 && (options & EigVecMask) != EigVecMask &&
+               "invalid option parameter");
+  bool computeEigenvectors = (options & ComputeEigenvectors) == ComputeEigenvectors;
   Index n = matrix.cols();
-  m_eivalues.resize(n,1);
+  m_eivalues.resize(n, 1);
 
-  if(n==1)
-  {
-    m_eivalues.coeffRef(0,0) = numext::real(matrix.coeff(0,0));
-    if(computeEigenvectors)
-      m_eivec.setOnes(n,n);
+  if (n == 1) {
+    m_eivec = matrix;
+    m_eivalues.coeffRef(0, 0) = numext::real(m_eivec.coeff(0, 0));
+    if (computeEigenvectors) m_eivec.setOnes(n, n);
     m_info = Success;
     m_isInitialized = true;
     m_eigenvectorsOk = computeEigenvectors;
@@ -420,135 +435,179 @@ ::compute(const MatrixType& matrix, int options)
   // map the matrix coefficients to [-1:1] to avoid over- and underflow.
   mat = matrix.template triangularView<Lower>();
   RealScalar scale = mat.cwiseAbs().maxCoeff();
-  if(scale==RealScalar(0)) scale = RealScalar(1);
+  if (numext::is_exactly_zero(scale)) scale = RealScalar(1);
   mat.template triangularView<Lower>() /= scale;
-  m_subdiag.resize(n-1);
-  internal::tridiagonalization_inplace(mat, diag, m_subdiag, computeEigenvectors);
-  
-  Index end = n-1;
-  Index start = 0;
-  Index iter = 0; // total number of iterations
+  m_subdiag.resize(n - 1);
+  m_hcoeffs.resize(n - 1);
+  internal::tridiagonalization_inplace(mat, diag, m_subdiag, m_hcoeffs, m_workspace, computeEigenvectors);
+
+  m_info = internal::computeFromTridiagonal_impl(diag, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
+
+  // scale back the eigen values
+  m_eivalues *= scale;
+
+  m_isInitialized = true;
+  m_eigenvectorsOk = computeEigenvectors;
+  return *this;
+}
 
-  while (end>0)
-  {
-    for (Index i = start; i<end; ++i)
-      if (internal::isMuchSmallerThan(abs(m_subdiag[i]),(abs(diag[i])+abs(diag[i+1]))))
-        m_subdiag[i] = 0;
+template <typename MatrixType>
+SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>::computeFromTridiagonal(
+    const RealVectorType& diag, const SubDiagonalType& subdiag, int options) {
+  // TODO : Add an option to scale the values beforehand
+  bool computeEigenvectors = (options & ComputeEigenvectors) == ComputeEigenvectors;
 
-    // find the largest unreduced block
-    while (end>0 && m_subdiag[end-1]==0)
-    {
+  m_eivalues = diag;
+  m_subdiag = subdiag;
+  if (computeEigenvectors) {
+    m_eivec.setIdentity(diag.size(), diag.size());
+  }
+  m_info = internal::computeFromTridiagonal_impl(m_eivalues, m_subdiag, m_maxIterations, computeEigenvectors, m_eivec);
+
+  m_isInitialized = true;
+  m_eigenvectorsOk = computeEigenvectors;
+  return *this;
+}
+
+namespace internal {
+/**
+ * \internal
+ * \brief Compute the eigendecomposition from a tridiagonal matrix
+ *
+ * \param[in,out] diag : On input, the diagonal of the matrix, on output the eigenvalues
+ * \param[in,out] subdiag : The subdiagonal part of the matrix (entries are modified during the decomposition)
+ * \param[in] maxIterations : the maximum number of iterations
+ * \param[in] computeEigenvectors : whether the eigenvectors have to be computed or not
+ * \param[out] eivec : The matrix to store the eigenvectors if computeEigenvectors==true. Must be allocated on input.
+ * \returns \c Success or \c NoConvergence
+ */
+template <typename MatrixType, typename DiagType, typename SubDiagType>
+EIGEN_DEVICE_FUNC ComputationInfo computeFromTridiagonal_impl(DiagType& diag, SubDiagType& subdiag,
+                                                              const Index maxIterations, bool computeEigenvectors,
+                                                              MatrixType& eivec) {
+  ComputationInfo info;
+  typedef typename MatrixType::Scalar Scalar;
+
+  Index n = diag.size();
+  Index end = n - 1;
+  Index start = 0;
+  Index iter = 0;  // total number of iterations
+
+  typedef typename DiagType::RealScalar RealScalar;
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+  const RealScalar precision_inv = RealScalar(1) / NumTraits<RealScalar>::epsilon();
+  while (end > 0) {
+    for (Index i = start; i < end; ++i) {
+      if (numext::abs(subdiag[i]) < considerAsZero) {
+        subdiag[i] = RealScalar(0);
+      } else {
+        // abs(subdiag[i]) <= epsilon * sqrt(abs(diag[i]) + abs(diag[i+1]))
+        // Scaled to prevent underflows.
+        const RealScalar scaled_subdiag = precision_inv * subdiag[i];
+        if (scaled_subdiag * scaled_subdiag <= (numext::abs(diag[i]) + numext::abs(diag[i + 1]))) {
+          subdiag[i] = RealScalar(0);
+        }
+      }
+    }
+
+    // find the largest unreduced block at the end of the matrix.
+    while (end > 0 && numext::is_exactly_zero(subdiag[end - 1])) {
       end--;
     }
-    if (end<=0)
-      break;
+    if (end <= 0) break;
 
     // if we spent too many iterations, we give up
     iter++;
-    if(iter > m_maxIterations * n) break;
+    if (iter > maxIterations * n) break;
 
     start = end - 1;
-    while (start>0 && m_subdiag[start-1]!=0)
-      start--;
+    while (start > 0 && !numext::is_exactly_zero(subdiag[start - 1])) start--;
 
-    internal::tridiagonal_qr_step(diag.data(), m_subdiag.data(), start, end, computeEigenvectors ? m_eivec.data() : (Scalar*)0, n);
+    internal::tridiagonal_qr_step<MatrixType::Flags & RowMajorBit ? RowMajor : ColMajor>(
+        diag.data(), subdiag.data(), start, end, computeEigenvectors ? eivec.data() : (Scalar*)0, n);
   }
-
-  if (iter <= m_maxIterations * n)
-    m_info = Success;
+  if (iter <= maxIterations * n)
+    info = Success;
   else
-    m_info = NoConvergence;
+    info = NoConvergence;
 
   // Sort eigenvalues and corresponding vectors.
   // TODO make the sort optional ?
   // TODO use a better sort algorithm !!
-  if (m_info == Success)
-  {
-    for (Index i = 0; i < n-1; ++i)
-    {
+  if (info == Success) {
+    for (Index i = 0; i < n - 1; ++i) {
       Index k;
-      m_eivalues.segment(i,n-i).minCoeff(&k);
-      if (k > 0)
-      {
-        std::swap(m_eivalues[i], m_eivalues[k+i]);
-        if(computeEigenvectors)
-          m_eivec.col(i).swap(m_eivec.col(k+i));
+      diag.segment(i, n - i).minCoeff(&k);
+      if (k > 0) {
+        numext::swap(diag[i], diag[k + i]);
+        if (computeEigenvectors) eivec.col(i).swap(eivec.col(k + i));
       }
     }
   }
-  
-  // scale back the eigen values
-  m_eivalues *= scale;
-
-  m_isInitialized = true;
-  m_eigenvectorsOk = computeEigenvectors;
-  return *this;
+  return info;
 }
 
-
-namespace internal {
-  
-template<typename SolverType,int Size,bool IsComplex> struct direct_selfadjoint_eigenvalues
-{
-  static inline void run(SolverType& eig, const typename SolverType::MatrixType& A, int options)
-  { eig.compute(A,options); }
+template <typename SolverType, int Size, bool IsComplex>
+struct direct_selfadjoint_eigenvalues {
+  EIGEN_DEVICE_FUNC static inline void run(SolverType& eig, const typename SolverType::MatrixType& A, int options) {
+    eig.compute(A, options);
+  }
 };
 
-template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3,false>
-{
+template <typename SolverType>
+struct direct_selfadjoint_eigenvalues<SolverType, 3, false> {
   typedef typename SolverType::MatrixType MatrixType;
   typedef typename SolverType::RealVectorType VectorType;
   typedef typename SolverType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
   typedef typename SolverType::EigenvectorsType EigenvectorsType;
-  
+
   /** \internal
    * Computes the roots of the characteristic polynomial of \a m.
    * For numerical stability m.trace() should be near zero and to avoid over- or underflow m should be normalized.
    */
-  static inline void computeRoots(const MatrixType& m, VectorType& roots)
-  {
-    using std::sqrt;
-    using std::atan2;
-    using std::cos;
-    using std::sin;
-    const Scalar s_inv3 = Scalar(1.0)/Scalar(3.0);
-    const Scalar s_sqrt3 = sqrt(Scalar(3.0));
+  EIGEN_DEVICE_FUNC static inline void computeRoots(const MatrixType& m, VectorType& roots) {
+    EIGEN_USING_STD(sqrt)
+    EIGEN_USING_STD(atan2)
+    EIGEN_USING_STD(cos)
+    EIGEN_USING_STD(sin)
+    const Scalar s_inv3 = Scalar(1) / Scalar(3);
+    const Scalar s_sqrt3 = sqrt(Scalar(3));
 
     // The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0.  The
     // eigenvalues are the roots to this equation, all guaranteed to be
     // real-valued, because the matrix is symmetric.
-    Scalar c0 = m(0,0)*m(1,1)*m(2,2) + Scalar(2)*m(1,0)*m(2,0)*m(2,1) - m(0,0)*m(2,1)*m(2,1) - m(1,1)*m(2,0)*m(2,0) - m(2,2)*m(1,0)*m(1,0);
-    Scalar c1 = m(0,0)*m(1,1) - m(1,0)*m(1,0) + m(0,0)*m(2,2) - m(2,0)*m(2,0) + m(1,1)*m(2,2) - m(2,1)*m(2,1);
-    Scalar c2 = m(0,0) + m(1,1) + m(2,2);
+    Scalar c0 = m(0, 0) * m(1, 1) * m(2, 2) + Scalar(2) * m(1, 0) * m(2, 0) * m(2, 1) - m(0, 0) * m(2, 1) * m(2, 1) -
+                m(1, 1) * m(2, 0) * m(2, 0) - m(2, 2) * m(1, 0) * m(1, 0);
+    Scalar c1 = m(0, 0) * m(1, 1) - m(1, 0) * m(1, 0) + m(0, 0) * m(2, 2) - m(2, 0) * m(2, 0) + m(1, 1) * m(2, 2) -
+                m(2, 1) * m(2, 1);
+    Scalar c2 = m(0, 0) + m(1, 1) + m(2, 2);
 
     // Construct the parameters used in classifying the roots of the equation
     // and in solving the equation for the roots in closed form.
-    Scalar c2_over_3 = c2*s_inv3;
-    Scalar a_over_3 = (c2*c2_over_3 - c1)*s_inv3;
-    if(a_over_3<Scalar(0))
-      a_over_3 = Scalar(0);
+    Scalar c2_over_3 = c2 * s_inv3;
+    Scalar a_over_3 = (c2 * c2_over_3 - c1) * s_inv3;
+    a_over_3 = numext::maxi(a_over_3, Scalar(0));
 
-    Scalar half_b = Scalar(0.5)*(c0 + c2_over_3*(Scalar(2)*c2_over_3*c2_over_3 - c1));
+    Scalar half_b = Scalar(0.5) * (c0 + c2_over_3 * (Scalar(2) * c2_over_3 * c2_over_3 - c1));
 
-    Scalar q = a_over_3*a_over_3*a_over_3 - half_b*half_b;
-    if(q<Scalar(0))
-      q = Scalar(0);
+    Scalar q = a_over_3 * a_over_3 * a_over_3 - half_b * half_b;
+    q = numext::maxi(q, Scalar(0));
 
     // Compute the eigenvalues by solving for the roots of the polynomial.
     Scalar rho = sqrt(a_over_3);
-    Scalar theta = atan2(sqrt(q),half_b)*s_inv3;  // since sqrt(q) > 0, atan2 is in [0, pi] and theta is in [0, pi/3]
+    Scalar theta = atan2(sqrt(q), half_b) * s_inv3;  // since sqrt(q) > 0, atan2 is in [0, pi] and theta is in [0, pi/3]
     Scalar cos_theta = cos(theta);
     Scalar sin_theta = sin(theta);
     // roots are already sorted, since cos is monotonically decreasing on [0, pi]
-    roots(0) = c2_over_3 - rho*(cos_theta + s_sqrt3*sin_theta); // == 2*rho*cos(theta+2pi/3)
-    roots(1) = c2_over_3 - rho*(cos_theta - s_sqrt3*sin_theta); // == 2*rho*cos(theta+ pi/3)
-    roots(2) = c2_over_3 + Scalar(2)*rho*cos_theta;
+    roots(0) = c2_over_3 - rho * (cos_theta + s_sqrt3 * sin_theta);  // == 2*rho*cos(theta+2pi/3)
+    roots(1) = c2_over_3 - rho * (cos_theta - s_sqrt3 * sin_theta);  // == 2*rho*cos(theta+ pi/3)
+    roots(2) = c2_over_3 + Scalar(2) * rho * cos_theta;
   }
 
-  static inline bool extract_kernel(MatrixType& mat, Ref<VectorType> res, Ref<VectorType> representative)
-  {
-    using std::abs;
+  EIGEN_DEVICE_FUNC static inline bool extract_kernel(MatrixType& mat, Ref<VectorType> res,
+                                                      Ref<VectorType> representative) {
+    EIGEN_USING_STD(abs);
+    EIGEN_USING_STD(sqrt);
     Index i0;
     // Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal):
     mat.diagonal().cwiseAbs().maxCoeff(&i0);
@@ -557,46 +616,43 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
     representative = mat.col(i0);
     Scalar n0, n1;
     VectorType c0, c1;
-    n0 = (c0 = representative.cross(mat.col((i0+1)%3))).squaredNorm();
-    n1 = (c1 = representative.cross(mat.col((i0+2)%3))).squaredNorm();
-    if(n0>n1) res = c0/std::sqrt(n0);
-    else      res = c1/std::sqrt(n1);
+    n0 = (c0 = representative.cross(mat.col((i0 + 1) % 3))).squaredNorm();
+    n1 = (c1 = representative.cross(mat.col((i0 + 2) % 3))).squaredNorm();
+    if (n0 > n1)
+      res = c0 / sqrt(n0);
+    else
+      res = c1 / sqrt(n1);
 
     return true;
   }
 
-  static inline void run(SolverType& solver, const MatrixType& mat, int options)
-  {
+  EIGEN_DEVICE_FUNC static inline void run(SolverType& solver, const MatrixType& mat, int options) {
     eigen_assert(mat.cols() == 3 && mat.cols() == mat.rows());
-    eigen_assert((options&~(EigVecMask|GenEigMask))==0
-            && (options&EigVecMask)!=EigVecMask
-            && "invalid option parameter");
-    bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors;
-    
+    eigen_assert((options & ~(EigVecMask | GenEigMask)) == 0 && (options & EigVecMask) != EigVecMask &&
+                 "invalid option parameter");
+    bool computeEigenvectors = (options & ComputeEigenvectors) == ComputeEigenvectors;
+
     EigenvectorsType& eivecs = solver.m_eivec;
     VectorType& eivals = solver.m_eivalues;
-  
+
     // Shift the matrix to the mean eigenvalue and map the matrix coefficients to [-1:1] to avoid over- and underflow.
     Scalar shift = mat.trace() / Scalar(3);
-    // TODO Avoid this copy. Currently it is necessary to suppress bogus values when determining maxCoeff and for computing the eigenvectors later
+    // TODO Avoid this copy. Currently it is necessary to suppress bogus values when determining maxCoeff and for
+    // computing the eigenvectors later
     MatrixType scaledMat = mat.template selfadjointView<Lower>();
     scaledMat.diagonal().array() -= shift;
     Scalar scale = scaledMat.cwiseAbs().maxCoeff();
-    if(scale > 0) scaledMat /= scale;   // TODO for scale==0 we could save the remaining operations
+    if (scale > 0) scaledMat /= scale;  // TODO for scale==0 we could save the remaining operations
 
     // compute the eigenvalues
-    computeRoots(scaledMat,eivals);
+    computeRoots(scaledMat, eivals);
 
     // compute the eigenvectors
-    if(computeEigenvectors)
-    {
-      if((eivals(2)-eivals(0))<=Eigen::NumTraits<Scalar>::epsilon())
-      {
+    if (computeEigenvectors) {
+      if ((eivals(2) - eivals(0)) <= Eigen::NumTraits<Scalar>::epsilon()) {
         // All three eigenvalues are numerically the same
         eivecs.setIdentity();
-      }
-      else
-      {
+      } else {
         MatrixType tmp;
         tmp = scaledMat;
 
@@ -604,31 +660,27 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
         Scalar d0 = eivals(2) - eivals(1);
         Scalar d1 = eivals(1) - eivals(0);
         Index k(0), l(2);
-        if(d0 > d1)
-        {
-          std::swap(k,l);
+        if (d0 > d1) {
+          numext::swap(k, l);
           d0 = d1;
         }
 
         // Compute the eigenvector of index k
         {
-          tmp.diagonal().array () -= eivals(k);
+          tmp.diagonal().array() -= eivals(k);
           // By construction, 'tmp' is of rank 2, and its kernel corresponds to the respective eigenvector.
           extract_kernel(tmp, eivecs.col(k), eivecs.col(l));
         }
 
         // Compute eigenvector of index l
-        if(d0<=2*Eigen::NumTraits<Scalar>::epsilon()*d1)
-        {
+        if (d0 <= 2 * Eigen::NumTraits<Scalar>::epsilon() * d1) {
           // If d0 is too small, then the two other eigenvalues are numerically the same,
           // and thus we only have to ortho-normalize the near orthogonal vector we saved above.
-          eivecs.col(l) -= eivecs.col(k).dot(eivecs.col(l))*eivecs.col(l);
+          eivecs.col(l) -= eivecs.col(k).dot(eivecs.col(l)) * eivecs.col(l);
           eivecs.col(l).normalize();
-        }
-        else
-        {
+        } else {
           tmp = scaledMat;
-          tmp.diagonal().array () -= eivals(l);
+          tmp.diagonal().array() -= eivals(l);
 
           VectorType dummy;
           extract_kernel(tmp, eivecs.col(l), dummy);
@@ -642,7 +694,7 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
     // Rescale back to the original size.
     eivals *= scale;
     eivals.array() += shift;
-    
+
     solver.m_info = Success;
     solver.m_isInitialized = true;
     solver.m_eigenvectorsOk = computeEigenvectors;
@@ -650,152 +702,149 @@ template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,3
 };
 
 // 2x2 direct eigenvalues decomposition, code from Hauke Heibel
-template<typename SolverType> struct direct_selfadjoint_eigenvalues<SolverType,2,false>
-{
+template <typename SolverType>
+struct direct_selfadjoint_eigenvalues<SolverType, 2, false> {
   typedef typename SolverType::MatrixType MatrixType;
   typedef typename SolverType::RealVectorType VectorType;
   typedef typename SolverType::Scalar Scalar;
   typedef typename SolverType::EigenvectorsType EigenvectorsType;
-  
-  static inline void computeRoots(const MatrixType& m, VectorType& roots)
-  {
-    using std::sqrt;
-    const Scalar t0 = Scalar(0.5) * sqrt( numext::abs2(m(0,0)-m(1,1)) + Scalar(4)*numext::abs2(m(1,0)));
-    const Scalar t1 = Scalar(0.5) * (m(0,0) + m(1,1));
+
+  EIGEN_DEVICE_FUNC static inline void computeRoots(const MatrixType& m, VectorType& roots) {
+    EIGEN_USING_STD(sqrt);
+    const Scalar t0 = Scalar(0.5) * sqrt(numext::abs2(m(0, 0) - m(1, 1)) + Scalar(4) * numext::abs2(m(1, 0)));
+    const Scalar t1 = Scalar(0.5) * (m(0, 0) + m(1, 1));
     roots(0) = t1 - t0;
     roots(1) = t1 + t0;
   }
-  
-  static inline void run(SolverType& solver, const MatrixType& mat, int options)
-  {
-    using std::sqrt;
-    using std::abs;
+
+  EIGEN_DEVICE_FUNC static inline void run(SolverType& solver, const MatrixType& mat, int options) {
+    EIGEN_USING_STD(sqrt);
+    EIGEN_USING_STD(abs);
 
     eigen_assert(mat.cols() == 2 && mat.cols() == mat.rows());
-    eigen_assert((options&~(EigVecMask|GenEigMask))==0
-            && (options&EigVecMask)!=EigVecMask
-            && "invalid option parameter");
-    bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors;
-    
+    eigen_assert((options & ~(EigVecMask | GenEigMask)) == 0 && (options & EigVecMask) != EigVecMask &&
+                 "invalid option parameter");
+    bool computeEigenvectors = (options & ComputeEigenvectors) == ComputeEigenvectors;
+
     EigenvectorsType& eivecs = solver.m_eivec;
     VectorType& eivals = solver.m_eivalues;
-  
-    // map the matrix coefficients to [-1:1] to avoid over- and underflow.
-    Scalar scale = mat.cwiseAbs().maxCoeff();
-    scale = (std::max)(scale,Scalar(1));
-    MatrixType scaledMat = mat / scale;
-    
+
+    // Shift the matrix to the mean eigenvalue and map the matrix coefficients to [-1:1] to avoid over- and underflow.
+    Scalar shift = mat.trace() / Scalar(2);
+    MatrixType scaledMat = mat;
+    scaledMat.coeffRef(0, 1) = mat.coeff(1, 0);
+    scaledMat.diagonal().array() -= shift;
+    Scalar scale = scaledMat.cwiseAbs().maxCoeff();
+    if (scale > Scalar(0)) scaledMat /= scale;
+
     // Compute the eigenvalues
-    computeRoots(scaledMat,eivals);
-    
+    computeRoots(scaledMat, eivals);
+
     // compute the eigen vectors
-    if(computeEigenvectors)
-    {
-      if((eivals(1)-eivals(0))<=abs(eivals(1))*Eigen::NumTraits<Scalar>::epsilon())
-      {
+    if (computeEigenvectors) {
+      if ((eivals(1) - eivals(0)) <= abs(eivals(1)) * Eigen::NumTraits<Scalar>::epsilon()) {
         eivecs.setIdentity();
-      }
-      else
-      {
-        scaledMat.diagonal().array () -= eivals(1);
-        Scalar a2 = numext::abs2(scaledMat(0,0));
-        Scalar c2 = numext::abs2(scaledMat(1,1));
-        Scalar b2 = numext::abs2(scaledMat(1,0));
-        if(a2>c2)
-        {
-          eivecs.col(1) << -scaledMat(1,0), scaledMat(0,0);
-          eivecs.col(1) /= sqrt(a2+b2);
-        }
-        else
-        {
-          eivecs.col(1) << -scaledMat(1,1), scaledMat(1,0);
-          eivecs.col(1) /= sqrt(c2+b2);
+      } else {
+        scaledMat.diagonal().array() -= eivals(1);
+        Scalar a2 = numext::abs2(scaledMat(0, 0));
+        Scalar c2 = numext::abs2(scaledMat(1, 1));
+        Scalar b2 = numext::abs2(scaledMat(1, 0));
+        if (a2 > c2) {
+          eivecs.col(1) << -scaledMat(1, 0), scaledMat(0, 0);
+          eivecs.col(1) /= sqrt(a2 + b2);
+        } else {
+          eivecs.col(1) << -scaledMat(1, 1), scaledMat(1, 0);
+          eivecs.col(1) /= sqrt(c2 + b2);
         }
 
         eivecs.col(0) << eivecs.col(1).unitOrthogonal();
       }
     }
-    
+
     // Rescale back to the original size.
     eivals *= scale;
-    
+    eivals.array() += shift;
+
     solver.m_info = Success;
     solver.m_isInitialized = true;
     solver.m_eigenvectorsOk = computeEigenvectors;
   }
 };
 
-}
+}  // namespace internal
 
-template<typename MatrixType>
-SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>
-::computeDirect(const MatrixType& matrix, int options)
-{
-  internal::direct_selfadjoint_eigenvalues<SelfAdjointEigenSolver,Size,NumTraits<Scalar>::IsComplex>::run(*this,matrix,options);
+template <typename MatrixType>
+EIGEN_DEVICE_FUNC SelfAdjointEigenSolver<MatrixType>& SelfAdjointEigenSolver<MatrixType>::computeDirect(
+    const MatrixType& matrix, int options) {
+  internal::direct_selfadjoint_eigenvalues<SelfAdjointEigenSolver, Size, NumTraits<Scalar>::IsComplex>::run(
+      *this, matrix, options);
   return *this;
 }
 
 namespace internal {
-template<typename RealScalar, typename Scalar, typename Index>
-static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end, Scalar* matrixQ, Index n)
-{
-  using std::abs;
-  RealScalar td = (diag[end-1] - diag[end])*RealScalar(0.5);
-  RealScalar e = subdiag[end-1];
+
+// Francis implicit QR step.
+template <int StorageOrder, typename RealScalar, typename Scalar, typename Index>
+EIGEN_DEVICE_FUNC static void tridiagonal_qr_step(RealScalar* diag, RealScalar* subdiag, Index start, Index end,
+                                                  Scalar* matrixQ, Index n) {
+  // Wilkinson Shift.
+  RealScalar td = (diag[end - 1] - diag[end]) * RealScalar(0.5);
+  RealScalar e = subdiag[end - 1];
   // Note that thanks to scaling, e^2 or td^2 cannot overflow, however they can still
   // underflow thus leading to inf/NaN values when using the following commented code:
-//   RealScalar e2 = numext::abs2(subdiag[end-1]);
-//   RealScalar mu = diag[end] - e2 / (td + (td>0 ? 1 : -1) * sqrt(td*td + e2));
+  //   RealScalar e2 = numext::abs2(subdiag[end-1]);
+  //   RealScalar mu = diag[end] - e2 / (td + (td>0 ? 1 : -1) * sqrt(td*td + e2));
   // This explain the following, somewhat more complicated, version:
   RealScalar mu = diag[end];
-  if(td==0)
-    mu -= abs(e);
-  else
-  {
-    RealScalar e2 = numext::abs2(subdiag[end-1]);
-    RealScalar h = numext::hypot(td,e);
-    if(e2==0)  mu -= (e / (td + (td>0 ? 1 : -1))) * (e / h);
-    else       mu -= e2 / (td + (td>0 ? h : -h));
+  if (numext::is_exactly_zero(td)) {
+    mu -= numext::abs(e);
+  } else if (!numext::is_exactly_zero(e)) {
+    const RealScalar e2 = numext::abs2(e);
+    const RealScalar h = numext::hypot(td, e);
+    if (numext::is_exactly_zero(e2)) {
+      mu -= e / ((td + (td > RealScalar(0) ? h : -h)) / e);
+    } else {
+      mu -= e2 / (td + (td > RealScalar(0) ? h : -h));
+    }
   }
-  
+
   RealScalar x = diag[start] - mu;
   RealScalar z = subdiag[start];
-  for (Index k = start; k < end; ++k)
-  {
+  // If z ever becomes zero, the Givens rotation will be the identity and
+  // z will stay zero for all future iterations.
+  for (Index k = start; k < end && !numext::is_exactly_zero(z); ++k) {
     JacobiRotation<RealScalar> rot;
     rot.makeGivens(x, z);
 
     // do T = G' T G
     RealScalar sdk = rot.s() * diag[k] + rot.c() * subdiag[k];
-    RealScalar dkp1 = rot.s() * subdiag[k] + rot.c() * diag[k+1];
+    RealScalar dkp1 = rot.s() * subdiag[k] + rot.c() * diag[k + 1];
 
-    diag[k] = rot.c() * (rot.c() * diag[k] - rot.s() * subdiag[k]) - rot.s() * (rot.c() * subdiag[k] - rot.s() * diag[k+1]);
-    diag[k+1] = rot.s() * sdk + rot.c() * dkp1;
+    diag[k] =
+        rot.c() * (rot.c() * diag[k] - rot.s() * subdiag[k]) - rot.s() * (rot.c() * subdiag[k] - rot.s() * diag[k + 1]);
+    diag[k + 1] = rot.s() * sdk + rot.c() * dkp1;
     subdiag[k] = rot.c() * sdk - rot.s() * dkp1;
-    
 
-    if (k > start)
-      subdiag[k - 1] = rot.c() * subdiag[k-1] - rot.s() * z;
+    if (k > start) subdiag[k - 1] = rot.c() * subdiag[k - 1] - rot.s() * z;
 
+    // "Chasing the bulge" to return to triangular form.
     x = subdiag[k];
-
-    if (k < end - 1)
-    {
-      z = -rot.s() * subdiag[k+1];
-      subdiag[k + 1] = rot.c() * subdiag[k+1];
+    if (k < end - 1) {
+      z = -rot.s() * subdiag[k + 1];
+      subdiag[k + 1] = rot.c() * subdiag[k + 1];
     }
-    
+
     // apply the givens rotation to the unit matrix Q = Q * G
-    if (matrixQ)
-    {
-      Map<Matrix<Scalar,Dynamic,Dynamic,ColMajor> > q(matrixQ,n,n);
-      q.applyOnTheRight(k,k+1,rot);
+    if (matrixQ) {
+      // FIXME if StorageOrder == RowMajor this operation is not very efficient
+      Map<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > q(matrixQ, n, n);
+      q.applyOnTheRight(k, k + 1, rot);
     }
   }
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SELFADJOINTEIGENSOLVER_H
+#endif  // EIGEN_SELFADJOINTEIGENSOLVER_H
diff --git a/inst/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h b/inst/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h
new file mode 100644
index 00000000..fabc30d6
--- /dev/null
+++ b/inst/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h
@@ -0,0 +1,90 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to LAPACKe
+ *    Self-adjoint eigenvalues/eigenvectors.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_SAEIGENSOLVER_LAPACKE_H
+#define EIGEN_SAEIGENSOLVER_LAPACKE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \internal Specialization for the data types supported by LAPACKe */
+
+#define EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, EIGCOLROW)                   \
+  template <>                                                                                                        \
+  template <typename InputType>                                                                                      \
+  inline SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >&                                      \
+  SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const EigenBase<InputType>& matrix, \
+                                                                                 int options) {                      \
+    eigen_assert(matrix.cols() == matrix.rows());                                                                    \
+    eigen_assert((options & ~(EigVecMask | GenEigMask)) == 0 && (options & EigVecMask) != EigVecMask &&              \
+                 "invalid option parameter");                                                                        \
+    bool computeEigenvectors = (options & ComputeEigenvectors) == ComputeEigenvectors;                               \
+    lapack_int n = internal::convert_index<lapack_int>(matrix.cols()), lda, info;                                    \
+    m_eivalues.resize(n, 1);                                                                                         \
+    m_subdiag.resize(n - 1);                                                                                         \
+    m_eivec = matrix;                                                                                                \
+                                                                                                                     \
+    if (n == 1) {                                                                                                    \
+      m_eivalues.coeffRef(0, 0) = numext::real(m_eivec.coeff(0, 0));                                                 \
+      if (computeEigenvectors) m_eivec.setOnes(n, n);                                                                \
+      m_info = Success;                                                                                              \
+      m_isInitialized = true;                                                                                        \
+      m_eigenvectorsOk = computeEigenvectors;                                                                        \
+      return *this;                                                                                                  \
+    }                                                                                                                \
+                                                                                                                     \
+    lda = internal::convert_index<lapack_int>(m_eivec.outerStride());                                                \
+    char jobz, uplo = 'L' /*, range='A'*/;                                                                           \
+    jobz = computeEigenvectors ? 'V' : 'N';                                                                          \
+                                                                                                                     \
+    info = LAPACKE_##LAPACKE_NAME(LAPACK_COL_MAJOR, jobz, uplo, n, (LAPACKE_TYPE*)m_eivec.data(), lda,               \
+                                  (LAPACKE_RTYPE*)m_eivalues.data());                                                \
+    m_info = (info == 0) ? Success : NoConvergence;                                                                  \
+    m_isInitialized = true;                                                                                          \
+    m_eigenvectorsOk = computeEigenvectors;                                                                          \
+    return *this;                                                                                                    \
+  }
+
+#define EIGEN_LAPACKE_EIG_SELFADJ(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME)       \
+  EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, ColMajor) \
+  EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, RowMajor)
+
+EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev)
+EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev)
+EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev)
+EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev)
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_SAEIGENSOLVER_H
diff --git a/inst/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h b/inst/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h
deleted file mode 100644
index 17c0dadd..00000000
--- a/inst/include/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_MKL.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- Copyright (c) 2011, Intel Corporation. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
- * Neither the name of Intel Corporation nor the names of its contributors may
-   be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
- *    Self-adjoint eigenvalues/eigenvectors.
- ********************************************************************************
-*/
-
-#ifndef EIGEN_SAEIGENSOLVER_MKL_H
-#define EIGEN_SAEIGENSOLVER_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
-
-namespace Eigen { 
-
-/** \internal Specialization for the data types supported by MKL */
-
-#define EIGEN_MKL_EIG_SELFADJ(EIGTYPE, MKLTYPE, MKLRTYPE, MKLNAME, EIGCOLROW, MKLCOLROW ) \
-template<> inline \
-SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >& \
-SelfAdjointEigenSolver<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW> >::compute(const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW>& matrix, int options) \
-{ \
-  eigen_assert(matrix.cols() == matrix.rows()); \
-  eigen_assert((options&~(EigVecMask|GenEigMask))==0 \
-          && (options&EigVecMask)!=EigVecMask \
-          && "invalid option parameter"); \
-  bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors; \
-  lapack_int n = matrix.cols(), lda, matrix_order, info; \
-  m_eivalues.resize(n,1); \
-  m_subdiag.resize(n-1); \
-  m_eivec = matrix; \
-\
-  if(n==1) \
-  { \
-    m_eivalues.coeffRef(0,0) = numext::real(matrix.coeff(0,0)); \
-    if(computeEigenvectors) m_eivec.setOnes(n,n); \
-    m_info = Success; \
-    m_isInitialized = true; \
-    m_eigenvectorsOk = computeEigenvectors; \
-    return *this; \
-  } \
-\
-  lda = matrix.outerStride(); \
-  matrix_order=MKLCOLROW; \
-  char jobz, uplo='L'/*, range='A'*/; \
-  jobz = computeEigenvectors ? 'V' : 'N'; \
-\
-  info = LAPACKE_##MKLNAME( matrix_order, jobz, uplo, n, (MKLTYPE*)m_eivec.data(), lda, (MKLRTYPE*)m_eivalues.data() ); \
-  m_info = (info==0) ? Success : NoConvergence; \
-  m_isInitialized = true; \
-  m_eigenvectorsOk = computeEigenvectors; \
-  return *this; \
-}
-
-
-EIGEN_MKL_EIG_SELFADJ(double,   double,        double, dsyev, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(float,    float,         float,  ssyev, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(dcomplex, MKL_Complex16, double, zheev, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(scomplex, MKL_Complex8,  float,  cheev, ColMajor, LAPACK_COL_MAJOR)
-
-EIGEN_MKL_EIG_SELFADJ(double,   double,        double, dsyev, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(float,    float,         float,  ssyev, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(dcomplex, MKL_Complex16, double, zheev, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_EIG_SELFADJ(scomplex, MKL_Complex8,  float,  cheev, RowMajor, LAPACK_ROW_MAJOR)
-
-} // end namespace Eigen
-
-#endif // EIGEN_SAEIGENSOLVER_H
diff --git a/inst/include/Eigen/src/Eigenvalues/Tridiagonalization.h b/inst/include/Eigen/src/Eigenvalues/Tridiagonalization.h
index 192278d6..9cc92011 100644
--- a/inst/include/Eigen/src/Eigenvalues/Tridiagonalization.h
+++ b/inst/include/Eigen/src/Eigenvalues/Tridiagonalization.h
@@ -11,547 +11,519 @@
 #ifndef EIGEN_TRIDIAGONALIZATION_H
 #define EIGEN_TRIDIAGONALIZATION_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
-  
-template<typename MatrixType> struct TridiagonalizationMatrixTReturnType;
-template<typename MatrixType>
-struct traits<TridiagonalizationMatrixTReturnType<MatrixType> >
-{
-  typedef typename MatrixType::PlainObject ReturnType;
+
+template <typename MatrixType>
+struct TridiagonalizationMatrixTReturnType;
+template <typename MatrixType>
+struct traits<TridiagonalizationMatrixTReturnType<MatrixType>> : public traits<typename MatrixType::PlainObject> {
+  typedef typename MatrixType::PlainObject ReturnType;  // FIXME shall it be a BandMatrix?
+  enum { Flags = 0 };
 };
 
-template<typename MatrixType, typename CoeffVectorType>
-void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs);
-}
+template <typename MatrixType, typename CoeffVectorType>
+EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs);
+}  // namespace internal
 
 /** \eigenvalues_module \ingroup Eigenvalues_Module
-  *
-  *
-  * \class Tridiagonalization
-  *
-  * \brief Tridiagonal decomposition of a selfadjoint matrix
-  *
-  * \tparam _MatrixType the type of the matrix of which we are computing the
-  * tridiagonal decomposition; this is expected to be an instantiation of the
-  * Matrix class template.
-  *
-  * This class performs a tridiagonal decomposition of a selfadjoint matrix \f$ A \f$ such that:
-  * \f$ A = Q T Q^* \f$ where \f$ Q \f$ is unitary and \f$ T \f$ a real symmetric tridiagonal matrix.
-  *
-  * A tridiagonal matrix is a matrix which has nonzero elements only on the
-  * main diagonal and the first diagonal below and above it. The Hessenberg
-  * decomposition of a selfadjoint matrix is in fact a tridiagonal
-  * decomposition. This class is used in SelfAdjointEigenSolver to compute the
-  * eigenvalues and eigenvectors of a selfadjoint matrix.
-  *
-  * Call the function compute() to compute the tridiagonal decomposition of a
-  * given matrix. Alternatively, you can use the Tridiagonalization(const MatrixType&)
-  * constructor which computes the tridiagonal Schur decomposition at
-  * construction time. Once the decomposition is computed, you can use the
-  * matrixQ() and matrixT() functions to retrieve the matrices Q and T in the
-  * decomposition.
-  *
-  * The documentation of Tridiagonalization(const MatrixType&) contains an
-  * example of the typical use of this class.
-  *
-  * \sa class HessenbergDecomposition, class SelfAdjointEigenSolver
-  */
-template<typename _MatrixType> class Tridiagonalization
-{
-  public:
-
-    /** \brief Synonym for the template parameter \p _MatrixType. */
-    typedef _MatrixType MatrixType;
-
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
-
-    enum {
-      Size = MatrixType::RowsAtCompileTime,
-      SizeMinusOne = Size == Dynamic ? Dynamic : (Size > 1 ? Size - 1 : 1),
-      Options = MatrixType::Options,
-      MaxSize = MatrixType::MaxRowsAtCompileTime,
-      MaxSizeMinusOne = MaxSize == Dynamic ? Dynamic : (MaxSize > 1 ? MaxSize - 1 : 1)
-    };
-
-    typedef Matrix<Scalar, SizeMinusOne, 1, Options & ~RowMajor, MaxSizeMinusOne, 1> CoeffVectorType;
-    typedef typename internal::plain_col_type<MatrixType, RealScalar>::type DiagonalType;
-    typedef Matrix<RealScalar, SizeMinusOne, 1, Options & ~RowMajor, MaxSizeMinusOne, 1> SubDiagonalType;
-    typedef typename internal::remove_all<typename MatrixType::RealReturnType>::type MatrixTypeRealView;
-    typedef internal::TridiagonalizationMatrixTReturnType<MatrixTypeRealView> MatrixTReturnType;
-
-    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
-              typename internal::add_const_on_value_type<typename Diagonal<const MatrixType>::RealReturnType>::type,
-              const Diagonal<const MatrixType>
-            >::type DiagonalReturnType;
-
-    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
-              typename internal::add_const_on_value_type<typename Diagonal<
-                Block<const MatrixType,SizeMinusOne,SizeMinusOne> >::RealReturnType>::type,
-              const Diagonal<
-                Block<const MatrixType,SizeMinusOne,SizeMinusOne> >
-            >::type SubDiagonalReturnType;
-
-    /** \brief Return type of matrixQ() */
-    typedef HouseholderSequence<MatrixType,typename internal::remove_all<typename CoeffVectorType::ConjugateReturnType>::type> HouseholderSequenceType;
-
-    /** \brief Default constructor.
-      *
-      * \param [in]  size  Positive integer, size of the matrix whose tridiagonal
-      * decomposition will be computed.
-      *
-      * The default constructor is useful in cases in which the user intends to
-      * perform decompositions via compute().  The \p size parameter is only
-      * used as a hint. It is not an error to give a wrong \p size, but it may
-      * impair performance.
-      *
-      * \sa compute() for an example.
-      */
-    Tridiagonalization(Index size = Size==Dynamic ? 2 : Size)
-      : m_matrix(size,size),
-        m_hCoeffs(size > 1 ? size-1 : 1),
-        m_isInitialized(false)
-    {}
-
-    /** \brief Constructor; computes tridiagonal decomposition of given matrix.
-      *
-      * \param[in]  matrix  Selfadjoint matrix whose tridiagonal decomposition
-      * is to be computed.
-      *
-      * This constructor calls compute() to compute the tridiagonal decomposition.
-      *
-      * Example: \include Tridiagonalization_Tridiagonalization_MatrixType.cpp
-      * Output: \verbinclude Tridiagonalization_Tridiagonalization_MatrixType.out
-      */
-    Tridiagonalization(const MatrixType& matrix)
-      : m_matrix(matrix),
-        m_hCoeffs(matrix.cols() > 1 ? matrix.cols()-1 : 1),
-        m_isInitialized(false)
-    {
-      internal::tridiagonalization_inplace(m_matrix, m_hCoeffs);
-      m_isInitialized = true;
-    }
+ *
+ *
+ * \class Tridiagonalization
+ *
+ * \brief Tridiagonal decomposition of a selfadjoint matrix
+ *
+ * \tparam MatrixType_ the type of the matrix of which we are computing the
+ * tridiagonal decomposition; this is expected to be an instantiation of the
+ * Matrix class template.
+ *
+ * This class performs a tridiagonal decomposition of a selfadjoint matrix \f$ A \f$ such that:
+ * \f$ A = Q T Q^* \f$ where \f$ Q \f$ is unitary and \f$ T \f$ a real symmetric tridiagonal matrix.
+ *
+ * A tridiagonal matrix is a matrix which has nonzero elements only on the
+ * main diagonal and the first diagonal below and above it. The Hessenberg
+ * decomposition of a selfadjoint matrix is in fact a tridiagonal
+ * decomposition. This class is used in SelfAdjointEigenSolver to compute the
+ * eigenvalues and eigenvectors of a selfadjoint matrix.
+ *
+ * Call the function compute() to compute the tridiagonal decomposition of a
+ * given matrix. Alternatively, you can use the Tridiagonalization(const MatrixType&)
+ * constructor which computes the tridiagonal Schur decomposition at
+ * construction time. Once the decomposition is computed, you can use the
+ * matrixQ() and matrixT() functions to retrieve the matrices Q and T in the
+ * decomposition.
+ *
+ * The documentation of Tridiagonalization(const MatrixType&) contains an
+ * example of the typical use of this class.
+ *
+ * \sa class HessenbergDecomposition, class SelfAdjointEigenSolver
+ */
+template <typename MatrixType_>
+class Tridiagonalization {
+ public:
+  /** \brief Synonym for the template parameter \p MatrixType_. */
+  typedef MatrixType_ MatrixType;
 
-    /** \brief Computes tridiagonal decomposition of given matrix.
-      *
-      * \param[in]  matrix  Selfadjoint matrix whose tridiagonal decomposition
-      * is to be computed.
-      * \returns    Reference to \c *this
-      *
-      * The tridiagonal decomposition is computed by bringing the columns of
-      * the matrix successively in the required form using Householder
-      * reflections. The cost is \f$ 4n^3/3 \f$ flops, where \f$ n \f$ denotes
-      * the size of the given matrix.
-      *
-      * This method reuses of the allocated data in the Tridiagonalization
-      * object, if the size of the matrix does not change.
-      *
-      * Example: \include Tridiagonalization_compute.cpp
-      * Output: \verbinclude Tridiagonalization_compute.out
-      */
-    Tridiagonalization& compute(const MatrixType& matrix)
-    {
-      m_matrix = matrix;
-      m_hCoeffs.resize(matrix.rows()-1, 1);
-      internal::tridiagonalization_inplace(m_matrix, m_hCoeffs);
-      m_isInitialized = true;
-      return *this;
-    }
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
+
+  enum {
+    Size = MatrixType::RowsAtCompileTime,
+    SizeMinusOne = Size == Dynamic ? Dynamic : (Size > 1 ? Size - 1 : 1),
+    Options = internal::traits<MatrixType>::Options,
+    MaxSize = MatrixType::MaxRowsAtCompileTime,
+    MaxSizeMinusOne = MaxSize == Dynamic ? Dynamic : (MaxSize > 1 ? MaxSize - 1 : 1)
+  };
+
+  typedef Matrix<Scalar, SizeMinusOne, 1, Options & ~RowMajor, MaxSizeMinusOne, 1> CoeffVectorType;
+  typedef typename internal::plain_col_type<MatrixType, RealScalar>::type DiagonalType;
+  typedef Matrix<RealScalar, SizeMinusOne, 1, Options & ~RowMajor, MaxSizeMinusOne, 1> SubDiagonalType;
+  typedef internal::remove_all_t<typename MatrixType::RealReturnType> MatrixTypeRealView;
+  typedef internal::TridiagonalizationMatrixTReturnType<MatrixTypeRealView> MatrixTReturnType;
+
+  typedef std::conditional_t<NumTraits<Scalar>::IsComplex,
+                             internal::add_const_on_value_type_t<typename Diagonal<const MatrixType>::RealReturnType>,
+                             const Diagonal<const MatrixType>>
+      DiagonalReturnType;
+
+  typedef std::conditional_t<
+      NumTraits<Scalar>::IsComplex,
+      internal::add_const_on_value_type_t<typename Diagonal<const MatrixType, -1>::RealReturnType>,
+      const Diagonal<const MatrixType, -1>>
+      SubDiagonalReturnType;
+
+  /** \brief Return type of matrixQ() */
+  typedef HouseholderSequence<MatrixType, internal::remove_all_t<typename CoeffVectorType::ConjugateReturnType>>
+      HouseholderSequenceType;
+
+  /** \brief Default constructor.
+   *
+   * \param [in]  size  Positive integer, size of the matrix whose tridiagonal
+   * decomposition will be computed.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via compute().  The \p size parameter is only
+   * used as a hint. It is not an error to give a wrong \p size, but it may
+   * impair performance.
+   *
+   * \sa compute() for an example.
+   */
+  explicit Tridiagonalization(Index size = Size == Dynamic ? 2 : Size)
+      : m_matrix(size, size), m_hCoeffs(size > 1 ? size - 1 : 1), m_isInitialized(false) {}
+
+  /** \brief Constructor; computes tridiagonal decomposition of given matrix.
+   *
+   * \param[in]  matrix  Selfadjoint matrix whose tridiagonal decomposition
+   * is to be computed.
+   *
+   * This constructor calls compute() to compute the tridiagonal decomposition.
+   *
+   * Example: \include Tridiagonalization_Tridiagonalization_MatrixType.cpp
+   * Output: \verbinclude Tridiagonalization_Tridiagonalization_MatrixType.out
+   */
+  template <typename InputType>
+  explicit Tridiagonalization(const EigenBase<InputType>& matrix)
+      : m_matrix(matrix.derived()), m_hCoeffs(matrix.cols() > 1 ? matrix.cols() - 1 : 1), m_isInitialized(false) {
+    internal::tridiagonalization_inplace(m_matrix, m_hCoeffs);
+    m_isInitialized = true;
+  }
 
-    /** \brief Returns the Householder coefficients.
-      *
-      * \returns a const reference to the vector of Householder coefficients
-      *
-      * \pre Either the constructor Tridiagonalization(const MatrixType&) or
-      * the member function compute(const MatrixType&) has been called before
-      * to compute the tridiagonal decomposition of a matrix.
-      *
-      * The Householder coefficients allow the reconstruction of the matrix
-      * \f$ Q \f$ in the tridiagonal decomposition from the packed data.
-      *
-      * Example: \include Tridiagonalization_householderCoefficients.cpp
-      * Output: \verbinclude Tridiagonalization_householderCoefficients.out
-      *
-      * \sa packedMatrix(), \ref Householder_Module "Householder module"
-      */
-    inline CoeffVectorType householderCoefficients() const
-    {
-      eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
-      return m_hCoeffs;
-    }
+  /** \brief Computes tridiagonal decomposition of given matrix.
+   *
+   * \param[in]  matrix  Selfadjoint matrix whose tridiagonal decomposition
+   * is to be computed.
+   * \returns    Reference to \c *this
+   *
+   * The tridiagonal decomposition is computed by bringing the columns of
+   * the matrix successively in the required form using Householder
+   * reflections. The cost is \f$ 4n^3/3 \f$ flops, where \f$ n \f$ denotes
+   * the size of the given matrix.
+   *
+   * This method reuses of the allocated data in the Tridiagonalization
+   * object, if the size of the matrix does not change.
+   *
+   * Example: \include Tridiagonalization_compute.cpp
+   * Output: \verbinclude Tridiagonalization_compute.out
+   */
+  template <typename InputType>
+  Tridiagonalization& compute(const EigenBase<InputType>& matrix) {
+    m_matrix = matrix.derived();
+    m_hCoeffs.resize(matrix.rows() - 1, 1);
+    internal::tridiagonalization_inplace(m_matrix, m_hCoeffs);
+    m_isInitialized = true;
+    return *this;
+  }
 
-    /** \brief Returns the internal representation of the decomposition
-      *
-      *	\returns a const reference to a matrix with the internal representation
-      *	         of the decomposition.
-      *
-      * \pre Either the constructor Tridiagonalization(const MatrixType&) or
-      * the member function compute(const MatrixType&) has been called before
-      * to compute the tridiagonal decomposition of a matrix.
-      *
-      * The returned matrix contains the following information:
-      *  - the strict upper triangular part is equal to the input matrix A.
-      *  - the diagonal and lower sub-diagonal represent the real tridiagonal
-      *    symmetric matrix T.
-      *  - the rest of the lower part contains the Householder vectors that,
-      *    combined with Householder coefficients returned by
-      *    householderCoefficients(), allows to reconstruct the matrix Q as
-      *       \f$ Q = H_{N-1} \ldots H_1 H_0 \f$.
-      *    Here, the matrices \f$ H_i \f$ are the Householder transformations
-      *       \f$ H_i = (I - h_i v_i v_i^T) \f$
-      *    where \f$ h_i \f$ is the \f$ i \f$th Householder coefficient and
-      *    \f$ v_i \f$ is the Householder vector defined by
-      *       \f$ v_i = [ 0, \ldots, 0, 1, M(i+2,i), \ldots, M(N-1,i) ]^T \f$
-      *    with M the matrix returned by this function.
-      *
-      * See LAPACK for further details on this packed storage.
-      *
-      * Example: \include Tridiagonalization_packedMatrix.cpp
-      * Output: \verbinclude Tridiagonalization_packedMatrix.out
-      *
-      * \sa householderCoefficients()
-      */
-    inline const MatrixType& packedMatrix() const
-    {
-      eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
-      return m_matrix;
-    }
+  /** \brief Returns the Householder coefficients.
+   *
+   * \returns a const reference to the vector of Householder coefficients
+   *
+   * \pre Either the constructor Tridiagonalization(const MatrixType&) or
+   * the member function compute(const MatrixType&) has been called before
+   * to compute the tridiagonal decomposition of a matrix.
+   *
+   * The Householder coefficients allow the reconstruction of the matrix
+   * \f$ Q \f$ in the tridiagonal decomposition from the packed data.
+   *
+   * Example: \include Tridiagonalization_householderCoefficients.cpp
+   * Output: \verbinclude Tridiagonalization_householderCoefficients.out
+   *
+   * \sa packedMatrix(), \ref Householder_Module "Householder module"
+   */
+  inline CoeffVectorType householderCoefficients() const {
+    eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
+    return m_hCoeffs;
+  }
 
-    /** \brief Returns the unitary matrix Q in the decomposition
-      *
-      * \returns object representing the matrix Q
-      *
-      * \pre Either the constructor Tridiagonalization(const MatrixType&) or
-      * the member function compute(const MatrixType&) has been called before
-      * to compute the tridiagonal decomposition of a matrix.
-      *
-      * This function returns a light-weight object of template class
-      * HouseholderSequence. You can either apply it directly to a matrix or
-      * you can convert it to a matrix of type #MatrixType.
-      *
-      * \sa Tridiagonalization(const MatrixType&) for an example,
-      *     matrixT(), class HouseholderSequence
-      */
-    HouseholderSequenceType matrixQ() const
-    {
-      eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
-      return HouseholderSequenceType(m_matrix, m_hCoeffs.conjugate())
-             .setLength(m_matrix.rows() - 1)
-             .setShift(1);
-    }
+  /** \brief Returns the internal representation of the decomposition
+   *
+   *	\returns a const reference to a matrix with the internal representation
+   *	         of the decomposition.
+   *
+   * \pre Either the constructor Tridiagonalization(const MatrixType&) or
+   * the member function compute(const MatrixType&) has been called before
+   * to compute the tridiagonal decomposition of a matrix.
+   *
+   * The returned matrix contains the following information:
+   *  - the strict upper triangular part is equal to the input matrix A.
+   *  - the diagonal and lower sub-diagonal represent the real tridiagonal
+   *    symmetric matrix T.
+   *  - the rest of the lower part contains the Householder vectors that,
+   *    combined with Householder coefficients returned by
+   *    householderCoefficients(), allows to reconstruct the matrix Q as
+   *       \f$ Q = H_{N-1} \ldots H_1 H_0 \f$.
+   *    Here, the matrices \f$ H_i \f$ are the Householder transformations
+   *       \f$ H_i = (I - h_i v_i v_i^T) \f$
+   *    where \f$ h_i \f$ is the \f$ i \f$th Householder coefficient and
+   *    \f$ v_i \f$ is the Householder vector defined by
+   *       \f$ v_i = [ 0, \ldots, 0, 1, M(i+2,i), \ldots, M(N-1,i) ]^T \f$
+   *    with M the matrix returned by this function.
+   *
+   * See LAPACK for further details on this packed storage.
+   *
+   * Example: \include Tridiagonalization_packedMatrix.cpp
+   * Output: \verbinclude Tridiagonalization_packedMatrix.out
+   *
+   * \sa householderCoefficients()
+   */
+  inline const MatrixType& packedMatrix() const {
+    eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
+    return m_matrix;
+  }
 
-    /** \brief Returns an expression of the tridiagonal matrix T in the decomposition
-      *
-      * \returns expression object representing the matrix T
-      *
-      * \pre Either the constructor Tridiagonalization(const MatrixType&) or
-      * the member function compute(const MatrixType&) has been called before
-      * to compute the tridiagonal decomposition of a matrix.
-      *
-      * Currently, this function can be used to extract the matrix T from internal
-      * data and copy it to a dense matrix object. In most cases, it may be
-      * sufficient to directly use the packed matrix or the vector expressions
-      * returned by diagonal() and subDiagonal() instead of creating a new
-      * dense copy matrix with this function.
-      *
-      * \sa Tridiagonalization(const MatrixType&) for an example,
-      * matrixQ(), packedMatrix(), diagonal(), subDiagonal()
-      */
-    MatrixTReturnType matrixT() const
-    {
-      eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
-      return MatrixTReturnType(m_matrix.real());
-    }
+  /** \brief Returns the unitary matrix Q in the decomposition
+   *
+   * \returns object representing the matrix Q
+   *
+   * \pre Either the constructor Tridiagonalization(const MatrixType&) or
+   * the member function compute(const MatrixType&) has been called before
+   * to compute the tridiagonal decomposition of a matrix.
+   *
+   * This function returns a light-weight object of template class
+   * HouseholderSequence. You can either apply it directly to a matrix or
+   * you can convert it to a matrix of type #MatrixType.
+   *
+   * \sa Tridiagonalization(const MatrixType&) for an example,
+   *     matrixT(), class HouseholderSequence
+   */
+  HouseholderSequenceType matrixQ() const {
+    eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
+    return HouseholderSequenceType(m_matrix, m_hCoeffs.conjugate()).setLength(m_matrix.rows() - 1).setShift(1);
+  }
+
+  /** \brief Returns an expression of the tridiagonal matrix T in the decomposition
+   *
+   * \returns expression object representing the matrix T
+   *
+   * \pre Either the constructor Tridiagonalization(const MatrixType&) or
+   * the member function compute(const MatrixType&) has been called before
+   * to compute the tridiagonal decomposition of a matrix.
+   *
+   * Currently, this function can be used to extract the matrix T from internal
+   * data and copy it to a dense matrix object. In most cases, it may be
+   * sufficient to directly use the packed matrix or the vector expressions
+   * returned by diagonal() and subDiagonal() instead of creating a new
+   * dense copy matrix with this function.
+   *
+   * \sa Tridiagonalization(const MatrixType&) for an example,
+   * matrixQ(), packedMatrix(), diagonal(), subDiagonal()
+   */
+  MatrixTReturnType matrixT() const {
+    eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
+    return MatrixTReturnType(m_matrix.real());
+  }
 
-    /** \brief Returns the diagonal of the tridiagonal matrix T in the decomposition.
-      *
-      * \returns expression representing the diagonal of T
-      *
-      * \pre Either the constructor Tridiagonalization(const MatrixType&) or
-      * the member function compute(const MatrixType&) has been called before
-      * to compute the tridiagonal decomposition of a matrix.
-      *
-      * Example: \include Tridiagonalization_diagonal.cpp
-      * Output: \verbinclude Tridiagonalization_diagonal.out
-      *
-      * \sa matrixT(), subDiagonal()
-      */
-    DiagonalReturnType diagonal() const;
-
-    /** \brief Returns the subdiagonal of the tridiagonal matrix T in the decomposition.
-      *
-      * \returns expression representing the subdiagonal of T
-      *
-      * \pre Either the constructor Tridiagonalization(const MatrixType&) or
-      * the member function compute(const MatrixType&) has been called before
-      * to compute the tridiagonal decomposition of a matrix.
-      *
-      * \sa diagonal() for an example, matrixT()
-      */
-    SubDiagonalReturnType subDiagonal() const;
-
-  protected:
-
-    MatrixType m_matrix;
-    CoeffVectorType m_hCoeffs;
-    bool m_isInitialized;
+  /** \brief Returns the diagonal of the tridiagonal matrix T in the decomposition.
+   *
+   * \returns expression representing the diagonal of T
+   *
+   * \pre Either the constructor Tridiagonalization(const MatrixType&) or
+   * the member function compute(const MatrixType&) has been called before
+   * to compute the tridiagonal decomposition of a matrix.
+   *
+   * Example: \include Tridiagonalization_diagonal.cpp
+   * Output: \verbinclude Tridiagonalization_diagonal.out
+   *
+   * \sa matrixT(), subDiagonal()
+   */
+  DiagonalReturnType diagonal() const;
+
+  /** \brief Returns the subdiagonal of the tridiagonal matrix T in the decomposition.
+   *
+   * \returns expression representing the subdiagonal of T
+   *
+   * \pre Either the constructor Tridiagonalization(const MatrixType&) or
+   * the member function compute(const MatrixType&) has been called before
+   * to compute the tridiagonal decomposition of a matrix.
+   *
+   * \sa diagonal() for an example, matrixT()
+   */
+  SubDiagonalReturnType subDiagonal() const;
+
+ protected:
+  MatrixType m_matrix;
+  CoeffVectorType m_hCoeffs;
+  bool m_isInitialized;
 };
 
-template<typename MatrixType>
-typename Tridiagonalization<MatrixType>::DiagonalReturnType
-Tridiagonalization<MatrixType>::diagonal() const
-{
+template <typename MatrixType>
+typename Tridiagonalization<MatrixType>::DiagonalReturnType Tridiagonalization<MatrixType>::diagonal() const {
   eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
-  return m_matrix.diagonal();
+  return m_matrix.diagonal().real();
 }
 
-template<typename MatrixType>
-typename Tridiagonalization<MatrixType>::SubDiagonalReturnType
-Tridiagonalization<MatrixType>::subDiagonal() const
-{
+template <typename MatrixType>
+typename Tridiagonalization<MatrixType>::SubDiagonalReturnType Tridiagonalization<MatrixType>::subDiagonal() const {
   eigen_assert(m_isInitialized && "Tridiagonalization is not initialized.");
-  Index n = m_matrix.rows();
-  return Block<const MatrixType,SizeMinusOne,SizeMinusOne>(m_matrix, 1, 0, n-1,n-1).diagonal();
+  return m_matrix.template diagonal<-1>().real();
 }
 
 namespace internal {
 
 /** \internal
-  * Performs a tridiagonal decomposition of the selfadjoint matrix \a matA in-place.
-  *
-  * \param[in,out] matA On input the selfadjoint matrix. Only the \b lower triangular part is referenced.
-  *                     On output, the strict upper part is left unchanged, and the lower triangular part
-  *                     represents the T and Q matrices in packed format has detailed below.
-  * \param[out]    hCoeffs returned Householder coefficients (see below)
-  *
-  * On output, the tridiagonal selfadjoint matrix T is stored in the diagonal
-  * and lower sub-diagonal of the matrix \a matA.
-  * The unitary matrix Q is represented in a compact way as a product of
-  * Householder reflectors \f$ H_i \f$ such that:
-  *       \f$ Q = H_{N-1} \ldots H_1 H_0 \f$.
-  * The Householder reflectors are defined as
-  *       \f$ H_i = (I - h_i v_i v_i^T) \f$
-  * where \f$ h_i = hCoeffs[i]\f$ is the \f$ i \f$th Householder coefficient and
-  * \f$ v_i \f$ is the Householder vector defined by
-  *       \f$ v_i = [ 0, \ldots, 0, 1, matA(i+2,i), \ldots, matA(N-1,i) ]^T \f$.
-  *
-  * Implemented from Golub's "Matrix Computations", algorithm 8.3.1.
-  *
-  * \sa Tridiagonalization::packedMatrix()
-  */
-template<typename MatrixType, typename CoeffVectorType>
-void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs)
-{
+ * Performs a tridiagonal decomposition of the selfadjoint matrix \a matA in-place.
+ *
+ * \param[in,out] matA On input the selfadjoint matrix. Only the \b lower triangular part is referenced.
+ *                     On output, the strict upper part is left unchanged, and the lower triangular part
+ *                     represents the T and Q matrices in packed format has detailed below.
+ * \param[out]    hCoeffs returned Householder coefficients (see below)
+ *
+ * On output, the tridiagonal selfadjoint matrix T is stored in the diagonal
+ * and lower sub-diagonal of the matrix \a matA.
+ * The unitary matrix Q is represented in a compact way as a product of
+ * Householder reflectors \f$ H_i \f$ such that:
+ *       \f$ Q = H_{N-1} \ldots H_1 H_0 \f$.
+ * The Householder reflectors are defined as
+ *       \f$ H_i = (I - h_i v_i v_i^T) \f$
+ * where \f$ h_i = hCoeffs[i]\f$ is the \f$ i \f$th Householder coefficient and
+ * \f$ v_i \f$ is the Householder vector defined by
+ *       \f$ v_i = [ 0, \ldots, 0, 1, matA(i+2,i), \ldots, matA(N-1,i) ]^T \f$.
+ *
+ * Implemented from Golub's "Matrix Computations", algorithm 8.3.1.
+ *
+ * \sa Tridiagonalization::packedMatrix()
+ */
+template <typename MatrixType, typename CoeffVectorType>
+EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& matA, CoeffVectorType& hCoeffs) {
   using numext::conj;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
   Index n = matA.rows();
-  eigen_assert(n==matA.cols());
-  eigen_assert(n==hCoeffs.size()+1 || n==1);
-  
-  for (Index i = 0; i<n-1; ++i)
-  {
-    Index remainingSize = n-i-1;
+  eigen_assert(n == matA.cols());
+  eigen_assert(n == hCoeffs.size() + 1 || n == 1);
+
+  for (Index i = 0; i < n - 1; ++i) {
+    Index remainingSize = n - i - 1;
     RealScalar beta;
     Scalar h;
     matA.col(i).tail(remainingSize).makeHouseholderInPlace(h, beta);
 
     // Apply similarity transformation to remaining columns,
     // i.e., A = H A H' where H = I - h v v' and v = matA.col(i).tail(n-i-1)
-    matA.col(i).coeffRef(i+1) = 1;
+    matA.col(i).coeffRef(i + 1) = Scalar(1);
 
-    hCoeffs.tail(n-i-1).noalias() = (matA.bottomRightCorner(remainingSize,remainingSize).template selfadjointView<Lower>()
-                                  * (conj(h) * matA.col(i).tail(remainingSize)));
+    hCoeffs.tail(n - i - 1).noalias() =
+        (matA.bottomRightCorner(remainingSize, remainingSize).template selfadjointView<Lower>() *
+         (conj(h) * matA.col(i).tail(remainingSize)));
 
-    hCoeffs.tail(n-i-1) += (conj(h)*Scalar(-0.5)*(hCoeffs.tail(remainingSize).dot(matA.col(i).tail(remainingSize)))) * matA.col(i).tail(n-i-1);
+    hCoeffs.tail(n - i - 1) +=
+        (conj(h) * RealScalar(-0.5) * (hCoeffs.tail(remainingSize).dot(matA.col(i).tail(remainingSize)))) *
+        matA.col(i).tail(n - i - 1);
 
-    matA.bottomRightCorner(remainingSize, remainingSize).template selfadjointView<Lower>()
-      .rankUpdate(matA.col(i).tail(remainingSize), hCoeffs.tail(remainingSize), -1);
+    matA.bottomRightCorner(remainingSize, remainingSize)
+        .template selfadjointView<Lower>()
+        .rankUpdate(matA.col(i).tail(remainingSize), hCoeffs.tail(remainingSize), Scalar(-1));
 
-    matA.col(i).coeffRef(i+1) = beta;
+    matA.col(i).coeffRef(i + 1) = beta;
     hCoeffs.coeffRef(i) = h;
   }
 }
 
 // forward declaration, implementation at the end of this file
-template<typename MatrixType,
-         int Size=MatrixType::ColsAtCompileTime,
-         bool IsComplex=NumTraits<typename MatrixType::Scalar>::IsComplex>
+template <typename MatrixType, int Size = MatrixType::ColsAtCompileTime,
+          bool IsComplex = NumTraits<typename MatrixType::Scalar>::IsComplex>
 struct tridiagonalization_inplace_selector;
 
 /** \brief Performs a full tridiagonalization in place
-  *
-  * \param[in,out]  mat  On input, the selfadjoint matrix whose tridiagonal
-  *    decomposition is to be computed. Only the lower triangular part referenced.
-  *    The rest is left unchanged. On output, the orthogonal matrix Q
-  *    in the decomposition if \p extractQ is true.
-  * \param[out]  diag  The diagonal of the tridiagonal matrix T in the
-  *    decomposition.
-  * \param[out]  subdiag  The subdiagonal of the tridiagonal matrix T in
-  *    the decomposition.
-  * \param[in]  extractQ  If true, the orthogonal matrix Q in the
-  *    decomposition is computed and stored in \p mat.
-  *
-  * Computes the tridiagonal decomposition of the selfadjoint matrix \p mat in place
-  * such that \f$ mat = Q T Q^* \f$ where \f$ Q \f$ is unitary and \f$ T \f$ a real
-  * symmetric tridiagonal matrix.
-  *
-  * The tridiagonal matrix T is passed to the output parameters \p diag and \p subdiag. If
-  * \p extractQ is true, then the orthogonal matrix Q is passed to \p mat. Otherwise the lower
-  * part of the matrix \p mat is destroyed.
-  *
-  * The vectors \p diag and \p subdiag are not resized. The function
-  * assumes that they are already of the correct size. The length of the
-  * vector \p diag should equal the number of rows in \p mat, and the
-  * length of the vector \p subdiag should be one left.
-  *
-  * This implementation contains an optimized path for 3-by-3 matrices
-  * which is especially useful for plane fitting.
-  *
-  * \note Currently, it requires two temporary vectors to hold the intermediate
-  * Householder coefficients, and to reconstruct the matrix Q from the Householder
-  * reflectors.
-  *
-  * Example (this uses the same matrix as the example in
-  *    Tridiagonalization::Tridiagonalization(const MatrixType&)):
-  *    \include Tridiagonalization_decomposeInPlace.cpp
-  * Output: \verbinclude Tridiagonalization_decomposeInPlace.out
-  *
-  * \sa class Tridiagonalization
-  */
-template<typename MatrixType, typename DiagonalType, typename SubDiagonalType>
-void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
-{
-  eigen_assert(mat.cols()==mat.rows() && diag.size()==mat.rows() && subdiag.size()==mat.rows()-1);
-  tridiagonalization_inplace_selector<MatrixType>::run(mat, diag, subdiag, extractQ);
+ *
+ * \param[in,out]  mat  On input, the selfadjoint matrix whose tridiagonal
+ *    decomposition is to be computed. Only the lower triangular part referenced.
+ *    The rest is left unchanged. On output, the orthogonal matrix Q
+ *    in the decomposition if \p extractQ is true.
+ * \param[out]  diag  The diagonal of the tridiagonal matrix T in the
+ *    decomposition.
+ * \param[out]  subdiag  The subdiagonal of the tridiagonal matrix T in
+ *    the decomposition.
+ * \param[out]  hcoeffs
+ * \param[out]  workspace
+ * \param[in]  extractQ  If true, the orthogonal matrix Q in the
+ *    decomposition is computed and stored in \p mat.
+ *
+ * Computes the tridiagonal decomposition of the selfadjoint matrix \p mat in place
+ * such that \f$ mat = Q T Q^* \f$ where \f$ Q \f$ is unitary and \f$ T \f$ a real
+ * symmetric tridiagonal matrix.
+ *
+ * The tridiagonal matrix T is passed to the output parameters \p diag and \p subdiag. If
+ * \p extractQ is true, then the orthogonal matrix Q is passed to \p mat. Otherwise the lower
+ * part of the matrix \p mat is destroyed.
+ *
+ * The vectors \p diag and \p subdiag are not resized. The function
+ * assumes that they are already of the correct size. The length of the
+ * vector \p diag should equal the number of rows in \p mat, and the
+ * length of the vector \p subdiag should be one left.
+ *
+ * This implementation contains an optimized path for 3-by-3 matrices
+ * which is especially useful for plane fitting.
+ *
+ * \note Currently, it requires two temporary vectors to hold the intermediate
+ * Householder coefficients, and to reconstruct the matrix Q from the Householder
+ * reflectors.
+ *
+ * Example (this uses the same matrix as the example in
+ *    Tridiagonalization::Tridiagonalization(const MatrixType&)):
+ *    \include Tridiagonalization_decomposeInPlace.cpp
+ * Output: \verbinclude Tridiagonalization_decomposeInPlace.out
+ *
+ * \sa class Tridiagonalization
+ */
+template <typename MatrixType, typename DiagonalType, typename SubDiagonalType, typename CoeffVectorType,
+          typename WorkSpaceType>
+EIGEN_DEVICE_FUNC void tridiagonalization_inplace(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag,
+                                                  CoeffVectorType& hcoeffs, WorkSpaceType& workspace, bool extractQ) {
+  eigen_assert(mat.cols() == mat.rows() && diag.size() == mat.rows() && subdiag.size() == mat.rows() - 1);
+  tridiagonalization_inplace_selector<MatrixType>::run(mat, diag, subdiag, hcoeffs, workspace, extractQ);
 }
 
 /** \internal
-  * General full tridiagonalization
-  */
-template<typename MatrixType, int Size, bool IsComplex>
-struct tridiagonalization_inplace_selector
-{
-  typedef typename Tridiagonalization<MatrixType>::CoeffVectorType CoeffVectorType;
+ * General full tridiagonalization
+ */
+template <typename MatrixType, int Size, bool IsComplex>
+struct tridiagonalization_inplace_selector {
   typedef typename Tridiagonalization<MatrixType>::HouseholderSequenceType HouseholderSequenceType;
-  typedef typename MatrixType::Index Index;
-  template<typename DiagonalType, typename SubDiagonalType>
-  static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
-  {
-    CoeffVectorType hCoeffs(mat.cols()-1);
-    tridiagonalization_inplace(mat,hCoeffs);
+  template <typename DiagonalType, typename SubDiagonalType, typename CoeffVectorType, typename WorkSpaceType>
+  static EIGEN_DEVICE_FUNC void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag,
+                                    CoeffVectorType& hCoeffs, WorkSpaceType& workspace, bool extractQ) {
+    tridiagonalization_inplace(mat, hCoeffs);
     diag = mat.diagonal().real();
     subdiag = mat.template diagonal<-1>().real();
-    if(extractQ)
-      mat = HouseholderSequenceType(mat, hCoeffs.conjugate())
-            .setLength(mat.rows() - 1)
-            .setShift(1);
+    if (extractQ) {
+      HouseholderSequenceType(mat, hCoeffs.conjugate()).setLength(mat.rows() - 1).setShift(1).evalTo(mat, workspace);
+    }
   }
 };
 
 /** \internal
-  * Specialization for 3x3 real matrices.
-  * Especially useful for plane fitting.
-  */
-template<typename MatrixType>
-struct tridiagonalization_inplace_selector<MatrixType,3,false>
-{
+ * Specialization for 3x3 real matrices.
+ * Especially useful for plane fitting.
+ */
+template <typename MatrixType>
+struct tridiagonalization_inplace_selector<MatrixType, 3, false> {
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
 
-  template<typename DiagonalType, typename SubDiagonalType>
-  static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, bool extractQ)
-  {
+  template <typename DiagonalType, typename SubDiagonalType, typename CoeffVectorType, typename WorkSpaceType>
+  static EIGEN_DEVICE_FUNC void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType& subdiag, CoeffVectorType&,
+                                    WorkSpaceType&, bool extractQ) {
     using std::sqrt;
-    diag[0] = mat(0,0);
-    RealScalar v1norm2 = numext::abs2(mat(2,0));
-    if(v1norm2 == RealScalar(0))
-    {
-      diag[1] = mat(1,1);
-      diag[2] = mat(2,2);
-      subdiag[0] = mat(1,0);
-      subdiag[1] = mat(2,1);
-      if (extractQ)
-        mat.setIdentity();
-    }
-    else
-    {
-      RealScalar beta = sqrt(numext::abs2(mat(1,0)) + v1norm2);
-      RealScalar invBeta = RealScalar(1)/beta;
-      Scalar m01 = mat(1,0) * invBeta;
-      Scalar m02 = mat(2,0) * invBeta;
-      Scalar q = RealScalar(2)*m01*mat(2,1) + m02*(mat(2,2) - mat(1,1));
-      diag[1] = mat(1,1) + m02*q;
-      diag[2] = mat(2,2) - m02*q;
+    const RealScalar tol = (std::numeric_limits<RealScalar>::min)();
+    diag[0] = mat(0, 0);
+    RealScalar v1norm2 = numext::abs2(mat(2, 0));
+    if (v1norm2 <= tol) {
+      diag[1] = mat(1, 1);
+      diag[2] = mat(2, 2);
+      subdiag[0] = mat(1, 0);
+      subdiag[1] = mat(2, 1);
+      if (extractQ) mat.setIdentity();
+    } else {
+      RealScalar beta = sqrt(numext::abs2(mat(1, 0)) + v1norm2);
+      RealScalar invBeta = RealScalar(1) / beta;
+      Scalar m01 = mat(1, 0) * invBeta;
+      Scalar m02 = mat(2, 0) * invBeta;
+      Scalar q = RealScalar(2) * m01 * mat(2, 1) + m02 * (mat(2, 2) - mat(1, 1));
+      diag[1] = mat(1, 1) + m02 * q;
+      diag[2] = mat(2, 2) - m02 * q;
       subdiag[0] = beta;
-      subdiag[1] = mat(2,1) - m01 * q;
-      if (extractQ)
-      {
-        mat << 1,   0,    0,
-               0, m01,  m02,
-               0, m02, -m01;
+      subdiag[1] = mat(2, 1) - m01 * q;
+      if (extractQ) {
+        mat << 1, 0, 0, 0, m01, m02, 0, m02, -m01;
       }
     }
   }
 };
 
 /** \internal
-  * Trivial specialization for 1x1 matrices
-  */
-template<typename MatrixType, bool IsComplex>
-struct tridiagonalization_inplace_selector<MatrixType,1,IsComplex>
-{
+ * Trivial specialization for 1x1 matrices
+ */
+template <typename MatrixType, bool IsComplex>
+struct tridiagonalization_inplace_selector<MatrixType, 1, IsComplex> {
   typedef typename MatrixType::Scalar Scalar;
 
-  template<typename DiagonalType, typename SubDiagonalType>
-  static void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, bool extractQ)
-  {
-    diag(0,0) = numext::real(mat(0,0));
-    if(extractQ)
-      mat(0,0) = Scalar(1);
+  template <typename DiagonalType, typename SubDiagonalType, typename CoeffVectorType, typename WorkSpaceType>
+  static EIGEN_DEVICE_FUNC void run(MatrixType& mat, DiagonalType& diag, SubDiagonalType&, CoeffVectorType&,
+                                    WorkSpaceType&, bool extractQ) {
+    diag(0, 0) = numext::real(mat(0, 0));
+    if (extractQ) mat(0, 0) = Scalar(1);
   }
 };
 
 /** \internal
-  * \eigenvalues_module \ingroup Eigenvalues_Module
-  *
-  * \brief Expression type for return value of Tridiagonalization::matrixT()
-  *
-  * \tparam MatrixType type of underlying dense matrix
-  */
-template<typename MatrixType> struct TridiagonalizationMatrixTReturnType
-: public ReturnByValue<TridiagonalizationMatrixTReturnType<MatrixType> >
-{
-    typedef typename MatrixType::Index Index;
-  public:
-    /** \brief Constructor.
-      *
-      * \param[in] mat The underlying dense matrix
-      */
-    TridiagonalizationMatrixTReturnType(const MatrixType& mat) : m_matrix(mat) { }
-
-    template <typename ResultType>
-    inline void evalTo(ResultType& result) const
-    {
-      result.setZero();
-      result.template diagonal<1>() = m_matrix.template diagonal<-1>().conjugate();
-      result.diagonal() = m_matrix.diagonal();
-      result.template diagonal<-1>() = m_matrix.template diagonal<-1>();
-    }
+ * \eigenvalues_module \ingroup Eigenvalues_Module
+ *
+ * \brief Expression type for return value of Tridiagonalization::matrixT()
+ *
+ * \tparam MatrixType type of underlying dense matrix
+ */
+template <typename MatrixType>
+struct TridiagonalizationMatrixTReturnType : public ReturnByValue<TridiagonalizationMatrixTReturnType<MatrixType>> {
+ public:
+  /** \brief Constructor.
+   *
+   * \param[in] mat The underlying dense matrix
+   */
+  TridiagonalizationMatrixTReturnType(const MatrixType& mat) : m_matrix(mat) {}
+
+  template <typename ResultType>
+  inline void evalTo(ResultType& result) const {
+    result.setZero();
+    result.template diagonal<1>() = m_matrix.template diagonal<-1>().conjugate();
+    result.diagonal() = m_matrix.diagonal();
+    result.template diagonal<-1>() = m_matrix.template diagonal<-1>();
+  }
 
-    Index rows() const { return m_matrix.rows(); }
-    Index cols() const { return m_matrix.cols(); }
+  constexpr Index rows() const noexcept { return m_matrix.rows(); }
+  constexpr Index cols() const noexcept { return m_matrix.cols(); }
 
-  protected:
-    typename MatrixType::Nested m_matrix;
+ protected:
+  typename MatrixType::Nested m_matrix;
 };
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_TRIDIAGONALIZATION_H
+#endif  // EIGEN_TRIDIAGONALIZATION_H
diff --git a/inst/include/Eigen/src/Geometry/AlignedBox.h b/inst/include/Eigen/src/Geometry/AlignedBox.h
index 7e1cd9eb..e97a8f29 100644
--- a/inst/include/Eigen/src/Geometry/AlignedBox.h
+++ b/inst/include/Eigen/src/Geometry/AlignedBox.h
@@ -7,153 +7,204 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+// Function void Eigen::AlignedBox::transform(const Transform& transform)
+// is provided under the following license agreement:
+//
+// Software License Agreement (BSD License)
+//
+// Copyright (c) 2011-2014, Willow Garage, Inc.
+// Copyright (c) 2014-2015, Open Source Robotics Foundation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions
+// are met:
+//
+//  * Redistributions of source code must retain the above copyright
+//    notice, this list of conditions and the following disclaimer.
+//  * Redistributions in binary form must reproduce the above
+//    copyright notice, this list of conditions and the following
+//    disclaimer in the documentation and/or other materials provided
+//    with the distribution.
+//  * Neither the name of Open Source Robotics Foundation nor the names of its
+//    contributors may be used to endorse or promote products derived
+//    from this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+// COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+// POSSIBILITY OF SUCH DAMAGE.
+
 #ifndef EIGEN_ALIGNEDBOX_H
 #define EIGEN_ALIGNEDBOX_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \geometry_module \ingroup Geometry_Module
-  *
-  *
-  * \class AlignedBox
-  *
-  * \brief An axis aligned box
-  *
-  * \tparam _Scalar the type of the scalar coefficients
-  * \tparam _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
-  *
-  * This class represents an axis aligned box as a pair of the minimal and maximal corners.
-  * \warning The result of most methods is undefined when applied to an empty box. You can check for empty boxes using isEmpty().
-  * \sa alignedboxtypedefs
-  */
-template <typename _Scalar, int _AmbientDim>
-class AlignedBox
-{
-public:
-EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
-  enum { AmbientDimAtCompileTime = _AmbientDim };
-  typedef _Scalar                                   Scalar;
-  typedef NumTraits<Scalar>                         ScalarTraits;
-  typedef DenseIndex                                Index;
-  typedef typename ScalarTraits::Real               RealScalar;
-  typedef typename ScalarTraits::NonInteger      NonInteger;
-  typedef Matrix<Scalar,AmbientDimAtCompileTime,1>  VectorType;
+ *
+ *
+ * \class AlignedBox
+ *
+ * \brief An axis aligned box
+ *
+ * \tparam Scalar_ the type of the scalar coefficients
+ * \tparam AmbientDim_ the dimension of the ambient space, can be a compile time value or Dynamic.
+ *
+ * This class represents an axis aligned box as a pair of the minimal and maximal corners.
+ * \warning The result of most methods is undefined when applied to an empty box. You can check for empty boxes using
+ * isEmpty(). \sa alignedboxtypedefs
+ */
+template <typename Scalar_, int AmbientDim_>
+class AlignedBox {
+ public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar_, AmbientDim_)
+  enum { AmbientDimAtCompileTime = AmbientDim_ };
+  typedef Scalar_ Scalar;
+  typedef NumTraits<Scalar> ScalarTraits;
+  typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
+  typedef typename ScalarTraits::Real RealScalar;
+  typedef typename ScalarTraits::NonInteger NonInteger;
+  typedef Matrix<Scalar, AmbientDimAtCompileTime, 1> VectorType;
+  typedef CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const VectorType, const VectorType> VectorTypeSum;
 
   /** Define constants to name the corners of a 1D, 2D or 3D axis aligned bounding box */
-  enum CornerType
-  {
+  enum CornerType {
     /** 1D names @{ */
-    Min=0, Max=1,
+    Min = 0,
+    Max = 1,
     /** @} */
 
     /** Identifier for 2D corner @{ */
-    BottomLeft=0, BottomRight=1,
-    TopLeft=2, TopRight=3,
+    BottomLeft = 0,
+    BottomRight = 1,
+    TopLeft = 2,
+    TopRight = 3,
     /** @} */
 
     /** Identifier for 3D corner  @{ */
-    BottomLeftFloor=0, BottomRightFloor=1,
-    TopLeftFloor=2, TopRightFloor=3,
-    BottomLeftCeil=4, BottomRightCeil=5,
-    TopLeftCeil=6, TopRightCeil=7
+    BottomLeftFloor = 0,
+    BottomRightFloor = 1,
+    TopLeftFloor = 2,
+    TopRightFloor = 3,
+    BottomLeftCeil = 4,
+    BottomRightCeil = 5,
+    TopLeftCeil = 6,
+    TopRightCeil = 7
     /** @} */
   };
 
-
   /** Default constructor initializing a null box. */
-  inline AlignedBox()
-  { if (AmbientDimAtCompileTime!=Dynamic) setEmpty(); }
+  EIGEN_DEVICE_FUNC inline AlignedBox() {
+    if (EIGEN_CONST_CONDITIONAL(AmbientDimAtCompileTime != Dynamic)) setEmpty();
+  }
 
   /** Constructs a null box with \a _dim the dimension of the ambient space. */
-  inline explicit AlignedBox(Index _dim) : m_min(_dim), m_max(_dim)
-  { setEmpty(); }
+  EIGEN_DEVICE_FUNC inline explicit AlignedBox(Index _dim) : m_min(_dim), m_max(_dim) { setEmpty(); }
 
   /** Constructs a box with extremities \a _min and \a _max.
-   * \warning If either component of \a _min is larger than the same component of \a _max, the constructed box is empty. */
-  template<typename OtherVectorType1, typename OtherVectorType2>
-  inline AlignedBox(const OtherVectorType1& _min, const OtherVectorType2& _max) : m_min(_min), m_max(_max) {}
+   * \warning If either component of \a _min is larger than the same component of \a _max, the constructed box is empty.
+   */
+  template <typename OtherVectorType1, typename OtherVectorType2>
+  EIGEN_DEVICE_FUNC inline AlignedBox(const OtherVectorType1& _min, const OtherVectorType2& _max)
+      : m_min(_min), m_max(_max) {}
 
   /** Constructs a box containing a single point \a p. */
-  template<typename Derived>
-  inline explicit AlignedBox(const MatrixBase<Derived>& p) : m_min(p), m_max(m_min)
-  { }
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline explicit AlignedBox(const MatrixBase<Derived>& p) : m_min(p), m_max(m_min) {}
 
-  ~AlignedBox() {}
+  EIGEN_DEVICE_FUNC ~AlignedBox() {}
 
   /** \returns the dimension in which the box holds */
-  inline Index dim() const { return AmbientDimAtCompileTime==Dynamic ? m_min.size() : Index(AmbientDimAtCompileTime); }
+  EIGEN_DEVICE_FUNC inline Index dim() const {
+    return AmbientDimAtCompileTime == Dynamic ? m_min.size() : Index(AmbientDimAtCompileTime);
+  }
 
   /** \deprecated use isEmpty() */
-  inline bool isNull() const { return isEmpty(); }
+  EIGEN_DEVICE_FUNC inline bool isNull() const { return isEmpty(); }
 
   /** \deprecated use setEmpty() */
-  inline void setNull() { setEmpty(); }
+  EIGEN_DEVICE_FUNC inline void setNull() { setEmpty(); }
 
   /** \returns true if the box is empty.
    * \sa setEmpty */
-  inline bool isEmpty() const { return (m_min.array() > m_max.array()).any(); }
+  EIGEN_DEVICE_FUNC inline bool isEmpty() const { return (m_min.array() > m_max.array()).any(); }
 
   /** Makes \c *this an empty box.
    * \sa isEmpty */
-  inline void setEmpty()
-  {
-    m_min.setConstant( ScalarTraits::highest() );
-    m_max.setConstant( ScalarTraits::lowest() );
+  EIGEN_DEVICE_FUNC inline void setEmpty() {
+    m_min.setConstant(ScalarTraits::highest());
+    m_max.setConstant(ScalarTraits::lowest());
   }
 
   /** \returns the minimal corner */
-  inline const VectorType& (min)() const { return m_min; }
+  EIGEN_DEVICE_FUNC inline const VectorType&(min)() const { return m_min; }
   /** \returns a non const reference to the minimal corner */
-  inline VectorType& (min)() { return m_min; }
+  EIGEN_DEVICE_FUNC inline VectorType&(min)() { return m_min; }
   /** \returns the maximal corner */
-  inline const VectorType& (max)() const { return m_max; }
+  EIGEN_DEVICE_FUNC inline const VectorType&(max)() const { return m_max; }
   /** \returns a non const reference to the maximal corner */
-  inline VectorType& (max)() { return m_max; }
+  EIGEN_DEVICE_FUNC inline VectorType&(max)() { return m_max; }
 
   /** \returns the center of the box */
-  inline const CwiseUnaryOp<internal::scalar_quotient1_op<Scalar>,
-                            const CwiseBinaryOp<internal::scalar_sum_op<Scalar>, const VectorType, const VectorType> >
-  center() const
-  { return (m_min+m_max)/2; }
+  EIGEN_DEVICE_FUNC inline const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(VectorTypeSum, RealScalar, quotient)
+      center() const {
+    return (m_min + m_max) / RealScalar(2);
+  }
 
   /** \returns the lengths of the sides of the bounding box.
-    * Note that this function does not get the same
-    * result for integral or floating scalar types: see
-    */
-  inline const CwiseBinaryOp< internal::scalar_difference_op<Scalar>, const VectorType, const VectorType> sizes() const
-  { return m_max - m_min; }
+   * Note that this function does not get the same
+   * result for integral or floating scalar types: see
+   */
+  EIGEN_DEVICE_FUNC inline const CwiseBinaryOp<internal::scalar_difference_op<Scalar, Scalar>, const VectorType,
+                                               const VectorType>
+  sizes() const {
+    return m_max - m_min;
+  }
 
   /** \returns the volume of the bounding box */
-  inline Scalar volume() const
-  { return sizes().prod(); }
+  EIGEN_DEVICE_FUNC inline Scalar volume() const { return isEmpty() ? Scalar(0) : sizes().prod(); }
 
   /** \returns an expression for the bounding box diagonal vector
-    * if the length of the diagonal is needed: diagonal().norm()
-    * will provide it.
-    */
-  inline CwiseBinaryOp< internal::scalar_difference_op<Scalar>, const VectorType, const VectorType> diagonal() const
-  { return sizes(); }
+   * if the length of the diagonal is needed: diagonal().norm()
+   * will provide it.
+   */
+  EIGEN_DEVICE_FUNC inline CwiseBinaryOp<internal::scalar_difference_op<Scalar, Scalar>, const VectorType,
+                                         const VectorType>
+  diagonal() const {
+    return sizes();
+  }
 
   /** \returns the vertex of the bounding box at the corner defined by
-    * the corner-id corner. It works only for a 1D, 2D or 3D bounding box.
-    * For 1D bounding boxes corners are named by 2 enum constants:
-    * BottomLeft and BottomRight.
-    * For 2D bounding boxes, corners are named by 4 enum constants:
-    * BottomLeft, BottomRight, TopLeft, TopRight.
-    * For 3D bounding boxes, the following names are added:
-    * BottomLeftCeil, BottomRightCeil, TopLeftCeil, TopRightCeil.
-    */
-  inline VectorType corner(CornerType corner) const
-  {
-    EIGEN_STATIC_ASSERT(_AmbientDim <= 3, THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE);
+   * the corner-id corner. It works only for a 1D, 2D or 3D bounding box.
+   * For 1D bounding boxes corners are named by 2 enum constants:
+   * BottomLeft and BottomRight.
+   * For 2D bounding boxes, corners are named by 4 enum constants:
+   * BottomLeft, BottomRight, TopLeft, TopRight.
+   * For 3D bounding boxes, the following names are added:
+   * BottomLeftCeil, BottomRightCeil, TopLeftCeil, TopRightCeil.
+   */
+  EIGEN_DEVICE_FUNC inline VectorType corner(CornerType corner) const {
+    EIGEN_STATIC_ASSERT(AmbientDim_ <= 3, THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE);
 
     VectorType res;
 
     Index mult = 1;
-    for(Index d=0; d<dim(); ++d)
-    {
-      if( mult & corner ) res[d] = m_max[d];
-      else                res[d] = m_min[d];
+    for (Index d = 0; d < dim(); ++d) {
+      if (mult & corner)
+        res[d] = m_max[d];
+      else
+        res[d] = m_min[d];
       mult *= 2;
     }
     return res;
@@ -161,45 +212,40 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
 
   /** \returns a random point inside the bounding box sampled with
    * a uniform distribution */
-  inline VectorType sample() const
-  {
+  EIGEN_DEVICE_FUNC inline VectorType sample() const {
     VectorType r(dim());
-    for(Index d=0; d<dim(); ++d)
-    {
-      if(!ScalarTraits::IsInteger)
-      {
-        r[d] = m_min[d] + (m_max[d]-m_min[d])
-             * internal::random<Scalar>(Scalar(0), Scalar(1));
-      }
-      else
+    for (Index d = 0; d < dim(); ++d) {
+      if (!ScalarTraits::IsInteger) {
+        r[d] = m_min[d] + (m_max[d] - m_min[d]) * internal::random<Scalar>(Scalar(0), Scalar(1));
+      } else
         r[d] = internal::random(m_min[d], m_max[d]);
     }
     return r;
   }
 
   /** \returns true if the point \a p is inside the box \c *this. */
-  template<typename Derived>
-  inline bool contains(const MatrixBase<Derived>& p) const
-  {
-    typename internal::nested<Derived,2>::type p_n(p.derived());
-    return (m_min.array()<=p_n.array()).all() && (p_n.array()<=m_max.array()).all();
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline bool contains(const MatrixBase<Derived>& p) const {
+    typename internal::nested_eval<Derived, 2>::type p_n(p.derived());
+    return (m_min.array() <= p_n.array()).all() && (p_n.array() <= m_max.array()).all();
   }
 
   /** \returns true if the box \a b is entirely inside the box \c *this. */
-  inline bool contains(const AlignedBox& b) const
-  { return (m_min.array()<=(b.min)().array()).all() && ((b.max)().array()<=m_max.array()).all(); }
+  EIGEN_DEVICE_FUNC inline bool contains(const AlignedBox& b) const {
+    return (m_min.array() <= (b.min)().array()).all() && ((b.max)().array() <= m_max.array()).all();
+  }
 
   /** \returns true if the box \a b is intersecting the box \c *this.
    * \sa intersection, clamp */
-  inline bool intersects(const AlignedBox& b) const
-  { return (m_min.array()<=(b.max)().array()).all() && ((b.min)().array()<=m_max.array()).all(); }
+  EIGEN_DEVICE_FUNC inline bool intersects(const AlignedBox& b) const {
+    return (m_min.array() <= (b.max)().array()).all() && ((b.min)().array() <= m_max.array()).all();
+  }
 
   /** Extends \c *this such that it contains the point \a p and returns a reference to \c *this.
    * \sa extend(const AlignedBox&) */
-  template<typename Derived>
-  inline AlignedBox& extend(const MatrixBase<Derived>& p)
-  {
-    typename internal::nested<Derived,2>::type p_n(p.derived());
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline AlignedBox& extend(const MatrixBase<Derived>& p) {
+    typename internal::nested_eval<Derived, 2>::type p_n(p.derived());
     m_min = m_min.cwiseMin(p_n);
     m_max = m_max.cwiseMax(p_n);
     return *this;
@@ -207,8 +253,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
 
   /** Extends \c *this such that it contains the box \a b and returns a reference to \c *this.
    * \sa merged, extend(const MatrixBase&) */
-  inline AlignedBox& extend(const AlignedBox& b)
-  {
+  EIGEN_DEVICE_FUNC inline AlignedBox& extend(const AlignedBox& b) {
     m_min = m_min.cwiseMin(b.m_min);
     m_max = m_max.cwiseMax(b.m_max);
     return *this;
@@ -217,8 +262,7 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
   /** Clamps \c *this by the box \a b and returns a reference to \c *this.
    * \note If the boxes don't intersect, the resulting box is empty.
    * \sa intersection(), intersects() */
-  inline AlignedBox& clamp(const AlignedBox& b)
-  {
+  EIGEN_DEVICE_FUNC inline AlignedBox& clamp(const AlignedBox& b) {
     m_min = m_min.cwiseMax(b.m_min);
     m_max = m_max.cwiseMin(b.m_max);
     return *this;
@@ -227,166 +271,215 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
   /** Returns an AlignedBox that is the intersection of \a b and \c *this
    * \note If the boxes don't intersect, the resulting box is empty.
    * \sa intersects(), clamp, contains()  */
-  inline AlignedBox intersection(const AlignedBox& b) const
-  {return AlignedBox(m_min.cwiseMax(b.m_min), m_max.cwiseMin(b.m_max)); }
+  EIGEN_DEVICE_FUNC inline AlignedBox intersection(const AlignedBox& b) const {
+    return AlignedBox(m_min.cwiseMax(b.m_min), m_max.cwiseMin(b.m_max));
+  }
 
   /** Returns an AlignedBox that is the union of \a b and \c *this.
-   * \note Merging with an empty box may result in a box bigger than \c *this. 
+   * \note Merging with an empty box may result in a box bigger than \c *this.
    * \sa extend(const AlignedBox&) */
-  inline AlignedBox merged(const AlignedBox& b) const
-  { return AlignedBox(m_min.cwiseMin(b.m_min), m_max.cwiseMax(b.m_max)); }
+  EIGEN_DEVICE_FUNC inline AlignedBox merged(const AlignedBox& b) const {
+    return AlignedBox(m_min.cwiseMin(b.m_min), m_max.cwiseMax(b.m_max));
+  }
 
   /** Translate \c *this by the vector \a t and returns a reference to \c *this. */
-  template<typename Derived>
-  inline AlignedBox& translate(const MatrixBase<Derived>& a_t)
-  {
-    const typename internal::nested<Derived,2>::type t(a_t.derived());
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline AlignedBox& translate(const MatrixBase<Derived>& a_t) {
+    const typename internal::nested_eval<Derived, 2>::type t(a_t.derived());
     m_min += t;
     m_max += t;
     return *this;
   }
 
+  /** \returns a copy of \c *this translated by the vector \a t. */
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline AlignedBox translated(const MatrixBase<Derived>& a_t) const {
+    AlignedBox result(m_min, m_max);
+    result.translate(a_t);
+    return result;
+  }
+
   /** \returns the squared distance between the point \a p and the box \c *this,
-    * and zero if \a p is inside the box.
-    * \sa exteriorDistance(const MatrixBase&), squaredExteriorDistance(const AlignedBox&)
-    */
-  template<typename Derived>
-  inline Scalar squaredExteriorDistance(const MatrixBase<Derived>& p) const;
+   * and zero if \a p is inside the box.
+   * \sa exteriorDistance(const MatrixBase&), squaredExteriorDistance(const AlignedBox&)
+   */
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline Scalar squaredExteriorDistance(const MatrixBase<Derived>& p) const;
 
   /** \returns the squared distance between the boxes \a b and \c *this,
-    * and zero if the boxes intersect.
-    * \sa exteriorDistance(const AlignedBox&), squaredExteriorDistance(const MatrixBase&)
-    */
-  inline Scalar squaredExteriorDistance(const AlignedBox& b) const;
+   * and zero if the boxes intersect.
+   * \sa exteriorDistance(const AlignedBox&), squaredExteriorDistance(const MatrixBase&)
+   */
+  EIGEN_DEVICE_FUNC inline Scalar squaredExteriorDistance(const AlignedBox& b) const;
 
   /** \returns the distance between the point \a p and the box \c *this,
-    * and zero if \a p is inside the box.
-    * \sa squaredExteriorDistance(const MatrixBase&), exteriorDistance(const AlignedBox&)
-    */
-  template<typename Derived>
-  inline NonInteger exteriorDistance(const MatrixBase<Derived>& p) const
-  { using std::sqrt; return sqrt(NonInteger(squaredExteriorDistance(p))); }
+   * and zero if \a p is inside the box.
+   * \sa squaredExteriorDistance(const MatrixBase&), exteriorDistance(const AlignedBox&)
+   */
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline NonInteger exteriorDistance(const MatrixBase<Derived>& p) const {
+    EIGEN_USING_STD(sqrt) return sqrt(NonInteger(squaredExteriorDistance(p)));
+  }
 
   /** \returns the distance between the boxes \a b and \c *this,
-    * and zero if the boxes intersect.
-    * \sa squaredExteriorDistance(const AlignedBox&), exteriorDistance(const MatrixBase&)
-    */
-  inline NonInteger exteriorDistance(const AlignedBox& b) const
-  { using std::sqrt; return sqrt(NonInteger(squaredExteriorDistance(b))); }
+   * and zero if the boxes intersect.
+   * \sa squaredExteriorDistance(const AlignedBox&), exteriorDistance(const MatrixBase&)
+   */
+  EIGEN_DEVICE_FUNC inline NonInteger exteriorDistance(const AlignedBox& b) const {
+    EIGEN_USING_STD(sqrt) return sqrt(NonInteger(squaredExteriorDistance(b)));
+  }
+
+  /**
+   * Specialization of transform for pure translation.
+   */
+  template <int Mode, int Options>
+  EIGEN_DEVICE_FUNC inline void transform(
+      const typename Transform<Scalar, AmbientDimAtCompileTime, Mode, Options>::TranslationType& translation) {
+    this->translate(translation);
+  }
+
+  /**
+   * Transforms this box by \a transform and recomputes it to
+   * still be an axis-aligned box.
+   *
+   * \note This method is provided under BSD license (see the top of this file).
+   */
+  template <int Mode, int Options>
+  EIGEN_DEVICE_FUNC inline void transform(const Transform<Scalar, AmbientDimAtCompileTime, Mode, Options>& transform) {
+    // Only Affine and Isometry transforms are currently supported.
+    EIGEN_STATIC_ASSERT(Mode == Affine || Mode == AffineCompact || Mode == Isometry,
+                        THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS);
+
+    // Method adapted from FCL src/shape/geometric_shapes_utility.cpp#computeBV<AABB, Box>(...)
+    // https://github.com/flexible-collision-library/fcl/blob/fcl-0.4/src/shape/geometric_shapes_utility.cpp#L292
+    //
+    // Here's a nice explanation why it works: https://zeuxcg.org/2010/10/17/aabb-from-obb-with-component-wise-abs/
+
+    // two times rotated extent
+    const VectorType rotated_extent_2 = transform.linear().cwiseAbs() * sizes();
+    // two times new center
+    const VectorType rotated_center_2 =
+        transform.linear() * (this->m_max + this->m_min) + Scalar(2) * transform.translation();
+
+    this->m_max = (rotated_center_2 + rotated_extent_2) / Scalar(2);
+    this->m_min = (rotated_center_2 - rotated_extent_2) / Scalar(2);
+  }
+
+  /**
+   * \returns a copy of \c *this transformed by \a transform and recomputed to
+   * still be an axis-aligned box.
+   */
+  template <int Mode, int Options>
+  EIGEN_DEVICE_FUNC AlignedBox
+  transformed(const Transform<Scalar, AmbientDimAtCompileTime, Mode, Options>& transform) const {
+    AlignedBox result(m_min, m_max);
+    result.transform(transform);
+    return result;
+  }
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<AlignedBox,
-           AlignedBox<NewScalarType,AmbientDimAtCompileTime> >::type cast() const
-  {
-    return typename internal::cast_return_type<AlignedBox,
-                    AlignedBox<NewScalarType,AmbientDimAtCompileTime> >::type(*this);
+   *
+   * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+   * then this function smartly returns a const reference to \c *this.
+   */
+  template <typename NewScalarType>
+  EIGEN_DEVICE_FUNC inline
+      typename internal::cast_return_type<AlignedBox, AlignedBox<NewScalarType, AmbientDimAtCompileTime> >::type
+      cast() const {
+    return typename internal::cast_return_type<AlignedBox, AlignedBox<NewScalarType, AmbientDimAtCompileTime> >::type(
+        *this);
   }
 
   /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit AlignedBox(const AlignedBox<OtherScalarType,AmbientDimAtCompileTime>& other)
-  {
+  template <typename OtherScalarType>
+  EIGEN_DEVICE_FUNC inline explicit AlignedBox(const AlignedBox<OtherScalarType, AmbientDimAtCompileTime>& other) {
     m_min = (other.min)().template cast<Scalar>();
     m_max = (other.max)().template cast<Scalar>();
   }
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const AlignedBox& other, const RealScalar& prec = ScalarTraits::dummy_precision()) const
-  { return m_min.isApprox(other.m_min, prec) && m_max.isApprox(other.m_max, prec); }
-
-protected:
+   * determined by \a prec.
+   *
+   * \sa MatrixBase::isApprox() */
+  EIGEN_DEVICE_FUNC bool isApprox(const AlignedBox& other,
+                                  const RealScalar& prec = ScalarTraits::dummy_precision()) const {
+    return m_min.isApprox(other.m_min, prec) && m_max.isApprox(other.m_max, prec);
+  }
 
+ protected:
   VectorType m_min, m_max;
 };
 
-
-
-template<typename Scalar,int AmbientDim>
-template<typename Derived>
-inline Scalar AlignedBox<Scalar,AmbientDim>::squaredExteriorDistance(const MatrixBase<Derived>& a_p) const
-{
-  typename internal::nested<Derived,2*AmbientDim>::type p(a_p.derived());
+template <typename Scalar, int AmbientDim>
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline Scalar AlignedBox<Scalar, AmbientDim>::squaredExteriorDistance(
+    const MatrixBase<Derived>& a_p) const {
+  typename internal::nested_eval<Derived, 2 * AmbientDim>::type p(a_p.derived());
   Scalar dist2(0);
   Scalar aux;
-  for (Index k=0; k<dim(); ++k)
-  {
-    if( m_min[k] > p[k] )
-    {
+  for (Index k = 0; k < dim(); ++k) {
+    if (m_min[k] > p[k]) {
       aux = m_min[k] - p[k];
-      dist2 += aux*aux;
-    }
-    else if( p[k] > m_max[k] )
-    {
+      dist2 += aux * aux;
+    } else if (p[k] > m_max[k]) {
       aux = p[k] - m_max[k];
-      dist2 += aux*aux;
+      dist2 += aux * aux;
     }
   }
   return dist2;
 }
 
-template<typename Scalar,int AmbientDim>
-inline Scalar AlignedBox<Scalar,AmbientDim>::squaredExteriorDistance(const AlignedBox& b) const
-{
+template <typename Scalar, int AmbientDim>
+EIGEN_DEVICE_FUNC inline Scalar AlignedBox<Scalar, AmbientDim>::squaredExteriorDistance(const AlignedBox& b) const {
   Scalar dist2(0);
   Scalar aux;
-  for (Index k=0; k<dim(); ++k)
-  {
-    if( m_min[k] > b.m_max[k] )
-    {
+  for (Index k = 0; k < dim(); ++k) {
+    if (m_min[k] > b.m_max[k]) {
       aux = m_min[k] - b.m_max[k];
-      dist2 += aux*aux;
-    }
-    else if( b.m_min[k] > m_max[k] )
-    {
+      dist2 += aux * aux;
+    } else if (b.m_min[k] > m_max[k]) {
       aux = b.m_min[k] - m_max[k];
-      dist2 += aux*aux;
+      dist2 += aux * aux;
     }
   }
   return dist2;
 }
 
 /** \defgroup alignedboxtypedefs Global aligned box typedefs
-  *
-  * \ingroup Geometry_Module
-  *
-  * Eigen defines several typedef shortcuts for most common aligned box types.
-  *
-  * The general patterns are the following:
-  *
-  * \c AlignedBoxSizeType where \c Size can be \c 1, \c 2,\c 3,\c 4 for fixed size boxes or \c X for dynamic size,
-  * and where \c Type can be \c i for integer, \c f for float, \c d for double.
-  *
-  * For example, \c AlignedBox3d is a fixed-size 3x3 aligned box type of doubles, and \c AlignedBoxXf is a dynamic-size aligned box of floats.
-  *
-  * \sa class AlignedBox
-  */
-
-#define EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Size, SizeSuffix)    \
-/** \ingroup alignedboxtypedefs */                                 \
-typedef AlignedBox<Type, Size>   AlignedBox##SizeSuffix##TypeSuffix;
+ *
+ * \ingroup Geometry_Module
+ *
+ * Eigen defines several typedef shortcuts for most common aligned box types.
+ *
+ * The general patterns are the following:
+ *
+ * \c AlignedBoxSizeType where \c Size can be \c 1, \c 2,\c 3,\c 4 for fixed size boxes or \c X for dynamic size,
+ * and where \c Type can be \c i for integer, \c f for float, \c d for double.
+ *
+ * For example, \c AlignedBox3d is a fixed-size 3x3 aligned box type of doubles, and \c AlignedBoxXf is a dynamic-size
+ * aligned box of floats.
+ *
+ * \sa class AlignedBox
+ */
+
+#define EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Size, SizeSuffix) \
+  /** \ingroup alignedboxtypedefs */                            \
+  typedef AlignedBox<Type, Size> AlignedBox##SizeSuffix##TypeSuffix;
 
 #define EIGEN_MAKE_TYPEDEFS_ALL_SIZES(Type, TypeSuffix) \
-EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 1, 1) \
-EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 2, 2) \
-EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 3, 3) \
-EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 4, 4) \
-EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Dynamic, X)
+  EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 1, 1)           \
+  EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 2, 2)           \
+  EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 3, 3)           \
+  EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, 4, 4)           \
+  EIGEN_MAKE_TYPEDEFS(Type, TypeSuffix, Dynamic, X)
 
-EIGEN_MAKE_TYPEDEFS_ALL_SIZES(int,                  i)
-EIGEN_MAKE_TYPEDEFS_ALL_SIZES(float,                f)
-EIGEN_MAKE_TYPEDEFS_ALL_SIZES(double,               d)
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(int, i)
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(float, f)
+EIGEN_MAKE_TYPEDEFS_ALL_SIZES(double, d)
 
 #undef EIGEN_MAKE_TYPEDEFS_ALL_SIZES
 #undef EIGEN_MAKE_TYPEDEFS
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_ALIGNEDBOX_H
+#endif  // EIGEN_ALIGNEDBOX_H
diff --git a/inst/include/Eigen/src/Geometry/AngleAxis.h b/inst/include/Eigen/src/Geometry/AngleAxis.h
index bbf6a7ed..a00ed178 100644
--- a/inst/include/Eigen/src/Geometry/AngleAxis.h
+++ b/inst/include/Eigen/src/Geometry/AngleAxis.h
@@ -10,224 +10,236 @@
 #ifndef EIGEN_ANGLEAXIS_H
 #define EIGEN_ANGLEAXIS_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \geometry_module \ingroup Geometry_Module
-  *
-  * \class AngleAxis
-  *
-  * \brief Represents a 3D rotation as a rotation angle around an arbitrary 3D axis
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients.
-  *
-  * \warning When setting up an AngleAxis object, the axis vector \b must \b be \b normalized.
-  *
-  * The following two typedefs are provided for convenience:
-  * \li \c AngleAxisf for \c float
-  * \li \c AngleAxisd for \c double
-  *
-  * Combined with MatrixBase::Unit{X,Y,Z}, AngleAxis can be used to easily
-  * mimic Euler-angles. Here is an example:
-  * \include AngleAxis_mimic_euler.cpp
-  * Output: \verbinclude AngleAxis_mimic_euler.out
-  *
-  * \note This class is not aimed to be used to store a rotation transformation,
-  * but rather to make easier the creation of other rotation (Quaternion, rotation Matrix)
-  * and transformation objects.
-  *
-  * \sa class Quaternion, class Transform, MatrixBase::UnitX()
-  */
+ *
+ * \class AngleAxis
+ *
+ * \brief Represents a 3D rotation as a rotation angle around an arbitrary 3D axis
+ *
+ * \param Scalar_ the scalar type, i.e., the type of the coefficients.
+ *
+ * \warning When setting up an AngleAxis object, the axis vector \b must \b be \b normalized.
+ *
+ * The following two typedefs are provided for convenience:
+ * \li \c AngleAxisf for \c float
+ * \li \c AngleAxisd for \c double
+ *
+ * Combined with MatrixBase::Unit{X,Y,Z}, AngleAxis can be used to easily
+ * mimic Euler-angles. Here is an example:
+ * \include AngleAxis_mimic_euler.cpp
+ * Output: \verbinclude AngleAxis_mimic_euler.out
+ *
+ * \note This class is not aimed to be used to store a rotation transformation,
+ * but rather to make easier the creation of other rotation (Quaternion, rotation Matrix)
+ * and transformation objects.
+ *
+ * \sa class Quaternion, class Transform, MatrixBase::UnitX()
+ */
 
 namespace internal {
-template<typename _Scalar> struct traits<AngleAxis<_Scalar> >
-{
-  typedef _Scalar Scalar;
+template <typename Scalar_>
+struct traits<AngleAxis<Scalar_> > {
+  typedef Scalar_ Scalar;
 };
-}
+}  // namespace internal
 
-template<typename _Scalar>
-class AngleAxis : public RotationBase<AngleAxis<_Scalar>,3>
-{
-  typedef RotationBase<AngleAxis<_Scalar>,3> Base;
-
-public:
+template <typename Scalar_>
+class AngleAxis : public RotationBase<AngleAxis<Scalar_>, 3> {
+  typedef RotationBase<AngleAxis<Scalar_>, 3> Base;
 
+ public:
   using Base::operator*;
 
   enum { Dim = 3 };
   /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-  typedef Matrix<Scalar,3,3> Matrix3;
-  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Scalar_ Scalar;
+  typedef Matrix<Scalar, 3, 3> Matrix3;
+  typedef Matrix<Scalar, 3, 1> Vector3;
   typedef Quaternion<Scalar> QuaternionType;
 
-protected:
-
+ protected:
   Vector3 m_axis;
   Scalar m_angle;
 
-public:
-
+ public:
   /** Default constructor without initialization. */
-  AngleAxis() {}
+  EIGEN_DEVICE_FUNC AngleAxis() {}
   /** Constructs and initialize the angle-axis rotation from an \a angle in radian
-    * and an \a axis which \b must \b be \b normalized.
-    *
-    * \warning If the \a axis vector is not normalized, then the angle-axis object
-    *          represents an invalid rotation. */
-  template<typename Derived>
-  inline AngleAxis(const Scalar& angle, const MatrixBase<Derived>& axis) : m_axis(axis), m_angle(angle) {}
-  /** Constructs and initialize the angle-axis rotation from a quaternion \a q. */
-  template<typename QuatDerived> inline explicit AngleAxis(const QuaternionBase<QuatDerived>& q) { *this = q; }
+   * and an \a axis which \b must \b be \b normalized.
+   *
+   * \warning If the \a axis vector is not normalized, then the angle-axis object
+   *          represents an invalid rotation. */
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline AngleAxis(const Scalar& angle, const MatrixBase<Derived>& axis)
+      : m_axis(axis), m_angle(angle) {}
+  /** Constructs and initialize the angle-axis rotation from a quaternion \a q.
+   * This function implicitly normalizes the quaternion \a q.
+   */
+  template <typename QuatDerived>
+  EIGEN_DEVICE_FUNC inline explicit AngleAxis(const QuaternionBase<QuatDerived>& q) {
+    *this = q;
+  }
   /** Constructs and initialize the angle-axis rotation from a 3x3 rotation matrix. */
-  template<typename Derived>
-  inline explicit AngleAxis(const MatrixBase<Derived>& m) { *this = m; }
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline explicit AngleAxis(const MatrixBase<Derived>& m) {
+    *this = m;
+  }
 
-  Scalar angle() const { return m_angle; }
-  Scalar& angle() { return m_angle; }
+  /** \returns the value of the rotation angle in radian */
+  EIGEN_DEVICE_FUNC Scalar angle() const { return m_angle; }
+  /** \returns a read-write reference to the stored angle in radian */
+  EIGEN_DEVICE_FUNC Scalar& angle() { return m_angle; }
 
-  const Vector3& axis() const { return m_axis; }
-  Vector3& axis() { return m_axis; }
+  /** \returns the rotation axis */
+  EIGEN_DEVICE_FUNC const Vector3& axis() const { return m_axis; }
+  /** \returns a read-write reference to the stored rotation axis.
+   *
+   * \warning The rotation axis must remain a \b unit vector.
+   */
+  EIGEN_DEVICE_FUNC Vector3& axis() { return m_axis; }
 
   /** Concatenates two rotations */
-  inline QuaternionType operator* (const AngleAxis& other) const
-  { return QuaternionType(*this) * QuaternionType(other); }
+  EIGEN_DEVICE_FUNC inline QuaternionType operator*(const AngleAxis& other) const {
+    return QuaternionType(*this) * QuaternionType(other);
+  }
 
   /** Concatenates two rotations */
-  inline QuaternionType operator* (const QuaternionType& other) const
-  { return QuaternionType(*this) * other; }
+  EIGEN_DEVICE_FUNC inline QuaternionType operator*(const QuaternionType& other) const {
+    return QuaternionType(*this) * other;
+  }
 
   /** Concatenates two rotations */
-  friend inline QuaternionType operator* (const QuaternionType& a, const AngleAxis& b)
-  { return a * QuaternionType(b); }
+  friend EIGEN_DEVICE_FUNC inline QuaternionType operator*(const QuaternionType& a, const AngleAxis& b) {
+    return a * QuaternionType(b);
+  }
 
   /** \returns the inverse rotation, i.e., an angle-axis with opposite rotation angle */
-  AngleAxis inverse() const
-  { return AngleAxis(-m_angle, m_axis); }
+  EIGEN_DEVICE_FUNC AngleAxis inverse() const { return AngleAxis(-m_angle, m_axis); }
 
-  template<class QuatDerived>
-  AngleAxis& operator=(const QuaternionBase<QuatDerived>& q);
-  template<typename Derived>
-  AngleAxis& operator=(const MatrixBase<Derived>& m);
+  template <class QuatDerived>
+  EIGEN_DEVICE_FUNC AngleAxis& operator=(const QuaternionBase<QuatDerived>& q);
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC AngleAxis& operator=(const MatrixBase<Derived>& m);
 
-  template<typename Derived>
-  AngleAxis& fromRotationMatrix(const MatrixBase<Derived>& m);
-  Matrix3 toRotationMatrix(void) const;
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC AngleAxis& fromRotationMatrix(const MatrixBase<Derived>& m);
+  EIGEN_DEVICE_FUNC Matrix3 toRotationMatrix(void) const;
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<AngleAxis,AngleAxis<NewScalarType> >::type cast() const
-  { return typename internal::cast_return_type<AngleAxis,AngleAxis<NewScalarType> >::type(*this); }
+   *
+   * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+   * then this function smartly returns a const reference to \c *this.
+   */
+  template <typename NewScalarType>
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<AngleAxis, AngleAxis<NewScalarType> >::type cast()
+      const {
+    return typename internal::cast_return_type<AngleAxis, AngleAxis<NewScalarType> >::type(*this);
+  }
 
   /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit AngleAxis(const AngleAxis<OtherScalarType>& other)
-  {
+  template <typename OtherScalarType>
+  EIGEN_DEVICE_FUNC inline explicit AngleAxis(const AngleAxis<OtherScalarType>& other) {
     m_axis = other.axis().template cast<Scalar>();
     m_angle = Scalar(other.angle());
   }
 
-  static inline const AngleAxis Identity() { return AngleAxis(Scalar(0), Vector3::UnitX()); }
+  EIGEN_DEVICE_FUNC static inline const AngleAxis Identity() { return AngleAxis(Scalar(0), Vector3::UnitX()); }
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const AngleAxis& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
-  { return m_axis.isApprox(other.m_axis, prec) && internal::isApprox(m_angle,other.m_angle, prec); }
+   * determined by \a prec.
+   *
+   * \sa MatrixBase::isApprox() */
+  EIGEN_DEVICE_FUNC bool isApprox(const AngleAxis& other, const typename NumTraits<Scalar>::Real& prec =
+                                                              NumTraits<Scalar>::dummy_precision()) const {
+    return m_axis.isApprox(other.m_axis, prec) && internal::isApprox(m_angle, other.m_angle, prec);
+  }
 };
 
 /** \ingroup Geometry_Module
-  * single precision angle-axis type */
+ * single precision angle-axis type */
 typedef AngleAxis<float> AngleAxisf;
 /** \ingroup Geometry_Module
-  * double precision angle-axis type */
+ * double precision angle-axis type */
 typedef AngleAxis<double> AngleAxisd;
 
 /** Set \c *this from a \b unit quaternion.
-  * The axis is normalized.
-  * 
-  * \warning As any other method dealing with quaternion, if the input quaternion
-  *          is not normalized then the result is undefined.
-  */
-template<typename Scalar>
-template<typename QuatDerived>
-AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q)
-{
-  using std::acos;
-  using std::min;
-  using std::max;
-  using std::sqrt;
-  Scalar n2 = q.vec().squaredNorm();
-  if (n2 < NumTraits<Scalar>::dummy_precision()*NumTraits<Scalar>::dummy_precision())
-  {
+ *
+ * The resulting axis is normalized, and the computed angle is in the [0,pi] range.
+ *
+ * This function implicitly normalizes the quaternion \a q.
+ */
+template <typename Scalar>
+template <typename QuatDerived>
+EIGEN_DEVICE_FUNC AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const QuaternionBase<QuatDerived>& q) {
+  EIGEN_USING_STD(atan2)
+  EIGEN_USING_STD(abs)
+  Scalar n = q.vec().norm();
+  if (n < NumTraits<Scalar>::epsilon()) n = q.vec().stableNorm();
+
+  if (n != Scalar(0)) {
+    m_angle = Scalar(2) * atan2(n, abs(q.w()));
+    if (q.w() < Scalar(0)) n = -n;
+    m_axis = q.vec() / n;
+  } else {
     m_angle = Scalar(0);
     m_axis << Scalar(1), Scalar(0), Scalar(0);
   }
-  else
-  {
-    m_angle = Scalar(2)*acos((min)((max)(Scalar(-1),q.w()),Scalar(1)));
-    m_axis = q.vec() / sqrt(n2);
-  }
   return *this;
 }
 
 /** Set \c *this from a 3x3 rotation matrix \a mat.
-  */
-template<typename Scalar>
-template<typename Derived>
-AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const MatrixBase<Derived>& mat)
-{
+ */
+template <typename Scalar>
+template <typename Derived>
+EIGEN_DEVICE_FUNC AngleAxis<Scalar>& AngleAxis<Scalar>::operator=(const MatrixBase<Derived>& mat) {
   // Since a direct conversion would not be really faster,
   // let's use the robust Quaternion implementation:
   return *this = QuaternionType(mat);
 }
 
 /**
-* \brief Sets \c *this from a 3x3 rotation matrix.
-**/
-template<typename Scalar>
-template<typename Derived>
-AngleAxis<Scalar>& AngleAxis<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat)
-{
+ * \brief Sets \c *this from a 3x3 rotation matrix.
+ **/
+template <typename Scalar>
+template <typename Derived>
+EIGEN_DEVICE_FUNC AngleAxis<Scalar>& AngleAxis<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat) {
   return *this = QuaternionType(mat);
 }
 
 /** Constructs and \returns an equivalent 3x3 rotation matrix.
-  */
-template<typename Scalar>
-typename AngleAxis<Scalar>::Matrix3
-AngleAxis<Scalar>::toRotationMatrix(void) const
-{
-  using std::sin;
-  using std::cos;
+ */
+template <typename Scalar>
+typename AngleAxis<Scalar>::Matrix3 EIGEN_DEVICE_FUNC AngleAxis<Scalar>::toRotationMatrix(void) const {
+  EIGEN_USING_STD(sin)
+  EIGEN_USING_STD(cos)
   Matrix3 res;
-  Vector3 sin_axis  = sin(m_angle) * m_axis;
+  Vector3 sin_axis = sin(m_angle) * m_axis;
   Scalar c = cos(m_angle);
-  Vector3 cos1_axis = (Scalar(1)-c) * m_axis;
+  Vector3 cos1_axis = (Scalar(1) - c) * m_axis;
 
   Scalar tmp;
   tmp = cos1_axis.x() * m_axis.y();
-  res.coeffRef(0,1) = tmp - sin_axis.z();
-  res.coeffRef(1,0) = tmp + sin_axis.z();
+  res.coeffRef(0, 1) = tmp - sin_axis.z();
+  res.coeffRef(1, 0) = tmp + sin_axis.z();
 
   tmp = cos1_axis.x() * m_axis.z();
-  res.coeffRef(0,2) = tmp + sin_axis.y();
-  res.coeffRef(2,0) = tmp - sin_axis.y();
+  res.coeffRef(0, 2) = tmp + sin_axis.y();
+  res.coeffRef(2, 0) = tmp - sin_axis.y();
 
   tmp = cos1_axis.y() * m_axis.z();
-  res.coeffRef(1,2) = tmp - sin_axis.x();
-  res.coeffRef(2,1) = tmp + sin_axis.x();
+  res.coeffRef(1, 2) = tmp - sin_axis.x();
+  res.coeffRef(2, 1) = tmp + sin_axis.x();
 
   res.diagonal() = (cos1_axis.cwiseProduct(m_axis)).array() + c;
 
   return res;
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_ANGLEAXIS_H
+#endif  // EIGEN_ANGLEAXIS_H
diff --git a/inst/include/Eigen/src/Geometry/EulerAngles.h b/inst/include/Eigen/src/Geometry/EulerAngles.h
index 82802fb4..366a32ce 100644
--- a/inst/include/Eigen/src/Geometry/EulerAngles.h
+++ b/inst/include/Eigen/src/Geometry/EulerAngles.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2023 Juraj Oršulić, University of Zagreb <juraj.orsulic@fer.hr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,95 +11,193 @@
 #ifndef EIGEN_EULERANGLES_H
 #define EIGEN_EULERANGLES_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \geometry_module \ingroup Geometry_Module
-  *
-  *
-  * \returns the Euler-angles of the rotation matrix \c *this using the convention defined by the triplet (\a a0,\a a1,\a a2)
-  *
-  * Each of the three parameters \a a0,\a a1,\a a2 represents the respective rotation axis as an integer in {0,1,2}.
-  * For instance, in:
-  * \code Vector3f ea = mat.eulerAngles(2, 0, 2); \endcode
-  * "2" represents the z axis and "0" the x axis, etc. The returned angles are such that
-  * we have the following equality:
-  * \code
-  * mat == AngleAxisf(ea[0], Vector3f::UnitZ())
-  *      * AngleAxisf(ea[1], Vector3f::UnitX())
-  *      * AngleAxisf(ea[2], Vector3f::UnitZ()); \endcode
-  * This corresponds to the right-multiply conventions (with right hand side frames).
-  * 
-  * The returned angles are in the ranges [0:pi]x[-pi:pi]x[-pi:pi].
-  * 
-  * \sa class AngleAxis
-  */
-template<typename Derived>
-inline Matrix<typename MatrixBase<Derived>::Scalar,3,1>
-MatrixBase<Derived>::eulerAngles(Index a0, Index a1, Index a2) const
-{
-  using std::atan2;
-  using std::sin;
-  using std::cos;
+ *
+ *
+ * \returns the canonical Euler-angles of the rotation matrix \c *this using the convention defined by the triplet (\a
+ * a0,\a a1,\a a2)
+ *
+ * Each of the three parameters \a a0,\a a1,\a a2 represents the respective rotation axis as an integer in {0,1,2}.
+ * For instance, in:
+ * \code Vector3f ea = mat.eulerAngles(2, 0, 2); \endcode
+ * "2" represents the z axis and "0" the x axis, etc. The returned angles are such that
+ * we have the following equality:
+ * \code
+ * mat == AngleAxisf(ea[0], Vector3f::UnitZ())
+ *      * AngleAxisf(ea[1], Vector3f::UnitX())
+ *      * AngleAxisf(ea[2], Vector3f::UnitZ()); \endcode
+ * This corresponds to the right-multiply conventions (with right hand side frames).
+ *
+ * For Tait-Bryan angle configurations (a0 != a2), the returned angles are in the ranges [-pi:pi]x[-pi/2:pi/2]x[-pi:pi].
+ * For proper Euler angle configurations (a0 == a2), the returned angles are in the ranges [-pi:pi]x[0:pi]x[-pi:pi].
+ *
+ * The approach used is also described here:
+ * https://d3cw3dd2w32x2b.cloudfront.net/wp-content/uploads/2012/07/euler-angles.pdf
+ *
+ * \sa class AngleAxis
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline Matrix<typename MatrixBase<Derived>::Scalar, 3, 1> MatrixBase<Derived>::canonicalEulerAngles(
+    Index a0, Index a1, Index a2) const {
   /* Implemented from Graphics Gems IV */
-  EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived,3,3)
+  EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
 
-  Matrix<Scalar,3,1> res;
-  typedef Matrix<typename Derived::Scalar,2,1> Vector2;
+  Matrix<Scalar, 3, 1> res;
 
-  const Index odd = ((a0+1)%3 == a1) ? 0 : 1;
+  const Index odd = ((a0 + 1) % 3 == a1) ? 0 : 1;
   const Index i = a0;
-  const Index j = (a0 + 1 + odd)%3;
-  const Index k = (a0 + 2 - odd)%3;
-  
-  if (a0==a2)
-  {
-    res[0] = atan2(coeff(j,i), coeff(k,i));
-    if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0)))
-    {
-      res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(M_PI) : res[0] + Scalar(M_PI);
-      Scalar s2 = Vector2(coeff(j,i), coeff(k,i)).norm();
-      res[1] = -atan2(s2, coeff(i,i));
+  const Index j = (a0 + 1 + odd) % 3;
+  const Index k = (a0 + 2 - odd) % 3;
+
+  if (a0 == a2) {
+    // Proper Euler angles (same first and last axis).
+    // The i, j, k indices enable addressing the input matrix as the XYX archetype matrix (see Graphics Gems IV),
+    // where e.g. coeff(k, i) means third column, first row in the XYX archetype matrix:
+    //  c2      s2s1              s2c1
+    //  s2s3   -c2s1s3 + c1c3    -c2c1s3 - s1c3
+    // -s2c3    c2s1c3 + c1s3     c2c1c3 - s1s3
+
+    // Note: s2 is always positive.
+    Scalar s2 = numext::hypot(coeff(j, i), coeff(k, i));
+    if (odd) {
+      res[0] = numext::atan2(coeff(j, i), coeff(k, i));
+      // s2 is always positive, so res[1] will be within the canonical [0, pi] range
+      res[1] = numext::atan2(s2, coeff(i, i));
+    } else {
+      // In the !odd case, signs of all three angles are flipped at the very end. To keep the solution within the
+      // canonical range, we flip the solution and make res[1] always negative here (since s2 is always positive,
+      // -atan2(s2, c2) will always be negative). The final flip at the end due to !odd will thus make res[1] positive
+      // and canonical. NB: in the general case, there are two correct solutions, but only one is canonical. For proper
+      // Euler angles, flipping from one solution to the other involves flipping the sign of the second angle res[1] and
+      // adding/subtracting pi to the first and third angles. The addition/subtraction of pi to the first angle res[0]
+      // is handled here by flipping the signs of arguments to atan2, while the calculation of the third angle does not
+      // need special adjustment since it uses the adjusted res[0] as the input and produces a correct result.
+      res[0] = numext::atan2(-coeff(j, i), -coeff(k, i));
+      res[1] = -numext::atan2(s2, coeff(i, i));
     }
-    else
-    {
-      Scalar s2 = Vector2(coeff(j,i), coeff(k,i)).norm();
-      res[1] = atan2(s2, coeff(i,i));
+
+    // With a=(0,1,0), we have i=0; j=1; k=2, and after computing the first two angles,
+    // we can compute their respective rotation, and apply its inverse to M. Since the result must
+    // be a rotation around x, we have:
+    //
+    //  c2  s1.s2 c1.s2                   1  0   0
+    //  0   c1    -s1       *    M    =   0  c3  s3
+    //  -s2 s1.c2 c1.c2                   0 -s3  c3
+    //
+    //  Thus:  m11.c1 - m21.s1 = c3  &   m12.c1 - m22.s1 = s3
+
+    Scalar s1 = numext::sin(res[0]);
+    Scalar c1 = numext::cos(res[0]);
+    res[2] = numext::atan2(c1 * coeff(j, k) - s1 * coeff(k, k), c1 * coeff(j, j) - s1 * coeff(k, j));
+  } else {
+    // Tait-Bryan angles (all three axes are different; typically used for yaw-pitch-roll calculations).
+    // The i, j, k indices enable addressing the input matrix as the XYZ archetype matrix (see Graphics Gems IV),
+    // where e.g. coeff(k, i) means third column, first row in the XYZ archetype matrix:
+    //  c2c3    s2s1c3 - c1s3     s2c1c3 + s1s3
+    //  c2s3    s2s1s3 + c1c3     s2c1s3 - s1c3
+    // -s2      c2s1              c2c1
+
+    res[0] = numext::atan2(coeff(j, k), coeff(k, k));
+
+    Scalar c2 = numext::hypot(coeff(i, i), coeff(i, j));
+    // c2 is always positive, so the following atan2 will always return a result in the correct canonical middle angle
+    // range [-pi/2, pi/2]
+    res[1] = numext::atan2(-coeff(i, k), c2);
+
+    Scalar s1 = numext::sin(res[0]);
+    Scalar c1 = numext::cos(res[0]);
+    res[2] = numext::atan2(s1 * coeff(k, i) - c1 * coeff(j, i), c1 * coeff(j, j) - s1 * coeff(k, j));
+  }
+  if (!odd) {
+    res = -res;
+  }
+
+  return res;
+}
+
+/** \geometry_module \ingroup Geometry_Module
+ *
+ *
+ * \returns the Euler-angles of the rotation matrix \c *this using the convention defined by the triplet (\a a0,\a a1,\a
+ * a2)
+ *
+ * NB: The returned angles are in non-canonical ranges [0:pi]x[-pi:pi]x[-pi:pi]. For canonical Tait-Bryan/proper Euler
+ * ranges, use canonicalEulerAngles.
+ *
+ * \sa MatrixBase::canonicalEulerAngles
+ * \sa class AngleAxis
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline Matrix<typename MatrixBase<Derived>::Scalar, 3, 1> MatrixBase<Derived>::eulerAngles(
+    Index a0, Index a1, Index a2) const {
+  /* Implemented from Graphics Gems IV */
+  EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
+
+  Matrix<Scalar, 3, 1> res;
+
+  const Index odd = ((a0 + 1) % 3 == a1) ? 0 : 1;
+  const Index i = a0;
+  const Index j = (a0 + 1 + odd) % 3;
+  const Index k = (a0 + 2 - odd) % 3;
+
+  if (a0 == a2) {
+    res[0] = numext::atan2(coeff(j, i), coeff(k, i));
+    if ((odd && res[0] < Scalar(0)) || ((!odd) && res[0] > Scalar(0))) {
+      if (res[0] > Scalar(0)) {
+        res[0] -= Scalar(EIGEN_PI);
+      } else {
+        res[0] += Scalar(EIGEN_PI);
+      }
+
+      Scalar s2 = numext::hypot(coeff(j, i), coeff(k, i));
+      res[1] = -numext::atan2(s2, coeff(i, i));
+    } else {
+      Scalar s2 = numext::hypot(coeff(j, i), coeff(k, i));
+      res[1] = numext::atan2(s2, coeff(i, i));
     }
-    
+
     // With a=(0,1,0), we have i=0; j=1; k=2, and after computing the first two angles,
     // we can compute their respective rotation, and apply its inverse to M. Since the result must
     // be a rotation around x, we have:
     //
-    //  c2  s1.s2 c1.s2                   1  0   0 
+    //  c2  s1.s2 c1.s2                   1  0   0
     //  0   c1    -s1       *    M    =   0  c3  s3
     //  -s2 s1.c2 c1.c2                   0 -s3  c3
     //
     //  Thus:  m11.c1 - m21.s1 = c3  &   m12.c1 - m22.s1 = s3
-    
-    Scalar s1 = sin(res[0]);
-    Scalar c1 = cos(res[0]);
-    res[2] = atan2(c1*coeff(j,k)-s1*coeff(k,k), c1*coeff(j,j) - s1 * coeff(k,j));
-  } 
-  else
-  {
-    res[0] = atan2(coeff(j,k), coeff(k,k));
-    Scalar c2 = Vector2(coeff(i,i), coeff(i,j)).norm();
-    if((odd && res[0]<Scalar(0)) || ((!odd) && res[0]>Scalar(0))) {
-      res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(M_PI) : res[0] + Scalar(M_PI);
-      res[1] = atan2(-coeff(i,k), -c2);
+
+    Scalar s1 = numext::sin(res[0]);
+    Scalar c1 = numext::cos(res[0]);
+    res[2] = numext::atan2(c1 * coeff(j, k) - s1 * coeff(k, k), c1 * coeff(j, j) - s1 * coeff(k, j));
+  } else {
+    res[0] = numext::atan2(coeff(j, k), coeff(k, k));
+    Scalar c2 = numext::hypot(coeff(i, i), coeff(i, j));
+    if ((odd && res[0] < Scalar(0)) || ((!odd) && res[0] > Scalar(0))) {
+      if (res[0] > Scalar(0)) {
+        res[0] -= Scalar(EIGEN_PI);
+      } else {
+        res[0] += Scalar(EIGEN_PI);
+      }
+      res[1] = numext::atan2(-coeff(i, k), -c2);
+    } else {
+      res[1] = numext::atan2(-coeff(i, k), c2);
     }
-    else
-      res[1] = atan2(-coeff(i,k), c2);
-    Scalar s1 = sin(res[0]);
-    Scalar c1 = cos(res[0]);
-    res[2] = atan2(s1*coeff(k,i)-c1*coeff(j,i), c1*coeff(j,j) - s1 * coeff(k,j));
+    Scalar s1 = numext::sin(res[0]);
+    Scalar c1 = numext::cos(res[0]);
+    res[2] = numext::atan2(s1 * coeff(k, i) - c1 * coeff(j, i), c1 * coeff(j, j) - s1 * coeff(k, j));
   }
-  if (!odd)
+  if (!odd) {
     res = -res;
-  
+  }
+
   return res;
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_EULERANGLES_H
+#endif  // EIGEN_EULERANGLES_H
diff --git a/inst/include/Eigen/src/Geometry/Homogeneous.h b/inst/include/Eigen/src/Geometry/Homogeneous.h
index 372e422b..4159dc6d 100644
--- a/inst/include/Eigen/src/Geometry/Homogeneous.h
+++ b/inst/include/Eigen/src/Geometry/Homogeneous.h
@@ -10,298 +10,444 @@
 #ifndef EIGEN_HOMOGENEOUS_H
 #define EIGEN_HOMOGENEOUS_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Homogeneous
-  *
-  * \brief Expression of one (or a set of) homogeneous vector(s)
-  *
-  * \param MatrixType the type of the object in which we are making homogeneous
-  *
-  * This class represents an expression of one (or a set of) homogeneous vector(s).
-  * It is the return type of MatrixBase::homogeneous() and most of the time
-  * this is the only way it is used.
-  *
-  * \sa MatrixBase::homogeneous()
-  */
+ *
+ * \class Homogeneous
+ *
+ * \brief Expression of one (or a set of) homogeneous vector(s)
+ *
+ * \param MatrixType the type of the object in which we are making homogeneous
+ *
+ * This class represents an expression of one (or a set of) homogeneous vector(s).
+ * It is the return type of MatrixBase::homogeneous() and most of the time
+ * this is the only way it is used.
+ *
+ * \sa MatrixBase::homogeneous()
+ */
 
 namespace internal {
 
-template<typename MatrixType,int Direction>
-struct traits<Homogeneous<MatrixType,Direction> >
- : traits<MatrixType>
-{
+template <typename MatrixType, int Direction>
+struct traits<Homogeneous<MatrixType, Direction> > : traits<MatrixType> {
   typedef typename traits<MatrixType>::StorageKind StorageKind;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
-  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
+  typedef std::remove_reference_t<MatrixTypeNested> MatrixTypeNested_;
   enum {
-    RowsPlusOne = (MatrixType::RowsAtCompileTime != Dynamic) ?
-                  int(MatrixType::RowsAtCompileTime) + 1 : Dynamic,
-    ColsPlusOne = (MatrixType::ColsAtCompileTime != Dynamic) ?
-                  int(MatrixType::ColsAtCompileTime) + 1 : Dynamic,
-    RowsAtCompileTime = Direction==Vertical  ?  RowsPlusOne : MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = Direction==Horizontal ? ColsPlusOne : MatrixType::ColsAtCompileTime,
+    RowsPlusOne = (MatrixType::RowsAtCompileTime != Dynamic) ? int(MatrixType::RowsAtCompileTime) + 1 : Dynamic,
+    ColsPlusOne = (MatrixType::ColsAtCompileTime != Dynamic) ? int(MatrixType::ColsAtCompileTime) + 1 : Dynamic,
+    RowsAtCompileTime = Direction == Vertical ? RowsPlusOne : MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = Direction == Horizontal ? ColsPlusOne : MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = RowsAtCompileTime,
     MaxColsAtCompileTime = ColsAtCompileTime,
-    TmpFlags = _MatrixTypeNested::Flags & HereditaryBits,
-    Flags = ColsAtCompileTime==1 ? (TmpFlags & ~RowMajorBit)
-          : RowsAtCompileTime==1 ? (TmpFlags | RowMajorBit)
-          : TmpFlags,
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost
+    TmpFlags = MatrixTypeNested_::Flags & HereditaryBits,
+    Flags = ColsAtCompileTime == 1   ? (TmpFlags & ~RowMajorBit)
+            : RowsAtCompileTime == 1 ? (TmpFlags | RowMajorBit)
+                                     : TmpFlags
   };
 };
 
-template<typename MatrixType,typename Lhs> struct homogeneous_left_product_impl;
-template<typename MatrixType,typename Rhs> struct homogeneous_right_product_impl;
-
-} // end namespace internal
-
-template<typename MatrixType,int _Direction> class Homogeneous
-  : internal::no_assignment_operator, public MatrixBase<Homogeneous<MatrixType,_Direction> >
-{
-  public:
-
-    enum { Direction = _Direction };
-
-    typedef MatrixBase<Homogeneous> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(Homogeneous)
-
-    inline Homogeneous(const MatrixType& matrix)
-      : m_matrix(matrix)
-    {}
-
-    inline Index rows() const { return m_matrix.rows() + (int(Direction)==Vertical   ? 1 : 0); }
-    inline Index cols() const { return m_matrix.cols() + (int(Direction)==Horizontal ? 1 : 0); }
-
-    inline Scalar coeff(Index row, Index col) const
-    {
-      if(  (int(Direction)==Vertical   && row==m_matrix.rows())
-        || (int(Direction)==Horizontal && col==m_matrix.cols()))
-        return Scalar(1);
-      return m_matrix.coeff(row, col);
-    }
-
-    template<typename Rhs>
-    inline const internal::homogeneous_right_product_impl<Homogeneous,Rhs>
-    operator* (const MatrixBase<Rhs>& rhs) const
-    {
-      eigen_assert(int(Direction)==Horizontal);
-      return internal::homogeneous_right_product_impl<Homogeneous,Rhs>(m_matrix,rhs.derived());
-    }
-
-    template<typename Lhs> friend
-    inline const internal::homogeneous_left_product_impl<Homogeneous,Lhs>
-    operator* (const MatrixBase<Lhs>& lhs, const Homogeneous& rhs)
-    {
-      eigen_assert(int(Direction)==Vertical);
-      return internal::homogeneous_left_product_impl<Homogeneous,Lhs>(lhs.derived(),rhs.m_matrix);
-    }
-
-    template<typename Scalar, int Dim, int Mode, int Options> friend
-    inline const internal::homogeneous_left_product_impl<Homogeneous,Transform<Scalar,Dim,Mode,Options> >
-    operator* (const Transform<Scalar,Dim,Mode,Options>& lhs, const Homogeneous& rhs)
-    {
-      eigen_assert(int(Direction)==Vertical);
-      return internal::homogeneous_left_product_impl<Homogeneous,Transform<Scalar,Dim,Mode,Options> >(lhs,rhs.m_matrix);
-    }
-
-  protected:
-    typename MatrixType::Nested m_matrix;
+template <typename MatrixType, typename Lhs>
+struct homogeneous_left_product_impl;
+template <typename MatrixType, typename Rhs>
+struct homogeneous_right_product_impl;
+
+}  // end namespace internal
+
+template <typename MatrixType, int Direction_>
+class Homogeneous : public MatrixBase<Homogeneous<MatrixType, Direction_> >, internal::no_assignment_operator {
+ public:
+  typedef MatrixType NestedExpression;
+  enum { Direction = Direction_ };
+
+  typedef MatrixBase<Homogeneous> Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(Homogeneous)
+
+  EIGEN_DEVICE_FUNC explicit inline Homogeneous(const MatrixType& matrix) : m_matrix(matrix) {}
+
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept {
+    return m_matrix.rows() + (int(Direction) == Vertical ? 1 : 0);
+  }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept {
+    return m_matrix.cols() + (int(Direction) == Horizontal ? 1 : 0);
+  }
+
+  EIGEN_DEVICE_FUNC const NestedExpression& nestedExpression() const { return m_matrix; }
+
+  template <typename Rhs>
+  EIGEN_DEVICE_FUNC inline const Product<Homogeneous, Rhs> operator*(const MatrixBase<Rhs>& rhs) const {
+    return Product<Homogeneous, Rhs>(*this, rhs.derived());
+  }
+
+  template <typename Lhs>
+  friend EIGEN_DEVICE_FUNC inline const Product<Lhs, Homogeneous> operator*(const MatrixBase<Lhs>& lhs,
+                                                                            const Homogeneous& rhs) {
+    return Product<Lhs, Homogeneous>(lhs.derived(), rhs);
+  }
+
+  template <typename Scalar, int Dim, int Mode, int Options>
+  friend EIGEN_DEVICE_FUNC inline const Product<Transform<Scalar, Dim, Mode, Options>, Homogeneous> operator*(
+      const Transform<Scalar, Dim, Mode, Options>& lhs, const Homogeneous& rhs) {
+    eigen_assert(int(Direction) == Vertical);
+    return Product<Transform<Scalar, Dim, Mode, Options>, Homogeneous>(lhs, rhs);
+  }
+
+  template <typename Func>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::result_of<Func(Scalar, Scalar)>::type redux(
+      const Func& func) const {
+    return func(m_matrix.redux(func), Scalar(1));
+  }
+
+ protected:
+  typename MatrixType::Nested m_matrix;
 };
 
-/** \geometry_module
-  *
-  * \return an expression of the equivalent homogeneous vector
-  *
-  * \only_for_vectors
-  *
-  * Example: \include MatrixBase_homogeneous.cpp
-  * Output: \verbinclude MatrixBase_homogeneous.out
-  *
-  * \sa class Homogeneous
-  */
-template<typename Derived>
-inline typename MatrixBase<Derived>::HomogeneousReturnType
-MatrixBase<Derived>::homogeneous() const
-{
+/** \geometry_module \ingroup Geometry_Module
+ *
+ * \returns a vector expression that is one longer than the vector argument, with the value 1 symbolically appended as
+ * the last coefficient.
+ *
+ * This can be used to convert affine coordinates to homogeneous coordinates.
+ *
+ * \only_for_vectors
+ *
+ * Example: \include MatrixBase_homogeneous.cpp
+ * Output: \verbinclude MatrixBase_homogeneous.out
+ *
+ * \sa VectorwiseOp::homogeneous(), class Homogeneous
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::HomogeneousReturnType MatrixBase<Derived>::homogeneous() const {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
-  return derived();
+  return HomogeneousReturnType(derived());
+}
+
+/** \geometry_module \ingroup Geometry_Module
+ *
+ * \returns an expression where the value 1 is symbolically appended as the final coefficient to each column (or row) of
+ * the matrix.
+ *
+ * This can be used to convert affine coordinates to homogeneous coordinates.
+ *
+ * Example: \include VectorwiseOp_homogeneous.cpp
+ * Output: \verbinclude VectorwiseOp_homogeneous.out
+ *
+ * \sa MatrixBase::homogeneous(), class Homogeneous */
+template <typename ExpressionType, int Direction>
+EIGEN_DEVICE_FUNC inline Homogeneous<ExpressionType, Direction> VectorwiseOp<ExpressionType, Direction>::homogeneous()
+    const {
+  return HomogeneousReturnType(_expression());
 }
 
-/** \geometry_module
+/** \geometry_module \ingroup Geometry_Module
   *
-  * \returns a matrix expression of homogeneous column (or row) vectors
+  * \brief homogeneous normalization
   *
-  * Example: \include VectorwiseOp_homogeneous.cpp
-  * Output: \verbinclude VectorwiseOp_homogeneous.out
+  * \returns a vector expression of the N-1 first coefficients of \c *this divided by that last coefficient.
   *
-  * \sa MatrixBase::homogeneous() */
-template<typename ExpressionType, int Direction>
-inline Homogeneous<ExpressionType,Direction>
-VectorwiseOp<ExpressionType,Direction>::homogeneous() const
-{
-  return _expression();
-}
-
-/** \geometry_module
+  * This can be used to convert homogeneous coordinates to affine coordinates.
   *
-  * \returns an expression of the homogeneous normalized vector of \c *this
+  * It is essentially a shortcut for:
+  * \code
+    this->head(this->size()-1)/this->coeff(this->size()-1);
+    \endcode
   *
   * Example: \include MatrixBase_hnormalized.cpp
   * Output: \verbinclude MatrixBase_hnormalized.out
   *
   * \sa VectorwiseOp::hnormalized() */
-template<typename Derived>
-inline const typename MatrixBase<Derived>::HNormalizedReturnType
-MatrixBase<Derived>::hnormalized() const
-{
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline const typename MatrixBase<Derived>::HNormalizedReturnType MatrixBase<Derived>::hnormalized()
+    const {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
-  return ConstStartMinusOne(derived(),0,0,
-    ColsAtCompileTime==1?size()-1:1,
-    ColsAtCompileTime==1?1:size()-1) / coeff(size()-1);
+  return ConstStartMinusOne(derived(), 0, 0, ColsAtCompileTime == 1 ? size() - 1 : 1,
+                            ColsAtCompileTime == 1 ? 1 : size() - 1) /
+         coeff(size() - 1);
 }
 
-/** \geometry_module
-  *
-  * \returns an expression of the homogeneous normalized vector of \c *this
-  *
-  * Example: \include DirectionWise_hnormalized.cpp
-  * Output: \verbinclude DirectionWise_hnormalized.out
-  *
-  * \sa MatrixBase::hnormalized() */
-template<typename ExpressionType, int Direction>
-inline const typename VectorwiseOp<ExpressionType,Direction>::HNormalizedReturnType
-VectorwiseOp<ExpressionType,Direction>::hnormalized() const
-{
-  return HNormalized_Block(_expression(),0,0,
-      Direction==Vertical   ? _expression().rows()-1 : _expression().rows(),
-      Direction==Horizontal ? _expression().cols()-1 : _expression().cols()).cwiseQuotient(
-      Replicate<HNormalized_Factors,
-                Direction==Vertical   ? HNormalized_SizeMinusOne : 1,
-                Direction==Horizontal ? HNormalized_SizeMinusOne : 1>
-        (HNormalized_Factors(_expression(),
-          Direction==Vertical    ? _expression().rows()-1:0,
-          Direction==Horizontal  ? _expression().cols()-1:0,
-          Direction==Vertical    ? 1 : _expression().rows(),
-          Direction==Horizontal  ? 1 : _expression().cols()),
-         Direction==Vertical   ? _expression().rows()-1 : 1,
-         Direction==Horizontal ? _expression().cols()-1 : 1));
+/** \geometry_module \ingroup Geometry_Module
+ *
+ * \brief column or row-wise homogeneous normalization
+ *
+ * \returns an expression of the first N-1 coefficients of each column (or row) of \c *this divided by the last
+ * coefficient of each column (or row).
+ *
+ * This can be used to convert homogeneous coordinates to affine coordinates.
+ *
+ * It is conceptually equivalent to calling MatrixBase::hnormalized() to each column (or row) of \c *this.
+ *
+ * Example: \include DirectionWise_hnormalized.cpp
+ * Output: \verbinclude DirectionWise_hnormalized.out
+ *
+ * \sa MatrixBase::hnormalized() */
+template <typename ExpressionType, int Direction>
+EIGEN_DEVICE_FUNC inline const typename VectorwiseOp<ExpressionType, Direction>::HNormalizedReturnType
+VectorwiseOp<ExpressionType, Direction>::hnormalized() const {
+  return HNormalized_Block(_expression(), 0, 0, Direction == Vertical ? _expression().rows() - 1 : _expression().rows(),
+                           Direction == Horizontal ? _expression().cols() - 1 : _expression().cols())
+      .cwiseQuotient(Replicate < HNormalized_Factors, Direction == Vertical ? HNormalized_SizeMinusOne : 1,
+                     Direction == Horizontal
+                         ? HNormalized_SizeMinusOne
+                         : 1 > (HNormalized_Factors(_expression(), Direction == Vertical ? _expression().rows() - 1 : 0,
+                                                    Direction == Horizontal ? _expression().cols() - 1 : 0,
+                                                    Direction == Vertical ? 1 : _expression().rows(),
+                                                    Direction == Horizontal ? 1 : _expression().cols()),
+                                Direction == Vertical ? _expression().rows() - 1 : 1,
+                                Direction == Horizontal ? _expression().cols() - 1 : 1));
 }
 
 namespace internal {
 
-template<typename MatrixOrTransformType>
-struct take_matrix_for_product
-{
+template <typename MatrixOrTransformType>
+struct take_matrix_for_product {
   typedef MatrixOrTransformType type;
-  static const type& run(const type &x) { return x; }
+  EIGEN_DEVICE_FUNC static const type& run(const type& x) { return x; }
 };
 
-template<typename Scalar, int Dim, int Mode,int Options>
-struct take_matrix_for_product<Transform<Scalar, Dim, Mode, Options> >
-{
+template <typename Scalar, int Dim, int Mode, int Options>
+struct take_matrix_for_product<Transform<Scalar, Dim, Mode, Options> > {
   typedef Transform<Scalar, Dim, Mode, Options> TransformType;
-  typedef typename internal::add_const<typename TransformType::ConstAffinePart>::type type;
-  static type run (const TransformType& x) { return x.affine(); }
+  typedef std::add_const_t<typename TransformType::ConstAffinePart> type;
+  EIGEN_DEVICE_FUNC static type run(const TransformType& x) { return x.affine(); }
 };
 
-template<typename Scalar, int Dim, int Options>
-struct take_matrix_for_product<Transform<Scalar, Dim, Projective, Options> >
-{
+template <typename Scalar, int Dim, int Options>
+struct take_matrix_for_product<Transform<Scalar, Dim, Projective, Options> > {
   typedef Transform<Scalar, Dim, Projective, Options> TransformType;
   typedef typename TransformType::MatrixType type;
-  static const type& run (const TransformType& x) { return x.matrix(); }
+  EIGEN_DEVICE_FUNC static const type& run(const TransformType& x) { return x.matrix(); }
 };
 
-template<typename MatrixType,typename Lhs>
-struct traits<homogeneous_left_product_impl<Homogeneous<MatrixType,Vertical>,Lhs> >
-{
+template <typename MatrixType, typename Lhs>
+struct traits<homogeneous_left_product_impl<Homogeneous<MatrixType, Vertical>, Lhs> > {
   typedef typename take_matrix_for_product<Lhs>::type LhsMatrixType;
-  typedef typename remove_all<MatrixType>::type MatrixTypeCleaned;
-  typedef typename remove_all<LhsMatrixType>::type LhsMatrixTypeCleaned;
+  typedef remove_all_t<MatrixType> MatrixTypeCleaned;
+  typedef remove_all_t<LhsMatrixType> LhsMatrixTypeCleaned;
   typedef typename make_proper_matrix_type<
-                 typename traits<MatrixTypeCleaned>::Scalar,
-                 LhsMatrixTypeCleaned::RowsAtCompileTime,
-                 MatrixTypeCleaned::ColsAtCompileTime,
-                 MatrixTypeCleaned::PlainObject::Options,
-                 LhsMatrixTypeCleaned::MaxRowsAtCompileTime,
-                 MatrixTypeCleaned::MaxColsAtCompileTime>::type ReturnType;
+      typename traits<MatrixTypeCleaned>::Scalar, LhsMatrixTypeCleaned::RowsAtCompileTime,
+      MatrixTypeCleaned::ColsAtCompileTime, MatrixTypeCleaned::PlainObject::Options,
+      LhsMatrixTypeCleaned::MaxRowsAtCompileTime, MatrixTypeCleaned::MaxColsAtCompileTime>::type ReturnType;
 };
 
-template<typename MatrixType,typename Lhs>
-struct homogeneous_left_product_impl<Homogeneous<MatrixType,Vertical>,Lhs>
-  : public ReturnByValue<homogeneous_left_product_impl<Homogeneous<MatrixType,Vertical>,Lhs> >
-{
+template <typename MatrixType, typename Lhs>
+struct homogeneous_left_product_impl<Homogeneous<MatrixType, Vertical>, Lhs>
+    : public ReturnByValue<homogeneous_left_product_impl<Homogeneous<MatrixType, Vertical>, Lhs> > {
   typedef typename traits<homogeneous_left_product_impl>::LhsMatrixType LhsMatrixType;
-  typedef typename remove_all<LhsMatrixType>::type LhsMatrixTypeCleaned;
-  typedef typename remove_all<typename LhsMatrixTypeCleaned::Nested>::type LhsMatrixTypeNested;
-  typedef typename MatrixType::Index Index;
-  homogeneous_left_product_impl(const Lhs& lhs, const MatrixType& rhs)
-    : m_lhs(take_matrix_for_product<Lhs>::run(lhs)),
-      m_rhs(rhs)
-  {}
-
-  inline Index rows() const { return m_lhs.rows(); }
-  inline Index cols() const { return m_rhs.cols(); }
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
+  typedef remove_all_t<LhsMatrixType> LhsMatrixTypeCleaned;
+  typedef remove_all_t<typename LhsMatrixTypeCleaned::Nested> LhsMatrixTypeNested;
+  EIGEN_DEVICE_FUNC homogeneous_left_product_impl(const Lhs& lhs, const MatrixType& rhs)
+      : m_lhs(take_matrix_for_product<Lhs>::run(lhs)), m_rhs(rhs) {}
+
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_lhs.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); }
+
+  template <typename Dest>
+  EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const {
     // FIXME investigate how to allow lazy evaluation of this product when possible
-    dst = Block<const LhsMatrixTypeNested,
-              LhsMatrixTypeNested::RowsAtCompileTime,
-              LhsMatrixTypeNested::ColsAtCompileTime==Dynamic?Dynamic:LhsMatrixTypeNested::ColsAtCompileTime-1>
-            (m_lhs,0,0,m_lhs.rows(),m_lhs.cols()-1) * m_rhs;
-    dst += m_lhs.col(m_lhs.cols()-1).rowwise()
-            .template replicate<MatrixType::ColsAtCompileTime>(m_rhs.cols());
+    dst = Block < const LhsMatrixTypeNested, LhsMatrixTypeNested::RowsAtCompileTime,
+    LhsMatrixTypeNested::ColsAtCompileTime == Dynamic
+        ? Dynamic
+        : LhsMatrixTypeNested::ColsAtCompileTime - 1 > (m_lhs, 0, 0, m_lhs.rows(), m_lhs.cols() - 1) * m_rhs;
+    dst += m_lhs.col(m_lhs.cols() - 1).rowwise().template replicate<MatrixType::ColsAtCompileTime>(m_rhs.cols());
   }
 
   typename LhsMatrixTypeCleaned::Nested m_lhs;
   typename MatrixType::Nested m_rhs;
 };
 
-template<typename MatrixType,typename Rhs>
-struct traits<homogeneous_right_product_impl<Homogeneous<MatrixType,Horizontal>,Rhs> >
-{
-  typedef typename make_proper_matrix_type<typename traits<MatrixType>::Scalar,
-                 MatrixType::RowsAtCompileTime,
-                 Rhs::ColsAtCompileTime,
-                 MatrixType::PlainObject::Options,
-                 MatrixType::MaxRowsAtCompileTime,
-                 Rhs::MaxColsAtCompileTime>::type ReturnType;
+template <typename MatrixType, typename Rhs>
+struct traits<homogeneous_right_product_impl<Homogeneous<MatrixType, Horizontal>, Rhs> > {
+  typedef
+      typename make_proper_matrix_type<typename traits<MatrixType>::Scalar, MatrixType::RowsAtCompileTime,
+                                       Rhs::ColsAtCompileTime, MatrixType::PlainObject::Options,
+                                       MatrixType::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime>::type ReturnType;
 };
 
-template<typename MatrixType,typename Rhs>
-struct homogeneous_right_product_impl<Homogeneous<MatrixType,Horizontal>,Rhs>
-  : public ReturnByValue<homogeneous_right_product_impl<Homogeneous<MatrixType,Horizontal>,Rhs> >
-{
-  typedef typename remove_all<typename Rhs::Nested>::type RhsNested;
-  typedef typename MatrixType::Index Index;
-  homogeneous_right_product_impl(const MatrixType& lhs, const Rhs& rhs)
-    : m_lhs(lhs), m_rhs(rhs)
-  {}
-
-  inline Index rows() const { return m_lhs.rows(); }
-  inline Index cols() const { return m_rhs.cols(); }
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
+template <typename MatrixType, typename Rhs>
+struct homogeneous_right_product_impl<Homogeneous<MatrixType, Horizontal>, Rhs>
+    : public ReturnByValue<homogeneous_right_product_impl<Homogeneous<MatrixType, Horizontal>, Rhs> > {
+  typedef remove_all_t<typename Rhs::Nested> RhsNested;
+  EIGEN_DEVICE_FUNC homogeneous_right_product_impl(const MatrixType& lhs, const Rhs& rhs) : m_lhs(lhs), m_rhs(rhs) {}
+
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_lhs.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); }
+
+  template <typename Dest>
+  EIGEN_DEVICE_FUNC void evalTo(Dest& dst) const {
     // FIXME investigate how to allow lazy evaluation of this product when possible
-    dst = m_lhs * Block<const RhsNested,
-                        RhsNested::RowsAtCompileTime==Dynamic?Dynamic:RhsNested::RowsAtCompileTime-1,
-                        RhsNested::ColsAtCompileTime>
-            (m_rhs,0,0,m_rhs.rows()-1,m_rhs.cols());
-    dst += m_rhs.row(m_rhs.rows()-1).colwise()
-            .template replicate<MatrixType::RowsAtCompileTime>(m_lhs.rows());
+    dst = m_lhs * Block < const RhsNested,
+    RhsNested::RowsAtCompileTime == Dynamic ? Dynamic : RhsNested::RowsAtCompileTime - 1,
+    RhsNested::ColsAtCompileTime > (m_rhs, 0, 0, m_rhs.rows() - 1, m_rhs.cols());
+    dst += m_rhs.row(m_rhs.rows() - 1).colwise().template replicate<MatrixType::RowsAtCompileTime>(m_lhs.rows());
   }
 
   typename MatrixType::Nested m_lhs;
   typename Rhs::Nested m_rhs;
 };
 
-} // end namespace internal
+template <typename ArgType, int Direction>
+struct evaluator_traits<Homogeneous<ArgType, Direction> > {
+  typedef typename storage_kind_to_evaluator_kind<typename ArgType::StorageKind>::Kind Kind;
+  typedef HomogeneousShape Shape;
+};
+
+template <>
+struct AssignmentKind<DenseShape, HomogeneousShape> {
+  typedef Dense2Dense Kind;
+};
+
+template <typename ArgType, int Direction>
+struct unary_evaluator<Homogeneous<ArgType, Direction>, IndexBased>
+    : evaluator<typename Homogeneous<ArgType, Direction>::PlainObject> {
+  typedef Homogeneous<ArgType, Direction> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& op) : Base(), m_temp(op) {
+    internal::construct_at<Base>(this, m_temp);
+  }
+
+ protected:
+  PlainObject m_temp;
+};
+
+// dense = homogeneous
+template <typename DstXprType, typename ArgType, typename Scalar>
+struct Assignment<DstXprType, Homogeneous<ArgType, Vertical>, internal::assign_op<Scalar, typename ArgType::Scalar>,
+                  Dense2Dense> {
+  typedef Homogeneous<ArgType, Vertical> SrcXprType;
+  EIGEN_DEVICE_FUNC static void run(DstXprType& dst, const SrcXprType& src,
+                                    const internal::assign_op<Scalar, typename ArgType::Scalar>&) {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if ((dst.rows() != dstRows) || (dst.cols() != dstCols)) dst.resize(dstRows, dstCols);
+
+    dst.template topRows<ArgType::RowsAtCompileTime>(src.nestedExpression().rows()) = src.nestedExpression();
+    dst.row(dst.rows() - 1).setOnes();
+  }
+};
+
+// dense = homogeneous
+template <typename DstXprType, typename ArgType, typename Scalar>
+struct Assignment<DstXprType, Homogeneous<ArgType, Horizontal>, internal::assign_op<Scalar, typename ArgType::Scalar>,
+                  Dense2Dense> {
+  typedef Homogeneous<ArgType, Horizontal> SrcXprType;
+  EIGEN_DEVICE_FUNC static void run(DstXprType& dst, const SrcXprType& src,
+                                    const internal::assign_op<Scalar, typename ArgType::Scalar>&) {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if ((dst.rows() != dstRows) || (dst.cols() != dstCols)) dst.resize(dstRows, dstCols);
+
+    dst.template leftCols<ArgType::ColsAtCompileTime>(src.nestedExpression().cols()) = src.nestedExpression();
+    dst.col(dst.cols() - 1).setOnes();
+  }
+};
+
+template <typename LhsArg, typename Rhs, int ProductTag>
+struct generic_product_impl<Homogeneous<LhsArg, Horizontal>, Rhs, HomogeneousShape, DenseShape, ProductTag> {
+  template <typename Dest>
+  EIGEN_DEVICE_FUNC static void evalTo(Dest& dst, const Homogeneous<LhsArg, Horizontal>& lhs, const Rhs& rhs) {
+    homogeneous_right_product_impl<Homogeneous<LhsArg, Horizontal>, Rhs>(lhs.nestedExpression(), rhs).evalTo(dst);
+  }
+};
+
+template <typename Lhs, typename Rhs>
+struct homogeneous_right_product_refactoring_helper {
+  enum { Dim = Lhs::ColsAtCompileTime, Rows = Lhs::RowsAtCompileTime };
+  typedef typename Rhs::template ConstNRowsBlockXpr<Dim>::Type LinearBlockConst;
+  typedef std::remove_const_t<LinearBlockConst> LinearBlock;
+  typedef typename Rhs::ConstRowXpr ConstantColumn;
+  typedef Replicate<const ConstantColumn, Rows, 1> ConstantBlock;
+  typedef Product<Lhs, LinearBlock, LazyProduct> LinearProduct;
+  typedef CwiseBinaryOp<internal::scalar_sum_op<typename Lhs::Scalar, typename Rhs::Scalar>, const LinearProduct,
+                        const ConstantBlock>
+      Xpr;
+};
+
+template <typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, HomogeneousShape, DenseShape>
+    : public evaluator<
+          typename homogeneous_right_product_refactoring_helper<typename Lhs::NestedExpression, Rhs>::Xpr> {
+  typedef Product<Lhs, Rhs, LazyProduct> XprType;
+  typedef homogeneous_right_product_refactoring_helper<typename Lhs::NestedExpression, Rhs> helper;
+  typedef typename helper::ConstantBlock ConstantBlock;
+  typedef typename helper::Xpr RefactoredXpr;
+  typedef evaluator<RefactoredXpr> Base;
+
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+      : Base(xpr.lhs().nestedExpression().lazyProduct(
+                 xpr.rhs().template topRows<helper::Dim>(xpr.lhs().nestedExpression().cols())) +
+             ConstantBlock(xpr.rhs().row(xpr.rhs().rows() - 1), xpr.lhs().rows(), 1)) {}
+};
+
+template <typename Lhs, typename RhsArg, int ProductTag>
+struct generic_product_impl<Lhs, Homogeneous<RhsArg, Vertical>, DenseShape, HomogeneousShape, ProductTag> {
+  template <typename Dest>
+  EIGEN_DEVICE_FUNC static void evalTo(Dest& dst, const Lhs& lhs, const Homogeneous<RhsArg, Vertical>& rhs) {
+    homogeneous_left_product_impl<Homogeneous<RhsArg, Vertical>, Lhs>(lhs, rhs.nestedExpression()).evalTo(dst);
+  }
+};
+
+// TODO: the following specialization is to address a regression from 3.2 to 3.3
+// In the future, this path should be optimized.
+template <typename Lhs, typename RhsArg, int ProductTag>
+struct generic_product_impl<Lhs, Homogeneous<RhsArg, Vertical>, TriangularShape, HomogeneousShape, ProductTag> {
+  template <typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Homogeneous<RhsArg, Vertical>& rhs) {
+    dst.noalias() = lhs * rhs.eval();
+  }
+};
+
+template <typename Lhs, typename Rhs>
+struct homogeneous_left_product_refactoring_helper {
+  enum { Dim = Rhs::RowsAtCompileTime, Cols = Rhs::ColsAtCompileTime };
+  typedef typename Lhs::template ConstNColsBlockXpr<Dim>::Type LinearBlockConst;
+  typedef std::remove_const_t<LinearBlockConst> LinearBlock;
+  typedef typename Lhs::ConstColXpr ConstantColumn;
+  typedef Replicate<const ConstantColumn, 1, Cols> ConstantBlock;
+  typedef Product<LinearBlock, Rhs, LazyProduct> LinearProduct;
+  typedef CwiseBinaryOp<internal::scalar_sum_op<typename Lhs::Scalar, typename Rhs::Scalar>, const LinearProduct,
+                        const ConstantBlock>
+      Xpr;
+};
+
+template <typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, LazyProduct>, ProductTag, DenseShape, HomogeneousShape>
+    : public evaluator<typename homogeneous_left_product_refactoring_helper<Lhs, typename Rhs::NestedExpression>::Xpr> {
+  typedef Product<Lhs, Rhs, LazyProduct> XprType;
+  typedef homogeneous_left_product_refactoring_helper<Lhs, typename Rhs::NestedExpression> helper;
+  typedef typename helper::ConstantBlock ConstantBlock;
+  typedef typename helper::Xpr RefactoredXpr;
+  typedef evaluator<RefactoredXpr> Base;
+
+  EIGEN_DEVICE_FUNC explicit product_evaluator(const XprType& xpr)
+      : Base(xpr.lhs()
+                 .template leftCols<helper::Dim>(xpr.rhs().nestedExpression().rows())
+                 .lazyProduct(xpr.rhs().nestedExpression()) +
+             ConstantBlock(xpr.lhs().col(xpr.lhs().cols() - 1), 1, xpr.rhs().cols())) {}
+};
+
+template <typename Scalar, int Dim, int Mode, int Options, typename RhsArg, int ProductTag>
+struct generic_product_impl<Transform<Scalar, Dim, Mode, Options>, Homogeneous<RhsArg, Vertical>, DenseShape,
+                            HomogeneousShape, ProductTag> {
+  typedef Transform<Scalar, Dim, Mode, Options> TransformType;
+  template <typename Dest>
+  EIGEN_DEVICE_FUNC static void evalTo(Dest& dst, const TransformType& lhs, const Homogeneous<RhsArg, Vertical>& rhs) {
+    homogeneous_left_product_impl<Homogeneous<RhsArg, Vertical>, TransformType>(lhs, rhs.nestedExpression())
+        .evalTo(dst);
+  }
+};
+
+template <typename ExpressionType, int Side, bool Transposed>
+struct permutation_matrix_product<ExpressionType, Side, Transposed, HomogeneousShape>
+    : public permutation_matrix_product<ExpressionType, Side, Transposed, DenseShape> {};
+
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_HOMOGENEOUS_H
+#endif  // EIGEN_HOMOGENEOUS_H
diff --git a/inst/include/Eigen/src/Geometry/Hyperplane.h b/inst/include/Eigen/src/Geometry/Hyperplane.h
index 00b7c430..0fa0319a 100644
--- a/inst/include/Eigen/src/Geometry/Hyperplane.h
+++ b/inst/include/Eigen/src/Geometry/Hyperplane.h
@@ -11,82 +11,76 @@
 #ifndef EIGEN_HYPERPLANE_H
 #define EIGEN_HYPERPLANE_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Hyperplane
-  *
-  * \brief A hyperplane
-  *
-  * A hyperplane is an affine subspace of dimension n-1 in a space of dimension n.
-  * For example, a hyperplane in a plane is a line; a hyperplane in 3-space is a plane.
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
-  *             Notice that the dimension of the hyperplane is _AmbientDim-1.
-  *
-  * This class represents an hyperplane as the zero set of the implicit equation
-  * \f$ n \cdot x + d = 0 \f$ where \f$ n \f$ is a unit normal vector of the plane (linear part)
-  * and \f$ d \f$ is the distance (offset) to the origin.
-  */
-template <typename _Scalar, int _AmbientDim, int _Options>
-class Hyperplane
-{
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim==Dynamic ? Dynamic : _AmbientDim+1)
-  enum {
-    AmbientDimAtCompileTime = _AmbientDim,
-    Options = _Options
-  };
-  typedef _Scalar Scalar;
+ *
+ * \class Hyperplane
+ *
+ * \brief A hyperplane
+ *
+ * A hyperplane is an affine subspace of dimension n-1 in a space of dimension n.
+ * For example, a hyperplane in a plane is a line; a hyperplane in 3-space is a plane.
+ *
+ * \tparam Scalar_ the scalar type, i.e., the type of the coefficients
+ * \tparam AmbientDim_ the dimension of the ambient space, can be a compile time value or Dynamic.
+ *             Notice that the dimension of the hyperplane is AmbientDim_-1.
+ *
+ * This class represents an hyperplane as the zero set of the implicit equation
+ * \f$ n \cdot x + d = 0 \f$ where \f$ n \f$ is a unit normal vector of the plane (linear part)
+ * and \f$ d \f$ is the distance (offset) to the origin.
+ */
+template <typename Scalar_, int AmbientDim_, int Options_>
+class Hyperplane {
+ public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar_,
+                                                             AmbientDim_ == Dynamic ? Dynamic : AmbientDim_ + 1)
+  enum { AmbientDimAtCompileTime = AmbientDim_, Options = Options_ };
+  typedef Scalar_ Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef DenseIndex Index;
-  typedef Matrix<Scalar,AmbientDimAtCompileTime,1> VectorType;
-  typedef Matrix<Scalar,Index(AmbientDimAtCompileTime)==Dynamic
-                        ? Dynamic
-                        : Index(AmbientDimAtCompileTime)+1,1,Options> Coefficients;
-  typedef Block<Coefficients,AmbientDimAtCompileTime,1> NormalReturnType;
-  typedef const Block<const Coefficients,AmbientDimAtCompileTime,1> ConstNormalReturnType;
+  typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
+  typedef Matrix<Scalar, AmbientDimAtCompileTime, 1> VectorType;
+  typedef Matrix<Scalar, Index(AmbientDimAtCompileTime) == Dynamic ? Dynamic : Index(AmbientDimAtCompileTime) + 1, 1,
+                 Options>
+      Coefficients;
+  typedef Block<Coefficients, AmbientDimAtCompileTime, 1> NormalReturnType;
+  typedef const Block<const Coefficients, AmbientDimAtCompileTime, 1> ConstNormalReturnType;
 
   /** Default constructor without initialization */
-  inline Hyperplane() {}
-  
-  template<int OtherOptions>
-  Hyperplane(const Hyperplane<Scalar,AmbientDimAtCompileTime,OtherOptions>& other)
-   : m_coeffs(other.coeffs())
-  {}
+  EIGEN_DEVICE_FUNC inline Hyperplane() {}
+
+  template <int OtherOptions>
+  EIGEN_DEVICE_FUNC Hyperplane(const Hyperplane<Scalar, AmbientDimAtCompileTime, OtherOptions>& other)
+      : m_coeffs(other.coeffs()) {}
 
   /** Constructs a dynamic-size hyperplane with \a _dim the dimension
-    * of the ambient space */
-  inline explicit Hyperplane(Index _dim) : m_coeffs(_dim+1) {}
+   * of the ambient space */
+  EIGEN_DEVICE_FUNC inline explicit Hyperplane(Index _dim) : m_coeffs(_dim + 1) {}
 
   /** Construct a plane from its normal \a n and a point \a e onto the plane.
-    * \warning the vector normal is assumed to be normalized.
-    */
-  inline Hyperplane(const VectorType& n, const VectorType& e)
-    : m_coeffs(n.size()+1)
-  {
+   * \warning the vector normal is assumed to be normalized.
+   */
+  EIGEN_DEVICE_FUNC inline Hyperplane(const VectorType& n, const VectorType& e) : m_coeffs(n.size() + 1) {
     normal() = n;
     offset() = -n.dot(e);
   }
 
   /** Constructs a plane from its normal \a n and distance to the origin \a d
-    * such that the algebraic equation of the plane is \f$ n \cdot x + d = 0 \f$.
-    * \warning the vector normal is assumed to be normalized.
-    */
-  inline Hyperplane(const VectorType& n, const Scalar& d)
-    : m_coeffs(n.size()+1)
-  {
+   * such that the algebraic equation of the plane is \f$ n \cdot x + d = 0 \f$.
+   * \warning the vector normal is assumed to be normalized.
+   */
+  EIGEN_DEVICE_FUNC inline Hyperplane(const VectorType& n, const Scalar& d) : m_coeffs(n.size() + 1) {
     normal() = n;
     offset() = d;
   }
 
   /** Constructs a hyperplane passing through the two points. If the dimension of the ambient space
-    * is greater than 2, then there isn't uniqueness, so an arbitrary choice is made.
-    */
-  static inline Hyperplane Through(const VectorType& p0, const VectorType& p1)
-  {
+   * is greater than 2, then there isn't uniqueness, so an arbitrary choice is made.
+   */
+  EIGEN_DEVICE_FUNC static inline Hyperplane Through(const VectorType& p0, const VectorType& p1) {
     Hyperplane result(p0.size());
     result.normal() = (p1 - p0).unitOrthogonal();
     result.offset() = -p0.dot(result.normal());
@@ -94,187 +88,186 @@ class Hyperplane
   }
 
   /** Constructs a hyperplane passing through the three points. The dimension of the ambient space
-    * is required to be exactly 3.
-    */
-  static inline Hyperplane Through(const VectorType& p0, const VectorType& p1, const VectorType& p2)
-  {
+   * is required to be exactly 3.
+   */
+  EIGEN_DEVICE_FUNC static inline Hyperplane Through(const VectorType& p0, const VectorType& p1, const VectorType& p2) {
     EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 3)
     Hyperplane result(p0.size());
     VectorType v0(p2 - p0), v1(p1 - p0);
     result.normal() = v0.cross(v1);
     RealScalar norm = result.normal().norm();
-    if(norm <= v0.norm() * v1.norm() * NumTraits<RealScalar>::epsilon())
-    {
-      Matrix<Scalar,2,3> m; m << v0.transpose(), v1.transpose();
-      JacobiSVD<Matrix<Scalar,2,3> > svd(m, ComputeFullV);
+    if (norm <= v0.norm() * v1.norm() * NumTraits<RealScalar>::epsilon()) {
+      Matrix<Scalar, 2, 3> m;
+      m << v0.transpose(), v1.transpose();
+      JacobiSVD<Matrix<Scalar, 2, 3>, ComputeFullV> svd(m);
       result.normal() = svd.matrixV().col(2);
-    }
-    else
+    } else
       result.normal() /= norm;
     result.offset() = -p0.dot(result.normal());
     return result;
   }
 
   /** Constructs a hyperplane passing through the parametrized line \a parametrized.
-    * If the dimension of the ambient space is greater than 2, then there isn't uniqueness,
-    * so an arbitrary choice is made.
-    */
-  // FIXME to be consitent with the rest this could be implemented as a static Through function ??
-  explicit Hyperplane(const ParametrizedLine<Scalar, AmbientDimAtCompileTime>& parametrized)
-  {
+   * If the dimension of the ambient space is greater than 2, then there isn't uniqueness,
+   * so an arbitrary choice is made.
+   */
+  // FIXME to be consistent with the rest this could be implemented as a static Through function ??
+  EIGEN_DEVICE_FUNC explicit Hyperplane(const ParametrizedLine<Scalar, AmbientDimAtCompileTime>& parametrized) {
     normal() = parametrized.direction().unitOrthogonal();
     offset() = -parametrized.origin().dot(normal());
   }
 
-  ~Hyperplane() {}
+  EIGEN_DEVICE_FUNC ~Hyperplane() {}
 
   /** \returns the dimension in which the plane holds */
-  inline Index dim() const { return AmbientDimAtCompileTime==Dynamic ? m_coeffs.size()-1 : Index(AmbientDimAtCompileTime); }
+  EIGEN_DEVICE_FUNC inline Index dim() const {
+    return AmbientDimAtCompileTime == Dynamic ? m_coeffs.size() - 1 : Index(AmbientDimAtCompileTime);
+  }
 
   /** normalizes \c *this */
-  void normalize(void)
-  {
-    m_coeffs /= normal().norm();
-  }
+  EIGEN_DEVICE_FUNC void normalize(void) { m_coeffs /= normal().norm(); }
 
   /** \returns the signed distance between the plane \c *this and a point \a p.
-    * \sa absDistance()
-    */
-  inline Scalar signedDistance(const VectorType& p) const { return normal().dot(p) + offset(); }
+   * \sa absDistance()
+   */
+  EIGEN_DEVICE_FUNC inline Scalar signedDistance(const VectorType& p) const { return normal().dot(p) + offset(); }
 
   /** \returns the absolute distance between the plane \c *this and a point \a p.
-    * \sa signedDistance()
-    */
-  inline Scalar absDistance(const VectorType& p) const { using std::abs; return abs(signedDistance(p)); }
+   * \sa signedDistance()
+   */
+  EIGEN_DEVICE_FUNC inline Scalar absDistance(const VectorType& p) const { return numext::abs(signedDistance(p)); }
 
   /** \returns the projection of a point \a p onto the plane \c *this.
-    */
-  inline VectorType projection(const VectorType& p) const { return p - signedDistance(p) * normal(); }
+   */
+  EIGEN_DEVICE_FUNC inline VectorType projection(const VectorType& p) const { return p - signedDistance(p) * normal(); }
 
   /** \returns a constant reference to the unit normal vector of the plane, which corresponds
-    * to the linear part of the implicit equation.
-    */
-  inline ConstNormalReturnType normal() const { return ConstNormalReturnType(m_coeffs,0,0,dim(),1); }
+   * to the linear part of the implicit equation.
+   */
+  EIGEN_DEVICE_FUNC inline ConstNormalReturnType normal() const {
+    return ConstNormalReturnType(m_coeffs, 0, 0, dim(), 1);
+  }
 
   /** \returns a non-constant reference to the unit normal vector of the plane, which corresponds
-    * to the linear part of the implicit equation.
-    */
-  inline NormalReturnType normal() { return NormalReturnType(m_coeffs,0,0,dim(),1); }
+   * to the linear part of the implicit equation.
+   */
+  EIGEN_DEVICE_FUNC inline NormalReturnType normal() { return NormalReturnType(m_coeffs, 0, 0, dim(), 1); }
 
   /** \returns the distance to the origin, which is also the "constant term" of the implicit equation
-    * \warning the vector normal is assumed to be normalized.
-    */
-  inline const Scalar& offset() const { return m_coeffs.coeff(dim()); }
+   * \warning the vector normal is assumed to be normalized.
+   */
+  EIGEN_DEVICE_FUNC inline const Scalar& offset() const { return m_coeffs.coeff(dim()); }
 
   /** \returns a non-constant reference to the distance to the origin, which is also the constant part
-    * of the implicit equation */
-  inline Scalar& offset() { return m_coeffs(dim()); }
+   * of the implicit equation */
+  EIGEN_DEVICE_FUNC inline Scalar& offset() { return m_coeffs(dim()); }
 
   /** \returns a constant reference to the coefficients c_i of the plane equation:
-    * \f$ c_0*x_0 + ... + c_{d-1}*x_{d-1} + c_d = 0 \f$
-    */
-  inline const Coefficients& coeffs() const { return m_coeffs; }
+   * \f$ c_0*x_0 + ... + c_{d-1}*x_{d-1} + c_d = 0 \f$
+   */
+  EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs; }
 
   /** \returns a non-constant reference to the coefficients c_i of the plane equation:
-    * \f$ c_0*x_0 + ... + c_{d-1}*x_{d-1} + c_d = 0 \f$
-    */
-  inline Coefficients& coeffs() { return m_coeffs; }
+   * \f$ c_0*x_0 + ... + c_{d-1}*x_{d-1} + c_d = 0 \f$
+   */
+  EIGEN_DEVICE_FUNC inline Coefficients& coeffs() { return m_coeffs; }
 
   /** \returns the intersection of *this with \a other.
-    *
-    * \warning The ambient space must be a plane, i.e. have dimension 2, so that \c *this and \a other are lines.
-    *
-    * \note If \a other is approximately parallel to *this, this method will return any point on *this.
-    */
-  VectorType intersection(const Hyperplane& other) const
-  {
-    using std::abs;
+   *
+   * \warning The ambient space must be a plane, i.e. have dimension 2, so that \c *this and \a other are lines.
+   *
+   * \note If \a other is approximately parallel to *this, this method will return any point on *this.
+   */
+  EIGEN_DEVICE_FUNC VectorType intersection(const Hyperplane& other) const {
     EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 2)
     Scalar det = coeffs().coeff(0) * other.coeffs().coeff(1) - coeffs().coeff(1) * other.coeffs().coeff(0);
     // since the line equations ax+by=c are normalized with a^2+b^2=1, the following tests
     // whether the two lines are approximately parallel.
-    if(internal::isMuchSmallerThan(det, Scalar(1)))
-    {   // special case where the two lines are approximately parallel. Pick any point on the first line.
-        if(abs(coeffs().coeff(1))>abs(coeffs().coeff(0)))
-            return VectorType(coeffs().coeff(1), -coeffs().coeff(2)/coeffs().coeff(1)-coeffs().coeff(0));
-        else
-            return VectorType(-coeffs().coeff(2)/coeffs().coeff(0)-coeffs().coeff(1), coeffs().coeff(0));
-    }
-    else
-    {   // general case
-        Scalar invdet = Scalar(1) / det;
-        return VectorType(invdet*(coeffs().coeff(1)*other.coeffs().coeff(2)-other.coeffs().coeff(1)*coeffs().coeff(2)),
-                          invdet*(other.coeffs().coeff(0)*coeffs().coeff(2)-coeffs().coeff(0)*other.coeffs().coeff(2)));
+    if (internal::isMuchSmallerThan(det, Scalar(1))) {  // special case where the two lines are approximately parallel.
+                                                        // Pick any point on the first line.
+      if (numext::abs(coeffs().coeff(1)) > numext::abs(coeffs().coeff(0)))
+        return VectorType(coeffs().coeff(1), -coeffs().coeff(2) / coeffs().coeff(1) - coeffs().coeff(0));
+      else
+        return VectorType(-coeffs().coeff(2) / coeffs().coeff(0) - coeffs().coeff(1), coeffs().coeff(0));
+    } else {  // general case
+      Scalar invdet = Scalar(1) / det;
+      return VectorType(
+          invdet * (coeffs().coeff(1) * other.coeffs().coeff(2) - other.coeffs().coeff(1) * coeffs().coeff(2)),
+          invdet * (other.coeffs().coeff(0) * coeffs().coeff(2) - coeffs().coeff(0) * other.coeffs().coeff(2)));
     }
   }
 
   /** Applies the transformation matrix \a mat to \c *this and returns a reference to \c *this.
-    *
-    * \param mat the Dim x Dim transformation matrix
-    * \param traits specifies whether the matrix \a mat represents an #Isometry
-    *               or a more generic #Affine transformation. The default is #Affine.
-    */
-  template<typename XprType>
-  inline Hyperplane& transform(const MatrixBase<XprType>& mat, TransformTraits traits = Affine)
-  {
-    if (traits==Affine)
+   *
+   * \param mat the Dim x Dim transformation matrix
+   * \param traits specifies whether the matrix \a mat represents an #Isometry
+   *               or a more generic #Affine transformation. The default is #Affine.
+   */
+  template <typename XprType>
+  EIGEN_DEVICE_FUNC inline Hyperplane& transform(const MatrixBase<XprType>& mat, TransformTraits traits = Affine) {
+    if (traits == Affine) {
       normal() = mat.inverse().transpose() * normal();
-    else if (traits==Isometry)
+      m_coeffs /= normal().norm();
+    } else if (traits == Isometry)
       normal() = mat * normal();
-    else
-    {
+    else {
       eigen_assert(0 && "invalid traits value in Hyperplane::transform()");
     }
     return *this;
   }
 
   /** Applies the transformation \a t to \c *this and returns a reference to \c *this.
-    *
-    * \param t the transformation of dimension Dim
-    * \param traits specifies whether the transformation \a t represents an #Isometry
-    *               or a more generic #Affine transformation. The default is #Affine.
-    *               Other kind of transformations are not supported.
-    */
-  template<int TrOptions>
-  inline Hyperplane& transform(const Transform<Scalar,AmbientDimAtCompileTime,Affine,TrOptions>& t,
-                                TransformTraits traits = Affine)
-  {
+   *
+   * \param t the transformation of dimension Dim
+   * \param traits specifies whether the transformation \a t represents an #Isometry
+   *               or a more generic #Affine transformation. The default is #Affine.
+   *               Other kind of transformations are not supported.
+   */
+  template <int TrOptions>
+  EIGEN_DEVICE_FUNC inline Hyperplane& transform(const Transform<Scalar, AmbientDimAtCompileTime, Affine, TrOptions>& t,
+                                                 TransformTraits traits = Affine) {
     transform(t.linear(), traits);
     offset() -= normal().dot(t.translation());
     return *this;
   }
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Hyperplane,
-           Hyperplane<NewScalarType,AmbientDimAtCompileTime,Options> >::type cast() const
-  {
-    return typename internal::cast_return_type<Hyperplane,
-                    Hyperplane<NewScalarType,AmbientDimAtCompileTime,Options> >::type(*this);
+   *
+   * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+   * then this function smartly returns a const reference to \c *this.
+   */
+  template <typename NewScalarType>
+  EIGEN_DEVICE_FUNC inline
+      typename internal::cast_return_type<Hyperplane,
+                                          Hyperplane<NewScalarType, AmbientDimAtCompileTime, Options> >::type
+      cast() const {
+    return
+        typename internal::cast_return_type<Hyperplane,
+                                            Hyperplane<NewScalarType, AmbientDimAtCompileTime, Options> >::type(*this);
   }
 
   /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType,int OtherOptions>
-  inline explicit Hyperplane(const Hyperplane<OtherScalarType,AmbientDimAtCompileTime,OtherOptions>& other)
-  { m_coeffs = other.coeffs().template cast<Scalar>(); }
+  template <typename OtherScalarType, int OtherOptions>
+  EIGEN_DEVICE_FUNC inline explicit Hyperplane(
+      const Hyperplane<OtherScalarType, AmbientDimAtCompileTime, OtherOptions>& other) {
+    m_coeffs = other.coeffs().template cast<Scalar>();
+  }
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  template<int OtherOptions>
-  bool isApprox(const Hyperplane<Scalar,AmbientDimAtCompileTime,OtherOptions>& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
-  { return m_coeffs.isApprox(other.m_coeffs, prec); }
-
-protected:
+   * determined by \a prec.
+   *
+   * \sa MatrixBase::isApprox() */
+  template <int OtherOptions>
+  EIGEN_DEVICE_FUNC bool isApprox(
+      const Hyperplane<Scalar, AmbientDimAtCompileTime, OtherOptions>& other,
+      const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const {
+    return m_coeffs.isApprox(other.m_coeffs, prec);
+  }
 
+ protected:
   Coefficients m_coeffs;
 };
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_HYPERPLANE_H
+#endif  // EIGEN_HYPERPLANE_H
diff --git a/inst/include/Eigen/src/Geometry/InternalHeaderCheck.h b/inst/include/Eigen/src/Geometry/InternalHeaderCheck.h
new file mode 100644
index 00000000..a1159a3c
--- /dev/null
+++ b/inst/include/Eigen/src/Geometry/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_GEOMETRY_MODULE_H
+#error "Please include Eigen/Geometry instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/Geometry/OrthoMethods.h b/inst/include/Eigen/src/Geometry/OrthoMethods.h
index 556bc816..fc708ee2 100644
--- a/inst/include/Eigen/src/Geometry/OrthoMethods.h
+++ b/inst/include/Eigen/src/Geometry/OrthoMethods.h
@@ -11,151 +11,193 @@
 #ifndef EIGEN_ORTHOMETHODS_H
 #define EIGEN_ORTHOMETHODS_H
 
-namespace Eigen { 
-
-/** \geometry_module
-  *
-  * \returns the cross product of \c *this and \a other
-  *
-  * Here is a very good explanation of cross-product: http://xkcd.com/199/
-  * \sa MatrixBase::cross3()
-  */
-template<typename Derived>
-template<typename OtherDerived>
-inline typename MatrixBase<Derived>::template cross_product_return_type<OtherDerived>::type
-MatrixBase<Derived>::cross(const MatrixBase<OtherDerived>& other) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived,3)
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,3)
-
-  // Note that there is no need for an expression here since the compiler
-  // optimize such a small temporary very well (even within a complex expression)
-  typename internal::nested<Derived,2>::type lhs(derived());
-  typename internal::nested<OtherDerived,2>::type rhs(other.derived());
-  return typename cross_product_return_type<OtherDerived>::type(
-    numext::conj(lhs.coeff(1) * rhs.coeff(2) - lhs.coeff(2) * rhs.coeff(1)),
-    numext::conj(lhs.coeff(2) * rhs.coeff(0) - lhs.coeff(0) * rhs.coeff(2)),
-    numext::conj(lhs.coeff(0) * rhs.coeff(1) - lhs.coeff(1) * rhs.coeff(0))
-  );
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+// Vector3 version (default)
+template <typename Derived, typename OtherDerived, int Size>
+struct cross_impl {
+  typedef typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,
+                                        typename internal::traits<OtherDerived>::Scalar>::ReturnType Scalar;
+  typedef Matrix<Scalar, MatrixBase<Derived>::RowsAtCompileTime, MatrixBase<Derived>::ColsAtCompileTime> return_type;
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE return_type run(const MatrixBase<Derived>& first,
+                                                               const MatrixBase<OtherDerived>& second) {
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived, 3)
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived, 3)
+
+    // Note that there is no need for an expression here since the compiler
+    // optimize such a small temporary very well (even within a complex expression)
+    typename internal::nested_eval<Derived, 2>::type lhs(first.derived());
+    typename internal::nested_eval<OtherDerived, 2>::type rhs(second.derived());
+    return return_type(numext::conj(lhs.coeff(1) * rhs.coeff(2) - lhs.coeff(2) * rhs.coeff(1)),
+                       numext::conj(lhs.coeff(2) * rhs.coeff(0) - lhs.coeff(0) * rhs.coeff(2)),
+                       numext::conj(lhs.coeff(0) * rhs.coeff(1) - lhs.coeff(1) * rhs.coeff(0)));
+  }
+};
+
+// Vector2 version
+template <typename Derived, typename OtherDerived>
+struct cross_impl<Derived, OtherDerived, 2> {
+  typedef typename ScalarBinaryOpTraits<typename internal::traits<Derived>::Scalar,
+                                        typename internal::traits<OtherDerived>::Scalar>::ReturnType Scalar;
+  typedef Scalar return_type;
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE return_type run(const MatrixBase<Derived>& first,
+                                                               const MatrixBase<OtherDerived>& second) {
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived, 2);
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived, 2);
+    typename internal::nested_eval<Derived, 2>::type lhs(first.derived());
+    typename internal::nested_eval<OtherDerived, 2>::type rhs(second.derived());
+    return numext::conj(lhs.coeff(0) * rhs.coeff(1) - lhs.coeff(1) * rhs.coeff(0));
+  }
+};
+
+}  // end namespace internal
+
+/** \geometry_module \ingroup Geometry_Module
+ *
+ * \returns the cross product of \c *this and \a other. This is either a scalar for size-2 vectors or a size-3 vector
+ * for size-3 vectors.
+ *
+ * This method is implemented for two different cases: between vectors of fixed size 2 and between vectors of fixed
+ * size 3.
+ *
+ * For vectors of size 3, the output is simply the traditional cross product.
+ *
+ * For vectors of size 2, the output is a scalar.
+ * Given vectors \f$ v = \begin{bmatrix} v_1 & v_2 \end{bmatrix} \f$ and \f$ w = \begin{bmatrix} w_1 & w_2 \end{bmatrix}
+ * \f$, the result is simply \f$ v\times w = \overline{v_1 w_2 - v_2 w_1} = \text{conj}\left|\begin{smallmatrix} v_1 &
+ * w_1 \\ v_2 & w_2 \end{smallmatrix}\right| \f$; or, to put it differently, it is the third coordinate of the cross
+ * product of \f$ \begin{bmatrix} v_1 & v_2 & v_3 \end{bmatrix} \f$ and \f$ \begin{bmatrix} w_1 & w_2 & w_3
+ * \end{bmatrix} \f$. For real-valued inputs, the result can be interpreted as the signed area of a parallelogram
+ * spanned by the two vectors.
+ *
+ * \note With complex numbers, the cross product is implemented as
+ * \f[ (\mathbf{a}+i\mathbf{b}) \times (\mathbf{c}+i\mathbf{d}) = (\mathbf{a} \times \mathbf{c} - \mathbf{b} \times
+ * \mathbf{d}) - i(\mathbf{a} \times \mathbf{d} + \mathbf{b} \times \mathbf{c}).\f]
+ * This definition preserves the orthogonality condition that \f$\mathbf{u} \cdot (\mathbf{u} \times \mathbf{v}) =
+ * \mathbf{v} \cdot (\mathbf{u} \times \mathbf{v}) = 0\f$.
+ *
+ * \sa MatrixBase::cross3()
+ */
+template <typename Derived>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename internal::cross_impl<Derived, OtherDerived>::return_type
+MatrixBase<Derived>::cross(const MatrixBase<OtherDerived>& other) const {
+  return internal::cross_impl<Derived, OtherDerived>::run(*this, other);
 }
 
 namespace internal {
 
-template< int Arch,typename VectorLhs,typename VectorRhs,
-          typename Scalar = typename VectorLhs::Scalar,
-          bool Vectorizable = bool((VectorLhs::Flags&VectorRhs::Flags)&PacketAccessBit)>
+template <int Arch, typename VectorLhs, typename VectorRhs, typename Scalar = typename VectorLhs::Scalar,
+          bool Vectorizable =
+              bool((int(evaluator<VectorLhs>::Flags) & int(evaluator<VectorRhs>::Flags)) & PacketAccessBit)>
 struct cross3_impl {
-  static inline typename internal::plain_matrix_type<VectorLhs>::type
-  run(const VectorLhs& lhs, const VectorRhs& rhs)
-  {
+  EIGEN_DEVICE_FUNC static inline typename internal::plain_matrix_type<VectorLhs>::type run(const VectorLhs& lhs,
+                                                                                            const VectorRhs& rhs) {
     return typename internal::plain_matrix_type<VectorLhs>::type(
-      numext::conj(lhs.coeff(1) * rhs.coeff(2) - lhs.coeff(2) * rhs.coeff(1)),
-      numext::conj(lhs.coeff(2) * rhs.coeff(0) - lhs.coeff(0) * rhs.coeff(2)),
-      numext::conj(lhs.coeff(0) * rhs.coeff(1) - lhs.coeff(1) * rhs.coeff(0)),
-      0
-    );
+        numext::conj(lhs.coeff(1) * rhs.coeff(2) - lhs.coeff(2) * rhs.coeff(1)),
+        numext::conj(lhs.coeff(2) * rhs.coeff(0) - lhs.coeff(0) * rhs.coeff(2)),
+        numext::conj(lhs.coeff(0) * rhs.coeff(1) - lhs.coeff(1) * rhs.coeff(0)), 0);
   }
 };
 
-}
+}  // namespace internal
 
-/** \geometry_module
-  *
-  * \returns the cross product of \c *this and \a other using only the x, y, and z coefficients
-  *
-  * The size of \c *this and \a other must be four. This function is especially useful
-  * when using 4D vectors instead of 3D ones to get advantage of SSE/AltiVec vectorization.
-  *
-  * \sa MatrixBase::cross()
-  */
-template<typename Derived>
-template<typename OtherDerived>
-inline typename MatrixBase<Derived>::PlainObject
-MatrixBase<Derived>::cross3(const MatrixBase<OtherDerived>& other) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived,4)
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,4)
-
-  typedef typename internal::nested<Derived,2>::type DerivedNested;
-  typedef typename internal::nested<OtherDerived,2>::type OtherDerivedNested;
+/** \geometry_module \ingroup Geometry_Module
+ *
+ * \returns the cross product of \c *this and \a other using only the x, y, and z coefficients
+ *
+ * The size of \c *this and \a other must be four. This function is especially useful
+ * when using 4D vectors instead of 3D ones to get advantage of SSE/AltiVec vectorization.
+ *
+ * \sa MatrixBase::cross()
+ */
+template <typename Derived>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline typename MatrixBase<Derived>::PlainObject MatrixBase<Derived>::cross3(
+    const MatrixBase<OtherDerived>& other) const {
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived, 4)
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived, 4)
+
+  typedef typename internal::nested_eval<Derived, 2>::type DerivedNested;
+  typedef typename internal::nested_eval<OtherDerived, 2>::type OtherDerivedNested;
   DerivedNested lhs(derived());
   OtherDerivedNested rhs(other.derived());
 
-  return internal::cross3_impl<Architecture::Target,
-                        typename internal::remove_all<DerivedNested>::type,
-                        typename internal::remove_all<OtherDerivedNested>::type>::run(lhs,rhs);
+  return internal::cross3_impl<Architecture::Target, internal::remove_all_t<DerivedNested>,
+                               internal::remove_all_t<OtherDerivedNested>>::run(lhs, rhs);
 }
 
-/** \returns a matrix expression of the cross product of each column or row
-  * of the referenced expression with the \a other vector.
-  *
-  * The referenced matrix must have one dimension equal to 3.
-  * The result matrix has the same dimensions than the referenced one.
-  *
-  * \geometry_module
-  *
-  * \sa MatrixBase::cross() */
-template<typename ExpressionType, int Direction>
-template<typename OtherDerived>
-const typename VectorwiseOp<ExpressionType,Direction>::CrossReturnType
-VectorwiseOp<ExpressionType,Direction>::cross(const MatrixBase<OtherDerived>& other) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,3)
-  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
-    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-
-  CrossReturnType res(_expression().rows(),_expression().cols());
-  if(Direction==Vertical)
-  {
-    eigen_assert(CrossReturnType::RowsAtCompileTime==3 && "the matrix must have exactly 3 rows");
-    res.row(0) = (_expression().row(1) * other.coeff(2) - _expression().row(2) * other.coeff(1)).conjugate();
-    res.row(1) = (_expression().row(2) * other.coeff(0) - _expression().row(0) * other.coeff(2)).conjugate();
-    res.row(2) = (_expression().row(0) * other.coeff(1) - _expression().row(1) * other.coeff(0)).conjugate();
-  }
-  else
-  {
-    eigen_assert(CrossReturnType::ColsAtCompileTime==3 && "the matrix must have exactly 3 columns");
-    res.col(0) = (_expression().col(1) * other.coeff(2) - _expression().col(2) * other.coeff(1)).conjugate();
-    res.col(1) = (_expression().col(2) * other.coeff(0) - _expression().col(0) * other.coeff(2)).conjugate();
-    res.col(2) = (_expression().col(0) * other.coeff(1) - _expression().col(1) * other.coeff(0)).conjugate();
+/** \geometry_module \ingroup Geometry_Module
+ *
+ * \returns a matrix expression of the cross product of each column or row
+ * of the referenced expression with the \a other vector.
+ *
+ * The referenced matrix must have one dimension equal to 3.
+ * The result matrix has the same dimensions than the referenced one.
+ *
+ * \sa MatrixBase::cross() */
+template <typename ExpressionType, int Direction>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC const typename VectorwiseOp<ExpressionType, Direction>::CrossReturnType
+VectorwiseOp<ExpressionType, Direction>::cross(const MatrixBase<OtherDerived>& other) const {
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived, 3)
+  EIGEN_STATIC_ASSERT(
+      (internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
+      YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
+  typename internal::nested_eval<ExpressionType, 2>::type mat(_expression());
+  typename internal::nested_eval<OtherDerived, 2>::type vec(other.derived());
+
+  CrossReturnType res(_expression().rows(), _expression().cols());
+  if (Direction == Vertical) {
+    eigen_assert(CrossReturnType::RowsAtCompileTime == 3 && "the matrix must have exactly 3 rows");
+    res.row(0) = (mat.row(1) * vec.coeff(2) - mat.row(2) * vec.coeff(1)).conjugate();
+    res.row(1) = (mat.row(2) * vec.coeff(0) - mat.row(0) * vec.coeff(2)).conjugate();
+    res.row(2) = (mat.row(0) * vec.coeff(1) - mat.row(1) * vec.coeff(0)).conjugate();
+  } else {
+    eigen_assert(CrossReturnType::ColsAtCompileTime == 3 && "the matrix must have exactly 3 columns");
+    res.col(0) = (mat.col(1) * vec.coeff(2) - mat.col(2) * vec.coeff(1)).conjugate();
+    res.col(1) = (mat.col(2) * vec.coeff(0) - mat.col(0) * vec.coeff(2)).conjugate();
+    res.col(2) = (mat.col(0) * vec.coeff(1) - mat.col(1) * vec.coeff(0)).conjugate();
   }
   return res;
 }
 
 namespace internal {
 
-template<typename Derived, int Size = Derived::SizeAtCompileTime>
-struct unitOrthogonal_selector
-{
+template <typename Derived, int Size = Derived::SizeAtCompileTime>
+struct unitOrthogonal_selector {
   typedef typename plain_matrix_type<Derived>::type VectorType;
   typedef typename traits<Derived>::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef typename Derived::Index Index;
-  typedef Matrix<Scalar,2,1> Vector2;
-  static inline VectorType run(const Derived& src)
-  {
+  typedef Matrix<Scalar, 2, 1> Vector2;
+  EIGEN_DEVICE_FUNC static inline VectorType run(const Derived& src) {
     VectorType perp = VectorType::Zero(src.size());
     Index maxi = 0;
     Index sndi = 0;
     src.cwiseAbs().maxCoeff(&maxi);
-    if (maxi==0)
-      sndi = 1;
-    RealScalar invnm = RealScalar(1)/(Vector2() << src.coeff(sndi),src.coeff(maxi)).finished().norm();
+    if (maxi == 0) sndi = 1;
+    RealScalar invnm = RealScalar(1) / (Vector2() << src.coeff(sndi), src.coeff(maxi)).finished().norm();
     perp.coeffRef(maxi) = -numext::conj(src.coeff(sndi)) * invnm;
-    perp.coeffRef(sndi) =  numext::conj(src.coeff(maxi)) * invnm;
+    perp.coeffRef(sndi) = numext::conj(src.coeff(maxi)) * invnm;
 
     return perp;
-   }
+  }
 };
 
-template<typename Derived>
-struct unitOrthogonal_selector<Derived,3>
-{
+template <typename Derived>
+struct unitOrthogonal_selector<Derived, 3> {
   typedef typename plain_matrix_type<Derived>::type VectorType;
   typedef typename traits<Derived>::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  static inline VectorType run(const Derived& src)
-  {
+  EIGEN_DEVICE_FUNC static inline VectorType run(const Derived& src) {
     VectorType perp;
     /* Let us compute the crossed product of *this with a vector
      * that is not too close to being colinear to *this.
@@ -164,55 +206,52 @@ struct unitOrthogonal_selector<Derived,3>
     /* unless the x and y coords are both close to zero, we can
      * simply take ( -y, x, 0 ) and normalize it.
      */
-    if((!isMuchSmallerThan(src.x(), src.z()))
-    || (!isMuchSmallerThan(src.y(), src.z())))
-    {
-      RealScalar invnm = RealScalar(1)/src.template head<2>().norm();
-      perp.coeffRef(0) = -numext::conj(src.y())*invnm;
-      perp.coeffRef(1) = numext::conj(src.x())*invnm;
+    if ((!isMuchSmallerThan(src.x(), src.z())) || (!isMuchSmallerThan(src.y(), src.z()))) {
+      RealScalar invnm = RealScalar(1) / src.template head<2>().norm();
+      perp.coeffRef(0) = -numext::conj(src.y()) * invnm;
+      perp.coeffRef(1) = numext::conj(src.x()) * invnm;
       perp.coeffRef(2) = 0;
     }
     /* if both x and y are close to zero, then the vector is close
      * to the z-axis, so it's far from colinear to the x-axis for instance.
      * So we take the crossed product with (1,0,0) and normalize it.
      */
-    else
-    {
-      RealScalar invnm = RealScalar(1)/src.template tail<2>().norm();
+    else {
+      RealScalar invnm = RealScalar(1) / src.template tail<2>().norm();
       perp.coeffRef(0) = 0;
-      perp.coeffRef(1) = -numext::conj(src.z())*invnm;
-      perp.coeffRef(2) = numext::conj(src.y())*invnm;
+      perp.coeffRef(1) = -numext::conj(src.z()) * invnm;
+      perp.coeffRef(2) = numext::conj(src.y()) * invnm;
     }
 
     return perp;
-   }
+  }
 };
 
-template<typename Derived>
-struct unitOrthogonal_selector<Derived,2>
-{
+template <typename Derived>
+struct unitOrthogonal_selector<Derived, 2> {
   typedef typename plain_matrix_type<Derived>::type VectorType;
-  static inline VectorType run(const Derived& src)
-  { return VectorType(-numext::conj(src.y()), numext::conj(src.x())).normalized(); }
+  EIGEN_DEVICE_FUNC static inline VectorType run(const Derived& src) {
+    return VectorType(-numext::conj(src.y()), numext::conj(src.x())).normalized();
+  }
 };
 
-} // end namespace internal
-
-/** \returns a unit vector which is orthogonal to \c *this
-  *
-  * The size of \c *this must be at least 2. If the size is exactly 2,
-  * then the returned vector is a counter clock wise rotation of \c *this, i.e., (-y,x).normalized().
-  *
-  * \sa cross()
-  */
-template<typename Derived>
-typename MatrixBase<Derived>::PlainObject
-MatrixBase<Derived>::unitOrthogonal() const
-{
+}  // end namespace internal
+
+/** \geometry_module \ingroup Geometry_Module
+ *
+ * \returns a unit vector which is orthogonal to \c *this
+ *
+ * The size of \c *this must be at least 2. If the size is exactly 2,
+ * then the returned vector is a counter clock wise rotation of \c *this, i.e., (-y,x).normalized().
+ *
+ * \sa cross()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC typename MatrixBase<Derived>::PlainObject MatrixBase<Derived>::unitOrthogonal() const {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   return internal::unitOrthogonal_selector<Derived>::run(derived());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_ORTHOMETHODS_H
+#endif  // EIGEN_ORTHOMETHODS_H
diff --git a/inst/include/Eigen/src/Geometry/ParametrizedLine.h b/inst/include/Eigen/src/Geometry/ParametrizedLine.h
index 77fa228e..5bbd8743 100644
--- a/inst/include/Eigen/src/Geometry/ParametrizedLine.h
+++ b/inst/include/Eigen/src/Geometry/ParametrizedLine.h
@@ -11,185 +11,222 @@
 #ifndef EIGEN_PARAMETRIZEDLINE_H
 #define EIGEN_PARAMETRIZEDLINE_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \geometry_module \ingroup Geometry_Module
-  *
-  * \class ParametrizedLine
-  *
-  * \brief A parametrized line
-  *
-  * A parametrized line is defined by an origin point \f$ \mathbf{o} \f$ and a unit
-  * direction vector \f$ \mathbf{d} \f$ such that the line corresponds to
-  * the set \f$ l(t) = \mathbf{o} + t \mathbf{d} \f$, \f$ t \in \mathbf{R} \f$.
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  * \param _AmbientDim the dimension of the ambient space, can be a compile time value or Dynamic.
-  */
-template <typename _Scalar, int _AmbientDim, int _Options>
-class ParametrizedLine
-{
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_AmbientDim)
-  enum {
-    AmbientDimAtCompileTime = _AmbientDim,
-    Options = _Options
-  };
-  typedef _Scalar Scalar;
+ *
+ * \class ParametrizedLine
+ *
+ * \brief A parametrized line
+ *
+ * A parametrized line is defined by an origin point \f$ \mathbf{o} \f$ and a unit
+ * direction vector \f$ \mathbf{d} \f$ such that the line corresponds to
+ * the set \f$ l(t) = \mathbf{o} + t \mathbf{d} \f$, \f$ t \in \mathbf{R} \f$.
+ *
+ * \tparam Scalar_ the scalar type, i.e., the type of the coefficients
+ * \tparam AmbientDim_ the dimension of the ambient space, can be a compile time value or Dynamic.
+ */
+template <typename Scalar_, int AmbientDim_, int Options_>
+class ParametrizedLine {
+ public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar_, AmbientDim_)
+  enum { AmbientDimAtCompileTime = AmbientDim_, Options = Options_ };
+  typedef Scalar_ Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef DenseIndex Index;
-  typedef Matrix<Scalar,AmbientDimAtCompileTime,1,Options> VectorType;
+  typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
+  typedef Matrix<Scalar, AmbientDimAtCompileTime, 1, Options> VectorType;
 
   /** Default constructor without initialization */
-  inline ParametrizedLine() {}
-  
-  template<int OtherOptions>
-  ParametrizedLine(const ParametrizedLine<Scalar,AmbientDimAtCompileTime,OtherOptions>& other)
-   : m_origin(other.origin()), m_direction(other.direction())
-  {}
+  EIGEN_DEVICE_FUNC inline ParametrizedLine() {}
+
+  template <int OtherOptions>
+  EIGEN_DEVICE_FUNC ParametrizedLine(const ParametrizedLine<Scalar, AmbientDimAtCompileTime, OtherOptions>& other)
+      : m_origin(other.origin()), m_direction(other.direction()) {}
 
   /** Constructs a dynamic-size line with \a _dim the dimension
-    * of the ambient space */
-  inline explicit ParametrizedLine(Index _dim) : m_origin(_dim), m_direction(_dim) {}
+   * of the ambient space */
+  EIGEN_DEVICE_FUNC inline explicit ParametrizedLine(Index _dim) : m_origin(_dim), m_direction(_dim) {}
 
   /** Initializes a parametrized line of direction \a direction and origin \a origin.
-    * \warning the vector direction is assumed to be normalized.
-    */
-  ParametrizedLine(const VectorType& origin, const VectorType& direction)
-    : m_origin(origin), m_direction(direction) {}
+   * \warning the vector direction is assumed to be normalized.
+   */
+  EIGEN_DEVICE_FUNC ParametrizedLine(const VectorType& origin, const VectorType& direction)
+      : m_origin(origin), m_direction(direction) {}
 
   template <int OtherOptions>
-  explicit ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane);
+  EIGEN_DEVICE_FUNC explicit ParametrizedLine(const Hyperplane<Scalar_, AmbientDim_, OtherOptions>& hyperplane);
 
   /** Constructs a parametrized line going from \a p0 to \a p1. */
-  static inline ParametrizedLine Through(const VectorType& p0, const VectorType& p1)
-  { return ParametrizedLine(p0, (p1-p0).normalized()); }
+  EIGEN_DEVICE_FUNC static inline ParametrizedLine Through(const VectorType& p0, const VectorType& p1) {
+    return ParametrizedLine(p0, (p1 - p0).normalized());
+  }
 
-  ~ParametrizedLine() {}
+  EIGEN_DEVICE_FUNC ~ParametrizedLine() {}
 
   /** \returns the dimension in which the line holds */
-  inline Index dim() const { return m_direction.size(); }
+  EIGEN_DEVICE_FUNC inline Index dim() const { return m_direction.size(); }
 
-  const VectorType& origin() const { return m_origin; }
-  VectorType& origin() { return m_origin; }
+  EIGEN_DEVICE_FUNC const VectorType& origin() const { return m_origin; }
+  EIGEN_DEVICE_FUNC VectorType& origin() { return m_origin; }
 
-  const VectorType& direction() const { return m_direction; }
-  VectorType& direction() { return m_direction; }
+  EIGEN_DEVICE_FUNC const VectorType& direction() const { return m_direction; }
+  EIGEN_DEVICE_FUNC VectorType& direction() { return m_direction; }
 
   /** \returns the squared distance of a point \a p to its projection onto the line \c *this.
-    * \sa distance()
-    */
-  RealScalar squaredDistance(const VectorType& p) const
-  {
+   * \sa distance()
+   */
+  EIGEN_DEVICE_FUNC RealScalar squaredDistance(const VectorType& p) const {
     VectorType diff = p - origin();
     return (diff - direction().dot(diff) * direction()).squaredNorm();
   }
   /** \returns the distance of a point \a p to its projection onto the line \c *this.
-    * \sa squaredDistance()
-    */
-  RealScalar distance(const VectorType& p) const { using std::sqrt; return sqrt(squaredDistance(p)); }
+   * \sa squaredDistance()
+   */
+  EIGEN_DEVICE_FUNC RealScalar distance(const VectorType& p) const {
+    EIGEN_USING_STD(sqrt) return sqrt(squaredDistance(p));
+  }
 
   /** \returns the projection of a point \a p onto the line \c *this. */
-  VectorType projection(const VectorType& p) const
-  { return origin() + direction().dot(p-origin()) * direction(); }
+  EIGEN_DEVICE_FUNC VectorType projection(const VectorType& p) const {
+    return origin() + direction().dot(p - origin()) * direction();
+  }
+
+  EIGEN_DEVICE_FUNC VectorType pointAt(const Scalar& t) const;
 
-  VectorType pointAt(const Scalar& t) const;
-  
   template <int OtherOptions>
-  Scalar intersectionParameter(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
- 
+  EIGEN_DEVICE_FUNC Scalar
+  intersectionParameter(const Hyperplane<Scalar_, AmbientDim_, OtherOptions>& hyperplane) const;
+
   template <int OtherOptions>
-  Scalar intersection(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
-  
+  EIGEN_DEVICE_FUNC Scalar intersection(const Hyperplane<Scalar_, AmbientDim_, OtherOptions>& hyperplane) const;
+
   template <int OtherOptions>
-  VectorType intersectionPoint(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const;
+  EIGEN_DEVICE_FUNC VectorType
+  intersectionPoint(const Hyperplane<Scalar_, AmbientDim_, OtherOptions>& hyperplane) const;
+
+  /** Applies the transformation matrix \a mat to \c *this and returns a reference to \c *this.
+   *
+   * \param mat the Dim x Dim transformation matrix
+   * \param traits specifies whether the matrix \a mat represents an #Isometry
+   *               or a more generic #Affine transformation. The default is #Affine.
+   */
+  template <typename XprType>
+  EIGEN_DEVICE_FUNC inline ParametrizedLine& transform(const MatrixBase<XprType>& mat,
+                                                       TransformTraits traits = Affine) {
+    if (traits == Affine)
+      direction() = (mat * direction()).normalized();
+    else if (traits == Isometry)
+      direction() = mat * direction();
+    else {
+      eigen_assert(0 && "invalid traits value in ParametrizedLine::transform()");
+    }
+    origin() = mat * origin();
+    return *this;
+  }
+
+  /** Applies the transformation \a t to \c *this and returns a reference to \c *this.
+   *
+   * \param t the transformation of dimension Dim
+   * \param traits specifies whether the transformation \a t represents an #Isometry
+   *               or a more generic #Affine transformation. The default is #Affine.
+   *               Other kind of transformations are not supported.
+   */
+  template <int TrOptions>
+  EIGEN_DEVICE_FUNC inline ParametrizedLine& transform(
+      const Transform<Scalar, AmbientDimAtCompileTime, Affine, TrOptions>& t, TransformTraits traits = Affine) {
+    transform(t.linear(), traits);
+    origin() += t.translation();
+    return *this;
+  }
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<ParametrizedLine,
-           ParametrizedLine<NewScalarType,AmbientDimAtCompileTime,Options> >::type cast() const
-  {
-    return typename internal::cast_return_type<ParametrizedLine,
-                    ParametrizedLine<NewScalarType,AmbientDimAtCompileTime,Options> >::type(*this);
+   *
+   * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+   * then this function smartly returns a const reference to \c *this.
+   */
+  template <typename NewScalarType>
+  EIGEN_DEVICE_FUNC inline
+      typename internal::cast_return_type<ParametrizedLine,
+                                          ParametrizedLine<NewScalarType, AmbientDimAtCompileTime, Options> >::type
+      cast() const {
+    return typename internal::cast_return_type<
+        ParametrizedLine, ParametrizedLine<NewScalarType, AmbientDimAtCompileTime, Options> >::type(*this);
   }
 
   /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType,int OtherOptions>
-  inline explicit ParametrizedLine(const ParametrizedLine<OtherScalarType,AmbientDimAtCompileTime,OtherOptions>& other)
-  {
+  template <typename OtherScalarType, int OtherOptions>
+  EIGEN_DEVICE_FUNC inline explicit ParametrizedLine(
+      const ParametrizedLine<OtherScalarType, AmbientDimAtCompileTime, OtherOptions>& other) {
     m_origin = other.origin().template cast<Scalar>();
     m_direction = other.direction().template cast<Scalar>();
   }
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const ParametrizedLine& other, typename NumTraits<Scalar>::Real prec = NumTraits<Scalar>::dummy_precision()) const
-  { return m_origin.isApprox(other.m_origin, prec) && m_direction.isApprox(other.m_direction, prec); }
-
-protected:
+   * determined by \a prec.
+   *
+   * \sa MatrixBase::isApprox() */
+  EIGEN_DEVICE_FUNC bool isApprox(const ParametrizedLine& other, const typename NumTraits<Scalar>::Real& prec =
+                                                                     NumTraits<Scalar>::dummy_precision()) const {
+    return m_origin.isApprox(other.m_origin, prec) && m_direction.isApprox(other.m_direction, prec);
+  }
 
+ protected:
   VectorType m_origin, m_direction;
 };
 
 /** Constructs a parametrized line from a 2D hyperplane
-  *
-  * \warning the ambient space must have dimension 2 such that the hyperplane actually describes a line
-  */
-template <typename _Scalar, int _AmbientDim, int _Options>
+ *
+ * \warning the ambient space must have dimension 2 such that the hyperplane actually describes a line
+ */
+template <typename Scalar_, int AmbientDim_, int Options_>
 template <int OtherOptions>
-inline ParametrizedLine<_Scalar, _AmbientDim,_Options>::ParametrizedLine(const Hyperplane<_Scalar, _AmbientDim,OtherOptions>& hyperplane)
-{
+EIGEN_DEVICE_FUNC inline ParametrizedLine<Scalar_, AmbientDim_, Options_>::ParametrizedLine(
+    const Hyperplane<Scalar_, AmbientDim_, OtherOptions>& hyperplane) {
   EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(VectorType, 2)
   direction() = hyperplane.normal().unitOrthogonal();
-  origin() = -hyperplane.normal()*hyperplane.offset();
+  origin() = -hyperplane.normal() * hyperplane.offset();
 }
 
 /** \returns the point at \a t along this line
-  */
-template <typename _Scalar, int _AmbientDim, int _Options>
-inline typename ParametrizedLine<_Scalar, _AmbientDim,_Options>::VectorType
-ParametrizedLine<_Scalar, _AmbientDim,_Options>::pointAt(const _Scalar& t) const
-{
-  return origin() + (direction()*t); 
+ */
+template <typename Scalar_, int AmbientDim_, int Options_>
+EIGEN_DEVICE_FUNC inline typename ParametrizedLine<Scalar_, AmbientDim_, Options_>::VectorType
+ParametrizedLine<Scalar_, AmbientDim_, Options_>::pointAt(const Scalar_& t) const {
+  return origin() + (direction() * t);
 }
 
 /** \returns the parameter value of the intersection between \c *this and the given \a hyperplane
-  */
-template <typename _Scalar, int _AmbientDim, int _Options>
+ */
+template <typename Scalar_, int AmbientDim_, int Options_>
 template <int OtherOptions>
-inline _Scalar ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersectionParameter(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const
-{
-  return -(hyperplane.offset()+hyperplane.normal().dot(origin()))
-          / hyperplane.normal().dot(direction());
+EIGEN_DEVICE_FUNC inline Scalar_ ParametrizedLine<Scalar_, AmbientDim_, Options_>::intersectionParameter(
+    const Hyperplane<Scalar_, AmbientDim_, OtherOptions>& hyperplane) const {
+  return -(hyperplane.offset() + hyperplane.normal().dot(origin())) / hyperplane.normal().dot(direction());
 }
 
-
 /** \deprecated use intersectionParameter()
-  * \returns the parameter value of the intersection between \c *this and the given \a hyperplane
-  */
-template <typename _Scalar, int _AmbientDim, int _Options>
+ * \returns the parameter value of the intersection between \c *this and the given \a hyperplane
+ */
+template <typename Scalar_, int AmbientDim_, int Options_>
 template <int OtherOptions>
-inline _Scalar ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersection(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const
-{
+EIGEN_DEVICE_FUNC inline Scalar_ ParametrizedLine<Scalar_, AmbientDim_, Options_>::intersection(
+    const Hyperplane<Scalar_, AmbientDim_, OtherOptions>& hyperplane) const {
   return intersectionParameter(hyperplane);
 }
 
 /** \returns the point of the intersection between \c *this and the given hyperplane
-  */
-template <typename _Scalar, int _AmbientDim, int _Options>
+ */
+template <typename Scalar_, int AmbientDim_, int Options_>
 template <int OtherOptions>
-inline typename ParametrizedLine<_Scalar, _AmbientDim,_Options>::VectorType
-ParametrizedLine<_Scalar, _AmbientDim,_Options>::intersectionPoint(const Hyperplane<_Scalar, _AmbientDim, OtherOptions>& hyperplane) const
-{
+EIGEN_DEVICE_FUNC inline typename ParametrizedLine<Scalar_, AmbientDim_, Options_>::VectorType
+ParametrizedLine<Scalar_, AmbientDim_, Options_>::intersectionPoint(
+    const Hyperplane<Scalar_, AmbientDim_, OtherOptions>& hyperplane) const {
   return pointAt(intersectionParameter(hyperplane));
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_PARAMETRIZEDLINE_H
+#endif  // EIGEN_PARAMETRIZEDLINE_H
diff --git a/inst/include/Eigen/src/Geometry/Quaternion.h b/inst/include/Eigen/src/Geometry/Quaternion.h
index 25ed17bb..f2d2d051 100644
--- a/inst/include/Eigen/src/Geometry/Quaternion.h
+++ b/inst/include/Eigen/src/Geometry/Quaternion.h
@@ -10,226 +10,307 @@
 
 #ifndef EIGEN_QUATERNION_H
 #define EIGEN_QUATERNION_H
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
+namespace Eigen {
 
 /***************************************************************************
-* Definition of QuaternionBase<Derived>
-* The implementation is at the end of the file
-***************************************************************************/
+ * Definition of QuaternionBase<Derived>
+ * The implementation is at the end of the file
+ ***************************************************************************/
 
 namespace internal {
-template<typename Other,
-         int OtherRows=Other::RowsAtCompileTime,
-         int OtherCols=Other::ColsAtCompileTime>
+template <typename Other, int OtherRows = Other::RowsAtCompileTime, int OtherCols = Other::ColsAtCompileTime>
 struct quaternionbase_assign_impl;
 }
 
 /** \geometry_module \ingroup Geometry_Module
-  * \class QuaternionBase
-  * \brief Base class for quaternion expressions
-  * \tparam Derived derived type (CRTP)
-  * \sa class Quaternion
-  */
-template<class Derived>
-class QuaternionBase : public RotationBase<Derived, 3>
-{
+ * \class QuaternionBase
+ * \brief Base class for quaternion expressions
+ * \tparam Derived derived type (CRTP)
+ * \sa class Quaternion
+ */
+template <class Derived>
+class QuaternionBase : public RotationBase<Derived, 3> {
+ public:
   typedef RotationBase<Derived, 3> Base;
-public:
+
   using Base::operator*;
   using Base::derived;
 
   typedef typename internal::traits<Derived>::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
   typedef typename internal::traits<Derived>::Coefficients Coefficients;
-  enum {
-    Flags = Eigen::internal::traits<Derived>::Flags
-  };
+  typedef typename Coefficients::CoeffReturnType CoeffReturnType;
+  typedef std::conditional_t<bool(internal::traits<Derived>::Flags& LvalueBit), Scalar&, CoeffReturnType>
+      NonConstCoeffReturnType;
+
+  enum { Flags = Eigen::internal::traits<Derived>::Flags };
 
- // typedef typename Matrix<Scalar,4,1> Coefficients;
+  // typedef typename Matrix<Scalar,4,1> Coefficients;
   /** the type of a 3D vector */
-  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Matrix<Scalar, 3, 1> Vector3;
   /** the equivalent rotation matrix type */
-  typedef Matrix<Scalar,3,3> Matrix3;
+  typedef Matrix<Scalar, 3, 3> Matrix3;
   /** the equivalent angle-axis type */
   typedef AngleAxis<Scalar> AngleAxisType;
 
-
-
   /** \returns the \c x coefficient */
-  inline Scalar x() const { return this->derived().coeffs().coeff(0); }
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType x() const { return this->derived().coeffs().coeff(0); }
   /** \returns the \c y coefficient */
-  inline Scalar y() const { return this->derived().coeffs().coeff(1); }
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType y() const { return this->derived().coeffs().coeff(1); }
   /** \returns the \c z coefficient */
-  inline Scalar z() const { return this->derived().coeffs().coeff(2); }
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType z() const { return this->derived().coeffs().coeff(2); }
   /** \returns the \c w coefficient */
-  inline Scalar w() const { return this->derived().coeffs().coeff(3); }
+  EIGEN_DEVICE_FUNC constexpr CoeffReturnType w() const { return this->derived().coeffs().coeff(3); }
 
-  /** \returns a reference to the \c x coefficient */
-  inline Scalar& x() { return this->derived().coeffs().coeffRef(0); }
-  /** \returns a reference to the \c y coefficient */
-  inline Scalar& y() { return this->derived().coeffs().coeffRef(1); }
-  /** \returns a reference to the \c z coefficient */
-  inline Scalar& z() { return this->derived().coeffs().coeffRef(2); }
-  /** \returns a reference to the \c w coefficient */
-  inline Scalar& w() { return this->derived().coeffs().coeffRef(3); }
+  /** \returns a reference to the \c x coefficient (if Derived is a non-const lvalue) */
+  EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType x() { return this->derived().coeffs().x(); }
+  /** \returns a reference to the \c y coefficient (if Derived is a non-const lvalue) */
+  EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType y() { return this->derived().coeffs().y(); }
+  /** \returns a reference to the \c z coefficient (if Derived is a non-const lvalue) */
+  EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType z() { return this->derived().coeffs().z(); }
+  /** \returns a reference to the \c w coefficient (if Derived is a non-const lvalue) */
+  EIGEN_DEVICE_FUNC constexpr NonConstCoeffReturnType w() { return this->derived().coeffs().w(); }
 
   /** \returns a read-only vector expression of the imaginary part (x,y,z) */
-  inline const VectorBlock<const Coefficients,3> vec() const { return coeffs().template head<3>(); }
+  EIGEN_DEVICE_FUNC inline const VectorBlock<const Coefficients, 3> vec() const { return coeffs().template head<3>(); }
 
   /** \returns a vector expression of the imaginary part (x,y,z) */
-  inline VectorBlock<Coefficients,3> vec() { return coeffs().template head<3>(); }
+  EIGEN_DEVICE_FUNC inline VectorBlock<Coefficients, 3> vec() { return coeffs().template head<3>(); }
 
   /** \returns a read-only vector expression of the coefficients (x,y,z,w) */
-  inline const typename internal::traits<Derived>::Coefficients& coeffs() const { return derived().coeffs(); }
+  EIGEN_DEVICE_FUNC inline const typename internal::traits<Derived>::Coefficients& coeffs() const {
+    return derived().coeffs();
+  }
+
+  /** \returns a vector containing the coefficients, rearranged into the order [\c w, \c x, \c y, \c z].
+   *
+   * This is the order expected by the \code Quaternion(const Scalar& w, const Scalar& x, const Scalar& y, const Scalar&
+   * z) \endcode constructor, but not the order of the internal vector representation. Therefore, it returns a newly
+   * constructed vector.
+   *
+   * \sa QuaternionBase::coeffsScalarLast()
+   * */
+  EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Coefficients coeffsScalarFirst() const {
+    return derived().coeffsScalarFirst();
+  }
+
+  /** \returns a vector containing the coefficients in their original order [\c x, \c y, \c z, \c w].
+   *
+   * This is equivalent to \code coeffs() \endcode, but returns a newly constructed vector for uniformity with \code
+   * coeffsScalarFirst() \endcode.
+   *
+   * \sa QuaternionBase::coeffsScalarFirst()
+   * */
+  EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Coefficients coeffsScalarLast() const {
+    return derived().coeffsScalarLast();
+  }
 
   /** \returns a vector expression of the coefficients (x,y,z,w) */
-  inline typename internal::traits<Derived>::Coefficients& coeffs() { return derived().coeffs(); }
+  EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Coefficients& coeffs() { return derived().coeffs(); }
 
-  EIGEN_STRONG_INLINE QuaternionBase<Derived>& operator=(const QuaternionBase<Derived>& other);
-  template<class OtherDerived> EIGEN_STRONG_INLINE Derived& operator=(const QuaternionBase<OtherDerived>& other);
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE QuaternionBase<Derived>& operator=(const QuaternionBase<Derived>& other);
+  template <class OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const QuaternionBase<OtherDerived>& other);
 
-// disabled this copy operator as it is giving very strange compilation errors when compiling
-// test_stdvector with GCC 4.4.2. This looks like a GCC bug though, so feel free to re-enable it if it's
-// useful; however notice that we already have the templated operator= above and e.g. in MatrixBase
-// we didn't have to add, in addition to templated operator=, such a non-templated copy operator.
-//  Derived& operator=(const QuaternionBase& other)
-//  { return operator=<Derived>(other); }
+  // disabled this copy operator as it is giving very strange compilation errors when compiling
+  // test_stdvector with GCC 4.4.2. This looks like a GCC bug though, so feel free to re-enable it if it's
+  // useful; however notice that we already have the templated operator= above and e.g. in MatrixBase
+  // we didn't have to add, in addition to templated operator=, such a non-templated copy operator.
+  //  Derived& operator=(const QuaternionBase& other)
+  //  { return operator=<Derived>(other); }
 
-  Derived& operator=(const AngleAxisType& aa);
-  template<class OtherDerived> Derived& operator=(const MatrixBase<OtherDerived>& m);
+  EIGEN_DEVICE_FUNC Derived& operator=(const AngleAxisType& aa);
+  template <class OtherDerived>
+  EIGEN_DEVICE_FUNC Derived& operator=(const MatrixBase<OtherDerived>& m);
 
   /** \returns a quaternion representing an identity rotation
-    * \sa MatrixBase::Identity()
-    */
-  static inline Quaternion<Scalar> Identity() { return Quaternion<Scalar>(Scalar(1), Scalar(0), Scalar(0), Scalar(0)); }
+   * \sa MatrixBase::Identity()
+   */
+  EIGEN_DEVICE_FUNC static inline Quaternion<Scalar> Identity() {
+    return Quaternion<Scalar>(Scalar(1), Scalar(0), Scalar(0), Scalar(0));
+  }
 
   /** \sa QuaternionBase::Identity(), MatrixBase::setIdentity()
-    */
-  inline QuaternionBase& setIdentity() { coeffs() << Scalar(0), Scalar(0), Scalar(0), Scalar(1); return *this; }
+   */
+  EIGEN_DEVICE_FUNC inline QuaternionBase& setIdentity() {
+    coeffs() << Scalar(0), Scalar(0), Scalar(0), Scalar(1);
+    return *this;
+  }
 
   /** \returns the squared norm of the quaternion's coefficients
-    * \sa QuaternionBase::norm(), MatrixBase::squaredNorm()
-    */
-  inline Scalar squaredNorm() const { return coeffs().squaredNorm(); }
+   * \sa QuaternionBase::norm(), MatrixBase::squaredNorm()
+   */
+  EIGEN_DEVICE_FUNC inline Scalar squaredNorm() const { return coeffs().squaredNorm(); }
 
   /** \returns the norm of the quaternion's coefficients
-    * \sa QuaternionBase::squaredNorm(), MatrixBase::norm()
-    */
-  inline Scalar norm() const { return coeffs().norm(); }
+   * \sa QuaternionBase::squaredNorm(), MatrixBase::norm()
+   */
+  EIGEN_DEVICE_FUNC inline Scalar norm() const { return coeffs().norm(); }
 
   /** Normalizes the quaternion \c *this
-    * \sa normalized(), MatrixBase::normalize() */
-  inline void normalize() { coeffs().normalize(); }
+   * \sa normalized(), MatrixBase::normalize() */
+  EIGEN_DEVICE_FUNC inline void normalize() { coeffs().normalize(); }
   /** \returns a normalized copy of \c *this
-    * \sa normalize(), MatrixBase::normalized() */
-  inline Quaternion<Scalar> normalized() const { return Quaternion<Scalar>(coeffs().normalized()); }
-
-    /** \returns the dot product of \c *this and \a other
-    * Geometrically speaking, the dot product of two unit quaternions
-    * corresponds to the cosine of half the angle between the two rotations.
-    * \sa angularDistance()
-    */
-  template<class OtherDerived> inline Scalar dot(const QuaternionBase<OtherDerived>& other) const { return coeffs().dot(other.coeffs()); }
+   * \sa normalize(), MatrixBase::normalized() */
+  EIGEN_DEVICE_FUNC inline Quaternion<Scalar> normalized() const { return Quaternion<Scalar>(coeffs().normalized()); }
+
+  /** \returns the dot product of \c *this and \a other
+   * Geometrically speaking, the dot product of two unit quaternions
+   * corresponds to the cosine of half the angle between the two rotations.
+   * \sa angularDistance()
+   */
+  template <class OtherDerived>
+  EIGEN_DEVICE_FUNC inline Scalar dot(const QuaternionBase<OtherDerived>& other) const {
+    return coeffs().dot(other.coeffs());
+  }
 
-  template<class OtherDerived> Scalar angularDistance(const QuaternionBase<OtherDerived>& other) const;
+  template <class OtherDerived>
+  EIGEN_DEVICE_FUNC Scalar angularDistance(const QuaternionBase<OtherDerived>& other) const;
 
   /** \returns an equivalent 3x3 rotation matrix */
-  Matrix3 toRotationMatrix() const;
+  EIGEN_DEVICE_FUNC inline Matrix3 toRotationMatrix() const;
 
   /** \returns the quaternion which transform \a a into \a b through a rotation */
-  template<typename Derived1, typename Derived2>
-  Derived& setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
+  template <typename Derived1, typename Derived2>
+  EIGEN_DEVICE_FUNC Derived& setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
 
-  template<class OtherDerived> EIGEN_STRONG_INLINE Quaternion<Scalar> operator* (const QuaternionBase<OtherDerived>& q) const;
-  template<class OtherDerived> EIGEN_STRONG_INLINE Derived& operator*= (const QuaternionBase<OtherDerived>& q);
+  template <class OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Quaternion<Scalar> operator*(const QuaternionBase<OtherDerived>& q) const;
+  template <class OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator*=(const QuaternionBase<OtherDerived>& q);
 
   /** \returns the quaternion describing the inverse rotation */
-  Quaternion<Scalar> inverse() const;
+  EIGEN_DEVICE_FUNC Quaternion<Scalar> inverse() const;
 
   /** \returns the conjugated quaternion */
-  Quaternion<Scalar> conjugate() const;
+  EIGEN_DEVICE_FUNC Quaternion<Scalar> conjugate() const;
+
+  template <class OtherDerived>
+  EIGEN_DEVICE_FUNC Quaternion<Scalar> slerp(const Scalar& t, const QuaternionBase<OtherDerived>& other) const;
+
+  /** \returns true if each coefficients of \c *this and \a other are all exactly equal.
+   * \warning When using floating point scalar values you probably should rather use a
+   *          fuzzy comparison such as isApprox()
+   * \sa isApprox(), operator!= */
+  template <class OtherDerived>
+  EIGEN_DEVICE_FUNC inline bool operator==(const QuaternionBase<OtherDerived>& other) const {
+    return coeffs() == other.coeffs();
+  }
 
-  template<class OtherDerived> Quaternion<Scalar> slerp(const Scalar& t, const QuaternionBase<OtherDerived>& other) const;
+  /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other.
+   * \warning When using floating point scalar values you probably should rather use a
+   *          fuzzy comparison such as isApprox()
+   * \sa isApprox(), operator== */
+  template <class OtherDerived>
+  EIGEN_DEVICE_FUNC inline bool operator!=(const QuaternionBase<OtherDerived>& other) const {
+    return coeffs() != other.coeffs();
+  }
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  template<class OtherDerived>
-  bool isApprox(const QuaternionBase<OtherDerived>& other, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
-  { return coeffs().isApprox(other.coeffs(), prec); }
+   * determined by \a prec.
+   *
+   * \sa MatrixBase::isApprox() */
+  template <class OtherDerived>
+  EIGEN_DEVICE_FUNC bool isApprox(const QuaternionBase<OtherDerived>& other,
+                                  const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const {
+    return coeffs().isApprox(other.coeffs(), prec);
+  }
 
-	/** return the result vector of \a v through the rotation*/
-  EIGEN_STRONG_INLINE Vector3 _transformVector(const Vector3& v) const;
+  /** return the result vector of \a v through the rotation*/
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Vector3 _transformVector(const Vector3& v) const;
 
+#ifdef EIGEN_PARSED_BY_DOXYGEN
   /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Derived,Quaternion<NewScalarType> >::type cast() const
-  {
-    return typename internal::cast_return_type<Derived,Quaternion<NewScalarType> >::type(derived());
+   *
+   * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+   * then this function smartly returns a const reference to \c *this.
+   */
+  template <typename NewScalarType>
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<Derived, Quaternion<NewScalarType> >::type cast() const;
+
+#else
+
+  template <typename NewScalarType>
+  EIGEN_DEVICE_FUNC inline std::enable_if_t<internal::is_same<Scalar, NewScalarType>::value, const Derived&> cast()
+      const {
+    return derived();
   }
 
+  template <typename NewScalarType>
+  EIGEN_DEVICE_FUNC inline std::enable_if_t<!internal::is_same<Scalar, NewScalarType>::value,
+                                            Quaternion<NewScalarType> >
+  cast() const {
+    return Quaternion<NewScalarType>(coeffs().template cast<NewScalarType>());
+  }
+#endif
+
+#ifndef EIGEN_NO_IO
+  friend std::ostream& operator<<(std::ostream& s, const QuaternionBase<Derived>& q) {
+    s << q.x() << "i + " << q.y() << "j + " << q.z() << "k"
+      << " + " << q.w();
+    return s;
+  }
+#endif
+
 #ifdef EIGEN_QUATERNIONBASE_PLUGIN
-# include EIGEN_QUATERNIONBASE_PLUGIN
+#include EIGEN_QUATERNIONBASE_PLUGIN
 #endif
+ protected:
+  EIGEN_DEFAULT_COPY_CONSTRUCTOR(QuaternionBase)
+  EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(QuaternionBase)
 };
 
 /***************************************************************************
-* Definition/implementation of Quaternion<Scalar>
-***************************************************************************/
+ * Definition/implementation of Quaternion<Scalar>
+ ***************************************************************************/
 
 /** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Quaternion
-  *
-  * \brief The quaternion class used to represent 3D orientations and rotations
-  *
-  * \tparam _Scalar the scalar type, i.e., the type of the coefficients
-  * \tparam _Options controls the memory alignment of the coefficients. Can be \# AutoAlign or \# DontAlign. Default is AutoAlign.
-  *
-  * This class represents a quaternion \f$ w+xi+yj+zk \f$ that is a convenient representation of
-  * orientations and rotations of objects in three dimensions. Compared to other representations
-  * like Euler angles or 3x3 matrices, quaternions offer the following advantages:
-  * \li \b compact storage (4 scalars)
-  * \li \b efficient to compose (28 flops),
-  * \li \b stable spherical interpolation
-  *
-  * The following two typedefs are provided for convenience:
-  * \li \c Quaternionf for \c float
-  * \li \c Quaterniond for \c double
-  *
-  * \warning Operations interpreting the quaternion as rotation have undefined behavior if the quaternion is not normalized.
-  *
-  * \sa  class AngleAxis, class Transform
-  */
+ *
+ * \class Quaternion
+ *
+ * \brief The quaternion class used to represent 3D orientations and rotations
+ *
+ * \tparam Scalar_ the scalar type, i.e., the type of the coefficients
+ * \tparam Options_ controls the memory alignment of the coefficients. Can be \# AutoAlign or \# DontAlign. Default is
+ * AutoAlign.
+ *
+ * This class represents a quaternion \f$ w+xi+yj+zk \f$ that is a convenient representation of
+ * orientations and rotations of objects in three dimensions. Compared to other representations
+ * like Euler angles or 3x3 matrices, quaternions offer the following advantages:
+ * \li \b compact storage (4 scalars)
+ * \li \b efficient to compose (28 flops),
+ * \li \b stable spherical interpolation
+ *
+ * The following two typedefs are provided for convenience:
+ * \li \c Quaternionf for \c float
+ * \li \c Quaterniond for \c double
+ *
+ * \warning Operations interpreting the quaternion as rotation have undefined behavior if the quaternion is not
+ * normalized.
+ *
+ * \sa  class AngleAxis, class Transform
+ */
 
 namespace internal {
-template<typename _Scalar,int _Options>
-struct traits<Quaternion<_Scalar,_Options> >
-{
-  typedef Quaternion<_Scalar,_Options> PlainObject;
-  typedef _Scalar Scalar;
-  typedef Matrix<_Scalar,4,1,_Options> Coefficients;
-  enum{
-    IsAligned = internal::traits<Coefficients>::Flags & AlignedBit,
-    Flags = IsAligned ? (AlignedBit | LvalueBit) : LvalueBit
-  };
+template <typename Scalar_, int Options_>
+struct traits<Quaternion<Scalar_, Options_> > {
+  typedef Quaternion<Scalar_, Options_> PlainObject;
+  typedef Scalar_ Scalar;
+  typedef Matrix<Scalar_, 4, 1, Options_> Coefficients;
+  enum { Alignment = internal::traits<Coefficients>::Alignment, Flags = LvalueBit };
 };
-}
+}  // namespace internal
 
-template<typename _Scalar, int _Options>
-class Quaternion : public QuaternionBase<Quaternion<_Scalar,_Options> >
-{
-  typedef QuaternionBase<Quaternion<_Scalar,_Options> > Base;
-  enum { IsAligned = internal::traits<Quaternion>::IsAligned };
+template <typename Scalar_, int Options_>
+class Quaternion : public QuaternionBase<Quaternion<Scalar_, Options_> > {
+ public:
+  typedef QuaternionBase<Quaternion<Scalar_, Options_> > Base;
+  enum { NeedsAlignment = internal::traits<Quaternion>::Alignment > 0 };
 
-public:
-  typedef _Scalar Scalar;
+  typedef Scalar_ Scalar;
 
   EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Quaternion)
   using Base::operator*=;
@@ -238,340 +319,379 @@ class Quaternion : public QuaternionBase<Quaternion<_Scalar,_Options> >
   typedef typename Base::AngleAxisType AngleAxisType;
 
   /** Default constructor leaving the quaternion uninitialized. */
-  inline Quaternion() {}
+  EIGEN_DEVICE_FUNC inline Quaternion() {}
 
   /** Constructs and initializes the quaternion \f$ w+xi+yj+zk \f$ from
-    * its four coefficients \a w, \a x, \a y and \a z.
-    *
-    * \warning Note the order of the arguments: the real \a w coefficient first,
-    * while internally the coefficients are stored in the following order:
-    * [\c x, \c y, \c z, \c w]
-    */
-  inline Quaternion(const Scalar& w, const Scalar& x, const Scalar& y, const Scalar& z) : m_coeffs(x, y, z, w){}
+   * its four coefficients \a w, \a x, \a y and \a z.
+   *
+   * \warning Note the order of the arguments: the real \a w coefficient first,
+   * while internally the coefficients are stored in the following order:
+   * [\c x, \c y, \c z, \c w]
+   */
+  EIGEN_DEVICE_FUNC inline Quaternion(const Scalar& w, const Scalar& x, const Scalar& y, const Scalar& z)
+      : m_coeffs(x, y, z, w) {}
+
+  /** Constructs and initializes a quaternion from its real part as a scalar,
+   *  and its imaginary part as a 3-vector [\c x, \c y, \c z]
+   */
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline Quaternion(const Scalar& w, const Eigen::MatrixBase<Derived>& vec)
+      : m_coeffs(vec.x(), vec.y(), vec.z(), w) {
+    EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(Derived, 3);
+  }
 
   /** Constructs and initialize a quaternion from the array data */
-  inline Quaternion(const Scalar* data) : m_coeffs(data) {}
+  EIGEN_DEVICE_FUNC explicit inline Quaternion(const Scalar* data) : m_coeffs(data) {}
 
   /** Copy constructor */
-  template<class Derived> EIGEN_STRONG_INLINE Quaternion(const QuaternionBase<Derived>& other) { this->Base::operator=(other); }
+  template <class Derived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Quaternion(const QuaternionBase<Derived>& other) {
+    this->Base::operator=(other);
+  }
 
   /** Constructs and initializes a quaternion from the angle-axis \a aa */
-  explicit inline Quaternion(const AngleAxisType& aa) { *this = aa; }
+  EIGEN_DEVICE_FUNC explicit inline Quaternion(const AngleAxisType& aa) { *this = aa; }
 
   /** Constructs and initializes a quaternion from either:
-    *  - a rotation matrix expression,
-    *  - a 4D vector expression representing quaternion coefficients.
-    */
-  template<typename Derived>
-  explicit inline Quaternion(const MatrixBase<Derived>& other) { *this = other; }
+   *  - a rotation matrix expression,
+   *  - a 4D vector expression representing quaternion coefficients in the order [\c x, \c y, \c z, \c w].
+   */
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC explicit inline Quaternion(const MatrixBase<Derived>& other) {
+    *this = other;
+  }
 
   /** Explicit copy constructor with scalar conversion */
-  template<typename OtherScalar, int OtherOptions>
-  explicit inline Quaternion(const Quaternion<OtherScalar, OtherOptions>& other)
-  { m_coeffs = other.coeffs().template cast<Scalar>(); }
+  template <typename OtherScalar, int OtherOptions>
+  EIGEN_DEVICE_FUNC explicit inline Quaternion(const Quaternion<OtherScalar, OtherOptions>& other) {
+    m_coeffs = other.coeffs().template cast<Scalar>();
+  }
+
+  // We define a copy constructor, which means we don't get an implicit move constructor or assignment operator.
+  /** Default move constructor */
+  EIGEN_DEVICE_FUNC inline Quaternion(Quaternion&& other) noexcept(std::is_nothrow_move_constructible<Scalar>::value)
+      : m_coeffs(std::move(other.coeffs())) {}
+
+  /** Default move assignment operator */
+  EIGEN_DEVICE_FUNC Quaternion& operator=(Quaternion&& other) noexcept(std::is_nothrow_move_assignable<Scalar>::value) {
+    m_coeffs = std::move(other.coeffs());
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC static Quaternion UnitRandom();
+
+  EIGEN_DEVICE_FUNC static Quaternion FromCoeffsScalarLast(const Scalar& x, const Scalar& y, const Scalar& z,
+                                                           const Scalar& w);
+
+  EIGEN_DEVICE_FUNC static Quaternion FromCoeffsScalarFirst(const Scalar& w, const Scalar& x, const Scalar& y,
+                                                            const Scalar& z);
+
+  template <typename Derived1, typename Derived2>
+  EIGEN_DEVICE_FUNC static Quaternion FromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
 
-  template<typename Derived1, typename Derived2>
-  static Quaternion FromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b);
+  EIGEN_DEVICE_FUNC inline Coefficients& coeffs() { return m_coeffs; }
+  EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs; }
 
-  inline Coefficients& coeffs() { return m_coeffs;}
-  inline const Coefficients& coeffs() const { return m_coeffs;}
+  EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarLast() const { return m_coeffs; }
 
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(IsAligned)
+  EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarFirst() const {
+    return {m_coeffs.w(), m_coeffs.x(), m_coeffs.y(), m_coeffs.z()};
+  }
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(bool(NeedsAlignment))
+
+#ifdef EIGEN_QUATERNION_PLUGIN
+#include EIGEN_QUATERNION_PLUGIN
+#endif
 
-protected:
+ protected:
   Coefficients m_coeffs;
-  
+
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-    static EIGEN_STRONG_INLINE void _check_template_params()
-    {
-      EIGEN_STATIC_ASSERT( (_Options & DontAlign) == _Options,
-        INVALID_MATRIX_TEMPLATE_PARAMETERS)
-    }
+  EIGEN_STATIC_ASSERT((Options_ & DontAlign) == Options_, INVALID_MATRIX_TEMPLATE_PARAMETERS)
 #endif
 };
 
 /** \ingroup Geometry_Module
-  * single precision quaternion type */
+ * single precision quaternion type */
 typedef Quaternion<float> Quaternionf;
 /** \ingroup Geometry_Module
-  * double precision quaternion type */
+ * double precision quaternion type */
 typedef Quaternion<double> Quaterniond;
 
 /***************************************************************************
-* Specialization of Map<Quaternion<Scalar>>
-***************************************************************************/
+ * Specialization of Map<Quaternion<Scalar>>
+ ***************************************************************************/
 
 namespace internal {
-  template<typename _Scalar, int _Options>
-  struct traits<Map<Quaternion<_Scalar>, _Options> > : traits<Quaternion<_Scalar, (int(_Options)&Aligned)==Aligned ? AutoAlign : DontAlign> >
-  {
-    typedef Map<Matrix<_Scalar,4,1>, _Options> Coefficients;
-  };
-}
+template <typename Scalar_, int Options_>
+struct traits<Map<Quaternion<Scalar_>, Options_> >
+    : traits<Quaternion<Scalar_, (int(Options_) & Aligned) == Aligned ? AutoAlign : DontAlign> > {
+  typedef Map<Matrix<Scalar_, 4, 1>, Options_> Coefficients;
+};
+}  // namespace internal
 
 namespace internal {
-  template<typename _Scalar, int _Options>
-  struct traits<Map<const Quaternion<_Scalar>, _Options> > : traits<Quaternion<_Scalar, (int(_Options)&Aligned)==Aligned ? AutoAlign : DontAlign> >
-  {
-    typedef Map<const Matrix<_Scalar,4,1>, _Options> Coefficients;
-    typedef traits<Quaternion<_Scalar, (int(_Options)&Aligned)==Aligned ? AutoAlign : DontAlign> > TraitsBase;
-    enum {
-      Flags = TraitsBase::Flags & ~LvalueBit
-    };
-  };
-}
+template <typename Scalar_, int Options_>
+struct traits<Map<const Quaternion<Scalar_>, Options_> >
+    : traits<Quaternion<Scalar_, (int(Options_) & Aligned) == Aligned ? AutoAlign : DontAlign> > {
+  typedef Map<const Matrix<Scalar_, 4, 1>, Options_> Coefficients;
+  typedef traits<Quaternion<Scalar_, (int(Options_) & Aligned) == Aligned ? AutoAlign : DontAlign> > TraitsBase;
+  enum { Flags = TraitsBase::Flags & ~LvalueBit };
+};
+}  // namespace internal
 
 /** \ingroup Geometry_Module
-  * \brief Quaternion expression mapping a constant memory buffer
-  *
-  * \tparam _Scalar the type of the Quaternion coefficients
-  * \tparam _Options see class Map
-  *
-  * This is a specialization of class Map for Quaternion. This class allows to view
-  * a 4 scalar memory buffer as an Eigen's Quaternion object.
-  *
-  * \sa class Map, class Quaternion, class QuaternionBase
-  */
-template<typename _Scalar, int _Options>
-class Map<const Quaternion<_Scalar>, _Options >
-  : public QuaternionBase<Map<const Quaternion<_Scalar>, _Options> >
-{
-    typedef QuaternionBase<Map<const Quaternion<_Scalar>, _Options> > Base;
-
-  public:
-    typedef _Scalar Scalar;
-    typedef typename internal::traits<Map>::Coefficients Coefficients;
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
-    using Base::operator*=;
-
-    /** Constructs a Mapped Quaternion object from the pointer \a coeffs
-      *
-      * The pointer \a coeffs must reference the four coefficients of Quaternion in the following order:
-      * \code *coeffs == {x, y, z, w} \endcode
-      *
-      * If the template parameter _Options is set to #Aligned, then the pointer coeffs must be aligned. */
-    EIGEN_STRONG_INLINE Map(const Scalar* coeffs) : m_coeffs(coeffs) {}
-
-    inline const Coefficients& coeffs() const { return m_coeffs;}
-
-  protected:
-    const Coefficients m_coeffs;
+ * \brief Quaternion expression mapping a constant memory buffer
+ *
+ * \tparam Scalar_ the type of the Quaternion coefficients
+ * \tparam Options_ see class Map
+ *
+ * This is a specialization of class Map for Quaternion. This class allows to view
+ * a 4 scalar memory buffer as an Eigen's Quaternion object.
+ *
+ * \sa class Map, class Quaternion, class QuaternionBase
+ */
+template <typename Scalar_, int Options_>
+class Map<const Quaternion<Scalar_>, Options_> : public QuaternionBase<Map<const Quaternion<Scalar_>, Options_> > {
+ public:
+  typedef QuaternionBase<Map<const Quaternion<Scalar_>, Options_> > Base;
+
+  typedef Scalar_ Scalar;
+  typedef typename internal::traits<Map>::Coefficients Coefficients;
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
+  using Base::operator*=;
+
+  /** Constructs a Mapped Quaternion object from the pointer \a coeffs
+   *
+   * The pointer \a coeffs must reference the four coefficients of Quaternion in the following order:
+   * \code *coeffs == {x, y, z, w} \endcode
+   *
+   * If the template parameter Options_ is set to #Aligned, then the pointer coeffs must be aligned. */
+  EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE Map(const Scalar* coeffs) : m_coeffs(coeffs) {}
+
+  EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs; }
+
+  EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarLast() const { return m_coeffs; }
+
+  EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarFirst() const {
+    return {m_coeffs.w(), m_coeffs.x(), m_coeffs.y(), m_coeffs.z()};
+  }
+
+ protected:
+  const Coefficients m_coeffs;
 };
 
 /** \ingroup Geometry_Module
-  * \brief Expression of a quaternion from a memory buffer
-  *
-  * \tparam _Scalar the type of the Quaternion coefficients
-  * \tparam _Options see class Map
-  *
-  * This is a specialization of class Map for Quaternion. This class allows to view
-  * a 4 scalar memory buffer as an Eigen's  Quaternion object.
-  *
-  * \sa class Map, class Quaternion, class QuaternionBase
-  */
-template<typename _Scalar, int _Options>
-class Map<Quaternion<_Scalar>, _Options >
-  : public QuaternionBase<Map<Quaternion<_Scalar>, _Options> >
-{
-    typedef QuaternionBase<Map<Quaternion<_Scalar>, _Options> > Base;
-
-  public:
-    typedef _Scalar Scalar;
-    typedef typename internal::traits<Map>::Coefficients Coefficients;
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
-    using Base::operator*=;
-
-    /** Constructs a Mapped Quaternion object from the pointer \a coeffs
-      *
-      * The pointer \a coeffs must reference the four coefficients of Quaternion in the following order:
-      * \code *coeffs == {x, y, z, w} \endcode
-      *
-      * If the template parameter _Options is set to #Aligned, then the pointer coeffs must be aligned. */
-    EIGEN_STRONG_INLINE Map(Scalar* coeffs) : m_coeffs(coeffs) {}
-
-    inline Coefficients& coeffs() { return m_coeffs; }
-    inline const Coefficients& coeffs() const { return m_coeffs; }
-
-  protected:
-    Coefficients m_coeffs;
+ * \brief Expression of a quaternion from a memory buffer
+ *
+ * \tparam Scalar_ the type of the Quaternion coefficients
+ * \tparam Options_ see class Map
+ *
+ * This is a specialization of class Map for Quaternion. This class allows to view
+ * a 4 scalar memory buffer as an Eigen's  Quaternion object.
+ *
+ * \sa class Map, class Quaternion, class QuaternionBase
+ */
+template <typename Scalar_, int Options_>
+class Map<Quaternion<Scalar_>, Options_> : public QuaternionBase<Map<Quaternion<Scalar_>, Options_> > {
+ public:
+  typedef QuaternionBase<Map<Quaternion<Scalar_>, Options_> > Base;
+
+  typedef Scalar_ Scalar;
+  typedef typename internal::traits<Map>::Coefficients Coefficients;
+  EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Map)
+  using Base::operator*=;
+
+  /** Constructs a Mapped Quaternion object from the pointer \a coeffs
+   *
+   * The pointer \a coeffs must reference the four coefficients of Quaternion in the following order:
+   * \code *coeffs == {x, y, z, w} \endcode
+   *
+   * If the template parameter Options_ is set to #Aligned, then the pointer coeffs must be aligned. */
+  EIGEN_DEVICE_FUNC explicit EIGEN_STRONG_INLINE Map(Scalar* coeffs) : m_coeffs(coeffs) {}
+
+  EIGEN_DEVICE_FUNC inline Coefficients& coeffs() { return m_coeffs; }
+  EIGEN_DEVICE_FUNC inline const Coefficients& coeffs() const { return m_coeffs; }
+
+  EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarLast() const { return m_coeffs; }
+
+  EIGEN_DEVICE_FUNC inline Coefficients coeffsScalarFirst() const {
+    return {m_coeffs.w(), m_coeffs.x(), m_coeffs.y(), m_coeffs.z()};
+  }
+
+ protected:
+  Coefficients m_coeffs;
 };
 
 /** \ingroup Geometry_Module
-  * Map an unaligned array of single precision scalars as a quaternion */
-typedef Map<Quaternion<float>, 0>         QuaternionMapf;
+ * Map an unaligned array of single precision scalars as a quaternion */
+typedef Map<Quaternion<float>, 0> QuaternionMapf;
 /** \ingroup Geometry_Module
-  * Map an unaligned array of double precision scalars as a quaternion */
-typedef Map<Quaternion<double>, 0>        QuaternionMapd;
+ * Map an unaligned array of double precision scalars as a quaternion */
+typedef Map<Quaternion<double>, 0> QuaternionMapd;
 /** \ingroup Geometry_Module
-  * Map a 16-byte aligned array of single precision scalars as a quaternion */
-typedef Map<Quaternion<float>, Aligned>   QuaternionMapAlignedf;
+ * Map a 16-byte aligned array of single precision scalars as a quaternion */
+typedef Map<Quaternion<float>, Aligned> QuaternionMapAlignedf;
 /** \ingroup Geometry_Module
-  * Map a 16-byte aligned array of double precision scalars as a quaternion */
-typedef Map<Quaternion<double>, Aligned>  QuaternionMapAlignedd;
+ * Map a 16-byte aligned array of double precision scalars as a quaternion */
+typedef Map<Quaternion<double>, Aligned> QuaternionMapAlignedd;
 
 /***************************************************************************
-* Implementation of QuaternionBase methods
-***************************************************************************/
+ * Implementation of QuaternionBase methods
+ ***************************************************************************/
 
 // Generic Quaternion * Quaternion product
 // This product can be specialized for a given architecture via the Arch template argument.
 namespace internal {
-template<int Arch, class Derived1, class Derived2, typename Scalar, int _Options> struct quat_product
-{
-  static EIGEN_STRONG_INLINE Quaternion<Scalar> run(const QuaternionBase<Derived1>& a, const QuaternionBase<Derived2>& b){
-    return Quaternion<Scalar>
-    (
-      a.w() * b.w() - a.x() * b.x() - a.y() * b.y() - a.z() * b.z(),
-      a.w() * b.x() + a.x() * b.w() + a.y() * b.z() - a.z() * b.y(),
-      a.w() * b.y() + a.y() * b.w() + a.z() * b.x() - a.x() * b.z(),
-      a.w() * b.z() + a.z() * b.w() + a.x() * b.y() - a.y() * b.x()
-    );
+template <int Arch, class Derived1, class Derived2, typename Scalar>
+struct quat_product {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Quaternion<Scalar> run(const QuaternionBase<Derived1>& a,
+                                                                      const QuaternionBase<Derived2>& b) {
+    return Quaternion<Scalar>(a.w() * b.w() - a.x() * b.x() - a.y() * b.y() - a.z() * b.z(),
+                              a.w() * b.x() + a.x() * b.w() + a.y() * b.z() - a.z() * b.y(),
+                              a.w() * b.y() + a.y() * b.w() + a.z() * b.x() - a.x() * b.z(),
+                              a.w() * b.z() + a.z() * b.w() + a.x() * b.y() - a.y() * b.x());
   }
 };
-}
+}  // namespace internal
 
 /** \returns the concatenation of two rotations as a quaternion-quaternion product */
 template <class Derived>
 template <class OtherDerived>
-EIGEN_STRONG_INLINE Quaternion<typename internal::traits<Derived>::Scalar>
-QuaternionBase<Derived>::operator* (const QuaternionBase<OtherDerived>& other) const
-{
-  EIGEN_STATIC_ASSERT((internal::is_same<typename Derived::Scalar, typename OtherDerived::Scalar>::value),
-   YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Quaternion<typename internal::traits<Derived>::Scalar>
+QuaternionBase<Derived>::operator*(const QuaternionBase<OtherDerived>& other) const {
+  EIGEN_STATIC_ASSERT(
+      (internal::is_same<typename Derived::Scalar, typename OtherDerived::Scalar>::value),
+      YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
   return internal::quat_product<Architecture::Target, Derived, OtherDerived,
-                         typename internal::traits<Derived>::Scalar,
-                         internal::traits<Derived>::IsAligned && internal::traits<OtherDerived>::IsAligned>::run(*this, other);
+                                typename internal::traits<Derived>::Scalar>::run(*this, other);
 }
 
 /** \sa operator*(Quaternion) */
 template <class Derived>
 template <class OtherDerived>
-EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator*= (const QuaternionBase<OtherDerived>& other)
-{
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator*=(
+    const QuaternionBase<OtherDerived>& other) {
   derived() = derived() * other.derived();
   return derived();
 }
 
 /** Rotation of a vector by a quaternion.
-  * \remarks If the quaternion is used to rotate several points (>1)
-  * then it is much more efficient to first convert it to a 3x3 Matrix.
-  * Comparison of the operation cost for n transformations:
-  *   - Quaternion2:    30n
-  *   - Via a Matrix3: 24 + 15n
-  */
+ * \remarks If the quaternion is used to rotate several points (>1)
+ * then it is much more efficient to first convert it to a 3x3 Matrix.
+ * Comparison of the operation cost for n transformations:
+ *   - Quaternion2:    30n
+ *   - Via a Matrix3: 24 + 15n
+ */
 template <class Derived>
-EIGEN_STRONG_INLINE typename QuaternionBase<Derived>::Vector3
-QuaternionBase<Derived>::_transformVector(const Vector3& v) const
-{
-    // Note that this algorithm comes from the optimization by hand
-    // of the conversion to a Matrix followed by a Matrix/Vector product.
-    // It appears to be much faster than the common algorithm found
-    // in the literature (30 versus 39 flops). It also requires two
-    // Vector3 as temporaries.
-    Vector3 uv = this->vec().cross(v);
-    uv += uv;
-    return v + this->w() * uv + this->vec().cross(uv);
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename QuaternionBase<Derived>::Vector3
+QuaternionBase<Derived>::_transformVector(const Vector3& v) const {
+  // Note that this algorithm comes from the optimization by hand
+  // of the conversion to a Matrix followed by a Matrix/Vector product.
+  // It appears to be much faster than the common algorithm found
+  // in the literature (30 versus 39 flops). It also requires two
+  // Vector3 as temporaries.
+  Vector3 uv = this->vec().cross(v);
+  uv += uv;
+  return v + this->w() * uv + this->vec().cross(uv);
 }
 
-template<class Derived>
-EIGEN_STRONG_INLINE QuaternionBase<Derived>& QuaternionBase<Derived>::operator=(const QuaternionBase<Derived>& other)
-{
+template <class Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE QuaternionBase<Derived>& QuaternionBase<Derived>::operator=(
+    const QuaternionBase<Derived>& other) {
   coeffs() = other.coeffs();
   return derived();
 }
 
-template<class Derived>
-template<class OtherDerived>
-EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator=(const QuaternionBase<OtherDerived>& other)
-{
+template <class Derived>
+template <class OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator=(
+    const QuaternionBase<OtherDerived>& other) {
   coeffs() = other.coeffs();
   return derived();
 }
 
 /** Set \c *this from an angle-axis \a aa and returns a reference to \c *this
-  */
-template<class Derived>
-EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator=(const AngleAxisType& aa)
-{
-  using std::cos;
-  using std::sin;
-  Scalar ha = Scalar(0.5)*aa.angle(); // Scalar(0.5) to suppress precision loss warnings
+ */
+template <class Derived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& QuaternionBase<Derived>::operator=(const AngleAxisType& aa) {
+  EIGEN_USING_STD(cos)
+  EIGEN_USING_STD(sin)
+  Scalar ha = Scalar(0.5) * aa.angle();  // Scalar(0.5) to suppress precision loss warnings
   this->w() = cos(ha);
   this->vec() = sin(ha) * aa.axis();
   return derived();
 }
 
 /** Set \c *this from the expression \a xpr:
-  *   - if \a xpr is a 4x1 vector, then \a xpr is assumed to be a quaternion
-  *   - if \a xpr is a 3x3 matrix, then \a xpr is assumed to be rotation matrix
-  *     and \a xpr is converted to a quaternion
-  */
-
-template<class Derived>
-template<class MatrixDerived>
-inline Derived& QuaternionBase<Derived>::operator=(const MatrixBase<MatrixDerived>& xpr)
-{
-  EIGEN_STATIC_ASSERT((internal::is_same<typename Derived::Scalar, typename MatrixDerived::Scalar>::value),
-   YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+ *   - if \a xpr is a 4x1 vector, then \a xpr is assumed to be a quaternion
+ *   - if \a xpr is a 3x3 matrix, then \a xpr is assumed to be rotation matrix
+ *     and \a xpr is converted to a quaternion
+ */
+
+template <class Derived>
+template <class MatrixDerived>
+EIGEN_DEVICE_FUNC inline Derived& QuaternionBase<Derived>::operator=(const MatrixBase<MatrixDerived>& xpr) {
+  EIGEN_STATIC_ASSERT(
+      (internal::is_same<typename Derived::Scalar, typename MatrixDerived::Scalar>::value),
+      YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
   internal::quaternionbase_assign_impl<MatrixDerived>::run(*this, xpr.derived());
   return derived();
 }
 
 /** Convert the quaternion to a 3x3 rotation matrix. The quaternion is required to
-  * be normalized, otherwise the result is undefined.
-  */
-template<class Derived>
-inline typename QuaternionBase<Derived>::Matrix3
-QuaternionBase<Derived>::toRotationMatrix(void) const
-{
+ * be normalized, otherwise the result is undefined.
+ */
+template <class Derived>
+EIGEN_DEVICE_FUNC inline typename QuaternionBase<Derived>::Matrix3 QuaternionBase<Derived>::toRotationMatrix(
+    void) const {
   // NOTE if inlined, then gcc 4.2 and 4.4 get rid of the temporary (not gcc 4.3 !!)
   // if not inlined then the cost of the return by value is huge ~ +35%,
   // however, not inlining this function is an order of magnitude slower, so
   // it has to be inlined, and so the return by value is not an issue
   Matrix3 res;
 
-  const Scalar tx  = Scalar(2)*this->x();
-  const Scalar ty  = Scalar(2)*this->y();
-  const Scalar tz  = Scalar(2)*this->z();
-  const Scalar twx = tx*this->w();
-  const Scalar twy = ty*this->w();
-  const Scalar twz = tz*this->w();
-  const Scalar txx = tx*this->x();
-  const Scalar txy = ty*this->x();
-  const Scalar txz = tz*this->x();
-  const Scalar tyy = ty*this->y();
-  const Scalar tyz = tz*this->y();
-  const Scalar tzz = tz*this->z();
-
-  res.coeffRef(0,0) = Scalar(1)-(tyy+tzz);
-  res.coeffRef(0,1) = txy-twz;
-  res.coeffRef(0,2) = txz+twy;
-  res.coeffRef(1,0) = txy+twz;
-  res.coeffRef(1,1) = Scalar(1)-(txx+tzz);
-  res.coeffRef(1,2) = tyz-twx;
-  res.coeffRef(2,0) = txz-twy;
-  res.coeffRef(2,1) = tyz+twx;
-  res.coeffRef(2,2) = Scalar(1)-(txx+tyy);
+  const Scalar tx = Scalar(2) * this->x();
+  const Scalar ty = Scalar(2) * this->y();
+  const Scalar tz = Scalar(2) * this->z();
+  const Scalar twx = tx * this->w();
+  const Scalar twy = ty * this->w();
+  const Scalar twz = tz * this->w();
+  const Scalar txx = tx * this->x();
+  const Scalar txy = ty * this->x();
+  const Scalar txz = tz * this->x();
+  const Scalar tyy = ty * this->y();
+  const Scalar tyz = tz * this->y();
+  const Scalar tzz = tz * this->z();
+
+  res.coeffRef(0, 0) = Scalar(1) - (tyy + tzz);
+  res.coeffRef(0, 1) = txy - twz;
+  res.coeffRef(0, 2) = txz + twy;
+  res.coeffRef(1, 0) = txy + twz;
+  res.coeffRef(1, 1) = Scalar(1) - (txx + tzz);
+  res.coeffRef(1, 2) = tyz - twx;
+  res.coeffRef(2, 0) = txz - twy;
+  res.coeffRef(2, 1) = tyz + twx;
+  res.coeffRef(2, 2) = Scalar(1) - (txx + tyy);
 
   return res;
 }
 
 /** Sets \c *this to be a quaternion representing a rotation between
-  * the two arbitrary vectors \a a and \a b. In other words, the built
-  * rotation represent a rotation sending the line of direction \a a
-  * to the line of direction \a b, both lines passing through the origin.
-  *
-  * \returns a reference to \c *this.
-  *
-  * Note that the two input vectors do \b not have to be normalized, and
-  * do not need to have the same norm.
-  */
-template<class Derived>
-template<typename Derived1, typename Derived2>
-inline Derived& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
-{
-  using std::max;
-  using std::sqrt;
+ * the two arbitrary vectors \a a and \a b. In other words, the built
+ * rotation represent a rotation sending the line of direction \a a
+ * to the line of direction \a b, both lines passing through the origin.
+ *
+ * \returns a reference to \c *this.
+ *
+ * Note that the two input vectors do \b not have to be normalized, and
+ * do not need to have the same norm.
+ */
+template <class Derived>
+template <typename Derived1, typename Derived2>
+EIGEN_DEVICE_FUNC inline Derived& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Derived1>& a,
+                                                                             const MatrixBase<Derived2>& b) {
+  EIGEN_USING_STD(sqrt)
   Vector3 v0 = a.normalized();
   Vector3 v1 = b.normalized();
   Scalar c = v1.dot(v0);
@@ -584,133 +704,174 @@ inline Derived& QuaternionBase<Derived>::setFromTwoVectors(const MatrixBase<Deri
   //    under the constraint:
   //       ||x|| = 1
   //    which yields a singular value problem
-  if (c < Scalar(-1)+NumTraits<Scalar>::dummy_precision())
-  {
-    c = (max)(c,Scalar(-1));
-    Matrix<Scalar,2,3> m; m << v0.transpose(), v1.transpose();
-    JacobiSVD<Matrix<Scalar,2,3> > svd(m, ComputeFullV);
+  if (c < Scalar(-1) + NumTraits<Scalar>::dummy_precision()) {
+    c = numext::maxi(c, Scalar(-1));
+    Matrix<Scalar, 2, 3> m;
+    m << v0.transpose(), v1.transpose();
+    JacobiSVD<Matrix<Scalar, 2, 3>, ComputeFullV> svd(m);
     Vector3 axis = svd.matrixV().col(2);
 
-    Scalar w2 = (Scalar(1)+c)*Scalar(0.5);
+    Scalar w2 = (Scalar(1) + c) * Scalar(0.5);
     this->w() = sqrt(w2);
     this->vec() = axis * sqrt(Scalar(1) - w2);
     return derived();
   }
   Vector3 axis = v0.cross(v1);
-  Scalar s = sqrt((Scalar(1)+c)*Scalar(2));
-  Scalar invs = Scalar(1)/s;
+  Scalar s = sqrt((Scalar(1) + c) * Scalar(2));
+  Scalar invs = Scalar(1) / s;
   this->vec() = axis * invs;
   this->w() = s * Scalar(0.5);
 
   return derived();
 }
 
+/** \returns a random unit quaternion following a uniform distribution law on SO(3)
+ *
+ * \note The implementation is based on http://planning.cs.uiuc.edu/node198.html
+ */
+template <typename Scalar, int Options>
+EIGEN_DEVICE_FUNC Quaternion<Scalar, Options> Quaternion<Scalar, Options>::UnitRandom() {
+  EIGEN_USING_STD(sqrt)
+  EIGEN_USING_STD(sin)
+  EIGEN_USING_STD(cos)
+  const Scalar u1 = internal::random<Scalar>(0, 1), u2 = internal::random<Scalar>(0, 2 * EIGEN_PI),
+               u3 = internal::random<Scalar>(0, 2 * EIGEN_PI);
+  const Scalar a = sqrt(Scalar(1) - u1), b = sqrt(u1);
+  return Quaternion(a * sin(u2), a * cos(u2), b * sin(u3), b * cos(u3));
+}
 
-/** Returns a quaternion representing a rotation between
-  * the two arbitrary vectors \a a and \a b. In other words, the built
-  * rotation represent a rotation sending the line of direction \a a
-  * to the line of direction \a b, both lines passing through the origin.
-  *
-  * \returns resulting quaternion
-  *
-  * Note that the two input vectors do \b not have to be normalized, and
-  * do not need to have the same norm.
-  */
-template<typename Scalar, int Options>
-template<typename Derived1, typename Derived2>
-Quaternion<Scalar,Options> Quaternion<Scalar,Options>::FromTwoVectors(const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b)
-{
-    Quaternion quat;
-    quat.setFromTwoVectors(a, b);
-    return quat;
+/** Constructs a quaternion from its coefficients in the order [\c x, \c y, \c z, \c w], i.e. vector part [\c x, \c y,
+ * \c z] first, scalar part \a w LAST.
+ *
+ * This factory accepts the parameters in the same order as the underlying coefficient vector. Consider using this
+ * factory function to make the parameter ordering explicit.
+ */
+template <typename Scalar, int Options>
+EIGEN_DEVICE_FUNC Quaternion<Scalar, Options> Quaternion<Scalar, Options>::FromCoeffsScalarLast(const Scalar& x,
+                                                                                                const Scalar& y,
+                                                                                                const Scalar& z,
+                                                                                                const Scalar& w) {
+  return Quaternion(w, x, y, z);
 }
 
+/** Constructs a quaternion from its coefficients in the order [\c w, \c x, \c y, \c z], i.e. scalar part \a w FIRST,
+ * vector part [\c x, \c y, \c z] last.
+ *
+ * This factory accepts the parameters in the same order as the constructor \code Quaternion(const Scalar& w, const
+ * Scalar& x, const Scalar& y, const Scalar& z) \endcode. Consider using this factory function to make the parameter
+ * ordering explicit.
+ */
+template <typename Scalar, int Options>
+EIGEN_DEVICE_FUNC Quaternion<Scalar, Options> Quaternion<Scalar, Options>::FromCoeffsScalarFirst(const Scalar& w,
+                                                                                                 const Scalar& x,
+                                                                                                 const Scalar& y,
+                                                                                                 const Scalar& z) {
+  return Quaternion(w, x, y, z);
+}
+
+/** Returns a quaternion representing a rotation between
+ * the two arbitrary vectors \a a and \a b. In other words, the built
+ * rotation represent a rotation sending the line of direction \a a
+ * to the line of direction \a b, both lines passing through the origin.
+ *
+ * \returns resulting quaternion
+ *
+ * Note that the two input vectors do \b not have to be normalized, and
+ * do not need to have the same norm.
+ */
+template <typename Scalar, int Options>
+template <typename Derived1, typename Derived2>
+EIGEN_DEVICE_FUNC Quaternion<Scalar, Options> Quaternion<Scalar, Options>::FromTwoVectors(
+    const MatrixBase<Derived1>& a, const MatrixBase<Derived2>& b) {
+  Quaternion quat;
+  quat.setFromTwoVectors(a, b);
+  return quat;
+}
 
 /** \returns the multiplicative inverse of \c *this
-  * Note that in most cases, i.e., if you simply want the opposite rotation,
-  * and/or the quaternion is normalized, then it is enough to use the conjugate.
-  *
-  * \sa QuaternionBase::conjugate()
-  */
+ * Note that in most cases, i.e., if you simply want the opposite rotation,
+ * and/or the quaternion is normalized, then it is enough to use the conjugate.
+ *
+ * \sa QuaternionBase::conjugate()
+ */
 template <class Derived>
-inline Quaternion<typename internal::traits<Derived>::Scalar> QuaternionBase<Derived>::inverse() const
-{
+EIGEN_DEVICE_FUNC inline Quaternion<typename internal::traits<Derived>::Scalar> QuaternionBase<Derived>::inverse()
+    const {
   // FIXME should this function be called multiplicativeInverse and conjugate() be called inverse() or opposite()  ??
   Scalar n2 = this->squaredNorm();
   if (n2 > Scalar(0))
     return Quaternion<Scalar>(conjugate().coeffs() / n2);
-  else
-  {
+  else {
     // return an invalid result to flag the error
     return Quaternion<Scalar>(Coefficients::Zero());
   }
 }
 
+// Generic conjugate of a Quaternion
+namespace internal {
+template <int Arch, class Derived, typename Scalar>
+struct quat_conj {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Quaternion<Scalar> run(const QuaternionBase<Derived>& q) {
+    return Quaternion<Scalar>(q.w(), -q.x(), -q.y(), -q.z());
+  }
+};
+}  // namespace internal
+
 /** \returns the conjugate of the \c *this which is equal to the multiplicative inverse
-  * if the quaternion is normalized.
-  * The conjugate of a quaternion represents the opposite rotation.
-  *
-  * \sa Quaternion2::inverse()
-  */
+ * if the quaternion is normalized.
+ * The conjugate of a quaternion represents the opposite rotation.
+ *
+ * \sa Quaternion2::inverse()
+ */
 template <class Derived>
-inline Quaternion<typename internal::traits<Derived>::Scalar>
-QuaternionBase<Derived>::conjugate() const
-{
-  return Quaternion<Scalar>(this->w(),-this->x(),-this->y(),-this->z());
+EIGEN_DEVICE_FUNC inline Quaternion<typename internal::traits<Derived>::Scalar> QuaternionBase<Derived>::conjugate()
+    const {
+  return internal::quat_conj<Architecture::Target, Derived, typename internal::traits<Derived>::Scalar>::run(*this);
 }
 
 /** \returns the angle (in radian) between two rotations
-  * \sa dot()
-  */
+ * \sa dot()
+ */
 template <class Derived>
 template <class OtherDerived>
-inline typename internal::traits<Derived>::Scalar
-QuaternionBase<Derived>::angularDistance(const QuaternionBase<OtherDerived>& other) const
-{
-  using std::atan2;
-  using std::abs;
+EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar QuaternionBase<Derived>::angularDistance(
+    const QuaternionBase<OtherDerived>& other) const {
+  EIGEN_USING_STD(atan2)
   Quaternion<Scalar> d = (*this) * other.conjugate();
-  return Scalar(2) * atan2( d.vec().norm(), abs(d.w()) );
+  return Scalar(2) * atan2(d.vec().norm(), numext::abs(d.w()));
 }
 
- 
-    
 /** \returns the spherical linear interpolation between the two quaternions
-  * \c *this and \a other at the parameter \a t in [0;1].
-  * 
-  * This represents an interpolation for a constant motion between \c *this and \a other,
-  * see also http://en.wikipedia.org/wiki/Slerp.
-  */
+ * \c *this and \a other at the parameter \a t in [0;1].
+ *
+ * This represents an interpolation for a constant motion between \c *this and \a other,
+ * see also http://en.wikipedia.org/wiki/Slerp.
+ */
 template <class Derived>
 template <class OtherDerived>
-Quaternion<typename internal::traits<Derived>::Scalar>
-QuaternionBase<Derived>::slerp(const Scalar& t, const QuaternionBase<OtherDerived>& other) const
-{
-  using std::acos;
-  using std::sin;
-  using std::abs;
-  static const Scalar one = Scalar(1) - NumTraits<Scalar>::epsilon();
+EIGEN_DEVICE_FUNC Quaternion<typename internal::traits<Derived>::Scalar> QuaternionBase<Derived>::slerp(
+    const Scalar& t, const QuaternionBase<OtherDerived>& other) const {
+  EIGEN_USING_STD(acos)
+  EIGEN_USING_STD(sin)
+  const Scalar one = Scalar(1) - NumTraits<Scalar>::epsilon();
   Scalar d = this->dot(other);
-  Scalar absD = abs(d);
+  Scalar absD = numext::abs(d);
 
   Scalar scale0;
   Scalar scale1;
 
-  if(absD>=one)
-  {
+  if (absD >= one) {
     scale0 = Scalar(1) - t;
     scale1 = t;
-  }
-  else
-  {
+  } else {
     // theta is the angle between the 2 quaternions
     Scalar theta = acos(absD);
-    Scalar sinTheta = sin(theta);
+    Scalar sinTheta = numext::sqrt(Scalar(1) - absD * absD);
 
-    scale0 = sin( ( Scalar(1) - t ) * theta) / sinTheta;
-    scale1 = sin( ( t * theta) ) / sinTheta;
+    scale0 = sin((Scalar(1) - t) * theta) / sinTheta;
+    scale1 = sin((t * theta)) / sinTheta;
   }
-  if(d<Scalar(0)) scale1 = -scale1;
+  if (d < Scalar(0)) scale1 = -scale1;
 
   return Quaternion<Scalar>(scale0 * coeffs() + scale1 * other.coeffs());
 }
@@ -718,59 +879,52 @@ QuaternionBase<Derived>::slerp(const Scalar& t, const QuaternionBase<OtherDerive
 namespace internal {
 
 // set from a rotation matrix
-template<typename Other>
-struct quaternionbase_assign_impl<Other,3,3>
-{
+template <typename Other>
+struct quaternionbase_assign_impl<Other, 3, 3> {
   typedef typename Other::Scalar Scalar;
-  typedef DenseIndex Index;
-  template<class Derived> static inline void run(QuaternionBase<Derived>& q, const Other& mat)
-  {
-    using std::sqrt;
+  template <class Derived>
+  EIGEN_DEVICE_FUNC static inline void run(QuaternionBase<Derived>& q, const Other& a_mat) {
+    const typename internal::nested_eval<Other, 2>::type mat(a_mat);
+    EIGEN_USING_STD(sqrt)
     // This algorithm comes from  "Quaternion Calculus and Fast Animation",
     // Ken Shoemake, 1987 SIGGRAPH course notes
     Scalar t = mat.trace();
-    if (t > Scalar(0))
-    {
+    if (t > Scalar(0)) {
       t = sqrt(t + Scalar(1.0));
-      q.w() = Scalar(0.5)*t;
-      t = Scalar(0.5)/t;
-      q.x() = (mat.coeff(2,1) - mat.coeff(1,2)) * t;
-      q.y() = (mat.coeff(0,2) - mat.coeff(2,0)) * t;
-      q.z() = (mat.coeff(1,0) - mat.coeff(0,1)) * t;
-    }
-    else
-    {
-      DenseIndex i = 0;
-      if (mat.coeff(1,1) > mat.coeff(0,0))
-        i = 1;
-      if (mat.coeff(2,2) > mat.coeff(i,i))
-        i = 2;
-      DenseIndex j = (i+1)%3;
-      DenseIndex k = (j+1)%3;
-
-      t = sqrt(mat.coeff(i,i)-mat.coeff(j,j)-mat.coeff(k,k) + Scalar(1.0));
+      q.w() = Scalar(0.5) * t;
+      t = Scalar(0.5) / t;
+      q.x() = (mat.coeff(2, 1) - mat.coeff(1, 2)) * t;
+      q.y() = (mat.coeff(0, 2) - mat.coeff(2, 0)) * t;
+      q.z() = (mat.coeff(1, 0) - mat.coeff(0, 1)) * t;
+    } else {
+      Index i = 0;
+      if (mat.coeff(1, 1) > mat.coeff(0, 0)) i = 1;
+      if (mat.coeff(2, 2) > mat.coeff(i, i)) i = 2;
+      Index j = (i + 1) % 3;
+      Index k = (j + 1) % 3;
+
+      t = sqrt(mat.coeff(i, i) - mat.coeff(j, j) - mat.coeff(k, k) + Scalar(1.0));
       q.coeffs().coeffRef(i) = Scalar(0.5) * t;
-      t = Scalar(0.5)/t;
-      q.w() = (mat.coeff(k,j)-mat.coeff(j,k))*t;
-      q.coeffs().coeffRef(j) = (mat.coeff(j,i)+mat.coeff(i,j))*t;
-      q.coeffs().coeffRef(k) = (mat.coeff(k,i)+mat.coeff(i,k))*t;
+      t = Scalar(0.5) / t;
+      q.w() = (mat.coeff(k, j) - mat.coeff(j, k)) * t;
+      q.coeffs().coeffRef(j) = (mat.coeff(j, i) + mat.coeff(i, j)) * t;
+      q.coeffs().coeffRef(k) = (mat.coeff(k, i) + mat.coeff(i, k)) * t;
     }
   }
 };
 
 // set from a vector of coefficients assumed to be a quaternion
-template<typename Other>
-struct quaternionbase_assign_impl<Other,4,1>
-{
+template <typename Other>
+struct quaternionbase_assign_impl<Other, 4, 1> {
   typedef typename Other::Scalar Scalar;
-  template<class Derived> static inline void run(QuaternionBase<Derived>& q, const Other& vec)
-  {
+  template <class Derived>
+  EIGEN_DEVICE_FUNC static inline void run(QuaternionBase<Derived>& q, const Other& vec) {
     q.coeffs() = vec;
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_QUATERNION_H
+#endif  // EIGEN_QUATERNION_H
diff --git a/inst/include/Eigen/src/Geometry/Rotation2D.h b/inst/include/Eigen/src/Geometry/Rotation2D.h
index a2d59fce..59180253 100644
--- a/inst/include/Eigen/src/Geometry/Rotation2D.h
+++ b/inst/include/Eigen/src/Geometry/Rotation2D.h
@@ -10,151 +10,192 @@
 #ifndef EIGEN_ROTATION2D_H
 #define EIGEN_ROTATION2D_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Rotation2D
-  *
-  * \brief Represents a rotation/orientation in a 2 dimensional space.
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients
-  *
-  * This class is equivalent to a single scalar representing a counter clock wise rotation
-  * as a single angle in radian. It provides some additional features such as the automatic
-  * conversion from/to a 2x2 rotation matrix. Moreover this class aims to provide a similar
-  * interface to Quaternion in order to facilitate the writing of generic algorithms
-  * dealing with rotations.
-  *
-  * \sa class Quaternion, class Transform
-  */
+ *
+ * \class Rotation2D
+ *
+ * \brief Represents a rotation/orientation in a 2 dimensional space.
+ *
+ * \tparam Scalar_ the scalar type, i.e., the type of the coefficients
+ *
+ * This class is equivalent to a single scalar representing a counter clock wise rotation
+ * as a single angle in radian. It provides some additional features such as the automatic
+ * conversion from/to a 2x2 rotation matrix. Moreover this class aims to provide a similar
+ * interface to Quaternion in order to facilitate the writing of generic algorithms
+ * dealing with rotations.
+ *
+ * \sa class Quaternion, class Transform
+ */
 
 namespace internal {
 
-template<typename _Scalar> struct traits<Rotation2D<_Scalar> >
-{
-  typedef _Scalar Scalar;
+template <typename Scalar_>
+struct traits<Rotation2D<Scalar_> > {
+  typedef Scalar_ Scalar;
 };
-} // end namespace internal
-
-template<typename _Scalar>
-class Rotation2D : public RotationBase<Rotation2D<_Scalar>,2>
-{
-  typedef RotationBase<Rotation2D<_Scalar>,2> Base;
+}  // end namespace internal
 
-public:
+template <typename Scalar_>
+class Rotation2D : public RotationBase<Rotation2D<Scalar_>, 2> {
+  typedef RotationBase<Rotation2D<Scalar_>, 2> Base;
 
+ public:
   using Base::operator*;
 
   enum { Dim = 2 };
   /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-  typedef Matrix<Scalar,2,1> Vector2;
-  typedef Matrix<Scalar,2,2> Matrix2;
-
-protected:
+  typedef Scalar_ Scalar;
+  typedef Matrix<Scalar, 2, 1> Vector2;
+  typedef Matrix<Scalar, 2, 2> Matrix2;
 
+ protected:
   Scalar m_angle;
 
-public:
-
+ public:
   /** Construct a 2D counter clock wise rotation from the angle \a a in radian. */
-  inline Rotation2D(const Scalar& a) : m_angle(a) {}
-  
-  /** Default constructor wihtout initialization. The represented rotation is undefined. */
-  Rotation2D() {}
+  EIGEN_DEVICE_FUNC explicit inline Rotation2D(const Scalar& a) : m_angle(a) {}
+
+  /** Default constructor without initialization. The represented rotation is undefined. */
+  EIGEN_DEVICE_FUNC Rotation2D() {}
+
+  /** Construct a 2D rotation from a 2x2 rotation matrix \a mat.
+   *
+   * \sa fromRotationMatrix()
+   */
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC explicit Rotation2D(const MatrixBase<Derived>& m) {
+    fromRotationMatrix(m.derived());
+  }
 
   /** \returns the rotation angle */
-  inline Scalar angle() const { return m_angle; }
+  EIGEN_DEVICE_FUNC inline Scalar angle() const { return m_angle; }
 
   /** \returns a read-write reference to the rotation angle */
-  inline Scalar& angle() { return m_angle; }
+  EIGEN_DEVICE_FUNC inline Scalar& angle() { return m_angle; }
+
+  /** \returns the rotation angle in [0,2pi] */
+  EIGEN_DEVICE_FUNC inline Scalar smallestPositiveAngle() const {
+    Scalar tmp = numext::fmod(m_angle, Scalar(2 * EIGEN_PI));
+    return tmp < Scalar(0) ? tmp + Scalar(2 * EIGEN_PI) : tmp;
+  }
+
+  /** \returns the rotation angle in [-pi,pi] */
+  EIGEN_DEVICE_FUNC inline Scalar smallestAngle() const {
+    Scalar tmp = numext::fmod(m_angle, Scalar(2 * EIGEN_PI));
+    if (tmp > Scalar(EIGEN_PI))
+      tmp -= Scalar(2 * EIGEN_PI);
+    else if (tmp < -Scalar(EIGEN_PI))
+      tmp += Scalar(2 * EIGEN_PI);
+    return tmp;
+  }
 
   /** \returns the inverse rotation */
-  inline Rotation2D inverse() const { return -m_angle; }
+  EIGEN_DEVICE_FUNC inline Rotation2D inverse() const { return Rotation2D(-m_angle); }
 
   /** Concatenates two rotations */
-  inline Rotation2D operator*(const Rotation2D& other) const
-  { return m_angle + other.m_angle; }
+  EIGEN_DEVICE_FUNC inline Rotation2D operator*(const Rotation2D& other) const {
+    return Rotation2D(m_angle + other.m_angle);
+  }
 
   /** Concatenates two rotations */
-  inline Rotation2D& operator*=(const Rotation2D& other)
-  { m_angle += other.m_angle; return *this; }
+  EIGEN_DEVICE_FUNC inline Rotation2D& operator*=(const Rotation2D& other) {
+    m_angle += other.m_angle;
+    return *this;
+  }
 
   /** Applies the rotation to a 2D vector */
-  Vector2 operator* (const Vector2& vec) const
-  { return toRotationMatrix() * vec; }
-  
-  template<typename Derived>
-  Rotation2D& fromRotationMatrix(const MatrixBase<Derived>& m);
-  Matrix2 toRotationMatrix() const;
+  EIGEN_DEVICE_FUNC Vector2 operator*(const Vector2& vec) const { return toRotationMatrix() * vec; }
+
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC Rotation2D& fromRotationMatrix(const MatrixBase<Derived>& m);
+  EIGEN_DEVICE_FUNC Matrix2 toRotationMatrix() const;
+
+  /** Set \c *this from a 2x2 rotation matrix \a mat.
+   * In other words, this function extract the rotation angle from the rotation matrix.
+   *
+   * This method is an alias for fromRotationMatrix()
+   *
+   * \sa fromRotationMatrix()
+   */
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC Rotation2D& operator=(const MatrixBase<Derived>& m) {
+    return fromRotationMatrix(m.derived());
+  }
 
   /** \returns the spherical interpolation between \c *this and \a other using
-    * parameter \a t. It is in fact equivalent to a linear interpolation.
-    */
-  inline Rotation2D slerp(const Scalar& t, const Rotation2D& other) const
-  { return m_angle * (1-t) + other.angle() * t; }
+   * parameter \a t. It is in fact equivalent to a linear interpolation.
+   */
+  EIGEN_DEVICE_FUNC inline Rotation2D slerp(const Scalar& t, const Rotation2D& other) const {
+    Scalar dist = Rotation2D(other.m_angle - m_angle).smallestAngle();
+    return Rotation2D(m_angle + dist * t);
+  }
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Rotation2D,Rotation2D<NewScalarType> >::type cast() const
-  { return typename internal::cast_return_type<Rotation2D,Rotation2D<NewScalarType> >::type(*this); }
+   *
+   * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+   * then this function smartly returns a const reference to \c *this.
+   */
+  template <typename NewScalarType>
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<Rotation2D, Rotation2D<NewScalarType> >::type cast()
+      const {
+    return typename internal::cast_return_type<Rotation2D, Rotation2D<NewScalarType> >::type(*this);
+  }
 
   /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Rotation2D(const Rotation2D<OtherScalarType>& other)
-  {
+  template <typename OtherScalarType>
+  EIGEN_DEVICE_FUNC inline explicit Rotation2D(const Rotation2D<OtherScalarType>& other) {
     m_angle = Scalar(other.angle());
   }
 
-  static inline Rotation2D Identity() { return Rotation2D(0); }
+  EIGEN_DEVICE_FUNC static inline Rotation2D Identity() { return Rotation2D(0); }
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Rotation2D& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
-  { return internal::isApprox(m_angle,other.m_angle, prec); }
+   * determined by \a prec.
+   *
+   * \sa MatrixBase::isApprox() */
+  EIGEN_DEVICE_FUNC bool isApprox(const Rotation2D& other, const typename NumTraits<Scalar>::Real& prec =
+                                                               NumTraits<Scalar>::dummy_precision()) const {
+    return internal::isApprox(m_angle, other.m_angle, prec);
+  }
 };
 
 /** \ingroup Geometry_Module
-  * single precision 2D rotation type */
+ * single precision 2D rotation type */
 typedef Rotation2D<float> Rotation2Df;
 /** \ingroup Geometry_Module
-  * double precision 2D rotation type */
+ * double precision 2D rotation type */
 typedef Rotation2D<double> Rotation2Dd;
 
 /** Set \c *this from a 2x2 rotation matrix \a mat.
-  * In other words, this function extract the rotation angle
-  * from the rotation matrix.
-  */
-template<typename Scalar>
-template<typename Derived>
-Rotation2D<Scalar>& Rotation2D<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat)
-{
-  using std::atan2;
-  EIGEN_STATIC_ASSERT(Derived::RowsAtCompileTime==2 && Derived::ColsAtCompileTime==2,YOU_MADE_A_PROGRAMMING_MISTAKE)
-  m_angle = atan2(mat.coeff(1,0), mat.coeff(0,0));
+ * In other words, this function extract the rotation angle
+ * from the rotation matrix.
+ */
+template <typename Scalar>
+template <typename Derived>
+EIGEN_DEVICE_FUNC Rotation2D<Scalar>& Rotation2D<Scalar>::fromRotationMatrix(const MatrixBase<Derived>& mat) {
+  EIGEN_USING_STD(atan2)
+  EIGEN_STATIC_ASSERT(Derived::RowsAtCompileTime == 2 && Derived::ColsAtCompileTime == 2,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE)
+  m_angle = atan2(mat.coeff(1, 0), mat.coeff(0, 0));
   return *this;
 }
 
 /** Constructs and \returns an equivalent 2x2 rotation matrix.
-  */
-template<typename Scalar>
-typename Rotation2D<Scalar>::Matrix2
-Rotation2D<Scalar>::toRotationMatrix(void) const
-{
-  using std::sin;
-  using std::cos;
+ */
+template <typename Scalar>
+typename Rotation2D<Scalar>::Matrix2 EIGEN_DEVICE_FUNC Rotation2D<Scalar>::toRotationMatrix(void) const {
+  EIGEN_USING_STD(sin)
+  EIGEN_USING_STD(cos)
   Scalar sinA = sin(m_angle);
   Scalar cosA = cos(m_angle);
   return (Matrix2() << cosA, -sinA, sinA, cosA).finished();
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_ROTATION2D_H
+#endif  // EIGEN_ROTATION2D_H
diff --git a/inst/include/Eigen/src/Geometry/RotationBase.h b/inst/include/Eigen/src/Geometry/RotationBase.h
index b88661de..3a3a3e31 100644
--- a/inst/include/Eigen/src/Geometry/RotationBase.h
+++ b/inst/include/Eigen/src/Geometry/RotationBase.h
@@ -10,197 +10,200 @@
 #ifndef EIGEN_ROTATIONBASE_H
 #define EIGEN_ROTATIONBASE_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 // forward declaration
 namespace internal {
-template<typename RotationDerived, typename MatrixType, bool IsVector=MatrixType::IsVectorAtCompileTime>
+template <typename RotationDerived, typename MatrixType, bool IsVector = MatrixType::IsVectorAtCompileTime>
 struct rotation_base_generic_product_selector;
 }
 
 /** \class RotationBase
-  *
-  * \brief Common base class for compact rotation representations
-  *
-  * \param Derived is the derived type, i.e., a rotation type
-  * \param _Dim the dimension of the space
-  */
-template<typename Derived, int _Dim>
-class RotationBase
-{
-  public:
-    enum { Dim = _Dim };
-    /** the scalar type of the coefficients */
-    typedef typename internal::traits<Derived>::Scalar Scalar;
-
-    /** corresponding linear transformation matrix type */
-    typedef Matrix<Scalar,Dim,Dim> RotationMatrixType;
-    typedef Matrix<Scalar,Dim,1> VectorType;
-
-  public:
-    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
-    inline Derived& derived() { return *static_cast<Derived*>(this); }
-
-    /** \returns an equivalent rotation matrix */
-    inline RotationMatrixType toRotationMatrix() const { return derived().toRotationMatrix(); }
-
-    /** \returns an equivalent rotation matrix 
-      * This function is added to be conform with the Transform class' naming scheme.
-      */
-    inline RotationMatrixType matrix() const { return derived().toRotationMatrix(); }
-
-    /** \returns the inverse rotation */
-    inline Derived inverse() const { return derived().inverse(); }
-
-    /** \returns the concatenation of the rotation \c *this with a translation \a t */
-    inline Transform<Scalar,Dim,Isometry> operator*(const Translation<Scalar,Dim>& t) const
-    { return Transform<Scalar,Dim,Isometry>(*this) * t; }
-
-    /** \returns the concatenation of the rotation \c *this with a uniform scaling \a s */
-    inline RotationMatrixType operator*(const UniformScaling<Scalar>& s) const
-    { return toRotationMatrix() * s.factor(); }
-
-    /** \returns the concatenation of the rotation \c *this with a generic expression \a e
-      * \a e can be:
-      *  - a DimxDim linear transformation matrix
-      *  - a DimxDim diagonal matrix (axis aligned scaling)
-      *  - a vector of size Dim
-      */
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE typename internal::rotation_base_generic_product_selector<Derived,OtherDerived,OtherDerived::IsVectorAtCompileTime>::ReturnType
-    operator*(const EigenBase<OtherDerived>& e) const
-    { return internal::rotation_base_generic_product_selector<Derived,OtherDerived>::run(derived(), e.derived()); }
-
-    /** \returns the concatenation of a linear transformation \a l with the rotation \a r */
-    template<typename OtherDerived> friend
-    inline RotationMatrixType operator*(const EigenBase<OtherDerived>& l, const Derived& r)
-    { return l.derived() * r.toRotationMatrix(); }
-
-    /** \returns the concatenation of a scaling \a l with the rotation \a r */
-    friend inline Transform<Scalar,Dim,Affine> operator*(const DiagonalMatrix<Scalar,Dim>& l, const Derived& r)
-    { 
-      Transform<Scalar,Dim,Affine> res(r);
-      res.linear().applyOnTheLeft(l);
-      return res;
-    }
-
-    /** \returns the concatenation of the rotation \c *this with a transformation \a t */
-    template<int Mode, int Options>
-    inline Transform<Scalar,Dim,Mode> operator*(const Transform<Scalar,Dim,Mode,Options>& t) const
-    { return toRotationMatrix() * t; }
-
-    template<typename OtherVectorType>
-    inline VectorType _transformVector(const OtherVectorType& v) const
-    { return toRotationMatrix() * v; }
+ *
+ * \brief Common base class for compact rotation representations
+ *
+ * \tparam Derived is the derived type, i.e., a rotation type
+ * \tparam Dim_ the dimension of the space
+ */
+template <typename Derived, int Dim_>
+class RotationBase {
+ public:
+  enum { Dim = Dim_ };
+  /** the scalar type of the coefficients */
+  typedef typename internal::traits<Derived>::Scalar Scalar;
+
+  /** corresponding linear transformation matrix type */
+  typedef Matrix<Scalar, Dim, Dim> RotationMatrixType;
+  typedef Matrix<Scalar, Dim, 1> VectorType;
+
+ public:
+  EIGEN_DEVICE_FUNC inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+  EIGEN_DEVICE_FUNC inline Derived& derived() { return *static_cast<Derived*>(this); }
+
+  /** \returns an equivalent rotation matrix */
+  EIGEN_DEVICE_FUNC inline RotationMatrixType toRotationMatrix() const { return derived().toRotationMatrix(); }
+
+  /** \returns an equivalent rotation matrix
+   * This function is added to be conform with the Transform class' naming scheme.
+   */
+  EIGEN_DEVICE_FUNC inline RotationMatrixType matrix() const { return derived().toRotationMatrix(); }
+
+  /** \returns the inverse rotation */
+  EIGEN_DEVICE_FUNC inline Derived inverse() const { return derived().inverse(); }
+
+  /** \returns the concatenation of the rotation \c *this with a translation \a t */
+  EIGEN_DEVICE_FUNC inline Transform<Scalar, Dim, Isometry> operator*(const Translation<Scalar, Dim>& t) const {
+    return Transform<Scalar, Dim, Isometry>(*this) * t;
+  }
+
+  /** \returns the concatenation of the rotation \c *this with a uniform scaling \a s */
+  EIGEN_DEVICE_FUNC inline RotationMatrixType operator*(const UniformScaling<Scalar>& s) const {
+    return toRotationMatrix() * s.factor();
+  }
+
+  /** \returns the concatenation of the rotation \c *this with a generic expression \a e
+   * \a e can be:
+   *  - a DimxDim linear transformation matrix
+   *  - a DimxDim diagonal matrix (axis aligned scaling)
+   *  - a vector of size Dim
+   */
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      typename internal::rotation_base_generic_product_selector<Derived, OtherDerived,
+                                                                OtherDerived::IsVectorAtCompileTime>::ReturnType
+      operator*(const EigenBase<OtherDerived>& e) const {
+    return internal::rotation_base_generic_product_selector<Derived, OtherDerived>::run(derived(), e.derived());
+  }
+
+  /** \returns the concatenation of a linear transformation \a l with the rotation \a r */
+  template <typename OtherDerived>
+  friend EIGEN_DEVICE_FUNC inline RotationMatrixType operator*(const EigenBase<OtherDerived>& l, const Derived& r) {
+    return l.derived() * r.toRotationMatrix();
+  }
+
+  /** \returns the concatenation of a scaling \a l with the rotation \a r */
+  EIGEN_DEVICE_FUNC friend inline Transform<Scalar, Dim, Affine> operator*(const DiagonalMatrix<Scalar, Dim>& l,
+                                                                           const Derived& r) {
+    Transform<Scalar, Dim, Affine> res(r);
+    res.linear().applyOnTheLeft(l);
+    return res;
+  }
+
+  /** \returns the concatenation of the rotation \c *this with a transformation \a t */
+  template <int Mode, int Options>
+  EIGEN_DEVICE_FUNC inline Transform<Scalar, Dim, Mode> operator*(
+      const Transform<Scalar, Dim, Mode, Options>& t) const {
+    return toRotationMatrix() * t;
+  }
+
+  template <typename OtherVectorType>
+  EIGEN_DEVICE_FUNC inline VectorType _transformVector(const OtherVectorType& v) const {
+    return toRotationMatrix() * v;
+  }
 };
 
 namespace internal {
 
 // implementation of the generic product rotation * matrix
-template<typename RotationDerived, typename MatrixType>
-struct rotation_base_generic_product_selector<RotationDerived,MatrixType,false>
-{
+template <typename RotationDerived, typename MatrixType>
+struct rotation_base_generic_product_selector<RotationDerived, MatrixType, false> {
   enum { Dim = RotationDerived::Dim };
-  typedef Matrix<typename RotationDerived::Scalar,Dim,Dim> ReturnType;
-  static inline ReturnType run(const RotationDerived& r, const MatrixType& m)
-  { return r.toRotationMatrix() * m; }
+  typedef Matrix<typename RotationDerived::Scalar, Dim, Dim> ReturnType;
+  EIGEN_DEVICE_FUNC static inline ReturnType run(const RotationDerived& r, const MatrixType& m) {
+    return r.toRotationMatrix() * m;
+  }
 };
 
-template<typename RotationDerived, typename Scalar, int Dim, int MaxDim>
-struct rotation_base_generic_product_selector< RotationDerived, DiagonalMatrix<Scalar,Dim,MaxDim>, false >
-{
-  typedef Transform<Scalar,Dim,Affine> ReturnType;
-  static inline ReturnType run(const RotationDerived& r, const DiagonalMatrix<Scalar,Dim,MaxDim>& m)
-  {
+template <typename RotationDerived, typename Scalar, int Dim, int MaxDim>
+struct rotation_base_generic_product_selector<RotationDerived, DiagonalMatrix<Scalar, Dim, MaxDim>, false> {
+  typedef Transform<Scalar, Dim, Affine> ReturnType;
+  EIGEN_DEVICE_FUNC static inline ReturnType run(const RotationDerived& r,
+                                                 const DiagonalMatrix<Scalar, Dim, MaxDim>& m) {
     ReturnType res(r);
     res.linear() *= m;
     return res;
   }
 };
 
-template<typename RotationDerived,typename OtherVectorType>
-struct rotation_base_generic_product_selector<RotationDerived,OtherVectorType,true>
-{
+template <typename RotationDerived, typename OtherVectorType>
+struct rotation_base_generic_product_selector<RotationDerived, OtherVectorType, true> {
   enum { Dim = RotationDerived::Dim };
-  typedef Matrix<typename RotationDerived::Scalar,Dim,1> ReturnType;
-  static EIGEN_STRONG_INLINE ReturnType run(const RotationDerived& r, const OtherVectorType& v)
-  {
+  typedef Matrix<typename RotationDerived::Scalar, Dim, 1> ReturnType;
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE ReturnType run(const RotationDerived& r, const OtherVectorType& v) {
     return r._transformVector(v);
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
 /** \geometry_module
-  *
-  * \brief Constructs a Dim x Dim rotation matrix from the rotation \a r
-  */
-template<typename _Scalar, int _Rows, int _Cols, int _Storage, int _MaxRows, int _MaxCols>
-template<typename OtherDerived>
-Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>
-::Matrix(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
-{
-  EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Matrix,int(OtherDerived::Dim),int(OtherDerived::Dim))
+ *
+ * \brief Constructs a Dim x Dim rotation matrix from the rotation \a r
+ */
+template <typename Scalar_, int Rows_, int Cols_, int Storage_, int MaxRows_, int MaxCols_>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC Matrix<Scalar_, Rows_, Cols_, Storage_, MaxRows_, MaxCols_>::Matrix(
+    const RotationBase<OtherDerived, ColsAtCompileTime>& r) {
+  EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Matrix, int(OtherDerived::Dim), int(OtherDerived::Dim))
   *this = r.toRotationMatrix();
 }
 
 /** \geometry_module
-  *
-  * \brief Set a Dim x Dim rotation matrix from the rotation \a r
-  */
-template<typename _Scalar, int _Rows, int _Cols, int _Storage, int _MaxRows, int _MaxCols>
-template<typename OtherDerived>
-Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>&
-Matrix<_Scalar, _Rows, _Cols, _Storage, _MaxRows, _MaxCols>
-::operator=(const RotationBase<OtherDerived,ColsAtCompileTime>& r)
-{
-  EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Matrix,int(OtherDerived::Dim),int(OtherDerived::Dim))
+ *
+ * \brief Set a Dim x Dim rotation matrix from the rotation \a r
+ */
+template <typename Scalar_, int Rows_, int Cols_, int Storage_, int MaxRows_, int MaxCols_>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC Matrix<Scalar_, Rows_, Cols_, Storage_, MaxRows_, MaxCols_>&
+Matrix<Scalar_, Rows_, Cols_, Storage_, MaxRows_, MaxCols_>::operator=(
+    const RotationBase<OtherDerived, ColsAtCompileTime>& r) {
+  EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Matrix, int(OtherDerived::Dim), int(OtherDerived::Dim))
   return *this = r.toRotationMatrix();
 }
 
 namespace internal {
 
 /** \internal
-  *
-  * Helper function to return an arbitrary rotation object to a rotation matrix.
-  *
-  * \param Scalar the numeric type of the matrix coefficients
-  * \param Dim the dimension of the current space
-  *
-  * It returns a Dim x Dim fixed size matrix.
-  *
-  * Default specializations are provided for:
-  *   - any scalar type (2D),
-  *   - any matrix expression,
-  *   - any type based on RotationBase (e.g., Quaternion, AngleAxis, Rotation2D)
-  *
-  * Currently toRotationMatrix is only used by Transform.
-  *
-  * \sa class Transform, class Rotation2D, class Quaternion, class AngleAxis
-  */
-template<typename Scalar, int Dim>
-static inline Matrix<Scalar,2,2> toRotationMatrix(const Scalar& s)
-{
-  EIGEN_STATIC_ASSERT(Dim==2,YOU_MADE_A_PROGRAMMING_MISTAKE)
+ *
+ * Helper function to return an arbitrary rotation object to a rotation matrix.
+ *
+ * \tparam Scalar the numeric type of the matrix coefficients
+ * \tparam Dim the dimension of the current space
+ *
+ * It returns a Dim x Dim fixed size matrix.
+ *
+ * Default specializations are provided for:
+ *   - any scalar type (2D),
+ *   - any matrix expression,
+ *   - any type based on RotationBase (e.g., Quaternion, AngleAxis, Rotation2D)
+ *
+ * Currently toRotationMatrix is only used by Transform.
+ *
+ * \sa class Transform, class Rotation2D, class Quaternion, class AngleAxis
+ */
+template <typename Scalar, int Dim>
+EIGEN_DEVICE_FUNC static inline Matrix<Scalar, 2, 2> toRotationMatrix(const Scalar& s) {
+  EIGEN_STATIC_ASSERT(Dim == 2, YOU_MADE_A_PROGRAMMING_MISTAKE)
   return Rotation2D<Scalar>(s).toRotationMatrix();
 }
 
-template<typename Scalar, int Dim, typename OtherDerived>
-static inline Matrix<Scalar,Dim,Dim> toRotationMatrix(const RotationBase<OtherDerived,Dim>& r)
-{
+template <typename Scalar, int Dim, typename OtherDerived>
+EIGEN_DEVICE_FUNC static inline Matrix<Scalar, Dim, Dim> toRotationMatrix(const RotationBase<OtherDerived, Dim>& r) {
   return r.toRotationMatrix();
 }
 
-template<typename Scalar, int Dim, typename OtherDerived>
-static inline const MatrixBase<OtherDerived>& toRotationMatrix(const MatrixBase<OtherDerived>& mat)
-{
-  EIGEN_STATIC_ASSERT(OtherDerived::RowsAtCompileTime==Dim && OtherDerived::ColsAtCompileTime==Dim,
-    YOU_MADE_A_PROGRAMMING_MISTAKE)
+template <typename Scalar, int Dim, typename OtherDerived>
+EIGEN_DEVICE_FUNC static inline const MatrixBase<OtherDerived>& toRotationMatrix(const MatrixBase<OtherDerived>& mat) {
+  EIGEN_STATIC_ASSERT(OtherDerived::RowsAtCompileTime == Dim && OtherDerived::ColsAtCompileTime == Dim,
+                      YOU_MADE_A_PROGRAMMING_MISTAKE)
   return mat;
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_ROTATIONBASE_H
+#endif  // EIGEN_ROTATIONBASE_H
diff --git a/inst/include/Eigen/src/Geometry/Scaling.h b/inst/include/Eigen/src/Geometry/Scaling.h
index 1c25f36f..a0604cee 100644
--- a/inst/include/Eigen/src/Geometry/Scaling.h
+++ b/inst/include/Eigen/src/Geometry/Scaling.h
@@ -10,38 +10,49 @@
 #ifndef EIGEN_SCALING_H
 #define EIGEN_SCALING_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Scaling
-  *
-  * \brief Represents a generic uniform scaling transformation
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients.
-  *
-  * This class represent a uniform scaling transformation. It is the return
-  * type of Scaling(Scalar), and most of the time this is the only way it
-  * is used. In particular, this class is not aimed to be used to store a scaling transformation,
-  * but rather to make easier the constructions and updates of Transform objects.
-  *
-  * To represent an axis aligned scaling, use the DiagonalMatrix class.
-  *
-  * \sa Scaling(), class DiagonalMatrix, MatrixBase::asDiagonal(), class Translation, class Transform
-  */
-template<typename _Scalar>
-class UniformScaling
-{
-public:
-  /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
+ *
+ * \class UniformScaling
+ *
+ * \brief Represents a generic uniform scaling transformation
+ *
+ * \tparam Scalar_ the scalar type, i.e., the type of the coefficients.
+ *
+ * This class represent a uniform scaling transformation. It is the return
+ * type of Scaling(Scalar), and most of the time this is the only way it
+ * is used. In particular, this class is not aimed to be used to store a scaling transformation,
+ * but rather to make easier the constructions and updates of Transform objects.
+ *
+ * To represent an axis aligned scaling, use the DiagonalMatrix class.
+ *
+ * \sa Scaling(), class DiagonalMatrix, MatrixBase::asDiagonal(), class Translation, class Transform
+ */
+
+namespace internal {
+// This helper helps nvcc+MSVC to properly parse this file.
+// See bug 1412.
+template <typename Scalar, int Dim, int Mode>
+struct uniformscaling_times_affine_returntype {
+  enum { NewMode = int(Mode) == int(Isometry) ? Affine : Mode };
+  typedef Transform<Scalar, Dim, NewMode> type;
+};
+}  // namespace internal
 
-protected:
+template <typename Scalar_>
+class UniformScaling {
+ public:
+  /** the scalar type of the coefficients */
+  typedef Scalar_ Scalar;
 
+ protected:
   Scalar m_factor;
 
-public:
-
+ public:
   /** Default constructor without initialization. */
   UniformScaling() {}
   /** Constructs and initialize a uniform scaling transformation */
@@ -51,116 +62,134 @@ class UniformScaling
   inline Scalar& factor() { return m_factor; }
 
   /** Concatenates two uniform scaling */
-  inline UniformScaling operator* (const UniformScaling& other) const
-  { return UniformScaling(m_factor * other.factor()); }
+  inline UniformScaling operator*(const UniformScaling& other) const {
+    return UniformScaling(m_factor * other.factor());
+  }
 
   /** Concatenates a uniform scaling and a translation */
-  template<int Dim>
-  inline Transform<Scalar,Dim,Affine> operator* (const Translation<Scalar,Dim>& t) const;
+  template <int Dim>
+  inline Transform<Scalar, Dim, Affine> operator*(const Translation<Scalar, Dim>& t) const;
 
   /** Concatenates a uniform scaling and an affine transformation */
-  template<int Dim, int Mode, int Options>
-  inline Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> operator* (const Transform<Scalar,Dim, Mode, Options>& t) const
-  {
-   Transform<Scalar,Dim,(int(Mode)==int(Isometry)?Affine:Mode)> res = t;
-   res.prescale(factor());
-   return res;
-}
+  template <int Dim, int Mode, int Options>
+  inline typename internal::uniformscaling_times_affine_returntype<Scalar, Dim, Mode>::type operator*(
+      const Transform<Scalar, Dim, Mode, Options>& t) const {
+    typename internal::uniformscaling_times_affine_returntype<Scalar, Dim, Mode>::type res = t;
+    res.prescale(factor());
+    return res;
+  }
 
   /** Concatenates a uniform scaling and a linear transformation matrix */
   // TODO returns an expression
-  template<typename Derived>
-  inline typename internal::plain_matrix_type<Derived>::type operator* (const MatrixBase<Derived>& other) const
-  { return other * m_factor; }
+  template <typename Derived>
+  inline typename Eigen::internal::plain_matrix_type<Derived>::type operator*(const MatrixBase<Derived>& other) const {
+    return other * m_factor;
+  }
 
-  template<typename Derived,int Dim>
-  inline Matrix<Scalar,Dim,Dim> operator*(const RotationBase<Derived,Dim>& r) const
-  { return r.toRotationMatrix() * m_factor; }
+  template <typename Derived, int Dim>
+  inline Matrix<Scalar, Dim, Dim> operator*(const RotationBase<Derived, Dim>& r) const {
+    return r.toRotationMatrix() * m_factor;
+  }
 
   /** \returns the inverse scaling */
-  inline UniformScaling inverse() const
-  { return UniformScaling(Scalar(1)/m_factor); }
+  inline UniformScaling inverse() const { return UniformScaling(Scalar(1) / m_factor); }
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline UniformScaling<NewScalarType> cast() const
-  { return UniformScaling<NewScalarType>(NewScalarType(m_factor)); }
+   *
+   * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+   * then this function smartly returns a const reference to \c *this.
+   */
+  template <typename NewScalarType>
+  inline UniformScaling<NewScalarType> cast() const {
+    return UniformScaling<NewScalarType>(NewScalarType(m_factor));
+  }
 
   /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit UniformScaling(const UniformScaling<OtherScalarType>& other)
-  { m_factor = Scalar(other.factor()); }
+  template <typename OtherScalarType>
+  inline explicit UniformScaling(const UniformScaling<OtherScalarType>& other) {
+    m_factor = Scalar(other.factor());
+  }
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const UniformScaling& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
-  { return internal::isApprox(m_factor, other.factor(), prec); }
-
+   * determined by \a prec.
+   *
+   * \sa MatrixBase::isApprox() */
+  bool isApprox(const UniformScaling& other,
+                const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const {
+    return internal::isApprox(m_factor, other.factor(), prec);
+  }
 };
 
-/** Concatenates a linear transformation matrix and a uniform scaling */
-// NOTE this operator is defiend in MatrixBase and not as a friend function
+/** \addtogroup Geometry_Module */
+//@{
+
+/** Concatenates a linear transformation matrix and a uniform scaling
+ * \relates UniformScaling
+ */
+// NOTE this operator is defined in MatrixBase and not as a friend function
 // of UniformScaling to fix an internal crash of Intel's ICC
-template<typename Derived> typename MatrixBase<Derived>::ScalarMultipleReturnType
-MatrixBase<Derived>::operator*(const UniformScaling<Scalar>& s) const
-{ return derived() * s.factor(); }
+template <typename Derived, typename Scalar>
+EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(Derived, Scalar, product)
+operator*(const MatrixBase<Derived>& matrix, const UniformScaling<Scalar>& s) {
+  return matrix.derived() * s.factor();
+}
 
 /** Constructs a uniform scaling from scale factor \a s */
-static inline UniformScaling<float> Scaling(float s) { return UniformScaling<float>(s); }
+inline UniformScaling<float> Scaling(float s) { return UniformScaling<float>(s); }
 /** Constructs a uniform scaling from scale factor \a s */
-static inline UniformScaling<double> Scaling(double s) { return UniformScaling<double>(s); }
+inline UniformScaling<double> Scaling(double s) { return UniformScaling<double>(s); }
 /** Constructs a uniform scaling from scale factor \a s */
-template<typename RealScalar>
-static inline UniformScaling<std::complex<RealScalar> > Scaling(const std::complex<RealScalar>& s)
-{ return UniformScaling<std::complex<RealScalar> >(s); }
+template <typename RealScalar>
+inline UniformScaling<std::complex<RealScalar> > Scaling(const std::complex<RealScalar>& s) {
+  return UniformScaling<std::complex<RealScalar> >(s);
+}
 
 /** Constructs a 2D axis aligned scaling */
-template<typename Scalar>
-static inline DiagonalMatrix<Scalar,2> Scaling(const Scalar& sx, const Scalar& sy)
-{ return DiagonalMatrix<Scalar,2>(sx, sy); }
+template <typename Scalar>
+inline DiagonalMatrix<Scalar, 2> Scaling(const Scalar& sx, const Scalar& sy) {
+  return DiagonalMatrix<Scalar, 2>(sx, sy);
+}
 /** Constructs a 3D axis aligned scaling */
-template<typename Scalar>
-static inline DiagonalMatrix<Scalar,3> Scaling(const Scalar& sx, const Scalar& sy, const Scalar& sz)
-{ return DiagonalMatrix<Scalar,3>(sx, sy, sz); }
+template <typename Scalar>
+inline DiagonalMatrix<Scalar, 3> Scaling(const Scalar& sx, const Scalar& sy, const Scalar& sz) {
+  return DiagonalMatrix<Scalar, 3>(sx, sy, sz);
+}
 
 /** Constructs an axis aligned scaling expression from vector expression \a coeffs
-  * This is an alias for coeffs.asDiagonal()
-  */
-template<typename Derived>
-static inline const DiagonalWrapper<const Derived> Scaling(const MatrixBase<Derived>& coeffs)
-{ return coeffs.asDiagonal(); }
+ * This is an alias for coeffs.asDiagonal()
+ */
+template <typename Derived>
+inline const DiagonalWrapper<const Derived> Scaling(const MatrixBase<Derived>& coeffs) {
+  return coeffs.asDiagonal();
+}
+
+/** Constructs an axis aligned scaling expression from vector \a coeffs when passed as an rvalue reference */
+template <typename Derived>
+inline typename DiagonalWrapper<const Derived>::PlainObject Scaling(MatrixBase<Derived>&& coeffs) {
+  return typename DiagonalWrapper<const Derived>::PlainObject(std::move(coeffs.derived()));
+}
 
-/** \addtogroup Geometry_Module */
-//@{
 /** \deprecated */
 typedef DiagonalMatrix<float, 2> AlignedScaling2f;
 /** \deprecated */
-typedef DiagonalMatrix<double,2> AlignedScaling2d;
+typedef DiagonalMatrix<double, 2> AlignedScaling2d;
 /** \deprecated */
 typedef DiagonalMatrix<float, 3> AlignedScaling3f;
 /** \deprecated */
-typedef DiagonalMatrix<double,3> AlignedScaling3d;
+typedef DiagonalMatrix<double, 3> AlignedScaling3d;
 //@}
 
-template<typename Scalar>
-template<int Dim>
-inline Transform<Scalar,Dim,Affine>
-UniformScaling<Scalar>::operator* (const Translation<Scalar,Dim>& t) const
-{
-  Transform<Scalar,Dim,Affine> res;
+template <typename Scalar>
+template <int Dim>
+inline Transform<Scalar, Dim, Affine> UniformScaling<Scalar>::operator*(const Translation<Scalar, Dim>& t) const {
+  Transform<Scalar, Dim, Affine> res;
   res.matrix().setZero();
   res.linear().diagonal().fill(factor());
   res.translation() = factor() * t.vector();
-  res(Dim,Dim) = Scalar(1);
+  res(Dim, Dim) = Scalar(1);
   return res;
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SCALING_H
+#endif  // EIGEN_SCALING_H
diff --git a/inst/include/Eigen/src/Geometry/Transform.h b/inst/include/Eigen/src/Geometry/Transform.h
index e786e535..a5d7b608 100644
--- a/inst/include/Eigen/src/Geometry/Transform.h
+++ b/inst/include/Eigen/src/Geometry/Transform.h
@@ -12,320 +12,316 @@
 #ifndef EIGEN_TRANSFORM_H
 #define EIGEN_TRANSFORM_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
-template<typename Transform>
-struct transform_traits
-{
-  enum
-  {
+template <typename Transform>
+struct transform_traits {
+  enum {
     Dim = Transform::Dim,
     HDim = Transform::HDim,
     Mode = Transform::Mode,
-    IsProjective = (int(Mode)==int(Projective))
+    IsProjective = (int(Mode) == int(Projective))
   };
 };
 
-template< typename TransformType,
-          typename MatrixType,
-          int Case = transform_traits<TransformType>::IsProjective ? 0
-                   : int(MatrixType::RowsAtCompileTime) == int(transform_traits<TransformType>::HDim) ? 1
-                   : 2>
+template <typename TransformType, typename MatrixType,
+          int Case = transform_traits<TransformType>::IsProjective                                      ? 0
+                     : int(MatrixType::RowsAtCompileTime) == int(transform_traits<TransformType>::HDim) ? 1
+                                                                                                        : 2,
+          int RhsCols = MatrixType::ColsAtCompileTime>
 struct transform_right_product_impl;
 
-template< typename Other,
-          int Mode,
-          int Options,
-          int Dim,
-          int HDim,
-          int OtherRows=Other::RowsAtCompileTime,
-          int OtherCols=Other::ColsAtCompileTime>
+template <typename Other, int Mode, int Options, int Dim, int HDim, int OtherRows = Other::RowsAtCompileTime,
+          int OtherCols = Other::ColsAtCompileTime>
 struct transform_left_product_impl;
 
-template< typename Lhs,
-          typename Rhs,
-          bool AnyProjective = 
-            transform_traits<Lhs>::IsProjective ||
-            transform_traits<Rhs>::IsProjective>
+template <typename Lhs, typename Rhs,
+          bool AnyProjective = transform_traits<Lhs>::IsProjective || transform_traits<Rhs>::IsProjective>
 struct transform_transform_product_impl;
 
-template< typename Other,
-          int Mode,
-          int Options,
-          int Dim,
-          int HDim,
-          int OtherRows=Other::RowsAtCompileTime,
-          int OtherCols=Other::ColsAtCompileTime>
+template <typename Other, int Mode, int Options, int Dim, int HDim, int OtherRows = Other::RowsAtCompileTime,
+          int OtherCols = Other::ColsAtCompileTime>
 struct transform_construct_from_matrix;
 
-template<typename TransformType> struct transform_take_affine_part;
+template <typename TransformType>
+struct transform_take_affine_part;
+
+template <typename Scalar_, int Dim_, int Mode_, int Options_>
+struct traits<Transform<Scalar_, Dim_, Mode_, Options_> > {
+  typedef Scalar_ Scalar;
+  typedef Eigen::Index StorageIndex;
+  typedef Dense StorageKind;
+  enum {
+    Dim1 = Dim_ == Dynamic ? Dim_ : Dim_ + 1,
+    RowsAtCompileTime = Mode_ == Projective ? Dim1 : Dim_,
+    ColsAtCompileTime = Dim1,
+    MaxRowsAtCompileTime = RowsAtCompileTime,
+    MaxColsAtCompileTime = ColsAtCompileTime,
+    Flags = 0
+  };
+};
 
-template<int Mode> struct transform_make_affine;
+template <int Mode>
+struct transform_make_affine;
 
-} // end namespace internal
+}  // end namespace internal
 
 /** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Transform
-  *
-  * \brief Represents an homogeneous transformation in a N dimensional space
-  *
-  * \tparam _Scalar the scalar type, i.e., the type of the coefficients
-  * \tparam _Dim the dimension of the space
-  * \tparam _Mode the type of the transformation. Can be:
-  *              - #Affine: the transformation is stored as a (Dim+1)^2 matrix,
-  *                         where the last row is assumed to be [0 ... 0 1].
-  *              - #AffineCompact: the transformation is stored as a (Dim)x(Dim+1) matrix.
-  *              - #Projective: the transformation is stored as a (Dim+1)^2 matrix
-  *                             without any assumption.
-  * \tparam _Options has the same meaning as in class Matrix. It allows to specify DontAlign and/or RowMajor.
-  *                  These Options are passed directly to the underlying matrix type.
-  *
-  * The homography is internally represented and stored by a matrix which
-  * is available through the matrix() method. To understand the behavior of
-  * this class you have to think a Transform object as its internal
-  * matrix representation. The chosen convention is right multiply:
-  *
-  * \code v' = T * v \endcode
-  *
-  * Therefore, an affine transformation matrix M is shaped like this:
-  *
-  * \f$ \left( \begin{array}{cc}
-  * linear & translation\\
-  * 0 ... 0 & 1
-  * \end{array} \right) \f$
-  *
-  * Note that for a projective transformation the last row can be anything,
-  * and then the interpretation of different parts might be sightly different.
-  *
-  * However, unlike a plain matrix, the Transform class provides many features
-  * simplifying both its assembly and usage. In particular, it can be composed
-  * with any other transformations (Transform,Translation,RotationBase,Matrix)
-  * and can be directly used to transform implicit homogeneous vectors. All these
-  * operations are handled via the operator*. For the composition of transformations,
-  * its principle consists to first convert the right/left hand sides of the product
-  * to a compatible (Dim+1)^2 matrix and then perform a pure matrix product.
-  * Of course, internally, operator* tries to perform the minimal number of operations
-  * according to the nature of each terms. Likewise, when applying the transform
-  * to non homogeneous vectors, the latters are automatically promoted to homogeneous
-  * one before doing the matrix product. The convertions to homogeneous representations
-  * are performed as follow:
-  *
-  * \b Translation t (Dim)x(1):
-  * \f$ \left( \begin{array}{cc}
-  * I & t \\
-  * 0\,...\,0 & 1
-  * \end{array} \right) \f$
-  *
-  * \b Rotation R (Dim)x(Dim):
-  * \f$ \left( \begin{array}{cc}
-  * R & 0\\
-  * 0\,...\,0 & 1
-  * \end{array} \right) \f$
-  *
-  * \b Linear \b Matrix L (Dim)x(Dim):
-  * \f$ \left( \begin{array}{cc}
-  * L & 0\\
-  * 0\,...\,0 & 1
-  * \end{array} \right) \f$
-  *
-  * \b Affine \b Matrix A (Dim)x(Dim+1):
-  * \f$ \left( \begin{array}{c}
-  * A\\
-  * 0\,...\,0\,1
-  * \end{array} \right) \f$
-  *
-  * \b Column \b vector v (Dim)x(1):
-  * \f$ \left( \begin{array}{c}
-  * v\\
-  * 1
-  * \end{array} \right) \f$
-  *
-  * \b Set \b of \b column \b vectors V1...Vn (Dim)x(n):
-  * \f$ \left( \begin{array}{ccc}
-  * v_1 & ... & v_n\\
-  * 1 & ... & 1
-  * \end{array} \right) \f$
-  *
-  * The concatenation of a Transform object with any kind of other transformation
-  * always returns a Transform object.
-  *
-  * A little exception to the "as pure matrix product" rule is the case of the
-  * transformation of non homogeneous vectors by an affine transformation. In
-  * that case the last matrix row can be ignored, and the product returns non
-  * homogeneous vectors.
-  *
-  * Since, for instance, a Dim x Dim matrix is interpreted as a linear transformation,
-  * it is not possible to directly transform Dim vectors stored in a Dim x Dim matrix.
-  * The solution is either to use a Dim x Dynamic matrix or explicitly request a
-  * vector transformation by making the vector homogeneous:
-  * \code
-  * m' = T * m.colwise().homogeneous();
-  * \endcode
-  * Note that there is zero overhead.
-  *
-  * Conversion methods from/to Qt's QMatrix and QTransform are available if the
-  * preprocessor token EIGEN_QT_SUPPORT is defined.
-  *
-  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_TRANSFORM_PLUGIN.
-  *
-  * \sa class Matrix, class Quaternion
-  */
-template<typename _Scalar, int _Dim, int _Mode, int _Options>
-class Transform
-{
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Dim==Dynamic ? Dynamic : (_Dim+1)*(_Dim+1))
+ *
+ * \class Transform
+ *
+ * \brief Represents an homogeneous transformation in a N dimensional space
+ *
+ * \tparam Scalar_ the scalar type, i.e., the type of the coefficients
+ * \tparam Dim_ the dimension of the space
+ * \tparam Mode_ the type of the transformation. Can be:
+ *              - #Affine: the transformation is stored as a (Dim+1)^2 matrix,
+ *                         where the last row is assumed to be [0 ... 0 1].
+ *              - #AffineCompact: the transformation is stored as a (Dim)x(Dim+1) matrix.
+ *              - #Projective: the transformation is stored as a (Dim+1)^2 matrix
+ *                             without any assumption.
+ *              - #Isometry: same as #Affine with the additional assumption that
+ *                           the linear part represents a rotation. This assumption is exploited
+ *                           to speed up some functions such as inverse() and rotation().
+ * \tparam Options_ has the same meaning as in class Matrix. It allows to specify DontAlign and/or RowMajor.
+ *                  These Options are passed directly to the underlying matrix type.
+ *
+ * The homography is internally represented and stored by a matrix which
+ * is available through the matrix() method. To understand the behavior of
+ * this class you have to think a Transform object as its internal
+ * matrix representation. The chosen convention is right multiply:
+ *
+ * \code v' = T * v \endcode
+ *
+ * Therefore, an affine transformation matrix M is shaped like this:
+ *
+ * \f$ \left( \begin{array}{cc}
+ * linear & translation\\
+ * 0 ... 0 & 1
+ * \end{array} \right) \f$
+ *
+ * Note that for a projective transformation the last row can be anything,
+ * and then the interpretation of different parts might be slightly different.
+ *
+ * However, unlike a plain matrix, the Transform class provides many features
+ * simplifying both its assembly and usage. In particular, it can be composed
+ * with any other transformations (Transform,Translation,RotationBase,DiagonalMatrix)
+ * and can be directly used to transform implicit homogeneous vectors. All these
+ * operations are handled via the operator*. For the composition of transformations,
+ * its principle consists to first convert the right/left hand sides of the product
+ * to a compatible (Dim+1)^2 matrix and then perform a pure matrix product.
+ * Of course, internally, operator* tries to perform the minimal number of operations
+ * according to the nature of each terms. Likewise, when applying the transform
+ * to points, the latters are automatically promoted to homogeneous vectors
+ * before doing the matrix product. The conventions to homogeneous representations
+ * are performed as follow:
+ *
+ * \b Translation t (Dim)x(1):
+ * \f$ \left( \begin{array}{cc}
+ * I & t \\
+ * 0\,...\,0 & 1
+ * \end{array} \right) \f$
+ *
+ * \b Rotation R (Dim)x(Dim):
+ * \f$ \left( \begin{array}{cc}
+ * R & 0\\
+ * 0\,...\,0 & 1
+ * \end{array} \right) \f$
+ *<!--
+ * \b Linear \b Matrix L (Dim)x(Dim):
+ * \f$ \left( \begin{array}{cc}
+ * L & 0\\
+ * 0\,...\,0 & 1
+ * \end{array} \right) \f$
+ *
+ * \b Affine \b Matrix A (Dim)x(Dim+1):
+ * \f$ \left( \begin{array}{c}
+ * A\\
+ * 0\,...\,0\,1
+ * \end{array} \right) \f$
+ *-->
+ * \b Scaling \b DiagonalMatrix S (Dim)x(Dim):
+ * \f$ \left( \begin{array}{cc}
+ * S & 0\\
+ * 0\,...\,0 & 1
+ * \end{array} \right) \f$
+ *
+ * \b Column \b point v (Dim)x(1):
+ * \f$ \left( \begin{array}{c}
+ * v\\
+ * 1
+ * \end{array} \right) \f$
+ *
+ * \b Set \b of \b column \b points V1...Vn (Dim)x(n):
+ * \f$ \left( \begin{array}{ccc}
+ * v_1 & ... & v_n\\
+ * 1 & ... & 1
+ * \end{array} \right) \f$
+ *
+ * The concatenation of a Transform object with any kind of other transformation
+ * always returns a Transform object.
+ *
+ * A little exception to the "as pure matrix product" rule is the case of the
+ * transformation of non homogeneous vectors by an affine transformation. In
+ * that case the last matrix row can be ignored, and the product returns non
+ * homogeneous vectors.
+ *
+ * Since, for instance, a Dim x Dim matrix is interpreted as a linear transformation,
+ * it is not possible to directly transform Dim vectors stored in a Dim x Dim matrix.
+ * The solution is either to use a Dim x Dynamic matrix or explicitly request a
+ * vector transformation by making the vector homogeneous:
+ * \code
+ * m' = T * m.colwise().homogeneous();
+ * \endcode
+ * Note that there is zero overhead.
+ *
+ * Conversion methods from/to Qt's QMatrix and QTransform are available if the
+ * preprocessor token EIGEN_QT_SUPPORT is defined.
+ *
+ * This class can be extended with the help of the plugin mechanism described on the page
+ * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TRANSFORM_PLUGIN.
+ *
+ * \sa class Matrix, class Quaternion
+ */
+template <typename Scalar_, int Dim_, int Mode_, int Options_>
+class Transform {
+ public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar_,
+                                                             Dim_ == Dynamic ? Dynamic : (Dim_ + 1) * (Dim_ + 1))
   enum {
-    Mode = _Mode,
-    Options = _Options,
-    Dim = _Dim,     ///< space dimension in which the transformation holds
-    HDim = _Dim+1,  ///< size of a respective homogeneous vector
-    Rows = int(Mode)==(AffineCompact) ? Dim : HDim
+    Mode = Mode_,
+    Options = Options_,
+    Dim = Dim_,       ///< space dimension in which the transformation holds
+    HDim = Dim_ + 1,  ///< size of a respective homogeneous vector
+    Rows = int(Mode) == (AffineCompact) ? Dim : HDim
   };
   /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
-  typedef DenseIndex Index;
+  typedef Scalar_ Scalar;
+  typedef Eigen::Index StorageIndex;
+  typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
   /** type of the matrix used to represent the transformation */
-  typedef typename internal::make_proper_matrix_type<Scalar,Rows,HDim,Options>::type MatrixType;
+  typedef typename internal::make_proper_matrix_type<Scalar, Rows, HDim, Options>::type MatrixType;
   /** constified MatrixType */
   typedef const MatrixType ConstMatrixType;
   /** type of the matrix used to represent the linear part of the transformation */
-  typedef Matrix<Scalar,Dim,Dim,Options> LinearMatrixType;
+  typedef Matrix<Scalar, Dim, Dim, Options> LinearMatrixType;
   /** type of read/write reference to the linear part of the transformation */
-  typedef Block<MatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (Options&RowMajor)==0> LinearPart;
+  typedef Block<MatrixType, Dim, Dim, int(Mode) == (AffineCompact) && (int(Options) & RowMajor) == 0> LinearPart;
   /** type of read reference to the linear part of the transformation */
-  typedef const Block<ConstMatrixType,Dim,Dim,int(Mode)==(AffineCompact) && (Options&RowMajor)==0> ConstLinearPart;
+  typedef const Block<ConstMatrixType, Dim, Dim, int(Mode) == (AffineCompact) && (int(Options) & RowMajor) == 0>
+      ConstLinearPart;
   /** type of read/write reference to the affine part of the transformation */
-  typedef typename internal::conditional<int(Mode)==int(AffineCompact),
-                              MatrixType&,
-                              Block<MatrixType,Dim,HDim> >::type AffinePart;
+  typedef std::conditional_t<int(Mode) == int(AffineCompact), MatrixType&, Block<MatrixType, Dim, HDim> > AffinePart;
   /** type of read reference to the affine part of the transformation */
-  typedef typename internal::conditional<int(Mode)==int(AffineCompact),
-                              const MatrixType&,
-                              const Block<const MatrixType,Dim,HDim> >::type ConstAffinePart;
+  typedef std::conditional_t<int(Mode) == int(AffineCompact), const MatrixType&,
+                             const Block<const MatrixType, Dim, HDim> >
+      ConstAffinePart;
   /** type of a vector */
-  typedef Matrix<Scalar,Dim,1> VectorType;
+  typedef Matrix<Scalar, Dim, 1> VectorType;
   /** type of a read/write reference to the translation part of the rotation */
-  typedef Block<MatrixType,Dim,1,int(Mode)==(AffineCompact)> TranslationPart;
+  typedef Block<MatrixType, Dim, 1, !(internal::traits<MatrixType>::Flags & RowMajorBit)> TranslationPart;
   /** type of a read reference to the translation part of the rotation */
-  typedef const Block<ConstMatrixType,Dim,1,int(Mode)==(AffineCompact)> ConstTranslationPart;
+  typedef const Block<ConstMatrixType, Dim, 1, !(internal::traits<MatrixType>::Flags & RowMajorBit)>
+      ConstTranslationPart;
   /** corresponding translation type */
-  typedef Translation<Scalar,Dim> TranslationType;
-  
+  typedef Translation<Scalar, Dim> TranslationType;
+
   // this intermediate enum is needed to avoid an ICE with gcc 3.4 and 4.0
-  enum { TransformTimeDiagonalMode = ((Mode==int(Isometry))?Affine:int(Mode)) };
+  enum { TransformTimeDiagonalMode = ((Mode == int(Isometry)) ? Affine : int(Mode)) };
   /** The return type of the product between a diagonal matrix and a transform */
-  typedef Transform<Scalar,Dim,TransformTimeDiagonalMode> TransformTimeDiagonalReturnType;
-
-protected:
+  typedef Transform<Scalar, Dim, TransformTimeDiagonalMode> TransformTimeDiagonalReturnType;
 
+ protected:
   MatrixType m_matrix;
 
-public:
-
+ public:
   /** Default constructor without initialization of the meaningful coefficients.
-    * If Mode==Affine, then the last row is set to [0 ... 0 1] */
-  inline Transform()
-  {
+   * If Mode==Affine or Mode==Isometry, then the last row is set to [0 ... 0 1] */
+  EIGEN_DEVICE_FUNC inline Transform() {
     check_template_params();
-    internal::transform_make_affine<(int(Mode)==Affine) ? Affine : AffineCompact>::run(m_matrix);
+    internal::transform_make_affine<(int(Mode) == Affine || int(Mode) == Isometry) ? Affine : AffineCompact>::run(
+        m_matrix);
   }
 
-  inline Transform(const Transform& other)
-  {
-    check_template_params();
-    m_matrix = other.m_matrix;
-  }
-
-  inline explicit Transform(const TranslationType& t)
-  {
+  EIGEN_DEVICE_FUNC inline explicit Transform(const TranslationType& t) {
     check_template_params();
     *this = t;
   }
-  inline explicit Transform(const UniformScaling<Scalar>& s)
-  {
+  EIGEN_DEVICE_FUNC inline explicit Transform(const UniformScaling<Scalar>& s) {
     check_template_params();
     *this = s;
   }
-  template<typename Derived>
-  inline explicit Transform(const RotationBase<Derived, Dim>& r)
-  {
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline explicit Transform(const RotationBase<Derived, Dim>& r) {
     check_template_params();
     *this = r;
   }
 
-  inline Transform& operator=(const Transform& other)
-  { m_matrix = other.m_matrix; return *this; }
-
   typedef internal::transform_take_affine_part<Transform> take_affine_part;
 
   /** Constructs and initializes a transformation from a Dim^2 or a (Dim+1)^2 matrix. */
-  template<typename OtherDerived>
-  inline explicit Transform(const EigenBase<OtherDerived>& other)
-  {
-    EIGEN_STATIC_ASSERT((internal::is_same<Scalar,typename OtherDerived::Scalar>::value),
-      YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY);
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC inline explicit Transform(const EigenBase<OtherDerived>& other) {
+    EIGEN_STATIC_ASSERT(
+        (internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
+        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY);
 
     check_template_params();
-    internal::transform_construct_from_matrix<OtherDerived,Mode,Options,Dim,HDim>::run(this, other.derived());
+    internal::transform_construct_from_matrix<OtherDerived, Mode, Options, Dim, HDim>::run(this, other.derived());
   }
 
   /** Set \c *this from a Dim^2 or (Dim+1)^2 matrix. */
-  template<typename OtherDerived>
-  inline Transform& operator=(const EigenBase<OtherDerived>& other)
-  {
-    EIGEN_STATIC_ASSERT((internal::is_same<Scalar,typename OtherDerived::Scalar>::value),
-      YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY);
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC inline Transform& operator=(const EigenBase<OtherDerived>& other) {
+    EIGEN_STATIC_ASSERT(
+        (internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
+        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY);
 
-    internal::transform_construct_from_matrix<OtherDerived,Mode,Options,Dim,HDim>::run(this, other.derived());
+    internal::transform_construct_from_matrix<OtherDerived, Mode, Options, Dim, HDim>::run(this, other.derived());
     return *this;
   }
-  
-  template<int OtherOptions>
-  inline Transform(const Transform<Scalar,Dim,Mode,OtherOptions>& other)
-  {
+
+  template <int OtherOptions>
+  EIGEN_DEVICE_FUNC inline Transform(const Transform<Scalar, Dim, Mode, OtherOptions>& other) {
     check_template_params();
     // only the options change, we can directly copy the matrices
     m_matrix = other.matrix();
   }
 
-  template<int OtherMode,int OtherOptions>
-  inline Transform(const Transform<Scalar,Dim,OtherMode,OtherOptions>& other)
-  {
+  template <int OtherMode, int OtherOptions>
+  EIGEN_DEVICE_FUNC inline Transform(const Transform<Scalar, Dim, OtherMode, OtherOptions>& other) {
     check_template_params();
     // prevent conversions as:
     // Affine | AffineCompact | Isometry = Projective
-    EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(OtherMode==int(Projective), Mode==int(Projective)),
+    EIGEN_STATIC_ASSERT(internal::check_implication(OtherMode == int(Projective), Mode == int(Projective)),
                         YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION)
 
     // prevent conversions as:
     // Isometry = Affine | AffineCompact
-    EIGEN_STATIC_ASSERT(EIGEN_IMPLIES(OtherMode==int(Affine)||OtherMode==int(AffineCompact), Mode!=int(Isometry)),
-                        YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION)
+    EIGEN_STATIC_ASSERT(
+        internal::check_implication(OtherMode == int(Affine) || OtherMode == int(AffineCompact), Mode != int(Isometry)),
+        YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION)
 
-    enum { ModeIsAffineCompact = Mode == int(AffineCompact),
-           OtherModeIsAffineCompact = OtherMode == int(AffineCompact)
+    enum {
+      ModeIsAffineCompact = Mode == int(AffineCompact),
+      OtherModeIsAffineCompact = OtherMode == int(AffineCompact)
     };
 
-    if(ModeIsAffineCompact == OtherModeIsAffineCompact)
-    {
+    if (EIGEN_CONST_CONDITIONAL(ModeIsAffineCompact == OtherModeIsAffineCompact)) {
       // We need the block expression because the code is compiled for all
       // combinations of transformations and will trigger a compile time error
       // if one tries to assign the matrices directly
-      m_matrix.template block<Dim,Dim+1>(0,0) = other.matrix().template block<Dim,Dim+1>(0,0);
+      m_matrix.template block<Dim, Dim + 1>(0, 0) = other.matrix().template block<Dim, Dim + 1>(0, 0);
       makeAffine();
-    }
-    else if(OtherModeIsAffineCompact)
-    {
-      typedef typename Transform<Scalar,Dim,OtherMode,OtherOptions>::MatrixType OtherMatrixType;
-      internal::transform_construct_from_matrix<OtherMatrixType,Mode,Options,Dim,HDim>::run(this, other.matrix());
-    }
-    else
-    {
+    } else if (EIGEN_CONST_CONDITIONAL(OtherModeIsAffineCompact)) {
+      typedef typename Transform<Scalar, Dim, OtherMode, OtherOptions>::MatrixType OtherMatrixType;
+      internal::transform_construct_from_matrix<OtherMatrixType, Mode, Options, Dim, HDim>::run(this, other.matrix());
+    } else {
       // here we know that Mode == AffineCompact and OtherMode != AffineCompact.
       // if OtherMode were Projective, the static assert above would already have caught it.
       // So the only possibility is that OtherMode == Affine
@@ -334,436 +330,465 @@ class Transform
     }
   }
 
-  template<typename OtherDerived>
-  Transform(const ReturnByValue<OtherDerived>& other)
-  {
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC Transform(const ReturnByValue<OtherDerived>& other) {
     check_template_params();
     other.evalTo(*this);
   }
 
-  template<typename OtherDerived>
-  Transform& operator=(const ReturnByValue<OtherDerived>& other)
-  {
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC Transform& operator=(const ReturnByValue<OtherDerived>& other) {
     other.evalTo(*this);
     return *this;
   }
 
-  #ifdef EIGEN_QT_SUPPORT
+#ifdef EIGEN_QT_SUPPORT
+#if (QT_VERSION < QT_VERSION_CHECK(6, 0, 0))
   inline Transform(const QMatrix& other);
   inline Transform& operator=(const QMatrix& other);
   inline QMatrix toQMatrix(void) const;
+#endif
   inline Transform(const QTransform& other);
   inline Transform& operator=(const QTransform& other);
   inline QTransform toQTransform(void) const;
-  #endif
+#endif
+
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept {
+    return int(Mode) == int(Projective) ? m_matrix.cols() : (m_matrix.cols() - 1);
+  }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_matrix.cols(); }
 
   /** shortcut for m_matrix(row,col);
-    * \sa MatrixBase::operator(Index,Index) const */
-  inline Scalar operator() (Index row, Index col) const { return m_matrix(row,col); }
+   * \sa MatrixBase::operator(Index,Index) const */
+  EIGEN_DEVICE_FUNC inline Scalar operator()(Index row, Index col) const { return m_matrix(row, col); }
   /** shortcut for m_matrix(row,col);
-    * \sa MatrixBase::operator(Index,Index) */
-  inline Scalar& operator() (Index row, Index col) { return m_matrix(row,col); }
+   * \sa MatrixBase::operator(Index,Index) */
+  EIGEN_DEVICE_FUNC inline Scalar& operator()(Index row, Index col) { return m_matrix(row, col); }
 
   /** \returns a read-only expression of the transformation matrix */
-  inline const MatrixType& matrix() const { return m_matrix; }
+  EIGEN_DEVICE_FUNC inline const MatrixType& matrix() const { return m_matrix; }
   /** \returns a writable expression of the transformation matrix */
-  inline MatrixType& matrix() { return m_matrix; }
+  EIGEN_DEVICE_FUNC inline MatrixType& matrix() { return m_matrix; }
 
   /** \returns a read-only expression of the linear part of the transformation */
-  inline ConstLinearPart linear() const { return ConstLinearPart(m_matrix,0,0); }
+  EIGEN_DEVICE_FUNC inline ConstLinearPart linear() const { return ConstLinearPart(m_matrix, 0, 0); }
   /** \returns a writable expression of the linear part of the transformation */
-  inline LinearPart linear() { return LinearPart(m_matrix,0,0); }
+  EIGEN_DEVICE_FUNC inline LinearPart linear() { return LinearPart(m_matrix, 0, 0); }
 
   /** \returns a read-only expression of the Dim x HDim affine part of the transformation */
-  inline ConstAffinePart affine() const { return take_affine_part::run(m_matrix); }
+  EIGEN_DEVICE_FUNC inline ConstAffinePart affine() const { return take_affine_part::run(m_matrix); }
   /** \returns a writable expression of the Dim x HDim affine part of the transformation */
-  inline AffinePart affine() { return take_affine_part::run(m_matrix); }
+  EIGEN_DEVICE_FUNC inline AffinePart affine() { return take_affine_part::run(m_matrix); }
 
   /** \returns a read-only expression of the translation vector of the transformation */
-  inline ConstTranslationPart translation() const { return ConstTranslationPart(m_matrix,0,Dim); }
+  EIGEN_DEVICE_FUNC inline ConstTranslationPart translation() const { return ConstTranslationPart(m_matrix, 0, Dim); }
   /** \returns a writable expression of the translation vector of the transformation */
-  inline TranslationPart translation() { return TranslationPart(m_matrix,0,Dim); }
-
-  /** \returns an expression of the product between the transform \c *this and a matrix expression \a other
-    *
-    * The right hand side \a other might be either:
-    * \li a vector of size Dim,
-    * \li an homogeneous vector of size Dim+1,
-    * \li a set of vectors of size Dim x Dynamic,
-    * \li a set of homogeneous vectors of size Dim+1 x Dynamic,
-    * \li a linear transformation matrix of size Dim x Dim,
-    * \li an affine transformation matrix of size Dim x Dim+1,
-    * \li a transformation matrix of size Dim+1 x Dim+1.
-    */
+  EIGEN_DEVICE_FUNC inline TranslationPart translation() { return TranslationPart(m_matrix, 0, Dim); }
+
+  /** \returns an expression of the product between the transform \c *this and a matrix expression \a other.
+   *
+   * The right-hand-side \a other can be either:
+   * \li an homogeneous vector of size Dim+1,
+   * \li a set of homogeneous vectors of size Dim+1 x N,
+   * \li a transformation matrix of size Dim+1 x Dim+1.
+   *
+   * Moreover, if \c *this represents an affine transformation (i.e., Mode!=Projective), then \a other can also be:
+   * \li a point of size Dim (computes: \code this->linear() * other + this->translation()\endcode),
+   * \li a set of N points as a Dim x N matrix (computes: \code (this->linear() * other).colwise() +
+   * this->translation()\endcode),
+   *
+   * In all cases, the return type is a matrix or vector of same sizes as the right-hand-side \a other.
+   *
+   * If you want to interpret \a other as a linear or affine transformation, then first convert it to a Transform<>
+   * type, or do your own cooking.
+   *
+   * Finally, if you want to apply Affine transformations to vectors, then explicitly apply the linear part only:
+   * \code
+   * Affine3f A;
+   * Vector3f v1, v2;
+   * v2 = A.linear() * v1;
+   * \endcode
+   *
+   */
   // note: this function is defined here because some compilers cannot find the respective declaration
-  template<typename OtherDerived>
-  EIGEN_STRONG_INLINE const typename internal::transform_right_product_impl<Transform, OtherDerived>::ResultType
-  operator * (const EigenBase<OtherDerived> &other) const
-  { return internal::transform_right_product_impl<Transform, OtherDerived>::run(*this,other.derived()); }
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename internal::transform_right_product_impl<Transform,
+                                                                                              OtherDerived>::ResultType
+  operator*(const EigenBase<OtherDerived>& other) const {
+    return internal::transform_right_product_impl<Transform, OtherDerived>::run(*this, other.derived());
+  }
 
   /** \returns the product expression of a transformation matrix \a a times a transform \a b
-    *
-    * The left hand side \a other might be either:
-    * \li a linear transformation matrix of size Dim x Dim,
-    * \li an affine transformation matrix of size Dim x Dim+1,
-    * \li a general transformation matrix of size Dim+1 x Dim+1.
-    */
-  template<typename OtherDerived> friend
-  inline const typename internal::transform_left_product_impl<OtherDerived,Mode,Options,_Dim,_Dim+1>::ResultType
-    operator * (const EigenBase<OtherDerived> &a, const Transform &b)
-  { return internal::transform_left_product_impl<OtherDerived,Mode,Options,Dim,HDim>::run(a.derived(),b); }
+   *
+   * The left hand side \a other can be either:
+   * \li a linear transformation matrix of size Dim x Dim,
+   * \li an affine transformation matrix of size Dim x Dim+1,
+   * \li a general transformation matrix of size Dim+1 x Dim+1.
+   */
+  template <typename OtherDerived>
+  friend EIGEN_DEVICE_FUNC inline const typename internal::transform_left_product_impl<OtherDerived, Mode, Options,
+                                                                                       Dim_, Dim_ + 1>::ResultType
+  operator*(const EigenBase<OtherDerived>& a, const Transform& b) {
+    return internal::transform_left_product_impl<OtherDerived, Mode, Options, Dim, HDim>::run(a.derived(), b);
+  }
 
   /** \returns The product expression of a transform \a a times a diagonal matrix \a b
-    *
-    * The rhs diagonal matrix is interpreted as an affine scaling transformation. The
-    * product results in a Transform of the same type (mode) as the lhs only if the lhs 
-    * mode is no isometry. In that case, the returned transform is an affinity.
-    */
-  template<typename DiagonalDerived>
-  inline const TransformTimeDiagonalReturnType
-    operator * (const DiagonalBase<DiagonalDerived> &b) const
-  {
+   *
+   * The rhs diagonal matrix is interpreted as an affine scaling transformation. The
+   * product results in a Transform of the same type (mode) as the lhs only if the lhs
+   * mode is no isometry. In that case, the returned transform is an affinity.
+   */
+  template <typename DiagonalDerived>
+  EIGEN_DEVICE_FUNC inline const TransformTimeDiagonalReturnType operator*(
+      const DiagonalBase<DiagonalDerived>& b) const {
     TransformTimeDiagonalReturnType res(*this);
-    res.linear() *= b;
+    res.linearExt() *= b;
     return res;
   }
 
   /** \returns The product expression of a diagonal matrix \a a times a transform \a b
-    *
-    * The lhs diagonal matrix is interpreted as an affine scaling transformation. The
-    * product results in a Transform of the same type (mode) as the lhs only if the lhs 
-    * mode is no isometry. In that case, the returned transform is an affinity.
-    */
-  template<typename DiagonalDerived>
-  friend inline TransformTimeDiagonalReturnType
-    operator * (const DiagonalBase<DiagonalDerived> &a, const Transform &b)
-  {
+   *
+   * The lhs diagonal matrix is interpreted as an affine scaling transformation. The
+   * product results in a Transform of the same type (mode) as the lhs only if the lhs
+   * mode is no isometry. In that case, the returned transform is an affinity.
+   */
+  template <typename DiagonalDerived>
+  EIGEN_DEVICE_FUNC friend inline TransformTimeDiagonalReturnType operator*(const DiagonalBase<DiagonalDerived>& a,
+                                                                            const Transform& b) {
     TransformTimeDiagonalReturnType res;
-    res.linear().noalias() = a*b.linear();
-    res.translation().noalias() = a*b.translation();
-    if (Mode!=int(AffineCompact))
-      res.matrix().row(Dim) = b.matrix().row(Dim);
+    res.linear().noalias() = a * b.linear();
+    res.translation().noalias() = a * b.translation();
+    if (EIGEN_CONST_CONDITIONAL(Mode != int(AffineCompact))) res.matrix().row(Dim) = b.matrix().row(Dim);
     return res;
   }
 
-  template<typename OtherDerived>
-  inline Transform& operator*=(const EigenBase<OtherDerived>& other) { return *this = *this * other; }
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC inline Transform& operator*=(const EigenBase<OtherDerived>& other) {
+    return *this = *this * other;
+  }
 
   /** Concatenates two transformations */
-  inline const Transform operator * (const Transform& other) const
-  {
-    return internal::transform_transform_product_impl<Transform,Transform>::run(*this,other);
+  EIGEN_DEVICE_FUNC inline const Transform operator*(const Transform& other) const {
+    return internal::transform_transform_product_impl<Transform, Transform>::run(*this, other);
   }
-  
-  #ifdef __INTEL_COMPILER
-private:
+
+#if EIGEN_COMP_ICC
+ private:
   // this intermediate structure permits to workaround a bug in ICC 11:
   //   error: template instantiation resulted in unexpected function type of "Eigen::Transform<double, 3, 32, 0>
   //             (const Eigen::Transform<double, 3, 2, 0> &) const"
   //  (the meaning of a name may have changed since the template declaration -- the type of the template is:
   // "Eigen::internal::transform_transform_product_impl<Eigen::Transform<double, 3, 32, 0>,
-  //     Eigen::Transform<double, 3, Mode, Options>, <expression>>::ResultType (const Eigen::Transform<double, 3, Mode, Options> &) const")
-  // 
-  template<int OtherMode,int OtherOptions> struct icc_11_workaround
-  {
-    typedef internal::transform_transform_product_impl<Transform,Transform<Scalar,Dim,OtherMode,OtherOptions> > ProductType;
+  //     Eigen::Transform<double, 3, Mode, Options>, <expression>>::ResultType (const Eigen::Transform<double, 3, Mode,
+  //     Options> &) const")
+  //
+  template <int OtherMode, int OtherOptions>
+  struct icc_11_workaround {
+    typedef internal::transform_transform_product_impl<Transform, Transform<Scalar, Dim, OtherMode, OtherOptions> >
+        ProductType;
     typedef typename ProductType::ResultType ResultType;
   };
-  
-public:
+
+ public:
   /** Concatenates two different transformations */
-  template<int OtherMode,int OtherOptions>
-  inline typename icc_11_workaround<OtherMode,OtherOptions>::ResultType
-    operator * (const Transform<Scalar,Dim,OtherMode,OtherOptions>& other) const
-  {
-    typedef typename icc_11_workaround<OtherMode,OtherOptions>::ProductType ProductType;
-    return ProductType::run(*this,other);
+  template <int OtherMode, int OtherOptions>
+  inline typename icc_11_workaround<OtherMode, OtherOptions>::ResultType operator*(
+      const Transform<Scalar, Dim, OtherMode, OtherOptions>& other) const {
+    typedef typename icc_11_workaround<OtherMode, OtherOptions>::ProductType ProductType;
+    return ProductType::run(*this, other);
   }
-  #else
+#else
   /** Concatenates two different transformations */
-  template<int OtherMode,int OtherOptions>
-  inline typename internal::transform_transform_product_impl<Transform,Transform<Scalar,Dim,OtherMode,OtherOptions> >::ResultType
-    operator * (const Transform<Scalar,Dim,OtherMode,OtherOptions>& other) const
-  {
-    return internal::transform_transform_product_impl<Transform,Transform<Scalar,Dim,OtherMode,OtherOptions> >::run(*this,other);
+  template <int OtherMode, int OtherOptions>
+  EIGEN_DEVICE_FUNC inline
+      typename internal::transform_transform_product_impl<Transform,
+                                                          Transform<Scalar, Dim, OtherMode, OtherOptions> >::ResultType
+      operator*(const Transform<Scalar, Dim, OtherMode, OtherOptions>& other) const {
+    return internal::transform_transform_product_impl<Transform, Transform<Scalar, Dim, OtherMode, OtherOptions> >::run(
+        *this, other);
   }
-  #endif
+#endif
 
   /** \sa MatrixBase::setIdentity() */
-  void setIdentity() { m_matrix.setIdentity(); }
+  EIGEN_DEVICE_FUNC void setIdentity() { m_matrix.setIdentity(); }
 
   /**
    * \brief Returns an identity transformation.
    * \todo In the future this function should be returning a Transform expression.
    */
-  static const Transform Identity()
-  {
-    return Transform(MatrixType::Identity());
-  }
+  EIGEN_DEVICE_FUNC static const Transform Identity() { return Transform(MatrixType::Identity()); }
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC inline Transform& scale(const MatrixBase<OtherDerived>& other);
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC inline Transform& prescale(const MatrixBase<OtherDerived>& other);
 
-  template<typename OtherDerived>
-  inline Transform& scale(const MatrixBase<OtherDerived> &other);
+  EIGEN_DEVICE_FUNC inline Transform& scale(const Scalar& s);
+  EIGEN_DEVICE_FUNC inline Transform& prescale(const Scalar& s);
 
-  template<typename OtherDerived>
-  inline Transform& prescale(const MatrixBase<OtherDerived> &other);
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC inline Transform& translate(const MatrixBase<OtherDerived>& other);
 
-  inline Transform& scale(const Scalar& s);
-  inline Transform& prescale(const Scalar& s);
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC inline Transform& pretranslate(const MatrixBase<OtherDerived>& other);
 
-  template<typename OtherDerived>
-  inline Transform& translate(const MatrixBase<OtherDerived> &other);
+  template <typename RotationType>
+  EIGEN_DEVICE_FUNC inline Transform& rotate(const RotationType& rotation);
 
-  template<typename OtherDerived>
-  inline Transform& pretranslate(const MatrixBase<OtherDerived> &other);
+  template <typename RotationType>
+  EIGEN_DEVICE_FUNC inline Transform& prerotate(const RotationType& rotation);
 
-  template<typename RotationType>
-  inline Transform& rotate(const RotationType& rotation);
+  EIGEN_DEVICE_FUNC Transform& shear(const Scalar& sx, const Scalar& sy);
+  EIGEN_DEVICE_FUNC Transform& preshear(const Scalar& sx, const Scalar& sy);
 
-  template<typename RotationType>
-  inline Transform& prerotate(const RotationType& rotation);
+  EIGEN_DEVICE_FUNC inline Transform& operator=(const TranslationType& t);
 
-  Transform& shear(const Scalar& sx, const Scalar& sy);
-  Transform& preshear(const Scalar& sx, const Scalar& sy);
+  EIGEN_DEVICE_FUNC inline Transform& operator*=(const TranslationType& t) { return translate(t.vector()); }
 
-  inline Transform& operator=(const TranslationType& t);
-  inline Transform& operator*=(const TranslationType& t) { return translate(t.vector()); }
-  inline Transform operator*(const TranslationType& t) const;
+  EIGEN_DEVICE_FUNC inline Transform operator*(const TranslationType& t) const;
 
-  inline Transform& operator=(const UniformScaling<Scalar>& t);
-  inline Transform& operator*=(const UniformScaling<Scalar>& s) { return scale(s.factor()); }
-  inline Transform<Scalar,Dim,(int(Mode)==int(Isometry)?int(Affine):int(Mode))> operator*(const UniformScaling<Scalar>& s) const
-  {
-    Transform<Scalar,Dim,(int(Mode)==int(Isometry)?int(Affine):int(Mode)),Options> res = *this;
+  EIGEN_DEVICE_FUNC inline Transform& operator=(const UniformScaling<Scalar>& t);
+
+  EIGEN_DEVICE_FUNC inline Transform& operator*=(const UniformScaling<Scalar>& s) { return scale(s.factor()); }
+
+  EIGEN_DEVICE_FUNC inline TransformTimeDiagonalReturnType operator*(const UniformScaling<Scalar>& s) const {
+    TransformTimeDiagonalReturnType res = *this;
     res.scale(s.factor());
     return res;
   }
 
-  inline Transform& operator*=(const DiagonalMatrix<Scalar,Dim>& s) { linear() *= s; return *this; }
+  EIGEN_DEVICE_FUNC inline Transform& operator*=(const DiagonalMatrix<Scalar, Dim>& s) {
+    linearExt() *= s;
+    return *this;
+  }
+
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline Transform& operator=(const RotationBase<Derived, Dim>& r);
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline Transform& operator*=(const RotationBase<Derived, Dim>& r) {
+    return rotate(r.toRotationMatrix());
+  }
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline Transform operator*(const RotationBase<Derived, Dim>& r) const;
 
-  template<typename Derived>
-  inline Transform& operator=(const RotationBase<Derived,Dim>& r);
-  template<typename Derived>
-  inline Transform& operator*=(const RotationBase<Derived,Dim>& r) { return rotate(r.toRotationMatrix()); }
-  template<typename Derived>
-  inline Transform operator*(const RotationBase<Derived,Dim>& r) const;
+  typedef std::conditional_t<int(Mode) == Isometry, ConstLinearPart, const LinearMatrixType> RotationReturnType;
+  EIGEN_DEVICE_FUNC RotationReturnType rotation() const;
 
-  const LinearMatrixType rotation() const;
-  template<typename RotationMatrixType, typename ScalingMatrixType>
-  void computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const;
-  template<typename ScalingMatrixType, typename RotationMatrixType>
-  void computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const;
+  template <typename RotationMatrixType, typename ScalingMatrixType>
+  EIGEN_DEVICE_FUNC void computeRotationScaling(RotationMatrixType* rotation, ScalingMatrixType* scaling) const;
+  template <typename ScalingMatrixType, typename RotationMatrixType>
+  EIGEN_DEVICE_FUNC void computeScalingRotation(ScalingMatrixType* scaling, RotationMatrixType* rotation) const;
 
-  template<typename PositionDerived, typename OrientationType, typename ScaleDerived>
-  Transform& fromPositionOrientationScale(const MatrixBase<PositionDerived> &position,
-    const OrientationType& orientation, const MatrixBase<ScaleDerived> &scale);
+  template <typename PositionDerived, typename OrientationType, typename ScaleDerived>
+  EIGEN_DEVICE_FUNC Transform& fromPositionOrientationScale(const MatrixBase<PositionDerived>& position,
+                                                            const OrientationType& orientation,
+                                                            const MatrixBase<ScaleDerived>& scale);
 
-  inline Transform inverse(TransformTraits traits = (TransformTraits)Mode) const;
+  EIGEN_DEVICE_FUNC inline Transform inverse(TransformTraits traits = (TransformTraits)Mode) const;
 
   /** \returns a const pointer to the column major internal matrix */
-  const Scalar* data() const { return m_matrix.data(); }
+  EIGEN_DEVICE_FUNC constexpr const Scalar* data() const { return m_matrix.data(); }
   /** \returns a non-const pointer to the column major internal matrix */
-  Scalar* data() { return m_matrix.data(); }
+  EIGEN_DEVICE_FUNC constexpr Scalar* data() { return m_matrix.data(); }
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Transform,Transform<NewScalarType,Dim,Mode,Options> >::type cast() const
-  { return typename internal::cast_return_type<Transform,Transform<NewScalarType,Dim,Mode,Options> >::type(*this); }
+   *
+   * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+   * then this function smartly returns a const reference to \c *this.
+   */
+  template <typename NewScalarType>
+  EIGEN_DEVICE_FUNC inline
+      typename internal::cast_return_type<Transform, Transform<NewScalarType, Dim, Mode, Options> >::type
+      cast() const {
+    return typename internal::cast_return_type<Transform, Transform<NewScalarType, Dim, Mode, Options> >::type(*this);
+  }
 
   /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Transform(const Transform<OtherScalarType,Dim,Mode,Options>& other)
-  {
+  template <typename OtherScalarType>
+  EIGEN_DEVICE_FUNC inline explicit Transform(const Transform<OtherScalarType, Dim, Mode, Options>& other) {
     check_template_params();
     m_matrix = other.matrix().template cast<Scalar>();
   }
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Transform& other, const typename NumTraits<Scalar>::Real& prec = NumTraits<Scalar>::dummy_precision()) const
-  { return m_matrix.isApprox(other.m_matrix, prec); }
+   * determined by \a prec.
+   *
+   * \sa MatrixBase::isApprox() */
+  EIGEN_DEVICE_FUNC bool isApprox(const Transform& other, const typename NumTraits<Scalar>::Real& prec =
+                                                              NumTraits<Scalar>::dummy_precision()) const {
+    return m_matrix.isApprox(other.m_matrix, prec);
+  }
 
   /** Sets the last row to [0 ... 0 1]
-    */
-  void makeAffine()
-  {
-    internal::transform_make_affine<int(Mode)>::run(m_matrix);
-  }
+   */
+  EIGEN_DEVICE_FUNC void makeAffine() { internal::transform_make_affine<int(Mode)>::run(m_matrix); }
 
   /** \internal
-    * \returns the Dim x Dim linear part if the transformation is affine,
-    *          and the HDim x Dim part for projective transformations.
-    */
-  inline Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,Dim> linearExt()
-  { return m_matrix.template block<int(Mode)==int(Projective)?HDim:Dim,Dim>(0,0); }
+   * \returns the Dim x Dim linear part if the transformation is affine,
+   *          and the HDim x Dim part for projective transformations.
+   */
+  EIGEN_DEVICE_FUNC inline Block<MatrixType, int(Mode) == int(Projective) ? HDim : Dim, Dim> linearExt() {
+    return m_matrix.template block < int(Mode) == int(Projective) ? HDim : Dim, Dim > (0, 0);
+  }
   /** \internal
-    * \returns the Dim x Dim linear part if the transformation is affine,
-    *          and the HDim x Dim part for projective transformations.
-    */
-  inline const Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,Dim> linearExt() const
-  { return m_matrix.template block<int(Mode)==int(Projective)?HDim:Dim,Dim>(0,0); }
+   * \returns the Dim x Dim linear part if the transformation is affine,
+   *          and the HDim x Dim part for projective transformations.
+   */
+  EIGEN_DEVICE_FUNC inline const Block<MatrixType, int(Mode) == int(Projective) ? HDim : Dim, Dim> linearExt() const {
+    return m_matrix.template block < int(Mode) == int(Projective) ? HDim : Dim, Dim > (0, 0);
+  }
 
   /** \internal
-    * \returns the translation part if the transformation is affine,
-    *          and the last column for projective transformations.
-    */
-  inline Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,1> translationExt()
-  { return m_matrix.template block<int(Mode)==int(Projective)?HDim:Dim,1>(0,Dim); }
+   * \returns the translation part if the transformation is affine,
+   *          and the last column for projective transformations.
+   */
+  EIGEN_DEVICE_FUNC inline Block<MatrixType, int(Mode) == int(Projective) ? HDim : Dim, 1> translationExt() {
+    return m_matrix.template block < int(Mode) == int(Projective) ? HDim : Dim, 1 > (0, Dim);
+  }
   /** \internal
-    * \returns the translation part if the transformation is affine,
-    *          and the last column for projective transformations.
-    */
-  inline const Block<MatrixType,int(Mode)==int(Projective)?HDim:Dim,1> translationExt() const
-  { return m_matrix.template block<int(Mode)==int(Projective)?HDim:Dim,1>(0,Dim); }
-
-
-  #ifdef EIGEN_TRANSFORM_PLUGIN
-  #include EIGEN_TRANSFORM_PLUGIN
-  #endif
-  
-protected:
-  #ifndef EIGEN_PARSED_BY_DOXYGEN
-    static EIGEN_STRONG_INLINE void check_template_params()
-    {
-      EIGEN_STATIC_ASSERT((Options & (DontAlign|RowMajor)) == Options, INVALID_MATRIX_TEMPLATE_PARAMETERS)
-    }
-  #endif
+   * \returns the translation part if the transformation is affine,
+   *          and the last column for projective transformations.
+   */
+  EIGEN_DEVICE_FUNC inline const Block<MatrixType, int(Mode) == int(Projective) ? HDim : Dim, 1> translationExt()
+      const {
+    return m_matrix.template block < int(Mode) == int(Projective) ? HDim : Dim, 1 > (0, Dim);
+  }
+
+#ifdef EIGEN_TRANSFORM_PLUGIN
+#include EIGEN_TRANSFORM_PLUGIN
+#endif
 
+ protected:
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void check_template_params() {
+    EIGEN_STATIC_ASSERT((Options & (DontAlign | RowMajor)) == Options, INVALID_MATRIX_TEMPLATE_PARAMETERS)
+  }
+#endif
 };
 
 /** \ingroup Geometry_Module */
-typedef Transform<float,2,Isometry> Isometry2f;
+typedef Transform<float, 2, Isometry> Isometry2f;
 /** \ingroup Geometry_Module */
-typedef Transform<float,3,Isometry> Isometry3f;
+typedef Transform<float, 3, Isometry> Isometry3f;
 /** \ingroup Geometry_Module */
-typedef Transform<double,2,Isometry> Isometry2d;
+typedef Transform<double, 2, Isometry> Isometry2d;
 /** \ingroup Geometry_Module */
-typedef Transform<double,3,Isometry> Isometry3d;
+typedef Transform<double, 3, Isometry> Isometry3d;
 
 /** \ingroup Geometry_Module */
-typedef Transform<float,2,Affine> Affine2f;
+typedef Transform<float, 2, Affine> Affine2f;
 /** \ingroup Geometry_Module */
-typedef Transform<float,3,Affine> Affine3f;
+typedef Transform<float, 3, Affine> Affine3f;
 /** \ingroup Geometry_Module */
-typedef Transform<double,2,Affine> Affine2d;
+typedef Transform<double, 2, Affine> Affine2d;
 /** \ingroup Geometry_Module */
-typedef Transform<double,3,Affine> Affine3d;
+typedef Transform<double, 3, Affine> Affine3d;
 
 /** \ingroup Geometry_Module */
-typedef Transform<float,2,AffineCompact> AffineCompact2f;
+typedef Transform<float, 2, AffineCompact> AffineCompact2f;
 /** \ingroup Geometry_Module */
-typedef Transform<float,3,AffineCompact> AffineCompact3f;
+typedef Transform<float, 3, AffineCompact> AffineCompact3f;
 /** \ingroup Geometry_Module */
-typedef Transform<double,2,AffineCompact> AffineCompact2d;
+typedef Transform<double, 2, AffineCompact> AffineCompact2d;
 /** \ingroup Geometry_Module */
-typedef Transform<double,3,AffineCompact> AffineCompact3d;
+typedef Transform<double, 3, AffineCompact> AffineCompact3d;
 
 /** \ingroup Geometry_Module */
-typedef Transform<float,2,Projective> Projective2f;
+typedef Transform<float, 2, Projective> Projective2f;
 /** \ingroup Geometry_Module */
-typedef Transform<float,3,Projective> Projective3f;
+typedef Transform<float, 3, Projective> Projective3f;
 /** \ingroup Geometry_Module */
-typedef Transform<double,2,Projective> Projective2d;
+typedef Transform<double, 2, Projective> Projective2d;
 /** \ingroup Geometry_Module */
-typedef Transform<double,3,Projective> Projective3d;
+typedef Transform<double, 3, Projective> Projective3d;
 
 /**************************
 *** Optional QT support ***
 **************************/
 
 #ifdef EIGEN_QT_SUPPORT
+
+#if (QT_VERSION < QT_VERSION_CHECK(6, 0, 0))
 /** Initializes \c *this from a QMatrix assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim, int Mode,int Options>
-Transform<Scalar,Dim,Mode,Options>::Transform(const QMatrix& other)
-{
+ *
+ * This function is available only if the token EIGEN_QT_SUPPORT is defined.
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+Transform<Scalar, Dim, Mode, Options>::Transform(const QMatrix& other) {
   check_template_params();
   *this = other;
 }
 
 /** Set \c *this from a QMatrix assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim, int Mode,int Options>
-Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const QMatrix& other)
-{
-  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  m_matrix << other.m11(), other.m21(), other.dx(),
-              other.m12(), other.m22(), other.dy(),
-              0, 0, 1;
+ *
+ * This function is available only if the token EIGEN_QT_SUPPORT is defined.
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+Transform<Scalar, Dim, Mode, Options>& Transform<Scalar, Dim, Mode, Options>::operator=(const QMatrix& other) {
+  EIGEN_STATIC_ASSERT(Dim == 2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact)))
+    m_matrix << other.m11(), other.m21(), other.dx(), other.m12(), other.m22(), other.dy();
+  else
+    m_matrix << other.m11(), other.m21(), other.dx(), other.m12(), other.m22(), other.dy(), 0, 0, 1;
   return *this;
 }
 
 /** \returns a QMatrix from \c *this assuming the dimension is 2.
-  *
-  * \warning this conversion might loss data if \c *this is not affine
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim, int Mode, int Options>
-QMatrix Transform<Scalar,Dim,Mode,Options>::toQMatrix(void) const
-{
+ *
+ * \warning this conversion might loss data if \c *this is not affine
+ *
+ * This function is available only if the token EIGEN_QT_SUPPORT is defined.
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+QMatrix Transform<Scalar, Dim, Mode, Options>::toQMatrix(void) const {
   check_template_params();
-  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  return QMatrix(m_matrix.coeff(0,0), m_matrix.coeff(1,0),
-                 m_matrix.coeff(0,1), m_matrix.coeff(1,1),
-                 m_matrix.coeff(0,2), m_matrix.coeff(1,2));
+  EIGEN_STATIC_ASSERT(Dim == 2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  return QMatrix(m_matrix.coeff(0, 0), m_matrix.coeff(1, 0), m_matrix.coeff(0, 1), m_matrix.coeff(1, 1),
+                 m_matrix.coeff(0, 2), m_matrix.coeff(1, 2));
 }
+#endif
 
 /** Initializes \c *this from a QTransform assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim, int Mode,int Options>
-Transform<Scalar,Dim,Mode,Options>::Transform(const QTransform& other)
-{
+ *
+ * This function is available only if the token EIGEN_QT_SUPPORT is defined.
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+Transform<Scalar, Dim, Mode, Options>::Transform(const QTransform& other) {
   check_template_params();
   *this = other;
 }
 
 /** Set \c *this from a QTransform assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim, int Mode, int Options>
-Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const QTransform& other)
-{
+ *
+ * This function is available only if the token EIGEN_QT_SUPPORT is defined.
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+Transform<Scalar, Dim, Mode, Options>& Transform<Scalar, Dim, Mode, Options>::operator=(const QTransform& other) {
   check_template_params();
-  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  if (Mode == int(AffineCompact))
-    m_matrix << other.m11(), other.m21(), other.dx(),
-                other.m12(), other.m22(), other.dy();
+  EIGEN_STATIC_ASSERT(Dim == 2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact)))
+    m_matrix << other.m11(), other.m21(), other.dx(), other.m12(), other.m22(), other.dy();
   else
-    m_matrix << other.m11(), other.m21(), other.dx(),
-                other.m12(), other.m22(), other.dy(),
-                other.m13(), other.m23(), other.m33();
+    m_matrix << other.m11(), other.m21(), other.dx(), other.m12(), other.m22(), other.dy(), other.m13(), other.m23(),
+        other.m33();
   return *this;
 }
 
 /** \returns a QTransform from \c *this assuming the dimension is 2.
-  *
-  * This function is available only if the token EIGEN_QT_SUPPORT is defined.
-  */
-template<typename Scalar, int Dim, int Mode, int Options>
-QTransform Transform<Scalar,Dim,Mode,Options>::toQTransform(void) const
-{
-  EIGEN_STATIC_ASSERT(Dim==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  if (Mode == int(AffineCompact))
-    return QTransform(m_matrix.coeff(0,0), m_matrix.coeff(1,0),
-                      m_matrix.coeff(0,1), m_matrix.coeff(1,1),
-                      m_matrix.coeff(0,2), m_matrix.coeff(1,2));
+ *
+ * This function is available only if the token EIGEN_QT_SUPPORT is defined.
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+QTransform Transform<Scalar, Dim, Mode, Options>::toQTransform(void) const {
+  EIGEN_STATIC_ASSERT(Dim == 2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  if (EIGEN_CONST_CONDITIONAL(Mode == int(AffineCompact)))
+    return QTransform(m_matrix.coeff(0, 0), m_matrix.coeff(1, 0), m_matrix.coeff(0, 1), m_matrix.coeff(1, 1),
+                      m_matrix.coeff(0, 2), m_matrix.coeff(1, 2));
   else
-    return QTransform(m_matrix.coeff(0,0), m_matrix.coeff(1,0), m_matrix.coeff(2,0),
-                      m_matrix.coeff(0,1), m_matrix.coeff(1,1), m_matrix.coeff(2,1),
-                      m_matrix.coeff(0,2), m_matrix.coeff(1,2), m_matrix.coeff(2,2));
+    return QTransform(m_matrix.coeff(0, 0), m_matrix.coeff(1, 0), m_matrix.coeff(2, 0), m_matrix.coeff(0, 1),
+                      m_matrix.coeff(1, 1), m_matrix.coeff(2, 1), m_matrix.coeff(0, 2), m_matrix.coeff(1, 2),
+                      m_matrix.coeff(2, 2));
 }
 #endif
 
@@ -772,84 +797,80 @@ QTransform Transform<Scalar,Dim,Mode,Options>::toQTransform(void) const
 *********************/
 
 /** Applies on the right the non uniform scale transformation represented
-  * by the vector \a other to \c *this and returns a reference to \c *this.
-  * \sa prescale()
-  */
-template<typename Scalar, int Dim, int Mode, int Options>
-template<typename OtherDerived>
-Transform<Scalar,Dim,Mode,Options>&
-Transform<Scalar,Dim,Mode,Options>::scale(const MatrixBase<OtherDerived> &other)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
-  EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
+ * by the vector \a other to \c *this and returns a reference to \c *this.
+ * \sa prescale()
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC Transform<Scalar, Dim, Mode, Options>& Transform<Scalar, Dim, Mode, Options>::scale(
+    const MatrixBase<OtherDerived>& other) {
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived, int(Dim))
+  EIGEN_STATIC_ASSERT(Mode != int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
   linearExt().noalias() = (linearExt() * other.asDiagonal());
   return *this;
 }
 
 /** Applies on the right a uniform scale of a factor \a c to \c *this
-  * and returns a reference to \c *this.
-  * \sa prescale(Scalar)
-  */
-template<typename Scalar, int Dim, int Mode, int Options>
-inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::scale(const Scalar& s)
-{
-  EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
+ * and returns a reference to \c *this.
+ * \sa prescale(Scalar)
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+EIGEN_DEVICE_FUNC inline Transform<Scalar, Dim, Mode, Options>& Transform<Scalar, Dim, Mode, Options>::scale(
+    const Scalar& s) {
+  EIGEN_STATIC_ASSERT(Mode != int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
   linearExt() *= s;
   return *this;
 }
 
 /** Applies on the left the non uniform scale transformation represented
-  * by the vector \a other to \c *this and returns a reference to \c *this.
-  * \sa scale()
-  */
-template<typename Scalar, int Dim, int Mode, int Options>
-template<typename OtherDerived>
-Transform<Scalar,Dim,Mode,Options>&
-Transform<Scalar,Dim,Mode,Options>::prescale(const MatrixBase<OtherDerived> &other)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
-  EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
-  m_matrix.template block<Dim,HDim>(0,0).noalias() = (other.asDiagonal() * m_matrix.template block<Dim,HDim>(0,0));
+ * by the vector \a other to \c *this and returns a reference to \c *this.
+ * \sa scale()
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC Transform<Scalar, Dim, Mode, Options>& Transform<Scalar, Dim, Mode, Options>::prescale(
+    const MatrixBase<OtherDerived>& other) {
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived, int(Dim))
+  EIGEN_STATIC_ASSERT(Mode != int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
+  affine().noalias() = (other.asDiagonal() * affine());
   return *this;
 }
 
 /** Applies on the left a uniform scale of a factor \a c to \c *this
-  * and returns a reference to \c *this.
-  * \sa scale(Scalar)
-  */
-template<typename Scalar, int Dim, int Mode, int Options>
-inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::prescale(const Scalar& s)
-{
-  EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
+ * and returns a reference to \c *this.
+ * \sa scale(Scalar)
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+EIGEN_DEVICE_FUNC inline Transform<Scalar, Dim, Mode, Options>& Transform<Scalar, Dim, Mode, Options>::prescale(
+    const Scalar& s) {
+  EIGEN_STATIC_ASSERT(Mode != int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
   m_matrix.template topRows<Dim>() *= s;
   return *this;
 }
 
 /** Applies on the right the translation matrix represented by the vector \a other
-  * to \c *this and returns a reference to \c *this.
-  * \sa pretranslate()
-  */
-template<typename Scalar, int Dim, int Mode, int Options>
-template<typename OtherDerived>
-Transform<Scalar,Dim,Mode,Options>&
-Transform<Scalar,Dim,Mode,Options>::translate(const MatrixBase<OtherDerived> &other)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
+ * to \c *this and returns a reference to \c *this.
+ * \sa pretranslate()
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC Transform<Scalar, Dim, Mode, Options>& Transform<Scalar, Dim, Mode, Options>::translate(
+    const MatrixBase<OtherDerived>& other) {
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived, int(Dim))
   translationExt() += linearExt() * other;
   return *this;
 }
 
 /** Applies on the left the translation matrix represented by the vector \a other
-  * to \c *this and returns a reference to \c *this.
-  * \sa translate()
-  */
-template<typename Scalar, int Dim, int Mode, int Options>
-template<typename OtherDerived>
-Transform<Scalar,Dim,Mode,Options>&
-Transform<Scalar,Dim,Mode,Options>::pretranslate(const MatrixBase<OtherDerived> &other)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived,int(Dim))
-  if(int(Mode)==int(Projective))
+ * to \c *this and returns a reference to \c *this.
+ * \sa translate()
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC Transform<Scalar, Dim, Mode, Options>& Transform<Scalar, Dim, Mode, Options>::pretranslate(
+    const MatrixBase<OtherDerived>& other) {
+  EIGEN_STATIC_ASSERT_VECTOR_SPECIFIC_SIZE(OtherDerived, int(Dim))
+  if (EIGEN_CONST_CONDITIONAL(int(Mode) == int(Projective)))
     affine() += other * m_matrix.row(Dim);
   else
     translation() += other;
@@ -857,76 +878,73 @@ Transform<Scalar,Dim,Mode,Options>::pretranslate(const MatrixBase<OtherDerived>
 }
 
 /** Applies on the right the rotation represented by the rotation \a rotation
-  * to \c *this and returns a reference to \c *this.
-  *
-  * The template parameter \a RotationType is the type of the rotation which
-  * must be known by internal::toRotationMatrix<>.
-  *
-  * Natively supported types includes:
-  *   - any scalar (2D),
-  *   - a Dim x Dim matrix expression,
-  *   - a Quaternion (3D),
-  *   - a AngleAxis (3D)
-  *
-  * This mechanism is easily extendable to support user types such as Euler angles,
-  * or a pair of Quaternion for 4D rotations.
-  *
-  * \sa rotate(Scalar), class Quaternion, class AngleAxis, prerotate(RotationType)
-  */
-template<typename Scalar, int Dim, int Mode, int Options>
-template<typename RotationType>
-Transform<Scalar,Dim,Mode,Options>&
-Transform<Scalar,Dim,Mode,Options>::rotate(const RotationType& rotation)
-{
-  linearExt() *= internal::toRotationMatrix<Scalar,Dim>(rotation);
+ * to \c *this and returns a reference to \c *this.
+ *
+ * The template parameter \a RotationType is the type of the rotation which
+ * must be known by internal::toRotationMatrix<>.
+ *
+ * Natively supported types includes:
+ *   - any scalar (2D),
+ *   - a Dim x Dim matrix expression,
+ *   - a Quaternion (3D),
+ *   - a AngleAxis (3D)
+ *
+ * This mechanism is easily extendable to support user types such as Euler angles,
+ * or a pair of Quaternion for 4D rotations.
+ *
+ * \sa rotate(Scalar), class Quaternion, class AngleAxis, prerotate(RotationType)
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+template <typename RotationType>
+EIGEN_DEVICE_FUNC Transform<Scalar, Dim, Mode, Options>& Transform<Scalar, Dim, Mode, Options>::rotate(
+    const RotationType& rotation) {
+  linearExt() *= internal::toRotationMatrix<Scalar, Dim>(rotation);
   return *this;
 }
 
 /** Applies on the left the rotation represented by the rotation \a rotation
-  * to \c *this and returns a reference to \c *this.
-  *
-  * See rotate() for further details.
-  *
-  * \sa rotate()
-  */
-template<typename Scalar, int Dim, int Mode, int Options>
-template<typename RotationType>
-Transform<Scalar,Dim,Mode,Options>&
-Transform<Scalar,Dim,Mode,Options>::prerotate(const RotationType& rotation)
-{
-  m_matrix.template block<Dim,HDim>(0,0) = internal::toRotationMatrix<Scalar,Dim>(rotation)
-                                         * m_matrix.template block<Dim,HDim>(0,0);
+ * to \c *this and returns a reference to \c *this.
+ *
+ * See rotate() for further details.
+ *
+ * \sa rotate()
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+template <typename RotationType>
+EIGEN_DEVICE_FUNC Transform<Scalar, Dim, Mode, Options>& Transform<Scalar, Dim, Mode, Options>::prerotate(
+    const RotationType& rotation) {
+  m_matrix.template block<Dim, HDim>(0, 0) =
+      internal::toRotationMatrix<Scalar, Dim>(rotation) * m_matrix.template block<Dim, HDim>(0, 0);
   return *this;
 }
 
 /** Applies on the right the shear transformation represented
-  * by the vector \a other to \c *this and returns a reference to \c *this.
-  * \warning 2D only.
-  * \sa preshear()
-  */
-template<typename Scalar, int Dim, int Mode, int Options>
-Transform<Scalar,Dim,Mode,Options>&
-Transform<Scalar,Dim,Mode,Options>::shear(const Scalar& sx, const Scalar& sy)
-{
-  EIGEN_STATIC_ASSERT(int(Dim)==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
-  VectorType tmp = linear().col(0)*sy + linear().col(1);
-  linear() << linear().col(0) + linear().col(1)*sx, tmp;
+ * by the vector \a other to \c *this and returns a reference to \c *this.
+ * \warning 2D only.
+ * \sa preshear()
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+EIGEN_DEVICE_FUNC Transform<Scalar, Dim, Mode, Options>& Transform<Scalar, Dim, Mode, Options>::shear(
+    const Scalar& sx, const Scalar& sy) {
+  EIGEN_STATIC_ASSERT(int(Dim) == 2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  EIGEN_STATIC_ASSERT(Mode != int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
+  VectorType tmp = linear().col(0) * sy + linear().col(1);
+  linear() << linear().col(0) + linear().col(1) * sx, tmp;
   return *this;
 }
 
 /** Applies on the left the shear transformation represented
-  * by the vector \a other to \c *this and returns a reference to \c *this.
-  * \warning 2D only.
-  * \sa shear()
-  */
-template<typename Scalar, int Dim, int Mode, int Options>
-Transform<Scalar,Dim,Mode,Options>&
-Transform<Scalar,Dim,Mode,Options>::preshear(const Scalar& sx, const Scalar& sy)
-{
-  EIGEN_STATIC_ASSERT(int(Dim)==2, YOU_MADE_A_PROGRAMMING_MISTAKE)
-  EIGEN_STATIC_ASSERT(Mode!=int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
-  m_matrix.template block<Dim,HDim>(0,0) = LinearMatrixType(1, sx, sy, 1) * m_matrix.template block<Dim,HDim>(0,0);
+ * by the vector \a other to \c *this and returns a reference to \c *this.
+ * \warning 2D only.
+ * \sa shear()
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+EIGEN_DEVICE_FUNC Transform<Scalar, Dim, Mode, Options>& Transform<Scalar, Dim, Mode, Options>::preshear(
+    const Scalar& sx, const Scalar& sy) {
+  EIGEN_STATIC_ASSERT(int(Dim) == 2, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  EIGEN_STATIC_ASSERT(Mode != int(Isometry), THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS)
+  m_matrix.template block<Dim, HDim>(0, 0) =
+      LinearMatrixType({{1, sy}, {sx, 1}}) * m_matrix.template block<Dim, HDim>(0, 0);
   return *this;
 }
 
@@ -934,46 +952,46 @@ Transform<Scalar,Dim,Mode,Options>::preshear(const Scalar& sx, const Scalar& sy)
 *** Scaling, Translation and Rotation compatibility ***
 ******************************************************/
 
-template<typename Scalar, int Dim, int Mode, int Options>
-inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const TranslationType& t)
-{
+template <typename Scalar, int Dim, int Mode, int Options>
+EIGEN_DEVICE_FUNC inline Transform<Scalar, Dim, Mode, Options>& Transform<Scalar, Dim, Mode, Options>::operator=(
+    const TranslationType& t) {
   linear().setIdentity();
   translation() = t.vector();
   makeAffine();
   return *this;
 }
 
-template<typename Scalar, int Dim, int Mode, int Options>
-inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim,Mode,Options>::operator*(const TranslationType& t) const
-{
+template <typename Scalar, int Dim, int Mode, int Options>
+EIGEN_DEVICE_FUNC inline Transform<Scalar, Dim, Mode, Options> Transform<Scalar, Dim, Mode, Options>::operator*(
+    const TranslationType& t) const {
   Transform res = *this;
   res.translate(t.vector());
   return res;
 }
 
-template<typename Scalar, int Dim, int Mode, int Options>
-inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const UniformScaling<Scalar>& s)
-{
+template <typename Scalar, int Dim, int Mode, int Options>
+EIGEN_DEVICE_FUNC inline Transform<Scalar, Dim, Mode, Options>& Transform<Scalar, Dim, Mode, Options>::operator=(
+    const UniformScaling<Scalar>& s) {
   m_matrix.setZero();
   linear().diagonal().fill(s.factor());
   makeAffine();
   return *this;
 }
 
-template<typename Scalar, int Dim, int Mode, int Options>
-template<typename Derived>
-inline Transform<Scalar,Dim,Mode,Options>& Transform<Scalar,Dim,Mode,Options>::operator=(const RotationBase<Derived,Dim>& r)
-{
-  linear() = internal::toRotationMatrix<Scalar,Dim>(r);
+template <typename Scalar, int Dim, int Mode, int Options>
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline Transform<Scalar, Dim, Mode, Options>& Transform<Scalar, Dim, Mode, Options>::operator=(
+    const RotationBase<Derived, Dim>& r) {
+  linear() = internal::toRotationMatrix<Scalar, Dim>(r);
   translation().setZero();
   makeAffine();
   return *this;
 }
 
-template<typename Scalar, int Dim, int Mode, int Options>
-template<typename Derived>
-inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim,Mode,Options>::operator*(const RotationBase<Derived,Dim>& r) const
-{
+template <typename Scalar, int Dim, int Mode, int Options>
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline Transform<Scalar, Dim, Mode, Options> Transform<Scalar, Dim, Mode, Options>::operator*(
+    const RotationBase<Derived, Dim>& r) const {
   Transform res = *this;
   res.rotate(r.derived());
   return res;
@@ -983,91 +1001,113 @@ inline Transform<Scalar,Dim,Mode,Options> Transform<Scalar,Dim,Mode,Options>::op
 *** Special functions ***
 ************************/
 
+namespace internal {
+template <int Mode>
+struct transform_rotation_impl {
+  template <typename TransformType>
+  EIGEN_DEVICE_FUNC static inline const typename TransformType::LinearMatrixType run(const TransformType& t) {
+    typedef typename TransformType::LinearMatrixType LinearMatrixType;
+    LinearMatrixType result;
+    t.computeRotationScaling(&result, (LinearMatrixType*)0);
+    return result;
+  }
+};
+template <>
+struct transform_rotation_impl<Isometry> {
+  template <typename TransformType>
+  EIGEN_DEVICE_FUNC static inline typename TransformType::ConstLinearPart run(const TransformType& t) {
+    return t.linear();
+  }
+};
+}  // namespace internal
 /** \returns the rotation part of the transformation
-  *
-  *
-  * \svd_module
-  *
-  * \sa computeRotationScaling(), computeScalingRotation(), class SVD
-  */
-template<typename Scalar, int Dim, int Mode, int Options>
-const typename Transform<Scalar,Dim,Mode,Options>::LinearMatrixType
-Transform<Scalar,Dim,Mode,Options>::rotation() const
-{
-  LinearMatrixType result;
-  computeRotationScaling(&result, (LinearMatrixType*)0);
-  return result;
+ *
+ * If Mode==Isometry, then this method is an alias for linear(),
+ * otherwise it calls computeRotationScaling() to extract the rotation
+ * through a SVD decomposition.
+ *
+ * \svd_module
+ *
+ * \sa computeRotationScaling(), computeScalingRotation(), class SVD
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+EIGEN_DEVICE_FUNC typename Transform<Scalar, Dim, Mode, Options>::RotationReturnType
+Transform<Scalar, Dim, Mode, Options>::rotation() const {
+  return internal::transform_rotation_impl<Mode>::run(*this);
 }
 
-
 /** decomposes the linear part of the transformation as a product rotation x scaling, the scaling being
-  * not necessarily positive.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  *
-  *
-  * \svd_module
-  *
-  * \sa computeScalingRotation(), rotation(), class SVD
-  */
-template<typename Scalar, int Dim, int Mode, int Options>
-template<typename RotationMatrixType, typename ScalingMatrixType>
-void Transform<Scalar,Dim,Mode,Options>::computeRotationScaling(RotationMatrixType *rotation, ScalingMatrixType *scaling) const
-{
-  JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU | ComputeFullV);
-
-  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
+ * not necessarily positive.
+ *
+ * If either pointer is zero, the corresponding computation is skipped.
+ *
+ *
+ *
+ * \svd_module
+ *
+ * \sa computeScalingRotation(), rotation(), class SVD
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+template <typename RotationMatrixType, typename ScalingMatrixType>
+EIGEN_DEVICE_FUNC void Transform<Scalar, Dim, Mode, Options>::computeRotationScaling(RotationMatrixType* rotation,
+                                                                                     ScalingMatrixType* scaling) const {
+  // Note that JacobiSVD is faster than BDCSVD for small matrices.
+  JacobiSVD<LinearMatrixType, ComputeFullU | ComputeFullV> svd(linear());
+
+  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant() < Scalar(0)
+                 ? Scalar(-1)
+                 : Scalar(1);  // so x has absolute value 1
   VectorType sv(svd.singularValues());
-  sv.coeffRef(0) *= x;
-  if(scaling) scaling->lazyAssign(svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint());
-  if(rotation)
-  {
+  sv.coeffRef(Dim - 1) *= x;
+  if (scaling) (*scaling).noalias() = svd.matrixV() * sv.asDiagonal() * svd.matrixV().adjoint();
+  if (rotation) {
     LinearMatrixType m(svd.matrixU());
-    m.col(0) /= x;
-    rotation->lazyAssign(m * svd.matrixV().adjoint());
+    m.col(Dim - 1) *= x;
+    (*rotation).noalias() = m * svd.matrixV().adjoint();
   }
 }
 
-/** decomposes the linear part of the transformation as a product rotation x scaling, the scaling being
-  * not necessarily positive.
-  *
-  * If either pointer is zero, the corresponding computation is skipped.
-  *
-  *
-  *
-  * \svd_module
-  *
-  * \sa computeRotationScaling(), rotation(), class SVD
-  */
-template<typename Scalar, int Dim, int Mode, int Options>
-template<typename ScalingMatrixType, typename RotationMatrixType>
-void Transform<Scalar,Dim,Mode,Options>::computeScalingRotation(ScalingMatrixType *scaling, RotationMatrixType *rotation) const
-{
-  JacobiSVD<LinearMatrixType> svd(linear(), ComputeFullU | ComputeFullV);
-
-  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant(); // so x has absolute value 1
+/** decomposes the linear part of the transformation as a product scaling x rotation, the scaling being
+ * not necessarily positive.
+ *
+ * If either pointer is zero, the corresponding computation is skipped.
+ *
+ *
+ *
+ * \svd_module
+ *
+ * \sa computeRotationScaling(), rotation(), class SVD
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+template <typename ScalingMatrixType, typename RotationMatrixType>
+EIGEN_DEVICE_FUNC void Transform<Scalar, Dim, Mode, Options>::computeScalingRotation(
+    ScalingMatrixType* scaling, RotationMatrixType* rotation) const {
+  // Note that JacobiSVD is faster than BDCSVD for small matrices.
+  JacobiSVD<LinearMatrixType, ComputeFullU | ComputeFullV> svd(linear());
+
+  Scalar x = (svd.matrixU() * svd.matrixV().adjoint()).determinant() < Scalar(0)
+                 ? Scalar(-1)
+                 : Scalar(1);  // so x has absolute value 1
   VectorType sv(svd.singularValues());
-  sv.coeffRef(0) *= x;
-  if(scaling) scaling->lazyAssign(svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint());
-  if(rotation)
-  {
+  sv.coeffRef(Dim - 1) *= x;
+  if (scaling) *scaling = svd.matrixU() * sv.asDiagonal() * svd.matrixU().adjoint();
+  if (rotation) {
     LinearMatrixType m(svd.matrixU());
-    m.col(0) /= x;
-    rotation->lazyAssign(m * svd.matrixV().adjoint());
+    m.col(Dim - 1) *= x;
+    *rotation = m * svd.matrixV().adjoint();
   }
 }
 
 /** Convenient method to set \c *this from a position, orientation and scale
-  * of a 3D object.
-  */
-template<typename Scalar, int Dim, int Mode, int Options>
-template<typename PositionDerived, typename OrientationType, typename ScaleDerived>
-Transform<Scalar,Dim,Mode,Options>&
-Transform<Scalar,Dim,Mode,Options>::fromPositionOrientationScale(const MatrixBase<PositionDerived> &position,
-  const OrientationType& orientation, const MatrixBase<ScaleDerived> &scale)
-{
-  linear() = internal::toRotationMatrix<Scalar,Dim>(orientation);
+ * of a 3D object.
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+template <typename PositionDerived, typename OrientationType, typename ScaleDerived>
+EIGEN_DEVICE_FUNC Transform<Scalar, Dim, Mode, Options>&
+Transform<Scalar, Dim, Mode, Options>::fromPositionOrientationScale(const MatrixBase<PositionDerived>& position,
+                                                                    const OrientationType& orientation,
+                                                                    const MatrixBase<ScaleDerived>& scale) {
+  linear() = internal::toRotationMatrix<Scalar, Dim>(orientation);
   linear() *= scale.asDiagonal();
   translation() = position;
   makeAffine();
@@ -1076,91 +1116,75 @@ Transform<Scalar,Dim,Mode,Options>::fromPositionOrientationScale(const MatrixBas
 
 namespace internal {
 
-template<int Mode>
-struct transform_make_affine
-{
-  template<typename MatrixType>
-  static void run(MatrixType &mat)
-  {
-    static const int Dim = MatrixType::ColsAtCompileTime-1;
-    mat.template block<1,Dim>(Dim,0).setZero();
-    mat.coeffRef(Dim,Dim) = typename MatrixType::Scalar(1);
+template <int Mode>
+struct transform_make_affine {
+  template <typename MatrixType>
+  EIGEN_DEVICE_FUNC static void run(MatrixType& mat) {
+    static const int Dim = MatrixType::ColsAtCompileTime - 1;
+    mat.template block<1, Dim>(Dim, 0).setZero();
+    mat.coeffRef(Dim, Dim) = typename MatrixType::Scalar(1);
   }
 };
 
-template<>
-struct transform_make_affine<AffineCompact>
-{
-  template<typename MatrixType> static void run(MatrixType &) { }
+template <>
+struct transform_make_affine<AffineCompact> {
+  template <typename MatrixType>
+  EIGEN_DEVICE_FUNC static void run(MatrixType&) {}
 };
-    
+
 // selector needed to avoid taking the inverse of a 3x4 matrix
-template<typename TransformType, int Mode=TransformType::Mode>
-struct projective_transform_inverse
-{
-  static inline void run(const TransformType&, TransformType&)
-  {}
+template <typename TransformType, int Mode = TransformType::Mode>
+struct projective_transform_inverse {
+  EIGEN_DEVICE_FUNC static inline void run(const TransformType&, TransformType&) {}
 };
 
-template<typename TransformType>
-struct projective_transform_inverse<TransformType, Projective>
-{
-  static inline void run(const TransformType& m, TransformType& res)
-  {
+template <typename TransformType>
+struct projective_transform_inverse<TransformType, Projective> {
+  EIGEN_DEVICE_FUNC static inline void run(const TransformType& m, TransformType& res) {
     res.matrix() = m.matrix().inverse();
   }
 };
 
-} // end namespace internal
-
+}  // end namespace internal
 
 /**
-  *
-  * \returns the inverse transformation according to some given knowledge
-  * on \c *this.
-  *
-  * \param hint allows to optimize the inversion process when the transformation
-  * is known to be not a general transformation (optional). The possible values are:
-  *  - #Projective if the transformation is not necessarily affine, i.e., if the
-  *    last row is not guaranteed to be [0 ... 0 1]
-  *  - #Affine if the last row can be assumed to be [0 ... 0 1]
-  *  - #Isometry if the transformation is only a concatenations of translations
-  *    and rotations.
-  *  The default is the template class parameter \c Mode.
-  *
-  * \warning unless \a traits is always set to NoShear or NoScaling, this function
-  * requires the generic inverse method of MatrixBase defined in the LU module. If
-  * you forget to include this module, then you will get hard to debug linking errors.
-  *
-  * \sa MatrixBase::inverse()
-  */
-template<typename Scalar, int Dim, int Mode, int Options>
-Transform<Scalar,Dim,Mode,Options>
-Transform<Scalar,Dim,Mode,Options>::inverse(TransformTraits hint) const
-{
+ *
+ * \returns the inverse transformation according to some given knowledge
+ * on \c *this.
+ *
+ * \param hint allows to optimize the inversion process when the transformation
+ * is known to be not a general transformation (optional). The possible values are:
+ *  - #Projective if the transformation is not necessarily affine, i.e., if the
+ *    last row is not guaranteed to be [0 ... 0 1]
+ *  - #Affine if the last row can be assumed to be [0 ... 0 1]
+ *  - #Isometry if the transformation is only a concatenations of translations
+ *    and rotations.
+ *  The default is the template class parameter \c Mode.
+ *
+ * \warning unless \a traits is always set to NoShear or NoScaling, this function
+ * requires the generic inverse method of MatrixBase defined in the LU module. If
+ * you forget to include this module, then you will get hard to debug linking errors.
+ *
+ * \sa MatrixBase::inverse()
+ */
+template <typename Scalar, int Dim, int Mode, int Options>
+EIGEN_DEVICE_FUNC Transform<Scalar, Dim, Mode, Options> Transform<Scalar, Dim, Mode, Options>::inverse(
+    TransformTraits hint) const {
   Transform res;
-  if (hint == Projective)
-  {
+  if (hint == Projective) {
     internal::projective_transform_inverse<Transform>::run(*this, res);
-  }
-  else
-  {
-    if (hint == Isometry)
-    {
-      res.matrix().template topLeftCorner<Dim,Dim>() = linear().transpose();
-    }
-    else if(hint&Affine)
-    {
-      res.matrix().template topLeftCorner<Dim,Dim>() = linear().inverse();
-    }
-    else
-    {
+  } else {
+    if (hint == Isometry) {
+      res.matrix().template topLeftCorner<Dim, Dim>() = linear().transpose();
+    } else if (hint & Affine) {
+      res.matrix().template topLeftCorner<Dim, Dim>() = linear().inverse();
+    } else {
       eigen_assert(false && "Invalid transform traits in Transform::Inverse");
     }
     // translation and remaining parts
-    res.matrix().template topRightCorner<Dim,1>()
-      = - res.matrix().template topLeftCorner<Dim,Dim>() * translation();
-    res.makeAffine(); // we do need this, because in the beginning res is uninitialized
+    res.matrix().template topRightCorner<Dim, 1>().noalias() =
+        -res.matrix().template topLeftCorner<Dim, Dim>() * translation();
+    res.makeAffine();  // we do need this, because in the beginning res is uninitialized
   }
   return res;
 }
@@ -1171,95 +1195,93 @@ namespace internal {
 *** Specializations of take affine part            ***
 *****************************************************/
 
-template<typename TransformType> struct transform_take_affine_part {
+template <typename TransformType>
+struct transform_take_affine_part {
   typedef typename TransformType::MatrixType MatrixType;
   typedef typename TransformType::AffinePart AffinePart;
   typedef typename TransformType::ConstAffinePart ConstAffinePart;
-  static inline AffinePart run(MatrixType& m)
-  { return m.template block<TransformType::Dim,TransformType::HDim>(0,0); }
-  static inline ConstAffinePart run(const MatrixType& m)
-  { return m.template block<TransformType::Dim,TransformType::HDim>(0,0); }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AffinePart run(MatrixType& m) {
+    return m.template block<TransformType::Dim, TransformType::HDim>(0, 0);
+  }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ConstAffinePart run(const MatrixType& m) {
+    return m.template block<TransformType::Dim, TransformType::HDim>(0, 0);
+  }
 };
 
-template<typename Scalar, int Dim, int Options>
-struct transform_take_affine_part<Transform<Scalar,Dim,AffineCompact, Options> > {
-  typedef typename Transform<Scalar,Dim,AffineCompact,Options>::MatrixType MatrixType;
-  static inline MatrixType& run(MatrixType& m) { return m; }
-  static inline const MatrixType& run(const MatrixType& m) { return m; }
+template <typename Scalar, int Dim, int Options>
+struct transform_take_affine_part<Transform<Scalar, Dim, AffineCompact, Options> > {
+  typedef typename Transform<Scalar, Dim, AffineCompact, Options>::MatrixType MatrixType;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MatrixType& run(MatrixType& m) { return m; }
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const MatrixType& run(const MatrixType& m) { return m; }
 };
 
 /*****************************************************
 *** Specializations of construct from matrix       ***
 *****************************************************/
 
-template<typename Other, int Mode, int Options, int Dim, int HDim>
-struct transform_construct_from_matrix<Other, Mode,Options,Dim,HDim, Dim,Dim>
-{
-  static inline void run(Transform<typename Other::Scalar,Dim,Mode,Options> *transform, const Other& other)
-  {
+template <typename Other, int Mode, int Options, int Dim, int HDim>
+struct transform_construct_from_matrix<Other, Mode, Options, Dim, HDim, Dim, Dim> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(
+      Transform<typename Other::Scalar, Dim, Mode, Options>* transform, const Other& other) {
     transform->linear() = other;
     transform->translation().setZero();
     transform->makeAffine();
   }
 };
 
-template<typename Other, int Mode, int Options, int Dim, int HDim>
-struct transform_construct_from_matrix<Other, Mode,Options,Dim,HDim, Dim,HDim>
-{
-  static inline void run(Transform<typename Other::Scalar,Dim,Mode,Options> *transform, const Other& other)
-  {
+template <typename Other, int Mode, int Options, int Dim, int HDim>
+struct transform_construct_from_matrix<Other, Mode, Options, Dim, HDim, Dim, HDim> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(
+      Transform<typename Other::Scalar, Dim, Mode, Options>* transform, const Other& other) {
     transform->affine() = other;
     transform->makeAffine();
   }
 };
 
-template<typename Other, int Mode, int Options, int Dim, int HDim>
-struct transform_construct_from_matrix<Other, Mode,Options,Dim,HDim, HDim,HDim>
-{
-  static inline void run(Transform<typename Other::Scalar,Dim,Mode,Options> *transform, const Other& other)
-  { transform->matrix() = other; }
+template <typename Other, int Mode, int Options, int Dim, int HDim>
+struct transform_construct_from_matrix<Other, Mode, Options, Dim, HDim, HDim, HDim> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(
+      Transform<typename Other::Scalar, Dim, Mode, Options>* transform, const Other& other) {
+    transform->matrix() = other;
+  }
 };
 
-template<typename Other, int Options, int Dim, int HDim>
-struct transform_construct_from_matrix<Other, AffineCompact,Options,Dim,HDim, HDim,HDim>
-{
-  static inline void run(Transform<typename Other::Scalar,Dim,AffineCompact,Options> *transform, const Other& other)
-  { transform->matrix() = other.template block<Dim,HDim>(0,0); }
+template <typename Other, int Options, int Dim, int HDim>
+struct transform_construct_from_matrix<Other, AffineCompact, Options, Dim, HDim, HDim, HDim> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(
+      Transform<typename Other::Scalar, Dim, AffineCompact, Options>* transform, const Other& other) {
+    transform->matrix() = other.template block<Dim, HDim>(0, 0);
+  }
 };
 
 /**********************************************************
 ***   Specializations of operator* with rhs EigenBase   ***
 **********************************************************/
 
-template<int LhsMode,int RhsMode>
-struct transform_product_result
-{
-  enum 
-  { 
-    Mode =
-      (LhsMode == (int)Projective    || RhsMode == (int)Projective    ) ? Projective :
-      (LhsMode == (int)Affine        || RhsMode == (int)Affine        ) ? Affine :
-      (LhsMode == (int)AffineCompact || RhsMode == (int)AffineCompact ) ? AffineCompact :
-      (LhsMode == (int)Isometry      || RhsMode == (int)Isometry      ) ? Isometry : Projective
+template <int LhsMode, int RhsMode>
+struct transform_product_result {
+  enum {
+    Mode = (LhsMode == (int)Projective || RhsMode == (int)Projective)         ? Projective
+           : (LhsMode == (int)Affine || RhsMode == (int)Affine)               ? Affine
+           : (LhsMode == (int)AffineCompact || RhsMode == (int)AffineCompact) ? AffineCompact
+           : (LhsMode == (int)Isometry || RhsMode == (int)Isometry)           ? Isometry
+                                                                              : Projective
   };
 };
 
-template< typename TransformType, typename MatrixType >
-struct transform_right_product_impl< TransformType, MatrixType, 0 >
-{
+template <typename TransformType, typename MatrixType, int RhsCols>
+struct transform_right_product_impl<TransformType, MatrixType, 0, RhsCols> {
   typedef typename MatrixType::PlainObject ResultType;
 
-  static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
-  {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other) {
     return T.matrix() * other;
   }
 };
 
-template< typename TransformType, typename MatrixType >
-struct transform_right_product_impl< TransformType, MatrixType, 1 >
-{
-  enum { 
-    Dim = TransformType::Dim, 
+template <typename TransformType, typename MatrixType, int RhsCols>
+struct transform_right_product_impl<TransformType, MatrixType, 1, RhsCols> {
+  enum {
+    Dim = TransformType::Dim,
     HDim = TransformType::HDim,
     OtherRows = MatrixType::RowsAtCompileTime,
     OtherCols = MatrixType::ColsAtCompileTime
@@ -1267,25 +1289,23 @@ struct transform_right_product_impl< TransformType, MatrixType, 1 >
 
   typedef typename MatrixType::PlainObject ResultType;
 
-  static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
-  {
-    EIGEN_STATIC_ASSERT(OtherRows==HDim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other) {
+    EIGEN_STATIC_ASSERT(OtherRows == HDim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
 
-    typedef Block<ResultType, Dim, OtherCols, int(MatrixType::RowsAtCompileTime)==Dim> TopLeftLhs;
+    typedef Block<ResultType, Dim, OtherCols, int(MatrixType::RowsAtCompileTime) == Dim> TopLeftLhs;
 
-    ResultType res(other.rows(),other.cols());
+    ResultType res(other.rows(), other.cols());
     TopLeftLhs(res, 0, 0, Dim, other.cols()).noalias() = T.affine() * other;
-    res.row(OtherRows-1) = other.row(OtherRows-1);
-    
+    res.row(OtherRows - 1) = other.row(OtherRows - 1);
+
     return res;
   }
 };
 
-template< typename TransformType, typename MatrixType >
-struct transform_right_product_impl< TransformType, MatrixType, 2 >
-{
-  enum { 
-    Dim = TransformType::Dim, 
+template <typename TransformType, typename MatrixType, int RhsCols>
+struct transform_right_product_impl<TransformType, MatrixType, 2, RhsCols> {
+  enum {
+    Dim = TransformType::Dim,
     HDim = TransformType::HDim,
     OtherRows = MatrixType::RowsAtCompileTime,
     OtherCols = MatrixType::ColsAtCompileTime
@@ -1293,58 +1313,78 @@ struct transform_right_product_impl< TransformType, MatrixType, 2 >
 
   typedef typename MatrixType::PlainObject ResultType;
 
-  static EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other)
-  {
-    EIGEN_STATIC_ASSERT(OtherRows==Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other) {
+    EIGEN_STATIC_ASSERT(OtherRows == Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
 
     typedef Block<ResultType, Dim, OtherCols, true> TopLeftLhs;
-    ResultType res(Replicate<typename TransformType::ConstTranslationPart, 1, OtherCols>(T.translation(),1,other.cols()));
+    ResultType res(
+        Replicate<typename TransformType::ConstTranslationPart, 1, OtherCols>(T.translation(), 1, other.cols()));
     TopLeftLhs(res, 0, 0, Dim, other.cols()).noalias() += T.linear() * other;
 
     return res;
   }
 };
 
+template <typename TransformType, typename MatrixType>
+struct transform_right_product_impl<TransformType, MatrixType, 2, 1>  // rhs is a vector of size Dim
+{
+  typedef typename TransformType::MatrixType TransformMatrix;
+  enum {
+    Dim = TransformType::Dim,
+    HDim = TransformType::HDim,
+    OtherRows = MatrixType::RowsAtCompileTime,
+    WorkingRows = plain_enum_min(TransformMatrix::RowsAtCompileTime, HDim)
+  };
+
+  typedef typename MatrixType::PlainObject ResultType;
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ResultType run(const TransformType& T, const MatrixType& other) {
+    EIGEN_STATIC_ASSERT(OtherRows == Dim, YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES);
+
+    Matrix<typename ResultType::Scalar, Dim + 1, 1> rhs;
+    rhs.template head<Dim>() = other;
+    rhs[Dim] = typename ResultType::Scalar(1);
+    Matrix<typename ResultType::Scalar, WorkingRows, 1> res(T.matrix() * rhs);
+    return res.template head<Dim>();
+  }
+};
+
 /**********************************************************
 ***   Specializations of operator* with lhs EigenBase   ***
 **********************************************************/
 
 // generic HDim x HDim matrix * T => Projective
-template<typename Other,int Mode, int Options, int Dim, int HDim>
-struct transform_left_product_impl<Other,Mode,Options,Dim,HDim, HDim,HDim>
-{
-  typedef Transform<typename Other::Scalar,Dim,Mode,Options> TransformType;
+template <typename Other, int Mode, int Options, int Dim, int HDim>
+struct transform_left_product_impl<Other, Mode, Options, Dim, HDim, HDim, HDim> {
+  typedef Transform<typename Other::Scalar, Dim, Mode, Options> TransformType;
   typedef typename TransformType::MatrixType MatrixType;
-  typedef Transform<typename Other::Scalar,Dim,Projective,Options> ResultType;
-  static ResultType run(const Other& other,const TransformType& tr)
-  { return ResultType(other * tr.matrix()); }
+  typedef Transform<typename Other::Scalar, Dim, Projective, Options> ResultType;
+  static EIGEN_DEVICE_FUNC ResultType run(const Other& other, const TransformType& tr) {
+    return ResultType(other * tr.matrix());
+  }
 };
 
 // generic HDim x HDim matrix * AffineCompact => Projective
-template<typename Other, int Options, int Dim, int HDim>
-struct transform_left_product_impl<Other,AffineCompact,Options,Dim,HDim, HDim,HDim>
-{
-  typedef Transform<typename Other::Scalar,Dim,AffineCompact,Options> TransformType;
+template <typename Other, int Options, int Dim, int HDim>
+struct transform_left_product_impl<Other, AffineCompact, Options, Dim, HDim, HDim, HDim> {
+  typedef Transform<typename Other::Scalar, Dim, AffineCompact, Options> TransformType;
   typedef typename TransformType::MatrixType MatrixType;
-  typedef Transform<typename Other::Scalar,Dim,Projective,Options> ResultType;
-  static ResultType run(const Other& other,const TransformType& tr)
-  {
+  typedef Transform<typename Other::Scalar, Dim, Projective, Options> ResultType;
+  static EIGEN_DEVICE_FUNC ResultType run(const Other& other, const TransformType& tr) {
     ResultType res;
-    res.matrix().noalias() = other.template block<HDim,Dim>(0,0) * tr.matrix();
+    res.matrix().noalias() = other.template block<HDim, Dim>(0, 0) * tr.matrix();
     res.matrix().col(Dim) += other.col(Dim);
     return res;
   }
 };
 
 // affine matrix * T
-template<typename Other,int Mode, int Options, int Dim, int HDim>
-struct transform_left_product_impl<Other,Mode,Options,Dim,HDim, Dim,HDim>
-{
-  typedef Transform<typename Other::Scalar,Dim,Mode,Options> TransformType;
+template <typename Other, int Mode, int Options, int Dim, int HDim>
+struct transform_left_product_impl<Other, Mode, Options, Dim, HDim, Dim, HDim> {
+  typedef Transform<typename Other::Scalar, Dim, Mode, Options> TransformType;
   typedef typename TransformType::MatrixType MatrixType;
   typedef TransformType ResultType;
-  static ResultType run(const Other& other,const TransformType& tr)
-  {
+  static EIGEN_DEVICE_FUNC ResultType run(const Other& other, const TransformType& tr) {
     ResultType res;
     res.affine().noalias() = other * tr.matrix();
     res.matrix().row(Dim) = tr.matrix().row(Dim);
@@ -1353,35 +1393,29 @@ struct transform_left_product_impl<Other,Mode,Options,Dim,HDim, Dim,HDim>
 };
 
 // affine matrix * AffineCompact
-template<typename Other, int Options, int Dim, int HDim>
-struct transform_left_product_impl<Other,AffineCompact,Options,Dim,HDim, Dim,HDim>
-{
-  typedef Transform<typename Other::Scalar,Dim,AffineCompact,Options> TransformType;
+template <typename Other, int Options, int Dim, int HDim>
+struct transform_left_product_impl<Other, AffineCompact, Options, Dim, HDim, Dim, HDim> {
+  typedef Transform<typename Other::Scalar, Dim, AffineCompact, Options> TransformType;
   typedef typename TransformType::MatrixType MatrixType;
   typedef TransformType ResultType;
-  static ResultType run(const Other& other,const TransformType& tr)
-  {
+  static EIGEN_DEVICE_FUNC ResultType run(const Other& other, const TransformType& tr) {
     ResultType res;
-    res.matrix().noalias() = other.template block<Dim,Dim>(0,0) * tr.matrix();
+    res.matrix().noalias() = other.template block<Dim, Dim>(0, 0) * tr.matrix();
     res.translation() += other.col(Dim);
     return res;
   }
 };
 
 // linear matrix * T
-template<typename Other,int Mode, int Options, int Dim, int HDim>
-struct transform_left_product_impl<Other,Mode,Options,Dim,HDim, Dim,Dim>
-{
-  typedef Transform<typename Other::Scalar,Dim,Mode,Options> TransformType;
+template <typename Other, int Mode, int Options, int Dim, int HDim>
+struct transform_left_product_impl<Other, Mode, Options, Dim, HDim, Dim, Dim> {
+  typedef Transform<typename Other::Scalar, Dim, Mode, Options> TransformType;
   typedef typename TransformType::MatrixType MatrixType;
   typedef TransformType ResultType;
-  static ResultType run(const Other& other, const TransformType& tr)
-  {
+  static EIGEN_DEVICE_FUNC ResultType run(const Other& other, const TransformType& tr) {
     TransformType res;
-    if(Mode!=int(AffineCompact))
-      res.matrix().row(Dim) = tr.matrix().row(Dim);
-    res.matrix().template topRows<Dim>().noalias()
-      = other * tr.matrix().template topRows<Dim>();
+    if (Mode != int(AffineCompact)) res.matrix().row(Dim) = tr.matrix().row(Dim);
+    res.matrix().template topRows<Dim>().noalias() = other * tr.matrix().template topRows<Dim>();
     return res;
   }
 };
@@ -1390,43 +1424,40 @@ struct transform_left_product_impl<Other,Mode,Options,Dim,HDim, Dim,Dim>
 *** Specializations of operator* with another Transform ***
 **********************************************************/
 
-template<typename Scalar, int Dim, int LhsMode, int LhsOptions, int RhsMode, int RhsOptions>
-struct transform_transform_product_impl<Transform<Scalar,Dim,LhsMode,LhsOptions>,Transform<Scalar,Dim,RhsMode,RhsOptions>,false >
-{
-  enum { ResultMode = transform_product_result<LhsMode,RhsMode>::Mode };
-  typedef Transform<Scalar,Dim,LhsMode,LhsOptions> Lhs;
-  typedef Transform<Scalar,Dim,RhsMode,RhsOptions> Rhs;
-  typedef Transform<Scalar,Dim,ResultMode,LhsOptions> ResultType;
-  static ResultType run(const Lhs& lhs, const Rhs& rhs)
-  {
+template <typename Scalar, int Dim, int LhsMode, int LhsOptions, int RhsMode, int RhsOptions>
+struct transform_transform_product_impl<Transform<Scalar, Dim, LhsMode, LhsOptions>,
+                                        Transform<Scalar, Dim, RhsMode, RhsOptions>, false> {
+  enum { ResultMode = transform_product_result<LhsMode, RhsMode>::Mode };
+  typedef Transform<Scalar, Dim, LhsMode, LhsOptions> Lhs;
+  typedef Transform<Scalar, Dim, RhsMode, RhsOptions> Rhs;
+  typedef Transform<Scalar, Dim, ResultMode, LhsOptions> ResultType;
+  static EIGEN_DEVICE_FUNC ResultType run(const Lhs& lhs, const Rhs& rhs) {
     ResultType res;
-    res.linear() = lhs.linear() * rhs.linear();
+    res.linear().noalias() = lhs.linear() * rhs.linear();
     res.translation() = lhs.linear() * rhs.translation() + lhs.translation();
     res.makeAffine();
     return res;
   }
 };
 
-template<typename Scalar, int Dim, int LhsMode, int LhsOptions, int RhsMode, int RhsOptions>
-struct transform_transform_product_impl<Transform<Scalar,Dim,LhsMode,LhsOptions>,Transform<Scalar,Dim,RhsMode,RhsOptions>,true >
-{
-  typedef Transform<Scalar,Dim,LhsMode,LhsOptions> Lhs;
-  typedef Transform<Scalar,Dim,RhsMode,RhsOptions> Rhs;
-  typedef Transform<Scalar,Dim,Projective> ResultType;
-  static ResultType run(const Lhs& lhs, const Rhs& rhs)
-  {
-    return ResultType( lhs.matrix() * rhs.matrix() );
+template <typename Scalar, int Dim, int LhsMode, int LhsOptions, int RhsMode, int RhsOptions>
+struct transform_transform_product_impl<Transform<Scalar, Dim, LhsMode, LhsOptions>,
+                                        Transform<Scalar, Dim, RhsMode, RhsOptions>, true> {
+  typedef Transform<Scalar, Dim, LhsMode, LhsOptions> Lhs;
+  typedef Transform<Scalar, Dim, RhsMode, RhsOptions> Rhs;
+  typedef Transform<Scalar, Dim, Projective> ResultType;
+  static EIGEN_DEVICE_FUNC ResultType run(const Lhs& lhs, const Rhs& rhs) {
+    return ResultType(lhs.matrix() * rhs.matrix());
   }
 };
 
-template<typename Scalar, int Dim, int LhsOptions, int RhsOptions>
-struct transform_transform_product_impl<Transform<Scalar,Dim,AffineCompact,LhsOptions>,Transform<Scalar,Dim,Projective,RhsOptions>,true >
-{
-  typedef Transform<Scalar,Dim,AffineCompact,LhsOptions> Lhs;
-  typedef Transform<Scalar,Dim,Projective,RhsOptions> Rhs;
-  typedef Transform<Scalar,Dim,Projective> ResultType;
-  static ResultType run(const Lhs& lhs, const Rhs& rhs)
-  {
+template <typename Scalar, int Dim, int LhsOptions, int RhsOptions>
+struct transform_transform_product_impl<Transform<Scalar, Dim, AffineCompact, LhsOptions>,
+                                        Transform<Scalar, Dim, Projective, RhsOptions>, true> {
+  typedef Transform<Scalar, Dim, AffineCompact, LhsOptions> Lhs;
+  typedef Transform<Scalar, Dim, Projective, RhsOptions> Rhs;
+  typedef Transform<Scalar, Dim, Projective> ResultType;
+  static EIGEN_DEVICE_FUNC ResultType run(const Lhs& lhs, const Rhs& rhs) {
     ResultType res;
     res.matrix().template topRows<Dim>() = lhs.matrix() * rhs.matrix();
     res.matrix().row(Dim) = rhs.matrix().row(Dim);
@@ -1434,22 +1465,21 @@ struct transform_transform_product_impl<Transform<Scalar,Dim,AffineCompact,LhsOp
   }
 };
 
-template<typename Scalar, int Dim, int LhsOptions, int RhsOptions>
-struct transform_transform_product_impl<Transform<Scalar,Dim,Projective,LhsOptions>,Transform<Scalar,Dim,AffineCompact,RhsOptions>,true >
-{
-  typedef Transform<Scalar,Dim,Projective,LhsOptions> Lhs;
-  typedef Transform<Scalar,Dim,AffineCompact,RhsOptions> Rhs;
-  typedef Transform<Scalar,Dim,Projective> ResultType;
-  static ResultType run(const Lhs& lhs, const Rhs& rhs)
-  {
+template <typename Scalar, int Dim, int LhsOptions, int RhsOptions>
+struct transform_transform_product_impl<Transform<Scalar, Dim, Projective, LhsOptions>,
+                                        Transform<Scalar, Dim, AffineCompact, RhsOptions>, true> {
+  typedef Transform<Scalar, Dim, Projective, LhsOptions> Lhs;
+  typedef Transform<Scalar, Dim, AffineCompact, RhsOptions> Rhs;
+  typedef Transform<Scalar, Dim, Projective> ResultType;
+  static EIGEN_DEVICE_FUNC ResultType run(const Lhs& lhs, const Rhs& rhs) {
     ResultType res(lhs.matrix().template leftCols<Dim>() * rhs.matrix());
     res.matrix().col(Dim) += lhs.matrix().col(Dim);
     return res;
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_TRANSFORM_H
+#endif  // EIGEN_TRANSFORM_H
diff --git a/inst/include/Eigen/src/Geometry/Translation.h b/inst/include/Eigen/src/Geometry/Translation.h
index 7fda179c..d942ac89 100644
--- a/inst/include/Eigen/src/Geometry/Translation.h
+++ b/inst/include/Eigen/src/Geometry/Translation.h
@@ -10,197 +10,195 @@
 #ifndef EIGEN_TRANSLATION_H
 #define EIGEN_TRANSLATION_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \geometry_module \ingroup Geometry_Module
-  *
-  * \class Translation
-  *
-  * \brief Represents a translation transformation
-  *
-  * \param _Scalar the scalar type, i.e., the type of the coefficients.
-  * \param _Dim the  dimension of the space, can be a compile time value or Dynamic
-  *
-  * \note This class is not aimed to be used to store a translation transformation,
-  * but rather to make easier the constructions and updates of Transform objects.
-  *
-  * \sa class Scaling, class Transform
-  */
-template<typename _Scalar, int _Dim>
-class Translation
-{
-public:
-  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Dim)
+ *
+ * \class Translation
+ *
+ * \brief Represents a translation transformation
+ *
+ * \tparam Scalar_ the scalar type, i.e., the type of the coefficients.
+ * \tparam Dim_ the  dimension of the space, can be a compile time value or Dynamic
+ *
+ * \note This class is not aimed to be used to store a translation transformation,
+ * but rather to make easier the constructions and updates of Transform objects.
+ *
+ * \sa class Scaling, class Transform
+ */
+template <typename Scalar_, int Dim_>
+class Translation {
+ public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar_, Dim_)
   /** dimension of the space */
-  enum { Dim = _Dim };
+  enum { Dim = Dim_ };
   /** the scalar type of the coefficients */
-  typedef _Scalar Scalar;
+  typedef Scalar_ Scalar;
   /** corresponding vector type */
-  typedef Matrix<Scalar,Dim,1> VectorType;
+  typedef Matrix<Scalar, Dim, 1> VectorType;
   /** corresponding linear transformation matrix type */
-  typedef Matrix<Scalar,Dim,Dim> LinearMatrixType;
+  typedef Matrix<Scalar, Dim, Dim> LinearMatrixType;
   /** corresponding affine transformation type */
-  typedef Transform<Scalar,Dim,Affine> AffineTransformType;
+  typedef Transform<Scalar, Dim, Affine> AffineTransformType;
   /** corresponding isometric transformation type */
-  typedef Transform<Scalar,Dim,Isometry> IsometryTransformType;
-
-protected:
+  typedef Transform<Scalar, Dim, Isometry> IsometryTransformType;
 
+ protected:
   VectorType m_coeffs;
 
-public:
-
+ public:
   /** Default constructor without initialization. */
-  Translation() {}
+  EIGEN_DEVICE_FUNC Translation() {}
   /**  */
-  inline Translation(const Scalar& sx, const Scalar& sy)
-  {
-    eigen_assert(Dim==2);
+  EIGEN_DEVICE_FUNC inline Translation(const Scalar& sx, const Scalar& sy) {
+    eigen_assert(Dim == 2);
     m_coeffs.x() = sx;
     m_coeffs.y() = sy;
   }
   /**  */
-  inline Translation(const Scalar& sx, const Scalar& sy, const Scalar& sz)
-  {
-    eigen_assert(Dim==3);
+  EIGEN_DEVICE_FUNC inline Translation(const Scalar& sx, const Scalar& sy, const Scalar& sz) {
+    eigen_assert(Dim == 3);
     m_coeffs.x() = sx;
     m_coeffs.y() = sy;
     m_coeffs.z() = sz;
   }
   /** Constructs and initialize the translation transformation from a vector of translation coefficients */
-  explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {}
+  EIGEN_DEVICE_FUNC explicit inline Translation(const VectorType& vector) : m_coeffs(vector) {}
 
-  /** \brief Retruns the x-translation by value. **/
-  inline Scalar x() const { return m_coeffs.x(); }
-  /** \brief Retruns the y-translation by value. **/
-  inline Scalar y() const { return m_coeffs.y(); }
-  /** \brief Retruns the z-translation by value. **/
-  inline Scalar z() const { return m_coeffs.z(); }
+  /** \brief Returns the x-translation by value. **/
+  EIGEN_DEVICE_FUNC constexpr Scalar x() const { return m_coeffs.x(); }
+  /** \brief Returns the y-translation by value. **/
+  EIGEN_DEVICE_FUNC constexpr Scalar y() const { return m_coeffs.y(); }
+  /** \brief Returns the z-translation by value. **/
+  EIGEN_DEVICE_FUNC constexpr Scalar z() const { return m_coeffs.z(); }
 
-  /** \brief Retruns the x-translation as a reference. **/
-  inline Scalar& x() { return m_coeffs.x(); }
-  /** \brief Retruns the y-translation as a reference. **/
-  inline Scalar& y() { return m_coeffs.y(); }
-  /** \brief Retruns the z-translation as a reference. **/
-  inline Scalar& z() { return m_coeffs.z(); }
+  /** \brief Returns the x-translation as a reference. **/
+  EIGEN_DEVICE_FUNC constexpr Scalar& x() { return m_coeffs.x(); }
+  /** \brief Returns the y-translation as a reference. **/
+  EIGEN_DEVICE_FUNC constexpr Scalar& y() { return m_coeffs.y(); }
+  /** \brief Returns the z-translation as a reference. **/
+  EIGEN_DEVICE_FUNC constexpr Scalar& z() { return m_coeffs.z(); }
 
-  const VectorType& vector() const { return m_coeffs; }
-  VectorType& vector() { return m_coeffs; }
+  EIGEN_DEVICE_FUNC const VectorType& vector() const { return m_coeffs; }
+  EIGEN_DEVICE_FUNC VectorType& vector() { return m_coeffs; }
 
-  const VectorType& translation() const { return m_coeffs; }
-  VectorType& translation() { return m_coeffs; }
+  EIGEN_DEVICE_FUNC const VectorType& translation() const { return m_coeffs; }
+  EIGEN_DEVICE_FUNC VectorType& translation() { return m_coeffs; }
 
   /** Concatenates two translation */
-  inline Translation operator* (const Translation& other) const
-  { return Translation(m_coeffs + other.m_coeffs); }
+  EIGEN_DEVICE_FUNC inline Translation operator*(const Translation& other) const {
+    return Translation(m_coeffs + other.m_coeffs);
+  }
 
   /** Concatenates a translation and a uniform scaling */
-  inline AffineTransformType operator* (const UniformScaling<Scalar>& other) const;
+  EIGEN_DEVICE_FUNC inline AffineTransformType operator*(const UniformScaling<Scalar>& other) const;
 
   /** Concatenates a translation and a linear transformation */
-  template<typename OtherDerived>
-  inline AffineTransformType operator* (const EigenBase<OtherDerived>& linear) const;
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC inline AffineTransformType operator*(const EigenBase<OtherDerived>& linear) const;
 
   /** Concatenates a translation and a rotation */
-  template<typename Derived>
-  inline IsometryTransformType operator*(const RotationBase<Derived,Dim>& r) const
-  { return *this * IsometryTransformType(r); }
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC inline IsometryTransformType operator*(const RotationBase<Derived, Dim>& r) const {
+    return *this * IsometryTransformType(r);
+  }
 
   /** \returns the concatenation of a linear transformation \a l with the translation \a t */
   // its a nightmare to define a templated friend function outside its declaration
-  template<typename OtherDerived> friend
-  inline AffineTransformType operator*(const EigenBase<OtherDerived>& linear, const Translation& t)
-  {
+  template <typename OtherDerived>
+  friend EIGEN_DEVICE_FUNC inline AffineTransformType operator*(const EigenBase<OtherDerived>& linear,
+                                                                const Translation& t) {
     AffineTransformType res;
     res.matrix().setZero();
     res.linear() = linear.derived();
     res.translation() = linear.derived() * t.m_coeffs;
     res.matrix().row(Dim).setZero();
-    res(Dim,Dim) = Scalar(1);
+    res(Dim, Dim) = Scalar(1);
     return res;
   }
 
   /** Concatenates a translation and a transformation */
-  template<int Mode, int Options>
-  inline Transform<Scalar,Dim,Mode> operator* (const Transform<Scalar,Dim,Mode,Options>& t) const
-  {
-    Transform<Scalar,Dim,Mode> res = t;
+  template <int Mode, int Options>
+  EIGEN_DEVICE_FUNC inline Transform<Scalar, Dim, Mode> operator*(
+      const Transform<Scalar, Dim, Mode, Options>& t) const {
+    Transform<Scalar, Dim, Mode> res = t;
     res.pretranslate(m_coeffs);
     return res;
   }
 
   /** Applies translation to vector */
-  inline VectorType operator* (const VectorType& other) const
-  { return m_coeffs + other; }
+  template <typename Derived>
+  inline std::enable_if_t<Derived::IsVectorAtCompileTime, VectorType> operator*(const MatrixBase<Derived>& vec) const {
+    return m_coeffs + vec.derived();
+  }
 
   /** \returns the inverse translation (opposite) */
   Translation inverse() const { return Translation(-m_coeffs); }
 
-  Translation& operator=(const Translation& other)
-  {
-    m_coeffs = other.m_coeffs;
-    return *this;
-  }
-
   static const Translation Identity() { return Translation(VectorType::Zero()); }
 
   /** \returns \c *this with scalar type casted to \a NewScalarType
-    *
-    * Note that if \a NewScalarType is equal to the current scalar type of \c *this
-    * then this function smartly returns a const reference to \c *this.
-    */
-  template<typename NewScalarType>
-  inline typename internal::cast_return_type<Translation,Translation<NewScalarType,Dim> >::type cast() const
-  { return typename internal::cast_return_type<Translation,Translation<NewScalarType,Dim> >::type(*this); }
+   *
+   * Note that if \a NewScalarType is equal to the current scalar type of \c *this
+   * then this function smartly returns a const reference to \c *this.
+   */
+  template <typename NewScalarType>
+  EIGEN_DEVICE_FUNC inline typename internal::cast_return_type<Translation, Translation<NewScalarType, Dim> >::type
+  cast() const {
+    return typename internal::cast_return_type<Translation, Translation<NewScalarType, Dim> >::type(*this);
+  }
 
   /** Copy constructor with scalar type conversion */
-  template<typename OtherScalarType>
-  inline explicit Translation(const Translation<OtherScalarType,Dim>& other)
-  { m_coeffs = other.vector().template cast<Scalar>(); }
+  template <typename OtherScalarType>
+  EIGEN_DEVICE_FUNC inline explicit Translation(const Translation<OtherScalarType, Dim>& other) {
+    m_coeffs = other.vector().template cast<Scalar>();
+  }
 
   /** \returns \c true if \c *this is approximately equal to \a other, within the precision
-    * determined by \a prec.
-    *
-    * \sa MatrixBase::isApprox() */
-  bool isApprox(const Translation& other, typename NumTraits<Scalar>::Real prec = NumTraits<Scalar>::dummy_precision()) const
-  { return m_coeffs.isApprox(other.m_coeffs, prec); }
-
+   * determined by \a prec.
+   *
+   * \sa MatrixBase::isApprox() */
+  EIGEN_DEVICE_FUNC bool isApprox(const Translation& other, const typename NumTraits<Scalar>::Real& prec =
+                                                                NumTraits<Scalar>::dummy_precision()) const {
+    return m_coeffs.isApprox(other.m_coeffs, prec);
+  }
 };
 
 /** \addtogroup Geometry_Module */
 //@{
 typedef Translation<float, 2> Translation2f;
-typedef Translation<double,2> Translation2d;
+typedef Translation<double, 2> Translation2d;
 typedef Translation<float, 3> Translation3f;
-typedef Translation<double,3> Translation3d;
+typedef Translation<double, 3> Translation3d;
 //@}
 
-template<typename Scalar, int Dim>
-inline typename Translation<Scalar,Dim>::AffineTransformType
-Translation<Scalar,Dim>::operator* (const UniformScaling<Scalar>& other) const
-{
+template <typename Scalar, int Dim>
+EIGEN_DEVICE_FUNC inline typename Translation<Scalar, Dim>::AffineTransformType Translation<Scalar, Dim>::operator*(
+    const UniformScaling<Scalar>& other) const {
   AffineTransformType res;
   res.matrix().setZero();
   res.linear().diagonal().fill(other.factor());
   res.translation() = m_coeffs;
-  res(Dim,Dim) = Scalar(1);
+  res(Dim, Dim) = Scalar(1);
   return res;
 }
 
-template<typename Scalar, int Dim>
-template<typename OtherDerived>
-inline typename Translation<Scalar,Dim>::AffineTransformType
-Translation<Scalar,Dim>::operator* (const EigenBase<OtherDerived>& linear) const
-{
+template <typename Scalar, int Dim>
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline typename Translation<Scalar, Dim>::AffineTransformType Translation<Scalar, Dim>::operator*(
+    const EigenBase<OtherDerived>& linear) const {
   AffineTransformType res;
   res.matrix().setZero();
   res.linear() = linear.derived();
   res.translation() = m_coeffs;
   res.matrix().row(Dim).setZero();
-  res(Dim,Dim) = Scalar(1);
+  res(Dim, Dim) = Scalar(1);
   return res;
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_TRANSLATION_H
+#endif  // EIGEN_TRANSLATION_H
diff --git a/inst/include/Eigen/src/Geometry/Umeyama.h b/inst/include/Eigen/src/Geometry/Umeyama.h
index 5e20662f..8ed63449 100644
--- a/inst/include/Eigen/src/Geometry/Umeyama.h
+++ b/inst/include/Eigen/src/Geometry/Umeyama.h
@@ -10,15 +10,16 @@
 #ifndef EIGEN_UMEYAMA_H
 #define EIGEN_UMEYAMA_H
 
-// This file requires the user to include 
+// This file requires the user to include
 // * Eigen/Core
-// * Eigen/LU 
+// * Eigen/LU
 // * Eigen/SVD
 // * Eigen/Array
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
+namespace Eigen {
 
 // These helpers are required since it allows to use mixed types as parameters
 // for the Umeyama. The problem with mixed parameters is that the return type
@@ -28,89 +29,83 @@ namespace internal {
 // Compile time return type deduction for different MatrixBase types.
 // Different means here different alignment and parameters but the same underlying
 // real scalar type.
-template<typename MatrixType, typename OtherMatrixType>
-struct umeyama_transform_matrix_type
-{
+template <typename MatrixType, typename OtherMatrixType>
+struct umeyama_transform_matrix_type {
   enum {
-    MinRowsAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(MatrixType::RowsAtCompileTime, OtherMatrixType::RowsAtCompileTime),
+    MinRowsAtCompileTime =
+        internal::min_size_prefer_dynamic(MatrixType::RowsAtCompileTime, OtherMatrixType::RowsAtCompileTime),
 
     // When possible we want to choose some small fixed size value since the result
-    // is likely to fit on the stack. So here, EIGEN_SIZE_MIN_PREFER_DYNAMIC is not what we want.
-    HomogeneousDimension = int(MinRowsAtCompileTime) == Dynamic ? Dynamic : int(MinRowsAtCompileTime)+1
+    // is likely to fit on the stack. So here, min_size_prefer_dynamic is not what we want.
+    HomogeneousDimension = int(MinRowsAtCompileTime) == Dynamic ? Dynamic : int(MinRowsAtCompileTime) + 1
   };
 
-  typedef Matrix<typename traits<MatrixType>::Scalar,
-    HomogeneousDimension,
-    HomogeneousDimension,
-    AutoAlign | (traits<MatrixType>::Flags & RowMajorBit ? RowMajor : ColMajor),
-    HomogeneousDimension,
-    HomogeneousDimension
-  > type;
+  typedef Matrix<typename traits<MatrixType>::Scalar, HomogeneousDimension, HomogeneousDimension,
+                 AutoAlign | (traits<MatrixType>::Flags & RowMajorBit ? RowMajor : ColMajor), HomogeneousDimension,
+                 HomogeneousDimension>
+      type;
 };
 
-}
-
-#endif
+}  // namespace internal
 
 /**
-* \geometry_module \ingroup Geometry_Module
-*
-* \brief Returns the transformation between two point sets.
-*
-* The algorithm is based on:
-* "Least-squares estimation of transformation parameters between two point patterns",
-* Shinji Umeyama, PAMI 1991, DOI: 10.1109/34.88573
-*
-* It estimates parameters \f$ c, \mathbf{R}, \f$ and \f$ \mathbf{t} \f$ such that
-* \f{align*}
-*   \frac{1}{n} \sum_{i=1}^n \vert\vert y_i - (c\mathbf{R}x_i + \mathbf{t}) \vert\vert_2^2
-* \f}
-* is minimized.
-*
-* The algorithm is based on the analysis of the covariance matrix
-* \f$ \Sigma_{\mathbf{x}\mathbf{y}} \in \mathbb{R}^{d \times d} \f$
-* of the input point sets \f$ \mathbf{x} \f$ and \f$ \mathbf{y} \f$ where 
-* \f$d\f$ is corresponding to the dimension (which is typically small).
-* The analysis is involving the SVD having a complexity of \f$O(d^3)\f$
-* though the actual computational effort lies in the covariance
-* matrix computation which has an asymptotic lower bound of \f$O(dm)\f$ when 
-* the input point sets have dimension \f$d \times m\f$.
-*
-* Currently the method is working only for floating point matrices.
-*
-* \todo Should the return type of umeyama() become a Transform?
-*
-* \param src Source points \f$ \mathbf{x} = \left( x_1, \hdots, x_n \right) \f$.
-* \param dst Destination points \f$ \mathbf{y} = \left( y_1, \hdots, y_n \right) \f$.
-* \param with_scaling Sets \f$ c=1 \f$ when <code>false</code> is passed.
-* \return The homogeneous transformation 
-* \f{align*}
-*   T = \begin{bmatrix} c\mathbf{R} & \mathbf{t} \\ \mathbf{0} & 1 \end{bmatrix}
-* \f}
-* minimizing the resudiual above. This transformation is always returned as an 
-* Eigen::Matrix.
-*/
+ * \geometry_module \ingroup Geometry_Module
+ *
+ * \brief Returns the transformation between two point sets.
+ *
+ * The algorithm is based on:
+ * "Least-squares estimation of transformation parameters between two point patterns",
+ * Shinji Umeyama, PAMI 1991, DOI: 10.1109/34.88573
+ *
+ * It estimates parameters \f$ c, \mathbf{R}, \f$ and \f$ \mathbf{t} \f$ such that
+ * \f{align*}
+ *   \frac{1}{n} \sum_{i=1}^n \vert\vert y_i - (c\mathbf{R}x_i + \mathbf{t}) \vert\vert_2^2
+ * \f}
+ * is minimized.
+ *
+ * The algorithm is based on the analysis of the covariance matrix
+ * \f$ \Sigma_{\mathbf{x}\mathbf{y}} \in \mathbb{R}^{d \times d} \f$
+ * of the input point sets \f$ \mathbf{x} \f$ and \f$ \mathbf{y} \f$ where
+ * \f$d\f$ is corresponding to the dimension (which is typically small).
+ * The analysis is involving the SVD having a complexity of \f$O(d^3)\f$
+ * though the actual computational effort lies in the covariance
+ * matrix computation which has an asymptotic lower bound of \f$O(dm)\f$ when
+ * the input point sets have dimension \f$d \times m\f$.
+ *
+ * Currently the method is working only for floating point matrices.
+ *
+ * \todo Should the return type of umeyama() become a Transform?
+ *
+ * \param src Source points \f$ \mathbf{x} = \left( x_1, \hdots, x_n \right) \f$.
+ * \param dst Destination points \f$ \mathbf{y} = \left( y_1, \hdots, y_n \right) \f$.
+ * \param with_scaling Sets \f$ c=1 \f$ when <code>false</code> is passed.
+ * \return The homogeneous transformation
+ * \f{align*}
+ *   T = \begin{bmatrix} c\mathbf{R} & \mathbf{t} \\ \mathbf{0} & 1 \end{bmatrix}
+ * \f}
+ * minimizing the residual above. This transformation is always returned as an
+ * Eigen::Matrix.
+ */
 template <typename Derived, typename OtherDerived>
-typename internal::umeyama_transform_matrix_type<Derived, OtherDerived>::type
-umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, bool with_scaling = true)
-{
+typename internal::umeyama_transform_matrix_type<Derived, OtherDerived>::type umeyama(
+    const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, bool with_scaling = true) {
   typedef typename internal::umeyama_transform_matrix_type<Derived, OtherDerived>::type TransformationMatrixType;
   typedef typename internal::traits<TransformationMatrixType>::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real RealScalar;
-  typedef typename Derived::Index Index;
 
   EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL)
-  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename internal::traits<OtherDerived>::Scalar>::value),
-    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+  EIGEN_STATIC_ASSERT(
+      (internal::is_same<Scalar, typename internal::traits<OtherDerived>::Scalar>::value),
+      YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
 
-  enum { Dimension = EIGEN_SIZE_MIN_PREFER_DYNAMIC(Derived::RowsAtCompileTime, OtherDerived::RowsAtCompileTime) };
+  enum { Dimension = internal::min_size_prefer_dynamic(Derived::RowsAtCompileTime, OtherDerived::RowsAtCompileTime) };
 
   typedef Matrix<Scalar, Dimension, 1> VectorType;
   typedef Matrix<Scalar, Dimension, Dimension> MatrixType;
   typedef typename internal::plain_matrix_type_row_major<Derived>::type RowMajorMatrixType;
 
-  const Index m = src.rows(); // dimension
-  const Index n = src.cols(); // number of measurements
+  const Index m = src.rows();  // dimension
+  const Index n = src.cols();  // number of measurements
 
   // required for demeaning ...
   const RealScalar one_over_n = RealScalar(1) / static_cast<RealScalar>(n);
@@ -123,55 +118,44 @@ umeyama(const MatrixBase<Derived>& src, const MatrixBase<OtherDerived>& dst, boo
   const RowMajorMatrixType src_demean = src.colwise() - src_mean;
   const RowMajorMatrixType dst_demean = dst.colwise() - dst_mean;
 
-  // Eq. (36)-(37)
-  const Scalar src_var = src_demean.rowwise().squaredNorm().sum() * one_over_n;
-
   // Eq. (38)
   const MatrixType sigma = one_over_n * dst_demean * src_demean.transpose();
 
-  JacobiSVD<MatrixType> svd(sigma, ComputeFullU | ComputeFullV);
+  JacobiSVD<MatrixType, ComputeFullU | ComputeFullV> svd(sigma);
 
   // Initialize the resulting transformation with an identity matrix...
-  TransformationMatrixType Rt = TransformationMatrixType::Identity(m+1,m+1);
+  TransformationMatrixType Rt = TransformationMatrixType::Identity(m + 1, m + 1);
 
   // Eq. (39)
   VectorType S = VectorType::Ones(m);
-  if (sigma.determinant()<Scalar(0)) S(m-1) = Scalar(-1);
 
-  // Eq. (40) and (43)
-  const VectorType& d = svd.singularValues();
-  Index rank = 0; for (Index i=0; i<m; ++i) if (!internal::isMuchSmallerThan(d.coeff(i),d.coeff(0))) ++rank;
-  if (rank == m-1) {
-    if ( svd.matrixU().determinant() * svd.matrixV().determinant() > Scalar(0) ) {
-      Rt.block(0,0,m,m).noalias() = svd.matrixU()*svd.matrixV().transpose();
-    } else {
-      const Scalar s = S(m-1); S(m-1) = Scalar(-1);
-      Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose();
-      S(m-1) = s;
-    }
-  } else {
-    Rt.block(0,0,m,m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose();
+  if (svd.matrixU().determinant() * svd.matrixV().determinant() < 0) {
+    Index tmp = m - 1;
+    S(tmp) = -1;
   }
 
-  if (with_scaling)
-  {
+  // Eq. (40) and (43)
+  Rt.block(0, 0, m, m).noalias() = svd.matrixU() * S.asDiagonal() * svd.matrixV().transpose();
+
+  if (with_scaling) {
+    // Eq. (36)-(37)
+    const Scalar src_var = src_demean.rowwise().squaredNorm().sum() * one_over_n;
+
     // Eq. (42)
-    const Scalar c = Scalar(1)/src_var * svd.singularValues().dot(S);
+    const Scalar c = Scalar(1) / src_var * svd.singularValues().dot(S);
 
     // Eq. (41)
     Rt.col(m).head(m) = dst_mean;
-    Rt.col(m).head(m).noalias() -= c*Rt.topLeftCorner(m,m)*src_mean;
-    Rt.block(0,0,m,m) *= c;
-  }
-  else
-  {
+    Rt.col(m).head(m).noalias() -= c * Rt.topLeftCorner(m, m) * src_mean;
+    Rt.block(0, 0, m, m) *= c;
+  } else {
     Rt.col(m).head(m) = dst_mean;
-    Rt.col(m).head(m).noalias() -= Rt.topLeftCorner(m,m)*src_mean;
+    Rt.col(m).head(m).noalias() -= Rt.topLeftCorner(m, m) * src_mean;
   }
 
   return Rt;
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_UMEYAMA_H
+#endif  // EIGEN_UMEYAMA_H
diff --git a/inst/include/Eigen/src/Geometry/arch/Geometry_SIMD.h b/inst/include/Eigen/src/Geometry/arch/Geometry_SIMD.h
new file mode 100644
index 00000000..5601a473
--- /dev/null
+++ b/inst/include/Eigen/src/Geometry/arch/Geometry_SIMD.h
@@ -0,0 +1,154 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Rohit Garg <rpg.314@gmail.com>
+// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GEOMETRY_SIMD_H
+#define EIGEN_GEOMETRY_SIMD_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <class Derived, class OtherDerived>
+struct quat_product<Architecture::Target, Derived, OtherDerived, float> {
+  enum {
+    AAlignment = traits<Derived>::Alignment,
+    BAlignment = traits<OtherDerived>::Alignment,
+    ResAlignment = traits<Quaternion<float> >::Alignment
+  };
+  static inline Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b) {
+    evaluator<typename Derived::Coefficients> ae(_a.coeffs());
+    evaluator<typename OtherDerived::Coefficients> be(_b.coeffs());
+    Quaternion<float> res;
+    const float neg_zero = numext::bit_cast<float>(0x80000000u);
+    const float arr[4] = {0.f, 0.f, 0.f, neg_zero};
+    const Packet4f mask = ploadu<Packet4f>(arr);
+    Packet4f a = ae.template packet<AAlignment, Packet4f>(0);
+    Packet4f b = be.template packet<BAlignment, Packet4f>(0);
+    Packet4f s1 = pmul(vec4f_swizzle1(a, 1, 2, 0, 2), vec4f_swizzle1(b, 2, 0, 1, 2));
+    Packet4f s2 = pmul(vec4f_swizzle1(a, 3, 3, 3, 1), vec4f_swizzle1(b, 0, 1, 2, 1));
+    pstoret<float, Packet4f, ResAlignment>(
+        &res.x(), padd(psub(pmul(a, vec4f_swizzle1(b, 3, 3, 3, 3)),
+                            pmul(vec4f_swizzle1(a, 2, 0, 1, 0), vec4f_swizzle1(b, 1, 2, 0, 0))),
+                       pxor(mask, padd(s1, s2))));
+
+    return res;
+  }
+};
+
+template <class Derived>
+struct quat_conj<Architecture::Target, Derived, float> {
+  enum { ResAlignment = traits<Quaternion<float> >::Alignment };
+  static inline Quaternion<float> run(const QuaternionBase<Derived>& q) {
+    evaluator<typename Derived::Coefficients> qe(q.coeffs());
+    Quaternion<float> res;
+    const float neg_zero = numext::bit_cast<float>(0x80000000u);
+    const float arr[4] = {neg_zero, neg_zero, neg_zero, 0.f};
+    const Packet4f mask = ploadu<Packet4f>(arr);
+    pstoret<float, Packet4f, ResAlignment>(&res.x(),
+                                           pxor(mask, qe.template packet<traits<Derived>::Alignment, Packet4f>(0)));
+    return res;
+  }
+};
+
+template <typename VectorLhs, typename VectorRhs>
+struct cross3_impl<Architecture::Target, VectorLhs, VectorRhs, float, true> {
+  using DstPlainType = typename plain_matrix_type<VectorLhs>::type;
+  static constexpr int DstAlignment = evaluator<DstPlainType>::Alignment;
+  static constexpr int LhsAlignment = evaluator<VectorLhs>::Alignment;
+  static constexpr int RhsAlignment = evaluator<VectorRhs>::Alignment;
+  static inline DstPlainType run(const VectorLhs& lhs, const VectorRhs& rhs) {
+    evaluator<VectorLhs> lhs_eval(lhs);
+    evaluator<VectorRhs> rhs_eval(rhs);
+    Packet4f a = lhs_eval.template packet<LhsAlignment, Packet4f>(0);
+    Packet4f b = rhs_eval.template packet<RhsAlignment, Packet4f>(0);
+    Packet4f mul1 = pmul(vec4f_swizzle1(a, 1, 2, 0, 3), vec4f_swizzle1(b, 2, 0, 1, 3));
+    Packet4f mul2 = pmul(vec4f_swizzle1(a, 2, 0, 1, 3), vec4f_swizzle1(b, 1, 2, 0, 3));
+    DstPlainType res;
+    pstoret<float, Packet4f, DstAlignment>(res.data(), psub(mul1, mul2));
+    // Ensure last component is 0 in case original a or b contain inf/nan.
+    res[3] = 0.0f;
+    return res;
+  }
+};
+
+#if (defined EIGEN_VECTORIZE_SSE) || (EIGEN_ARCH_ARM64)
+
+template <class Derived, class OtherDerived>
+struct quat_product<Architecture::Target, Derived, OtherDerived, double> {
+  enum { BAlignment = traits<OtherDerived>::Alignment, ResAlignment = traits<Quaternion<double> >::Alignment };
+
+  static inline Quaternion<double> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b) {
+    Quaternion<double> res;
+
+    evaluator<typename Derived::Coefficients> ae(_a.coeffs());
+    evaluator<typename OtherDerived::Coefficients> be(_b.coeffs());
+
+    const double* a = _a.coeffs().data();
+    Packet2d b_xy = be.template packet<BAlignment, Packet2d>(0);
+    Packet2d b_zw = be.template packet<BAlignment, Packet2d>(2);
+    Packet2d a_xx = pset1<Packet2d>(a[0]);
+    Packet2d a_yy = pset1<Packet2d>(a[1]);
+    Packet2d a_zz = pset1<Packet2d>(a[2]);
+    Packet2d a_ww = pset1<Packet2d>(a[3]);
+
+    // two temporaries:
+    Packet2d t1, t2;
+
+    /*
+     * t1 = ww*xy + yy*zw
+     * t2 = zz*xy - xx*zw
+     * res.xy = t1 +/- swap(t2)
+     */
+    t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw));
+    t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw));
+    pstoret<double, Packet2d, ResAlignment>(&res.x(), paddsub(t1, preverse(t2)));
+
+    /*
+     * t1 = ww*zw - yy*xy
+     * t2 = zz*zw + xx*xy
+     * res.zw = t1 -/+ swap(t2) = swap( swap(t1) +/- t2)
+     */
+    t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy));
+    t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy));
+    pstoret<double, Packet2d, ResAlignment>(&res.z(), preverse(paddsub(preverse(t1), t2)));
+
+    return res;
+  }
+};
+
+template <class Derived>
+struct quat_conj<Architecture::Target, Derived, double> {
+  enum { ResAlignment = traits<Quaternion<double> >::Alignment };
+  static inline Quaternion<double> run(const QuaternionBase<Derived>& q) {
+    evaluator<typename Derived::Coefficients> qe(q.coeffs());
+    Quaternion<double> res;
+    const double neg_zero = numext::bit_cast<double>(0x8000000000000000ull);
+    const double arr1[2] = {neg_zero, neg_zero};
+    const double arr2[2] = {neg_zero, 0.0};
+    const Packet2d mask0 = ploadu<Packet2d>(arr1);
+    const Packet2d mask2 = ploadu<Packet2d>(arr2);
+    pstoret<double, Packet2d, ResAlignment>(&res.x(),
+                                            pxor(mask0, qe.template packet<traits<Derived>::Alignment, Packet2d>(0)));
+    pstoret<double, Packet2d, ResAlignment>(&res.z(),
+                                            pxor(mask2, qe.template packet<traits<Derived>::Alignment, Packet2d>(2)));
+    return res;
+  }
+};
+
+#endif  // end EIGEN_VECTORIZE_SSE_OR_EIGEN_ARCH_ARM64
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_GEOMETRY_SIMD_H
diff --git a/inst/include/Eigen/src/Geometry/arch/Geometry_SSE.h b/inst/include/Eigen/src/Geometry/arch/Geometry_SSE.h
deleted file mode 100644
index 3d8284f2..00000000
--- a/inst/include/Eigen/src/Geometry/arch/Geometry_SSE.h
+++ /dev/null
@@ -1,115 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009 Rohit Garg <rpg.314@gmail.com>
-// Copyright (C) 2009-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_GEOMETRY_SSE_H
-#define EIGEN_GEOMETRY_SSE_H
-
-namespace Eigen { 
-
-namespace internal {
-
-template<class Derived, class OtherDerived>
-struct quat_product<Architecture::SSE, Derived, OtherDerived, float, Aligned>
-{
-  static inline Quaternion<float> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
-  {
-    const __m128 mask = _mm_castsi128_ps(_mm_setr_epi32(0,0,0,0x80000000));
-    Quaternion<float> res;
-    __m128 a = _a.coeffs().template packet<Aligned>(0);
-    __m128 b = _b.coeffs().template packet<Aligned>(0);
-    __m128 flip1 = _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),
-                                         vec4f_swizzle1(b,2,0,1,2)),mask);
-    __m128 flip2 = _mm_xor_ps(_mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),
-                                         vec4f_swizzle1(b,0,1,2,1)),mask);
-    pstore(&res.x(),
-              _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)),
-                                    _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0),
-                                               vec4f_swizzle1(b,1,2,0,0))),
-                         _mm_add_ps(flip1,flip2)));
-    return res;
-  }
-};
-
-template<typename VectorLhs,typename VectorRhs>
-struct cross3_impl<Architecture::SSE,VectorLhs,VectorRhs,float,true>
-{
-  static inline typename plain_matrix_type<VectorLhs>::type
-  run(const VectorLhs& lhs, const VectorRhs& rhs)
-  {
-    __m128 a = lhs.template packet<VectorLhs::Flags&AlignedBit ? Aligned : Unaligned>(0);
-    __m128 b = rhs.template packet<VectorRhs::Flags&AlignedBit ? Aligned : Unaligned>(0);
-    __m128 mul1=_mm_mul_ps(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3));
-    __m128 mul2=_mm_mul_ps(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3));
-    typename plain_matrix_type<VectorLhs>::type res;
-    pstore(&res.x(),_mm_sub_ps(mul1,mul2));
-    return res;
-  }
-};
-
-
-
-
-template<class Derived, class OtherDerived>
-struct quat_product<Architecture::SSE, Derived, OtherDerived, double, Aligned>
-{
-  static inline Quaternion<double> run(const QuaternionBase<Derived>& _a, const QuaternionBase<OtherDerived>& _b)
-  {
-  const Packet2d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
-
-  Quaternion<double> res;
-
-  const double* a = _a.coeffs().data();
-  Packet2d b_xy = _b.coeffs().template packet<Aligned>(0);
-  Packet2d b_zw = _b.coeffs().template packet<Aligned>(2);
-  Packet2d a_xx = pset1<Packet2d>(a[0]);
-  Packet2d a_yy = pset1<Packet2d>(a[1]);
-  Packet2d a_zz = pset1<Packet2d>(a[2]);
-  Packet2d a_ww = pset1<Packet2d>(a[3]);
-
-  // two temporaries:
-  Packet2d t1, t2;
-
-  /*
-   * t1 = ww*xy + yy*zw
-   * t2 = zz*xy - xx*zw
-   * res.xy = t1 +/- swap(t2)
-   */
-  t1 = padd(pmul(a_ww, b_xy), pmul(a_yy, b_zw));
-  t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw));
-#ifdef EIGEN_VECTORIZE_SSE3
-  EIGEN_UNUSED_VARIABLE(mask)
-  pstore(&res.x(), _mm_addsub_pd(t1, preverse(t2)));
-#else
-  pstore(&res.x(), padd(t1, pxor(mask,preverse(t2))));
-#endif
-  
-  /*
-   * t1 = ww*zw - yy*xy
-   * t2 = zz*zw + xx*xy
-   * res.zw = t1 -/+ swap(t2) = swap( swap(t1) +/- t2)
-   */
-  t1 = psub(pmul(a_ww, b_zw), pmul(a_yy, b_xy));
-  t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy));
-#ifdef EIGEN_VECTORIZE_SSE3
-  EIGEN_UNUSED_VARIABLE(mask)
-  pstore(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2)));
-#else
-  pstore(&res.z(), psub(t1, pxor(mask,preverse(t2))));
-#endif
-
-  return res;
-}
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_GEOMETRY_SSE_H
diff --git a/inst/include/Eigen/src/Householder/BlockHouseholder.h b/inst/include/Eigen/src/Householder/BlockHouseholder.h
index 60dbea5f..8b923049 100644
--- a/inst/include/Eigen/src/Householder/BlockHouseholder.h
+++ b/inst/include/Eigen/src/Householder/BlockHouseholder.h
@@ -13,56 +13,103 @@
 
 // This file contains some helper function to deal with block householder reflectors
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 /** \internal */
-template<typename TriangularFactorType,typename VectorsType,typename CoeffsType>
-void make_block_householder_triangular_factor(TriangularFactorType& triFactor, const VectorsType& vectors, const CoeffsType& hCoeffs)
-{
-  typedef typename TriangularFactorType::Index Index;
-  typedef typename VectorsType::Scalar Scalar;
+// template<typename TriangularFactorType,typename VectorsType,typename CoeffsType>
+// void make_block_householder_triangular_factor(TriangularFactorType& triFactor, const VectorsType& vectors, const
+// CoeffsType& hCoeffs)
+// {
+//   typedef typename VectorsType::Scalar Scalar;
+//   const Index nbVecs = vectors.cols();
+//   eigen_assert(triFactor.rows() == nbVecs && triFactor.cols() == nbVecs && vectors.rows()>=nbVecs);
+//
+//   for(Index i = 0; i < nbVecs; i++)
+//   {
+//     Index rs = vectors.rows() - i;
+//     // Warning, note that hCoeffs may alias with vectors.
+//     // It is then necessary to copy it before modifying vectors(i,i).
+//     typename CoeffsType::Scalar h = hCoeffs(i);
+//     // This hack permits to pass through nested Block<> and Transpose<> expressions.
+//     Scalar *Vii_ptr = const_cast<Scalar*>(vectors.data() + vectors.outerStride()*i + vectors.innerStride()*i);
+//     Scalar Vii = *Vii_ptr;
+//     *Vii_ptr = Scalar(1);
+//     triFactor.col(i).head(i).noalias() = -h * vectors.block(i, 0, rs, i).adjoint()
+//                                        * vectors.col(i).tail(rs);
+//     *Vii_ptr = Vii;
+//     // FIXME add .noalias() once the triangular product can work inplace
+//     triFactor.col(i).head(i) = triFactor.block(0,0,i,i).template triangularView<Upper>()
+//                              * triFactor.col(i).head(i);
+//     triFactor(i,i) = hCoeffs(i);
+//   }
+// }
+
+/** \internal */
+// This variant avoid modifications in vectors
+template <typename TriangularFactorType, typename VectorsType, typename CoeffsType>
+void make_block_householder_triangular_factor(TriangularFactorType& triFactor, const VectorsType& vectors,
+                                              const CoeffsType& hCoeffs) {
   const Index nbVecs = vectors.cols();
-  eigen_assert(triFactor.rows() == nbVecs && triFactor.cols() == nbVecs && vectors.rows()>=nbVecs);
-
-  for(Index i = 0; i < nbVecs; i++)
-  {
-    Index rs = vectors.rows() - i;
-    Scalar Vii = vectors(i,i);
-    vectors.const_cast_derived().coeffRef(i,i) = Scalar(1);
-    triFactor.col(i).head(i).noalias() = -hCoeffs(i) * vectors.block(i, 0, rs, i).adjoint()
-                                       * vectors.col(i).tail(rs);
-    vectors.const_cast_derived().coeffRef(i, i) = Vii;
-    // FIXME add .noalias() once the triangular product can work inplace
-    triFactor.col(i).head(i) = triFactor.block(0,0,i,i).template triangularView<Upper>()
-                             * triFactor.col(i).head(i);
-    triFactor(i,i) = hCoeffs(i);
+  eigen_assert(triFactor.rows() == nbVecs && triFactor.cols() == nbVecs && vectors.rows() >= nbVecs);
+
+  for (Index i = nbVecs - 1; i >= 0; --i) {
+    Index rs = vectors.rows() - i - 1;
+    Index rt = nbVecs - i - 1;
+
+    if (rt > 0) {
+      triFactor.row(i).tail(rt).noalias() = -hCoeffs(i) * vectors.col(i).tail(rs).adjoint() *
+                                            vectors.bottomRightCorner(rs, rt).template triangularView<UnitLower>();
+
+      // FIXME use the following line with .noalias() once the triangular product can work inplace
+      // triFactor.row(i).tail(rt) = triFactor.row(i).tail(rt) * triFactor.bottomRightCorner(rt,rt).template
+      // triangularView<Upper>();
+      for (Index j = nbVecs - 1; j > i; --j) {
+        typename TriangularFactorType::Scalar z = triFactor(i, j);
+        triFactor(i, j) = z * triFactor(j, j);
+        if (nbVecs - j - 1 > 0) triFactor.row(i).tail(nbVecs - j - 1) += z * triFactor.row(j).tail(nbVecs - j - 1);
+      }
+    }
+    triFactor(i, i) = hCoeffs(i);
   }
 }
 
-/** \internal */
-template<typename MatrixType,typename VectorsType,typename CoeffsType>
-void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vectors, const CoeffsType& hCoeffs)
-{
-  typedef typename MatrixType::Index Index;
-  enum { TFactorSize = MatrixType::ColsAtCompileTime };
+/** \internal
+ * if forward then perform   mat = H0 * H1 * H2 * mat
+ * otherwise perform         mat = H2 * H1 * H0 * mat
+ */
+template <typename MatrixType, typename VectorsType, typename CoeffsType>
+void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vectors, const CoeffsType& hCoeffs,
+                                         bool forward) {
+  enum { TFactorSize = VectorsType::ColsAtCompileTime };
   Index nbVecs = vectors.cols();
-  Matrix<typename MatrixType::Scalar, TFactorSize, TFactorSize, ColMajor> T(nbVecs,nbVecs);
-  make_block_householder_triangular_factor(T, vectors, hCoeffs);
+  Matrix<typename MatrixType::Scalar, TFactorSize, TFactorSize, RowMajor> T(nbVecs, nbVecs);
 
-  const TriangularView<const VectorsType, UnitLower>& V(vectors);
+  if (forward)
+    make_block_householder_triangular_factor(T, vectors, hCoeffs);
+  else
+    make_block_householder_triangular_factor(T, vectors, hCoeffs.conjugate());
+  const TriangularView<const VectorsType, UnitLower> V(vectors);
 
   // A -= V T V^* A
-  Matrix<typename MatrixType::Scalar,VectorsType::ColsAtCompileTime,MatrixType::ColsAtCompileTime,0,
-         VectorsType::MaxColsAtCompileTime,MatrixType::MaxColsAtCompileTime> tmp = V.adjoint() * mat;
+  Matrix<typename MatrixType::Scalar, VectorsType::ColsAtCompileTime, MatrixType::ColsAtCompileTime,
+         (VectorsType::MaxColsAtCompileTime == 1 && MatrixType::MaxColsAtCompileTime != 1) ? RowMajor : ColMajor,
+         VectorsType::MaxColsAtCompileTime, MatrixType::MaxColsAtCompileTime>
+      tmp = V.adjoint() * mat;
   // FIXME add .noalias() once the triangular product can work inplace
-  tmp = T.template triangularView<Upper>().adjoint() * tmp;
+  if (forward)
+    tmp = T.template triangularView<Upper>() * tmp;
+  else
+    tmp = T.template triangularView<Upper>().adjoint() * tmp;
   mat.noalias() -= V * tmp;
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_BLOCK_HOUSEHOLDER_H
+#endif  // EIGEN_BLOCK_HOUSEHOLDER_H
diff --git a/inst/include/Eigen/src/Householder/Householder.h b/inst/include/Eigen/src/Householder/Householder.h
index 32112af9..e5d2d4fa 100644
--- a/inst/include/Eigen/src/Householder/Householder.h
+++ b/inst/include/Eigen/src/Householder/Householder.h
@@ -11,117 +11,105 @@
 #ifndef EIGEN_HOUSEHOLDER_H
 #define EIGEN_HOUSEHOLDER_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
-template<int n> struct decrement_size
-{
-  enum {
-    ret = n==Dynamic ? n : n-1
-  };
+template <int n>
+struct decrement_size {
+  enum { ret = n == Dynamic ? n : n - 1 };
 };
-}
+}  // namespace internal
 
 /** Computes the elementary reflector H such that:
-  * \f$ H *this = [ beta 0 ... 0]^T \f$
-  * where the transformation H is:
-  * \f$ H = I - tau v v^*\f$
-  * and the vector v is:
-  * \f$ v^T = [1 essential^T] \f$
-  *
-  * The essential part of the vector \c v is stored in *this.
-  * 
-  * On output:
-  * \param tau the scaling factor of the Householder transformation
-  * \param beta the result of H * \c *this
-  *
-  * \sa MatrixBase::makeHouseholder(), MatrixBase::applyHouseholderOnTheLeft(),
-  *     MatrixBase::applyHouseholderOnTheRight()
-  */
-template<typename Derived>
-void MatrixBase<Derived>::makeHouseholderInPlace(Scalar& tau, RealScalar& beta)
-{
-  VectorBlock<Derived, internal::decrement_size<Base::SizeAtCompileTime>::ret> essentialPart(derived(), 1, size()-1);
+ * \f$ H *this = [ beta 0 ... 0]^T \f$
+ * where the transformation H is:
+ * \f$ H = I - tau v v^*\f$
+ * and the vector v is:
+ * \f$ v^T = [1 essential^T] \f$
+ *
+ * The essential part of the vector \c v is stored in *this.
+ *
+ * On output:
+ * \param tau the scaling factor of the Householder transformation
+ * \param beta the result of H * \c *this
+ *
+ * \sa MatrixBase::makeHouseholder(), MatrixBase::applyHouseholderOnTheLeft(),
+ *     MatrixBase::applyHouseholderOnTheRight()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC void MatrixBase<Derived>::makeHouseholderInPlace(Scalar& tau, RealScalar& beta) {
+  VectorBlock<Derived, internal::decrement_size<Base::SizeAtCompileTime>::ret> essentialPart(derived(), 1, size() - 1);
   makeHouseholder(essentialPart, tau, beta);
 }
 
 /** Computes the elementary reflector H such that:
-  * \f$ H *this = [ beta 0 ... 0]^T \f$
-  * where the transformation H is:
-  * \f$ H = I - tau v v^*\f$
-  * and the vector v is:
-  * \f$ v^T = [1 essential^T] \f$
-  *
-  * On output:
-  * \param essential the essential part of the vector \c v
-  * \param tau the scaling factor of the Householder transformation
-  * \param beta the result of H * \c *this
-  *
-  * \sa MatrixBase::makeHouseholderInPlace(), MatrixBase::applyHouseholderOnTheLeft(),
-  *     MatrixBase::applyHouseholderOnTheRight()
-  */
-template<typename Derived>
-template<typename EssentialPart>
-void MatrixBase<Derived>::makeHouseholder(
-  EssentialPart& essential,
-  Scalar& tau,
-  RealScalar& beta) const
-{
-  using std::sqrt;
+ * \f$ H *this = [ beta 0 ... 0]^T \f$
+ * where the transformation H is:
+ * \f$ H = I - tau v v^*\f$
+ * and the vector v is:
+ * \f$ v^T = [1 essential^T] \f$
+ *
+ * On output:
+ * \param essential the essential part of the vector \c v
+ * \param tau the scaling factor of the Householder transformation
+ * \param beta the result of H * \c *this
+ *
+ * \sa MatrixBase::makeHouseholderInPlace(), MatrixBase::applyHouseholderOnTheLeft(),
+ *     MatrixBase::applyHouseholderOnTheRight()
+ */
+template <typename Derived>
+template <typename EssentialPart>
+EIGEN_DEVICE_FUNC void MatrixBase<Derived>::makeHouseholder(EssentialPart& essential, Scalar& tau,
+                                                            RealScalar& beta) const {
   using numext::conj;
-  
+
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(EssentialPart)
-  VectorBlock<const Derived, EssentialPart::SizeAtCompileTime> tail(derived(), 1, size()-1);
-  
-  RealScalar tailSqNorm = size()==1 ? RealScalar(0) : tail.squaredNorm();
+  VectorBlock<const Derived, EssentialPart::SizeAtCompileTime> tail(derived(), 1, size() - 1);
+
+  RealScalar tailSqNorm = size() == 1 ? RealScalar(0) : tail.squaredNorm();
   Scalar c0 = coeff(0);
+  const RealScalar tol = (std::numeric_limits<RealScalar>::min)();
 
-  if(tailSqNorm == RealScalar(0) && numext::imag(c0)==RealScalar(0))
-  {
+  if (tailSqNorm <= tol && numext::abs2(numext::imag(c0)) <= tol) {
     tau = RealScalar(0);
     beta = numext::real(c0);
     essential.setZero();
-  }
-  else
-  {
-    beta = sqrt(numext::abs2(c0) + tailSqNorm);
-    if (numext::real(c0)>=RealScalar(0))
-      beta = -beta;
+  } else {
+    beta = numext::sqrt(numext::abs2(c0) + tailSqNorm);
+    if (numext::real(c0) >= RealScalar(0)) beta = -beta;
     essential = tail / (c0 - beta);
     tau = conj((beta - c0) / beta);
   }
 }
 
 /** Apply the elementary reflector H given by
-  * \f$ H = I - tau v v^*\f$
-  * with
-  * \f$ v^T = [1 essential^T] \f$
-  * from the left to a vector or matrix.
-  *
-  * On input:
-  * \param essential the essential part of the vector \c v
-  * \param tau the scaling factor of the Householder transformation
-  * \param workspace a pointer to working space with at least
-  *                  this->cols() * essential.size() entries
-  *
-  * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), 
-  *     MatrixBase::applyHouseholderOnTheRight()
-  */
-template<typename Derived>
-template<typename EssentialPart>
-void MatrixBase<Derived>::applyHouseholderOnTheLeft(
-  const EssentialPart& essential,
-  const Scalar& tau,
-  Scalar* workspace)
-{
-  if(rows() == 1)
-  {
-    *this *= Scalar(1)-tau;
-  }
-  else
-  {
-    Map<typename internal::plain_row_type<PlainObject>::type> tmp(workspace,cols());
-    Block<Derived, EssentialPart::SizeAtCompileTime, Derived::ColsAtCompileTime> bottom(derived(), 1, 0, rows()-1, cols());
+ * \f$ H = I - tau v v^*\f$
+ * with
+ * \f$ v^T = [1 essential^T] \f$
+ * from the left to a vector or matrix.
+ *
+ * On input:
+ * \param essential the essential part of the vector \c v
+ * \param tau the scaling factor of the Householder transformation
+ * \param workspace a pointer to working space with at least
+ *                  this->cols() entries
+ *
+ * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(),
+ *     MatrixBase::applyHouseholderOnTheRight()
+ */
+template <typename Derived>
+template <typename EssentialPart>
+EIGEN_DEVICE_FUNC void MatrixBase<Derived>::applyHouseholderOnTheLeft(const EssentialPart& essential, const Scalar& tau,
+                                                                      Scalar* workspace) {
+  if (rows() == 1) {
+    *this *= Scalar(1) - tau;
+  } else if (!numext::is_exactly_zero(tau)) {
+    Map<typename internal::plain_row_type<PlainObject>::type> tmp(workspace, cols());
+    Block<Derived, EssentialPart::SizeAtCompileTime, Derived::ColsAtCompileTime> bottom(derived(), 1, 0, rows() - 1,
+                                                                                        cols());
     tmp.noalias() = essential.adjoint() * bottom;
     tmp += this->row(0);
     this->row(0) -= tau * tmp;
@@ -130,42 +118,37 @@ void MatrixBase<Derived>::applyHouseholderOnTheLeft(
 }
 
 /** Apply the elementary reflector H given by
-  * \f$ H = I - tau v v^*\f$
-  * with
-  * \f$ v^T = [1 essential^T] \f$
-  * from the right to a vector or matrix.
-  *
-  * On input:
-  * \param essential the essential part of the vector \c v
-  * \param tau the scaling factor of the Householder transformation
-  * \param workspace a pointer to working space with at least
-  *                  this->cols() * essential.size() entries
-  *
-  * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(), 
-  *     MatrixBase::applyHouseholderOnTheLeft()
-  */
-template<typename Derived>
-template<typename EssentialPart>
-void MatrixBase<Derived>::applyHouseholderOnTheRight(
-  const EssentialPart& essential,
-  const Scalar& tau,
-  Scalar* workspace)
-{
-  if(cols() == 1)
-  {
-    *this *= Scalar(1)-tau;
-  }
-  else
-  {
-    Map<typename internal::plain_col_type<PlainObject>::type> tmp(workspace,rows());
-    Block<Derived, Derived::RowsAtCompileTime, EssentialPart::SizeAtCompileTime> right(derived(), 0, 1, rows(), cols()-1);
-    tmp.noalias() = right * essential.conjugate();
+ * \f$ H = I - tau v v^*\f$
+ * with
+ * \f$ v^T = [1 essential^T] \f$
+ * from the right to a vector or matrix.
+ *
+ * On input:
+ * \param essential the essential part of the vector \c v
+ * \param tau the scaling factor of the Householder transformation
+ * \param workspace a pointer to working space with at least
+ *                  this->rows() entries
+ *
+ * \sa MatrixBase::makeHouseholder(), MatrixBase::makeHouseholderInPlace(),
+ *     MatrixBase::applyHouseholderOnTheLeft()
+ */
+template <typename Derived>
+template <typename EssentialPart>
+EIGEN_DEVICE_FUNC void MatrixBase<Derived>::applyHouseholderOnTheRight(const EssentialPart& essential,
+                                                                       const Scalar& tau, Scalar* workspace) {
+  if (cols() == 1) {
+    *this *= Scalar(1) - tau;
+  } else if (!numext::is_exactly_zero(tau)) {
+    Map<typename internal::plain_col_type<PlainObject>::type> tmp(workspace, rows());
+    Block<Derived, Derived::RowsAtCompileTime, EssentialPart::SizeAtCompileTime> right(derived(), 0, 1, rows(),
+                                                                                       cols() - 1);
+    tmp.noalias() = right * essential;
     tmp += this->col(0);
     this->col(0) -= tau * tmp;
-    right.noalias() -= tau * tmp * essential.transpose();
+    right.noalias() -= tau * tmp * essential.adjoint();
   }
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_HOUSEHOLDER_H
+#endif  // EIGEN_HOUSEHOLDER_H
diff --git a/inst/include/Eigen/src/Householder/HouseholderSequence.h b/inst/include/Eigen/src/Householder/HouseholderSequence.h
index d800ca1f..d49c9615 100644
--- a/inst/include/Eigen/src/Householder/HouseholderSequence.h
+++ b/inst/include/Eigen/src/Householder/HouseholderSequence.h
@@ -11,431 +11,494 @@
 #ifndef EIGEN_HOUSEHOLDER_SEQUENCE_H
 #define EIGEN_HOUSEHOLDER_SEQUENCE_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \ingroup Householder_Module
-  * \householder_module
-  * \class HouseholderSequence
-  * \brief Sequence of Householder reflections acting on subspaces with decreasing size
-  * \tparam VectorsType type of matrix containing the Householder vectors
-  * \tparam CoeffsType  type of vector containing the Householder coefficients
-  * \tparam Side        either OnTheLeft (the default) or OnTheRight
-  *
-  * This class represents a product sequence of Householder reflections where the first Householder reflection
-  * acts on the whole space, the second Householder reflection leaves the one-dimensional subspace spanned by
-  * the first unit vector invariant, the third Householder reflection leaves the two-dimensional subspace
-  * spanned by the first two unit vectors invariant, and so on up to the last reflection which leaves all but
-  * one dimensions invariant and acts only on the last dimension. Such sequences of Householder reflections
-  * are used in several algorithms to zero out certain parts of a matrix. Indeed, the methods
-  * HessenbergDecomposition::matrixQ(), Tridiagonalization::matrixQ(), HouseholderQR::householderQ(),
-  * and ColPivHouseholderQR::householderQ() all return a %HouseholderSequence.
-  *
-  * More precisely, the class %HouseholderSequence represents an \f$ n \times n \f$ matrix \f$ H \f$ of the
-  * form \f$ H = \prod_{i=0}^{n-1} H_i \f$ where the i-th Householder reflection is \f$ H_i = I - h_i v_i
-  * v_i^* \f$. The i-th Householder coefficient \f$ h_i \f$ is a scalar and the i-th Householder vector \f$
-  * v_i \f$ is a vector of the form
-  * \f[ 
-  * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. 
-  * \f]
-  * The last \f$ n-i \f$ entries of \f$ v_i \f$ are called the essential part of the Householder vector.
-  *
-  * Typical usages are listed below, where H is a HouseholderSequence:
-  * \code
-  * A.applyOnTheRight(H);             // A = A * H
-  * A.applyOnTheLeft(H);              // A = H * A
-  * A.applyOnTheRight(H.adjoint());   // A = A * H^*
-  * A.applyOnTheLeft(H.adjoint());    // A = H^* * A
-  * MatrixXd Q = H;                   // conversion to a dense matrix
-  * \endcode
-  * In addition to the adjoint, you can also apply the inverse (=adjoint), the transpose, and the conjugate operators.
-  *
-  * See the documentation for HouseholderSequence(const VectorsType&, const CoeffsType&) for an example.
-  *
-  * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
-  */
+ * \householder_module
+ * \class HouseholderSequence
+ * \brief Sequence of Householder reflections acting on subspaces with decreasing size
+ * \tparam VectorsType type of matrix containing the Householder vectors
+ * \tparam CoeffsType  type of vector containing the Householder coefficients
+ * \tparam Side        either OnTheLeft (the default) or OnTheRight
+ *
+ * This class represents a product sequence of Householder reflections where the first Householder reflection
+ * acts on the whole space, the second Householder reflection leaves the one-dimensional subspace spanned by
+ * the first unit vector invariant, the third Householder reflection leaves the two-dimensional subspace
+ * spanned by the first two unit vectors invariant, and so on up to the last reflection which leaves all but
+ * one dimensions invariant and acts only on the last dimension. Such sequences of Householder reflections
+ * are used in several algorithms to zero out certain parts of a matrix. Indeed, the methods
+ * HessenbergDecomposition::matrixQ(), Tridiagonalization::matrixQ(), HouseholderQR::householderQ(),
+ * and ColPivHouseholderQR::householderQ() all return a %HouseholderSequence.
+ *
+ * More precisely, the class %HouseholderSequence represents an \f$ n \times n \f$ matrix \f$ H \f$ of the
+ * form \f$ H = \prod_{i=0}^{n-1} H_i \f$ where the i-th Householder reflection is \f$ H_i = I - h_i v_i
+ * v_i^* \f$. The i-th Householder coefficient \f$ h_i \f$ is a scalar and the i-th Householder vector \f$
+ * v_i \f$ is a vector of the form
+ * \f[
+ * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ].
+ * \f]
+ * The last \f$ n-i \f$ entries of \f$ v_i \f$ are called the essential part of the Householder vector.
+ *
+ * Typical usages are listed below, where H is a HouseholderSequence:
+ * \code
+ * A.applyOnTheRight(H);             // A = A * H
+ * A.applyOnTheLeft(H);              // A = H * A
+ * A.applyOnTheRight(H.adjoint());   // A = A * H^*
+ * A.applyOnTheLeft(H.adjoint());    // A = H^* * A
+ * MatrixXd Q = H;                   // conversion to a dense matrix
+ * \endcode
+ * In addition to the adjoint, you can also apply the inverse (=adjoint), the transpose, and the conjugate operators.
+ *
+ * See the documentation for HouseholderSequence(const VectorsType&, const CoeffsType&) for an example.
+ *
+ * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
+ */
 
 namespace internal {
 
-template<typename VectorsType, typename CoeffsType, int Side>
-struct traits<HouseholderSequence<VectorsType,CoeffsType,Side> >
-{
+template <typename VectorsType, typename CoeffsType, int Side>
+struct traits<HouseholderSequence<VectorsType, CoeffsType, Side> > {
   typedef typename VectorsType::Scalar Scalar;
-  typedef typename VectorsType::Index Index;
+  typedef typename VectorsType::StorageIndex StorageIndex;
   typedef typename VectorsType::StorageKind StorageKind;
   enum {
-    RowsAtCompileTime = Side==OnTheLeft ? traits<VectorsType>::RowsAtCompileTime
-                                        : traits<VectorsType>::ColsAtCompileTime,
+    RowsAtCompileTime =
+        Side == OnTheLeft ? traits<VectorsType>::RowsAtCompileTime : traits<VectorsType>::ColsAtCompileTime,
     ColsAtCompileTime = RowsAtCompileTime,
-    MaxRowsAtCompileTime = Side==OnTheLeft ? traits<VectorsType>::MaxRowsAtCompileTime
-                                           : traits<VectorsType>::MaxColsAtCompileTime,
+    MaxRowsAtCompileTime =
+        Side == OnTheLeft ? traits<VectorsType>::MaxRowsAtCompileTime : traits<VectorsType>::MaxColsAtCompileTime,
     MaxColsAtCompileTime = MaxRowsAtCompileTime,
     Flags = 0
   };
 };
 
-template<typename VectorsType, typename CoeffsType, int Side>
-struct hseq_side_dependent_impl
-{
+struct HouseholderSequenceShape {};
+
+template <typename VectorsType, typename CoeffsType, int Side>
+struct evaluator_traits<HouseholderSequence<VectorsType, CoeffsType, Side> >
+    : public evaluator_traits_base<HouseholderSequence<VectorsType, CoeffsType, Side> > {
+  typedef HouseholderSequenceShape Shape;
+};
+
+template <typename VectorsType, typename CoeffsType, int Side>
+struct hseq_side_dependent_impl {
   typedef Block<const VectorsType, Dynamic, 1> EssentialVectorType;
   typedef HouseholderSequence<VectorsType, CoeffsType, OnTheLeft> HouseholderSequenceType;
-  typedef typename VectorsType::Index Index;
-  static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
-  {
-    Index start = k+1+h.m_shift;
-    return Block<const VectorsType,Dynamic,1>(h.m_vectors, start, k, h.rows()-start, 1);
+  static EIGEN_DEVICE_FUNC inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) {
+    Index start = k + 1 + h.m_shift;
+    return Block<const VectorsType, Dynamic, 1>(h.m_vectors, start, k, h.rows() - start, 1);
   }
 };
 
-template<typename VectorsType, typename CoeffsType>
-struct hseq_side_dependent_impl<VectorsType, CoeffsType, OnTheRight>
-{
+template <typename VectorsType, typename CoeffsType>
+struct hseq_side_dependent_impl<VectorsType, CoeffsType, OnTheRight> {
   typedef Transpose<Block<const VectorsType, 1, Dynamic> > EssentialVectorType;
   typedef HouseholderSequence<VectorsType, CoeffsType, OnTheRight> HouseholderSequenceType;
-  typedef typename VectorsType::Index Index;
-  static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k)
-  {
-    Index start = k+1+h.m_shift;
-    return Block<const VectorsType,1,Dynamic>(h.m_vectors, k, start, 1, h.rows()-start).transpose();
+  static inline const EssentialVectorType essentialVector(const HouseholderSequenceType& h, Index k) {
+    Index start = k + 1 + h.m_shift;
+    return Block<const VectorsType, 1, Dynamic>(h.m_vectors, k, start, 1, h.rows() - start).transpose();
   }
 };
 
-template<typename OtherScalarType, typename MatrixType> struct matrix_type_times_scalar_type
-{
-  typedef typename scalar_product_traits<OtherScalarType, typename MatrixType::Scalar>::ReturnType
-    ResultScalar;
-  typedef Matrix<ResultScalar, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime,
-                 0, MatrixType::MaxRowsAtCompileTime, MatrixType::MaxColsAtCompileTime> Type;
+template <typename OtherScalarType, typename MatrixType>
+struct matrix_type_times_scalar_type {
+  typedef typename ScalarBinaryOpTraits<OtherScalarType, typename MatrixType::Scalar>::ReturnType ResultScalar;
+  typedef Matrix<ResultScalar, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime, 0,
+                 MatrixType::MaxRowsAtCompileTime, MatrixType::MaxColsAtCompileTime>
+      Type;
 };
 
-} // end namespace internal
-
-template<typename VectorsType, typename CoeffsType, int Side> class HouseholderSequence
-  : public EigenBase<HouseholderSequence<VectorsType,CoeffsType,Side> >
-{
-    typedef typename internal::hseq_side_dependent_impl<VectorsType,CoeffsType,Side>::EssentialVectorType EssentialVectorType;
-  
-  public:
-    enum {
-      RowsAtCompileTime = internal::traits<HouseholderSequence>::RowsAtCompileTime,
-      ColsAtCompileTime = internal::traits<HouseholderSequence>::ColsAtCompileTime,
-      MaxRowsAtCompileTime = internal::traits<HouseholderSequence>::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = internal::traits<HouseholderSequence>::MaxColsAtCompileTime
-    };
-    typedef typename internal::traits<HouseholderSequence>::Scalar Scalar;
-    typedef typename VectorsType::Index Index;
-
-    typedef HouseholderSequence<
-      typename internal::conditional<NumTraits<Scalar>::IsComplex,
-        typename internal::remove_all<typename VectorsType::ConjugateReturnType>::type,
-        VectorsType>::type,
-      typename internal::conditional<NumTraits<Scalar>::IsComplex,
-        typename internal::remove_all<typename CoeffsType::ConjugateReturnType>::type,
-        CoeffsType>::type,
-      Side
-    > ConjugateReturnType;
-
-    /** \brief Constructor.
-      * \param[in]  v      %Matrix containing the essential parts of the Householder vectors
-      * \param[in]  h      Vector containing the Householder coefficients
-      *
-      * Constructs the Householder sequence with coefficients given by \p h and vectors given by \p v. The
-      * i-th Householder coefficient \f$ h_i \f$ is given by \p h(i) and the essential part of the i-th
-      * Householder vector \f$ v_i \f$ is given by \p v(k,i) with \p k > \p i (the subdiagonal part of the
-      * i-th column). If \p v has fewer columns than rows, then the Householder sequence contains as many
-      * Householder reflections as there are columns.
-      *
-      * \note The %HouseholderSequence object stores \p v and \p h by reference.
-      *
-      * Example: \include HouseholderSequence_HouseholderSequence.cpp
-      * Output: \verbinclude HouseholderSequence_HouseholderSequence.out
-      *
-      * \sa setLength(), setShift()
-      */
-    HouseholderSequence(const VectorsType& v, const CoeffsType& h)
-      : m_vectors(v), m_coeffs(h), m_trans(false), m_length(v.diagonalSize()),
-        m_shift(0)
-    {
-    }
+}  // end namespace internal
+
+template <typename VectorsType, typename CoeffsType, int Side>
+class HouseholderSequence : public EigenBase<HouseholderSequence<VectorsType, CoeffsType, Side> > {
+  typedef typename internal::hseq_side_dependent_impl<VectorsType, CoeffsType, Side>::EssentialVectorType
+      EssentialVectorType;
 
-    /** \brief Copy constructor. */
-    HouseholderSequence(const HouseholderSequence& other)
+ public:
+  enum {
+    RowsAtCompileTime = internal::traits<HouseholderSequence>::RowsAtCompileTime,
+    ColsAtCompileTime = internal::traits<HouseholderSequence>::ColsAtCompileTime,
+    MaxRowsAtCompileTime = internal::traits<HouseholderSequence>::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = internal::traits<HouseholderSequence>::MaxColsAtCompileTime
+  };
+  typedef typename internal::traits<HouseholderSequence>::Scalar Scalar;
+
+  typedef HouseholderSequence<
+      std::conditional_t<NumTraits<Scalar>::IsComplex,
+                         internal::remove_all_t<typename VectorsType::ConjugateReturnType>, VectorsType>,
+      std::conditional_t<NumTraits<Scalar>::IsComplex, internal::remove_all_t<typename CoeffsType::ConjugateReturnType>,
+                         CoeffsType>,
+      Side>
+      ConjugateReturnType;
+
+  typedef HouseholderSequence<
+      VectorsType,
+      std::conditional_t<NumTraits<Scalar>::IsComplex, internal::remove_all_t<typename CoeffsType::ConjugateReturnType>,
+                         CoeffsType>,
+      Side>
+      AdjointReturnType;
+
+  typedef HouseholderSequence<
+      std::conditional_t<NumTraits<Scalar>::IsComplex,
+                         internal::remove_all_t<typename VectorsType::ConjugateReturnType>, VectorsType>,
+      CoeffsType, Side>
+      TransposeReturnType;
+
+  typedef HouseholderSequence<std::add_const_t<VectorsType>, std::add_const_t<CoeffsType>, Side>
+      ConstHouseholderSequence;
+
+  /** \brief Constructor.
+   * \param[in]  v      %Matrix containing the essential parts of the Householder vectors
+   * \param[in]  h      Vector containing the Householder coefficients
+   *
+   * Constructs the Householder sequence with coefficients given by \p h and vectors given by \p v. The
+   * i-th Householder coefficient \f$ h_i \f$ is given by \p h(i) and the essential part of the i-th
+   * Householder vector \f$ v_i \f$ is given by \p v(k,i) with \p k > \p i (the subdiagonal part of the
+   * i-th column). If \p v has fewer columns than rows, then the Householder sequence contains as many
+   * Householder reflections as there are columns.
+   *
+   * \note The %HouseholderSequence object stores \p v and \p h by reference.
+   *
+   * Example: \include HouseholderSequence_HouseholderSequence.cpp
+   * Output: \verbinclude HouseholderSequence_HouseholderSequence.out
+   *
+   * \sa setLength(), setShift()
+   */
+  EIGEN_DEVICE_FUNC HouseholderSequence(const VectorsType& v, const CoeffsType& h)
+      : m_vectors(v), m_coeffs(h), m_reverse(false), m_length(v.diagonalSize()), m_shift(0) {}
+
+  /** \brief Copy constructor. */
+  EIGEN_DEVICE_FUNC HouseholderSequence(const HouseholderSequence& other)
       : m_vectors(other.m_vectors),
         m_coeffs(other.m_coeffs),
-        m_trans(other.m_trans),
+        m_reverse(other.m_reverse),
         m_length(other.m_length),
-        m_shift(other.m_shift)
-    {
-    }
+        m_shift(other.m_shift) {}
+
+  /** \brief Number of rows of transformation viewed as a matrix.
+   * \returns Number of rows
+   * \details This equals the dimension of the space that the transformation acts on.
+   */
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept {
+    return Side == OnTheLeft ? m_vectors.rows() : m_vectors.cols();
+  }
 
-    /** \brief Number of rows of transformation viewed as a matrix.
-      * \returns Number of rows 
-      * \details This equals the dimension of the space that the transformation acts on.
-      */
-    Index rows() const { return Side==OnTheLeft ? m_vectors.rows() : m_vectors.cols(); }
-
-    /** \brief Number of columns of transformation viewed as a matrix.
-      * \returns Number of columns
-      * \details This equals the dimension of the space that the transformation acts on.
-      */
-    Index cols() const { return rows(); }
-
-    /** \brief Essential part of a Householder vector.
-      * \param[in]  k  Index of Householder reflection
-      * \returns    Vector containing non-trivial entries of k-th Householder vector
-      *
-      * This function returns the essential part of the Householder vector \f$ v_i \f$. This is a vector of
-      * length \f$ n-i \f$ containing the last \f$ n-i \f$ entries of the vector
-      * \f[ 
-      * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ]. 
-      * \f]
-      * The index \f$ i \f$ equals \p k + shift(), corresponding to the k-th column of the matrix \p v
-      * passed to the constructor.
-      *
-      * \sa setShift(), shift()
-      */
-    const EssentialVectorType essentialVector(Index k) const
-    {
-      eigen_assert(k >= 0 && k < m_length);
-      return internal::hseq_side_dependent_impl<VectorsType,CoeffsType,Side>::essentialVector(*this, k);
-    }
+  /** \brief Number of columns of transformation viewed as a matrix.
+   * \returns Number of columns
+   * \details This equals the dimension of the space that the transformation acts on.
+   */
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return rows(); }
+
+  /** \brief Essential part of a Householder vector.
+   * \param[in]  k  Index of Householder reflection
+   * \returns    Vector containing non-trivial entries of k-th Householder vector
+   *
+   * This function returns the essential part of the Householder vector \f$ v_i \f$. This is a vector of
+   * length \f$ n-i \f$ containing the last \f$ n-i \f$ entries of the vector
+   * \f[
+   * v_i = [\underbrace{0, \ldots, 0}_{i-1\mbox{ zeros}}, 1, \underbrace{*, \ldots,*}_{n-i\mbox{ arbitrary entries}} ].
+   * \f]
+   * The index \f$ i \f$ equals \p k + shift(), corresponding to the k-th column of the matrix \p v
+   * passed to the constructor.
+   *
+   * \sa setShift(), shift()
+   */
+  EIGEN_DEVICE_FUNC const EssentialVectorType essentialVector(Index k) const {
+    eigen_assert(k >= 0 && k < m_length);
+    return internal::hseq_side_dependent_impl<VectorsType, CoeffsType, Side>::essentialVector(*this, k);
+  }
 
-    /** \brief %Transpose of the Householder sequence. */
-    HouseholderSequence transpose() const
-    {
-      return HouseholderSequence(*this).setTrans(!m_trans);
-    }
+  /** \brief %Transpose of the Householder sequence. */
+  TransposeReturnType transpose() const {
+    return TransposeReturnType(m_vectors.conjugate(), m_coeffs)
+        .setReverseFlag(!m_reverse)
+        .setLength(m_length)
+        .setShift(m_shift);
+  }
 
-    /** \brief Complex conjugate of the Householder sequence. */
-    ConjugateReturnType conjugate() const
-    {
-      return ConjugateReturnType(m_vectors.conjugate(), m_coeffs.conjugate())
-             .setTrans(m_trans)
-             .setLength(m_length)
-             .setShift(m_shift);
-    }
+  /** \brief Complex conjugate of the Householder sequence. */
+  ConjugateReturnType conjugate() const {
+    return ConjugateReturnType(m_vectors.conjugate(), m_coeffs.conjugate())
+        .setReverseFlag(m_reverse)
+        .setLength(m_length)
+        .setShift(m_shift);
+  }
 
-    /** \brief Adjoint (conjugate transpose) of the Householder sequence. */
-    ConjugateReturnType adjoint() const
-    {
-      return conjugate().setTrans(!m_trans);
-    }
+  /** \returns an expression of the complex conjugate of \c *this if Cond==true,
+   *           returns \c *this otherwise.
+   */
+  template <bool Cond>
+  EIGEN_DEVICE_FUNC inline std::conditional_t<Cond, ConjugateReturnType, ConstHouseholderSequence> conjugateIf() const {
+    typedef std::conditional_t<Cond, ConjugateReturnType, ConstHouseholderSequence> ReturnType;
+    return ReturnType(m_vectors.template conjugateIf<Cond>(), m_coeffs.template conjugateIf<Cond>());
+  }
 
-    /** \brief Inverse of the Householder sequence (equals the adjoint). */
-    ConjugateReturnType inverse() const { return adjoint(); }
+  /** \brief Adjoint (conjugate transpose) of the Householder sequence. */
+  AdjointReturnType adjoint() const {
+    return AdjointReturnType(m_vectors, m_coeffs.conjugate())
+        .setReverseFlag(!m_reverse)
+        .setLength(m_length)
+        .setShift(m_shift);
+  }
 
-    /** \internal */
-    template<typename DestType> inline void evalTo(DestType& dst) const
-    {
-      Matrix<Scalar, DestType::RowsAtCompileTime, 1,
-             AutoAlign|ColMajor, DestType::MaxRowsAtCompileTime, 1> workspace(rows());
-      evalTo(dst, workspace);
-    }
+  /** \brief Inverse of the Householder sequence (equals the adjoint). */
+  AdjointReturnType inverse() const { return adjoint(); }
 
-    /** \internal */
-    template<typename Dest, typename Workspace>
-    void evalTo(Dest& dst, Workspace& workspace) const
-    {
-      workspace.resize(rows());
-      Index vecs = m_length;
-      if(    internal::is_same<typename internal::remove_all<VectorsType>::type,Dest>::value
-          && internal::extract_data(dst) == internal::extract_data(m_vectors))
-      {
-        // in-place
-        dst.diagonal().setOnes();
-        dst.template triangularView<StrictlyUpper>().setZero();
-        for(Index k = vecs-1; k >= 0; --k)
-        {
-          Index cornerSize = rows() - k - m_shift;
-          if(m_trans)
-            dst.bottomRightCorner(cornerSize, cornerSize)
-               .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data());
-          else
-            dst.bottomRightCorner(cornerSize, cornerSize)
-               .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), workspace.data());
-
-          // clear the off diagonal vector
-          dst.col(k).tail(rows()-k-1).setZero();
-        }
-        // clear the remaining columns if needed
-        for(Index k = 0; k<cols()-vecs ; ++k)
-          dst.col(k).tail(rows()-k-1).setZero();
+  /** \internal */
+  template <typename DestType>
+  inline EIGEN_DEVICE_FUNC void evalTo(DestType& dst) const {
+    Matrix<Scalar, DestType::RowsAtCompileTime, 1, AutoAlign | ColMajor, DestType::MaxRowsAtCompileTime, 1> workspace(
+        rows());
+    evalTo(dst, workspace);
+  }
+
+  /** \internal */
+  template <typename Dest, typename Workspace>
+  EIGEN_DEVICE_FUNC void evalTo(Dest& dst, Workspace& workspace) const {
+    workspace.resize(rows());
+    Index vecs = m_length;
+    if (internal::is_same_dense(dst, m_vectors)) {
+      // in-place
+      dst.diagonal().setOnes();
+      dst.template triangularView<StrictlyUpper>().setZero();
+      for (Index k = vecs - 1; k >= 0; --k) {
+        Index cornerSize = rows() - k - m_shift;
+        if (m_reverse)
+          dst.bottomRightCorner(cornerSize, cornerSize)
+              .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data());
+        else
+          dst.bottomRightCorner(cornerSize, cornerSize)
+              .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), workspace.data());
+
+        // clear the off diagonal vector
+        dst.col(k).tail(rows() - k - 1).setZero();
       }
+      // clear the remaining columns if needed
+      for (Index k = 0; k < cols() - vecs; ++k) dst.col(k).tail(rows() - k - 1).setZero();
+    } else if (m_length > BlockSize) {
+      dst.setIdentity(rows(), rows());
+      if (m_reverse)
+        applyThisOnTheLeft(dst, workspace, true);
       else
-      {
-        dst.setIdentity(rows(), rows());
-        for(Index k = vecs-1; k >= 0; --k)
-        {
-          Index cornerSize = rows() - k - m_shift;
-          if(m_trans)
-            dst.bottomRightCorner(cornerSize, cornerSize)
-               .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0));
-          else
-            dst.bottomRightCorner(cornerSize, cornerSize)
-               .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), &workspace.coeffRef(0));
-        }
+        applyThisOnTheLeft(dst, workspace, true);
+    } else {
+      dst.setIdentity(rows(), rows());
+      for (Index k = vecs - 1; k >= 0; --k) {
+        Index cornerSize = rows() - k - m_shift;
+        if (m_reverse)
+          dst.bottomRightCorner(cornerSize, cornerSize)
+              .applyHouseholderOnTheRight(essentialVector(k), m_coeffs.coeff(k), workspace.data());
+        else
+          dst.bottomRightCorner(cornerSize, cornerSize)
+              .applyHouseholderOnTheLeft(essentialVector(k), m_coeffs.coeff(k), workspace.data());
       }
     }
+  }
 
-    /** \internal */
-    template<typename Dest> inline void applyThisOnTheRight(Dest& dst) const
-    {
-      Matrix<Scalar,1,Dest::RowsAtCompileTime,RowMajor,1,Dest::MaxRowsAtCompileTime> workspace(dst.rows());
-      applyThisOnTheRight(dst, workspace);
-    }
+  /** \internal */
+  template <typename Dest>
+  inline void applyThisOnTheRight(Dest& dst) const {
+    Matrix<Scalar, 1, Dest::RowsAtCompileTime, RowMajor, 1, Dest::MaxRowsAtCompileTime> workspace(dst.rows());
+    applyThisOnTheRight(dst, workspace);
+  }
 
-    /** \internal */
-    template<typename Dest, typename Workspace>
-    inline void applyThisOnTheRight(Dest& dst, Workspace& workspace) const
-    {
-      workspace.resize(dst.rows());
-      for(Index k = 0; k < m_length; ++k)
-      {
-        Index actual_k = m_trans ? m_length-k-1 : k;
-        dst.rightCols(rows()-m_shift-actual_k)
-           .applyHouseholderOnTheRight(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
-      }
+  /** \internal */
+  template <typename Dest, typename Workspace>
+  inline void applyThisOnTheRight(Dest& dst, Workspace& workspace) const {
+    workspace.resize(dst.rows());
+    for (Index k = 0; k < m_length; ++k) {
+      Index actual_k = m_reverse ? m_length - k - 1 : k;
+      dst.rightCols(rows() - m_shift - actual_k)
+          .applyHouseholderOnTheRight(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
     }
+  }
 
-    /** \internal */
-    template<typename Dest> inline void applyThisOnTheLeft(Dest& dst) const
-    {
-      Matrix<Scalar,1,Dest::ColsAtCompileTime,RowMajor,1,Dest::MaxColsAtCompileTime> workspace(dst.cols());
-      applyThisOnTheLeft(dst, workspace);
-    }
+  /** \internal */
+  template <typename Dest>
+  inline void applyThisOnTheLeft(Dest& dst, bool inputIsIdentity = false) const {
+    Matrix<Scalar, 1, Dest::ColsAtCompileTime, RowMajor, 1, Dest::MaxColsAtCompileTime> workspace;
+    applyThisOnTheLeft(dst, workspace, inputIsIdentity);
+  }
 
-    /** \internal */
-    template<typename Dest, typename Workspace>
-    inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace) const
-    {
+  /** \internal */
+  template <typename Dest, typename Workspace>
+  inline void applyThisOnTheLeft(Dest& dst, Workspace& workspace, bool inputIsIdentity = false) const {
+    if (inputIsIdentity && m_reverse) inputIsIdentity = false;
+    // if the entries are large enough, then apply the reflectors by block
+    if (m_length >= BlockSize && dst.cols() > 1) {
+      // Make sure we have at least 2 useful blocks, otherwise it is point-less:
+      Index blockSize = m_length < Index(2 * BlockSize) ? (m_length + 1) / 2 : Index(BlockSize);
+      for (Index i = 0; i < m_length; i += blockSize) {
+        Index end = m_reverse ? (std::min)(m_length, i + blockSize) : m_length - i;
+        Index k = m_reverse ? i : (std::max)(Index(0), end - blockSize);
+        Index bs = end - k;
+        Index start = k + m_shift;
+
+        typedef Block<internal::remove_all_t<VectorsType>, Dynamic, Dynamic> SubVectorsType;
+        SubVectorsType sub_vecs1(m_vectors.const_cast_derived(), Side == OnTheRight ? k : start,
+                                 Side == OnTheRight ? start : k, Side == OnTheRight ? bs : m_vectors.rows() - start,
+                                 Side == OnTheRight ? m_vectors.cols() - start : bs);
+        std::conditional_t<Side == OnTheRight, Transpose<SubVectorsType>, SubVectorsType&> sub_vecs(sub_vecs1);
+
+        Index dstRows = rows() - m_shift - k;
+
+        if (inputIsIdentity) {
+          Block<Dest, Dynamic, Dynamic> sub_dst = dst.bottomRightCorner(dstRows, dstRows);
+          apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_reverse);
+        } else {
+          auto sub_dst = dst.bottomRows(dstRows);
+          apply_block_householder_on_the_left(sub_dst, sub_vecs, m_coeffs.segment(k, bs), !m_reverse);
+        }
+      }
+    } else {
       workspace.resize(dst.cols());
-      for(Index k = 0; k < m_length; ++k)
-      {
-        Index actual_k = m_trans ? k : m_length-k-1;
-        dst.bottomRows(rows()-m_shift-actual_k)
-           .applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
+      for (Index k = 0; k < m_length; ++k) {
+        Index actual_k = m_reverse ? k : m_length - k - 1;
+        Index dstRows = rows() - m_shift - actual_k;
+
+        if (inputIsIdentity) {
+          Block<Dest, Dynamic, Dynamic> sub_dst = dst.bottomRightCorner(dstRows, dstRows);
+          sub_dst.applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
+        } else {
+          auto sub_dst = dst.bottomRows(dstRows);
+          sub_dst.applyHouseholderOnTheLeft(essentialVector(actual_k), m_coeffs.coeff(actual_k), workspace.data());
+        }
       }
     }
+  }
 
-    /** \brief Computes the product of a Householder sequence with a matrix.
-      * \param[in]  other  %Matrix being multiplied.
-      * \returns    Expression object representing the product.
-      *
-      * This function computes \f$ HM \f$ where \f$ H \f$ is the Householder sequence represented by \p *this
-      * and \f$ M \f$ is the matrix \p other.
-      */
-    template<typename OtherDerived>
-    typename internal::matrix_type_times_scalar_type<Scalar, OtherDerived>::Type operator*(const MatrixBase<OtherDerived>& other) const
-    {
-      typename internal::matrix_type_times_scalar_type<Scalar, OtherDerived>::Type
-        res(other.template cast<typename internal::matrix_type_times_scalar_type<Scalar,OtherDerived>::ResultScalar>());
-      applyThisOnTheLeft(res);
-      return res;
-    }
+  /** \brief Computes the product of a Householder sequence with a matrix.
+   * \param[in]  other  %Matrix being multiplied.
+   * \returns    Expression object representing the product.
+   *
+   * This function computes \f$ HM \f$ where \f$ H \f$ is the Householder sequence represented by \p *this
+   * and \f$ M \f$ is the matrix \p other.
+   */
+  template <typename OtherDerived>
+  typename internal::matrix_type_times_scalar_type<Scalar, OtherDerived>::Type operator*(
+      const MatrixBase<OtherDerived>& other) const {
+    typename internal::matrix_type_times_scalar_type<Scalar, OtherDerived>::Type res(
+        other.template cast<typename internal::matrix_type_times_scalar_type<Scalar, OtherDerived>::ResultScalar>());
+    applyThisOnTheLeft(res, internal::is_identity<OtherDerived>::value && res.rows() == res.cols());
+    return res;
+  }
 
-    template<typename _VectorsType, typename _CoeffsType, int _Side> friend struct internal::hseq_side_dependent_impl;
-
-    /** \brief Sets the length of the Householder sequence.
-      * \param [in]  length  New value for the length.
-      *
-      * By default, the length \f$ n \f$ of the Householder sequence \f$ H = H_0 H_1 \ldots H_{n-1} \f$ is set
-      * to the number of columns of the matrix \p v passed to the constructor, or the number of rows if that
-      * is smaller. After this function is called, the length equals \p length.
-      *
-      * \sa length()
-      */
-    HouseholderSequence& setLength(Index length)
-    {
-      m_length = length;
-      return *this;
-    }
+  template <typename VectorsType_, typename CoeffsType_, int Side_>
+  friend struct internal::hseq_side_dependent_impl;
+
+  /** \brief Sets the length of the Householder sequence.
+   * \param [in]  length  New value for the length.
+   *
+   * By default, the length \f$ n \f$ of the Householder sequence \f$ H = H_0 H_1 \ldots H_{n-1} \f$ is set
+   * to the number of columns of the matrix \p v passed to the constructor, or the number of rows if that
+   * is smaller. After this function is called, the length equals \p length.
+   *
+   * \sa length()
+   */
+  EIGEN_DEVICE_FUNC HouseholderSequence& setLength(Index length) {
+    m_length = length;
+    return *this;
+  }
 
-    /** \brief Sets the shift of the Householder sequence.
-      * \param [in]  shift  New value for the shift.
-      *
-      * By default, a %HouseholderSequence object represents \f$ H = H_0 H_1 \ldots H_{n-1} \f$ and the i-th
-      * column of the matrix \p v passed to the constructor corresponds to the i-th Householder
-      * reflection. After this function is called, the object represents \f$ H = H_{\mathrm{shift}}
-      * H_{\mathrm{shift}+1} \ldots H_{n-1} \f$ and the i-th column of \p v corresponds to the (shift+i)-th
-      * Householder reflection.
-      *
-      * \sa shift()
-      */
-    HouseholderSequence& setShift(Index shift)
-    {
-      m_shift = shift;
-      return *this;
-    }
+  /** \brief Sets the shift of the Householder sequence.
+   * \param [in]  shift  New value for the shift.
+   *
+   * By default, a %HouseholderSequence object represents \f$ H = H_0 H_1 \ldots H_{n-1} \f$ and the i-th
+   * column of the matrix \p v passed to the constructor corresponds to the i-th Householder
+   * reflection. After this function is called, the object represents \f$ H = H_{\mathrm{shift}}
+   * H_{\mathrm{shift}+1} \ldots H_{n-1} \f$ and the i-th column of \p v corresponds to the (shift+i)-th
+   * Householder reflection.
+   *
+   * \sa shift()
+   */
+  EIGEN_DEVICE_FUNC HouseholderSequence& setShift(Index shift) {
+    m_shift = shift;
+    return *this;
+  }
 
-    Index length() const { return m_length; }  /**< \brief Returns the length of the Householder sequence. */
-    Index shift() const { return m_shift; }    /**< \brief Returns the shift of the Householder sequence. */
-
-    /* Necessary for .adjoint() and .conjugate() */
-    template <typename VectorsType2, typename CoeffsType2, int Side2> friend class HouseholderSequence;
-
-  protected:
-
-    /** \brief Sets the transpose flag.
-      * \param [in]  trans  New value of the transpose flag.
-      *
-      * By default, the transpose flag is not set. If the transpose flag is set, then this object represents 
-      * \f$ H^T = H_{n-1}^T \ldots H_1^T H_0^T \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$.
-      *
-      * \sa trans()
-      */
-    HouseholderSequence& setTrans(bool trans)
-    {
-      m_trans = trans;
-      return *this;
-    }
+  EIGEN_DEVICE_FUNC Index length() const {
+    return m_length;
+  } /**< \brief Returns the length of the Householder sequence. */
+
+  EIGEN_DEVICE_FUNC Index shift() const {
+    return m_shift;
+  } /**< \brief Returns the shift of the Householder sequence. */
+
+  /* Necessary for .adjoint() and .conjugate() */
+  template <typename VectorsType2, typename CoeffsType2, int Side2>
+  friend class HouseholderSequence;
+
+ protected:
+  /** \internal
+   * \brief Sets the reverse flag.
+   * \param [in]  reverse  New value of the reverse flag.
+   *
+   * By default, the reverse flag is not set. If the reverse flag is set, then this object represents
+   * \f$ H^r = H_{n-1} \ldots H_1 H_0 \f$ instead of \f$ H = H_0 H_1 \ldots H_{n-1} \f$.
+   * \note For real valued HouseholderSequence this is equivalent to transposing \f$ H \f$.
+   *
+   * \sa reverseFlag(), transpose(), adjoint()
+   */
+  HouseholderSequence& setReverseFlag(bool reverse) {
+    m_reverse = reverse;
+    return *this;
+  }
 
-    bool trans() const { return m_trans; }     /**< \brief Returns the transpose flag. */
+  bool reverseFlag() const { return m_reverse; } /**< \internal \brief Returns the reverse flag. */
 
-    typename VectorsType::Nested m_vectors;
-    typename CoeffsType::Nested m_coeffs;
-    bool m_trans;
-    Index m_length;
-    Index m_shift;
+  typename VectorsType::Nested m_vectors;
+  typename CoeffsType::Nested m_coeffs;
+  bool m_reverse;
+  Index m_length;
+  Index m_shift;
+  enum { BlockSize = 48 };
 };
 
 /** \brief Computes the product of a matrix with a Householder sequence.
-  * \param[in]  other  %Matrix being multiplied.
-  * \param[in]  h      %HouseholderSequence being multiplied.
-  * \returns    Expression object representing the product.
-  *
-  * This function computes \f$ MH \f$ where \f$ M \f$ is the matrix \p other and \f$ H \f$ is the
-  * Householder sequence represented by \p h.
-  */
-template<typename OtherDerived, typename VectorsType, typename CoeffsType, int Side>
-typename internal::matrix_type_times_scalar_type<typename VectorsType::Scalar,OtherDerived>::Type operator*(const MatrixBase<OtherDerived>& other, const HouseholderSequence<VectorsType,CoeffsType,Side>& h)
-{
-  typename internal::matrix_type_times_scalar_type<typename VectorsType::Scalar,OtherDerived>::Type
-    res(other.template cast<typename internal::matrix_type_times_scalar_type<typename VectorsType::Scalar,OtherDerived>::ResultScalar>());
+ * \param[in]  other  %Matrix being multiplied.
+ * \param[in]  h      %HouseholderSequence being multiplied.
+ * \returns    Expression object representing the product.
+ *
+ * This function computes \f$ MH \f$ where \f$ M \f$ is the matrix \p other and \f$ H \f$ is the
+ * Householder sequence represented by \p h.
+ */
+template <typename OtherDerived, typename VectorsType, typename CoeffsType, int Side>
+typename internal::matrix_type_times_scalar_type<typename VectorsType::Scalar, OtherDerived>::Type operator*(
+    const MatrixBase<OtherDerived>& other, const HouseholderSequence<VectorsType, CoeffsType, Side>& h) {
+  typename internal::matrix_type_times_scalar_type<typename VectorsType::Scalar, OtherDerived>::Type res(
+      other.template cast<typename internal::matrix_type_times_scalar_type<typename VectorsType::Scalar,
+                                                                           OtherDerived>::ResultScalar>());
   h.applyThisOnTheRight(res);
   return res;
 }
 
-/** \ingroup Householder_Module \householder_module
-  * \brief Convenience function for constructing a Householder sequence. 
-  * \returns A HouseholderSequence constructed from the specified arguments.
-  */
-template<typename VectorsType, typename CoeffsType>
-HouseholderSequence<VectorsType,CoeffsType> householderSequence(const VectorsType& v, const CoeffsType& h)
-{
-  return HouseholderSequence<VectorsType,CoeffsType,OnTheLeft>(v, h);
+/** \ingroup Householder_Module
+ * \householder_module
+ * \brief Convenience function for constructing a Householder sequence.
+ * \returns A HouseholderSequence constructed from the specified arguments.
+ */
+template <typename VectorsType, typename CoeffsType>
+HouseholderSequence<VectorsType, CoeffsType> householderSequence(const VectorsType& v, const CoeffsType& h) {
+  return HouseholderSequence<VectorsType, CoeffsType, OnTheLeft>(v, h);
 }
 
-/** \ingroup Householder_Module \householder_module
-  * \brief Convenience function for constructing a Householder sequence. 
-  * \returns A HouseholderSequence constructed from the specified arguments.
-  * \details This function differs from householderSequence() in that the template argument \p OnTheSide of
-  * the constructed HouseholderSequence is set to OnTheRight, instead of the default OnTheLeft.
-  */
-template<typename VectorsType, typename CoeffsType>
-HouseholderSequence<VectorsType,CoeffsType,OnTheRight> rightHouseholderSequence(const VectorsType& v, const CoeffsType& h)
-{
-  return HouseholderSequence<VectorsType,CoeffsType,OnTheRight>(v, h);
+/** \ingroup Householder_Module
+ * \householder_module
+ * \brief Convenience function for constructing a Householder sequence.
+ * \returns A HouseholderSequence constructed from the specified arguments.
+ * \details This function differs from householderSequence() in that the template argument \p OnTheSide of
+ * the constructed HouseholderSequence is set to OnTheRight, instead of the default OnTheLeft.
+ */
+template <typename VectorsType, typename CoeffsType>
+HouseholderSequence<VectorsType, CoeffsType, OnTheRight> rightHouseholderSequence(const VectorsType& v,
+                                                                                  const CoeffsType& h) {
+  return HouseholderSequence<VectorsType, CoeffsType, OnTheRight>(v, h);
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_HOUSEHOLDER_SEQUENCE_H
+#endif  // EIGEN_HOUSEHOLDER_SEQUENCE_H
diff --git a/inst/include/Eigen/src/Householder/InternalHeaderCheck.h b/inst/include/Eigen/src/Householder/InternalHeaderCheck.h
new file mode 100644
index 00000000..70de89bf
--- /dev/null
+++ b/inst/include/Eigen/src/Householder/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_HOUSEHOLDER_MODULE_H
+#error "Please include Eigen/Householder instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/inst/include/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
index 1f3c060d..904d853f 100644
--- a/inst/include/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
+++ b/inst/include/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,140 +10,204 @@
 #ifndef EIGEN_BASIC_PRECONDITIONERS_H
 #define EIGEN_BASIC_PRECONDITIONERS_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \ingroup IterativeLinearSolvers_Module
   * \brief A preconditioner based on the digonal entries
   *
   * This class allows to approximately solve for A.x = b problems assuming A is a diagonal matrix.
   * In other words, this preconditioner neglects all off diagonal entries and, in Eigen's language, solves for:
-  * \code
-  * A.diagonal().asDiagonal() . x = b
-  * \endcode
+    \code
+    A.diagonal().asDiagonal() . x = b
+    \endcode
+  *
+  * \tparam Scalar_ the type of the scalar.
   *
-  * \tparam _Scalar the type of the scalar.
+  * \implsparsesolverconcept
   *
   * This preconditioner is suitable for both selfadjoint and general problems.
   * The diagonal entries are pre-inverted and stored into a dense vector.
   *
   * \note A variant that has yet to be implemented would attempt to preserve the norm of each column.
   *
+  * \sa class LeastSquareDiagonalPreconditioner, class ConjugateGradient
   */
-template <typename _Scalar>
-class DiagonalPreconditioner
-{
-    typedef _Scalar Scalar;
-    typedef Matrix<Scalar,Dynamic,1> Vector;
-    typedef typename Vector::Index Index;
-
-  public:
-    // this typedef is only to export the scalar type and compile-time dimensions to solve_retval
-    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
-
-    DiagonalPreconditioner() : m_isInitialized(false) {}
-
-    template<typename MatType>
-    DiagonalPreconditioner(const MatType& mat) : m_invdiag(mat.cols())
-    {
-      compute(mat);
-    }
+template <typename Scalar_>
+class DiagonalPreconditioner {
+  typedef Scalar_ Scalar;
+  typedef Matrix<Scalar, Dynamic, 1> Vector;
+
+ public:
+  typedef typename Vector::StorageIndex StorageIndex;
+  enum { ColsAtCompileTime = Dynamic, MaxColsAtCompileTime = Dynamic };
+
+  DiagonalPreconditioner() : m_isInitialized(false) {}
+
+  template <typename MatType>
+  explicit DiagonalPreconditioner(const MatType& mat) : m_invdiag(mat.cols()) {
+    compute(mat);
+  }
+
+  constexpr Index rows() const noexcept { return m_invdiag.size(); }
+  constexpr Index cols() const noexcept { return m_invdiag.size(); }
+
+  template <typename MatType>
+  DiagonalPreconditioner& analyzePattern(const MatType&) {
+    return *this;
+  }
 
-    Index rows() const { return m_invdiag.size(); }
-    Index cols() const { return m_invdiag.size(); }
-    
-    template<typename MatType>
-    DiagonalPreconditioner& analyzePattern(const MatType& )
-    {
-      return *this;
+  template <typename MatType>
+  DiagonalPreconditioner& factorize(const MatType& mat) {
+    m_invdiag.resize(mat.cols());
+    for (int j = 0; j < mat.outerSize(); ++j) {
+      typename MatType::InnerIterator it(mat, j);
+      while (it && it.index() != j) ++it;
+      if (it && it.index() == j && it.value() != Scalar(0))
+        m_invdiag(j) = Scalar(1) / it.value();
+      else
+        m_invdiag(j) = Scalar(1);
     }
-    
-    template<typename MatType>
-    DiagonalPreconditioner& factorize(const MatType& mat)
-    {
-      m_invdiag.resize(mat.cols());
-      for(int j=0; j<mat.outerSize(); ++j)
-      {
-        typename MatType::InnerIterator it(mat,j);
-        while(it && it.index()!=j) ++it;
-        if(it && it.index()==j && it.value()!=Scalar(0))
-          m_invdiag(j) = Scalar(1)/it.value();
+    m_isInitialized = true;
+    return *this;
+  }
+
+  template <typename MatType>
+  DiagonalPreconditioner& compute(const MatType& mat) {
+    return factorize(mat);
+  }
+
+  /** \internal */
+  template <typename Rhs, typename Dest>
+  void _solve_impl(const Rhs& b, Dest& x) const {
+    x = m_invdiag.array() * b.array();
+  }
+
+  template <typename Rhs>
+  inline const Solve<DiagonalPreconditioner, Rhs> solve(const MatrixBase<Rhs>& b) const {
+    eigen_assert(m_isInitialized && "DiagonalPreconditioner is not initialized.");
+    eigen_assert(m_invdiag.size() == b.rows() &&
+                 "DiagonalPreconditioner::solve(): invalid number of rows of the right hand side matrix b");
+    return Solve<DiagonalPreconditioner, Rhs>(*this, b.derived());
+  }
+
+  ComputationInfo info() { return Success; }
+
+ protected:
+  Vector m_invdiag;
+  bool m_isInitialized;
+};
+
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief Jacobi preconditioner for LeastSquaresConjugateGradient
+  *
+  * This class allows to approximately solve for A' A x  = A' b problems assuming A' A is a diagonal matrix.
+  * In other words, this preconditioner neglects all off diagonal entries and, in Eigen's language, solves for:
+    \code
+    (A.adjoint() * A).diagonal().asDiagonal() * x = b
+    \endcode
+  *
+  * \tparam Scalar_ the type of the scalar.
+  *
+  * \implsparsesolverconcept
+  *
+  * The diagonal entries are pre-inverted and stored into a dense vector.
+  *
+  * \sa class LeastSquaresConjugateGradient, class DiagonalPreconditioner
+  */
+template <typename Scalar_>
+class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<Scalar_> {
+  typedef Scalar_ Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef DiagonalPreconditioner<Scalar_> Base;
+  using Base::m_invdiag;
+
+ public:
+  LeastSquareDiagonalPreconditioner() : Base() {}
+
+  template <typename MatType>
+  explicit LeastSquareDiagonalPreconditioner(const MatType& mat) : Base() {
+    compute(mat);
+  }
+
+  template <typename MatType>
+  LeastSquareDiagonalPreconditioner& analyzePattern(const MatType&) {
+    return *this;
+  }
+
+  template <typename MatType>
+  LeastSquareDiagonalPreconditioner& factorize(const MatType& mat) {
+    // Compute the inverse squared-norm of each column of mat
+    m_invdiag.resize(mat.cols());
+    if (MatType::IsRowMajor) {
+      m_invdiag.setZero();
+      for (Index j = 0; j < mat.outerSize(); ++j) {
+        for (typename MatType::InnerIterator it(mat, j); it; ++it) m_invdiag(it.index()) += numext::abs2(it.value());
+      }
+      for (Index j = 0; j < mat.cols(); ++j)
+        if (numext::real(m_invdiag(j)) > RealScalar(0)) m_invdiag(j) = RealScalar(1) / numext::real(m_invdiag(j));
+    } else {
+      for (Index j = 0; j < mat.outerSize(); ++j) {
+        RealScalar sum = mat.col(j).squaredNorm();
+        if (sum > RealScalar(0))
+          m_invdiag(j) = RealScalar(1) / sum;
         else
-          m_invdiag(j) = Scalar(1);
+          m_invdiag(j) = RealScalar(1);
       }
-      m_isInitialized = true;
-      return *this;
-    }
-    
-    template<typename MatType>
-    DiagonalPreconditioner& compute(const MatType& mat)
-    {
-      return factorize(mat);
     }
+    Base::m_isInitialized = true;
+    return *this;
+  }
 
-    template<typename Rhs, typename Dest>
-    void _solve(const Rhs& b, Dest& x) const
-    {
-      x = m_invdiag.array() * b.array() ;
-    }
+  template <typename MatType>
+  LeastSquareDiagonalPreconditioner& compute(const MatType& mat) {
+    return factorize(mat);
+  }
 
-    template<typename Rhs> inline const internal::solve_retval<DiagonalPreconditioner, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "DiagonalPreconditioner is not initialized.");
-      eigen_assert(m_invdiag.size()==b.rows()
-                && "DiagonalPreconditioner::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<DiagonalPreconditioner, Rhs>(*this, b.derived());
-    }
+  ComputationInfo info() { return Success; }
 
-  protected:
-    Vector m_invdiag;
-    bool m_isInitialized;
+ protected:
 };
 
-namespace internal {
+/** \ingroup IterativeLinearSolvers_Module
+ * \brief A naive preconditioner which approximates any matrix as the identity matrix
+ *
+ * \implsparsesolverconcept
+ *
+ * \sa class DiagonalPreconditioner
+ */
+class IdentityPreconditioner {
+ public:
+  IdentityPreconditioner() {}
+
+  template <typename MatrixType>
+  explicit IdentityPreconditioner(const MatrixType&) {}
+
+  template <typename MatrixType>
+  IdentityPreconditioner& analyzePattern(const MatrixType&) {
+    return *this;
+  }
 
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<DiagonalPreconditioner<_MatrixType>, Rhs>
-  : solve_retval_base<DiagonalPreconditioner<_MatrixType>, Rhs>
-{
-  typedef DiagonalPreconditioner<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+  template <typename MatrixType>
+  IdentityPreconditioner& factorize(const MatrixType&) {
+    return *this;
+  }
 
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
+  template <typename MatrixType>
+  IdentityPreconditioner& compute(const MatrixType&) {
+    return *this;
   }
-};
 
-}
+  template <typename Rhs>
+  inline const Rhs& solve(const Rhs& b) const {
+    return b;
+  }
 
-/** \ingroup IterativeLinearSolvers_Module
-  * \brief A naive preconditioner which approximates any matrix as the identity matrix
-  *
-  * \sa class DiagonalPreconditioner
-  */
-class IdentityPreconditioner
-{
-  public:
-
-    IdentityPreconditioner() {}
-
-    template<typename MatrixType>
-    IdentityPreconditioner(const MatrixType& ) {}
-    
-    template<typename MatrixType>
-    IdentityPreconditioner& analyzePattern(const MatrixType& ) { return *this; }
-    
-    template<typename MatrixType>
-    IdentityPreconditioner& factorize(const MatrixType& ) { return *this; }
-
-    template<typename MatrixType>
-    IdentityPreconditioner& compute(const MatrixType& ) { return *this; }
-    
-    template<typename Rhs>
-    inline const Rhs& solve(const Rhs& b) const { return b; }
+  ComputationInfo info() { return Success; }
 };
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_BASIC_PRECONDITIONERS_H
+#endif  // EIGEN_BASIC_PRECONDITIONERS_H
diff --git a/inst/include/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h b/inst/include/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
index 55122190..8fdeb849 100644
--- a/inst/include/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
+++ b/inst/include/Eigen/src/IterativeLinearSolvers/BiCGSTAB.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,253 +11,207 @@
 #ifndef EIGEN_BICGSTAB_H
 #define EIGEN_BICGSTAB_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 /** \internal Low-level bi conjugate gradient stabilized algorithm
-  * \param mat The matrix A
-  * \param rhs The right hand side vector b
-  * \param x On input and initial solution, on output the computed solution.
-  * \param precond A preconditioner being able to efficiently solve for an
-  *                approximation of Ax=b (regardless of b)
-  * \param iters On input the max number of iteration, on output the number of performed iterations.
-  * \param tol_error On input the tolerance error, on output an estimation of the relative error.
-  * \return false in the case of numerical issue, for example a break down of BiCGSTAB. 
-  */
-template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
-bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x,
-              const Preconditioner& precond, int& iters,
-              typename Dest::RealScalar& tol_error)
-{
-  using std::sqrt;
-  using std::abs;
+ * \param mat The matrix A
+ * \param rhs The right hand side vector b
+ * \param x On input and initial solution, on output the computed solution.
+ * \param precond A preconditioner being able to efficiently solve for an
+ *                approximation of Ax=b (regardless of b)
+ * \param iters On input the max number of iteration, on output the number of performed iterations.
+ * \param tol_error On input the tolerance error, on output an estimation of the relative error.
+ * \return false in the case of numerical issue, for example a break down of BiCGSTAB.
+ */
+template <typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+bool bicgstab(const MatrixType& mat, const Rhs& rhs, Dest& x, const Preconditioner& precond, Index& iters,
+              typename Dest::RealScalar& tol_error) {
   typedef typename Dest::RealScalar RealScalar;
   typedef typename Dest::Scalar Scalar;
-  typedef Matrix<Scalar,Dynamic,1> VectorType;
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
   RealScalar tol = tol_error;
-  int maxIters = iters;
+  Index maxIters = iters;
 
-  int n = mat.cols();
-  VectorType r  = rhs - mat * x;
+  Index n = mat.cols();
+  VectorType r = rhs - mat * x;
   VectorType r0 = r;
-  
-  RealScalar r0_sqnorm = r0.squaredNorm();
-  RealScalar rhs_sqnorm = rhs.squaredNorm();
-  if(rhs_sqnorm == 0)
-  {
+
+  RealScalar r0_norm = r0.stableNorm();
+  RealScalar r_norm = r0_norm;
+  RealScalar rhs_norm = rhs.stableNorm();
+  if (rhs_norm == 0) {
     x.setZero();
     return true;
   }
-  Scalar rho    = 1;
-  Scalar alpha  = 1;
-  Scalar w      = 1;
-  
+  Scalar rho(1);
+  Scalar alpha(0);
+  Scalar w(1);
+
   VectorType v = VectorType::Zero(n), p = VectorType::Zero(n);
-  VectorType y(n),  z(n);
+  VectorType y(n), z(n);
   VectorType kt(n), ks(n);
 
   VectorType s(n), t(n);
 
-  RealScalar tol2 = tol*tol;
-  RealScalar eps2 = NumTraits<Scalar>::epsilon()*NumTraits<Scalar>::epsilon();
-  int i = 0;
-  int restarts = 0;
+  RealScalar eps = NumTraits<Scalar>::epsilon();
+  Index i = 0;
+  Index restarts = 0;
 
-  while ( r.squaredNorm()/rhs_sqnorm > tol2 && i<maxIters )
-  {
+  while (r_norm > tol && i < maxIters) {
     Scalar rho_old = rho;
-
     rho = r0.dot(r);
-    if (abs(rho) < eps2*r0_sqnorm)
-    {
-      // The new residual vector became too orthogonal to the arbitrarily choosen direction r0
+    if (Eigen::numext::abs(rho) / Eigen::numext::maxi(r0_norm, r_norm) < eps * Eigen::numext::mini(r0_norm, r_norm)) {
+      // The new residual vector became too orthogonal to the arbitrarily chosen direction r0
       // Let's restart with a new r0:
+      r = rhs - mat * x;
       r0 = r;
-      rho = r0_sqnorm = r.squaredNorm();
-      if(restarts++ == 0)
-        i = 0;
+      rho = r.squaredNorm();
+      r0_norm = r.stableNorm();
+      alpha = Scalar(0);
+      w = Scalar(1);
+      if (restarts++ == 0) i = 0;
     }
-    Scalar beta = (rho/rho_old) * (alpha / w);
+    Scalar beta = (rho / rho_old) * (alpha / w);
     p = r + beta * (p - w * v);
-    
+
     y = precond.solve(p);
-    
-    v.noalias() = mat * y;
 
-    alpha = rho / r0.dot(v);
+    v.noalias() = mat * y;
+    Scalar theta = r0.dot(v);
+    // For small angles ∠(r0, v) < eps, random restart.
+    RealScalar v_norm = v.stableNorm();
+    if (Eigen::numext::abs(theta) / Eigen::numext::maxi(r0_norm, v_norm) < eps * Eigen::numext::mini(r0_norm, v_norm)) {
+      r = rhs - mat * x;
+      r0.setRandom();
+      r0_norm = r0.stableNorm();
+      rho = Scalar(1);
+      alpha = Scalar(0);
+      w = Scalar(1);
+      if (restarts++ == 0) i = 0;
+      continue;
+    }
+    alpha = rho / theta;
     s = r - alpha * v;
 
     z = precond.solve(s);
     t.noalias() = mat * z;
 
     RealScalar tmp = t.squaredNorm();
-    if(tmp>RealScalar(0))
+    if (tmp > RealScalar(0)) {
       w = t.dot(s) / tmp;
-    else
+    } else {
       w = Scalar(0);
+    }
     x += alpha * y + w * z;
     r = s - w * t;
+    r_norm = r.stableNorm();
     ++i;
   }
-  tol_error = sqrt(r.squaredNorm()/rhs_sqnorm);
+
+  tol_error = r_norm / rhs_norm;
   iters = i;
-  return true; 
+  return true;
 }
 
-}
+}  // namespace internal
 
-template< typename _MatrixType,
-          typename _Preconditioner = DiagonalPreconditioner<typename _MatrixType::Scalar> >
+template <typename MatrixType_, typename Preconditioner_ = DiagonalPreconditioner<typename MatrixType_::Scalar> >
 class BiCGSTAB;
 
 namespace internal {
 
-template< typename _MatrixType, typename _Preconditioner>
-struct traits<BiCGSTAB<_MatrixType,_Preconditioner> >
-{
-  typedef _MatrixType MatrixType;
-  typedef _Preconditioner Preconditioner;
+template <typename MatrixType_, typename Preconditioner_>
+struct traits<BiCGSTAB<MatrixType_, Preconditioner_> > {
+  typedef MatrixType_ MatrixType;
+  typedef Preconditioner_ Preconditioner;
 };
 
-}
+}  // namespace internal
 
 /** \ingroup IterativeLinearSolvers_Module
-  * \brief A bi conjugate gradient stabilized solver for sparse square problems
-  *
-  * This class allows to solve for A.x = b sparse linear problems using a bi conjugate gradient
-  * stabilized algorithm. The vectors x and b can be either dense or sparse.
-  *
-  * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
-  * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
-  *
-  * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
-  * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
-  * and NumTraits<Scalar>::epsilon() for the tolerance.
-  * 
-  * This class can be used as the direct solver classes. Here is a typical usage example:
-  * \code
-  * int n = 10000;
-  * VectorXd x(n), b(n);
-  * SparseMatrix<double> A(n,n);
-  * // fill A and b
-  * BiCGSTAB<SparseMatrix<double> > solver;
-  * solver.compute(A);
-  * x = solver.solve(b);
-  * std::cout << "#iterations:     " << solver.iterations() << std::endl;
-  * std::cout << "estimated error: " << solver.error()      << std::endl;
-  * // update b, and solve again
-  * x = solver.solve(b);
-  * \endcode
-  * 
-  * By default the iterations start with x=0 as an initial guess of the solution.
-  * One can control the start using the solveWithGuess() method.
-  * 
-  * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
-  */
-template< typename _MatrixType, typename _Preconditioner>
-class BiCGSTAB : public IterativeSolverBase<BiCGSTAB<_MatrixType,_Preconditioner> >
-{
+ * \brief A bi conjugate gradient stabilized solver for sparse square problems
+ *
+ * This class allows to solve for A.x = b sparse linear problems using a bi conjugate gradient
+ * stabilized algorithm. The vectors x and b can be either dense or sparse.
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, can be a dense or a sparse matrix.
+ * \tparam Preconditioner_ the type of the preconditioner. Default is DiagonalPreconditioner
+ *
+ * \implsparsesolverconcept
+ *
+ * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+ * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+ * and NumTraits<Scalar>::epsilon() for the tolerance.
+ *
+ * The tolerance corresponds to the relative residual error: |Ax-b|/|b|
+ *
+ * \b Performance: when using sparse matrices, best performance is achied for a row-major sparse matrix format.
+ * Moreover, in this case multi-threading can be exploited if the user code is compiled with OpenMP enabled.
+ * See \ref TopicMultiThreading for details.
+ *
+ * This class can be used as the direct solver classes. Here is a typical usage example:
+ * \include BiCGSTAB_simple.cpp
+ *
+ * By default the iterations start with x=0 as an initial guess of the solution.
+ * One can control the start using the solveWithGuess() method.
+ *
+ * BiCGSTAB can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+ *
+ * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+ */
+template <typename MatrixType_, typename Preconditioner_>
+class BiCGSTAB : public IterativeSolverBase<BiCGSTAB<MatrixType_, Preconditioner_> > {
   typedef IterativeSolverBase<BiCGSTAB> Base;
-  using Base::mp_matrix;
   using Base::m_error;
-  using Base::m_iterations;
   using Base::m_info;
   using Base::m_isInitialized;
-public:
-  typedef _MatrixType MatrixType;
+  using Base::m_iterations;
+  using Base::matrix;
+
+ public:
+  typedef MatrixType_ MatrixType;
   typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::RealScalar RealScalar;
-  typedef _Preconditioner Preconditioner;
-
-public:
+  typedef Preconditioner_ Preconditioner;
 
+ public:
   /** Default constructor. */
   BiCGSTAB() : Base() {}
 
   /** Initialize the solver with matrix \a A for further \c Ax=b solving.
-    * 
-    * This constructor is a shortcut for the default constructor followed
-    * by a call to compute().
-    * 
-    * \warning this class stores a reference to the matrix A as well as some
-    * precomputed values that depend on it. Therefore, if \a A is changed
-    * this class becomes invalid. Call compute() to update it with the new
-    * matrix A, or modify a copy of A.
-    */
-  template<typename MatrixDerived>
+   *
+   * This constructor is a shortcut for the default constructor followed
+   * by a call to compute().
+   *
+   * \warning this class stores a reference to the matrix A as well as some
+   * precomputed values that depend on it. Therefore, if \a A is changed
+   * this class becomes invalid. Call compute() to update it with the new
+   * matrix A, or modify a copy of A.
+   */
+  template <typename MatrixDerived>
   explicit BiCGSTAB(const EigenBase<MatrixDerived>& A) : Base(A.derived()) {}
 
   ~BiCGSTAB() {}
-  
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
-    * \a x0 as an initial solution.
-    *
-    * \sa compute()
-    */
-  template<typename Rhs,typename Guess>
-  inline const internal::solve_retval_with_guess<BiCGSTAB, Rhs, Guess>
-  solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
-  {
-    eigen_assert(m_isInitialized && "BiCGSTAB is not initialized.");
-    eigen_assert(Base::rows()==b.rows()
-              && "BiCGSTAB::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::solve_retval_with_guess
-            <BiCGSTAB, Rhs, Guess>(*this, b.derived(), x0);
-  }
-  
-  /** \internal */
-  template<typename Rhs,typename Dest>
-  void _solveWithGuess(const Rhs& b, Dest& x) const
-  {    
-    bool failed = false;
-    for(int j=0; j<b.cols(); ++j)
-    {
-      m_iterations = Base::maxIterations();
-      m_error = Base::m_tolerance;
-      
-      typename Dest::ColXpr xj(x,j);
-      if(!internal::bicgstab(*mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_error))
-        failed = true;
-    }
-    m_info = failed ? NumericalIssue
-           : m_error <= Base::m_tolerance ? Success
-           : NoConvergence;
-    m_isInitialized = true;
-  }
 
   /** \internal */
-  template<typename Rhs,typename Dest>
-  void _solve(const Rhs& b, Dest& x) const
-  {
-//     x.setZero();
-  x = b;
-    _solveWithGuess(b,x);
-  }
-
-protected:
+  template <typename Rhs, typename Dest>
+  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const {
+    m_iterations = Base::maxIterations();
+    m_error = Base::m_tolerance;
 
-};
-
-
-namespace internal {
+    bool ret = internal::bicgstab(matrix(), b, x, Base::m_preconditioner, m_iterations, m_error);
 
-  template<typename _MatrixType, typename _Preconditioner, typename Rhs>
-struct solve_retval<BiCGSTAB<_MatrixType, _Preconditioner>, Rhs>
-  : solve_retval_base<BiCGSTAB<_MatrixType, _Preconditioner>, Rhs>
-{
-  typedef BiCGSTAB<_MatrixType, _Preconditioner> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
+    m_info = (!ret) ? NumericalIssue : m_error <= Base::m_tolerance ? Success : NoConvergence;
   }
-};
 
-} // end namespace internal
+ protected:
+};
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_BICGSTAB_H
+#endif  // EIGEN_BICGSTAB_H
diff --git a/inst/include/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h b/inst/include/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
index 1a7e569c..5bb0efe8 100644
--- a/inst/include/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
+++ b/inst/include/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,247 +10,208 @@
 #ifndef EIGEN_CONJUGATE_GRADIENT_H
 #define EIGEN_CONJUGATE_GRADIENT_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 /** \internal Low-level conjugate gradient algorithm
-  * \param mat The matrix A
-  * \param rhs The right hand side vector b
-  * \param x On input and initial solution, on output the computed solution.
-  * \param precond A preconditioner being able to efficiently solve for an
-  *                approximation of Ax=b (regardless of b)
-  * \param iters On input the max number of iteration, on output the number of performed iterations.
-  * \param tol_error On input the tolerance error, on output an estimation of the relative error.
-  */
-template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
-EIGEN_DONT_INLINE
-void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
-                        const Preconditioner& precond, int& iters,
-                        typename Dest::RealScalar& tol_error)
-{
-  using std::sqrt;
-  using std::abs;
+ * \param mat The matrix A
+ * \param rhs The right hand side vector b
+ * \param x On input and initial solution, on output the computed solution.
+ * \param precond A preconditioner being able to efficiently solve for an
+ *                approximation of Ax=b (regardless of b)
+ * \param iters On input the max number of iteration, on output the number of performed iterations.
+ * \param tol_error On input the tolerance error, on output an estimation of the relative error.
+ */
+template <typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+EIGEN_DONT_INLINE void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, const Preconditioner& precond,
+                                          Index& iters, typename Dest::RealScalar& tol_error) {
   typedef typename Dest::RealScalar RealScalar;
   typedef typename Dest::Scalar Scalar;
-  typedef Matrix<Scalar,Dynamic,1> VectorType;
-  
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+
   RealScalar tol = tol_error;
-  int maxIters = iters;
-  
-  int n = mat.cols();
+  Index maxIters = iters;
 
-  VectorType residual = rhs - mat * x; //initial residual
+  Index n = mat.cols();
+
+  VectorType residual = rhs - mat * x;  // initial residual
 
   RealScalar rhsNorm2 = rhs.squaredNorm();
-  if(rhsNorm2 == 0) 
-  {
+  if (rhsNorm2 == 0) {
     x.setZero();
     iters = 0;
     tol_error = 0;
     return;
   }
-  RealScalar threshold = tol*tol*rhsNorm2;
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+  RealScalar threshold = numext::maxi(RealScalar(tol * tol * rhsNorm2), considerAsZero);
   RealScalar residualNorm2 = residual.squaredNorm();
-  if (residualNorm2 < threshold)
-  {
+  if (residualNorm2 < threshold) {
     iters = 0;
-    tol_error = sqrt(residualNorm2 / rhsNorm2);
+    tol_error = numext::sqrt(residualNorm2 / rhsNorm2);
     return;
   }
-  
+
   VectorType p(n);
-  p = precond.solve(residual);      //initial search direction
+  p = precond.solve(residual);  // initial search direction
 
   VectorType z(n), tmp(n);
   RealScalar absNew = numext::real(residual.dot(p));  // the square of the absolute value of r scaled by invM
-  int i = 0;
-  while(i < maxIters)
-  {
-    tmp.noalias() = mat * p;              // the bottleneck of the algorithm
-
-    Scalar alpha = absNew / p.dot(tmp);   // the amount we travel on dir
-    x += alpha * p;                       // update solution
-    residual -= alpha * tmp;              // update residue
-    
+  Index i = 0;
+  while (i < maxIters) {
+    tmp.noalias() = mat * p;  // the bottleneck of the algorithm
+
+    Scalar alpha = absNew / p.dot(tmp);  // the amount we travel on dir
+    x += alpha * p;                      // update solution
+    residual -= alpha * tmp;             // update residual
+
     residualNorm2 = residual.squaredNorm();
-    if(residualNorm2 < threshold)
-      break;
-    
-    z = precond.solve(residual);          // approximately solve for "A z = residual"
+    if (residualNorm2 < threshold) break;
+
+    z = precond.solve(residual);  // approximately solve for "A z = residual"
 
     RealScalar absOld = absNew;
-    absNew = numext::real(residual.dot(z));     // update the absolute value of r
-    RealScalar beta = absNew / absOld;            // calculate the Gram-Schmidt value used to create the new search direction
-    p = z + beta * p;                             // update search direction
+    absNew = numext::real(residual.dot(z));  // update the absolute value of r
+    RealScalar beta = absNew / absOld;       // calculate the Gram-Schmidt value used to create the new search direction
+    p = z + beta * p;                        // update search direction
     i++;
   }
-  tol_error = sqrt(residualNorm2 / rhsNorm2);
+  tol_error = numext::sqrt(residualNorm2 / rhsNorm2);
   iters = i;
 }
 
-}
+}  // namespace internal
 
-template< typename _MatrixType, int _UpLo=Lower,
-          typename _Preconditioner = DiagonalPreconditioner<typename _MatrixType::Scalar> >
+template <typename MatrixType_, int UpLo_ = Lower,
+          typename Preconditioner_ = DiagonalPreconditioner<typename MatrixType_::Scalar> >
 class ConjugateGradient;
 
 namespace internal {
 
-template< typename _MatrixType, int _UpLo, typename _Preconditioner>
-struct traits<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> >
-{
-  typedef _MatrixType MatrixType;
-  typedef _Preconditioner Preconditioner;
+template <typename MatrixType_, int UpLo_, typename Preconditioner_>
+struct traits<ConjugateGradient<MatrixType_, UpLo_, Preconditioner_> > {
+  typedef MatrixType_ MatrixType;
+  typedef Preconditioner_ Preconditioner;
 };
 
-}
+}  // namespace internal
 
 /** \ingroup IterativeLinearSolvers_Module
-  * \brief A conjugate gradient solver for sparse self-adjoint problems
+  * \brief A conjugate gradient solver for sparse (or dense) self-adjoint problems
   *
-  * This class allows to solve for A.x = b sparse linear problems using a conjugate gradient algorithm.
-  * The sparse matrix A must be selfadjoint. The vectors x and b can be either dense or sparse.
+  * This class allows to solve for A.x = b linear problems using an iterative conjugate gradient algorithm.
+  * The matrix A must be selfadjoint. The matrix A and the vectors x and b can be either dense or sparse.
   *
-  * \tparam _MatrixType the type of the matrix A, can be a dense or a sparse matrix.
-  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower,
-  *               Upper, or Lower|Upper in which the full matrix entries will be considered. Default is Lower.
-  * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
+  * \tparam MatrixType_ the type of the matrix A, can be a dense or a sparse matrix.
+  * \tparam UpLo_ the triangular part that will be used for the computations. It can be Lower,
+  *               \c Upper, or \c Lower|Upper in which the full matrix entries will be considered.
+  *               Default is \c Lower, best performance is \c Lower|Upper.
+  * \tparam Preconditioner_ the type of the preconditioner. Default is DiagonalPreconditioner
+  *
+  * \implsparsesolverconcept
   *
   * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
   * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
   * and NumTraits<Scalar>::epsilon() for the tolerance.
-  * 
+  *
+  * The tolerance corresponds to the relative residual error: |Ax-b|/|b|
+  *
+  * \b Performance: Even though the default value of \c UpLo_ is \c Lower, significantly higher performance is
+  * achieved when using a complete matrix and \b Lower|Upper as the \a UpLo_ template parameter. Moreover, in this
+  * case multi-threading can be exploited if the user code is compiled with OpenMP enabled.
+  * See \ref TopicMultiThreading for details.
+  *
   * This class can be used as the direct solver classes. Here is a typical usage example:
-  * \code
-  * int n = 10000;
-  * VectorXd x(n), b(n);
-  * SparseMatrix<double> A(n,n);
-  * // fill A and b
-  * ConjugateGradient<SparseMatrix<double> > cg;
-  * cg.compute(A);
-  * x = cg.solve(b);
-  * std::cout << "#iterations:     " << cg.iterations() << std::endl;
-  * std::cout << "estimated error: " << cg.error()      << std::endl;
-  * // update b, and solve again
-  * x = cg.solve(b);
-  * \endcode
-  * 
+    \code
+    int n = 10000;
+    VectorXd x(n), b(n);
+    SparseMatrix<double> A(n,n);
+    // fill A and b
+    ConjugateGradient<SparseMatrix<double>, Lower|Upper> cg;
+    cg.compute(A);
+    x = cg.solve(b);
+    std::cout << "#iterations:     " << cg.iterations() << std::endl;
+    std::cout << "estimated error: " << cg.error()      << std::endl;
+    // update b, and solve again
+    x = cg.solve(b);
+    \endcode
+  *
   * By default the iterations start with x=0 as an initial guess of the solution.
   * One can control the start using the solveWithGuess() method.
-  * 
-  * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+  *
+  * ConjugateGradient can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example
+  \endlink.
+  *
+  * \sa class LeastSquaresConjugateGradient, class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
   */
-template< typename _MatrixType, int _UpLo, typename _Preconditioner>
-class ConjugateGradient : public IterativeSolverBase<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> >
-{
+template <typename MatrixType_, int UpLo_, typename Preconditioner_>
+class ConjugateGradient : public IterativeSolverBase<ConjugateGradient<MatrixType_, UpLo_, Preconditioner_> > {
   typedef IterativeSolverBase<ConjugateGradient> Base;
-  using Base::mp_matrix;
   using Base::m_error;
-  using Base::m_iterations;
   using Base::m_info;
   using Base::m_isInitialized;
-public:
-  typedef _MatrixType MatrixType;
+  using Base::m_iterations;
+  using Base::matrix;
+
+ public:
+  typedef MatrixType_ MatrixType;
   typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::RealScalar RealScalar;
-  typedef _Preconditioner Preconditioner;
-
-  enum {
-    UpLo = _UpLo
-  };
+  typedef Preconditioner_ Preconditioner;
 
-public:
+  enum { UpLo = UpLo_ };
 
+ public:
   /** Default constructor. */
   ConjugateGradient() : Base() {}
 
   /** Initialize the solver with matrix \a A for further \c Ax=b solving.
-    * 
-    * This constructor is a shortcut for the default constructor followed
-    * by a call to compute().
-    * 
-    * \warning this class stores a reference to the matrix A as well as some
-    * precomputed values that depend on it. Therefore, if \a A is changed
-    * this class becomes invalid. Call compute() to update it with the new
-    * matrix A, or modify a copy of A.
-    */
-  template<typename MatrixDerived>
+   *
+   * This constructor is a shortcut for the default constructor followed
+   * by a call to compute().
+   *
+   * \warning this class stores a reference to the matrix A as well as some
+   * precomputed values that depend on it. Therefore, if \a A is changed
+   * this class becomes invalid. Call compute() to update it with the new
+   * matrix A, or modify a copy of A.
+   */
+  template <typename MatrixDerived>
   explicit ConjugateGradient(const EigenBase<MatrixDerived>& A) : Base(A.derived()) {}
 
   ~ConjugateGradient() {}
-  
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
-    * \a x0 as an initial solution.
-    *
-    * \sa compute()
-    */
-  template<typename Rhs,typename Guess>
-  inline const internal::solve_retval_with_guess<ConjugateGradient, Rhs, Guess>
-  solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
-  {
-    eigen_assert(m_isInitialized && "ConjugateGradient is not initialized.");
-    eigen_assert(Base::rows()==b.rows()
-              && "ConjugateGradient::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::solve_retval_with_guess
-            <ConjugateGradient, Rhs, Guess>(*this, b.derived(), x0);
-  }
 
   /** \internal */
-  template<typename Rhs,typename Dest>
-  void _solveWithGuess(const Rhs& b, Dest& x) const
-  {
-    typedef typename internal::conditional<UpLo==(Lower|Upper),
-                                           const MatrixType&,
-                                           SparseSelfAdjointView<const MatrixType, UpLo>
-                                          >::type MatrixWrapperType;
+  template <typename Rhs, typename Dest>
+  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const {
+    typedef typename Base::MatrixWrapper MatrixWrapper;
+    typedef typename Base::ActualMatrixType ActualMatrixType;
+    enum {
+      TransposeInput = (!MatrixWrapper::MatrixFree) && (UpLo == (Lower | Upper)) && (!MatrixType::IsRowMajor) &&
+                       (!NumTraits<Scalar>::IsComplex)
+    };
+    typedef std::conditional_t<TransposeInput, Transpose<const ActualMatrixType>, ActualMatrixType const&>
+        RowMajorWrapper;
+    EIGEN_STATIC_ASSERT(internal::check_implication(MatrixWrapper::MatrixFree, UpLo == (Lower | Upper)),
+                        MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY);
+    typedef std::conditional_t<UpLo == (Lower | Upper), RowMajorWrapper,
+                               typename MatrixWrapper::template ConstSelfAdjointViewReturnType<UpLo>::Type>
+        SelfAdjointWrapper;
+
     m_iterations = Base::maxIterations();
     m_error = Base::m_tolerance;
 
-    for(int j=0; j<b.cols(); ++j)
-    {
-      m_iterations = Base::maxIterations();
-      m_error = Base::m_tolerance;
-
-      typename Dest::ColXpr xj(x,j);
-      internal::conjugate_gradient(MatrixWrapperType(*mp_matrix), b.col(j), xj, Base::m_preconditioner, m_iterations, m_error);
-    }
-
-    m_isInitialized = true;
+    RowMajorWrapper row_mat(matrix());
+    internal::conjugate_gradient(SelfAdjointWrapper(row_mat), b, x, Base::m_preconditioner, m_iterations, m_error);
     m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;
   }
-  
-  /** \internal */
-  template<typename Rhs,typename Dest>
-  void _solve(const Rhs& b, Dest& x) const
-  {
-    x.setZero();
-    _solveWithGuess(b,x);
-  }
-
-protected:
 
+ protected:
 };
 
+}  // end namespace Eigen
 
-namespace internal {
-
-template<typename _MatrixType, int _UpLo, typename _Preconditioner, typename Rhs>
-struct solve_retval<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner>, Rhs>
-  : solve_retval_base<ConjugateGradient<_MatrixType,_UpLo,_Preconditioner>, Rhs>
-{
-  typedef ConjugateGradient<_MatrixType,_UpLo,_Preconditioner> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_CONJUGATE_GRADIENT_H
+#endif  // EIGEN_CONJUGATE_GRADIENT_H
diff --git a/inst/include/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h b/inst/include/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
new file mode 100644
index 00000000..dd40058a
--- /dev/null
+++ b/inst/include/Eigen/src/IterativeLinearSolvers/IncompleteCholesky.h
@@ -0,0 +1,402 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INCOMPLETE_CHOlESKY_H
+#define EIGEN_INCOMPLETE_CHOlESKY_H
+
+#include <vector>
+#include <list>
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+/**
+ * \brief Modified Incomplete Cholesky with dual threshold
+ *
+ * References : C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with
+ *              Limited memory, SIAM J. Sci. Comput.  21(1), pp. 24-45, 1999
+ *
+ * \tparam Scalar the scalar type of the input matrices
+ * \tparam UpLo_ The triangular part that will be used for the computations. It can be Lower
+ *               or Upper. Default is Lower.
+ * \tparam OrderingType_ The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is
+ * AMDOrdering<int>.
+ *
+ * \implsparsesolverconcept
+ *
+ * It performs the following incomplete factorization: \f$ S P A P' S + \sigma I \approx L L' \f$
+ * where L is a lower triangular factor, S is a diagonal scaling matrix, P is a
+ * fill-in reducing permutation as computed by the ordering method, and \f$ \sigma \f$ is a shift
+ * for ensuring the decomposed matrix is positive definite.
+ *
+ * \b Shifting \b strategy: Let \f$ B = S P A P' S \f$  be the scaled matrix on which the factorization is carried out,
+ * and \f$ \beta \f$ be the minimum value of the diagonal. If \f$ \beta > 0 \f$ then, the factorization is directly
+ * performed on the matrix B, and \f$ \sigma = 0 \f$. Otherwise, the factorization is performed on the shifted matrix
+ * \f$ B + \sigma I \f$ for a shifting factor  \f$ \sigma \f$.  We start with \f$ \sigma = \sigma_0 - \beta \f$, where
+ * \f$ \sigma_0 \f$ is the initial shift value as returned and set by setInitialShift() method. The default value is \f$
+ * \sigma_0 = 10^{-3} \f$. If the factorization fails, then the shift in doubled until it succeed or a maximum of ten
+ * attempts. If it still fails, as returned by the info() method, then you can either increase the initial shift, or
+ * better use another preconditioning technique.
+ *
+ */
+template <typename Scalar, int UpLo_ = Lower, typename OrderingType_ = AMDOrdering<int> >
+class IncompleteCholesky : public SparseSolverBase<IncompleteCholesky<Scalar, UpLo_, OrderingType_> > {
+ protected:
+  typedef SparseSolverBase<IncompleteCholesky<Scalar, UpLo_, OrderingType_> > Base;
+  using Base::m_isInitialized;
+
+ public:
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef OrderingType_ OrderingType;
+  typedef typename OrderingType::PermutationType PermutationType;
+  typedef typename PermutationType::StorageIndex StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex> FactorType;
+  typedef Matrix<Scalar, Dynamic, 1> VectorSx;
+  typedef Matrix<RealScalar, Dynamic, 1> VectorRx;
+  typedef Matrix<StorageIndex, Dynamic, 1> VectorIx;
+  typedef std::vector<std::list<StorageIndex> > VectorList;
+  enum { UpLo = UpLo_ };
+  enum { ColsAtCompileTime = Dynamic, MaxColsAtCompileTime = Dynamic };
+
+ public:
+  /** Default constructor leaving the object in a partly non-initialized stage.
+   *
+   * You must call compute() or the pair analyzePattern()/factorize() to make it valid.
+   *
+   * \sa IncompleteCholesky(const MatrixType&)
+   */
+  IncompleteCholesky() : m_initialShift(1e-3), m_analysisIsOk(false), m_factorizationIsOk(false) {}
+
+  /** Constructor computing the incomplete factorization for the given matrix \a matrix.
+   */
+  template <typename MatrixType>
+  IncompleteCholesky(const MatrixType& matrix)
+      : m_initialShift(1e-3), m_analysisIsOk(false), m_factorizationIsOk(false) {
+    compute(matrix);
+  }
+
+  /** \returns number of rows of the factored matrix */
+  constexpr Index rows() const noexcept { return m_L.rows(); }
+
+  /** \returns number of columns of the factored matrix */
+  constexpr Index cols() const noexcept { return m_L.cols(); }
+
+  /** \brief Reports whether previous computation was successful.
+   *
+   * It triggers an assertion if \c *this has not been initialized through the respective constructor,
+   * or a call to compute() or analyzePattern().
+   *
+   * \returns \c Success if computation was successful,
+   *          \c NumericalIssue if the matrix appears to be negative.
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "IncompleteCholesky is not initialized.");
+    return m_info;
+  }
+
+  /** \brief Set the initial shift parameter \f$ \sigma \f$.
+   */
+  void setInitialShift(RealScalar shift) { m_initialShift = shift; }
+
+  /** \brief Computes the fill reducing permutation vector using the sparsity pattern of \a mat
+   */
+  template <typename MatrixType>
+  void analyzePattern(const MatrixType& mat) {
+    OrderingType ord;
+    PermutationType pinv;
+    ord(mat.template selfadjointView<UpLo>(), pinv);
+    if (pinv.size() > 0)
+      m_perm = pinv.inverse();
+    else
+      m_perm.resize(0);
+    m_L.resize(mat.rows(), mat.cols());
+    m_analysisIsOk = true;
+    m_isInitialized = true;
+    m_info = Success;
+  }
+
+  /** \brief Performs the numerical factorization of the input matrix \a mat
+   *
+   * The method analyzePattern() or compute() must have been called beforehand
+   * with a matrix having the same pattern.
+   *
+   * \sa compute(), analyzePattern()
+   */
+  template <typename MatrixType>
+  void factorize(const MatrixType& mat);
+
+  /** Computes or re-computes the incomplete Cholesky factorization of the input matrix \a mat
+   *
+   * It is a shortcut for a sequential call to the analyzePattern() and factorize() methods.
+   *
+   * \sa analyzePattern(), factorize()
+   */
+  template <typename MatrixType>
+  void compute(const MatrixType& mat) {
+    analyzePattern(mat);
+    factorize(mat);
+  }
+
+  // internal
+  template <typename Rhs, typename Dest>
+  void _solve_impl(const Rhs& b, Dest& x) const {
+    eigen_assert(m_factorizationIsOk && "factorize() should be called first");
+    if (m_perm.rows() == b.rows())
+      x = m_perm * b;
+    else
+      x = b;
+    x = m_scale.asDiagonal() * x;
+    x = m_L.template triangularView<Lower>().solve(x);
+    x = m_L.adjoint().template triangularView<Upper>().solve(x);
+    x = m_scale.asDiagonal() * x;
+    if (m_perm.rows() == b.rows()) x = m_perm.inverse() * x;
+  }
+
+  /** \returns the sparse lower triangular factor L */
+  const FactorType& matrixL() const {
+    eigen_assert(m_factorizationIsOk && "factorize() should be called first");
+    return m_L;
+  }
+
+  /** \returns a vector representing the scaling factor S */
+  const VectorRx& scalingS() const {
+    eigen_assert(m_factorizationIsOk && "factorize() should be called first");
+    return m_scale;
+  }
+
+  /** \returns the fill-in reducing permutation P (can be empty for a natural ordering) */
+  const PermutationType& permutationP() const {
+    eigen_assert(m_analysisIsOk && "analyzePattern() should be called first");
+    return m_perm;
+  }
+
+  /** \returns the final shift parameter from the computation */
+  RealScalar shift() const { return m_shift; }
+
+ protected:
+  FactorType m_L;             // The lower part stored in CSC
+  VectorRx m_scale;           // The vector for scaling the matrix
+  RealScalar m_initialShift;  // The initial shift parameter
+  bool m_analysisIsOk;
+  bool m_factorizationIsOk;
+  ComputationInfo m_info;
+  PermutationType m_perm;
+  RealScalar m_shift;  // The final shift parameter.
+
+ private:
+  inline void updateList(Ref<const VectorIx> colPtr, Ref<VectorIx> rowIdx, Ref<VectorSx> vals, const Index& col,
+                         const Index& jk, VectorIx& firstElt, VectorList& listCol);
+};
+
+// Based on the following paper:
+//   C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with
+//   Limited memory, SIAM J. Sci. Comput.  21(1), pp. 24-45, 1999
+//   http://ftp.mcs.anl.gov/pub/tech_reports/reports/P682.pdf
+template <typename Scalar, int UpLo_, typename OrderingType>
+template <typename MatrixType_>
+void IncompleteCholesky<Scalar, UpLo_, OrderingType>::factorize(const MatrixType_& mat) {
+  using std::sqrt;
+  eigen_assert(m_analysisIsOk && "analyzePattern() should be called first");
+
+  // Dropping strategy : Keep only the p largest elements per column, where p is the number of elements in the column of
+  // the original matrix. Other strategies will be added
+
+  // Apply the fill-reducing permutation computed in analyzePattern()
+  if (m_perm.rows() == mat.rows())  // To detect the null permutation
+  {
+    // The temporary is needed to make sure that the diagonal entry is properly sorted
+    FactorType tmp(mat.rows(), mat.cols());
+    tmp = mat.template selfadjointView<UpLo_>().twistedBy(m_perm);
+    m_L.template selfadjointView<Lower>() = tmp.template selfadjointView<Lower>();
+  } else {
+    m_L.template selfadjointView<Lower>() = mat.template selfadjointView<UpLo_>();
+  }
+
+  // The algorithm will insert increasingly large shifts on the diagonal until
+  // factorization succeeds. Therefore we have to make sure that there is a
+  // space in the datastructure to store such values, even if the original
+  // matrix has a zero on the diagonal.
+  bool modified = false;
+  for (Index i = 0; i < mat.cols(); ++i) {
+    bool inserted = false;
+    m_L.findOrInsertCoeff(i, i, &inserted);
+    if (inserted) {
+      modified = true;
+    }
+  }
+  if (modified) m_L.makeCompressed();
+
+  Index n = m_L.cols();
+  Index nnz = m_L.nonZeros();
+  Map<VectorSx> vals(m_L.valuePtr(), nnz);           // values
+  Map<VectorIx> rowIdx(m_L.innerIndexPtr(), nnz);    // Row indices
+  Map<VectorIx> colPtr(m_L.outerIndexPtr(), n + 1);  // Pointer to the beginning of each row
+  VectorIx firstElt(n - 1);  // for each j, points to the next entry in vals that will be used in the factorization
+  VectorList listCol(n);     // listCol(j) is a linked list of columns to update column j
+  VectorSx col_vals(n);      // Store a  nonzero values in each column
+  VectorIx col_irow(n);      // Row indices of nonzero elements in each column
+  VectorIx col_pattern(n);
+  col_pattern.fill(-1);
+  StorageIndex col_nnz;
+
+  // Computes the scaling factors
+  m_scale.resize(n);
+  m_scale.setZero();
+  for (Index j = 0; j < n; j++)
+    for (Index k = colPtr[j]; k < colPtr[j + 1]; k++) {
+      m_scale(j) += numext::abs2(vals(k));
+      if (rowIdx[k] != j) m_scale(rowIdx[k]) += numext::abs2(vals(k));
+    }
+
+  m_scale = m_scale.cwiseSqrt().cwiseSqrt();
+
+  for (Index j = 0; j < n; ++j)
+    if (m_scale(j) > (std::numeric_limits<RealScalar>::min)())
+      m_scale(j) = RealScalar(1) / m_scale(j);
+    else
+      m_scale(j) = 1;
+
+  // TODO disable scaling if not needed, i.e., if it is roughly uniform? (this will make solve() faster)
+
+  // Scale and compute the shift for the matrix
+  RealScalar mindiag = NumTraits<RealScalar>::highest();
+  for (Index j = 0; j < n; j++) {
+    for (Index k = colPtr[j]; k < colPtr[j + 1]; k++) vals[k] *= (m_scale(j) * m_scale(rowIdx[k]));
+    eigen_internal_assert(rowIdx[colPtr[j]] == j &&
+                          "IncompleteCholesky: only the lower triangular part must be stored");
+    mindiag = numext::mini(numext::real(vals[colPtr[j]]), mindiag);
+  }
+
+  FactorType L_save = m_L;
+
+  m_shift = RealScalar(0);
+  if (mindiag <= RealScalar(0.)) m_shift = m_initialShift - mindiag;
+
+  m_info = NumericalIssue;
+
+  // Try to perform the incomplete factorization using the current shift
+  int iter = 0;
+  do {
+    // Apply the shift to the diagonal elements of the matrix
+    for (Index j = 0; j < n; j++) vals[colPtr[j]] += m_shift;
+
+    // jki version of the Cholesky factorization
+    Index j = 0;
+    for (; j < n; ++j) {
+      // Left-looking factorization of the j-th column
+      // First, load the j-th column into col_vals
+      Scalar diag = vals[colPtr[j]];  // It is assumed that only the lower part is stored
+      col_nnz = 0;
+      for (Index i = colPtr[j] + 1; i < colPtr[j + 1]; i++) {
+        StorageIndex l = rowIdx[i];
+        col_vals(col_nnz) = vals[i];
+        col_irow(col_nnz) = l;
+        col_pattern(l) = col_nnz;
+        col_nnz++;
+      }
+      {
+        typename std::list<StorageIndex>::iterator k;
+        // Browse all previous columns that will update column j
+        for (k = listCol[j].begin(); k != listCol[j].end(); k++) {
+          Index jk = firstElt(*k);  // First element to use in the column
+          eigen_internal_assert(rowIdx[jk] == j);
+          Scalar v_j_jk = numext::conj(vals[jk]);
+
+          jk += 1;
+          for (Index i = jk; i < colPtr[*k + 1]; i++) {
+            StorageIndex l = rowIdx[i];
+            if (col_pattern[l] < 0) {
+              col_vals(col_nnz) = vals[i] * v_j_jk;
+              col_irow[col_nnz] = l;
+              col_pattern(l) = col_nnz;
+              col_nnz++;
+            } else
+              col_vals(col_pattern[l]) -= vals[i] * v_j_jk;
+          }
+          updateList(colPtr, rowIdx, vals, *k, jk, firstElt, listCol);
+        }
+      }
+
+      // Scale the current column
+      if (numext::real(diag) <= 0) {
+        if (++iter >= 10) return;
+
+        // increase shift
+        m_shift = numext::maxi(m_initialShift, RealScalar(2) * m_shift);
+        // restore m_L, col_pattern, and listCol
+        vals = Map<const VectorSx>(L_save.valuePtr(), nnz);
+        rowIdx = Map<const VectorIx>(L_save.innerIndexPtr(), nnz);
+        colPtr = Map<const VectorIx>(L_save.outerIndexPtr(), n + 1);
+        col_pattern.fill(-1);
+        for (Index i = 0; i < n; ++i) listCol[i].clear();
+
+        break;
+      }
+
+      RealScalar rdiag = sqrt(numext::real(diag));
+      vals[colPtr[j]] = rdiag;
+      for (Index k = 0; k < col_nnz; ++k) {
+        Index i = col_irow[k];
+        // Scale
+        col_vals(k) /= rdiag;
+        // Update the remaining diagonals with col_vals
+        vals[colPtr[i]] -= numext::abs2(col_vals(k));
+      }
+      // Select the largest p elements
+      // p is the original number of elements in the column (without the diagonal)
+      Index p = colPtr[j + 1] - colPtr[j] - 1;
+      Ref<VectorSx> cvals = col_vals.head(col_nnz);
+      Ref<VectorIx> cirow = col_irow.head(col_nnz);
+      internal::QuickSplit(cvals, cirow, p);
+      // Insert the largest p elements in the matrix
+      Index cpt = 0;
+      for (Index i = colPtr[j] + 1; i < colPtr[j + 1]; i++) {
+        vals[i] = col_vals(cpt);
+        rowIdx[i] = col_irow(cpt);
+        // restore col_pattern:
+        col_pattern(col_irow(cpt)) = -1;
+        cpt++;
+      }
+      // Get the first smallest row index and put it after the diagonal element
+      Index jk = colPtr(j) + 1;
+      updateList(colPtr, rowIdx, vals, j, jk, firstElt, listCol);
+    }
+
+    if (j == n) {
+      m_factorizationIsOk = true;
+      m_info = Success;
+    }
+  } while (m_info != Success);
+}
+
+template <typename Scalar, int UpLo_, typename OrderingType>
+inline void IncompleteCholesky<Scalar, UpLo_, OrderingType>::updateList(Ref<const VectorIx> colPtr,
+                                                                        Ref<VectorIx> rowIdx, Ref<VectorSx> vals,
+                                                                        const Index& col, const Index& jk,
+                                                                        VectorIx& firstElt, VectorList& listCol) {
+  if (jk < colPtr(col + 1)) {
+    Index p = colPtr(col + 1) - jk;
+    Index minpos;
+    rowIdx.segment(jk, p).minCoeff(&minpos);
+    minpos += jk;
+    if (rowIdx(minpos) != rowIdx(jk)) {
+      // Swap
+      std::swap(rowIdx(jk), rowIdx(minpos));
+      std::swap(vals(jk), vals(minpos));
+    }
+    firstElt(col) = internal::convert_index<StorageIndex, Index>(jk);
+    listCol[rowIdx(jk)].push_back(internal::convert_index<StorageIndex, Index>(col));
+  }
+}
+
+}  // end namespace Eigen
+
+#endif
diff --git a/inst/include/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h b/inst/include/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
index d3f37fea..11ce5e5a 100644
--- a/inst/include/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
+++ b/inst/include/Eigen/src/IterativeLinearSolvers/IncompleteLUT.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,40 +11,41 @@
 #ifndef EIGEN_INCOMPLETE_LUT_H
 #define EIGEN_INCOMPLETE_LUT_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-namespace Eigen { 
+namespace Eigen {
 
 namespace internal {
-    
+
 /** \internal
-  * Compute a quick-sort split of a vector 
-  * On output, the vector row is permuted such that its elements satisfy
-  * abs(row(i)) >= abs(row(ncut)) if i<ncut
-  * abs(row(i)) <= abs(row(ncut)) if i>ncut 
-  * \param row The vector of values
-  * \param ind The array of index for the elements in @p row
-  * \param ncut  The number of largest elements to keep
-  **/ 
-template <typename VectorV, typename VectorI, typename Index>
-Index QuickSplit(VectorV &row, VectorI &ind, Index ncut)
-{
+ * Compute a quick-sort split of a vector
+ * On output, the vector row is permuted such that its elements satisfy
+ * abs(row(i)) >= abs(row(ncut)) if i<ncut
+ * abs(row(i)) <= abs(row(ncut)) if i>ncut
+ * \param row The vector of values
+ * \param ind The array of index for the elements in @p row
+ * \param ncut  The number of largest elements to keep
+ **/
+template <typename VectorV, typename VectorI>
+Index QuickSplit(VectorV& row, VectorI& ind, Index ncut) {
   typedef typename VectorV::RealScalar RealScalar;
-  using std::swap;
   using std::abs;
+  using std::swap;
   Index mid;
   Index n = row.size(); /* length of the vector */
-  Index first, last ;
-  
+  Index first, last;
+
   ncut--; /* to fit the zero-based indices */
-  first = 0; 
-  last = n-1; 
-  if (ncut < first || ncut > last ) return 0;
-  
+  first = 0;
+  last = n - 1;
+  if (ncut < first || ncut > last) return 0;
+
   do {
-    mid = first; 
-    RealScalar abskey = abs(row(mid)); 
+    mid = first;
+    RealScalar abskey = abs(row(mid));
     for (Index j = first + 1; j <= last; j++) {
-      if ( abs(row(j)) > abskey) {
+      if (abs(row(j)) > abskey) {
         ++mid;
         swap(row(mid), row(j));
         swap(ind(mid), ind(j));
@@ -52,220 +54,226 @@ Index QuickSplit(VectorV &row, VectorI &ind, Index ncut)
     /* Interchange for the pivot element */
     swap(row(mid), row(first));
     swap(ind(mid), ind(first));
-    
-    if (mid > ncut) last = mid - 1;
-    else if (mid < ncut ) first = mid + 1; 
-  } while (mid != ncut );
-  
-  return 0; /* mid is equal to ncut */ 
+
+    if (mid > ncut)
+      last = mid - 1;
+    else if (mid < ncut)
+      first = mid + 1;
+  } while (mid != ncut);
+
+  return 0; /* mid is equal to ncut */
 }
 
-}// end namespace internal
+}  // end namespace internal
 
 /** \ingroup IterativeLinearSolvers_Module
-  * \class IncompleteLUT
-  * \brief Incomplete LU factorization with dual-threshold strategy
-  *
-  * During the numerical factorization, two dropping rules are used :
-  *  1) any element whose magnitude is less than some tolerance is dropped.
-  *    This tolerance is obtained by multiplying the input tolerance @p droptol 
-  *    by the average magnitude of all the original elements in the current row.
-  *  2) After the elimination of the row, only the @p fill largest elements in 
-  *    the L part and the @p fill largest elements in the U part are kept 
-  *    (in addition to the diagonal element ). Note that @p fill is computed from 
-  *    the input parameter @p fillfactor which is used the ratio to control the fill_in 
-  *    relatively to the initial number of nonzero elements.
-  * 
-  * The two extreme cases are when @p droptol=0 (to keep all the @p fill*2 largest elements)
-  * and when @p fill=n/2 with @p droptol being different to zero. 
-  * 
-  * References : Yousef Saad, ILUT: A dual threshold incomplete LU factorization, 
-  *              Numerical Linear Algebra with Applications, 1(4), pp 387-402, 1994.
-  * 
-  * NOTE : The following implementation is derived from the ILUT implementation
-  * in the SPARSKIT package, Copyright (C) 2005, the Regents of the University of Minnesota 
-  *  released under the terms of the GNU LGPL: 
-  *    http://www-users.cs.umn.edu/~saad/software/SPARSKIT/README
-  * However, Yousef Saad gave us permission to relicense his ILUT code to MPL2.
-  * See the Eigen mailing list archive, thread: ILUT, date: July 8, 2012:
-  *   http://listengine.tuxfamily.org/lists.tuxfamily.org/eigen/2012/07/msg00064.html
-  * alternatively, on GMANE:
-  *   http://comments.gmane.org/gmane.comp.lib.eigen/3302
-  */
-template <typename _Scalar>
-class IncompleteLUT : internal::noncopyable
-{
-    typedef _Scalar Scalar;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef Matrix<Scalar,Dynamic,1> Vector;
-    typedef SparseMatrix<Scalar,RowMajor> FactorType;
-    typedef SparseMatrix<Scalar,ColMajor> PermutType;
-    typedef typename FactorType::Index Index;
-
-  public:
-    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
-    
-    IncompleteLUT()
-      : m_droptol(NumTraits<Scalar>::dummy_precision()), m_fillfactor(10),
-        m_analysisIsOk(false), m_factorizationIsOk(false), m_isInitialized(false)
-    {}
-    
-    template<typename MatrixType>
-    IncompleteLUT(const MatrixType& mat, const RealScalar& droptol=NumTraits<Scalar>::dummy_precision(), int fillfactor = 10)
-      : m_droptol(droptol),m_fillfactor(fillfactor),
-        m_analysisIsOk(false),m_factorizationIsOk(false),m_isInitialized(false)
-    {
-      eigen_assert(fillfactor != 0);
-      compute(mat); 
-    }
-    
-    Index rows() const { return m_lu.rows(); }
-    
-    Index cols() const { return m_lu.cols(); }
-
-    /** \brief Reports whether previous computation was successful.
-      *
-      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the matrix.appears to be negative.
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "IncompleteLUT is not initialized.");
-      return m_info;
-    }
-    
-    template<typename MatrixType>
-    void analyzePattern(const MatrixType& amat);
-    
-    template<typename MatrixType>
-    void factorize(const MatrixType& amat);
-    
-    /**
-      * Compute an incomplete LU factorization with dual threshold on the matrix mat
-      * No pivoting is done in this version
-      * 
-      **/
-    template<typename MatrixType>
-    IncompleteLUT<Scalar>& compute(const MatrixType& amat)
-    {
-      analyzePattern(amat); 
-      factorize(amat);
-      return *this;
-    }
+ * \class IncompleteLUT
+ * \brief Incomplete LU factorization with dual-threshold strategy
+ *
+ * \implsparsesolverconcept
+ *
+ * During the numerical factorization, two dropping rules are used :
+ *  1) any element whose magnitude is less than some tolerance is dropped.
+ *    This tolerance is obtained by multiplying the input tolerance @p droptol
+ *    by the average magnitude of all the original elements in the current row.
+ *  2) After the elimination of the row, only the @p fill largest elements in
+ *    the L part and the @p fill largest elements in the U part are kept
+ *    (in addition to the diagonal element ). Note that @p fill is computed from
+ *    the input parameter @p fillfactor which is used the ratio to control the fill_in
+ *    relatively to the initial number of nonzero elements.
+ *
+ * The two extreme cases are when @p droptol=0 (to keep all the @p fill*2 largest elements)
+ * and when @p fill=n/2 with @p droptol being different to zero.
+ *
+ * References : Yousef Saad, ILUT: A dual threshold incomplete LU factorization,
+ *              Numerical Linear Algebra with Applications, 1(4), pp 387-402, 1994.
+ *
+ * NOTE : The following implementation is derived from the ILUT implementation
+ * in the SPARSKIT package, Copyright (C) 2005, the Regents of the University of Minnesota
+ *  released under the terms of the GNU LGPL:
+ *    http://www-users.cs.umn.edu/~saad/software/SPARSKIT/README
+ * However, Yousef Saad gave us permission to relicense his ILUT code to MPL2.
+ * See the Eigen mailing list archive, thread: ILUT, date: July 8, 2012:
+ *   http://listengine.tuxfamily.org/lists.tuxfamily.org/eigen/2012/07/msg00064.html
+ * alternatively, on GMANE:
+ *   http://comments.gmane.org/gmane.comp.lib.eigen/3302
+ */
+template <typename Scalar_, typename StorageIndex_ = int>
+class IncompleteLUT : public SparseSolverBase<IncompleteLUT<Scalar_, StorageIndex_> > {
+ protected:
+  typedef SparseSolverBase<IncompleteLUT> Base;
+  using Base::m_isInitialized;
+
+ public:
+  typedef Scalar_ Scalar;
+  typedef StorageIndex_ StorageIndex;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<Scalar, Dynamic, 1> Vector;
+  typedef Matrix<StorageIndex, Dynamic, 1> VectorI;
+  typedef SparseMatrix<Scalar, RowMajor, StorageIndex> FactorType;
+
+  enum { ColsAtCompileTime = Dynamic, MaxColsAtCompileTime = Dynamic };
+
+ public:
+  IncompleteLUT()
+      : m_droptol(NumTraits<Scalar>::dummy_precision()),
+        m_fillfactor(10),
+        m_analysisIsOk(false),
+        m_factorizationIsOk(false) {}
+
+  template <typename MatrixType>
+  explicit IncompleteLUT(const MatrixType& mat, const RealScalar& droptol = NumTraits<Scalar>::dummy_precision(),
+                         int fillfactor = 10)
+      : m_droptol(droptol), m_fillfactor(fillfactor), m_analysisIsOk(false), m_factorizationIsOk(false) {
+    eigen_assert(fillfactor != 0);
+    compute(mat);
+  }
 
-    void setDroptol(const RealScalar& droptol); 
-    void setFillfactor(int fillfactor); 
-    
-    template<typename Rhs, typename Dest>
-    void _solve(const Rhs& b, Dest& x) const
-    {
-      x = m_Pinv * b;
-      x = m_lu.template triangularView<UnitLower>().solve(x);
-      x = m_lu.template triangularView<Upper>().solve(x);
-      x = m_P * x; 
-    }
+  /** \brief Extraction Method for L-Factor */
+  const FactorType matrixL() const;
 
-    template<typename Rhs> inline const internal::solve_retval<IncompleteLUT, Rhs>
-     solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "IncompleteLUT is not initialized.");
-      eigen_assert(cols()==b.rows()
-                && "IncompleteLUT::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<IncompleteLUT, Rhs>(*this, b.derived());
-    }
+  /** \brief Extraction Method for U-Factor */
+  const FactorType matrixU() const;
 
-protected:
+  constexpr Index rows() const noexcept { return m_lu.rows(); }
 
-    /** keeps off-diagonal entries; drops diagonal entries */
-    struct keep_diag {
-      inline bool operator() (const Index& row, const Index& col, const Scalar&) const
-      {
-        return row!=col;
-      }
-    };
-
-protected:
-
-    FactorType m_lu;
-    RealScalar m_droptol;
-    int m_fillfactor;
-    bool m_analysisIsOk;
-    bool m_factorizationIsOk;
-    bool m_isInitialized;
-    ComputationInfo m_info;
-    PermutationMatrix<Dynamic,Dynamic,Index> m_P;     // Fill-reducing permutation
-    PermutationMatrix<Dynamic,Dynamic,Index> m_Pinv;  // Inverse permutation
+  constexpr Index cols() const noexcept { return m_lu.cols(); }
+
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful,
+   *          \c NumericalIssue if the matrix.appears to be negative.
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "IncompleteLUT is not initialized.");
+    return m_info;
+  }
+
+  template <typename MatrixType>
+  void analyzePattern(const MatrixType& amat);
+
+  template <typename MatrixType>
+  void factorize(const MatrixType& amat);
+
+  /**
+   * Compute an incomplete LU factorization with dual threshold on the matrix mat
+   * No pivoting is done in this version
+   *
+   **/
+  template <typename MatrixType>
+  IncompleteLUT& compute(const MatrixType& amat) {
+    analyzePattern(amat);
+    factorize(amat);
+    return *this;
+  }
+
+  void setDroptol(const RealScalar& droptol);
+  void setFillfactor(int fillfactor);
+
+  template <typename Rhs, typename Dest>
+  void _solve_impl(const Rhs& b, Dest& x) const {
+    x = m_Pinv * b;
+    x = m_lu.template triangularView<UnitLower>().solve(x);
+    x = m_lu.template triangularView<Upper>().solve(x);
+    x = m_P * x;
+  }
+
+ protected:
+  /** keeps off-diagonal entries; drops diagonal entries */
+  struct keep_diag {
+    inline bool operator()(const Index& row, const Index& col, const Scalar&) const { return row != col; }
+  };
+
+ protected:
+  FactorType m_lu;
+  RealScalar m_droptol;
+  int m_fillfactor;
+  bool m_analysisIsOk;
+  bool m_factorizationIsOk;
+  ComputationInfo m_info;
+  PermutationMatrix<Dynamic, Dynamic, StorageIndex> m_P;     // Fill-reducing permutation
+  PermutationMatrix<Dynamic, Dynamic, StorageIndex> m_Pinv;  // Inverse permutation
 };
 
 /**
  * Set control parameter droptol
- *  \param droptol   Drop any element whose magnitude is less than this tolerance 
- **/ 
-template<typename Scalar>
-void IncompleteLUT<Scalar>::setDroptol(const RealScalar& droptol)
-{
-  this->m_droptol = droptol;   
+ *  \param droptol   Drop any element whose magnitude is less than this tolerance
+ **/
+template <typename Scalar, typename StorageIndex>
+void IncompleteLUT<Scalar, StorageIndex>::setDroptol(const RealScalar& droptol) {
+  this->m_droptol = droptol;
 }
 
 /**
  * Set control parameter fillfactor
- * \param fillfactor  This is used to compute the  number @p fill_in of largest elements to keep on each row. 
- **/ 
-template<typename Scalar>
-void IncompleteLUT<Scalar>::setFillfactor(int fillfactor)
-{
-  this->m_fillfactor = fillfactor;   
+ * \param fillfactor  This is used to compute the  number @p fill_in of largest elements to keep on each row.
+ **/
+template <typename Scalar, typename StorageIndex>
+void IncompleteLUT<Scalar, StorageIndex>::setFillfactor(int fillfactor) {
+  this->m_fillfactor = fillfactor;
+}
+
+/**
+ * get L-Factor
+ * \return L-Factor is a matrix containing the lower triangular part of the sparse matrix. All elements of the matrix
+ * above the main diagonal are zero.
+ **/
+template <typename Scalar, typename StorageIndex>
+const typename IncompleteLUT<Scalar, StorageIndex>::FactorType IncompleteLUT<Scalar, StorageIndex>::matrixL() const {
+  eigen_assert(m_factorizationIsOk && "factorize() should be called first");
+  return m_lu.template triangularView<UnitLower>();
 }
 
-template <typename Scalar>
-template<typename _MatrixType>
-void IncompleteLUT<Scalar>::analyzePattern(const _MatrixType& amat)
-{
+/**
+ * get U-Factor
+ * \return L-Factor is a matrix containing the upper triangular part of the sparse matrix. All elements of the matrix
+ * below the main diagonal are zero.
+ **/
+template <typename Scalar, typename StorageIndex>
+const typename IncompleteLUT<Scalar, StorageIndex>::FactorType IncompleteLUT<Scalar, StorageIndex>::matrixU() const {
+  eigen_assert(m_factorizationIsOk && "Factorization must be computed first.");
+  return m_lu.template triangularView<Upper>();
+}
+
+template <typename Scalar, typename StorageIndex>
+template <typename MatrixType_>
+void IncompleteLUT<Scalar, StorageIndex>::analyzePattern(const MatrixType_& amat) {
   // Compute the Fill-reducing permutation
   // Since ILUT does not perform any numerical pivoting,
   // it is highly preferable to keep the diagonal through symmetric permutations.
-#ifndef EIGEN_MPL2_ONLY
   // To this end, let's symmetrize the pattern and perform AMD on it.
-  SparseMatrix<Scalar,ColMajor, Index> mat1 = amat;
-  SparseMatrix<Scalar,ColMajor, Index> mat2 = amat.transpose();
+  SparseMatrix<Scalar, ColMajor, StorageIndex> mat1 = amat;
+  SparseMatrix<Scalar, ColMajor, StorageIndex> mat2 = amat.transpose();
   // FIXME for a matrix with nearly symmetric pattern, mat2+mat1 is the appropriate choice.
-  //       on the other hand for a really non-symmetric pattern, mat2*mat1 should be prefered...
-  SparseMatrix<Scalar,ColMajor, Index> AtA = mat2 + mat1;
-  AMDOrdering<Index> ordering;
-  ordering(AtA,m_P);
-  m_Pinv  = m_P.inverse(); // cache the inverse permutation
-#else
-  // If AMD is not available, (MPL2-only), then let's use the slower COLAMD routine.
-  SparseMatrix<Scalar,ColMajor, Index> mat1 = amat;
-  COLAMDOrdering<Index> ordering;
-  ordering(mat1,m_Pinv);
-  m_P = m_Pinv.inverse();
-#endif
-
+  //       on the other hand for a really non-symmetric pattern, mat2*mat1 should be preferred...
+  SparseMatrix<Scalar, ColMajor, StorageIndex> AtA = mat2 + mat1;
+  AMDOrdering<StorageIndex> ordering;
+  ordering(AtA, m_P);
+  m_Pinv = m_P.inverse();  // cache the inverse permutation
   m_analysisIsOk = true;
   m_factorizationIsOk = false;
-  m_isInitialized = false;
+  m_isInitialized = true;
 }
 
-template <typename Scalar>
-template<typename _MatrixType>
-void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
-{
+template <typename Scalar, typename StorageIndex>
+template <typename MatrixType_>
+void IncompleteLUT<Scalar, StorageIndex>::factorize(const MatrixType_& amat) {
+  using internal::convert_index;
+  using std::abs;
   using std::sqrt;
   using std::swap;
-  using std::abs;
 
   eigen_assert((amat.rows() == amat.cols()) && "The factorization should be done on a square matrix");
   Index n = amat.cols();  // Size of the matrix
-  m_lu.resize(n,n);
+  m_lu.resize(n, n);
   // Declare Working vectors and variables
-  Vector u(n) ;     // real values of the row -- maximum size is n --
-  VectorXi ju(n);   // column position of the values in u -- maximum size  is n
-  VectorXi jr(n);   // Indicate the position of the nonzero elements in the vector u -- A zero location is indicated by -1
+  Vector u(n);    // real values of the row -- maximum size is n --
+  VectorI ju(n);  // column position of the values in u -- maximum size  is n
+  VectorI jr(n);  // Indicate the position of the nonzero elements in the vector u -- A zero location is indicated by -1
 
   // Apply the fill-reducing permutation
   eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
-  SparseMatrix<Scalar,RowMajor, Index> mat;
+  SparseMatrix<Scalar, RowMajor, StorageIndex> mat;
   mat = amat.twistedBy(m_Pinv);
 
   // Initialization
@@ -274,57 +282,49 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
   u.fill(0);
 
   // number of largest elements to keep in each row:
-  Index fill_in =   static_cast<Index> (amat.nonZeros()*m_fillfactor)/n+1;
+  Index fill_in = (amat.nonZeros() * m_fillfactor) / n + 1;
   if (fill_in > n) fill_in = n;
 
   // number of largest nonzero elements to keep in the L and the U part of the current row:
-  Index nnzL = fill_in/2;
+  Index nnzL = fill_in / 2;
   Index nnzU = nnzL;
   m_lu.reserve(n * (nnzL + nnzU + 1));
 
   // global loop over the rows of the sparse matrix
-  for (Index ii = 0; ii < n; ii++)
-  {
+  for (Index ii = 0; ii < n; ii++) {
     // 1 - copy the lower and the upper part of the row i of mat in the working vector u
 
-    Index sizeu = 1; // number of nonzero elements in the upper part of the current row
-    Index sizel = 0; // number of nonzero elements in the lower part of the current row
-    ju(ii)    = ii;
-    u(ii)     = 0;
-    jr(ii)    = ii;
+    Index sizeu = 1;  // number of nonzero elements in the upper part of the current row
+    Index sizel = 0;  // number of nonzero elements in the lower part of the current row
+    ju(ii) = convert_index<StorageIndex>(ii);
+    u(ii) = 0;
+    jr(ii) = convert_index<StorageIndex>(ii);
     RealScalar rownorm = 0;
 
-    typename FactorType::InnerIterator j_it(mat, ii); // Iterate through the current row ii
-    for (; j_it; ++j_it)
-    {
+    typename FactorType::InnerIterator j_it(mat, ii);  // Iterate through the current row ii
+    for (; j_it; ++j_it) {
       Index k = j_it.index();
-      if (k < ii)
-      {
+      if (k < ii) {
         // copy the lower part
-        ju(sizel) = k;
+        ju(sizel) = convert_index<StorageIndex>(k);
         u(sizel) = j_it.value();
-        jr(k) = sizel;
+        jr(k) = convert_index<StorageIndex>(sizel);
         ++sizel;
-      }
-      else if (k == ii)
-      {
+      } else if (k == ii) {
         u(ii) = j_it.value();
-      }
-      else
-      {
+      } else {
         // copy the upper part
         Index jpos = ii + sizeu;
-        ju(jpos) = k;
+        ju(jpos) = convert_index<StorageIndex>(k);
         u(jpos) = j_it.value();
-        jr(k) = jpos;
+        jr(k) = convert_index<StorageIndex>(jpos);
         ++sizeu;
       }
       rownorm += numext::abs2(j_it.value());
     }
 
     // 2 - detect possible zero row
-    if(rownorm==0)
-    {
+    if (rownorm == 0) {
       m_info = NumericalIssue;
       return;
     }
@@ -334,19 +334,18 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
     // 3 - eliminate the previous nonzero rows
     Index jj = 0;
     Index len = 0;
-    while (jj < sizel)
-    {
+    while (jj < sizel) {
       // In order to eliminate in the correct order,
       // we must select first the smallest column index among  ju(jj:sizel)
       Index k;
-      Index minrow = ju.segment(jj,sizel-jj).minCoeff(&k); // k is relative to the segment
+      Index minrow = ju.segment(jj, sizel - jj).minCoeff(&k);  // k is relative to the segment
       k += jj;
-      if (minrow != ju(jj))
-      {
+      if (minrow != ju(jj)) {
         // swap the two locations
         Index j = ju(jj);
         swap(ju(jj), ju(k));
-        jr(minrow) = jj;   jr(j) = k;
+        jr(minrow) = convert_index<StorageIndex>(jj);
+        jr(j) = convert_index<StorageIndex>(k);
         swap(u(jj), u(k));
       }
       // Reset this location
@@ -355,55 +354,51 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
       // Start elimination
       typename FactorType::InnerIterator ki_it(m_lu, minrow);
       while (ki_it && ki_it.index() < minrow) ++ki_it;
-      eigen_internal_assert(ki_it && ki_it.col()==minrow);
+      eigen_internal_assert(ki_it && ki_it.col() == minrow);
       Scalar fact = u(jj) / ki_it.value();
 
       // drop too small elements
-      if(abs(fact) <= m_droptol)
-      {
+      if (abs(fact) <= m_droptol) {
         jj++;
         continue;
       }
 
       // linear combination of the current row ii and the row minrow
       ++ki_it;
-      for (; ki_it; ++ki_it)
-      {
+      for (; ki_it; ++ki_it) {
         Scalar prod = fact * ki_it.value();
-        Index j       = ki_it.index();
-        Index jpos    = jr(j);
-        if (jpos == -1) // fill-in element
+        Index j = ki_it.index();
+        Index jpos = jr(j);
+        if (jpos == -1)  // fill-in element
         {
           Index newpos;
-          if (j >= ii) // dealing with the upper part
+          if (j >= ii)  // dealing with the upper part
           {
             newpos = ii + sizeu;
             sizeu++;
-            eigen_internal_assert(sizeu<=n);
-          }
-          else // dealing with the lower part
+            eigen_internal_assert(sizeu <= n);
+          } else  // dealing with the lower part
           {
             newpos = sizel;
             sizel++;
-            eigen_internal_assert(sizel<=ii);
+            eigen_internal_assert(sizel <= ii);
           }
-          ju(newpos) = j;
+          ju(newpos) = convert_index<StorageIndex>(j);
           u(newpos) = -prod;
-          jr(j) = newpos;
-        }
-        else
+          jr(j) = convert_index<StorageIndex>(newpos);
+        } else
           u(jpos) -= prod;
       }
       // store the pivot element
       u(len) = fact;
-      ju(len) = minrow;
+      ju(len) = convert_index<StorageIndex>(minrow);
       ++len;
 
       jj++;
-    } // end of the elimination on the row ii
+    }  // end of the elimination on the row ii
 
     // reset the upper part of the pointer jr to zero
-    for(Index k = 0; k <sizeu; k++) jr(ju(ii+k)) = -1;
+    for (Index k = 0; k < sizeu; k++) jr(ju(ii + k)) = -1;
 
     // 4 - partially sort and insert the elements in the m_lu matrix
 
@@ -411,68 +406,44 @@ void IncompleteLUT<Scalar>::factorize(const _MatrixType& amat)
     sizel = len;
     len = (std::min)(sizel, nnzL);
     typename Vector::SegmentReturnType ul(u.segment(0, sizel));
-    typename VectorXi::SegmentReturnType jul(ju.segment(0, sizel));
+    typename VectorI::SegmentReturnType jul(ju.segment(0, sizel));
     internal::QuickSplit(ul, jul, len);
 
     // store the largest m_fill elements of the L part
     m_lu.startVec(ii);
-    for(Index k = 0; k < len; k++)
-      m_lu.insertBackByOuterInnerUnordered(ii,ju(k)) = u(k);
+    for (Index k = 0; k < len; k++) m_lu.insertBackByOuterInnerUnordered(ii, ju(k)) = u(k);
 
     // store the diagonal element
     // apply a shifting rule to avoid zero pivots (we are doing an incomplete factorization)
-    if (u(ii) == Scalar(0))
-      u(ii) = sqrt(m_droptol) * rownorm;
+    if (u(ii) == Scalar(0)) u(ii) = sqrt(m_droptol) * rownorm;
     m_lu.insertBackByOuterInnerUnordered(ii, ii) = u(ii);
 
     // sort the U-part of the row
     // apply the dropping rule first
     len = 0;
-    for(Index k = 1; k < sizeu; k++)
-    {
-      if(abs(u(ii+k)) > m_droptol * rownorm )
-      {
+    for (Index k = 1; k < sizeu; k++) {
+      if (abs(u(ii + k)) > m_droptol * rownorm) {
         ++len;
-        u(ii + len)  = u(ii + k);
+        u(ii + len) = u(ii + k);
         ju(ii + len) = ju(ii + k);
       }
     }
-    sizeu = len + 1; // +1 to take into account the diagonal element
+    sizeu = len + 1;  // +1 to take into account the diagonal element
     len = (std::min)(sizeu, nnzU);
-    typename Vector::SegmentReturnType uu(u.segment(ii+1, sizeu-1));
-    typename VectorXi::SegmentReturnType juu(ju.segment(ii+1, sizeu-1));
+    typename Vector::SegmentReturnType uu(u.segment(ii + 1, sizeu - 1));
+    typename VectorI::SegmentReturnType juu(ju.segment(ii + 1, sizeu - 1));
     internal::QuickSplit(uu, juu, len);
 
     // store the largest elements of the U part
-    for(Index k = ii + 1; k < ii + len; k++)
-      m_lu.insertBackByOuterInnerUnordered(ii,ju(k)) = u(k);
+    for (Index k = ii + 1; k < ii + len; k++) m_lu.insertBackByOuterInnerUnordered(ii, ju(k)) = u(k);
   }
-
   m_lu.finalize();
   m_lu.makeCompressed();
 
   m_factorizationIsOk = true;
-  m_isInitialized = m_factorizationIsOk;
   m_info = Success;
 }
 
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<IncompleteLUT<_MatrixType>, Rhs>
-  : solve_retval_base<IncompleteLUT<_MatrixType>, Rhs>
-{
-  typedef IncompleteLUT<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_INCOMPLETE_LUT_H
+#endif  // EIGEN_INCOMPLETE_LUT_H
diff --git a/inst/include/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h b/inst/include/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h
new file mode 100644
index 00000000..b657e84d
--- /dev/null
+++ b/inst/include/Eigen/src/IterativeLinearSolvers/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_ITERATIVELINEARSOLVERS_MODULE_H
+#error "Please include Eigen/IterativeLinearSolvers instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h b/inst/include/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
index 501ef2f8..5caa3965 100644
--- a/inst/include/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
+++ b/inst/include/Eigen/src/IterativeLinearSolvers/IterativeSolverBase.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,273 +10,386 @@
 #ifndef EIGEN_ITERATIVE_SOLVER_BASE_H
 #define EIGEN_ITERATIVE_SOLVER_BASE_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename MatrixType>
+struct is_ref_compatible_impl {
+ private:
+  template <typename T0>
+  struct any_conversion {
+    template <typename T>
+    any_conversion(const volatile T&);
+    template <typename T>
+    any_conversion(T&);
+  };
+  struct yes {
+    int a[1];
+  };
+  struct no {
+    int a[2];
+  };
+
+  template <typename T>
+  static yes test(const Ref<const T>&, int);
+  template <typename T>
+  static no test(any_conversion<T>, ...);
+
+ public:
+  static MatrixType ms_from;
+  enum { value = sizeof(test<MatrixType>(ms_from, 0)) == sizeof(yes) };
+};
+
+template <typename MatrixType>
+struct is_ref_compatible {
+  enum { value = is_ref_compatible_impl<remove_all_t<MatrixType>>::value };
+};
+
+template <typename MatrixType, bool MatrixFree = !internal::is_ref_compatible<MatrixType>::value>
+class generic_matrix_wrapper;
+
+// We have an explicit matrix at hand, compatible with Ref<>
+template <typename MatrixType>
+class generic_matrix_wrapper<MatrixType, false> {
+ public:
+  typedef Ref<const MatrixType> ActualMatrixType;
+  template <int UpLo>
+  struct ConstSelfAdjointViewReturnType {
+    typedef typename ActualMatrixType::template ConstSelfAdjointViewReturnType<UpLo>::Type Type;
+  };
+
+  enum { MatrixFree = false };
+
+  generic_matrix_wrapper() : m_dummy(0, 0), m_matrix(m_dummy) {}
+
+  template <typename InputType>
+  generic_matrix_wrapper(const InputType& mat) : m_matrix(mat) {}
+
+  const ActualMatrixType& matrix() const { return m_matrix; }
+
+  template <typename MatrixDerived>
+  void grab(const EigenBase<MatrixDerived>& mat) {
+    internal::destroy_at(&m_matrix);
+    internal::construct_at(&m_matrix, mat.derived());
+  }
+
+  void grab(const Ref<const MatrixType>& mat) {
+    if (&(mat.derived()) != &m_matrix) {
+      internal::destroy_at(&m_matrix);
+      internal::construct_at(&m_matrix, mat);
+    }
+  }
+
+ protected:
+  MatrixType m_dummy;  // used to default initialize the Ref<> object
+  ActualMatrixType m_matrix;
+};
+
+// MatrixType is not compatible with Ref<> -> matrix-free wrapper
+template <typename MatrixType>
+class generic_matrix_wrapper<MatrixType, true> {
+ public:
+  typedef MatrixType ActualMatrixType;
+  template <int UpLo>
+  struct ConstSelfAdjointViewReturnType {
+    typedef ActualMatrixType Type;
+  };
+
+  enum { MatrixFree = true };
+
+  generic_matrix_wrapper() : mp_matrix(0) {}
+
+  generic_matrix_wrapper(const MatrixType& mat) : mp_matrix(&mat) {}
+
+  const ActualMatrixType& matrix() const { return *mp_matrix; }
+
+  void grab(const MatrixType& mat) { mp_matrix = &mat; }
+
+ protected:
+  const ActualMatrixType* mp_matrix;
+};
+
+}  // namespace internal
 
 /** \ingroup IterativeLinearSolvers_Module
-  * \brief Base class for linear iterative solvers
-  *
-  * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
-  */
-template< typename Derived>
-class IterativeSolverBase : internal::noncopyable
-{
-public:
+ * \brief Base class for linear iterative solvers
+ *
+ * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+ */
+template <typename Derived>
+class IterativeSolverBase : public SparseSolverBase<Derived> {
+ protected:
+  typedef SparseSolverBase<Derived> Base;
+  using Base::m_isInitialized;
+
+ public:
   typedef typename internal::traits<Derived>::MatrixType MatrixType;
   typedef typename internal::traits<Derived>::Preconditioner Preconditioner;
   typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
+  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef typename MatrixType::RealScalar RealScalar;
 
-public:
+  enum { ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime };
 
-  Derived& derived() { return *static_cast<Derived*>(this); }
-  const Derived& derived() const { return *static_cast<const Derived*>(this); }
+ public:
+  using Base::derived;
 
   /** Default constructor. */
-  IterativeSolverBase()
-    : mp_matrix(0)
-  {
-    init();
-  }
+  IterativeSolverBase() { init(); }
 
   /** Initialize the solver with matrix \a A for further \c Ax=b solving.
-    * 
-    * This constructor is a shortcut for the default constructor followed
-    * by a call to compute().
-    * 
-    * \warning this class stores a reference to the matrix A as well as some
-    * precomputed values that depend on it. Therefore, if \a A is changed
-    * this class becomes invalid. Call compute() to update it with the new
-    * matrix A, or modify a copy of A.
-    */
-  template<typename InputDerived>
-  IterativeSolverBase(const EigenBase<InputDerived>& A)
-  {
+   *
+   * This constructor is a shortcut for the default constructor followed
+   * by a call to compute().
+   *
+   * \warning this class stores a reference to the matrix A as well as some
+   * precomputed values that depend on it. Therefore, if \a A is changed
+   * this class becomes invalid. Call compute() to update it with the new
+   * matrix A, or modify a copy of A.
+   */
+  template <typename MatrixDerived>
+  explicit IterativeSolverBase(const EigenBase<MatrixDerived>& A) : m_matrixWrapper(A.derived()) {
     init();
-    compute(A.derived());
+    compute(matrix());
   }
 
+  IterativeSolverBase(IterativeSolverBase&&) = default;
+
   ~IterativeSolverBase() {}
-  
-  /** Initializes the iterative solver for the sparcity pattern of the matrix \a A for further solving \c Ax=b problems.
-    *
-    * Currently, this function mostly call analyzePattern on the preconditioner. In the future
-    * we might, for instance, implement column reodering for faster matrix vector products.
-    */
-  template<typename InputDerived>
-  Derived& analyzePattern(const EigenBase<InputDerived>& A)
-  {
-    grabInput(A.derived());
-    m_preconditioner.analyzePattern(*mp_matrix);
+
+  /** Initializes the iterative solver for the sparsity pattern of the matrix \a A for further solving \c Ax=b problems.
+   *
+   * Currently, this function mostly calls analyzePattern on the preconditioner. In the future
+   * we might, for instance, implement column reordering for faster matrix vector products.
+   */
+  template <typename MatrixDerived>
+  Derived& analyzePattern(const EigenBase<MatrixDerived>& A) {
+    grab(A.derived());
+    m_preconditioner.analyzePattern(matrix());
     m_isInitialized = true;
     m_analysisIsOk = true;
-    m_info = Success;
+    m_info = m_preconditioner.info();
     return derived();
   }
-  
-  /** Initializes the iterative solver with the numerical values of the matrix \a A for further solving \c Ax=b problems.
-    *
-    * Currently, this function mostly call factorize on the preconditioner.
-    *
-    * \warning this class stores a reference to the matrix A as well as some
-    * precomputed values that depend on it. Therefore, if \a A is changed
-    * this class becomes invalid. Call compute() to update it with the new
-    * matrix A, or modify a copy of A.
-    */
-  template<typename InputDerived>
-  Derived& factorize(const EigenBase<InputDerived>& A)
-  {
-    grabInput(A.derived());
-    eigen_assert(m_analysisIsOk && "You must first call analyzePattern()"); 
-    m_preconditioner.factorize(*mp_matrix);
+
+  /** Initializes the iterative solver with the numerical values of the matrix \a A for further solving \c Ax=b
+   * problems.
+   *
+   * Currently, this function mostly calls factorize on the preconditioner.
+   *
+   * \warning this class stores a reference to the matrix A as well as some
+   * precomputed values that depend on it. Therefore, if \a A is changed
+   * this class becomes invalid. Call compute() to update it with the new
+   * matrix A, or modify a copy of A.
+   */
+  template <typename MatrixDerived>
+  Derived& factorize(const EigenBase<MatrixDerived>& A) {
+    eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
+    grab(A.derived());
+    m_preconditioner.factorize(matrix());
     m_factorizationIsOk = true;
-    m_info = Success;
+    m_info = m_preconditioner.info();
     return derived();
   }
 
   /** Initializes the iterative solver with the matrix \a A for further solving \c Ax=b problems.
-    *
-    * Currently, this function mostly initialized/compute the preconditioner. In the future
-    * we might, for instance, implement column reodering for faster matrix vector products.
-    *
-    * \warning this class stores a reference to the matrix A as well as some
-    * precomputed values that depend on it. Therefore, if \a A is changed
-    * this class becomes invalid. Call compute() to update it with the new
-    * matrix A, or modify a copy of A.
-    */
-  template<typename InputDerived>
-  Derived& compute(const EigenBase<InputDerived>& A)
-  {
-    grabInput(A.derived());
-    m_preconditioner.compute(*mp_matrix);
+   *
+   * Currently, this function mostly initializes/computes the preconditioner. In the future
+   * we might, for instance, implement column reordering for faster matrix vector products.
+   *
+   * \warning this class stores a reference to the matrix A as well as some
+   * precomputed values that depend on it. Therefore, if \a A is changed
+   * this class becomes invalid. Call compute() to update it with the new
+   * matrix A, or modify a copy of A.
+   */
+  template <typename MatrixDerived>
+  Derived& compute(const EigenBase<MatrixDerived>& A) {
+    grab(A.derived());
+    m_preconditioner.compute(matrix());
     m_isInitialized = true;
     m_analysisIsOk = true;
     m_factorizationIsOk = true;
-    m_info = Success;
+    m_info = m_preconditioner.info();
     return derived();
   }
 
   /** \internal */
-  Index rows() const { return mp_matrix ? mp_matrix->rows() : 0; }
+  constexpr Index rows() const noexcept { return matrix().rows(); }
+
   /** \internal */
-  Index cols() const { return mp_matrix ? mp_matrix->cols() : 0; }
+  constexpr Index cols() const noexcept { return matrix().cols(); }
 
-  /** \returns the tolerance threshold used by the stopping criteria */
+  /** \returns the tolerance threshold used by the stopping criteria.
+   * \sa setTolerance()
+   */
   RealScalar tolerance() const { return m_tolerance; }
-  
-  /** Sets the tolerance threshold used by the stopping criteria */
-  Derived& setTolerance(const RealScalar& tolerance)
-  {
+
+  /** Sets the tolerance threshold used by the stopping criteria.
+   *
+   * This value is used as an upper bound to the relative residual error: |Ax-b|/|b|.
+   * The default value is the machine precision given by NumTraits<Scalar>::epsilon()
+   */
+  Derived& setTolerance(const RealScalar& tolerance) {
     m_tolerance = tolerance;
     return derived();
   }
 
   /** \returns a read-write reference to the preconditioner for custom configuration. */
   Preconditioner& preconditioner() { return m_preconditioner; }
-  
+
   /** \returns a read-only reference to the preconditioner. */
   const Preconditioner& preconditioner() const { return m_preconditioner; }
 
-  /** \returns the max number of iterations */
-  int maxIterations() const
-  {
-    return (mp_matrix && m_maxIterations<0) ? mp_matrix->cols() : m_maxIterations;
-  }
-  
-  /** Sets the max number of iterations */
-  Derived& setMaxIterations(int maxIters)
-  {
+  /** \returns the max number of iterations.
+   * It is either the value set by setMaxIterations or, by default,
+   * twice the number of columns of the matrix.
+   */
+  Index maxIterations() const { return (m_maxIterations < 0) ? 2 * matrix().cols() : m_maxIterations; }
+
+  /** Sets the max number of iterations.
+   * Default is twice the number of columns of the matrix.
+   */
+  Derived& setMaxIterations(Index maxIters) {
     m_maxIterations = maxIters;
     return derived();
   }
 
   /** \returns the number of iterations performed during the last solve */
-  int iterations() const
-  {
-    eigen_assert(m_isInitialized && "ConjugateGradient is not initialized.");
+  Index iterations() const {
+    eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized.");
     return m_iterations;
   }
 
-  /** \returns the tolerance error reached during the last solve */
-  RealScalar error() const
-  {
-    eigen_assert(m_isInitialized && "ConjugateGradient is not initialized.");
+  /** \returns the tolerance error reached during the last solve.
+   * It is a close approximation of the true relative residual error |Ax-b|/|b|.
+   */
+  RealScalar error() const {
+    eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized.");
     return m_error;
   }
 
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-    *
-    * \sa compute()
-    */
-  template<typename Rhs> inline const internal::solve_retval<Derived, Rhs>
-  solve(const MatrixBase<Rhs>& b) const
-  {
-    eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized.");
-    eigen_assert(rows()==b.rows()
-              && "IterativeSolverBase::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::solve_retval<Derived, Rhs>(derived(), b.derived());
-  }
-  
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-    *
-    * \sa compute()
-    */
-  template<typename Rhs>
-  inline const internal::sparse_solve_retval<IterativeSolverBase, Rhs>
-  solve(const SparseMatrixBase<Rhs>& b) const
-  {
-    eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized.");
-    eigen_assert(rows()==b.rows()
-              && "IterativeSolverBase::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::sparse_solve_retval<IterativeSolverBase, Rhs>(*this, b.derived());
+  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
+   * and \a x0 as an initial solution.
+   *
+   * \sa solve(), compute()
+   */
+  template <typename Rhs, typename Guess>
+  inline const SolveWithGuess<Derived, Rhs, Guess> solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const {
+    eigen_assert(m_isInitialized && "Solver is not initialized.");
+    eigen_assert(derived().rows() == b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
+    return SolveWithGuess<Derived, Rhs, Guess>(derived(), b.derived(), x0);
   }
 
   /** \returns Success if the iterations converged, and NoConvergence otherwise. */
-  ComputationInfo info() const
-  {
+  ComputationInfo info() const {
     eigen_assert(m_isInitialized && "IterativeSolverBase is not initialized.");
     return m_info;
   }
-  
+
   /** \internal */
-  template<typename Rhs, typename DestScalar, int DestOptions, typename DestIndex>
-  void _solve_sparse(const Rhs& b, SparseMatrix<DestScalar,DestOptions,DestIndex> &dest) const
-  {
-    eigen_assert(rows()==b.rows());
-    
-    int rhsCols = b.cols();
-    int size = b.rows();
-    Eigen::Matrix<DestScalar,Dynamic,1> tb(size);
-    Eigen::Matrix<DestScalar,Dynamic,1> tx(size);
-    for(int k=0; k<rhsCols; ++k)
-    {
+  template <typename Rhs, typename DestDerived>
+  void _solve_with_guess_impl(const Rhs& b, SparseMatrixBase<DestDerived>& aDest) const {
+    eigen_assert(rows() == b.rows());
+
+    Index rhsCols = b.cols();
+    Index size = b.rows();
+    DestDerived& dest(aDest.derived());
+    typedef typename DestDerived::Scalar DestScalar;
+    Eigen::Matrix<DestScalar, Dynamic, 1> tb(size);
+    Eigen::Matrix<DestScalar, Dynamic, 1> tx(cols());
+    // We do not directly fill dest because sparse expressions have to be free of aliasing issue.
+    // For non square least-square problems, b and dest might not have the same size whereas they might alias
+    // each-other.
+    typename DestDerived::PlainObject tmp(cols(), rhsCols);
+    ComputationInfo global_info = Success;
+    for (Index k = 0; k < rhsCols; ++k) {
       tb = b.col(k);
-      tx = derived().solve(tb);
-      dest.col(k) = tx.sparseView(0);
+      tx = dest.col(k);
+      derived()._solve_vector_with_guess_impl(tb, tx);
+      tmp.col(k) = tx.sparseView(0);
+
+      // The call to _solve_vector_with_guess_impl updates m_info, so if it failed for a previous column
+      // we need to restore it to the worst value.
+      if (m_info == NumericalIssue)
+        global_info = NumericalIssue;
+      else if (m_info == NoConvergence)
+        global_info = NoConvergence;
     }
+    m_info = global_info;
+    dest.swap(tmp);
   }
 
-protected:
+  template <typename Rhs, typename DestDerived>
+  std::enable_if_t<Rhs::ColsAtCompileTime != 1 && DestDerived::ColsAtCompileTime != 1> _solve_with_guess_impl(
+      const Rhs& b, MatrixBase<DestDerived>& aDest) const {
+    eigen_assert(rows() == b.rows());
 
-  template<typename InputDerived>
-  void grabInput(const EigenBase<InputDerived>& A)
-  {
-    // we const cast to prevent the creation of a MatrixType temporary by the compiler.
-    grabInput_impl(A.const_cast_derived());
+    Index rhsCols = b.cols();
+    DestDerived& dest(aDest.derived());
+    ComputationInfo global_info = Success;
+    for (Index k = 0; k < rhsCols; ++k) {
+      typename DestDerived::ColXpr xk(dest, k);
+      typename Rhs::ConstColXpr bk(b, k);
+      derived()._solve_vector_with_guess_impl(bk, xk);
+
+      // The call to _solve_vector_with_guess updates m_info, so if it failed for a previous column
+      // we need to restore it to the worst value.
+      if (m_info == NumericalIssue)
+        global_info = NumericalIssue;
+      else if (m_info == NoConvergence)
+        global_info = NoConvergence;
+    }
+    m_info = global_info;
   }
 
-  template<typename InputDerived>
-  void grabInput_impl(const EigenBase<InputDerived>& A)
-  {
-    m_copyMatrix = A;
-    mp_matrix = &m_copyMatrix;
+  template <typename Rhs, typename DestDerived>
+  std::enable_if_t<Rhs::ColsAtCompileTime == 1 || DestDerived::ColsAtCompileTime == 1> _solve_with_guess_impl(
+      const Rhs& b, MatrixBase<DestDerived>& dest) const {
+    derived()._solve_vector_with_guess_impl(b, dest.derived());
   }
 
-  void grabInput_impl(MatrixType& A)
-  {
-    if(MatrixType::RowsAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==Dynamic)
-      m_copyMatrix.resize(0,0);
-    mp_matrix = &A;
+  /** \internal default initial guess = 0 */
+  template <typename Rhs, typename Dest>
+  void _solve_impl(const Rhs& b, Dest& x) const {
+    x.setZero();
+    derived()._solve_with_guess_impl(b, x);
   }
 
-  void init()
-  {
+ protected:
+  void init() {
     m_isInitialized = false;
     m_analysisIsOk = false;
     m_factorizationIsOk = false;
     m_maxIterations = -1;
     m_tolerance = NumTraits<Scalar>::epsilon();
   }
-  MatrixType m_copyMatrix;
-  const MatrixType* mp_matrix;
+
+  typedef internal::generic_matrix_wrapper<MatrixType> MatrixWrapper;
+  typedef typename MatrixWrapper::ActualMatrixType ActualMatrixType;
+
+  const ActualMatrixType& matrix() const { return m_matrixWrapper.matrix(); }
+
+  template <typename InputType>
+  void grab(const InputType& A) {
+    m_matrixWrapper.grab(A);
+  }
+
+  MatrixWrapper m_matrixWrapper;
   Preconditioner m_preconditioner;
 
-  int m_maxIterations;
+  Index m_maxIterations;
   RealScalar m_tolerance;
-  
+
   mutable RealScalar m_error;
-  mutable int m_iterations;
+  mutable Index m_iterations;
   mutable ComputationInfo m_info;
-  mutable bool m_isInitialized, m_analysisIsOk, m_factorizationIsOk;
+  mutable bool m_analysisIsOk, m_factorizationIsOk;
 };
 
-namespace internal {
- 
-template<typename Derived, typename Rhs>
-struct sparse_solve_retval<IterativeSolverBase<Derived>, Rhs>
-  : sparse_solve_retval_base<IterativeSolverBase<Derived>, Rhs>
-{
-  typedef IterativeSolverBase<Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec().derived()._solve_sparse(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_ITERATIVE_SOLVER_BASE_H
+#endif  // EIGEN_ITERATIVE_SOLVER_BASE_H
diff --git a/inst/include/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h b/inst/include/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h
new file mode 100644
index 00000000..182f3190
--- /dev/null
+++ b/inst/include/Eigen/src/IterativeLinearSolvers/LeastSquareConjugateGradient.h
@@ -0,0 +1,193 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H
+#define EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal Low-level conjugate gradient algorithm for least-square problems
+ * \param mat The matrix A
+ * \param rhs The right hand side vector b
+ * \param x On input and initial solution, on output the computed solution.
+ * \param precond A preconditioner being able to efficiently solve for an
+ *                approximation of A'Ax=b (regardless of b)
+ * \param iters On input the max number of iteration, on output the number of performed iterations.
+ * \param tol_error On input the tolerance error, on output an estimation of the relative error.
+ */
+template <typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+EIGEN_DONT_INLINE void least_square_conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x,
+                                                       const Preconditioner& precond, Index& iters,
+                                                       typename Dest::RealScalar& tol_error) {
+  using std::abs;
+  using std::sqrt;
+  typedef typename Dest::RealScalar RealScalar;
+  typedef typename Dest::Scalar Scalar;
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+
+  RealScalar tol = tol_error;
+  Index maxIters = iters;
+
+  Index m = mat.rows(), n = mat.cols();
+
+  VectorType residual = rhs - mat * x;
+  VectorType normal_residual = mat.adjoint() * residual;
+
+  RealScalar rhsNorm2 = (mat.adjoint() * rhs).squaredNorm();
+  if (rhsNorm2 == 0) {
+    x.setZero();
+    iters = 0;
+    tol_error = 0;
+    return;
+  }
+  RealScalar threshold = tol * tol * rhsNorm2;
+  RealScalar residualNorm2 = normal_residual.squaredNorm();
+  if (residualNorm2 < threshold) {
+    iters = 0;
+    tol_error = sqrt(residualNorm2 / rhsNorm2);
+    return;
+  }
+
+  VectorType p(n);
+  p = precond.solve(normal_residual);  // initial search direction
+
+  VectorType z(n), tmp(m);
+  RealScalar absNew = numext::real(normal_residual.dot(p));  // the square of the absolute value of r scaled by invM
+  Index i = 0;
+  while (i < maxIters) {
+    tmp.noalias() = mat * p;
+
+    Scalar alpha = absNew / tmp.squaredNorm();             // the amount we travel on dir
+    x += alpha * p;                                        // update solution
+    residual -= alpha * tmp;                               // update residual
+    normal_residual.noalias() = mat.adjoint() * residual;  // update residual of the normal equation
+
+    residualNorm2 = normal_residual.squaredNorm();
+    if (residualNorm2 < threshold) break;
+
+    z = precond.solve(normal_residual);  // approximately solve for "A'A z = normal_residual"
+
+    RealScalar absOld = absNew;
+    absNew = numext::real(normal_residual.dot(z));  // update the absolute value of r
+    RealScalar beta = absNew / absOld;  // calculate the Gram-Schmidt value used to create the new search direction
+    p = z + beta * p;                   // update search direction
+    i++;
+  }
+  tol_error = sqrt(residualNorm2 / rhsNorm2);
+  iters = i;
+}
+
+}  // namespace internal
+
+template <typename MatrixType_,
+          typename Preconditioner_ = LeastSquareDiagonalPreconditioner<typename MatrixType_::Scalar> >
+class LeastSquaresConjugateGradient;
+
+namespace internal {
+
+template <typename MatrixType_, typename Preconditioner_>
+struct traits<LeastSquaresConjugateGradient<MatrixType_, Preconditioner_> > {
+  typedef MatrixType_ MatrixType;
+  typedef Preconditioner_ Preconditioner;
+};
+
+}  // namespace internal
+
+/** \ingroup IterativeLinearSolvers_Module
+  * \brief A conjugate gradient solver for sparse (or dense) least-square problems
+  *
+  * This class solves for the least-squares solution to A x = b using an iterative conjugate gradient algorithm.
+  * The matrix A can be non symmetric and rectangular, but the matrix A' A should be positive-definite to guaranty
+  stability.
+  * Otherwise, the SparseLU or SparseQR classes might be preferable.
+  * The matrix A and the vectors x and b can be either dense or sparse.
+  *
+  * \tparam MatrixType_ the type of the matrix A, can be a dense or a sparse matrix.
+  * \tparam Preconditioner_ the type of the preconditioner. Default is LeastSquareDiagonalPreconditioner
+  *
+  * \implsparsesolverconcept
+  *
+  * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+  * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+  * and NumTraits<Scalar>::epsilon() for the tolerance.
+  *
+  * This class can be used as the direct solver classes. Here is a typical usage example:
+    \code
+    int m=1000000, n = 10000;
+    VectorXd x(n), b(m);
+    SparseMatrix<double> A(m,n);
+    // fill A and b
+    LeastSquaresConjugateGradient<SparseMatrix<double> > lscg;
+    lscg.compute(A);
+    x = lscg.solve(b);
+    std::cout << "#iterations:     " << lscg.iterations() << std::endl;
+    std::cout << "estimated error: " << lscg.error()      << std::endl;
+    // update b, and solve again
+    x = lscg.solve(b);
+    \endcode
+  *
+  * By default the iterations start with x=0 as an initial guess of the solution.
+  * One can control the start using the solveWithGuess() method.
+  *
+  * \sa class ConjugateGradient, SparseLU, SparseQR
+  */
+template <typename MatrixType_, typename Preconditioner_>
+class LeastSquaresConjugateGradient
+    : public IterativeSolverBase<LeastSquaresConjugateGradient<MatrixType_, Preconditioner_> > {
+  typedef IterativeSolverBase<LeastSquaresConjugateGradient> Base;
+  using Base::m_error;
+  using Base::m_info;
+  using Base::m_isInitialized;
+  using Base::m_iterations;
+  using Base::matrix;
+
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef Preconditioner_ Preconditioner;
+
+ public:
+  /** Default constructor. */
+  LeastSquaresConjugateGradient() : Base() {}
+
+  /** Initialize the solver with matrix \a A for further \c Ax=b solving.
+   *
+   * This constructor is a shortcut for the default constructor followed
+   * by a call to compute().
+   *
+   * \warning this class stores a reference to the matrix A as well as some
+   * precomputed values that depend on it. Therefore, if \a A is changed
+   * this class becomes invalid. Call compute() to update it with the new
+   * matrix A, or modify a copy of A.
+   */
+  template <typename MatrixDerived>
+  explicit LeastSquaresConjugateGradient(const EigenBase<MatrixDerived>& A) : Base(A.derived()) {}
+
+  ~LeastSquaresConjugateGradient() {}
+
+  /** \internal */
+  template <typename Rhs, typename Dest>
+  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const {
+    m_iterations = Base::maxIterations();
+    m_error = Base::m_tolerance;
+
+    internal::least_square_conjugate_gradient(matrix(), b, x, Base::m_preconditioner, m_iterations, m_error);
+    m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_LEAST_SQUARE_CONJUGATE_GRADIENT_H
diff --git a/inst/include/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h b/inst/include/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
new file mode 100644
index 00000000..271679fe
--- /dev/null
+++ b/inst/include/Eigen/src/IterativeLinearSolvers/SolveWithGuess.h
@@ -0,0 +1,111 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SOLVEWITHGUESS_H
+#define EIGEN_SOLVEWITHGUESS_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+template <typename Decomposition, typename RhsType, typename GuessType>
+class SolveWithGuess;
+
+/** \class SolveWithGuess
+ * \ingroup IterativeLinearSolvers_Module
+ *
+ * \brief Pseudo expression representing a solving operation
+ *
+ * \tparam Decomposition the type of the matrix or decomposition object
+ * \tparam Rhstype the type of the right-hand side
+ *
+ * This class represents an expression of A.solve(B)
+ * and most of the time this is the only way it is used.
+ *
+ */
+namespace internal {
+
+template <typename Decomposition, typename RhsType, typename GuessType>
+struct traits<SolveWithGuess<Decomposition, RhsType, GuessType> > : traits<Solve<Decomposition, RhsType> > {};
+
+}  // namespace internal
+
+template <typename Decomposition, typename RhsType, typename GuessType>
+class SolveWithGuess : public internal::generic_xpr_base<SolveWithGuess<Decomposition, RhsType, GuessType>, MatrixXpr,
+                                                         typename internal::traits<RhsType>::StorageKind>::type {
+ public:
+  typedef typename internal::traits<SolveWithGuess>::Scalar Scalar;
+  typedef typename internal::traits<SolveWithGuess>::PlainObject PlainObject;
+  typedef typename internal::generic_xpr_base<SolveWithGuess<Decomposition, RhsType, GuessType>, MatrixXpr,
+                                              typename internal::traits<RhsType>::StorageKind>::type Base;
+  typedef typename internal::ref_selector<SolveWithGuess>::type Nested;
+
+  SolveWithGuess(const Decomposition &dec, const RhsType &rhs, const GuessType &guess)
+      : m_dec(dec), m_rhs(rhs), m_guess(guess) {}
+
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_dec.cols(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_rhs.cols(); }
+
+  EIGEN_DEVICE_FUNC const Decomposition &dec() const { return m_dec; }
+  EIGEN_DEVICE_FUNC const RhsType &rhs() const { return m_rhs; }
+  EIGEN_DEVICE_FUNC const GuessType &guess() const { return m_guess; }
+
+ protected:
+  const Decomposition &m_dec;
+  const RhsType &m_rhs;
+  const GuessType &m_guess;
+
+ private:
+  Scalar coeff(Index row, Index col) const;
+  Scalar coeff(Index i) const;
+};
+
+namespace internal {
+
+// Evaluator of SolveWithGuess -> eval into a temporary
+template <typename Decomposition, typename RhsType, typename GuessType>
+struct evaluator<SolveWithGuess<Decomposition, RhsType, GuessType> >
+    : public evaluator<typename SolveWithGuess<Decomposition, RhsType, GuessType>::PlainObject> {
+  typedef SolveWithGuess<Decomposition, RhsType, GuessType> SolveType;
+  typedef typename SolveType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  evaluator(const SolveType &solve) : m_result(solve.rows(), solve.cols()) {
+    internal::construct_at<Base>(this, m_result);
+    m_result = solve.guess();
+    solve.dec()._solve_with_guess_impl(solve.rhs(), m_result);
+  }
+
+ protected:
+  PlainObject m_result;
+};
+
+// Specialization for "dst = dec.solveWithGuess(rhs)"
+// NOTE we need to specialize it for Dense2Dense to avoid ambiguous specialization error and a Sparse2Sparse
+// specialization must exist somewhere
+template <typename DstXprType, typename DecType, typename RhsType, typename GuessType, typename Scalar>
+struct Assignment<DstXprType, SolveWithGuess<DecType, RhsType, GuessType>, internal::assign_op<Scalar, Scalar>,
+                  Dense2Dense> {
+  typedef SolveWithGuess<DecType, RhsType, GuessType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar, Scalar> &) {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if ((dst.rows() != dstRows) || (dst.cols() != dstCols)) dst.resize(dstRows, dstCols);
+
+    dst = src.guess();
+    src.dec()._solve_with_guess_impl(src.rhs(), dst /*, src.guess()*/);
+  }
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_SOLVEWITHGUESS_H
diff --git a/inst/include/Eigen/src/Jacobi/InternalHeaderCheck.h b/inst/include/Eigen/src/Jacobi/InternalHeaderCheck.h
new file mode 100644
index 00000000..b17b1f27
--- /dev/null
+++ b/inst/include/Eigen/src/Jacobi/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_JACOBI_MODULE_H
+#error "Please include Eigen/Jacobi instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/Jacobi/Jacobi.h b/inst/include/Eigen/src/Jacobi/Jacobi.h
index 956f72d5..2686a523 100644
--- a/inst/include/Eigen/src/Jacobi/Jacobi.h
+++ b/inst/include/Eigen/src/Jacobi/Jacobi.h
@@ -11,423 +11,417 @@
 #ifndef EIGEN_JACOBI_H
 #define EIGEN_JACOBI_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \ingroup Jacobi_Module
-  * \jacobi_module
-  * \class JacobiRotation
-  * \brief Rotation given by a cosine-sine pair.
-  *
-  * This class represents a Jacobi or Givens rotation.
-  * This is a 2D rotation in the plane \c J of angle \f$ \theta \f$ defined by
-  * its cosine \c c and sine \c s as follow:
-  * \f$ J = \left ( \begin{array}{cc} c & \overline s \\ -s  & \overline c \end{array} \right ) \f$
-  *
-  * You can apply the respective counter-clockwise rotation to a column vector \c v by
-  * applying its adjoint on the left: \f$ v = J^* v \f$ that translates to the following Eigen code:
-  * \code
-  * v.applyOnTheLeft(J.adjoint());
-  * \endcode
-  *
-  * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
-  */
-template<typename Scalar> class JacobiRotation
-{
-  public:
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-
-    /** Default constructor without any initialization. */
-    JacobiRotation() {}
-
-    /** Construct a planar rotation from a cosine-sine pair (\a c, \c s). */
-    JacobiRotation(const Scalar& c, const Scalar& s) : m_c(c), m_s(s) {}
-
-    Scalar& c() { return m_c; }
-    Scalar c() const { return m_c; }
-    Scalar& s() { return m_s; }
-    Scalar s() const { return m_s; }
-
-    /** Concatenates two planar rotation */
-    JacobiRotation operator*(const JacobiRotation& other)
-    {
-      using numext::conj;
-      return JacobiRotation(m_c * other.m_c - conj(m_s) * other.m_s,
-                            conj(m_c * conj(other.m_s) + conj(m_s) * conj(other.m_c)));
-    }
+ * \jacobi_module
+ * \class JacobiRotation
+ * \brief Rotation given by a cosine-sine pair.
+ *
+ * This class represents a Jacobi or Givens rotation.
+ * This is a 2D rotation in the plane \c J of angle \f$ \theta \f$ defined by
+ * its cosine \c c and sine \c s as follow:
+ * \f$ J = \left ( \begin{array}{cc} c & \overline s \\ -s  & \overline c \end{array} \right ) \f$
+ *
+ * You can apply the respective counter-clockwise rotation to a column vector \c v by
+ * applying its adjoint on the left: \f$ v = J^* v \f$ that translates to the following Eigen code:
+ * \code
+ * v.applyOnTheLeft(J.adjoint());
+ * \endcode
+ *
+ * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
+ */
+template <typename Scalar>
+class JacobiRotation {
+ public:
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  /** Default constructor without any initialization. */
+  EIGEN_DEVICE_FUNC JacobiRotation() {}
+
+  /** Construct a planar rotation from a cosine-sine pair (\a c, \c s). */
+  EIGEN_DEVICE_FUNC JacobiRotation(const Scalar& c, const Scalar& s) : m_c(c), m_s(s) {}
 
-    /** Returns the transposed transformation */
-    JacobiRotation transpose() const { using numext::conj; return JacobiRotation(m_c, -conj(m_s)); }
+  EIGEN_DEVICE_FUNC Scalar& c() { return m_c; }
+  EIGEN_DEVICE_FUNC Scalar c() const { return m_c; }
+  EIGEN_DEVICE_FUNC Scalar& s() { return m_s; }
+  EIGEN_DEVICE_FUNC Scalar s() const { return m_s; }
+
+  /** Concatenates two planar rotation */
+  EIGEN_DEVICE_FUNC JacobiRotation operator*(const JacobiRotation& other) {
+    using numext::conj;
+    return JacobiRotation(m_c * other.m_c - conj(m_s) * other.m_s,
+                          conj(m_c * conj(other.m_s) + conj(m_s) * conj(other.m_c)));
+  }
+
+  /** Returns the transposed transformation */
+  EIGEN_DEVICE_FUNC JacobiRotation transpose() const {
+    using numext::conj;
+    return JacobiRotation(m_c, -conj(m_s));
+  }
 
-    /** Returns the adjoint transformation */
-    JacobiRotation adjoint() const { using numext::conj; return JacobiRotation(conj(m_c), -m_s); }
+  /** Returns the adjoint transformation */
+  EIGEN_DEVICE_FUNC JacobiRotation adjoint() const {
+    using numext::conj;
+    return JacobiRotation(conj(m_c), -m_s);
+  }
 
-    template<typename Derived>
-    bool makeJacobi(const MatrixBase<Derived>&, typename Derived::Index p, typename Derived::Index q);
-    bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z);
+  template <typename Derived>
+  EIGEN_DEVICE_FUNC bool makeJacobi(const MatrixBase<Derived>&, Index p, Index q);
+  EIGEN_DEVICE_FUNC bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z);
 
-    void makeGivens(const Scalar& p, const Scalar& q, Scalar* z=0);
+  EIGEN_DEVICE_FUNC void makeGivens(const Scalar& p, const Scalar& q, Scalar* r = 0);
 
-  protected:
-    void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::true_type);
-    void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::false_type);
+ protected:
+  EIGEN_DEVICE_FUNC void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type);
+  EIGEN_DEVICE_FUNC void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type);
 
-    Scalar m_c, m_s;
+  Scalar m_c, m_s;
 };
 
-/** Makes \c *this as a Jacobi rotation \a J such that applying \a J on both the right and left sides of the selfadjoint 2x2 matrix
-  * \f$ B = \left ( \begin{array}{cc} x & y \\ \overline y & z \end{array} \right )\f$ yields a diagonal matrix \f$ A = J^* B J \f$
-  *
-  * \sa MatrixBase::makeJacobi(const MatrixBase<Derived>&, Index, Index), MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
-  */
-template<typename Scalar>
-bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z)
-{
-  using std::sqrt;
+/** Makes \c *this as a Jacobi rotation \a J such that applying \a J on both the right and left sides of the selfadjoint
+ * 2x2 matrix \f$ B = \left ( \begin{array}{cc} x & y \\ \overline y & z \end{array} \right )\f$ yields a diagonal
+ * matrix \f$ A = J^* B J \f$
+ *
+ * \sa MatrixBase::makeJacobi(const MatrixBase<Derived>&, Index, Index), MatrixBase::applyOnTheLeft(),
+ * MatrixBase::applyOnTheRight()
+ */
+template <typename Scalar>
+EIGEN_DEVICE_FUNC bool JacobiRotation<Scalar>::makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z) {
   using std::abs;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  if(y == Scalar(0))
-  {
+  using std::sqrt;
+
+  RealScalar deno = RealScalar(2) * abs(y);
+  if (deno < (std::numeric_limits<RealScalar>::min)()) {
     m_c = Scalar(1);
     m_s = Scalar(0);
     return false;
-  }
-  else
-  {
-    RealScalar tau = (x-z)/(RealScalar(2)*abs(y));
+  } else {
+    RealScalar tau = (x - z) / deno;
     RealScalar w = sqrt(numext::abs2(tau) + RealScalar(1));
     RealScalar t;
-    if(tau>RealScalar(0))
-    {
+    if (tau > RealScalar(0)) {
       t = RealScalar(1) / (tau + w);
-    }
-    else
-    {
+    } else {
       t = RealScalar(1) / (tau - w);
     }
     RealScalar sign_t = t > RealScalar(0) ? RealScalar(1) : RealScalar(-1);
-    RealScalar n = RealScalar(1) / sqrt(numext::abs2(t)+RealScalar(1));
-    m_s = - sign_t * (numext::conj(y) / abs(y)) * abs(t) * n;
+    RealScalar n = RealScalar(1) / sqrt(numext::abs2(t) + RealScalar(1));
+    m_s = -sign_t * (numext::conj(y) / abs(y)) * abs(t) * n;
     m_c = n;
     return true;
   }
 }
 
-/** Makes \c *this as a Jacobi rotation \c J such that applying \a J on both the right and left sides of the 2x2 selfadjoint matrix
-  * \f$ B = \left ( \begin{array}{cc} \text{this}_{pp} & \text{this}_{pq} \\ (\text{this}_{pq})^* & \text{this}_{qq} \end{array} \right )\f$ yields
-  * a diagonal matrix \f$ A = J^* B J \f$
-  *
-  * Example: \include Jacobi_makeJacobi.cpp
-  * Output: \verbinclude Jacobi_makeJacobi.out
-  *
-  * \sa JacobiRotation::makeJacobi(RealScalar, Scalar, RealScalar), MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
-  */
-template<typename Scalar>
-template<typename Derived>
-inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, typename Derived::Index p, typename Derived::Index q)
-{
-  return makeJacobi(numext::real(m.coeff(p,p)), m.coeff(p,q), numext::real(m.coeff(q,q)));
+/** Makes \c *this as a Jacobi rotation \c J such that applying \a J on both the right and left sides of the 2x2
+ * selfadjoint matrix \f$ B = \left ( \begin{array}{cc} \text{this}_{pp} & \text{this}_{pq} \\ (\text{this}_{pq})^* &
+ * \text{this}_{qq} \end{array} \right )\f$ yields a diagonal matrix \f$ A = J^* B J \f$
+ *
+ * Example: \include Jacobi_makeJacobi.cpp
+ * Output: \verbinclude Jacobi_makeJacobi.out
+ *
+ * \sa JacobiRotation::makeJacobi(RealScalar, Scalar, RealScalar), MatrixBase::applyOnTheLeft(),
+ * MatrixBase::applyOnTheRight()
+ */
+template <typename Scalar>
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline bool JacobiRotation<Scalar>::makeJacobi(const MatrixBase<Derived>& m, Index p, Index q) {
+  return makeJacobi(numext::real(m.coeff(p, p)), m.coeff(p, q), numext::real(m.coeff(q, q)));
 }
 
 /** Makes \c *this as a Givens rotation \c G such that applying \f$ G^* \f$ to the left of the vector
-  * \f$ V = \left ( \begin{array}{c} p \\ q \end{array} \right )\f$ yields:
-  * \f$ G^* V = \left ( \begin{array}{c} r \\ 0 \end{array} \right )\f$.
-  *
-  * The value of \a z is returned if \a z is not null (the default is null).
-  * Also note that G is built such that the cosine is always real.
-  *
-  * Example: \include Jacobi_makeGivens.cpp
-  * Output: \verbinclude Jacobi_makeGivens.out
-  *
-  * This function implements the continuous Givens rotation generation algorithm
-  * found in Anderson (2000), Discontinuous Plane Rotations and the Symmetric Eigenvalue Problem.
-  * LAPACK Working Note 150, University of Tennessee, UT-CS-00-454, December 4, 2000.
-  *
-  * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
-  */
-template<typename Scalar>
-void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* z)
-{
-  makeGivens(p, q, z, typename internal::conditional<NumTraits<Scalar>::IsComplex, internal::true_type, internal::false_type>::type());
+ * \f$ V = \left ( \begin{array}{c} p \\ q \end{array} \right )\f$ yields:
+ * \f$ G^* V = \left ( \begin{array}{c} r \\ 0 \end{array} \right )\f$.
+ *
+ * The value of \a r is returned if \a r is not null (the default is null).
+ * Also note that G is built such that the cosine is always real.
+ *
+ * Example: \include Jacobi_makeGivens.cpp
+ * Output: \verbinclude Jacobi_makeGivens.out
+ *
+ * This function implements the continuous Givens rotation generation algorithm
+ * found in Anderson (2000), Discontinuous Plane Rotations and the Symmetric Eigenvalue Problem.
+ * LAPACK Working Note 150, University of Tennessee, UT-CS-00-454, December 4, 2000.
+ *
+ * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
+ */
+template <typename Scalar>
+EIGEN_DEVICE_FUNC void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r) {
+  makeGivens(p, q, r, std::conditional_t<NumTraits<Scalar>::IsComplex, internal::true_type, internal::false_type>());
 }
 
-
 // specialization for complexes
-template<typename Scalar>
-void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type)
-{
-  using std::sqrt;
-  using std::abs;
+template <typename Scalar>
+EIGEN_DEVICE_FUNC void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r,
+                                                          internal::true_type) {
   using numext::conj;
-  
-  if(q==Scalar(0))
-  {
-    m_c = numext::real(p)<0 ? Scalar(-1) : Scalar(1);
+  using std::abs;
+  using std::sqrt;
+
+  if (q == Scalar(0)) {
+    m_c = numext::real(p) < 0 ? Scalar(-1) : Scalar(1);
     m_s = 0;
-    if(r) *r = m_c * p;
-  }
-  else if(p==Scalar(0))
-  {
+    if (r) *r = m_c * p;
+  } else if (p == Scalar(0)) {
     m_c = 0;
-    m_s = -q/abs(q);
-    if(r) *r = abs(q);
-  }
-  else
-  {
+    m_s = -q / abs(q);
+    if (r) *r = abs(q);
+  } else {
     RealScalar p1 = numext::norm1(p);
     RealScalar q1 = numext::norm1(q);
-    if(p1>=q1)
-    {
+    if (p1 >= q1) {
       Scalar ps = p / p1;
       RealScalar p2 = numext::abs2(ps);
       Scalar qs = q / p1;
       RealScalar q2 = numext::abs2(qs);
 
-      RealScalar u = sqrt(RealScalar(1) + q2/p2);
-      if(numext::real(p)<RealScalar(0))
-        u = -u;
+      RealScalar u = sqrt(RealScalar(1) + q2 / p2);
+      if (numext::real(p) < RealScalar(0)) u = -u;
 
-      m_c = Scalar(1)/u;
-      m_s = -qs*conj(ps)*(m_c/p2);
-      if(r) *r = p * u;
-    }
-    else
-    {
+      m_c = Scalar(1) / u;
+      m_s = -qs * conj(ps) * (m_c / p2);
+      if (r) *r = p * u;
+    } else {
       Scalar ps = p / q1;
       RealScalar p2 = numext::abs2(ps);
       Scalar qs = q / q1;
       RealScalar q2 = numext::abs2(qs);
 
       RealScalar u = q1 * sqrt(p2 + q2);
-      if(numext::real(p)<RealScalar(0))
-        u = -u;
+      if (numext::real(p) < RealScalar(0)) u = -u;
 
       p1 = abs(p);
-      ps = p/p1;
-      m_c = p1/u;
-      m_s = -conj(ps) * (q/u);
-      if(r) *r = ps * u;
+      ps = p / p1;
+      m_c = p1 / u;
+      m_s = -conj(ps) * (q / u);
+      if (r) *r = ps * u;
     }
   }
 }
 
 // specialization for reals
-template<typename Scalar>
-void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type)
-{
-  using std::sqrt;
+template <typename Scalar>
+EIGEN_DEVICE_FUNC void JacobiRotation<Scalar>::makeGivens(const Scalar& p, const Scalar& q, Scalar* r,
+                                                          internal::false_type) {
   using std::abs;
-  if(q==Scalar(0))
-  {
-    m_c = p<Scalar(0) ? Scalar(-1) : Scalar(1);
+  using std::sqrt;
+  if (numext::is_exactly_zero(q)) {
+    m_c = p < Scalar(0) ? Scalar(-1) : Scalar(1);
     m_s = Scalar(0);
-    if(r) *r = abs(p);
-  }
-  else if(p==Scalar(0))
-  {
+    if (r) *r = abs(p);
+  } else if (numext::is_exactly_zero(p)) {
     m_c = Scalar(0);
-    m_s = q<Scalar(0) ? Scalar(1) : Scalar(-1);
-    if(r) *r = abs(q);
-  }
-  else if(abs(p) > abs(q))
-  {
-    Scalar t = q/p;
+    m_s = q < Scalar(0) ? Scalar(1) : Scalar(-1);
+    if (r) *r = abs(q);
+  } else if (abs(p) > abs(q)) {
+    Scalar t = q / p;
     Scalar u = sqrt(Scalar(1) + numext::abs2(t));
-    if(p<Scalar(0))
-      u = -u;
-    m_c = Scalar(1)/u;
+    if (p < Scalar(0)) u = -u;
+    m_c = Scalar(1) / u;
     m_s = -t * m_c;
-    if(r) *r = p * u;
-  }
-  else
-  {
-    Scalar t = p/q;
+    if (r) *r = p * u;
+  } else {
+    Scalar t = p / q;
     Scalar u = sqrt(Scalar(1) + numext::abs2(t));
-    if(q<Scalar(0))
-      u = -u;
-    m_s = -Scalar(1)/u;
+    if (q < Scalar(0)) u = -u;
+    m_s = -Scalar(1) / u;
     m_c = -t * m_s;
-    if(r) *r = q * u;
+    if (r) *r = q * u;
   }
-
 }
 
 /****************************************************************************************
-*   Implementation of MatrixBase methods
-****************************************************************************************/
+ *   Implementation of MatrixBase methods
+ ****************************************************************************************/
 
-/** \jacobi_module
-  * Applies the clock wise 2D rotation \a j to the set of 2D vectors of cordinates \a x and \a y:
-  * \f$ \left ( \begin{array}{cc} x \\ y \end{array} \right )  =  J \left ( \begin{array}{cc} x \\ y \end{array} \right ) \f$
-  *
-  * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
-  */
 namespace internal {
-template<typename VectorX, typename VectorY, typename OtherScalar>
-void apply_rotation_in_the_plane(VectorX& _x, VectorY& _y, const JacobiRotation<OtherScalar>& j);
-}
+/** \jacobi_module
+ * Applies the clock wise 2D rotation \a j to the set of 2D vectors of coordinates \a x and \a y:
+ * \f$ \left ( \begin{array}{cc} x \\ y \end{array} \right )  =  J \left ( \begin{array}{cc} x \\ y \end{array} \right )
+ * \f$
+ *
+ * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight()
+ */
+template <typename VectorX, typename VectorY, typename OtherScalar>
+EIGEN_DEVICE_FUNC void apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y,
+                                                   const JacobiRotation<OtherScalar>& j);
+}  // namespace internal
 
 /** \jacobi_module
-  * Applies the rotation in the plane \a j to the rows \a p and \a q of \c *this, i.e., it computes B = J * B,
-  * with \f$ B = \left ( \begin{array}{cc} \text{*this.row}(p) \\ \text{*this.row}(q) \end{array} \right ) \f$.
-  *
-  * \sa class JacobiRotation, MatrixBase::applyOnTheRight(), internal::apply_rotation_in_the_plane()
-  */
-template<typename Derived>
-template<typename OtherScalar>
-inline void MatrixBase<Derived>::applyOnTheLeft(Index p, Index q, const JacobiRotation<OtherScalar>& j)
-{
+ * Applies the rotation in the plane \a j to the rows \a p and \a q of \c *this, i.e., it computes B = J * B,
+ * with \f$ B = \left ( \begin{array}{cc} \text{*this.row}(p) \\ \text{*this.row}(q) \end{array} \right ) \f$.
+ *
+ * \sa class JacobiRotation, MatrixBase::applyOnTheRight(), internal::apply_rotation_in_the_plane()
+ */
+template <typename Derived>
+template <typename OtherScalar>
+EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::applyOnTheLeft(Index p, Index q,
+                                                                  const JacobiRotation<OtherScalar>& j) {
   RowXpr x(this->row(p));
   RowXpr y(this->row(q));
   internal::apply_rotation_in_the_plane(x, y, j);
 }
 
-/** \ingroup Jacobi_Module
-  * Applies the rotation in the plane \a j to the columns \a p and \a q of \c *this, i.e., it computes B = B * J
-  * with \f$ B = \left ( \begin{array}{cc} \text{*this.col}(p) & \text{*this.col}(q) \end{array} \right ) \f$.
-  *
-  * \sa class JacobiRotation, MatrixBase::applyOnTheLeft(), internal::apply_rotation_in_the_plane()
-  */
-template<typename Derived>
-template<typename OtherScalar>
-inline void MatrixBase<Derived>::applyOnTheRight(Index p, Index q, const JacobiRotation<OtherScalar>& j)
-{
+/** \jacobi_module
+ * Applies the rotation in the plane \a j to the columns \a p and \a q of \c *this, i.e., it computes B = B * J
+ * with \f$ B = \left ( \begin{array}{cc} \text{*this.col}(p) & \text{*this.col}(q) \end{array} \right ) \f$.
+ *
+ * \sa class JacobiRotation, MatrixBase::applyOnTheLeft(), internal::apply_rotation_in_the_plane()
+ */
+template <typename Derived>
+template <typename OtherScalar>
+EIGEN_DEVICE_FUNC inline void MatrixBase<Derived>::applyOnTheRight(Index p, Index q,
+                                                                   const JacobiRotation<OtherScalar>& j) {
   ColXpr x(this->col(p));
   ColXpr y(this->col(q));
   internal::apply_rotation_in_the_plane(x, y, j.transpose());
 }
 
 namespace internal {
-template<typename VectorX, typename VectorY, typename OtherScalar>
-void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(VectorX& _x, VectorY& _y, const JacobiRotation<OtherScalar>& j)
-{
-  typedef typename VectorX::Index Index;
-  typedef typename VectorX::Scalar Scalar;
-  enum { PacketSize = packet_traits<Scalar>::size };
-  typedef typename packet_traits<Scalar>::type Packet;
-  eigen_assert(_x.size() == _y.size());
-  Index size = _x.size();
-  Index incrx = _x.innerStride();
-  Index incry = _y.innerStride();
-
-  Scalar* EIGEN_RESTRICT x = &_x.coeffRef(0);
-  Scalar* EIGEN_RESTRICT y = &_y.coeffRef(0);
-  
-  OtherScalar c = j.c();
-  OtherScalar s = j.s();
-  if (c==OtherScalar(1) && s==OtherScalar(0))
-    return;
-
-  /*** dynamic-size vectorized paths ***/
-
-  if(VectorX::SizeAtCompileTime == Dynamic &&
-    (VectorX::Flags & VectorY::Flags & PacketAccessBit) &&
-    ((incrx==1 && incry==1) || PacketSize == 1))
-  {
-    // both vectors are sequentially stored in memory => vectorization
-    enum { Peeling = 2 };
-
-    Index alignedStart = internal::first_aligned(y, size);
-    Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize;
-
-    const Packet pc = pset1<Packet>(c);
-    const Packet ps = pset1<Packet>(s);
-    conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex,false> pcj;
-
-    for(Index i=0; i<alignedStart; ++i)
-    {
-      Scalar xi = x[i];
-      Scalar yi = y[i];
-      x[i] =  c * xi + numext::conj(s) * yi;
-      y[i] = -s * xi + numext::conj(c) * yi;
+
+template <typename Scalar, typename OtherScalar, int SizeAtCompileTime, int MinAlignment, bool Vectorizable>
+struct apply_rotation_in_the_plane_selector {
+  static EIGEN_DEVICE_FUNC inline void run(Scalar* x, Index incrx, Scalar* y, Index incry, Index size, OtherScalar c,
+                                           OtherScalar s) {
+    for (Index i = 0; i < size; ++i) {
+      Scalar xi = *x;
+      Scalar yi = *y;
+      *x = c * xi + numext::conj(s) * yi;
+      *y = -s * xi + numext::conj(c) * yi;
+      x += incrx;
+      y += incry;
     }
+  }
+};
 
-    Scalar* EIGEN_RESTRICT px = x + alignedStart;
-    Scalar* EIGEN_RESTRICT py = y + alignedStart;
+template <typename Scalar, typename OtherScalar, int SizeAtCompileTime, int MinAlignment>
+struct apply_rotation_in_the_plane_selector<Scalar, OtherScalar, SizeAtCompileTime, MinAlignment,
+                                            true /* vectorizable */> {
+  static inline void run(Scalar* x, Index incrx, Scalar* y, Index incry, Index size, OtherScalar c, OtherScalar s) {
+    typedef typename packet_traits<Scalar>::type Packet;
+    typedef typename packet_traits<OtherScalar>::type OtherPacket;
+
+    constexpr int RequiredAlignment =
+        (std::max)(unpacket_traits<Packet>::alignment, unpacket_traits<OtherPacket>::alignment);
+    constexpr Index PacketSize = packet_traits<Scalar>::size;
+
+    /*** dynamic-size vectorized paths ***/
+    if (size >= 2 * PacketSize && SizeAtCompileTime == Dynamic && ((incrx == 1 && incry == 1) || PacketSize == 1)) {
+      // both vectors are sequentially stored in memory => vectorization
+      constexpr Index Peeling = 2;
+
+      Index alignedStart = internal::first_default_aligned(y, size);
+      Index alignedEnd = alignedStart + ((size - alignedStart) / PacketSize) * PacketSize;
+
+      const OtherPacket pc = pset1<OtherPacket>(c);
+      const OtherPacket ps = pset1<OtherPacket>(s);
+      conj_helper<OtherPacket, Packet, NumTraits<OtherScalar>::IsComplex, false> pcj;
+      conj_helper<OtherPacket, Packet, false, false> pm;
+
+      for (Index i = 0; i < alignedStart; ++i) {
+        Scalar xi = x[i];
+        Scalar yi = y[i];
+        x[i] = c * xi + numext::conj(s) * yi;
+        y[i] = -s * xi + numext::conj(c) * yi;
+      }
+
+      Scalar* EIGEN_RESTRICT px = x + alignedStart;
+      Scalar* EIGEN_RESTRICT py = y + alignedStart;
+
+      if (internal::first_default_aligned(x, size) == alignedStart) {
+        for (Index i = alignedStart; i < alignedEnd; i += PacketSize) {
+          Packet xi = pload<Packet>(px);
+          Packet yi = pload<Packet>(py);
+          pstore(px, padd(pm.pmul(pc, xi), pcj.pmul(ps, yi)));
+          pstore(py, psub(pcj.pmul(pc, yi), pm.pmul(ps, xi)));
+          px += PacketSize;
+          py += PacketSize;
+        }
+      } else {
+        Index peelingEnd = alignedStart + ((size - alignedStart) / (Peeling * PacketSize)) * (Peeling * PacketSize);
+        for (Index i = alignedStart; i < peelingEnd; i += Peeling * PacketSize) {
+          Packet xi = ploadu<Packet>(px);
+          Packet xi1 = ploadu<Packet>(px + PacketSize);
+          Packet yi = pload<Packet>(py);
+          Packet yi1 = pload<Packet>(py + PacketSize);
+          pstoreu(px, padd(pm.pmul(pc, xi), pcj.pmul(ps, yi)));
+          pstoreu(px + PacketSize, padd(pm.pmul(pc, xi1), pcj.pmul(ps, yi1)));
+          pstore(py, psub(pcj.pmul(pc, yi), pm.pmul(ps, xi)));
+          pstore(py + PacketSize, psub(pcj.pmul(pc, yi1), pm.pmul(ps, xi1)));
+          px += Peeling * PacketSize;
+          py += Peeling * PacketSize;
+        }
+        if (alignedEnd != peelingEnd) {
+          Packet xi = ploadu<Packet>(x + peelingEnd);
+          Packet yi = pload<Packet>(y + peelingEnd);
+          pstoreu(x + peelingEnd, padd(pm.pmul(pc, xi), pcj.pmul(ps, yi)));
+          pstore(y + peelingEnd, psub(pcj.pmul(pc, yi), pm.pmul(ps, xi)));
+        }
+      }
 
-    if(internal::first_aligned(x, size)==alignedStart)
-    {
-      for(Index i=alignedStart; i<alignedEnd; i+=PacketSize)
-      {
+      for (Index i = alignedEnd; i < size; ++i) {
+        Scalar xi = x[i];
+        Scalar yi = y[i];
+        x[i] = c * xi + numext::conj(s) * yi;
+        y[i] = -s * xi + numext::conj(c) * yi;
+      }
+    }
+
+    /*** fixed-size vectorized path ***/
+    else if (SizeAtCompileTime != Dynamic && MinAlignment >= RequiredAlignment) {
+      const OtherPacket pc = pset1<OtherPacket>(c);
+      const OtherPacket ps = pset1<OtherPacket>(s);
+      conj_helper<OtherPacket, Packet, NumTraits<OtherScalar>::IsComplex, false> pcj;
+      conj_helper<OtherPacket, Packet, false, false> pm;
+      Scalar* EIGEN_RESTRICT px = x;
+      Scalar* EIGEN_RESTRICT py = y;
+      for (Index i = 0; i < size; i += PacketSize) {
         Packet xi = pload<Packet>(px);
         Packet yi = pload<Packet>(py);
-        pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
-        pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
+        pstore(px, padd(pm.pmul(pc, xi), pcj.pmul(ps, yi)));
+        pstore(py, psub(pcj.pmul(pc, yi), pm.pmul(ps, xi)));
         px += PacketSize;
         py += PacketSize;
       }
     }
-    else
-    {
-      Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize);
-      for(Index i=alignedStart; i<peelingEnd; i+=Peeling*PacketSize)
-      {
-        Packet xi   = ploadu<Packet>(px);
-        Packet xi1  = ploadu<Packet>(px+PacketSize);
-        Packet yi   = pload <Packet>(py);
-        Packet yi1  = pload <Packet>(py+PacketSize);
-        pstoreu(px, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
-        pstoreu(px+PacketSize, padd(pmul(pc,xi1),pcj.pmul(ps,yi1)));
-        pstore (py, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
-        pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pmul(ps,xi1)));
-        px += Peeling*PacketSize;
-        py += Peeling*PacketSize;
-      }
-      if(alignedEnd!=peelingEnd)
-      {
-        Packet xi = ploadu<Packet>(x+peelingEnd);
-        Packet yi = pload <Packet>(y+peelingEnd);
-        pstoreu(x+peelingEnd, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
-        pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
-      }
-    }
 
-    for(Index i=alignedEnd; i<size; ++i)
-    {
-      Scalar xi = x[i];
-      Scalar yi = y[i];
-      x[i] =  c * xi + numext::conj(s) * yi;
-      y[i] = -s * xi + numext::conj(c) * yi;
+    /*** non-vectorized path ***/
+    else {
+      apply_rotation_in_the_plane_selector<Scalar, OtherScalar, SizeAtCompileTime, MinAlignment, false>::run(
+          x, incrx, y, incry, size, c, s);
     }
   }
+};
 
-  /*** fixed-size vectorized path ***/
-  else if(VectorX::SizeAtCompileTime != Dynamic &&
-          (VectorX::Flags & VectorY::Flags & PacketAccessBit) &&
-          (VectorX::Flags & VectorY::Flags & AlignedBit))
-  {
-    const Packet pc = pset1<Packet>(c);
-    const Packet ps = pset1<Packet>(s);
-    conj_helper<Packet,Packet,NumTraits<Scalar>::IsComplex,false> pcj;
-    Scalar* EIGEN_RESTRICT px = x;
-    Scalar* EIGEN_RESTRICT py = y;
-    for(Index i=0; i<size; i+=PacketSize)
-    {
-      Packet xi = pload<Packet>(px);
-      Packet yi = pload<Packet>(py);
-      pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi)));
-      pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi)));
-      px += PacketSize;
-      py += PacketSize;
-    }
-  }
+template <typename VectorX, typename VectorY, typename OtherScalar>
+EIGEN_DEVICE_FUNC void inline apply_rotation_in_the_plane(DenseBase<VectorX>& xpr_x, DenseBase<VectorY>& xpr_y,
+                                                          const JacobiRotation<OtherScalar>& j) {
+  typedef typename VectorX::Scalar Scalar;
+  constexpr bool Vectorizable = (int(evaluator<VectorX>::Flags) & int(evaluator<VectorY>::Flags) & PacketAccessBit) &&
+                                (int(packet_traits<Scalar>::size) == int(packet_traits<OtherScalar>::size));
 
-  /*** non-vectorized path ***/
-  else
-  {
-    for(Index i=0; i<size; ++i)
-    {
-      Scalar xi = *x;
-      Scalar yi = *y;
-      *x =  c * xi + numext::conj(s) * yi;
-      *y = -s * xi + numext::conj(c) * yi;
-      x += incrx;
-      y += incry;
-    }
-  }
+  eigen_assert(xpr_x.size() == xpr_y.size());
+  Index size = xpr_x.size();
+  Index incrx = xpr_x.derived().innerStride();
+  Index incry = xpr_y.derived().innerStride();
+
+  Scalar* EIGEN_RESTRICT x = &xpr_x.derived().coeffRef(0);
+  Scalar* EIGEN_RESTRICT y = &xpr_y.derived().coeffRef(0);
+
+  OtherScalar c = j.c();
+  OtherScalar s = j.s();
+  if (numext::is_exactly_one(c) && numext::is_exactly_zero(s)) return;
+
+  constexpr int Alignment = (std::min)(int(evaluator<VectorX>::Alignment), int(evaluator<VectorY>::Alignment));
+  apply_rotation_in_the_plane_selector<Scalar, OtherScalar, VectorX::SizeAtCompileTime, Alignment, Vectorizable>::run(
+      x, incrx, y, incry, size, c, s);
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_JACOBI_H
+#endif  // EIGEN_JACOBI_H
diff --git a/inst/include/Eigen/src/KLUSupport/InternalHeaderCheck.h b/inst/include/Eigen/src/KLUSupport/InternalHeaderCheck.h
new file mode 100644
index 00000000..eb1d6715
--- /dev/null
+++ b/inst/include/Eigen/src/KLUSupport/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_KLUSUPPORT_MODULE_H
+#error "Please include Eigen/KLUSupport instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/KLUSupport/KLUSupport.h b/inst/include/Eigen/src/KLUSupport/KLUSupport.h
new file mode 100644
index 00000000..21324ab7
--- /dev/null
+++ b/inst/include/Eigen/src/KLUSupport/KLUSupport.h
@@ -0,0 +1,339 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Kyle Macfarlan <kyle.macfarlan@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_KLUSUPPORT_H
+#define EIGEN_KLUSUPPORT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/* TODO extract L, extract U, compute det, etc... */
+
+/** \ingroup KLUSupport_Module
+ * \brief A sparse LU factorization and solver based on KLU
+ *
+ * This class allows to solve for A.X = B sparse linear problems via a LU factorization
+ * using the KLU library. The sparse matrix A must be squared and full rank.
+ * The vectors or matrices X and B can be either dense or sparse.
+ *
+ * \warning The input matrix A should be in a \b compressed and \b column-major form.
+ * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix.
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ *
+ * \implsparsesolverconcept
+ *
+ * \sa \ref TutorialSparseSolverConcept, class UmfPackLU, class SparseLU
+ */
+
+inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B[],
+                     klu_common *Common, double) {
+  return klu_solve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), B,
+                   Common);
+}
+
+inline int klu_solve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complex<double> B[],
+                     klu_common *Common, std::complex<double>) {
+  return klu_z_solve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs),
+                     &numext::real_ref(B[0]), Common);
+}
+
+inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, double B[],
+                      klu_common *Common, double) {
+  return klu_tsolve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs), B,
+                    Common);
+}
+
+inline int klu_tsolve(klu_symbolic *Symbolic, klu_numeric *Numeric, Index ldim, Index nrhs, std::complex<double> B[],
+                      klu_common *Common, std::complex<double>) {
+  return klu_z_tsolve(Symbolic, Numeric, internal::convert_index<int>(ldim), internal::convert_index<int>(nrhs),
+                      &numext::real_ref(B[0]), 0, Common);
+}
+
+inline klu_numeric *klu_factor(int Ap[], int Ai[], double Ax[], klu_symbolic *Symbolic, klu_common *Common, double) {
+  return klu_factor(Ap, Ai, Ax, Symbolic, Common);
+}
+
+inline klu_numeric *klu_factor(int Ap[], int Ai[], std::complex<double> Ax[], klu_symbolic *Symbolic,
+                               klu_common *Common, std::complex<double>) {
+  return klu_z_factor(Ap, Ai, &numext::real_ref(Ax[0]), Symbolic, Common);
+}
+
+template <typename MatrixType_>
+class KLU : public SparseSolverBase<KLU<MatrixType_> > {
+ protected:
+  typedef SparseSolverBase<KLU<MatrixType_> > Base;
+  using Base::m_isInitialized;
+
+ public:
+  using Base::_solve_impl;
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef Matrix<Scalar, Dynamic, 1> Vector;
+  typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
+  typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
+  typedef SparseMatrix<Scalar> LUMatrixType;
+  typedef SparseMatrix<Scalar, ColMajor, int> KLUMatrixType;
+  typedef Ref<const KLUMatrixType, StandardCompressedFormat> KLUMatrixRef;
+  enum { ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime };
+
+ public:
+  KLU() : m_dummy(0, 0), mp_matrix(m_dummy) { init(); }
+
+  template <typename InputMatrixType>
+  explicit KLU(const InputMatrixType &matrix) : mp_matrix(matrix) {
+    init();
+    compute(matrix);
+  }
+
+  ~KLU() {
+    if (m_symbolic) klu_free_symbolic(&m_symbolic, &m_common);
+    if (m_numeric) klu_free_numeric(&m_numeric, &m_common);
+  }
+
+  constexpr Index rows() const noexcept { return mp_matrix.rows(); }
+  constexpr Index cols() const noexcept { return mp_matrix.cols(); }
+
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful,
+   *          \c NumericalIssue if the matrix.appears to be negative.
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+    return m_info;
+  }
+#if 0  // not implemented yet
+    inline const LUMatrixType& matrixL() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_l;
+    }
+
+    inline const LUMatrixType& matrixU() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_u;
+    }
+
+    inline const IntColVectorType& permutationP() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_p;
+    }
+
+    inline const IntRowVectorType& permutationQ() const
+    {
+      if (m_extractedDataAreDirty) extractData();
+      return m_q;
+    }
+#endif
+  /** Computes the sparse Cholesky decomposition of \a matrix
+   *  Note that the matrix should be column-major, and in compressed format for best performance.
+   *  \sa SparseMatrix::makeCompressed().
+   */
+  template <typename InputMatrixType>
+  void compute(const InputMatrixType &matrix) {
+    if (m_symbolic) klu_free_symbolic(&m_symbolic, &m_common);
+    if (m_numeric) klu_free_numeric(&m_numeric, &m_common);
+    grab(matrix.derived());
+    analyzePattern_impl();
+    factorize_impl();
+  }
+
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
+   *
+   * This function is particularly useful when solving for several problems having the same structure.
+   *
+   * \sa factorize(), compute()
+   */
+  template <typename InputMatrixType>
+  void analyzePattern(const InputMatrixType &matrix) {
+    if (m_symbolic) klu_free_symbolic(&m_symbolic, &m_common);
+    if (m_numeric) klu_free_numeric(&m_numeric, &m_common);
+
+    grab(matrix.derived());
+
+    analyzePattern_impl();
+  }
+
+  /** Provides access to the control settings array used by KLU.
+   *
+   * See KLU documentation for details.
+   */
+  inline const klu_common &kluCommon() const { return m_common; }
+
+  /** Provides access to the control settings array used by UmfPack.
+   *
+   * If this array contains NaN's, the default values are used.
+   *
+   * See KLU documentation for details.
+   */
+  inline klu_common &kluCommon() { return m_common; }
+
+  /** Performs a numeric decomposition of \a matrix
+   *
+   * The given matrix must have the same sparsity than the matrix on which the pattern anylysis has been performed.
+   *
+   * \sa analyzePattern(), compute()
+   */
+  template <typename InputMatrixType>
+  void factorize(const InputMatrixType &matrix) {
+    eigen_assert(m_analysisIsOk && "KLU: you must first call analyzePattern()");
+    if (m_numeric) klu_free_numeric(&m_numeric, &m_common);
+
+    grab(matrix.derived());
+
+    factorize_impl();
+  }
+
+  /** \internal */
+  template <typename BDerived, typename XDerived>
+  bool _solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const;
+
+#if 0  // not implemented yet
+    Scalar determinant() const;
+
+    void extractData() const;
+#endif
+
+ protected:
+  void init() {
+    m_info = InvalidInput;
+    m_isInitialized = false;
+    m_numeric = 0;
+    m_symbolic = 0;
+    m_extractedDataAreDirty = true;
+
+    klu_defaults(&m_common);
+  }
+
+  void analyzePattern_impl() {
+    m_info = InvalidInput;
+    m_analysisIsOk = false;
+    m_factorizationIsOk = false;
+    m_symbolic = klu_analyze(internal::convert_index<int>(mp_matrix.rows()),
+                             const_cast<StorageIndex *>(mp_matrix.outerIndexPtr()),
+                             const_cast<StorageIndex *>(mp_matrix.innerIndexPtr()), &m_common);
+    if (m_symbolic) {
+      m_isInitialized = true;
+      m_info = Success;
+      m_analysisIsOk = true;
+      m_extractedDataAreDirty = true;
+    }
+  }
+
+  void factorize_impl() {
+    m_numeric = klu_factor(const_cast<StorageIndex *>(mp_matrix.outerIndexPtr()),
+                           const_cast<StorageIndex *>(mp_matrix.innerIndexPtr()),
+                           const_cast<Scalar *>(mp_matrix.valuePtr()), m_symbolic, &m_common, Scalar());
+
+    m_info = m_numeric ? Success : NumericalIssue;
+    m_factorizationIsOk = m_numeric ? 1 : 0;
+    m_extractedDataAreDirty = true;
+  }
+
+  template <typename MatrixDerived>
+  void grab(const EigenBase<MatrixDerived> &A) {
+    internal::destroy_at(&mp_matrix);
+    internal::construct_at(&mp_matrix, A.derived());
+  }
+
+  void grab(const KLUMatrixRef &A) {
+    if (&(A.derived()) != &mp_matrix) {
+      internal::destroy_at(&mp_matrix);
+      internal::construct_at(&mp_matrix, A);
+    }
+  }
+
+  // cached data to reduce reallocation, etc.
+#if 0  // not implemented yet
+    mutable LUMatrixType m_l;
+    mutable LUMatrixType m_u;
+    mutable IntColVectorType m_p;
+    mutable IntRowVectorType m_q;
+#endif
+
+  KLUMatrixType m_dummy;
+  KLUMatrixRef mp_matrix;
+
+  klu_numeric *m_numeric;
+  klu_symbolic *m_symbolic;
+  klu_common m_common;
+  mutable ComputationInfo m_info;
+  int m_factorizationIsOk;
+  int m_analysisIsOk;
+  mutable bool m_extractedDataAreDirty;
+
+ private:
+  KLU(const KLU &) {}
+};
+
+#if 0  // not implemented yet
+template<typename MatrixType>
+void KLU<MatrixType>::extractData() const
+{
+  if (m_extractedDataAreDirty)
+  {
+     eigen_assert(false && "KLU: extractData Not Yet Implemented");
+
+    // get size of the data
+    int lnz, unz, rows, cols, nz_udiag;
+    umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar());
+
+    // allocate data
+    m_l.resize(rows,(std::min)(rows,cols));
+    m_l.resizeNonZeros(lnz);
+
+    m_u.resize((std::min)(rows,cols),cols);
+    m_u.resizeNonZeros(unz);
+
+    m_p.resize(rows);
+    m_q.resize(cols);
+
+    // extract
+    umfpack_get_numeric(m_l.outerIndexPtr(), m_l.innerIndexPtr(), m_l.valuePtr(),
+                        m_u.outerIndexPtr(), m_u.innerIndexPtr(), m_u.valuePtr(),
+                        m_p.data(), m_q.data(), 0, 0, 0, m_numeric);
+
+    m_extractedDataAreDirty = false;
+  }
+}
+
+template<typename MatrixType>
+typename KLU<MatrixType>::Scalar KLU<MatrixType>::determinant() const
+{
+  eigen_assert(false && "KLU: extractData Not Yet Implemented");
+  return Scalar();
+}
+#endif
+
+template <typename MatrixType>
+template <typename BDerived, typename XDerived>
+bool KLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const {
+  Index rhsCols = b.cols();
+  EIGEN_STATIC_ASSERT((XDerived::Flags & RowMajorBit) == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+  eigen_assert(m_factorizationIsOk &&
+               "The decomposition is not in a valid state for solving, you must first call either compute() or "
+               "analyzePattern()/factorize()");
+
+  x = b;
+  int info = klu_solve(m_symbolic, m_numeric, b.rows(), rhsCols, x.const_cast_derived().data(),
+                       const_cast<klu_common *>(&m_common), Scalar());
+
+  m_info = info != 0 ? Success : NumericalIssue;
+  return true;
+}
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_KLUSUPPORT_H
diff --git a/inst/include/Eigen/src/LU/Determinant.h b/inst/include/Eigen/src/LU/Determinant.h
index bb8e78a8..ae4fee38 100644
--- a/inst/include/Eigen/src/LU/Determinant.h
+++ b/inst/include/Eigen/src/LU/Determinant.h
@@ -10,92 +10,89 @@
 #ifndef EIGEN_DETERMINANT_H
 #define EIGEN_DETERMINANT_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-namespace internal {
+namespace Eigen {
 
-template<typename Derived>
-inline const typename Derived::Scalar bruteforce_det3_helper
-(const MatrixBase<Derived>& matrix, int a, int b, int c)
-{
-  return matrix.coeff(0,a)
-         * (matrix.coeff(1,b) * matrix.coeff(2,c) - matrix.coeff(1,c) * matrix.coeff(2,b));
-}
+namespace internal {
 
-template<typename Derived>
-const typename Derived::Scalar bruteforce_det4_helper
-(const MatrixBase<Derived>& matrix, int j, int k, int m, int n)
-{
-  return (matrix.coeff(j,0) * matrix.coeff(k,1) - matrix.coeff(k,0) * matrix.coeff(j,1))
-       * (matrix.coeff(m,2) * matrix.coeff(n,3) - matrix.coeff(n,2) * matrix.coeff(m,3));
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline const typename Derived::Scalar bruteforce_det3_helper(const MatrixBase<Derived>& matrix, int a,
+                                                                               int b, int c) {
+  return matrix.coeff(0, a) * (matrix.coeff(1, b) * matrix.coeff(2, c) - matrix.coeff(1, c) * matrix.coeff(2, b));
 }
 
-template<typename Derived,
-         int DeterminantType = Derived::RowsAtCompileTime
-> struct determinant_impl
-{
-  static inline typename traits<Derived>::Scalar run(const Derived& m)
-  {
-    if(Derived::ColsAtCompileTime==Dynamic && m.rows()==0)
-      return typename traits<Derived>::Scalar(1);
+template <typename Derived, int DeterminantType = Derived::RowsAtCompileTime>
+struct determinant_impl {
+  static inline typename traits<Derived>::Scalar run(const Derived& m) {
+    if (Derived::ColsAtCompileTime == Dynamic && m.rows() == 0) return typename traits<Derived>::Scalar(1);
     return m.partialPivLu().determinant();
   }
 };
 
-template<typename Derived> struct determinant_impl<Derived, 1>
-{
-  static inline typename traits<Derived>::Scalar run(const Derived& m)
-  {
-    return m.coeff(0,0);
-  }
+template <typename Derived>
+struct determinant_impl<Derived, 1> {
+  static inline EIGEN_DEVICE_FUNC typename traits<Derived>::Scalar run(const Derived& m) { return m.coeff(0, 0); }
 };
 
-template<typename Derived> struct determinant_impl<Derived, 2>
-{
-  static inline typename traits<Derived>::Scalar run(const Derived& m)
-  {
-    return m.coeff(0,0) * m.coeff(1,1) - m.coeff(1,0) * m.coeff(0,1);
+template <typename Derived>
+struct determinant_impl<Derived, 2> {
+  static inline EIGEN_DEVICE_FUNC typename traits<Derived>::Scalar run(const Derived& m) {
+    return m.coeff(0, 0) * m.coeff(1, 1) - m.coeff(1, 0) * m.coeff(0, 1);
   }
 };
 
-template<typename Derived> struct determinant_impl<Derived, 3>
-{
-  static inline typename traits<Derived>::Scalar run(const Derived& m)
-  {
-    return bruteforce_det3_helper(m,0,1,2)
-          - bruteforce_det3_helper(m,1,0,2)
-          + bruteforce_det3_helper(m,2,0,1);
+template <typename Derived>
+struct determinant_impl<Derived, 3> {
+  static inline EIGEN_DEVICE_FUNC typename traits<Derived>::Scalar run(const Derived& m) {
+    return bruteforce_det3_helper(m, 0, 1, 2) - bruteforce_det3_helper(m, 1, 0, 2) + bruteforce_det3_helper(m, 2, 0, 1);
   }
 };
 
-template<typename Derived> struct determinant_impl<Derived, 4>
-{
-  static typename traits<Derived>::Scalar run(const Derived& m)
-  {
-    // trick by Martin Costabel to compute 4x4 det with only 30 muls
-    return bruteforce_det4_helper(m,0,1,2,3)
-          - bruteforce_det4_helper(m,0,2,1,3)
-          + bruteforce_det4_helper(m,0,3,1,2)
-          + bruteforce_det4_helper(m,1,2,0,3)
-          - bruteforce_det4_helper(m,1,3,0,2)
-          + bruteforce_det4_helper(m,2,3,0,1);
+template <typename Derived>
+struct determinant_impl<Derived, 4> {
+  typedef typename traits<Derived>::Scalar Scalar;
+  static EIGEN_DEVICE_FUNC Scalar run(const Derived& m) {
+    Scalar d2_01 = det2(m, 0, 1);
+    Scalar d2_02 = det2(m, 0, 2);
+    Scalar d2_03 = det2(m, 0, 3);
+    Scalar d2_12 = det2(m, 1, 2);
+    Scalar d2_13 = det2(m, 1, 3);
+    Scalar d2_23 = det2(m, 2, 3);
+    Scalar d3_0 = det3(m, 1, d2_23, 2, d2_13, 3, d2_12);
+    Scalar d3_1 = det3(m, 0, d2_23, 2, d2_03, 3, d2_02);
+    Scalar d3_2 = det3(m, 0, d2_13, 1, d2_03, 3, d2_01);
+    Scalar d3_3 = det3(m, 0, d2_12, 1, d2_02, 2, d2_01);
+    return internal::pmadd(static_cast<Scalar>(-m(0, 3)), d3_0, static_cast<Scalar>(m(1, 3) * d3_1)) +
+           internal::pmadd(static_cast<Scalar>(-m(2, 3)), d3_2, static_cast<Scalar>(m(3, 3) * d3_3));
+  }
+
+ protected:
+  static EIGEN_DEVICE_FUNC Scalar det2(const Derived& m, Index i0, Index i1) {
+    return m(i0, 0) * m(i1, 1) - m(i1, 0) * m(i0, 1);
+  }
+
+  static EIGEN_DEVICE_FUNC Scalar det3(const Derived& m, Index i0, const Scalar& d0, Index i1, const Scalar& d1,
+                                       Index i2, const Scalar& d2) {
+    return internal::pmadd(m(i0, 2), d0,
+                           internal::pmadd(static_cast<Scalar>(-m(i1, 2)), d1, static_cast<Scalar>(m(i2, 2) * d2)));
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
 /** \lu_module
-  *
-  * \returns the determinant of this matrix
-  */
-template<typename Derived>
-inline typename internal::traits<Derived>::Scalar MatrixBase<Derived>::determinant() const
-{
+ *
+ * \returns the determinant of this matrix
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline typename internal::traits<Derived>::Scalar MatrixBase<Derived>::determinant() const {
   eigen_assert(rows() == cols());
-  typedef typename internal::nested<Derived,Base::RowsAtCompileTime>::type Nested;
-  return internal::determinant_impl<typename internal::remove_all<Nested>::type>::run(derived());
+  typedef typename internal::nested_eval<Derived, Base::RowsAtCompileTime>::type Nested;
+  return internal::determinant_impl<internal::remove_all_t<Nested>>::run(derived());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_DETERMINANT_H
+#endif  // EIGEN_DETERMINANT_H
diff --git a/inst/include/Eigen/src/LU/FullPivLU.h b/inst/include/Eigen/src/LU/FullPivLU.h
index 26bc7144..786cd76d 100644
--- a/inst/include/Eigen/src/LU/FullPivLU.h
+++ b/inst/include/Eigen/src/LU/FullPivLU.h
@@ -10,479 +10,524 @@
 #ifndef EIGEN_LU_H
 #define EIGEN_LU_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename MatrixType_, typename PermutationIndex_>
+struct traits<FullPivLU<MatrixType_, PermutationIndex_> > : traits<MatrixType_> {
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef PermutationIndex_ StorageIndex;
+  enum { Flags = 0 };
+};
+
+}  // end namespace internal
 
 /** \ingroup LU_Module
-  *
-  * \class FullPivLU
-  *
-  * \brief LU decomposition of a matrix with complete pivoting, and related features
-  *
-  * \param MatrixType the type of the matrix of which we are computing the LU decomposition
-  *
-  * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A is
-  * decomposed as \f$ A = P^{-1} L U Q^{-1} \f$ where L is unit-lower-triangular, U is
-  * upper-triangular, and P and Q are permutation matrices. This is a rank-revealing LU
-  * decomposition. The eigenvalues (diagonal coefficients) of U are sorted in such a way that any
-  * zeros are at the end.
-  *
-  * This decomposition provides the generic approach to solving systems of linear equations, computing
-  * the rank, invertibility, inverse, kernel, and determinant.
-  *
-  * This LU decomposition is very stable and well tested with large matrices. However there are use cases where the SVD
-  * decomposition is inherently more stable and/or flexible. For example, when computing the kernel of a matrix,
-  * working with the SVD allows to select the smallest singular values of the matrix, something that
-  * the LU decomposition doesn't see.
-  *
-  * The data of the LU decomposition can be directly accessed through the methods matrixLU(),
-  * permutationP(), permutationQ().
-  *
-  * As an exemple, here is how the original matrix can be retrieved:
-  * \include class_FullPivLU.cpp
-  * Output: \verbinclude class_FullPivLU.out
-  *
-  * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse()
-  */
-template<typename _MatrixType> class FullPivLU
-{
-  public:
-    typedef _MatrixType MatrixType;
-    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
-      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
-    };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename internal::traits<MatrixType>::StorageKind StorageKind;
-    typedef typename MatrixType::Index Index;
-    typedef typename internal::plain_row_type<MatrixType, Index>::type IntRowVectorType;
-    typedef typename internal::plain_col_type<MatrixType, Index>::type IntColVectorType;
-    typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationQType;
-    typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime> PermutationPType;
-
-    /**
-      * \brief Default Constructor.
-      *
-      * The default constructor is useful in cases in which the user intends to
-      * perform decompositions via LU::compute(const MatrixType&).
-      */
-    FullPivLU();
-
-    /** \brief Default Constructor with memory preallocation
-      *
-      * Like the default constructor but with preallocation of the internal data
-      * according to the specified problem \a size.
-      * \sa FullPivLU()
-      */
-    FullPivLU(Index rows, Index cols);
-
-    /** Constructor.
-      *
-      * \param matrix the matrix of which to compute the LU decomposition.
-      *               It is required to be nonzero.
-      */
-    FullPivLU(const MatrixType& matrix);
-
-    /** Computes the LU decomposition of the given matrix.
-      *
-      * \param matrix the matrix of which to compute the LU decomposition.
-      *               It is required to be nonzero.
-      *
-      * \returns a reference to *this
-      */
-    FullPivLU& compute(const MatrixType& matrix);
-
-    /** \returns the LU decomposition matrix: the upper-triangular part is U, the
-      * unit-lower-triangular part is L (at least for square matrices; in the non-square
-      * case, special care is needed, see the documentation of class FullPivLU).
-      *
-      * \sa matrixL(), matrixU()
-      */
-    inline const MatrixType& matrixLU() const
-    {
-      eigen_assert(m_isInitialized && "LU is not initialized.");
-      return m_lu;
-    }
+ *
+ * \class FullPivLU
+ *
+ * \brief LU decomposition of a matrix with complete pivoting, and related features
+ *
+ * \tparam MatrixType_ the type of the matrix of which we are computing the LU decomposition
+ *
+ * This class represents a LU decomposition of any matrix, with complete pivoting: the matrix A is
+ * decomposed as \f$ A = P^{-1} L U Q^{-1} \f$ where L is unit-lower-triangular, U is
+ * upper-triangular, and P and Q are permutation matrices. This is a rank-revealing LU
+ * decomposition. The eigenvalues (diagonal coefficients) of U are sorted in such a way that any
+ * zeros are at the end.
+ *
+ * This decomposition provides the generic approach to solving systems of linear equations, computing
+ * the rank, invertibility, inverse, kernel, and determinant.
+ *
+ * This LU decomposition is very stable and well tested with large matrices. However there are use cases where the SVD
+ * decomposition is inherently more stable and/or flexible. For example, when computing the kernel of a matrix,
+ * working with the SVD allows to select the smallest singular values of the matrix, something that
+ * the LU decomposition doesn't see.
+ *
+ * The data of the LU decomposition can be directly accessed through the methods matrixLU(),
+ * permutationP(), permutationQ().
+ *
+ * As an example, here is how the original matrix can be retrieved:
+ * \include class_FullPivLU.cpp
+ * Output: \verbinclude class_FullPivLU.out
+ *
+ * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+ *
+ * \sa MatrixBase::fullPivLu(), MatrixBase::determinant(), MatrixBase::inverse()
+ */
+template <typename MatrixType_, typename PermutationIndex_>
+class FullPivLU : public SolverBase<FullPivLU<MatrixType_, PermutationIndex_> > {
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef SolverBase<FullPivLU> Base;
+  friend class SolverBase<FullPivLU>;
+
+  EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivLU)
+  enum {
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+  using PermutationIndex = PermutationIndex_;
+  typedef typename internal::plain_row_type<MatrixType, PermutationIndex>::type IntRowVectorType;
+  typedef typename internal::plain_col_type<MatrixType, PermutationIndex>::type IntColVectorType;
+  typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime, PermutationIndex> PermutationQType;
+  typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime, PermutationIndex> PermutationPType;
+  typedef typename MatrixType::PlainObject PlainObject;
+
+  /** \brief Reports whether the LU factorization was successful.
+   *
+   * \note This function always returns \c Success. It is provided for compatibility
+   * with other factorization routines.
+   * \returns \c Success
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "FullPivLU is not initialized.");
+    return Success;
+  }
 
-    /** \returns the number of nonzero pivots in the LU decomposition.
-      * Here nonzero is meant in the exact sense, not in a fuzzy sense.
-      * So that notion isn't really intrinsically interesting, but it is
-      * still useful when implementing algorithms.
-      *
-      * \sa rank()
-      */
-    inline Index nonzeroPivots() const
-    {
-      eigen_assert(m_isInitialized && "LU is not initialized.");
-      return m_nonzero_pivots;
-    }
+  /**
+   * \brief Default Constructor.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via LU::compute(const MatrixType&).
+   */
+  FullPivLU();
+
+  /** \brief Default Constructor with memory preallocation
+   *
+   * Like the default constructor but with preallocation of the internal data
+   * according to the specified problem \a size.
+   * \sa FullPivLU()
+   */
+  FullPivLU(Index rows, Index cols);
+
+  /** Constructor.
+   *
+   * \param matrix the matrix of which to compute the LU decomposition.
+   *               It is required to be nonzero.
+   */
+  template <typename InputType>
+  explicit FullPivLU(const EigenBase<InputType>& matrix);
+
+  /** \brief Constructs a LU factorization from a given matrix
+   *
+   * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c
+   * MatrixType is a Eigen::Ref.
+   *
+   * \sa FullPivLU(const EigenBase&)
+   */
+  template <typename InputType>
+  explicit FullPivLU(EigenBase<InputType>& matrix);
+
+  /** Computes the LU decomposition of the given matrix.
+   *
+   * \param matrix the matrix of which to compute the LU decomposition.
+   *               It is required to be nonzero.
+   *
+   * \returns a reference to *this
+   */
+  template <typename InputType>
+  FullPivLU& compute(const EigenBase<InputType>& matrix) {
+    m_lu = matrix.derived();
+    computeInPlace();
+    return *this;
+  }
 
-    /** \returns the absolute value of the biggest pivot, i.e. the biggest
-      *          diagonal coefficient of U.
-      */
-    RealScalar maxPivot() const { return m_maxpivot; }
-
-    /** \returns the permutation matrix P
-      *
-      * \sa permutationQ()
-      */
-    inline const PermutationPType& permutationP() const
-    {
-      eigen_assert(m_isInitialized && "LU is not initialized.");
-      return m_p;
-    }
+  /** \returns the LU decomposition matrix: the upper-triangular part is U, the
+   * unit-lower-triangular part is L (at least for square matrices; in the non-square
+   * case, special care is needed, see the documentation of class FullPivLU).
+   *
+   * \sa matrixL(), matrixU()
+   */
+  inline const MatrixType& matrixLU() const {
+    eigen_assert(m_isInitialized && "LU is not initialized.");
+    return m_lu;
+  }
 
-    /** \returns the permutation matrix Q
-      *
-      * \sa permutationP()
-      */
-    inline const PermutationQType& permutationQ() const
-    {
-      eigen_assert(m_isInitialized && "LU is not initialized.");
-      return m_q;
-    }
+  /** \returns the number of nonzero pivots in the LU decomposition.
+   * Here nonzero is meant in the exact sense, not in a fuzzy sense.
+   * So that notion isn't really intrinsically interesting, but it is
+   * still useful when implementing algorithms.
+   *
+   * \sa rank()
+   */
+  inline Index nonzeroPivots() const {
+    eigen_assert(m_isInitialized && "LU is not initialized.");
+    return m_nonzero_pivots;
+  }
 
-    /** \returns the kernel of the matrix, also called its null-space. The columns of the returned matrix
-      * will form a basis of the kernel.
-      *
-      * \note If the kernel has dimension zero, then the returned matrix is a column-vector filled with zeros.
-      *
-      * \note This method has to determine which pivots should be considered nonzero.
-      *       For that, it uses the threshold value that you can control by calling
-      *       setThreshold(const RealScalar&).
-      *
-      * Example: \include FullPivLU_kernel.cpp
-      * Output: \verbinclude FullPivLU_kernel.out
-      *
-      * \sa image()
-      */
-    inline const internal::kernel_retval<FullPivLU> kernel() const
-    {
-      eigen_assert(m_isInitialized && "LU is not initialized.");
-      return internal::kernel_retval<FullPivLU>(*this);
-    }
+  /** \returns the absolute value of the biggest pivot, i.e. the biggest
+   *          diagonal coefficient of U.
+   */
+  RealScalar maxPivot() const { return m_maxpivot; }
+
+  /** \returns the permutation matrix P
+   *
+   * \sa permutationQ()
+   */
+  EIGEN_DEVICE_FUNC inline const PermutationPType& permutationP() const {
+    eigen_assert(m_isInitialized && "LU is not initialized.");
+    return m_p;
+  }
 
-    /** \returns the image of the matrix, also called its column-space. The columns of the returned matrix
-      * will form a basis of the kernel.
-      *
-      * \param originalMatrix the original matrix, of which *this is the LU decomposition.
-      *                       The reason why it is needed to pass it here, is that this allows
-      *                       a large optimization, as otherwise this method would need to reconstruct it
-      *                       from the LU decomposition.
-      *
-      * \note If the image has dimension zero, then the returned matrix is a column-vector filled with zeros.
-      *
-      * \note This method has to determine which pivots should be considered nonzero.
-      *       For that, it uses the threshold value that you can control by calling
-      *       setThreshold(const RealScalar&).
-      *
-      * Example: \include FullPivLU_image.cpp
-      * Output: \verbinclude FullPivLU_image.out
-      *
-      * \sa kernel()
-      */
-    inline const internal::image_retval<FullPivLU>
-      image(const MatrixType& originalMatrix) const
-    {
-      eigen_assert(m_isInitialized && "LU is not initialized.");
-      return internal::image_retval<FullPivLU>(*this, originalMatrix);
-    }
+  /** \returns the permutation matrix Q
+   *
+   * \sa permutationP()
+   */
+  inline const PermutationQType& permutationQ() const {
+    eigen_assert(m_isInitialized && "LU is not initialized.");
+    return m_q;
+  }
 
-    /** \return a solution x to the equation Ax=b, where A is the matrix of which
-      * *this is the LU decomposition.
-      *
-      * \param b the right-hand-side of the equation to solve. Can be a vector or a matrix,
-      *          the only requirement in order for the equation to make sense is that
-      *          b.rows()==A.rows(), where A is the matrix of which *this is the LU decomposition.
-      *
-      * \returns a solution.
-      *
-      * \note_about_checking_solutions
-      *
-      * \note_about_arbitrary_choice_of_solution
-      * \note_about_using_kernel_to_study_multiple_solutions
-      *
-      * Example: \include FullPivLU_solve.cpp
-      * Output: \verbinclude FullPivLU_solve.out
-      *
-      * \sa TriangularView::solve(), kernel(), inverse()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<FullPivLU, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "LU is not initialized.");
-      return internal::solve_retval<FullPivLU, Rhs>(*this, b.derived());
-    }
+  /** \returns the kernel of the matrix, also called its null-space. The columns of the returned matrix
+   * will form a basis of the kernel.
+   *
+   * \note If the kernel has dimension zero, then the returned matrix is a column-vector filled with zeros.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   *
+   * Example: \include FullPivLU_kernel.cpp
+   * Output: \verbinclude FullPivLU_kernel.out
+   *
+   * \sa image()
+   */
+  inline const internal::kernel_retval<FullPivLU> kernel() const {
+    eigen_assert(m_isInitialized && "LU is not initialized.");
+    return internal::kernel_retval<FullPivLU>(*this);
+  }
 
-    /** \returns the determinant of the matrix of which
-      * *this is the LU decomposition. It has only linear complexity
-      * (that is, O(n) where n is the dimension of the square matrix)
-      * as the LU decomposition has already been computed.
-      *
-      * \note This is only for square matrices.
-      *
-      * \note For fixed-size matrices of size up to 4, MatrixBase::determinant() offers
-      *       optimized paths.
-      *
-      * \warning a determinant can be very big or small, so for matrices
-      * of large enough dimension, there is a risk of overflow/underflow.
-      *
-      * \sa MatrixBase::determinant()
-      */
-    typename internal::traits<MatrixType>::Scalar determinant() const;
-
-    /** Allows to prescribe a threshold to be used by certain methods, such as rank(),
-      * who need to determine when pivots are to be considered nonzero. This is not used for the
-      * LU decomposition itself.
-      *
-      * When it needs to get the threshold value, Eigen calls threshold(). By default, this
-      * uses a formula to automatically determine a reasonable threshold.
-      * Once you have called the present method setThreshold(const RealScalar&),
-      * your value is used instead.
-      *
-      * \param threshold The new value to use as the threshold.
-      *
-      * A pivot will be considered nonzero if its absolute value is strictly greater than
-      *  \f$ \vert pivot \vert \leqslant threshold \times \vert maxpivot \vert \f$
-      * where maxpivot is the biggest pivot.
-      *
-      * If you want to come back to the default behavior, call setThreshold(Default_t)
-      */
-    FullPivLU& setThreshold(const RealScalar& threshold)
-    {
-      m_usePrescribedThreshold = true;
-      m_prescribedThreshold = threshold;
-      return *this;
-    }
+  /** \returns the image of the matrix, also called its column-space. The columns of the returned matrix
+   * will form a basis of the image (column-space).
+   *
+   * \param originalMatrix the original matrix, of which *this is the LU decomposition.
+   *                       The reason why it is needed to pass it here, is that this allows
+   *                       a large optimization, as otherwise this method would need to reconstruct it
+   *                       from the LU decomposition.
+   *
+   * \note If the image has dimension zero, then the returned matrix is a column-vector filled with zeros.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   *
+   * Example: \include FullPivLU_image.cpp
+   * Output: \verbinclude FullPivLU_image.out
+   *
+   * \sa kernel()
+   */
+  inline const internal::image_retval<FullPivLU> image(const MatrixType& originalMatrix) const {
+    eigen_assert(m_isInitialized && "LU is not initialized.");
+    return internal::image_retval<FullPivLU>(*this, originalMatrix);
+  }
 
-    /** Allows to come back to the default behavior, letting Eigen use its default formula for
-      * determining the threshold.
-      *
-      * You should pass the special object Eigen::Default as parameter here.
-      * \code lu.setThreshold(Eigen::Default); \endcode
-      *
-      * See the documentation of setThreshold(const RealScalar&).
-      */
-    FullPivLU& setThreshold(Default_t)
-    {
-      m_usePrescribedThreshold = false;
-      return *this;
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  /** \return a solution x to the equation Ax=b, where A is the matrix of which
+   * *this is the LU decomposition.
+   *
+   * \param b the right-hand-side of the equation to solve. Can be a vector or a matrix,
+   *          the only requirement in order for the equation to make sense is that
+   *          b.rows()==A.rows(), where A is the matrix of which *this is the LU decomposition.
+   *
+   * \returns a solution.
+   *
+   * \note_about_checking_solutions
+   *
+   * \note_about_arbitrary_choice_of_solution
+   * \note_about_using_kernel_to_study_multiple_solutions
+   *
+   * Example: \include FullPivLU_solve.cpp
+   * Output: \verbinclude FullPivLU_solve.out
+   *
+   * \sa TriangularView::solve(), kernel(), inverse()
+   */
+  template <typename Rhs>
+  inline const Solve<FullPivLU, Rhs> solve(const MatrixBase<Rhs>& b) const;
+#endif
+
+  /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
+      the LU decomposition.
+    */
+  inline RealScalar rcond() const {
+    eigen_assert(m_isInitialized && "FullPivLU is not initialized.");
+    if (!isInvertible()) {
+      return RealScalar(0);
     }
+    return internal::rcond_estimate_helper(m_l1_norm, *this);
+  }
 
-    /** Returns the threshold that will be used by certain methods such as rank().
-      *
-      * See the documentation of setThreshold(const RealScalar&).
-      */
-    RealScalar threshold() const
-    {
-      eigen_assert(m_isInitialized || m_usePrescribedThreshold);
-      return m_usePrescribedThreshold ? m_prescribedThreshold
-      // this formula comes from experimenting (see "LU precision tuning" thread on the list)
-      // and turns out to be identical to Higham's formula used already in LDLt.
-                                      : NumTraits<Scalar>::epsilon() * m_lu.diagonalSize();
-    }
+  /** \returns the determinant of the matrix of which
+   * *this is the LU decomposition. It has only linear complexity
+   * (that is, O(n) where n is the dimension of the square matrix)
+   * as the LU decomposition has already been computed.
+   *
+   * \note This is only for square matrices.
+   *
+   * \note For fixed-size matrices of size up to 4, MatrixBase::determinant() offers
+   *       optimized paths.
+   *
+   * \warning a determinant can be very big or small, so for matrices
+   * of large enough dimension, there is a risk of overflow/underflow.
+   *
+   * \sa MatrixBase::determinant()
+   */
+  typename internal::traits<MatrixType>::Scalar determinant() const;
+
+  /** Allows to prescribe a threshold to be used by certain methods, such as rank(),
+   * who need to determine when pivots are to be considered nonzero. This is not used for the
+   * LU decomposition itself.
+   *
+   * When it needs to get the threshold value, Eigen calls threshold(). By default, this
+   * uses a formula to automatically determine a reasonable threshold.
+   * Once you have called the present method setThreshold(const RealScalar&),
+   * your value is used instead.
+   *
+   * \param threshold The new value to use as the threshold.
+   *
+   * A pivot will be considered nonzero if its absolute value is strictly greater than
+   *  \f$ \vert pivot \vert \leqslant threshold \times \vert maxpivot \vert \f$
+   * where maxpivot is the biggest pivot.
+   *
+   * If you want to come back to the default behavior, call setThreshold(Default_t)
+   */
+  FullPivLU& setThreshold(const RealScalar& threshold) {
+    m_usePrescribedThreshold = true;
+    m_prescribedThreshold = threshold;
+    return *this;
+  }
 
-    /** \returns the rank of the matrix of which *this is the LU decomposition.
-      *
-      * \note This method has to determine which pivots should be considered nonzero.
-      *       For that, it uses the threshold value that you can control by calling
-      *       setThreshold(const RealScalar&).
-      */
-    inline Index rank() const
-    {
-      using std::abs;
-      eigen_assert(m_isInitialized && "LU is not initialized.");
-      RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold();
-      Index result = 0;
-      for(Index i = 0; i < m_nonzero_pivots; ++i)
-        result += (abs(m_lu.coeff(i,i)) > premultiplied_threshold);
-      return result;
-    }
+  /** Allows to come back to the default behavior, letting Eigen use its default formula for
+   * determining the threshold.
+   *
+   * You should pass the special object Eigen::Default as parameter here.
+   * \code lu.setThreshold(Eigen::Default); \endcode
+   *
+   * See the documentation of setThreshold(const RealScalar&).
+   */
+  FullPivLU& setThreshold(Default_t) {
+    m_usePrescribedThreshold = false;
+    return *this;
+  }
 
-    /** \returns the dimension of the kernel of the matrix of which *this is the LU decomposition.
-      *
-      * \note This method has to determine which pivots should be considered nonzero.
-      *       For that, it uses the threshold value that you can control by calling
-      *       setThreshold(const RealScalar&).
-      */
-    inline Index dimensionOfKernel() const
-    {
-      eigen_assert(m_isInitialized && "LU is not initialized.");
-      return cols() - rank();
-    }
+  /** Returns the threshold that will be used by certain methods such as rank().
+   *
+   * See the documentation of setThreshold(const RealScalar&).
+   */
+  RealScalar threshold() const {
+    eigen_assert(m_isInitialized || m_usePrescribedThreshold);
+    return m_usePrescribedThreshold ? m_prescribedThreshold
+                                    // this formula comes from experimenting (see "LU precision tuning" thread on the
+                                    // list) and turns out to be identical to Higham's formula used already in LDLt.
+                                    : NumTraits<Scalar>::epsilon() * RealScalar(m_lu.diagonalSize());
+  }
 
-    /** \returns true if the matrix of which *this is the LU decomposition represents an injective
-      *          linear map, i.e. has trivial kernel; false otherwise.
-      *
-      * \note This method has to determine which pivots should be considered nonzero.
-      *       For that, it uses the threshold value that you can control by calling
-      *       setThreshold(const RealScalar&).
-      */
-    inline bool isInjective() const
-    {
-      eigen_assert(m_isInitialized && "LU is not initialized.");
-      return rank() == cols();
-    }
+  /** \returns the rank of the matrix of which *this is the LU decomposition.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline Index rank() const {
+    using std::abs;
+    eigen_assert(m_isInitialized && "LU is not initialized.");
+    RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold();
+    Index result = 0;
+    for (Index i = 0; i < m_nonzero_pivots; ++i) result += (abs(m_lu.coeff(i, i)) > premultiplied_threshold);
+    return result;
+  }
 
-    /** \returns true if the matrix of which *this is the LU decomposition represents a surjective
-      *          linear map; false otherwise.
-      *
-      * \note This method has to determine which pivots should be considered nonzero.
-      *       For that, it uses the threshold value that you can control by calling
-      *       setThreshold(const RealScalar&).
-      */
-    inline bool isSurjective() const
-    {
-      eigen_assert(m_isInitialized && "LU is not initialized.");
-      return rank() == rows();
-    }
+  /** \returns the dimension of the kernel of the matrix of which *this is the LU decomposition.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline Index dimensionOfKernel() const {
+    eigen_assert(m_isInitialized && "LU is not initialized.");
+    return cols() - rank();
+  }
 
-    /** \returns true if the matrix of which *this is the LU decomposition is invertible.
-      *
-      * \note This method has to determine which pivots should be considered nonzero.
-      *       For that, it uses the threshold value that you can control by calling
-      *       setThreshold(const RealScalar&).
-      */
-    inline bool isInvertible() const
-    {
-      eigen_assert(m_isInitialized && "LU is not initialized.");
-      return isInjective() && (m_lu.rows() == m_lu.cols());
-    }
+  /** \returns true if the matrix of which *this is the LU decomposition represents an injective
+   *          linear map, i.e. has trivial kernel; false otherwise.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline bool isInjective() const {
+    eigen_assert(m_isInitialized && "LU is not initialized.");
+    return rank() == cols();
+  }
 
-    /** \returns the inverse of the matrix of which *this is the LU decomposition.
-      *
-      * \note If this matrix is not invertible, the returned matrix has undefined coefficients.
-      *       Use isInvertible() to first determine whether this matrix is invertible.
-      *
-      * \sa MatrixBase::inverse()
-      */
-    inline const internal::solve_retval<FullPivLU,typename MatrixType::IdentityReturnType> inverse() const
-    {
-      eigen_assert(m_isInitialized && "LU is not initialized.");
-      eigen_assert(m_lu.rows() == m_lu.cols() && "You can't take the inverse of a non-square matrix!");
-      return internal::solve_retval<FullPivLU,typename MatrixType::IdentityReturnType>
-               (*this, MatrixType::Identity(m_lu.rows(), m_lu.cols()));
-    }
+  /** \returns true if the matrix of which *this is the LU decomposition represents a surjective
+   *          linear map; false otherwise.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline bool isSurjective() const {
+    eigen_assert(m_isInitialized && "LU is not initialized.");
+    return rank() == rows();
+  }
 
-    MatrixType reconstructedMatrix() const;
+  /** \returns true if the matrix of which *this is the LU decomposition is invertible.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline bool isInvertible() const {
+    eigen_assert(m_isInitialized && "LU is not initialized.");
+    return isInjective() && (m_lu.rows() == m_lu.cols());
+  }
 
-    inline Index rows() const { return m_lu.rows(); }
-    inline Index cols() const { return m_lu.cols(); }
+  /** \returns the inverse of the matrix of which *this is the LU decomposition.
+   *
+   * \note If this matrix is not invertible, the returned matrix has undefined coefficients.
+   *       Use isInvertible() to first determine whether this matrix is invertible.
+   *
+   * \sa MatrixBase::inverse()
+   */
+  inline const Inverse<FullPivLU> inverse() const {
+    eigen_assert(m_isInitialized && "LU is not initialized.");
+    eigen_assert(m_lu.rows() == m_lu.cols() && "You can't take the inverse of a non-square matrix!");
+    return Inverse<FullPivLU>(*this);
+  }
 
-  protected:
-    
-    static void check_template_parameters()
-    {
-      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
-    }
-    
-    MatrixType m_lu;
-    PermutationPType m_p;
-    PermutationQType m_q;
-    IntColVectorType m_rowsTranspositions;
-    IntRowVectorType m_colsTranspositions;
-    Index m_det_pq, m_nonzero_pivots;
-    RealScalar m_maxpivot, m_prescribedThreshold;
-    bool m_isInitialized, m_usePrescribedThreshold;
+  MatrixType reconstructedMatrix() const;
+
+  EIGEN_DEVICE_FUNC constexpr Index rows() const noexcept { return m_lu.rows(); }
+  EIGEN_DEVICE_FUNC constexpr Index cols() const noexcept { return m_lu.cols(); }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  template <typename RhsType, typename DstType>
+  void _solve_impl(const RhsType& rhs, DstType& dst) const;
+
+  template <bool Conjugate, typename RhsType, typename DstType>
+  void _solve_impl_transposed(const RhsType& rhs, DstType& dst) const;
+#endif
+
+ protected:
+  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+
+  void computeInPlace();
+
+  MatrixType m_lu;
+  PermutationPType m_p;
+  PermutationQType m_q;
+  IntColVectorType m_rowsTranspositions;
+  IntRowVectorType m_colsTranspositions;
+  Index m_nonzero_pivots;
+  RealScalar m_l1_norm;
+  RealScalar m_maxpivot, m_prescribedThreshold;
+  signed char m_det_pq;
+  bool m_isInitialized, m_usePrescribedThreshold;
 };
 
-template<typename MatrixType>
-FullPivLU<MatrixType>::FullPivLU()
-  : m_isInitialized(false), m_usePrescribedThreshold(false)
-{
+template <typename MatrixType, typename PermutationIndex>
+FullPivLU<MatrixType, PermutationIndex>::FullPivLU() : m_isInitialized(false), m_usePrescribedThreshold(false) {}
+
+template <typename MatrixType, typename PermutationIndex>
+FullPivLU<MatrixType, PermutationIndex>::FullPivLU(Index rows, Index cols)
+    : m_lu(rows, cols),
+      m_p(rows),
+      m_q(cols),
+      m_rowsTranspositions(rows),
+      m_colsTranspositions(cols),
+      m_isInitialized(false),
+      m_usePrescribedThreshold(false) {}
+
+template <typename MatrixType, typename PermutationIndex>
+template <typename InputType>
+FullPivLU<MatrixType, PermutationIndex>::FullPivLU(const EigenBase<InputType>& matrix)
+    : m_lu(matrix.rows(), matrix.cols()),
+      m_p(matrix.rows()),
+      m_q(matrix.cols()),
+      m_rowsTranspositions(matrix.rows()),
+      m_colsTranspositions(matrix.cols()),
+      m_isInitialized(false),
+      m_usePrescribedThreshold(false) {
+  compute(matrix.derived());
 }
 
-template<typename MatrixType>
-FullPivLU<MatrixType>::FullPivLU(Index rows, Index cols)
-  : m_lu(rows, cols),
-    m_p(rows),
-    m_q(cols),
-    m_rowsTranspositions(rows),
-    m_colsTranspositions(cols),
-    m_isInitialized(false),
-    m_usePrescribedThreshold(false)
-{
+template <typename MatrixType, typename PermutationIndex>
+template <typename InputType>
+FullPivLU<MatrixType, PermutationIndex>::FullPivLU(EigenBase<InputType>& matrix)
+    : m_lu(matrix.derived()),
+      m_p(matrix.rows()),
+      m_q(matrix.cols()),
+      m_rowsTranspositions(matrix.rows()),
+      m_colsTranspositions(matrix.cols()),
+      m_isInitialized(false),
+      m_usePrescribedThreshold(false) {
+  computeInPlace();
 }
 
-template<typename MatrixType>
-FullPivLU<MatrixType>::FullPivLU(const MatrixType& matrix)
-  : m_lu(matrix.rows(), matrix.cols()),
-    m_p(matrix.rows()),
-    m_q(matrix.cols()),
-    m_rowsTranspositions(matrix.rows()),
-    m_colsTranspositions(matrix.cols()),
-    m_isInitialized(false),
-    m_usePrescribedThreshold(false)
-{
-  compute(matrix);
-}
+template <typename MatrixType, typename PermutationIndex>
+void FullPivLU<MatrixType, PermutationIndex>::computeInPlace() {
+  eigen_assert(m_lu.rows() <= NumTraits<PermutationIndex>::highest() &&
+               m_lu.cols() <= NumTraits<PermutationIndex>::highest());
 
-template<typename MatrixType>
-FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
-{
-  check_template_parameters();
-  
-  // the permutations are stored as int indices, so just to be sure:
-  eigen_assert(matrix.rows()<=NumTraits<int>::highest() && matrix.cols()<=NumTraits<int>::highest());
-  
-  m_isInitialized = true;
-  m_lu = matrix;
+  m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
 
-  const Index size = matrix.diagonalSize();
-  const Index rows = matrix.rows();
-  const Index cols = matrix.cols();
+  const Index size = m_lu.diagonalSize();
+  const Index rows = m_lu.rows();
+  const Index cols = m_lu.cols();
 
   // will store the transpositions, before we accumulate them at the end.
   // can't accumulate on-the-fly because that will be done in reverse order for the rows.
-  m_rowsTranspositions.resize(matrix.rows());
-  m_colsTranspositions.resize(matrix.cols());
-  Index number_of_transpositions = 0; // number of NONTRIVIAL transpositions, i.e. m_rowsTranspositions[i]!=i
+  m_rowsTranspositions.resize(m_lu.rows());
+  m_colsTranspositions.resize(m_lu.cols());
+  Index number_of_transpositions = 0;  // number of NONTRIVIAL transpositions, i.e. m_rowsTranspositions[i]!=i
 
-  m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case)
+  m_nonzero_pivots = size;  // the generic case is that in which all pivots are nonzero (invertible case)
   m_maxpivot = RealScalar(0);
 
-  for(Index k = 0; k < size; ++k)
-  {
+  for (Index k = 0; k < size; ++k) {
     // First, we need to find the pivot.
 
     // biggest coefficient in the remaining bottom-right corner (starting at row k, col k)
     Index row_of_biggest_in_corner, col_of_biggest_in_corner;
-    RealScalar biggest_in_corner;
-    biggest_in_corner = m_lu.bottomRightCorner(rows-k, cols-k)
-                        .cwiseAbs()
-                        .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
-    row_of_biggest_in_corner += k; // correct the values! since they were computed in the corner,
-    col_of_biggest_in_corner += k; // need to add k to them.
-
-    if(biggest_in_corner==RealScalar(0))
-    {
+    typedef internal::scalar_score_coeff_op<Scalar> Scoring;
+    typedef typename Scoring::result_type Score;
+    Score biggest_in_corner;
+    biggest_in_corner = m_lu.bottomRightCorner(rows - k, cols - k)
+                            .unaryExpr(Scoring())
+                            .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
+    row_of_biggest_in_corner += k;  // correct the values! since they were computed in the corner,
+    col_of_biggest_in_corner += k;  // need to add k to them.
+
+    if (numext::is_exactly_zero(biggest_in_corner)) {
       // before exiting, make sure to initialize the still uninitialized transpositions
       // in a sane state without destroying what we already have.
       m_nonzero_pivots = k;
-      for(Index i = k; i < size; ++i)
-      {
-        m_rowsTranspositions.coeffRef(i) = i;
-        m_colsTranspositions.coeffRef(i) = i;
+      for (Index i = k; i < size; ++i) {
+        m_rowsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
+        m_colsTranspositions.coeffRef(i) = internal::convert_index<StorageIndex>(i);
       }
       break;
     }
 
-    if(biggest_in_corner > m_maxpivot) m_maxpivot = biggest_in_corner;
+    RealScalar abs_pivot = internal::abs_knowing_score<Scalar>()(
+        m_lu(row_of_biggest_in_corner, col_of_biggest_in_corner), biggest_in_corner);
+    if (abs_pivot > m_maxpivot) m_maxpivot = abs_pivot;
 
     // Now that we've found the pivot, we need to apply the row/col swaps to
     // bring it to the location (k,k).
 
-    m_rowsTranspositions.coeffRef(k) = row_of_biggest_in_corner;
-    m_colsTranspositions.coeffRef(k) = col_of_biggest_in_corner;
-    if(k != row_of_biggest_in_corner) {
+    m_rowsTranspositions.coeffRef(k) = internal::convert_index<StorageIndex>(row_of_biggest_in_corner);
+    m_colsTranspositions.coeffRef(k) = internal::convert_index<StorageIndex>(col_of_biggest_in_corner);
+    if (k != row_of_biggest_in_corner) {
       m_lu.row(k).swap(m_lu.row(row_of_biggest_in_corner));
       ++number_of_transpositions;
     }
-    if(k != col_of_biggest_in_corner) {
+    if (k != col_of_biggest_in_corner) {
       m_lu.col(k).swap(m_lu.col(col_of_biggest_in_corner));
       ++number_of_transpositions;
     }
@@ -490,30 +535,28 @@ FullPivLU<MatrixType>& FullPivLU<MatrixType>::compute(const MatrixType& matrix)
     // Now that the pivot is at the right location, we update the remaining
     // bottom-right corner by Gaussian elimination.
 
-    if(k<rows-1)
-      m_lu.col(k).tail(rows-k-1) /= m_lu.coeff(k,k);
-    if(k<size-1)
-      m_lu.block(k+1,k+1,rows-k-1,cols-k-1).noalias() -= m_lu.col(k).tail(rows-k-1) * m_lu.row(k).tail(cols-k-1);
+    if (k < rows - 1) m_lu.col(k).tail(rows - k - 1) /= m_lu.coeff(k, k);
+    if (k < size - 1)
+      m_lu.block(k + 1, k + 1, rows - k - 1, cols - k - 1).noalias() -=
+          m_lu.col(k).tail(rows - k - 1) * m_lu.row(k).tail(cols - k - 1);
   }
 
   // the main loop is over, we still have to accumulate the transpositions to find the
   // permutations P and Q
 
   m_p.setIdentity(rows);
-  for(Index k = size-1; k >= 0; --k)
-    m_p.applyTranspositionOnTheRight(k, m_rowsTranspositions.coeff(k));
+  for (Index k = size - 1; k >= 0; --k) m_p.applyTranspositionOnTheRight(k, m_rowsTranspositions.coeff(k));
 
   m_q.setIdentity(cols);
-  for(Index k = 0; k < size; ++k)
-    m_q.applyTranspositionOnTheRight(k, m_colsTranspositions.coeff(k));
+  for (Index k = 0; k < size; ++k) m_q.applyTranspositionOnTheRight(k, m_colsTranspositions.coeff(k));
 
-  m_det_pq = (number_of_transpositions%2) ? -1 : 1;
-  return *this;
+  m_det_pq = (number_of_transpositions % 2) ? -1 : 1;
+
+  m_isInitialized = true;
 }
 
-template<typename MatrixType>
-typename internal::traits<MatrixType>::Scalar FullPivLU<MatrixType>::determinant() const
-{
+template <typename MatrixType, typename PermutationIndex>
+typename internal::traits<MatrixType>::Scalar FullPivLU<MatrixType, PermutationIndex>::determinant() const {
   eigen_assert(m_isInitialized && "LU is not initialized.");
   eigen_assert(m_lu.rows() == m_lu.cols() && "You can't take the determinant of a non-square matrix!");
   return Scalar(m_det_pq) * Scalar(m_lu.diagonal().prod());
@@ -522,18 +565,15 @@ typename internal::traits<MatrixType>::Scalar FullPivLU<MatrixType>::determinant
 /** \returns the matrix represented by the decomposition,
  * i.e., it returns the product: \f$ P^{-1} L U Q^{-1} \f$.
  * This function is provided for debug purposes. */
-template<typename MatrixType>
-MatrixType FullPivLU<MatrixType>::reconstructedMatrix() const
-{
+template <typename MatrixType, typename PermutationIndex>
+MatrixType FullPivLU<MatrixType, PermutationIndex>::reconstructedMatrix() const {
   eigen_assert(m_isInitialized && "LU is not initialized.");
   const Index smalldim = (std::min)(m_lu.rows(), m_lu.cols());
   // LU
-  MatrixType res(m_lu.rows(),m_lu.cols());
+  MatrixType res(m_lu.rows(), m_lu.cols());
   // FIXME the .toDenseMatrix() should not be needed...
-  res = m_lu.leftCols(smalldim)
-            .template triangularView<UnitLower>().toDenseMatrix()
-      * m_lu.topRows(smalldim)
-            .template triangularView<Upper>().toDenseMatrix();
+  res = m_lu.leftCols(smalldim).template triangularView<UnitLower>().toDenseMatrix() *
+        m_lu.topRows(smalldim).template triangularView<Upper>().toDenseMatrix();
 
   // P^{-1}(LU)
   res = m_p.inverse() * res;
@@ -547,23 +587,21 @@ MatrixType FullPivLU<MatrixType>::reconstructedMatrix() const
 /********* Implementation of kernel() **************************************************/
 
 namespace internal {
-template<typename _MatrixType>
-struct kernel_retval<FullPivLU<_MatrixType> >
-  : kernel_retval_base<FullPivLU<_MatrixType> >
-{
-  EIGEN_MAKE_KERNEL_HELPERS(FullPivLU<_MatrixType>)
-
-  enum { MaxSmallDimAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(
-            MatrixType::MaxColsAtCompileTime,
-            MatrixType::MaxRowsAtCompileTime)
+template <typename MatrixType_, typename PermutationIndex_>
+struct kernel_retval<FullPivLU<MatrixType_, PermutationIndex_> >
+    : kernel_retval_base<FullPivLU<MatrixType_, PermutationIndex_> > {
+  using DecompositionType = FullPivLU<MatrixType_, PermutationIndex_>;
+  EIGEN_MAKE_KERNEL_HELPERS(DecompositionType)
+
+  enum {
+    MaxSmallDimAtCompileTime = min_size_prefer_fixed(MatrixType::MaxColsAtCompileTime, MatrixType::MaxRowsAtCompileTime)
   };
 
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
+  template <typename Dest>
+  void evalTo(Dest& dst) const {
     using std::abs;
     const Index cols = dec().matrixLU().cols(), dimker = cols - rank();
-    if(dimker == 0)
-    {
+    if (dimker == 0) {
       // The Kernel is just {0}, so it doesn't have a basis properly speaking, but let's
       // avoid crashing/asserting as that depends on floating point calculations. Let's
       // just return a single column vector filled with zeros.
@@ -572,83 +610,74 @@ struct kernel_retval<FullPivLU<_MatrixType> >
     }
 
     /* Let us use the following lemma:
-      *
-      * Lemma: If the matrix A has the LU decomposition PAQ = LU,
-      * then Ker A = Q(Ker U).
-      *
-      * Proof: trivial: just keep in mind that P, Q, L are invertible.
-      */
+     *
+     * Lemma: If the matrix A has the LU decomposition PAQ = LU,
+     * then Ker A = Q(Ker U).
+     *
+     * Proof: trivial: just keep in mind that P, Q, L are invertible.
+     */
 
     /* Thus, all we need to do is to compute Ker U, and then apply Q.
-      *
-      * U is upper triangular, with eigenvalues sorted so that any zeros appear at the end.
-      * Thus, the diagonal of U ends with exactly
-      * dimKer zero's. Let us use that to construct dimKer linearly
-      * independent vectors in Ker U.
-      */
+     *
+     * U is upper triangular, with eigenvalues sorted so that any zeros appear at the end.
+     * Thus, the diagonal of U ends with exactly
+     * dimKer zero's. Let us use that to construct dimKer linearly
+     * independent vectors in Ker U.
+     */
 
     Matrix<Index, Dynamic, 1, 0, MaxSmallDimAtCompileTime, 1> pivots(rank());
     RealScalar premultiplied_threshold = dec().maxPivot() * dec().threshold();
     Index p = 0;
-    for(Index i = 0; i < dec().nonzeroPivots(); ++i)
-      if(abs(dec().matrixLU().coeff(i,i)) > premultiplied_threshold)
-        pivots.coeffRef(p++) = i;
+    for (Index i = 0; i < dec().nonzeroPivots(); ++i)
+      if (abs(dec().matrixLU().coeff(i, i)) > premultiplied_threshold) pivots.coeffRef(p++) = i;
     eigen_internal_assert(p == rank());
 
     // we construct a temporaty trapezoid matrix m, by taking the U matrix and
     // permuting the rows and cols to bring the nonnegligible pivots to the top of
     // the main diagonal. We need that to be able to apply our triangular solvers.
     // FIXME when we get triangularView-for-rectangular-matrices, this can be simplified
-    Matrix<typename MatrixType::Scalar, Dynamic, Dynamic, MatrixType::Options,
-           MaxSmallDimAtCompileTime, MatrixType::MaxColsAtCompileTime>
-      m(dec().matrixLU().block(0, 0, rank(), cols));
-    for(Index i = 0; i < rank(); ++i)
-    {
-      if(i) m.row(i).head(i).setZero();
-      m.row(i).tail(cols-i) = dec().matrixLU().row(pivots.coeff(i)).tail(cols-i);
+    Matrix<typename MatrixType::Scalar, Dynamic, Dynamic, traits<MatrixType>::Options, MaxSmallDimAtCompileTime,
+           MatrixType::MaxColsAtCompileTime>
+        m(dec().matrixLU().block(0, 0, rank(), cols));
+    for (Index i = 0; i < rank(); ++i) {
+      if (i) m.row(i).head(i).setZero();
+      m.row(i).tail(cols - i) = dec().matrixLU().row(pivots.coeff(i)).tail(cols - i);
     }
     m.block(0, 0, rank(), rank());
     m.block(0, 0, rank(), rank()).template triangularView<StrictlyLower>().setZero();
-    for(Index i = 0; i < rank(); ++i)
-      m.col(i).swap(m.col(pivots.coeff(i)));
+    for (Index i = 0; i < rank(); ++i) m.col(i).swap(m.col(pivots.coeff(i)));
 
     // ok, we have our trapezoid matrix, we can apply the triangular solver.
     // notice that the math behind this suggests that we should apply this to the
     // negative of the RHS, but for performance we just put the negative sign elsewhere, see below.
-    m.topLeftCorner(rank(), rank())
-     .template triangularView<Upper>().solveInPlace(
-        m.topRightCorner(rank(), dimker)
-      );
+    m.topLeftCorner(rank(), rank()).template triangularView<Upper>().solveInPlace(m.topRightCorner(rank(), dimker));
 
     // now we must undo the column permutation that we had applied!
-    for(Index i = rank()-1; i >= 0; --i)
-      m.col(i).swap(m.col(pivots.coeff(i)));
+    for (Index i = rank() - 1; i >= 0; --i) m.col(i).swap(m.col(pivots.coeff(i)));
 
     // see the negative sign in the next line, that's what we were talking about above.
-    for(Index i = 0; i < rank(); ++i) dst.row(dec().permutationQ().indices().coeff(i)) = -m.row(i).tail(dimker);
-    for(Index i = rank(); i < cols; ++i) dst.row(dec().permutationQ().indices().coeff(i)).setZero();
-    for(Index k = 0; k < dimker; ++k) dst.coeffRef(dec().permutationQ().indices().coeff(rank()+k), k) = Scalar(1);
+    for (Index i = 0; i < rank(); ++i) dst.row(dec().permutationQ().indices().coeff(i)) = -m.row(i).tail(dimker);
+    for (Index i = rank(); i < cols; ++i) dst.row(dec().permutationQ().indices().coeff(i)).setZero();
+    for (Index k = 0; k < dimker; ++k) dst.coeffRef(dec().permutationQ().indices().coeff(rank() + k), k) = Scalar(1);
   }
 };
 
 /***** Implementation of image() *****************************************************/
 
-template<typename _MatrixType>
-struct image_retval<FullPivLU<_MatrixType> >
-  : image_retval_base<FullPivLU<_MatrixType> >
-{
-  EIGEN_MAKE_IMAGE_HELPERS(FullPivLU<_MatrixType>)
+template <typename MatrixType_, typename PermutationIndex_>
+struct image_retval<FullPivLU<MatrixType_, PermutationIndex_> >
+    : image_retval_base<FullPivLU<MatrixType_, PermutationIndex_> > {
+  using DecompositionType = FullPivLU<MatrixType_, PermutationIndex_>;
+  EIGEN_MAKE_IMAGE_HELPERS(DecompositionType)
 
-  enum { MaxSmallDimAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(
-            MatrixType::MaxColsAtCompileTime,
-            MatrixType::MaxRowsAtCompileTime)
+  enum {
+    MaxSmallDimAtCompileTime = min_size_prefer_fixed(MatrixType::MaxColsAtCompileTime, MatrixType::MaxRowsAtCompileTime)
   };
 
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
+  template <typename Dest>
+  void evalTo(Dest& dst) const {
     using std::abs;
-    if(rank() == 0)
-    {
+    if (rank() == 0) {
       // The Image is just {0}, so it doesn't have a basis properly speaking, but let's
       // avoid crashing/asserting as that depends on floating point calculations. Let's
       // just return a single column vector filled with zeros.
@@ -659,93 +688,139 @@ struct image_retval<FullPivLU<_MatrixType> >
     Matrix<Index, Dynamic, 1, 0, MaxSmallDimAtCompileTime, 1> pivots(rank());
     RealScalar premultiplied_threshold = dec().maxPivot() * dec().threshold();
     Index p = 0;
-    for(Index i = 0; i < dec().nonzeroPivots(); ++i)
-      if(abs(dec().matrixLU().coeff(i,i)) > premultiplied_threshold)
-        pivots.coeffRef(p++) = i;
+    for (Index i = 0; i < dec().nonzeroPivots(); ++i)
+      if (abs(dec().matrixLU().coeff(i, i)) > premultiplied_threshold) pivots.coeffRef(p++) = i;
     eigen_internal_assert(p == rank());
 
-    for(Index i = 0; i < rank(); ++i)
+    for (Index i = 0; i < rank(); ++i)
       dst.col(i) = originalMatrix().col(dec().permutationQ().indices().coeff(pivots.coeff(i)));
   }
 };
 
 /***** Implementation of solve() *****************************************************/
 
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<FullPivLU<_MatrixType>, Rhs>
-  : solve_retval_base<FullPivLU<_MatrixType>, Rhs>
-{
-  EIGEN_MAKE_SOLVE_HELPERS(FullPivLU<_MatrixType>,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    /* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1}.
-     * So we proceed as follows:
-     * Step 1: compute c = P * rhs.
-     * Step 2: replace c by the solution x to Lx = c. Exists because L is invertible.
-     * Step 3: replace c by the solution x to Ux = c. May or may not exist.
-     * Step 4: result = Q * c;
-     */
+}  // end namespace internal
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template <typename MatrixType_, typename PermutationIndex_>
+template <typename RhsType, typename DstType>
+void FullPivLU<MatrixType_, PermutationIndex_>::_solve_impl(const RhsType& rhs, DstType& dst) const {
+  /* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1}.
+   * So we proceed as follows:
+   * Step 1: compute c = P * rhs.
+   * Step 2: replace c by the solution x to Lx = c. Exists because L is invertible.
+   * Step 3: replace c by the solution x to Ux = c. May or may not exist.
+   * Step 4: result = Q * c;
+   */
+
+  const Index rows = this->rows(), cols = this->cols(), nonzero_pivots = this->rank();
+  const Index smalldim = (std::min)(rows, cols);
+
+  if (nonzero_pivots == 0) {
+    dst.setZero();
+    return;
+  }
 
-    const Index rows = dec().rows(), cols = dec().cols(),
-              nonzero_pivots = dec().nonzeroPivots();
-    eigen_assert(rhs().rows() == rows);
-    const Index smalldim = (std::min)(rows, cols);
+  typename RhsType::PlainObject c(rhs.rows(), rhs.cols());
 
-    if(nonzero_pivots == 0)
-    {
-      dst.setZero();
-      return;
-    }
+  // Step 1
+  c = permutationP() * rhs;
 
-    typename Rhs::PlainObject c(rhs().rows(), rhs().cols());
-
-    // Step 1
-    c = dec().permutationP() * rhs();
-
-    // Step 2
-    dec().matrixLU()
-        .topLeftCorner(smalldim,smalldim)
-        .template triangularView<UnitLower>()
-        .solveInPlace(c.topRows(smalldim));
-    if(rows>cols)
-    {
-      c.bottomRows(rows-cols)
-        -= dec().matrixLU().bottomRows(rows-cols)
-         * c.topRows(cols);
-    }
+  // Step 2
+  m_lu.topLeftCorner(smalldim, smalldim).template triangularView<UnitLower>().solveInPlace(c.topRows(smalldim));
+  if (rows > cols) c.bottomRows(rows - cols).noalias() -= m_lu.bottomRows(rows - cols) * c.topRows(cols);
+
+  // Step 3
+  m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
+      .template triangularView<Upper>()
+      .solveInPlace(c.topRows(nonzero_pivots));
+
+  // Step 4
+  for (Index i = 0; i < nonzero_pivots; ++i) dst.row(permutationQ().indices().coeff(i)) = c.row(i);
+  for (Index i = nonzero_pivots; i < m_lu.cols(); ++i) dst.row(permutationQ().indices().coeff(i)).setZero();
+}
 
-    // Step 3
-    dec().matrixLU()
-        .topLeftCorner(nonzero_pivots, nonzero_pivots)
-        .template triangularView<Upper>()
-        .solveInPlace(c.topRows(nonzero_pivots));
-
-    // Step 4
-    for(Index i = 0; i < nonzero_pivots; ++i)
-      dst.row(dec().permutationQ().indices().coeff(i)) = c.row(i);
-    for(Index i = nonzero_pivots; i < dec().matrixLU().cols(); ++i)
-      dst.row(dec().permutationQ().indices().coeff(i)).setZero();
+template <typename MatrixType_, typename PermutationIndex_>
+template <bool Conjugate, typename RhsType, typename DstType>
+void FullPivLU<MatrixType_, PermutationIndex_>::_solve_impl_transposed(const RhsType& rhs, DstType& dst) const {
+  /* The decomposition PAQ = LU can be rewritten as A = P^{-1} L U Q^{-1},
+   * and since permutations are real and unitary, we can write this
+   * as   A^T = Q U^T L^T P,
+   * So we proceed as follows:
+   * Step 1: compute c = Q^T rhs.
+   * Step 2: replace c by the solution x to U^T x = c. May or may not exist.
+   * Step 3: replace c by the solution x to L^T x = c.
+   * Step 4: result = P^T c.
+   * If Conjugate is true, replace "^T" by "^*" above.
+   */
+
+  const Index rows = this->rows(), cols = this->cols(), nonzero_pivots = this->rank();
+  const Index smalldim = (std::min)(rows, cols);
+
+  if (nonzero_pivots == 0) {
+    dst.setZero();
+    return;
   }
-};
 
-} // end namespace internal
+  typename RhsType::PlainObject c(rhs.rows(), rhs.cols());
+
+  // Step 1
+  c = permutationQ().inverse() * rhs;
+
+  // Step 2
+  m_lu.topLeftCorner(nonzero_pivots, nonzero_pivots)
+      .template triangularView<Upper>()
+      .transpose()
+      .template conjugateIf<Conjugate>()
+      .solveInPlace(c.topRows(nonzero_pivots));
+
+  // Step 3
+  m_lu.topLeftCorner(smalldim, smalldim)
+      .template triangularView<UnitLower>()
+      .transpose()
+      .template conjugateIf<Conjugate>()
+      .solveInPlace(c.topRows(smalldim));
+
+  // Step 4
+  PermutationPType invp = permutationP().inverse().eval();
+  for (Index i = 0; i < smalldim; ++i) dst.row(invp.indices().coeff(i)) = c.row(i);
+  for (Index i = smalldim; i < rows; ++i) dst.row(invp.indices().coeff(i)).setZero();
+}
+
+#endif
+
+namespace internal {
+
+/***** Implementation of inverse() *****************************************************/
+template <typename DstXprType, typename MatrixType, typename PermutationIndex>
+struct Assignment<
+    DstXprType, Inverse<FullPivLU<MatrixType, PermutationIndex> >,
+    internal::assign_op<typename DstXprType::Scalar, typename FullPivLU<MatrixType, PermutationIndex>::Scalar>,
+    Dense2Dense> {
+  typedef FullPivLU<MatrixType, PermutationIndex> LuType;
+  typedef Inverse<LuType> SrcXprType;
+  static void run(DstXprType& dst, const SrcXprType& src,
+                  const internal::assign_op<typename DstXprType::Scalar, typename MatrixType::Scalar>&) {
+    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
+  }
+};
+}  // end namespace internal
 
 /******* MatrixBase methods *****************************************************************/
 
 /** \lu_module
-  *
-  * \return the full-pivoting LU decomposition of \c *this.
-  *
-  * \sa class FullPivLU
-  */
-template<typename Derived>
-inline const FullPivLU<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::fullPivLu() const
-{
-  return FullPivLU<PlainObject>(eval());
+ *
+ * \return the full-pivoting LU decomposition of \c *this.
+ *
+ * \sa class FullPivLU
+ */
+template <typename Derived>
+template <typename PermutationIndex>
+inline const FullPivLU<typename MatrixBase<Derived>::PlainObject, PermutationIndex> MatrixBase<Derived>::fullPivLu()
+    const {
+  return FullPivLU<PlainObject, PermutationIndex>(eval());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_LU_H
+#endif  // EIGEN_LU_H
diff --git a/inst/include/Eigen/src/LU/InternalHeaderCheck.h b/inst/include/Eigen/src/LU/InternalHeaderCheck.h
new file mode 100644
index 00000000..f346b179
--- /dev/null
+++ b/inst/include/Eigen/src/LU/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_LU_MODULE_H
+#error "Please include Eigen/LU instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/LU/Inverse.h b/inst/include/Eigen/src/LU/Inverse.h
deleted file mode 100644
index 3cf88719..00000000
--- a/inst/include/Eigen/src/LU/Inverse.h
+++ /dev/null
@@ -1,400 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_INVERSE_H
-#define EIGEN_INVERSE_H
-
-namespace Eigen { 
-
-namespace internal {
-
-/**********************************
-*** General case implementation ***
-**********************************/
-
-template<typename MatrixType, typename ResultType, int Size = MatrixType::RowsAtCompileTime>
-struct compute_inverse
-{
-  static inline void run(const MatrixType& matrix, ResultType& result)
-  {
-    result = matrix.partialPivLu().inverse();
-  }
-};
-
-template<typename MatrixType, typename ResultType, int Size = MatrixType::RowsAtCompileTime>
-struct compute_inverse_and_det_with_check { /* nothing! general case not supported. */ };
-
-/****************************
-*** Size 1 implementation ***
-****************************/
-
-template<typename MatrixType, typename ResultType>
-struct compute_inverse<MatrixType, ResultType, 1>
-{
-  static inline void run(const MatrixType& matrix, ResultType& result)
-  {
-    typedef typename MatrixType::Scalar Scalar;
-    result.coeffRef(0,0) = Scalar(1) / matrix.coeff(0,0);
-  }
-};
-
-template<typename MatrixType, typename ResultType>
-struct compute_inverse_and_det_with_check<MatrixType, ResultType, 1>
-{
-  static inline void run(
-    const MatrixType& matrix,
-    const typename MatrixType::RealScalar& absDeterminantThreshold,
-    ResultType& result,
-    typename ResultType::Scalar& determinant,
-    bool& invertible
-  )
-  {
-    using std::abs;
-    determinant = matrix.coeff(0,0);
-    invertible = abs(determinant) > absDeterminantThreshold;
-    if(invertible) result.coeffRef(0,0) = typename ResultType::Scalar(1) / determinant;
-  }
-};
-
-/****************************
-*** Size 2 implementation ***
-****************************/
-
-template<typename MatrixType, typename ResultType>
-inline void compute_inverse_size2_helper(
-    const MatrixType& matrix, const typename ResultType::Scalar& invdet,
-    ResultType& result)
-{
-  result.coeffRef(0,0) = matrix.coeff(1,1) * invdet;
-  result.coeffRef(1,0) = -matrix.coeff(1,0) * invdet;
-  result.coeffRef(0,1) = -matrix.coeff(0,1) * invdet;
-  result.coeffRef(1,1) = matrix.coeff(0,0) * invdet;
-}
-
-template<typename MatrixType, typename ResultType>
-struct compute_inverse<MatrixType, ResultType, 2>
-{
-  static inline void run(const MatrixType& matrix, ResultType& result)
-  {
-    typedef typename ResultType::Scalar Scalar;
-    const Scalar invdet = typename MatrixType::Scalar(1) / matrix.determinant();
-    compute_inverse_size2_helper(matrix, invdet, result);
-  }
-};
-
-template<typename MatrixType, typename ResultType>
-struct compute_inverse_and_det_with_check<MatrixType, ResultType, 2>
-{
-  static inline void run(
-    const MatrixType& matrix,
-    const typename MatrixType::RealScalar& absDeterminantThreshold,
-    ResultType& inverse,
-    typename ResultType::Scalar& determinant,
-    bool& invertible
-  )
-  {
-    using std::abs;
-    typedef typename ResultType::Scalar Scalar;
-    determinant = matrix.determinant();
-    invertible = abs(determinant) > absDeterminantThreshold;
-    if(!invertible) return;
-    const Scalar invdet = Scalar(1) / determinant;
-    compute_inverse_size2_helper(matrix, invdet, inverse);
-  }
-};
-
-/****************************
-*** Size 3 implementation ***
-****************************/
-
-template<typename MatrixType, int i, int j>
-inline typename MatrixType::Scalar cofactor_3x3(const MatrixType& m)
-{
-  enum {
-    i1 = (i+1) % 3,
-    i2 = (i+2) % 3,
-    j1 = (j+1) % 3,
-    j2 = (j+2) % 3
-  };
-  return m.coeff(i1, j1) * m.coeff(i2, j2)
-       - m.coeff(i1, j2) * m.coeff(i2, j1);
-}
-
-template<typename MatrixType, typename ResultType>
-inline void compute_inverse_size3_helper(
-    const MatrixType& matrix,
-    const typename ResultType::Scalar& invdet,
-    const Matrix<typename ResultType::Scalar,3,1>& cofactors_col0,
-    ResultType& result)
-{
-  result.row(0) = cofactors_col0 * invdet;
-  result.coeffRef(1,0) =  cofactor_3x3<MatrixType,0,1>(matrix) * invdet;
-  result.coeffRef(1,1) =  cofactor_3x3<MatrixType,1,1>(matrix) * invdet;
-  result.coeffRef(1,2) =  cofactor_3x3<MatrixType,2,1>(matrix) * invdet;
-  result.coeffRef(2,0) =  cofactor_3x3<MatrixType,0,2>(matrix) * invdet;
-  result.coeffRef(2,1) =  cofactor_3x3<MatrixType,1,2>(matrix) * invdet;
-  result.coeffRef(2,2) =  cofactor_3x3<MatrixType,2,2>(matrix) * invdet;
-}
-
-template<typename MatrixType, typename ResultType>
-struct compute_inverse<MatrixType, ResultType, 3>
-{
-  static inline void run(const MatrixType& matrix, ResultType& result)
-  {
-    typedef typename ResultType::Scalar Scalar;
-    Matrix<typename MatrixType::Scalar,3,1> cofactors_col0;
-    cofactors_col0.coeffRef(0) =  cofactor_3x3<MatrixType,0,0>(matrix);
-    cofactors_col0.coeffRef(1) =  cofactor_3x3<MatrixType,1,0>(matrix);
-    cofactors_col0.coeffRef(2) =  cofactor_3x3<MatrixType,2,0>(matrix);
-    const Scalar det = (cofactors_col0.cwiseProduct(matrix.col(0))).sum();
-    const Scalar invdet = Scalar(1) / det;
-    compute_inverse_size3_helper(matrix, invdet, cofactors_col0, result);
-  }
-};
-
-template<typename MatrixType, typename ResultType>
-struct compute_inverse_and_det_with_check<MatrixType, ResultType, 3>
-{
-  static inline void run(
-    const MatrixType& matrix,
-    const typename MatrixType::RealScalar& absDeterminantThreshold,
-    ResultType& inverse,
-    typename ResultType::Scalar& determinant,
-    bool& invertible
-  )
-  {
-    using std::abs;
-    typedef typename ResultType::Scalar Scalar;
-    Matrix<Scalar,3,1> cofactors_col0;
-    cofactors_col0.coeffRef(0) =  cofactor_3x3<MatrixType,0,0>(matrix);
-    cofactors_col0.coeffRef(1) =  cofactor_3x3<MatrixType,1,0>(matrix);
-    cofactors_col0.coeffRef(2) =  cofactor_3x3<MatrixType,2,0>(matrix);
-    determinant = (cofactors_col0.cwiseProduct(matrix.col(0))).sum();
-    invertible = abs(determinant) > absDeterminantThreshold;
-    if(!invertible) return;
-    const Scalar invdet = Scalar(1) / determinant;
-    compute_inverse_size3_helper(matrix, invdet, cofactors_col0, inverse);
-  }
-};
-
-/****************************
-*** Size 4 implementation ***
-****************************/
-
-template<typename Derived>
-inline const typename Derived::Scalar general_det3_helper
-(const MatrixBase<Derived>& matrix, int i1, int i2, int i3, int j1, int j2, int j3)
-{
-  return matrix.coeff(i1,j1)
-         * (matrix.coeff(i2,j2) * matrix.coeff(i3,j3) - matrix.coeff(i2,j3) * matrix.coeff(i3,j2));
-}
-
-template<typename MatrixType, int i, int j>
-inline typename MatrixType::Scalar cofactor_4x4(const MatrixType& matrix)
-{
-  enum {
-    i1 = (i+1) % 4,
-    i2 = (i+2) % 4,
-    i3 = (i+3) % 4,
-    j1 = (j+1) % 4,
-    j2 = (j+2) % 4,
-    j3 = (j+3) % 4
-  };
-  return general_det3_helper(matrix, i1, i2, i3, j1, j2, j3)
-       + general_det3_helper(matrix, i2, i3, i1, j1, j2, j3)
-       + general_det3_helper(matrix, i3, i1, i2, j1, j2, j3);
-}
-
-template<int Arch, typename Scalar, typename MatrixType, typename ResultType>
-struct compute_inverse_size4
-{
-  static void run(const MatrixType& matrix, ResultType& result)
-  {
-    result.coeffRef(0,0) =  cofactor_4x4<MatrixType,0,0>(matrix);
-    result.coeffRef(1,0) = -cofactor_4x4<MatrixType,0,1>(matrix);
-    result.coeffRef(2,0) =  cofactor_4x4<MatrixType,0,2>(matrix);
-    result.coeffRef(3,0) = -cofactor_4x4<MatrixType,0,3>(matrix);
-    result.coeffRef(0,2) =  cofactor_4x4<MatrixType,2,0>(matrix);
-    result.coeffRef(1,2) = -cofactor_4x4<MatrixType,2,1>(matrix);
-    result.coeffRef(2,2) =  cofactor_4x4<MatrixType,2,2>(matrix);
-    result.coeffRef(3,2) = -cofactor_4x4<MatrixType,2,3>(matrix);
-    result.coeffRef(0,1) = -cofactor_4x4<MatrixType,1,0>(matrix);
-    result.coeffRef(1,1) =  cofactor_4x4<MatrixType,1,1>(matrix);
-    result.coeffRef(2,1) = -cofactor_4x4<MatrixType,1,2>(matrix);
-    result.coeffRef(3,1) =  cofactor_4x4<MatrixType,1,3>(matrix);
-    result.coeffRef(0,3) = -cofactor_4x4<MatrixType,3,0>(matrix);
-    result.coeffRef(1,3) =  cofactor_4x4<MatrixType,3,1>(matrix);
-    result.coeffRef(2,3) = -cofactor_4x4<MatrixType,3,2>(matrix);
-    result.coeffRef(3,3) =  cofactor_4x4<MatrixType,3,3>(matrix);
-    result /= (matrix.col(0).cwiseProduct(result.row(0).transpose())).sum();
-  }
-};
-
-template<typename MatrixType, typename ResultType>
-struct compute_inverse<MatrixType, ResultType, 4>
- : compute_inverse_size4<Architecture::Target, typename MatrixType::Scalar,
-                            MatrixType, ResultType>
-{
-};
-
-template<typename MatrixType, typename ResultType>
-struct compute_inverse_and_det_with_check<MatrixType, ResultType, 4>
-{
-  static inline void run(
-    const MatrixType& matrix,
-    const typename MatrixType::RealScalar& absDeterminantThreshold,
-    ResultType& inverse,
-    typename ResultType::Scalar& determinant,
-    bool& invertible
-  )
-  {
-    using std::abs;
-    determinant = matrix.determinant();
-    invertible = abs(determinant) > absDeterminantThreshold;
-    if(invertible) compute_inverse<MatrixType, ResultType>::run(matrix, inverse);
-  }
-};
-
-/*************************
-*** MatrixBase methods ***
-*************************/
-
-template<typename MatrixType>
-struct traits<inverse_impl<MatrixType> >
-{
-  typedef typename MatrixType::PlainObject ReturnType;
-};
-
-template<typename MatrixType>
-struct inverse_impl : public ReturnByValue<inverse_impl<MatrixType> >
-{
-  typedef typename MatrixType::Index Index;
-  typedef typename internal::eval<MatrixType>::type MatrixTypeNested;
-  typedef typename remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
-  MatrixTypeNested m_matrix;
-
-  inverse_impl(const MatrixType& matrix)
-    : m_matrix(matrix)
-  {}
-
-  inline Index rows() const { return m_matrix.rows(); }
-  inline Index cols() const { return m_matrix.cols(); }
-
-  template<typename Dest> inline void evalTo(Dest& dst) const
-  {
-    const int Size = EIGEN_PLAIN_ENUM_MIN(MatrixType::ColsAtCompileTime,Dest::ColsAtCompileTime);
-    EIGEN_ONLY_USED_FOR_DEBUG(Size);
-    eigen_assert(( (Size<=1) || (Size>4) || (extract_data(m_matrix)!=extract_data(dst)))
-              && "Aliasing problem detected in inverse(), you need to do inverse().eval() here.");
-
-    compute_inverse<MatrixTypeNestedCleaned, Dest>::run(m_matrix, dst);
-  }
-};
-
-} // end namespace internal
-
-/** \lu_module
-  *
-  * \returns the matrix inverse of this matrix.
-  *
-  * For small fixed sizes up to 4x4, this method uses cofactors.
-  * In the general case, this method uses class PartialPivLU.
-  *
-  * \note This matrix must be invertible, otherwise the result is undefined. If you need an
-  * invertibility check, do the following:
-  * \li for fixed sizes up to 4x4, use computeInverseAndDetWithCheck().
-  * \li for the general case, use class FullPivLU.
-  *
-  * Example: \include MatrixBase_inverse.cpp
-  * Output: \verbinclude MatrixBase_inverse.out
-  *
-  * \sa computeInverseAndDetWithCheck()
-  */
-template<typename Derived>
-inline const internal::inverse_impl<Derived> MatrixBase<Derived>::inverse() const
-{
-  EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsInteger,THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
-  eigen_assert(rows() == cols());
-  return internal::inverse_impl<Derived>(derived());
-}
-
-/** \lu_module
-  *
-  * Computation of matrix inverse and determinant, with invertibility check.
-  *
-  * This is only for fixed-size square matrices of size up to 4x4.
-  *
-  * \param inverse Reference to the matrix in which to store the inverse.
-  * \param determinant Reference to the variable in which to store the determinant.
-  * \param invertible Reference to the bool variable in which to store whether the matrix is invertible.
-  * \param absDeterminantThreshold Optional parameter controlling the invertibility check.
-  *                                The matrix will be declared invertible if the absolute value of its
-  *                                determinant is greater than this threshold.
-  *
-  * Example: \include MatrixBase_computeInverseAndDetWithCheck.cpp
-  * Output: \verbinclude MatrixBase_computeInverseAndDetWithCheck.out
-  *
-  * \sa inverse(), computeInverseWithCheck()
-  */
-template<typename Derived>
-template<typename ResultType>
-inline void MatrixBase<Derived>::computeInverseAndDetWithCheck(
-    ResultType& inverse,
-    typename ResultType::Scalar& determinant,
-    bool& invertible,
-    const RealScalar& absDeterminantThreshold
-  ) const
-{
-  // i'd love to put some static assertions there, but SFINAE means that they have no effect...
-  eigen_assert(rows() == cols());
-  // for 2x2, it's worth giving a chance to avoid evaluating.
-  // for larger sizes, evaluating has negligible cost and limits code size.
-  typedef typename internal::conditional<
-    RowsAtCompileTime == 2,
-    typename internal::remove_all<typename internal::nested<Derived, 2>::type>::type,
-    PlainObject
-  >::type MatrixType;
-  internal::compute_inverse_and_det_with_check<MatrixType, ResultType>::run
-    (derived(), absDeterminantThreshold, inverse, determinant, invertible);
-}
-
-/** \lu_module
-  *
-  * Computation of matrix inverse, with invertibility check.
-  *
-  * This is only for fixed-size square matrices of size up to 4x4.
-  *
-  * \param inverse Reference to the matrix in which to store the inverse.
-  * \param invertible Reference to the bool variable in which to store whether the matrix is invertible.
-  * \param absDeterminantThreshold Optional parameter controlling the invertibility check.
-  *                                The matrix will be declared invertible if the absolute value of its
-  *                                determinant is greater than this threshold.
-  *
-  * Example: \include MatrixBase_computeInverseWithCheck.cpp
-  * Output: \verbinclude MatrixBase_computeInverseWithCheck.out
-  *
-  * \sa inverse(), computeInverseAndDetWithCheck()
-  */
-template<typename Derived>
-template<typename ResultType>
-inline void MatrixBase<Derived>::computeInverseWithCheck(
-    ResultType& inverse,
-    bool& invertible,
-    const RealScalar& absDeterminantThreshold
-  ) const
-{
-  RealScalar determinant;
-  // i'd love to put some static assertions there, but SFINAE means that they have no effect...
-  eigen_assert(rows() == cols());
-  computeInverseAndDetWithCheck(inverse,determinant,invertible,absDeterminantThreshold);
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_INVERSE_H
diff --git a/inst/include/Eigen/src/LU/InverseImpl.h b/inst/include/Eigen/src/LU/InverseImpl.h
new file mode 100644
index 00000000..fe8859e9
--- /dev/null
+++ b/inst/include/Eigen/src/LU/InverseImpl.h
@@ -0,0 +1,353 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_INVERSE_IMPL_H
+#define EIGEN_INVERSE_IMPL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/**********************************
+*** General case implementation ***
+**********************************/
+
+template <typename MatrixType, typename ResultType, int Size = MatrixType::RowsAtCompileTime>
+struct compute_inverse {
+  EIGEN_DEVICE_FUNC static inline void run(const MatrixType& matrix, ResultType& result) {
+    result = matrix.partialPivLu().inverse();
+  }
+};
+
+template <typename MatrixType, typename ResultType, int Size = MatrixType::RowsAtCompileTime>
+struct compute_inverse_and_det_with_check { /* nothing! general case not supported. */
+};
+
+/****************************
+*** Size 1 implementation ***
+****************************/
+
+template <typename MatrixType, typename ResultType>
+struct compute_inverse<MatrixType, ResultType, 1> {
+  EIGEN_DEVICE_FUNC static inline void run(const MatrixType& matrix, ResultType& result) {
+    typedef typename MatrixType::Scalar Scalar;
+    internal::evaluator<MatrixType> matrixEval(matrix);
+    result.coeffRef(0, 0) = Scalar(1) / matrixEval.coeff(0, 0);
+  }
+};
+
+template <typename MatrixType, typename ResultType>
+struct compute_inverse_and_det_with_check<MatrixType, ResultType, 1> {
+  EIGEN_DEVICE_FUNC static inline void run(const MatrixType& matrix,
+                                           const typename MatrixType::RealScalar& absDeterminantThreshold,
+                                           ResultType& result, typename ResultType::Scalar& determinant,
+                                           bool& invertible) {
+    using std::abs;
+    determinant = matrix.coeff(0, 0);
+    invertible = abs(determinant) > absDeterminantThreshold;
+    if (invertible) result.coeffRef(0, 0) = typename ResultType::Scalar(1) / determinant;
+  }
+};
+
+/****************************
+*** Size 2 implementation ***
+****************************/
+
+template <typename MatrixType, typename ResultType>
+EIGEN_DEVICE_FUNC inline void compute_inverse_size2_helper(const MatrixType& matrix,
+                                                           const typename ResultType::Scalar& invdet,
+                                                           ResultType& result) {
+  typename ResultType::Scalar temp = matrix.coeff(0, 0);
+  result.coeffRef(0, 0) = matrix.coeff(1, 1) * invdet;
+  result.coeffRef(1, 0) = -matrix.coeff(1, 0) * invdet;
+  result.coeffRef(0, 1) = -matrix.coeff(0, 1) * invdet;
+  result.coeffRef(1, 1) = temp * invdet;
+}
+
+template <typename MatrixType, typename ResultType>
+struct compute_inverse<MatrixType, ResultType, 2> {
+  EIGEN_DEVICE_FUNC static inline void run(const MatrixType& matrix, ResultType& result) {
+    typedef typename ResultType::Scalar Scalar;
+    const Scalar invdet = typename MatrixType::Scalar(1) / matrix.determinant();
+    compute_inverse_size2_helper(matrix, invdet, result);
+  }
+};
+
+template <typename MatrixType, typename ResultType>
+struct compute_inverse_and_det_with_check<MatrixType, ResultType, 2> {
+  EIGEN_DEVICE_FUNC static inline void run(const MatrixType& matrix,
+                                           const typename MatrixType::RealScalar& absDeterminantThreshold,
+                                           ResultType& inverse, typename ResultType::Scalar& determinant,
+                                           bool& invertible) {
+    using std::abs;
+    typedef typename ResultType::Scalar Scalar;
+    determinant = matrix.determinant();
+    invertible = abs(determinant) > absDeterminantThreshold;
+    if (!invertible) return;
+    const Scalar invdet = Scalar(1) / determinant;
+    compute_inverse_size2_helper(matrix, invdet, inverse);
+  }
+};
+
+/****************************
+*** Size 3 implementation ***
+****************************/
+
+template <typename MatrixType, int i, int j>
+EIGEN_DEVICE_FUNC inline typename MatrixType::Scalar cofactor_3x3(const MatrixType& m) {
+  enum { i1 = (i + 1) % 3, i2 = (i + 2) % 3, j1 = (j + 1) % 3, j2 = (j + 2) % 3 };
+  return m.coeff(i1, j1) * m.coeff(i2, j2) - m.coeff(i1, j2) * m.coeff(i2, j1);
+}
+
+template <typename MatrixType, typename ResultType>
+EIGEN_DEVICE_FUNC inline void compute_inverse_size3_helper(
+    const MatrixType& matrix, const typename ResultType::Scalar& invdet,
+    const Matrix<typename ResultType::Scalar, 3, 1>& cofactors_col0, ResultType& result) {
+  // Compute cofactors in a way that avoids aliasing issues.
+  typedef typename ResultType::Scalar Scalar;
+  const Scalar c01 = cofactor_3x3<MatrixType, 0, 1>(matrix) * invdet;
+  const Scalar c11 = cofactor_3x3<MatrixType, 1, 1>(matrix) * invdet;
+  const Scalar c02 = cofactor_3x3<MatrixType, 0, 2>(matrix) * invdet;
+  result.coeffRef(1, 2) = cofactor_3x3<MatrixType, 2, 1>(matrix) * invdet;
+  result.coeffRef(2, 1) = cofactor_3x3<MatrixType, 1, 2>(matrix) * invdet;
+  result.coeffRef(2, 2) = cofactor_3x3<MatrixType, 2, 2>(matrix) * invdet;
+  result.coeffRef(1, 0) = c01;
+  result.coeffRef(1, 1) = c11;
+  result.coeffRef(2, 0) = c02;
+  result.row(0) = cofactors_col0 * invdet;
+}
+
+template <typename MatrixType, typename ResultType>
+struct compute_inverse<MatrixType, ResultType, 3> {
+  EIGEN_DEVICE_FUNC static inline void run(const MatrixType& matrix, ResultType& result) {
+    typedef typename ResultType::Scalar Scalar;
+    Matrix<typename MatrixType::Scalar, 3, 1> cofactors_col0;
+    cofactors_col0.coeffRef(0) = cofactor_3x3<MatrixType, 0, 0>(matrix);
+    cofactors_col0.coeffRef(1) = cofactor_3x3<MatrixType, 1, 0>(matrix);
+    cofactors_col0.coeffRef(2) = cofactor_3x3<MatrixType, 2, 0>(matrix);
+    const Scalar det = (cofactors_col0.cwiseProduct(matrix.col(0))).sum();
+    const Scalar invdet = Scalar(1) / det;
+    compute_inverse_size3_helper(matrix, invdet, cofactors_col0, result);
+  }
+};
+
+template <typename MatrixType, typename ResultType>
+struct compute_inverse_and_det_with_check<MatrixType, ResultType, 3> {
+  EIGEN_DEVICE_FUNC static inline void run(const MatrixType& matrix,
+                                           const typename MatrixType::RealScalar& absDeterminantThreshold,
+                                           ResultType& inverse, typename ResultType::Scalar& determinant,
+                                           bool& invertible) {
+    typedef typename ResultType::Scalar Scalar;
+    Matrix<Scalar, 3, 1> cofactors_col0;
+    cofactors_col0.coeffRef(0) = cofactor_3x3<MatrixType, 0, 0>(matrix);
+    cofactors_col0.coeffRef(1) = cofactor_3x3<MatrixType, 1, 0>(matrix);
+    cofactors_col0.coeffRef(2) = cofactor_3x3<MatrixType, 2, 0>(matrix);
+    determinant = (cofactors_col0.cwiseProduct(matrix.col(0))).sum();
+    invertible = Eigen::numext::abs(determinant) > absDeterminantThreshold;
+    if (!invertible) return;
+    const Scalar invdet = Scalar(1) / determinant;
+    compute_inverse_size3_helper(matrix, invdet, cofactors_col0, inverse);
+  }
+};
+
+/****************************
+*** Size 4 implementation ***
+****************************/
+
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline const typename Derived::Scalar general_det3_helper(const MatrixBase<Derived>& matrix, int i1,
+                                                                            int i2, int i3, int j1, int j2, int j3) {
+  return matrix.coeff(i1, j1) *
+         (matrix.coeff(i2, j2) * matrix.coeff(i3, j3) - matrix.coeff(i2, j3) * matrix.coeff(i3, j2));
+}
+
+template <typename MatrixType, int i, int j>
+EIGEN_DEVICE_FUNC inline typename MatrixType::Scalar cofactor_4x4(const MatrixType& matrix) {
+  enum { i1 = (i + 1) % 4, i2 = (i + 2) % 4, i3 = (i + 3) % 4, j1 = (j + 1) % 4, j2 = (j + 2) % 4, j3 = (j + 3) % 4 };
+  return general_det3_helper(matrix, i1, i2, i3, j1, j2, j3) + general_det3_helper(matrix, i2, i3, i1, j1, j2, j3) +
+         general_det3_helper(matrix, i3, i1, i2, j1, j2, j3);
+}
+
+template <int Arch, typename Scalar, typename MatrixType, typename ResultType>
+struct compute_inverse_size4 {
+  EIGEN_DEVICE_FUNC static void run(const MatrixType& matrix, ResultType& result) {
+    result.coeffRef(0, 0) = cofactor_4x4<MatrixType, 0, 0>(matrix);
+    result.coeffRef(1, 0) = -cofactor_4x4<MatrixType, 0, 1>(matrix);
+    result.coeffRef(2, 0) = cofactor_4x4<MatrixType, 0, 2>(matrix);
+    result.coeffRef(3, 0) = -cofactor_4x4<MatrixType, 0, 3>(matrix);
+    result.coeffRef(0, 2) = cofactor_4x4<MatrixType, 2, 0>(matrix);
+    result.coeffRef(1, 2) = -cofactor_4x4<MatrixType, 2, 1>(matrix);
+    result.coeffRef(2, 2) = cofactor_4x4<MatrixType, 2, 2>(matrix);
+    result.coeffRef(3, 2) = -cofactor_4x4<MatrixType, 2, 3>(matrix);
+    result.coeffRef(0, 1) = -cofactor_4x4<MatrixType, 1, 0>(matrix);
+    result.coeffRef(1, 1) = cofactor_4x4<MatrixType, 1, 1>(matrix);
+    result.coeffRef(2, 1) = -cofactor_4x4<MatrixType, 1, 2>(matrix);
+    result.coeffRef(3, 1) = cofactor_4x4<MatrixType, 1, 3>(matrix);
+    result.coeffRef(0, 3) = -cofactor_4x4<MatrixType, 3, 0>(matrix);
+    result.coeffRef(1, 3) = cofactor_4x4<MatrixType, 3, 1>(matrix);
+    result.coeffRef(2, 3) = -cofactor_4x4<MatrixType, 3, 2>(matrix);
+    result.coeffRef(3, 3) = cofactor_4x4<MatrixType, 3, 3>(matrix);
+    result /= (matrix.col(0).cwiseProduct(result.row(0).transpose())).sum();
+  }
+};
+
+template <typename MatrixType, typename ResultType>
+struct compute_inverse<MatrixType, ResultType, 4>
+    : compute_inverse_size4<Architecture::Target, typename MatrixType::Scalar, MatrixType, ResultType> {};
+
+template <typename MatrixType, typename ResultType>
+struct compute_inverse_and_det_with_check<MatrixType, ResultType, 4> {
+  EIGEN_DEVICE_FUNC static inline void run(const MatrixType& matrix,
+                                           const typename MatrixType::RealScalar& absDeterminantThreshold,
+                                           ResultType& inverse, typename ResultType::Scalar& determinant,
+                                           bool& invertible) {
+    using std::abs;
+    determinant = matrix.determinant();
+    invertible = abs(determinant) > absDeterminantThreshold;
+    if (invertible && extract_data(matrix) != extract_data(inverse)) {
+      compute_inverse<MatrixType, ResultType>::run(matrix, inverse);
+    } else if (invertible) {
+      MatrixType matrix_t = matrix;
+      compute_inverse<MatrixType, ResultType>::run(matrix_t, inverse);
+    }
+  }
+};
+
+/*************************
+*** MatrixBase methods ***
+*************************/
+
+}  // end namespace internal
+
+namespace internal {
+
+// Specialization for "dense = dense_xpr.inverse()"
+template <typename DstXprType, typename XprType>
+struct Assignment<DstXprType, Inverse<XprType>,
+                  internal::assign_op<typename DstXprType::Scalar, typename XprType::Scalar>, Dense2Dense> {
+  typedef Inverse<XprType> SrcXprType;
+  EIGEN_DEVICE_FUNC static void run(DstXprType& dst, const SrcXprType& src,
+                                    const internal::assign_op<typename DstXprType::Scalar, typename XprType::Scalar>&) {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if ((dst.rows() != dstRows) || (dst.cols() != dstCols)) dst.resize(dstRows, dstCols);
+
+    const int Size = plain_enum_min(XprType::ColsAtCompileTime, DstXprType::ColsAtCompileTime);
+    EIGEN_ONLY_USED_FOR_DEBUG(Size);
+    eigen_assert(((Size <= 1) || (Size > 4) || (extract_data(src.nestedExpression()) != extract_data(dst))) &&
+                 "Aliasing problem detected in inverse(), you need to do inverse().eval() here.");
+
+    typedef typename internal::nested_eval<XprType, XprType::ColsAtCompileTime>::type ActualXprType;
+    typedef internal::remove_all_t<ActualXprType> ActualXprTypeCleanded;
+
+    ActualXprType actual_xpr(src.nestedExpression());
+
+    compute_inverse<ActualXprTypeCleanded, DstXprType>::run(actual_xpr, dst);
+  }
+};
+
+}  // end namespace internal
+
+/** \lu_module
+ *
+ * \returns the matrix inverse of this matrix.
+ *
+ * For small fixed sizes up to 4x4, this method uses cofactors.
+ * In the general case, this method uses class PartialPivLU.
+ *
+ * \note This matrix must be invertible, otherwise the result is undefined. If you need an
+ * invertibility check, do the following:
+ * \li for fixed sizes up to 4x4, use computeInverseAndDetWithCheck().
+ * \li for the general case, use class PartialPivLU.
+ *
+ * Example: \include MatrixBase_inverse.cpp
+ * Output: \verbinclude MatrixBase_inverse.out
+ *
+ * \sa computeInverseAndDetWithCheck()
+ */
+template <typename Derived>
+EIGEN_DEVICE_FUNC inline const Inverse<Derived> MatrixBase<Derived>::inverse() const {
+  EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsInteger, THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES)
+  eigen_assert(rows() == cols());
+  return Inverse<Derived>(derived());
+}
+
+/** \lu_module
+ *
+ * Computation of matrix inverse and determinant, with invertibility check.
+ *
+ * This is only for fixed-size square matrices of size up to 4x4.
+ *
+ * Notice that it will trigger a copy of input matrix when trying to do the inverse in place.
+ *
+ * \param inverse Reference to the matrix in which to store the inverse.
+ * \param determinant Reference to the variable in which to store the determinant.
+ * \param invertible Reference to the bool variable in which to store whether the matrix is invertible.
+ * \param absDeterminantThreshold Optional parameter controlling the invertibility check.
+ *                                The matrix will be declared invertible if the absolute value of its
+ *                                determinant is greater than this threshold.
+ *
+ * Example: \include MatrixBase_computeInverseAndDetWithCheck.cpp
+ * Output: \verbinclude MatrixBase_computeInverseAndDetWithCheck.out
+ *
+ * \sa inverse(), computeInverseWithCheck()
+ */
+template <typename Derived>
+template <typename ResultType>
+inline void MatrixBase<Derived>::computeInverseAndDetWithCheck(ResultType& inverse,
+                                                               typename ResultType::Scalar& determinant,
+                                                               bool& invertible,
+                                                               const RealScalar& absDeterminantThreshold) const {
+  // i'd love to put some static assertions there, but SFINAE means that they have no effect...
+  eigen_assert(rows() == cols());
+  // for 2x2, it's worth giving a chance to avoid evaluating.
+  // for larger sizes, evaluating has negligible cost and limits code size.
+  typedef std::conditional_t<RowsAtCompileTime == 2,
+                             internal::remove_all_t<typename internal::nested_eval<Derived, 2>::type>, PlainObject>
+      MatrixType;
+  internal::compute_inverse_and_det_with_check<MatrixType, ResultType>::run(derived(), absDeterminantThreshold, inverse,
+                                                                            determinant, invertible);
+}
+
+/** \lu_module
+ *
+ * Computation of matrix inverse, with invertibility check.
+ *
+ * This is only for fixed-size square matrices of size up to 4x4.
+ *
+ * Notice that it will trigger a copy of input matrix when trying to do the inverse in place.
+ *
+ * \param inverse Reference to the matrix in which to store the inverse.
+ * \param invertible Reference to the bool variable in which to store whether the matrix is invertible.
+ * \param absDeterminantThreshold Optional parameter controlling the invertibility check.
+ *                                The matrix will be declared invertible if the absolute value of its
+ *                                determinant is greater than this threshold.
+ *
+ * Example: \include MatrixBase_computeInverseWithCheck.cpp
+ * Output: \verbinclude MatrixBase_computeInverseWithCheck.out
+ *
+ * \sa inverse(), computeInverseAndDetWithCheck()
+ */
+template <typename Derived>
+template <typename ResultType>
+inline void MatrixBase<Derived>::computeInverseWithCheck(ResultType& inverse, bool& invertible,
+                                                         const RealScalar& absDeterminantThreshold) const {
+  Scalar determinant;
+  // i'd love to put some static assertions there, but SFINAE means that they have no effect...
+  eigen_assert(rows() == cols());
+  computeInverseAndDetWithCheck(inverse, determinant, invertible, absDeterminantThreshold);
+}
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_INVERSE_IMPL_H
diff --git a/inst/include/Eigen/src/LU/PartialPivLU.h b/inst/include/Eigen/src/LU/PartialPivLU.h
index 7d1db948..7ea14f57 100644
--- a/inst/include/Eigen/src/LU/PartialPivLU.h
+++ b/inst/include/Eigen/src/LU/PartialPivLU.h
@@ -11,306 +11,402 @@
 #ifndef EIGEN_PARTIALLU_H
 #define EIGEN_PARTIALLU_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename MatrixType_, typename PermutationIndex_>
+struct traits<PartialPivLU<MatrixType_, PermutationIndex_> > : traits<MatrixType_> {
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef PermutationIndex_ StorageIndex;
+  typedef traits<MatrixType_> BaseTraits;
+  enum { Flags = BaseTraits::Flags & RowMajorBit, CoeffReadCost = Dynamic };
+};
+
+template <typename T, typename Derived>
+struct enable_if_ref;
+// {
+//   typedef Derived type;
+// };
+
+template <typename T, typename Derived>
+struct enable_if_ref<Ref<T>, Derived> {
+  typedef Derived type;
+};
+
+}  // end namespace internal
 
 /** \ingroup LU_Module
-  *
-  * \class PartialPivLU
-  *
-  * \brief LU decomposition of a matrix with partial pivoting, and related features
-  *
-  * \param MatrixType the type of the matrix of which we are computing the LU decomposition
-  *
-  * This class represents a LU decomposition of a \b square \b invertible matrix, with partial pivoting: the matrix A
-  * is decomposed as A = PLU where L is unit-lower-triangular, U is upper-triangular, and P
-  * is a permutation matrix.
-  *
-  * Typically, partial pivoting LU decomposition is only considered numerically stable for square invertible
-  * matrices. Thus LAPACK's dgesv and dgesvx require the matrix to be square and invertible. The present class
-  * does the same. It will assert that the matrix is square, but it won't (actually it can't) check that the
-  * matrix is invertible: it is your task to check that you only use this decomposition on invertible matrices.
-  *
-  * The guaranteed safe alternative, working for all matrices, is the full pivoting LU decomposition, provided
-  * by class FullPivLU.
-  *
-  * This is \b not a rank-revealing LU decomposition. Many features are intentionally absent from this class,
-  * such as rank computation. If you need these features, use class FullPivLU.
-  *
-  * This LU decomposition is suitable to invert invertible matrices. It is what MatrixBase::inverse() uses
-  * in the general case.
-  * On the other hand, it is \b not suitable to determine whether a given matrix is invertible.
-  *
-  * The data of the LU decomposition can be directly accessed through the methods matrixLU(), permutationP().
-  *
-  * \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class FullPivLU
-  */
-template<typename _MatrixType> class PartialPivLU
-{
-  public:
-
-    typedef _MatrixType MatrixType;
-    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
-      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
-    };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename internal::traits<MatrixType>::StorageKind StorageKind;
-    typedef typename MatrixType::Index Index;
-    typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime> PermutationType;
-    typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime> TranspositionType;
-
-
-    /**
-    * \brief Default Constructor.
-    *
-    * The default constructor is useful in cases in which the user intends to
-    * perform decompositions via PartialPivLU::compute(const MatrixType&).
+ *
+ * \class PartialPivLU
+ *
+ * \brief LU decomposition of a matrix with partial pivoting, and related features
+ *
+ * \tparam MatrixType_ the type of the matrix of which we are computing the LU decomposition
+ *
+ * This class represents a LU decomposition of a \b square \b invertible matrix, with partial pivoting: the matrix A
+ * is decomposed as A = PLU where L is unit-lower-triangular, U is upper-triangular, and P
+ * is a permutation matrix.
+ *
+ * Typically, partial pivoting LU decomposition is only considered numerically stable for square invertible
+ * matrices. Thus LAPACK's dgesv and dgesvx require the matrix to be square and invertible. The present class
+ * does the same. It will assert that the matrix is square, but it won't (actually it can't) check that the
+ * matrix is invertible: it is your task to check that you only use this decomposition on invertible matrices.
+ *
+ * The guaranteed safe alternative, working for all matrices, is the full pivoting LU decomposition, provided
+ * by class FullPivLU.
+ *
+ * This is \b not a rank-revealing LU decomposition. Many features are intentionally absent from this class,
+ * such as rank computation. If you need these features, use class FullPivLU.
+ *
+ * This LU decomposition is suitable to invert invertible matrices. It is what MatrixBase::inverse() uses
+ * in the general case.
+ * On the other hand, it is \b not suitable to determine whether a given matrix is invertible.
+ *
+ * The data of the LU decomposition can be directly accessed through the methods matrixLU(), permutationP().
+ *
+ * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+ *
+ * \sa MatrixBase::partialPivLu(), MatrixBase::determinant(), MatrixBase::inverse(), MatrixBase::computeInverse(), class
+ * FullPivLU
+ */
+template <typename MatrixType_, typename PermutationIndex_>
+class PartialPivLU : public SolverBase<PartialPivLU<MatrixType_, PermutationIndex_> > {
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef SolverBase<PartialPivLU> Base;
+  friend class SolverBase<PartialPivLU>;
+
+  EIGEN_GENERIC_PUBLIC_INTERFACE(PartialPivLU)
+  enum {
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+  using PermutationIndex = PermutationIndex_;
+  typedef PermutationMatrix<RowsAtCompileTime, MaxRowsAtCompileTime, PermutationIndex> PermutationType;
+  typedef Transpositions<RowsAtCompileTime, MaxRowsAtCompileTime, PermutationIndex> TranspositionType;
+  typedef typename MatrixType::PlainObject PlainObject;
+
+  /** \brief Reports whether the LU factorization was successful.
+   *
+   * \note This function always returns \c Success. It is provided for compatibility
+   * with other factorization routines.
+   * \returns \c Success
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
+    return Success;
+  }
+
+  /**
+   * \brief Default Constructor.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via PartialPivLU::compute(const MatrixType&).
+   */
+  PartialPivLU();
+
+  /** \brief Default Constructor with memory preallocation
+   *
+   * Like the default constructor but with preallocation of the internal data
+   * according to the specified problem \a size.
+   * \sa PartialPivLU()
+   */
+  explicit PartialPivLU(Index size);
+
+  /** Constructor.
+   *
+   * \param matrix the matrix of which to compute the LU decomposition.
+   *
+   * \warning The matrix should have full rank (e.g. if it's square, it should be invertible).
+   * If you need to deal with non-full rank, use class FullPivLU instead.
+   */
+  template <typename InputType>
+  explicit PartialPivLU(const EigenBase<InputType>& matrix);
+
+  /** Constructor for \link InplaceDecomposition inplace decomposition \endlink
+   *
+   * \param matrix the matrix of which to compute the LU decomposition.
+   *
+   * \warning The matrix should have full rank (e.g. if it's square, it should be invertible).
+   * If you need to deal with non-full rank, use class FullPivLU instead.
+   */
+  template <typename InputType>
+  explicit PartialPivLU(EigenBase<InputType>& matrix);
+
+  template <typename InputType>
+  PartialPivLU& compute(const EigenBase<InputType>& matrix) {
+    m_lu = matrix.derived();
+    compute();
+    return *this;
+  }
+
+  /** \returns the LU decomposition matrix: the upper-triangular part is U, the
+   * unit-lower-triangular part is L (at least for square matrices; in the non-square
+   * case, special care is needed, see the documentation of class FullPivLU).
+   *
+   * \sa matrixL(), matrixU()
+   */
+  inline const MatrixType& matrixLU() const {
+    eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
+    return m_lu;
+  }
+
+  /** \returns the permutation matrix P.
+   */
+  inline const PermutationType& permutationP() const {
+    eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
+    return m_p;
+  }
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  /** This method returns the solution x to the equation Ax=b, where A is the matrix of which
+   * *this is the LU decomposition.
+   *
+   * \param b the right-hand-side of the equation to solve. Can be a vector or a matrix,
+   *          the only requirement in order for the equation to make sense is that
+   *          b.rows()==A.rows(), where A is the matrix of which *this is the LU decomposition.
+   *
+   * \returns the solution.
+   *
+   * Example: \include PartialPivLU_solve.cpp
+   * Output: \verbinclude PartialPivLU_solve.out
+   *
+   * Since this PartialPivLU class assumes anyway that the matrix A is invertible, the solution
+   * theoretically exists and is unique regardless of b.
+   *
+   * \sa TriangularView::solve(), inverse(), computeInverse()
+   */
+  template <typename Rhs>
+  inline const Solve<PartialPivLU, Rhs> solve(const MatrixBase<Rhs>& b) const;
+#endif
+
+  /** \returns an estimate of the reciprocal condition number of the matrix of which \c *this is
+      the LU decomposition.
     */
-    PartialPivLU();
-
-    /** \brief Default Constructor with memory preallocation
-      *
-      * Like the default constructor but with preallocation of the internal data
-      * according to the specified problem \a size.
-      * \sa PartialPivLU()
-      */
-    PartialPivLU(Index size);
-
-    /** Constructor.
-      *
-      * \param matrix the matrix of which to compute the LU decomposition.
-      *
-      * \warning The matrix should have full rank (e.g. if it's square, it should be invertible).
-      * If you need to deal with non-full rank, use class FullPivLU instead.
-      */
-    PartialPivLU(const MatrixType& matrix);
-
-    PartialPivLU& compute(const MatrixType& matrix);
-
-    /** \returns the LU decomposition matrix: the upper-triangular part is U, the
-      * unit-lower-triangular part is L (at least for square matrices; in the non-square
-      * case, special care is needed, see the documentation of class FullPivLU).
-      *
-      * \sa matrixL(), matrixU()
-      */
-    inline const MatrixType& matrixLU() const
-    {
-      eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
-      return m_lu;
-    }
+  inline RealScalar rcond() const {
+    eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
+    return internal::rcond_estimate_helper(m_l1_norm, *this);
+  }
 
-    /** \returns the permutation matrix P.
-      */
-    inline const PermutationType& permutationP() const
-    {
-      eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
-      return m_p;
-    }
+  /** \returns the inverse of the matrix of which *this is the LU decomposition.
+   *
+   * \warning The matrix being decomposed here is assumed to be invertible. If you need to check for
+   *          invertibility, use class FullPivLU instead.
+   *
+   * \sa MatrixBase::inverse(), LU::inverse()
+   */
+  inline const Inverse<PartialPivLU> inverse() const {
+    eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
+    return Inverse<PartialPivLU>(*this);
+  }
 
-    /** This method returns the solution x to the equation Ax=b, where A is the matrix of which
-      * *this is the LU decomposition.
-      *
-      * \param b the right-hand-side of the equation to solve. Can be a vector or a matrix,
-      *          the only requirement in order for the equation to make sense is that
-      *          b.rows()==A.rows(), where A is the matrix of which *this is the LU decomposition.
-      *
-      * \returns the solution.
-      *
-      * Example: \include PartialPivLU_solve.cpp
-      * Output: \verbinclude PartialPivLU_solve.out
-      *
-      * Since this PartialPivLU class assumes anyway that the matrix A is invertible, the solution
-      * theoretically exists and is unique regardless of b.
-      *
-      * \sa TriangularView::solve(), inverse(), computeInverse()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<PartialPivLU, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
-      return internal::solve_retval<PartialPivLU, Rhs>(*this, b.derived());
-    }
+  /** \returns the determinant of the matrix of which
+   * *this is the LU decomposition. It has only linear complexity
+   * (that is, O(n) where n is the dimension of the square matrix)
+   * as the LU decomposition has already been computed.
+   *
+   * \note For fixed-size matrices of size up to 4, MatrixBase::determinant() offers
+   *       optimized paths.
+   *
+   * \warning a determinant can be very big or small, so for matrices
+   * of large enough dimension, there is a risk of overflow/underflow.
+   *
+   * \sa MatrixBase::determinant()
+   */
+  Scalar determinant() const;
+
+  MatrixType reconstructedMatrix() const;
+
+  constexpr Index rows() const noexcept { return m_lu.rows(); }
+  constexpr Index cols() const noexcept { return m_lu.cols(); }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  template <typename RhsType, typename DstType>
+  EIGEN_DEVICE_FUNC void _solve_impl(const RhsType& rhs, DstType& dst) const {
+    /* The decomposition PA = LU can be rewritten as A = P^{-1} L U.
+     * So we proceed as follows:
+     * Step 1: compute c = Pb.
+     * Step 2: replace c by the solution x to Lx = c.
+     * Step 3: replace c by the solution x to Ux = c.
+     */
 
-    /** \returns the inverse of the matrix of which *this is the LU decomposition.
-      *
-      * \warning The matrix being decomposed here is assumed to be invertible. If you need to check for
-      *          invertibility, use class FullPivLU instead.
-      *
-      * \sa MatrixBase::inverse(), LU::inverse()
-      */
-    inline const internal::solve_retval<PartialPivLU,typename MatrixType::IdentityReturnType> inverse() const
-    {
-      eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
-      return internal::solve_retval<PartialPivLU,typename MatrixType::IdentityReturnType>
-               (*this, MatrixType::Identity(m_lu.rows(), m_lu.cols()));
-    }
+    // Step 1
+    dst = permutationP() * rhs;
 
-    /** \returns the determinant of the matrix of which
-      * *this is the LU decomposition. It has only linear complexity
-      * (that is, O(n) where n is the dimension of the square matrix)
-      * as the LU decomposition has already been computed.
-      *
-      * \note For fixed-size matrices of size up to 4, MatrixBase::determinant() offers
-      *       optimized paths.
-      *
-      * \warning a determinant can be very big or small, so for matrices
-      * of large enough dimension, there is a risk of overflow/underflow.
-      *
-      * \sa MatrixBase::determinant()
-      */
-    typename internal::traits<MatrixType>::Scalar determinant() const;
-
-    MatrixType reconstructedMatrix() const;
-
-    inline Index rows() const { return m_lu.rows(); }
-    inline Index cols() const { return m_lu.cols(); }
-
-  protected:
-    
-    static void check_template_parameters()
-    {
-      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
-    }
-    
-    MatrixType m_lu;
-    PermutationType m_p;
-    TranspositionType m_rowsTranspositions;
-    Index m_det_p;
-    bool m_isInitialized;
-};
+    // Step 2
+    m_lu.template triangularView<UnitLower>().solveInPlace(dst);
 
-template<typename MatrixType>
-PartialPivLU<MatrixType>::PartialPivLU()
-  : m_lu(),
-    m_p(),
-    m_rowsTranspositions(),
-    m_det_p(0),
-    m_isInitialized(false)
-{
-}
+    // Step 3
+    m_lu.template triangularView<Upper>().solveInPlace(dst);
+  }
+
+  template <bool Conjugate, typename RhsType, typename DstType>
+  EIGEN_DEVICE_FUNC void _solve_impl_transposed(const RhsType& rhs, DstType& dst) const {
+    /* The decomposition PA = LU can be rewritten as A^T = U^T L^T P.
+     * So we proceed as follows:
+     * Step 1: compute c as the solution to L^T c = b
+     * Step 2: replace c by the solution x to U^T x = c.
+     * Step 3: update  c = P^-1 c.
+     */
 
-template<typename MatrixType>
-PartialPivLU<MatrixType>::PartialPivLU(Index size)
-  : m_lu(size, size),
-    m_p(size),
-    m_rowsTranspositions(size),
-    m_det_p(0),
-    m_isInitialized(false)
-{
+    eigen_assert(rhs.rows() == m_lu.cols());
+
+    // Step 1
+    dst = m_lu.template triangularView<Upper>().transpose().template conjugateIf<Conjugate>().solve(rhs);
+    // Step 2
+    m_lu.template triangularView<UnitLower>().transpose().template conjugateIf<Conjugate>().solveInPlace(dst);
+    // Step 3
+    dst = permutationP().transpose() * dst;
+  }
+#endif
+
+ protected:
+  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+
+  void compute();
+
+  MatrixType m_lu;
+  PermutationType m_p;
+  TranspositionType m_rowsTranspositions;
+  RealScalar m_l1_norm;
+  signed char m_det_p;
+  bool m_isInitialized;
+};
+
+template <typename MatrixType, typename PermutationIndex>
+PartialPivLU<MatrixType, PermutationIndex>::PartialPivLU()
+    : m_lu(), m_p(), m_rowsTranspositions(), m_l1_norm(0), m_det_p(0), m_isInitialized(false) {}
+
+template <typename MatrixType, typename PermutationIndex>
+PartialPivLU<MatrixType, PermutationIndex>::PartialPivLU(Index size)
+    : m_lu(size, size), m_p(size), m_rowsTranspositions(size), m_l1_norm(0), m_det_p(0), m_isInitialized(false) {}
+
+template <typename MatrixType, typename PermutationIndex>
+template <typename InputType>
+PartialPivLU<MatrixType, PermutationIndex>::PartialPivLU(const EigenBase<InputType>& matrix)
+    : m_lu(matrix.rows(), matrix.cols()),
+      m_p(matrix.rows()),
+      m_rowsTranspositions(matrix.rows()),
+      m_l1_norm(0),
+      m_det_p(0),
+      m_isInitialized(false) {
+  compute(matrix.derived());
 }
 
-template<typename MatrixType>
-PartialPivLU<MatrixType>::PartialPivLU(const MatrixType& matrix)
-  : m_lu(matrix.rows(), matrix.rows()),
-    m_p(matrix.rows()),
-    m_rowsTranspositions(matrix.rows()),
-    m_det_p(0),
-    m_isInitialized(false)
-{
-  compute(matrix);
+template <typename MatrixType, typename PermutationIndex>
+template <typename InputType>
+PartialPivLU<MatrixType, PermutationIndex>::PartialPivLU(EigenBase<InputType>& matrix)
+    : m_lu(matrix.derived()),
+      m_p(matrix.rows()),
+      m_rowsTranspositions(matrix.rows()),
+      m_l1_norm(0),
+      m_det_p(0),
+      m_isInitialized(false) {
+  compute();
 }
 
 namespace internal {
 
 /** \internal This is the blocked version of fullpivlu_unblocked() */
-template<typename Scalar, int StorageOrder, typename PivIndex>
-struct partial_lu_impl
-{
-  // FIXME add a stride to Map, so that the following mapping becomes easier,
-  // another option would be to create an expression being able to automatically
-  // warp any Map, Matrix, and Block expressions as a unique type, but since that's exactly
-  // a Map + stride, why not adding a stride to Map, and convenient ctors from a Matrix,
-  // and Block.
-  typedef Map<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > MapLU;
-  typedef Block<MapLU, Dynamic, Dynamic> MatrixType;
-  typedef Block<MatrixType,Dynamic,Dynamic> BlockType;
+template <typename Scalar, int StorageOrder, typename PivIndex, int SizeAtCompileTime = Dynamic>
+struct partial_lu_impl {
+  static constexpr int UnBlockedBound = 16;
+  static constexpr bool UnBlockedAtCompileTime = SizeAtCompileTime != Dynamic && SizeAtCompileTime <= UnBlockedBound;
+  static constexpr int ActualSizeAtCompileTime = UnBlockedAtCompileTime ? SizeAtCompileTime : Dynamic;
+  // Remaining rows and columns at compile-time:
+  static constexpr int RRows = SizeAtCompileTime == 2 ? 1 : Dynamic;
+  static constexpr int RCols = SizeAtCompileTime == 2 ? 1 : Dynamic;
+  typedef Matrix<Scalar, ActualSizeAtCompileTime, ActualSizeAtCompileTime, StorageOrder> MatrixType;
+  typedef Ref<MatrixType> MatrixTypeRef;
+  typedef Ref<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > BlockType;
   typedef typename MatrixType::RealScalar RealScalar;
-  typedef typename MatrixType::Index Index;
 
   /** \internal performs the LU decomposition in-place of the matrix \a lu
-    * using an unblocked algorithm.
-    *
-    * In addition, this function returns the row transpositions in the
-    * vector \a row_transpositions which must have a size equal to the number
-    * of columns of the matrix \a lu, and an integer \a nb_transpositions
-    * which returns the actual number of transpositions.
-    *
-    * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise.
-    */
-  static Index unblocked_lu(MatrixType& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions)
-  {
+   * using an unblocked algorithm.
+   *
+   * In addition, this function returns the row transpositions in the
+   * vector \a row_transpositions which must have a size equal to the number
+   * of columns of the matrix \a lu, and an integer \a nb_transpositions
+   * which returns the actual number of transpositions.
+   *
+   * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise.
+   */
+  static Index unblocked_lu(MatrixTypeRef& lu, PivIndex* row_transpositions, PivIndex& nb_transpositions) {
+    typedef scalar_score_coeff_op<Scalar> Scoring;
+    typedef typename Scoring::result_type Score;
     const Index rows = lu.rows();
     const Index cols = lu.cols();
-    const Index size = (std::min)(rows,cols);
+    const Index size = (std::min)(rows, cols);
+    // For small compile-time matrices it is worth processing the last row separately:
+    //  speedup: +100% for 2x2, +10% for others.
+    const Index endk = UnBlockedAtCompileTime ? size - 1 : size;
     nb_transpositions = 0;
     Index first_zero_pivot = -1;
-    for(Index k = 0; k < size; ++k)
-    {
-      Index rrows = rows-k-1;
-      Index rcols = cols-k-1;
-        
+    for (Index k = 0; k < endk; ++k) {
+      int rrows = internal::convert_index<int>(rows - k - 1);
+      int rcols = internal::convert_index<int>(cols - k - 1);
+
       Index row_of_biggest_in_col;
-      RealScalar biggest_in_corner
-        = lu.col(k).tail(rows-k).cwiseAbs().maxCoeff(&row_of_biggest_in_col);
+      Score biggest_in_corner = lu.col(k).tail(rows - k).unaryExpr(Scoring()).maxCoeff(&row_of_biggest_in_col);
       row_of_biggest_in_col += k;
 
       row_transpositions[k] = PivIndex(row_of_biggest_in_col);
 
-      if(biggest_in_corner != RealScalar(0))
-      {
-        if(k != row_of_biggest_in_col)
-        {
+      if (!numext::is_exactly_zero(biggest_in_corner)) {
+        if (k != row_of_biggest_in_col) {
           lu.row(k).swap(lu.row(row_of_biggest_in_col));
           ++nb_transpositions;
         }
 
-        // FIXME shall we introduce a safe quotient expression in cas 1/lu.coeff(k,k)
-        // overflow but not the actual quotient?
-        lu.col(k).tail(rrows) /= lu.coeff(k,k);
-      }
-      else if(first_zero_pivot==-1)
-      {
+        lu.col(k).tail(fix<RRows>(rrows)) /= lu.coeff(k, k);
+      } else if (first_zero_pivot == -1) {
         // the pivot is exactly zero, we record the index of the first pivot which is exactly 0,
         // and continue the factorization such we still have A = PLU
         first_zero_pivot = k;
       }
 
-      if(k<rows-1)
-        lu.bottomRightCorner(rrows,rcols).noalias() -= lu.col(k).tail(rrows) * lu.row(k).tail(rcols);
+      if (k < rows - 1)
+        lu.bottomRightCorner(fix<RRows>(rrows), fix<RCols>(rcols)).noalias() -=
+            lu.col(k).tail(fix<RRows>(rrows)) * lu.row(k).tail(fix<RCols>(rcols));
     }
+
+    // special handling of the last entry
+    if (UnBlockedAtCompileTime) {
+      Index k = endk;
+      row_transpositions[k] = PivIndex(k);
+      if (numext::is_exactly_zero(Scoring()(lu(k, k))) && first_zero_pivot == -1) first_zero_pivot = k;
+    }
+
     return first_zero_pivot;
   }
 
   /** \internal performs the LU decomposition in-place of the matrix represented
-    * by the variables \a rows, \a cols, \a lu_data, and \a lu_stride using a
-    * recursive, blocked algorithm.
-    *
-    * In addition, this function returns the row transpositions in the
-    * vector \a row_transpositions which must have a size equal to the number
-    * of columns of the matrix \a lu, and an integer \a nb_transpositions
-    * which returns the actual number of transpositions.
-    *
-    * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise.
-    *
-    * \note This very low level interface using pointers, etc. is to:
-    *   1 - reduce the number of instanciations to the strict minimum
-    *   2 - avoid infinite recursion of the instanciations with Block<Block<Block<...> > >
-    */
-  static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions, PivIndex& nb_transpositions, Index maxBlockSize=256)
-  {
-    MapLU lu1(lu_data,StorageOrder==RowMajor?rows:luStride,StorageOrder==RowMajor?luStride:cols);
-    MatrixType lu(lu1,0,0,rows,cols);
-
-    const Index size = (std::min)(rows,cols);
+   * by the variables \a rows, \a cols, \a lu_data, and \a lu_stride using a
+   * recursive, blocked algorithm.
+   *
+   * In addition, this function returns the row transpositions in the
+   * vector \a row_transpositions which must have a size equal to the number
+   * of columns of the matrix \a lu, and an integer \a nb_transpositions
+   * which returns the actual number of transpositions.
+   *
+   * \returns The index of the first pivot which is exactly zero if any, or a negative number otherwise.
+   *
+   * \note This very low level interface using pointers, etc. is to:
+   *   1 - reduce the number of instantiations to the strict minimum
+   *   2 - avoid infinite recursion of the instantiations with Block<Block<Block<...> > >
+   */
+  static Index blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, PivIndex* row_transpositions,
+                          PivIndex& nb_transpositions, Index maxBlockSize = 256) {
+    MatrixTypeRef lu = MatrixType::Map(lu_data, rows, cols, OuterStride<>(luStride));
+
+    const Index size = (std::min)(rows, cols);
 
     // if the matrix is too small, no blocking:
-    if(size<=16)
-    {
+    if (UnBlockedAtCompileTime || size <= UnBlockedBound) {
       return unblocked_lu(lu, row_transpositions, nb_transpositions);
     }
 
@@ -318,51 +414,46 @@ struct partial_lu_impl
     // of the matrix so that there is enough sub blocks:
     Index blockSize;
     {
-      blockSize = size/8;
-      blockSize = (blockSize/16)*16;
-      blockSize = (std::min)((std::max)(blockSize,Index(8)), maxBlockSize);
+      blockSize = size / 8;
+      blockSize = (blockSize / 16) * 16;
+      blockSize = (std::min)((std::max)(blockSize, Index(8)), maxBlockSize);
     }
 
     nb_transpositions = 0;
     Index first_zero_pivot = -1;
-    for(Index k = 0; k < size; k+=blockSize)
-    {
-      Index bs = (std::min)(size-k,blockSize); // actual size of the block
-      Index trows = rows - k - bs; // trailing rows
-      Index tsize = size - k - bs; // trailing size
+    for (Index k = 0; k < size; k += blockSize) {
+      Index bs = (std::min)(size - k, blockSize);  // actual size of the block
+      Index trows = rows - k - bs;                 // trailing rows
+      Index tsize = size - k - bs;                 // trailing size
 
       // partition the matrix:
       //                          A00 | A01 | A02
       // lu  = A_0 | A_1 | A_2 =  A10 | A11 | A12
       //                          A20 | A21 | A22
-      BlockType A_0(lu,0,0,rows,k);
-      BlockType A_2(lu,0,k+bs,rows,tsize);
-      BlockType A11(lu,k,k,bs,bs);
-      BlockType A12(lu,k,k+bs,bs,tsize);
-      BlockType A21(lu,k+bs,k,trows,bs);
-      BlockType A22(lu,k+bs,k+bs,trows,tsize);
+      BlockType A_0 = lu.block(0, 0, rows, k);
+      BlockType A_2 = lu.block(0, k + bs, rows, tsize);
+      BlockType A11 = lu.block(k, k, bs, bs);
+      BlockType A12 = lu.block(k, k + bs, bs, tsize);
+      BlockType A21 = lu.block(k + bs, k, trows, bs);
+      BlockType A22 = lu.block(k + bs, k + bs, trows, tsize);
 
       PivIndex nb_transpositions_in_panel;
       // recursively call the blocked LU algorithm on [A11^T A21^T]^T
       // with a very small blocking size:
-      Index ret = blocked_lu(trows+bs, bs, &lu.coeffRef(k,k), luStride,
-                   row_transpositions+k, nb_transpositions_in_panel, 16);
-      if(ret>=0 && first_zero_pivot==-1)
-        first_zero_pivot = k+ret;
+      Index ret = blocked_lu(trows + bs, bs, &lu.coeffRef(k, k), luStride, row_transpositions + k,
+                             nb_transpositions_in_panel, 16);
+      if (ret >= 0 && first_zero_pivot == -1) first_zero_pivot = k + ret;
 
       nb_transpositions += nb_transpositions_in_panel;
       // update permutations and apply them to A_0
-      for(Index i=k; i<k+bs; ++i)
-      {
-        Index piv = (row_transpositions[i] += k);
+      for (Index i = k; i < k + bs; ++i) {
+        Index piv = (row_transpositions[i] += internal::convert_index<PivIndex>(k));
         A_0.row(i).swap(A_0.row(piv));
       }
 
-      if(trows)
-      {
+      if (trows) {
         // apply permutations to A_2
-        for(Index i=k;i<k+bs; ++i)
-          A_2.row(i).swap(A_2.row(row_transpositions[i]));
+        for (Index i = k; i < k + bs; ++i) A_2.row(i).swap(A_2.row(row_transpositions[i]));
 
         // A12 = A11^-1 A12
         A11.template triangularView<UnitLower>().solveInPlace(A12);
@@ -375,48 +466,54 @@ struct partial_lu_impl
 };
 
 /** \internal performs the LU decomposition with partial pivoting in-place.
-  */
-template<typename MatrixType, typename TranspositionType>
-void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions, typename TranspositionType::Index& nb_transpositions)
-{
+ */
+template <typename MatrixType, typename TranspositionType>
+void partial_lu_inplace(MatrixType& lu, TranspositionType& row_transpositions,
+                        typename TranspositionType::StorageIndex& nb_transpositions) {
+  // Special-case of zero matrix.
+  if (lu.rows() == 0 || lu.cols() == 0) {
+    nb_transpositions = 0;
+    return;
+  }
   eigen_assert(lu.cols() == row_transpositions.size());
-  eigen_assert((&row_transpositions.coeffRef(1)-&row_transpositions.coeffRef(0)) == 1);
-
-  partial_lu_impl
-    <typename MatrixType::Scalar, MatrixType::Flags&RowMajorBit?RowMajor:ColMajor, typename TranspositionType::Index>
-    ::blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0,0), lu.outerStride(), &row_transpositions.coeffRef(0), nb_transpositions);
+  eigen_assert(row_transpositions.size() < 2 ||
+               (&row_transpositions.coeffRef(1) - &row_transpositions.coeffRef(0)) == 1);
+
+  partial_lu_impl<typename MatrixType::Scalar, MatrixType::Flags & RowMajorBit ? RowMajor : ColMajor,
+                  typename TranspositionType::StorageIndex,
+                  internal::min_size_prefer_fixed(MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime)>::
+      blocked_lu(lu.rows(), lu.cols(), &lu.coeffRef(0, 0), lu.outerStride(), &row_transpositions.coeffRef(0),
+                 nb_transpositions);
 }
 
-} // end namespace internal
+}  // end namespace internal
+
+template <typename MatrixType, typename PermutationIndex>
+void PartialPivLU<MatrixType, PermutationIndex>::compute() {
+  eigen_assert(m_lu.rows() < NumTraits<PermutationIndex>::highest());
 
-template<typename MatrixType>
-PartialPivLU<MatrixType>& PartialPivLU<MatrixType>::compute(const MatrixType& matrix)
-{
-  check_template_parameters();
-  
-  // the row permutation is stored as int indices, so just to be sure:
-  eigen_assert(matrix.rows()<NumTraits<int>::highest());
-  
-  m_lu = matrix;
+  if (m_lu.cols() > 0)
+    m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff();
+  else
+    m_l1_norm = RealScalar(0);
 
-  eigen_assert(matrix.rows() == matrix.cols() && "PartialPivLU is only for square (and moreover invertible) matrices");
-  const Index size = matrix.rows();
+  eigen_assert(m_lu.rows() == m_lu.cols() && "PartialPivLU is only for square (and moreover invertible) matrices");
+  const Index size = m_lu.rows();
 
   m_rowsTranspositions.resize(size);
 
-  typename TranspositionType::Index nb_transpositions;
+  typename TranspositionType::StorageIndex nb_transpositions;
   internal::partial_lu_inplace(m_lu, m_rowsTranspositions, nb_transpositions);
-  m_det_p = (nb_transpositions%2) ? -1 : 1;
+  m_det_p = (nb_transpositions % 2) ? -1 : 1;
 
   m_p = m_rowsTranspositions;
 
   m_isInitialized = true;
-  return *this;
 }
 
-template<typename MatrixType>
-typename internal::traits<MatrixType>::Scalar PartialPivLU<MatrixType>::determinant() const
-{
+template <typename MatrixType, typename PermutationIndex>
+typename PartialPivLU<MatrixType, PermutationIndex>::Scalar PartialPivLU<MatrixType, PermutationIndex>::determinant()
+    const {
   eigen_assert(m_isInitialized && "PartialPivLU is not initialized.");
   return Scalar(m_det_p) * m_lu.diagonal().prod();
 }
@@ -424,13 +521,11 @@ typename internal::traits<MatrixType>::Scalar PartialPivLU<MatrixType>::determin
 /** \returns the matrix represented by the decomposition,
  * i.e., it returns the product: P^{-1} L U.
  * This function is provided for debug purpose. */
-template<typename MatrixType>
-MatrixType PartialPivLU<MatrixType>::reconstructedMatrix() const
-{
+template <typename MatrixType, typename PermutationIndex>
+MatrixType PartialPivLU<MatrixType, PermutationIndex>::reconstructedMatrix() const {
   eigen_assert(m_isInitialized && "LU is not initialized.");
   // LU
-  MatrixType res = m_lu.template triangularView<UnitLower>().toDenseMatrix()
-                 * m_lu.template triangularView<Upper>();
+  MatrixType res = m_lu.template triangularView<UnitLower>().toDenseMatrix() * m_lu.template triangularView<Upper>();
 
   // P^{-1}(LU)
   res = m_p.inverse() * res;
@@ -438,72 +533,54 @@ MatrixType PartialPivLU<MatrixType>::reconstructedMatrix() const
   return res;
 }
 
-/***** Implementation of solve() *****************************************************/
+/***** Implementation details *****************************************************/
 
 namespace internal {
 
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<PartialPivLU<_MatrixType>, Rhs>
-  : solve_retval_base<PartialPivLU<_MatrixType>, Rhs>
-{
-  EIGEN_MAKE_SOLVE_HELPERS(PartialPivLU<_MatrixType>,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    /* The decomposition PA = LU can be rewritten as A = P^{-1} L U.
-    * So we proceed as follows:
-    * Step 1: compute c = Pb.
-    * Step 2: replace c by the solution x to Lx = c.
-    * Step 3: replace c by the solution x to Ux = c.
-    */
-
-    eigen_assert(rhs().rows() == dec().matrixLU().rows());
-
-    // Step 1
-    dst = dec().permutationP() * rhs();
-
-    // Step 2
-    dec().matrixLU().template triangularView<UnitLower>().solveInPlace(dst);
-
-    // Step 3
-    dec().matrixLU().template triangularView<Upper>().solveInPlace(dst);
+/***** Implementation of inverse() *****************************************************/
+template <typename DstXprType, typename MatrixType, typename PermutationIndex>
+struct Assignment<
+    DstXprType, Inverse<PartialPivLU<MatrixType, PermutationIndex> >,
+    internal::assign_op<typename DstXprType::Scalar, typename PartialPivLU<MatrixType, PermutationIndex>::Scalar>,
+    Dense2Dense> {
+  typedef PartialPivLU<MatrixType, PermutationIndex> LuType;
+  typedef Inverse<LuType> SrcXprType;
+  static void run(DstXprType& dst, const SrcXprType& src,
+                  const internal::assign_op<typename DstXprType::Scalar, typename LuType::Scalar>&) {
+    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
   }
 };
-
-} // end namespace internal
+}  // end namespace internal
 
 /******** MatrixBase methods *******/
 
 /** \lu_module
-  *
-  * \return the partial-pivoting LU decomposition of \c *this.
-  *
-  * \sa class PartialPivLU
-  */
-template<typename Derived>
-inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::partialPivLu() const
-{
-  return PartialPivLU<PlainObject>(eval());
+ *
+ * \return the partial-pivoting LU decomposition of \c *this.
+ *
+ * \sa class PartialPivLU
+ */
+template <typename Derived>
+template <typename PermutationIndex>
+inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject, PermutationIndex>
+MatrixBase<Derived>::partialPivLu() const {
+  return PartialPivLU<PlainObject, PermutationIndex>(eval());
 }
 
-#if EIGEN2_SUPPORT_STAGE > STAGE20_RESOLVE_API_CONFLICTS
 /** \lu_module
-  *
-  * Synonym of partialPivLu().
-  *
-  * \return the partial-pivoting LU decomposition of \c *this.
-  *
-  * \sa class PartialPivLU
-  */
-template<typename Derived>
-inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::lu() const
-{
-  return PartialPivLU<PlainObject>(eval());
+ *
+ * Synonym of partialPivLu().
+ *
+ * \return the partial-pivoting LU decomposition of \c *this.
+ *
+ * \sa class PartialPivLU
+ */
+template <typename Derived>
+template <typename PermutationIndex>
+inline const PartialPivLU<typename MatrixBase<Derived>::PlainObject, PermutationIndex> MatrixBase<Derived>::lu() const {
+  return PartialPivLU<PlainObject, PermutationIndex>(eval());
 }
-#endif
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_PARTIALLU_H
+#endif  // EIGEN_PARTIALLU_H
diff --git a/inst/include/Eigen/src/LU/PartialPivLU_LAPACKE.h b/inst/include/Eigen/src/LU/PartialPivLU_LAPACKE.h
new file mode 100644
index 00000000..086c3c3c
--- /dev/null
+++ b/inst/include/Eigen/src/LU/PartialPivLU_LAPACKE.h
@@ -0,0 +1,97 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to LAPACKe
+ *     LU decomposition with partial pivoting based on LAPACKE_?getrf function.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_PARTIALLU_LAPACK_H
+#define EIGEN_PARTIALLU_LAPACK_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+namespace lapacke_helpers {
+// -------------------------------------------------------------------------------------------------------------------
+//        Generic lapacke partial lu implementation that converts arguments and dispatches to the function above
+// -------------------------------------------------------------------------------------------------------------------
+
+template <typename Scalar, int StorageOrder>
+struct lapacke_partial_lu {
+  /** \internal performs the LU decomposition in-place of the matrix represented */
+  static lapack_int blocked_lu(Index rows, Index cols, Scalar* lu_data, Index luStride, lapack_int* row_transpositions,
+                               lapack_int& nb_transpositions, lapack_int maxBlockSize = 256) {
+    EIGEN_UNUSED_VARIABLE(maxBlockSize);
+    // Set up parameters for getrf
+    lapack_int matrix_order = StorageOrder == RowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR;
+    lapack_int lda = to_lapack(luStride);
+    Scalar* a = lu_data;
+    lapack_int* ipiv = row_transpositions;
+    lapack_int m = to_lapack(rows);
+    lapack_int n = to_lapack(cols);
+    nb_transpositions = 0;
+
+    lapack_int info = getrf(matrix_order, m, n, to_lapack(a), lda, ipiv);
+    eigen_assert(info >= 0);
+
+    for (int i = 0; i < m; i++) {
+      ipiv[i]--;
+      if (ipiv[i] != i) nb_transpositions++;
+    }
+    lapack_int first_zero_pivot = info;
+    return first_zero_pivot;
+  }
+};
+}  // end namespace lapacke_helpers
+
+/*
+ * Here, we just put the generic implementation from lapacke_partial_lu into a partial specialization of the
+ * partial_lu_impl type. This specialization is more specialized than the generic implementations that Eigen implements,
+ * so if the Scalar type matches they will be chosen.
+ */
+#define EIGEN_LAPACKE_PARTIAL_LU(EIGTYPE)                            \
+  template <int StorageOrder>                                        \
+  struct partial_lu_impl<EIGTYPE, StorageOrder, lapack_int, Dynamic> \
+      : public lapacke_helpers::lapacke_partial_lu<EIGTYPE, StorageOrder> {};
+
+EIGEN_LAPACKE_PARTIAL_LU(double)
+EIGEN_LAPACKE_PARTIAL_LU(float)
+EIGEN_LAPACKE_PARTIAL_LU(std::complex<double>)
+EIGEN_LAPACKE_PARTIAL_LU(std::complex<float>)
+
+#undef EIGEN_LAPACKE_PARTIAL_LU
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_PARTIALLU_LAPACK_H
diff --git a/inst/include/Eigen/src/LU/PartialPivLU_MKL.h b/inst/include/Eigen/src/LU/PartialPivLU_MKL.h
deleted file mode 100644
index 9035953c..00000000
--- a/inst/include/Eigen/src/LU/PartialPivLU_MKL.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- Copyright (c) 2011, Intel Corporation. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
- * Neither the name of Intel Corporation nor the names of its contributors may
-   be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
- *     LU decomposition with partial pivoting based on LAPACKE_?getrf function.
- ********************************************************************************
-*/
-
-#ifndef EIGEN_PARTIALLU_LAPACK_H
-#define EIGEN_PARTIALLU_LAPACK_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
-
-namespace Eigen { 
-
-namespace internal {
-
-/** \internal Specialization for the data types supported by MKL */
-
-#define EIGEN_MKL_LU_PARTPIV(EIGTYPE, MKLTYPE, MKLPREFIX) \
-template<int StorageOrder> \
-struct partial_lu_impl<EIGTYPE, StorageOrder, lapack_int> \
-{ \
-  /* \internal performs the LU decomposition in-place of the matrix represented */ \
-  static lapack_int blocked_lu(lapack_int rows, lapack_int cols, EIGTYPE* lu_data, lapack_int luStride, lapack_int* row_transpositions, lapack_int& nb_transpositions, lapack_int maxBlockSize=256) \
-  { \
-    EIGEN_UNUSED_VARIABLE(maxBlockSize);\
-    lapack_int matrix_order, first_zero_pivot; \
-    lapack_int m, n, lda, *ipiv, info; \
-    EIGTYPE* a; \
-/* Set up parameters for ?getrf */ \
-    matrix_order = StorageOrder==RowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
-    lda = luStride; \
-    a = lu_data; \
-    ipiv = row_transpositions; \
-    m = rows; \
-    n = cols; \
-    nb_transpositions = 0; \
-\
-    info = LAPACKE_##MKLPREFIX##getrf( matrix_order, m, n, (MKLTYPE*)a, lda, ipiv ); \
-\
-    for(int i=0;i<m;i++) { ipiv[i]--; if (ipiv[i]!=i) nb_transpositions++; } \
-\
-    eigen_assert(info >= 0); \
-/* something should be done with nb_transpositions */ \
-\
-    first_zero_pivot = info; \
-    return first_zero_pivot; \
-  } \
-};
-
-EIGEN_MKL_LU_PARTPIV(double, double, d)
-EIGEN_MKL_LU_PARTPIV(float, float, s)
-EIGEN_MKL_LU_PARTPIV(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_LU_PARTPIV(scomplex, MKL_Complex8, c)
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_PARTIALLU_LAPACK_H
diff --git a/inst/include/Eigen/src/LU/arch/InverseSize4.h b/inst/include/Eigen/src/LU/arch/InverseSize4.h
new file mode 100644
index 00000000..29c9b036
--- /dev/null
+++ b/inst/include/Eigen/src/LU/arch/InverseSize4.h
@@ -0,0 +1,353 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2001 Intel Corporation
+// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+//
+// The algorithm below is a reimplementation of former \src\LU\Inverse_SSE.h using PacketMath.
+// inv(M) = M#/|M|, where inv(M), M# and |M| denote the inverse of M,
+// adjugate of M and determinant of M respectively. M# is computed block-wise
+// using specific formulae. For proof, see:
+// https://lxjk.github.io/2017/09/03/Fast-4x4-Matrix-Inverse-with-SSE-SIMD-Explained.html
+// Variable names are adopted from \src\LU\Inverse_SSE.h.
+//
+// The SSE code for the 4x4 float and double matrix inverse in former (deprecated) \src\LU\Inverse_SSE.h
+// comes from the following Intel's library:
+// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
+//
+// Here is the respective copyright and license statement:
+//
+//   Copyright (c) 2001 Intel Corporation.
+//
+// Permission is granted to use, copy, distribute and prepare derivative works
+// of this library for any purpose and without fee, provided, that the above
+// copyright notice and this statement appear in all copies.
+// Intel makes no representations about the suitability of this software for
+// any purpose, and specifically disclaims all warranties.
+// See LEGAL.TXT for all the legal information.
+//
+// TODO: Unify implementations of different data types (i.e. float and double).
+#ifndef EIGEN_INVERSE_SIZE_4_H
+#define EIGEN_INVERSE_SIZE_4_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+#if EIGEN_COMP_GNUC_STRICT
+// These routines requires bit manipulation of the sign, which is not compatible
+// with fastmath.
+#pragma GCC push_options
+#pragma GCC optimize("no-fast-math")
+#endif
+
+namespace Eigen {
+namespace internal {
+template <typename MatrixType, typename ResultType>
+struct compute_inverse_size4<Architecture::Target, float, MatrixType, ResultType> {
+  enum {
+    MatrixAlignment = traits<MatrixType>::Alignment,
+    ResultAlignment = traits<ResultType>::Alignment,
+    StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
+  };
+  typedef std::conditional_t<(MatrixType::Flags & LinearAccessBit), MatrixType const &,
+                             typename MatrixType::PlainObject>
+      ActualMatrixType;
+
+  static void run(const MatrixType &mat, ResultType &result) {
+    ActualMatrixType matrix(mat);
+
+    const float *data = matrix.data();
+    const Index stride = matrix.innerStride();
+    Packet4f L1 = ploadt<Packet4f, MatrixAlignment>(data);
+    Packet4f L2 = ploadt<Packet4f, MatrixAlignment>(data + stride * 4);
+    Packet4f L3 = ploadt<Packet4f, MatrixAlignment>(data + stride * 8);
+    Packet4f L4 = ploadt<Packet4f, MatrixAlignment>(data + stride * 12);
+
+    // Four 2x2 sub-matrices of the input matrix
+    // input = [[A, B],
+    //          [C, D]]
+    Packet4f A, B, C, D;
+
+    if (!StorageOrdersMatch) {
+      A = vec4f_unpacklo(L1, L2);
+      B = vec4f_unpacklo(L3, L4);
+      C = vec4f_unpackhi(L1, L2);
+      D = vec4f_unpackhi(L3, L4);
+    } else {
+      A = vec4f_movelh(L1, L2);
+      B = vec4f_movehl(L2, L1);
+      C = vec4f_movelh(L3, L4);
+      D = vec4f_movehl(L4, L3);
+    }
+
+    Packet4f AB, DC;
+
+    // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
+    AB = pmul(vec4f_swizzle2(A, A, 3, 3, 0, 0), B);
+    AB = psub(AB, pmul(vec4f_swizzle2(A, A, 1, 1, 2, 2), vec4f_swizzle2(B, B, 2, 3, 0, 1)));
+
+    // DC = D#*C
+    DC = pmul(vec4f_swizzle2(D, D, 3, 3, 0, 0), C);
+    DC = psub(DC, pmul(vec4f_swizzle2(D, D, 1, 1, 2, 2), vec4f_swizzle2(C, C, 2, 3, 0, 1)));
+
+    // determinants of the sub-matrices
+    Packet4f dA, dB, dC, dD;
+
+    dA = pmul(vec4f_swizzle2(A, A, 3, 3, 1, 1), A);
+    dA = psub(dA, vec4f_movehl(dA, dA));
+
+    dB = pmul(vec4f_swizzle2(B, B, 3, 3, 1, 1), B);
+    dB = psub(dB, vec4f_movehl(dB, dB));
+
+    dC = pmul(vec4f_swizzle2(C, C, 3, 3, 1, 1), C);
+    dC = psub(dC, vec4f_movehl(dC, dC));
+
+    dD = pmul(vec4f_swizzle2(D, D, 3, 3, 1, 1), D);
+    dD = psub(dD, vec4f_movehl(dD, dD));
+
+    Packet4f d, d1, d2;
+
+    d = pmul(vec4f_swizzle2(DC, DC, 0, 2, 1, 3), AB);
+    d = padd(d, vec4f_movehl(d, d));
+    d = padd(d, vec4f_swizzle2(d, d, 1, 0, 0, 0));
+    d1 = pmul(dA, dD);
+    d2 = pmul(dB, dC);
+
+    // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
+    Packet4f det = vec4f_duplane(psub(padd(d1, d2), d), 0);
+
+    // reciprocal of the determinant of the input matrix, rd = 1/det
+    Packet4f rd = preciprocal(det);
+
+    // Four sub-matrices of the inverse
+    Packet4f iA, iB, iC, iD;
+
+    // iD = D*|A| - C*A#*B
+    iD = pmul(vec4f_swizzle2(C, C, 0, 0, 2, 2), vec4f_movelh(AB, AB));
+    iD = padd(iD, pmul(vec4f_swizzle2(C, C, 1, 1, 3, 3), vec4f_movehl(AB, AB)));
+    iD = psub(pmul(D, vec4f_duplane(dA, 0)), iD);
+
+    // iA = A*|D| - B*D#*C
+    iA = pmul(vec4f_swizzle2(B, B, 0, 0, 2, 2), vec4f_movelh(DC, DC));
+    iA = padd(iA, pmul(vec4f_swizzle2(B, B, 1, 1, 3, 3), vec4f_movehl(DC, DC)));
+    iA = psub(pmul(A, vec4f_duplane(dD, 0)), iA);
+
+    // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
+    iB = pmul(D, vec4f_swizzle2(AB, AB, 3, 0, 3, 0));
+    iB = psub(iB, pmul(vec4f_swizzle2(D, D, 1, 0, 3, 2), vec4f_swizzle2(AB, AB, 2, 1, 2, 1)));
+    iB = psub(pmul(C, vec4f_duplane(dB, 0)), iB);
+
+    // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
+    iC = pmul(A, vec4f_swizzle2(DC, DC, 3, 0, 3, 0));
+    iC = psub(iC, pmul(vec4f_swizzle2(A, A, 1, 0, 3, 2), vec4f_swizzle2(DC, DC, 2, 1, 2, 1)));
+    iC = psub(pmul(B, vec4f_duplane(dC, 0)), iC);
+
+    EIGEN_ALIGN_MAX const float sign_mask[4] = {0.0f, -0.0f, -0.0f, 0.0f};
+    const Packet4f p4f_sign_PNNP = pload<Packet4f>(sign_mask);
+    rd = pxor(rd, p4f_sign_PNNP);
+    iA = pmul(iA, rd);
+    iB = pmul(iB, rd);
+    iC = pmul(iC, rd);
+    iD = pmul(iD, rd);
+
+    Index res_stride = result.outerStride();
+    float *res = result.data();
+
+    pstoret<float, Packet4f, ResultAlignment>(res + 0, vec4f_swizzle2(iA, iB, 3, 1, 3, 1));
+    pstoret<float, Packet4f, ResultAlignment>(res + res_stride, vec4f_swizzle2(iA, iB, 2, 0, 2, 0));
+    pstoret<float, Packet4f, ResultAlignment>(res + 2 * res_stride, vec4f_swizzle2(iC, iD, 3, 1, 3, 1));
+    pstoret<float, Packet4f, ResultAlignment>(res + 3 * res_stride, vec4f_swizzle2(iC, iD, 2, 0, 2, 0));
+  }
+};
+
+#if !(defined EIGEN_VECTORIZE_NEON && !(EIGEN_ARCH_ARM64 && !EIGEN_APPLE_DOUBLE_NEON_BUG))
+// same algorithm as above, except that each operand is split into
+// halves for two registers to hold.
+template <typename MatrixType, typename ResultType>
+struct compute_inverse_size4<Architecture::Target, double, MatrixType, ResultType> {
+  enum {
+    MatrixAlignment = traits<MatrixType>::Alignment,
+    ResultAlignment = traits<ResultType>::Alignment,
+    StorageOrdersMatch = (MatrixType::Flags & RowMajorBit) == (ResultType::Flags & RowMajorBit)
+  };
+  typedef std::conditional_t<(MatrixType::Flags & LinearAccessBit), MatrixType const &,
+                             typename MatrixType::PlainObject>
+      ActualMatrixType;
+
+  static void run(const MatrixType &mat, ResultType &result) {
+    ActualMatrixType matrix(mat);
+
+    // Four 2x2 sub-matrices of the input matrix, each is further divided into upper and lower
+    // row e.g. A1, upper row of A, A2, lower row of A
+    // input = [[A, B],  =  [[[A1, [B1,
+    //          [C, D]]        A2], B2]],
+    //                       [[C1, [D1,
+    //                         C2], D2]]]
+
+    Packet2d A1, A2, B1, B2, C1, C2, D1, D2;
+
+    const double *data = matrix.data();
+    const Index stride = matrix.innerStride();
+    if (StorageOrdersMatch) {
+      A1 = ploadt<Packet2d, MatrixAlignment>(data + stride * 0);
+      B1 = ploadt<Packet2d, MatrixAlignment>(data + stride * 2);
+      A2 = ploadt<Packet2d, MatrixAlignment>(data + stride * 4);
+      B2 = ploadt<Packet2d, MatrixAlignment>(data + stride * 6);
+      C1 = ploadt<Packet2d, MatrixAlignment>(data + stride * 8);
+      D1 = ploadt<Packet2d, MatrixAlignment>(data + stride * 10);
+      C2 = ploadt<Packet2d, MatrixAlignment>(data + stride * 12);
+      D2 = ploadt<Packet2d, MatrixAlignment>(data + stride * 14);
+    } else {
+      Packet2d temp;
+      A1 = ploadt<Packet2d, MatrixAlignment>(data + stride * 0);
+      C1 = ploadt<Packet2d, MatrixAlignment>(data + stride * 2);
+      A2 = ploadt<Packet2d, MatrixAlignment>(data + stride * 4);
+      C2 = ploadt<Packet2d, MatrixAlignment>(data + stride * 6);
+      temp = A1;
+      A1 = vec2d_unpacklo(A1, A2);
+      A2 = vec2d_unpackhi(temp, A2);
+
+      temp = C1;
+      C1 = vec2d_unpacklo(C1, C2);
+      C2 = vec2d_unpackhi(temp, C2);
+
+      B1 = ploadt<Packet2d, MatrixAlignment>(data + stride * 8);
+      D1 = ploadt<Packet2d, MatrixAlignment>(data + stride * 10);
+      B2 = ploadt<Packet2d, MatrixAlignment>(data + stride * 12);
+      D2 = ploadt<Packet2d, MatrixAlignment>(data + stride * 14);
+
+      temp = B1;
+      B1 = vec2d_unpacklo(B1, B2);
+      B2 = vec2d_unpackhi(temp, B2);
+
+      temp = D1;
+      D1 = vec2d_unpacklo(D1, D2);
+      D2 = vec2d_unpackhi(temp, D2);
+    }
+
+    // determinants of the sub-matrices
+    Packet2d dA, dB, dC, dD;
+
+    dA = vec2d_swizzle2(A2, A2, 1);
+    dA = pmul(A1, dA);
+    dA = psub(dA, vec2d_duplane(dA, 1));
+
+    dB = vec2d_swizzle2(B2, B2, 1);
+    dB = pmul(B1, dB);
+    dB = psub(dB, vec2d_duplane(dB, 1));
+
+    dC = vec2d_swizzle2(C2, C2, 1);
+    dC = pmul(C1, dC);
+    dC = psub(dC, vec2d_duplane(dC, 1));
+
+    dD = vec2d_swizzle2(D2, D2, 1);
+    dD = pmul(D1, dD);
+    dD = psub(dD, vec2d_duplane(dD, 1));
+
+    Packet2d DC1, DC2, AB1, AB2;
+
+    // AB = A# * B, where A# denotes the adjugate of A, and * denotes matrix product.
+    AB1 = pmul(B1, vec2d_duplane(A2, 1));
+    AB2 = pmul(B2, vec2d_duplane(A1, 0));
+    AB1 = psub(AB1, pmul(B2, vec2d_duplane(A1, 1)));
+    AB2 = psub(AB2, pmul(B1, vec2d_duplane(A2, 0)));
+
+    // DC = D#*C
+    DC1 = pmul(C1, vec2d_duplane(D2, 1));
+    DC2 = pmul(C2, vec2d_duplane(D1, 0));
+    DC1 = psub(DC1, pmul(C2, vec2d_duplane(D1, 1)));
+    DC2 = psub(DC2, pmul(C1, vec2d_duplane(D2, 0)));
+
+    Packet2d d1, d2;
+
+    // determinant of the input matrix, det = |A||D| + |B||C| - trace(A#*B*D#*C)
+    Packet2d det;
+
+    // reciprocal of the determinant of the input matrix, rd = 1/det
+    Packet2d rd;
+
+    d1 = pmul(AB1, vec2d_swizzle2(DC1, DC2, 0));
+    d2 = pmul(AB2, vec2d_swizzle2(DC1, DC2, 3));
+    rd = padd(d1, d2);
+    rd = padd(rd, vec2d_duplane(rd, 1));
+
+    d1 = pmul(dA, dD);
+    d2 = pmul(dB, dC);
+
+    det = padd(d1, d2);
+    det = psub(det, rd);
+    det = vec2d_duplane(det, 0);
+    rd = pdiv(pset1<Packet2d>(1.0), det);
+
+    // rows of four sub-matrices of the inverse
+    Packet2d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2;
+
+    // iD = D*|A| - C*A#*B
+    iD1 = pmul(AB1, vec2d_duplane(C1, 0));
+    iD2 = pmul(AB1, vec2d_duplane(C2, 0));
+    iD1 = padd(iD1, pmul(AB2, vec2d_duplane(C1, 1)));
+    iD2 = padd(iD2, pmul(AB2, vec2d_duplane(C2, 1)));
+    dA = vec2d_duplane(dA, 0);
+    iD1 = psub(pmul(D1, dA), iD1);
+    iD2 = psub(pmul(D2, dA), iD2);
+
+    // iA = A*|D| - B*D#*C
+    iA1 = pmul(DC1, vec2d_duplane(B1, 0));
+    iA2 = pmul(DC1, vec2d_duplane(B2, 0));
+    iA1 = padd(iA1, pmul(DC2, vec2d_duplane(B1, 1)));
+    iA2 = padd(iA2, pmul(DC2, vec2d_duplane(B2, 1)));
+    dD = vec2d_duplane(dD, 0);
+    iA1 = psub(pmul(A1, dD), iA1);
+    iA2 = psub(pmul(A2, dD), iA2);
+
+    // iB = C*|B| - D * (A#B)# = C*|B| - D*B#*A
+    iB1 = pmul(D1, vec2d_swizzle2(AB2, AB1, 1));
+    iB2 = pmul(D2, vec2d_swizzle2(AB2, AB1, 1));
+    iB1 = psub(iB1, pmul(vec2d_swizzle2(D1, D1, 1), vec2d_swizzle2(AB2, AB1, 2)));
+    iB2 = psub(iB2, pmul(vec2d_swizzle2(D2, D2, 1), vec2d_swizzle2(AB2, AB1, 2)));
+    dB = vec2d_duplane(dB, 0);
+    iB1 = psub(pmul(C1, dB), iB1);
+    iB2 = psub(pmul(C2, dB), iB2);
+
+    // iC = B*|C| - A * (D#C)# = B*|C| - A*C#*D
+    iC1 = pmul(A1, vec2d_swizzle2(DC2, DC1, 1));
+    iC2 = pmul(A2, vec2d_swizzle2(DC2, DC1, 1));
+    iC1 = psub(iC1, pmul(vec2d_swizzle2(A1, A1, 1), vec2d_swizzle2(DC2, DC1, 2)));
+    iC2 = psub(iC2, pmul(vec2d_swizzle2(A2, A2, 1), vec2d_swizzle2(DC2, DC1, 2)));
+    dC = vec2d_duplane(dC, 0);
+    iC1 = psub(pmul(B1, dC), iC1);
+    iC2 = psub(pmul(B2, dC), iC2);
+
+    EIGEN_ALIGN_MAX const double sign_mask1[2] = {0.0, -0.0};
+    EIGEN_ALIGN_MAX const double sign_mask2[2] = {-0.0, 0.0};
+    const Packet2d sign_PN = pload<Packet2d>(sign_mask1);
+    const Packet2d sign_NP = pload<Packet2d>(sign_mask2);
+    d1 = pxor(rd, sign_PN);
+    d2 = pxor(rd, sign_NP);
+
+    Index res_stride = result.outerStride();
+    double *res = result.data();
+    pstoret<double, Packet2d, ResultAlignment>(res + 0, pmul(vec2d_swizzle2(iA2, iA1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + res_stride, pmul(vec2d_swizzle2(iA2, iA1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res + 2, pmul(vec2d_swizzle2(iB2, iB1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + res_stride + 2, pmul(vec2d_swizzle2(iB2, iB1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride, pmul(vec2d_swizzle2(iC2, iC1, 0), d2));
+    pstoret<double, Packet2d, ResultAlignment>(res + 2 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 3), d1));
+    pstoret<double, Packet2d, ResultAlignment>(res + 3 * res_stride + 2, pmul(vec2d_swizzle2(iD2, iD1, 0), d2));
+  }
+};
+#endif
+}  // namespace internal
+}  // namespace Eigen
+
+#if EIGEN_COMP_GNUC_STRICT
+#pragma GCC pop_options
+#endif
+
+#endif
diff --git a/inst/include/Eigen/src/LU/arch/Inverse_SSE.h b/inst/include/Eigen/src/LU/arch/Inverse_SSE.h
deleted file mode 100644
index 60b7a237..00000000
--- a/inst/include/Eigen/src/LU/arch/Inverse_SSE.h
+++ /dev/null
@@ -1,329 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2001 Intel Corporation
-// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// The SSE code for the 4x4 float and double matrix inverse in this file
-// comes from the following Intel's library:
-// http://software.intel.com/en-us/articles/optimized-matrix-library-for-use-with-the-intel-pentiumr-4-processors-sse2-instructions/
-//
-// Here is the respective copyright and license statement:
-//
-//   Copyright (c) 2001 Intel Corporation.
-//
-// Permition is granted to use, copy, distribute and prepare derivative works
-// of this library for any purpose and without fee, provided, that the above
-// copyright notice and this statement appear in all copies.
-// Intel makes no representations about the suitability of this software for
-// any purpose, and specifically disclaims all warranties.
-// See LEGAL.TXT for all the legal information.
-
-#ifndef EIGEN_INVERSE_SSE_H
-#define EIGEN_INVERSE_SSE_H
-
-namespace Eigen { 
-
-namespace internal {
-
-template<typename MatrixType, typename ResultType>
-struct compute_inverse_size4<Architecture::SSE, float, MatrixType, ResultType>
-{
-  enum {
-    MatrixAlignment     = bool(MatrixType::Flags&AlignedBit),
-    ResultAlignment     = bool(ResultType::Flags&AlignedBit),
-    StorageOrdersMatch  = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
-  };
-  
-  static void run(const MatrixType& matrix, ResultType& result)
-  {
-    EIGEN_ALIGN16 const unsigned int _Sign_PNNP[4] = { 0x00000000, 0x80000000, 0x80000000, 0x00000000 };
-
-    // Load the full matrix into registers
-    __m128 _L1 = matrix.template packet<MatrixAlignment>( 0);
-    __m128 _L2 = matrix.template packet<MatrixAlignment>( 4);
-    __m128 _L3 = matrix.template packet<MatrixAlignment>( 8);
-    __m128 _L4 = matrix.template packet<MatrixAlignment>(12);
-
-    // The inverse is calculated using "Divide and Conquer" technique. The
-    // original matrix is divide into four 2x2 sub-matrices. Since each
-    // register holds four matrix element, the smaller matrices are
-    // represented as a registers. Hence we get a better locality of the
-    // calculations.
-
-    __m128 A, B, C, D; // the four sub-matrices
-    if(!StorageOrdersMatch)
-    {
-      A = _mm_unpacklo_ps(_L1, _L2);
-      B = _mm_unpacklo_ps(_L3, _L4);
-      C = _mm_unpackhi_ps(_L1, _L2);
-      D = _mm_unpackhi_ps(_L3, _L4);
-    }
-    else
-    {
-      A = _mm_movelh_ps(_L1, _L2);
-      B = _mm_movehl_ps(_L2, _L1);
-      C = _mm_movelh_ps(_L3, _L4);
-      D = _mm_movehl_ps(_L4, _L3);
-    }
-
-    __m128 iA, iB, iC, iD,                 // partial inverse of the sub-matrices
-            DC, AB;
-    __m128 dA, dB, dC, dD;                 // determinant of the sub-matrices
-    __m128 det, d, d1, d2;
-    __m128 rd;                             // reciprocal of the determinant
-
-    //  AB = A# * B
-    AB = _mm_mul_ps(_mm_shuffle_ps(A,A,0x0F), B);
-    AB = _mm_sub_ps(AB,_mm_mul_ps(_mm_shuffle_ps(A,A,0xA5), _mm_shuffle_ps(B,B,0x4E)));
-    //  DC = D# * C
-    DC = _mm_mul_ps(_mm_shuffle_ps(D,D,0x0F), C);
-    DC = _mm_sub_ps(DC,_mm_mul_ps(_mm_shuffle_ps(D,D,0xA5), _mm_shuffle_ps(C,C,0x4E)));
-
-    //  dA = |A|
-    dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F),A);
-    dA = _mm_sub_ss(dA, _mm_movehl_ps(dA,dA));
-    //  dB = |B|
-    dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F),B);
-    dB = _mm_sub_ss(dB, _mm_movehl_ps(dB,dB));
-
-    //  dC = |C|
-    dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F),C);
-    dC = _mm_sub_ss(dC, _mm_movehl_ps(dC,dC));
-    //  dD = |D|
-    dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F),D);
-    dD = _mm_sub_ss(dD, _mm_movehl_ps(dD,dD));
-
-    //  d = trace(AB*DC) = trace(A#*B*D#*C)
-    d = _mm_mul_ps(_mm_shuffle_ps(DC,DC,0xD8),AB);
-
-    //  iD = C*A#*B
-    iD = _mm_mul_ps(_mm_shuffle_ps(C,C,0xA0), _mm_movelh_ps(AB,AB));
-    iD = _mm_add_ps(iD,_mm_mul_ps(_mm_shuffle_ps(C,C,0xF5), _mm_movehl_ps(AB,AB)));
-    //  iA = B*D#*C
-    iA = _mm_mul_ps(_mm_shuffle_ps(B,B,0xA0), _mm_movelh_ps(DC,DC));
-    iA = _mm_add_ps(iA,_mm_mul_ps(_mm_shuffle_ps(B,B,0xF5), _mm_movehl_ps(DC,DC)));
-
-    //  d = trace(AB*DC) = trace(A#*B*D#*C) [continue]
-    d  = _mm_add_ps(d, _mm_movehl_ps(d, d));
-    d  = _mm_add_ss(d, _mm_shuffle_ps(d, d, 1));
-    d1 = _mm_mul_ss(dA,dD);
-    d2 = _mm_mul_ss(dB,dC);
-
-    //  iD = D*|A| - C*A#*B
-    iD = _mm_sub_ps(_mm_mul_ps(D,_mm_shuffle_ps(dA,dA,0)), iD);
-
-    //  iA = A*|D| - B*D#*C;
-    iA = _mm_sub_ps(_mm_mul_ps(A,_mm_shuffle_ps(dD,dD,0)), iA);
-
-    //  det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C)
-    det = _mm_sub_ss(_mm_add_ss(d1,d2),d);
-    rd  = _mm_div_ss(_mm_set_ss(1.0f), det);
-
-//     #ifdef ZERO_SINGULAR
-//         rd = _mm_and_ps(_mm_cmpneq_ss(det,_mm_setzero_ps()), rd);
-//     #endif
-
-    //  iB = D * (A#B)# = D*B#*A
-    iB = _mm_mul_ps(D, _mm_shuffle_ps(AB,AB,0x33));
-    iB = _mm_sub_ps(iB, _mm_mul_ps(_mm_shuffle_ps(D,D,0xB1), _mm_shuffle_ps(AB,AB,0x66)));
-    //  iC = A * (D#C)# = A*C#*D
-    iC = _mm_mul_ps(A, _mm_shuffle_ps(DC,DC,0x33));
-    iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66)));
-
-    rd = _mm_shuffle_ps(rd,rd,0);
-    rd = _mm_xor_ps(rd, _mm_load_ps((float*)_Sign_PNNP));
-
-    //  iB = C*|B| - D*B#*A
-    iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB);
-
-    //  iC = B*|C| - A*C#*D;
-    iC = _mm_sub_ps(_mm_mul_ps(B,_mm_shuffle_ps(dC,dC,0)), iC);
-
-    //  iX = iX / det
-    iA = _mm_mul_ps(rd,iA);
-    iB = _mm_mul_ps(rd,iB);
-    iC = _mm_mul_ps(rd,iC);
-    iD = _mm_mul_ps(rd,iD);
-
-    result.template writePacket<ResultAlignment>( 0, _mm_shuffle_ps(iA,iB,0x77));
-    result.template writePacket<ResultAlignment>( 4, _mm_shuffle_ps(iA,iB,0x22));
-    result.template writePacket<ResultAlignment>( 8, _mm_shuffle_ps(iC,iD,0x77));
-    result.template writePacket<ResultAlignment>(12, _mm_shuffle_ps(iC,iD,0x22));
-  }
-
-};
-
-template<typename MatrixType, typename ResultType>
-struct compute_inverse_size4<Architecture::SSE, double, MatrixType, ResultType>
-{
-  enum {
-    MatrixAlignment = bool(MatrixType::Flags&AlignedBit),
-    ResultAlignment = bool(ResultType::Flags&AlignedBit),
-    StorageOrdersMatch  = (MatrixType::Flags&RowMajorBit) == (ResultType::Flags&RowMajorBit)
-  };
-  static void run(const MatrixType& matrix, ResultType& result)
-  {
-    const __m128d _Sign_NP = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0));
-    const __m128d _Sign_PN = _mm_castsi128_pd(_mm_set_epi32(0x80000000,0x0,0x0,0x0));
-
-    // The inverse is calculated using "Divide and Conquer" technique. The
-    // original matrix is divide into four 2x2 sub-matrices. Since each
-    // register of the matrix holds two element, the smaller matrices are
-    // consisted of two registers. Hence we get a better locality of the
-    // calculations.
-
-    // the four sub-matrices
-    __m128d A1, A2, B1, B2, C1, C2, D1, D2;
-    
-    if(StorageOrdersMatch)
-    {
-      A1 = matrix.template packet<MatrixAlignment>( 0); B1 = matrix.template packet<MatrixAlignment>( 2);
-      A2 = matrix.template packet<MatrixAlignment>( 4); B2 = matrix.template packet<MatrixAlignment>( 6);
-      C1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10);
-      C2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14);
-    }
-    else
-    {
-      __m128d tmp;
-      A1 = matrix.template packet<MatrixAlignment>( 0); C1 = matrix.template packet<MatrixAlignment>( 2);
-      A2 = matrix.template packet<MatrixAlignment>( 4); C2 = matrix.template packet<MatrixAlignment>( 6);
-      tmp = A1;
-      A1 = _mm_unpacklo_pd(A1,A2);
-      A2 = _mm_unpackhi_pd(tmp,A2);
-      tmp = C1;
-      C1 = _mm_unpacklo_pd(C1,C2);
-      C2 = _mm_unpackhi_pd(tmp,C2);
-      
-      B1 = matrix.template packet<MatrixAlignment>( 8); D1 = matrix.template packet<MatrixAlignment>(10);
-      B2 = matrix.template packet<MatrixAlignment>(12); D2 = matrix.template packet<MatrixAlignment>(14);
-      tmp = B1;
-      B1 = _mm_unpacklo_pd(B1,B2);
-      B2 = _mm_unpackhi_pd(tmp,B2);
-      tmp = D1;
-      D1 = _mm_unpacklo_pd(D1,D2);
-      D2 = _mm_unpackhi_pd(tmp,D2);
-    }
-    
-    __m128d iA1, iA2, iB1, iB2, iC1, iC2, iD1, iD2,     // partial invese of the sub-matrices
-            DC1, DC2, AB1, AB2;
-    __m128d dA, dB, dC, dD;     // determinant of the sub-matrices
-    __m128d det, d1, d2, rd;
-
-    //  dA = |A|
-    dA = _mm_shuffle_pd(A2, A2, 1);
-    dA = _mm_mul_pd(A1, dA);
-    dA = _mm_sub_sd(dA, _mm_shuffle_pd(dA,dA,3));
-    //  dB = |B|
-    dB = _mm_shuffle_pd(B2, B2, 1);
-    dB = _mm_mul_pd(B1, dB);
-    dB = _mm_sub_sd(dB, _mm_shuffle_pd(dB,dB,3));
-
-    //  AB = A# * B
-    AB1 = _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,3));
-    AB2 = _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,0));
-    AB1 = _mm_sub_pd(AB1, _mm_mul_pd(B2, _mm_shuffle_pd(A1,A1,3)));
-    AB2 = _mm_sub_pd(AB2, _mm_mul_pd(B1, _mm_shuffle_pd(A2,A2,0)));
-
-    //  dC = |C|
-    dC = _mm_shuffle_pd(C2, C2, 1);
-    dC = _mm_mul_pd(C1, dC);
-    dC = _mm_sub_sd(dC, _mm_shuffle_pd(dC,dC,3));
-    //  dD = |D|
-    dD = _mm_shuffle_pd(D2, D2, 1);
-    dD = _mm_mul_pd(D1, dD);
-    dD = _mm_sub_sd(dD, _mm_shuffle_pd(dD,dD,3));
-
-    //  DC = D# * C
-    DC1 = _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,3));
-    DC2 = _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,0));
-    DC1 = _mm_sub_pd(DC1, _mm_mul_pd(C2, _mm_shuffle_pd(D1,D1,3)));
-    DC2 = _mm_sub_pd(DC2, _mm_mul_pd(C1, _mm_shuffle_pd(D2,D2,0)));
-
-    //  rd = trace(AB*DC) = trace(A#*B*D#*C)
-    d1 = _mm_mul_pd(AB1, _mm_shuffle_pd(DC1, DC2, 0));
-    d2 = _mm_mul_pd(AB2, _mm_shuffle_pd(DC1, DC2, 3));
-    rd = _mm_add_pd(d1, d2);
-    rd = _mm_add_sd(rd, _mm_shuffle_pd(rd, rd,3));
-
-    //  iD = C*A#*B
-    iD1 = _mm_mul_pd(AB1, _mm_shuffle_pd(C1,C1,0));
-    iD2 = _mm_mul_pd(AB1, _mm_shuffle_pd(C2,C2,0));
-    iD1 = _mm_add_pd(iD1, _mm_mul_pd(AB2, _mm_shuffle_pd(C1,C1,3)));
-    iD2 = _mm_add_pd(iD2, _mm_mul_pd(AB2, _mm_shuffle_pd(C2,C2,3)));
-
-    //  iA = B*D#*C
-    iA1 = _mm_mul_pd(DC1, _mm_shuffle_pd(B1,B1,0));
-    iA2 = _mm_mul_pd(DC1, _mm_shuffle_pd(B2,B2,0));
-    iA1 = _mm_add_pd(iA1, _mm_mul_pd(DC2, _mm_shuffle_pd(B1,B1,3)));
-    iA2 = _mm_add_pd(iA2, _mm_mul_pd(DC2, _mm_shuffle_pd(B2,B2,3)));
-
-    //  iD = D*|A| - C*A#*B
-    dA = _mm_shuffle_pd(dA,dA,0);
-    iD1 = _mm_sub_pd(_mm_mul_pd(D1, dA), iD1);
-    iD2 = _mm_sub_pd(_mm_mul_pd(D2, dA), iD2);
-
-    //  iA = A*|D| - B*D#*C;
-    dD = _mm_shuffle_pd(dD,dD,0);
-    iA1 = _mm_sub_pd(_mm_mul_pd(A1, dD), iA1);
-    iA2 = _mm_sub_pd(_mm_mul_pd(A2, dD), iA2);
-
-    d1 = _mm_mul_sd(dA, dD);
-    d2 = _mm_mul_sd(dB, dC);
-
-    //  iB = D * (A#B)# = D*B#*A
-    iB1 = _mm_mul_pd(D1, _mm_shuffle_pd(AB2,AB1,1));
-    iB2 = _mm_mul_pd(D2, _mm_shuffle_pd(AB2,AB1,1));
-    iB1 = _mm_sub_pd(iB1, _mm_mul_pd(_mm_shuffle_pd(D1,D1,1), _mm_shuffle_pd(AB2,AB1,2)));
-    iB2 = _mm_sub_pd(iB2, _mm_mul_pd(_mm_shuffle_pd(D2,D2,1), _mm_shuffle_pd(AB2,AB1,2)));
-
-    //  det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C)
-    det = _mm_add_sd(d1, d2);
-    det = _mm_sub_sd(det, rd);
-
-    //  iC = A * (D#C)# = A*C#*D
-    iC1 = _mm_mul_pd(A1, _mm_shuffle_pd(DC2,DC1,1));
-    iC2 = _mm_mul_pd(A2, _mm_shuffle_pd(DC2,DC1,1));
-    iC1 = _mm_sub_pd(iC1, _mm_mul_pd(_mm_shuffle_pd(A1,A1,1), _mm_shuffle_pd(DC2,DC1,2)));
-    iC2 = _mm_sub_pd(iC2, _mm_mul_pd(_mm_shuffle_pd(A2,A2,1), _mm_shuffle_pd(DC2,DC1,2)));
-
-    rd = _mm_div_sd(_mm_set_sd(1.0), det);
-//     #ifdef ZERO_SINGULAR
-//         rd = _mm_and_pd(_mm_cmpneq_sd(det,_mm_setzero_pd()), rd);
-//     #endif
-    rd = _mm_shuffle_pd(rd,rd,0);
-
-    //  iB = C*|B| - D*B#*A
-    dB = _mm_shuffle_pd(dB,dB,0);
-    iB1 = _mm_sub_pd(_mm_mul_pd(C1, dB), iB1);
-    iB2 = _mm_sub_pd(_mm_mul_pd(C2, dB), iB2);
-
-    d1 = _mm_xor_pd(rd, _Sign_PN);
-    d2 = _mm_xor_pd(rd, _Sign_NP);
-
-    //  iC = B*|C| - A*C#*D;
-    dC = _mm_shuffle_pd(dC,dC,0);
-    iC1 = _mm_sub_pd(_mm_mul_pd(B1, dC), iC1);
-    iC2 = _mm_sub_pd(_mm_mul_pd(B2, dC), iC2);
-
-    result.template writePacket<ResultAlignment>( 0, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 3), d1));     // iA# / det
-    result.template writePacket<ResultAlignment>( 4, _mm_mul_pd(_mm_shuffle_pd(iA2, iA1, 0), d2));
-    result.template writePacket<ResultAlignment>( 2, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 3), d1));     // iB# / det
-    result.template writePacket<ResultAlignment>( 6, _mm_mul_pd(_mm_shuffle_pd(iB2, iB1, 0), d2));
-    result.template writePacket<ResultAlignment>( 8, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 3), d1));     // iC# / det
-    result.template writePacket<ResultAlignment>(12, _mm_mul_pd(_mm_shuffle_pd(iC2, iC1, 0), d2));
-    result.template writePacket<ResultAlignment>(10, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 3), d1));     // iD# / det
-    result.template writePacket<ResultAlignment>(14, _mm_mul_pd(_mm_shuffle_pd(iD2, iD1, 0), d2));
-  }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_INVERSE_SSE_H
diff --git a/inst/include/Eigen/src/MetisSupport/InternalHeaderCheck.h b/inst/include/Eigen/src/MetisSupport/InternalHeaderCheck.h
new file mode 100644
index 00000000..9d34825f
--- /dev/null
+++ b/inst/include/Eigen/src/MetisSupport/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_METISSUPPORT_MODULE_H
+#error "Please include Eigen/MetisSupport instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/MetisSupport/MetisSupport.h b/inst/include/Eigen/src/MetisSupport/MetisSupport.h
index f2bbef20..6c7bf946 100644
--- a/inst/include/Eigen/src/MetisSupport/MetisSupport.h
+++ b/inst/include/Eigen/src/MetisSupport/MetisSupport.h
@@ -9,129 +9,117 @@
 #ifndef METIS_SUPPORT_H
 #define METIS_SUPPORT_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 /**
  * Get the fill-reducing ordering from the METIS package
- * 
- * If A is the original matrix and Ap is the permuted matrix, 
+ *
+ * If A is the original matrix and Ap is the permuted matrix,
  * the fill-reducing permutation is defined as follows :
- * Row (column) i of A is the matperm(i) row (column) of Ap. 
+ * Row (column) i of A is the matperm(i) row (column) of Ap.
  * WARNING: As computed by METIS, this corresponds to the vector iperm (instead of perm)
  */
-template <typename Index>
-class MetisOrdering
-{
-public:
-  typedef PermutationMatrix<Dynamic,Dynamic,Index> PermutationType;
-  typedef Matrix<Index,Dynamic,1> IndexVector; 
-  
+template <typename StorageIndex>
+class MetisOrdering {
+ public:
+  typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType;
+  typedef Matrix<StorageIndex, Dynamic, 1> IndexVector;
+
   template <typename MatrixType>
-  void get_symmetrized_graph(const MatrixType& A)
-  {
-    Index m = A.cols(); 
+  void get_symmetrized_graph(const MatrixType& A) {
+    Index m = A.cols();
     eigen_assert((A.rows() == A.cols()) && "ONLY FOR SQUARED MATRICES");
-    // Get the transpose of the input matrix 
-    MatrixType At = A.transpose(); 
+    // Get the transpose of the input matrix
+    MatrixType At = A.transpose();
     // Get the number of nonzeros elements in each row/col of At+A
-    Index TotNz = 0; 
-    IndexVector visited(m); 
-    visited.setConstant(-1); 
-    for (int j = 0; j < m; j++)
-    {
+    Index TotNz = 0;
+    IndexVector visited(m);
+    visited.setConstant(-1);
+    for (StorageIndex j = 0; j < m; j++) {
       // Compute the union structure of of A(j,:) and At(j,:)
-      visited(j) = j; // Do not include the diagonal element
+      visited(j) = j;  // Do not include the diagonal element
       // Get the nonzeros in row/column j of A
-      for (typename MatrixType::InnerIterator it(A, j); it; ++it)
-      {
-        Index idx = it.index(); // Get the row index (for column major) or column index (for row major)
-        if (visited(idx) != j ) 
-        {
-          visited(idx) = j; 
-          ++TotNz; 
+      for (typename MatrixType::InnerIterator it(A, j); it; ++it) {
+        Index idx = it.index();  // Get the row index (for column major) or column index (for row major)
+        if (visited(idx) != j) {
+          visited(idx) = j;
+          ++TotNz;
         }
       }
-      //Get the nonzeros in row/column j of At
-      for (typename MatrixType::InnerIterator it(At, j); it; ++it)
-      {
-        Index idx = it.index(); 
-        if(visited(idx) != j)
-        {
-          visited(idx) = j; 
-          ++TotNz; 
+      // Get the nonzeros in row/column j of At
+      for (typename MatrixType::InnerIterator it(At, j); it; ++it) {
+        Index idx = it.index();
+        if (visited(idx) != j) {
+          visited(idx) = j;
+          ++TotNz;
         }
       }
     }
     // Reserve place for A + At
-    m_indexPtr.resize(m+1);
-    m_innerIndices.resize(TotNz); 
+    m_indexPtr.resize(m + 1);
+    m_innerIndices.resize(TotNz);
+
+    // Now compute the real adjacency list of each column/row
+    visited.setConstant(-1);
+    StorageIndex CurNz = 0;
+    for (StorageIndex j = 0; j < m; j++) {
+      m_indexPtr(j) = CurNz;
 
-    // Now compute the real adjacency list of each column/row 
-    visited.setConstant(-1); 
-    Index CurNz = 0; 
-    for (int j = 0; j < m; j++)
-    {
-      m_indexPtr(j) = CurNz; 
-      
-      visited(j) = j; // Do not include the diagonal element
+      visited(j) = j;  // Do not include the diagonal element
       // Add the pattern of row/column j of A to A+At
-      for (typename MatrixType::InnerIterator it(A,j); it; ++it)
-      {
-        Index idx = it.index(); // Get the row index (for column major) or column index (for row major)
-        if (visited(idx) != j ) 
-        {
-          visited(idx) = j; 
-          m_innerIndices(CurNz) = idx; 
-          CurNz++; 
+      for (typename MatrixType::InnerIterator it(A, j); it; ++it) {
+        StorageIndex idx = it.index();  // Get the row index (for column major) or column index (for row major)
+        if (visited(idx) != j) {
+          visited(idx) = j;
+          m_innerIndices(CurNz) = idx;
+          CurNz++;
         }
       }
-      //Add the pattern of row/column j of At to A+At
-      for (typename MatrixType::InnerIterator it(At, j); it; ++it)
-      {
-        Index idx = it.index(); 
-        if(visited(idx) != j)
-        {
-          visited(idx) = j; 
-          m_innerIndices(CurNz) = idx; 
-          ++CurNz; 
+      // Add the pattern of row/column j of At to A+At
+      for (typename MatrixType::InnerIterator it(At, j); it; ++it) {
+        StorageIndex idx = it.index();
+        if (visited(idx) != j) {
+          visited(idx) = j;
+          m_innerIndices(CurNz) = idx;
+          ++CurNz;
         }
       }
     }
-    m_indexPtr(m) = CurNz;    
+    m_indexPtr(m) = CurNz;
   }
-  
+
   template <typename MatrixType>
-  void operator() (const MatrixType& A, PermutationType& matperm)
-  {
-     Index m = A.cols();
-     IndexVector perm(m),iperm(m); 
-    // First, symmetrize the matrix graph. 
-     get_symmetrized_graph(A); 
-     int output_error;
-     
-     // Call the fill-reducing routine from METIS 
-     output_error = METIS_NodeND(&m, m_indexPtr.data(), m_innerIndices.data(), NULL, NULL, perm.data(), iperm.data());
-     
-    if(output_error != METIS_OK) 
-    {
-      //FIXME The ordering interface should define a class of possible errors 
-     std::cerr << "ERROR WHILE CALLING THE METIS PACKAGE \n"; 
-     return; 
+  void operator()(const MatrixType& A, PermutationType& matperm) {
+    StorageIndex m = internal::convert_index<StorageIndex>(
+        A.cols());  // must be StorageIndex, because it is passed by address to METIS
+    IndexVector perm(m), iperm(m);
+    // First, symmetrize the matrix graph.
+    get_symmetrized_graph(A);
+    int output_error;
+
+    // Call the fill-reducing routine from METIS
+    output_error = METIS_NodeND(&m, m_indexPtr.data(), m_innerIndices.data(), NULL, NULL, perm.data(), iperm.data());
+
+    if (output_error != METIS_OK) {
+      // FIXME The ordering interface should define a class of possible errors
+      std::cerr << "ERROR WHILE CALLING THE METIS PACKAGE \n";
+      return;
     }
-    
-    // Get the fill-reducing permutation 
-    //NOTE:  If Ap is the permuted matrix then perm and iperm vectors are defined as follows 
+
+    // Get the fill-reducing permutation
+    // NOTE:  If Ap is the permuted matrix then perm and iperm vectors are defined as follows
     // Row (column) i of Ap is the perm(i) row(column) of A, and row (column) i of A is the iperm(i) row(column) of Ap
-    
-     matperm.resize(m);
-     for (int j = 0; j < m; j++)
-       matperm.indices()(iperm(j)) = j;
-   
+
+    matperm.resize(m);
+    for (int j = 0; j < m; j++) matperm.indices()(iperm(j)) = j;
   }
-  
-  protected:
-    IndexVector m_indexPtr; // Pointer to the adjacenccy list of each row/column
-    IndexVector m_innerIndices; // Adjacency list 
+
+ protected:
+  IndexVector m_indexPtr;      // Pointer to the adjacenccy list of each row/column
+  IndexVector m_innerIndices;  // Adjacency list
 };
 
-}// end namespace eigen 
+}  // namespace Eigen
 #endif
diff --git a/inst/include/Eigen/src/OrderingMethods/Amd.h b/inst/include/Eigen/src/OrderingMethods/Amd.h
index 70550b8a..0b0bf02e 100644
--- a/inst/include/Eigen/src/OrderingMethods/Amd.h
+++ b/inst/include/Eigen/src/OrderingMethods/Amd.h
@@ -2,443 +2,412 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 /*
-
 NOTE: this routine has been adapted from the CSparse library:
 
 Copyright (c) 2006, Timothy A. Davis.
-http://www.cise.ufl.edu/research/sparse/CSparse
-
-CSparse is free software; you can redistribute it and/or
-modify it under the terms of the GNU Lesser General Public
-License as published by the Free Software Foundation; either
-version 2.1 of the License, or (at your option) any later version.
-
-CSparse is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-Lesser General Public License for more details.
-
-You should have received a copy of the GNU Lesser General Public
-License along with this Module; if not, write to the Free Software
-Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+http://www.suitesparse.com
 
+The author of CSparse, Timothy A. Davis., has executed a license with Google LLC
+to permit distribution of this code and derivative works as part of Eigen under
+the Mozilla Public License v. 2.0, as stated at the top of this file.
 */
 
-#include "../Core/util/NonMPL2.h"
-
 #ifndef EIGEN_SPARSE_AMD_H
 #define EIGEN_SPARSE_AMD_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
-  
-template<typename T> inline T amd_flip(const T& i) { return -i-2; }
-template<typename T> inline T amd_unflip(const T& i) { return i<0 ? amd_flip(i) : i; }
-template<typename T0, typename T1> inline bool amd_marked(const T0* w, const T1& j) { return w[j]<0; }
-template<typename T0, typename T1> inline void amd_mark(const T0* w, const T1& j) { return w[j] = amd_flip(w[j]); }
+
+template <typename T>
+inline T amd_flip(const T& i) {
+  return -i - 2;
+}
+template <typename T>
+inline T amd_unflip(const T& i) {
+  return i < 0 ? amd_flip(i) : i;
+}
+template <typename T0, typename T1>
+inline bool amd_marked(const T0* w, const T1& j) {
+  return w[j] < 0;
+}
+template <typename T0, typename T1>
+inline void amd_mark(const T0* w, const T1& j) {
+  return w[j] = amd_flip(w[j]);
+}
 
 /* clear w */
-template<typename Index>
-static int cs_wclear (Index mark, Index lemax, Index *w, Index n)
-{
-  Index k;
-  if(mark < 2 || (mark + lemax < 0))
-  {
-    for(k = 0; k < n; k++)
-      if(w[k] != 0)
-        w[k] = 1;
+template <typename StorageIndex>
+static StorageIndex cs_wclear(StorageIndex mark, StorageIndex lemax, StorageIndex* w, StorageIndex n) {
+  StorageIndex k;
+  if (mark < 2 || (mark + lemax < 0)) {
+    for (k = 0; k < n; k++)
+      if (w[k] != 0) w[k] = 1;
     mark = 2;
   }
-  return (mark);     /* at this point, w[0..n-1] < mark holds */
+  return (mark); /* at this point, w[0..n-1] < mark holds */
 }
 
 /* depth-first search and postorder of a tree rooted at node j */
-template<typename Index>
-Index cs_tdfs(Index j, Index k, Index *head, const Index *next, Index *post, Index *stack)
-{
-  int i, p, top = 0;
-  if(!head || !next || !post || !stack) return (-1);    /* check inputs */
-  stack[0] = j;                 /* place j on the stack */
-  while (top >= 0)                /* while (stack is not empty) */
+template <typename StorageIndex>
+StorageIndex cs_tdfs(StorageIndex j, StorageIndex k, StorageIndex* head, const StorageIndex* next, StorageIndex* post,
+                     StorageIndex* stack) {
+  StorageIndex i, p, top = 0;
+  if (!head || !next || !post || !stack) return (-1); /* check inputs */
+  stack[0] = j;                                       /* place j on the stack */
+  while (top >= 0)                                    /* while (stack is not empty) */
   {
-    p = stack[top];           /* p = top of stack */
-    i = head[p];              /* i = youngest child of p */
-    if(i == -1)
-    {
-      top--;                 /* p has no unordered children left */
-      post[k++] = p;        /* node p is the kth postordered node */
-    }
-    else
-    {
-      head[p] = next[i];   /* remove i from children of p */
-      stack[++top] = i;     /* start dfs on child node i */
+    p = stack[top]; /* p = top of stack */
+    i = head[p];    /* i = youngest child of p */
+    if (i == -1) {
+      top--;         /* p has no unordered children left */
+      post[k++] = p; /* node p is the kth postordered node */
+    } else {
+      head[p] = next[i]; /* remove i from children of p */
+      stack[++top] = i;  /* start dfs on child node i */
     }
   }
   return k;
 }
 
-
 /** \internal
-  * \ingroup OrderingMethods_Module 
-  * Approximate minimum degree ordering algorithm.
-  * \returns the permutation P reducing the fill-in of the input matrix \a C
-  * The input matrix \a C must be a selfadjoint compressed column major SparseMatrix object. Both the upper and lower parts have to be stored, but the diagonal entries are optional.
-  * On exit the values of C are destroyed */
-template<typename Scalar, typename Index>
-void minimum_degree_ordering(SparseMatrix<Scalar,ColMajor,Index>& C, PermutationMatrix<Dynamic,Dynamic,Index>& perm)
-{
+ * \ingroup OrderingMethods_Module
+ * Approximate minimum degree ordering algorithm.
+ *
+ * \param[in] C the input selfadjoint matrix stored in compressed column major format.
+ * \param[out] perm the permutation P reducing the fill-in of the input matrix \a C
+ *
+ * Note that the input matrix \a C must be complete, that is both the upper and lower parts have to be stored, as well
+ * as the diagonal entries. On exit the values of C are destroyed */
+template <typename Scalar, typename StorageIndex>
+void minimum_degree_ordering(SparseMatrix<Scalar, ColMajor, StorageIndex>& C,
+                             PermutationMatrix<Dynamic, Dynamic, StorageIndex>& perm) {
   using std::sqrt;
-  
-  int d, dk, dext, lemax = 0, e, elenk, eln, i, j, k, k1,
-      k2, k3, jlast, ln, dense, nzmax, mindeg = 0, nvi, nvj, nvk, mark, wnvi,
-      ok, nel = 0, p, p1, p2, p3, p4, pj, pk, pk1, pk2, pn, q, t;
-  unsigned int h;
-  
-  Index n = C.cols();
-  dense = std::max<Index> (16, Index(10 * sqrt(double(n))));   /* find dense threshold */
-  dense = std::min<Index> (n-2, dense);
-  
-  Index cnz = C.nonZeros();
-  perm.resize(n+1);
-  t = cnz + cnz/5 + 2*n;                 /* add elbow room to C */
+
+  StorageIndex d, dk, dext, lemax = 0, e, elenk, eln, i, j, k, k1, k2, k3, jlast, ln, dense, nzmax, mindeg = 0, nvi,
+                            nvj, nvk, mark, wnvi, ok, nel = 0, p, p1, p2, p3, p4, pj, pk, pk1, pk2, pn, q, t, h;
+
+  StorageIndex n = StorageIndex(C.cols());
+  dense = std::max<StorageIndex>(16, StorageIndex(10 * sqrt(double(n)))); /* find dense threshold */
+  dense = (std::min)(n - 2, dense);
+
+  StorageIndex cnz = StorageIndex(C.nonZeros());
+  perm.resize(n + 1);
+  t = cnz + cnz / 5 + 2 * n; /* add elbow room to C */
   C.resizeNonZeros(t);
-  
-  Index* W       = new Index[8*(n+1)]; /* get workspace */
-  Index* len     = W;
-  Index* nv      = W +   (n+1);
-  Index* next    = W + 2*(n+1);
-  Index* head    = W + 3*(n+1);
-  Index* elen    = W + 4*(n+1);
-  Index* degree  = W + 5*(n+1);
-  Index* w       = W + 6*(n+1);
-  Index* hhead   = W + 7*(n+1);
-  Index* last    = perm.indices().data();                              /* use P as workspace for last */
-  
+
+  // get workspace
+  ei_declare_aligned_stack_constructed_variable(StorageIndex, W, 8 * (n + 1), 0);
+  StorageIndex* len = W;
+  StorageIndex* nv = W + (n + 1);
+  StorageIndex* next = W + 2 * (n + 1);
+  StorageIndex* head = W + 3 * (n + 1);
+  StorageIndex* elen = W + 4 * (n + 1);
+  StorageIndex* degree = W + 5 * (n + 1);
+  StorageIndex* w = W + 6 * (n + 1);
+  StorageIndex* hhead = W + 7 * (n + 1);
+  StorageIndex* last = perm.indices().data(); /* use P as workspace for last */
+
   /* --- Initialize quotient graph ---------------------------------------- */
-  Index* Cp = C.outerIndexPtr();
-  Index* Ci = C.innerIndexPtr();
-  for(k = 0; k < n; k++)
-    len[k] = Cp[k+1] - Cp[k];
+  StorageIndex* Cp = C.outerIndexPtr();
+  StorageIndex* Ci = C.innerIndexPtr();
+  for (k = 0; k < n; k++) len[k] = Cp[k + 1] - Cp[k];
   len[n] = 0;
   nzmax = t;
-  
-  for(i = 0; i <= n; i++)
-  {
-    head[i]   = -1;                     // degree list i is empty
-    last[i]   = -1;
-    next[i]   = -1;
-    hhead[i]  = -1;                     // hash list i is empty 
-    nv[i]     = 1;                      // node i is just one node
-    w[i]      = 1;                      // node i is alive
-    elen[i]   = 0;                      // Ek of node i is empty
-    degree[i] = len[i];                 // degree of node i
+
+  for (i = 0; i <= n; i++) {
+    head[i] = -1;  // degree list i is empty
+    last[i] = -1;
+    next[i] = -1;
+    hhead[i] = -1;       // hash list i is empty
+    nv[i] = 1;           // node i is just one node
+    w[i] = 1;            // node i is alive
+    elen[i] = 0;         // Ek of node i is empty
+    degree[i] = len[i];  // degree of node i
   }
-  mark = internal::cs_wclear<Index>(0, 0, w, n);         /* clear w */
-  
+  mark = internal::cs_wclear<StorageIndex>(0, 0, w, n); /* clear w */
+
   /* --- Initialize degree lists ------------------------------------------ */
-  for(i = 0; i < n; i++)
-  {
+  for (i = 0; i < n; i++) {
     bool has_diag = false;
-    for(p = Cp[i]; p<Cp[i+1]; ++p)
-      if(Ci[p]==i)
-      {
+    for (p = Cp[i]; p < Cp[i + 1]; ++p)
+      if (Ci[p] == i) {
         has_diag = true;
         break;
       }
-   
+
     d = degree[i];
-    if(d == 1 && has_diag)           /* node i is empty */
+    if (d == 1 && has_diag) /* node i is empty */
     {
-      elen[i] = -2;                 /* element i is dead */
+      elen[i] = -2; /* element i is dead */
       nel++;
-      Cp[i] = -1;                   /* i is a root of assembly tree */
+      Cp[i] = -1; /* i is a root of assembly tree */
       w[i] = 0;
-    }
-    else if(d > dense || !has_diag)  /* node i is dense or has no structural diagonal element */
+    } else if (d > dense || !has_diag) /* node i is dense or has no structural diagonal element */
     {
-      nv[i] = 0;                    /* absorb i into element n */
-      elen[i] = -1;                 /* node i is dead */
+      nv[i] = 0;    /* absorb i into element n */
+      elen[i] = -1; /* node i is dead */
       nel++;
-      Cp[i] = amd_flip (n);
+      Cp[i] = amd_flip(n);
       nv[n]++;
-    }
-    else
-    {
-      if(head[d] != -1) last[head[d]] = i;
-      next[i] = head[d];           /* put node i in degree list d */
+    } else {
+      if (head[d] != -1) last[head[d]] = i;
+      next[i] = head[d]; /* put node i in degree list d */
       head[d] = i;
     }
   }
-  
-  elen[n] = -2;                         /* n is a dead element */
-  Cp[n] = -1;                           /* n is a root of assembly tree */
-  w[n] = 0;                             /* n is a dead element */
-  
-  while (nel < n)                         /* while (selecting pivots) do */
+
+  elen[n] = -2; /* n is a dead element */
+  Cp[n] = -1;   /* n is a root of assembly tree */
+  w[n] = 0;     /* n is a dead element */
+
+  while (nel < n) /* while (selecting pivots) do */
   {
     /* --- Select node of minimum approximate degree -------------------- */
-    for(k = -1; mindeg < n && (k = head[mindeg]) == -1; mindeg++) {}
-    if(next[k] != -1) last[next[k]] = -1;
-    head[mindeg] = next[k];          /* remove k from degree list */
-    elenk = elen[k];                  /* elenk = |Ek| */
-    nvk = nv[k];                      /* # of nodes k represents */
-    nel += nvk;                        /* nv[k] nodes of A eliminated */
-    
+    for (k = -1; mindeg < n && (k = head[mindeg]) == -1; mindeg++) {
+    }
+    if (next[k] != -1) last[next[k]] = -1;
+    head[mindeg] = next[k]; /* remove k from degree list */
+    elenk = elen[k];        /* elenk = |Ek| */
+    nvk = nv[k];            /* # of nodes k represents */
+    nel += nvk;             /* nv[k] nodes of A eliminated */
+
     /* --- Garbage collection ------------------------------------------- */
-    if(elenk > 0 && cnz + mindeg >= nzmax)
-    {
-      for(j = 0; j < n; j++)
-      {
-        if((p = Cp[j]) >= 0)      /* j is a live node or element */
+    if (elenk > 0 && cnz + mindeg >= nzmax) {
+      for (j = 0; j < n; j++) {
+        if ((p = Cp[j]) >= 0) /* j is a live node or element */
         {
-          Cp[j] = Ci[p];          /* save first entry of object */
-          Ci[p] = amd_flip (j);    /* first entry is now amd_flip(j) */
+          Cp[j] = Ci[p];       /* save first entry of object */
+          Ci[p] = amd_flip(j); /* first entry is now amd_flip(j) */
         }
       }
-      for(q = 0, p = 0; p < cnz; ) /* scan all of memory */
+      for (q = 0, p = 0; p < cnz;) /* scan all of memory */
       {
-        if((j = amd_flip (Ci[p++])) >= 0)  /* found object j */
+        if ((j = amd_flip(Ci[p++])) >= 0) /* found object j */
         {
-          Ci[q] = Cp[j];       /* restore first entry of object */
-          Cp[j] = q++;          /* new pointer to object j */
-          for(k3 = 0; k3 < len[j]-1; k3++) Ci[q++] = Ci[p++];
+          Ci[q] = Cp[j]; /* restore first entry of object */
+          Cp[j] = q++;   /* new pointer to object j */
+          for (k3 = 0; k3 < len[j] - 1; k3++) Ci[q++] = Ci[p++];
         }
       }
-      cnz = q;                       /* Ci[cnz...nzmax-1] now free */
+      cnz = q; /* Ci[cnz...nzmax-1] now free */
     }
-    
+
     /* --- Construct new element ---------------------------------------- */
     dk = 0;
-    nv[k] = -nvk;                     /* flag k as in Lk */
+    nv[k] = -nvk; /* flag k as in Lk */
     p = Cp[k];
-    pk1 = (elenk == 0) ? p : cnz;      /* do in place if elen[k] == 0 */
+    pk1 = (elenk == 0) ? p : cnz; /* do in place if elen[k] == 0 */
     pk2 = pk1;
-    for(k1 = 1; k1 <= elenk + 1; k1++)
-    {
-      if(k1 > elenk)
-      {
-        e = k;                     /* search the nodes in k */
-        pj = p;                    /* list of nodes starts at Ci[pj]*/
-        ln = len[k] - elenk;      /* length of list of nodes in k */
-      }
-      else
-      {
-        e = Ci[p++];              /* search the nodes in e */
+    for (k1 = 1; k1 <= elenk + 1; k1++) {
+      if (k1 > elenk) {
+        e = k;               /* search the nodes in k */
+        pj = p;              /* list of nodes starts at Ci[pj]*/
+        ln = len[k] - elenk; /* length of list of nodes in k */
+      } else {
+        e = Ci[p++]; /* search the nodes in e */
         pj = Cp[e];
-        ln = len[e];              /* length of list of nodes in e */
+        ln = len[e]; /* length of list of nodes in e */
       }
-      for(k2 = 1; k2 <= ln; k2++)
-      {
+      for (k2 = 1; k2 <= ln; k2++) {
         i = Ci[pj++];
-        if((nvi = nv[i]) <= 0) continue; /* node i dead, or seen */
-        dk += nvi;                 /* degree[Lk] += size of node i */
-        nv[i] = -nvi;             /* negate nv[i] to denote i in Lk*/
-        Ci[pk2++] = i;            /* place i in Lk */
-        if(next[i] != -1) last[next[i]] = last[i];
-        if(last[i] != -1)         /* remove i from degree list */
+        if ((nvi = nv[i]) <= 0) continue; /* node i dead, or seen */
+        dk += nvi;                        /* degree[Lk] += size of node i */
+        nv[i] = -nvi;                     /* negate nv[i] to denote i in Lk*/
+        Ci[pk2++] = i;                    /* place i in Lk */
+        if (next[i] != -1) last[next[i]] = last[i];
+        if (last[i] != -1) /* remove i from degree list */
         {
           next[last[i]] = next[i];
-        }
-        else
-        {
+        } else {
           head[degree[i]] = next[i];
         }
       }
-      if(e != k)
-      {
-        Cp[e] = amd_flip (k);      /* absorb e into k */
-        w[e] = 0;                 /* e is now a dead element */
+      if (e != k) {
+        Cp[e] = amd_flip(k); /* absorb e into k */
+        w[e] = 0;            /* e is now a dead element */
       }
     }
-    if(elenk != 0) cnz = pk2;         /* Ci[cnz...nzmax] is free */
-    degree[k] = dk;                   /* external degree of k - |Lk\i| */
-    Cp[k] = pk1;                      /* element k is in Ci[pk1..pk2-1] */
+    if (elenk != 0) cnz = pk2; /* Ci[cnz...nzmax] is free */
+    degree[k] = dk;            /* external degree of k - |Lk\i| */
+    Cp[k] = pk1;               /* element k is in Ci[pk1..pk2-1] */
     len[k] = pk2 - pk1;
-    elen[k] = -2;                     /* k is now an element */
-    
+    elen[k] = -2; /* k is now an element */
+
     /* --- Find set differences ----------------------------------------- */
-    mark = internal::cs_wclear<Index>(mark, lemax, w, n);  /* clear w if necessary */
-    for(pk = pk1; pk < pk2; pk++)    /* scan 1: find |Le\Lk| */
+    mark = internal::cs_wclear<StorageIndex>(mark, lemax, w, n); /* clear w if necessary */
+    for (pk = pk1; pk < pk2; pk++)                               /* scan 1: find |Le\Lk| */
     {
       i = Ci[pk];
-      if((eln = elen[i]) <= 0) continue;/* skip if elen[i] empty */
-      nvi = -nv[i];                      /* nv[i] was negated */
+      if ((eln = elen[i]) <= 0) continue; /* skip if elen[i] empty */
+      nvi = -nv[i];                       /* nv[i] was negated */
       wnvi = mark - nvi;
-      for(p = Cp[i]; p <= Cp[i] + eln - 1; p++)  /* scan Ei */
+      for (p = Cp[i]; p <= Cp[i] + eln - 1; p++) /* scan Ei */
       {
         e = Ci[p];
-        if(w[e] >= mark)
-        {
-          w[e] -= nvi;          /* decrement |Le\Lk| */
-        }
-        else if(w[e] != 0)        /* ensure e is a live element */
+        if (w[e] >= mark) {
+          w[e] -= nvi;        /* decrement |Le\Lk| */
+        } else if (w[e] != 0) /* ensure e is a live element */
         {
           w[e] = degree[e] + wnvi; /* 1st time e seen in scan 1 */
         }
       }
     }
-    
+
     /* --- Degree update ------------------------------------------------ */
-    for(pk = pk1; pk < pk2; pk++)    /* scan2: degree update */
+    for (pk = pk1; pk < pk2; pk++) /* scan2: degree update */
     {
-      i = Ci[pk];                   /* consider node i in Lk */
+      i = Ci[pk]; /* consider node i in Lk */
       p1 = Cp[i];
       p2 = p1 + elen[i] - 1;
       pn = p1;
-      for(h = 0, d = 0, p = p1; p <= p2; p++)    /* scan Ei */
+      for (h = 0, d = 0, p = p1; p <= p2; p++) /* scan Ei */
       {
         e = Ci[p];
-        if(w[e] != 0)             /* e is an unabsorbed element */
+        if (w[e] != 0) /* e is an unabsorbed element */
         {
-          dext = w[e] - mark;   /* dext = |Le\Lk| */
-          if(dext > 0)
-          {
-            d += dext;         /* sum up the set differences */
-            Ci[pn++] = e;     /* keep e in Ei */
-            h += e;            /* compute the hash of node i */
-          }
-          else
-          {
-            Cp[e] = amd_flip (k);  /* aggressive absorb. e->k */
-            w[e] = 0;             /* e is a dead element */
+          dext = w[e] - mark; /* dext = |Le\Lk| */
+          if (dext > 0) {
+            d += dext;    /* sum up the set differences */
+            Ci[pn++] = e; /* keep e in Ei */
+            h += e;       /* compute the hash of node i */
+          } else {
+            Cp[e] = amd_flip(k); /* aggressive absorb. e->k */
+            w[e] = 0;            /* e is a dead element */
           }
         }
       }
-      elen[i] = pn - p1 + 1;        /* elen[i] = |Ei| */
+      elen[i] = pn - p1 + 1; /* elen[i] = |Ei| */
       p3 = pn;
       p4 = p1 + len[i];
-      for(p = p2 + 1; p < p4; p++) /* prune edges in Ai */
+      for (p = p2 + 1; p < p4; p++) /* prune edges in Ai */
       {
         j = Ci[p];
-        if((nvj = nv[j]) <= 0) continue; /* node j dead or in Lk */
-        d += nvj;                  /* degree(i) += |j| */
-        Ci[pn++] = j;             /* place j in node list of i */
-        h += j;                    /* compute hash for node i */
+        if ((nvj = nv[j]) <= 0) continue; /* node j dead or in Lk */
+        d += nvj;                         /* degree(i) += |j| */
+        Ci[pn++] = j;                     /* place j in node list of i */
+        h += j;                           /* compute hash for node i */
       }
-      if(d == 0)                     /* check for mass elimination */
+      if (d == 0) /* check for mass elimination */
       {
-        Cp[i] = amd_flip (k);      /* absorb i into k */
+        Cp[i] = amd_flip(k); /* absorb i into k */
         nvi = -nv[i];
-        dk -= nvi;                 /* |Lk| -= |i| */
-        nvk += nvi;                /* |k| += nv[i] */
+        dk -= nvi;  /* |Lk| -= |i| */
+        nvk += nvi; /* |k| += nv[i] */
         nel += nvi;
         nv[i] = 0;
-        elen[i] = -1;             /* node i is dead */
-      }
-      else
-      {
-        degree[i] = std::min<Index> (degree[i], d);   /* update degree(i) */
-        Ci[pn] = Ci[p3];         /* move first node to end */
-        Ci[p3] = Ci[p1];         /* move 1st el. to end of Ei */
-        Ci[p1] = k;               /* add k as 1st element in of Ei */
-        len[i] = pn - p1 + 1;     /* new len of adj. list of node i */
-        h %= n;                    /* finalize hash of i */
-        next[i] = hhead[h];      /* place i in hash bucket */
+        elen[i] = -1; /* node i is dead */
+      } else {
+        degree[i] = std::min<StorageIndex>(degree[i], d); /* update degree(i) */
+        Ci[pn] = Ci[p3];                                  /* move first node to end */
+        Ci[p3] = Ci[p1];                                  /* move 1st el. to end of Ei */
+        Ci[p1] = k;                                       /* add k as 1st element in of Ei */
+        len[i] = pn - p1 + 1;                             /* new len of adj. list of node i */
+        h %= n;                                           /* finalize hash of i */
+        next[i] = hhead[h];                               /* place i in hash bucket */
         hhead[h] = i;
-        last[i] = h;              /* save hash of i in last[i] */
+        last[i] = h; /* save hash of i in last[i] */
       }
-    }                                   /* scan2 is done */
-    degree[k] = dk;                   /* finalize |Lk| */
-    lemax = std::max<Index>(lemax, dk);
-    mark = internal::cs_wclear<Index>(mark+lemax, lemax, w, n);    /* clear w */
-    
+    }               /* scan2 is done */
+    degree[k] = dk; /* finalize |Lk| */
+    lemax = std::max<StorageIndex>(lemax, dk);
+    mark = internal::cs_wclear<StorageIndex>(mark + lemax, lemax, w, n); /* clear w */
+
     /* --- Supernode detection ------------------------------------------ */
-    for(pk = pk1; pk < pk2; pk++)
-    {
+    for (pk = pk1; pk < pk2; pk++) {
       i = Ci[pk];
-      if(nv[i] >= 0) continue;         /* skip if i is dead */
-      h = last[i];                      /* scan hash bucket of node i */
+      if (nv[i] >= 0) continue; /* skip if i is dead */
+      h = last[i];              /* scan hash bucket of node i */
       i = hhead[h];
-      hhead[h] = -1;                    /* hash bucket will be empty */
-      for(; i != -1 && next[i] != -1; i = next[i], mark++)
-      {
+      hhead[h] = -1; /* hash bucket will be empty */
+      for (; i != -1 && next[i] != -1; i = next[i], mark++) {
         ln = len[i];
         eln = elen[i];
-        for(p = Cp[i]+1; p <= Cp[i] + ln-1; p++) w[Ci[p]] = mark;
+        for (p = Cp[i] + 1; p <= Cp[i] + ln - 1; p++) w[Ci[p]] = mark;
         jlast = i;
-        for(j = next[i]; j != -1; ) /* compare i with all j */
+        for (j = next[i]; j != -1;) /* compare i with all j */
         {
           ok = (len[j] == ln) && (elen[j] == eln);
-          for(p = Cp[j] + 1; ok && p <= Cp[j] + ln - 1; p++)
-          {
-            if(w[Ci[p]] != mark) ok = 0;    /* compare i and j*/
+          for (p = Cp[j] + 1; ok && p <= Cp[j] + ln - 1; p++) {
+            if (w[Ci[p]] != mark) ok = 0; /* compare i and j*/
           }
-          if(ok)                     /* i and j are identical */
+          if (ok) /* i and j are identical */
           {
-            Cp[j] = amd_flip (i);  /* absorb j into i */
+            Cp[j] = amd_flip(i); /* absorb j into i */
             nv[i] += nv[j];
             nv[j] = 0;
-            elen[j] = -1;         /* node j is dead */
-            j = next[j];          /* delete j from hash bucket */
+            elen[j] = -1; /* node j is dead */
+            j = next[j];  /* delete j from hash bucket */
             next[jlast] = j;
-          }
-          else
-          {
-            jlast = j;             /* j and i are different */
+          } else {
+            jlast = j; /* j and i are different */
             j = next[j];
           }
         }
       }
     }
-    
+
     /* --- Finalize new element------------------------------------------ */
-    for(p = pk1, pk = pk1; pk < pk2; pk++)   /* finalize Lk */
+    for (p = pk1, pk = pk1; pk < pk2; pk++) /* finalize Lk */
     {
       i = Ci[pk];
-      if((nvi = -nv[i]) <= 0) continue;/* skip if i is dead */
-      nv[i] = nvi;                      /* restore nv[i] */
-      d = degree[i] + dk - nvi;         /* compute external degree(i) */
-      d = std::min<Index> (d, n - nel - nvi);
-      if(head[d] != -1) last[head[d]] = i;
-      next[i] = head[d];               /* put i back in degree list */
+      if ((nvi = -nv[i]) <= 0) continue; /* skip if i is dead */
+      nv[i] = nvi;                       /* restore nv[i] */
+      d = degree[i] + dk - nvi;          /* compute external degree(i) */
+      d = std::min<StorageIndex>(d, n - nel - nvi);
+      if (head[d] != -1) last[head[d]] = i;
+      next[i] = head[d]; /* put i back in degree list */
       last[i] = -1;
       head[d] = i;
-      mindeg = std::min<Index> (mindeg, d);       /* find new minimum degree */
+      mindeg = std::min<StorageIndex>(mindeg, d); /* find new minimum degree */
       degree[i] = d;
-      Ci[p++] = i;                      /* place i in Lk */
+      Ci[p++] = i; /* place i in Lk */
     }
-    nv[k] = nvk;                      /* # nodes absorbed into k */
-    if((len[k] = p-pk1) == 0)         /* length of adj list of element k*/
+    nv[k] = nvk;                 /* # nodes absorbed into k */
+    if ((len[k] = p - pk1) == 0) /* length of adj list of element k*/
     {
-      Cp[k] = -1;                   /* k is a root of the tree */
-      w[k] = 0;                     /* k is now a dead element */
+      Cp[k] = -1; /* k is a root of the tree */
+      w[k] = 0;   /* k is now a dead element */
     }
-    if(elenk != 0) cnz = p;           /* free unused space in Lk */
+    if (elenk != 0) cnz = p; /* free unused space in Lk */
   }
-  
+
   /* --- Postordering ----------------------------------------------------- */
-  for(i = 0; i < n; i++) Cp[i] = amd_flip (Cp[i]);/* fix assembly tree */
-  for(j = 0; j <= n; j++) head[j] = -1;
-  for(j = n; j >= 0; j--)              /* place unordered nodes in lists */
+  for (i = 0; i < n; i++) Cp[i] = amd_flip(Cp[i]); /* fix assembly tree */
+  for (j = 0; j <= n; j++) head[j] = -1;
+  for (j = n; j >= 0; j--) /* place unordered nodes in lists */
   {
-    if(nv[j] > 0) continue;          /* skip if j is an element */
-    next[j] = head[Cp[j]];          /* place j in list of its parent */
+    if (nv[j] > 0) continue; /* skip if j is an element */
+    next[j] = head[Cp[j]];   /* place j in list of its parent */
     head[Cp[j]] = j;
   }
-  for(e = n; e >= 0; e--)              /* place elements in lists */
+  for (e = n; e >= 0; e--) /* place elements in lists */
   {
-    if(nv[e] <= 0) continue;         /* skip unless e is an element */
-    if(Cp[e] != -1)
-    {
-      next[e] = head[Cp[e]];      /* place e in list of its parent */
+    if (nv[e] <= 0) continue; /* skip unless e is an element */
+    if (Cp[e] != -1) {
+      next[e] = head[Cp[e]]; /* place e in list of its parent */
       head[Cp[e]] = e;
     }
   }
-  for(k = 0, i = 0; i <= n; i++)       /* postorder the assembly tree */
+  for (k = 0, i = 0; i <= n; i++) /* postorder the assembly tree */
   {
-    if(Cp[i] == -1) k = internal::cs_tdfs<Index>(i, k, head, next, perm.indices().data(), w);
+    if (Cp[i] == -1) k = internal::cs_tdfs<StorageIndex>(i, k, head, next, perm.indices().data(), w);
   }
-  
-  perm.indices().conservativeResize(n);
 
-  delete[] W;
+  perm.indices().conservativeResize(n);
 }
 
-} // namespace internal
+}  // namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSE_AMD_H
+#endif  // EIGEN_SPARSE_AMD_H
diff --git a/inst/include/Eigen/src/OrderingMethods/Eigen_Colamd.h b/inst/include/Eigen/src/OrderingMethods/Eigen_Colamd.h
index 44548f66..f1ea2ee5 100644
--- a/inst/include/Eigen/src/OrderingMethods/Eigen_Colamd.h
+++ b/inst/include/Eigen/src/OrderingMethods/Eigen_Colamd.h
@@ -13,184 +13,179 @@
 //   Davis (davis@cise.ufl.edu), University of Florida.  The algorithm was
 //   developed in collaboration with John Gilbert, Xerox PARC, and Esmond
 //   Ng, Oak Ridge National Laboratory.
-// 
+//
 //     Date:
-// 
+//
 //   September 8, 2003.  Version 2.3.
-// 
+//
 //     Acknowledgements:
-// 
+//
 //   This work was supported by the National Science Foundation, under
 //   grants DMS-9504974 and DMS-9803599.
-// 
+//
 //     Notice:
-// 
+//
 //   Copyright (c) 1998-2003 by the University of Florida.
 //   All Rights Reserved.
-// 
+//
 //   THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
 //   EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
-// 
+//
 //   Permission is hereby granted to use, copy, modify, and/or distribute
 //   this program, provided that the Copyright, this License, and the
 //   Availability of the original version is retained on all copies and made
 //   accessible to the end-user of any code or package that includes COLAMD
-//   or any modified version of COLAMD. 
-// 
+//   or any modified version of COLAMD.
+//
 //     Availability:
-// 
+//
 //   The colamd/symamd library is available at
-// 
-//       http://www.cise.ufl.edu/research/sparse/colamd/
-
-//   This is the http://www.cise.ufl.edu/research/sparse/colamd/colamd.h
-//   file.  It is required by the colamd.c, colamdmex.c, and symamdmex.c
-//   files, and by any C code that calls the routines whose prototypes are
-//   listed below, or that uses the colamd/symamd definitions listed below.
-  
+//
+//       http://www.suitesparse.com
+
 #ifndef EIGEN_COLAMD_H
 #define EIGEN_COLAMD_H
 
+namespace Eigen {
 namespace internal {
+namespace Colamd {
+
 /* Ensure that debugging is turned off: */
 #ifndef COLAMD_NDEBUG
 #define COLAMD_NDEBUG
 #endif /* NDEBUG */
+
 /* ========================================================================== */
 /* === Knob and statistics definitions ====================================== */
 /* ========================================================================== */
 
 /* size of the knobs [ ] array.  Only knobs [0..1] are currently used. */
-#define COLAMD_KNOBS 20
+const int NKnobs = 20;
 
 /* number of output statistics.  Only stats [0..6] are currently used. */
-#define COLAMD_STATS 20 
+const int NStats = 20;
 
-/* knobs [0] and stats [0]: dense row knob and output statistic. */
-#define COLAMD_DENSE_ROW 0
+/* Indices into knobs and stats array. */
+enum KnobsStatsIndex {
+  /* knobs [0] and stats [0]: dense row knob and output statistic. */
+  DenseRow = 0,
 
-/* knobs [1] and stats [1]: dense column knob and output statistic. */
-#define COLAMD_DENSE_COL 1
+  /* knobs [1] and stats [1]: dense column knob and output statistic. */
+  DenseCol = 1,
 
-/* stats [2]: memory defragmentation count output statistic */
-#define COLAMD_DEFRAG_COUNT 2
+  /* stats [2]: memory defragmentation count output statistic */
+  DefragCount = 2,
 
-/* stats [3]: colamd status:  zero OK, > 0 warning or notice, < 0 error */
-#define COLAMD_STATUS 3
+  /* stats [3]: colamd status:  zero OK, > 0 warning or notice, < 0 error */
+  Status = 3,
 
-/* stats [4..6]: error info, or info on jumbled columns */ 
-#define COLAMD_INFO1 4
-#define COLAMD_INFO2 5
-#define COLAMD_INFO3 6
+  /* stats [4..6]: error info, or info on jumbled columns */
+  Info1 = 4,
+  Info2 = 5,
+  Info3 = 6
+};
 
 /* error codes returned in stats [3]: */
-#define COLAMD_OK       (0)
-#define COLAMD_OK_BUT_JUMBLED     (1)
-#define COLAMD_ERROR_A_not_present    (-1)
-#define COLAMD_ERROR_p_not_present    (-2)
-#define COLAMD_ERROR_nrow_negative    (-3)
-#define COLAMD_ERROR_ncol_negative    (-4)
-#define COLAMD_ERROR_nnz_negative   (-5)
-#define COLAMD_ERROR_p0_nonzero     (-6)
-#define COLAMD_ERROR_A_too_small    (-7)
-#define COLAMD_ERROR_col_length_negative  (-8)
-#define COLAMD_ERROR_row_index_out_of_bounds  (-9)
-#define COLAMD_ERROR_out_of_memory    (-10)
-#define COLAMD_ERROR_internal_error   (-999)
-
+enum Status {
+  Ok = 0,
+  OkButJumbled = 1,
+  ErrorANotPresent = -1,
+  ErrorPNotPresent = -2,
+  ErrorNrowNegative = -3,
+  ErrorNcolNegative = -4,
+  ErrorNnzNegative = -5,
+  ErrorP0Nonzero = -6,
+  ErrorATooSmall = -7,
+  ErrorColLengthNegative = -8,
+  ErrorRowIndexOutOfBounds = -9,
+  ErrorOutOfMemory = -10,
+  ErrorInternalError = -999
+};
 /* ========================================================================== */
 /* === Definitions ========================================================== */
 /* ========================================================================== */
 
-#define COLAMD_MAX(a,b) (((a) > (b)) ? (a) : (b))
-#define COLAMD_MIN(a,b) (((a) < (b)) ? (a) : (b))
-
-#define ONES_COMPLEMENT(r) (-(r)-1)
+template <typename IndexType>
+IndexType ones_complement(const IndexType r) {
+  return (-(r)-1);
+}
 
 /* -------------------------------------------------------------------------- */
-
-#define COLAMD_EMPTY (-1)
+const int Empty = -1;
 
 /* Row and column status */
-#define ALIVE (0)
-#define DEAD  (-1)
+enum RowColumnStatus { Alive = 0, Dead = -1 };
 
 /* Column status */
-#define DEAD_PRINCIPAL    (-1)
-#define DEAD_NON_PRINCIPAL  (-2)
-
-/* Macros for row and column status update and checking. */
-#define ROW_IS_DEAD(r)      ROW_IS_MARKED_DEAD (Row[r].shared2.mark)
-#define ROW_IS_MARKED_DEAD(row_mark)  (row_mark < ALIVE)
-#define ROW_IS_ALIVE(r)     (Row [r].shared2.mark >= ALIVE)
-#define COL_IS_DEAD(c)      (Col [c].start < ALIVE)
-#define COL_IS_ALIVE(c)     (Col [c].start >= ALIVE)
-#define COL_IS_DEAD_PRINCIPAL(c)  (Col [c].start == DEAD_PRINCIPAL)
-#define KILL_ROW(r)     { Row [r].shared2.mark = DEAD ; }
-#define KILL_PRINCIPAL_COL(c)   { Col [c].start = DEAD_PRINCIPAL ; }
-#define KILL_NON_PRINCIPAL_COL(c) { Col [c].start = DEAD_NON_PRINCIPAL ; }
+enum ColumnStatus { DeadPrincipal = -1, DeadNonPrincipal = -2 };
 
 /* ========================================================================== */
 /* === Colamd reporting mechanism =========================================== */
 /* ========================================================================== */
 
 // == Row and Column structures ==
-template <typename Index>
-struct colamd_col
-{
-  Index start ;   /* index for A of first row in this column, or DEAD */
+template <typename IndexType>
+struct ColStructure {
+  IndexType start; /* index for A of first row in this column, or Dead */
   /* if column is dead */
-  Index length ;  /* number of rows in this column */
-  union
-  {
-    Index thickness ; /* number of original columns represented by this */
+  IndexType length; /* number of rows in this column */
+  union {
+    IndexType thickness; /* number of original columns represented by this */
     /* col, if the column is alive */
-    Index parent ;  /* parent in parent tree super-column structure, if */
+    IndexType parent; /* parent in parent tree super-column structure, if */
     /* the column is dead */
-  } shared1 ;
-  union
-  {
-    Index score ; /* the score used to maintain heap, if col is alive */
-    Index order ; /* pivot ordering of this column, if col is dead */
-  } shared2 ;
-  union
-  {
-    Index headhash ;  /* head of a hash bucket, if col is at the head of */
+  } shared1;
+  union {
+    IndexType score; /* the score used to maintain heap, if col is alive */
+    IndexType order; /* pivot ordering of this column, if col is dead */
+  } shared2;
+  union {
+    IndexType headhash; /* head of a hash bucket, if col is at the head of */
     /* a degree list */
-    Index hash ;  /* hash value, if col is not in a degree list */
-    Index prev ;  /* previous column in degree list, if col is in a */
+    IndexType hash; /* hash value, if col is not in a degree list */
+    IndexType prev; /* previous column in degree list, if col is in a */
     /* degree list (but not at the head of a degree list) */
-  } shared3 ;
-  union
-  {
-    Index degree_next ; /* next column, if col is in a degree list */
-    Index hash_next ;   /* next column, if col is in a hash list */
-  } shared4 ;
-  
+  } shared3;
+  union {
+    IndexType degree_next; /* next column, if col is in a degree list */
+    IndexType hash_next;   /* next column, if col is in a hash list */
+  } shared4;
+
+  inline bool is_dead() const { return start < Alive; }
+
+  inline bool is_alive() const { return start >= Alive; }
+
+  inline bool is_dead_principal() const { return start == DeadPrincipal; }
+
+  inline void kill_principal() { start = DeadPrincipal; }
+
+  inline void kill_non_principal() { start = DeadNonPrincipal; }
 };
- 
-template <typename Index>
-struct Colamd_Row
-{
-  Index start ;   /* index for A of first col in this row */
-  Index length ;  /* number of principal columns in this row */
-  union
-  {
-    Index degree ;  /* number of principal & non-principal columns in row */
-    Index p ;   /* used as a row pointer in init_rows_cols () */
-  } shared1 ;
-  union
-  {
-    Index mark ;  /* for computing set differences and marking dead rows*/
-    Index first_column ;/* first column in row (used in garbage collection) */
-  } shared2 ;
-  
+
+template <typename IndexType>
+struct RowStructure {
+  IndexType start;  /* index for A of first col in this row */
+  IndexType length; /* number of principal columns in this row */
+  union {
+    IndexType degree; /* number of principal & non-principal columns in row */
+    IndexType p;      /* used as a row pointer in init_rows_cols () */
+  } shared1;
+  union {
+    IndexType mark;         /* for computing set differences and marking dead rows*/
+    IndexType first_column; /* first column in row (used in garbage collection) */
+  } shared2;
+
+  inline bool is_dead() const { return shared2.mark < Alive; }
+
+  inline bool is_alive() const { return shared2.mark >= Alive; }
+
+  inline void kill() { shared2.mark = Dead; }
 };
- 
+
 /* ========================================================================== */
 /* === Colamd recommended memory size ======================================= */
 /* ========================================================================== */
- 
+
 /*
   The recommended length Alen of the array A passed to colamd is given by
   the COLAMD_RECOMMENDED (nnz, n_row, n_col) macro.  It returns -1 if any
@@ -199,41 +194,50 @@ struct Colamd_Row
   required for the Col and Row arrays, respectively, which are internal to
   colamd.  An additional n_col space is the minimal amount of "elbow room",
   and nnz/5 more space is recommended for run time efficiency.
-  
+
   This macro is not needed when using symamd.
-  
-  Explicit typecast to Index added Sept. 23, 2002, COLAMD version 2.2, to avoid
+
+  Explicit typecast to IndexType added Sept. 23, 2002, COLAMD version 2.2, to avoid
   gcc -pedantic warning messages.
 */
-template <typename Index>
-inline Index colamd_c(Index n_col) 
-{ return Index( ((n_col) + 1) * sizeof (colamd_col<Index>) / sizeof (Index) ) ; }
+template <typename IndexType>
+inline IndexType colamd_c(IndexType n_col) {
+  return IndexType(((n_col) + 1) * sizeof(ColStructure<IndexType>) / sizeof(IndexType));
+}
 
-template <typename Index>
-inline Index  colamd_r(Index n_row)
-{ return Index(((n_row) + 1) * sizeof (Colamd_Row<Index>) / sizeof (Index)); }
+template <typename IndexType>
+inline IndexType colamd_r(IndexType n_row) {
+  return IndexType(((n_row) + 1) * sizeof(RowStructure<IndexType>) / sizeof(IndexType));
+}
 
 // Prototypes of non-user callable routines
-template <typename Index>
-static Index init_rows_cols (Index n_row, Index n_col, Colamd_Row<Index> Row [], colamd_col<Index> col [], Index A [], Index p [], Index stats[COLAMD_STATS] ); 
+template <typename IndexType>
+static IndexType init_rows_cols(IndexType n_row, IndexType n_col, RowStructure<IndexType> Row[],
+                                ColStructure<IndexType> col[], IndexType A[], IndexType p[], IndexType stats[NStats]);
 
-template <typename Index>
-static void init_scoring (Index n_row, Index n_col, Colamd_Row<Index> Row [], colamd_col<Index> Col [], Index A [], Index head [], double knobs[COLAMD_KNOBS], Index *p_n_row2, Index *p_n_col2, Index *p_max_deg);
+template <typename IndexType>
+static void init_scoring(IndexType n_row, IndexType n_col, RowStructure<IndexType> Row[], ColStructure<IndexType> Col[],
+                         IndexType A[], IndexType head[], double knobs[NKnobs], IndexType *p_n_row2,
+                         IndexType *p_n_col2, IndexType *p_max_deg);
 
-template <typename Index>
-static Index find_ordering (Index n_row, Index n_col, Index Alen, Colamd_Row<Index> Row [], colamd_col<Index> Col [], Index A [], Index head [], Index n_col2, Index max_deg, Index pfree);
+template <typename IndexType>
+static IndexType find_ordering(IndexType n_row, IndexType n_col, IndexType Alen, RowStructure<IndexType> Row[],
+                               ColStructure<IndexType> Col[], IndexType A[], IndexType head[], IndexType n_col2,
+                               IndexType max_deg, IndexType pfree);
 
-template <typename Index>
-static void order_children (Index n_col, colamd_col<Index> Col [], Index p []);
+template <typename IndexType>
+static void order_children(IndexType n_col, ColStructure<IndexType> Col[], IndexType p[]);
 
-template <typename Index>
-static void detect_super_cols (colamd_col<Index> Col [], Index A [], Index head [], Index row_start, Index row_length ) ;
+template <typename IndexType>
+static void detect_super_cols(ColStructure<IndexType> Col[], IndexType A[], IndexType head[], IndexType row_start,
+                              IndexType row_length);
 
-template <typename Index>
-static Index garbage_collection (Index n_row, Index n_col, Colamd_Row<Index> Row [], colamd_col<Index> Col [], Index A [], Index *pfree) ;
+template <typename IndexType>
+static IndexType garbage_collection(IndexType n_row, IndexType n_col, RowStructure<IndexType> Row[],
+                                    ColStructure<IndexType> Col[], IndexType A[], IndexType *pfree);
 
-template <typename Index>
-static inline  Index clear_mark (Index n_row, Colamd_Row<Index> Row [] ) ;
+template <typename IndexType>
+static inline IndexType clear_mark(IndexType n_row, RowStructure<IndexType> Row[]);
 
 /* === No debugging ========================================================= */
 
@@ -243,41 +247,39 @@ static inline  Index clear_mark (Index n_row, Colamd_Row<Index> Row [] ) ;
 #define COLAMD_DEBUG3(params) ;
 #define COLAMD_DEBUG4(params) ;
 
-#define COLAMD_ASSERT(expression) ((void) 0)
-
+#define COLAMD_ASSERT(expression) ((void)0)
 
 /**
- * \brief Returns the recommended value of Alen 
- * 
- * Returns recommended value of Alen for use by colamd.  
- * Returns -1 if any input argument is negative.  
- * The use of this routine or macro is optional.  
- * Note that the macro uses its arguments   more than once, 
- * so be careful for side effects, if you pass expressions as arguments to COLAMD_RECOMMENDED.  
- * 
+ * \brief Returns the recommended value of Alen
+ *
+ * Returns recommended value of Alen for use by colamd.
+ * Returns -1 if any input argument is negative.
+ * The use of this routine or macro is optional.
+ * Note that the macro uses its arguments   more than once,
+ * so be careful for side effects, if you pass expressions as arguments to COLAMD_RECOMMENDED.
+ *
  * \param nnz nonzeros in A
  * \param n_row number of rows in A
  * \param n_col number of columns in A
  * \return recommended value of Alen for use by colamd
  */
-template <typename Index>
-inline Index colamd_recommended ( Index nnz, Index n_row, Index n_col)
-{
+template <typename IndexType>
+inline IndexType recommended(IndexType nnz, IndexType n_row, IndexType n_col) {
   if ((nnz) < 0 || (n_row) < 0 || (n_col) < 0)
     return (-1);
   else
-    return (2 * (nnz) + colamd_c (n_col) + colamd_r (n_row) + (n_col) + ((nnz) / 5)); 
+    return (2 * (nnz) + colamd_c(n_col) + colamd_r(n_row) + (n_col) + ((nnz) / 5));
 }
 
 /**
  * \brief set default parameters  The use of this routine is optional.
- * 
- * Colamd: rows with more than (knobs [COLAMD_DENSE_ROW] * n_col)
+ *
+ * Colamd: rows with more than (knobs [DenseRow] * n_col)
  * entries are removed prior to ordering.  Columns with more than
- * (knobs [COLAMD_DENSE_COL] * n_row) entries are removed prior to
- * ordering, and placed last in the output column ordering. 
+ * (knobs [DenseCol] * n_row) entries are removed prior to
+ * ordering, and placed last in the output column ordering.
  *
- * COLAMD_DENSE_ROW and COLAMD_DENSE_COL are defined as 0 and 1,
+ * DenseRow and DenseCol are defined as 0 and 1,
  * respectively, in colamd.h.  Default values of these two knobs
  * are both 0.5.  Currently, only knobs [0] and knobs [1] are
  * used, but future versions may use more knobs.  If so, they will
@@ -286,184 +288,172 @@ inline Index colamd_recommended ( Index nnz, Index n_row, Index n_col)
  * not need to change, assuming that you either use
  * colamd_set_defaults, or pass a (double *) NULL pointer as the
  * knobs array to colamd or symamd.
- * 
+ *
  * \param knobs parameter settings for colamd
  */
 
-static inline void colamd_set_defaults(double knobs[COLAMD_KNOBS])
-{
+static inline void set_defaults(double knobs[NKnobs]) {
   /* === Local variables ================================================== */
-  
-  int i ;
 
-  if (!knobs)
-  {
-    return ;      /* no knobs to initialize */
+  int i;
+
+  if (!knobs) {
+    return; /* no knobs to initialize */
   }
-  for (i = 0 ; i < COLAMD_KNOBS ; i++)
-  {
-    knobs [i] = 0 ;
+  for (i = 0; i < NKnobs; i++) {
+    knobs[i] = 0;
   }
-  knobs [COLAMD_DENSE_ROW] = 0.5 ;  /* ignore rows over 50% dense */
-  knobs [COLAMD_DENSE_COL] = 0.5 ;  /* ignore columns over 50% dense */
+  knobs[Colamd::DenseRow] = 0.5; /* ignore rows over 50% dense */
+  knobs[Colamd::DenseCol] = 0.5; /* ignore columns over 50% dense */
 }
 
-/** 
+/**
  * \brief  Computes a column ordering using the column approximate minimum degree ordering
- * 
+ *
  * Computes a column ordering (Q) of A such that P(AQ)=LU or
  * (AQ)'AQ=LL' have less fill-in and require fewer floating point
  * operations than factorizing the unpermuted matrix A or A'A,
  * respectively.
- * 
- * 
+ *
+ *
  * \param n_row number of rows in A
  * \param n_col number of columns in A
- * \param Alen, size of the array A
+ * \param Alen size of the array A
  * \param A row indices of the matrix, of size ALen
  * \param p column pointers of A, of size n_col+1
  * \param knobs parameter settings for colamd
  * \param stats colamd output statistics and error codes
  */
-template <typename Index>
-static bool colamd(Index n_row, Index n_col, Index Alen, Index *A, Index *p, double knobs[COLAMD_KNOBS], Index stats[COLAMD_STATS])
-{
+template <typename IndexType>
+static bool compute_ordering(IndexType n_row, IndexType n_col, IndexType Alen, IndexType *A, IndexType *p,
+                             double knobs[NKnobs], IndexType stats[NStats]) {
   /* === Local variables ================================================== */
-  
-  Index i ;     /* loop index */
-  Index nnz ;     /* nonzeros in A */
-  Index Row_size ;    /* size of Row [], in integers */
-  Index Col_size ;    /* size of Col [], in integers */
-  Index need ;      /* minimum required length of A */
-  Colamd_Row<Index> *Row ;   /* pointer into A of Row [0..n_row] array */
-  colamd_col<Index> *Col ;   /* pointer into A of Col [0..n_col] array */
-  Index n_col2 ;    /* number of non-dense, non-empty columns */
-  Index n_row2 ;    /* number of non-dense, non-empty rows */
-  Index ngarbage ;    /* number of garbage collections performed */
-  Index max_deg ;   /* maximum row degree */
-  double default_knobs [COLAMD_KNOBS] ; /* default knobs array */
-  
-  
+
+  IndexType i;                          /* loop index */
+  IndexType nnz;                        /* nonzeros in A */
+  IndexType Row_size;                   /* size of Row [], in integers */
+  IndexType Col_size;                   /* size of Col [], in integers */
+  IndexType need;                       /* minimum required length of A */
+  Colamd::RowStructure<IndexType> *Row; /* pointer into A of Row [0..n_row] array */
+  Colamd::ColStructure<IndexType> *Col; /* pointer into A of Col [0..n_col] array */
+  IndexType n_col2;                     /* number of non-dense, non-empty columns */
+  IndexType n_row2;                     /* number of non-dense, non-empty rows */
+  IndexType ngarbage;                   /* number of garbage collections performed */
+  IndexType max_deg;                    /* maximum row degree */
+  double default_knobs[NKnobs];         /* default knobs array */
+
   /* === Check the input arguments ======================================== */
-  
-  if (!stats)
-  {
-    COLAMD_DEBUG0 (("colamd: stats not present\n")) ;
-    return (false) ;
+
+  if (!stats) {
+    COLAMD_DEBUG0(("colamd: stats not present\n"));
+    return (false);
   }
-  for (i = 0 ; i < COLAMD_STATS ; i++)
-  {
-    stats [i] = 0 ;
+  for (i = 0; i < NStats; i++) {
+    stats[i] = 0;
   }
-  stats [COLAMD_STATUS] = COLAMD_OK ;
-  stats [COLAMD_INFO1] = -1 ;
-  stats [COLAMD_INFO2] = -1 ;
-  
-  if (!A)   /* A is not present */
+  stats[Colamd::Status] = Colamd::Ok;
+  stats[Colamd::Info1] = -1;
+  stats[Colamd::Info2] = -1;
+
+  if (!A) /* A is not present */
   {
-    stats [COLAMD_STATUS] = COLAMD_ERROR_A_not_present ;
-    COLAMD_DEBUG0 (("colamd: A not present\n")) ;
-    return (false) ;
+    stats[Colamd::Status] = Colamd::ErrorANotPresent;
+    COLAMD_DEBUG0(("colamd: A not present\n"));
+    return (false);
   }
-  
-  if (!p)   /* p is not present */
+
+  if (!p) /* p is not present */
   {
-    stats [COLAMD_STATUS] = COLAMD_ERROR_p_not_present ;
-    COLAMD_DEBUG0 (("colamd: p not present\n")) ;
-    return (false) ;
+    stats[Colamd::Status] = Colamd::ErrorPNotPresent;
+    COLAMD_DEBUG0(("colamd: p not present\n"));
+    return (false);
   }
-  
-  if (n_row < 0)  /* n_row must be >= 0 */
+
+  if (n_row < 0) /* n_row must be >= 0 */
   {
-    stats [COLAMD_STATUS] = COLAMD_ERROR_nrow_negative ;
-    stats [COLAMD_INFO1] = n_row ;
-    COLAMD_DEBUG0 (("colamd: nrow negative %d\n", n_row)) ;
-    return (false) ;
+    stats[Colamd::Status] = Colamd::ErrorNrowNegative;
+    stats[Colamd::Info1] = n_row;
+    COLAMD_DEBUG0(("colamd: nrow negative %d\n", n_row));
+    return (false);
   }
-  
-  if (n_col < 0)  /* n_col must be >= 0 */
+
+  if (n_col < 0) /* n_col must be >= 0 */
   {
-    stats [COLAMD_STATUS] = COLAMD_ERROR_ncol_negative ;
-    stats [COLAMD_INFO1] = n_col ;
-    COLAMD_DEBUG0 (("colamd: ncol negative %d\n", n_col)) ;
-    return (false) ;
+    stats[Colamd::Status] = Colamd::ErrorNcolNegative;
+    stats[Colamd::Info1] = n_col;
+    COLAMD_DEBUG0(("colamd: ncol negative %d\n", n_col));
+    return (false);
   }
-  
-  nnz = p [n_col] ;
-  if (nnz < 0)  /* nnz must be >= 0 */
+
+  nnz = p[n_col];
+  if (nnz < 0) /* nnz must be >= 0 */
   {
-    stats [COLAMD_STATUS] = COLAMD_ERROR_nnz_negative ;
-    stats [COLAMD_INFO1] = nnz ;
-    COLAMD_DEBUG0 (("colamd: number of entries negative %d\n", nnz)) ;
-    return (false) ;
+    stats[Colamd::Status] = Colamd::ErrorNnzNegative;
+    stats[Colamd::Info1] = nnz;
+    COLAMD_DEBUG0(("colamd: number of entries negative %d\n", nnz));
+    return (false);
   }
-  
-  if (p [0] != 0)
-  {
-    stats [COLAMD_STATUS] = COLAMD_ERROR_p0_nonzero ;
-    stats [COLAMD_INFO1] = p [0] ;
-    COLAMD_DEBUG0 (("colamd: p[0] not zero %d\n", p [0])) ;
-    return (false) ;
+
+  if (p[0] != 0) {
+    stats[Colamd::Status] = Colamd::ErrorP0Nonzero;
+    stats[Colamd::Info1] = p[0];
+    COLAMD_DEBUG0(("colamd: p[0] not zero %d\n", p[0]));
+    return (false);
   }
-  
+
   /* === If no knobs, set default knobs =================================== */
-  
-  if (!knobs)
-  {
-    colamd_set_defaults (default_knobs) ;
-    knobs = default_knobs ;
+
+  if (!knobs) {
+    set_defaults(default_knobs);
+    knobs = default_knobs;
   }
-  
+
   /* === Allocate the Row and Col arrays from array A ===================== */
-  
-  Col_size = colamd_c (n_col) ;
-  Row_size = colamd_r (n_row) ;
-  need = 2*nnz + n_col + Col_size + Row_size ;
-  
-  if (need > Alen)
-  {
+
+  Col_size = colamd_c(n_col);
+  Row_size = colamd_r(n_row);
+  need = 2 * nnz + n_col + Col_size + Row_size;
+
+  if (need > Alen) {
     /* not enough space in array A to perform the ordering */
-    stats [COLAMD_STATUS] = COLAMD_ERROR_A_too_small ;
-    stats [COLAMD_INFO1] = need ;
-    stats [COLAMD_INFO2] = Alen ;
-    COLAMD_DEBUG0 (("colamd: Need Alen >= %d, given only Alen = %d\n", need,Alen));
-    return (false) ;
+    stats[Colamd::Status] = Colamd::ErrorATooSmall;
+    stats[Colamd::Info1] = need;
+    stats[Colamd::Info2] = Alen;
+    COLAMD_DEBUG0(("colamd: Need Alen >= %d, given only Alen = %d\n", need, Alen));
+    return (false);
   }
-  
-  Alen -= Col_size + Row_size ;
-  Col = (colamd_col<Index> *) &A [Alen] ;
-  Row = (Colamd_Row<Index> *) &A [Alen + Col_size] ;
+
+  Alen -= Col_size + Row_size;
+  Col = (ColStructure<IndexType> *)&A[Alen];
+  Row = (RowStructure<IndexType> *)&A[Alen + Col_size];
 
   /* === Construct the row and column data structures ===================== */
-  
-  if (!Eigen::internal::init_rows_cols (n_row, n_col, Row, Col, A, p, stats))
-  {
+
+  if (!Colamd::init_rows_cols(n_row, n_col, Row, Col, A, p, stats)) {
     /* input matrix is invalid */
-    COLAMD_DEBUG0 (("colamd: Matrix invalid\n")) ;
-    return (false) ;
+    COLAMD_DEBUG0(("colamd: Matrix invalid\n"));
+    return (false);
   }
-  
+
   /* === Initialize scores, kill dense rows/columns ======================= */
 
-  Eigen::internal::init_scoring (n_row, n_col, Row, Col, A, p, knobs,
-		&n_row2, &n_col2, &max_deg) ;
-  
+  Colamd::init_scoring(n_row, n_col, Row, Col, A, p, knobs, &n_row2, &n_col2, &max_deg);
+
   /* === Order the supercolumns =========================================== */
-  
-  ngarbage = Eigen::internal::find_ordering (n_row, n_col, Alen, Row, Col, A, p,
-			    n_col2, max_deg, 2*nnz) ;
-  
+
+  ngarbage = Colamd::find_ordering(n_row, n_col, Alen, Row, Col, A, p, n_col2, max_deg, 2 * nnz);
+
   /* === Order the non-principal columns ================================== */
-  
-  Eigen::internal::order_children (n_col, Col, p) ;
-  
+
+  Colamd::order_children(n_col, Col, p);
+
   /* === Return statistics in stats ======================================= */
-  
-  stats [COLAMD_DENSE_ROW] = n_row - n_row2 ;
-  stats [COLAMD_DENSE_COL] = n_col - n_col2 ;
-  stats [COLAMD_DEFRAG_COUNT] = ngarbage ;
-  COLAMD_DEBUG0 (("colamd: done.\n")) ; 
-  return (true) ;
+
+  stats[Colamd::DenseRow] = n_row - n_row2;
+  stats[Colamd::DenseCol] = n_col - n_col2;
+  stats[Colamd::DefragCount] = ngarbage;
+  COLAMD_DEBUG0(("colamd: done.\n"));
+  return (true);
 }
 
 /* ========================================================================== */
@@ -472,7 +462,6 @@ static bool colamd(Index n_row, Index n_col, Index Alen, Index *A, Index *p, dou
 
 /* There are no user-callable routines beyond this point in the file */
 
-
 /* ========================================================================== */
 /* === init_rows_cols ======================================================= */
 /* ========================================================================== */
@@ -485,113 +474,103 @@ static bool colamd(Index n_row, Index n_col, Index Alen, Index *A, Index *p, dou
   column form of the matrix.  Returns false if the matrix is invalid,
   true otherwise.  Not user-callable.
 */
-template <typename Index>
-static Index init_rows_cols  /* returns true if OK, or false otherwise */
-  (
-    /* === Parameters ======================================================= */
-
-    Index n_row,      /* number of rows of A */
-    Index n_col,      /* number of columns of A */
-    Colamd_Row<Index> Row [],    /* of size n_row+1 */
-    colamd_col<Index> Col [],    /* of size n_col+1 */
-    Index A [],     /* row indices of A, of size Alen */
-    Index p [],     /* pointers to columns in A, of size n_col+1 */
-    Index stats [COLAMD_STATS]  /* colamd statistics */ 
-    )
-{
+template <typename IndexType>
+static IndexType init_rows_cols /* returns true if OK, or false otherwise */
+    (
+        /* === Parameters ======================================================= */
+
+        IndexType n_row,               /* number of rows of A */
+        IndexType n_col,               /* number of columns of A */
+        RowStructure<IndexType> Row[], /* of size n_row+1 */
+        ColStructure<IndexType> Col[], /* of size n_col+1 */
+        IndexType A[],                 /* row indices of A, of size Alen */
+        IndexType p[],                 /* pointers to columns in A, of size n_col+1 */
+        IndexType stats[NStats]        /* colamd statistics */
+    ) {
   /* === Local variables ================================================== */
 
-  Index col ;     /* a column index */
-  Index row ;     /* a row index */
-  Index *cp ;     /* a column pointer */
-  Index *cp_end ;   /* a pointer to the end of a column */
-  Index *rp ;     /* a row pointer */
-  Index *rp_end ;   /* a pointer to the end of a row */
-  Index last_row ;    /* previous row */
+  IndexType col;      /* a column index */
+  IndexType row;      /* a row index */
+  IndexType *cp;      /* a column pointer */
+  IndexType *cp_end;  /* a pointer to the end of a column */
+  IndexType *rp;      /* a row pointer */
+  IndexType *rp_end;  /* a pointer to the end of a row */
+  IndexType last_row; /* previous row */
 
   /* === Initialize columns, and check column pointers ==================== */
 
-  for (col = 0 ; col < n_col ; col++)
-  {
-    Col [col].start = p [col] ;
-    Col [col].length = p [col+1] - p [col] ;
+  for (col = 0; col < n_col; col++) {
+    Col[col].start = p[col];
+    Col[col].length = p[col + 1] - p[col];
 
-    if (Col [col].length < 0)
+    if ((Col[col].length) < 0)  // extra parentheses to work-around gcc bug 10200
     {
       /* column pointers must be non-decreasing */
-      stats [COLAMD_STATUS] = COLAMD_ERROR_col_length_negative ;
-      stats [COLAMD_INFO1] = col ;
-      stats [COLAMD_INFO2] = Col [col].length ;
-      COLAMD_DEBUG0 (("colamd: col %d length %d < 0\n", col, Col [col].length)) ;
-      return (false) ;
+      stats[Colamd::Status] = Colamd::ErrorColLengthNegative;
+      stats[Colamd::Info1] = col;
+      stats[Colamd::Info2] = Col[col].length;
+      COLAMD_DEBUG0(("colamd: col %d length %d < 0\n", col, Col[col].length));
+      return (false);
     }
 
-    Col [col].shared1.thickness = 1 ;
-    Col [col].shared2.score = 0 ;
-    Col [col].shared3.prev = COLAMD_EMPTY ;
-    Col [col].shared4.degree_next = COLAMD_EMPTY ;
+    Col[col].shared1.thickness = 1;
+    Col[col].shared2.score = 0;
+    Col[col].shared3.prev = Empty;
+    Col[col].shared4.degree_next = Empty;
   }
 
   /* p [0..n_col] no longer needed, used as "head" in subsequent routines */
 
   /* === Scan columns, compute row degrees, and check row indices ========= */
 
-  stats [COLAMD_INFO3] = 0 ;  /* number of duplicate or unsorted row indices*/
+  stats[Info3] = 0; /* number of duplicate or unsorted row indices*/
 
-  for (row = 0 ; row < n_row ; row++)
-  {
-    Row [row].length = 0 ;
-    Row [row].shared2.mark = -1 ;
+  for (row = 0; row < n_row; row++) {
+    Row[row].length = 0;
+    Row[row].shared2.mark = -1;
   }
 
-  for (col = 0 ; col < n_col ; col++)
-  {
-    last_row = -1 ;
+  for (col = 0; col < n_col; col++) {
+    last_row = -1;
 
-    cp = &A [p [col]] ;
-    cp_end = &A [p [col+1]] ;
+    cp = &A[p[col]];
+    cp_end = &A[p[col + 1]];
 
-    while (cp < cp_end)
-    {
-      row = *cp++ ;
+    while (cp < cp_end) {
+      row = *cp++;
 
       /* make sure row indices within range */
-      if (row < 0 || row >= n_row)
-      {
-	stats [COLAMD_STATUS] = COLAMD_ERROR_row_index_out_of_bounds ;
-	stats [COLAMD_INFO1] = col ;
-	stats [COLAMD_INFO2] = row ;
-	stats [COLAMD_INFO3] = n_row ;
-	COLAMD_DEBUG0 (("colamd: row %d col %d out of bounds\n", row, col)) ;
-	return (false) ;
+      if (row < 0 || row >= n_row) {
+        stats[Colamd::Status] = Colamd::ErrorRowIndexOutOfBounds;
+        stats[Colamd::Info1] = col;
+        stats[Colamd::Info2] = row;
+        stats[Colamd::Info3] = n_row;
+        COLAMD_DEBUG0(("colamd: row %d col %d out of bounds\n", row, col));
+        return (false);
       }
 
-      if (row <= last_row || Row [row].shared2.mark == col)
-      {
-	/* row index are unsorted or repeated (or both), thus col */
-	/* is jumbled.  This is a notice, not an error condition. */
-	stats [COLAMD_STATUS] = COLAMD_OK_BUT_JUMBLED ;
-	stats [COLAMD_INFO1] = col ;
-	stats [COLAMD_INFO2] = row ;
-	(stats [COLAMD_INFO3]) ++ ;
-	COLAMD_DEBUG1 (("colamd: row %d col %d unsorted/duplicate\n",row,col));
+      if (row <= last_row || Row[row].shared2.mark == col) {
+        /* row index are unsorted or repeated (or both), thus col */
+        /* is jumbled.  This is a notice, not an error condition. */
+        stats[Colamd::Status] = Colamd::OkButJumbled;
+        stats[Colamd::Info1] = col;
+        stats[Colamd::Info2] = row;
+        (stats[Colamd::Info3])++;
+        COLAMD_DEBUG1(("colamd: row %d col %d unsorted/duplicate\n", row, col));
       }
 
-      if (Row [row].shared2.mark != col)
-      {
-	Row [row].length++ ;
-      }
-      else
-      {
-	/* this is a repeated entry in the column, */
-	/* it will be removed */
-	Col [col].length-- ;
+      if (Row[row].shared2.mark != col) {
+        Row[row].length++;
+      } else {
+        /* this is a repeated entry in the column, */
+        /* it will be removed */
+        Col[col].length--;
       }
 
       /* mark the row as having been seen in this column */
-      Row [row].shared2.mark = col ;
+      Row[row].shared2.mark = col;
 
-      last_row = row ;
+      last_row = row;
     }
   }
 
@@ -599,64 +578,52 @@ static Index init_rows_cols  /* returns true if OK, or false otherwise */
 
   /* row form of the matrix starts directly after the column */
   /* form of matrix in A */
-  Row [0].start = p [n_col] ;
-  Row [0].shared1.p = Row [0].start ;
-  Row [0].shared2.mark = -1 ;
-  for (row = 1 ; row < n_row ; row++)
-  {
-    Row [row].start = Row [row-1].start + Row [row-1].length ;
-    Row [row].shared1.p = Row [row].start ;
-    Row [row].shared2.mark = -1 ;
+  Row[0].start = p[n_col];
+  Row[0].shared1.p = Row[0].start;
+  Row[0].shared2.mark = -1;
+  for (row = 1; row < n_row; row++) {
+    Row[row].start = Row[row - 1].start + Row[row - 1].length;
+    Row[row].shared1.p = Row[row].start;
+    Row[row].shared2.mark = -1;
   }
 
   /* === Create row form ================================================== */
 
-  if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED)
-  {
+  if (stats[Status] == OkButJumbled) {
     /* if cols jumbled, watch for repeated row indices */
-    for (col = 0 ; col < n_col ; col++)
-    {
-      cp = &A [p [col]] ;
-      cp_end = &A [p [col+1]] ;
-      while (cp < cp_end)
-      {
-	row = *cp++ ;
-	if (Row [row].shared2.mark != col)
-	{
-	  A [(Row [row].shared1.p)++] = col ;
-	  Row [row].shared2.mark = col ;
-	}
+    for (col = 0; col < n_col; col++) {
+      cp = &A[p[col]];
+      cp_end = &A[p[col + 1]];
+      while (cp < cp_end) {
+        row = *cp++;
+        if (Row[row].shared2.mark != col) {
+          A[(Row[row].shared1.p)++] = col;
+          Row[row].shared2.mark = col;
+        }
       }
     }
-  }
-  else
-  {
+  } else {
     /* if cols not jumbled, we don't need the mark (this is faster) */
-    for (col = 0 ; col < n_col ; col++)
-    {
-      cp = &A [p [col]] ;
-      cp_end = &A [p [col+1]] ;
-      while (cp < cp_end)
-      {
-	A [(Row [*cp++].shared1.p)++] = col ;
+    for (col = 0; col < n_col; col++) {
+      cp = &A[p[col]];
+      cp_end = &A[p[col + 1]];
+      while (cp < cp_end) {
+        A[(Row[*cp++].shared1.p)++] = col;
       }
     }
   }
 
   /* === Clear the row marks and set row degrees ========================== */
 
-  for (row = 0 ; row < n_row ; row++)
-  {
-    Row [row].shared2.mark = 0 ;
-    Row [row].shared1.degree = Row [row].length ;
+  for (row = 0; row < n_row; row++) {
+    Row[row].shared2.mark = 0;
+    Row[row].shared1.degree = Row[row].length;
   }
 
   /* === See if we need to re-create columns ============================== */
 
-  if (stats [COLAMD_STATUS] == COLAMD_OK_BUT_JUMBLED)
-  {
-    COLAMD_DEBUG0 (("colamd: reconstructing column form, matrix jumbled\n")) ;
-
+  if (stats[Status] == OkButJumbled) {
+    COLAMD_DEBUG0(("colamd: reconstructing column form, matrix jumbled\n"));
 
     /* === Compute col pointers ========================================= */
 
@@ -664,35 +631,31 @@ static Index init_rows_cols  /* returns true if OK, or false otherwise */
     /* Note, we may have a gap between the col form and the row */
     /* form if there were duplicate entries, if so, it will be */
     /* removed upon the first garbage collection */
-    Col [0].start = 0 ;
-    p [0] = Col [0].start ;
-    for (col = 1 ; col < n_col ; col++)
-    {
+    Col[0].start = 0;
+    p[0] = Col[0].start;
+    for (col = 1; col < n_col; col++) {
       /* note that the lengths here are for pruned columns, i.e. */
       /* no duplicate row indices will exist for these columns */
-      Col [col].start = Col [col-1].start + Col [col-1].length ;
-      p [col] = Col [col].start ;
+      Col[col].start = Col[col - 1].start + Col[col - 1].length;
+      p[col] = Col[col].start;
     }
 
     /* === Re-create col form =========================================== */
 
-    for (row = 0 ; row < n_row ; row++)
-    {
-      rp = &A [Row [row].start] ;
-      rp_end = rp + Row [row].length ;
-      while (rp < rp_end)
-      {
-	A [(p [*rp++])++] = row ;
+    for (row = 0; row < n_row; row++) {
+      rp = &A[Row[row].start];
+      rp_end = rp + Row[row].length;
+      while (rp < rp_end) {
+        A[(p[*rp++])++] = row;
       }
     }
   }
 
   /* === Done.  Matrix is not (or no longer) jumbled ====================== */
 
-  return (true) ;
+  return (true);
 }
 
-
 /* ========================================================================== */
 /* === init_scoring ========================================================= */
 /* ========================================================================== */
@@ -701,113 +664,100 @@ static Index init_rows_cols  /* returns true if OK, or false otherwise */
   Kills dense or empty columns and rows, calculates an initial score for
   each column, and places all columns in the degree lists.  Not user-callable.
 */
-template <typename Index>
-static void init_scoring
-  (
+template <typename IndexType>
+static void init_scoring(
     /* === Parameters ======================================================= */
 
-    Index n_row,      /* number of rows of A */
-    Index n_col,      /* number of columns of A */
-    Colamd_Row<Index> Row [],    /* of size n_row+1 */
-    colamd_col<Index> Col [],    /* of size n_col+1 */
-    Index A [],     /* column form and row form of A */
-    Index head [],    /* of size n_col+1 */
-    double knobs [COLAMD_KNOBS],/* parameters */
-    Index *p_n_row2,    /* number of non-dense, non-empty rows */
-    Index *p_n_col2,    /* number of non-dense, non-empty columns */
-    Index *p_max_deg    /* maximum row degree */
-    )
-{
+    IndexType n_row,               /* number of rows of A */
+    IndexType n_col,               /* number of columns of A */
+    RowStructure<IndexType> Row[], /* of size n_row+1 */
+    ColStructure<IndexType> Col[], /* of size n_col+1 */
+    IndexType A[],                 /* column form and row form of A */
+    IndexType head[],              /* of size n_col+1 */
+    double knobs[NKnobs],          /* parameters */
+    IndexType *p_n_row2,           /* number of non-dense, non-empty rows */
+    IndexType *p_n_col2,           /* number of non-dense, non-empty columns */
+    IndexType *p_max_deg           /* maximum row degree */
+) {
   /* === Local variables ================================================== */
 
-  Index c ;     /* a column index */
-  Index r, row ;    /* a row index */
-  Index *cp ;     /* a column pointer */
-  Index deg ;     /* degree of a row or column */
-  Index *cp_end ;   /* a pointer to the end of a column */
-  Index *new_cp ;   /* new column pointer */
-  Index col_length ;    /* length of pruned column */
-  Index score ;     /* current column score */
-  Index n_col2 ;    /* number of non-dense, non-empty columns */
-  Index n_row2 ;    /* number of non-dense, non-empty rows */
-  Index dense_row_count ; /* remove rows with more entries than this */
-  Index dense_col_count ; /* remove cols with more entries than this */
-  Index min_score ;   /* smallest column score */
-  Index max_deg ;   /* maximum row degree */
-  Index next_col ;    /* Used to add to degree list.*/
-
+  IndexType c;               /* a column index */
+  IndexType r, row;          /* a row index */
+  IndexType *cp;             /* a column pointer */
+  IndexType deg;             /* degree of a row or column */
+  IndexType *cp_end;         /* a pointer to the end of a column */
+  IndexType *new_cp;         /* new column pointer */
+  IndexType col_length;      /* length of pruned column */
+  IndexType score;           /* current column score */
+  IndexType n_col2;          /* number of non-dense, non-empty columns */
+  IndexType n_row2;          /* number of non-dense, non-empty rows */
+  IndexType dense_row_count; /* remove rows with more entries than this */
+  IndexType dense_col_count; /* remove cols with more entries than this */
+  IndexType min_score;       /* smallest column score */
+  IndexType max_deg;         /* maximum row degree */
+  IndexType next_col;        /* Used to add to degree list.*/
 
   /* === Extract knobs ==================================================== */
 
-  dense_row_count = COLAMD_MAX (0, COLAMD_MIN (knobs [COLAMD_DENSE_ROW] * n_col, n_col)) ;
-  dense_col_count = COLAMD_MAX (0, COLAMD_MIN (knobs [COLAMD_DENSE_COL] * n_row, n_row)) ;
-  COLAMD_DEBUG1 (("colamd: densecount: %d %d\n", dense_row_count, dense_col_count)) ;
-  max_deg = 0 ;
-  n_col2 = n_col ;
-  n_row2 = n_row ;
+  dense_row_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs[Colamd::DenseRow] * n_col), n_col));
+  dense_col_count = numext::maxi(IndexType(0), numext::mini(IndexType(knobs[Colamd::DenseCol] * n_row), n_row));
+  COLAMD_DEBUG1(("colamd: densecount: %d %d\n", dense_row_count, dense_col_count));
+  max_deg = 0;
+  n_col2 = n_col;
+  n_row2 = n_row;
 
   /* === Kill empty columns =============================================== */
 
   /* Put the empty columns at the end in their natural order, so that LU */
   /* factorization can proceed as far as possible. */
-  for (c = n_col-1 ; c >= 0 ; c--)
-  {
-    deg = Col [c].length ;
-    if (deg == 0)
-    {
+  for (c = n_col - 1; c >= 0; c--) {
+    deg = Col[c].length;
+    if (deg == 0) {
       /* this is a empty column, kill and order it last */
-      Col [c].shared2.order = --n_col2 ;
-      KILL_PRINCIPAL_COL (c) ;
+      Col[c].shared2.order = --n_col2;
+      Col[c].kill_principal();
     }
   }
-  COLAMD_DEBUG1 (("colamd: null columns killed: %d\n", n_col - n_col2)) ;
+  COLAMD_DEBUG1(("colamd: null columns killed: %d\n", n_col - n_col2));
 
   /* === Kill dense columns =============================================== */
 
   /* Put the dense columns at the end, in their natural order */
-  for (c = n_col-1 ; c >= 0 ; c--)
-  {
+  for (c = n_col - 1; c >= 0; c--) {
     /* skip any dead columns */
-    if (COL_IS_DEAD (c))
-    {
-      continue ;
+    if (Col[c].is_dead()) {
+      continue;
     }
-    deg = Col [c].length ;
-    if (deg > dense_col_count)
-    {
+    deg = Col[c].length;
+    if (deg > dense_col_count) {
       /* this is a dense column, kill and order it last */
-      Col [c].shared2.order = --n_col2 ;
+      Col[c].shared2.order = --n_col2;
       /* decrement the row degrees */
-      cp = &A [Col [c].start] ;
-      cp_end = cp + Col [c].length ;
-      while (cp < cp_end)
-      {
-	Row [*cp++].shared1.degree-- ;
+      cp = &A[Col[c].start];
+      cp_end = cp + Col[c].length;
+      while (cp < cp_end) {
+        Row[*cp++].shared1.degree--;
       }
-      KILL_PRINCIPAL_COL (c) ;
+      Col[c].kill_principal();
     }
   }
-  COLAMD_DEBUG1 (("colamd: Dense and null columns killed: %d\n", n_col - n_col2)) ;
+  COLAMD_DEBUG1(("colamd: Dense and null columns killed: %d\n", n_col - n_col2));
 
   /* === Kill dense and empty rows ======================================== */
 
-  for (r = 0 ; r < n_row ; r++)
-  {
-    deg = Row [r].shared1.degree ;
-    COLAMD_ASSERT (deg >= 0 && deg <= n_col) ;
-    if (deg > dense_row_count || deg == 0)
-    {
+  for (r = 0; r < n_row; r++) {
+    deg = Row[r].shared1.degree;
+    COLAMD_ASSERT(deg >= 0 && deg <= n_col);
+    if (deg > dense_row_count || deg == 0) {
       /* kill a dense or empty row */
-      KILL_ROW (r) ;
-      --n_row2 ;
-    }
-    else
-    {
+      Row[r].kill();
+      --n_row2;
+    } else {
       /* keep track of max degree of remaining rows */
-      max_deg = COLAMD_MAX (max_deg, deg) ;
+      max_deg = numext::maxi(max_deg, deg);
     }
   }
-  COLAMD_DEBUG1 (("colamd: Dense and null rows killed: %d\n", n_row - n_row2)) ;
+  COLAMD_DEBUG1(("colamd: Dense and null rows killed: %d\n", n_row - n_row2));
 
   /* === Compute initial column scores ==================================== */
 
@@ -817,54 +767,46 @@ static void init_scoring
   /* pruned in the code below. */
 
   /* now find the initial matlab score for each column */
-  for (c = n_col-1 ; c >= 0 ; c--)
-  {
+  for (c = n_col - 1; c >= 0; c--) {
     /* skip dead column */
-    if (COL_IS_DEAD (c))
-    {
-      continue ;
+    if (Col[c].is_dead()) {
+      continue;
     }
-    score = 0 ;
-    cp = &A [Col [c].start] ;
-    new_cp = cp ;
-    cp_end = cp + Col [c].length ;
-    while (cp < cp_end)
-    {
+    score = 0;
+    cp = &A[Col[c].start];
+    new_cp = cp;
+    cp_end = cp + Col[c].length;
+    while (cp < cp_end) {
       /* get a row */
-      row = *cp++ ;
+      row = *cp++;
       /* skip if dead */
-      if (ROW_IS_DEAD (row))
-      {
-	continue ;
+      if (Row[row].is_dead()) {
+        continue;
       }
       /* compact the column */
-      *new_cp++ = row ;
+      *new_cp++ = row;
       /* add row's external degree */
-      score += Row [row].shared1.degree - 1 ;
+      score += Row[row].shared1.degree - 1;
       /* guard against integer overflow */
-      score = COLAMD_MIN (score, n_col) ;
+      score = numext::mini(score, n_col);
     }
     /* determine pruned column length */
-    col_length = (Index) (new_cp - &A [Col [c].start]) ;
-    if (col_length == 0)
-    {
+    col_length = (IndexType)(new_cp - &A[Col[c].start]);
+    if (col_length == 0) {
       /* a newly-made null column (all rows in this col are "dense" */
       /* and have already been killed) */
-      COLAMD_DEBUG2 (("Newly null killed: %d\n", c)) ;
-      Col [c].shared2.order = --n_col2 ;
-      KILL_PRINCIPAL_COL (c) ;
-    }
-    else
-    {
+      COLAMD_DEBUG2(("Newly null killed: %d\n", c));
+      Col[c].shared2.order = --n_col2;
+      Col[c].kill_principal();
+    } else {
       /* set column length and set score */
-      COLAMD_ASSERT (score >= 0) ;
-      COLAMD_ASSERT (score <= n_col) ;
-      Col [c].length = col_length ;
-      Col [c].shared2.score = score ;
+      COLAMD_ASSERT(score >= 0);
+      COLAMD_ASSERT(score <= n_col);
+      Col[c].length = col_length;
+      Col[c].shared2.score = score;
     }
   }
-  COLAMD_DEBUG1 (("colamd: Dense, null, and newly-null columns killed: %d\n",
-		  n_col-n_col2)) ;
+  COLAMD_DEBUG1(("colamd: Dense, null, and newly-null columns killed: %d\n", n_col - n_col2));
 
   /* At this point, all empty rows and columns are dead.  All live columns */
   /* are "clean" (containing no dead rows) and simplicial (no supercolumns */
@@ -873,62 +815,52 @@ static void init_scoring
 
   /* === Initialize degree lists ========================================== */
 
-
   /* clear the hash buckets */
-  for (c = 0 ; c <= n_col ; c++)
-  {
-    head [c] = COLAMD_EMPTY ;
+  for (c = 0; c <= n_col; c++) {
+    head[c] = Empty;
   }
-  min_score = n_col ;
+  min_score = n_col;
   /* place in reverse order, so low column indices are at the front */
   /* of the lists.  This is to encourage natural tie-breaking */
-  for (c = n_col-1 ; c >= 0 ; c--)
-  {
+  for (c = n_col - 1; c >= 0; c--) {
     /* only add principal columns to degree lists */
-    if (COL_IS_ALIVE (c))
-    {
-      COLAMD_DEBUG4 (("place %d score %d minscore %d ncol %d\n",
-		      c, Col [c].shared2.score, min_score, n_col)) ;
+    if (Col[c].is_alive()) {
+      COLAMD_DEBUG4(("place %d score %d minscore %d ncol %d\n", c, Col[c].shared2.score, min_score, n_col));
 
       /* === Add columns score to DList =============================== */
 
-      score = Col [c].shared2.score ;
+      score = Col[c].shared2.score;
 
-      COLAMD_ASSERT (min_score >= 0) ;
-      COLAMD_ASSERT (min_score <= n_col) ;
-      COLAMD_ASSERT (score >= 0) ;
-      COLAMD_ASSERT (score <= n_col) ;
-      COLAMD_ASSERT (head [score] >= COLAMD_EMPTY) ;
+      COLAMD_ASSERT(min_score >= 0);
+      COLAMD_ASSERT(min_score <= n_col);
+      COLAMD_ASSERT(score >= 0);
+      COLAMD_ASSERT(score <= n_col);
+      COLAMD_ASSERT(head[score] >= Empty);
 
       /* now add this column to dList at proper score location */
-      next_col = head [score] ;
-      Col [c].shared3.prev = COLAMD_EMPTY ;
-      Col [c].shared4.degree_next = next_col ;
+      next_col = head[score];
+      Col[c].shared3.prev = Empty;
+      Col[c].shared4.degree_next = next_col;
 
       /* if there already was a column with the same score, set its */
       /* previous pointer to this new column */
-      if (next_col != COLAMD_EMPTY)
-      {
-	Col [next_col].shared3.prev = c ;
+      if (next_col != Empty) {
+        Col[next_col].shared3.prev = c;
       }
-      head [score] = c ;
+      head[score] = c;
 
       /* see if this score is less than current min */
-      min_score = COLAMD_MIN (min_score, score) ;
-
-
+      min_score = numext::mini(min_score, score);
     }
   }
 
-
   /* === Return number of remaining columns, and max row degree =========== */
 
-  *p_n_col2 = n_col2 ;
-  *p_n_row2 = n_row2 ;
-  *p_max_deg = max_deg ;
+  *p_n_col2 = n_col2;
+  *p_n_row2 = n_row2;
+  *p_max_deg = max_deg;
 }
 
-
 /* ========================================================================== */
 /* === find_ordering ======================================================== */
 /* ========================================================================== */
@@ -938,199 +870,182 @@ static void init_scoring
   (no supercolumns on input).  Uses a minimum approximate column minimum
   degree ordering method.  Not user-callable.
 */
-template <typename Index>
-static Index find_ordering /* return the number of garbage collections */
-  (
-    /* === Parameters ======================================================= */
-
-    Index n_row,      /* number of rows of A */
-    Index n_col,      /* number of columns of A */
-    Index Alen,     /* size of A, 2*nnz + n_col or larger */
-    Colamd_Row<Index> Row [],    /* of size n_row+1 */
-    colamd_col<Index> Col [],    /* of size n_col+1 */
-    Index A [],     /* column form and row form of A */
-    Index head [],    /* of size n_col+1 */
-    Index n_col2,     /* Remaining columns to order */
-    Index max_deg,    /* Maximum row degree */
-    Index pfree     /* index of first free slot (2*nnz on entry) */
-    )
-{
+template <typename IndexType>
+static IndexType find_ordering /* return the number of garbage collections */
+    (
+        /* === Parameters ======================================================= */
+
+        IndexType n_row,               /* number of rows of A */
+        IndexType n_col,               /* number of columns of A */
+        IndexType Alen,                /* size of A, 2*nnz + n_col or larger */
+        RowStructure<IndexType> Row[], /* of size n_row+1 */
+        ColStructure<IndexType> Col[], /* of size n_col+1 */
+        IndexType A[],                 /* column form and row form of A */
+        IndexType head[],              /* of size n_col+1 */
+        IndexType n_col2,              /* Remaining columns to order */
+        IndexType max_deg,             /* Maximum row degree */
+        IndexType pfree                /* index of first free slot (2*nnz on entry) */
+    ) {
   /* === Local variables ================================================== */
 
-  Index k ;     /* current pivot ordering step */
-  Index pivot_col ;   /* current pivot column */
-  Index *cp ;     /* a column pointer */
-  Index *rp ;     /* a row pointer */
-  Index pivot_row ;   /* current pivot row */
-  Index *new_cp ;   /* modified column pointer */
-  Index *new_rp ;   /* modified row pointer */
-  Index pivot_row_start ; /* pointer to start of pivot row */
-  Index pivot_row_degree ;  /* number of columns in pivot row */
-  Index pivot_row_length ;  /* number of supercolumns in pivot row */
-  Index pivot_col_score ; /* score of pivot column */
-  Index needed_memory ;   /* free space needed for pivot row */
-  Index *cp_end ;   /* pointer to the end of a column */
-  Index *rp_end ;   /* pointer to the end of a row */
-  Index row ;     /* a row index */
-  Index col ;     /* a column index */
-  Index max_score ;   /* maximum possible score */
-  Index cur_score ;   /* score of current column */
-  unsigned int hash ;   /* hash value for supernode detection */
-  Index head_column ;   /* head of hash bucket */
-  Index first_col ;   /* first column in hash bucket */
-  Index tag_mark ;    /* marker value for mark array */
-  Index row_mark ;    /* Row [row].shared2.mark */
-  Index set_difference ;  /* set difference size of row with pivot row */
-  Index min_score ;   /* smallest column score */
-  Index col_thickness ;   /* "thickness" (no. of columns in a supercol) */
-  Index max_mark ;    /* maximum value of tag_mark */
-  Index pivot_col_thickness ; /* number of columns represented by pivot col */
-  Index prev_col ;    /* Used by Dlist operations. */
-  Index next_col ;    /* Used by Dlist operations. */
-  Index ngarbage ;    /* number of garbage collections performed */
-
+  IndexType k;                   /* current pivot ordering step */
+  IndexType pivot_col;           /* current pivot column */
+  IndexType *cp;                 /* a column pointer */
+  IndexType *rp;                 /* a row pointer */
+  IndexType pivot_row;           /* current pivot row */
+  IndexType *new_cp;             /* modified column pointer */
+  IndexType *new_rp;             /* modified row pointer */
+  IndexType pivot_row_start;     /* pointer to start of pivot row */
+  IndexType pivot_row_degree;    /* number of columns in pivot row */
+  IndexType pivot_row_length;    /* number of supercolumns in pivot row */
+  IndexType pivot_col_score;     /* score of pivot column */
+  IndexType needed_memory;       /* free space needed for pivot row */
+  IndexType *cp_end;             /* pointer to the end of a column */
+  IndexType *rp_end;             /* pointer to the end of a row */
+  IndexType row;                 /* a row index */
+  IndexType col;                 /* a column index */
+  IndexType max_score;           /* maximum possible score */
+  IndexType cur_score;           /* score of current column */
+  unsigned int hash;             /* hash value for supernode detection */
+  IndexType head_column;         /* head of hash bucket */
+  IndexType first_col;           /* first column in hash bucket */
+  IndexType tag_mark;            /* marker value for mark array */
+  IndexType row_mark;            /* Row [row].shared2.mark */
+  IndexType set_difference;      /* set difference size of row with pivot row */
+  IndexType min_score;           /* smallest column score */
+  IndexType col_thickness;       /* "thickness" (no. of columns in a supercol) */
+  IndexType max_mark;            /* maximum value of tag_mark */
+  IndexType pivot_col_thickness; /* number of columns represented by pivot col */
+  IndexType prev_col;            /* Used by Dlist operations. */
+  IndexType next_col;            /* Used by Dlist operations. */
+  IndexType ngarbage;            /* number of garbage collections performed */
 
   /* === Initialization and clear mark ==================================== */
 
-  max_mark = INT_MAX - n_col ;  /* INT_MAX defined in <limits.h> */
-  tag_mark = Eigen::internal::clear_mark (n_row, Row) ;
-  min_score = 0 ;
-  ngarbage = 0 ;
-  COLAMD_DEBUG1 (("colamd: Ordering, n_col2=%d\n", n_col2)) ;
+  max_mark = INT_MAX - n_col; /* INT_MAX defined in <limits.h> */
+  tag_mark = Colamd::clear_mark(n_row, Row);
+  min_score = 0;
+  ngarbage = 0;
+  COLAMD_DEBUG1(("colamd: Ordering, n_col2=%d\n", n_col2));
 
   /* === Order the columns ================================================ */
 
-  for (k = 0 ; k < n_col2 ; /* 'k' is incremented below */)
-  {
-
+  for (k = 0; k < n_col2; /* 'k' is incremented below */) {
     /* === Select pivot column, and order it ============================ */
 
     /* make sure degree list isn't empty */
-    COLAMD_ASSERT (min_score >= 0) ;
-    COLAMD_ASSERT (min_score <= n_col) ;
-    COLAMD_ASSERT (head [min_score] >= COLAMD_EMPTY) ;
+    COLAMD_ASSERT(min_score >= 0);
+    COLAMD_ASSERT(min_score <= n_col);
+    COLAMD_ASSERT(head[min_score] >= Empty);
 
     /* get pivot column from head of minimum degree list */
-    while (head [min_score] == COLAMD_EMPTY && min_score < n_col)
-    {
-      min_score++ ;
+    while (min_score < n_col && head[min_score] == Empty) {
+      min_score++;
     }
-    pivot_col = head [min_score] ;
-    COLAMD_ASSERT (pivot_col >= 0 && pivot_col <= n_col) ;
-    next_col = Col [pivot_col].shared4.degree_next ;
-    head [min_score] = next_col ;
-    if (next_col != COLAMD_EMPTY)
-    {
-      Col [next_col].shared3.prev = COLAMD_EMPTY ;
+    pivot_col = head[min_score];
+    COLAMD_ASSERT(pivot_col >= 0 && pivot_col <= n_col);
+    next_col = Col[pivot_col].shared4.degree_next;
+    head[min_score] = next_col;
+    if (next_col != Empty) {
+      Col[next_col].shared3.prev = Empty;
     }
 
-    COLAMD_ASSERT (COL_IS_ALIVE (pivot_col)) ;
-    COLAMD_DEBUG3 (("Pivot col: %d\n", pivot_col)) ;
+    COLAMD_ASSERT(Col[pivot_col].is_alive());
+    COLAMD_DEBUG3(("Pivot col: %d\n", pivot_col));
 
     /* remember score for defrag check */
-    pivot_col_score = Col [pivot_col].shared2.score ;
+    pivot_col_score = Col[pivot_col].shared2.score;
 
     /* the pivot column is the kth column in the pivot order */
-    Col [pivot_col].shared2.order = k ;
+    Col[pivot_col].shared2.order = k;
 
     /* increment order count by column thickness */
-    pivot_col_thickness = Col [pivot_col].shared1.thickness ;
-    k += pivot_col_thickness ;
-    COLAMD_ASSERT (pivot_col_thickness > 0) ;
+    pivot_col_thickness = Col[pivot_col].shared1.thickness;
+    k += pivot_col_thickness;
+    COLAMD_ASSERT(pivot_col_thickness > 0);
 
     /* === Garbage_collection, if necessary ============================= */
 
-    needed_memory = COLAMD_MIN (pivot_col_score, n_col - k) ;
-    if (pfree + needed_memory >= Alen)
-    {
-      pfree = Eigen::internal::garbage_collection (n_row, n_col, Row, Col, A, &A [pfree]) ;
-      ngarbage++ ;
+    needed_memory = numext::mini(pivot_col_score, n_col - k);
+    if (pfree + needed_memory >= Alen) {
+      pfree = Colamd::garbage_collection(n_row, n_col, Row, Col, A, &A[pfree]);
+      ngarbage++;
       /* after garbage collection we will have enough */
-      COLAMD_ASSERT (pfree + needed_memory < Alen) ;
+      COLAMD_ASSERT(pfree + needed_memory < Alen);
       /* garbage collection has wiped out the Row[].shared2.mark array */
-      tag_mark = Eigen::internal::clear_mark (n_row, Row) ;
-
+      tag_mark = Colamd::clear_mark(n_row, Row);
     }
 
     /* === Compute pivot row pattern ==================================== */
 
     /* get starting location for this new merged row */
-    pivot_row_start = pfree ;
+    pivot_row_start = pfree;
 
     /* initialize new row counts to zero */
-    pivot_row_degree = 0 ;
+    pivot_row_degree = 0;
 
     /* tag pivot column as having been visited so it isn't included */
     /* in merged pivot row */
-    Col [pivot_col].shared1.thickness = -pivot_col_thickness ;
+    Col[pivot_col].shared1.thickness = -pivot_col_thickness;
 
     /* pivot row is the union of all rows in the pivot column pattern */
-    cp = &A [Col [pivot_col].start] ;
-    cp_end = cp + Col [pivot_col].length ;
-    while (cp < cp_end)
-    {
+    cp = &A[Col[pivot_col].start];
+    cp_end = cp + Col[pivot_col].length;
+    while (cp < cp_end) {
       /* get a row */
-      row = *cp++ ;
-      COLAMD_DEBUG4 (("Pivot col pattern %d %d\n", ROW_IS_ALIVE (row), row)) ;
+      row = *cp++;
+      COLAMD_DEBUG4(("Pivot col pattern %d %d\n", Row[row].is_alive(), row));
       /* skip if row is dead */
-      if (ROW_IS_DEAD (row))
-      {
-	continue ;
+      if (Row[row].is_dead()) {
+        continue;
       }
-      rp = &A [Row [row].start] ;
-      rp_end = rp + Row [row].length ;
-      while (rp < rp_end)
-      {
-	/* get a column */
-	col = *rp++ ;
-	/* add the column, if alive and untagged */
-	col_thickness = Col [col].shared1.thickness ;
-	if (col_thickness > 0 && COL_IS_ALIVE (col))
-	{
-	  /* tag column in pivot row */
-	  Col [col].shared1.thickness = -col_thickness ;
-	  COLAMD_ASSERT (pfree < Alen) ;
-	  /* place column in pivot row */
-	  A [pfree++] = col ;
-	  pivot_row_degree += col_thickness ;
-	}
+      rp = &A[Row[row].start];
+      rp_end = rp + Row[row].length;
+      while (rp < rp_end) {
+        /* get a column */
+        col = *rp++;
+        /* add the column, if alive and untagged */
+        col_thickness = Col[col].shared1.thickness;
+        if (col_thickness > 0 && Col[col].is_alive()) {
+          /* tag column in pivot row */
+          Col[col].shared1.thickness = -col_thickness;
+          COLAMD_ASSERT(pfree < Alen);
+          /* place column in pivot row */
+          A[pfree++] = col;
+          pivot_row_degree += col_thickness;
+        }
       }
     }
 
     /* clear tag on pivot column */
-    Col [pivot_col].shared1.thickness = pivot_col_thickness ;
-    max_deg = COLAMD_MAX (max_deg, pivot_row_degree) ;
-
+    Col[pivot_col].shared1.thickness = pivot_col_thickness;
+    max_deg = numext::maxi(max_deg, pivot_row_degree);
 
     /* === Kill all rows used to construct pivot row ==================== */
 
     /* also kill pivot row, temporarily */
-    cp = &A [Col [pivot_col].start] ;
-    cp_end = cp + Col [pivot_col].length ;
-    while (cp < cp_end)
-    {
+    cp = &A[Col[pivot_col].start];
+    cp_end = cp + Col[pivot_col].length;
+    while (cp < cp_end) {
       /* may be killing an already dead row */
-      row = *cp++ ;
-      COLAMD_DEBUG3 (("Kill row in pivot col: %d\n", row)) ;
-      KILL_ROW (row) ;
+      row = *cp++;
+      COLAMD_DEBUG3(("Kill row in pivot col: %d\n", row));
+      Row[row].kill();
     }
 
     /* === Select a row index to use as the new pivot row =============== */
 
-    pivot_row_length = pfree - pivot_row_start ;
-    if (pivot_row_length > 0)
-    {
+    pivot_row_length = pfree - pivot_row_start;
+    if (pivot_row_length > 0) {
       /* pick the "pivot" row arbitrarily (first row in col) */
-      pivot_row = A [Col [pivot_col].start] ;
-      COLAMD_DEBUG3 (("Pivotal row is %d\n", pivot_row)) ;
-    }
-    else
-    {
+      pivot_row = A[Col[pivot_col].start];
+      COLAMD_DEBUG3(("Pivotal row is %d\n", pivot_row));
+    } else {
       /* there is no pivot row, since it is of zero length */
-      pivot_row = COLAMD_EMPTY ;
-      COLAMD_ASSERT (pivot_row_length == 0) ;
+      pivot_row = Empty;
+      COLAMD_ASSERT(pivot_row_length == 0);
     }
-    COLAMD_ASSERT (Col [pivot_col].length > 0 || pivot_row_length == 0) ;
+    COLAMD_ASSERT(Col[pivot_col].length > 0 || pivot_row_length == 0);
 
     /* === Approximate degree computation =============================== */
 
@@ -1153,180 +1068,159 @@ static Index find_ordering /* return the number of garbage collections */
 
     /* === Compute set differences ====================================== */
 
-    COLAMD_DEBUG3 (("** Computing set differences phase. **\n")) ;
+    COLAMD_DEBUG3(("** Computing set differences phase. **\n"));
 
     /* pivot row is currently dead - it will be revived later. */
 
-    COLAMD_DEBUG3 (("Pivot row: ")) ;
+    COLAMD_DEBUG3(("Pivot row: "));
     /* for each column in pivot row */
-    rp = &A [pivot_row_start] ;
-    rp_end = rp + pivot_row_length ;
-    while (rp < rp_end)
-    {
-      col = *rp++ ;
-      COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ;
-      COLAMD_DEBUG3 (("Col: %d\n", col)) ;
+    rp = &A[pivot_row_start];
+    rp_end = rp + pivot_row_length;
+    while (rp < rp_end) {
+      col = *rp++;
+      COLAMD_ASSERT(Col[col].is_alive() && col != pivot_col);
+      COLAMD_DEBUG3(("Col: %d\n", col));
 
       /* clear tags used to construct pivot row pattern */
-      col_thickness = -Col [col].shared1.thickness ;
-      COLAMD_ASSERT (col_thickness > 0) ;
-      Col [col].shared1.thickness = col_thickness ;
+      col_thickness = -Col[col].shared1.thickness;
+      COLAMD_ASSERT(col_thickness > 0);
+      Col[col].shared1.thickness = col_thickness;
 
       /* === Remove column from degree list =========================== */
 
-      cur_score = Col [col].shared2.score ;
-      prev_col = Col [col].shared3.prev ;
-      next_col = Col [col].shared4.degree_next ;
-      COLAMD_ASSERT (cur_score >= 0) ;
-      COLAMD_ASSERT (cur_score <= n_col) ;
-      COLAMD_ASSERT (cur_score >= COLAMD_EMPTY) ;
-      if (prev_col == COLAMD_EMPTY)
-      {
-	head [cur_score] = next_col ;
-      }
-      else
-      {
-	Col [prev_col].shared4.degree_next = next_col ;
+      cur_score = Col[col].shared2.score;
+      prev_col = Col[col].shared3.prev;
+      next_col = Col[col].shared4.degree_next;
+      COLAMD_ASSERT(cur_score >= 0);
+      COLAMD_ASSERT(cur_score <= n_col);
+      COLAMD_ASSERT(cur_score >= Empty);
+      if (prev_col == Empty) {
+        head[cur_score] = next_col;
+      } else {
+        Col[prev_col].shared4.degree_next = next_col;
       }
-      if (next_col != COLAMD_EMPTY)
-      {
-	Col [next_col].shared3.prev = prev_col ;
+      if (next_col != Empty) {
+        Col[next_col].shared3.prev = prev_col;
       }
 
       /* === Scan the column ========================================== */
 
-      cp = &A [Col [col].start] ;
-      cp_end = cp + Col [col].length ;
-      while (cp < cp_end)
-      {
-	/* get a row */
-	row = *cp++ ;
-	row_mark = Row [row].shared2.mark ;
-	/* skip if dead */
-	if (ROW_IS_MARKED_DEAD (row_mark))
-	{
-	  continue ;
-	}
-	COLAMD_ASSERT (row != pivot_row) ;
-	set_difference = row_mark - tag_mark ;
-	/* check if the row has been seen yet */
-	if (set_difference < 0)
-	{
-	  COLAMD_ASSERT (Row [row].shared1.degree <= max_deg) ;
-	  set_difference = Row [row].shared1.degree ;
-	}
-	/* subtract column thickness from this row's set difference */
-	set_difference -= col_thickness ;
-	COLAMD_ASSERT (set_difference >= 0) ;
-	/* absorb this row if the set difference becomes zero */
-	if (set_difference == 0)
-	{
-	  COLAMD_DEBUG3 (("aggressive absorption. Row: %d\n", row)) ;
-	  KILL_ROW (row) ;
-	}
-	else
-	{
-	  /* save the new mark */
-	  Row [row].shared2.mark = set_difference + tag_mark ;
-	}
+      cp = &A[Col[col].start];
+      cp_end = cp + Col[col].length;
+      while (cp < cp_end) {
+        /* get a row */
+        row = *cp++;
+        /* skip if dead */
+        if (Row[row].is_dead()) {
+          continue;
+        }
+        row_mark = Row[row].shared2.mark;
+        COLAMD_ASSERT(row != pivot_row);
+        set_difference = row_mark - tag_mark;
+        /* check if the row has been seen yet */
+        if (set_difference < 0) {
+          COLAMD_ASSERT(Row[row].shared1.degree <= max_deg);
+          set_difference = Row[row].shared1.degree;
+        }
+        /* subtract column thickness from this row's set difference */
+        set_difference -= col_thickness;
+        COLAMD_ASSERT(set_difference >= 0);
+        /* absorb this row if the set difference becomes zero */
+        if (set_difference == 0) {
+          COLAMD_DEBUG3(("aggressive absorption. Row: %d\n", row));
+          Row[row].kill();
+        } else {
+          /* save the new mark */
+          Row[row].shared2.mark = set_difference + tag_mark;
+        }
       }
     }
 
-
     /* === Add up set differences for each column ======================= */
 
-    COLAMD_DEBUG3 (("** Adding set differences phase. **\n")) ;
+    COLAMD_DEBUG3(("** Adding set differences phase. **\n"));
 
     /* for each column in pivot row */
-    rp = &A [pivot_row_start] ;
-    rp_end = rp + pivot_row_length ;
-    while (rp < rp_end)
-    {
+    rp = &A[pivot_row_start];
+    rp_end = rp + pivot_row_length;
+    while (rp < rp_end) {
       /* get a column */
-      col = *rp++ ;
-      COLAMD_ASSERT (COL_IS_ALIVE (col) && col != pivot_col) ;
-      hash = 0 ;
-      cur_score = 0 ;
-      cp = &A [Col [col].start] ;
+      col = *rp++;
+      COLAMD_ASSERT(Col[col].is_alive() && col != pivot_col);
+      hash = 0;
+      cur_score = 0;
+      cp = &A[Col[col].start];
       /* compact the column */
-      new_cp = cp ;
-      cp_end = cp + Col [col].length ;
-
-      COLAMD_DEBUG4 (("Adding set diffs for Col: %d.\n", col)) ;
-
-      while (cp < cp_end)
-      {
-	/* get a row */
-	row = *cp++ ;
-	COLAMD_ASSERT(row >= 0 && row < n_row) ;
-	row_mark = Row [row].shared2.mark ;
-	/* skip if dead */
-	if (ROW_IS_MARKED_DEAD (row_mark))
-	{
-	  continue ;
-	}
-	COLAMD_ASSERT (row_mark > tag_mark) ;
-	/* compact the column */
-	*new_cp++ = row ;
-	/* compute hash function */
-	hash += row ;
-	/* add set difference */
-	cur_score += row_mark - tag_mark ;
-	/* integer overflow... */
-	cur_score = COLAMD_MIN (cur_score, n_col) ;
+      new_cp = cp;
+      cp_end = cp + Col[col].length;
+
+      COLAMD_DEBUG4(("Adding set diffs for Col: %d.\n", col));
+
+      while (cp < cp_end) {
+        /* get a row */
+        row = *cp++;
+        COLAMD_ASSERT(row >= 0 && row < n_row);
+        /* skip if dead */
+        if (Row[row].is_dead()) {
+          continue;
+        }
+        row_mark = Row[row].shared2.mark;
+        COLAMD_ASSERT(row_mark > tag_mark);
+        /* compact the column */
+        *new_cp++ = row;
+        /* compute hash function */
+        hash += row;
+        /* add set difference */
+        cur_score += row_mark - tag_mark;
+        /* integer overflow... */
+        cur_score = numext::mini(cur_score, n_col);
       }
 
       /* recompute the column's length */
-      Col [col].length = (Index) (new_cp - &A [Col [col].start]) ;
+      Col[col].length = (IndexType)(new_cp - &A[Col[col].start]);
 
       /* === Further mass elimination ================================= */
 
-      if (Col [col].length == 0)
-      {
-	COLAMD_DEBUG4 (("further mass elimination. Col: %d\n", col)) ;
-	/* nothing left but the pivot row in this column */
-	KILL_PRINCIPAL_COL (col) ;
-	pivot_row_degree -= Col [col].shared1.thickness ;
-	COLAMD_ASSERT (pivot_row_degree >= 0) ;
-	/* order it */
-	Col [col].shared2.order = k ;
-	/* increment order count by column thickness */
-	k += Col [col].shared1.thickness ;
-      }
-      else
-      {
-	/* === Prepare for supercolumn detection ==================== */
-
-	COLAMD_DEBUG4 (("Preparing supercol detection for Col: %d.\n", col)) ;
-
-	/* save score so far */
-	Col [col].shared2.score = cur_score ;
-
-	/* add column to hash table, for supercolumn detection */
-	hash %= n_col + 1 ;
-
-	COLAMD_DEBUG4 ((" Hash = %d, n_col = %d.\n", hash, n_col)) ;
-	COLAMD_ASSERT (hash <= n_col) ;
-
-	head_column = head [hash] ;
-	if (head_column > COLAMD_EMPTY)
-	{
-	  /* degree list "hash" is non-empty, use prev (shared3) of */
-	  /* first column in degree list as head of hash bucket */
-	  first_col = Col [head_column].shared3.headhash ;
-	  Col [head_column].shared3.headhash = col ;
-	}
-	else
-	{
-	  /* degree list "hash" is empty, use head as hash bucket */
-	  first_col = - (head_column + 2) ;
-	  head [hash] = - (col + 2) ;
-	}
-	Col [col].shared4.hash_next = first_col ;
-
-	/* save hash function in Col [col].shared3.hash */
-	Col [col].shared3.hash = (Index) hash ;
-	COLAMD_ASSERT (COL_IS_ALIVE (col)) ;
+      if (Col[col].length == 0) {
+        COLAMD_DEBUG4(("further mass elimination. Col: %d\n", col));
+        /* nothing left but the pivot row in this column */
+        Col[col].kill_principal();
+        pivot_row_degree -= Col[col].shared1.thickness;
+        COLAMD_ASSERT(pivot_row_degree >= 0);
+        /* order it */
+        Col[col].shared2.order = k;
+        /* increment order count by column thickness */
+        k += Col[col].shared1.thickness;
+      } else {
+        /* === Prepare for supercolumn detection ==================== */
+
+        COLAMD_DEBUG4(("Preparing supercol detection for Col: %d.\n", col));
+
+        /* save score so far */
+        Col[col].shared2.score = cur_score;
+
+        /* add column to hash table, for supercolumn detection */
+        hash %= n_col + 1;
+
+        COLAMD_DEBUG4((" Hash = %d, n_col = %d.\n", hash, n_col));
+        COLAMD_ASSERT(hash <= n_col);
+
+        head_column = head[hash];
+        if (head_column > Empty) {
+          /* degree list "hash" is non-empty, use prev (shared3) of */
+          /* first column in degree list as head of hash bucket */
+          first_col = Col[head_column].shared3.headhash;
+          Col[head_column].shared3.headhash = col;
+        } else {
+          /* degree list "hash" is empty, use head as hash bucket */
+          first_col = -(head_column + 2);
+          head[hash] = -(col + 2);
+        }
+        Col[col].shared4.hash_next = first_col;
+
+        /* save hash function in Col [col].shared3.hash */
+        Col[col].shared3.hash = (IndexType)hash;
+        COLAMD_ASSERT(Col[col].is_alive());
       }
     }
 
@@ -1334,105 +1228,98 @@ static Index find_ordering /* return the number of garbage collections */
 
     /* === Supercolumn detection ======================================== */
 
-    COLAMD_DEBUG3 (("** Supercolumn detection phase. **\n")) ;
+    COLAMD_DEBUG3(("** Supercolumn detection phase. **\n"));
 
-    Eigen::internal::detect_super_cols (Col, A, head, pivot_row_start, pivot_row_length) ;
+    Colamd::detect_super_cols(Col, A, head, pivot_row_start, pivot_row_length);
 
     /* === Kill the pivotal column ====================================== */
 
-    KILL_PRINCIPAL_COL (pivot_col) ;
+    Col[pivot_col].kill_principal();
 
     /* === Clear mark =================================================== */
 
-    tag_mark += (max_deg + 1) ;
-    if (tag_mark >= max_mark)
-    {
-      COLAMD_DEBUG2 (("clearing tag_mark\n")) ;
-      tag_mark = Eigen::internal::clear_mark (n_row, Row) ;
+    tag_mark += (max_deg + 1);
+    if (tag_mark >= max_mark) {
+      COLAMD_DEBUG2(("clearing tag_mark\n"));
+      tag_mark = Colamd::clear_mark(n_row, Row);
     }
 
     /* === Finalize the new pivot row, and column scores ================ */
 
-    COLAMD_DEBUG3 (("** Finalize scores phase. **\n")) ;
+    COLAMD_DEBUG3(("** Finalize scores phase. **\n"));
 
     /* for each column in pivot row */
-    rp = &A [pivot_row_start] ;
+    rp = &A[pivot_row_start];
     /* compact the pivot row */
-    new_rp = rp ;
-    rp_end = rp + pivot_row_length ;
-    while (rp < rp_end)
-    {
-      col = *rp++ ;
+    new_rp = rp;
+    rp_end = rp + pivot_row_length;
+    while (rp < rp_end) {
+      col = *rp++;
       /* skip dead columns */
-      if (COL_IS_DEAD (col))
-      {
-	continue ;
+      if (Col[col].is_dead()) {
+        continue;
       }
-      *new_rp++ = col ;
+      *new_rp++ = col;
       /* add new pivot row to column */
-      A [Col [col].start + (Col [col].length++)] = pivot_row ;
+      A[Col[col].start + (Col[col].length++)] = pivot_row;
 
       /* retrieve score so far and add on pivot row's degree. */
       /* (we wait until here for this in case the pivot */
       /* row's degree was reduced due to mass elimination). */
-      cur_score = Col [col].shared2.score + pivot_row_degree ;
+      cur_score = Col[col].shared2.score + pivot_row_degree;
 
       /* calculate the max possible score as the number of */
       /* external columns minus the 'k' value minus the */
       /* columns thickness */
-      max_score = n_col - k - Col [col].shared1.thickness ;
+      max_score = n_col - k - Col[col].shared1.thickness;
 
       /* make the score the external degree of the union-of-rows */
-      cur_score -= Col [col].shared1.thickness ;
+      cur_score -= Col[col].shared1.thickness;
 
       /* make sure score is less or equal than the max score */
-      cur_score = COLAMD_MIN (cur_score, max_score) ;
-      COLAMD_ASSERT (cur_score >= 0) ;
+      cur_score = numext::mini(cur_score, max_score);
+      COLAMD_ASSERT(cur_score >= 0);
 
       /* store updated score */
-      Col [col].shared2.score = cur_score ;
+      Col[col].shared2.score = cur_score;
 
       /* === Place column back in degree list ========================= */
 
-      COLAMD_ASSERT (min_score >= 0) ;
-      COLAMD_ASSERT (min_score <= n_col) ;
-      COLAMD_ASSERT (cur_score >= 0) ;
-      COLAMD_ASSERT (cur_score <= n_col) ;
-      COLAMD_ASSERT (head [cur_score] >= COLAMD_EMPTY) ;
-      next_col = head [cur_score] ;
-      Col [col].shared4.degree_next = next_col ;
-      Col [col].shared3.prev = COLAMD_EMPTY ;
-      if (next_col != COLAMD_EMPTY)
-      {
-	Col [next_col].shared3.prev = col ;
+      COLAMD_ASSERT(min_score >= 0);
+      COLAMD_ASSERT(min_score <= n_col);
+      COLAMD_ASSERT(cur_score >= 0);
+      COLAMD_ASSERT(cur_score <= n_col);
+      COLAMD_ASSERT(head[cur_score] >= Empty);
+      next_col = head[cur_score];
+      Col[col].shared4.degree_next = next_col;
+      Col[col].shared3.prev = Empty;
+      if (next_col != Empty) {
+        Col[next_col].shared3.prev = col;
       }
-      head [cur_score] = col ;
+      head[cur_score] = col;
 
       /* see if this score is less than current min */
-      min_score = COLAMD_MIN (min_score, cur_score) ;
-
+      min_score = numext::mini(min_score, cur_score);
     }
 
     /* === Resurrect the new pivot row ================================== */
 
-    if (pivot_row_degree > 0)
-    {
+    if (pivot_row_degree > 0) {
       /* update pivot row length to reflect any cols that were killed */
       /* during super-col detection and mass elimination */
-      Row [pivot_row].start  = pivot_row_start ;
-      Row [pivot_row].length = (Index) (new_rp - &A[pivot_row_start]) ;
-      Row [pivot_row].shared1.degree = pivot_row_degree ;
-      Row [pivot_row].shared2.mark = 0 ;
+      Row[pivot_row].start = pivot_row_start;
+      Row[pivot_row].length = (IndexType)(new_rp - &A[pivot_row_start]);
+      Row[pivot_row].shared1.degree = pivot_row_degree;
+      Row[pivot_row].shared2.mark = 0;
       /* pivot row is no longer dead */
     }
   }
 
   /* === All principal columns have now been ordered ====================== */
 
-  return (ngarbage) ;
+  return (ngarbage);
 }
 
-
 /* ========================================================================== */
 /* === order_children ======================================================= */
 /* ========================================================================== */
@@ -1449,75 +1336,67 @@ static Index find_ordering /* return the number of garbage collections */
   taken by this routine is O (n_col), that is, linear in the number of
   columns.  Not user-callable.
 */
-template <typename Index>
-static inline  void order_children
-(
-  /* === Parameters ======================================================= */
-
-  Index n_col,      /* number of columns of A */
-  colamd_col<Index> Col [],    /* of size n_col+1 */
-  Index p []      /* p [0 ... n_col-1] is the column permutation*/
-  )
-{
+template <typename IndexType>
+static inline void order_children(
+    /* === Parameters ======================================================= */
+
+    IndexType n_col,               /* number of columns of A */
+    ColStructure<IndexType> Col[], /* of size n_col+1 */
+    IndexType p[]                  /* p [0 ... n_col-1] is the column permutation*/
+) {
   /* === Local variables ================================================== */
 
-  Index i ;     /* loop counter for all columns */
-  Index c ;     /* column index */
-  Index parent ;    /* index of column's parent */
-  Index order ;     /* column's order */
+  IndexType i;      /* loop counter for all columns */
+  IndexType c;      /* column index */
+  IndexType parent; /* index of column's parent */
+  IndexType order;  /* column's order */
 
   /* === Order each non-principal column ================================== */
 
-  for (i = 0 ; i < n_col ; i++)
-  {
+  for (i = 0; i < n_col; i++) {
     /* find an un-ordered non-principal column */
-    COLAMD_ASSERT (COL_IS_DEAD (i)) ;
-    if (!COL_IS_DEAD_PRINCIPAL (i) && Col [i].shared2.order == COLAMD_EMPTY)
-    {
-      parent = i ;
+    COLAMD_ASSERT(col_is_dead(Col, i));
+    if (!Col[i].is_dead_principal() && Col[i].shared2.order == Empty) {
+      parent = i;
       /* once found, find its principal parent */
-      do
-      {
-	parent = Col [parent].shared1.parent ;
-      } while (!COL_IS_DEAD_PRINCIPAL (parent)) ;
+      do {
+        parent = Col[parent].shared1.parent;
+      } while (!Col[parent].is_dead_principal());
 
       /* now, order all un-ordered non-principal columns along path */
       /* to this parent.  collapse tree at the same time */
-      c = i ;
+      c = i;
       /* get order of parent */
-      order = Col [parent].shared2.order ;
+      order = Col[parent].shared2.order;
 
-      do
-      {
-	COLAMD_ASSERT (Col [c].shared2.order == COLAMD_EMPTY) ;
+      do {
+        COLAMD_ASSERT(Col[c].shared2.order == Empty);
 
-	/* order this column */
-	Col [c].shared2.order = order++ ;
-	/* collaps tree */
-	Col [c].shared1.parent = parent ;
+        /* order this column */
+        Col[c].shared2.order = order++;
+        /* collapse tree */
+        Col[c].shared1.parent = parent;
 
-	/* get immediate parent of this column */
-	c = Col [c].shared1.parent ;
+        /* get immediate parent of this column */
+        c = Col[c].shared1.parent;
 
-	/* continue until we hit an ordered column.  There are */
-	/* guarranteed not to be anymore unordered columns */
-	/* above an ordered column */
-      } while (Col [c].shared2.order == COLAMD_EMPTY) ;
+        /* continue until we hit an ordered column.  There are */
+        /* guaranteed not to be anymore unordered columns */
+        /* above an ordered column */
+      } while (Col[c].shared2.order == Empty);
 
       /* re-order the super_col parent to largest order for this group */
-      Col [parent].shared2.order = order ;
+      Col[parent].shared2.order = order;
     }
   }
 
   /* === Generate the permutation ========================================= */
 
-  for (c = 0 ; c < n_col ; c++)
-  {
-    p [Col [c].shared2.order] = c ;
+  for (c = 0; c < n_col; c++) {
+    p[Col[c].shared2.order] = c;
   }
 }
 
-
 /* ========================================================================== */
 /* === detect_super_cols ==================================================== */
 /* ========================================================================== */
@@ -1550,270 +1429,233 @@ static inline  void order_children
   just been computed in the approximate degree computation.
   Not user-callable.
 */
-template <typename Index>
-static void detect_super_cols
-(
-  /* === Parameters ======================================================= */
-  
-  colamd_col<Index> Col [],    /* of size n_col+1 */
-  Index A [],     /* row indices of A */
-  Index head [],    /* head of degree lists and hash buckets */
-  Index row_start,    /* pointer to set of columns to check */
-  Index row_length    /* number of columns to check */
-)
-{
+template <typename IndexType>
+static void detect_super_cols(
+    /* === Parameters ======================================================= */
+
+    ColStructure<IndexType> Col[], /* of size n_col+1 */
+    IndexType A[],                 /* row indices of A */
+    IndexType head[],              /* head of degree lists and hash buckets */
+    IndexType row_start,           /* pointer to set of columns to check */
+    IndexType row_length           /* number of columns to check */
+) {
   /* === Local variables ================================================== */
 
-  Index hash ;      /* hash value for a column */
-  Index *rp ;     /* pointer to a row */
-  Index c ;     /* a column index */
-  Index super_c ;   /* column index of the column to absorb into */
-  Index *cp1 ;      /* column pointer for column super_c */
-  Index *cp2 ;      /* column pointer for column c */
-  Index length ;    /* length of column super_c */
-  Index prev_c ;    /* column preceding c in hash bucket */
-  Index i ;     /* loop counter */
-  Index *rp_end ;   /* pointer to the end of the row */
-  Index col ;     /* a column index in the row to check */
-  Index head_column ;   /* first column in hash bucket or degree list */
-  Index first_col ;   /* first column in hash bucket */
+  IndexType hash;        /* hash value for a column */
+  IndexType *rp;         /* pointer to a row */
+  IndexType c;           /* a column index */
+  IndexType super_c;     /* column index of the column to absorb into */
+  IndexType *cp1;        /* column pointer for column super_c */
+  IndexType *cp2;        /* column pointer for column c */
+  IndexType length;      /* length of column super_c */
+  IndexType prev_c;      /* column preceding c in hash bucket */
+  IndexType i;           /* loop counter */
+  IndexType *rp_end;     /* pointer to the end of the row */
+  IndexType col;         /* a column index in the row to check */
+  IndexType head_column; /* first column in hash bucket or degree list */
+  IndexType first_col;   /* first column in hash bucket */
 
   /* === Consider each column in the row ================================== */
 
-  rp = &A [row_start] ;
-  rp_end = rp + row_length ;
-  while (rp < rp_end)
-  {
-    col = *rp++ ;
-    if (COL_IS_DEAD (col))
-    {
-      continue ;
+  rp = &A[row_start];
+  rp_end = rp + row_length;
+  while (rp < rp_end) {
+    col = *rp++;
+    if (Col[col].is_dead()) {
+      continue;
     }
 
     /* get hash number for this column */
-    hash = Col [col].shared3.hash ;
-    COLAMD_ASSERT (hash <= n_col) ;
+    hash = Col[col].shared3.hash;
+    COLAMD_ASSERT(hash <= n_col);
 
     /* === Get the first column in this hash bucket ===================== */
 
-    head_column = head [hash] ;
-    if (head_column > COLAMD_EMPTY)
-    {
-      first_col = Col [head_column].shared3.headhash ;
-    }
-    else
-    {
-      first_col = - (head_column + 2) ;
+    head_column = head[hash];
+    if (head_column > Empty) {
+      first_col = Col[head_column].shared3.headhash;
+    } else {
+      first_col = -(head_column + 2);
     }
 
     /* === Consider each column in the hash bucket ====================== */
 
-    for (super_c = first_col ; super_c != COLAMD_EMPTY ;
-	 super_c = Col [super_c].shared4.hash_next)
-    {
-      COLAMD_ASSERT (COL_IS_ALIVE (super_c)) ;
-      COLAMD_ASSERT (Col [super_c].shared3.hash == hash) ;
-      length = Col [super_c].length ;
+    for (super_c = first_col; super_c != Empty; super_c = Col[super_c].shared4.hash_next) {
+      COLAMD_ASSERT(Col[super_c].is_alive());
+      COLAMD_ASSERT(Col[super_c].shared3.hash == hash);
+      length = Col[super_c].length;
 
       /* prev_c is the column preceding column c in the hash bucket */
-      prev_c = super_c ;
+      prev_c = super_c;
 
       /* === Compare super_c with all columns after it ================ */
 
-      for (c = Col [super_c].shared4.hash_next ;
-	   c != COLAMD_EMPTY ; c = Col [c].shared4.hash_next)
-      {
-	COLAMD_ASSERT (c != super_c) ;
-	COLAMD_ASSERT (COL_IS_ALIVE (c)) ;
-	COLAMD_ASSERT (Col [c].shared3.hash == hash) ;
-
-	/* not identical if lengths or scores are different */
-	if (Col [c].length != length ||
-	    Col [c].shared2.score != Col [super_c].shared2.score)
-	{
-	  prev_c = c ;
-	  continue ;
-	}
-
-	/* compare the two columns */
-	cp1 = &A [Col [super_c].start] ;
-	cp2 = &A [Col [c].start] ;
-
-	for (i = 0 ; i < length ; i++)
-	{
-	  /* the columns are "clean" (no dead rows) */
-	  COLAMD_ASSERT (ROW_IS_ALIVE (*cp1))  ;
-	  COLAMD_ASSERT (ROW_IS_ALIVE (*cp2))  ;
-	  /* row indices will same order for both supercols, */
-	  /* no gather scatter nessasary */
-	  if (*cp1++ != *cp2++)
-	  {
-	    break ;
-	  }
-	}
-
-	/* the two columns are different if the for-loop "broke" */
-	if (i != length)
-	{
-	  prev_c = c ;
-	  continue ;
-	}
-
-	/* === Got it!  two columns are identical =================== */
-
-	COLAMD_ASSERT (Col [c].shared2.score == Col [super_c].shared2.score) ;
-
-	Col [super_c].shared1.thickness += Col [c].shared1.thickness ;
-	Col [c].shared1.parent = super_c ;
-	KILL_NON_PRINCIPAL_COL (c) ;
-	/* order c later, in order_children() */
-	Col [c].shared2.order = COLAMD_EMPTY ;
-	/* remove c from hash bucket */
-	Col [prev_c].shared4.hash_next = Col [c].shared4.hash_next ;
+      for (c = Col[super_c].shared4.hash_next; c != Empty; c = Col[c].shared4.hash_next) {
+        COLAMD_ASSERT(c != super_c);
+        COLAMD_ASSERT(Col[c].is_alive());
+        COLAMD_ASSERT(Col[c].shared3.hash == hash);
+
+        /* not identical if lengths or scores are different */
+        if (Col[c].length != length || Col[c].shared2.score != Col[super_c].shared2.score) {
+          prev_c = c;
+          continue;
+        }
+
+        /* compare the two columns */
+        cp1 = &A[Col[super_c].start];
+        cp2 = &A[Col[c].start];
+
+        for (i = 0; i < length; i++) {
+          /* the columns are "clean" (no dead rows) */
+          COLAMD_ASSERT(cp1->is_alive());
+          COLAMD_ASSERT(cp2->is_alive());
+          /* row indices will same order for both supercols, */
+          /* no gather scatter necessary */
+          if (*cp1++ != *cp2++) {
+            break;
+          }
+        }
+
+        /* the two columns are different if the for-loop "broke" */
+        if (i != length) {
+          prev_c = c;
+          continue;
+        }
+
+        /* === Got it!  two columns are identical =================== */
+
+        COLAMD_ASSERT(Col[c].shared2.score == Col[super_c].shared2.score);
+
+        Col[super_c].shared1.thickness += Col[c].shared1.thickness;
+        Col[c].shared1.parent = super_c;
+        Col[c].kill_non_principal();
+        /* order c later, in order_children() */
+        Col[c].shared2.order = Empty;
+        /* remove c from hash bucket */
+        Col[prev_c].shared4.hash_next = Col[c].shared4.hash_next;
       }
     }
 
     /* === Empty this hash bucket ======================================= */
 
-    if (head_column > COLAMD_EMPTY)
-    {
+    if (head_column > Empty) {
       /* corresponding degree list "hash" is not empty */
-      Col [head_column].shared3.headhash = COLAMD_EMPTY ;
-    }
-    else
-    {
+      Col[head_column].shared3.headhash = Empty;
+    } else {
       /* corresponding degree list "hash" is empty */
-      head [hash] = COLAMD_EMPTY ;
+      head[hash] = Empty;
     }
   }
 }
 
-
 /* ========================================================================== */
 /* === garbage_collection =================================================== */
 /* ========================================================================== */
 
 /*
   Defragments and compacts columns and rows in the workspace A.  Used when
-  all avaliable memory has been used while performing row merging.  Returns
+  all available memory has been used while performing row merging.  Returns
   the index of the first free position in A, after garbage collection.  The
   time taken by this routine is linear is the size of the array A, which is
   itself linear in the number of nonzeros in the input matrix.
   Not user-callable.
 */
-template <typename Index>
-static Index garbage_collection  /* returns the new value of pfree */
-  (
-    /* === Parameters ======================================================= */
-    
-    Index n_row,      /* number of rows */
-    Index n_col,      /* number of columns */
-    Colamd_Row<Index> Row [],    /* row info */
-    colamd_col<Index> Col [],    /* column info */
-    Index A [],     /* A [0 ... Alen-1] holds the matrix */
-    Index *pfree      /* &A [0] ... pfree is in use */
-    )
-{
+template <typename IndexType>
+static IndexType garbage_collection /* returns the new value of pfree */
+    (
+        /* === Parameters ======================================================= */
+
+        IndexType n_row,               /* number of rows */
+        IndexType n_col,               /* number of columns */
+        RowStructure<IndexType> Row[], /* row info */
+        ColStructure<IndexType> Col[], /* column info */
+        IndexType A[],                 /* A [0 ... Alen-1] holds the matrix */
+        IndexType *pfree               /* &A [0] ... pfree is in use */
+    ) {
   /* === Local variables ================================================== */
 
-  Index *psrc ;     /* source pointer */
-  Index *pdest ;    /* destination pointer */
-  Index j ;     /* counter */
-  Index r ;     /* a row index */
-  Index c ;     /* a column index */
-  Index length ;    /* length of a row or column */
+  IndexType *psrc;  /* source pointer */
+  IndexType *pdest; /* destination pointer */
+  IndexType j;      /* counter */
+  IndexType r;      /* a row index */
+  IndexType c;      /* a column index */
+  IndexType length; /* length of a row or column */
 
   /* === Defragment the columns =========================================== */
 
-  pdest = &A[0] ;
-  for (c = 0 ; c < n_col ; c++)
-  {
-    if (COL_IS_ALIVE (c))
-    {
-      psrc = &A [Col [c].start] ;
+  pdest = &A[0];
+  for (c = 0; c < n_col; c++) {
+    if (Col[c].is_alive()) {
+      psrc = &A[Col[c].start];
 
       /* move and compact the column */
-      COLAMD_ASSERT (pdest <= psrc) ;
-      Col [c].start = (Index) (pdest - &A [0]) ;
-      length = Col [c].length ;
-      for (j = 0 ; j < length ; j++)
-      {
-	r = *psrc++ ;
-	if (ROW_IS_ALIVE (r))
-	{
-	  *pdest++ = r ;
-	}
+      COLAMD_ASSERT(pdest <= psrc);
+      Col[c].start = (IndexType)(pdest - &A[0]);
+      length = Col[c].length;
+      for (j = 0; j < length; j++) {
+        r = *psrc++;
+        if (Row[r].is_alive()) {
+          *pdest++ = r;
+        }
       }
-      Col [c].length = (Index) (pdest - &A [Col [c].start]) ;
+      Col[c].length = (IndexType)(pdest - &A[Col[c].start]);
     }
   }
 
   /* === Prepare to defragment the rows =================================== */
 
-  for (r = 0 ; r < n_row ; r++)
-  {
-    if (ROW_IS_ALIVE (r))
-    {
-      if (Row [r].length == 0)
-      {
-	/* this row is of zero length.  cannot compact it, so kill it */
-	COLAMD_DEBUG3 (("Defrag row kill\n")) ;
-	KILL_ROW (r) ;
-      }
-      else
-      {
-	/* save first column index in Row [r].shared2.first_column */
-	psrc = &A [Row [r].start] ;
-	Row [r].shared2.first_column = *psrc ;
-	COLAMD_ASSERT (ROW_IS_ALIVE (r)) ;
-	/* flag the start of the row with the one's complement of row */
-	*psrc = ONES_COMPLEMENT (r) ;
-
+  for (r = 0; r < n_row; r++) {
+    if (Row[r].is_alive()) {
+      if (Row[r].length == 0) {
+        /* this row is of zero length.  cannot compact it, so kill it */
+        COLAMD_DEBUG3(("Defrag row kill\n"));
+        Row[r].kill();
+      } else {
+        /* save first column index in Row [r].shared2.first_column */
+        psrc = &A[Row[r].start];
+        Row[r].shared2.first_column = *psrc;
+        COLAMD_ASSERT(Row[r].is_alive());
+        /* flag the start of the row with the one's complement of row */
+        *psrc = ones_complement(r);
       }
     }
   }
 
   /* === Defragment the rows ============================================== */
 
-  psrc = pdest ;
-  while (psrc < pfree)
-  {
+  psrc = pdest;
+  while (psrc < pfree) {
     /* find a negative number ... the start of a row */
-    if (*psrc++ < 0)
-    {
-      psrc-- ;
+    if (*psrc++ < 0) {
+      psrc--;
       /* get the row index */
-      r = ONES_COMPLEMENT (*psrc) ;
-      COLAMD_ASSERT (r >= 0 && r < n_row) ;
+      r = ones_complement(*psrc);
+      COLAMD_ASSERT(r >= 0 && r < n_row);
       /* restore first column index */
-      *psrc = Row [r].shared2.first_column ;
-      COLAMD_ASSERT (ROW_IS_ALIVE (r)) ;
+      *psrc = Row[r].shared2.first_column;
+      COLAMD_ASSERT(Row[r].is_alive());
 
       /* move and compact the row */
-      COLAMD_ASSERT (pdest <= psrc) ;
-      Row [r].start = (Index) (pdest - &A [0]) ;
-      length = Row [r].length ;
-      for (j = 0 ; j < length ; j++)
-      {
-	c = *psrc++ ;
-	if (COL_IS_ALIVE (c))
-	{
-	  *pdest++ = c ;
-	}
+      COLAMD_ASSERT(pdest <= psrc);
+      Row[r].start = (IndexType)(pdest - &A[0]);
+      length = Row[r].length;
+      for (j = 0; j < length; j++) {
+        c = *psrc++;
+        if (Col[c].is_alive()) {
+          *pdest++ = c;
+        }
       }
-      Row [r].length = (Index) (pdest - &A [Row [r].start]) ;
-
+      Row[r].length = (IndexType)(pdest - &A[Row[r].start]);
     }
   }
   /* ensure we found all the rows */
-  COLAMD_ASSERT (debug_rows == 0) ;
+  COLAMD_ASSERT(debug_rows == 0);
 
   /* === Return the new value of pfree ==================================== */
 
-  return ((Index) (pdest - &A [0])) ;
+  return ((IndexType)(pdest - &A[0]));
 }
 
-
 /* ========================================================================== */
 /* === clear_mark =========================================================== */
 /* ========================================================================== */
@@ -1822,29 +1664,27 @@ static Index garbage_collection  /* returns the new value of pfree */
   Clears the Row [].shared2.mark array, and returns the new tag_mark.
   Return value is the new tag_mark.  Not user-callable.
 */
-template <typename Index>
-static inline  Index clear_mark  /* return the new value for tag_mark */
-  (
-      /* === Parameters ======================================================= */
-
-    Index n_row,    /* number of rows in A */
-    Colamd_Row<Index> Row [] /* Row [0 ... n_row-1].shared2.mark is set to zero */
-    )
-{
+template <typename IndexType>
+static inline IndexType clear_mark /* return the new value for tag_mark */
+    (
+        /* === Parameters ======================================================= */
+
+        IndexType n_row,              /* number of rows in A */
+        RowStructure<IndexType> Row[] /* Row [0 ... n_row-1].shared2.mark is set to zero */
+    ) {
   /* === Local variables ================================================== */
 
-  Index r ;
+  IndexType r;
 
-  for (r = 0 ; r < n_row ; r++)
-  {
-    if (ROW_IS_ALIVE (r))
-    {
-      Row [r].shared2.mark = 0 ;
+  for (r = 0; r < n_row; r++) {
+    if (Row[r].is_alive()) {
+      Row[r].shared2.mark = 0;
     }
   }
-  return (1) ;
+  return (1);
 }
 
-
-} // namespace internal 
+}  // namespace Colamd
+}  // namespace internal
+}  // namespace Eigen
 #endif
diff --git a/inst/include/Eigen/src/OrderingMethods/InternalHeaderCheck.h b/inst/include/Eigen/src/OrderingMethods/InternalHeaderCheck.h
new file mode 100644
index 00000000..713c4479
--- /dev/null
+++ b/inst/include/Eigen/src/OrderingMethods/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_ORDERINGMETHODS_MODULE_H
+#error "Please include Eigen/OrderingMethods instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/OrderingMethods/Ordering.h b/inst/include/Eigen/src/OrderingMethods/Ordering.h
index f3c31f9c..1a650077 100644
--- a/inst/include/Eigen/src/OrderingMethods/Ordering.h
+++ b/inst/include/Eigen/src/OrderingMethods/Ordering.h
@@ -1,4 +1,4 @@
- 
+
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
@@ -11,144 +11,136 @@
 #ifndef EIGEN_ORDERING_H
 #define EIGEN_ORDERING_H
 
-namespace Eigen {
-  
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 #include "Eigen_Colamd.h"
 
+namespace Eigen {
 namespace internal {
-    
+
 /** \internal
-  * \ingroup OrderingMethods_Module
-  * \returns the symmetric pattern A^T+A from the input matrix A. 
-  * FIXME: The values should not be considered here
-  */
-template<typename MatrixType> 
-void ordering_helper_at_plus_a(const MatrixType& mat, MatrixType& symmat)
-{
+ * \ingroup OrderingMethods_Module
+ * \param[in] A the input non-symmetric matrix
+ * \param[out] symmat the symmetric pattern A^T+A from the input matrix \a A.
+ * FIXME: The values should not be considered here
+ */
+template <typename MatrixType>
+void ordering_helper_at_plus_a(const MatrixType& A, MatrixType& symmat) {
   MatrixType C;
-  C = mat.transpose(); // NOTE: Could be  costly
-  for (int i = 0; i < C.rows(); i++) 
-  {
-      for (typename MatrixType::InnerIterator it(C, i); it; ++it)
-        it.valueRef() = 0.0;
+  C = A.transpose();  // NOTE: Could be  costly
+  for (int i = 0; i < C.rows(); i++) {
+    for (typename MatrixType::InnerIterator it(C, i); it; ++it) it.valueRef() = typename MatrixType::Scalar(0);
   }
-  symmat = C + mat; 
-}
-    
+  symmat = C + A;
 }
 
-#ifndef EIGEN_MPL2_ONLY
+}  // namespace internal
 
 /** \ingroup OrderingMethods_Module
-  * \class AMDOrdering
-  *
-  * Functor computing the \em approximate \em minimum \em degree ordering
-  * If the matrix is not structurally symmetric, an ordering of A^T+A is computed
-  * \tparam  Index The type of indices of the matrix 
-  * \sa COLAMDOrdering
-  */
-template <typename Index>
-class AMDOrdering
-{
-  public:
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
-    
-    /** Compute the permutation vector from a sparse matrix
-     * This routine is much faster if the input matrix is column-major     
-     */
-    template <typename MatrixType>
-    void operator()(const MatrixType& mat, PermutationType& perm)
-    {
-      // Compute the symmetric pattern
-      SparseMatrix<typename MatrixType::Scalar, ColMajor, Index> symm;
-      internal::ordering_helper_at_plus_a(mat,symm); 
-    
-      // Call the AMD routine 
-      //m_mat.prune(keep_diag());
-      internal::minimum_degree_ordering(symm, perm);
-    }
-    
-    /** Compute the permutation with a selfadjoint matrix */
-    template <typename SrcType, unsigned int SrcUpLo> 
-    void operator()(const SparseSelfAdjointView<SrcType, SrcUpLo>& mat, PermutationType& perm)
-    { 
-      SparseMatrix<typename SrcType::Scalar, ColMajor, Index> C; C = mat;
-      
-      // Call the AMD routine 
-      // m_mat.prune(keep_diag()); //Remove the diagonal elements 
-      internal::minimum_degree_ordering(C, perm);
-    }
-};
+ * \class AMDOrdering
+ *
+ * Functor computing the \em approximate \em minimum \em degree ordering
+ * If the matrix is not structurally symmetric, an ordering of A^T+A is computed
+ * \tparam  StorageIndex The type of indices of the matrix
+ * \sa COLAMDOrdering
+ */
+template <typename StorageIndex>
+class AMDOrdering {
+ public:
+  typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType;
+
+  /** Compute the permutation vector from a sparse matrix
+   * This routine is much faster if the input matrix is column-major
+   */
+  template <typename MatrixType>
+  void operator()(const MatrixType& mat, PermutationType& perm) {
+    // Compute the symmetric pattern
+    SparseMatrix<typename MatrixType::Scalar, ColMajor, StorageIndex> symm;
+    internal::ordering_helper_at_plus_a(mat, symm);
 
-#endif // EIGEN_MPL2_ONLY
+    // Call the AMD routine
+    // m_mat.prune(keep_diag());
+    internal::minimum_degree_ordering(symm, perm);
+  }
+
+  /** Compute the permutation with a selfadjoint matrix */
+  template <typename SrcType, unsigned int SrcUpLo>
+  void operator()(const SparseSelfAdjointView<SrcType, SrcUpLo>& mat, PermutationType& perm) {
+    SparseMatrix<typename SrcType::Scalar, ColMajor, StorageIndex> C;
+    C = mat;
+
+    // Call the AMD routine
+    // m_mat.prune(keep_diag()); //Remove the diagonal elements
+    internal::minimum_degree_ordering(C, perm);
+  }
+};
 
 /** \ingroup OrderingMethods_Module
-  * \class NaturalOrdering
-  *
-  * Functor computing the natural ordering (identity)
-  * 
-  * \note Returns an empty permutation matrix
-  * \tparam  Index The type of indices of the matrix 
-  */
-template <typename Index>
-class NaturalOrdering
-{
-  public:
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
-    
-    /** Compute the permutation vector from a column-major sparse matrix */
-    template <typename MatrixType>
-    void operator()(const MatrixType& /*mat*/, PermutationType& perm)
-    {
-      perm.resize(0); 
-    }
-    
+ * \class NaturalOrdering
+ *
+ * Functor computing the natural ordering (identity)
+ *
+ * \note Returns an empty permutation matrix
+ * \tparam  StorageIndex The type of indices of the matrix
+ */
+template <typename StorageIndex>
+class NaturalOrdering {
+ public:
+  typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType;
+
+  /** Compute the permutation vector from a column-major sparse matrix */
+  template <typename MatrixType>
+  void operator()(const MatrixType& /*mat*/, PermutationType& perm) {
+    perm.resize(0);
+  }
 };
 
 /** \ingroup OrderingMethods_Module
-  * \class COLAMDOrdering
-  *
-  * Functor computing the \em column \em approximate \em minimum \em degree ordering 
-  * The matrix should be in column-major and \b compressed format (see SparseMatrix::makeCompressed()).
-  */
-template<typename Index>
-class COLAMDOrdering
-{
-  public:
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType; 
-    typedef Matrix<Index, Dynamic, 1> IndexVector;
-    
-    /** Compute the permutation vector \a perm form the sparse matrix \a mat
-      * \warning The input sparse matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
-      */
-    template <typename MatrixType>
-    void operator() (const MatrixType& mat, PermutationType& perm)
-    {
-      eigen_assert(mat.isCompressed() && "COLAMDOrdering requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to COLAMDOrdering");
-      
-      Index m = mat.rows();
-      Index n = mat.cols();
-      Index nnz = mat.nonZeros();
-      // Get the recommended value of Alen to be used by colamd
-      Index Alen = internal::colamd_recommended(nnz, m, n); 
-      // Set the default parameters
-      double knobs [COLAMD_KNOBS]; 
-      Index stats [COLAMD_STATS];
-      internal::colamd_set_defaults(knobs);
-      
-      IndexVector p(n+1), A(Alen); 
-      for(Index i=0; i <= n; i++)   p(i) = mat.outerIndexPtr()[i];
-      for(Index i=0; i < nnz; i++)  A(i) = mat.innerIndexPtr()[i];
-      // Call Colamd routine to compute the ordering 
-      Index info = internal::colamd(m, n, Alen, A.data(), p.data(), knobs, stats); 
-      EIGEN_UNUSED_VARIABLE(info);
-      eigen_assert( info && "COLAMD failed " );
-      
-      perm.resize(n);
-      for (Index i = 0; i < n; i++) perm.indices()(p(i)) = i;
-    }
+ * \class COLAMDOrdering
+ *
+ * \tparam  StorageIndex The type of indices of the matrix
+ *
+ * Functor computing the \em column \em approximate \em minimum \em degree ordering
+ * The matrix should be in column-major and \b compressed format (see SparseMatrix::makeCompressed()).
+ */
+template <typename StorageIndex>
+class COLAMDOrdering {
+ public:
+  typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType;
+  typedef Matrix<StorageIndex, Dynamic, 1> IndexVector;
+
+  /** Compute the permutation vector \a perm form the sparse matrix \a mat
+   * \warning The input sparse matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
+   */
+  template <typename MatrixType>
+  void operator()(const MatrixType& mat, PermutationType& perm) {
+    eigen_assert(mat.isCompressed() &&
+                 "COLAMDOrdering requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it "
+                 "to COLAMDOrdering");
+
+    StorageIndex m = StorageIndex(mat.rows());
+    StorageIndex n = StorageIndex(mat.cols());
+    StorageIndex nnz = StorageIndex(mat.nonZeros());
+    // Get the recommended value of Alen to be used by colamd
+    StorageIndex Alen = internal::Colamd::recommended(nnz, m, n);
+    // Set the default parameters
+    double knobs[internal::Colamd::NKnobs];
+    StorageIndex stats[internal::Colamd::NStats];
+    internal::Colamd::set_defaults(knobs);
+
+    IndexVector p(n + 1), A(Alen);
+    for (StorageIndex i = 0; i <= n; i++) p(i) = mat.outerIndexPtr()[i];
+    for (StorageIndex i = 0; i < nnz; i++) A(i) = mat.innerIndexPtr()[i];
+    // Call Colamd routine to compute the ordering
+    StorageIndex info = internal::Colamd::compute_ordering(m, n, Alen, A.data(), p.data(), knobs, stats);
+    EIGEN_UNUSED_VARIABLE(info);
+    eigen_assert(info && "COLAMD failed ");
+
+    perm.resize(n);
+    for (StorageIndex i = 0; i < n; i++) perm.indices()(p(i)) = i;
+  }
 };
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
 #endif
diff --git a/inst/include/Eigen/src/PaStiXSupport/InternalHeaderCheck.h b/inst/include/Eigen/src/PaStiXSupport/InternalHeaderCheck.h
new file mode 100644
index 00000000..f588e502
--- /dev/null
+++ b/inst/include/Eigen/src/PaStiXSupport/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_PASTIXSUPPORT_MODULE_H
+#error "Please include Eigen/PaStiXSupport instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/PaStiXSupport/PaStiXSupport.h b/inst/include/Eigen/src/PaStiXSupport/PaStiXSupport.h
index a955287d..fb751331 100644
--- a/inst/include/Eigen/src/PaStiXSupport/PaStiXSupport.h
+++ b/inst/include/Eigen/src/PaStiXSupport/PaStiXSupport.h
@@ -10,372 +10,339 @@
 #ifndef EIGEN_PASTIXSUPPORT_H
 #define EIGEN_PASTIXSUPPORT_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+#if defined(DCOMPLEX)
+#define PASTIX_COMPLEX COMPLEX
+#define PASTIX_DCOMPLEX DCOMPLEX
+#else
+#define PASTIX_COMPLEX std::complex<float>
+#define PASTIX_DCOMPLEX std::complex<double>
+#endif
 
 /** \ingroup PaStiXSupport_Module
-  * \brief Interface to the PaStix solver
-  * 
-  * This class is used to solve the linear systems A.X = B via the PaStix library. 
-  * The matrix can be either real or complex, symmetric or not.
-  *
-  * \sa TutorialSparseDirectSolvers
-  */
-template<typename _MatrixType, bool IsStrSym = false> class PastixLU;
-template<typename _MatrixType, int Options> class PastixLLT;
-template<typename _MatrixType, int Options> class PastixLDLT;
-
-namespace internal
-{
-    
-  template<class Pastix> struct pastix_traits;
-
-  template<typename _MatrixType>
-  struct pastix_traits< PastixLU<_MatrixType> >
-  {
-    typedef _MatrixType MatrixType;
-    typedef typename _MatrixType::Scalar Scalar;
-    typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
-  };
-
-  template<typename _MatrixType, int Options>
-  struct pastix_traits< PastixLLT<_MatrixType,Options> >
-  {
-    typedef _MatrixType MatrixType;
-    typedef typename _MatrixType::Scalar Scalar;
-    typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
-  };
-
-  template<typename _MatrixType, int Options>
-  struct pastix_traits< PastixLDLT<_MatrixType,Options> >
-  {
-    typedef _MatrixType MatrixType;
-    typedef typename _MatrixType::Scalar Scalar;
-    typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
-  };
-  
-  void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm)
-  {
-    if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
-    if (nbrhs == 0) {x = NULL; nbrhs=1;}
-    s_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm); 
-  }
-  
-  void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, double *vals, int *perm, int * invp, double *x, int nbrhs, int *iparm, double *dparm)
-  {
-    if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
-    if (nbrhs == 0) {x = NULL; nbrhs=1;}
-    d_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm); 
-  }
-  
-  void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex<float> *vals, int *perm, int * invp, std::complex<float> *x, int nbrhs, int *iparm, double *dparm)
-  {
-    if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
-    if (nbrhs == 0) {x = NULL; nbrhs=1;}
-    c_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast<COMPLEX*>(vals), perm, invp, reinterpret_cast<COMPLEX*>(x), nbrhs, iparm, dparm); 
-  }
-  
-  void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex<double> *vals, int *perm, int * invp, std::complex<double> *x, int nbrhs, int *iparm, double *dparm)
-  {
-    if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; }
-    if (nbrhs == 0) {x = NULL; nbrhs=1;}
-    z_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast<DCOMPLEX*>(vals), perm, invp, reinterpret_cast<DCOMPLEX*>(x), nbrhs, iparm, dparm); 
-  }
-
-  // Convert the matrix  to Fortran-style Numbering
-  template <typename MatrixType>
-  void c_to_fortran_numbering (MatrixType& mat)
-  {
-    if ( !(mat.outerIndexPtr()[0]) ) 
-    { 
-      int i;
-      for(i = 0; i <= mat.rows(); ++i)
-        ++mat.outerIndexPtr()[i];
-      for(i = 0; i < mat.nonZeros(); ++i)
-        ++mat.innerIndexPtr()[i];
-    }
+ * \brief Interface to the PaStix solver
+ *
+ * This class is used to solve the linear systems A.X = B via the PaStix library.
+ * The matrix can be either real or complex, symmetric or not.
+ *
+ * \sa TutorialSparseDirectSolvers
+ */
+template <typename MatrixType_, bool IsStrSym = false>
+class PastixLU;
+template <typename MatrixType_, int Options>
+class PastixLLT;
+template <typename MatrixType_, int Options>
+class PastixLDLT;
+
+namespace internal {
+
+template <class Pastix>
+struct pastix_traits;
+
+template <typename MatrixType_>
+struct pastix_traits<PastixLU<MatrixType_> > {
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType_::Scalar Scalar;
+  typedef typename MatrixType_::RealScalar RealScalar;
+  typedef typename MatrixType_::StorageIndex StorageIndex;
+};
+
+template <typename MatrixType_, int Options>
+struct pastix_traits<PastixLLT<MatrixType_, Options> > {
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType_::Scalar Scalar;
+  typedef typename MatrixType_::RealScalar RealScalar;
+  typedef typename MatrixType_::StorageIndex StorageIndex;
+};
+
+template <typename MatrixType_, int Options>
+struct pastix_traits<PastixLDLT<MatrixType_, Options> > {
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType_::Scalar Scalar;
+  typedef typename MatrixType_::RealScalar RealScalar;
+  typedef typename MatrixType_::StorageIndex StorageIndex;
+};
+
+inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals,
+                         int *perm, int *invp, float *x, int nbrhs, int *iparm, double *dparm) {
+  if (n == 0) {
+    ptr = NULL;
+    idx = NULL;
+    vals = NULL;
   }
-  
-  // Convert to C-style Numbering
-  template <typename MatrixType>
-  void fortran_to_c_numbering (MatrixType& mat)
-  {
-    // Check the Numbering
-    if ( mat.outerIndexPtr()[0] == 1 ) 
-    { // Convert to C-style numbering
-      int i;
-      for(i = 0; i <= mat.rows(); ++i)
-        --mat.outerIndexPtr()[i];
-      for(i = 0; i < mat.nonZeros(); ++i)
-        --mat.innerIndexPtr()[i];
-    }
+  if (nbrhs == 0) {
+    x = NULL;
+    nbrhs = 1;
+  }
+  s_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm);
+}
+
+inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, double *vals,
+                         int *perm, int *invp, double *x, int nbrhs, int *iparm, double *dparm) {
+  if (n == 0) {
+    ptr = NULL;
+    idx = NULL;
+    vals = NULL;
   }
+  if (nbrhs == 0) {
+    x = NULL;
+    nbrhs = 1;
+  }
+  d_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm);
+}
+
+inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx,
+                         std::complex<float> *vals, int *perm, int *invp, std::complex<float> *x, int nbrhs, int *iparm,
+                         double *dparm) {
+  if (n == 0) {
+    ptr = NULL;
+    idx = NULL;
+    vals = NULL;
+  }
+  if (nbrhs == 0) {
+    x = NULL;
+    nbrhs = 1;
+  }
+  c_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast<PASTIX_COMPLEX *>(vals), perm, invp,
+           reinterpret_cast<PASTIX_COMPLEX *>(x), nbrhs, iparm, dparm);
+}
+
+inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx,
+                         std::complex<double> *vals, int *perm, int *invp, std::complex<double> *x, int nbrhs,
+                         int *iparm, double *dparm) {
+  if (n == 0) {
+    ptr = NULL;
+    idx = NULL;
+    vals = NULL;
+  }
+  if (nbrhs == 0) {
+    x = NULL;
+    nbrhs = 1;
+  }
+  z_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast<PASTIX_DCOMPLEX *>(vals), perm, invp,
+           reinterpret_cast<PASTIX_DCOMPLEX *>(x), nbrhs, iparm, dparm);
 }
 
-// This is the base class to interface with PaStiX functions. 
-// Users should not used this class directly. 
+// Convert the matrix  to Fortran-style Numbering
+template <typename MatrixType>
+void c_to_fortran_numbering(MatrixType &mat) {
+  if (!(mat.outerIndexPtr()[0])) {
+    int i;
+    for (i = 0; i <= mat.rows(); ++i) ++mat.outerIndexPtr()[i];
+    for (i = 0; i < mat.nonZeros(); ++i) ++mat.innerIndexPtr()[i];
+  }
+}
+
+// Convert to C-style Numbering
+template <typename MatrixType>
+void fortran_to_c_numbering(MatrixType &mat) {
+  // Check the Numbering
+  if (mat.outerIndexPtr()[0] == 1) {  // Convert to C-style numbering
+    int i;
+    for (i = 0; i <= mat.rows(); ++i) --mat.outerIndexPtr()[i];
+    for (i = 0; i < mat.nonZeros(); ++i) --mat.innerIndexPtr()[i];
+  }
+}
+}  // namespace internal
+
+// This is the base class to interface with PaStiX functions.
+// Users should not used this class directly.
 template <class Derived>
-class PastixBase : internal::noncopyable
-{
-  public:
-    typedef typename internal::pastix_traits<Derived>::MatrixType _MatrixType;
-    typedef _MatrixType MatrixType;
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef Matrix<Scalar,Dynamic,1> Vector;
-    typedef SparseMatrix<Scalar, ColMajor> ColSpMatrix;
-    
-  public:
-    
-    PastixBase() : m_initisOk(false), m_analysisIsOk(false), m_factorizationIsOk(false), m_isInitialized(false), m_pastixdata(0), m_size(0)
-    {
-      init();
-    }
-    
-    ~PastixBase() 
-    {
-      clean();
-    }
+class PastixBase : public SparseSolverBase<Derived> {
+ protected:
+  typedef SparseSolverBase<Derived> Base;
+  using Base::derived;
+  using Base::m_isInitialized;
+
+ public:
+  using Base::_solve_impl;
+
+  typedef typename internal::pastix_traits<Derived>::MatrixType MatrixType_;
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef Matrix<Scalar, Dynamic, 1> Vector;
+  typedef SparseMatrix<Scalar, ColMajor> ColSpMatrix;
+  enum { ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime };
+
+ public:
+  PastixBase() : m_initisOk(false), m_analysisIsOk(false), m_factorizationIsOk(false), m_pastixdata(0), m_size(0) {
+    init();
+  }
 
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<PastixBase, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "Pastix solver is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "PastixBase::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<PastixBase, Rhs>(*this, b.derived());
-    }
-    
-    template<typename Rhs,typename Dest>
-    bool _solve (const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const;
-    
-    Derived& derived()
-    {
-      return *static_cast<Derived*>(this);
-    }
-    const Derived& derived() const
-    {
-      return *static_cast<const Derived*>(this);
-    }
+  ~PastixBase() { clean(); }
 
-    /** Returns a reference to the integer vector IPARM of PaStiX parameters
-      * to modify the default parameters. 
-      * The statistics related to the different phases of factorization and solve are saved here as well
-      * \sa analyzePattern() factorize()
-      */
-    Array<Index,IPARM_SIZE,1>& iparm()
-    {
-      return m_iparm; 
-    }
-    
-    /** Return a reference to a particular index parameter of the IPARM vector 
-     * \sa iparm()
-     */
-    
-    int& iparm(int idxparam)
-    {
-      return m_iparm(idxparam);
-    }
-    
-     /** Returns a reference to the double vector DPARM of PaStiX parameters 
-      * The statistics related to the different phases of factorization and solve are saved here as well
-      * \sa analyzePattern() factorize()
-      */
-    Array<RealScalar,IPARM_SIZE,1>& dparm()
-    {
-      return m_dparm; 
-    }
-    
-    
-    /** Return a reference to a particular index parameter of the DPARM vector 
-     * \sa dparm()
-     */
-    double& dparm(int idxparam)
-    {
-      return m_dparm(idxparam);
-    }
-    
-    inline Index cols() const { return m_size; }
-    inline Index rows() const { return m_size; }
-    
-     /** \brief Reports whether previous computation was successful.
-      *
-      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the PaStiX reports a problem
-      *          \c InvalidInput if the input matrix is invalid
-      *
-      * \sa iparm()          
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
-      return m_info;
-    }
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<PastixBase, Rhs>
-    solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "Pastix LU, LLT or LDLT is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "PastixBase::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<PastixBase, Rhs>(*this, b.derived());
-    }
-    
-  protected:
-
-    // Initialize the Pastix data structure, check the matrix
-    void init(); 
-    
-    // Compute the ordering and the symbolic factorization
-    void analyzePattern(ColSpMatrix& mat);
-    
-    // Compute the numerical factorization
-    void factorize(ColSpMatrix& mat);
-    
-    // Free all the data allocated by Pastix
-    void clean()
-    {
-      eigen_assert(m_initisOk && "The Pastix structure should be allocated first"); 
-      m_iparm(IPARM_START_TASK) = API_TASK_CLEAN;
-      m_iparm(IPARM_END_TASK) = API_TASK_CLEAN;
-      internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, 0, 0, 0, (Scalar*)0,
-                             m_perm.data(), m_invp.data(), 0, 0, m_iparm.data(), m_dparm.data());
-    }
-    
-    void compute(ColSpMatrix& mat);
-    
-    int m_initisOk; 
-    int m_analysisIsOk;
-    int m_factorizationIsOk;
-    bool m_isInitialized;
-    mutable ComputationInfo m_info; 
-    mutable pastix_data_t *m_pastixdata; // Data structure for pastix
-    mutable int m_comm; // The MPI communicator identifier
-    mutable Matrix<int,IPARM_SIZE,1> m_iparm; // integer vector for the input parameters
-    mutable Matrix<double,DPARM_SIZE,1> m_dparm; // Scalar vector for the input parameters
-    mutable Matrix<Index,Dynamic,1> m_perm;  // Permutation vector
-    mutable Matrix<Index,Dynamic,1> m_invp;  // Inverse permutation vector
-    mutable int m_size; // Size of the matrix 
-}; 
-
- /** Initialize the PaStiX data structure. 
-   *A first call to this function fills iparm and dparm with the default PaStiX parameters
-   * \sa iparm() dparm()
+  template <typename Rhs, typename Dest>
+  bool _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const;
+
+  /** Returns a reference to the integer vector IPARM of PaStiX parameters
+   * to modify the default parameters.
+   * The statistics related to the different phases of factorization and solve are saved here as well
+   * \sa analyzePattern() factorize()
+   */
+  Array<StorageIndex, IPARM_SIZE, 1> &iparm() { return m_iparm; }
+
+  /** Return a reference to a particular index parameter of the IPARM vector
+   * \sa iparm()
+   */
+
+  int &iparm(int idxparam) { return m_iparm(idxparam); }
+
+  /** Returns a reference to the double vector DPARM of PaStiX parameters
+   * The statistics related to the different phases of factorization and solve are saved here as well
+   * \sa analyzePattern() factorize()
+   */
+  Array<double, DPARM_SIZE, 1> &dparm() { return m_dparm; }
+
+  /** Return a reference to a particular index parameter of the DPARM vector
+   * \sa dparm()
    */
+  double &dparm(int idxparam) { return m_dparm(idxparam); }
+
+  inline Index cols() const { return m_size; }
+  inline Index rows() const { return m_size; }
+
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful,
+   *          \c NumericalIssue if the PaStiX reports a problem
+   *          \c InvalidInput if the input matrix is invalid
+   *
+   * \sa iparm()
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+    return m_info;
+  }
+
+ protected:
+  // Initialize the Pastix data structure, check the matrix
+  void init();
+
+  // Compute the ordering and the symbolic factorization
+  void analyzePattern(ColSpMatrix &mat);
+
+  // Compute the numerical factorization
+  void factorize(ColSpMatrix &mat);
+
+  // Free all the data allocated by Pastix
+  void clean() {
+    eigen_assert(m_initisOk && "The Pastix structure should be allocated first");
+    m_iparm(IPARM_START_TASK) = API_TASK_CLEAN;
+    m_iparm(IPARM_END_TASK) = API_TASK_CLEAN;
+    internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, 0, 0, 0, (Scalar *)0, m_perm.data(), m_invp.data(), 0, 0,
+                           m_iparm.data(), m_dparm.data());
+  }
+
+  void compute(ColSpMatrix &mat);
+
+  int m_initisOk;
+  int m_analysisIsOk;
+  int m_factorizationIsOk;
+  mutable ComputationInfo m_info;
+  mutable pastix_data_t *m_pastixdata;              // Data structure for pastix
+  mutable int m_comm;                               // The MPI communicator identifier
+  mutable Array<int, IPARM_SIZE, 1> m_iparm;        // integer vector for the input parameters
+  mutable Array<double, DPARM_SIZE, 1> m_dparm;     // Scalar vector for the input parameters
+  mutable Matrix<StorageIndex, Dynamic, 1> m_perm;  // Permutation vector
+  mutable Matrix<StorageIndex, Dynamic, 1> m_invp;  // Inverse permutation vector
+  mutable int m_size;                               // Size of the matrix
+};
+
+/** Initialize the PaStiX data structure.
+ *A first call to this function fills iparm and dparm with the default PaStiX parameters
+ * \sa iparm() dparm()
+ */
 template <class Derived>
-void PastixBase<Derived>::init()
-{
-  m_size = 0; 
+void PastixBase<Derived>::init() {
+  m_size = 0;
   m_iparm.setZero(IPARM_SIZE);
   m_dparm.setZero(DPARM_SIZE);
-  
+
   m_iparm(IPARM_MODIFY_PARAMETER) = API_NO;
-  pastix(&m_pastixdata, MPI_COMM_WORLD,
-         0, 0, 0, 0,
-         0, 0, 0, 1, m_iparm.data(), m_dparm.data());
-  
+  pastix(&m_pastixdata, MPI_COMM_WORLD, 0, 0, 0, 0, 0, 0, 0, 1, m_iparm.data(), m_dparm.data());
+
   m_iparm[IPARM_MATRIX_VERIFICATION] = API_NO;
-  m_iparm[IPARM_VERBOSE]             = 2;
-  m_iparm[IPARM_ORDERING]            = API_ORDER_SCOTCH;
-  m_iparm[IPARM_INCOMPLETE]          = API_NO;
-  m_iparm[IPARM_OOC_LIMIT]           = 2000;
-  m_iparm[IPARM_RHS_MAKING]          = API_RHS_B;
+  m_iparm[IPARM_VERBOSE] = API_VERBOSE_NOT;
+  m_iparm[IPARM_ORDERING] = API_ORDER_SCOTCH;
+  m_iparm[IPARM_INCOMPLETE] = API_NO;
+  m_iparm[IPARM_OOC_LIMIT] = 2000;
+  m_iparm[IPARM_RHS_MAKING] = API_RHS_B;
   m_iparm(IPARM_MATRIX_VERIFICATION) = API_NO;
-  
+
   m_iparm(IPARM_START_TASK) = API_TASK_INIT;
   m_iparm(IPARM_END_TASK) = API_TASK_INIT;
-  internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, 0, 0, 0, (Scalar*)0,
-                         0, 0, 0, 0, m_iparm.data(), m_dparm.data());
-  
+  internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, 0, 0, 0, (Scalar *)0, 0, 0, 0, 0, m_iparm.data(),
+                         m_dparm.data());
+
   // Check the returned error
-  if(m_iparm(IPARM_ERROR_NUMBER)) {
+  if (m_iparm(IPARM_ERROR_NUMBER)) {
     m_info = InvalidInput;
     m_initisOk = false;
-  }
-  else { 
+  } else {
     m_info = Success;
     m_initisOk = true;
   }
 }
 
 template <class Derived>
-void PastixBase<Derived>::compute(ColSpMatrix& mat)
-{
+void PastixBase<Derived>::compute(ColSpMatrix &mat) {
   eigen_assert(mat.rows() == mat.cols() && "The input matrix should be squared");
-  
-  analyzePattern(mat);  
+
+  analyzePattern(mat);
   factorize(mat);
-  
+
   m_iparm(IPARM_MATRIX_VERIFICATION) = API_NO;
-  m_isInitialized = m_factorizationIsOk;
 }
 
-
 template <class Derived>
-void PastixBase<Derived>::analyzePattern(ColSpMatrix& mat)
-{                         
+void PastixBase<Derived>::analyzePattern(ColSpMatrix &mat) {
   eigen_assert(m_initisOk && "The initialization of PaSTiX failed");
-  
+
   // clean previous calls
-  if(m_size>0)
-    clean();
-  
-  m_size = mat.rows();
+  if (m_size > 0) clean();
+
+  m_size = internal::convert_index<int>(mat.rows());
   m_perm.resize(m_size);
   m_invp.resize(m_size);
-  
+
   m_iparm(IPARM_START_TASK) = API_TASK_ORDERING;
   m_iparm(IPARM_END_TASK) = API_TASK_ANALYSE;
   internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, m_size, mat.outerIndexPtr(), mat.innerIndexPtr(),
-               mat.valuePtr(), m_perm.data(), m_invp.data(), 0, 0, m_iparm.data(), m_dparm.data());
-  
+                         mat.valuePtr(), m_perm.data(), m_invp.data(), 0, 0, m_iparm.data(), m_dparm.data());
+
   // Check the returned error
-  if(m_iparm(IPARM_ERROR_NUMBER))
-  {
+  if (m_iparm(IPARM_ERROR_NUMBER)) {
     m_info = NumericalIssue;
     m_analysisIsOk = false;
-  }
-  else
-  { 
+  } else {
     m_info = Success;
     m_analysisIsOk = true;
   }
 }
 
 template <class Derived>
-void PastixBase<Derived>::factorize(ColSpMatrix& mat)
-{
-//   if(&m_cpyMat != &mat) m_cpyMat = mat;
+void PastixBase<Derived>::factorize(ColSpMatrix &mat) {
+  //   if(&m_cpyMat != &mat) m_cpyMat = mat;
   eigen_assert(m_analysisIsOk && "The analysis phase should be called before the factorization phase");
   m_iparm(IPARM_START_TASK) = API_TASK_NUMFACT;
   m_iparm(IPARM_END_TASK) = API_TASK_NUMFACT;
-  m_size = mat.rows();
-  
+  m_size = internal::convert_index<int>(mat.rows());
+
   internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, m_size, mat.outerIndexPtr(), mat.innerIndexPtr(),
-               mat.valuePtr(), m_perm.data(), m_invp.data(), 0, 0, m_iparm.data(), m_dparm.data());
-  
+                         mat.valuePtr(), m_perm.data(), m_invp.data(), 0, 0, m_iparm.data(), m_dparm.data());
+
   // Check the returned error
-  if(m_iparm(IPARM_ERROR_NUMBER))
-  {
+  if (m_iparm(IPARM_ERROR_NUMBER)) {
     m_info = NumericalIssue;
     m_factorizationIsOk = false;
     m_isInitialized = false;
-  }
-  else
-  {
+  } else {
     m_info = Success;
     m_factorizationIsOk = true;
     m_isInitialized = true;
@@ -383,339 +350,283 @@ void PastixBase<Derived>::factorize(ColSpMatrix& mat)
 }
 
 /* Solve the system */
-template<typename Base>
-template<typename Rhs,typename Dest>
-bool PastixBase<Base>::_solve (const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const
-{
+template <typename Base>
+template <typename Rhs, typename Dest>
+bool PastixBase<Base>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const {
   eigen_assert(m_isInitialized && "The matrix should be factorized first");
-  EIGEN_STATIC_ASSERT((Dest::Flags&RowMajorBit)==0,
-                     THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+  EIGEN_STATIC_ASSERT((Dest::Flags & RowMajorBit) == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
   int rhs = 1;
-  
+
   x = b; /* on return, x is overwritten by the computed solution */
-  
-  for (int i = 0; i < b.cols(); i++){
-    m_iparm[IPARM_START_TASK]          = API_TASK_SOLVE;
-    m_iparm[IPARM_END_TASK]            = API_TASK_REFINE;
-  
-    internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, x.rows(), 0, 0, 0,
+
+  for (int i = 0; i < b.cols(); i++) {
+    m_iparm[IPARM_START_TASK] = API_TASK_SOLVE;
+    m_iparm[IPARM_END_TASK] = API_TASK_REFINE;
+
+    internal::eigen_pastix(&m_pastixdata, MPI_COMM_WORLD, internal::convert_index<int>(x.rows()), 0, 0, 0,
                            m_perm.data(), m_invp.data(), &x(0, i), rhs, m_iparm.data(), m_dparm.data());
   }
-  
+
   // Check the returned error
-  m_info = m_iparm(IPARM_ERROR_NUMBER)==0 ? Success : NumericalIssue;
-  
-  return m_iparm(IPARM_ERROR_NUMBER)==0;
+  m_info = m_iparm(IPARM_ERROR_NUMBER) == 0 ? Success : NumericalIssue;
+
+  return m_iparm(IPARM_ERROR_NUMBER) == 0;
 }
 
 /** \ingroup PaStiXSupport_Module
-  * \class PastixLU
-  * \brief Sparse direct LU solver based on PaStiX library
-  * 
-  * This class is used to solve the linear systems A.X = B with a supernodal LU 
-  * factorization in the PaStiX library. The matrix A should be squared and nonsingular
-  * PaStiX requires that the matrix A has a symmetric structural pattern. 
-  * This interface can symmetrize the input matrix otherwise. 
-  * The vectors or matrices X and B can be either dense or sparse.
-  * 
-  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam IsStrSym Indicates if the input matrix has a symmetric pattern, default is false
-  * NOTE : Note that if the analysis and factorization phase are called separately, 
-  * the input matrix will be symmetrized at each call, hence it is advised to 
-  * symmetrize the matrix in a end-user program and set \p IsStrSym to true
-  * 
-  * \sa \ref TutorialSparseDirectSolvers
-  * 
-  */
-template<typename _MatrixType, bool IsStrSym>
-class PastixLU : public PastixBase< PastixLU<_MatrixType> >
-{
-  public:
-    typedef _MatrixType MatrixType;
-    typedef PastixBase<PastixLU<MatrixType> > Base;
-    typedef typename Base::ColSpMatrix ColSpMatrix;
-    typedef typename MatrixType::Index Index;
-    
-  public:
-    PastixLU() : Base()
-    {
-      init();
-    }
-    
-    PastixLU(const MatrixType& matrix):Base()
-    {
-      init();
-      compute(matrix);
-    }
-    /** Compute the LU supernodal factorization of \p matrix. 
-      * iparm and dparm can be used to tune the PaStiX parameters. 
-      * see the PaStiX user's manual
-      * \sa analyzePattern() factorize()
-      */
-    void compute (const MatrixType& matrix)
-    {
-      m_structureIsUptodate = false;
-      ColSpMatrix temp;
-      grabMatrix(matrix, temp);
-      Base::compute(temp);
-    }
-    /** Compute the LU symbolic factorization of \p matrix using its sparsity pattern. 
-      * Several ordering methods can be used at this step. See the PaStiX user's manual. 
-      * The result of this operation can be used with successive matrices having the same pattern as \p matrix
-      * \sa factorize()
-      */
-    void analyzePattern(const MatrixType& matrix)
-    {
-      m_structureIsUptodate = false;
-      ColSpMatrix temp;
-      grabMatrix(matrix, temp);
-      Base::analyzePattern(temp);
-    }
+ * \class PastixLU
+ * \brief Sparse direct LU solver based on PaStiX library
+ *
+ * This class is used to solve the linear systems A.X = B with a supernodal LU
+ * factorization in the PaStiX library. The matrix A should be squared and nonsingular
+ * PaStiX requires that the matrix A has a symmetric structural pattern.
+ * This interface can symmetrize the input matrix otherwise.
+ * The vectors or matrices X and B can be either dense or sparse.
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam IsStrSym Indicates if the input matrix has a symmetric pattern, default is false
+ * NOTE : Note that if the analysis and factorization phase are called separately,
+ * the input matrix will be symmetrized at each call, hence it is advised to
+ * symmetrize the matrix in a end-user program and set \p IsStrSym to true
+ *
+ * \implsparsesolverconcept
+ *
+ * \sa \ref TutorialSparseSolverConcept, class SparseLU
+ *
+ */
+template <typename MatrixType_, bool IsStrSym>
+class PastixLU : public PastixBase<PastixLU<MatrixType_> > {
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef PastixBase<PastixLU<MatrixType> > Base;
+  typedef typename Base::ColSpMatrix ColSpMatrix;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+
+ public:
+  PastixLU() : Base() { init(); }
+
+  explicit PastixLU(const MatrixType &matrix) : Base() {
+    init();
+    compute(matrix);
+  }
+  /** Compute the LU supernodal factorization of \p matrix.
+   * iparm and dparm can be used to tune the PaStiX parameters.
+   * see the PaStiX user's manual
+   * \sa analyzePattern() factorize()
+   */
+  void compute(const MatrixType &matrix) {
+    m_structureIsUptodate = false;
+    ColSpMatrix temp;
+    grabMatrix(matrix, temp);
+    Base::compute(temp);
+  }
+  /** Compute the LU symbolic factorization of \p matrix using its sparsity pattern.
+   * Several ordering methods can be used at this step. See the PaStiX user's manual.
+   * The result of this operation can be used with successive matrices having the same pattern as \p matrix
+   * \sa factorize()
+   */
+  void analyzePattern(const MatrixType &matrix) {
+    m_structureIsUptodate = false;
+    ColSpMatrix temp;
+    grabMatrix(matrix, temp);
+    Base::analyzePattern(temp);
+  }
 
-    /** Compute the LU supernodal factorization of \p matrix
-      * WARNING The matrix \p matrix should have the same structural pattern 
-      * as the same used in the analysis phase.
-      * \sa analyzePattern()
-      */ 
-    void factorize(const MatrixType& matrix)
-    {
-      ColSpMatrix temp;
-      grabMatrix(matrix, temp);
-      Base::factorize(temp);
-    }
-  protected:
-    
-    void init()
-    {
-      m_structureIsUptodate = false;
-      m_iparm(IPARM_SYM) = API_SYM_NO;
-      m_iparm(IPARM_FACTORIZATION) = API_FACT_LU;
-    }
-    
-    void grabMatrix(const MatrixType& matrix, ColSpMatrix& out)
-    {
-      if(IsStrSym)
-        out = matrix;
-      else
-      {
-        if(!m_structureIsUptodate)
-        {
-          // update the transposed structure
-          m_transposedStructure = matrix.transpose();
-          
-          // Set the elements of the matrix to zero 
-          for (Index j=0; j<m_transposedStructure.outerSize(); ++j) 
-            for(typename ColSpMatrix::InnerIterator it(m_transposedStructure, j); it; ++it)
-              it.valueRef() = 0.0;
-
-          m_structureIsUptodate = true;
-        }
-        
-        out = m_transposedStructure + matrix;
+  /** Compute the LU supernodal factorization of \p matrix
+   * WARNING The matrix \p matrix should have the same structural pattern
+   * as the same used in the analysis phase.
+   * \sa analyzePattern()
+   */
+  void factorize(const MatrixType &matrix) {
+    ColSpMatrix temp;
+    grabMatrix(matrix, temp);
+    Base::factorize(temp);
+  }
+
+ protected:
+  void init() {
+    m_structureIsUptodate = false;
+    m_iparm(IPARM_SYM) = API_SYM_NO;
+    m_iparm(IPARM_FACTORIZATION) = API_FACT_LU;
+  }
+
+  void grabMatrix(const MatrixType &matrix, ColSpMatrix &out) {
+    if (IsStrSym)
+      out = matrix;
+    else {
+      if (!m_structureIsUptodate) {
+        // update the transposed structure
+        m_transposedStructure = matrix.transpose();
+
+        // Set the elements of the matrix to zero
+        for (Index j = 0; j < m_transposedStructure.outerSize(); ++j)
+          for (typename ColSpMatrix::InnerIterator it(m_transposedStructure, j); it; ++it) it.valueRef() = 0.0;
+
+        m_structureIsUptodate = true;
       }
-      internal::c_to_fortran_numbering(out);
-    }
-    
-    using Base::m_iparm;
-    using Base::m_dparm;
-    
-    ColSpMatrix m_transposedStructure;
-    bool m_structureIsUptodate;
-};
 
-/** \ingroup PaStiXSupport_Module
-  * \class PastixLLT
-  * \brief A sparse direct supernodal Cholesky (LLT) factorization and solver based on the PaStiX library
-  * 
-  * This class is used to solve the linear systems A.X = B via a LL^T supernodal Cholesky factorization
-  * available in the PaStiX library. The matrix A should be symmetric and positive definite
-  * WARNING Selfadjoint complex matrices are not supported in the current version of PaStiX
-  * The vectors or matrices X and B can be either dense or sparse
-  * 
-  * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam UpLo The part of the matrix to use : Lower or Upper. The default is Lower as required by PaStiX
-  * 
-  * \sa \ref TutorialSparseDirectSolvers
-  */
-template<typename _MatrixType, int _UpLo>
-class PastixLLT : public PastixBase< PastixLLT<_MatrixType, _UpLo> >
-{
-  public:
-    typedef _MatrixType MatrixType;
-    typedef PastixBase<PastixLLT<MatrixType, _UpLo> > Base;
-    typedef typename Base::ColSpMatrix ColSpMatrix;
-    
-  public:
-    enum { UpLo = _UpLo };
-    PastixLLT() : Base()
-    {
-      init();
-    }
-    
-    PastixLLT(const MatrixType& matrix):Base()
-    {
-      init();
-      compute(matrix);
+      out = m_transposedStructure + matrix;
     }
+    internal::c_to_fortran_numbering(out);
+  }
 
-    /** Compute the L factor of the LL^T supernodal factorization of \p matrix 
-      * \sa analyzePattern() factorize()
-      */
-    void compute (const MatrixType& matrix)
-    {
-      ColSpMatrix temp;
-      grabMatrix(matrix, temp);
-      Base::compute(temp);
-    }
+  using Base::m_dparm;
+  using Base::m_iparm;
 
-     /** Compute the LL^T symbolic factorization of \p matrix using its sparsity pattern
-      * The result of this operation can be used with successive matrices having the same pattern as \p matrix
-      * \sa factorize()
-      */
-    void analyzePattern(const MatrixType& matrix)
-    {
-      ColSpMatrix temp;
-      grabMatrix(matrix, temp);
-      Base::analyzePattern(temp);
-    }
-      /** Compute the LL^T supernodal numerical factorization of \p matrix 
-        * \sa analyzePattern()
-        */
-    void factorize(const MatrixType& matrix)
-    {
-      ColSpMatrix temp;
-      grabMatrix(matrix, temp);
-      Base::factorize(temp);
-    }
-  protected:
-    using Base::m_iparm;
-    
-    void init()
-    {
-      m_iparm(IPARM_SYM) = API_SYM_YES;
-      m_iparm(IPARM_FACTORIZATION) = API_FACT_LLT;
-    }
-    
-    void grabMatrix(const MatrixType& matrix, ColSpMatrix& out)
-    {
-      // Pastix supports only lower, column-major matrices 
-      out.template selfadjointView<Lower>() = matrix.template selfadjointView<UpLo>();
-      internal::c_to_fortran_numbering(out);
-    }
+  ColSpMatrix m_transposedStructure;
+  bool m_structureIsUptodate;
 };
 
 /** \ingroup PaStiXSupport_Module
-  * \class PastixLDLT
-  * \brief A sparse direct supernodal Cholesky (LLT) factorization and solver based on the PaStiX library
-  * 
-  * This class is used to solve the linear systems A.X = B via a LDL^T supernodal Cholesky factorization
-  * available in the PaStiX library. The matrix A should be symmetric and positive definite
-  * WARNING Selfadjoint complex matrices are not supported in the current version of PaStiX
-  * The vectors or matrices X and B can be either dense or sparse
-  * 
-  * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam UpLo The part of the matrix to use : Lower or Upper. The default is Lower as required by PaStiX
-  * 
-  * \sa \ref TutorialSparseDirectSolvers
-  */
-template<typename _MatrixType, int _UpLo>
-class PastixLDLT : public PastixBase< PastixLDLT<_MatrixType, _UpLo> >
-{
-  public:
-    typedef _MatrixType MatrixType;
-    typedef PastixBase<PastixLDLT<MatrixType, _UpLo> > Base; 
-    typedef typename Base::ColSpMatrix ColSpMatrix;
-    
-  public:
-    enum { UpLo = _UpLo };
-    PastixLDLT():Base()
-    {
-      init();
-    }
-    
-    PastixLDLT(const MatrixType& matrix):Base()
-    {
-      init();
-      compute(matrix);
-    }
+ * \class PastixLLT
+ * \brief A sparse direct supernodal Cholesky (LLT) factorization and solver based on the PaStiX library
+ *
+ * This class is used to solve the linear systems A.X = B via a LL^T supernodal Cholesky factorization
+ * available in the PaStiX library. The matrix A should be symmetric and positive definite
+ * WARNING Selfadjoint complex matrices are not supported in the current version of PaStiX
+ * The vectors or matrices X and B can be either dense or sparse
+ *
+ * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo The part of the matrix to use : Lower or Upper. The default is Lower as required by PaStiX
+ *
+ * \implsparsesolverconcept
+ *
+ * \sa \ref TutorialSparseSolverConcept, class SimplicialLLT
+ */
+template <typename MatrixType_, int UpLo_>
+class PastixLLT : public PastixBase<PastixLLT<MatrixType_, UpLo_> > {
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef PastixBase<PastixLLT<MatrixType, UpLo_> > Base;
+  typedef typename Base::ColSpMatrix ColSpMatrix;
+
+ public:
+  enum { UpLo = UpLo_ };
+  PastixLLT() : Base() { init(); }
+
+  explicit PastixLLT(const MatrixType &matrix) : Base() {
+    init();
+    compute(matrix);
+  }
 
-    /** Compute the L and D factors of the LDL^T factorization of \p matrix 
-      * \sa analyzePattern() factorize()
-      */
-    void compute (const MatrixType& matrix)
-    {
-      ColSpMatrix temp;
-      grabMatrix(matrix, temp);
-      Base::compute(temp);
-    }
+  /** Compute the L factor of the LL^T supernodal factorization of \p matrix
+   * \sa analyzePattern() factorize()
+   */
+  void compute(const MatrixType &matrix) {
+    ColSpMatrix temp;
+    grabMatrix(matrix, temp);
+    Base::compute(temp);
+  }
 
-    /** Compute the LDL^T symbolic factorization of \p matrix using its sparsity pattern
-      * The result of this operation can be used with successive matrices having the same pattern as \p matrix
-      * \sa factorize()
-      */
-    void analyzePattern(const MatrixType& matrix)
-    { 
-      ColSpMatrix temp;
-      grabMatrix(matrix, temp);
-      Base::analyzePattern(temp);
-    }
-    /** Compute the LDL^T supernodal numerical factorization of \p matrix 
-      * 
-      */
-    void factorize(const MatrixType& matrix)
-    {
-      ColSpMatrix temp;
-      grabMatrix(matrix, temp);
-      Base::factorize(temp);
-    }
+  /** Compute the LL^T symbolic factorization of \p matrix using its sparsity pattern
+   * The result of this operation can be used with successive matrices having the same pattern as \p matrix
+   * \sa factorize()
+   */
+  void analyzePattern(const MatrixType &matrix) {
+    ColSpMatrix temp;
+    grabMatrix(matrix, temp);
+    Base::analyzePattern(temp);
+  }
+  /** Compute the LL^T supernodal numerical factorization of \p matrix
+   * \sa analyzePattern()
+   */
+  void factorize(const MatrixType &matrix) {
+    ColSpMatrix temp;
+    grabMatrix(matrix, temp);
+    Base::factorize(temp);
+  }
 
-  protected:
-    using Base::m_iparm;
-    
-    void init()
-    {
-      m_iparm(IPARM_SYM) = API_SYM_YES;
-      m_iparm(IPARM_FACTORIZATION) = API_FACT_LDLT;
-    }
-    
-    void grabMatrix(const MatrixType& matrix, ColSpMatrix& out)
-    {
-      // Pastix supports only lower, column-major matrices 
-      out.template selfadjointView<Lower>() = matrix.template selfadjointView<UpLo>();
-      internal::c_to_fortran_numbering(out);
-    }
+ protected:
+  using Base::m_iparm;
+
+  void init() {
+    m_iparm(IPARM_SYM) = API_SYM_YES;
+    m_iparm(IPARM_FACTORIZATION) = API_FACT_LLT;
+  }
+
+  void grabMatrix(const MatrixType &matrix, ColSpMatrix &out) {
+    out.resize(matrix.rows(), matrix.cols());
+    // Pastix supports only lower, column-major matrices
+    out.template selfadjointView<Lower>() = matrix.template selfadjointView<UpLo>();
+    internal::c_to_fortran_numbering(out);
+  }
 };
 
-namespace internal {
+/** \ingroup PaStiXSupport_Module
+ * \class PastixLDLT
+ * \brief A sparse direct supernodal Cholesky (LLT) factorization and solver based on the PaStiX library
+ *
+ * This class is used to solve the linear systems A.X = B via a LDL^T supernodal Cholesky factorization
+ * available in the PaStiX library. The matrix A should be symmetric and positive definite
+ * WARNING Selfadjoint complex matrices are not supported in the current version of PaStiX
+ * The vectors or matrices X and B can be either dense or sparse
+ *
+ * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo The part of the matrix to use : Lower or Upper. The default is Lower as required by PaStiX
+ *
+ * \implsparsesolverconcept
+ *
+ * \sa \ref TutorialSparseSolverConcept, class SimplicialLDLT
+ */
+template <typename MatrixType_, int UpLo_>
+class PastixLDLT : public PastixBase<PastixLDLT<MatrixType_, UpLo_> > {
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef PastixBase<PastixLDLT<MatrixType, UpLo_> > Base;
+  typedef typename Base::ColSpMatrix ColSpMatrix;
+
+ public:
+  enum { UpLo = UpLo_ };
+  PastixLDLT() : Base() { init(); }
+
+  explicit PastixLDLT(const MatrixType &matrix) : Base() {
+    init();
+    compute(matrix);
+  }
 
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<PastixBase<_MatrixType>, Rhs>
-  : solve_retval_base<PastixBase<_MatrixType>, Rhs>
-{
-  typedef PastixBase<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
+  /** Compute the L and D factors of the LDL^T factorization of \p matrix
+   * \sa analyzePattern() factorize()
+   */
+  void compute(const MatrixType &matrix) {
+    ColSpMatrix temp;
+    grabMatrix(matrix, temp);
+    Base::compute(temp);
+  }
 
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
+  /** Compute the LDL^T symbolic factorization of \p matrix using its sparsity pattern
+   * The result of this operation can be used with successive matrices having the same pattern as \p matrix
+   * \sa factorize()
+   */
+  void analyzePattern(const MatrixType &matrix) {
+    ColSpMatrix temp;
+    grabMatrix(matrix, temp);
+    Base::analyzePattern(temp);
+  }
+  /** Compute the LDL^T supernodal numerical factorization of \p matrix
+   *
+   */
+  void factorize(const MatrixType &matrix) {
+    ColSpMatrix temp;
+    grabMatrix(matrix, temp);
+    Base::factorize(temp);
   }
-};
 
-template<typename _MatrixType, typename Rhs>
-struct sparse_solve_retval<PastixBase<_MatrixType>, Rhs>
-  : sparse_solve_retval_base<PastixBase<_MatrixType>, Rhs>
-{
-  typedef PastixBase<_MatrixType> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
+ protected:
+  using Base::m_iparm;
 
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
+  void init() {
+    m_iparm(IPARM_SYM) = API_SYM_YES;
+    m_iparm(IPARM_FACTORIZATION) = API_FACT_LDLT;
   }
-};
 
-} // end namespace internal
+  void grabMatrix(const MatrixType &matrix, ColSpMatrix &out) {
+    // Pastix supports only lower, column-major matrices
+    out.resize(matrix.rows(), matrix.cols());
+    out.template selfadjointView<Lower>() = matrix.template selfadjointView<UpLo>();
+    internal::c_to_fortran_numbering(out);
+  }
+};
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
 #endif
diff --git a/inst/include/Eigen/src/PardisoSupport/InternalHeaderCheck.h b/inst/include/Eigen/src/PardisoSupport/InternalHeaderCheck.h
new file mode 100644
index 00000000..8ef33f05
--- /dev/null
+++ b/inst/include/Eigen/src/PardisoSupport/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_PARDISOSUPPORT_MODULE_H
+#error "Please include Eigen/PardisoSupport instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/PardisoSupport/PardisoSupport.h b/inst/include/Eigen/src/PardisoSupport/PardisoSupport.h
index 18cd7d88..2f5d83ee 100644
--- a/inst/include/Eigen/src/PardisoSupport/PardisoSupport.h
+++ b/inst/include/Eigen/src/PardisoSupport/PardisoSupport.h
@@ -32,561 +32,469 @@
 #ifndef EIGEN_PARDISOSUPPORT_H
 #define EIGEN_PARDISOSUPPORT_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-template<typename _MatrixType> class PardisoLU;
-template<typename _MatrixType, int Options=Upper> class PardisoLLT;
-template<typename _MatrixType, int Options=Upper> class PardisoLDLT;
+namespace Eigen {
 
-namespace internal
-{
-  template<typename Index>
-  struct pardiso_run_selector
-  {
-    static Index run( _MKL_DSS_HANDLE_t pt, Index maxfct, Index mnum, Index type, Index phase, Index n, void *a,
-                      Index *ia, Index *ja, Index *perm, Index nrhs, Index *iparm, Index msglvl, void *b, void *x)
-    {
-      Index error = 0;
-      ::pardiso(pt, &maxfct, &mnum, &type, &phase, &n, a, ia, ja, perm, &nrhs, iparm, &msglvl, b, x, &error);
-      return error;
-    }
-  };
-  template<>
-  struct pardiso_run_selector<long long int>
-  {
-    typedef long long int Index;
-    static Index run( _MKL_DSS_HANDLE_t pt, Index maxfct, Index mnum, Index type, Index phase, Index n, void *a,
-                      Index *ia, Index *ja, Index *perm, Index nrhs, Index *iparm, Index msglvl, void *b, void *x)
-    {
-      Index error = 0;
-      ::pardiso_64(pt, &maxfct, &mnum, &type, &phase, &n, a, ia, ja, perm, &nrhs, iparm, &msglvl, b, x, &error);
-      return error;
-    }
-  };
+template <typename MatrixType_>
+class PardisoLU;
+template <typename MatrixType_, int Options = Upper>
+class PardisoLLT;
+template <typename MatrixType_, int Options = Upper>
+class PardisoLDLT;
 
-  template<class Pardiso> struct pardiso_traits;
-
-  template<typename _MatrixType>
-  struct pardiso_traits< PardisoLU<_MatrixType> >
-  {
-    typedef _MatrixType MatrixType;
-    typedef typename _MatrixType::Scalar Scalar;
-    typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
-  };
-
-  template<typename _MatrixType, int Options>
-  struct pardiso_traits< PardisoLLT<_MatrixType, Options> >
-  {
-    typedef _MatrixType MatrixType;
-    typedef typename _MatrixType::Scalar Scalar;
-    typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;
-  };
-
-  template<typename _MatrixType, int Options>
-  struct pardiso_traits< PardisoLDLT<_MatrixType, Options> >
-  {
-    typedef _MatrixType MatrixType;
-    typedef typename _MatrixType::Scalar Scalar;
-    typedef typename _MatrixType::RealScalar RealScalar;
-    typedef typename _MatrixType::Index Index;    
-  };
+namespace internal {
+template <typename IndexType>
+struct pardiso_run_selector {
+  static IndexType run(_MKL_DSS_HANDLE_t pt, IndexType maxfct, IndexType mnum, IndexType type, IndexType phase,
+                       IndexType n, void* a, IndexType* ia, IndexType* ja, IndexType* perm, IndexType nrhs,
+                       IndexType* iparm, IndexType msglvl, void* b, void* x) {
+    IndexType error = 0;
+    ::pardiso(pt, &maxfct, &mnum, &type, &phase, &n, a, ia, ja, perm, &nrhs, iparm, &msglvl, b, x, &error);
+    return error;
+  }
+};
+template <>
+struct pardiso_run_selector<long long int> {
+  typedef long long int IndexType;
+  static IndexType run(_MKL_DSS_HANDLE_t pt, IndexType maxfct, IndexType mnum, IndexType type, IndexType phase,
+                       IndexType n, void* a, IndexType* ia, IndexType* ja, IndexType* perm, IndexType nrhs,
+                       IndexType* iparm, IndexType msglvl, void* b, void* x) {
+    IndexType error = 0;
+    ::pardiso_64(pt, &maxfct, &mnum, &type, &phase, &n, a, ia, ja, perm, &nrhs, iparm, &msglvl, b, x, &error);
+    return error;
+  }
+};
 
-}
+template <class Pardiso>
+struct pardiso_traits;
 
-template<class Derived>
-class PardisoImpl
-{
-    typedef internal::pardiso_traits<Derived> Traits;
-  public:
-    typedef typename Traits::MatrixType MatrixType;
-    typedef typename Traits::Scalar Scalar;
-    typedef typename Traits::RealScalar RealScalar;
-    typedef typename Traits::Index Index;
-    typedef SparseMatrix<Scalar,RowMajor,Index> SparseMatrixType;
-    typedef Matrix<Scalar,Dynamic,1> VectorType;
-    typedef Matrix<Index, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
-    typedef Matrix<Index, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
-    typedef Array<Index,64,1,DontAlign> ParameterType;
-    enum {
-      ScalarIsComplex = NumTraits<Scalar>::IsComplex
-    };
-
-    PardisoImpl()
-    {
-      eigen_assert((sizeof(Index) >= sizeof(_INTEGER_t) && sizeof(Index) <= 8) && "Non-supported index type");
-      m_iparm.setZero();
-      m_msglvl = 0; // No output
-      m_initialized = false;
-    }
+template <typename MatrixType_>
+struct pardiso_traits<PardisoLU<MatrixType_> > {
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType_::Scalar Scalar;
+  typedef typename MatrixType_::RealScalar RealScalar;
+  typedef typename MatrixType_::StorageIndex StorageIndex;
+};
 
-    ~PardisoImpl()
-    {
-      pardisoRelease();
-    }
+template <typename MatrixType_, int Options>
+struct pardiso_traits<PardisoLLT<MatrixType_, Options> > {
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType_::Scalar Scalar;
+  typedef typename MatrixType_::RealScalar RealScalar;
+  typedef typename MatrixType_::StorageIndex StorageIndex;
+};
 
-    inline Index cols() const { return m_size; }
-    inline Index rows() const { return m_size; }
-  
-    /** \brief Reports whether previous computation was successful.
-      *
-      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the matrix appears to be negative.
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_initialized && "Decomposition is not initialized.");
-      return m_info;
-    }
+template <typename MatrixType_, int Options>
+struct pardiso_traits<PardisoLDLT<MatrixType_, Options> > {
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType_::Scalar Scalar;
+  typedef typename MatrixType_::RealScalar RealScalar;
+  typedef typename MatrixType_::StorageIndex StorageIndex;
+};
 
-    /** \warning for advanced usage only.
-      * \returns a reference to the parameter array controlling PARDISO.
-      * See the PARDISO manual to know how to use it. */
-    ParameterType& pardisoParameterArray()
-    {
-      return m_iparm;
-    }
-    
-    /** Performs a symbolic decomposition on the sparcity of \a matrix.
-      *
-      * This function is particularly useful when solving for several problems having the same structure.
-      * 
-      * \sa factorize()
-      */
-    Derived& analyzePattern(const MatrixType& matrix);
-    
-    /** Performs a numeric decomposition of \a matrix
-      *
-      * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
-      *
-      * \sa analyzePattern()
-      */
-    Derived& factorize(const MatrixType& matrix);
-
-    Derived& compute(const MatrixType& matrix);
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<PardisoImpl, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_initialized && "Pardiso solver is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "PardisoImpl::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<PardisoImpl, Rhs>(*this, b.derived());
-    }
+}  // end namespace internal
+
+template <class Derived>
+class PardisoImpl : public SparseSolverBase<Derived> {
+ protected:
+  typedef SparseSolverBase<Derived> Base;
+  using Base::derived;
+  using Base::m_isInitialized;
+
+  typedef internal::pardiso_traits<Derived> Traits;
+
+ public:
+  using Base::_solve_impl;
+
+  typedef typename Traits::MatrixType MatrixType;
+  typedef typename Traits::Scalar Scalar;
+  typedef typename Traits::RealScalar RealScalar;
+  typedef typename Traits::StorageIndex StorageIndex;
+  typedef SparseMatrix<Scalar, RowMajor, StorageIndex> SparseMatrixType;
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+  typedef Matrix<StorageIndex, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
+  typedef Matrix<StorageIndex, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
+  typedef Array<StorageIndex, 64, 1, DontAlign> ParameterType;
+  enum { ScalarIsComplex = NumTraits<Scalar>::IsComplex, ColsAtCompileTime = Dynamic, MaxColsAtCompileTime = Dynamic };
+
+  PardisoImpl() : m_analysisIsOk(false), m_factorizationIsOk(false) {
+    eigen_assert((sizeof(StorageIndex) >= sizeof(_INTEGER_t) && sizeof(StorageIndex) <= 8) &&
+                 "Non-supported index type");
+    m_iparm.setZero();
+    m_msglvl = 0;  // No output
+    m_isInitialized = false;
+  }
 
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<PardisoImpl, Rhs>
-    solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_initialized && "Pardiso solver is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "PardisoImpl::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<PardisoImpl, Rhs>(*this, b.derived());
-    }
+  ~PardisoImpl() { pardisoRelease(); }
 
-    Derived& derived()
-    {
-      return *static_cast<Derived*>(this);
-    }
-    const Derived& derived() const
-    {
-      return *static_cast<const Derived*>(this);
-    }
+  inline Index cols() const { return m_size; }
+  inline Index rows() const { return m_size; }
 
-    template<typename BDerived, typename XDerived>
-    bool _solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>& x) const;
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful,
+   *          \c NumericalIssue if the matrix appears to be negative.
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+    return m_info;
+  }
 
-  protected:
-    void pardisoRelease()
+  /** \warning for advanced usage only.
+   * \returns a reference to the parameter array controlling PARDISO.
+   * See the PARDISO manual to know how to use it. */
+  ParameterType& pardisoParameterArray() { return m_iparm; }
+
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
+   *
+   * This function is particularly useful when solving for several problems having the same structure.
+   *
+   * \sa factorize()
+   */
+  Derived& analyzePattern(const MatrixType& matrix);
+
+  /** Performs a numeric decomposition of \a matrix
+   *
+   * The given matrix must have the same sparsity than the matrix on which the symbolic decomposition has been
+   * performed.
+   *
+   * \sa analyzePattern()
+   */
+  Derived& factorize(const MatrixType& matrix);
+
+  Derived& compute(const MatrixType& matrix);
+
+  template <typename Rhs, typename Dest>
+  void _solve_impl(const MatrixBase<Rhs>& b, MatrixBase<Dest>& dest) const;
+
+ protected:
+  void pardisoRelease() {
+    if (m_isInitialized)  // Factorization ran at least once
     {
-      if(m_initialized) // Factorization ran at least once
-      {
-        internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, -1, m_size, 0, 0, 0, m_perm.data(), 0,
-                                                   m_iparm.data(), m_msglvl, 0, 0);
-      }
+      internal::pardiso_run_selector<StorageIndex>::run(m_pt, 1, 1, m_type, -1,
+                                                        internal::convert_index<StorageIndex>(m_size), 0, 0, 0,
+                                                        m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
+      m_isInitialized = false;
     }
+  }
 
-    void pardisoInit(int type)
-    {
-      m_type = type;
-      bool symmetric = std::abs(m_type) < 10;
-      m_iparm[0] = 1;   // No solver default
-      m_iparm[1] = 3;   // use Metis for the ordering
-      m_iparm[2] = 1;   // Numbers of processors, value of OMP_NUM_THREADS
-      m_iparm[3] = 0;   // No iterative-direct algorithm
-      m_iparm[4] = 0;   // No user fill-in reducing permutation
-      m_iparm[5] = 0;   // Write solution into x
-      m_iparm[6] = 0;   // Not in use
-      m_iparm[7] = 2;   // Max numbers of iterative refinement steps
-      m_iparm[8] = 0;   // Not in use
-      m_iparm[9] = 13;  // Perturb the pivot elements with 1E-13
-      m_iparm[10] = symmetric ? 0 : 1; // Use nonsymmetric permutation and scaling MPS
-      m_iparm[11] = 0;  // Not in use
-      m_iparm[12] = symmetric ? 0 : 1;  // Maximum weighted matching algorithm is switched-off (default for symmetric).
-                                        // Try m_iparm[12] = 1 in case of inappropriate accuracy
-      m_iparm[13] = 0;  // Output: Number of perturbed pivots
-      m_iparm[14] = 0;  // Not in use
-      m_iparm[15] = 0;  // Not in use
-      m_iparm[16] = 0;  // Not in use
-      m_iparm[17] = -1; // Output: Number of nonzeros in the factor LU
-      m_iparm[18] = -1; // Output: Mflops for LU factorization
-      m_iparm[19] = 0;  // Output: Numbers of CG Iterations
-      
-      m_iparm[20] = 0;  // 1x1 pivoting
-      m_iparm[26] = 0;  // No matrix checker
-      m_iparm[27] = (sizeof(RealScalar) == 4) ? 1 : 0;
-      m_iparm[34] = 1;  // C indexing
-      m_iparm[59] = 1;  // Automatic switch between In-Core and Out-of-Core modes
-    }
+  void pardisoInit(int type) {
+    m_type = type;
+    bool symmetric = std::abs(m_type) < 10;
+    m_iparm[0] = 1;                   // No solver default
+    m_iparm[1] = 2;                   // use Metis for the ordering
+    m_iparm[2] = 0;                   // Reserved. Set to zero. (??Numbers of processors, value of OMP_NUM_THREADS??)
+    m_iparm[3] = 0;                   // No iterative-direct algorithm
+    m_iparm[4] = 0;                   // No user fill-in reducing permutation
+    m_iparm[5] = 0;                   // Write solution into x, b is left unchanged
+    m_iparm[6] = 0;                   // Not in use
+    m_iparm[7] = 2;                   // Max numbers of iterative refinement steps
+    m_iparm[8] = 0;                   // Not in use
+    m_iparm[9] = 13;                  // Perturb the pivot elements with 1E-13
+    m_iparm[10] = symmetric ? 0 : 1;  // Use nonsymmetric permutation and scaling MPS
+    m_iparm[11] = 0;                  // Not in use
+    m_iparm[12] = symmetric ? 0 : 1;  // Maximum weighted matching algorithm is switched-off (default for symmetric).
+                                      // Try m_iparm[12] = 1 in case of inappropriate accuracy
+    m_iparm[13] = 0;                  // Output: Number of perturbed pivots
+    m_iparm[14] = 0;                  // Not in use
+    m_iparm[15] = 0;                  // Not in use
+    m_iparm[16] = 0;                  // Not in use
+    m_iparm[17] = -1;                 // Output: Number of nonzeros in the factor LU
+    m_iparm[18] = -1;                 // Output: Mflops for LU factorization
+    m_iparm[19] = 0;                  // Output: Numbers of CG Iterations
+
+    m_iparm[20] = 0;  // 1x1 pivoting
+    m_iparm[26] = 0;  // No matrix checker
+    m_iparm[27] = (sizeof(RealScalar) == 4) ? 1 : 0;
+    m_iparm[34] = 1;  // C indexing
+    m_iparm[36] = 0;  // CSR
+    m_iparm[59] = 0;  // 0 - In-Core ; 1 - Automatic switch between In-Core and Out-of-Core modes ; 2 - Out-of-Core
+
+    memset(m_pt, 0, sizeof(m_pt));
+  }
 
-  protected:
-    // cached data to reduce reallocation, etc.
-    
-    void manageErrorCode(Index error)
-    {
-      switch(error)
-      {
-        case 0:
-          m_info = Success;
-          break;
-        case -4:
-        case -7:
-          m_info = NumericalIssue;
-          break;
-        default:
-          m_info = InvalidInput;
-      }
+ protected:
+  // cached data to reduce reallocation, etc.
+
+  void manageErrorCode(Index error) const {
+    switch (error) {
+      case 0:
+        m_info = Success;
+        break;
+      case -4:
+      case -7:
+        m_info = NumericalIssue;
+        break;
+      default:
+        m_info = InvalidInput;
     }
+  }
 
-    mutable SparseMatrixType m_matrix;
-    ComputationInfo m_info;
-    bool m_initialized, m_analysisIsOk, m_factorizationIsOk;
-    Index m_type, m_msglvl;
-    mutable void *m_pt[64];
-    mutable ParameterType m_iparm;
-    mutable IntColVectorType m_perm;
-    Index m_size;
-    
-  private:
-    PardisoImpl(PardisoImpl &) {}
+  mutable SparseMatrixType m_matrix;
+  mutable ComputationInfo m_info;
+  bool m_analysisIsOk, m_factorizationIsOk;
+  StorageIndex m_type, m_msglvl;
+  mutable void* m_pt[64];
+  mutable ParameterType m_iparm;
+  mutable IntColVectorType m_perm;
+  Index m_size;
 };
 
-template<class Derived>
-Derived& PardisoImpl<Derived>::compute(const MatrixType& a)
-{
+template <class Derived>
+Derived& PardisoImpl<Derived>::compute(const MatrixType& a) {
   m_size = a.rows();
   eigen_assert(a.rows() == a.cols());
 
   pardisoRelease();
-  memset(m_pt, 0, sizeof(m_pt));
   m_perm.setZero(m_size);
   derived().getMatrix(a);
-  
-  Index error;
-  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 12, m_size,
-                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
-                                                     m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
 
+  Index error;
+  error = internal::pardiso_run_selector<StorageIndex>::run(
+      m_pt, 1, 1, m_type, 12, internal::convert_index<StorageIndex>(m_size), m_matrix.valuePtr(),
+      m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(), m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
   manageErrorCode(error);
-  m_analysisIsOk = true;
-  m_factorizationIsOk = true;
-  m_initialized = true;
+  m_analysisIsOk = m_info == Eigen::Success;
+  m_factorizationIsOk = m_info == Eigen::Success;
+  m_isInitialized = true;
   return derived();
 }
 
-template<class Derived>
-Derived& PardisoImpl<Derived>::analyzePattern(const MatrixType& a)
-{
+template <class Derived>
+Derived& PardisoImpl<Derived>::analyzePattern(const MatrixType& a) {
   m_size = a.rows();
   eigen_assert(m_size == a.cols());
 
   pardisoRelease();
-  memset(m_pt, 0, sizeof(m_pt));
   m_perm.setZero(m_size);
   derived().getMatrix(a);
-  
+
   Index error;
-  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 11, m_size,
-                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
-                                                     m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
-  
+  error = internal::pardiso_run_selector<StorageIndex>::run(
+      m_pt, 1, 1, m_type, 11, internal::convert_index<StorageIndex>(m_size), m_matrix.valuePtr(),
+      m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(), m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
+
   manageErrorCode(error);
-  m_analysisIsOk = true;
+  m_analysisIsOk = m_info == Eigen::Success;
   m_factorizationIsOk = false;
-  m_initialized = true;
+  m_isInitialized = true;
   return derived();
 }
 
-template<class Derived>
-Derived& PardisoImpl<Derived>::factorize(const MatrixType& a)
-{
+template <class Derived>
+Derived& PardisoImpl<Derived>::factorize(const MatrixType& a) {
   eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
   eigen_assert(m_size == a.rows() && m_size == a.cols());
-  
+
   derived().getMatrix(a);
 
-  Index error;  
-  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 22, m_size,
-                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
-                                                     m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
-  
+  Index error;
+  error = internal::pardiso_run_selector<StorageIndex>::run(
+      m_pt, 1, 1, m_type, 22, internal::convert_index<StorageIndex>(m_size), m_matrix.valuePtr(),
+      m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(), m_perm.data(), 0, m_iparm.data(), m_msglvl, NULL, NULL);
+
   manageErrorCode(error);
-  m_factorizationIsOk = true;
+  m_factorizationIsOk = m_info == Eigen::Success;
   return derived();
 }
 
-template<class Base>
-template<typename BDerived,typename XDerived>
-bool PardisoImpl<Base>::_solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>& x) const
-{
-  if(m_iparm[0] == 0) // Factorization was not computed
-    return false;
+template <class Derived>
+template <typename BDerived, typename XDerived>
+void PardisoImpl<Derived>::_solve_impl(const MatrixBase<BDerived>& b, MatrixBase<XDerived>& x) const {
+  if (m_iparm[0] == 0)  // Factorization was not computed
+  {
+    m_info = InvalidInput;
+    return;
+  }
 
-  //Index n = m_matrix.rows();
+  // Index n = m_matrix.rows();
   Index nrhs = Index(b.cols());
-  eigen_assert(m_size==b.rows());
-  eigen_assert(((MatrixBase<BDerived>::Flags & RowMajorBit) == 0 || nrhs == 1) && "Row-major right hand sides are not supported");
-  eigen_assert(((MatrixBase<XDerived>::Flags & RowMajorBit) == 0 || nrhs == 1) && "Row-major matrices of unknowns are not supported");
+  eigen_assert(m_size == b.rows());
+  eigen_assert(((MatrixBase<BDerived>::Flags & RowMajorBit) == 0 || nrhs == 1) &&
+               "Row-major right hand sides are not supported");
+  eigen_assert(((MatrixBase<XDerived>::Flags & RowMajorBit) == 0 || nrhs == 1) &&
+               "Row-major matrices of unknowns are not supported");
   eigen_assert(((nrhs == 1) || b.outerStride() == b.rows()));
 
-
-//  switch (transposed) {
-//    case SvNoTrans    : m_iparm[11] = 0 ; break;
-//    case SvTranspose  : m_iparm[11] = 2 ; break;
-//    case SvAdjoint    : m_iparm[11] = 1 ; break;
-//    default:
-//      //std::cerr << "Eigen: transposition  option \"" << transposed << "\" not supported by the PARDISO backend\n";
-//      m_iparm[11] = 0;
-//  }
+  //  switch (transposed) {
+  //    case SvNoTrans    : m_iparm[11] = 0 ; break;
+  //    case SvTranspose  : m_iparm[11] = 2 ; break;
+  //    case SvAdjoint    : m_iparm[11] = 1 ; break;
+  //    default:
+  //      //std::cerr << "Eigen: transposition  option \"" << transposed << "\" not supported by the PARDISO backend\n";
+  //      m_iparm[11] = 0;
+  //  }
 
   Scalar* rhs_ptr = const_cast<Scalar*>(b.derived().data());
-  Matrix<Scalar,Dynamic,Dynamic,ColMajor> tmp;
-  
+  Matrix<Scalar, Dynamic, Dynamic, ColMajor> tmp;
+
   // Pardiso cannot solve in-place
-  if(rhs_ptr == x.derived().data())
-  {
+  if (rhs_ptr == x.derived().data()) {
     tmp = b;
     rhs_ptr = tmp.data();
   }
-  
+
   Index error;
-  error = internal::pardiso_run_selector<Index>::run(m_pt, 1, 1, m_type, 33, m_size,
-                                                     m_matrix.valuePtr(), m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(),
-                                                     m_perm.data(), nrhs, m_iparm.data(), m_msglvl,
-                                                     rhs_ptr, x.derived().data());
+  error = internal::pardiso_run_selector<StorageIndex>::run(
+      m_pt, 1, 1, m_type, 33, internal::convert_index<StorageIndex>(m_size), m_matrix.valuePtr(),
+      m_matrix.outerIndexPtr(), m_matrix.innerIndexPtr(), m_perm.data(), internal::convert_index<StorageIndex>(nrhs),
+      m_iparm.data(), m_msglvl, rhs_ptr, x.derived().data());
 
-  return error==0;
+  manageErrorCode(error);
 }
 
-
 /** \ingroup PardisoSupport_Module
-  * \class PardisoLU
-  * \brief A sparse direct LU factorization and solver based on the PARDISO library
-  *
-  * This class allows to solve for A.X = B sparse linear problems via a direct LU factorization
-  * using the Intel MKL PARDISO library. The sparse matrix A must be squared and invertible.
-  * The vectors or matrices X and B can be either dense or sparse.
-  *
-  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
-  *
-  * \sa \ref TutorialSparseDirectSolvers
-  */
-template<typename MatrixType>
-class PardisoLU : public PardisoImpl< PardisoLU<MatrixType> >
-{
-  protected:
-    typedef PardisoImpl< PardisoLU<MatrixType> > Base;
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::RealScalar RealScalar;
-    using Base::pardisoInit;
-    using Base::m_matrix;
-    friend class PardisoImpl< PardisoLU<MatrixType> >;
-
-  public:
-
-    using Base::compute;
-    using Base::solve;
-
-    PardisoLU()
-      : Base()
-    {
-      pardisoInit(Base::ScalarIsComplex ? 13 : 11);
-    }
+ * \class PardisoLU
+ * \brief A sparse direct LU factorization and solver based on the PARDISO library
+ *
+ * This class allows to solve for A.X = B sparse linear problems via a direct LU factorization
+ * using the Intel MKL PARDISO library. The sparse matrix A must be squared and invertible.
+ * The vectors or matrices X and B can be either dense or sparse.
+ *
+ * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set:
+ * \code solver.pardisoParameterArray()[59] = 1; \endcode
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ *
+ * \implsparsesolverconcept
+ *
+ * \sa \ref TutorialSparseSolverConcept, class SparseLU
+ */
+template <typename MatrixType>
+class PardisoLU : public PardisoImpl<PardisoLU<MatrixType> > {
+ protected:
+  typedef PardisoImpl<PardisoLU> Base;
+  using Base::m_matrix;
+  using Base::pardisoInit;
+  friend class PardisoImpl<PardisoLU<MatrixType> >;
+
+ public:
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::RealScalar RealScalar;
+
+  using Base::compute;
+  using Base::solve;
+
+  PardisoLU() : Base() { pardisoInit(Base::ScalarIsComplex ? 13 : 11); }
+
+  explicit PardisoLU(const MatrixType& matrix) : Base() {
+    pardisoInit(Base::ScalarIsComplex ? 13 : 11);
+    compute(matrix);
+  }
 
-    PardisoLU(const MatrixType& matrix)
-      : Base()
-    {
-      pardisoInit(Base::ScalarIsComplex ? 13 : 11);
-      compute(matrix);
-    }
-  protected:
-    void getMatrix(const MatrixType& matrix)
-    {
-      m_matrix = matrix;
-    }
-    
-  private:
-    PardisoLU(PardisoLU& ) {}
+ protected:
+  void getMatrix(const MatrixType& matrix) {
+    m_matrix = matrix;
+    m_matrix.makeCompressed();
+  }
 };
 
 /** \ingroup PardisoSupport_Module
-  * \class PardisoLLT
-  * \brief A sparse direct Cholesky (LLT) factorization and solver based on the PARDISO library
-  *
-  * This class allows to solve for A.X = B sparse linear problems via a LL^T Cholesky factorization
-  * using the Intel MKL PARDISO library. The sparse matrix A must be selfajoint and positive definite.
-  * The vectors or matrices X and B can be either dense or sparse.
-  *
-  * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam UpLo can be any bitwise combination of Upper, Lower. The default is Upper, meaning only the upper triangular part has to be used.
-  *         Upper|Lower can be used to tell both triangular parts can be used as input.
-  *
-  * \sa \ref TutorialSparseDirectSolvers
-  */
-template<typename MatrixType, int _UpLo>
-class PardisoLLT : public PardisoImpl< PardisoLLT<MatrixType,_UpLo> >
-{
-  protected:
-    typedef PardisoImpl< PardisoLLT<MatrixType,_UpLo> > Base;
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::Index Index;
-    typedef typename Base::RealScalar RealScalar;
-    using Base::pardisoInit;
-    using Base::m_matrix;
-    friend class PardisoImpl< PardisoLLT<MatrixType,_UpLo> >;
-
-  public:
-
-    enum { UpLo = _UpLo };
-    using Base::compute;
-    using Base::solve;
-
-    PardisoLLT()
-      : Base()
-    {
-      pardisoInit(Base::ScalarIsComplex ? 4 : 2);
-    }
+ * \class PardisoLLT
+ * \brief A sparse direct Cholesky (LLT) factorization and solver based on the PARDISO library
+ *
+ * This class allows to solve for A.X = B sparse linear problems via a LL^T Cholesky factorization
+ * using the Intel MKL PARDISO library. The sparse matrix A must be selfajoint and positive definite.
+ * The vectors or matrices X and B can be either dense or sparse.
+ *
+ * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set:
+ * \code solver.pardisoParameterArray()[59] = 1; \endcode
+ *
+ * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo can be any bitwise combination of Upper, Lower. The default is Upper, meaning only the upper triangular
+ * part has to be used. Upper|Lower can be used to tell both triangular parts can be used as input.
+ *
+ * \implsparsesolverconcept
+ *
+ * \sa \ref TutorialSparseSolverConcept, class SimplicialLLT
+ */
+template <typename MatrixType, int UpLo_>
+class PardisoLLT : public PardisoImpl<PardisoLLT<MatrixType, UpLo_> > {
+ protected:
+  typedef PardisoImpl<PardisoLLT<MatrixType, UpLo_> > Base;
+  using Base::m_matrix;
+  using Base::pardisoInit;
+  friend class PardisoImpl<PardisoLLT<MatrixType, UpLo_> >;
+
+ public:
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::RealScalar RealScalar;
+  typedef typename Base::StorageIndex StorageIndex;
+  enum { UpLo = UpLo_ };
+  using Base::compute;
+
+  PardisoLLT() : Base() { pardisoInit(Base::ScalarIsComplex ? 4 : 2); }
+
+  explicit PardisoLLT(const MatrixType& matrix) : Base() {
+    pardisoInit(Base::ScalarIsComplex ? 4 : 2);
+    compute(matrix);
+  }
 
-    PardisoLLT(const MatrixType& matrix)
-      : Base()
-    {
-      pardisoInit(Base::ScalarIsComplex ? 4 : 2);
-      compute(matrix);
-    }
-    
-  protected:
-    
-    void getMatrix(const MatrixType& matrix)
-    {
-      // PARDISO supports only upper, row-major matrices
-      PermutationMatrix<Dynamic,Dynamic,Index> p_null;
-      m_matrix.resize(matrix.rows(), matrix.cols());
-      m_matrix.template selfadjointView<Upper>() = matrix.template selfadjointView<UpLo>().twistedBy(p_null);
-    }
-    
-  private:
-    PardisoLLT(PardisoLLT& ) {}
+ protected:
+  void getMatrix(const MatrixType& matrix) {
+    // PARDISO supports only upper, row-major matrices
+    PermutationMatrix<Dynamic, Dynamic, StorageIndex> p_null;
+    m_matrix.resize(matrix.rows(), matrix.cols());
+    m_matrix.template selfadjointView<Upper>() = matrix.template selfadjointView<UpLo>().twistedBy(p_null);
+    m_matrix.makeCompressed();
+  }
 };
 
 /** \ingroup PardisoSupport_Module
-  * \class PardisoLDLT
-  * \brief A sparse direct Cholesky (LDLT) factorization and solver based on the PARDISO library
-  *
-  * This class allows to solve for A.X = B sparse linear problems via a LDL^T Cholesky factorization
-  * using the Intel MKL PARDISO library. The sparse matrix A is assumed to be selfajoint and positive definite.
-  * For complex matrices, A can also be symmetric only, see the \a Options template parameter.
-  * The vectors or matrices X and B can be either dense or sparse.
-  *
-  * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam Options can be any bitwise combination of Upper, Lower, and Symmetric. The default is Upper, meaning only the upper triangular part has to be used.
-  *         Symmetric can be used for symmetric, non-selfadjoint complex matrices, the default being to assume a selfadjoint matrix.
-  *         Upper|Lower can be used to tell both triangular parts can be used as input.
-  *
-  * \sa \ref TutorialSparseDirectSolvers
-  */
-template<typename MatrixType, int Options>
-class PardisoLDLT : public PardisoImpl< PardisoLDLT<MatrixType,Options> >
-{
-  protected:
-    typedef PardisoImpl< PardisoLDLT<MatrixType,Options> > Base;
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::Index Index;
-    typedef typename Base::RealScalar RealScalar;
-    using Base::pardisoInit;
-    using Base::m_matrix;
-    friend class PardisoImpl< PardisoLDLT<MatrixType,Options> >;
-
-  public:
-
-    using Base::compute;
-    using Base::solve;
-    enum { UpLo = Options&(Upper|Lower) };
-
-    PardisoLDLT()
-      : Base()
-    {
-      pardisoInit(Base::ScalarIsComplex ? ( bool(Options&Symmetric) ? 6 : -4 ) : -2);
-    }
-
-    PardisoLDLT(const MatrixType& matrix)
-      : Base()
-    {
-      pardisoInit(Base::ScalarIsComplex ? ( bool(Options&Symmetric) ? 6 : -4 ) : -2);
-      compute(matrix);
-    }
-    
-    void getMatrix(const MatrixType& matrix)
-    {
-      // PARDISO supports only upper, row-major matrices
-      PermutationMatrix<Dynamic,Dynamic,Index> p_null;
-      m_matrix.resize(matrix.rows(), matrix.cols());
-      m_matrix.template selfadjointView<Upper>() = matrix.template selfadjointView<UpLo>().twistedBy(p_null);
-    }
-    
-  private:
-    PardisoLDLT(PardisoLDLT& ) {}
-};
-
-namespace internal {
-  
-template<typename _Derived, typename Rhs>
-struct solve_retval<PardisoImpl<_Derived>, Rhs>
-  : solve_retval_base<PardisoImpl<_Derived>, Rhs>
-{
-  typedef PardisoImpl<_Derived> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
+ * \class PardisoLDLT
+ * \brief A sparse direct Cholesky (LDLT) factorization and solver based on the PARDISO library
+ *
+ * This class allows to solve for A.X = B sparse linear problems via a LDL^T Cholesky factorization
+ * using the Intel MKL PARDISO library. The sparse matrix A is assumed to be selfajoint and positive definite.
+ * For complex matrices, A can also be symmetric only, see the \a Options template parameter.
+ * The vectors or matrices X and B can be either dense or sparse.
+ *
+ * By default, it runs in in-core mode. To enable PARDISO's out-of-core feature, set:
+ * \code solver.pardisoParameterArray()[59] = 1; \endcode
+ *
+ * \tparam MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam Options can be any bitwise combination of Upper, Lower, and Symmetric. The default is Upper, meaning only the
+ * upper triangular part has to be used. Symmetric can be used for symmetric, non-selfadjoint complex matrices, the
+ * default being to assume a selfadjoint matrix. Upper|Lower can be used to tell both triangular parts can be used as
+ * input.
+ *
+ * \implsparsesolverconcept
+ *
+ * \sa \ref TutorialSparseSolverConcept, class SimplicialLDLT
+ */
+template <typename MatrixType, int Options>
+class PardisoLDLT : public PardisoImpl<PardisoLDLT<MatrixType, Options> > {
+ protected:
+  typedef PardisoImpl<PardisoLDLT<MatrixType, Options> > Base;
+  using Base::m_matrix;
+  using Base::pardisoInit;
+  friend class PardisoImpl<PardisoLDLT<MatrixType, Options> >;
+
+ public:
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::RealScalar RealScalar;
+  typedef typename Base::StorageIndex StorageIndex;
+  using Base::compute;
+  enum { UpLo = Options & (Upper | Lower) };
+
+  PardisoLDLT() : Base() { pardisoInit(Base::ScalarIsComplex ? (bool(Options & Symmetric) ? 6 : -4) : -2); }
+
+  explicit PardisoLDLT(const MatrixType& matrix) : Base() {
+    pardisoInit(Base::ScalarIsComplex ? (bool(Options & Symmetric) ? 6 : -4) : -2);
+    compute(matrix);
   }
-};
-
-template<typename Derived, typename Rhs>
-struct sparse_solve_retval<PardisoImpl<Derived>, Rhs>
-  : sparse_solve_retval_base<PardisoImpl<Derived>, Rhs>
-{
-  typedef PardisoImpl<Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
 
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
+  void getMatrix(const MatrixType& matrix) {
+    // PARDISO supports only upper, row-major matrices
+    PermutationMatrix<Dynamic, Dynamic, StorageIndex> p_null;
+    m_matrix.resize(matrix.rows(), matrix.cols());
+    m_matrix.template selfadjointView<Upper>() = matrix.template selfadjointView<UpLo>().twistedBy(p_null);
+    m_matrix.makeCompressed();
   }
 };
 
-} // end namespace internal
-
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_PARDISOSUPPORT_H
+#endif  // EIGEN_PARDISOSUPPORT_H
diff --git a/inst/include/Eigen/src/QR/ColPivHouseholderQR.h b/inst/include/Eigen/src/QR/ColPivHouseholderQR.h
index 567eab7c..092c29d6 100644
--- a/inst/include/Eigen/src/QR/ColPivHouseholderQR.h
+++ b/inst/include/Eigen/src/QR/ColPivHouseholderQR.h
@@ -11,570 +11,664 @@
 #ifndef EIGEN_COLPIVOTINGHOUSEHOLDERQR_H
 #define EIGEN_COLPIVOTINGHOUSEHOLDERQR_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename MatrixType_, typename PermutationIndex_>
+struct traits<ColPivHouseholderQR<MatrixType_, PermutationIndex_>> : traits<MatrixType_> {
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef PermutationIndex_ PermutationIndex;
+  enum { Flags = 0 };
+};
+
+}  // end namespace internal
 
 /** \ingroup QR_Module
-  *
-  * \class ColPivHouseholderQR
-  *
-  * \brief Householder rank-revealing QR decomposition of a matrix with column-pivoting
-  *
-  * \param MatrixType the type of the matrix of which we are computing the QR decomposition
-  *
-  * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b Q and \b R
-  * such that 
-  * \f[
-  *  \mathbf{A} \, \mathbf{P} = \mathbf{Q} \, \mathbf{R}
-  * \f]
-  * by using Householder transformations. Here, \b P is a permutation matrix, \b Q a unitary matrix and \b R an 
-  * upper triangular matrix.
-  *
-  * This decomposition performs column pivoting in order to be rank-revealing and improve
-  * numerical stability. It is slower than HouseholderQR, and faster than FullPivHouseholderQR.
-  *
-  * \sa MatrixBase::colPivHouseholderQr()
-  */
-template<typename _MatrixType> class ColPivHouseholderQR
-{
-  public:
-
-    typedef _MatrixType MatrixType;
-    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
-      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
-    };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, Options, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixQType;
-    typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
-    typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
-    typedef typename internal::plain_row_type<MatrixType, Index>::type IntRowVectorType;
-    typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
-    typedef typename internal::plain_row_type<MatrixType, RealScalar>::type RealRowVectorType;
-    typedef HouseholderSequence<MatrixType,typename internal::remove_all<typename HCoeffsType::ConjugateReturnType>::type> HouseholderSequenceType;
-    
-  private:
-    
-    typedef typename PermutationType::Index PermIndexType;
-    
-  public:
-
-    /**
-    * \brief Default Constructor.
-    *
-    * The default constructor is useful in cases in which the user intends to
-    * perform decompositions via ColPivHouseholderQR::compute(const MatrixType&).
-    */
-    ColPivHouseholderQR()
+ *
+ * \class ColPivHouseholderQR
+ *
+ * \brief Householder rank-revealing QR decomposition of a matrix with column-pivoting
+ *
+ * \tparam MatrixType_ the type of the matrix of which we are computing the QR decomposition
+ *
+ * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b Q and \b R
+ * such that
+ * \f[
+ *  \mathbf{A} \, \mathbf{P} = \mathbf{Q} \, \mathbf{R}
+ * \f]
+ * by using Householder transformations. Here, \b P is a permutation matrix, \b Q a unitary matrix and \b R an
+ * upper triangular matrix.
+ *
+ * This decomposition performs column pivoting in order to be rank-revealing and improve
+ * numerical stability. It is slower than HouseholderQR, and faster than FullPivHouseholderQR.
+ *
+ * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+ *
+ * \sa MatrixBase::colPivHouseholderQr()
+ */
+template <typename MatrixType_, typename PermutationIndex_>
+class ColPivHouseholderQR : public SolverBase<ColPivHouseholderQR<MatrixType_, PermutationIndex_>> {
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef SolverBase<ColPivHouseholderQR> Base;
+  friend class SolverBase<ColPivHouseholderQR>;
+  typedef PermutationIndex_ PermutationIndex;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(ColPivHouseholderQR)
+
+  enum {
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+  typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
+  typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime, PermutationIndex> PermutationType;
+  typedef typename internal::plain_row_type<MatrixType, PermutationIndex>::type IntRowVectorType;
+  typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
+  typedef typename internal::plain_row_type<MatrixType, RealScalar>::type RealRowVectorType;
+  typedef HouseholderSequence<MatrixType, internal::remove_all_t<typename HCoeffsType::ConjugateReturnType>>
+      HouseholderSequenceType;
+  typedef typename MatrixType::PlainObject PlainObject;
+
+ private:
+  void init(Index rows, Index cols) {
+    Index diag = numext::mini(rows, cols);
+    m_hCoeffs.resize(diag);
+    m_colsPermutation.resize(cols);
+    m_colsTranspositions.resize(cols);
+    m_temp.resize(cols);
+    m_colNormsUpdated.resize(cols);
+    m_colNormsDirect.resize(cols);
+    m_isInitialized = false;
+    m_usePrescribedThreshold = false;
+  }
+
+ public:
+  /**
+   * \brief Default Constructor.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via ColPivHouseholderQR::compute(const MatrixType&).
+   */
+  ColPivHouseholderQR()
       : m_qr(),
         m_hCoeffs(),
         m_colsPermutation(),
         m_colsTranspositions(),
         m_temp(),
-        m_colSqNorms(),
+        m_colNormsUpdated(),
+        m_colNormsDirect(),
         m_isInitialized(false),
         m_usePrescribedThreshold(false) {}
 
-    /** \brief Default Constructor with memory preallocation
-      *
-      * Like the default constructor but with preallocation of the internal data
-      * according to the specified problem \a size.
-      * \sa ColPivHouseholderQR()
-      */
-    ColPivHouseholderQR(Index rows, Index cols)
-      : m_qr(rows, cols),
-        m_hCoeffs((std::min)(rows,cols)),
-        m_colsPermutation(PermIndexType(cols)),
-        m_colsTranspositions(cols),
-        m_temp(cols),
-        m_colSqNorms(cols),
-        m_isInitialized(false),
-        m_usePrescribedThreshold(false) {}
+  /** \brief Default Constructor with memory preallocation
+   *
+   * Like the default constructor but with preallocation of the internal data
+   * according to the specified problem \a size.
+   * \sa ColPivHouseholderQR()
+   */
+  ColPivHouseholderQR(Index rows, Index cols) : m_qr(rows, cols) { init(rows, cols); }
+
+  /** \brief Constructs a QR factorization from a given matrix
+   *
+   * This constructor computes the QR factorization of the matrix \a matrix by calling
+   * the method compute(). It is a short cut for:
+   *
+   * \code
+   * ColPivHouseholderQR<MatrixType> qr(matrix.rows(), matrix.cols());
+   * qr.compute(matrix);
+   * \endcode
+   *
+   * \sa compute()
+   */
+  template <typename InputType>
+  explicit ColPivHouseholderQR(const EigenBase<InputType>& matrix) : m_qr(matrix.rows(), matrix.cols()) {
+    init(matrix.rows(), matrix.cols());
+    compute(matrix.derived());
+  }
 
-    /** \brief Constructs a QR factorization from a given matrix
-      *
-      * This constructor computes the QR factorization of the matrix \a matrix by calling
-      * the method compute(). It is a short cut for:
-      * 
-      * \code
-      * ColPivHouseholderQR<MatrixType> qr(matrix.rows(), matrix.cols());
-      * qr.compute(matrix);
-      * \endcode
-      * 
-      * \sa compute()
-      */
-    ColPivHouseholderQR(const MatrixType& matrix)
-      : m_qr(matrix.rows(), matrix.cols()),
-        m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
-        m_colsPermutation(PermIndexType(matrix.cols())),
-        m_colsTranspositions(matrix.cols()),
-        m_temp(matrix.cols()),
-        m_colSqNorms(matrix.cols()),
-        m_isInitialized(false),
-        m_usePrescribedThreshold(false)
-    {
-      compute(matrix);
-    }
+  /** \brief Constructs a QR factorization from a given matrix
+   *
+   * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c
+   * MatrixType is a Eigen::Ref.
+   *
+   * \sa ColPivHouseholderQR(const EigenBase&)
+   */
+  template <typename InputType>
+  explicit ColPivHouseholderQR(EigenBase<InputType>& matrix) : m_qr(matrix.derived()) {
+    init(matrix.rows(), matrix.cols());
+    computeInPlace();
+  }
 
-    /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
-      * *this is the QR decomposition, if any exists.
-      *
-      * \param b the right-hand-side of the equation to solve.
-      *
-      * \returns a solution.
-      *
-      * \note The case where b is a matrix is not yet implemented. Also, this
-      *       code is space inefficient.
-      *
-      * \note_about_checking_solutions
-      *
-      * \note_about_arbitrary_choice_of_solution
-      *
-      * Example: \include ColPivHouseholderQR_solve.cpp
-      * Output: \verbinclude ColPivHouseholderQR_solve.out
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<ColPivHouseholderQR, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-      return internal::solve_retval<ColPivHouseholderQR, Rhs>(*this, b.derived());
-    }
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
+   * *this is the QR decomposition, if any exists.
+   *
+   * \param b the right-hand-side of the equation to solve.
+   *
+   * \returns a solution.
+   *
+   * \note_about_checking_solutions
+   *
+   * \note_about_arbitrary_choice_of_solution
+   *
+   * Example: \include ColPivHouseholderQR_solve.cpp
+   * Output: \verbinclude ColPivHouseholderQR_solve.out
+   */
+  template <typename Rhs>
+  inline const Solve<ColPivHouseholderQR, Rhs> solve(const MatrixBase<Rhs>& b) const;
+#endif
+
+  HouseholderSequenceType householderQ() const;
+  HouseholderSequenceType matrixQ() const { return householderQ(); }
+
+  /** \returns a reference to the matrix where the Householder QR decomposition is stored
+   */
+  const MatrixType& matrixQR() const {
+    eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+    return m_qr;
+  }
 
-    HouseholderSequenceType householderQ(void) const;
-    HouseholderSequenceType matrixQ(void) const
-    {
-      return householderQ(); 
-    }
+  /** \returns a reference to the matrix where the result Householder QR is stored
+   * \warning The strict lower part of this matrix contains internal values.
+   * Only the upper triangular part should be referenced. To get it, use
+   * \code matrixR().template triangularView<Upper>() \endcode
+   * For rank-deficient matrices, use
+   * \code
+   * matrixR().topLeftCorner(rank(), rank()).template triangularView<Upper>()
+   * \endcode
+   */
+  const MatrixType& matrixR() const {
+    eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+    return m_qr;
+  }
 
-    /** \returns a reference to the matrix where the Householder QR decomposition is stored
-      */
-    const MatrixType& matrixQR() const
-    {
-      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-      return m_qr;
-    }
-    
-    /** \returns a reference to the matrix where the result Householder QR is stored 
-     * \warning The strict lower part of this matrix contains internal values. 
-     * Only the upper triangular part should be referenced. To get it, use
-     * \code matrixR().template triangularView<Upper>() \endcode
-     * For rank-deficient matrices, use 
-     * \code 
-     * matrixR().topLeftCorner(rank(), rank()).template triangularView<Upper>() 
-     * \endcode
-     */
-    const MatrixType& matrixR() const
-    {
-      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-      return m_qr;
-    }
-    
-    ColPivHouseholderQR& compute(const MatrixType& matrix);
-
-    /** \returns a const reference to the column permutation matrix */
-    const PermutationType& colsPermutation() const
-    {
-      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-      return m_colsPermutation;
-    }
+  template <typename InputType>
+  ColPivHouseholderQR& compute(const EigenBase<InputType>& matrix);
 
-    /** \returns the absolute value of the determinant of the matrix of which
-      * *this is the QR decomposition. It has only linear complexity
-      * (that is, O(n) where n is the dimension of the square matrix)
-      * as the QR decomposition has already been computed.
-      *
-      * \note This is only for square matrices.
-      *
-      * \warning a determinant can be very big or small, so for matrices
-      * of large enough dimension, there is a risk of overflow/underflow.
-      * One way to work around that is to use logAbsDeterminant() instead.
-      *
-      * \sa logAbsDeterminant(), MatrixBase::determinant()
-      */
-    typename MatrixType::RealScalar absDeterminant() const;
-
-    /** \returns the natural log of the absolute value of the determinant of the matrix of which
-      * *this is the QR decomposition. It has only linear complexity
-      * (that is, O(n) where n is the dimension of the square matrix)
-      * as the QR decomposition has already been computed.
-      *
-      * \note This is only for square matrices.
-      *
-      * \note This method is useful to work around the risk of overflow/underflow that's inherent
-      * to determinant computation.
-      *
-      * \sa absDeterminant(), MatrixBase::determinant()
-      */
-    typename MatrixType::RealScalar logAbsDeterminant() const;
-
-    /** \returns the rank of the matrix of which *this is the QR decomposition.
-      *
-      * \note This method has to determine which pivots should be considered nonzero.
-      *       For that, it uses the threshold value that you can control by calling
-      *       setThreshold(const RealScalar&).
-      */
-    inline Index rank() const
-    {
-      using std::abs;
-      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-      RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold();
-      Index result = 0;
-      for(Index i = 0; i < m_nonzero_pivots; ++i)
-        result += (abs(m_qr.coeff(i,i)) > premultiplied_threshold);
-      return result;
-    }
+  /** \returns a const reference to the column permutation matrix */
+  const PermutationType& colsPermutation() const {
+    eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+    return m_colsPermutation;
+  }
 
-    /** \returns the dimension of the kernel of the matrix of which *this is the QR decomposition.
-      *
-      * \note This method has to determine which pivots should be considered nonzero.
-      *       For that, it uses the threshold value that you can control by calling
-      *       setThreshold(const RealScalar&).
-      */
-    inline Index dimensionOfKernel() const
-    {
-      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-      return cols() - rank();
-    }
+  /** \returns the determinant of the matrix of which
+   * *this is the QR decomposition. It has only linear complexity
+   * (that is, O(n) where n is the dimension of the square matrix)
+   * as the QR decomposition has already been computed.
+   *
+   * \note This is only for square matrices.
+   *
+   * \warning a determinant can be very big or small, so for matrices
+   * of large enough dimension, there is a risk of overflow/underflow.
+   * One way to work around that is to use logAbsDeterminant() instead.
+   *
+   * \sa absDeterminant(), logAbsDeterminant(), MatrixBase::determinant()
+   */
+  typename MatrixType::Scalar determinant() const;
+
+  /** \returns the absolute value of the determinant of the matrix of which
+   * *this is the QR decomposition. It has only linear complexity
+   * (that is, O(n) where n is the dimension of the square matrix)
+   * as the QR decomposition has already been computed.
+   *
+   * \note This is only for square matrices.
+   *
+   * \warning a determinant can be very big or small, so for matrices
+   * of large enough dimension, there is a risk of overflow/underflow.
+   * One way to work around that is to use logAbsDeterminant() instead.
+   *
+   * \sa determinant(), logAbsDeterminant(), MatrixBase::determinant()
+   */
+  typename MatrixType::RealScalar absDeterminant() const;
+
+  /** \returns the natural log of the absolute value of the determinant of the matrix of which
+   * *this is the QR decomposition. It has only linear complexity
+   * (that is, O(n) where n is the dimension of the square matrix)
+   * as the QR decomposition has already been computed.
+   *
+   * \note This is only for square matrices.
+   *
+   * \note This method is useful to work around the risk of overflow/underflow that's inherent
+   * to determinant computation.
+   *
+   * \sa determinant(), absDeterminant(), MatrixBase::determinant()
+   */
+  typename MatrixType::RealScalar logAbsDeterminant() const;
+
+  /** \returns the sign of the determinant of the matrix of which
+   * *this is the QR decomposition. It has only linear complexity
+   * (that is, O(n) where n is the dimension of the square matrix)
+   * as the QR decomposition has already been computed.
+   *
+   * \note This is only for square matrices.
+   *
+   * \note This method is useful to work around the risk of overflow/underflow that's inherent
+   * to determinant computation.
+   *
+   * \sa determinant(), absDeterminant(), logAbsDeterminant(), MatrixBase::determinant()
+   */
+  typename MatrixType::Scalar signDeterminant() const;
+
+  /** \returns the rank of the matrix of which *this is the QR decomposition.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline Index rank() const {
+    using std::abs;
+    eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+    RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold();
+    Index result = 0;
+    for (Index i = 0; i < m_nonzero_pivots; ++i) result += (abs(m_qr.coeff(i, i)) > premultiplied_threshold);
+    return result;
+  }
 
-    /** \returns true if the matrix of which *this is the QR decomposition represents an injective
-      *          linear map, i.e. has trivial kernel; false otherwise.
-      *
-      * \note This method has to determine which pivots should be considered nonzero.
-      *       For that, it uses the threshold value that you can control by calling
-      *       setThreshold(const RealScalar&).
-      */
-    inline bool isInjective() const
-    {
-      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-      return rank() == cols();
-    }
+  /** \returns the dimension of the kernel of the matrix of which *this is the QR decomposition.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline Index dimensionOfKernel() const {
+    eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+    return cols() - rank();
+  }
 
-    /** \returns true if the matrix of which *this is the QR decomposition represents a surjective
-      *          linear map; false otherwise.
-      *
-      * \note This method has to determine which pivots should be considered nonzero.
-      *       For that, it uses the threshold value that you can control by calling
-      *       setThreshold(const RealScalar&).
-      */
-    inline bool isSurjective() const
-    {
-      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-      return rank() == rows();
-    }
+  /** \returns true if the matrix of which *this is the QR decomposition represents an injective
+   *          linear map, i.e. has trivial kernel; false otherwise.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline bool isInjective() const {
+    eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+    return rank() == cols();
+  }
 
-    /** \returns true if the matrix of which *this is the QR decomposition is invertible.
-      *
-      * \note This method has to determine which pivots should be considered nonzero.
-      *       For that, it uses the threshold value that you can control by calling
-      *       setThreshold(const RealScalar&).
-      */
-    inline bool isInvertible() const
-    {
-      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-      return isInjective() && isSurjective();
-    }
+  /** \returns true if the matrix of which *this is the QR decomposition represents a surjective
+   *          linear map; false otherwise.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline bool isSurjective() const {
+    eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+    return rank() == rows();
+  }
 
-    /** \returns the inverse of the matrix of which *this is the QR decomposition.
-      *
-      * \note If this matrix is not invertible, the returned matrix has undefined coefficients.
-      *       Use isInvertible() to first determine whether this matrix is invertible.
-      */
-    inline const
-    internal::solve_retval<ColPivHouseholderQR, typename MatrixType::IdentityReturnType>
-    inverse() const
-    {
-      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-      return internal::solve_retval<ColPivHouseholderQR,typename MatrixType::IdentityReturnType>
-               (*this, MatrixType::Identity(m_qr.rows(), m_qr.cols()));
-    }
+  /** \returns true if the matrix of which *this is the QR decomposition is invertible.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline bool isInvertible() const {
+    eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+    return isInjective() && isSurjective();
+  }
 
-    inline Index rows() const { return m_qr.rows(); }
-    inline Index cols() const { return m_qr.cols(); }
-    
-    /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q.
-      * 
-      * For advanced uses only.
-      */
-    const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
-
-    /** Allows to prescribe a threshold to be used by certain methods, such as rank(),
-      * who need to determine when pivots are to be considered nonzero. This is not used for the
-      * QR decomposition itself.
-      *
-      * When it needs to get the threshold value, Eigen calls threshold(). By default, this
-      * uses a formula to automatically determine a reasonable threshold.
-      * Once you have called the present method setThreshold(const RealScalar&),
-      * your value is used instead.
-      *
-      * \param threshold The new value to use as the threshold.
-      *
-      * A pivot will be considered nonzero if its absolute value is strictly greater than
-      *  \f$ \vert pivot \vert \leqslant threshold \times \vert maxpivot \vert \f$
-      * where maxpivot is the biggest pivot.
-      *
-      * If you want to come back to the default behavior, call setThreshold(Default_t)
-      */
-    ColPivHouseholderQR& setThreshold(const RealScalar& threshold)
-    {
-      m_usePrescribedThreshold = true;
-      m_prescribedThreshold = threshold;
-      return *this;
-    }
+  /** \returns the inverse of the matrix of which *this is the QR decomposition.
+   *
+   * \note If this matrix is not invertible, the returned matrix has undefined coefficients.
+   *       Use isInvertible() to first determine whether this matrix is invertible.
+   */
+  inline const Inverse<ColPivHouseholderQR> inverse() const {
+    eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+    return Inverse<ColPivHouseholderQR>(*this);
+  }
 
-    /** Allows to come back to the default behavior, letting Eigen use its default formula for
-      * determining the threshold.
-      *
-      * You should pass the special object Eigen::Default as parameter here.
-      * \code qr.setThreshold(Eigen::Default); \endcode
-      *
-      * See the documentation of setThreshold(const RealScalar&).
-      */
-    ColPivHouseholderQR& setThreshold(Default_t)
-    {
-      m_usePrescribedThreshold = false;
-      return *this;
-    }
+  inline Index rows() const { return m_qr.rows(); }
+  inline Index cols() const { return m_qr.cols(); }
+
+  /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q.
+   *
+   * For advanced uses only.
+   */
+  const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
+
+  /** Allows to prescribe a threshold to be used by certain methods, such as rank(),
+   * who need to determine when pivots are to be considered nonzero. This is not used for the
+   * QR decomposition itself.
+   *
+   * When it needs to get the threshold value, Eigen calls threshold(). By default, this
+   * uses a formula to automatically determine a reasonable threshold.
+   * Once you have called the present method setThreshold(const RealScalar&),
+   * your value is used instead.
+   *
+   * \param threshold The new value to use as the threshold.
+   *
+   * A pivot will be considered nonzero if its absolute value is strictly greater than
+   *  \f$ \vert pivot \vert \leqslant threshold \times \vert maxpivot \vert \f$
+   * where maxpivot is the biggest pivot.
+   *
+   * If you want to come back to the default behavior, call setThreshold(Default_t)
+   */
+  ColPivHouseholderQR& setThreshold(const RealScalar& threshold) {
+    m_usePrescribedThreshold = true;
+    m_prescribedThreshold = threshold;
+    return *this;
+  }
 
-    /** Returns the threshold that will be used by certain methods such as rank().
-      *
-      * See the documentation of setThreshold(const RealScalar&).
-      */
-    RealScalar threshold() const
-    {
-      eigen_assert(m_isInitialized || m_usePrescribedThreshold);
-      return m_usePrescribedThreshold ? m_prescribedThreshold
-      // this formula comes from experimenting (see "LU precision tuning" thread on the list)
-      // and turns out to be identical to Higham's formula used already in LDLt.
-                                      : NumTraits<Scalar>::epsilon() * RealScalar(m_qr.diagonalSize());
-    }
+  /** Allows to come back to the default behavior, letting Eigen use its default formula for
+   * determining the threshold.
+   *
+   * You should pass the special object Eigen::Default as parameter here.
+   * \code qr.setThreshold(Eigen::Default); \endcode
+   *
+   * See the documentation of setThreshold(const RealScalar&).
+   */
+  ColPivHouseholderQR& setThreshold(Default_t) {
+    m_usePrescribedThreshold = false;
+    return *this;
+  }
 
-    /** \returns the number of nonzero pivots in the QR decomposition.
-      * Here nonzero is meant in the exact sense, not in a fuzzy sense.
-      * So that notion isn't really intrinsically interesting, but it is
-      * still useful when implementing algorithms.
-      *
-      * \sa rank()
-      */
-    inline Index nonzeroPivots() const
-    {
-      eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
-      return m_nonzero_pivots;
-    }
+  /** Returns the threshold that will be used by certain methods such as rank().
+   *
+   * See the documentation of setThreshold(const RealScalar&).
+   */
+  RealScalar threshold() const {
+    eigen_assert(m_isInitialized || m_usePrescribedThreshold);
+    return m_usePrescribedThreshold ? m_prescribedThreshold
+                                    // this formula comes from experimenting (see "LU precision tuning" thread on the
+                                    // list) and turns out to be identical to Higham's formula used already in LDLt.
+                                    : NumTraits<Scalar>::epsilon() * RealScalar(m_qr.diagonalSize());
+  }
 
-    /** \returns the absolute value of the biggest pivot, i.e. the biggest
-      *          diagonal coefficient of R.
-      */
-    RealScalar maxPivot() const { return m_maxpivot; }
-    
-    /** \brief Reports whether the QR factorization was succesful.
-      *
-      * \note This function always returns \c Success. It is provided for compatibility 
-      * with other factorization routines.
-      * \returns \c Success 
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
-      return Success;
-    }
+  /** \returns the number of nonzero pivots in the QR decomposition.
+   * Here nonzero is meant in the exact sense, not in a fuzzy sense.
+   * So that notion isn't really intrinsically interesting, but it is
+   * still useful when implementing algorithms.
+   *
+   * \sa rank()
+   */
+  inline Index nonzeroPivots() const {
+    eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+    return m_nonzero_pivots;
+  }
 
-  protected:
-    
-    static void check_template_parameters()
-    {
-      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
-    }
-    
-    MatrixType m_qr;
-    HCoeffsType m_hCoeffs;
-    PermutationType m_colsPermutation;
-    IntRowVectorType m_colsTranspositions;
-    RowVectorType m_temp;
-    RealRowVectorType m_colSqNorms;
-    bool m_isInitialized, m_usePrescribedThreshold;
-    RealScalar m_prescribedThreshold, m_maxpivot;
-    Index m_nonzero_pivots;
-    Index m_det_pq;
+  /** \returns the absolute value of the biggest pivot, i.e. the biggest
+   *          diagonal coefficient of R.
+   */
+  RealScalar maxPivot() const { return m_maxpivot; }
+
+  /** \brief Reports whether the QR factorization was successful.
+   *
+   * \note This function always returns \c Success. It is provided for compatibility
+   * with other factorization routines.
+   * \returns \c Success
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+    return Success;
+  }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  template <typename RhsType, typename DstType>
+  void _solve_impl(const RhsType& rhs, DstType& dst) const;
+
+  template <bool Conjugate, typename RhsType, typename DstType>
+  void _solve_impl_transposed(const RhsType& rhs, DstType& dst) const;
+#endif
+
+ protected:
+  friend class CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>;
+
+  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+
+  void computeInPlace();
+
+  MatrixType m_qr;
+  HCoeffsType m_hCoeffs;
+  PermutationType m_colsPermutation;
+  IntRowVectorType m_colsTranspositions;
+  RowVectorType m_temp;
+  RealRowVectorType m_colNormsUpdated;
+  RealRowVectorType m_colNormsDirect;
+  bool m_isInitialized, m_usePrescribedThreshold;
+  RealScalar m_prescribedThreshold, m_maxpivot;
+  Index m_nonzero_pivots;
+  Index m_det_p;
 };
 
-template<typename MatrixType>
-typename MatrixType::RealScalar ColPivHouseholderQR<MatrixType>::absDeterminant() const
-{
+template <typename MatrixType, typename PermutationIndex>
+typename MatrixType::Scalar ColPivHouseholderQR<MatrixType, PermutationIndex>::determinant() const {
+  eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
+  eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
+  Scalar detQ;
+  internal::householder_determinant<HCoeffsType, Scalar, NumTraits<Scalar>::IsComplex>::run(m_hCoeffs, detQ);
+  return isInjective() ? (detQ * Scalar(m_det_p)) * m_qr.diagonal().prod() : Scalar(0);
+}
+
+template <typename MatrixType, typename PermutationIndex>
+typename MatrixType::RealScalar ColPivHouseholderQR<MatrixType, PermutationIndex>::absDeterminant() const {
   using std::abs;
   eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
   eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
-  return abs(m_qr.diagonal().prod());
+  return isInjective() ? abs(m_qr.diagonal().prod()) : RealScalar(0);
+}
+
+template <typename MatrixType, typename PermutationIndex>
+typename MatrixType::RealScalar ColPivHouseholderQR<MatrixType, PermutationIndex>::logAbsDeterminant() const {
+  eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
+  eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
+  return isInjective() ? m_qr.diagonal().cwiseAbs().array().log().sum() : -NumTraits<RealScalar>::infinity();
 }
 
-template<typename MatrixType>
-typename MatrixType::RealScalar ColPivHouseholderQR<MatrixType>::logAbsDeterminant() const
-{
+template <typename MatrixType, typename PermutationIndex>
+typename MatrixType::Scalar ColPivHouseholderQR<MatrixType, PermutationIndex>::signDeterminant() const {
   eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
   eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
-  return m_qr.diagonal().cwiseAbs().array().log().sum();
+  Scalar detQ;
+  internal::householder_determinant<HCoeffsType, Scalar, NumTraits<Scalar>::IsComplex>::run(m_hCoeffs, detQ);
+  return isInjective() ? (detQ * Scalar(m_det_p)) * m_qr.diagonal().array().sign().prod() : Scalar(0);
 }
 
 /** Performs the QR factorization of the given matrix \a matrix. The result of
-  * the factorization is stored into \c *this, and a reference to \c *this
-  * is returned.
-  *
-  * \sa class ColPivHouseholderQR, ColPivHouseholderQR(const MatrixType&)
-  */
-template<typename MatrixType>
-ColPivHouseholderQR<MatrixType>& ColPivHouseholderQR<MatrixType>::compute(const MatrixType& matrix)
-{
-  check_template_parameters();
-  
+ * the factorization is stored into \c *this, and a reference to \c *this
+ * is returned.
+ *
+ * \sa class ColPivHouseholderQR, ColPivHouseholderQR(const MatrixType&)
+ */
+template <typename MatrixType, typename PermutationIndex>
+template <typename InputType>
+ColPivHouseholderQR<MatrixType, PermutationIndex>& ColPivHouseholderQR<MatrixType, PermutationIndex>::compute(
+    const EigenBase<InputType>& matrix) {
+  m_qr = matrix.derived();
+  computeInPlace();
+  return *this;
+}
+
+template <typename MatrixType, typename PermutationIndex>
+void ColPivHouseholderQR<MatrixType, PermutationIndex>::computeInPlace() {
+  eigen_assert(m_qr.cols() <= NumTraits<PermutationIndex>::highest());
+
   using std::abs;
-  Index rows = matrix.rows();
-  Index cols = matrix.cols();
-  Index size = matrix.diagonalSize();
-  
-  // the column permutation is stored as int indices, so just to be sure:
-  eigen_assert(cols<=NumTraits<int>::highest());
-
-  m_qr = matrix;
+
+  Index rows = m_qr.rows();
+  Index cols = m_qr.cols();
+  Index size = m_qr.diagonalSize();
+
   m_hCoeffs.resize(size);
 
   m_temp.resize(cols);
 
-  m_colsTranspositions.resize(matrix.cols());
+  m_colsTranspositions.resize(m_qr.cols());
   Index number_of_transpositions = 0;
 
-  m_colSqNorms.resize(cols);
-  for(Index k = 0; k < cols; ++k)
-    m_colSqNorms.coeffRef(k) = m_qr.col(k).squaredNorm();
+  m_colNormsUpdated.resize(cols);
+  m_colNormsDirect.resize(cols);
+  for (Index k = 0; k < cols; ++k) {
+    // colNormsDirect(k) caches the most recent directly computed norm of
+    // column k.
+    m_colNormsDirect.coeffRef(k) = m_qr.col(k).norm();
+    m_colNormsUpdated.coeffRef(k) = m_colNormsDirect.coeffRef(k);
+  }
 
-  RealScalar threshold_helper = m_colSqNorms.maxCoeff() * numext::abs2(NumTraits<Scalar>::epsilon()) / RealScalar(rows);
+  RealScalar threshold_helper =
+      numext::abs2<RealScalar>(m_colNormsUpdated.maxCoeff() * NumTraits<RealScalar>::epsilon()) / RealScalar(rows);
+  RealScalar norm_downdate_threshold = numext::sqrt(NumTraits<RealScalar>::epsilon());
 
-  m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case)
+  m_nonzero_pivots = size;  // the generic case is that in which all pivots are nonzero (invertible case)
   m_maxpivot = RealScalar(0);
 
-  for(Index k = 0; k < size; ++k)
-  {
-    // first, we look up in our table m_colSqNorms which column has the biggest squared norm
+  for (Index k = 0; k < size; ++k) {
+    // first, we look up in our table m_colNormsUpdated which column has the biggest norm
     Index biggest_col_index;
-    RealScalar biggest_col_sq_norm = m_colSqNorms.tail(cols-k).maxCoeff(&biggest_col_index);
+    RealScalar biggest_col_sq_norm = numext::abs2(m_colNormsUpdated.tail(cols - k).maxCoeff(&biggest_col_index));
     biggest_col_index += k;
 
-    // since our table m_colSqNorms accumulates imprecision at every step, we must now recompute
-    // the actual squared norm of the selected column.
-    // Note that not doing so does result in solve() sometimes returning inf/nan values
-    // when running the unit test with 1000 repetitions.
-    biggest_col_sq_norm = m_qr.col(biggest_col_index).tail(rows-k).squaredNorm();
-
-    // we store that back into our table: it can't hurt to correct our table.
-    m_colSqNorms.coeffRef(biggest_col_index) = biggest_col_sq_norm;
-
     // Track the number of meaningful pivots but do not stop the decomposition to make
     // sure that the initial matrix is properly reproduced. See bug 941.
-    if(m_nonzero_pivots==size && biggest_col_sq_norm < threshold_helper * RealScalar(rows-k))
-      m_nonzero_pivots = k;
+    if (m_nonzero_pivots == size && biggest_col_sq_norm < threshold_helper * RealScalar(rows - k)) m_nonzero_pivots = k;
 
     // apply the transposition to the columns
-    m_colsTranspositions.coeffRef(k) = biggest_col_index;
-    if(k != biggest_col_index) {
+    m_colsTranspositions.coeffRef(k) = static_cast<PermutationIndex>(biggest_col_index);
+    if (k != biggest_col_index) {
       m_qr.col(k).swap(m_qr.col(biggest_col_index));
-      std::swap(m_colSqNorms.coeffRef(k), m_colSqNorms.coeffRef(biggest_col_index));
+      std::swap(m_colNormsUpdated.coeffRef(k), m_colNormsUpdated.coeffRef(biggest_col_index));
+      std::swap(m_colNormsDirect.coeffRef(k), m_colNormsDirect.coeffRef(biggest_col_index));
       ++number_of_transpositions;
     }
 
     // generate the householder vector, store it below the diagonal
     RealScalar beta;
-    m_qr.col(k).tail(rows-k).makeHouseholderInPlace(m_hCoeffs.coeffRef(k), beta);
+    m_qr.col(k).tail(rows - k).makeHouseholderInPlace(m_hCoeffs.coeffRef(k), beta);
 
     // apply the householder transformation to the diagonal coefficient
-    m_qr.coeffRef(k,k) = beta;
+    m_qr.coeffRef(k, k) = beta;
 
     // remember the maximum absolute value of diagonal coefficients
-    if(abs(beta) > m_maxpivot) m_maxpivot = abs(beta);
+    if (abs(beta) > m_maxpivot) m_maxpivot = abs(beta);
 
     // apply the householder transformation
-    m_qr.bottomRightCorner(rows-k, cols-k-1)
-        .applyHouseholderOnTheLeft(m_qr.col(k).tail(rows-k-1), m_hCoeffs.coeffRef(k), &m_temp.coeffRef(k+1));
-
-    // update our table of squared norms of the columns
-    m_colSqNorms.tail(cols-k-1) -= m_qr.row(k).tail(cols-k-1).cwiseAbs2();
+    m_qr.bottomRightCorner(rows - k, cols - k - 1)
+        .applyHouseholderOnTheLeft(m_qr.col(k).tail(rows - k - 1), m_hCoeffs.coeffRef(k), &m_temp.coeffRef(k + 1));
+
+    // update our table of norms of the columns
+    for (Index j = k + 1; j < cols; ++j) {
+      // The following implements the stable norm downgrade step discussed in
+      // http://www.netlib.org/lapack/lawnspdf/lawn176.pdf
+      // and used in LAPACK routines xGEQPF and xGEQP3.
+      // See lines 278-297 in http://www.netlib.org/lapack/explore-html/dc/df4/sgeqpf_8f_source.html
+      if (!numext::is_exactly_zero(m_colNormsUpdated.coeffRef(j))) {
+        RealScalar temp = abs(m_qr.coeffRef(k, j)) / m_colNormsUpdated.coeffRef(j);
+        temp = (RealScalar(1) + temp) * (RealScalar(1) - temp);
+        temp = temp < RealScalar(0) ? RealScalar(0) : temp;
+        RealScalar temp2 =
+            temp * numext::abs2<RealScalar>(m_colNormsUpdated.coeffRef(j) / m_colNormsDirect.coeffRef(j));
+        if (temp2 <= norm_downdate_threshold) {
+          // The updated norm has become too inaccurate so re-compute the column
+          // norm directly.
+          m_colNormsDirect.coeffRef(j) = m_qr.col(j).tail(rows - k - 1).norm();
+          m_colNormsUpdated.coeffRef(j) = m_colNormsDirect.coeffRef(j);
+        } else {
+          m_colNormsUpdated.coeffRef(j) *= numext::sqrt(temp);
+        }
+      }
+    }
   }
 
-  m_colsPermutation.setIdentity(PermIndexType(cols));
-  for(PermIndexType k = 0; k < size/*m_nonzero_pivots*/; ++k)
-    m_colsPermutation.applyTranspositionOnTheRight(k, PermIndexType(m_colsTranspositions.coeff(k)));
+  m_colsPermutation.setIdentity(cols);
+  for (Index k = 0; k < size /*m_nonzero_pivots*/; ++k)
+    m_colsPermutation.applyTranspositionOnTheRight(k, static_cast<Index>(m_colsTranspositions.coeff(k)));
 
-  m_det_pq = (number_of_transpositions%2) ? -1 : 1;
+  m_det_p = (number_of_transpositions % 2) ? -1 : 1;
   m_isInitialized = true;
-
-  return *this;
 }
 
-namespace internal {
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template <typename MatrixType_, typename PermutationIndex_>
+template <typename RhsType, typename DstType>
+void ColPivHouseholderQR<MatrixType_, PermutationIndex_>::_solve_impl(const RhsType& rhs, DstType& dst) const {
+  const Index nonzero_pivots = nonzeroPivots();
+
+  if (nonzero_pivots == 0) {
+    dst.setZero();
+    return;
+  }
 
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<ColPivHouseholderQR<_MatrixType>, Rhs>
-  : solve_retval_base<ColPivHouseholderQR<_MatrixType>, Rhs>
-{
-  EIGEN_MAKE_SOLVE_HELPERS(ColPivHouseholderQR<_MatrixType>,Rhs)
+  typename RhsType::PlainObject c(rhs);
 
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    eigen_assert(rhs().rows() == dec().rows());
+  c.applyOnTheLeft(householderQ().setLength(nonzero_pivots).adjoint());
 
-    const Index cols = dec().cols(),
-				nonzero_pivots = dec().nonzeroPivots();
+  m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots)
+      .template triangularView<Upper>()
+      .solveInPlace(c.topRows(nonzero_pivots));
 
-    if(nonzero_pivots == 0)
-    {
-      dst.setZero();
-      return;
-    }
+  for (Index i = 0; i < nonzero_pivots; ++i) dst.row(m_colsPermutation.indices().coeff(i)) = c.row(i);
+  for (Index i = nonzero_pivots; i < cols(); ++i) dst.row(m_colsPermutation.indices().coeff(i)).setZero();
+}
+
+template <typename MatrixType_, typename PermutationIndex_>
+template <bool Conjugate, typename RhsType, typename DstType>
+void ColPivHouseholderQR<MatrixType_, PermutationIndex_>::_solve_impl_transposed(const RhsType& rhs,
+                                                                                 DstType& dst) const {
+  const Index nonzero_pivots = nonzeroPivots();
+
+  if (nonzero_pivots == 0) {
+    dst.setZero();
+    return;
+  }
+
+  typename RhsType::PlainObject c(m_colsPermutation.transpose() * rhs);
 
-    typename Rhs::PlainObject c(rhs());
+  m_qr.topLeftCorner(nonzero_pivots, nonzero_pivots)
+      .template triangularView<Upper>()
+      .transpose()
+      .template conjugateIf<Conjugate>()
+      .solveInPlace(c.topRows(nonzero_pivots));
 
-    // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
-    c.applyOnTheLeft(householderSequence(dec().matrixQR(), dec().hCoeffs())
-                     .setLength(dec().nonzeroPivots())
-		     .transpose()
-      );
+  dst.topRows(nonzero_pivots) = c.topRows(nonzero_pivots);
+  dst.bottomRows(rows() - nonzero_pivots).setZero();
 
-    dec().matrixR()
-       .topLeftCorner(nonzero_pivots, nonzero_pivots)
-       .template triangularView<Upper>()
-       .solveInPlace(c.topRows(nonzero_pivots));
+  dst.applyOnTheLeft(householderQ().setLength(nonzero_pivots).template conjugateIf<!Conjugate>());
+}
+#endif
+
+namespace internal {
 
-    for(Index i = 0; i < nonzero_pivots; ++i) dst.row(dec().colsPermutation().indices().coeff(i)) = c.row(i);
-    for(Index i = nonzero_pivots; i < cols; ++i) dst.row(dec().colsPermutation().indices().coeff(i)).setZero();
+template <typename DstXprType, typename MatrixType, typename PermutationIndex>
+struct Assignment<DstXprType, Inverse<ColPivHouseholderQR<MatrixType, PermutationIndex>>,
+                  internal::assign_op<typename DstXprType::Scalar,
+                                      typename ColPivHouseholderQR<MatrixType, PermutationIndex>::Scalar>,
+                  Dense2Dense> {
+  typedef ColPivHouseholderQR<MatrixType, PermutationIndex> QrType;
+  typedef Inverse<QrType> SrcXprType;
+  static void run(DstXprType& dst, const SrcXprType& src,
+                  const internal::assign_op<typename DstXprType::Scalar, typename QrType::Scalar>&) {
+    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
 /** \returns the matrix Q as a sequence of householder transformations.
-  * You can extract the meaningful part only by using:
-  * \code qr.householderQ().setLength(qr.nonzeroPivots()) \endcode*/
-template<typename MatrixType>
-typename ColPivHouseholderQR<MatrixType>::HouseholderSequenceType ColPivHouseholderQR<MatrixType>
-  ::householderQ() const
-{
+ * You can extract the meaningful part only by using:
+ * \code qr.householderQ().setLength(qr.nonzeroPivots()) \endcode*/
+template <typename MatrixType, typename PermutationIndex>
+typename ColPivHouseholderQR<MatrixType, PermutationIndex>::HouseholderSequenceType
+ColPivHouseholderQR<MatrixType, PermutationIndex>::householderQ() const {
   eigen_assert(m_isInitialized && "ColPivHouseholderQR is not initialized.");
   return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate());
 }
 
 /** \return the column-pivoting Householder QR decomposition of \c *this.
-  *
-  * \sa class ColPivHouseholderQR
-  */
-template<typename Derived>
-const ColPivHouseholderQR<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::colPivHouseholderQr() const
-{
-  return ColPivHouseholderQR<PlainObject>(eval());
+ *
+ * \sa class ColPivHouseholderQR
+ */
+template <typename Derived>
+template <typename PermutationIndexType>
+const ColPivHouseholderQR<typename MatrixBase<Derived>::PlainObject, PermutationIndexType>
+MatrixBase<Derived>::colPivHouseholderQr() const {
+  return ColPivHouseholderQR<PlainObject, PermutationIndexType>(eval());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_COLPIVOTINGHOUSEHOLDERQR_H
+#endif  // EIGEN_COLPIVOTINGHOUSEHOLDERQR_H
diff --git a/inst/include/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h b/inst/include/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
new file mode 100644
index 00000000..37ac55fa
--- /dev/null
+++ b/inst/include/Eigen/src/QR/ColPivHouseholderQR_LAPACKE.h
@@ -0,0 +1,161 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to LAPACKe
+ *    Householder QR decomposition of a matrix with column pivoting based on
+ *    LAPACKE_?geqp3 function.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H
+#define EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+#if defined(EIGEN_USE_LAPACKE)
+
+template <typename Scalar>
+inline lapack_int call_geqp3(int matrix_layout, lapack_int m, lapack_int n, Scalar* a, lapack_int lda, lapack_int* jpvt,
+                             Scalar* tau);
+template <>
+inline lapack_int call_geqp3(int matrix_layout, lapack_int m, lapack_int n, float* a, lapack_int lda, lapack_int* jpvt,
+                             float* tau) {
+  return LAPACKE_sgeqp3(matrix_layout, m, n, a, lda, jpvt, tau);
+}
+template <>
+inline lapack_int call_geqp3(int matrix_layout, lapack_int m, lapack_int n, double* a, lapack_int lda, lapack_int* jpvt,
+                             double* tau) {
+  return LAPACKE_dgeqp3(matrix_layout, m, n, a, lda, jpvt, tau);
+}
+template <>
+inline lapack_int call_geqp3(int matrix_layout, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                             lapack_int* jpvt, lapack_complex_float* tau) {
+  return LAPACKE_cgeqp3(matrix_layout, m, n, a, lda, jpvt, tau);
+}
+template <>
+inline lapack_int call_geqp3(int matrix_layout, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                             lapack_int* jpvt, lapack_complex_double* tau) {
+  return LAPACKE_zgeqp3(matrix_layout, m, n, a, lda, jpvt, tau);
+}
+
+template <typename MatrixType>
+struct ColPivHouseholderQR_LAPACKE_impl {
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename internal::lapacke_helpers::translate_type_imp<Scalar>::type LapackeType;
+  static constexpr int LapackeStorage = MatrixType::IsRowMajor ? (LAPACK_ROW_MAJOR) : (LAPACK_COL_MAJOR);
+
+  typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
+  typedef PermutationMatrix<Dynamic, Dynamic, lapack_int> PermutationType;
+
+  static void run(MatrixType& qr, HCoeffsType& hCoeffs, PermutationType& colsPermutation, Index& nonzero_pivots,
+                  RealScalar& maxpivot, bool usePrescribedThreshold, RealScalar prescribedThreshold, Index& det_p,
+                  bool& isInitialized) {
+    isInitialized = false;
+    hCoeffs.resize(qr.diagonalSize());
+    nonzero_pivots = 0;
+    maxpivot = RealScalar(0);
+    colsPermutation.resize(qr.cols());
+    colsPermutation.indices().setZero();
+
+    lapack_int rows = internal::lapacke_helpers::to_lapack(qr.rows());
+    lapack_int cols = internal::lapacke_helpers::to_lapack(qr.cols());
+    LapackeType* qr_data = (LapackeType*)(qr.data());
+    lapack_int lda = internal::lapacke_helpers::to_lapack(qr.outerStride());
+    lapack_int* perm_data = colsPermutation.indices().data();
+    LapackeType* hCoeffs_data = (LapackeType*)(hCoeffs.data());
+
+    lapack_int info = call_geqp3(LapackeStorage, rows, cols, qr_data, lda, perm_data, hCoeffs_data);
+    if (info != 0) return;
+
+    maxpivot = qr.diagonal().cwiseAbs().maxCoeff();
+    hCoeffs.adjointInPlace();
+    RealScalar defaultThreshold = NumTraits<RealScalar>::epsilon() * RealScalar(qr.diagonalSize());
+    RealScalar threshold = usePrescribedThreshold ? prescribedThreshold : defaultThreshold;
+    RealScalar premultiplied_threshold = maxpivot * threshold;
+    nonzero_pivots = (qr.diagonal().cwiseAbs().array() > premultiplied_threshold).count();
+    colsPermutation.indices().array() -= 1;
+    det_p = colsPermutation.determinant();
+    isInitialized = true;
+  };
+
+  static void init(Index rows, Index cols, HCoeffsType& hCoeffs, PermutationType& colsPermutation,
+                   bool& usePrescribedThreshold, bool& isInitialized) {
+    Index diag = numext::mini(rows, cols);
+    hCoeffs.resize(diag);
+    colsPermutation.resize(cols);
+    usePrescribedThreshold = false;
+    isInitialized = false;
+  }
+};
+
+#define COLPIVQR_LAPACKE_COMPUTEINPLACE(EIGTYPE)                                                                   \
+  template <>                                                                                                      \
+  inline void ColPivHouseholderQR<EIGTYPE, lapack_int>::computeInPlace() {                                         \
+    ColPivHouseholderQR_LAPACKE_impl<MatrixType>::run(m_qr, m_hCoeffs, m_colsPermutation, m_nonzero_pivots,        \
+                                                      m_maxpivot, m_usePrescribedThreshold, m_prescribedThreshold, \
+                                                      m_det_p, m_isInitialized);                                   \
+  }
+
+#define COLPIVQR_LAPACKE_INIT(EIGTYPE)                                                                            \
+  template <>                                                                                                     \
+  inline void ColPivHouseholderQR<EIGTYPE, lapack_int>::init(Index rows, Index cols) {                            \
+    ColPivHouseholderQR_LAPACKE_impl<MatrixType>::init(rows, cols, m_hCoeffs, m_colsPermutation, m_isInitialized, \
+                                                       m_usePrescribedThreshold);                                 \
+  }
+
+#define COLPIVQR_LAPACKE(EIGTYPE)               \
+  COLPIVQR_LAPACKE_COMPUTEINPLACE(EIGTYPE)      \
+  COLPIVQR_LAPACKE_INIT(EIGTYPE)                \
+  COLPIVQR_LAPACKE_COMPUTEINPLACE(Ref<EIGTYPE>) \
+  COLPIVQR_LAPACKE_INIT(Ref<EIGTYPE>)
+
+typedef Matrix<float, Dynamic, Dynamic, ColMajor> MatrixXfC;
+typedef Matrix<double, Dynamic, Dynamic, ColMajor> MatrixXdC;
+typedef Matrix<std::complex<float>, Dynamic, Dynamic, ColMajor> MatrixXcfC;
+typedef Matrix<std::complex<double>, Dynamic, Dynamic, ColMajor> MatrixXcdC;
+typedef Matrix<float, Dynamic, Dynamic, RowMajor> MatrixXfR;
+typedef Matrix<double, Dynamic, Dynamic, RowMajor> MatrixXdR;
+typedef Matrix<std::complex<float>, Dynamic, Dynamic, RowMajor> MatrixXcfR;
+typedef Matrix<std::complex<double>, Dynamic, Dynamic, RowMajor> MatrixXcdR;
+
+COLPIVQR_LAPACKE(MatrixXfC)
+COLPIVQR_LAPACKE(MatrixXdC)
+COLPIVQR_LAPACKE(MatrixXcfC)
+COLPIVQR_LAPACKE(MatrixXcdC)
+COLPIVQR_LAPACKE(MatrixXfR)
+COLPIVQR_LAPACKE(MatrixXdR)
+COLPIVQR_LAPACKE(MatrixXcfR)
+COLPIVQR_LAPACKE(MatrixXcdR)
+
+#endif
+}  // end namespace Eigen
+
+#endif  // EIGEN_COLPIVOTINGHOUSEHOLDERQR_LAPACKE_H
diff --git a/inst/include/Eigen/src/QR/ColPivHouseholderQR_MKL.h b/inst/include/Eigen/src/QR/ColPivHouseholderQR_MKL.h
deleted file mode 100644
index b5b19832..00000000
--- a/inst/include/Eigen/src/QR/ColPivHouseholderQR_MKL.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- Copyright (c) 2011, Intel Corporation. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
- * Neither the name of Intel Corporation nor the names of its contributors may
-   be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
- *    Householder QR decomposition of a matrix with column pivoting based on
- *    LAPACKE_?geqp3 function.
- ********************************************************************************
-*/
-
-#ifndef EIGEN_COLPIVOTINGHOUSEHOLDERQR_MKL_H
-#define EIGEN_COLPIVOTINGHOUSEHOLDERQR_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
-
-namespace Eigen { 
-
-/** \internal Specialization for the data types supported by MKL */
-
-#define EIGEN_MKL_QR_COLPIV(EIGTYPE, MKLTYPE, MKLPREFIX, EIGCOLROW, MKLCOLROW) \
-template<> inline \
-ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> >& \
-ColPivHouseholderQR<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> >::compute( \
-              const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>& matrix) \
-\
-{ \
-  using std::abs; \
-  typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> MatrixType; \
-  typedef MatrixType::Scalar Scalar; \
-  typedef MatrixType::RealScalar RealScalar; \
-  Index rows = matrix.rows();\
-  Index cols = matrix.cols();\
-  Index size = matrix.diagonalSize();\
-\
-  m_qr = matrix;\
-  m_hCoeffs.resize(size);\
-\
-  m_colsTranspositions.resize(cols);\
-  /*Index number_of_transpositions = 0;*/ \
-\
-  m_nonzero_pivots = 0; \
-  m_maxpivot = RealScalar(0);\
-  m_colsPermutation.resize(cols); \
-  m_colsPermutation.indices().setZero(); \
-\
-  lapack_int lda = m_qr.outerStride(), i; \
-  lapack_int matrix_order = MKLCOLROW; \
-  LAPACKE_##MKLPREFIX##geqp3( matrix_order, rows, cols, (MKLTYPE*)m_qr.data(), lda, (lapack_int*)m_colsPermutation.indices().data(), (MKLTYPE*)m_hCoeffs.data()); \
-  m_isInitialized = true; \
-  m_maxpivot=m_qr.diagonal().cwiseAbs().maxCoeff(); \
-  m_hCoeffs.adjointInPlace(); \
-  RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold(); \
-  lapack_int *perm = m_colsPermutation.indices().data(); \
-  for(i=0;i<size;i++) { \
-    m_nonzero_pivots += (abs(m_qr.coeff(i,i)) > premultiplied_threshold);\
-  } \
-  for(i=0;i<cols;i++) perm[i]--;\
-\
-  /*m_det_pq = (number_of_transpositions%2) ? -1 : 1;  // TODO: It's not needed now; fix upon availability in Eigen */ \
-\
-  return *this; \
-}
-
-EIGEN_MKL_QR_COLPIV(double,   double,        d, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_QR_COLPIV(float,    float,         s, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_QR_COLPIV(dcomplex, MKL_Complex16, z, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_QR_COLPIV(scomplex, MKL_Complex8,  c, ColMajor, LAPACK_COL_MAJOR)
-
-EIGEN_MKL_QR_COLPIV(double,   double,        d, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_QR_COLPIV(float,    float,         s, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_QR_COLPIV(dcomplex, MKL_Complex16, z, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_QR_COLPIV(scomplex, MKL_Complex8,  c, RowMajor, LAPACK_ROW_MAJOR)
-
-} // end namespace Eigen
-
-#endif // EIGEN_COLPIVOTINGHOUSEHOLDERQR_MKL_H
diff --git a/inst/include/Eigen/src/QR/CompleteOrthogonalDecomposition.h b/inst/include/Eigen/src/QR/CompleteOrthogonalDecomposition.h
new file mode 100644
index 00000000..960ccb1e
--- /dev/null
+++ b/inst/include/Eigen/src/QR/CompleteOrthogonalDecomposition.h
@@ -0,0 +1,648 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COMPLETEORTHOGONALDECOMPOSITION_H
+#define EIGEN_COMPLETEORTHOGONALDECOMPOSITION_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename MatrixType_, typename PermutationIndex_>
+struct traits<CompleteOrthogonalDecomposition<MatrixType_, PermutationIndex_>> : traits<MatrixType_> {
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef PermutationIndex_ PermutationIndex;
+  enum { Flags = 0 };
+};
+
+}  // end namespace internal
+
+/** \ingroup QR_Module
+ *
+ * \class CompleteOrthogonalDecomposition
+ *
+ * \brief Complete orthogonal decomposition (COD) of a matrix.
+ *
+ * \tparam MatrixType_ the type of the matrix of which we are computing the COD.
+ *
+ * This class performs a rank-revealing complete orthogonal decomposition of a
+ * matrix  \b A into matrices \b P, \b Q, \b T, and \b Z such that
+ * \f[
+ *  \mathbf{A} \, \mathbf{P} = \mathbf{Q} \,
+ *                     \begin{bmatrix} \mathbf{T} &  \mathbf{0} \\
+ *                                     \mathbf{0} & \mathbf{0} \end{bmatrix} \, \mathbf{Z}
+ * \f]
+ * by using Householder transformations. Here, \b P is a permutation matrix,
+ * \b Q and \b Z are unitary matrices and \b T an upper triangular matrix of
+ * size rank-by-rank. \b A may be rank deficient.
+ *
+ * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+ *
+ * \sa MatrixBase::completeOrthogonalDecomposition()
+ */
+template <typename MatrixType_, typename PermutationIndex_>
+class CompleteOrthogonalDecomposition
+    : public SolverBase<CompleteOrthogonalDecomposition<MatrixType_, PermutationIndex_>> {
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef SolverBase<CompleteOrthogonalDecomposition> Base;
+
+  template <typename Derived>
+  friend struct internal::solve_assertion;
+  typedef PermutationIndex_ PermutationIndex;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(CompleteOrthogonalDecomposition)
+  enum {
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+  typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
+  typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime, PermutationIndex> PermutationType;
+  typedef typename internal::plain_row_type<MatrixType, Index>::type IntRowVectorType;
+  typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
+  typedef typename internal::plain_row_type<MatrixType, RealScalar>::type RealRowVectorType;
+  typedef HouseholderSequence<MatrixType, internal::remove_all_t<typename HCoeffsType::ConjugateReturnType>>
+      HouseholderSequenceType;
+  typedef typename MatrixType::PlainObject PlainObject;
+
+ public:
+  /**
+   * \brief Default Constructor.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via
+   * \c CompleteOrthogonalDecomposition::compute(const* MatrixType&).
+   */
+  CompleteOrthogonalDecomposition() : m_cpqr(), m_zCoeffs(), m_temp() {}
+
+  /** \brief Default Constructor with memory preallocation
+   *
+   * Like the default constructor but with preallocation of the internal data
+   * according to the specified problem \a size.
+   * \sa CompleteOrthogonalDecomposition()
+   */
+  CompleteOrthogonalDecomposition(Index rows, Index cols)
+      : m_cpqr(rows, cols), m_zCoeffs((std::min)(rows, cols)), m_temp(cols) {}
+
+  /** \brief Constructs a complete orthogonal decomposition from a given
+   * matrix.
+   *
+   * This constructor computes the complete orthogonal decomposition of the
+   * matrix \a matrix by calling the method compute(). The default
+   * threshold for rank determination will be used. It is a short cut for:
+   *
+   * \code
+   * CompleteOrthogonalDecomposition<MatrixType> cod(matrix.rows(),
+   *                                                 matrix.cols());
+   * cod.setThreshold(Default);
+   * cod.compute(matrix);
+   * \endcode
+   *
+   * \sa compute()
+   */
+  template <typename InputType>
+  explicit CompleteOrthogonalDecomposition(const EigenBase<InputType>& matrix)
+      : m_cpqr(matrix.rows(), matrix.cols()),
+        m_zCoeffs((std::min)(matrix.rows(), matrix.cols())),
+        m_temp(matrix.cols()) {
+    compute(matrix.derived());
+  }
+
+  /** \brief Constructs a complete orthogonal decomposition from a given matrix
+   *
+   * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c
+   * MatrixType is a Eigen::Ref.
+   *
+   * \sa CompleteOrthogonalDecomposition(const EigenBase&)
+   */
+  template <typename InputType>
+  explicit CompleteOrthogonalDecomposition(EigenBase<InputType>& matrix)
+      : m_cpqr(matrix.derived()), m_zCoeffs((std::min)(matrix.rows(), matrix.cols())), m_temp(matrix.cols()) {
+    computeInPlace();
+  }
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  /** This method computes the minimum-norm solution X to a least squares
+   * problem \f[\mathrm{minimize} \|A X - B\|, \f] where \b A is the matrix of
+   * which \c *this is the complete orthogonal decomposition.
+   *
+   * \param b the right-hand sides of the problem to solve.
+   *
+   * \returns a solution.
+   *
+   */
+  template <typename Rhs>
+  inline const Solve<CompleteOrthogonalDecomposition, Rhs> solve(const MatrixBase<Rhs>& b) const;
+#endif
+
+  HouseholderSequenceType householderQ(void) const;
+  HouseholderSequenceType matrixQ(void) const { return m_cpqr.householderQ(); }
+
+  /** \returns the matrix \b Z.
+   */
+  MatrixType matrixZ() const {
+    MatrixType Z = MatrixType::Identity(m_cpqr.cols(), m_cpqr.cols());
+    applyZOnTheLeftInPlace<false>(Z);
+    return Z;
+  }
+
+  /** \returns a reference to the matrix where the complete orthogonal
+   * decomposition is stored
+   */
+  const MatrixType& matrixQTZ() const { return m_cpqr.matrixQR(); }
+
+  /** \returns a reference to the matrix where the complete orthogonal
+   * decomposition is stored.
+   * \warning The strict lower part and \code cols() - rank() \endcode right
+   * columns of this matrix contains internal values.
+   * Only the upper triangular part should be referenced. To get it, use
+   * \code matrixT().template triangularView<Upper>() \endcode
+   * For rank-deficient matrices, use
+   * \code
+   * matrixT().topLeftCorner(rank(), rank()).template triangularView<Upper>()
+   * \endcode
+   */
+  const MatrixType& matrixT() const { return m_cpqr.matrixQR(); }
+
+  template <typename InputType>
+  CompleteOrthogonalDecomposition& compute(const EigenBase<InputType>& matrix) {
+    // Compute the column pivoted QR factorization A P = Q R.
+    m_cpqr.compute(matrix);
+    computeInPlace();
+    return *this;
+  }
+
+  /** \returns a const reference to the column permutation matrix */
+  const PermutationType& colsPermutation() const { return m_cpqr.colsPermutation(); }
+
+  /** \returns the determinant of the matrix of which
+   * *this is the complete orthogonal decomposition. It has only linear
+   * complexity (that is, O(n) where n is the dimension of the square matrix)
+   * as the complete orthogonal decomposition has already been computed.
+   *
+   * \note This is only for square matrices.
+   *
+   * \warning a determinant can be very big or small, so for matrices
+   * of large enough dimension, there is a risk of overflow/underflow.
+   * One way to work around that is to use logAbsDeterminant() instead.
+   *
+   * \sa absDeterminant(), logAbsDeterminant(), MatrixBase::determinant()
+   */
+  typename MatrixType::Scalar determinant() const;
+
+  /** \returns the absolute value of the determinant of the matrix of which
+   * *this is the complete orthogonal decomposition. It has only linear
+   * complexity (that is, O(n) where n is the dimension of the square matrix)
+   * as the complete orthogonal decomposition has already been computed.
+   *
+   * \note This is only for square matrices.
+   *
+   * \warning a determinant can be very big or small, so for matrices
+   * of large enough dimension, there is a risk of overflow/underflow.
+   * One way to work around that is to use logAbsDeterminant() instead.
+   *
+   * \sa determinant(), logAbsDeterminant(), MatrixBase::determinant()
+   */
+  typename MatrixType::RealScalar absDeterminant() const;
+
+  /** \returns the natural log of the absolute value of the determinant of the
+   * matrix of which *this is the complete orthogonal decomposition. It has
+   * only linear complexity (that is, O(n) where n is the dimension of the
+   * square matrix) as the complete orthogonal decomposition has already been
+   * computed.
+   *
+   * \note This is only for square matrices.
+   *
+   * \note This method is useful to work around the risk of overflow/underflow
+   * that's inherent to determinant computation.
+   *
+   * \sa determinant(), absDeterminant(), MatrixBase::determinant()
+   */
+  typename MatrixType::RealScalar logAbsDeterminant() const;
+
+  /** \returns the sign of the determinant of the
+   * matrix of which *this is the complete orthogonal decomposition. It has
+   * only linear complexity (that is, O(n) where n is the dimension of the
+   * square matrix) as the complete orthogonal decomposition has already been
+   * computed.
+   *
+   * \note This is only for square matrices.
+   *
+   * \note This method is useful to work around the risk of overflow/underflow
+   * that's inherent to determinant computation.
+   *
+   * \sa determinant(), absDeterminant(), logAbsDeterminant(), MatrixBase::determinant()
+   */
+  typename MatrixType::Scalar signDeterminant() const;
+
+  /** \returns the rank of the matrix of which *this is the complete orthogonal
+   * decomposition.
+   *
+   * \note This method has to determine which pivots should be considered
+   * nonzero. For that, it uses the threshold value that you can control by
+   * calling setThreshold(const RealScalar&).
+   */
+  inline Index rank() const { return m_cpqr.rank(); }
+
+  /** \returns the dimension of the kernel of the matrix of which *this is the
+   * complete orthogonal decomposition.
+   *
+   * \note This method has to determine which pivots should be considered
+   * nonzero. For that, it uses the threshold value that you can control by
+   * calling setThreshold(const RealScalar&).
+   */
+  inline Index dimensionOfKernel() const { return m_cpqr.dimensionOfKernel(); }
+
+  /** \returns true if the matrix of which *this is the decomposition represents
+   * an injective linear map, i.e. has trivial kernel; false otherwise.
+   *
+   * \note This method has to determine which pivots should be considered
+   * nonzero. For that, it uses the threshold value that you can control by
+   * calling setThreshold(const RealScalar&).
+   */
+  inline bool isInjective() const { return m_cpqr.isInjective(); }
+
+  /** \returns true if the matrix of which *this is the decomposition represents
+   * a surjective linear map; false otherwise.
+   *
+   * \note This method has to determine which pivots should be considered
+   * nonzero. For that, it uses the threshold value that you can control by
+   * calling setThreshold(const RealScalar&).
+   */
+  inline bool isSurjective() const { return m_cpqr.isSurjective(); }
+
+  /** \returns true if the matrix of which *this is the complete orthogonal
+   * decomposition is invertible.
+   *
+   * \note This method has to determine which pivots should be considered
+   * nonzero. For that, it uses the threshold value that you can control by
+   * calling setThreshold(const RealScalar&).
+   */
+  inline bool isInvertible() const { return m_cpqr.isInvertible(); }
+
+  /** \returns the pseudo-inverse of the matrix of which *this is the complete
+   * orthogonal decomposition.
+   * \warning: Do not compute \c this->pseudoInverse()*rhs to solve a linear systems.
+   * It is more efficient and numerically stable to call \c this->solve(rhs).
+   */
+  inline const Inverse<CompleteOrthogonalDecomposition> pseudoInverse() const {
+    eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized.");
+    return Inverse<CompleteOrthogonalDecomposition>(*this);
+  }
+
+  inline Index rows() const { return m_cpqr.rows(); }
+  inline Index cols() const { return m_cpqr.cols(); }
+
+  /** \returns a const reference to the vector of Householder coefficients used
+   * to represent the factor \c Q.
+   *
+   * For advanced uses only.
+   */
+  inline const HCoeffsType& hCoeffs() const { return m_cpqr.hCoeffs(); }
+
+  /** \returns a const reference to the vector of Householder coefficients
+   * used to represent the factor \c Z.
+   *
+   * For advanced uses only.
+   */
+  const HCoeffsType& zCoeffs() const { return m_zCoeffs; }
+
+  /** Allows to prescribe a threshold to be used by certain methods, such as
+   * rank(), who need to determine when pivots are to be considered nonzero.
+   * Most be called before calling compute().
+   *
+   * When it needs to get the threshold value, Eigen calls threshold(). By
+   * default, this uses a formula to automatically determine a reasonable
+   * threshold. Once you have called the present method
+   * setThreshold(const RealScalar&), your value is used instead.
+   *
+   * \param threshold The new value to use as the threshold.
+   *
+   * A pivot will be considered nonzero if its absolute value is strictly
+   * greater than
+   *  \f$ \vert pivot \vert \leqslant threshold \times \vert maxpivot \vert \f$
+   * where maxpivot is the biggest pivot.
+   *
+   * If you want to come back to the default behavior, call
+   * setThreshold(Default_t)
+   */
+  CompleteOrthogonalDecomposition& setThreshold(const RealScalar& threshold) {
+    m_cpqr.setThreshold(threshold);
+    return *this;
+  }
+
+  /** Allows to come back to the default behavior, letting Eigen use its default
+   * formula for determining the threshold.
+   *
+   * You should pass the special object Eigen::Default as parameter here.
+   * \code qr.setThreshold(Eigen::Default); \endcode
+   *
+   * See the documentation of setThreshold(const RealScalar&).
+   */
+  CompleteOrthogonalDecomposition& setThreshold(Default_t) {
+    m_cpqr.setThreshold(Default);
+    return *this;
+  }
+
+  /** Returns the threshold that will be used by certain methods such as rank().
+   *
+   * See the documentation of setThreshold(const RealScalar&).
+   */
+  RealScalar threshold() const { return m_cpqr.threshold(); }
+
+  /** \returns the number of nonzero pivots in the complete orthogonal
+   * decomposition. Here nonzero is meant in the exact sense, not in a
+   * fuzzy sense. So that notion isn't really intrinsically interesting,
+   * but it is still useful when implementing algorithms.
+   *
+   * \sa rank()
+   */
+  inline Index nonzeroPivots() const { return m_cpqr.nonzeroPivots(); }
+
+  /** \returns the absolute value of the biggest pivot, i.e. the biggest
+   *          diagonal coefficient of R.
+   */
+  inline RealScalar maxPivot() const { return m_cpqr.maxPivot(); }
+
+  /** \brief Reports whether the complete orthogonal decomposition was
+   * successful.
+   *
+   * \note This function always returns \c Success. It is provided for
+   * compatibility
+   * with other factorization routines.
+   * \returns \c Success
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_cpqr.m_isInitialized && "Decomposition is not initialized.");
+    return Success;
+  }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  template <typename RhsType, typename DstType>
+  void _solve_impl(const RhsType& rhs, DstType& dst) const;
+
+  template <bool Conjugate, typename RhsType, typename DstType>
+  void _solve_impl_transposed(const RhsType& rhs, DstType& dst) const;
+#endif
+
+ protected:
+  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+
+  template <bool Transpose_, typename Rhs>
+  void _check_solve_assertion(const Rhs& b) const {
+    EIGEN_ONLY_USED_FOR_DEBUG(b);
+    eigen_assert(m_cpqr.m_isInitialized && "CompleteOrthogonalDecomposition is not initialized.");
+    eigen_assert((Transpose_ ? derived().cols() : derived().rows()) == b.rows() &&
+                 "CompleteOrthogonalDecomposition::solve(): invalid number of rows of the right hand side matrix b");
+  }
+
+  void computeInPlace();
+
+  /** Overwrites \b rhs with \f$ \mathbf{Z} * \mathbf{rhs} \f$ or
+   *  \f$ \mathbf{\overline Z} * \mathbf{rhs} \f$ if \c Conjugate
+   *  is set to \c true.
+   */
+  template <bool Conjugate, typename Rhs>
+  void applyZOnTheLeftInPlace(Rhs& rhs) const;
+
+  /** Overwrites \b rhs with \f$ \mathbf{Z}^* * \mathbf{rhs} \f$.
+   */
+  template <typename Rhs>
+  void applyZAdjointOnTheLeftInPlace(Rhs& rhs) const;
+
+  ColPivHouseholderQR<MatrixType, PermutationIndex> m_cpqr;
+  HCoeffsType m_zCoeffs;
+  RowVectorType m_temp;
+};
+
+template <typename MatrixType, typename PermutationIndex>
+typename MatrixType::Scalar CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::determinant() const {
+  return m_cpqr.determinant();
+}
+
+template <typename MatrixType, typename PermutationIndex>
+typename MatrixType::RealScalar CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::absDeterminant() const {
+  return m_cpqr.absDeterminant();
+}
+
+template <typename MatrixType, typename PermutationIndex>
+typename MatrixType::RealScalar CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::logAbsDeterminant()
+    const {
+  return m_cpqr.logAbsDeterminant();
+}
+
+template <typename MatrixType, typename PermutationIndex>
+typename MatrixType::Scalar CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::signDeterminant() const {
+  return m_cpqr.signDeterminant();
+}
+
+/** Performs the complete orthogonal decomposition of the given matrix \a
+ * matrix. The result of the factorization is stored into \c *this, and a
+ * reference to \c *this is returned.
+ *
+ * \sa class CompleteOrthogonalDecomposition,
+ * CompleteOrthogonalDecomposition(const MatrixType&)
+ */
+template <typename MatrixType, typename PermutationIndex>
+void CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::computeInPlace() {
+  eigen_assert(m_cpqr.cols() <= NumTraits<PermutationIndex>::highest());
+
+  const Index rank = m_cpqr.rank();
+  const Index cols = m_cpqr.cols();
+  const Index rows = m_cpqr.rows();
+  m_zCoeffs.resize((std::min)(rows, cols));
+  m_temp.resize(cols);
+
+  if (rank < cols) {
+    // We have reduced the (permuted) matrix to the form
+    //   [R11 R12]
+    //   [ 0  R22]
+    // where R11 is r-by-r (r = rank) upper triangular, R12 is
+    // r-by-(n-r), and R22 is empty or the norm of R22 is negligible.
+    // We now compute the complete orthogonal decomposition by applying
+    // Householder transformations from the right to the upper trapezoidal
+    // matrix X = [R11 R12] to zero out R12 and obtain the factorization
+    // [R11 R12] = [T11 0] * Z, where T11 is r-by-r upper triangular and
+    // Z = Z(0) * Z(1) ... Z(r-1) is an n-by-n orthogonal matrix.
+    // We store the data representing Z in R12 and m_zCoeffs.
+    for (Index k = rank - 1; k >= 0; --k) {
+      if (k != rank - 1) {
+        // Given the API for Householder reflectors, it is more convenient if
+        // we swap the leading parts of columns k and r-1 (zero-based) to form
+        // the matrix X_k = [X(0:k, k), X(0:k, r:n)]
+        m_cpqr.m_qr.col(k).head(k + 1).swap(m_cpqr.m_qr.col(rank - 1).head(k + 1));
+      }
+      // Construct Householder reflector Z(k) to zero out the last row of X_k,
+      // i.e. choose Z(k) such that
+      // [X(k, k), X(k, r:n)] * Z(k) = [beta, 0, .., 0].
+      RealScalar beta;
+      m_cpqr.m_qr.row(k).tail(cols - rank + 1).makeHouseholderInPlace(m_zCoeffs(k), beta);
+      m_cpqr.m_qr(k, rank - 1) = beta;
+      if (k > 0) {
+        // Apply Z(k) to the first k rows of X_k
+        m_cpqr.m_qr.topRightCorner(k, cols - rank + 1)
+            .applyHouseholderOnTheRight(m_cpqr.m_qr.row(k).tail(cols - rank).adjoint(), m_zCoeffs(k), &m_temp(0));
+      }
+      if (k != rank - 1) {
+        // Swap X(0:k,k) back to its proper location.
+        m_cpqr.m_qr.col(k).head(k + 1).swap(m_cpqr.m_qr.col(rank - 1).head(k + 1));
+      }
+    }
+  }
+}
+
+template <typename MatrixType, typename PermutationIndex>
+template <bool Conjugate, typename Rhs>
+void CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::applyZOnTheLeftInPlace(Rhs& rhs) const {
+  const Index cols = this->cols();
+  const Index nrhs = rhs.cols();
+  const Index rank = this->rank();
+  Matrix<typename Rhs::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs));
+  for (Index k = rank - 1; k >= 0; --k) {
+    if (k != rank - 1) {
+      rhs.row(k).swap(rhs.row(rank - 1));
+    }
+    rhs.middleRows(rank - 1, cols - rank + 1)
+        .applyHouseholderOnTheLeft(matrixQTZ().row(k).tail(cols - rank).transpose().template conjugateIf<!Conjugate>(),
+                                   zCoeffs().template conjugateIf<Conjugate>()(k), &temp(0));
+    if (k != rank - 1) {
+      rhs.row(k).swap(rhs.row(rank - 1));
+    }
+  }
+}
+
+template <typename MatrixType, typename PermutationIndex>
+template <typename Rhs>
+void CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::applyZAdjointOnTheLeftInPlace(Rhs& rhs) const {
+  const Index cols = this->cols();
+  const Index nrhs = rhs.cols();
+  const Index rank = this->rank();
+  Matrix<typename Rhs::Scalar, Dynamic, 1> temp((std::max)(cols, nrhs));
+  for (Index k = 0; k < rank; ++k) {
+    if (k != rank - 1) {
+      rhs.row(k).swap(rhs.row(rank - 1));
+    }
+    rhs.middleRows(rank - 1, cols - rank + 1)
+        .applyHouseholderOnTheLeft(matrixQTZ().row(k).tail(cols - rank).adjoint(), zCoeffs()(k), &temp(0));
+    if (k != rank - 1) {
+      rhs.row(k).swap(rhs.row(rank - 1));
+    }
+  }
+}
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template <typename MatrixType_, typename PermutationIndex_>
+template <typename RhsType, typename DstType>
+void CompleteOrthogonalDecomposition<MatrixType_, PermutationIndex_>::_solve_impl(const RhsType& rhs,
+                                                                                  DstType& dst) const {
+  const Index rank = this->rank();
+  if (rank == 0) {
+    dst.setZero();
+    return;
+  }
+
+  // Compute c = Q^* * rhs
+  typename RhsType::PlainObject c(rhs);
+  c.applyOnTheLeft(matrixQ().setLength(rank).adjoint());
+
+  // Solve T z = c(1:rank, :)
+  dst.topRows(rank) = matrixT().topLeftCorner(rank, rank).template triangularView<Upper>().solve(c.topRows(rank));
+
+  const Index cols = this->cols();
+  if (rank < cols) {
+    // Compute y = Z^* * [ z ]
+    //                   [ 0 ]
+    dst.bottomRows(cols - rank).setZero();
+    applyZAdjointOnTheLeftInPlace(dst);
+  }
+
+  // Undo permutation to get x = P^{-1} * y.
+  dst = colsPermutation() * dst;
+}
+
+template <typename MatrixType_, typename PermutationIndex_>
+template <bool Conjugate, typename RhsType, typename DstType>
+void CompleteOrthogonalDecomposition<MatrixType_, PermutationIndex_>::_solve_impl_transposed(const RhsType& rhs,
+                                                                                             DstType& dst) const {
+  const Index rank = this->rank();
+
+  if (rank == 0) {
+    dst.setZero();
+    return;
+  }
+
+  typename RhsType::PlainObject c(colsPermutation().transpose() * rhs);
+
+  if (rank < cols()) {
+    applyZOnTheLeftInPlace<!Conjugate>(c);
+  }
+
+  matrixT()
+      .topLeftCorner(rank, rank)
+      .template triangularView<Upper>()
+      .transpose()
+      .template conjugateIf<Conjugate>()
+      .solveInPlace(c.topRows(rank));
+
+  dst.topRows(rank) = c.topRows(rank);
+  dst.bottomRows(rows() - rank).setZero();
+
+  dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf<!Conjugate>());
+}
+#endif
+
+namespace internal {
+
+template <typename MatrixType, typename PermutationIndex>
+struct traits<Inverse<CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>>>
+    : traits<typename Transpose<typename MatrixType::PlainObject>::PlainObject> {
+  enum { Flags = 0 };
+};
+
+template <typename DstXprType, typename MatrixType, typename PermutationIndex>
+struct Assignment<DstXprType, Inverse<CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>>,
+                  internal::assign_op<typename DstXprType::Scalar,
+                                      typename CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::Scalar>,
+                  Dense2Dense> {
+  typedef CompleteOrthogonalDecomposition<MatrixType, PermutationIndex> CodType;
+  typedef Inverse<CodType> SrcXprType;
+  static void run(DstXprType& dst, const SrcXprType& src,
+                  const internal::assign_op<typename DstXprType::Scalar, typename CodType::Scalar>&) {
+    typedef Matrix<typename CodType::Scalar, CodType::RowsAtCompileTime, CodType::RowsAtCompileTime, 0,
+                   CodType::MaxRowsAtCompileTime, CodType::MaxRowsAtCompileTime>
+        IdentityMatrixType;
+    dst = src.nestedExpression().solve(IdentityMatrixType::Identity(src.cols(), src.cols()));
+  }
+};
+
+}  // end namespace internal
+
+/** \returns the matrix Q as a sequence of householder transformations */
+template <typename MatrixType, typename PermutationIndex>
+typename CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::HouseholderSequenceType
+CompleteOrthogonalDecomposition<MatrixType, PermutationIndex>::householderQ() const {
+  return m_cpqr.householderQ();
+}
+
+/** \return the complete orthogonal decomposition of \c *this.
+ *
+ * \sa class CompleteOrthogonalDecomposition
+ */
+template <typename Derived>
+template <typename PermutationIndex>
+const CompleteOrthogonalDecomposition<typename MatrixBase<Derived>::PlainObject, PermutationIndex>
+MatrixBase<Derived>::completeOrthogonalDecomposition() const {
+  return CompleteOrthogonalDecomposition<PlainObject>(eval());
+}
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_COMPLETEORTHOGONALDECOMPOSITION_H
diff --git a/inst/include/Eigen/src/QR/FullPivHouseholderQR.h b/inst/include/Eigen/src/QR/FullPivHouseholderQR.h
index 0b39966e..d1734445 100644
--- a/inst/include/Eigen/src/QR/FullPivHouseholderQR.h
+++ b/inst/include/Eigen/src/QR/FullPivHouseholderQR.h
@@ -11,71 +11,94 @@
 #ifndef EIGEN_FULLPIVOTINGHOUSEHOLDERQR_H
 #define EIGEN_FULLPIVOTINGHOUSEHOLDERQR_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
-template<typename MatrixType> struct FullPivHouseholderQRMatrixQReturnType;
+template <typename MatrixType_, typename PermutationIndex_>
+struct traits<FullPivHouseholderQR<MatrixType_, PermutationIndex_> > : traits<MatrixType_> {
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef PermutationIndex_ PermutationIndex;
+  enum { Flags = 0 };
+};
+
+template <typename MatrixType, typename PermutationIndex>
+struct FullPivHouseholderQRMatrixQReturnType;
 
-template<typename MatrixType>
-struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
-{
+template <typename MatrixType, typename PermutationIndex>
+struct traits<FullPivHouseholderQRMatrixQReturnType<MatrixType, PermutationIndex> > {
   typedef typename MatrixType::PlainObject ReturnType;
 };
 
-}
+}  // end namespace internal
 
 /** \ingroup QR_Module
-  *
-  * \class FullPivHouseholderQR
-  *
-  * \brief Householder rank-revealing QR decomposition of a matrix with full pivoting
-  *
-  * \param MatrixType the type of the matrix of which we are computing the QR decomposition
-  *
-  * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b Q and \b R
-  * such that 
-  * \f[
-  *  \mathbf{A} \, \mathbf{P} = \mathbf{Q} \, \mathbf{R}
-  * \f]
-  * by using Householder transformations. Here, \b P is a permutation matrix, \b Q a unitary matrix and \b R an 
-  * upper triangular matrix.
-  *
-  * This decomposition performs a very prudent full pivoting in order to be rank-revealing and achieve optimal
-  * numerical stability. The trade-off is that it is slower than HouseholderQR and ColPivHouseholderQR.
-  *
-  * \sa MatrixBase::fullPivHouseholderQr()
-  */
-template<typename _MatrixType> class FullPivHouseholderQR
-{
-  public:
-
-    typedef _MatrixType MatrixType;
-    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
-      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
-    };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef internal::FullPivHouseholderQRMatrixQReturnType<MatrixType> MatrixQReturnType;
-    typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
-    typedef Matrix<Index, 1,
-                   EIGEN_SIZE_MIN_PREFER_DYNAMIC(ColsAtCompileTime,RowsAtCompileTime), RowMajor, 1,
-                   EIGEN_SIZE_MIN_PREFER_FIXED(MaxColsAtCompileTime,MaxRowsAtCompileTime)> IntDiagSizeVectorType;
-    typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime> PermutationType;
-    typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
-    typedef typename internal::plain_col_type<MatrixType>::type ColVectorType;
-
-    /** \brief Default Constructor.
-      *
-      * The default constructor is useful in cases in which the user intends to
-      * perform decompositions via FullPivHouseholderQR::compute(const MatrixType&).
-      */
-    FullPivHouseholderQR()
+ *
+ * \class FullPivHouseholderQR
+ *
+ * \brief Householder rank-revealing QR decomposition of a matrix with full pivoting
+ *
+ * \tparam MatrixType_ the type of the matrix of which we are computing the QR decomposition
+ *
+ * This class performs a rank-revealing QR decomposition of a matrix \b A into matrices \b P, \b P', \b Q and \b R
+ * such that
+ * \f[
+ *  \mathbf{P} \, \mathbf{A} \, \mathbf{P}' = \mathbf{Q} \, \mathbf{R}
+ * \f]
+ * by using Householder transformations. Here, \b P and \b P' are permutation matrices, \b Q a unitary matrix
+ * and \b R an upper triangular matrix.
+ *
+ * This decomposition performs a very prudent full pivoting in order to be rank-revealing and achieve optimal
+ * numerical stability. The trade-off is that it is slower than HouseholderQR and ColPivHouseholderQR.
+ *
+ * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+ *
+ * \sa MatrixBase::fullPivHouseholderQr()
+ */
+template <typename MatrixType_, typename PermutationIndex_>
+class FullPivHouseholderQR : public SolverBase<FullPivHouseholderQR<MatrixType_, PermutationIndex_> > {
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef SolverBase<FullPivHouseholderQR> Base;
+  friend class SolverBase<FullPivHouseholderQR>;
+  typedef PermutationIndex_ PermutationIndex;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(FullPivHouseholderQR)
+
+  enum {
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+  typedef internal::FullPivHouseholderQRMatrixQReturnType<MatrixType, PermutationIndex> MatrixQReturnType;
+  typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
+  typedef Matrix<PermutationIndex, 1, internal::min_size_prefer_dynamic(ColsAtCompileTime, RowsAtCompileTime), RowMajor,
+                 1, internal::min_size_prefer_fixed(MaxColsAtCompileTime, MaxRowsAtCompileTime)>
+      IntDiagSizeVectorType;
+  typedef PermutationMatrix<ColsAtCompileTime, MaxColsAtCompileTime, PermutationIndex> PermutationType;
+  typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
+  typedef typename internal::plain_col_type<MatrixType>::type ColVectorType;
+  typedef typename MatrixType::PlainObject PlainObject;
+
+  /** \brief Reports whether the QR factorization was successful.
+   *
+   * \note This function always returns \c Success. It is provided for compatibility
+   * with other factorization routines.
+   * \returns \c Success
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+    return Success;
+  }
+
+  /** \brief Default Constructor.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via FullPivHouseholderQR::compute(const MatrixType&).
+   */
+  FullPivHouseholderQR()
       : m_qr(),
         m_hCoeffs(),
         m_rows_transpositions(),
@@ -85,35 +108,36 @@ template<typename _MatrixType> class FullPivHouseholderQR
         m_isInitialized(false),
         m_usePrescribedThreshold(false) {}
 
-    /** \brief Default Constructor with memory preallocation
-      *
-      * Like the default constructor but with preallocation of the internal data
-      * according to the specified problem \a size.
-      * \sa FullPivHouseholderQR()
-      */
-    FullPivHouseholderQR(Index rows, Index cols)
+  /** \brief Default Constructor with memory preallocation
+   *
+   * Like the default constructor but with preallocation of the internal data
+   * according to the specified problem \a size.
+   * \sa FullPivHouseholderQR()
+   */
+  FullPivHouseholderQR(Index rows, Index cols)
       : m_qr(rows, cols),
-        m_hCoeffs((std::min)(rows,cols)),
-        m_rows_transpositions((std::min)(rows,cols)),
-        m_cols_transpositions((std::min)(rows,cols)),
+        m_hCoeffs((std::min)(rows, cols)),
+        m_rows_transpositions((std::min)(rows, cols)),
+        m_cols_transpositions((std::min)(rows, cols)),
         m_cols_permutation(cols),
         m_temp(cols),
         m_isInitialized(false),
         m_usePrescribedThreshold(false) {}
 
-    /** \brief Constructs a QR factorization from a given matrix
-      *
-      * This constructor computes the QR factorization of the matrix \a matrix by calling
-      * the method compute(). It is a short cut for:
-      * 
-      * \code
-      * FullPivHouseholderQR<MatrixType> qr(matrix.rows(), matrix.cols());
-      * qr.compute(matrix);
-      * \endcode
-      * 
-      * \sa compute()
-      */
-    FullPivHouseholderQR(const MatrixType& matrix)
+  /** \brief Constructs a QR factorization from a given matrix
+   *
+   * This constructor computes the QR factorization of the matrix \a matrix by calling
+   * the method compute(). It is a short cut for:
+   *
+   * \code
+   * FullPivHouseholderQR<MatrixType> qr(matrix.rows(), matrix.cols());
+   * qr.compute(matrix);
+   * \endcode
+   *
+   * \sa compute()
+   */
+  template <typename InputType>
+  explicit FullPivHouseholderQR(const EigenBase<InputType>& matrix)
       : m_qr(matrix.rows(), matrix.cols()),
         m_hCoeffs((std::min)(matrix.rows(), matrix.cols())),
         m_rows_transpositions((std::min)(matrix.rows(), matrix.cols())),
@@ -121,306 +145,361 @@ template<typename _MatrixType> class FullPivHouseholderQR
         m_cols_permutation(matrix.cols()),
         m_temp(matrix.cols()),
         m_isInitialized(false),
-        m_usePrescribedThreshold(false)
-    {
-      compute(matrix);
-    }
-
-    /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
-      * \c *this is the QR decomposition.
-      *
-      * \param b the right-hand-side of the equation to solve.
-      *
-      * \returns the exact or least-square solution if the rank is greater or equal to the number of columns of A,
-      * and an arbitrary solution otherwise.
-      *
-      * \note The case where b is a matrix is not yet implemented. Also, this
-      *       code is space inefficient.
-      *
-      * \note_about_checking_solutions
-      *
-      * \note_about_arbitrary_choice_of_solution
-      *
-      * Example: \include FullPivHouseholderQR_solve.cpp
-      * Output: \verbinclude FullPivHouseholderQR_solve.out
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<FullPivHouseholderQR, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-      return internal::solve_retval<FullPivHouseholderQR, Rhs>(*this, b.derived());
-    }
-
-    /** \returns Expression object representing the matrix Q
-      */
-    MatrixQReturnType matrixQ(void) const;
+        m_usePrescribedThreshold(false) {
+    compute(matrix.derived());
+  }
 
-    /** \returns a reference to the matrix where the Householder QR decomposition is stored
-      */
-    const MatrixType& matrixQR() const
-    {
-      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-      return m_qr;
-    }
+  /** \brief Constructs a QR factorization from a given matrix
+   *
+   * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when \c
+   * MatrixType is a Eigen::Ref.
+   *
+   * \sa FullPivHouseholderQR(const EigenBase&)
+   */
+  template <typename InputType>
+  explicit FullPivHouseholderQR(EigenBase<InputType>& matrix)
+      : m_qr(matrix.derived()),
+        m_hCoeffs((std::min)(matrix.rows(), matrix.cols())),
+        m_rows_transpositions((std::min)(matrix.rows(), matrix.cols())),
+        m_cols_transpositions((std::min)(matrix.rows(), matrix.cols())),
+        m_cols_permutation(matrix.cols()),
+        m_temp(matrix.cols()),
+        m_isInitialized(false),
+        m_usePrescribedThreshold(false) {
+    computeInPlace();
+  }
 
-    FullPivHouseholderQR& compute(const MatrixType& matrix);
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
+   * \c *this is the QR decomposition.
+   *
+   * \param b the right-hand-side of the equation to solve.
+   *
+   * \returns the exact or least-square solution if the rank is greater or equal to the number of columns of A,
+   * and an arbitrary solution otherwise.
+   *
+   * \note_about_checking_solutions
+   *
+   * \note_about_arbitrary_choice_of_solution
+   *
+   * Example: \include FullPivHouseholderQR_solve.cpp
+   * Output: \verbinclude FullPivHouseholderQR_solve.out
+   */
+  template <typename Rhs>
+  inline const Solve<FullPivHouseholderQR, Rhs> solve(const MatrixBase<Rhs>& b) const;
+#endif
+
+  /** \returns Expression object representing the matrix Q
+   */
+  MatrixQReturnType matrixQ(void) const;
+
+  /** \returns a reference to the matrix where the Householder QR decomposition is stored
+   */
+  const MatrixType& matrixQR() const {
+    eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+    return m_qr;
+  }
 
-    /** \returns a const reference to the column permutation matrix */
-    const PermutationType& colsPermutation() const
-    {
-      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-      return m_cols_permutation;
-    }
+  template <typename InputType>
+  FullPivHouseholderQR& compute(const EigenBase<InputType>& matrix);
 
-    /** \returns a const reference to the vector of indices representing the rows transpositions */
-    const IntDiagSizeVectorType& rowsTranspositions() const
-    {
-      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-      return m_rows_transpositions;
-    }
+  /** \returns a const reference to the column permutation matrix */
+  const PermutationType& colsPermutation() const {
+    eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+    return m_cols_permutation;
+  }
 
-    /** \returns the absolute value of the determinant of the matrix of which
-      * *this is the QR decomposition. It has only linear complexity
-      * (that is, O(n) where n is the dimension of the square matrix)
-      * as the QR decomposition has already been computed.
-      *
-      * \note This is only for square matrices.
-      *
-      * \warning a determinant can be very big or small, so for matrices
-      * of large enough dimension, there is a risk of overflow/underflow.
-      * One way to work around that is to use logAbsDeterminant() instead.
-      *
-      * \sa logAbsDeterminant(), MatrixBase::determinant()
-      */
-    typename MatrixType::RealScalar absDeterminant() const;
-
-    /** \returns the natural log of the absolute value of the determinant of the matrix of which
-      * *this is the QR decomposition. It has only linear complexity
-      * (that is, O(n) where n is the dimension of the square matrix)
-      * as the QR decomposition has already been computed.
-      *
-      * \note This is only for square matrices.
-      *
-      * \note This method is useful to work around the risk of overflow/underflow that's inherent
-      * to determinant computation.
-      *
-      * \sa absDeterminant(), MatrixBase::determinant()
-      */
-    typename MatrixType::RealScalar logAbsDeterminant() const;
-
-    /** \returns the rank of the matrix of which *this is the QR decomposition.
-      *
-      * \note This method has to determine which pivots should be considered nonzero.
-      *       For that, it uses the threshold value that you can control by calling
-      *       setThreshold(const RealScalar&).
-      */
-    inline Index rank() const
-    {
-      using std::abs;
-      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-      RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold();
-      Index result = 0;
-      for(Index i = 0; i < m_nonzero_pivots; ++i)
-        result += (abs(m_qr.coeff(i,i)) > premultiplied_threshold);
-      return result;
-    }
+  /** \returns a const reference to the vector of indices representing the rows transpositions */
+  const IntDiagSizeVectorType& rowsTranspositions() const {
+    eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+    return m_rows_transpositions;
+  }
 
-    /** \returns the dimension of the kernel of the matrix of which *this is the QR decomposition.
-      *
-      * \note This method has to determine which pivots should be considered nonzero.
-      *       For that, it uses the threshold value that you can control by calling
-      *       setThreshold(const RealScalar&).
-      */
-    inline Index dimensionOfKernel() const
-    {
-      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-      return cols() - rank();
-    }
+  /** \returns the determinant of the matrix of which
+   * *this is the QR decomposition. It has only linear complexity
+   * (that is, O(n) where n is the dimension of the square matrix)
+   * as the QR decomposition has already been computed.
+   *
+   * \note This is only for square matrices.
+   *
+   * \warning a determinant can be very big or small, so for matrices
+   * of large enough dimension, there is a risk of overflow/underflow.
+   * One way to work around that is to use logAbsDeterminant() instead.
+   *
+   * \sa absDeterminant(), logAbsDeterminant(), MatrixBase::determinant()
+   */
+  typename MatrixType::Scalar determinant() const;
+
+  /** \returns the absolute value of the determinant of the matrix of which
+   * *this is the QR decomposition. It has only linear complexity
+   * (that is, O(n) where n is the dimension of the square matrix)
+   * as the QR decomposition has already been computed.
+   *
+   * \note This is only for square matrices.
+   *
+   * \warning a determinant can be very big or small, so for matrices
+   * of large enough dimension, there is a risk of overflow/underflow.
+   * One way to work around that is to use logAbsDeterminant() instead.
+   *
+   * \sa determinant(), logAbsDeterminant(), MatrixBase::determinant()
+   */
+  typename MatrixType::RealScalar absDeterminant() const;
+
+  /** \returns the natural log of the absolute value of the determinant of the matrix of which
+   * *this is the QR decomposition. It has only linear complexity
+   * (that is, O(n) where n is the dimension of the square matrix)
+   * as the QR decomposition has already been computed.
+   *
+   * \note This is only for square matrices.
+   *
+   * \note This method is useful to work around the risk of overflow/underflow that's inherent
+   * to determinant computation.
+   *
+   * \sa determinant(), absDeterminant(), MatrixBase::determinant()
+   */
+  typename MatrixType::RealScalar logAbsDeterminant() const;
+
+  /** \returns the sign of the determinant of the matrix of which
+   * *this is the QR decomposition. It has only linear complexity
+   * (that is, O(n) where n is the dimension of the square matrix)
+   * as the QR decomposition has already been computed.
+   *
+   * \note This is only for square matrices.
+   *
+   * \note This method is useful to work around the risk of overflow/underflow that's inherent
+   * to determinant computation.
+   *
+   * \sa determinant(), absDeterminant(), logAbsDeterminant(), MatrixBase::determinant()
+   */
+  typename MatrixType::Scalar signDeterminant() const;
+
+  /** \returns the rank of the matrix of which *this is the QR decomposition.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline Index rank() const {
+    using std::abs;
+    eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+    RealScalar premultiplied_threshold = abs(m_maxpivot) * threshold();
+    Index result = 0;
+    for (Index i = 0; i < m_nonzero_pivots; ++i) result += (abs(m_qr.coeff(i, i)) > premultiplied_threshold);
+    return result;
+  }
 
-    /** \returns true if the matrix of which *this is the QR decomposition represents an injective
-      *          linear map, i.e. has trivial kernel; false otherwise.
-      *
-      * \note This method has to determine which pivots should be considered nonzero.
-      *       For that, it uses the threshold value that you can control by calling
-      *       setThreshold(const RealScalar&).
-      */
-    inline bool isInjective() const
-    {
-      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-      return rank() == cols();
-    }
+  /** \returns the dimension of the kernel of the matrix of which *this is the QR decomposition.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline Index dimensionOfKernel() const {
+    eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+    return cols() - rank();
+  }
 
-    /** \returns true if the matrix of which *this is the QR decomposition represents a surjective
-      *          linear map; false otherwise.
-      *
-      * \note This method has to determine which pivots should be considered nonzero.
-      *       For that, it uses the threshold value that you can control by calling
-      *       setThreshold(const RealScalar&).
-      */
-    inline bool isSurjective() const
-    {
-      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-      return rank() == rows();
-    }
+  /** \returns true if the matrix of which *this is the QR decomposition represents an injective
+   *          linear map, i.e. has trivial kernel; false otherwise.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline bool isInjective() const {
+    eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+    return rank() == cols();
+  }
 
-    /** \returns true if the matrix of which *this is the QR decomposition is invertible.
-      *
-      * \note This method has to determine which pivots should be considered nonzero.
-      *       For that, it uses the threshold value that you can control by calling
-      *       setThreshold(const RealScalar&).
-      */
-    inline bool isInvertible() const
-    {
-      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-      return isInjective() && isSurjective();
-    }
+  /** \returns true if the matrix of which *this is the QR decomposition represents a surjective
+   *          linear map; false otherwise.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline bool isSurjective() const {
+    eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+    return rank() == rows();
+  }
 
-    /** \returns the inverse of the matrix of which *this is the QR decomposition.
-      *
-      * \note If this matrix is not invertible, the returned matrix has undefined coefficients.
-      *       Use isInvertible() to first determine whether this matrix is invertible.
-      */    inline const
-    internal::solve_retval<FullPivHouseholderQR, typename MatrixType::IdentityReturnType>
-    inverse() const
-    {
-      eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
-      return internal::solve_retval<FullPivHouseholderQR,typename MatrixType::IdentityReturnType>
-               (*this, MatrixType::Identity(m_qr.rows(), m_qr.cols()));
-    }
+  /** \returns true if the matrix of which *this is the QR decomposition is invertible.
+   *
+   * \note This method has to determine which pivots should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline bool isInvertible() const {
+    eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+    return isInjective() && isSurjective();
+  }
 
-    inline Index rows() const { return m_qr.rows(); }
-    inline Index cols() const { return m_qr.cols(); }
-    
-    /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q.
-      * 
-      * For advanced uses only.
-      */
-    const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
-
-    /** Allows to prescribe a threshold to be used by certain methods, such as rank(),
-      * who need to determine when pivots are to be considered nonzero. This is not used for the
-      * QR decomposition itself.
-      *
-      * When it needs to get the threshold value, Eigen calls threshold(). By default, this
-      * uses a formula to automatically determine a reasonable threshold.
-      * Once you have called the present method setThreshold(const RealScalar&),
-      * your value is used instead.
-      *
-      * \param threshold The new value to use as the threshold.
-      *
-      * A pivot will be considered nonzero if its absolute value is strictly greater than
-      *  \f$ \vert pivot \vert \leqslant threshold \times \vert maxpivot \vert \f$
-      * where maxpivot is the biggest pivot.
-      *
-      * If you want to come back to the default behavior, call setThreshold(Default_t)
-      */
-    FullPivHouseholderQR& setThreshold(const RealScalar& threshold)
-    {
-      m_usePrescribedThreshold = true;
-      m_prescribedThreshold = threshold;
-      return *this;
-    }
+  /** \returns the inverse of the matrix of which *this is the QR decomposition.
+   *
+   * \note If this matrix is not invertible, the returned matrix has undefined coefficients.
+   *       Use isInvertible() to first determine whether this matrix is invertible.
+   */
+  inline const Inverse<FullPivHouseholderQR> inverse() const {
+    eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+    return Inverse<FullPivHouseholderQR>(*this);
+  }
 
-    /** Allows to come back to the default behavior, letting Eigen use its default formula for
-      * determining the threshold.
-      *
-      * You should pass the special object Eigen::Default as parameter here.
-      * \code qr.setThreshold(Eigen::Default); \endcode
-      *
-      * See the documentation of setThreshold(const RealScalar&).
-      */
-    FullPivHouseholderQR& setThreshold(Default_t)
-    {
-      m_usePrescribedThreshold = false;
-      return *this;
-    }
+  inline Index rows() const { return m_qr.rows(); }
+  inline Index cols() const { return m_qr.cols(); }
+
+  /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q.
+   *
+   * For advanced uses only.
+   */
+  const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
+
+  /** Allows to prescribe a threshold to be used by certain methods, such as rank(),
+   * who need to determine when pivots are to be considered nonzero. This is not used for the
+   * QR decomposition itself.
+   *
+   * When it needs to get the threshold value, Eigen calls threshold(). By default, this
+   * uses a formula to automatically determine a reasonable threshold.
+   * Once you have called the present method setThreshold(const RealScalar&),
+   * your value is used instead.
+   *
+   * \param threshold The new value to use as the threshold.
+   *
+   * A pivot will be considered nonzero if its absolute value is strictly greater than
+   *  \f$ \vert pivot \vert \leqslant threshold \times \vert maxpivot \vert \f$
+   * where maxpivot is the biggest pivot.
+   *
+   * If you want to come back to the default behavior, call setThreshold(Default_t)
+   */
+  FullPivHouseholderQR& setThreshold(const RealScalar& threshold) {
+    m_usePrescribedThreshold = true;
+    m_prescribedThreshold = threshold;
+    return *this;
+  }
 
-    /** Returns the threshold that will be used by certain methods such as rank().
-      *
-      * See the documentation of setThreshold(const RealScalar&).
-      */
-    RealScalar threshold() const
-    {
-      eigen_assert(m_isInitialized || m_usePrescribedThreshold);
-      return m_usePrescribedThreshold ? m_prescribedThreshold
-      // this formula comes from experimenting (see "LU precision tuning" thread on the list)
-      // and turns out to be identical to Higham's formula used already in LDLt.
-                                      : NumTraits<Scalar>::epsilon() * RealScalar(m_qr.diagonalSize());
-    }
+  /** Allows to come back to the default behavior, letting Eigen use its default formula for
+   * determining the threshold.
+   *
+   * You should pass the special object Eigen::Default as parameter here.
+   * \code qr.setThreshold(Eigen::Default); \endcode
+   *
+   * See the documentation of setThreshold(const RealScalar&).
+   */
+  FullPivHouseholderQR& setThreshold(Default_t) {
+    m_usePrescribedThreshold = false;
+    return *this;
+  }
 
-    /** \returns the number of nonzero pivots in the QR decomposition.
-      * Here nonzero is meant in the exact sense, not in a fuzzy sense.
-      * So that notion isn't really intrinsically interesting, but it is
-      * still useful when implementing algorithms.
-      *
-      * \sa rank()
-      */
-    inline Index nonzeroPivots() const
-    {
-      eigen_assert(m_isInitialized && "LU is not initialized.");
-      return m_nonzero_pivots;
-    }
+  /** Returns the threshold that will be used by certain methods such as rank().
+   *
+   * See the documentation of setThreshold(const RealScalar&).
+   */
+  RealScalar threshold() const {
+    eigen_assert(m_isInitialized || m_usePrescribedThreshold);
+    return m_usePrescribedThreshold ? m_prescribedThreshold
+                                    // this formula comes from experimenting (see "LU precision tuning" thread on the
+                                    // list) and turns out to be identical to Higham's formula used already in LDLt.
+                                    : NumTraits<Scalar>::epsilon() * RealScalar(m_qr.diagonalSize());
+  }
 
-    /** \returns the absolute value of the biggest pivot, i.e. the biggest
-      *          diagonal coefficient of U.
-      */
-    RealScalar maxPivot() const { return m_maxpivot; }
+  /** \returns the number of nonzero pivots in the QR decomposition.
+   * Here nonzero is meant in the exact sense, not in a fuzzy sense.
+   * So that notion isn't really intrinsically interesting, but it is
+   * still useful when implementing algorithms.
+   *
+   * \sa rank()
+   */
+  inline Index nonzeroPivots() const {
+    eigen_assert(m_isInitialized && "LU is not initialized.");
+    return m_nonzero_pivots;
+  }
 
-  protected:
-    
-    static void check_template_parameters()
-    {
-      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
-    }
-    
-    MatrixType m_qr;
-    HCoeffsType m_hCoeffs;
-    IntDiagSizeVectorType m_rows_transpositions;
-    IntDiagSizeVectorType m_cols_transpositions;
-    PermutationType m_cols_permutation;
-    RowVectorType m_temp;
-    bool m_isInitialized, m_usePrescribedThreshold;
-    RealScalar m_prescribedThreshold, m_maxpivot;
-    Index m_nonzero_pivots;
-    RealScalar m_precision;
-    Index m_det_pq;
+  /** \returns the absolute value of the biggest pivot, i.e. the biggest
+   *          diagonal coefficient of U.
+   */
+  RealScalar maxPivot() const { return m_maxpivot; }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  template <typename RhsType, typename DstType>
+  void _solve_impl(const RhsType& rhs, DstType& dst) const;
+
+  template <bool Conjugate, typename RhsType, typename DstType>
+  void _solve_impl_transposed(const RhsType& rhs, DstType& dst) const;
+#endif
+
+ protected:
+  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+
+  void computeInPlace();
+
+  MatrixType m_qr;
+  HCoeffsType m_hCoeffs;
+  IntDiagSizeVectorType m_rows_transpositions;
+  IntDiagSizeVectorType m_cols_transpositions;
+  PermutationType m_cols_permutation;
+  RowVectorType m_temp;
+  bool m_isInitialized, m_usePrescribedThreshold;
+  RealScalar m_prescribedThreshold, m_maxpivot;
+  Index m_nonzero_pivots;
+  RealScalar m_precision;
+  Index m_det_p;
 };
 
-template<typename MatrixType>
-typename MatrixType::RealScalar FullPivHouseholderQR<MatrixType>::absDeterminant() const
-{
+template <typename MatrixType, typename PermutationIndex>
+typename MatrixType::Scalar FullPivHouseholderQR<MatrixType, PermutationIndex>::determinant() const {
+  eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
+  eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
+  Scalar detQ;
+  internal::householder_determinant<HCoeffsType, Scalar, NumTraits<Scalar>::IsComplex>::run(m_hCoeffs, detQ);
+  return isInjective() ? (detQ * Scalar(m_det_p)) * m_qr.diagonal().prod() : Scalar(0);
+}
+
+template <typename MatrixType, typename PermutationIndex>
+typename MatrixType::RealScalar FullPivHouseholderQR<MatrixType, PermutationIndex>::absDeterminant() const {
   using std::abs;
   eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
   eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
-  return abs(m_qr.diagonal().prod());
+  return isInjective() ? abs(m_qr.diagonal().prod()) : RealScalar(0);
+}
+
+template <typename MatrixType, typename PermutationIndex>
+typename MatrixType::RealScalar FullPivHouseholderQR<MatrixType, PermutationIndex>::logAbsDeterminant() const {
+  eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
+  eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
+  return isInjective() ? m_qr.diagonal().cwiseAbs().array().log().sum() : -NumTraits<RealScalar>::infinity();
 }
 
-template<typename MatrixType>
-typename MatrixType::RealScalar FullPivHouseholderQR<MatrixType>::logAbsDeterminant() const
-{
+template <typename MatrixType, typename PermutationIndex>
+typename MatrixType::Scalar FullPivHouseholderQR<MatrixType, PermutationIndex>::signDeterminant() const {
   eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
   eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
-  return m_qr.diagonal().cwiseAbs().array().log().sum();
+  Scalar detQ;
+  internal::householder_determinant<HCoeffsType, Scalar, NumTraits<Scalar>::IsComplex>::run(m_hCoeffs, detQ);
+  return isInjective() ? (detQ * Scalar(m_det_p)) * m_qr.diagonal().array().sign().prod() : Scalar(0);
 }
 
 /** Performs the QR factorization of the given matrix \a matrix. The result of
-  * the factorization is stored into \c *this, and a reference to \c *this
-  * is returned.
-  *
-  * \sa class FullPivHouseholderQR, FullPivHouseholderQR(const MatrixType&)
-  */
-template<typename MatrixType>
-FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(const MatrixType& matrix)
-{
-  check_template_parameters();
-  
+ * the factorization is stored into \c *this, and a reference to \c *this
+ * is returned.
+ *
+ * \sa class FullPivHouseholderQR, FullPivHouseholderQR(const MatrixType&)
+ */
+template <typename MatrixType, typename PermutationIndex>
+template <typename InputType>
+FullPivHouseholderQR<MatrixType, PermutationIndex>& FullPivHouseholderQR<MatrixType, PermutationIndex>::compute(
+    const EigenBase<InputType>& matrix) {
+  m_qr = matrix.derived();
+  computeInPlace();
+  return *this;
+}
+
+template <typename MatrixType, typename PermutationIndex>
+void FullPivHouseholderQR<MatrixType, PermutationIndex>::computeInPlace() {
+  eigen_assert(m_qr.cols() <= NumTraits<PermutationIndex>::highest());
   using std::abs;
-  Index rows = matrix.rows();
-  Index cols = matrix.cols();
-  Index size = (std::min)(rows,cols);
+  Index rows = m_qr.rows();
+  Index cols = m_qr.cols();
+  Index size = (std::min)(rows, cols);
 
-  m_qr = matrix;
   m_hCoeffs.resize(size);
 
   m_temp.resize(cols);
@@ -433,144 +512,172 @@ FullPivHouseholderQR<MatrixType>& FullPivHouseholderQR<MatrixType>::compute(cons
 
   RealScalar biggest(0);
 
-  m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case)
+  m_nonzero_pivots = size;  // the generic case is that in which all pivots are nonzero (invertible case)
   m_maxpivot = RealScalar(0);
 
-  for (Index k = 0; k < size; ++k)
-  {
+  for (Index k = 0; k < size; ++k) {
     Index row_of_biggest_in_corner, col_of_biggest_in_corner;
-    RealScalar biggest_in_corner;
+    typedef internal::scalar_score_coeff_op<Scalar> Scoring;
+    typedef typename Scoring::result_type Score;
 
-    biggest_in_corner = m_qr.bottomRightCorner(rows-k, cols-k)
-                            .cwiseAbs()
-                            .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
+    Score score = m_qr.bottomRightCorner(rows - k, cols - k)
+                      .unaryExpr(Scoring())
+                      .maxCoeff(&row_of_biggest_in_corner, &col_of_biggest_in_corner);
     row_of_biggest_in_corner += k;
     col_of_biggest_in_corner += k;
-    if(k==0) biggest = biggest_in_corner;
+    RealScalar biggest_in_corner =
+        internal::abs_knowing_score<Scalar>()(m_qr(row_of_biggest_in_corner, col_of_biggest_in_corner), score);
+    if (k == 0) biggest = biggest_in_corner;
 
     // if the corner is negligible, then we have less than full rank, and we can finish early
-    if(internal::isMuchSmallerThan(biggest_in_corner, biggest, m_precision))
-    {
+    if (internal::isMuchSmallerThan(biggest_in_corner, biggest, m_precision)) {
       m_nonzero_pivots = k;
-      for(Index i = k; i < size; i++)
-      {
-        m_rows_transpositions.coeffRef(i) = i;
-        m_cols_transpositions.coeffRef(i) = i;
+      for (Index i = k; i < size; i++) {
+        m_rows_transpositions.coeffRef(i) = internal::convert_index<PermutationIndex>(i);
+        m_cols_transpositions.coeffRef(i) = internal::convert_index<PermutationIndex>(i);
         m_hCoeffs.coeffRef(i) = Scalar(0);
       }
       break;
     }
 
-    m_rows_transpositions.coeffRef(k) = row_of_biggest_in_corner;
-    m_cols_transpositions.coeffRef(k) = col_of_biggest_in_corner;
-    if(k != row_of_biggest_in_corner) {
-      m_qr.row(k).tail(cols-k).swap(m_qr.row(row_of_biggest_in_corner).tail(cols-k));
+    m_rows_transpositions.coeffRef(k) = internal::convert_index<PermutationIndex>(row_of_biggest_in_corner);
+    m_cols_transpositions.coeffRef(k) = internal::convert_index<PermutationIndex>(col_of_biggest_in_corner);
+    if (k != row_of_biggest_in_corner) {
+      m_qr.row(k).tail(cols - k).swap(m_qr.row(row_of_biggest_in_corner).tail(cols - k));
       ++number_of_transpositions;
     }
-    if(k != col_of_biggest_in_corner) {
+    if (k != col_of_biggest_in_corner) {
       m_qr.col(k).swap(m_qr.col(col_of_biggest_in_corner));
       ++number_of_transpositions;
     }
 
     RealScalar beta;
-    m_qr.col(k).tail(rows-k).makeHouseholderInPlace(m_hCoeffs.coeffRef(k), beta);
-    m_qr.coeffRef(k,k) = beta;
+    m_qr.col(k).tail(rows - k).makeHouseholderInPlace(m_hCoeffs.coeffRef(k), beta);
+    m_qr.coeffRef(k, k) = beta;
 
     // remember the maximum absolute value of diagonal coefficients
-    if(abs(beta) > m_maxpivot) m_maxpivot = abs(beta);
+    if (abs(beta) > m_maxpivot) m_maxpivot = abs(beta);
 
-    m_qr.bottomRightCorner(rows-k, cols-k-1)
-        .applyHouseholderOnTheLeft(m_qr.col(k).tail(rows-k-1), m_hCoeffs.coeffRef(k), &m_temp.coeffRef(k+1));
+    m_qr.bottomRightCorner(rows - k, cols - k - 1)
+        .applyHouseholderOnTheLeft(m_qr.col(k).tail(rows - k - 1), m_hCoeffs.coeffRef(k), &m_temp.coeffRef(k + 1));
   }
 
   m_cols_permutation.setIdentity(cols);
-  for(Index k = 0; k < size; ++k)
-    m_cols_permutation.applyTranspositionOnTheRight(k, m_cols_transpositions.coeff(k));
+  for (Index k = 0; k < size; ++k) m_cols_permutation.applyTranspositionOnTheRight(k, m_cols_transpositions.coeff(k));
 
-  m_det_pq = (number_of_transpositions%2) ? -1 : 1;
+  m_det_p = (number_of_transpositions % 2) ? -1 : 1;
   m_isInitialized = true;
+}
 
-  return *this;
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template <typename MatrixType_, typename PermutationIndex_>
+template <typename RhsType, typename DstType>
+void FullPivHouseholderQR<MatrixType_, PermutationIndex_>::_solve_impl(const RhsType& rhs, DstType& dst) const {
+  const Index l_rank = rank();
+
+  // FIXME introduce nonzeroPivots() and use it here. and more generally,
+  // make the same improvements in this dec as in FullPivLU.
+  if (l_rank == 0) {
+    dst.setZero();
+    return;
+  }
+
+  typename RhsType::PlainObject c(rhs);
+
+  Matrix<typename RhsType::Scalar, 1, RhsType::ColsAtCompileTime> temp(rhs.cols());
+  for (Index k = 0; k < l_rank; ++k) {
+    Index remainingSize = rows() - k;
+    c.row(k).swap(c.row(m_rows_transpositions.coeff(k)));
+    c.bottomRightCorner(remainingSize, rhs.cols())
+        .applyHouseholderOnTheLeft(m_qr.col(k).tail(remainingSize - 1), m_hCoeffs.coeff(k), &temp.coeffRef(0));
+  }
+
+  m_qr.topLeftCorner(l_rank, l_rank).template triangularView<Upper>().solveInPlace(c.topRows(l_rank));
+
+  for (Index i = 0; i < l_rank; ++i) dst.row(m_cols_permutation.indices().coeff(i)) = c.row(i);
+  for (Index i = l_rank; i < cols(); ++i) dst.row(m_cols_permutation.indices().coeff(i)).setZero();
 }
 
-namespace internal {
+template <typename MatrixType_, typename PermutationIndex_>
+template <bool Conjugate, typename RhsType, typename DstType>
+void FullPivHouseholderQR<MatrixType_, PermutationIndex_>::_solve_impl_transposed(const RhsType& rhs,
+                                                                                  DstType& dst) const {
+  const Index l_rank = rank();
 
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<FullPivHouseholderQR<_MatrixType>, Rhs>
-  : solve_retval_base<FullPivHouseholderQR<_MatrixType>, Rhs>
-{
-  EIGEN_MAKE_SOLVE_HELPERS(FullPivHouseholderQR<_MatrixType>,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    const Index rows = dec().rows(), cols = dec().cols();
-    eigen_assert(rhs().rows() == rows);
-
-    // FIXME introduce nonzeroPivots() and use it here. and more generally,
-    // make the same improvements in this dec as in FullPivLU.
-    if(dec().rank()==0)
-    {
-      dst.setZero();
-      return;
-    }
+  if (l_rank == 0) {
+    dst.setZero();
+    return;
+  }
 
-    typename Rhs::PlainObject c(rhs());
+  typename RhsType::PlainObject c(m_cols_permutation.transpose() * rhs);
 
-    Matrix<Scalar,1,Rhs::ColsAtCompileTime> temp(rhs().cols());
-    for (Index k = 0; k < dec().rank(); ++k)
-    {
-      Index remainingSize = rows-k;
-      c.row(k).swap(c.row(dec().rowsTranspositions().coeff(k)));
-      c.bottomRightCorner(remainingSize, rhs().cols())
-       .applyHouseholderOnTheLeft(dec().matrixQR().col(k).tail(remainingSize-1),
-                                  dec().hCoeffs().coeff(k), &temp.coeffRef(0));
-    }
+  m_qr.topLeftCorner(l_rank, l_rank)
+      .template triangularView<Upper>()
+      .transpose()
+      .template conjugateIf<Conjugate>()
+      .solveInPlace(c.topRows(l_rank));
+
+  dst.topRows(l_rank) = c.topRows(l_rank);
+  dst.bottomRows(rows() - l_rank).setZero();
+
+  Matrix<Scalar, 1, DstType::ColsAtCompileTime> temp(dst.cols());
+  const Index size = (std::min)(rows(), cols());
+  for (Index k = size - 1; k >= 0; --k) {
+    Index remainingSize = rows() - k;
 
-    dec().matrixQR()
-       .topLeftCorner(dec().rank(), dec().rank())
-       .template triangularView<Upper>()
-       .solveInPlace(c.topRows(dec().rank()));
+    dst.bottomRightCorner(remainingSize, dst.cols())
+        .applyHouseholderOnTheLeft(m_qr.col(k).tail(remainingSize - 1).template conjugateIf<!Conjugate>(),
+                                   m_hCoeffs.template conjugateIf<Conjugate>().coeff(k), &temp.coeffRef(0));
 
-    for(Index i = 0; i < dec().rank(); ++i) dst.row(dec().colsPermutation().indices().coeff(i)) = c.row(i);
-    for(Index i = dec().rank(); i < cols; ++i) dst.row(dec().colsPermutation().indices().coeff(i)).setZero();
+    dst.row(k).swap(dst.row(m_rows_transpositions.coeff(k)));
+  }
+}
+#endif
+
+namespace internal {
+
+template <typename DstXprType, typename MatrixType, typename PermutationIndex>
+struct Assignment<DstXprType, Inverse<FullPivHouseholderQR<MatrixType, PermutationIndex> >,
+                  internal::assign_op<typename DstXprType::Scalar,
+                                      typename FullPivHouseholderQR<MatrixType, PermutationIndex>::Scalar>,
+                  Dense2Dense> {
+  typedef FullPivHouseholderQR<MatrixType, PermutationIndex> QrType;
+  typedef Inverse<QrType> SrcXprType;
+  static void run(DstXprType& dst, const SrcXprType& src,
+                  const internal::assign_op<typename DstXprType::Scalar, typename QrType::Scalar>&) {
+    dst = src.nestedExpression().solve(MatrixType::Identity(src.rows(), src.cols()));
   }
 };
 
 /** \ingroup QR_Module
-  *
-  * \brief Expression type for return value of FullPivHouseholderQR::matrixQ()
-  *
-  * \tparam MatrixType type of underlying dense matrix
-  */
-template<typename MatrixType> struct FullPivHouseholderQRMatrixQReturnType
-  : public ReturnByValue<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
-{
-public:
-  typedef typename MatrixType::Index Index;
-  typedef typename FullPivHouseholderQR<MatrixType>::IntDiagSizeVectorType IntDiagSizeVectorType;
+ *
+ * \brief Expression type for return value of FullPivHouseholderQR::matrixQ()
+ *
+ * \tparam MatrixType type of underlying dense matrix
+ */
+template <typename MatrixType, typename PermutationIndex>
+struct FullPivHouseholderQRMatrixQReturnType
+    : public ReturnByValue<FullPivHouseholderQRMatrixQReturnType<MatrixType, PermutationIndex> > {
+ public:
+  typedef typename FullPivHouseholderQR<MatrixType, PermutationIndex>::IntDiagSizeVectorType IntDiagSizeVectorType;
   typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
   typedef Matrix<typename MatrixType::Scalar, 1, MatrixType::RowsAtCompileTime, RowMajor, 1,
-                 MatrixType::MaxRowsAtCompileTime> WorkVectorType;
+                 MatrixType::MaxRowsAtCompileTime>
+      WorkVectorType;
 
-  FullPivHouseholderQRMatrixQReturnType(const MatrixType&       qr,
-                                        const HCoeffsType&      hCoeffs,
+  FullPivHouseholderQRMatrixQReturnType(const MatrixType& qr, const HCoeffsType& hCoeffs,
                                         const IntDiagSizeVectorType& rowsTranspositions)
-    : m_qr(qr),
-      m_hCoeffs(hCoeffs),
-      m_rowsTranspositions(rowsTranspositions)
-      {}
+      : m_qr(qr), m_hCoeffs(hCoeffs), m_rowsTranspositions(rowsTranspositions) {}
 
   template <typename ResultType>
-  void evalTo(ResultType& result) const
-  {
+  void evalTo(ResultType& result) const {
     const Index rows = m_qr.rows();
     WorkVectorType workspace(rows);
     evalTo(result, workspace);
   }
 
   template <typename ResultType>
-  void evalTo(ResultType& result, WorkVectorType& workspace) const
-  {
+  void evalTo(ResultType& result, WorkVectorType& workspace) const {
     using numext::conj;
     // compute the product H'_0 H'_1 ... H'_n-1,
     // where H_k is the k-th Householder transformation I - h_k v_k v_k'
@@ -580,43 +687,47 @@ template<typename MatrixType> struct FullPivHouseholderQRMatrixQReturnType
     const Index size = (std::min)(rows, cols);
     workspace.resize(rows);
     result.setIdentity(rows, rows);
-    for (Index k = size-1; k >= 0; k--)
-    {
-      result.block(k, k, rows-k, rows-k)
-            .applyHouseholderOnTheLeft(m_qr.col(k).tail(rows-k-1), conj(m_hCoeffs.coeff(k)), &workspace.coeffRef(k));
+    for (Index k = size - 1; k >= 0; k--) {
+      result.block(k, k, rows - k, rows - k)
+          .applyHouseholderOnTheLeft(m_qr.col(k).tail(rows - k - 1), conj(m_hCoeffs.coeff(k)), &workspace.coeffRef(k));
       result.row(k).swap(result.row(m_rowsTranspositions.coeff(k)));
     }
   }
 
-    Index rows() const { return m_qr.rows(); }
-    Index cols() const { return m_qr.rows(); }
+  Index rows() const { return m_qr.rows(); }
+  Index cols() const { return m_qr.rows(); }
 
-protected:
+ protected:
   typename MatrixType::Nested m_qr;
   typename HCoeffsType::Nested m_hCoeffs;
   typename IntDiagSizeVectorType::Nested m_rowsTranspositions;
 };
 
-} // end namespace internal
+// template<typename MatrixType>
+// struct evaluator<FullPivHouseholderQRMatrixQReturnType<MatrixType> >
+//  : public evaluator<ReturnByValue<FullPivHouseholderQRMatrixQReturnType<MatrixType> > >
+// {};
+
+}  // end namespace internal
 
-template<typename MatrixType>
-inline typename FullPivHouseholderQR<MatrixType>::MatrixQReturnType FullPivHouseholderQR<MatrixType>::matrixQ() const
-{
+template <typename MatrixType, typename PermutationIndex>
+inline typename FullPivHouseholderQR<MatrixType, PermutationIndex>::MatrixQReturnType
+FullPivHouseholderQR<MatrixType, PermutationIndex>::matrixQ() const {
   eigen_assert(m_isInitialized && "FullPivHouseholderQR is not initialized.");
   return MatrixQReturnType(m_qr, m_hCoeffs, m_rows_transpositions);
 }
 
 /** \return the full-pivoting Householder QR decomposition of \c *this.
-  *
-  * \sa class FullPivHouseholderQR
-  */
-template<typename Derived>
-const FullPivHouseholderQR<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::fullPivHouseholderQr() const
-{
-  return FullPivHouseholderQR<PlainObject>(eval());
+ *
+ * \sa class FullPivHouseholderQR
+ */
+template <typename Derived>
+template <typename PermutationIndex>
+const FullPivHouseholderQR<typename MatrixBase<Derived>::PlainObject, PermutationIndex>
+MatrixBase<Derived>::fullPivHouseholderQr() const {
+  return FullPivHouseholderQR<PlainObject, PermutationIndex>(eval());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_FULLPIVOTINGHOUSEHOLDERQR_H
+#endif  // EIGEN_FULLPIVOTINGHOUSEHOLDERQR_H
diff --git a/inst/include/Eigen/src/QR/HouseholderQR.h b/inst/include/Eigen/src/QR/HouseholderQR.h
index 343a6649..497085db 100644
--- a/inst/include/Eigen/src/QR/HouseholderQR.h
+++ b/inst/include/Eigen/src/QR/HouseholderQR.h
@@ -12,285 +12,442 @@
 #ifndef EIGEN_QR_H
 #define EIGEN_QR_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename MatrixType_>
+struct traits<HouseholderQR<MatrixType_>> : traits<MatrixType_> {
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
+  enum { Flags = 0 };
+};
+
+}  // end namespace internal
 
 /** \ingroup QR_Module
-  *
-  *
-  * \class HouseholderQR
-  *
-  * \brief Householder QR decomposition of a matrix
-  *
-  * \param MatrixType the type of the matrix of which we are computing the QR decomposition
-  *
-  * This class performs a QR decomposition of a matrix \b A into matrices \b Q and \b R
-  * such that 
-  * \f[
-  *  \mathbf{A} = \mathbf{Q} \, \mathbf{R}
-  * \f]
-  * by using Householder transformations. Here, \b Q a unitary matrix and \b R an upper triangular matrix.
-  * The result is stored in a compact way compatible with LAPACK.
-  *
-  * Note that no pivoting is performed. This is \b not a rank-revealing decomposition.
-  * If you want that feature, use FullPivHouseholderQR or ColPivHouseholderQR instead.
-  *
-  * This Householder QR decomposition is faster, but less numerically stable and less feature-full than
-  * FullPivHouseholderQR or ColPivHouseholderQR.
-  *
-  * \sa MatrixBase::householderQr()
-  */
-template<typename _MatrixType> class HouseholderQR
-{
-  public:
-
-    typedef _MatrixType MatrixType;
-    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      Options = MatrixType::Options,
-      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
-    };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, (MatrixType::Flags&RowMajorBit) ? RowMajor : ColMajor, MaxRowsAtCompileTime, MaxRowsAtCompileTime> MatrixQType;
-    typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
-    typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
-    typedef HouseholderSequence<MatrixType,typename internal::remove_all<typename HCoeffsType::ConjugateReturnType>::type> HouseholderSequenceType;
-
-    /**
-      * \brief Default Constructor.
-      *
-      * The default constructor is useful in cases in which the user intends to
-      * perform decompositions via HouseholderQR::compute(const MatrixType&).
-      */
-    HouseholderQR() : m_qr(), m_hCoeffs(), m_temp(), m_isInitialized(false) {}
-
-    /** \brief Default Constructor with memory preallocation
-      *
-      * Like the default constructor but with preallocation of the internal data
-      * according to the specified problem \a size.
-      * \sa HouseholderQR()
-      */
-    HouseholderQR(Index rows, Index cols)
-      : m_qr(rows, cols),
-        m_hCoeffs((std::min)(rows,cols)),
-        m_temp(cols),
-        m_isInitialized(false) {}
-
-    /** \brief Constructs a QR factorization from a given matrix
-      *
-      * This constructor computes the QR factorization of the matrix \a matrix by calling
-      * the method compute(). It is a short cut for:
-      * 
-      * \code
-      * HouseholderQR<MatrixType> qr(matrix.rows(), matrix.cols());
-      * qr.compute(matrix);
-      * \endcode
-      * 
-      * \sa compute()
-      */
-    HouseholderQR(const MatrixType& matrix)
+ *
+ *
+ * \class HouseholderQR
+ *
+ * \brief Householder QR decomposition of a matrix
+ *
+ * \tparam MatrixType_ the type of the matrix of which we are computing the QR decomposition
+ *
+ * This class performs a QR decomposition of a matrix \b A into matrices \b Q and \b R
+ * such that
+ * \f[
+ *  \mathbf{A} = \mathbf{Q} \, \mathbf{R}
+ * \f]
+ * by using Householder transformations. Here, \b Q a unitary matrix and \b R an upper triangular matrix.
+ * The result is stored in a compact way compatible with LAPACK.
+ *
+ * Note that no pivoting is performed. This is \b not a rank-revealing decomposition.
+ * If you want that feature, use FullPivHouseholderQR or ColPivHouseholderQR instead.
+ *
+ * This Householder QR decomposition is faster, but less numerically stable and less feature-full than
+ * FullPivHouseholderQR or ColPivHouseholderQR.
+ *
+ * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism.
+ *
+ * \sa MatrixBase::householderQr()
+ */
+template <typename MatrixType_>
+class HouseholderQR : public SolverBase<HouseholderQR<MatrixType_>> {
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef SolverBase<HouseholderQR> Base;
+  friend class SolverBase<HouseholderQR>;
+
+  EIGEN_GENERIC_PUBLIC_INTERFACE(HouseholderQR)
+  enum {
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+  typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, (MatrixType::Flags & RowMajorBit) ? RowMajor : ColMajor,
+                 MaxRowsAtCompileTime, MaxRowsAtCompileTime>
+      MatrixQType;
+  typedef typename internal::plain_diag_type<MatrixType>::type HCoeffsType;
+  typedef typename internal::plain_row_type<MatrixType>::type RowVectorType;
+  typedef HouseholderSequence<MatrixType, internal::remove_all_t<typename HCoeffsType::ConjugateReturnType>>
+      HouseholderSequenceType;
+
+  /** \brief Reports whether the QR factorization was successful.
+   *
+   * \note This function always returns \c Success. It is provided for compatibility
+   * with other factorization routines.
+   * \returns \c Success
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "HouseHolderQR is not initialized.");
+    return Success;
+  }
+
+  /**
+   * \brief Default Constructor.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via HouseholderQR::compute(const MatrixType&).
+   */
+  HouseholderQR() : m_qr(), m_hCoeffs(), m_temp(), m_isInitialized(false) {}
+
+  /** \brief Default Constructor with memory preallocation
+   *
+   * Like the default constructor but with preallocation of the internal data
+   * according to the specified problem \a size.
+   * \sa HouseholderQR()
+   */
+  HouseholderQR(Index rows, Index cols)
+      : m_qr(rows, cols), m_hCoeffs((std::min)(rows, cols)), m_temp(cols), m_isInitialized(false) {}
+
+  /** \brief Constructs a QR factorization from a given matrix
+   *
+   * This constructor computes the QR factorization of the matrix \a matrix by calling
+   * the method compute(). It is a short cut for:
+   *
+   * \code
+   * HouseholderQR<MatrixType> qr(matrix.rows(), matrix.cols());
+   * qr.compute(matrix);
+   * \endcode
+   *
+   * \sa compute()
+   */
+  template <typename InputType>
+  explicit HouseholderQR(const EigenBase<InputType>& matrix)
       : m_qr(matrix.rows(), matrix.cols()),
-        m_hCoeffs((std::min)(matrix.rows(),matrix.cols())),
+        m_hCoeffs((std::min)(matrix.rows(), matrix.cols())),
         m_temp(matrix.cols()),
-        m_isInitialized(false)
-    {
-      compute(matrix);
-    }
+        m_isInitialized(false) {
+    compute(matrix.derived());
+  }
 
-    /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
-      * *this is the QR decomposition, if any exists.
-      *
-      * \param b the right-hand-side of the equation to solve.
-      *
-      * \returns a solution.
-      *
-      * \note The case where b is a matrix is not yet implemented. Also, this
-      *       code is space inefficient.
-      *
-      * \note_about_checking_solutions
-      *
-      * \note_about_arbitrary_choice_of_solution
-      *
-      * Example: \include HouseholderQR_solve.cpp
-      * Output: \verbinclude HouseholderQR_solve.out
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<HouseholderQR, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
-      return internal::solve_retval<HouseholderQR, Rhs>(*this, b.derived());
-    }
+  /** \brief Constructs a QR factorization from a given matrix
+   *
+   * This overloaded constructor is provided for \link InplaceDecomposition inplace decomposition \endlink when
+   * \c MatrixType is a Eigen::Ref.
+   *
+   * \sa HouseholderQR(const EigenBase&)
+   */
+  template <typename InputType>
+  explicit HouseholderQR(EigenBase<InputType>& matrix)
+      : m_qr(matrix.derived()),
+        m_hCoeffs((std::min)(matrix.rows(), matrix.cols())),
+        m_temp(matrix.cols()),
+        m_isInitialized(false) {
+    computeInPlace();
+  }
 
-    /** This method returns an expression of the unitary matrix Q as a sequence of Householder transformations.
-      *
-      * The returned expression can directly be used to perform matrix products. It can also be assigned to a dense Matrix object.
-      * Here is an example showing how to recover the full or thin matrix Q, as well as how to perform matrix products using operator*:
-      *
-      * Example: \include HouseholderQR_householderQ.cpp
-      * Output: \verbinclude HouseholderQR_householderQ.out
-      */
-    HouseholderSequenceType householderQ() const
-    {
-      eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
-      return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate());
-    }
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  /** This method finds a solution x to the equation Ax=b, where A is the matrix of which
+   * *this is the QR decomposition, if any exists.
+   *
+   * \param b the right-hand-side of the equation to solve.
+   *
+   * \returns a solution.
+   *
+   * \note_about_checking_solutions
+   *
+   * \note_about_arbitrary_choice_of_solution
+   *
+   * Example: \include HouseholderQR_solve.cpp
+   * Output: \verbinclude HouseholderQR_solve.out
+   */
+  template <typename Rhs>
+  inline const Solve<HouseholderQR, Rhs> solve(const MatrixBase<Rhs>& b) const;
+#endif
+
+  /** This method returns an expression of the unitary matrix Q as a sequence of Householder transformations.
+   *
+   * The returned expression can directly be used to perform matrix products. It can also be assigned to a dense Matrix
+   * object. Here is an example showing how to recover the full or thin matrix Q, as well as how to perform matrix
+   * products using operator*:
+   *
+   * Example: \include HouseholderQR_householderQ.cpp
+   * Output: \verbinclude HouseholderQR_householderQ.out
+   */
+  HouseholderSequenceType householderQ() const {
+    eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
+    return HouseholderSequenceType(m_qr, m_hCoeffs.conjugate());
+  }
 
-    /** \returns a reference to the matrix where the Householder QR decomposition is stored
-      * in a LAPACK-compatible way.
-      */
-    const MatrixType& matrixQR() const
-    {
-        eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
-        return m_qr;
+  /** \returns a reference to the matrix where the Householder QR decomposition is stored
+   * in a LAPACK-compatible way.
+   */
+  const MatrixType& matrixQR() const {
+    eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
+    return m_qr;
+  }
+
+  template <typename InputType>
+  HouseholderQR& compute(const EigenBase<InputType>& matrix) {
+    m_qr = matrix.derived();
+    computeInPlace();
+    return *this;
+  }
+
+  /** \returns the determinant of the matrix of which
+   * *this is the QR decomposition. It has only linear complexity
+   * (that is, O(n) where n is the dimension of the square matrix)
+   * as the QR decomposition has already been computed.
+   *
+   * \note This is only for square matrices.
+   *
+   * \warning a determinant can be very big or small, so for matrices
+   * of large enough dimension, there is a risk of overflow/underflow.
+   * One way to work around that is to use logAbsDeterminant() instead.
+   * Also, do not rely on the determinant being exactly zero for testing
+   * singularity or rank-deficiency.
+   *
+   * \sa absDeterminant(), logAbsDeterminant(), MatrixBase::determinant()
+   */
+  typename MatrixType::Scalar determinant() const;
+
+  /** \returns the absolute value of the determinant of the matrix of which
+   * *this is the QR decomposition. It has only linear complexity
+   * (that is, O(n) where n is the dimension of the square matrix)
+   * as the QR decomposition has already been computed.
+   *
+   * \note This is only for square matrices.
+   *
+   * \warning a determinant can be very big or small, so for matrices
+   * of large enough dimension, there is a risk of overflow/underflow.
+   * One way to work around that is to use logAbsDeterminant() instead.
+   * Also, do not rely on the determinant being exactly zero for testing
+   * singularity or rank-deficiency.
+   *
+   * \sa determinant(), logAbsDeterminant(), MatrixBase::determinant()
+   */
+  typename MatrixType::RealScalar absDeterminant() const;
+
+  /** \returns the natural log of the absolute value of the determinant of the matrix of which
+   * *this is the QR decomposition. It has only linear complexity
+   * (that is, O(n) where n is the dimension of the square matrix)
+   * as the QR decomposition has already been computed.
+   *
+   * \note This is only for square matrices.
+   *
+   * \note This method is useful to work around the risk of overflow/underflow that's inherent
+   * to determinant computation.
+   *
+   * \warning Do not rely on the determinant being exactly zero for testing
+   * singularity or rank-deficiency.
+   *
+   * \sa determinant(), absDeterminant(), MatrixBase::determinant()
+   */
+  typename MatrixType::RealScalar logAbsDeterminant() const;
+
+  /** \returns the sign of the determinant of the matrix of which
+   * *this is the QR decomposition. It has only linear complexity
+   * (that is, O(n) where n is the dimension of the square matrix)
+   * as the QR decomposition has already been computed.
+   *
+   * \note This is only for square matrices.
+   *
+   * \note This method is useful to work around the risk of overflow/underflow that's inherent
+   * to determinant computation.
+   *
+   * \warning Do not rely on the determinant being exactly zero for testing
+   * singularity or rank-deficiency.
+   *
+   * \sa determinant(), absDeterminant(), MatrixBase::determinant()
+   */
+  typename MatrixType::Scalar signDeterminant() const;
+
+  inline Index rows() const { return m_qr.rows(); }
+  inline Index cols() const { return m_qr.cols(); }
+
+  /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q.
+   *
+   * For advanced uses only.
+   */
+  const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  template <typename RhsType, typename DstType>
+  void _solve_impl(const RhsType& rhs, DstType& dst) const;
+
+  template <bool Conjugate, typename RhsType, typename DstType>
+  void _solve_impl_transposed(const RhsType& rhs, DstType& dst) const;
+#endif
+
+ protected:
+  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+
+  void computeInPlace();
+
+  MatrixType m_qr;
+  HCoeffsType m_hCoeffs;
+  RowVectorType m_temp;
+  bool m_isInitialized;
+};
+
+namespace internal {
+
+/** \internal */
+template <typename HCoeffs, typename Scalar, bool IsComplex>
+struct householder_determinant {
+  static void run(const HCoeffs& hCoeffs, Scalar& out_det) {
+    out_det = Scalar(1);
+    Index size = hCoeffs.rows();
+    for (Index i = 0; i < size; i++) {
+      // For each valid reflection Q_n,
+      // det(Q_n) = - conj(h_n) / h_n
+      // where h_n is the Householder coefficient.
+      if (hCoeffs(i) != Scalar(0)) out_det *= -numext::conj(hCoeffs(i)) / hCoeffs(i);
     }
+  }
+};
 
-    HouseholderQR& compute(const MatrixType& matrix);
-
-    /** \returns the absolute value of the determinant of the matrix of which
-      * *this is the QR decomposition. It has only linear complexity
-      * (that is, O(n) where n is the dimension of the square matrix)
-      * as the QR decomposition has already been computed.
-      *
-      * \note This is only for square matrices.
-      *
-      * \warning a determinant can be very big or small, so for matrices
-      * of large enough dimension, there is a risk of overflow/underflow.
-      * One way to work around that is to use logAbsDeterminant() instead.
-      *
-      * \sa logAbsDeterminant(), MatrixBase::determinant()
-      */
-    typename MatrixType::RealScalar absDeterminant() const;
-
-    /** \returns the natural log of the absolute value of the determinant of the matrix of which
-      * *this is the QR decomposition. It has only linear complexity
-      * (that is, O(n) where n is the dimension of the square matrix)
-      * as the QR decomposition has already been computed.
-      *
-      * \note This is only for square matrices.
-      *
-      * \note This method is useful to work around the risk of overflow/underflow that's inherent
-      * to determinant computation.
-      *
-      * \sa absDeterminant(), MatrixBase::determinant()
-      */
-    typename MatrixType::RealScalar logAbsDeterminant() const;
-
-    inline Index rows() const { return m_qr.rows(); }
-    inline Index cols() const { return m_qr.cols(); }
-    
-    /** \returns a const reference to the vector of Householder coefficients used to represent the factor \c Q.
-      * 
-      * For advanced uses only.
-      */
-    const HCoeffsType& hCoeffs() const { return m_hCoeffs; }
-
-  protected:
-    
-    static void check_template_parameters()
-    {
-      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
+/** \internal */
+template <typename HCoeffs, typename Scalar>
+struct householder_determinant<HCoeffs, Scalar, false> {
+  static void run(const HCoeffs& hCoeffs, Scalar& out_det) {
+    bool negated = false;
+    Index size = hCoeffs.rows();
+    for (Index i = 0; i < size; i++) {
+      // Each valid reflection negates the determinant.
+      if (hCoeffs(i) != Scalar(0)) negated ^= true;
     }
-    
-    MatrixType m_qr;
-    HCoeffsType m_hCoeffs;
-    RowVectorType m_temp;
-    bool m_isInitialized;
+    out_det = negated ? Scalar(-1) : Scalar(1);
+  }
 };
 
-template<typename MatrixType>
-typename MatrixType::RealScalar HouseholderQR<MatrixType>::absDeterminant() const
-{
+}  // end namespace internal
+
+template <typename MatrixType>
+typename MatrixType::Scalar HouseholderQR<MatrixType>::determinant() const {
+  eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
+  eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
+  Scalar detQ;
+  internal::householder_determinant<HCoeffsType, Scalar, NumTraits<Scalar>::IsComplex>::run(m_hCoeffs, detQ);
+  return m_qr.diagonal().prod() * detQ;
+}
+
+template <typename MatrixType>
+typename MatrixType::RealScalar HouseholderQR<MatrixType>::absDeterminant() const {
   using std::abs;
   eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
   eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
   return abs(m_qr.diagonal().prod());
 }
 
-template<typename MatrixType>
-typename MatrixType::RealScalar HouseholderQR<MatrixType>::logAbsDeterminant() const
-{
+template <typename MatrixType>
+typename MatrixType::RealScalar HouseholderQR<MatrixType>::logAbsDeterminant() const {
   eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
   eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
   return m_qr.diagonal().cwiseAbs().array().log().sum();
 }
 
+template <typename MatrixType>
+typename MatrixType::Scalar HouseholderQR<MatrixType>::signDeterminant() const {
+  eigen_assert(m_isInitialized && "HouseholderQR is not initialized.");
+  eigen_assert(m_qr.rows() == m_qr.cols() && "You can't take the determinant of a non-square matrix!");
+  Scalar detQ;
+  internal::householder_determinant<HCoeffsType, Scalar, NumTraits<Scalar>::IsComplex>::run(m_hCoeffs, detQ);
+  return detQ * m_qr.diagonal().array().sign().prod();
+}
+
 namespace internal {
 
 /** \internal */
-template<typename MatrixQR, typename HCoeffs>
-void householder_qr_inplace_unblocked(MatrixQR& mat, HCoeffs& hCoeffs, typename MatrixQR::Scalar* tempData = 0)
-{
-  typedef typename MatrixQR::Index Index;
+template <typename MatrixQR, typename HCoeffs>
+void householder_qr_inplace_unblocked(MatrixQR& mat, HCoeffs& hCoeffs, typename MatrixQR::Scalar* tempData = 0) {
   typedef typename MatrixQR::Scalar Scalar;
   typedef typename MatrixQR::RealScalar RealScalar;
   Index rows = mat.rows();
   Index cols = mat.cols();
-  Index size = (std::min)(rows,cols);
+  Index size = (std::min)(rows, cols);
 
   eigen_assert(hCoeffs.size() == size);
 
-  typedef Matrix<Scalar,MatrixQR::ColsAtCompileTime,1> TempType;
+  typedef Matrix<Scalar, MatrixQR::ColsAtCompileTime, 1> TempType;
   TempType tempVector;
-  if(tempData==0)
-  {
+  if (tempData == 0) {
     tempVector.resize(cols);
     tempData = tempVector.data();
   }
 
-  for(Index k = 0; k < size; ++k)
-  {
+  for (Index k = 0; k < size; ++k) {
     Index remainingRows = rows - k;
     Index remainingCols = cols - k - 1;
 
     RealScalar beta;
     mat.col(k).tail(remainingRows).makeHouseholderInPlace(hCoeffs.coeffRef(k), beta);
-    mat.coeffRef(k,k) = beta;
+    mat.coeffRef(k, k) = beta;
 
     // apply H to remaining part of m_qr from the left
     mat.bottomRightCorner(remainingRows, remainingCols)
-        .applyHouseholderOnTheLeft(mat.col(k).tail(remainingRows-1), hCoeffs.coeffRef(k), tempData+k+1);
+        .applyHouseholderOnTheLeft(mat.col(k).tail(remainingRows - 1), hCoeffs.coeffRef(k), tempData + k + 1);
+  }
+}
+
+// TODO: add a corresponding public API for updating a QR factorization
+/** \internal
+ * Basically a modified copy of @c Eigen::internal::householder_qr_inplace_unblocked that
+ * performs a rank-1 update of the QR matrix in compact storage. This function assumes, that
+ * the first @c k-1 columns of the matrix @c mat contain the QR decomposition of \f$A^N\f$ up to
+ * column k-1. Then the QR decomposition of the k-th column (given by @c newColumn) is computed by
+ * applying the k-1 Householder projectors on it and finally compute the projector \f$H_k\f$ of
+ * it. On exit the matrix @c mat and the vector @c hCoeffs contain the QR decomposition of the
+ * first k columns of \f$A^N\f$. The \a tempData argument must point to at least mat.cols() scalars.  */
+template <typename MatrixQR, typename HCoeffs, typename VectorQR>
+void householder_qr_inplace_update(MatrixQR& mat, HCoeffs& hCoeffs, const VectorQR& newColumn,
+                                   typename MatrixQR::Index k, typename MatrixQR::Scalar* tempData) {
+  typedef typename MatrixQR::Index Index;
+  typedef typename MatrixQR::RealScalar RealScalar;
+  Index rows = mat.rows();
+
+  eigen_assert(k < mat.cols());
+  eigen_assert(k < rows);
+  eigen_assert(hCoeffs.size() == mat.cols());
+  eigen_assert(newColumn.size() == rows);
+  eigen_assert(tempData);
+
+  // Store new column in mat at column k
+  mat.col(k) = newColumn;
+  // Apply H = H_1...H_{k-1} on newColumn (skip if k=0)
+  for (Index i = 0; i < k; ++i) {
+    Index remainingRows = rows - i;
+    mat.col(k)
+        .tail(remainingRows)
+        .applyHouseholderOnTheLeft(mat.col(i).tail(remainingRows - 1), hCoeffs.coeffRef(i), tempData + i + 1);
   }
+  // Construct Householder projector in-place in column k
+  RealScalar beta;
+  mat.col(k).tail(rows - k).makeHouseholderInPlace(hCoeffs.coeffRef(k), beta);
+  mat.coeffRef(k, k) = beta;
 }
 
 /** \internal */
-template<typename MatrixQR, typename HCoeffs,
-  typename MatrixQRScalar = typename MatrixQR::Scalar,
-  bool InnerStrideIsOne = (MatrixQR::InnerStrideAtCompileTime == 1 && HCoeffs::InnerStrideAtCompileTime == 1)>
-struct householder_qr_inplace_blocked
-{
-  // This is specialized for MKL-supported Scalar types in HouseholderQR_MKL.h
-  static void run(MatrixQR& mat, HCoeffs& hCoeffs,
-      typename MatrixQR::Index maxBlockSize=32,
-      typename MatrixQR::Scalar* tempData = 0)
-  {
-    typedef typename MatrixQR::Index Index;
+template <typename MatrixQR, typename HCoeffs, typename MatrixQRScalar = typename MatrixQR::Scalar,
+          bool InnerStrideIsOne = (MatrixQR::InnerStrideAtCompileTime == 1 && HCoeffs::InnerStrideAtCompileTime == 1)>
+struct householder_qr_inplace_blocked {
+  // This is specialized for LAPACK-supported Scalar types in HouseholderQR_LAPACKE.h
+  static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index maxBlockSize = 32, typename MatrixQR::Scalar* tempData = 0) {
     typedef typename MatrixQR::Scalar Scalar;
-    typedef Block<MatrixQR,Dynamic,Dynamic> BlockType;
+    typedef Block<MatrixQR, Dynamic, Dynamic> BlockType;
 
     Index rows = mat.rows();
     Index cols = mat.cols();
     Index size = (std::min)(rows, cols);
 
-    typedef Matrix<Scalar,Dynamic,1,ColMajor,MatrixQR::MaxColsAtCompileTime,1> TempType;
+    typedef Matrix<Scalar, Dynamic, 1, ColMajor, MatrixQR::MaxColsAtCompileTime, 1> TempType;
     TempType tempVector;
-    if(tempData==0)
-    {
+    if (tempData == 0) {
       tempVector.resize(cols);
       tempData = tempVector.data();
     }
 
-    Index blockSize = (std::min)(maxBlockSize,size);
+    Index blockSize = (std::min)(maxBlockSize, size);
 
     Index k = 0;
-    for (k = 0; k < size; k += blockSize)
-    {
-      Index bs = (std::min)(size-k,blockSize);  // actual size of the block
-      Index tcols = cols - k - bs;            // trailing columns
-      Index brows = rows-k;                   // rows of the block
+    for (k = 0; k < size; k += blockSize) {
+      Index bs = (std::min)(size - k, blockSize);  // actual size of the block
+      Index tcols = cols - k - bs;                 // trailing columns
+      Index brows = rows - k;                      // rows of the block
 
       // partition the matrix:
       //        A00 | A01 | A02
@@ -300,68 +457,69 @@ struct householder_qr_inplace_blocked
       // and update [A21^T A22^T]^T using level 3 operations.
       // Finally, the algorithm continue on A22
 
-      BlockType A11_21 = mat.block(k,k,brows,bs);
-      Block<HCoeffs,Dynamic,1> hCoeffsSegment = hCoeffs.segment(k,bs);
+      BlockType A11_21 = mat.block(k, k, brows, bs);
+      Block<HCoeffs, Dynamic, 1> hCoeffsSegment = hCoeffs.segment(k, bs);
 
       householder_qr_inplace_unblocked(A11_21, hCoeffsSegment, tempData);
 
-      if(tcols)
-      {
-        BlockType A21_22 = mat.block(k,k+bs,brows,tcols);
-        apply_block_householder_on_the_left(A21_22,A11_21,hCoeffsSegment.adjoint());
+      if (tcols) {
+        BlockType A21_22 = mat.block(k, k + bs, brows, tcols);
+        apply_block_householder_on_the_left(A21_22, A11_21, hCoeffsSegment, false);  // false == backward
       }
     }
   }
 };
 
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<HouseholderQR<_MatrixType>, Rhs>
-  : solve_retval_base<HouseholderQR<_MatrixType>, Rhs>
-{
-  EIGEN_MAKE_SOLVE_HELPERS(HouseholderQR<_MatrixType>,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    const Index rows = dec().rows(), cols = dec().cols();
-    const Index rank = (std::min)(rows, cols);
-    eigen_assert(rhs().rows() == rows);
-
-    typename Rhs::PlainObject c(rhs());
-
-    // Note that the matrix Q = H_0^* H_1^*... so its inverse is Q^* = (H_0 H_1 ...)^T
-    c.applyOnTheLeft(householderSequence(
-      dec().matrixQR().leftCols(rank),
-      dec().hCoeffs().head(rank)).transpose()
-    );
-
-    dec().matrixQR()
-       .topLeftCorner(rank, rank)
-       .template triangularView<Upper>()
-       .solveInPlace(c.topRows(rank));
-
-    dst.topRows(rank) = c.topRows(rank);
-    dst.bottomRows(cols-rank).setZero();
-  }
-};
+}  // end namespace internal
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template <typename MatrixType_>
+template <typename RhsType, typename DstType>
+void HouseholderQR<MatrixType_>::_solve_impl(const RhsType& rhs, DstType& dst) const {
+  const Index rank = (std::min)(rows(), cols());
+
+  typename RhsType::PlainObject c(rhs);
+
+  c.applyOnTheLeft(householderQ().setLength(rank).adjoint());
+
+  m_qr.topLeftCorner(rank, rank).template triangularView<Upper>().solveInPlace(c.topRows(rank));
+
+  dst.topRows(rank) = c.topRows(rank);
+  dst.bottomRows(cols() - rank).setZero();
+}
+
+template <typename MatrixType_>
+template <bool Conjugate, typename RhsType, typename DstType>
+void HouseholderQR<MatrixType_>::_solve_impl_transposed(const RhsType& rhs, DstType& dst) const {
+  const Index rank = (std::min)(rows(), cols());
+
+  typename RhsType::PlainObject c(rhs);
+
+  m_qr.topLeftCorner(rank, rank)
+      .template triangularView<Upper>()
+      .transpose()
+      .template conjugateIf<Conjugate>()
+      .solveInPlace(c.topRows(rank));
 
-} // end namespace internal
+  dst.topRows(rank) = c.topRows(rank);
+  dst.bottomRows(rows() - rank).setZero();
+
+  dst.applyOnTheLeft(householderQ().setLength(rank).template conjugateIf<!Conjugate>());
+}
+#endif
 
 /** Performs the QR factorization of the given matrix \a matrix. The result of
-  * the factorization is stored into \c *this, and a reference to \c *this
-  * is returned.
-  *
-  * \sa class HouseholderQR, HouseholderQR(const MatrixType&)
-  */
-template<typename MatrixType>
-HouseholderQR<MatrixType>& HouseholderQR<MatrixType>::compute(const MatrixType& matrix)
-{
-  check_template_parameters();
-  
-  Index rows = matrix.rows();
-  Index cols = matrix.cols();
-  Index size = (std::min)(rows,cols);
-
-  m_qr = matrix;
+ * the factorization is stored into \c *this, and a reference to \c *this
+ * is returned.
+ *
+ * \sa class HouseholderQR, HouseholderQR(const MatrixType&)
+ */
+template <typename MatrixType>
+void HouseholderQR<MatrixType>::computeInPlace() {
+  Index rows = m_qr.rows();
+  Index cols = m_qr.cols();
+  Index size = (std::min)(rows, cols);
+
   m_hCoeffs.resize(size);
 
   m_temp.resize(cols);
@@ -369,20 +527,17 @@ HouseholderQR<MatrixType>& HouseholderQR<MatrixType>::compute(const MatrixType&
   internal::householder_qr_inplace_blocked<MatrixType, HCoeffsType>::run(m_qr, m_hCoeffs, 48, m_temp.data());
 
   m_isInitialized = true;
-  return *this;
 }
 
 /** \return the Householder QR decomposition of \c *this.
-  *
-  * \sa class HouseholderQR
-  */
-template<typename Derived>
-const HouseholderQR<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::householderQr() const
-{
+ *
+ * \sa class HouseholderQR
+ */
+template <typename Derived>
+const HouseholderQR<typename MatrixBase<Derived>::PlainObject> MatrixBase<Derived>::householderQr() const {
   return HouseholderQR<PlainObject>(eval());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_QR_H
+#endif  // EIGEN_QR_H
diff --git a/inst/include/Eigen/src/QR/HouseholderQR_MKL.h b/inst/include/Eigen/src/QR/HouseholderQR_LAPACKE.h
similarity index 57%
rename from inst/include/Eigen/src/QR/HouseholderQR_MKL.h
rename to inst/include/Eigen/src/QR/HouseholderQR_LAPACKE.h
index b80f1b48..3b621757 100644
--- a/inst/include/Eigen/src/QR/HouseholderQR_MKL.h
+++ b/inst/include/Eigen/src/QR/HouseholderQR_LAPACKE.h
@@ -25,47 +25,53 @@
  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
  ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
+ *   Content : Eigen bindings to LAPACKe
  *    Householder QR decomposition of a matrix w/o pivoting based on
  *    LAPACKE_?geqrf function.
  ********************************************************************************
 */
 
-#ifndef EIGEN_QR_MKL_H
-#define EIGEN_QR_MKL_H
+#ifndef EIGEN_QR_LAPACKE_H
+#define EIGEN_QR_LAPACKE_H
 
-#include "../Core/util/MKL_support.h"
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-namespace Eigen { 
+namespace Eigen {
 
-  namespace internal {
+namespace internal {
 
-    /** \internal Specialization for the data types supported by MKL */
+namespace lapacke_helpers {
 
-#define EIGEN_MKL_QR_NOPIV(EIGTYPE, MKLTYPE, MKLPREFIX) \
-template<typename MatrixQR, typename HCoeffs> \
-struct householder_qr_inplace_blocked<MatrixQR, HCoeffs, EIGTYPE, true> \
-{ \
-  static void run(MatrixQR& mat, HCoeffs& hCoeffs, \
-      typename MatrixQR::Index = 32, \
-      typename MatrixQR::Scalar* = 0) \
-  { \
-    lapack_int m = (lapack_int) mat.rows(); \
-    lapack_int n = (lapack_int) mat.cols(); \
-    lapack_int lda = (lapack_int) mat.outerStride(); \
-    lapack_int matrix_order = (MatrixQR::IsRowMajor) ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR; \
-    LAPACKE_##MKLPREFIX##geqrf( matrix_order, m, n, (MKLTYPE*)mat.data(), lda, (MKLTYPE*)hCoeffs.data()); \
-    hCoeffs.adjointInPlace(); \
-  } \
+template <typename MatrixQR, typename HCoeffs>
+struct lapacke_hqr {
+  static void run(MatrixQR& mat, HCoeffs& hCoeffs, Index = 32, typename MatrixQR::Scalar* = 0) {
+    lapack_int m = to_lapack(mat.rows());
+    lapack_int n = to_lapack(mat.cols());
+    lapack_int lda = to_lapack(mat.outerStride());
+    lapack_int matrix_order = lapack_storage_of(mat);
+    geqrf(matrix_order, m, n, to_lapack(mat.data()), lda, to_lapack(hCoeffs.data()));
+    hCoeffs.adjointInPlace();
+  }
 };
 
-EIGEN_MKL_QR_NOPIV(double, double, d)
-EIGEN_MKL_QR_NOPIV(float, float, s)
-EIGEN_MKL_QR_NOPIV(dcomplex, MKL_Complex16, z)
-EIGEN_MKL_QR_NOPIV(scomplex, MKL_Complex8, c)
+}  // namespace lapacke_helpers
 
-} // end namespace internal
+/** \internal Specialization for the data types supported by LAPACKe */
+#define EIGEN_LAPACKE_HH_QR(EIGTYPE)                                      \
+  template <typename MatrixQR, typename HCoeffs>                          \
+  struct householder_qr_inplace_blocked<MatrixQR, HCoeffs, EIGTYPE, true> \
+      : public lapacke_helpers::lapacke_hqr<MatrixQR, HCoeffs> {};
 
-} // end namespace Eigen
+EIGEN_LAPACKE_HH_QR(double)
+EIGEN_LAPACKE_HH_QR(float)
+EIGEN_LAPACKE_HH_QR(std::complex<double>)
+EIGEN_LAPACKE_HH_QR(std::complex<float>)
 
-#endif // EIGEN_QR_MKL_H
+#undef EIGEN_LAPACKE_HH_QR
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_QR_LAPACKE_H
diff --git a/inst/include/Eigen/src/QR/InternalHeaderCheck.h b/inst/include/Eigen/src/QR/InternalHeaderCheck.h
new file mode 100644
index 00000000..bf8df01c
--- /dev/null
+++ b/inst/include/Eigen/src/QR/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_QR_MODULE_H
+#error "Please include Eigen/QR instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/SPQRSupport/InternalHeaderCheck.h b/inst/include/Eigen/src/SPQRSupport/InternalHeaderCheck.h
new file mode 100644
index 00000000..8d94ba4b
--- /dev/null
+++ b/inst/include/Eigen/src/SPQRSupport/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_SPQRSUPPORT_MODULE_H
+#error "Please include Eigen/SPQRSupport instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h b/inst/include/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
index 36138101..31327948 100644
--- a/inst/include/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
+++ b/inst/include/Eigen/src/SPQRSupport/SuiteSparseQRSupport.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2012 Desire Nuentsa <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,329 +11,305 @@
 #ifndef EIGEN_SUITESPARSEQRSUPPORT_H
 #define EIGEN_SUITESPARSEQRSUPPORT_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
-  
-  template<typename MatrixType> class SPQR; 
-  template<typename SPQRType> struct SPQRMatrixQReturnType; 
-  template<typename SPQRType> struct SPQRMatrixQTransposeReturnType; 
-  template <typename SPQRType, typename Derived> struct SPQR_QProduct;
-  namespace internal {
-    template <typename SPQRType> struct traits<SPQRMatrixQReturnType<SPQRType> >
-    {
-      typedef typename SPQRType::MatrixType ReturnType;
-    };
-    template <typename SPQRType> struct traits<SPQRMatrixQTransposeReturnType<SPQRType> >
-    {
-      typedef typename SPQRType::MatrixType ReturnType;
-    };
-    template <typename SPQRType, typename Derived> struct traits<SPQR_QProduct<SPQRType, Derived> >
-    {
-      typedef typename Derived::PlainObject ReturnType;
-    };
-  } // End namespace internal
-  
+
+template <typename MatrixType>
+class SPQR;
+template <typename SPQRType>
+struct SPQRMatrixQReturnType;
+template <typename SPQRType>
+struct SPQRMatrixQTransposeReturnType;
+template <typename SPQRType, typename Derived>
+struct SPQR_QProduct;
+namespace internal {
+template <typename SPQRType>
+struct traits<SPQRMatrixQReturnType<SPQRType> > {
+  typedef typename SPQRType::MatrixType ReturnType;
+};
+template <typename SPQRType>
+struct traits<SPQRMatrixQTransposeReturnType<SPQRType> > {
+  typedef typename SPQRType::MatrixType ReturnType;
+};
+template <typename SPQRType, typename Derived>
+struct traits<SPQR_QProduct<SPQRType, Derived> > {
+  typedef typename Derived::PlainObject ReturnType;
+};
+}  // End namespace internal
+
 /**
  * \ingroup SPQRSupport_Module
  * \class SPQR
  * \brief Sparse QR factorization based on SuiteSparseQR library
- * 
- * This class is used to perform a multithreaded and multifrontal rank-revealing QR decomposition 
+ *
+ * This class is used to perform a multithreaded and multifrontal rank-revealing QR decomposition
  * of sparse matrices. The result is then used to solve linear leasts_square systems.
  * Clearly, a QR factorization is returned such that A*P = Q*R where :
- * 
+ *
  * P is the column permutation. Use colsPermutation() to get it.
- * 
- * Q is the orthogonal matrix represented as Householder reflectors. 
+ *
+ * Q is the orthogonal matrix represented as Householder reflectors.
  * Use matrixQ() to get an expression and matrixQ().transpose() to get the transpose.
  * You can then apply it to a vector.
- * 
+ *
  * R is the sparse triangular factor. Use matrixQR() to get it as SparseMatrix.
  * NOTE : The Index type of R is always SuiteSparse_long. You can get it with SPQR::Index
- * 
- * \tparam _MatrixType The type of the sparse matrix A, must be a column-major SparseMatrix<>
- * NOTE 
- * 
+ *
+ * \tparam MatrixType_ The type of the sparse matrix A, must be a column-major SparseMatrix<>
+ *
+ * \implsparsesolverconcept
+ *
+ *
  */
-template<typename _MatrixType>
-class SPQR
-{
-  public:
-    typedef typename _MatrixType::Scalar Scalar;
-    typedef typename _MatrixType::RealScalar RealScalar;
-    typedef SuiteSparse_long Index ;
-    typedef SparseMatrix<Scalar, ColMajor, Index> MatrixType;
-    typedef PermutationMatrix<Dynamic, Dynamic> PermutationType;
-  public:
-    SPQR() 
-      : m_isInitialized(false), m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true)
-    { 
-      cholmod_l_start(&m_cc);
-    }
-    
-    SPQR(const _MatrixType& matrix)
-    : m_isInitialized(false), m_ordering(SPQR_ORDERING_DEFAULT), m_allow_tol(SPQR_DEFAULT_TOL), m_tolerance (NumTraits<Scalar>::epsilon()), m_useDefaultThreshold(true)
-    {
-      cholmod_l_start(&m_cc);
-      compute(matrix);
-    }
-    
-    ~SPQR()
-    {
-      SPQR_free();
-      cholmod_l_finish(&m_cc);
-    }
-    void SPQR_free()
-    {
-      cholmod_l_free_sparse(&m_H, &m_cc);
-      cholmod_l_free_sparse(&m_cR, &m_cc);
-      cholmod_l_free_dense(&m_HTau, &m_cc);
-      std::free(m_E);
-      std::free(m_HPinv);
-    }
+template <typename MatrixType_>
+class SPQR : public SparseSolverBase<SPQR<MatrixType_> > {
+ protected:
+  typedef SparseSolverBase<SPQR<MatrixType_> > Base;
+  using Base::m_isInitialized;
 
-    void compute(const _MatrixType& matrix)
-    {
-      if(m_isInitialized) SPQR_free();
+ public:
+  typedef typename MatrixType_::Scalar Scalar;
+  typedef typename MatrixType_::RealScalar RealScalar;
+  typedef SuiteSparse_long StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex> MatrixType;
+  typedef Map<PermutationMatrix<Dynamic, Dynamic, StorageIndex> > PermutationType;
+  enum { ColsAtCompileTime = Dynamic, MaxColsAtCompileTime = Dynamic };
 
-      MatrixType mat(matrix);
-      
-      /* Compute the default threshold as in MatLab, see:
-       * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing
-       * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3 
-       */
-      RealScalar pivotThreshold = m_tolerance;
-      if(m_useDefaultThreshold) 
-      {
-        using std::max;
-        RealScalar max2Norm = 0.0;
-        for (int j = 0; j < mat.cols(); j++) max2Norm = (max)(max2Norm, mat.col(j).norm());
-        if(max2Norm==RealScalar(0))
-          max2Norm = RealScalar(1);
-        pivotThreshold = 20 * (mat.rows() + mat.cols()) * max2Norm * NumTraits<RealScalar>::epsilon();
-      }
-      
-      cholmod_sparse A; 
-      A = viewAsCholmod(mat);
-      Index col = matrix.cols();
-      m_rank = SuiteSparseQR<Scalar>(m_ordering, pivotThreshold, col, &A, 
-                             &m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc);
+ public:
+  SPQR()
+      : m_analysisIsOk(false),
+        m_factorizationIsOk(false),
+        m_isRUpToDate(false),
+        m_ordering(SPQR_ORDERING_DEFAULT),
+        m_allow_tol(SPQR_DEFAULT_TOL),
+        m_tolerance(NumTraits<Scalar>::epsilon()),
+        m_cR(0),
+        m_E(0),
+        m_H(0),
+        m_HPinv(0),
+        m_HTau(0),
+        m_useDefaultThreshold(true) {
+    cholmod_l_start(&m_cc);
+  }
 
-      if (!m_cR)
-      {
-        m_info = NumericalIssue; 
-        m_isInitialized = false;
-        return;
-      }
-      m_info = Success;
-      m_isInitialized = true;
-      m_isRUpToDate = false;
-    }
-    /** 
-     * Get the number of rows of the input matrix and the Q matrix
-     */
-    inline Index rows() const {return m_cR->nrow; }
-    
-    /** 
-     * Get the number of columns of the input matrix. 
-     */
-    inline Index cols() const { return m_cR->ncol; }
-   
-      /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<SPQR, Rhs> solve(const MatrixBase<Rhs>& B) const 
-    {
-      eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
-      eigen_assert(this->rows()==B.rows()
-                    && "SPQR::solve(): invalid number of rows of the right hand side matrix B");
-          return internal::solve_retval<SPQR, Rhs>(*this, B.derived());
-    }
-    
-    template<typename Rhs, typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
-    {
-      eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
-      eigen_assert(b.cols()==1 && "This method is for vectors only");
+  explicit SPQR(const MatrixType_& matrix)
+      : m_analysisIsOk(false),
+        m_factorizationIsOk(false),
+        m_isRUpToDate(false),
+        m_ordering(SPQR_ORDERING_DEFAULT),
+        m_allow_tol(SPQR_DEFAULT_TOL),
+        m_tolerance(NumTraits<Scalar>::epsilon()),
+        m_cR(0),
+        m_E(0),
+        m_H(0),
+        m_HPinv(0),
+        m_HTau(0),
+        m_useDefaultThreshold(true) {
+    cholmod_l_start(&m_cc);
+    compute(matrix);
+  }
 
-      //Compute Q^T * b
-      typename Dest::PlainObject y, y2;
-      y = matrixQ().transpose() * b;
-      
-      // Solves with the triangular matrix R
-      Index rk = this->rank();
-      y2 = y;
-      y.resize((std::max)(cols(),Index(y.rows())),y.cols());
-      y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView<Upper>().solve(y2.topRows(rk));
+  ~SPQR() {
+    SPQR_free();
+    cholmod_l_finish(&m_cc);
+  }
+  void SPQR_free() {
+    cholmod_l_free_sparse(&m_H, &m_cc);
+    cholmod_l_free_sparse(&m_cR, &m_cc);
+    cholmod_l_free_dense(&m_HTau, &m_cc);
+    std::free(m_E);
+    std::free(m_HPinv);
+  }
 
-      // Apply the column permutation 
-      // colsPermutation() performs a copy of the permutation,
-      // so let's apply it manually:
-      for(Index i = 0; i < rk; ++i) dest.row(m_E[i]) = y.row(i);
-      for(Index i = rk; i < cols(); ++i) dest.row(m_E[i]).setZero();
-      
-//       y.bottomRows(y.rows()-rk).setZero();
-//       dest = colsPermutation() * y.topRows(cols());
-      
-      m_info = Success;
-    }
-    
-    /** \returns the sparse triangular factor R. It is a sparse matrix
-     */
-    const MatrixType matrixR() const
-    {
-      eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
-      if(!m_isRUpToDate) {
-        m_R = viewAsEigen<Scalar,ColMajor, typename MatrixType::Index>(*m_cR);
-        m_isRUpToDate = true;
-      }
-      return m_R;
-    }
-    /// Get an expression of the matrix Q
-    SPQRMatrixQReturnType<SPQR> matrixQ() const
-    {
-      return SPQRMatrixQReturnType<SPQR>(*this);
-    }
-    /// Get the permutation that was applied to columns of A
-    PermutationType colsPermutation() const
-    { 
-      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
-      Index n = m_cR->ncol;
-      PermutationType colsPerm(n);
-      for(Index j = 0; j <n; j++) colsPerm.indices()(j) = m_E[j];
-      return colsPerm; 
-      
-    }
-    /**
-     * Gets the rank of the matrix. 
-     * It should be equal to matrixQR().cols if the matrix is full-rank
+  void compute(const MatrixType_& matrix) {
+    if (m_isInitialized) SPQR_free();
+
+    MatrixType mat(matrix);
+
+    /* Compute the default threshold as in MatLab, see:
+     * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing
+     * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3
      */
-    Index rank() const
-    {
-      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
-      return m_cc.SPQR_istat[4];
+    RealScalar pivotThreshold = m_tolerance;
+    if (m_useDefaultThreshold) {
+      RealScalar max2Norm = 0.0;
+      for (int j = 0; j < mat.cols(); j++) max2Norm = numext::maxi(max2Norm, mat.col(j).norm());
+      if (numext::is_exactly_zero(max2Norm)) max2Norm = RealScalar(1);
+      pivotThreshold = 20 * (mat.rows() + mat.cols()) * max2Norm * NumTraits<RealScalar>::epsilon();
     }
-    /// Set the fill-reducing ordering method to be used
-    void setSPQROrdering(int ord) { m_ordering = ord;}
-    /// Set the tolerance tol to treat columns with 2-norm < =tol as zero
-    void setPivotThreshold(const RealScalar& tol)
-    {
-      m_useDefaultThreshold = false;
-      m_tolerance = tol;
+    cholmod_sparse A;
+    A = viewAsCholmod(mat);
+    m_rows = matrix.rows();
+    m_rank = SuiteSparseQR<Scalar>(m_ordering, pivotThreshold, internal::convert_index<StorageIndex>(matrix.cols()), &A,
+                                   &m_cR, &m_E, &m_H, &m_HPinv, &m_HTau, &m_cc);
+
+    if (!m_cR) {
+      m_info = NumericalIssue;
+      m_isInitialized = false;
+      return;
     }
-    
-    /** \returns a pointer to the SPQR workspace */
-    cholmod_common *cholmodCommon() const { return &m_cc; }
-    
-    
-    /** \brief Reports whether previous computation was successful.
-      *
-      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the sparse QR can not be computed
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
-      return m_info;
+    m_info = Success;
+    m_isInitialized = true;
+    m_isRUpToDate = false;
+  }
+  /**
+   * Get the number of rows of the input matrix and the Q matrix
+   */
+  inline Index rows() const { return m_rows; }
+
+  /**
+   * Get the number of columns of the input matrix.
+   */
+  inline Index cols() const { return m_cR->ncol; }
+
+  template <typename Rhs, typename Dest>
+  void _solve_impl(const MatrixBase<Rhs>& b, MatrixBase<Dest>& dest) const {
+    eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
+    eigen_assert(b.cols() == 1 && "This method is for vectors only");
+
+    // Compute Q^T * b
+    typename Dest::PlainObject y, y2;
+    y = matrixQ().transpose() * b;
+
+    // Solves with the triangular matrix R
+    Index rk = this->rank();
+    y2 = y;
+    y.resize((std::max)(cols(), Index(y.rows())), y.cols());
+    y.topRows(rk) = this->matrixR().topLeftCorner(rk, rk).template triangularView<Upper>().solve(y2.topRows(rk));
+
+    // Apply the column permutation
+    // colsPermutation() performs a copy of the permutation,
+    // so let's apply it manually:
+    for (Index i = 0; i < rk; ++i) dest.row(m_E[i]) = y.row(i);
+    for (Index i = rk; i < cols(); ++i) dest.row(m_E[i]).setZero();
+
+    //       y.bottomRows(y.rows()-rk).setZero();
+    //       dest = colsPermutation() * y.topRows(cols());
+
+    m_info = Success;
+  }
+
+  /** \returns the sparse triangular factor R. It is a sparse matrix
+   */
+  const MatrixType matrixR() const {
+    eigen_assert(m_isInitialized && " The QR factorization should be computed first, call compute()");
+    if (!m_isRUpToDate) {
+      m_R = viewAsEigen<Scalar, StorageIndex>(*m_cR);
+      m_isRUpToDate = true;
     }
-  protected:
-    bool m_isInitialized;
-    bool m_analysisIsOk;
-    bool m_factorizationIsOk;
-    mutable bool m_isRUpToDate;
-    mutable ComputationInfo m_info;
-    int m_ordering; // Ordering method to use, see SPQR's manual
-    int m_allow_tol; // Allow to use some tolerance during numerical factorization.
-    RealScalar m_tolerance; // treat columns with 2-norm below this tolerance as zero
-    mutable cholmod_sparse *m_cR; // The sparse R factor in cholmod format
-    mutable MatrixType m_R; // The sparse matrix R in Eigen format
-    mutable Index *m_E; // The permutation applied to columns
-    mutable cholmod_sparse *m_H;  //The householder vectors
-    mutable Index *m_HPinv; // The row permutation of H
-    mutable cholmod_dense *m_HTau; // The Householder coefficients
-    mutable Index m_rank; // The rank of the matrix
-    mutable cholmod_common m_cc; // Workspace and parameters
-    bool m_useDefaultThreshold;     // Use default threshold
-    template<typename ,typename > friend struct SPQR_QProduct;
+    return m_R;
+  }
+  /// Get an expression of the matrix Q
+  SPQRMatrixQReturnType<SPQR> matrixQ() const { return SPQRMatrixQReturnType<SPQR>(*this); }
+  /// Get the permutation that was applied to columns of A
+  PermutationType colsPermutation() const {
+    eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+    return PermutationType(m_E, m_cR->ncol);
+  }
+  /**
+   * Gets the rank of the matrix.
+   * It should be equal to matrixQR().cols if the matrix is full-rank
+   */
+  Index rank() const {
+    eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+    return m_cc.SPQR_istat[4];
+  }
+  /// Set the fill-reducing ordering method to be used
+  void setSPQROrdering(int ord) { m_ordering = ord; }
+  /// Set the tolerance tol to treat columns with 2-norm < =tol as zero
+  void setPivotThreshold(const RealScalar& tol) {
+    m_useDefaultThreshold = false;
+    m_tolerance = tol;
+  }
+
+  /** \returns a pointer to the SPQR workspace */
+  cholmod_common* cholmodCommon() const { return &m_cc; }
+
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful,
+   *          \c NumericalIssue if the sparse QR can not be computed
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+    return m_info;
+  }
+
+ protected:
+  bool m_analysisIsOk;
+  bool m_factorizationIsOk;
+  mutable bool m_isRUpToDate;
+  mutable ComputationInfo m_info;
+  int m_ordering;                           // Ordering method to use, see SPQR's manual
+  int m_allow_tol;                          // Allow to use some tolerance during numerical factorization.
+  RealScalar m_tolerance;                   // treat columns with 2-norm below this tolerance as zero
+  mutable cholmod_sparse* m_cR = nullptr;   // The sparse R factor in cholmod format
+  mutable MatrixType m_R;                   // The sparse matrix R in Eigen format
+  mutable StorageIndex* m_E = nullptr;      // The permutation applied to columns
+  mutable cholmod_sparse* m_H = nullptr;    // The householder vectors
+  mutable StorageIndex* m_HPinv = nullptr;  // The row permutation of H
+  mutable cholmod_dense* m_HTau = nullptr;  // The Householder coefficients
+  mutable Index m_rank;                     // The rank of the matrix
+  mutable cholmod_common m_cc;              // Workspace and parameters
+  bool m_useDefaultThreshold;               // Use default threshold
+  Index m_rows;
+  template <typename, typename>
+  friend struct SPQR_QProduct;
 };
 
 template <typename SPQRType, typename Derived>
-struct SPQR_QProduct : ReturnByValue<SPQR_QProduct<SPQRType,Derived> >
-{
+struct SPQR_QProduct : ReturnByValue<SPQR_QProduct<SPQRType, Derived> > {
   typedef typename SPQRType::Scalar Scalar;
-  typedef typename SPQRType::Index Index;
-  //Define the constructor to get reference to argument types
-  SPQR_QProduct(const SPQRType& spqr, const Derived& other, bool transpose) : m_spqr(spqr),m_other(other),m_transpose(transpose) {}
-  
+  typedef typename SPQRType::StorageIndex StorageIndex;
+  // Define the constructor to get reference to argument types
+  SPQR_QProduct(const SPQRType& spqr, const Derived& other, bool transpose)
+      : m_spqr(spqr), m_other(other), m_transpose(transpose) {}
+
   inline Index rows() const { return m_transpose ? m_spqr.rows() : m_spqr.cols(); }
   inline Index cols() const { return m_other.cols(); }
   // Assign to a vector
-  template<typename ResType>
-  void evalTo(ResType& res) const
-  {
+  template <typename ResType>
+  void evalTo(ResType& res) const {
     cholmod_dense y_cd;
-    cholmod_dense *x_cd; 
-    int method = m_transpose ? SPQR_QTX : SPQR_QX; 
-    cholmod_common *cc = m_spqr.cholmodCommon();
+    cholmod_dense* x_cd;
+    int method = m_transpose ? SPQR_QTX : SPQR_QX;
+    cholmod_common* cc = m_spqr.cholmodCommon();
     y_cd = viewAsCholmod(m_other.const_cast_derived());
     x_cd = SuiteSparseQR_qmult<Scalar>(method, m_spqr.m_H, m_spqr.m_HTau, m_spqr.m_HPinv, &y_cd, cc);
-    res = Matrix<Scalar,ResType::RowsAtCompileTime,ResType::ColsAtCompileTime>::Map(reinterpret_cast<Scalar*>(x_cd->x), x_cd->nrow, x_cd->ncol);
+    res = Matrix<Scalar, ResType::RowsAtCompileTime, ResType::ColsAtCompileTime>::Map(
+        reinterpret_cast<Scalar*>(x_cd->x), x_cd->nrow, x_cd->ncol);
     cholmod_l_free_dense(&x_cd, cc);
   }
-  const SPQRType& m_spqr; 
-  const Derived& m_other; 
-  bool m_transpose; 
-  
+  const SPQRType& m_spqr;
+  const Derived& m_other;
+  bool m_transpose;
 };
-template<typename SPQRType>
-struct SPQRMatrixQReturnType{
-  
+template <typename SPQRType>
+struct SPQRMatrixQReturnType {
   SPQRMatrixQReturnType(const SPQRType& spqr) : m_spqr(spqr) {}
-  template<typename Derived>
-  SPQR_QProduct<SPQRType, Derived> operator*(const MatrixBase<Derived>& other)
-  {
-    return SPQR_QProduct<SPQRType,Derived>(m_spqr,other.derived(),false);
-  }
-  SPQRMatrixQTransposeReturnType<SPQRType> adjoint() const
-  {
-    return SPQRMatrixQTransposeReturnType<SPQRType>(m_spqr);
+  template <typename Derived>
+  SPQR_QProduct<SPQRType, Derived> operator*(const MatrixBase<Derived>& other) {
+    return SPQR_QProduct<SPQRType, Derived>(m_spqr, other.derived(), false);
   }
+  SPQRMatrixQTransposeReturnType<SPQRType> adjoint() const { return SPQRMatrixQTransposeReturnType<SPQRType>(m_spqr); }
   // To use for operations with the transpose of Q
-  SPQRMatrixQTransposeReturnType<SPQRType> transpose() const
-  {
+  SPQRMatrixQTransposeReturnType<SPQRType> transpose() const {
     return SPQRMatrixQTransposeReturnType<SPQRType>(m_spqr);
   }
   const SPQRType& m_spqr;
 };
 
-template<typename SPQRType>
-struct SPQRMatrixQTransposeReturnType{
+template <typename SPQRType>
+struct SPQRMatrixQTransposeReturnType {
   SPQRMatrixQTransposeReturnType(const SPQRType& spqr) : m_spqr(spqr) {}
-  template<typename Derived>
-  SPQR_QProduct<SPQRType,Derived> operator*(const MatrixBase<Derived>& other)
-  {
-    return SPQR_QProduct<SPQRType,Derived>(m_spqr,other.derived(), true);
+  template <typename Derived>
+  SPQR_QProduct<SPQRType, Derived> operator*(const MatrixBase<Derived>& other) {
+    return SPQR_QProduct<SPQRType, Derived>(m_spqr, other.derived(), true);
   }
   const SPQRType& m_spqr;
 };
 
-namespace internal {
-  
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<SPQR<_MatrixType>, Rhs>
-  : solve_retval_base<SPQR<_MatrixType>, Rhs>
-{
-  typedef SPQR<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
-}// End namespace Eigen
+}  // End namespace Eigen
 #endif
diff --git a/inst/include/Eigen/src/SVD/BDCSVD.h b/inst/include/Eigen/src/SVD/BDCSVD.h
new file mode 100644
index 00000000..db1e4a26
--- /dev/null
+++ b/inst/include/Eigen/src/SVD/BDCSVD.h
@@ -0,0 +1,1470 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// We used the "A Divide-And-Conquer Algorithm for the Bidiagonal SVD"
+// research report written by Ming Gu and Stanley C.Eisenstat
+// The code variable names correspond to the names they used in their
+// report
+//
+// Copyright (C) 2013 Gauthier Brun <brun.gauthier@gmail.com>
+// Copyright (C) 2013 Nicolas Carre <nicolas.carre@ensimag.fr>
+// Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
+// Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
+// Copyright (C) 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2014-2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BDCSVD_H
+#define EIGEN_BDCSVD_H
+// #define EIGEN_BDCSVD_DEBUG_VERBOSE
+// #define EIGEN_BDCSVD_SANITY_CHECKS
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+#undef eigen_internal_assert
+#define eigen_internal_assert(X) assert(X);
+#endif
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+#include <iostream>
+#endif
+
+namespace Eigen {
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+IOFormat bdcsvdfmt(8, 0, ", ", "\n", "  [", "]");
+#endif
+
+template <typename MatrixType_, int Options>
+class BDCSVD;
+
+namespace internal {
+
+template <typename MatrixType_, int Options>
+struct traits<BDCSVD<MatrixType_, Options> > : svd_traits<MatrixType_, Options> {
+  typedef MatrixType_ MatrixType;
+};
+
+}  // end namespace internal
+
+/** \ingroup SVD_Module
+ *
+ *
+ * \class BDCSVD
+ *
+ * \brief class Bidiagonal Divide and Conquer SVD
+ *
+ * \tparam MatrixType_ the type of the matrix of which we are computing the SVD decomposition
+ *
+ * \tparam Options_ this optional parameter allows one to specify options for computing unitaries \a U and \a V.
+ *                  Possible values are #ComputeThinU, #ComputeThinV, #ComputeFullU, #ComputeFullV, and
+ *                  #DisableQRDecomposition. It is not possible to request both the thin and full version of \a U or
+ *                  \a V. By default, unitaries are not computed. BDCSVD uses R-Bidiagonalization to improve
+ *                  performance on tall and wide matrices. For backwards compatility, the option
+ *                  #DisableQRDecomposition can be used to disable this optimization.
+ *
+ * This class first reduces the input matrix to bi-diagonal form using class UpperBidiagonalization,
+ * and then performs a divide-and-conquer diagonalization. Small blocks are diagonalized using class JacobiSVD.
+ * You can control the switching size with the setSwitchSize() method, default is 16.
+ * For small matrice (<16), it is thus preferable to directly use JacobiSVD. For larger ones, BDCSVD is highly
+ * recommended and can several order of magnitude faster.
+ *
+ * \warning this algorithm is unlikely to provide accurate result when compiled with unsafe math optimizations.
+ * For instance, this concerns Intel's compiler (ICC), which performs such optimization by default unless
+ * you compile with the \c -fp-model \c precise option. Likewise, the \c -ffast-math option of GCC or clang will
+ * significantly degrade the accuracy.
+ *
+ * \sa class JacobiSVD
+ */
+template <typename MatrixType_, int Options_>
+class BDCSVD : public SVDBase<BDCSVD<MatrixType_, Options_> > {
+  typedef SVDBase<BDCSVD> Base;
+
+ public:
+  using Base::cols;
+  using Base::computeU;
+  using Base::computeV;
+  using Base::diagSize;
+  using Base::rows;
+
+  typedef MatrixType_ MatrixType;
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::RealScalar RealScalar;
+  typedef typename NumTraits<RealScalar>::Literal Literal;
+  typedef typename Base::Index Index;
+  enum {
+    Options = Options_,
+    QRDecomposition = Options & internal::QRPreconditionerBits,
+    ComputationOptions = Options & internal::ComputationOptionsBits,
+    RowsAtCompileTime = Base::RowsAtCompileTime,
+    ColsAtCompileTime = Base::ColsAtCompileTime,
+    DiagSizeAtCompileTime = Base::DiagSizeAtCompileTime,
+    MaxRowsAtCompileTime = Base::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = Base::MaxColsAtCompileTime,
+    MaxDiagSizeAtCompileTime = Base::MaxDiagSizeAtCompileTime,
+    MatrixOptions = Base::MatrixOptions
+  };
+
+  typedef typename Base::MatrixUType MatrixUType;
+  typedef typename Base::MatrixVType MatrixVType;
+  typedef typename Base::SingularValuesType SingularValuesType;
+
+  typedef Matrix<Scalar, Dynamic, Dynamic, ColMajor> MatrixX;
+  typedef Matrix<RealScalar, Dynamic, Dynamic, ColMajor> MatrixXr;
+  typedef Matrix<RealScalar, Dynamic, 1> VectorType;
+  typedef Array<RealScalar, Dynamic, 1> ArrayXr;
+  typedef Array<Index, 1, Dynamic> ArrayXi;
+  typedef Ref<ArrayXr> ArrayRef;
+  typedef Ref<ArrayXi> IndicesRef;
+
+  /** \brief Default Constructor.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via BDCSVD::compute(const MatrixType&).
+   */
+  BDCSVD() : m_algoswap(16), m_isTranspose(false), m_compU(false), m_compV(false), m_numIters(0) {}
+
+  /** \brief Default Constructor with memory preallocation
+   *
+   * Like the default constructor but with preallocation of the internal data
+   * according to the specified problem size and \a Options template parameter.
+   * \sa BDCSVD()
+   */
+  BDCSVD(Index rows, Index cols) : m_algoswap(16), m_numIters(0) {
+    allocate(rows, cols, internal::get_computation_options(Options));
+  }
+
+  /** \brief Default Constructor with memory preallocation
+   *
+   * Like the default constructor but with preallocation of the internal data
+   * according to the specified problem size and the \a computationOptions.
+   *
+   * One \b cannot request unitaries using both the \a Options template parameter
+   * and the constructor. If possible, prefer using the \a Options template parameter.
+   *
+   * \param rows number of rows for the input matrix
+   * \param cols number of columns for the input matrix
+   * \param computationOptions specification for computing Thin/Full unitaries U/V
+   * \sa BDCSVD()
+   *
+   * \deprecated Will be removed in the next major Eigen version. Options should
+   * be specified in the \a Options template parameter.
+   */
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using the class template parameter.")
+  BDCSVD(Index rows, Index cols, unsigned int computationOptions) : m_algoswap(16), m_numIters(0) {
+    internal::check_svd_options_assertions<MatrixType, Options>(computationOptions, rows, cols);
+    allocate(rows, cols, computationOptions);
+  }
+
+  /** \brief Constructor performing the decomposition of given matrix, using the custom options specified
+   *         with the \a Options template parameter.
+   *
+   * \param matrix the matrix to decompose
+   */
+  template <typename Derived>
+  BDCSVD(const MatrixBase<Derived>& matrix) : m_algoswap(16), m_numIters(0) {
+    compute_impl(matrix, internal::get_computation_options(Options));
+  }
+
+  /** \brief Constructor performing the decomposition of given matrix using specified options
+   *         for computing unitaries.
+   *
+   *  One \b cannot request unitaries using both the \a Options template parameter
+   *  and the constructor. If possible, prefer using the \a Options template parameter.
+   *
+   * \param matrix the matrix to decompose
+   * \param computationOptions specification for computing Thin/Full unitaries U/V
+   *
+   * \deprecated Will be removed in the next major Eigen version. Options should
+   * be specified in the \a Options template parameter.
+   */
+  template <typename Derived>
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using the class template parameter.")
+  BDCSVD(const MatrixBase<Derived>& matrix, unsigned int computationOptions) : m_algoswap(16), m_numIters(0) {
+    internal::check_svd_options_assertions<MatrixType, Options>(computationOptions, matrix.rows(), matrix.cols());
+    compute_impl(matrix, computationOptions);
+  }
+
+  ~BDCSVD() {}
+
+  /** \brief Method performing the decomposition of given matrix. Computes Thin/Full unitaries U/V if specified
+   *         using the \a Options template parameter or the class constructor.
+   *
+   * \param matrix the matrix to decompose
+   */
+  template <typename Derived>
+  BDCSVD& compute(const MatrixBase<Derived>& matrix) {
+    return compute_impl(matrix, m_computationOptions);
+  }
+
+  /** \brief Method performing the decomposition of given matrix, as specified by
+   *         the `computationOptions` parameter.
+   *
+   * \param matrix the matrix to decompose
+   * \param computationOptions specify whether to compute Thin/Full unitaries U/V
+   *
+   * \deprecated Will be removed in the next major Eigen version. Options should
+   * be specified in the \a Options template parameter.
+   */
+  template <typename Derived>
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using the class template parameter.")
+  BDCSVD& compute(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
+    internal::check_svd_options_assertions<MatrixType, Options>(computationOptions, matrix.rows(), matrix.cols());
+    return compute_impl(matrix, computationOptions);
+  }
+
+  void setSwitchSize(int s) {
+    eigen_assert(s >= 3 && "BDCSVD the size of the algo switch has to be at least 3.");
+    m_algoswap = s;
+  }
+
+ private:
+  template <typename Derived>
+  BDCSVD& compute_impl(const MatrixBase<Derived>& matrix, unsigned int computationOptions);
+  void divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift);
+  void computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals, MatrixXr& V);
+  void computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, VectorType& singVals,
+                       ArrayRef shifts, ArrayRef mus);
+  void perturbCol0(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, const VectorType& singVals,
+                   const ArrayRef& shifts, const ArrayRef& mus, ArrayRef zhat);
+  void computeSingVecs(const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef& perm, const VectorType& singVals,
+                       const ArrayRef& shifts, const ArrayRef& mus, MatrixXr& U, MatrixXr& V);
+  void deflation43(Index firstCol, Index shift, Index i, Index size);
+  void deflation44(Index firstColu, Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size);
+  void deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift);
+  template <typename HouseholderU, typename HouseholderV, typename NaiveU, typename NaiveV>
+  void copyUV(const HouseholderU& householderU, const HouseholderV& householderV, const NaiveU& naiveU,
+              const NaiveV& naivev);
+  void structured_update(Block<MatrixXr, Dynamic, Dynamic> A, const MatrixXr& B, Index n1);
+  static RealScalar secularEq(RealScalar x, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm,
+                              const ArrayRef& diagShifted, RealScalar shift);
+  template <typename SVDType>
+  void computeBaseCase(SVDType& svd, Index n, Index firstCol, Index firstRowW, Index firstColW, Index shift);
+
+ protected:
+  void allocate(Index rows, Index cols, unsigned int computationOptions);
+  MatrixXr m_naiveU, m_naiveV;
+  MatrixXr m_computed;
+  Index m_nRec;
+  ArrayXr m_workspace;
+  ArrayXi m_workspaceI;
+  int m_algoswap;
+  bool m_isTranspose, m_compU, m_compV, m_useQrDecomp;
+  JacobiSVD<MatrixType, ComputationOptions> smallSvd;
+  HouseholderQR<MatrixX> qrDecomp;
+  internal::UpperBidiagonalization<MatrixX> bid;
+  MatrixX copyWorkspace;
+  MatrixX reducedTriangle;
+
+  using Base::m_computationOptions;
+  using Base::m_computeThinU;
+  using Base::m_computeThinV;
+  using Base::m_info;
+  using Base::m_isInitialized;
+  using Base::m_matrixU;
+  using Base::m_matrixV;
+  using Base::m_nonzeroSingularValues;
+  using Base::m_singularValues;
+
+ public:
+  int m_numIters;
+};  // end class BDCSVD
+
+// Method to allocate and initialize matrix and attributes
+template <typename MatrixType, int Options>
+void BDCSVD<MatrixType, Options>::allocate(Index rows, Index cols, unsigned int computationOptions) {
+  if (Base::allocate(rows, cols, computationOptions)) return;
+
+  if (cols < m_algoswap)
+    smallSvd.allocate(rows, cols, Options == 0 ? computationOptions : internal::get_computation_options(Options));
+
+  m_computed = MatrixXr::Zero(diagSize() + 1, diagSize());
+  m_compU = computeV();
+  m_compV = computeU();
+  m_isTranspose = (cols > rows);
+  if (m_isTranspose) std::swap(m_compU, m_compV);
+
+  // kMinAspectRatio is the crossover point that determines if we perform R-Bidiagonalization
+  // or bidiagonalize the input matrix directly.
+  // It is based off of LAPACK's dgesdd routine, which uses 11.0/6.0
+  // we use a larger scalar to prevent a regression for relatively square matrices.
+  constexpr Index kMinAspectRatio = 4;
+  constexpr bool disableQrDecomp = static_cast<int>(QRDecomposition) == static_cast<int>(DisableQRDecomposition);
+  m_useQrDecomp = !disableQrDecomp && ((rows / kMinAspectRatio > cols) || (cols / kMinAspectRatio > rows));
+  if (m_useQrDecomp) {
+    qrDecomp = HouseholderQR<MatrixX>((std::max)(rows, cols), (std::min)(rows, cols));
+    reducedTriangle = MatrixX(diagSize(), diagSize());
+  }
+
+  copyWorkspace = MatrixX(m_isTranspose ? cols : rows, m_isTranspose ? rows : cols);
+  bid = internal::UpperBidiagonalization<MatrixX>(m_useQrDecomp ? diagSize() : copyWorkspace.rows(),
+                                                  m_useQrDecomp ? diagSize() : copyWorkspace.cols());
+
+  if (m_compU)
+    m_naiveU = MatrixXr::Zero(diagSize() + 1, diagSize() + 1);
+  else
+    m_naiveU = MatrixXr::Zero(2, diagSize() + 1);
+
+  if (m_compV) m_naiveV = MatrixXr::Zero(diagSize(), diagSize());
+
+  m_workspace.resize((diagSize() + 1) * (diagSize() + 1) * 3);
+  m_workspaceI.resize(3 * diagSize());
+}  // end allocate
+
+template <typename MatrixType, int Options>
+template <typename Derived>
+BDCSVD<MatrixType, Options>& BDCSVD<MatrixType, Options>::compute_impl(const MatrixBase<Derived>& matrix,
+                                                                       unsigned int computationOptions) {
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived, MatrixType);
+  EIGEN_STATIC_ASSERT((std::is_same<typename Derived::Scalar, typename MatrixType::Scalar>::value),
+                      Input matrix must have the same Scalar type as the BDCSVD object.);
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "\n\n\n================================================================================================="
+               "=====================\n\n\n";
+#endif
+  using std::abs;
+
+  allocate(matrix.rows(), matrix.cols(), computationOptions);
+
+  const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
+
+  //**** step -1 - If the problem is too small, directly falls back to JacobiSVD and return
+  if (matrix.cols() < m_algoswap) {
+    smallSvd.compute(matrix);
+    m_isInitialized = true;
+    m_info = smallSvd.info();
+    if (m_info == Success || m_info == NoConvergence) {
+      if (computeU()) m_matrixU = smallSvd.matrixU();
+      if (computeV()) m_matrixV = smallSvd.matrixV();
+      m_singularValues = smallSvd.singularValues();
+      m_nonzeroSingularValues = smallSvd.nonzeroSingularValues();
+    }
+    return *this;
+  }
+
+  //**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows
+  RealScalar scale = matrix.cwiseAbs().template maxCoeff<PropagateNaN>();
+  if (!(numext::isfinite)(scale)) {
+    m_isInitialized = true;
+    m_info = InvalidInput;
+    return *this;
+  }
+
+  if (numext::is_exactly_zero(scale)) scale = Literal(1);
+
+  if (m_isTranspose)
+    copyWorkspace = matrix.adjoint() / scale;
+  else
+    copyWorkspace = matrix / scale;
+
+  //**** step 1 - Bidiagonalization.
+  // If the problem is sufficiently rectangular, we perform R-Bidiagonalization: compute A = Q(R/0)
+  // and then bidiagonalize R. Otherwise, if the problem is relatively square, we
+  // bidiagonalize the input matrix directly.
+  if (m_useQrDecomp) {
+    qrDecomp.compute(copyWorkspace);
+    reducedTriangle = qrDecomp.matrixQR().topRows(diagSize());
+    reducedTriangle.template triangularView<StrictlyLower>().setZero();
+    bid.compute(reducedTriangle);
+  } else {
+    bid.compute(copyWorkspace);
+  }
+
+  //**** step 2 - Divide & Conquer
+  m_naiveU.setZero();
+  m_naiveV.setZero();
+  // FIXME this line involves a temporary matrix
+  m_computed.topRows(diagSize()) = bid.bidiagonal().toDenseMatrix().transpose();
+  m_computed.template bottomRows<1>().setZero();
+  divide(0, diagSize() - 1, 0, 0, 0);
+  if (m_info != Success && m_info != NoConvergence) {
+    m_isInitialized = true;
+    return *this;
+  }
+
+  //**** step 3 - Copy singular values and vectors
+  for (int i = 0; i < diagSize(); i++) {
+    RealScalar a = abs(m_computed.coeff(i, i));
+    m_singularValues.coeffRef(i) = a * scale;
+    if (a < considerZero) {
+      m_nonzeroSingularValues = i;
+      m_singularValues.tail(diagSize() - i - 1).setZero();
+      break;
+    } else if (i == diagSize() - 1) {
+      m_nonzeroSingularValues = i + 1;
+      break;
+    }
+  }
+
+  //**** step 4 - Finalize unitaries U and V
+  if (m_isTranspose)
+    copyUV(bid.householderV(), bid.householderU(), m_naiveV, m_naiveU);
+  else
+    copyUV(bid.householderU(), bid.householderV(), m_naiveU, m_naiveV);
+
+  if (m_useQrDecomp) {
+    if (m_isTranspose && computeV())
+      m_matrixV.applyOnTheLeft(qrDecomp.householderQ());
+    else if (!m_isTranspose && computeU())
+      m_matrixU.applyOnTheLeft(qrDecomp.householderQ());
+  }
+
+  m_isInitialized = true;
+  return *this;
+}  // end compute
+
+template <typename MatrixType, int Options>
+template <typename HouseholderU, typename HouseholderV, typename NaiveU, typename NaiveV>
+void BDCSVD<MatrixType, Options>::copyUV(const HouseholderU& householderU, const HouseholderV& householderV,
+                                         const NaiveU& naiveU, const NaiveV& naiveV) {
+  // Note exchange of U and V: m_matrixU is set from m_naiveV and vice versa
+  if (computeU()) {
+    Index Ucols = m_computeThinU ? diagSize() : rows();
+    m_matrixU = MatrixX::Identity(rows(), Ucols);
+    m_matrixU.topLeftCorner(diagSize(), diagSize()) =
+        naiveV.template cast<Scalar>().topLeftCorner(diagSize(), diagSize());
+    // FIXME the following conditionals involve temporary buffers
+    if (m_useQrDecomp)
+      m_matrixU.topLeftCorner(householderU.cols(), diagSize()).applyOnTheLeft(householderU);
+    else
+      m_matrixU.applyOnTheLeft(householderU);
+  }
+  if (computeV()) {
+    Index Vcols = m_computeThinV ? diagSize() : cols();
+    m_matrixV = MatrixX::Identity(cols(), Vcols);
+    m_matrixV.topLeftCorner(diagSize(), diagSize()) =
+        naiveU.template cast<Scalar>().topLeftCorner(diagSize(), diagSize());
+    // FIXME the following conditionals involve temporary buffers
+    if (m_useQrDecomp)
+      m_matrixV.topLeftCorner(householderV.cols(), diagSize()).applyOnTheLeft(householderV);
+    else
+      m_matrixV.applyOnTheLeft(householderV);
+  }
+}
+
+/** \internal
+ * Performs A = A * B exploiting the special structure of the matrix A. Splitting A as:
+ *  A = [A1]
+ *      [A2]
+ * such that A1.rows()==n1, then we assume that at least half of the columns of A1 and A2 are zeros.
+ * We can thus pack them prior to the the matrix product. However, this is only worth the effort if the matrix is large
+ * enough.
+ */
+template <typename MatrixType, int Options>
+void BDCSVD<MatrixType, Options>::structured_update(Block<MatrixXr, Dynamic, Dynamic> A, const MatrixXr& B, Index n1) {
+  Index n = A.rows();
+  if (n > 100) {
+    // If the matrices are large enough, let's exploit the sparse structure of A by
+    // splitting it in half (wrt n1), and packing the non-zero columns.
+    Index n2 = n - n1;
+    Map<MatrixXr> A1(m_workspace.data(), n1, n);
+    Map<MatrixXr> A2(m_workspace.data() + n1 * n, n2, n);
+    Map<MatrixXr> B1(m_workspace.data() + n * n, n, n);
+    Map<MatrixXr> B2(m_workspace.data() + 2 * n * n, n, n);
+    Index k1 = 0, k2 = 0;
+    for (Index j = 0; j < n; ++j) {
+      if ((A.col(j).head(n1).array() != Literal(0)).any()) {
+        A1.col(k1) = A.col(j).head(n1);
+        B1.row(k1) = B.row(j);
+        ++k1;
+      }
+      if ((A.col(j).tail(n2).array() != Literal(0)).any()) {
+        A2.col(k2) = A.col(j).tail(n2);
+        B2.row(k2) = B.row(j);
+        ++k2;
+      }
+    }
+
+    A.topRows(n1).noalias() = A1.leftCols(k1) * B1.topRows(k1);
+    A.bottomRows(n2).noalias() = A2.leftCols(k2) * B2.topRows(k2);
+  } else {
+    Map<MatrixXr, Aligned> tmp(m_workspace.data(), n, n);
+    tmp.noalias() = A * B;
+    A = tmp;
+  }
+}
+
+template <typename MatrixType, int Options>
+template <typename SVDType>
+void BDCSVD<MatrixType, Options>::computeBaseCase(SVDType& svd, Index n, Index firstCol, Index firstRowW,
+                                                  Index firstColW, Index shift) {
+  svd.compute(m_computed.block(firstCol, firstCol, n + 1, n));
+  m_info = svd.info();
+  if (m_info != Success && m_info != NoConvergence) return;
+  if (m_compU)
+    m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() = svd.matrixU();
+  else {
+    m_naiveU.row(0).segment(firstCol, n + 1).real() = svd.matrixU().row(0);
+    m_naiveU.row(1).segment(firstCol, n + 1).real() = svd.matrixU().row(n);
+  }
+  if (m_compV) m_naiveV.block(firstRowW, firstColW, n, n).real() = svd.matrixV();
+  m_computed.block(firstCol + shift, firstCol + shift, n + 1, n).setZero();
+  m_computed.diagonal().segment(firstCol + shift, n) = svd.singularValues().head(n);
+}
+
+// The divide algorithm is done "in place", we are always working on subsets of the same matrix. The divide methods
+// takes as argument the place of the submatrix we are currently working on.
+
+//@param firstCol : The Index of the first column of the submatrix of m_computed and for m_naiveU;
+//@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU;
+// lastCol + 1 - firstCol is the size of the submatrix.
+//@param firstRowW : The Index of the first row of the matrix W that we are to change. (see the reference paper section
+// 1 for more information on W)
+//@param firstColW : Same as firstRowW with the column.
+//@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the
+// last column of the U submatrix
+// to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the
+// reference paper.
+template <typename MatrixType, int Options>
+void BDCSVD<MatrixType, Options>::divide(Index firstCol, Index lastCol, Index firstRowW, Index firstColW, Index shift) {
+  // requires rows = cols + 1;
+  using std::abs;
+  using std::pow;
+  using std::sqrt;
+  const Index n = lastCol - firstCol + 1;
+  const Index k = n / 2;
+  const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
+  RealScalar alphaK;
+  RealScalar betaK;
+  RealScalar r0;
+  RealScalar lambda, phi, c0, s0;
+  VectorType l, f;
+  // We use the other algorithm which is more efficient for small
+  // matrices.
+  if (n < m_algoswap) {
+    // FIXME this block involves temporaries
+    if (m_compV) {
+      JacobiSVD<MatrixXr, ComputeFullU | ComputeFullV> baseSvd;
+      computeBaseCase(baseSvd, n, firstCol, firstRowW, firstColW, shift);
+    } else {
+      JacobiSVD<MatrixXr, ComputeFullU> baseSvd;
+      computeBaseCase(baseSvd, n, firstCol, firstRowW, firstColW, shift);
+    }
+    return;
+  }
+  // We use the divide and conquer algorithm
+  alphaK = m_computed(firstCol + k, firstCol + k);
+  betaK = m_computed(firstCol + k + 1, firstCol + k);
+  // The divide must be done in that order in order to have good results. Divide change the data inside the submatrices
+  // and the divide of the right submatrice reads one column of the left submatrice. That's why we need to treat the
+  // right submatrix before the left one.
+  divide(k + 1 + firstCol, lastCol, k + 1 + firstRowW, k + 1 + firstColW, shift);
+  if (m_info != Success && m_info != NoConvergence) return;
+  divide(firstCol, k - 1 + firstCol, firstRowW, firstColW + 1, shift + 1);
+  if (m_info != Success && m_info != NoConvergence) return;
+
+  if (m_compU) {
+    lambda = m_naiveU(firstCol + k, firstCol + k);
+    phi = m_naiveU(firstCol + k + 1, lastCol + 1);
+  } else {
+    lambda = m_naiveU(1, firstCol + k);
+    phi = m_naiveU(0, lastCol + 1);
+  }
+  r0 = sqrt((abs(alphaK * lambda) * abs(alphaK * lambda)) + abs(betaK * phi) * abs(betaK * phi));
+  if (m_compU) {
+    l = m_naiveU.row(firstCol + k).segment(firstCol, k);
+    f = m_naiveU.row(firstCol + k + 1).segment(firstCol + k + 1, n - k - 1);
+  } else {
+    l = m_naiveU.row(1).segment(firstCol, k);
+    f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1);
+  }
+  if (m_compV) m_naiveV(firstRowW + k, firstColW) = Literal(1);
+  if (r0 < considerZero) {
+    c0 = Literal(1);
+    s0 = Literal(0);
+  } else {
+    c0 = alphaK * lambda / r0;
+    s0 = betaK * phi / r0;
+  }
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  eigen_internal_assert(m_naiveU.allFinite());
+  eigen_internal_assert(m_naiveV.allFinite());
+  eigen_internal_assert(m_computed.allFinite());
+#endif
+
+  if (m_compU) {
+    MatrixXr q1(m_naiveU.col(firstCol + k).segment(firstCol, k + 1));
+    // we shiftW Q1 to the right
+    for (Index i = firstCol + k - 1; i >= firstCol; i--)
+      m_naiveU.col(i + 1).segment(firstCol, k + 1) = m_naiveU.col(i).segment(firstCol, k + 1);
+    // we shift q1 at the left with a factor c0
+    m_naiveU.col(firstCol).segment(firstCol, k + 1) = (q1 * c0);
+    // last column = q1 * - s0
+    m_naiveU.col(lastCol + 1).segment(firstCol, k + 1) = (q1 * (-s0));
+    // first column = q2 * s0
+    m_naiveU.col(firstCol).segment(firstCol + k + 1, n - k) =
+        m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) * s0;
+    // q2 *= c0
+    m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) *= c0;
+  } else {
+    RealScalar q1 = m_naiveU(0, firstCol + k);
+    // we shift Q1 to the right
+    for (Index i = firstCol + k - 1; i >= firstCol; i--) m_naiveU(0, i + 1) = m_naiveU(0, i);
+    // we shift q1 at the left with a factor c0
+    m_naiveU(0, firstCol) = (q1 * c0);
+    // last column = q1 * - s0
+    m_naiveU(0, lastCol + 1) = (q1 * (-s0));
+    // first column = q2 * s0
+    m_naiveU(1, firstCol) = m_naiveU(1, lastCol + 1) * s0;
+    // q2 *= c0
+    m_naiveU(1, lastCol + 1) *= c0;
+    m_naiveU.row(1).segment(firstCol + 1, k).setZero();
+    m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1).setZero();
+  }
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  eigen_internal_assert(m_naiveU.allFinite());
+  eigen_internal_assert(m_naiveV.allFinite());
+  eigen_internal_assert(m_computed.allFinite());
+#endif
+
+  m_computed(firstCol + shift, firstCol + shift) = r0;
+  m_computed.col(firstCol + shift).segment(firstCol + shift + 1, k) = alphaK * l.transpose().real();
+  m_computed.col(firstCol + shift).segment(firstCol + shift + k + 1, n - k - 1) = betaK * f.transpose().real();
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  ArrayXr tmp1 = (m_computed.block(firstCol + shift, firstCol + shift, n, n)).jacobiSvd().singularValues();
+#endif
+  // Second part: try to deflate singular values in combined matrix
+  deflation(firstCol, lastCol, k, firstRowW, firstColW, shift);
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  ArrayXr tmp2 = (m_computed.block(firstCol + shift, firstCol + shift, n, n)).jacobiSvd().singularValues();
+  std::cout << "\n\nj1 = " << tmp1.transpose().format(bdcsvdfmt) << "\n";
+  std::cout << "j2 = " << tmp2.transpose().format(bdcsvdfmt) << "\n\n";
+  std::cout << "err:      " << ((tmp1 - tmp2).abs() > 1e-12 * tmp2.abs()).transpose() << "\n";
+  static int count = 0;
+  std::cout << "# " << ++count << "\n\n";
+  eigen_internal_assert((tmp1 - tmp2).matrix().norm() < 1e-14 * tmp2.matrix().norm());
+//   eigen_internal_assert(count<681);
+//   eigen_internal_assert(((tmp1-tmp2).abs()<1e-13*tmp2.abs()).all());
+#endif
+
+  // Third part: compute SVD of combined matrix
+  MatrixXr UofSVD, VofSVD;
+  VectorType singVals;
+  computeSVDofM(firstCol + shift, n, UofSVD, singVals, VofSVD);
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  eigen_internal_assert(UofSVD.allFinite());
+  eigen_internal_assert(VofSVD.allFinite());
+#endif
+
+  if (m_compU)
+    structured_update(m_naiveU.block(firstCol, firstCol, n + 1, n + 1), UofSVD, (n + 2) / 2);
+  else {
+    Map<Matrix<RealScalar, 2, Dynamic>, Aligned> tmp(m_workspace.data(), 2, n + 1);
+    tmp.noalias() = m_naiveU.middleCols(firstCol, n + 1) * UofSVD;
+    m_naiveU.middleCols(firstCol, n + 1) = tmp;
+  }
+
+  if (m_compV) structured_update(m_naiveV.block(firstRowW, firstColW, n, n), VofSVD, (n + 1) / 2);
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  eigen_internal_assert(m_naiveU.allFinite());
+  eigen_internal_assert(m_naiveV.allFinite());
+  eigen_internal_assert(m_computed.allFinite());
+#endif
+
+  m_computed.block(firstCol + shift, firstCol + shift, n, n).setZero();
+  m_computed.block(firstCol + shift, firstCol + shift, n, n).diagonal() = singVals;
+}  // end divide
+
+// Compute SVD of m_computed.block(firstCol, firstCol, n + 1, n); this block only has non-zeros in
+// the first column and on the diagonal and has undergone deflation, so diagonal is in increasing
+// order except for possibly the (0,0) entry. The computed SVD is stored U, singVals and V, except
+// that if m_compV is false, then V is not computed. Singular values are sorted in decreasing order.
+//
+// TODO Opportunities for optimization: better root finding algo, better stopping criterion, better
+// handling of round-off errors, be consistent in ordering
+// For instance, to solve the secular equation using FMM, see
+// http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf
+template <typename MatrixType, int Options>
+void BDCSVD<MatrixType, Options>::computeSVDofM(Index firstCol, Index n, MatrixXr& U, VectorType& singVals,
+                                                MatrixXr& V) {
+  const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
+  using std::abs;
+  ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n);
+  m_workspace.head(n) = m_computed.block(firstCol, firstCol, n, n).diagonal();
+  ArrayRef diag = m_workspace.head(n);
+  diag(0) = Literal(0);
+
+  // Allocate space for singular values and vectors
+  singVals.resize(n);
+  U.resize(n + 1, n + 1);
+  if (m_compV) V.resize(n, n);
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  if (col0.hasNaN() || diag.hasNaN()) std::cout << "\n\nHAS NAN\n\n";
+#endif
+
+  // Many singular values might have been deflated, the zero ones have been moved to the end,
+  // but others are interleaved and we must ignore them at this stage.
+  // To this end, let's compute a permutation skipping them:
+  Index actual_n = n;
+  while (actual_n > 1 && numext::is_exactly_zero(diag(actual_n - 1))) {
+    --actual_n;
+    eigen_internal_assert(numext::is_exactly_zero(col0(actual_n)));
+  }
+  Index m = 0;  // size of the deflated problem
+  for (Index k = 0; k < actual_n; ++k)
+    if (abs(col0(k)) > considerZero) m_workspaceI(m++) = k;
+  Map<ArrayXi> perm(m_workspaceI.data(), m);
+
+  Map<ArrayXr> shifts(m_workspace.data() + 1 * n, n);
+  Map<ArrayXr> mus(m_workspace.data() + 2 * n, n);
+  Map<ArrayXr> zhat(m_workspace.data() + 3 * n, n);
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "computeSVDofM using:\n";
+  std::cout << "  z: " << col0.transpose() << "\n";
+  std::cout << "  d: " << diag.transpose() << "\n";
+#endif
+
+  // Compute singVals, shifts, and mus
+  computeSingVals(col0, diag, perm, singVals, shifts, mus);
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "  j:        "
+            << (m_computed.block(firstCol, firstCol, n, n)).jacobiSvd().singularValues().transpose().reverse()
+            << "\n\n";
+  std::cout << "  sing-val: " << singVals.transpose() << "\n";
+  std::cout << "  mu:       " << mus.transpose() << "\n";
+  std::cout << "  shift:    " << shifts.transpose() << "\n";
+
+  {
+    std::cout << "\n\n    mus:    " << mus.head(actual_n).transpose() << "\n\n";
+    std::cout << "    check1 (expect0) : "
+              << ((singVals.array() - (shifts + mus)) / singVals.array()).head(actual_n).transpose() << "\n\n";
+    eigen_internal_assert((((singVals.array() - (shifts + mus)) / singVals.array()).head(actual_n) >= 0).all());
+    std::cout << "    check2 (>0)      : " << ((singVals.array() - diag) / singVals.array()).head(actual_n).transpose()
+              << "\n\n";
+    eigen_internal_assert((((singVals.array() - diag) / singVals.array()).head(actual_n) >= 0).all());
+  }
+#endif
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  eigen_internal_assert(singVals.allFinite());
+  eigen_internal_assert(mus.allFinite());
+  eigen_internal_assert(shifts.allFinite());
+#endif
+
+  // Compute zhat
+  perturbCol0(col0, diag, perm, singVals, shifts, mus, zhat);
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "  zhat: " << zhat.transpose() << "\n";
+#endif
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  eigen_internal_assert(zhat.allFinite());
+#endif
+
+  computeSingVecs(zhat, diag, perm, singVals, shifts, mus, U, V);
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "U^T U: " << (U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(), U.cols()))).norm() << "\n";
+  std::cout << "V^T V: " << (V.transpose() * V - MatrixXr(MatrixXr::Identity(V.cols(), V.cols()))).norm() << "\n";
+#endif
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  eigen_internal_assert(m_naiveU.allFinite());
+  eigen_internal_assert(m_naiveV.allFinite());
+  eigen_internal_assert(m_computed.allFinite());
+  eigen_internal_assert(U.allFinite());
+  eigen_internal_assert(V.allFinite());
+//   eigen_internal_assert((U.transpose() * U - MatrixXr(MatrixXr::Identity(U.cols(),U.cols()))).norm() <
+//   100*NumTraits<RealScalar>::epsilon() * n); eigen_internal_assert((V.transpose() * V -
+//   MatrixXr(MatrixXr::Identity(V.cols(),V.cols()))).norm() < 100*NumTraits<RealScalar>::epsilon() * n);
+#endif
+
+  // Because of deflation, the singular values might not be completely sorted.
+  // Fortunately, reordering them is a O(n) problem
+  for (Index i = 0; i < actual_n - 1; ++i) {
+    if (singVals(i) > singVals(i + 1)) {
+      using std::swap;
+      swap(singVals(i), singVals(i + 1));
+      U.col(i).swap(U.col(i + 1));
+      if (m_compV) V.col(i).swap(V.col(i + 1));
+    }
+  }
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  {
+    bool singular_values_sorted =
+        (((singVals.segment(1, actual_n - 1) - singVals.head(actual_n - 1))).array() >= 0).all();
+    if (!singular_values_sorted)
+      std::cout << "Singular values are not sorted: " << singVals.segment(1, actual_n).transpose() << "\n";
+    eigen_internal_assert(singular_values_sorted);
+  }
+#endif
+
+  // Reverse order so that singular values in increased order
+  // Because of deflation, the zeros singular-values are already at the end
+  singVals.head(actual_n).reverseInPlace();
+  U.leftCols(actual_n).rowwise().reverseInPlace();
+  if (m_compV) V.leftCols(actual_n).rowwise().reverseInPlace();
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  JacobiSVD<MatrixXr> jsvd(m_computed.block(firstCol, firstCol, n, n));
+  std::cout << "  * j:        " << jsvd.singularValues().transpose() << "\n\n";
+  std::cout << "  * sing-val: " << singVals.transpose() << "\n";
+//   std::cout << "  * err:      " << ((jsvd.singularValues()-singVals)>1e-13*singVals.norm()).transpose() << "\n";
+#endif
+}
+
+template <typename MatrixType, int Options>
+typename BDCSVD<MatrixType, Options>::RealScalar BDCSVD<MatrixType, Options>::secularEq(
+    RealScalar mu, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm, const ArrayRef& diagShifted,
+    RealScalar shift) {
+  Index m = perm.size();
+  RealScalar res = Literal(1);
+  for (Index i = 0; i < m; ++i) {
+    Index j = perm(i);
+    // The following expression could be rewritten to involve only a single division,
+    // but this would make the expression more sensitive to overflow.
+    res += (col0(j) / (diagShifted(j) - mu)) * (col0(j) / (diag(j) + shift + mu));
+  }
+  return res;
+}
+
+template <typename MatrixType, int Options>
+void BDCSVD<MatrixType, Options>::computeSingVals(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm,
+                                                  VectorType& singVals, ArrayRef shifts, ArrayRef mus) {
+  using std::abs;
+  using std::sqrt;
+  using std::swap;
+
+  Index n = col0.size();
+  Index actual_n = n;
+  // Note that here actual_n is computed based on col0(i)==0 instead of diag(i)==0 as above
+  // because 1) we have diag(i)==0 => col0(i)==0 and 2) if col0(i)==0, then diag(i) is already a singular value.
+  while (actual_n > 1 && numext::is_exactly_zero(col0(actual_n - 1))) --actual_n;
+
+  for (Index k = 0; k < n; ++k) {
+    if (numext::is_exactly_zero(col0(k)) || actual_n == 1) {
+      // if col0(k) == 0, then entry is deflated, so singular value is on diagonal
+      // if actual_n==1, then the deflated problem is already diagonalized
+      singVals(k) = k == 0 ? col0(0) : diag(k);
+      mus(k) = Literal(0);
+      shifts(k) = k == 0 ? col0(0) : diag(k);
+      continue;
+    }
+
+    // otherwise, use secular equation to find singular value
+    RealScalar left = diag(k);
+    RealScalar right;  // was: = (k != actual_n-1) ? diag(k+1) : (diag(actual_n-1) + col0.matrix().norm());
+    if (k == actual_n - 1)
+      right = (diag(actual_n - 1) + col0.matrix().norm());
+    else {
+      // Skip deflated singular values,
+      // recall that at this stage we assume that z[j]!=0 and all entries for which z[j]==0 have been put aside.
+      // This should be equivalent to using perm[]
+      Index l = k + 1;
+      while (numext::is_exactly_zero(col0(l))) {
+        ++l;
+        eigen_internal_assert(l < actual_n);
+      }
+      right = diag(l);
+    }
+
+    // first decide whether it's closer to the left end or the right end
+    RealScalar mid = left + (right - left) / Literal(2);
+    RealScalar fMid = secularEq(mid, col0, diag, perm, diag, Literal(0));
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+    std::cout << "right-left = " << right - left << "\n";
+    //     std::cout << "fMid = " << fMid << " " << secularEq(mid-left, col0, diag, perm, ArrayXr(diag-left), left)
+    //                            << " " << secularEq(mid-right, col0, diag, perm, ArrayXr(diag-right), right)   <<
+    //                            "\n";
+    std::cout << "     = " << secularEq(left + RealScalar(0.000001) * (right - left), col0, diag, perm, diag, 0) << " "
+              << secularEq(left + RealScalar(0.1) * (right - left), col0, diag, perm, diag, 0) << " "
+              << secularEq(left + RealScalar(0.2) * (right - left), col0, diag, perm, diag, 0) << " "
+              << secularEq(left + RealScalar(0.3) * (right - left), col0, diag, perm, diag, 0) << " "
+              << secularEq(left + RealScalar(0.4) * (right - left), col0, diag, perm, diag, 0) << " "
+              << secularEq(left + RealScalar(0.49) * (right - left), col0, diag, perm, diag, 0) << " "
+              << secularEq(left + RealScalar(0.5) * (right - left), col0, diag, perm, diag, 0) << " "
+              << secularEq(left + RealScalar(0.51) * (right - left), col0, diag, perm, diag, 0) << " "
+              << secularEq(left + RealScalar(0.6) * (right - left), col0, diag, perm, diag, 0) << " "
+              << secularEq(left + RealScalar(0.7) * (right - left), col0, diag, perm, diag, 0) << " "
+              << secularEq(left + RealScalar(0.8) * (right - left), col0, diag, perm, diag, 0) << " "
+              << secularEq(left + RealScalar(0.9) * (right - left), col0, diag, perm, diag, 0) << " "
+              << secularEq(left + RealScalar(0.999999) * (right - left), col0, diag, perm, diag, 0) << "\n";
+#endif
+    RealScalar shift = (k == actual_n - 1 || fMid > Literal(0)) ? left : right;
+
+    // measure everything relative to shift
+    Map<ArrayXr> diagShifted(m_workspace.data() + 4 * n, n);
+    diagShifted = diag - shift;
+
+    if (k != actual_n - 1) {
+      // check that after the shift, f(mid) is still negative:
+      RealScalar midShifted = (right - left) / RealScalar(2);
+      // we can test exact equality here, because shift comes from `... ? left : right`
+      if (numext::equal_strict(shift, right)) midShifted = -midShifted;
+      RealScalar fMidShifted = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
+      if (fMidShifted > 0) {
+        // fMid was erroneous, fix it:
+        shift = fMidShifted > Literal(0) ? left : right;
+        diagShifted = diag - shift;
+      }
+    }
+
+    // initial guess
+    RealScalar muPrev, muCur;
+    // we can test exact equality here, because shift comes from `... ? left : right`
+    if (numext::equal_strict(shift, left)) {
+      muPrev = (right - left) * RealScalar(0.1);
+      if (k == actual_n - 1)
+        muCur = right - left;
+      else
+        muCur = (right - left) * RealScalar(0.5);
+    } else {
+      muPrev = -(right - left) * RealScalar(0.1);
+      muCur = -(right - left) * RealScalar(0.5);
+    }
+
+    RealScalar fPrev = secularEq(muPrev, col0, diag, perm, diagShifted, shift);
+    RealScalar fCur = secularEq(muCur, col0, diag, perm, diagShifted, shift);
+    if (abs(fPrev) < abs(fCur)) {
+      swap(fPrev, fCur);
+      swap(muPrev, muCur);
+    }
+
+    // rational interpolation: fit a function of the form a / mu + b through the two previous
+    // iterates and use its zero to compute the next iterate
+    bool useBisection = fPrev * fCur > Literal(0);
+    while (!numext::is_exactly_zero(fCur) &&
+           abs(muCur - muPrev) >
+               Literal(8) * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(abs(muCur), abs(muPrev)) &&
+           abs(fCur - fPrev) > NumTraits<RealScalar>::epsilon() && !useBisection) {
+      ++m_numIters;
+
+      // Find a and b such that the function f(mu) = a / mu + b matches the current and previous samples.
+      RealScalar a = (fCur - fPrev) / (Literal(1) / muCur - Literal(1) / muPrev);
+      RealScalar b = fCur - a / muCur;
+      // And find mu such that f(mu)==0:
+      RealScalar muZero = -a / b;
+      RealScalar fZero = secularEq(muZero, col0, diag, perm, diagShifted, shift);
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+      eigen_internal_assert((numext::isfinite)(fZero));
+#endif
+
+      muPrev = muCur;
+      fPrev = fCur;
+      muCur = muZero;
+      fCur = fZero;
+
+      // we can test exact equality here, because shift comes from `... ? left : right`
+      if (numext::equal_strict(shift, left) && (muCur < Literal(0) || muCur > right - left)) useBisection = true;
+      if (numext::equal_strict(shift, right) && (muCur < -(right - left) || muCur > Literal(0))) useBisection = true;
+      if (abs(fCur) > abs(fPrev)) useBisection = true;
+    }
+
+    // fall back on bisection method if rational interpolation did not work
+    if (useBisection) {
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+      std::cout << "useBisection for k = " << k << ", actual_n = " << actual_n << "\n";
+#endif
+      RealScalar leftShifted, rightShifted;
+      // we can test exact equality here, because shift comes from `... ? left : right`
+      if (numext::equal_strict(shift, left)) {
+        // to avoid overflow, we must have mu > max(real_min, |z(k)|/sqrt(real_max)),
+        // the factor 2 is to be more conservative
+        leftShifted =
+            numext::maxi<RealScalar>((std::numeric_limits<RealScalar>::min)(),
+                                     Literal(2) * abs(col0(k)) / sqrt((std::numeric_limits<RealScalar>::max)()));
+
+        // check that we did it right:
+        eigen_internal_assert(
+            (numext::isfinite)((col0(k) / leftShifted) * (col0(k) / (diag(k) + shift + leftShifted))));
+        // I don't understand why the case k==0 would be special there:
+        // if (k == 0) rightShifted = right - left; else
+        rightShifted = (k == actual_n - 1)
+                           ? right
+                           : ((right - left) * RealScalar(0.51));  // theoretically we can take 0.5, but let's be safe
+      } else {
+        leftShifted = -(right - left) * RealScalar(0.51);
+        if (k + 1 < n)
+          rightShifted = -numext::maxi<RealScalar>((std::numeric_limits<RealScalar>::min)(),
+                                                   abs(col0(k + 1)) / sqrt((std::numeric_limits<RealScalar>::max)()));
+        else
+          rightShifted = -(std::numeric_limits<RealScalar>::min)();
+      }
+
+      RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift);
+      eigen_internal_assert(fLeft < Literal(0));
+
+#if defined EIGEN_BDCSVD_DEBUG_VERBOSE || defined EIGEN_BDCSVD_SANITY_CHECKS || defined EIGEN_INTERNAL_DEBUGGING
+      RealScalar fRight = secularEq(rightShifted, col0, diag, perm, diagShifted, shift);
+#endif
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+      if (!(numext::isfinite)(fLeft))
+        std::cout << "f(" << leftShifted << ") =" << fLeft << " ; " << left << " " << shift << " " << right << "\n";
+      eigen_internal_assert((numext::isfinite)(fLeft));
+
+      if (!(numext::isfinite)(fRight))
+        std::cout << "f(" << rightShifted << ") =" << fRight << " ; " << left << " " << shift << " " << right << "\n";
+        // eigen_internal_assert((numext::isfinite)(fRight));
+#endif
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+      if (!(fLeft * fRight < 0)) {
+        std::cout << "f(leftShifted) using  leftShifted=" << leftShifted
+                  << " ;  diagShifted(1:10):" << diagShifted.head(10).transpose() << "\n ; "
+                  << "left==shift=" << bool(left == shift) << " ; left-shift = " << (left - shift) << "\n";
+        std::cout << "k=" << k << ", " << fLeft << " * " << fRight << " == " << fLeft * fRight << "  ;  "
+                  << "[" << left << " .. " << right << "] -> [" << leftShifted << " " << rightShifted
+                  << "], shift=" << shift << " ,  f(right)=" << secularEq(0, col0, diag, perm, diagShifted, shift)
+                  << " == " << secularEq(right, col0, diag, perm, diag, 0) << " == " << fRight << "\n";
+      }
+#endif
+      eigen_internal_assert(fLeft * fRight < Literal(0));
+
+      if (fLeft < Literal(0)) {
+        while (rightShifted - leftShifted > Literal(2) * NumTraits<RealScalar>::epsilon() *
+                                                numext::maxi<RealScalar>(abs(leftShifted), abs(rightShifted))) {
+          RealScalar midShifted = (leftShifted + rightShifted) / Literal(2);
+          fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift);
+          eigen_internal_assert((numext::isfinite)(fMid));
+
+          if (fLeft * fMid < Literal(0)) {
+            rightShifted = midShifted;
+          } else {
+            leftShifted = midShifted;
+            fLeft = fMid;
+          }
+        }
+        muCur = (leftShifted + rightShifted) / Literal(2);
+      } else {
+        // We have a problem as shifting on the left or right give either a positive or negative value
+        // at the middle of [left,right]...
+        // Instead of abbording or entering an infinite loop,
+        // let's just use the middle as the estimated zero-crossing:
+        muCur = (right - left) * RealScalar(0.5);
+        // we can test exact equality here, because shift comes from `... ? left : right`
+        if (numext::equal_strict(shift, right)) muCur = -muCur;
+      }
+    }
+
+    singVals[k] = shift + muCur;
+    shifts[k] = shift;
+    mus[k] = muCur;
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+    if (k + 1 < n)
+      std::cout << "found " << singVals[k] << " == " << shift << " + " << muCur << " from " << diag(k) << " .. "
+                << diag(k + 1) << "\n";
+#endif
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+    eigen_internal_assert(k == 0 || singVals[k] >= singVals[k - 1]);
+    eigen_internal_assert(singVals[k] >= diag(k));
+#endif
+
+    // perturb singular value slightly if it equals diagonal entry to avoid division by zero later
+    // (deflation is supposed to avoid this from happening)
+    // - this does no seem to be necessary anymore -
+    // if (singVals[k] == left) singVals[k] *= 1 + NumTraits<RealScalar>::epsilon();
+    // if (singVals[k] == right) singVals[k] *= 1 - NumTraits<RealScalar>::epsilon();
+  }
+}
+
+// zhat is perturbation of col0 for which singular vectors can be computed stably (see Section 3.1)
+template <typename MatrixType, int Options>
+void BDCSVD<MatrixType, Options>::perturbCol0(const ArrayRef& col0, const ArrayRef& diag, const IndicesRef& perm,
+                                              const VectorType& singVals, const ArrayRef& shifts, const ArrayRef& mus,
+                                              ArrayRef zhat) {
+  using std::sqrt;
+  Index n = col0.size();
+  Index m = perm.size();
+  if (m == 0) {
+    zhat.setZero();
+    return;
+  }
+  Index lastIdx = perm(m - 1);
+  // The offset permits to skip deflated entries while computing zhat
+  for (Index k = 0; k < n; ++k) {
+    if (numext::is_exactly_zero(col0(k)))  // deflated
+      zhat(k) = Literal(0);
+    else {
+      // see equation (3.6)
+      RealScalar dk = diag(k);
+      RealScalar prod = (singVals(lastIdx) + dk) * (mus(lastIdx) + (shifts(lastIdx) - dk));
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+      if (prod < 0) {
+        std::cout << "k = " << k << " ;  z(k)=" << col0(k) << ", diag(k)=" << dk << "\n";
+        std::cout << "prod = "
+                  << "(" << singVals(lastIdx) << " + " << dk << ") * (" << mus(lastIdx) << " + (" << shifts(lastIdx)
+                  << " - " << dk << "))"
+                  << "\n";
+        std::cout << "     = " << singVals(lastIdx) + dk << " * " << mus(lastIdx) + (shifts(lastIdx) - dk) << "\n";
+      }
+      eigen_internal_assert(prod >= 0);
+#endif
+
+      for (Index l = 0; l < m; ++l) {
+        Index i = perm(l);
+        if (i != k) {
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+          if (i >= k && (l == 0 || l - 1 >= m)) {
+            std::cout << "Error in perturbCol0\n";
+            std::cout << "  " << k << "/" << n << " " << l << "/" << m << " " << i << "/" << n << " ; " << col0(k)
+                      << " " << diag(k) << " "
+                      << "\n";
+            std::cout << "  " << diag(i) << "\n";
+            Index j = (i < k /*|| l==0*/) ? i : perm(l - 1);
+            std::cout << "  "
+                      << "j=" << j << "\n";
+          }
+#endif
+          Index j = i < k ? i : l > 0 ? perm(l - 1) : i;
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+          if (!(dk != Literal(0) || diag(i) != Literal(0))) {
+            std::cout << "k=" << k << ", i=" << i << ", l=" << l << ", perm.size()=" << perm.size() << "\n";
+          }
+          eigen_internal_assert(dk != Literal(0) || diag(i) != Literal(0));
+#endif
+          prod *= ((singVals(j) + dk) / ((diag(i) + dk))) * ((mus(j) + (shifts(j) - dk)) / ((diag(i) - dk)));
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+          eigen_internal_assert(prod >= 0);
+#endif
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+          if (i != k &&
+              numext::abs(((singVals(j) + dk) * (mus(j) + (shifts(j) - dk))) / ((diag(i) + dk) * (diag(i) - dk)) - 1) >
+                  0.9)
+            std::cout << "     "
+                      << ((singVals(j) + dk) * (mus(j) + (shifts(j) - dk))) / ((diag(i) + dk) * (diag(i) - dk))
+                      << " == (" << (singVals(j) + dk) << " * " << (mus(j) + (shifts(j) - dk)) << ") / ("
+                      << (diag(i) + dk) << " * " << (diag(i) - dk) << ")\n";
+#endif
+        }
+      }
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+      std::cout << "zhat(" << k << ") =  sqrt( " << prod << ")  ;  " << (singVals(lastIdx) + dk) << " * "
+                << mus(lastIdx) + shifts(lastIdx) << " - " << dk << "\n";
+#endif
+      RealScalar tmp = sqrt(prod);
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+      eigen_internal_assert((numext::isfinite)(tmp));
+#endif
+      zhat(k) = col0(k) > Literal(0) ? RealScalar(tmp) : RealScalar(-tmp);
+    }
+  }
+}
+
+// compute singular vectors
+template <typename MatrixType, int Options>
+void BDCSVD<MatrixType, Options>::computeSingVecs(const ArrayRef& zhat, const ArrayRef& diag, const IndicesRef& perm,
+                                                  const VectorType& singVals, const ArrayRef& shifts,
+                                                  const ArrayRef& mus, MatrixXr& U, MatrixXr& V) {
+  Index n = zhat.size();
+  Index m = perm.size();
+
+  for (Index k = 0; k < n; ++k) {
+    if (numext::is_exactly_zero(zhat(k))) {
+      U.col(k) = VectorType::Unit(n + 1, k);
+      if (m_compV) V.col(k) = VectorType::Unit(n, k);
+    } else {
+      U.col(k).setZero();
+      for (Index l = 0; l < m; ++l) {
+        Index i = perm(l);
+        U(i, k) = zhat(i) / (((diag(i) - shifts(k)) - mus(k))) / ((diag(i) + singVals[k]));
+      }
+      U(n, k) = Literal(0);
+      U.col(k).normalize();
+
+      if (m_compV) {
+        V.col(k).setZero();
+        for (Index l = 1; l < m; ++l) {
+          Index i = perm(l);
+          V(i, k) = diag(i) * zhat(i) / (((diag(i) - shifts(k)) - mus(k))) / ((diag(i) + singVals[k]));
+        }
+        V(0, k) = Literal(-1);
+        V.col(k).normalize();
+      }
+    }
+  }
+  U.col(n) = VectorType::Unit(n + 1, n);
+}
+
+// page 12_13
+// i >= 1, di almost null and zi non null.
+// We use a rotation to zero out zi applied to the left of M, and set di = 0.
+template <typename MatrixType, int Options>
+void BDCSVD<MatrixType, Options>::deflation43(Index firstCol, Index shift, Index i, Index size) {
+  using std::abs;
+  using std::pow;
+  using std::sqrt;
+  Index start = firstCol + shift;
+  RealScalar c = m_computed(start, start);
+  RealScalar s = m_computed(start + i, start);
+  RealScalar r = numext::hypot(c, s);
+  if (numext::is_exactly_zero(r)) {
+    m_computed(start + i, start + i) = Literal(0);
+    return;
+  }
+  m_computed(start, start) = r;
+  m_computed(start + i, start) = Literal(0);
+  m_computed(start + i, start + i) = Literal(0);
+
+  JacobiRotation<RealScalar> J(c / r, -s / r);
+  if (m_compU)
+    m_naiveU.middleRows(firstCol, size + 1).applyOnTheRight(firstCol, firstCol + i, J);
+  else
+    m_naiveU.applyOnTheRight(firstCol, firstCol + i, J);
+}  // end deflation 43
+
+// page 13
+// i,j >= 1, i > j, and |di - dj| < epsilon * norm2(M)
+// We apply two rotations to have zi = 0, and dj = di.
+template <typename MatrixType, int Options>
+void BDCSVD<MatrixType, Options>::deflation44(Index firstColu, Index firstColm, Index firstRowW, Index firstColW,
+                                              Index i, Index j, Index size) {
+  using std::abs;
+  using std::conj;
+  using std::pow;
+  using std::sqrt;
+
+  RealScalar s = m_computed(firstColm + i, firstColm);
+  RealScalar c = m_computed(firstColm + j, firstColm);
+  RealScalar r = numext::hypot(c, s);
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "deflation 4.4: " << i << "," << j << " -> " << c << " " << s << " " << r << " ; "
+            << m_computed(firstColm + i - 1, firstColm) << " " << m_computed(firstColm + i, firstColm) << " "
+            << m_computed(firstColm + i + 1, firstColm) << " " << m_computed(firstColm + i + 2, firstColm) << "\n";
+  std::cout << m_computed(firstColm + i - 1, firstColm + i - 1) << " " << m_computed(firstColm + i, firstColm + i)
+            << " " << m_computed(firstColm + i + 1, firstColm + i + 1) << " "
+            << m_computed(firstColm + i + 2, firstColm + i + 2) << "\n";
+#endif
+  if (numext::is_exactly_zero(r)) {
+    m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i);
+    return;
+  }
+  c /= r;
+  s /= r;
+  m_computed(firstColm + j, firstColm) = r;
+  m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i);
+  m_computed(firstColm + i, firstColm) = Literal(0);
+
+  JacobiRotation<RealScalar> J(c, -s);
+  if (m_compU)
+    m_naiveU.middleRows(firstColu, size + 1).applyOnTheRight(firstColu + j, firstColu + i, J);
+  else
+    m_naiveU.applyOnTheRight(firstColu + j, firstColu + i, J);
+  if (m_compV) m_naiveV.middleRows(firstRowW, size).applyOnTheRight(firstColW + j, firstColW + i, J);
+}  // end deflation 44
+
+// acts on block from (firstCol+shift, firstCol+shift) to (lastCol+shift, lastCol+shift) [inclusive]
+template <typename MatrixType, int Options>
+void BDCSVD<MatrixType, Options>::deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW,
+                                            Index shift) {
+  using std::abs;
+  using std::sqrt;
+  const Index length = lastCol + 1 - firstCol;
+
+  Block<MatrixXr, Dynamic, 1> col0(m_computed, firstCol + shift, firstCol + shift, length, 1);
+  Diagonal<MatrixXr> fulldiag(m_computed);
+  VectorBlock<Diagonal<MatrixXr>, Dynamic> diag(fulldiag, firstCol + shift, length);
+
+  const RealScalar considerZero = (std::numeric_limits<RealScalar>::min)();
+  RealScalar maxDiag = diag.tail((std::max)(Index(1), length - 1)).cwiseAbs().maxCoeff();
+  RealScalar epsilon_strict = numext::maxi<RealScalar>(considerZero, NumTraits<RealScalar>::epsilon() * maxDiag);
+  RealScalar epsilon_coarse =
+      Literal(8) * NumTraits<RealScalar>::epsilon() * numext::maxi<RealScalar>(col0.cwiseAbs().maxCoeff(), maxDiag);
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  eigen_internal_assert(m_naiveU.allFinite());
+  eigen_internal_assert(m_naiveV.allFinite());
+  eigen_internal_assert(m_computed.allFinite());
+#endif
+
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "\ndeflate:" << diag.head(k + 1).transpose() << "  |  "
+            << diag.segment(k + 1, length - k - 1).transpose() << "\n";
+#endif
+
+  // condition 4.1
+  if (diag(0) < epsilon_coarse) {
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+    std::cout << "deflation 4.1, because " << diag(0) << " < " << epsilon_coarse << "\n";
+#endif
+    diag(0) = epsilon_coarse;
+  }
+
+  // condition 4.2
+  for (Index i = 1; i < length; ++i)
+    if (abs(col0(i)) < epsilon_strict) {
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+      std::cout << "deflation 4.2, set z(" << i << ") to zero because " << abs(col0(i)) << " < " << epsilon_strict
+                << "  (diag(" << i << ")=" << diag(i) << ")\n";
+#endif
+      col0(i) = Literal(0);
+    }
+
+  // condition 4.3
+  for (Index i = 1; i < length; i++)
+    if (diag(i) < epsilon_coarse) {
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+      std::cout << "deflation 4.3, cancel z(" << i << ")=" << col0(i) << " because diag(" << i << ")=" << diag(i)
+                << " < " << epsilon_coarse << "\n";
+#endif
+      deflation43(firstCol, shift, i, length);
+    }
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  eigen_internal_assert(m_naiveU.allFinite());
+  eigen_internal_assert(m_naiveV.allFinite());
+  eigen_internal_assert(m_computed.allFinite());
+#endif
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "to be sorted: " << diag.transpose() << "\n\n";
+  std::cout << "            : " << col0.transpose() << "\n\n";
+#endif
+  {
+    // Check for total deflation:
+    // If we have a total deflation, then we have to consider col0(0)==diag(0) as a singular value during sorting.
+    const bool total_deflation = (col0.tail(length - 1).array().abs() < considerZero).all();
+
+    // Sort the diagonal entries, since diag(1:k-1) and diag(k:length) are already sorted, let's do a sorted merge.
+    // First, compute the respective permutation.
+    Index* permutation = m_workspaceI.data();
+    {
+      permutation[0] = 0;
+      Index p = 1;
+
+      // Move deflated diagonal entries at the end.
+      for (Index i = 1; i < length; ++i)
+        if (diag(i) < considerZero) permutation[p++] = i;
+
+      Index i = 1, j = k + 1;
+      for (; p < length; ++p) {
+        if (i > k)
+          permutation[p] = j++;
+        else if (j >= length)
+          permutation[p] = i++;
+        else if (diag(i) < diag(j))
+          permutation[p] = j++;
+        else
+          permutation[p] = i++;
+      }
+    }
+
+    // If we have a total deflation, then we have to insert diag(0) at the right place
+    if (total_deflation) {
+      for (Index i = 1; i < length; ++i) {
+        Index pi = permutation[i];
+        if (diag(pi) < considerZero || diag(0) < diag(pi))
+          permutation[i - 1] = permutation[i];
+        else {
+          permutation[i - 1] = 0;
+          break;
+        }
+      }
+    }
+
+    // Current index of each col, and current column of each index
+    Index* realInd = m_workspaceI.data() + length;
+    Index* realCol = m_workspaceI.data() + 2 * length;
+
+    for (int pos = 0; pos < length; pos++) {
+      realCol[pos] = pos;
+      realInd[pos] = pos;
+    }
+
+    for (Index i = total_deflation ? 0 : 1; i < length; i++) {
+      const Index pi = permutation[length - (total_deflation ? i + 1 : i)];
+      const Index J = realCol[pi];
+
+      using std::swap;
+      // swap diagonal and first column entries:
+      swap(diag(i), diag(J));
+      if (i != 0 && J != 0) swap(col0(i), col0(J));
+
+      // change columns
+      if (m_compU)
+        m_naiveU.col(firstCol + i)
+            .segment(firstCol, length + 1)
+            .swap(m_naiveU.col(firstCol + J).segment(firstCol, length + 1));
+      else
+        m_naiveU.col(firstCol + i).segment(0, 2).swap(m_naiveU.col(firstCol + J).segment(0, 2));
+      if (m_compV)
+        m_naiveV.col(firstColW + i)
+            .segment(firstRowW, length)
+            .swap(m_naiveV.col(firstColW + J).segment(firstRowW, length));
+
+      // update real pos
+      const Index realI = realInd[i];
+      realCol[realI] = J;
+      realCol[pi] = i;
+      realInd[J] = realI;
+      realInd[i] = pi;
+    }
+  }
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+  std::cout << "sorted: " << diag.transpose().format(bdcsvdfmt) << "\n";
+  std::cout << "      : " << col0.transpose() << "\n\n";
+#endif
+
+  // condition 4.4
+  {
+    Index i = length - 1;
+    // Find last non-deflated entry.
+    while (i > 0 && (diag(i) < considerZero || abs(col0(i)) < considerZero)) --i;
+
+    for (; i > 1; --i)
+      if ((diag(i) - diag(i - 1)) < epsilon_strict) {
+#ifdef EIGEN_BDCSVD_DEBUG_VERBOSE
+        std::cout << "deflation 4.4 with i = " << i << " because " << diag(i) << " - " << diag(i - 1)
+                  << " == " << (diag(i) - diag(i - 1)) << " < " << epsilon_strict << "\n";
+#endif
+        eigen_internal_assert(abs(diag(i) - diag(i - 1)) < epsilon_coarse &&
+                              " diagonal entries are not properly sorted");
+        deflation44(firstCol, firstCol + shift, firstRowW, firstColW, i, i - 1, length);
+      }
+  }
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  for (Index j = 2; j < length; ++j) eigen_internal_assert(diag(j - 1) <= diag(j) || abs(diag(j)) < considerZero);
+#endif
+
+#ifdef EIGEN_BDCSVD_SANITY_CHECKS
+  eigen_internal_assert(m_naiveU.allFinite());
+  eigen_internal_assert(m_naiveV.allFinite());
+  eigen_internal_assert(m_computed.allFinite());
+#endif
+}  // end deflation
+
+/** \svd_module
+ *
+ * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm
+ *
+ * \sa class BDCSVD
+ */
+template <typename Derived>
+template <int Options>
+BDCSVD<typename MatrixBase<Derived>::PlainObject, Options> MatrixBase<Derived>::bdcSvd() const {
+  return BDCSVD<PlainObject, Options>(*this);
+}
+
+/** \svd_module
+ *
+ * \return the singular value decomposition of \c *this computed by Divide & Conquer algorithm
+ *
+ * \sa class BDCSVD
+ */
+template <typename Derived>
+template <int Options>
+BDCSVD<typename MatrixBase<Derived>::PlainObject, Options> MatrixBase<Derived>::bdcSvd(
+    unsigned int computationOptions) const {
+  return BDCSVD<PlainObject, Options>(*this, computationOptions);
+}
+
+}  // end namespace Eigen
+
+#endif
diff --git a/inst/include/Eigen/src/SVD/BDCSVD_LAPACKE.h b/inst/include/Eigen/src/SVD/BDCSVD_LAPACKE.h
new file mode 100644
index 00000000..5d2b8c71
--- /dev/null
+++ b/inst/include/Eigen/src/SVD/BDCSVD_LAPACKE.h
@@ -0,0 +1,174 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2022 Melven Roehrig-Zoellner <Melven.Roehrig-Zoellner@DLR.de>
+// Copyright (c) 2011, Intel Corporation. All rights reserved.
+//
+// This file is based on the JacobiSVD_LAPACKE.h originally from Intel -
+// see license notice below:
+/*
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to LAPACKe
+ *    Singular Value Decomposition - SVD (divide and conquer variant)
+ ********************************************************************************
+*/
+#ifndef EIGEN_BDCSVD_LAPACKE_H
+#define EIGEN_BDCSVD_LAPACKE_H
+
+namespace Eigen {
+
+namespace internal {
+
+namespace lapacke_helpers {
+
+/** \internal Specialization for the data types supported by LAPACKe */
+
+// defining a derived class to allow access to protected members
+template <typename MatrixType_, int Options>
+class BDCSVD_LAPACKE : public BDCSVD<MatrixType_, Options> {
+  typedef BDCSVD<MatrixType_, Options> SVD;
+  typedef typename SVD::MatrixType MatrixType;
+  typedef typename SVD::Scalar Scalar;
+  typedef typename SVD::RealScalar RealScalar;
+
+ public:
+  // construct this by moving from a parent object
+  BDCSVD_LAPACKE(SVD&& svd) : SVD(std::move(svd)) {}
+
+  template <typename Derived>
+  void compute_impl_lapacke(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
+    SVD::allocate(matrix.rows(), matrix.cols(), computationOptions);
+
+    SVD::m_nonzeroSingularValues = SVD::m_diagSize;
+
+    // prepare arguments to ?gesdd
+    const lapack_int matrix_order = lapack_storage_of(matrix);
+    const char jobz = (SVD::m_computeFullU || SVD::m_computeFullV)   ? 'A'
+                      : (SVD::m_computeThinU || SVD::m_computeThinV) ? 'S'
+                                                                     : 'N';
+    const lapack_int u_cols = (jobz == 'A') ? to_lapack(SVD::rows()) : (jobz == 'S') ? to_lapack(SVD::diagSize()) : 1;
+    const lapack_int vt_rows = (jobz == 'A') ? to_lapack(SVD::cols()) : (jobz == 'S') ? to_lapack(SVD::diagSize()) : 1;
+    lapack_int ldu, ldvt;
+    Scalar *u, *vt, dummy;
+    MatrixType localU;
+    if (SVD::computeU() && !(SVD::m_computeThinU && SVD::m_computeFullV)) {
+      ldu = to_lapack(SVD::m_matrixU.outerStride());
+      u = SVD::m_matrixU.data();
+    } else if (SVD::computeV()) {
+      localU.resize(SVD::rows(), u_cols);
+      ldu = to_lapack(localU.outerStride());
+      u = localU.data();
+    } else {
+      ldu = 1;
+      u = &dummy;
+    }
+    MatrixType localV;
+    if (SVD::computeU() || SVD::computeV()) {
+      localV.resize(vt_rows, SVD::cols());
+      ldvt = to_lapack(localV.outerStride());
+      vt = localV.data();
+    } else {
+      ldvt = 1;
+      vt = &dummy;
+    }
+    MatrixType temp;
+    temp = matrix;
+
+    // actual call to ?gesdd
+    lapack_int info = gesdd(matrix_order, jobz, to_lapack(SVD::rows()), to_lapack(SVD::cols()), to_lapack(temp.data()),
+                            to_lapack(temp.outerStride()), (RealScalar*)SVD::m_singularValues.data(), to_lapack(u), ldu,
+                            to_lapack(vt), ldvt);
+
+    // Check the result of the LAPACK call
+    if (info < 0 || !SVD::m_singularValues.allFinite()) {
+      // this includes info == -4 => NaN entry in A
+      SVD::m_info = InvalidInput;
+    } else if (info > 0) {
+      SVD::m_info = NoConvergence;
+    } else {
+      SVD::m_info = Success;
+      if (SVD::m_computeThinU && SVD::m_computeFullV) {
+        SVD::m_matrixU = localU.leftCols(SVD::m_matrixU.cols());
+      }
+      if (SVD::computeV()) {
+        SVD::m_matrixV = localV.adjoint().leftCols(SVD::m_matrixV.cols());
+      }
+    }
+    SVD::m_isInitialized = true;
+  }
+};
+
+template <typename MatrixType_, int Options, typename Derived>
+BDCSVD<MatrixType_, Options>& BDCSVD_wrapper(BDCSVD<MatrixType_, Options>& svd, const MatrixBase<Derived>& matrix,
+                                             int computationOptions) {
+  // we need to move to the wrapper type and back
+  BDCSVD_LAPACKE<MatrixType_, Options> tmpSvd(std::move(svd));
+  tmpSvd.compute_impl_lapacke(matrix, computationOptions);
+  svd = std::move(tmpSvd);
+  return svd;
+}
+
+}  // end namespace lapacke_helpers
+
+}  // end namespace internal
+
+#define EIGEN_LAPACKE_SDD(EIGTYPE, EIGCOLROW, OPTIONS)                                           \
+  template <>                                                                                    \
+  template <typename Derived>                                                                    \
+  inline BDCSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>&        \
+  BDCSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>::compute_impl( \
+      const MatrixBase<Derived>& matrix, unsigned int computationOptions) {                      \
+    return internal::lapacke_helpers::BDCSVD_wrapper(*this, matrix, computationOptions);         \
+  }
+
+#define EIGEN_LAPACK_SDD_OPTIONS(OPTIONS)        \
+  EIGEN_LAPACKE_SDD(double, ColMajor, OPTIONS)   \
+  EIGEN_LAPACKE_SDD(float, ColMajor, OPTIONS)    \
+  EIGEN_LAPACKE_SDD(dcomplex, ColMajor, OPTIONS) \
+  EIGEN_LAPACKE_SDD(scomplex, ColMajor, OPTIONS) \
+                                                 \
+  EIGEN_LAPACKE_SDD(double, RowMajor, OPTIONS)   \
+  EIGEN_LAPACKE_SDD(float, RowMajor, OPTIONS)    \
+  EIGEN_LAPACKE_SDD(dcomplex, RowMajor, OPTIONS) \
+  EIGEN_LAPACKE_SDD(scomplex, RowMajor, OPTIONS)
+
+EIGEN_LAPACK_SDD_OPTIONS(0)
+EIGEN_LAPACK_SDD_OPTIONS(ComputeThinU)
+EIGEN_LAPACK_SDD_OPTIONS(ComputeThinV)
+EIGEN_LAPACK_SDD_OPTIONS(ComputeFullU)
+EIGEN_LAPACK_SDD_OPTIONS(ComputeFullV)
+EIGEN_LAPACK_SDD_OPTIONS(ComputeThinU | ComputeThinV)
+EIGEN_LAPACK_SDD_OPTIONS(ComputeFullU | ComputeFullV)
+EIGEN_LAPACK_SDD_OPTIONS(ComputeThinU | ComputeFullV)
+EIGEN_LAPACK_SDD_OPTIONS(ComputeFullU | ComputeThinV)
+
+#undef EIGEN_LAPACK_SDD_OPTIONS
+
+#undef EIGEN_LAPACKE_SDD
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_BDCSVD_LAPACKE_H
diff --git a/inst/include/Eigen/src/SVD/InternalHeaderCheck.h b/inst/include/Eigen/src/SVD/InternalHeaderCheck.h
new file mode 100644
index 00000000..fa67b96a
--- /dev/null
+++ b/inst/include/Eigen/src/SVD/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_SVD_MODULE_H
+#error "Please include Eigen/SVD instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/SVD/JacobiSVD.h b/inst/include/Eigen/src/SVD/JacobiSVD.h
index 1b297741..dfcb6df5 100644
--- a/inst/include/Eigen/src/SVD/JacobiSVD.h
+++ b/inst/include/Eigen/src/SVD/JacobiSVD.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2013-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,13 +11,16 @@
 #ifndef EIGEN_JACOBISVD_H
 #define EIGEN_JACOBISVD_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
+
 // forward declaration (needed by ICC)
 // the empty body is required by MSVC
-template<typename MatrixType, int QRPreconditioner,
-         bool IsComplex = NumTraits<typename MatrixType::Scalar>::IsComplex>
+template <typename MatrixType, int Options, bool IsComplex = NumTraits<typename MatrixType::Scalar>::IsComplex>
 struct svd_precondition_2x2_block_to_be_real {};
 
 /*** QR preconditioners (R-SVD)
@@ -28,325 +32,326 @@ struct svd_precondition_2x2_block_to_be_real {};
 
 enum { PreconditionIfMoreColsThanRows, PreconditionIfMoreRowsThanCols };
 
-template<typename MatrixType, int QRPreconditioner, int Case>
-struct qr_preconditioner_should_do_anything
-{
-  enum { a = MatrixType::RowsAtCompileTime != Dynamic &&
-             MatrixType::ColsAtCompileTime != Dynamic &&
-             MatrixType::ColsAtCompileTime <= MatrixType::RowsAtCompileTime,
-         b = MatrixType::RowsAtCompileTime != Dynamic &&
-             MatrixType::ColsAtCompileTime != Dynamic &&
-             MatrixType::RowsAtCompileTime <= MatrixType::ColsAtCompileTime,
-         ret = !( (QRPreconditioner == NoQRPreconditioner) ||
-                  (Case == PreconditionIfMoreColsThanRows && bool(a)) ||
-                  (Case == PreconditionIfMoreRowsThanCols && bool(b)) )
+template <typename MatrixType, int QRPreconditioner, int Case>
+struct qr_preconditioner_should_do_anything {
+  enum {
+    a = MatrixType::RowsAtCompileTime != Dynamic && MatrixType::ColsAtCompileTime != Dynamic &&
+        MatrixType::ColsAtCompileTime <= MatrixType::RowsAtCompileTime,
+    b = MatrixType::RowsAtCompileTime != Dynamic && MatrixType::ColsAtCompileTime != Dynamic &&
+        MatrixType::RowsAtCompileTime <= MatrixType::ColsAtCompileTime,
+    ret = !((QRPreconditioner == NoQRPreconditioner) || (Case == PreconditionIfMoreColsThanRows && bool(a)) ||
+            (Case == PreconditionIfMoreRowsThanCols && bool(b)))
   };
 };
 
-template<typename MatrixType, int QRPreconditioner, int Case,
-         bool DoAnything = qr_preconditioner_should_do_anything<MatrixType, QRPreconditioner, Case>::ret
-> struct qr_preconditioner_impl {};
-
-template<typename MatrixType, int QRPreconditioner, int Case>
-class qr_preconditioner_impl<MatrixType, QRPreconditioner, Case, false>
-{
-public:
-  typedef typename MatrixType::Index Index;
-  void allocate(const JacobiSVD<MatrixType, QRPreconditioner>&) {}
-  bool run(JacobiSVD<MatrixType, QRPreconditioner>&, const MatrixType&)
-  {
+template <typename MatrixType, int Options, int QRPreconditioner, int Case,
+          bool DoAnything = qr_preconditioner_should_do_anything<MatrixType, QRPreconditioner, Case>::ret>
+struct qr_preconditioner_impl {};
+
+template <typename MatrixType, int Options, int QRPreconditioner, int Case>
+class qr_preconditioner_impl<MatrixType, Options, QRPreconditioner, Case, false> {
+ public:
+  void allocate(const JacobiSVD<MatrixType, Options>&) {}
+  template <typename Xpr>
+  bool run(JacobiSVD<MatrixType, Options>&, const Xpr&) {
     return false;
   }
 };
 
 /*** preconditioner using FullPivHouseholderQR ***/
 
-template<typename MatrixType>
-class qr_preconditioner_impl<MatrixType, FullPivHouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols, true>
-{
-public:
-  typedef typename MatrixType::Index Index;
+template <typename MatrixType, int Options>
+class qr_preconditioner_impl<MatrixType, Options, FullPivHouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols,
+                             true> {
+ public:
   typedef typename MatrixType::Scalar Scalar;
-  enum
-  {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime
-  };
-  typedef Matrix<Scalar, 1, RowsAtCompileTime, RowMajor, 1, MaxRowsAtCompileTime> WorkspaceType;
-
-  void allocate(const JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd)
-  {
-    if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols())
-    {
-      m_qr.~QRType();
-      ::new (&m_qr) QRType(svd.rows(), svd.cols());
+  typedef JacobiSVD<MatrixType, Options> SVDType;
+
+  enum { WorkspaceSize = MatrixType::RowsAtCompileTime, MaxWorkspaceSize = MatrixType::MaxRowsAtCompileTime };
+
+  typedef Matrix<Scalar, 1, WorkspaceSize, RowMajor, 1, MaxWorkspaceSize> WorkspaceType;
+
+  void allocate(const SVDType& svd) {
+    if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols()) {
+      internal::destroy_at(&m_qr);
+      internal::construct_at(&m_qr, svd.rows(), svd.cols());
     }
     if (svd.m_computeFullU) m_workspace.resize(svd.rows());
   }
-
-  bool run(JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd, const MatrixType& matrix)
-  {
-    if(matrix.rows() > matrix.cols())
-    {
+  template <typename Xpr>
+  bool run(SVDType& svd, const Xpr& matrix) {
+    if (matrix.rows() > matrix.cols()) {
       m_qr.compute(matrix);
-      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.cols(),matrix.cols()).template triangularView<Upper>();
-      if(svd.m_computeFullU) m_qr.matrixQ().evalTo(svd.m_matrixU, m_workspace);
-      if(svd.computeV()) svd.m_matrixV = m_qr.colsPermutation();
+      svd.m_workMatrix = m_qr.matrixQR().block(0, 0, matrix.cols(), matrix.cols()).template triangularView<Upper>();
+      if (svd.m_computeFullU) m_qr.matrixQ().evalTo(svd.m_matrixU, m_workspace);
+      if (svd.computeV()) svd.m_matrixV = m_qr.colsPermutation();
       return true;
     }
     return false;
   }
-private:
+
+ private:
   typedef FullPivHouseholderQR<MatrixType> QRType;
   QRType m_qr;
   WorkspaceType m_workspace;
 };
 
-template<typename MatrixType>
-class qr_preconditioner_impl<MatrixType, FullPivHouseholderQRPreconditioner, PreconditionIfMoreColsThanRows, true>
-{
-public:
-  typedef typename MatrixType::Index Index;
+template <typename MatrixType, int Options>
+class qr_preconditioner_impl<MatrixType, Options, FullPivHouseholderQRPreconditioner, PreconditionIfMoreColsThanRows,
+                             true> {
+ public:
   typedef typename MatrixType::Scalar Scalar;
-  enum
-  {
+  typedef JacobiSVD<MatrixType, Options> SVDType;
+
+  enum {
     RowsAtCompileTime = MatrixType::RowsAtCompileTime,
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Options = MatrixType::Options
+    MatrixOptions = traits<MatrixType>::Options
   };
-  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
-          TransposeTypeWithSameStorageOrder;
-
-  void allocate(const JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd)
-  {
-    if (svd.cols() != m_qr.rows() || svd.rows() != m_qr.cols())
-    {
-      m_qr.~QRType();
-      ::new (&m_qr) QRType(svd.cols(), svd.rows());
+
+  typedef typename internal::make_proper_matrix_type<Scalar, ColsAtCompileTime, RowsAtCompileTime, MatrixOptions,
+                                                     MaxColsAtCompileTime, MaxRowsAtCompileTime>::type
+      TransposeTypeWithSameStorageOrder;
+
+  void allocate(const SVDType& svd) {
+    if (svd.cols() != m_qr.rows() || svd.rows() != m_qr.cols()) {
+      internal::destroy_at(&m_qr);
+      internal::construct_at(&m_qr, svd.cols(), svd.rows());
     }
-    m_adjoint.resize(svd.cols(), svd.rows());
     if (svd.m_computeFullV) m_workspace.resize(svd.cols());
   }
-
-  bool run(JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd, const MatrixType& matrix)
-  {
-    if(matrix.cols() > matrix.rows())
-    {
-      m_adjoint = matrix.adjoint();
-      m_qr.compute(m_adjoint);
-      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.rows(),matrix.rows()).template triangularView<Upper>().adjoint();
-      if(svd.m_computeFullV) m_qr.matrixQ().evalTo(svd.m_matrixV, m_workspace);
-      if(svd.computeU()) svd.m_matrixU = m_qr.colsPermutation();
+  template <typename Xpr>
+  bool run(SVDType& svd, const Xpr& matrix) {
+    if (matrix.cols() > matrix.rows()) {
+      m_qr.compute(matrix.adjoint());
+      svd.m_workMatrix =
+          m_qr.matrixQR().block(0, 0, matrix.rows(), matrix.rows()).template triangularView<Upper>().adjoint();
+      if (svd.m_computeFullV) m_qr.matrixQ().evalTo(svd.m_matrixV, m_workspace);
+      if (svd.computeU()) svd.m_matrixU = m_qr.colsPermutation();
       return true;
-    }
-    else return false;
+    } else
+      return false;
   }
-private:
+
+ private:
   typedef FullPivHouseholderQR<TransposeTypeWithSameStorageOrder> QRType;
   QRType m_qr;
-  TransposeTypeWithSameStorageOrder m_adjoint;
-  typename internal::plain_row_type<MatrixType>::type m_workspace;
+  typename plain_row_type<MatrixType>::type m_workspace;
 };
 
 /*** preconditioner using ColPivHouseholderQR ***/
 
-template<typename MatrixType>
-class qr_preconditioner_impl<MatrixType, ColPivHouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols, true>
-{
-public:
-  typedef typename MatrixType::Index Index;
-
-  void allocate(const JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd)
-  {
-    if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols())
-    {
-      m_qr.~QRType();
-      ::new (&m_qr) QRType(svd.rows(), svd.cols());
+template <typename MatrixType, int Options>
+class qr_preconditioner_impl<MatrixType, Options, ColPivHouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols,
+                             true> {
+ public:
+  typedef typename MatrixType::Scalar Scalar;
+  typedef JacobiSVD<MatrixType, Options> SVDType;
+
+  enum {
+    WorkspaceSize = internal::traits<SVDType>::MatrixUColsAtCompileTime,
+    MaxWorkspaceSize = internal::traits<SVDType>::MatrixUMaxColsAtCompileTime
+  };
+
+  typedef Matrix<Scalar, 1, WorkspaceSize, RowMajor, 1, MaxWorkspaceSize> WorkspaceType;
+
+  void allocate(const SVDType& svd) {
+    if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols()) {
+      internal::destroy_at(&m_qr);
+      internal::construct_at(&m_qr, svd.rows(), svd.cols());
     }
-    if (svd.m_computeFullU) m_workspace.resize(svd.rows());
-    else if (svd.m_computeThinU) m_workspace.resize(svd.cols());
+    if (svd.m_computeFullU)
+      m_workspace.resize(svd.rows());
+    else if (svd.m_computeThinU)
+      m_workspace.resize(svd.cols());
   }
-
-  bool run(JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd, const MatrixType& matrix)
-  {
-    if(matrix.rows() > matrix.cols())
-    {
+  template <typename Xpr>
+  bool run(SVDType& svd, const Xpr& matrix) {
+    if (matrix.rows() > matrix.cols()) {
       m_qr.compute(matrix);
-      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.cols(),matrix.cols()).template triangularView<Upper>();
-      if(svd.m_computeFullU) m_qr.householderQ().evalTo(svd.m_matrixU, m_workspace);
-      else if(svd.m_computeThinU)
-      {
+      svd.m_workMatrix = m_qr.matrixQR().block(0, 0, matrix.cols(), matrix.cols()).template triangularView<Upper>();
+      if (svd.m_computeFullU)
+        m_qr.householderQ().evalTo(svd.m_matrixU, m_workspace);
+      else if (svd.m_computeThinU) {
         svd.m_matrixU.setIdentity(matrix.rows(), matrix.cols());
         m_qr.householderQ().applyThisOnTheLeft(svd.m_matrixU, m_workspace);
       }
-      if(svd.computeV()) svd.m_matrixV = m_qr.colsPermutation();
+      if (svd.computeV()) svd.m_matrixV = m_qr.colsPermutation();
       return true;
     }
     return false;
   }
 
-private:
+ private:
   typedef ColPivHouseholderQR<MatrixType> QRType;
   QRType m_qr;
-  typename internal::plain_col_type<MatrixType>::type m_workspace;
+  WorkspaceType m_workspace;
 };
 
-template<typename MatrixType>
-class qr_preconditioner_impl<MatrixType, ColPivHouseholderQRPreconditioner, PreconditionIfMoreColsThanRows, true>
-{
-public:
-  typedef typename MatrixType::Index Index;
+template <typename MatrixType, int Options>
+class qr_preconditioner_impl<MatrixType, Options, ColPivHouseholderQRPreconditioner, PreconditionIfMoreColsThanRows,
+                             true> {
+ public:
   typedef typename MatrixType::Scalar Scalar;
-  enum
-  {
+  typedef JacobiSVD<MatrixType, Options> SVDType;
+
+  enum {
     RowsAtCompileTime = MatrixType::RowsAtCompileTime,
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Options = MatrixType::Options
+    MatrixOptions = internal::traits<MatrixType>::Options,
+    WorkspaceSize = internal::traits<SVDType>::MatrixVColsAtCompileTime,
+    MaxWorkspaceSize = internal::traits<SVDType>::MatrixVMaxColsAtCompileTime
   };
 
-  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
-          TransposeTypeWithSameStorageOrder;
+  typedef Matrix<Scalar, WorkspaceSize, 1, ColMajor, MaxWorkspaceSize, 1> WorkspaceType;
+
+  typedef typename internal::make_proper_matrix_type<Scalar, ColsAtCompileTime, RowsAtCompileTime, MatrixOptions,
+                                                     MaxColsAtCompileTime, MaxRowsAtCompileTime>::type
+      TransposeTypeWithSameStorageOrder;
 
-  void allocate(const JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd)
-  {
-    if (svd.cols() != m_qr.rows() || svd.rows() != m_qr.cols())
-    {
-      m_qr.~QRType();
-      ::new (&m_qr) QRType(svd.cols(), svd.rows());
+  void allocate(const SVDType& svd) {
+    if (svd.cols() != m_qr.rows() || svd.rows() != m_qr.cols()) {
+      internal::destroy_at(&m_qr);
+      internal::construct_at(&m_qr, svd.cols(), svd.rows());
     }
-    if (svd.m_computeFullV) m_workspace.resize(svd.cols());
-    else if (svd.m_computeThinV) m_workspace.resize(svd.rows());
-    m_adjoint.resize(svd.cols(), svd.rows());
+    if (svd.m_computeFullV)
+      m_workspace.resize(svd.cols());
+    else if (svd.m_computeThinV)
+      m_workspace.resize(svd.rows());
   }
-
-  bool run(JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd, const MatrixType& matrix)
-  {
-    if(matrix.cols() > matrix.rows())
-    {
-      m_adjoint = matrix.adjoint();
-      m_qr.compute(m_adjoint);
-
-      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.rows(),matrix.rows()).template triangularView<Upper>().adjoint();
-      if(svd.m_computeFullV) m_qr.householderQ().evalTo(svd.m_matrixV, m_workspace);
-      else if(svd.m_computeThinV)
-      {
+  template <typename Xpr>
+  bool run(SVDType& svd, const Xpr& matrix) {
+    if (matrix.cols() > matrix.rows()) {
+      m_qr.compute(matrix.adjoint());
+
+      svd.m_workMatrix =
+          m_qr.matrixQR().block(0, 0, matrix.rows(), matrix.rows()).template triangularView<Upper>().adjoint();
+      if (svd.m_computeFullV)
+        m_qr.householderQ().evalTo(svd.m_matrixV, m_workspace);
+      else if (svd.m_computeThinV) {
         svd.m_matrixV.setIdentity(matrix.cols(), matrix.rows());
         m_qr.householderQ().applyThisOnTheLeft(svd.m_matrixV, m_workspace);
       }
-      if(svd.computeU()) svd.m_matrixU = m_qr.colsPermutation();
+      if (svd.computeU()) svd.m_matrixU = m_qr.colsPermutation();
       return true;
-    }
-    else return false;
+    } else
+      return false;
   }
 
-private:
+ private:
   typedef ColPivHouseholderQR<TransposeTypeWithSameStorageOrder> QRType;
   QRType m_qr;
-  TransposeTypeWithSameStorageOrder m_adjoint;
-  typename internal::plain_row_type<MatrixType>::type m_workspace;
+  WorkspaceType m_workspace;
 };
 
 /*** preconditioner using HouseholderQR ***/
 
-template<typename MatrixType>
-class qr_preconditioner_impl<MatrixType, HouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols, true>
-{
-public:
-  typedef typename MatrixType::Index Index;
-
-  void allocate(const JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd)
-  {
-    if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols())
-    {
-      m_qr.~QRType();
-      ::new (&m_qr) QRType(svd.rows(), svd.cols());
+template <typename MatrixType, int Options>
+class qr_preconditioner_impl<MatrixType, Options, HouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols, true> {
+ public:
+  typedef typename MatrixType::Scalar Scalar;
+  typedef JacobiSVD<MatrixType, Options> SVDType;
+
+  enum {
+    WorkspaceSize = internal::traits<SVDType>::MatrixUColsAtCompileTime,
+    MaxWorkspaceSize = internal::traits<SVDType>::MatrixUMaxColsAtCompileTime
+  };
+
+  typedef Matrix<Scalar, 1, WorkspaceSize, RowMajor, 1, MaxWorkspaceSize> WorkspaceType;
+
+  void allocate(const SVDType& svd) {
+    if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols()) {
+      internal::destroy_at(&m_qr);
+      internal::construct_at(&m_qr, svd.rows(), svd.cols());
     }
-    if (svd.m_computeFullU) m_workspace.resize(svd.rows());
-    else if (svd.m_computeThinU) m_workspace.resize(svd.cols());
+    if (svd.m_computeFullU)
+      m_workspace.resize(svd.rows());
+    else if (svd.m_computeThinU)
+      m_workspace.resize(svd.cols());
   }
-
-  bool run(JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd, const MatrixType& matrix)
-  {
-    if(matrix.rows() > matrix.cols())
-    {
+  template <typename Xpr>
+  bool run(SVDType& svd, const Xpr& matrix) {
+    if (matrix.rows() > matrix.cols()) {
       m_qr.compute(matrix);
-      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.cols(),matrix.cols()).template triangularView<Upper>();
-      if(svd.m_computeFullU) m_qr.householderQ().evalTo(svd.m_matrixU, m_workspace);
-      else if(svd.m_computeThinU)
-      {
+      svd.m_workMatrix = m_qr.matrixQR().block(0, 0, matrix.cols(), matrix.cols()).template triangularView<Upper>();
+      if (svd.m_computeFullU)
+        m_qr.householderQ().evalTo(svd.m_matrixU, m_workspace);
+      else if (svd.m_computeThinU) {
         svd.m_matrixU.setIdentity(matrix.rows(), matrix.cols());
         m_qr.householderQ().applyThisOnTheLeft(svd.m_matrixU, m_workspace);
       }
-      if(svd.computeV()) svd.m_matrixV.setIdentity(matrix.cols(), matrix.cols());
+      if (svd.computeV()) svd.m_matrixV.setIdentity(matrix.cols(), matrix.cols());
       return true;
     }
     return false;
   }
-private:
+
+ private:
   typedef HouseholderQR<MatrixType> QRType;
   QRType m_qr;
-  typename internal::plain_col_type<MatrixType>::type m_workspace;
+  WorkspaceType m_workspace;
 };
 
-template<typename MatrixType>
-class qr_preconditioner_impl<MatrixType, HouseholderQRPreconditioner, PreconditionIfMoreColsThanRows, true>
-{
-public:
-  typedef typename MatrixType::Index Index;
+template <typename MatrixType, int Options>
+class qr_preconditioner_impl<MatrixType, Options, HouseholderQRPreconditioner, PreconditionIfMoreColsThanRows, true> {
+ public:
   typedef typename MatrixType::Scalar Scalar;
-  enum
-  {
+  typedef JacobiSVD<MatrixType, Options> SVDType;
+
+  enum {
     RowsAtCompileTime = MatrixType::RowsAtCompileTime,
     ColsAtCompileTime = MatrixType::ColsAtCompileTime,
     MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
     MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Options = MatrixType::Options
+    MatrixOptions = internal::traits<MatrixType>::Options,
+    WorkspaceSize = internal::traits<SVDType>::MatrixVColsAtCompileTime,
+    MaxWorkspaceSize = internal::traits<SVDType>::MatrixVMaxColsAtCompileTime
   };
 
-  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
-          TransposeTypeWithSameStorageOrder;
+  typedef Matrix<Scalar, WorkspaceSize, 1, ColMajor, MaxWorkspaceSize, 1> WorkspaceType;
+
+  typedef typename internal::make_proper_matrix_type<Scalar, ColsAtCompileTime, RowsAtCompileTime, MatrixOptions,
+                                                     MaxColsAtCompileTime, MaxRowsAtCompileTime>::type
+      TransposeTypeWithSameStorageOrder;
 
-  void allocate(const JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd)
-  {
-    if (svd.cols() != m_qr.rows() || svd.rows() != m_qr.cols())
-    {
-      m_qr.~QRType();
-      ::new (&m_qr) QRType(svd.cols(), svd.rows());
+  void allocate(const SVDType& svd) {
+    if (svd.cols() != m_qr.rows() || svd.rows() != m_qr.cols()) {
+      internal::destroy_at(&m_qr);
+      internal::construct_at(&m_qr, svd.cols(), svd.rows());
     }
-    if (svd.m_computeFullV) m_workspace.resize(svd.cols());
-    else if (svd.m_computeThinV) m_workspace.resize(svd.rows());
-    m_adjoint.resize(svd.cols(), svd.rows());
+    if (svd.m_computeFullV)
+      m_workspace.resize(svd.cols());
+    else if (svd.m_computeThinV)
+      m_workspace.resize(svd.rows());
   }
 
-  bool run(JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd, const MatrixType& matrix)
-  {
-    if(matrix.cols() > matrix.rows())
-    {
-      m_adjoint = matrix.adjoint();
-      m_qr.compute(m_adjoint);
-
-      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.rows(),matrix.rows()).template triangularView<Upper>().adjoint();
-      if(svd.m_computeFullV) m_qr.householderQ().evalTo(svd.m_matrixV, m_workspace);
-      else if(svd.m_computeThinV)
-      {
+  template <typename Xpr>
+  bool run(SVDType& svd, const Xpr& matrix) {
+    if (matrix.cols() > matrix.rows()) {
+      m_qr.compute(matrix.adjoint());
+
+      svd.m_workMatrix =
+          m_qr.matrixQR().block(0, 0, matrix.rows(), matrix.rows()).template triangularView<Upper>().adjoint();
+      if (svd.m_computeFullV)
+        m_qr.householderQ().evalTo(svd.m_matrixV, m_workspace);
+      else if (svd.m_computeThinV) {
         svd.m_matrixV.setIdentity(matrix.cols(), matrix.rows());
         m_qr.householderQ().applyThisOnTheLeft(svd.m_matrixV, m_workspace);
       }
-      if(svd.computeU()) svd.m_matrixU.setIdentity(matrix.rows(), matrix.rows());
+      if (svd.computeU()) svd.m_matrixU.setIdentity(matrix.rows(), matrix.rows());
       return true;
-    }
-    else return false;
+    } else
+      return false;
   }
 
-private:
+ private:
   typedef HouseholderQR<TransposeTypeWithSameStorageOrder> QRType;
   QRType m_qr;
-  TransposeTypeWithSameStorageOrder m_adjoint;
-  typename internal::plain_row_type<MatrixType>::type m_workspace;
+  WorkspaceType m_workspace;
 };
 
 /*** 2x2 SVD implementation
@@ -354,623 +359,488 @@ class qr_preconditioner_impl<MatrixType, HouseholderQRPreconditioner, Preconditi
  *** JacobiSVD consists in performing a series of 2x2 SVD subproblems
  ***/
 
-template<typename MatrixType, int QRPreconditioner>
-struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, false>
-{
-  typedef JacobiSVD<MatrixType, QRPreconditioner> SVD;
-  typedef typename SVD::Index Index;
-  static void run(typename SVD::WorkMatrixType&, SVD&, Index, Index) {}
+template <typename MatrixType, int Options>
+struct svd_precondition_2x2_block_to_be_real<MatrixType, Options, false> {
+  typedef JacobiSVD<MatrixType, Options> SVD;
+  typedef typename MatrixType::RealScalar RealScalar;
+  static bool run(typename SVD::WorkMatrixType&, SVD&, Index, Index, RealScalar&) { return true; }
 };
 
-template<typename MatrixType, int QRPreconditioner>
-struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
-{
-  typedef JacobiSVD<MatrixType, QRPreconditioner> SVD;
+template <typename MatrixType, int Options>
+struct svd_precondition_2x2_block_to_be_real<MatrixType, Options, true> {
+  typedef JacobiSVD<MatrixType, Options> SVD;
   typedef typename MatrixType::Scalar Scalar;
   typedef typename MatrixType::RealScalar RealScalar;
-  typedef typename SVD::Index Index;
-  static void run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q)
-  {
+  static bool run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q, RealScalar& maxDiagEntry) {
+    using std::abs;
     using std::sqrt;
     Scalar z;
     JacobiRotation<Scalar> rot;
-    RealScalar n = sqrt(numext::abs2(work_matrix.coeff(p,p)) + numext::abs2(work_matrix.coeff(q,p)));
-    
-    if(n==0)
-    {
-      z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
-      work_matrix.row(p) *= z;
-      if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z);
-      if(work_matrix.coeff(q,q)!=Scalar(0))
-      {
-        z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q);
+    RealScalar n = sqrt(numext::abs2(work_matrix.coeff(p, p)) + numext::abs2(work_matrix.coeff(q, p)));
+
+    const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+    const RealScalar precision = NumTraits<Scalar>::epsilon();
+
+    if (numext::is_exactly_zero(n)) {
+      // make sure first column is zero
+      work_matrix.coeffRef(p, p) = work_matrix.coeffRef(q, p) = Scalar(0);
+
+      if (abs(numext::imag(work_matrix.coeff(p, q))) > considerAsZero) {
+        // work_matrix.coeff(p,q) can be zero if work_matrix.coeff(q,p) is not zero but small enough to underflow when
+        // computing n
+        z = abs(work_matrix.coeff(p, q)) / work_matrix.coeff(p, q);
+        work_matrix.row(p) *= z;
+        if (svd.computeU()) svd.m_matrixU.col(p) *= conj(z);
+      }
+      if (abs(numext::imag(work_matrix.coeff(q, q))) > considerAsZero) {
+        z = abs(work_matrix.coeff(q, q)) / work_matrix.coeff(q, q);
         work_matrix.row(q) *= z;
-        if(svd.computeU()) svd.m_matrixU.col(q) *= conj(z);
+        if (svd.computeU()) svd.m_matrixU.col(q) *= conj(z);
       }
       // otherwise the second row is already zero, so we have nothing to do.
-    }
-    else
-    {
-      rot.c() = conj(work_matrix.coeff(p,p)) / n;
-      rot.s() = work_matrix.coeff(q,p) / n;
-      work_matrix.applyOnTheLeft(p,q,rot);
-      if(svd.computeU()) svd.m_matrixU.applyOnTheRight(p,q,rot.adjoint());
-      if(work_matrix.coeff(p,q) != Scalar(0))
-      {
-        Scalar z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
+    } else {
+      rot.c() = conj(work_matrix.coeff(p, p)) / n;
+      rot.s() = work_matrix.coeff(q, p) / n;
+      work_matrix.applyOnTheLeft(p, q, rot);
+      if (svd.computeU()) svd.m_matrixU.applyOnTheRight(p, q, rot.adjoint());
+      if (abs(numext::imag(work_matrix.coeff(p, q))) > considerAsZero) {
+        z = abs(work_matrix.coeff(p, q)) / work_matrix.coeff(p, q);
         work_matrix.col(q) *= z;
-        if(svd.computeV()) svd.m_matrixV.col(q) *= z;
+        if (svd.computeV()) svd.m_matrixV.col(q) *= z;
       }
-      if(work_matrix.coeff(q,q) != Scalar(0))
-      {
-        z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q);
+      if (abs(numext::imag(work_matrix.coeff(q, q))) > considerAsZero) {
+        z = abs(work_matrix.coeff(q, q)) / work_matrix.coeff(q, q);
         work_matrix.row(q) *= z;
-        if(svd.computeU()) svd.m_matrixU.col(q) *= conj(z);
+        if (svd.computeU()) svd.m_matrixU.col(q) *= conj(z);
       }
     }
+
+    // update largest diagonal entry
+    maxDiagEntry = numext::maxi<RealScalar>(
+        maxDiagEntry, numext::maxi<RealScalar>(abs(work_matrix.coeff(p, p)), abs(work_matrix.coeff(q, q))));
+    // and check whether the 2x2 block is already diagonal
+    RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+    return abs(work_matrix.coeff(p, q)) > threshold || abs(work_matrix.coeff(q, p)) > threshold;
   }
 };
 
-template<typename MatrixType, typename RealScalar, typename Index>
-void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
-                            JacobiRotation<RealScalar> *j_left,
-                            JacobiRotation<RealScalar> *j_right)
-{
-  using std::sqrt;
-  using std::abs;
-  Matrix<RealScalar,2,2> m;
-  m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)),
-       numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q));
-  JacobiRotation<RealScalar> rot1;
-  RealScalar t = m.coeff(0,0) + m.coeff(1,1);
-  RealScalar d = m.coeff(1,0) - m.coeff(0,1);
-  if(t == RealScalar(0))
-  {
-    rot1.c() = RealScalar(0);
-    rot1.s() = d > RealScalar(0) ? RealScalar(1) : RealScalar(-1);
-  }
-  else
-  {
-    RealScalar t2d2 = numext::hypot(t,d);
-    rot1.c() = abs(t)/t2d2;
-    rot1.s() = d/t2d2;
-    if(t<RealScalar(0))
-      rot1.s() = -rot1.s();
-  }
-  m.applyOnTheLeft(0,1,rot1);
-  j_right->makeJacobi(m,0,1);
-  *j_left  = rot1 * j_right->transpose();
-}
+template <typename MatrixType_, int Options>
+struct traits<JacobiSVD<MatrixType_, Options> > : svd_traits<MatrixType_, Options> {
+  typedef MatrixType_ MatrixType;
+};
 
-} // end namespace internal
+}  // end namespace internal
 
 /** \ingroup SVD_Module
-  *
-  *
-  * \class JacobiSVD
-  *
-  * \brief Two-sided Jacobi SVD decomposition of a rectangular matrix
-  *
-  * \param MatrixType the type of the matrix of which we are computing the SVD decomposition
-  * \param QRPreconditioner this optional parameter allows to specify the type of QR decomposition that will be used internally
-  *                        for the R-SVD step for non-square matrices. See discussion of possible values below.
-  *
-  * SVD decomposition consists in decomposing any n-by-p matrix \a A as a product
-  *   \f[ A = U S V^* \f]
-  * where \a U is a n-by-n unitary, \a V is a p-by-p unitary, and \a S is a n-by-p real positive matrix which is zero outside of its main diagonal;
-  * the diagonal entries of S are known as the \em singular \em values of \a A and the columns of \a U and \a V are known as the left
-  * and right \em singular \em vectors of \a A respectively.
-  *
-  * Singular values are always sorted in decreasing order.
-  *
-  * This JacobiSVD decomposition computes only the singular values by default. If you want \a U or \a V, you need to ask for them explicitly.
-  *
-  * You can ask for only \em thin \a U or \a V to be computed, meaning the following. In case of a rectangular n-by-p matrix, letting \a m be the
-  * smaller value among \a n and \a p, there are only \a m singular vectors; the remaining columns of \a U and \a V do not correspond to actual
-  * singular vectors. Asking for \em thin \a U or \a V means asking for only their \a m first columns to be formed. So \a U is then a n-by-m matrix,
-  * and \a V is then a p-by-m matrix. Notice that thin \a U and \a V are all you need for (least squares) solving.
-  *
-  * Here's an example demonstrating basic usage:
-  * \include JacobiSVD_basic.cpp
-  * Output: \verbinclude JacobiSVD_basic.out
-  *
-  * This JacobiSVD class is a two-sided Jacobi R-SVD decomposition, ensuring optimal reliability and accuracy. The downside is that it's slower than
-  * bidiagonalizing SVD algorithms for large square matrices; however its complexity is still \f$ O(n^2p) \f$ where \a n is the smaller dimension and
-  * \a p is the greater dimension, meaning that it is still of the same order of complexity as the faster bidiagonalizing R-SVD algorithms.
-  * In particular, like any R-SVD, it takes advantage of non-squareness in that its complexity is only linear in the greater dimension.
-  *
-  * If the input matrix has inf or nan coefficients, the result of the computation is undefined, but the computation is guaranteed to
-  * terminate in finite (and reasonable) time.
-  *
-  * The possible values for QRPreconditioner are:
-  * \li ColPivHouseholderQRPreconditioner is the default. In practice it's very safe. It uses column-pivoting QR.
-  * \li FullPivHouseholderQRPreconditioner, is the safest and slowest. It uses full-pivoting QR.
-  *     Contrary to other QRs, it doesn't allow computing thin unitaries.
-  * \li HouseholderQRPreconditioner is the fastest, and less safe and accurate than the pivoting variants. It uses non-pivoting QR.
-  *     This is very similar in safety and accuracy to the bidiagonalization process used by bidiagonalizing SVD algorithms (since bidiagonalization
-  *     is inherently non-pivoting). However the resulting SVD is still more reliable than bidiagonalizing SVDs because the Jacobi-based iterarive
-  *     process is more reliable than the optimized bidiagonal SVD iterations.
-  * \li NoQRPreconditioner allows not to use a QR preconditioner at all. This is useful if you know that you will only be computing
-  *     JacobiSVD decompositions of square matrices. Non-square matrices require a QR preconditioner. Using this option will result in
-  *     faster compilation and smaller executable code. It won't significantly speed up computation, since JacobiSVD is always checking
-  *     if QR preconditioning is needed before applying it anyway.
-  *
-  * \sa MatrixBase::jacobiSvd()
-  */
-template<typename _MatrixType, int QRPreconditioner> class JacobiSVD
-{
-  public:
-
-    typedef _MatrixType MatrixType;
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
-    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      DiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime,ColsAtCompileTime),
-      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-      MaxDiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(MaxRowsAtCompileTime,MaxColsAtCompileTime),
-      MatrixOptions = MatrixType::Options
-    };
-
-    typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime,
-                   MatrixOptions, MaxRowsAtCompileTime, MaxRowsAtCompileTime>
-            MatrixUType;
-    typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime,
-                   MatrixOptions, MaxColsAtCompileTime, MaxColsAtCompileTime>
-            MatrixVType;
-    typedef typename internal::plain_diag_type<MatrixType, RealScalar>::type SingularValuesType;
-    typedef typename internal::plain_row_type<MatrixType>::type RowType;
-    typedef typename internal::plain_col_type<MatrixType>::type ColType;
-    typedef Matrix<Scalar, DiagSizeAtCompileTime, DiagSizeAtCompileTime,
-                   MatrixOptions, MaxDiagSizeAtCompileTime, MaxDiagSizeAtCompileTime>
-            WorkMatrixType;
-
-    /** \brief Default Constructor.
-      *
-      * The default constructor is useful in cases in which the user intends to
-      * perform decompositions via JacobiSVD::compute(const MatrixType&).
-      */
-    JacobiSVD()
-      : m_isInitialized(false),
-        m_isAllocated(false),
-        m_usePrescribedThreshold(false),
-        m_computationOptions(0),
-        m_rows(-1), m_cols(-1), m_diagSize(0)
-    {}
-
-
-    /** \brief Default Constructor with memory preallocation
-      *
-      * Like the default constructor but with preallocation of the internal data
-      * according to the specified problem size.
-      * \sa JacobiSVD()
-      */
-    JacobiSVD(Index rows, Index cols, unsigned int computationOptions = 0)
-      : m_isInitialized(false),
-        m_isAllocated(false),
-        m_usePrescribedThreshold(false),
-        m_computationOptions(0),
-        m_rows(-1), m_cols(-1)
-    {
-      allocate(rows, cols, computationOptions);
-    }
-
-    /** \brief Constructor performing the decomposition of given matrix.
-     *
-     * \param matrix the matrix to decompose
-     * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
-     *                           By default, none is computed. This is a bit-field, the possible bits are #ComputeFullU, #ComputeThinU,
-     *                           #ComputeFullV, #ComputeThinV.
-     *
-     * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
-     * available with the (non-default) FullPivHouseholderQR preconditioner.
-     */
-    JacobiSVD(const MatrixType& matrix, unsigned int computationOptions = 0)
-      : m_isInitialized(false),
-        m_isAllocated(false),
-        m_usePrescribedThreshold(false),
-        m_computationOptions(0),
-        m_rows(-1), m_cols(-1)
-    {
-      compute(matrix, computationOptions);
-    }
-
-    /** \brief Method performing the decomposition of given matrix using custom options.
-     *
-     * \param matrix the matrix to decompose
-     * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
-     *                           By default, none is computed. This is a bit-field, the possible bits are #ComputeFullU, #ComputeThinU,
-     *                           #ComputeFullV, #ComputeThinV.
-     *
-     * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
-     * available with the (non-default) FullPivHouseholderQR preconditioner.
-     */
-    JacobiSVD& compute(const MatrixType& matrix, unsigned int computationOptions);
-
-    /** \brief Method performing the decomposition of given matrix using current options.
-     *
-     * \param matrix the matrix to decompose
-     *
-     * This method uses the current \a computationOptions, as already passed to the constructor or to compute(const MatrixType&, unsigned int).
-     */
-    JacobiSVD& compute(const MatrixType& matrix)
-    {
-      return compute(matrix, m_computationOptions);
-    }
-
-    /** \returns the \a U matrix.
-     *
-     * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p,
-     * the U matrix is n-by-n if you asked for #ComputeFullU, and is n-by-m if you asked for #ComputeThinU.
-     *
-     * The \a m first columns of \a U are the left singular vectors of the matrix being decomposed.
-     *
-     * This method asserts that you asked for \a U to be computed.
-     */
-    const MatrixUType& matrixU() const
-    {
-      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
-      eigen_assert(computeU() && "This JacobiSVD decomposition didn't compute U. Did you ask for it?");
-      return m_matrixU;
-    }
+ *
+ *
+ * \class JacobiSVD
+ *
+ * \brief Two-sided Jacobi SVD decomposition of a rectangular matrix
+ *
+ * \tparam MatrixType_ the type of the matrix of which we are computing the SVD decomposition
+ * \tparam Options this optional parameter allows one to specify the type of QR decomposition that will be used
+ * internally for the R-SVD step for non-square matrices. Additionally, it allows one to specify whether to compute thin
+ * or full unitaries \a U and \a V. See discussion of possible values below.
+ *
+ * SVD decomposition consists in decomposing any n-by-p matrix \a A as a product
+ *   \f[ A = U S V^* \f]
+ * where \a U is a n-by-n unitary, \a V is a p-by-p unitary, and \a S is a n-by-p real positive matrix which is zero
+ * outside of its main diagonal; the diagonal entries of S are known as the \em singular \em values of \a A and the
+ * columns of \a U and \a V are known as the left and right \em singular \em vectors of \a A respectively.
+ *
+ * Singular values are always sorted in decreasing order.
+ *
+ * This JacobiSVD decomposition computes only the singular values by default. If you want \a U or \a V, you need to ask
+ * for them explicitly.
+ *
+ * You can ask for only \em thin \a U or \a V to be computed, meaning the following. In case of a rectangular n-by-p
+ * matrix, letting \a m be the smaller value among \a n and \a p, there are only \a m singular vectors; the remaining
+ * columns of \a U and \a V do not correspond to actual singular vectors. Asking for \em thin \a U or \a V means asking
+ * for only their \a m first columns to be formed. So \a U is then a n-by-m matrix, and \a V is then a p-by-m matrix.
+ * Notice that thin \a U and \a V are all you need for (least squares) solving.
+ *
+ * Here's an example demonstrating basic usage:
+ * \include JacobiSVD_basic.cpp
+ * Output: \verbinclude JacobiSVD_basic.out
+ *
+ * This JacobiSVD class is a two-sided Jacobi R-SVD decomposition, ensuring optimal reliability and accuracy. The
+ * downside is that it's slower than bidiagonalizing SVD algorithms for large square matrices; however its complexity is
+ * still \f$ O(n^2p) \f$ where \a n is the smaller dimension and \a p is the greater dimension, meaning that it is still
+ * of the same order of complexity as the faster bidiagonalizing R-SVD algorithms. In particular, like any R-SVD, it
+ * takes advantage of non-squareness in that its complexity is only linear in the greater dimension.
+ *
+ * If the input matrix has inf or nan coefficients, the result of the computation is undefined, but the computation is
+ * guaranteed to terminate in finite (and reasonable) time.
+ *
+ * The possible QR preconditioners that can be set with Options template parameter are:
+ * \li ColPivHouseholderQRPreconditioner is the default. In practice it's very safe. It uses column-pivoting QR.
+ * \li FullPivHouseholderQRPreconditioner, is the safest and slowest. It uses full-pivoting QR.
+ *     Contrary to other QRs, it doesn't allow computing thin unitaries.
+ * \li HouseholderQRPreconditioner is the fastest, and less safe and accurate than the pivoting variants. It uses
+ * non-pivoting QR. This is very similar in safety and accuracy to the bidiagonalization process used by bidiagonalizing
+ * SVD algorithms (since bidiagonalization is inherently non-pivoting). However the resulting SVD is still more reliable
+ * than bidiagonalizing SVDs because the Jacobi-based iterarive process is more reliable than the optimized bidiagonal
+ * SVD iterations. \li NoQRPreconditioner allows not to use a QR preconditioner at all. This is useful if you know that
+ * you will only be computing JacobiSVD decompositions of square matrices. Non-square matrices require a QR
+ * preconditioner. Using this option will result in faster compilation and smaller executable code. It won't
+ * significantly speed up computation, since JacobiSVD is always checking if QR preconditioning is needed before
+ * applying it anyway.
+ *
+ * One may also use the Options template parameter to specify how the unitaries should be computed. The options are
+ * #ComputeThinU, #ComputeThinV, #ComputeFullU, #ComputeFullV. It is not possible to request both the thin and full
+ * versions of a unitary. By default, unitaries will not be computed.
+ *
+ * You can set the QRPreconditioner and unitary options together: JacobiSVD<MatrixType,
+ * ColPivHouseholderQRPreconditioner | ComputeThinU | ComputeFullV>
+ *
+ * \sa MatrixBase::jacobiSvd()
+ */
+template <typename MatrixType_, int Options_>
+class JacobiSVD : public SVDBase<JacobiSVD<MatrixType_, Options_> > {
+  typedef SVDBase<JacobiSVD> Base;
+
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::RealScalar RealScalar;
+  enum : int {
+    Options = Options_,
+    QRPreconditioner = internal::get_qr_preconditioner(Options),
+    RowsAtCompileTime = Base::RowsAtCompileTime,
+    ColsAtCompileTime = Base::ColsAtCompileTime,
+    DiagSizeAtCompileTime = Base::DiagSizeAtCompileTime,
+    MaxRowsAtCompileTime = Base::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = Base::MaxColsAtCompileTime,
+    MaxDiagSizeAtCompileTime = Base::MaxDiagSizeAtCompileTime,
+    MatrixOptions = Base::MatrixOptions
+  };
 
-    /** \returns the \a V matrix.
-     *
-     * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p,
-     * the V matrix is p-by-p if you asked for #ComputeFullV, and is p-by-m if you asked for ComputeThinV.
-     *
-     * The \a m first columns of \a V are the right singular vectors of the matrix being decomposed.
-     *
-     * This method asserts that you asked for \a V to be computed.
-     */
-    const MatrixVType& matrixV() const
-    {
-      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
-      eigen_assert(computeV() && "This JacobiSVD decomposition didn't compute V. Did you ask for it?");
-      return m_matrixV;
-    }
+  typedef typename Base::MatrixUType MatrixUType;
+  typedef typename Base::MatrixVType MatrixVType;
+  typedef typename Base::SingularValuesType SingularValuesType;
+  typedef Matrix<Scalar, DiagSizeAtCompileTime, DiagSizeAtCompileTime, MatrixOptions, MaxDiagSizeAtCompileTime,
+                 MaxDiagSizeAtCompileTime>
+      WorkMatrixType;
+
+  /** \brief Default Constructor.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via JacobiSVD::compute(const MatrixType&).
+   */
+  JacobiSVD() {}
+
+  /** \brief Default Constructor with memory preallocation
+   *
+   * Like the default constructor but with preallocation of the internal data
+   * according to the specified problem size and \a Options template parameter.
+   *
+   * \sa JacobiSVD()
+   */
+  JacobiSVD(Index rows, Index cols) { allocate(rows, cols, internal::get_computation_options(Options)); }
+
+  /** \brief Default Constructor with memory preallocation
+   *
+   * Like the default constructor but with preallocation of the internal data
+   * according to the specified problem size.
+   *
+   * One \b cannot request unitaries using both the \a Options template parameter
+   * and the constructor. If possible, prefer using the \a Options template parameter.
+   *
+   * \param rows number of rows for the input matrix
+   * \param cols number of columns for the input matrix
+   * \param computationOptions specify whether to compute Thin/Full unitaries U/V
+   * \sa JacobiSVD()
+   *
+   * \deprecated Will be removed in the next major Eigen version. Options should
+   * be specified in the \a Options template parameter.
+   */
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using the class template parameter.")
+  JacobiSVD(Index rows, Index cols, unsigned int computationOptions) {
+    internal::check_svd_options_assertions<MatrixType, Options>(computationOptions, rows, cols);
+    allocate(rows, cols, computationOptions);
+  }
 
-    /** \returns the vector of singular values.
-     *
-     * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p, the
-     * returned vector has size \a m.  Singular values are always sorted in decreasing order.
-     */
-    const SingularValuesType& singularValues() const
-    {
-      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
-      return m_singularValues;
-    }
+  /** \brief Constructor performing the decomposition of given matrix, using the custom options specified
+   *         with the \a Options template parameter.
+   *
+   * \param matrix the matrix to decompose
+   */
+  template <typename Derived>
+  explicit JacobiSVD(const MatrixBase<Derived>& matrix) {
+    compute_impl(matrix, internal::get_computation_options(Options));
+  }
 
-    /** \returns true if \a U (full or thin) is asked for in this SVD decomposition */
-    inline bool computeU() const { return m_computeFullU || m_computeThinU; }
-    /** \returns true if \a V (full or thin) is asked for in this SVD decomposition */
-    inline bool computeV() const { return m_computeFullV || m_computeThinV; }
-
-    /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A.
-      *
-      * \param b the right-hand-side of the equation to solve.
-      *
-      * \note Solving requires both U and V to be computed. Thin U and V are enough, there is no need for full U or V.
-      *
-      * \note SVD solving is implicitly least-squares. Thus, this method serves both purposes of exact solving and least-squares solving.
-      * In other words, the returned solution is guaranteed to minimize the Euclidean norm \f$ \Vert A x - b \Vert \f$.
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<JacobiSVD, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
-      eigen_assert(computeU() && computeV() && "JacobiSVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice).");
-      return internal::solve_retval<JacobiSVD, Rhs>(*this, b.derived());
-    }
+  template <typename Derived>
+  explicit JacobiSVD(const TriangularBase<Derived>& matrix) {
+    compute_impl(matrix, internal::get_computation_options(Options));
+  }
 
-    /** \returns the number of singular values that are not exactly 0 */
-    Index nonzeroSingularValues() const
-    {
-      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
-      return m_nonzeroSingularValues;
-    }
-    
-    /** \returns the rank of the matrix of which \c *this is the SVD.
-      *
-      * \note This method has to determine which singular values should be considered nonzero.
-      *       For that, it uses the threshold value that you can control by calling
-      *       setThreshold(const RealScalar&).
-      */
-    inline Index rank() const
-    {
-      using std::abs;
-      eigen_assert(m_isInitialized && "JacobiSVD is not initialized.");
-      if(m_singularValues.size()==0) return 0;
-      RealScalar premultiplied_threshold = m_singularValues.coeff(0) * threshold();
-      Index i = m_nonzeroSingularValues-1;
-      while(i>=0 && m_singularValues.coeff(i) < premultiplied_threshold) --i;
-      return i+1;
-    }
-    
-    /** Allows to prescribe a threshold to be used by certain methods, such as rank() and solve(),
-      * which need to determine when singular values are to be considered nonzero.
-      * This is not used for the SVD decomposition itself.
-      *
-      * When it needs to get the threshold value, Eigen calls threshold().
-      * The default is \c NumTraits<Scalar>::epsilon()
-      *
-      * \param threshold The new value to use as the threshold.
-      *
-      * A singular value will be considered nonzero if its value is strictly greater than
-      *  \f$ \vert singular value \vert \leqslant threshold \times \vert max singular value \vert \f$.
-      *
-      * If you want to come back to the default behavior, call setThreshold(Default_t)
-      */
-    JacobiSVD& setThreshold(const RealScalar& threshold)
-    {
-      m_usePrescribedThreshold = true;
-      m_prescribedThreshold = threshold;
-      return *this;
-    }
+  /** \brief Constructor performing the decomposition of given matrix using specified options
+   *         for computing unitaries.
+   *
+   *  One \b cannot request unitiaries using both the \a Options template parameter
+   *  and the constructor. If possible, prefer using the \a Options template parameter.
+   *
+   * \param matrix the matrix to decompose
+   * \param computationOptions specify whether to compute Thin/Full unitaries U/V
+   *
+   * \deprecated Will be removed in the next major Eigen version. Options should
+   * be specified in the \a Options template parameter.
+   */
+  // EIGEN_DEPRECATED // TODO(cantonios): re-enable after fixing a few 3p libraries that error on deprecation warnings.
+  template <typename Derived>
+  JacobiSVD(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
+    internal::check_svd_options_assertions<MatrixBase<Derived>, Options>(computationOptions, matrix.rows(),
+                                                                         matrix.cols());
+    compute_impl(matrix, computationOptions);
+  }
 
-    /** Allows to come back to the default behavior, letting Eigen use its default formula for
-      * determining the threshold.
-      *
-      * You should pass the special object Eigen::Default as parameter here.
-      * \code svd.setThreshold(Eigen::Default); \endcode
-      *
-      * See the documentation of setThreshold(const RealScalar&).
-      */
-    JacobiSVD& setThreshold(Default_t)
-    {
-      m_usePrescribedThreshold = false;
-      return *this;
-    }
+  /** \brief Method performing the decomposition of given matrix. Computes Thin/Full unitaries U/V if specified
+   *         using the \a Options template parameter or the class constructor.
+   *
+   * \param matrix the matrix to decompose
+   */
+  template <typename Derived>
+  JacobiSVD& compute(const MatrixBase<Derived>& matrix) {
+    return compute_impl(matrix, m_computationOptions);
+  }
 
-    /** Returns the threshold that will be used by certain methods such as rank().
-      *
-      * See the documentation of setThreshold(const RealScalar&).
-      */
-    RealScalar threshold() const
-    {
-      eigen_assert(m_isInitialized || m_usePrescribedThreshold);
-      return m_usePrescribedThreshold ? m_prescribedThreshold
-                                      : (std::max<Index>)(1,m_diagSize)*NumTraits<Scalar>::epsilon();
-    }
+  template <typename Derived>
+  JacobiSVD& compute(const TriangularBase<Derived>& matrix) {
+    return compute_impl(matrix, m_computationOptions);
+  }
 
-    inline Index rows() const { return m_rows; }
-    inline Index cols() const { return m_cols; }
+  /** \brief Method performing the decomposition of given matrix, as specified by
+   *         the `computationOptions` parameter.
+   *
+   * \param matrix the matrix to decompose
+   * \param computationOptions specify whether to compute Thin/Full unitaries U/V
+   *
+   * \deprecated Will be removed in the next major Eigen version. Options should
+   * be specified in the \a Options template parameter.
+   */
+  template <typename Derived>
+  EIGEN_DEPRECATED_WITH_REASON("Options should be specified using the class template parameter.")
+  JacobiSVD& compute(const MatrixBase<Derived>& matrix, unsigned int computationOptions) {
+    internal::check_svd_options_assertions<MatrixBase<Derived>, Options>(m_computationOptions, matrix.rows(),
+                                                                         matrix.cols());
+    return compute_impl(matrix, computationOptions);
+  }
 
-  private:
-    void allocate(Index rows, Index cols, unsigned int computationOptions);
-    
-    static void check_template_parameters()
-    {
-      EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar);
-    }
+  using Base::cols;
+  using Base::computeU;
+  using Base::computeV;
+  using Base::diagSize;
+  using Base::rank;
+  using Base::rows;
+
+  void allocate(Index rows_, Index cols_, unsigned int computationOptions) {
+    if (Base::allocate(rows_, cols_, computationOptions)) return;
+    eigen_assert(!(ShouldComputeThinU && int(QRPreconditioner) == int(FullPivHouseholderQRPreconditioner)) &&
+                 !(ShouldComputeThinU && int(QRPreconditioner) == int(FullPivHouseholderQRPreconditioner)) &&
+                 "JacobiSVD: can't compute thin U or thin V with the FullPivHouseholderQR preconditioner. "
+                 "Use the ColPivHouseholderQR preconditioner instead.");
+
+    m_workMatrix.resize(diagSize(), diagSize());
+    if (cols() > rows()) m_qr_precond_morecols.allocate(*this);
+    if (rows() > cols()) m_qr_precond_morerows.allocate(*this);
+  }
 
-  protected:
-    MatrixUType m_matrixU;
-    MatrixVType m_matrixV;
-    SingularValuesType m_singularValues;
-    WorkMatrixType m_workMatrix;
-    bool m_isInitialized, m_isAllocated, m_usePrescribedThreshold;
-    bool m_computeFullU, m_computeThinU;
-    bool m_computeFullV, m_computeThinV;
-    unsigned int m_computationOptions;
-    Index m_nonzeroSingularValues, m_rows, m_cols, m_diagSize;
-    RealScalar m_prescribedThreshold;
-
-    template<typename __MatrixType, int _QRPreconditioner, bool _IsComplex>
-    friend struct internal::svd_precondition_2x2_block_to_be_real;
-    template<typename __MatrixType, int _QRPreconditioner, int _Case, bool _DoAnything>
-    friend struct internal::qr_preconditioner_impl;
-
-    internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreColsThanRows> m_qr_precond_morecols;
-    internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreRowsThanCols> m_qr_precond_morerows;
-    MatrixType m_scaledMatrix;
+ private:
+  template <typename Derived>
+  JacobiSVD& compute_impl(const TriangularBase<Derived>& matrix, unsigned int computationOptions);
+  template <typename Derived>
+  JacobiSVD& compute_impl(const MatrixBase<Derived>& matrix, unsigned int computationOptions);
+
+ protected:
+  using Base::m_computationOptions;
+  using Base::m_computeFullU;
+  using Base::m_computeFullV;
+  using Base::m_computeThinU;
+  using Base::m_computeThinV;
+  using Base::m_info;
+  using Base::m_isAllocated;
+  using Base::m_isInitialized;
+  using Base::m_matrixU;
+  using Base::m_matrixV;
+  using Base::m_nonzeroSingularValues;
+  using Base::m_prescribedThreshold;
+  using Base::m_singularValues;
+  using Base::m_usePrescribedThreshold;
+  using Base::ShouldComputeThinU;
+  using Base::ShouldComputeThinV;
+
+  EIGEN_STATIC_ASSERT(!(ShouldComputeThinU && int(QRPreconditioner) == int(FullPivHouseholderQRPreconditioner)) &&
+                          !(ShouldComputeThinU && int(QRPreconditioner) == int(FullPivHouseholderQRPreconditioner)),
+                      "JacobiSVD: can't compute thin U or thin V with the FullPivHouseholderQR preconditioner. "
+                      "Use the ColPivHouseholderQR preconditioner instead.")
+
+  template <typename MatrixType__, int Options__, bool IsComplex_>
+  friend struct internal::svd_precondition_2x2_block_to_be_real;
+  template <typename MatrixType__, int Options__, int QRPreconditioner_, int Case_, bool DoAnything_>
+  friend struct internal::qr_preconditioner_impl;
+
+  internal::qr_preconditioner_impl<MatrixType, Options, QRPreconditioner, internal::PreconditionIfMoreColsThanRows>
+      m_qr_precond_morecols;
+  internal::qr_preconditioner_impl<MatrixType, Options, QRPreconditioner, internal::PreconditionIfMoreRowsThanCols>
+      m_qr_precond_morerows;
+  WorkMatrixType m_workMatrix;
 };
 
-template<typename MatrixType, int QRPreconditioner>
-void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Index rows, Index cols, unsigned int computationOptions)
-{
-  eigen_assert(rows >= 0 && cols >= 0);
-
-  if (m_isAllocated &&
-      rows == m_rows &&
-      cols == m_cols &&
-      computationOptions == m_computationOptions)
-  {
-    return;
-  }
-
-  m_rows = rows;
-  m_cols = cols;
-  m_isInitialized = false;
-  m_isAllocated = true;
-  m_computationOptions = computationOptions;
-  m_computeFullU = (computationOptions & ComputeFullU) != 0;
-  m_computeThinU = (computationOptions & ComputeThinU) != 0;
-  m_computeFullV = (computationOptions & ComputeFullV) != 0;
-  m_computeThinV = (computationOptions & ComputeThinV) != 0;
-  eigen_assert(!(m_computeFullU && m_computeThinU) && "JacobiSVD: you can't ask for both full and thin U");
-  eigen_assert(!(m_computeFullV && m_computeThinV) && "JacobiSVD: you can't ask for both full and thin V");
-  eigen_assert(EIGEN_IMPLIES(m_computeThinU || m_computeThinV, MatrixType::ColsAtCompileTime==Dynamic) &&
-              "JacobiSVD: thin U and V are only available when your matrix has a dynamic number of columns.");
-  if (QRPreconditioner == FullPivHouseholderQRPreconditioner)
-  {
-      eigen_assert(!(m_computeThinU || m_computeThinV) &&
-              "JacobiSVD: can't compute thin U or thin V with the FullPivHouseholderQR preconditioner. "
-              "Use the ColPivHouseholderQR preconditioner instead.");
-  }
-  m_diagSize = (std::min)(m_rows, m_cols);
-  m_singularValues.resize(m_diagSize);
-  if(RowsAtCompileTime==Dynamic)
-    m_matrixU.resize(m_rows, m_computeFullU ? m_rows
-                            : m_computeThinU ? m_diagSize
-                            : 0);
-  if(ColsAtCompileTime==Dynamic)
-    m_matrixV.resize(m_cols, m_computeFullV ? m_cols
-                            : m_computeThinV ? m_diagSize
-                            : 0);
-  m_workMatrix.resize(m_diagSize, m_diagSize);
-  
-  if(m_cols>m_rows)   m_qr_precond_morecols.allocate(*this);
-  if(m_rows>m_cols)   m_qr_precond_morerows.allocate(*this);
-  if(m_cols!=m_cols)  m_scaledMatrix.resize(rows,cols);
+template <typename MatrixType, int Options>
+template <typename Derived>
+JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(const TriangularBase<Derived>& matrix,
+                                                                             unsigned int computationOptions) {
+  return compute_impl(matrix.toDenseMatrix(), computationOptions);
 }
 
-template<typename MatrixType, int QRPreconditioner>
-JacobiSVD<MatrixType, QRPreconditioner>&
-JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsigned int computationOptions)
-{
-  check_template_parameters();
-  
+template <typename MatrixType, int Options>
+template <typename Derived>
+JacobiSVD<MatrixType, Options>& JacobiSVD<MatrixType, Options>::compute_impl(const MatrixBase<Derived>& matrix,
+                                                                             unsigned int computationOptions) {
+  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Derived, MatrixType);
+  EIGEN_STATIC_ASSERT((std::is_same<typename Derived::Scalar, typename MatrixType::Scalar>::value),
+                      Input matrix must have the same Scalar type as the BDCSVD object.);
+
   using std::abs;
+
   allocate(matrix.rows(), matrix.cols(), computationOptions);
 
-  // currently we stop when we reach precision 2*epsilon as the last bit of precision can require an unreasonable number of iterations,
-  // only worsening the precision of U and V as we accumulate more rotations
+  // currently we stop when we reach precision 2*epsilon as the last bit of precision can require an unreasonable number
+  // of iterations, only worsening the precision of U and V as we accumulate more rotations
   const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();
 
-  // limit for very small denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
-  const RealScalar considerAsZero = RealScalar(2) * std::numeric_limits<RealScalar>::denorm_min();
+  // limit for denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
 
   // Scaling factor to reduce over/under-flows
-  RealScalar scale = matrix.cwiseAbs().maxCoeff();
-  if(scale==RealScalar(0)) scale = RealScalar(1);
-  
+  RealScalar scale = matrix.cwiseAbs().template maxCoeff<PropagateNaN>();
+  if (!(numext::isfinite)(scale)) {
+    m_isInitialized = true;
+    m_info = InvalidInput;
+    m_nonzeroSingularValues = 0;
+    return *this;
+  }
+  if (numext::is_exactly_zero(scale)) scale = RealScalar(1);
+
   /*** step 1. The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */
 
-  if(m_rows!=m_cols)
-  {
-    m_scaledMatrix = matrix / scale;
-    m_qr_precond_morecols.run(*this, m_scaledMatrix);
-    m_qr_precond_morerows.run(*this, m_scaledMatrix);
-  }
-  else
-  {
-    m_workMatrix = matrix.block(0,0,m_diagSize,m_diagSize) / scale;
-    if(m_computeFullU) m_matrixU.setIdentity(m_rows,m_rows);
-    if(m_computeThinU) m_matrixU.setIdentity(m_rows,m_diagSize);
-    if(m_computeFullV) m_matrixV.setIdentity(m_cols,m_cols);
-    if(m_computeThinV) m_matrixV.setIdentity(m_cols, m_diagSize);
+  if (rows() != cols()) {
+    m_qr_precond_morecols.run(*this, matrix / scale);
+    m_qr_precond_morerows.run(*this, matrix / scale);
+  } else {
+    m_workMatrix =
+        matrix.template topLeftCorner<DiagSizeAtCompileTime, DiagSizeAtCompileTime>(diagSize(), diagSize()) / scale;
+    if (m_computeFullU) m_matrixU.setIdentity(rows(), rows());
+    if (m_computeThinU) m_matrixU.setIdentity(rows(), diagSize());
+    if (m_computeFullV) m_matrixV.setIdentity(cols(), cols());
+    if (m_computeThinV) m_matrixV.setIdentity(cols(), diagSize());
   }
 
   /*** step 2. The main Jacobi SVD iteration. ***/
+  RealScalar maxDiagEntry = m_workMatrix.cwiseAbs().diagonal().maxCoeff();
 
   bool finished = false;
-  while(!finished)
-  {
+  while (!finished) {
     finished = true;
 
     // do a sweep: for all index pairs (p,q), perform SVD of the corresponding 2x2 sub-matrix
 
-    for(Index p = 1; p < m_diagSize; ++p)
-    {
-      for(Index q = 0; q < p; ++q)
-      {
+    for (Index p = 1; p < diagSize(); ++p) {
+      for (Index q = 0; q < p; ++q) {
         // if this 2x2 sub-matrix is not diagonal already...
         // notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't
         // keep us iterating forever. Similarly, small denormal numbers are considered zero.
-        using std::max;
-        RealScalar threshold = (max)(considerAsZero, precision * (max)(abs(m_workMatrix.coeff(p,p)),
-                                                                       abs(m_workMatrix.coeff(q,q))));
-        // We compare both values to threshold instead of calling max to be robust to NaN (See bug 791)
-        if(abs(m_workMatrix.coeff(p,q))>threshold || abs(m_workMatrix.coeff(q,p)) > threshold)
-        {
+        RealScalar threshold = numext::maxi<RealScalar>(considerAsZero, precision * maxDiagEntry);
+        if (abs(m_workMatrix.coeff(p, q)) > threshold || abs(m_workMatrix.coeff(q, p)) > threshold) {
           finished = false;
-
           // perform SVD decomposition of 2x2 sub-matrix corresponding to indices p,q to make it diagonal
-          internal::svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner>::run(m_workMatrix, *this, p, q);
-          JacobiRotation<RealScalar> j_left, j_right;
-          internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
-
-          // accumulate resulting Jacobi rotations
-          m_workMatrix.applyOnTheLeft(p,q,j_left);
-          if(computeU()) m_matrixU.applyOnTheRight(p,q,j_left.transpose());
-
-          m_workMatrix.applyOnTheRight(p,q,j_right);
-          if(computeV()) m_matrixV.applyOnTheRight(p,q,j_right);
+          // the complex to real operation returns true if the updated 2x2 block is not already diagonal
+          if (internal::svd_precondition_2x2_block_to_be_real<MatrixType, Options>::run(m_workMatrix, *this, p, q,
+                                                                                        maxDiagEntry)) {
+            JacobiRotation<RealScalar> j_left, j_right;
+            internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
+
+            // accumulate resulting Jacobi rotations
+            m_workMatrix.applyOnTheLeft(p, q, j_left);
+            if (computeU()) m_matrixU.applyOnTheRight(p, q, j_left.transpose());
+
+            m_workMatrix.applyOnTheRight(p, q, j_right);
+            if (computeV()) m_matrixV.applyOnTheRight(p, q, j_right);
+
+            // keep track of the largest diagonal coefficient
+            maxDiagEntry = numext::maxi<RealScalar>(
+                maxDiagEntry, numext::maxi<RealScalar>(abs(m_workMatrix.coeff(p, p)), abs(m_workMatrix.coeff(q, q))));
+          }
         }
       }
     }
   }
 
-  /*** step 3. The work matrix is now diagonal, so ensure it's positive so its diagonal entries are the singular values ***/
-
-  for(Index i = 0; i < m_diagSize; ++i)
-  {
-    RealScalar a = abs(m_workMatrix.coeff(i,i));
-    m_singularValues.coeffRef(i) = a;
-    if(computeU() && (a!=RealScalar(0))) m_matrixU.col(i) *= m_workMatrix.coeff(i,i)/a;
+  /*** step 3. The work matrix is now diagonal, so ensure it's positive so its diagonal entries are the singular values
+   * ***/
+
+  for (Index i = 0; i < diagSize(); ++i) {
+    // For a complex matrix, some diagonal coefficients might note have been
+    // treated by svd_precondition_2x2_block_to_be_real, and the imaginary part
+    // of some diagonal entry might not be null.
+    if (NumTraits<Scalar>::IsComplex && abs(numext::imag(m_workMatrix.coeff(i, i))) > considerAsZero) {
+      RealScalar a = abs(m_workMatrix.coeff(i, i));
+      m_singularValues.coeffRef(i) = abs(a);
+      if (computeU()) m_matrixU.col(i) *= m_workMatrix.coeff(i, i) / a;
+    } else {
+      // m_workMatrix.coeff(i,i) is already real, no difficulty:
+      RealScalar a = numext::real(m_workMatrix.coeff(i, i));
+      m_singularValues.coeffRef(i) = abs(a);
+      if (computeU() && (a < RealScalar(0))) m_matrixU.col(i) = -m_matrixU.col(i);
+    }
   }
 
+  m_singularValues *= scale;
+
   /*** step 4. Sort singular values in descending order and compute the number of nonzero singular values ***/
 
-  m_nonzeroSingularValues = m_diagSize;
-  for(Index i = 0; i < m_diagSize; i++)
-  {
+  m_nonzeroSingularValues = diagSize();
+  for (Index i = 0; i < diagSize(); i++) {
     Index pos;
-    RealScalar maxRemainingSingularValue = m_singularValues.tail(m_diagSize-i).maxCoeff(&pos);
-    if(maxRemainingSingularValue == RealScalar(0))
-    {
+    RealScalar maxRemainingSingularValue = m_singularValues.tail(diagSize() - i).maxCoeff(&pos);
+    if (numext::is_exactly_zero(maxRemainingSingularValue)) {
       m_nonzeroSingularValues = i;
       break;
     }
-    if(pos)
-    {
+    if (pos) {
       pos += i;
       std::swap(m_singularValues.coeffRef(i), m_singularValues.coeffRef(pos));
-      if(computeU()) m_matrixU.col(pos).swap(m_matrixU.col(i));
-      if(computeV()) m_matrixV.col(pos).swap(m_matrixV.col(i));
+      if (computeU()) m_matrixU.col(pos).swap(m_matrixU.col(i));
+      if (computeV()) m_matrixV.col(pos).swap(m_matrixV.col(i));
     }
   }
-  
-  m_singularValues *= scale;
 
   m_isInitialized = true;
   return *this;
 }
 
-namespace internal {
-template<typename _MatrixType, int QRPreconditioner, typename Rhs>
-struct solve_retval<JacobiSVD<_MatrixType, QRPreconditioner>, Rhs>
-  : solve_retval_base<JacobiSVD<_MatrixType, QRPreconditioner>, Rhs>
-{
-  typedef JacobiSVD<_MatrixType, QRPreconditioner> JacobiSVDType;
-  EIGEN_MAKE_SOLVE_HELPERS(JacobiSVDType,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    eigen_assert(rhs().rows() == dec().rows());
-
-    // A = U S V^*
-    // So A^{-1} = V S^{-1} U^*
-
-    Matrix<Scalar, Dynamic, Rhs::ColsAtCompileTime, 0, _MatrixType::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime> tmp;
-    Index rank = dec().rank();
-    
-    tmp.noalias() = dec().matrixU().leftCols(rank).adjoint() * rhs();
-    tmp = dec().singularValues().head(rank).asDiagonal().inverse() * tmp;
-    dst = dec().matrixV().leftCols(rank) * tmp;
-  }
-};
-} // end namespace internal
-
 /** \svd_module
-  *
-  * \return the singular value decomposition of \c *this computed by two-sided
-  * Jacobi transformations.
-  *
-  * \sa class JacobiSVD
-  */
-template<typename Derived>
-JacobiSVD<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::jacobiSvd(unsigned int computationOptions) const
-{
-  return JacobiSVD<PlainObject>(*this, computationOptions);
+ *
+ * \return the singular value decomposition of \c *this computed by two-sided
+ * Jacobi transformations.
+ *
+ * \sa class JacobiSVD
+ */
+template <typename Derived>
+template <int Options>
+JacobiSVD<typename MatrixBase<Derived>::PlainObject, Options> MatrixBase<Derived>::jacobiSvd() const {
+  return JacobiSVD<PlainObject, Options>(*this);
+}
+
+template <typename Derived>
+template <int Options>
+JacobiSVD<typename MatrixBase<Derived>::PlainObject, Options> MatrixBase<Derived>::jacobiSvd(
+    unsigned int computationOptions) const {
+  return JacobiSVD<PlainObject, Options>(*this, computationOptions);
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_JACOBISVD_H
+#endif  // EIGEN_JACOBISVD_H
diff --git a/inst/include/Eigen/src/SVD/JacobiSVD_LAPACKE.h b/inst/include/Eigen/src/SVD/JacobiSVD_LAPACKE.h
new file mode 100644
index 00000000..db263669
--- /dev/null
+++ b/inst/include/Eigen/src/SVD/JacobiSVD_LAPACKE.h
@@ -0,0 +1,127 @@
+/*
+ Copyright (c) 2011, Intel Corporation. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without modification,
+ are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of Intel Corporation nor the names of its contributors may
+   be used to endorse or promote products derived from this software without
+   specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ ********************************************************************************
+ *   Content : Eigen bindings to LAPACKe
+ *    Singular Value Decomposition - SVD.
+ ********************************************************************************
+*/
+
+#ifndef EIGEN_JACOBISVD_LAPACKE_H
+#define EIGEN_JACOBISVD_LAPACKE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \internal Specialization for the data types supported by LAPACKe */
+
+#define EIGEN_LAPACKE_SVD(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_PREFIX, EIGCOLROW, LAPACKE_COLROW, OPTIONS) \
+  template <>                                                                                                       \
+  template <typename Derived>                                                                                       \
+  inline JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>&                        \
+  JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, OPTIONS>::compute_impl(                 \
+      const MatrixBase<Derived>& matrix, unsigned int computationOptions) {                                         \
+    /*typedef MatrixType::Scalar Scalar;*/                                                                          \
+    /*typedef MatrixType::RealScalar RealScalar;*/                                                                  \
+    allocate(matrix.rows(), matrix.cols(), computationOptions);                                                     \
+                                                                                                                    \
+    /*const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();*/                                  \
+    m_nonzeroSingularValues = diagSize();                                                                           \
+                                                                                                                    \
+    lapack_int lda = internal::convert_index<lapack_int>(matrix.outerStride()), ldu, ldvt;                          \
+    lapack_int matrix_order = LAPACKE_COLROW;                                                                       \
+    char jobu, jobvt;                                                                                               \
+    LAPACKE_TYPE *u, *vt, dummy;                                                                                    \
+    jobu = (m_computeFullU) ? 'A' : (m_computeThinU) ? 'S' : 'N';                                                   \
+    jobvt = (m_computeFullV) ? 'A' : (m_computeThinV) ? 'S' : 'N';                                                  \
+    if (computeU()) {                                                                                               \
+      ldu = internal::convert_index<lapack_int>(m_matrixU.outerStride());                                           \
+      u = (LAPACKE_TYPE*)m_matrixU.data();                                                                          \
+    } else {                                                                                                        \
+      ldu = 1;                                                                                                      \
+      u = &dummy;                                                                                                   \
+    }                                                                                                               \
+    MatrixType localV;                                                                                              \
+    lapack_int vt_rows = (m_computeFullV)   ? internal::convert_index<lapack_int>(cols())                           \
+                         : (m_computeThinV) ? internal::convert_index<lapack_int>(diagSize())                       \
+                                            : 1;                                                                    \
+    if (computeV()) {                                                                                               \
+      localV.resize(vt_rows, cols());                                                                               \
+      ldvt = internal::convert_index<lapack_int>(localV.outerStride());                                             \
+      vt = (LAPACKE_TYPE*)localV.data();                                                                            \
+    } else {                                                                                                        \
+      ldvt = 1;                                                                                                     \
+      vt = &dummy;                                                                                                  \
+    }                                                                                                               \
+    Matrix<LAPACKE_RTYPE, Dynamic, Dynamic> superb;                                                                 \
+    superb.resize(diagSize(), 1);                                                                                   \
+    MatrixType m_temp;                                                                                              \
+    m_temp = matrix;                                                                                                \
+    lapack_int info = LAPACKE_##LAPACKE_PREFIX##gesvd(                                                              \
+        matrix_order, jobu, jobvt, internal::convert_index<lapack_int>(rows()),                                     \
+        internal::convert_index<lapack_int>(cols()), (LAPACKE_TYPE*)m_temp.data(), lda,                             \
+        (LAPACKE_RTYPE*)m_singularValues.data(), u, ldu, vt, ldvt, superb.data());                                  \
+    /* Check the result of the LAPACK call */                                                                       \
+    if (info < 0 || !m_singularValues.allFinite()) {                                                                \
+      m_info = InvalidInput;                                                                                        \
+    } else if (info > 0) {                                                                                          \
+      m_info = NoConvergence;                                                                                       \
+    } else {                                                                                                        \
+      m_info = Success;                                                                                             \
+      if (computeV()) m_matrixV = localV.adjoint();                                                                 \
+    }                                                                                                               \
+    /* for(int i=0;i<diagSize();i++) if (m_singularValues.coeffRef(i) < precision) { m_nonzeroSingularValues--;     \
+     * m_singularValues.coeffRef(i)=RealScalar(0);}*/                                                               \
+    m_isInitialized = true;                                                                                         \
+    return *this;                                                                                                   \
+  }
+
+#define EIGEN_LAPACK_SVD_OPTIONS(OPTIONS)                                                            \
+  EIGEN_LAPACKE_SVD(double, double, double, d, ColMajor, LAPACK_COL_MAJOR, OPTIONS)                  \
+  EIGEN_LAPACKE_SVD(float, float, float, s, ColMajor, LAPACK_COL_MAJOR, OPTIONS)                     \
+  EIGEN_LAPACKE_SVD(dcomplex, lapack_complex_double, double, z, ColMajor, LAPACK_COL_MAJOR, OPTIONS) \
+  EIGEN_LAPACKE_SVD(scomplex, lapack_complex_float, float, c, ColMajor, LAPACK_COL_MAJOR, OPTIONS)   \
+                                                                                                     \
+  EIGEN_LAPACKE_SVD(double, double, double, d, RowMajor, LAPACK_ROW_MAJOR, OPTIONS)                  \
+  EIGEN_LAPACKE_SVD(float, float, float, s, RowMajor, LAPACK_ROW_MAJOR, OPTIONS)                     \
+  EIGEN_LAPACKE_SVD(dcomplex, lapack_complex_double, double, z, RowMajor, LAPACK_ROW_MAJOR, OPTIONS) \
+  EIGEN_LAPACKE_SVD(scomplex, lapack_complex_float, float, c, RowMajor, LAPACK_ROW_MAJOR, OPTIONS)
+
+EIGEN_LAPACK_SVD_OPTIONS(0)
+EIGEN_LAPACK_SVD_OPTIONS(ComputeThinU)
+EIGEN_LAPACK_SVD_OPTIONS(ComputeThinV)
+EIGEN_LAPACK_SVD_OPTIONS(ComputeFullU)
+EIGEN_LAPACK_SVD_OPTIONS(ComputeFullV)
+EIGEN_LAPACK_SVD_OPTIONS(ComputeThinU | ComputeThinV)
+EIGEN_LAPACK_SVD_OPTIONS(ComputeFullU | ComputeFullV)
+EIGEN_LAPACK_SVD_OPTIONS(ComputeThinU | ComputeFullV)
+EIGEN_LAPACK_SVD_OPTIONS(ComputeFullU | ComputeThinV)
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_JACOBISVD_LAPACKE_H
diff --git a/inst/include/Eigen/src/SVD/JacobiSVD_MKL.h b/inst/include/Eigen/src/SVD/JacobiSVD_MKL.h
deleted file mode 100644
index decda754..00000000
--- a/inst/include/Eigen/src/SVD/JacobiSVD_MKL.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- Copyright (c) 2011, Intel Corporation. All rights reserved.
-
- Redistribution and use in source and binary forms, with or without modification,
- are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
-   list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above copyright notice,
-   this list of conditions and the following disclaimer in the documentation
-   and/or other materials provided with the distribution.
- * Neither the name of Intel Corporation nor the names of its contributors may
-   be used to endorse or promote products derived from this software without
-   specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
- ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
- ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- ********************************************************************************
- *   Content : Eigen bindings to Intel(R) MKL
- *    Singular Value Decomposition - SVD.
- ********************************************************************************
-*/
-
-#ifndef EIGEN_JACOBISVD_MKL_H
-#define EIGEN_JACOBISVD_MKL_H
-
-#include "Eigen/src/Core/util/MKL_support.h"
-
-namespace Eigen { 
-
-/** \internal Specialization for the data types supported by MKL */
-
-#define EIGEN_MKL_SVD(EIGTYPE, MKLTYPE, MKLRTYPE, MKLPREFIX, EIGCOLROW, MKLCOLROW) \
-template<> inline \
-JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, ColPivHouseholderQRPreconditioner>& \
-JacobiSVD<Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>, ColPivHouseholderQRPreconditioner>::compute(const Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic>& matrix, unsigned int computationOptions) \
-{ \
-  typedef Matrix<EIGTYPE, Dynamic, Dynamic, EIGCOLROW, Dynamic, Dynamic> MatrixType; \
-  typedef MatrixType::Scalar Scalar; \
-  typedef MatrixType::RealScalar RealScalar; \
-  allocate(matrix.rows(), matrix.cols(), computationOptions); \
-\
-  /*const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();*/ \
-  m_nonzeroSingularValues = m_diagSize; \
-\
-  lapack_int lda = matrix.outerStride(), ldu, ldvt; \
-  lapack_int matrix_order = MKLCOLROW; \
-  char jobu, jobvt; \
-  MKLTYPE *u, *vt, dummy; \
-  jobu  = (m_computeFullU) ? 'A' : (m_computeThinU) ? 'S' : 'N'; \
-  jobvt = (m_computeFullV) ? 'A' : (m_computeThinV) ? 'S' : 'N'; \
-  if (computeU()) { \
-    ldu  = m_matrixU.outerStride(); \
-    u    = (MKLTYPE*)m_matrixU.data(); \
-  } else { ldu=1; u=&dummy; }\
-  MatrixType localV; \
-  ldvt = (m_computeFullV) ? m_cols : (m_computeThinV) ? m_diagSize : 1; \
-  if (computeV()) { \
-    localV.resize(ldvt, m_cols); \
-    vt   = (MKLTYPE*)localV.data(); \
-  } else { ldvt=1; vt=&dummy; }\
-  Matrix<MKLRTYPE, Dynamic, Dynamic> superb; superb.resize(m_diagSize, 1); \
-  MatrixType m_temp; m_temp = matrix; \
-  LAPACKE_##MKLPREFIX##gesvd( matrix_order, jobu, jobvt, m_rows, m_cols, (MKLTYPE*)m_temp.data(), lda, (MKLRTYPE*)m_singularValues.data(), u, ldu, vt, ldvt, superb.data()); \
-  if (computeV()) m_matrixV = localV.adjoint(); \
- /* for(int i=0;i<m_diagSize;i++) if (m_singularValues.coeffRef(i) < precision) { m_nonzeroSingularValues--; m_singularValues.coeffRef(i)=RealScalar(0);}*/ \
-  m_isInitialized = true; \
-  return *this; \
-}
-
-EIGEN_MKL_SVD(double,   double,        double, d, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SVD(float,    float,         float , s, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SVD(dcomplex, MKL_Complex16, double, z, ColMajor, LAPACK_COL_MAJOR)
-EIGEN_MKL_SVD(scomplex, MKL_Complex8,  float , c, ColMajor, LAPACK_COL_MAJOR)
-
-EIGEN_MKL_SVD(double,   double,        double, d, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SVD(float,    float,         float , s, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SVD(dcomplex, MKL_Complex16, double, z, RowMajor, LAPACK_ROW_MAJOR)
-EIGEN_MKL_SVD(scomplex, MKL_Complex8,  float , c, RowMajor, LAPACK_ROW_MAJOR)
-
-} // end namespace Eigen
-
-#endif // EIGEN_JACOBISVD_MKL_H
diff --git a/inst/include/Eigen/src/SVD/SVDBase.h b/inst/include/Eigen/src/SVD/SVDBase.h
new file mode 100644
index 00000000..dcb4dba2
--- /dev/null
+++ b/inst/include/Eigen/src/SVD/SVDBase.h
@@ -0,0 +1,436 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// Copyright (C) 2013 Gauthier Brun <brun.gauthier@gmail.com>
+// Copyright (C) 2013 Nicolas Carre <nicolas.carre@ensimag.fr>
+// Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
+// Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SVDBASE_H
+#define EIGEN_SVDBASE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+enum OptionsMasks {
+  QRPreconditionerBits = NoQRPreconditioner | HouseholderQRPreconditioner | ColPivHouseholderQRPreconditioner |
+                         FullPivHouseholderQRPreconditioner,
+  ComputationOptionsBits = ComputeThinU | ComputeFullU | ComputeThinV | ComputeFullV
+};
+
+constexpr int get_qr_preconditioner(int options) { return options & QRPreconditionerBits; }
+
+constexpr int get_computation_options(int options) { return options & ComputationOptionsBits; }
+
+constexpr bool should_svd_compute_thin_u(int options) { return (options & ComputeThinU) != 0; }
+constexpr bool should_svd_compute_full_u(int options) { return (options & ComputeFullU) != 0; }
+constexpr bool should_svd_compute_thin_v(int options) { return (options & ComputeThinV) != 0; }
+constexpr bool should_svd_compute_full_v(int options) { return (options & ComputeFullV) != 0; }
+
+template <typename MatrixType, int Options>
+void check_svd_options_assertions(unsigned int computationOptions, Index rows, Index cols) {
+  EIGEN_STATIC_ASSERT((Options & ComputationOptionsBits) == 0,
+                      "SVDBase: Cannot request U or V using both static and runtime options, even if they match. "
+                      "Requesting unitaries at runtime is DEPRECATED: "
+                      "Prefer requesting unitaries statically, using the Options template parameter.");
+  eigen_assert(
+      !(should_svd_compute_thin_u(computationOptions) && cols < rows && MatrixType::RowsAtCompileTime != Dynamic) &&
+      !(should_svd_compute_thin_v(computationOptions) && rows < cols && MatrixType::ColsAtCompileTime != Dynamic) &&
+      "SVDBase: If thin U is requested at runtime, your matrix must have more rows than columns or a dynamic number of "
+      "rows."
+      "Similarly, if thin V is requested at runtime, you matrix must have more columns than rows or a dynamic number "
+      "of columns.");
+  (void)computationOptions;
+  (void)rows;
+  (void)cols;
+}
+
+template <typename Derived>
+struct traits<SVDBase<Derived> > : traits<Derived> {
+  typedef MatrixXpr XprKind;
+  typedef SolverStorage StorageKind;
+  typedef int StorageIndex;
+  enum { Flags = 0 };
+};
+
+template <typename MatrixType, int Options_>
+struct svd_traits : traits<MatrixType> {
+  static constexpr int Options = Options_;
+  static constexpr bool ShouldComputeFullU = internal::should_svd_compute_full_u(Options);
+  static constexpr bool ShouldComputeThinU = internal::should_svd_compute_thin_u(Options);
+  static constexpr bool ShouldComputeFullV = internal::should_svd_compute_full_v(Options);
+  static constexpr bool ShouldComputeThinV = internal::should_svd_compute_thin_v(Options);
+  enum {
+    DiagSizeAtCompileTime =
+        internal::min_size_prefer_dynamic(MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime),
+    MaxDiagSizeAtCompileTime =
+        internal::min_size_prefer_dynamic(MatrixType::MaxRowsAtCompileTime, MatrixType::MaxColsAtCompileTime),
+    MatrixUColsAtCompileTime = ShouldComputeThinU ? DiagSizeAtCompileTime : MatrixType::RowsAtCompileTime,
+    MatrixVColsAtCompileTime = ShouldComputeThinV ? DiagSizeAtCompileTime : MatrixType::ColsAtCompileTime,
+    MatrixUMaxColsAtCompileTime = ShouldComputeThinU ? MaxDiagSizeAtCompileTime : MatrixType::MaxRowsAtCompileTime,
+    MatrixVMaxColsAtCompileTime = ShouldComputeThinV ? MaxDiagSizeAtCompileTime : MatrixType::MaxColsAtCompileTime
+  };
+};
+}  // namespace internal
+
+/** \ingroup SVD_Module
+ *
+ *
+ * \class SVDBase
+ *
+ * \brief Base class of SVD algorithms
+ *
+ * \tparam Derived the type of the actual SVD decomposition
+ *
+ * SVD decomposition consists in decomposing any n-by-p matrix \a A as a product
+ *   \f[ A = U S V^* \f]
+ * where \a U is a n-by-n unitary, \a V is a p-by-p unitary, and \a S is a n-by-p real positive matrix which is zero
+ * outside of its main diagonal; the diagonal entries of S are known as the \em singular \em values of \a A and the
+ * columns of \a U and \a V are known as the left and right \em singular \em vectors of \a A respectively.
+ *
+ * Singular values are always sorted in decreasing order.
+ *
+ *
+ * You can ask for only \em thin \a U or \a V to be computed, meaning the following. In case of a rectangular n-by-p
+ * matrix, letting \a m be the smaller value among \a n and \a p, there are only \a m singular vectors; the remaining
+ * columns of \a U and \a V do not correspond to actual singular vectors. Asking for \em thin \a U or \a V means asking
+ * for only their \a m first columns to be formed. So \a U is then a n-by-m matrix, and \a V is then a p-by-m matrix.
+ * Notice that thin \a U and \a V are all you need for (least squares) solving.
+ *
+ * The status of the computation can be retrieved using the \a info() method. Unless \a info() returns \a Success, the
+ * results should be not considered well defined.
+ *
+ * If the input matrix has inf or nan coefficients, the result of the computation is undefined, and \a info() will
+ * return \a InvalidInput, but the computation is guaranteed to terminate in finite (and reasonable) time. \sa class
+ * BDCSVD, class JacobiSVD
+ */
+template <typename Derived>
+class SVDBase : public SolverBase<SVDBase<Derived> > {
+ public:
+  template <typename Derived_>
+  friend struct internal::solve_assertion;
+
+  typedef typename internal::traits<Derived>::MatrixType MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+  typedef typename Eigen::internal::traits<SVDBase>::StorageIndex StorageIndex;
+
+  static constexpr bool ShouldComputeFullU = internal::traits<Derived>::ShouldComputeFullU;
+  static constexpr bool ShouldComputeThinU = internal::traits<Derived>::ShouldComputeThinU;
+  static constexpr bool ShouldComputeFullV = internal::traits<Derived>::ShouldComputeFullV;
+  static constexpr bool ShouldComputeThinV = internal::traits<Derived>::ShouldComputeThinV;
+
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    DiagSizeAtCompileTime = internal::min_size_prefer_dynamic(RowsAtCompileTime, ColsAtCompileTime),
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
+    MaxDiagSizeAtCompileTime = internal::min_size_prefer_fixed(MaxRowsAtCompileTime, MaxColsAtCompileTime),
+    MatrixOptions = internal::traits<MatrixType>::Options,
+    MatrixUColsAtCompileTime = internal::traits<Derived>::MatrixUColsAtCompileTime,
+    MatrixVColsAtCompileTime = internal::traits<Derived>::MatrixVColsAtCompileTime,
+    MatrixUMaxColsAtCompileTime = internal::traits<Derived>::MatrixUMaxColsAtCompileTime,
+    MatrixVMaxColsAtCompileTime = internal::traits<Derived>::MatrixVMaxColsAtCompileTime
+  };
+
+  EIGEN_STATIC_ASSERT(!(ShouldComputeFullU && ShouldComputeThinU), "SVDBase: Cannot request both full and thin U")
+  EIGEN_STATIC_ASSERT(!(ShouldComputeFullV && ShouldComputeThinV), "SVDBase: Cannot request both full and thin V")
+
+  typedef
+      typename internal::make_proper_matrix_type<Scalar, RowsAtCompileTime, MatrixUColsAtCompileTime, MatrixOptions,
+                                                 MaxRowsAtCompileTime, MatrixUMaxColsAtCompileTime>::type MatrixUType;
+  typedef
+      typename internal::make_proper_matrix_type<Scalar, ColsAtCompileTime, MatrixVColsAtCompileTime, MatrixOptions,
+                                                 MaxColsAtCompileTime, MatrixVMaxColsAtCompileTime>::type MatrixVType;
+
+  typedef typename internal::plain_diag_type<MatrixType, RealScalar>::type SingularValuesType;
+
+  Derived& derived() { return *static_cast<Derived*>(this); }
+  const Derived& derived() const { return *static_cast<const Derived*>(this); }
+
+  /** \returns the \a U matrix.
+   *
+   * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p,
+   * the U matrix is n-by-n if you asked for \link Eigen::ComputeFullU ComputeFullU \endlink, and is n-by-m if you asked
+   * for \link Eigen::ComputeThinU ComputeThinU \endlink.
+   *
+   * The \a m first columns of \a U are the left singular vectors of the matrix being decomposed.
+   *
+   * This method asserts that you asked for \a U to be computed.
+   */
+  const MatrixUType& matrixU() const {
+    _check_compute_assertions();
+    eigen_assert(computeU() && "This SVD decomposition didn't compute U. Did you ask for it?");
+    return m_matrixU;
+  }
+
+  /** \returns the \a V matrix.
+   *
+   * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p,
+   * the V matrix is p-by-p if you asked for \link Eigen::ComputeFullV ComputeFullV \endlink, and is p-by-m if you asked
+   * for \link Eigen::ComputeThinV ComputeThinV \endlink.
+   *
+   * The \a m first columns of \a V are the right singular vectors of the matrix being decomposed.
+   *
+   * This method asserts that you asked for \a V to be computed.
+   */
+  const MatrixVType& matrixV() const {
+    _check_compute_assertions();
+    eigen_assert(computeV() && "This SVD decomposition didn't compute V. Did you ask for it?");
+    return m_matrixV;
+  }
+
+  /** \returns the vector of singular values.
+   *
+   * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p, the
+   * returned vector has size \a m.  Singular values are always sorted in decreasing order.
+   */
+  const SingularValuesType& singularValues() const {
+    _check_compute_assertions();
+    return m_singularValues;
+  }
+
+  /** \returns the number of singular values that are not exactly 0 */
+  Index nonzeroSingularValues() const {
+    _check_compute_assertions();
+    return m_nonzeroSingularValues;
+  }
+
+  /** \returns the rank of the matrix of which \c *this is the SVD.
+   *
+   * \note This method has to determine which singular values should be considered nonzero.
+   *       For that, it uses the threshold value that you can control by calling
+   *       setThreshold(const RealScalar&).
+   */
+  inline Index rank() const {
+    using std::abs;
+    _check_compute_assertions();
+    if (m_singularValues.size() == 0) return 0;
+    RealScalar premultiplied_threshold =
+        numext::maxi<RealScalar>(m_singularValues.coeff(0) * threshold(), (std::numeric_limits<RealScalar>::min)());
+    Index i = m_nonzeroSingularValues - 1;
+    while (i >= 0 && m_singularValues.coeff(i) < premultiplied_threshold) --i;
+    return i + 1;
+  }
+
+  /** Allows to prescribe a threshold to be used by certain methods, such as rank() and solve(),
+   * which need to determine when singular values are to be considered nonzero.
+   * This is not used for the SVD decomposition itself.
+   *
+   * When it needs to get the threshold value, Eigen calls threshold().
+   * The default is \c NumTraits<Scalar>::epsilon()
+   *
+   * \param threshold The new value to use as the threshold.
+   *
+   * A singular value will be considered nonzero if its value is strictly greater than
+   *  \f$ \vert singular value \vert \leqslant threshold \times \vert max singular value \vert \f$.
+   *
+   * If you want to come back to the default behavior, call setThreshold(Default_t)
+   */
+  Derived& setThreshold(const RealScalar& threshold) {
+    m_usePrescribedThreshold = true;
+    m_prescribedThreshold = threshold;
+    return derived();
+  }
+
+  /** Allows to come back to the default behavior, letting Eigen use its default formula for
+   * determining the threshold.
+   *
+   * You should pass the special object Eigen::Default as parameter here.
+   * \code svd.setThreshold(Eigen::Default); \endcode
+   *
+   * See the documentation of setThreshold(const RealScalar&).
+   */
+  Derived& setThreshold(Default_t) {
+    m_usePrescribedThreshold = false;
+    return derived();
+  }
+
+  /** Returns the threshold that will be used by certain methods such as rank().
+   *
+   * See the documentation of setThreshold(const RealScalar&).
+   */
+  RealScalar threshold() const {
+    eigen_assert(m_isInitialized || m_usePrescribedThreshold);
+    // this temporary is needed to workaround a MSVC issue
+    Index diagSize = (std::max<Index>)(1, m_diagSize);
+    return m_usePrescribedThreshold ? m_prescribedThreshold : RealScalar(diagSize) * NumTraits<Scalar>::epsilon();
+  }
+
+  /** \returns true if \a U (full or thin) is asked for in this SVD decomposition */
+  inline bool computeU() const { return m_computeFullU || m_computeThinU; }
+  /** \returns true if \a V (full or thin) is asked for in this SVD decomposition */
+  inline bool computeV() const { return m_computeFullV || m_computeThinV; }
+
+  inline Index rows() const { return m_rows.value(); }
+  inline Index cols() const { return m_cols.value(); }
+  inline Index diagSize() const { return m_diagSize.value(); }
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A.
+   *
+   * \param b the right-hand-side of the equation to solve.
+   *
+   * \note Solving requires both U and V to be computed. Thin U and V are enough, there is no need for full U or V.
+   *
+   * \note SVD solving is implicitly least-squares. Thus, this method serves both purposes of exact solving and
+   * least-squares solving. In other words, the returned solution is guaranteed to minimize the Euclidean norm \f$ \Vert
+   * A x - b \Vert \f$.
+   */
+  template <typename Rhs>
+  inline const Solve<Derived, Rhs> solve(const MatrixBase<Rhs>& b) const;
+#endif
+
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful.
+   */
+  EIGEN_DEVICE_FUNC ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "SVD is not initialized.");
+    return m_info;
+  }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  template <typename RhsType, typename DstType>
+  void _solve_impl(const RhsType& rhs, DstType& dst) const;
+
+  template <bool Conjugate, typename RhsType, typename DstType>
+  void _solve_impl_transposed(const RhsType& rhs, DstType& dst) const;
+#endif
+
+ protected:
+  EIGEN_STATIC_ASSERT_NON_INTEGER(Scalar)
+
+  void _check_compute_assertions() const { eigen_assert(m_isInitialized && "SVD is not initialized."); }
+
+  template <bool Transpose_, typename Rhs>
+  void _check_solve_assertion(const Rhs& b) const {
+    EIGEN_ONLY_USED_FOR_DEBUG(b);
+    _check_compute_assertions();
+    eigen_assert(computeU() && computeV() &&
+                 "SVDBase::solve(): Both unitaries U and V are required to be computed (thin unitaries suffice).");
+    eigen_assert((Transpose_ ? cols() : rows()) == b.rows() &&
+                 "SVDBase::solve(): invalid number of rows of the right hand side matrix b");
+  }
+
+  // return true if already allocated
+  bool allocate(Index rows, Index cols, unsigned int computationOptions);
+
+  MatrixUType m_matrixU;
+  MatrixVType m_matrixV;
+  SingularValuesType m_singularValues;
+  ComputationInfo m_info;
+  bool m_isInitialized, m_isAllocated, m_usePrescribedThreshold;
+  bool m_computeFullU, m_computeThinU;
+  bool m_computeFullV, m_computeThinV;
+  unsigned int m_computationOptions;
+  Index m_nonzeroSingularValues;
+  internal::variable_if_dynamic<Index, RowsAtCompileTime> m_rows;
+  internal::variable_if_dynamic<Index, ColsAtCompileTime> m_cols;
+  internal::variable_if_dynamic<Index, DiagSizeAtCompileTime> m_diagSize;
+  RealScalar m_prescribedThreshold;
+
+  /** \brief Default Constructor.
+   *
+   * Default constructor of SVDBase
+   */
+  SVDBase()
+      : m_matrixU(MatrixUType()),
+        m_matrixV(MatrixVType()),
+        m_singularValues(SingularValuesType()),
+        m_info(Success),
+        m_isInitialized(false),
+        m_isAllocated(false),
+        m_usePrescribedThreshold(false),
+        m_computeFullU(ShouldComputeFullU),
+        m_computeThinU(ShouldComputeThinU),
+        m_computeFullV(ShouldComputeFullV),
+        m_computeThinV(ShouldComputeThinV),
+        m_computationOptions(internal::traits<Derived>::Options),
+        m_nonzeroSingularValues(0),
+        m_rows(RowsAtCompileTime),
+        m_cols(ColsAtCompileTime),
+        m_diagSize(DiagSizeAtCompileTime),
+        m_prescribedThreshold(0) {}
+};
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template <typename Derived>
+template <typename RhsType, typename DstType>
+void SVDBase<Derived>::_solve_impl(const RhsType& rhs, DstType& dst) const {
+  // A = U S V^*
+  // So A^{-1} = V S^{-1} U^*
+
+  Matrix<typename RhsType::Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime,
+         RhsType::MaxColsAtCompileTime>
+      tmp;
+  Index l_rank = rank();
+  tmp.noalias() = m_matrixU.leftCols(l_rank).adjoint() * rhs;
+  tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp;
+  dst.noalias() = m_matrixV.leftCols(l_rank) * tmp;
+}
+
+template <typename Derived>
+template <bool Conjugate, typename RhsType, typename DstType>
+void SVDBase<Derived>::_solve_impl_transposed(const RhsType& rhs, DstType& dst) const {
+  // A = U S V^*
+  // So  A^{-*} = U S^{-1} V^*
+  // And A^{-T} = U_conj S^{-1} V^T
+  Matrix<typename RhsType::Scalar, Dynamic, RhsType::ColsAtCompileTime, 0, MatrixType::MaxRowsAtCompileTime,
+         RhsType::MaxColsAtCompileTime>
+      tmp;
+  Index l_rank = rank();
+
+  tmp.noalias() = m_matrixV.leftCols(l_rank).transpose().template conjugateIf<Conjugate>() * rhs;
+  tmp = m_singularValues.head(l_rank).asDiagonal().inverse() * tmp;
+  dst = m_matrixU.template conjugateIf<!Conjugate>().leftCols(l_rank) * tmp;
+}
+#endif
+
+template <typename Derived>
+bool SVDBase<Derived>::allocate(Index rows, Index cols, unsigned int computationOptions) {
+  eigen_assert(rows >= 0 && cols >= 0);
+
+  if (m_isAllocated && rows == m_rows.value() && cols == m_cols.value() && computationOptions == m_computationOptions) {
+    return true;
+  }
+
+  m_rows.setValue(rows);
+  m_cols.setValue(cols);
+  m_info = Success;
+  m_isInitialized = false;
+  m_isAllocated = true;
+  m_computationOptions = computationOptions;
+  m_computeFullU = ShouldComputeFullU || internal::should_svd_compute_full_u(computationOptions);
+  m_computeThinU = ShouldComputeThinU || internal::should_svd_compute_thin_u(computationOptions);
+  m_computeFullV = ShouldComputeFullV || internal::should_svd_compute_full_v(computationOptions);
+  m_computeThinV = ShouldComputeThinV || internal::should_svd_compute_thin_v(computationOptions);
+
+  eigen_assert(!(m_computeFullU && m_computeThinU) && "SVDBase: you can't ask for both full and thin U");
+  eigen_assert(!(m_computeFullV && m_computeThinV) && "SVDBase: you can't ask for both full and thin V");
+
+  m_diagSize.setValue(numext::mini(m_rows.value(), m_cols.value()));
+  m_singularValues.resize(m_diagSize.value());
+  if (RowsAtCompileTime == Dynamic)
+    m_matrixU.resize(m_rows.value(), m_computeFullU ? m_rows.value() : m_computeThinU ? m_diagSize.value() : 0);
+  if (ColsAtCompileTime == Dynamic)
+    m_matrixV.resize(m_cols.value(), m_computeFullV ? m_cols.value() : m_computeThinV ? m_diagSize.value() : 0);
+
+  return false;
+}
+
+}  // namespace Eigen
+
+#endif  // EIGEN_SVDBASE_H
diff --git a/inst/include/Eigen/src/SVD/UpperBidiagonalization.h b/inst/include/Eigen/src/SVD/UpperBidiagonalization.h
index 587de37a..6df6318c 100644
--- a/inst/include/Eigen/src/SVD/UpperBidiagonalization.h
+++ b/inst/include/Eigen/src/SVD/UpperBidiagonalization.h
@@ -2,6 +2,7 @@
 // for linear algebra.
 //
 // Copyright (C) 2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2013-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,120 +11,350 @@
 #ifndef EIGEN_BIDIAGONALIZATION_H
 #define EIGEN_BIDIAGONALIZATION_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 // UpperBidiagonalization will probably be replaced by a Bidiagonalization class, don't want to make it stable API.
 // At the same time, it's useful to keep for now as it's about the only thing that is testing the BandMatrix class.
 
-template<typename _MatrixType> class UpperBidiagonalization
-{
-  public:
-
-    typedef _MatrixType MatrixType;
-    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      ColsAtCompileTimeMinusOne = internal::decrement_size<ColsAtCompileTime>::ret
-    };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef Matrix<Scalar, 1, ColsAtCompileTime> RowVectorType;
-    typedef Matrix<Scalar, RowsAtCompileTime, 1> ColVectorType;
-    typedef BandMatrix<RealScalar, ColsAtCompileTime, ColsAtCompileTime, 1, 0> BidiagonalType;
-    typedef Matrix<Scalar, ColsAtCompileTime, 1> DiagVectorType;
-    typedef Matrix<Scalar, ColsAtCompileTimeMinusOne, 1> SuperDiagVectorType;
-    typedef HouseholderSequence<
-              const MatrixType,
-              CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Diagonal<const MatrixType,0> >
-            > HouseholderUSequenceType;
-    typedef HouseholderSequence<
-              const typename internal::remove_all<typename MatrixType::ConjugateReturnType>::type,
-              Diagonal<const MatrixType,1>,
-              OnTheRight
-            > HouseholderVSequenceType;
-    
-    /**
-    * \brief Default Constructor.
-    *
-    * The default constructor is useful in cases in which the user intends to
-    * perform decompositions via Bidiagonalization::compute(const MatrixType&).
-    */
-    UpperBidiagonalization() : m_householder(), m_bidiagonal(), m_isInitialized(false) {}
-
-    UpperBidiagonalization(const MatrixType& matrix)
+template <typename MatrixType_>
+class UpperBidiagonalization {
+ public:
+  typedef MatrixType_ MatrixType;
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    ColsAtCompileTimeMinusOne = internal::decrement_size<ColsAtCompileTime>::ret
+  };
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef Eigen::Index Index;  ///< \deprecated since Eigen 3.3
+  typedef Matrix<Scalar, 1, ColsAtCompileTime> RowVectorType;
+  typedef Matrix<Scalar, RowsAtCompileTime, 1> ColVectorType;
+  typedef BandMatrix<RealScalar, ColsAtCompileTime, ColsAtCompileTime, 1, 0, RowMajor> BidiagonalType;
+  typedef Matrix<Scalar, ColsAtCompileTime, 1> DiagVectorType;
+  typedef Matrix<Scalar, ColsAtCompileTimeMinusOne, 1> SuperDiagVectorType;
+  typedef HouseholderSequence<
+      const MatrixType, const internal::remove_all_t<typename Diagonal<const MatrixType, 0>::ConjugateReturnType> >
+      HouseholderUSequenceType;
+  typedef HouseholderSequence<const internal::remove_all_t<typename MatrixType::ConjugateReturnType>,
+                              Diagonal<const MatrixType, 1>, OnTheRight>
+      HouseholderVSequenceType;
+
+  /**
+   * \brief Default Constructor.
+   *
+   * The default constructor is useful in cases in which the user intends to
+   * perform decompositions via Bidiagonalization::compute(const MatrixType&).
+   */
+  UpperBidiagonalization() : m_householder(), m_bidiagonal(0, 0), m_isInitialized(false) {}
+
+  explicit UpperBidiagonalization(const MatrixType& matrix)
       : m_householder(matrix.rows(), matrix.cols()),
         m_bidiagonal(matrix.cols(), matrix.cols()),
-        m_isInitialized(false)
-    {
-      compute(matrix);
-    }
-    
-    UpperBidiagonalization& compute(const MatrixType& matrix);
-    
-    const MatrixType& householder() const { return m_householder; }
-    const BidiagonalType& bidiagonal() const { return m_bidiagonal; }
-    
-    const HouseholderUSequenceType householderU() const
-    {
-      eigen_assert(m_isInitialized && "UpperBidiagonalization is not initialized.");
-      return HouseholderUSequenceType(m_householder, m_householder.diagonal().conjugate());
-    }
+        m_isInitialized(false) {
+    compute(matrix);
+  }
+
+  UpperBidiagonalization(Index rows, Index cols)
+      : m_householder(rows, cols), m_bidiagonal(cols, cols), m_isInitialized(false) {}
+
+  UpperBidiagonalization& compute(const MatrixType& matrix);
+  UpperBidiagonalization& computeUnblocked(const MatrixType& matrix);
+
+  const MatrixType& householder() const { return m_householder; }
+  const BidiagonalType& bidiagonal() const { return m_bidiagonal; }
+
+  const HouseholderUSequenceType householderU() const {
+    eigen_assert(m_isInitialized && "UpperBidiagonalization is not initialized.");
+    return HouseholderUSequenceType(m_householder, m_householder.diagonal().conjugate());
+  }
+
+  const HouseholderVSequenceType householderV()  // const here gives nasty errors and i'm lazy
+  {
+    eigen_assert(m_isInitialized && "UpperBidiagonalization is not initialized.");
+    return HouseholderVSequenceType(m_householder.conjugate(), m_householder.const_derived().template diagonal<1>())
+        .setLength(m_householder.cols() - 1)
+        .setShift(1);
+  }
+
+ protected:
+  MatrixType m_householder;
+  BidiagonalType m_bidiagonal;
+  bool m_isInitialized;
+};
+
+// Standard upper bidiagonalization without fancy optimizations
+// This version should be faster for small matrix size
+template <typename MatrixType>
+void upperbidiagonalization_inplace_unblocked(MatrixType& mat, typename MatrixType::RealScalar* diagonal,
+                                              typename MatrixType::RealScalar* upper_diagonal,
+                                              typename MatrixType::Scalar* tempData = 0) {
+  typedef typename MatrixType::Scalar Scalar;
+
+  Index rows = mat.rows();
+  Index cols = mat.cols();
+
+  typedef Matrix<Scalar, Dynamic, 1, ColMajor, MatrixType::MaxRowsAtCompileTime, 1> TempType;
+  TempType tempVector;
+  if (tempData == 0) {
+    tempVector.resize(rows);
+    tempData = tempVector.data();
+  }
+
+  for (Index k = 0; /* breaks at k==cols-1 below */; ++k) {
+    Index remainingRows = rows - k;
+    Index remainingCols = cols - k - 1;
+
+    // construct left householder transform in-place in A
+    mat.col(k).tail(remainingRows).makeHouseholderInPlace(mat.coeffRef(k, k), diagonal[k]);
+    // apply householder transform to remaining part of A on the left
+    mat.bottomRightCorner(remainingRows, remainingCols)
+        .applyHouseholderOnTheLeft(mat.col(k).tail(remainingRows - 1), mat.coeff(k, k), tempData);
+
+    if (k == cols - 1) break;
+
+    // construct right householder transform in-place in mat
+    mat.row(k).tail(remainingCols).makeHouseholderInPlace(mat.coeffRef(k, k + 1), upper_diagonal[k]);
+    // apply householder transform to remaining part of mat on the left
+    mat.bottomRightCorner(remainingRows - 1, remainingCols)
+        .applyHouseholderOnTheRight(mat.row(k).tail(remainingCols - 1).adjoint(), mat.coeff(k, k + 1), tempData);
+  }
+}
+
+/** \internal
+ * Helper routine for the block reduction to upper bidiagonal form.
+ *
+ * Let's partition the matrix A:
+ *
+ *      | A00 A01 |
+ *  A = |         |
+ *      | A10 A11 |
+ *
+ * This function reduces to bidiagonal form the left \c rows x \a blockSize vertical panel [A00/A10]
+ * and the \a blockSize x \c cols horizontal panel [A00 A01] of the matrix \a A. The bottom-right block A11
+ * is updated using matrix-matrix products:
+ *   A22 -= V * Y^T - X * U^T
+ * where V and U contains the left and right Householder vectors. U and V are stored in A10, and A01
+ * respectively, and the update matrices X and Y are computed during the reduction.
+ *
+ */
+template <typename MatrixType>
+void upperbidiagonalization_blocked_helper(
+    MatrixType& A, typename MatrixType::RealScalar* diagonal, typename MatrixType::RealScalar* upper_diagonal, Index bs,
+    Ref<Matrix<typename MatrixType::Scalar, Dynamic, Dynamic, traits<MatrixType>::Flags & RowMajorBit> > X,
+    Ref<Matrix<typename MatrixType::Scalar, Dynamic, Dynamic, traits<MatrixType>::Flags & RowMajorBit> > Y) {
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename NumTraits<RealScalar>::Literal Literal;
+  static constexpr int StorageOrder = (traits<MatrixType>::Flags & RowMajorBit) ? RowMajor : ColMajor;
+  typedef InnerStride<StorageOrder == ColMajor ? 1 : Dynamic> ColInnerStride;
+  typedef InnerStride<StorageOrder == ColMajor ? Dynamic : 1> RowInnerStride;
+  typedef Ref<Matrix<Scalar, Dynamic, 1>, 0, ColInnerStride> SubColumnType;
+  typedef Ref<Matrix<Scalar, 1, Dynamic>, 0, RowInnerStride> SubRowType;
+  typedef Ref<Matrix<Scalar, Dynamic, Dynamic, StorageOrder> > SubMatType;
+
+  Index brows = A.rows();
+  Index bcols = A.cols();
+
+  Scalar tau_u, tau_u_prev(0), tau_v;
+
+  for (Index k = 0; k < bs; ++k) {
+    Index remainingRows = brows - k;
+    Index remainingCols = bcols - k - 1;
+
+    SubMatType X_k1(X.block(k, 0, remainingRows, k));
+    SubMatType V_k1(A.block(k, 0, remainingRows, k));
+
+    // 1 - update the k-th column of A
+    SubColumnType v_k = A.col(k).tail(remainingRows);
+    v_k -= V_k1 * Y.row(k).head(k).adjoint();
+    if (k) v_k.noalias() -= X_k1 * A.col(k).head(k);
+
+    // 2 - construct left Householder transform in-place
+    v_k.makeHouseholderInPlace(tau_v, diagonal[k]);
+
+    if (k + 1 < bcols) {
+      SubMatType Y_k(Y.block(k + 1, 0, remainingCols, k + 1));
+      SubMatType U_k1(A.block(0, k + 1, k, remainingCols));
+
+      // this eases the application of Householder transforAions
+      // A(k,k) will store tau_v later
+      A(k, k) = Scalar(1);
+
+      // 3 - Compute y_k^T = tau_v * ( A^T*v_k - Y_k-1*V_k-1^T*v_k - U_k-1*X_k-1^T*v_k )
+      {
+        SubColumnType y_k(Y.col(k).tail(remainingCols));
+
+        // let's use the beginning of column k of Y as a temporary vector
+        SubColumnType tmp(Y.col(k).head(k));
+        y_k.noalias() = A.block(k, k + 1, remainingRows, remainingCols).adjoint() * v_k;  // bottleneck
+        tmp.noalias() = V_k1.adjoint() * v_k;
+        y_k.noalias() -= Y_k.leftCols(k) * tmp;
+        tmp.noalias() = X_k1.adjoint() * v_k;
+        y_k.noalias() -= U_k1.adjoint() * tmp;
+        y_k *= numext::conj(tau_v);
+      }
 
-    const HouseholderVSequenceType householderV() // const here gives nasty errors and i'm lazy
+      // 4 - update k-th row of A (it will become u_k)
+      SubRowType u_k(A.row(k).tail(remainingCols));
+      u_k = u_k.conjugate();
+      {
+        u_k.noalias() -= Y_k * A.row(k).head(k + 1).adjoint();
+        if (k) u_k -= U_k1.adjoint() * X.row(k).head(k).adjoint();
+      }
+
+      // 5 - construct right Householder transform in-place
+      u_k.makeHouseholderInPlace(tau_u, upper_diagonal[k]);
+
+      // this eases the application of Householder transformations
+      // A(k,k+1) will store tau_u later
+      A(k, k + 1) = Scalar(1);
+
+      // 6 - Compute x_k = tau_u * ( A*u_k - X_k-1*U_k-1^T*u_k - V_k*Y_k^T*u_k )
+      {
+        SubColumnType x_k(X.col(k).tail(remainingRows - 1));
+
+        // let's use the beginning of column k of X as a temporary vectors
+        // note that tmp0 and tmp1 overlaps
+        SubColumnType tmp0(X.col(k).head(k)), tmp1(X.col(k).head(k + 1));
+
+        x_k.noalias() = A.block(k + 1, k + 1, remainingRows - 1, remainingCols) * u_k.transpose();  // bottleneck
+        tmp0.noalias() = U_k1 * u_k.transpose();
+        x_k.noalias() -= X_k1.bottomRows(remainingRows - 1) * tmp0;
+        tmp1.noalias() = Y_k.adjoint() * u_k.transpose();
+        x_k.noalias() -= A.block(k + 1, 0, remainingRows - 1, k + 1) * tmp1;
+        x_k *= numext::conj(tau_u);
+        tau_u = numext::conj(tau_u);
+        u_k = u_k.conjugate();
+      }
+
+      if (k > 0) A.coeffRef(k - 1, k) = tau_u_prev;
+      tau_u_prev = tau_u;
+    } else
+      A.coeffRef(k - 1, k) = tau_u_prev;
+
+    A.coeffRef(k, k) = tau_v;
+  }
+
+  if (bs < bcols) A.coeffRef(bs - 1, bs) = tau_u_prev;
+
+  // update A22
+  if (bcols > bs && brows > bs) {
+    SubMatType A11(A.bottomRightCorner(brows - bs, bcols - bs));
+    SubMatType A10(A.block(bs, 0, brows - bs, bs));
+    SubMatType A01(A.block(0, bs, bs, bcols - bs));
+    Scalar tmp = A01(bs - 1, 0);
+    A01(bs - 1, 0) = Literal(1);
+    A11.noalias() -= A10 * Y.topLeftCorner(bcols, bs).bottomRows(bcols - bs).adjoint();
+    A11.noalias() -= X.topLeftCorner(brows, bs).bottomRows(brows - bs) * A01;
+    A01(bs - 1, 0) = tmp;
+  }
+}
+
+/** \internal
+ *
+ * Implementation of a block-bidiagonal reduction.
+ * It is based on the following paper:
+ *   The Design of a Parallel Dense Linear Algebra Software Library: Reduction to Hessenberg, Tridiagonal, and
+ * Bidiagonal Form. by Jaeyoung Choi, Jack J. Dongarra, David W. Walker. (1995) section 3.3
+ */
+template <typename MatrixType, typename BidiagType>
+void upperbidiagonalization_inplace_blocked(MatrixType& A, BidiagType& bidiagonal, Index maxBlockSize = 32,
+                                            typename MatrixType::Scalar* /*tempData*/ = 0) {
+  typedef typename MatrixType::Scalar Scalar;
+  typedef Block<MatrixType, Dynamic, Dynamic> BlockType;
+
+  Index rows = A.rows();
+  Index cols = A.cols();
+  Index size = (std::min)(rows, cols);
+
+  // X and Y are work space
+  static constexpr int StorageOrder = (traits<MatrixType>::Flags & RowMajorBit) ? RowMajor : ColMajor;
+  Matrix<Scalar, MatrixType::RowsAtCompileTime, Dynamic, StorageOrder, MatrixType::MaxRowsAtCompileTime> X(
+      rows, maxBlockSize);
+  Matrix<Scalar, MatrixType::ColsAtCompileTime, Dynamic, StorageOrder, MatrixType::MaxColsAtCompileTime> Y(
+      cols, maxBlockSize);
+  Index blockSize = (std::min)(maxBlockSize, size);
+
+  Index k = 0;
+  for (k = 0; k < size; k += blockSize) {
+    Index bs = (std::min)(size - k, blockSize);  // actual size of the block
+    Index brows = rows - k;                      // rows of the block
+    Index bcols = cols - k;                      // columns of the block
+
+    // partition the matrix A:
+    //
+    //      | A00 A01 A02 |
+    //      |             |
+    // A  = | A10 A11 A12 |
+    //      |             |
+    //      | A20 A21 A22 |
+    //
+    // where A11 is a bs x bs diagonal block,
+    // and let:
+    //      | A11 A12 |
+    //  B = |         |
+    //      | A21 A22 |
+
+    BlockType B = A.block(k, k, brows, bcols);
+
+    // This stage performs the bidiagonalization of A11, A21, A12, and updating of A22.
+    // Finally, the algorithm continue on the updated A22.
+    //
+    // However, if B is too small, or A22 empty, then let's use an unblocked strategy
+
+    auto upper_diagonal = bidiagonal.template diagonal<1>();
+    typename MatrixType::RealScalar* upper_diagonal_ptr =
+        upper_diagonal.size() > 0 ? &upper_diagonal.coeffRef(k) : nullptr;
+
+    if (k + bs == cols || bcols < 48)  // somewhat arbitrary threshold
     {
-      eigen_assert(m_isInitialized && "UpperBidiagonalization is not initialized.");
-      return HouseholderVSequenceType(m_householder.conjugate(), m_householder.const_derived().template diagonal<1>())
-             .setLength(m_householder.cols()-1)
-             .setShift(1);
+      upperbidiagonalization_inplace_unblocked(B, &(bidiagonal.template diagonal<0>().coeffRef(k)), upper_diagonal_ptr,
+                                               X.data());
+      break;  // We're done
+    } else {
+      upperbidiagonalization_blocked_helper<BlockType>(B, &(bidiagonal.template diagonal<0>().coeffRef(k)),
+                                                       upper_diagonal_ptr, bs, X.topLeftCorner(brows, bs),
+                                                       Y.topLeftCorner(bcols, bs));
     }
-    
-  protected:
-    MatrixType m_householder;
-    BidiagonalType m_bidiagonal;
-    bool m_isInitialized;
-};
+  }
+}
 
-template<typename _MatrixType>
-UpperBidiagonalization<_MatrixType>& UpperBidiagonalization<_MatrixType>::compute(const _MatrixType& matrix)
-{
+template <typename MatrixType_>
+UpperBidiagonalization<MatrixType_>& UpperBidiagonalization<MatrixType_>::computeUnblocked(const MatrixType_& matrix) {
   Index rows = matrix.rows();
   Index cols = matrix.cols();
-  
-  eigen_assert(rows >= cols && "UpperBidiagonalization is only for matrices satisfying rows>=cols.");
-  
+  EIGEN_ONLY_USED_FOR_DEBUG(cols);
+
+  eigen_assert(rows >= cols && "UpperBidiagonalization is only for Arices satisfying rows>=cols.");
+
   m_householder = matrix;
 
   ColVectorType temp(rows);
 
-  for (Index k = 0; /* breaks at k==cols-1 below */ ; ++k)
-  {
-    Index remainingRows = rows - k;
-    Index remainingCols = cols - k - 1;
+  upperbidiagonalization_inplace_unblocked(m_householder, &(m_bidiagonal.template diagonal<0>().coeffRef(0)),
+                                           &(m_bidiagonal.template diagonal<1>().coeffRef(0)), temp.data());
+
+  m_isInitialized = true;
+  return *this;
+}
+
+template <typename MatrixType_>
+UpperBidiagonalization<MatrixType_>& UpperBidiagonalization<MatrixType_>::compute(const MatrixType_& matrix) {
+  Index rows = matrix.rows();
+  Index cols = matrix.cols();
+  EIGEN_ONLY_USED_FOR_DEBUG(rows);
+  EIGEN_ONLY_USED_FOR_DEBUG(cols);
+
+  eigen_assert(rows >= cols && "UpperBidiagonalization is only for Arices satisfying rows>=cols.");
+
+  m_householder = matrix;
+  upperbidiagonalization_inplace_blocked(m_householder, m_bidiagonal);
 
-    // construct left householder transform in-place in m_householder
-    m_householder.col(k).tail(remainingRows)
-                 .makeHouseholderInPlace(m_householder.coeffRef(k,k),
-                                         m_bidiagonal.template diagonal<0>().coeffRef(k));
-    // apply householder transform to remaining part of m_householder on the left
-    m_householder.bottomRightCorner(remainingRows, remainingCols)
-                 .applyHouseholderOnTheLeft(m_householder.col(k).tail(remainingRows-1),
-                                            m_householder.coeff(k,k),
-                                            temp.data());
-
-    if(k == cols-1) break;
-    
-    // construct right householder transform in-place in m_householder
-    m_householder.row(k).tail(remainingCols)
-                 .makeHouseholderInPlace(m_householder.coeffRef(k,k+1),
-                                         m_bidiagonal.template diagonal<1>().coeffRef(k));
-    // apply householder transform to remaining part of m_householder on the left
-    m_householder.bottomRightCorner(remainingRows-1, remainingCols)
-                 .applyHouseholderOnTheRight(m_householder.row(k).tail(remainingCols-1).transpose(),
-                                             m_householder.coeff(k,k+1),
-                                             temp.data());
-  }
   m_isInitialized = true;
   return *this;
 }
@@ -141,8 +372,8 @@ MatrixBase<Derived>::bidiagonalization() const
 }
 #endif
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_BIDIAGONALIZATION_H
+#endif  // EIGEN_BIDIAGONALIZATION_H
diff --git a/inst/include/Eigen/src/SparseCholesky/InternalHeaderCheck.h b/inst/include/Eigen/src/SparseCholesky/InternalHeaderCheck.h
new file mode 100644
index 00000000..f8d87628
--- /dev/null
+++ b/inst/include/Eigen/src/SparseCholesky/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_SPARSECHOLESKY_MODULE_H
+#error "Please include Eigen/SparseCholesky instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/SparseCholesky/SimplicialCholesky.h b/inst/include/Eigen/src/SparseCholesky/SimplicialCholesky.h
index e1f96ba5..14147945 100644
--- a/inst/include/Eigen/src/SparseCholesky/SimplicialCholesky.h
+++ b/inst/include/Eigen/src/SparseCholesky/SimplicialCholesky.h
@@ -10,497 +10,711 @@
 #ifndef EIGEN_SIMPLICIAL_CHOLESKY_H
 #define EIGEN_SIMPLICIAL_CHOLESKY_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-enum SimplicialCholeskyMode {
-  SimplicialCholeskyLLT,
-  SimplicialCholeskyLDLT
+namespace Eigen {
+
+enum SimplicialCholeskyMode { SimplicialCholeskyLLT, SimplicialCholeskyLDLT };
+
+namespace internal {
+template <typename CholMatrixType, typename InputMatrixType>
+struct simplicial_cholesky_grab_input {
+  typedef CholMatrixType const* ConstCholMatrixPtr;
+  static void run(const InputMatrixType& input, ConstCholMatrixPtr& pmat, CholMatrixType& tmp) {
+    tmp = input;
+    pmat = &tmp;
+  }
+};
+
+template <typename MatrixType>
+struct simplicial_cholesky_grab_input<MatrixType, MatrixType> {
+  typedef MatrixType const* ConstMatrixPtr;
+  static void run(const MatrixType& input, ConstMatrixPtr& pmat, MatrixType& /*tmp*/) { pmat = &input; }
 };
+}  // end namespace internal
 
 /** \ingroup SparseCholesky_Module
-  * \brief A direct sparse Cholesky factorizations
-  *
-  * These classes provide LL^T and LDL^T Cholesky factorizations of sparse matrices that are
-  * selfadjoint and positive definite. The factorization allows for solving A.X = B where
-  * X and B can be either dense or sparse.
-  * 
-  * In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization
-  * such that the factorized matrix is P A P^-1.
-  *
-  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
-  *               or Upper. Default is Lower.
-  *
-  */
-template<typename Derived>
-class SimplicialCholeskyBase : internal::noncopyable
-{
-  public:
-    typedef typename internal::traits<Derived>::MatrixType MatrixType;
-    typedef typename internal::traits<Derived>::OrderingType OrderingType;
-    enum { UpLo = internal::traits<Derived>::UpLo };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef SparseMatrix<Scalar,ColMajor,Index> CholMatrixType;
-    typedef Matrix<Scalar,Dynamic,1> VectorType;
-
-  public:
-
-    /** Default constructor */
-    SimplicialCholeskyBase()
-      : m_info(Success), m_isInitialized(false), m_shiftOffset(0), m_shiftScale(1)
-    {}
-
-    SimplicialCholeskyBase(const MatrixType& matrix)
-      : m_info(Success), m_isInitialized(false), m_shiftOffset(0), m_shiftScale(1)
-    {
-      derived().compute(matrix);
-    }
+ * \brief A base class for direct sparse Cholesky factorizations
+ *
+ * This is a base class for LL^T and LDL^T Cholesky factorizations of sparse matrices that are
+ * selfadjoint and positive definite. These factorizations allow for solving A.X = B where
+ * X and B can be either dense or sparse.
+ *
+ * In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization
+ * such that the factorized matrix is P A P^-1.
+ *
+ * \tparam Derived the type of the derived class, that is the actual factorization type.
+ *
+ */
+template <typename Derived>
+class SimplicialCholeskyBase : public SparseSolverBase<Derived> {
+  typedef SparseSolverBase<Derived> Base;
+  using Base::m_isInitialized;
+
+ public:
+  typedef typename internal::traits<Derived>::MatrixType MatrixType;
+  typedef typename internal::traits<Derived>::OrderingType OrderingType;
+  enum { UpLo = internal::traits<Derived>::UpLo };
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename internal::traits<Derived>::DiagonalScalar DiagonalScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex> CholMatrixType;
+  typedef CholMatrixType const* ConstCholMatrixPtr;
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+  typedef Matrix<StorageIndex, Dynamic, 1> VectorI;
+
+  enum { ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime };
+
+ public:
+  using Base::derived;
+
+  /** Default constructor */
+  SimplicialCholeskyBase()
+      : m_info(Success), m_factorizationIsOk(false), m_analysisIsOk(false), m_shiftOffset(0), m_shiftScale(1) {}
+
+  explicit SimplicialCholeskyBase(const MatrixType& matrix)
+      : m_info(Success), m_factorizationIsOk(false), m_analysisIsOk(false), m_shiftOffset(0), m_shiftScale(1) {
+    derived().compute(matrix);
+  }
 
-    ~SimplicialCholeskyBase()
-    {
-    }
+  ~SimplicialCholeskyBase() {}
 
-    Derived& derived() { return *static_cast<Derived*>(this); }
-    const Derived& derived() const { return *static_cast<const Derived*>(this); }
-    
-    inline Index cols() const { return m_matrix.cols(); }
-    inline Index rows() const { return m_matrix.rows(); }
-    
-    /** \brief Reports whether previous computation was successful.
-      *
-      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the matrix.appears to be negative.
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
-      return m_info;
-    }
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<SimplicialCholeskyBase, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "Simplicial LLT or LDLT is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "SimplicialCholeskyBase::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<SimplicialCholeskyBase, Rhs>(*this, b.derived());
-    }
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<SimplicialCholeskyBase, Rhs>
-    solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "Simplicial LLT or LDLT is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "SimplicialCholesky::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<SimplicialCholeskyBase, Rhs>(*this, b.derived());
-    }
-    
-    /** \returns the permutation P
-      * \sa permutationPinv() */
-    const PermutationMatrix<Dynamic,Dynamic,Index>& permutationP() const
-    { return m_P; }
-    
-    /** \returns the inverse P^-1 of the permutation P
-      * \sa permutationP() */
-    const PermutationMatrix<Dynamic,Dynamic,Index>& permutationPinv() const
-    { return m_Pinv; }
-
-    /** Sets the shift parameters that will be used to adjust the diagonal coefficients during the numerical factorization.
-      *
-      * During the numerical factorization, the diagonal coefficients are transformed by the following linear model:\n
-      * \c d_ii = \a offset + \a scale * \c d_ii
-      *
-      * The default is the identity transformation with \a offset=0, and \a scale=1.
-      *
-      * \returns a reference to \c *this.
-      */
-    Derived& setShift(const RealScalar& offset, const RealScalar& scale = 1)
-    {
-      m_shiftOffset = offset;
-      m_shiftScale = scale;
-      return derived();
-    }
+  Derived& derived() { return *static_cast<Derived*>(this); }
+  const Derived& derived() const { return *static_cast<const Derived*>(this); }
+
+  inline Index cols() const { return m_matrix.cols(); }
+  inline Index rows() const { return m_matrix.rows(); }
+
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful,
+   *          \c NumericalIssue if the matrix.appears to be negative.
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+    return m_info;
+  }
+
+  /** \returns the permutation P
+   * \sa permutationPinv() */
+  const PermutationMatrix<Dynamic, Dynamic, StorageIndex>& permutationP() const { return m_P; }
+
+  /** \returns the inverse P^-1 of the permutation P
+   * \sa permutationP() */
+  const PermutationMatrix<Dynamic, Dynamic, StorageIndex>& permutationPinv() const { return m_Pinv; }
+
+  /** Sets the shift parameters that will be used to adjust the diagonal coefficients during the numerical
+   * factorization.
+   *
+   * During the numerical factorization, the diagonal coefficients are transformed by the following linear model:\n
+   * \c d_ii = \a offset + \a scale * \c d_ii
+   *
+   * The default is the identity transformation with \a offset=0, and \a scale=1.
+   *
+   * \returns a reference to \c *this.
+   */
+  Derived& setShift(const DiagonalScalar& offset, const DiagonalScalar& scale = 1) {
+    m_shiftOffset = offset;
+    m_shiftScale = scale;
+    return derived();
+  }
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** \internal */
-    template<typename Stream>
-    void dumpMemory(Stream& s)
-    {
-      int total = 0;
-      s << "  L:        " << ((total+=(m_matrix.cols()+1) * sizeof(int) + m_matrix.nonZeros()*(sizeof(int)+sizeof(Scalar))) >> 20) << "Mb" << "\n";
-      s << "  diag:     " << ((total+=m_diag.size() * sizeof(Scalar)) >> 20) << "Mb" << "\n";
-      s << "  tree:     " << ((total+=m_parent.size() * sizeof(int)) >> 20) << "Mb" << "\n";
-      s << "  nonzeros: " << ((total+=m_nonZerosPerCol.size() * sizeof(int)) >> 20) << "Mb" << "\n";
-      s << "  perm:     " << ((total+=m_P.size() * sizeof(int)) >> 20) << "Mb" << "\n";
-      s << "  perm^-1:  " << ((total+=m_Pinv.size() * sizeof(int)) >> 20) << "Mb" << "\n";
-      s << "  TOTAL:    " << (total>> 20) << "Mb" << "\n";
-    }
+  /** \internal */
+  template <typename Stream>
+  void dumpMemory(Stream& s) {
+    int total = 0;
+    s << "  L:        "
+      << ((total += (m_matrix.cols() + 1) * sizeof(int) + m_matrix.nonZeros() * (sizeof(int) + sizeof(Scalar))) >> 20)
+      << "Mb"
+      << "\n";
+    s << "  diag:     " << ((total += m_diag.size() * sizeof(Scalar)) >> 20) << "Mb"
+      << "\n";
+    s << "  tree:     " << ((total += m_parent.size() * sizeof(int)) >> 20) << "Mb"
+      << "\n";
+    s << "  nonzeros: " << ((total += m_workSpace.size() * sizeof(int)) >> 20) << "Mb"
+      << "\n";
+    s << "  perm:     " << ((total += m_P.size() * sizeof(int)) >> 20) << "Mb"
+      << "\n";
+    s << "  perm^-1:  " << ((total += m_Pinv.size() * sizeof(int)) >> 20) << "Mb"
+      << "\n";
+    s << "  TOTAL:    " << (total >> 20) << "Mb"
+      << "\n";
+  }
 
-    /** \internal */
-    template<typename Rhs,typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
-    {
-      eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
-      eigen_assert(m_matrix.rows()==b.rows());
+  /** \internal */
+  template <typename Rhs, typename Dest>
+  void _solve_impl(const MatrixBase<Rhs>& b, MatrixBase<Dest>& dest) const {
+    eigen_assert(m_factorizationIsOk &&
+                 "The decomposition is not in a valid state for solving, you must first call either compute() or "
+                 "symbolic()/numeric()");
+    eigen_assert(m_matrix.rows() == b.rows());
 
-      if(m_info!=Success)
-        return;
+    if (m_info != Success) return;
 
-      if(m_P.size()>0)
-        dest = m_P * b;
-      else
-        dest = b;
+    if (m_P.size() > 0)
+      dest = m_P * b;
+    else
+      dest = b;
 
-      if(m_matrix.nonZeros()>0) // otherwise L==I
-        derived().matrixL().solveInPlace(dest);
+    if (m_matrix.nonZeros() > 0)  // otherwise L==I
+      derived().matrixL().solveInPlace(dest);
 
-      if(m_diag.size()>0)
-        dest = m_diag.asDiagonal().inverse() * dest;
+    if (m_diag.size() > 0) dest = m_diag.asDiagonal().inverse() * dest;
 
-      if (m_matrix.nonZeros()>0) // otherwise U==I
-        derived().matrixU().solveInPlace(dest);
+    if (m_matrix.nonZeros() > 0)  // otherwise U==I
+      derived().matrixU().solveInPlace(dest);
 
-      if(m_P.size()>0)
-        dest = m_Pinv * dest;
-    }
+    if (m_P.size() > 0) dest = m_Pinv * dest;
+  }
 
-#endif // EIGEN_PARSED_BY_DOXYGEN
+  template <typename Rhs, typename Dest>
+  void _solve_impl(const SparseMatrixBase<Rhs>& b, SparseMatrixBase<Dest>& dest) const {
+    internal::solve_sparse_through_dense_panels(derived(), b, dest);
+  }
 
-  protected:
-    
-    /** Computes the sparse Cholesky decomposition of \a matrix */
-    template<bool DoLDLT>
-    void compute(const MatrixType& matrix)
-    {
-      eigen_assert(matrix.rows()==matrix.cols());
-      Index size = matrix.cols();
-      CholMatrixType ap(size,size);
-      ordering(matrix, ap);
-      analyzePattern_preordered(ap, DoLDLT);
-      factorize_preordered<DoLDLT>(ap);
-    }
-    
-    template<bool DoLDLT>
-    void factorize(const MatrixType& a)
-    {
-      eigen_assert(a.rows()==a.cols());
-      int size = a.cols();
-      CholMatrixType ap(size,size);
-      ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
-      factorize_preordered<DoLDLT>(ap);
-    }
+#endif  // EIGEN_PARSED_BY_DOXYGEN
+
+ protected:
+  /** Computes the sparse Cholesky decomposition of \a matrix */
+  template <bool DoLDLT, bool NonHermitian>
+  void compute(const MatrixType& matrix) {
+    eigen_assert(matrix.rows() == matrix.cols());
+    Index size = matrix.cols();
+    CholMatrixType tmp(size, size);
+    ConstCholMatrixPtr pmat;
+    ordering<NonHermitian>(matrix, pmat, tmp);
+    analyzePattern_preordered(*pmat, DoLDLT);
+    factorize_preordered<DoLDLT, NonHermitian>(*pmat);
+  }
 
-    template<bool DoLDLT>
-    void factorize_preordered(const CholMatrixType& a);
+  template <bool DoLDLT, bool NonHermitian>
+  void factorize(const MatrixType& a) {
+    eigen_assert(a.rows() == a.cols());
+    Index size = a.cols();
+    CholMatrixType tmp(size, size);
+    ConstCholMatrixPtr pmat;
 
-    void analyzePattern(const MatrixType& a, bool doLDLT)
-    {
-      eigen_assert(a.rows()==a.cols());
-      int size = a.cols();
-      CholMatrixType ap(size,size);
-      ordering(a, ap);
-      analyzePattern_preordered(ap,doLDLT);
+    if (m_P.size() == 0 && (int(UpLo) & int(Upper)) == Upper) {
+      // If there is no ordering, try to directly use the input matrix without any copy
+      internal::simplicial_cholesky_grab_input<CholMatrixType, MatrixType>::run(a, pmat, tmp);
+    } else {
+      internal::permute_symm_to_symm<UpLo, Upper, NonHermitian>(a, tmp, m_P.indices().data());
+      pmat = &tmp;
     }
-    void analyzePattern_preordered(const CholMatrixType& a, bool doLDLT);
-    
-    void ordering(const MatrixType& a, CholMatrixType& ap);
-
-    /** keeps off-diagonal entries; drops diagonal entries */
-    struct keep_diag {
-      inline bool operator() (const Index& row, const Index& col, const Scalar&) const
-      {
-        return row!=col;
-      }
-    };
-
-    mutable ComputationInfo m_info;
-    bool m_isInitialized;
-    bool m_factorizationIsOk;
-    bool m_analysisIsOk;
-    
-    CholMatrixType m_matrix;
-    VectorType m_diag;                                // the diagonal coefficients (LDLT mode)
-    VectorXi m_parent;                                // elimination tree
-    VectorXi m_nonZerosPerCol;
-    PermutationMatrix<Dynamic,Dynamic,Index> m_P;     // the permutation
-    PermutationMatrix<Dynamic,Dynamic,Index> m_Pinv;  // the inverse permutation
-
-    RealScalar m_shiftOffset;
-    RealScalar m_shiftScale;
+
+    factorize_preordered<DoLDLT, NonHermitian>(*pmat);
+  }
+
+  template <bool DoLDLT, bool NonHermitian>
+  void factorize_preordered(const CholMatrixType& a);
+
+  template <bool DoLDLT, bool NonHermitian>
+  void analyzePattern(const MatrixType& a) {
+    eigen_assert(a.rows() == a.cols());
+    Index size = a.cols();
+    CholMatrixType tmp(size, size);
+    ConstCholMatrixPtr pmat;
+    ordering<NonHermitian>(a, pmat, tmp);
+    analyzePattern_preordered(*pmat, DoLDLT);
+  }
+  void analyzePattern_preordered(const CholMatrixType& a, bool doLDLT);
+
+  template <bool NonHermitian>
+  void ordering(const MatrixType& a, ConstCholMatrixPtr& pmat, CholMatrixType& ap);
+
+  inline DiagonalScalar getDiag(Scalar x) { return internal::traits<Derived>::getDiag(x); }
+  inline Scalar getSymm(Scalar x) { return internal::traits<Derived>::getSymm(x); }
+
+  /** keeps off-diagonal entries; drops diagonal entries */
+  struct keep_diag {
+    inline bool operator()(const Index& row, const Index& col, const Scalar&) const { return row != col; }
+  };
+
+  mutable ComputationInfo m_info;
+  bool m_factorizationIsOk;
+  bool m_analysisIsOk;
+
+  CholMatrixType m_matrix;
+  VectorType m_diag;  // the diagonal coefficients (LDLT mode)
+  VectorI m_parent;   // elimination tree
+  VectorI m_workSpace;
+  PermutationMatrix<Dynamic, Dynamic, StorageIndex> m_P;     // the permutation
+  PermutationMatrix<Dynamic, Dynamic, StorageIndex> m_Pinv;  // the inverse permutation
+
+  DiagonalScalar m_shiftOffset;
+  DiagonalScalar m_shiftScale;
 };
 
-template<typename _MatrixType, int _UpLo = Lower, typename _Ordering = AMDOrdering<typename _MatrixType::Index> > class SimplicialLLT;
-template<typename _MatrixType, int _UpLo = Lower, typename _Ordering = AMDOrdering<typename _MatrixType::Index> > class SimplicialLDLT;
-template<typename _MatrixType, int _UpLo = Lower, typename _Ordering = AMDOrdering<typename _MatrixType::Index> > class SimplicialCholesky;
+template <typename MatrixType_, int UpLo_ = Lower,
+          typename Ordering_ = AMDOrdering<typename MatrixType_::StorageIndex> >
+class SimplicialLLT;
+template <typename MatrixType_, int UpLo_ = Lower,
+          typename Ordering_ = AMDOrdering<typename MatrixType_::StorageIndex> >
+class SimplicialLDLT;
+template <typename MatrixType_, int UpLo_ = Lower,
+          typename Ordering_ = AMDOrdering<typename MatrixType_::StorageIndex> >
+class SimplicialNonHermitianLLT;
+template <typename MatrixType_, int UpLo_ = Lower,
+          typename Ordering_ = AMDOrdering<typename MatrixType_::StorageIndex> >
+class SimplicialNonHermitianLDLT;
+template <typename MatrixType_, int UpLo_ = Lower,
+          typename Ordering_ = AMDOrdering<typename MatrixType_::StorageIndex> >
+class SimplicialCholesky;
 
 namespace internal {
 
-template<typename _MatrixType, int _UpLo, typename _Ordering> struct traits<SimplicialLLT<_MatrixType,_UpLo,_Ordering> >
-{
-  typedef _MatrixType MatrixType;
-  typedef _Ordering OrderingType;
-  enum { UpLo = _UpLo };
-  typedef typename MatrixType::Scalar                         Scalar;
-  typedef typename MatrixType::Index                          Index;
-  typedef SparseMatrix<Scalar, ColMajor, Index>               CholMatrixType;
-  typedef SparseTriangularView<CholMatrixType, Eigen::Lower>  MatrixL;
-  typedef SparseTriangularView<typename CholMatrixType::AdjointReturnType, Eigen::Upper>   MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m; }
-  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
+template <typename MatrixType_, int UpLo_, typename Ordering_>
+struct traits<SimplicialLLT<MatrixType_, UpLo_, Ordering_> > {
+  typedef MatrixType_ MatrixType;
+  typedef Ordering_ OrderingType;
+  enum { UpLo = UpLo_ };
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar DiagonalScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex> CholMatrixType;
+  typedef TriangularView<const CholMatrixType, Eigen::Lower> MatrixL;
+  typedef TriangularView<const typename CholMatrixType::AdjointReturnType, Eigen::Upper> MatrixU;
+  static inline MatrixL getL(const CholMatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const CholMatrixType& m) { return MatrixU(m.adjoint()); }
+  static inline DiagonalScalar getDiag(Scalar x) { return numext::real(x); }
+  static inline Scalar getSymm(Scalar x) { return numext::conj(x); }
 };
 
-template<typename _MatrixType,int _UpLo, typename _Ordering> struct traits<SimplicialLDLT<_MatrixType,_UpLo,_Ordering> >
-{
-  typedef _MatrixType MatrixType;
-  typedef _Ordering OrderingType;
-  enum { UpLo = _UpLo };
-  typedef typename MatrixType::Scalar                             Scalar;
-  typedef typename MatrixType::Index                              Index;
-  typedef SparseMatrix<Scalar, ColMajor, Index>                   CholMatrixType;
-  typedef SparseTriangularView<CholMatrixType, Eigen::UnitLower>  MatrixL;
-  typedef SparseTriangularView<typename CholMatrixType::AdjointReturnType, Eigen::UnitUpper> MatrixU;
-  static inline MatrixL getL(const MatrixType& m) { return m; }
-  static inline MatrixU getU(const MatrixType& m) { return m.adjoint(); }
+template <typename MatrixType_, int UpLo_, typename Ordering_>
+struct traits<SimplicialLDLT<MatrixType_, UpLo_, Ordering_> > {
+  typedef MatrixType_ MatrixType;
+  typedef Ordering_ OrderingType;
+  enum { UpLo = UpLo_ };
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar DiagonalScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex> CholMatrixType;
+  typedef TriangularView<const CholMatrixType, Eigen::UnitLower> MatrixL;
+  typedef TriangularView<const typename CholMatrixType::AdjointReturnType, Eigen::UnitUpper> MatrixU;
+  static inline MatrixL getL(const CholMatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const CholMatrixType& m) { return MatrixU(m.adjoint()); }
+  static inline DiagonalScalar getDiag(Scalar x) { return numext::real(x); }
+  static inline Scalar getSymm(Scalar x) { return numext::conj(x); }
 };
 
-template<typename _MatrixType, int _UpLo, typename _Ordering> struct traits<SimplicialCholesky<_MatrixType,_UpLo,_Ordering> >
-{
-  typedef _MatrixType MatrixType;
-  typedef _Ordering OrderingType;
-  enum { UpLo = _UpLo };
+template <typename MatrixType_, int UpLo_, typename Ordering_>
+struct traits<SimplicialNonHermitianLLT<MatrixType_, UpLo_, Ordering_> > {
+  typedef MatrixType_ MatrixType;
+  typedef Ordering_ OrderingType;
+  enum { UpLo = UpLo_ };
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::Scalar DiagonalScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex> CholMatrixType;
+  typedef TriangularView<const CholMatrixType, Eigen::Lower> MatrixL;
+  typedef TriangularView<const typename CholMatrixType::ConstTransposeReturnType, Eigen::Upper> MatrixU;
+  static inline MatrixL getL(const CholMatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const CholMatrixType& m) { return MatrixU(m.transpose()); }
+  static inline DiagonalScalar getDiag(Scalar x) { return x; }
+  static inline Scalar getSymm(Scalar x) { return x; }
 };
 
-}
+template <typename MatrixType_, int UpLo_, typename Ordering_>
+struct traits<SimplicialNonHermitianLDLT<MatrixType_, UpLo_, Ordering_> > {
+  typedef MatrixType_ MatrixType;
+  typedef Ordering_ OrderingType;
+  enum { UpLo = UpLo_ };
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::Scalar DiagonalScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex> CholMatrixType;
+  typedef TriangularView<const CholMatrixType, Eigen::UnitLower> MatrixL;
+  typedef TriangularView<const typename CholMatrixType::ConstTransposeReturnType, Eigen::UnitUpper> MatrixU;
+  static inline MatrixL getL(const CholMatrixType& m) { return MatrixL(m); }
+  static inline MatrixU getU(const CholMatrixType& m) { return MatrixU(m.transpose()); }
+  static inline DiagonalScalar getDiag(Scalar x) { return x; }
+  static inline Scalar getSymm(Scalar x) { return x; }
+};
+
+template <typename MatrixType_, int UpLo_, typename Ordering_>
+struct traits<SimplicialCholesky<MatrixType_, UpLo_, Ordering_> > {
+  typedef MatrixType_ MatrixType;
+  typedef Ordering_ OrderingType;
+  enum { UpLo = UpLo_ };
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar DiagonalScalar;
+  static inline DiagonalScalar getDiag(Scalar x) { return numext::real(x); }
+  static inline Scalar getSymm(Scalar x) { return numext::conj(x); }
+};
+
+}  // namespace internal
 
 /** \ingroup SparseCholesky_Module
-  * \class SimplicialLLT
-  * \brief A direct sparse LLT Cholesky factorizations
-  *
-  * This class provides a LL^T Cholesky factorizations of sparse matrices that are
-  * selfadjoint and positive definite. The factorization allows for solving A.X = B where
-  * X and B can be either dense or sparse.
-  * 
-  * In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization
-  * such that the factorized matrix is P A P^-1.
-  *
-  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
-  *               or Upper. Default is Lower.
-  * \tparam _Ordering The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<>
-  *
-  * \sa class SimplicialLDLT, class AMDOrdering, class NaturalOrdering
-  */
-template<typename _MatrixType, int _UpLo, typename _Ordering>
-    class SimplicialLLT : public SimplicialCholeskyBase<SimplicialLLT<_MatrixType,_UpLo,_Ordering> >
-{
-public:
-    typedef _MatrixType MatrixType;
-    enum { UpLo = _UpLo };
-    typedef SimplicialCholeskyBase<SimplicialLLT> Base;
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef SparseMatrix<Scalar,ColMajor,Index> CholMatrixType;
-    typedef Matrix<Scalar,Dynamic,1> VectorType;
-    typedef internal::traits<SimplicialLLT> Traits;
-    typedef typename Traits::MatrixL  MatrixL;
-    typedef typename Traits::MatrixU  MatrixU;
-public:
-    /** Default constructor */
-    SimplicialLLT() : Base() {}
-    /** Constructs and performs the LLT factorization of \a matrix */
-    SimplicialLLT(const MatrixType& matrix)
-        : Base(matrix) {}
-
-    /** \returns an expression of the factor L */
-    inline const MatrixL matrixL() const {
-        eigen_assert(Base::m_factorizationIsOk && "Simplicial LLT not factorized");
-        return Traits::getL(Base::m_matrix);
-    }
+ * \class SimplicialLLT
+ * \brief A direct sparse LLT Cholesky factorizations
+ *
+ * This class provides a LL^T Cholesky factorizations of sparse matrices that are
+ * selfadjoint and positive definite. The factorization allows for solving A.X = B where
+ * X and B can be either dense or sparse.
+ *
+ * In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization
+ * such that the factorized matrix is P A P^-1.
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo_ the triangular part that will be used for the computations. It can be Lower
+ *               or Upper. Default is Lower.
+ * \tparam Ordering_ The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<>
+ *
+ * \implsparsesolverconcept
+ *
+ * \sa class SimplicialLDLT, class AMDOrdering, class NaturalOrdering
+ */
+template <typename MatrixType_, int UpLo_, typename Ordering_>
+class SimplicialLLT : public SimplicialCholeskyBase<SimplicialLLT<MatrixType_, UpLo_, Ordering_> > {
+ public:
+  typedef MatrixType_ MatrixType;
+  enum { UpLo = UpLo_ };
+  typedef SimplicialCholeskyBase<SimplicialLLT> Base;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, Index> CholMatrixType;
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+  typedef internal::traits<SimplicialLLT> Traits;
+  typedef typename Traits::MatrixL MatrixL;
+  typedef typename Traits::MatrixU MatrixU;
+
+ public:
+  /** Default constructor */
+  SimplicialLLT() : Base() {}
+  /** Constructs and performs the LLT factorization of \a matrix */
+  explicit SimplicialLLT(const MatrixType& matrix) : Base(matrix) {}
+
+  /** \returns an expression of the factor L */
+  inline const MatrixL matrixL() const {
+    eigen_assert(Base::m_factorizationIsOk && "Simplicial LLT not factorized");
+    return Traits::getL(Base::m_matrix);
+  }
 
-    /** \returns an expression of the factor U (= L^*) */
-    inline const MatrixU matrixU() const {
-        eigen_assert(Base::m_factorizationIsOk && "Simplicial LLT not factorized");
-        return Traits::getU(Base::m_matrix);
-    }
-    
-    /** Computes the sparse Cholesky decomposition of \a matrix */
-    SimplicialLLT& compute(const MatrixType& matrix)
-    {
-      Base::template compute<false>(matrix);
-      return *this;
-    }
+  /** \returns an expression of the factor U (= L^*) */
+  inline const MatrixU matrixU() const {
+    eigen_assert(Base::m_factorizationIsOk && "Simplicial LLT not factorized");
+    return Traits::getU(Base::m_matrix);
+  }
 
-    /** Performs a symbolic decomposition on the sparcity of \a matrix.
-      *
-      * This function is particularly useful when solving for several problems having the same structure.
-      *
-      * \sa factorize()
-      */
-    void analyzePattern(const MatrixType& a)
-    {
-      Base::analyzePattern(a, false);
-    }
+  /** Computes the sparse Cholesky decomposition of \a matrix */
+  SimplicialLLT& compute(const MatrixType& matrix) {
+    Base::template compute<false, false>(matrix);
+    return *this;
+  }
 
-    /** Performs a numeric decomposition of \a matrix
-      *
-      * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
-      *
-      * \sa analyzePattern()
-      */
-    void factorize(const MatrixType& a)
-    {
-      Base::template factorize<false>(a);
-    }
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
+   *
+   * This function is particularly useful when solving for several problems having the same structure.
+   *
+   * \sa factorize()
+   */
+  void analyzePattern(const MatrixType& a) { Base::template analyzePattern<false, false>(a); }
+
+  /** Performs a numeric decomposition of \a matrix
+   *
+   * The given matrix must have the same sparsity than the matrix on which the symbolic decomposition has been
+   * performed.
+   *
+   * \sa analyzePattern()
+   */
+  void factorize(const MatrixType& a) { Base::template factorize<false, false>(a); }
+
+  /** \returns the determinant of the underlying matrix from the current factorization */
+  Scalar determinant() const {
+    Scalar detL = Base::m_matrix.diagonal().prod();
+    return numext::abs2(detL);
+  }
+};
 
-    /** \returns the determinant of the underlying matrix from the current factorization */
-    Scalar determinant() const
-    {
-      Scalar detL = Base::m_matrix.diagonal().prod();
-      return numext::abs2(detL);
-    }
+/** \ingroup SparseCholesky_Module
+ * \class SimplicialLDLT
+ * \brief A direct sparse LDLT Cholesky factorizations without square root.
+ *
+ * This class provides a LDL^T Cholesky factorizations without square root of sparse matrices that are
+ * selfadjoint and positive definite. The factorization allows for solving A.X = B where
+ * X and B can be either dense or sparse.
+ *
+ * In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization
+ * such that the factorized matrix is P A P^-1.
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo_ the triangular part that will be used for the computations. It can be Lower
+ *               or Upper. Default is Lower.
+ * \tparam Ordering_ The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<>
+ *
+ * \implsparsesolverconcept
+ *
+ * \sa class SimplicialLLT, class AMDOrdering, class NaturalOrdering
+ */
+template <typename MatrixType_, int UpLo_, typename Ordering_>
+class SimplicialLDLT : public SimplicialCholeskyBase<SimplicialLDLT<MatrixType_, UpLo_, Ordering_> > {
+ public:
+  typedef MatrixType_ MatrixType;
+  enum { UpLo = UpLo_ };
+  typedef SimplicialCholeskyBase<SimplicialLDLT> Base;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex> CholMatrixType;
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+  typedef internal::traits<SimplicialLDLT> Traits;
+  typedef typename Traits::MatrixL MatrixL;
+  typedef typename Traits::MatrixU MatrixU;
+
+ public:
+  /** Default constructor */
+  SimplicialLDLT() : Base() {}
+
+  /** Constructs and performs the LLT factorization of \a matrix */
+  explicit SimplicialLDLT(const MatrixType& matrix) : Base(matrix) {}
+
+  /** \returns a vector expression of the diagonal D */
+  inline const VectorType vectorD() const {
+    eigen_assert(Base::m_factorizationIsOk && "Simplicial LDLT not factorized");
+    return Base::m_diag;
+  }
+  /** \returns an expression of the factor L */
+  inline const MatrixL matrixL() const {
+    eigen_assert(Base::m_factorizationIsOk && "Simplicial LDLT not factorized");
+    return Traits::getL(Base::m_matrix);
+  }
+
+  /** \returns an expression of the factor U (= L^*) */
+  inline const MatrixU matrixU() const {
+    eigen_assert(Base::m_factorizationIsOk && "Simplicial LDLT not factorized");
+    return Traits::getU(Base::m_matrix);
+  }
+
+  /** Computes the sparse Cholesky decomposition of \a matrix */
+  SimplicialLDLT& compute(const MatrixType& matrix) {
+    Base::template compute<true, false>(matrix);
+    return *this;
+  }
+
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
+   *
+   * This function is particularly useful when solving for several problems having the same structure.
+   *
+   * \sa factorize()
+   */
+  void analyzePattern(const MatrixType& a) { Base::template analyzePattern<true, false>(a); }
+
+  /** Performs a numeric decomposition of \a matrix
+   *
+   * The given matrix must have the same sparsity than the matrix on which the symbolic decomposition has been
+   * performed.
+   *
+   * \sa analyzePattern()
+   */
+  void factorize(const MatrixType& a) { Base::template factorize<true, false>(a); }
+
+  /** \returns the determinant of the underlying matrix from the current factorization */
+  Scalar determinant() const { return Base::m_diag.prod(); }
 };
 
 /** \ingroup SparseCholesky_Module
-  * \class SimplicialLDLT
-  * \brief A direct sparse LDLT Cholesky factorizations without square root.
-  *
-  * This class provides a LDL^T Cholesky factorizations without square root of sparse matrices that are
-  * selfadjoint and positive definite. The factorization allows for solving A.X = B where
-  * X and B can be either dense or sparse.
-  * 
-  * In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization
-  * such that the factorized matrix is P A P^-1.
-  *
-  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
-  * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
-  *               or Upper. Default is Lower.
-  * \tparam _Ordering The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<>
-  *
-  * \sa class SimplicialLLT, class AMDOrdering, class NaturalOrdering
-  */
-template<typename _MatrixType, int _UpLo, typename _Ordering>
-    class SimplicialLDLT : public SimplicialCholeskyBase<SimplicialLDLT<_MatrixType,_UpLo,_Ordering> >
-{
-public:
-    typedef _MatrixType MatrixType;
-    enum { UpLo = _UpLo };
-    typedef SimplicialCholeskyBase<SimplicialLDLT> Base;
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef SparseMatrix<Scalar,ColMajor,Index> CholMatrixType;
-    typedef Matrix<Scalar,Dynamic,1> VectorType;
-    typedef internal::traits<SimplicialLDLT> Traits;
-    typedef typename Traits::MatrixL  MatrixL;
-    typedef typename Traits::MatrixU  MatrixU;
-public:
-    /** Default constructor */
-    SimplicialLDLT() : Base() {}
-
-    /** Constructs and performs the LLT factorization of \a matrix */
-    SimplicialLDLT(const MatrixType& matrix)
-        : Base(matrix) {}
-
-    /** \returns a vector expression of the diagonal D */
-    inline const VectorType vectorD() const {
-        eigen_assert(Base::m_factorizationIsOk && "Simplicial LDLT not factorized");
-        return Base::m_diag;
-    }
-    /** \returns an expression of the factor L */
-    inline const MatrixL matrixL() const {
-        eigen_assert(Base::m_factorizationIsOk && "Simplicial LDLT not factorized");
-        return Traits::getL(Base::m_matrix);
-    }
+ * \class SimplicialNonHermitianLLT
+ * \brief A direct sparse LLT Cholesky factorizations, for symmetric non-hermitian matrices.
+ *
+ * This class provides a LL^T Cholesky factorizations of sparse matrices that are
+ * symmetric but not hermitian. For real matrices, this is equivalent to the regular LLT factorization.
+ * The factorization allows for solving A.X = B where X and B can be either dense or sparse.
+ *
+ * In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization
+ * such that the factorized matrix is P A P^-1.
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo_ the triangular part that will be used for the computations. It can be Lower
+ *               or Upper. Default is Lower.
+ * \tparam Ordering_ The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<>
+ *
+ * \implsparsesolverconcept
+ *
+ * \sa class SimplicialNonHermitianLDLT, SimplicialLLT, class AMDOrdering, class NaturalOrdering
+ */
+template <typename MatrixType_, int UpLo_, typename Ordering_>
+class SimplicialNonHermitianLLT
+    : public SimplicialCholeskyBase<SimplicialNonHermitianLLT<MatrixType_, UpLo_, Ordering_> > {
+ public:
+  typedef MatrixType_ MatrixType;
+  enum { UpLo = UpLo_ };
+  typedef SimplicialCholeskyBase<SimplicialNonHermitianLLT> Base;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex> CholMatrixType;
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+  typedef internal::traits<SimplicialNonHermitianLLT> Traits;
+  typedef typename Traits::MatrixL MatrixL;
+  typedef typename Traits::MatrixU MatrixU;
+
+ public:
+  /** Default constructor */
+  SimplicialNonHermitianLLT() : Base() {}
+
+  /** Constructs and performs the LLT factorization of \a matrix */
+  explicit SimplicialNonHermitianLLT(const MatrixType& matrix) : Base(matrix) {}
+
+  /** \returns an expression of the factor L */
+  inline const MatrixL matrixL() const {
+    eigen_assert(Base::m_factorizationIsOk && "Simplicial LLT not factorized");
+    return Traits::getL(Base::m_matrix);
+  }
 
-    /** \returns an expression of the factor U (= L^*) */
-    inline const MatrixU matrixU() const {
-        eigen_assert(Base::m_factorizationIsOk && "Simplicial LDLT not factorized");
-        return Traits::getU(Base::m_matrix);
-    }
+  /** \returns an expression of the factor U (= L^*) */
+  inline const MatrixU matrixU() const {
+    eigen_assert(Base::m_factorizationIsOk && "Simplicial LLT not factorized");
+    return Traits::getU(Base::m_matrix);
+  }
 
-    /** Computes the sparse Cholesky decomposition of \a matrix */
-    SimplicialLDLT& compute(const MatrixType& matrix)
-    {
-      Base::template compute<true>(matrix);
-      return *this;
-    }
-    
-    /** Performs a symbolic decomposition on the sparcity of \a matrix.
-      *
-      * This function is particularly useful when solving for several problems having the same structure.
-      *
-      * \sa factorize()
-      */
-    void analyzePattern(const MatrixType& a)
-    {
-      Base::analyzePattern(a, true);
-    }
+  /** Computes the sparse Cholesky decomposition of \a matrix */
+  SimplicialNonHermitianLLT& compute(const MatrixType& matrix) {
+    Base::template compute<false, true>(matrix);
+    return *this;
+  }
 
-    /** Performs a numeric decomposition of \a matrix
-      *
-      * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
-      *
-      * \sa analyzePattern()
-      */
-    void factorize(const MatrixType& a)
-    {
-      Base::template factorize<true>(a);
-    }
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
+   *
+   * This function is particularly useful when solving for several problems having the same structure.
+   *
+   * \sa factorize()
+   */
+  void analyzePattern(const MatrixType& a) { Base::template analyzePattern<false, true>(a); }
+
+  /** Performs a numeric decomposition of \a matrix
+   *
+   * The given matrix must have the same sparsity than the matrix on which the symbolic decomposition has been
+   * performed.
+   *
+   * \sa analyzePattern()
+   */
+  void factorize(const MatrixType& a) { Base::template factorize<false, true>(a); }
+
+  /** \returns the determinant of the underlying matrix from the current factorization */
+  Scalar determinant() const {
+    Scalar detL = Base::m_matrix.diagonal().prod();
+    return detL * detL;
+  }
+};
 
-    /** \returns the determinant of the underlying matrix from the current factorization */
-    Scalar determinant() const
-    {
-      return Base::m_diag.prod();
-    }
+/** \ingroup SparseCholesky_Module
+ * \class SimplicialNonHermitianLDLT
+ * \brief A direct sparse LDLT Cholesky factorizations without square root, for symmetric non-hermitian matrices.
+ *
+ * This class provides a LDL^T Cholesky factorizations without square root of sparse matrices that are
+ * symmetric but not hermitian. For real matrices, this is equivalent to the regular LDLT factorization.
+ * The factorization allows for solving A.X = B where X and B can be either dense or sparse.
+ *
+ * In order to reduce the fill-in, a symmetric permutation P is applied prior to the factorization
+ * such that the factorized matrix is P A P^-1.
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ * \tparam UpLo_ the triangular part that will be used for the computations. It can be Lower
+ *               or Upper. Default is Lower.
+ * \tparam Ordering_ The ordering method to use, either AMDOrdering<> or NaturalOrdering<>. Default is AMDOrdering<>
+ *
+ * \implsparsesolverconcept
+ *
+ * \sa class SimplicialNonHermitianLLT, SimplicialLDLT, class AMDOrdering, class NaturalOrdering
+ */
+template <typename MatrixType_, int UpLo_, typename Ordering_>
+class SimplicialNonHermitianLDLT
+    : public SimplicialCholeskyBase<SimplicialNonHermitianLDLT<MatrixType_, UpLo_, Ordering_> > {
+ public:
+  typedef MatrixType_ MatrixType;
+  enum { UpLo = UpLo_ };
+  typedef SimplicialCholeskyBase<SimplicialNonHermitianLDLT> Base;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex> CholMatrixType;
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+  typedef internal::traits<SimplicialNonHermitianLDLT> Traits;
+  typedef typename Traits::MatrixL MatrixL;
+  typedef typename Traits::MatrixU MatrixU;
+
+ public:
+  /** Default constructor */
+  SimplicialNonHermitianLDLT() : Base() {}
+
+  /** Constructs and performs the LLT factorization of \a matrix */
+  explicit SimplicialNonHermitianLDLT(const MatrixType& matrix) : Base(matrix) {}
+
+  /** \returns a vector expression of the diagonal D */
+  inline const VectorType vectorD() const {
+    eigen_assert(Base::m_factorizationIsOk && "Simplicial LDLT not factorized");
+    return Base::m_diag;
+  }
+  /** \returns an expression of the factor L */
+  inline const MatrixL matrixL() const {
+    eigen_assert(Base::m_factorizationIsOk && "Simplicial LDLT not factorized");
+    return Traits::getL(Base::m_matrix);
+  }
+
+  /** \returns an expression of the factor U (= L^*) */
+  inline const MatrixU matrixU() const {
+    eigen_assert(Base::m_factorizationIsOk && "Simplicial LDLT not factorized");
+    return Traits::getU(Base::m_matrix);
+  }
+
+  /** Computes the sparse Cholesky decomposition of \a matrix */
+  SimplicialNonHermitianLDLT& compute(const MatrixType& matrix) {
+    Base::template compute<true, true>(matrix);
+    return *this;
+  }
+
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
+   *
+   * This function is particularly useful when solving for several problems having the same structure.
+   *
+   * \sa factorize()
+   */
+  void analyzePattern(const MatrixType& a) { Base::template analyzePattern<true, true>(a); }
+
+  /** Performs a numeric decomposition of \a matrix
+   *
+   * The given matrix must have the same sparsity than the matrix on which the symbolic decomposition has been
+   * performed.
+   *
+   * \sa analyzePattern()
+   */
+  void factorize(const MatrixType& a) { Base::template factorize<true, true>(a); }
+
+  /** \returns the determinant of the underlying matrix from the current factorization */
+  Scalar determinant() const { return Base::m_diag.prod(); }
 };
 
 /** \deprecated use SimplicialLDLT or class SimplicialLLT
-  * \ingroup SparseCholesky_Module
-  * \class SimplicialCholesky
-  *
-  * \sa class SimplicialLDLT, class SimplicialLLT
-  */
-template<typename _MatrixType, int _UpLo, typename _Ordering>
-    class SimplicialCholesky : public SimplicialCholeskyBase<SimplicialCholesky<_MatrixType,_UpLo,_Ordering> >
-{
-public:
-    typedef _MatrixType MatrixType;
-    enum { UpLo = _UpLo };
-    typedef SimplicialCholeskyBase<SimplicialCholesky> Base;
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef SparseMatrix<Scalar,ColMajor,Index> CholMatrixType;
-    typedef Matrix<Scalar,Dynamic,1> VectorType;
-    typedef internal::traits<SimplicialCholesky> Traits;
-    typedef internal::traits<SimplicialLDLT<MatrixType,UpLo> > LDLTTraits;
-    typedef internal::traits<SimplicialLLT<MatrixType,UpLo>  > LLTTraits;
-  public:
-    SimplicialCholesky() : Base(), m_LDLT(true) {}
-
-    SimplicialCholesky(const MatrixType& matrix)
-      : Base(), m_LDLT(true)
-    {
-      compute(matrix);
-    }
-
-    SimplicialCholesky& setMode(SimplicialCholeskyMode mode)
-    {
-      switch(mode)
-      {
+ * \ingroup SparseCholesky_Module
+ * \class SimplicialCholesky
+ *
+ * \sa class SimplicialLDLT, class SimplicialLLT
+ */
+template <typename MatrixType_, int UpLo_, typename Ordering_>
+class SimplicialCholesky : public SimplicialCholeskyBase<SimplicialCholesky<MatrixType_, UpLo_, Ordering_> > {
+ public:
+  typedef MatrixType_ MatrixType;
+  enum { UpLo = UpLo_ };
+  typedef SimplicialCholeskyBase<SimplicialCholesky> Base;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex> CholMatrixType;
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+  typedef internal::traits<SimplicialLDLT<MatrixType, UpLo> > LDLTTraits;
+  typedef internal::traits<SimplicialLLT<MatrixType, UpLo> > LLTTraits;
+
+ public:
+  SimplicialCholesky() : Base(), m_LDLT(true) {}
+
+  explicit SimplicialCholesky(const MatrixType& matrix) : Base(), m_LDLT(true) { compute(matrix); }
+
+  SimplicialCholesky& setMode(SimplicialCholeskyMode mode) {
+    switch (mode) {
       case SimplicialCholeskyLLT:
         m_LDLT = false;
         break;
@@ -509,163 +723,146 @@ template<typename _MatrixType, int _UpLo, typename _Ordering>
         break;
       default:
         break;
-      }
-
-      return *this;
     }
 
-    inline const VectorType vectorD() const {
-        eigen_assert(Base::m_factorizationIsOk && "Simplicial Cholesky not factorized");
-        return Base::m_diag;
-    }
-    inline const CholMatrixType rawMatrix() const {
-        eigen_assert(Base::m_factorizationIsOk && "Simplicial Cholesky not factorized");
-        return Base::m_matrix;
-    }
-    
-    /** Computes the sparse Cholesky decomposition of \a matrix */
-    SimplicialCholesky& compute(const MatrixType& matrix)
+    return *this;
+  }
+
+  inline const VectorType vectorD() const {
+    eigen_assert(Base::m_factorizationIsOk && "Simplicial Cholesky not factorized");
+    return Base::m_diag;
+  }
+  inline const CholMatrixType rawMatrix() const {
+    eigen_assert(Base::m_factorizationIsOk && "Simplicial Cholesky not factorized");
+    return Base::m_matrix;
+  }
+
+  /** Computes the sparse Cholesky decomposition of \a matrix */
+  SimplicialCholesky& compute(const MatrixType& matrix) {
+    if (m_LDLT)
+      Base::template compute<true, false>(matrix);
+    else
+      Base::template compute<false, false>(matrix);
+    return *this;
+  }
+
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
+   *
+   * This function is particularly useful when solving for several problems having the same structure.
+   *
+   * \sa factorize()
+   */
+  void analyzePattern(const MatrixType& a) {
+    if (m_LDLT)
+      Base::template analyzePattern<true, false>(a);
+    else
+      Base::template analyzePattern<false, false>(a);
+  }
+
+  /** Performs a numeric decomposition of \a matrix
+   *
+   * The given matrix must have the same sparsity than the matrix on which the symbolic decomposition has been
+   * performed.
+   *
+   * \sa analyzePattern()
+   */
+  void factorize(const MatrixType& a) {
+    if (m_LDLT)
+      Base::template factorize<true, false>(a);
+    else
+      Base::template factorize<false, false>(a);
+  }
+
+  /** \internal */
+  template <typename Rhs, typename Dest>
+  void _solve_impl(const MatrixBase<Rhs>& b, MatrixBase<Dest>& dest) const {
+    eigen_assert(Base::m_factorizationIsOk &&
+                 "The decomposition is not in a valid state for solving, you must first call either compute() or "
+                 "symbolic()/numeric()");
+    eigen_assert(Base::m_matrix.rows() == b.rows());
+
+    if (Base::m_info != Success) return;
+
+    if (Base::m_P.size() > 0)
+      dest = Base::m_P * b;
+    else
+      dest = b;
+
+    if (Base::m_matrix.nonZeros() > 0)  // otherwise L==I
     {
-      if(m_LDLT)
-        Base::template compute<true>(matrix);
+      if (m_LDLT)
+        LDLTTraits::getL(Base::m_matrix).solveInPlace(dest);
       else
-        Base::template compute<false>(matrix);
-      return *this;
+        LLTTraits::getL(Base::m_matrix).solveInPlace(dest);
     }
 
-    /** Performs a symbolic decomposition on the sparcity of \a matrix.
-      *
-      * This function is particularly useful when solving for several problems having the same structure.
-      *
-      * \sa factorize()
-      */
-    void analyzePattern(const MatrixType& a)
-    {
-      Base::analyzePattern(a, m_LDLT);
-    }
+    if (Base::m_diag.size() > 0) dest = Base::m_diag.real().asDiagonal().inverse() * dest;
 
-    /** Performs a numeric decomposition of \a matrix
-      *
-      * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
-      *
-      * \sa analyzePattern()
-      */
-    void factorize(const MatrixType& a)
+    if (Base::m_matrix.nonZeros() > 0)  // otherwise I==I
     {
-      if(m_LDLT)
-        Base::template factorize<true>(a);
+      if (m_LDLT)
+        LDLTTraits::getU(Base::m_matrix).solveInPlace(dest);
       else
-        Base::template factorize<false>(a);
+        LLTTraits::getU(Base::m_matrix).solveInPlace(dest);
     }
 
-    /** \internal */
-    template<typename Rhs,typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const
-    {
-      eigen_assert(Base::m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or symbolic()/numeric()");
-      eigen_assert(Base::m_matrix.rows()==b.rows());
+    if (Base::m_P.size() > 0) dest = Base::m_Pinv * dest;
+  }
 
-      if(Base::m_info!=Success)
-        return;
+  /** \internal */
+  template <typename Rhs, typename Dest>
+  void _solve_impl(const SparseMatrixBase<Rhs>& b, SparseMatrixBase<Dest>& dest) const {
+    internal::solve_sparse_through_dense_panels(*this, b, dest);
+  }
 
-      if(Base::m_P.size()>0)
-        dest = Base::m_P * b;
-      else
-        dest = b;
-
-      if(Base::m_matrix.nonZeros()>0) // otherwise L==I
-      {
-        if(m_LDLT)
-          LDLTTraits::getL(Base::m_matrix).solveInPlace(dest);
-        else
-          LLTTraits::getL(Base::m_matrix).solveInPlace(dest);
-      }
-
-      if(Base::m_diag.size()>0)
-        dest = Base::m_diag.asDiagonal().inverse() * dest;
-
-      if (Base::m_matrix.nonZeros()>0) // otherwise I==I
-      {
-        if(m_LDLT)
-          LDLTTraits::getU(Base::m_matrix).solveInPlace(dest);
-        else
-          LLTTraits::getU(Base::m_matrix).solveInPlace(dest);
-      }
-
-      if(Base::m_P.size()>0)
-        dest = Base::m_Pinv * dest;
-    }
-    
-    Scalar determinant() const
-    {
-      if(m_LDLT)
-      {
-        return Base::m_diag.prod();
-      }
-      else
-      {
-        Scalar detL = Diagonal<const CholMatrixType>(Base::m_matrix).prod();
-        return numext::abs2(detL);
-      }
+  Scalar determinant() const {
+    if (m_LDLT) {
+      return Base::m_diag.prod();
+    } else {
+      Scalar detL = Diagonal<const CholMatrixType>(Base::m_matrix).prod();
+      return numext::abs2(detL);
     }
-    
-  protected:
-    bool m_LDLT;
+  }
+
+ protected:
+  bool m_LDLT;
 };
 
-template<typename Derived>
-void SimplicialCholeskyBase<Derived>::ordering(const MatrixType& a, CholMatrixType& ap)
-{
-  eigen_assert(a.rows()==a.cols());
+template <typename Derived>
+template <bool NonHermitian>
+void SimplicialCholeskyBase<Derived>::ordering(const MatrixType& a, ConstCholMatrixPtr& pmat, CholMatrixType& ap) {
+  eigen_assert(a.rows() == a.cols());
   const Index size = a.rows();
-  // Note that amd compute the inverse permutation
-  {
-    CholMatrixType C;
-    C = a.template selfadjointView<UpLo>();
-    
-    OrderingType ordering;
-    ordering(C,m_Pinv);
-  }
-
-  if(m_Pinv.size()>0)
-    m_P = m_Pinv.inverse();
-  else
-    m_P.resize(0);
-
-  ap.resize(size,size);
-  ap.template selfadjointView<Upper>() = a.template selfadjointView<UpLo>().twistedBy(m_P);
-}
+  pmat = &ap;
+  // Note that ordering methods compute the inverse permutation
+  if (!internal::is_same<OrderingType, NaturalOrdering<StorageIndex> >::value) {
+    {
+      CholMatrixType C;
+      internal::permute_symm_to_fullsymm<UpLo, NonHermitian>(a, C, NULL);
 
-namespace internal {
-  
-template<typename Derived, typename Rhs>
-struct solve_retval<SimplicialCholeskyBase<Derived>, Rhs>
-  : solve_retval_base<SimplicialCholeskyBase<Derived>, Rhs>
-{
-  typedef SimplicialCholeskyBase<Derived> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec().derived()._solve(rhs(),dst);
-  }
-};
+      OrderingType ordering;
+      ordering(C, m_Pinv);
+    }
 
-template<typename Derived, typename Rhs>
-struct sparse_solve_retval<SimplicialCholeskyBase<Derived>, Rhs>
-  : sparse_solve_retval_base<SimplicialCholeskyBase<Derived>, Rhs>
-{
-  typedef SimplicialCholeskyBase<Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
+    if (m_Pinv.size() > 0)
+      m_P = m_Pinv.inverse();
+    else
+      m_P.resize(0);
 
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
+    ap.resize(size, size);
+    internal::permute_symm_to_symm<UpLo, Upper, NonHermitian>(a, ap, m_P.indices().data());
+  } else {
+    m_Pinv.resize(0);
+    m_P.resize(0);
+    if (int(UpLo) == int(Lower) || MatrixType::IsRowMajor) {
+      // we have to transpose the lower part to to the upper one
+      ap.resize(size, size);
+      internal::permute_symm_to_symm<UpLo, Upper, NonHermitian>(a, ap, NULL);
+    } else
+      internal::simplicial_cholesky_grab_input<CholMatrixType, MatrixType>::run(a, pmat, ap);
   }
-};
-
-} // end namespace internal
+}
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SIMPLICIAL_CHOLESKY_H
+#endif  // EIGEN_SIMPLICIAL_CHOLESKY_H
diff --git a/inst/include/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/inst/include/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
index 7aaf702b..3c65541b 100644
--- a/inst/include/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
+++ b/inst/include/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h
@@ -2,191 +2,380 @@
 // for linear algebra.
 //
 // Copyright (C) 2008-2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 /*
-
-NOTE: thes functions vave been adapted from the LDL library:
+NOTE: these functions have been adapted from the LDL library:
 
 LDL Copyright (c) 2005 by Timothy A. Davis.  All Rights Reserved.
 
-LDL License:
-
-    Your use or distribution of LDL or any modified version of
-    LDL implies that you agree to this License.
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Lesser General Public
-    License as published by the Free Software Foundation; either
-    version 2.1 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Lesser General Public License for more details.
-
-    You should have received a copy of the GNU Lesser General Public
-    License along with this library; if not, write to the Free Software
-    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
-    USA
-
-    Permission is hereby granted to use or copy this program under the
-    terms of the GNU LGPL, provided that the Copyright, this License,
-    and the Availability of the original version is retained on all copies.
-    User documentation of any code that uses this code or any modified
-    version of this code must cite the Copyright, this License, the
-    Availability note, and "Used by permission." Permission to modify
-    the code and to distribute modified code is granted, provided the
-    Copyright, this License, and the Availability note are retained,
-    and a notice that the code was modified is included.
+The author of LDL, Timothy A. Davis., has executed a license with Google LLC
+to permit distribution of this code and derivative works as part of Eigen under
+the Mozilla Public License v. 2.0, as stated at the top of this file.
  */
 
-#include "../Core/util/NonMPL2.h"
-
 #ifndef EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H
 #define EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
-template<typename Derived>
-void SimplicialCholeskyBase<Derived>::analyzePattern_preordered(const CholMatrixType& ap, bool doLDLT)
-{
-  const Index size = ap.rows();
-  m_matrix.resize(size, size);
-  m_parent.resize(size);
-  m_nonZerosPerCol.resize(size);
-
-  ei_declare_aligned_stack_constructed_variable(Index, tags, size, 0);
-
-  for(Index k = 0; k < size; ++k)
-  {
-    /* L(k,:) pattern: all nodes reachable in etree from nz in A(0:k-1,k) */
-    m_parent[k] = -1;             /* parent of k is not yet known */
-    tags[k] = k;                  /* mark node k as visited */
-    m_nonZerosPerCol[k] = 0;      /* count of nonzeros in column k of L */
-    for(typename CholMatrixType::InnerIterator it(ap,k); it; ++it)
-    {
-      Index i = it.index();
-      if(i < k)
-      {
-        /* follow path from i to root of etree, stop at flagged node */
-        for(; tags[i] != k; i = m_parent[i])
-        {
-          /* find parent of i if not yet determined */
-          if (m_parent[i] == -1)
-            m_parent[i] = k;
-          m_nonZerosPerCol[i]++;        /* L (k,i) is nonzero */
-          tags[i] = k;                  /* mark i as visited */
+namespace internal {
+
+template <typename Scalar, typename StorageIndex>
+struct simpl_chol_helper {
+  using CholMatrixType = SparseMatrix<Scalar, ColMajor, StorageIndex>;
+  using InnerIterator = typename CholMatrixType::InnerIterator;
+  using VectorI = Matrix<StorageIndex, Dynamic, 1>;
+  static constexpr StorageIndex kEmpty = -1;
+
+  // Implementation of a stack or last-in first-out structure with some debugging machinery.
+  struct Stack {
+    StorageIndex* m_data;
+    Index m_size;
+#ifndef EIGEN_NO_DEBUG
+    const Index m_maxSize;
+    Stack(StorageIndex* data, StorageIndex size, StorageIndex maxSize)
+        : m_data(data), m_size(size), m_maxSize(maxSize) {
+      eigen_assert(size >= 0);
+      eigen_assert(maxSize >= size);
+    }
+#else
+    Stack(StorageIndex* data, StorageIndex size, StorageIndex /*maxSize*/) : m_data(data), m_size(size) {}
+#endif
+    bool empty() const { return m_size == 0; }
+    Index size() const { return m_size; }
+    StorageIndex back() const {
+      eigen_assert(m_size > 0);
+      return m_data[m_size - 1];
+    }
+    void push(const StorageIndex& value) {
+#ifndef EIGEN_NO_DEBUG
+      eigen_assert(m_size < m_maxSize);
+#endif
+      m_data[m_size] = value;
+      m_size++;
+    }
+    void pop() {
+      eigen_assert(m_size > 0);
+      m_size--;
+    }
+  };
+
+  // Implementation of a disjoint-set or union-find structure with path compression.
+  struct DisjointSet {
+    StorageIndex* m_set;
+    DisjointSet(StorageIndex* set, StorageIndex size) : m_set(set) { std::iota(set, set + size, 0); }
+    // Find the set representative or root of `u`.
+    StorageIndex find(StorageIndex u) const {
+      eigen_assert(u != kEmpty);
+      while (m_set[u] != u) {
+        // manually unroll the loop by a factor of 2 to improve performance
+        u = m_set[m_set[u]];
+      }
+      return u;
+    }
+    // Perform full path compression such that each node from `u` to `v` points to `v`.
+    void compress(StorageIndex u, StorageIndex v) {
+      eigen_assert(u != kEmpty);
+      eigen_assert(v != kEmpty);
+      while (m_set[u] != v) {
+        StorageIndex next = m_set[u];
+        m_set[u] = v;
+        u = next;
+      }
+    };
+  };
+
+  // Computes the higher adjacency pattern by transposing the input lower adjacency matrix.
+  // Only the index arrays are calculated, as the values are not needed for the symbolic factorization.
+  // The outer index array provides the size requirements of the inner index array.
+
+  // Computes the outer index array of the higher adjacency matrix.
+  static void calc_hadj_outer(const StorageIndex size, const CholMatrixType& ap, StorageIndex* outerIndex) {
+    for (StorageIndex j = 1; j < size; j++) {
+      for (InnerIterator it(ap, j); it; ++it) {
+        StorageIndex i = it.index();
+        if (i < j) outerIndex[i + 1]++;
+      }
+    }
+    std::partial_sum(outerIndex, outerIndex + size + 1, outerIndex);
+  }
+
+  // inner index array
+  static void calc_hadj_inner(const StorageIndex size, const CholMatrixType& ap, const StorageIndex* outerIndex,
+                              StorageIndex* innerIndex, StorageIndex* tmp) {
+    std::fill_n(tmp, size, 0);
+
+    for (StorageIndex j = 1; j < size; j++) {
+      for (InnerIterator it(ap, j); it; ++it) {
+        StorageIndex i = it.index();
+        if (i < j) {
+          StorageIndex b = outerIndex[i] + tmp[i];
+          innerIndex[b] = j;
+          tmp[i]++;
         }
       }
     }
   }
 
-  /* construct Lp index array from m_nonZerosPerCol column counts */
-  Index* Lp = m_matrix.outerIndexPtr();
-  Lp[0] = 0;
-  for(Index k = 0; k < size; ++k)
-    Lp[k+1] = Lp[k] + m_nonZerosPerCol[k] + (doLDLT ? 0 : 1);
+  // Adapted from:
+  // Joseph W. Liu. (1986).
+  // A compact row storage scheme for Cholesky factors using elimination trees.
+  // ACM Trans. Math. Softw. 12, 2 (June 1986), 127-148. https://doi.org/10.1145/6497.6499
+
+  // Computes the elimination forest of the lower adjacency matrix, a compact representation of the sparse L factor.
+  // The L factor may contain multiple elimination trees if a column contains only its diagonal element.
+  // Each elimination tree is an n-ary tree in which each node points to its parent.
+  static void calc_etree(const StorageIndex size, const CholMatrixType& ap, StorageIndex* parent, StorageIndex* tmp) {
+    std::fill_n(parent, size, kEmpty);
+
+    DisjointSet ancestor(tmp, size);
+
+    for (StorageIndex j = 1; j < size; j++) {
+      for (InnerIterator it(ap, j); it; ++it) {
+        StorageIndex i = it.index();
+        if (i < j) {
+          StorageIndex r = ancestor.find(i);
+          if (r != j) parent[r] = j;
+          ancestor.compress(i, j);
+        }
+      }
+    }
+  }
 
-  m_matrix.resizeNonZeros(Lp[size]);
+  // Computes the child pointers of the parent tree to facilitate a depth-first search traversal.
+  static void calc_lineage(const StorageIndex size, const StorageIndex* parent, StorageIndex* firstChild,
+                           StorageIndex* firstSibling) {
+    std::fill_n(firstChild, size, kEmpty);
+    std::fill_n(firstSibling, size, kEmpty);
+
+    for (StorageIndex j = 0; j < size; j++) {
+      StorageIndex p = parent[j];
+      if (p == kEmpty) continue;
+      StorageIndex c = firstChild[p];
+      if (c == kEmpty)
+        firstChild[p] = j;
+      else {
+        while (firstSibling[c] != kEmpty) c = firstSibling[c];
+        firstSibling[c] = j;
+      }
+    }
+  }
 
-  m_isInitialized     = true;
-  m_info              = Success;
-  m_analysisIsOk      = true;
+  // Computes a post-ordered traversal of the elimination tree.
+  static void calc_post(const StorageIndex size, const StorageIndex* parent, StorageIndex* firstChild,
+                        const StorageIndex* firstSibling, StorageIndex* post, StorageIndex* dfs) {
+    Stack post_stack(post, 0, size);
+    for (StorageIndex j = 0; j < size; j++) {
+      if (parent[j] != kEmpty) continue;
+      // Begin at a root
+      Stack dfs_stack(dfs, 0, size);
+      dfs_stack.push(j);
+      while (!dfs_stack.empty()) {
+        StorageIndex i = dfs_stack.back();
+        StorageIndex c = firstChild[i];
+        if (c == kEmpty) {
+          post_stack.push(i);
+          dfs_stack.pop();
+        } else {
+          dfs_stack.push(c);
+          // Remove the path from `i` to `c` for future traversals.
+          firstChild[i] = firstSibling[c];
+        }
+      }
+    }
+    eigen_assert(post_stack.size() == size);
+    eigen_assert(std::all_of(firstChild, firstChild + size, [](StorageIndex a) { return a == kEmpty; }));
+  }
+
+  // Adapted from:
+  // Gilbert, J. R., Ng, E., & Peyton, B. W. (1994).
+  // An efficient algorithm to compute row and column counts for sparse Cholesky factorization.
+  // SIAM Journal on Matrix Analysis and Applications, 15(4), 1075-1091.
+
+  // Computes the non-zero pattern of the L factor.
+  static void calc_colcount(const StorageIndex size, const StorageIndex* hadjOuter, const StorageIndex* hadjInner,
+                            const StorageIndex* parent, StorageIndex* prevLeaf, StorageIndex* tmp,
+                            const StorageIndex* post, StorageIndex* nonZerosPerCol, bool doLDLT) {
+    // initialize nonZerosPerCol with 1 for leaves, 0 for non-leaves
+    std::fill_n(nonZerosPerCol, size, 1);
+    for (StorageIndex j = 0; j < size; j++) {
+      StorageIndex p = parent[j];
+      // p is not a leaf
+      if (p != kEmpty) nonZerosPerCol[p] = 0;
+    }
+
+    DisjointSet parentSet(tmp, size);
+    // prevLeaf is already initialized
+    eigen_assert(std::all_of(prevLeaf, prevLeaf + size, [](StorageIndex a) { return a == kEmpty; }));
+
+    for (StorageIndex j_ = 0; j_ < size; j_++) {
+      StorageIndex j = post[j_];
+      nonZerosPerCol[j] += hadjOuter[j + 1] - hadjOuter[j];
+      for (StorageIndex k = hadjOuter[j]; k < hadjOuter[j + 1]; k++) {
+        StorageIndex i = hadjInner[k];
+        eigen_assert(i > j);
+        StorageIndex prev = prevLeaf[i];
+        if (prev != kEmpty) {
+          StorageIndex q = parentSet.find(prev);
+          parentSet.compress(prev, q);
+          nonZerosPerCol[q]--;
+        }
+        prevLeaf[i] = j;
+      }
+      StorageIndex p = parent[j];
+      if (p != kEmpty) parentSet.compress(j, p);
+    }
+
+    for (StorageIndex j = 0; j < size; j++) {
+      StorageIndex p = parent[j];
+      if (p != kEmpty) nonZerosPerCol[p] += nonZerosPerCol[j] - 1;
+      if (doLDLT) nonZerosPerCol[j]--;
+    }
+  }
+
+  // Finalizes the non zero pattern of the L factor and allocates the memory for the factorization.
+  static void init_matrix(const StorageIndex size, const StorageIndex* nonZerosPerCol, CholMatrixType& L) {
+    eigen_assert(L.outerIndexPtr()[0] == 0);
+    std::partial_sum(nonZerosPerCol, nonZerosPerCol + size, L.outerIndexPtr() + 1);
+    L.resizeNonZeros(L.outerIndexPtr()[size]);
+  }
+
+  // Driver routine for the symbolic sparse Cholesky factorization.
+  static void run(const StorageIndex size, const CholMatrixType& ap, CholMatrixType& L, VectorI& parent,
+                  VectorI& workSpace, bool doLDLT) {
+    parent.resize(size);
+    workSpace.resize(4 * size);
+    L.resize(size, size);
+
+    StorageIndex* tmp1 = workSpace.data();
+    StorageIndex* tmp2 = workSpace.data() + size;
+    StorageIndex* tmp3 = workSpace.data() + 2 * size;
+    StorageIndex* tmp4 = workSpace.data() + 3 * size;
+
+    // Borrow L's outer index array for the higher adjacency pattern.
+    StorageIndex* hadj_outer = L.outerIndexPtr();
+    calc_hadj_outer(size, ap, hadj_outer);
+    // Request additional temporary storage for the inner indices of the higher adjacency pattern.
+    ei_declare_aligned_stack_constructed_variable(StorageIndex, hadj_inner, hadj_outer[size], nullptr);
+    calc_hadj_inner(size, ap, hadj_outer, hadj_inner, tmp1);
+
+    calc_etree(size, ap, parent.data(), tmp1);
+    calc_lineage(size, parent.data(), tmp1, tmp2);
+    calc_post(size, parent.data(), tmp1, tmp2, tmp3, tmp4);
+    calc_colcount(size, hadj_outer, hadj_inner, parent.data(), tmp1, tmp2, tmp3, tmp4, doLDLT);
+    init_matrix(size, tmp4, L);
+  }
+};
+
+// Symbol is ODR-used, so we need a definition.
+template <typename Scalar, typename StorageIndex>
+constexpr StorageIndex simpl_chol_helper<Scalar, StorageIndex>::kEmpty;
+
+}  // namespace internal
+
+template <typename Derived>
+void SimplicialCholeskyBase<Derived>::analyzePattern_preordered(const CholMatrixType& ap, bool doLDLT) {
+  using Helper = internal::simpl_chol_helper<Scalar, StorageIndex>;
+
+  eigen_assert(ap.innerSize() == ap.outerSize());
+  const StorageIndex size = internal::convert_index<StorageIndex>(ap.outerSize());
+
+  Helper::run(size, ap, m_matrix, m_parent, m_workSpace, doLDLT);
+
+  m_isInitialized = true;
+  m_info = Success;
+  m_analysisIsOk = true;
   m_factorizationIsOk = false;
 }
 
-
-template<typename Derived>
-template<bool DoLDLT>
-void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType& ap)
-{
+template <typename Derived>
+template <bool DoLDLT, bool NonHermitian>
+void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType& ap) {
   using std::sqrt;
+  const StorageIndex size = StorageIndex(ap.rows());
 
   eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
-  eigen_assert(ap.rows()==ap.cols());
-  const Index size = ap.rows();
-  eigen_assert(m_parent.size()==size);
-  eigen_assert(m_nonZerosPerCol.size()==size);
+  eigen_assert(ap.rows() == ap.cols());
+  eigen_assert(m_parent.size() == size);
+  eigen_assert(m_workSpace.size() >= 3 * size);
 
-  const Index* Lp = m_matrix.outerIndexPtr();
-  Index* Li = m_matrix.innerIndexPtr();
+  const StorageIndex* Lp = m_matrix.outerIndexPtr();
+  StorageIndex* Li = m_matrix.innerIndexPtr();
   Scalar* Lx = m_matrix.valuePtr();
 
   ei_declare_aligned_stack_constructed_variable(Scalar, y, size, 0);
-  ei_declare_aligned_stack_constructed_variable(Index,  pattern, size, 0);
-  ei_declare_aligned_stack_constructed_variable(Index,  tags, size, 0);
+  StorageIndex* nonZerosPerCol = m_workSpace.data();
+  StorageIndex* pattern = m_workSpace.data() + size;
+  StorageIndex* tags = m_workSpace.data() + 2 * size;
 
   bool ok = true;
   m_diag.resize(DoLDLT ? size : 0);
 
-  for(Index k = 0; k < size; ++k)
-  {
+  for (StorageIndex k = 0; k < size; ++k) {
     // compute nonzero pattern of kth row of L, in topological order
-    y[k] = 0.0;                     // Y(0:k) is now all zero
-    Index top = size;               // stack for pattern is empty
-    tags[k] = k;                    // mark node k as visited
-    m_nonZerosPerCol[k] = 0;        // count of nonzeros in column k of L
-    for(typename MatrixType::InnerIterator it(ap,k); it; ++it)
-    {
-      Index i = it.index();
-      if(i <= k)
-      {
-        y[i] += numext::conj(it.value());            /* scatter A(i,k) into Y (sum duplicates) */
+    y[k] = Scalar(0);         // Y(0:k) is now all zero
+    StorageIndex top = size;  // stack for pattern is empty
+    tags[k] = k;              // mark node k as visited
+    nonZerosPerCol[k] = 0;    // count of nonzeros in column k of L
+    for (typename CholMatrixType::InnerIterator it(ap, k); it; ++it) {
+      StorageIndex i = it.index();
+      if (i <= k) {
+        y[i] += getSymm(it.value()); /* scatter A(i,k) into Y (sum duplicates) */
         Index len;
-        for(len = 0; tags[i] != k; i = m_parent[i])
-        {
-          pattern[len++] = i;     /* L(k,i) is nonzero */
-          tags[i] = k;            /* mark i as visited */
+        for (len = 0; tags[i] != k; i = m_parent[i]) {
+          pattern[len++] = i; /* L(k,i) is nonzero */
+          tags[i] = k;        /* mark i as visited */
         }
-        while(len > 0)
-          pattern[--top] = pattern[--len];
+        while (len > 0) pattern[--top] = pattern[--len];
       }
     }
 
     /* compute numerical values kth row of L (a sparse triangular solve) */
 
-    RealScalar d = numext::real(y[k]) * m_shiftScale + m_shiftOffset;    // get D(k,k), apply the shift function, and clear Y(k)
-    y[k] = 0.0;
-    for(; top < size; ++top)
-    {
-      Index i = pattern[top];       /* pattern[top:n-1] is pattern of L(:,k) */
-      Scalar yi = y[i];             /* get and clear Y(i) */
-      y[i] = 0.0;
+    DiagonalScalar d =
+        getDiag(y[k]) * m_shiftScale + m_shiftOffset;  // get D(k,k), apply the shift function, and clear Y(k)
+    y[k] = Scalar(0);
+    for (; top < size; ++top) {
+      Index i = pattern[top]; /* pattern[top:n-1] is pattern of L(:,k) */
+      Scalar yi = y[i];       /* get and clear Y(i) */
+      y[i] = Scalar(0);
 
       /* the nonzero entry L(k,i) */
       Scalar l_ki;
-      if(DoLDLT)
-        l_ki = yi / m_diag[i];
+      if (DoLDLT)
+        l_ki = yi / getDiag(m_diag[i]);
       else
         yi = l_ki = yi / Lx[Lp[i]];
 
-      Index p2 = Lp[i] + m_nonZerosPerCol[i];
+      Index p2 = Lp[i] + nonZerosPerCol[i];
       Index p;
-      for(p = Lp[i] + (DoLDLT ? 0 : 1); p < p2; ++p)
-        y[Li[p]] -= numext::conj(Lx[p]) * yi;
-      d -= numext::real(l_ki * numext::conj(yi));
-      Li[p] = k;                          /* store L(k,i) in column form of L */
+      for (p = Lp[i] + (DoLDLT ? 0 : 1); p < p2; ++p) y[Li[p]] -= getSymm(Lx[p]) * yi;
+      d -= getDiag(l_ki * getSymm(yi));
+      Li[p] = k; /* store L(k,i) in column form of L */
       Lx[p] = l_ki;
-      ++m_nonZerosPerCol[i];              /* increment count of nonzeros in col i */
+      ++nonZerosPerCol[i]; /* increment count of nonzeros in col i */
     }
-    if(DoLDLT)
-    {
+    if (DoLDLT) {
       m_diag[k] = d;
-      if(d == RealScalar(0))
-      {
-        ok = false;                         /* failure, D(k,k) is zero */
+      if (d == RealScalar(0)) {
+        ok = false; /* failure, D(k,k) is zero */
         break;
       }
-    }
-    else
-    {
-      Index p = Lp[k] + m_nonZerosPerCol[k]++;
-      Li[p] = k ;                /* store L(k,k) = sqrt (d) in column k */
-      if(d <= RealScalar(0)) {
-        ok = false;              /* failure, matrix is not positive definite */
+    } else {
+      Index p = Lp[k] + nonZerosPerCol[k]++;
+      Li[p] = k; /* store L(k,k) = sqrt (d) in column k */
+      if (NonHermitian ? d == RealScalar(0) : numext::real(d) <= RealScalar(0)) {
+        ok = false; /* failure, matrix is not positive definite */
         break;
       }
-      Lx[p] = sqrt(d) ;
+      Lx[p] = sqrt(d);
     }
   }
 
@@ -194,6 +383,6 @@ void SimplicialCholeskyBase<Derived>::factorize_preordered(const CholMatrixType&
   m_factorizationIsOk = true;
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H
+#endif  // EIGEN_SIMPLICIAL_CHOLESKY_IMPL_H
diff --git a/inst/include/Eigen/src/SparseCore/AmbiVector.h b/inst/include/Eigen/src/SparseCore/AmbiVector.h
index 220c6451..9f265f05 100644
--- a/inst/include/Eigen/src/SparseCore/AmbiVector.h
+++ b/inst/include/Eigen/src/SparseCore/AmbiVector.h
@@ -10,140 +10,135 @@
 #ifndef EIGEN_AMBIVECTOR_H
 #define EIGEN_AMBIVECTOR_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 /** \internal
-  * Hybrid sparse/dense vector class designed for intensive read-write operations.
-  *
-  * See BasicSparseLLT and SparseProduct for usage examples.
-  */
-template<typename _Scalar, typename _Index>
-class AmbiVector
-{
-  public:
-    typedef _Scalar Scalar;
-    typedef _Index Index;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-
-    AmbiVector(Index size)
-      : m_buffer(0), m_zero(0), m_size(0), m_allocatedSize(0), m_allocatedElements(0), m_mode(-1)
-    {
-      resize(size);
-    }
-
-    void init(double estimatedDensity);
-    void init(int mode);
-
-    Index nonZeros() const;
+ * Hybrid sparse/dense vector class designed for intensive read-write operations.
+ *
+ * See BasicSparseLLT and SparseProduct for usage examples.
+ */
+template <typename Scalar_, typename StorageIndex_>
+class AmbiVector {
+ public:
+  typedef Scalar_ Scalar;
+  typedef StorageIndex_ StorageIndex;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  explicit AmbiVector(Index size)
+      : m_buffer(0), m_zero(0), m_size(0), m_end(0), m_allocatedSize(0), m_allocatedElements(0), m_mode(-1) {
+    resize(size);
+  }
 
-    /** Specifies a sub-vector to work on */
-    void setBounds(Index start, Index end) { m_start = start; m_end = end; }
+  void init(double estimatedDensity);
+  void init(int mode);
 
-    void setZero();
+  Index nonZeros() const;
 
-    void restart();
-    Scalar& coeffRef(Index i);
-    Scalar& coeff(Index i);
+  /** Specifies a sub-vector to work on */
+  void setBounds(Index start, Index end) {
+    m_start = convert_index(start);
+    m_end = convert_index(end);
+  }
 
-    class Iterator;
+  void setZero();
 
-    ~AmbiVector() { delete[] m_buffer; }
+  void restart();
+  Scalar& coeffRef(Index i);
+  Scalar& coeff(Index i);
 
-    void resize(Index size)
-    {
-      if (m_allocatedSize < size)
-        reallocate(size);
-      m_size = size;
-    }
+  class Iterator;
 
-    Index size() const { return m_size; }
+  ~AmbiVector() { delete[] m_buffer; }
 
-  protected:
+  void resize(Index size) {
+    if (m_allocatedSize < size) reallocate(size);
+    m_size = convert_index(size);
+  }
 
-    void reallocate(Index size)
-    {
-      // if the size of the matrix is not too large, let's allocate a bit more than needed such
-      // that we can handle dense vector even in sparse mode.
-      delete[] m_buffer;
-      if (size<1000)
-      {
-        Index allocSize = (size * sizeof(ListEl) + sizeof(Scalar) - 1)/sizeof(Scalar);
-        m_allocatedElements = (allocSize*sizeof(Scalar))/sizeof(ListEl);
-        m_buffer = new Scalar[allocSize];
-      }
-      else
-      {
-        m_allocatedElements = (size*sizeof(Scalar))/sizeof(ListEl);
-        m_buffer = new Scalar[size];
-      }
-      m_size = size;
-      m_start = 0;
-      m_end = m_size;
+  StorageIndex size() const { return m_size; }
+
+ protected:
+  StorageIndex convert_index(Index idx) { return internal::convert_index<StorageIndex>(idx); }
+
+  void reallocate(Index size) {
+    // if the size of the matrix is not too large, let's allocate a bit more than needed such
+    // that we can handle dense vector even in sparse mode.
+    delete[] m_buffer;
+    if (size < 1000) {
+      Index allocSize = (size * sizeof(ListEl) + sizeof(Scalar) - 1) / sizeof(Scalar);
+      m_allocatedElements = convert_index((allocSize * sizeof(Scalar)) / sizeof(ListEl));
+      m_buffer = new Scalar[allocSize];
+    } else {
+      m_allocatedElements = convert_index((size * sizeof(Scalar)) / sizeof(ListEl));
+      m_buffer = new Scalar[size];
     }
+    m_size = convert_index(size);
+    m_start = 0;
+    m_end = m_size;
+  }
 
-    void reallocateSparse()
-    {
-      Index copyElements = m_allocatedElements;
-      m_allocatedElements = (std::min)(Index(m_allocatedElements*1.5),m_size);
-      Index allocSize = m_allocatedElements * sizeof(ListEl);
-      allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar);
-      Scalar* newBuffer = new Scalar[allocSize];
-      memcpy(newBuffer,  m_buffer,  copyElements * sizeof(ListEl));
-      delete[] m_buffer;
-      m_buffer = newBuffer;
-    }
+  void reallocateSparse() {
+    Index copyElements = m_allocatedElements;
+    m_allocatedElements = (std::min)(StorageIndex(m_allocatedElements * 1.5), m_size);
+    Index allocSize = m_allocatedElements * sizeof(ListEl);
+    allocSize = (allocSize + sizeof(Scalar) - 1) / sizeof(Scalar);
+    Scalar* newBuffer = new Scalar[allocSize];
+    std::memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl));
+    delete[] m_buffer;
+    m_buffer = newBuffer;
+  }
 
-  protected:
-    // element type of the linked list
-    struct ListEl
-    {
-      Index next;
-      Index index;
-      Scalar value;
-    };
-
-    // used to store data in both mode
-    Scalar* m_buffer;
-    Scalar m_zero;
-    Index m_size;
-    Index m_start;
-    Index m_end;
-    Index m_allocatedSize;
-    Index m_allocatedElements;
-    Index m_mode;
-
-    // linked list mode
-    Index m_llStart;
-    Index m_llCurrent;
-    Index m_llSize;
+ protected:
+  // element type of the linked list
+  struct ListEl {
+    StorageIndex next;
+    StorageIndex index;
+    Scalar value;
+  };
+
+  // used to store data in both mode
+  Scalar* m_buffer;
+  Scalar m_zero;
+  StorageIndex m_size;
+  StorageIndex m_start;
+  StorageIndex m_end;
+  StorageIndex m_allocatedSize;
+  StorageIndex m_allocatedElements;
+  StorageIndex m_mode;
+
+  // linked list mode
+  StorageIndex m_llStart;
+  StorageIndex m_llCurrent;
+  StorageIndex m_llSize;
 };
 
 /** \returns the number of non zeros in the current sub vector */
-template<typename _Scalar,typename _Index>
-_Index AmbiVector<_Scalar,_Index>::nonZeros() const
-{
-  if (m_mode==IsSparse)
+template <typename Scalar_, typename StorageIndex_>
+Index AmbiVector<Scalar_, StorageIndex_>::nonZeros() const {
+  if (m_mode == IsSparse)
     return m_llSize;
   else
     return m_end - m_start;
 }
 
-template<typename _Scalar,typename _Index>
-void AmbiVector<_Scalar,_Index>::init(double estimatedDensity)
-{
-  if (estimatedDensity>0.1)
+template <typename Scalar_, typename StorageIndex_>
+void AmbiVector<Scalar_, StorageIndex_>::init(double estimatedDensity) {
+  if (estimatedDensity > 0.1)
     init(IsDense);
   else
     init(IsSparse);
 }
 
-template<typename _Scalar,typename _Index>
-void AmbiVector<_Scalar,_Index>::init(int mode)
-{
+template <typename Scalar_, typename StorageIndex_>
+void AmbiVector<Scalar_, StorageIndex_>::init(int mode) {
   m_mode = mode;
-  if (m_mode==IsSparse)
+  // This is only necessary in sparse mode, but we set these unconditionally to avoid some maybe-uninitialized warnings
+  // if (m_mode==IsSparse)
   {
     m_llSize = 0;
     m_llStart = -1;
@@ -151,93 +146,76 @@ void AmbiVector<_Scalar,_Index>::init(int mode)
 }
 
 /** Must be called whenever we might perform a write access
-  * with an index smaller than the previous one.
-  *
-  * Don't worry, this function is extremely cheap.
-  */
-template<typename _Scalar,typename _Index>
-void AmbiVector<_Scalar,_Index>::restart()
-{
+ * with an index smaller than the previous one.
+ *
+ * Don't worry, this function is extremely cheap.
+ */
+template <typename Scalar_, typename StorageIndex_>
+void AmbiVector<Scalar_, StorageIndex_>::restart() {
   m_llCurrent = m_llStart;
 }
 
 /** Set all coefficients of current subvector to zero */
-template<typename _Scalar,typename _Index>
-void AmbiVector<_Scalar,_Index>::setZero()
-{
-  if (m_mode==IsDense)
-  {
-    for (Index i=m_start; i<m_end; ++i)
-      m_buffer[i] = Scalar(0);
-  }
-  else
-  {
-    eigen_assert(m_mode==IsSparse);
+template <typename Scalar_, typename StorageIndex_>
+void AmbiVector<Scalar_, StorageIndex_>::setZero() {
+  if (m_mode == IsDense) {
+    for (Index i = m_start; i < m_end; ++i) m_buffer[i] = Scalar(0);
+  } else {
+    eigen_assert(m_mode == IsSparse);
     m_llSize = 0;
     m_llStart = -1;
   }
 }
 
-template<typename _Scalar,typename _Index>
-_Scalar& AmbiVector<_Scalar,_Index>::coeffRef(_Index i)
-{
-  if (m_mode==IsDense)
+template <typename Scalar_, typename StorageIndex_>
+Scalar_& AmbiVector<Scalar_, StorageIndex_>::coeffRef(Index i) {
+  if (m_mode == IsDense)
     return m_buffer[i];
-  else
-  {
+  else {
     ListEl* EIGEN_RESTRICT llElements = reinterpret_cast<ListEl*>(m_buffer);
     // TODO factorize the following code to reduce code generation
-    eigen_assert(m_mode==IsSparse);
-    if (m_llSize==0)
-    {
+    eigen_assert(m_mode == IsSparse);
+    if (m_llSize == 0) {
       // this is the first element
       m_llStart = 0;
       m_llCurrent = 0;
       ++m_llSize;
       llElements[0].value = Scalar(0);
-      llElements[0].index = i;
+      llElements[0].index = convert_index(i);
       llElements[0].next = -1;
       return llElements[0].value;
-    }
-    else if (i<llElements[m_llStart].index)
-    {
+    } else if (i < llElements[m_llStart].index) {
       // this is going to be the new first element of the list
       ListEl& el = llElements[m_llSize];
       el.value = Scalar(0);
-      el.index = i;
+      el.index = convert_index(i);
       el.next = m_llStart;
       m_llStart = m_llSize;
       ++m_llSize;
       m_llCurrent = m_llStart;
       return el.value;
-    }
-    else
-    {
-      Index nextel = llElements[m_llCurrent].next;
-      eigen_assert(i>=llElements[m_llCurrent].index && "you must call restart() before inserting an element with lower or equal index");
-      while (nextel >= 0 && llElements[nextel].index<=i)
-      {
+    } else {
+      StorageIndex nextel = llElements[m_llCurrent].next;
+      eigen_assert(i >= llElements[m_llCurrent].index &&
+                   "you must call restart() before inserting an element with lower or equal index");
+      while (nextel >= 0 && llElements[nextel].index <= i) {
         m_llCurrent = nextel;
         nextel = llElements[nextel].next;
       }
 
-      if (llElements[m_llCurrent].index==i)
-      {
+      if (llElements[m_llCurrent].index == i) {
         // the coefficient already exists and we found it !
         return llElements[m_llCurrent].value;
-      }
-      else
-      {
-        if (m_llSize>=m_allocatedElements)
-        {
+      } else {
+        if (m_llSize >= m_allocatedElements) {
           reallocateSparse();
           llElements = reinterpret_cast<ListEl*>(m_buffer);
         }
-        eigen_internal_assert(m_llSize<m_allocatedElements && "internal error: overflow in sparse mode");
+        eigen_internal_assert(m_llSize < m_allocatedElements && "internal error: overflow in sparse mode");
         // let's insert a new coefficient
         ListEl& el = llElements[m_llSize];
         el.value = Scalar(0);
-        el.index = i;
+        el.index = convert_index(i);
         el.next = llElements[m_llCurrent].next;
         llElements[m_llCurrent].next = m_llSize;
         ++m_llSize;
@@ -247,26 +225,20 @@ _Scalar& AmbiVector<_Scalar,_Index>::coeffRef(_Index i)
   }
 }
 
-template<typename _Scalar,typename _Index>
-_Scalar& AmbiVector<_Scalar,_Index>::coeff(_Index i)
-{
-  if (m_mode==IsDense)
+template <typename Scalar_, typename StorageIndex_>
+Scalar_& AmbiVector<Scalar_, StorageIndex_>::coeff(Index i) {
+  if (m_mode == IsDense)
     return m_buffer[i];
-  else
-  {
+  else {
     ListEl* EIGEN_RESTRICT llElements = reinterpret_cast<ListEl*>(m_buffer);
-    eigen_assert(m_mode==IsSparse);
-    if ((m_llSize==0) || (i<llElements[m_llStart].index))
-    {
+    eigen_assert(m_mode == IsSparse);
+    if ((m_llSize == 0) || (i < llElements[m_llStart].index)) {
       return m_zero;
-    }
-    else
-    {
+    } else {
       Index elid = m_llStart;
-      while (elid >= 0 && llElements[elid].index<i)
-        elid = llElements[elid].next;
+      while (elid >= 0 && llElements[elid].index < i) elid = llElements[elid].next;
 
-      if (llElements[elid].index==i)
+      if (llElements[elid].index == i)
         return llElements[m_llCurrent].value;
       else
         return m_zero;
@@ -275,99 +247,83 @@ _Scalar& AmbiVector<_Scalar,_Index>::coeff(_Index i)
 }
 
 /** Iterator over the nonzero coefficients */
-template<typename _Scalar,typename _Index>
-class AmbiVector<_Scalar,_Index>::Iterator
-{
-  public:
-    typedef _Scalar Scalar;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-
-    /** Default constructor
-      * \param vec the vector on which we iterate
-      * \param epsilon the minimal value used to prune zero coefficients.
-      * In practice, all coefficients having a magnitude smaller than \a epsilon
-      * are skipped.
-      */
-    Iterator(const AmbiVector& vec, const RealScalar& epsilon = 0)
-      : m_vector(vec)
-    {
-      using std::abs;
-      m_epsilon = epsilon;
-      m_isDense = m_vector.m_mode==IsDense;
-      if (m_isDense)
-      {
-        m_currentEl = 0;   // this is to avoid a compilation warning
-        m_cachedValue = 0; // this is to avoid a compilation warning
-        m_cachedIndex = m_vector.m_start-1;
-        ++(*this);
-      }
-      else
-      {
-        ListEl* EIGEN_RESTRICT llElements = reinterpret_cast<ListEl*>(m_vector.m_buffer);
-        m_currentEl = m_vector.m_llStart;
-        while (m_currentEl>=0 && abs(llElements[m_currentEl].value)<=m_epsilon)
-          m_currentEl = llElements[m_currentEl].next;
-        if (m_currentEl<0)
-        {
-          m_cachedValue = 0; // this is to avoid a compilation warning
-          m_cachedIndex = -1;
-        }
-        else
-        {
-          m_cachedIndex = llElements[m_currentEl].index;
-          m_cachedValue = llElements[m_currentEl].value;
-        }
+template <typename Scalar_, typename StorageIndex_>
+class AmbiVector<Scalar_, StorageIndex_>::Iterator {
+ public:
+  typedef Scalar_ Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  /** Default constructor
+   * \param vec the vector on which we iterate
+   * \param epsilon the minimal value used to prune zero coefficients.
+   * In practice, all coefficients having a magnitude smaller than \a epsilon
+   * are skipped.
+   */
+  explicit Iterator(const AmbiVector& vec, const RealScalar& epsilon = 0) : m_vector(vec) {
+    using std::abs;
+    m_epsilon = epsilon;
+    m_isDense = m_vector.m_mode == IsDense;
+    if (m_isDense) {
+      m_currentEl = 0;    // this is to avoid a compilation warning
+      m_cachedValue = 0;  // this is to avoid a compilation warning
+      m_cachedIndex = m_vector.m_start - 1;
+      ++(*this);
+    } else {
+      ListEl* EIGEN_RESTRICT llElements = reinterpret_cast<ListEl*>(m_vector.m_buffer);
+      m_currentEl = m_vector.m_llStart;
+      while (m_currentEl >= 0 && abs(llElements[m_currentEl].value) <= m_epsilon)
+        m_currentEl = llElements[m_currentEl].next;
+      if (m_currentEl < 0) {
+        m_cachedValue = 0;  // this is to avoid a compilation warning
+        m_cachedIndex = -1;
+      } else {
+        m_cachedIndex = llElements[m_currentEl].index;
+        m_cachedValue = llElements[m_currentEl].value;
       }
     }
+  }
 
-    Index index() const { return m_cachedIndex; }
-    Scalar value() const { return m_cachedValue; }
-
-    operator bool() const { return m_cachedIndex>=0; }
-
-    Iterator& operator++()
-    {
-      using std::abs;
-      if (m_isDense)
-      {
-        do {
-          ++m_cachedIndex;
-        } while (m_cachedIndex<m_vector.m_end && abs(m_vector.m_buffer[m_cachedIndex])<m_epsilon);
-        if (m_cachedIndex<m_vector.m_end)
-          m_cachedValue = m_vector.m_buffer[m_cachedIndex];
-        else
-          m_cachedIndex=-1;
-      }
+  StorageIndex index() const { return m_cachedIndex; }
+  Scalar value() const { return m_cachedValue; }
+
+  operator bool() const { return m_cachedIndex >= 0; }
+
+  Iterator& operator++() {
+    using std::abs;
+    if (m_isDense) {
+      do {
+        ++m_cachedIndex;
+      } while (m_cachedIndex < m_vector.m_end && abs(m_vector.m_buffer[m_cachedIndex]) <= m_epsilon);
+      if (m_cachedIndex < m_vector.m_end)
+        m_cachedValue = m_vector.m_buffer[m_cachedIndex];
       else
-      {
-        ListEl* EIGEN_RESTRICT llElements = reinterpret_cast<ListEl*>(m_vector.m_buffer);
-        do {
-          m_currentEl = llElements[m_currentEl].next;
-        } while (m_currentEl>=0 && abs(llElements[m_currentEl].value)<m_epsilon);
-        if (m_currentEl<0)
-        {
-          m_cachedIndex = -1;
-        }
-        else
-        {
-          m_cachedIndex = llElements[m_currentEl].index;
-          m_cachedValue = llElements[m_currentEl].value;
-        }
+        m_cachedIndex = -1;
+    } else {
+      ListEl* EIGEN_RESTRICT llElements = reinterpret_cast<ListEl*>(m_vector.m_buffer);
+      do {
+        m_currentEl = llElements[m_currentEl].next;
+      } while (m_currentEl >= 0 && abs(llElements[m_currentEl].value) <= m_epsilon);
+      if (m_currentEl < 0) {
+        m_cachedIndex = -1;
+      } else {
+        m_cachedIndex = llElements[m_currentEl].index;
+        m_cachedValue = llElements[m_currentEl].value;
       }
-      return *this;
     }
+    return *this;
+  }
 
-  protected:
-    const AmbiVector& m_vector; // the target vector
-    Index m_currentEl;            // the current element in sparse/linked-list mode
-    RealScalar m_epsilon;       // epsilon used to prune zero coefficients
-    Index m_cachedIndex;          // current coordinate
-    Scalar m_cachedValue;       // current value
-    bool m_isDense;             // mode of the vector
+ protected:
+  const AmbiVector& m_vector;  // the target vector
+  StorageIndex m_currentEl;    // the current element in sparse/linked-list mode
+  RealScalar m_epsilon;        // epsilon used to prune zero coefficients
+  StorageIndex m_cachedIndex;  // current coordinate
+  Scalar m_cachedValue;        // current value
+  bool m_isDense;              // mode of the vector
 };
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_AMBIVECTOR_H
+#endif  // EIGEN_AMBIVECTOR_H
diff --git a/inst/include/Eigen/src/SparseCore/CompressedStorage.h b/inst/include/Eigen/src/SparseCore/CompressedStorage.h
index a667cb56..8f8a6963 100644
--- a/inst/include/Eigen/src/SparseCore/CompressedStorage.h
+++ b/inst/include/Eigen/src/SparseCore/CompressedStorage.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,224 +10,197 @@
 #ifndef EIGEN_COMPRESSED_STORAGE_H
 #define EIGEN_COMPRESSED_STORAGE_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 /** \internal
-  * Stores a sparse set of values as a list of values and a list of indices.
-  *
-  */
-template<typename _Scalar,typename _Index>
-class CompressedStorage
-{
-  public:
-
-    typedef _Scalar Scalar;
-    typedef _Index Index;
-
-  protected:
-
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-
-  public:
-
-    CompressedStorage()
-      : m_values(0), m_indices(0), m_size(0), m_allocatedSize(0)
-    {}
-
-    CompressedStorage(size_t size)
-      : m_values(0), m_indices(0), m_size(0), m_allocatedSize(0)
-    {
-      resize(size);
-    }
-
-    CompressedStorage(const CompressedStorage& other)
-      : m_values(0), m_indices(0), m_size(0), m_allocatedSize(0)
-    {
-      *this = other;
-    }
-
-    CompressedStorage& operator=(const CompressedStorage& other)
-    {
-      resize(other.size());
-      internal::smart_copy(other.m_values,  other.m_values  + m_size, m_values);
+ * Stores a sparse set of values as a list of values and a list of indices.
+ *
+ */
+template <typename Scalar_, typename StorageIndex_>
+class CompressedStorage {
+ public:
+  typedef Scalar_ Scalar;
+  typedef StorageIndex_ StorageIndex;
+
+ protected:
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+ public:
+  CompressedStorage() : m_values(0), m_indices(0), m_size(0), m_allocatedSize(0) {}
+
+  explicit CompressedStorage(Index size) : m_values(0), m_indices(0), m_size(0), m_allocatedSize(0) { resize(size); }
+
+  CompressedStorage(const CompressedStorage& other) : m_values(0), m_indices(0), m_size(0), m_allocatedSize(0) {
+    *this = other;
+  }
+
+  CompressedStorage& operator=(const CompressedStorage& other) {
+    resize(other.size());
+    if (other.size() > 0) {
+      internal::smart_copy(other.m_values, other.m_values + m_size, m_values);
       internal::smart_copy(other.m_indices, other.m_indices + m_size, m_indices);
-      return *this;
-    }
-
-    void swap(CompressedStorage& other)
-    {
-      std::swap(m_values, other.m_values);
-      std::swap(m_indices, other.m_indices);
-      std::swap(m_size, other.m_size);
-      std::swap(m_allocatedSize, other.m_allocatedSize);
-    }
-
-    ~CompressedStorage()
-    {
-      delete[] m_values;
-      delete[] m_indices;
-    }
-
-    void reserve(size_t size)
-    {
-      size_t newAllocatedSize = m_size + size;
-      if (newAllocatedSize > m_allocatedSize)
-        reallocate(newAllocatedSize);
-    }
-
-    void squeeze()
-    {
-      if (m_allocatedSize>m_size)
-        reallocate(m_size);
-    }
-
-    void resize(size_t size, double reserveSizeFactor = 0)
-    {
-      if (m_allocatedSize<size)
-        reallocate(size + size_t(reserveSizeFactor*double(size)));
-      m_size = size;
-    }
-
-    void append(const Scalar& v, Index i)
-    {
-      Index id = static_cast<Index>(m_size);
-      resize(m_size+1, 1);
-      m_values[id] = v;
-      m_indices[id] = i;
     }
-
-    inline size_t size() const { return m_size; }
-    inline size_t allocatedSize() const { return m_allocatedSize; }
-    inline void clear() { m_size = 0; }
-
-    inline Scalar& value(size_t i) { return m_values[i]; }
-    inline const Scalar& value(size_t i) const { return m_values[i]; }
-
-    inline Index& index(size_t i) { return m_indices[i]; }
-    inline const Index& index(size_t i) const { return m_indices[i]; }
-
-    static CompressedStorage Map(Index* indices, Scalar* values, size_t size)
-    {
-      CompressedStorage res;
-      res.m_indices = indices;
-      res.m_values = values;
-      res.m_allocatedSize = res.m_size = size;
-      return res;
-    }
-
-    /** \returns the largest \c k such that for all \c j in [0,k) index[\c j]\<\a key */
-    inline Index searchLowerIndex(Index key) const
-    {
-      return searchLowerIndex(0, m_size, key);
-    }
-
-    /** \returns the largest \c k in [start,end) such that for all \c j in [start,k) index[\c j]\<\a key */
-    inline Index searchLowerIndex(size_t start, size_t end, Index key) const
-    {
-      while(end>start)
-      {
-        size_t mid = (end+start)>>1;
-        if (m_indices[mid]<key)
-          start = mid+1;
-        else
-          end = mid;
-      }
-      return static_cast<Index>(start);
-    }
-
-    /** \returns the stored value at index \a key
-      * If the value does not exist, then the value \a defaultValue is returned without any insertion. */
-    inline Scalar at(Index key, const Scalar& defaultValue = Scalar(0)) const
-    {
-      if (m_size==0)
-        return defaultValue;
-      else if (key==m_indices[m_size-1])
-        return m_values[m_size-1];
-      // ^^  optimization: let's first check if it is the last coefficient
-      // (very common in high level algorithms)
-      const size_t id = searchLowerIndex(0,m_size-1,key);
-      return ((id<m_size) && (m_indices[id]==key)) ? m_values[id] : defaultValue;
-    }
-
-    /** Like at(), but the search is performed in the range [start,end) */
-    inline Scalar atInRange(size_t start, size_t end, Index key, const Scalar& defaultValue = Scalar(0)) const
-    {
-      if (start>=end)
-        return Scalar(0);
-      else if (end>start && key==m_indices[end-1])
-        return m_values[end-1];
-      // ^^  optimization: let's first check if it is the last coefficient
-      // (very common in high level algorithms)
-      const size_t id = searchLowerIndex(start,end-1,key);
-      return ((id<end) && (m_indices[id]==key)) ? m_values[id] : defaultValue;
-    }
-
-    /** \returns a reference to the value at index \a key
-      * If the value does not exist, then the value \a defaultValue is inserted
-      * such that the keys are sorted. */
-    inline Scalar& atWithInsertion(Index key, const Scalar& defaultValue = Scalar(0))
-    {
-      size_t id = searchLowerIndex(0,m_size,key);
-      if (id>=m_size || m_indices[id]!=key)
-      {
-        resize(m_size+1,1);
-        for (size_t j=m_size-1; j>id; --j)
-        {
-          m_indices[j] = m_indices[j-1];
-          m_values[j] = m_values[j-1];
-        }
-        m_indices[id] = key;
-        m_values[id] = defaultValue;
+    return *this;
+  }
+
+  void swap(CompressedStorage& other) {
+    std::swap(m_values, other.m_values);
+    std::swap(m_indices, other.m_indices);
+    std::swap(m_size, other.m_size);
+    std::swap(m_allocatedSize, other.m_allocatedSize);
+  }
+
+  ~CompressedStorage() {
+    conditional_aligned_delete_auto<Scalar, true>(m_values, m_allocatedSize);
+    conditional_aligned_delete_auto<StorageIndex, true>(m_indices, m_allocatedSize);
+  }
+
+  void reserve(Index size) {
+    Index newAllocatedSize = m_size + size;
+    if (newAllocatedSize > m_allocatedSize) reallocate(newAllocatedSize);
+  }
+
+  void squeeze() {
+    if (m_allocatedSize > m_size) reallocate(m_size);
+  }
+
+  void resize(Index size, double reserveSizeFactor = 0) {
+    if (m_allocatedSize < size) {
+      // Avoid underflow on the std::min<Index> call by choosing the smaller index type.
+      using SmallerIndexType =
+          typename std::conditional<static_cast<size_t>((std::numeric_limits<Index>::max)()) <
+                                        static_cast<size_t>((std::numeric_limits<StorageIndex>::max)()),
+                                    Index, StorageIndex>::type;
+      Index realloc_size =
+          (std::min<Index>)(NumTraits<SmallerIndexType>::highest(), size + Index(reserveSizeFactor * double(size)));
+      if (realloc_size < size) internal::throw_std_bad_alloc();
+      reallocate(realloc_size);
+    }
+    m_size = size;
+  }
+
+  void append(const Scalar& v, Index i) {
+    Index id = m_size;
+    resize(m_size + 1, 1);
+    m_values[id] = v;
+    m_indices[id] = internal::convert_index<StorageIndex>(i);
+  }
+
+  inline Index size() const { return m_size; }
+  inline Index allocatedSize() const { return m_allocatedSize; }
+  inline void clear() { m_size = 0; }
+
+  const Scalar* valuePtr() const { return m_values; }
+  Scalar* valuePtr() { return m_values; }
+  const StorageIndex* indexPtr() const { return m_indices; }
+  StorageIndex* indexPtr() { return m_indices; }
+
+  inline Scalar& value(Index i) {
+    eigen_internal_assert(m_values != 0);
+    return m_values[i];
+  }
+  inline const Scalar& value(Index i) const {
+    eigen_internal_assert(m_values != 0);
+    return m_values[i];
+  }
+
+  inline StorageIndex& index(Index i) {
+    eigen_internal_assert(m_indices != 0);
+    return m_indices[i];
+  }
+  inline const StorageIndex& index(Index i) const {
+    eigen_internal_assert(m_indices != 0);
+    return m_indices[i];
+  }
+
+  /** \returns the largest \c k such that for all \c j in [0,k) index[\c j]\<\a key */
+  inline Index searchLowerIndex(Index key) const { return searchLowerIndex(0, m_size, key); }
+
+  /** \returns the largest \c k in [start,end) such that for all \c j in [start,k) index[\c j]\<\a key */
+  inline Index searchLowerIndex(Index start, Index end, Index key) const {
+    return static_cast<Index>(std::distance(m_indices, std::lower_bound(m_indices + start, m_indices + end, key)));
+  }
+
+  /** \returns the stored value at index \a key
+   * If the value does not exist, then the value \a defaultValue is returned without any insertion. */
+  inline Scalar at(Index key, const Scalar& defaultValue = Scalar(0)) const {
+    if (m_size == 0)
+      return defaultValue;
+    else if (key == m_indices[m_size - 1])
+      return m_values[m_size - 1];
+    // ^^  optimization: let's first check if it is the last coefficient
+    // (very common in high level algorithms)
+    const Index id = searchLowerIndex(0, m_size - 1, key);
+    return ((id < m_size) && (m_indices[id] == key)) ? m_values[id] : defaultValue;
+  }
+
+  /** Like at(), but the search is performed in the range [start,end) */
+  inline Scalar atInRange(Index start, Index end, Index key, const Scalar& defaultValue = Scalar(0)) const {
+    if (start >= end)
+      return defaultValue;
+    else if (end > start && key == m_indices[end - 1])
+      return m_values[end - 1];
+    // ^^  optimization: let's first check if it is the last coefficient
+    // (very common in high level algorithms)
+    const Index id = searchLowerIndex(start, end - 1, key);
+    return ((id < end) && (m_indices[id] == key)) ? m_values[id] : defaultValue;
+  }
+
+  /** \returns a reference to the value at index \a key
+   * If the value does not exist, then the value \a defaultValue is inserted
+   * such that the keys are sorted. */
+  inline Scalar& atWithInsertion(Index key, const Scalar& defaultValue = Scalar(0)) {
+    Index id = searchLowerIndex(0, m_size, key);
+    if (id >= m_size || m_indices[id] != key) {
+      if (m_allocatedSize < m_size + 1) {
+        Index newAllocatedSize = 2 * (m_size + 1);
+        m_values = conditional_aligned_realloc_new_auto<Scalar, true>(m_values, newAllocatedSize, m_allocatedSize);
+        m_indices =
+            conditional_aligned_realloc_new_auto<StorageIndex, true>(m_indices, newAllocatedSize, m_allocatedSize);
+        m_allocatedSize = newAllocatedSize;
       }
-      return m_values[id];
-    }
-
-    void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
-    {
-      size_t k = 0;
-      size_t n = size();
-      for (size_t i=0; i<n; ++i)
-      {
-        if (!internal::isMuchSmallerThan(value(i), reference, epsilon))
-        {
-          value(k) = value(i);
-          index(k) = index(i);
-          ++k;
-        }
+      if (m_size > id) {
+        internal::smart_memmove(m_values + id, m_values + m_size, m_values + id + 1);
+        internal::smart_memmove(m_indices + id, m_indices + m_size, m_indices + id + 1);
       }
-      resize(k,0);
-    }
-
-  protected:
-
-    inline void reallocate(size_t size)
-    {
-      Scalar* newValues  = new Scalar[size];
-      Index* newIndices = new Index[size];
-      size_t copySize = (std::min)(size, m_size);
-      // copy
-      internal::smart_copy(m_values, m_values+copySize, newValues);
-      internal::smart_copy(m_indices, m_indices+copySize, newIndices);
-      // delete old stuff
-      delete[] m_values;
-      delete[] m_indices;
-      m_values = newValues;
-      m_indices = newIndices;
-      m_allocatedSize = size;
-    }
-
-  protected:
-    Scalar* m_values;
-    Index* m_indices;
-    size_t m_size;
-    size_t m_allocatedSize;
-
+      m_size++;
+      m_indices[id] = internal::convert_index<StorageIndex>(key);
+      m_values[id] = defaultValue;
+    }
+    return m_values[id];
+  }
+
+  inline void moveChunk(Index from, Index to, Index chunkSize) {
+    eigen_internal_assert(chunkSize >= 0 && to + chunkSize <= m_size);
+    internal::smart_memmove(m_values + from, m_values + from + chunkSize, m_values + to);
+    internal::smart_memmove(m_indices + from, m_indices + from + chunkSize, m_indices + to);
+  }
+
+ protected:
+  inline void reallocate(Index size) {
+#ifdef EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN
+    EIGEN_SPARSE_COMPRESSED_STORAGE_REALLOCATE_PLUGIN
+#endif
+    eigen_internal_assert(size != m_allocatedSize);
+    m_values = conditional_aligned_realloc_new_auto<Scalar, true>(m_values, size, m_allocatedSize);
+    m_indices = conditional_aligned_realloc_new_auto<StorageIndex, true>(m_indices, size, m_allocatedSize);
+    m_allocatedSize = size;
+  }
+
+ protected:
+  Scalar* m_values;
+  StorageIndex* m_indices;
+  Index m_size;
+  Index m_allocatedSize;
 };
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_COMPRESSED_STORAGE_H
+#endif  // EIGEN_COMPRESSED_STORAGE_H
diff --git a/inst/include/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/inst/include/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
index 5c320e2d..3c6e797b 100644
--- a/inst/include/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
+++ b/inst/include/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,24 +10,33 @@
 #ifndef EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H
 #define EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
-template<typename Lhs, typename Rhs, typename ResultType>
-static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res)
-{
-  typedef typename remove_all<Lhs>::type::Scalar Scalar;
-  typedef typename remove_all<Lhs>::type::Index Index;
+template <typename Lhs, typename Rhs, typename ResultType>
+static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res,
+                                                    bool sortedInsertion = false) {
+  typedef typename remove_all_t<Lhs>::Scalar LhsScalar;
+  typedef typename remove_all_t<Rhs>::Scalar RhsScalar;
+  typedef typename remove_all_t<ResultType>::Scalar ResScalar;
 
   // make sure to call innerSize/outerSize since we fake the storage order.
   Index rows = lhs.innerSize();
   Index cols = rhs.outerSize();
   eigen_assert(lhs.outerSize() == rhs.innerSize());
 
-  std::vector<bool> mask(rows,false);
-  Matrix<Scalar,Dynamic,1> values(rows);
-  Matrix<Index,Dynamic,1>  indices(rows);
+  ei_declare_aligned_stack_constructed_variable(bool, mask, rows, 0);
+  ei_declare_aligned_stack_constructed_variable(ResScalar, values, rows, 0);
+  ei_declare_aligned_stack_constructed_variable(Index, indices, rows, 0);
+
+  std::memset(mask, 0, sizeof(bool) * rows);
+
+  evaluator<Lhs> lhsEval(lhs);
+  evaluator<Rhs> rhsEval(rhs);
 
   // estimate the number of non zero entries
   // given a rhs column containing Y non zeros, we assume that the respective Y columns
@@ -35,211 +44,265 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r
   // the product of a rhs column with the lhs is X+Y where X is the average number of non zero
   // per column of the lhs.
   // Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs)
-  Index estimated_nnz_prod = lhs.nonZeros() + rhs.nonZeros();
+  Index estimated_nnz_prod = lhsEval.nonZerosEstimate() + rhsEval.nonZerosEstimate();
 
   res.setZero();
   res.reserve(Index(estimated_nnz_prod));
   // we compute each column of the result, one after the other
-  for (Index j=0; j<cols; ++j)
-  {
-
+  for (Index j = 0; j < cols; ++j) {
     res.startVec(j);
     Index nnz = 0;
-    for (typename Rhs::InnerIterator rhsIt(rhs, j); rhsIt; ++rhsIt)
-    {
-      Scalar y = rhsIt.value();
+    for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt) {
+      RhsScalar y = rhsIt.value();
       Index k = rhsIt.index();
-      for (typename Lhs::InnerIterator lhsIt(lhs, k); lhsIt; ++lhsIt)
-      {
+      for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt) {
         Index i = lhsIt.index();
-        Scalar x = lhsIt.value();
-        if(!mask[i])
-        {
+        LhsScalar x = lhsIt.value();
+        if (!mask[i]) {
           mask[i] = true;
           values[i] = x * y;
           indices[nnz] = i;
           ++nnz;
-        }
-        else
+        } else
           values[i] += x * y;
       }
     }
-
-    // unordered insertion
-    for(Index k=0; k<nnz; ++k)
-    {
-      Index i = indices[k];
-      res.insertBackByOuterInnerUnordered(j,i) = values[i];
-      mask[i] = false;
-    }
-
-#if 0
-    // alternative ordered insertion code:
-
-    Index t200 = rows/(log2(200)*1.39);
-    Index t = (rows*100)/139;
-
-    // FIXME reserve nnz non zeros
-    // FIXME implement fast sort algorithms for very small nnz
-    // if the result is sparse enough => use a quick sort
-    // otherwise => loop through the entire vector
-    // In order to avoid to perform an expensive log2 when the
-    // result is clearly very sparse we use a linear bound up to 200.
-    //if((nnz<200 && nnz<t200) || nnz * log2(nnz) < t)
-    //res.startVec(j);
-    if(true)
-    {
-      if(nnz>1) std::sort(indices.data(),indices.data()+nnz);
-      for(Index k=0; k<nnz; ++k)
-      {
+    if (!sortedInsertion) {
+      // unordered insertion
+      for (Index k = 0; k < nnz; ++k) {
         Index i = indices[k];
-        res.insertBackByOuterInner(j,i) = values[i];
+        res.insertBackByOuterInnerUnordered(j, i) = values[i];
         mask[i] = false;
       }
-    }
-    else
-    {
-      // dense path
-      for(Index i=0; i<rows; ++i)
-      {
-        if(mask[i])
-        {
+    } else {
+      // alternative ordered insertion code:
+      const Index t200 = rows / 11;  // 11 == (log2(200)*1.39)
+      const Index t = (rows * 100) / 139;
+
+      // FIXME reserve nnz non zeros
+      // FIXME implement faster sorting algorithms for very small nnz
+      // if the result is sparse enough => use a quick sort
+      // otherwise => loop through the entire vector
+      // In order to avoid to perform an expensive log2 when the
+      // result is clearly very sparse we use a linear bound up to 200.
+      if ((nnz < 200 && nnz < t200) || nnz * numext::log2(int(nnz)) < t) {
+        if (nnz > 1) std::sort(indices, indices + nnz);
+        for (Index k = 0; k < nnz; ++k) {
+          Index i = indices[k];
+          res.insertBackByOuterInner(j, i) = values[i];
           mask[i] = false;
-          res.insertBackByOuterInner(j,i) = values[i];
+        }
+      } else {
+        // dense path
+        for (Index i = 0; i < rows; ++i) {
+          if (mask[i]) {
+            mask[i] = false;
+            res.insertBackByOuterInner(j, i) = values[i];
+          }
         }
       }
     }
-#endif
-
   }
   res.finalize();
 }
 
-
-} // end namespace internal
+}  // end namespace internal
 
 namespace internal {
 
-template<typename Lhs, typename Rhs, typename ResultType,
-  int LhsStorageOrder = (traits<Lhs>::Flags&RowMajorBit) ? RowMajor : ColMajor,
-  int RhsStorageOrder = (traits<Rhs>::Flags&RowMajorBit) ? RowMajor : ColMajor,
-  int ResStorageOrder = (traits<ResultType>::Flags&RowMajorBit) ? RowMajor : ColMajor>
+// Helper template to generate new sparse matrix types
+template <class Source, int Order>
+using WithStorageOrder = SparseMatrix<typename Source::Scalar, Order, typename Source::StorageIndex>;
+
+template <typename Lhs, typename Rhs, typename ResultType,
+          int LhsStorageOrder = (traits<Lhs>::Flags & RowMajorBit) ? RowMajor : ColMajor,
+          int RhsStorageOrder = (traits<Rhs>::Flags & RowMajorBit) ? RowMajor : ColMajor,
+          int ResStorageOrder = (traits<ResultType>::Flags & RowMajorBit) ? RowMajor : ColMajor>
 struct conservative_sparse_sparse_product_selector;
 
-template<typename Lhs, typename Rhs, typename ResultType>
-struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor,ColMajor>
-{
-  typedef typename remove_all<Lhs>::type LhsCleaned;
+template <typename Lhs, typename Rhs, typename ResultType>
+struct conservative_sparse_sparse_product_selector<Lhs, Rhs, ResultType, ColMajor, ColMajor, ColMajor> {
+  typedef remove_all_t<Lhs> LhsCleaned;
   typedef typename LhsCleaned::Scalar Scalar;
 
-  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
-  {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
-    ColMajorMatrix resCol(lhs.rows(),rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrix>(lhs, rhs, resCol);
-    // sort the non zeros:
-    RowMajorMatrix resRow(resCol);
-    res = resRow;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) {
+    using RowMajorMatrix = WithStorageOrder<ResultType, RowMajor>;
+    using ColMajorMatrixAux = WithStorageOrder<ResultType, ColMajor>;
+
+    // If the result is tall and thin (in the extreme case a column vector)
+    // then it is faster to sort the coefficients inplace instead of transposing twice.
+    // FIXME, the following heuristic is probably not very good.
+    if (lhs.rows() > rhs.cols()) {
+      using ColMajorMatrix = typename sparse_eval<ColMajorMatrixAux, ResultType::RowsAtCompileTime,
+                                                  ResultType::ColsAtCompileTime, ColMajorMatrixAux::Flags>::type;
+      ColMajorMatrix resCol(lhs.rows(), rhs.cols());
+      // perform sorted insertion
+      internal::conservative_sparse_sparse_product_impl<Lhs, Rhs, ColMajorMatrix>(lhs, rhs, resCol, true);
+      res = resCol.markAsRValue();
+    } else {
+      ColMajorMatrixAux resCol(lhs.rows(), rhs.cols());
+      // resort to transpose to sort the entries
+      internal::conservative_sparse_sparse_product_impl<Lhs, Rhs, ColMajorMatrixAux>(lhs, rhs, resCol, false);
+      RowMajorMatrix resRow(resCol);
+      res = resRow.markAsRValue();
+    }
   }
 };
 
-template<typename Lhs, typename Rhs, typename ResultType>
-struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,ColMajor,ColMajor>
-{
-  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
-  {
-     typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
-     RowMajorMatrix rhsRow = rhs;
-     RowMajorMatrix resRow(lhs.rows(), rhs.cols());
-     internal::conservative_sparse_sparse_product_impl<RowMajorMatrix,Lhs,RowMajorMatrix>(rhsRow, lhs, resRow);
-     res = resRow;
+template <typename Lhs, typename Rhs, typename ResultType>
+struct conservative_sparse_sparse_product_selector<Lhs, Rhs, ResultType, RowMajor, ColMajor, ColMajor> {
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) {
+    using RowMajorRhs = WithStorageOrder<Rhs, RowMajor>;
+    using RowMajorRes = WithStorageOrder<ResultType, RowMajor>;
+    RowMajorRhs rhsRow = rhs;
+    RowMajorRes resRow(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<RowMajorRhs, Lhs, RowMajorRes>(rhsRow, lhs, resRow);
+    res = resRow;
   }
 };
 
-template<typename Lhs, typename Rhs, typename ResultType>
-struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,RowMajor,ColMajor>
-{
-  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
-  {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
-    RowMajorMatrix lhsRow = lhs;
-    RowMajorMatrix resRow(lhs.rows(), rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<Rhs,RowMajorMatrix,RowMajorMatrix>(rhs, lhsRow, resRow);
+template <typename Lhs, typename Rhs, typename ResultType>
+struct conservative_sparse_sparse_product_selector<Lhs, Rhs, ResultType, ColMajor, RowMajor, ColMajor> {
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) {
+    using RowMajorLhs = WithStorageOrder<Lhs, RowMajor>;
+    using RowMajorRes = WithStorageOrder<ResultType, RowMajor>;
+    RowMajorLhs lhsRow = lhs;
+    RowMajorRes resRow(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<Rhs, RowMajorLhs, RowMajorRes>(rhs, lhsRow, resRow);
     res = resRow;
   }
 };
 
-template<typename Lhs, typename Rhs, typename ResultType>
-struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,RowMajor,ColMajor>
-{
-  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
-  {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
-    RowMajorMatrix resRow(lhs.rows(), rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<Rhs,Lhs,RowMajorMatrix>(rhs, lhs, resRow);
+template <typename Lhs, typename Rhs, typename ResultType>
+struct conservative_sparse_sparse_product_selector<Lhs, Rhs, ResultType, RowMajor, RowMajor, ColMajor> {
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) {
+    using RowMajorRes = WithStorageOrder<ResultType, RowMajor>;
+    RowMajorRes resRow(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<Rhs, Lhs, RowMajorRes>(rhs, lhs, resRow);
     res = resRow;
   }
 };
 
+template <typename Lhs, typename Rhs, typename ResultType>
+struct conservative_sparse_sparse_product_selector<Lhs, Rhs, ResultType, ColMajor, ColMajor, RowMajor> {
+  typedef typename traits<remove_all_t<Lhs>>::Scalar Scalar;
 
-template<typename Lhs, typename Rhs, typename ResultType>
-struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor,RowMajor>
-{
-  typedef typename traits<typename remove_all<Lhs>::type>::Scalar Scalar;
-
-  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
-  {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
-    ColMajorMatrix resCol(lhs.rows(), rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<Lhs,Rhs,ColMajorMatrix>(lhs, rhs, resCol);
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) {
+    using ColMajorRes = WithStorageOrder<ResultType, ColMajor>;
+    ColMajorRes resCol(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<Lhs, Rhs, ColMajorRes>(lhs, rhs, resCol);
     res = resCol;
   }
 };
 
-template<typename Lhs, typename Rhs, typename ResultType>
-struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,ColMajor,RowMajor>
-{
-  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
-  {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
-    ColMajorMatrix lhsCol = lhs;
-    ColMajorMatrix resCol(lhs.rows(), rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<ColMajorMatrix,Rhs,ColMajorMatrix>(lhsCol, rhs, resCol);
+template <typename Lhs, typename Rhs, typename ResultType>
+struct conservative_sparse_sparse_product_selector<Lhs, Rhs, ResultType, RowMajor, ColMajor, RowMajor> {
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) {
+    using ColMajorLhs = WithStorageOrder<Lhs, ColMajor>;
+    using ColMajorRes = WithStorageOrder<ResultType, ColMajor>;
+    ColMajorLhs lhsCol = lhs;
+    ColMajorRes resCol(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<ColMajorLhs, Rhs, ColMajorRes>(lhsCol, rhs, resCol);
     res = resCol;
   }
 };
 
-template<typename Lhs, typename Rhs, typename ResultType>
-struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,ColMajor,RowMajor,RowMajor>
-{
-  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
-  {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
-    ColMajorMatrix rhsCol = rhs;
-    ColMajorMatrix resCol(lhs.rows(), rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<Lhs,ColMajorMatrix,ColMajorMatrix>(lhs, rhsCol, resCol);
+template <typename Lhs, typename Rhs, typename ResultType>
+struct conservative_sparse_sparse_product_selector<Lhs, Rhs, ResultType, ColMajor, RowMajor, RowMajor> {
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) {
+    using ColMajorRhs = WithStorageOrder<Rhs, ColMajor>;
+    using ColMajorRes = WithStorageOrder<ResultType, ColMajor>;
+    ColMajorRhs rhsCol = rhs;
+    ColMajorRes resCol(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<Lhs, ColMajorRhs, ColMajorRes>(lhs, rhsCol, resCol);
     res = resCol;
   }
 };
 
-template<typename Lhs, typename Rhs, typename ResultType>
-struct conservative_sparse_sparse_product_selector<Lhs,Rhs,ResultType,RowMajor,RowMajor,RowMajor>
-{
-  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res)
-  {
-    typedef SparseMatrix<typename ResultType::Scalar,RowMajor,typename ResultType::Index> RowMajorMatrix;
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> ColMajorMatrix;
-    RowMajorMatrix resRow(lhs.rows(),rhs.cols());
-    internal::conservative_sparse_sparse_product_impl<Rhs,Lhs,RowMajorMatrix>(rhs, lhs, resRow);
+template <typename Lhs, typename Rhs, typename ResultType>
+struct conservative_sparse_sparse_product_selector<Lhs, Rhs, ResultType, RowMajor, RowMajor, RowMajor> {
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) {
+    using ColMajorRes = WithStorageOrder<ResultType, ColMajor>;
+    using RowMajorRes = WithStorageOrder<ResultType, RowMajor>;
+    RowMajorRes resRow(lhs.rows(), rhs.cols());
+    internal::conservative_sparse_sparse_product_impl<Rhs, Lhs, RowMajorRes>(rhs, lhs, resRow);
     // sort the non zeros:
-    ColMajorMatrix resCol(resRow);
+    ColMajorRes resCol(resRow);
     res = resCol;
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
+
+namespace internal {
+
+template <typename Lhs, typename Rhs, typename ResultType>
+static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res) {
+  typedef typename remove_all_t<Lhs>::Scalar LhsScalar;
+  typedef typename remove_all_t<Rhs>::Scalar RhsScalar;
+  Index cols = rhs.outerSize();
+  eigen_assert(lhs.outerSize() == rhs.innerSize());
+
+  evaluator<Lhs> lhsEval(lhs);
+  evaluator<Rhs> rhsEval(rhs);
+
+  for (Index j = 0; j < cols; ++j) {
+    for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt) {
+      RhsScalar y = rhsIt.value();
+      Index k = rhsIt.index();
+      for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt) {
+        Index i = lhsIt.index();
+        LhsScalar x = lhsIt.value();
+        res.coeffRef(i, j) += x * y;
+      }
+    }
+  }
+}
+
+}  // end namespace internal
+
+namespace internal {
+
+template <typename Lhs, typename Rhs, typename ResultType,
+          int LhsStorageOrder = (traits<Lhs>::Flags & RowMajorBit) ? RowMajor : ColMajor,
+          int RhsStorageOrder = (traits<Rhs>::Flags & RowMajorBit) ? RowMajor : ColMajor>
+struct sparse_sparse_to_dense_product_selector;
+
+template <typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_to_dense_product_selector<Lhs, Rhs, ResultType, ColMajor, ColMajor> {
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) {
+    internal::sparse_sparse_to_dense_product_impl<Lhs, Rhs, ResultType>(lhs, rhs, res);
+  }
+};
+
+template <typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_to_dense_product_selector<Lhs, Rhs, ResultType, RowMajor, ColMajor> {
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) {
+    using ColMajorLhs = WithStorageOrder<Lhs, ColMajor>;
+    ColMajorLhs lhsCol(lhs);
+    internal::sparse_sparse_to_dense_product_impl<ColMajorLhs, Rhs, ResultType>(lhsCol, rhs, res);
+  }
+};
+
+template <typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_to_dense_product_selector<Lhs, Rhs, ResultType, ColMajor, RowMajor> {
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) {
+    using ColMajorRhs = WithStorageOrder<Rhs, ColMajor>;
+    ColMajorRhs rhsCol(rhs);
+    internal::sparse_sparse_to_dense_product_impl<Lhs, ColMajorRhs, ResultType>(lhs, rhsCol, res);
+  }
+};
+
+template <typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_to_dense_product_selector<Lhs, Rhs, ResultType, RowMajor, RowMajor> {
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res) {
+    Transpose<ResultType> trRes(res);
+    internal::sparse_sparse_to_dense_product_impl<Rhs, Lhs, Transpose<ResultType>>(rhs, lhs, trRes);
+  }
+};
+
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H
+#endif  // EIGEN_CONSERVATIVESPARSESPARSEPRODUCT_H
diff --git a/inst/include/Eigen/src/SparseCore/InternalHeaderCheck.h b/inst/include/Eigen/src/SparseCore/InternalHeaderCheck.h
new file mode 100644
index 00000000..9de59365
--- /dev/null
+++ b/inst/include/Eigen/src/SparseCore/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_SPARSECORE_MODULE_H
+#error "Please include Eigen/SparseCore instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/SparseCore/MappedSparseMatrix.h b/inst/include/Eigen/src/SparseCore/MappedSparseMatrix.h
deleted file mode 100644
index ab1a266a..00000000
--- a/inst/include/Eigen/src/SparseCore/MappedSparseMatrix.h
+++ /dev/null
@@ -1,181 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_MAPPED_SPARSEMATRIX_H
-#define EIGEN_MAPPED_SPARSEMATRIX_H
-
-namespace Eigen { 
-
-/** \class MappedSparseMatrix
-  *
-  * \brief Sparse matrix
-  *
-  * \param _Scalar the scalar type, i.e. the type of the coefficients
-  *
-  * See http://www.netlib.org/linalg/html_templates/node91.html for details on the storage scheme.
-  *
-  */
-namespace internal {
-template<typename _Scalar, int _Flags, typename _Index>
-struct traits<MappedSparseMatrix<_Scalar, _Flags, _Index> > : traits<SparseMatrix<_Scalar, _Flags, _Index> >
-{};
-}
-
-template<typename _Scalar, int _Flags, typename _Index>
-class MappedSparseMatrix
-  : public SparseMatrixBase<MappedSparseMatrix<_Scalar, _Flags, _Index> >
-{
-  public:
-    EIGEN_SPARSE_PUBLIC_INTERFACE(MappedSparseMatrix)
-    enum { IsRowMajor = Base::IsRowMajor };
-
-  protected:
-
-    Index   m_outerSize;
-    Index   m_innerSize;
-    Index   m_nnz;
-    Index*  m_outerIndex;
-    Index*  m_innerIndices;
-    Scalar* m_values;
-
-  public:
-
-    inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
-    inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; }
-    inline Index innerSize() const { return m_innerSize; }
-    inline Index outerSize() const { return m_outerSize; }
-    
-    bool isCompressed() const { return true; }
-
-    //----------------------------------------
-    // direct access interface
-    inline const Scalar* valuePtr() const { return m_values; }
-    inline Scalar* valuePtr() { return m_values; }
-
-    inline const Index* innerIndexPtr() const { return m_innerIndices; }
-    inline Index* innerIndexPtr() { return m_innerIndices; }
-
-    inline const Index* outerIndexPtr() const { return m_outerIndex; }
-    inline Index* outerIndexPtr() { return m_outerIndex; }
-    //----------------------------------------
-
-    inline Scalar coeff(Index row, Index col) const
-    {
-      const Index outer = IsRowMajor ? row : col;
-      const Index inner = IsRowMajor ? col : row;
-
-      Index start = m_outerIndex[outer];
-      Index end = m_outerIndex[outer+1];
-      if (start==end)
-        return Scalar(0);
-      else if (end>0 && inner==m_innerIndices[end-1])
-        return m_values[end-1];
-      // ^^  optimization: let's first check if it is the last coefficient
-      // (very common in high level algorithms)
-
-      const Index* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end-1],inner);
-      const Index id = r-&m_innerIndices[0];
-      return ((*r==inner) && (id<end)) ? m_values[id] : Scalar(0);
-    }
-
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      const Index outer = IsRowMajor ? row : col;
-      const Index inner = IsRowMajor ? col : row;
-
-      Index start = m_outerIndex[outer];
-      Index end = m_outerIndex[outer+1];
-      eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
-      eigen_assert(end>start && "coeffRef cannot be called on a zero coefficient");
-      Index* r = std::lower_bound(&m_innerIndices[start],&m_innerIndices[end],inner);
-      const Index id = r-&m_innerIndices[0];
-      eigen_assert((*r==inner) && (id<end) && "coeffRef cannot be called on a zero coefficient");
-      return m_values[id];
-    }
-
-    class InnerIterator;
-    class ReverseInnerIterator;
-
-    /** \returns the number of non zero coefficients */
-    inline Index nonZeros() const  { return m_nnz; }
-
-    inline MappedSparseMatrix(Index rows, Index cols, Index nnz, Index* outerIndexPtr, Index* innerIndexPtr, Scalar* valuePtr)
-      : m_outerSize(IsRowMajor?rows:cols), m_innerSize(IsRowMajor?cols:rows), m_nnz(nnz), m_outerIndex(outerIndexPtr),
-        m_innerIndices(innerIndexPtr), m_values(valuePtr)
-    {}
-
-    /** Empty destructor */
-    inline ~MappedSparseMatrix() {}
-};
-
-template<typename Scalar, int _Flags, typename _Index>
-class MappedSparseMatrix<Scalar,_Flags,_Index>::InnerIterator
-{
-  public:
-    InnerIterator(const MappedSparseMatrix& mat, Index outer)
-      : m_matrix(mat),
-        m_outer(outer),
-        m_id(mat.outerIndexPtr()[outer]),
-        m_start(m_id),
-        m_end(mat.outerIndexPtr()[outer+1])
-    {}
-
-    inline InnerIterator& operator++() { m_id++; return *this; }
-
-    inline Scalar value() const { return m_matrix.valuePtr()[m_id]; }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id]); }
-
-    inline Index index() const { return m_matrix.innerIndexPtr()[m_id]; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-    inline operator bool() const { return (m_id < m_end) && (m_id>=m_start); }
-
-  protected:
-    const MappedSparseMatrix& m_matrix;
-    const Index m_outer;
-    Index m_id;
-    const Index m_start;
-    const Index m_end;
-};
-
-template<typename Scalar, int _Flags, typename _Index>
-class MappedSparseMatrix<Scalar,_Flags,_Index>::ReverseInnerIterator
-{
-  public:
-    ReverseInnerIterator(const MappedSparseMatrix& mat, Index outer)
-      : m_matrix(mat),
-        m_outer(outer),
-        m_id(mat.outerIndexPtr()[outer+1]),
-        m_start(mat.outerIndexPtr()[outer]),
-        m_end(m_id)
-    {}
-
-    inline ReverseInnerIterator& operator--() { m_id--; return *this; }
-
-    inline Scalar value() const { return m_matrix.valuePtr()[m_id-1]; }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_id-1]); }
-
-    inline Index index() const { return m_matrix.innerIndexPtr()[m_id-1]; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-    inline operator bool() const { return (m_id <= m_end) && (m_id>m_start); }
-
-  protected:
-    const MappedSparseMatrix& m_matrix;
-    const Index m_outer;
-    Index m_id;
-    const Index m_start;
-    const Index m_end;
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_MAPPED_SPARSEMATRIX_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseAssign.h b/inst/include/Eigen/src/SparseCore/SparseAssign.h
new file mode 100644
index 00000000..f2da5193
--- /dev/null
+++ b/inst/include/Eigen/src/SparseCore/SparseAssign.h
@@ -0,0 +1,279 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSEASSIGN_H
+#define EIGEN_SPARSEASSIGN_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+template <typename Derived>
+template <typename OtherDerived>
+Derived &SparseMatrixBase<Derived>::operator=(const EigenBase<OtherDerived> &other) {
+  internal::call_assignment_no_alias(derived(), other.derived());
+  return derived();
+}
+
+template <typename Derived>
+template <typename OtherDerived>
+Derived &SparseMatrixBase<Derived>::operator=(const ReturnByValue<OtherDerived> &other) {
+  // TODO use the evaluator mechanism
+  other.evalTo(derived());
+  return derived();
+}
+
+template <typename Derived>
+template <typename OtherDerived>
+inline Derived &SparseMatrixBase<Derived>::operator=(const SparseMatrixBase<OtherDerived> &other) {
+  // by default sparse evaluation do not alias, so we can safely bypass the generic call_assignment routine
+  internal::Assignment<Derived, OtherDerived, internal::assign_op<Scalar, typename OtherDerived::Scalar>>::run(
+      derived(), other.derived(), internal::assign_op<Scalar, typename OtherDerived::Scalar>());
+  return derived();
+}
+
+template <typename Derived>
+inline Derived &SparseMatrixBase<Derived>::operator=(const Derived &other) {
+  internal::call_assignment_no_alias(derived(), other.derived());
+  return derived();
+}
+
+namespace internal {
+
+template <>
+struct storage_kind_to_evaluator_kind<Sparse> {
+  typedef IteratorBased Kind;
+};
+
+template <>
+struct storage_kind_to_shape<Sparse> {
+  typedef SparseShape Shape;
+};
+
+struct Sparse2Sparse {};
+struct Sparse2Dense {};
+
+template <>
+struct AssignmentKind<SparseShape, SparseShape> {
+  typedef Sparse2Sparse Kind;
+};
+template <>
+struct AssignmentKind<SparseShape, SparseTriangularShape> {
+  typedef Sparse2Sparse Kind;
+};
+template <>
+struct AssignmentKind<DenseShape, SparseShape> {
+  typedef Sparse2Dense Kind;
+};
+template <>
+struct AssignmentKind<DenseShape, SparseTriangularShape> {
+  typedef Sparse2Dense Kind;
+};
+
+template <typename DstXprType, typename SrcXprType>
+void assign_sparse_to_sparse(DstXprType &dst, const SrcXprType &src) {
+  typedef typename DstXprType::Scalar Scalar;
+  typedef internal::evaluator<DstXprType> DstEvaluatorType;
+  typedef internal::evaluator<SrcXprType> SrcEvaluatorType;
+
+  SrcEvaluatorType srcEvaluator(src);
+
+  constexpr bool transpose = (DstEvaluatorType::Flags & RowMajorBit) != (SrcEvaluatorType::Flags & RowMajorBit);
+  const Index outerEvaluationSize = (SrcEvaluatorType::Flags & RowMajorBit) ? src.rows() : src.cols();
+
+  Index reserveSize = 0;
+  for (Index j = 0; j < outerEvaluationSize; ++j)
+    for (typename SrcEvaluatorType::InnerIterator it(srcEvaluator, j); it; ++it) reserveSize++;
+
+  if ((!transpose) && src.isRValue()) {
+    // eval without temporary
+    dst.resize(src.rows(), src.cols());
+    dst.setZero();
+    dst.reserve(reserveSize);
+    for (Index j = 0; j < outerEvaluationSize; ++j) {
+      dst.startVec(j);
+      for (typename SrcEvaluatorType::InnerIterator it(srcEvaluator, j); it; ++it) {
+        Scalar v = it.value();
+        dst.insertBackByOuterInner(j, it.index()) = v;
+      }
+    }
+    dst.finalize();
+  } else {
+    // eval through a temporary
+    eigen_assert((((internal::traits<DstXprType>::SupportedAccessPatterns & OuterRandomAccessPattern) ==
+                   OuterRandomAccessPattern) ||
+                  (!((DstEvaluatorType::Flags & RowMajorBit) != (SrcEvaluatorType::Flags & RowMajorBit)))) &&
+                 "the transpose operation is supposed to be handled in SparseMatrix::operator=");
+
+    enum { Flip = (DstEvaluatorType::Flags & RowMajorBit) != (SrcEvaluatorType::Flags & RowMajorBit) };
+
+    DstXprType temp(src.rows(), src.cols());
+
+    temp.reserve(reserveSize);
+    for (Index j = 0; j < outerEvaluationSize; ++j) {
+      temp.startVec(j);
+      for (typename SrcEvaluatorType::InnerIterator it(srcEvaluator, j); it; ++it) {
+        Scalar v = it.value();
+        temp.insertBackByOuterInner(Flip ? it.index() : j, Flip ? j : it.index()) = v;
+      }
+    }
+    temp.finalize();
+
+    dst = temp.markAsRValue();
+  }
+}
+
+// Generic Sparse to Sparse assignment
+template <typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Sparse> {
+  static void run(DstXprType &dst, const SrcXprType &src,
+                  const internal::assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar> & /*func*/) {
+    assign_sparse_to_sparse(dst.derived(), src.derived());
+  }
+};
+
+// Generic Sparse to Dense assignment
+template <typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
+struct Assignment<DstXprType, SrcXprType, Functor, Sparse2Dense, Weak> {
+  static void run(DstXprType &dst, const SrcXprType &src, const Functor &func) {
+    if (internal::is_same<Functor,
+                          internal::assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>>::value)
+      dst.setZero();
+
+    internal::evaluator<SrcXprType> srcEval(src);
+    resize_if_allowed(dst, src, func);
+    internal::evaluator<DstXprType> dstEval(dst);
+
+    const Index outerEvaluationSize = (internal::evaluator<SrcXprType>::Flags & RowMajorBit) ? src.rows() : src.cols();
+    for (Index j = 0; j < outerEvaluationSize; ++j)
+      for (typename internal::evaluator<SrcXprType>::InnerIterator i(srcEval, j); i; ++i)
+        func.assignCoeff(dstEval.coeffRef(i.row(), i.col()), i.value());
+  }
+};
+
+// Specialization for dense ?= dense +/- sparse and dense ?= sparse +/- dense
+template <typename DstXprType, typename Func1, typename Func2>
+struct assignment_from_dense_op_sparse {
+  template <typename SrcXprType, typename InitialFunc>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src,
+                                                        const InitialFunc & /*func*/) {
+#ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN
+    EIGEN_SPARSE_ASSIGNMENT_FROM_DENSE_OP_SPARSE_PLUGIN
+#endif
+
+    call_assignment_no_alias(dst, src.lhs(), Func1());
+    call_assignment_no_alias(dst, src.rhs(), Func2());
+  }
+
+  // Specialization for dense1 = sparse + dense2; -> dense1 = dense2; dense1 += sparse;
+  template <typename Lhs, typename Rhs, typename Scalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      std::enable_if_t<internal::is_same<typename internal::evaluator_traits<Rhs>::Shape, DenseShape>::value>
+      run(DstXprType &dst, const CwiseBinaryOp<internal::scalar_sum_op<Scalar, Scalar>, const Lhs, const Rhs> &src,
+          const internal::assign_op<typename DstXprType::Scalar, Scalar> & /*func*/) {
+#ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN
+    EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_ADD_DENSE_PLUGIN
+#endif
+
+    // Apply the dense matrix first, then the sparse one.
+    call_assignment_no_alias(dst, src.rhs(), Func1());
+    call_assignment_no_alias(dst, src.lhs(), Func2());
+  }
+
+  // Specialization for dense1 = sparse - dense2; -> dense1 = -dense2; dense1 += sparse;
+  template <typename Lhs, typename Rhs, typename Scalar>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      std::enable_if_t<internal::is_same<typename internal::evaluator_traits<Rhs>::Shape, DenseShape>::value>
+      run(DstXprType &dst,
+          const CwiseBinaryOp<internal::scalar_difference_op<Scalar, Scalar>, const Lhs, const Rhs> &src,
+          const internal::assign_op<typename DstXprType::Scalar, Scalar> & /*func*/) {
+#ifdef EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN
+    EIGEN_SPARSE_ASSIGNMENT_FROM_SPARSE_SUB_DENSE_PLUGIN
+#endif
+
+    // Apply the dense matrix first, then the sparse one.
+    call_assignment_no_alias(dst, -src.rhs(), Func1());
+    call_assignment_no_alias(dst, src.lhs(), add_assign_op<typename DstXprType::Scalar, typename Lhs::Scalar>());
+  }
+};
+
+#define EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(ASSIGN_OP, BINOP, ASSIGN_OP2)                                        \
+  template <typename DstXprType, typename Lhs, typename Rhs, typename Scalar>                                   \
+  struct Assignment<                                                                                            \
+      DstXprType, CwiseBinaryOp<internal::BINOP<Scalar, Scalar>, const Lhs, const Rhs>,                         \
+      internal::ASSIGN_OP<typename DstXprType::Scalar, Scalar>, Sparse2Dense,                                   \
+      std::enable_if_t<internal::is_same<typename internal::evaluator_traits<Lhs>::Shape, DenseShape>::value || \
+                       internal::is_same<typename internal::evaluator_traits<Rhs>::Shape, DenseShape>::value>>  \
+      : assignment_from_dense_op_sparse<DstXprType,                                                             \
+                                        internal::ASSIGN_OP<typename DstXprType::Scalar, typename Lhs::Scalar>, \
+                                        internal::ASSIGN_OP2<typename DstXprType::Scalar, typename Rhs::Scalar>> {}
+
+EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(assign_op, scalar_sum_op, add_assign_op);
+EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(add_assign_op, scalar_sum_op, add_assign_op);
+EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(sub_assign_op, scalar_sum_op, sub_assign_op);
+
+EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(assign_op, scalar_difference_op, sub_assign_op);
+EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(add_assign_op, scalar_difference_op, sub_assign_op);
+EIGEN_CATCH_ASSIGN_DENSE_OP_SPARSE(sub_assign_op, scalar_difference_op, add_assign_op);
+
+// Specialization for "dst = dec.solve(rhs)"
+// NOTE we need to specialize it for Sparse2Sparse to avoid ambiguous specialization error
+template <typename DstXprType, typename DecType, typename RhsType, typename Scalar>
+struct Assignment<DstXprType, Solve<DecType, RhsType>, internal::assign_op<Scalar, Scalar>, Sparse2Sparse> {
+  typedef Solve<DecType, RhsType> SrcXprType;
+  static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar, Scalar> &) {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if ((dst.rows() != dstRows) || (dst.cols() != dstCols)) dst.resize(dstRows, dstCols);
+
+    src.dec()._solve_impl(src.rhs(), dst);
+  }
+};
+
+struct Diagonal2Sparse {};
+
+template <>
+struct AssignmentKind<SparseShape, DiagonalShape> {
+  typedef Diagonal2Sparse Kind;
+};
+
+template <typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, Diagonal2Sparse> {
+  typedef typename DstXprType::StorageIndex StorageIndex;
+  typedef typename DstXprType::Scalar Scalar;
+
+  template <int Options, typename AssignFunc>
+  static void run(SparseMatrix<Scalar, Options, StorageIndex> &dst, const SrcXprType &src, const AssignFunc &func) {
+    dst.assignDiagonal(src.diagonal(), func);
+  }
+
+  template <typename DstDerived>
+  static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src,
+                  const internal::assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar> & /*func*/) {
+    dst.derived().diagonal() = src.diagonal();
+  }
+
+  template <typename DstDerived>
+  static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src,
+                  const internal::add_assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar> & /*func*/) {
+    dst.derived().diagonal() += src.diagonal();
+  }
+
+  template <typename DstDerived>
+  static void run(SparseMatrixBase<DstDerived> &dst, const SrcXprType &src,
+                  const internal::sub_assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar> & /*func*/) {
+    dst.derived().diagonal() -= src.diagonal();
+  }
+};
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPARSEASSIGN_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseBlock.h b/inst/include/Eigen/src/SparseCore/SparseBlock.h
index 0c90bafb..1342f4e7 100644
--- a/inst/include/Eigen/src/SparseCore/SparseBlock.h
+++ b/inst/include/Eigen/src/SparseCore/SparseBlock.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,528 +10,525 @@
 #ifndef EIGEN_SPARSE_BLOCK_H
 #define EIGEN_SPARSE_BLOCK_H
 
-namespace Eigen { 
-
-template<typename XprType, int BlockRows, int BlockCols>
-class BlockImpl<XprType,BlockRows,BlockCols,true,Sparse>
-  : public SparseMatrixBase<Block<XprType,BlockRows,BlockCols,true> >
-{
-    typedef typename internal::remove_all<typename XprType::Nested>::type _MatrixTypeNested;
-    typedef Block<XprType, BlockRows, BlockCols, true> BlockType;
-public:
-    enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
-protected:
-    enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
-public:
-    EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
-    
-    class InnerIterator: public XprType::InnerIterator
-    {
-        typedef typename BlockImpl::Index Index;
-      public:
-        inline InnerIterator(const BlockType& xpr, Index outer)
-          : XprType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
-    class ReverseInnerIterator: public XprType::ReverseInnerIterator
-    {
-        typedef typename BlockImpl::Index Index;
-      public:
-        inline ReverseInnerIterator(const BlockType& xpr, Index outer)
-          : XprType::ReverseInnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
-
-    inline BlockImpl(const XprType& xpr, int i)
-      : m_matrix(xpr), m_outerStart(i), m_outerSize(OuterSize)
-    {}
-
-    inline BlockImpl(const XprType& xpr, int startRow, int startCol, int blockRows, int blockCols)
-      : m_matrix(xpr), m_outerStart(IsRowMajor ? startRow : startCol), m_outerSize(IsRowMajor ? blockRows : blockCols)
-    {}
-    
-    inline const Scalar coeff(int row, int col) const
-    {
-      return m_matrix.coeff(row + IsRowMajor ? m_outerStart : 0, col +IsRowMajor ? 0 :  m_outerStart);
-    }
-    
-    inline const Scalar coeff(int index) const
-    {
-      return m_matrix.coeff(IsRowMajor ? m_outerStart : index, IsRowMajor ? index :  m_outerStart);
-    }
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-    EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+namespace Eigen {
 
-  protected:
+// Subset of columns or rows
+template <typename XprType, int BlockRows, int BlockCols>
+class BlockImpl<XprType, BlockRows, BlockCols, true, Sparse>
+    : public SparseMatrixBase<Block<XprType, BlockRows, BlockCols, true> > {
+  typedef internal::remove_all_t<typename XprType::Nested> MatrixTypeNested_;
+  typedef Block<XprType, BlockRows, BlockCols, true> BlockType;
 
-    typename XprType::Nested m_matrix;
-    Index m_outerStart;
-    const internal::variable_if_dynamic<Index, OuterSize> m_outerSize;
+ public:
+  enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
 
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
-  private:
-    Index nonZeros() const;
-};
+ protected:
+  enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
+  typedef SparseMatrixBase<BlockType> Base;
+  using Base::convert_index;
+
+ public:
+  EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
 
+  inline BlockImpl(XprType& xpr, Index i) : m_matrix(xpr), m_outerStart(convert_index(i)), m_outerSize(OuterSize) {}
+
+  inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+      : m_matrix(xpr),
+        m_outerStart(convert_index(IsRowMajor ? startRow : startCol)),
+        m_outerSize(convert_index(IsRowMajor ? blockRows : blockCols)) {}
+
+  EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
+  EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+
+  Index nonZeros() const {
+    typedef internal::evaluator<XprType> EvaluatorType;
+    EvaluatorType matEval(m_matrix);
+    Index nnz = 0;
+    Index end = m_outerStart + m_outerSize.value();
+    for (Index j = m_outerStart; j < end; ++j)
+      for (typename EvaluatorType::InnerIterator it(matEval, j); it; ++it) ++nnz;
+    return nnz;
+  }
+
+  inline const Scalar coeff(Index row, Index col) const {
+    return m_matrix.coeff(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 : m_outerStart));
+  }
+
+  inline const Scalar coeff(Index index) const {
+    return m_matrix.coeff(IsRowMajor ? m_outerStart : index, IsRowMajor ? index : m_outerStart);
+  }
+
+  inline const XprType& nestedExpression() const { return m_matrix; }
+  inline XprType& nestedExpression() { return m_matrix; }
+  Index startRow() const { return IsRowMajor ? m_outerStart : 0; }
+  Index startCol() const { return IsRowMajor ? 0 : m_outerStart; }
+  Index blockRows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
+  Index blockCols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+
+ protected:
+  typename internal::ref_selector<XprType>::non_const_type m_matrix;
+  Index m_outerStart;
+  const internal::variable_if_dynamic<Index, OuterSize> m_outerSize;
+
+ protected:
+  // Disable assignment with clear error message.
+  // Note that simply removing operator= yields compilation errors with ICC+MSVC
+  template <typename T>
+  BlockImpl& operator=(const T&) {
+    EIGEN_STATIC_ASSERT(sizeof(T) == 0, THIS_SPARSE_BLOCK_SUBEXPRESSION_IS_READ_ONLY);
+    return *this;
+  }
+};
 
 /***************************************************************************
-* specialisation for SparseMatrix
-***************************************************************************/
-
-template<typename _Scalar, int _Options, typename _Index, int BlockRows, int BlockCols>
-class BlockImpl<SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true,Sparse>
-  : public SparseMatrixBase<Block<SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true> >
-{
-    typedef SparseMatrix<_Scalar, _Options, _Index> SparseMatrixType;
-    typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _MatrixTypeNested;
-    typedef Block<SparseMatrixType, BlockRows, BlockCols, true> BlockType;
-    typedef Block<const SparseMatrixType, BlockRows, BlockCols, true> ConstBlockType;
-public:
-    enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
-    EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
-protected:
-    enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
-public:
-    
-    class InnerIterator: public SparseMatrixType::InnerIterator
-    {
-      public:
-        inline InnerIterator(const BlockType& xpr, Index outer)
-          : SparseMatrixType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
-    class ReverseInnerIterator: public SparseMatrixType::ReverseInnerIterator
-    {
-      public:
-        inline ReverseInnerIterator(const BlockType& xpr, Index outer)
-          : SparseMatrixType::ReverseInnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
-
-    inline BlockImpl(const SparseMatrixType& xpr, int i)
-      : m_matrix(xpr), m_outerStart(i), m_outerSize(OuterSize)
-    {}
-
-    inline BlockImpl(const SparseMatrixType& xpr, int startRow, int startCol, int blockRows, int blockCols)
-      : m_matrix(xpr), m_outerStart(IsRowMajor ? startRow : startCol), m_outerSize(IsRowMajor ? blockRows : blockCols)
-    {}
-
-    template<typename OtherDerived>
-    inline BlockType& operator=(const SparseMatrixBase<OtherDerived>& other)
-    {
-      typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _NestedMatrixType;
-      _NestedMatrixType& matrix = const_cast<_NestedMatrixType&>(m_matrix);;
-      // This assignement is slow if this vector set is not empty
-      // and/or it is not at the end of the nonzeros of the underlying matrix.
-
-      // 1 - eval to a temporary to avoid transposition and/or aliasing issues
-      SparseMatrix<Scalar, IsRowMajor ? RowMajor : ColMajor, Index> tmp(other);
-
-      // 2 - let's check whether there is enough allocated memory
-      Index nnz           = tmp.nonZeros();
-      Index start         = m_outerStart==0 ? 0 : matrix.outerIndexPtr()[m_outerStart]; // starting position of the current block
-      Index end           = m_matrix.outerIndexPtr()[m_outerStart+m_outerSize.value()]; // ending posiiton of the current block
-      Index block_size    = end - start;                                                // available room in the current block
-      Index tail_size     = m_matrix.outerIndexPtr()[m_matrix.outerSize()] - end;
-      
-      Index free_size     = m_matrix.isCompressed()
-                          ? Index(matrix.data().allocatedSize()) + block_size
-                          : block_size;
-
-      if(nnz>free_size) 
-      {
-        // realloc manually to reduce copies
-        typename SparseMatrixType::Storage newdata(m_matrix.data().allocatedSize() - block_size + nnz);
-
-        std::memcpy(&newdata.value(0), &m_matrix.data().value(0), start*sizeof(Scalar));
-        std::memcpy(&newdata.index(0), &m_matrix.data().index(0), start*sizeof(Index));
-
-        std::memcpy(&newdata.value(start), &tmp.data().value(0), nnz*sizeof(Scalar));
-        std::memcpy(&newdata.index(start), &tmp.data().index(0), nnz*sizeof(Index));
-
-        std::memcpy(&newdata.value(start+nnz), &matrix.data().value(end), tail_size*sizeof(Scalar));
-        std::memcpy(&newdata.index(start+nnz), &matrix.data().index(end), tail_size*sizeof(Index));
-        
-        newdata.resize(m_matrix.outerIndexPtr()[m_matrix.outerSize()] - block_size + nnz);
-
-        matrix.data().swap(newdata);
-      }
-      else
-      {
+ * specialization for SparseMatrix
+ ***************************************************************************/
+
+namespace internal {
+
+template <typename SparseMatrixType, int BlockRows, int BlockCols>
+class sparse_matrix_block_impl : public SparseCompressedBase<Block<SparseMatrixType, BlockRows, BlockCols, true> > {
+  typedef internal::remove_all_t<typename SparseMatrixType::Nested> MatrixTypeNested_;
+  typedef Block<SparseMatrixType, BlockRows, BlockCols, true> BlockType;
+  typedef SparseCompressedBase<Block<SparseMatrixType, BlockRows, BlockCols, true> > Base;
+  using Base::convert_index;
+
+ public:
+  enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
+  EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
+ protected:
+  typedef typename Base::IndexVector IndexVector;
+  enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
+
+ public:
+  inline sparse_matrix_block_impl(SparseMatrixType& xpr, Index i)
+      : m_matrix(xpr), m_outerStart(convert_index(i)), m_outerSize(OuterSize) {}
+
+  inline sparse_matrix_block_impl(SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows,
+                                  Index blockCols)
+      : m_matrix(xpr),
+        m_outerStart(convert_index(IsRowMajor ? startRow : startCol)),
+        m_outerSize(convert_index(IsRowMajor ? blockRows : blockCols)) {}
+
+  template <typename OtherDerived>
+  inline BlockType& operator=(const SparseMatrixBase<OtherDerived>& other) {
+    typedef internal::remove_all_t<typename SparseMatrixType::Nested> NestedMatrixType_;
+    NestedMatrixType_& matrix = m_matrix;
+    // This assignment is slow if this vector set is not empty
+    // and/or it is not at the end of the nonzeros of the underlying matrix.
+
+    // 1 - eval to a temporary to avoid transposition and/or aliasing issues
+    Ref<const SparseMatrix<Scalar, IsRowMajor ? RowMajor : ColMajor, StorageIndex> > tmp(other.derived());
+    eigen_internal_assert(tmp.outerSize() == m_outerSize.value());
+
+    // 2 - let's check whether there is enough allocated memory
+    Index nnz = tmp.nonZeros();
+    Index start =
+        m_outerStart == 0 ? 0 : m_matrix.outerIndexPtr()[m_outerStart];        // starting position of the current block
+    Index end = m_matrix.outerIndexPtr()[m_outerStart + m_outerSize.value()];  // ending position of the current block
+    Index block_size = end - start;                                            // available room in the current block
+    Index tail_size = m_matrix.outerIndexPtr()[m_matrix.outerSize()] - end;
+
+    Index free_size = m_matrix.isCompressed() ? Index(matrix.data().allocatedSize()) + block_size : block_size;
+
+    Index tmp_start = tmp.outerIndexPtr()[0];
+
+    bool update_trailing_pointers = false;
+    if (nnz > free_size) {
+      // realloc manually to reduce copies
+      typename SparseMatrixType::Storage newdata(m_matrix.data().allocatedSize() - block_size + nnz);
+
+      internal::smart_copy(m_matrix.valuePtr(), m_matrix.valuePtr() + start, newdata.valuePtr());
+      internal::smart_copy(m_matrix.innerIndexPtr(), m_matrix.innerIndexPtr() + start, newdata.indexPtr());
+
+      internal::smart_copy(tmp.valuePtr() + tmp_start, tmp.valuePtr() + tmp_start + nnz, newdata.valuePtr() + start);
+      internal::smart_copy(tmp.innerIndexPtr() + tmp_start, tmp.innerIndexPtr() + tmp_start + nnz,
+                           newdata.indexPtr() + start);
+
+      internal::smart_copy(matrix.valuePtr() + end, matrix.valuePtr() + end + tail_size,
+                           newdata.valuePtr() + start + nnz);
+      internal::smart_copy(matrix.innerIndexPtr() + end, matrix.innerIndexPtr() + end + tail_size,
+                           newdata.indexPtr() + start + nnz);
+
+      newdata.resize(m_matrix.outerIndexPtr()[m_matrix.outerSize()] - block_size + nnz);
+
+      matrix.data().swap(newdata);
+
+      update_trailing_pointers = true;
+    } else {
+      if (m_matrix.isCompressed() && nnz != block_size) {
         // no need to realloc, simply copy the tail at its respective position and insert tmp
         matrix.data().resize(start + nnz + tail_size);
 
-        std::memmove(&matrix.data().value(start+nnz), &matrix.data().value(end), tail_size*sizeof(Scalar));
-        std::memmove(&matrix.data().index(start+nnz), &matrix.data().index(end), tail_size*sizeof(Index));
+        internal::smart_memmove(matrix.valuePtr() + end, matrix.valuePtr() + end + tail_size,
+                                matrix.valuePtr() + start + nnz);
+        internal::smart_memmove(matrix.innerIndexPtr() + end, matrix.innerIndexPtr() + end + tail_size,
+                                matrix.innerIndexPtr() + start + nnz);
 
-        std::memcpy(&matrix.data().value(start), &tmp.data().value(0), nnz*sizeof(Scalar));
-        std::memcpy(&matrix.data().index(start), &tmp.data().index(0), nnz*sizeof(Index));
-      }
-      
-      // update innerNonZeros
-      if(!m_matrix.isCompressed())
-        for(Index j=0; j<m_outerSize.value(); ++j)
-          matrix.innerNonZeroPtr()[m_outerStart+j] = tmp.innerVector(j).nonZeros();
-
-      // update outer index pointers
-      Index p = start;
-      for(Index k=0; k<m_outerSize.value(); ++k)
-      {
-        matrix.outerIndexPtr()[m_outerStart+k] = p;
-        p += tmp.innerVector(k).nonZeros();
-      }
-      std::ptrdiff_t offset = nnz - block_size;
-      for(Index k = m_outerStart + m_outerSize.value(); k<=matrix.outerSize(); ++k)
-      {
-        matrix.outerIndexPtr()[k] += offset;
+        update_trailing_pointers = true;
       }
 
-      return derived();
+      internal::smart_copy(tmp.valuePtr() + tmp_start, tmp.valuePtr() + tmp_start + nnz, matrix.valuePtr() + start);
+      internal::smart_copy(tmp.innerIndexPtr() + tmp_start, tmp.innerIndexPtr() + tmp_start + nnz,
+                           matrix.innerIndexPtr() + start);
     }
 
-    inline BlockType& operator=(const BlockType& other)
-    {
-      return operator=<BlockType>(other);
+    // update outer index pointers and innerNonZeros
+    if (IsVectorAtCompileTime) {
+      if (!m_matrix.isCompressed()) matrix.innerNonZeroPtr()[m_outerStart] = StorageIndex(nnz);
+      matrix.outerIndexPtr()[m_outerStart] = StorageIndex(start);
+    } else {
+      StorageIndex p = StorageIndex(start);
+      for (Index k = 0; k < m_outerSize.value(); ++k) {
+        StorageIndex nnz_k = internal::convert_index<StorageIndex>(tmp.innerVector(k).nonZeros());
+        if (!m_matrix.isCompressed()) matrix.innerNonZeroPtr()[m_outerStart + k] = nnz_k;
+        matrix.outerIndexPtr()[m_outerStart + k] = p;
+        p += nnz_k;
+      }
     }
 
-    inline const Scalar* valuePtr() const
-    { return m_matrix.valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
-    inline Scalar* valuePtr()
-    { return m_matrix.const_cast_derived().valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
-
-    inline const Index* innerIndexPtr() const
-    { return m_matrix.innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
-    inline Index* innerIndexPtr()
-    { return m_matrix.const_cast_derived().innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
-
-    inline const Index* outerIndexPtr() const
-    { return m_matrix.outerIndexPtr() + m_outerStart; }
-    inline Index* outerIndexPtr()
-    { return m_matrix.const_cast_derived().outerIndexPtr() + m_outerStart; }
-
-    Index nonZeros() const
-    {
-      if(m_matrix.isCompressed())
-        return  std::size_t(m_matrix.outerIndexPtr()[m_outerStart+m_outerSize.value()])
-              - std::size_t(m_matrix.outerIndexPtr()[m_outerStart]);
-      else if(m_outerSize.value()==0)
-        return 0;
-      else
-        return Map<const Matrix<Index,OuterSize,1> >(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum();
-    }
-    
-    inline Scalar& coeffRef(int row, int col)
-    {
-      return m_matrix.const_cast_derived().coeffRef(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 :  m_outerStart));
-    }
-    
-    inline const Scalar coeff(int row, int col) const
-    {
-      return m_matrix.coeff(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 :  m_outerStart));
-    }
-    
-    inline const Scalar coeff(int index) const
-    {
-      return m_matrix.coeff(IsRowMajor ? m_outerStart : index, IsRowMajor ? index :  m_outerStart);
+    if (update_trailing_pointers) {
+      StorageIndex offset = internal::convert_index<StorageIndex>(nnz - block_size);
+      for (Index k = m_outerStart + m_outerSize.value(); k <= matrix.outerSize(); ++k) {
+        matrix.outerIndexPtr()[k] += offset;
+      }
     }
 
-    const Scalar& lastCoeff() const
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(BlockImpl);
-      eigen_assert(nonZeros()>0);
-      if(m_matrix.isCompressed())
-        return m_matrix.valuePtr()[m_matrix.outerIndexPtr()[m_outerStart+1]-1];
-      else
-        return m_matrix.valuePtr()[m_matrix.outerIndexPtr()[m_outerStart]+m_matrix.innerNonZeroPtr()[m_outerStart]-1];
-    }
+    return derived();
+  }
 
-    EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+  inline BlockType& operator=(const BlockType& other) { return operator= <BlockType>(other); }
 
-  protected:
+  inline const Scalar* valuePtr() const { return m_matrix.valuePtr(); }
+  inline Scalar* valuePtr() { return m_matrix.valuePtr(); }
 
-    typename SparseMatrixType::Nested m_matrix;
-    Index m_outerStart;
-    const internal::variable_if_dynamic<Index, OuterSize> m_outerSize;
+  inline const StorageIndex* innerIndexPtr() const { return m_matrix.innerIndexPtr(); }
+  inline StorageIndex* innerIndexPtr() { return m_matrix.innerIndexPtr(); }
 
-};
+  inline const StorageIndex* outerIndexPtr() const { return m_matrix.outerIndexPtr() + m_outerStart; }
+  inline StorageIndex* outerIndexPtr() { return m_matrix.outerIndexPtr() + m_outerStart; }
 
+  inline const StorageIndex* innerNonZeroPtr() const {
+    return isCompressed() ? 0 : (m_matrix.innerNonZeroPtr() + m_outerStart);
+  }
+  inline StorageIndex* innerNonZeroPtr() { return isCompressed() ? 0 : (m_matrix.innerNonZeroPtr() + m_outerStart); }
 
-template<typename _Scalar, int _Options, typename _Index, int BlockRows, int BlockCols>
-class BlockImpl<const SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true,Sparse>
-  : public SparseMatrixBase<Block<const SparseMatrix<_Scalar, _Options, _Index>,BlockRows,BlockCols,true> >
-{
-    typedef SparseMatrix<_Scalar, _Options, _Index> SparseMatrixType;
-    typedef typename internal::remove_all<typename SparseMatrixType::Nested>::type _MatrixTypeNested;
-    typedef Block<const SparseMatrixType, BlockRows, BlockCols, true> BlockType;
-public:
-    enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
-    EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
-protected:
-    enum { OuterSize = IsRowMajor ? BlockRows : BlockCols };
-public:
-    
-    class InnerIterator: public SparseMatrixType::InnerIterator
-    {
-      public:
-        inline InnerIterator(const BlockType& xpr, Index outer)
-          : SparseMatrixType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
-    class ReverseInnerIterator: public SparseMatrixType::ReverseInnerIterator
-    {
-      public:
-        inline ReverseInnerIterator(const BlockType& xpr, Index outer)
-          : SparseMatrixType::ReverseInnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
-
-    inline BlockImpl(const SparseMatrixType& xpr, int i)
-      : m_matrix(xpr), m_outerStart(i), m_outerSize(OuterSize)
-    {}
-
-    inline BlockImpl(const SparseMatrixType& xpr, int startRow, int startCol, int blockRows, int blockCols)
-      : m_matrix(xpr), m_outerStart(IsRowMajor ? startRow : startCol), m_outerSize(IsRowMajor ? blockRows : blockCols)
-    {}
-
-    inline const Scalar* valuePtr() const
-    { return m_matrix.valuePtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
-
-    inline const Index* innerIndexPtr() const
-    { return m_matrix.innerIndexPtr() + m_matrix.outerIndexPtr()[m_outerStart]; }
-
-    inline const Index* outerIndexPtr() const
-    { return m_matrix.outerIndexPtr() + m_outerStart; }
-
-    Index nonZeros() const
-    {
-      if(m_matrix.isCompressed())
-        return  std::size_t(m_matrix.outerIndexPtr()[m_outerStart+m_outerSize.value()])
-              - std::size_t(m_matrix.outerIndexPtr()[m_outerStart]);
-      else if(m_outerSize.value()==0)
-        return 0;
-      else
-        return Map<const Matrix<Index,OuterSize,1> >(m_matrix.innerNonZeroPtr()+m_outerStart, m_outerSize.value()).sum();
-    }
-    
-    inline const Scalar coeff(int row, int col) const
-    {
-      return m_matrix.coeff(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 :  m_outerStart));
-    }
-    
-    inline const Scalar coeff(int index) const
-    {
-      return m_matrix.coeff(IsRowMajor ? m_outerStart : index, IsRowMajor ? index :  m_outerStart);
-    }
+  bool isCompressed() const { return m_matrix.innerNonZeroPtr() == 0; }
 
-    const Scalar& lastCoeff() const
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(BlockImpl);
-      eigen_assert(nonZeros()>0);
-      if(m_matrix.isCompressed())
-        return m_matrix.valuePtr()[m_matrix.outerIndexPtr()[m_outerStart+1]-1];
-      else
-        return m_matrix.valuePtr()[m_matrix.outerIndexPtr()[m_outerStart]+m_matrix.innerNonZeroPtr()[m_outerStart]-1];
-    }
+  inline Scalar& coeffRef(Index row, Index col) {
+    return m_matrix.coeffRef(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 : m_outerStart));
+  }
+
+  inline const Scalar coeff(Index row, Index col) const {
+    return m_matrix.coeff(row + (IsRowMajor ? m_outerStart : 0), col + (IsRowMajor ? 0 : m_outerStart));
+  }
+
+  inline const Scalar coeff(Index index) const {
+    return m_matrix.coeff(IsRowMajor ? m_outerStart : index, IsRowMajor ? index : m_outerStart);
+  }
 
-    EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
+  const Scalar& lastCoeff() const {
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(sparse_matrix_block_impl);
+    eigen_assert(Base::nonZeros() > 0);
+    if (m_matrix.isCompressed())
+      return m_matrix.valuePtr()[m_matrix.outerIndexPtr()[m_outerStart + 1] - 1];
+    else
+      return m_matrix.valuePtr()[m_matrix.outerIndexPtr()[m_outerStart] + m_matrix.innerNonZeroPtr()[m_outerStart] - 1];
+  }
 
-  protected:
+  EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
+  EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
 
-    typename SparseMatrixType::Nested m_matrix;
-    Index m_outerStart;
-    const internal::variable_if_dynamic<Index, OuterSize> m_outerSize;
+  inline const SparseMatrixType& nestedExpression() const { return m_matrix; }
+  inline SparseMatrixType& nestedExpression() { return m_matrix; }
+  Index startRow() const { return IsRowMajor ? m_outerStart : 0; }
+  Index startCol() const { return IsRowMajor ? 0 : m_outerStart; }
+  Index blockRows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
+  Index blockCols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
 
+ protected:
+  typename internal::ref_selector<SparseMatrixType>::non_const_type m_matrix;
+  Index m_outerStart;
+  const internal::variable_if_dynamic<Index, OuterSize> m_outerSize;
 };
 
-//----------
+}  // namespace internal
+
+template <typename Scalar_, int Options_, typename StorageIndex_, int BlockRows, int BlockCols>
+class BlockImpl<SparseMatrix<Scalar_, Options_, StorageIndex_>, BlockRows, BlockCols, true, Sparse>
+    : public internal::sparse_matrix_block_impl<SparseMatrix<Scalar_, Options_, StorageIndex_>, BlockRows, BlockCols> {
+ public:
+  typedef StorageIndex_ StorageIndex;
+  typedef SparseMatrix<Scalar_, Options_, StorageIndex_> SparseMatrixType;
+  typedef internal::sparse_matrix_block_impl<SparseMatrixType, BlockRows, BlockCols> Base;
+  inline BlockImpl(SparseMatrixType& xpr, Index i) : Base(xpr, i) {}
+
+  inline BlockImpl(SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+      : Base(xpr, startRow, startCol, blockRows, blockCols) {}
+
+  using Base::operator=;
+};
+
+template <typename Scalar_, int Options_, typename StorageIndex_, int BlockRows, int BlockCols>
+class BlockImpl<const SparseMatrix<Scalar_, Options_, StorageIndex_>, BlockRows, BlockCols, true, Sparse>
+    : public internal::sparse_matrix_block_impl<const SparseMatrix<Scalar_, Options_, StorageIndex_>, BlockRows,
+                                                BlockCols> {
+ public:
+  typedef StorageIndex_ StorageIndex;
+  typedef const SparseMatrix<Scalar_, Options_, StorageIndex_> SparseMatrixType;
+  typedef internal::sparse_matrix_block_impl<SparseMatrixType, BlockRows, BlockCols> Base;
+  inline BlockImpl(SparseMatrixType& xpr, Index i) : Base(xpr, i) {}
+
+  inline BlockImpl(SparseMatrixType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+      : Base(xpr, startRow, startCol, blockRows, blockCols) {}
+
+  using Base::operator=;
+
+ private:
+  template <typename Derived>
+  BlockImpl(const SparseMatrixBase<Derived>& xpr, Index i);
+  template <typename Derived>
+  BlockImpl(const SparseMatrixBase<Derived>& xpr);
+};
 
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
-  * is col-major (resp. row-major).
-  */
-template<typename Derived>
-typename SparseMatrixBase<Derived>::InnerVectorReturnType SparseMatrixBase<Derived>::innerVector(Index outer)
-{ return InnerVectorReturnType(derived(), outer); }
-
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
-  * is col-major (resp. row-major). Read-only.
-  */
-template<typename Derived>
-const typename SparseMatrixBase<Derived>::ConstInnerVectorReturnType SparseMatrixBase<Derived>::innerVector(Index outer) const
-{ return ConstInnerVectorReturnType(derived(), outer); }
-
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
-  * is col-major (resp. row-major).
-  */
-template<typename Derived>
-typename SparseMatrixBase<Derived>::InnerVectorsReturnType
-SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize)
-{
-  return Block<Derived,Dynamic,Dynamic,true>(derived(),
-                                             IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
-                                             IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
-  
-}
-
-/** \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
-  * is col-major (resp. row-major). Read-only.
-  */
-template<typename Derived>
-const typename SparseMatrixBase<Derived>::ConstInnerVectorsReturnType
-SparseMatrixBase<Derived>::innerVectors(Index outerStart, Index outerSize) const
-{
-  return Block<const Derived,Dynamic,Dynamic,true>(derived(),
-                                                  IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
-                                                  IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
-  
-}
+//----------
 
 /** Generic implementation of sparse Block expression.
-  * Real-only. 
-  */
-template<typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
-class BlockImpl<XprType,BlockRows,BlockCols,InnerPanel,Sparse>
-  : public SparseMatrixBase<Block<XprType,BlockRows,BlockCols,InnerPanel> >, internal::no_assignment_operator
-{
-  typedef typename internal::remove_all<typename XprType::Nested>::type _MatrixTypeNested;
+ * Real-only.
+ */
+template <typename XprType, int BlockRows, int BlockCols, bool InnerPanel>
+class BlockImpl<XprType, BlockRows, BlockCols, InnerPanel, Sparse>
+    : public SparseMatrixBase<Block<XprType, BlockRows, BlockCols, InnerPanel> >, internal::no_assignment_operator {
   typedef Block<XprType, BlockRows, BlockCols, InnerPanel> BlockType;
-public:
-    enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
-    EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
+  typedef SparseMatrixBase<BlockType> Base;
+  using Base::convert_index;
+
+ public:
+  enum { IsRowMajor = internal::traits<BlockType>::IsRowMajor };
+  EIGEN_SPARSE_PUBLIC_INTERFACE(BlockType)
 
-    /** Column or Row constructor
-      */
-    inline BlockImpl(const XprType& xpr, int i)
+  typedef internal::remove_all_t<typename XprType::Nested> MatrixTypeNested_;
+
+  /** Column or Row constructor
+   */
+  inline BlockImpl(XprType& xpr, Index i)
       : m_matrix(xpr),
-        m_startRow( (BlockRows==1) && (BlockCols==XprType::ColsAtCompileTime) ? i : 0),
-        m_startCol( (BlockRows==XprType::RowsAtCompileTime) && (BlockCols==1) ? i : 0),
-        m_blockRows(BlockRows==1 ? 1 : xpr.rows()),
-        m_blockCols(BlockCols==1 ? 1 : xpr.cols())
-    {}
-
-    /** Dynamic-size constructor
-      */
-    inline BlockImpl(const XprType& xpr, int startRow, int startCol, int blockRows, int blockCols)
-      : m_matrix(xpr), m_startRow(startRow), m_startCol(startCol), m_blockRows(blockRows), m_blockCols(blockCols)
-    {}
-
-    inline int rows() const { return m_blockRows.value(); }
-    inline int cols() const { return m_blockCols.value(); }
-
-    inline Scalar& coeffRef(int row, int col)
-    {
-      return m_matrix.const_cast_derived()
-               .coeffRef(row + m_startRow.value(), col + m_startCol.value());
-    }
+        m_startRow((BlockRows == 1) && (BlockCols == XprType::ColsAtCompileTime) ? convert_index(i) : 0),
+        m_startCol((BlockRows == XprType::RowsAtCompileTime) && (BlockCols == 1) ? convert_index(i) : 0),
+        m_blockRows(BlockRows == 1 ? 1 : xpr.rows()),
+        m_blockCols(BlockCols == 1 ? 1 : xpr.cols()) {}
+
+  /** Dynamic-size constructor
+   */
+  inline BlockImpl(XprType& xpr, Index startRow, Index startCol, Index blockRows, Index blockCols)
+      : m_matrix(xpr),
+        m_startRow(convert_index(startRow)),
+        m_startCol(convert_index(startCol)),
+        m_blockRows(convert_index(blockRows)),
+        m_blockCols(convert_index(blockCols)) {}
+
+  inline Index rows() const { return m_blockRows.value(); }
+  inline Index cols() const { return m_blockCols.value(); }
+
+  inline Scalar& coeffRef(Index row, Index col) {
+    return m_matrix.coeffRef(row + m_startRow.value(), col + m_startCol.value());
+  }
+
+  inline const Scalar coeff(Index row, Index col) const {
+    return m_matrix.coeff(row + m_startRow.value(), col + m_startCol.value());
+  }
+
+  inline Scalar& coeffRef(Index index) {
+    return m_matrix.coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                             m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+  }
+
+  inline const Scalar coeff(Index index) const {
+    return m_matrix.coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
+                          m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+  }
+
+  inline const XprType& nestedExpression() const { return m_matrix; }
+  inline XprType& nestedExpression() { return m_matrix; }
+  Index startRow() const { return m_startRow.value(); }
+  Index startCol() const { return m_startCol.value(); }
+  Index blockRows() const { return m_blockRows.value(); }
+  Index blockCols() const { return m_blockCols.value(); }
+
+ protected:
+  //     friend class internal::GenericSparseBlockInnerIteratorImpl<XprType,BlockRows,BlockCols,InnerPanel>;
+  friend struct internal::unary_evaluator<Block<XprType, BlockRows, BlockCols, InnerPanel>, internal::IteratorBased,
+                                          Scalar>;
+
+  Index nonZeros() const { return Dynamic; }
+
+  typename internal::ref_selector<XprType>::non_const_type m_matrix;
+  const internal::variable_if_dynamic<Index, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
+  const internal::variable_if_dynamic<Index, XprType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
+  const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_blockRows;
+  const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_blockCols;
+
+ protected:
+  // Disable assignment with clear error message.
+  // Note that simply removing operator= yields compilation errors with ICC+MSVC
+  template <typename T>
+  BlockImpl& operator=(const T&) {
+    EIGEN_STATIC_ASSERT(sizeof(T) == 0, THIS_SPARSE_BLOCK_SUBEXPRESSION_IS_READ_ONLY);
+    return *this;
+  }
+};
 
-    inline const Scalar coeff(int row, int col) const
-    {
-      return m_matrix.coeff(row + m_startRow.value(), col + m_startCol.value());
+namespace internal {
+
+template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+struct unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IteratorBased>
+    : public evaluator_base<Block<ArgType, BlockRows, BlockCols, InnerPanel> > {
+  class InnerVectorInnerIterator;
+  class OuterVectorInnerIterator;
+
+ public:
+  typedef Block<ArgType, BlockRows, BlockCols, InnerPanel> XprType;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename XprType::Scalar Scalar;
+
+  enum {
+    IsRowMajor = XprType::IsRowMajor,
+    OuterVector = (BlockCols == 1 && ArgType::IsRowMajor) || (BlockRows == 1 && !ArgType::IsRowMajor),
+    CoeffReadCost = evaluator<ArgType>::CoeffReadCost,
+    Flags = XprType::Flags
+  };
+
+  typedef std::conditional_t<OuterVector, OuterVectorInnerIterator, InnerVectorInnerIterator> InnerIterator;
+
+  explicit unary_evaluator(const XprType& op) : m_argImpl(op.nestedExpression()), m_block(op) {}
+
+  inline Index nonZerosEstimate() const {
+    const Index nnz = m_block.nonZeros();
+    if (nnz < 0) {
+      // Scale the non-zero estimate for the underlying expression linearly with block size.
+      // Return zero if the underlying block is empty.
+      const Index nested_sz = m_block.nestedExpression().size();
+      return nested_sz == 0 ? 0 : m_argImpl.nonZerosEstimate() * m_block.size() / nested_sz;
     }
+    return nnz;
+  }
 
-    inline Scalar& coeffRef(int index)
-    {
-      return m_matrix.const_cast_derived()
-             .coeffRef(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-                       m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
-    }
+ protected:
+  typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
 
-    inline const Scalar coeff(int index) const
-    {
-      return m_matrix
-             .coeff(m_startRow.value() + (RowsAtCompileTime == 1 ? 0 : index),
-                    m_startCol.value() + (RowsAtCompileTime == 1 ? index : 0));
+  evaluator<ArgType> m_argImpl;
+  const XprType& m_block;
+};
+
+template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+class unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IteratorBased>::InnerVectorInnerIterator
+    : public EvalIterator {
+  // NOTE MSVC fails to compile if we don't explicitly "import" IsRowMajor from unary_evaluator
+  //      because the base class EvalIterator has a private IsRowMajor enum too. (bug #1786)
+  // NOTE We cannot call it IsRowMajor because it would shadow unary_evaluator::IsRowMajor
+  enum { XprIsRowMajor = unary_evaluator::IsRowMajor };
+  const XprType& m_block;
+  Index m_end;
+
+ public:
+  EIGEN_STRONG_INLINE InnerVectorInnerIterator(const unary_evaluator& aEval, Index outer)
+      : EvalIterator(aEval.m_argImpl, outer + (XprIsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol())),
+        m_block(aEval.m_block),
+        m_end(XprIsRowMajor ? aEval.m_block.startCol() + aEval.m_block.blockCols()
+                            : aEval.m_block.startRow() + aEval.m_block.blockRows()) {
+    while ((EvalIterator::operator bool()) &&
+           (EvalIterator::index() < (XprIsRowMajor ? m_block.startCol() : m_block.startRow())))
+      EvalIterator::operator++();
+  }
+
+  inline StorageIndex index() const {
+    return EvalIterator::index() - convert_index<StorageIndex>(XprIsRowMajor ? m_block.startCol() : m_block.startRow());
+  }
+  inline Index outer() const {
+    return EvalIterator::outer() - (XprIsRowMajor ? m_block.startRow() : m_block.startCol());
+  }
+  inline Index row() const { return EvalIterator::row() - m_block.startRow(); }
+  inline Index col() const { return EvalIterator::col() - m_block.startCol(); }
+
+  inline operator bool() const { return EvalIterator::operator bool() && EvalIterator::index() < m_end; }
+};
+
+template <typename ArgType, int BlockRows, int BlockCols, bool InnerPanel>
+class unary_evaluator<Block<ArgType, BlockRows, BlockCols, InnerPanel>, IteratorBased>::OuterVectorInnerIterator {
+  // NOTE see above
+  enum { XprIsRowMajor = unary_evaluator::IsRowMajor };
+  const unary_evaluator& m_eval;
+  Index m_outerPos;
+  const Index m_innerIndex;
+  Index m_end;
+  EvalIterator m_it;
+
+ public:
+  EIGEN_STRONG_INLINE OuterVectorInnerIterator(const unary_evaluator& aEval, Index outer)
+      : m_eval(aEval),
+        m_outerPos((XprIsRowMajor ? aEval.m_block.startCol() : aEval.m_block.startRow())),
+        m_innerIndex(XprIsRowMajor ? aEval.m_block.startRow() : aEval.m_block.startCol()),
+        m_end(XprIsRowMajor ? aEval.m_block.startCol() + aEval.m_block.blockCols()
+                            : aEval.m_block.startRow() + aEval.m_block.blockRows()),
+        m_it(m_eval.m_argImpl, m_outerPos) {
+    EIGEN_UNUSED_VARIABLE(outer);
+    eigen_assert(outer == 0);
+
+    while (m_it && m_it.index() < m_innerIndex) ++m_it;
+    if ((!m_it) || (m_it.index() != m_innerIndex)) ++(*this);
+  }
+
+  inline StorageIndex index() const {
+    return convert_index<StorageIndex>(m_outerPos -
+                                       (XprIsRowMajor ? m_eval.m_block.startCol() : m_eval.m_block.startRow()));
+  }
+  inline Index outer() const { return 0; }
+  inline Index row() const { return XprIsRowMajor ? 0 : index(); }
+  inline Index col() const { return XprIsRowMajor ? index() : 0; }
+
+  inline Scalar value() const { return m_it.value(); }
+  inline Scalar& valueRef() { return m_it.valueRef(); }
+
+  inline OuterVectorInnerIterator& operator++() {
+    // search next non-zero entry
+    while (++m_outerPos < m_end) {
+      // Restart iterator at the next inner-vector:
+      internal::destroy_at(&m_it);
+      internal::construct_at(&m_it, m_eval.m_argImpl, m_outerPos);
+      // search for the key m_innerIndex in the current outer-vector
+      while (m_it && m_it.index() < m_innerIndex) ++m_it;
+      if (m_it && m_it.index() == m_innerIndex) break;
     }
-    
-    inline const _MatrixTypeNested& nestedExpression() const { return m_matrix; }
-    
-    class InnerIterator : public _MatrixTypeNested::InnerIterator
-    {
-      typedef typename _MatrixTypeNested::InnerIterator Base;
-      const BlockType& m_block;
-      Index m_end;
-    public:
-
-      EIGEN_STRONG_INLINE InnerIterator(const BlockType& block, Index outer)
-        : Base(block.derived().nestedExpression(), outer + (IsRowMajor ? block.m_startRow.value() : block.m_startCol.value())),
-          m_block(block),
-          m_end(IsRowMajor ? block.m_startCol.value()+block.m_blockCols.value() : block.m_startRow.value()+block.m_blockRows.value())
-      {
-        while( (Base::operator bool()) && (Base::index() < (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value())) )
-          Base::operator++();
-      }
+    return *this;
+  }
 
-      inline Index index()  const { return Base::index() - (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value()); }
-      inline Index outer()  const { return Base::outer() - (IsRowMajor ? m_block.m_startRow.value() : m_block.m_startCol.value()); }
-      inline Index row()    const { return Base::row()   - m_block.m_startRow.value(); }
-      inline Index col()    const { return Base::col()   - m_block.m_startCol.value(); }
-      
-      inline operator bool() const { return Base::operator bool() && Base::index() < m_end; }
-    };
-    class ReverseInnerIterator : public _MatrixTypeNested::ReverseInnerIterator
-    {
-      typedef typename _MatrixTypeNested::ReverseInnerIterator Base;
-      const BlockType& m_block;
-      Index m_begin;
-    public:
-
-      EIGEN_STRONG_INLINE ReverseInnerIterator(const BlockType& block, Index outer)
-        : Base(block.derived().nestedExpression(), outer + (IsRowMajor ? block.m_startRow.value() : block.m_startCol.value())),
-          m_block(block),
-          m_begin(IsRowMajor ? block.m_startCol.value() : block.m_startRow.value())
-      {
-        while( (Base::operator bool()) && (Base::index() >= (IsRowMajor ? m_block.m_startCol.value()+block.m_blockCols.value() : m_block.m_startRow.value()+block.m_blockRows.value())) )
-          Base::operator--();
-      }
+  inline operator bool() const { return m_outerPos < m_end; }
+};
 
-      inline Index index()  const { return Base::index() - (IsRowMajor ? m_block.m_startCol.value() : m_block.m_startRow.value()); }
-      inline Index outer()  const { return Base::outer() - (IsRowMajor ? m_block.m_startRow.value() : m_block.m_startCol.value()); }
-      inline Index row()    const { return Base::row()   - m_block.m_startRow.value(); }
-      inline Index col()    const { return Base::col()   - m_block.m_startCol.value(); }
-      
-      inline operator bool() const { return Base::operator bool() && Base::index() >= m_begin; }
-    };
-  protected:
-    friend class InnerIterator;
-    friend class ReverseInnerIterator;
-    
-    EIGEN_INHERIT_ASSIGNMENT_OPERATORS(BlockImpl)
-
-    typename XprType::Nested m_matrix;
-    const internal::variable_if_dynamic<Index, XprType::RowsAtCompileTime == 1 ? 0 : Dynamic> m_startRow;
-    const internal::variable_if_dynamic<Index, XprType::ColsAtCompileTime == 1 ? 0 : Dynamic> m_startCol;
-    const internal::variable_if_dynamic<Index, RowsAtCompileTime> m_blockRows;
-    const internal::variable_if_dynamic<Index, ColsAtCompileTime> m_blockCols;
+template <typename Scalar_, int Options_, typename StorageIndex_, int BlockRows, int BlockCols>
+struct unary_evaluator<Block<SparseMatrix<Scalar_, Options_, StorageIndex_>, BlockRows, BlockCols, true>, IteratorBased>
+    : evaluator<
+          SparseCompressedBase<Block<SparseMatrix<Scalar_, Options_, StorageIndex_>, BlockRows, BlockCols, true> > > {
+  typedef Block<SparseMatrix<Scalar_, Options_, StorageIndex_>, BlockRows, BlockCols, true> XprType;
+  typedef evaluator<SparseCompressedBase<XprType> > Base;
+  explicit unary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
 
+template <typename Scalar_, int Options_, typename StorageIndex_, int BlockRows, int BlockCols>
+struct unary_evaluator<Block<const SparseMatrix<Scalar_, Options_, StorageIndex_>, BlockRows, BlockCols, true>,
+                       IteratorBased>
+    : evaluator<SparseCompressedBase<
+          Block<const SparseMatrix<Scalar_, Options_, StorageIndex_>, BlockRows, BlockCols, true> > > {
+  typedef Block<const SparseMatrix<Scalar_, Options_, StorageIndex_>, BlockRows, BlockCols, true> XprType;
+  typedef evaluator<SparseCompressedBase<XprType> > Base;
+  explicit unary_evaluator(const XprType& xpr) : Base(xpr) {}
 };
 
-} // end namespace Eigen
+}  // end namespace internal
 
-#endif // EIGEN_SPARSE_BLOCK_H
+}  // end namespace Eigen
 
+#endif  // EIGEN_SPARSE_BLOCK_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseColEtree.h b/inst/include/Eigen/src/SparseCore/SparseColEtree.h
index f8745f46..76575c99 100644
--- a/inst/include/Eigen/src/SparseCore/SparseColEtree.h
+++ b/inst/include/Eigen/src/SparseCore/SparseColEtree.h
@@ -7,11 +7,10 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+/*
+
+ * NOTE: This file is the modified version of sp_coletree.c file in SuperLU
 
-/* 
- 
- * NOTE: This file is the modified version of sp_coletree.c file in SuperLU 
- 
  * -- SuperLU routine (version 3.1) --
  * Univ. of California Berkeley, Xerox Palo Alto Research Center,
  * and Lawrence Berkeley National Lab.
@@ -31,57 +30,55 @@
 #ifndef SPARSE_COLETREE_H
 #define SPARSE_COLETREE_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
 
-/** Find the root of the tree/set containing the vertex i : Use Path halving */ 
-template<typename Index, typename IndexVector>
-Index etree_find (Index i, IndexVector& pp)
-{
-  Index p = pp(i); // Parent 
-  Index gp = pp(p); // Grand parent 
-  while (gp != p) 
-  {
-    pp(i) = gp; // Parent pointer on find path is changed to former grand parent
-    i = gp; 
+/** Find the root of the tree/set containing the vertex i : Use Path halving */
+template <typename Index, typename IndexVector>
+Index etree_find(Index i, IndexVector& pp) {
+  Index p = pp(i);   // Parent
+  Index gp = pp(p);  // Grand parent
+  while (gp != p) {
+    pp(i) = gp;  // Parent pointer on find path is changed to former grand parent
+    i = gp;
     p = pp(i);
     gp = pp(p);
   }
-  return p; 
+  return p;
 }
 
 /** Compute the column elimination tree of a sparse matrix
-  * \param mat The matrix in column-major format. 
-  * \param parent The elimination tree
-  * \param firstRowElt The column index of the first element in each row
-  * \param perm The permutation to apply to the column of \b mat
-  */
+ * \param mat The matrix in column-major format.
+ * \param parent The elimination tree
+ * \param firstRowElt The column index of the first element in each row
+ * \param perm The permutation to apply to the column of \b mat
+ */
 template <typename MatrixType, typename IndexVector>
-int coletree(const MatrixType& mat, IndexVector& parent, IndexVector& firstRowElt, typename MatrixType::Index *perm=0)
-{
-  typedef typename MatrixType::Index Index;
-  Index nc = mat.cols(); // Number of columns 
-  Index m = mat.rows();
-  Index diagSize = (std::min)(nc,m);
-  IndexVector root(nc); // root of subtree of etree 
+int coletree(const MatrixType& mat, IndexVector& parent, IndexVector& firstRowElt,
+             typename MatrixType::StorageIndex* perm = 0) {
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  StorageIndex nc = convert_index<StorageIndex>(mat.cols());  // Number of columns
+  StorageIndex m = convert_index<StorageIndex>(mat.rows());
+  StorageIndex diagSize = (std::min)(nc, m);
+  IndexVector root(nc);  // root of subtree of etree
   root.setZero();
-  IndexVector pp(nc); // disjoint sets 
-  pp.setZero(); // Initialize disjoint sets 
+  IndexVector pp(nc);  // disjoint sets
+  pp.setZero();        // Initialize disjoint sets
   parent.resize(mat.cols());
-  //Compute first nonzero column in each row 
-  Index row,col; 
+  // Compute first nonzero column in each row
   firstRowElt.resize(m);
   firstRowElt.setConstant(nc);
-  firstRowElt.segment(0, diagSize).setLinSpaced(diagSize, 0, diagSize-1);
+  firstRowElt.segment(0, diagSize).setLinSpaced(diagSize, 0, diagSize - 1);
   bool found_diag;
-  for (col = 0; col < nc; col++)
-  {
-    Index pcol = col;
-    if(perm) pcol  = perm[col];
-    for (typename MatrixType::InnerIterator it(mat, pcol); it; ++it)
-    { 
-      row = it.row();
+  for (StorageIndex col = 0; col < nc; col++) {
+    StorageIndex pcol = col;
+    if (perm) pcol = perm[col];
+    for (typename MatrixType::InnerIterator it(mat, pcol); it; ++it) {
+      Index row = it.row();
       firstRowElt(row) = (std::min)(firstRowElt(row), col);
     }
   }
@@ -89,118 +86,109 @@ int coletree(const MatrixType& mat, IndexVector& parent, IndexVector& firstRowEl
           except use (firstRowElt[r],c) in place of an edge (r,c) of A.
     Thus each row clique in A'*A is replaced by a star
     centered at its first vertex, which has the same fill. */
-  Index rset, cset, rroot; 
-  for (col = 0; col < nc; col++) 
-  {
-    found_diag = col>=m;
-    pp(col) = col; 
-    cset = col; 
-    root(cset) = col; 
-    parent(col) = nc; 
+  StorageIndex rset, cset, rroot;
+  for (StorageIndex col = 0; col < nc; col++) {
+    found_diag = col >= m;
+    pp(col) = col;
+    cset = col;
+    root(cset) = col;
+    parent(col) = nc;
     /* The diagonal element is treated here even if it does not exist in the matrix
-     * hence the loop is executed once more */ 
-    Index pcol = col;
-    if(perm) pcol  = perm[col];
-    for (typename MatrixType::InnerIterator it(mat, pcol); it||!found_diag; ++it)
-    { //  A sequence of interleaved find and union is performed 
+     * hence the loop is executed once more */
+    StorageIndex pcol = col;
+    if (perm) pcol = perm[col];
+    for (typename MatrixType::InnerIterator it(mat, pcol); it || !found_diag;
+         ++it) {  //  A sequence of interleaved find and union is performed
       Index i = col;
-      if(it) i = it.index();
+      if (it) i = it.index();
       if (i == col) found_diag = true;
-      
-      row = firstRowElt(i);
-      if (row >= col) continue; 
-      rset = internal::etree_find(row, pp); // Find the name of the set containing row
+
+      StorageIndex row = firstRowElt(i);
+      if (row >= col) continue;
+      rset = internal::etree_find(row, pp);  // Find the name of the set containing row
       rroot = root(rset);
-      if (rroot != col) 
-      {
-        parent(rroot) = col; 
-        pp(cset) = rset; 
-        cset = rset; 
-        root(cset) = col; 
+      if (rroot != col) {
+        parent(rroot) = col;
+        pp(cset) = rset;
+        cset = rset;
+        root(cset) = col;
       }
     }
   }
-  return 0;  
+  return 0;
 }
 
-/** 
-  * Depth-first search from vertex n.  No recursion.
-  * This routine was contributed by Cédric Doucet, CEDRAT Group, Meylan, France.
-*/
-template <typename Index, typename IndexVector>
-void nr_etdfs (Index n, IndexVector& parent, IndexVector& first_kid, IndexVector& next_kid, IndexVector& post, Index postnum)
-{
-  Index current = n, first, next;
-  while (postnum != n) 
-  {
+/**
+ * Depth-first search from vertex n.  No recursion.
+ * This routine was contributed by Cédric Doucet, CEDRAT Group, Meylan, France.
+ */
+template <typename IndexVector>
+void nr_etdfs(typename IndexVector::Scalar n, IndexVector& parent, IndexVector& first_kid, IndexVector& next_kid,
+              IndexVector& post, typename IndexVector::Scalar postnum) {
+  typedef typename IndexVector::Scalar StorageIndex;
+  StorageIndex current = n, first, next;
+  while (postnum != n) {
     // No kid for the current node
     first = first_kid(current);
-    
+
     // no kid for the current node
-    if (first == -1) 
-    {
-      // Numbering this node because it has no kid 
+    if (first == -1) {
+      // Numbering this node because it has no kid
       post(current) = postnum++;
-      
-      // looking for the next kid 
-      next = next_kid(current); 
-      while (next == -1) 
-      {
+
+      // looking for the next kid
+      next = next_kid(current);
+      while (next == -1) {
         // No more kids : back to the parent node
-        current = parent(current); 
-        // numbering the parent node 
+        current = parent(current);
+        // numbering the parent node
         post(current) = postnum++;
-        
-        // Get the next kid 
-        next = next_kid(current); 
+
+        // Get the next kid
+        next = next_kid(current);
       }
-      // stopping criterion 
-      if (postnum == n+1) return; 
-      
-      // Updating current node 
-      current = next; 
-    }
-    else 
-    {
-      current = first; 
+      // stopping criterion
+      if (postnum == n + 1) return;
+
+      // Updating current node
+      current = next;
+    } else {
+      current = first;
     }
   }
 }
 
-
 /**
-  * \brief Post order a tree 
-  * \param n the number of nodes
-  * \param parent Input tree
-  * \param post postordered tree
-  */
-template <typename Index, typename IndexVector>
-void treePostorder(Index n, IndexVector& parent, IndexVector& post)
-{
-  IndexVector first_kid, next_kid; // Linked list of children 
-  Index postnum; 
-  // Allocate storage for working arrays and results 
-  first_kid.resize(n+1); 
-  next_kid.setZero(n+1);
-  post.setZero(n+1);
-  
+ * \brief Post order a tree
+ * \param n the number of nodes
+ * \param parent Input tree
+ * \param post postordered tree
+ */
+template <typename IndexVector>
+void treePostorder(typename IndexVector::Scalar n, IndexVector& parent, IndexVector& post) {
+  typedef typename IndexVector::Scalar StorageIndex;
+  IndexVector first_kid, next_kid;  // Linked list of children
+  StorageIndex postnum;
+  // Allocate storage for working arrays and results
+  first_kid.resize(n + 1);
+  next_kid.setZero(n + 1);
+  post.setZero(n + 1);
+
   // Set up structure describing children
-  Index v, dad; 
-  first_kid.setConstant(-1); 
-  for (v = n-1; v >= 0; v--) 
-  {
-    dad = parent(v);
-    next_kid(v) = first_kid(dad); 
-    first_kid(dad) = v; 
+  first_kid.setConstant(-1);
+  for (StorageIndex v = n - 1; v >= 0; v--) {
+    StorageIndex dad = parent(v);
+    next_kid(v) = first_kid(dad);
+    first_kid(dad) = v;
   }
-  
+
   // Depth-first search from dummy root vertex #n
-  postnum = 0; 
+  postnum = 0;
   internal::nr_etdfs(n, parent, first_kid, next_kid, post, postnum);
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // SPARSE_COLETREE_H
+#endif  // SPARSE_COLETREE_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseCompressedBase.h b/inst/include/Eigen/src/SparseCore/SparseCompressedBase.h
new file mode 100644
index 00000000..420e9fa3
--- /dev/null
+++ b/inst/include/Eigen/src/SparseCore/SparseCompressedBase.h
@@ -0,0 +1,592 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_COMPRESSED_BASE_H
+#define EIGEN_SPARSE_COMPRESSED_BASE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+template <typename Derived>
+class SparseCompressedBase;
+
+namespace internal {
+
+template <typename Derived>
+struct traits<SparseCompressedBase<Derived>> : traits<Derived> {};
+
+template <typename Derived, class Comp, bool IsVector>
+struct inner_sort_impl;
+
+}  // end namespace internal
+
+/** \ingroup SparseCore_Module
+ * \class SparseCompressedBase
+ * \brief Common base class for sparse [compressed]-{row|column}-storage format.
+ *
+ * This class defines the common interface for all derived classes implementing the compressed sparse storage format,
+ * such as:
+ *  - SparseMatrix
+ *  - Ref<SparseMatrixType,Options>
+ *  - Map<SparseMatrixType>
+ *
+ */
+template <typename Derived>
+class SparseCompressedBase : public SparseMatrixBase<Derived> {
+ public:
+  typedef SparseMatrixBase<Derived> Base;
+  EIGEN_SPARSE_PUBLIC_INTERFACE(SparseCompressedBase)
+  using Base::operator=;
+  using Base::IsRowMajor;
+
+  class InnerIterator;
+  class ReverseInnerIterator;
+
+ protected:
+  typedef typename Base::IndexVector IndexVector;
+  Eigen::Map<IndexVector> innerNonZeros() {
+    return Eigen::Map<IndexVector>(innerNonZeroPtr(), isCompressed() ? 0 : derived().outerSize());
+  }
+  const Eigen::Map<const IndexVector> innerNonZeros() const {
+    return Eigen::Map<const IndexVector>(innerNonZeroPtr(), isCompressed() ? 0 : derived().outerSize());
+  }
+
+ public:
+  /** \returns the number of non zero coefficients */
+  inline Index nonZeros() const {
+    if (Derived::IsVectorAtCompileTime && outerIndexPtr() == 0)
+      return derived().nonZeros();
+    else if (derived().outerSize() == 0)
+      return 0;
+    else if (isCompressed())
+      return outerIndexPtr()[derived().outerSize()] - outerIndexPtr()[0];
+    else
+      return innerNonZeros().sum();
+  }
+
+  /** \returns a const pointer to the array of values.
+   * This function is aimed at interoperability with other libraries.
+   * \sa innerIndexPtr(), outerIndexPtr() */
+  inline const Scalar* valuePtr() const { return derived().valuePtr(); }
+  /** \returns a non-const pointer to the array of values.
+   * This function is aimed at interoperability with other libraries.
+   * \sa innerIndexPtr(), outerIndexPtr() */
+  inline Scalar* valuePtr() { return derived().valuePtr(); }
+
+  /** \returns a const pointer to the array of inner indices.
+   * This function is aimed at interoperability with other libraries.
+   * \sa valuePtr(), outerIndexPtr() */
+  inline const StorageIndex* innerIndexPtr() const { return derived().innerIndexPtr(); }
+  /** \returns a non-const pointer to the array of inner indices.
+   * This function is aimed at interoperability with other libraries.
+   * \sa valuePtr(), outerIndexPtr() */
+  inline StorageIndex* innerIndexPtr() { return derived().innerIndexPtr(); }
+
+  /** \returns a const pointer to the array of the starting positions of the inner vectors.
+   * This function is aimed at interoperability with other libraries.
+   * \warning it returns the null pointer 0 for SparseVector
+   * \sa valuePtr(), innerIndexPtr() */
+  inline const StorageIndex* outerIndexPtr() const { return derived().outerIndexPtr(); }
+  /** \returns a non-const pointer to the array of the starting positions of the inner vectors.
+   * This function is aimed at interoperability with other libraries.
+   * \warning it returns the null pointer 0 for SparseVector
+   * \sa valuePtr(), innerIndexPtr() */
+  inline StorageIndex* outerIndexPtr() { return derived().outerIndexPtr(); }
+
+  /** \returns a const pointer to the array of the number of non zeros of the inner vectors.
+   * This function is aimed at interoperability with other libraries.
+   * \warning it returns the null pointer 0 in compressed mode */
+  inline const StorageIndex* innerNonZeroPtr() const { return derived().innerNonZeroPtr(); }
+  /** \returns a non-const pointer to the array of the number of non zeros of the inner vectors.
+   * This function is aimed at interoperability with other libraries.
+   * \warning it returns the null pointer 0 in compressed mode */
+  inline StorageIndex* innerNonZeroPtr() { return derived().innerNonZeroPtr(); }
+
+  /** \returns whether \c *this is in compressed form. */
+  inline bool isCompressed() const { return innerNonZeroPtr() == 0; }
+
+  /** \returns a read-only view of the stored coefficients as a 1D array expression.
+   *
+   * \warning this method is for \b compressed \b storage \b only, and it will trigger an assertion otherwise.
+   *
+   * \sa valuePtr(), isCompressed() */
+  const Map<const Array<Scalar, Dynamic, 1>> coeffs() const {
+    eigen_assert(isCompressed());
+    return Array<Scalar, Dynamic, 1>::Map(valuePtr(), nonZeros());
+  }
+
+  /** \returns a read-write view of the stored coefficients as a 1D array expression
+   *
+   * \warning this method is for \b compressed \b storage \b only, and it will trigger an assertion otherwise.
+   *
+   * Here is an example:
+   * \include SparseMatrix_coeffs.cpp
+   * and the output is:
+   * \include SparseMatrix_coeffs.out
+   *
+   * \sa valuePtr(), isCompressed() */
+  Map<Array<Scalar, Dynamic, 1>> coeffs() {
+    eigen_assert(isCompressed());
+    return Array<Scalar, Dynamic, 1>::Map(valuePtr(), nonZeros());
+  }
+
+  /** sorts the inner vectors in the range [begin,end) with respect to `Comp`
+   * \sa innerIndicesAreSorted() */
+  template <class Comp = std::less<>>
+  inline void sortInnerIndices(Index begin, Index end) {
+    eigen_assert(begin >= 0 && end <= derived().outerSize() && end >= begin);
+    internal::inner_sort_impl<Derived, Comp, IsVectorAtCompileTime>::run(*this, begin, end);
+  }
+
+  /** \returns the index of the first inner vector in the range [begin,end) that is not sorted with respect to `Comp`,
+   * or `end` if the range is fully sorted \sa sortInnerIndices() */
+  template <class Comp = std::less<>>
+  inline Index innerIndicesAreSorted(Index begin, Index end) const {
+    eigen_assert(begin >= 0 && end <= derived().outerSize() && end >= begin);
+    return internal::inner_sort_impl<Derived, Comp, IsVectorAtCompileTime>::check(*this, begin, end);
+  }
+
+  /** sorts the inner vectors in the range [0,outerSize) with respect to `Comp`
+   * \sa innerIndicesAreSorted() */
+  template <class Comp = std::less<>>
+  inline void sortInnerIndices() {
+    Index begin = 0;
+    Index end = derived().outerSize();
+    internal::inner_sort_impl<Derived, Comp, IsVectorAtCompileTime>::run(*this, begin, end);
+  }
+
+  /** \returns the index of the first inner vector in the range [0,outerSize) that is not sorted with respect to `Comp`,
+   * or `outerSize` if the range is fully sorted \sa sortInnerIndices() */
+  template <class Comp = std::less<>>
+  inline Index innerIndicesAreSorted() const {
+    Index begin = 0;
+    Index end = derived().outerSize();
+    return internal::inner_sort_impl<Derived, Comp, IsVectorAtCompileTime>::check(*this, begin, end);
+  }
+
+ protected:
+  /** Default constructor. Do nothing. */
+  SparseCompressedBase() {}
+
+  /** \internal return the index of the coeff at (row,col) or just before if it does not exist.
+   * This is an analogue of std::lower_bound.
+   */
+  internal::LowerBoundIndex lower_bound(Index row, Index col) const {
+    eigen_internal_assert(row >= 0 && row < this->rows() && col >= 0 && col < this->cols());
+
+    const Index outer = Derived::IsRowMajor ? row : col;
+    const Index inner = Derived::IsRowMajor ? col : row;
+
+    Index start = this->outerIndexPtr()[outer];
+    Index end = this->isCompressed() ? this->outerIndexPtr()[outer + 1]
+                                     : this->outerIndexPtr()[outer] + this->innerNonZeroPtr()[outer];
+    eigen_assert(end >= start && "you are using a non finalized sparse matrix or written coefficient does not exist");
+    internal::LowerBoundIndex p;
+    p.value =
+        std::lower_bound(this->innerIndexPtr() + start, this->innerIndexPtr() + end, inner) - this->innerIndexPtr();
+    p.found = (p.value < end) && (this->innerIndexPtr()[p.value] == inner);
+    return p;
+  }
+
+  friend struct internal::evaluator<SparseCompressedBase<Derived>>;
+
+ private:
+  template <typename OtherDerived>
+  explicit SparseCompressedBase(const SparseCompressedBase<OtherDerived>&);
+};
+
+template <typename Derived>
+class SparseCompressedBase<Derived>::InnerIterator {
+ public:
+  InnerIterator() : m_values(0), m_indices(0), m_outer(0), m_id(0), m_end(0) {}
+
+  InnerIterator(const InnerIterator& other)
+      : m_values(other.m_values),
+        m_indices(other.m_indices),
+        m_outer(other.m_outer),
+        m_id(other.m_id),
+        m_end(other.m_end) {}
+
+  InnerIterator& operator=(const InnerIterator& other) {
+    m_values = other.m_values;
+    m_indices = other.m_indices;
+    const_cast<OuterType&>(m_outer).setValue(other.m_outer.value());
+    m_id = other.m_id;
+    m_end = other.m_end;
+    return *this;
+  }
+
+  InnerIterator(const SparseCompressedBase& mat, Index outer)
+      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer) {
+    if (Derived::IsVectorAtCompileTime && mat.outerIndexPtr() == 0) {
+      m_id = 0;
+      m_end = mat.nonZeros();
+    } else {
+      m_id = mat.outerIndexPtr()[outer];
+      if (mat.isCompressed())
+        m_end = mat.outerIndexPtr()[outer + 1];
+      else
+        m_end = m_id + mat.innerNonZeroPtr()[outer];
+    }
+  }
+
+  explicit InnerIterator(const SparseCompressedBase& mat) : InnerIterator(mat, Index(0)) {
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  }
+
+  explicit InnerIterator(const internal::CompressedStorage<Scalar, StorageIndex>& data)
+      : m_values(data.valuePtr()), m_indices(data.indexPtr()), m_outer(0), m_id(0), m_end(data.size()) {
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  }
+
+  inline InnerIterator& operator++() {
+    m_id++;
+    return *this;
+  }
+  inline InnerIterator& operator+=(Index i) {
+    m_id += i;
+    return *this;
+  }
+
+  inline InnerIterator operator+(Index i) {
+    InnerIterator result = *this;
+    result += i;
+    return result;
+  }
+
+  inline const Scalar& value() const { return m_values[m_id]; }
+  inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); }
+
+  inline StorageIndex index() const { return m_indices[m_id]; }
+  inline Index outer() const { return m_outer.value(); }
+  inline Index row() const { return IsRowMajor ? m_outer.value() : index(); }
+  inline Index col() const { return IsRowMajor ? index() : m_outer.value(); }
+
+  inline operator bool() const { return (m_id < m_end); }
+
+ protected:
+  const Scalar* m_values;
+  const StorageIndex* m_indices;
+  typedef internal::variable_if_dynamic<Index, Derived::IsVectorAtCompileTime ? 0 : Dynamic> OuterType;
+  const OuterType m_outer;
+  Index m_id;
+  Index m_end;
+
+ private:
+  // If you get here, then you're not using the right InnerIterator type, e.g.:
+  //   SparseMatrix<double,RowMajor> A;
+  //   SparseMatrix<double>::InnerIterator it(A,0);
+  template <typename T>
+  InnerIterator(const SparseMatrixBase<T>&, Index outer);
+};
+
+template <typename Derived>
+class SparseCompressedBase<Derived>::ReverseInnerIterator {
+ public:
+  ReverseInnerIterator(const SparseCompressedBase& mat, Index outer)
+      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer) {
+    if (Derived::IsVectorAtCompileTime && mat.outerIndexPtr() == 0) {
+      m_start = 0;
+      m_id = mat.nonZeros();
+    } else {
+      m_start = mat.outerIndexPtr()[outer];
+      if (mat.isCompressed())
+        m_id = mat.outerIndexPtr()[outer + 1];
+      else
+        m_id = m_start + mat.innerNonZeroPtr()[outer];
+    }
+  }
+
+  explicit ReverseInnerIterator(const SparseCompressedBase& mat)
+      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(0), m_start(0), m_id(mat.nonZeros()) {
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  }
+
+  explicit ReverseInnerIterator(const internal::CompressedStorage<Scalar, StorageIndex>& data)
+      : m_values(data.valuePtr()), m_indices(data.indexPtr()), m_outer(0), m_start(0), m_id(data.size()) {
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+  }
+
+  inline ReverseInnerIterator& operator--() {
+    --m_id;
+    return *this;
+  }
+  inline ReverseInnerIterator& operator-=(Index i) {
+    m_id -= i;
+    return *this;
+  }
+
+  inline ReverseInnerIterator operator-(Index i) {
+    ReverseInnerIterator result = *this;
+    result -= i;
+    return result;
+  }
+
+  inline const Scalar& value() const { return m_values[m_id - 1]; }
+  inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id - 1]); }
+
+  inline StorageIndex index() const { return m_indices[m_id - 1]; }
+  inline Index outer() const { return m_outer.value(); }
+  inline Index row() const { return IsRowMajor ? m_outer.value() : index(); }
+  inline Index col() const { return IsRowMajor ? index() : m_outer.value(); }
+
+  inline operator bool() const { return (m_id > m_start); }
+
+ protected:
+  const Scalar* m_values;
+  const StorageIndex* m_indices;
+  typedef internal::variable_if_dynamic<Index, Derived::IsVectorAtCompileTime ? 0 : Dynamic> OuterType;
+  const OuterType m_outer;
+  Index m_start;
+  Index m_id;
+};
+
+namespace internal {
+
+// modified from https://artificial-mind.net/blog/2020/11/28/std-sort-multiple-ranges
+
+template <typename Scalar, typename StorageIndex>
+class StorageVal;
+template <typename Scalar, typename StorageIndex>
+class StorageRef;
+template <typename Scalar, typename StorageIndex>
+class CompressedStorageIterator;
+
+// class to hold an index/value pair
+template <typename Scalar, typename StorageIndex>
+class StorageVal {
+ public:
+  StorageVal(const StorageIndex& innerIndex, const Scalar& value) : m_innerIndex(innerIndex), m_value(value) {}
+  StorageVal(const StorageVal& other) : m_innerIndex(other.m_innerIndex), m_value(other.m_value) {}
+  StorageVal(StorageVal&& other) = default;
+
+  inline const StorageIndex& key() const { return m_innerIndex; }
+  inline StorageIndex& key() { return m_innerIndex; }
+  inline const Scalar& value() const { return m_value; }
+  inline Scalar& value() { return m_value; }
+
+  // enables StorageVal to be compared with respect to any type that is convertible to StorageIndex
+  inline operator StorageIndex() const { return m_innerIndex; }
+
+ protected:
+  StorageIndex m_innerIndex;
+  Scalar m_value;
+
+ private:
+  StorageVal() = delete;
+};
+// class to hold an index/value iterator pair
+// used to define assignment, swap, and comparison operators for CompressedStorageIterator
+template <typename Scalar, typename StorageIndex>
+class StorageRef {
+ public:
+  using value_type = StorageVal<Scalar, StorageIndex>;
+
+  // StorageRef Needs to be move-able for sort on macos.
+  StorageRef(StorageRef&& other) = default;
+
+  inline StorageRef& operator=(const StorageRef& other) {
+    key() = other.key();
+    value() = other.value();
+    return *this;
+  }
+  inline StorageRef& operator=(const value_type& other) {
+    key() = other.key();
+    value() = other.value();
+    return *this;
+  }
+  inline operator value_type() const { return value_type(key(), value()); }
+  inline friend void swap(const StorageRef& a, const StorageRef& b) {
+    std::iter_swap(a.keyPtr(), b.keyPtr());
+    std::iter_swap(a.valuePtr(), b.valuePtr());
+  }
+
+  inline const StorageIndex& key() const { return *m_innerIndexIterator; }
+  inline StorageIndex& key() { return *m_innerIndexIterator; }
+  inline const Scalar& value() const { return *m_valueIterator; }
+  inline Scalar& value() { return *m_valueIterator; }
+  inline StorageIndex* keyPtr() const { return m_innerIndexIterator; }
+  inline Scalar* valuePtr() const { return m_valueIterator; }
+
+  // enables StorageRef to be compared with respect to any type that is convertible to StorageIndex
+  inline operator StorageIndex() const { return *m_innerIndexIterator; }
+
+ protected:
+  StorageIndex* m_innerIndexIterator;
+  Scalar* m_valueIterator;
+
+ private:
+  StorageRef() = delete;
+  // these constructors are called by the CompressedStorageIterator constructors for convenience only
+  StorageRef(StorageIndex* innerIndexIterator, Scalar* valueIterator)
+      : m_innerIndexIterator(innerIndexIterator), m_valueIterator(valueIterator) {}
+  StorageRef(const StorageRef& other)
+      : m_innerIndexIterator(other.m_innerIndexIterator), m_valueIterator(other.m_valueIterator) {}
+
+  friend class CompressedStorageIterator<Scalar, StorageIndex>;
+};
+
+// STL-compatible iterator class that operates on inner indices and values
+template <typename Scalar, typename StorageIndex>
+class CompressedStorageIterator {
+ public:
+  using iterator_category = std::random_access_iterator_tag;
+  using reference = StorageRef<Scalar, StorageIndex>;
+  using difference_type = Index;
+  using value_type = typename reference::value_type;
+  using pointer = value_type*;
+
+  CompressedStorageIterator() = delete;
+  CompressedStorageIterator(difference_type index, StorageIndex* innerIndexPtr, Scalar* valuePtr)
+      : m_index(index), m_data(innerIndexPtr, valuePtr) {}
+  CompressedStorageIterator(difference_type index, reference data) : m_index(index), m_data(data) {}
+  CompressedStorageIterator(const CompressedStorageIterator& other) : m_index(other.m_index), m_data(other.m_data) {}
+  CompressedStorageIterator(CompressedStorageIterator&& other) = default;
+  inline CompressedStorageIterator& operator=(const CompressedStorageIterator& other) {
+    m_index = other.m_index;
+    m_data = other.m_data;
+    return *this;
+  }
+
+  inline CompressedStorageIterator operator+(difference_type offset) const {
+    return CompressedStorageIterator(m_index + offset, m_data);
+  }
+  inline CompressedStorageIterator operator-(difference_type offset) const {
+    return CompressedStorageIterator(m_index - offset, m_data);
+  }
+  inline difference_type operator-(const CompressedStorageIterator& other) const { return m_index - other.m_index; }
+  inline CompressedStorageIterator& operator++() {
+    ++m_index;
+    return *this;
+  }
+  inline CompressedStorageIterator& operator--() {
+    --m_index;
+    return *this;
+  }
+  inline CompressedStorageIterator& operator+=(difference_type offset) {
+    m_index += offset;
+    return *this;
+  }
+  inline CompressedStorageIterator& operator-=(difference_type offset) {
+    m_index -= offset;
+    return *this;
+  }
+  inline reference operator*() const { return reference(m_data.keyPtr() + m_index, m_data.valuePtr() + m_index); }
+  inline reference operator[](int index) { return *(*this + index); }
+
+#define MAKE_COMP(OP) \
+  inline bool operator OP(const CompressedStorageIterator& other) const { return m_index OP other.m_index; }
+  MAKE_COMP(<)
+  MAKE_COMP(>)
+  MAKE_COMP(>=)
+  MAKE_COMP(<=)
+  MAKE_COMP(!=)
+  MAKE_COMP(==)
+#undef MAKE_COMP
+
+ protected:
+  difference_type m_index;
+  reference m_data;
+};
+
+template <typename Derived, class Comp, bool IsVector>
+struct inner_sort_impl {
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::StorageIndex StorageIndex;
+  static inline void run(SparseCompressedBase<Derived>& obj, Index begin, Index end) {
+    const bool is_compressed = obj.isCompressed();
+    for (Index outer = begin; outer < end; outer++) {
+      Index begin_offset = obj.outerIndexPtr()[outer];
+      Index end_offset = is_compressed ? obj.outerIndexPtr()[outer + 1] : (begin_offset + obj.innerNonZeroPtr()[outer]);
+      CompressedStorageIterator<Scalar, StorageIndex> begin_it(begin_offset, obj.innerIndexPtr(), obj.valuePtr());
+      CompressedStorageIterator<Scalar, StorageIndex> end_it(end_offset, obj.innerIndexPtr(), obj.valuePtr());
+      std::sort(begin_it, end_it, Comp());
+    }
+  }
+  static inline Index check(const SparseCompressedBase<Derived>& obj, Index begin, Index end) {
+    const bool is_compressed = obj.isCompressed();
+    for (Index outer = begin; outer < end; outer++) {
+      Index begin_offset = obj.outerIndexPtr()[outer];
+      Index end_offset = is_compressed ? obj.outerIndexPtr()[outer + 1] : (begin_offset + obj.innerNonZeroPtr()[outer]);
+      const StorageIndex* begin_it = obj.innerIndexPtr() + begin_offset;
+      const StorageIndex* end_it = obj.innerIndexPtr() + end_offset;
+      bool is_sorted = std::is_sorted(begin_it, end_it, Comp());
+      if (!is_sorted) return outer;
+    }
+    return end;
+  }
+};
+template <typename Derived, class Comp>
+struct inner_sort_impl<Derived, Comp, true> {
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::StorageIndex StorageIndex;
+  static inline void run(SparseCompressedBase<Derived>& obj, Index, Index) {
+    Index begin_offset = 0;
+    Index end_offset = obj.nonZeros();
+    CompressedStorageIterator<Scalar, StorageIndex> begin_it(begin_offset, obj.innerIndexPtr(), obj.valuePtr());
+    CompressedStorageIterator<Scalar, StorageIndex> end_it(end_offset, obj.innerIndexPtr(), obj.valuePtr());
+    std::sort(begin_it, end_it, Comp());
+  }
+  static inline Index check(const SparseCompressedBase<Derived>& obj, Index, Index) {
+    Index begin_offset = 0;
+    Index end_offset = obj.nonZeros();
+    const StorageIndex* begin_it = obj.innerIndexPtr() + begin_offset;
+    const StorageIndex* end_it = obj.innerIndexPtr() + end_offset;
+    return std::is_sorted(begin_it, end_it, Comp()) ? 1 : 0;
+  }
+};
+
+template <typename Derived>
+struct evaluator<SparseCompressedBase<Derived>> : evaluator_base<Derived> {
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::InnerIterator InnerIterator;
+
+  enum { CoeffReadCost = NumTraits<Scalar>::ReadCost, Flags = Derived::Flags };
+
+  evaluator() : m_matrix(0), m_zero(0) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); }
+  explicit evaluator(const Derived& mat) : m_matrix(&mat), m_zero(0) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); }
+
+  inline Index nonZerosEstimate() const { return m_matrix->nonZeros(); }
+
+  operator Derived&() { return m_matrix->const_cast_derived(); }
+  operator const Derived&() const { return *m_matrix; }
+
+  typedef typename DenseCoeffsBase<Derived, ReadOnlyAccessors>::CoeffReturnType CoeffReturnType;
+  const Scalar& coeff(Index row, Index col) const {
+    Index p = find(row, col);
+
+    if (p == Dynamic)
+      return m_zero;
+    else
+      return m_matrix->const_cast_derived().valuePtr()[p];
+  }
+
+  Scalar& coeffRef(Index row, Index col) {
+    Index p = find(row, col);
+    eigen_assert(p != Dynamic && "written coefficient does not exist");
+    return m_matrix->const_cast_derived().valuePtr()[p];
+  }
+
+ protected:
+  Index find(Index row, Index col) const {
+    internal::LowerBoundIndex p = m_matrix->lower_bound(row, col);
+    return p.found ? p.value : Dynamic;
+  }
+
+  const Derived* m_matrix;
+  const Scalar m_zero;
+};
+
+}  // namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPARSE_COMPRESSED_BASE_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/inst/include/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
index 4ca91283..7fcf2c21 100644
--- a/inst/include/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
+++ b/inst/include/Eigen/src/SparseCore/SparseCwiseBinaryOp.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,7 +10,10 @@
 #ifndef EIGEN_SPARSE_CWISE_BINARY_OP_H
 #define EIGEN_SPARSE_CWISE_BINARY_OP_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 // Here we have to handle 3 cases:
 //  1 - sparse op dense
@@ -28,152 +31,365 @@ namespace Eigen {
 //                         generic      sparse
 //  4 - dense op dense     product      dense
 //                         generic      dense
-
-namespace internal {
-
-template<> struct promote_storage_type<Dense,Sparse>
-{ typedef Sparse ret; };
-
-template<> struct promote_storage_type<Sparse,Dense>
-{ typedef Sparse ret; };
-
-template<typename BinaryOp, typename Lhs, typename Rhs, typename Derived,
-  typename _LhsStorageMode = typename traits<Lhs>::StorageKind,
-  typename _RhsStorageMode = typename traits<Rhs>::StorageKind>
-class sparse_cwise_binary_op_inner_iterator_selector;
-
-} // end namespace internal
-
-template<typename BinaryOp, typename Lhs, typename Rhs>
-class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Sparse>
-  : public SparseMatrixBase<CwiseBinaryOp<BinaryOp, Lhs, Rhs> >
-{
-  public:
-    class InnerIterator;
-    class ReverseInnerIterator;
-    typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> Derived;
-    EIGEN_SPARSE_PUBLIC_INTERFACE(Derived)
-    CwiseBinaryOpImpl()
-    {
-      typedef typename internal::traits<Lhs>::StorageKind LhsStorageKind;
-      typedef typename internal::traits<Rhs>::StorageKind RhsStorageKind;
-      EIGEN_STATIC_ASSERT((
-                (!internal::is_same<LhsStorageKind,RhsStorageKind>::value)
-            ||  ((Lhs::Flags&RowMajorBit) == (Rhs::Flags&RowMajorBit))),
-            THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH);
-    }
-};
-
-template<typename BinaryOp, typename Lhs, typename Rhs>
-class CwiseBinaryOpImpl<BinaryOp,Lhs,Rhs,Sparse>::InnerIterator
-  : public internal::sparse_cwise_binary_op_inner_iterator_selector<BinaryOp,Lhs,Rhs,typename CwiseBinaryOpImpl<BinaryOp,Lhs,Rhs,Sparse>::InnerIterator>
-{
-  public:
-    typedef typename Lhs::Index Index;
-    typedef internal::sparse_cwise_binary_op_inner_iterator_selector<
-      BinaryOp,Lhs,Rhs, InnerIterator> Base;
-
-    // NOTE: we have to prefix Index by "typename Lhs::" to avoid an ICE with VC11
-    EIGEN_STRONG_INLINE InnerIterator(const CwiseBinaryOpImpl& binOp, typename Lhs::Index outer)
-      : Base(binOp.derived(),outer)
-    {}
+//
+// TODO to ease compiler job, we could specialize product/quotient with a scalar
+//      and fallback to cwise-unary evaluator using bind1st_op and bind2nd_op.
+
+template <typename BinaryOp, typename Lhs, typename Rhs>
+class CwiseBinaryOpImpl<BinaryOp, Lhs, Rhs, Sparse> : public SparseMatrixBase<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > {
+ public:
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> Derived;
+  typedef SparseMatrixBase<Derived> Base;
+  EIGEN_SPARSE_PUBLIC_INTERFACE(Derived)
+  EIGEN_STATIC_ASSERT(((!internal::is_same<typename internal::traits<Lhs>::StorageKind,
+                                           typename internal::traits<Rhs>::StorageKind>::value) ||
+                       ((internal::evaluator<Lhs>::Flags & RowMajorBit) ==
+                        (internal::evaluator<Rhs>::Flags & RowMajorBit))),
+                      THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH)
 };
 
-/***************************************************************************
-* Implementation of inner-iterators
-***************************************************************************/
-
-// template<typename T> struct internal::func_is_conjunction { enum { ret = false }; };
-// template<typename T> struct internal::func_is_conjunction<internal::scalar_product_op<T> > { enum { ret = true }; };
-
-// TODO generalize the internal::scalar_product_op specialization to all conjunctions if any !
-
 namespace internal {
 
-// sparse - sparse  (generic)
-template<typename BinaryOp, typename Lhs, typename Rhs, typename Derived>
-class sparse_cwise_binary_op_inner_iterator_selector<BinaryOp, Lhs, Rhs, Derived, Sparse, Sparse>
-{
-    typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> CwiseBinaryXpr;
-    typedef typename traits<CwiseBinaryXpr>::Scalar Scalar;
-    typedef typename traits<CwiseBinaryXpr>::_LhsNested _LhsNested;
-    typedef typename traits<CwiseBinaryXpr>::_RhsNested _RhsNested;
-    typedef typename _LhsNested::InnerIterator LhsIterator;
-    typedef typename _RhsNested::InnerIterator RhsIterator;
-    typedef typename Lhs::Index Index;
-
-  public:
-
-    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
-      : m_lhsIter(xpr.lhs(),outer), m_rhsIter(xpr.rhs(),outer), m_functor(xpr.functor())
-    {
+// The default evaluator performs an "arithmetic" operation on two input arrays.
+// Given input arrays 'lhs' and 'rhs' and binary functor 'func',
+// the sparse destination array 'dst' is evaluated as follows:
+//   if lhs(i,j) and rhs(i,j) are present, dst(i,j) = func(lhs(i,j), rhs(i,j))
+//   if lhs(i,j) is present and rhs(i,j) is null, dst(i,j) = func(lhs(i,j), 0)
+//   if lhs(i,j) is null and rhs(i,j) is present, dst(i,j) = func(0, rhs(i,j))
+
+// Generic "sparse OP sparse"
+template <typename XprType>
+struct binary_sparse_evaluator;
+
+template <typename BinaryOp, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IteratorBased, IteratorBased>
+    : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > {
+ protected:
+  typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
+  typedef typename evaluator<Rhs>::InnerIterator RhsIterator;
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef typename XprType::StorageIndex StorageIndex;
+
+ public:
+  class InnerIterator {
+   public:
+    EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
+        : m_lhsIter(aEval.m_lhsImpl, outer),
+          m_rhsIter(aEval.m_rhsImpl, outer),
+          m_functor(aEval.m_functor),
+          m_value(Scalar(0)) {
       this->operator++();
     }
 
-    EIGEN_STRONG_INLINE Derived& operator++()
-    {
-      if (m_lhsIter && m_rhsIter && (m_lhsIter.index() == m_rhsIter.index()))
-      {
+    EIGEN_STRONG_INLINE InnerIterator& operator++() {
+      if (m_lhsIter && m_rhsIter && (m_lhsIter.index() == m_rhsIter.index())) {
         m_id = m_lhsIter.index();
         m_value = m_functor(m_lhsIter.value(), m_rhsIter.value());
         ++m_lhsIter;
         ++m_rhsIter;
-      }
-      else if (m_lhsIter && (!m_rhsIter || (m_lhsIter.index() < m_rhsIter.index())))
-      {
+      } else if (m_lhsIter && (!m_rhsIter || (m_lhsIter.index() < m_rhsIter.index()))) {
         m_id = m_lhsIter.index();
         m_value = m_functor(m_lhsIter.value(), Scalar(0));
         ++m_lhsIter;
-      }
-      else if (m_rhsIter && (!m_lhsIter || (m_lhsIter.index() > m_rhsIter.index())))
-      {
+      } else if (m_rhsIter && (!m_lhsIter || (m_lhsIter.index() > m_rhsIter.index()))) {
         m_id = m_rhsIter.index();
         m_value = m_functor(Scalar(0), m_rhsIter.value());
         ++m_rhsIter;
-      }
-      else
-      {
-        m_value = 0; // this is to avoid a compilation warning
+      } else {
         m_id = -1;
       }
-      return *static_cast<Derived*>(this);
+      return *this;
     }
 
     EIGEN_STRONG_INLINE Scalar value() const { return m_value; }
 
-    EIGEN_STRONG_INLINE Index index() const { return m_id; }
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
+    EIGEN_STRONG_INLINE Index outer() const { return m_lhsIter.outer(); }
     EIGEN_STRONG_INLINE Index row() const { return Lhs::IsRowMajor ? m_lhsIter.row() : index(); }
     EIGEN_STRONG_INLINE Index col() const { return Lhs::IsRowMajor ? index() : m_lhsIter.col(); }
 
-    EIGEN_STRONG_INLINE operator bool() const { return m_id>=0; }
+    EIGEN_STRONG_INLINE operator bool() const { return m_id >= 0; }
 
-  protected:
+   protected:
     LhsIterator m_lhsIter;
     RhsIterator m_rhsIter;
     const BinaryOp& m_functor;
     Scalar m_value;
-    Index m_id;
+    StorageIndex m_id;
+  };
+
+  enum {
+    CoeffReadCost =
+        int(evaluator<Lhs>::CoeffReadCost) + int(evaluator<Rhs>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
+    Flags = XprType::Flags
+  };
+
+  explicit binary_evaluator(const XprType& xpr) : m_functor(xpr.functor()), m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  inline Index nonZerosEstimate() const { return m_lhsImpl.nonZerosEstimate() + m_rhsImpl.nonZerosEstimate(); }
+
+ protected:
+  const BinaryOp m_functor;
+  evaluator<Lhs> m_lhsImpl;
+  evaluator<Rhs> m_rhsImpl;
+};
+
+// dense op sparse
+template <typename BinaryOp, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IndexBased, IteratorBased>
+    : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > {
+ protected:
+  typedef typename evaluator<Rhs>::InnerIterator RhsIterator;
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef typename XprType::StorageIndex StorageIndex;
+
+ public:
+  class InnerIterator {
+    enum { IsRowMajor = (int(Rhs::Flags) & RowMajorBit) == RowMajorBit };
+
+   public:
+    EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
+        : m_lhsEval(aEval.m_lhsImpl),
+          m_rhsIter(aEval.m_rhsImpl, outer),
+          m_functor(aEval.m_functor),
+          m_value(0),
+          m_id(-1),
+          m_innerSize(aEval.m_expr.rhs().innerSize()) {
+      this->operator++();
+    }
+
+    EIGEN_STRONG_INLINE InnerIterator& operator++() {
+      ++m_id;
+      if (m_id < m_innerSize) {
+        Scalar lhsVal = m_lhsEval.coeff(IsRowMajor ? m_rhsIter.outer() : m_id, IsRowMajor ? m_id : m_rhsIter.outer());
+        if (m_rhsIter && m_rhsIter.index() == m_id) {
+          m_value = m_functor(lhsVal, m_rhsIter.value());
+          ++m_rhsIter;
+        } else
+          m_value = m_functor(lhsVal, Scalar(0));
+      }
+
+      return *this;
+    }
+
+    EIGEN_STRONG_INLINE Scalar value() const {
+      eigen_internal_assert(m_id < m_innerSize);
+      return m_value;
+    }
+
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
+    EIGEN_STRONG_INLINE Index outer() const { return m_rhsIter.outer(); }
+    EIGEN_STRONG_INLINE Index row() const { return IsRowMajor ? m_rhsIter.outer() : m_id; }
+    EIGEN_STRONG_INLINE Index col() const { return IsRowMajor ? m_id : m_rhsIter.outer(); }
+
+    EIGEN_STRONG_INLINE operator bool() const { return m_id < m_innerSize; }
+
+   protected:
+    const evaluator<Lhs>& m_lhsEval;
+    RhsIterator m_rhsIter;
+    const BinaryOp& m_functor;
+    Scalar m_value;
+    StorageIndex m_id;
+    StorageIndex m_innerSize;
+  };
+
+  enum {
+    CoeffReadCost =
+        int(evaluator<Lhs>::CoeffReadCost) + int(evaluator<Rhs>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
+    Flags = XprType::Flags
+  };
+
+  explicit binary_evaluator(const XprType& xpr)
+      : m_functor(xpr.functor()), m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()), m_expr(xpr) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  inline Index nonZerosEstimate() const { return m_expr.size(); }
+
+ protected:
+  const BinaryOp m_functor;
+  evaluator<Lhs> m_lhsImpl;
+  evaluator<Rhs> m_rhsImpl;
+  const XprType& m_expr;
 };
 
-// sparse - sparse  (product)
-template<typename T, typename Lhs, typename Rhs, typename Derived>
-class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs, Rhs, Derived, Sparse, Sparse>
-{
-    typedef scalar_product_op<T> BinaryFunc;
-    typedef CwiseBinaryOp<BinaryFunc, Lhs, Rhs> CwiseBinaryXpr;
-    typedef typename CwiseBinaryXpr::Scalar Scalar;
-    typedef typename traits<CwiseBinaryXpr>::_LhsNested _LhsNested;
-    typedef typename _LhsNested::InnerIterator LhsIterator;
-    typedef typename traits<CwiseBinaryXpr>::_RhsNested _RhsNested;
-    typedef typename _RhsNested::InnerIterator RhsIterator;
-    typedef typename Lhs::Index Index;
-  public:
-
-    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
-      : m_lhsIter(xpr.lhs(),outer), m_rhsIter(xpr.rhs(),outer), m_functor(xpr.functor())
-    {
-      while (m_lhsIter && m_rhsIter && (m_lhsIter.index() != m_rhsIter.index()))
-      {
+// sparse op dense
+template <typename BinaryOp, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<BinaryOp, Lhs, Rhs>, IteratorBased, IndexBased>
+    : evaluator_base<CwiseBinaryOp<BinaryOp, Lhs, Rhs> > {
+ protected:
+  typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
+  typedef CwiseBinaryOp<BinaryOp, Lhs, Rhs> XprType;
+  typedef typename traits<XprType>::Scalar Scalar;
+  typedef typename XprType::StorageIndex StorageIndex;
+
+ public:
+  class InnerIterator {
+    enum { IsRowMajor = (int(Lhs::Flags) & RowMajorBit) == RowMajorBit };
+
+   public:
+    EIGEN_STRONG_INLINE InnerIterator(const binary_evaluator& aEval, Index outer)
+        : m_lhsIter(aEval.m_lhsImpl, outer),
+          m_rhsEval(aEval.m_rhsImpl),
+          m_functor(aEval.m_functor),
+          m_value(0),
+          m_id(-1),
+          m_innerSize(aEval.m_expr.lhs().innerSize()) {
+      this->operator++();
+    }
+
+    EIGEN_STRONG_INLINE InnerIterator& operator++() {
+      ++m_id;
+      if (m_id < m_innerSize) {
+        Scalar rhsVal = m_rhsEval.coeff(IsRowMajor ? m_lhsIter.outer() : m_id, IsRowMajor ? m_id : m_lhsIter.outer());
+        if (m_lhsIter && m_lhsIter.index() == m_id) {
+          m_value = m_functor(m_lhsIter.value(), rhsVal);
+          ++m_lhsIter;
+        } else
+          m_value = m_functor(Scalar(0), rhsVal);
+      }
+
+      return *this;
+    }
+
+    EIGEN_STRONG_INLINE Scalar value() const {
+      eigen_internal_assert(m_id < m_innerSize);
+      return m_value;
+    }
+
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
+    EIGEN_STRONG_INLINE Index outer() const { return m_lhsIter.outer(); }
+    EIGEN_STRONG_INLINE Index row() const { return IsRowMajor ? m_lhsIter.outer() : m_id; }
+    EIGEN_STRONG_INLINE Index col() const { return IsRowMajor ? m_id : m_lhsIter.outer(); }
+
+    EIGEN_STRONG_INLINE operator bool() const { return m_id < m_innerSize; }
+
+   protected:
+    LhsIterator m_lhsIter;
+    const evaluator<Rhs>& m_rhsEval;
+    const BinaryOp& m_functor;
+    Scalar m_value;
+    StorageIndex m_id;
+    StorageIndex m_innerSize;
+  };
+
+  enum {
+    CoeffReadCost =
+        int(evaluator<Lhs>::CoeffReadCost) + int(evaluator<Rhs>::CoeffReadCost) + int(functor_traits<BinaryOp>::Cost),
+    Flags = XprType::Flags
+  };
+
+  explicit binary_evaluator(const XprType& xpr)
+      : m_functor(xpr.functor()), m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()), m_expr(xpr) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  inline Index nonZerosEstimate() const { return m_expr.size(); }
+
+ protected:
+  const BinaryOp m_functor;
+  evaluator<Lhs> m_lhsImpl;
+  evaluator<Rhs> m_rhsImpl;
+  const XprType& m_expr;
+};
+
+template <typename T, typename LhsKind = typename evaluator_traits<typename T::Lhs>::Kind,
+          typename RhsKind = typename evaluator_traits<typename T::Rhs>::Kind,
+          typename LhsScalar = typename traits<typename T::Lhs>::Scalar,
+          typename RhsScalar = typename traits<typename T::Rhs>::Scalar>
+struct sparse_conjunction_evaluator;
+
+// "sparse .* sparse"
+template <typename T1, typename T2, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T1, T2>, Lhs, Rhs>, IteratorBased, IteratorBased>
+    : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_product_op<T1, T2>, Lhs, Rhs> > {
+  typedef CwiseBinaryOp<scalar_product_op<T1, T2>, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+// "dense .* sparse"
+template <typename T1, typename T2, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T1, T2>, Lhs, Rhs>, IndexBased, IteratorBased>
+    : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_product_op<T1, T2>, Lhs, Rhs> > {
+  typedef CwiseBinaryOp<scalar_product_op<T1, T2>, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+// "sparse .* dense"
+template <typename T1, typename T2, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_product_op<T1, T2>, Lhs, Rhs>, IteratorBased, IndexBased>
+    : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_product_op<T1, T2>, Lhs, Rhs> > {
+  typedef CwiseBinaryOp<scalar_product_op<T1, T2>, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+// "sparse ./ dense"
+template <typename T1, typename T2, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_quotient_op<T1, T2>, Lhs, Rhs>, IteratorBased, IndexBased>
+    : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_quotient_op<T1, T2>, Lhs, Rhs> > {
+  typedef CwiseBinaryOp<scalar_quotient_op<T1, T2>, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+// "sparse && sparse"
+template <typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op<bool>, Lhs, Rhs>, IteratorBased, IteratorBased>
+    : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_boolean_and_op<bool>, Lhs, Rhs> > {
+  typedef CwiseBinaryOp<scalar_boolean_and_op<bool>, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+// "dense && sparse"
+template <typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op<bool>, Lhs, Rhs>, IndexBased, IteratorBased>
+    : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_boolean_and_op<bool>, Lhs, Rhs> > {
+  typedef CwiseBinaryOp<scalar_boolean_and_op<bool>, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+// "sparse && dense"
+template <typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_boolean_and_op<bool>, Lhs, Rhs>, IteratorBased, IndexBased>
+    : sparse_conjunction_evaluator<CwiseBinaryOp<scalar_boolean_and_op<bool>, Lhs, Rhs> > {
+  typedef CwiseBinaryOp<scalar_boolean_and_op<bool>, Lhs, Rhs> XprType;
+  typedef sparse_conjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+
+// The conjunction "^" evaluator performs a logical "and" or set "intersection" operation on two input arrays.
+// Given input arrays 'lhs' and 'rhs' and binary functor 'func',
+// the sparse destination array 'dst' is evaluated as follows:
+//   if lhs(i,j) and rhs(i,j) are present, dst(i,j) = func(lhs(i,j), rhs(i,j))
+//   if lhs(i,j) is present and rhs(i,j) is null, dst(i,j) is null
+//   if lhs(i,j) is null and rhs(i,j) is present, dst(i,j) is null
+
+// "sparse ^ sparse"
+template <typename XprType>
+struct sparse_conjunction_evaluator<XprType, IteratorBased, IteratorBased> : evaluator_base<XprType> {
+ protected:
+  typedef typename XprType::Functor BinaryOp;
+  typedef typename XprType::Lhs LhsArg;
+  typedef typename XprType::Rhs RhsArg;
+  typedef typename evaluator<LhsArg>::InnerIterator LhsIterator;
+  typedef typename evaluator<RhsArg>::InnerIterator RhsIterator;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename traits<XprType>::Scalar Scalar;
+
+ public:
+  class InnerIterator {
+   public:
+    EIGEN_STRONG_INLINE InnerIterator(const sparse_conjunction_evaluator& aEval, Index outer)
+        : m_lhsIter(aEval.m_lhsImpl, outer), m_rhsIter(aEval.m_rhsImpl, outer), m_functor(aEval.m_functor) {
+      while (m_lhsIter && m_rhsIter && (m_lhsIter.index() != m_rhsIter.index())) {
         if (m_lhsIter.index() < m_rhsIter.index())
           ++m_lhsIter;
         else
@@ -181,145 +397,542 @@ class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs,
       }
     }
 
-    EIGEN_STRONG_INLINE Derived& operator++()
-    {
+    EIGEN_STRONG_INLINE InnerIterator& operator++() {
       ++m_lhsIter;
       ++m_rhsIter;
-      while (m_lhsIter && m_rhsIter && (m_lhsIter.index() != m_rhsIter.index()))
-      {
+      while (m_lhsIter && m_rhsIter && (m_lhsIter.index() != m_rhsIter.index())) {
         if (m_lhsIter.index() < m_rhsIter.index())
           ++m_lhsIter;
         else
           ++m_rhsIter;
       }
-      return *static_cast<Derived*>(this);
+      return *this;
     }
 
     EIGEN_STRONG_INLINE Scalar value() const { return m_functor(m_lhsIter.value(), m_rhsIter.value()); }
 
-    EIGEN_STRONG_INLINE Index index() const { return m_lhsIter.index(); }
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_lhsIter.index(); }
+    EIGEN_STRONG_INLINE Index outer() const { return m_lhsIter.outer(); }
     EIGEN_STRONG_INLINE Index row() const { return m_lhsIter.row(); }
     EIGEN_STRONG_INLINE Index col() const { return m_lhsIter.col(); }
 
     EIGEN_STRONG_INLINE operator bool() const { return (m_lhsIter && m_rhsIter); }
 
-  protected:
+   protected:
     LhsIterator m_lhsIter;
     RhsIterator m_rhsIter;
-    const BinaryFunc& m_functor;
+    const BinaryOp& m_functor;
+  };
+
+  enum {
+    CoeffReadCost = int(evaluator<LhsArg>::CoeffReadCost) + int(evaluator<RhsArg>::CoeffReadCost) +
+                    int(functor_traits<BinaryOp>::Cost),
+    Flags = XprType::Flags
+  };
+
+  explicit sparse_conjunction_evaluator(const XprType& xpr)
+      : m_functor(xpr.functor()), m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  inline Index nonZerosEstimate() const {
+    return (std::min)(m_lhsImpl.nonZerosEstimate(), m_rhsImpl.nonZerosEstimate());
+  }
+
+ protected:
+  const BinaryOp m_functor;
+  evaluator<LhsArg> m_lhsImpl;
+  evaluator<RhsArg> m_rhsImpl;
+};
+
+// "dense ^ sparse"
+template <typename XprType>
+struct sparse_conjunction_evaluator<XprType, IndexBased, IteratorBased> : evaluator_base<XprType> {
+ protected:
+  typedef typename XprType::Functor BinaryOp;
+  typedef typename XprType::Lhs LhsArg;
+  typedef typename XprType::Rhs RhsArg;
+  typedef evaluator<LhsArg> LhsEvaluator;
+  typedef typename evaluator<RhsArg>::InnerIterator RhsIterator;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename traits<XprType>::Scalar Scalar;
+
+ public:
+  class InnerIterator {
+    enum { IsRowMajor = (int(RhsArg::Flags) & RowMajorBit) == RowMajorBit };
+
+   public:
+    EIGEN_STRONG_INLINE InnerIterator(const sparse_conjunction_evaluator& aEval, Index outer)
+        : m_lhsEval(aEval.m_lhsImpl), m_rhsIter(aEval.m_rhsImpl, outer), m_functor(aEval.m_functor), m_outer(outer) {}
+
+    EIGEN_STRONG_INLINE InnerIterator& operator++() {
+      ++m_rhsIter;
+      return *this;
+    }
+
+    EIGEN_STRONG_INLINE Scalar value() const {
+      return m_functor(
+          m_lhsEval.coeff(IsRowMajor ? m_outer : m_rhsIter.index(), IsRowMajor ? m_rhsIter.index() : m_outer),
+          m_rhsIter.value());
+    }
+
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_rhsIter.index(); }
+    EIGEN_STRONG_INLINE Index outer() const { return m_rhsIter.outer(); }
+    EIGEN_STRONG_INLINE Index row() const { return m_rhsIter.row(); }
+    EIGEN_STRONG_INLINE Index col() const { return m_rhsIter.col(); }
+
+    EIGEN_STRONG_INLINE operator bool() const { return m_rhsIter; }
+
+   protected:
+    const LhsEvaluator& m_lhsEval;
+    RhsIterator m_rhsIter;
+    const BinaryOp& m_functor;
+    const Index m_outer;
+  };
+
+  enum {
+    CoeffReadCost = int(evaluator<LhsArg>::CoeffReadCost) + int(evaluator<RhsArg>::CoeffReadCost) +
+                    int(functor_traits<BinaryOp>::Cost),
+    Flags = XprType::Flags
+  };
+
+  explicit sparse_conjunction_evaluator(const XprType& xpr)
+      : m_functor(xpr.functor()), m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  inline Index nonZerosEstimate() const { return m_rhsImpl.nonZerosEstimate(); }
+
+ protected:
+  const BinaryOp m_functor;
+  evaluator<LhsArg> m_lhsImpl;
+  evaluator<RhsArg> m_rhsImpl;
 };
 
-// sparse - dense  (product)
-template<typename T, typename Lhs, typename Rhs, typename Derived>
-class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs, Rhs, Derived, Sparse, Dense>
-{
-    typedef scalar_product_op<T> BinaryFunc;
-    typedef CwiseBinaryOp<BinaryFunc, Lhs, Rhs> CwiseBinaryXpr;
-    typedef typename CwiseBinaryXpr::Scalar Scalar;
-    typedef typename traits<CwiseBinaryXpr>::_LhsNested _LhsNested;
-    typedef typename traits<CwiseBinaryXpr>::RhsNested RhsNested;
-    typedef typename _LhsNested::InnerIterator LhsIterator;
-    typedef typename Lhs::Index Index;
-    enum { IsRowMajor = (int(Lhs::Flags)&RowMajorBit)==RowMajorBit };
-  public:
-
-    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
-      : m_rhs(xpr.rhs()), m_lhsIter(xpr.lhs(),outer), m_functor(xpr.functor()), m_outer(outer)
-    {}
-
-    EIGEN_STRONG_INLINE Derived& operator++()
-    {
+// "sparse ^ dense"
+template <typename XprType>
+struct sparse_conjunction_evaluator<XprType, IteratorBased, IndexBased> : evaluator_base<XprType> {
+ protected:
+  typedef typename XprType::Functor BinaryOp;
+  typedef typename XprType::Lhs LhsArg;
+  typedef typename XprType::Rhs RhsArg;
+  typedef typename evaluator<LhsArg>::InnerIterator LhsIterator;
+  typedef evaluator<RhsArg> RhsEvaluator;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename traits<XprType>::Scalar Scalar;
+
+ public:
+  class InnerIterator {
+    enum { IsRowMajor = (int(LhsArg::Flags) & RowMajorBit) == RowMajorBit };
+
+   public:
+    EIGEN_STRONG_INLINE InnerIterator(const sparse_conjunction_evaluator& aEval, Index outer)
+        : m_lhsIter(aEval.m_lhsImpl, outer), m_rhsEval(aEval.m_rhsImpl), m_functor(aEval.m_functor), m_outer(outer) {}
+
+    EIGEN_STRONG_INLINE InnerIterator& operator++() {
       ++m_lhsIter;
-      return *static_cast<Derived*>(this);
+      return *this;
     }
 
-    EIGEN_STRONG_INLINE Scalar value() const
-    { return m_functor(m_lhsIter.value(),
-                       m_rhs.coeff(IsRowMajor?m_outer:m_lhsIter.index(),IsRowMajor?m_lhsIter.index():m_outer)); }
+    EIGEN_STRONG_INLINE Scalar value() const {
+      return m_functor(m_lhsIter.value(), m_rhsEval.coeff(IsRowMajor ? m_outer : m_lhsIter.index(),
+                                                          IsRowMajor ? m_lhsIter.index() : m_outer));
+    }
 
-    EIGEN_STRONG_INLINE Index index() const { return m_lhsIter.index(); }
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_lhsIter.index(); }
+    EIGEN_STRONG_INLINE Index outer() const { return m_lhsIter.outer(); }
     EIGEN_STRONG_INLINE Index row() const { return m_lhsIter.row(); }
     EIGEN_STRONG_INLINE Index col() const { return m_lhsIter.col(); }
 
     EIGEN_STRONG_INLINE operator bool() const { return m_lhsIter; }
 
-  protected:
-    RhsNested m_rhs;
+   protected:
     LhsIterator m_lhsIter;
-    const BinaryFunc m_functor;
+    const evaluator<RhsArg>& m_rhsEval;
+    const BinaryOp& m_functor;
     const Index m_outer;
+  };
+
+  enum {
+    CoeffReadCost = int(evaluator<LhsArg>::CoeffReadCost) + int(evaluator<RhsArg>::CoeffReadCost) +
+                    int(functor_traits<BinaryOp>::Cost),
+    Flags = XprType::Flags
+  };
+
+  explicit sparse_conjunction_evaluator(const XprType& xpr)
+      : m_functor(xpr.functor()), m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  inline Index nonZerosEstimate() const { return m_lhsImpl.nonZerosEstimate(); }
+
+ protected:
+  const BinaryOp m_functor;
+  evaluator<LhsArg> m_lhsImpl;
+  evaluator<RhsArg> m_rhsImpl;
 };
 
-// sparse - dense  (product)
-template<typename T, typename Lhs, typename Rhs, typename Derived>
-class sparse_cwise_binary_op_inner_iterator_selector<scalar_product_op<T>, Lhs, Rhs, Derived, Dense, Sparse>
-{
-    typedef scalar_product_op<T> BinaryFunc;
-    typedef CwiseBinaryOp<BinaryFunc, Lhs, Rhs> CwiseBinaryXpr;
-    typedef typename CwiseBinaryXpr::Scalar Scalar;
-    typedef typename traits<CwiseBinaryXpr>::_RhsNested _RhsNested;
-    typedef typename _RhsNested::InnerIterator RhsIterator;
-    typedef typename Lhs::Index Index;
-
-    enum { IsRowMajor = (int(Rhs::Flags)&RowMajorBit)==RowMajorBit };
-  public:
-
-    EIGEN_STRONG_INLINE sparse_cwise_binary_op_inner_iterator_selector(const CwiseBinaryXpr& xpr, Index outer)
-      : m_xpr(xpr), m_rhsIter(xpr.rhs(),outer), m_functor(xpr.functor()), m_outer(outer)
-    {}
-
-    EIGEN_STRONG_INLINE Derived& operator++()
-    {
-      ++m_rhsIter;
-      return *static_cast<Derived*>(this);
+template <typename T, typename LhsKind = typename evaluator_traits<typename T::Lhs>::Kind,
+          typename RhsKind = typename evaluator_traits<typename T::Rhs>::Kind,
+          typename LhsScalar = typename traits<typename T::Lhs>::Scalar,
+          typename RhsScalar = typename traits<typename T::Rhs>::Scalar>
+struct sparse_disjunction_evaluator;
+
+// The disjunction "v" evaluator performs a logical "or" or set "union" operation on two input arrays.
+// Given input arrays 'lhs' and 'rhs' and binary functor 'func',
+// the sparse destination array 'dst' is evaluated as follows:
+//   if lhs(i,j) and rhs(i,j) are present, dst(i,j) = func(lhs(i,j), rhs(i,j))
+//   if lhs(i,j) is present and rhs(i,j) is null, dst(i,j) = lhs(i,j)
+//   if lhs(i,j) is null and rhs(i,j) is present, dst(i,j) = rhs(i,j)
+
+// "sparse v sparse"
+template <typename XprType>
+struct sparse_disjunction_evaluator<XprType, IteratorBased, IteratorBased> : evaluator_base<XprType> {
+ protected:
+  typedef typename XprType::Functor BinaryOp;
+  typedef typename XprType::Lhs LhsArg;
+  typedef typename XprType::Rhs RhsArg;
+  typedef typename evaluator<LhsArg>::InnerIterator LhsIterator;
+  typedef typename evaluator<RhsArg>::InnerIterator RhsIterator;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename traits<XprType>::Scalar Scalar;
+
+ public:
+  class InnerIterator {
+   public:
+    EIGEN_STRONG_INLINE InnerIterator(const sparse_disjunction_evaluator& aEval, Index outer)
+        : m_lhsIter(aEval.m_lhsImpl, outer),
+          m_rhsIter(aEval.m_rhsImpl, outer),
+          m_functor(aEval.m_functor),
+          m_value(Scalar(0)) {
+      this->operator++();
     }
 
-    EIGEN_STRONG_INLINE Scalar value() const
-    { return m_functor(m_xpr.lhs().coeff(IsRowMajor?m_outer:m_rhsIter.index(),IsRowMajor?m_rhsIter.index():m_outer), m_rhsIter.value()); }
+    EIGEN_STRONG_INLINE InnerIterator& operator++() {
+      if (m_lhsIter && m_rhsIter && (m_lhsIter.index() == m_rhsIter.index())) {
+        m_id = m_lhsIter.index();
+        m_value = m_functor(m_lhsIter.value(), m_rhsIter.value());
+        ++m_lhsIter;
+        ++m_rhsIter;
+      } else if (m_lhsIter && (!m_rhsIter || (m_lhsIter.index() < m_rhsIter.index()))) {
+        m_id = m_lhsIter.index();
+        m_value = m_lhsIter.value();
+        ++m_lhsIter;
+      } else if (m_rhsIter && (!m_lhsIter || (m_lhsIter.index() > m_rhsIter.index()))) {
+        m_id = m_rhsIter.index();
+        m_value = m_rhsIter.value();
+        ++m_rhsIter;
+      } else {
+        m_id = -1;
+      }
+      return *this;
+    }
 
-    EIGEN_STRONG_INLINE Index index() const { return m_rhsIter.index(); }
-    EIGEN_STRONG_INLINE Index row() const { return m_rhsIter.row(); }
-    EIGEN_STRONG_INLINE Index col() const { return m_rhsIter.col(); }
+    EIGEN_STRONG_INLINE Scalar value() const { return m_value; }
 
-    EIGEN_STRONG_INLINE operator bool() const { return m_rhsIter; }
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
+    EIGEN_STRONG_INLINE Index outer() const { return m_lhsIter.outer(); }
+    EIGEN_STRONG_INLINE Index row() const { return LhsArg::IsRowMajor ? m_lhsIter.row() : index(); }
+    EIGEN_STRONG_INLINE Index col() const { return LhsArg::IsRowMajor ? index() : m_lhsIter.col(); }
 
-  protected:
-    const CwiseBinaryXpr& m_xpr;
+    EIGEN_STRONG_INLINE operator bool() const { return m_id >= 0; }
+
+   protected:
+    LhsIterator m_lhsIter;
     RhsIterator m_rhsIter;
-    const BinaryFunc& m_functor;
-    const Index m_outer;
+    const BinaryOp& m_functor;
+    Scalar m_value;
+    StorageIndex m_id;
+  };
+
+  enum {
+    CoeffReadCost = int(evaluator<LhsArg>::CoeffReadCost) + int(evaluator<RhsArg>::CoeffReadCost) +
+                    int(functor_traits<BinaryOp>::Cost),
+    Flags = XprType::Flags
+  };
+
+  explicit sparse_disjunction_evaluator(const XprType& xpr)
+      : m_functor(xpr.functor()), m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  inline Index nonZerosEstimate() const { return m_lhsImpl.nonZerosEstimate() + m_rhsImpl.nonZerosEstimate(); }
+
+ protected:
+  const BinaryOp m_functor;
+  evaluator<LhsArg> m_lhsImpl;
+  evaluator<RhsArg> m_rhsImpl;
 };
 
-} // end namespace internal
+// "dense v sparse"
+template <typename XprType>
+struct sparse_disjunction_evaluator<XprType, IndexBased, IteratorBased> : evaluator_base<XprType> {
+ protected:
+  typedef typename XprType::Functor BinaryOp;
+  typedef typename XprType::Lhs LhsArg;
+  typedef typename XprType::Rhs RhsArg;
+  typedef evaluator<LhsArg> LhsEvaluator;
+  typedef typename evaluator<RhsArg>::InnerIterator RhsIterator;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename traits<XprType>::Scalar Scalar;
+
+ public:
+  class InnerIterator {
+    enum { IsRowMajor = (int(RhsArg::Flags) & RowMajorBit) == RowMajorBit };
+
+   public:
+    EIGEN_STRONG_INLINE InnerIterator(const sparse_disjunction_evaluator& aEval, Index outer)
+        : m_lhsEval(aEval.m_lhsImpl),
+          m_rhsIter(aEval.m_rhsImpl, outer),
+          m_functor(aEval.m_functor),
+          m_value(0),
+          m_id(-1),
+          m_innerSize(aEval.m_expr.rhs().innerSize()) {
+      this->operator++();
+    }
+
+    EIGEN_STRONG_INLINE InnerIterator& operator++() {
+      ++m_id;
+      if (m_id < m_innerSize) {
+        Scalar lhsVal = m_lhsEval.coeff(IsRowMajor ? m_rhsIter.outer() : m_id, IsRowMajor ? m_id : m_rhsIter.outer());
+        if (m_rhsIter && m_rhsIter.index() == m_id) {
+          m_value = m_functor(lhsVal, m_rhsIter.value());
+          ++m_rhsIter;
+        } else
+          m_value = lhsVal;
+      }
+
+      return *this;
+    }
+
+    EIGEN_STRONG_INLINE Scalar value() const {
+      eigen_internal_assert(m_id < m_innerSize);
+      return m_value;
+    }
+
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
+    EIGEN_STRONG_INLINE Index outer() const { return m_rhsIter.outer(); }
+    EIGEN_STRONG_INLINE Index row() const { return IsRowMajor ? m_rhsIter.outer() : m_id; }
+    EIGEN_STRONG_INLINE Index col() const { return IsRowMajor ? m_id : m_rhsIter.outer(); }
+
+    EIGEN_STRONG_INLINE operator bool() const { return m_id < m_innerSize; }
+
+   protected:
+    const evaluator<LhsArg>& m_lhsEval;
+    RhsIterator m_rhsIter;
+    const BinaryOp& m_functor;
+    Scalar m_value;
+    StorageIndex m_id;
+    StorageIndex m_innerSize;
+  };
+
+  enum {
+    CoeffReadCost = int(evaluator<LhsArg>::CoeffReadCost) + int(evaluator<RhsArg>::CoeffReadCost) +
+                    int(functor_traits<BinaryOp>::Cost),
+    Flags = XprType::Flags
+  };
+
+  explicit sparse_disjunction_evaluator(const XprType& xpr)
+      : m_functor(xpr.functor()), m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()), m_expr(xpr) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  inline Index nonZerosEstimate() const { return m_expr.size(); }
+
+ protected:
+  const BinaryOp m_functor;
+  evaluator<LhsArg> m_lhsImpl;
+  evaluator<RhsArg> m_rhsImpl;
+  const XprType& m_expr;
+};
+
+// "sparse v dense"
+template <typename XprType>
+struct sparse_disjunction_evaluator<XprType, IteratorBased, IndexBased> : evaluator_base<XprType> {
+ protected:
+  typedef typename XprType::Functor BinaryOp;
+  typedef typename XprType::Lhs LhsArg;
+  typedef typename XprType::Rhs RhsArg;
+  typedef typename evaluator<LhsArg>::InnerIterator LhsIterator;
+  typedef evaluator<RhsArg> RhsEvaluator;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename traits<XprType>::Scalar Scalar;
+
+ public:
+  class InnerIterator {
+    enum { IsRowMajor = (int(LhsArg::Flags) & RowMajorBit) == RowMajorBit };
+
+   public:
+    EIGEN_STRONG_INLINE InnerIterator(const sparse_disjunction_evaluator& aEval, Index outer)
+        : m_lhsIter(aEval.m_lhsImpl, outer),
+          m_rhsEval(aEval.m_rhsImpl),
+          m_functor(aEval.m_functor),
+          m_value(0),
+          m_id(-1),
+          m_innerSize(aEval.m_expr.lhs().innerSize()) {
+      this->operator++();
+    }
+
+    EIGEN_STRONG_INLINE InnerIterator& operator++() {
+      ++m_id;
+      if (m_id < m_innerSize) {
+        Scalar rhsVal = m_rhsEval.coeff(IsRowMajor ? m_lhsIter.outer() : m_id, IsRowMajor ? m_id : m_lhsIter.outer());
+        if (m_lhsIter && m_lhsIter.index() == m_id) {
+          m_value = m_functor(m_lhsIter.value(), rhsVal);
+          ++m_lhsIter;
+        } else
+          m_value = rhsVal;
+      }
+
+      return *this;
+    }
+
+    EIGEN_STRONG_INLINE Scalar value() const {
+      eigen_internal_assert(m_id < m_innerSize);
+      return m_value;
+    }
+
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_id; }
+    EIGEN_STRONG_INLINE Index outer() const { return m_lhsIter.outer(); }
+    EIGEN_STRONG_INLINE Index row() const { return IsRowMajor ? m_lhsIter.outer() : m_id; }
+    EIGEN_STRONG_INLINE Index col() const { return IsRowMajor ? m_id : m_lhsIter.outer(); }
+
+    EIGEN_STRONG_INLINE operator bool() const { return m_id < m_innerSize; }
+
+   protected:
+    LhsIterator m_lhsIter;
+    const evaluator<RhsArg>& m_rhsEval;
+    const BinaryOp& m_functor;
+    Scalar m_value;
+    StorageIndex m_id;
+    StorageIndex m_innerSize;
+  };
+
+  enum {
+    CoeffReadCost = int(evaluator<LhsArg>::CoeffReadCost) + int(evaluator<RhsArg>::CoeffReadCost) +
+                    int(functor_traits<BinaryOp>::Cost),
+    Flags = XprType::Flags
+  };
+
+  explicit sparse_disjunction_evaluator(const XprType& xpr)
+      : m_functor(xpr.functor()), m_lhsImpl(xpr.lhs()), m_rhsImpl(xpr.rhs()), m_expr(xpr) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<BinaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+  inline Index nonZerosEstimate() const { return m_expr.size(); }
+
+ protected:
+  const BinaryOp m_functor;
+  evaluator<LhsArg> m_lhsImpl;
+  evaluator<RhsArg> m_rhsImpl;
+  const XprType& m_expr;
+};
+
+// when DupFunc is wrapped with scalar_dup_op, use disjunction evaluator
+template <typename T1, typename T2, typename DupFunc, typename Lhs, typename Rhs>
+struct binary_evaluator<CwiseBinaryOp<scalar_disjunction_op<DupFunc, T1, T2>, Lhs, Rhs>, IteratorBased, IteratorBased>
+    : sparse_disjunction_evaluator<CwiseBinaryOp<scalar_disjunction_op<DupFunc, T1, T2>, Lhs, Rhs> > {
+  typedef CwiseBinaryOp<scalar_disjunction_op<DupFunc, T1, T2>, Lhs, Rhs> XprType;
+  typedef sparse_disjunction_evaluator<XprType> Base;
+  explicit binary_evaluator(const XprType& xpr) : Base(xpr) {}
+};
+}  // namespace internal
 
 /***************************************************************************
-* Implementation of SparseMatrixBase and SparseCwise functions/operators
-***************************************************************************/
-
-template<typename Derived>
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
-SparseMatrixBase<Derived>::operator-=(const SparseMatrixBase<OtherDerived> &other)
-{
+ * Implementation of SparseMatrixBase and SparseCwise functions/operators
+ ***************************************************************************/
+
+template <typename Derived>
+template <typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator+=(const EigenBase<OtherDerived>& other) {
+  call_assignment(derived(), other.derived(), internal::add_assign_op<Scalar, typename OtherDerived::Scalar>());
+  return derived();
+}
+
+template <typename Derived>
+template <typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator-=(const EigenBase<OtherDerived>& other) {
+  call_assignment(derived(), other.derived(), internal::assign_op<Scalar, typename OtherDerived::Scalar>());
+  return derived();
+}
+
+template <typename Derived>
+template <typename OtherDerived>
+EIGEN_STRONG_INLINE Derived& SparseMatrixBase<Derived>::operator-=(const SparseMatrixBase<OtherDerived>& other) {
   return derived() = derived() - other.derived();
 }
 
-template<typename Derived>
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE Derived &
-SparseMatrixBase<Derived>::operator+=(const SparseMatrixBase<OtherDerived>& other)
-{
+template <typename Derived>
+template <typename OtherDerived>
+EIGEN_STRONG_INLINE Derived& SparseMatrixBase<Derived>::operator+=(const SparseMatrixBase<OtherDerived>& other) {
   return derived() = derived() + other.derived();
 }
 
-template<typename Derived>
-template<typename OtherDerived>
+template <typename Derived>
+template <typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator+=(const DiagonalBase<OtherDerived>& other) {
+  call_assignment_no_alias(derived(), other.derived(),
+                           internal::add_assign_op<Scalar, typename OtherDerived::Scalar>());
+  return derived();
+}
+
+template <typename Derived>
+template <typename OtherDerived>
+Derived& SparseMatrixBase<Derived>::operator-=(const DiagonalBase<OtherDerived>& other) {
+  call_assignment_no_alias(derived(), other.derived(),
+                           internal::sub_assign_op<Scalar, typename OtherDerived::Scalar>());
+  return derived();
+}
+
+template <typename Derived>
+template <typename OtherDerived>
 EIGEN_STRONG_INLINE const typename SparseMatrixBase<Derived>::template CwiseProductDenseReturnType<OtherDerived>::Type
-SparseMatrixBase<Derived>::cwiseProduct(const MatrixBase<OtherDerived> &other) const
-{
+SparseMatrixBase<Derived>::cwiseProduct(const MatrixBase<OtherDerived>& other) const {
   return typename CwiseProductDenseReturnType<OtherDerived>::Type(derived(), other.derived());
 }
 
-} // end namespace Eigen
+template <typename DenseDerived, typename SparseDerived>
+EIGEN_STRONG_INLINE const
+    CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar, typename SparseDerived::Scalar>,
+                  const DenseDerived, const SparseDerived>
+    operator+(const MatrixBase<DenseDerived>& a, const SparseMatrixBase<SparseDerived>& b) {
+  return CwiseBinaryOp<internal::scalar_sum_op<typename DenseDerived::Scalar, typename SparseDerived::Scalar>,
+                       const DenseDerived, const SparseDerived>(a.derived(), b.derived());
+}
+
+template <typename SparseDerived, typename DenseDerived>
+EIGEN_STRONG_INLINE const
+    CwiseBinaryOp<internal::scalar_sum_op<typename SparseDerived::Scalar, typename DenseDerived::Scalar>,
+                  const SparseDerived, const DenseDerived>
+    operator+(const SparseMatrixBase<SparseDerived>& a, const MatrixBase<DenseDerived>& b) {
+  return CwiseBinaryOp<internal::scalar_sum_op<typename SparseDerived::Scalar, typename DenseDerived::Scalar>,
+                       const SparseDerived, const DenseDerived>(a.derived(), b.derived());
+}
+
+template <typename DenseDerived, typename SparseDerived>
+EIGEN_STRONG_INLINE const
+    CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar, typename SparseDerived::Scalar>,
+                  const DenseDerived, const SparseDerived>
+    operator-(const MatrixBase<DenseDerived>& a, const SparseMatrixBase<SparseDerived>& b) {
+  return CwiseBinaryOp<internal::scalar_difference_op<typename DenseDerived::Scalar, typename SparseDerived::Scalar>,
+                       const DenseDerived, const SparseDerived>(a.derived(), b.derived());
+}
+
+template <typename SparseDerived, typename DenseDerived>
+EIGEN_STRONG_INLINE const
+    CwiseBinaryOp<internal::scalar_difference_op<typename SparseDerived::Scalar, typename DenseDerived::Scalar>,
+                  const SparseDerived, const DenseDerived>
+    operator-(const SparseMatrixBase<SparseDerived>& a, const MatrixBase<DenseDerived>& b) {
+  return CwiseBinaryOp<internal::scalar_difference_op<typename SparseDerived::Scalar, typename DenseDerived::Scalar>,
+                       const SparseDerived, const DenseDerived>(a.derived(), b.derived());
+}
+
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSE_CWISE_BINARY_OP_H
+#endif  // EIGEN_SPARSE_CWISE_BINARY_OP_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/inst/include/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
index 5a50c780..9fc1e66b 100644
--- a/inst/include/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
+++ b/inst/include/Eigen/src/SparseCore/SparseCwiseUnaryOp.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,154 +10,133 @@
 #ifndef EIGEN_SPARSE_CWISE_UNARY_OP_H
 #define EIGEN_SPARSE_CWISE_UNARY_OP_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-template<typename UnaryOp, typename MatrixType>
-class CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>
-  : public SparseMatrixBase<CwiseUnaryOp<UnaryOp, MatrixType> >
-{
-  public:
+namespace Eigen {
 
-    class InnerIterator;
-    class ReverseInnerIterator;
+namespace internal {
 
-    typedef CwiseUnaryOp<UnaryOp, MatrixType> Derived;
-    EIGEN_SPARSE_PUBLIC_INTERFACE(Derived)
+template <typename UnaryOp, typename ArgType>
+struct unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IteratorBased>
+    : public evaluator_base<CwiseUnaryOp<UnaryOp, ArgType> > {
+ public:
+  typedef CwiseUnaryOp<UnaryOp, ArgType> XprType;
 
-  protected:
-    typedef typename internal::traits<Derived>::_XprTypeNested _MatrixTypeNested;
-    typedef typename _MatrixTypeNested::InnerIterator MatrixTypeIterator;
-    typedef typename _MatrixTypeNested::ReverseInnerIterator MatrixTypeReverseIterator;
-};
+  class InnerIterator;
 
-template<typename UnaryOp, typename MatrixType>
-class CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::InnerIterator
-    : public CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeIterator
-{
-    typedef typename CwiseUnaryOpImpl::Scalar Scalar;
-    typedef typename CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeIterator Base;
-  public:
+  enum {
+    CoeffReadCost = int(evaluator<ArgType>::CoeffReadCost) + int(functor_traits<UnaryOp>::Cost),
+    Flags = XprType::Flags
+  };
 
-    EIGEN_STRONG_INLINE InnerIterator(const CwiseUnaryOpImpl& unaryOp, typename CwiseUnaryOpImpl::Index outer)
-      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
-    {}
+  explicit unary_evaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression()) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<UnaryOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
 
-    EIGEN_STRONG_INLINE InnerIterator& operator++()
-    { Base::operator++(); return *this; }
+  inline Index nonZerosEstimate() const { return m_argImpl.nonZerosEstimate(); }
 
-    EIGEN_STRONG_INLINE typename CwiseUnaryOpImpl::Scalar value() const { return m_functor(Base::value()); }
+ protected:
+  typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
 
-  protected:
-    const UnaryOp m_functor;
-  private:
-    typename CwiseUnaryOpImpl::Scalar& valueRef();
+  const UnaryOp m_functor;
+  evaluator<ArgType> m_argImpl;
 };
 
-template<typename UnaryOp, typename MatrixType>
-class CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::ReverseInnerIterator
-    : public CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeReverseIterator
-{
-    typedef typename CwiseUnaryOpImpl::Scalar Scalar;
-    typedef typename CwiseUnaryOpImpl<UnaryOp,MatrixType,Sparse>::MatrixTypeReverseIterator Base;
-  public:
-
-    EIGEN_STRONG_INLINE ReverseInnerIterator(const CwiseUnaryOpImpl& unaryOp, typename CwiseUnaryOpImpl::Index outer)
-      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
-    {}
+template <typename UnaryOp, typename ArgType>
+class unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IteratorBased>::InnerIterator
+    : public unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IteratorBased>::EvalIterator {
+ protected:
+  typedef typename XprType::Scalar Scalar;
+  typedef typename unary_evaluator<CwiseUnaryOp<UnaryOp, ArgType>, IteratorBased>::EvalIterator Base;
 
-    EIGEN_STRONG_INLINE ReverseInnerIterator& operator--()
-    { Base::operator--(); return *this; }
+ public:
+  EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& unaryOp, Index outer)
+      : Base(unaryOp.m_argImpl, outer), m_functor(unaryOp.m_functor) {}
 
-    EIGEN_STRONG_INLINE typename CwiseUnaryOpImpl::Scalar value() const { return m_functor(Base::value()); }
-
-  protected:
-    const UnaryOp m_functor;
-  private:
-    typename CwiseUnaryOpImpl::Scalar& valueRef();
-};
+  EIGEN_STRONG_INLINE InnerIterator& operator++() {
+    Base::operator++();
+    return *this;
+  }
 
-template<typename ViewOp, typename MatrixType>
-class CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>
-  : public SparseMatrixBase<CwiseUnaryView<ViewOp, MatrixType> >
-{
-  public:
+  EIGEN_STRONG_INLINE Scalar value() const { return m_functor(Base::value()); }
 
-    class InnerIterator;
-    class ReverseInnerIterator;
+ protected:
+  const UnaryOp m_functor;
 
-    typedef CwiseUnaryView<ViewOp, MatrixType> Derived;
-    EIGEN_SPARSE_PUBLIC_INTERFACE(Derived)
-
-  protected:
-    typedef typename internal::traits<Derived>::_MatrixTypeNested _MatrixTypeNested;
-    typedef typename _MatrixTypeNested::InnerIterator MatrixTypeIterator;
-    typedef typename _MatrixTypeNested::ReverseInnerIterator MatrixTypeReverseIterator;
+ private:
+  Scalar& valueRef();
 };
 
-template<typename ViewOp, typename MatrixType>
-class CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::InnerIterator
-    : public CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeIterator
-{
-    typedef typename CwiseUnaryViewImpl::Scalar Scalar;
-    typedef typename CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeIterator Base;
-  public:
+template <typename ViewOp, typename ArgType>
+struct unary_evaluator<CwiseUnaryView<ViewOp, ArgType>, IteratorBased>
+    : public evaluator_base<CwiseUnaryView<ViewOp, ArgType> > {
+ public:
+  typedef CwiseUnaryView<ViewOp, ArgType> XprType;
+
+  class InnerIterator;
 
-    EIGEN_STRONG_INLINE InnerIterator(const CwiseUnaryViewImpl& unaryOp, typename CwiseUnaryViewImpl::Index outer)
-      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
-    {}
+  enum {
+    CoeffReadCost = int(evaluator<ArgType>::CoeffReadCost) + int(functor_traits<ViewOp>::Cost),
+    Flags = XprType::Flags
+  };
 
-    EIGEN_STRONG_INLINE InnerIterator& operator++()
-    { Base::operator++(); return *this; }
+  explicit unary_evaluator(const XprType& op) : m_functor(op.functor()), m_argImpl(op.nestedExpression()) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(functor_traits<ViewOp>::Cost);
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
 
-    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar value() const { return m_functor(Base::value()); }
-    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar& valueRef() { return m_functor(Base::valueRef()); }
+ protected:
+  typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
 
-  protected:
-    const ViewOp m_functor;
+  const ViewOp m_functor;
+  evaluator<ArgType> m_argImpl;
 };
 
-template<typename ViewOp, typename MatrixType>
-class CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::ReverseInnerIterator
-    : public CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeReverseIterator
-{
-    typedef typename CwiseUnaryViewImpl::Scalar Scalar;
-    typedef typename CwiseUnaryViewImpl<ViewOp,MatrixType,Sparse>::MatrixTypeReverseIterator Base;
-  public:
+template <typename ViewOp, typename ArgType>
+class unary_evaluator<CwiseUnaryView<ViewOp, ArgType>, IteratorBased>::InnerIterator
+    : public unary_evaluator<CwiseUnaryView<ViewOp, ArgType>, IteratorBased>::EvalIterator {
+ protected:
+  typedef typename XprType::Scalar Scalar;
+  typedef typename unary_evaluator<CwiseUnaryView<ViewOp, ArgType>, IteratorBased>::EvalIterator Base;
 
-    EIGEN_STRONG_INLINE ReverseInnerIterator(const CwiseUnaryViewImpl& unaryOp, typename CwiseUnaryViewImpl::Index outer)
-      : Base(unaryOp.derived().nestedExpression(),outer), m_functor(unaryOp.derived().functor())
-    {}
+ public:
+  EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& unaryOp, Index outer)
+      : Base(unaryOp.m_argImpl, outer), m_functor(unaryOp.m_functor) {}
 
-    EIGEN_STRONG_INLINE ReverseInnerIterator& operator--()
-    { Base::operator--(); return *this; }
+  EIGEN_STRONG_INLINE InnerIterator& operator++() {
+    Base::operator++();
+    return *this;
+  }
 
-    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar value() const { return m_functor(Base::value()); }
-    EIGEN_STRONG_INLINE typename CwiseUnaryViewImpl::Scalar& valueRef() { return m_functor(Base::valueRef()); }
+  EIGEN_STRONG_INLINE Scalar value() const { return m_functor(Base::value()); }
+  EIGEN_STRONG_INLINE Scalar& valueRef() { return m_functor(Base::valueRef()); }
 
-  protected:
-    const ViewOp m_functor;
+ protected:
+  const ViewOp m_functor;
 };
 
-template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
-SparseMatrixBase<Derived>::operator*=(const Scalar& other)
-{
-  for (Index j=0; j<outerSize(); ++j)
-    for (typename Derived::InnerIterator i(derived(),j); i; ++i)
-      i.valueRef() *= other;
+}  // end namespace internal
+
+template <typename Derived>
+EIGEN_STRONG_INLINE Derived& SparseMatrixBase<Derived>::operator*=(const Scalar& other) {
+  typedef typename internal::evaluator<Derived>::InnerIterator EvalIterator;
+  internal::evaluator<Derived> thisEval(derived());
+  for (Index j = 0; j < outerSize(); ++j)
+    for (EvalIterator i(thisEval, j); i; ++i) i.valueRef() *= other;
   return derived();
 }
 
-template<typename Derived>
-EIGEN_STRONG_INLINE Derived&
-SparseMatrixBase<Derived>::operator/=(const Scalar& other)
-{
-  for (Index j=0; j<outerSize(); ++j)
-    for (typename Derived::InnerIterator i(derived(),j); i; ++i)
-      i.valueRef() /= other;
+template <typename Derived>
+EIGEN_STRONG_INLINE Derived& SparseMatrixBase<Derived>::operator/=(const Scalar& other) {
+  typedef typename internal::evaluator<Derived>::InnerIterator EvalIterator;
+  internal::evaluator<Derived> thisEval(derived());
+  for (Index j = 0; j < outerSize(); ++j)
+    for (EvalIterator i(thisEval, j); i; ++i) i.valueRef() /= other;
   return derived();
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSE_CWISE_UNARY_OP_H
+#endif  // EIGEN_SPARSE_CWISE_UNARY_OP_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseDenseProduct.h b/inst/include/Eigen/src/SparseCore/SparseDenseProduct.h
index ccb6ae7b..17ce596a 100644
--- a/inst/include/Eigen/src/SparseCore/SparseDenseProduct.h
+++ b/inst/include/Eigen/src/SparseCore/SparseDenseProduct.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,302 +10,305 @@
 #ifndef EIGEN_SPARSEDENSEPRODUCT_H
 #define EIGEN_SPARSEDENSEPRODUCT_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-template<typename Lhs, typename Rhs, int InnerSize> struct SparseDenseProductReturnType
-{
-  typedef SparseTimeDenseProduct<Lhs,Rhs> Type;
-};
-
-template<typename Lhs, typename Rhs> struct SparseDenseProductReturnType<Lhs,Rhs,1>
-{
-  typedef typename internal::conditional<
-    Lhs::IsRowMajor,
-    SparseDenseOuterProduct<Rhs,Lhs,true>,
-    SparseDenseOuterProduct<Lhs,Rhs,false> >::type Type;
-};
-
-template<typename Lhs, typename Rhs, int InnerSize> struct DenseSparseProductReturnType
-{
-  typedef DenseTimeSparseProduct<Lhs,Rhs> Type;
-};
-
-template<typename Lhs, typename Rhs> struct DenseSparseProductReturnType<Lhs,Rhs,1>
-{
-  typedef typename internal::conditional<
-    Rhs::IsRowMajor,
-    SparseDenseOuterProduct<Rhs,Lhs,true>,
-    SparseDenseOuterProduct<Lhs,Rhs,false> >::type Type;
-};
+namespace Eigen {
 
 namespace internal {
 
-template<typename Lhs, typename Rhs, bool Tr>
-struct traits<SparseDenseOuterProduct<Lhs,Rhs,Tr> >
-{
-  typedef Sparse StorageKind;
-  typedef typename scalar_product_traits<typename traits<Lhs>::Scalar,
-                                         typename traits<Rhs>::Scalar>::ReturnType Scalar;
-  typedef typename Lhs::Index Index;
-  typedef typename Lhs::Nested LhsNested;
-  typedef typename Rhs::Nested RhsNested;
-  typedef typename remove_all<LhsNested>::type _LhsNested;
-  typedef typename remove_all<RhsNested>::type _RhsNested;
-
-  enum {
-    LhsCoeffReadCost = traits<_LhsNested>::CoeffReadCost,
-    RhsCoeffReadCost = traits<_RhsNested>::CoeffReadCost,
-
-    RowsAtCompileTime    = Tr ? int(traits<Rhs>::RowsAtCompileTime)     : int(traits<Lhs>::RowsAtCompileTime),
-    ColsAtCompileTime    = Tr ? int(traits<Lhs>::ColsAtCompileTime)     : int(traits<Rhs>::ColsAtCompileTime),
-    MaxRowsAtCompileTime = Tr ? int(traits<Rhs>::MaxRowsAtCompileTime)  : int(traits<Lhs>::MaxRowsAtCompileTime),
-    MaxColsAtCompileTime = Tr ? int(traits<Lhs>::MaxColsAtCompileTime)  : int(traits<Rhs>::MaxColsAtCompileTime),
-
-    Flags = Tr ? RowMajorBit : 0,
-
-    CoeffReadCost = LhsCoeffReadCost + RhsCoeffReadCost + NumTraits<Scalar>::MulCost
-  };
-};
-
-} // end namespace internal
-
-template<typename Lhs, typename Rhs, bool Tr>
-class SparseDenseOuterProduct
- : public SparseMatrixBase<SparseDenseOuterProduct<Lhs,Rhs,Tr> >
-{
-  public:
-
-    typedef SparseMatrixBase<SparseDenseOuterProduct> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(SparseDenseOuterProduct)
-    typedef internal::traits<SparseDenseOuterProduct> Traits;
-
-  private:
-
-    typedef typename Traits::LhsNested LhsNested;
-    typedef typename Traits::RhsNested RhsNested;
-    typedef typename Traits::_LhsNested _LhsNested;
-    typedef typename Traits::_RhsNested _RhsNested;
-
-  public:
-
-    class InnerIterator;
-
-    EIGEN_STRONG_INLINE SparseDenseOuterProduct(const Lhs& lhs, const Rhs& rhs)
-      : m_lhs(lhs), m_rhs(rhs)
-    {
-      EIGEN_STATIC_ASSERT(!Tr,YOU_MADE_A_PROGRAMMING_MISTAKE);
-    }
-
-    EIGEN_STRONG_INLINE SparseDenseOuterProduct(const Rhs& rhs, const Lhs& lhs)
-      : m_lhs(lhs), m_rhs(rhs)
-    {
-      EIGEN_STATIC_ASSERT(Tr,YOU_MADE_A_PROGRAMMING_MISTAKE);
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return Tr ? m_rhs.rows() : m_lhs.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return Tr ? m_lhs.cols() : m_rhs.cols(); }
-
-    EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; }
-    EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; }
-
-  protected:
-    LhsNested m_lhs;
-    RhsNested m_rhs;
+template <>
+struct product_promote_storage_type<Sparse, Dense, OuterProduct> {
+  typedef Sparse ret;
 };
-
-template<typename Lhs, typename Rhs, bool Transpose>
-class SparseDenseOuterProduct<Lhs,Rhs,Transpose>::InnerIterator : public _LhsNested::InnerIterator
-{
-    typedef typename _LhsNested::InnerIterator Base;
-    typedef typename SparseDenseOuterProduct::Index Index;
-  public:
-    EIGEN_STRONG_INLINE InnerIterator(const SparseDenseOuterProduct& prod, Index outer)
-      : Base(prod.lhs(), 0), m_outer(outer), m_factor(get(prod.rhs(), outer, typename internal::traits<Rhs>::StorageKind() ))
-    { }
-
-    inline Index outer() const { return m_outer; }
-    inline Index row() const { return Transpose ? m_outer : Base::index(); }
-    inline Index col() const { return Transpose ? Base::index() : m_outer; }
-
-    inline Scalar value() const { return Base::value() * m_factor; }
-
-  protected:
-    static Scalar get(const _RhsNested &rhs, Index outer, Dense = Dense())
-    {
-      return rhs.coeff(outer);
-    }
-    
-    static Scalar get(const _RhsNested &rhs, Index outer, Sparse = Sparse())
-    {
-      typename Traits::_RhsNested::InnerIterator it(rhs, outer);
-      if (it && it.index()==0)
-        return it.value();
-      
-      return Scalar(0);
-    }
-    
-    Index m_outer;
-    Scalar m_factor;
+template <>
+struct product_promote_storage_type<Dense, Sparse, OuterProduct> {
+  typedef Sparse ret;
 };
 
-namespace internal {
-template<typename Lhs, typename Rhs>
-struct traits<SparseTimeDenseProduct<Lhs,Rhs> >
- : traits<ProductBase<SparseTimeDenseProduct<Lhs,Rhs>, Lhs, Rhs> >
-{
-  typedef Dense StorageKind;
-  typedef MatrixXpr XprKind;
-};
-
-template<typename SparseLhsType, typename DenseRhsType, typename DenseResType,
-         int LhsStorageOrder = ((SparseLhsType::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor,
-         bool ColPerCol = ((DenseRhsType::Flags&RowMajorBit)==0) || DenseRhsType::ColsAtCompileTime==1>
+template <typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType,
+          int LhsStorageOrder = ((SparseLhsType::Flags & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor,
+          bool ColPerCol = ((DenseRhsType::Flags & RowMajorBit) == 0) || DenseRhsType::ColsAtCompileTime == 1>
 struct sparse_time_dense_product_impl;
 
-template<typename SparseLhsType, typename DenseRhsType, typename DenseResType>
-struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, RowMajor, true>
-{
-  typedef typename internal::remove_all<SparseLhsType>::type Lhs;
-  typedef typename internal::remove_all<DenseRhsType>::type Rhs;
-  typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef typename Lhs::Index Index;
-  typedef typename Lhs::InnerIterator LhsInnerIterator;
-  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
-  {
-    for(Index c=0; c<rhs.cols(); ++c)
-    {
-      Index n = lhs.outerSize();
-      for(Index j=0; j<n; ++j)
+template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
+struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
+                                      RowMajor, true> {
+  typedef internal::remove_all_t<SparseLhsType> Lhs;
+  typedef internal::remove_all_t<DenseRhsType> Rhs;
+  typedef internal::remove_all_t<DenseResType> Res;
+  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
+  typedef evaluator<Lhs> LhsEval;
+  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
+                  const typename Res::Scalar& alpha) {
+    LhsEval lhsEval(lhs);
+
+    Index n = lhs.outerSize();
+#ifdef EIGEN_HAS_OPENMP
+    Index threads = Eigen::nbThreads();
+#endif
+
+    for (Index c = 0; c < rhs.cols(); ++c) {
+#ifdef EIGEN_HAS_OPENMP
+      // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
+      // It basically represents the minimal amount of work to be done to be worth it.
+      if (threads > 1 && lhsEval.nonZerosEstimate() > 20000) {
+#pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
+        for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
+      } else
+#endif
       {
-        typename Res::Scalar tmp(0);
-        for(LhsInnerIterator it(lhs,j); it ;++it)
-          tmp += it.value() * rhs.coeff(it.index(),c);
-        res.coeffRef(j,c) += alpha * tmp;
+        for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i, c);
       }
     }
   }
+
+  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, DenseResType& res,
+                         const typename Res::Scalar& alpha, Index i, Index col) {
+    // Two accumulators, which breaks the dependency chain on the accumulator
+    // and allows more instruction-level parallelism in the following loop
+    typename Res::Scalar tmp_a(0);
+    typename Res::Scalar tmp_b(0);
+    for (LhsInnerIterator it(lhsEval, i); it; ++it) {
+      tmp_a += it.value() * rhs.coeff(it.index(), col);
+      ++it;
+      if (it) {
+        tmp_b += it.value() * rhs.coeff(it.index(), col);
+      }
+    }
+    res.coeffRef(i, col) += alpha * (tmp_a + tmp_b);
+  }
 };
 
-template<typename SparseLhsType, typename DenseRhsType, typename DenseResType>
-struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, ColMajor, true>
-{
-  typedef typename internal::remove_all<SparseLhsType>::type Lhs;
-  typedef typename internal::remove_all<DenseRhsType>::type Rhs;
-  typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef typename Lhs::InnerIterator LhsInnerIterator;
-  typedef typename Lhs::Index Index;
-  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
-  {
-    for(Index c=0; c<rhs.cols(); ++c)
-    {
-      for(Index j=0; j<lhs.outerSize(); ++j)
-      {
-        typename Res::Scalar rhs_j = alpha * rhs.coeff(j,c);
-        for(LhsInnerIterator it(lhs,j); it ;++it)
-          res.coeffRef(it.index(),c) += it.value() * rhs_j;
+// FIXME: what is the purpose of the following specialization? Is it for the BlockedSparse format?
+// -> let's disable it for now as it is conflicting with generic scalar*matrix and matrix*scalar operators
+// template<typename T1, typename T2/*, int Options_, typename StrideType_*/>
+// struct ScalarBinaryOpTraits<T1, Ref<T2/*, Options_, StrideType_*/> >
+// {
+//   enum {
+//     Defined = 1
+//   };
+//   typedef typename CwiseUnaryOp<scalar_multiple2_op<T1, typename T2::Scalar>, T2>::PlainObject ReturnType;
+// };
+
+template <typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType>
+struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, AlphaType, ColMajor, true> {
+  typedef internal::remove_all_t<SparseLhsType> Lhs;
+  typedef internal::remove_all_t<DenseRhsType> Rhs;
+  typedef internal::remove_all_t<DenseResType> Res;
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename LhsEval::InnerIterator LhsInnerIterator;
+  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha) {
+    LhsEval lhsEval(lhs);
+    for (Index c = 0; c < rhs.cols(); ++c) {
+      for (Index j = 0; j < lhs.outerSize(); ++j) {
+        //        typename Res::Scalar rhs_j = alpha * rhs.coeff(j,c);
+        typename ScalarBinaryOpTraits<AlphaType, typename Rhs::Scalar>::ReturnType rhs_j(alpha * rhs.coeff(j, c));
+        for (LhsInnerIterator it(lhsEval, j); it; ++it) res.coeffRef(it.index(), c) += it.value() * rhs_j;
       }
     }
   }
 };
 
-template<typename SparseLhsType, typename DenseRhsType, typename DenseResType>
-struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, RowMajor, false>
-{
-  typedef typename internal::remove_all<SparseLhsType>::type Lhs;
-  typedef typename internal::remove_all<DenseRhsType>::type Rhs;
-  typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef typename Lhs::InnerIterator LhsInnerIterator;
-  typedef typename Lhs::Index Index;
-  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
-  {
-    for(Index j=0; j<lhs.outerSize(); ++j)
+template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
+struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
+                                      RowMajor, false> {
+  typedef internal::remove_all_t<SparseLhsType> Lhs;
+  typedef internal::remove_all_t<DenseRhsType> Rhs;
+  typedef internal::remove_all_t<DenseResType> Res;
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename LhsEval::InnerIterator LhsInnerIterator;
+  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
+                  const typename Res::Scalar& alpha) {
+    Index n = lhs.rows();
+    LhsEval lhsEval(lhs);
+
+#ifdef EIGEN_HAS_OPENMP
+    Index threads = Eigen::nbThreads();
+    // This 20000 threshold has been found experimentally on 2D and 3D Poisson problems.
+    // It basically represents the minimal amount of work to be done to be worth it.
+    if (threads > 1 && lhsEval.nonZerosEstimate() * rhs.cols() > 20000) {
+#pragma omp parallel for schedule(dynamic, (n + threads * 4 - 1) / (threads * 4)) num_threads(threads)
+      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i);
+    } else
+#endif
     {
-      typename Res::RowXpr res_j(res.row(j));
-      for(LhsInnerIterator it(lhs,j); it ;++it)
-        res_j += (alpha*it.value()) * rhs.row(it.index());
+      for (Index i = 0; i < n; ++i) processRow(lhsEval, rhs, res, alpha, i);
     }
   }
+
+  static void processRow(const LhsEval& lhsEval, const DenseRhsType& rhs, Res& res, const typename Res::Scalar& alpha,
+                         Index i) {
+    typename Res::RowXpr res_i(res.row(i));
+    for (LhsInnerIterator it(lhsEval, i); it; ++it) res_i += (alpha * it.value()) * rhs.row(it.index());
+  }
 };
 
-template<typename SparseLhsType, typename DenseRhsType, typename DenseResType>
-struct sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType, ColMajor, false>
-{
-  typedef typename internal::remove_all<SparseLhsType>::type Lhs;
-  typedef typename internal::remove_all<DenseRhsType>::type Rhs;
-  typedef typename internal::remove_all<DenseResType>::type Res;
-  typedef typename Lhs::InnerIterator LhsInnerIterator;
-  typedef typename Lhs::Index Index;
-  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const typename Res::Scalar& alpha)
-  {
-    for(Index j=0; j<lhs.outerSize(); ++j)
-    {
+template <typename SparseLhsType, typename DenseRhsType, typename DenseResType>
+struct sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, typename DenseResType::Scalar,
+                                      ColMajor, false> {
+  typedef internal::remove_all_t<SparseLhsType> Lhs;
+  typedef internal::remove_all_t<DenseRhsType> Rhs;
+  typedef internal::remove_all_t<DenseResType> Res;
+  typedef typename evaluator<Lhs>::InnerIterator LhsInnerIterator;
+  static void run(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
+                  const typename Res::Scalar& alpha) {
+    evaluator<Lhs> lhsEval(lhs);
+    for (Index j = 0; j < lhs.outerSize(); ++j) {
       typename Rhs::ConstRowXpr rhs_j(rhs.row(j));
-      for(LhsInnerIterator it(lhs,j); it ;++it)
-        res.row(it.index()) += (alpha*it.value()) * rhs_j;
+      for (LhsInnerIterator it(lhsEval, j); it; ++it) res.row(it.index()) += (alpha * it.value()) * rhs_j;
     }
   }
 };
 
-template<typename SparseLhsType, typename DenseRhsType, typename DenseResType,typename AlphaType>
-inline void sparse_time_dense_product(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res, const AlphaType& alpha)
-{
-  sparse_time_dense_product_impl<SparseLhsType,DenseRhsType,DenseResType>::run(lhs, rhs, res, alpha);
+template <typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType>
+inline void sparse_time_dense_product(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
+                                      const AlphaType& alpha) {
+  sparse_time_dense_product_impl<SparseLhsType, DenseRhsType, DenseResType, AlphaType>::run(lhs, rhs, res, alpha);
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-template<typename Lhs, typename Rhs>
-class SparseTimeDenseProduct
-  : public ProductBase<SparseTimeDenseProduct<Lhs,Rhs>, Lhs, Rhs>
-{
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(SparseTimeDenseProduct)
+namespace internal {
 
-    SparseTimeDenseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {}
+template <typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseShape, DenseShape, ProductType>
+    : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, SparseShape, DenseShape, ProductType> > {
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+
+  template <typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
+    typedef typename nested_eval<Lhs, ((Rhs::Flags & RowMajorBit) == 0) ? 1 : Rhs::ColsAtCompileTime>::type LhsNested;
+    typedef typename nested_eval<Rhs, ((Lhs::Flags & RowMajorBit) == 0) ? 1 : Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhs);
+    RhsNested rhsNested(rhs);
+    internal::sparse_time_dense_product(lhsNested, rhsNested, dst, alpha);
+  }
+};
 
-    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
-    {
-      internal::sparse_time_dense_product(m_lhs, m_rhs, dest, alpha);
+template <typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseTriangularShape, DenseShape, ProductType>
+    : generic_product_impl<Lhs, Rhs, SparseShape, DenseShape, ProductType> {};
+
+template <typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, DenseShape, SparseShape, ProductType>
+    : generic_product_impl_base<Lhs, Rhs, generic_product_impl<Lhs, Rhs, DenseShape, SparseShape, ProductType> > {
+  typedef typename Product<Lhs, Rhs>::Scalar Scalar;
+
+  template <typename Dst>
+  static void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) {
+    typedef typename nested_eval<Lhs, ((Rhs::Flags & RowMajorBit) == 0) ? Dynamic : 1>::type LhsNested;
+    typedef typename nested_eval<Rhs, ((Lhs::Flags & RowMajorBit) == RowMajorBit) ? 1 : Lhs::RowsAtCompileTime>::type
+        RhsNested;
+    LhsNested lhsNested(lhs);
+    RhsNested rhsNested(rhs);
+
+    // transpose everything
+    Transpose<Dst> dstT(dst);
+    internal::sparse_time_dense_product(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha);
+  }
+};
+
+template <typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, DenseShape, SparseTriangularShape, ProductType>
+    : generic_product_impl<Lhs, Rhs, DenseShape, SparseShape, ProductType> {};
+
+template <typename LhsT, typename RhsT, bool NeedToTranspose>
+struct sparse_dense_outer_product_evaluator {
+ protected:
+  typedef std::conditional_t<NeedToTranspose, RhsT, LhsT> Lhs1;
+  typedef std::conditional_t<NeedToTranspose, LhsT, RhsT> ActualRhs;
+  typedef Product<LhsT, RhsT, DefaultProduct> ProdXprType;
+
+  // if the actual left-hand side is a dense vector,
+  // then build a sparse-view so that we can seamlessly iterate over it.
+  typedef std::conditional_t<is_same<typename internal::traits<Lhs1>::StorageKind, Sparse>::value, Lhs1,
+                             SparseView<Lhs1> >
+      ActualLhs;
+  typedef std::conditional_t<is_same<typename internal::traits<Lhs1>::StorageKind, Sparse>::value, Lhs1 const&,
+                             SparseView<Lhs1> >
+      LhsArg;
+
+  typedef evaluator<ActualLhs> LhsEval;
+  typedef evaluator<ActualRhs> RhsEval;
+  typedef typename evaluator<ActualLhs>::InnerIterator LhsIterator;
+  typedef typename ProdXprType::Scalar Scalar;
+
+ public:
+  enum { Flags = NeedToTranspose ? RowMajorBit : 0, CoeffReadCost = HugeCost };
+
+  class InnerIterator : public LhsIterator {
+   public:
+    InnerIterator(const sparse_dense_outer_product_evaluator& xprEval, Index outer)
+        : LhsIterator(xprEval.m_lhsXprImpl, 0),
+          m_outer(outer),
+          m_empty(false),
+          m_factor(get(xprEval.m_rhsXprImpl, outer, typename internal::traits<ActualRhs>::StorageKind())) {}
+
+    EIGEN_STRONG_INLINE Index outer() const { return m_outer; }
+    EIGEN_STRONG_INLINE Index row() const { return NeedToTranspose ? m_outer : LhsIterator::index(); }
+    EIGEN_STRONG_INLINE Index col() const { return NeedToTranspose ? LhsIterator::index() : m_outer; }
+
+    EIGEN_STRONG_INLINE Scalar value() const { return LhsIterator::value() * m_factor; }
+    EIGEN_STRONG_INLINE operator bool() const { return LhsIterator::operator bool() && (!m_empty); }
+
+   protected:
+    Scalar get(const RhsEval& rhs, Index outer, Dense = Dense()) const { return rhs.coeff(outer); }
+
+    Scalar get(const RhsEval& rhs, Index outer, Sparse = Sparse()) {
+      typename RhsEval::InnerIterator it(rhs, outer);
+      if (it && it.index() == 0 && it.value() != Scalar(0)) return it.value();
+      m_empty = true;
+      return Scalar(0);
     }
 
-  private:
-    SparseTimeDenseProduct& operator=(const SparseTimeDenseProduct&);
-};
+    Index m_outer;
+    bool m_empty;
+    Scalar m_factor;
+  };
 
+  sparse_dense_outer_product_evaluator(const Lhs1& lhs, const ActualRhs& rhs)
+      : m_lhs(lhs), m_lhsXprImpl(m_lhs), m_rhsXprImpl(rhs) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
 
-// dense = dense * sparse
-namespace internal {
-template<typename Lhs, typename Rhs>
-struct traits<DenseTimeSparseProduct<Lhs,Rhs> >
- : traits<ProductBase<DenseTimeSparseProduct<Lhs,Rhs>, Lhs, Rhs> >
-{
-  typedef Dense StorageKind;
+  // transpose case
+  sparse_dense_outer_product_evaluator(const ActualRhs& rhs, const Lhs1& lhs)
+      : m_lhs(lhs), m_lhsXprImpl(m_lhs), m_rhsXprImpl(rhs) {
+    EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost);
+  }
+
+ protected:
+  const LhsArg m_lhs;
+  evaluator<ActualLhs> m_lhsXprImpl;
+  evaluator<ActualRhs> m_rhsXprImpl;
 };
-} // end namespace internal
 
-template<typename Lhs, typename Rhs>
-class DenseTimeSparseProduct
-  : public ProductBase<DenseTimeSparseProduct<Lhs,Rhs>, Lhs, Rhs>
-{
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(DenseTimeSparseProduct)
+// sparse * dense outer product
+template <typename Lhs, typename Rhs>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, OuterProduct, SparseShape, DenseShape>
+    : sparse_dense_outer_product_evaluator<Lhs, Rhs, Lhs::IsRowMajor> {
+  typedef sparse_dense_outer_product_evaluator<Lhs, Rhs, Lhs::IsRowMajor> Base;
 
-    DenseTimeSparseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {}
+  typedef Product<Lhs, Rhs> XprType;
+  typedef typename XprType::PlainObject PlainObject;
 
-    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
-    {
-      Transpose<const _LhsNested> lhs_t(m_lhs);
-      Transpose<const _RhsNested> rhs_t(m_rhs);
-      Transpose<Dest> dest_t(dest);
-      internal::sparse_time_dense_product(rhs_t, lhs_t, dest_t, alpha);
-    }
+  explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs()) {}
+};
+
+template <typename Lhs, typename Rhs>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, OuterProduct, DenseShape, SparseShape>
+    : sparse_dense_outer_product_evaluator<Lhs, Rhs, Rhs::IsRowMajor> {
+  typedef sparse_dense_outer_product_evaluator<Lhs, Rhs, Rhs::IsRowMajor> Base;
 
-  private:
-    DenseTimeSparseProduct& operator=(const DenseTimeSparseProduct&);
+  typedef Product<Lhs, Rhs> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+
+  explicit product_evaluator(const XprType& xpr) : Base(xpr.lhs(), xpr.rhs()) {}
 };
 
-} // end namespace Eigen
+}  // end namespace internal
+
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSEDENSEPRODUCT_H
+#endif  // EIGEN_SPARSEDENSEPRODUCT_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseDiagonalProduct.h b/inst/include/Eigen/src/SparseCore/SparseDiagonalProduct.h
index 1bb590e6..1f72a6b3 100644
--- a/inst/include/Eigen/src/SparseCore/SparseDiagonalProduct.h
+++ b/inst/include/Eigen/src/SparseCore/SparseDiagonalProduct.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,7 +10,10 @@
 #ifndef EIGEN_SPARSE_DIAGONAL_PRODUCT_H
 #define EIGEN_SPARSE_DIAGONAL_PRODUCT_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 // The product of a diagonal matrix with a sparse matrix can be easily
 // implemented using expression template.
@@ -26,171 +29,110 @@ namespace Eigen {
 
 namespace internal {
 
-template<typename Lhs, typename Rhs>
-struct traits<SparseDiagonalProduct<Lhs, Rhs> >
-{
-  typedef typename remove_all<Lhs>::type _Lhs;
-  typedef typename remove_all<Rhs>::type _Rhs;
-  typedef typename _Lhs::Scalar Scalar;
-  typedef typename promote_index_type<typename traits<Lhs>::Index,
-                                         typename traits<Rhs>::Index>::type Index;
-  typedef Sparse StorageKind;
-  typedef MatrixXpr XprKind;
-  enum {
-    RowsAtCompileTime = _Lhs::RowsAtCompileTime,
-    ColsAtCompileTime = _Rhs::ColsAtCompileTime,
-
-    MaxRowsAtCompileTime = _Lhs::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = _Rhs::MaxColsAtCompileTime,
-
-    SparseFlags = is_diagonal<_Lhs>::ret ? int(_Rhs::Flags) : int(_Lhs::Flags),
-    Flags = (SparseFlags&RowMajorBit),
-    CoeffReadCost = Dynamic
-  };
-};
+enum { SDP_AsScalarProduct, SDP_AsCwiseProduct };
 
-enum {SDP_IsDiagonal, SDP_IsSparseRowMajor, SDP_IsSparseColMajor};
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType, int RhsMode, int LhsMode>
-class sparse_diagonal_product_inner_iterator_selector;
-
-} // end namespace internal
-
-template<typename Lhs, typename Rhs>
-class SparseDiagonalProduct
-  : public SparseMatrixBase<SparseDiagonalProduct<Lhs,Rhs> >,
-    internal::no_assignment_operator
-{
-    typedef typename Lhs::Nested LhsNested;
-    typedef typename Rhs::Nested RhsNested;
-
-    typedef typename internal::remove_all<LhsNested>::type _LhsNested;
-    typedef typename internal::remove_all<RhsNested>::type _RhsNested;
-
-    enum {
-      LhsMode = internal::is_diagonal<_LhsNested>::ret ? internal::SDP_IsDiagonal
-              : (_LhsNested::Flags&RowMajorBit) ? internal::SDP_IsSparseRowMajor : internal::SDP_IsSparseColMajor,
-      RhsMode = internal::is_diagonal<_RhsNested>::ret ? internal::SDP_IsDiagonal
-              : (_RhsNested::Flags&RowMajorBit) ? internal::SDP_IsSparseRowMajor : internal::SDP_IsSparseColMajor
-    };
-
-  public:
-
-    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseDiagonalProduct)
-
-    typedef internal::sparse_diagonal_product_inner_iterator_selector
-                      <_LhsNested,_RhsNested,SparseDiagonalProduct,LhsMode,RhsMode> InnerIterator;
-    
-    // We do not want ReverseInnerIterator for diagonal-sparse products,
-    // but this dummy declaration is needed to make diag * sparse * diag compile.
-    class ReverseInnerIterator;
-
-    EIGEN_STRONG_INLINE SparseDiagonalProduct(const Lhs& lhs, const Rhs& rhs)
-      : m_lhs(lhs), m_rhs(rhs)
-    {
-      eigen_assert(lhs.cols() == rhs.rows() && "invalid sparse matrix * diagonal matrix product");
-    }
+template <typename SparseXprType, typename DiagonalCoeffType, int SDP_Tag>
+struct sparse_diagonal_product_evaluator;
 
-    EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
+template <typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, ProductTag, DiagonalShape, SparseShape>
+    : public sparse_diagonal_product_evaluator<Rhs, typename Lhs::DiagonalVectorType,
+                                               Rhs::Flags & RowMajorBit ? SDP_AsScalarProduct : SDP_AsCwiseProduct> {
+  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
+  enum { CoeffReadCost = HugeCost, Flags = Rhs::Flags & RowMajorBit, Alignment = 0 };  // FIXME CoeffReadCost & Flags
 
-    EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; }
-    EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; }
+  typedef sparse_diagonal_product_evaluator<Rhs, typename Lhs::DiagonalVectorType,
+                                            Rhs::Flags & RowMajorBit ? SDP_AsScalarProduct : SDP_AsCwiseProduct>
+      Base;
+  explicit product_evaluator(const XprType &xpr) : Base(xpr.rhs(), xpr.lhs().diagonal()) {}
+};
 
-  protected:
-    LhsNested m_lhs;
-    RhsNested m_rhs;
+template <typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, DefaultProduct>, ProductTag, SparseShape, DiagonalShape>
+    : public sparse_diagonal_product_evaluator<Lhs, Transpose<const typename Rhs::DiagonalVectorType>,
+                                               Lhs::Flags & RowMajorBit ? SDP_AsCwiseProduct : SDP_AsScalarProduct> {
+  typedef Product<Lhs, Rhs, DefaultProduct> XprType;
+  enum { CoeffReadCost = HugeCost, Flags = Lhs::Flags & RowMajorBit, Alignment = 0 };  // FIXME CoeffReadCost & Flags
+
+  typedef sparse_diagonal_product_evaluator<Lhs, Transpose<const typename Rhs::DiagonalVectorType>,
+                                            Lhs::Flags & RowMajorBit ? SDP_AsCwiseProduct : SDP_AsScalarProduct>
+      Base;
+  explicit product_evaluator(const XprType &xpr) : Base(xpr.lhs(), xpr.rhs().diagonal().transpose()) {}
 };
 
-namespace internal {
+template <typename SparseXprType, typename DiagonalCoeffType>
+struct sparse_diagonal_product_evaluator<SparseXprType, DiagonalCoeffType, SDP_AsScalarProduct> {
+ protected:
+  typedef typename evaluator<SparseXprType>::InnerIterator SparseXprInnerIterator;
+  typedef typename SparseXprType::Scalar Scalar;
 
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
-class sparse_diagonal_product_inner_iterator_selector
-<Lhs,Rhs,SparseDiagonalProductType,SDP_IsDiagonal,SDP_IsSparseRowMajor>
-  : public CwiseUnaryOp<scalar_multiple_op<typename Lhs::Scalar>,const Rhs>::InnerIterator
-{
-    typedef typename CwiseUnaryOp<scalar_multiple_op<typename Lhs::Scalar>,const Rhs>::InnerIterator Base;
-    typedef typename Lhs::Index Index;
-  public:
-    inline sparse_diagonal_product_inner_iterator_selector(
-              const SparseDiagonalProductType& expr, Index outer)
-      : Base(expr.rhs()*(expr.lhs().diagonal().coeff(outer)), outer)
-    {}
-};
+ public:
+  class InnerIterator : public SparseXprInnerIterator {
+   public:
+    InnerIterator(const sparse_diagonal_product_evaluator &xprEval, Index outer)
+        : SparseXprInnerIterator(xprEval.m_sparseXprImpl, outer), m_coeff(xprEval.m_diagCoeffImpl.coeff(outer)) {}
 
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
-class sparse_diagonal_product_inner_iterator_selector
-<Lhs,Rhs,SparseDiagonalProductType,SDP_IsDiagonal,SDP_IsSparseColMajor>
-  : public CwiseBinaryOp<
-      scalar_product_op<typename Lhs::Scalar>,
-      const typename Rhs::ConstInnerVectorReturnType,
-      const typename Lhs::DiagonalVectorType>::InnerIterator
-{
-    typedef typename CwiseBinaryOp<
-      scalar_product_op<typename Lhs::Scalar>,
-      const typename Rhs::ConstInnerVectorReturnType,
-      const typename Lhs::DiagonalVectorType>::InnerIterator Base;
-    typedef typename Lhs::Index Index;
-    Index m_outer;
-  public:
-    inline sparse_diagonal_product_inner_iterator_selector(
-              const SparseDiagonalProductType& expr, Index outer)
-      : Base(expr.rhs().innerVector(outer) .cwiseProduct(expr.lhs().diagonal()), 0), m_outer(outer)
-    {}
-    
-    inline Index outer() const { return m_outer; }
-    inline Index col() const { return m_outer; }
-};
+    EIGEN_STRONG_INLINE Scalar value() const { return m_coeff * SparseXprInnerIterator::value(); }
 
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
-class sparse_diagonal_product_inner_iterator_selector
-<Lhs,Rhs,SparseDiagonalProductType,SDP_IsSparseColMajor,SDP_IsDiagonal>
-  : public CwiseUnaryOp<scalar_multiple_op<typename Rhs::Scalar>,const Lhs>::InnerIterator
-{
-    typedef typename CwiseUnaryOp<scalar_multiple_op<typename Rhs::Scalar>,const Lhs>::InnerIterator Base;
-    typedef typename Lhs::Index Index;
-  public:
-    inline sparse_diagonal_product_inner_iterator_selector(
-              const SparseDiagonalProductType& expr, Index outer)
-      : Base(expr.lhs()*expr.rhs().diagonal().coeff(outer), outer)
-    {}
-};
+   protected:
+    typename DiagonalCoeffType::Scalar m_coeff;
+  };
+
+  sparse_diagonal_product_evaluator(const SparseXprType &sparseXpr, const DiagonalCoeffType &diagCoeff)
+      : m_sparseXprImpl(sparseXpr), m_diagCoeffImpl(diagCoeff) {}
+
+  Index nonZerosEstimate() const { return m_sparseXprImpl.nonZerosEstimate(); }
 
-template<typename Lhs, typename Rhs, typename SparseDiagonalProductType>
-class sparse_diagonal_product_inner_iterator_selector
-<Lhs,Rhs,SparseDiagonalProductType,SDP_IsSparseRowMajor,SDP_IsDiagonal>
-  : public CwiseBinaryOp<
-      scalar_product_op<typename Rhs::Scalar>,
-      const typename Lhs::ConstInnerVectorReturnType,
-      const Transpose<const typename Rhs::DiagonalVectorType> >::InnerIterator
-{
-    typedef typename CwiseBinaryOp<
-      scalar_product_op<typename Rhs::Scalar>,
-      const typename Lhs::ConstInnerVectorReturnType,
-      const Transpose<const typename Rhs::DiagonalVectorType> >::InnerIterator Base;
-    typedef typename Lhs::Index Index;
-    Index m_outer;
-  public:
-    inline sparse_diagonal_product_inner_iterator_selector(
-              const SparseDiagonalProductType& expr, Index outer)
-      : Base(expr.lhs().innerVector(outer) .cwiseProduct(expr.rhs().diagonal().transpose()), 0), m_outer(outer)
-    {}
-    
-    inline Index outer() const { return m_outer; }
-    inline Index row() const { return m_outer; }
+ protected:
+  evaluator<SparseXprType> m_sparseXprImpl;
+  evaluator<DiagonalCoeffType> m_diagCoeffImpl;
 };
 
-} // end namespace internal
+template <typename SparseXprType, typename DiagCoeffType>
+struct sparse_diagonal_product_evaluator<SparseXprType, DiagCoeffType, SDP_AsCwiseProduct> {
+  typedef typename SparseXprType::Scalar Scalar;
+  typedef typename SparseXprType::StorageIndex StorageIndex;
 
-// SparseMatrixBase functions
+  typedef typename nested_eval<DiagCoeffType, SparseXprType::IsRowMajor ? SparseXprType::RowsAtCompileTime
+                                                                        : SparseXprType::ColsAtCompileTime>::type
+      DiagCoeffNested;
+
+  class InnerIterator {
+    typedef typename evaluator<SparseXprType>::InnerIterator SparseXprIter;
+
+   public:
+    InnerIterator(const sparse_diagonal_product_evaluator &xprEval, Index outer)
+        : m_sparseIter(xprEval.m_sparseXprEval, outer), m_diagCoeffNested(xprEval.m_diagCoeffNested) {}
+
+    inline Scalar value() const { return m_sparseIter.value() * m_diagCoeffNested.coeff(index()); }
+    inline StorageIndex index() const { return m_sparseIter.index(); }
+    inline Index outer() const { return m_sparseIter.outer(); }
+    inline Index col() const { return SparseXprType::IsRowMajor ? m_sparseIter.index() : m_sparseIter.outer(); }
+    inline Index row() const { return SparseXprType::IsRowMajor ? m_sparseIter.outer() : m_sparseIter.index(); }
+
+    EIGEN_STRONG_INLINE InnerIterator &operator++() {
+      ++m_sparseIter;
+      return *this;
+    }
+    inline operator bool() const { return m_sparseIter; }
+
+   protected:
+    SparseXprIter m_sparseIter;
+    DiagCoeffNested m_diagCoeffNested;
+  };
+
+  sparse_diagonal_product_evaluator(const SparseXprType &sparseXpr, const DiagCoeffType &diagCoeff)
+      : m_sparseXprEval(sparseXpr), m_diagCoeffNested(diagCoeff) {}
+
+  Index nonZerosEstimate() const { return m_sparseXprEval.nonZerosEstimate(); }
+
+ protected:
+  evaluator<SparseXprType> m_sparseXprEval;
+  DiagCoeffNested m_diagCoeffNested;
+};
 
-template<typename Derived>
-template<typename OtherDerived>
-const SparseDiagonalProduct<Derived,OtherDerived>
-SparseMatrixBase<Derived>::operator*(const DiagonalBase<OtherDerived> &other) const
-{
-  return SparseDiagonalProduct<Derived,OtherDerived>(this->derived(), other.derived());
-}
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSE_DIAGONAL_PRODUCT_H
+#endif  // EIGEN_SPARSE_DIAGONAL_PRODUCT_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseDot.h b/inst/include/Eigen/src/SparseCore/SparseDot.h
index db39c9ae..485605fd 100644
--- a/inst/include/Eigen/src/SparseCore/SparseDot.h
+++ b/inst/include/Eigen/src/SparseCore/SparseDot.h
@@ -10,64 +10,67 @@
 #ifndef EIGEN_SPARSE_DOT_H
 #define EIGEN_SPARSE_DOT_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-template<typename Derived>
-template<typename OtherDerived>
-typename internal::traits<Derived>::Scalar
-SparseMatrixBase<Derived>::dot(const MatrixBase<OtherDerived>& other) const
-{
+namespace Eigen {
+
+template <typename Derived>
+template <typename OtherDerived>
+inline typename internal::traits<Derived>::Scalar SparseMatrixBase<Derived>::dot(
+    const MatrixBase<OtherDerived>& other) const {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
-  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
-  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
-    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived, OtherDerived)
+  EIGEN_STATIC_ASSERT(
+      (internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
+      YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
 
   eigen_assert(size() == other.size());
-  eigen_assert(other.size()>0 && "you are using a non initialized vector");
+  eigen_assert(other.size() > 0 && "you are using a non initialized vector");
 
-  typename Derived::InnerIterator i(derived(),0);
-  Scalar res(0);
-  while (i)
-  {
-    res += numext::conj(i.value()) * other.coeff(i.index());
+  internal::evaluator<Derived> thisEval(derived());
+  typename internal::evaluator<Derived>::InnerIterator i(thisEval, 0);
+  // Two accumulators, which breaks the dependency chain on the accumulator
+  // and allows more instruction-level parallelism in the following loop.
+  Scalar res1(0);
+  Scalar res2(0);
+  for (; i; ++i) {
+    res1 = numext::madd<Scalar>(numext::conj(i.value()), other.coeff(i.index()), res1);
     ++i;
+    if (i) {
+      res2 = numext::madd<Scalar>(numext::conj(i.value()), other.coeff(i.index()), res2);
+    }
   }
-  return res;
+  return res1 + res2;
 }
 
-template<typename Derived>
-template<typename OtherDerived>
-typename internal::traits<Derived>::Scalar
-SparseMatrixBase<Derived>::dot(const SparseMatrixBase<OtherDerived>& other) const
-{
+template <typename Derived>
+template <typename OtherDerived>
+inline typename internal::traits<Derived>::Scalar SparseMatrixBase<Derived>::dot(
+    const SparseMatrixBase<OtherDerived>& other) const {
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
   EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
-  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived,OtherDerived)
-  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
-    YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived, OtherDerived)
+  EIGEN_STATIC_ASSERT(
+      (internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
+      YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
 
   eigen_assert(size() == other.size());
 
-  typedef typename Derived::Nested  Nested;
-  typedef typename OtherDerived::Nested  OtherNested;
-  typedef typename internal::remove_all<Nested>::type  NestedCleaned;
-  typedef typename internal::remove_all<OtherNested>::type  OtherNestedCleaned;
+  internal::evaluator<Derived> thisEval(derived());
+  typename internal::evaluator<Derived>::InnerIterator i(thisEval, 0);
 
-  Nested nthis(derived());
-  OtherNested nother(other.derived());
+  internal::evaluator<OtherDerived> otherEval(other.derived());
+  typename internal::evaluator<OtherDerived>::InnerIterator j(otherEval, 0);
 
-  typename NestedCleaned::InnerIterator i(nthis,0);
-  typename OtherNestedCleaned::InnerIterator j(nother,0);
   Scalar res(0);
-  while (i && j)
-  {
-    if (i.index()==j.index())
-    {
-      res += numext::conj(i.value()) * j.value();
-      ++i; ++j;
-    }
-    else if (i.index()<j.index())
+  while (i && j) {
+    if (i.index() == j.index()) {
+      res = numext::madd<Scalar>(numext::conj(i.value()), j.value(), res);
+      ++i;
+      ++j;
+    } else if (i.index() < j.index())
       ++i;
     else
       ++j;
@@ -75,27 +78,23 @@ SparseMatrixBase<Derived>::dot(const SparseMatrixBase<OtherDerived>& other) cons
   return res;
 }
 
-template<typename Derived>
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
-SparseMatrixBase<Derived>::squaredNorm() const
-{
+template <typename Derived>
+inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real SparseMatrixBase<Derived>::squaredNorm()
+    const {
   return numext::real((*this).cwiseAbs2().sum());
 }
 
-template<typename Derived>
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
-SparseMatrixBase<Derived>::norm() const
-{
+template <typename Derived>
+inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real SparseMatrixBase<Derived>::norm() const {
   using std::sqrt;
   return sqrt(squaredNorm());
 }
 
-template<typename Derived>
-inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real
-SparseMatrixBase<Derived>::blueNorm() const
-{
+template <typename Derived>
+inline typename NumTraits<typename internal::traits<Derived>::Scalar>::Real SparseMatrixBase<Derived>::blueNorm()
+    const {
   return internal::blueNorm_impl(*this);
 }
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSE_DOT_H
+#endif  // EIGEN_SPARSE_DOT_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseFuzzy.h b/inst/include/Eigen/src/SparseCore/SparseFuzzy.h
index 45f36e9e..2285845f 100644
--- a/inst/include/Eigen/src/SparseCore/SparseFuzzy.h
+++ b/inst/include/Eigen/src/SparseCore/SparseFuzzy.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,17 +10,22 @@
 #ifndef EIGEN_SPARSE_FUZZY_H
 #define EIGEN_SPARSE_FUZZY_H
 
-// template<typename Derived>
-// template<typename OtherDerived>
-// bool SparseMatrixBase<Derived>::isApprox(
-//   const OtherDerived& other,
-//   typename NumTraits<Scalar>::Real prec
-// ) const
-// {
-//   const typename internal::nested<Derived,2>::type nested(derived());
-//   const typename internal::nested<OtherDerived,2>::type otherNested(other.derived());
-//   return    (nested - otherNested).cwise().abs2().sum()
-//          <= prec * prec * (std::min)(nested.cwise().abs2().sum(), otherNested.cwise().abs2().sum());
-// }
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-#endif // EIGEN_SPARSE_FUZZY_H
+namespace Eigen {
+
+template <typename Derived>
+template <typename OtherDerived>
+bool SparseMatrixBase<Derived>::isApprox(const SparseMatrixBase<OtherDerived>& other, const RealScalar& prec) const {
+  const typename internal::nested_eval<Derived, 2, PlainObject>::type actualA(derived());
+  std::conditional_t<bool(IsRowMajor) == bool(OtherDerived::IsRowMajor),
+                     const typename internal::nested_eval<OtherDerived, 2, PlainObject>::type, const PlainObject>
+      actualB(other.derived());
+
+  return (actualA - actualB).squaredNorm() <= prec * prec * numext::mini(actualA.squaredNorm(), actualB.squaredNorm());
+}
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPARSE_FUZZY_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseMap.h b/inst/include/Eigen/src/SparseCore/SparseMap.h
new file mode 100644
index 00000000..73e29c7b
--- /dev/null
+++ b/inst/include/Eigen/src/SparseCore/SparseMap.h
@@ -0,0 +1,295 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_MAP_H
+#define EIGEN_SPARSE_MAP_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct traits<Map<SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType> >
+    : public traits<SparseMatrix<MatScalar, MatOptions, MatIndex> > {
+  typedef SparseMatrix<MatScalar, MatOptions, MatIndex> PlainObjectType;
+  typedef traits<PlainObjectType> TraitsBase;
+  enum { Flags = TraitsBase::Flags & (~NestByRefBit) };
+};
+
+template <typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct traits<Map<const SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType> >
+    : public traits<SparseMatrix<MatScalar, MatOptions, MatIndex> > {
+  typedef SparseMatrix<MatScalar, MatOptions, MatIndex> PlainObjectType;
+  typedef traits<PlainObjectType> TraitsBase;
+  enum { Flags = TraitsBase::Flags & (~(NestByRefBit | LvalueBit)) };
+};
+
+}  // end namespace internal
+
+template <typename Derived,
+          int Level = internal::accessors_level<Derived>::has_write_access ? WriteAccessors : ReadOnlyAccessors>
+class SparseMapBase;
+
+/** \ingroup SparseCore_Module
+ * class SparseMapBase
+ * \brief Common base class for Map and Ref instance of sparse matrix and vector.
+ */
+template <typename Derived>
+class SparseMapBase<Derived, ReadOnlyAccessors> : public SparseCompressedBase<Derived> {
+ public:
+  typedef SparseCompressedBase<Derived> Base;
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::StorageIndex StorageIndex;
+  enum { IsRowMajor = Base::IsRowMajor };
+  using Base::operator=;
+
+ protected:
+  typedef std::conditional_t<bool(internal::is_lvalue<Derived>::value), Scalar*, const Scalar*> ScalarPointer;
+  typedef std::conditional_t<bool(internal::is_lvalue<Derived>::value), StorageIndex*, const StorageIndex*>
+      IndexPointer;
+
+  Index m_outerSize;
+  Index m_innerSize;
+  Array<StorageIndex, 2, 1> m_zero_nnz;
+  IndexPointer m_outerIndex;
+  IndexPointer m_innerIndices;
+  ScalarPointer m_values;
+  IndexPointer m_innerNonZeros;
+
+ public:
+  /** \copydoc SparseMatrixBase::rows() */
+  inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
+  /** \copydoc SparseMatrixBase::cols() */
+  inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; }
+  /** \copydoc SparseMatrixBase::innerSize() */
+  inline Index innerSize() const { return m_innerSize; }
+  /** \copydoc SparseMatrixBase::outerSize() */
+  inline Index outerSize() const { return m_outerSize; }
+  /** \copydoc SparseCompressedBase::nonZeros */
+  inline Index nonZeros() const { return m_zero_nnz[1]; }
+
+  /** \copydoc SparseCompressedBase::isCompressed */
+  bool isCompressed() const { return m_innerNonZeros == 0; }
+
+  //----------------------------------------
+  // direct access interface
+  /** \copydoc SparseMatrix::valuePtr */
+  inline const Scalar* valuePtr() const { return m_values; }
+  /** \copydoc SparseMatrix::innerIndexPtr */
+  inline const StorageIndex* innerIndexPtr() const { return m_innerIndices; }
+  /** \copydoc SparseMatrix::outerIndexPtr */
+  inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; }
+  /** \copydoc SparseMatrix::innerNonZeroPtr */
+  inline const StorageIndex* innerNonZeroPtr() const { return m_innerNonZeros; }
+  //----------------------------------------
+
+  /** \copydoc SparseMatrix::coeff */
+  inline Scalar coeff(Index row, Index col) const {
+    const Index outer = IsRowMajor ? row : col;
+    const Index inner = IsRowMajor ? col : row;
+
+    Index start = m_outerIndex[outer];
+    Index end = isCompressed() ? m_outerIndex[outer + 1] : start + m_innerNonZeros[outer];
+    if (start == end)
+      return Scalar(0);
+    else if (end > 0 && inner == m_innerIndices[end - 1])
+      return m_values[end - 1];
+    // ^^  optimization: let's first check if it is the last coefficient
+    // (very common in high level algorithms)
+
+    const StorageIndex* r = std::lower_bound(&m_innerIndices[start], &m_innerIndices[end - 1], inner);
+    const Index id = r - &m_innerIndices[0];
+    return ((*r == inner) && (id < end)) ? m_values[id] : Scalar(0);
+  }
+
+  inline SparseMapBase(Index rows, Index cols, Index nnz, IndexPointer outerIndexPtr, IndexPointer innerIndexPtr,
+                       ScalarPointer valuePtr, IndexPointer innerNonZerosPtr = 0)
+      : m_outerSize(IsRowMajor ? rows : cols),
+        m_innerSize(IsRowMajor ? cols : rows),
+        m_zero_nnz(0, internal::convert_index<StorageIndex>(nnz)),
+        m_outerIndex(outerIndexPtr),
+        m_innerIndices(innerIndexPtr),
+        m_values(valuePtr),
+        m_innerNonZeros(innerNonZerosPtr) {}
+
+  // for vectors
+  inline SparseMapBase(Index size, Index nnz, IndexPointer innerIndexPtr, ScalarPointer valuePtr)
+      : m_outerSize(1),
+        m_innerSize(size),
+        m_zero_nnz(0, internal::convert_index<StorageIndex>(nnz)),
+        m_outerIndex(m_zero_nnz.data()),
+        m_innerIndices(innerIndexPtr),
+        m_values(valuePtr),
+        m_innerNonZeros(0) {}
+
+  /** Empty destructor */
+  inline ~SparseMapBase() {}
+
+ protected:
+  inline SparseMapBase() {}
+};
+
+/** \ingroup SparseCore_Module
+ * class SparseMapBase
+ * \brief Common base class for writable Map and Ref instance of sparse matrix and vector.
+ */
+template <typename Derived>
+class SparseMapBase<Derived, WriteAccessors> : public SparseMapBase<Derived, ReadOnlyAccessors> {
+  typedef MapBase<Derived, ReadOnlyAccessors> ReadOnlyMapBase;
+
+ public:
+  typedef SparseMapBase<Derived, ReadOnlyAccessors> Base;
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::StorageIndex StorageIndex;
+  enum { IsRowMajor = Base::IsRowMajor };
+
+  using Base::operator=;
+
+ public:
+  //----------------------------------------
+  // direct access interface
+  using Base::innerIndexPtr;
+  using Base::innerNonZeroPtr;
+  using Base::outerIndexPtr;
+  using Base::valuePtr;
+  /** \copydoc SparseMatrix::valuePtr */
+  inline Scalar* valuePtr() { return Base::m_values; }
+  /** \copydoc SparseMatrix::innerIndexPtr */
+  inline StorageIndex* innerIndexPtr() { return Base::m_innerIndices; }
+  /** \copydoc SparseMatrix::outerIndexPtr */
+  inline StorageIndex* outerIndexPtr() { return Base::m_outerIndex; }
+  /** \copydoc SparseMatrix::innerNonZeroPtr */
+  inline StorageIndex* innerNonZeroPtr() { return Base::m_innerNonZeros; }
+  //----------------------------------------
+
+  /** \copydoc SparseMatrix::coeffRef */
+  inline Scalar& coeffRef(Index row, Index col) {
+    const Index outer = IsRowMajor ? row : col;
+    const Index inner = IsRowMajor ? col : row;
+
+    Index start = Base::m_outerIndex[outer];
+    Index end = Base::isCompressed() ? Base::m_outerIndex[outer + 1] : start + Base::m_innerNonZeros[outer];
+    eigen_assert(end >= start && "you probably called coeffRef on a non finalized matrix");
+    eigen_assert(end > start && "coeffRef cannot be called on a zero coefficient");
+    StorageIndex* r = std::lower_bound(&Base::m_innerIndices[start], &Base::m_innerIndices[end], inner);
+    const Index id = r - &Base::m_innerIndices[0];
+    eigen_assert((*r == inner) && (id < end) && "coeffRef cannot be called on a zero coefficient");
+    return const_cast<Scalar*>(Base::m_values)[id];
+  }
+
+  inline SparseMapBase(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr,
+                       Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0)
+      : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) {}
+
+  // for vectors
+  inline SparseMapBase(Index size, Index nnz, StorageIndex* innerIndexPtr, Scalar* valuePtr)
+      : Base(size, nnz, innerIndexPtr, valuePtr) {}
+
+  /** Empty destructor */
+  inline ~SparseMapBase() {}
+
+ protected:
+  inline SparseMapBase() {}
+};
+
+/** \ingroup SparseCore_Module
+ *
+ * \brief Specialization of class Map for SparseMatrix-like storage.
+ *
+ * \tparam SparseMatrixType the equivalent sparse matrix type of the referenced data, it must be a template instance of
+ * class SparseMatrix.
+ *
+ * \sa class Map, class SparseMatrix, class Ref<SparseMatrixType,Options>
+ */
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template <typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Map<SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType>
+    : public SparseMapBase<Map<SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType> >
+#else
+template <typename SparseMatrixType>
+class Map<SparseMatrixType> : public SparseMapBase<Derived, WriteAccessors>
+#endif
+{
+ public:
+  typedef SparseMapBase<Map> Base;
+  EIGEN_SPARSE_PUBLIC_INTERFACE(Map)
+  enum { IsRowMajor = Base::IsRowMajor };
+
+ public:
+  /** Constructs a read-write Map to a sparse matrix of size \a rows x \a cols, containing \a nnz non-zero coefficients,
+   * stored as a sparse format as defined by the pointers \a outerIndexPtr, \a innerIndexPtr, and \a valuePtr.
+   * If the optional parameter \a innerNonZerosPtr is the null pointer, then a standard compressed format is assumed.
+   * The inner indices must be sorted appropriately.
+   *
+   * This constructor is available only if \c SparseMatrixType is non-const.
+   *
+   * More details on the expected storage schemes are given in the \ref TutorialSparse "manual pages".
+   */
+  inline Map(Index rows, Index cols, Index nnz, StorageIndex* outerIndexPtr, StorageIndex* innerIndexPtr,
+             Scalar* valuePtr, StorageIndex* innerNonZerosPtr = 0)
+      : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) {}
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  /** Empty destructor */
+  inline ~Map() {}
+};
+
+template <typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Map<const SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType>
+    : public SparseMapBase<Map<const SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType> > {
+ public:
+  typedef SparseMapBase<Map> Base;
+  EIGEN_SPARSE_PUBLIC_INTERFACE(Map)
+  enum { IsRowMajor = Base::IsRowMajor };
+
+ public:
+#endif
+  /** This is the const version of the above constructor.
+   *
+   * This constructor is available only if \c SparseMatrixType is const, e.g.:
+   * \code Map<const SparseMatrix<double> >  \endcode
+   */
+  inline Map(Index rows, Index cols, Index nnz, const StorageIndex* outerIndexPtr, const StorageIndex* innerIndexPtr,
+             const Scalar* valuePtr, const StorageIndex* innerNonZerosPtr = 0)
+      : Base(rows, cols, nnz, outerIndexPtr, innerIndexPtr, valuePtr, innerNonZerosPtr) {}
+
+  /** Empty destructor */
+  inline ~Map() {}
+};
+
+namespace internal {
+
+template <typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Map<SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType> >
+    : evaluator<SparseCompressedBase<Map<SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType> > > {
+  typedef evaluator<SparseCompressedBase<Map<SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType> > >
+      Base;
+  typedef Map<SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType> XprType;
+  evaluator() : Base() {}
+  explicit evaluator(const XprType& mat) : Base(mat) {}
+};
+
+template <typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Map<const SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType> >
+    : evaluator<SparseCompressedBase<Map<const SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType> > > {
+  typedef evaluator<
+      SparseCompressedBase<Map<const SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType> > >
+      Base;
+  typedef Map<const SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType> XprType;
+  evaluator() : Base() {}
+  explicit evaluator(const XprType& mat) : Base(mat) {}
+};
+
+}  // namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPARSE_MAP_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseMatrix.h b/inst/include/Eigen/src/SparseCore/SparseMatrix.h
index 2ff20155..8fcdfdf1 100644
--- a/inst/include/Eigen/src/SparseCore/SparseMatrix.h
+++ b/inst/include/Eigen/src/SparseCore/SparseMatrix.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,40 +10,47 @@
 #ifndef EIGEN_SPARSEMATRIX_H
 #define EIGEN_SPARSEMATRIX_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \ingroup SparseCore_Module
-  *
-  * \class SparseMatrix
-  *
-  * \brief A versatible sparse matrix representation
-  *
-  * This class implements a more versatile variants of the common \em compressed row/column storage format.
-  * Each colmun's (resp. row) non zeros are stored as a pair of value with associated row (resp. colmiun) index.
-  * All the non zeros are stored in a single large buffer. Unlike the \em compressed format, there might be extra
-  * space inbetween the nonzeros of two successive colmuns (resp. rows) such that insertion of new non-zero
-  * can be done with limited memory reallocation and copies.
-  *
-  * A call to the function makeCompressed() turns the matrix into the standard \em compressed format
-  * compatible with many library.
-  *
-  * More details on this storage sceheme are given in the \ref TutorialSparse "manual pages".
-  *
-  * \tparam _Scalar the scalar type, i.e. the type of the coefficients
-  * \tparam _Options Union of bit flags controlling the storage scheme. Currently the only possibility
-  *                 is ColMajor or RowMajor. The default is 0 which means column-major.
-  * \tparam _Index the type of the indices. It has to be a \b signed type (e.g., short, int, std::ptrdiff_t). Default is \c int.
-  *
-  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_SPARSEMATRIX_PLUGIN.
-  */
+ *
+ * \class SparseMatrix
+ *
+ * \brief A versatible sparse matrix representation
+ *
+ * This class implements a more versatile variants of the common \em compressed row/column storage format.
+ * Each colmun's (resp. row) non zeros are stored as a pair of value with associated row (resp. colmiun) index.
+ * All the non zeros are stored in a single large buffer. Unlike the \em compressed format, there might be extra
+ * space in between the nonzeros of two successive colmuns (resp. rows) such that insertion of new non-zero
+ * can be done with limited memory reallocation and copies.
+ *
+ * A call to the function makeCompressed() turns the matrix into the standard \em compressed format
+ * compatible with many library.
+ *
+ * More details on this storage sceheme are given in the \ref TutorialSparse "manual pages".
+ *
+ * \tparam Scalar_ the scalar type, i.e. the type of the coefficients
+ * \tparam Options_ Union of bit flags controlling the storage scheme. Currently the only possibility
+ *                 is ColMajor or RowMajor. The default is 0 which means column-major.
+ * \tparam StorageIndex_ the type of the indices. It has to be a \b signed type (e.g., short, int, std::ptrdiff_t).
+ * Default is \c int.
+ *
+ * \warning In %Eigen 3.2, the undocumented type \c SparseMatrix::Index was improperly defined as the storage index type
+ * (e.g., int), whereas it is now (starting from %Eigen 3.3) deprecated and always defined as Eigen::Index. Codes making
+ * use of \c SparseMatrix::Index, might thus likely have to be changed to use \c SparseMatrix::StorageIndex instead.
+ *
+ * This class can be extended with the help of the plugin mechanism described on the page
+ * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_SPARSEMATRIX_PLUGIN.
+ */
 
 namespace internal {
-template<typename _Scalar, int _Options, typename _Index>
-struct traits<SparseMatrix<_Scalar, _Options, _Index> >
-{
-  typedef _Scalar Scalar;
-  typedef _Index Index;
+template <typename Scalar_, int Options_, typename StorageIndex_>
+struct traits<SparseMatrix<Scalar_, Options_, StorageIndex_>> {
+  typedef Scalar_ Scalar;
+  typedef StorageIndex_ StorageIndex;
   typedef Sparse StorageKind;
   typedef MatrixXpr XprKind;
   enum {
@@ -51,22 +58,21 @@ struct traits<SparseMatrix<_Scalar, _Options, _Index> >
     ColsAtCompileTime = Dynamic,
     MaxRowsAtCompileTime = Dynamic,
     MaxColsAtCompileTime = Dynamic,
-    Flags = _Options | NestByRefBit | LvalueBit,
-    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    Options = Options_,
+    Flags = Options_ | NestByRefBit | LvalueBit | CompressedAccessBit,
     SupportedAccessPatterns = InnerRandomAccessPattern
   };
 };
 
-template<typename _Scalar, int _Options, typename _Index, int DiagIndex>
-struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _Index>, DiagIndex> >
-{
-  typedef SparseMatrix<_Scalar, _Options, _Index> MatrixType;
-  typedef typename nested<MatrixType>::type MatrixTypeNested;
-  typedef typename remove_reference<MatrixTypeNested>::type _MatrixTypeNested;
+template <typename Scalar_, int Options_, typename StorageIndex_, int DiagIndex>
+struct traits<Diagonal<SparseMatrix<Scalar_, Options_, StorageIndex_>, DiagIndex>> {
+  typedef SparseMatrix<Scalar_, Options_, StorageIndex_> MatrixType;
+  typedef typename ref_selector<MatrixType>::type MatrixTypeNested;
+  typedef std::remove_reference_t<MatrixTypeNested> MatrixTypeNested_;
 
-  typedef _Scalar Scalar;
+  typedef Scalar_ Scalar;
   typedef Dense StorageKind;
-  typedef _Index Index;
+  typedef StorageIndex_ StorageIndex;
   typedef MatrixXpr XprKind;
 
   enum {
@@ -74,793 +80,1036 @@ struct traits<Diagonal<const SparseMatrix<_Scalar, _Options, _Index>, DiagIndex>
     ColsAtCompileTime = 1,
     MaxRowsAtCompileTime = Dynamic,
     MaxColsAtCompileTime = 1,
-    Flags = 0,
-    CoeffReadCost = _MatrixTypeNested::CoeffReadCost*10
+    Flags = LvalueBit
   };
 };
 
-} // end namespace internal
-
-template<typename _Scalar, int _Options, typename _Index>
-class SparseMatrix
-  : public SparseMatrixBase<SparseMatrix<_Scalar, _Options, _Index> >
-{
-  public:
-    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix)
-    EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, +=)
-    EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseMatrix, -=)
-
-    typedef MappedSparseMatrix<Scalar,Flags> Map;
-    using Base::IsRowMajor;
-    typedef internal::CompressedStorage<Scalar,Index> Storage;
-    enum {
-      Options = _Options
-    };
-
-  protected:
-
-    typedef SparseMatrix<Scalar,(Flags&~RowMajorBit)|(IsRowMajor?RowMajorBit:0)> TransposedSparseMatrix;
-
-    Index m_outerSize;
-    Index m_innerSize;
-    Index* m_outerIndex;
-    Index* m_innerNonZeros;     // optional, if null then the data is compressed
-    Storage m_data;
-    
-    Eigen::Map<Matrix<Index,Dynamic,1> > innerNonZeros() { return Eigen::Map<Matrix<Index,Dynamic,1> >(m_innerNonZeros, m_innerNonZeros?m_outerSize:0); }
-    const  Eigen::Map<const Matrix<Index,Dynamic,1> > innerNonZeros() const { return Eigen::Map<const Matrix<Index,Dynamic,1> >(m_innerNonZeros, m_innerNonZeros?m_outerSize:0); }
-
-  public:
-    
-    /** \returns whether \c *this is in compressed form. */
-    inline bool isCompressed() const { return m_innerNonZeros==0; }
-
-    /** \returns the number of rows of the matrix */
-    inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
-    /** \returns the number of columns of the matrix */
-    inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; }
-
-    /** \returns the number of rows (resp. columns) of the matrix if the storage order column major (resp. row major) */
-    inline Index innerSize() const { return m_innerSize; }
-    /** \returns the number of columns (resp. rows) of the matrix if the storage order column major (resp. row major) */
-    inline Index outerSize() const { return m_outerSize; }
-    
-    /** \returns a const pointer to the array of values.
-      * This function is aimed at interoperability with other libraries.
-      * \sa innerIndexPtr(), outerIndexPtr() */
-    inline const Scalar* valuePtr() const { return &m_data.value(0); }
-    /** \returns a non-const pointer to the array of values.
-      * This function is aimed at interoperability with other libraries.
-      * \sa innerIndexPtr(), outerIndexPtr() */
-    inline Scalar* valuePtr() { return &m_data.value(0); }
-
-    /** \returns a const pointer to the array of inner indices.
-      * This function is aimed at interoperability with other libraries.
-      * \sa valuePtr(), outerIndexPtr() */
-    inline const Index* innerIndexPtr() const { return &m_data.index(0); }
-    /** \returns a non-const pointer to the array of inner indices.
-      * This function is aimed at interoperability with other libraries.
-      * \sa valuePtr(), outerIndexPtr() */
-    inline Index* innerIndexPtr() { return &m_data.index(0); }
-
-    /** \returns a const pointer to the array of the starting positions of the inner vectors.
-      * This function is aimed at interoperability with other libraries.
-      * \sa valuePtr(), innerIndexPtr() */
-    inline const Index* outerIndexPtr() const { return m_outerIndex; }
-    /** \returns a non-const pointer to the array of the starting positions of the inner vectors.
-      * This function is aimed at interoperability with other libraries.
-      * \sa valuePtr(), innerIndexPtr() */
-    inline Index* outerIndexPtr() { return m_outerIndex; }
-
-    /** \returns a const pointer to the array of the number of non zeros of the inner vectors.
-      * This function is aimed at interoperability with other libraries.
-      * \warning it returns the null pointer 0 in compressed mode */
-    inline const Index* innerNonZeroPtr() const { return m_innerNonZeros; }
-    /** \returns a non-const pointer to the array of the number of non zeros of the inner vectors.
-      * This function is aimed at interoperability with other libraries.
-      * \warning it returns the null pointer 0 in compressed mode */
-    inline Index* innerNonZeroPtr() { return m_innerNonZeros; }
-
-    /** \internal */
-    inline Storage& data() { return m_data; }
-    /** \internal */
-    inline const Storage& data() const { return m_data; }
-
-    /** \returns the value of the matrix at position \a i, \a j
-      * This function returns Scalar(0) if the element is an explicit \em zero */
-    inline Scalar coeff(Index row, Index col) const
-    {
-      eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
-      
-      const Index outer = IsRowMajor ? row : col;
-      const Index inner = IsRowMajor ? col : row;
-      Index end = m_innerNonZeros ? m_outerIndex[outer] + m_innerNonZeros[outer] : m_outerIndex[outer+1];
-      return m_data.atInRange(m_outerIndex[outer], end, inner);
-    }
+template <typename Scalar_, int Options_, typename StorageIndex_, int DiagIndex>
+struct traits<Diagonal<const SparseMatrix<Scalar_, Options_, StorageIndex_>, DiagIndex>>
+    : public traits<Diagonal<SparseMatrix<Scalar_, Options_, StorageIndex_>, DiagIndex>> {
+  enum { Flags = 0 };
+};
 
-    /** \returns a non-const reference to the value of the matrix at position \a i, \a j
-      *
-      * If the element does not exist then it is inserted via the insert(Index,Index) function
-      * which itself turns the matrix into a non compressed form if that was not the case.
-      *
-      * This is a O(log(nnz_j)) operation (binary search) plus the cost of insert(Index,Index)
-      * function if the element does not already exist.
-      */
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
-      
-      const Index outer = IsRowMajor ? row : col;
-      const Index inner = IsRowMajor ? col : row;
-
-      Index start = m_outerIndex[outer];
-      Index end = m_innerNonZeros ? m_outerIndex[outer] + m_innerNonZeros[outer] : m_outerIndex[outer+1];
-      eigen_assert(end>=start && "you probably called coeffRef on a non finalized matrix");
-      if(end<=start)
-        return insert(row,col);
-      const Index p = m_data.searchLowerIndex(start,end-1,inner);
-      if((p<end) && (m_data.index(p)==inner))
-        return m_data.value(p);
-      else
-        return insert(row,col);
-    }
+template <typename StorageIndex>
+struct sparse_reserve_op {
+  EIGEN_DEVICE_FUNC sparse_reserve_op(Index begin, Index end, Index size) {
+    Index range = numext::mini(end - begin, size);
+    m_begin = begin;
+    m_end = begin + range;
+    m_val = StorageIndex(size / range);
+    m_remainder = StorageIndex(size % range);
+  }
+  template <typename IndexType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageIndex operator()(IndexType i) const {
+    if ((i >= m_begin) && (i < m_end))
+      return m_val + ((i - m_begin) < m_remainder ? 1 : 0);
+    else
+      return 0;
+  }
+  StorageIndex m_val, m_remainder;
+  Index m_begin, m_end;
+};
 
-    /** \returns a reference to a novel non zero coefficient with coordinates \a row x \a col.
-      * The non zero coefficient must \b not already exist.
-      *
-      * If the matrix \c *this is in compressed mode, then \c *this is turned into uncompressed
-      * mode while reserving room for 2 non zeros per inner vector. It is strongly recommended to first
-      * call reserve(const SizesType &) to reserve a more appropriate number of elements per
-      * inner vector that better match your scenario.
-      *
-      * This function performs a sorted insertion in O(1) if the elements of each inner vector are
-      * inserted in increasing inner index order, and in O(nnz_j) for a random insertion.
-      *
-      */
-    Scalar& insert(Index row, Index col)
-    {
-      eigen_assert(row>=0 && row<rows() && col>=0 && col<cols());
-      
-      if(isCompressed())
-      {
-        reserve(Matrix<Index,Dynamic,1>::Constant(outerSize(), 2));
+template <typename Scalar>
+struct functor_traits<sparse_reserve_op<Scalar>> {
+  enum { Cost = 1, PacketAccess = false, IsRepeatable = true };
+};
+
+}  // end namespace internal
+
+template <typename Scalar_, int Options_, typename StorageIndex_>
+class SparseMatrix : public SparseCompressedBase<SparseMatrix<Scalar_, Options_, StorageIndex_>> {
+  typedef SparseCompressedBase<SparseMatrix> Base;
+  using Base::convert_index;
+  friend class SparseVector<Scalar_, 0, StorageIndex_>;
+  template <typename, typename, typename, typename, typename>
+  friend struct internal::Assignment;
+
+ public:
+  using Base::isCompressed;
+  using Base::nonZeros;
+  EIGEN_SPARSE_PUBLIC_INTERFACE(SparseMatrix)
+  using Base::operator+=;
+  using Base::operator-=;
+
+  typedef Eigen::Map<SparseMatrix<Scalar, Options_, StorageIndex>> Map;
+  typedef Diagonal<SparseMatrix> DiagonalReturnType;
+  typedef Diagonal<const SparseMatrix> ConstDiagonalReturnType;
+  typedef typename Base::InnerIterator InnerIterator;
+  typedef typename Base::ReverseInnerIterator ReverseInnerIterator;
+
+  using Base::IsRowMajor;
+  typedef internal::CompressedStorage<Scalar, StorageIndex> Storage;
+  enum { Options = Options_ };
+
+  typedef typename Base::IndexVector IndexVector;
+  typedef typename Base::ScalarVector ScalarVector;
+
+ protected:
+  typedef SparseMatrix<Scalar, IsRowMajor ? ColMajor : RowMajor, StorageIndex> TransposedSparseMatrix;
+
+  Index m_outerSize;
+  Index m_innerSize;
+  StorageIndex* m_outerIndex;
+  StorageIndex* m_innerNonZeros;  // optional, if null then the data is compressed
+  Storage m_data;
+
+ public:
+  /** \returns the number of rows of the matrix */
+  inline Index rows() const { return IsRowMajor ? m_outerSize : m_innerSize; }
+  /** \returns the number of columns of the matrix */
+  inline Index cols() const { return IsRowMajor ? m_innerSize : m_outerSize; }
+
+  /** \returns the number of rows (resp. columns) of the matrix if the storage order column major (resp. row major) */
+  inline Index innerSize() const { return m_innerSize; }
+  /** \returns the number of columns (resp. rows) of the matrix if the storage order column major (resp. row major) */
+  inline Index outerSize() const { return m_outerSize; }
+
+  /** \returns a const pointer to the array of values.
+   * This function is aimed at interoperability with other libraries.
+   * \sa innerIndexPtr(), outerIndexPtr() */
+  inline const Scalar* valuePtr() const { return m_data.valuePtr(); }
+  /** \returns a non-const pointer to the array of values.
+   * This function is aimed at interoperability with other libraries.
+   * \sa innerIndexPtr(), outerIndexPtr() */
+  inline Scalar* valuePtr() { return m_data.valuePtr(); }
+
+  /** \returns a const pointer to the array of inner indices.
+   * This function is aimed at interoperability with other libraries.
+   * \sa valuePtr(), outerIndexPtr() */
+  inline const StorageIndex* innerIndexPtr() const { return m_data.indexPtr(); }
+  /** \returns a non-const pointer to the array of inner indices.
+   * This function is aimed at interoperability with other libraries.
+   * \sa valuePtr(), outerIndexPtr() */
+  inline StorageIndex* innerIndexPtr() { return m_data.indexPtr(); }
+
+  /** \returns a const pointer to the array of the starting positions of the inner vectors.
+   * This function is aimed at interoperability with other libraries.
+   * \sa valuePtr(), innerIndexPtr() */
+  inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; }
+  /** \returns a non-const pointer to the array of the starting positions of the inner vectors.
+   * This function is aimed at interoperability with other libraries.
+   * \sa valuePtr(), innerIndexPtr() */
+  inline StorageIndex* outerIndexPtr() { return m_outerIndex; }
+
+  /** \returns a const pointer to the array of the number of non zeros of the inner vectors.
+   * This function is aimed at interoperability with other libraries.
+   * \warning it returns the null pointer 0 in compressed mode */
+  inline const StorageIndex* innerNonZeroPtr() const { return m_innerNonZeros; }
+  /** \returns a non-const pointer to the array of the number of non zeros of the inner vectors.
+   * This function is aimed at interoperability with other libraries.
+   * \warning it returns the null pointer 0 in compressed mode */
+  inline StorageIndex* innerNonZeroPtr() { return m_innerNonZeros; }
+
+  /** \internal */
+  constexpr Storage& data() { return m_data; }
+  /** \internal */
+  constexpr const Storage& data() const { return m_data; }
+
+  /** \returns the value of the matrix at position \a i, \a j
+   * This function returns Scalar(0) if the element is an explicit \em zero */
+  inline Scalar coeff(Index row, Index col) const {
+    eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
+
+    const Index outer = IsRowMajor ? row : col;
+    const Index inner = IsRowMajor ? col : row;
+    Index end = m_innerNonZeros ? m_outerIndex[outer] + m_innerNonZeros[outer] : m_outerIndex[outer + 1];
+    return m_data.atInRange(m_outerIndex[outer], end, inner);
+  }
+
+  /** \returns a non-const reference to the value of the matrix at position \a i, \a j.
+   *
+   * If the element does not exist then it is inserted via the insert(Index,Index) function
+   * which itself turns the matrix into a non compressed form if that was not the case.
+   * The output parameter `inserted` is set to true.
+   *
+   * Otherwise, if the element does exist, `inserted` will be set to false.
+   *
+   * This is a O(log(nnz_j)) operation (binary search) plus the cost of insert(Index,Index)
+   * function if the element does not already exist.
+   */
+  inline Scalar& findOrInsertCoeff(Index row, Index col, bool* inserted) {
+    eigen_assert(row >= 0 && row < rows() && col >= 0 && col < cols());
+    const Index outer = IsRowMajor ? row : col;
+    const Index inner = IsRowMajor ? col : row;
+    Index start = m_outerIndex[outer];
+    Index end = isCompressed() ? m_outerIndex[outer + 1] : m_outerIndex[outer] + m_innerNonZeros[outer];
+    eigen_assert(end >= start && "you probably called coeffRef on a non finalized matrix");
+    Index dst = start == end ? end : m_data.searchLowerIndex(start, end, inner);
+    if (dst == end) {
+      Index capacity = m_outerIndex[outer + 1] - end;
+      if (capacity > 0) {
+        // implies uncompressed: push to back of vector
+        m_innerNonZeros[outer]++;
+        m_data.index(end) = StorageIndex(inner);
+        m_data.value(end) = Scalar(0);
+        if (inserted != nullptr) {
+          *inserted = true;
+        }
+        return m_data.value(end);
+      }
+    }
+    if ((dst < end) && (m_data.index(dst) == inner)) {
+      // this coefficient exists, return a reference to it
+      if (inserted != nullptr) {
+        *inserted = false;
+      }
+      return m_data.value(dst);
+    } else {
+      if (inserted != nullptr) {
+        *inserted = true;
       }
-      return insertUncompressed(row,col);
+      // insertion will require reconfiguring the buffer
+      return insertAtByOuterInner(outer, inner, dst);
     }
+  }
 
-  public:
+  /** \returns a non-const reference to the value of the matrix at position \a i, \a j
+   *
+   * If the element does not exist then it is inserted via the insert(Index,Index) function
+   * which itself turns the matrix into a non compressed form if that was not the case.
+   *
+   * This is a O(log(nnz_j)) operation (binary search) plus the cost of insert(Index,Index)
+   * function if the element does not already exist.
+   */
+  inline Scalar& coeffRef(Index row, Index col) { return findOrInsertCoeff(row, col, nullptr); }
+
+  /** \returns a reference to a novel non zero coefficient with coordinates \a row x \a col.
+   * The non zero coefficient must \b not already exist.
+   *
+   * If the matrix \c *this is in compressed mode, then \c *this is turned into uncompressed
+   * mode while reserving room for 2 x this->innerSize() non zeros if reserve(Index) has not been called earlier.
+   * In this case, the insertion procedure is optimized for a \e sequential insertion mode where elements are assumed to
+   * be inserted by increasing outer-indices.
+   *
+   * If that's not the case, then it is strongly recommended to either use a triplet-list to assemble the matrix, or to
+   * first call reserve(const SizesType &) to reserve the appropriate number of non-zero elements per inner vector.
+   *
+   * Assuming memory has been appropriately reserved, this function performs a sorted insertion in O(1)
+   * if the elements of each inner vector are inserted in increasing inner index order, and in O(nnz_j) for a random
+   * insertion.
+   *
+   */
+  inline Scalar& insert(Index row, Index col);
+
+ public:
+  /** Removes all non zeros but keep allocated memory
+   *
+   * This function does not free the currently allocated memory. To release as much as memory as possible,
+   * call \code mat.data().squeeze(); \endcode after resizing it.
+   *
+   * \sa resize(Index,Index), data()
+   */
+  inline void setZero() {
+    m_data.clear();
+    using std::fill_n;
+    fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0));
+    if (m_innerNonZeros) {
+      fill_n(m_innerNonZeros, m_outerSize, StorageIndex(0));
+    }
+  }
 
-    class InnerIterator;
-    class ReverseInnerIterator;
+  /** Preallocates \a reserveSize non zeros.
+   *
+   * Precondition: the matrix must be in compressed mode. */
+  inline void reserve(Index reserveSize) {
+    eigen_assert(isCompressed() && "This function does not make sense in non compressed mode.");
+    m_data.reserve(reserveSize);
+  }
 
-    /** Removes all non zeros but keep allocated memory */
-    inline void setZero()
-    {
-      m_data.clear();
-      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(Index));
-      if(m_innerNonZeros)
-        memset(m_innerNonZeros, 0, (m_outerSize)*sizeof(Index));
-    }
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  /** Preallocates \a reserveSize[\c j] non zeros for each column (resp. row) \c j.
+    *
+    * This function turns the matrix in non-compressed mode.
+    *
+    * The type \c SizesType must expose the following interface:
+      \code
+      typedef value_type;
+      const value_type& operator[](i) const;
+      \endcode
+    * for \c i in the [0,this->outerSize()[ range.
+    * Typical choices include std::vector<int>, Eigen::VectorXi, Eigen::VectorXi::Constant, etc.
+    */
+  template <class SizesType>
+  inline void reserve(const SizesType& reserveSizes);
+#else
+  template <class SizesType>
+  inline void reserve(const SizesType& reserveSizes,
+                      const typename SizesType::value_type& enableif = typename SizesType::value_type()) {
+    EIGEN_UNUSED_VARIABLE(enableif);
+    reserveInnerVectors(reserveSizes);
+  }
+#endif  // EIGEN_PARSED_BY_DOXYGEN
+ protected:
+  template <class SizesType>
+  inline void reserveInnerVectors(const SizesType& reserveSizes) {
+    if (isCompressed()) {
+      Index totalReserveSize = 0;
+      for (Index j = 0; j < m_outerSize; ++j) totalReserveSize += internal::convert_index<Index>(reserveSizes[j]);
+
+      // if reserveSizes is empty, don't do anything!
+      if (totalReserveSize == 0) return;
+
+      // turn the matrix into non-compressed mode
+      m_innerNonZeros = internal::conditional_aligned_new_auto<StorageIndex, true>(m_outerSize);
+
+      // temporarily use m_innerSizes to hold the new starting points.
+      StorageIndex* newOuterIndex = m_innerNonZeros;
+
+      Index count = 0;
+      for (Index j = 0; j < m_outerSize; ++j) {
+        newOuterIndex[j] = internal::convert_index<StorageIndex>(count);
+        Index reserveSize = internal::convert_index<Index>(reserveSizes[j]);
+        count += reserveSize + internal::convert_index<Index>(m_outerIndex[j + 1] - m_outerIndex[j]);
+      }
 
-    /** \returns the number of non zero coefficients */
-    inline Index nonZeros() const
-    {
-      if(m_innerNonZeros)
-        return innerNonZeros().sum();
-      return static_cast<Index>(m_data.size());
-    }
+      m_data.reserve(totalReserveSize);
+      StorageIndex previousOuterIndex = m_outerIndex[m_outerSize];
+      for (Index j = m_outerSize - 1; j >= 0; --j) {
+        StorageIndex innerNNZ = previousOuterIndex - m_outerIndex[j];
+        StorageIndex begin = m_outerIndex[j];
+        StorageIndex end = begin + innerNNZ;
+        StorageIndex target = newOuterIndex[j];
+        internal::smart_memmove(innerIndexPtr() + begin, innerIndexPtr() + end, innerIndexPtr() + target);
+        internal::smart_memmove(valuePtr() + begin, valuePtr() + end, valuePtr() + target);
+        previousOuterIndex = m_outerIndex[j];
+        m_outerIndex[j] = newOuterIndex[j];
+        m_innerNonZeros[j] = innerNNZ;
+      }
+      if (m_outerSize > 0)
+        m_outerIndex[m_outerSize] = m_outerIndex[m_outerSize - 1] + m_innerNonZeros[m_outerSize - 1] +
+                                    internal::convert_index<StorageIndex>(reserveSizes[m_outerSize - 1]);
 
-    /** Preallocates \a reserveSize non zeros.
-      *
-      * Precondition: the matrix must be in compressed mode. */
-    inline void reserve(Index reserveSize)
-    {
-      eigen_assert(isCompressed() && "This function does not make sense in non compressed mode.");
-      m_data.reserve(reserveSize);
-    }
-    
-    #ifdef EIGEN_PARSED_BY_DOXYGEN
-    /** Preallocates \a reserveSize[\c j] non zeros for each column (resp. row) \c j.
-      *
-      * This function turns the matrix in non-compressed mode */
-    template<class SizesType>
-    inline void reserve(const SizesType& reserveSizes);
-    #else
-    template<class SizesType>
-    inline void reserve(const SizesType& reserveSizes, const typename SizesType::value_type& enableif = typename SizesType::value_type())
-    {
-      EIGEN_UNUSED_VARIABLE(enableif);
-      reserveInnerVectors(reserveSizes);
-    }
-    template<class SizesType>
-    inline void reserve(const SizesType& reserveSizes, const typename SizesType::Scalar& enableif =
-    #if (!defined(_MSC_VER)) || (_MSC_VER>=1500) // MSVC 2005 fails to compile with this typename
-        typename
-    #endif
-        SizesType::Scalar())
-    {
-      EIGEN_UNUSED_VARIABLE(enableif);
-      reserveInnerVectors(reserveSizes);
-    }
-    #endif // EIGEN_PARSED_BY_DOXYGEN
-  protected:
-    template<class SizesType>
-    inline void reserveInnerVectors(const SizesType& reserveSizes)
-    {
-      if(isCompressed())
-      {
-        std::size_t totalReserveSize = 0;
-        // turn the matrix into non-compressed mode
-        m_innerNonZeros = static_cast<Index*>(std::malloc(m_outerSize * sizeof(Index)));
-        if (!m_innerNonZeros) internal::throw_std_bad_alloc();
-        
-        // temporarily use m_innerSizes to hold the new starting points.
-        Index* newOuterIndex = m_innerNonZeros;
-        
-        Index count = 0;
-        for(Index j=0; j<m_outerSize; ++j)
-        {
-          newOuterIndex[j] = count;
-          count += reserveSizes[j] + (m_outerIndex[j+1]-m_outerIndex[j]);
-          totalReserveSize += reserveSizes[j];
-        }
-        m_data.reserve(totalReserveSize);
-        Index previousOuterIndex = m_outerIndex[m_outerSize];
-        for(Index j=m_outerSize-1; j>=0; --j)
-        {
-          Index innerNNZ = previousOuterIndex - m_outerIndex[j];
-          for(Index i=innerNNZ-1; i>=0; --i)
-          {
-            m_data.index(newOuterIndex[j]+i) = m_data.index(m_outerIndex[j]+i);
-            m_data.value(newOuterIndex[j]+i) = m_data.value(m_outerIndex[j]+i);
-          }
-          previousOuterIndex = m_outerIndex[j];
-          m_outerIndex[j] = newOuterIndex[j];
-          m_innerNonZeros[j] = innerNNZ;
-        }
-        m_outerIndex[m_outerSize] = m_outerIndex[m_outerSize-1] + m_innerNonZeros[m_outerSize-1] + reserveSizes[m_outerSize-1];
-        
-        m_data.resize(m_outerIndex[m_outerSize]);
+      m_data.resize(m_outerIndex[m_outerSize]);
+    } else {
+      StorageIndex* newOuterIndex = internal::conditional_aligned_new_auto<StorageIndex, true>(m_outerSize + 1);
+
+      Index count = 0;
+      for (Index j = 0; j < m_outerSize; ++j) {
+        newOuterIndex[j] = internal::convert_index<StorageIndex>(count);
+        Index alreadyReserved =
+            internal::convert_index<Index>(m_outerIndex[j + 1] - m_outerIndex[j] - m_innerNonZeros[j]);
+        Index reserveSize = internal::convert_index<Index>(reserveSizes[j]);
+        Index toReserve = numext::maxi(reserveSize, alreadyReserved);
+        count += toReserve + internal::convert_index<Index>(m_innerNonZeros[j]);
       }
-      else
-      {
-        Index* newOuterIndex = static_cast<Index*>(std::malloc((m_outerSize+1)*sizeof(Index)));
-        if (!newOuterIndex) internal::throw_std_bad_alloc();
-        
-        Index count = 0;
-        for(Index j=0; j<m_outerSize; ++j)
-        {
-          newOuterIndex[j] = count;
-          Index alreadyReserved = (m_outerIndex[j+1]-m_outerIndex[j]) - m_innerNonZeros[j];
-          Index toReserve = std::max<Index>(reserveSizes[j], alreadyReserved);
-          count += toReserve + m_innerNonZeros[j];
-        }
-        newOuterIndex[m_outerSize] = count;
-        
-        m_data.resize(count);
-        for(Index j=m_outerSize-1; j>=0; --j)
-        {
-          Index offset = newOuterIndex[j] - m_outerIndex[j];
-          if(offset>0)
-          {
-            Index innerNNZ = m_innerNonZeros[j];
-            for(Index i=innerNNZ-1; i>=0; --i)
-            {
-              m_data.index(newOuterIndex[j]+i) = m_data.index(m_outerIndex[j]+i);
-              m_data.value(newOuterIndex[j]+i) = m_data.value(m_outerIndex[j]+i);
-            }
-          }
-        }
-        
-        std::swap(m_outerIndex, newOuterIndex);
-        std::free(newOuterIndex);
+      newOuterIndex[m_outerSize] = internal::convert_index<StorageIndex>(count);
+
+      m_data.resize(count);
+      for (Index j = m_outerSize - 1; j >= 0; --j) {
+        StorageIndex innerNNZ = m_innerNonZeros[j];
+        StorageIndex begin = m_outerIndex[j];
+        StorageIndex target = newOuterIndex[j];
+        m_data.moveChunk(begin, target, innerNNZ);
       }
-      
-    }
-  public:
-
-    //--- low level purely coherent filling ---
-
-    /** \internal
-      * \returns a reference to the non zero coefficient at position \a row, \a col assuming that:
-      * - the nonzero does not already exist
-      * - the new coefficient is the last one according to the storage order
-      *
-      * Before filling a given inner vector you must call the statVec(Index) function.
-      *
-      * After an insertion session, you should call the finalize() function.
-      *
-      * \sa insert, insertBackByOuterInner, startVec */
-    inline Scalar& insertBack(Index row, Index col)
-    {
-      return insertBackByOuterInner(IsRowMajor?row:col, IsRowMajor?col:row);
-    }
 
-    /** \internal
-      * \sa insertBack, startVec */
-    inline Scalar& insertBackByOuterInner(Index outer, Index inner)
-    {
-      eigen_assert(size_t(m_outerIndex[outer+1]) == m_data.size() && "Invalid ordered insertion (invalid outer index)");
-      eigen_assert( (m_outerIndex[outer+1]-m_outerIndex[outer]==0 || m_data.index(m_data.size()-1)<inner) && "Invalid ordered insertion (invalid inner index)");
-      Index p = m_outerIndex[outer+1];
-      ++m_outerIndex[outer+1];
-      m_data.append(0, inner);
-      return m_data.value(p);
+      std::swap(m_outerIndex, newOuterIndex);
+      internal::conditional_aligned_delete_auto<StorageIndex, true>(newOuterIndex, m_outerSize + 1);
     }
+  }
 
-    /** \internal
-      * \warning use it only if you know what you are doing */
-    inline Scalar& insertBackByOuterInnerUnordered(Index outer, Index inner)
-    {
-      Index p = m_outerIndex[outer+1];
-      ++m_outerIndex[outer+1];
-      m_data.append(0, inner);
-      return m_data.value(p);
-    }
+ public:
+  //--- low level purely coherent filling ---
+
+  /** \internal
+   * \returns a reference to the non zero coefficient at position \a row, \a col assuming that:
+   * - the nonzero does not already exist
+   * - the new coefficient is the last one according to the storage order
+   *
+   * Before filling a given inner vector you must call the statVec(Index) function.
+   *
+   * After an insertion session, you should call the finalize() function.
+   *
+   * \sa insert, insertBackByOuterInner, startVec */
+  inline Scalar& insertBack(Index row, Index col) {
+    return insertBackByOuterInner(IsRowMajor ? row : col, IsRowMajor ? col : row);
+  }
 
-    /** \internal
-      * \sa insertBack, insertBackByOuterInner */
-    inline void startVec(Index outer)
-    {
-      eigen_assert(m_outerIndex[outer]==Index(m_data.size()) && "You must call startVec for each inner vector sequentially");
-      eigen_assert(m_outerIndex[outer+1]==0 && "You must call startVec for each inner vector sequentially");
-      m_outerIndex[outer+1] = m_outerIndex[outer];
-    }
+  /** \internal
+   * \sa insertBack, startVec */
+  inline Scalar& insertBackByOuterInner(Index outer, Index inner) {
+    eigen_assert(Index(m_outerIndex[outer + 1]) == m_data.size() && "Invalid ordered insertion (invalid outer index)");
+    eigen_assert((m_outerIndex[outer + 1] - m_outerIndex[outer] == 0 || m_data.index(m_data.size() - 1) < inner) &&
+                 "Invalid ordered insertion (invalid inner index)");
+    StorageIndex p = m_outerIndex[outer + 1];
+    ++m_outerIndex[outer + 1];
+    m_data.append(Scalar(0), inner);
+    return m_data.value(p);
+  }
 
-    /** \internal
-      * Must be called after inserting a set of non zero entries using the low level compressed API.
-      */
-    inline void finalize()
-    {
-      if(isCompressed())
-      {
-        Index size = static_cast<Index>(m_data.size());
-        Index i = m_outerSize;
-        // find the last filled column
-        while (i>=0 && m_outerIndex[i]==0)
-          --i;
+  /** \internal
+   * \warning use it only if you know what you are doing */
+  inline Scalar& insertBackByOuterInnerUnordered(Index outer, Index inner) {
+    StorageIndex p = m_outerIndex[outer + 1];
+    ++m_outerIndex[outer + 1];
+    m_data.append(Scalar(0), inner);
+    return m_data.value(p);
+  }
+
+  /** \internal
+   * \sa insertBack, insertBackByOuterInner */
+  inline void startVec(Index outer) {
+    eigen_assert(m_outerIndex[outer] == Index(m_data.size()) &&
+                 "You must call startVec for each inner vector sequentially");
+    eigen_assert(m_outerIndex[outer + 1] == 0 && "You must call startVec for each inner vector sequentially");
+    m_outerIndex[outer + 1] = m_outerIndex[outer];
+  }
+
+  /** \internal
+   * Must be called after inserting a set of non zero entries using the low level compressed API.
+   */
+  inline void finalize() {
+    if (isCompressed()) {
+      StorageIndex size = internal::convert_index<StorageIndex>(m_data.size());
+      Index i = m_outerSize;
+      // find the last filled column
+      while (i >= 0 && m_outerIndex[i] == 0) --i;
+      ++i;
+      while (i <= m_outerSize) {
+        m_outerIndex[i] = size;
         ++i;
-        while (i<=m_outerSize)
-        {
-          m_outerIndex[i] = size;
-          ++i;
-        }
       }
     }
+  }
 
-    //---
+  // remove outer vectors j, j+1 ... j+num-1 and resize the matrix
+  void removeOuterVectors(Index j, Index num = 1) {
+    eigen_assert(num >= 0 && j >= 0 && j + num <= m_outerSize && "Invalid parameters");
+
+    const Index newRows = IsRowMajor ? m_outerSize - num : rows();
+    const Index newCols = IsRowMajor ? cols() : m_outerSize - num;
+
+    const Index begin = j + num;
+    const Index end = m_outerSize;
+    const Index target = j;
+
+    // if the removed vectors are not empty, uncompress the matrix
+    if (m_outerIndex[j + num] > m_outerIndex[j]) uncompress();
+
+    // shift m_outerIndex and m_innerNonZeros [num] to the left
+    internal::smart_memmove(m_outerIndex + begin, m_outerIndex + end + 1, m_outerIndex + target);
+    if (!isCompressed())
+      internal::smart_memmove(m_innerNonZeros + begin, m_innerNonZeros + end, m_innerNonZeros + target);
+
+    // if m_outerIndex[0] > 0, shift the data within the first vector while it is easy to do so
+    if (m_outerIndex[0] > StorageIndex(0)) {
+      uncompress();
+      const Index from = internal::convert_index<Index>(m_outerIndex[0]);
+      const Index to = Index(0);
+      const Index chunkSize = internal::convert_index<Index>(m_innerNonZeros[0]);
+      m_data.moveChunk(from, to, chunkSize);
+      m_outerIndex[0] = StorageIndex(0);
+    }
 
-    template<typename InputIterators>
-    void setFromTriplets(const InputIterators& begin, const InputIterators& end);
+    // truncate the matrix to the smaller size
+    conservativeResize(newRows, newCols);
+  }
 
-    void sumupDuplicates();
+  // insert empty outer vectors at indices j, j+1 ... j+num-1 and resize the matrix
+  void insertEmptyOuterVectors(Index j, Index num = 1) {
+    using std::fill_n;
+    eigen_assert(num >= 0 && j >= 0 && j < m_outerSize && "Invalid parameters");
 
-    //---
-    
-    /** \internal
-      * same as insert(Index,Index) except that the indices are given relative to the storage order */
-    Scalar& insertByOuterInner(Index j, Index i)
-    {
-      return insert(IsRowMajor ? j : i, IsRowMajor ? i : j);
+    const Index newRows = IsRowMajor ? m_outerSize + num : rows();
+    const Index newCols = IsRowMajor ? cols() : m_outerSize + num;
+
+    const Index begin = j;
+    const Index end = m_outerSize;
+    const Index target = j + num;
+
+    // expand the matrix to the larger size
+    conservativeResize(newRows, newCols);
+
+    // shift m_outerIndex and m_innerNonZeros [num] to the right
+    internal::smart_memmove(m_outerIndex + begin, m_outerIndex + end + 1, m_outerIndex + target);
+    // m_outerIndex[begin] == m_outerIndex[target], set all indices in this range to same value
+    fill_n(m_outerIndex + begin, num, m_outerIndex[begin]);
+
+    if (!isCompressed()) {
+      internal::smart_memmove(m_innerNonZeros + begin, m_innerNonZeros + end, m_innerNonZeros + target);
+      // set the nonzeros of the newly inserted vectors to 0
+      fill_n(m_innerNonZeros + begin, num, StorageIndex(0));
     }
+  }
 
-    /** Turns the matrix into the \em compressed format.
-      */
-    void makeCompressed()
-    {
-      if(isCompressed())
-        return;
-      
-      Index oldStart = m_outerIndex[1];
-      m_outerIndex[1] = m_innerNonZeros[0];
-      for(Index j=1; j<m_outerSize; ++j)
-      {
-        Index nextOldStart = m_outerIndex[j+1];
-        Index offset = oldStart - m_outerIndex[j];
-        if(offset>0)
-        {
-          for(Index k=0; k<m_innerNonZeros[j]; ++k)
-          {
-            m_data.index(m_outerIndex[j]+k) = m_data.index(oldStart+k);
-            m_data.value(m_outerIndex[j]+k) = m_data.value(oldStart+k);
-          }
-        }
-        m_outerIndex[j+1] = m_outerIndex[j] + m_innerNonZeros[j];
-        oldStart = nextOldStart;
+  template <typename InputIterators>
+  void setFromTriplets(const InputIterators& begin, const InputIterators& end);
+
+  template <typename InputIterators, typename DupFunctor>
+  void setFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func);
+
+  template <typename Derived, typename DupFunctor>
+  void collapseDuplicates(DenseBase<Derived>& wi, DupFunctor dup_func = DupFunctor());
+
+  template <typename InputIterators>
+  void setFromSortedTriplets(const InputIterators& begin, const InputIterators& end);
+
+  template <typename InputIterators, typename DupFunctor>
+  void setFromSortedTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func);
+
+  template <typename InputIterators>
+  void insertFromTriplets(const InputIterators& begin, const InputIterators& end);
+
+  template <typename InputIterators, typename DupFunctor>
+  void insertFromTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func);
+
+  template <typename InputIterators>
+  void insertFromSortedTriplets(const InputIterators& begin, const InputIterators& end);
+
+  template <typename InputIterators, typename DupFunctor>
+  void insertFromSortedTriplets(const InputIterators& begin, const InputIterators& end, DupFunctor dup_func);
+
+  //---
+
+  /** \internal
+   * same as insert(Index,Index) except that the indices are given relative to the storage order */
+  Scalar& insertByOuterInner(Index j, Index i) {
+    eigen_assert(j >= 0 && j < m_outerSize && "invalid outer index");
+    eigen_assert(i >= 0 && i < m_innerSize && "invalid inner index");
+    Index start = m_outerIndex[j];
+    Index end = isCompressed() ? m_outerIndex[j + 1] : start + m_innerNonZeros[j];
+    Index dst = start == end ? end : m_data.searchLowerIndex(start, end, i);
+    if (dst == end) {
+      Index capacity = m_outerIndex[j + 1] - end;
+      if (capacity > 0) {
+        // implies uncompressed: push to back of vector
+        m_innerNonZeros[j]++;
+        m_data.index(end) = StorageIndex(i);
+        m_data.value(end) = Scalar(0);
+        return m_data.value(end);
       }
-      std::free(m_innerNonZeros);
-      m_innerNonZeros = 0;
-      m_data.resize(m_outerIndex[m_outerSize]);
-      m_data.squeeze();
     }
+    eigen_assert((dst == end || m_data.index(dst) != i) &&
+                 "you cannot insert an element that already exists, you must call coeffRef to this end");
+    return insertAtByOuterInner(j, i, dst);
+  }
 
-    /** Turns the matrix into the uncompressed mode */
-    void uncompress()
-    {
-      if(m_innerNonZeros != 0)
-        return; 
-      m_innerNonZeros = static_cast<Index*>(std::malloc(m_outerSize * sizeof(Index)));
-      for (Index i = 0; i < m_outerSize; i++)
-      {
-        m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i]; 
+  /** Turns the matrix into the \em compressed format.
+   */
+  void makeCompressed() {
+    if (isCompressed()) return;
+
+    eigen_internal_assert(m_outerIndex != 0 && m_outerSize > 0);
+
+    StorageIndex start = m_outerIndex[1];
+    m_outerIndex[1] = m_innerNonZeros[0];
+    // try to move fewer, larger contiguous chunks
+    Index copyStart = start;
+    Index copyTarget = m_innerNonZeros[0];
+    for (Index j = 1; j < m_outerSize; j++) {
+      StorageIndex end = start + m_innerNonZeros[j];
+      StorageIndex nextStart = m_outerIndex[j + 1];
+      // dont forget to move the last chunk!
+      bool breakUpCopy = (end != nextStart) || (j == m_outerSize - 1);
+      if (breakUpCopy) {
+        Index chunkSize = end - copyStart;
+        if (chunkSize > 0) m_data.moveChunk(copyStart, copyTarget, chunkSize);
+        copyStart = nextStart;
+        copyTarget += chunkSize;
       }
+      start = nextStart;
+      m_outerIndex[j + 1] = m_outerIndex[j] + m_innerNonZeros[j];
     }
-    
-    /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerence \a epsilon */
-    void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
-    {
-      prune(default_prunning_func(reference,epsilon));
+    m_data.resize(m_outerIndex[m_outerSize]);
+
+    // release as much memory as possible
+    internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+    m_innerNonZeros = 0;
+    m_data.squeeze();
+  }
+
+  /** Turns the matrix into the uncompressed mode */
+  void uncompress() {
+    if (!isCompressed()) return;
+    m_innerNonZeros = internal::conditional_aligned_new_auto<StorageIndex, true>(m_outerSize);
+    if (m_outerIndex[m_outerSize] == 0) {
+      using std::fill_n;
+      fill_n(m_innerNonZeros, m_outerSize, StorageIndex(0));
+    } else {
+      for (Index j = 0; j < m_outerSize; j++) m_innerNonZeros[j] = m_outerIndex[j + 1] - m_outerIndex[j];
     }
-    
-    /** Turns the matrix into compressed format, and suppresses all nonzeros which do not satisfy the predicate \a keep.
-      * The functor type \a KeepFunc must implement the following function:
-      * \code
-      * bool operator() (const Index& row, const Index& col, const Scalar& value) const;
-      * \endcode
-      * \sa prune(Scalar,RealScalar)
-      */
-    template<typename KeepFunc>
-    void prune(const KeepFunc& keep = KeepFunc())
-    {
-      // TODO optimize the uncompressed mode to avoid moving and allocating the data twice
-      // TODO also implement a unit test
-      makeCompressed();
-
-      Index k = 0;
-      for(Index j=0; j<m_outerSize; ++j)
-      {
-        Index previousStart = m_outerIndex[j];
+  }
+
+  /** Suppresses all nonzeros which are \b much \b smaller \b than \a reference under the tolerance \a epsilon */
+  void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision()) {
+    prune(default_prunning_func(reference, epsilon));
+  }
+
+  /** Turns the matrix into compressed format, and suppresses all nonzeros which do not satisfy the predicate \a keep.
+   * The functor type \a KeepFunc must implement the following function:
+   * \code
+   * bool operator() (const Index& row, const Index& col, const Scalar& value) const;
+   * \endcode
+   * \sa prune(Scalar,RealScalar)
+   */
+  template <typename KeepFunc>
+  void prune(const KeepFunc& keep = KeepFunc()) {
+    StorageIndex k = 0;
+    for (Index j = 0; j < m_outerSize; ++j) {
+      StorageIndex previousStart = m_outerIndex[j];
+      if (isCompressed())
         m_outerIndex[j] = k;
-        Index end = m_outerIndex[j+1];
-        for(Index i=previousStart; i<end; ++i)
-        {
-          if(keep(IsRowMajor?j:m_data.index(i), IsRowMajor?m_data.index(i):j, m_data.value(i)))
-          {
-            m_data.value(k) = m_data.value(i);
-            m_data.index(k) = m_data.index(i);
-            ++k;
-          }
-        }
+      else
+        k = m_outerIndex[j];
+      StorageIndex end = isCompressed() ? m_outerIndex[j + 1] : previousStart + m_innerNonZeros[j];
+      for (StorageIndex i = previousStart; i < end; ++i) {
+        StorageIndex row = IsRowMajor ? StorageIndex(j) : m_data.index(i);
+        StorageIndex col = IsRowMajor ? m_data.index(i) : StorageIndex(j);
+        bool keepEntry = keep(row, col, m_data.value(i));
+        if (keepEntry) {
+          m_data.value(k) = m_data.value(i);
+          m_data.index(k) = m_data.index(i);
+          ++k;
+        } else if (!isCompressed())
+          m_innerNonZeros[j]--;
       }
+    }
+    if (isCompressed()) {
       m_outerIndex[m_outerSize] = k;
-      m_data.resize(k,0);
+      m_data.resize(k, 0);
     }
+  }
 
-    /** Resizes the matrix to a \a rows x \a cols matrix leaving old values untouched.
-      * \sa resizeNonZeros(Index), reserve(), setZero()
-      */
-    void conservativeResize(Index rows, Index cols) 
-    {
-      // No change
-      if (this->rows() == rows && this->cols() == cols) return;
-      
-      // If one dimension is null, then there is nothing to be preserved
-      if(rows==0 || cols==0) return resize(rows,cols);
-
-      Index innerChange = IsRowMajor ? cols - this->cols() : rows - this->rows();
-      Index outerChange = IsRowMajor ? rows - this->rows() : cols - this->cols();
-      Index newInnerSize = IsRowMajor ? cols : rows;
-
-      // Deals with inner non zeros
-      if (m_innerNonZeros)
-      {
-        // Resize m_innerNonZeros
-        Index *newInnerNonZeros = static_cast<Index*>(std::realloc(m_innerNonZeros, (m_outerSize + outerChange) * sizeof(Index)));
-        if (!newInnerNonZeros) internal::throw_std_bad_alloc();
-        m_innerNonZeros = newInnerNonZeros;
-        
-        for(Index i=m_outerSize; i<m_outerSize+outerChange; i++)          
-          m_innerNonZeros[i] = 0;
-      } 
-      else if (innerChange < 0) 
-      {
-        // Inner size decreased: allocate a new m_innerNonZeros
-        m_innerNonZeros = static_cast<Index*>(std::malloc((m_outerSize+outerChange+1) * sizeof(Index)));
-        if (!m_innerNonZeros) internal::throw_std_bad_alloc();
-        for(Index i = 0; i < m_outerSize; i++)
-          m_innerNonZeros[i] = m_outerIndex[i+1] - m_outerIndex[i];
+  /** Resizes the matrix to a \a rows x \a cols matrix leaving old values untouched.
+   *
+   * If the sizes of the matrix are decreased, then the matrix is turned to \b uncompressed-mode
+   * and the storage of the out of bounds coefficients is kept and reserved.
+   * Call makeCompressed() to pack the entries and squeeze extra memory.
+   *
+   * \sa reserve(), setZero(), makeCompressed()
+   */
+  void conservativeResize(Index rows, Index cols) {
+    // If one dimension is null, then there is nothing to be preserved
+    if (rows == 0 || cols == 0) return resize(rows, cols);
+
+    Index newOuterSize = IsRowMajor ? rows : cols;
+    Index newInnerSize = IsRowMajor ? cols : rows;
+
+    Index innerChange = newInnerSize - m_innerSize;
+    Index outerChange = newOuterSize - m_outerSize;
+
+    if (outerChange != 0) {
+      m_outerIndex = internal::conditional_aligned_realloc_new_auto<StorageIndex, true>(m_outerIndex, newOuterSize + 1,
+                                                                                        m_outerSize + 1);
+
+      if (!isCompressed())
+        m_innerNonZeros = internal::conditional_aligned_realloc_new_auto<StorageIndex, true>(m_innerNonZeros,
+                                                                                             newOuterSize, m_outerSize);
+
+      if (outerChange > 0) {
+        StorageIndex lastIdx = m_outerSize == 0 ? StorageIndex(0) : m_outerIndex[m_outerSize];
+        using std::fill_n;
+        fill_n(m_outerIndex + m_outerSize, outerChange + 1, lastIdx);
+
+        if (!isCompressed()) fill_n(m_innerNonZeros + m_outerSize, outerChange, StorageIndex(0));
       }
-      
-      // Change the m_innerNonZeros in case of a decrease of inner size
-      if (m_innerNonZeros && innerChange < 0)
-      {
-        for(Index i = 0; i < m_outerSize + (std::min)(outerChange, Index(0)); i++)
-        {
-          Index &n = m_innerNonZeros[i];
-          Index start = m_outerIndex[i];
-          while (n > 0 && m_data.index(start+n-1) >= newInnerSize) --n; 
-        }
-      }
-      
-      m_innerSize = newInnerSize;
-
-      // Re-allocate outer index structure if necessary
-      if (outerChange == 0)
-        return;
-          
-      Index *newOuterIndex = static_cast<Index*>(std::realloc(m_outerIndex, (m_outerSize + outerChange + 1) * sizeof(Index)));
-      if (!newOuterIndex) internal::throw_std_bad_alloc();
-      m_outerIndex = newOuterIndex;
-      if (outerChange > 0)
-      {
-        Index last = m_outerSize == 0 ? 0 : m_outerIndex[m_outerSize];
-        for(Index i=m_outerSize; i<m_outerSize+outerChange+1; i++)          
-          m_outerIndex[i] = last; 
-      }
-      m_outerSize += outerChange;
     }
-    
-    /** Resizes the matrix to a \a rows x \a cols matrix and initializes it to zero.
-      * \sa resizeNonZeros(Index), reserve(), setZero()
-      */
-    void resize(Index rows, Index cols)
-    {
-      const Index outerSize = IsRowMajor ? rows : cols;
-      m_innerSize = IsRowMajor ? cols : rows;
-      m_data.clear();
-      if (m_outerSize != outerSize || m_outerSize==0)
-      {
-        std::free(m_outerIndex);
-        m_outerIndex = static_cast<Index*>(std::malloc((outerSize + 1) * sizeof(Index)));
-        if (!m_outerIndex) internal::throw_std_bad_alloc();
-        
-        m_outerSize = outerSize;
-      }
-      if(m_innerNonZeros)
-      {
-        std::free(m_innerNonZeros);
-        m_innerNonZeros = 0;
+    m_outerSize = newOuterSize;
+
+    if (innerChange < 0) {
+      for (Index j = 0; j < m_outerSize; j++) {
+        Index start = m_outerIndex[j];
+        Index end = isCompressed() ? m_outerIndex[j + 1] : start + m_innerNonZeros[j];
+        Index lb = m_data.searchLowerIndex(start, end, newInnerSize);
+        if (lb != end) {
+          uncompress();
+          m_innerNonZeros[j] = StorageIndex(lb - start);
+        }
       }
-      memset(m_outerIndex, 0, (m_outerSize+1)*sizeof(Index));
     }
+    m_innerSize = newInnerSize;
 
-    /** \internal
-      * Resize the nonzero vector to \a size */
-    void resizeNonZeros(Index size)
-    {
-      // TODO remove this function
-      m_data.resize(size);
+    Index newSize = m_outerIndex[m_outerSize];
+    eigen_assert(newSize <= m_data.size());
+    m_data.resize(newSize);
+  }
+
+  /** Resizes the matrix to a \a rows x \a cols matrix and initializes it to zero.
+   *
+   * This function does not free the currently allocated memory. To release as much as memory as possible,
+   * call \code mat.data().squeeze(); \endcode after resizing it.
+   *
+   * \sa reserve(), setZero()
+   */
+  void resize(Index rows, Index cols) {
+    const Index outerSize = IsRowMajor ? rows : cols;
+    m_innerSize = IsRowMajor ? cols : rows;
+    m_data.clear();
+
+    if ((m_outerIndex == 0) || (m_outerSize != outerSize)) {
+      m_outerIndex = internal::conditional_aligned_realloc_new_auto<StorageIndex, true>(m_outerIndex, outerSize + 1,
+                                                                                        m_outerSize + 1);
+      m_outerSize = outerSize;
     }
 
-    /** \returns a const expression of the diagonal coefficients */
-    const Diagonal<const SparseMatrix> diagonal() const { return *this; }
+    internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+    m_innerNonZeros = 0;
 
-    /** Default constructor yielding an empty \c 0 \c x \c 0 matrix */
-    inline SparseMatrix()
-      : m_outerSize(-1), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
-    {
-      check_template_parameters();
-      resize(0, 0);
-    }
+    using std::fill_n;
+    fill_n(m_outerIndex, m_outerSize + 1, StorageIndex(0));
+  }
 
-    /** Constructs a \a rows \c x \a cols empty matrix */
-    inline SparseMatrix(Index rows, Index cols)
-      : m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
-    {
-      check_template_parameters();
-      resize(rows, cols);
-    }
+  /** \internal
+   * Resize the nonzero vector to \a size */
+  void resizeNonZeros(Index size) { m_data.resize(size); }
 
-    /** Constructs a sparse matrix from the sparse expression \a other */
-    template<typename OtherDerived>
-    inline SparseMatrix(const SparseMatrixBase<OtherDerived>& other)
-      : m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
-    {
-      EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
+  /** \returns a const expression of the diagonal coefficients. */
+  const ConstDiagonalReturnType diagonal() const { return ConstDiagonalReturnType(*this); }
+
+  /** \returns a read-write expression of the diagonal coefficients.
+   * \warning If the diagonal entries are written, then all diagonal
+   * entries \b must already exist, otherwise an assertion will be raised.
+   */
+  DiagonalReturnType diagonal() { return DiagonalReturnType(*this); }
+
+  /** Default constructor yielding an empty \c 0 \c x \c 0 matrix */
+  inline SparseMatrix() : m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0) { resize(0, 0); }
+
+  /** Constructs a \a rows \c x \a cols empty matrix */
+  inline SparseMatrix(Index rows, Index cols) : m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0) {
+    resize(rows, cols);
+  }
+
+  /** Constructs a sparse matrix from the sparse expression \a other */
+  template <typename OtherDerived>
+  inline SparseMatrix(const SparseMatrixBase<OtherDerived>& other)
+      : m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0) {
+    EIGEN_STATIC_ASSERT(
+        (internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
         YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-      check_template_parameters();
+    const bool needToTranspose = (Flags & RowMajorBit) != (internal::evaluator<OtherDerived>::Flags & RowMajorBit);
+    if (needToTranspose)
       *this = other.derived();
+    else {
+#ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+      EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+#endif
+      internal::call_assignment_no_alias(*this, other.derived());
     }
-    
-    /** Constructs a sparse matrix from the sparse selfadjoint view \a other */
-    template<typename OtherDerived, unsigned int UpLo>
-    inline SparseMatrix(const SparseSelfAdjointView<OtherDerived, UpLo>& other)
-      : m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
-    {
-      check_template_parameters();
-      *this = other;
-    }
+  }
 
-    /** Copy constructor (it performs a deep copy) */
-    inline SparseMatrix(const SparseMatrix& other)
-      : Base(), m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
-    {
-      check_template_parameters();
-      *this = other.derived();
-    }
+  /** Constructs a sparse matrix from the sparse selfadjoint view \a other */
+  template <typename OtherDerived, unsigned int UpLo>
+  inline SparseMatrix(const SparseSelfAdjointView<OtherDerived, UpLo>& other)
+      : m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0) {
+    Base::operator=(other);
+  }
 
-    /** \brief Copy constructor with in-place evaluation */
-    template<typename OtherDerived>
-    SparseMatrix(const ReturnByValue<OtherDerived>& other)
-      : Base(), m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0)
-    {
-      check_template_parameters();
-      initAssignment(other);
-      other.evalTo(*this);
-    }
+  /** Move constructor */
+  inline SparseMatrix(SparseMatrix&& other) : SparseMatrix() { this->swap(other); }
 
-    /** Swaps the content of two sparse matrices of the same type.
-      * This is a fast operation that simply swaps the underlying pointers and parameters. */
-    inline void swap(SparseMatrix& other)
-    {
-      //EIGEN_DBG_SPARSE(std::cout << "SparseMatrix:: swap\n");
-      std::swap(m_outerIndex, other.m_outerIndex);
-      std::swap(m_innerSize, other.m_innerSize);
-      std::swap(m_outerSize, other.m_outerSize);
-      std::swap(m_innerNonZeros, other.m_innerNonZeros);
-      m_data.swap(other.m_data);
-    }
+  template <typename OtherDerived>
+  inline SparseMatrix(SparseCompressedBase<OtherDerived>&& other) : SparseMatrix() {
+    *this = other.derived().markAsRValue();
+  }
 
-    /** Sets *this to the identity matrix.
-      * This function also turns the matrix into compressed mode, and drop any reserved memory. */
-    inline void setIdentity()
-    {
-      eigen_assert(rows() == cols() && "ONLY FOR SQUARED MATRICES");
-      this->m_data.resize(rows());
-      Eigen::Map<Matrix<Index, Dynamic, 1> >(&this->m_data.index(0), rows()).setLinSpaced(0, rows()-1);
-      Eigen::Map<Matrix<Scalar, Dynamic, 1> >(&this->m_data.value(0), rows()).setOnes();
-      Eigen::Map<Matrix<Index, Dynamic, 1> >(this->m_outerIndex, rows()+1).setLinSpaced(0, rows());
-      std::free(m_innerNonZeros);
-      m_innerNonZeros = 0;
-    }
-    inline SparseMatrix& operator=(const SparseMatrix& other)
-    {
-      if (other.isRValue())
-      {
-        swap(other.const_cast_derived());
-      }
-      else if(this!=&other)
-      {
-        initAssignment(other);
-        if(other.isCompressed())
-        {
-          memcpy(m_outerIndex, other.m_outerIndex, (m_outerSize+1)*sizeof(Index));
-          m_data = other.m_data;
-        }
-        else
-        {
-          Base::operator=(other);
-        }
-      }
-      return *this;
-    }
+  /** Copy constructor (it performs a deep copy) */
+  inline SparseMatrix(const SparseMatrix& other)
+      : Base(), m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0) {
+    *this = other.derived();
+  }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename Lhs, typename Rhs>
-    inline SparseMatrix& operator=(const SparseSparseProduct<Lhs,Rhs>& product)
-    { return Base::operator=(product); }
-    
-    template<typename OtherDerived>
-    inline SparseMatrix& operator=(const ReturnByValue<OtherDerived>& other)
-    {
+  /** \brief Copy constructor with in-place evaluation */
+  template <typename OtherDerived>
+  SparseMatrix(const ReturnByValue<OtherDerived>& other)
+      : Base(), m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0) {
+    initAssignment(other);
+    other.evalTo(*this);
+  }
+
+  /** \brief Copy constructor with in-place evaluation */
+  template <typename OtherDerived>
+  explicit SparseMatrix(const DiagonalBase<OtherDerived>& other)
+      : Base(), m_outerSize(0), m_innerSize(0), m_outerIndex(0), m_innerNonZeros(0) {
+    *this = other.derived();
+  }
+
+  /** Swaps the content of two sparse matrices of the same type.
+   * This is a fast operation that simply swaps the underlying pointers and parameters. */
+  inline void swap(SparseMatrix& other) {
+    // EIGEN_DBG_SPARSE(std::cout << "SparseMatrix:: swap\n");
+    std::swap(m_outerIndex, other.m_outerIndex);
+    std::swap(m_innerSize, other.m_innerSize);
+    std::swap(m_outerSize, other.m_outerSize);
+    std::swap(m_innerNonZeros, other.m_innerNonZeros);
+    m_data.swap(other.m_data);
+  }
+  /** Free-function swap. */
+  friend EIGEN_DEVICE_FUNC void swap(SparseMatrix& a, SparseMatrix& b) { a.swap(b); }
+
+  /** Sets *this to the identity matrix.
+   * This function also turns the matrix into compressed mode, and drop any reserved memory. */
+  inline void setIdentity() {
+    eigen_assert(m_outerSize == m_innerSize && "ONLY FOR SQUARED MATRICES");
+    internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+    m_innerNonZeros = 0;
+    m_data.resize(m_outerSize);
+    // is it necessary to squeeze?
+    m_data.squeeze();
+    std::iota(m_outerIndex, m_outerIndex + m_outerSize + 1, StorageIndex(0));
+    std::iota(innerIndexPtr(), innerIndexPtr() + m_outerSize, StorageIndex(0));
+    using std::fill_n;
+    fill_n(valuePtr(), m_outerSize, Scalar(1));
+  }
+
+  inline SparseMatrix& operator=(const SparseMatrix& other) {
+    if (other.isRValue()) {
+      swap(other.const_cast_derived());
+    } else if (this != &other) {
+#ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+      EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+#endif
       initAssignment(other);
-      return Base::operator=(other.derived());
+      if (other.isCompressed()) {
+        internal::smart_copy(other.m_outerIndex, other.m_outerIndex + m_outerSize + 1, m_outerIndex);
+        m_data = other.m_data;
+      } else {
+        Base::operator=(other);
+      }
     }
-    
-    template<typename OtherDerived>
-    inline SparseMatrix& operator=(const EigenBase<OtherDerived>& other)
-    { return Base::operator=(other.derived()); }
-    #endif
+    return *this;
+  }
+
+  inline SparseMatrix& operator=(SparseMatrix&& other) {
+    this->swap(other);
+    return *this;
+  }
+
+  template <typename OtherDerived>
+  inline SparseMatrix& operator=(const EigenBase<OtherDerived>& other) {
+    return Base::operator=(other.derived());
+  }
 
-    template<typename OtherDerived>
-    EIGEN_DONT_INLINE SparseMatrix& operator=(const SparseMatrixBase<OtherDerived>& other);
+  template <typename Lhs, typename Rhs>
+  inline SparseMatrix& operator=(const Product<Lhs, Rhs, AliasFreeProduct>& other);
 
-    friend std::ostream & operator << (std::ostream & s, const SparseMatrix& m)
-    {
-      EIGEN_DBG_SPARSE(
-        s << "Nonzero entries:\n";
-        if(m.isCompressed())
-          for (Index i=0; i<m.nonZeros(); ++i)
-            s << "(" << m.m_data.value(i) << "," << m.m_data.index(i) << ") ";
-        else
-          for (Index i=0; i<m.outerSize(); ++i)
-          {
+  template <typename OtherDerived>
+  EIGEN_DONT_INLINE SparseMatrix& operator=(const SparseMatrixBase<OtherDerived>& other);
+
+  template <typename OtherDerived>
+  inline SparseMatrix& operator=(SparseCompressedBase<OtherDerived>&& other) {
+    *this = other.derived().markAsRValue();
+    return *this;
+  }
+
+#ifndef EIGEN_NO_IO
+  friend std::ostream& operator<<(std::ostream& s, const SparseMatrix& m) {
+    EIGEN_DBG_SPARSE(
+        s << "Nonzero entries:\n"; if (m.isCompressed()) {
+          for (Index i = 0; i < m.nonZeros(); ++i) s << "(" << m.m_data.value(i) << "," << m.m_data.index(i) << ") ";
+        } else {
+          for (Index i = 0; i < m.outerSize(); ++i) {
             Index p = m.m_outerIndex[i];
-            Index pe = m.m_outerIndex[i]+m.m_innerNonZeros[i];
-            Index k=p;
-            for (; k<pe; ++k)
+            Index pe = m.m_outerIndex[i] + m.m_innerNonZeros[i];
+            Index k = p;
+            for (; k < pe; ++k) {
               s << "(" << m.m_data.value(k) << "," << m.m_data.index(k) << ") ";
-            for (; k<m.m_outerIndex[i+1]; ++k)
+            }
+            for (; k < m.m_outerIndex[i + 1]; ++k) {
               s << "(_,_) ";
+            }
           }
-        s << std::endl;
-        s << std::endl;
-        s << "Outer pointers:\n";
-        for (Index i=0; i<m.outerSize(); ++i)
-          s << m.m_outerIndex[i] << " ";
-        s << " $" << std::endl;
-        if(!m.isCompressed())
-        {
+        } s << std::endl;
+        s << std::endl; s << "Outer pointers:\n";
+        for (Index i = 0; i < m.outerSize(); ++i) { s << m.m_outerIndex[i] << " "; } s << " $" << std::endl;
+        if (!m.isCompressed()) {
           s << "Inner non zeros:\n";
-          for (Index i=0; i<m.outerSize(); ++i)
+          for (Index i = 0; i < m.outerSize(); ++i) {
             s << m.m_innerNonZeros[i] << " ";
+          }
           s << " $" << std::endl;
-        }
-        s << std::endl;
-      );
-      s << static_cast<const SparseMatrixBase<SparseMatrix>&>(m);
-      return s;
-    }
+        } s
+        << std::endl;);
+    s << static_cast<const SparseMatrixBase<SparseMatrix>&>(m);
+    return s;
+  }
+#endif
 
-    /** Destructor */
-    inline ~SparseMatrix()
-    {
-      std::free(m_outerIndex);
-      std::free(m_innerNonZeros);
-    }
+  /** Destructor */
+  inline ~SparseMatrix() {
+    internal::conditional_aligned_delete_auto<StorageIndex, true>(m_outerIndex, m_outerSize + 1);
+    internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+  }
+
+  /** Overloaded for performance */
+  Scalar sum() const;
 
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** Overloaded for performance */
-    Scalar sum() const;
+#ifdef EIGEN_SPARSEMATRIX_PLUGIN
+#include EIGEN_SPARSEMATRIX_PLUGIN
 #endif
-    
-#   ifdef EIGEN_SPARSEMATRIX_PLUGIN
-#     include EIGEN_SPARSEMATRIX_PLUGIN
-#   endif
 
-protected:
+ protected:
+  template <typename Other>
+  void initAssignment(const Other& other) {
+    resize(other.rows(), other.cols());
+    internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+    m_innerNonZeros = 0;
+  }
 
-    template<typename Other>
-    void initAssignment(const Other& other)
-    {
-      resize(other.rows(), other.cols());
-      if(m_innerNonZeros)
-      {
-        std::free(m_innerNonZeros);
-        m_innerNonZeros = 0;
-      }
-    }
+  /** \internal
+   * \sa insert(Index,Index) */
+  EIGEN_DEPRECATED EIGEN_DONT_INLINE Scalar& insertCompressed(Index row, Index col);
 
-    /** \internal
-      * \sa insert(Index,Index) */
-    EIGEN_DONT_INLINE Scalar& insertCompressed(Index row, Index col);
+  /** \internal
+   * A vector object that is equal to 0 everywhere but v at the position i */
+  class SingletonVector {
+    StorageIndex m_index;
+    StorageIndex m_value;
 
-    /** \internal
-      * A vector object that is equal to 0 everywhere but v at the position i */
-    class SingletonVector
-    {
-        Index m_index;
-        Index m_value;
-      public:
-        typedef Index value_type;
-        SingletonVector(Index i, Index v)
-          : m_index(i), m_value(v)
-        {}
-
-        Index operator[](Index i) const { return i==m_index ? m_value : 0; }
-    };
-
-    /** \internal
-      * \sa insert(Index,Index) */
-    EIGEN_DONT_INLINE Scalar& insertUncompressed(Index row, Index col);
-
-public:
-    /** \internal
-      * \sa insert(Index,Index) */
-    EIGEN_STRONG_INLINE Scalar& insertBackUncompressed(Index row, Index col)
-    {
-      const Index outer = IsRowMajor ? row : col;
-      const Index inner = IsRowMajor ? col : row;
+   public:
+    typedef StorageIndex value_type;
+    SingletonVector(Index i, Index v) : m_index(convert_index(i)), m_value(convert_index(v)) {}
+
+    StorageIndex operator[](Index i) const { return i == m_index ? m_value : 0; }
+  };
 
-      eigen_assert(!isCompressed());
-      eigen_assert(m_innerNonZeros[outer]<=(m_outerIndex[outer+1] - m_outerIndex[outer]));
+  /** \internal
+   * \sa insert(Index,Index) */
+  EIGEN_DEPRECATED EIGEN_DONT_INLINE Scalar& insertUncompressed(Index row, Index col);
 
-      Index p = m_outerIndex[outer] + m_innerNonZeros[outer]++;
-      m_data.index(p) = inner;
-      return (m_data.value(p) = 0);
+ public:
+  /** \internal
+   * \sa insert(Index,Index) */
+  EIGEN_STRONG_INLINE Scalar& insertBackUncompressed(Index row, Index col) {
+    const Index outer = IsRowMajor ? row : col;
+    const Index inner = IsRowMajor ? col : row;
+
+    eigen_assert(!isCompressed());
+    eigen_assert(m_innerNonZeros[outer] <= (m_outerIndex[outer + 1] - m_outerIndex[outer]));
+
+    Index p = m_outerIndex[outer] + m_innerNonZeros[outer]++;
+    m_data.index(p) = StorageIndex(inner);
+    m_data.value(p) = Scalar(0);
+    return m_data.value(p);
+  }
+
+ protected:
+  struct IndexPosPair {
+    IndexPosPair(Index a_i, Index a_p) : i(a_i), p(a_p) {}
+    Index i;
+    Index p;
+  };
+
+  /** \internal assign \a diagXpr to the diagonal of \c *this
+   * There are different strategies:
+   *   1 - if *this is overwritten (Func==assign_op) or *this is empty, then we can work treat *this as a dense vector
+   * expression. 2 - otherwise, for each diagonal coeff, 2.a - if it already exists, then we update it, 2.b - if the
+   * correct position is at the end of the vector, and there is capacity, push to back 2.b - otherwise, the insertion
+   * requires a data move, record insertion locations and handle in a second pass 3 - at the end, if some entries failed
+   * to be updated in-place, then we alloc a new buffer, copy each chunk at the right position, and insert the new
+   * elements.
+   */
+  template <typename DiagXpr, typename Func>
+  void assignDiagonal(const DiagXpr diagXpr, const Func& assignFunc) {
+    constexpr StorageIndex kEmptyIndexVal(-1);
+    typedef typename ScalarVector::AlignedMapType ValueMap;
+
+    Index n = diagXpr.size();
+
+    const bool overwrite = internal::is_same<Func, internal::assign_op<Scalar, Scalar>>::value;
+    if (overwrite) {
+      if ((m_outerSize != n) || (m_innerSize != n)) resize(n, n);
     }
 
-private:
-  static void check_template_parameters()
-  {
-    EIGEN_STATIC_ASSERT(NumTraits<Index>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE);
-    EIGEN_STATIC_ASSERT((Options&(ColMajor|RowMajor))==Options,INVALID_MATRIX_TEMPLATE_PARAMETERS);
+    if (m_data.size() == 0 || overwrite) {
+      internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
+      m_innerNonZeros = 0;
+      resizeNonZeros(n);
+      ValueMap valueMap(valuePtr(), n);
+      std::iota(m_outerIndex, m_outerIndex + n + 1, StorageIndex(0));
+      std::iota(innerIndexPtr(), innerIndexPtr() + n, StorageIndex(0));
+      valueMap.setZero();
+      internal::call_assignment_no_alias(valueMap, diagXpr, assignFunc);
+    } else {
+      internal::evaluator<DiagXpr> diaEval(diagXpr);
+
+      ei_declare_aligned_stack_constructed_variable(StorageIndex, tmp, n, 0);
+      typename IndexVector::AlignedMapType insertionLocations(tmp, n);
+      insertionLocations.setConstant(kEmptyIndexVal);
+
+      Index deferredInsertions = 0;
+      Index shift = 0;
+
+      for (Index j = 0; j < n; j++) {
+        Index begin = m_outerIndex[j];
+        Index end = isCompressed() ? m_outerIndex[j + 1] : begin + m_innerNonZeros[j];
+        Index capacity = m_outerIndex[j + 1] - end;
+        Index dst = m_data.searchLowerIndex(begin, end, j);
+        // the entry exists: update it now
+        if (dst != end && m_data.index(dst) == StorageIndex(j))
+          assignFunc.assignCoeff(m_data.value(dst), diaEval.coeff(j));
+        // the entry belongs at the back of the vector: push to back
+        else if (dst == end && capacity > 0)
+          assignFunc.assignCoeff(insertBackUncompressed(j, j), diaEval.coeff(j));
+        // the insertion requires a data move, record insertion location and handle in second pass
+        else {
+          insertionLocations.coeffRef(j) = StorageIndex(dst);
+          deferredInsertions++;
+          // if there is no capacity, all vectors to the right of this are shifted
+          if (capacity == 0) shift++;
+        }
+      }
+
+      if (deferredInsertions > 0) {
+        m_data.resize(m_data.size() + shift);
+        Index copyEnd = isCompressed() ? m_outerIndex[m_outerSize]
+                                       : m_outerIndex[m_outerSize - 1] + m_innerNonZeros[m_outerSize - 1];
+        for (Index j = m_outerSize - 1; deferredInsertions > 0; j--) {
+          Index begin = m_outerIndex[j];
+          Index end = isCompressed() ? m_outerIndex[j + 1] : begin + m_innerNonZeros[j];
+          Index capacity = m_outerIndex[j + 1] - end;
+
+          bool doInsertion = insertionLocations(j) >= 0;
+          bool breakUpCopy = doInsertion && (capacity > 0);
+          // break up copy for sorted insertion into inactive nonzeros
+          // optionally, add another criterium, i.e. 'breakUpCopy || (capacity > threhsold)'
+          // where `threshold >= 0` to skip inactive nonzeros in each vector
+          // this reduces the total number of copied elements, but requires more moveChunk calls
+          if (breakUpCopy) {
+            Index copyBegin = m_outerIndex[j + 1];
+            Index to = copyBegin + shift;
+            Index chunkSize = copyEnd - copyBegin;
+            m_data.moveChunk(copyBegin, to, chunkSize);
+            copyEnd = end;
+          }
+
+          m_outerIndex[j + 1] += shift;
+
+          if (doInsertion) {
+            // if there is capacity, shift into the inactive nonzeros
+            if (capacity > 0) shift++;
+            Index copyBegin = insertionLocations(j);
+            Index to = copyBegin + shift;
+            Index chunkSize = copyEnd - copyBegin;
+            m_data.moveChunk(copyBegin, to, chunkSize);
+            Index dst = to - 1;
+            m_data.index(dst) = StorageIndex(j);
+            m_data.value(dst) = Scalar(0);
+            assignFunc.assignCoeff(m_data.value(dst), diaEval.coeff(j));
+            if (!isCompressed()) m_innerNonZeros[j]++;
+            shift--;
+            deferredInsertions--;
+            copyEnd = copyBegin;
+          }
+        }
+      }
+      eigen_assert((shift == 0) && (deferredInsertions == 0));
+    }
   }
 
+  /* These functions are used to avoid a redundant binary search operation in functions such as coeffRef() and assume
+   * `dst` is the appropriate sorted insertion point */
+  EIGEN_STRONG_INLINE Scalar& insertAtByOuterInner(Index outer, Index inner, Index dst);
+  Scalar& insertCompressedAtByOuterInner(Index outer, Index inner, Index dst);
+  Scalar& insertUncompressedAtByOuterInner(Index outer, Index inner, Index dst);
+
+ private:
+  EIGEN_STATIC_ASSERT(NumTraits<StorageIndex>::IsSigned, THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE)
+  EIGEN_STATIC_ASSERT((Options & (ColMajor | RowMajor)) == Options, INVALID_MATRIX_TEMPLATE_PARAMETERS)
+
   struct default_prunning_func {
     default_prunning_func(const Scalar& ref, const RealScalar& eps) : reference(ref), epsilon(eps) {}
-    inline bool operator() (const Index&, const Index&, const Scalar& value) const
-    {
+    inline bool operator()(const Index&, const Index&, const Scalar& value) const {
       return !internal::isMuchSmallerThan(value, reference, epsilon);
     }
     Scalar reference;
@@ -868,125 +1117,192 @@ class SparseMatrix
   };
 };
 
-template<typename Scalar, int _Options, typename _Index>
-class SparseMatrix<Scalar,_Options,_Index>::InnerIterator
-{
-  public:
-    InnerIterator(const SparseMatrix& mat, Index outer)
-      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_id(mat.m_outerIndex[outer])
-    {
-      if(mat.isCompressed())
-        m_end = mat.m_outerIndex[outer+1];
-      else
-        m_end = m_id + mat.m_innerNonZeros[outer];
-    }
-
-    inline InnerIterator& operator++() { m_id++; return *this; }
+namespace internal {
 
-    inline const Scalar& value() const { return m_values[m_id]; }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id]); }
+// Creates a compressed sparse matrix from a range of unsorted triplets
+// Requires temporary storage to handle duplicate entries
+template <typename InputIterator, typename SparseMatrixType, typename DupFunctor>
+void set_from_triplets(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat,
+                       DupFunctor dup_func) {
+  constexpr bool IsRowMajor = SparseMatrixType::IsRowMajor;
+  using StorageIndex = typename SparseMatrixType::StorageIndex;
+  using IndexMap = typename VectorX<StorageIndex>::AlignedMapType;
+  using TransposedSparseMatrix =
+      SparseMatrix<typename SparseMatrixType::Scalar, IsRowMajor ? ColMajor : RowMajor, StorageIndex>;
+
+  if (begin == end) {
+    // Clear out existing data (if any).
+    mat.setZero();
+    return;
+  }
 
-    inline Index index() const { return m_indices[m_id]; }
-    inline Index outer() const { return m_outer; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
+  // There are two strategies to consider for constructing a matrix from unordered triplets:
+  // A) construct the 'mat' in its native storage order and sort in-place (less memory); or,
+  // B) construct the transposed matrix and use an implicit sort upon assignment to `mat` (less time).
+  // This routine uses B) for faster execution time.
+  TransposedSparseMatrix trmat(mat.rows(), mat.cols());
+
+  // scan triplets to determine allocation size before constructing matrix
+  Index nonZeros = 0;
+  for (InputIterator it(begin); it != end; ++it) {
+    eigen_assert(it->row() >= 0 && it->row() < mat.rows() && it->col() >= 0 && it->col() < mat.cols());
+    StorageIndex j = convert_index<StorageIndex>(IsRowMajor ? it->col() : it->row());
+    if (nonZeros == NumTraits<StorageIndex>::highest()) internal::throw_std_bad_alloc();
+    trmat.outerIndexPtr()[j + 1]++;
+    nonZeros++;
+  }
 
-    inline operator bool() const { return (m_id < m_end); }
+  std::partial_sum(trmat.outerIndexPtr(), trmat.outerIndexPtr() + trmat.outerSize() + 1, trmat.outerIndexPtr());
+  eigen_assert(nonZeros == trmat.outerIndexPtr()[trmat.outerSize()]);
+  trmat.resizeNonZeros(nonZeros);
+
+  // construct temporary array to track insertions (outersize) and collapse duplicates (innersize)
+  ei_declare_aligned_stack_constructed_variable(StorageIndex, tmp, numext::maxi(mat.innerSize(), mat.outerSize()), 0);
+  smart_copy(trmat.outerIndexPtr(), trmat.outerIndexPtr() + trmat.outerSize(), tmp);
+
+  // push triplets to back of each vector
+  for (InputIterator it(begin); it != end; ++it) {
+    StorageIndex j = convert_index<StorageIndex>(IsRowMajor ? it->col() : it->row());
+    StorageIndex i = convert_index<StorageIndex>(IsRowMajor ? it->row() : it->col());
+    StorageIndex k = tmp[j];
+    trmat.data().index(k) = i;
+    trmat.data().value(k) = it->value();
+    tmp[j]++;
+  }
 
-  protected:
-    const Scalar* m_values;
-    const Index* m_indices;
-    const Index m_outer;
-    Index m_id;
-    Index m_end;
-};
+  IndexMap wi(tmp, trmat.innerSize());
+  trmat.collapseDuplicates(wi, dup_func);
+  // implicit sorting
+  mat = trmat;
+}
 
-template<typename Scalar, int _Options, typename _Index>
-class SparseMatrix<Scalar,_Options,_Index>::ReverseInnerIterator
-{
-  public:
-    ReverseInnerIterator(const SparseMatrix& mat, Index outer)
-      : m_values(mat.valuePtr()), m_indices(mat.innerIndexPtr()), m_outer(outer), m_start(mat.m_outerIndex[outer])
-    {
-      if(mat.isCompressed())
-        m_id = mat.m_outerIndex[outer+1];
-      else
-        m_id = m_start + mat.m_innerNonZeros[outer];
+// Creates a compressed sparse matrix from a sorted range of triplets
+template <typename InputIterator, typename SparseMatrixType, typename DupFunctor>
+void set_from_triplets_sorted(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat,
+                              DupFunctor dup_func) {
+  constexpr bool IsRowMajor = SparseMatrixType::IsRowMajor;
+  using StorageIndex = typename SparseMatrixType::StorageIndex;
+
+  if (begin == end) return;
+
+  constexpr StorageIndex kEmptyIndexValue(-1);
+  // deallocate inner nonzeros if present and zero outerIndexPtr
+  mat.resize(mat.rows(), mat.cols());
+  // use outer indices to count non zero entries (excluding duplicate entries)
+  StorageIndex previous_j = kEmptyIndexValue;
+  StorageIndex previous_i = kEmptyIndexValue;
+  // scan triplets to determine allocation size before constructing matrix
+  Index nonZeros = 0;
+  for (InputIterator it(begin); it != end; ++it) {
+    eigen_assert(it->row() >= 0 && it->row() < mat.rows() && it->col() >= 0 && it->col() < mat.cols());
+    StorageIndex j = convert_index<StorageIndex>(IsRowMajor ? it->row() : it->col());
+    StorageIndex i = convert_index<StorageIndex>(IsRowMajor ? it->col() : it->row());
+    eigen_assert(j > previous_j || (j == previous_j && i >= previous_i));
+    // identify duplicates by examining previous location
+    bool duplicate = (previous_j == j) && (previous_i == i);
+    if (!duplicate) {
+      if (nonZeros == NumTraits<StorageIndex>::highest()) internal::throw_std_bad_alloc();
+      nonZeros++;
+      mat.outerIndexPtr()[j + 1]++;
+      previous_j = j;
+      previous_i = i;
     }
+  }
 
-    inline ReverseInnerIterator& operator--() { --m_id; return *this; }
-
-    inline const Scalar& value() const { return m_values[m_id-1]; }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_values[m_id-1]); }
-
-    inline Index index() const { return m_indices[m_id-1]; }
-    inline Index outer() const { return m_outer; }
-    inline Index row() const { return IsRowMajor ? m_outer : index(); }
-    inline Index col() const { return IsRowMajor ? index() : m_outer; }
-
-    inline operator bool() const { return (m_id > m_start); }
+  // finalize outer indices and allocate memory
+  std::partial_sum(mat.outerIndexPtr(), mat.outerIndexPtr() + mat.outerSize() + 1, mat.outerIndexPtr());
+  eigen_assert(nonZeros == mat.outerIndexPtr()[mat.outerSize()]);
+  mat.resizeNonZeros(nonZeros);
+
+  previous_i = kEmptyIndexValue;
+  previous_j = kEmptyIndexValue;
+  Index back = 0;
+  for (InputIterator it(begin); it != end; ++it) {
+    StorageIndex j = convert_index<StorageIndex>(IsRowMajor ? it->row() : it->col());
+    StorageIndex i = convert_index<StorageIndex>(IsRowMajor ? it->col() : it->row());
+    bool duplicate = (previous_j == j) && (previous_i == i);
+    if (duplicate) {
+      mat.data().value(back - 1) = dup_func(mat.data().value(back - 1), it->value());
+    } else {
+      // push triplets to back
+      mat.data().index(back) = i;
+      mat.data().value(back) = it->value();
+      previous_j = j;
+      previous_i = i;
+      back++;
+    }
+  }
+  eigen_assert(back == nonZeros);
+  // matrix is finalized
+}
 
-  protected:
-    const Scalar* m_values;
-    const Index* m_indices;
-    const Index m_outer;
-    Index m_id;
-    const Index m_start;
+// thin wrapper around a generic binary functor to use the sparse disjunction evaluator instead of the default
+// "arithmetic" evaluator
+template <typename DupFunctor, typename LhsScalar, typename RhsScalar = LhsScalar>
+struct scalar_disjunction_op {
+  using result_type = typename result_of<DupFunctor(LhsScalar, RhsScalar)>::type;
+  scalar_disjunction_op(const DupFunctor& op) : m_functor(op) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE result_type operator()(const LhsScalar& a, const RhsScalar& b) const {
+    return m_functor(a, b);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DupFunctor& functor() const { return m_functor; }
+  const DupFunctor& m_functor;
 };
 
-namespace internal {
+template <typename DupFunctor, typename LhsScalar, typename RhsScalar>
+struct functor_traits<scalar_disjunction_op<DupFunctor, LhsScalar, RhsScalar>> : public functor_traits<DupFunctor> {};
 
-template<typename InputIterator, typename SparseMatrixType>
-void set_from_triplets(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat, int Options = 0)
-{
-  EIGEN_UNUSED_VARIABLE(Options);
-  enum { IsRowMajor = SparseMatrixType::IsRowMajor };
-  typedef typename SparseMatrixType::Scalar Scalar;
-  typedef typename SparseMatrixType::Index Index;
-  SparseMatrix<Scalar,IsRowMajor?ColMajor:RowMajor,Index> trMat(mat.rows(),mat.cols());
-
-  if(begin!=end)
-  {
-    // pass 1: count the nnz per inner-vector
-    Matrix<Index,Dynamic,1> wi(trMat.outerSize());
-    wi.setZero();
-    for(InputIterator it(begin); it!=end; ++it)
-    {
-      eigen_assert(it->row()>=0 && it->row()<mat.rows() && it->col()>=0 && it->col()<mat.cols());
-      wi(IsRowMajor ? it->col() : it->row())++;
-    }
+// Creates a compressed sparse matrix from its existing entries and those from an unsorted range of triplets
+template <typename InputIterator, typename SparseMatrixType, typename DupFunctor>
+void insert_from_triplets(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat,
+                          DupFunctor dup_func) {
+  using Scalar = typename SparseMatrixType::Scalar;
+  using SrcXprType =
+      CwiseBinaryOp<scalar_disjunction_op<DupFunctor, Scalar>, const SparseMatrixType, const SparseMatrixType>;
 
-    // pass 2: insert all the elements into trMat
-    trMat.reserve(wi);
-    for(InputIterator it(begin); it!=end; ++it)
-      trMat.insertBackUncompressed(it->row(),it->col()) = it->value();
+  // set_from_triplets is necessary to sort the inner indices and remove the duplicate entries
+  SparseMatrixType trips(mat.rows(), mat.cols());
+  set_from_triplets(begin, end, trips, dup_func);
 
-    // pass 3:
-    trMat.sumupDuplicates();
-  }
-
-  // pass 4: transposed copy -> implicit sorting
-  mat = trMat;
+  SrcXprType src = mat.binaryExpr(trips, scalar_disjunction_op<DupFunctor, Scalar>(dup_func));
+  // the sparse assignment procedure creates a temporary matrix and swaps the final result
+  assign_sparse_to_sparse<SparseMatrixType, SrcXprType>(mat, src);
 }
 
+// Creates a compressed sparse matrix from its existing entries and those from an sorted range of triplets
+template <typename InputIterator, typename SparseMatrixType, typename DupFunctor>
+void insert_from_triplets_sorted(const InputIterator& begin, const InputIterator& end, SparseMatrixType& mat,
+                                 DupFunctor dup_func) {
+  using Scalar = typename SparseMatrixType::Scalar;
+  using SrcXprType =
+      CwiseBinaryOp<scalar_disjunction_op<DupFunctor, Scalar>, const SparseMatrixType, const SparseMatrixType>;
+
+  // TODO: process triplets without making a copy
+  SparseMatrixType trips(mat.rows(), mat.cols());
+  set_from_triplets_sorted(begin, end, trips, dup_func);
+
+  SrcXprType src = mat.binaryExpr(trips, scalar_disjunction_op<DupFunctor, Scalar>(dup_func));
+  // the sparse assignment procedure creates a temporary matrix and swaps the final result
+  assign_sparse_to_sparse<SparseMatrixType, SrcXprType>(mat, src);
 }
 
+}  // namespace internal
 
-/** Fill the matrix \c *this with the list of \em triplets defined by the iterator range \a begin - \a end.
+/** Fill the matrix \c *this with the list of \em triplets defined in the half-open range from \a begin to \a end.
   *
   * A \em triplet is a tuple (i,j,value) defining a non-zero element.
-  * The input list of triplets does not have to be sorted, and can contains duplicated elements.
+  * The input list of triplets does not have to be sorted, and may contain duplicated elements.
   * In any case, the result is a \b sorted and \b compressed sparse matrix where the duplicates have been summed up.
   * This is a \em O(n) operation, with \em n the number of triplet elements.
-  * The initial contents of \c *this is destroyed.
+  * The initial contents of \c *this are destroyed.
   * The matrix \c *this must be properly resized beforehand using the SparseMatrix(Index,Index) constructor,
   * or the resize(Index,Index) method. The sizes are not extracted from the triplet list.
   *
   * The \a InputIterators value_type must provide the following interface:
   * \code
   * Scalar value() const; // the value
-  * Scalar row() const;   // the row index i
-  * Scalar col() const;   // the column index j
+  * IndexType row() const;   // the row index i
+  * IndexType col() const;   // the column index j
   * \endcode
   * See for instance the Eigen::Triplet template class.
   *
@@ -994,7 +1310,7 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa
   * \code
     typedef Triplet<double> T;
     std::vector<T> tripletList;
-    triplets.reserve(estimation_of_entries);
+    tripletList.reserve(estimation_of_entries);
     for(...)
     {
       // ...
@@ -1007,88 +1323,240 @@ void set_from_triplets(const InputIterator& begin, const InputIterator& end, Spa
   *
   * \warning The list of triplets is read multiple times (at least twice). Therefore, it is not recommended to define
   * an abstract iterator over a complex data-structure that would be expensive to evaluate. The triplets should rather
-  * be explicitely stored into a std::vector for instance.
+  * be explicitly stored into a std::vector for instance.
   */
-template<typename Scalar, int _Options, typename _Index>
-template<typename InputIterators>
-void SparseMatrix<Scalar,_Options,_Index>::setFromTriplets(const InputIterators& begin, const InputIterators& end)
-{
-  internal::set_from_triplets(begin, end, *this);
+template <typename Scalar, int Options_, typename StorageIndex_>
+template <typename InputIterators>
+void SparseMatrix<Scalar, Options_, StorageIndex_>::setFromTriplets(const InputIterators& begin,
+                                                                    const InputIterators& end) {
+  internal::set_from_triplets<InputIterators, SparseMatrix<Scalar, Options_, StorageIndex_>>(
+      begin, end, *this, internal::scalar_sum_op<Scalar, Scalar>());
+}
+
+/** The same as setFromTriplets but when duplicates are met the functor \a dup_func is applied:
+ * \code
+ * value = dup_func(OldValue, NewValue)
+ * \endcode
+ * Here is a C++11 example keeping the latest entry only:
+ * \code
+ * mat.setFromTriplets(triplets.begin(), triplets.end(), [] (const Scalar&,const Scalar &b) { return b; });
+ * \endcode
+ */
+template <typename Scalar, int Options_, typename StorageIndex_>
+template <typename InputIterators, typename DupFunctor>
+void SparseMatrix<Scalar, Options_, StorageIndex_>::setFromTriplets(const InputIterators& begin,
+                                                                    const InputIterators& end, DupFunctor dup_func) {
+  internal::set_from_triplets<InputIterators, SparseMatrix<Scalar, Options_, StorageIndex_>, DupFunctor>(
+      begin, end, *this, dup_func);
+}
+
+/** The same as setFromTriplets but triplets are assumed to be pre-sorted. This is faster and requires less temporary
+ * storage. Two triplets `a` and `b` are appropriately ordered if: \code ColMajor: ((a.col() != b.col()) ? (a.col() <
+ * b.col()) : (a.row() < b.row()) RowMajor: ((a.row() != b.row()) ? (a.row() < b.row()) : (a.col() < b.col()) \endcode
+ */
+template <typename Scalar, int Options_, typename StorageIndex_>
+template <typename InputIterators>
+void SparseMatrix<Scalar, Options_, StorageIndex_>::setFromSortedTriplets(const InputIterators& begin,
+                                                                          const InputIterators& end) {
+  internal::set_from_triplets_sorted<InputIterators, SparseMatrix<Scalar, Options_, StorageIndex_>>(
+      begin, end, *this, internal::scalar_sum_op<Scalar, Scalar>());
+}
+
+/** The same as setFromSortedTriplets but when duplicates are met the functor \a dup_func is applied:
+ * \code
+ * value = dup_func(OldValue, NewValue)
+ * \endcode
+ * Here is a C++11 example keeping the latest entry only:
+ * \code
+ * mat.setFromSortedTriplets(triplets.begin(), triplets.end(), [] (const Scalar&,const Scalar &b) { return b; });
+ * \endcode
+ */
+template <typename Scalar, int Options_, typename StorageIndex_>
+template <typename InputIterators, typename DupFunctor>
+void SparseMatrix<Scalar, Options_, StorageIndex_>::setFromSortedTriplets(const InputIterators& begin,
+                                                                          const InputIterators& end,
+                                                                          DupFunctor dup_func) {
+  internal::set_from_triplets_sorted<InputIterators, SparseMatrix<Scalar, Options_, StorageIndex_>, DupFunctor>(
+      begin, end, *this, dup_func);
+}
+
+/** Insert a batch of elements into the matrix \c *this with the list of \em triplets defined in the half-open range
+  from \a begin to \a end.
+  *
+  * A \em triplet is a tuple (i,j,value) defining a non-zero element.
+  * The input list of triplets does not have to be sorted, and may contain duplicated elements.
+  * In any case, the result is a \b sorted and \b compressed sparse matrix where the duplicates have been summed up.
+  * This is a \em O(n) operation, with \em n the number of triplet elements.
+  * The initial contents of \c *this are preserved (except for the summation of duplicate elements).
+  * The matrix \c *this must be properly sized beforehand. The sizes are not extracted from the triplet list.
+  *
+  * The \a InputIterators value_type must provide the following interface:
+  * \code
+  * Scalar value() const; // the value
+  * IndexType row() const;   // the row index i
+  * IndexType col() const;   // the column index j
+  * \endcode
+  * See for instance the Eigen::Triplet template class.
+  *
+  * Here is a typical usage example:
+  * \code
+    SparseMatrixType m(rows,cols); // m contains nonzero entries
+    typedef Triplet<double> T;
+    std::vector<T> tripletList;
+    tripletList.reserve(estimation_of_entries);
+    for(...)
+    {
+      // ...
+      tripletList.push_back(T(i,j,v_ij));
+    }
+
+    m.insertFromTriplets(tripletList.begin(), tripletList.end());
+    // m is ready to go!
+  * \endcode
+  *
+  * \warning The list of triplets is read multiple times (at least twice). Therefore, it is not recommended to define
+  * an abstract iterator over a complex data-structure that would be expensive to evaluate. The triplets should rather
+  * be explicitly stored into a std::vector for instance.
+  */
+template <typename Scalar, int Options_, typename StorageIndex_>
+template <typename InputIterators>
+void SparseMatrix<Scalar, Options_, StorageIndex_>::insertFromTriplets(const InputIterators& begin,
+                                                                       const InputIterators& end) {
+  internal::insert_from_triplets<InputIterators, SparseMatrix<Scalar, Options_, StorageIndex_>>(
+      begin, end, *this, internal::scalar_sum_op<Scalar, Scalar>());
+}
+
+/** The same as insertFromTriplets but when duplicates are met the functor \a dup_func is applied:
+ * \code
+ * value = dup_func(OldValue, NewValue)
+ * \endcode
+ * Here is a C++11 example keeping the latest entry only:
+ * \code
+ * mat.insertFromTriplets(triplets.begin(), triplets.end(), [] (const Scalar&,const Scalar &b) { return b; });
+ * \endcode
+ */
+template <typename Scalar, int Options_, typename StorageIndex_>
+template <typename InputIterators, typename DupFunctor>
+void SparseMatrix<Scalar, Options_, StorageIndex_>::insertFromTriplets(const InputIterators& begin,
+                                                                       const InputIterators& end, DupFunctor dup_func) {
+  internal::insert_from_triplets<InputIterators, SparseMatrix<Scalar, Options_, StorageIndex_>, DupFunctor>(
+      begin, end, *this, dup_func);
+}
+
+/** The same as insertFromTriplets but triplets are assumed to be pre-sorted. This is faster and requires less temporary
+ * storage. Two triplets `a` and `b` are appropriately ordered if: \code ColMajor: ((a.col() != b.col()) ? (a.col() <
+ * b.col()) : (a.row() < b.row()) RowMajor: ((a.row() != b.row()) ? (a.row() < b.row()) : (a.col() < b.col()) \endcode
+ */
+template <typename Scalar, int Options_, typename StorageIndex_>
+template <typename InputIterators>
+void SparseMatrix<Scalar, Options_, StorageIndex_>::insertFromSortedTriplets(const InputIterators& begin,
+                                                                             const InputIterators& end) {
+  internal::insert_from_triplets_sorted<InputIterators, SparseMatrix<Scalar, Options_, StorageIndex_>>(
+      begin, end, *this, internal::scalar_sum_op<Scalar, Scalar>());
+}
+
+/** The same as insertFromSortedTriplets but when duplicates are met the functor \a dup_func is applied:
+ * \code
+ * value = dup_func(OldValue, NewValue)
+ * \endcode
+ * Here is a C++11 example keeping the latest entry only:
+ * \code
+ * mat.insertFromSortedTriplets(triplets.begin(), triplets.end(), [] (const Scalar&,const Scalar &b) { return b; });
+ * \endcode
+ */
+template <typename Scalar, int Options_, typename StorageIndex_>
+template <typename InputIterators, typename DupFunctor>
+void SparseMatrix<Scalar, Options_, StorageIndex_>::insertFromSortedTriplets(const InputIterators& begin,
+                                                                             const InputIterators& end,
+                                                                             DupFunctor dup_func) {
+  internal::insert_from_triplets_sorted<InputIterators, SparseMatrix<Scalar, Options_, StorageIndex_>, DupFunctor>(
+      begin, end, *this, dup_func);
 }
 
 /** \internal */
-template<typename Scalar, int _Options, typename _Index>
-void SparseMatrix<Scalar,_Options,_Index>::sumupDuplicates()
-{
-  eigen_assert(!isCompressed());
-  // TODO, in practice we should be able to use m_innerNonZeros for that task
-  Matrix<Index,Dynamic,1> wi(innerSize());
-  wi.fill(-1);
-  Index count = 0;
+template <typename Scalar_, int Options_, typename StorageIndex_>
+template <typename Derived, typename DupFunctor>
+void SparseMatrix<Scalar_, Options_, StorageIndex_>::collapseDuplicates(DenseBase<Derived>& wi, DupFunctor dup_func) {
+  // removes duplicate entries and compresses the matrix
+  // the excess allocated memory is not released
+  // the inner indices do not need to be sorted, nor is the matrix returned in a sorted state
+  eigen_assert(wi.size() == m_innerSize);
+  constexpr StorageIndex kEmptyIndexValue(-1);
+  wi.setConstant(kEmptyIndexValue);
+  StorageIndex count = 0;
+  const bool is_compressed = isCompressed();
   // for each inner-vector, wi[inner_index] will hold the position of first element into the index/value buffers
-  for(Index j=0; j<outerSize(); ++j)
-  {
-    Index start   = count;
-    Index oldEnd  = m_outerIndex[j]+m_innerNonZeros[j];
-    for(Index k=m_outerIndex[j]; k<oldEnd; ++k)
-    {
-      Index i = m_data.index(k);
-      if(wi(i)>=start)
-      {
-        // we already meet this entry => accumulate it
-        m_data.value(wi(i)) += m_data.value(k);
-      }
-      else
-      {
+  for (Index j = 0; j < m_outerSize; ++j) {
+    const StorageIndex newBegin = count;
+    const StorageIndex end = is_compressed ? m_outerIndex[j + 1] : m_outerIndex[j] + m_innerNonZeros[j];
+    for (StorageIndex k = m_outerIndex[j]; k < end; ++k) {
+      StorageIndex i = m_data.index(k);
+      if (wi(i) >= newBegin) {
+        // entry at k is a duplicate
+        // accumulate it into the primary entry located at wi(i)
+        m_data.value(wi(i)) = dup_func(m_data.value(wi(i)), m_data.value(k));
+      } else {
+        // k is the primary entry in j with inner index i
+        // shift it to the left and record its location at wi(i)
+        m_data.index(count) = i;
         m_data.value(count) = m_data.value(k);
-        m_data.index(count) = m_data.index(k);
         wi(i) = count;
         ++count;
       }
     }
-    m_outerIndex[j] = start;
+    m_outerIndex[j] = newBegin;
   }
   m_outerIndex[m_outerSize] = count;
+  m_data.resize(count);
 
-  // turn the matrix into compressed form
-  std::free(m_innerNonZeros);
+  // turn the matrix into compressed form (if it is not already)
+  internal::conditional_aligned_delete_auto<StorageIndex, true>(m_innerNonZeros, m_outerSize);
   m_innerNonZeros = 0;
-  m_data.resize(m_outerIndex[m_outerSize]);
 }
 
-template<typename Scalar, int _Options, typename _Index>
-template<typename OtherDerived>
-EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Options,_Index>::operator=(const SparseMatrixBase<OtherDerived>& other)
-{
-  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
-        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-  
-  const bool needToTranspose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
-  if (needToTranspose)
-  {
+/** \internal */
+template <typename Scalar, int Options_, typename StorageIndex_>
+template <typename OtherDerived>
+EIGEN_DONT_INLINE SparseMatrix<Scalar, Options_, StorageIndex_>&
+SparseMatrix<Scalar, Options_, StorageIndex_>::operator=(const SparseMatrixBase<OtherDerived>& other) {
+  EIGEN_STATIC_ASSERT(
+      (internal::is_same<Scalar, typename OtherDerived::Scalar>::value),
+      YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
+#ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+  EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+#endif
+
+  const bool needToTranspose = (Flags & RowMajorBit) != (internal::evaluator<OtherDerived>::Flags & RowMajorBit);
+  if (needToTranspose) {
+#ifdef EIGEN_SPARSE_TRANSPOSED_COPY_PLUGIN
+    EIGEN_SPARSE_TRANSPOSED_COPY_PLUGIN
+#endif
     // two passes algorithm:
     //  1 - compute the number of coeffs per dest inner vector
     //  2 - do the actual copy/eval
     // Since each coeff of the rhs has to be evaluated twice, let's evaluate it if needed
-    typedef typename internal::nested<OtherDerived,2>::type OtherCopy;
-    typedef typename internal::remove_all<OtherCopy>::type _OtherCopy;
+    typedef
+        typename internal::nested_eval<OtherDerived, 2, typename internal::plain_matrix_type<OtherDerived>::type>::type
+            OtherCopy;
+    typedef internal::remove_all_t<OtherCopy> OtherCopy_;
+    typedef internal::evaluator<OtherCopy_> OtherCopyEval;
     OtherCopy otherCopy(other.derived());
+    OtherCopyEval otherCopyEval(otherCopy);
 
-    SparseMatrix dest(other.rows(),other.cols());
-    Eigen::Map<Matrix<Index, Dynamic, 1> > (dest.m_outerIndex,dest.outerSize()).setZero();
+    SparseMatrix dest(other.rows(), other.cols());
+    Eigen::Map<IndexVector>(dest.m_outerIndex, dest.outerSize()).setZero();
 
     // pass 1
     // FIXME the above copy could be merged with that pass
-    for (Index j=0; j<otherCopy.outerSize(); ++j)
-      for (typename _OtherCopy::InnerIterator it(otherCopy, j); it; ++it)
-        ++dest.m_outerIndex[it.index()];
+    for (Index j = 0; j < otherCopy.outerSize(); ++j)
+      for (typename OtherCopyEval::InnerIterator it(otherCopyEval, j); it; ++it) ++dest.m_outerIndex[it.index()];
 
     // prefix sum
-    Index count = 0;
-    Matrix<Index,Dynamic,1> positions(dest.outerSize());
-    for (Index j=0; j<dest.outerSize(); ++j)
-    {
-      Index tmp = dest.m_outerIndex[j];
+    StorageIndex count = 0;
+    IndexVector positions(dest.outerSize());
+    for (Index j = 0; j < dest.outerSize(); ++j) {
+      StorageIndex tmp = dest.m_outerIndex[j];
       dest.m_outerIndex[j] = count;
       positions[j] = count;
       count += tmp;
@@ -1097,10 +1565,8 @@ EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Opt
     // alloc
     dest.m_data.resize(count);
     // pass 2
-    for (Index j=0; j<otherCopy.outerSize(); ++j)
-    {
-      for (typename _OtherCopy::InnerIterator it(otherCopy, j); it; ++it)
-      {
+    for (StorageIndex j = 0; j < otherCopy.outerSize(); ++j) {
+      for (typename OtherCopyEval::InnerIterator it(otherCopyEval, j); it; ++it) {
         Index pos = positions[it.index()]++;
         dest.m_data.index(pos) = j;
         dest.m_data.value(pos) = it.value();
@@ -1108,155 +1574,303 @@ EIGEN_DONT_INLINE SparseMatrix<Scalar,_Options,_Index>& SparseMatrix<Scalar,_Opt
     }
     this->swap(dest);
     return *this;
-  }
-  else
-  {
-    if(other.isRValue())
+  } else {
+    if (other.isRValue()) {
       initAssignment(other.derived());
+    }
     // there is no special optimization
     return Base::operator=(other.derived());
   }
 }
 
-template<typename _Scalar, int _Options, typename _Index>
-EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insertUncompressed(Index row, Index col)
-{
+template <typename Scalar_, int Options_, typename StorageIndex_>
+inline typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
+SparseMatrix<Scalar_, Options_, StorageIndex_>::insert(Index row, Index col) {
+  return insertByOuterInner(IsRowMajor ? row : col, IsRowMajor ? col : row);
+}
+
+template <typename Scalar_, int Options_, typename StorageIndex_>
+EIGEN_STRONG_INLINE typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
+SparseMatrix<Scalar_, Options_, StorageIndex_>::insertAtByOuterInner(Index outer, Index inner, Index dst) {
+  // random insertion into compressed matrix is very slow
+  uncompress();
+  return insertUncompressedAtByOuterInner(outer, inner, dst);
+}
+
+template <typename Scalar_, int Options_, typename StorageIndex_>
+EIGEN_DEPRECATED EIGEN_DONT_INLINE typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
+SparseMatrix<Scalar_, Options_, StorageIndex_>::insertUncompressed(Index row, Index col) {
   eigen_assert(!isCompressed());
+  Index outer = IsRowMajor ? row : col;
+  Index inner = IsRowMajor ? col : row;
+  Index start = m_outerIndex[outer];
+  Index end = start + m_innerNonZeros[outer];
+  Index dst = start == end ? end : m_data.searchLowerIndex(start, end, inner);
+  if (dst == end) {
+    Index capacity = m_outerIndex[outer + 1] - end;
+    if (capacity > 0) {
+      // implies uncompressed: push to back of vector
+      m_innerNonZeros[outer]++;
+      m_data.index(end) = StorageIndex(inner);
+      m_data.value(end) = Scalar(0);
+      return m_data.value(end);
+    }
+  }
+  eigen_assert((dst == end || m_data.index(dst) != inner) &&
+               "you cannot insert an element that already exists, you must call coeffRef to this end");
+  return insertUncompressedAtByOuterInner(outer, inner, dst);
+}
 
-  const Index outer = IsRowMajor ? row : col;
-  const Index inner = IsRowMajor ? col : row;
+template <typename Scalar_, int Options_, typename StorageIndex_>
+EIGEN_DEPRECATED EIGEN_DONT_INLINE typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
+SparseMatrix<Scalar_, Options_, StorageIndex_>::insertCompressed(Index row, Index col) {
+  eigen_assert(isCompressed());
+  Index outer = IsRowMajor ? row : col;
+  Index inner = IsRowMajor ? col : row;
+  Index start = m_outerIndex[outer];
+  Index end = m_outerIndex[outer + 1];
+  Index dst = start == end ? end : m_data.searchLowerIndex(start, end, inner);
+  eigen_assert((dst == end || m_data.index(dst) != inner) &&
+               "you cannot insert an element that already exists, you must call coeffRef to this end");
+  return insertCompressedAtByOuterInner(outer, inner, dst);
+}
 
-  Index room = m_outerIndex[outer+1] - m_outerIndex[outer];
-  Index innerNNZ = m_innerNonZeros[outer];
-  if(innerNNZ>=room)
-  {
-    // this inner vector is full, we need to reallocate the whole buffer :(
-    reserve(SingletonVector(outer,std::max<Index>(2,innerNNZ)));
+template <typename Scalar_, int Options_, typename StorageIndex_>
+typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
+SparseMatrix<Scalar_, Options_, StorageIndex_>::insertCompressedAtByOuterInner(Index outer, Index inner, Index dst) {
+  eigen_assert(isCompressed());
+  // compressed insertion always requires expanding the buffer
+  // first, check if there is adequate allocated memory
+  if (m_data.allocatedSize() <= m_data.size()) {
+    // if there is no capacity for a single insertion, double the capacity
+    // increase capacity by a minimum of 32
+    Index minReserve = 32;
+    Index reserveSize = numext::maxi(minReserve, m_data.allocatedSize());
+    m_data.reserve(reserveSize);
   }
+  m_data.resize(m_data.size() + 1);
+  Index chunkSize = m_outerIndex[m_outerSize] - dst;
+  // shift the existing data to the right if necessary
+  m_data.moveChunk(dst, dst + 1, chunkSize);
+  // update nonzero counts
+  // potentially O(outerSize) bottleneck!
+  for (Index j = outer; j < m_outerSize; j++) m_outerIndex[j + 1]++;
+  // initialize the coefficient
+  m_data.index(dst) = StorageIndex(inner);
+  m_data.value(dst) = Scalar(0);
+  // return a reference to the coefficient
+  return m_data.value(dst);
+}
 
-  Index startId = m_outerIndex[outer];
-  Index p = startId + m_innerNonZeros[outer];
-  while ( (p > startId) && (m_data.index(p-1) > inner) )
-  {
-    m_data.index(p) = m_data.index(p-1);
-    m_data.value(p) = m_data.value(p-1);
-    --p;
+template <typename Scalar_, int Options_, typename StorageIndex_>
+typename SparseMatrix<Scalar_, Options_, StorageIndex_>::Scalar&
+SparseMatrix<Scalar_, Options_, StorageIndex_>::insertUncompressedAtByOuterInner(Index outer, Index inner, Index dst) {
+  eigen_assert(!isCompressed());
+  // find a vector with capacity, starting at `outer` and searching to the left and right
+  for (Index leftTarget = outer - 1, rightTarget = outer; (leftTarget >= 0) || (rightTarget < m_outerSize);) {
+    if (rightTarget < m_outerSize) {
+      Index start = m_outerIndex[rightTarget];
+      Index end = start + m_innerNonZeros[rightTarget];
+      Index nextStart = m_outerIndex[rightTarget + 1];
+      Index capacity = nextStart - end;
+      if (capacity > 0) {
+        // move [dst, end) to dst+1 and insert at dst
+        Index chunkSize = end - dst;
+        if (chunkSize > 0) m_data.moveChunk(dst, dst + 1, chunkSize);
+        m_innerNonZeros[outer]++;
+        for (Index j = outer; j < rightTarget; j++) m_outerIndex[j + 1]++;
+        m_data.index(dst) = StorageIndex(inner);
+        m_data.value(dst) = Scalar(0);
+        return m_data.value(dst);
+      }
+      rightTarget++;
+    }
+    if (leftTarget >= 0) {
+      Index start = m_outerIndex[leftTarget];
+      Index end = start + m_innerNonZeros[leftTarget];
+      Index nextStart = m_outerIndex[leftTarget + 1];
+      Index capacity = nextStart - end;
+      if (capacity > 0) {
+        // tricky: dst is a lower bound, so we must insert at dst-1 when shifting left
+        // move [nextStart, dst) to nextStart-1 and insert at dst-1
+        Index chunkSize = dst - nextStart;
+        if (chunkSize > 0) m_data.moveChunk(nextStart, nextStart - 1, chunkSize);
+        m_innerNonZeros[outer]++;
+        for (Index j = leftTarget; j < outer; j++) m_outerIndex[j + 1]--;
+        m_data.index(dst - 1) = StorageIndex(inner);
+        m_data.value(dst - 1) = Scalar(0);
+        return m_data.value(dst - 1);
+      }
+      leftTarget--;
+    }
   }
-  eigen_assert((p<=startId || m_data.index(p-1)!=inner) && "you cannot insert an element that already exist, you must call coeffRef to this end");
 
+  // no room for interior insertion
+  // nonZeros() == m_data.size()
+  // record offset as outerIndxPtr will change
+  Index dst_offset = dst - m_outerIndex[outer];
+  // allocate space for random insertion
+  if (m_data.allocatedSize() == 0) {
+    // fast method to allocate space for one element per vector in empty matrix
+    m_data.resize(m_outerSize);
+    std::iota(m_outerIndex, m_outerIndex + m_outerSize + 1, StorageIndex(0));
+  } else {
+    // check for integer overflow: if maxReserveSize == 0, insertion is not possible
+    Index maxReserveSize = static_cast<Index>(NumTraits<StorageIndex>::highest()) - m_data.allocatedSize();
+    eigen_assert(maxReserveSize > 0);
+    if (m_outerSize <= maxReserveSize) {
+      // allocate space for one additional element per vector
+      reserveInnerVectors(IndexVector::Constant(m_outerSize, 1));
+    } else {
+      // handle the edge case where StorageIndex is insufficient to reserve outerSize additional elements
+      // allocate space for one additional element in the interval [outer,maxReserveSize)
+      typedef internal::sparse_reserve_op<StorageIndex> ReserveSizesOp;
+      typedef CwiseNullaryOp<ReserveSizesOp, IndexVector> ReserveSizesXpr;
+      ReserveSizesXpr reserveSizesXpr(m_outerSize, 1, ReserveSizesOp(outer, m_outerSize, maxReserveSize));
+      reserveInnerVectors(reserveSizesXpr);
+    }
+  }
+  // insert element at `dst` with new outer indices
+  Index start = m_outerIndex[outer];
+  Index end = start + m_innerNonZeros[outer];
+  Index new_dst = start + dst_offset;
+  Index chunkSize = end - new_dst;
+  if (chunkSize > 0) m_data.moveChunk(new_dst, new_dst + 1, chunkSize);
   m_innerNonZeros[outer]++;
-
-  m_data.index(p) = inner;
-  return (m_data.value(p) = 0);
+  m_data.index(new_dst) = StorageIndex(inner);
+  m_data.value(new_dst) = Scalar(0);
+  return m_data.value(new_dst);
 }
 
-template<typename _Scalar, int _Options, typename _Index>
-EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_Index>::Scalar& SparseMatrix<_Scalar,_Options,_Index>::insertCompressed(Index row, Index col)
-{
-  eigen_assert(isCompressed());
+namespace internal {
 
-  const Index outer = IsRowMajor ? row : col;
-  const Index inner = IsRowMajor ? col : row;
+template <typename Scalar_, int Options_, typename StorageIndex_>
+struct evaluator<SparseMatrix<Scalar_, Options_, StorageIndex_>>
+    : evaluator<SparseCompressedBase<SparseMatrix<Scalar_, Options_, StorageIndex_>>> {
+  typedef evaluator<SparseCompressedBase<SparseMatrix<Scalar_, Options_, StorageIndex_>>> Base;
+  typedef SparseMatrix<Scalar_, Options_, StorageIndex_> SparseMatrixType;
+  evaluator() : Base() {}
+  explicit evaluator(const SparseMatrixType& mat) : Base(mat) {}
+};
 
-  Index previousOuter = outer;
-  if (m_outerIndex[outer+1]==0)
-  {
-    // we start a new inner vector
-    while (previousOuter>=0 && m_outerIndex[previousOuter]==0)
-    {
-      m_outerIndex[previousOuter] = static_cast<Index>(m_data.size());
-      --previousOuter;
-    }
-    m_outerIndex[outer+1] = m_outerIndex[outer];
+}  // namespace internal
+
+// Specialization for SparseMatrix.
+// Serializes [rows, cols, isCompressed, outerSize, innerBufferSize,
+// innerNonZeros, outerIndices, innerIndices, values].
+template <typename Scalar, int Options, typename StorageIndex>
+class Serializer<SparseMatrix<Scalar, Options, StorageIndex>, void> {
+ public:
+  typedef SparseMatrix<Scalar, Options, StorageIndex> SparseMat;
+
+  struct Header {
+    typename SparseMat::Index rows;
+    typename SparseMat::Index cols;
+    bool compressed;
+    Index outer_size;
+    Index inner_buffer_size;
+  };
+
+  EIGEN_DEVICE_FUNC size_t size(const SparseMat& value) const {
+    // innerNonZeros.
+    std::size_t num_storage_indices = value.isCompressed() ? 0 : value.outerSize();
+    // Outer indices.
+    num_storage_indices += value.outerSize() + 1;
+    // Inner indices.
+    const StorageIndex inner_buffer_size = value.outerIndexPtr()[value.outerSize()];
+    num_storage_indices += inner_buffer_size;
+    // Values.
+    std::size_t num_values = inner_buffer_size;
+    return sizeof(Header) + sizeof(Scalar) * num_values + sizeof(StorageIndex) * num_storage_indices;
   }
 
-  // here we have to handle the tricky case where the outerIndex array
-  // starts with: [ 0 0 0 0 0 1 ...] and we are inserted in, e.g.,
-  // the 2nd inner vector...
-  bool isLastVec = (!(previousOuter==-1 && m_data.size()!=0))
-                && (size_t(m_outerIndex[outer+1]) == m_data.size());
-
-  size_t startId = m_outerIndex[outer];
-  // FIXME let's make sure sizeof(long int) == sizeof(size_t)
-  size_t p = m_outerIndex[outer+1];
-  ++m_outerIndex[outer+1];
-
-  double reallocRatio = 1;
-  if (m_data.allocatedSize()<=m_data.size())
-  {
-    // if there is no preallocated memory, let's reserve a minimum of 32 elements
-    if (m_data.size()==0)
-    {
-      m_data.reserve(32);
-    }
-    else
-    {
-      // we need to reallocate the data, to reduce multiple reallocations
-      // we use a smart resize algorithm based on the current filling ratio
-      // in addition, we use double to avoid integers overflows
-      double nnzEstimate = double(m_outerIndex[outer])*double(m_outerSize)/double(outer+1);
-      reallocRatio = (nnzEstimate-double(m_data.size()))/double(m_data.size());
-      // furthermore we bound the realloc ratio to:
-      //   1) reduce multiple minor realloc when the matrix is almost filled
-      //   2) avoid to allocate too much memory when the matrix is almost empty
-      reallocRatio = (std::min)((std::max)(reallocRatio,1.5),8.);
+  EIGEN_DEVICE_FUNC uint8_t* serialize(uint8_t* dest, uint8_t* end, const SparseMat& value) {
+    if (EIGEN_PREDICT_FALSE(dest == nullptr)) return nullptr;
+    if (EIGEN_PREDICT_FALSE(dest + size(value) > end)) return nullptr;
+
+    const size_t header_bytes = sizeof(Header);
+    Header header = {value.rows(), value.cols(), value.isCompressed(), value.outerSize(),
+                     value.outerIndexPtr()[value.outerSize()]};
+    EIGEN_USING_STD(memcpy)
+    memcpy(dest, &header, header_bytes);
+    dest += header_bytes;
+
+    // innerNonZeros.
+    if (!header.compressed) {
+      std::size_t data_bytes = sizeof(StorageIndex) * header.outer_size;
+      memcpy(dest, value.innerNonZeroPtr(), data_bytes);
+      dest += data_bytes;
     }
+
+    // Outer indices.
+    std::size_t data_bytes = sizeof(StorageIndex) * (header.outer_size + 1);
+    memcpy(dest, value.outerIndexPtr(), data_bytes);
+    dest += data_bytes;
+
+    // Inner indices.
+    data_bytes = sizeof(StorageIndex) * header.inner_buffer_size;
+    memcpy(dest, value.innerIndexPtr(), data_bytes);
+    dest += data_bytes;
+
+    // Values.
+    data_bytes = sizeof(Scalar) * header.inner_buffer_size;
+    memcpy(dest, value.valuePtr(), data_bytes);
+    dest += data_bytes;
+
+    return dest;
   }
-  m_data.resize(m_data.size()+1,reallocRatio);
 
-  if (!isLastVec)
-  {
-    if (previousOuter==-1)
-    {
-      // oops wrong guess.
-      // let's correct the outer offsets
-      for (Index k=0; k<=(outer+1); ++k)
-        m_outerIndex[k] = 0;
-      Index k=outer+1;
-      while(m_outerIndex[k]==0)
-        m_outerIndex[k++] = 1;
-      while (k<=m_outerSize && m_outerIndex[k]!=0)
-        m_outerIndex[k++]++;
-      p = 0;
-      --k;
-      k = m_outerIndex[k]-1;
-      while (k>0)
-      {
-        m_data.index(k) = m_data.index(k-1);
-        m_data.value(k) = m_data.value(k-1);
-        k--;
-      }
+  EIGEN_DEVICE_FUNC const uint8_t* deserialize(const uint8_t* src, const uint8_t* end, SparseMat& value) const {
+    if (EIGEN_PREDICT_FALSE(src == nullptr)) return nullptr;
+    if (EIGEN_PREDICT_FALSE(src + sizeof(Header) > end)) return nullptr;
+
+    const size_t header_bytes = sizeof(Header);
+    Header header;
+    EIGEN_USING_STD(memcpy)
+    memcpy(&header, src, header_bytes);
+    src += header_bytes;
+
+    value.setZero();
+    value.resize(header.rows, header.cols);
+    if (header.compressed) {
+      value.makeCompressed();
+    } else {
+      value.uncompress();
     }
-    else
-    {
-      // we are not inserting into the last inner vec
-      // update outer indices:
-      Index j = outer+2;
-      while (j<=m_outerSize && m_outerIndex[j]!=0)
-        m_outerIndex[j++]++;
-      --j;
-      // shift data of last vecs:
-      Index k = m_outerIndex[j]-1;
-      while (k>=Index(p))
-      {
-        m_data.index(k) = m_data.index(k-1);
-        m_data.value(k) = m_data.value(k-1);
-        k--;
-      }
+
+    // Adjust value ptr size.
+    value.data().resize(header.inner_buffer_size);
+
+    // Initialize compressed state and inner non-zeros.
+    if (!header.compressed) {
+      // Inner non-zero counts.
+      std::size_t data_bytes = sizeof(StorageIndex) * header.outer_size;
+      if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr;
+      memcpy(value.innerNonZeroPtr(), src, data_bytes);
+      src += data_bytes;
     }
-  }
 
-  while ( (p > startId) && (m_data.index(p-1) > inner) )
-  {
-    m_data.index(p) = m_data.index(p-1);
-    m_data.value(p) = m_data.value(p-1);
-    --p;
+    // Outer indices.
+    std::size_t data_bytes = sizeof(StorageIndex) * (header.outer_size + 1);
+    if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr;
+    memcpy(value.outerIndexPtr(), src, data_bytes);
+    src += data_bytes;
+
+    // Inner indices.
+    data_bytes = sizeof(StorageIndex) * header.inner_buffer_size;
+    if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr;
+    memcpy(value.innerIndexPtr(), src, data_bytes);
+    src += data_bytes;
+
+    // Values.
+    data_bytes = sizeof(Scalar) * header.inner_buffer_size;
+    if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr;
+    memcpy(value.valuePtr(), src, data_bytes);
+    src += data_bytes;
+    return src;
   }
+};
 
-  m_data.index(p) = inner;
-  return (m_data.value(p) = 0);
-}
-
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSEMATRIX_H
+#endif  // EIGEN_SPARSEMATRIX_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseMatrixBase.h b/inst/include/Eigen/src/SparseCore/SparseMatrixBase.h
index 9341d9ad..ccbbe98d 100644
--- a/inst/include/Eigen/src/SparseCore/SparseMatrixBase.h
+++ b/inst/include/Eigen/src/SparseCore/SparseMatrixBase.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,452 +10,442 @@
 #ifndef EIGEN_SPARSEMATRIXBASE_H
 #define EIGEN_SPARSEMATRIXBASE_H
 
-namespace Eigen { 
-
-/** \ingroup SparseCore_Module
-  *
-  * \class SparseMatrixBase
-  *
-  * \brief Base class of any sparse matrices or sparse expressions
-  *
-  * \tparam Derived
-  *
-  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_SPARSEMATRIXBASE_PLUGIN.
-  */
-template<typename Derived> class SparseMatrixBase
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-  : public internal::special_scalar_op_base<Derived,typename internal::traits<Derived>::Scalar,
-                                            typename NumTraits<typename internal::traits<Derived>::Scalar>::Real,
-                                            EigenBase<Derived> >
-#else
-  : public EigenBase<Derived>
-#endif // not EIGEN_PARSED_BY_DOXYGEN
-{
-  public:
-
-    typedef typename internal::traits<Derived>::Scalar Scalar;
-    typedef typename internal::packet_traits<Scalar>::type PacketScalar;
-    typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::traits<Derived>::Index Index;
-    typedef typename internal::add_const_on_value_type_if_arithmetic<
-                         typename internal::packet_traits<Scalar>::type
-                     >::type PacketReturnType;
-
-    typedef SparseMatrixBase StorageBaseType;
-    
-    template<typename OtherDerived>
-    Derived& operator=(const EigenBase<OtherDerived> &other)
-    {
-      other.derived().evalTo(derived());
-      return derived();
-    }
-
-    enum {
-
-      RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
-        /**< The number of rows at compile-time. This is just a copy of the value provided
-          * by the \a Derived type. If a value is not known at compile-time,
-          * it is set to the \a Dynamic constant.
-          * \sa MatrixBase::rows(), MatrixBase::cols(), ColsAtCompileTime, SizeAtCompileTime */
-
-      ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
-        /**< The number of columns at compile-time. This is just a copy of the value provided
-          * by the \a Derived type. If a value is not known at compile-time,
-          * it is set to the \a Dynamic constant.
-          * \sa MatrixBase::rows(), MatrixBase::cols(), RowsAtCompileTime, SizeAtCompileTime */
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
+namespace Eigen {
 
-      SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
-                                                   internal::traits<Derived>::ColsAtCompileTime>::ret),
-        /**< This is equal to the number of coefficients, i.e. the number of
-          * rows times the number of columns, or to \a Dynamic if this is not
-          * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
-
-      MaxRowsAtCompileTime = RowsAtCompileTime,
-      MaxColsAtCompileTime = ColsAtCompileTime,
-
-      MaxSizeAtCompileTime = (internal::size_at_compile_time<MaxRowsAtCompileTime,
-                                                      MaxColsAtCompileTime>::ret),
-
-      IsVectorAtCompileTime = RowsAtCompileTime == 1 || ColsAtCompileTime == 1,
-        /**< This is set to true if either the number of rows or the number of
-          * columns is known at compile-time to be equal to 1. Indeed, in that case,
-          * we are dealing with a column-vector (if there is only one column) or with
-          * a row-vector (if there is only one row). */
-
-      Flags = internal::traits<Derived>::Flags,
-        /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
-          * constructed from this one. See the \ref flags "list of flags".
-          */
+/** \ingroup SparseCore_Module
+ *
+ * \class SparseMatrixBase
+ *
+ * \brief Base class of any sparse matrices or sparse expressions
+ *
+ * \tparam Derived is the derived type, e.g. a sparse matrix type, or an expression, etc.
+ *
+ * This class can be extended with the help of the plugin mechanism described on the page
+ * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_SPARSEMATRIXBASE_PLUGIN.
+ */
+template <typename Derived>
+class SparseMatrixBase : public EigenBase<Derived> {
+ public:
+  typedef typename internal::traits<Derived>::Scalar Scalar;
+
+  /** The numeric type of the expression' coefficients, e.g. float, double, int or std::complex<float>, etc.
+   *
+   * It is an alias for the Scalar type */
+  typedef Scalar value_type;
+
+  typedef typename internal::packet_traits<Scalar>::type PacketScalar;
+  typedef typename internal::traits<Derived>::StorageKind StorageKind;
+
+  /** The integer type used to \b store indices within a SparseMatrix.
+   * For a \c SparseMatrix<Scalar,Options,IndexType> it an alias of the third template parameter \c IndexType. */
+  typedef typename internal::traits<Derived>::StorageIndex StorageIndex;
+
+  typedef typename internal::add_const_on_value_type_if_arithmetic<typename internal::packet_traits<Scalar>::type>::type
+      PacketReturnType;
+
+  typedef SparseMatrixBase StorageBaseType;
+
+  typedef Matrix<StorageIndex, Dynamic, 1> IndexVector;
+  typedef Matrix<Scalar, Dynamic, 1> ScalarVector;
+
+  template <typename OtherDerived>
+  Derived& operator=(const EigenBase<OtherDerived>& other);
+
+  enum {
+
+    RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
+    /**< The number of rows at compile-time. This is just a copy of the value provided
+     * by the \a Derived type. If a value is not known at compile-time,
+     * it is set to the \a Dynamic constant.
+     * \sa MatrixBase::rows(), MatrixBase::cols(), ColsAtCompileTime, SizeAtCompileTime */
+
+    ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
+    /**< The number of columns at compile-time. This is just a copy of the value provided
+     * by the \a Derived type. If a value is not known at compile-time,
+     * it is set to the \a Dynamic constant.
+     * \sa MatrixBase::rows(), MatrixBase::cols(), RowsAtCompileTime, SizeAtCompileTime */
+
+    SizeAtCompileTime = (internal::size_of_xpr_at_compile_time<Derived>::ret),
+    /**< This is equal to the number of coefficients, i.e. the number of
+     * rows times the number of columns, or to \a Dynamic if this is not
+     * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
+
+    MaxRowsAtCompileTime = RowsAtCompileTime,
+    MaxColsAtCompileTime = ColsAtCompileTime,
+
+    MaxSizeAtCompileTime = internal::size_at_compile_time(MaxRowsAtCompileTime, MaxColsAtCompileTime),
+
+    IsVectorAtCompileTime = RowsAtCompileTime == 1 || ColsAtCompileTime == 1,
+    /**< This is set to true if either the number of rows or the number of
+     * columns is known at compile-time to be equal to 1. Indeed, in that case,
+     * we are dealing with a column-vector (if there is only one column) or with
+     * a row-vector (if there is only one row). */
+
+    NumDimensions = int(MaxSizeAtCompileTime) == 1 ? 0
+                    : bool(IsVectorAtCompileTime)  ? 1
+                                                   : 2,
+    /**< This value is equal to Tensor::NumDimensions, i.e. 0 for scalars, 1 for vectors,
+     * and 2 for matrices.
+     */
+
+    Flags = internal::traits<Derived>::Flags,
+    /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
+     * constructed from this one. See the \ref flags "list of flags".
+     */
+
+    IsRowMajor = Flags & RowMajorBit ? 1 : 0,
+
+    InnerSizeAtCompileTime = int(IsVectorAtCompileTime) ? int(SizeAtCompileTime)
+                             : int(IsRowMajor)          ? int(ColsAtCompileTime)
+                                                        : int(RowsAtCompileTime),
 
-      CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
-        /**< This is a rough measure of how expensive it is to read one coefficient from
-          * this expression.
-          */
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    HasDirectAccess_ = (int(Flags) & DirectAccessBit) ? 1 : 0  // workaround sunCC
+#endif
+  };
+
+  /** \internal the return type of MatrixBase::adjoint() */
+  typedef std::conditional_t<NumTraits<Scalar>::IsComplex,
+                             CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, Eigen::Transpose<const Derived> >,
+                             Transpose<const Derived> >
+      AdjointReturnType;
+  typedef Transpose<Derived> TransposeReturnType;
+  typedef Transpose<const Derived> ConstTransposeReturnType;
+
+  // FIXME storage order do not match evaluator storage order
+  typedef SparseMatrix<Scalar, Flags & RowMajorBit ? RowMajor : ColMajor, StorageIndex> PlainObject;
+
+  /** This is the "real scalar" type; if the \a Scalar type is already real numbers
+   * (e.g. int, float or double) then \a RealScalar is just the same as \a Scalar. If
+   * \a Scalar is \a std::complex<T> then RealScalar is \a T.
+   *
+   * \sa class NumTraits
+   */
+  typedef typename NumTraits<Scalar>::Real RealScalar;
 
-      IsRowMajor = Flags&RowMajorBit ? 1 : 0,
-      
-      InnerSizeAtCompileTime = int(IsVectorAtCompileTime) ? int(SizeAtCompileTime)
-                             : int(IsRowMajor) ? int(ColsAtCompileTime) : int(RowsAtCompileTime),
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  /** \internal the return type of coeff()
+   */
+  typedef std::conditional_t<HasDirectAccess_, const Scalar&, Scalar> CoeffReturnType;
 
-      #ifndef EIGEN_PARSED_BY_DOXYGEN
-      _HasDirectAccess = (int(Flags)&DirectAccessBit) ? 1 : 0 // workaround sunCC
-      #endif
-    };
+  /** \internal Represents a matrix with all coefficients equal to one another*/
+  typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>, Matrix<Scalar, Dynamic, Dynamic> > ConstantReturnType;
 
-    /** \internal the return type of MatrixBase::adjoint() */
-    typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
-                        CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, Eigen::Transpose<const Derived> >,
-                        Transpose<const Derived>
-                     >::type AdjointReturnType;
+  /** type of the equivalent dense matrix */
+  typedef Matrix<Scalar, RowsAtCompileTime, ColsAtCompileTime> DenseMatrixType;
+  /** type of the equivalent square matrix */
+  typedef Matrix<Scalar, internal::max_size_prefer_dynamic(RowsAtCompileTime, ColsAtCompileTime),
+                 internal::max_size_prefer_dynamic(RowsAtCompileTime, ColsAtCompileTime)>
+      SquareMatrixType;
 
+  inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
+  inline Derived& derived() { return *static_cast<Derived*>(this); }
+  inline Derived& const_cast_derived() const { return *static_cast<Derived*>(const_cast<SparseMatrixBase*>(this)); }
 
-    typedef SparseMatrix<Scalar, Flags&RowMajorBit ? RowMajor : ColMajor, Index> PlainObject;
+  typedef EigenBase<Derived> Base;
 
-
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is the "real scalar" type; if the \a Scalar type is already real numbers
-      * (e.g. int, float or double) then \a RealScalar is just the same as \a Scalar. If
-      * \a Scalar is \a std::complex<T> then RealScalar is \a T.
-      *
-      * \sa class NumTraits
-      */
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-
-    /** \internal the return type of coeff()
-      */
-    typedef typename internal::conditional<_HasDirectAccess, const Scalar&, Scalar>::type CoeffReturnType;
-
-    /** \internal Represents a matrix with all coefficients equal to one another*/
-    typedef CwiseNullaryOp<internal::scalar_constant_op<Scalar>,Matrix<Scalar,Dynamic,Dynamic> > ConstantReturnType;
-
-    /** type of the equivalent square matrix */
-    typedef Matrix<Scalar,EIGEN_SIZE_MAX(RowsAtCompileTime,ColsAtCompileTime),
-                          EIGEN_SIZE_MAX(RowsAtCompileTime,ColsAtCompileTime)> SquareMatrixType;
-
-    inline const Derived& derived() const { return *static_cast<const Derived*>(this); }
-    inline Derived& derived() { return *static_cast<Derived*>(this); }
-    inline Derived& const_cast_derived() const
-    { return *static_cast<Derived*>(const_cast<SparseMatrixBase*>(this)); }
-
-    typedef internal::special_scalar_op_base<Derived, Scalar, RealScalar, EigenBase<Derived> > Base;
-    using Base::operator*;
-#endif // not EIGEN_PARSED_BY_DOXYGEN
+#endif  // not EIGEN_PARSED_BY_DOXYGEN
 
 #define EIGEN_CURRENT_STORAGE_BASE_CLASS Eigen::SparseMatrixBase
-#   include "../plugins/CommonCwiseUnaryOps.h"
-#   include "../plugins/CommonCwiseBinaryOps.h"
-#   include "../plugins/MatrixCwiseUnaryOps.h"
-#   include "../plugins/MatrixCwiseBinaryOps.h"
-#   include "../plugins/BlockMethods.h"
-#   ifdef EIGEN_SPARSEMATRIXBASE_PLUGIN
-#     include EIGEN_SPARSEMATRIXBASE_PLUGIN
-#   endif
-#   undef EIGEN_CURRENT_STORAGE_BASE_CLASS
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+#define EIGEN_DOC_UNARY_ADDONS(METHOD,                                                                               \
+                               OP) /** <p>This method does not change the sparsity of \c *this: the OP is applied to \
+                                      explicitly stored coefficients only. \sa SparseCompressedBase::coeffs() </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL /** <p> \warning This method returns a read-only expression for any   \
+                                                  sparse matrices. \sa \ref TutorialSparse_SubMatrices "Sparse block \
+                                                  operations" </p> */
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(                                                                       \
+    COND) /** <p> \warning This method returns a read-write expression for COND sparse matrices only. Otherwise, the \
+             returned expression is read-only. \sa \ref TutorialSparse_SubMatrices "Sparse block operations" </p> */
+#else
+#define EIGEN_DOC_UNARY_ADDONS(X, Y)
+#define EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#define EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(COND)
+#endif
+#include "../plugins/CommonCwiseUnaryOps.inc"
+#include "../plugins/CommonCwiseBinaryOps.inc"
+#include "../plugins/MatrixCwiseUnaryOps.inc"
+#include "../plugins/MatrixCwiseBinaryOps.inc"
+#include "../plugins/BlockMethods.inc"
+#ifdef EIGEN_SPARSEMATRIXBASE_PLUGIN
+#include EIGEN_SPARSEMATRIXBASE_PLUGIN
+#endif
 #undef EIGEN_CURRENT_STORAGE_BASE_CLASS
-
-    /** \returns the number of rows. \sa cols() */
-    inline Index rows() const { return derived().rows(); }
-    /** \returns the number of columns. \sa rows() */
-    inline Index cols() const { return derived().cols(); }
-    /** \returns the number of coefficients, which is \a rows()*cols().
-      * \sa rows(), cols(). */
-    inline Index size() const { return rows() * cols(); }
-    /** \returns the number of nonzero coefficients which is in practice the number
-      * of stored coefficients. */
-    inline Index nonZeros() const { return derived().nonZeros(); }
-    /** \returns true if either the number of rows or the number of columns is equal to 1.
-      * In other words, this function returns
-      * \code rows()==1 || cols()==1 \endcode
-      * \sa rows(), cols(), IsVectorAtCompileTime. */
-    inline bool isVector() const { return rows()==1 || cols()==1; }
-    /** \returns the size of the storage major dimension,
-      * i.e., the number of columns for a columns major matrix, and the number of rows otherwise */
-    Index outerSize() const { return (int(Flags)&RowMajorBit) ? this->rows() : this->cols(); }
-    /** \returns the size of the inner dimension according to the storage order,
-      * i.e., the number of rows for a columns major matrix, and the number of cols otherwise */
-    Index innerSize() const { return (int(Flags)&RowMajorBit) ? this->cols() : this->rows(); }
-
-    bool isRValue() const { return m_isRValue; }
-    Derived& markAsRValue() { m_isRValue = true; return derived(); }
-
-    SparseMatrixBase() : m_isRValue(false) { /* TODO check flags */ }
-
-    
-    template<typename OtherDerived>
-    Derived& operator=(const ReturnByValue<OtherDerived>& other)
-    {
-      other.evalTo(derived());
-      return derived();
-    }
-
-
-    template<typename OtherDerived>
-    inline Derived& operator=(const SparseMatrixBase<OtherDerived>& other)
-    {
-      return assign(other.derived());
-    }
-
-    inline Derived& operator=(const Derived& other)
-    {
-//       if (other.isRValue())
-//         derived().swap(other.const_cast_derived());
-//       else
-      return assign(other.derived());
-    }
-
-  protected:
-
-    template<typename OtherDerived>
-    inline Derived& assign(const OtherDerived& other)
-    {
-      const bool transpose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
-      const Index outerSize = (int(OtherDerived::Flags) & RowMajorBit) ? other.rows() : other.cols();
-      if ((!transpose) && other.isRValue())
+#undef EIGEN_DOC_UNARY_ADDONS
+#undef EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+#undef EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF
+
+  /** \returns the number of rows. \sa cols() */
+  inline Index rows() const { return derived().rows(); }
+  /** \returns the number of columns. \sa rows() */
+  inline Index cols() const { return derived().cols(); }
+  /** \returns the number of coefficients, which is \a rows()*cols().
+   * \sa rows(), cols(). */
+  inline Index size() const { return rows() * cols(); }
+  /** \returns true if either the number of rows or the number of columns is equal to 1.
+   * In other words, this function returns
+   * \code rows()==1 || cols()==1 \endcode
+   * \sa rows(), cols(), IsVectorAtCompileTime. */
+  inline bool isVector() const { return rows() == 1 || cols() == 1; }
+  /** \returns the size of the storage major dimension,
+   * i.e., the number of columns for a columns major matrix, and the number of rows otherwise */
+  Index outerSize() const { return (int(Flags) & RowMajorBit) ? this->rows() : this->cols(); }
+  /** \returns the size of the inner dimension according to the storage order,
+   * i.e., the number of rows for a columns major matrix, and the number of cols otherwise */
+  Index innerSize() const { return (int(Flags) & RowMajorBit) ? this->cols() : this->rows(); }
+
+  bool isRValue() const { return m_isRValue; }
+  Derived& markAsRValue() {
+    m_isRValue = true;
+    return derived();
+  }
+
+  SparseMatrixBase() : m_isRValue(false) { /* TODO check flags */
+  }
+
+  template <typename OtherDerived>
+  Derived& operator=(const ReturnByValue<OtherDerived>& other);
+
+  template <typename OtherDerived>
+  inline Derived& operator=(const SparseMatrixBase<OtherDerived>& other);
+
+  inline Derived& operator=(const Derived& other);
+
+ protected:
+  template <typename OtherDerived>
+  inline Derived& assign(const OtherDerived& other);
+
+  template <typename OtherDerived>
+  inline void assignGeneric(const OtherDerived& other);
+
+ public:
+#ifndef EIGEN_NO_IO
+  friend std::ostream& operator<<(std::ostream& s, const SparseMatrixBase& m) {
+    using Nested = typename Derived::Nested;
+    using NestedCleaned = typename internal::remove_all<Nested>::type;
+
+    if (Flags & RowMajorBit) {
+      Nested nm(m.derived());
+      internal::evaluator<NestedCleaned> thisEval(nm);
+
+      // compute global width
+      std::size_t width = 0;
       {
-        // eval without temporary
-        derived().resize(other.rows(), other.cols());
-        derived().setZero();
-        derived().reserve((std::max)(this->rows(),this->cols())*2);
-        for (Index j=0; j<outerSize; ++j)
-        {
-          derived().startVec(j);
-          for (typename OtherDerived::InnerIterator it(other, j); it; ++it)
-          {
-            Scalar v = it.value();
-            derived().insertBackByOuterInner(j,it.index()) = v;
+        std::ostringstream ss0;
+        ss0.copyfmt(s);
+        ss0 << Scalar(0);
+        width = ss0.str().size();
+        for (Index row = 0; row < nm.outerSize(); ++row) {
+          for (typename internal::evaluator<NestedCleaned>::InnerIterator it(thisEval, row); it; ++it) {
+            std::ostringstream ss;
+            ss.copyfmt(s);
+            ss << it.value();
+
+            const std::size_t potential_width = ss.str().size();
+            if (potential_width > width) width = potential_width;
           }
         }
-        derived().finalize();
       }
-      else
-      {
-        assignGeneric(other);
-      }
-      return derived();
-    }
 
-    template<typename OtherDerived>
-    inline void assignGeneric(const OtherDerived& other)
-    {
-      //const bool transpose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
-      eigen_assert(( ((internal::traits<Derived>::SupportedAccessPatterns&OuterRandomAccessPattern)==OuterRandomAccessPattern) ||
-                  (!((Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit)))) &&
-                  "the transpose operation is supposed to be handled in SparseMatrix::operator=");
-
-      enum { Flip = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit) };
-
-      const Index outerSize = other.outerSize();
-      //typedef typename internal::conditional<transpose, LinkedVectorMatrix<Scalar,Flags&RowMajorBit>, Derived>::type TempType;
-      // thanks to shallow copies, we always eval to a tempary
-      Derived temp(other.rows(), other.cols());
-
-      temp.reserve((std::max)(this->rows(),this->cols())*2);
-      for (Index j=0; j<outerSize; ++j)
-      {
-        temp.startVec(j);
-        for (typename OtherDerived::InnerIterator it(other.derived(), j); it; ++it)
-        {
-          Scalar v = it.value();
-          temp.insertBackByOuterInner(Flip?it.index():j,Flip?j:it.index()) = v;
+      for (Index row = 0; row < nm.outerSize(); ++row) {
+        Index col = 0;
+        for (typename internal::evaluator<NestedCleaned>::InnerIterator it(thisEval, row); it; ++it) {
+          for (; col < it.index(); ++col) {
+            s.width(width);
+            s << Scalar(0) << " ";
+          }
+          s.width(width);
+          s << it.value() << " ";
+          ++col;
         }
+        for (; col < m.cols(); ++col) {
+          s.width(width);
+          s << Scalar(0) << " ";
+        }
+        s << std::endl;
       }
-      temp.finalize();
-
-      derived() = temp.markAsRValue();
-    }
-
-  public:
-
-    template<typename Lhs, typename Rhs>
-    inline Derived& operator=(const SparseSparseProduct<Lhs,Rhs>& product);
-
-    friend std::ostream & operator << (std::ostream & s, const SparseMatrixBase& m)
-    {
-      typedef typename Derived::Nested Nested;
-      typedef typename internal::remove_all<Nested>::type NestedCleaned;
-
-      if (Flags&RowMajorBit)
-      {
-        const Nested nm(m.derived());
-        for (Index row=0; row<nm.outerSize(); ++row)
+    } else {
+      Nested nm(m.derived());
+      internal::evaluator<NestedCleaned> thisEval(nm);
+      if (m.cols() == 1) {
+        // compute local width (single col)
+        std::size_t width = 0;
         {
-          Index col = 0;
-          for (typename NestedCleaned::InnerIterator it(nm.derived(), row); it; ++it)
-          {
-            for ( ; col<it.index(); ++col)
-              s << "0 ";
-            s << it.value() << " ";
-            ++col;
+          std::ostringstream ss0;
+          ss0.copyfmt(s);
+          ss0 << Scalar(0);
+          width = ss0.str().size();
+          for (typename internal::evaluator<NestedCleaned>::InnerIterator it(thisEval, 0); it; ++it) {
+            std::ostringstream ss;
+            ss.copyfmt(s);
+            ss << it.value();
+
+            const std::size_t potential_width = ss.str().size();
+            if (potential_width > width) width = potential_width;
           }
-          for ( ; col<m.cols(); ++col)
-            s << "0 ";
-          s << std::endl;
         }
-      }
-      else
-      {
-        const Nested nm(m.derived());
-        if (m.cols() == 1) {
-          Index row = 0;
-          for (typename NestedCleaned::InnerIterator it(nm.derived(), 0); it; ++it)
-          {
-            for ( ; row<it.index(); ++row)
-              s << "0" << std::endl;
-            s << it.value() << std::endl;
-            ++row;
+
+        Index row = 0;
+        for (typename internal::evaluator<NestedCleaned>::InnerIterator it(thisEval, 0); it; ++it) {
+          for (; row < it.index(); ++row) {
+            s.width(width);
+            s << Scalar(0) << std::endl;
           }
-          for ( ; row<m.rows(); ++row)
-            s << "0" << std::endl;
+          s.width(width);
+          s << it.value() << std::endl;
+          ++row;
         }
-        else
-        {
-          SparseMatrix<Scalar, RowMajorBit, Index> trans = m;
-          s << static_cast<const SparseMatrixBase<SparseMatrix<Scalar, RowMajorBit, Index> >&>(trans);
+        for (; row < m.rows(); ++row) {
+          s.width(width);
+          s << Scalar(0) << std::endl;
         }
+      } else {
+        SparseMatrix<Scalar, RowMajorBit, StorageIndex> trans = m;
+        s << static_cast<const SparseMatrixBase<SparseMatrix<Scalar, RowMajorBit, StorageIndex> >&>(trans);
       }
-      return s;
-    }
-
-    template<typename OtherDerived>
-    Derived& operator+=(const SparseMatrixBase<OtherDerived>& other);
-    template<typename OtherDerived>
-    Derived& operator-=(const SparseMatrixBase<OtherDerived>& other);
-
-    Derived& operator*=(const Scalar& other);
-    Derived& operator/=(const Scalar& other);
-
-    template<typename OtherDerived> struct CwiseProductDenseReturnType {
-      typedef CwiseBinaryOp<internal::scalar_product_op<typename internal::scalar_product_traits<
-                                                          typename internal::traits<Derived>::Scalar,
-                                                          typename internal::traits<OtherDerived>::Scalar
-                                                        >::ReturnType>,
-                            const Derived,
-                            const OtherDerived
-                          > Type;
-    };
-
-    template<typename OtherDerived>
-    EIGEN_STRONG_INLINE const typename CwiseProductDenseReturnType<OtherDerived>::Type
-    cwiseProduct(const MatrixBase<OtherDerived> &other) const;
-
-    // sparse * sparse
-    template<typename OtherDerived>
-    const typename SparseSparseProductReturnType<Derived,OtherDerived>::Type
-    operator*(const SparseMatrixBase<OtherDerived> &other) const;
-
-    // sparse * diagonal
-    template<typename OtherDerived>
-    const SparseDiagonalProduct<Derived,OtherDerived>
-    operator*(const DiagonalBase<OtherDerived> &other) const;
-
-    // diagonal * sparse
-    template<typename OtherDerived> friend
-    const SparseDiagonalProduct<OtherDerived,Derived>
-    operator*(const DiagonalBase<OtherDerived> &lhs, const SparseMatrixBase& rhs)
-    { return SparseDiagonalProduct<OtherDerived,Derived>(lhs.derived(), rhs.derived()); }
-
-    /** dense * sparse (return a dense object unless it is an outer product) */
-    template<typename OtherDerived> friend
-    const typename DenseSparseProductReturnType<OtherDerived,Derived>::Type
-    operator*(const MatrixBase<OtherDerived>& lhs, const Derived& rhs)
-    { return typename DenseSparseProductReturnType<OtherDerived,Derived>::Type(lhs.derived(),rhs); }
-
-    /** sparse * dense (returns a dense object unless it is an outer product) */
-    template<typename OtherDerived>
-    const typename SparseDenseProductReturnType<Derived,OtherDerived>::Type
-    operator*(const MatrixBase<OtherDerived> &other) const
-    { return typename SparseDenseProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived()); }
-    
-     /** \returns an expression of P H P^-1 where H is the matrix represented by \c *this */
-    SparseSymmetricPermutationProduct<Derived,Upper|Lower> twistedBy(const PermutationMatrix<Dynamic,Dynamic,Index>& perm) const
-    {
-      return SparseSymmetricPermutationProduct<Derived,Upper|Lower>(derived(), perm);
-    }
-
-    template<typename OtherDerived>
-    Derived& operator*=(const SparseMatrixBase<OtherDerived>& other);
-
-    #ifdef EIGEN2_SUPPORT
-    // deprecated
-    template<typename OtherDerived>
-    typename internal::plain_matrix_type_column_major<OtherDerived>::type
-    solveTriangular(const MatrixBase<OtherDerived>& other) const;
-
-    // deprecated
-    template<typename OtherDerived>
-    void solveTriangularInPlace(MatrixBase<OtherDerived>& other) const;
-    #endif // EIGEN2_SUPPORT
-
-    template<int Mode>
-    inline const SparseTriangularView<Derived, Mode> triangularView() const;
-
-    template<unsigned int UpLo> inline const SparseSelfAdjointView<Derived, UpLo> selfadjointView() const;
-    template<unsigned int UpLo> inline SparseSelfAdjointView<Derived, UpLo> selfadjointView();
-
-    template<typename OtherDerived> Scalar dot(const MatrixBase<OtherDerived>& other) const;
-    template<typename OtherDerived> Scalar dot(const SparseMatrixBase<OtherDerived>& other) const;
-    RealScalar squaredNorm() const;
-    RealScalar norm()  const;
-    RealScalar blueNorm() const;
-
-    Transpose<Derived> transpose() { return derived(); }
-    const Transpose<const Derived> transpose() const { return derived(); }
-    const AdjointReturnType adjoint() const { return transpose(); }
-
-    // inner-vector
-    typedef Block<Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true>       InnerVectorReturnType;
-    typedef Block<const Derived,IsRowMajor?1:Dynamic,IsRowMajor?Dynamic:1,true> ConstInnerVectorReturnType;
-    InnerVectorReturnType innerVector(Index outer);
-    const ConstInnerVectorReturnType innerVector(Index outer) const;
-
-    // set of inner-vectors
-    typedef Block<Derived,Dynamic,Dynamic,true> InnerVectorsReturnType;
-    typedef Block<const Derived,Dynamic,Dynamic,true> ConstInnerVectorsReturnType;
-    InnerVectorsReturnType innerVectors(Index outerStart, Index outerSize);
-    const ConstInnerVectorsReturnType innerVectors(Index outerStart, Index outerSize) const;
-
-    /** \internal use operator= */
-    template<typename DenseDerived>
-    void evalTo(MatrixBase<DenseDerived>& dst) const
-    {
-      dst.setZero();
-      for (Index j=0; j<outerSize(); ++j)
-        for (typename Derived::InnerIterator i(derived(),j); i; ++i)
-          dst.coeffRef(i.row(),i.col()) = i.value();
     }
-
-    Matrix<Scalar,RowsAtCompileTime,ColsAtCompileTime> toDense() const
-    {
-      return derived();
-    }
-
-    template<typename OtherDerived>
-    bool isApprox(const SparseMatrixBase<OtherDerived>& other,
-                  const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
-    { return toDense().isApprox(other.toDense(),prec); }
-
-    template<typename OtherDerived>
-    bool isApprox(const MatrixBase<OtherDerived>& other,
-                  const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const
-    { return toDense().isApprox(other,prec); }
-
-    /** \returns the matrix or vector obtained by evaluating this expression.
-      *
-      * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
-      * a const reference, in order to avoid a useless copy.
-      */
-    inline const typename internal::eval<Derived>::type eval() const
-    { return typename internal::eval<Derived>::type(derived()); }
-
-    Scalar sum() const;
-
-  protected:
-
-    bool m_isRValue;
+    return s;
+  }
+#endif
+
+  template <typename OtherDerived>
+  Derived& operator+=(const SparseMatrixBase<OtherDerived>& other);
+  template <typename OtherDerived>
+  Derived& operator-=(const SparseMatrixBase<OtherDerived>& other);
+
+  template <typename OtherDerived>
+  Derived& operator+=(const DiagonalBase<OtherDerived>& other);
+  template <typename OtherDerived>
+  Derived& operator-=(const DiagonalBase<OtherDerived>& other);
+
+  template <typename OtherDerived>
+  Derived& operator+=(const EigenBase<OtherDerived>& other);
+  template <typename OtherDerived>
+  Derived& operator-=(const EigenBase<OtherDerived>& other);
+
+  Derived& operator*=(const Scalar& other);
+  Derived& operator/=(const Scalar& other);
+
+  template <typename OtherDerived>
+  struct CwiseProductDenseReturnType {
+    typedef CwiseBinaryOp<
+        internal::scalar_product_op<typename ScalarBinaryOpTraits<
+            typename internal::traits<Derived>::Scalar, typename internal::traits<OtherDerived>::Scalar>::ReturnType>,
+        const Derived, const OtherDerived>
+        Type;
+  };
+
+  template <typename OtherDerived>
+  EIGEN_STRONG_INLINE const typename CwiseProductDenseReturnType<OtherDerived>::Type cwiseProduct(
+      const MatrixBase<OtherDerived>& other) const;
+
+  // sparse * diagonal
+  template <typename OtherDerived>
+  const Product<Derived, OtherDerived> operator*(const DiagonalBase<OtherDerived>& other) const {
+    return Product<Derived, OtherDerived>(derived(), other.derived());
+  }
+
+  // diagonal * sparse
+  template <typename OtherDerived>
+  friend const Product<OtherDerived, Derived> operator*(const DiagonalBase<OtherDerived>& lhs,
+                                                        const SparseMatrixBase& rhs) {
+    return Product<OtherDerived, Derived>(lhs.derived(), rhs.derived());
+  }
+
+  // sparse * sparse
+  template <typename OtherDerived>
+  const Product<Derived, OtherDerived, AliasFreeProduct> operator*(const SparseMatrixBase<OtherDerived>& other) const;
+
+  // sparse * dense
+  template <typename OtherDerived>
+  const Product<Derived, OtherDerived> operator*(const MatrixBase<OtherDerived>& other) const {
+    return Product<Derived, OtherDerived>(derived(), other.derived());
+  }
+
+  // dense * sparse
+  template <typename OtherDerived>
+  friend const Product<OtherDerived, Derived> operator*(const MatrixBase<OtherDerived>& lhs,
+                                                        const SparseMatrixBase& rhs) {
+    return Product<OtherDerived, Derived>(lhs.derived(), rhs.derived());
+  }
+
+  /** \returns an expression of P H P^-1 where H is the matrix represented by \c *this */
+  SparseSymmetricPermutationProduct<Derived, Upper | Lower> twistedBy(
+      const PermutationMatrix<Dynamic, Dynamic, StorageIndex>& perm) const {
+    return SparseSymmetricPermutationProduct<Derived, Upper | Lower>(derived(), perm);
+  }
+
+  template <typename OtherDerived>
+  Derived& operator*=(const SparseMatrixBase<OtherDerived>& other);
+
+  template <int Mode>
+  inline const TriangularView<const Derived, Mode> triangularView() const;
+
+  template <unsigned int UpLo>
+  struct SelfAdjointViewReturnType {
+    typedef SparseSelfAdjointView<Derived, UpLo> Type;
+  };
+  template <unsigned int UpLo>
+  struct ConstSelfAdjointViewReturnType {
+    typedef const SparseSelfAdjointView<const Derived, UpLo> Type;
+  };
+
+  template <unsigned int UpLo>
+  inline typename ConstSelfAdjointViewReturnType<UpLo>::Type selfadjointView() const;
+  template <unsigned int UpLo>
+  inline typename SelfAdjointViewReturnType<UpLo>::Type selfadjointView();
+
+  template <typename OtherDerived>
+  Scalar dot(const MatrixBase<OtherDerived>& other) const;
+  template <typename OtherDerived>
+  Scalar dot(const SparseMatrixBase<OtherDerived>& other) const;
+  RealScalar squaredNorm() const;
+  RealScalar norm() const;
+  RealScalar blueNorm() const;
+
+  TransposeReturnType transpose() { return TransposeReturnType(derived()); }
+  const ConstTransposeReturnType transpose() const { return ConstTransposeReturnType(derived()); }
+  const AdjointReturnType adjoint() const { return AdjointReturnType(transpose()); }
+
+  DenseMatrixType toDense() const { return DenseMatrixType(derived()); }
+
+  template <typename OtherDerived>
+  bool isApprox(const SparseMatrixBase<OtherDerived>& other,
+                const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const;
+
+  template <typename OtherDerived>
+  bool isApprox(const MatrixBase<OtherDerived>& other,
+                const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const {
+    return toDense().isApprox(other, prec);
+  }
+
+  /** \returns the matrix or vector obtained by evaluating this expression.
+   *
+   * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
+   * a const reference, in order to avoid a useless copy.
+   */
+  inline const typename internal::eval<Derived>::type eval() const {
+    return typename internal::eval<Derived>::type(derived());
+  }
+
+  Scalar sum() const;
+
+  inline const SparseView<Derived> pruned(const Scalar& reference = Scalar(0),
+                                          const RealScalar& epsilon = NumTraits<Scalar>::dummy_precision()) const;
+
+ protected:
+  bool m_isRValue;
+
+  static inline StorageIndex convert_index(const Index idx) { return internal::convert_index<StorageIndex>(idx); }
+
+ private:
+  template <typename Dest>
+  void evalTo(Dest&) const;
 };
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSEMATRIXBASE_H
+#endif  // EIGEN_SPARSEMATRIXBASE_H
diff --git a/inst/include/Eigen/src/SparseCore/SparsePermutation.h b/inst/include/Eigen/src/SparseCore/SparsePermutation.h
index 75e21000..56f572d3 100644
--- a/inst/include/Eigen/src/SparseCore/SparsePermutation.h
+++ b/inst/include/Eigen/src/SparseCore/SparsePermutation.h
@@ -12,137 +12,238 @@
 
 // This file implements sparse * permutation products
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
-struct traits<permut_sparsematrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
-{
-  typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-  typedef typename MatrixTypeNestedCleaned::Scalar Scalar;
-  typedef typename MatrixTypeNestedCleaned::Index Index;
-  enum {
-    SrcStorageOrder = MatrixTypeNestedCleaned::Flags&RowMajorBit ? RowMajor : ColMajor,
-    MoveOuter = SrcStorageOrder==RowMajor ? Side==OnTheLeft : Side==OnTheRight
-  };
-
-  typedef typename internal::conditional<MoveOuter,
-        SparseMatrix<Scalar,SrcStorageOrder,Index>,
-        SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,Index> >::type ReturnType;
+template <typename ExpressionType, typename PlainObjectType,
+          bool NeedEval = !is_same<ExpressionType, PlainObjectType>::value>
+struct XprHelper {
+  XprHelper(const ExpressionType& xpr) : m_xpr(xpr) {}
+  inline const PlainObjectType& xpr() const { return m_xpr; }
+  // this is a new PlainObjectType initialized by xpr
+  const PlainObjectType m_xpr;
+};
+template <typename ExpressionType, typename PlainObjectType>
+struct XprHelper<ExpressionType, PlainObjectType, false> {
+  XprHelper(const ExpressionType& xpr) : m_xpr(xpr) {}
+  inline const PlainObjectType& xpr() const { return m_xpr; }
+  // this is a reference to xpr
+  const PlainObjectType& m_xpr;
+};
+
+template <typename PermDerived, bool NeedInverseEval>
+struct PermHelper {
+  using IndicesType = typename PermDerived::IndicesType;
+  using PermutationIndex = typename IndicesType::Scalar;
+  using type = PermutationMatrix<IndicesType::SizeAtCompileTime, IndicesType::MaxSizeAtCompileTime, PermutationIndex>;
+  PermHelper(const PermDerived& perm) : m_perm(perm.inverse()) {}
+  inline const type& perm() const { return m_perm; }
+  // this is a new PermutationMatrix initialized by perm.inverse()
+  const type m_perm;
 };
+template <typename PermDerived>
+struct PermHelper<PermDerived, false> {
+  using type = PermDerived;
+  PermHelper(const PermDerived& perm) : m_perm(perm) {}
+  inline const type& perm() const { return m_perm; }
+  // this is a reference to perm
+  const type& m_perm;
+};
+
+template <typename ExpressionType, int Side, bool Transposed>
+struct permutation_matrix_product<ExpressionType, Side, Transposed, SparseShape> {
+  using MatrixType = typename nested_eval<ExpressionType, 1>::type;
+  using MatrixTypeCleaned = remove_all_t<MatrixType>;
+
+  using Scalar = typename MatrixTypeCleaned::Scalar;
+  using StorageIndex = typename MatrixTypeCleaned::StorageIndex;
+
+  // the actual "return type" is `Dest`. this is a temporary type
+  using ReturnType = SparseMatrix<Scalar, MatrixTypeCleaned::IsRowMajor ? RowMajor : ColMajor, StorageIndex>;
+  using TmpHelper = XprHelper<ExpressionType, ReturnType>;
+
+  static constexpr bool NeedOuterPermutation = ExpressionType::IsRowMajor ? Side == OnTheLeft : Side == OnTheRight;
+  static constexpr bool NeedInversePermutation = Transposed ? Side == OnTheLeft : Side == OnTheRight;
+
+  template <typename Dest, typename PermutationType>
+  static inline void permute_outer(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) {
+    // if ExpressionType is not ReturnType, evaluate `xpr` (allocation)
+    // otherwise, just reference `xpr`
+    // TODO: handle trivial expressions such as CwiseBinaryOp without temporary
+    const TmpHelper tmpHelper(xpr);
+    const ReturnType& tmp = tmpHelper.xpr();
+
+    ReturnType result(tmp.rows(), tmp.cols());
+
+    for (Index j = 0; j < tmp.outerSize(); j++) {
+      Index jp = perm.indices().coeff(j);
+      Index jsrc = NeedInversePermutation ? jp : j;
+      Index jdst = NeedInversePermutation ? j : jp;
+      Index begin = tmp.outerIndexPtr()[jsrc];
+      Index end = tmp.isCompressed() ? tmp.outerIndexPtr()[jsrc + 1] : begin + tmp.innerNonZeroPtr()[jsrc];
+      result.outerIndexPtr()[jdst + 1] += end - begin;
+    }
 
-template<typename PermutationType, typename MatrixType, int Side, bool Transposed>
-struct permut_sparsematrix_product_retval
- : public ReturnByValue<permut_sparsematrix_product_retval<PermutationType, MatrixType, Side, Transposed> >
-{
-    typedef typename remove_all<typename MatrixType::Nested>::type MatrixTypeNestedCleaned;
-    typedef typename MatrixTypeNestedCleaned::Scalar Scalar;
-    typedef typename MatrixTypeNestedCleaned::Index Index;
-
-    enum {
-      SrcStorageOrder = MatrixTypeNestedCleaned::Flags&RowMajorBit ? RowMajor : ColMajor,
-      MoveOuter = SrcStorageOrder==RowMajor ? Side==OnTheLeft : Side==OnTheRight
-    };
-
-    permut_sparsematrix_product_retval(const PermutationType& perm, const MatrixType& matrix)
-      : m_permutation(perm), m_matrix(matrix)
-    {}
-
-    inline int rows() const { return m_matrix.rows(); }
-    inline int cols() const { return m_matrix.cols(); }
-
-    template<typename Dest> inline void evalTo(Dest& dst) const
-    {
-      if(MoveOuter)
-      {
-        SparseMatrix<Scalar,SrcStorageOrder,Index> tmp(m_matrix.rows(), m_matrix.cols());
-        Matrix<Index,Dynamic,1> sizes(m_matrix.outerSize());
-        for(Index j=0; j<m_matrix.outerSize(); ++j)
-        {
-          Index jp = m_permutation.indices().coeff(j);
-          sizes[((Side==OnTheLeft) ^ Transposed) ? jp : j] = m_matrix.innerVector(((Side==OnTheRight) ^ Transposed) ? jp : j).nonZeros();
-        }
-        tmp.reserve(sizes);
-        for(Index j=0; j<m_matrix.outerSize(); ++j)
-        {
-          Index jp = m_permutation.indices().coeff(j);
-          Index jsrc = ((Side==OnTheRight) ^ Transposed) ? jp : j;
-          Index jdst = ((Side==OnTheLeft) ^ Transposed) ? jp : j;
-          for(typename MatrixTypeNestedCleaned::InnerIterator it(m_matrix,jsrc); it; ++it)
-            tmp.insertByOuterInner(jdst,it.index()) = it.value();
-        }
-        dst = tmp;
-      }
-      else
-      {
-        SparseMatrix<Scalar,int(SrcStorageOrder)==RowMajor?ColMajor:RowMajor,Index> tmp(m_matrix.rows(), m_matrix.cols());
-        Matrix<Index,Dynamic,1> sizes(tmp.outerSize());
-        sizes.setZero();
-        PermutationMatrix<Dynamic,Dynamic,Index> perm;
-        if((Side==OnTheLeft) ^ Transposed)
-          perm = m_permutation;
-        else
-          perm = m_permutation.transpose();
-
-        for(Index j=0; j<m_matrix.outerSize(); ++j)
-          for(typename MatrixTypeNestedCleaned::InnerIterator it(m_matrix,j); it; ++it)
-            sizes[perm.indices().coeff(it.index())]++;
-        tmp.reserve(sizes);
-        for(Index j=0; j<m_matrix.outerSize(); ++j)
-          for(typename MatrixTypeNestedCleaned::InnerIterator it(m_matrix,j); it; ++it)
-            tmp.insertByOuterInner(perm.indices().coeff(it.index()),j) = it.value();
-        dst = tmp;
-      }
+    std::partial_sum(result.outerIndexPtr(), result.outerIndexPtr() + result.outerSize() + 1, result.outerIndexPtr());
+    result.resizeNonZeros(result.nonZeros());
+
+    for (Index j = 0; j < tmp.outerSize(); j++) {
+      Index jp = perm.indices().coeff(j);
+      Index jsrc = NeedInversePermutation ? jp : j;
+      Index jdst = NeedInversePermutation ? j : jp;
+      Index begin = tmp.outerIndexPtr()[jsrc];
+      Index end = tmp.isCompressed() ? tmp.outerIndexPtr()[jsrc + 1] : begin + tmp.innerNonZeroPtr()[jsrc];
+      Index target = result.outerIndexPtr()[jdst];
+      smart_copy(tmp.innerIndexPtr() + begin, tmp.innerIndexPtr() + end, result.innerIndexPtr() + target);
+      smart_copy(tmp.valuePtr() + begin, tmp.valuePtr() + end, result.valuePtr() + target);
     }
+    dst = std::move(result);
+  }
+
+  template <typename Dest, typename PermutationType>
+  static inline void permute_inner(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) {
+    using InnerPermHelper = PermHelper<PermutationType, NeedInversePermutation>;
+    using InnerPermType = typename InnerPermHelper::type;
+
+    // if ExpressionType is not ReturnType, evaluate `xpr` (allocation)
+    // otherwise, just reference `xpr`
+    // TODO: handle trivial expressions such as CwiseBinaryOp without temporary
+    const TmpHelper tmpHelper(xpr);
+    const ReturnType& tmp = tmpHelper.xpr();
+
+    // if inverse permutation of inner indices is requested, calculate perm.inverse() (allocation)
+    // otherwise, just reference `perm`
+    const InnerPermHelper permHelper(perm);
+    const InnerPermType& innerPerm = permHelper.perm();
+
+    ReturnType result(tmp.rows(), tmp.cols());
+
+    for (Index j = 0; j < tmp.outerSize(); j++) {
+      Index begin = tmp.outerIndexPtr()[j];
+      Index end = tmp.isCompressed() ? tmp.outerIndexPtr()[j + 1] : begin + tmp.innerNonZeroPtr()[j];
+      result.outerIndexPtr()[j + 1] += end - begin;
+    }
+
+    std::partial_sum(result.outerIndexPtr(), result.outerIndexPtr() + result.outerSize() + 1, result.outerIndexPtr());
+    result.resizeNonZeros(result.nonZeros());
 
-  protected:
-    const PermutationType& m_permutation;
-    typename MatrixType::Nested m_matrix;
+    for (Index j = 0; j < tmp.outerSize(); j++) {
+      Index begin = tmp.outerIndexPtr()[j];
+      Index end = tmp.isCompressed() ? tmp.outerIndexPtr()[j + 1] : begin + tmp.innerNonZeroPtr()[j];
+      Index target = result.outerIndexPtr()[j];
+      std::transform(tmp.innerIndexPtr() + begin, tmp.innerIndexPtr() + end, result.innerIndexPtr() + target,
+                     [&innerPerm](StorageIndex i) { return innerPerm.indices().coeff(i); });
+      smart_copy(tmp.valuePtr() + begin, tmp.valuePtr() + end, result.valuePtr() + target);
+    }
+    // the inner indices were permuted, and must be sorted
+    result.sortInnerIndices();
+    dst = std::move(result);
+  }
+
+  template <typename Dest, typename PermutationType, bool DoOuter = NeedOuterPermutation,
+            std::enable_if_t<DoOuter, int> = 0>
+  static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) {
+    permute_outer(dst, perm, xpr);
+  }
+
+  template <typename Dest, typename PermutationType, bool DoOuter = NeedOuterPermutation,
+            std::enable_if_t<!DoOuter, int> = 0>
+  static inline void run(Dest& dst, const PermutationType& perm, const ExpressionType& xpr) {
+    permute_inner(dst, perm, xpr);
+  }
 };
 
-}
+}  // namespace internal
+
+namespace internal {
+
+template <int ProductTag>
+struct product_promote_storage_type<Sparse, PermutationStorage, ProductTag> {
+  typedef Sparse ret;
+};
+template <int ProductTag>
+struct product_promote_storage_type<PermutationStorage, Sparse, ProductTag> {
+  typedef Sparse ret;
+};
+
+// TODO, the following two overloads are only needed to define the right temporary type through
+// typename traits<permutation_sparse_matrix_product<Rhs,Lhs,OnTheRight,false> >::ReturnType
+// whereas it should be correctly handled by traits<Product<> >::PlainObject
+
+template <typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, AliasFreeProduct>, ProductTag, PermutationShape, SparseShape>
+    : public evaluator<typename permutation_matrix_product<Rhs, OnTheLeft, false, SparseShape>::ReturnType> {
+  typedef Product<Lhs, Rhs, AliasFreeProduct> XprType;
+  typedef typename permutation_matrix_product<Rhs, OnTheLeft, false, SparseShape>::ReturnType PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  enum { Flags = Base::Flags | EvalBeforeNestingBit };
+
+  explicit product_evaluator(const XprType& xpr) : m_result(xpr.rows(), xpr.cols()) {
+    internal::construct_at<Base>(this, m_result);
+    generic_product_impl<Lhs, Rhs, PermutationShape, SparseShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs());
+  }
+
+ protected:
+  PlainObject m_result;
+};
+
+template <typename Lhs, typename Rhs, int ProductTag>
+struct product_evaluator<Product<Lhs, Rhs, AliasFreeProduct>, ProductTag, SparseShape, PermutationShape>
+    : public evaluator<typename permutation_matrix_product<Lhs, OnTheRight, false, SparseShape>::ReturnType> {
+  typedef Product<Lhs, Rhs, AliasFreeProduct> XprType;
+  typedef typename permutation_matrix_product<Lhs, OnTheRight, false, SparseShape>::ReturnType PlainObject;
+  typedef evaluator<PlainObject> Base;
 
+  enum { Flags = Base::Flags | EvalBeforeNestingBit };
 
+  explicit product_evaluator(const XprType& xpr) : m_result(xpr.rows(), xpr.cols()) {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    generic_product_impl<Lhs, Rhs, SparseShape, PermutationShape, ProductTag>::evalTo(m_result, xpr.lhs(), xpr.rhs());
+  }
+
+ protected:
+  PlainObject m_result;
+};
+
+}  // end namespace internal
 
 /** \returns the matrix with the permutation applied to the columns
-  */
-template<typename SparseDerived, typename PermDerived>
-inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, false>
-operator*(const SparseMatrixBase<SparseDerived>& matrix, const PermutationBase<PermDerived>& perm)
-{
-  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, false>(perm, matrix.derived());
+ */
+template <typename SparseDerived, typename PermDerived>
+inline const Product<SparseDerived, PermDerived, AliasFreeProduct> operator*(
+    const SparseMatrixBase<SparseDerived>& matrix, const PermutationBase<PermDerived>& perm) {
+  return Product<SparseDerived, PermDerived, AliasFreeProduct>(matrix.derived(), perm.derived());
 }
 
 /** \returns the matrix with the permutation applied to the rows
-  */
-template<typename SparseDerived, typename PermDerived>
-inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, false>
-operator*( const PermutationBase<PermDerived>& perm, const SparseMatrixBase<SparseDerived>& matrix)
-{
-  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, false>(perm, matrix.derived());
+ */
+template <typename SparseDerived, typename PermDerived>
+inline const Product<PermDerived, SparseDerived, AliasFreeProduct> operator*(
+    const PermutationBase<PermDerived>& perm, const SparseMatrixBase<SparseDerived>& matrix) {
+  return Product<PermDerived, SparseDerived, AliasFreeProduct>(perm.derived(), matrix.derived());
 }
 
-
-
 /** \returns the matrix with the inverse permutation applied to the columns.
-  */
-template<typename SparseDerived, typename PermDerived>
-inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, true>
-operator*(const SparseMatrixBase<SparseDerived>& matrix, const Transpose<PermutationBase<PermDerived> >& tperm)
-{
-  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheRight, true>(tperm.nestedPermutation(), matrix.derived());
+ */
+template <typename SparseDerived, typename PermutationType>
+inline const Product<SparseDerived, Inverse<PermutationType>, AliasFreeProduct> operator*(
+    const SparseMatrixBase<SparseDerived>& matrix, const InverseImpl<PermutationType, PermutationStorage>& tperm) {
+  return Product<SparseDerived, Inverse<PermutationType>, AliasFreeProduct>(matrix.derived(), tperm.derived());
 }
 
 /** \returns the matrix with the inverse permutation applied to the rows.
-  */
-template<typename SparseDerived, typename PermDerived>
-inline const internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, true>
-operator*(const Transpose<PermutationBase<PermDerived> >& tperm, const SparseMatrixBase<SparseDerived>& matrix)
-{
-  return internal::permut_sparsematrix_product_retval<PermutationBase<PermDerived>, SparseDerived, OnTheLeft, true>(tperm.nestedPermutation(), matrix.derived());
+ */
+template <typename SparseDerived, typename PermutationType>
+inline const Product<Inverse<PermutationType>, SparseDerived, AliasFreeProduct> operator*(
+    const InverseImpl<PermutationType, PermutationStorage>& tperm, const SparseMatrixBase<SparseDerived>& matrix) {
+  return Product<Inverse<PermutationType>, SparseDerived, AliasFreeProduct>(tperm.derived(), matrix.derived());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSE_SELFADJOINTVIEW_H
+#endif  // EIGEN_SPARSE_SELFADJOINTVIEW_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseProduct.h b/inst/include/Eigen/src/SparseCore/SparseProduct.h
index cf766307..249dabc1 100644
--- a/inst/include/Eigen/src/SparseCore/SparseProduct.h
+++ b/inst/include/Eigen/src/SparseCore/SparseProduct.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,179 +10,169 @@
 #ifndef EIGEN_SPARSEPRODUCT_H
 #define EIGEN_SPARSEPRODUCT_H
 
-namespace Eigen { 
-
-template<typename Lhs, typename Rhs>
-struct SparseSparseProductReturnType
-{
-  typedef typename internal::traits<Lhs>::Scalar Scalar;
-  typedef typename internal::traits<Lhs>::Index Index;
-  enum {
-    LhsRowMajor = internal::traits<Lhs>::Flags & RowMajorBit,
-    RhsRowMajor = internal::traits<Rhs>::Flags & RowMajorBit,
-    TransposeRhs = (!LhsRowMajor) && RhsRowMajor,
-    TransposeLhs = LhsRowMajor && (!RhsRowMajor)
-  };
-
-  typedef typename internal::conditional<TransposeLhs,
-    SparseMatrix<Scalar,0,Index>,
-    typename internal::nested<Lhs,Rhs::RowsAtCompileTime>::type>::type LhsNested;
-
-  typedef typename internal::conditional<TransposeRhs,
-    SparseMatrix<Scalar,0,Index>,
-    typename internal::nested<Rhs,Lhs::RowsAtCompileTime>::type>::type RhsNested;
-
-  typedef SparseSparseProduct<LhsNested, RhsNested> Type;
-};
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \returns an expression of the product of two sparse matrices.
+ * By default a conservative product preserving the symbolic non zeros is performed.
+ * The automatic pruning of the small values can be achieved by calling the pruned() function
+ * in which case a totally different product algorithm is employed:
+ * \code
+ * C = (A*B).pruned();             // suppress numerical zeros (exact)
+ * C = (A*B).pruned(ref);
+ * C = (A*B).pruned(ref,epsilon);
+ * \endcode
+ * where \c ref is a meaningful non zero reference value.
+ * */
+template <typename Derived>
+template <typename OtherDerived>
+inline const Product<Derived, OtherDerived, AliasFreeProduct> SparseMatrixBase<Derived>::operator*(
+    const SparseMatrixBase<OtherDerived>& other) const {
+  return Product<Derived, OtherDerived, AliasFreeProduct>(derived(), other.derived());
+}
 
 namespace internal {
-template<typename LhsNested, typename RhsNested>
-struct traits<SparseSparseProduct<LhsNested, RhsNested> >
-{
-  typedef MatrixXpr XprKind;
-  // clean the nested types:
-  typedef typename remove_all<LhsNested>::type _LhsNested;
-  typedef typename remove_all<RhsNested>::type _RhsNested;
-  typedef typename _LhsNested::Scalar Scalar;
-  typedef typename promote_index_type<typename traits<_LhsNested>::Index,
-                                         typename traits<_RhsNested>::Index>::type Index;
-
-  enum {
-    LhsCoeffReadCost = _LhsNested::CoeffReadCost,
-    RhsCoeffReadCost = _RhsNested::CoeffReadCost,
-    LhsFlags = _LhsNested::Flags,
-    RhsFlags = _RhsNested::Flags,
-
-    RowsAtCompileTime    = _LhsNested::RowsAtCompileTime,
-    ColsAtCompileTime    = _RhsNested::ColsAtCompileTime,
-    MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
-
-    InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),
-
-    EvalToRowMajor = (RhsFlags & LhsFlags & RowMajorBit),
-
-    RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit),
-
-    Flags = (int(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
-          | EvalBeforeAssigningBit
-          | EvalBeforeNestingBit,
-
-    CoeffReadCost = Dynamic
-  };
-
-  typedef Sparse StorageKind;
+
+// sparse * sparse
+template <typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseShape, SparseShape, ProductType> {
+  template <typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs) {
+    evalTo(dst, lhs, rhs, typename evaluator_traits<Dest>::Shape());
+  }
+
+  // dense += sparse * sparse
+  template <typename Dest, typename ActualLhs>
+  static void addTo(Dest& dst, const ActualLhs& lhs, const Rhs& rhs,
+                    std::enable_if_t<is_same<typename evaluator_traits<Dest>::Shape, DenseShape>::value, int*>* = 0) {
+    typedef typename nested_eval<ActualLhs, Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs, Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhs);
+    RhsNested rhsNested(rhs);
+    internal::sparse_sparse_to_dense_product_selector<remove_all_t<LhsNested>, remove_all_t<RhsNested>, Dest>::run(
+        lhsNested, rhsNested, dst);
+  }
+
+  // dense -= sparse * sparse
+  template <typename Dest>
+  static void subTo(Dest& dst, const Lhs& lhs, const Rhs& rhs,
+                    std::enable_if_t<is_same<typename evaluator_traits<Dest>::Shape, DenseShape>::value, int*>* = 0) {
+    addTo(dst, -lhs, rhs);
+  }
+
+ protected:
+  // sparse = sparse * sparse
+  template <typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, SparseShape) {
+    typedef typename nested_eval<Lhs, Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs, Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhs);
+    RhsNested rhsNested(rhs);
+    internal::conservative_sparse_sparse_product_selector<remove_all_t<LhsNested>, remove_all_t<RhsNested>, Dest>::run(
+        lhsNested, rhsNested, dst);
+  }
+
+  // dense = sparse * sparse
+  template <typename Dest>
+  static void evalTo(Dest& dst, const Lhs& lhs, const Rhs& rhs, DenseShape) {
+    dst.setZero();
+    addTo(dst, lhs, rhs);
+  }
 };
 
-} // end namespace internal
-
-template<typename LhsNested, typename RhsNested>
-class SparseSparseProduct : internal::no_assignment_operator,
-  public SparseMatrixBase<SparseSparseProduct<LhsNested, RhsNested> >
-{
-  public:
-
-    typedef SparseMatrixBase<SparseSparseProduct> Base;
-    EIGEN_DENSE_PUBLIC_INTERFACE(SparseSparseProduct)
-
-  private:
-
-    typedef typename internal::traits<SparseSparseProduct>::_LhsNested _LhsNested;
-    typedef typename internal::traits<SparseSparseProduct>::_RhsNested _RhsNested;
-
-  public:
-
-    template<typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE SparseSparseProduct(const Lhs& lhs, const Rhs& rhs)
-      : m_lhs(lhs), m_rhs(rhs), m_tolerance(0), m_conservative(true)
-    {
-      init();
-    }
-
-    template<typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE SparseSparseProduct(const Lhs& lhs, const Rhs& rhs, const RealScalar& tolerance)
-      : m_lhs(lhs), m_rhs(rhs), m_tolerance(tolerance), m_conservative(false)
-    {
-      init();
-    }
-
-    SparseSparseProduct pruned(const Scalar& reference = 0, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision()) const
-    {
-      using std::abs;
-      return SparseSparseProduct(m_lhs,m_rhs,abs(reference)*epsilon);
-    }
-
-    template<typename Dest>
-    void evalTo(Dest& result) const
-    {
-      if(m_conservative)
-        internal::conservative_sparse_sparse_product_selector<_LhsNested, _RhsNested, Dest>::run(lhs(),rhs(),result);
-      else
-        internal::sparse_sparse_product_with_pruning_selector<_LhsNested, _RhsNested, Dest>::run(lhs(),rhs(),result,m_tolerance);
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); }
-
-    EIGEN_STRONG_INLINE const _LhsNested& lhs() const { return m_lhs; }
-    EIGEN_STRONG_INLINE const _RhsNested& rhs() const { return m_rhs; }
-
-  protected:
-    void init()
-    {
-      eigen_assert(m_lhs.cols() == m_rhs.rows());
-
-      enum {
-        ProductIsValid = _LhsNested::ColsAtCompileTime==Dynamic
-                      || _RhsNested::RowsAtCompileTime==Dynamic
-                      || int(_LhsNested::ColsAtCompileTime)==int(_RhsNested::RowsAtCompileTime),
-        AreVectors = _LhsNested::IsVectorAtCompileTime && _RhsNested::IsVectorAtCompileTime,
-        SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(_LhsNested,_RhsNested)
-      };
-      // note to the lost user:
-      //    * for a dot product use: v1.dot(v2)
-      //    * for a coeff-wise product use: v1.cwise()*v2
-      EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes),
-        INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
-      EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors),
-        INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
-      EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
-    }
-
-    LhsNested m_lhs;
-    RhsNested m_rhs;
-    RealScalar m_tolerance;
-    bool m_conservative;
+// sparse * sparse-triangular
+template <typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseShape, SparseTriangularShape, ProductType>
+    : public generic_product_impl<Lhs, Rhs, SparseShape, SparseShape, ProductType> {};
+
+// sparse-triangular * sparse
+template <typename Lhs, typename Rhs, int ProductType>
+struct generic_product_impl<Lhs, Rhs, SparseTriangularShape, SparseShape, ProductType>
+    : public generic_product_impl<Lhs, Rhs, SparseShape, SparseShape, ProductType> {};
+
+// dense = sparse-product (can be sparse*sparse, sparse*perm, etc.)
+template <typename DstXprType, typename Lhs, typename Rhs>
+struct Assignment<
+    DstXprType, Product<Lhs, Rhs, AliasFreeProduct>,
+    internal::assign_op<typename DstXprType::Scalar, typename Product<Lhs, Rhs, AliasFreeProduct>::Scalar>,
+    Sparse2Dense> {
+  typedef Product<Lhs, Rhs, AliasFreeProduct> SrcXprType;
+  static void run(DstXprType& dst, const SrcXprType& src,
+                  const internal::assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>&) {
+    Index dstRows = src.rows();
+    Index dstCols = src.cols();
+    if ((dst.rows() != dstRows) || (dst.cols() != dstCols)) dst.resize(dstRows, dstCols);
+
+    generic_product_impl<Lhs, Rhs>::evalTo(dst, src.lhs(), src.rhs());
+  }
 };
 
-// sparse = sparse * sparse
-template<typename Derived>
-template<typename Lhs, typename Rhs>
-inline Derived& SparseMatrixBase<Derived>::operator=(const SparseSparseProduct<Lhs,Rhs>& product)
-{
-  product.evalTo(derived());
-  return derived();
-}
+// dense += sparse-product (can be sparse*sparse, sparse*perm, etc.)
+template <typename DstXprType, typename Lhs, typename Rhs>
+struct Assignment<
+    DstXprType, Product<Lhs, Rhs, AliasFreeProduct>,
+    internal::add_assign_op<typename DstXprType::Scalar, typename Product<Lhs, Rhs, AliasFreeProduct>::Scalar>,
+    Sparse2Dense> {
+  typedef Product<Lhs, Rhs, AliasFreeProduct> SrcXprType;
+  static void run(DstXprType& dst, const SrcXprType& src,
+                  const internal::add_assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>&) {
+    generic_product_impl<Lhs, Rhs>::addTo(dst, src.lhs(), src.rhs());
+  }
+};
 
-/** \returns an expression of the product of two sparse matrices.
-  * By default a conservative product preserving the symbolic non zeros is performed.
-  * The automatic pruning of the small values can be achieved by calling the pruned() function
-  * in which case a totally different product algorithm is employed:
-  * \code
-  * C = (A*B).pruned();             // supress numerical zeros (exact)
-  * C = (A*B).pruned(ref);
-  * C = (A*B).pruned(ref,epsilon);
-  * \endcode
-  * where \c ref is a meaningful non zero reference value.
-  * */
-template<typename Derived>
-template<typename OtherDerived>
-inline const typename SparseSparseProductReturnType<Derived,OtherDerived>::Type
-SparseMatrixBase<Derived>::operator*(const SparseMatrixBase<OtherDerived> &other) const
-{
-  return typename SparseSparseProductReturnType<Derived,OtherDerived>::Type(derived(), other.derived());
+// dense -= sparse-product (can be sparse*sparse, sparse*perm, etc.)
+template <typename DstXprType, typename Lhs, typename Rhs>
+struct Assignment<
+    DstXprType, Product<Lhs, Rhs, AliasFreeProduct>,
+    internal::sub_assign_op<typename DstXprType::Scalar, typename Product<Lhs, Rhs, AliasFreeProduct>::Scalar>,
+    Sparse2Dense> {
+  typedef Product<Lhs, Rhs, AliasFreeProduct> SrcXprType;
+  static void run(DstXprType& dst, const SrcXprType& src,
+                  const internal::sub_assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>&) {
+    generic_product_impl<Lhs, Rhs>::subTo(dst, src.lhs(), src.rhs());
+  }
+};
+
+template <typename Lhs, typename Rhs, int Options>
+struct unary_evaluator<SparseView<Product<Lhs, Rhs, Options> >, IteratorBased>
+    : public evaluator<typename Product<Lhs, Rhs, DefaultProduct>::PlainObject> {
+  typedef SparseView<Product<Lhs, Rhs, Options> > XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  explicit unary_evaluator(const XprType& xpr) : m_result(xpr.rows(), xpr.cols()) {
+    using std::abs;
+    internal::construct_at<Base>(this, m_result);
+    typedef typename nested_eval<Lhs, Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs, Dynamic>::type RhsNested;
+    LhsNested lhsNested(xpr.nestedExpression().lhs());
+    RhsNested rhsNested(xpr.nestedExpression().rhs());
+
+    internal::sparse_sparse_product_with_pruning_selector<remove_all_t<LhsNested>, remove_all_t<RhsNested>,
+                                                          PlainObject>::run(lhsNested, rhsNested, m_result,
+                                                                            abs(xpr.reference()) * xpr.epsilon());
+  }
+
+ protected:
+  PlainObject m_result;
+};
+
+}  // end namespace internal
+
+// sparse matrix = sparse-product (can be sparse*sparse, sparse*perm, etc.)
+template <typename Scalar, int Options_, typename StorageIndex_>
+template <typename Lhs, typename Rhs>
+SparseMatrix<Scalar, Options_, StorageIndex_>& SparseMatrix<Scalar, Options_, StorageIndex_>::operator=(
+    const Product<Lhs, Rhs, AliasFreeProduct>& src) {
+  // std::cout << "in Assignment : " << DstOptions << "\n";
+  SparseMatrix dst(src.rows(), src.cols());
+  internal::generic_product_impl<Lhs, Rhs>::evalTo(dst, src.lhs(), src.rhs());
+  this->swap(dst);
+  return *this;
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSEPRODUCT_H
+#endif  // EIGEN_SPARSEPRODUCT_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseRedux.h b/inst/include/Eigen/src/SparseCore/SparseRedux.h
index f3da93a7..732e4f77 100644
--- a/inst/include/Eigen/src/SparseCore/SparseRedux.h
+++ b/inst/include/Eigen/src/SparseCore/SparseRedux.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,36 +10,38 @@
 #ifndef EIGEN_SPARSEREDUX_H
 #define EIGEN_SPARSEREDUX_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-template<typename Derived>
-typename internal::traits<Derived>::Scalar
-SparseMatrixBase<Derived>::sum() const
-{
-  eigen_assert(rows()>0 && cols()>0 && "you are using a non initialized matrix");
+namespace Eigen {
+
+template <typename Derived>
+typename internal::traits<Derived>::Scalar SparseMatrixBase<Derived>::sum() const {
+  eigen_assert(rows() > 0 && cols() > 0 && "you are using a non initialized matrix");
   Scalar res(0);
-  for (Index j=0; j<outerSize(); ++j)
-    for (typename Derived::InnerIterator iter(derived(),j); iter; ++iter)
-      res += iter.value();
+  internal::evaluator<Derived> thisEval(derived());
+  for (Index j = 0; j < outerSize(); ++j)
+    for (typename internal::evaluator<Derived>::InnerIterator iter(thisEval, j); iter; ++iter) res += iter.value();
   return res;
 }
 
-template<typename _Scalar, int _Options, typename _Index>
-typename internal::traits<SparseMatrix<_Scalar,_Options,_Index> >::Scalar
-SparseMatrix<_Scalar,_Options,_Index>::sum() const
-{
-  eigen_assert(rows()>0 && cols()>0 && "you are using a non initialized matrix");
-  return Matrix<Scalar,1,Dynamic>::Map(&m_data.value(0), m_data.size()).sum();
+template <typename Scalar_, int Options_, typename Index_>
+typename internal::traits<SparseMatrix<Scalar_, Options_, Index_> >::Scalar
+SparseMatrix<Scalar_, Options_, Index_>::sum() const {
+  eigen_assert(rows() > 0 && cols() > 0 && "you are using a non initialized matrix");
+  if (this->isCompressed())
+    return Matrix<Scalar, 1, Dynamic>::Map(m_data.valuePtr(), m_data.size()).sum();
+  else
+    return Base::sum();
 }
 
-template<typename _Scalar, int _Options, typename _Index>
-typename internal::traits<SparseVector<_Scalar,_Options, _Index> >::Scalar
-SparseVector<_Scalar,_Options,_Index>::sum() const
-{
-  eigen_assert(rows()>0 && cols()>0 && "you are using a non initialized matrix");
-  return Matrix<Scalar,1,Dynamic>::Map(&m_data.value(0), m_data.size()).sum();
+template <typename Scalar_, int Options_, typename Index_>
+typename internal::traits<SparseVector<Scalar_, Options_, Index_> >::Scalar
+SparseVector<Scalar_, Options_, Index_>::sum() const {
+  eigen_assert(rows() > 0 && cols() > 0 && "you are using a non initialized matrix");
+  return Matrix<Scalar, 1, Dynamic>::Map(m_data.valuePtr(), m_data.size()).sum();
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSEREDUX_H
+#endif  // EIGEN_SPARSEREDUX_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseRef.h b/inst/include/Eigen/src/SparseCore/SparseRef.h
new file mode 100644
index 00000000..c205e6dd
--- /dev/null
+++ b/inst/include/Eigen/src/SparseCore/SparseRef.h
@@ -0,0 +1,370 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSE_REF_H
+#define EIGEN_SPARSE_REF_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+enum {
+  StandardCompressedFormat =
+      2 /**< used by Ref<SparseMatrix> to specify whether the input storage must be in standard compressed form */
+};
+
+namespace internal {
+
+template <typename Derived>
+class SparseRefBase;
+
+template <typename MatScalar, int MatOptions, typename MatIndex, int Options_, typename StrideType_>
+struct traits<Ref<SparseMatrix<MatScalar, MatOptions, MatIndex>, Options_, StrideType_>>
+    : public traits<SparseMatrix<MatScalar, MatOptions, MatIndex>> {
+  typedef SparseMatrix<MatScalar, MatOptions, MatIndex> PlainObjectType;
+  enum { Options = Options_, Flags = traits<PlainObjectType>::Flags | CompressedAccessBit | NestByRefBit };
+
+  template <typename Derived>
+  struct match {
+    enum {
+      StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime ||
+                          ((PlainObjectType::Flags & RowMajorBit) == (Derived::Flags & RowMajorBit)),
+      MatchAtCompileTime = (Derived::Flags & CompressedAccessBit) && StorageOrderMatch
+    };
+    typedef std::conditional_t<MatchAtCompileTime, internal::true_type, internal::false_type> type;
+  };
+};
+
+template <typename MatScalar, int MatOptions, typename MatIndex, int Options_, typename StrideType_>
+struct traits<Ref<const SparseMatrix<MatScalar, MatOptions, MatIndex>, Options_, StrideType_>>
+    : public traits<Ref<SparseMatrix<MatScalar, MatOptions, MatIndex>, Options_, StrideType_>> {
+  enum {
+    Flags =
+        (traits<SparseMatrix<MatScalar, MatOptions, MatIndex>>::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit
+  };
+};
+
+template <typename MatScalar, int MatOptions, typename MatIndex, int Options_, typename StrideType_>
+struct traits<Ref<SparseVector<MatScalar, MatOptions, MatIndex>, Options_, StrideType_>>
+    : public traits<SparseVector<MatScalar, MatOptions, MatIndex>> {
+  typedef SparseVector<MatScalar, MatOptions, MatIndex> PlainObjectType;
+  enum { Options = Options_, Flags = traits<PlainObjectType>::Flags | CompressedAccessBit | NestByRefBit };
+
+  template <typename Derived>
+  struct match {
+    enum { MatchAtCompileTime = (Derived::Flags & CompressedAccessBit) && Derived::IsVectorAtCompileTime };
+    typedef std::conditional_t<MatchAtCompileTime, internal::true_type, internal::false_type> type;
+  };
+};
+
+template <typename MatScalar, int MatOptions, typename MatIndex, int Options_, typename StrideType_>
+struct traits<Ref<const SparseVector<MatScalar, MatOptions, MatIndex>, Options_, StrideType_>>
+    : public traits<Ref<SparseVector<MatScalar, MatOptions, MatIndex>, Options_, StrideType_>> {
+  enum {
+    Flags =
+        (traits<SparseVector<MatScalar, MatOptions, MatIndex>>::Flags | CompressedAccessBit | NestByRefBit) & ~LvalueBit
+  };
+};
+
+template <typename Derived>
+struct traits<SparseRefBase<Derived>> : public traits<Derived> {};
+
+template <typename Derived>
+class SparseRefBase : public SparseMapBase<Derived> {
+ public:
+  typedef SparseMapBase<Derived> Base;
+  EIGEN_SPARSE_PUBLIC_INTERFACE(SparseRefBase)
+
+  SparseRefBase()
+      : Base(RowsAtCompileTime == Dynamic ? 0 : RowsAtCompileTime, ColsAtCompileTime == Dynamic ? 0 : ColsAtCompileTime,
+             0, 0, 0, 0, 0) {}
+
+ protected:
+  template <typename Expression>
+  void construct(Expression& expr) {
+    if (expr.outerIndexPtr() == 0)
+      internal::construct_at<Base>(this, expr.size(), expr.nonZeros(), expr.innerIndexPtr(), expr.valuePtr());
+    else
+      internal::construct_at<Base>(this, expr.rows(), expr.cols(), expr.nonZeros(), expr.outerIndexPtr(),
+                                   expr.innerIndexPtr(), expr.valuePtr(), expr.innerNonZeroPtr());
+  }
+};
+
+}  // namespace internal
+
+/**
+ * \ingroup SparseCore_Module
+ *
+ * \brief A sparse matrix expression referencing an existing sparse expression
+ *
+ * \tparam SparseMatrixType the equivalent sparse matrix type of the referenced data, it must be a template instance of
+ * class SparseMatrix. \tparam Options specifies whether the a standard compressed format is required \c Options is  \c
+ * #StandardCompressedFormat, or \c 0. The default is \c 0.
+ *
+ * \sa class Ref
+ */
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template <typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Ref<SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType>
+    : public internal::SparseRefBase<Ref<SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType>>
+#else
+template <typename SparseMatrixType, int Options>
+class Ref<SparseMatrixType, Options>
+    : public SparseMapBase<Derived, WriteAccessors>  // yes, that's weird to use Derived here, but that works!
+#endif
+{
+  typedef SparseMatrix<MatScalar, MatOptions, MatIndex> PlainObjectType;
+  typedef internal::traits<Ref> Traits;
+  template <int OtherOptions>
+  inline Ref(const SparseMatrix<MatScalar, OtherOptions, MatIndex>& expr);
+  template <int OtherOptions>
+  inline Ref(const Map<SparseMatrix<MatScalar, OtherOptions, MatIndex>>& expr);
+
+ public:
+  typedef internal::SparseRefBase<Ref> Base;
+  EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  template <int OtherOptions>
+  inline Ref(SparseMatrix<MatScalar, OtherOptions, MatIndex>& expr) {
+    EIGEN_STATIC_ASSERT(
+        bool(Traits::template match<SparseMatrix<MatScalar, OtherOptions, MatIndex>>::MatchAtCompileTime),
+        STORAGE_LAYOUT_DOES_NOT_MATCH);
+    eigen_assert(((Options & int(StandardCompressedFormat)) == 0) || (expr.isCompressed()));
+    Base::construct(expr.derived());
+  }
+
+  template <int OtherOptions>
+  inline Ref(Map<SparseMatrix<MatScalar, OtherOptions, MatIndex>>& expr) {
+    EIGEN_STATIC_ASSERT(
+        bool(Traits::template match<SparseMatrix<MatScalar, OtherOptions, MatIndex>>::MatchAtCompileTime),
+        STORAGE_LAYOUT_DOES_NOT_MATCH);
+    eigen_assert(((Options & int(StandardCompressedFormat)) == 0) || (expr.isCompressed()));
+    Base::construct(expr.derived());
+  }
+
+  template <typename Derived>
+  inline Ref(const SparseCompressedBase<Derived>& expr)
+#else
+  /** Implicit constructor from any sparse expression (2D matrix or 1D vector) */
+  template <typename Derived>
+  inline Ref(SparseCompressedBase<Derived>& expr)
+#endif
+  {
+    EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+    EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+    eigen_assert(((Options & int(StandardCompressedFormat)) == 0) || (expr.isCompressed()));
+    Base::construct(expr.const_cast_derived());
+  }
+};
+
+// this is the const ref version
+template <typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Ref<const SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType>
+    : public internal::SparseRefBase<Ref<const SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType>> {
+  typedef SparseMatrix<MatScalar, MatOptions, MatIndex> TPlainObjectType;
+  typedef internal::traits<Ref> Traits;
+
+ public:
+  typedef internal::SparseRefBase<Ref> Base;
+  EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
+
+  template <typename Derived>
+  inline Ref(const SparseMatrixBase<Derived>& expr) : m_hasCopy(false) {
+    construct(expr.derived(), typename Traits::template match<Derived>::type());
+  }
+
+  inline Ref(const Ref& other) : Base(other), m_hasCopy(false) {
+    // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
+  }
+
+  template <typename OtherRef>
+  inline Ref(const RefBase<OtherRef>& other) : m_hasCopy(false) {
+    construct(other.derived(), typename Traits::template match<OtherRef>::type());
+  }
+
+  ~Ref() {
+    if (m_hasCopy) {
+      internal::destroy_at(reinterpret_cast<TPlainObjectType*>(&m_storage));
+    }
+  }
+
+ protected:
+  template <typename Expression>
+  void construct(const Expression& expr, internal::true_type) {
+    if ((Options & int(StandardCompressedFormat)) && (!expr.isCompressed())) {
+      TPlainObjectType* obj = internal::construct_at(reinterpret_cast<TPlainObjectType*>(&m_storage), expr);
+      m_hasCopy = true;
+      Base::construct(*obj);
+    } else {
+      Base::construct(expr);
+    }
+  }
+
+  template <typename Expression>
+  void construct(const Expression& expr, internal::false_type) {
+    TPlainObjectType* obj = internal::construct_at(reinterpret_cast<TPlainObjectType*>(&m_storage), expr);
+    m_hasCopy = true;
+    Base::construct(*obj);
+  }
+
+ protected:
+  typename internal::aligned_storage<sizeof(TPlainObjectType), EIGEN_ALIGNOF(TPlainObjectType)>::type m_storage;
+  bool m_hasCopy;
+};
+
+/**
+ * \ingroup SparseCore_Module
+ *
+ * \brief A sparse vector expression referencing an existing sparse vector expression
+ *
+ * \tparam SparseVectorType the equivalent sparse vector type of the referenced data, it must be a template instance of
+ * class SparseVector.
+ *
+ * \sa class Ref
+ */
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template <typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Ref<SparseVector<MatScalar, MatOptions, MatIndex>, Options, StrideType>
+    : public internal::SparseRefBase<Ref<SparseVector<MatScalar, MatOptions, MatIndex>, Options, StrideType>>
+#else
+template <typename SparseVectorType>
+class Ref<SparseVectorType> : public SparseMapBase<Derived, WriteAccessors>
+#endif
+{
+  typedef SparseVector<MatScalar, MatOptions, MatIndex> PlainObjectType;
+  typedef internal::traits<Ref> Traits;
+  template <int OtherOptions>
+  inline Ref(const SparseVector<MatScalar, OtherOptions, MatIndex>& expr);
+
+ public:
+  typedef internal::SparseRefBase<Ref> Base;
+  EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  template <int OtherOptions>
+  inline Ref(SparseVector<MatScalar, OtherOptions, MatIndex>& expr) {
+    EIGEN_STATIC_ASSERT(
+        bool(Traits::template match<SparseVector<MatScalar, OtherOptions, MatIndex>>::MatchAtCompileTime),
+        STORAGE_LAYOUT_DOES_NOT_MATCH);
+    Base::construct(expr.derived());
+  }
+
+  template <typename Derived>
+  inline Ref(const SparseCompressedBase<Derived>& expr)
+#else
+  /** Implicit constructor from any 1D sparse vector expression */
+  template <typename Derived>
+  inline Ref(SparseCompressedBase<Derived>& expr)
+#endif
+  {
+    EIGEN_STATIC_ASSERT(bool(internal::is_lvalue<Derived>::value), THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY);
+    EIGEN_STATIC_ASSERT(bool(Traits::template match<Derived>::MatchAtCompileTime), STORAGE_LAYOUT_DOES_NOT_MATCH);
+    Base::construct(expr.const_cast_derived());
+  }
+};
+
+// this is the const ref version
+template <typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+class Ref<const SparseVector<MatScalar, MatOptions, MatIndex>, Options, StrideType>
+    : public internal::SparseRefBase<Ref<const SparseVector<MatScalar, MatOptions, MatIndex>, Options, StrideType>> {
+  typedef SparseVector<MatScalar, MatOptions, MatIndex> TPlainObjectType;
+  typedef internal::traits<Ref> Traits;
+
+ public:
+  typedef internal::SparseRefBase<Ref> Base;
+  EIGEN_SPARSE_PUBLIC_INTERFACE(Ref)
+
+  template <typename Derived>
+  inline Ref(const SparseMatrixBase<Derived>& expr) : m_hasCopy(false) {
+    construct(expr.derived(), typename Traits::template match<Derived>::type());
+  }
+
+  inline Ref(const Ref& other) : Base(other), m_hasCopy(false) {
+    // copy constructor shall not copy the m_object, to avoid unnecessary malloc and copy
+  }
+
+  template <typename OtherRef>
+  inline Ref(const RefBase<OtherRef>& other) : m_hasCopy(false) {
+    construct(other.derived(), typename Traits::template match<OtherRef>::type());
+  }
+
+  ~Ref() {
+    if (m_hasCopy) {
+      internal::destroy_at(reinterpret_cast<TPlainObjectType*>(&m_storage));
+    }
+  }
+
+ protected:
+  template <typename Expression>
+  void construct(const Expression& expr, internal::true_type) {
+    Base::construct(expr);
+  }
+
+  template <typename Expression>
+  void construct(const Expression& expr, internal::false_type) {
+    TPlainObjectType* obj = internal::construct_at(reinterpret_cast<TPlainObjectType*>(&m_storage), expr);
+    m_hasCopy = true;
+    Base::construct(*obj);
+  }
+
+ protected:
+  typename internal::aligned_storage<sizeof(TPlainObjectType), EIGEN_ALIGNOF(TPlainObjectType)>::type m_storage;
+  bool m_hasCopy;
+};
+
+namespace internal {
+
+// FIXME shall we introduce a general evaluatior_ref that we can specialize for any sparse object once, and thus remove
+// this copy-pasta thing...
+
+template <typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Ref<SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType>>
+    : evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType>>> {
+  typedef evaluator<SparseCompressedBase<Ref<SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType>>> Base;
+  typedef Ref<SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType> XprType;
+  evaluator() : Base() {}
+  explicit evaluator(const XprType& mat) : Base(mat) {}
+};
+
+template <typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Ref<const SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType>>
+    : evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType>>> {
+  typedef evaluator<SparseCompressedBase<Ref<const SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType>>>
+      Base;
+  typedef Ref<const SparseMatrix<MatScalar, MatOptions, MatIndex>, Options, StrideType> XprType;
+  evaluator() : Base() {}
+  explicit evaluator(const XprType& mat) : Base(mat) {}
+};
+
+template <typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Ref<SparseVector<MatScalar, MatOptions, MatIndex>, Options, StrideType>>
+    : evaluator<SparseCompressedBase<Ref<SparseVector<MatScalar, MatOptions, MatIndex>, Options, StrideType>>> {
+  typedef evaluator<SparseCompressedBase<Ref<SparseVector<MatScalar, MatOptions, MatIndex>, Options, StrideType>>> Base;
+  typedef Ref<SparseVector<MatScalar, MatOptions, MatIndex>, Options, StrideType> XprType;
+  evaluator() : Base() {}
+  explicit evaluator(const XprType& mat) : Base(mat) {}
+};
+
+template <typename MatScalar, int MatOptions, typename MatIndex, int Options, typename StrideType>
+struct evaluator<Ref<const SparseVector<MatScalar, MatOptions, MatIndex>, Options, StrideType>>
+    : evaluator<SparseCompressedBase<Ref<const SparseVector<MatScalar, MatOptions, MatIndex>, Options, StrideType>>> {
+  typedef evaluator<SparseCompressedBase<Ref<const SparseVector<MatScalar, MatOptions, MatIndex>, Options, StrideType>>>
+      Base;
+  typedef Ref<const SparseVector<MatScalar, MatOptions, MatIndex>, Options, StrideType> XprType;
+  evaluator() : Base() {}
+  explicit evaluator(const XprType& mat) : Base(mat) {}
+};
+
+}  // namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPARSE_REF_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseSelfAdjointView.h b/inst/include/Eigen/src/SparseCore/SparseSelfAdjointView.h
index 0eda96bc..05b3de56 100644
--- a/inst/include/Eigen/src/SparseCore/SparseSelfAdjointView.h
+++ b/inst/include/Eigen/src/SparseCore/SparseSelfAdjointView.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,498 +10,604 @@
 #ifndef EIGEN_SPARSE_SELFADJOINTVIEW_H
 #define EIGEN_SPARSE_SELFADJOINTVIEW_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-/** \ingroup SparseCore_Module
-  * \class SparseSelfAdjointView
-  *
-  * \brief Pseudo expression to manipulate a triangular sparse matrix as a selfadjoint matrix.
-  *
-  * \param MatrixType the type of the dense matrix storing the coefficients
-  * \param UpLo can be either \c #Lower or \c #Upper
-  *
-  * This class is an expression of a sefladjoint matrix from a triangular part of a matrix
-  * with given dense storage of the coefficients. It is the return type of MatrixBase::selfadjointView()
-  * and most of the time this is the only way that it is used.
-  *
-  * \sa SparseMatrixBase::selfadjointView()
-  */
-template<typename Lhs, typename Rhs, int UpLo>
-class SparseSelfAdjointTimeDenseProduct;
-
-template<typename Lhs, typename Rhs, int UpLo>
-class DenseTimeSparseSelfAdjointProduct;
+namespace Eigen {
 
+/** \ingroup SparseCore_Module
+ * \class SparseSelfAdjointView
+ *
+ * \brief Pseudo expression to manipulate a triangular sparse matrix as a selfadjoint matrix.
+ *
+ * \param MatrixType the type of the dense matrix storing the coefficients
+ * \param Mode can be either \c #Lower or \c #Upper
+ *
+ * This class is an expression of a sefladjoint matrix from a triangular part of a matrix
+ * with given dense storage of the coefficients. It is the return type of MatrixBase::selfadjointView()
+ * and most of the time this is the only way that it is used.
+ *
+ * \sa SparseMatrixBase::selfadjointView()
+ */
 namespace internal {
-  
-template<typename MatrixType, unsigned int UpLo>
-struct traits<SparseSelfAdjointView<MatrixType,UpLo> > : traits<MatrixType> {
-};
 
-template<int SrcUpLo,int DstUpLo,typename MatrixType,int DestOrder>
-void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm = 0);
+template <typename MatrixType, unsigned int Mode>
+struct traits<SparseSelfAdjointView<MatrixType, Mode> > : traits<MatrixType> {};
 
-template<int UpLo,typename MatrixType,int DestOrder>
-void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm = 0);
+template <int SrcMode, int DstMode, bool NonHermitian, typename MatrixType, int DestOrder>
+void permute_symm_to_symm(
+    const MatrixType& mat,
+    SparseMatrix<typename MatrixType::Scalar, DestOrder, typename MatrixType::StorageIndex>& _dest,
+    const typename MatrixType::StorageIndex* perm = 0);
 
-}
+template <int Mode, bool NonHermitian, typename MatrixType, int DestOrder>
+void permute_symm_to_fullsymm(
+    const MatrixType& mat,
+    SparseMatrix<typename MatrixType::Scalar, DestOrder, typename MatrixType::StorageIndex>& _dest,
+    const typename MatrixType::StorageIndex* perm = 0);
 
-template<typename MatrixType, unsigned int UpLo> class SparseSelfAdjointView
-  : public EigenBase<SparseSelfAdjointView<MatrixType,UpLo> >
-{
-  public:
+}  // namespace internal
 
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
-    typedef Matrix<Index,Dynamic,1> VectorI;
-    typedef typename MatrixType::Nested MatrixTypeNested;
-    typedef typename internal::remove_all<MatrixTypeNested>::type _MatrixTypeNested;
+template <typename MatrixType, unsigned int Mode_>
+class SparseSelfAdjointView : public EigenBase<SparseSelfAdjointView<MatrixType, Mode_> > {
+ public:
+  enum {
+    Mode = Mode_,
+    TransposeMode = ((int(Mode) & int(Upper)) ? Lower : 0) | ((int(Mode) & int(Lower)) ? Upper : 0),
+    RowsAtCompileTime = internal::traits<SparseSelfAdjointView>::RowsAtCompileTime,
+    ColsAtCompileTime = internal::traits<SparseSelfAdjointView>::ColsAtCompileTime
+  };
 
-    inline SparseSelfAdjointView(const MatrixType& matrix) : m_matrix(matrix)
-    {
-      eigen_assert(rows()==cols() && "SelfAdjointView is only for squared matrices");
-    }
+  typedef EigenBase<SparseSelfAdjointView> Base;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef Matrix<StorageIndex, Dynamic, 1> VectorI;
+  typedef typename internal::ref_selector<MatrixType>::non_const_type MatrixTypeNested;
+  typedef internal::remove_all_t<MatrixTypeNested> MatrixTypeNested_;
 
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-
-    /** \internal \returns a reference to the nested matrix */
-    const _MatrixTypeNested& matrix() const { return m_matrix; }
-    _MatrixTypeNested& matrix() { return m_matrix.const_cast_derived(); }
-
-    /** \returns an expression of the matrix product between a sparse self-adjoint matrix \c *this and a sparse matrix \a rhs.
-      *
-      * Note that there is no algorithmic advantage of performing such a product compared to a general sparse-sparse matrix product.
-      * Indeed, the SparseSelfadjointView operand is first copied into a temporary SparseMatrix before computing the product.
-      */
-    template<typename OtherDerived>
-    SparseSparseProduct<typename OtherDerived::PlainObject, OtherDerived>
-    operator*(const SparseMatrixBase<OtherDerived>& rhs) const
-    {
-      return SparseSparseProduct<typename OtherDerived::PlainObject, OtherDerived>(*this, rhs.derived());
-    }
+  explicit inline SparseSelfAdjointView(MatrixType& matrix) : m_matrix(matrix) {
+    eigen_assert(rows() == cols() && "SelfAdjointView is only for squared matrices");
+  }
 
-    /** \returns an expression of the matrix product between a sparse matrix \a lhs and a sparse self-adjoint matrix \a rhs.
-      *
-      * Note that there is no algorithmic advantage of performing such a product compared to a general sparse-sparse matrix product.
-      * Indeed, the SparseSelfadjointView operand is first copied into a temporary SparseMatrix before computing the product.
-      */
-    template<typename OtherDerived> friend
-    SparseSparseProduct<OtherDerived, typename OtherDerived::PlainObject >
-    operator*(const SparseMatrixBase<OtherDerived>& lhs, const SparseSelfAdjointView& rhs)
-    {
-      return SparseSparseProduct<OtherDerived, typename OtherDerived::PlainObject>(lhs.derived(), rhs);
-    }
-    
-    /** Efficient sparse self-adjoint matrix times dense vector/matrix product */
-    template<typename OtherDerived>
-    SparseSelfAdjointTimeDenseProduct<MatrixType,OtherDerived,UpLo>
-    operator*(const MatrixBase<OtherDerived>& rhs) const
-    {
-      return SparseSelfAdjointTimeDenseProduct<MatrixType,OtherDerived,UpLo>(m_matrix, rhs.derived());
-    }
+  inline Index rows() const { return m_matrix.rows(); }
+  inline Index cols() const { return m_matrix.cols(); }
+
+  /** \internal \returns a reference to the nested matrix */
+  const MatrixTypeNested_& matrix() const { return m_matrix; }
+  std::remove_reference_t<MatrixTypeNested>& matrix() { return m_matrix; }
+
+  /** \returns an expression of the matrix product between a sparse self-adjoint matrix \c *this and a sparse matrix \a
+   * rhs.
+   *
+   * Note that there is no algorithmic advantage of performing such a product compared to a general sparse-sparse matrix
+   * product. Indeed, the SparseSelfadjointView operand is first copied into a temporary SparseMatrix before computing
+   * the product.
+   */
+  template <typename OtherDerived>
+  Product<SparseSelfAdjointView, OtherDerived> operator*(const SparseMatrixBase<OtherDerived>& rhs) const {
+    return Product<SparseSelfAdjointView, OtherDerived>(*this, rhs.derived());
+  }
 
-    /** Efficient dense vector/matrix times sparse self-adjoint matrix product */
-    template<typename OtherDerived> friend
-    DenseTimeSparseSelfAdjointProduct<OtherDerived,MatrixType,UpLo>
-    operator*(const MatrixBase<OtherDerived>& lhs, const SparseSelfAdjointView& rhs)
-    {
-      return DenseTimeSparseSelfAdjointProduct<OtherDerived,_MatrixTypeNested,UpLo>(lhs.derived(), rhs.m_matrix);
-    }
+  /** \returns an expression of the matrix product between a sparse matrix \a lhs and a sparse self-adjoint matrix \a
+   * rhs.
+   *
+   * Note that there is no algorithmic advantage of performing such a product compared to a general sparse-sparse matrix
+   * product. Indeed, the SparseSelfadjointView operand is first copied into a temporary SparseMatrix before computing
+   * the product.
+   */
+  template <typename OtherDerived>
+  friend Product<OtherDerived, SparseSelfAdjointView> operator*(const SparseMatrixBase<OtherDerived>& lhs,
+                                                                const SparseSelfAdjointView& rhs) {
+    return Product<OtherDerived, SparseSelfAdjointView>(lhs.derived(), rhs);
+  }
 
-    /** Perform a symmetric rank K update of the selfadjoint matrix \c *this:
-      * \f$ this = this + \alpha ( u u^* ) \f$ where \a u is a vector or matrix.
-      *
-      * \returns a reference to \c *this
-      *
-      * To perform \f$ this = this + \alpha ( u^* u ) \f$ you can simply
-      * call this function with u.adjoint().
-      */
-    template<typename DerivedU>
-    SparseSelfAdjointView& rankUpdate(const SparseMatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));
-    
-    /** \internal triggered by sparse_matrix = SparseSelfadjointView; */
-    template<typename DestScalar,int StorageOrder> void evalTo(SparseMatrix<DestScalar,StorageOrder,Index>& _dest) const
-    {
-      internal::permute_symm_to_fullsymm<UpLo>(m_matrix, _dest);
-    }
-    
-    template<typename DestScalar> void evalTo(DynamicSparseMatrix<DestScalar,ColMajor,Index>& _dest) const
-    {
-      // TODO directly evaluate into _dest;
-      SparseMatrix<DestScalar,ColMajor,Index> tmp(_dest.rows(),_dest.cols());
-      internal::permute_symm_to_fullsymm<UpLo>(m_matrix, tmp);
-      _dest = tmp;
-    }
-    
-    /** \returns an expression of P H P^-1 */
-    SparseSymmetricPermutationProduct<_MatrixTypeNested,UpLo> twistedBy(const PermutationMatrix<Dynamic,Dynamic,Index>& perm) const
-    {
-      return SparseSymmetricPermutationProduct<_MatrixTypeNested,UpLo>(m_matrix, perm);
-    }
-    
-    template<typename SrcMatrixType,int SrcUpLo>
-    SparseSelfAdjointView& operator=(const SparseSymmetricPermutationProduct<SrcMatrixType,SrcUpLo>& permutedMatrix)
-    {
-      permutedMatrix.evalTo(*this);
-      return *this;
-    }
+  /** Efficient sparse self-adjoint matrix times dense vector/matrix product */
+  template <typename OtherDerived>
+  Product<SparseSelfAdjointView, OtherDerived> operator*(const MatrixBase<OtherDerived>& rhs) const {
+    return Product<SparseSelfAdjointView, OtherDerived>(*this, rhs.derived());
+  }
 
+  /** Efficient dense vector/matrix times sparse self-adjoint matrix product */
+  template <typename OtherDerived>
+  friend Product<OtherDerived, SparseSelfAdjointView> operator*(const MatrixBase<OtherDerived>& lhs,
+                                                                const SparseSelfAdjointView& rhs) {
+    return Product<OtherDerived, SparseSelfAdjointView>(lhs.derived(), rhs);
+  }
 
-    SparseSelfAdjointView& operator=(const SparseSelfAdjointView& src)
-    {
-      PermutationMatrix<Dynamic> pnull;
-      return *this = src.twistedBy(pnull);
-    }
+  /** Perform a symmetric rank K update of the selfadjoint matrix \c *this:
+   * \f$ this = this + \alpha ( u u^* ) \f$ where \a u is a vector or matrix.
+   *
+   * \returns a reference to \c *this
+   *
+   * To perform \f$ this = this + \alpha ( u^* u ) \f$ you can simply
+   * call this function with u.adjoint().
+   */
+  template <typename DerivedU>
+  SparseSelfAdjointView& rankUpdate(const SparseMatrixBase<DerivedU>& u, const Scalar& alpha = Scalar(1));
+
+  /** \returns an expression of P H P^-1 */
+  // TODO implement twists in a more evaluator friendly fashion
+  SparseSymmetricPermutationProduct<MatrixTypeNested_, Mode> twistedBy(
+      const PermutationMatrix<Dynamic, Dynamic, StorageIndex>& perm) const {
+    return SparseSymmetricPermutationProduct<MatrixTypeNested_, Mode>(m_matrix, perm);
+  }
 
-    template<typename SrcMatrixType,unsigned int SrcUpLo>
-    SparseSelfAdjointView& operator=(const SparseSelfAdjointView<SrcMatrixType,SrcUpLo>& src)
-    {
-      PermutationMatrix<Dynamic> pnull;
-      return *this = src.twistedBy(pnull);
-    }
-    
+  template <typename SrcMatrixType, int SrcMode>
+  SparseSelfAdjointView& operator=(const SparseSymmetricPermutationProduct<SrcMatrixType, SrcMode>& permutedMatrix) {
+    internal::call_assignment_no_alias_no_transpose(*this, permutedMatrix);
+    return *this;
+  }
+
+  SparseSelfAdjointView& operator=(const SparseSelfAdjointView& src) {
+    PermutationMatrix<Dynamic, Dynamic, StorageIndex> pnull;
+    return *this = src.twistedBy(pnull);
+  }
 
-    // const SparseLLT<PlainObject, UpLo> llt() const;
-    // const SparseLDLT<PlainObject, UpLo> ldlt() const;
+  // Since we override the copy-assignment operator, we need to explicitly redeclare the copy-constructor
+  EIGEN_DEFAULT_COPY_CONSTRUCTOR(SparseSelfAdjointView)
 
-  protected:
+  template <typename SrcMatrixType, unsigned int SrcMode>
+  SparseSelfAdjointView& operator=(const SparseSelfAdjointView<SrcMatrixType, SrcMode>& src) {
+    PermutationMatrix<Dynamic, Dynamic, StorageIndex> pnull;
+    return *this = src.twistedBy(pnull);
+  }
 
-    typename MatrixType::Nested m_matrix;
-    mutable VectorI m_countPerRow;
-    mutable VectorI m_countPerCol;
+  void resize(Index rows, Index cols) {
+    EIGEN_ONLY_USED_FOR_DEBUG(rows);
+    EIGEN_ONLY_USED_FOR_DEBUG(cols);
+    eigen_assert(rows == this->rows() && cols == this->cols() &&
+                 "SparseSelfadjointView::resize() does not actually allow to resize.");
+  }
+
+ protected:
+  MatrixTypeNested m_matrix;
+  // mutable VectorI m_countPerRow;
+  // mutable VectorI m_countPerCol;
+ private:
+  template <typename Dest>
+  void evalTo(Dest&) const;
 };
 
 /***************************************************************************
-* Implementation of SparseMatrixBase methods
-***************************************************************************/
-
-template<typename Derived>
-template<unsigned int UpLo>
-const SparseSelfAdjointView<Derived, UpLo> SparseMatrixBase<Derived>::selfadjointView() const
-{
-  return derived();
+ * Implementation of SparseMatrixBase methods
+ ***************************************************************************/
+
+template <typename Derived>
+template <unsigned int UpLo>
+typename SparseMatrixBase<Derived>::template ConstSelfAdjointViewReturnType<UpLo>::Type
+SparseMatrixBase<Derived>::selfadjointView() const {
+  return SparseSelfAdjointView<const Derived, UpLo>(derived());
 }
 
-template<typename Derived>
-template<unsigned int UpLo>
-SparseSelfAdjointView<Derived, UpLo> SparseMatrixBase<Derived>::selfadjointView()
-{
-  return derived();
+template <typename Derived>
+template <unsigned int UpLo>
+typename SparseMatrixBase<Derived>::template SelfAdjointViewReturnType<UpLo>::Type
+SparseMatrixBase<Derived>::selfadjointView() {
+  return SparseSelfAdjointView<Derived, UpLo>(derived());
 }
 
 /***************************************************************************
-* Implementation of SparseSelfAdjointView methods
-***************************************************************************/
-
-template<typename MatrixType, unsigned int UpLo>
-template<typename DerivedU>
-SparseSelfAdjointView<MatrixType,UpLo>&
-SparseSelfAdjointView<MatrixType,UpLo>::rankUpdate(const SparseMatrixBase<DerivedU>& u, const Scalar& alpha)
-{
-  SparseMatrix<Scalar,MatrixType::Flags&RowMajorBit?RowMajor:ColMajor> tmp = u * u.adjoint();
-  if(alpha==Scalar(0))
-    m_matrix.const_cast_derived() = tmp.template triangularView<UpLo>();
+ * Implementation of SparseSelfAdjointView methods
+ ***************************************************************************/
+
+template <typename MatrixType, unsigned int Mode>
+template <typename DerivedU>
+SparseSelfAdjointView<MatrixType, Mode>& SparseSelfAdjointView<MatrixType, Mode>::rankUpdate(
+    const SparseMatrixBase<DerivedU>& u, const Scalar& alpha) {
+  SparseMatrix<Scalar, (MatrixType::Flags & RowMajorBit) ? RowMajor : ColMajor> tmp = u * u.adjoint();
+  if (alpha == Scalar(0))
+    m_matrix = tmp.template triangularView<Mode>();
   else
-    m_matrix.const_cast_derived() += alpha * tmp.template triangularView<UpLo>();
+    m_matrix += alpha * tmp.template triangularView<Mode>();
 
   return *this;
 }
 
+namespace internal {
+
+// TODO currently a selfadjoint expression has the form SelfAdjointView<.,.>
+//      in the future selfadjoint-ness should be defined by the expression traits
+//      such that Transpose<SelfAdjointView<.,.> > is valid. (currently TriangularBase::transpose() is overloaded to
+//      make it work)
+template <typename MatrixType, unsigned int Mode>
+struct evaluator_traits<SparseSelfAdjointView<MatrixType, Mode> > {
+  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+  typedef SparseSelfAdjointShape Shape;
+};
+
+struct SparseSelfAdjoint2Sparse {};
+
+template <>
+struct AssignmentKind<SparseShape, SparseSelfAdjointShape> {
+  typedef SparseSelfAdjoint2Sparse Kind;
+};
+template <>
+struct AssignmentKind<SparseSelfAdjointShape, SparseShape> {
+  typedef Sparse2Sparse Kind;
+};
+
+template <typename DstXprType, typename SrcXprType, typename Functor>
+struct Assignment<DstXprType, SrcXprType, Functor, SparseSelfAdjoint2Sparse> {
+  typedef typename DstXprType::StorageIndex StorageIndex;
+  typedef internal::assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar> AssignOpType;
+
+  template <typename DestScalar, int StorageOrder>
+  static void run(SparseMatrix<DestScalar, StorageOrder, StorageIndex>& dst, const SrcXprType& src,
+                  const AssignOpType& /*func*/) {
+    internal::permute_symm_to_fullsymm<SrcXprType::Mode, false>(src.matrix(), dst);
+  }
+
+  // FIXME: the handling of += and -= in sparse matrices should be cleanup so that next two overloads could be reduced
+  // to:
+  template <typename DestScalar, int StorageOrder, typename AssignFunc>
+  static void run(SparseMatrix<DestScalar, StorageOrder, StorageIndex>& dst, const SrcXprType& src,
+                  const AssignFunc& func) {
+    SparseMatrix<DestScalar, StorageOrder, StorageIndex> tmp(src.rows(), src.cols());
+    run(tmp, src, AssignOpType());
+    call_assignment_no_alias_no_transpose(dst, tmp, func);
+  }
+
+  template <typename DestScalar, int StorageOrder>
+  static void run(SparseMatrix<DestScalar, StorageOrder, StorageIndex>& dst, const SrcXprType& src,
+                  const internal::add_assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>& /* func */) {
+    SparseMatrix<DestScalar, StorageOrder, StorageIndex> tmp(src.rows(), src.cols());
+    run(tmp, src, AssignOpType());
+    dst += tmp;
+  }
+
+  template <typename DestScalar, int StorageOrder>
+  static void run(SparseMatrix<DestScalar, StorageOrder, StorageIndex>& dst, const SrcXprType& src,
+                  const internal::sub_assign_op<typename DstXprType::Scalar, typename SrcXprType::Scalar>& /* func */) {
+    SparseMatrix<DestScalar, StorageOrder, StorageIndex> tmp(src.rows(), src.cols());
+    run(tmp, src, AssignOpType());
+    dst -= tmp;
+  }
+};
+
+}  // end namespace internal
+
 /***************************************************************************
-* Implementation of sparse self-adjoint time dense matrix
-***************************************************************************/
+ * Implementation of sparse self-adjoint time dense matrix
+ ***************************************************************************/
 
 namespace internal {
-template<typename Lhs, typename Rhs, int UpLo>
-struct traits<SparseSelfAdjointTimeDenseProduct<Lhs,Rhs,UpLo> >
- : traits<ProductBase<SparseSelfAdjointTimeDenseProduct<Lhs,Rhs,UpLo>, Lhs, Rhs> >
-{
-  typedef Dense StorageKind;
-};
-}
 
-template<typename Lhs, typename Rhs, int UpLo>
-class SparseSelfAdjointTimeDenseProduct
-  : public ProductBase<SparseSelfAdjointTimeDenseProduct<Lhs,Rhs,UpLo>, Lhs, Rhs>
-{
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(SparseSelfAdjointTimeDenseProduct)
-
-    SparseSelfAdjointTimeDenseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {}
-
-    template<typename Dest> void scaleAndAddTo(Dest& dest, const Scalar& alpha) const
-    {
-      EIGEN_ONLY_USED_FOR_DEBUG(alpha);
-      // TODO use alpha
-      eigen_assert(alpha==Scalar(1) && "alpha != 1 is not implemented yet, sorry");
-      typedef typename internal::remove_all<Lhs>::type _Lhs;
-      typedef typename _Lhs::InnerIterator LhsInnerIterator;
-      enum {
-        LhsIsRowMajor = (_Lhs::Flags&RowMajorBit)==RowMajorBit,
-        ProcessFirstHalf =
-                 ((UpLo&(Upper|Lower))==(Upper|Lower))
-              || ( (UpLo&Upper) && !LhsIsRowMajor)
-              || ( (UpLo&Lower) && LhsIsRowMajor),
-        ProcessSecondHalf = !ProcessFirstHalf
-      };
-      for (Index j=0; j<m_lhs.outerSize(); ++j)
-      {
-        LhsInnerIterator i(m_lhs,j);
-        if (ProcessSecondHalf)
-        {
-          while (i && i.index()<j) ++i;
-          if(i && i.index()==j)
-          {
-            dest.row(j) += i.value() * m_rhs.row(j);
-            ++i;
-          }
-        }
-        for(; (ProcessFirstHalf ? i && i.index() < j : i) ; ++i)
-        {
-          Index a = LhsIsRowMajor ? j : i.index();
-          Index b = LhsIsRowMajor ? i.index() : j;
-          typename Lhs::Scalar v = i.value();
-          dest.row(a) += (v) * m_rhs.row(b);
-          dest.row(b) += numext::conj(v) * m_rhs.row(a);
+template <int Mode, typename SparseLhsType, typename DenseRhsType, typename DenseResType, typename AlphaType>
+inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, const DenseRhsType& rhs, DenseResType& res,
+                                                  const AlphaType& alpha) {
+  EIGEN_ONLY_USED_FOR_DEBUG(alpha);
+
+  typedef typename internal::nested_eval<SparseLhsType, DenseRhsType::MaxColsAtCompileTime>::type SparseLhsTypeNested;
+  typedef internal::remove_all_t<SparseLhsTypeNested> SparseLhsTypeNestedCleaned;
+  typedef evaluator<SparseLhsTypeNestedCleaned> LhsEval;
+  typedef typename LhsEval::InnerIterator LhsIterator;
+  typedef typename SparseLhsType::Scalar LhsScalar;
+
+  enum {
+    LhsIsRowMajor = (LhsEval::Flags & RowMajorBit) == RowMajorBit,
+    ProcessFirstHalf = ((Mode & (Upper | Lower)) == (Upper | Lower)) || ((Mode & Upper) && !LhsIsRowMajor) ||
+                       ((Mode & Lower) && LhsIsRowMajor),
+    ProcessSecondHalf = !ProcessFirstHalf
+  };
+
+  SparseLhsTypeNested lhs_nested(lhs);
+  LhsEval lhsEval(lhs_nested);
+
+  // work on one column at once
+  for (Index k = 0; k < rhs.cols(); ++k) {
+    for (Index j = 0; j < lhs.outerSize(); ++j) {
+      LhsIterator i(lhsEval, j);
+      // handle diagonal coeff
+      if (ProcessSecondHalf) {
+        while (i && i.index() < j) ++i;
+        if (i && i.index() == j) {
+          res.coeffRef(j, k) += alpha * i.value() * rhs.coeff(j, k);
+          ++i;
         }
-        if (ProcessFirstHalf && i && (i.index()==j))
-          dest.row(j) += i.value() * m_rhs.row(j);
       }
+
+      // premultiplied rhs for scatters
+      typename ScalarBinaryOpTraits<AlphaType, typename DenseRhsType::Scalar>::ReturnType rhs_j(alpha * rhs(j, k));
+      // accumulator for partial scalar product
+      typename DenseResType::Scalar res_j(0);
+      for (; (ProcessFirstHalf ? i && i.index() < j : i); ++i) {
+        LhsScalar lhs_ij = i.value();
+        if (!LhsIsRowMajor) lhs_ij = numext::conj(lhs_ij);
+        res_j += lhs_ij * rhs.coeff(i.index(), k);
+        res(i.index(), k) += numext::conj(lhs_ij) * rhs_j;
+      }
+      res.coeffRef(j, k) += alpha * res_j;
+
+      // handle diagonal coeff
+      if (ProcessFirstHalf && i && (i.index() == j)) res.coeffRef(j, k) += alpha * i.value() * rhs.coeff(j, k);
     }
+  }
+}
 
-  private:
-    SparseSelfAdjointTimeDenseProduct& operator=(const SparseSelfAdjointTimeDenseProduct&);
+template <typename LhsView, typename Rhs, int ProductType>
+struct generic_product_impl<LhsView, Rhs, SparseSelfAdjointShape, DenseShape, ProductType>
+    : generic_product_impl_base<LhsView, Rhs,
+                                generic_product_impl<LhsView, Rhs, SparseSelfAdjointShape, DenseShape, ProductType> > {
+  template <typename Dest>
+  static void scaleAndAddTo(Dest& dst, const LhsView& lhsView, const Rhs& rhs, const typename Dest::Scalar& alpha) {
+    typedef typename LhsView::MatrixTypeNested_ Lhs;
+    typedef typename nested_eval<Lhs, Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs, Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhsView.matrix());
+    RhsNested rhsNested(rhs);
+
+    internal::sparse_selfadjoint_time_dense_product<LhsView::Mode>(lhsNested, rhsNested, dst, alpha);
+  }
 };
 
-namespace internal {
-template<typename Lhs, typename Rhs, int UpLo>
-struct traits<DenseTimeSparseSelfAdjointProduct<Lhs,Rhs,UpLo> >
- : traits<ProductBase<DenseTimeSparseSelfAdjointProduct<Lhs,Rhs,UpLo>, Lhs, Rhs> >
-{};
-}
+template <typename Lhs, typename RhsView, int ProductType>
+struct generic_product_impl<Lhs, RhsView, DenseShape, SparseSelfAdjointShape, ProductType>
+    : generic_product_impl_base<Lhs, RhsView,
+                                generic_product_impl<Lhs, RhsView, DenseShape, SparseSelfAdjointShape, ProductType> > {
+  template <typename Dest>
+  static void scaleAndAddTo(Dest& dst, const Lhs& lhs, const RhsView& rhsView, const typename Dest::Scalar& alpha) {
+    typedef typename RhsView::MatrixTypeNested_ Rhs;
+    typedef typename nested_eval<Lhs, Dynamic>::type LhsNested;
+    typedef typename nested_eval<Rhs, Dynamic>::type RhsNested;
+    LhsNested lhsNested(lhs);
+    RhsNested rhsNested(rhsView.matrix());
+
+    // transpose everything
+    Transpose<Dest> dstT(dst);
+    internal::sparse_selfadjoint_time_dense_product<RhsView::TransposeMode>(rhsNested.transpose(),
+                                                                            lhsNested.transpose(), dstT, alpha);
+  }
+};
 
-template<typename Lhs, typename Rhs, int UpLo>
-class DenseTimeSparseSelfAdjointProduct
-  : public ProductBase<DenseTimeSparseSelfAdjointProduct<Lhs,Rhs,UpLo>, Lhs, Rhs>
-{
-  public:
-    EIGEN_PRODUCT_PUBLIC_INTERFACE(DenseTimeSparseSelfAdjointProduct)
+// NOTE: these two overloads are needed to evaluate the sparse selfadjoint view into a full sparse matrix
+// TODO: maybe the copy could be handled by generic_product_impl so that these overloads would not be needed anymore
 
-    DenseTimeSparseSelfAdjointProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs,rhs)
-    {}
+template <typename LhsView, typename Rhs, int ProductTag>
+struct product_evaluator<Product<LhsView, Rhs, DefaultProduct>, ProductTag, SparseSelfAdjointShape, SparseShape>
+    : public evaluator<typename Product<typename Rhs::PlainObject, Rhs, DefaultProduct>::PlainObject> {
+  typedef Product<LhsView, Rhs, DefaultProduct> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
 
-    template<typename Dest> void scaleAndAddTo(Dest& /*dest*/, const Scalar& /*alpha*/) const
-    {
-      // TODO
-    }
+  product_evaluator(const XprType& xpr) : m_lhs(xpr.lhs()), m_result(xpr.rows(), xpr.cols()) {
+    internal::construct_at<Base>(this, m_result);
+    generic_product_impl<typename Rhs::PlainObject, Rhs, SparseShape, SparseShape, ProductTag>::evalTo(m_result, m_lhs,
+                                                                                                       xpr.rhs());
+  }
 
-  private:
-    DenseTimeSparseSelfAdjointProduct& operator=(const DenseTimeSparseSelfAdjointProduct&);
+ protected:
+  typename Rhs::PlainObject m_lhs;
+  PlainObject m_result;
 };
 
+template <typename Lhs, typename RhsView, int ProductTag>
+struct product_evaluator<Product<Lhs, RhsView, DefaultProduct>, ProductTag, SparseShape, SparseSelfAdjointShape>
+    : public evaluator<typename Product<Lhs, typename Lhs::PlainObject, DefaultProduct>::PlainObject> {
+  typedef Product<Lhs, RhsView, DefaultProduct> XprType;
+  typedef typename XprType::PlainObject PlainObject;
+  typedef evaluator<PlainObject> Base;
+
+  product_evaluator(const XprType& xpr) : m_rhs(xpr.rhs()), m_result(xpr.rows(), xpr.cols()) {
+    ::new (static_cast<Base*>(this)) Base(m_result);
+    generic_product_impl<Lhs, typename Lhs::PlainObject, SparseShape, SparseShape, ProductTag>::evalTo(
+        m_result, xpr.lhs(), m_rhs);
+  }
+
+ protected:
+  typename Lhs::PlainObject m_rhs;
+  PlainObject m_result;
+};
+
+}  // namespace internal
+
 /***************************************************************************
-* Implementation of symmetric copies and permutations
-***************************************************************************/
+ * Implementation of symmetric copies and permutations
+ ***************************************************************************/
 namespace internal {
-  
-template<typename MatrixType, int UpLo>
-struct traits<SparseSymmetricPermutationProduct<MatrixType,UpLo> > : traits<MatrixType> {
-};
 
-template<int UpLo,typename MatrixType,int DestOrder>
-void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DestOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm)
-{
-  typedef typename MatrixType::Index Index;
+template <int Mode, bool NonHermitian, typename MatrixType, int DestOrder>
+void permute_symm_to_fullsymm(
+    const MatrixType& mat,
+    SparseMatrix<typename MatrixType::Scalar, DestOrder, typename MatrixType::StorageIndex>& _dest,
+    const typename MatrixType::StorageIndex* perm) {
+  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef typename MatrixType::Scalar Scalar;
-  typedef SparseMatrix<Scalar,DestOrder,Index> Dest;
-  typedef Matrix<Index,Dynamic,1> VectorI;
-  
+  typedef SparseMatrix<Scalar, DestOrder, StorageIndex> Dest;
+  typedef Matrix<StorageIndex, Dynamic, 1> VectorI;
+  typedef evaluator<MatrixType> MatEval;
+  typedef typename evaluator<MatrixType>::InnerIterator MatIterator;
+
+  MatEval matEval(mat);
   Dest& dest(_dest.derived());
-  enum {
-    StorageOrderMatch = int(Dest::IsRowMajor) == int(MatrixType::IsRowMajor)
-  };
-  
+  enum { StorageOrderMatch = int(Dest::IsRowMajor) == int(MatrixType::IsRowMajor) };
+
   Index size = mat.rows();
   VectorI count;
   count.resize(size);
   count.setZero();
-  dest.resize(size,size);
-  for(Index j = 0; j<size; ++j)
-  {
+  dest.resize(size, size);
+  for (Index j = 0; j < size; ++j) {
     Index jp = perm ? perm[j] : j;
-    for(typename MatrixType::InnerIterator it(mat,j); it; ++it)
-    {
+    for (MatIterator it(matEval, j); it; ++it) {
       Index i = it.index();
       Index r = it.row();
       Index c = it.col();
       Index ip = perm ? perm[i] : i;
-      if(UpLo==(Upper|Lower))
+      if (Mode == int(Upper | Lower))
         count[StorageOrderMatch ? jp : ip]++;
-      else if(r==c)
+      else if (r == c)
         count[ip]++;
-      else if(( UpLo==Lower && r>c) || ( UpLo==Upper && r<c))
-      {
+      else if ((Mode == Lower && r > c) || (Mode == Upper && r < c)) {
         count[ip]++;
         count[jp]++;
       }
     }
   }
   Index nnz = count.sum();
-  
+
   // reserve space
   dest.resizeNonZeros(nnz);
   dest.outerIndexPtr()[0] = 0;
-  for(Index j=0; j<size; ++j)
-    dest.outerIndexPtr()[j+1] = dest.outerIndexPtr()[j] + count[j];
-  for(Index j=0; j<size; ++j)
-    count[j] = dest.outerIndexPtr()[j];
-  
+  for (Index j = 0; j < size; ++j) dest.outerIndexPtr()[j + 1] = dest.outerIndexPtr()[j] + count[j];
+  for (Index j = 0; j < size; ++j) count[j] = dest.outerIndexPtr()[j];
+
   // copy data
-  for(Index j = 0; j<size; ++j)
-  {
-    for(typename MatrixType::InnerIterator it(mat,j); it; ++it)
-    {
-      Index i = it.index();
+  for (StorageIndex j = 0; j < size; ++j) {
+    for (MatIterator it(matEval, j); it; ++it) {
+      StorageIndex i = internal::convert_index<StorageIndex>(it.index());
       Index r = it.row();
       Index c = it.col();
-      
-      Index jp = perm ? perm[j] : j;
-      Index ip = perm ? perm[i] : i;
-      
-      if(UpLo==(Upper|Lower))
-      {
+
+      StorageIndex jp = perm ? perm[j] : j;
+      StorageIndex ip = perm ? perm[i] : i;
+
+      if (Mode == int(Upper | Lower)) {
         Index k = count[StorageOrderMatch ? jp : ip]++;
         dest.innerIndexPtr()[k] = StorageOrderMatch ? ip : jp;
         dest.valuePtr()[k] = it.value();
-      }
-      else if(r==c)
-      {
+      } else if (r == c) {
         Index k = count[ip]++;
         dest.innerIndexPtr()[k] = ip;
         dest.valuePtr()[k] = it.value();
-      }
-      else if(( (UpLo&Lower)==Lower && r>c) || ( (UpLo&Upper)==Upper && r<c))
-      {
-        if(!StorageOrderMatch)
-          std::swap(ip,jp);
+      } else if (((Mode & Lower) == Lower && r > c) || ((Mode & Upper) == Upper && r < c)) {
+        if (!StorageOrderMatch) std::swap(ip, jp);
         Index k = count[jp]++;
         dest.innerIndexPtr()[k] = ip;
         dest.valuePtr()[k] = it.value();
         k = count[ip]++;
         dest.innerIndexPtr()[k] = jp;
-        dest.valuePtr()[k] = numext::conj(it.value());
+        dest.valuePtr()[k] = (NonHermitian ? it.value() : numext::conj(it.value()));
       }
     }
   }
 }
 
-template<int _SrcUpLo,int _DstUpLo,typename MatrixType,int DstOrder>
-void permute_symm_to_symm(const MatrixType& mat, SparseMatrix<typename MatrixType::Scalar,DstOrder,typename MatrixType::Index>& _dest, const typename MatrixType::Index* perm)
-{
-  typedef typename MatrixType::Index Index;
+template <int SrcMode_, int DstMode_, bool NonHermitian, typename MatrixType, int DstOrder>
+void permute_symm_to_symm(const MatrixType& mat,
+                          SparseMatrix<typename MatrixType::Scalar, DstOrder, typename MatrixType::StorageIndex>& _dest,
+                          const typename MatrixType::StorageIndex* perm) {
+  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef typename MatrixType::Scalar Scalar;
-  SparseMatrix<Scalar,DstOrder,Index>& dest(_dest.derived());
-  typedef Matrix<Index,Dynamic,1> VectorI;
+  SparseMatrix<Scalar, DstOrder, StorageIndex>& dest(_dest.derived());
+  typedef Matrix<StorageIndex, Dynamic, 1> VectorI;
+  typedef evaluator<MatrixType> MatEval;
+  typedef typename evaluator<MatrixType>::InnerIterator MatIterator;
+
   enum {
     SrcOrder = MatrixType::IsRowMajor ? RowMajor : ColMajor,
     StorageOrderMatch = int(SrcOrder) == int(DstOrder),
-    DstUpLo = DstOrder==RowMajor ? (_DstUpLo==Upper ? Lower : Upper) : _DstUpLo,
-    SrcUpLo = SrcOrder==RowMajor ? (_SrcUpLo==Upper ? Lower : Upper) : _SrcUpLo
+    DstMode = DstOrder == RowMajor ? (DstMode_ == Upper ? Lower : Upper) : DstMode_,
+    SrcMode = SrcOrder == RowMajor ? (SrcMode_ == Upper ? Lower : Upper) : SrcMode_
   };
-  
+
+  MatEval matEval(mat);
+
   Index size = mat.rows();
   VectorI count(size);
   count.setZero();
-  dest.resize(size,size);
-  for(Index j = 0; j<size; ++j)
-  {
-    Index jp = perm ? perm[j] : j;
-    for(typename MatrixType::InnerIterator it(mat,j); it; ++it)
-    {
-      Index i = it.index();
-      if((int(SrcUpLo)==int(Lower) && i<j) || (int(SrcUpLo)==int(Upper) && i>j))
-        continue;
-                  
-      Index ip = perm ? perm[i] : i;
-      count[int(DstUpLo)==int(Lower) ? (std::min)(ip,jp) : (std::max)(ip,jp)]++;
+  dest.resize(size, size);
+  for (StorageIndex j = 0; j < size; ++j) {
+    StorageIndex jp = perm ? perm[j] : j;
+    for (MatIterator it(matEval, j); it; ++it) {
+      StorageIndex i = it.index();
+      if ((int(SrcMode) == int(Lower) && i < j) || (int(SrcMode) == int(Upper) && i > j)) continue;
+
+      StorageIndex ip = perm ? perm[i] : i;
+      count[int(DstMode) == int(Lower) ? (std::min)(ip, jp) : (std::max)(ip, jp)]++;
     }
   }
   dest.outerIndexPtr()[0] = 0;
-  for(Index j=0; j<size; ++j)
-    dest.outerIndexPtr()[j+1] = dest.outerIndexPtr()[j] + count[j];
+  for (Index j = 0; j < size; ++j) dest.outerIndexPtr()[j + 1] = dest.outerIndexPtr()[j] + count[j];
   dest.resizeNonZeros(dest.outerIndexPtr()[size]);
-  for(Index j=0; j<size; ++j)
-    count[j] = dest.outerIndexPtr()[j];
-  
-  for(Index j = 0; j<size; ++j)
-  {
-    
-    for(typename MatrixType::InnerIterator it(mat,j); it; ++it)
-    {
-      Index i = it.index();
-      if((int(SrcUpLo)==int(Lower) && i<j) || (int(SrcUpLo)==int(Upper) && i>j))
-        continue;
-                  
-      Index jp = perm ? perm[j] : j;
-      Index ip = perm? perm[i] : i;
-      
-      Index k = count[int(DstUpLo)==int(Lower) ? (std::min)(ip,jp) : (std::max)(ip,jp)]++;
-      dest.innerIndexPtr()[k] = int(DstUpLo)==int(Lower) ? (std::max)(ip,jp) : (std::min)(ip,jp);
-      
-      if(!StorageOrderMatch) std::swap(ip,jp);
-      if( ((int(DstUpLo)==int(Lower) && ip<jp) || (int(DstUpLo)==int(Upper) && ip>jp)))
-        dest.valuePtr()[k] = numext::conj(it.value());
+  for (Index j = 0; j < size; ++j) count[j] = dest.outerIndexPtr()[j];
+
+  for (StorageIndex j = 0; j < size; ++j) {
+    for (MatIterator it(matEval, j); it; ++it) {
+      StorageIndex i = it.index();
+      if ((int(SrcMode) == int(Lower) && i < j) || (int(SrcMode) == int(Upper) && i > j)) continue;
+
+      StorageIndex jp = perm ? perm[j] : j;
+      StorageIndex ip = perm ? perm[i] : i;
+
+      Index k = count[int(DstMode) == int(Lower) ? (std::min)(ip, jp) : (std::max)(ip, jp)]++;
+      dest.innerIndexPtr()[k] = int(DstMode) == int(Lower) ? (std::max)(ip, jp) : (std::min)(ip, jp);
+
+      if (!StorageOrderMatch) std::swap(ip, jp);
+      if (((int(DstMode) == int(Lower) && ip < jp) || (int(DstMode) == int(Upper) && ip > jp)))
+        dest.valuePtr()[k] = (NonHermitian ? it.value() : numext::conj(it.value()));
       else
         dest.valuePtr()[k] = it.value();
     }
   }
 }
 
-}
+}  // namespace internal
 
-template<typename MatrixType,int UpLo>
-class SparseSymmetricPermutationProduct
-  : public EigenBase<SparseSymmetricPermutationProduct<MatrixType,UpLo> >
-{
-  public:
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
-  protected:
-    typedef PermutationMatrix<Dynamic,Dynamic,Index> Perm;
-  public:
-    typedef Matrix<Index,Dynamic,1> VectorI;
-    typedef typename MatrixType::Nested MatrixTypeNested;
-    typedef typename internal::remove_all<MatrixTypeNested>::type _MatrixTypeNested;
-    
-    SparseSymmetricPermutationProduct(const MatrixType& mat, const Perm& perm)
-      : m_matrix(mat), m_perm(perm)
-    {}
-    
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-    
-    template<typename DestScalar, int Options, typename DstIndex>
-    void evalTo(SparseMatrix<DestScalar,Options,DstIndex>& _dest) const
-    {
-//       internal::permute_symm_to_fullsymm<UpLo>(m_matrix,_dest,m_perm.indices().data());
-      SparseMatrix<DestScalar,(Options&RowMajor)==RowMajor ? ColMajor : RowMajor, DstIndex> tmp;
-      internal::permute_symm_to_fullsymm<UpLo>(m_matrix,tmp,m_perm.indices().data());
-      _dest = tmp;
-    }
-    
-    template<typename DestType,unsigned int DestUpLo> void evalTo(SparseSelfAdjointView<DestType,DestUpLo>& dest) const
-    {
-      internal::permute_symm_to_symm<UpLo,DestUpLo>(m_matrix,dest.matrix(),m_perm.indices().data());
-    }
-    
-  protected:
-    MatrixTypeNested m_matrix;
-    const Perm& m_perm;
+// TODO implement twists in a more evaluator friendly fashion
+
+namespace internal {
+
+template <typename MatrixType, int Mode>
+struct traits<SparseSymmetricPermutationProduct<MatrixType, Mode> > : traits<MatrixType> {};
+
+}  // namespace internal
+
+template <typename MatrixType, int Mode>
+class SparseSymmetricPermutationProduct : public EigenBase<SparseSymmetricPermutationProduct<MatrixType, Mode> > {
+ public:
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  enum {
+    RowsAtCompileTime = internal::traits<SparseSymmetricPermutationProduct>::RowsAtCompileTime,
+    ColsAtCompileTime = internal::traits<SparseSymmetricPermutationProduct>::ColsAtCompileTime
+  };
+
+ protected:
+  typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> Perm;
+
+ public:
+  typedef Matrix<StorageIndex, Dynamic, 1> VectorI;
+  typedef typename MatrixType::Nested MatrixTypeNested;
+  typedef internal::remove_all_t<MatrixTypeNested> NestedExpression;
 
+  SparseSymmetricPermutationProduct(const MatrixType& mat, const Perm& perm) : m_matrix(mat), m_perm(perm) {}
+
+  inline Index rows() const { return m_matrix.rows(); }
+  inline Index cols() const { return m_matrix.cols(); }
+
+  const NestedExpression& matrix() const { return m_matrix; }
+  const Perm& perm() const { return m_perm; }
+
+ protected:
+  MatrixTypeNested m_matrix;
+  const Perm& m_perm;
 };
 
-} // end namespace Eigen
+namespace internal {
+
+template <typename DstXprType, typename MatrixType, int Mode, typename Scalar>
+struct Assignment<DstXprType, SparseSymmetricPermutationProduct<MatrixType, Mode>,
+                  internal::assign_op<Scalar, typename MatrixType::Scalar>, Sparse2Sparse> {
+  typedef SparseSymmetricPermutationProduct<MatrixType, Mode> SrcXprType;
+  typedef typename DstXprType::StorageIndex DstIndex;
+  template <int Options>
+  static void run(SparseMatrix<Scalar, Options, DstIndex>& dst, const SrcXprType& src,
+                  const internal::assign_op<Scalar, typename MatrixType::Scalar>&) {
+    // internal::permute_symm_to_fullsymm<Mode>(m_matrix,_dest,m_perm.indices().data());
+    SparseMatrix<Scalar, (Options & RowMajor) == RowMajor ? ColMajor : RowMajor, DstIndex> tmp;
+    internal::permute_symm_to_fullsymm<Mode, false>(src.matrix(), tmp, src.perm().indices().data());
+    dst = tmp;
+  }
+
+  template <typename DestType, unsigned int DestMode>
+  static void run(SparseSelfAdjointView<DestType, DestMode>& dst, const SrcXprType& src,
+                  const internal::assign_op<Scalar, typename MatrixType::Scalar>&) {
+    internal::permute_symm_to_symm<Mode, DestMode, false>(src.matrix(), dst.matrix(), src.perm().indices().data());
+  }
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSE_SELFADJOINTVIEW_H
+#endif  // EIGEN_SPARSE_SELFADJOINTVIEW_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseSolverBase.h b/inst/include/Eigen/src/SparseCore/SparseSolverBase.h
new file mode 100644
index 00000000..d67a6773
--- /dev/null
+++ b/inst/include/Eigen/src/SparseCore/SparseSolverBase.h
@@ -0,0 +1,115 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSESOLVERBASE_H
+#define EIGEN_SPARSESOLVERBASE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+ * Helper functions to solve with a sparse right-hand-side and result.
+ * The rhs is decomposed into small vertical panels which are solved through dense temporaries.
+ */
+template <typename Decomposition, typename Rhs, typename Dest>
+std::enable_if_t<Rhs::ColsAtCompileTime != 1 && Dest::ColsAtCompileTime != 1> solve_sparse_through_dense_panels(
+    const Decomposition& dec, const Rhs& rhs, Dest& dest) {
+  EIGEN_STATIC_ASSERT((Dest::Flags & RowMajorBit) == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+  typedef typename Dest::Scalar DestScalar;
+  // we process the sparse rhs per block of NbColsAtOnce columns temporarily stored into a dense matrix.
+  static const Index NbColsAtOnce = 4;
+  Index rhsCols = rhs.cols();
+  Index size = rhs.rows();
+  // the temporary matrices do not need more columns than NbColsAtOnce:
+  Index tmpCols = (std::min)(rhsCols, NbColsAtOnce);
+  Eigen::Matrix<DestScalar, Dynamic, Dynamic> tmp(size, tmpCols);
+  Eigen::Matrix<DestScalar, Dynamic, Dynamic> tmpX(size, tmpCols);
+  for (Index k = 0; k < rhsCols; k += NbColsAtOnce) {
+    Index actualCols = std::min<Index>(rhsCols - k, NbColsAtOnce);
+    tmp.leftCols(actualCols) = rhs.middleCols(k, actualCols);
+    tmpX.leftCols(actualCols) = dec.solve(tmp.leftCols(actualCols));
+    dest.middleCols(k, actualCols) = tmpX.leftCols(actualCols).sparseView();
+  }
+}
+
+// Overload for vector as rhs
+template <typename Decomposition, typename Rhs, typename Dest>
+std::enable_if_t<Rhs::ColsAtCompileTime == 1 || Dest::ColsAtCompileTime == 1> solve_sparse_through_dense_panels(
+    const Decomposition& dec, const Rhs& rhs, Dest& dest) {
+  typedef typename Dest::Scalar DestScalar;
+  Index size = rhs.rows();
+  Eigen::Matrix<DestScalar, Dynamic, 1> rhs_dense(rhs);
+  Eigen::Matrix<DestScalar, Dynamic, 1> dest_dense(size);
+  dest_dense = dec.solve(rhs_dense);
+  dest = dest_dense.sparseView();
+}
+
+}  // end namespace internal
+
+/** \class SparseSolverBase
+ * \ingroup SparseCore_Module
+ * \brief A base class for sparse solvers
+ *
+ * \tparam Derived the actual type of the solver.
+ *
+ */
+template <typename Derived>
+class SparseSolverBase : internal::noncopyable {
+ public:
+  /** Default constructor */
+  SparseSolverBase() : m_isInitialized(false) {}
+
+  SparseSolverBase(SparseSolverBase&& other) : internal::noncopyable{}, m_isInitialized{other.m_isInitialized} {}
+
+  ~SparseSolverBase() {}
+
+  Derived& derived() { return *static_cast<Derived*>(this); }
+  const Derived& derived() const { return *static_cast<const Derived*>(this); }
+
+  /** \returns an expression of the solution x of \f$ A x = b \f$ using the current decomposition of A.
+   *
+   * \sa compute()
+   */
+  template <typename Rhs>
+  inline const Solve<Derived, Rhs> solve(const MatrixBase<Rhs>& b) const {
+    eigen_assert(m_isInitialized && "Solver is not initialized.");
+    eigen_assert(derived().rows() == b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
+    return Solve<Derived, Rhs>(derived(), b.derived());
+  }
+
+  /** \returns an expression of the solution x of \f$ A x = b \f$ using the current decomposition of A.
+   *
+   * \sa compute()
+   */
+  template <typename Rhs>
+  inline const Solve<Derived, Rhs> solve(const SparseMatrixBase<Rhs>& b) const {
+    eigen_assert(m_isInitialized && "Solver is not initialized.");
+    eigen_assert(derived().rows() == b.rows() && "solve(): invalid number of rows of the right hand side matrix b");
+    return Solve<Derived, Rhs>(derived(), b.derived());
+  }
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  /** \internal default implementation of solving with a sparse rhs */
+  template <typename Rhs, typename Dest>
+  void _solve_impl(const SparseMatrixBase<Rhs>& b, SparseMatrixBase<Dest>& dest) const {
+    internal::solve_sparse_through_dense_panels(derived(), b.derived(), dest.derived());
+  }
+#endif  // EIGEN_PARSED_BY_DOXYGEN
+
+ protected:
+  mutable bool m_isInitialized;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPARSESOLVERBASE_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseSparseProductWithPruning.h b/inst/include/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
index fcc18f5c..6e1c9cf5 100644
--- a/inst/include/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
+++ b/inst/include/Eigen/src/SparseCore/SparseSparseProductWithPruning.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,28 +10,40 @@
 #ifndef EIGEN_SPARSESPARSEPRODUCTWITHPRUNING_H
 #define EIGEN_SPARSESPARSEPRODUCTWITHPRUNING_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-namespace internal {
+namespace Eigen {
 
+namespace internal {
 
 // perform a pseudo in-place sparse * sparse product assuming all matrices are col major
-template<typename Lhs, typename Rhs, typename ResultType>
-static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res, const typename ResultType::RealScalar& tolerance)
-{
+template <typename Lhs, typename Rhs, typename ResultType>
+static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res,
+                                                    const typename ResultType::RealScalar& tolerance) {
   // return sparse_sparse_product_with_pruning_impl2(lhs,rhs,res);
 
-  typedef typename remove_all<Lhs>::type::Scalar Scalar;
-  typedef typename remove_all<Lhs>::type::Index Index;
+  typedef typename remove_all_t<Rhs>::Scalar RhsScalar;
+  typedef typename remove_all_t<ResultType>::Scalar ResScalar;
+  typedef typename remove_all_t<Lhs>::StorageIndex StorageIndex;
 
   // make sure to call innerSize/outerSize since we fake the storage order.
   Index rows = lhs.innerSize();
   Index cols = rhs.outerSize();
-  //Index size = lhs.outerSize();
+  // Index size = lhs.outerSize();
   eigen_assert(lhs.outerSize() == rhs.innerSize());
 
   // allocate a temporary buffer
-  AmbiVector<Scalar,Index> tempVector(rows);
+  AmbiVector<ResScalar, StorageIndex> tempVector(rows);
+
+  // mimics a resizeByInnerOuter:
+  if (ResultType::IsRowMajor)
+    res.resize(cols, rows);
+  else
+    res.resize(rows, cols);
+
+  evaluator<Lhs> lhsEval(lhs);
+  evaluator<Rhs> rhsEval(rhs);
 
   // estimate the number of non zero entries
   // given a rhs column containing Y non zeros, we assume that the respective Y columns
@@ -39,112 +51,134 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r
   // the product of a rhs column with the lhs is X+Y where X is the average number of non zero
   // per column of the lhs.
   // Therefore, we have nnz(lhs*rhs) = nnz(lhs) + nnz(rhs)
-  Index estimated_nnz_prod = lhs.nonZeros() + rhs.nonZeros();
-
-  // mimics a resizeByInnerOuter:
-  if(ResultType::IsRowMajor)
-    res.resize(cols, rows);
-  else
-    res.resize(rows, cols);
+  Index estimated_nnz_prod = lhsEval.nonZerosEstimate() + rhsEval.nonZerosEstimate();
 
   res.reserve(estimated_nnz_prod);
-  double ratioColRes = double(estimated_nnz_prod)/double(lhs.rows()*rhs.cols());
-  for (Index j=0; j<cols; ++j)
-  {
+  double ratioColRes = double(estimated_nnz_prod) / (double(lhs.rows()) * double(rhs.cols()));
+  for (Index j = 0; j < cols; ++j) {
     // FIXME:
-    //double ratioColRes = (double(rhs.innerVector(j).nonZeros()) + double(lhs.nonZeros())/double(lhs.cols()))/double(lhs.rows());
+    // double ratioColRes = (double(rhs.innerVector(j).nonZeros()) +
+    // double(lhs.nonZeros())/double(lhs.cols()))/double(lhs.rows());
     // let's do a more accurate determination of the nnz ratio for the current column j of res
     tempVector.init(ratioColRes);
     tempVector.setZero();
-    for (typename Rhs::InnerIterator rhsIt(rhs, j); rhsIt; ++rhsIt)
-    {
+    for (typename evaluator<Rhs>::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt) {
       // FIXME should be written like this: tmp += rhsIt.value() * lhs.col(rhsIt.index())
       tempVector.restart();
-      Scalar x = rhsIt.value();
-      for (typename Lhs::InnerIterator lhsIt(lhs, rhsIt.index()); lhsIt; ++lhsIt)
-      {
+      RhsScalar x = rhsIt.value();
+      for (typename evaluator<Lhs>::InnerIterator lhsIt(lhsEval, rhsIt.index()); lhsIt; ++lhsIt) {
         tempVector.coeffRef(lhsIt.index()) += lhsIt.value() * x;
       }
     }
     res.startVec(j);
-    for (typename AmbiVector<Scalar,Index>::Iterator it(tempVector,tolerance); it; ++it)
-      res.insertBackByOuterInner(j,it.index()) = it.value();
+    for (typename AmbiVector<ResScalar, StorageIndex>::Iterator it(tempVector, tolerance); it; ++it)
+      res.insertBackByOuterInner(j, it.index()) = it.value();
   }
   res.finalize();
 }
 
-template<typename Lhs, typename Rhs, typename ResultType,
-  int LhsStorageOrder = traits<Lhs>::Flags&RowMajorBit,
-  int RhsStorageOrder = traits<Rhs>::Flags&RowMajorBit,
-  int ResStorageOrder = traits<ResultType>::Flags&RowMajorBit>
+template <typename Lhs, typename Rhs, typename ResultType, int LhsStorageOrder = traits<Lhs>::Flags & RowMajorBit,
+          int RhsStorageOrder = traits<Rhs>::Flags & RowMajorBit,
+          int ResStorageOrder = traits<ResultType>::Flags & RowMajorBit>
 struct sparse_sparse_product_with_pruning_selector;
 
-template<typename Lhs, typename Rhs, typename ResultType>
-struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor,ColMajor>
-{
-  typedef typename traits<typename remove_all<Lhs>::type>::Scalar Scalar;
+template <typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs, Rhs, ResultType, ColMajor, ColMajor, ColMajor> {
   typedef typename ResultType::RealScalar RealScalar;
 
-  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
-  {
-    typename remove_all<ResultType>::type _res(res.rows(), res.cols());
-    internal::sparse_sparse_product_with_pruning_impl<Lhs,Rhs,ResultType>(lhs, rhs, _res, tolerance);
-    res.swap(_res);
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) {
+    remove_all_t<ResultType> res_(res.rows(), res.cols());
+    internal::sparse_sparse_product_with_pruning_impl<Lhs, Rhs, ResultType>(lhs, rhs, res_, tolerance);
+    res.swap(res_);
   }
 };
 
-template<typename Lhs, typename Rhs, typename ResultType>
-struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,ColMajor,ColMajor,RowMajor>
-{
+template <typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs, Rhs, ResultType, ColMajor, ColMajor, RowMajor> {
   typedef typename ResultType::RealScalar RealScalar;
-  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
-  {
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) {
     // we need a col-major matrix to hold the result
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename ResultType::Index> SparseTemporaryType;
-    SparseTemporaryType _res(res.rows(), res.cols());
-    internal::sparse_sparse_product_with_pruning_impl<Lhs,Rhs,SparseTemporaryType>(lhs, rhs, _res, tolerance);
-    res = _res;
+    typedef SparseMatrix<typename ResultType::Scalar, ColMajor, typename ResultType::StorageIndex> SparseTemporaryType;
+    SparseTemporaryType res_(res.rows(), res.cols());
+    internal::sparse_sparse_product_with_pruning_impl<Lhs, Rhs, SparseTemporaryType>(lhs, rhs, res_, tolerance);
+    res = res_;
   }
 };
 
-template<typename Lhs, typename Rhs, typename ResultType>
-struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,RowMajor,RowMajor>
-{
+template <typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs, Rhs, ResultType, RowMajor, RowMajor, RowMajor> {
   typedef typename ResultType::RealScalar RealScalar;
-  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
-  {
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) {
     // let's transpose the product to get a column x column product
-    typename remove_all<ResultType>::type _res(res.rows(), res.cols());
-    internal::sparse_sparse_product_with_pruning_impl<Rhs,Lhs,ResultType>(rhs, lhs, _res, tolerance);
-    res.swap(_res);
+    remove_all_t<ResultType> res_(res.rows(), res.cols());
+    internal::sparse_sparse_product_with_pruning_impl<Rhs, Lhs, ResultType>(rhs, lhs, res_, tolerance);
+    res.swap(res_);
   }
 };
 
-template<typename Lhs, typename Rhs, typename ResultType>
-struct sparse_sparse_product_with_pruning_selector<Lhs,Rhs,ResultType,RowMajor,RowMajor,ColMajor>
-{
+template <typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs, Rhs, ResultType, RowMajor, RowMajor, ColMajor> {
   typedef typename ResultType::RealScalar RealScalar;
-  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance)
-  {
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::Index> ColMajorMatrixLhs;
-    typedef SparseMatrix<typename ResultType::Scalar,ColMajor,typename Lhs::Index> ColMajorMatrixRhs;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) {
+    typedef SparseMatrix<typename Lhs::Scalar, ColMajor, typename Lhs::StorageIndex> ColMajorMatrixLhs;
+    typedef SparseMatrix<typename Rhs::Scalar, ColMajor, typename Lhs::StorageIndex> ColMajorMatrixRhs;
     ColMajorMatrixLhs colLhs(lhs);
     ColMajorMatrixRhs colRhs(rhs);
-    internal::sparse_sparse_product_with_pruning_impl<ColMajorMatrixLhs,ColMajorMatrixRhs,ResultType>(colLhs, colRhs, res, tolerance);
+    internal::sparse_sparse_product_with_pruning_impl<ColMajorMatrixLhs, ColMajorMatrixRhs, ResultType>(colLhs, colRhs,
+                                                                                                        res, tolerance);
 
     // let's transpose the product to get a column x column product
-//     typedef SparseMatrix<typename ResultType::Scalar> SparseTemporaryType;
-//     SparseTemporaryType _res(res.cols(), res.rows());
-//     sparse_sparse_product_with_pruning_impl<Rhs,Lhs,SparseTemporaryType>(rhs, lhs, _res);
-//     res = _res.transpose();
+    //     typedef SparseMatrix<typename ResultType::Scalar> SparseTemporaryType;
+    //     SparseTemporaryType res_(res.cols(), res.rows());
+    //     sparse_sparse_product_with_pruning_impl<Rhs,Lhs,SparseTemporaryType>(rhs, lhs, res_);
+    //     res = res_.transpose();
+  }
+};
+
+template <typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs, Rhs, ResultType, ColMajor, RowMajor, RowMajor> {
+  typedef typename ResultType::RealScalar RealScalar;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) {
+    typedef SparseMatrix<typename Lhs::Scalar, RowMajor, typename Lhs::StorageIndex> RowMajorMatrixLhs;
+    RowMajorMatrixLhs rowLhs(lhs);
+    sparse_sparse_product_with_pruning_selector<RowMajorMatrixLhs, Rhs, ResultType, RowMajor, RowMajor>(rowLhs, rhs,
+                                                                                                        res, tolerance);
+  }
+};
+
+template <typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs, Rhs, ResultType, RowMajor, ColMajor, RowMajor> {
+  typedef typename ResultType::RealScalar RealScalar;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) {
+    typedef SparseMatrix<typename Rhs::Scalar, RowMajor, typename Lhs::StorageIndex> RowMajorMatrixRhs;
+    RowMajorMatrixRhs rowRhs(rhs);
+    sparse_sparse_product_with_pruning_selector<Lhs, RowMajorMatrixRhs, ResultType, RowMajor, RowMajor, RowMajor>(
+        lhs, rowRhs, res, tolerance);
   }
 };
 
-// NOTE the 2 others cases (col row *) must never occur since they are caught
-// by ProductReturnType which transforms it to (col col *) by evaluating rhs.
+template <typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs, Rhs, ResultType, ColMajor, RowMajor, ColMajor> {
+  typedef typename ResultType::RealScalar RealScalar;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) {
+    typedef SparseMatrix<typename Rhs::Scalar, ColMajor, typename Lhs::StorageIndex> ColMajorMatrixRhs;
+    ColMajorMatrixRhs colRhs(rhs);
+    internal::sparse_sparse_product_with_pruning_impl<Lhs, ColMajorMatrixRhs, ResultType>(lhs, colRhs, res, tolerance);
+  }
+};
+
+template <typename Lhs, typename Rhs, typename ResultType>
+struct sparse_sparse_product_with_pruning_selector<Lhs, Rhs, ResultType, RowMajor, ColMajor, ColMajor> {
+  typedef typename ResultType::RealScalar RealScalar;
+  static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) {
+    typedef SparseMatrix<typename Lhs::Scalar, ColMajor, typename Lhs::StorageIndex> ColMajorMatrixLhs;
+    ColMajorMatrixLhs colLhs(lhs);
+    internal::sparse_sparse_product_with_pruning_impl<ColMajorMatrixLhs, Rhs, ResultType>(colLhs, rhs, res, tolerance);
+  }
+};
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSESPARSEPRODUCTWITHPRUNING_H
+#endif  // EIGEN_SPARSESPARSEPRODUCTWITHPRUNING_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseTranspose.h b/inst/include/Eigen/src/SparseCore/SparseTranspose.h
index 76d031d5..158f7788 100644
--- a/inst/include/Eigen/src/SparseCore/SparseTranspose.h
+++ b/inst/include/Eigen/src/SparseCore/SparseTranspose.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,54 +10,74 @@
 #ifndef EIGEN_SPARSETRANSPOSE_H
 #define EIGEN_SPARSETRANSPOSE_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>
-  : public SparseMatrixBase<Transpose<MatrixType> >
-{
-    typedef typename internal::remove_all<typename MatrixType::Nested>::type _MatrixTypeNested;
-  public:
+namespace Eigen {
 
-    EIGEN_SPARSE_PUBLIC_INTERFACE(Transpose<MatrixType> )
+namespace internal {
+template <typename MatrixType, int CompressedAccess = int(MatrixType::Flags & CompressedAccessBit)>
+class SparseTransposeImpl : public SparseMatrixBase<Transpose<MatrixType> > {};
 
-    class InnerIterator;
-    class ReverseInnerIterator;
+template <typename MatrixType>
+class SparseTransposeImpl<MatrixType, CompressedAccessBit> : public SparseCompressedBase<Transpose<MatrixType> > {
+  typedef SparseCompressedBase<Transpose<MatrixType> > Base;
 
-    inline Index nonZeros() const { return derived().nestedExpression().nonZeros(); }
+ public:
+  using Base::derived;
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::StorageIndex StorageIndex;
+
+  inline Index nonZeros() const { return derived().nestedExpression().nonZeros(); }
+
+  inline const Scalar* valuePtr() const { return derived().nestedExpression().valuePtr(); }
+  inline const StorageIndex* innerIndexPtr() const { return derived().nestedExpression().innerIndexPtr(); }
+  inline const StorageIndex* outerIndexPtr() const { return derived().nestedExpression().outerIndexPtr(); }
+  inline const StorageIndex* innerNonZeroPtr() const { return derived().nestedExpression().innerNonZeroPtr(); }
+
+  inline Scalar* valuePtr() { return derived().nestedExpression().valuePtr(); }
+  inline StorageIndex* innerIndexPtr() { return derived().nestedExpression().innerIndexPtr(); }
+  inline StorageIndex* outerIndexPtr() { return derived().nestedExpression().outerIndexPtr(); }
+  inline StorageIndex* innerNonZeroPtr() { return derived().nestedExpression().innerNonZeroPtr(); }
 };
+}  // namespace internal
 
-// NOTE: VC10 and VC11 trigger an ICE if don't put typename TransposeImpl<MatrixType,Sparse>:: in front of Index,
-// a typedef typename TransposeImpl<MatrixType,Sparse>::Index Index;
-// does not fix the issue.
-// An alternative is to define the nested class in the parent class itself.
-template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>::InnerIterator
-  : public _MatrixTypeNested::InnerIterator
-{
-    typedef typename _MatrixTypeNested::InnerIterator Base;
-    typedef typename TransposeImpl::Index Index;
-  public:
-
-    EIGEN_STRONG_INLINE InnerIterator(const TransposeImpl& trans, typename TransposeImpl<MatrixType,Sparse>::Index outer)
-      : Base(trans.derived().nestedExpression(), outer)
-    {}
-    typename TransposeImpl<MatrixType,Sparse>::Index row() const { return Base::col(); }
-    typename TransposeImpl<MatrixType,Sparse>::Index col() const { return Base::row(); }
+template <typename MatrixType>
+class TransposeImpl<MatrixType, Sparse> : public internal::SparseTransposeImpl<MatrixType> {
+ protected:
+  typedef internal::SparseTransposeImpl<MatrixType> Base;
 };
 
-template<typename MatrixType> class TransposeImpl<MatrixType,Sparse>::ReverseInnerIterator
-  : public _MatrixTypeNested::ReverseInnerIterator
-{
-    typedef typename _MatrixTypeNested::ReverseInnerIterator Base;
-    typedef typename TransposeImpl::Index Index;
-  public:
-
-    EIGEN_STRONG_INLINE ReverseInnerIterator(const TransposeImpl& xpr, typename TransposeImpl<MatrixType,Sparse>::Index outer)
-      : Base(xpr.derived().nestedExpression(), outer)
-    {}
-    typename TransposeImpl<MatrixType,Sparse>::Index row() const { return Base::col(); }
-    typename TransposeImpl<MatrixType,Sparse>::Index col() const { return Base::row(); }
+namespace internal {
+
+template <typename ArgType>
+struct unary_evaluator<Transpose<ArgType>, IteratorBased> : public evaluator_base<Transpose<ArgType> > {
+  typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
+
+ public:
+  typedef Transpose<ArgType> XprType;
+
+  inline Index nonZerosEstimate() const { return m_argImpl.nonZerosEstimate(); }
+
+  class InnerIterator : public EvalIterator {
+   public:
+    EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& unaryOp, Index outer)
+        : EvalIterator(unaryOp.m_argImpl, outer) {}
+
+    Index row() const { return EvalIterator::col(); }
+    Index col() const { return EvalIterator::row(); }
+  };
+
+  enum { CoeffReadCost = evaluator<ArgType>::CoeffReadCost, Flags = XprType::Flags };
+
+  explicit unary_evaluator(const XprType& op) : m_argImpl(op.nestedExpression()) {}
+
+ protected:
+  evaluator<ArgType> m_argImpl;
 };
 
-} // end namespace Eigen
+}  // end namespace internal
+
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSETRANSPOSE_H
+#endif  // EIGEN_SPARSETRANSPOSE_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseTriangularView.h b/inst/include/Eigen/src/SparseCore/SparseTriangularView.h
index 333127b7..a6c3eaad 100644
--- a/inst/include/Eigen/src/SparseCore/SparseTriangularView.h
+++ b/inst/include/Eigen/src/SparseCore/SparseTriangularView.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2009-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,169 +11,167 @@
 #ifndef EIGEN_SPARSE_TRIANGULARVIEW_H
 #define EIGEN_SPARSE_TRIANGULARVIEW_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \ingroup SparseCore_Module
+ *
+ * \brief Base class for a triangular part in a \b sparse matrix
+ *
+ * This class is an abstract base class of class TriangularView, and objects of type TriangularViewImpl cannot be
+ * instantiated. It extends class TriangularView with additional methods which are available for sparse expressions
+ * only.
+ *
+ * \sa class TriangularView, SparseMatrixBase::triangularView()
+ */
+template <typename MatrixType, unsigned int Mode>
+class TriangularViewImpl<MatrixType, Mode, Sparse> : public SparseMatrixBase<TriangularView<MatrixType, Mode> > {
+  enum {
+    SkipFirst =
+        ((Mode & Lower) && !(MatrixType::Flags & RowMajorBit)) || ((Mode & Upper) && (MatrixType::Flags & RowMajorBit)),
+    SkipLast = !SkipFirst,
+    SkipDiag = (Mode & ZeroDiag) ? 1 : 0,
+    HasUnitDiag = (Mode & UnitDiag) ? 1 : 0
+  };
+
+  typedef TriangularView<MatrixType, Mode> TriangularViewType;
+
+ protected:
+  // dummy solve function to make TriangularView happy.
+  void solve() const;
+
+  typedef SparseMatrixBase<TriangularViewType> Base;
+
+ public:
+  EIGEN_SPARSE_PUBLIC_INTERFACE(TriangularViewType)
+
+  typedef typename MatrixType::Nested MatrixTypeNested;
+  typedef std::remove_reference_t<MatrixTypeNested> MatrixTypeNestedNonRef;
+  typedef internal::remove_all_t<MatrixTypeNested> MatrixTypeNestedCleaned;
+
+  template <typename RhsType, typename DstType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _solve_impl(const RhsType& rhs, DstType& dst) const {
+    if (!(internal::is_same<RhsType, DstType>::value && internal::extract_data(dst) == internal::extract_data(rhs)))
+      dst = rhs;
+    this->solveInPlace(dst);
+  }
+
+  /** Applies the inverse of \c *this to the dense vector or matrix \a other, "in-place" */
+  template <typename OtherDerived>
+  void solveInPlace(MatrixBase<OtherDerived>& other) const;
+
+  /** Applies the inverse of \c *this to the sparse vector or matrix \a other, "in-place" */
+  template <typename OtherDerived>
+  void solveInPlace(SparseMatrixBase<OtherDerived>& other) const;
+};
 
 namespace internal {
-  
-template<typename MatrixType, int Mode>
-struct traits<SparseTriangularView<MatrixType,Mode> >
-: public traits<MatrixType>
-{};
-
-} // namespace internal
-
-template<typename MatrixType, int Mode> class SparseTriangularView
-  : public SparseMatrixBase<SparseTriangularView<MatrixType,Mode> >
-{
-    enum { SkipFirst = ((Mode&Lower) && !(MatrixType::Flags&RowMajorBit))
-                    || ((Mode&Upper) &&  (MatrixType::Flags&RowMajorBit)),
-           SkipLast = !SkipFirst,
-           SkipDiag = (Mode&ZeroDiag) ? 1 : 0,
-           HasUnitDiag = (Mode&UnitDiag) ? 1 : 0
-    };
 
-  public:
-    
-    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseTriangularView)
+template <typename ArgType, unsigned int Mode>
+struct unary_evaluator<TriangularView<ArgType, Mode>, IteratorBased> : evaluator_base<TriangularView<ArgType, Mode> > {
+  typedef TriangularView<ArgType, Mode> XprType;
 
-    class InnerIterator;
-    class ReverseInnerIterator;
+ protected:
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::StorageIndex StorageIndex;
+  typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
 
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
+  enum {
+    SkipFirst =
+        ((Mode & Lower) && !(ArgType::Flags & RowMajorBit)) || ((Mode & Upper) && (ArgType::Flags & RowMajorBit)),
+    SkipLast = !SkipFirst,
+    SkipDiag = (Mode & ZeroDiag) ? 1 : 0,
+    HasUnitDiag = (Mode & UnitDiag) ? 1 : 0
+  };
 
-    typedef typename MatrixType::Nested MatrixTypeNested;
-    typedef typename internal::remove_reference<MatrixTypeNested>::type MatrixTypeNestedNonRef;
-    typedef typename internal::remove_all<MatrixTypeNested>::type MatrixTypeNestedCleaned;
+ public:
+  enum { CoeffReadCost = evaluator<ArgType>::CoeffReadCost, Flags = XprType::Flags };
 
-    inline SparseTriangularView(const MatrixType& matrix) : m_matrix(matrix) {}
+  explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_arg(xpr.nestedExpression()) {}
 
-    /** \internal */
-    inline const MatrixTypeNestedCleaned& nestedExpression() const { return m_matrix; }
+  inline Index nonZerosEstimate() const { return m_argImpl.nonZerosEstimate(); }
 
-    template<typename OtherDerived>
-    typename internal::plain_matrix_type_column_major<OtherDerived>::type
-    solve(const MatrixBase<OtherDerived>& other) const;
-
-    template<typename OtherDerived> void solveInPlace(MatrixBase<OtherDerived>& other) const;
-    template<typename OtherDerived> void solveInPlace(SparseMatrixBase<OtherDerived>& other) const;
-
-  protected:
-    MatrixTypeNested m_matrix;
-};
+  class InnerIterator : public EvalIterator {
+    typedef EvalIterator Base;
 
-template<typename MatrixType, int Mode>
-class SparseTriangularView<MatrixType,Mode>::InnerIterator : public MatrixTypeNestedCleaned::InnerIterator
-{
-    typedef typename MatrixTypeNestedCleaned::InnerIterator Base;
-    typedef typename SparseTriangularView::Index Index;
-  public:
-
-    EIGEN_STRONG_INLINE InnerIterator(const SparseTriangularView& view, Index outer)
-      : Base(view.nestedExpression(), outer), m_returnOne(false)
-    {
-      if(SkipFirst)
-      {
-        while((*this) && ((HasUnitDiag||SkipDiag)  ? this->index()<=outer : this->index()<outer))
+   public:
+    EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& xprEval, Index outer)
+        : Base(xprEval.m_argImpl, outer),
+          m_returnOne(false),
+          m_containsDiag(Base::outer() < xprEval.m_arg.innerSize()) {
+      if (SkipFirst) {
+        while ((*this) && ((HasUnitDiag || SkipDiag) ? this->index() <= outer : this->index() < outer))
           Base::operator++();
-        if(HasUnitDiag)
-          m_returnOne = true;
-      }
-      else if(HasUnitDiag && ((!Base::operator bool()) || Base::index()>=Base::outer()))
-      {
-        if((!SkipFirst) && Base::operator bool())
-          Base::operator++();
-        m_returnOne = true;
+        if (HasUnitDiag) m_returnOne = m_containsDiag;
+      } else if (HasUnitDiag && ((!Base::operator bool()) || Base::index() >= Base::outer())) {
+        if ((!SkipFirst) && Base::operator bool()) Base::operator++();
+        m_returnOne = m_containsDiag;
       }
     }
 
-    EIGEN_STRONG_INLINE InnerIterator& operator++()
-    {
-      if(HasUnitDiag && m_returnOne)
+    EIGEN_STRONG_INLINE InnerIterator& operator++() {
+      if (HasUnitDiag && m_returnOne)
         m_returnOne = false;
-      else
-      {
+      else {
         Base::operator++();
-        if(HasUnitDiag && (!SkipFirst) && ((!Base::operator bool()) || Base::index()>=Base::outer()))
-        {
-          if((!SkipFirst) && Base::operator bool())
-            Base::operator++();
-          m_returnOne = true;
+        if (HasUnitDiag && (!SkipFirst) && ((!Base::operator bool()) || Base::index() >= Base::outer())) {
+          if ((!SkipFirst) && Base::operator bool()) Base::operator++();
+          m_returnOne = m_containsDiag;
         }
       }
       return *this;
     }
 
-    inline Index row() const { return (MatrixType::Flags&RowMajorBit ? Base::outer() : this->index()); }
-    inline Index col() const { return (MatrixType::Flags&RowMajorBit ? this->index() : Base::outer()); }
-    inline Index index() const
-    {
-      if(HasUnitDiag && m_returnOne)  return Base::outer();
-      else                            return Base::index();
-    }
-    inline Scalar value() const
-    {
-      if(HasUnitDiag && m_returnOne)  return Scalar(1);
-      else                            return Base::value();
+    EIGEN_STRONG_INLINE operator bool() const {
+      if (HasUnitDiag && m_returnOne) return true;
+      if (SkipFirst)
+        return Base::operator bool();
+      else {
+        if (SkipDiag)
+          return (Base::operator bool() && this->index() < this->outer());
+        else
+          return (Base::operator bool() && this->index() <= this->outer());
+      }
     }
 
-    EIGEN_STRONG_INLINE operator bool() const
-    {
-      if(HasUnitDiag && m_returnOne)
-        return true;
-      if(SkipFirst) return  Base::operator bool();
+    inline Index row() const { return (ArgType::Flags & RowMajorBit ? Base::outer() : this->index()); }
+    inline Index col() const { return (ArgType::Flags & RowMajorBit ? this->index() : Base::outer()); }
+    inline StorageIndex index() const {
+      if (HasUnitDiag && m_returnOne)
+        return internal::convert_index<StorageIndex>(Base::outer());
       else
-      {
-        if (SkipDiag) return (Base::operator bool() && this->index() < this->outer());
-        else return (Base::operator bool() && this->index() <= this->outer());
-      }
+        return Base::index();
     }
-  protected:
-    bool m_returnOne;
-};
-
-template<typename MatrixType, int Mode>
-class SparseTriangularView<MatrixType,Mode>::ReverseInnerIterator : public MatrixTypeNestedCleaned::ReverseInnerIterator
-{
-    typedef typename MatrixTypeNestedCleaned::ReverseInnerIterator Base;
-    typedef typename SparseTriangularView::Index Index;
-  public:
-
-    EIGEN_STRONG_INLINE ReverseInnerIterator(const SparseTriangularView& view, Index outer)
-      : Base(view.nestedExpression(), outer)
-    {
-      eigen_assert((!HasUnitDiag) && "ReverseInnerIterator does not support yet triangular views with a unit diagonal");
-      if(SkipLast) {
-        while((*this) && (SkipDiag ? this->index()>=outer : this->index()>outer))
-          --(*this);
-      }
+    inline Scalar value() const {
+      if (HasUnitDiag && m_returnOne)
+        return Scalar(1);
+      else
+        return Base::value();
     }
 
-    EIGEN_STRONG_INLINE ReverseInnerIterator& operator--()
-    { Base::operator--(); return *this; }
+   protected:
+    bool m_returnOne;
+    bool m_containsDiag;
 
-    inline Index row() const { return Base::row(); }
-    inline Index col() const { return Base::col(); }
+   private:
+    Scalar& valueRef();
+  };
 
-    EIGEN_STRONG_INLINE operator bool() const
-    {
-      if (SkipLast) return Base::operator bool() ;
-      else
-      {
-        if(SkipDiag) return (Base::operator bool() && this->index() > this->outer());
-        else return (Base::operator bool() && this->index() >= this->outer());
-      }
-    }
+ protected:
+  evaluator<ArgType> m_argImpl;
+  const ArgType& m_arg;
 };
 
-template<typename Derived>
-template<int Mode>
-inline const SparseTriangularView<Derived, Mode>
-SparseMatrixBase<Derived>::triangularView() const
-{
-  return derived();
+}  // end namespace internal
+
+template <typename Derived>
+template <int Mode>
+inline const TriangularView<const Derived, Mode> SparseMatrixBase<Derived>::triangularView() const {
+  return TriangularView<const Derived, Mode>(derived());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSE_TRIANGULARVIEW_H
+#endif  // EIGEN_SPARSE_TRIANGULARVIEW_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseUtil.h b/inst/include/Eigen/src/SparseCore/SparseUtil.h
index d627546d..33cedaf3 100644
--- a/inst/include/Eigen/src/SparseCore/SparseUtil.h
+++ b/inst/include/Eigen/src/SparseCore/SparseUtil.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,7 +10,10 @@
 #ifndef EIGEN_SPARSEUTIL_H
 #define EIGEN_SPARSEUTIL_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 #ifdef NDEBUG
 #define EIGEN_DBG_SPARSE(X)
@@ -18,155 +21,189 @@ namespace Eigen {
 #define EIGEN_DBG_SPARSE(X) X
 #endif
 
-#define EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, Op) \
-template<typename OtherDerived> \
-EIGEN_STRONG_INLINE Derived& operator Op(const Eigen::SparseMatrixBase<OtherDerived>& other) \
-{ \
-  return Base::operator Op(other.derived()); \
-} \
-EIGEN_STRONG_INLINE Derived& operator Op(const Derived& other) \
-{ \
-  return Base::operator Op(other); \
-}
+#define EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, Op)                                    \
+  template <typename OtherDerived>                                                               \
+  EIGEN_STRONG_INLINE Derived& operator Op(const Eigen::SparseMatrixBase<OtherDerived>& other) { \
+    return Base::operator Op(other.derived());                                                   \
+  }                                                                                              \
+  EIGEN_STRONG_INLINE Derived& operator Op(const Derived & other) { return Base::operator Op(other); }
 
 #define EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, Op) \
-template<typename Other> \
-EIGEN_STRONG_INLINE Derived& operator Op(const Other& scalar) \
-{ \
-  return Base::operator Op(scalar); \
-}
-
-#define EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
-EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, =) \
-EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, +=) \
-EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, -=) \
-EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \
-EIGEN_SPARSE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=)
-
-#define _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, BaseClass) \
-  typedef BaseClass Base; \
-  typedef typename Eigen::internal::traits<Derived >::Scalar Scalar; \
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; \
-  typedef typename Eigen::internal::nested<Derived >::type Nested; \
-  typedef typename Eigen::internal::traits<Derived >::StorageKind StorageKind; \
-  typedef typename Eigen::internal::traits<Derived >::Index Index; \
-  enum { RowsAtCompileTime = Eigen::internal::traits<Derived >::RowsAtCompileTime, \
-        ColsAtCompileTime = Eigen::internal::traits<Derived >::ColsAtCompileTime, \
-        Flags = Eigen::internal::traits<Derived >::Flags, \
-        CoeffReadCost = Eigen::internal::traits<Derived >::CoeffReadCost, \
-        SizeAtCompileTime = Base::SizeAtCompileTime, \
-        IsVectorAtCompileTime = Base::IsVectorAtCompileTime }; \
-  using Base::derived; \
-  using Base::const_cast_derived;
-
-#define EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) \
-  _EIGEN_SPARSE_PUBLIC_INTERFACE(Derived, Eigen::SparseMatrixBase<Derived >)
-
-const int CoherentAccessPattern     = 0x1;
-const int InnerRandomAccessPattern  = 0x2 | CoherentAccessPattern;
-const int OuterRandomAccessPattern  = 0x4 | CoherentAccessPattern;
-const int RandomAccessPattern       = 0x8 | OuterRandomAccessPattern | InnerRandomAccessPattern;
-
-template<typename _Scalar, int _Flags = 0, typename _Index = int>  class SparseMatrix;
-template<typename _Scalar, int _Flags = 0, typename _Index = int>  class DynamicSparseMatrix;
-template<typename _Scalar, int _Flags = 0, typename _Index = int>  class SparseVector;
-template<typename _Scalar, int _Flags = 0, typename _Index = int>  class MappedSparseMatrix;
-
-template<typename MatrixType, int Mode>           class SparseTriangularView;
-template<typename MatrixType, unsigned int UpLo>  class SparseSelfAdjointView;
-template<typename Lhs, typename Rhs>              class SparseDiagonalProduct;
-template<typename MatrixType> class SparseView;
-
-template<typename Lhs, typename Rhs>        class SparseSparseProduct;
-template<typename Lhs, typename Rhs>        class SparseTimeDenseProduct;
-template<typename Lhs, typename Rhs>        class DenseTimeSparseProduct;
-template<typename Lhs, typename Rhs, bool Transpose> class SparseDenseOuterProduct;
-
-template<typename Lhs, typename Rhs> struct SparseSparseProductReturnType;
-template<typename Lhs, typename Rhs,
-         int InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(internal::traits<Lhs>::ColsAtCompileTime,internal::traits<Rhs>::RowsAtCompileTime)> struct DenseSparseProductReturnType;         
-template<typename Lhs, typename Rhs,
-         int InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(internal::traits<Lhs>::ColsAtCompileTime,internal::traits<Rhs>::RowsAtCompileTime)> struct SparseDenseProductReturnType;
-template<typename MatrixType,int UpLo> class SparseSymmetricPermutationProduct;
+  template <typename Other>                                          \
+  EIGEN_STRONG_INLINE Derived& operator Op(const Other & scalar) {   \
+    return Base::operator Op(scalar);                                \
+  }
+
+#define EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATORS(Derived) EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(Derived, =)
+
+#define EIGEN_SPARSE_PUBLIC_INTERFACE(Derived) EIGEN_GENERIC_PUBLIC_INTERFACE(Derived)
+
+const int CoherentAccessPattern = 0x1;
+const int InnerRandomAccessPattern = 0x2 | CoherentAccessPattern;
+const int OuterRandomAccessPattern = 0x4 | CoherentAccessPattern;
+const int RandomAccessPattern = 0x8 | OuterRandomAccessPattern | InnerRandomAccessPattern;
+
+template <typename Scalar_, int Flags_ = 0, typename StorageIndex_ = int>
+class SparseMatrix;
+template <typename Scalar_, int Flags_ = 0, typename StorageIndex_ = int>
+class SparseVector;
+
+template <typename MatrixType, unsigned int UpLo>
+class SparseSelfAdjointView;
+template <typename Lhs, typename Rhs>
+class SparseDiagonalProduct;
+template <typename MatrixType>
+class SparseView;
+
+template <typename Lhs, typename Rhs>
+class SparseSparseProduct;
+template <typename Lhs, typename Rhs>
+class SparseTimeDenseProduct;
+template <typename Lhs, typename Rhs>
+class DenseTimeSparseProduct;
+template <typename Lhs, typename Rhs, bool Transpose>
+class SparseDenseOuterProduct;
+
+template <typename Lhs, typename Rhs>
+struct SparseSparseProductReturnType;
+template <typename Lhs, typename Rhs,
+          int InnerSize = internal::min_size_prefer_fixed(internal::traits<Lhs>::ColsAtCompileTime,
+                                                          internal::traits<Rhs>::RowsAtCompileTime)>
+struct DenseSparseProductReturnType;
+
+template <typename Lhs, typename Rhs,
+          int InnerSize = internal::min_size_prefer_fixed(internal::traits<Lhs>::ColsAtCompileTime,
+                                                          internal::traits<Rhs>::RowsAtCompileTime)>
+struct SparseDenseProductReturnType;
+template <typename MatrixType, int UpLo>
+class SparseSymmetricPermutationProduct;
 
 namespace internal {
 
-template<typename T,int Rows,int Cols> struct sparse_eval;
+template <typename T, int Rows, int Cols, int Flags>
+struct sparse_eval;
+
+template <typename T>
+struct eval<T, Sparse> : sparse_eval<T, traits<T>::RowsAtCompileTime, traits<T>::ColsAtCompileTime, traits<T>::Flags> {
+};
 
-template<typename T> struct eval<T,Sparse>
-  : public sparse_eval<T, traits<T>::RowsAtCompileTime,traits<T>::ColsAtCompileTime>
-{};
+template <typename T, int Cols, int Flags>
+struct sparse_eval<T, 1, Cols, Flags> {
+  typedef typename traits<T>::Scalar Scalar_;
+  typedef typename traits<T>::StorageIndex StorageIndex_;
 
-template<typename T,int Cols> struct sparse_eval<T,1,Cols> {
-    typedef typename traits<T>::Scalar _Scalar;
-    typedef typename traits<T>::Index _Index;
-  public:
-    typedef SparseVector<_Scalar, RowMajor, _Index> type;
+ public:
+  typedef SparseVector<Scalar_, RowMajor, StorageIndex_> type;
 };
 
-template<typename T,int Rows> struct sparse_eval<T,Rows,1> {
-    typedef typename traits<T>::Scalar _Scalar;
-    typedef typename traits<T>::Index _Index;
-  public:
-    typedef SparseVector<_Scalar, ColMajor, _Index> type;
+template <typename T, int Rows, int Flags>
+struct sparse_eval<T, Rows, 1, Flags> {
+  typedef typename traits<T>::Scalar Scalar_;
+  typedef typename traits<T>::StorageIndex StorageIndex_;
+
+ public:
+  typedef SparseVector<Scalar_, ColMajor, StorageIndex_> type;
 };
 
-template<typename T,int Rows,int Cols> struct sparse_eval {
-    typedef typename traits<T>::Scalar _Scalar;
-    typedef typename traits<T>::Index _Index;
-    enum { _Options = ((traits<T>::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor };
-  public:
-    typedef SparseMatrix<_Scalar, _Options, _Index> type;
+// TODO this seems almost identical to plain_matrix_type<T, Sparse>
+template <typename T, int Rows, int Cols, int Flags>
+struct sparse_eval {
+  typedef typename traits<T>::Scalar Scalar_;
+  typedef typename traits<T>::StorageIndex StorageIndex_;
+  enum { Options_ = ((Flags & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor };
+
+ public:
+  typedef SparseMatrix<Scalar_, Options_, StorageIndex_> type;
 };
 
-template<typename T> struct sparse_eval<T,1,1> {
-    typedef typename traits<T>::Scalar _Scalar;
-  public:
-    typedef Matrix<_Scalar, 1, 1> type;
+template <typename T, int Flags>
+struct sparse_eval<T, 1, 1, Flags> {
+  typedef typename traits<T>::Scalar Scalar_;
+
+ public:
+  typedef Matrix<Scalar_, 1, 1> type;
 };
 
-template<typename T> struct plain_matrix_type<T,Sparse>
-{
-  typedef typename traits<T>::Scalar _Scalar;
-  typedef typename traits<T>::Index _Index;
-  enum { _Options = ((traits<T>::Flags&RowMajorBit)==RowMajorBit) ? RowMajor : ColMajor };
-  public:
-    typedef SparseMatrix<_Scalar, _Options, _Index> type;
+template <typename T>
+struct plain_matrix_type<T, Sparse> {
+  typedef typename traits<T>::Scalar Scalar_;
+  typedef typename traits<T>::StorageIndex StorageIndex_;
+  enum { Options_ = ((evaluator<T>::Flags & RowMajorBit) == RowMajorBit) ? RowMajor : ColMajor };
+
+ public:
+  typedef SparseMatrix<Scalar_, Options_, StorageIndex_> type;
 };
 
-} // end namespace internal
+template <typename T>
+struct plain_object_eval<T, Sparse>
+    : sparse_eval<T, traits<T>::RowsAtCompileTime, traits<T>::ColsAtCompileTime, evaluator<T>::Flags> {};
+
+template <typename Decomposition, typename RhsType>
+struct solve_traits<Decomposition, RhsType, Sparse> {
+  typedef typename sparse_eval<RhsType, RhsType::RowsAtCompileTime, RhsType::ColsAtCompileTime,
+                               traits<RhsType>::Flags>::type PlainObject;
+};
+
+template <typename Derived>
+struct generic_xpr_base<Derived, MatrixXpr, Sparse> {
+  typedef SparseMatrixBase<Derived> type;
+};
+
+struct SparseTriangularShape {
+  static std::string debugName() { return "SparseTriangularShape"; }
+};
+struct SparseSelfAdjointShape {
+  static std::string debugName() { return "SparseSelfAdjointShape"; }
+};
+
+template <>
+struct glue_shapes<SparseShape, SelfAdjointShape> {
+  typedef SparseSelfAdjointShape type;
+};
+template <>
+struct glue_shapes<SparseShape, TriangularShape> {
+  typedef SparseTriangularShape type;
+};
+
+// return type of SparseCompressedBase::lower_bound;
+struct LowerBoundIndex {
+  LowerBoundIndex() : value(-1), found(false) {}
+  LowerBoundIndex(Index val, bool ok) : value(val), found(ok) {}
+  Index value;
+  bool found;
+};
+
+}  // end namespace internal
 
 /** \ingroup SparseCore_Module
-  *
-  * \class Triplet
-  *
-  * \brief A small structure to hold a non zero as a triplet (i,j,value).
-  *
-  * \sa SparseMatrix::setFromTriplets()
-  */
-template<typename Scalar, typename Index=typename SparseMatrix<Scalar>::Index >
-class Triplet
-{
-public:
+ *
+ * \class Triplet
+ *
+ * \brief A small structure to hold a non zero as a triplet (i,j,value).
+ *
+ * \sa SparseMatrix::setFromTriplets()
+ */
+template <typename Scalar, typename StorageIndex = typename SparseMatrix<Scalar>::StorageIndex>
+class Triplet {
+ public:
   Triplet() : m_row(0), m_col(0), m_value(0) {}
 
-  Triplet(const Index& i, const Index& j, const Scalar& v = Scalar(0))
-    : m_row(i), m_col(j), m_value(v)
-  {}
+  Triplet(const StorageIndex& i, const StorageIndex& j, const Scalar& v = Scalar(0)) : m_row(i), m_col(j), m_value(v) {}
 
   /** \returns the row index of the element */
-  const Index& row() const { return m_row; }
+  const StorageIndex& row() const { return m_row; }
 
   /** \returns the column index of the element */
-  const Index& col() const { return m_col; }
+  const StorageIndex& col() const { return m_col; }
 
   /** \returns the value of the element */
   const Scalar& value() const { return m_value; }
-protected:
-  Index m_row, m_col;
+
+ protected:
+  StorageIndex m_row, m_col;
   Scalar m_value;
 };
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSEUTIL_H
+#endif  // EIGEN_SPARSEUTIL_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseVector.h b/inst/include/Eigen/src/SparseCore/SparseVector.h
index 49865d0e..c8d34e31 100644
--- a/inst/include/Eigen/src/SparseCore/SparseVector.h
+++ b/inst/include/Eigen/src/SparseCore/SparseVector.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,439 +10,523 @@
 #ifndef EIGEN_SPARSEVECTOR_H
 #define EIGEN_SPARSEVECTOR_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \ingroup SparseCore_Module
-  * \class SparseVector
-  *
-  * \brief a sparse vector class
-  *
-  * \tparam _Scalar the scalar type, i.e. the type of the coefficients
-  *
-  * See http://www.netlib.org/linalg/html_templates/node91.html for details on the storage scheme.
-  *
-  * This class can be extended with the help of the plugin mechanism described on the page
-  * \ref TopicCustomizingEigen by defining the preprocessor symbol \c EIGEN_SPARSEVECTOR_PLUGIN.
-  */
+ * \class SparseVector
+ *
+ * \brief a sparse vector class
+ *
+ * \tparam Scalar_ the scalar type, i.e. the type of the coefficients
+ *
+ * See http://www.netlib.org/linalg/html_templates/node91.html for details on the storage scheme.
+ *
+ * This class can be extended with the help of the plugin mechanism described on the page
+ * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_SPARSEVECTOR_PLUGIN.
+ */
 
 namespace internal {
-template<typename _Scalar, int _Options, typename _Index>
-struct traits<SparseVector<_Scalar, _Options, _Index> >
-{
-  typedef _Scalar Scalar;
-  typedef _Index Index;
+template <typename Scalar_, int Options_, typename StorageIndex_>
+struct traits<SparseVector<Scalar_, Options_, StorageIndex_> > {
+  typedef Scalar_ Scalar;
+  typedef StorageIndex_ StorageIndex;
   typedef Sparse StorageKind;
   typedef MatrixXpr XprKind;
   enum {
-    IsColVector = (_Options & RowMajorBit) ? 0 : 1,
+    IsColVector = (Options_ & RowMajorBit) ? 0 : 1,
 
     RowsAtCompileTime = IsColVector ? Dynamic : 1,
     ColsAtCompileTime = IsColVector ? 1 : Dynamic,
     MaxRowsAtCompileTime = RowsAtCompileTime,
     MaxColsAtCompileTime = ColsAtCompileTime,
-    Flags = _Options | NestByRefBit | LvalueBit | (IsColVector ? 0 : RowMajorBit),
-    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    Flags = Options_ | NestByRefBit | LvalueBit | (IsColVector ? 0 : RowMajorBit) | CompressedAccessBit,
     SupportedAccessPatterns = InnerRandomAccessPattern
   };
 };
 
 // Sparse-Vector-Assignment kinds:
-enum {
-  SVA_RuntimeSwitch,
-  SVA_Inner,
-  SVA_Outer
-};
+enum { SVA_RuntimeSwitch, SVA_Inner, SVA_Outer };
 
-template< typename Dest, typename Src,
-          int AssignmentKind = !bool(Src::IsVectorAtCompileTime) ? SVA_RuntimeSwitch
-                             : Src::InnerSizeAtCompileTime==1 ? SVA_Outer
-                             : SVA_Inner>
+template <typename Dest, typename Src,
+          int AssignmentKind = !bool(Src::IsVectorAtCompileTime)  ? SVA_RuntimeSwitch
+                               : Src::InnerSizeAtCompileTime == 1 ? SVA_Outer
+                                                                  : SVA_Inner>
 struct sparse_vector_assign_selector;
 
-}
-
-template<typename _Scalar, int _Options, typename _Index>
-class SparseVector
-  : public SparseMatrixBase<SparseVector<_Scalar, _Options, _Index> >
-{
-    typedef SparseMatrixBase<SparseVector> SparseBase;
-    
-  public:
-    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseVector)
-    EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseVector, +=)
-    EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseVector, -=)
-    
-    typedef internal::CompressedStorage<Scalar,Index> Storage;
-    enum { IsColVector = internal::traits<SparseVector>::IsColVector };
-    
-    enum {
-      Options = _Options
-    };
-    
-    EIGEN_STRONG_INLINE Index rows() const { return IsColVector ? m_size : 1; }
-    EIGEN_STRONG_INLINE Index cols() const { return IsColVector ? 1 : m_size; }
-    EIGEN_STRONG_INLINE Index innerSize() const { return m_size; }
-    EIGEN_STRONG_INLINE Index outerSize() const { return 1; }
-
-    EIGEN_STRONG_INLINE const Scalar* valuePtr() const { return &m_data.value(0); }
-    EIGEN_STRONG_INLINE Scalar* valuePtr() { return &m_data.value(0); }
-
-    EIGEN_STRONG_INLINE const Index* innerIndexPtr() const { return &m_data.index(0); }
-    EIGEN_STRONG_INLINE Index* innerIndexPtr() { return &m_data.index(0); }
-    
-    /** \internal */
-    inline Storage& data() { return m_data; }
-    /** \internal */
-    inline const Storage& data() const { return m_data; }
-
-    inline Scalar coeff(Index row, Index col) const
-    {
-      eigen_assert(IsColVector ? (col==0 && row>=0 && row<m_size) : (row==0 && col>=0 && col<m_size));
-      return coeff(IsColVector ? row : col);
-    }
-    inline Scalar coeff(Index i) const
-    {
-      eigen_assert(i>=0 && i<m_size);
-      return m_data.at(i);
-    }
+}  // namespace internal
 
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      eigen_assert(IsColVector ? (col==0 && row>=0 && row<m_size) : (row==0 && col>=0 && col<m_size));
-      return coeff(IsColVector ? row : col);
-    }
+template <typename Scalar_, int Options_, typename StorageIndex_>
+class SparseVector : public SparseCompressedBase<SparseVector<Scalar_, Options_, StorageIndex_> > {
+  typedef SparseCompressedBase<SparseVector> Base;
+  using Base::convert_index;
 
-    /** \returns a reference to the coefficient value at given index \a i
-      * This operation involes a log(rho*size) binary search. If the coefficient does not
-      * exist yet, then a sorted insertion into a sequential buffer is performed.
-      *
-      * This insertion might be very costly if the number of nonzeros above \a i is large.
-      */
-    inline Scalar& coeffRef(Index i)
-    {
-      eigen_assert(i>=0 && i<m_size);
-      return m_data.atWithInsertion(i);
-    }
+ public:
+  EIGEN_SPARSE_PUBLIC_INTERFACE(SparseVector)
+  EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseVector, +=)
+  EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(SparseVector, -=)
 
-  public:
+  typedef internal::CompressedStorage<Scalar, StorageIndex> Storage;
+  enum { IsColVector = internal::traits<SparseVector>::IsColVector };
 
-    class InnerIterator;
-    class ReverseInnerIterator;
+  enum { Options = Options_ };
 
-    inline void setZero() { m_data.clear(); }
+  EIGEN_STRONG_INLINE Index rows() const { return IsColVector ? m_size : 1; }
+  EIGEN_STRONG_INLINE Index cols() const { return IsColVector ? 1 : m_size; }
+  EIGEN_STRONG_INLINE Index innerSize() const { return m_size; }
+  EIGEN_STRONG_INLINE Index outerSize() const { return 1; }
 
-    /** \returns the number of non zero coefficients */
-    inline Index nonZeros() const  { return static_cast<Index>(m_data.size()); }
+  EIGEN_STRONG_INLINE const Scalar* valuePtr() const { return m_data.valuePtr(); }
+  EIGEN_STRONG_INLINE Scalar* valuePtr() { return m_data.valuePtr(); }
 
-    inline void startVec(Index outer)
-    {
-      EIGEN_UNUSED_VARIABLE(outer);
-      eigen_assert(outer==0);
-    }
+  EIGEN_STRONG_INLINE const StorageIndex* innerIndexPtr() const { return m_data.indexPtr(); }
+  EIGEN_STRONG_INLINE StorageIndex* innerIndexPtr() { return m_data.indexPtr(); }
 
-    inline Scalar& insertBackByOuterInner(Index outer, Index inner)
-    {
-      EIGEN_UNUSED_VARIABLE(outer);
-      eigen_assert(outer==0);
-      return insertBack(inner);
-    }
-    inline Scalar& insertBack(Index i)
-    {
-      m_data.append(0, i);
-      return m_data.value(m_data.size()-1);
-    }
+  inline const StorageIndex* outerIndexPtr() const { return 0; }
+  inline StorageIndex* outerIndexPtr() { return 0; }
+  inline const StorageIndex* innerNonZeroPtr() const { return 0; }
+  inline StorageIndex* innerNonZeroPtr() { return 0; }
 
-    inline Scalar& insert(Index row, Index col)
-    {
-      eigen_assert(IsColVector ? (col==0 && row>=0 && row<m_size) : (row==0 && col>=0 && col<m_size));
-      
-      Index inner = IsColVector ? row : col;
-      Index outer = IsColVector ? col : row;
-      EIGEN_ONLY_USED_FOR_DEBUG(outer);
-      eigen_assert(outer==0);
-      return insert(inner);
-    }
-    Scalar& insert(Index i)
-    {
-      eigen_assert(i>=0 && i<m_size);
-      
-      Index startId = 0;
-      Index p = Index(m_data.size()) - 1;
-      // TODO smart realloc
-      m_data.resize(p+2,1);
-
-      while ( (p >= startId) && (m_data.index(p) > i) )
-      {
-        m_data.index(p+1) = m_data.index(p);
-        m_data.value(p+1) = m_data.value(p);
-        --p;
-      }
-      m_data.index(p+1) = i;
-      m_data.value(p+1) = 0;
-      return m_data.value(p+1);
-    }
+  /** \internal */
+  constexpr Storage& data() { return m_data; }
+  /** \internal */
+  constexpr const Storage& data() const { return m_data; }
 
-    /**
-      */
-    inline void reserve(Index reserveSize) { m_data.reserve(reserveSize); }
+  inline Scalar coeff(Index row, Index col) const {
+    eigen_assert(IsColVector ? (col == 0 && row >= 0 && row < m_size) : (row == 0 && col >= 0 && col < m_size));
+    return coeff(IsColVector ? row : col);
+  }
+  inline Scalar coeff(Index i) const {
+    eigen_assert(i >= 0 && i < m_size);
+    return m_data.at(StorageIndex(i));
+  }
 
+  inline Scalar& coeffRef(Index row, Index col) {
+    eigen_assert(IsColVector ? (col == 0 && row >= 0 && row < m_size) : (row == 0 && col >= 0 && col < m_size));
+    return coeffRef(IsColVector ? row : col);
+  }
 
-    inline void finalize() {}
+  /** \returns a reference to the coefficient value at given index \a i
+   * This operation involves a log(rho*size) binary search. If the coefficient does not
+   * exist yet, then a sorted insertion into a sequential buffer is performed.
+   *
+   * This insertion might be very costly if the number of nonzeros above \a i is large.
+   */
+  inline Scalar& coeffRef(Index i) {
+    eigen_assert(i >= 0 && i < m_size);
 
-    void prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision())
-    {
-      m_data.prune(reference,epsilon);
-    }
+    return m_data.atWithInsertion(StorageIndex(i));
+  }
 
-    void resize(Index rows, Index cols)
-    {
-      eigen_assert(rows==1 || cols==1);
-      resize(IsColVector ? rows : cols);
-    }
+ public:
+  typedef typename Base::InnerIterator InnerIterator;
+  typedef typename Base::ReverseInnerIterator ReverseInnerIterator;
 
-    void resize(Index newSize)
-    {
-      m_size = newSize;
-      m_data.clear();
-    }
+  inline void setZero() { m_data.clear(); }
+
+  /** \returns the number of non zero coefficients */
+  inline Index nonZeros() const { return m_data.size(); }
 
-    void resizeNonZeros(Index size) { m_data.resize(size); }
+  inline void startVec(Index outer) {
+    EIGEN_UNUSED_VARIABLE(outer);
+    eigen_assert(outer == 0);
+  }
 
-    inline SparseVector() : m_size(0) { check_template_parameters(); resize(0); }
+  inline Scalar& insertBackByOuterInner(Index outer, Index inner) {
+    EIGEN_UNUSED_VARIABLE(outer);
+    eigen_assert(outer == 0);
+    return insertBack(inner);
+  }
+  inline Scalar& insertBack(Index i) {
+    m_data.append(Scalar(0), i);
+    return m_data.value(m_data.size() - 1);
+  }
 
-    inline SparseVector(Index size) : m_size(0) { check_template_parameters(); resize(size); }
+  Scalar& insertBackByOuterInnerUnordered(Index outer, Index inner) {
+    EIGEN_UNUSED_VARIABLE(outer);
+    eigen_assert(outer == 0);
+    return insertBackUnordered(inner);
+  }
+  inline Scalar& insertBackUnordered(Index i) {
+    m_data.append(Scalar(0), i);
+    return m_data.value(m_data.size() - 1);
+  }
 
-    inline SparseVector(Index rows, Index cols) : m_size(0) { check_template_parameters(); resize(rows,cols); }
+  inline Scalar& insert(Index row, Index col) {
+    eigen_assert(IsColVector ? (col == 0 && row >= 0 && row < m_size) : (row == 0 && col >= 0 && col < m_size));
 
-    template<typename OtherDerived>
-    inline SparseVector(const SparseMatrixBase<OtherDerived>& other)
-      : m_size(0)
-    {
-      check_template_parameters();
-      *this = other.derived();
+    Index inner = IsColVector ? row : col;
+    Index outer = IsColVector ? col : row;
+    EIGEN_ONLY_USED_FOR_DEBUG(outer);
+    eigen_assert(outer == 0);
+    return insert(inner);
+  }
+  Scalar& insert(Index i) {
+    eigen_assert(i >= 0 && i < m_size);
+
+    Index startId = 0;
+    Index p = Index(m_data.size()) - 1;
+    // TODO smart realloc
+    m_data.resize(p + 2, 1);
+
+    while ((p >= startId) && (m_data.index(p) > i)) {
+      m_data.index(p + 1) = m_data.index(p);
+      m_data.value(p + 1) = m_data.value(p);
+      --p;
     }
+    m_data.index(p + 1) = convert_index(i);
+    m_data.value(p + 1) = Scalar(0);
+    return m_data.value(p + 1);
+  }
 
-    inline SparseVector(const SparseVector& other)
-      : SparseBase(other), m_size(0)
-    {
-      check_template_parameters();
-      *this = other.derived();
-    }
+  /**
+   */
+  inline void reserve(Index reserveSize) { m_data.reserve(reserveSize); }
 
-    /** Swaps the values of \c *this and \a other.
-      * Overloaded for performance: this version performs a \em shallow swap by swaping pointers and attributes only.
-      * \sa SparseMatrixBase::swap()
-      */
-    inline void swap(SparseVector& other)
-    {
-      std::swap(m_size, other.m_size);
-      m_data.swap(other.m_data);
-    }
+  inline void finalize() {}
 
-    inline SparseVector& operator=(const SparseVector& other)
-    {
-      if (other.isRValue())
-      {
-        swap(other.const_cast_derived());
-      }
-      else
-      {
-        resize(other.size());
-        m_data = other.m_data;
+  /** \copydoc SparseMatrix::prune(const Scalar&,const RealScalar&) */
+  Index prune(const Scalar& reference, const RealScalar& epsilon = NumTraits<RealScalar>::dummy_precision()) {
+    return prune([&](const Scalar& val) { return !internal::isMuchSmallerThan(val, reference, epsilon); });
+  }
+
+  /**
+   * \brief Prunes the entries of the vector based on a `predicate`
+   * \tparam F Type of the predicate.
+   * \param keep_predicate The predicate that is used to test whether a value should be kept. A callable that
+   * gets passed om a `Scalar` value and returns a boolean. If the predicate returns true, the value is kept.
+   * \return The new number of structural non-zeros.
+   */
+  template <class F>
+  Index prune(F&& keep_predicate) {
+    Index k = 0;
+    Index n = m_data.size();
+    for (Index i = 0; i < n; ++i) {
+      if (keep_predicate(m_data.value(i))) {
+        m_data.value(k) = std::move(m_data.value(i));
+        m_data.index(k) = m_data.index(i);
+        ++k;
       }
-      return *this;
     }
+    m_data.resize(k);
+    return k;
+  }
 
-    template<typename OtherDerived>
-    inline SparseVector& operator=(const SparseMatrixBase<OtherDerived>& other)
-    {
-      SparseVector tmp(other.size());
-      internal::sparse_vector_assign_selector<SparseVector,OtherDerived>::run(tmp,other.derived());
-      this->swap(tmp);
-      return *this;
-    }
+  /** Resizes the sparse vector to \a rows x \a cols
+   *
+   * This method is provided for compatibility with matrices.
+   * For a column vector, \a cols must be equal to 1.
+   * For a row vector, \a rows must be equal to 1.
+   *
+   * \sa resize(Index)
+   */
+  void resize(Index rows, Index cols) {
+    eigen_assert((IsColVector ? cols : rows) == 1 && "Outer dimension must equal 1");
+    resize(IsColVector ? rows : cols);
+  }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    template<typename Lhs, typename Rhs>
-    inline SparseVector& operator=(const SparseSparseProduct<Lhs,Rhs>& product)
-    {
-      return Base::operator=(product);
-    }
-    #endif
-
-    friend std::ostream & operator << (std::ostream & s, const SparseVector& m)
-    {
-      for (Index i=0; i<m.nonZeros(); ++i)
-        s << "(" << m.m_data.value(i) << "," << m.m_data.index(i) << ") ";
-      s << std::endl;
-      return s;
+  /** Resizes the sparse vector to \a newSize
+   * This method deletes all entries, thus leaving an empty sparse vector
+   *
+   * \sa  conservativeResize(), setZero() */
+  void resize(Index newSize) {
+    m_size = newSize;
+    m_data.clear();
+  }
+
+  /** Resizes the sparse vector to \a newSize, while leaving old values untouched.
+   *
+   * If the size of the vector is decreased, then the storage of the out-of bounds coefficients is kept and reserved.
+   * Call .data().squeeze() to free extra memory.
+   *
+   * \sa reserve(), setZero()
+   */
+  void conservativeResize(Index newSize) {
+    if (newSize < m_size) {
+      Index i = 0;
+      while (i < m_data.size() && m_data.index(i) < newSize) ++i;
+      m_data.resize(i);
     }
+    m_size = newSize;
+  }
 
-    /** Destructor */
-    inline ~SparseVector() {}
+  void resizeNonZeros(Index size) { m_data.resize(size); }
 
-    /** Overloaded for performance */
-    Scalar sum() const;
+  inline SparseVector() : m_size(0) { resize(0); }
 
-  public:
+  explicit inline SparseVector(Index size) : m_size(0) { resize(size); }
 
-    /** \internal \deprecated use setZero() and reserve() */
-    EIGEN_DEPRECATED void startFill(Index reserve)
-    {
-      setZero();
-      m_data.reserve(reserve);
-    }
+  inline SparseVector(Index rows, Index cols) : m_size(0) { resize(rows, cols); }
 
-    /** \internal \deprecated use insertBack(Index,Index) */
-    EIGEN_DEPRECATED Scalar& fill(Index r, Index c)
-    {
-      eigen_assert(r==0 || c==0);
-      return fill(IsColVector ? r : c);
-    }
+  template <typename OtherDerived>
+  inline SparseVector(const SparseMatrixBase<OtherDerived>& other) : m_size(0) {
+#ifdef EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+    EIGEN_SPARSE_CREATE_TEMPORARY_PLUGIN
+#endif
+    *this = other.derived();
+  }
 
-    /** \internal \deprecated use insertBack(Index) */
-    EIGEN_DEPRECATED Scalar& fill(Index i)
-    {
-      m_data.append(0, i);
-      return m_data.value(m_data.size()-1);
-    }
+  inline SparseVector(const SparseVector& other) : Base(other), m_size(0) { *this = other.derived(); }
 
-    /** \internal \deprecated use insert(Index,Index) */
-    EIGEN_DEPRECATED Scalar& fillrand(Index r, Index c)
-    {
-      eigen_assert(r==0 || c==0);
-      return fillrand(IsColVector ? r : c);
-    }
+  /** Swaps the values of \c *this and \a other.
+   * Overloaded for performance: this version performs a \em shallow swap by swapping pointers and attributes only.
+   * \sa SparseMatrixBase::swap()
+   */
+  inline void swap(SparseVector& other) {
+    std::swap(m_size, other.m_size);
+    m_data.swap(other.m_data);
+  }
+  friend EIGEN_DEVICE_FUNC void swap(SparseVector& a, SparseVector& b) { a.swap(b); }
 
-    /** \internal \deprecated use insert(Index) */
-    EIGEN_DEPRECATED Scalar& fillrand(Index i)
-    {
-      return insert(i);
-    }
+  template <int OtherOptions>
+  inline void swap(SparseMatrix<Scalar, OtherOptions, StorageIndex>& other) {
+    eigen_assert(other.outerSize() == 1);
+    std::swap(m_size, other.m_innerSize);
+    m_data.swap(other.m_data);
+  }
+  template <int OtherOptions>
+  friend EIGEN_DEVICE_FUNC void swap(SparseVector& a, SparseMatrix<Scalar, OtherOptions, StorageIndex>& b) {
+    a.swap(b);
+  }
+  template <int OtherOptions>
+  friend EIGEN_DEVICE_FUNC void swap(SparseMatrix<Scalar, OtherOptions, StorageIndex>& a, SparseVector& b) {
+    b.swap(a);
+  }
 
-    /** \internal \deprecated use finalize() */
-    EIGEN_DEPRECATED void endFill() {}
-    
-    // These two functions were here in the 3.1 release, so let's keep them in case some code rely on them.
-    /** \internal \deprecated use data() */
-    EIGEN_DEPRECATED Storage& _data() { return m_data; }
-    /** \internal \deprecated use data() */
-    EIGEN_DEPRECATED const Storage& _data() const { return m_data; }
-    
-#   ifdef EIGEN_SPARSEVECTOR_PLUGIN
-#     include EIGEN_SPARSEVECTOR_PLUGIN
-#   endif
-
-protected:
-  
-    static void check_template_parameters()
-    {
-      EIGEN_STATIC_ASSERT(NumTraits<Index>::IsSigned,THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE);
-      EIGEN_STATIC_ASSERT((_Options&(ColMajor|RowMajor))==Options,INVALID_MATRIX_TEMPLATE_PARAMETERS);
+  inline SparseVector& operator=(const SparseVector& other) {
+    if (other.isRValue()) {
+      swap(other.const_cast_derived());
+    } else {
+      resize(other.size());
+      m_data = other.m_data;
     }
-    
-    Storage m_data;
-    Index m_size;
-};
+    return *this;
+  }
 
-template<typename Scalar, int _Options, typename _Index>
-class SparseVector<Scalar,_Options,_Index>::InnerIterator
-{
-  public:
-    InnerIterator(const SparseVector& vec, Index outer=0)
-      : m_data(vec.m_data), m_id(0), m_end(static_cast<Index>(m_data.size()))
-    {
-      EIGEN_UNUSED_VARIABLE(outer);
-      eigen_assert(outer==0);
-    }
+  template <typename OtherDerived>
+  inline SparseVector& operator=(const SparseMatrixBase<OtherDerived>& other) {
+    SparseVector tmp(other.size());
+    internal::sparse_vector_assign_selector<SparseVector, OtherDerived>::run(tmp, other.derived());
+    this->swap(tmp);
+    return *this;
+  }
 
-    InnerIterator(const internal::CompressedStorage<Scalar,Index>& data)
-      : m_data(data), m_id(0), m_end(static_cast<Index>(m_data.size()))
-    {}
+  inline SparseVector(SparseVector&& other) : SparseVector() { this->swap(other); }
 
-    inline InnerIterator& operator++() { m_id++; return *this; }
+  template <typename OtherDerived>
+  inline SparseVector(SparseCompressedBase<OtherDerived>&& other) : SparseVector() {
+    *this = other.derived().markAsRValue();
+  }
 
-    inline Scalar value() const { return m_data.value(m_id); }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_data.value(m_id)); }
+  inline SparseVector& operator=(SparseVector&& other) {
+    this->swap(other);
+    return *this;
+  }
 
-    inline Index index() const { return m_data.index(m_id); }
-    inline Index row() const { return IsColVector ? index() : 0; }
-    inline Index col() const { return IsColVector ? 0 : index(); }
+  template <typename OtherDerived>
+  inline SparseVector& operator=(SparseCompressedBase<OtherDerived>&& other) {
+    *this = other.derived().markAsRValue();
+    return *this;
+  }
 
-    inline operator bool() const { return (m_id < m_end); }
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  template <typename Lhs, typename Rhs>
+  inline SparseVector& operator=(const SparseSparseProduct<Lhs, Rhs>& product) {
+    return Base::operator=(product);
+  }
+#endif
 
-  protected:
-    const internal::CompressedStorage<Scalar,Index>& m_data;
-    Index m_id;
-    const Index m_end;
-};
+#ifndef EIGEN_NO_IO
+  friend std::ostream& operator<<(std::ostream& s, const SparseVector& m) {
+    for (Index i = 0; i < m.nonZeros(); ++i) s << "(" << m.m_data.value(i) << "," << m.m_data.index(i) << ") ";
+    s << std::endl;
+    return s;
+  }
+#endif
 
-template<typename Scalar, int _Options, typename _Index>
-class SparseVector<Scalar,_Options,_Index>::ReverseInnerIterator
-{
-  public:
-    ReverseInnerIterator(const SparseVector& vec, Index outer=0)
-      : m_data(vec.m_data), m_id(static_cast<Index>(m_data.size())), m_start(0)
-    {
-      EIGEN_UNUSED_VARIABLE(outer);
-      eigen_assert(outer==0);
-    }
+  /** Destructor */
+  inline ~SparseVector() {}
+
+  /** Overloaded for performance */
+  Scalar sum() const;
+
+ public:
+  /** \internal \deprecated use setZero() and reserve() */
+  EIGEN_DEPRECATED_WITH_REASON("Use .setZero() and .reserve() instead.") void startFill(Index reserve) {
+    setZero();
+    m_data.reserve(reserve);
+  }
+
+  /** \internal \deprecated use insertBack(Index,Index) */
+  EIGEN_DEPRECATED_WITH_REASON("Use .insertBack() instead.") Scalar& fill(Index r, Index c) {
+    eigen_assert(r == 0 || c == 0);
+    return fill(IsColVector ? r : c);
+  }
+
+  /** \internal \deprecated use insertBack(Index) */
+  EIGEN_DEPRECATED_WITH_REASON("Use .insertBack() instead.") Scalar& fill(Index i) {
+    m_data.append(Scalar(0), i);
+    return m_data.value(m_data.size() - 1);
+  }
+
+  /** \internal \deprecated use insert(Index,Index) */
+  EIGEN_DEPRECATED_WITH_REASON("Use .insert() instead.") Scalar& fillrand(Index r, Index c) {
+    eigen_assert(r == 0 || c == 0);
+    return fillrand(IsColVector ? r : c);
+  }
 
-    ReverseInnerIterator(const internal::CompressedStorage<Scalar,Index>& data)
-      : m_data(data), m_id(static_cast<Index>(m_data.size())), m_start(0)
-    {}
+  /** \internal \deprecated use insert(Index) */
+  EIGEN_DEPRECATED_WITH_REASON("Use .insert() instead.") Scalar& fillrand(Index i) { return insert(i); }
 
-    inline ReverseInnerIterator& operator--() { m_id--; return *this; }
+  /** \internal \deprecated use finalize() */
+  EIGEN_DEPRECATED_WITH_REASON("Use .finalize() instead.") void endFill() {}
 
-    inline Scalar value() const { return m_data.value(m_id-1); }
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_data.value(m_id-1)); }
+  // These two functions were here in the 3.1 release, so let's keep them in case some code rely on them.
+  /** \internal \deprecated use data() */
+  EIGEN_DEPRECATED_WITH_REASON("Use .data() instead.") Storage& _data() { return m_data; }
+  /** \internal \deprecated use data() */
+  EIGEN_DEPRECATED_WITH_REASON("Use .data() instead.") const Storage& _data() const { return m_data; }
 
-    inline Index index() const { return m_data.index(m_id-1); }
-    inline Index row() const { return IsColVector ? index() : 0; }
-    inline Index col() const { return IsColVector ? 0 : index(); }
+#ifdef EIGEN_SPARSEVECTOR_PLUGIN
+#include EIGEN_SPARSEVECTOR_PLUGIN
+#endif
 
-    inline operator bool() const { return (m_id > m_start); }
+ protected:
+  EIGEN_STATIC_ASSERT(NumTraits<StorageIndex>::IsSigned, THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE)
+  EIGEN_STATIC_ASSERT((Options_ & (ColMajor | RowMajor)) == Options, INVALID_MATRIX_TEMPLATE_PARAMETERS)
 
-  protected:
-    const internal::CompressedStorage<Scalar,Index>& m_data;
-    Index m_id;
-    const Index m_start;
+  Storage m_data;
+  Index m_size;
 };
 
 namespace internal {
 
-template< typename Dest, typename Src>
-struct sparse_vector_assign_selector<Dest,Src,SVA_Inner> {
+template <typename Scalar_, int Options_, typename Index_>
+struct evaluator<SparseVector<Scalar_, Options_, Index_> > : evaluator_base<SparseVector<Scalar_, Options_, Index_> > {
+  typedef SparseVector<Scalar_, Options_, Index_> SparseVectorType;
+  typedef evaluator_base<SparseVectorType> Base;
+  typedef typename SparseVectorType::InnerIterator InnerIterator;
+  typedef typename SparseVectorType::ReverseInnerIterator ReverseInnerIterator;
+
+  enum { CoeffReadCost = NumTraits<Scalar_>::ReadCost, Flags = SparseVectorType::Flags };
+
+  evaluator() : Base() {}
+
+  explicit evaluator(const SparseVectorType& mat) : m_matrix(&mat) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); }
+
+  inline Index nonZerosEstimate() const { return m_matrix->nonZeros(); }
+
+  operator SparseVectorType&() { return m_matrix->const_cast_derived(); }
+  operator const SparseVectorType&() const { return *m_matrix; }
+
+  const SparseVectorType* m_matrix;
+};
+
+template <typename Dest, typename Src>
+struct sparse_vector_assign_selector<Dest, Src, SVA_Inner> {
   static void run(Dest& dst, const Src& src) {
-    eigen_internal_assert(src.innerSize()==src.size());
-    for(typename Src::InnerIterator it(src, 0); it; ++it)
-      dst.insert(it.index()) = it.value();
+    eigen_internal_assert(src.innerSize() == src.size());
+    typedef internal::evaluator<Src> SrcEvaluatorType;
+    SrcEvaluatorType srcEval(src);
+    for (typename SrcEvaluatorType::InnerIterator it(srcEval, 0); it; ++it) dst.insert(it.index()) = it.value();
   }
 };
 
-template< typename Dest, typename Src>
-struct sparse_vector_assign_selector<Dest,Src,SVA_Outer> {
+template <typename Dest, typename Src>
+struct sparse_vector_assign_selector<Dest, Src, SVA_Outer> {
   static void run(Dest& dst, const Src& src) {
-    eigen_internal_assert(src.outerSize()==src.size());
-    for(typename Dest::Index i=0; i<src.size(); ++i)
-    {
-      typename Src::InnerIterator it(src, i);
-      if(it)
-        dst.insert(i) = it.value();
+    eigen_internal_assert(src.outerSize() == src.size());
+    typedef internal::evaluator<Src> SrcEvaluatorType;
+    SrcEvaluatorType srcEval(src);
+    for (Index i = 0; i < src.size(); ++i) {
+      typename SrcEvaluatorType::InnerIterator it(srcEval, i);
+      if (it) dst.insert(i) = it.value();
     }
   }
 };
 
-template< typename Dest, typename Src>
-struct sparse_vector_assign_selector<Dest,Src,SVA_RuntimeSwitch> {
+template <typename Dest, typename Src>
+struct sparse_vector_assign_selector<Dest, Src, SVA_RuntimeSwitch> {
   static void run(Dest& dst, const Src& src) {
-    if(src.outerSize()==1)  sparse_vector_assign_selector<Dest,Src,SVA_Inner>::run(dst, src);
-    else                    sparse_vector_assign_selector<Dest,Src,SVA_Outer>::run(dst, src);
+    if (src.outerSize() == 1)
+      sparse_vector_assign_selector<Dest, Src, SVA_Inner>::run(dst, src);
+    else
+      sparse_vector_assign_selector<Dest, Src, SVA_Outer>::run(dst, src);
   }
 };
 
-}
+}  // namespace internal
+
+// Specialization for SparseVector.
+// Serializes [size, numNonZeros, innerIndices, values].
+template <typename Scalar, int Options, typename StorageIndex>
+class Serializer<SparseVector<Scalar, Options, StorageIndex>, void> {
+ public:
+  typedef SparseVector<Scalar, Options, StorageIndex> SparseMat;
+
+  struct Header {
+    typename SparseMat::Index size;
+    Index num_non_zeros;
+  };
+
+  EIGEN_DEVICE_FUNC size_t size(const SparseMat& value) const {
+    return sizeof(Header) + (sizeof(Scalar) + sizeof(StorageIndex)) * value.nonZeros();
+  }
+
+  EIGEN_DEVICE_FUNC uint8_t* serialize(uint8_t* dest, uint8_t* end, const SparseMat& value) {
+    if (EIGEN_PREDICT_FALSE(dest == nullptr)) return nullptr;
+    if (EIGEN_PREDICT_FALSE(dest + size(value) > end)) return nullptr;
+
+    const size_t header_bytes = sizeof(Header);
+    Header header = {value.innerSize(), value.nonZeros()};
+    EIGEN_USING_STD(memcpy)
+    memcpy(dest, &header, header_bytes);
+    dest += header_bytes;
+
+    // Inner indices.
+    std::size_t data_bytes = sizeof(StorageIndex) * header.num_non_zeros;
+    memcpy(dest, value.innerIndexPtr(), data_bytes);
+    dest += data_bytes;
+
+    // Values.
+    data_bytes = sizeof(Scalar) * header.num_non_zeros;
+    memcpy(dest, value.valuePtr(), data_bytes);
+    dest += data_bytes;
+
+    return dest;
+  }
+
+  EIGEN_DEVICE_FUNC const uint8_t* deserialize(const uint8_t* src, const uint8_t* end, SparseMat& value) const {
+    if (EIGEN_PREDICT_FALSE(src == nullptr)) return nullptr;
+    if (EIGEN_PREDICT_FALSE(src + sizeof(Header) > end)) return nullptr;
+
+    const size_t header_bytes = sizeof(Header);
+    Header header;
+    EIGEN_USING_STD(memcpy)
+    memcpy(&header, src, header_bytes);
+    src += header_bytes;
+
+    value.setZero();
+    value.resize(header.size);
+    value.resizeNonZeros(header.num_non_zeros);
+
+    // Inner indices.
+    std::size_t data_bytes = sizeof(StorageIndex) * header.num_non_zeros;
+    if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr;
+    memcpy(value.innerIndexPtr(), src, data_bytes);
+    src += data_bytes;
+
+    // Values.
+    data_bytes = sizeof(Scalar) * header.num_non_zeros;
+    if (EIGEN_PREDICT_FALSE(src + data_bytes > end)) return nullptr;
+    memcpy(value.valuePtr(), src, data_bytes);
+    src += data_bytes;
+    return src;
+  }
+};
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSEVECTOR_H
+#endif  // EIGEN_SPARSEVECTOR_H
diff --git a/inst/include/Eigen/src/SparseCore/SparseView.h b/inst/include/Eigen/src/SparseCore/SparseView.h
index fd845046..7220beea 100644
--- a/inst/include/Eigen/src/SparseCore/SparseView.h
+++ b/inst/include/Eigen/src/SparseCore/SparseView.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 // Copyright (C) 2010 Daniel Lowengrub <lowdanie@gmail.com>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,35 +11,49 @@
 #ifndef EIGEN_SPARSEVIEW_H
 #define EIGEN_SPARSEVIEW_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
-template<typename MatrixType>
-struct traits<SparseView<MatrixType> > : traits<MatrixType>
-{
-  typedef typename MatrixType::Index Index;
+template <typename MatrixType>
+struct traits<SparseView<MatrixType> > : traits<MatrixType> {
+  typedef typename MatrixType::StorageIndex StorageIndex;
   typedef Sparse StorageKind;
-  enum {
-    Flags = int(traits<MatrixType>::Flags) & (RowMajorBit)
-  };
+  enum { Flags = int(traits<MatrixType>::Flags) & (RowMajorBit) };
 };
 
-} // end namespace internal
-
-template<typename MatrixType>
-class SparseView : public SparseMatrixBase<SparseView<MatrixType> >
-{
+}  // end namespace internal
+
+/** \ingroup SparseCore_Module
+ * \class SparseView
+ *
+ * \brief Expression of a dense or sparse matrix with zero or too small values removed
+ *
+ * \tparam MatrixType the type of the object of which we are removing the small entries
+ *
+ * This class represents an expression of a given dense or sparse matrix with
+ * entries smaller than \c reference * \c epsilon are removed.
+ * It is the return type of MatrixBase::sparseView() and SparseMatrixBase::pruned()
+ * and most of the time this is the only way it is used.
+ *
+ * \sa MatrixBase::sparseView(), SparseMatrixBase::pruned()
+ */
+template <typename MatrixType>
+class SparseView : public SparseMatrixBase<SparseView<MatrixType> > {
   typedef typename MatrixType::Nested MatrixTypeNested;
-  typedef typename internal::remove_all<MatrixTypeNested>::type _MatrixTypeNested;
-public:
-  EIGEN_SPARSE_PUBLIC_INTERFACE(SparseView)
+  typedef internal::remove_all_t<MatrixTypeNested> MatrixTypeNested_;
+  typedef SparseMatrixBase<SparseView> Base;
 
-  SparseView(const MatrixType& mat, const Scalar& m_reference = Scalar(0),
-             typename NumTraits<Scalar>::Real m_epsilon = NumTraits<Scalar>::dummy_precision()) : 
-    m_matrix(mat), m_reference(m_reference), m_epsilon(m_epsilon) {}
+ public:
+  EIGEN_SPARSE_PUBLIC_INTERFACE(SparseView)
+  typedef internal::remove_all_t<MatrixType> NestedExpression;
 
-  class InnerIterator;
+  explicit SparseView(const MatrixType& mat, const Scalar& reference = Scalar(0),
+                      const RealScalar& epsilon = NumTraits<Scalar>::dummy_precision())
+      : m_matrix(mat), m_reference(reference), m_epsilon(epsilon) {}
 
   inline Index rows() const { return m_matrix.rows(); }
   inline Index cols() const { return m_matrix.cols(); }
@@ -47,53 +61,165 @@ class SparseView : public SparseMatrixBase<SparseView<MatrixType> >
   inline Index innerSize() const { return m_matrix.innerSize(); }
   inline Index outerSize() const { return m_matrix.outerSize(); }
 
-protected:
+  /** \returns the nested expression */
+  const internal::remove_all_t<MatrixTypeNested>& nestedExpression() const { return m_matrix; }
+
+  Scalar reference() const { return m_reference; }
+  RealScalar epsilon() const { return m_epsilon; }
+
+ protected:
   MatrixTypeNested m_matrix;
   Scalar m_reference;
-  typename NumTraits<Scalar>::Real m_epsilon;
+  RealScalar m_epsilon;
 };
 
-template<typename MatrixType>
-class SparseView<MatrixType>::InnerIterator : public _MatrixTypeNested::InnerIterator
-{
-  typedef typename SparseView::Index Index;
-public:
-  typedef typename _MatrixTypeNested::InnerIterator IterBase;
-  InnerIterator(const SparseView& view, Index outer) :
-  IterBase(view.m_matrix, outer), m_view(view)
-  {
-    incrementToNonZero();
-  }
-
-  EIGEN_STRONG_INLINE InnerIterator& operator++()
-  {
-    IterBase::operator++();
-    incrementToNonZero();
-    return *this;
-  }
-
-  using IterBase::value;
-
-protected:
-  const SparseView& m_view;
-
-private:
-  void incrementToNonZero()
-  {
-    while((bool(*this)) && internal::isMuchSmallerThan(value(), m_view.m_reference, m_view.m_epsilon))
-    {
-      IterBase::operator++();
+namespace internal {
+
+// TODO find a way to unify the two following variants
+// This is tricky because implementing an inner iterator on top of an IndexBased evaluator is
+// not easy because the evaluators do not expose the sizes of the underlying expression.
+
+template <typename ArgType>
+struct unary_evaluator<SparseView<ArgType>, IteratorBased> : public evaluator_base<SparseView<ArgType> > {
+  typedef typename evaluator<ArgType>::InnerIterator EvalIterator;
+
+ public:
+  typedef SparseView<ArgType> XprType;
+
+  class InnerIterator : public EvalIterator {
+   protected:
+    typedef typename XprType::Scalar Scalar;
+
+   public:
+    EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& sve, Index outer)
+        : EvalIterator(sve.m_argImpl, outer), m_view(sve.m_view) {
+      incrementToNonZero();
+    }
+
+    EIGEN_STRONG_INLINE InnerIterator& operator++() {
+      EvalIterator::operator++();
+      incrementToNonZero();
+      return *this;
+    }
+
+    using EvalIterator::value;
+
+   protected:
+    const XprType& m_view;
+
+   private:
+    void incrementToNonZero() {
+      while ((bool(*this)) && internal::isMuchSmallerThan(value(), m_view.reference(), m_view.epsilon())) {
+        EvalIterator::operator++();
+      }
     }
-  }
+  };
+
+  enum { CoeffReadCost = evaluator<ArgType>::CoeffReadCost, Flags = XprType::Flags };
+
+  explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_view(xpr) {}
+
+ protected:
+  evaluator<ArgType> m_argImpl;
+  const XprType& m_view;
+};
+
+template <typename ArgType>
+struct unary_evaluator<SparseView<ArgType>, IndexBased> : public evaluator_base<SparseView<ArgType> > {
+ public:
+  typedef SparseView<ArgType> XprType;
+
+ protected:
+  enum { IsRowMajor = (XprType::Flags & RowMajorBit) == RowMajorBit };
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::StorageIndex StorageIndex;
+
+ public:
+  class InnerIterator {
+   public:
+    EIGEN_STRONG_INLINE InnerIterator(const unary_evaluator& sve, Index outer)
+        : m_sve(sve), m_inner(0), m_outer(outer), m_end(sve.m_view.innerSize()) {
+      incrementToNonZero();
+    }
+
+    EIGEN_STRONG_INLINE InnerIterator& operator++() {
+      m_inner++;
+      incrementToNonZero();
+      return *this;
+    }
+
+    EIGEN_STRONG_INLINE Scalar value() const {
+      return (IsRowMajor) ? m_sve.m_argImpl.coeff(m_outer, m_inner) : m_sve.m_argImpl.coeff(m_inner, m_outer);
+    }
+
+    EIGEN_STRONG_INLINE StorageIndex index() const { return m_inner; }
+    inline Index row() const { return IsRowMajor ? m_outer : index(); }
+    inline Index col() const { return IsRowMajor ? index() : m_outer; }
+
+    EIGEN_STRONG_INLINE operator bool() const { return m_inner < m_end && m_inner >= 0; }
+
+   protected:
+    const unary_evaluator& m_sve;
+    Index m_inner;
+    const Index m_outer;
+    const Index m_end;
+
+   private:
+    void incrementToNonZero() {
+      while ((bool(*this)) && internal::isMuchSmallerThan(value(), m_sve.m_view.reference(), m_sve.m_view.epsilon())) {
+        m_inner++;
+      }
+    }
+  };
+
+  enum { CoeffReadCost = evaluator<ArgType>::CoeffReadCost, Flags = XprType::Flags };
+
+  explicit unary_evaluator(const XprType& xpr) : m_argImpl(xpr.nestedExpression()), m_view(xpr) {}
+
+ protected:
+  evaluator<ArgType> m_argImpl;
+  const XprType& m_view;
 };
 
-template<typename Derived>
-const SparseView<Derived> MatrixBase<Derived>::sparseView(const Scalar& m_reference,
-                                                          const typename NumTraits<Scalar>::Real& m_epsilon) const
-{
-  return SparseView<Derived>(derived(), m_reference, m_epsilon);
+}  // end namespace internal
+
+/** \ingroup SparseCore_Module
+ *
+ * \returns a sparse expression of the dense expression \c *this with values smaller than
+ * \a reference * \a epsilon removed.
+ *
+ * This method is typically used when prototyping to convert a quickly assembled dense Matrix \c D to a SparseMatrix \c
+ * S: \code MatrixXd D(n,m); SparseMatrix<double> S; S = D.sparseView();             // suppress numerical zeros (exact)
+ * S = D.sparseView(reference);
+ * S = D.sparseView(reference,epsilon);
+ * \endcode
+ * where \a reference is a meaningful non zero reference value,
+ * and \a epsilon is a tolerance factor defaulting to NumTraits<Scalar>::dummy_precision().
+ *
+ * \sa SparseMatrixBase::pruned(), class SparseView */
+template <typename Derived>
+const SparseView<Derived> MatrixBase<Derived>::sparseView(const Scalar& reference,
+                                                          const typename NumTraits<Scalar>::Real& epsilon) const {
+  return SparseView<Derived>(derived(), reference, epsilon);
+}
+
+/** \returns an expression of \c *this with values smaller than
+ * \a reference * \a epsilon removed.
+ *
+ * This method is typically used in conjunction with the product of two sparse matrices
+ * to automatically prune the smallest values as follows:
+ * \code
+ * C = (A*B).pruned();             // suppress numerical zeros (exact)
+ * C = (A*B).pruned(ref);
+ * C = (A*B).pruned(ref,epsilon);
+ * \endcode
+ * where \c ref is a meaningful non zero reference value.
+ * */
+template <typename Derived>
+const SparseView<Derived> SparseMatrixBase<Derived>::pruned(const Scalar& reference, const RealScalar& epsilon) const {
+  return SparseView<Derived>(derived(), reference, epsilon);
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
 #endif
diff --git a/inst/include/Eigen/src/SparseCore/TriangularSolver.h b/inst/include/Eigen/src/SparseCore/TriangularSolver.h
index ccc12af7..684de483 100644
--- a/inst/include/Eigen/src/SparseCore/TriangularSolver.h
+++ b/inst/include/Eigen/src/SparseCore/TriangularSolver.h
@@ -10,47 +10,44 @@
 #ifndef EIGEN_SPARSETRIANGULARSOLVER_H
 #define EIGEN_SPARSETRIANGULARSOLVER_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
-template<typename Lhs, typename Rhs, int Mode,
-  int UpLo = (Mode & Lower)
-           ? Lower
-           : (Mode & Upper)
-           ? Upper
-           : -1,
-  int StorageOrder = int(traits<Lhs>::Flags) & RowMajorBit>
+template <typename Lhs, typename Rhs, int Mode,
+          int UpLo = (Mode & Lower)   ? Lower
+                     : (Mode & Upper) ? Upper
+                                      : -1,
+          int StorageOrder = int(traits<Lhs>::Flags) & RowMajorBit>
 struct sparse_solve_triangular_selector;
 
 // forward substitution, row-major
-template<typename Lhs, typename Rhs, int Mode>
-struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Lower,RowMajor>
-{
+template <typename Lhs, typename Rhs, int Mode>
+struct sparse_solve_triangular_selector<Lhs, Rhs, Mode, Lower, RowMajor> {
   typedef typename Rhs::Scalar Scalar;
-  static void run(const Lhs& lhs, Rhs& other)
-  {
-    for(int col=0 ; col<other.cols() ; ++col)
-    {
-      for(int i=0; i<lhs.rows(); ++i)
-      {
-        Scalar tmp = other.coeff(i,col);
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
+  static void run(const Lhs& lhs, Rhs& other) {
+    LhsEval lhsEval(lhs);
+    for (Index col = 0; col < other.cols(); ++col) {
+      for (Index i = 0; i < lhs.rows(); ++i) {
+        Scalar tmp = other.coeff(i, col);
         Scalar lastVal(0);
-        int lastIndex = 0;
-        for(typename Lhs::InnerIterator it(lhs, i); it; ++it)
-        {
+        Index lastIndex = 0;
+        for (LhsIterator it(lhsEval, i); it; ++it) {
           lastVal = it.value();
           lastIndex = it.index();
-          if(lastIndex==i)
-            break;
-          tmp -= lastVal * other.coeff(lastIndex,col);
+          if (lastIndex == i) break;
+          tmp = numext::madd<Scalar>(-lastVal, other.coeff(lastIndex, col), tmp);
         }
         if (Mode & UnitDiag)
-          other.coeffRef(i,col) = tmp;
-        else
-        {
-          eigen_assert(lastIndex==i);
-          other.coeffRef(i,col) = tmp/lastVal;
+          other.coeffRef(i, col) = tmp;
+        else {
+          eigen_assert(lastIndex == i);
+          other.coeffRef(i, col) = tmp / lastVal;
         }
       }
     }
@@ -58,69 +55,61 @@ struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Lower,RowMajor>
 };
 
 // backward substitution, row-major
-template<typename Lhs, typename Rhs, int Mode>
-struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Upper,RowMajor>
-{
+template <typename Lhs, typename Rhs, int Mode>
+struct sparse_solve_triangular_selector<Lhs, Rhs, Mode, Upper, RowMajor> {
   typedef typename Rhs::Scalar Scalar;
-  static void run(const Lhs& lhs, Rhs& other)
-  {
-    for(int col=0 ; col<other.cols() ; ++col)
-    {
-      for(int i=lhs.rows()-1 ; i>=0 ; --i)
-      {
-        Scalar tmp = other.coeff(i,col);
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
+  static void run(const Lhs& lhs, Rhs& other) {
+    LhsEval lhsEval(lhs);
+    for (Index col = 0; col < other.cols(); ++col) {
+      for (Index i = lhs.rows() - 1; i >= 0; --i) {
+        Scalar tmp = other.coeff(i, col);
         Scalar l_ii(0);
-        typename Lhs::InnerIterator it(lhs, i);
-        while(it && it.index()<i)
-          ++it;
-        if(!(Mode & UnitDiag))
-        {
-          eigen_assert(it && it.index()==i);
+        LhsIterator it(lhsEval, i);
+        while (it && it.index() < i) ++it;
+        if (!(Mode & UnitDiag)) {
+          eigen_assert(it && it.index() == i);
           l_ii = it.value();
           ++it;
-        }
-        else if (it && it.index() == i)
+        } else if (it && it.index() == i)
           ++it;
-        for(; it; ++it)
-        {
-          tmp -= it.value() * other.coeff(it.index(),col);
+        for (; it; ++it) {
+          tmp = numext::madd<Scalar>(-it.value(), other.coeff(it.index(), col), tmp);
         }
 
         if (Mode & UnitDiag)
-          other.coeffRef(i,col) = tmp;
+          other.coeffRef(i, col) = tmp;
         else
-          other.coeffRef(i,col) = tmp/l_ii;
+          other.coeffRef(i, col) = tmp / l_ii;
       }
     }
   }
 };
 
 // forward substitution, col-major
-template<typename Lhs, typename Rhs, int Mode>
-struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Lower,ColMajor>
-{
+template <typename Lhs, typename Rhs, int Mode>
+struct sparse_solve_triangular_selector<Lhs, Rhs, Mode, Lower, ColMajor> {
   typedef typename Rhs::Scalar Scalar;
-  static void run(const Lhs& lhs, Rhs& other)
-  {
-    for(int col=0 ; col<other.cols() ; ++col)
-    {
-      for(int i=0; i<lhs.cols(); ++i)
-      {
-        Scalar& tmp = other.coeffRef(i,col);
-        if (tmp!=Scalar(0)) // optimization when other is actually sparse
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
+  static void run(const Lhs& lhs, Rhs& other) {
+    LhsEval lhsEval(lhs);
+    for (Index col = 0; col < other.cols(); ++col) {
+      for (Index i = 0; i < lhs.cols(); ++i) {
+        Scalar& tmp = other.coeffRef(i, col);
+        if (!numext::is_exactly_zero(tmp))  // optimization when other is actually sparse
         {
-          typename Lhs::InnerIterator it(lhs, i);
-          while(it && it.index()<i)
-            ++it;
-          if(!(Mode & UnitDiag))
-          {
-            eigen_assert(it && it.index()==i);
+          LhsIterator it(lhsEval, i);
+          while (it && it.index() < i) ++it;
+          if (!(Mode & UnitDiag)) {
+            eigen_assert(it && it.index() == i);
             tmp /= it.value();
           }
-          if (it && it.index()==i)
-            ++it;
-          for(; it; ++it)
-            other.coeffRef(it.index(), col) -= tmp * it.value();
+          if (it && it.index() == i) ++it;
+          for (; it; ++it) {
+            other.coeffRef(it.index(), col) = numext::madd<Scalar>(-tmp, it.value(), other.coeffRef(it.index(), col));
+          }
         }
       }
     }
@@ -128,207 +117,158 @@ struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Lower,ColMajor>
 };
 
 // backward substitution, col-major
-template<typename Lhs, typename Rhs, int Mode>
-struct sparse_solve_triangular_selector<Lhs,Rhs,Mode,Upper,ColMajor>
-{
+template <typename Lhs, typename Rhs, int Mode>
+struct sparse_solve_triangular_selector<Lhs, Rhs, Mode, Upper, ColMajor> {
   typedef typename Rhs::Scalar Scalar;
-  static void run(const Lhs& lhs, Rhs& other)
-  {
-    for(int col=0 ; col<other.cols() ; ++col)
-    {
-      for(int i=lhs.cols()-1; i>=0; --i)
-      {
-        Scalar& tmp = other.coeffRef(i,col);
-        if (tmp!=Scalar(0)) // optimization when other is actually sparse
+  typedef evaluator<Lhs> LhsEval;
+  typedef typename evaluator<Lhs>::InnerIterator LhsIterator;
+  static void run(const Lhs& lhs, Rhs& other) {
+    LhsEval lhsEval(lhs);
+    for (Index col = 0; col < other.cols(); ++col) {
+      for (Index i = lhs.cols() - 1; i >= 0; --i) {
+        Scalar& tmp = other.coeffRef(i, col);
+        if (!numext::is_exactly_zero(tmp))  // optimization when other is actually sparse
         {
-          if(!(Mode & UnitDiag))
-          {
+          if (!(Mode & UnitDiag)) {
             // TODO replace this by a binary search. make sure the binary search is safe for partially sorted elements
-            typename Lhs::ReverseInnerIterator it(lhs, i);
-            while(it && it.index()!=i)
-              --it;
-            eigen_assert(it && it.index()==i);
-            other.coeffRef(i,col) /= it.value();
+            LhsIterator it(lhsEval, i);
+            while (it && it.index() != i) ++it;
+            eigen_assert(it && it.index() == i);
+            other.coeffRef(i, col) /= it.value();
+          }
+          LhsIterator it(lhsEval, i);
+          for (; it && it.index() < i; ++it) {
+            other.coeffRef(it.index(), col) = numext::madd<Scalar>(-tmp, it.value(), other.coeffRef(it.index(), col));
           }
-          typename Lhs::InnerIterator it(lhs, i);
-          for(; it && it.index()<i; ++it)
-            other.coeffRef(it.index(), col) -= tmp * it.value();
         }
       }
     }
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
 
-template<typename ExpressionType,int Mode>
-template<typename OtherDerived>
-void SparseTriangularView<ExpressionType,Mode>::solveInPlace(MatrixBase<OtherDerived>& other) const
-{
-  eigen_assert(m_matrix.cols() == m_matrix.rows() && m_matrix.cols() == other.rows());
-  eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
+template <typename ExpressionType, unsigned int Mode>
+template <typename OtherDerived>
+void TriangularViewImpl<ExpressionType, Mode, Sparse>::solveInPlace(MatrixBase<OtherDerived>& other) const {
+  eigen_assert(derived().cols() == derived().rows() && derived().cols() == other.rows());
+  eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper | Lower)));
 
   enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit };
 
-  typedef typename internal::conditional<copy,
-    typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::type OtherCopy;
+  typedef std::conditional_t<copy, typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>
+      OtherCopy;
   OtherCopy otherCopy(other.derived());
 
-  internal::sparse_solve_triangular_selector<ExpressionType, typename internal::remove_reference<OtherCopy>::type, Mode>::run(m_matrix, otherCopy);
+  internal::sparse_solve_triangular_selector<ExpressionType, std::remove_reference_t<OtherCopy>, Mode>::run(
+      derived().nestedExpression(), otherCopy);
 
-  if (copy)
-    other = otherCopy;
-}
-
-template<typename ExpressionType,int Mode>
-template<typename OtherDerived>
-typename internal::plain_matrix_type_column_major<OtherDerived>::type
-SparseTriangularView<ExpressionType,Mode>::solve(const MatrixBase<OtherDerived>& other) const
-{
-  typename internal::plain_matrix_type_column_major<OtherDerived>::type res(other);
-  solveInPlace(res);
-  return res;
+  if (copy) other = otherCopy;
 }
+#endif
 
 // pure sparse path
 
 namespace internal {
 
-template<typename Lhs, typename Rhs, int Mode,
-  int UpLo = (Mode & Lower)
-           ? Lower
-           : (Mode & Upper)
-           ? Upper
-           : -1,
-  int StorageOrder = int(Lhs::Flags) & (RowMajorBit)>
+template <typename Lhs, typename Rhs, int Mode,
+          int UpLo = (Mode & Lower)   ? Lower
+                     : (Mode & Upper) ? Upper
+                                      : -1,
+          int StorageOrder = int(Lhs::Flags) & (RowMajorBit)>
 struct sparse_solve_triangular_sparse_selector;
 
 // forward substitution, col-major
-template<typename Lhs, typename Rhs, int Mode, int UpLo>
-struct sparse_solve_triangular_sparse_selector<Lhs,Rhs,Mode,UpLo,ColMajor>
-{
+template <typename Lhs, typename Rhs, int Mode, int UpLo>
+struct sparse_solve_triangular_sparse_selector<Lhs, Rhs, Mode, UpLo, ColMajor> {
   typedef typename Rhs::Scalar Scalar;
-  typedef typename promote_index_type<typename traits<Lhs>::Index,
-                                         typename traits<Rhs>::Index>::type Index;
-  static void run(const Lhs& lhs, Rhs& other)
-  {
-    const bool IsLower = (UpLo==Lower);
-    AmbiVector<Scalar,Index> tempVector(other.rows()*2);
-    tempVector.setBounds(0,other.rows());
+  typedef typename promote_index_type<typename traits<Lhs>::StorageIndex, typename traits<Rhs>::StorageIndex>::type
+      StorageIndex;
+  static void run(const Lhs& lhs, Rhs& other) {
+    const bool IsLower = (UpLo == Lower);
+    AmbiVector<Scalar, StorageIndex> tempVector(other.rows() * 2);
+    tempVector.setBounds(0, other.rows());
 
     Rhs res(other.rows(), other.cols());
     res.reserve(other.nonZeros());
 
-    for(int col=0 ; col<other.cols() ; ++col)
-    {
+    for (Index col = 0; col < other.cols(); ++col) {
       // FIXME estimate number of non zeros
-      tempVector.init(.99/*float(other.col(col).nonZeros())/float(other.rows())*/);
+      tempVector.init(.99 /*float(other.col(col).nonZeros())/float(other.rows())*/);
       tempVector.setZero();
       tempVector.restart();
-      for (typename Rhs::InnerIterator rhsIt(other, col); rhsIt; ++rhsIt)
-      {
+      for (typename Rhs::InnerIterator rhsIt(other, col); rhsIt; ++rhsIt) {
         tempVector.coeffRef(rhsIt.index()) = rhsIt.value();
       }
 
-      for(int i=IsLower?0:lhs.cols()-1;
-          IsLower?i<lhs.cols():i>=0;
-          i+=IsLower?1:-1)
-      {
+      for (Index i = IsLower ? 0 : lhs.cols() - 1; IsLower ? i < lhs.cols() : i >= 0; i += IsLower ? 1 : -1) {
         tempVector.restart();
         Scalar& ci = tempVector.coeffRef(i);
-        if (ci!=Scalar(0))
-        {
+        if (!numext::is_exactly_zero(ci)) {
           // find
           typename Lhs::InnerIterator it(lhs, i);
-          if(!(Mode & UnitDiag))
-          {
-            if (IsLower)
-            {
-              eigen_assert(it.index()==i);
+          if (!(Mode & UnitDiag)) {
+            if (IsLower) {
+              eigen_assert(it.index() == i);
               ci /= it.value();
-            }
-            else
-              ci /= lhs.coeff(i,i);
+            } else
+              ci /= lhs.coeff(i, i);
           }
           tempVector.restart();
-          if (IsLower)
-          {
-            if (it.index()==i)
-              ++it;
-            for(; it; ++it)
-              tempVector.coeffRef(it.index()) -= ci * it.value();
-          }
-          else
-          {
-            for(; it && it.index()<i; ++it)
-              tempVector.coeffRef(it.index()) -= ci * it.value();
+          if (IsLower) {
+            if (it.index() == i) ++it;
+            for (; it; ++it) {
+              tempVector.coeffRef(it.index()) = numext::madd<Scalar>(-ci, it.value(), tempVector.coeffRef(it.index()));
+            }
+          } else {
+            for (; it && it.index() < i; ++it) {
+              tempVector.coeffRef(it.index()) = numext::madd<Scalar>(-ci, it.value(), tempVector.coeffRef(it.index()));
+            }
           }
         }
       }
 
-
-      int count = 0;
+      //       Index count = 0;
       // FIXME compute a reference value to filter zeros
-      for (typename AmbiVector<Scalar,Index>::Iterator it(tempVector/*,1e-12*/); it; ++it)
-      {
-        ++ count;
-//         std::cerr << "fill " << it.index() << ", " << col << "\n";
-//         std::cout << it.value() << "  ";
+      for (typename AmbiVector<Scalar, StorageIndex>::Iterator it(tempVector /*,1e-12*/); it; ++it) {
+        //         ++ count;
+        //         std::cerr << "fill " << it.index() << ", " << col << "\n";
+        //         std::cout << it.value() << "  ";
         // FIXME use insertBack
         res.insert(it.index(), col) = it.value();
       }
-//       std::cout << "tempVector.nonZeros() == " << int(count) << " / " << (other.rows()) << "\n";
+      //       std::cout << "tempVector.nonZeros() == " << int(count) << " / " << (other.rows()) << "\n";
     }
     res.finalize();
     other = res.markAsRValue();
   }
 };
 
-} // end namespace internal
+}  // end namespace internal
 
-template<typename ExpressionType,int Mode>
-template<typename OtherDerived>
-void SparseTriangularView<ExpressionType,Mode>::solveInPlace(SparseMatrixBase<OtherDerived>& other) const
-{
-  eigen_assert(m_matrix.cols() == m_matrix.rows() && m_matrix.cols() == other.rows());
-  eigen_assert( (!(Mode & ZeroDiag)) && bool(Mode & (Upper|Lower)));
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template <typename ExpressionType, unsigned int Mode>
+template <typename OtherDerived>
+void TriangularViewImpl<ExpressionType, Mode, Sparse>::solveInPlace(SparseMatrixBase<OtherDerived>& other) const {
+  eigen_assert(derived().cols() == derived().rows() && derived().cols() == other.rows());
+  eigen_assert((!(Mode & ZeroDiag)) && bool(Mode & (Upper | Lower)));
 
-//   enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit };
+  //   enum { copy = internal::traits<OtherDerived>::Flags & RowMajorBit };
 
-//   typedef typename internal::conditional<copy,
-//     typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&>::type OtherCopy;
-//   OtherCopy otherCopy(other.derived());
+  //   typedef std::conditional_t<copy,
+  //     typename internal::plain_matrix_type_column_major<OtherDerived>::type, OtherDerived&> OtherCopy;
+  //   OtherCopy otherCopy(other.derived());
 
-  internal::sparse_solve_triangular_sparse_selector<ExpressionType, OtherDerived, Mode>::run(m_matrix, other.derived());
-
-//   if (copy)
-//     other = otherCopy;
-}
-
-#ifdef EIGEN2_SUPPORT
-
-// deprecated stuff:
-
-/** \deprecated */
-template<typename Derived>
-template<typename OtherDerived>
-void SparseMatrixBase<Derived>::solveTriangularInPlace(MatrixBase<OtherDerived>& other) const
-{
-  this->template triangular<Flags&(Upper|Lower)>().solveInPlace(other);
-}
+  internal::sparse_solve_triangular_sparse_selector<ExpressionType, OtherDerived, Mode>::run(
+      derived().nestedExpression(), other.derived());
 
-/** \deprecated */
-template<typename Derived>
-template<typename OtherDerived>
-typename internal::plain_matrix_type_column_major<OtherDerived>::type
-SparseMatrixBase<Derived>::solveTriangular(const MatrixBase<OtherDerived>& other) const
-{
-  typename internal::plain_matrix_type_column_major<OtherDerived>::type res(other);
-  derived().solveTriangularInPlace(res);
-  return res;
+  //   if (copy)
+  //     other = otherCopy;
 }
-#endif // EIGEN2_SUPPORT
+#endif
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSETRIANGULARSOLVER_H
+#endif  // EIGEN_SPARSETRIANGULARSOLVER_H
diff --git a/inst/include/Eigen/src/SparseLU/InternalHeaderCheck.h b/inst/include/Eigen/src/SparseLU/InternalHeaderCheck.h
new file mode 100644
index 00000000..78ebfcc0
--- /dev/null
+++ b/inst/include/Eigen/src/SparseLU/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_SPARSELU_MODULE_H
+#error "Please include Eigen/SparseLU instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/SparseLU/SparseLU.h b/inst/include/Eigen/src/SparseLU/SparseLU.h
index bdc4f193..cc69a42d 100644
--- a/inst/include/Eigen/src/SparseLU/SparseLU.h
+++ b/inst/include/Eigen/src/SparseLU/SparseLU.h
@@ -2,805 +2,968 @@
 // for linear algebra.
 //
 // Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
-// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2012-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-
 #ifndef EIGEN_SPARSE_LU_H
 #define EIGEN_SPARSE_LU_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
-template <typename _MatrixType, typename _OrderingType = COLAMDOrdering<typename _MatrixType::Index> > class SparseLU;
-template <typename MappedSparseMatrixType> struct SparseLUMatrixLReturnType;
-template <typename MatrixLType, typename MatrixUType> struct SparseLUMatrixUReturnType;
+template <typename MatrixType_, typename OrderingType_ = COLAMDOrdering<typename MatrixType_::StorageIndex>>
+class SparseLU;
+template <typename MappedSparseMatrixType>
+struct SparseLUMatrixLReturnType;
+template <typename MatrixLType, typename MatrixUType>
+struct SparseLUMatrixUReturnType;
 
-/** \ingroup SparseLU_Module
-  * \class SparseLU
-  * 
-  * \brief Sparse supernodal LU factorization for general matrices
-  * 
-  * This class implements the supernodal LU factorization for general matrices.
-  * It uses the main techniques from the sequential SuperLU package 
-  * (http://crd-legacy.lbl.gov/~xiaoye/SuperLU/). It handles transparently real 
-  * and complex arithmetics with single and double precision, depending on the 
-  * scalar type of your input matrix. 
-  * The code has been optimized to provide BLAS-3 operations during supernode-panel updates. 
-  * It benefits directly from the built-in high-performant Eigen BLAS routines. 
-  * Moreover, when the size of a supernode is very small, the BLAS calls are avoided to 
-  * enable a better optimization from the compiler. For best performance, 
-  * you should compile it with NDEBUG flag to avoid the numerous bounds checking on vectors. 
-  * 
-  * An important parameter of this class is the ordering method. It is used to reorder the columns 
-  * (and eventually the rows) of the matrix to reduce the number of new elements that are created during 
-  * numerical factorization. The cheapest method available is COLAMD. 
-  * See  \link OrderingMethods_Module the OrderingMethods module \endlink for the list of 
-  * built-in and external ordering methods. 
-  *
-  * Simple example with key steps 
-  * \code
-  * VectorXd x(n), b(n);
-  * SparseMatrix<double, ColMajor> A;
-  * SparseLU<SparseMatrix<scalar, ColMajor>, COLAMDOrdering<Index> >   solver;
-  * // fill A and b;
-  * // Compute the ordering permutation vector from the structural pattern of A
-  * solver.analyzePattern(A); 
-  * // Compute the numerical factorization 
-  * solver.factorize(A); 
-  * //Use the factors to solve the linear system 
-  * x = solver.solve(b); 
-  * \endcode
-  * 
-  * \warning The input matrix A should be in a \b compressed and \b column-major form.
-  * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix.
-  * 
-  * \note Unlike the initial SuperLU implementation, there is no step to equilibrate the matrix. 
-  * For badly scaled matrices, this step can be useful to reduce the pivoting during factorization. 
-  * If this is the case for your matrices, you can try the basic scaling method at
-  *  "unsupported/Eigen/src/IterativeSolvers/Scaling.h"
-  * 
-  * \tparam _MatrixType The type of the sparse matrix. It must be a column-major SparseMatrix<>
-  * \tparam _OrderingType The ordering method to use, either AMD, COLAMD or METIS. Default is COLMAD
-  * 
-  * 
-  * \sa \ref TutorialSparseDirectSolvers
-  * \sa \ref OrderingMethods_Module
-  */
-template <typename _MatrixType, typename _OrderingType>
-class SparseLU : public internal::SparseLUImpl<typename _MatrixType::Scalar, typename _MatrixType::Index>
-{
-  public:
-    typedef _MatrixType MatrixType; 
-    typedef _OrderingType OrderingType;
-    typedef typename MatrixType::Scalar Scalar; 
-    typedef typename MatrixType::RealScalar RealScalar; 
-    typedef typename MatrixType::Index Index; 
-    typedef SparseMatrix<Scalar,ColMajor,Index> NCMatrix;
-    typedef internal::MappedSuperNodalMatrix<Scalar, Index> SCMatrix; 
-    typedef Matrix<Scalar,Dynamic,1> ScalarVector;
-    typedef Matrix<Index,Dynamic,1> IndexVector;
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
-    typedef internal::SparseLUImpl<Scalar, Index> Base;
-    
-  public:
-    SparseLU():m_isInitialized(true),m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1)
-    {
-      initperfvalues(); 
-    }
-    SparseLU(const MatrixType& matrix):m_isInitialized(true),m_lastError(""),m_Ustore(0,0,0,0,0,0),m_symmetricmode(false),m_diagpivotthresh(1.0),m_detPermR(1)
-    {
-      initperfvalues(); 
-      compute(matrix);
-    }
-    
-    ~SparseLU()
-    {
-      // Free all explicit dynamic pointers 
-    }
-    
-    void analyzePattern (const MatrixType& matrix);
-    void factorize (const MatrixType& matrix);
-    void simplicialfactorize(const MatrixType& matrix);
-    
-    /**
-      * Compute the symbolic and numeric factorization of the input sparse matrix.
-      * The input matrix should be in column-major storage. 
-      */
-    void compute (const MatrixType& matrix)
-    {
-      // Analyze 
-      analyzePattern(matrix); 
-      //Factorize
-      factorize(matrix);
-    } 
-    
-    inline Index rows() const { return m_mat.rows(); }
-    inline Index cols() const { return m_mat.cols(); }
-    /** Indicate that the pattern of the input matrix is symmetric */
-    void isSymmetric(bool sym)
-    {
-      m_symmetricmode = sym;
-    }
-    
-    /** \returns an expression of the matrix L, internally stored as supernodes
-      * The only operation available with this expression is the triangular solve
-      * \code
-      * y = b; matrixL().solveInPlace(y);
-      * \endcode
-      */
-    SparseLUMatrixLReturnType<SCMatrix> matrixL() const
-    {
-      return SparseLUMatrixLReturnType<SCMatrix>(m_Lstore);
-    }
-    /** \returns an expression of the matrix U,
-      * The only operation available with this expression is the triangular solve
-      * \code
-      * y = b; matrixU().solveInPlace(y);
-      * \endcode
-      */
-    SparseLUMatrixUReturnType<SCMatrix,MappedSparseMatrix<Scalar,ColMajor,Index> > matrixU() const
-    {
-      return SparseLUMatrixUReturnType<SCMatrix, MappedSparseMatrix<Scalar,ColMajor,Index> >(m_Lstore, m_Ustore);
-    }
+template <bool Conjugate, class SparseLUType>
+class SparseLUTransposeView : public SparseSolverBase<SparseLUTransposeView<Conjugate, SparseLUType>> {
+ protected:
+  typedef SparseSolverBase<SparseLUTransposeView<Conjugate, SparseLUType>> APIBase;
+  using APIBase::m_isInitialized;
 
-    /**
-      * \returns a reference to the row matrix permutation \f$ P_r \f$ such that \f$P_r A P_c^T = L U\f$
-      * \sa colsPermutation()
-      */
-    inline const PermutationType& rowsPermutation() const
-    {
-      return m_perm_r;
-    }
-    /**
-      * \returns a reference to the column matrix permutation\f$ P_c^T \f$ such that \f$P_r A P_c^T = L U\f$
-      * \sa rowsPermutation()
-      */
-    inline const PermutationType& colsPermutation() const
-    {
-      return m_perm_c;
-    }
-    /** Set the threshold used for a diagonal entry to be an acceptable pivot. */
-    void setPivotThreshold(const RealScalar& thresh)
-    {
-      m_diagpivotthresh = thresh; 
-    }
+ public:
+  typedef typename SparseLUType::Scalar Scalar;
+  typedef typename SparseLUType::StorageIndex StorageIndex;
+  typedef typename SparseLUType::MatrixType MatrixType;
+  typedef typename SparseLUType::OrderingType OrderingType;
 
-    /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
-      *
-      * \warning the destination matrix X in X = this->solve(B) must be colmun-major.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<SparseLU, Rhs> solve(const MatrixBase<Rhs>& B) const 
-    {
-      eigen_assert(m_factorizationIsOk && "SparseLU is not initialized."); 
-      eigen_assert(rows()==B.rows()
-                    && "SparseLU::solve(): invalid number of rows of the right hand side matrix B");
-          return internal::solve_retval<SparseLU, Rhs>(*this, B.derived());
-    }
+  enum { ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime };
 
-    /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<SparseLU, Rhs> solve(const SparseMatrixBase<Rhs>& B) const 
-    {
-      eigen_assert(m_factorizationIsOk && "SparseLU is not initialized."); 
-      eigen_assert(rows()==B.rows()
-                    && "SparseLU::solve(): invalid number of rows of the right hand side matrix B");
-          return internal::sparse_solve_retval<SparseLU, Rhs>(*this, B.derived());
-    }
-    
-    /** \brief Reports whether previous computation was successful.
-      *
-      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the LU factorization reports a problem, zero diagonal for instance
-      *          \c InvalidInput if the input matrix is invalid
-      *
-      * \sa iparm()          
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
-      return m_info;
-    }
-    
-    /**
-      * \returns A string describing the type of error
-      */
-    std::string lastErrorMessage() const
-    {
-      return m_lastError; 
-    }
+  SparseLUTransposeView() : APIBase(), m_sparseLU(NULL) {}
+  SparseLUTransposeView(const SparseLUTransposeView& view) : APIBase() {
+    this->m_sparseLU = view.m_sparseLU;
+    this->m_isInitialized = view.m_isInitialized;
+  }
+  void setIsInitialized(const bool isInitialized) { this->m_isInitialized = isInitialized; }
+  void setSparseLU(SparseLUType* sparseLU) { m_sparseLU = sparseLU; }
+  using APIBase::_solve_impl;
+  template <typename Rhs, typename Dest>
+  bool _solve_impl(const MatrixBase<Rhs>& B, MatrixBase<Dest>& X_base) const {
+    Dest& X(X_base.derived());
+    eigen_assert(m_sparseLU->info() == Success && "The matrix should be factorized first");
+    EIGEN_STATIC_ASSERT((Dest::Flags & RowMajorBit) == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
 
-    template<typename Rhs, typename Dest>
-    bool _solve(const MatrixBase<Rhs> &B, MatrixBase<Dest> &X_base) const
-    {
-      Dest& X(X_base.derived());
-      eigen_assert(m_factorizationIsOk && "The matrix should be factorized first");
-      EIGEN_STATIC_ASSERT((Dest::Flags&RowMajorBit)==0,
-                        THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
-      
-      // Permute the right hand side to form X = Pr*B
-      // on return, X is overwritten by the computed solution
-      X.resize(B.rows(),B.cols());
-
-      // this ugly const_cast_derived() helps to detect aliasing when applying the permutations
-      for(Index j = 0; j < B.cols(); ++j)
-        X.col(j) = rowsPermutation() * B.const_cast_derived().col(j);
-      
-      //Forward substitution with L
-      this->matrixL().solveInPlace(X);
-      this->matrixU().solveInPlace(X);
-      
-      // Permute back the solution 
-      for (Index j = 0; j < B.cols(); ++j)
-        X.col(j) = colsPermutation().inverse() * X.col(j);
-      
-      return true; 
+    // this ugly const_cast_derived() helps to detect aliasing when applying the permutations
+    for (Index j = 0; j < B.cols(); ++j) {
+      X.col(j) = m_sparseLU->colsPermutation() * B.const_cast_derived().col(j);
     }
-    
-    /**
-      * \returns the absolute value of the determinant of the matrix of which
-      * *this is the QR decomposition.
-      *
-      * \warning a determinant can be very big or small, so for matrices
-      * of large enough dimension, there is a risk of overflow/underflow.
-      * One way to work around that is to use logAbsDeterminant() instead.
-      *
-      * \sa logAbsDeterminant(), signDeterminant()
-      */
-     Scalar absDeterminant()
-    {
-      eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
-      // Initialize with the determinant of the row matrix
-      Scalar det = Scalar(1.);
-      // Note that the diagonal blocks of U are stored in supernodes,
-      // which are available in the  L part :)
-      for (Index j = 0; j < this->cols(); ++j)
-      {
-        for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it)
-        {
-          if(it.index() == j)
-          {
-            using std::abs;
-            det *= abs(it.value());
-            break;
-          }
+    // Forward substitution with transposed or adjoint of U
+    m_sparseLU->matrixU().template solveTransposedInPlace<Conjugate>(X);
+
+    // Backward substitution with transposed or adjoint of L
+    m_sparseLU->matrixL().template solveTransposedInPlace<Conjugate>(X);
+
+    // Permute back the solution
+    for (Index j = 0; j < B.cols(); ++j) X.col(j) = m_sparseLU->rowsPermutation().transpose() * X.col(j);
+    return true;
+  }
+  inline Index rows() const { return m_sparseLU->rows(); }
+  inline Index cols() const { return m_sparseLU->cols(); }
+
+ private:
+  SparseLUType* m_sparseLU;
+  SparseLUTransposeView& operator=(const SparseLUTransposeView&);
+};
+
+/** \ingroup SparseLU_Module
+ * \class SparseLU
+ *
+ * \brief Sparse supernodal LU factorization for general matrices
+ *
+ * This class implements the supernodal LU factorization for general matrices.
+ * It uses the main techniques from the sequential SuperLU package
+ * (http://crd-legacy.lbl.gov/~xiaoye/SuperLU/). It handles transparently real
+ * and complex arithmetic with single and double precision, depending on the
+ * scalar type of your input matrix.
+ * The code has been optimized to provide BLAS-3 operations during supernode-panel updates.
+ * It benefits directly from the built-in high-performant Eigen BLAS routines.
+ * Moreover, when the size of a supernode is very small, the BLAS calls are avoided to
+ * enable a better optimization from the compiler. For best performance,
+ * you should compile it with NDEBUG flag to avoid the numerous bounds checking on vectors.
+ *
+ * An important parameter of this class is the ordering method. It is used to reorder the columns
+ * (and eventually the rows) of the matrix to reduce the number of new elements that are created during
+ * numerical factorization. The cheapest method available is COLAMD.
+ * See  \link OrderingMethods_Module the OrderingMethods module \endlink for the list of
+ * built-in and external ordering methods.
+ *
+ * Simple example with key steps
+ * \code
+ * VectorXd x(n), b(n);
+ * SparseMatrix<double> A;
+ * SparseLU<SparseMatrix<double>, COLAMDOrdering<int> > solver;
+ * // Fill A and b.
+ * // Compute the ordering permutation vector from the structural pattern of A.
+ * solver.analyzePattern(A);
+ * // Compute the numerical factorization.
+ * solver.factorize(A);
+ * // Use the factors to solve the linear system.
+ * x = solver.solve(b);
+ * \endcode
+ *
+ * We can directly call compute() instead of analyzePattern() and factorize()
+ * \code
+ * VectorXd x(n), b(n);
+ * SparseMatrix<double> A;
+ * SparseLU<SparseMatrix<double>, COLAMDOrdering<int> > solver;
+ * // Fill A and b.
+ * solver.compute(A);
+ * // Use the factors to solve the linear system.
+ * x = solver.solve(b);
+ * \endcode
+ *
+ * Or give the matrix to the constructor SparseLU(const MatrixType& matrix)
+ * \code
+ * VectorXd x(n), b(n);
+ * SparseMatrix<double> A;
+ * // Fill A and b.
+ * SparseLU<SparseMatrix<double>, COLAMDOrdering<int> > solver(A);
+ * // Use the factors to solve the linear system.
+ * x = solver.solve(b);
+ * \endcode
+ *
+ * \warning The input matrix A should be in a \b compressed and \b column-major form.
+ * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix.
+ *
+ * \note Unlike the initial SuperLU implementation, there is no step to equilibrate the matrix.
+ * For badly scaled matrices, this step can be useful to reduce the pivoting during factorization.
+ * If this is the case for your matrices, you can try the basic scaling method at
+ *  "unsupported/Eigen/src/IterativeSolvers/Scaling.h"
+ *
+ * \tparam MatrixType_ The type of the sparse matrix. It must be a column-major SparseMatrix<>
+ * \tparam OrderingType_ The ordering method to use, either AMD, COLAMD or METIS. Default is COLMAD
+ *
+ * \implsparsesolverconcept
+ *
+ * \sa \ref TutorialSparseSolverConcept
+ * \sa \ref OrderingMethods_Module
+ */
+template <typename MatrixType_, typename OrderingType_>
+class SparseLU : public SparseSolverBase<SparseLU<MatrixType_, OrderingType_>>,
+                 public internal::SparseLUImpl<typename MatrixType_::Scalar, typename MatrixType_::StorageIndex> {
+ protected:
+  typedef SparseSolverBase<SparseLU<MatrixType_, OrderingType_>> APIBase;
+  using APIBase::m_isInitialized;
+
+ public:
+  using APIBase::_solve_impl;
+
+  typedef MatrixType_ MatrixType;
+  typedef OrderingType_ OrderingType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex> NCMatrix;
+  typedef internal::MappedSuperNodalMatrix<Scalar, StorageIndex> SCMatrix;
+  typedef Matrix<Scalar, Dynamic, 1> ScalarVector;
+  typedef Matrix<StorageIndex, Dynamic, 1> IndexVector;
+  typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType;
+  typedef internal::SparseLUImpl<Scalar, StorageIndex> Base;
+
+  enum { ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime };
+
+ public:
+  /** \brief Basic constructor of the solver.
+   *
+   * Construct a SparseLU. As no matrix is given as argument, compute() should be called afterward with a matrix.
+   */
+  SparseLU()
+      : m_lastError(""), m_Ustore(0, 0, 0, 0, 0, 0), m_symmetricmode(false), m_diagpivotthresh(1.0), m_detPermR(1) {
+    initperfvalues();
+  }
+  /** \brief Constructor of the solver already based on a specific matrix.
+   *
+   * Construct a SparseLU. compute() is already called with the given matrix.
+   */
+  explicit SparseLU(const MatrixType& matrix)
+      : m_lastError(""), m_Ustore(0, 0, 0, 0, 0, 0), m_symmetricmode(false), m_diagpivotthresh(1.0), m_detPermR(1) {
+    initperfvalues();
+    compute(matrix);
+  }
+
+  ~SparseLU() {
+    // Free all explicit dynamic pointers
+  }
+
+  void analyzePattern(const MatrixType& matrix);
+  void factorize(const MatrixType& matrix);
+  void simplicialfactorize(const MatrixType& matrix);
+
+  /** \brief Analyze and factorize the matrix so the solver is ready to solve.
+   *
+   * Compute the symbolic and numeric factorization of the input sparse matrix.
+   * The input matrix should be in column-major storage, otherwise analyzePattern()
+   * will do a heavy copy.
+   *
+   * Call analyzePattern() followed by factorize()
+   *
+   * \sa analyzePattern(), factorize()
+   */
+  void compute(const MatrixType& matrix) {
+    // Analyze
+    analyzePattern(matrix);
+    // Factorize
+    factorize(matrix);
+  }
+
+  /** \brief Return a solver for the transposed matrix.
+   *
+   * \returns an expression of the transposed of the factored matrix.
+   *
+   * A typical usage is to solve for the transposed problem A^T x = b:
+   * \code
+   * solver.compute(A);
+   * x = solver.transpose().solve(b);
+   * \endcode
+   *
+   * \sa adjoint(), solve()
+   */
+  const SparseLUTransposeView<false, SparseLU<MatrixType_, OrderingType_>> transpose() {
+    SparseLUTransposeView<false, SparseLU<MatrixType_, OrderingType_>> transposeView;
+    transposeView.setSparseLU(this);
+    transposeView.setIsInitialized(this->m_isInitialized);
+    return transposeView;
+  }
+
+  /** \brief Return a solver for the adjointed matrix.
+   *
+   * \returns an expression of the adjoint of the factored matrix
+   *
+   * A typical usage is to solve for the adjoint problem A' x = b:
+   * \code
+   * solver.compute(A);
+   * x = solver.adjoint().solve(b);
+   * \endcode
+   *
+   * For real scalar types, this function is equivalent to transpose().
+   *
+   * \sa transpose(), solve()
+   */
+  const SparseLUTransposeView<true, SparseLU<MatrixType_, OrderingType_>> adjoint() {
+    SparseLUTransposeView<true, SparseLU<MatrixType_, OrderingType_>> adjointView;
+    adjointView.setSparseLU(this);
+    adjointView.setIsInitialized(this->m_isInitialized);
+    return adjointView;
+  }
+
+  /** \brief Give the number of rows.
+   */
+  inline Index rows() const { return m_mat.rows(); }
+  /** \brief Give the number of columns.
+   */
+  inline Index cols() const { return m_mat.cols(); }
+  /** \brief Let you set that the pattern of the input matrix is symmetric
+   */
+  void isSymmetric(bool sym) { m_symmetricmode = sym; }
+
+  /** \brief Give the matrixL
+   *
+   * \returns an expression of the matrix L, internally stored as supernodes
+   * The only operation available with this expression is the triangular solve
+   * \code
+   * y = b; matrixL().solveInPlace(y);
+   * \endcode
+   */
+  SparseLUMatrixLReturnType<SCMatrix> matrixL() const { return SparseLUMatrixLReturnType<SCMatrix>(m_Lstore); }
+  /** \brief Give the MatrixU
+   *
+   * \returns an expression of the matrix U,
+   * The only operation available with this expression is the triangular solve
+   * \code
+   * y = b; matrixU().solveInPlace(y);
+   * \endcode
+   */
+  SparseLUMatrixUReturnType<SCMatrix, Map<SparseMatrix<Scalar, ColMajor, StorageIndex>>> matrixU() const {
+    return SparseLUMatrixUReturnType<SCMatrix, Map<SparseMatrix<Scalar, ColMajor, StorageIndex>>>(m_Lstore, m_Ustore);
+  }
+
+  /** \brief Give the row matrix permutation.
+   *
+   * \returns a reference to the row matrix permutation \f$ P_r \f$ such that \f$P_r A P_c^T = L U\f$
+   * \sa colsPermutation()
+   */
+  inline const PermutationType& rowsPermutation() const { return m_perm_r; }
+  /** \brief Give the column matrix permutation.
+   *
+   * \returns a reference to the column matrix permutation\f$ P_c^T \f$ such that \f$P_r A P_c^T = L U\f$
+   * \sa rowsPermutation()
+   */
+  inline const PermutationType& colsPermutation() const { return m_perm_c; }
+  /** Set the threshold used for a diagonal entry to be an acceptable pivot. */
+  void setPivotThreshold(const RealScalar& thresh) { m_diagpivotthresh = thresh; }
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+  /** \brief Solve a system \f$ A X = B \f$
+   *
+   * \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
+   *
+   * \warning the destination matrix X in X = this->solve(B) must be colmun-major.
+   *
+   * \sa compute()
+   */
+  template <typename Rhs>
+  inline const Solve<SparseLU, Rhs> solve(const MatrixBase<Rhs>& B) const;
+#endif  // EIGEN_PARSED_BY_DOXYGEN
+
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful,
+   *          \c NumericalIssue if the LU factorization reports a problem, zero diagonal for instance
+   *          \c InvalidInput if the input matrix is invalid
+   *
+   * You can get a readable error message with lastErrorMessage().
+   *
+   * \sa lastErrorMessage()
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+    return m_info;
+  }
+
+  /** \brief Give a human readable error
+   *
+   * \returns A string describing the type of error
+   */
+  std::string lastErrorMessage() const { return m_lastError; }
+
+  template <typename Rhs, typename Dest>
+  bool _solve_impl(const MatrixBase<Rhs>& B, MatrixBase<Dest>& X_base) const {
+    Dest& X(X_base.derived());
+    eigen_assert(m_factorizationIsOk && "The matrix should be factorized first");
+    EIGEN_STATIC_ASSERT((Dest::Flags & RowMajorBit) == 0, THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES);
+
+    // Permute the right hand side to form X = Pr*B
+    // on return, X is overwritten by the computed solution
+    X.resize(B.rows(), B.cols());
+
+    // this ugly const_cast_derived() helps to detect aliasing when applying the permutations
+    for (Index j = 0; j < B.cols(); ++j) X.col(j) = rowsPermutation() * B.const_cast_derived().col(j);
+
+    // Forward substitution with L
+    this->matrixL().solveInPlace(X);
+    this->matrixU().solveInPlace(X);
+
+    // Permute back the solution
+    for (Index j = 0; j < B.cols(); ++j) X.col(j) = colsPermutation().inverse() * X.col(j);
+
+    return true;
+  }
+
+  /** \brief Give the absolute value of the determinant.
+   *
+   * \returns the absolute value of the determinant of the matrix of which
+   * *this is the QR decomposition.
+   *
+   * \warning a determinant can be very big or small, so for matrices
+   * of large enough dimension, there is a risk of overflow/underflow.
+   * One way to work around that is to use logAbsDeterminant() instead.
+   *
+   * \sa logAbsDeterminant(), signDeterminant()
+   */
+  Scalar absDeterminant() {
+    using std::abs;
+    eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
+    // Initialize with the determinant of the row matrix
+    Scalar det = Scalar(1.);
+    // Note that the diagonal blocks of U are stored in supernodes,
+    // which are available in the  L part :)
+    for (Index j = 0; j < this->cols(); ++j) {
+      for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it) {
+        if (it.index() == j) {
+          det *= abs(it.value());
+          break;
         }
-       }
-       return det;
-     }
-
-     /** \returns the natural log of the absolute value of the determinant of the matrix
-       * of which **this is the QR decomposition
-       *
-       * \note This method is useful to work around the risk of overflow/underflow that's
-       * inherent to the determinant computation.
-       *
-       * \sa absDeterminant(), signDeterminant()
-       */
-     Scalar logAbsDeterminant() const
-     {
-       eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
-       Scalar det = Scalar(0.);
-       for (Index j = 0; j < this->cols(); ++j)
-       {
-         for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it)
-         {
-           if(it.row() < j) continue;
-           if(it.row() == j)
-           {
-             using std::log; using std::abs;
-             det += log(abs(it.value()));
-             break;
-           }
-         }
-       }
-       return det;
-     }
-
-    /** \returns A number representing the sign of the determinant
-      *
-      * \sa absDeterminant(), logAbsDeterminant()
-      */
-    Scalar signDeterminant()
-    {
-      eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
-      // Initialize with the determinant of the row matrix
-      Index det = 1;
-      // Note that the diagonal blocks of U are stored in supernodes,
-      // which are available in the  L part :)
-      for (Index j = 0; j < this->cols(); ++j)
-      {
-        for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it)
-        {
-          if(it.index() == j)
-          {
-            if(it.value()<0)
-              det = -det;
-            else if(it.value()==0)
-              return 0;
-            break;
-          }
+      }
+    }
+    return det;
+  }
+
+  /** \brief Give the natural log of the absolute determinant.
+   *
+   * \returns the natural log of the absolute value of the determinant of the matrix
+   * of which **this is the QR decomposition
+   *
+   * \note This method is useful to work around the risk of overflow/underflow that's
+   * inherent to the determinant computation.
+   *
+   * \sa absDeterminant(), signDeterminant()
+   */
+  Scalar logAbsDeterminant() const {
+    using std::abs;
+    using std::log;
+
+    eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
+    Scalar det = Scalar(0.);
+    for (Index j = 0; j < this->cols(); ++j) {
+      for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it) {
+        if (it.row() < j) continue;
+        if (it.row() == j) {
+          det += log(abs(it.value()));
+          break;
         }
       }
-      return det * m_detPermR * m_detPermC;
     }
-    
-    /** \returns The determinant of the matrix.
-      *
-      * \sa absDeterminant(), logAbsDeterminant()
-      */
-    Scalar determinant()
-    {
-      eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
-      // Initialize with the determinant of the row matrix
-      Scalar det = Scalar(1.);
-      // Note that the diagonal blocks of U are stored in supernodes,
-      // which are available in the  L part :)
-      for (Index j = 0; j < this->cols(); ++j)
-      {
-        for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it)
-        {
-          if(it.index() == j)
-          {
-            det *= it.value();
-            break;
-          }
+    return det;
+  }
+
+  /** \brief Give the sign of the determinant.
+   *
+   * \returns A number representing the sign of the determinant
+   *
+   * \sa absDeterminant(), logAbsDeterminant()
+   */
+  Scalar signDeterminant() {
+    eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
+    // Initialize with the determinant of the row matrix
+    Index det = 1;
+    // Note that the diagonal blocks of U are stored in supernodes,
+    // which are available in the  L part :)
+    for (Index j = 0; j < this->cols(); ++j) {
+      for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it) {
+        if (it.index() == j) {
+          if (it.value() < 0)
+            det = -det;
+          else if (it.value() == 0)
+            return 0;
+          break;
         }
       }
-      return det * Scalar(m_detPermR * m_detPermC);
     }
+    return det * m_detPermR * m_detPermC;
+  }
 
-  protected:
-    // Functions 
-    void initperfvalues()
-    {
-      m_perfv.panel_size = 16;
-      m_perfv.relax = 1; 
-      m_perfv.maxsuper = 128; 
-      m_perfv.rowblk = 16; 
-      m_perfv.colblk = 8; 
-      m_perfv.fillfactor = 20;  
+  /** \brief Give the determinant.
+   *
+   * \returns The determinant of the matrix.
+   *
+   * \sa absDeterminant(), logAbsDeterminant()
+   */
+  Scalar determinant() {
+    eigen_assert(m_factorizationIsOk && "The matrix should be factorized first.");
+    // Initialize with the determinant of the row matrix
+    Scalar det = Scalar(1.);
+    // Note that the diagonal blocks of U are stored in supernodes,
+    // which are available in the  L part :)
+    for (Index j = 0; j < this->cols(); ++j) {
+      for (typename SCMatrix::InnerIterator it(m_Lstore, j); it; ++it) {
+        if (it.index() == j) {
+          det *= it.value();
+          break;
+        }
+      }
     }
-      
-    // Variables 
-    mutable ComputationInfo m_info;
-    bool m_isInitialized;
-    bool m_factorizationIsOk;
-    bool m_analysisIsOk;
-    std::string m_lastError;
-    NCMatrix m_mat; // The input (permuted ) matrix 
-    SCMatrix m_Lstore; // The lower triangular matrix (supernodal)
-    MappedSparseMatrix<Scalar,ColMajor,Index> m_Ustore; // The upper triangular matrix
-    PermutationType m_perm_c; // Column permutation 
-    PermutationType m_perm_r ; // Row permutation
-    IndexVector m_etree; // Column elimination tree 
-    
-    typename Base::GlobalLU_t m_glu; 
-                               
-    // SparseLU options 
-    bool m_symmetricmode;
-    // values for performance 
-    internal::perfvalues<Index> m_perfv; 
-    RealScalar m_diagpivotthresh; // Specifies the threshold used for a diagonal entry to be an acceptable pivot
-    Index m_nnzL, m_nnzU; // Nonzeros in L and U factors
-    Index m_detPermR, m_detPermC; // Determinants of the permutation matrices
-  private:
-    // Disable copy constructor 
-    SparseLU (const SparseLU& );
-  
-}; // End class SparseLU
+    return (m_detPermR * m_detPermC) > 0 ? det : -det;
+  }
+
+  /** \brief Give the number of non zero in matrix L.
+   */
+  Index nnzL() const { return m_nnzL; }
+  /** \brief Give the number of non zero in matrix U.
+   */
+  Index nnzU() const { return m_nnzU; }
+
+ protected:
+  // Functions
+  void initperfvalues() {
+    m_perfv.panel_size = 16;
+    m_perfv.relax = 1;
+    m_perfv.maxsuper = 128;
+    m_perfv.rowblk = 16;
+    m_perfv.colblk = 8;
+    m_perfv.fillfactor = 20;
+  }
 
+  // Variables
+  mutable ComputationInfo m_info;
+  bool m_factorizationIsOk;
+  bool m_analysisIsOk;
+  std::string m_lastError;
+  NCMatrix m_mat;                                              // The input (permuted ) matrix
+  SCMatrix m_Lstore;                                           // The lower triangular matrix (supernodal)
+  Map<SparseMatrix<Scalar, ColMajor, StorageIndex>> m_Ustore;  // The upper triangular matrix
+  PermutationType m_perm_c;                                    // Column permutation
+  PermutationType m_perm_r;                                    // Row permutation
+  IndexVector m_etree;                                         // Column elimination tree
 
+  typename Base::GlobalLU_t m_glu;
+
+  // SparseLU options
+  bool m_symmetricmode;
+  // values for performance
+  internal::perfvalues m_perfv;
+  RealScalar m_diagpivotthresh;  // Specifies the threshold used for a diagonal entry to be an acceptable pivot
+  Index m_nnzL, m_nnzU;          // Nonzeros in L and U factors
+  Index m_detPermR, m_detPermC;  // Determinants of the permutation matrices
+ private:
+  // Disable copy constructor
+  SparseLU(const SparseLU&);
+};  // End class SparseLU
 
 // Functions needed by the anaysis phase
-/** 
-  * Compute the column permutation to minimize the fill-in
-  * 
-  *  - Apply this permutation to the input matrix - 
-  * 
-  *  - Compute the column elimination tree on the permuted matrix 
-  * 
-  *  - Postorder the elimination tree and the column permutation
-  * 
-  */
+/** \brief Compute the column permutation.
+ *
+ * Compute the column permutation to minimize the fill-in
+ *
+ *  - Apply this permutation to the input matrix -
+ *
+ *  - Compute the column elimination tree on the permuted matrix
+ *
+ *  - Postorder the elimination tree and the column permutation
+ *
+ * It is possible to call compute() instead of analyzePattern() + factorize().
+ *
+ * If the matrix is row-major this function will do an heavy copy.
+ *
+ * \sa factorize(), compute()
+ */
 template <typename MatrixType, typename OrderingType>
-void SparseLU<MatrixType, OrderingType>::analyzePattern(const MatrixType& mat)
-{
-  
-  //TODO  It is possible as in SuperLU to compute row and columns scaling vectors to equilibrate the matrix mat.
-  
-  OrderingType ord; 
-  ord(mat,m_perm_c);
-  
-  // Apply the permutation to the column of the input  matrix
-  //First copy the whole input matrix. 
+void SparseLU<MatrixType, OrderingType>::analyzePattern(const MatrixType& mat) {
+  // TODO  It is possible as in SuperLU to compute row and columns scaling vectors to equilibrate the matrix mat.
+
+  // Firstly, copy the whole input matrix.
   m_mat = mat;
+
+  // Compute fill-in ordering
+  OrderingType ord;
+  ord(m_mat, m_perm_c);
+
+  // Apply the permutation to the column of the input  matrix
   if (m_perm_c.size()) {
-    m_mat.uncompress(); //NOTE: The effect of this command is only to create the InnerNonzeros pointers. FIXME : This vector is filled but not subsequently used.  
-    //Then, permute only the column pointers
-    const Index * outerIndexPtr;
-    if (mat.isCompressed()) outerIndexPtr = mat.outerIndexPtr();
-    else
-    {
-      Index *outerIndexPtr_t = new Index[mat.cols()+1];
-      for(Index i = 0; i <= mat.cols(); i++) outerIndexPtr_t[i] = m_mat.outerIndexPtr()[i];
-      outerIndexPtr = outerIndexPtr_t;
-    }
-    for (Index i = 0; i < mat.cols(); i++)
-    {
+    m_mat.uncompress();  // NOTE: The effect of this command is only to create the InnerNonzeros pointers. FIXME : This
+                         // vector is filled but not subsequently used.
+    // Then, permute only the column pointers
+    ei_declare_aligned_stack_constructed_variable(
+        StorageIndex, outerIndexPtr, mat.cols() + 1,
+        mat.isCompressed() ? const_cast<StorageIndex*>(mat.outerIndexPtr()) : 0);
+
+    // If the input matrix 'mat' is uncompressed, then the outer-indices do not match the ones of m_mat, and a copy is
+    // thus needed.
+    if (!mat.isCompressed())
+      IndexVector::Map(outerIndexPtr, mat.cols() + 1) = IndexVector::Map(m_mat.outerIndexPtr(), mat.cols() + 1);
+
+    // Apply the permutation and compute the nnz per column.
+    for (Index i = 0; i < mat.cols(); i++) {
       m_mat.outerIndexPtr()[m_perm_c.indices()(i)] = outerIndexPtr[i];
-      m_mat.innerNonZeroPtr()[m_perm_c.indices()(i)] = outerIndexPtr[i+1] - outerIndexPtr[i];
+      m_mat.innerNonZeroPtr()[m_perm_c.indices()(i)] = outerIndexPtr[i + 1] - outerIndexPtr[i];
     }
-    if(!mat.isCompressed()) delete[] outerIndexPtr;
   }
-  // Compute the column elimination tree of the permuted matrix 
+
+  // Compute the column elimination tree of the permuted matrix
   IndexVector firstRowElt;
-  internal::coletree(m_mat, m_etree,firstRowElt); 
-     
+  internal::coletree(m_mat, m_etree, firstRowElt);
+
   // In symmetric mode, do not do postorder here
   if (!m_symmetricmode) {
-    IndexVector post, iwork; 
+    IndexVector post, iwork;
     // Post order etree
-    internal::treePostorder(m_mat.cols(), m_etree, post); 
-      
-   
-    // Renumber etree in postorder 
-    Index m = m_mat.cols(); 
-    iwork.resize(m+1);
+    internal::treePostorder(StorageIndex(m_mat.cols()), m_etree, post);
+
+    // Renumber etree in postorder
+    Index m = m_mat.cols();
+    iwork.resize(m + 1);
     for (Index i = 0; i < m; ++i) iwork(post(i)) = post(m_etree(i));
     m_etree = iwork;
-    
+
     // Postmultiply A*Pc by post, i.e reorder the matrix according to the postorder of the etree
-    PermutationType post_perm(m); 
-    for (Index i = 0; i < m; i++) 
-      post_perm.indices()(i) = post(i); 
-        
+    PermutationType post_perm(m);
+    for (Index i = 0; i < m; i++) post_perm.indices()(i) = post(i);
+
     // Combine the two permutations : postorder the permutation for future use
-    if(m_perm_c.size()) {
+    if (m_perm_c.size()) {
       m_perm_c = post_perm * m_perm_c;
     }
-    
-  } // end postordering 
-  
-  m_analysisIsOk = true; 
+
+  }  // end postordering
+
+  m_analysisIsOk = true;
 }
 
 // Functions needed by the numerical factorization phase
 
-
-/** 
-  *  - Numerical factorization 
-  *  - Interleaved with the symbolic factorization 
-  * On exit,  info is 
-  * 
-  *    = 0: successful factorization
-  * 
-  *    > 0: if info = i, and i is
-  * 
-  *       <= A->ncol: U(i,i) is exactly zero. The factorization has
-  *          been completed, but the factor U is exactly singular,
-  *          and division by zero will occur if it is used to solve a
-  *          system of equations.
-  * 
-  *       > A->ncol: number of bytes allocated when memory allocation
-  *         failure occurred, plus A->ncol. If lwork = -1, it is
-  *         the estimated amount of space needed, plus A->ncol.  
-  */
+/** \brief Factorize the matrix to get the solver ready.
+ *
+ *  - Numerical factorization
+ *  - Interleaved with the symbolic factorization
+ *
+ * To get error of this function you should check info(), you can get more info of
+ * errors with lastErrorMessage().
+ *
+ * In the past (before 2012 (git history is not older)), this function was returning an integer.
+ * This exit was 0 if successful factorization.
+ * > 0 if info = i, and i is been completed, but the factor U is exactly singular,
+ * and division by zero will occur if it is used to solve a system of equation.
+ * > A->ncol: number of bytes allocated when memory allocation failure occurred, plus A->ncol.
+ * If lwork = -1, it is the estimated amount of space needed, plus A->ncol.
+ *
+ * It seems that A was the name of the matrix in the past.
+ *
+ * \sa analyzePattern(), compute(), SparseLU(), info(), lastErrorMessage()
+ */
 template <typename MatrixType, typename OrderingType>
-void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix)
-{
+void SparseLU<MatrixType, OrderingType>::factorize(const MatrixType& matrix) {
   using internal::emptyIdxLU;
-  eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); 
+  eigen_assert(m_analysisIsOk && "analyzePattern() should be called first");
   eigen_assert((matrix.rows() == matrix.cols()) && "Only for squared matrices");
-  
-  typedef typename IndexVector::Scalar Index; 
-  
-  
+
+  m_isInitialized = true;
+
   // Apply the column permutation computed in analyzepattern()
-  //   m_mat = matrix * m_perm_c.inverse(); 
+  //   m_mat = matrix * m_perm_c.inverse();
   m_mat = matrix;
-  if (m_perm_c.size()) 
-  {
-    m_mat.uncompress(); //NOTE: The effect of this command is only to create the InnerNonzeros pointers.
-    //Then, permute only the column pointers
-    const Index * outerIndexPtr;
-    if (matrix.isCompressed()) outerIndexPtr = matrix.outerIndexPtr();
-    else
-    {
-      Index* outerIndexPtr_t = new Index[matrix.cols()+1];
-      for(Index i = 0; i <= matrix.cols(); i++) outerIndexPtr_t[i] = m_mat.outerIndexPtr()[i];
+  if (m_perm_c.size()) {
+    m_mat.uncompress();  // NOTE: The effect of this command is only to create the InnerNonzeros pointers.
+    // Then, permute only the column pointers
+    const StorageIndex* outerIndexPtr;
+    if (matrix.isCompressed())
+      outerIndexPtr = matrix.outerIndexPtr();
+    else {
+      StorageIndex* outerIndexPtr_t = new StorageIndex[matrix.cols() + 1];
+      for (Index i = 0; i <= matrix.cols(); i++) outerIndexPtr_t[i] = m_mat.outerIndexPtr()[i];
       outerIndexPtr = outerIndexPtr_t;
     }
-    for (Index i = 0; i < matrix.cols(); i++)
-    {
+    for (Index i = 0; i < matrix.cols(); i++) {
       m_mat.outerIndexPtr()[m_perm_c.indices()(i)] = outerIndexPtr[i];
-      m_mat.innerNonZeroPtr()[m_perm_c.indices()(i)] = outerIndexPtr[i+1] - outerIndexPtr[i];
+      m_mat.innerNonZeroPtr()[m_perm_c.indices()(i)] = outerIndexPtr[i + 1] - outerIndexPtr[i];
     }
-    if(!matrix.isCompressed()) delete[] outerIndexPtr;
-  } 
-  else 
-  { //FIXME This should not be needed if the empty permutation is handled transparently
+    if (!matrix.isCompressed()) delete[] outerIndexPtr;
+  } else {  // FIXME This should not be needed if the empty permutation is handled transparently
     m_perm_c.resize(matrix.cols());
-    for(Index i = 0; i < matrix.cols(); ++i) m_perm_c.indices()(i) = i;
+    for (StorageIndex i = 0; i < matrix.cols(); ++i) m_perm_c.indices()(i) = i;
   }
-  
+
   Index m = m_mat.rows();
   Index n = m_mat.cols();
   Index nnz = m_mat.nonZeros();
   Index maxpanel = m_perfv.panel_size * m;
   // Allocate working storage common to the factor routines
   Index lwork = 0;
-  Index info = Base::memInit(m, n, nnz, lwork, m_perfv.fillfactor, m_perfv.panel_size, m_glu); 
-  if (info) 
-  {
-    m_lastError = "UNABLE TO ALLOCATE WORKING MEMORY\n\n" ;
+  // Return the size of actually allocated memory when allocation failed,
+  // and 0 on success.
+  Index info = Base::memInit(m, n, nnz, lwork, m_perfv.fillfactor, m_perfv.panel_size, m_glu);
+  if (info) {
+    m_lastError = "UNABLE TO ALLOCATE WORKING MEMORY\n\n";
     m_factorizationIsOk = false;
-    return ; 
+    return;
   }
-  
-  // Set up pointers for integer working arrays 
-  IndexVector segrep(m); segrep.setZero();
-  IndexVector parent(m); parent.setZero();
-  IndexVector xplore(m); xplore.setZero();
+
+  // Set up pointers for integer working arrays
+  IndexVector segrep(m);
+  segrep.setZero();
+  IndexVector parent(m);
+  parent.setZero();
+  IndexVector xplore(m);
+  xplore.setZero();
   IndexVector repfnz(maxpanel);
   IndexVector panel_lsub(maxpanel);
-  IndexVector xprune(n); xprune.setZero();
-  IndexVector marker(m*internal::LUNoMarker); marker.setZero();
-  
-  repfnz.setConstant(-1); 
+  IndexVector xprune(n);
+  xprune.setZero();
+  IndexVector marker(m * internal::LUNoMarker);
+  marker.setZero();
+
+  repfnz.setConstant(-1);
   panel_lsub.setConstant(-1);
-  
-  // Set up pointers for scalar working arrays 
-  ScalarVector dense; 
+
+  // Set up pointers for scalar working arrays
+  ScalarVector dense;
   dense.setZero(maxpanel);
-  ScalarVector tempv; 
-  tempv.setZero(internal::LUnumTempV(m, m_perfv.panel_size, m_perfv.maxsuper, /*m_perfv.rowblk*/m) );
-  
+  ScalarVector tempv;
+  tempv.setZero(internal::LUnumTempV(m, m_perfv.panel_size, m_perfv.maxsuper, /*m_perfv.rowblk*/ m));
+
   // Compute the inverse of perm_c
-  PermutationType iperm_c(m_perm_c.inverse()); 
-  
+  PermutationType iperm_c(m_perm_c.inverse());
+
   // Identify initial relaxed snodes
   IndexVector relax_end(n);
-  if ( m_symmetricmode == true ) 
+  if (m_symmetricmode == true)
     Base::heap_relax_snode(n, m_etree, m_perfv.relax, marker, relax_end);
   else
     Base::relax_snode(n, m_etree, m_perfv.relax, marker, relax_end);
-  
-  
-  m_perm_r.resize(m); 
+
+  m_perm_r.resize(m);
   m_perm_r.indices().setConstant(-1);
   marker.setConstant(-1);
-  m_detPermR = 1; // Record the determinant of the row permutation
-  
-  m_glu.supno(0) = emptyIdxLU; m_glu.xsup.setConstant(0);
+  m_detPermR = 1;  // Record the determinant of the row permutation
+
+  m_glu.supno(0) = emptyIdxLU;
+  m_glu.xsup.setConstant(0);
   m_glu.xsup(0) = m_glu.xlsub(0) = m_glu.xusub(0) = m_glu.xlusup(0) = Index(0);
-  
+
   // Work on one 'panel' at a time. A panel is one of the following :
   //  (a) a relaxed supernode at the bottom of the etree, or
   //  (b) panel_size contiguous columns, <panel_size> defined by the user
-  Index jcol; 
-  IndexVector panel_histo(n);
-  Index pivrow; // Pivotal row number in the original row matrix
-  Index nseg1; // Number of segments in U-column above panel row jcol
-  Index nseg; // Number of segments in each U-column 
-  Index irep; 
-  Index i, k, jj; 
-  for (jcol = 0; jcol < n; )
-  {
-    // Adjust panel size so that a panel won't overlap with the next relaxed snode. 
-    Index panel_size = m_perfv.panel_size; // upper bound on panel width
-    for (k = jcol + 1; k < (std::min)(jcol+panel_size, n); k++)
-    {
-      if (relax_end(k) != emptyIdxLU) 
-      {
-        panel_size = k - jcol; 
-        break; 
+  Index jcol;
+  Index pivrow;  // Pivotal row number in the original row matrix
+  Index nseg1;   // Number of segments in U-column above panel row jcol
+  Index nseg;    // Number of segments in each U-column
+  Index irep;
+  Index i, k, jj;
+  for (jcol = 0; jcol < n;) {
+    // Adjust panel size so that a panel won't overlap with the next relaxed snode.
+    Index panel_size = m_perfv.panel_size;  // upper bound on panel width
+    for (k = jcol + 1; k < (std::min)(jcol + panel_size, n); k++) {
+      if (relax_end(k) != emptyIdxLU) {
+        panel_size = k - jcol;
+        break;
       }
     }
-    if (k == n) 
-      panel_size = n - jcol; 
-      
-    // Symbolic outer factorization on a panel of columns 
-    Base::panel_dfs(m, panel_size, jcol, m_mat, m_perm_r.indices(), nseg1, dense, panel_lsub, segrep, repfnz, xprune, marker, parent, xplore, m_glu); 
-    
-    // Numeric sup-panel updates in topological order 
-    Base::panel_bmod(m, panel_size, jcol, nseg1, dense, tempv, segrep, repfnz, m_glu); 
-    
-    // Sparse LU within the panel, and below the panel diagonal 
-    for ( jj = jcol; jj< jcol + panel_size; jj++) 
-    {
-      k = (jj - jcol) * m; // Column index for w-wide arrays 
-      
-      nseg = nseg1; // begin after all the panel segments
-      //Depth-first-search for the current column
+    if (k == n) panel_size = n - jcol;
+
+    // Symbolic outer factorization on a panel of columns
+    Base::panel_dfs(m, panel_size, jcol, m_mat, m_perm_r.indices(), nseg1, dense, panel_lsub, segrep, repfnz, xprune,
+                    marker, parent, xplore, m_glu);
+
+    // Numeric sup-panel updates in topological order
+    Base::panel_bmod(m, panel_size, jcol, nseg1, dense, tempv, segrep, repfnz, m_glu);
+
+    // Sparse LU within the panel, and below the panel diagonal
+    for (jj = jcol; jj < jcol + panel_size; jj++) {
+      k = (jj - jcol) * m;  // Column index for w-wide arrays
+
+      nseg = nseg1;  // begin after all the panel segments
+      // Depth-first-search for the current column
       VectorBlock<IndexVector> panel_lsubk(panel_lsub, k, m);
-      VectorBlock<IndexVector> repfnz_k(repfnz, k, m); 
-      info = Base::column_dfs(m, jj, m_perm_r.indices(), m_perfv.maxsuper, nseg, panel_lsubk, segrep, repfnz_k, xprune, marker, parent, xplore, m_glu); 
-      if ( info ) 
-      {
-        m_lastError =  "UNABLE TO EXPAND MEMORY IN COLUMN_DFS() ";
-        m_info = NumericalIssue; 
-        m_factorizationIsOk = false; 
-        return; 
+      VectorBlock<IndexVector> repfnz_k(repfnz, k, m);
+      // Return 0 on success and > 0 number of bytes allocated when run out of space.
+      info = Base::column_dfs(m, jj, m_perm_r.indices(), m_perfv.maxsuper, nseg, panel_lsubk, segrep, repfnz_k, xprune,
+                              marker, parent, xplore, m_glu);
+      if (info) {
+        m_lastError = "UNABLE TO EXPAND MEMORY IN COLUMN_DFS() ";
+        m_info = NumericalIssue;
+        m_factorizationIsOk = false;
+        return;
       }
-      // Numeric updates to this column 
-      VectorBlock<ScalarVector> dense_k(dense, k, m); 
-      VectorBlock<IndexVector> segrep_k(segrep, nseg1, m-nseg1); 
-      info = Base::column_bmod(jj, (nseg - nseg1), dense_k, tempv, segrep_k, repfnz_k, jcol, m_glu); 
-      if ( info ) 
-      {
+      // Numeric updates to this column
+      VectorBlock<ScalarVector> dense_k(dense, k, m);
+      VectorBlock<IndexVector> segrep_k(segrep, nseg1, m - nseg1);
+      // Return 0 on success and > 0 number of bytes allocated when run out of space.
+      info = Base::column_bmod(jj, (nseg - nseg1), dense_k, tempv, segrep_k, repfnz_k, jcol, m_glu);
+      if (info) {
         m_lastError = "UNABLE TO EXPAND MEMORY IN COLUMN_BMOD() ";
-        m_info = NumericalIssue; 
-        m_factorizationIsOk = false; 
-        return; 
+        m_info = NumericalIssue;
+        m_factorizationIsOk = false;
+        return;
       }
-      
+
       // Copy the U-segments to ucol(*)
-      info = Base::copy_to_ucol(jj, nseg, segrep, repfnz_k ,m_perm_r.indices(), dense_k, m_glu); 
-      if ( info ) 
-      {
+      // Return 0 on success and > 0 number of bytes allocated when run out of space.
+      info = Base::copy_to_ucol(jj, nseg, segrep, repfnz_k, m_perm_r.indices(), dense_k, m_glu);
+      if (info) {
         m_lastError = "UNABLE TO EXPAND MEMORY IN COPY_TO_UCOL() ";
-        m_info = NumericalIssue; 
-        m_factorizationIsOk = false; 
-        return; 
+        m_info = NumericalIssue;
+        m_factorizationIsOk = false;
+        return;
       }
-      
-      // Form the L-segment 
+
+      // Form the L-segment
+      // Return O if success, i > 0 if U(i, i) is exactly zero.
       info = Base::pivotL(jj, m_diagpivotthresh, m_perm_r.indices(), iperm_c.indices(), pivrow, m_glu);
-      if ( info ) 
-      {
-        m_lastError = "THE MATRIX IS STRUCTURALLY SINGULAR ... ZERO COLUMN AT ";
+      if (info) {
+        m_lastError = "THE MATRIX IS STRUCTURALLY SINGULAR";
+#ifndef EIGEN_NO_IO
         std::ostringstream returnInfo;
-        returnInfo << info; 
+        returnInfo << " ... ZERO COLUMN AT ";
+        returnInfo << info;
         m_lastError += returnInfo.str();
-        m_info = NumericalIssue; 
-        m_factorizationIsOk = false; 
-        return; 
+#endif
+        m_info = NumericalIssue;
+        m_factorizationIsOk = false;
+        return;
       }
-      
+
       // Update the determinant of the row permutation matrix
-      // FIXME: the following test is not correct, we should probably take iperm_c into account and pivrow is not directly the row pivot.
+      // FIXME: the following test is not correct, we should probably take iperm_c into account and pivrow is not
+      // directly the row pivot.
       if (pivrow != jj) m_detPermR = -m_detPermR;
 
       // Prune columns (0:jj-1) using column jj
-      Base::pruneL(jj, m_perm_r.indices(), pivrow, nseg, segrep, repfnz_k, xprune, m_glu); 
-      
-      // Reset repfnz for this column 
-      for (i = 0; i < nseg; i++)
-      {
-        irep = segrep(i); 
-        repfnz_k(irep) = emptyIdxLU; 
+      Base::pruneL(jj, m_perm_r.indices(), pivrow, nseg, segrep, repfnz_k, xprune, m_glu);
+
+      // Reset repfnz for this column
+      for (i = 0; i < nseg; i++) {
+        irep = segrep(i);
+        repfnz_k(irep) = emptyIdxLU;
       }
-    } // end SparseLU within the panel  
+    }                    // end SparseLU within the panel
     jcol += panel_size;  // Move to the next panel
-  } // end for -- end elimination 
-  
+  }                      // end for -- end elimination
+
   m_detPermR = m_perm_r.determinant();
   m_detPermC = m_perm_c.determinant();
-  
-  // Count the number of nonzeros in factors 
-  Base::countnz(n, m_nnzL, m_nnzU, m_glu); 
-  // Apply permutation  to the L subscripts 
+
+  // Count the number of nonzeros in factors
+  Base::countnz(n, m_nnzL, m_nnzU, m_glu);
+  // Apply permutation  to the L subscripts
   Base::fixupL(n, m_perm_r.indices(), m_glu);
-  
-  // Create supernode matrix L 
-  m_Lstore.setInfos(m, n, m_glu.lusup, m_glu.xlusup, m_glu.lsub, m_glu.xlsub, m_glu.supno, m_glu.xsup); 
-  // Create the column major upper sparse matrix  U; 
-  new (&m_Ustore) MappedSparseMatrix<Scalar, ColMajor, Index> ( m, n, m_nnzU, m_glu.xusub.data(), m_glu.usub.data(), m_glu.ucol.data() ); 
-  
+
+  // Create supernode matrix L
+  m_Lstore.setInfos(m, n, m_glu.lusup, m_glu.xlusup, m_glu.lsub, m_glu.xlsub, m_glu.supno, m_glu.xsup);
+  // Create the column major upper sparse matrix  U;
+  new (&m_Ustore) Map<SparseMatrix<Scalar, ColMajor, StorageIndex>>(m, n, m_nnzU, m_glu.xusub.data(), m_glu.usub.data(),
+                                                                    m_glu.ucol.data());
+
   m_info = Success;
   m_factorizationIsOk = true;
 }
 
-template<typename MappedSupernodalType>
-struct SparseLUMatrixLReturnType : internal::no_assignment_operator
-{
-  typedef typename MappedSupernodalType::Index Index;
+template <typename MappedSupernodalType>
+struct SparseLUMatrixLReturnType : internal::no_assignment_operator {
   typedef typename MappedSupernodalType::Scalar Scalar;
-  SparseLUMatrixLReturnType(const MappedSupernodalType& mapL) : m_mapL(mapL)
-  { }
-  Index rows() { return m_mapL.rows(); }
-  Index cols() { return m_mapL.cols(); }
-  template<typename Dest>
-  void solveInPlace( MatrixBase<Dest> &X) const
-  {
+  explicit SparseLUMatrixLReturnType(const MappedSupernodalType& mapL) : m_mapL(mapL) {}
+  Index rows() const { return m_mapL.rows(); }
+  Index cols() const { return m_mapL.cols(); }
+  template <typename Dest>
+  void solveInPlace(MatrixBase<Dest>& X) const {
     m_mapL.solveInPlace(X);
   }
+  template <bool Conjugate, typename Dest>
+  void solveTransposedInPlace(MatrixBase<Dest>& X) const {
+    m_mapL.template solveTransposedInPlace<Conjugate>(X);
+  }
+
+  SparseMatrix<Scalar, ColMajor, Index> toSparse() const {
+    ArrayXi colCount = ArrayXi::Ones(cols());
+    for (Index i = 0; i < cols(); i++) {
+      typename MappedSupernodalType::InnerIterator iter(m_mapL, i);
+      for (; iter; ++iter) {
+        if (iter.row() > iter.col()) {
+          colCount(iter.col())++;
+        }
+      }
+    }
+    SparseMatrix<Scalar, ColMajor, Index> sL(rows(), cols());
+    sL.reserve(colCount);
+    for (Index i = 0; i < cols(); i++) {
+      sL.insert(i, i) = 1.0;
+      typename MappedSupernodalType::InnerIterator iter(m_mapL, i);
+      for (; iter; ++iter) {
+        if (iter.row() > iter.col()) {
+          sL.insert(iter.row(), iter.col()) = iter.value();
+        }
+      }
+    }
+    sL.makeCompressed();
+    return sL;
+  }
+
   const MappedSupernodalType& m_mapL;
 };
 
-template<typename MatrixLType, typename MatrixUType>
-struct SparseLUMatrixUReturnType : internal::no_assignment_operator
-{
-  typedef typename MatrixLType::Index Index;
+template <typename MatrixLType, typename MatrixUType>
+struct SparseLUMatrixUReturnType : internal::no_assignment_operator {
   typedef typename MatrixLType::Scalar Scalar;
-  SparseLUMatrixUReturnType(const MatrixLType& mapL, const MatrixUType& mapU)
-  : m_mapL(mapL),m_mapU(mapU)
-  { }
-  Index rows() { return m_mapL.rows(); }
-  Index cols() { return m_mapL.cols(); }
-
-  template<typename Dest>   void solveInPlace(MatrixBase<Dest> &X) const
-  {
+  SparseLUMatrixUReturnType(const MatrixLType& mapL, const MatrixUType& mapU) : m_mapL(mapL), m_mapU(mapU) {}
+  Index rows() const { return m_mapL.rows(); }
+  Index cols() const { return m_mapL.cols(); }
+
+  template <typename Dest>
+  void solveInPlace(MatrixBase<Dest>& X) const {
     Index nrhs = X.cols();
-    Index n = X.rows();
     // Backward solve with U
-    for (Index k = m_mapL.nsuper(); k >= 0; k--)
-    {
+    for (Index k = m_mapL.nsuper(); k >= 0; k--) {
       Index fsupc = m_mapL.supToCol()[k];
-      Index lda = m_mapL.colIndexPtr()[fsupc+1] - m_mapL.colIndexPtr()[fsupc]; // leading dimension
-      Index nsupc = m_mapL.supToCol()[k+1] - fsupc;
+      Index lda = m_mapL.colIndexPtr()[fsupc + 1] - m_mapL.colIndexPtr()[fsupc];  // leading dimension
+      Index nsupc = m_mapL.supToCol()[k + 1] - fsupc;
       Index luptr = m_mapL.colIndexPtr()[fsupc];
 
-      if (nsupc == 1)
-      {
-        for (Index j = 0; j < nrhs; j++)
-        {
+      if (nsupc == 1) {
+        for (Index j = 0; j < nrhs; j++) {
           X(fsupc, j) /= m_mapL.valuePtr()[luptr];
         }
-      }
-      else
-      {
-        Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(m_mapL.valuePtr()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
-        Map< Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) );
+      } else {
+        // FIXME: the following lines should use Block expressions and not Map!
+        Map<const Matrix<Scalar, Dynamic, Dynamic, ColMajor>, 0, OuterStride<>> A(&(m_mapL.valuePtr()[luptr]), nsupc,
+                                                                                  nsupc, OuterStride<>(lda));
+        typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc);
         U = A.template triangularView<Upper>().solve(U);
       }
 
-      for (Index j = 0; j < nrhs; ++j)
-      {
-        for (Index jcol = fsupc; jcol < fsupc + nsupc; jcol++)
-        {
+      for (Index j = 0; j < nrhs; ++j) {
+        for (Index jcol = fsupc; jcol < fsupc + nsupc; jcol++) {
           typename MatrixUType::InnerIterator it(m_mapU, jcol);
-          for ( ; it; ++it)
-          {
+          for (; it; ++it) {
             Index irow = it.index();
             X(irow, j) -= X(jcol, j) * it.value();
           }
         }
       }
-    } // End For U-solve
+    }  // End For U-solve
   }
-  const MatrixLType& m_mapL;
-  const MatrixUType& m_mapU;
-};
 
-namespace internal {
-  
-template<typename _MatrixType, typename Derived, typename Rhs>
-struct solve_retval<SparseLU<_MatrixType,Derived>, Rhs>
-  : solve_retval_base<SparseLU<_MatrixType,Derived>, Rhs>
-{
-  typedef SparseLU<_MatrixType,Derived> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
+  template <bool Conjugate, typename Dest>
+  void solveTransposedInPlace(MatrixBase<Dest>& X) const {
+    using numext::conj;
+    Index nrhs = X.cols();
+    // Forward solve with U
+    for (Index k = 0; k <= m_mapL.nsuper(); k++) {
+      Index fsupc = m_mapL.supToCol()[k];
+      Index lda = m_mapL.colIndexPtr()[fsupc + 1] - m_mapL.colIndexPtr()[fsupc];  // leading dimension
+      Index nsupc = m_mapL.supToCol()[k + 1] - fsupc;
+      Index luptr = m_mapL.colIndexPtr()[fsupc];
+
+      for (Index j = 0; j < nrhs; ++j) {
+        for (Index jcol = fsupc; jcol < fsupc + nsupc; jcol++) {
+          typename MatrixUType::InnerIterator it(m_mapU, jcol);
+          for (; it; ++it) {
+            Index irow = it.index();
+            X(jcol, j) -= X(irow, j) * (Conjugate ? conj(it.value()) : it.value());
+          }
+        }
+      }
+      if (nsupc == 1) {
+        for (Index j = 0; j < nrhs; j++) {
+          X(fsupc, j) /= (Conjugate ? conj(m_mapL.valuePtr()[luptr]) : m_mapL.valuePtr()[luptr]);
+        }
+      } else {
+        Map<const Matrix<Scalar, Dynamic, Dynamic, ColMajor>, 0, OuterStride<>> A(&(m_mapL.valuePtr()[luptr]), nsupc,
+                                                                                  nsupc, OuterStride<>(lda));
+        typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc);
+        if (Conjugate)
+          U = A.adjoint().template triangularView<Lower>().solve(U);
+        else
+          U = A.transpose().template triangularView<Lower>().solve(U);
+      }
+    }  // End For U-solve
   }
-};
 
-template<typename _MatrixType, typename Derived, typename Rhs>
-struct sparse_solve_retval<SparseLU<_MatrixType,Derived>, Rhs>
-  : sparse_solve_retval_base<SparseLU<_MatrixType,Derived>, Rhs>
-{
-  typedef SparseLU<_MatrixType,Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
+  SparseMatrix<Scalar, RowMajor, Index> toSparse() {
+    ArrayXi rowCount = ArrayXi::Zero(rows());
+    for (Index i = 0; i < cols(); i++) {
+      typename MatrixLType::InnerIterator iter(m_mapL, i);
+      for (; iter; ++iter) {
+        if (iter.row() <= iter.col()) {
+          rowCount(iter.row())++;
+        }
+      }
+    }
 
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
+    SparseMatrix<Scalar, RowMajor, Index> sU(rows(), cols());
+    sU.reserve(rowCount);
+    for (Index i = 0; i < cols(); i++) {
+      typename MatrixLType::InnerIterator iter(m_mapL, i);
+      for (; iter; ++iter) {
+        if (iter.row() <= iter.col()) {
+          sU.insert(iter.row(), iter.col()) = iter.value();
+        }
+      }
+    }
+    sU.makeCompressed();
+    const SparseMatrix<Scalar, RowMajor, Index> u = m_mapU;  // convert to RowMajor
+    sU += u;
+    return sU;
   }
+
+  const MatrixLType& m_mapL;
+  const MatrixUType& m_mapU;
 };
-} // end namespace internal
 
-} // End namespace Eigen 
+}  // End namespace Eigen
 
 #endif
diff --git a/inst/include/Eigen/src/SparseLU/SparseLUImpl.h b/inst/include/Eigen/src/SparseLU/SparseLUImpl.h
index 99d651e4..96b9c651 100644
--- a/inst/include/Eigen/src/SparseLU/SparseLUImpl.h
+++ b/inst/include/Eigen/src/SparseLU/SparseLUImpl.h
@@ -9,58 +9,71 @@
 #ifndef SPARSELU_IMPL_H
 #define SPARSELU_IMPL_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 namespace internal {
-  
+
 /** \ingroup SparseLU_Module
-  * \class SparseLUImpl
-  * Base class for sparseLU
-  */
-template <typename Scalar, typename Index>
-class SparseLUImpl
-{
-  public:
-    typedef Matrix<Scalar,Dynamic,1> ScalarVector;
-    typedef Matrix<Scalar,Dynamic,Dynamic,ColMajor> ScalarMatrix;
-    typedef Map<ScalarMatrix, 0,  OuterStride<> > MappedMatrixBlock;
-    typedef Matrix<Index,Dynamic,1> IndexVector; 
-    typedef typename ScalarVector::RealScalar RealScalar; 
-    typedef Ref<Matrix<Scalar,Dynamic,1> > BlockScalarVector;
-    typedef Ref<Matrix<Index,Dynamic,1> > BlockIndexVector;
-    typedef LU_GlobalLU_t<IndexVector, ScalarVector> GlobalLU_t; 
-    typedef SparseMatrix<Scalar,ColMajor,Index> MatrixType; 
-    
-  protected:
-     template <typename VectorType>
-     Index expand(VectorType& vec, Index& length, Index nbElts, Index keep_prev, Index& num_expansions);
-     Index memInit(Index m, Index n, Index annz, Index lwork, Index fillratio, Index panel_size,  GlobalLU_t& glu); 
-     template <typename VectorType>
-     Index memXpand(VectorType& vec, Index& maxlen, Index nbElts, MemType memtype, Index& num_expansions);
-     void heap_relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end); 
-     void relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end); 
-     Index snode_dfs(const Index jcol, const Index kcol,const MatrixType& mat,  IndexVector& xprune, IndexVector& marker, GlobalLU_t& glu); 
-     Index snode_bmod (const Index jcol, const Index fsupc, ScalarVector& dense, GlobalLU_t& glu);
-     Index pivotL(const Index jcol, const RealScalar& diagpivotthresh, IndexVector& perm_r, IndexVector& iperm_c, Index& pivrow, GlobalLU_t& glu);
-     template <typename Traits>
-     void dfs_kernel(const Index jj, IndexVector& perm_r,
-                    Index& nseg, IndexVector& panel_lsub, IndexVector& segrep,
-                    Ref<IndexVector> repfnz_col, IndexVector& xprune, Ref<IndexVector> marker, IndexVector& parent,
-                    IndexVector& xplore, GlobalLU_t& glu, Index& nextl_col, Index krow, Traits& traits);
-     void panel_dfs(const Index m, const Index w, const Index jcol, MatrixType& A, IndexVector& perm_r, Index& nseg, ScalarVector& dense, IndexVector& panel_lsub, IndexVector& segrep, IndexVector& repfnz, IndexVector& xprune, IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu);
-    
-     void panel_bmod(const Index m, const Index w, const Index jcol, const Index nseg, ScalarVector& dense, ScalarVector& tempv, IndexVector& segrep, IndexVector& repfnz, GlobalLU_t& glu);
-     Index column_dfs(const Index m, const Index jcol, IndexVector& perm_r, Index maxsuper, Index& nseg,  BlockIndexVector lsub_col, IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune, IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu);
-     Index column_bmod(const Index jcol, const Index nseg, BlockScalarVector dense, ScalarVector& tempv, BlockIndexVector segrep, BlockIndexVector repfnz, Index fpanelc, GlobalLU_t& glu); 
-     Index copy_to_ucol(const Index jcol, const Index nseg, IndexVector& segrep, BlockIndexVector repfnz ,IndexVector& perm_r, BlockScalarVector dense, GlobalLU_t& glu); 
-     void pruneL(const Index jcol, const IndexVector& perm_r, const Index pivrow, const Index nseg, const IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune, GlobalLU_t& glu);
-     void countnz(const Index n, Index& nnzL, Index& nnzU, GlobalLU_t& glu); 
-     void fixupL(const Index n, const IndexVector& perm_r, GlobalLU_t& glu); 
-     
-     template<typename , typename >
-     friend struct column_dfs_traits;
-}; 
+ * \class SparseLUImpl
+ * Base class for sparseLU
+ */
+template <typename Scalar, typename StorageIndex>
+class SparseLUImpl {
+ public:
+  typedef Matrix<Scalar, Dynamic, 1> ScalarVector;
+  typedef Matrix<StorageIndex, Dynamic, 1> IndexVector;
+  typedef Matrix<Scalar, Dynamic, Dynamic, ColMajor> ScalarMatrix;
+  typedef Map<ScalarMatrix, 0, OuterStride<> > MappedMatrixBlock;
+  typedef typename ScalarVector::RealScalar RealScalar;
+  typedef Ref<Matrix<Scalar, Dynamic, 1> > BlockScalarVector;
+  typedef Ref<Matrix<StorageIndex, Dynamic, 1> > BlockIndexVector;
+  typedef LU_GlobalLU_t<IndexVector, ScalarVector> GlobalLU_t;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex> MatrixType;
+
+ protected:
+  template <typename VectorType>
+  Index expand(VectorType& vec, Index& length, Index nbElts, Index keep_prev, Index& num_expansions);
+  Index memInit(Index m, Index n, Index annz, Index lwork, Index fillratio, Index panel_size, GlobalLU_t& glu);
+  template <typename VectorType>
+  Index memXpand(VectorType& vec, Index& maxlen, Index nbElts, MemType memtype, Index& num_expansions);
+  void heap_relax_snode(const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants,
+                        IndexVector& relax_end);
+  void relax_snode(const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants,
+                   IndexVector& relax_end);
+  Index snode_dfs(const Index jcol, const Index kcol, const MatrixType& mat, IndexVector& xprune, IndexVector& marker,
+                  GlobalLU_t& glu);
+  Index snode_bmod(const Index jcol, const Index fsupc, ScalarVector& dense, GlobalLU_t& glu);
+  Index pivotL(const Index jcol, const RealScalar& diagpivotthresh, IndexVector& perm_r, IndexVector& iperm_c,
+               Index& pivrow, GlobalLU_t& glu);
+  template <typename Traits>
+  void dfs_kernel(const StorageIndex jj, IndexVector& perm_r, Index& nseg, IndexVector& panel_lsub, IndexVector& segrep,
+                  Ref<IndexVector> repfnz_col, IndexVector& xprune, Ref<IndexVector> marker, IndexVector& parent,
+                  IndexVector& xplore, GlobalLU_t& glu, Index& nextl_col, Index krow, Traits& traits);
+  void panel_dfs(const Index m, const Index w, const Index jcol, MatrixType& A, IndexVector& perm_r, Index& nseg,
+                 ScalarVector& dense, IndexVector& panel_lsub, IndexVector& segrep, IndexVector& repfnz,
+                 IndexVector& xprune, IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu);
+
+  void panel_bmod(const Index m, const Index w, const Index jcol, const Index nseg, ScalarVector& dense,
+                  ScalarVector& tempv, IndexVector& segrep, IndexVector& repfnz, GlobalLU_t& glu);
+  Index column_dfs(const Index m, const Index jcol, IndexVector& perm_r, Index maxsuper, Index& nseg,
+                   BlockIndexVector lsub_col, IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune,
+                   IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu);
+  Index column_bmod(const Index jcol, const Index nseg, BlockScalarVector dense, ScalarVector& tempv,
+                    BlockIndexVector segrep, BlockIndexVector repfnz, Index fpanelc, GlobalLU_t& glu);
+  Index copy_to_ucol(const Index jcol, const Index nseg, IndexVector& segrep, BlockIndexVector repfnz,
+                     IndexVector& perm_r, BlockScalarVector dense, GlobalLU_t& glu);
+  void pruneL(const Index jcol, const IndexVector& perm_r, const Index pivrow, const Index nseg,
+              const IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune, GlobalLU_t& glu);
+  void countnz(const Index n, Index& nnzL, Index& nnzU, GlobalLU_t& glu);
+  void fixupL(const Index n, const IndexVector& perm_r, GlobalLU_t& glu);
+
+  template <typename, typename>
+  friend struct column_dfs_traits;
+};
 
-} // end namespace internal
-} // namespace Eigen
+}  // end namespace internal
+}  // namespace Eigen
 
 #endif
diff --git a/inst/include/Eigen/src/SparseLU/SparseLU_Memory.h b/inst/include/Eigen/src/SparseLU/SparseLU_Memory.h
index 45f96d16..22affd22 100644
--- a/inst/include/Eigen/src/SparseLU/SparseLU_Memory.h
+++ b/inst/include/Eigen/src/SparseLU/SparseLU_Memory.h
@@ -7,10 +7,10 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-/* 
- 
- * NOTE: This file is the modified version of [s,d,c,z]memory.c files in SuperLU 
- 
+/*
+
+ * NOTE: This file is the modified version of [s,d,c,z]memory.c files in SuperLU
+
  * -- SuperLU routine (version 3.1) --
  * Univ. of California Berkeley, Xerox Palo Alto Research Center,
  * and Lawrence Berkeley National Lab.
@@ -31,197 +31,180 @@
 #ifndef EIGEN_SPARSELU_MEMORY
 #define EIGEN_SPARSELU_MEMORY
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 namespace internal {
-  
+
 enum { LUNoMarker = 3 };
-enum {emptyIdxLU = -1};
-template<typename Index>
-inline Index LUnumTempV(Index& m, Index& w, Index& t, Index& b)
-{
-  return (std::max)(m, (t+b)*w);
-}
+enum { emptyIdxLU = -1 };
+inline Index LUnumTempV(Index& m, Index& w, Index& t, Index& b) { return (std::max)(m, (t + b) * w); }
 
-template< typename Scalar, typename Index>
-inline Index LUTempSpace(Index&m, Index& w)
-{
-  return (2*w + 4 + LUNoMarker) * m * sizeof(Index) + (w + 1) * m * sizeof(Scalar);
+template <typename Scalar>
+inline Index LUTempSpace(Index& m, Index& w) {
+  return (2 * w + 4 + LUNoMarker) * m * sizeof(Index) + (w + 1) * m * sizeof(Scalar);
 }
 
+/**
+ * Expand the existing storage to accommodate more fill-ins
+ * \param vec Valid pointer to the vector to allocate or expand
+ * \param[in,out] length  At input, contain the current length of the vector that is to be increased. At output, length
+ * of the newly allocated vector \param[in] nbElts Current number of elements in the factors \param keep_prev  1: use
+ * length  and do not expand the vector; 0: compute new_len and expand \param[in,out] num_expansions Number of times the
+ * memory has been expanded
+ */
+template <typename Scalar, typename StorageIndex>
+template <typename VectorType>
+Index SparseLUImpl<Scalar, StorageIndex>::expand(VectorType& vec, Index& length, Index nbElts, Index keep_prev,
+                                                 Index& num_expansions) {
+  float alpha = 1.5;  // Ratio of the memory increase
+  Index new_len;      // New size of the allocated memory
 
+  if (num_expansions == 0 || keep_prev)
+    new_len = length;  // First time allocate requested
+  else
+    new_len = (std::max)(length + 1, Index(alpha * length));
 
+  VectorType old_vec;  // Temporary vector to hold the previous values
+  if (nbElts > 0) old_vec = vec.segment(0, nbElts);
 
-/** 
-  * Expand the existing storage to accomodate more fill-ins
-  * \param vec Valid pointer to the vector to allocate or expand
-  * \param[in,out] length  At input, contain the current length of the vector that is to be increased. At output, length of the newly allocated vector
-  * \param[in] nbElts Current number of elements in the factors
-  * \param keep_prev  1: use length  and do not expand the vector; 0: compute new_len and expand
-  * \param[in,out] num_expansions Number of times the memory has been expanded
-  */
-template <typename Scalar, typename Index>
-template <typename VectorType>
-Index  SparseLUImpl<Scalar,Index>::expand(VectorType& vec, Index& length, Index nbElts, Index keep_prev, Index& num_expansions) 
-{
-  
-  float alpha = 1.5; // Ratio of the memory increase 
-  Index new_len; // New size of the allocated memory
-  
-  if(num_expansions == 0 || keep_prev) 
-    new_len = length ; // First time allocate requested
-  else 
-    new_len = (std::max)(length+1,Index(alpha * length));
-  
-  VectorType old_vec; // Temporary vector to hold the previous values   
-  if (nbElts > 0 )
-    old_vec = vec.segment(0,nbElts); 
-  
-  //Allocate or expand the current vector
+    // Allocate or expand the current vector
 #ifdef EIGEN_EXCEPTIONS
   try
 #endif
   {
-    vec.resize(new_len); 
+    vec.resize(new_len);
   }
 #ifdef EIGEN_EXCEPTIONS
-  catch(std::bad_alloc& )
+  catch (std::bad_alloc&)
 #else
-  if(!vec.size())
+  if (!vec.size())
 #endif
   {
-    if (!num_expansions)
-    {
+    if (!num_expansions) {
       // First time to allocate from LUMemInit()
       // Let LUMemInit() deals with it.
       return -1;
     }
-    if (keep_prev)
-    {
+    if (keep_prev) {
       // In this case, the memory length should not not be reduced
       return new_len;
-    }
-    else 
-    {
-      // Reduce the size and increase again 
-      Index tries = 0; // Number of attempts
-      do 
-      {
-        alpha = (alpha + 1)/2;
-        new_len = (std::max)(length+1,Index(alpha * length));
+    } else {
+      // Reduce the size and increase again
+      Index tries = 0;  // Number of attempts
+      do {
+        alpha = (alpha + 1) / 2;
+        new_len = (std::max)(length + 1, Index(alpha * length));
 #ifdef EIGEN_EXCEPTIONS
         try
 #endif
         {
-          vec.resize(new_len); 
+          vec.resize(new_len);
         }
 #ifdef EIGEN_EXCEPTIONS
-        catch(std::bad_alloc& )
+        catch (std::bad_alloc&)
 #else
         if (!vec.size())
 #endif
         {
-          tries += 1; 
-          if ( tries > 10) return new_len; 
+          tries += 1;
+          if (tries > 10) return new_len;
         }
       } while (!vec.size());
     }
   }
-  //Copy the previous values to the newly allocated space 
-  if (nbElts > 0)
-    vec.segment(0, nbElts) = old_vec;   
-   
-  
-  length  = new_len;
-  if(num_expansions) ++num_expansions;
-  return 0; 
+  // Copy the previous values to the newly allocated space
+  if (nbElts > 0) vec.segment(0, nbElts) = old_vec;
+
+  length = new_len;
+  if (num_expansions) ++num_expansions;
+  return 0;
 }
 
 /**
  * \brief  Allocate various working space for the numerical factorization phase.
- * \param m number of rows of the input matrix 
- * \param n number of columns 
- * \param annz number of initial nonzeros in the matrix 
+ * \param m number of rows of the input matrix
+ * \param n number of columns
+ * \param annz number of initial nonzeros in the matrix
  * \param lwork  if lwork=-1, this routine returns an estimated size of the required memory
  * \param glu persistent data to facilitate multiple factors : will be deleted later ??
  * \param fillratio estimated ratio of fill in the factors
  * \param panel_size Size of a panel
- * \return an estimated size of the required memory if lwork = -1; otherwise, return the size of actually allocated memory when allocation failed, and 0 on success
- * \note Unlike SuperLU, this routine does not support successive factorization with the same pattern and the same row permutation
+ * \return an estimated size of the required memory if lwork = -1; otherwise, return the size of actually allocated
+ * memory when allocation failed, and 0 on success \note Unlike SuperLU, this routine does not support successive
+ * factorization with the same pattern and the same row permutation
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::memInit(Index m, Index n, Index annz, Index lwork, Index fillratio, Index panel_size,  GlobalLU_t& glu)
-{
-  Index& num_expansions = glu.num_expansions; //No memory expansions so far
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar, StorageIndex>::memInit(Index m, Index n, Index annz, Index lwork, Index fillratio,
+                                                  Index panel_size, GlobalLU_t& glu) {
+  Index& num_expansions = glu.num_expansions;  // No memory expansions so far
   num_expansions = 0;
-  glu.nzumax = glu.nzlumax = (std::min)(fillratio * (annz+1) / n, m) * n; // estimated number of nonzeros in U 
-  glu.nzlmax = (std::max)(Index(4), fillratio) * (annz+1) / 4; // estimated  nnz in L factor
+  glu.nzumax = glu.nzlumax = (std::min)(fillratio * (annz + 1) / n, m) * n;  // estimated number of nonzeros in U
+  glu.nzlmax = (std::max)(Index(4), fillratio) * (annz + 1) / 4;             // estimated  nnz in L factor
   // Return the estimated size to the user if necessary
   Index tempSpace;
-  tempSpace = (2*panel_size + 4 + LUNoMarker) * m * sizeof(Index) + (panel_size + 1) * m * sizeof(Scalar);
-  if (lwork == emptyIdxLU) 
-  {
+  tempSpace = (2 * panel_size + 4 + LUNoMarker) * m * sizeof(Index) + (panel_size + 1) * m * sizeof(Scalar);
+  if (lwork == emptyIdxLU) {
     Index estimated_size;
-    estimated_size = (5 * n + 5) * sizeof(Index)  + tempSpace
-                    + (glu.nzlmax + glu.nzumax) * sizeof(Index) + (glu.nzlumax+glu.nzumax) *  sizeof(Scalar) + n; 
+    estimated_size = (5 * n + 5) * sizeof(Index) + tempSpace + (glu.nzlmax + glu.nzumax) * sizeof(Index) +
+                     (glu.nzlumax + glu.nzumax) * sizeof(Scalar) + n;
     return estimated_size;
   }
-  
-  // Setup the required space 
-  
+
+  // Setup the required space
+
   // First allocate Integer pointers for L\U factors
-  glu.xsup.resize(n+1);
-  glu.supno.resize(n+1);
-  glu.xlsub.resize(n+1);
-  glu.xlusup.resize(n+1);
-  glu.xusub.resize(n+1);
+  glu.xsup.resize(n + 1);
+  glu.supno.resize(n + 1);
+  glu.xlsub.resize(n + 1);
+  glu.xlusup.resize(n + 1);
+  glu.xusub.resize(n + 1);
 
   // Reserve memory for L/U factors
-  do 
-  {
-    if(     (expand<ScalarVector>(glu.lusup, glu.nzlumax, 0, 0, num_expansions)<0)
-        ||  (expand<ScalarVector>(glu.ucol,  glu.nzumax,  0, 0, num_expansions)<0)
-        ||  (expand<IndexVector> (glu.lsub,  glu.nzlmax,  0, 0, num_expansions)<0)
-        ||  (expand<IndexVector> (glu.usub,  glu.nzumax,  0, 1, num_expansions)<0) )
-    {
-      //Reduce the estimated size and retry
+  do {
+    if ((expand<ScalarVector>(glu.lusup, glu.nzlumax, 0, 0, num_expansions) < 0) ||
+        (expand<ScalarVector>(glu.ucol, glu.nzumax, 0, 0, num_expansions) < 0) ||
+        (expand<IndexVector>(glu.lsub, glu.nzlmax, 0, 0, num_expansions) < 0) ||
+        (expand<IndexVector>(glu.usub, glu.nzumax, 0, 1, num_expansions) < 0)) {
+      // Reduce the estimated size and retry
       glu.nzlumax /= 2;
       glu.nzumax /= 2;
       glu.nzlmax /= 2;
-      if (glu.nzlumax < annz ) return glu.nzlumax; 
+      if (glu.nzlumax < annz) return glu.nzlumax;
     }
   } while (!glu.lusup.size() || !glu.ucol.size() || !glu.lsub.size() || !glu.usub.size());
-  
+
   ++num_expansions;
   return 0;
-  
-} // end LuMemInit
 
-/** 
- * \brief Expand the existing storage 
- * \param vec vector to expand 
+}  // end LuMemInit
+
+/**
+ * \brief Expand the existing storage
+ * \param vec vector to expand
  * \param[in,out] maxlen On input, previous size of vec (Number of elements to copy ). on output, new size
  * \param nbElts current number of elements in the vector.
  * \param memtype Type of the element to expand
- * \param num_expansions Number of expansions 
+ * \param num_expansions Number of expansions
  * \return 0 on success, > 0 size of the memory allocated so far
  */
-template <typename Scalar, typename Index>
+template <typename Scalar, typename StorageIndex>
 template <typename VectorType>
-Index SparseLUImpl<Scalar,Index>::memXpand(VectorType& vec, Index& maxlen, Index nbElts, MemType memtype, Index& num_expansions)
-{
-  Index failed_size; 
+Index SparseLUImpl<Scalar, StorageIndex>::memXpand(VectorType& vec, Index& maxlen, Index nbElts, MemType memtype,
+                                                   Index& num_expansions) {
+  Index failed_size;
   if (memtype == USUB)
-     failed_size = this->expand<VectorType>(vec, maxlen, nbElts, 1, num_expansions);
+    failed_size = this->expand<VectorType>(vec, maxlen, nbElts, 1, num_expansions);
   else
     failed_size = this->expand<VectorType>(vec, maxlen, nbElts, 0, num_expansions);
 
-  if (failed_size)
-    return failed_size; 
-  
-  return 0 ;  
+  if (failed_size) return failed_size;
+
+  return 0;
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
-#endif // EIGEN_SPARSELU_MEMORY
+}  // end namespace Eigen
+#endif  // EIGEN_SPARSELU_MEMORY
diff --git a/inst/include/Eigen/src/SparseLU/SparseLU_Structs.h b/inst/include/Eigen/src/SparseLU/SparseLU_Structs.h
index 24d6bf17..85ba8841 100644
--- a/inst/include/Eigen/src/SparseLU/SparseLU_Structs.h
+++ b/inst/include/Eigen/src/SparseLU/SparseLU_Structs.h
@@ -7,26 +7,26 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-/* 
+/*
  * NOTE: This file comes from a partly modified version of files slu_[s,d,c,z]defs.h
  * -- SuperLU routine (version 4.1) --
  * Univ. of California Berkeley, Xerox Palo Alto Research Center,
  * and Lawrence Berkeley National Lab.
  * November, 2010
- * 
+ *
  * Global data structures used in LU factorization -
- * 
+ *
  *   nsuper: #supernodes = nsuper + 1, numbered [0, nsuper].
  *   (xsup,supno): supno[i] is the supernode no to which i belongs;
  *  xsup(s) points to the beginning of the s-th supernode.
  *  e.g.   supno 0 1 2 2 3 3 3 4 4 4 4 4   (n=12)
  *          xsup 0 1 2 4 7 12
- *  Note: dfs will be performed on supernode rep. relative to the new 
+ *  Note: dfs will be performed on supernode rep. relative to the new
  *        row pivoting ordering
  *
  *   (xlsub,lsub): lsub[*] contains the compressed subscript of
  *  rectangular supernodes; xlsub[j] points to the starting
- *  location of the j-th column in lsub[*]. Note that xlsub 
+ *  location of the j-th column in lsub[*]. Note that xlsub
  *  is indexed by column.
  *  Storage: original row subscripts
  *
@@ -50,7 +50,7 @@
  *  values.
  *
  *  The last column structures (for pruning) will be removed
- *  after the numercial LU factorization phase.
+ *  after the numerical LU factorization phase.
  *
  *   (xlusup,lusup): lusup[*] contains the numerical values of the
  *  rectangular supernodes; xlusup[j] points to the starting
@@ -68,44 +68,46 @@
 
 #ifndef EIGEN_LU_STRUCTS
 #define EIGEN_LU_STRUCTS
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 namespace internal {
-  
-typedef enum {LUSUP, UCOL, LSUB, USUB, LLVL, ULVL} MemType; 
+
+enum MemType { LUSUP, UCOL, LSUB, USUB, LLVL, ULVL };
 
 template <typename IndexVector, typename ScalarVector>
 struct LU_GlobalLU_t {
-  typedef typename IndexVector::Scalar Index; 
-  IndexVector xsup; //First supernode column ... xsup(s) points to the beginning of the s-th supernode
-  IndexVector supno; // Supernode number corresponding to this column (column to supernode mapping)
-  ScalarVector  lusup; // nonzero values of L ordered by columns 
-  IndexVector lsub; // Compressed row indices of L rectangular supernodes. 
-  IndexVector xlusup; // pointers to the beginning of each column in lusup
-  IndexVector xlsub; // pointers to the beginning of each column in lsub
-  Index   nzlmax; // Current max size of lsub
-  Index   nzlumax; // Current max size of lusup
-  ScalarVector  ucol; // nonzero values of U ordered by columns 
-  IndexVector usub; // row indices of U columns in ucol
-  IndexVector xusub; // Pointers to the beginning of each column of U in ucol 
-  Index   nzumax; // Current max size of ucol
-  Index   n; // Number of columns in the matrix  
-  Index   num_expansions; 
+  typedef typename IndexVector::Scalar StorageIndex;
+  IndexVector xsup;    // First supernode column ... xsup(s) points to the beginning of the s-th supernode
+  IndexVector supno;   // Supernode number corresponding to this column (column to supernode mapping)
+  ScalarVector lusup;  // nonzero values of L ordered by columns
+  IndexVector lsub;    // Compressed row indices of L rectangular supernodes.
+  IndexVector xlusup;  // pointers to the beginning of each column in lusup
+  IndexVector xlsub;   // pointers to the beginning of each column in lsub
+  Index nzlmax;        // Current max size of lsub
+  Index nzlumax;       // Current max size of lusup
+  ScalarVector ucol;   // nonzero values of U ordered by columns
+  IndexVector usub;    // row indices of U columns in ucol
+  IndexVector xusub;   // Pointers to the beginning of each column of U in ucol
+  Index nzumax;        // Current max size of ucol
+  Index n;             // Number of columns in the matrix
+  Index num_expansions;
 };
 
 // Values to set for performance
-template <typename Index>
 struct perfvalues {
-  Index panel_size; // a panel consists of at most <panel_size> consecutive columns
-  Index relax; // To control degree of relaxing supernodes. If the number of nodes (columns) 
-                // in a subtree of the elimination tree is less than relax, this subtree is considered 
-                // as one supernode regardless of the row structures of those columns
-  Index maxsuper; // The maximum size for a supernode in complete LU
-  Index rowblk; // The minimum row dimension for 2-D blocking to be used;
-  Index colblk; // The minimum column dimension for 2-D blocking to be used;
-  Index fillfactor; // The estimated fills factors for L and U, compared with A
-}; 
+  Index panel_size;  // a panel consists of at most <panel_size> consecutive columns
+  Index relax;       // To control degree of relaxing supernodes. If the number of nodes (columns)
+                     // in a subtree of the elimination tree is less than relax, this subtree is considered
+                     // as one supernode regardless of the row structures of those columns
+  Index maxsuper;    // The maximum size for a supernode in complete LU
+  Index rowblk;      // The minimum row dimension for 2-D blocking to be used;
+  Index colblk;      // The minimum column dimension for 2-D blocking to be used;
+  Index fillfactor;  // The estimated fills factors for L and U, compared with A
+};
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
-#endif // EIGEN_LU_STRUCTS
+}  // end namespace Eigen
+#endif  // EIGEN_LU_STRUCTS
diff --git a/inst/include/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h b/inst/include/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
index 54a56940..eb159091 100644
--- a/inst/include/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
+++ b/inst/include/Eigen/src/SparseLU/SparseLU_SupernodalMatrix.h
@@ -11,288 +11,309 @@
 #ifndef EIGEN_SPARSELU_SUPERNODAL_MATRIX_H
 #define EIGEN_SPARSELU_SUPERNODAL_MATRIX_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 namespace internal {
 
 /** \ingroup SparseLU_Module
  * \brief a class to manipulate the L supernodal factor from the SparseLU factorization
- * 
- * This class  contain the data to easily store 
- * and manipulate the supernodes during the factorization and solution phase of Sparse LU. 
+ *
+ * This class  contain the data to easily store
+ * and manipulate the supernodes during the factorization and solution phase of Sparse LU.
  * Only the lower triangular matrix has supernodes.
- * 
+ *
  * NOTE : This class corresponds to the SCformat structure in SuperLU
- * 
+ *
  */
 /* TODO
- * InnerIterator as for sparsematrix 
- * SuperInnerIterator to iterate through all supernodes 
+ * InnerIterator as for sparsematrix
+ * SuperInnerIterator to iterate through all supernodes
  * Function for triangular solve
  */
-template <typename _Scalar, typename _Index>
-class MappedSuperNodalMatrix
-{
-  public:
-    typedef _Scalar Scalar; 
-    typedef _Index Index;
-    typedef Matrix<Index,Dynamic,1> IndexVector; 
-    typedef Matrix<Scalar,Dynamic,1> ScalarVector;
-  public:
-    MappedSuperNodalMatrix()
-    {
-      
-    }
-    MappedSuperNodalMatrix(Index m, Index n,  ScalarVector& nzval, IndexVector& nzval_colptr, IndexVector& rowind, 
-             IndexVector& rowind_colptr, IndexVector& col_to_sup, IndexVector& sup_to_col )
-    {
-      setInfos(m, n, nzval, nzval_colptr, rowind, rowind_colptr, col_to_sup, sup_to_col);
-    }
-    
-    ~MappedSuperNodalMatrix()
-    {
-      
-    }
-    /**
-     * Set appropriate pointers for the lower triangular supernodal matrix
-     * These infos are available at the end of the numerical factorization
-     * FIXME This class will be modified such that it can be use in the course 
-     * of the factorization.
-     */
-    void setInfos(Index m, Index n, ScalarVector& nzval, IndexVector& nzval_colptr, IndexVector& rowind, 
-             IndexVector& rowind_colptr, IndexVector& col_to_sup, IndexVector& sup_to_col )
-    {
-      m_row = m;
-      m_col = n; 
-      m_nzval = nzval.data(); 
-      m_nzval_colptr = nzval_colptr.data(); 
-      m_rowind = rowind.data(); 
-      m_rowind_colptr = rowind_colptr.data(); 
-      m_nsuper = col_to_sup(n); 
-      m_col_to_sup = col_to_sup.data(); 
-      m_sup_to_col = sup_to_col.data(); 
-    }
-    
-    /**
-     * Number of rows
-     */
-    Index rows() { return m_row; }
-    
-    /**
-     * Number of columns
-     */
-    Index cols() { return m_col; }
-    
-    /**
-     * Return the array of nonzero values packed by column
-     * 
-     * The size is nnz
-     */
-    Scalar* valuePtr() {  return m_nzval; }
-    
-    const Scalar* valuePtr() const 
-    {
-      return m_nzval; 
-    }
-    /**
-     * Return the pointers to the beginning of each column in \ref valuePtr()
-     */
-    Index* colIndexPtr()
-    {
-      return m_nzval_colptr; 
-    }
-    
-    const Index* colIndexPtr() const
-    {
-      return m_nzval_colptr; 
-    }
-    
-    /**
-     * Return the array of compressed row indices of all supernodes
-     */
-    Index* rowIndex()  { return m_rowind; }
-    
-    const Index* rowIndex() const
-    {
-      return m_rowind; 
-    }
-    
-    /**
-     * Return the location in \em rowvaluePtr() which starts each column
-     */
-    Index* rowIndexPtr() { return m_rowind_colptr; }
-    
-    const Index* rowIndexPtr() const 
-    {
-      return m_rowind_colptr; 
-    }
-    
-    /** 
-     * Return the array of column-to-supernode mapping 
-     */
-    Index* colToSup()  { return m_col_to_sup; }
-    
-    const Index* colToSup() const
-    {
-      return m_col_to_sup;       
-    }
-    /**
-     * Return the array of supernode-to-column mapping
-     */
-    Index* supToCol() { return m_sup_to_col; }
-    
-    const Index* supToCol() const 
-    {
-      return m_sup_to_col;
-    }
-    
-    /**
-     * Return the number of supernodes
-     */
-    Index nsuper() const 
-    {
-      return m_nsuper; 
-    }
-    
-    class InnerIterator; 
-    template<typename Dest>
-    void solveInPlace( MatrixBase<Dest>&X) const;
-    
-      
-      
-    
-  protected:
-    Index m_row; // Number of rows
-    Index m_col; // Number of columns 
-    Index m_nsuper; // Number of supernodes 
-    Scalar* m_nzval; //array of nonzero values packed by column
-    Index* m_nzval_colptr; //nzval_colptr[j] Stores the location in nzval[] which starts column j 
-    Index* m_rowind; // Array of compressed row indices of rectangular supernodes
-    Index* m_rowind_colptr; //rowind_colptr[j] stores the location in rowind[] which starts column j
-    Index* m_col_to_sup; // col_to_sup[j] is the supernode number to which column j belongs
-    Index* m_sup_to_col; //sup_to_col[s] points to the starting column of the s-th supernode
-    
-  private :
+template <typename Scalar_, typename StorageIndex_>
+class MappedSuperNodalMatrix {
+ public:
+  typedef Scalar_ Scalar;
+  typedef StorageIndex_ StorageIndex;
+  typedef Matrix<StorageIndex, Dynamic, 1> IndexVector;
+  typedef Matrix<Scalar, Dynamic, 1> ScalarVector;
+
+ public:
+  MappedSuperNodalMatrix() {}
+  MappedSuperNodalMatrix(Index m, Index n, ScalarVector& nzval, IndexVector& nzval_colptr, IndexVector& rowind,
+                         IndexVector& rowind_colptr, IndexVector& col_to_sup, IndexVector& sup_to_col) {
+    setInfos(m, n, nzval, nzval_colptr, rowind, rowind_colptr, col_to_sup, sup_to_col);
+  }
+
+  ~MappedSuperNodalMatrix() {}
+  /**
+   * Set appropriate pointers for the lower triangular supernodal matrix
+   * These infos are available at the end of the numerical factorization
+   * FIXME This class will be modified such that it can be use in the course
+   * of the factorization.
+   */
+  void setInfos(Index m, Index n, ScalarVector& nzval, IndexVector& nzval_colptr, IndexVector& rowind,
+                IndexVector& rowind_colptr, IndexVector& col_to_sup, IndexVector& sup_to_col) {
+    m_row = m;
+    m_col = n;
+    m_nzval = nzval.data();
+    m_nzval_colptr = nzval_colptr.data();
+    m_rowind = rowind.data();
+    m_rowind_colptr = rowind_colptr.data();
+    m_nsuper = col_to_sup(n);
+    m_col_to_sup = col_to_sup.data();
+    m_sup_to_col = sup_to_col.data();
+  }
+
+  /**
+   * Number of rows
+   */
+  Index rows() const { return m_row; }
+
+  /**
+   * Number of columns
+   */
+  Index cols() const { return m_col; }
+
+  /**
+   * Return the array of nonzero values packed by column
+   *
+   * The size is nnz
+   */
+  Scalar* valuePtr() { return m_nzval; }
+
+  const Scalar* valuePtr() const { return m_nzval; }
+  /**
+   * Return the pointers to the beginning of each column in \ref valuePtr()
+   */
+  StorageIndex* colIndexPtr() { return m_nzval_colptr; }
+
+  const StorageIndex* colIndexPtr() const { return m_nzval_colptr; }
+
+  /**
+   * Return the array of compressed row indices of all supernodes
+   */
+  StorageIndex* rowIndex() { return m_rowind; }
+
+  const StorageIndex* rowIndex() const { return m_rowind; }
+
+  /**
+   * Return the location in \em rowvaluePtr() which starts each column
+   */
+  StorageIndex* rowIndexPtr() { return m_rowind_colptr; }
+
+  const StorageIndex* rowIndexPtr() const { return m_rowind_colptr; }
+
+  /**
+   * Return the array of column-to-supernode mapping
+   */
+  StorageIndex* colToSup() { return m_col_to_sup; }
+
+  const StorageIndex* colToSup() const { return m_col_to_sup; }
+  /**
+   * Return the array of supernode-to-column mapping
+   */
+  StorageIndex* supToCol() { return m_sup_to_col; }
+
+  const StorageIndex* supToCol() const { return m_sup_to_col; }
+
+  /**
+   * Return the number of supernodes
+   */
+  Index nsuper() const { return m_nsuper; }
+
+  class InnerIterator;
+  template <typename Dest>
+  void solveInPlace(MatrixBase<Dest>& X) const;
+  template <bool Conjugate, typename Dest>
+  void solveTransposedInPlace(MatrixBase<Dest>& X) const;
+
+ protected:
+  Index m_row;                    // Number of rows
+  Index m_col;                    // Number of columns
+  Index m_nsuper;                 // Number of supernodes
+  Scalar* m_nzval;                // array of nonzero values packed by column
+  StorageIndex* m_nzval_colptr;   // nzval_colptr[j] Stores the location in nzval[] which starts column j
+  StorageIndex* m_rowind;         // Array of compressed row indices of rectangular supernodes
+  StorageIndex* m_rowind_colptr;  // rowind_colptr[j] stores the location in rowind[] which starts column j
+  StorageIndex* m_col_to_sup;     // col_to_sup[j] is the supernode number to which column j belongs
+  StorageIndex* m_sup_to_col;     // sup_to_col[s] points to the starting column of the s-th supernode
+
+ private:
 };
 
 /**
-  * \brief InnerIterator class to iterate over nonzero values of the current column in the supernodal matrix L
-  * 
-  */
-template<typename Scalar, typename Index>
-class MappedSuperNodalMatrix<Scalar,Index>::InnerIterator
-{
-  public:
-     InnerIterator(const MappedSuperNodalMatrix& mat, Index outer)
+ * \brief InnerIterator class to iterate over nonzero values of the current column in the supernodal matrix L
+ *
+ */
+template <typename Scalar, typename StorageIndex>
+class MappedSuperNodalMatrix<Scalar, StorageIndex>::InnerIterator {
+ public:
+  InnerIterator(const MappedSuperNodalMatrix& mat, Index outer)
       : m_matrix(mat),
-        m_outer(outer), 
+        m_outer(outer),
         m_supno(mat.colToSup()[outer]),
         m_idval(mat.colIndexPtr()[outer]),
         m_startidval(m_idval),
-        m_endidval(mat.colIndexPtr()[outer+1]),
+        m_endidval(mat.colIndexPtr()[outer + 1]),
         m_idrow(mat.rowIndexPtr()[mat.supToCol()[mat.colToSup()[outer]]]),
-        m_endidrow(mat.rowIndexPtr()[mat.supToCol()[mat.colToSup()[outer]]+1])
-    {}
-    inline InnerIterator& operator++()
-    { 
-      m_idval++; 
-      m_idrow++;
-      return *this;
-    }
-    inline Scalar value() const { return m_matrix.valuePtr()[m_idval]; }
-    
-    inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_idval]); }
-    
-    inline Index index() const { return m_matrix.rowIndex()[m_idrow]; }
-    inline Index row() const { return index(); }
-    inline Index col() const { return m_outer; }
-    
-    inline Index supIndex() const { return m_supno; }
-    
-    inline operator bool() const 
-    { 
-      return ( (m_idval < m_endidval) && (m_idval >= m_startidval)
-                && (m_idrow < m_endidrow) );
-    }
-    
-  protected:
-    const MappedSuperNodalMatrix& m_matrix; // Supernodal lower triangular matrix 
-    const Index m_outer;                    // Current column 
-    const Index m_supno;                    // Current SuperNode number
-    Index m_idval;                          // Index to browse the values in the current column
-    const Index m_startidval;               // Start of the column value
-    const Index m_endidval;                 // End of the column value
-    Index m_idrow;                          // Index to browse the row indices 
-    Index m_endidrow;                       // End index of row indices of the current column
+        m_endidrow(mat.rowIndexPtr()[mat.supToCol()[mat.colToSup()[outer]] + 1]) {}
+  inline InnerIterator& operator++() {
+    m_idval++;
+    m_idrow++;
+    return *this;
+  }
+  inline Scalar value() const { return m_matrix.valuePtr()[m_idval]; }
+
+  inline Scalar& valueRef() { return const_cast<Scalar&>(m_matrix.valuePtr()[m_idval]); }
+
+  inline Index index() const { return m_matrix.rowIndex()[m_idrow]; }
+  inline Index row() const { return index(); }
+  inline Index col() const { return m_outer; }
+
+  inline Index supIndex() const { return m_supno; }
+
+  inline operator bool() const {
+    return ((m_idval < m_endidval) && (m_idval >= m_startidval) && (m_idrow < m_endidrow));
+  }
+
+ protected:
+  const MappedSuperNodalMatrix& m_matrix;  // Supernodal lower triangular matrix
+  const Index m_outer;                     // Current column
+  const Index m_supno;                     // Current SuperNode number
+  Index m_idval;                           // Index to browse the values in the current column
+  const Index m_startidval;                // Start of the column value
+  const Index m_endidval;                  // End of the column value
+  Index m_idrow;                           // Index to browse the row indices
+  Index m_endidrow;                        // End index of row indices of the current column
 };
 
 /**
  * \brief Solve with the supernode triangular matrix
- * 
+ *
  */
-template<typename Scalar, typename Index>
-template<typename Dest>
-void MappedSuperNodalMatrix<Scalar,Index>::solveInPlace( MatrixBase<Dest>&X) const
-{
-    Index n = X.rows(); 
-    Index nrhs = X.cols(); 
-    const Scalar * Lval = valuePtr();                 // Nonzero values 
-    Matrix<Scalar,Dynamic,Dynamic, ColMajor> work(n, nrhs);     // working vector
-    work.setZero();
-    for (Index k = 0; k <= nsuper(); k ++)
-    {
-      Index fsupc = supToCol()[k];                    // First column of the current supernode 
-      Index istart = rowIndexPtr()[fsupc];            // Pointer index to the subscript of the current column
-      Index nsupr = rowIndexPtr()[fsupc+1] - istart;  // Number of rows in the current supernode
-      Index nsupc = supToCol()[k+1] - fsupc;          // Number of columns in the current supernode
-      Index nrow = nsupr - nsupc;                     // Number of rows in the non-diagonal part of the supernode
-      Index irow;                                     //Current index row
-      
-      if (nsupc == 1 )
-      {
-        for (Index j = 0; j < nrhs; j++)
-        {
-          InnerIterator it(*this, fsupc);
-          ++it; // Skip the diagonal element
-          for (; it; ++it)
-          {
-            irow = it.row();
-            X(irow, j) -= X(fsupc, j) * it.value();
-          }
+template <typename Scalar, typename Index_>
+template <typename Dest>
+void MappedSuperNodalMatrix<Scalar, Index_>::solveInPlace(MatrixBase<Dest>& X) const {
+  /* Explicit type conversion as the Index type of MatrixBase<Dest> may be wider than Index */
+  //    eigen_assert(X.rows() <= NumTraits<Index>::highest());
+  //    eigen_assert(X.cols() <= NumTraits<Index>::highest());
+  Index n = int(X.rows());
+  Index nrhs = Index(X.cols());
+  const Scalar* Lval = valuePtr();                                           // Nonzero values
+  Matrix<Scalar, Dynamic, Dest::ColsAtCompileTime, ColMajor> work(n, nrhs);  // working vector
+  work.setZero();
+  for (Index k = 0; k <= nsuper(); k++) {
+    Index fsupc = supToCol()[k];                      // First column of the current supernode
+    Index istart = rowIndexPtr()[fsupc];              // Pointer index to the subscript of the current column
+    Index nsupr = rowIndexPtr()[fsupc + 1] - istart;  // Number of rows in the current supernode
+    Index nsupc = supToCol()[k + 1] - fsupc;          // Number of columns in the current supernode
+    Index nrow = nsupr - nsupc;                       // Number of rows in the non-diagonal part of the supernode
+    Index irow;                                       // Current index row
+
+    if (nsupc == 1) {
+      for (Index j = 0; j < nrhs; j++) {
+        InnerIterator it(*this, fsupc);
+        ++it;  // Skip the diagonal element
+        for (; it; ++it) {
+          irow = it.row();
+          X(irow, j) -= X(fsupc, j) * it.value();
         }
       }
-      else
-      {
-        // The supernode has more than one column 
-        Index luptr = colIndexPtr()[fsupc]; 
-        Index lda = colIndexPtr()[fsupc+1] - luptr;
-        
-        // Triangular solve 
-        Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > A( &(Lval[luptr]), nsupc, nsupc, OuterStride<>(lda) );
-        Map< Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > U (&(X(fsupc,0)), nsupc, nrhs, OuterStride<>(n) ); 
-        U = A.template triangularView<UnitLower>().solve(U); 
-        
-        // Matrix-vector product 
-        new (&A) Map<const Matrix<Scalar,Dynamic,Dynamic, ColMajor>, 0, OuterStride<> > ( &(Lval[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) );
-        work.block(0, 0, nrow, nrhs) = A * U; 
-        
-        //Begin Scatter 
-        for (Index j = 0; j < nrhs; j++)
-        {
-          Index iptr = istart + nsupc; 
-          for (Index i = 0; i < nrow; i++)
-          {
-            irow = rowIndex()[iptr]; 
-            X(irow, j) -= work(i, j); // Scatter operation
-            work(i, j) = Scalar(0); 
-            iptr++;
-          }
+    } else {
+      // The supernode has more than one column
+      Index luptr = colIndexPtr()[fsupc];
+      Index lda = colIndexPtr()[fsupc + 1] - luptr;
+
+      // Triangular solve
+      Map<const Matrix<Scalar, Dynamic, Dynamic, ColMajor>, 0, OuterStride<> > A(&(Lval[luptr]), nsupc, nsupc,
+                                                                                 OuterStride<>(lda));
+      typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc);
+      U = A.template triangularView<UnitLower>().solve(U);
+      // Matrix-vector product
+      new (&A) Map<const Matrix<Scalar, Dynamic, Dynamic, ColMajor>, 0, OuterStride<> >(&(Lval[luptr + nsupc]), nrow,
+                                                                                        nsupc, OuterStride<>(lda));
+      work.topRows(nrow).noalias() = A * U;
+
+      // Begin Scatter
+      for (Index j = 0; j < nrhs; j++) {
+        Index iptr = istart + nsupc;
+        for (Index i = 0; i < nrow; i++) {
+          irow = rowIndex()[iptr];
+          X(irow, j) -= work(i, j);  // Scatter operation
+          work(i, j) = Scalar(0);
+          iptr++;
         }
       }
-    } 
+    }
+  }
+}
+
+template <typename Scalar, typename Index_>
+template <bool Conjugate, typename Dest>
+void MappedSuperNodalMatrix<Scalar, Index_>::solveTransposedInPlace(MatrixBase<Dest>& X) const {
+  using numext::conj;
+  Index n = int(X.rows());
+  Index nrhs = Index(X.cols());
+  const Scalar* Lval = valuePtr();                                           // Nonzero values
+  Matrix<Scalar, Dynamic, Dest::ColsAtCompileTime, ColMajor> work(n, nrhs);  // working vector
+  work.setZero();
+  for (Index k = nsuper(); k >= 0; k--) {
+    Index fsupc = supToCol()[k];                      // First column of the current supernode
+    Index istart = rowIndexPtr()[fsupc];              // Pointer index to the subscript of the current column
+    Index nsupr = rowIndexPtr()[fsupc + 1] - istart;  // Number of rows in the current supernode
+    Index nsupc = supToCol()[k + 1] - fsupc;          // Number of columns in the current supernode
+    Index nrow = nsupr - nsupc;                       // Number of rows in the non-diagonal part of the supernode
+    Index irow;                                       // Current index row
+
+    if (nsupc == 1) {
+      for (Index j = 0; j < nrhs; j++) {
+        InnerIterator it(*this, fsupc);
+        ++it;  // Skip the diagonal element
+        for (; it; ++it) {
+          irow = it.row();
+          X(fsupc, j) -= X(irow, j) * (Conjugate ? conj(it.value()) : it.value());
+        }
+      }
+    } else {
+      // The supernode has more than one column
+      Index luptr = colIndexPtr()[fsupc];
+      Index lda = colIndexPtr()[fsupc + 1] - luptr;
+
+      // Begin Gather
+      for (Index j = 0; j < nrhs; j++) {
+        Index iptr = istart + nsupc;
+        for (Index i = 0; i < nrow; i++) {
+          irow = rowIndex()[iptr];
+          work.topRows(nrow)(i, j) = X(irow, j);  // Gather operation
+          iptr++;
+        }
+      }
+
+      // Matrix-vector product with transposed submatrix
+      Map<const Matrix<Scalar, Dynamic, Dynamic, ColMajor>, 0, OuterStride<> > A(&(Lval[luptr + nsupc]), nrow, nsupc,
+                                                                                 OuterStride<>(lda));
+      typename Dest::RowsBlockXpr U = X.derived().middleRows(fsupc, nsupc);
+      if (Conjugate)
+        U = U - A.adjoint() * work.topRows(nrow);
+      else
+        U = U - A.transpose() * work.topRows(nrow);
+
+      // Triangular solve (of transposed diagonal block)
+      new (&A) Map<const Matrix<Scalar, Dynamic, Dynamic, ColMajor>, 0, OuterStride<> >(&(Lval[luptr]), nsupc, nsupc,
+                                                                                        OuterStride<>(lda));
+      if (Conjugate)
+        U = A.adjoint().template triangularView<UnitUpper>().solve(U);
+      else
+        U = A.transpose().template triangularView<UnitUpper>().solve(U);
+    }
+  }
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSELU_MATRIX_H
+#endif  // EIGEN_SPARSELU_MATRIX_H
diff --git a/inst/include/Eigen/src/SparseLU/SparseLU_Utils.h b/inst/include/Eigen/src/SparseLU/SparseLU_Utils.h
index 15352ac3..ef087cde 100644
--- a/inst/include/Eigen/src/SparseLU/SparseLU_Utils.h
+++ b/inst/include/Eigen/src/SparseLU/SparseLU_Utils.h
@@ -7,74 +7,69 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-
 #ifndef EIGEN_SPARSELU_UTILS_H
 #define EIGEN_SPARSELU_UTILS_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 namespace internal {
 
 /**
  * \brief Count Nonzero elements in the factors
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::countnz(const Index n, Index& nnzL, Index& nnzU, GlobalLU_t& glu)
-{
- nnzL = 0; 
- nnzU = (glu.xusub)(n); 
- Index nsuper = (glu.supno)(n); 
- Index jlen; 
- Index i, j, fsupc;
- if (n <= 0 ) return; 
- // For each supernode
- for (i = 0; i <= nsuper; i++)
- {
-   fsupc = glu.xsup(i); 
-   jlen = glu.xlsub(fsupc+1) - glu.xlsub(fsupc); 
-   
-   for (j = fsupc; j < glu.xsup(i+1); j++)
-   {
-     nnzL += jlen; 
-     nnzU += j - fsupc + 1; 
-     jlen--; 
-   }
- }
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar, StorageIndex>::countnz(const Index n, Index& nnzL, Index& nnzU, GlobalLU_t& glu) {
+  nnzL = 0;
+  nnzU = (glu.xusub)(n);
+  Index nsuper = (glu.supno)(n);
+  Index jlen;
+  Index i, j, fsupc;
+  if (n <= 0) return;
+  // For each supernode
+  for (i = 0; i <= nsuper; i++) {
+    fsupc = glu.xsup(i);
+    jlen = glu.xlsub(fsupc + 1) - glu.xlsub(fsupc);
+
+    for (j = fsupc; j < glu.xsup(i + 1); j++) {
+      nnzL += jlen;
+      nnzU += j - fsupc + 1;
+      jlen--;
+    }
+  }
 }
 
 /**
- * \brief Fix up the data storage lsub for L-subscripts. 
- * 
- * It removes the subscripts sets for structural pruning, 
+ * \brief Fix up the data storage lsub for L-subscripts.
+ *
+ * It removes the subscripts sets for structural pruning,
  * and applies permutation to the remaining subscripts
- * 
+ *
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::fixupL(const Index n, const IndexVector& perm_r, GlobalLU_t& glu)
-{
-  Index fsupc, i, j, k, jstart; 
-  
-  Index nextl = 0; 
-  Index nsuper = (glu.supno)(n); 
-  
-  // For each supernode 
-  for (i = 0; i <= nsuper; i++)
-  {
-    fsupc = glu.xsup(i); 
-    jstart = glu.xlsub(fsupc); 
-    glu.xlsub(fsupc) = nextl; 
-    for (j = jstart; j < glu.xlsub(fsupc + 1); j++)
-    {
-      glu.lsub(nextl) = perm_r(glu.lsub(j)); // Now indexed into P*A
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar, StorageIndex>::fixupL(const Index n, const IndexVector& perm_r, GlobalLU_t& glu) {
+  Index fsupc, i, j, k, jstart;
+
+  StorageIndex nextl = 0;
+  Index nsuper = (glu.supno)(n);
+
+  // For each supernode
+  for (i = 0; i <= nsuper; i++) {
+    fsupc = glu.xsup(i);
+    jstart = glu.xlsub(fsupc);
+    glu.xlsub(fsupc) = nextl;
+    for (j = jstart; j < glu.xlsub(fsupc + 1); j++) {
+      glu.lsub(nextl) = perm_r(glu.lsub(j));  // Now indexed into P*A
       nextl++;
     }
-    for (k = fsupc+1; k < glu.xsup(i+1); k++)
-      glu.xlsub(k) = nextl; // other columns in supernode i
+    for (k = fsupc + 1; k < glu.xsup(i + 1); k++) glu.xlsub(k) = nextl;  // other columns in supernode i
   }
-  
-  glu.xlsub(n) = nextl; 
+
+  glu.xlsub(n) = nextl;
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
-#endif // EIGEN_SPARSELU_UTILS_H
+}  // end namespace Eigen
+#endif  // EIGEN_SPARSELU_UTILS_H
diff --git a/inst/include/Eigen/src/SparseLU/SparseLU_column_bmod.h b/inst/include/Eigen/src/SparseLU/SparseLU_column_bmod.h
index cacc7e98..8435b562 100644
--- a/inst/include/Eigen/src/SparseLU/SparseLU_column_bmod.h
+++ b/inst/include/Eigen/src/SparseLU/SparseLU_column_bmod.h
@@ -8,10 +8,10 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-/* 
- 
- * NOTE: This file is the modified version of xcolumn_bmod.c file in SuperLU 
- 
+/*
+
+ * NOTE: This file is the modified version of xcolumn_bmod.c file in SuperLU
+
  * -- SuperLU routine (version 3.0) --
  * Univ. of California Berkeley, Xerox Palo Alto Research Center,
  * and Lawrence Berkeley National Lab.
@@ -31,150 +31,147 @@
 #ifndef SPARSELU_COLUMN_BMOD_H
 #define SPARSELU_COLUMN_BMOD_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
 /**
  * \brief Performs numeric block updates (sup-col) in topological order
- * 
+ *
  * \param jcol current column to update
  * \param nseg Number of segments in the U part
  * \param dense Store the full representation of the column
- * \param tempv working array 
+ * \param tempv working array
  * \param segrep segment representative ...
  * \param repfnz ??? First nonzero column in each row ???  ...
  * \param fpanelc First column in the current panel
- * \param glu Global LU data. 
- * \return 0 - successful return 
+ * \param glu Global LU data.
+ * \return 0 - successful return
  *         > 0 - number of bytes allocated when run out of space
- * 
+ *
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::column_bmod(const Index jcol, const Index nseg, BlockScalarVector dense, ScalarVector& tempv, BlockIndexVector segrep, BlockIndexVector repfnz, Index fpanelc, GlobalLU_t& glu)
-{
-  Index  jsupno, k, ksub, krep, ksupno; 
-  Index lptr, nrow, isub, irow, nextlu, new_next, ufirst; 
-  Index fsupc, nsupc, nsupr, luptr, kfnz, no_zeros; 
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar, StorageIndex>::column_bmod(const Index jcol, const Index nseg, BlockScalarVector dense,
+                                                      ScalarVector& tempv, BlockIndexVector segrep,
+                                                      BlockIndexVector repfnz, Index fpanelc, GlobalLU_t& glu) {
+  Index jsupno, k, ksub, krep, ksupno;
+  Index lptr, nrow, isub, irow, nextlu, new_next, ufirst;
+  Index fsupc, nsupc, nsupr, luptr, kfnz, no_zeros;
   /* krep = representative of current k-th supernode
-    * fsupc =  first supernodal column
-    * nsupc = number of columns in a supernode
-    * nsupr = number of rows in a supernode
-    * luptr = location of supernodal LU-block in storage
-    * kfnz = first nonz in the k-th supernodal segment
-    * no_zeros = no lf leading zeros in a supernodal U-segment
-    */
-  
+   * fsupc =  first supernodal column
+   * nsupc = number of columns in a supernode
+   * nsupr = number of rows in a supernode
+   * luptr = location of supernodal LU-block in storage
+   * kfnz = first nonz in the k-th supernodal segment
+   * no_zeros = no lf leading zeros in a supernodal U-segment
+   */
+
   jsupno = glu.supno(jcol);
-  // For each nonzero supernode segment of U[*,j] in topological order 
-  k = nseg - 1; 
-  Index d_fsupc; // distance between the first column of the current panel and the 
-               // first column of the current snode
-  Index fst_col; // First column within small LU update
-  Index segsize; 
-  for (ksub = 0; ksub < nseg; ksub++)
-  {
-    krep = segrep(k); k--; 
-    ksupno = glu.supno(krep); 
-    if (jsupno != ksupno )
-    {
-      // outside the rectangular supernode 
-      fsupc = glu.xsup(ksupno); 
-      fst_col = (std::max)(fsupc, fpanelc); 
-      
-      // Distance from the current supernode to the current panel; 
+  // For each nonzero supernode segment of U[*,j] in topological order
+  k = nseg - 1;
+  Index d_fsupc;  // distance between the first column of the current panel and the
+                  // first column of the current snode
+  Index fst_col;  // First column within small LU update
+  Index segsize;
+  for (ksub = 0; ksub < nseg; ksub++) {
+    krep = segrep(k);
+    k--;
+    ksupno = glu.supno(krep);
+    if (jsupno != ksupno) {
+      // outside the rectangular supernode
+      fsupc = glu.xsup(ksupno);
+      fst_col = (std::max)(fsupc, fpanelc);
+
+      // Distance from the current supernode to the current panel;
       // d_fsupc = 0 if fsupc > fpanelc
-      d_fsupc = fst_col - fsupc; 
-      
-      luptr = glu.xlusup(fst_col) + d_fsupc; 
-      lptr = glu.xlsub(fsupc) + d_fsupc; 
-      
-      kfnz = repfnz(krep); 
-      kfnz = (std::max)(kfnz, fpanelc); 
-      
-      segsize = krep - kfnz + 1; 
-      nsupc = krep - fst_col + 1; 
-      nsupr = glu.xlsub(fsupc+1) - glu.xlsub(fsupc); 
+      d_fsupc = fst_col - fsupc;
+
+      luptr = glu.xlusup(fst_col) + d_fsupc;
+      lptr = glu.xlsub(fsupc) + d_fsupc;
+
+      kfnz = repfnz(krep);
+      kfnz = (std::max)(kfnz, fpanelc);
+
+      segsize = krep - kfnz + 1;
+      nsupc = krep - fst_col + 1;
+      nsupr = glu.xlsub(fsupc + 1) - glu.xlsub(fsupc);
       nrow = nsupr - d_fsupc - nsupc;
-      Index lda = glu.xlusup(fst_col+1) - glu.xlusup(fst_col);
-      
-      
-      // Perform a triangular solver and block update, 
+      Index lda = glu.xlusup(fst_col + 1) - glu.xlusup(fst_col);
+
+      // Perform a triangular solver and block update,
       // then scatter the result of sup-col update to dense
-      no_zeros = kfnz - fst_col; 
-      if(segsize==1)
+      no_zeros = kfnz - fst_col;
+      if (segsize == 1)
         LU_kernel_bmod<1>::run(segsize, dense, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros);
       else
         LU_kernel_bmod<Dynamic>::run(segsize, dense, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros);
-    } // end if jsupno 
-  } // end for each segment
-  
+    }  // end if jsupno
+  }    // end for each segment
+
   // Process the supernodal portion of  L\U[*,j]
-  nextlu = glu.xlusup(jcol); 
+  nextlu = glu.xlusup(jcol);
   fsupc = glu.xsup(jsupno);
-  
+
   // copy the SPA dense into L\U[*,j]
-  Index mem; 
-  new_next = nextlu + glu.xlsub(fsupc + 1) - glu.xlsub(fsupc); 
+  Index mem;
+  new_next = nextlu + glu.xlsub(fsupc + 1) - glu.xlsub(fsupc);
   Index offset = internal::first_multiple<Index>(new_next, internal::packet_traits<Scalar>::size) - new_next;
-  if(offset)
-    new_next += offset;
-  while (new_next > glu.nzlumax )
-  {
-    mem = memXpand<ScalarVector>(glu.lusup, glu.nzlumax, nextlu, LUSUP, glu.num_expansions);  
-    if (mem) return mem; 
+  if (offset) new_next += offset;
+  while (new_next > glu.nzlumax) {
+    mem = memXpand<ScalarVector>(glu.lusup, glu.nzlumax, nextlu, LUSUP, glu.num_expansions);
+    if (mem) return mem;
   }
-  
-  for (isub = glu.xlsub(fsupc); isub < glu.xlsub(fsupc+1); isub++)
-  {
+
+  for (isub = glu.xlsub(fsupc); isub < glu.xlsub(fsupc + 1); isub++) {
     irow = glu.lsub(isub);
     glu.lusup(nextlu) = dense(irow);
-    dense(irow) = Scalar(0.0); 
-    ++nextlu; 
+    dense(irow) = Scalar(0.0);
+    ++nextlu;
   }
-  
-  if(offset)
-  {
-    glu.lusup.segment(nextlu,offset).setZero();
+
+  if (offset) {
+    glu.lusup.segment(nextlu, offset).setZero();
     nextlu += offset;
   }
-  glu.xlusup(jcol + 1) = nextlu;  // close L\U(*,jcol); 
-  
+  glu.xlusup(jcol + 1) = StorageIndex(nextlu);  // close L\U(*,jcol);
+
   /* For more updates within the panel (also within the current supernode),
    * should start from the first column of the panel, or the first column
    * of the supernode, whichever is bigger. There are two cases:
    *  1) fsupc < fpanelc, then fst_col <-- fpanelc
    *  2) fsupc >= fpanelc, then fst_col <-- fsupc
    */
-  fst_col = (std::max)(fsupc, fpanelc); 
-  
-  if (fst_col  < jcol)
-  {
+  fst_col = (std::max)(fsupc, fpanelc);
+
+  if (fst_col < jcol) {
     // Distance between the current supernode and the current panel
     // d_fsupc = 0 if fsupc >= fpanelc
-    d_fsupc = fst_col - fsupc; 
-    
-    lptr = glu.xlsub(fsupc) + d_fsupc; 
-    luptr = glu.xlusup(fst_col) + d_fsupc; 
-    nsupr = glu.xlsub(fsupc+1) - glu.xlsub(fsupc); // leading dimension
-    nsupc = jcol - fst_col; // excluding jcol 
-    nrow = nsupr - d_fsupc - nsupc; 
-    
-    // points to the beginning of jcol in snode L\U(jsupno) 
-    ufirst = glu.xlusup(jcol) + d_fsupc; 
-    Index lda = glu.xlusup(jcol+1) - glu.xlusup(jcol);
-    MappedMatrixBlock A( &(glu.lusup.data()[luptr]), nsupc, nsupc, OuterStride<>(lda) );
-    VectorBlock<ScalarVector> u(glu.lusup, ufirst, nsupc); 
-    u = A.template triangularView<UnitLower>().solve(u); 
-    
-    new (&A) MappedMatrixBlock ( &(glu.lusup.data()[luptr+nsupc]), nrow, nsupc, OuterStride<>(lda) );
-    VectorBlock<ScalarVector> l(glu.lusup, ufirst+nsupc, nrow); 
+    d_fsupc = fst_col - fsupc;
+
+    lptr = glu.xlsub(fsupc) + d_fsupc;
+    luptr = glu.xlusup(fst_col) + d_fsupc;
+    nsupr = glu.xlsub(fsupc + 1) - glu.xlsub(fsupc);  // leading dimension
+    nsupc = jcol - fst_col;                           // excluding jcol
+    nrow = nsupr - d_fsupc - nsupc;
+
+    // points to the beginning of jcol in snode L\U(jsupno)
+    ufirst = glu.xlusup(jcol) + d_fsupc;
+    Index lda = glu.xlusup(jcol + 1) - glu.xlusup(jcol);
+    MappedMatrixBlock A(&(glu.lusup.data()[luptr]), nsupc, nsupc, OuterStride<>(lda));
+    VectorBlock<ScalarVector> u(glu.lusup, ufirst, nsupc);
+    u = A.template triangularView<UnitLower>().solve(u);
+
+    new (&A) MappedMatrixBlock(&(glu.lusup.data()[luptr + nsupc]), nrow, nsupc, OuterStride<>(lda));
+    VectorBlock<ScalarVector> l(glu.lusup, ufirst + nsupc, nrow);
     l.noalias() -= A * u;
-    
-  } // End if fst_col
-  return 0; 
+
+  }  // End if fst_col
+  return 0;
 }
 
-} // end namespace internal
-} // end namespace Eigen
+}  // end namespace internal
+}  // end namespace Eigen
 
-#endif // SPARSELU_COLUMN_BMOD_H
+#endif  // SPARSELU_COLUMN_BMOD_H
diff --git a/inst/include/Eigen/src/SparseLU/SparseLU_column_dfs.h b/inst/include/Eigen/src/SparseLU/SparseLU_column_dfs.h
index 4c04b0e4..71a9ff48 100644
--- a/inst/include/Eigen/src/SparseLU/SparseLU_column_dfs.h
+++ b/inst/include/Eigen/src/SparseLU/SparseLU_column_dfs.h
@@ -7,10 +7,10 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-/* 
- 
- * NOTE: This file is the modified version of [s,d,c,z]column_dfs.c file in SuperLU 
- 
+/*
+
+ * NOTE: This file is the modified version of [s,d,c,z]column_dfs.c file in SuperLU
+
  * -- SuperLU routine (version 2.0) --
  * Univ. of California Berkeley, Xerox Palo Alto Research Center,
  * and Lawrence Berkeley National Lab.
@@ -30,148 +30,139 @@
 #ifndef SPARSELU_COLUMN_DFS_H
 #define SPARSELU_COLUMN_DFS_H
 
-template <typename Scalar, typename Index> class SparseLUImpl;
-namespace Eigen {
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
+namespace Eigen {
 namespace internal {
 
-template<typename IndexVector, typename ScalarVector>
-struct column_dfs_traits : no_assignment_operator
-{
+template <typename Scalar, typename StorageIndex>
+class SparseLUImpl;
+
+template <typename IndexVector, typename ScalarVector>
+struct column_dfs_traits : no_assignment_operator {
   typedef typename ScalarVector::Scalar Scalar;
-  typedef typename IndexVector::Scalar Index;
-  column_dfs_traits(Index jcol, Index& jsuper, typename SparseLUImpl<Scalar, Index>::GlobalLU_t& glu, SparseLUImpl<Scalar, Index>& luImpl)
-   : m_jcol(jcol), m_jsuper_ref(jsuper), m_glu(glu), m_luImpl(luImpl)
- {}
-  bool update_segrep(Index /*krep*/, Index /*jj*/)
-  {
-    return true;
-  }
-  void mem_expand(IndexVector& lsub, Index& nextl, Index chmark)
-  {
-    if (nextl >= m_glu.nzlmax)
-      m_luImpl.memXpand(lsub, m_glu.nzlmax, nextl, LSUB, m_glu.num_expansions); 
-    if (chmark != (m_jcol-1)) m_jsuper_ref = emptyIdxLU;
+  typedef typename IndexVector::Scalar StorageIndex;
+  column_dfs_traits(Index jcol, Index& jsuper, typename SparseLUImpl<Scalar, StorageIndex>::GlobalLU_t& glu,
+                    SparseLUImpl<Scalar, StorageIndex>& luImpl)
+      : m_jcol(jcol), m_jsuper_ref(jsuper), m_glu(glu), m_luImpl(luImpl) {}
+  bool update_segrep(Index /*krep*/, Index /*jj*/) { return true; }
+  void mem_expand(IndexVector& lsub, Index& nextl, Index chmark) {
+    if (nextl >= m_glu.nzlmax) m_luImpl.memXpand(lsub, m_glu.nzlmax, nextl, LSUB, m_glu.num_expansions);
+    if (chmark != (m_jcol - 1)) m_jsuper_ref = emptyIdxLU;
   }
   enum { ExpandMem = true };
-  
+
   Index m_jcol;
   Index& m_jsuper_ref;
-  typename SparseLUImpl<Scalar, Index>::GlobalLU_t& m_glu;
-  SparseLUImpl<Scalar, Index>& m_luImpl;
+  typename SparseLUImpl<Scalar, StorageIndex>::GlobalLU_t& m_glu;
+  SparseLUImpl<Scalar, StorageIndex>& m_luImpl;
 };
 
-
 /**
  * \brief Performs a symbolic factorization on column jcol and decide the supernode boundary
- * 
+ *
  * A supernode representative is the last column of a supernode.
- * The nonzeros in U[*,j] are segments that end at supernodes representatives. 
- * The routine returns a list of the supernodal representatives 
- * in topological order of the dfs that generates them. 
- * The location of the first nonzero in each supernodal segment 
- * (supernodal entry location) is also returned. 
- * 
+ * The nonzeros in U[*,j] are segments that end at supernodes representatives.
+ * The routine returns a list of the supernodal representatives
+ * in topological order of the dfs that generates them.
+ * The location of the first nonzero in each supernodal segment
+ * (supernodal entry location) is also returned.
+ *
  * \param m number of rows in the matrix
- * \param jcol Current column 
+ * \param jcol Current column
  * \param perm_r Row permutation
  * \param maxsuper  Maximum number of column allowed in a supernode
  * \param [in,out] nseg Number of segments in current U[*,j] - new segments appended
  * \param lsub_col defines the rhs vector to start the dfs
- * \param [in,out] segrep Segment representatives - new segments appended 
+ * \param [in,out] segrep Segment representatives - new segments appended
  * \param repfnz  First nonzero location in each row
- * \param xprune 
+ * \param xprune
  * \param marker  marker[i] == jj, if i was visited during dfs of current column jj;
  * \param parent
  * \param xplore working array
- * \param glu global LU data 
+ * \param glu global LU data
  * \return 0 success
  *         > 0 number of bytes allocated when run out of space
- * 
+ *
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::column_dfs(const Index m, const Index jcol, IndexVector& perm_r, Index maxsuper, Index& nseg,  BlockIndexVector lsub_col, IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune, IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu)
-{
-  
-  Index jsuper = glu.supno(jcol); 
-  Index nextl = glu.xlsub(jcol); 
-  VectorBlock<IndexVector> marker2(marker, 2*m, m); 
-  
-  
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar, StorageIndex>::column_dfs(const Index m, const Index jcol, IndexVector& perm_r,
+                                                     Index maxsuper, Index& nseg, BlockIndexVector lsub_col,
+                                                     IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune,
+                                                     IndexVector& marker, IndexVector& parent, IndexVector& xplore,
+                                                     GlobalLU_t& glu) {
+  Index jsuper = glu.supno(jcol);
+  Index nextl = glu.xlsub(jcol);
+  VectorBlock<IndexVector> marker2(marker, 2 * m, m);
+
   column_dfs_traits<IndexVector, ScalarVector> traits(jcol, jsuper, glu, *this);
-  
-  // For each nonzero in A(*,jcol) do dfs 
-  for (Index k = 0; ((k < m) ? lsub_col[k] != emptyIdxLU : false) ; k++)
-  {
-    Index krow = lsub_col(k); 
-    lsub_col(k) = emptyIdxLU; 
-    Index kmark = marker2(krow); 
-    
-    // krow was visited before, go to the next nonz; 
+
+  // For each nonzero in A(*,jcol) do dfs
+  for (Index k = 0; ((k < m) ? lsub_col[k] != emptyIdxLU : false); k++) {
+    Index krow = lsub_col(k);
+    lsub_col(k) = emptyIdxLU;
+    Index kmark = marker2(krow);
+
+    // krow was visited before, go to the next nonz;
     if (kmark == jcol) continue;
-    
-    dfs_kernel(jcol, perm_r, nseg, glu.lsub, segrep, repfnz, xprune, marker2, parent,
-                   xplore, glu, nextl, krow, traits);
-  } // for each nonzero ... 
-  
-  Index fsupc, jptr, jm1ptr, ito, ifrom, istop;
-  Index nsuper = glu.supno(jcol);
-  Index jcolp1 = jcol + 1;
+
+    dfs_kernel(StorageIndex(jcol), perm_r, nseg, glu.lsub, segrep, repfnz, xprune, marker2, parent, xplore, glu, nextl,
+               krow, traits);
+  }  // for each nonzero ...
+
+  Index fsupc;
+  StorageIndex nsuper = glu.supno(jcol);
+  StorageIndex jcolp1 = StorageIndex(jcol) + 1;
   Index jcolm1 = jcol - 1;
-  
+
   // check to see if j belongs in the same supernode as j-1
-  if ( jcol == 0 )
-  { // Do nothing for column 0 
-    nsuper = glu.supno(0) = 0 ;
-  }
-  else 
-  {
-    fsupc = glu.xsup(nsuper); 
-    jptr = glu.xlsub(jcol); // Not yet compressed
-    jm1ptr = glu.xlsub(jcolm1); 
-    
+  if (jcol == 0) {  // Do nothing for column 0
+    nsuper = glu.supno(0) = 0;
+  } else {
+    fsupc = glu.xsup(nsuper);
+    StorageIndex jptr = glu.xlsub(jcol);  // Not yet compressed
+    StorageIndex jm1ptr = glu.xlsub(jcolm1);
+
     // Use supernodes of type T2 : see SuperLU paper
-    if ( (nextl-jptr != jptr-jm1ptr-1) ) jsuper = emptyIdxLU;
-    
+    if ((nextl - jptr != jptr - jm1ptr - 1)) jsuper = emptyIdxLU;
+
     // Make sure the number of columns in a supernode doesn't
     // exceed threshold
-    if ( (jcol - fsupc) >= maxsuper) jsuper = emptyIdxLU; 
-    
+    if ((jcol - fsupc) >= maxsuper) jsuper = emptyIdxLU;
+
     /* If jcol starts a new supernode, reclaim storage space in
-     * glu.lsub from previous supernode. Note we only store 
-     * the subscript set of the first and last columns of 
+     * glu.lsub from previous supernode. Note we only store
+     * the subscript set of the first and last columns of
      * a supernode. (first for num values, last for pruning)
      */
-    if (jsuper == emptyIdxLU)
-    { // starts a new supernode 
-      if ( (fsupc < jcolm1-1) ) 
-      { // >= 3 columns in nsuper
-        ito = glu.xlsub(fsupc+1);
-        glu.xlsub(jcolm1) = ito; 
-        istop = ito + jptr - jm1ptr; 
-        xprune(jcolm1) = istop; // intialize xprune(jcol-1)
-        glu.xlsub(jcol) = istop; 
-        
-        for (ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito)
-          glu.lsub(ito) = glu.lsub(ifrom); 
+    if (jsuper == emptyIdxLU) {    // starts a new supernode
+      if ((fsupc < jcolm1 - 1)) {  // >= 3 columns in nsuper
+        StorageIndex ito = glu.xlsub(fsupc + 1);
+        glu.xlsub(jcolm1) = ito;
+        StorageIndex istop = ito + jptr - jm1ptr;
+        xprune(jcolm1) = istop;  // initialize xprune(jcol-1)
+        glu.xlsub(jcol) = istop;
+
+        for (StorageIndex ifrom = jm1ptr; ifrom < nextl; ++ifrom, ++ito) glu.lsub(ito) = glu.lsub(ifrom);
         nextl = ito;  // = istop + length(jcol)
       }
-      nsuper++; 
-      glu.supno(jcol) = nsuper; 
-    } // if a new supernode 
-  } // end else:  jcol > 0
-  
+      nsuper++;
+      glu.supno(jcol) = nsuper;
+    }  // if a new supernode
+  }    // end else:  jcol > 0
+
   // Tidy up the pointers before exit
-  glu.xsup(nsuper+1) = jcolp1; 
-  glu.supno(jcolp1) = nsuper; 
-  xprune(jcol) = nextl;  // Intialize upper bound for pruning
-  glu.xlsub(jcolp1) = nextl; 
-  
-  return 0; 
+  glu.xsup(nsuper + 1) = jcolp1;
+  glu.supno(jcolp1) = nsuper;
+  xprune(jcol) = StorageIndex(nextl);  // Initialize upper bound for pruning
+  glu.xlsub(jcolp1) = StorageIndex(nextl);
+
+  return 0;
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
 #endif
diff --git a/inst/include/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h b/inst/include/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h
index 170610d9..12e7650c 100644
--- a/inst/include/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h
+++ b/inst/include/Eigen/src/SparseLU/SparseLU_copy_to_ucol.h
@@ -6,10 +6,10 @@
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-/* 
- 
- * NOTE: This file is the modified version of [s,d,c,z]copy_to_ucol.c file in SuperLU 
- 
+/*
+
+ * NOTE: This file is the modified version of [s,d,c,z]copy_to_ucol.c file in SuperLU
+
  * -- SuperLU routine (version 2.0) --
  * Univ. of California Berkeley, Xerox Palo Alto Research Center,
  * and Lawrence Berkeley National Lab.
@@ -29,78 +29,78 @@
 #ifndef SPARSELU_COPY_TO_UCOL_H
 #define SPARSELU_COPY_TO_UCOL_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 namespace internal {
 
 /**
  * \brief Performs numeric block updates (sup-col) in topological order
- * 
+ *
  * \param jcol current column to update
  * \param nseg Number of segments in the U part
  * \param segrep segment representative ...
  * \param repfnz First nonzero column in each row  ...
- * \param perm_r Row permutation 
+ * \param perm_r Row permutation
  * \param dense Store the full representation of the column
- * \param glu Global LU data. 
- * \return 0 - successful return 
+ * \param glu Global LU data.
+ * \return 0 - successful return
  *         > 0 - number of bytes allocated when run out of space
- * 
+ *
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::copy_to_ucol(const Index jcol, const Index nseg, IndexVector& segrep, BlockIndexVector repfnz ,IndexVector& perm_r, BlockScalarVector dense, GlobalLU_t& glu)
-{  
-  Index ksub, krep, ksupno; 
-    
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar, StorageIndex>::copy_to_ucol(const Index jcol, const Index nseg, IndexVector& segrep,
+                                                       BlockIndexVector repfnz, IndexVector& perm_r,
+                                                       BlockScalarVector dense, GlobalLU_t& glu) {
+  Index ksub, krep, ksupno;
+
   Index jsupno = glu.supno(jcol);
-  
-  // For each nonzero supernode segment of U[*,j] in topological order 
-  Index k = nseg - 1, i; 
-  Index nextu = glu.xusub(jcol); 
-  Index kfnz, isub, segsize; 
-  Index new_next,irow; 
-  Index fsupc, mem; 
-  for (ksub = 0; ksub < nseg; ksub++)
-  {
-    krep = segrep(k); k--; 
-    ksupno = glu.supno(krep); 
-    if (jsupno != ksupno ) // should go into ucol(); 
+
+  // For each nonzero supernode segment of U[*,j] in topological order
+  Index k = nseg - 1, i;
+  StorageIndex nextu = glu.xusub(jcol);
+  Index kfnz, isub, segsize;
+  Index new_next, irow;
+  Index fsupc, mem;
+  for (ksub = 0; ksub < nseg; ksub++) {
+    krep = segrep(k);
+    k--;
+    ksupno = glu.supno(krep);
+    if (jsupno != ksupno)  // should go into ucol();
     {
-      kfnz = repfnz(krep); 
-      if (kfnz != emptyIdxLU)
-      { // Nonzero U-segment 
-        fsupc = glu.xsup(ksupno); 
-        isub = glu.xlsub(fsupc) + kfnz - fsupc; 
-        segsize = krep - kfnz + 1; 
-        new_next = nextu + segsize; 
-        while (new_next > glu.nzumax) 
-        {
-          mem = memXpand<ScalarVector>(glu.ucol, glu.nzumax, nextu, UCOL, glu.num_expansions); 
-          if (mem) return mem; 
-          mem = memXpand<IndexVector>(glu.usub, glu.nzumax, nextu, USUB, glu.num_expansions); 
-          if (mem) return mem; 
-          
+      kfnz = repfnz(krep);
+      if (kfnz != emptyIdxLU) {  // Nonzero U-segment
+        fsupc = glu.xsup(ksupno);
+        isub = glu.xlsub(fsupc) + kfnz - fsupc;
+        segsize = krep - kfnz + 1;
+        new_next = nextu + segsize;
+        while (new_next > glu.nzumax) {
+          mem = memXpand<ScalarVector>(glu.ucol, glu.nzumax, nextu, UCOL, glu.num_expansions);
+          if (mem) return mem;
+          mem = memXpand<IndexVector>(glu.usub, glu.nzumax, nextu, USUB, glu.num_expansions);
+          if (mem) return mem;
         }
-        
-        for (i = 0; i < segsize; i++)
-        {
-          irow = glu.lsub(isub); 
-          glu.usub(nextu) = perm_r(irow); // Unlike the L part, the U part is stored in its final order
-          glu.ucol(nextu) = dense(irow); 
-          dense(irow) = Scalar(0.0); 
+
+        for (i = 0; i < segsize; i++) {
+          irow = glu.lsub(isub);
+          glu.usub(nextu) = perm_r(irow);  // Unlike the L part, the U part is stored in its final order
+          glu.ucol(nextu) = dense(irow);
+          dense(irow) = Scalar(0.0);
           nextu++;
           isub++;
         }
-        
-      } // end nonzero U-segment 
-      
-    } // end if jsupno 
-    
-  } // end for each segment
-  glu.xusub(jcol + 1) = nextu; // close U(*,jcol)
-  return 0; 
+
+      }  // end nonzero U-segment
+
+    }  // end if jsupno
+
+  }                             // end for each segment
+  glu.xusub(jcol + 1) = nextu;  // close U(*,jcol)
+  return 0;
 }
 
-} // namespace internal
-} // end namespace Eigen
+}  // namespace internal
+}  // end namespace Eigen
 
-#endif // SPARSELU_COPY_TO_UCOL_H
+#endif  // SPARSELU_COPY_TO_UCOL_H
diff --git a/inst/include/Eigen/src/SparseLU/SparseLU_gemm_kernel.h b/inst/include/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
deleted file mode 100644
index 9e4e3e72..00000000
--- a/inst/include/Eigen/src/SparseLU/SparseLU_gemm_kernel.h
+++ /dev/null
@@ -1,279 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2012 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_SPARSELU_GEMM_KERNEL_H
-#define EIGEN_SPARSELU_GEMM_KERNEL_H
-
-namespace Eigen {
-
-namespace internal {
-
-
-/** \internal
-  * A general matrix-matrix product kernel optimized for the SparseLU factorization.
-  *  - A, B, and C must be column major
-  *  - lda and ldc must be multiples of the respective packet size
-  *  - C must have the same alignment as A
-  */
-template<typename Scalar,typename Index>
-EIGEN_DONT_INLINE
-void sparselu_gemm(Index m, Index n, Index d, const Scalar* A, Index lda, const Scalar* B, Index ldb, Scalar* C, Index ldc)
-{
-  using namespace Eigen::internal;
-  
-  typedef typename packet_traits<Scalar>::type Packet;
-  enum {
-    NumberOfRegisters = EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS,
-    PacketSize = packet_traits<Scalar>::size,
-    PM = 8,                             // peeling in M
-    RN = 2,                             // register blocking
-    RK = NumberOfRegisters>=16 ? 4 : 2, // register blocking
-    BM = 4096/sizeof(Scalar),           // number of rows of A-C per chunk
-    SM = PM*PacketSize                  // step along M
-  };
-  Index d_end = (d/RK)*RK;    // number of columns of A (rows of B) suitable for full register blocking
-  Index n_end = (n/RN)*RN;    // number of columns of B-C suitable for processing RN columns at once
-  Index i0 = internal::first_aligned(A,m);
-  
-  eigen_internal_assert(((lda%PacketSize)==0) && ((ldc%PacketSize)==0) && (i0==internal::first_aligned(C,m)));
-  
-  // handle the non aligned rows of A and C without any optimization:
-  for(Index i=0; i<i0; ++i)
-  {
-    for(Index j=0; j<n; ++j)
-    {
-      Scalar c = C[i+j*ldc];
-      for(Index k=0; k<d; ++k)
-        c += B[k+j*ldb] * A[i+k*lda];
-      C[i+j*ldc] = c;
-    }
-  }
-  // process the remaining rows per chunk of BM rows
-  for(Index ib=i0; ib<m; ib+=BM)
-  {
-    Index actual_b = std::min<Index>(BM, m-ib);                 // actual number of rows
-    Index actual_b_end1 = (actual_b/SM)*SM;                   // actual number of rows suitable for peeling
-    Index actual_b_end2 = (actual_b/PacketSize)*PacketSize;   // actual number of rows suitable for vectorization
-    
-    // Let's process two columns of B-C at once
-    for(Index j=0; j<n_end; j+=RN)
-    {
-      const Scalar* Bc0 = B+(j+0)*ldb;
-      const Scalar* Bc1 = B+(j+1)*ldb;
-      
-      for(Index k=0; k<d_end; k+=RK)
-      {
-        
-        // load and expand a RN x RK block of B
-        Packet b00, b10, b20, b30, b01, b11, b21, b31;
-                  b00 = pset1<Packet>(Bc0[0]);
-                  b10 = pset1<Packet>(Bc0[1]);
-        if(RK==4) b20 = pset1<Packet>(Bc0[2]);
-        if(RK==4) b30 = pset1<Packet>(Bc0[3]);
-                  b01 = pset1<Packet>(Bc1[0]);
-                  b11 = pset1<Packet>(Bc1[1]);
-        if(RK==4) b21 = pset1<Packet>(Bc1[2]);
-        if(RK==4) b31 = pset1<Packet>(Bc1[3]);
-        
-        Packet a0, a1, a2, a3, c0, c1, t0, t1;
-        
-        const Scalar* A0 = A+ib+(k+0)*lda;
-        const Scalar* A1 = A+ib+(k+1)*lda;
-        const Scalar* A2 = A+ib+(k+2)*lda;
-        const Scalar* A3 = A+ib+(k+3)*lda;
-        
-        Scalar* C0 = C+ib+(j+0)*ldc;
-        Scalar* C1 = C+ib+(j+1)*ldc;
-        
-                  a0 = pload<Packet>(A0);
-                  a1 = pload<Packet>(A1);
-        if(RK==4)
-        {
-          a2 = pload<Packet>(A2);
-          a3 = pload<Packet>(A3);
-        }
-        else
-        {
-          // workaround "may be used uninitialized in this function" warning
-          a2 = a3 = a0;
-        }
-        
-#define KMADD(c, a, b, tmp) {tmp = b; tmp = pmul(a,tmp); c = padd(c,tmp);}
-#define WORK(I)  \
-                    c0 = pload<Packet>(C0+i+(I)*PacketSize);   \
-                    c1 = pload<Packet>(C1+i+(I)*PacketSize);   \
-                    KMADD(c0, a0, b00, t0)      \
-                    KMADD(c1, a0, b01, t1)      \
-                    a0 = pload<Packet>(A0+i+(I+1)*PacketSize); \
-                    KMADD(c0, a1, b10, t0)      \
-                    KMADD(c1, a1, b11, t1)       \
-                    a1 = pload<Packet>(A1+i+(I+1)*PacketSize); \
-          if(RK==4) KMADD(c0, a2, b20, t0)       \
-          if(RK==4) KMADD(c1, a2, b21, t1)       \
-          if(RK==4) a2 = pload<Packet>(A2+i+(I+1)*PacketSize); \
-          if(RK==4) KMADD(c0, a3, b30, t0)       \
-          if(RK==4) KMADD(c1, a3, b31, t1)       \
-          if(RK==4) a3 = pload<Packet>(A3+i+(I+1)*PacketSize); \
-                    pstore(C0+i+(I)*PacketSize, c0);           \
-                    pstore(C1+i+(I)*PacketSize, c1)
-        
-        // process rows of A' - C' with aggressive vectorization and peeling 
-        for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
-        {
-          EIGEN_ASM_COMMENT("SPARSELU_GEMML_KERNEL1");
-                    prefetch((A0+i+(5)*PacketSize));
-                    prefetch((A1+i+(5)*PacketSize));
-          if(RK==4) prefetch((A2+i+(5)*PacketSize));
-          if(RK==4) prefetch((A3+i+(5)*PacketSize));
-                    WORK(0);
-                    WORK(1);
-                    WORK(2);
-                    WORK(3);
-                    WORK(4);
-                    WORK(5);
-                    WORK(6);
-                    WORK(7);
-        }
-        // process the remaining rows with vectorization only
-        for(Index i=actual_b_end1; i<actual_b_end2; i+=PacketSize)
-        {
-          WORK(0);
-        }
-#undef WORK
-        // process the remaining rows without vectorization
-        for(Index i=actual_b_end2; i<actual_b; ++i)
-        {
-          if(RK==4)
-          {
-            C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1]+A2[i]*Bc0[2]+A3[i]*Bc0[3];
-            C1[i] += A0[i]*Bc1[0]+A1[i]*Bc1[1]+A2[i]*Bc1[2]+A3[i]*Bc1[3];
-          }
-          else
-          {
-            C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1];
-            C1[i] += A0[i]*Bc1[0]+A1[i]*Bc1[1];
-          }
-        }
-        
-        Bc0 += RK;
-        Bc1 += RK;
-      } // peeled loop on k
-    } // peeled loop on the columns j
-    // process the last column (we now perform a matrux-vector product)
-    if((n-n_end)>0)
-    {
-      const Scalar* Bc0 = B+(n-1)*ldb;
-      
-      for(Index k=0; k<d_end; k+=RK)
-      {
-        
-        // load and expand a 1 x RK block of B
-        Packet b00, b10, b20, b30;
-                  b00 = pset1<Packet>(Bc0[0]);
-                  b10 = pset1<Packet>(Bc0[1]);
-        if(RK==4) b20 = pset1<Packet>(Bc0[2]);
-        if(RK==4) b30 = pset1<Packet>(Bc0[3]);
-        
-        Packet a0, a1, a2, a3, c0, t0/*, t1*/;
-        
-        const Scalar* A0 = A+ib+(k+0)*lda;
-        const Scalar* A1 = A+ib+(k+1)*lda;
-        const Scalar* A2 = A+ib+(k+2)*lda;
-        const Scalar* A3 = A+ib+(k+3)*lda;
-        
-        Scalar* C0 = C+ib+(n_end)*ldc;
-        
-                  a0 = pload<Packet>(A0);
-                  a1 = pload<Packet>(A1);
-        if(RK==4)
-        {
-          a2 = pload<Packet>(A2);
-          a3 = pload<Packet>(A3);
-        }
-        else
-        {
-          // workaround "may be used uninitialized in this function" warning
-          a2 = a3 = a0;
-        }
-        
-#define WORK(I) \
-                  c0 = pload<Packet>(C0+i+(I)*PacketSize);   \
-                  KMADD(c0, a0, b00, t0)       \
-                  a0 = pload<Packet>(A0+i+(I+1)*PacketSize); \
-                  KMADD(c0, a1, b10, t0)       \
-                  a1 = pload<Packet>(A1+i+(I+1)*PacketSize); \
-        if(RK==4) KMADD(c0, a2, b20, t0)       \
-        if(RK==4) a2 = pload<Packet>(A2+i+(I+1)*PacketSize); \
-        if(RK==4) KMADD(c0, a3, b30, t0)       \
-        if(RK==4) a3 = pload<Packet>(A3+i+(I+1)*PacketSize); \
-                  pstore(C0+i+(I)*PacketSize, c0);
-        
-        // agressive vectorization and peeling
-        for(Index i=0; i<actual_b_end1; i+=PacketSize*8)
-        {
-          EIGEN_ASM_COMMENT("SPARSELU_GEMML_KERNEL2");
-          WORK(0);
-          WORK(1);
-          WORK(2);
-          WORK(3);
-          WORK(4);
-          WORK(5);
-          WORK(6);
-          WORK(7);
-        }
-        // vectorization only
-        for(Index i=actual_b_end1; i<actual_b_end2; i+=PacketSize)
-        {
-          WORK(0);
-        }
-        // remaining scalars
-        for(Index i=actual_b_end2; i<actual_b; ++i)
-        {
-          if(RK==4) 
-            C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1]+A2[i]*Bc0[2]+A3[i]*Bc0[3];
-          else
-            C0[i] += A0[i]*Bc0[0]+A1[i]*Bc0[1];
-        }
-        
-        Bc0 += RK;
-#undef WORK
-      }
-    }
-    
-    // process the last columns of A, corresponding to the last rows of B
-    Index rd = d-d_end;
-    if(rd>0)
-    {
-      for(Index j=0; j<n; ++j)
-      {
-        enum {
-          Alignment = PacketSize>1 ? Aligned : 0
-        };
-        typedef Map<Matrix<Scalar,Dynamic,1>, Alignment > MapVector;
-        typedef Map<const Matrix<Scalar,Dynamic,1>, Alignment > ConstMapVector;
-        if(rd==1)       MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b);
-        
-        else if(rd==2)  MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b)
-                                                        + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b);
-        
-        else            MapVector(C+j*ldc+ib,actual_b) += B[0+d_end+j*ldb] * ConstMapVector(A+(d_end+0)*lda+ib, actual_b)
-                                                        + B[1+d_end+j*ldb] * ConstMapVector(A+(d_end+1)*lda+ib, actual_b)
-                                                        + B[2+d_end+j*ldb] * ConstMapVector(A+(d_end+2)*lda+ib, actual_b);
-      }
-    }
-  
-  } // blocking on the rows of A and C
-}
-#undef KMADD
-
-} // namespace internal
-
-} // namespace Eigen
-
-#endif // EIGEN_SPARSELU_GEMM_KERNEL_H
diff --git a/inst/include/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h b/inst/include/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h
index 7a4e4305..8df830b0 100644
--- a/inst/include/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h
+++ b/inst/include/Eigen/src/SparseLU/SparseLU_heap_relax_snode.h
@@ -28,100 +28,87 @@
 #ifndef SPARSELU_HEAP_RELAX_SNODE_H
 #define SPARSELU_HEAP_RELAX_SNODE_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 namespace internal {
 
-/** 
+/**
  * \brief Identify the initial relaxed supernodes
- * 
- * This routine applied to a symmetric elimination tree. 
+ *
+ * This routine applied to a symmetric elimination tree.
  * It assumes that the matrix has been reordered according to the postorder of the etree
  * \param n The number of columns
- * \param et elimination tree 
- * \param relax_columns Maximum number of columns allowed in a relaxed snode 
+ * \param et elimination tree
+ * \param relax_columns Maximum number of columns allowed in a relaxed snode
  * \param descendants Number of descendants of each node in the etree
  * \param relax_end last column in a supernode
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::heap_relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end)
-{
-  
-  // The etree may not be postordered, but its heap ordered  
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar, StorageIndex>::heap_relax_snode(const Index n, IndexVector& et, const Index relax_columns,
+                                                          IndexVector& descendants, IndexVector& relax_end) {
+  // The etree may not be postordered, but its heap ordered
   IndexVector post;
-  internal::treePostorder(n, et, post); // Post order etree
-  IndexVector inv_post(n+1); 
-  Index i;
-  for (i = 0; i < n+1; ++i) inv_post(post(i)) = i; // inv_post = post.inverse()???
-  
-  // Renumber etree in postorder 
+  internal::treePostorder(StorageIndex(n), et, post);  // Post order etree
+  IndexVector inv_post(n + 1);
+  for (StorageIndex i = 0; i < n + 1; ++i) inv_post(post(i)) = i;  // inv_post = post.inverse()???
+
+  // Renumber etree in postorder
   IndexVector iwork(n);
-  IndexVector et_save(n+1);
-  for (i = 0; i < n; ++i)
-  {
+  IndexVector et_save(n + 1);
+  for (Index i = 0; i < n; ++i) {
     iwork(post(i)) = post(et(i));
   }
-  et_save = et; // Save the original etree
-  et = iwork; 
-  
+  et_save = et;  // Save the original etree
+  et = iwork;
+
   // compute the number of descendants of each node in the etree
   relax_end.setConstant(emptyIdxLU);
-  Index j, parent; 
+  Index j, parent;
   descendants.setZero();
-  for (j = 0; j < n; j++) 
-  {
+  for (j = 0; j < n; j++) {
     parent = et(j);
-    if (parent != n) // not the dummy root
+    if (parent != n)  // not the dummy root
       descendants(parent) += descendants(j) + 1;
   }
   // Identify the relaxed supernodes by postorder traversal of the etree
-  Index snode_start; // beginning of a snode 
-  Index k;
-  Index nsuper_et_post = 0; // Number of relaxed snodes in postordered etree 
-  Index nsuper_et = 0; // Number of relaxed snodes in the original etree 
-  Index l; 
-  for (j = 0; j < n; )
-  {
+  Index snode_start;  // beginning of a snode
+  StorageIndex k;
+  StorageIndex l;
+  for (j = 0; j < n;) {
     parent = et(j);
-    snode_start = j; 
-    while ( parent != n && descendants(parent) < relax_columns ) 
-    {
-      j = parent; 
+    snode_start = j;
+    while (parent != n && descendants(parent) < relax_columns) {
+      j = parent;
       parent = et(j);
     }
-    // Found a supernode in postordered etree, j is the last column 
-    ++nsuper_et_post;
-    k = n;
-    for (i = snode_start; i <= j; ++i)
-      k = (std::min)(k, inv_post(i));
+    // Found a supernode in postordered etree, j is the last column
+    k = StorageIndex(n);
+    for (Index i = snode_start; i <= j; ++i) k = (std::min)(k, inv_post(i));
     l = inv_post(j);
-    if ( (l - k) == (j - snode_start) )  // Same number of columns in the snode
+    if ((l - k) == (j - snode_start))  // Same number of columns in the snode
     {
       // This is also a supernode in the original etree
-      relax_end(k) = l; // Record last column 
-      ++nsuper_et; 
-    }
-    else 
-    {
-      for (i = snode_start; i <= j; ++i) 
-      {
+      relax_end(k) = l;  // Record last column
+    } else {
+      for (Index i = snode_start; i <= j; ++i) {
         l = inv_post(i);
-        if (descendants(i) == 0) 
-        {
+        if (descendants(i) == 0) {
           relax_end(l) = l;
-          ++nsuper_et;
         }
       }
     }
     j++;
     // Search for a new leaf
     while (descendants(j) != 0 && j < n) j++;
-  } // End postorder traversal of the etree
-  
+  }  // End postorder traversal of the etree
+
   // Recover the original etree
-  et = et_save; 
+  et = et_save;
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
-#endif // SPARSELU_HEAP_RELAX_SNODE_H
+}  // end namespace Eigen
+#endif  // SPARSELU_HEAP_RELAX_SNODE_H
diff --git a/inst/include/Eigen/src/SparseLU/SparseLU_kernel_bmod.h b/inst/include/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
index 6af02675..54bda0c5 100644
--- a/inst/include/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
+++ b/inst/include/Eigen/src/SparseLU/SparseLU_kernel_bmod.h
@@ -11,120 +11,123 @@
 #ifndef SPARSELU_KERNEL_BMOD_H
 #define SPARSELU_KERNEL_BMOD_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 namespace internal {
-  
-/**
- * \brief Performs numeric block updates from a given supernode to a single column
- * 
- * \param segsize Size of the segment (and blocks ) to use for updates
- * \param[in,out] dense Packed values of the original matrix
- * \param tempv temporary vector to use for updates
- * \param lusup array containing the supernodes
- * \param lda Leading dimension in the supernode
- * \param nrow Number of rows in the rectangular part of the supernode
- * \param lsub compressed row subscripts of supernodes
- * \param lptr pointer to the first column of the current supernode in lsub
- * \param no_zeros Number of nonzeros elements before the diagonal part of the supernode
- * \return 0 on success
- */
-template <int SegSizeAtCompileTime> struct LU_kernel_bmod
-{
-  template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
-  static EIGEN_DONT_INLINE void run(const int segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda,
-                                    const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros);
+
+template <int SegSizeAtCompileTime>
+struct LU_kernel_bmod {
+  /** \internal
+   * \brief Performs numeric block updates from a given supernode to a single column
+   *
+   * \param segsize Size of the segment (and blocks ) to use for updates
+   * \param[in,out] dense Packed values of the original matrix
+   * \param tempv temporary vector to use for updates
+   * \param lusup array containing the supernodes
+   * \param lda Leading dimension in the supernode
+   * \param nrow Number of rows in the rectangular part of the supernode
+   * \param lsub compressed row subscripts of supernodes
+   * \param lptr pointer to the first column of the current supernode in lsub
+   * \param no_zeros Number of nonzeros elements before the diagonal part of the supernode
+   */
+  template <typename BlockScalarVector, typename ScalarVector, typename IndexVector>
+  static EIGEN_DONT_INLINE void run(const Index segsize, BlockScalarVector& dense, ScalarVector& tempv,
+                                    ScalarVector& lusup, Index& luptr, const Index lda, const Index nrow,
+                                    IndexVector& lsub, const Index lptr, const Index no_zeros);
 };
 
 template <int SegSizeAtCompileTime>
-template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
-EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const int segsize, BlockScalarVector& dense, ScalarVector& tempv, ScalarVector& lusup, Index& luptr, const Index lda,
-                                                                  const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros)
-{
+template <typename BlockScalarVector, typename ScalarVector, typename IndexVector>
+EIGEN_DONT_INLINE void LU_kernel_bmod<SegSizeAtCompileTime>::run(const Index segsize, BlockScalarVector& dense,
+                                                                 ScalarVector& tempv, ScalarVector& lusup, Index& luptr,
+                                                                 const Index lda, const Index nrow, IndexVector& lsub,
+                                                                 const Index lptr, const Index no_zeros) {
   typedef typename ScalarVector::Scalar Scalar;
   // First, copy U[*,j] segment from dense(*) to tempv(*)
-  // The result of triangular solve is in tempv[*]; 
-    // The result of matric-vector update is in dense[*]
-  Index isub = lptr + no_zeros; 
-  int i;
+  // The result of triangular solve is in tempv[*];
+  // The result of matric-vector update is in dense[*]
+  Index isub = lptr + no_zeros;
+  Index i;
   Index irow;
-  for (i = 0; i < ((SegSizeAtCompileTime==Dynamic)?segsize:SegSizeAtCompileTime); i++)
-  {
-    irow = lsub(isub); 
-    tempv(i) = dense(irow); 
-    ++isub; 
+  for (i = 0; i < ((SegSizeAtCompileTime == Dynamic) ? segsize : SegSizeAtCompileTime); i++) {
+    irow = lsub(isub);
+    tempv(i) = dense(irow);
+    ++isub;
   }
   // Dense triangular solve -- start effective triangle
-  luptr += lda * no_zeros + no_zeros; 
-  // Form Eigen matrix and vector 
-  Map<Matrix<Scalar,SegSizeAtCompileTime,SegSizeAtCompileTime, ColMajor>, 0, OuterStride<> > A( &(lusup.data()[luptr]), segsize, segsize, OuterStride<>(lda) );
-  Map<Matrix<Scalar,SegSizeAtCompileTime,1> > u(tempv.data(), segsize);
-  
-  u = A.template triangularView<UnitLower>().solve(u); 
-  
-  // Dense matrix-vector product y <-- B*x 
+  luptr += lda * no_zeros + no_zeros;
+  // Form Eigen matrix and vector
+  Map<Matrix<Scalar, SegSizeAtCompileTime, SegSizeAtCompileTime, ColMajor>, 0, OuterStride<> > A(
+      &(lusup.data()[luptr]), segsize, segsize, OuterStride<>(lda));
+  Map<Matrix<Scalar, SegSizeAtCompileTime, 1> > u(tempv.data(), segsize);
+
+  u = A.template triangularView<UnitLower>().solve(u);
+
+  // Dense matrix-vector product y <-- B*x
   luptr += segsize;
   const Index PacketSize = internal::packet_traits<Scalar>::size;
   Index ldl = internal::first_multiple(nrow, PacketSize);
-  Map<Matrix<Scalar,Dynamic,SegSizeAtCompileTime, ColMajor>, 0, OuterStride<> > B( &(lusup.data()[luptr]), nrow, segsize, OuterStride<>(lda) );
-  Index aligned_offset = internal::first_aligned(tempv.data()+segsize, PacketSize);
-  Index aligned_with_B_offset = (PacketSize-internal::first_aligned(B.data(), PacketSize))%PacketSize;
-  Map<Matrix<Scalar,Dynamic,1>, 0, OuterStride<> > l(tempv.data()+segsize+aligned_offset+aligned_with_B_offset, nrow, OuterStride<>(ldl) );
-  
-  l.setZero();
-  internal::sparselu_gemm<Scalar>(l.rows(), l.cols(), B.cols(), B.data(), B.outerStride(), u.data(), u.outerStride(), l.data(), l.outerStride());
-  
-  // Scatter tempv[] into SPA dense[] as a temporary storage 
+  Map<Matrix<Scalar, Dynamic, SegSizeAtCompileTime, ColMajor>, 0, OuterStride<> > B(&(lusup.data()[luptr]), nrow,
+                                                                                    segsize, OuterStride<>(lda));
+  Index aligned_offset = internal::first_default_aligned(tempv.data() + segsize, PacketSize);
+  Index aligned_with_B_offset = (PacketSize - internal::first_default_aligned(B.data(), PacketSize)) % PacketSize;
+  Map<Matrix<Scalar, Dynamic, 1>, 0, OuterStride<> > l(tempv.data() + segsize + aligned_offset + aligned_with_B_offset,
+                                                       nrow, OuterStride<>(ldl));
+
+  l.noalias() = B * u;
+
+  // Scatter tempv[] into SPA dense[] as a temporary storage
   isub = lptr + no_zeros;
-  for (i = 0; i < ((SegSizeAtCompileTime==Dynamic)?segsize:SegSizeAtCompileTime); i++)
-  {
-    irow = lsub(isub++); 
+  for (i = 0; i < ((SegSizeAtCompileTime == Dynamic) ? segsize : SegSizeAtCompileTime); i++) {
+    irow = lsub(isub++);
     dense(irow) = tempv(i);
   }
-  
+
   // Scatter l into SPA dense[]
-  for (i = 0; i < nrow; i++)
-  {
-    irow = lsub(isub++); 
+  for (i = 0; i < nrow; i++) {
+    irow = lsub(isub++);
     dense(irow) -= l(i);
-  } 
+  }
 }
 
-template <> struct LU_kernel_bmod<1>
-{
-  template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
-  static EIGEN_DONT_INLINE void run(const int /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, Index& luptr,
-                                    const Index lda, const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros);
+template <>
+struct LU_kernel_bmod<1> {
+  template <typename BlockScalarVector, typename ScalarVector, typename IndexVector>
+  static EIGEN_DONT_INLINE void run(const Index /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/,
+                                    ScalarVector& lusup, Index& luptr, const Index lda, const Index nrow,
+                                    IndexVector& lsub, const Index lptr, const Index no_zeros);
 };
 
-
-template <typename BlockScalarVector, typename ScalarVector, typename IndexVector, typename Index>
-EIGEN_DONT_INLINE void LU_kernel_bmod<1>::run(const int /*segsize*/, BlockScalarVector& dense, ScalarVector& /*tempv*/, ScalarVector& lusup, Index& luptr,
-                                              const Index lda, const Index nrow, IndexVector& lsub, const Index lptr, const Index no_zeros)
-{
+template <typename BlockScalarVector, typename ScalarVector, typename IndexVector>
+EIGEN_DONT_INLINE void LU_kernel_bmod<1>::run(const Index /*segsize*/, BlockScalarVector& dense,
+                                              ScalarVector& /*tempv*/, ScalarVector& lusup, Index& luptr,
+                                              const Index lda, const Index nrow, IndexVector& lsub, const Index lptr,
+                                              const Index no_zeros) {
   typedef typename ScalarVector::Scalar Scalar;
+  typedef typename IndexVector::Scalar StorageIndex;
   Scalar f = dense(lsub(lptr + no_zeros));
   luptr += lda * no_zeros + no_zeros + 1;
   const Scalar* a(lusup.data() + luptr);
-  const /*typename IndexVector::Scalar*/Index*  irow(lsub.data()+lptr + no_zeros + 1);
+  const StorageIndex* irow(lsub.data() + lptr + no_zeros + 1);
   Index i = 0;
-  for (; i+1 < nrow; i+=2)
-  {
+  for (; i + 1 < nrow; i += 2) {
     Index i0 = *(irow++);
     Index i1 = *(irow++);
     Scalar a0 = *(a++);
     Scalar a1 = *(a++);
     Scalar d0 = dense.coeff(i0);
     Scalar d1 = dense.coeff(i1);
-    d0 -= f*a0;
-    d1 -= f*a1;
+    d0 -= f * a0;
+    d1 -= f * a1;
     dense.coeffRef(i0) = d0;
     dense.coeffRef(i1) = d1;
   }
-  if(i<nrow)
-    dense.coeffRef(*(irow++)) -= f * *(a++);
+  if (i < nrow) dense.coeffRef(*(irow++)) -= f * *(a++);
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
-#endif // SPARSELU_KERNEL_BMOD_H
+}  // end namespace Eigen
+#endif  // SPARSELU_KERNEL_BMOD_H
diff --git a/inst/include/Eigen/src/SparseLU/SparseLU_panel_bmod.h b/inst/include/Eigen/src/SparseLU/SparseLU_panel_bmod.h
index 9d2ff290..505d9829 100644
--- a/inst/include/Eigen/src/SparseLU/SparseLU_panel_bmod.h
+++ b/inst/include/Eigen/src/SparseLU/SparseLU_panel_bmod.h
@@ -8,10 +8,10 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-/* 
- 
- * NOTE: This file is the modified version of [s,d,c,z]panel_bmod.c file in SuperLU 
- 
+/*
+
+ * NOTE: This file is the modified version of [s,d,c,z]panel_bmod.c file in SuperLU
+
  * -- SuperLU routine (version 3.0) --
  * Univ. of California Berkeley, Xerox Palo Alto Research Center,
  * and Lawrence Berkeley National Lab.
@@ -31,193 +31,185 @@
 #ifndef SPARSELU_PANEL_BMOD_H
 #define SPARSELU_PANEL_BMOD_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 namespace internal {
 
 /**
  * \brief Performs numeric block updates (sup-panel) in topological order.
- * 
+ *
  * Before entering this routine, the original nonzeros in the panel
- * were already copied i nto the spa[m,w]
- * 
+ * were already copied into the spa[m,w]
+ *
  * \param m number of rows in the matrix
  * \param w Panel size
  * \param jcol Starting  column of the panel
  * \param nseg Number of segments in the U part
- * \param dense Store the full representation of the panel 
- * \param tempv working array 
+ * \param dense Store the full representation of the panel
+ * \param tempv working array
  * \param segrep segment representative... first row in the segment
  * \param repfnz First nonzero rows
- * \param glu Global LU data. 
- * 
- * 
+ * \param glu Global LU data.
+ *
+ *
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::panel_bmod(const Index m, const Index w, const Index jcol, 
-                                            const Index nseg, ScalarVector& dense, ScalarVector& tempv,
-                                            IndexVector& segrep, IndexVector& repfnz, GlobalLU_t& glu)
-{
-  
-  Index ksub,jj,nextl_col; 
-  Index fsupc, nsupc, nsupr, nrow; 
-  Index krep, kfnz; 
-  Index lptr; // points to the row subscripts of a supernode 
-  Index luptr; // ...
-  Index segsize,no_zeros ; 
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar, StorageIndex>::panel_bmod(const Index m, const Index w, const Index jcol, const Index nseg,
+                                                    ScalarVector& dense, ScalarVector& tempv, IndexVector& segrep,
+                                                    IndexVector& repfnz, GlobalLU_t& glu) {
+  Index ksub, jj, nextl_col;
+  Index fsupc, nsupc, nsupr, nrow;
+  Index krep, kfnz;
+  Index lptr;   // points to the row subscripts of a supernode
+  Index luptr;  // ...
+  Index segsize, no_zeros;
   // For each nonz supernode segment of U[*,j] in topological order
-  Index k = nseg - 1; 
+  Index k = nseg - 1;
   const Index PacketSize = internal::packet_traits<Scalar>::size;
-  
-  for (ksub = 0; ksub < nseg; ksub++)
-  { // For each updating supernode
+
+  for (ksub = 0; ksub < nseg; ksub++) {  // For each updating supernode
     /* krep = representative of current k-th supernode
      * fsupc =  first supernodal column
      * nsupc = number of columns in a supernode
      * nsupr = number of rows in a supernode
      */
-    krep = segrep(k); k--; 
-    fsupc = glu.xsup(glu.supno(krep)); 
-    nsupc = krep - fsupc + 1; 
-    nsupr = glu.xlsub(fsupc+1) - glu.xlsub(fsupc); 
-    nrow = nsupr - nsupc; 
-    lptr = glu.xlsub(fsupc); 
-    
+    krep = segrep(k);
+    k--;
+    fsupc = glu.xsup(glu.supno(krep));
+    nsupc = krep - fsupc + 1;
+    nsupr = glu.xlsub(fsupc + 1) - glu.xlsub(fsupc);
+    nrow = nsupr - nsupc;
+    lptr = glu.xlsub(fsupc);
+
     // loop over the panel columns to detect the actual number of columns and rows
     Index u_rows = 0;
     Index u_cols = 0;
-    for (jj = jcol; jj < jcol + w; jj++)
-    {
-      nextl_col = (jj-jcol) * m; 
-      VectorBlock<IndexVector> repfnz_col(repfnz, nextl_col, m); // First nonzero column index for each row
-      
-      kfnz = repfnz_col(krep); 
-      if ( kfnz == emptyIdxLU ) 
-        continue; // skip any zero segment
-      
+    for (jj = jcol; jj < jcol + w; jj++) {
+      nextl_col = (jj - jcol) * m;
+      VectorBlock<IndexVector> repfnz_col(repfnz, nextl_col, m);  // First nonzero column index for each row
+
+      kfnz = repfnz_col(krep);
+      if (kfnz == emptyIdxLU) continue;  // skip any zero segment
+
       segsize = krep - kfnz + 1;
       u_cols++;
-      u_rows = (std::max)(segsize,u_rows);
+      u_rows = (std::max)(segsize, u_rows);
     }
-    
-    if(nsupc >= 2)
-    { 
+
+    if (nsupc >= 2) {
       Index ldu = internal::first_multiple<Index>(u_rows, PacketSize);
-      Map<ScalarMatrix, Aligned,  OuterStride<> > U(tempv.data(), u_rows, u_cols, OuterStride<>(ldu));
-      
+      Map<ScalarMatrix, Aligned, OuterStride<> > U(tempv.data(), u_rows, u_cols, OuterStride<>(ldu));
+
       // gather U
       Index u_col = 0;
-      for (jj = jcol; jj < jcol + w; jj++)
-      {
-        nextl_col = (jj-jcol) * m; 
-        VectorBlock<IndexVector> repfnz_col(repfnz, nextl_col, m); // First nonzero column index for each row
-        VectorBlock<ScalarVector> dense_col(dense, nextl_col, m); // Scatter/gather entire matrix column from/to here
-        
-        kfnz = repfnz_col(krep); 
-        if ( kfnz == emptyIdxLU ) 
-          continue; // skip any zero segment
-        
+      for (jj = jcol; jj < jcol + w; jj++) {
+        nextl_col = (jj - jcol) * m;
+        VectorBlock<IndexVector> repfnz_col(repfnz, nextl_col, m);  // First nonzero column index for each row
+        VectorBlock<ScalarVector> dense_col(dense, nextl_col, m);   // Scatter/gather entire matrix column from/to here
+
+        kfnz = repfnz_col(krep);
+        if (kfnz == emptyIdxLU) continue;  // skip any zero segment
+
         segsize = krep - kfnz + 1;
-        luptr = glu.xlusup(fsupc);    
-        no_zeros = kfnz - fsupc; 
-        
+        luptr = glu.xlusup(fsupc);
+        no_zeros = kfnz - fsupc;
+
         Index isub = lptr + no_zeros;
-        Index off = u_rows-segsize;
-        for (Index i = 0; i < off; i++) U(i,u_col) = 0;
-        for (Index i = 0; i < segsize; i++)
-        {
-          Index irow = glu.lsub(isub); 
-          U(i+off,u_col) = dense_col(irow); 
-          ++isub; 
+        Index off = u_rows - segsize;
+        for (Index i = 0; i < off; i++) U(i, u_col) = 0;
+        for (Index i = 0; i < segsize; i++) {
+          Index irow = glu.lsub(isub);
+          U(i + off, u_col) = dense_col(irow);
+          ++isub;
         }
         u_col++;
       }
       // solve U = A^-1 U
       luptr = glu.xlusup(fsupc);
-      Index lda = glu.xlusup(fsupc+1) - glu.xlusup(fsupc);
+      Index lda = glu.xlusup(fsupc + 1) - glu.xlusup(fsupc);
       no_zeros = (krep - u_rows + 1) - fsupc;
       luptr += lda * no_zeros + no_zeros;
-      MappedMatrixBlock A(glu.lusup.data()+luptr, u_rows, u_rows, OuterStride<>(lda) );
+      MappedMatrixBlock A(glu.lusup.data() + luptr, u_rows, u_rows, OuterStride<>(lda));
       U = A.template triangularView<UnitLower>().solve(U);
-      
+
       // update
       luptr += u_rows;
-      MappedMatrixBlock B(glu.lusup.data()+luptr, nrow, u_rows, OuterStride<>(lda) );
-      eigen_assert(tempv.size()>w*ldu + nrow*w + 1);
-      
+      MappedMatrixBlock B(glu.lusup.data() + luptr, nrow, u_rows, OuterStride<>(lda));
+      eigen_assert(tempv.size() > w * ldu + nrow * w + 1);
+
       Index ldl = internal::first_multiple<Index>(nrow, PacketSize);
-      Index offset = (PacketSize-internal::first_aligned(B.data(), PacketSize)) % PacketSize;
-      MappedMatrixBlock L(tempv.data()+w*ldu+offset, nrow, u_cols, OuterStride<>(ldl));
-      
-      L.setZero();
-      internal::sparselu_gemm<Scalar>(L.rows(), L.cols(), B.cols(), B.data(), B.outerStride(), U.data(), U.outerStride(), L.data(), L.outerStride());
-      
+      Index offset = (PacketSize - internal::first_default_aligned(B.data(), PacketSize)) % PacketSize;
+      MappedMatrixBlock L(tempv.data() + w * ldu + offset, nrow, u_cols, OuterStride<>(ldl));
+
+      L.noalias() = B * U;
+
       // scatter U and L
       u_col = 0;
-      for (jj = jcol; jj < jcol + w; jj++)
-      {
-        nextl_col = (jj-jcol) * m; 
-        VectorBlock<IndexVector> repfnz_col(repfnz, nextl_col, m); // First nonzero column index for each row
-        VectorBlock<ScalarVector> dense_col(dense, nextl_col, m); // Scatter/gather entire matrix column from/to here
-        
-        kfnz = repfnz_col(krep); 
-        if ( kfnz == emptyIdxLU ) 
-          continue; // skip any zero segment
-        
+      for (jj = jcol; jj < jcol + w; jj++) {
+        nextl_col = (jj - jcol) * m;
+        VectorBlock<IndexVector> repfnz_col(repfnz, nextl_col, m);  // First nonzero column index for each row
+        VectorBlock<ScalarVector> dense_col(dense, nextl_col, m);   // Scatter/gather entire matrix column from/to here
+
+        kfnz = repfnz_col(krep);
+        if (kfnz == emptyIdxLU) continue;  // skip any zero segment
+
         segsize = krep - kfnz + 1;
-        no_zeros = kfnz - fsupc; 
+        no_zeros = kfnz - fsupc;
         Index isub = lptr + no_zeros;
-        
-        Index off = u_rows-segsize;
-        for (Index i = 0; i < segsize; i++)
-        {
-          Index irow = glu.lsub(isub++); 
-          dense_col(irow) = U.coeff(i+off,u_col);
-          U.coeffRef(i+off,u_col) = 0;
+
+        Index off = u_rows - segsize;
+        for (Index i = 0; i < segsize; i++) {
+          Index irow = glu.lsub(isub++);
+          dense_col(irow) = U.coeff(i + off, u_col);
+          U.coeffRef(i + off, u_col) = 0;
         }
-        
+
         // Scatter l into SPA dense[]
-        for (Index i = 0; i < nrow; i++)
-        {
-          Index irow = glu.lsub(isub++); 
-          dense_col(irow) -= L.coeff(i,u_col);
-          L.coeffRef(i,u_col) = 0;
+        for (Index i = 0; i < nrow; i++) {
+          Index irow = glu.lsub(isub++);
+          dense_col(irow) -= L.coeff(i, u_col);
+          L.coeffRef(i, u_col) = 0;
         }
         u_col++;
       }
-    }
-    else // level 2 only
+    } else  // level 2 only
     {
       // Sequence through each column in the panel
-      for (jj = jcol; jj < jcol + w; jj++)
-      {
-        nextl_col = (jj-jcol) * m; 
-        VectorBlock<IndexVector> repfnz_col(repfnz, nextl_col, m); // First nonzero column index for each row
-        VectorBlock<ScalarVector> dense_col(dense, nextl_col, m); // Scatter/gather entire matrix column from/to here
-        
-        kfnz = repfnz_col(krep); 
-        if ( kfnz == emptyIdxLU ) 
-          continue; // skip any zero segment
-        
+      for (jj = jcol; jj < jcol + w; jj++) {
+        nextl_col = (jj - jcol) * m;
+        VectorBlock<IndexVector> repfnz_col(repfnz, nextl_col, m);  // First nonzero column index for each row
+        VectorBlock<ScalarVector> dense_col(dense, nextl_col, m);   // Scatter/gather entire matrix column from/to here
+
+        kfnz = repfnz_col(krep);
+        if (kfnz == emptyIdxLU) continue;  // skip any zero segment
+
         segsize = krep - kfnz + 1;
         luptr = glu.xlusup(fsupc);
-        
-        Index lda = glu.xlusup(fsupc+1)-glu.xlusup(fsupc);// nsupr
-        
-        // Perform a trianglar solve and block update, 
+
+        Index lda = glu.xlusup(fsupc + 1) - glu.xlusup(fsupc);  // nsupr
+
+        // Perform a trianglar solve and block update,
         // then scatter the result of sup-col update to dense[]
-        no_zeros = kfnz - fsupc; 
-              if(segsize==1)  LU_kernel_bmod<1>::run(segsize, dense_col, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros);
-        else  if(segsize==2)  LU_kernel_bmod<2>::run(segsize, dense_col, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros);
-        else  if(segsize==3)  LU_kernel_bmod<3>::run(segsize, dense_col, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros);
-        else                  LU_kernel_bmod<Dynamic>::run(segsize, dense_col, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros); 
-      } // End for each column in the panel 
+        no_zeros = kfnz - fsupc;
+        if (segsize == 1)
+          LU_kernel_bmod<1>::run(segsize, dense_col, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros);
+        else if (segsize == 2)
+          LU_kernel_bmod<2>::run(segsize, dense_col, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros);
+        else if (segsize == 3)
+          LU_kernel_bmod<3>::run(segsize, dense_col, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr, no_zeros);
+        else
+          LU_kernel_bmod<Dynamic>::run(segsize, dense_col, tempv, glu.lusup, luptr, lda, nrow, glu.lsub, lptr,
+                                       no_zeros);
+      }  // End for each column in the panel
     }
-    
-  } // End for each updating supernode
-} // end panel bmod
 
-} // end namespace internal
+  }  // End for each updating supernode
+}  // end panel bmod
+
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // SPARSELU_PANEL_BMOD_H
+#endif  // SPARSELU_PANEL_BMOD_H
diff --git a/inst/include/Eigen/src/SparseLU/SparseLU_panel_dfs.h b/inst/include/Eigen/src/SparseLU/SparseLU_panel_dfs.h
index dc0054ef..df315484 100644
--- a/inst/include/Eigen/src/SparseLU/SparseLU_panel_dfs.h
+++ b/inst/include/Eigen/src/SparseLU/SparseLU_panel_dfs.h
@@ -7,10 +7,10 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-/* 
- 
- * NOTE: This file is the modified version of [s,d,c,z]panel_dfs.c file in SuperLU 
- 
+/*
+
+ * NOTE: This file is the modified version of [s,d,c,z]panel_dfs.c file in SuperLU
+
  * -- SuperLU routine (version 2.0) --
  * Univ. of California Berkeley, Xerox Palo Alto Research Center,
  * and Lawrence Berkeley National Lab.
@@ -30,22 +30,20 @@
 #ifndef SPARSELU_PANEL_DFS_H
 #define SPARSELU_PANEL_DFS_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
-  
-template<typename IndexVector>
-struct panel_dfs_traits
-{
-  typedef typename IndexVector::Scalar Index;
-  panel_dfs_traits(Index jcol, Index* marker)
-    : m_jcol(jcol), m_marker(marker)
-  {}
-  bool update_segrep(Index krep, Index jj)
-  {
-    if(m_marker[krep]<m_jcol)
-    {
-      m_marker[krep] = jj; 
+
+template <typename IndexVector>
+struct panel_dfs_traits {
+  typedef typename IndexVector::Scalar StorageIndex;
+  panel_dfs_traits(Index jcol, StorageIndex* marker) : m_jcol(jcol), m_marker(marker) {}
+  bool update_segrep(Index krep, StorageIndex jj) {
+    if (m_marker[krep] < m_jcol) {
+      m_marker[krep] = jj;
       return true;
     }
     return false;
@@ -53,150 +51,130 @@ struct panel_dfs_traits
   void mem_expand(IndexVector& /*glu.lsub*/, Index /*nextl*/, Index /*chmark*/) {}
   enum { ExpandMem = false };
   Index m_jcol;
-  Index* m_marker;
+  StorageIndex* m_marker;
 };
 
-
-template <typename Scalar, typename Index>
+template <typename Scalar, typename StorageIndex>
 template <typename Traits>
-void SparseLUImpl<Scalar,Index>::dfs_kernel(const Index jj, IndexVector& perm_r,
-                   Index& nseg, IndexVector& panel_lsub, IndexVector& segrep,
-                   Ref<IndexVector> repfnz_col, IndexVector& xprune, Ref<IndexVector> marker, IndexVector& parent,
-                   IndexVector& xplore, GlobalLU_t& glu,
-                   Index& nextl_col, Index krow, Traits& traits
-                  )
-{
-  
-  Index kmark = marker(krow);
-      
+void SparseLUImpl<Scalar, StorageIndex>::dfs_kernel(const StorageIndex jj, IndexVector& perm_r, Index& nseg,
+                                                    IndexVector& panel_lsub, IndexVector& segrep,
+                                                    Ref<IndexVector> repfnz_col, IndexVector& xprune,
+                                                    Ref<IndexVector> marker, IndexVector& parent, IndexVector& xplore,
+                                                    GlobalLU_t& glu, Index& nextl_col, Index krow, Traits& traits) {
+  StorageIndex kmark = marker(krow);
+
   // For each unmarked krow of jj
-  marker(krow) = jj; 
-  Index kperm = perm_r(krow); 
-  if (kperm == emptyIdxLU ) {
+  marker(krow) = jj;
+  StorageIndex kperm = perm_r(krow);
+  if (kperm == emptyIdxLU) {
     // krow is in L : place it in structure of L(*, jj)
-    panel_lsub(nextl_col++) = krow;  // krow is indexed into A
-    
+    panel_lsub(nextl_col++) = StorageIndex(krow);  // krow is indexed into A
+
     traits.mem_expand(panel_lsub, nextl_col, kmark);
-  }
-  else 
-  {
+  } else {
     // krow is in U : if its supernode-representative krep
     // has been explored, update repfnz(*)
     // krep = supernode representative of the current row
-    Index krep = glu.xsup(glu.supno(kperm)+1) - 1; 
+    StorageIndex krep = glu.xsup(glu.supno(kperm) + 1) - 1;
     // First nonzero element in the current column:
-    Index myfnz = repfnz_col(krep); 
-    
-    if (myfnz != emptyIdxLU )
-    {
+    StorageIndex myfnz = repfnz_col(krep);
+
+    if (myfnz != emptyIdxLU) {
       // Representative visited before
-      if (myfnz > kperm ) repfnz_col(krep) = kperm; 
-      
-    }
-    else 
-    {
+      if (myfnz > kperm) repfnz_col(krep) = kperm;
+
+    } else {
       // Otherwise, perform dfs starting at krep
-      Index oldrep = emptyIdxLU; 
-      parent(krep) = oldrep; 
-      repfnz_col(krep) = kperm; 
-      Index xdfs =  glu.xlsub(krep); 
-      Index maxdfs = xprune(krep); 
-      
-      Index kpar;
-      do 
-      {
+      StorageIndex oldrep = emptyIdxLU;
+      parent(krep) = oldrep;
+      repfnz_col(krep) = kperm;
+      StorageIndex xdfs = glu.xlsub(krep);
+      Index maxdfs = xprune(krep);
+
+      StorageIndex kpar;
+      do {
         // For each unmarked kchild of krep
-        while (xdfs < maxdfs) 
-        {
-          Index kchild = glu.lsub(xdfs); 
-          xdfs++; 
-          Index chmark = marker(kchild); 
-          
-          if (chmark != jj ) 
-          {
-            marker(kchild) = jj; 
-            Index chperm = perm_r(kchild); 
-            
-            if (chperm == emptyIdxLU) 
-            {
+        while (xdfs < maxdfs) {
+          StorageIndex kchild = glu.lsub(xdfs);
+          xdfs++;
+          StorageIndex chmark = marker(kchild);
+
+          if (chmark != jj) {
+            marker(kchild) = jj;
+            StorageIndex chperm = perm_r(kchild);
+
+            if (chperm == emptyIdxLU) {
               // case kchild is in L: place it in L(*, j)
               panel_lsub(nextl_col++) = kchild;
               traits.mem_expand(panel_lsub, nextl_col, chmark);
-            }
-            else
-            {
+            } else {
               // case kchild is in U :
-              // chrep = its supernode-rep. If its rep has been explored, 
+              // chrep = its supernode-rep. If its rep has been explored,
               // update its repfnz(*)
-              Index chrep = glu.xsup(glu.supno(chperm)+1) - 1; 
-              myfnz = repfnz_col(chrep); 
-              
-              if (myfnz != emptyIdxLU) 
-              { // Visited before 
-                if (myfnz > chperm) 
-                  repfnz_col(chrep) = chperm; 
-              }
-              else 
-              { // Cont. dfs at snode-rep of kchild
-                xplore(krep) = xdfs; 
-                oldrep = krep; 
-                krep = chrep; // Go deeper down G(L)
-                parent(krep) = oldrep; 
-                repfnz_col(krep) = chperm; 
-                xdfs = glu.xlsub(krep); 
-                maxdfs = xprune(krep); 
-                
-              } // end if myfnz != -1
-            } // end if chperm == -1 
-                
-          } // end if chmark !=jj
-        } // end while xdfs < maxdfs
-        
+              StorageIndex chrep = glu.xsup(glu.supno(chperm) + 1) - 1;
+              myfnz = repfnz_col(chrep);
+
+              if (myfnz != emptyIdxLU) {  // Visited before
+                if (myfnz > chperm) repfnz_col(chrep) = chperm;
+              } else {  // Cont. dfs at snode-rep of kchild
+                xplore(krep) = xdfs;
+                oldrep = krep;
+                krep = chrep;  // Go deeper down G(L)
+                parent(krep) = oldrep;
+                repfnz_col(krep) = chperm;
+                xdfs = glu.xlsub(krep);
+                maxdfs = xprune(krep);
+
+              }  // end if myfnz != -1
+            }    // end if chperm == -1
+
+          }  // end if chmark !=jj
+        }    // end while xdfs < maxdfs
+
         // krow has no more unexplored nbrs :
-        //    Place snode-rep krep in postorder DFS, if this 
-        //    segment is seen for the first time. (Note that 
+        //    Place snode-rep krep in postorder DFS, if this
+        //    segment is seen for the first time. (Note that
         //    "repfnz(krep)" may change later.)
         //    Baktrack dfs to its parent
-        if(traits.update_segrep(krep,jj))
-        //if (marker1(krep) < jcol )
+        if (traits.update_segrep(krep, jj))
+        // if (marker1(krep) < jcol )
         {
-          segrep(nseg) = krep; 
-          ++nseg; 
-          //marker1(krep) = jj; 
+          segrep(nseg) = krep;
+          ++nseg;
+          // marker1(krep) = jj;
         }
-        
-        kpar = parent(krep); // Pop recursion, mimic recursion 
-        if (kpar == emptyIdxLU) 
-          break; // dfs done 
-        krep = kpar; 
-        xdfs = xplore(krep); 
-        maxdfs = xprune(krep); 
-
-      } while (kpar != emptyIdxLU); // Do until empty stack 
-      
-    } // end if (myfnz = -1)
-
-  } // end if (kperm == -1)   
+
+        kpar = parent(krep);            // Pop recursion, mimic recursion
+        if (kpar == emptyIdxLU) break;  // dfs done
+        krep = kpar;
+        xdfs = xplore(krep);
+        maxdfs = xprune(krep);
+
+      } while (kpar != emptyIdxLU);  // Do until empty stack
+
+    }  // end if (myfnz = -1)
+
+  }  // end if (kperm == -1)
 }
 
 /**
  * \brief Performs a symbolic factorization on a panel of columns [jcol, jcol+w)
- * 
+ *
  * A supernode representative is the last column of a supernode.
  * The nonzeros in U[*,j] are segments that end at supernodes representatives
- * 
- * The routine returns a list of the supernodal representatives 
- * in topological order of the dfs that generates them. This list is 
- * a superset of the topological order of each individual column within 
+ *
+ * The routine returns a list of the supernodal representatives
+ * in topological order of the dfs that generates them. This list is
+ * a superset of the topological order of each individual column within
  * the panel.
- * The location of the first nonzero in each supernodal segment 
- * (supernodal entry location) is also returned. Each column has 
- * a separate list for this purpose. 
- * 
+ * The location of the first nonzero in each supernodal segment
+ * (supernodal entry location) is also returned. Each column has
+ * a separate list for this purpose.
+ *
  * Two markers arrays are used for dfs :
  *    marker[i] == jj, if i was visited during dfs of current column jj;
- *    marker1[i] >= jcol, if i was visited by earlier columns in this panel; 
- * 
+ *    marker1[i] >= jcol, if i was visited by earlier columns in this panel;
+ *
  * \param[in] m number of rows in the matrix
  * \param[in] w Panel size
  * \param[in] jcol Starting  column of the panel
@@ -204,7 +182,7 @@ void SparseLUImpl<Scalar,Index>::dfs_kernel(const Index jj, IndexVector& perm_r,
  * \param[in] perm_r Row permutation
  * \param[out] nseg Number of U segments
  * \param[out] dense Accumulate the column vectors of the panel
- * \param[out] panel_lsub Subscripts of the row in the panel 
+ * \param[out] panel_lsub Subscripts of the row in the panel
  * \param[out] segrep Segment representative i.e first nonzero row of each segment
  * \param[out] repfnz First nonzero location in each row
  * \param[out] xprune The pruned elimination tree
@@ -212,47 +190,46 @@ void SparseLUImpl<Scalar,Index>::dfs_kernel(const Index jj, IndexVector& perm_r,
  * \param  parent The elimination tree
  * \param xplore work vector
  * \param glu The global data structure
- * 
+ *
  */
 
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::panel_dfs(const Index m, const Index w, const Index jcol, MatrixType& A, IndexVector& perm_r, Index& nseg, ScalarVector& dense, IndexVector& panel_lsub, IndexVector& segrep, IndexVector& repfnz, IndexVector& xprune, IndexVector& marker, IndexVector& parent, IndexVector& xplore, GlobalLU_t& glu)
-{
-  Index nextl_col; // Next available position in panel_lsub[*,jj] 
-  
-  // Initialize pointers 
-  VectorBlock<IndexVector> marker1(marker, m, m); 
-  nseg = 0; 
-  
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar, StorageIndex>::panel_dfs(const Index m, const Index w, const Index jcol, MatrixType& A,
+                                                   IndexVector& perm_r, Index& nseg, ScalarVector& dense,
+                                                   IndexVector& panel_lsub, IndexVector& segrep, IndexVector& repfnz,
+                                                   IndexVector& xprune, IndexVector& marker, IndexVector& parent,
+                                                   IndexVector& xplore, GlobalLU_t& glu) {
+  Index nextl_col;  // Next available position in panel_lsub[*,jj]
+
+  // Initialize pointers
+  VectorBlock<IndexVector> marker1(marker, m, m);
+  nseg = 0;
+
   panel_dfs_traits<IndexVector> traits(jcol, marker1.data());
-  
-  // For each column in the panel 
-  for (Index jj = jcol; jj < jcol + w; jj++) 
-  {
-    nextl_col = (jj - jcol) * m; 
-    
-    VectorBlock<IndexVector> repfnz_col(repfnz, nextl_col, m); // First nonzero location in each row
-    VectorBlock<ScalarVector> dense_col(dense,nextl_col, m); // Accumulate a column vector here
-    
-    
+
+  // For each column in the panel
+  for (StorageIndex jj = StorageIndex(jcol); jj < jcol + w; jj++) {
+    nextl_col = (jj - jcol) * m;
+
+    VectorBlock<IndexVector> repfnz_col(repfnz, nextl_col, m);  // First nonzero location in each row
+    VectorBlock<ScalarVector> dense_col(dense, nextl_col, m);   // Accumulate a column vector here
+
     // For each nnz in A[*, jj] do depth first search
-    for (typename MatrixType::InnerIterator it(A, jj); it; ++it)
-    {
-      Index krow = it.row(); 
+    for (typename MatrixType::InnerIterator it(A, jj); it; ++it) {
+      Index krow = it.row();
       dense_col(krow) = it.value();
-      
-      Index kmark = marker(krow); 
-      if (kmark == jj) 
-        continue; // krow visited before, go to the next nonzero
-      
-      dfs_kernel(jj, perm_r, nseg, panel_lsub, segrep, repfnz_col, xprune, marker, parent,
-                   xplore, glu, nextl_col, krow, traits);
-    }// end for nonzeros in column jj
-    
-  } // end for column jj
+
+      StorageIndex kmark = marker(krow);
+      if (kmark == jj) continue;  // krow visited before, go to the next nonzero
+
+      dfs_kernel(jj, perm_r, nseg, panel_lsub, segrep, repfnz_col, xprune, marker, parent, xplore, glu, nextl_col, krow,
+                 traits);
+    }  // end for nonzeros in column jj
+
+  }  // end for column jj
 }
 
-} // end namespace internal
-} // end namespace Eigen
+}  // end namespace internal
+}  // end namespace Eigen
 
-#endif // SPARSELU_PANEL_DFS_H
+#endif  // SPARSELU_PANEL_DFS_H
diff --git a/inst/include/Eigen/src/SparseLU/SparseLU_pivotL.h b/inst/include/Eigen/src/SparseLU/SparseLU_pivotL.h
index 2e49ef66..10a090b5 100644
--- a/inst/include/Eigen/src/SparseLU/SparseLU_pivotL.h
+++ b/inst/include/Eigen/src/SparseLU/SparseLU_pivotL.h
@@ -7,10 +7,10 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-/* 
- 
- * NOTE: This file is the modified version of xpivotL.c file in SuperLU 
- 
+/*
+
+ * NOTE: This file is the modified version of xpivotL.c file in SuperLU
+
  * -- SuperLU routine (version 3.0) --
  * Univ. of California Berkeley, Xerox Palo Alto Research Center,
  * and Lawrence Berkeley National Lab.
@@ -30,12 +30,15 @@
 #ifndef SPARSELU_PIVOTL_H
 #define SPARSELU_PIVOTL_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 namespace internal {
-  
+
 /**
- * \brief Performs the numerical pivotin on the current column of L, and the CDIV operation.
- * 
+ * \brief Performs the numerical pivoting on the current column of L, and the CDIV operation.
+ *
  * Pivot policy :
  * (1) Compute thresh = u * max_(i>=j) abs(A_ij);
  * (2) IF user specifies pivot row k and abs(A_kj) >= thresh THEN
@@ -44,94 +47,90 @@ namespace internal {
  *           pivot row = j;
  *       ELSE
  *           pivot row = m;
- * 
+ *
  *   Note: If you absolutely want to use a given pivot order, then set u=0.0.
- * 
+ *
  * \param jcol The current column of L
  * \param diagpivotthresh diagonal pivoting threshold
  * \param[in,out] perm_r Row permutation (threshold pivoting)
  * \param[in] iperm_c column permutation - used to finf diagonal of Pc*A*Pc'
  * \param[out] pivrow  The pivot row
  * \param glu Global LU data
- * \return 0 if success, i > 0 if U(i,i) is exactly zero 
- * 
+ * \return 0 if success, i > 0 if U(i,i) is exactly zero
+ *
  */
-template <typename Scalar, typename Index>
-Index SparseLUImpl<Scalar,Index>::pivotL(const Index jcol, const RealScalar& diagpivotthresh, IndexVector& perm_r, IndexVector& iperm_c, Index& pivrow, GlobalLU_t& glu)
-{
-  
-  Index fsupc = (glu.xsup)((glu.supno)(jcol)); // First column in the supernode containing the column jcol
-  Index nsupc = jcol - fsupc; // Number of columns in the supernode portion, excluding jcol; nsupc >=0
-  Index lptr = glu.xlsub(fsupc); // pointer to the starting location of the row subscripts for this supernode portion
-  Index nsupr = glu.xlsub(fsupc+1) - lptr; // Number of rows in the supernode
-  Index lda = glu.xlusup(fsupc+1) - glu.xlusup(fsupc); // leading dimension
-  Scalar* lu_sup_ptr = &(glu.lusup.data()[glu.xlusup(fsupc)]); // Start of the current supernode
-  Scalar* lu_col_ptr = &(glu.lusup.data()[glu.xlusup(jcol)]); // Start of jcol in the supernode
-  Index* lsub_ptr = &(glu.lsub.data()[lptr]); // Start of row indices of the supernode
-  
-  // Determine the largest abs numerical value for partial pivoting 
-  Index diagind = iperm_c(jcol); // diagonal index 
+template <typename Scalar, typename StorageIndex>
+Index SparseLUImpl<Scalar, StorageIndex>::pivotL(const Index jcol, const RealScalar& diagpivotthresh,
+                                                 IndexVector& perm_r, IndexVector& iperm_c, Index& pivrow,
+                                                 GlobalLU_t& glu) {
+  Index fsupc = (glu.xsup)((glu.supno)(jcol));  // First column in the supernode containing the column jcol
+  Index nsupc = jcol - fsupc;                   // Number of columns in the supernode portion, excluding jcol; nsupc >=0
+  Index lptr = glu.xlsub(fsupc);  // pointer to the starting location of the row subscripts for this supernode portion
+  Index nsupr = glu.xlsub(fsupc + 1) - lptr;                    // Number of rows in the supernode
+  Index lda = glu.xlusup(fsupc + 1) - glu.xlusup(fsupc);        // leading dimension
+  Scalar* lu_sup_ptr = &(glu.lusup.data()[glu.xlusup(fsupc)]);  // Start of the current supernode
+  Scalar* lu_col_ptr = &(glu.lusup.data()[glu.xlusup(jcol)]);   // Start of jcol in the supernode
+  StorageIndex* lsub_ptr = &(glu.lsub.data()[lptr]);            // Start of row indices of the supernode
+
+  // Determine the largest abs numerical value for partial pivoting
+  Index diagind = iperm_c(jcol);  // diagonal index
   RealScalar pivmax(-1.0);
-  Index pivptr = nsupc; 
-  Index diag = emptyIdxLU; 
+  Index pivptr = nsupc;
+  Index diag = emptyIdxLU;
   RealScalar rtemp;
-  Index isub, icol, itemp, k; 
+  Index isub, icol, itemp, k;
   for (isub = nsupc; isub < nsupr; ++isub) {
     using std::abs;
     rtemp = abs(lu_col_ptr[isub]);
     if (rtemp > pivmax) {
-      pivmax = rtemp; 
+      pivmax = rtemp;
       pivptr = isub;
-    } 
+    }
     if (lsub_ptr[isub] == diagind) diag = isub;
   }
-  
+
   // Test for singularity
-  if ( pivmax <= RealScalar(0.0) ) {
+  if (pivmax <= RealScalar(0.0)) {
     // if pivmax == -1, the column is structurally empty, otherwise it is only numerically zero
     pivrow = pivmax < RealScalar(0.0) ? diagind : lsub_ptr[pivptr];
-    perm_r(pivrow) = jcol;
-    return (jcol+1);
+    perm_r(pivrow) = StorageIndex(jcol);
+    return (jcol + 1);
   }
-  
-  RealScalar thresh = diagpivotthresh * pivmax; 
-  
-  // Choose appropriate pivotal element 
-  
+
+  RealScalar thresh = diagpivotthresh * pivmax;
+
+  // Choose appropriate pivotal element
+
   {
     // Test if the diagonal element can be used as a pivot (given the threshold value)
-    if (diag >= 0 ) 
-    {
+    if (diag >= 0) {
       // Diagonal element exists
       using std::abs;
       rtemp = abs(lu_col_ptr[diag]);
-      if (rtemp != 0.0 && rtemp >= thresh) pivptr = diag;
+      if (rtemp != RealScalar(0.0) && rtemp >= thresh) pivptr = diag;
     }
     pivrow = lsub_ptr[pivptr];
   }
-  
+
   // Record pivot row
-  perm_r(pivrow) = jcol; 
+  perm_r(pivrow) = StorageIndex(jcol);
   // Interchange row subscripts
-  if (pivptr != nsupc )
-  {
-    std::swap( lsub_ptr[pivptr], lsub_ptr[nsupc] );
+  if (pivptr != nsupc) {
+    std::swap(lsub_ptr[pivptr], lsub_ptr[nsupc]);
     // Interchange numerical values as well, for the two rows in the whole snode
     // such that L is indexed the same way as A
-    for (icol = 0; icol <= nsupc; icol++)
-    {
-      itemp = pivptr + icol * lda; 
+    for (icol = 0; icol <= nsupc; icol++) {
+      itemp = pivptr + icol * lda;
       std::swap(lu_sup_ptr[itemp], lu_sup_ptr[nsupc + icol * lda]);
     }
   }
   // cdiv operations
   Scalar temp = Scalar(1.0) / lu_col_ptr[nsupc];
-  for (k = nsupc+1; k < nsupr; k++)
-    lu_col_ptr[k] *= temp; 
+  for (k = nsupc + 1; k < nsupr; k++) lu_col_ptr[k] *= temp;
   return 0;
 }
 
-} // end namespace internal
-} // end namespace Eigen
+}  // end namespace internal
+}  // end namespace Eigen
 
-#endif // SPARSELU_PIVOTL_H
+#endif  // SPARSELU_PIVOTL_H
diff --git a/inst/include/Eigen/src/SparseLU/SparseLU_pruneL.h b/inst/include/Eigen/src/SparseLU/SparseLU_pruneL.h
index 66460d16..620f2850 100644
--- a/inst/include/Eigen/src/SparseLU/SparseLU_pruneL.h
+++ b/inst/include/Eigen/src/SparseLU/SparseLU_pruneL.h
@@ -7,10 +7,10 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-/* 
- 
- * NOTE: This file is the modified version of [s,d,c,z]pruneL.c file in SuperLU 
- 
+/*
+
+ * NOTE: This file is the modified version of [s,d,c,z]pruneL.c file in SuperLU
+
  * -- SuperLU routine (version 2.0) --
  * Univ. of California Berkeley, Xerox Palo Alto Research Center,
  * and Lawrence Berkeley National Lab.
@@ -30,6 +30,9 @@
 #ifndef SPARSELU_PRUNEL_H
 #define SPARSELU_PRUNEL_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 namespace internal {
 
@@ -37,99 +40,91 @@ namespace internal {
  * \brief Prunes the L-structure.
  *
  * It prunes the L-structure  of supernodes whose L-structure contains the current pivot row "pivrow"
- * 
- * 
+ *
+ *
  * \param jcol The current column of L
  * \param[in] perm_r Row permutation
  * \param[out] pivrow  The pivot row
  * \param nseg Number of segments
- * \param segrep 
+ * \param segrep
  * \param repfnz
- * \param[out] xprune 
+ * \param[out] xprune
  * \param glu Global LU data
- * 
+ *
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::pruneL(const Index jcol, const IndexVector& perm_r, const Index pivrow, const Index nseg, const IndexVector& segrep, BlockIndexVector repfnz, IndexVector& xprune, GlobalLU_t& glu)
-{
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar, StorageIndex>::pruneL(const Index jcol, const IndexVector& perm_r, const Index pivrow,
+                                                const Index nseg, const IndexVector& segrep, BlockIndexVector repfnz,
+                                                IndexVector& xprune, GlobalLU_t& glu) {
   // For each supernode-rep irep in U(*,j]
-  Index jsupno = glu.supno(jcol); 
-  Index i,irep,irep1; 
-  bool movnum, do_prune = false; 
-  Index kmin = 0, kmax = 0, minloc, maxloc,krow; 
-  for (i = 0; i < nseg; i++)
-  {
-    irep = segrep(i); 
-    irep1 = irep + 1; 
-    do_prune = false; 
-    
-    // Don't prune with a zero U-segment 
-    if (repfnz(irep) == emptyIdxLU) continue; 
-    
+  Index jsupno = glu.supno(jcol);
+  Index i, irep, irep1;
+  bool movnum, do_prune = false;
+  Index kmin = 0, kmax = 0, minloc, maxloc, krow;
+  for (i = 0; i < nseg; i++) {
+    irep = segrep(i);
+    irep1 = irep + 1;
+    do_prune = false;
+
+    // Don't prune with a zero U-segment
+    if (repfnz(irep) == emptyIdxLU) continue;
+
     // If a snode overlaps with the next panel, then the U-segment
-    // is fragmented into two parts -- irep and irep1. We should let 
-    // pruning occur at the rep-column in irep1s snode. 
-    if (glu.supno(irep) == glu.supno(irep1) ) continue; // don't prune 
-    
+    // is fragmented into two parts -- irep and irep1. We should let
+    // pruning occur at the rep-column in irep1s snode.
+    if (glu.supno(irep) == glu.supno(irep1)) continue;  // don't prune
+
     // If it has not been pruned & it has a nonz in row L(pivrow,i)
-    if (glu.supno(irep) != jsupno )
-    {
-      if ( xprune (irep) >= glu.xlsub(irep1) )
-      {
+    if (glu.supno(irep) != jsupno) {
+      if (xprune(irep) >= glu.xlsub(irep1)) {
         kmin = glu.xlsub(irep);
-        kmax = glu.xlsub(irep1) - 1; 
-        for (krow = kmin; krow <= kmax; krow++)
-        {
-          if (glu.lsub(krow) == pivrow) 
-          {
-            do_prune = true; 
-            break; 
+        kmax = glu.xlsub(irep1) - 1;
+        for (krow = kmin; krow <= kmax; krow++) {
+          if (glu.lsub(krow) == pivrow) {
+            do_prune = true;
+            break;
           }
         }
       }
-      
-      if (do_prune) 
-      {
+
+      if (do_prune) {
         // do a quicksort-type partition
         // movnum=true means that the num values have to be exchanged
-        movnum = false; 
-        if (irep == glu.xsup(glu.supno(irep)) ) // Snode of size 1 
-          movnum = true; 
-        
-        while (kmin <= kmax)
-        {
+        movnum = false;
+        if (irep == glu.xsup(glu.supno(irep)))  // Snode of size 1
+          movnum = true;
+
+        while (kmin <= kmax) {
           if (perm_r(glu.lsub(kmax)) == emptyIdxLU)
-            kmax--; 
-          else if ( perm_r(glu.lsub(kmin)) != emptyIdxLU)
+            kmax--;
+          else if (perm_r(glu.lsub(kmin)) != emptyIdxLU)
             kmin++;
-          else 
-          {
+          else {
             // kmin below pivrow (not yet pivoted), and kmax
-            // above pivrow: interchange the two suscripts
-            std::swap(glu.lsub(kmin), glu.lsub(kmax)); 
-            
-            // If the supernode has only one column, then we 
+            // above pivrow: interchange the two subscripts
+            std::swap(glu.lsub(kmin), glu.lsub(kmax));
+
+            // If the supernode has only one column, then we
             // only keep one set of subscripts. For any subscript
-            // intercnahge performed, similar interchange must be 
-            // done on the numerical values. 
-            if (movnum) 
-            {
-              minloc = glu.xlusup(irep) + ( kmin - glu.xlsub(irep) ); 
-              maxloc = glu.xlusup(irep) + ( kmax - glu.xlsub(irep) ); 
-              std::swap(glu.lusup(minloc), glu.lusup(maxloc)); 
+            // intercnahge performed, similar interchange must be
+            // done on the numerical values.
+            if (movnum) {
+              minloc = glu.xlusup(irep) + (kmin - glu.xlsub(irep));
+              maxloc = glu.xlusup(irep) + (kmax - glu.xlsub(irep));
+              std::swap(glu.lusup(minloc), glu.lusup(maxloc));
             }
             kmin++;
             kmax--;
           }
-        } // end while 
-        
-        xprune(irep) = kmin;  //Pruning 
-      } // end if do_prune 
-    } // end pruning 
-  } // End for each U-segment
+        }  // end while
+
+        xprune(irep) = StorageIndex(kmin);  // Pruning
+      }                                     // end if do_prune
+    }                                       // end pruning
+  }                                         // End for each U-segment
 }
 
-} // end namespace internal
-} // end namespace Eigen
+}  // end namespace internal
+}  // end namespace Eigen
 
-#endif // SPARSELU_PRUNEL_H
+#endif  // SPARSELU_PRUNEL_H
diff --git a/inst/include/Eigen/src/SparseLU/SparseLU_relax_snode.h b/inst/include/Eigen/src/SparseLU/SparseLU_relax_snode.h
index 58ec32e2..df3869eb 100644
--- a/inst/include/Eigen/src/SparseLU/SparseLU_relax_snode.h
+++ b/inst/include/Eigen/src/SparseLU/SparseLU_relax_snode.h
@@ -28,56 +28,54 @@
 #ifndef SPARSELU_RELAX_SNODE_H
 #define SPARSELU_RELAX_SNODE_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
- 
-/** 
+
+/**
  * \brief Identify the initial relaxed supernodes
- * 
- * This routine is applied to a column elimination tree. 
+ *
+ * This routine is applied to a column elimination tree.
  * It assumes that the matrix has been reordered according to the postorder of the etree
  * \param n  the number of columns
- * \param et elimination tree 
- * \param relax_columns Maximum number of columns allowed in a relaxed snode 
+ * \param et elimination tree
+ * \param relax_columns Maximum number of columns allowed in a relaxed snode
  * \param descendants Number of descendants of each node in the etree
  * \param relax_end last column in a supernode
  */
-template <typename Scalar, typename Index>
-void SparseLUImpl<Scalar,Index>::relax_snode (const Index n, IndexVector& et, const Index relax_columns, IndexVector& descendants, IndexVector& relax_end)
-{
-  
+template <typename Scalar, typename StorageIndex>
+void SparseLUImpl<Scalar, StorageIndex>::relax_snode(const Index n, IndexVector& et, const Index relax_columns,
+                                                     IndexVector& descendants, IndexVector& relax_end) {
   // compute the number of descendants of each node in the etree
-  Index j, parent; 
+  Index parent;
   relax_end.setConstant(emptyIdxLU);
   descendants.setZero();
-  for (j = 0; j < n; j++) 
-  {
+  for (Index j = 0; j < n; j++) {
     parent = et(j);
-    if (parent != n) // not the dummy root
+    if (parent != n)  // not the dummy root
       descendants(parent) += descendants(j) + 1;
   }
   // Identify the relaxed supernodes by postorder traversal of the etree
-  Index snode_start; // beginning of a snode 
-  for (j = 0; j < n; )
-  {
+  Index snode_start;  // beginning of a snode
+  for (Index j = 0; j < n;) {
     parent = et(j);
-    snode_start = j; 
-    while ( parent != n && descendants(parent) < relax_columns ) 
-    {
-      j = parent; 
+    snode_start = j;
+    while (parent != n && descendants(parent) < relax_columns) {
+      j = parent;
       parent = et(j);
     }
-    // Found a supernode in postordered etree, j is the last column 
-    relax_end(snode_start) = j; // Record last column
+    // Found a supernode in postordered etree, j is the last column
+    relax_end(snode_start) = StorageIndex(j);  // Record last column
     j++;
     // Search for a new leaf
     while (descendants(j) != 0 && j < n) j++;
-  } // End postorder traversal of the etree
-  
+  }  // End postorder traversal of the etree
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 #endif
diff --git a/inst/include/Eigen/src/SparseQR/InternalHeaderCheck.h b/inst/include/Eigen/src/SparseQR/InternalHeaderCheck.h
new file mode 100644
index 00000000..0564e932
--- /dev/null
+++ b/inst/include/Eigen/src/SparseQR/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_SPARSEQR_MODULE_H
+#error "Please include Eigen/SparseQR instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/SparseQR/SparseQR.h b/inst/include/Eigen/src/SparseQR/SparseQR.h
index a00bd5db..4dc7aa9f 100644
--- a/inst/include/Eigen/src/SparseQR/SparseQR.h
+++ b/inst/include/Eigen/src/SparseQR/SparseQR.h
@@ -11,432 +11,459 @@
 #ifndef EIGEN_SPARSE_QR_H
 #define EIGEN_SPARSE_QR_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
-template<typename MatrixType, typename OrderingType> class SparseQR;
-template<typename SparseQRType> struct SparseQRMatrixQReturnType;
-template<typename SparseQRType> struct SparseQRMatrixQTransposeReturnType;
-template<typename SparseQRType, typename Derived> struct SparseQR_QProduct;
+template <typename MatrixType, typename OrderingType>
+class SparseQR;
+template <typename SparseQRType>
+struct SparseQRMatrixQReturnType;
+template <typename SparseQRType>
+struct SparseQRMatrixQTransposeReturnType;
+template <typename SparseQRType, typename Derived>
+struct SparseQR_QProduct;
 namespace internal {
-  template <typename SparseQRType> struct traits<SparseQRMatrixQReturnType<SparseQRType> >
-  {
-    typedef typename SparseQRType::MatrixType ReturnType;
-    typedef typename ReturnType::Index Index;
-    typedef typename ReturnType::StorageKind StorageKind;
-  };
-  template <typename SparseQRType> struct traits<SparseQRMatrixQTransposeReturnType<SparseQRType> >
-  {
-    typedef typename SparseQRType::MatrixType ReturnType;
-  };
-  template <typename SparseQRType, typename Derived> struct traits<SparseQR_QProduct<SparseQRType, Derived> >
-  {
-    typedef typename Derived::PlainObject ReturnType;
-  };
-} // End namespace internal
+template <typename SparseQRType>
+struct traits<SparseQRMatrixQReturnType<SparseQRType> > {
+  typedef typename SparseQRType::MatrixType ReturnType;
+  typedef typename ReturnType::StorageIndex StorageIndex;
+  typedef typename ReturnType::StorageKind StorageKind;
+  enum { RowsAtCompileTime = Dynamic, ColsAtCompileTime = Dynamic };
+};
+template <typename SparseQRType>
+struct traits<SparseQRMatrixQTransposeReturnType<SparseQRType> > {
+  typedef typename SparseQRType::MatrixType ReturnType;
+};
+template <typename SparseQRType, typename Derived>
+struct traits<SparseQR_QProduct<SparseQRType, Derived> > {
+  typedef typename Derived::PlainObject ReturnType;
+};
+}  // End namespace internal
 
 /**
-  * \ingroup SparseQR_Module
-  * \class SparseQR
-  * \brief Sparse left-looking rank-revealing QR factorization
-  * 
-  * This class implements a left-looking rank-revealing QR decomposition 
-  * of sparse matrices. When a column has a norm less than a given tolerance
-  * it is implicitly permuted to the end. The QR factorization thus obtained is 
-  * given by A*P = Q*R where R is upper triangular or trapezoidal. 
-  * 
-  * P is the column permutation which is the product of the fill-reducing and the
-  * rank-revealing permutations. Use colsPermutation() to get it.
-  * 
-  * Q is the orthogonal matrix represented as products of Householder reflectors. 
-  * Use matrixQ() to get an expression and matrixQ().transpose() to get the transpose.
-  * You can then apply it to a vector.
-  * 
-  * R is the sparse triangular or trapezoidal matrix. The later occurs when A is rank-deficient.
-  * matrixR().topLeftCorner(rank(), rank()) always returns a triangular factor of full rank.
-  * 
-  * \tparam _MatrixType The type of the sparse matrix A, must be a column-major SparseMatrix<>
-  * \tparam _OrderingType The fill-reducing ordering method. See the \link OrderingMethods_Module 
-  *  OrderingMethods \endlink module for the list of built-in and external ordering methods.
-  * 
-  * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()).
-  * 
-  */
-template<typename _MatrixType, typename _OrderingType>
-class SparseQR
-{
-  public:
-    typedef _MatrixType MatrixType;
-    typedef _OrderingType OrderingType;
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef SparseMatrix<Scalar,ColMajor,Index> QRMatrixType;
-    typedef Matrix<Index, Dynamic, 1> IndexVector;
-    typedef Matrix<Scalar, Dynamic, 1> ScalarVector;
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
-  public:
-    SparseQR () : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false),m_isEtreeOk(false)
-    { }
-    
-    /** Construct a QR factorization of the matrix \a mat.
-      * 
-      * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
-      * 
-      * \sa compute()
-      */
-    SparseQR(const MatrixType& mat) : m_isInitialized(false), m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true),m_isQSorted(false),m_isEtreeOk(false)
-    {
-      compute(mat);
-    }
-    
-    /** Computes the QR factorization of the sparse matrix \a mat.
-      * 
-      * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
-      * 
-      * \sa analyzePattern(), factorize()
-      */
-    void compute(const MatrixType& mat)
-    {
-      analyzePattern(mat);
-      factorize(mat);
-    }
-    void analyzePattern(const MatrixType& mat);
-    void factorize(const MatrixType& mat);
-    
-    /** \returns the number of rows of the represented matrix. 
-      */
-    inline Index rows() const { return m_pmat.rows(); }
-    
-    /** \returns the number of columns of the represented matrix. 
-      */
-    inline Index cols() const { return m_pmat.cols();}
-    
-    /** \returns a const reference to the \b sparse upper triangular matrix R of the QR factorization.
-      */
-    const QRMatrixType& matrixR() const { return m_R; }
-    
-    /** \returns the number of non linearly dependent columns as determined by the pivoting threshold.
-      *
-      * \sa setPivotThreshold()
-      */
-    Index rank() const 
-    {
-      eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
-      return m_nonzeropivots; 
-    }
-    
-    /** \returns an expression of the matrix Q as products of sparse Householder reflectors.
-    * The common usage of this function is to apply it to a dense matrix or vector
-    * \code
-    * VectorXd B1, B2;
-    * // Initialize B1
-    * B2 = matrixQ() * B1;
-    * \endcode
-    *
-    * To get a plain SparseMatrix representation of Q:
-    * \code
-    * SparseMatrix<double> Q;
-    * Q = SparseQR<SparseMatrix<double> >(A).matrixQ();
-    * \endcode
-    * Internally, this call simply performs a sparse product between the matrix Q
-    * and a sparse identity matrix. However, due to the fact that the sparse
-    * reflectors are stored unsorted, two transpositions are needed to sort
-    * them before performing the product.
-    */
-    SparseQRMatrixQReturnType<SparseQR> matrixQ() const 
-    { return SparseQRMatrixQReturnType<SparseQR>(*this); }
-    
-    /** \returns a const reference to the column permutation P that was applied to A such that A*P = Q*R
-      * It is the combination of the fill-in reducing permutation and numerical column pivoting.
-      */
-    const PermutationType& colsPermutation() const
-    { 
-      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
-      return m_outputPerm_c;
-    }
-    
-    /** \returns A string describing the type of error.
-      * This method is provided to ease debugging, not to handle errors.
-      */
-    std::string lastErrorMessage() const { return m_lastError; }
-    
-    /** \internal */
-    template<typename Rhs, typename Dest>
-    bool _solve(const MatrixBase<Rhs> &B, MatrixBase<Dest> &dest) const
-    {
-      eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
-      eigen_assert(this->rows() == B.rows() && "SparseQR::solve() : invalid number of rows in the right hand side matrix");
-
-      Index rank = this->rank();
-      
-      // Compute Q^T * b;
-      typename Dest::PlainObject y, b;
-      y = this->matrixQ().transpose() * B; 
-      b = y;
-      
-      // Solve with the triangular matrix R
-      y.resize((std::max)(cols(),Index(y.rows())),y.cols());
-      y.topRows(rank) = this->matrixR().topLeftCorner(rank, rank).template triangularView<Upper>().solve(b.topRows(rank));
-      y.bottomRows(y.rows()-rank).setZero();
-
-      // Apply the column permutation
-      if (m_perm_c.size())  dest = colsPermutation() * y.topRows(cols());
-      else                  dest = y.topRows(cols());
-      
-      m_info = Success;
-      return true;
-    }
-    
-
-    /** Sets the threshold that is used to determine linearly dependent columns during the factorization.
-      *
-      * In practice, if during the factorization the norm of the column that has to be eliminated is below
-      * this threshold, then the entire column is treated as zero, and it is moved at the end.
-      */
-    void setPivotThreshold(const RealScalar& threshold)
-    {
-      m_useDefaultThreshold = false;
-      m_threshold = threshold;
-    }
-    
-    /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<SparseQR, Rhs> solve(const MatrixBase<Rhs>& B) const 
-    {
-      eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
-      eigen_assert(this->rows() == B.rows() && "SparseQR::solve() : invalid number of rows in the right hand side matrix");
-      return internal::solve_retval<SparseQR, Rhs>(*this, B.derived());
-    }
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<SparseQR, Rhs> solve(const SparseMatrixBase<Rhs>& B) const
-    {
-          eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
-          eigen_assert(this->rows() == B.rows() && "SparseQR::solve() : invalid number of rows in the right hand side matrix");
-          return internal::sparse_solve_retval<SparseQR, Rhs>(*this, B.derived());
-    }
-    
-    /** \brief Reports whether previous computation was successful.
-      *
-      * \returns \c Success if computation was successful,
-      *          \c NumericalIssue if the QR factorization reports a numerical problem
-      *          \c InvalidInput if the input matrix is invalid
-      *
-      * \sa iparm()          
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
-      return m_info;
-    }
+ * \ingroup SparseQR_Module
+ * \class SparseQR
+ * \brief Sparse left-looking QR factorization with numerical column pivoting
+ *
+ * This class implements a left-looking QR decomposition of sparse matrices
+ * with numerical column pivoting.
+ * When a column has a norm less than a given tolerance
+ * it is implicitly permuted to the end. The QR factorization thus obtained is
+ * given by A*P = Q*R where R is upper triangular or trapezoidal.
+ *
+ * P is the column permutation which is the product of the fill-reducing and the
+ * numerical permutations. Use colsPermutation() to get it.
+ *
+ * Q is the orthogonal matrix represented as products of Householder reflectors.
+ * Use matrixQ() to get an expression and matrixQ().adjoint() to get the adjoint.
+ * You can then apply it to a vector.
+ *
+ * R is the sparse triangular or trapezoidal matrix. The later occurs when A is rank-deficient.
+ * matrixR().topLeftCorner(rank(), rank()) always returns a triangular factor of full rank.
+ *
+ * \tparam MatrixType_ The type of the sparse matrix A, must be a column-major SparseMatrix<>
+ * \tparam OrderingType_ The fill-reducing ordering method. See the \link OrderingMethods_Module
+ *  OrderingMethods \endlink module for the list of built-in and external ordering methods.
+ *
+ * \implsparsesolverconcept
+ *
+ * The numerical pivoting strategy and default threshold are the same as in SuiteSparse QR, and
+ * detailed in the following paper:
+ * <i>
+ * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing
+ * Sparse QR Factorization", ACM Trans. on Math. Soft. 38(1), 2011.
+ * </i>
+ * Even though it is qualified as "rank-revealing", this strategy might fail for some
+ * rank deficient problems. When this class is used to solve linear or least-square problems
+ * it is thus strongly recommended to check the accuracy of the computed solution. If it
+ * failed, it usually helps to increase the threshold with setPivotThreshold.
+ *
+ * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()).
+ * \warning For complex matrices matrixQ().transpose() will actually return the adjoint matrix.
+ *
+ */
+template <typename MatrixType_, typename OrderingType_>
+class SparseQR : public SparseSolverBase<SparseQR<MatrixType_, OrderingType_> > {
+ protected:
+  typedef SparseSolverBase<SparseQR<MatrixType_, OrderingType_> > Base;
+  using Base::m_isInitialized;
 
-  protected:
-    inline void sort_matrix_Q()
-    {
-      if(this->m_isQSorted) return;
-      // The matrix Q is sorted during the transposition
-      SparseMatrix<Scalar, RowMajor, Index> mQrm(this->m_Q);
-      this->m_Q = mQrm;
-      this->m_isQSorted = true;
-    }
+ public:
+  using Base::_solve_impl;
+  typedef MatrixType_ MatrixType;
+  typedef OrderingType_ OrderingType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex> QRMatrixType;
+  typedef Matrix<StorageIndex, Dynamic, 1> IndexVector;
+  typedef Matrix<Scalar, Dynamic, 1> ScalarVector;
+  typedef PermutationMatrix<Dynamic, Dynamic, StorageIndex> PermutationType;
+
+  enum { ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime };
+
+ public:
+  SparseQR()
+      : m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true), m_isQSorted(false), m_isEtreeOk(false) {}
+
+  /** Construct a QR factorization of the matrix \a mat.
+   *
+   * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
+   *
+   * \sa compute()
+   */
+  explicit SparseQR(const MatrixType& mat)
+      : m_analysisIsok(false), m_lastError(""), m_useDefaultThreshold(true), m_isQSorted(false), m_isEtreeOk(false) {
+    compute(mat);
+  }
+
+  /** Computes the QR factorization of the sparse matrix \a mat.
+   *
+   * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
+   *
+   * \sa analyzePattern(), factorize()
+   */
+  void compute(const MatrixType& mat) {
+    analyzePattern(mat);
+    factorize(mat);
+  }
+  void analyzePattern(const MatrixType& mat);
+  void factorize(const MatrixType& mat);
+
+  /** \returns the number of rows of the represented matrix.
+   */
+  inline Index rows() const { return m_pmat.rows(); }
+
+  /** \returns the number of columns of the represented matrix.
+   */
+  inline Index cols() const { return m_pmat.cols(); }
 
-    
-  protected:
-    bool m_isInitialized;
-    bool m_analysisIsok;
-    bool m_factorizationIsok;
-    mutable ComputationInfo m_info;
-    std::string m_lastError;
-    QRMatrixType m_pmat;            // Temporary matrix
-    QRMatrixType m_R;               // The triangular factor matrix
-    QRMatrixType m_Q;               // The orthogonal reflectors
-    ScalarVector m_hcoeffs;         // The Householder coefficients
-    PermutationType m_perm_c;       // Fill-reducing  Column  permutation
-    PermutationType m_pivotperm;    // The permutation for rank revealing
-    PermutationType m_outputPerm_c; // The final column permutation
-    RealScalar m_threshold;         // Threshold to determine null Householder reflections
-    bool m_useDefaultThreshold;     // Use default threshold
-    Index m_nonzeropivots;          // Number of non zero pivots found 
-    IndexVector m_etree;            // Column elimination tree
-    IndexVector m_firstRowElt;      // First element in each row
-    bool m_isQSorted;               // whether Q is sorted or not
-    bool m_isEtreeOk;               // whether the elimination tree match the initial input matrix
-    
-    template <typename, typename > friend struct SparseQR_QProduct;
-    template <typename > friend struct SparseQRMatrixQReturnType;
-    
+  /** \returns a const reference to the \b sparse upper triangular matrix R of the QR factorization.
+   * \warning The entries of the returned matrix are not sorted. This means that using it in algorithms
+   *          expecting sorted entries will fail. This include random coefficient accesses (SpaseMatrix::coeff()),
+   *          and coefficient-wise operations. Matrix products and triangular solves are fine though.
+   *
+   * To sort the entries, you can assign it to a row-major matrix, and if a column-major matrix
+   * is required, you can copy it again:
+   * \code
+   * SparseMatrix<double>          R  = qr.matrixR();  // column-major, not sorted!
+   * SparseMatrix<double,RowMajor> Rr = qr.matrixR();  // row-major, sorted
+   * SparseMatrix<double>          Rc = Rr;            // column-major, sorted
+   * \endcode
+   */
+  const QRMatrixType& matrixR() const { return m_R; }
+
+  /** \returns the number of non linearly dependent columns as determined by the pivoting threshold.
+   *
+   * \sa setPivotThreshold()
+   */
+  Index rank() const {
+    eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
+    return m_nonzeropivots;
+  }
+
+  /** \returns an expression of the matrix Q as products of sparse Householder reflectors.
+   * The common usage of this function is to apply it to a dense matrix or vector
+   * \code
+   * VectorXd B1, B2;
+   * // Initialize B1
+   * B2 = matrixQ() * B1;
+   * \endcode
+   *
+   * To get a plain SparseMatrix representation of Q:
+   * \code
+   * SparseMatrix<double> Q;
+   * Q = SparseQR<SparseMatrix<double> >(A).matrixQ();
+   * \endcode
+   * Internally, this call simply performs a sparse product between the matrix Q
+   * and a sparse identity matrix. However, due to the fact that the sparse
+   * reflectors are stored unsorted, two transpositions are needed to sort
+   * them before performing the product.
+   */
+  SparseQRMatrixQReturnType<SparseQR> matrixQ() const { return SparseQRMatrixQReturnType<SparseQR>(*this); }
+
+  /** \returns a const reference to the column permutation P that was applied to A such that A*P = Q*R
+   * It is the combination of the fill-in reducing permutation and numerical column pivoting.
+   */
+  const PermutationType& colsPermutation() const {
+    eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+    return m_outputPerm_c;
+  }
+
+  /** \returns A string describing the type of error.
+   * This method is provided to ease debugging, not to handle errors.
+   */
+  std::string lastErrorMessage() const { return m_lastError; }
+
+  /** \internal */
+  template <typename Rhs, typename Dest>
+  bool _solve_impl(const MatrixBase<Rhs>& B, MatrixBase<Dest>& dest) const {
+    eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
+    eigen_assert(this->rows() == B.rows() &&
+                 "SparseQR::solve() : invalid number of rows in the right hand side matrix");
+
+    Index rank = this->rank();
+
+    // Compute Q^* * b;
+    typename Dest::PlainObject y, b;
+    y = this->matrixQ().adjoint() * B;
+    b = y;
+
+    // Solve with the triangular matrix R
+    y.resize((std::max<Index>)(cols(), y.rows()), y.cols());
+    y.topRows(rank) = this->matrixR().topLeftCorner(rank, rank).template triangularView<Upper>().solve(b.topRows(rank));
+    y.bottomRows(y.rows() - rank).setZero();
+
+    // Apply the column permutation
+    if (m_perm_c.size())
+      dest = colsPermutation() * y.topRows(cols());
+    else
+      dest = y.topRows(cols());
+
+    m_info = Success;
+    return true;
+  }
+
+  /** Sets the threshold that is used to determine linearly dependent columns during the factorization.
+   *
+   * In practice, if during the factorization the norm of the column that has to be eliminated is below
+   * this threshold, then the entire column is treated as zero, and it is moved at the end.
+   */
+  void setPivotThreshold(const RealScalar& threshold) {
+    m_useDefaultThreshold = false;
+    m_threshold = threshold;
+  }
+
+  /** \returns the solution X of \f$ A X = B \f$ using the current decomposition of A.
+   *
+   * \sa compute()
+   */
+  template <typename Rhs>
+  inline const Solve<SparseQR, Rhs> solve(const MatrixBase<Rhs>& B) const {
+    eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
+    eigen_assert(this->rows() == B.rows() &&
+                 "SparseQR::solve() : invalid number of rows in the right hand side matrix");
+    return Solve<SparseQR, Rhs>(*this, B.derived());
+  }
+  template <typename Rhs>
+  inline const Solve<SparseQR, Rhs> solve(const SparseMatrixBase<Rhs>& B) const {
+    eigen_assert(m_isInitialized && "The factorization should be called first, use compute()");
+    eigen_assert(this->rows() == B.rows() &&
+                 "SparseQR::solve() : invalid number of rows in the right hand side matrix");
+    return Solve<SparseQR, Rhs>(*this, B.derived());
+  }
+
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful,
+   *          \c NumericalIssue if the QR factorization reports a numerical problem
+   *          \c InvalidInput if the input matrix is invalid
+   *
+   * \sa iparm()
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+    return m_info;
+  }
+
+  /** \internal */
+  inline void _sort_matrix_Q() {
+    if (this->m_isQSorted) return;
+    // The matrix Q is sorted during the transposition
+    SparseMatrix<Scalar, RowMajor, Index> mQrm(this->m_Q);
+    this->m_Q = mQrm;
+    this->m_isQSorted = true;
+  }
+
+ protected:
+  bool m_analysisIsok;
+  bool m_factorizationIsok;
+  mutable ComputationInfo m_info;
+  std::string m_lastError;
+  QRMatrixType m_pmat;             // Temporary matrix
+  QRMatrixType m_R;                // The triangular factor matrix
+  QRMatrixType m_Q;                // The orthogonal reflectors
+  ScalarVector m_hcoeffs;          // The Householder coefficients
+  PermutationType m_perm_c;        // Fill-reducing  Column  permutation
+  PermutationType m_pivotperm;     // The permutation for rank revealing
+  PermutationType m_outputPerm_c;  // The final column permutation
+  RealScalar m_threshold;          // Threshold to determine null Householder reflections
+  bool m_useDefaultThreshold;      // Use default threshold
+  Index m_nonzeropivots;           // Number of non zero pivots found
+  IndexVector m_etree;             // Column elimination tree
+  IndexVector m_firstRowElt;       // First element in each row
+  bool m_isQSorted;                // whether Q is sorted or not
+  bool m_isEtreeOk;                // whether the elimination tree match the initial input matrix
+
+  template <typename, typename>
+  friend struct SparseQR_QProduct;
 };
 
-/** \brief Preprocessing step of a QR factorization 
-  * 
-  * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
-  * 
-  * In this step, the fill-reducing permutation is computed and applied to the columns of A
-  * and the column elimination tree is computed as well. Only the sparsity pattern of \a mat is exploited.
-  * 
-  * \note In this step it is assumed that there is no empty row in the matrix \a mat.
-  */
+/** \brief Preprocessing step of a QR factorization
+ *
+ * \warning The matrix \a mat must be in compressed mode (see SparseMatrix::makeCompressed()).
+ *
+ * In this step, the fill-reducing permutation is computed and applied to the columns of A
+ * and the column elimination tree is computed as well. Only the sparsity pattern of \a mat is exploited.
+ *
+ * \note In this step it is assumed that there is no empty row in the matrix \a mat.
+ */
 template <typename MatrixType, typename OrderingType>
-void SparseQR<MatrixType,OrderingType>::analyzePattern(const MatrixType& mat)
-{
-  eigen_assert(mat.isCompressed() && "SparseQR requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to SparseQR");
+void SparseQR<MatrixType, OrderingType>::analyzePattern(const MatrixType& mat) {
+  eigen_assert(
+      mat.isCompressed() &&
+      "SparseQR requires a sparse matrix in compressed mode. Call .makeCompressed() before passing it to SparseQR");
   // Copy to a column major matrix if the input is rowmajor
-  typename internal::conditional<MatrixType::IsRowMajor,QRMatrixType,const MatrixType&>::type matCpy(mat);
+  std::conditional_t<MatrixType::IsRowMajor, QRMatrixType, const MatrixType&> matCpy(mat);
   // Compute the column fill reducing ordering
-  OrderingType ord; 
-  ord(matCpy, m_perm_c); 
+  OrderingType ord;
+  ord(matCpy, m_perm_c);
   Index n = mat.cols();
   Index m = mat.rows();
-  Index diagSize = (std::min)(m,n);
-  
-  if (!m_perm_c.size())
-  {
+  Index diagSize = (std::min)(m, n);
+
+  if (!m_perm_c.size()) {
     m_perm_c.resize(n);
-    m_perm_c.indices().setLinSpaced(n, 0,n-1);
+    m_perm_c.indices().setLinSpaced(n, 0, StorageIndex(n - 1));
   }
-  
+
   // Compute the column elimination tree of the permuted matrix
   m_outputPerm_c = m_perm_c.inverse();
   internal::coletree(matCpy, m_etree, m_firstRowElt, m_outputPerm_c.indices().data());
   m_isEtreeOk = true;
-  
+
   m_R.resize(m, n);
   m_Q.resize(m, diagSize);
-  
-  // Allocate space for nonzero elements : rough estimation
-  m_R.reserve(2*mat.nonZeros()); //FIXME Get a more accurate estimation through symbolic factorization with the etree
-  m_Q.reserve(2*mat.nonZeros());
+
+  // Allocate space for nonzero elements: rough estimation
+  m_R.reserve(2 * mat.nonZeros());  // FIXME Get a more accurate estimation through symbolic factorization with the
+                                    // etree
+  m_Q.reserve(2 * mat.nonZeros());
   m_hcoeffs.resize(diagSize);
   m_analysisIsok = true;
 }
 
 /** \brief Performs the numerical QR factorization of the input matrix
-  * 
-  * The function SparseQR::analyzePattern(const MatrixType&) must have been called beforehand with
-  * a matrix having the same sparsity pattern than \a mat.
-  * 
-  * \param mat The sparse column-major matrix
-  */
+ *
+ * The function SparseQR::analyzePattern(const MatrixType&) must have been called beforehand with
+ * a matrix having the same sparsity pattern than \a mat.
+ *
+ * \param mat The sparse column-major matrix
+ */
 template <typename MatrixType, typename OrderingType>
-void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
-{
+void SparseQR<MatrixType, OrderingType>::factorize(const MatrixType& mat) {
   using std::abs;
-  using std::max;
-  
+
   eigen_assert(m_analysisIsok && "analyzePattern() should be called before this step");
-  Index m = mat.rows();
-  Index n = mat.cols();
-  Index diagSize = (std::min)(m,n);
-  IndexVector mark((std::max)(m,n)); mark.setConstant(-1);  // Record the visited nodes
-  IndexVector Ridx(n), Qidx(m);                             // Store temporarily the row indexes for the current column of R and Q
-  Index nzcolR, nzcolQ;                                     // Number of nonzero for the current column of R and Q
-  ScalarVector tval(m);                                     // The dense vector used to compute the current column
-  RealScalar pivotThreshold = m_threshold;
-  
+  StorageIndex m = StorageIndex(mat.rows());
+  StorageIndex n = StorageIndex(mat.cols());
+  StorageIndex diagSize = (std::min)(m, n);
+  IndexVector mark((std::max)(m, n));
+  mark.setConstant(-1);          // Record the visited nodes
+  IndexVector Ridx(n), Qidx(m);  // Store temporarily the row indexes for the current column of R and Q
+  Index nzcolR, nzcolQ;          // Number of nonzero for the current column of R and Q
+  ScalarVector tval(m);          // The dense vector used to compute the current column
+
   m_R.setZero();
   m_Q.setZero();
   m_pmat = mat;
-  if(!m_isEtreeOk)
-  {
+  if (!m_isEtreeOk) {
     m_outputPerm_c = m_perm_c.inverse();
     internal::coletree(m_pmat, m_etree, m_firstRowElt, m_outputPerm_c.indices().data());
     m_isEtreeOk = true;
   }
 
-  m_pmat.uncompress(); // To have the innerNonZeroPtr allocated
-  
+  m_pmat.uncompress();  // To have the innerNonZeroPtr allocated
+
   // Apply the fill-in reducing permutation lazily:
   {
     // If the input is row major, copy the original column indices,
     // otherwise directly use the input matrix
-    // 
+    //
     IndexVector originalOuterIndicesCpy;
-    const Index *originalOuterIndices = mat.outerIndexPtr();
-    if(MatrixType::IsRowMajor)
-    {
-      originalOuterIndicesCpy = IndexVector::Map(m_pmat.outerIndexPtr(),n+1);
+    const StorageIndex* originalOuterIndices = mat.outerIndexPtr();
+    if (MatrixType::IsRowMajor) {
+      originalOuterIndicesCpy = IndexVector::Map(m_pmat.outerIndexPtr(), n + 1);
       originalOuterIndices = originalOuterIndicesCpy.data();
     }
-    
-    for (int i = 0; i < n; i++)
-    {
+
+    for (int i = 0; i < n; i++) {
       Index p = m_perm_c.size() ? m_perm_c.indices()(i) : i;
-      m_pmat.outerIndexPtr()[p] = originalOuterIndices[i]; 
-      m_pmat.innerNonZeroPtr()[p] = originalOuterIndices[i+1] - originalOuterIndices[i]; 
+      m_pmat.outerIndexPtr()[p] = originalOuterIndices[i];
+      m_pmat.innerNonZeroPtr()[p] = originalOuterIndices[i + 1] - originalOuterIndices[i];
     }
   }
-  
+
   /* Compute the default threshold as in MatLab, see:
    * Tim Davis, "Algorithm 915, SuiteSparseQR: Multifrontal Multithreaded Rank-Revealing
-   * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3 
+   * Sparse QR Factorization, ACM Trans. on Math. Soft. 38(1), 2011, Page 8:3
    */
-  if(m_useDefaultThreshold) 
-  {
+  RealScalar pivotThreshold;
+  if (m_useDefaultThreshold) {
     RealScalar max2Norm = 0.0;
-    for (int j = 0; j < n; j++) max2Norm = (max)(max2Norm, m_pmat.col(j).norm());
-    if(max2Norm==RealScalar(0))
-      max2Norm = RealScalar(1);
+    for (int j = 0; j < n; j++) max2Norm = numext::maxi(max2Norm, m_pmat.col(j).norm());
+    if (max2Norm == RealScalar(0)) max2Norm = RealScalar(1);
     pivotThreshold = 20 * (m + n) * max2Norm * NumTraits<RealScalar>::epsilon();
+  } else {
+    pivotThreshold = m_threshold;
   }
-  
+
   // Initialize the numerical permutation
   m_pivotperm.setIdentity(n);
-  
-  Index nonzeroCol = 0; // Record the number of valid pivots
+
+  StorageIndex nonzeroCol = 0;  // Record the number of valid pivots
   m_Q.startVec(0);
 
   // Left looking rank-revealing QR factorization: compute a column of R and Q at a time
-  for (Index col = 0; col < n; ++col)
-  {
+  for (StorageIndex col = 0; col < n; ++col) {
     mark.setConstant(-1);
     m_R.startVec(col);
     mark(nonzeroCol) = col;
     Qidx(0) = nonzeroCol;
-    nzcolR = 0; nzcolQ = 1;
-    bool found_diag = nonzeroCol>=m;
-    tval.setZero(); 
-    
+    nzcolR = 0;
+    nzcolQ = 1;
+    bool found_diag = nonzeroCol >= m;
+    tval.setZero();
+
     // Symbolic factorization: find the nonzero locations of the column k of the factors R and Q, i.e.,
-    // all the nodes (with indexes lower than rank) reachable through the column elimination tree (etree) rooted at node k.
-    // Note: if the diagonal entry does not exist, then its contribution must be explicitly added,
-    // thus the trick with found_diag that permits to do one more iteration on the diagonal element if this one has not been found.
-    for (typename QRMatrixType::InnerIterator itp(m_pmat, col); itp || !found_diag; ++itp)
-    {
-      Index curIdx = nonzeroCol;
-      if(itp) curIdx = itp.row();
-      if(curIdx == nonzeroCol) found_diag = true;
-      
+    // all the nodes (with indexes lower than rank) reachable through the column elimination tree (etree) rooted at node
+    // k. Note: if the diagonal entry does not exist, then its contribution must be explicitly added, thus the trick
+    // with found_diag that permits to do one more iteration on the diagonal element if this one has not been found.
+    for (typename QRMatrixType::InnerIterator itp(m_pmat, col); itp || !found_diag; ++itp) {
+      StorageIndex curIdx = nonzeroCol;
+      if (itp) curIdx = StorageIndex(itp.row());
+      if (curIdx == nonzeroCol) found_diag = true;
+
       // Get the nonzeros indexes of the current column of R
-      Index st = m_firstRowElt(curIdx); // The traversal of the etree starts here 
-      if (st < 0 )
-      {
+      StorageIndex st = m_firstRowElt(curIdx);  // The traversal of the etree starts here
+      if (st < 0) {
         m_lastError = "Empty row found during numerical factorization";
         m_info = InvalidInput;
         return;
       }
 
-      // Traverse the etree 
+      // Traverse the etree
       Index bi = nzcolR;
-      for (; mark(st) != col; st = m_etree(st))
-      {
+      for (; mark(st) != col; st = m_etree(st)) {
         Ridx(nzcolR) = st;  // Add this row to the list,
         mark(st) = col;     // and mark this row as visited
         nzcolR++;
       }
 
       // Reverse the list to get the topological ordering
-      Index nt = nzcolR-bi;
-      for(Index i = 0; i < nt/2; i++) std::swap(Ridx(bi+i), Ridx(nzcolR-i-1));
-       
+      Index nt = nzcolR - bi;
+      for (Index i = 0; i < nt / 2; i++) std::swap(Ridx(bi + i), Ridx(nzcolR - i - 1));
+
       // Copy the current (curIdx,pcol) value of the input matrix
-      if(itp) tval(curIdx) = itp.value();
-      else    tval(curIdx) = Scalar(0);
-      
+      if (itp)
+        tval(curIdx) = itp.value();
+      else
+        tval(curIdx) = Scalar(0);
+
       // Compute the pattern of Q(:,k)
-      if(curIdx > nonzeroCol && mark(curIdx) != col ) 
-      {
+      if (curIdx > nonzeroCol && mark(curIdx) != col) {
         Qidx(nzcolQ) = curIdx;  // Add this row to the pattern of Q,
         mark(curIdx) = col;     // and mark it as visited
         nzcolQ++;
@@ -444,110 +471,89 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
     }
 
     // Browse all the indexes of R(:,col) in reverse order
-    for (Index i = nzcolR-1; i >= 0; i--)
-    {
+    for (Index i = nzcolR - 1; i >= 0; i--) {
       Index curIdx = Ridx(i);
-      
+
       // Apply the curIdx-th householder vector to the current column (temporarily stored into tval)
       Scalar tdot(0);
-      
+
       // First compute q' * tval
       tdot = m_Q.col(curIdx).dot(tval);
 
       tdot *= m_hcoeffs(curIdx);
-      
+
       // Then update tval = tval - q * tau
-      // FIXME: tval -= tdot * m_Q.col(curIdx) should amount to the same (need to check/add support for efficient "dense ?= sparse")
-      for (typename QRMatrixType::InnerIterator itq(m_Q, curIdx); itq; ++itq)
-        tval(itq.row()) -= itq.value() * tdot;
+      tval -= tdot * m_Q.col(curIdx);
 
       // Detect fill-in for the current column of Q
-      if(m_etree(Ridx(i)) == nonzeroCol)
-      {
-        for (typename QRMatrixType::InnerIterator itq(m_Q, curIdx); itq; ++itq)
-        {
-          Index iQ = itq.row();
-          if (mark(iQ) != col)
-          {
+      if (m_etree(Ridx(i)) == nonzeroCol) {
+        for (typename QRMatrixType::InnerIterator itq(m_Q, curIdx); itq; ++itq) {
+          StorageIndex iQ = StorageIndex(itq.row());
+          if (mark(iQ) != col) {
             Qidx(nzcolQ++) = iQ;  // Add this row to the pattern of Q,
             mark(iQ) = col;       // and mark it as visited
           }
         }
       }
-    } // End update current column
-    
-    Scalar tau = 0;
+    }  // End update current column
+
+    Scalar tau = RealScalar(0);
     RealScalar beta = 0;
-    
-    if(nonzeroCol < diagSize)
-    {
+
+    if (nonzeroCol < diagSize) {
       // Compute the Householder reflection that eliminate the current column
       // FIXME this step should call the Householder module.
       Scalar c0 = nzcolQ ? tval(Qidx(0)) : Scalar(0);
-      
+
       // First, the squared norm of Q((col+1):m, col)
       RealScalar sqrNorm = 0.;
       for (Index itq = 1; itq < nzcolQ; ++itq) sqrNorm += numext::abs2(tval(Qidx(itq)));
-      if(sqrNorm == RealScalar(0) && numext::imag(c0) == RealScalar(0))
-      {
+      if (sqrNorm == RealScalar(0) && numext::imag(c0) == RealScalar(0)) {
         beta = numext::real(c0);
         tval(Qidx(0)) = 1;
-      }
-      else
-      {
+      } else {
         using std::sqrt;
         beta = sqrt(numext::abs2(c0) + sqrNorm);
-        if(numext::real(c0) >= RealScalar(0))
-          beta = -beta;
+        if (numext::real(c0) >= RealScalar(0)) beta = -beta;
         tval(Qidx(0)) = 1;
-        for (Index itq = 1; itq < nzcolQ; ++itq)
-          tval(Qidx(itq)) /= (c0 - beta);
-        tau = numext::conj((beta-c0) / beta);
-          
+        for (Index itq = 1; itq < nzcolQ; ++itq) tval(Qidx(itq)) /= (c0 - beta);
+        tau = numext::conj((beta - c0) / beta);
       }
     }
 
     // Insert values in R
-    for (Index  i = nzcolR-1; i >= 0; i--)
-    {
+    for (Index i = nzcolR - 1; i >= 0; i--) {
       Index curIdx = Ridx(i);
-      if(curIdx < nonzeroCol) 
-      {
+      if (curIdx < nonzeroCol) {
         m_R.insertBackByOuterInnerUnordered(col, curIdx) = tval(curIdx);
         tval(curIdx) = Scalar(0.);
       }
     }
 
-    if(nonzeroCol < diagSize && abs(beta) >= pivotThreshold)
-    {
+    if (nonzeroCol < diagSize && abs(beta) >= pivotThreshold) {
       m_R.insertBackByOuterInner(col, nonzeroCol) = beta;
       // The householder coefficient
       m_hcoeffs(nonzeroCol) = tau;
       // Record the householder reflections
-      for (Index itq = 0; itq < nzcolQ; ++itq)
-      {
+      for (Index itq = 0; itq < nzcolQ; ++itq) {
         Index iQ = Qidx(itq);
-        m_Q.insertBackByOuterInnerUnordered(nonzeroCol,iQ) = tval(iQ);
+        m_Q.insertBackByOuterInnerUnordered(nonzeroCol, iQ) = tval(iQ);
         tval(iQ) = Scalar(0.);
       }
       nonzeroCol++;
-      if(nonzeroCol<diagSize)
-        m_Q.startVec(nonzeroCol);
-    }
-    else
-    {
+      if (nonzeroCol < diagSize) m_Q.startVec(nonzeroCol);
+    } else {
       // Zero pivot found: move implicitly this column to the end
-      for (Index j = nonzeroCol; j < n-1; j++) 
-        std::swap(m_pivotperm.indices()(j), m_pivotperm.indices()[j+1]);
-      
+      for (Index j = nonzeroCol; j < n - 1; j++) std::swap(m_pivotperm.indices()(j), m_pivotperm.indices()[j + 1]);
+
       // Recompute the column elimination tree
       internal::coletree(m_pmat, m_etree, m_firstRowElt, m_pivotperm.indices().data());
       m_isEtreeOk = false;
     }
   }
-  
-  m_hcoeffs.tail(diagSize-nonzeroCol).setZero();
-  
+
+  m_hcoeffs.tail(diagSize - nonzeroCol).setZero();
+
   // Finalize the column pointers of the sparse matrices R and Q
   m_Q.finalize();
   m_Q.makeCompressed();
@@ -556,159 +562,145 @@ void SparseQR<MatrixType,OrderingType>::factorize(const MatrixType& mat)
   m_isQSorted = false;
 
   m_nonzeropivots = nonzeroCol;
-  
-  if(nonzeroCol<n)
-  {
+
+  if (nonzeroCol < n) {
     // Permute the triangular factor to put the 'dead' columns to the end
     QRMatrixType tempR(m_R);
     m_R = tempR * m_pivotperm;
-    
+
     // Update the column permutation
     m_outputPerm_c = m_outputPerm_c * m_pivotperm;
   }
-  
-  m_isInitialized = true; 
+
+  m_isInitialized = true;
   m_factorizationIsok = true;
   m_info = Success;
 }
 
-namespace internal {
-  
-template<typename _MatrixType, typename OrderingType, typename Rhs>
-struct solve_retval<SparseQR<_MatrixType,OrderingType>, Rhs>
-  : solve_retval_base<SparseQR<_MatrixType,OrderingType>, Rhs>
-{
-  typedef SparseQR<_MatrixType,OrderingType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-template<typename _MatrixType, typename OrderingType, typename Rhs>
-struct sparse_solve_retval<SparseQR<_MatrixType, OrderingType>, Rhs>
- : sparse_solve_retval_base<SparseQR<_MatrixType, OrderingType>, Rhs>
-{
-  typedef SparseQR<_MatrixType, OrderingType> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec, Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-} // end namespace internal
-
 template <typename SparseQRType, typename Derived>
-struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived> >
-{
+struct SparseQR_QProduct : ReturnByValue<SparseQR_QProduct<SparseQRType, Derived> > {
   typedef typename SparseQRType::QRMatrixType MatrixType;
   typedef typename SparseQRType::Scalar Scalar;
-  typedef typename SparseQRType::Index Index;
-  // Get the references 
-  SparseQR_QProduct(const SparseQRType& qr, const Derived& other, bool transpose) : 
-  m_qr(qr),m_other(other),m_transpose(transpose) {}
-  inline Index rows() const { return m_transpose ? m_qr.rows() : m_qr.cols(); }
+  // Get the references
+  SparseQR_QProduct(const SparseQRType& qr, const Derived& other, bool transpose)
+      : m_qr(qr), m_other(other), m_transpose(transpose) {}
+  inline Index rows() const { return m_qr.matrixQ().rows(); }
   inline Index cols() const { return m_other.cols(); }
-  
+
   // Assign to a vector
-  template<typename DesType>
-  void evalTo(DesType& res) const
-  {
+  template <typename DesType>
+  void evalTo(DesType& res) const {
     Index m = m_qr.rows();
     Index n = m_qr.cols();
-    Index diagSize = (std::min)(m,n);
+    Index diagSize = (std::min)(m, n);
     res = m_other;
-    if (m_transpose)
-    {
+    if (m_transpose) {
       eigen_assert(m_qr.m_Q.rows() == m_other.rows() && "Non conforming object sizes");
-      //Compute res = Q' * other column by column
-      for(Index j = 0; j < res.cols(); j++){
-        for (Index k = 0; k < diagSize; k++)
-        {
+      // Compute res = Q' * other column by column
+      for (Index j = 0; j < res.cols(); j++) {
+        for (Index k = 0; k < diagSize; k++) {
           Scalar tau = Scalar(0);
           tau = m_qr.m_Q.col(k).dot(res.col(j));
-          if(tau==Scalar(0)) continue;
+          if (tau == Scalar(0)) continue;
           tau = tau * m_qr.m_hcoeffs(k);
           res.col(j) -= tau * m_qr.m_Q.col(k);
         }
       }
-    }
-    else
-    {
-      eigen_assert(m_qr.m_Q.rows() == m_other.rows() && "Non conforming object sizes");
+    } else {
+      eigen_assert(m_qr.matrixQ().cols() == m_other.rows() && "Non conforming object sizes");
+
+      res.conservativeResize(rows(), cols());
+
       // Compute res = Q * other column by column
-      for(Index j = 0; j < res.cols(); j++)
-      {
-        for (Index k = diagSize-1; k >=0; k--)
-        {
+      for (Index j = 0; j < res.cols(); j++) {
+        Index start_k = internal::is_identity<Derived>::value ? numext::mini(j, diagSize - 1) : diagSize - 1;
+        for (Index k = start_k; k >= 0; k--) {
           Scalar tau = Scalar(0);
           tau = m_qr.m_Q.col(k).dot(res.col(j));
-          if(tau==Scalar(0)) continue;
-          tau = tau * m_qr.m_hcoeffs(k);
+          if (tau == Scalar(0)) continue;
+          tau = tau * numext::conj(m_qr.m_hcoeffs(k));
           res.col(j) -= tau * m_qr.m_Q.col(k);
         }
       }
     }
   }
-  
+
   const SparseQRType& m_qr;
   const Derived& m_other;
-  bool m_transpose;
+  bool m_transpose;  // TODO this actually means adjoint
 };
 
-template<typename SparseQRType>
-struct SparseQRMatrixQReturnType : public EigenBase<SparseQRMatrixQReturnType<SparseQRType> >
-{  
-  typedef typename SparseQRType::Index Index;
+template <typename SparseQRType>
+struct SparseQRMatrixQReturnType : public EigenBase<SparseQRMatrixQReturnType<SparseQRType> > {
   typedef typename SparseQRType::Scalar Scalar;
-  typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix;
-  SparseQRMatrixQReturnType(const SparseQRType& qr) : m_qr(qr) {}
-  template<typename Derived>
-  SparseQR_QProduct<SparseQRType, Derived> operator*(const MatrixBase<Derived>& other)
-  {
-    return SparseQR_QProduct<SparseQRType,Derived>(m_qr,other.derived(),false);
+  typedef Matrix<Scalar, Dynamic, Dynamic> DenseMatrix;
+  enum { RowsAtCompileTime = Dynamic, ColsAtCompileTime = Dynamic };
+  explicit SparseQRMatrixQReturnType(const SparseQRType& qr) : m_qr(qr) {}
+  template <typename Derived>
+  SparseQR_QProduct<SparseQRType, Derived> operator*(const MatrixBase<Derived>& other) {
+    return SparseQR_QProduct<SparseQRType, Derived>(m_qr, other.derived(), false);
   }
-  SparseQRMatrixQTransposeReturnType<SparseQRType> adjoint() const
-  {
+  // To use for operations with the adjoint of Q
+  SparseQRMatrixQTransposeReturnType<SparseQRType> adjoint() const {
     return SparseQRMatrixQTransposeReturnType<SparseQRType>(m_qr);
   }
   inline Index rows() const { return m_qr.rows(); }
-  inline Index cols() const { return (std::min)(m_qr.rows(),m_qr.cols()); }
-  // To use for operations with the transpose of Q
-  SparseQRMatrixQTransposeReturnType<SparseQRType> transpose() const
-  {
+  inline Index cols() const { return m_qr.rows(); }
+  // To use for operations with the transpose of Q FIXME this is the same as adjoint at the moment
+  SparseQRMatrixQTransposeReturnType<SparseQRType> transpose() const {
     return SparseQRMatrixQTransposeReturnType<SparseQRType>(m_qr);
   }
-  template<typename Dest> void evalTo(MatrixBase<Dest>& dest) const
-  {
-    dest.derived() = m_qr.matrixQ() * Dest::Identity(m_qr.rows(), m_qr.rows());
+  const SparseQRType& m_qr;
+};
+
+// TODO this actually represents the adjoint of Q
+template <typename SparseQRType>
+struct SparseQRMatrixQTransposeReturnType {
+  explicit SparseQRMatrixQTransposeReturnType(const SparseQRType& qr) : m_qr(qr) {}
+  template <typename Derived>
+  SparseQR_QProduct<SparseQRType, Derived> operator*(const MatrixBase<Derived>& other) {
+    return SparseQR_QProduct<SparseQRType, Derived>(m_qr, other.derived(), true);
   }
-  template<typename Dest> void evalTo(SparseMatrixBase<Dest>& dest) const
-  {
-    Dest idMat(m_qr.rows(), m_qr.rows());
+  const SparseQRType& m_qr;
+};
+
+namespace internal {
+
+template <typename SparseQRType>
+struct evaluator_traits<SparseQRMatrixQReturnType<SparseQRType> > {
+  typedef typename SparseQRType::MatrixType MatrixType;
+  typedef typename storage_kind_to_evaluator_kind<typename MatrixType::StorageKind>::Kind Kind;
+  typedef SparseShape Shape;
+};
+
+template <typename DstXprType, typename SparseQRType>
+struct Assignment<DstXprType, SparseQRMatrixQReturnType<SparseQRType>,
+                  internal::assign_op<typename DstXprType::Scalar, typename DstXprType::Scalar>, Sparse2Sparse> {
+  typedef SparseQRMatrixQReturnType<SparseQRType> SrcXprType;
+  typedef typename DstXprType::Scalar Scalar;
+  typedef typename DstXprType::StorageIndex StorageIndex;
+  static void run(DstXprType& dst, const SrcXprType& src, const internal::assign_op<Scalar, Scalar>& /*func*/) {
+    typename DstXprType::PlainObject idMat(src.rows(), src.cols());
     idMat.setIdentity();
     // Sort the sparse householder reflectors if needed
-    const_cast<SparseQRType *>(&m_qr)->sort_matrix_Q();
-    dest.derived() = SparseQR_QProduct<SparseQRType, Dest>(m_qr, idMat, false);
+    const_cast<SparseQRType*>(&src.m_qr)->_sort_matrix_Q();
+    dst = SparseQR_QProduct<SparseQRType, DstXprType>(src.m_qr, idMat, false);
   }
-
-  const SparseQRType& m_qr;
 };
 
-template<typename SparseQRType>
-struct SparseQRMatrixQTransposeReturnType
-{
-  SparseQRMatrixQTransposeReturnType(const SparseQRType& qr) : m_qr(qr) {}
-  template<typename Derived>
-  SparseQR_QProduct<SparseQRType,Derived> operator*(const MatrixBase<Derived>& other)
-  {
-    return SparseQR_QProduct<SparseQRType,Derived>(m_qr,other.derived(), true);
+template <typename DstXprType, typename SparseQRType>
+struct Assignment<DstXprType, SparseQRMatrixQReturnType<SparseQRType>,
+                  internal::assign_op<typename DstXprType::Scalar, typename DstXprType::Scalar>, Sparse2Dense> {
+  typedef SparseQRMatrixQReturnType<SparseQRType> SrcXprType;
+  typedef typename DstXprType::Scalar Scalar;
+  typedef typename DstXprType::StorageIndex StorageIndex;
+  static void run(DstXprType& dst, const SrcXprType& src, const internal::assign_op<Scalar, Scalar>& /*func*/) {
+    dst = src.m_qr.matrixQ() * DstXprType::Identity(src.m_qr.rows(), src.m_qr.rows());
   }
-  const SparseQRType& m_qr;
 };
 
-} // end namespace Eigen
+}  // end namespace internal
+
+}  // end namespace Eigen
 
 #endif
diff --git a/inst/include/Eigen/src/StlSupport/StdDeque.h b/inst/include/Eigen/src/StlSupport/StdDeque.h
index aaf66330..692281be 100644
--- a/inst/include/Eigen/src/StlSupport/StdDeque.h
+++ b/inst/include/Eigen/src/StlSupport/StdDeque.h
@@ -11,124 +11,41 @@
 #ifndef EIGEN_STDDEQUE_H
 #define EIGEN_STDDEQUE_H
 
-#include "details.h"
-
-// Define the explicit instantiation (e.g. necessary for the Intel compiler)
-#if defined(__INTEL_COMPILER) || defined(__GNUC__)
-  #define EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(...) template class std::deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> >;
-#else
-  #define EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(...)
+#ifndef EIGEN_STDDEQUE_MODULE_H
+#error "Please include Eigen/StdDeque instead of including this file directly."
 #endif
 
+#include "details.h"
+
 /**
  * This section contains a convenience MACRO which allows an easy specialization of
  * std::deque such that for data types with alignment issues the correct allocator
  * is used automatically.
  */
-#define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...) \
-EIGEN_EXPLICIT_STL_DEQUE_INSTANTIATION(__VA_ARGS__) \
-namespace std \
-{ \
-  template<typename _Ay> \
-  class deque<__VA_ARGS__, _Ay>  \
-    : public deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > \
-  { \
-    typedef deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > deque_base; \
-  public: \
-    typedef __VA_ARGS__ value_type; \
-    typedef typename deque_base::allocator_type allocator_type; \
-    typedef typename deque_base::size_type size_type;  \
-    typedef typename deque_base::iterator iterator;  \
-    explicit deque(const allocator_type& a = allocator_type()) : deque_base(a) {}  \
-    template<typename InputIterator> \
-    deque(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : deque_base(first, last, a) {} \
-    deque(const deque& c) : deque_base(c) {}  \
-    explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \
-    deque(iterator start, iterator end) : deque_base(start, end) {}  \
-    deque& operator=(const deque& x) {  \
-      deque_base::operator=(x);  \
-      return *this;  \
-    } \
-  }; \
-}
-
-// check whether we really need the std::deque specialization
-#if !(defined(_GLIBCXX_DEQUE) && (!EIGEN_GNUC_AT_LEAST(4,1))) /* Note that before gcc-4.1 we already have: std::deque::resize(size_type,const T&). */
-
-namespace std {
-
-#define EIGEN_STD_DEQUE_SPECIALIZATION_BODY \
-  public:  \
-    typedef T value_type; \
-    typedef typename deque_base::allocator_type allocator_type; \
-    typedef typename deque_base::size_type size_type;  \
-    typedef typename deque_base::iterator iterator;  \
-    typedef typename deque_base::const_iterator const_iterator;  \
-    explicit deque(const allocator_type& a = allocator_type()) : deque_base(a) {}  \
-    template<typename InputIterator> \
-    deque(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) \
-    : deque_base(first, last, a) {} \
-    deque(const deque& c) : deque_base(c) {}  \
+#define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...)                                                \
+  namespace std {                                                                                 \
+  template <>                                                                                     \
+  class deque<__VA_ARGS__, std::allocator<__VA_ARGS__> >                                          \
+      : public deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > {                        \
+    typedef deque<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > deque_base;                 \
+                                                                                                  \
+   public:                                                                                        \
+    typedef __VA_ARGS__ value_type;                                                               \
+    typedef deque_base::allocator_type allocator_type;                                            \
+    typedef deque_base::size_type size_type;                                                      \
+    typedef deque_base::iterator iterator;                                                        \
+    explicit deque(const allocator_type& a = allocator_type()) : deque_base(a) {}                 \
+    template <typename InputIterator>                                                             \
+    deque(InputIterator first, InputIterator last, const allocator_type& a = allocator_type())    \
+        : deque_base(first, last, a) {}                                                           \
+    deque(const deque& c) : deque_base(c) {}                                                      \
     explicit deque(size_type num, const value_type& val = value_type()) : deque_base(num, val) {} \
-    deque(iterator start, iterator end) : deque_base(start, end) {}  \
-    deque& operator=(const deque& x) {  \
-      deque_base::operator=(x);  \
-      return *this;  \
-    }
-
-  template<typename T>
-  class deque<T,EIGEN_ALIGNED_ALLOCATOR<T> >
-    : public deque<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T),
-                   Eigen::aligned_allocator_indirection<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T)> >
-{
-  typedef deque<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T),
-                Eigen::aligned_allocator_indirection<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T)> > deque_base;
-  EIGEN_STD_DEQUE_SPECIALIZATION_BODY
-
-  void resize(size_type new_size)
-  { resize(new_size, T()); }
-
-#if defined(_DEQUE_)
-  // workaround MSVC std::deque implementation
-  void resize(size_type new_size, const value_type& x)
-  {
-    if (deque_base::size() < new_size)
-      deque_base::_Insert_n(deque_base::end(), new_size - deque_base::size(), x);
-    else if (new_size < deque_base::size())
-      deque_base::erase(deque_base::begin() + new_size, deque_base::end());
-  }
-  void push_back(const value_type& x)
-  { deque_base::push_back(x); } 
-  void push_front(const value_type& x)
-  { deque_base::push_front(x); }
-  using deque_base::insert;  
-  iterator insert(const_iterator position, const value_type& x)
-  { return deque_base::insert(position,x); }
-  void insert(const_iterator position, size_type new_size, const value_type& x)
-  { deque_base::insert(position, new_size, x); }
-#elif defined(_GLIBCXX_DEQUE) && EIGEN_GNUC_AT_LEAST(4,2)
-  // workaround GCC std::deque implementation
-  void resize(size_type new_size, const value_type& x)
-  {
-    if (new_size < deque_base::size())
-      deque_base::_M_erase_at_end(this->_M_impl._M_start + new_size);
-    else
-      deque_base::insert(deque_base::end(), new_size - deque_base::size(), x);
+    deque(iterator start_, iterator end_) : deque_base(start_, end_) {}                           \
+    deque& operator=(const deque& x) {                                                            \
+      deque_base::operator=(x);                                                                   \
+      return *this;                                                                               \
+    }                                                                                             \
+  };                                                                                              \
   }
-#else
-  // either GCC 4.1 or non-GCC
-  // default implementation which should always work.
-  void resize(size_type new_size, const value_type& x)
-  {
-    if (new_size < deque_base::size())
-      deque_base::erase(deque_base::begin() + new_size, deque_base::end());
-    else if (new_size > deque_base::size())
-      deque_base::insert(deque_base::end(), new_size - deque_base::size(), x);
-  }
-#endif
-  };
-}
-
-#endif // check whether specialization is actually required
 
-#endif // EIGEN_STDDEQUE_H
+#endif  // EIGEN_STDDEQUE_H
diff --git a/inst/include/Eigen/src/StlSupport/StdList.h b/inst/include/Eigen/src/StlSupport/StdList.h
index 3c742430..26395df3 100644
--- a/inst/include/Eigen/src/StlSupport/StdList.h
+++ b/inst/include/Eigen/src/StlSupport/StdList.h
@@ -10,105 +10,41 @@
 #ifndef EIGEN_STDLIST_H
 #define EIGEN_STDLIST_H
 
-#include "details.h"
-
-// Define the explicit instantiation (e.g. necessary for the Intel compiler)
-#if defined(__INTEL_COMPILER) || defined(__GNUC__)
-  #define EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(...) template class std::list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> >;
-#else
-  #define EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(...)
+#ifndef EIGEN_STDLIST_MODULE_H
+#error "Please include Eigen/StdList instead of including this file directly."
 #endif
 
+#include "details.h"
+
 /**
  * This section contains a convenience MACRO which allows an easy specialization of
  * std::list such that for data types with alignment issues the correct allocator
  * is used automatically.
  */
-#define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...) \
-EIGEN_EXPLICIT_STL_LIST_INSTANTIATION(__VA_ARGS__) \
-namespace std \
-{ \
-  template<typename _Ay> \
-  class list<__VA_ARGS__, _Ay>  \
-    : public list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > \
-  { \
-    typedef list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > list_base; \
-  public: \
-    typedef __VA_ARGS__ value_type; \
-    typedef typename list_base::allocator_type allocator_type; \
-    typedef typename list_base::size_type size_type;  \
-    typedef typename list_base::iterator iterator;  \
-    explicit list(const allocator_type& a = allocator_type()) : list_base(a) {}  \
-    template<typename InputIterator> \
-    list(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : list_base(first, last, a) {} \
-    list(const list& c) : list_base(c) {}  \
-    explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \
-    list(iterator start, iterator end) : list_base(start, end) {}  \
-    list& operator=(const list& x) {  \
-      list_base::operator=(x);  \
-      return *this;  \
-    } \
-  }; \
-}
-
-// check whether we really need the std::vector specialization
-#if !(defined(_GLIBCXX_VECTOR) && (!EIGEN_GNUC_AT_LEAST(4,1))) /* Note that before gcc-4.1 we already have: std::list::resize(size_type,const T&). */
-
-namespace std
-{
-
-#define EIGEN_STD_LIST_SPECIALIZATION_BODY \
-  public:  \
-    typedef T value_type; \
-    typedef typename list_base::allocator_type allocator_type; \
-    typedef typename list_base::size_type size_type;  \
-    typedef typename list_base::iterator iterator;  \
-    typedef typename list_base::const_iterator const_iterator;  \
-    explicit list(const allocator_type& a = allocator_type()) : list_base(a) {}  \
-    template<typename InputIterator> \
-    list(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) \
-    : list_base(first, last, a) {} \
-    list(const list& c) : list_base(c) {}  \
+#define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...)                                               \
+  namespace std {                                                                               \
+  template <>                                                                                   \
+  class list<__VA_ARGS__, std::allocator<__VA_ARGS__> >                                         \
+      : public list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > {                       \
+    typedef list<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > list_base;                 \
+                                                                                                \
+   public:                                                                                      \
+    typedef __VA_ARGS__ value_type;                                                             \
+    typedef list_base::allocator_type allocator_type;                                           \
+    typedef list_base::size_type size_type;                                                     \
+    typedef list_base::iterator iterator;                                                       \
+    explicit list(const allocator_type& a = allocator_type()) : list_base(a) {}                 \
+    template <typename InputIterator>                                                           \
+    list(InputIterator first, InputIterator last, const allocator_type& a = allocator_type())   \
+        : list_base(first, last, a) {}                                                          \
+    list(const list& c) : list_base(c) {}                                                       \
     explicit list(size_type num, const value_type& val = value_type()) : list_base(num, val) {} \
-    list(iterator start, iterator end) : list_base(start, end) {}  \
-    list& operator=(const list& x) {  \
-    list_base::operator=(x);  \
-    return *this; \
+    list(iterator start_, iterator end_) : list_base(start_, end_) {}                           \
+    list& operator=(const list& x) {                                                            \
+      list_base::operator=(x);                                                                  \
+      return *this;                                                                             \
+    }                                                                                           \
+  };                                                                                            \
   }
 
-  template<typename T>
-  class list<T,EIGEN_ALIGNED_ALLOCATOR<T> >
-    : public list<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T),
-                  Eigen::aligned_allocator_indirection<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T)> >
-  {
-    typedef list<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T),
-                 Eigen::aligned_allocator_indirection<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T)> > list_base;
-    EIGEN_STD_LIST_SPECIALIZATION_BODY
-
-    void resize(size_type new_size)
-    { resize(new_size, T()); }
-
-    void resize(size_type new_size, const value_type& x)
-    {
-      if (list_base::size() < new_size)
-        list_base::insert(list_base::end(), new_size - list_base::size(), x);
-      else
-        while (new_size < list_base::size()) list_base::pop_back();
-    }
-
-#if defined(_LIST_)
-    // workaround MSVC std::list implementation
-    void push_back(const value_type& x)
-    { list_base::push_back(x); } 
-    using list_base::insert;  
-    iterator insert(const_iterator position, const value_type& x)
-    { return list_base::insert(position,x); }
-    void insert(const_iterator position, size_type new_size, const value_type& x)
-    { list_base::insert(position, new_size, x); }
-#endif
-  };
-}
-
-#endif // check whether specialization is actually required
-
-#endif // EIGEN_STDLIST_H
+#endif  // EIGEN_STDLIST_H
diff --git a/inst/include/Eigen/src/StlSupport/StdVector.h b/inst/include/Eigen/src/StlSupport/StdVector.h
index 611664a2..dd538e95 100644
--- a/inst/include/Eigen/src/StlSupport/StdVector.h
+++ b/inst/include/Eigen/src/StlSupport/StdVector.h
@@ -11,6 +11,10 @@
 #ifndef EIGEN_STDVECTOR_H
 #define EIGEN_STDVECTOR_H
 
+#ifndef EIGEN_STDVECTOR_MODULE_H
+#error "Please include Eigen/StdVector instead of including this file directly."
+#endif
+
 #include "details.h"
 
 /**
@@ -18,109 +22,30 @@
  * std::vector such that for data types with alignment issues the correct allocator
  * is used automatically.
  */
-#define EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(...) \
-namespace std \
-{ \
-  template<> \
-  class vector<__VA_ARGS__, std::allocator<__VA_ARGS__> >  \
-    : public vector<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > \
-  { \
-    typedef vector<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > vector_base; \
-  public: \
-    typedef __VA_ARGS__ value_type; \
-    typedef vector_base::allocator_type allocator_type; \
-    typedef vector_base::size_type size_type;  \
-    typedef vector_base::iterator iterator;  \
-    explicit vector(const allocator_type& a = allocator_type()) : vector_base(a) {}  \
-    template<typename InputIterator> \
-    vector(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) : vector_base(first, last, a) {} \
-    vector(const vector& c) : vector_base(c) {}  \
-    explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \
-    vector(iterator start, iterator end) : vector_base(start, end) {}  \
-    vector& operator=(const vector& x) {  \
-      vector_base::operator=(x);  \
-      return *this;  \
-    } \
-  }; \
-}
-
-namespace std {
-
-#define EIGEN_STD_VECTOR_SPECIALIZATION_BODY \
-  public:  \
-    typedef T value_type; \
-    typedef typename vector_base::allocator_type allocator_type; \
-    typedef typename vector_base::size_type size_type;  \
-    typedef typename vector_base::iterator iterator;  \
-    typedef typename vector_base::const_iterator const_iterator;  \
-    explicit vector(const allocator_type& a = allocator_type()) : vector_base(a) {}  \
-    template<typename InputIterator> \
-    vector(InputIterator first, InputIterator last, const allocator_type& a = allocator_type()) \
-    : vector_base(first, last, a) {} \
-    vector(const vector& c) : vector_base(c) {}  \
+#define EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(...)                                                 \
+  namespace std {                                                                                   \
+  template <>                                                                                       \
+  class vector<__VA_ARGS__, std::allocator<__VA_ARGS__> >                                           \
+      : public vector<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > {                         \
+    typedef vector<__VA_ARGS__, EIGEN_ALIGNED_ALLOCATOR<__VA_ARGS__> > vector_base;                 \
+                                                                                                    \
+   public:                                                                                          \
+    typedef __VA_ARGS__ value_type;                                                                 \
+    typedef vector_base::allocator_type allocator_type;                                             \
+    typedef vector_base::size_type size_type;                                                       \
+    typedef vector_base::iterator iterator;                                                         \
+    explicit vector(const allocator_type& a = allocator_type()) : vector_base(a) {}                 \
+    template <typename InputIterator>                                                               \
+    vector(InputIterator first, InputIterator last, const allocator_type& a = allocator_type())     \
+        : vector_base(first, last, a) {}                                                            \
+    vector(const vector& c) : vector_base(c) {}                                                     \
     explicit vector(size_type num, const value_type& val = value_type()) : vector_base(num, val) {} \
-    vector(iterator start, iterator end) : vector_base(start, end) {}  \
-    vector& operator=(const vector& x) {  \
-      vector_base::operator=(x);  \
-      return *this;  \
-    }
-
-  template<typename T>
-  class vector<T,EIGEN_ALIGNED_ALLOCATOR<T> >
-    : public vector<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T),
-                    Eigen::aligned_allocator_indirection<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T)> >
-{
-  typedef vector<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T),
-                 Eigen::aligned_allocator_indirection<EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T)> > vector_base;
-  EIGEN_STD_VECTOR_SPECIALIZATION_BODY
-
-  void resize(size_type new_size)
-  { resize(new_size, T()); }
-
-#if defined(_VECTOR_)
-  // workaround MSVC std::vector implementation
-  void resize(size_type new_size, const value_type& x)
-  {
-    if (vector_base::size() < new_size)
-      vector_base::_Insert_n(vector_base::end(), new_size - vector_base::size(), x);
-    else if (new_size < vector_base::size())
-      vector_base::erase(vector_base::begin() + new_size, vector_base::end());
-  }
-  void push_back(const value_type& x)
-  { vector_base::push_back(x); } 
-  using vector_base::insert;  
-  iterator insert(const_iterator position, const value_type& x)
-  { return vector_base::insert(position,x); }
-  void insert(const_iterator position, size_type new_size, const value_type& x)
-  { vector_base::insert(position, new_size, x); }
-#elif defined(_GLIBCXX_VECTOR) && (!(EIGEN_GNUC_AT_LEAST(4,1)))
-  /* Note that before gcc-4.1 we already have: std::vector::resize(size_type,const T&).
-   * However, this specialization is still needed to make the above EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION trick to work. */
-  void resize(size_type new_size, const value_type& x)
-  {
-    vector_base::resize(new_size,x);
+    vector(iterator start_, iterator end_) : vector_base(start_, end_) {}                           \
+    vector& operator=(const vector& x) {                                                            \
+      vector_base::operator=(x);                                                                    \
+      return *this;                                                                                 \
+    }                                                                                               \
+  };                                                                                                \
   }
-#elif defined(_GLIBCXX_VECTOR) && EIGEN_GNUC_AT_LEAST(4,2)
-  // workaround GCC std::vector implementation
-  void resize(size_type new_size, const value_type& x)
-  {
-    if (new_size < vector_base::size())
-      vector_base::_M_erase_at_end(this->_M_impl._M_start + new_size);
-    else
-      vector_base::insert(vector_base::end(), new_size - vector_base::size(), x);
-  }
-#else
-  // either GCC 4.1 or non-GCC
-  // default implementation which should always work.
-  void resize(size_type new_size, const value_type& x)
-  {
-    if (new_size < vector_base::size())
-      vector_base::erase(vector_base::begin() + new_size, vector_base::end());
-    else if (new_size > vector_base::size())
-      vector_base::insert(vector_base::end(), new_size - vector_base::size(), x);
-  }
-#endif
-  };
-}
 
-#endif // EIGEN_STDVECTOR_H
+#endif  // EIGEN_STDVECTOR_H
diff --git a/inst/include/Eigen/src/StlSupport/details.h b/inst/include/Eigen/src/StlSupport/details.h
index d8debc7c..9bc9d210 100644
--- a/inst/include/Eigen/src/StlSupport/details.h
+++ b/inst/include/Eigen/src/StlSupport/details.h
@@ -12,66 +12,64 @@
 #define EIGEN_STL_DETAILS_H
 
 #ifndef EIGEN_ALIGNED_ALLOCATOR
-  #define EIGEN_ALIGNED_ALLOCATOR Eigen::aligned_allocator
+#define EIGEN_ALIGNED_ALLOCATOR Eigen::aligned_allocator
 #endif
 
 namespace Eigen {
 
-  // This one is needed to prevent reimplementing the whole std::vector.
-  template <class T>
-  class aligned_allocator_indirection : public EIGEN_ALIGNED_ALLOCATOR<T>
-  {
-  public:
-    typedef size_t    size_type;
-    typedef ptrdiff_t difference_type;
-    typedef T*        pointer;
-    typedef const T*  const_pointer;
-    typedef T&        reference;
-    typedef const T&  const_reference;
-    typedef T         value_type;
+// This one is needed to prevent reimplementing the whole std::vector.
+template <class T>
+class aligned_allocator_indirection : public EIGEN_ALIGNED_ALLOCATOR<T> {
+ public:
+  typedef std::size_t size_type;
+  typedef std::ptrdiff_t difference_type;
+  typedef T* pointer;
+  typedef const T* const_pointer;
+  typedef T& reference;
+  typedef const T& const_reference;
+  typedef T value_type;
 
-    template<class U>
-    struct rebind
-    {
-      typedef aligned_allocator_indirection<U> other;
-    };
-
-    aligned_allocator_indirection() {}
-    aligned_allocator_indirection(const aligned_allocator_indirection& ) : EIGEN_ALIGNED_ALLOCATOR<T>() {}
-    aligned_allocator_indirection(const EIGEN_ALIGNED_ALLOCATOR<T>& ) {}
-    template<class U>
-    aligned_allocator_indirection(const aligned_allocator_indirection<U>& ) {}
-    template<class U>
-    aligned_allocator_indirection(const EIGEN_ALIGNED_ALLOCATOR<U>& ) {}
-    ~aligned_allocator_indirection() {}
+  template <class U>
+  struct rebind {
+    typedef aligned_allocator_indirection<U> other;
   };
 
-#ifdef _MSC_VER
+  aligned_allocator_indirection() {}
+  aligned_allocator_indirection(const aligned_allocator_indirection&) : EIGEN_ALIGNED_ALLOCATOR<T>() {}
+  aligned_allocator_indirection(const EIGEN_ALIGNED_ALLOCATOR<T>&) {}
+  template <class U>
+  aligned_allocator_indirection(const aligned_allocator_indirection<U>&) {}
+  template <class U>
+  aligned_allocator_indirection(const EIGEN_ALIGNED_ALLOCATOR<U>&) {}
+  ~aligned_allocator_indirection() {}
+};
+
+#if EIGEN_COMP_MSVC
 
-  // sometimes, MSVC detects, at compile time, that the argument x
-  // in std::vector::resize(size_t s,T x) won't be aligned and generate an error
-  // even if this function is never called. Whence this little wrapper.
+// sometimes, MSVC detects, at compile time, that the argument x
+// in std::vector::resize(size_t s,T x) won't be aligned and generate an error
+// even if this function is never called. Whence this little wrapper.
 #define EIGEN_WORKAROUND_MSVC_STL_SUPPORT(T) \
-  typename Eigen::internal::conditional< \
-    Eigen::internal::is_arithmetic<T>::value, \
-    T, \
-    Eigen::internal::workaround_msvc_stl_support<T> \
-  >::type
+  std::conditional_t<Eigen::internal::is_arithmetic<T>::value, T, Eigen::internal::workaround_msvc_stl_support<T> >
 
-  namespace internal {
-  template<typename T> struct workaround_msvc_stl_support : public T
-  {
-    inline workaround_msvc_stl_support() : T() {}
-    inline workaround_msvc_stl_support(const T& other) : T(other) {}
-    inline operator T& () { return *static_cast<T*>(this); }
-    inline operator const T& () const { return *static_cast<const T*>(this); }
-    template<typename OtherT>
-    inline T& operator=(const OtherT& other)
-    { T::operator=(other); return *this; }
-    inline workaround_msvc_stl_support& operator=(const workaround_msvc_stl_support& other)
-    { T::operator=(other); return *this; }
-  };
+namespace internal {
+template <typename T>
+struct workaround_msvc_stl_support : public T {
+  inline workaround_msvc_stl_support() : T() {}
+  inline workaround_msvc_stl_support(const T& other) : T(other) {}
+  inline operator T&() { return *static_cast<T*>(this); }
+  inline operator const T&() const { return *static_cast<const T*>(this); }
+  template <typename OtherT>
+  inline T& operator=(const OtherT& other) {
+    T::operator=(other);
+    return *this;
+  }
+  inline workaround_msvc_stl_support& operator=(const workaround_msvc_stl_support& other) {
+    T::operator=(other);
+    return *this;
   }
+};
+}  // namespace internal
 
 #else
 
@@ -79,6 +77,6 @@ namespace Eigen {
 
 #endif
 
-}
+}  // namespace Eigen
 
-#endif // EIGEN_STL_DETAILS_H
+#endif  // EIGEN_STL_DETAILS_H
diff --git a/inst/include/Eigen/src/SuperLUSupport/InternalHeaderCheck.h b/inst/include/Eigen/src/SuperLUSupport/InternalHeaderCheck.h
new file mode 100644
index 00000000..94a62b55
--- /dev/null
+++ b/inst/include/Eigen/src/SuperLUSupport/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_SUPERLUSUPPORT_MODULE_H
+#error "Please include Eigen/SuperLUSupport instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/SuperLUSupport/SuperLUSupport.h b/inst/include/Eigen/src/SuperLUSupport/SuperLUSupport.h
index bcb35576..4db92497 100644
--- a/inst/include/Eigen/src/SuperLUSupport/SuperLUSupport.h
+++ b/inst/include/Eigen/src/SuperLUSupport/SuperLUSupport.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2008-2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2008-2015 Gael Guennebaud <gael.guennebaud@inria.fr>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,36 +10,53 @@
 #ifndef EIGEN_SUPERLUSUPPORT_H
 #define EIGEN_SUPERLUSUPPORT_H
 
-namespace Eigen { 
-
-#define DECL_GSSVX(PREFIX,FLOATTYPE,KEYTYPE)		\
-    extern "C" {                                                                                          \
-      typedef struct { FLOATTYPE for_lu; FLOATTYPE total_needed; int expansions; } PREFIX##mem_usage_t;   \
-      extern void PREFIX##gssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *,                  \
-                                char *, FLOATTYPE *, FLOATTYPE *, SuperMatrix *, SuperMatrix *,           \
-                                void *, int, SuperMatrix *, SuperMatrix *,                                \
-                                FLOATTYPE *, FLOATTYPE *, FLOATTYPE *, FLOATTYPE *,                       \
-                                PREFIX##mem_usage_t *, SuperLUStat_t *, int *);                           \
-    }                                                                                                     \
-    inline float SuperLU_gssvx(superlu_options_t *options, SuperMatrix *A,                                \
-         int *perm_c, int *perm_r, int *etree, char *equed,                                               \
-         FLOATTYPE *R, FLOATTYPE *C, SuperMatrix *L,                                                      \
-         SuperMatrix *U, void *work, int lwork,                                                           \
-         SuperMatrix *B, SuperMatrix *X,                                                                  \
-         FLOATTYPE *recip_pivot_growth,                                                                   \
-         FLOATTYPE *rcond, FLOATTYPE *ferr, FLOATTYPE *berr,                                              \
-         SuperLUStat_t *stats, int *info, KEYTYPE) {                                                      \
-    PREFIX##mem_usage_t mem_usage;                                                                        \
-    PREFIX##gssvx(options, A, perm_c, perm_r, etree, equed, R, C, L,                                      \
-         U, work, lwork, B, X, recip_pivot_growth, rcond,                                                 \
-         ferr, berr, &mem_usage, stats, info);                                                            \
-    return mem_usage.for_lu; /* bytes used by the factor storage */                                       \
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+#if defined(SUPERLU_MAJOR_VERSION) && (SUPERLU_MAJOR_VERSION >= 5)
+#define DECL_GSSVX(PREFIX, FLOATTYPE, KEYTYPE)                                                                         \
+  extern "C" {                                                                                                         \
+  extern void PREFIX##gssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *, char *, FLOATTYPE *, FLOATTYPE *, \
+                            SuperMatrix *, SuperMatrix *, void *, int, SuperMatrix *, SuperMatrix *, FLOATTYPE *,      \
+                            FLOATTYPE *, FLOATTYPE *, FLOATTYPE *, GlobalLU_t *, mem_usage_t *, SuperLUStat_t *,       \
+                            int *);                                                                                    \
+  }                                                                                                                    \
+  inline float SuperLU_gssvx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, int *etree,         \
+                             char *equed, FLOATTYPE *R, FLOATTYPE *C, SuperMatrix *L, SuperMatrix *U, void *work,      \
+                             int lwork, SuperMatrix *B, SuperMatrix *X, FLOATTYPE *recip_pivot_growth,                 \
+                             FLOATTYPE *rcond, FLOATTYPE *ferr, FLOATTYPE *berr, SuperLUStat_t *stats, int *info,      \
+                             KEYTYPE) {                                                                                \
+    mem_usage_t mem_usage;                                                                                             \
+    GlobalLU_t gLU;                                                                                                    \
+    PREFIX##gssvx(options, A, perm_c, perm_r, etree, equed, R, C, L, U, work, lwork, B, X, recip_pivot_growth, rcond,  \
+                  ferr, berr, &gLU, &mem_usage, stats, info);                                                          \
+    return mem_usage.for_lu; /* bytes used by the factor storage */                                                    \
   }
+#else  // version < 5.0
+#define DECL_GSSVX(PREFIX, FLOATTYPE, KEYTYPE)                                                                         \
+  extern "C" {                                                                                                         \
+  extern void PREFIX##gssvx(superlu_options_t *, SuperMatrix *, int *, int *, int *, char *, FLOATTYPE *, FLOATTYPE *, \
+                            SuperMatrix *, SuperMatrix *, void *, int, SuperMatrix *, SuperMatrix *, FLOATTYPE *,      \
+                            FLOATTYPE *, FLOATTYPE *, FLOATTYPE *, mem_usage_t *, SuperLUStat_t *, int *);             \
+  }                                                                                                                    \
+  inline float SuperLU_gssvx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, int *etree,         \
+                             char *equed, FLOATTYPE *R, FLOATTYPE *C, SuperMatrix *L, SuperMatrix *U, void *work,      \
+                             int lwork, SuperMatrix *B, SuperMatrix *X, FLOATTYPE *recip_pivot_growth,                 \
+                             FLOATTYPE *rcond, FLOATTYPE *ferr, FLOATTYPE *berr, SuperLUStat_t *stats, int *info,      \
+                             KEYTYPE) {                                                                                \
+    mem_usage_t mem_usage;                                                                                             \
+    PREFIX##gssvx(options, A, perm_c, perm_r, etree, equed, R, C, L, U, work, lwork, B, X, recip_pivot_growth, rcond,  \
+                  ferr, berr, &mem_usage, stats, info);                                                                \
+    return mem_usage.for_lu; /* bytes used by the factor storage */                                                    \
+  }
+#endif
 
-DECL_GSSVX(s,float,float)
-DECL_GSSVX(c,float,std::complex<float>)
-DECL_GSSVX(d,double,double)
-DECL_GSSVX(z,double,std::complex<double>)
+DECL_GSSVX(s, float, float)
+DECL_GSSVX(c, float, std::complex<float>)
+DECL_GSSVX(d, double, double)
+DECL_GSSVX(z, double, std::complex<double>)
 
 #ifdef MILU_ALPHA
 #define EIGEN_SUPERLU_HAS_ILU
@@ -48,583 +65,519 @@ DECL_GSSVX(z,double,std::complex<double>)
 #ifdef EIGEN_SUPERLU_HAS_ILU
 
 // similarly for the incomplete factorization using gsisx
-#define DECL_GSISX(PREFIX,FLOATTYPE,KEYTYPE)                                                    \
-    extern "C" {                                                                                \
-      extern void PREFIX##gsisx(superlu_options_t *, SuperMatrix *, int *, int *, int *,        \
-                         char *, FLOATTYPE *, FLOATTYPE *, SuperMatrix *, SuperMatrix *,        \
-                         void *, int, SuperMatrix *, SuperMatrix *, FLOATTYPE *, FLOATTYPE *,   \
-                         PREFIX##mem_usage_t *, SuperLUStat_t *, int *);                        \
-    }                                                                                           \
-    inline float SuperLU_gsisx(superlu_options_t *options, SuperMatrix *A,                      \
-         int *perm_c, int *perm_r, int *etree, char *equed,                                     \
-         FLOATTYPE *R, FLOATTYPE *C, SuperMatrix *L,                                            \
-         SuperMatrix *U, void *work, int lwork,                                                 \
-         SuperMatrix *B, SuperMatrix *X,                                                        \
-         FLOATTYPE *recip_pivot_growth,                                                         \
-         FLOATTYPE *rcond,                                                                      \
-         SuperLUStat_t *stats, int *info, KEYTYPE) {                                            \
-    PREFIX##mem_usage_t mem_usage;                                                              \
-    PREFIX##gsisx(options, A, perm_c, perm_r, etree, equed, R, C, L,                            \
-         U, work, lwork, B, X, recip_pivot_growth, rcond,                                       \
-         &mem_usage, stats, info);                                                              \
-    return mem_usage.for_lu; /* bytes used by the factor storage */                             \
+#if defined(SUPERLU_MAJOR_VERSION) && (SUPERLU_MAJOR_VERSION >= 5)
+#define DECL_GSISX(PREFIX, FLOATTYPE, KEYTYPE)                                                                         \
+  extern "C" {                                                                                                         \
+  extern void PREFIX##gsisx(superlu_options_t *, SuperMatrix *, int *, int *, int *, char *, FLOATTYPE *, FLOATTYPE *, \
+                            SuperMatrix *, SuperMatrix *, void *, int, SuperMatrix *, SuperMatrix *, FLOATTYPE *,      \
+                            FLOATTYPE *, GlobalLU_t *, mem_usage_t *, SuperLUStat_t *, int *);                         \
+  }                                                                                                                    \
+  inline float SuperLU_gsisx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, int *etree,         \
+                             char *equed, FLOATTYPE *R, FLOATTYPE *C, SuperMatrix *L, SuperMatrix *U, void *work,      \
+                             int lwork, SuperMatrix *B, SuperMatrix *X, FLOATTYPE *recip_pivot_growth,                 \
+                             FLOATTYPE *rcond, SuperLUStat_t *stats, int *info, KEYTYPE) {                             \
+    mem_usage_t mem_usage;                                                                                             \
+    GlobalLU_t gLU;                                                                                                    \
+    PREFIX##gsisx(options, A, perm_c, perm_r, etree, equed, R, C, L, U, work, lwork, B, X, recip_pivot_growth, rcond,  \
+                  &gLU, &mem_usage, stats, info);                                                                      \
+    return mem_usage.for_lu; /* bytes used by the factor storage */                                                    \
+  }
+#else  // version < 5.0
+#define DECL_GSISX(PREFIX, FLOATTYPE, KEYTYPE)                                                                         \
+  extern "C" {                                                                                                         \
+  extern void PREFIX##gsisx(superlu_options_t *, SuperMatrix *, int *, int *, int *, char *, FLOATTYPE *, FLOATTYPE *, \
+                            SuperMatrix *, SuperMatrix *, void *, int, SuperMatrix *, SuperMatrix *, FLOATTYPE *,      \
+                            FLOATTYPE *, mem_usage_t *, SuperLUStat_t *, int *);                                       \
+  }                                                                                                                    \
+  inline float SuperLU_gsisx(superlu_options_t *options, SuperMatrix *A, int *perm_c, int *perm_r, int *etree,         \
+                             char *equed, FLOATTYPE *R, FLOATTYPE *C, SuperMatrix *L, SuperMatrix *U, void *work,      \
+                             int lwork, SuperMatrix *B, SuperMatrix *X, FLOATTYPE *recip_pivot_growth,                 \
+                             FLOATTYPE *rcond, SuperLUStat_t *stats, int *info, KEYTYPE) {                             \
+    mem_usage_t mem_usage;                                                                                             \
+    PREFIX##gsisx(options, A, perm_c, perm_r, etree, equed, R, C, L, U, work, lwork, B, X, recip_pivot_growth, rcond,  \
+                  &mem_usage, stats, info);                                                                            \
+    return mem_usage.for_lu; /* bytes used by the factor storage */                                                    \
   }
+#endif
 
-DECL_GSISX(s,float,float)
-DECL_GSISX(c,float,std::complex<float>)
-DECL_GSISX(d,double,double)
-DECL_GSISX(z,double,std::complex<double>)
+DECL_GSISX(s, float, float)
+DECL_GSISX(c, float, std::complex<float>)
+DECL_GSISX(d, double, double)
+DECL_GSISX(z, double, std::complex<double>)
 
 #endif
 
-template<typename MatrixType>
+template <typename MatrixType>
 struct SluMatrixMapHelper;
 
 /** \internal
-  *
-  * A wrapper class for SuperLU matrices. It supports only compressed sparse matrices
-  * and dense matrices. Supernodal and other fancy format are not supported by this wrapper.
-  *
-  * This wrapper class mainly aims to avoids the need of dynamic allocation of the storage structure.
-  */
-struct SluMatrix : SuperMatrix
-{
-  SluMatrix()
-  {
-    Store = &storage;
-  }
-
-  SluMatrix(const SluMatrix& other)
-    : SuperMatrix(other)
-  {
+ *
+ * A wrapper class for SuperLU matrices. It supports only compressed sparse matrices
+ * and dense matrices. Supernodal and other fancy format are not supported by this wrapper.
+ *
+ * This wrapper class mainly aims to avoids the need of dynamic allocation of the storage structure.
+ */
+struct SluMatrix : SuperMatrix {
+  SluMatrix() { Store = &storage; }
+
+  SluMatrix(const SluMatrix &other) : SuperMatrix(other) {
     Store = &storage;
     storage = other.storage;
   }
 
-  SluMatrix& operator=(const SluMatrix& other)
-  {
-    SuperMatrix::operator=(static_cast<const SuperMatrix&>(other));
+  SluMatrix &operator=(const SluMatrix &other) {
+    SuperMatrix::operator=(static_cast<const SuperMatrix &>(other));
     Store = &storage;
     storage = other.storage;
     return *this;
   }
 
-  struct
-  {
-    union {int nnz;int lda;};
+  struct {
+    union {
+      int nnz;
+      int lda;
+    };
     void *values;
     int *innerInd;
     int *outerInd;
   } storage;
 
-  void setStorageType(Stype_t t)
-  {
+  void setStorageType(Stype_t t) {
     Stype = t;
-    if (t==SLU_NC || t==SLU_NR || t==SLU_DN)
+    if (t == SLU_NC || t == SLU_NR || t == SLU_DN)
       Store = &storage;
-    else
-    {
+    else {
       eigen_assert(false && "storage type not supported");
       Store = 0;
     }
   }
 
-  template<typename Scalar>
-  void setScalarType()
-  {
-    if (internal::is_same<Scalar,float>::value)
+  template <typename Scalar>
+  void setScalarType() {
+    if (internal::is_same<Scalar, float>::value)
       Dtype = SLU_S;
-    else if (internal::is_same<Scalar,double>::value)
+    else if (internal::is_same<Scalar, double>::value)
       Dtype = SLU_D;
-    else if (internal::is_same<Scalar,std::complex<float> >::value)
+    else if (internal::is_same<Scalar, std::complex<float> >::value)
       Dtype = SLU_C;
-    else if (internal::is_same<Scalar,std::complex<double> >::value)
+    else if (internal::is_same<Scalar, std::complex<double> >::value)
       Dtype = SLU_Z;
-    else
-    {
+    else {
       eigen_assert(false && "Scalar type not supported by SuperLU");
     }
   }
 
-  template<typename MatrixType>
-  static SluMatrix Map(MatrixBase<MatrixType>& _mat)
-  {
-    MatrixType& mat(_mat.derived());
-    eigen_assert( ((MatrixType::Flags&RowMajorBit)!=RowMajorBit) && "row-major dense matrices are not supported by SuperLU");
+  template <typename MatrixType>
+  static SluMatrix Map(MatrixBase<MatrixType> &_mat) {
+    MatrixType &mat(_mat.derived());
+    eigen_assert(((MatrixType::Flags & RowMajorBit) != RowMajorBit) &&
+                 "row-major dense matrices are not supported by SuperLU");
     SluMatrix res;
     res.setStorageType(SLU_DN);
     res.setScalarType<typename MatrixType::Scalar>();
-    res.Mtype     = SLU_GE;
+    res.Mtype = SLU_GE;
 
-    res.nrow      = mat.rows();
-    res.ncol      = mat.cols();
+    res.nrow = internal::convert_index<int>(mat.rows());
+    res.ncol = internal::convert_index<int>(mat.cols());
 
-    res.storage.lda       = MatrixType::IsVectorAtCompileTime ? mat.size() : mat.outerStride();
-    res.storage.values    = (void*)(mat.data());
+    res.storage.lda = internal::convert_index<int>(MatrixType::IsVectorAtCompileTime ? mat.size() : mat.outerStride());
+    res.storage.values = (void *)(mat.data());
     return res;
   }
 
-  template<typename MatrixType>
-  static SluMatrix Map(SparseMatrixBase<MatrixType>& mat)
-  {
+  template <typename MatrixType>
+  static SluMatrix Map(SparseMatrixBase<MatrixType> &a_mat) {
+    MatrixType &mat(a_mat.derived());
     SluMatrix res;
-    if ((MatrixType::Flags&RowMajorBit)==RowMajorBit)
-    {
+    if ((MatrixType::Flags & RowMajorBit) == RowMajorBit) {
       res.setStorageType(SLU_NR);
-      res.nrow      = mat.cols();
-      res.ncol      = mat.rows();
-    }
-    else
-    {
+      res.nrow = internal::convert_index<int>(mat.cols());
+      res.ncol = internal::convert_index<int>(mat.rows());
+    } else {
       res.setStorageType(SLU_NC);
-      res.nrow      = mat.rows();
-      res.ncol      = mat.cols();
+      res.nrow = internal::convert_index<int>(mat.rows());
+      res.ncol = internal::convert_index<int>(mat.cols());
     }
 
-    res.Mtype       = SLU_GE;
+    res.Mtype = SLU_GE;
 
-    res.storage.nnz       = mat.nonZeros();
-    res.storage.values    = mat.derived().valuePtr();
-    res.storage.innerInd  = mat.derived().innerIndexPtr();
-    res.storage.outerInd  = mat.derived().outerIndexPtr();
+    res.storage.nnz = internal::convert_index<int>(mat.nonZeros());
+    res.storage.values = mat.valuePtr();
+    res.storage.innerInd = mat.innerIndexPtr();
+    res.storage.outerInd = mat.outerIndexPtr();
 
     res.setScalarType<typename MatrixType::Scalar>();
 
     // FIXME the following is not very accurate
-    if (MatrixType::Flags & Upper)
-      res.Mtype = SLU_TRU;
-    if (MatrixType::Flags & Lower)
-      res.Mtype = SLU_TRL;
+    if (int(MatrixType::Flags) & int(Upper)) res.Mtype = SLU_TRU;
+    if (int(MatrixType::Flags) & int(Lower)) res.Mtype = SLU_TRL;
 
-    eigen_assert(((MatrixType::Flags & SelfAdjoint)==0) && "SelfAdjoint matrix shape not supported by SuperLU");
+    eigen_assert(((int(MatrixType::Flags) & int(SelfAdjoint)) == 0) &&
+                 "SelfAdjoint matrix shape not supported by SuperLU");
 
     return res;
   }
 };
 
-template<typename Scalar, int Rows, int Cols, int Options, int MRows, int MCols>
-struct SluMatrixMapHelper<Matrix<Scalar,Rows,Cols,Options,MRows,MCols> >
-{
-  typedef Matrix<Scalar,Rows,Cols,Options,MRows,MCols> MatrixType;
-  static void run(MatrixType& mat, SluMatrix& res)
-  {
-    eigen_assert( ((Options&RowMajor)!=RowMajor) && "row-major dense matrices is not supported by SuperLU");
+template <typename Scalar, int Rows, int Cols, int Options, int MRows, int MCols>
+struct SluMatrixMapHelper<Matrix<Scalar, Rows, Cols, Options, MRows, MCols> > {
+  typedef Matrix<Scalar, Rows, Cols, Options, MRows, MCols> MatrixType;
+  static void run(MatrixType &mat, SluMatrix &res) {
+    eigen_assert(((Options & RowMajor) != RowMajor) && "row-major dense matrices is not supported by SuperLU");
     res.setStorageType(SLU_DN);
     res.setScalarType<Scalar>();
-    res.Mtype     = SLU_GE;
+    res.Mtype = SLU_GE;
 
-    res.nrow      = mat.rows();
-    res.ncol      = mat.cols();
+    res.nrow = mat.rows();
+    res.ncol = mat.cols();
 
-    res.storage.lda       = mat.outerStride();
-    res.storage.values    = mat.data();
+    res.storage.lda = mat.outerStride();
+    res.storage.values = mat.data();
   }
 };
 
-template<typename Derived>
-struct SluMatrixMapHelper<SparseMatrixBase<Derived> >
-{
+template <typename Derived>
+struct SluMatrixMapHelper<SparseMatrixBase<Derived> > {
   typedef Derived MatrixType;
-  static void run(MatrixType& mat, SluMatrix& res)
-  {
-    if ((MatrixType::Flags&RowMajorBit)==RowMajorBit)
-    {
+  static void run(MatrixType &mat, SluMatrix &res) {
+    if ((MatrixType::Flags & RowMajorBit) == RowMajorBit) {
       res.setStorageType(SLU_NR);
-      res.nrow      = mat.cols();
-      res.ncol      = mat.rows();
-    }
-    else
-    {
+      res.nrow = mat.cols();
+      res.ncol = mat.rows();
+    } else {
       res.setStorageType(SLU_NC);
-      res.nrow      = mat.rows();
-      res.ncol      = mat.cols();
+      res.nrow = mat.rows();
+      res.ncol = mat.cols();
     }
 
-    res.Mtype       = SLU_GE;
+    res.Mtype = SLU_GE;
 
-    res.storage.nnz       = mat.nonZeros();
-    res.storage.values    = mat.valuePtr();
-    res.storage.innerInd  = mat.innerIndexPtr();
-    res.storage.outerInd  = mat.outerIndexPtr();
+    res.storage.nnz = mat.nonZeros();
+    res.storage.values = mat.valuePtr();
+    res.storage.innerInd = mat.innerIndexPtr();
+    res.storage.outerInd = mat.outerIndexPtr();
 
     res.setScalarType<typename MatrixType::Scalar>();
 
     // FIXME the following is not very accurate
-    if (MatrixType::Flags & Upper)
-      res.Mtype = SLU_TRU;
-    if (MatrixType::Flags & Lower)
-      res.Mtype = SLU_TRL;
+    if (MatrixType::Flags & Upper) res.Mtype = SLU_TRU;
+    if (MatrixType::Flags & Lower) res.Mtype = SLU_TRL;
 
-    eigen_assert(((MatrixType::Flags & SelfAdjoint)==0) && "SelfAdjoint matrix shape not supported by SuperLU");
+    eigen_assert(((MatrixType::Flags & SelfAdjoint) == 0) && "SelfAdjoint matrix shape not supported by SuperLU");
   }
 };
 
 namespace internal {
 
-template<typename MatrixType>
-SluMatrix asSluMatrix(MatrixType& mat)
-{
+template <typename MatrixType>
+SluMatrix asSluMatrix(MatrixType &mat) {
   return SluMatrix::Map(mat);
 }
 
 /** View a Super LU matrix as an Eigen expression */
-template<typename Scalar, int Flags, typename Index>
-MappedSparseMatrix<Scalar,Flags,Index> map_superlu(SluMatrix& sluMat)
-{
-  eigen_assert((Flags&RowMajor)==RowMajor && sluMat.Stype == SLU_NR
-         || (Flags&ColMajor)==ColMajor && sluMat.Stype == SLU_NC);
+template <typename Scalar, int Flags, typename Index>
+Map<SparseMatrix<Scalar, Flags, Index> > map_superlu(SluMatrix &sluMat) {
+  eigen_assert(((Flags & RowMajor) == RowMajor && sluMat.Stype == SLU_NR) ||
+               ((Flags & ColMajor) == ColMajor && sluMat.Stype == SLU_NC));
 
-  Index outerSize = (Flags&RowMajor)==RowMajor ? sluMat.ncol : sluMat.nrow;
+  Index outerSize = (Flags & RowMajor) == RowMajor ? sluMat.ncol : sluMat.nrow;
 
-  return MappedSparseMatrix<Scalar,Flags,Index>(
-    sluMat.nrow, sluMat.ncol, sluMat.storage.outerInd[outerSize],
-    sluMat.storage.outerInd, sluMat.storage.innerInd, reinterpret_cast<Scalar*>(sluMat.storage.values) );
+  return Map<SparseMatrix<Scalar, Flags, Index> >(sluMat.nrow, sluMat.ncol, sluMat.storage.outerInd[outerSize],
+                                                  sluMat.storage.outerInd, sluMat.storage.innerInd,
+                                                  reinterpret_cast<Scalar *>(sluMat.storage.values));
 }
 
-} // end namespace internal
+}  // end namespace internal
 
 /** \ingroup SuperLUSupport_Module
-  * \class SuperLUBase
-  * \brief The base class for the direct and incomplete LU factorization of SuperLU
-  */
-template<typename _MatrixType, typename Derived>
-class SuperLUBase : internal::noncopyable
-{
-  public:
-    typedef _MatrixType MatrixType;
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef Matrix<Scalar,Dynamic,1> Vector;
-    typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
-    typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;    
-    typedef SparseMatrix<Scalar> LUMatrixType;
-
-  public:
-
-    SuperLUBase() {}
-
-    ~SuperLUBase()
-    {
-      clearFactors();
-    }
-    
-    Derived& derived() { return *static_cast<Derived*>(this); }
-    const Derived& derived() const { return *static_cast<const Derived*>(this); }
-    
-    inline Index rows() const { return m_matrix.rows(); }
-    inline Index cols() const { return m_matrix.cols(); }
-    
-    /** \returns a reference to the Super LU option object to configure the  Super LU algorithms. */
-    inline superlu_options_t& options() { return m_sluOptions; }
-    
-    /** \brief Reports whether previous computation was successful.
-      *
-      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the matrix.appears to be negative.
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
-      return m_info;
-    }
+ * \class SuperLUBase
+ * \brief The base class for the direct and incomplete LU factorization of SuperLU
+ */
+template <typename MatrixType_, typename Derived>
+class SuperLUBase : public SparseSolverBase<Derived> {
+ protected:
+  typedef SparseSolverBase<Derived> Base;
+  using Base::derived;
+  using Base::m_isInitialized;
+
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef Matrix<Scalar, Dynamic, 1> Vector;
+  typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
+  typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
+  typedef Map<PermutationMatrix<Dynamic, Dynamic, int> > PermutationMap;
+  typedef SparseMatrix<Scalar> LUMatrixType;
+  enum { ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime };
+
+ public:
+  SuperLUBase() {}
+
+  ~SuperLUBase() { clearFactors(); }
+
+  inline Index rows() const { return m_matrix.rows(); }
+  inline Index cols() const { return m_matrix.cols(); }
+
+  /** \returns a reference to the Super LU option object to configure the  Super LU algorithms. */
+  inline superlu_options_t &options() { return m_sluOptions; }
+
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful,
+   *          \c NumericalIssue if the matrix.appears to be negative.
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+    return m_info;
+  }
 
-    /** Computes the sparse Cholesky decomposition of \a matrix */
-    void compute(const MatrixType& matrix)
-    {
-      derived().analyzePattern(matrix);
-      derived().factorize(matrix);
-    }
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<SuperLUBase, Rhs> solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "SuperLU is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "SuperLU::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<SuperLUBase, Rhs>(*this, b.derived());
-    }
-    
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<SuperLUBase, Rhs> solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "SuperLU is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "SuperLU::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<SuperLUBase, Rhs>(*this, b.derived());
-    }
-    
-    /** Performs a symbolic decomposition on the sparcity of \a matrix.
-      *
-      * This function is particularly useful when solving for several problems having the same structure.
-      * 
-      * \sa factorize()
-      */
-    void analyzePattern(const MatrixType& /*matrix*/)
-    {
-      m_isInitialized = true;
-      m_info = Success;
-      m_analysisIsOk = true;
-      m_factorizationIsOk = false;
-    }
-    
-    template<typename Stream>
-    void dumpMemory(Stream& /*s*/)
-    {}
-    
-  protected:
-    
-    void initFactorization(const MatrixType& a)
-    {
-      set_default_options(&this->m_sluOptions);
-      
-      const int size = a.rows();
-      m_matrix = a;
-
-      m_sluA = internal::asSluMatrix(m_matrix);
-      clearFactors();
-
-      m_p.resize(size);
-      m_q.resize(size);
-      m_sluRscale.resize(size);
-      m_sluCscale.resize(size);
-      m_sluEtree.resize(size);
-
-      // set empty B and X
-      m_sluB.setStorageType(SLU_DN);
-      m_sluB.setScalarType<Scalar>();
-      m_sluB.Mtype          = SLU_GE;
-      m_sluB.storage.values = 0;
-      m_sluB.nrow           = 0;
-      m_sluB.ncol           = 0;
-      m_sluB.storage.lda    = size;
-      m_sluX                = m_sluB;
-      
-      m_extractedDataAreDirty = true;
-    }
-    
-    void init()
-    {
-      m_info = InvalidInput;
-      m_isInitialized = false;
-      m_sluL.Store = 0;
-      m_sluU.Store = 0;
-    }
-    
-    void extractData() const;
+  /** Computes the sparse Cholesky decomposition of \a matrix */
+  void compute(const MatrixType &matrix) {
+    derived().analyzePattern(matrix);
+    derived().factorize(matrix);
+  }
 
-    void clearFactors()
-    {
-      if(m_sluL.Store)
-        Destroy_SuperNode_Matrix(&m_sluL);
-      if(m_sluU.Store)
-        Destroy_CompCol_Matrix(&m_sluU);
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
+   *
+   * This function is particularly useful when solving for several problems having the same structure.
+   *
+   * \sa factorize()
+   */
+  void analyzePattern(const MatrixType & /*matrix*/) {
+    m_isInitialized = true;
+    m_info = Success;
+    m_analysisIsOk = true;
+    m_factorizationIsOk = false;
+  }
 
-      m_sluL.Store = 0;
-      m_sluU.Store = 0;
+  template <typename Stream>
+  void dumpMemory(Stream & /*s*/) {}
 
-      memset(&m_sluL,0,sizeof m_sluL);
-      memset(&m_sluU,0,sizeof m_sluU);
-    }
+ protected:
+  void initFactorization(const MatrixType &a) {
+    set_default_options(&this->m_sluOptions);
 
-    // cached data to reduce reallocation, etc.
-    mutable LUMatrixType m_l;
-    mutable LUMatrixType m_u;
-    mutable IntColVectorType m_p;
-    mutable IntRowVectorType m_q;
-
-    mutable LUMatrixType m_matrix;  // copy of the factorized matrix
-    mutable SluMatrix m_sluA;
-    mutable SuperMatrix m_sluL, m_sluU;
-    mutable SluMatrix m_sluB, m_sluX;
-    mutable SuperLUStat_t m_sluStat;
-    mutable superlu_options_t m_sluOptions;
-    mutable std::vector<int> m_sluEtree;
-    mutable Matrix<RealScalar,Dynamic,1> m_sluRscale, m_sluCscale;
-    mutable Matrix<RealScalar,Dynamic,1> m_sluFerr, m_sluBerr;
-    mutable char m_sluEqued;
-
-    mutable ComputationInfo m_info;
-    bool m_isInitialized;
-    int m_factorizationIsOk;
-    int m_analysisIsOk;
-    mutable bool m_extractedDataAreDirty;
-    
-  private:
-    SuperLUBase(SuperLUBase& ) { }
-};
+    const Index size = a.rows();
+    m_matrix = a;
+
+    m_sluA = internal::asSluMatrix(m_matrix);
+    clearFactors();
 
+    m_p.resize(size);
+    m_q.resize(size);
+    m_sluRscale.resize(size);
+    m_sluCscale.resize(size);
+    m_sluEtree.resize(size);
+
+    // set empty B and X
+    m_sluB.setStorageType(SLU_DN);
+    m_sluB.setScalarType<Scalar>();
+    m_sluB.Mtype = SLU_GE;
+    m_sluB.storage.values = 0;
+    m_sluB.nrow = 0;
+    m_sluB.ncol = 0;
+    m_sluB.storage.lda = internal::convert_index<int>(size);
+    m_sluX = m_sluB;
+
+    m_extractedDataAreDirty = true;
+  }
+
+  void init() {
+    m_info = InvalidInput;
+    m_isInitialized = false;
+    m_sluL.Store = 0;
+    m_sluU.Store = 0;
+  }
+
+  void extractData() const;
+
+  void clearFactors() {
+    if (m_sluL.Store) Destroy_SuperNode_Matrix(&m_sluL);
+    if (m_sluU.Store) Destroy_CompCol_Matrix(&m_sluU);
+
+    m_sluL.Store = 0;
+    m_sluU.Store = 0;
+
+    memset(&m_sluL, 0, sizeof m_sluL);
+    memset(&m_sluU, 0, sizeof m_sluU);
+  }
+
+  // cached data to reduce reallocation, etc.
+  mutable LUMatrixType m_l;
+  mutable LUMatrixType m_u;
+  mutable IntColVectorType m_p;
+  mutable IntRowVectorType m_q;
+
+  mutable LUMatrixType m_matrix;  // copy of the factorized matrix
+  mutable SluMatrix m_sluA;
+  mutable SuperMatrix m_sluL, m_sluU;
+  mutable SluMatrix m_sluB, m_sluX;
+  mutable SuperLUStat_t m_sluStat;
+  mutable superlu_options_t m_sluOptions;
+  mutable std::vector<int> m_sluEtree;
+  mutable Matrix<RealScalar, Dynamic, 1> m_sluRscale, m_sluCscale;
+  mutable Matrix<RealScalar, Dynamic, 1> m_sluFerr, m_sluBerr;
+  mutable char m_sluEqued;
+
+  mutable ComputationInfo m_info;
+  int m_factorizationIsOk;
+  int m_analysisIsOk;
+  mutable bool m_extractedDataAreDirty;
+
+ private:
+  SuperLUBase(SuperLUBase &) {}
+};
 
 /** \ingroup SuperLUSupport_Module
-  * \class SuperLU
-  * \brief A sparse direct LU factorization and solver based on the SuperLU library
-  *
-  * This class allows to solve for A.X = B sparse linear problems via a direct LU factorization
-  * using the SuperLU library. The sparse matrix A must be squared and invertible. The vectors or matrices
-  * X and B can be either dense or sparse.
-  *
-  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
-  *
-  * \sa \ref TutorialSparseDirectSolvers
-  */
-template<typename _MatrixType>
-class SuperLU : public SuperLUBase<_MatrixType,SuperLU<_MatrixType> >
-{
-  public:
-    typedef SuperLUBase<_MatrixType,SuperLU> Base;
-    typedef _MatrixType MatrixType;
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::RealScalar RealScalar;
-    typedef typename Base::Index Index;
-    typedef typename Base::IntRowVectorType IntRowVectorType;
-    typedef typename Base::IntColVectorType IntColVectorType;    
-    typedef typename Base::LUMatrixType LUMatrixType;
-    typedef TriangularView<LUMatrixType, Lower|UnitDiag>  LMatrixType;
-    typedef TriangularView<LUMatrixType,  Upper>           UMatrixType;
-
-  public:
-
-    SuperLU() : Base() { init(); }
-
-    SuperLU(const MatrixType& matrix) : Base()
-    {
-      init();
-      Base::compute(matrix);
-    }
+ * \class SuperLU
+ * \brief A sparse direct LU factorization and solver based on the SuperLU library
+ *
+ * This class allows to solve for A.X = B sparse linear problems via a direct LU factorization
+ * using the SuperLU library. The sparse matrix A must be squared and invertible. The vectors or matrices
+ * X and B can be either dense or sparse.
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ *
+ * \warning This class is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported.
+ *
+ * \implsparsesolverconcept
+ *
+ * \sa \ref TutorialSparseSolverConcept, class SparseLU
+ */
+template <typename MatrixType_>
+class SuperLU : public SuperLUBase<MatrixType_, SuperLU<MatrixType_> > {
+ public:
+  typedef SuperLUBase<MatrixType_, SuperLU> Base;
+  typedef MatrixType_ MatrixType;
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::RealScalar RealScalar;
+  typedef typename Base::StorageIndex StorageIndex;
+  typedef typename Base::IntRowVectorType IntRowVectorType;
+  typedef typename Base::IntColVectorType IntColVectorType;
+  typedef typename Base::PermutationMap PermutationMap;
+  typedef typename Base::LUMatrixType LUMatrixType;
+  typedef TriangularView<LUMatrixType, Lower | UnitDiag> LMatrixType;
+  typedef TriangularView<LUMatrixType, Upper> UMatrixType;
+
+ public:
+  using Base::_solve_impl;
+
+  SuperLU() : Base() { init(); }
+
+  explicit SuperLU(const MatrixType &matrix) : Base() {
+    init();
+    Base::compute(matrix);
+  }
 
-    ~SuperLU()
-    {
-    }
-    
-    /** Performs a symbolic decomposition on the sparcity of \a matrix.
-      *
-      * This function is particularly useful when solving for several problems having the same structure.
-      * 
-      * \sa factorize()
-      */
-    void analyzePattern(const MatrixType& matrix)
-    {
-      m_info = InvalidInput;
-      m_isInitialized = false;
-      Base::analyzePattern(matrix);
-    }
-    
-    /** Performs a numeric decomposition of \a matrix
-      *
-      * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
-      *
-      * \sa analyzePattern()
-      */
-    void factorize(const MatrixType& matrix);
-    
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** \internal */
-    template<typename Rhs,typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
-    #endif // EIGEN_PARSED_BY_DOXYGEN
-    
-    inline const LMatrixType& matrixL() const
-    {
-      if (m_extractedDataAreDirty) this->extractData();
-      return m_l;
-    }
+  ~SuperLU() {}
 
-    inline const UMatrixType& matrixU() const
-    {
-      if (m_extractedDataAreDirty) this->extractData();
-      return m_u;
-    }
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
+   *
+   * This function is particularly useful when solving for several problems having the same structure.
+   *
+   * \sa factorize()
+   */
+  void analyzePattern(const MatrixType &matrix) {
+    m_info = InvalidInput;
+    m_isInitialized = false;
+    Base::analyzePattern(matrix);
+  }
 
-    inline const IntColVectorType& permutationP() const
-    {
-      if (m_extractedDataAreDirty) this->extractData();
-      return m_p;
-    }
+  /** Performs a numeric decomposition of \a matrix
+   *
+   * The given matrix must have the same sparsity than the matrix on which the symbolic decomposition has been
+   * performed.
+   *
+   * \sa analyzePattern()
+   */
+  void factorize(const MatrixType &matrix);
+
+  /** \internal */
+  template <typename Rhs, typename Dest>
+  void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
+
+  inline const LMatrixType &matrixL() const {
+    if (m_extractedDataAreDirty) this->extractData();
+    return m_l;
+  }
 
-    inline const IntRowVectorType& permutationQ() const
-    {
-      if (m_extractedDataAreDirty) this->extractData();
-      return m_q;
-    }
-    
-    Scalar determinant() const;
-    
-  protected:
-    
-    using Base::m_matrix;
-    using Base::m_sluOptions;
-    using Base::m_sluA;
-    using Base::m_sluB;
-    using Base::m_sluX;
-    using Base::m_p;
-    using Base::m_q;
-    using Base::m_sluEtree;
-    using Base::m_sluEqued;
-    using Base::m_sluRscale;
-    using Base::m_sluCscale;
-    using Base::m_sluL;
-    using Base::m_sluU;
-    using Base::m_sluStat;
-    using Base::m_sluFerr;
-    using Base::m_sluBerr;
-    using Base::m_l;
-    using Base::m_u;
-    
-    using Base::m_analysisIsOk;
-    using Base::m_factorizationIsOk;
-    using Base::m_extractedDataAreDirty;
-    using Base::m_isInitialized;
-    using Base::m_info;
-    
-    void init()
-    {
-      Base::init();
-      
-      set_default_options(&this->m_sluOptions);
-      m_sluOptions.PrintStat        = NO;
-      m_sluOptions.ConditionNumber  = NO;
-      m_sluOptions.Trans            = NOTRANS;
-      m_sluOptions.ColPerm          = COLAMD;
-    }
-    
-    
-  private:
-    SuperLU(SuperLU& ) { }
+  inline const UMatrixType &matrixU() const {
+    if (m_extractedDataAreDirty) this->extractData();
+    return m_u;
+  }
+
+  inline const IntColVectorType &permutationP() const {
+    if (m_extractedDataAreDirty) this->extractData();
+    return m_p;
+  }
+
+  inline const IntRowVectorType &permutationQ() const {
+    if (m_extractedDataAreDirty) this->extractData();
+    return m_q;
+  }
+
+  Scalar determinant() const;
+
+ protected:
+  using Base::m_l;
+  using Base::m_matrix;
+  using Base::m_p;
+  using Base::m_q;
+  using Base::m_sluA;
+  using Base::m_sluB;
+  using Base::m_sluBerr;
+  using Base::m_sluCscale;
+  using Base::m_sluEqued;
+  using Base::m_sluEtree;
+  using Base::m_sluFerr;
+  using Base::m_sluL;
+  using Base::m_sluOptions;
+  using Base::m_sluRscale;
+  using Base::m_sluStat;
+  using Base::m_sluU;
+  using Base::m_sluX;
+  using Base::m_u;
+
+  using Base::m_analysisIsOk;
+  using Base::m_extractedDataAreDirty;
+  using Base::m_factorizationIsOk;
+  using Base::m_info;
+  using Base::m_isInitialized;
+
+  void init() {
+    Base::init();
+
+    set_default_options(&this->m_sluOptions);
+    m_sluOptions.PrintStat = NO;
+    m_sluOptions.ConditionNumber = NO;
+    m_sluOptions.Trans = NOTRANS;
+    m_sluOptions.ColPerm = COLAMD;
+  }
+
+ private:
+  SuperLU(SuperLU &) {}
 };
 
-template<typename MatrixType>
-void SuperLU<MatrixType>::factorize(const MatrixType& a)
-{
+template <typename MatrixType>
+void SuperLU<MatrixType>::factorize(const MatrixType &a) {
   eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
-  if(!m_analysisIsOk)
-  {
+  if (!m_analysisIsOk) {
     m_info = InvalidInput;
     return;
   }
-  
+
   this->initFactorization(a);
-  
+
   m_sluOptions.ColPerm = COLAMD;
   int info = 0;
   RealScalar recip_pivot_growth, rcond;
   RealScalar ferr, berr;
 
   StatInit(&m_sluStat);
-  SuperLU_gssvx(&m_sluOptions, &m_sluA, m_q.data(), m_p.data(), &m_sluEtree[0],
-                &m_sluEqued, &m_sluRscale[0], &m_sluCscale[0],
-                &m_sluL, &m_sluU,
-                NULL, 0,
-                &m_sluB, &m_sluX,
-                &recip_pivot_growth, &rcond,
-                &ferr, &berr,
+  SuperLU_gssvx(&m_sluOptions, &m_sluA, m_q.data(), m_p.data(), &m_sluEtree[0], &m_sluEqued, &m_sluRscale[0],
+                &m_sluCscale[0], &m_sluL, &m_sluU, NULL, 0, &m_sluB, &m_sluX, &recip_pivot_growth, &rcond, &ferr, &berr,
                 &m_sluStat, &info, Scalar());
   StatFree(&m_sluStat);
 
@@ -635,48 +588,46 @@ void SuperLU<MatrixType>::factorize(const MatrixType& a)
   m_factorizationIsOk = true;
 }
 
-template<typename MatrixType>
-template<typename Rhs,typename Dest>
-void SuperLU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x) const
-{
-  eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()");
+template <typename MatrixType>
+template <typename Rhs, typename Dest>
+void SuperLU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const {
+  eigen_assert(m_factorizationIsOk &&
+               "The decomposition is not in a valid state for solving, you must first call either compute() or "
+               "analyzePattern()/factorize()");
 
-  const int size = m_matrix.rows();
-  const int rhsCols = b.cols();
-  eigen_assert(size==b.rows());
+  const Index rhsCols = b.cols();
+  eigen_assert(m_matrix.rows() == b.rows());
 
   m_sluOptions.Trans = NOTRANS;
   m_sluOptions.Fact = FACTORED;
   m_sluOptions.IterRefine = NOREFINE;
-  
 
   m_sluFerr.resize(rhsCols);
   m_sluBerr.resize(rhsCols);
-  m_sluB = SluMatrix::Map(b.const_cast_derived());
-  m_sluX = SluMatrix::Map(x.derived());
-  
+
+  Ref<const Matrix<typename Rhs::Scalar, Dynamic, Dynamic, ColMajor> > b_ref(b);
+  Ref<const Matrix<typename Dest::Scalar, Dynamic, Dynamic, ColMajor> > x_ref(x);
+
+  m_sluB = SluMatrix::Map(b_ref.const_cast_derived());
+  m_sluX = SluMatrix::Map(x_ref.const_cast_derived());
+
   typename Rhs::PlainObject b_cpy;
-  if(m_sluEqued!='N')
-  {
+  if (m_sluEqued != 'N') {
     b_cpy = b;
-    m_sluB = SluMatrix::Map(b_cpy.const_cast_derived());  
+    m_sluB = SluMatrix::Map(b_cpy.const_cast_derived());
   }
 
   StatInit(&m_sluStat);
   int info = 0;
   RealScalar recip_pivot_growth, rcond;
-  SuperLU_gssvx(&m_sluOptions, &m_sluA,
-                m_q.data(), m_p.data(),
-                &m_sluEtree[0], &m_sluEqued,
-                &m_sluRscale[0], &m_sluCscale[0],
-                &m_sluL, &m_sluU,
-                NULL, 0,
-                &m_sluB, &m_sluX,
-                &recip_pivot_growth, &rcond,
-                &m_sluFerr[0], &m_sluBerr[0],
-                &m_sluStat, &info, Scalar());
+  SuperLU_gssvx(&m_sluOptions, &m_sluA, m_q.data(), m_p.data(), &m_sluEtree[0], &m_sluEqued, &m_sluRscale[0],
+                &m_sluCscale[0], &m_sluL, &m_sluU, NULL, 0, &m_sluB, &m_sluX, &recip_pivot_growth, &rcond,
+                &m_sluFerr[0], &m_sluBerr[0], &m_sluStat, &info, Scalar());
   StatFree(&m_sluStat);
-  m_info = info==0 ? Success : NumericalIssue;
+
+  if (x.derived().data() != x_ref.data()) x = x_ref;
+
+  m_info = info == 0 ? Success : NumericalIssue;
 }
 
 // the code of this extractData() function has been adapted from the SuperLU's Matlab support code,
@@ -686,78 +637,70 @@ void SuperLU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x)
 //  THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY
 //  EXPRESSED OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
 //
-template<typename MatrixType, typename Derived>
-void SuperLUBase<MatrixType,Derived>::extractData() const
-{
-  eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for extracting factors, you must first call either compute() or analyzePattern()/factorize()");
-  if (m_extractedDataAreDirty)
-  {
-    int         upper;
-    int         fsupc, istart, nsupr;
-    int         lastl = 0, lastu = 0;
-    SCformat    *Lstore = static_cast<SCformat*>(m_sluL.Store);
-    NCformat    *Ustore = static_cast<NCformat*>(m_sluU.Store);
-    Scalar      *SNptr;
-
-    const int size = m_matrix.rows();
-    m_l.resize(size,size);
+template <typename MatrixType, typename Derived>
+void SuperLUBase<MatrixType, Derived>::extractData() const {
+  eigen_assert(m_factorizationIsOk &&
+               "The decomposition is not in a valid state for extracting factors, you must first call either compute() "
+               "or analyzePattern()/factorize()");
+  if (m_extractedDataAreDirty) {
+    int upper;
+    int fsupc, istart, nsupr;
+    int lastl = 0, lastu = 0;
+    SCformat *Lstore = static_cast<SCformat *>(m_sluL.Store);
+    NCformat *Ustore = static_cast<NCformat *>(m_sluU.Store);
+    Scalar *SNptr;
+
+    const Index size = m_matrix.rows();
+    m_l.resize(size, size);
     m_l.resizeNonZeros(Lstore->nnz);
-    m_u.resize(size,size);
+    m_u.resize(size, size);
     m_u.resizeNonZeros(Ustore->nnz);
 
-    int* Lcol = m_l.outerIndexPtr();
-    int* Lrow = m_l.innerIndexPtr();
-    Scalar* Lval = m_l.valuePtr();
+    int *Lcol = m_l.outerIndexPtr();
+    int *Lrow = m_l.innerIndexPtr();
+    Scalar *Lval = m_l.valuePtr();
 
-    int* Ucol = m_u.outerIndexPtr();
-    int* Urow = m_u.innerIndexPtr();
-    Scalar* Uval = m_u.valuePtr();
+    int *Ucol = m_u.outerIndexPtr();
+    int *Urow = m_u.innerIndexPtr();
+    Scalar *Uval = m_u.valuePtr();
 
     Ucol[0] = 0;
     Ucol[0] = 0;
 
     /* for each supernode */
-    for (int k = 0; k <= Lstore->nsuper; ++k)
-    {
-      fsupc   = L_FST_SUPC(k);
-      istart  = L_SUB_START(fsupc);
-      nsupr   = L_SUB_START(fsupc+1) - istart;
-      upper   = 1;
+    for (int k = 0; k <= Lstore->nsuper; ++k) {
+      fsupc = L_FST_SUPC(k);
+      istart = L_SUB_START(fsupc);
+      nsupr = L_SUB_START(fsupc + 1) - istart;
+      upper = 1;
 
       /* for each column in the supernode */
-      for (int j = fsupc; j < L_FST_SUPC(k+1); ++j)
-      {
-        SNptr = &((Scalar*)Lstore->nzval)[L_NZ_START(j)];
+      for (int j = fsupc; j < L_FST_SUPC(k + 1); ++j) {
+        SNptr = &((Scalar *)Lstore->nzval)[L_NZ_START(j)];
 
         /* Extract U */
-        for (int i = U_NZ_START(j); i < U_NZ_START(j+1); ++i)
-        {
-          Uval[lastu] = ((Scalar*)Ustore->nzval)[i];
+        for (int i = U_NZ_START(j); i < U_NZ_START(j + 1); ++i) {
+          Uval[lastu] = ((Scalar *)Ustore->nzval)[i];
           /* Matlab doesn't like explicit zero. */
-          if (Uval[lastu] != 0.0)
-            Urow[lastu++] = U_SUB(i);
+          if (Uval[lastu] != 0.0) Urow[lastu++] = U_SUB(i);
         }
-        for (int i = 0; i < upper; ++i)
-        {
+        for (int i = 0; i < upper; ++i) {
           /* upper triangle in the supernode */
           Uval[lastu] = SNptr[i];
           /* Matlab doesn't like explicit zero. */
-          if (Uval[lastu] != 0.0)
-            Urow[lastu++] = L_SUB(istart+i);
+          if (Uval[lastu] != 0.0) Urow[lastu++] = L_SUB(istart + i);
         }
-        Ucol[j+1] = lastu;
+        Ucol[j + 1] = lastu;
 
         /* Extract L */
         Lval[lastl] = 1.0; /* unit diagonal */
         Lrow[lastl++] = L_SUB(istart + upper - 1);
-        for (int i = upper; i < nsupr; ++i)
-        {
+        for (int i = upper; i < nsupr; ++i) {
           Lval[lastl] = SNptr[i];
           /* Matlab doesn't like explicit zero. */
-          if (Lval[lastl] != 0.0)
-            Lrow[lastl++] = L_SUB(istart+i);
+          if (Lval[lastl] != 0.0) Lrow[lastl++] = L_SUB(istart + i);
         }
-        Lcol[j+1] = lastl;
+        Lcol[j + 1] = lastl;
 
         ++upper;
       } /* for j ... */
@@ -772,27 +715,26 @@ void SuperLUBase<MatrixType,Derived>::extractData() const
   }
 }
 
-template<typename MatrixType>
-typename SuperLU<MatrixType>::Scalar SuperLU<MatrixType>::determinant() const
-{
-  eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for computing the determinant, you must first call either compute() or analyzePattern()/factorize()");
-  
-  if (m_extractedDataAreDirty)
-    this->extractData();
+template <typename MatrixType>
+typename SuperLU<MatrixType>::Scalar SuperLU<MatrixType>::determinant() const {
+  eigen_assert(m_factorizationIsOk &&
+               "The decomposition is not in a valid state for computing the determinant, you must first call either "
+               "compute() or analyzePattern()/factorize()");
+
+  if (m_extractedDataAreDirty) this->extractData();
 
   Scalar det = Scalar(1);
-  for (int j=0; j<m_u.cols(); ++j)
-  {
-    if (m_u.outerIndexPtr()[j+1]-m_u.outerIndexPtr()[j] > 0)
-    {
-      int lastId = m_u.outerIndexPtr()[j+1]-1;
-      eigen_assert(m_u.innerIndexPtr()[lastId]<=j);
-      if (m_u.innerIndexPtr()[lastId]==j)
-        det *= m_u.valuePtr()[lastId];
+  for (int j = 0; j < m_u.cols(); ++j) {
+    if (m_u.outerIndexPtr()[j + 1] - m_u.outerIndexPtr()[j] > 0) {
+      int lastId = m_u.outerIndexPtr()[j + 1] - 1;
+      eigen_assert(m_u.innerIndexPtr()[lastId] <= j);
+      if (m_u.innerIndexPtr()[lastId] == j) det *= m_u.valuePtr()[lastId];
     }
   }
-  if(m_sluEqued!='N')
-    return det/m_sluRscale.prod()/m_sluCscale.prod();
+  if (PermutationMap(m_p.data(), m_p.size()).determinant() * PermutationMap(m_q.data(), m_q.size()).determinant() < 0)
+    det = -det;
+  if (m_sluEqued != 'N')
+    return det / m_sluRscale.prod() / m_sluCscale.prod();
   else
     return det;
 }
@@ -804,141 +746,130 @@ typename SuperLU<MatrixType>::Scalar SuperLU<MatrixType>::determinant() const
 #ifdef EIGEN_SUPERLU_HAS_ILU
 
 /** \ingroup SuperLUSupport_Module
-  * \class SuperILU
-  * \brief A sparse direct \b incomplete LU factorization and solver based on the SuperLU library
-  *
-  * This class allows to solve for an approximate solution of A.X = B sparse linear problems via an incomplete LU factorization
-  * using the SuperLU library. This class is aimed to be used as a preconditioner of the iterative linear solvers.
-  *
-  * \warning This class requires SuperLU 4 or later.
-  *
-  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
-  *
-  * \sa \ref TutorialSparseDirectSolvers, class ConjugateGradient, class BiCGSTAB
-  */
-
-template<typename _MatrixType>
-class SuperILU : public SuperLUBase<_MatrixType,SuperILU<_MatrixType> >
-{
-  public:
-    typedef SuperLUBase<_MatrixType,SuperILU> Base;
-    typedef _MatrixType MatrixType;
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::RealScalar RealScalar;
-    typedef typename Base::Index Index;
-
-  public:
-
-    SuperILU() : Base() { init(); }
-
-    SuperILU(const MatrixType& matrix) : Base()
-    {
-      init();
-      Base::compute(matrix);
-    }
+ * \class SuperILU
+ * \brief A sparse direct \b incomplete LU factorization and solver based on the SuperLU library
+ *
+ * This class allows to solve for an approximate solution of A.X = B sparse linear problems via an incomplete LU
+ * factorization using the SuperLU library. This class is aimed to be used as a preconditioner of the iterative linear
+ * solvers.
+ *
+ * \warning This class is only for the 4.x versions of SuperLU. The 3.x and 5.x versions are not supported.
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ *
+ * \implsparsesolverconcept
+ *
+ * \sa \ref TutorialSparseSolverConcept, class IncompleteLUT, class ConjugateGradient, class BiCGSTAB
+ */
+
+template <typename MatrixType_>
+class SuperILU : public SuperLUBase<MatrixType_, SuperILU<MatrixType_> > {
+ public:
+  typedef SuperLUBase<MatrixType_, SuperILU> Base;
+  typedef MatrixType_ MatrixType;
+  typedef typename Base::Scalar Scalar;
+  typedef typename Base::RealScalar RealScalar;
+
+ public:
+  using Base::_solve_impl;
+
+  SuperILU() : Base() { init(); }
+
+  SuperILU(const MatrixType &matrix) : Base() {
+    init();
+    Base::compute(matrix);
+  }
 
-    ~SuperILU()
-    {
-    }
-    
-    /** Performs a symbolic decomposition on the sparcity of \a matrix.
-      *
-      * This function is particularly useful when solving for several problems having the same structure.
-      * 
-      * \sa factorize()
-      */
-    void analyzePattern(const MatrixType& matrix)
-    {
-      Base::analyzePattern(matrix);
-    }
-    
-    /** Performs a numeric decomposition of \a matrix
-      *
-      * The given matrix must has the same sparcity than the matrix on which the symbolic decomposition has been performed.
-      *
-      * \sa analyzePattern()
-      */
-    void factorize(const MatrixType& matrix);
-    
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** \internal */
-    template<typename Rhs,typename Dest>
-    void _solve(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
-    #endif // EIGEN_PARSED_BY_DOXYGEN
-    
-  protected:
-    
-    using Base::m_matrix;
-    using Base::m_sluOptions;
-    using Base::m_sluA;
-    using Base::m_sluB;
-    using Base::m_sluX;
-    using Base::m_p;
-    using Base::m_q;
-    using Base::m_sluEtree;
-    using Base::m_sluEqued;
-    using Base::m_sluRscale;
-    using Base::m_sluCscale;
-    using Base::m_sluL;
-    using Base::m_sluU;
-    using Base::m_sluStat;
-    using Base::m_sluFerr;
-    using Base::m_sluBerr;
-    using Base::m_l;
-    using Base::m_u;
-    
-    using Base::m_analysisIsOk;
-    using Base::m_factorizationIsOk;
-    using Base::m_extractedDataAreDirty;
-    using Base::m_isInitialized;
-    using Base::m_info;
-
-    void init()
-    {
-      Base::init();
-      
-      ilu_set_default_options(&m_sluOptions);
-      m_sluOptions.PrintStat        = NO;
-      m_sluOptions.ConditionNumber  = NO;
-      m_sluOptions.Trans            = NOTRANS;
-      m_sluOptions.ColPerm          = MMD_AT_PLUS_A;
-      
-      // no attempt to preserve column sum
-      m_sluOptions.ILU_MILU = SILU;
-      // only basic ILU(k) support -- no direct control over memory consumption
-      // better to use ILU_DropRule = DROP_BASIC | DROP_AREA
-      // and set ILU_FillFactor to max memory growth
-      m_sluOptions.ILU_DropRule = DROP_BASIC;
-      m_sluOptions.ILU_DropTol = NumTraits<Scalar>::dummy_precision()*10;
-    }
-    
-  private:
-    SuperILU(SuperILU& ) { }
+  ~SuperILU() {}
+
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
+   *
+   * This function is particularly useful when solving for several problems having the same structure.
+   *
+   * \sa factorize()
+   */
+  void analyzePattern(const MatrixType &matrix) { Base::analyzePattern(matrix); }
+
+  /** Performs a numeric decomposition of \a matrix
+   *
+   * The given matrix must have the same sparsity than the matrix on which the symbolic decomposition has been
+   * performed.
+   *
+   * \sa analyzePattern()
+   */
+  void factorize(const MatrixType &matrix);
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+  /** \internal */
+  template <typename Rhs, typename Dest>
+  void _solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &dest) const;
+#endif  // EIGEN_PARSED_BY_DOXYGEN
+
+ protected:
+  using Base::m_l;
+  using Base::m_matrix;
+  using Base::m_p;
+  using Base::m_q;
+  using Base::m_sluA;
+  using Base::m_sluB;
+  using Base::m_sluBerr;
+  using Base::m_sluCscale;
+  using Base::m_sluEqued;
+  using Base::m_sluEtree;
+  using Base::m_sluFerr;
+  using Base::m_sluL;
+  using Base::m_sluOptions;
+  using Base::m_sluRscale;
+  using Base::m_sluStat;
+  using Base::m_sluU;
+  using Base::m_sluX;
+  using Base::m_u;
+
+  using Base::m_analysisIsOk;
+  using Base::m_extractedDataAreDirty;
+  using Base::m_factorizationIsOk;
+  using Base::m_info;
+  using Base::m_isInitialized;
+
+  void init() {
+    Base::init();
+
+    ilu_set_default_options(&m_sluOptions);
+    m_sluOptions.PrintStat = NO;
+    m_sluOptions.ConditionNumber = NO;
+    m_sluOptions.Trans = NOTRANS;
+    m_sluOptions.ColPerm = MMD_AT_PLUS_A;
+
+    // no attempt to preserve column sum
+    m_sluOptions.ILU_MILU = SILU;
+    // only basic ILU(k) support -- no direct control over memory consumption
+    // better to use ILU_DropRule = DROP_BASIC | DROP_AREA
+    // and set ILU_FillFactor to max memory growth
+    m_sluOptions.ILU_DropRule = DROP_BASIC;
+    m_sluOptions.ILU_DropTol = NumTraits<Scalar>::dummy_precision() * 10;
+  }
+
+ private:
+  SuperILU(SuperILU &) {}
 };
 
-template<typename MatrixType>
-void SuperILU<MatrixType>::factorize(const MatrixType& a)
-{
+template <typename MatrixType>
+void SuperILU<MatrixType>::factorize(const MatrixType &a) {
   eigen_assert(m_analysisIsOk && "You must first call analyzePattern()");
-  if(!m_analysisIsOk)
-  {
+  if (!m_analysisIsOk) {
     m_info = InvalidInput;
     return;
   }
-  
+
   this->initFactorization(a);
 
   int info = 0;
   RealScalar recip_pivot_growth, rcond;
 
   StatInit(&m_sluStat);
-  SuperLU_gsisx(&m_sluOptions, &m_sluA, m_q.data(), m_p.data(), &m_sluEtree[0],
-                &m_sluEqued, &m_sluRscale[0], &m_sluCscale[0],
-                &m_sluL, &m_sluU,
-                NULL, 0,
-                &m_sluB, &m_sluX,
-                &recip_pivot_growth, &rcond,
-                &m_sluStat, &info, Scalar());
+  SuperLU_gsisx(&m_sluOptions, &m_sluA, m_q.data(), m_p.data(), &m_sluEtree[0], &m_sluEqued, &m_sluRscale[0],
+                &m_sluCscale[0], &m_sluL, &m_sluU, NULL, 0, &m_sluB, &m_sluX, &recip_pivot_growth, &rcond, &m_sluStat,
+                &info, Scalar());
   StatFree(&m_sluStat);
 
   // FIXME how to better check for errors ???
@@ -946,15 +877,16 @@ void SuperILU<MatrixType>::factorize(const MatrixType& a)
   m_factorizationIsOk = true;
 }
 
-template<typename MatrixType>
-template<typename Rhs,typename Dest>
-void SuperILU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x) const
-{
-  eigen_assert(m_factorizationIsOk && "The decomposition is not in a valid state for solving, you must first call either compute() or analyzePattern()/factorize()");
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+template <typename MatrixType>
+template <typename Rhs, typename Dest>
+void SuperILU<MatrixType>::_solve_impl(const MatrixBase<Rhs> &b, MatrixBase<Dest> &x) const {
+  eigen_assert(m_factorizationIsOk &&
+               "The decomposition is not in a valid state for solving, you must first call either compute() or "
+               "analyzePattern()/factorize()");
 
-  const int size = m_matrix.rows();
   const int rhsCols = b.cols();
-  eigen_assert(size==b.rows());
+  eigen_assert(m_matrix.rows() == b.rows());
 
   m_sluOptions.Trans = NOTRANS;
   m_sluOptions.Fact = FACTORED;
@@ -962,65 +894,36 @@ void SuperILU<MatrixType>::_solve(const MatrixBase<Rhs> &b, MatrixBase<Dest>& x)
 
   m_sluFerr.resize(rhsCols);
   m_sluBerr.resize(rhsCols);
-  m_sluB = SluMatrix::Map(b.const_cast_derived());
-  m_sluX = SluMatrix::Map(x.derived());
+
+  Ref<const Matrix<typename Rhs::Scalar, Dynamic, Dynamic, ColMajor> > b_ref(b);
+  Ref<const Matrix<typename Dest::Scalar, Dynamic, Dynamic, ColMajor> > x_ref(x);
+
+  m_sluB = SluMatrix::Map(b_ref.const_cast_derived());
+  m_sluX = SluMatrix::Map(x_ref.const_cast_derived());
 
   typename Rhs::PlainObject b_cpy;
-  if(m_sluEqued!='N')
-  {
+  if (m_sluEqued != 'N') {
     b_cpy = b;
-    m_sluB = SluMatrix::Map(b_cpy.const_cast_derived());  
+    m_sluB = SluMatrix::Map(b_cpy.const_cast_derived());
   }
-  
+
   int info = 0;
   RealScalar recip_pivot_growth, rcond;
 
   StatInit(&m_sluStat);
-  SuperLU_gsisx(&m_sluOptions, &m_sluA,
-                m_q.data(), m_p.data(),
-                &m_sluEtree[0], &m_sluEqued,
-                &m_sluRscale[0], &m_sluCscale[0],
-                &m_sluL, &m_sluU,
-                NULL, 0,
-                &m_sluB, &m_sluX,
-                &recip_pivot_growth, &rcond,
-                &m_sluStat, &info, Scalar());
+  SuperLU_gsisx(&m_sluOptions, &m_sluA, m_q.data(), m_p.data(), &m_sluEtree[0], &m_sluEqued, &m_sluRscale[0],
+                &m_sluCscale[0], &m_sluL, &m_sluU, NULL, 0, &m_sluB, &m_sluX, &recip_pivot_growth, &rcond, &m_sluStat,
+                &info, Scalar());
   StatFree(&m_sluStat);
 
-  m_info = info==0 ? Success : NumericalIssue;
+  if (x.derived().data() != x_ref.data()) x = x_ref;
+
+  m_info = info == 0 ? Success : NumericalIssue;
 }
 #endif
 
-namespace internal {
-  
-template<typename _MatrixType, typename Derived, typename Rhs>
-struct solve_retval<SuperLUBase<_MatrixType,Derived>, Rhs>
-  : solve_retval_base<SuperLUBase<_MatrixType,Derived>, Rhs>
-{
-  typedef SuperLUBase<_MatrixType,Derived> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec().derived()._solve(rhs(),dst);
-  }
-};
-
-template<typename _MatrixType, typename Derived, typename Rhs>
-struct sparse_solve_retval<SuperLUBase<_MatrixType,Derived>, Rhs>
-  : sparse_solve_retval_base<SuperLUBase<_MatrixType,Derived>, Rhs>
-{
-  typedef SuperLUBase<_MatrixType,Derived> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
-  }
-};
-
-} // end namespace internal
+#endif
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SUPERLUSUPPORT_H
+#endif  // EIGEN_SUPERLUSUPPORT_H
diff --git a/inst/include/Eigen/src/ThreadPool/Barrier.h b/inst/include/Eigen/src/ThreadPool/Barrier.h
new file mode 100644
index 00000000..8b2f8da7
--- /dev/null
+++ b/inst/include/Eigen/src/ThreadPool/Barrier.h
@@ -0,0 +1,70 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2018 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// Barrier is an object that allows one or more threads to wait until
+// Notify has been called a specified number of times.
+
+#ifndef EIGEN_CXX11_THREADPOOL_BARRIER_H
+#define EIGEN_CXX11_THREADPOOL_BARRIER_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+class Barrier {
+ public:
+  Barrier(unsigned int count) : state_(count << 1), notified_(false) {
+    eigen_plain_assert(((count << 1) >> 1) == count);
+  }
+  ~Barrier() { eigen_plain_assert((state_ >> 1) == 0); }
+
+  void Notify() {
+    unsigned int v = state_.fetch_sub(2, std::memory_order_acq_rel) - 2;
+    if (v != 1) {
+      // Clear the lowest bit (waiter flag) and check that the original state
+      // value was not zero. If it was zero, it means that notify was called
+      // more times than the original count.
+      eigen_plain_assert(((v + 2) & ~1) != 0);
+      return;  // either count has not dropped to 0, or waiter is not waiting
+    }
+    EIGEN_MUTEX_LOCK l(mu_);
+    eigen_plain_assert(!notified_);
+    notified_ = true;
+    cv_.notify_all();
+  }
+
+  void Wait() {
+    unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel);
+    if ((v >> 1) == 0) return;
+    EIGEN_MUTEX_LOCK l(mu_);
+    while (!notified_) {
+      cv_.wait(l);
+    }
+  }
+
+ private:
+  EIGEN_MUTEX mu_;
+  EIGEN_CONDVAR cv_;
+  std::atomic<unsigned int> state_;  // low bit is waiter flag
+  bool notified_;
+};
+
+// Notification is an object that allows a user to to wait for another
+// thread to signal a notification that an event has occurred.
+//
+// Multiple threads can wait on the same Notification object,
+// but only one caller must call Notify() on the object.
+struct Notification : Barrier {
+  Notification() : Barrier(1){};
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_BARRIER_H
diff --git a/inst/include/Eigen/src/ThreadPool/CoreThreadPoolDevice.h b/inst/include/Eigen/src/ThreadPool/CoreThreadPoolDevice.h
new file mode 100644
index 00000000..c603a38a
--- /dev/null
+++ b/inst/include/Eigen/src/ThreadPool/CoreThreadPoolDevice.h
@@ -0,0 +1,336 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2023 Charlie Schlosser <cs.schlosser@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CORE_THREAD_POOL_DEVICE_H
+#define EIGEN_CORE_THREAD_POOL_DEVICE_H
+
+namespace Eigen {
+
+// CoreThreadPoolDevice provides an easy-to-understand Device for parallelizing Eigen Core expressions with
+// Threadpool. Expressions are recursively split evenly until the evaluation cost is less than the threshold for
+// delegating the task to a thread.
+/*
+                 a
+                / \
+               /   \
+              /     \
+             /       \
+            /         \
+           /           \
+          /             \
+         a               e
+        / \             / \
+       /   \           /   \
+      /     \         /     \
+     a       c       e       g
+    / \     / \     / \     / \
+   /   \   /   \   /   \   /   \
+  a     b c     d e     f g     h
+*/
+// Each task descends the binary tree to the left, delegates the right task to a new thread, and continues to the
+// left. This ensures that work is evenly distributed to the thread pool as quickly as possible and minimizes the number
+// of tasks created during the evaluation. Consider an expression that is divided into 8 chunks. The
+// primary task 'a' creates tasks 'e' 'c' and 'b', and executes its portion of the expression at the bottom of the
+// tree. Likewise, task 'e' creates tasks 'g' and 'f', and executes its portion of the expression.
+
+struct CoreThreadPoolDevice {
+  using Task = std::function<void()>;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoreThreadPoolDevice(ThreadPool& pool, float threadCostThreshold = 3e-5f)
+      : m_pool(pool) {
+    eigen_assert(threadCostThreshold >= 0.0f && "threadCostThreshold must be non-negative");
+    m_costFactor = threadCostThreshold;
+  }
+
+  template <int PacketSize>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int calculateLevels(Index size, float cost) const {
+    eigen_assert(cost >= 0.0f && "cost must be non-negative");
+    Index numOps = size / PacketSize;
+    int actualThreads = numOps < m_pool.NumThreads() ? static_cast<int>(numOps) : m_pool.NumThreads();
+    float totalCost = static_cast<float>(numOps) * cost;
+    float idealThreads = totalCost * m_costFactor;
+    if (idealThreads < static_cast<float>(actualThreads)) {
+      idealThreads = numext::maxi(idealThreads, 1.0f);
+      actualThreads = numext::mini(actualThreads, static_cast<int>(idealThreads));
+    }
+    int maxLevel = internal::log2_ceil(actualThreads);
+    return maxLevel;
+  }
+
+// MSVC does not like inlining parallelForImpl
+#if EIGEN_COMP_MSVC && !EIGEN_COMP_CLANG
+#define EIGEN_PARALLEL_FOR_INLINE
+#else
+#define EIGEN_PARALLEL_FOR_INLINE EIGEN_STRONG_INLINE
+#endif
+
+  template <typename UnaryFunctor, int PacketSize>
+  EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index begin, Index end, UnaryFunctor& f,
+                                                                   Barrier& barrier, int level) {
+    while (level > 0) {
+      level--;
+      Index size = end - begin;
+      eigen_assert(size % PacketSize == 0 && "this function assumes size is a multiple of PacketSize");
+      Index mid = begin + numext::round_down(size >> 1, PacketSize);
+      Task right = [this, mid, end, &f, &barrier, level]() {
+        parallelForImpl<UnaryFunctor, PacketSize>(mid, end, f, barrier, level);
+      };
+      m_pool.Schedule(std::move(right));
+      end = mid;
+    }
+    for (Index i = begin; i < end; i += PacketSize) f(i);
+    barrier.Notify();
+  }
+
+  template <typename BinaryFunctor, int PacketSize>
+  EIGEN_DEVICE_FUNC EIGEN_PARALLEL_FOR_INLINE void parallelForImpl(Index outerBegin, Index outerEnd, Index innerBegin,
+                                                                   Index innerEnd, BinaryFunctor& f, Barrier& barrier,
+                                                                   int level) {
+    while (level > 0) {
+      level--;
+      Index outerSize = outerEnd - outerBegin;
+      if (outerSize > 1) {
+        Index outerMid = outerBegin + (outerSize >> 1);
+        Task right = [this, &f, &barrier, outerMid, outerEnd, innerBegin, innerEnd, level]() {
+          parallelForImpl<BinaryFunctor, PacketSize>(outerMid, outerEnd, innerBegin, innerEnd, f, barrier, level);
+        };
+        m_pool.Schedule(std::move(right));
+        outerEnd = outerMid;
+      } else {
+        Index innerSize = innerEnd - innerBegin;
+        eigen_assert(innerSize % PacketSize == 0 && "this function assumes innerSize is a multiple of PacketSize");
+        Index innerMid = innerBegin + numext::round_down(innerSize >> 1, PacketSize);
+        Task right = [this, &f, &barrier, outerBegin, outerEnd, innerMid, innerEnd, level]() {
+          parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerMid, innerEnd, f, barrier, level);
+        };
+        m_pool.Schedule(std::move(right));
+        innerEnd = innerMid;
+      }
+    }
+    for (Index outer = outerBegin; outer < outerEnd; outer++)
+      for (Index inner = innerBegin; inner < innerEnd; inner += PacketSize) f(outer, inner);
+    barrier.Notify();
+  }
+
+#undef EIGEN_PARALLEL_FOR_INLINE
+
+  template <typename UnaryFunctor, int PacketSize>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index begin, Index end, UnaryFunctor& f, float cost) {
+    Index size = end - begin;
+    int maxLevel = calculateLevels<PacketSize>(size, cost);
+    Barrier barrier(1 << maxLevel);
+    parallelForImpl<UnaryFunctor, PacketSize>(begin, end, f, barrier, maxLevel);
+    barrier.Wait();
+  }
+
+  template <typename BinaryFunctor, int PacketSize>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void parallelFor(Index outerBegin, Index outerEnd, Index innerBegin,
+                                                         Index innerEnd, BinaryFunctor& f, float cost) {
+    Index outerSize = outerEnd - outerBegin;
+    Index innerSize = innerEnd - innerBegin;
+    Index size = outerSize * innerSize;
+    int maxLevel = calculateLevels<PacketSize>(size, cost);
+    Barrier barrier(1 << maxLevel);
+    parallelForImpl<BinaryFunctor, PacketSize>(outerBegin, outerEnd, innerBegin, innerEnd, f, barrier, maxLevel);
+    barrier.Wait();
+  }
+
+  ThreadPool& m_pool;
+  // costFactor is the cost of delegating a task to a thread
+  // the inverse is used to avoid a floating point division
+  float m_costFactor;
+};
+
+// specialization of coefficient-wise assignment loops for CoreThreadPoolDevice
+
+namespace internal {
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+struct Kernel;
+#endif
+
+template <typename Kernel>
+struct cost_helper {
+  using SrcEvaluatorType = typename Kernel::SrcEvaluatorType;
+  using DstEvaluatorType = typename Kernel::DstEvaluatorType;
+  using SrcXprType = typename SrcEvaluatorType::XprType;
+  using DstXprType = typename DstEvaluatorType::XprType;
+  static constexpr Index Cost = functor_cost<SrcXprType>::Cost + functor_cost<DstXprType>::Cost;
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, DefaultTraversal, NoUnrolling> {
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost;
+  struct AssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {
+      this->assignCoeffByOuterInner(outer, inner);
+    }
+  };
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index innerSize = kernel.innerSize();
+    const Index outerSize = kernel.outerSize();
+    constexpr float cost = static_cast<float>(XprEvaluationCost);
+    AssignmentFunctor functor(kernel);
+    device.template parallelFor<AssignmentFunctor, 1>(0, outerSize, 0, innerSize, functor, cost);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, DefaultTraversal, InnerUnrolling> {
+  using DstXprType = typename Kernel::DstEvaluatorType::XprType;
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, InnerSize = DstXprType::InnerSizeAtCompileTime;
+  struct AssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
+      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, InnerSize>::run(*this, outer);
+    }
+  };
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index outerSize = kernel.outerSize();
+    AssignmentFunctor functor(kernel);
+    constexpr float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(InnerSize);
+    device.template parallelFor<AssignmentFunctor, 1>(0, outerSize, functor, cost);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, InnerVectorizedTraversal, NoUnrolling> {
+  using PacketType = typename Kernel::PacketType;
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size,
+                         SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
+                         DstAlignment = Kernel::AssignmentTraits::DstAlignment;
+  struct AssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {
+      this->template assignPacketByOuterInner<Unaligned, Unaligned, PacketType>(outer, inner);
+    }
+  };
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index innerSize = kernel.innerSize();
+    const Index outerSize = kernel.outerSize();
+    const float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(innerSize);
+    AssignmentFunctor functor(kernel);
+    device.template parallelFor<AssignmentFunctor, PacketSize>(0, outerSize, 0, innerSize, functor, cost);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, InnerVectorizedTraversal, InnerUnrolling> {
+  using PacketType = typename Kernel::PacketType;
+  using DstXprType = typename Kernel::DstEvaluatorType::XprType;
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size,
+                         SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
+                         DstAlignment = Kernel::AssignmentTraits::DstAlignment,
+                         InnerSize = DstXprType::InnerSizeAtCompileTime;
+  struct AssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
+      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, InnerSize, SrcAlignment, DstAlignment>::run(*this, outer);
+    }
+  };
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index outerSize = kernel.outerSize();
+    constexpr float cost = static_cast<float>(XprEvaluationCost) * static_cast<float>(InnerSize);
+    AssignmentFunctor functor(kernel);
+    device.template parallelFor<AssignmentFunctor, PacketSize>(0, outerSize, functor, cost);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, SliceVectorizedTraversal, NoUnrolling> {
+  using Scalar = typename Kernel::Scalar;
+  using PacketType = typename Kernel::PacketType;
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost, PacketSize = unpacket_traits<PacketType>::size;
+  struct PacketAssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketAssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer, Index inner) {
+      this->template assignPacketByOuterInner<Unaligned, Unaligned, PacketType>(outer, inner);
+    }
+  };
+  struct ScalarAssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScalarAssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index outer) {
+      const Index innerSize = this->innerSize();
+      const Index packetAccessSize = numext::round_down(innerSize, PacketSize);
+      for (Index inner = packetAccessSize; inner < innerSize; inner++) this->assignCoeffByOuterInner(outer, inner);
+    }
+  };
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index outerSize = kernel.outerSize();
+    const Index innerSize = kernel.innerSize();
+    const Index packetAccessSize = numext::round_down(innerSize, PacketSize);
+    constexpr float packetCost = static_cast<float>(XprEvaluationCost);
+    const float scalarCost = static_cast<float>(XprEvaluationCost) * static_cast<float>(innerSize - packetAccessSize);
+    PacketAssignmentFunctor packetFunctor(kernel);
+    ScalarAssignmentFunctor scalarFunctor(kernel);
+    device.template parallelFor<PacketAssignmentFunctor, PacketSize>(0, outerSize, 0, packetAccessSize, packetFunctor,
+                                                                     packetCost);
+    device.template parallelFor<ScalarAssignmentFunctor, 1>(0, outerSize, scalarFunctor, scalarCost);
+  };
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearTraversal, NoUnrolling> {
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost;
+  struct AssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index index) { this->assignCoeff(index); }
+  };
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index size = kernel.size();
+    constexpr float cost = static_cast<float>(XprEvaluationCost);
+    AssignmentFunctor functor(kernel);
+    device.template parallelFor<AssignmentFunctor, 1>(0, size, functor, cost);
+  }
+};
+
+template <typename Kernel>
+struct dense_assignment_loop_with_device<Kernel, CoreThreadPoolDevice, LinearVectorizedTraversal, NoUnrolling> {
+  using Scalar = typename Kernel::Scalar;
+  using PacketType = typename Kernel::PacketType;
+  static constexpr Index XprEvaluationCost = cost_helper<Kernel>::Cost,
+                         RequestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,
+                         PacketSize = unpacket_traits<PacketType>::size,
+                         DstIsAligned = Kernel::AssignmentTraits::DstAlignment >= RequestedAlignment,
+                         DstAlignment = packet_traits<Scalar>::AlignedOnScalar ? RequestedAlignment
+                                                                               : Kernel::AssignmentTraits::DstAlignment,
+                         SrcAlignment = Kernel::AssignmentTraits::JointAlignment;
+  struct AssignmentFunctor : public Kernel {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE AssignmentFunctor(Kernel& kernel) : Kernel(kernel) {}
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index index) {
+      this->template assignPacket<DstAlignment, SrcAlignment, PacketType>(index);
+    }
+  };
+  static constexpr bool UsePacketSegment = Kernel::AssignmentTraits::UsePacketSegment;
+  using head_loop =
+      unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, DstIsAligned>;
+  using tail_loop = unaligned_dense_assignment_loop<PacketType, DstAlignment, SrcAlignment, UsePacketSegment, false>;
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void run(Kernel& kernel, CoreThreadPoolDevice& device) {
+    const Index size = kernel.size();
+    const Index alignedStart =
+        DstIsAligned ? 0 : internal::first_aligned<RequestedAlignment>(kernel.dstDataPtr(), size);
+    const Index alignedEnd = alignedStart + numext::round_down(size - alignedStart, PacketSize);
+
+    head_loop::run(kernel, 0, alignedStart);
+
+    constexpr float cost = static_cast<float>(XprEvaluationCost);
+    AssignmentFunctor functor(kernel);
+    device.template parallelFor<AssignmentFunctor, PacketSize>(alignedStart, alignedEnd, functor, cost);
+
+    tail_loop::run(kernel, alignedEnd, size);
+  }
+};
+
+}  // namespace internal
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CORE_THREAD_POOL_DEVICE_H
diff --git a/inst/include/Eigen/src/ThreadPool/EventCount.h b/inst/include/Eigen/src/ThreadPool/EventCount.h
new file mode 100644
index 00000000..6eda6f4b
--- /dev/null
+++ b/inst/include/Eigen/src/ThreadPool/EventCount.h
@@ -0,0 +1,241 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H
+#define EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// EventCount allows to wait for arbitrary predicates in non-blocking
+// algorithms. Think of condition variable, but wait predicate does not need to
+// be protected by a mutex. Usage:
+// Waiting thread does:
+//
+//   if (predicate)
+//     return act();
+//   EventCount::Waiter& w = waiters[my_index];
+//   ec.Prewait(&w);
+//   if (predicate) {
+//     ec.CancelWait(&w);
+//     return act();
+//   }
+//   ec.CommitWait(&w);
+//
+// Notifying thread does:
+//
+//   predicate = true;
+//   ec.Notify(true);
+//
+// Notify is cheap if there are no waiting threads. Prewait/CommitWait are not
+// cheap, but they are executed only if the preceding predicate check has
+// failed.
+//
+// Algorithm outline:
+// There are two main variables: predicate (managed by user) and state_.
+// Operation closely resembles Dekker mutual algorithm:
+// https://en.wikipedia.org/wiki/Dekker%27s_algorithm
+// Waiting thread sets state_ then checks predicate, Notifying thread sets
+// predicate then checks state_. Due to seq_cst fences in between these
+// operations it is guaranteed than either waiter will see predicate change
+// and won't block, or notifying thread will see state_ change and will unblock
+// the waiter, or both. But it can't happen that both threads don't see each
+// other changes, which would lead to deadlock.
+class EventCount {
+ public:
+  class Waiter;
+
+  EventCount(MaxSizeVector<Waiter>& waiters) : state_(kStackMask), waiters_(waiters) {
+    eigen_plain_assert(waiters.size() < (1 << kWaiterBits) - 1);
+  }
+
+  EventCount(const EventCount&) = delete;
+  void operator=(const EventCount&) = delete;
+
+  ~EventCount() {
+    // Ensure there are no waiters.
+    eigen_plain_assert(state_.load() == kStackMask);
+  }
+
+  // Prewait prepares for waiting.
+  // After calling Prewait, the thread must re-check the wait predicate
+  // and then call either CancelWait or CommitWait.
+  void Prewait() {
+    uint64_t state = state_.load(std::memory_order_relaxed);
+    for (;;) {
+      CheckState(state);
+      uint64_t newstate = state + kWaiterInc;
+      CheckState(newstate);
+      if (state_.compare_exchange_weak(state, newstate, std::memory_order_seq_cst)) return;
+    }
+  }
+
+  // CommitWait commits waiting after Prewait.
+  void CommitWait(Waiter* w) {
+    eigen_plain_assert((w->epoch & ~kEpochMask) == 0);
+    w->state = Waiter::kNotSignaled;
+    const uint64_t me = (w - &waiters_[0]) | w->epoch;
+    uint64_t state = state_.load(std::memory_order_seq_cst);
+    for (;;) {
+      CheckState(state, true);
+      uint64_t newstate;
+      if ((state & kSignalMask) != 0) {
+        // Consume the signal and return immediately.
+        newstate = state - kWaiterInc - kSignalInc;
+      } else {
+        // Remove this thread from pre-wait counter and add to the waiter stack.
+        newstate = ((state & kWaiterMask) - kWaiterInc) | me;
+        w->next.store(state & (kStackMask | kEpochMask), std::memory_order_relaxed);
+      }
+      CheckState(newstate);
+      if (state_.compare_exchange_weak(state, newstate, std::memory_order_acq_rel)) {
+        if ((state & kSignalMask) == 0) {
+          w->epoch += kEpochInc;
+          Park(w);
+        }
+        return;
+      }
+    }
+  }
+
+  // CancelWait cancels effects of the previous Prewait call.
+  void CancelWait() {
+    uint64_t state = state_.load(std::memory_order_relaxed);
+    for (;;) {
+      CheckState(state, true);
+      uint64_t newstate = state - kWaiterInc;
+      // We don't know if the thread was also notified or not,
+      // so we should not consume a signal unconditionally.
+      // Only if number of waiters is equal to number of signals,
+      // we know that the thread was notified and we must take away the signal.
+      if (((state & kWaiterMask) >> kWaiterShift) == ((state & kSignalMask) >> kSignalShift)) newstate -= kSignalInc;
+      CheckState(newstate);
+      if (state_.compare_exchange_weak(state, newstate, std::memory_order_acq_rel)) return;
+    }
+  }
+
+  // Notify wakes one or all waiting threads.
+  // Must be called after changing the associated wait predicate.
+  void Notify(bool notifyAll) {
+    std::atomic_thread_fence(std::memory_order_seq_cst);
+    uint64_t state = state_.load(std::memory_order_acquire);
+    for (;;) {
+      CheckState(state);
+      const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
+      const uint64_t signals = (state & kSignalMask) >> kSignalShift;
+      // Easy case: no waiters.
+      if ((state & kStackMask) == kStackMask && waiters == signals) return;
+      uint64_t newstate;
+      if (notifyAll) {
+        // Empty wait stack and set signal to number of pre-wait threads.
+        newstate = (state & kWaiterMask) | (waiters << kSignalShift) | kStackMask;
+      } else if (signals < waiters) {
+        // There is a thread in pre-wait state, unblock it.
+        newstate = state + kSignalInc;
+      } else {
+        // Pop a waiter from list and unpark it.
+        Waiter* w = &waiters_[state & kStackMask];
+        uint64_t next = w->next.load(std::memory_order_relaxed);
+        newstate = (state & (kWaiterMask | kSignalMask)) | next;
+      }
+      CheckState(newstate);
+      if (state_.compare_exchange_weak(state, newstate, std::memory_order_acq_rel)) {
+        if (!notifyAll && (signals < waiters)) return;  // unblocked pre-wait thread
+        if ((state & kStackMask) == kStackMask) return;
+        Waiter* w = &waiters_[state & kStackMask];
+        if (!notifyAll) w->next.store(kStackMask, std::memory_order_relaxed);
+        Unpark(w);
+        return;
+      }
+    }
+  }
+
+ private:
+  // State_ layout:
+  // - low kWaiterBits is a stack of waiters committed wait
+  //   (indexes in waiters_ array are used as stack elements,
+  //   kStackMask means empty stack).
+  // - next kWaiterBits is count of waiters in prewait state.
+  // - next kWaiterBits is count of pending signals.
+  // - remaining bits are ABA counter for the stack.
+  //   (stored in Waiter node and incremented on push).
+  static const uint64_t kWaiterBits = 14;
+  static const uint64_t kStackMask = (1ull << kWaiterBits) - 1;
+  static const uint64_t kWaiterShift = kWaiterBits;
+  static const uint64_t kWaiterMask = ((1ull << kWaiterBits) - 1) << kWaiterShift;
+  static const uint64_t kWaiterInc = 1ull << kWaiterShift;
+  static const uint64_t kSignalShift = 2 * kWaiterBits;
+  static const uint64_t kSignalMask = ((1ull << kWaiterBits) - 1) << kSignalShift;
+  static const uint64_t kSignalInc = 1ull << kSignalShift;
+  static const uint64_t kEpochShift = 3 * kWaiterBits;
+  static const uint64_t kEpochBits = 64 - kEpochShift;
+  static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
+  static const uint64_t kEpochInc = 1ull << kEpochShift;
+
+ public:
+  class Waiter {
+    friend class EventCount;
+
+    enum State {
+      kNotSignaled,
+      kWaiting,
+      kSignaled,
+    };
+
+    EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<uint64_t> next{kStackMask};
+    EIGEN_MUTEX mu;
+    EIGEN_CONDVAR cv;
+    uint64_t epoch{0};
+    unsigned state{kNotSignaled};
+  };
+
+ private:
+  static void CheckState(uint64_t state, bool waiter = false) {
+    static_assert(kEpochBits >= 20, "not enough bits to prevent ABA problem");
+    const uint64_t waiters = (state & kWaiterMask) >> kWaiterShift;
+    const uint64_t signals = (state & kSignalMask) >> kSignalShift;
+    eigen_plain_assert(waiters >= signals);
+    eigen_plain_assert(waiters < (1 << kWaiterBits) - 1);
+    eigen_plain_assert(!waiter || waiters > 0);
+    (void)waiters;
+    (void)signals;
+  }
+
+  void Park(Waiter* w) {
+    EIGEN_MUTEX_LOCK lock(w->mu);
+    while (w->state != Waiter::kSignaled) {
+      w->state = Waiter::kWaiting;
+      w->cv.wait(lock);
+    }
+  }
+
+  void Unpark(Waiter* w) {
+    for (Waiter* next; w; w = next) {
+      uint64_t wnext = w->next.load(std::memory_order_relaxed) & kStackMask;
+      next = wnext == kStackMask ? nullptr : &waiters_[internal::convert_index<size_t>(wnext)];
+      unsigned state;
+      {
+        EIGEN_MUTEX_LOCK lock(w->mu);
+        state = w->state;
+        w->state = Waiter::kSignaled;
+      }
+      // Avoid notifying if it wasn't waiting.
+      if (state == Waiter::kWaiting) w->cv.notify_one();
+    }
+  }
+
+  std::atomic<uint64_t> state_;
+  MaxSizeVector<Waiter>& waiters_;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_EVENTCOUNT_H
diff --git a/inst/include/Eigen/src/ThreadPool/ForkJoin.h b/inst/include/Eigen/src/ThreadPool/ForkJoin.h
new file mode 100644
index 00000000..588636a6
--- /dev/null
+++ b/inst/include/Eigen/src/ThreadPool/ForkJoin.h
@@ -0,0 +1,140 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2025 Weiwei Kong <weiweikong@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_THREADPOOL_FORKJOIN_H
+#define EIGEN_THREADPOOL_FORKJOIN_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// ForkJoinScheduler provides implementations of various non-blocking ParallelFor algorithms for unary
+// and binary parallel tasks. More specifically, the implementations follow the binary tree-based
+// algorithm from the following paper:
+//
+//   Lea, D. (2000, June). A java fork/join framework. *In Proceedings of the
+//   ACM 2000 conference on Java Grande* (pp. 36-43).
+//
+// For a given binary task function `f(i,j)` and integers `num_threads`, `granularity`, `start`, and `end`,
+// the implemented parallel for algorithm schedules and executes at most `num_threads` of the functions
+// from the following set in parallel (either synchronously or asynchronously):
+//
+//   f(start,start+s_1), f(start+s_1,start+s_2), ..., f(start+s_n,end)
+//
+// where `s_{j+1} - s_{j}` and `end - s_n` are roughly within a factor of two of `granularity`. For a unary
+// task function `g(k)`, the same operation is applied with
+//
+//   f(i,j) = [&](){ for(Index k = i; k < j; ++k) g(k); };
+//
+// Note that the parameter `granularity` should be tuned by the user based on the trade-off of running the
+// given task function sequentially vs. scheduling individual tasks in parallel. An example of a partially
+// tuned `granularity` is in `Eigen::CoreThreadPoolDevice::parallelFor(...)` where the template
+// parameter `PacketSize` and float input `cost` are used to indirectly compute a granularity level for a
+// given task function.
+//
+// Example usage #1 (synchronous):
+// ```
+// ThreadPool thread_pool(num_threads);
+// ForkJoinScheduler::ParallelFor(0, num_tasks, granularity, std::move(parallel_task), &thread_pool);
+// ```
+//
+// Example usage #2 (executing multiple tasks asynchronously, each one parallelized with ParallelFor):
+// ```
+// ThreadPool thread_pool(num_threads);
+// Barrier barrier(num_async_calls);
+// auto done = [&](){ barrier.Notify(); };
+// for (Index k=0; k<num_async_calls; ++k) {
+//   ForkJoinScheduler::ParallelForAsync(task_start[k], task_end[k], granularity[k], parallel_task[k], done,
+//   &thread_pool);
+// }
+// barrier.Wait();
+// ```
+class ForkJoinScheduler {
+ public:
+  // Runs `do_func` asynchronously for the range [start, end) with a specified
+  // granularity. `do_func` should be of type `std::function<void(Index,
+  // Index)`. `done()` is called exactly once after all tasks have been executed.
+  template <typename DoFnType, typename DoneFnType, typename ThreadPoolEnv>
+  static void ParallelForAsync(Index start, Index end, Index granularity, DoFnType&& do_func, DoneFnType&& done,
+                               ThreadPoolTempl<ThreadPoolEnv>* thread_pool) {
+    if (start >= end) {
+      done();
+      return;
+    }
+    thread_pool->Schedule([start, end, granularity, thread_pool, do_func = std::forward<DoFnType>(do_func),
+                           done = std::forward<DoneFnType>(done)]() {
+      RunParallelFor(start, end, granularity, do_func, thread_pool);
+      done();
+    });
+  }
+
+  // Synchronous variant of ParallelForAsync.
+  // WARNING: Making nested calls to `ParallelFor`, e.g., calling `ParallelFor` inside a task passed into another
+  // `ParallelFor` call, may lead to deadlocks due to how task stealing is implemented.
+  template <typename DoFnType, typename ThreadPoolEnv>
+  static void ParallelFor(Index start, Index end, Index granularity, DoFnType&& do_func,
+                          ThreadPoolTempl<ThreadPoolEnv>* thread_pool) {
+    if (start >= end) return;
+    Barrier barrier(1);
+    auto done = [&barrier]() { barrier.Notify(); };
+    ParallelForAsync(start, end, granularity, do_func, done, thread_pool);
+    barrier.Wait();
+  }
+
+ private:
+  // Schedules `right_thunk`, runs `left_thunk`, and runs other tasks until `right_thunk` has finished.
+  template <typename LeftType, typename RightType, typename ThreadPoolEnv>
+  static void ForkJoin(LeftType&& left_thunk, RightType&& right_thunk, ThreadPoolTempl<ThreadPoolEnv>* thread_pool) {
+    typedef typename ThreadPoolTempl<ThreadPoolEnv>::Task Task;
+    std::atomic<bool> right_done(false);
+    auto execute_right = [&right_thunk, &right_done]() {
+      std::forward<RightType>(right_thunk)();
+      right_done.store(true, std::memory_order_release);
+    };
+    thread_pool->Schedule(execute_right);
+    std::forward<LeftType>(left_thunk)();
+    Task task;
+    while (!right_done.load(std::memory_order_acquire)) {
+      thread_pool->MaybeGetTask(&task);
+      if (task.f) task.f();
+    }
+  }
+
+  static Index ComputeMidpoint(Index start, Index end, Index granularity) {
+    // Typical workloads choose initial values of `{start, end, granularity}` such that `start - end` and
+    // `granularity` are powers of two. Since modern processors usually implement (2^x)-way
+    // set-associative caches, we minimize the number of cache misses by choosing midpoints that are not
+    // powers of two (to avoid having two addresses in the main memory pointing to the same point in the
+    // cache). More specifically, we choose the midpoint at (roughly) the 9/16 mark.
+    const Index size = end - start;
+    const Index offset = numext::round_down(9 * (size + 1) / 16, granularity);
+    return start + offset;
+  }
+
+  template <typename DoFnType, typename ThreadPoolEnv>
+  static void RunParallelFor(Index start, Index end, Index granularity, DoFnType&& do_func,
+                             ThreadPoolTempl<ThreadPoolEnv>* thread_pool) {
+    Index mid = ComputeMidpoint(start, end, granularity);
+    if ((end - start) < granularity || mid == start || mid == end) {
+      do_func(start, end);
+      return;
+    }
+    ForkJoin([start, mid, granularity, &do_func,
+              thread_pool]() { RunParallelFor(start, mid, granularity, do_func, thread_pool); },
+             [mid, end, granularity, &do_func, thread_pool]() {
+               RunParallelFor(mid, end, granularity, do_func, thread_pool);
+             },
+             thread_pool);
+  }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_THREADPOOL_FORKJOIN_H
diff --git a/inst/include/Eigen/src/ThreadPool/InternalHeaderCheck.h b/inst/include/Eigen/src/ThreadPool/InternalHeaderCheck.h
new file mode 100644
index 00000000..5b27ef43
--- /dev/null
+++ b/inst/include/Eigen/src/ThreadPool/InternalHeaderCheck.h
@@ -0,0 +1,4 @@
+#ifndef EIGEN_THREADPOOL_MODULE_H
+#error \
+    "Please include unsupported/Eigen/CXX11/ThreadPool instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/ThreadPool/NonBlockingThreadPool.h b/inst/include/Eigen/src/ThreadPool/NonBlockingThreadPool.h
new file mode 100644
index 00000000..44d4b243
--- /dev/null
+++ b/inst/include/Eigen/src/ThreadPool/NonBlockingThreadPool.h
@@ -0,0 +1,587 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
+#define EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+template <typename Environment>
+class ThreadPoolTempl : public Eigen::ThreadPoolInterface {
+ public:
+  typedef typename Environment::EnvThread Thread;
+  typedef typename Environment::Task Task;
+  typedef RunQueue<Task, 1024> Queue;
+
+  struct PerThread {
+    constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) {}
+    ThreadPoolTempl* pool;  // Parent pool, or null for normal threads.
+    uint64_t rand;          // Random generator state.
+    int thread_id;          // Worker thread index in pool.
+  };
+
+  struct ThreadData {
+    constexpr ThreadData() : thread(), steal_partition(0), queue() {}
+    std::unique_ptr<Thread> thread;
+    std::atomic<unsigned> steal_partition;
+    Queue queue;
+  };
+
+  ThreadPoolTempl(int num_threads, Environment env = Environment()) : ThreadPoolTempl(num_threads, true, env) {}
+
+  ThreadPoolTempl(int num_threads, bool allow_spinning, Environment env = Environment())
+      : env_(env),
+        num_threads_(num_threads),
+        allow_spinning_(allow_spinning),
+        spin_count_(
+            // TODO(dvyukov,rmlarsen): The time spent in NonEmptyQueueIndex() is proportional to num_threads_ and
+            // we assume that new work is scheduled at a constant rate, so we divide `kSpintCount` by number of
+            // threads and number of spinning threads. The constant was picked based on a fair dice roll, tune it.
+            allow_spinning && num_threads > 0 ? kSpinCount / kMaxSpinningThreads / num_threads : 0),
+        thread_data_(num_threads),
+        all_coprimes_(num_threads),
+        waiters_(num_threads),
+        global_steal_partition_(EncodePartition(0, num_threads_)),
+        spinning_state_(0),
+        blocked_(0),
+        done_(false),
+        cancelled_(false),
+        ec_(waiters_) {
+    waiters_.resize(num_threads_);
+    // Calculate coprimes of all numbers [1, num_threads].
+    // Coprimes are used for random walks over all threads in Steal
+    // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
+    // a random starting thread index t and calculate num_threads - 1 subsequent
+    // indices as (t + coprime) % num_threads, we will cover all threads without
+    // repetitions (effectively getting a presudo-random permutation of thread
+    // indices).
+    eigen_plain_assert(num_threads_ < kMaxThreads);
+    for (int i = 1; i <= num_threads_; ++i) {
+      all_coprimes_.emplace_back(i);
+      ComputeCoprimes(i, &all_coprimes_.back());
+    }
+#ifndef EIGEN_THREAD_LOCAL
+    init_barrier_.reset(new Barrier(num_threads_));
+#endif
+    thread_data_.resize(num_threads_);
+    for (int i = 0; i < num_threads_; i++) {
+      SetStealPartition(i, EncodePartition(0, num_threads_));
+      thread_data_[i].thread.reset(env_.CreateThread([this, i]() { WorkerLoop(i); }));
+    }
+#ifndef EIGEN_THREAD_LOCAL
+    // Wait for workers to initialize per_thread_map_. Otherwise we might race
+    // with them in Schedule or CurrentThreadId.
+    init_barrier_->Wait();
+#endif
+  }
+
+  ~ThreadPoolTempl() {
+    done_ = true;
+
+    // Now if all threads block without work, they will start exiting.
+    // But note that threads can continue to work arbitrary long,
+    // block, submit new work, unblock and otherwise live full life.
+    if (!cancelled_) {
+      ec_.Notify(true);
+    } else {
+      // Since we were cancelled, there might be entries in the queues.
+      // Empty them to prevent their destructor from asserting.
+      for (size_t i = 0; i < thread_data_.size(); i++) {
+        thread_data_[i].queue.Flush();
+      }
+    }
+    // Join threads explicitly (by destroying) to avoid destruction order within
+    // this class.
+    for (size_t i = 0; i < thread_data_.size(); ++i) thread_data_[i].thread.reset();
+  }
+
+  void SetStealPartitions(const std::vector<std::pair<unsigned, unsigned>>& partitions) {
+    eigen_plain_assert(partitions.size() == static_cast<std::size_t>(num_threads_));
+
+    // Pass this information to each thread queue.
+    for (int i = 0; i < num_threads_; i++) {
+      const auto& pair = partitions[i];
+      unsigned start = pair.first, end = pair.second;
+      AssertBounds(start, end);
+      unsigned val = EncodePartition(start, end);
+      SetStealPartition(i, val);
+    }
+  }
+
+  void Schedule(std::function<void()> fn) EIGEN_OVERRIDE { ScheduleWithHint(std::move(fn), 0, num_threads_); }
+
+  void ScheduleWithHint(std::function<void()> fn, int start, int limit) override {
+    Task t = env_.CreateTask(std::move(fn));
+    PerThread* pt = GetPerThread();
+    if (pt->pool == this) {
+      // Worker thread of this pool, push onto the thread's queue.
+      Queue& q = thread_data_[pt->thread_id].queue;
+      t = q.PushFront(std::move(t));
+    } else {
+      // A free-standing thread (or worker of another pool), push onto a random
+      // queue.
+      eigen_plain_assert(start < limit);
+      eigen_plain_assert(limit <= num_threads_);
+      int num_queues = limit - start;
+      int rnd = Rand(&pt->rand) % num_queues;
+      eigen_plain_assert(start + rnd < limit);
+      Queue& q = thread_data_[start + rnd].queue;
+      t = q.PushBack(std::move(t));
+    }
+    // Note: below we touch this after making w available to worker threads.
+    // Strictly speaking, this can lead to a racy-use-after-free. Consider that
+    // Schedule is called from a thread that is neither main thread nor a worker
+    // thread of this pool. Then, execution of w directly or indirectly
+    // completes overall computations, which in turn leads to destruction of
+    // this. We expect that such scenario is prevented by program, that is,
+    // this is kept alive while any threads can potentially be in Schedule.
+    if (!t.f) {
+      if (IsNotifyParkedThreadRequired()) {
+        ec_.Notify(false);
+      }
+    } else {
+      env_.ExecuteTask(t);  // Push failed, execute directly.
+    }
+  }
+
+  // Tries to assign work to the current task.
+  void MaybeGetTask(Task* t) {
+    PerThread* pt = GetPerThread();
+    const int thread_id = pt->thread_id;
+    // If we are not a worker thread of this pool, we can't get any work.
+    if (thread_id < 0) return;
+    Queue& q = thread_data_[thread_id].queue;
+    *t = q.PopFront();
+    if (t->f) return;
+    if (num_threads_ == 1) {
+      // For num_threads_ == 1 there is no point in going through the expensive
+      // steal loop. Moreover, since NonEmptyQueueIndex() calls PopBack() on the
+      // victim queues it might reverse the order in which ops are executed
+      // compared to the order in which they are scheduled, which tends to be
+      // counter-productive for the types of I/O workloads single thread pools
+      // tend to be used for.
+      for (int i = 0; i < spin_count_ && !t->f; ++i) *t = q.PopFront();
+    } else {
+      if (EIGEN_PREDICT_FALSE(!t->f)) *t = LocalSteal();
+      if (EIGEN_PREDICT_FALSE(!t->f)) *t = GlobalSteal();
+      if (EIGEN_PREDICT_FALSE(!t->f)) {
+        if (allow_spinning_ && StartSpinning()) {
+          for (int i = 0; i < spin_count_ && !t->f; ++i) *t = GlobalSteal();
+          // Notify `spinning_state_` that we are no longer spinning.
+          bool has_no_notify_task = StopSpinning();
+          // If a task was submitted to the queue without a call to
+          // `ec_.Notify()` (if `IsNotifyParkedThreadRequired()` returned
+          // false), and we didn't steal anything above, we must try to
+          // steal one more time, to make sure that this task will be
+          // executed. We will not necessarily find it, because it might
+          // have been already stolen by some other thread.
+          if (has_no_notify_task && !t->f) *t = GlobalSteal();
+        }
+      }
+    }
+  }
+
+  void Cancel() EIGEN_OVERRIDE {
+    cancelled_ = true;
+    done_ = true;
+
+    // Let each thread know it's been cancelled.
+#ifdef EIGEN_THREAD_ENV_SUPPORTS_CANCELLATION
+    for (size_t i = 0; i < thread_data_.size(); i++) {
+      thread_data_[i].thread->OnCancel();
+    }
+#endif
+
+    // Wake up the threads without work to let them exit on their own.
+    ec_.Notify(true);
+  }
+
+  int NumThreads() const EIGEN_FINAL { return num_threads_; }
+
+  int CurrentThreadId() const EIGEN_FINAL {
+    const PerThread* pt = const_cast<ThreadPoolTempl*>(this)->GetPerThread();
+    if (pt->pool == this) {
+      return pt->thread_id;
+    } else {
+      return -1;
+    }
+  }
+
+ private:
+  // Create a single atomic<int> that encodes start and limit information for
+  // each thread.
+  // We expect num_threads_ < 65536, so we can store them in a single
+  // std::atomic<unsigned>.
+  // Exposed publicly as static functions so that external callers can reuse
+  // this encode/decode logic for maintaining their own thread-safe copies of
+  // scheduling and steal domain(s).
+  static constexpr int kMaxPartitionBits = 16;
+  static constexpr int kMaxThreads = 1 << kMaxPartitionBits;
+
+  inline unsigned EncodePartition(unsigned start, unsigned limit) { return (start << kMaxPartitionBits) | limit; }
+
+  inline void DecodePartition(unsigned val, unsigned* start, unsigned* limit) {
+    *limit = val & (kMaxThreads - 1);
+    val >>= kMaxPartitionBits;
+    *start = val;
+  }
+
+  void AssertBounds(int start, int end) {
+    eigen_plain_assert(start >= 0);
+    eigen_plain_assert(start < end);  // non-zero sized partition
+    eigen_plain_assert(end <= num_threads_);
+  }
+
+  inline void SetStealPartition(size_t i, unsigned val) {
+    thread_data_[i].steal_partition.store(val, std::memory_order_relaxed);
+  }
+
+  inline unsigned GetStealPartition(int i) { return thread_data_[i].steal_partition.load(std::memory_order_relaxed); }
+
+  void ComputeCoprimes(int N, MaxSizeVector<unsigned>* coprimes) {
+    for (int i = 1; i <= N; i++) {
+      unsigned a = i;
+      unsigned b = N;
+      // If GCD(a, b) == 1, then a and b are coprimes.
+      while (b != 0) {
+        unsigned tmp = a;
+        a = b;
+        b = tmp % b;
+      }
+      if (a == 1) {
+        coprimes->push_back(i);
+      }
+    }
+  }
+
+  // Maximum number of threads that can spin in steal loop.
+  static constexpr int kMaxSpinningThreads = 1;
+
+  // The number of steal loop spin iterations before parking (this number is
+  // divided by the number of threads, to get spin count for each thread).
+  static constexpr int kSpinCount = 5000;
+
+  // If there are enough active threads with empty pending-task queues, a thread
+  // that runs out of work can just be parked without spinning, because these
+  // active threads will go into a steal loop after finishing their current
+  // tasks.
+  //
+  // In the worst case when all active threads are executing long/expensive
+  // tasks, the next Schedule() will have to wait until one of the parked
+  // threads will be unparked, however this should be very rare in practice.
+  static constexpr int kMinActiveThreadsToStartSpinning = 4;
+
+  struct SpinningState {
+    // Spinning state layout:
+    //
+    // - Low 32 bits encode the number of threads that are spinning in steal
+    //   loop.
+    //
+    // - High 32 bits encode the number of tasks that were submitted to the pool
+    //   without a call to `ec_.Notify()`. This number can't be larger than
+    //   the number of spinning threads. Each spinning thread, when it exits the
+    //   spin loop must check if this number is greater than zero, and maybe
+    //   make another attempt to steal a task and decrement it by one.
+    static constexpr uint64_t kNumSpinningMask = 0x00000000FFFFFFFF;
+    static constexpr uint64_t kNumNoNotifyMask = 0xFFFFFFFF00000000;
+    static constexpr uint64_t kNumNoNotifyShift = 32;
+
+    uint64_t num_spinning;         // number of spinning threads
+    uint64_t num_no_notification;  // number of tasks submitted without
+                                   // notifying waiting threads
+
+    // Decodes `spinning_state_` value.
+    static SpinningState Decode(uint64_t state) {
+      uint64_t num_spinning = (state & kNumSpinningMask);
+      uint64_t num_no_notification = (state & kNumNoNotifyMask) >> kNumNoNotifyShift;
+
+      eigen_plain_assert(num_no_notification <= num_spinning);
+      return {num_spinning, num_no_notification};
+    }
+
+    // Encodes as `spinning_state_` value.
+    uint64_t Encode() const {
+      eigen_plain_assert(num_no_notification <= num_spinning);
+      return (num_no_notification << kNumNoNotifyShift) | num_spinning;
+    }
+  };
+
+  Environment env_;
+  const int num_threads_;
+  const bool allow_spinning_;
+  const int spin_count_;
+  MaxSizeVector<ThreadData> thread_data_;
+  MaxSizeVector<MaxSizeVector<unsigned>> all_coprimes_;
+  MaxSizeVector<EventCount::Waiter> waiters_;
+  unsigned global_steal_partition_;
+  std::atomic<uint64_t> spinning_state_;
+  std::atomic<unsigned> blocked_;
+  std::atomic<bool> done_;
+  std::atomic<bool> cancelled_;
+  EventCount ec_;
+#ifndef EIGEN_THREAD_LOCAL
+  std::unique_ptr<Barrier> init_barrier_;
+  EIGEN_MUTEX per_thread_map_mutex_;  // Protects per_thread_map_.
+  std::unordered_map<uint64_t, std::unique_ptr<PerThread>> per_thread_map_;
+#endif
+
+  unsigned NumBlockedThreads() const { return blocked_.load(); }
+  unsigned NumActiveThreads() const { return num_threads_ - blocked_.load(); }
+
+  // Main worker thread loop.
+  void WorkerLoop(int thread_id) {
+#ifndef EIGEN_THREAD_LOCAL
+    std::unique_ptr<PerThread> new_pt(new PerThread());
+    per_thread_map_mutex_.lock();
+    bool insertOK = per_thread_map_.emplace(GlobalThreadIdHash(), std::move(new_pt)).second;
+    eigen_plain_assert(insertOK);
+    EIGEN_UNUSED_VARIABLE(insertOK);
+    per_thread_map_mutex_.unlock();
+    init_barrier_->Notify();
+    init_barrier_->Wait();
+#endif
+    PerThread* pt = GetPerThread();
+    pt->pool = this;
+    pt->rand = GlobalThreadIdHash();
+    pt->thread_id = thread_id;
+    Task t;
+    while (!cancelled_.load(std::memory_order_relaxed)) {
+      MaybeGetTask(&t);
+      // If we still don't have a task, wait for one. Return if thread pool is
+      // in cancelled state.
+      if (EIGEN_PREDICT_FALSE(!t.f)) {
+        EventCount::Waiter* waiter = &waiters_[pt->thread_id];
+        if (!WaitForWork(waiter, &t)) return;
+      }
+      if (EIGEN_PREDICT_TRUE(t.f)) env_.ExecuteTask(t);
+    }
+  }
+
+  // Steal tries to steal work from other worker threads in the range [start,
+  // limit) in best-effort manner.
+  Task Steal(unsigned start, unsigned limit) {
+    PerThread* pt = GetPerThread();
+    const size_t size = limit - start;
+    unsigned r = Rand(&pt->rand);
+    // Reduce r into [0, size) range, this utilizes trick from
+    // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
+    eigen_plain_assert(all_coprimes_[size - 1].size() < (1 << 30));
+    unsigned victim = ((uint64_t)r * (uint64_t)size) >> 32;
+    unsigned index = ((uint64_t)all_coprimes_[size - 1].size() * (uint64_t)r) >> 32;
+    unsigned inc = all_coprimes_[size - 1][index];
+
+    for (unsigned i = 0; i < size; i++) {
+      eigen_plain_assert(start + victim < limit);
+      Task t = thread_data_[start + victim].queue.PopBack();
+      if (t.f) {
+        return t;
+      }
+      victim += inc;
+      if (victim >= size) {
+        victim -= static_cast<unsigned int>(size);
+      }
+    }
+    return Task();
+  }
+
+  // Steals work within threads belonging to the partition.
+  Task LocalSteal() {
+    PerThread* pt = GetPerThread();
+    unsigned partition = GetStealPartition(pt->thread_id);
+    // If thread steal partition is the same as global partition, there is no
+    // need to go through the steal loop twice.
+    if (global_steal_partition_ == partition) return Task();
+    unsigned start, limit;
+    DecodePartition(partition, &start, &limit);
+    AssertBounds(start, limit);
+
+    return Steal(start, limit);
+  }
+
+  // Steals work from any other thread in the pool.
+  Task GlobalSteal() { return Steal(0, num_threads_); }
+
+  // WaitForWork blocks until new work is available (returns true), or if it is
+  // time to exit (returns false). Can optionally return a task to execute in t
+  // (in such case t.f != nullptr on return).
+  bool WaitForWork(EventCount::Waiter* waiter, Task* t) {
+    eigen_plain_assert(!t->f);
+    // We already did best-effort emptiness check in Steal, so prepare for
+    // blocking.
+    ec_.Prewait();
+    // Now do a reliable emptiness check.
+    int victim = NonEmptyQueueIndex();
+    if (victim != -1) {
+      ec_.CancelWait();
+      if (cancelled_) {
+        return false;
+      } else {
+        *t = thread_data_[victim].queue.PopBack();
+        return true;
+      }
+    }
+    // Number of blocked threads is used as termination condition.
+    // If we are shutting down and all worker threads blocked without work,
+    // that's we are done.
+    blocked_++;
+    // TODO is blocked_ required to be unsigned?
+    if (done_ && blocked_ == static_cast<unsigned>(num_threads_)) {
+      ec_.CancelWait();
+      // Almost done, but need to re-check queues.
+      // Consider that all queues are empty and all worker threads are preempted
+      // right after incrementing blocked_ above. Now a free-standing thread
+      // submits work and calls destructor (which sets done_). If we don't
+      // re-check queues, we will exit leaving the work unexecuted.
+      if (NonEmptyQueueIndex() != -1) {
+        // Note: we must not pop from queues before we decrement blocked_,
+        // otherwise the following scenario is possible. Consider that instead
+        // of checking for emptiness we popped the only element from queues.
+        // Now other worker threads can start exiting, which is bad if the
+        // work item submits other work. So we just check emptiness here,
+        // which ensures that all worker threads exit at the same time.
+        blocked_--;
+        return true;
+      }
+      // Reached stable termination state.
+      ec_.Notify(true);
+      return false;
+    }
+    ec_.CommitWait(waiter);
+    blocked_--;
+    return true;
+  }
+
+  int NonEmptyQueueIndex() {
+    PerThread* pt = GetPerThread();
+    // We intentionally design NonEmptyQueueIndex to steal work from
+    // anywhere in the queue so threads don't block in WaitForWork() forever
+    // when all threads in their partition go to sleep. Steal is still local.
+    const size_t size = thread_data_.size();
+    unsigned r = Rand(&pt->rand);
+    unsigned inc = all_coprimes_[size - 1][r % all_coprimes_[size - 1].size()];
+    unsigned victim = r % size;
+    for (unsigned i = 0; i < size; i++) {
+      if (!thread_data_[victim].queue.Empty()) {
+        return victim;
+      }
+      victim += inc;
+      if (victim >= size) {
+        victim -= static_cast<unsigned int>(size);
+      }
+    }
+    return -1;
+  }
+
+  // StartSpinning() checks if the number of threads in the spin loop is less
+  // than the allowed maximum. If so, increments the number of spinning threads
+  // by one and returns true (caller must enter the spin loop). Otherwise
+  // returns false, and the caller must not enter the spin loop.
+  bool StartSpinning() {
+    if (NumActiveThreads() > kMinActiveThreadsToStartSpinning) return false;
+
+    uint64_t spinning = spinning_state_.load(std::memory_order_relaxed);
+    for (;;) {
+      SpinningState state = SpinningState::Decode(spinning);
+
+      if ((state.num_spinning - state.num_no_notification) >= kMaxSpinningThreads) {
+        return false;
+      }
+
+      // Increment the number of spinning threads.
+      ++state.num_spinning;
+
+      if (spinning_state_.compare_exchange_weak(spinning, state.Encode(), std::memory_order_relaxed)) {
+        return true;
+      }
+    }
+  }
+
+  // StopSpinning() decrements the number of spinning threads by one. It also
+  // checks if there were any tasks submitted into the pool without notifying
+  // parked threads, and decrements the count by one. Returns true if the number
+  // of tasks submitted without notification was decremented. In this case,
+  // caller thread might have to call Steal() one more time.
+  bool StopSpinning() {
+    uint64_t spinning = spinning_state_.load(std::memory_order_relaxed);
+    for (;;) {
+      SpinningState state = SpinningState::Decode(spinning);
+
+      // Decrement the number of spinning threads.
+      --state.num_spinning;
+
+      // Maybe decrement the number of tasks submitted without notification.
+      bool has_no_notify_task = state.num_no_notification > 0;
+      if (has_no_notify_task) --state.num_no_notification;
+
+      if (spinning_state_.compare_exchange_weak(spinning, state.Encode(), std::memory_order_relaxed)) {
+        return has_no_notify_task;
+      }
+    }
+  }
+
+  // IsNotifyParkedThreadRequired() returns true if parked thread must be
+  // notified about new added task. If there are threads spinning in the steal
+  // loop, there is no need to unpark any of the waiting threads, the task will
+  // be picked up by one of the spinning threads.
+  bool IsNotifyParkedThreadRequired() {
+    uint64_t spinning = spinning_state_.load(std::memory_order_relaxed);
+    for (;;) {
+      SpinningState state = SpinningState::Decode(spinning);
+
+      // If the number of tasks submitted without notifying parked threads is
+      // equal to the number of spinning threads, we must wake up one of the
+      // parked threads.
+      if (state.num_no_notification == state.num_spinning) return true;
+
+      // Increment the number of tasks submitted without notification.
+      ++state.num_no_notification;
+
+      if (spinning_state_.compare_exchange_weak(spinning, state.Encode(), std::memory_order_relaxed)) {
+        return false;
+      }
+    }
+  }
+
+  static EIGEN_STRONG_INLINE uint64_t GlobalThreadIdHash() {
+    return std::hash<std::thread::id>()(std::this_thread::get_id());
+  }
+
+  EIGEN_STRONG_INLINE PerThread* GetPerThread() {
+#ifndef EIGEN_THREAD_LOCAL
+    static PerThread dummy;
+    auto it = per_thread_map_.find(GlobalThreadIdHash());
+    if (it == per_thread_map_.end()) {
+      return &dummy;
+    } else {
+      return it->second.get();
+    }
+#else
+    EIGEN_THREAD_LOCAL PerThread per_thread_;
+    PerThread* pt = &per_thread_;
+    return pt;
+#endif
+  }
+
+  static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) {
+    uint64_t current = *state;
+    // Update the internal state
+    *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+    // Generate the random output (using the PCG-XSH-RS scheme)
+    return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
+  }
+};
+
+typedef ThreadPoolTempl<StlThreadEnvironment> ThreadPool;
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_NONBLOCKING_THREAD_POOL_H
diff --git a/inst/include/Eigen/src/ThreadPool/RunQueue.h b/inst/include/Eigen/src/ThreadPool/RunQueue.h
new file mode 100644
index 00000000..9046b180
--- /dev/null
+++ b/inst/include/Eigen/src/ThreadPool/RunQueue.h
@@ -0,0 +1,230 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Dmitry Vyukov <dvyukov@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_RUNQUEUE_H
+#define EIGEN_CXX11_THREADPOOL_RUNQUEUE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// RunQueue is a fixed-size, partially non-blocking deque or Work items.
+// Operations on front of the queue must be done by a single thread (owner),
+// operations on back of the queue can be done by multiple threads concurrently.
+//
+// Algorithm outline:
+// All remote threads operating on the queue back are serialized by a mutex.
+// This ensures that at most two threads access state: owner and one remote
+// thread (Size aside). The algorithm ensures that the occupied region of the
+// underlying array is logically continuous (can wraparound, but no stray
+// occupied elements). Owner operates on one end of this region, remote thread
+// operates on the other end. Synchronization between these threads
+// (potential consumption of the last element and take up of the last empty
+// element) happens by means of state variable in each element. States are:
+// empty, busy (in process of insertion of removal) and ready. Threads claim
+// elements (empty->busy and ready->busy transitions) by means of a CAS
+// operation. The finishing transition (busy->empty and busy->ready) are done
+// with plain store as the element is exclusively owned by the current thread.
+//
+// Note: we could permit only pointers as elements, then we would not need
+// separate state variable as null/non-null pointer value would serve as state,
+// but that would require malloc/free per operation for large, complex values
+// (and this is designed to store std::function<()>).
+template <typename Work, unsigned kSize>
+class RunQueue {
+ public:
+  RunQueue() : front_(0), back_(0) {
+    // require power-of-two for fast masking
+    eigen_plain_assert((kSize & (kSize - 1)) == 0);
+    eigen_plain_assert(kSize > 2);            // why would you do this?
+    eigen_plain_assert(kSize <= (64 << 10));  // leave enough space for counter
+    for (unsigned i = 0; i < kSize; i++) array_[i].state.store(kEmpty, std::memory_order_relaxed);
+  }
+
+  ~RunQueue() { eigen_plain_assert(Size() == 0); }
+
+  // PushFront inserts w at the beginning of the queue.
+  // If queue is full returns w, otherwise returns default-constructed Work.
+  Work PushFront(Work w) {
+    unsigned front = front_.load(std::memory_order_relaxed);
+    Elem* e = &array_[front & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kEmpty || !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) return w;
+    front_.store(front + 1 + (kSize << 1), std::memory_order_relaxed);
+    e->w = std::move(w);
+    e->state.store(kReady, std::memory_order_release);
+    return Work();
+  }
+
+  // PopFront removes and returns the first element in the queue.
+  // If the queue was empty returns default-constructed Work.
+  Work PopFront() {
+    unsigned front = front_.load(std::memory_order_relaxed);
+    Elem* e = &array_[(front - 1) & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kReady || !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) return Work();
+    Work w = std::move(e->w);
+    e->state.store(kEmpty, std::memory_order_release);
+    front = ((front - 1) & kMask2) | (front & ~kMask2);
+    front_.store(front, std::memory_order_relaxed);
+    return w;
+  }
+
+  // PushBack adds w at the end of the queue.
+  // If queue is full returns w, otherwise returns default-constructed Work.
+  Work PushBack(Work w) {
+    EIGEN_MUTEX_LOCK lock(mutex_);
+    unsigned back = back_.load(std::memory_order_relaxed);
+    Elem* e = &array_[(back - 1) & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kEmpty || !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) return w;
+    back = ((back - 1) & kMask2) | (back & ~kMask2);
+    back_.store(back, std::memory_order_relaxed);
+    e->w = std::move(w);
+    e->state.store(kReady, std::memory_order_release);
+    return Work();
+  }
+
+  // PopBack removes and returns the last elements in the queue.
+  Work PopBack() {
+    if (Empty()) return Work();
+    EIGEN_MUTEX_LOCK lock(mutex_);
+    unsigned back = back_.load(std::memory_order_relaxed);
+    Elem* e = &array_[back & kMask];
+    uint8_t s = e->state.load(std::memory_order_relaxed);
+    if (s != kReady || !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) return Work();
+    Work w = std::move(e->w);
+    e->state.store(kEmpty, std::memory_order_release);
+    back_.store(back + 1 + (kSize << 1), std::memory_order_relaxed);
+    return w;
+  }
+
+  // PopBackHalf removes and returns half last elements in the queue.
+  // Returns number of elements removed.
+  unsigned PopBackHalf(std::vector<Work>* result) {
+    if (Empty()) return 0;
+    EIGEN_MUTEX_LOCK lock(mutex_);
+    unsigned back = back_.load(std::memory_order_relaxed);
+    unsigned size = Size();
+    unsigned mid = back;
+    if (size > 1) mid = back + (size - 1) / 2;
+    unsigned n = 0;
+    unsigned start = 0;
+    for (; static_cast<int>(mid - back) >= 0; mid--) {
+      Elem* e = &array_[mid & kMask];
+      uint8_t s = e->state.load(std::memory_order_relaxed);
+      if (n == 0) {
+        if (s != kReady || !e->state.compare_exchange_strong(s, kBusy, std::memory_order_acquire)) continue;
+        start = mid;
+      } else {
+        // Note: no need to store temporal kBusy, we exclusively own these
+        // elements.
+        eigen_plain_assert(s == kReady);
+      }
+      result->push_back(std::move(e->w));
+      e->state.store(kEmpty, std::memory_order_release);
+      n++;
+    }
+    if (n != 0) back_.store(start + 1 + (kSize << 1), std::memory_order_relaxed);
+    return n;
+  }
+
+  // Size returns current queue size.
+  // Can be called by any thread at any time.
+  unsigned Size() const { return SizeOrNotEmpty<true>(); }
+
+  // Empty tests whether container is empty.
+  // Can be called by any thread at any time.
+  bool Empty() const { return SizeOrNotEmpty<false>() == 0; }
+
+  // Delete all the elements from the queue.
+  void Flush() {
+    while (!Empty()) {
+      PopFront();
+    }
+  }
+
+ private:
+  static const unsigned kMask = kSize - 1;
+  static const unsigned kMask2 = (kSize << 1) - 1;
+
+  enum State {
+    kEmpty,
+    kBusy,
+    kReady,
+  };
+
+  struct Elem {
+    std::atomic<uint8_t> state;
+    Work w;
+  };
+
+  // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of
+  // front/back, respectively. The remaining bits contain modification counters
+  // that are incremented on Push operations. This allows us to (1) distinguish
+  // between empty and full conditions (if we would use log(kSize) bits for
+  // position, these conditions would be indistinguishable); (2) obtain
+  // consistent snapshot of front_/back_ for Size operation using the
+  // modification counters.
+  EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> front_;
+  EIGEN_ALIGN_TO_AVOID_FALSE_SHARING std::atomic<unsigned> back_;
+  EIGEN_MUTEX mutex_;  // guards `PushBack` and `PopBack` (accesses `back_`)
+
+  EIGEN_ALIGN_TO_AVOID_FALSE_SHARING Elem array_[kSize];
+
+  // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false,
+  // only whether the size is 0 is guaranteed to be correct.
+  // Can be called by any thread at any time.
+  template <bool NeedSizeEstimate>
+  unsigned SizeOrNotEmpty() const {
+    // Emptiness plays critical role in thread pool blocking. So we go to great
+    // effort to not produce false positives (claim non-empty queue as empty).
+    unsigned front = front_.load(std::memory_order_acquire);
+    for (;;) {
+      // Capture a consistent snapshot of front/tail.
+      unsigned back = back_.load(std::memory_order_acquire);
+      unsigned front1 = front_.load(std::memory_order_relaxed);
+      if (front != front1) {
+        front = front1;
+        std::atomic_thread_fence(std::memory_order_acquire);
+        continue;
+      }
+      if (NeedSizeEstimate) {
+        return CalculateSize(front, back);
+      } else {
+        // This value will be 0 if the queue is empty, and undefined otherwise.
+        unsigned maybe_zero = ((front ^ back) & kMask2);
+        // Queue size estimate must agree with maybe zero check on the queue
+        // empty/non-empty state.
+        eigen_assert((CalculateSize(front, back) == 0) == (maybe_zero == 0));
+        return maybe_zero;
+      }
+    }
+  }
+
+  EIGEN_ALWAYS_INLINE unsigned CalculateSize(unsigned front, unsigned back) const {
+    int size = (front & kMask2) - (back & kMask2);
+    // Fix overflow.
+    if (EIGEN_PREDICT_FALSE(size < 0)) size += 2 * kSize;
+    // Order of modification in push/pop is crafted to make the queue look
+    // larger than it is during concurrent modifications. E.g. push can
+    // increment size before the corresponding pop has decremented it.
+    // So the computed size can be up to kSize + 1, fix it.
+    if (EIGEN_PREDICT_FALSE(size > static_cast<int>(kSize))) size = kSize;
+    return static_cast<unsigned>(size);
+  }
+
+  RunQueue(const RunQueue&) = delete;
+  void operator=(const RunQueue&) = delete;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_RUNQUEUE_H
diff --git a/inst/include/Eigen/src/ThreadPool/ThreadCancel.h b/inst/include/Eigen/src/ThreadPool/ThreadCancel.h
new file mode 100644
index 00000000..6f4dc67d
--- /dev/null
+++ b/inst/include/Eigen/src/ThreadPool/ThreadCancel.h
@@ -0,0 +1,21 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
+
+// Try to come up with a portable way to cancel a thread
+#if EIGEN_OS_GNULINUX
+#define EIGEN_THREAD_CANCEL(t) pthread_cancel(t.native_handle());
+#define EIGEN_SUPPORTS_THREAD_CANCELLATION 1
+#else
+#define EIGEN_THREAD_CANCEL(t)
+#endif
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_CANCEL_H
diff --git a/inst/include/Eigen/src/ThreadPool/ThreadEnvironment.h b/inst/include/Eigen/src/ThreadPool/ThreadEnvironment.h
new file mode 100644
index 00000000..b4dd05c7
--- /dev/null
+++ b/inst/include/Eigen/src/ThreadPool/ThreadEnvironment.h
@@ -0,0 +1,43 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+struct StlThreadEnvironment {
+  struct Task {
+    std::function<void()> f;
+  };
+
+  // EnvThread constructor must start the thread,
+  // destructor must join the thread.
+  class EnvThread {
+   public:
+    EnvThread(std::function<void()> f) : thr_(std::move(f)) {}
+    ~EnvThread() { thr_.join(); }
+    // This function is called when the threadpool is cancelled.
+    void OnCancel() {}
+
+   private:
+    std::thread thr_;
+  };
+
+  EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(std::move(f)); }
+  Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; }
+  void ExecuteTask(const Task& t) { t.f(); }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_ENVIRONMENT_H
diff --git a/inst/include/Eigen/src/ThreadPool/ThreadLocal.h b/inst/include/Eigen/src/ThreadPool/ThreadLocal.h
new file mode 100644
index 00000000..aa0bd108
--- /dev/null
+++ b/inst/include/Eigen/src/ThreadPool/ThreadLocal.h
@@ -0,0 +1,289 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
+
+#ifdef EIGEN_AVOID_THREAD_LOCAL
+
+#ifdef EIGEN_THREAD_LOCAL
+#undef EIGEN_THREAD_LOCAL
+#endif
+
+#else
+
+#if ((EIGEN_COMP_GNUC) || __has_feature(cxx_thread_local) || EIGEN_COMP_MSVC)
+#define EIGEN_THREAD_LOCAL static thread_local
+#endif
+
+// Disable TLS for Apple and Android builds with older toolchains.
+#if defined(__APPLE__)
+// Included for TARGET_OS_IPHONE, __IPHONE_OS_VERSION_MIN_REQUIRED,
+// __IPHONE_8_0.
+#include <Availability.h>
+#include <TargetConditionals.h>
+#endif
+// Checks whether C++11's `thread_local` storage duration specifier is
+// supported.
+#if EIGEN_COMP_CLANGAPPLE && \
+    ((EIGEN_COMP_CLANGAPPLE < 8000042) || (TARGET_OS_IPHONE && __IPHONE_OS_VERSION_MIN_REQUIRED < __IPHONE_9_0))
+// Notes: Xcode's clang did not support `thread_local` until version
+// 8, and even then not for all iOS < 9.0.
+#undef EIGEN_THREAD_LOCAL
+
+#elif defined(__ANDROID__) && EIGEN_COMP_CLANG
+// There are platforms for which TLS should not be used even though the compiler
+// makes it seem like it's supported (Android NDK < r12b for example).
+// This is primarily because of linker problems and toolchain misconfiguration:
+// TLS isn't supported until NDK r12b per
+// https://developer.android.com/ndk/downloads/revision_history.html
+
+#if defined(__ANDROID__) && defined(__clang__) && defined(__NDK_MAJOR__) && defined(__NDK_MINOR__) && \
+    ((__NDK_MAJOR__ < 12) || ((__NDK_MAJOR__ == 12) && (__NDK_MINOR__ < 1)))
+#undef EIGEN_THREAD_LOCAL
+#endif
+#endif  // defined(__ANDROID__) && defined(__clang__)
+
+#endif  // EIGEN_AVOID_THREAD_LOCAL
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename T>
+struct ThreadLocalNoOpInitialize {
+  void operator()(T&) const {}
+};
+
+template <typename T>
+struct ThreadLocalNoOpRelease {
+  void operator()(T&) const {}
+};
+
+}  // namespace internal
+
+// Thread local container for elements of type T, that does not use thread local
+// storage. As long as the number of unique threads accessing this storage
+// is smaller than `capacity_`, it is lock-free and wait-free. Otherwise it will
+// use a mutex for synchronization.
+//
+// Type `T` has to be default constructible, and by default each thread will get
+// a default constructed value. It is possible to specify custom `initialize`
+// callable, that will be called lazily from each thread accessing this object,
+// and will be passed a default initialized object of type `T`. Also it's
+// possible to pass a custom `release` callable, that will be invoked before
+// calling ~T().
+//
+// Example:
+//
+//   struct Counter {
+//     int value = 0;
+//   }
+//
+//   Eigen::ThreadLocal<Counter> counter(10);
+//
+//   // Each thread will have access to it's own counter object.
+//   Counter& cnt = counter.local();
+//   cnt++;
+//
+// WARNING: Eigen::ThreadLocal uses the OS-specific value returned by
+// std::this_thread::get_id() to identify threads. This value is not guaranteed
+// to be unique except for the life of the thread. A newly created thread may
+// get an OS-specific ID equal to that of an already destroyed thread.
+//
+// Somewhat similar to TBB thread local storage, with similar restrictions:
+// https://www.threadingbuildingblocks.org/docs/help/reference/thread_local_storage/enumerable_thread_specific_cls.html
+//
+template <typename T, typename Initialize = internal::ThreadLocalNoOpInitialize<T>,
+          typename Release = internal::ThreadLocalNoOpRelease<T>>
+class ThreadLocal {
+  // We preallocate default constructed elements in MaxSizedVector.
+  static_assert(std::is_default_constructible<T>::value, "ThreadLocal data type must be default constructible");
+
+ public:
+  explicit ThreadLocal(int capacity)
+      : ThreadLocal(capacity, internal::ThreadLocalNoOpInitialize<T>(), internal::ThreadLocalNoOpRelease<T>()) {}
+
+  ThreadLocal(int capacity, Initialize initialize)
+      : ThreadLocal(capacity, std::move(initialize), internal::ThreadLocalNoOpRelease<T>()) {}
+
+  ThreadLocal(int capacity, Initialize initialize, Release release)
+      : initialize_(std::move(initialize)),
+        release_(std::move(release)),
+        capacity_(capacity),
+        data_(capacity_),
+        ptr_(capacity_),
+        filled_records_(0) {
+    eigen_assert(capacity_ >= 0);
+    data_.resize(capacity_);
+    for (int i = 0; i < capacity_; ++i) {
+      ptr_.emplace_back(nullptr);
+    }
+  }
+
+  T& local() {
+    std::thread::id this_thread = std::this_thread::get_id();
+    if (capacity_ == 0) return SpilledLocal(this_thread);
+
+    std::size_t h = std::hash<std::thread::id>()(this_thread);
+    const int start_idx = h % capacity_;
+
+    // NOTE: From the definition of `std::this_thread::get_id()` it is
+    // guaranteed that we never can have concurrent insertions with the same key
+    // to our hash-map like data structure. If we didn't find an element during
+    // the initial traversal, it's guaranteed that no one else could have
+    // inserted it while we are in this function. This allows to massively
+    // simplify out lock-free insert-only hash map.
+
+    // Check if we already have an element for `this_thread`.
+    int idx = start_idx;
+    while (ptr_[idx].load() != nullptr) {
+      ThreadIdAndValue& record = *(ptr_[idx].load());
+      if (record.thread_id == this_thread) return record.value;
+
+      idx += 1;
+      if (idx >= capacity_) idx -= capacity_;
+      if (idx == start_idx) break;
+    }
+
+    // If we are here, it means that we found an insertion point in lookup
+    // table at `idx`, or we did a full traversal and table is full.
+
+    // If lock-free storage is full, fallback on mutex.
+    if (filled_records_.load() >= capacity_) return SpilledLocal(this_thread);
+
+    // We double check that we still have space to insert an element into a lock
+    // free storage. If old value in `filled_records_` is larger than the
+    // records capacity, it means that some other thread added an element while
+    // we were traversing lookup table.
+    int insertion_index = filled_records_.fetch_add(1, std::memory_order_relaxed);
+    if (insertion_index >= capacity_) return SpilledLocal(this_thread);
+
+    // At this point it's guaranteed that we can access to
+    // data_[insertion_index_] without a data race.
+    data_[insertion_index].thread_id = this_thread;
+    initialize_(data_[insertion_index].value);
+
+    // That's the pointer we'll put into the lookup table.
+    ThreadIdAndValue* inserted = &data_[insertion_index];
+
+    // We'll use nullptr pointer to ThreadIdAndValue in a compare-and-swap loop.
+    ThreadIdAndValue* empty = nullptr;
+
+    // Now we have to find an insertion point into the lookup table. We start
+    // from the `idx` that was identified as an insertion point above, it's
+    // guaranteed that we will have an empty record somewhere in a lookup table
+    // (because we created a record in the `data_`).
+    const int insertion_idx = idx;
+
+    do {
+      // Always start search from the original insertion candidate.
+      idx = insertion_idx;
+      while (ptr_[idx].load() != nullptr) {
+        idx += 1;
+        if (idx >= capacity_) idx -= capacity_;
+        // If we did a full loop, it means that we don't have any free entries
+        // in the lookup table, and this means that something is terribly wrong.
+        eigen_assert(idx != insertion_idx);
+      }
+      // Atomic CAS of the pointer guarantees that any other thread, that will
+      // follow this pointer will see all the mutations in the `data_`.
+    } while (!ptr_[idx].compare_exchange_weak(empty, inserted));
+
+    return inserted->value;
+  }
+
+  // WARN: It's not thread safe to call it concurrently with `local()`.
+  void ForEach(std::function<void(std::thread::id, T&)> f) {
+    // Reading directly from `data_` is unsafe, because only CAS to the
+    // record in `ptr_` makes all changes visible to other threads.
+    for (auto& ptr : ptr_) {
+      ThreadIdAndValue* record = ptr.load();
+      if (record == nullptr) continue;
+      f(record->thread_id, record->value);
+    }
+
+    // We did not spill into the map based storage.
+    if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
+
+    // Adds a happens before edge from the last call to SpilledLocal().
+    EIGEN_MUTEX_LOCK lock(mu_);
+    for (auto& kv : per_thread_map_) {
+      f(kv.first, kv.second);
+    }
+  }
+
+  // WARN: It's not thread safe to call it concurrently with `local()`.
+  ~ThreadLocal() {
+    // Reading directly from `data_` is unsafe, because only CAS to the record
+    // in `ptr_` makes all changes visible to other threads.
+    for (auto& ptr : ptr_) {
+      ThreadIdAndValue* record = ptr.load();
+      if (record == nullptr) continue;
+      release_(record->value);
+    }
+
+    // We did not spill into the map based storage.
+    if (filled_records_.load(std::memory_order_relaxed) < capacity_) return;
+
+    // Adds a happens before edge from the last call to SpilledLocal().
+    EIGEN_MUTEX_LOCK lock(mu_);
+    for (auto& kv : per_thread_map_) {
+      release_(kv.second);
+    }
+  }
+
+ private:
+  struct ThreadIdAndValue {
+    std::thread::id thread_id;
+    T value;
+  };
+
+  // Use unordered map guarded by a mutex when lock free storage is full.
+  T& SpilledLocal(std::thread::id this_thread) {
+    EIGEN_MUTEX_LOCK lock(mu_);
+
+    auto it = per_thread_map_.find(this_thread);
+    if (it == per_thread_map_.end()) {
+      auto result = per_thread_map_.emplace(this_thread, T());
+      eigen_assert(result.second);
+      initialize_((*result.first).second);
+      return (*result.first).second;
+    } else {
+      return it->second;
+    }
+  }
+
+  Initialize initialize_;
+  Release release_;
+  const int capacity_;
+
+  // Storage that backs lock-free lookup table `ptr_`. Records stored in this
+  // storage contiguously starting from index 0.
+  MaxSizeVector<ThreadIdAndValue> data_;
+
+  // Atomic pointers to the data stored in `data_`. Used as a lookup table for
+  // linear probing hash map (https://en.wikipedia.org/wiki/Linear_probing).
+  MaxSizeVector<std::atomic<ThreadIdAndValue*>> ptr_;
+
+  // Number of records stored in the `data_`.
+  std::atomic<int> filled_records_;
+
+  // We fallback on per thread map if lock-free storage is full. In practice
+  // this should never happen, if `capacity_` is a reasonable estimate of the
+  // number of threads running in a system.
+  EIGEN_MUTEX mu_;  // Protects per_thread_map_.
+  std::unordered_map<std::thread::id, T> per_thread_map_;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_LOCAL_H
diff --git a/inst/include/Eigen/src/ThreadPool/ThreadPoolInterface.h b/inst/include/Eigen/src/ThreadPool/ThreadPoolInterface.h
new file mode 100644
index 00000000..d07c03e8
--- /dev/null
+++ b/inst/include/Eigen/src/ThreadPool/ThreadPoolInterface.h
@@ -0,0 +1,50 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// This defines an interface that ThreadPoolDevice can take to use
+// custom thread pools underneath.
+class ThreadPoolInterface {
+ public:
+  // Submits a closure to be run by a thread in the pool.
+  virtual void Schedule(std::function<void()> fn) = 0;
+
+  // Submits a closure to be run by threads in the range [start, end) in the
+  // pool.
+  virtual void ScheduleWithHint(std::function<void()> fn, int /*start*/, int /*end*/) {
+    // Just defer to Schedule in case sub-classes aren't interested in
+    // overriding this functionality.
+    Schedule(fn);
+  }
+
+  // If implemented, stop processing the closures that have been enqueued.
+  // Currently running closures may still be processed.
+  // If not implemented, does nothing.
+  virtual void Cancel() {}
+
+  // Returns the number of threads in the pool.
+  virtual int NumThreads() const = 0;
+
+  // Returns a logical thread index between 0 and NumThreads() - 1 if called
+  // from one of the threads in the pool. Returns -1 otherwise.
+  virtual int CurrentThreadId() const = 0;
+
+  virtual ~ThreadPoolInterface() {}
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_POOL_INTERFACE_H
diff --git a/inst/include/Eigen/src/ThreadPool/ThreadYield.h b/inst/include/Eigen/src/ThreadPool/ThreadYield.h
new file mode 100644
index 00000000..f556ff63
--- /dev/null
+++ b/inst/include/Eigen/src/ThreadPool/ThreadYield.h
@@ -0,0 +1,16 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
+#define EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
+
+// Try to come up with a portable way to yield
+#define EIGEN_THREAD_YIELD() std::this_thread::yield()
+
+#endif  // EIGEN_CXX11_THREADPOOL_THREAD_YIELD_H
diff --git a/inst/include/Eigen/src/UmfPackSupport/InternalHeaderCheck.h b/inst/include/Eigen/src/UmfPackSupport/InternalHeaderCheck.h
new file mode 100644
index 00000000..64112f1a
--- /dev/null
+++ b/inst/include/Eigen/src/UmfPackSupport/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_UMFPACKSUPPORT_MODULE_H
+#error "Please include Eigen/UmfPackSupport instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/UmfPackSupport/UmfPackSupport.h b/inst/include/Eigen/src/UmfPackSupport/UmfPackSupport.h
index 29c60c37..22c701b9 100644
--- a/inst/include/Eigen/src/UmfPackSupport/UmfPackSupport.h
+++ b/inst/include/Eigen/src/UmfPackSupport/UmfPackSupport.h
@@ -10,465 +10,597 @@
 #ifndef EIGEN_UMFPACKSUPPORT_H
 #define EIGEN_UMFPACKSUPPORT_H
 
-namespace Eigen { 
+// for compatibility with super old version of umfpack,
+// not sure this is really needed, but this is harmless.
+#ifndef SuiteSparse_long
+#ifdef UF_long
+#define SuiteSparse_long UF_long
+#else
+#error neither SuiteSparse_long nor UF_long are defined
+#endif
+#endif
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /* TODO extract L, extract U, compute det, etc... */
 
 // generic double/complex<double> wrapper functions:
 
-inline void umfpack_free_numeric(void **Numeric, double)
-{ umfpack_di_free_numeric(Numeric); *Numeric = 0; }
+// Defaults
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, int) { umfpack_di_defaults(control); }
+
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex<double>, int) {
+  umfpack_zi_defaults(control);
+}
+
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], double, SuiteSparse_long) {
+  umfpack_dl_defaults(control);
+}
+
+inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex<double>, SuiteSparse_long) {
+  umfpack_zl_defaults(control);
+}
+
+// Report info
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, int) {
+  umfpack_di_report_info(control, info);
+}
+
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex<double>, int) {
+  umfpack_zi_report_info(control, info);
+}
+
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double, SuiteSparse_long) {
+  umfpack_dl_report_info(control, info);
+}
+
+inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex<double>,
+                                SuiteSparse_long) {
+  umfpack_zl_report_info(control, info);
+}
+
+// Report status
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, int) {
+  umfpack_di_report_status(control, status);
+}
+
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex<double>, int) {
+  umfpack_zi_report_status(control, status);
+}
+
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double, SuiteSparse_long) {
+  umfpack_dl_report_status(control, status);
+}
+
+inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex<double>, SuiteSparse_long) {
+  umfpack_zl_report_status(control, status);
+}
+
+// report control
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, int) { umfpack_di_report_control(control); }
+
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex<double>, int) {
+  umfpack_zi_report_control(control);
+}
+
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], double, SuiteSparse_long) {
+  umfpack_dl_report_control(control);
+}
 
-inline void umfpack_free_numeric(void **Numeric, std::complex<double>)
-{ umfpack_zi_free_numeric(Numeric); *Numeric = 0; }
+inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex<double>, SuiteSparse_long) {
+  umfpack_zl_report_control(control);
+}
 
-inline void umfpack_free_symbolic(void **Symbolic, double)
-{ umfpack_di_free_symbolic(Symbolic); *Symbolic = 0; }
+// Free numeric
+inline void umfpack_free_numeric(void **Numeric, double, int) {
+  umfpack_di_free_numeric(Numeric);
+  *Numeric = 0;
+}
 
-inline void umfpack_free_symbolic(void **Symbolic, std::complex<double>)
-{ umfpack_zi_free_symbolic(Symbolic); *Symbolic = 0; }
+inline void umfpack_free_numeric(void **Numeric, std::complex<double>, int) {
+  umfpack_zi_free_numeric(Numeric);
+  *Numeric = 0;
+}
 
-inline int umfpack_symbolic(int n_row,int n_col,
-                            const int Ap[], const int Ai[], const double Ax[], void **Symbolic,
-                            const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO])
-{
-  return umfpack_di_symbolic(n_row,n_col,Ap,Ai,Ax,Symbolic,Control,Info);
+inline void umfpack_free_numeric(void **Numeric, double, SuiteSparse_long) {
+  umfpack_dl_free_numeric(Numeric);
+  *Numeric = 0;
 }
 
-inline int umfpack_symbolic(int n_row,int n_col,
-                            const int Ap[], const int Ai[], const std::complex<double> Ax[], void **Symbolic,
-                            const double Control [UMFPACK_CONTROL], double Info [UMFPACK_INFO])
-{
-  return umfpack_zi_symbolic(n_row,n_col,Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Control,Info);
+inline void umfpack_free_numeric(void **Numeric, std::complex<double>, SuiteSparse_long) {
+  umfpack_zl_free_numeric(Numeric);
+  *Numeric = 0;
 }
 
-inline int umfpack_numeric( const int Ap[], const int Ai[], const double Ax[],
-                            void *Symbolic, void **Numeric,
-                            const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO])
-{
-  return umfpack_di_numeric(Ap,Ai,Ax,Symbolic,Numeric,Control,Info);
+// Free symbolic
+inline void umfpack_free_symbolic(void **Symbolic, double, int) {
+  umfpack_di_free_symbolic(Symbolic);
+  *Symbolic = 0;
 }
 
-inline int umfpack_numeric( const int Ap[], const int Ai[], const std::complex<double> Ax[],
-                            void *Symbolic, void **Numeric,
-                            const double Control[UMFPACK_CONTROL],double Info [UMFPACK_INFO])
-{
-  return umfpack_zi_numeric(Ap,Ai,&numext::real_ref(Ax[0]),0,Symbolic,Numeric,Control,Info);
+inline void umfpack_free_symbolic(void **Symbolic, std::complex<double>, int) {
+  umfpack_zi_free_symbolic(Symbolic);
+  *Symbolic = 0;
 }
 
-inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const double Ax[],
-                          double X[], const double B[], void *Numeric,
-                          const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO])
-{
-  return umfpack_di_solve(sys,Ap,Ai,Ax,X,B,Numeric,Control,Info);
+inline void umfpack_free_symbolic(void **Symbolic, double, SuiteSparse_long) {
+  umfpack_dl_free_symbolic(Symbolic);
+  *Symbolic = 0;
 }
 
-inline int umfpack_solve( int sys, const int Ap[], const int Ai[], const std::complex<double> Ax[],
-                          std::complex<double> X[], const std::complex<double> B[], void *Numeric,
-                          const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO])
-{
-  return umfpack_zi_solve(sys,Ap,Ai,&numext::real_ref(Ax[0]),0,&numext::real_ref(X[0]),0,&numext::real_ref(B[0]),0,Numeric,Control,Info);
+inline void umfpack_free_symbolic(void **Symbolic, std::complex<double>, SuiteSparse_long) {
+  umfpack_zl_free_symbolic(Symbolic);
+  *Symbolic = 0;
 }
 
-inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_udiag, void *Numeric, double)
-{
-  return umfpack_di_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
+// Symbolic
+inline int umfpack_symbolic(int n_row, int n_col, const int Ap[], const int Ai[], const double Ax[], void **Symbolic,
+                            const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) {
+  return umfpack_di_symbolic(n_row, n_col, Ap, Ai, Ax, Symbolic, Control, Info);
 }
 
-inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_udiag, void *Numeric, std::complex<double>)
-{
-  return umfpack_zi_get_lunz(lnz,unz,n_row,n_col,nz_udiag,Numeric);
+inline int umfpack_symbolic(int n_row, int n_col, const int Ap[], const int Ai[], const std::complex<double> Ax[],
+                            void **Symbolic, const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) {
+  return umfpack_zi_symbolic(n_row, n_col, Ap, Ai, &numext::real_ref(Ax[0]), 0, Symbolic, Control, Info);
+}
+inline SuiteSparse_long umfpack_symbolic(SuiteSparse_long n_row, SuiteSparse_long n_col, const SuiteSparse_long Ap[],
+                                         const SuiteSparse_long Ai[], const double Ax[], void **Symbolic,
+                                         const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) {
+  return umfpack_dl_symbolic(n_row, n_col, Ap, Ai, Ax, Symbolic, Control, Info);
+}
+
+inline SuiteSparse_long umfpack_symbolic(SuiteSparse_long n_row, SuiteSparse_long n_col, const SuiteSparse_long Ap[],
+                                         const SuiteSparse_long Ai[], const std::complex<double> Ax[], void **Symbolic,
+                                         const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) {
+  return umfpack_zl_symbolic(n_row, n_col, Ap, Ai, &numext::real_ref(Ax[0]), 0, Symbolic, Control, Info);
+}
+
+// Numeric
+inline int umfpack_numeric(const int Ap[], const int Ai[], const double Ax[], void *Symbolic, void **Numeric,
+                           const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) {
+  return umfpack_di_numeric(Ap, Ai, Ax, Symbolic, Numeric, Control, Info);
+}
+
+inline int umfpack_numeric(const int Ap[], const int Ai[], const std::complex<double> Ax[], void *Symbolic,
+                           void **Numeric, const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) {
+  return umfpack_zi_numeric(Ap, Ai, &numext::real_ref(Ax[0]), 0, Symbolic, Numeric, Control, Info);
+}
+inline SuiteSparse_long umfpack_numeric(const SuiteSparse_long Ap[], const SuiteSparse_long Ai[], const double Ax[],
+                                        void *Symbolic, void **Numeric, const double Control[UMFPACK_CONTROL],
+                                        double Info[UMFPACK_INFO]) {
+  return umfpack_dl_numeric(Ap, Ai, Ax, Symbolic, Numeric, Control, Info);
+}
+
+inline SuiteSparse_long umfpack_numeric(const SuiteSparse_long Ap[], const SuiteSparse_long Ai[],
+                                        const std::complex<double> Ax[], void *Symbolic, void **Numeric,
+                                        const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) {
+  return umfpack_zl_numeric(Ap, Ai, &numext::real_ref(Ax[0]), 0, Symbolic, Numeric, Control, Info);
+}
+
+// solve
+inline int umfpack_solve(int sys, const int Ap[], const int Ai[], const double Ax[], double X[], const double B[],
+                         void *Numeric, const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) {
+  return umfpack_di_solve(sys, Ap, Ai, Ax, X, B, Numeric, Control, Info);
+}
+
+inline int umfpack_solve(int sys, const int Ap[], const int Ai[], const std::complex<double> Ax[],
+                         std::complex<double> X[], const std::complex<double> B[], void *Numeric,
+                         const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) {
+  return umfpack_zi_solve(sys, Ap, Ai, &numext::real_ref(Ax[0]), 0, &numext::real_ref(X[0]), 0, &numext::real_ref(B[0]),
+                          0, Numeric, Control, Info);
+}
+
+inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[],
+                                      const double Ax[], double X[], const double B[], void *Numeric,
+                                      const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) {
+  return umfpack_dl_solve(sys, Ap, Ai, Ax, X, B, Numeric, Control, Info);
+}
+
+inline SuiteSparse_long umfpack_solve(int sys, const SuiteSparse_long Ap[], const SuiteSparse_long Ai[],
+                                      const std::complex<double> Ax[], std::complex<double> X[],
+                                      const std::complex<double> B[], void *Numeric,
+                                      const double Control[UMFPACK_CONTROL], double Info[UMFPACK_INFO]) {
+  return umfpack_zl_solve(sys, Ap, Ai, &numext::real_ref(Ax[0]), 0, &numext::real_ref(X[0]), 0, &numext::real_ref(B[0]),
+                          0, Numeric, Control, Info);
+}
+
+// Get Lunz
+inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_udiag, void *Numeric, double) {
+  return umfpack_di_get_lunz(lnz, unz, n_row, n_col, nz_udiag, Numeric);
+}
+
+inline int umfpack_get_lunz(int *lnz, int *unz, int *n_row, int *n_col, int *nz_udiag, void *Numeric,
+                            std::complex<double>) {
+  return umfpack_zi_get_lunz(lnz, unz, n_row, n_col, nz_udiag, Numeric);
+}
+
+inline SuiteSparse_long umfpack_get_lunz(SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row,
+                                         SuiteSparse_long *n_col, SuiteSparse_long *nz_udiag, void *Numeric, double) {
+  return umfpack_dl_get_lunz(lnz, unz, n_row, n_col, nz_udiag, Numeric);
+}
+
+inline SuiteSparse_long umfpack_get_lunz(SuiteSparse_long *lnz, SuiteSparse_long *unz, SuiteSparse_long *n_row,
+                                         SuiteSparse_long *n_col, SuiteSparse_long *nz_udiag, void *Numeric,
+                                         std::complex<double>) {
+  return umfpack_zl_get_lunz(lnz, unz, n_row, n_col, nz_udiag, Numeric);
+}
+
+// Get Numeric
+inline int umfpack_get_numeric(int Lp[], int Lj[], double Lx[], int Up[], int Ui[], double Ux[], int P[], int Q[],
+                               double Dx[], int *do_recip, double Rs[], void *Numeric) {
+  return umfpack_di_get_numeric(Lp, Lj, Lx, Up, Ui, Ux, P, Q, Dx, do_recip, Rs, Numeric);
+}
+
+inline int umfpack_get_numeric(int Lp[], int Lj[], std::complex<double> Lx[], int Up[], int Ui[],
+                               std::complex<double> Ux[], int P[], int Q[], std::complex<double> Dx[], int *do_recip,
+                               double Rs[], void *Numeric) {
+  double &lx0_real = numext::real_ref(Lx[0]);
+  double &ux0_real = numext::real_ref(Ux[0]);
+  double &dx0_real = numext::real_ref(Dx[0]);
+  return umfpack_zi_get_numeric(Lp, Lj, Lx ? &lx0_real : 0, 0, Up, Ui, Ux ? &ux0_real : 0, 0, P, Q, Dx ? &dx0_real : 0,
+                                0, do_recip, Rs, Numeric);
+}
+inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], double Lx[],
+                                            SuiteSparse_long Up[], SuiteSparse_long Ui[], double Ux[],
+                                            SuiteSparse_long P[], SuiteSparse_long Q[], double Dx[],
+                                            SuiteSparse_long *do_recip, double Rs[], void *Numeric) {
+  return umfpack_dl_get_numeric(Lp, Lj, Lx, Up, Ui, Ux, P, Q, Dx, do_recip, Rs, Numeric);
 }
 
-inline int umfpack_get_numeric(int Lp[], int Lj[], double Lx[], int Up[], int Ui[], double Ux[],
-                               int P[], int Q[], double Dx[], int *do_recip, double Rs[], void *Numeric)
-{
-  return umfpack_di_get_numeric(Lp,Lj,Lx,Up,Ui,Ux,P,Q,Dx,do_recip,Rs,Numeric);
+inline SuiteSparse_long umfpack_get_numeric(SuiteSparse_long Lp[], SuiteSparse_long Lj[], std::complex<double> Lx[],
+                                            SuiteSparse_long Up[], SuiteSparse_long Ui[], std::complex<double> Ux[],
+                                            SuiteSparse_long P[], SuiteSparse_long Q[], std::complex<double> Dx[],
+                                            SuiteSparse_long *do_recip, double Rs[], void *Numeric) {
+  double &lx0_real = numext::real_ref(Lx[0]);
+  double &ux0_real = numext::real_ref(Ux[0]);
+  double &dx0_real = numext::real_ref(Dx[0]);
+  return umfpack_zl_get_numeric(Lp, Lj, Lx ? &lx0_real : 0, 0, Up, Ui, Ux ? &ux0_real : 0, 0, P, Q, Dx ? &dx0_real : 0,
+                                0, do_recip, Rs, Numeric);
 }
 
-inline int umfpack_get_numeric(int Lp[], int Lj[], std::complex<double> Lx[], int Up[], int Ui[], std::complex<double> Ux[],
-                               int P[], int Q[], std::complex<double> Dx[], int *do_recip, double Rs[], void *Numeric)
-{
-  double& lx0_real = numext::real_ref(Lx[0]);
-  double& ux0_real = numext::real_ref(Ux[0]);
-  double& dx0_real = numext::real_ref(Dx[0]);
-  return umfpack_zi_get_numeric(Lp,Lj,Lx?&lx0_real:0,0,Up,Ui,Ux?&ux0_real:0,0,P,Q,
-                                Dx?&dx0_real:0,0,do_recip,Rs,Numeric);
+// Get Determinant
+inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info[UMFPACK_INFO], int) {
+  return umfpack_di_get_determinant(Mx, Ex, NumericHandle, User_Info);
 }
 
-inline int umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO])
-{
-  return umfpack_di_get_determinant(Mx,Ex,NumericHandle,User_Info);
+inline int umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *NumericHandle,
+                                   double User_Info[UMFPACK_INFO], int) {
+  double &mx_real = numext::real_ref(*Mx);
+  return umfpack_zi_get_determinant(&mx_real, 0, Ex, NumericHandle, User_Info);
 }
 
-inline int umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *NumericHandle, double User_Info [UMFPACK_INFO])
-{
-  double& mx_real = numext::real_ref(*Mx);
-  return umfpack_zi_get_determinant(&mx_real,0,Ex,NumericHandle,User_Info);
+inline SuiteSparse_long umfpack_get_determinant(double *Mx, double *Ex, void *NumericHandle,
+                                                double User_Info[UMFPACK_INFO], SuiteSparse_long) {
+  return umfpack_dl_get_determinant(Mx, Ex, NumericHandle, User_Info);
 }
 
-namespace internal {
-  template<typename T> struct umfpack_helper_is_sparse_plain : false_type {};
-  template<typename Scalar, int Options, typename StorageIndex>
-  struct umfpack_helper_is_sparse_plain<SparseMatrix<Scalar,Options,StorageIndex> >
-    : true_type {};
-  template<typename Scalar, int Options, typename StorageIndex>
-  struct umfpack_helper_is_sparse_plain<MappedSparseMatrix<Scalar,Options,StorageIndex> >
-    : true_type {};
+inline SuiteSparse_long umfpack_get_determinant(std::complex<double> *Mx, double *Ex, void *NumericHandle,
+                                                double User_Info[UMFPACK_INFO], SuiteSparse_long) {
+  double &mx_real = numext::real_ref(*Mx);
+  return umfpack_zl_get_determinant(&mx_real, 0, Ex, NumericHandle, User_Info);
 }
 
 /** \ingroup UmfPackSupport_Module
-  * \brief A sparse LU factorization and solver based on UmfPack
-  *
-  * This class allows to solve for A.X = B sparse linear problems via a LU factorization
-  * using the UmfPack library. The sparse matrix A must be squared and full rank.
-  * The vectors or matrices X and B can be either dense or sparse.
-  *
-  * \warning The input matrix A should be in a \b compressed and \b column-major form.
-  * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix.
-  * \tparam _MatrixType the type of the sparse matrix A, it must be a SparseMatrix<>
-  *
-  * \sa \ref TutorialSparseDirectSolvers
-  */
-template<typename _MatrixType>
-class UmfPackLU : internal::noncopyable
-{
-  public:
-    typedef _MatrixType MatrixType;
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-    typedef Matrix<Scalar,Dynamic,1> Vector;
-    typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
-    typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
-    typedef SparseMatrix<Scalar> LUMatrixType;
-    typedef SparseMatrix<Scalar,ColMajor,int> UmfpackMatrixType;
-
-  public:
-
-    UmfPackLU() { init(); }
-
-    UmfPackLU(const MatrixType& matrix)
-    {
-      init();
-      compute(matrix);
-    }
+ * \brief A sparse LU factorization and solver based on UmfPack
+ *
+ * This class allows to solve for A.X = B sparse linear problems via a LU factorization
+ * using the UmfPack library. The sparse matrix A must be squared and full rank.
+ * The vectors or matrices X and B can be either dense or sparse.
+ *
+ * \warning The input matrix A should be in a \b compressed and \b column-major form.
+ * Otherwise an expensive copy will be made. You can call the inexpensive makeCompressed() to get a compressed matrix.
+ * \tparam MatrixType_ the type of the sparse matrix A, it must be a SparseMatrix<>
+ *
+ * \implsparsesolverconcept
+ *
+ * \sa \ref TutorialSparseSolverConcept, class SparseLU
+ */
+template <typename MatrixType_>
+class UmfPackLU : public SparseSolverBase<UmfPackLU<MatrixType_> > {
+ protected:
+  typedef SparseSolverBase<UmfPackLU<MatrixType_> > Base;
+  using Base::m_isInitialized;
+
+ public:
+  using Base::_solve_impl;
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef Matrix<Scalar, Dynamic, 1> Vector;
+  typedef Matrix<int, 1, MatrixType::ColsAtCompileTime> IntRowVectorType;
+  typedef Matrix<int, MatrixType::RowsAtCompileTime, 1> IntColVectorType;
+  typedef SparseMatrix<Scalar> LUMatrixType;
+  typedef SparseMatrix<Scalar, ColMajor, StorageIndex> UmfpackMatrixType;
+  typedef Ref<const UmfpackMatrixType, StandardCompressedFormat> UmfpackMatrixRef;
+  enum { ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime };
+
+ public:
+  typedef Array<double, UMFPACK_CONTROL, 1> UmfpackControl;
+  typedef Array<double, UMFPACK_INFO, 1> UmfpackInfo;
+
+  UmfPackLU() : m_dummy(0, 0), mp_matrix(m_dummy) { init(); }
+
+  template <typename InputMatrixType>
+  explicit UmfPackLU(const InputMatrixType &matrix) : mp_matrix(matrix) {
+    init();
+    compute(matrix);
+  }
 
-    ~UmfPackLU()
-    {
-      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
-      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar());
-    }
+  ~UmfPackLU() {
+    if (m_symbolic) umfpack_free_symbolic(&m_symbolic, Scalar(), StorageIndex());
+    if (m_numeric) umfpack_free_numeric(&m_numeric, Scalar(), StorageIndex());
+  }
 
-    inline Index rows() const { return m_copyMatrix.rows(); }
-    inline Index cols() const { return m_copyMatrix.cols(); }
-
-    /** \brief Reports whether previous computation was successful.
-      *
-      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the matrix.appears to be negative.
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "Decomposition is not initialized.");
-      return m_info;
-    }
+  inline Index rows() const { return mp_matrix.rows(); }
+  inline Index cols() const { return mp_matrix.cols(); }
+
+  /** \brief Reports whether previous computation was successful.
+   *
+   * \returns \c Success if computation was successful,
+   *          \c NumericalIssue if the matrix.appears to be negative.
+   */
+  ComputationInfo info() const {
+    eigen_assert(m_isInitialized && "Decomposition is not initialized.");
+    return m_info;
+  }
 
-    inline const LUMatrixType& matrixL() const
-    {
-      if (m_extractedDataAreDirty) extractData();
-      return m_l;
-    }
+  inline const LUMatrixType &matrixL() const {
+    if (m_extractedDataAreDirty) extractData();
+    return m_l;
+  }
 
-    inline const LUMatrixType& matrixU() const
-    {
-      if (m_extractedDataAreDirty) extractData();
-      return m_u;
-    }
+  inline const LUMatrixType &matrixU() const {
+    if (m_extractedDataAreDirty) extractData();
+    return m_u;
+  }
 
-    inline const IntColVectorType& permutationP() const
-    {
-      if (m_extractedDataAreDirty) extractData();
-      return m_p;
-    }
+  inline const IntColVectorType &permutationP() const {
+    if (m_extractedDataAreDirty) extractData();
+    return m_p;
+  }
 
-    inline const IntRowVectorType& permutationQ() const
-    {
-      if (m_extractedDataAreDirty) extractData();
-      return m_q;
-    }
+  inline const IntRowVectorType &permutationQ() const {
+    if (m_extractedDataAreDirty) extractData();
+    return m_q;
+  }
 
-    /** Computes the sparse Cholesky decomposition of \a matrix 
-     *  Note that the matrix should be column-major, and in compressed format for best performance.
-     *  \sa SparseMatrix::makeCompressed().
-     */
-    template<typename InputMatrixType>
-    void compute(const InputMatrixType& matrix)
-    {
-      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
-      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar());
-      grapInput(matrix.derived());
-      analyzePattern_impl();
-      factorize_impl();
-    }
+  /** Computes the sparse Cholesky decomposition of \a matrix
+   *  Note that the matrix should be column-major, and in compressed format for best performance.
+   *  \sa SparseMatrix::makeCompressed().
+   */
+  template <typename InputMatrixType>
+  void compute(const InputMatrixType &matrix) {
+    if (m_symbolic) umfpack_free_symbolic(&m_symbolic, Scalar(), StorageIndex());
+    if (m_numeric) umfpack_free_numeric(&m_numeric, Scalar(), StorageIndex());
+    grab(matrix.derived());
+    analyzePattern_impl();
+    factorize_impl();
+  }
 
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<UmfPackLU, Rhs> solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "UmfPackLU is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "UmfPackLU::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<UmfPackLU, Rhs>(*this, b.derived());
-    }
+  /** Performs a symbolic decomposition on the sparsity of \a matrix.
+   *
+   * This function is particularly useful when solving for several problems having the same structure.
+   *
+   * \sa factorize(), compute()
+   */
+  template <typename InputMatrixType>
+  void analyzePattern(const InputMatrixType &matrix) {
+    if (m_symbolic) umfpack_free_symbolic(&m_symbolic, Scalar(), StorageIndex());
+    if (m_numeric) umfpack_free_numeric(&m_numeric, Scalar(), StorageIndex());
 
-    /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A.
-      *
-      * \sa compute()
-      */
-    template<typename Rhs>
-    inline const internal::sparse_solve_retval<UmfPackLU, Rhs> solve(const SparseMatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "UmfPackLU is not initialized.");
-      eigen_assert(rows()==b.rows()
-                && "UmfPackLU::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::sparse_solve_retval<UmfPackLU, Rhs>(*this, b.derived());
-    }
+    grab(matrix.derived());
 
-    /** Performs a symbolic decomposition on the sparcity of \a matrix.
-      *
-      * This function is particularly useful when solving for several problems having the same structure.
-      *
-      * \sa factorize(), compute()
-      */
-    template<typename InputMatrixType>
-    void analyzePattern(const InputMatrixType& matrix)
-    {
-      if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar());
-      if(m_numeric)  umfpack_free_numeric(&m_numeric,Scalar());
-      
-      grapInput(matrix.derived());
-
-      analyzePattern_impl();
-    }
+    analyzePattern_impl();
+  }
 
-    /** Performs a numeric decomposition of \a matrix
-      *
-      * The given matrix must has the same sparcity than the matrix on which the pattern anylysis has been performed.
-      *
-      * \sa analyzePattern(), compute()
-      */
-    template<typename InputMatrixType>
-    void factorize(const InputMatrixType& matrix)
-    {
-      eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
-      if(m_numeric)
-        umfpack_free_numeric(&m_numeric,Scalar());
-
-      grapInput(matrix.derived());
-      
-      factorize_impl();
-    }
+  /** Provides the return status code returned by UmfPack during the numeric
+   * factorization.
+   *
+   * \sa factorize(), compute()
+   */
+  inline int umfpackFactorizeReturncode() const {
+    eigen_assert(m_numeric && "UmfPackLU: you must first call factorize()");
+    return m_fact_errorCode;
+  }
 
-    #ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** \internal */
-    template<typename BDerived,typename XDerived>
-    bool _solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const;
-    #endif
+  /** Provides access to the control settings array used by UmfPack.
+   *
+   * If this array contains NaN's, the default values are used.
+   *
+   * See UMFPACK documentation for details.
+   */
+  inline const UmfpackControl &umfpackControl() const { return m_control; }
+
+  /** Provides access to the control settings array used by UmfPack.
+   *
+   * If this array contains NaN's, the default values are used.
+   *
+   * See UMFPACK documentation for details.
+   */
+  inline UmfpackControl &umfpackControl() { return m_control; }
+
+  /** Performs a numeric decomposition of \a matrix
+   *
+   * The given matrix must have the same sparsity than the matrix on which the pattern anylysis has been performed.
+   *
+   * \sa analyzePattern(), compute()
+   */
+  template <typename InputMatrixType>
+  void factorize(const InputMatrixType &matrix) {
+    eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
+    if (m_numeric) umfpack_free_numeric(&m_numeric, Scalar(), StorageIndex());
+
+    grab(matrix.derived());
+
+    factorize_impl();
+  }
 
-    Scalar determinant() const;
+  /** Prints the current UmfPack control settings.
+   *
+   * \sa umfpackControl()
+   */
+  void printUmfpackControl() { umfpack_report_control(m_control.data(), Scalar(), StorageIndex()); }
+
+  /** Prints statistics collected by UmfPack.
+   *
+   * \sa analyzePattern(), compute()
+   */
+  void printUmfpackInfo() {
+    eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
+    umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar(), StorageIndex());
+  }
 
-    void extractData() const;
+  /** Prints the status of the previous factorization operation performed by UmfPack (symbolic or numerical
+   * factorization).
+   *
+   * \sa analyzePattern(), compute()
+   */
+  void printUmfpackStatus() {
+    eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()");
+    umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar(), StorageIndex());
+  }
 
-  protected:
+  /** \internal */
+  template <typename BDerived, typename XDerived>
+  bool _solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const;
 
-    void init()
-    {
-      m_info                  = InvalidInput;
-      m_isInitialized         = false;
-      m_numeric               = 0;
-      m_symbolic              = 0;
-      m_outerIndexPtr         = 0;
-      m_innerIndexPtr         = 0;
-      m_valuePtr              = 0;
-      m_extractedDataAreDirty = true;
-    }
-    
-    template<typename InputMatrixType>
-    void grapInput_impl(const InputMatrixType& mat, internal::true_type)
-    {
-      m_copyMatrix.resize(mat.rows(), mat.cols());
-      if( ((MatrixType::Flags&RowMajorBit)==RowMajorBit) || sizeof(typename MatrixType::Index)!=sizeof(int) || !mat.isCompressed() )
-      {
-        // non supported input -> copy
-        m_copyMatrix = mat;
-        m_outerIndexPtr = m_copyMatrix.outerIndexPtr();
-        m_innerIndexPtr = m_copyMatrix.innerIndexPtr();
-        m_valuePtr      = m_copyMatrix.valuePtr();
-      }
-      else
-      {
-        m_outerIndexPtr = mat.outerIndexPtr();
-        m_innerIndexPtr = mat.innerIndexPtr();
-        m_valuePtr      = mat.valuePtr();
-      }
-    }
-    
-    template<typename InputMatrixType>
-    void grapInput_impl(const InputMatrixType& mat, internal::false_type)
-    {
-      m_copyMatrix = mat;
-      m_outerIndexPtr = m_copyMatrix.outerIndexPtr();
-      m_innerIndexPtr = m_copyMatrix.innerIndexPtr();
-      m_valuePtr      = m_copyMatrix.valuePtr();
-    }
-    
-    template<typename InputMatrixType>
-    void grapInput(const InputMatrixType& mat)
-    {
-      grapInput_impl(mat, internal::umfpack_helper_is_sparse_plain<InputMatrixType>());
-    }
-    
-    void analyzePattern_impl()
-    {
-      int errorCode = 0;
-      errorCode = umfpack_symbolic(m_copyMatrix.rows(), m_copyMatrix.cols(), m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
-                                   &m_symbolic, 0, 0);
-
-      m_isInitialized = true;
-      m_info = errorCode ? InvalidInput : Success;
-      m_analysisIsOk = true;
-      m_factorizationIsOk = false;
-      m_extractedDataAreDirty = true;
-    }
-    
-    void factorize_impl()
-    {
-      int errorCode;
-      errorCode = umfpack_numeric(m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
-                                  m_symbolic, &m_numeric, 0, 0);
-
-      m_info = errorCode ? NumericalIssue : Success;
-      m_factorizationIsOk = true;
-      m_extractedDataAreDirty = true;
+  Scalar determinant() const;
+
+  void extractData() const;
+
+ protected:
+  void init() {
+    m_info = InvalidInput;
+    m_isInitialized = false;
+    m_numeric = 0;
+    m_symbolic = 0;
+    m_extractedDataAreDirty = true;
+
+    umfpack_defaults(m_control.data(), Scalar(), StorageIndex());
+  }
+
+  void analyzePattern_impl() {
+    m_fact_errorCode = umfpack_symbolic(internal::convert_index<StorageIndex>(mp_matrix.rows()),
+                                        internal::convert_index<StorageIndex>(mp_matrix.cols()),
+                                        mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
+                                        &m_symbolic, m_control.data(), m_umfpackInfo.data());
+
+    m_isInitialized = true;
+    m_info = m_fact_errorCode ? InvalidInput : Success;
+    m_analysisIsOk = true;
+    m_factorizationIsOk = false;
+    m_extractedDataAreDirty = true;
+  }
+
+  void factorize_impl() {
+    m_fact_errorCode = umfpack_numeric(mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(),
+                                       m_symbolic, &m_numeric, m_control.data(), m_umfpackInfo.data());
+
+    m_info = m_fact_errorCode == UMFPACK_OK ? Success : NumericalIssue;
+    m_factorizationIsOk = true;
+    m_extractedDataAreDirty = true;
+  }
+
+  template <typename MatrixDerived>
+  void grab(const EigenBase<MatrixDerived> &A) {
+    internal::destroy_at(&mp_matrix);
+    internal::construct_at(&mp_matrix, A.derived());
+  }
+
+  void grab(const UmfpackMatrixRef &A) {
+    if (&(A.derived()) != &mp_matrix) {
+      internal::destroy_at(&mp_matrix);
+      internal::construct_at(&mp_matrix, A);
     }
+  }
 
-    // cached data to reduce reallocation, etc.
-    mutable LUMatrixType m_l;
-    mutable LUMatrixType m_u;
-    mutable IntColVectorType m_p;
-    mutable IntRowVectorType m_q;
-
-    UmfpackMatrixType m_copyMatrix;
-    const Scalar* m_valuePtr;
-    const int* m_outerIndexPtr;
-    const int* m_innerIndexPtr;
-    void* m_numeric;
-    void* m_symbolic;
-
-    mutable ComputationInfo m_info;
-    bool m_isInitialized;
-    int m_factorizationIsOk;
-    int m_analysisIsOk;
-    mutable bool m_extractedDataAreDirty;
-    
-  private:
-    UmfPackLU(UmfPackLU& ) { }
-};
+  // cached data to reduce reallocation, etc.
+  mutable LUMatrixType m_l;
+  StorageIndex m_fact_errorCode;
+  UmfpackControl m_control;
+  mutable UmfpackInfo m_umfpackInfo;
+
+  mutable LUMatrixType m_u;
+  mutable IntColVectorType m_p;
+  mutable IntRowVectorType m_q;
+
+  UmfpackMatrixType m_dummy;
+  UmfpackMatrixRef mp_matrix;
+
+  void *m_numeric;
+  void *m_symbolic;
 
+  mutable ComputationInfo m_info;
+  int m_factorizationIsOk;
+  int m_analysisIsOk;
+  mutable bool m_extractedDataAreDirty;
 
-template<typename MatrixType>
-void UmfPackLU<MatrixType>::extractData() const
-{
-  if (m_extractedDataAreDirty)
-  {
+ private:
+  UmfPackLU(const UmfPackLU &) {}
+};
+
+template <typename MatrixType>
+void UmfPackLU<MatrixType>::extractData() const {
+  if (m_extractedDataAreDirty) {
     // get size of the data
-    int lnz, unz, rows, cols, nz_udiag;
+    StorageIndex lnz, unz, rows, cols, nz_udiag;
     umfpack_get_lunz(&lnz, &unz, &rows, &cols, &nz_udiag, m_numeric, Scalar());
 
     // allocate data
-    m_l.resize(rows,(std::min)(rows,cols));
+    m_l.resize(rows, (std::min)(rows, cols));
     m_l.resizeNonZeros(lnz);
 
-    m_u.resize((std::min)(rows,cols),cols);
+    m_u.resize((std::min)(rows, cols), cols);
     m_u.resizeNonZeros(unz);
 
     m_p.resize(rows);
     m_q.resize(cols);
 
     // extract
-    umfpack_get_numeric(m_l.outerIndexPtr(), m_l.innerIndexPtr(), m_l.valuePtr(),
-                        m_u.outerIndexPtr(), m_u.innerIndexPtr(), m_u.valuePtr(),
-                        m_p.data(), m_q.data(), 0, 0, 0, m_numeric);
+    umfpack_get_numeric(m_l.outerIndexPtr(), m_l.innerIndexPtr(), m_l.valuePtr(), m_u.outerIndexPtr(),
+                        m_u.innerIndexPtr(), m_u.valuePtr(), m_p.data(), m_q.data(), 0, 0, 0, m_numeric);
 
     m_extractedDataAreDirty = false;
   }
 }
 
-template<typename MatrixType>
-typename UmfPackLU<MatrixType>::Scalar UmfPackLU<MatrixType>::determinant() const
-{
+template <typename MatrixType>
+typename UmfPackLU<MatrixType>::Scalar UmfPackLU<MatrixType>::determinant() const {
   Scalar det;
-  umfpack_get_determinant(&det, 0, m_numeric, 0);
+  umfpack_get_determinant(&det, 0, m_numeric, 0, StorageIndex());
   return det;
 }
 
-template<typename MatrixType>
-template<typename BDerived,typename XDerived>
-bool UmfPackLU<MatrixType>::_solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const
-{
-  const int rhsCols = b.cols();
-  eigen_assert((BDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major rhs yet");
-  eigen_assert((XDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major result yet");
+template <typename MatrixType>
+template <typename BDerived, typename XDerived>
+bool UmfPackLU<MatrixType>::_solve_impl(const MatrixBase<BDerived> &b, MatrixBase<XDerived> &x) const {
+  Index rhsCols = b.cols();
+  eigen_assert((BDerived::Flags & RowMajorBit) == 0 && "UmfPackLU backend does not support non col-major rhs yet");
+  eigen_assert((XDerived::Flags & RowMajorBit) == 0 && "UmfPackLU backend does not support non col-major result yet");
   eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve");
-  
-  int errorCode;
-  for (int j=0; j<rhsCols; ++j)
-  {
-    errorCode = umfpack_solve(UMFPACK_A,
-        m_outerIndexPtr, m_innerIndexPtr, m_valuePtr,
-        &x.col(j).coeffRef(0), &b.const_cast_derived().col(j).coeffRef(0), m_numeric, 0, 0);
-    if (errorCode!=0)
-      return false;
-  }
-
-  return true;
-}
-
-
-namespace internal {
 
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<UmfPackLU<_MatrixType>, Rhs>
-  : solve_retval_base<UmfPackLU<_MatrixType>, Rhs>
-{
-  typedef UmfPackLU<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
+  Scalar *x_ptr = 0;
+  Matrix<Scalar, Dynamic, 1> x_tmp;
+  if (x.innerStride() != 1) {
+    x_tmp.resize(x.rows());
+    x_ptr = x_tmp.data();
   }
-};
-
-template<typename _MatrixType, typename Rhs>
-struct sparse_solve_retval<UmfPackLU<_MatrixType>, Rhs>
-  : sparse_solve_retval_base<UmfPackLU<_MatrixType>, Rhs>
-{
-  typedef UmfPackLU<_MatrixType> Dec;
-  EIGEN_MAKE_SPARSE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    this->defaultEvalTo(dst);
+  for (int j = 0; j < rhsCols; ++j) {
+    if (x.innerStride() == 1) x_ptr = &x.col(j).coeffRef(0);
+    StorageIndex errorCode =
+        umfpack_solve(UMFPACK_A, mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), x_ptr,
+                      &b.const_cast_derived().col(j).coeffRef(0), m_numeric, m_control.data(), m_umfpackInfo.data());
+    if (x.innerStride() != 1) x.col(j) = x_tmp;
+    if (errorCode != 0) return false;
   }
-};
 
-} // end namespace internal
+  return true;
+}
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_UMFPACKSUPPORT_H
+#endif  // EIGEN_UMFPACKSUPPORT_H
diff --git a/inst/include/Eigen/src/misc/Image.h b/inst/include/Eigen/src/misc/Image.h
index 75c5f433..38d516e2 100644
--- a/inst/include/Eigen/src/misc/Image.h
+++ b/inst/include/Eigen/src/misc/Image.h
@@ -10,41 +10,39 @@
 #ifndef EIGEN_MISC_IMAGE_H
 #define EIGEN_MISC_IMAGE_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 /** \class image_retval_base
-  *
-  */
-template<typename DecompositionType>
-struct traits<image_retval_base<DecompositionType> >
-{
+ *
+ */
+template <typename DecompositionType>
+struct traits<image_retval_base<DecompositionType> > {
   typedef typename DecompositionType::MatrixType MatrixType;
-  typedef Matrix<
-    typename MatrixType::Scalar,
-    MatrixType::RowsAtCompileTime, // the image is a subspace of the destination space, whose
-                                   // dimension is the number of rows of the original matrix
-    Dynamic,                       // we don't know at compile time the dimension of the image (the rank)
-    MatrixType::Options,
-    MatrixType::MaxRowsAtCompileTime, // the image matrix will consist of columns from the original matrix,
-    MatrixType::MaxColsAtCompileTime  // so it has the same number of rows and at most as many columns.
-  > ReturnType;
+  typedef Matrix<typename MatrixType::Scalar,
+                 MatrixType::RowsAtCompileTime,  // the image is a subspace of the destination space, whose
+                                                 // dimension is the number of rows of the original matrix
+                 Dynamic,                        // we don't know at compile time the dimension of the image (the rank)
+                 traits<MatrixType>::Options,
+                 MatrixType::MaxRowsAtCompileTime,  // the image matrix will consist of columns from the original
+                                                    // matrix,
+                 MatrixType::MaxColsAtCompileTime   // so it has the same number of rows and at most as many columns.
+                 >
+      ReturnType;
 };
 
-template<typename _DecompositionType> struct image_retval_base
- : public ReturnByValue<image_retval_base<_DecompositionType> >
-{
-  typedef _DecompositionType DecompositionType;
+template <typename DecompositionType_>
+struct image_retval_base : public ReturnByValue<image_retval_base<DecompositionType_> > {
+  typedef DecompositionType_ DecompositionType;
   typedef typename DecompositionType::MatrixType MatrixType;
   typedef ReturnByValue<image_retval_base> Base;
-  typedef typename Base::Index Index;
 
   image_retval_base(const DecompositionType& dec, const MatrixType& originalMatrix)
-    : m_dec(dec), m_rank(dec.rank()),
-      m_cols(m_rank == 0 ? 1 : m_rank),
-      m_originalMatrix(originalMatrix)
-  {}
+      : m_dec(dec), m_rank(dec.rank()), m_cols(m_rank == 0 ? 1 : m_rank), m_originalMatrix(originalMatrix) {}
 
   inline Index rows() const { return m_dec.rows(); }
   inline Index cols() const { return m_cols; }
@@ -52,33 +50,31 @@ template<typename _DecompositionType> struct image_retval_base
   inline const DecompositionType& dec() const { return m_dec; }
   inline const MatrixType& originalMatrix() const { return m_originalMatrix; }
 
-  template<typename Dest> inline void evalTo(Dest& dst) const
-  {
+  template <typename Dest>
+  inline void evalTo(Dest& dst) const {
     static_cast<const image_retval<DecompositionType>*>(this)->evalTo(dst);
   }
 
-  protected:
-    const DecompositionType& m_dec;
-    Index m_rank, m_cols;
-    const MatrixType& m_originalMatrix;
+ protected:
+  const DecompositionType& m_dec;
+  Index m_rank, m_cols;
+  const MatrixType& m_originalMatrix;
 };
 
-} // end namespace internal
+}  // end namespace internal
 
-#define EIGEN_MAKE_IMAGE_HELPERS(DecompositionType) \
-  typedef typename DecompositionType::MatrixType MatrixType; \
-  typedef typename MatrixType::Scalar Scalar; \
-  typedef typename MatrixType::RealScalar RealScalar; \
-  typedef typename MatrixType::Index Index; \
+#define EIGEN_MAKE_IMAGE_HELPERS(DecompositionType)                   \
+  typedef typename DecompositionType::MatrixType MatrixType;          \
+  typedef typename MatrixType::Scalar Scalar;                         \
+  typedef typename MatrixType::RealScalar RealScalar;                 \
   typedef Eigen::internal::image_retval_base<DecompositionType> Base; \
-  using Base::dec; \
-  using Base::originalMatrix; \
-  using Base::rank; \
-  using Base::rows; \
-  using Base::cols; \
-  image_retval(const DecompositionType& dec, const MatrixType& originalMatrix) \
-    : Base(dec, originalMatrix) {}
+  using Base::dec;                                                    \
+  using Base::originalMatrix;                                         \
+  using Base::rank;                                                   \
+  using Base::rows;                                                   \
+  using Base::cols;                                                   \
+  image_retval(const DecompositionType& dec, const MatrixType& originalMatrix) : Base(dec, originalMatrix) {}
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_MISC_IMAGE_H
+#endif  // EIGEN_MISC_IMAGE_H
diff --git a/inst/include/Eigen/src/misc/InternalHeaderCheck.h b/inst/include/Eigen/src/misc/InternalHeaderCheck.h
new file mode 100644
index 00000000..1cea572d
--- /dev/null
+++ b/inst/include/Eigen/src/misc/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_CORE_MODULE_H
+#error "Please include Eigen/Core instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/misc/Kernel.h b/inst/include/Eigen/src/misc/Kernel.h
index b9e1518f..3ed458bc 100644
--- a/inst/include/Eigen/src/misc/Kernel.h
+++ b/inst/include/Eigen/src/misc/Kernel.h
@@ -10,72 +10,68 @@
 #ifndef EIGEN_MISC_KERNEL_H
 #define EIGEN_MISC_KERNEL_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 /** \class kernel_retval_base
-  *
-  */
-template<typename DecompositionType>
-struct traits<kernel_retval_base<DecompositionType> >
-{
+ *
+ */
+template <typename DecompositionType>
+struct traits<kernel_retval_base<DecompositionType> > {
   typedef typename DecompositionType::MatrixType MatrixType;
-  typedef Matrix<
-    typename MatrixType::Scalar,
-    MatrixType::ColsAtCompileTime, // the number of rows in the "kernel matrix"
-                                   // is the number of cols of the original matrix
-                                   // so that the product "matrix * kernel = zero" makes sense
-    Dynamic,                       // we don't know at compile-time the dimension of the kernel
-    MatrixType::Options,
-    MatrixType::MaxColsAtCompileTime, // see explanation for 2nd template parameter
-    MatrixType::MaxColsAtCompileTime // the kernel is a subspace of the domain space,
-                                     // whose dimension is the number of columns of the original matrix
-  > ReturnType;
+  typedef Matrix<typename MatrixType::Scalar,
+                 MatrixType::ColsAtCompileTime,  // the number of rows in the "kernel matrix"
+                                                 // is the number of cols of the original matrix
+                                                 // so that the product "matrix * kernel = zero" makes sense
+                 Dynamic,                        // we don't know at compile-time the dimension of the kernel
+                 traits<MatrixType>::Options,
+                 MatrixType::MaxColsAtCompileTime,  // see explanation for 2nd template parameter
+                 MatrixType::MaxColsAtCompileTime   // the kernel is a subspace of the domain space,
+                                                    // whose dimension is the number of columns of the original matrix
+                 >
+      ReturnType;
 };
 
-template<typename _DecompositionType> struct kernel_retval_base
- : public ReturnByValue<kernel_retval_base<_DecompositionType> >
-{
-  typedef _DecompositionType DecompositionType;
+template <typename DecompositionType_>
+struct kernel_retval_base : public ReturnByValue<kernel_retval_base<DecompositionType_> > {
+  typedef DecompositionType_ DecompositionType;
   typedef ReturnByValue<kernel_retval_base> Base;
-  typedef typename Base::Index Index;
 
-  kernel_retval_base(const DecompositionType& dec)
-    : m_dec(dec),
-      m_rank(dec.rank()),
-      m_cols(m_rank==dec.cols() ? 1 : dec.cols() - m_rank)
-  {}
+  explicit kernel_retval_base(const DecompositionType& dec)
+      : m_dec(dec), m_rank(dec.rank()), m_cols(m_rank == dec.cols() ? 1 : dec.cols() - m_rank) {}
 
   inline Index rows() const { return m_dec.cols(); }
   inline Index cols() const { return m_cols; }
   inline Index rank() const { return m_rank; }
   inline const DecompositionType& dec() const { return m_dec; }
 
-  template<typename Dest> inline void evalTo(Dest& dst) const
-  {
+  template <typename Dest>
+  inline void evalTo(Dest& dst) const {
     static_cast<const kernel_retval<DecompositionType>*>(this)->evalTo(dst);
   }
 
-  protected:
-    const DecompositionType& m_dec;
-    Index m_rank, m_cols;
+ protected:
+  const DecompositionType& m_dec;
+  Index m_rank, m_cols;
 };
 
-} // end namespace internal
+}  // end namespace internal
 
-#define EIGEN_MAKE_KERNEL_HELPERS(DecompositionType) \
-  typedef typename DecompositionType::MatrixType MatrixType; \
-  typedef typename MatrixType::Scalar Scalar; \
-  typedef typename MatrixType::RealScalar RealScalar; \
-  typedef typename MatrixType::Index Index; \
+#define EIGEN_MAKE_KERNEL_HELPERS(DecompositionType)                   \
+  typedef typename DecompositionType::MatrixType MatrixType;           \
+  typedef typename MatrixType::Scalar Scalar;                          \
+  typedef typename MatrixType::RealScalar RealScalar;                  \
   typedef Eigen::internal::kernel_retval_base<DecompositionType> Base; \
-  using Base::dec; \
-  using Base::rank; \
-  using Base::rows; \
-  using Base::cols; \
+  using Base::dec;                                                     \
+  using Base::rank;                                                    \
+  using Base::rows;                                                    \
+  using Base::cols;                                                    \
   kernel_retval(const DecompositionType& dec) : Base(dec) {}
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_MISC_KERNEL_H
+#endif  // EIGEN_MISC_KERNEL_H
diff --git a/inst/include/Eigen/src/misc/RealSvd2x2.h b/inst/include/Eigen/src/misc/RealSvd2x2.h
new file mode 100644
index 00000000..332a5abb
--- /dev/null
+++ b/inst/include/Eigen/src/misc/RealSvd2x2.h
@@ -0,0 +1,53 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+// Copyright (C) 2013-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_REALSVD2X2_H
+#define EIGEN_REALSVD2X2_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename MatrixType, typename RealScalar, typename Index>
+void real_2x2_jacobi_svd(const MatrixType &matrix, Index p, Index q, JacobiRotation<RealScalar> *j_left,
+                         JacobiRotation<RealScalar> *j_right) {
+  using std::abs;
+  using std::sqrt;
+  Matrix<RealScalar, 2, 2> m;
+  m << numext::real(matrix.coeff(p, p)), numext::real(matrix.coeff(p, q)), numext::real(matrix.coeff(q, p)),
+      numext::real(matrix.coeff(q, q));
+  JacobiRotation<RealScalar> rot1;
+  RealScalar t = m.coeff(0, 0) + m.coeff(1, 1);
+  RealScalar d = m.coeff(1, 0) - m.coeff(0, 1);
+
+  if (abs(d) < (std::numeric_limits<RealScalar>::min)()) {
+    rot1.s() = RealScalar(0);
+    rot1.c() = RealScalar(1);
+  } else {
+    // If d!=0, then t/d cannot overflow because the magnitude of the
+    // entries forming d are not too small compared to the ones forming t.
+    RealScalar u = t / d;
+    RealScalar tmp = sqrt(RealScalar(1) + numext::abs2(u));
+    rot1.s() = RealScalar(1) / tmp;
+    rot1.c() = u / tmp;
+  }
+  m.applyOnTheLeft(0, 1, rot1);
+  j_right->makeJacobi(m, 0, 1);
+  *j_left = rot1 * j_right->transpose();
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_REALSVD2X2_H
diff --git a/inst/include/Eigen/src/misc/Solve.h b/inst/include/Eigen/src/misc/Solve.h
deleted file mode 100644
index 7f70d60a..00000000
--- a/inst/include/Eigen/src/misc/Solve.h
+++ /dev/null
@@ -1,76 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_MISC_SOLVE_H
-#define EIGEN_MISC_SOLVE_H
-
-namespace Eigen { 
-
-namespace internal {
-
-/** \class solve_retval_base
-  *
-  */
-template<typename DecompositionType, typename Rhs>
-struct traits<solve_retval_base<DecompositionType, Rhs> >
-{
-  typedef typename DecompositionType::MatrixType MatrixType;
-  typedef Matrix<typename Rhs::Scalar,
-                 MatrixType::ColsAtCompileTime,
-                 Rhs::ColsAtCompileTime,
-                 Rhs::PlainObject::Options,
-                 MatrixType::MaxColsAtCompileTime,
-                 Rhs::MaxColsAtCompileTime> ReturnType;
-};
-
-template<typename _DecompositionType, typename Rhs> struct solve_retval_base
- : public ReturnByValue<solve_retval_base<_DecompositionType, Rhs> >
-{
-  typedef typename remove_all<typename Rhs::Nested>::type RhsNestedCleaned;
-  typedef _DecompositionType DecompositionType;
-  typedef ReturnByValue<solve_retval_base> Base;
-  typedef typename Base::Index Index;
-
-  solve_retval_base(const DecompositionType& dec, const Rhs& rhs)
-    : m_dec(dec), m_rhs(rhs)
-  {}
-
-  inline Index rows() const { return m_dec.cols(); }
-  inline Index cols() const { return m_rhs.cols(); }
-  inline const DecompositionType& dec() const { return m_dec; }
-  inline const RhsNestedCleaned& rhs() const { return m_rhs; }
-
-  template<typename Dest> inline void evalTo(Dest& dst) const
-  {
-    static_cast<const solve_retval<DecompositionType,Rhs>*>(this)->evalTo(dst);
-  }
-
-  protected:
-    const DecompositionType& m_dec;
-    typename Rhs::Nested m_rhs;
-};
-
-} // end namespace internal
-
-#define EIGEN_MAKE_SOLVE_HELPERS(DecompositionType,Rhs) \
-  typedef typename DecompositionType::MatrixType MatrixType; \
-  typedef typename MatrixType::Scalar Scalar; \
-  typedef typename MatrixType::RealScalar RealScalar; \
-  typedef typename MatrixType::Index Index; \
-  typedef Eigen::internal::solve_retval_base<DecompositionType,Rhs> Base; \
-  using Base::dec; \
-  using Base::rhs; \
-  using Base::rows; \
-  using Base::cols; \
-  solve_retval(const DecompositionType& dec, const Rhs& rhs) \
-    : Base(dec, rhs) {}
-
-} // end namespace Eigen
-
-#endif // EIGEN_MISC_SOLVE_H
diff --git a/inst/include/Eigen/src/misc/SparseSolve.h b/inst/include/Eigen/src/misc/SparseSolve.h
deleted file mode 100644
index 244bb8ec..00000000
--- a/inst/include/Eigen/src/misc/SparseSolve.h
+++ /dev/null
@@ -1,128 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_SPARSE_SOLVE_H
-#define EIGEN_SPARSE_SOLVE_H
-
-namespace Eigen { 
-
-namespace internal {
-
-template<typename _DecompositionType, typename Rhs> struct sparse_solve_retval_base;
-template<typename _DecompositionType, typename Rhs> struct sparse_solve_retval;
-  
-template<typename DecompositionType, typename Rhs>
-struct traits<sparse_solve_retval_base<DecompositionType, Rhs> >
-{
-  typedef typename DecompositionType::MatrixType MatrixType;
-  typedef SparseMatrix<typename Rhs::Scalar, Rhs::Options, typename Rhs::Index> ReturnType;
-};
-
-template<typename _DecompositionType, typename Rhs> struct sparse_solve_retval_base
- : public ReturnByValue<sparse_solve_retval_base<_DecompositionType, Rhs> >
-{
-  typedef typename remove_all<typename Rhs::Nested>::type RhsNestedCleaned;
-  typedef _DecompositionType DecompositionType;
-  typedef ReturnByValue<sparse_solve_retval_base> Base;
-  typedef typename Base::Index Index;
-
-  sparse_solve_retval_base(const DecompositionType& dec, const Rhs& rhs)
-    : m_dec(dec), m_rhs(rhs)
-  {}
-
-  inline Index rows() const { return m_dec.cols(); }
-  inline Index cols() const { return m_rhs.cols(); }
-  inline const DecompositionType& dec() const { return m_dec; }
-  inline const RhsNestedCleaned& rhs() const { return m_rhs; }
-
-  template<typename Dest> inline void evalTo(Dest& dst) const
-  {
-    static_cast<const sparse_solve_retval<DecompositionType,Rhs>*>(this)->evalTo(dst);
-  }
-
-  protected:
-    template<typename DestScalar, int DestOptions, typename DestIndex>
-    inline void defaultEvalTo(SparseMatrix<DestScalar,DestOptions,DestIndex>& dst) const
-    {
-      // we process the sparse rhs per block of NbColsAtOnce columns temporarily stored into a dense matrix.
-      static const int NbColsAtOnce = 4;
-      int rhsCols = m_rhs.cols();
-      int size = m_rhs.rows();
-      Eigen::Matrix<DestScalar,Dynamic,Dynamic> tmp(size,rhsCols);
-      Eigen::Matrix<DestScalar,Dynamic,Dynamic> tmpX(size,rhsCols);
-      for(int k=0; k<rhsCols; k+=NbColsAtOnce)
-      {
-        int actualCols = std::min<int>(rhsCols-k, NbColsAtOnce);
-        tmp.leftCols(actualCols) = m_rhs.middleCols(k,actualCols);
-        tmpX.leftCols(actualCols) = m_dec.solve(tmp.leftCols(actualCols));
-        dst.middleCols(k,actualCols) = tmpX.leftCols(actualCols).sparseView();
-      }
-    }
-    const DecompositionType& m_dec;
-    typename Rhs::Nested m_rhs;
-};
-
-#define EIGEN_MAKE_SPARSE_SOLVE_HELPERS(DecompositionType,Rhs) \
-  typedef typename DecompositionType::MatrixType MatrixType; \
-  typedef typename MatrixType::Scalar Scalar; \
-  typedef typename MatrixType::RealScalar RealScalar; \
-  typedef typename MatrixType::Index Index; \
-  typedef Eigen::internal::sparse_solve_retval_base<DecompositionType,Rhs> Base; \
-  using Base::dec; \
-  using Base::rhs; \
-  using Base::rows; \
-  using Base::cols; \
-  sparse_solve_retval(const DecompositionType& dec, const Rhs& rhs) \
-    : Base(dec, rhs) {}
-
-
-
-template<typename DecompositionType, typename Rhs, typename Guess> struct solve_retval_with_guess;
-
-template<typename DecompositionType, typename Rhs, typename Guess>
-struct traits<solve_retval_with_guess<DecompositionType, Rhs, Guess> >
-{
-  typedef typename DecompositionType::MatrixType MatrixType;
-  typedef Matrix<typename Rhs::Scalar,
-                 MatrixType::ColsAtCompileTime,
-                 Rhs::ColsAtCompileTime,
-                 Rhs::PlainObject::Options,
-                 MatrixType::MaxColsAtCompileTime,
-                 Rhs::MaxColsAtCompileTime> ReturnType;
-};
-
-template<typename DecompositionType, typename Rhs, typename Guess> struct solve_retval_with_guess
- : public ReturnByValue<solve_retval_with_guess<DecompositionType, Rhs, Guess> >
-{
-  typedef typename DecompositionType::Index Index;
-
-  solve_retval_with_guess(const DecompositionType& dec, const Rhs& rhs, const Guess& guess)
-    : m_dec(dec), m_rhs(rhs), m_guess(guess)
-  {}
-
-  inline Index rows() const { return m_dec.cols(); }
-  inline Index cols() const { return m_rhs.cols(); }
-
-  template<typename Dest> inline void evalTo(Dest& dst) const
-  {
-    dst = m_guess;
-    m_dec._solveWithGuess(m_rhs,dst);
-  }
-
-  protected:
-    const DecompositionType& m_dec;
-    const typename Rhs::Nested m_rhs;
-    const typename Guess::Nested m_guess;
-};
-
-} // namepsace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_SPARSE_SOLVE_H
diff --git a/inst/include/Eigen/src/misc/blas.h b/inst/include/Eigen/src/misc/blas.h
index 6fce99ed..f12bc7cd 100644
--- a/inst/include/Eigen/src/misc/blas.h
+++ b/inst/include/Eigen/src/misc/blas.h
@@ -1,658 +1,97 @@
-#ifndef BLAS_H
-#define BLAS_H
+#ifndef EIGEN_MISC_BLAS_H
+#define EIGEN_MISC_BLAS_H
 
-#ifdef __cplusplus
-extern "C"
-{
-#endif
+extern "C" {
 
 #define BLASFUNC(FUNC) FUNC##_
 
-#ifdef __WIN64__
-typedef long long BLASLONG;
-typedef unsigned long long BLASULONG;
-#else
-typedef long BLASLONG;
-typedef unsigned long BLASULONG;
-#endif
-
-int    BLASFUNC(xerbla)(const char *, int *info, int);
-
-float  BLASFUNC(sdot)  (int *, float  *, int *, float  *, int *);
-float  BLASFUNC(sdsdot)(int *, float  *,        float  *, int *, float  *, int *);
-
-double BLASFUNC(dsdot) (int *, float  *, int *, float  *, int *);
-double BLASFUNC(ddot)  (int *, double *, int *, double *, int *);
-double BLASFUNC(qdot)  (int *, double *, int *, double *, int *);
-
-int  BLASFUNC(cdotuw)  (int *, float  *, int *, float  *, int *, float*);
-int  BLASFUNC(cdotcw)  (int *, float  *, int *, float  *, int *, float*);
-int  BLASFUNC(zdotuw)  (int *, double  *, int *, double  *, int *, double*);
-int  BLASFUNC(zdotcw)  (int *, double  *, int *, double  *, int *, double*);
-
-int    BLASFUNC(saxpy) (int *, float  *, float  *, int *, float  *, int *);
-int    BLASFUNC(daxpy) (int *, double *, double *, int *, double *, int *);
-int    BLASFUNC(qaxpy) (int *, double *, double *, int *, double *, int *);
-int    BLASFUNC(caxpy) (int *, float  *, float  *, int *, float  *, int *);
-int    BLASFUNC(zaxpy) (int *, double *, double *, int *, double *, int *);
-int    BLASFUNC(xaxpy) (int *, double *, double *, int *, double *, int *);
-int    BLASFUNC(caxpyc)(int *, float  *, float  *, int *, float  *, int *);
-int    BLASFUNC(zaxpyc)(int *, double *, double *, int *, double *, int *);
-int    BLASFUNC(xaxpyc)(int *, double *, double *, int *, double *, int *);
-
-int    BLASFUNC(scopy) (int *, float  *, int *, float  *, int *);
-int    BLASFUNC(dcopy) (int *, double *, int *, double *, int *);
-int    BLASFUNC(qcopy) (int *, double *, int *, double *, int *);
-int    BLASFUNC(ccopy) (int *, float  *, int *, float  *, int *);
-int    BLASFUNC(zcopy) (int *, double *, int *, double *, int *);
-int    BLASFUNC(xcopy) (int *, double *, int *, double *, int *);
-
-int    BLASFUNC(sswap) (int *, float  *, int *, float  *, int *);
-int    BLASFUNC(dswap) (int *, double *, int *, double *, int *);
-int    BLASFUNC(qswap) (int *, double *, int *, double *, int *);
-int    BLASFUNC(cswap) (int *, float  *, int *, float  *, int *);
-int    BLASFUNC(zswap) (int *, double *, int *, double *, int *);
-int    BLASFUNC(xswap) (int *, double *, int *, double *, int *);
-
-float  BLASFUNC(sasum) (int *, float  *, int *);
-float  BLASFUNC(scasum)(int *, float  *, int *);
-double BLASFUNC(dasum) (int *, double *, int *);
-double BLASFUNC(qasum) (int *, double *, int *);
-double BLASFUNC(dzasum)(int *, double *, int *);
-double BLASFUNC(qxasum)(int *, double *, int *);
-
-int    BLASFUNC(isamax)(int *, float  *, int *);
-int    BLASFUNC(idamax)(int *, double *, int *);
-int    BLASFUNC(iqamax)(int *, double *, int *);
-int    BLASFUNC(icamax)(int *, float  *, int *);
-int    BLASFUNC(izamax)(int *, double *, int *);
-int    BLASFUNC(ixamax)(int *, double *, int *);
-
-int    BLASFUNC(ismax) (int *, float  *, int *);
-int    BLASFUNC(idmax) (int *, double *, int *);
-int    BLASFUNC(iqmax) (int *, double *, int *);
-int    BLASFUNC(icmax) (int *, float  *, int *);
-int    BLASFUNC(izmax) (int *, double *, int *);
-int    BLASFUNC(ixmax) (int *, double *, int *);
-
-int    BLASFUNC(isamin)(int *, float  *, int *);
-int    BLASFUNC(idamin)(int *, double *, int *);
-int    BLASFUNC(iqamin)(int *, double *, int *);
-int    BLASFUNC(icamin)(int *, float  *, int *);
-int    BLASFUNC(izamin)(int *, double *, int *);
-int    BLASFUNC(ixamin)(int *, double *, int *);
-
-int    BLASFUNC(ismin)(int *, float  *, int *);
-int    BLASFUNC(idmin)(int *, double *, int *);
-int    BLASFUNC(iqmin)(int *, double *, int *);
-int    BLASFUNC(icmin)(int *, float  *, int *);
-int    BLASFUNC(izmin)(int *, double *, int *);
-int    BLASFUNC(ixmin)(int *, double *, int *);
-
-float  BLASFUNC(samax) (int *, float  *, int *);
-double BLASFUNC(damax) (int *, double *, int *);
-double BLASFUNC(qamax) (int *, double *, int *);
-float  BLASFUNC(scamax)(int *, float  *, int *);
-double BLASFUNC(dzamax)(int *, double *, int *);
-double BLASFUNC(qxamax)(int *, double *, int *);
-
-float  BLASFUNC(samin) (int *, float  *, int *);
-double BLASFUNC(damin) (int *, double *, int *);
-double BLASFUNC(qamin) (int *, double *, int *);
-float  BLASFUNC(scamin)(int *, float  *, int *);
-double BLASFUNC(dzamin)(int *, double *, int *);
-double BLASFUNC(qxamin)(int *, double *, int *);
-
-float  BLASFUNC(smax)  (int *, float  *, int *);
-double BLASFUNC(dmax)  (int *, double *, int *);
-double BLASFUNC(qmax)  (int *, double *, int *);
-float  BLASFUNC(scmax) (int *, float  *, int *);
-double BLASFUNC(dzmax) (int *, double *, int *);
-double BLASFUNC(qxmax) (int *, double *, int *);
-
-float  BLASFUNC(smin)  (int *, float  *, int *);
-double BLASFUNC(dmin)  (int *, double *, int *);
-double BLASFUNC(qmin)  (int *, double *, int *);
-float  BLASFUNC(scmin) (int *, float  *, int *);
-double BLASFUNC(dzmin) (int *, double *, int *);
-double BLASFUNC(qxmin) (int *, double *, int *);
-
-int    BLASFUNC(sscal) (int *,  float  *, float  *, int *);
-int    BLASFUNC(dscal) (int *,  double *, double *, int *);
-int    BLASFUNC(qscal) (int *,  double *, double *, int *);
-int    BLASFUNC(cscal) (int *,  float  *, float  *, int *);
-int    BLASFUNC(zscal) (int *,  double *, double *, int *);
-int    BLASFUNC(xscal) (int *,  double *, double *, int *);
-int    BLASFUNC(csscal)(int *,  float  *, float  *, int *);
-int    BLASFUNC(zdscal)(int *,  double *, double *, int *);
-int    BLASFUNC(xqscal)(int *,  double *, double *, int *);
+/* Level 1 routines */
 
-float  BLASFUNC(snrm2) (int *, float  *, int *);
-float  BLASFUNC(scnrm2)(int *, float  *, int *);
-
-double BLASFUNC(dnrm2) (int *, double *, int *);
-double BLASFUNC(qnrm2) (int *, double *, int *);
-double BLASFUNC(dznrm2)(int *, double *, int *);
-double BLASFUNC(qxnrm2)(int *, double *, int *);
-
-int    BLASFUNC(srot)  (int *, float  *, int *, float  *, int *, float  *, float  *);
-int    BLASFUNC(drot)  (int *, double *, int *, double *, int *, double *, double *);
-int    BLASFUNC(qrot)  (int *, double *, int *, double *, int *, double *, double *);
-int    BLASFUNC(csrot) (int *, float  *, int *, float  *, int *, float  *, float  *);
-int    BLASFUNC(zdrot) (int *, double *, int *, double *, int *, double *, double *);
-int    BLASFUNC(xqrot) (int *, double *, int *, double *, int *, double *, double *);
-
-int    BLASFUNC(srotg) (float  *, float  *, float  *, float  *);
-int    BLASFUNC(drotg) (double *, double *, double *, double *);
-int    BLASFUNC(qrotg) (double *, double *, double *, double *);
-int    BLASFUNC(crotg) (float  *, float  *, float  *, float  *);
-int    BLASFUNC(zrotg) (double *, double *, double *, double *);
-int    BLASFUNC(xrotg) (double *, double *, double *, double *);
-
-int    BLASFUNC(srotmg)(float  *, float  *, float  *, float  *, float  *);
-int    BLASFUNC(drotmg)(double *, double *, double *, double *, double *);
-
-int    BLASFUNC(srotm) (int *, float  *, int *, float  *, int *, float  *);
-int    BLASFUNC(drotm) (int *, double *, int *, double *, int *, double *);
-int    BLASFUNC(qrotm) (int *, double *, int *, double *, int *, double *);
+void BLASFUNC(saxpy)(const int *, const float *, const float *, const int *, float *, const int *);
+void BLASFUNC(daxpy)(const int *, const double *, const double *, const int *, double *, const int *);
+void BLASFUNC(caxpy)(const int *, const float *, const float *, const int *, float *, const int *);
+void BLASFUNC(zaxpy)(const int *, const double *, const double *, const int *, double *, const int *);
 
 /* Level 2 routines */
 
-int BLASFUNC(sger)(int *,    int *, float *,  float *, int *,
-		   float *,  int *, float *,  int *);
-int BLASFUNC(dger)(int *,    int *, double *, double *, int *,
-		   double *, int *, double *, int *);
-int BLASFUNC(qger)(int *,    int *, double *, double *, int *,
-		   double *, int *, double *, int *);
-int BLASFUNC(cgeru)(int *,    int *, float *,  float *, int *,
-		    float *,  int *, float *,  int *);
-int BLASFUNC(cgerc)(int *,    int *, float *,  float *, int *,
-		    float *,  int *, float *,  int *);
-int BLASFUNC(zgeru)(int *,    int *, double *, double *, int *,
-		    double *, int *, double *, int *);
-int BLASFUNC(zgerc)(int *,    int *, double *, double *, int *,
-		    double *, int *, double *, int *);
-int BLASFUNC(xgeru)(int *,    int *, double *, double *, int *,
-		    double *, int *, double *, int *);
-int BLASFUNC(xgerc)(int *,    int *, double *, double *, int *,
-		    double *, int *, double *, int *);
-
-int BLASFUNC(sgemv)(char *, int *, int *, float  *, float  *, int *,
-		    float  *, int *, float  *, float  *, int *);
-int BLASFUNC(dgemv)(char *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
-int BLASFUNC(qgemv)(char *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
-int BLASFUNC(cgemv)(char *, int *, int *, float  *, float  *, int *,
-		    float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zgemv)(char *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
-int BLASFUNC(xgemv)(char *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
-
-int BLASFUNC(strsv) (char *, char *, char *, int *, float  *, int *,
-		     float  *, int *);
-int BLASFUNC(dtrsv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-int BLASFUNC(qtrsv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-int BLASFUNC(ctrsv) (char *, char *, char *, int *, float  *, int *,
-		     float  *, int *);
-int BLASFUNC(ztrsv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-int BLASFUNC(xtrsv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-
-int BLASFUNC(stpsv) (char *, char *, char *, int *, float  *, float  *, int *);
-int BLASFUNC(dtpsv) (char *, char *, char *, int *, double *, double *, int *);
-int BLASFUNC(qtpsv) (char *, char *, char *, int *, double *, double *, int *);
-int BLASFUNC(ctpsv) (char *, char *, char *, int *, float  *, float  *, int *);
-int BLASFUNC(ztpsv) (char *, char *, char *, int *, double *, double *, int *);
-int BLASFUNC(xtpsv) (char *, char *, char *, int *, double *, double *, int *);
-
-int BLASFUNC(strmv) (char *, char *, char *, int *, float  *, int *,
-		     float  *, int *);
-int BLASFUNC(dtrmv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-int BLASFUNC(qtrmv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-int BLASFUNC(ctrmv) (char *, char *, char *, int *, float  *, int *,
-		     float  *, int *);
-int BLASFUNC(ztrmv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-int BLASFUNC(xtrmv) (char *, char *, char *, int *, double *, int *,
-		     double *, int *);
-
-int BLASFUNC(stpmv) (char *, char *, char *, int *, float  *, float  *, int *);
-int BLASFUNC(dtpmv) (char *, char *, char *, int *, double *, double *, int *);
-int BLASFUNC(qtpmv) (char *, char *, char *, int *, double *, double *, int *);
-int BLASFUNC(ctpmv) (char *, char *, char *, int *, float  *, float  *, int *);
-int BLASFUNC(ztpmv) (char *, char *, char *, int *, double *, double *, int *);
-int BLASFUNC(xtpmv) (char *, char *, char *, int *, double *, double *, int *);
-
-int BLASFUNC(stbmv) (char *, char *, char *, int *, int *, float  *, int *, float  *, int *);
-int BLASFUNC(dtbmv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
-int BLASFUNC(qtbmv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
-int BLASFUNC(ctbmv) (char *, char *, char *, int *, int *, float  *, int *, float  *, int *);
-int BLASFUNC(ztbmv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
-int BLASFUNC(xtbmv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
-
-int BLASFUNC(stbsv) (char *, char *, char *, int *, int *, float  *, int *, float  *, int *);
-int BLASFUNC(dtbsv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
-int BLASFUNC(qtbsv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
-int BLASFUNC(ctbsv) (char *, char *, char *, int *, int *, float  *, int *, float  *, int *);
-int BLASFUNC(ztbsv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
-int BLASFUNC(xtbsv) (char *, char *, char *, int *, int *, double *, int *, double *, int *);
-
-int BLASFUNC(ssymv) (char *, int *, float  *, float *, int *,
-		     float  *, int *, float *, float *, int *);
-int BLASFUNC(dsymv) (char *, int *, double  *, double *, int *,
-		     double  *, int *, double *, double *, int *);
-int BLASFUNC(qsymv) (char *, int *, double  *, double *, int *,
-		     double  *, int *, double *, double *, int *);
-int BLASFUNC(csymv) (char *, int *, float  *, float *, int *,
-		     float  *, int *, float *, float *, int *);
-int BLASFUNC(zsymv) (char *, int *, double  *, double *, int *,
-		     double  *, int *, double *, double *, int *);
-int BLASFUNC(xsymv) (char *, int *, double  *, double *, int *,
-		     double  *, int *, double *, double *, int *);
-
-int BLASFUNC(sspmv) (char *, int *, float  *, float *,
-		     float  *, int *, float *, float *, int *);
-int BLASFUNC(dspmv) (char *, int *, double  *, double *,
-		     double  *, int *, double *, double *, int *);
-int BLASFUNC(qspmv) (char *, int *, double  *, double *,
-		     double  *, int *, double *, double *, int *);
-int BLASFUNC(cspmv) (char *, int *, float  *, float *,
-		     float  *, int *, float *, float *, int *);
-int BLASFUNC(zspmv) (char *, int *, double  *, double *,
-		     double  *, int *, double *, double *, int *);
-int BLASFUNC(xspmv) (char *, int *, double  *, double *,
-		     double  *, int *, double *, double *, int *);
-
-int BLASFUNC(ssyr) (char *, int *, float   *, float  *, int *,
-		    float  *, int *);
-int BLASFUNC(dsyr) (char *, int *, double  *, double *, int *,
-		    double *, int *);
-int BLASFUNC(qsyr) (char *, int *, double  *, double *, int *,
-		    double *, int *);
-int BLASFUNC(csyr) (char *, int *, float   *, float  *, int *,
-		    float  *, int *);
-int BLASFUNC(zsyr) (char *, int *, double  *, double *, int *,
-		    double *, int *);
-int BLASFUNC(xsyr) (char *, int *, double  *, double *, int *,
-		    double *, int *);
-
-int BLASFUNC(ssyr2) (char *, int *, float   *,
-		     float  *, int *, float  *, int *, float  *, int *);
-int BLASFUNC(dsyr2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *, int *);
-int BLASFUNC(qsyr2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *, int *);
-int BLASFUNC(csyr2) (char *, int *, float   *,
-		     float  *, int *, float  *, int *, float  *, int *);
-int BLASFUNC(zsyr2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *, int *);
-int BLASFUNC(xsyr2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *, int *);
-
-int BLASFUNC(sspr) (char *, int *, float   *, float  *, int *,
-		    float  *);
-int BLASFUNC(dspr) (char *, int *, double  *, double *, int *,
-		    double *);
-int BLASFUNC(qspr) (char *, int *, double  *, double *, int *,
-		    double *);
-int BLASFUNC(cspr) (char *, int *, float   *, float  *, int *,
-		    float  *);
-int BLASFUNC(zspr) (char *, int *, double  *, double *, int *,
-		    double *);
-int BLASFUNC(xspr) (char *, int *, double  *, double *, int *,
-		    double *);
-
-int BLASFUNC(sspr2) (char *, int *, float   *,
-		     float  *, int *, float  *, int *, float  *);
-int BLASFUNC(dspr2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *);
-int BLASFUNC(qspr2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *);
-int BLASFUNC(cspr2) (char *, int *, float   *,
-		     float  *, int *, float  *, int *, float  *);
-int BLASFUNC(zspr2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *);
-int BLASFUNC(xspr2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *);
-
-int BLASFUNC(cher) (char *, int *, float   *, float  *, int *,
-		    float  *, int *);
-int BLASFUNC(zher) (char *, int *, double  *, double *, int *,
-		    double *, int *);
-int BLASFUNC(xher) (char *, int *, double  *, double *, int *,
-		    double *, int *);
-
-int BLASFUNC(chpr) (char *, int *, float   *, float  *, int *, float  *);
-int BLASFUNC(zhpr) (char *, int *, double  *, double *, int *, double *);
-int BLASFUNC(xhpr) (char *, int *, double  *, double *, int *, double *);
-
-int BLASFUNC(cher2) (char *, int *, float   *,
-		     float  *, int *, float  *, int *, float  *, int *);
-int BLASFUNC(zher2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *, int *);
-int BLASFUNC(xher2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *, int *);
-
-int BLASFUNC(chpr2) (char *, int *, float   *,
-		     float  *, int *, float  *, int *, float  *);
-int BLASFUNC(zhpr2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *);
-int BLASFUNC(xhpr2) (char *, int *, double  *,
-		     double *, int *, double *, int *, double *);
-
-int BLASFUNC(chemv) (char *, int *, float  *, float *, int *,
-		     float  *, int *, float *, float *, int *);
-int BLASFUNC(zhemv) (char *, int *, double  *, double *, int *,
-		     double  *, int *, double *, double *, int *);
-int BLASFUNC(xhemv) (char *, int *, double  *, double *, int *,
-		     double  *, int *, double *, double *, int *);
-
-int BLASFUNC(chpmv) (char *, int *, float  *, float *,
-		     float  *, int *, float *, float *, int *);
-int BLASFUNC(zhpmv) (char *, int *, double  *, double *,
-		     double  *, int *, double *, double *, int *);
-int BLASFUNC(xhpmv) (char *, int *, double  *, double *,
-		     double  *, int *, double *, double *, int *);
-
-int BLASFUNC(snorm)(char *, int *, int *, float  *, int *);
-int BLASFUNC(dnorm)(char *, int *, int *, double *, int *);
-int BLASFUNC(cnorm)(char *, int *, int *, float  *, int *);
-int BLASFUNC(znorm)(char *, int *, int *, double *, int *);
-
-int BLASFUNC(sgbmv)(char *, int *, int *, int *, int *, float  *, float  *, int *,
-		    float  *, int *, float  *, float  *, int *);
-int BLASFUNC(dgbmv)(char *, int *, int *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
-int BLASFUNC(qgbmv)(char *, int *, int *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
-int BLASFUNC(cgbmv)(char *, int *, int *, int *, int *, float  *, float  *, int *,
-		    float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zgbmv)(char *, int *, int *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
-int BLASFUNC(xgbmv)(char *, int *, int *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
-
-int BLASFUNC(ssbmv)(char *, int *, int *, float  *, float  *, int *,
-		    float  *, int *, float  *, float  *, int *);
-int BLASFUNC(dsbmv)(char *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
-int BLASFUNC(qsbmv)(char *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
-int BLASFUNC(csbmv)(char *, int *, int *, float  *, float  *, int *,
-		    float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zsbmv)(char *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
-int BLASFUNC(xsbmv)(char *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
-
-int BLASFUNC(chbmv)(char *, int *, int *, float  *, float  *, int *,
-		    float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zhbmv)(char *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
-int BLASFUNC(xhbmv)(char *, int *, int *, double *, double *, int *,
-		    double *, int *, double *, double *, int *);
+void BLASFUNC(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *,
+                     const int *, const float *, float *, const int *);
+void BLASFUNC(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *,
+                     const double *, const int *, const double *, double *, const int *);
+void BLASFUNC(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *,
+                     const int *, const float *, float *, const int *);
+void BLASFUNC(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *,
+                     const double *, const int *, const double *, double *, const int *);
+
+void BLASFUNC(strmv)(const char *, const char *, const char *, const int *, const float *, const int *, float *,
+                     const int *);
+void BLASFUNC(dtrmv)(const char *, const char *, const char *, const int *, const double *, const int *, double *,
+                     const int *);
+void BLASFUNC(ctrmv)(const char *, const char *, const char *, const int *, const float *, const int *, float *,
+                     const int *);
+void BLASFUNC(ztrmv)(const char *, const char *, const char *, const int *, const double *, const int *, double *,
+                     const int *);
+
+void BLASFUNC(ssymv)(const char *, const int *, const float *, const float *, const int *, const float *, const int *,
+                     const float *, float *, const int *);
+void BLASFUNC(dsymv)(const char *, const int *, const double *, const double *, const int *, const double *,
+                     const int *, const double *, double *, const int *);
+
+void BLASFUNC(chemv)(const char *, const int *, const float *, const float *, const int *, const float *, const int *,
+                     const float *, float *, const int *);
+void BLASFUNC(zhemv)(const char *, const int *, const double *, const double *, const int *, const double *,
+                     const int *, const double *, double *, const int *);
 
 /* Level 3 routines */
 
-int BLASFUNC(sgemm)(char *, char *, int *, int *, int *, float *,
-	   float  *, int *, float  *, int *, float  *, float  *, int *);
-int BLASFUNC(dgemm)(char *, char *, int *, int *, int *, double *,
-	   double *, int *, double *, int *, double *, double *, int *);
-int BLASFUNC(qgemm)(char *, char *, int *, int *, int *, double *,
-	   double *, int *, double *, int *, double *, double *, int *);
-int BLASFUNC(cgemm)(char *, char *, int *, int *, int *, float *,
-	   float  *, int *, float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zgemm)(char *, char *, int *, int *, int *, double *,
-	   double *, int *, double *, int *, double *, double *, int *);
-int BLASFUNC(xgemm)(char *, char *, int *, int *, int *, double *,
-	   double *, int *, double *, int *, double *, double *, int *);
-
-int BLASFUNC(cgemm3m)(char *, char *, int *, int *, int *, float *,
-	   float  *, int *, float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zgemm3m)(char *, char *, int *, int *, int *, double *,
-	   double *, int *, double *, int *, double *, double *, int *);
-int BLASFUNC(xgemm3m)(char *, char *, int *, int *, int *, double *,
-	   double *, int *, double *, int *, double *, double *, int *);
-
-int BLASFUNC(sge2mm)(char *, char *, char *, int *, int *,
-		     float *, float  *, int *, float  *, int *,
-		     float *, float  *, int *);
-int BLASFUNC(dge2mm)(char *, char *, char *, int *, int *,
-		     double *, double  *, int *, double  *, int *,
-		     double *, double  *, int *);
-int BLASFUNC(cge2mm)(char *, char *, char *, int *, int *,
-		     float *, float  *, int *, float  *, int *,
-		     float *, float  *, int *);
-int BLASFUNC(zge2mm)(char *, char *, char *, int *, int *,
-		     double *, double  *, int *, double  *, int *,
-		     double *, double  *, int *);
-
-int BLASFUNC(strsm)(char *, char *, char *, char *, int *, int *,
-	   float *,  float *, int *, float *, int *);
-int BLASFUNC(dtrsm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-int BLASFUNC(qtrsm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-int BLASFUNC(ctrsm)(char *, char *, char *, char *, int *, int *,
-	   float *,  float *, int *, float *, int *);
-int BLASFUNC(ztrsm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-int BLASFUNC(xtrsm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-
-int BLASFUNC(strmm)(char *, char *, char *, char *, int *, int *,
-	   float *,  float *, int *, float *, int *);
-int BLASFUNC(dtrmm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-int BLASFUNC(qtrmm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-int BLASFUNC(ctrmm)(char *, char *, char *, char *, int *, int *,
-	   float *,  float *, int *, float *, int *);
-int BLASFUNC(ztrmm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-int BLASFUNC(xtrmm)(char *, char *, char *, char *, int *, int *,
-	   double *,  double *, int *, double *, int *);
-
-int BLASFUNC(ssymm)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, int *, float  *, float  *, int *);
-int BLASFUNC(dsymm)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-int BLASFUNC(qsymm)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-int BLASFUNC(csymm)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zsymm)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-int BLASFUNC(xsymm)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-
-int BLASFUNC(csymm3m)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zsymm3m)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-int BLASFUNC(xsymm3m)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-
-int BLASFUNC(ssyrk)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, float  *, int *);
-int BLASFUNC(dsyrk)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, double *, int *);
-int BLASFUNC(qsyrk)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, double *, int *);
-int BLASFUNC(csyrk)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, float  *, int *);
-int BLASFUNC(zsyrk)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, double *, int *);
-int BLASFUNC(xsyrk)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, double *, int *);
-
-int BLASFUNC(ssyr2k)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float *, int *, float  *, float  *, int *);
-int BLASFUNC(dsyr2k)(char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-int BLASFUNC(qsyr2k)(char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-int BLASFUNC(csyr2k)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float *, int *, float  *, float  *, int *);
-int BLASFUNC(zsyr2k)(char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-int BLASFUNC(xsyr2k)(char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-
-int BLASFUNC(chemm)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zhemm)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-int BLASFUNC(xhemm)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-
-int BLASFUNC(chemm3m)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, int *, float  *, float  *, int *);
-int BLASFUNC(zhemm3m)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-int BLASFUNC(xhemm3m)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, int *, double *, double *, int *);
-
-int BLASFUNC(cherk)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float  *, float  *, int *);
-int BLASFUNC(zherk)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, double *, int *);
-int BLASFUNC(xherk)(char *, char *, int *, int *, double *, double *, int *,
-	   double *, double *, int *);
-
-int BLASFUNC(cher2k)(char *, char *, int *, int *, float  *, float  *, int *,
-	   float *, int *, float  *, float  *, int *);
-int BLASFUNC(zher2k)(char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-int BLASFUNC(xher2k)(char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-int BLASFUNC(cher2m)(char *, char *, char *, int *, int *, float  *, float  *, int *,
-	   float *, int *, float  *, float  *, int *);
-int BLASFUNC(zher2m)(char *, char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-int BLASFUNC(xher2m)(char *, char *, char *, int *, int *, double *, double *, int *,
-	   double*, int *, double *, double *, int *);
-
-int BLASFUNC(sgemt)(char *, int *, int *, float  *, float  *, int *,
-		    float  *, int *);
-int BLASFUNC(dgemt)(char *, int *, int *, double *, double *, int *,
-		    double *, int *);
-int BLASFUNC(cgemt)(char *, int *, int *, float  *, float  *, int *,
-		    float  *, int *);
-int BLASFUNC(zgemt)(char *, int *, int *, double *, double *, int *,
-		    double *, int *);
-
-int BLASFUNC(sgema)(char *, char *, int *, int *, float  *,
-		    float  *, int *, float *, float  *, int *, float *, int *);
-int BLASFUNC(dgema)(char *, char *, int *, int *, double *,
-		    double *, int *, double*, double *, int *, double*, int *);
-int BLASFUNC(cgema)(char *, char *, int *, int *, float  *,
-		    float  *, int *, float *, float  *, int *, float *, int *);
-int BLASFUNC(zgema)(char *, char *, int *, int *, double *,
-		    double *, int *, double*, double *, int *, double*, int *);
-
-int BLASFUNC(sgems)(char *, char *, int *, int *, float  *,
-		    float  *, int *, float *, float  *, int *, float *, int *);
-int BLASFUNC(dgems)(char *, char *, int *, int *, double *,
-		    double *, int *, double*, double *, int *, double*, int *);
-int BLASFUNC(cgems)(char *, char *, int *, int *, float  *,
-		    float  *, int *, float *, float  *, int *, float *, int *);
-int BLASFUNC(zgems)(char *, char *, int *, int *, double *,
-		    double *, int *, double*, double *, int *, double*, int *);
-
-int BLASFUNC(sgetf2)(int *, int *, float  *, int *, int *, int *);
-int BLASFUNC(dgetf2)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(qgetf2)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(cgetf2)(int *, int *, float  *, int *, int *, int *);
-int BLASFUNC(zgetf2)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(xgetf2)(int *, int *, double *, int *, int *, int *);
-
-int BLASFUNC(sgetrf)(int *, int *, float  *, int *, int *, int *);
-int BLASFUNC(dgetrf)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(qgetrf)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(cgetrf)(int *, int *, float  *, int *, int *, int *);
-int BLASFUNC(zgetrf)(int *, int *, double *, int *, int *, int *);
-int BLASFUNC(xgetrf)(int *, int *, double *, int *, int *, int *);
-
-int BLASFUNC(slaswp)(int *, float  *, int *, int *, int *, int *, int *);
-int BLASFUNC(dlaswp)(int *, double *, int *, int *, int *, int *, int *);
-int BLASFUNC(qlaswp)(int *, double *, int *, int *, int *, int *, int *);
-int BLASFUNC(claswp)(int *, float  *, int *, int *, int *, int *, int *);
-int BLASFUNC(zlaswp)(int *, double *, int *, int *, int *, int *, int *);
-int BLASFUNC(xlaswp)(int *, double *, int *, int *, int *, int *, int *);
-
-int BLASFUNC(sgetrs)(char *, int *, int *, float  *, int *, int *, float  *, int *, int *);
-int BLASFUNC(dgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
-int BLASFUNC(qgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
-int BLASFUNC(cgetrs)(char *, int *, int *, float  *, int *, int *, float  *, int *, int *);
-int BLASFUNC(zgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
-int BLASFUNC(xgetrs)(char *, int *, int *, double *, int *, int *, double *, int *, int *);
-
-int BLASFUNC(sgesv)(int *, int *, float  *, int *, int *, float *, int *, int *);
-int BLASFUNC(dgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
-int BLASFUNC(qgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
-int BLASFUNC(cgesv)(int *, int *, float  *, int *, int *, float *, int *, int *);
-int BLASFUNC(zgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
-int BLASFUNC(xgesv)(int *, int *, double *, int *, int *, double*, int *, int *);
-
-int BLASFUNC(spotf2)(char *, int *, float  *, int *, int *);
-int BLASFUNC(dpotf2)(char *, int *, double *, int *, int *);
-int BLASFUNC(qpotf2)(char *, int *, double *, int *, int *);
-int BLASFUNC(cpotf2)(char *, int *, float  *, int *, int *);
-int BLASFUNC(zpotf2)(char *, int *, double *, int *, int *);
-int BLASFUNC(xpotf2)(char *, int *, double *, int *, int *);
-
-int BLASFUNC(spotrf)(char *, int *, float  *, int *, int *);
-int BLASFUNC(dpotrf)(char *, int *, double *, int *, int *);
-int BLASFUNC(qpotrf)(char *, int *, double *, int *, int *);
-int BLASFUNC(cpotrf)(char *, int *, float  *, int *, int *);
-int BLASFUNC(zpotrf)(char *, int *, double *, int *, int *);
-int BLASFUNC(xpotrf)(char *, int *, double *, int *, int *);
-
-int BLASFUNC(slauu2)(char *, int *, float  *, int *, int *);
-int BLASFUNC(dlauu2)(char *, int *, double *, int *, int *);
-int BLASFUNC(qlauu2)(char *, int *, double *, int *, int *);
-int BLASFUNC(clauu2)(char *, int *, float  *, int *, int *);
-int BLASFUNC(zlauu2)(char *, int *, double *, int *, int *);
-int BLASFUNC(xlauu2)(char *, int *, double *, int *, int *);
-
-int BLASFUNC(slauum)(char *, int *, float  *, int *, int *);
-int BLASFUNC(dlauum)(char *, int *, double *, int *, int *);
-int BLASFUNC(qlauum)(char *, int *, double *, int *, int *);
-int BLASFUNC(clauum)(char *, int *, float  *, int *, int *);
-int BLASFUNC(zlauum)(char *, int *, double *, int *, int *);
-int BLASFUNC(xlauum)(char *, int *, double *, int *, int *);
-
-int BLASFUNC(strti2)(char *, char *, int *, float  *, int *, int *);
-int BLASFUNC(dtrti2)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(qtrti2)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(ctrti2)(char *, char *, int *, float  *, int *, int *);
-int BLASFUNC(ztrti2)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(xtrti2)(char *, char *, int *, double *, int *, int *);
-
-int BLASFUNC(strtri)(char *, char *, int *, float  *, int *, int *);
-int BLASFUNC(dtrtri)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(qtrtri)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(ctrtri)(char *, char *, int *, float  *, int *, int *);
-int BLASFUNC(ztrtri)(char *, char *, int *, double *, int *, int *);
-int BLASFUNC(xtrtri)(char *, char *, int *, double *, int *, int *);
-
-int BLASFUNC(spotri)(char *, int *, float  *, int *, int *);
-int BLASFUNC(dpotri)(char *, int *, double *, int *, int *);
-int BLASFUNC(qpotri)(char *, int *, double *, int *, int *);
-int BLASFUNC(cpotri)(char *, int *, float  *, int *, int *);
-int BLASFUNC(zpotri)(char *, int *, double *, int *, int *);
-int BLASFUNC(xpotri)(char *, int *, double *, int *, int *);
-
-#ifdef __cplusplus
+void BLASFUNC(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *,
+                     const int *, const float *, const int *, const float *, float *, const int *);
+void BLASFUNC(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *,
+                     const int *, const double *, const int *, const double *, double *, const int *);
+void BLASFUNC(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *,
+                     const int *, const float *, const int *, const float *, float *, const int *);
+void BLASFUNC(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *,
+                     const int *, const double *, const int *, const double *, double *, const int *);
+
+void BLASFUNC(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *,
+                     const float *, const int *, float *, const int *);
+void BLASFUNC(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *,
+                     const double *, const int *, double *, const int *);
+void BLASFUNC(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *,
+                     const float *, const int *, float *, const int *);
+void BLASFUNC(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *,
+                     const double *, const int *, double *, const int *);
+
+void BLASFUNC(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *,
+                     const float *, const int *, float *, const int *);
+void BLASFUNC(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *,
+                     const double *, const int *, double *, const int *);
+void BLASFUNC(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *,
+                     const float *, const int *, float *, const int *);
+void BLASFUNC(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *,
+                     const double *, const int *, double *, const int *);
+
+void BLASFUNC(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *,
+                     const float *, const int *, const float *, float *, const int *);
+void BLASFUNC(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *,
+                     const double *, const int *, const double *, double *, const int *);
+
+void BLASFUNC(ssyrk)(const char *, const char *, const int *, const int *, const float *, const float *, const int *,
+                     const float *, float *, const int *);
+void BLASFUNC(dsyrk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *,
+                     const double *, double *, const int *);
+
+void BLASFUNC(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *,
+                     const float *, const int *, const float *, float *, const int *);
+void BLASFUNC(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *,
+                     const double *, const int *, const double *, double *, const int *);
+
+void BLASFUNC(cherk)(const char *, const char *, const int *, const int *, const float *, const float *, const int *,
+                     const float *, float *, const int *);
+void BLASFUNC(zherk)(const char *, const char *, const int *, const int *, const double *, const double *, const int *,
+                     const double *, double *, const int *);
+
+#undef BLASFUNC
 }
-#endif
 
 #endif
diff --git a/inst/include/Eigen/src/misc/lapacke.h b/inst/include/Eigen/src/misc/lapacke.h
new file mode 100644
index 00000000..94afd502
--- /dev/null
+++ b/inst/include/Eigen/src/misc/lapacke.h
@@ -0,0 +1,10085 @@
+/*****************************************************************************
+  Copyright (c) 2010, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+******************************************************************************
+* Contents: Native C interface to LAPACK
+* Author: Intel Corporation
+* Generated November, 2011
+*****************************************************************************/
+
+#ifndef _MKL_LAPACKE_H_
+
+#ifndef _LAPACKE_H_
+#define _LAPACKE_H_
+
+/*
+ *  Turn on HAVE_LAPACK_CONFIG_H to redefine C-LAPACK datatypes
+ */
+#ifdef HAVE_LAPACK_CONFIG_H
+#include "lapacke_config.h"
+#endif
+
+#include <stdlib.h>
+
+#ifndef lapack_int
+#ifdef LAPACK_ILP64
+#define lapack_int int64_t
+#else
+#define lapack_int int
+#endif
+#endif
+
+#ifndef lapack_logical
+#define lapack_logical lapack_int
+#endif
+
+/* Complex types are structures equivalent to the
+ * Fortran complex types COMPLEX(4) and COMPLEX(8).
+ *
+ * One can also redefine the types with his own types
+ * for example by including in the code definitions like
+ *
+ * #define lapack_complex_float std::complex<float>
+ * #define lapack_complex_double std::complex<double>
+ *
+ * or define these types in the command line:
+ *
+ * -Dlapack_complex_float="std::complex<float>"
+ * -Dlapack_complex_double="std::complex<double>"
+ */
+
+#ifndef LAPACK_COMPLEX_CUSTOM
+
+/* Complex type (single precision) */
+#ifndef lapack_complex_float
+#define lapack_complex_float std::complex<float>
+#endif
+
+#ifndef lapack_complex_float_real
+#define lapack_complex_float_real(z) (creal(z))
+#endif
+
+#ifndef lapack_complex_float_imag
+#define lapack_complex_float_imag(z) (cimag(z))
+#endif
+
+lapack_complex_float lapack_make_complex_float(float re, float im);
+
+/* Complex type (double precision) */
+#ifndef lapack_complex_double
+#define lapack_complex_double std::complex<double>
+#endif
+
+#ifndef lapack_complex_double_real
+#define lapack_complex_double_real(z) (creal(z))
+#endif
+
+#ifndef lapack_complex_double_imag
+#define lapack_complex_double_imag(z) (cimag(z))
+#endif
+
+lapack_complex_double lapack_make_complex_double(double re, double im);
+
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#ifndef LAPACKE_malloc
+#define LAPACKE_malloc(size) malloc(size)
+#endif
+#ifndef LAPACKE_free
+#define LAPACKE_free(p) free(p)
+#endif
+
+#define LAPACK_C2INT(x) (lapack_int)(*((float*)&x))
+#define LAPACK_Z2INT(x) (lapack_int)(*((double*)&x))
+
+#define LAPACK_ROW_MAJOR 101
+#define LAPACK_COL_MAJOR 102
+
+#define LAPACK_WORK_MEMORY_ERROR -1010
+#define LAPACK_TRANSPOSE_MEMORY_ERROR -1011
+
+/* Callback logical functions of one, two, or three arguments are used
+ *  to select eigenvalues to sort to the top left of the Schur form.
+ *  The value is selected if function returns TRUE (non-zero). */
+
+typedef lapack_logical (*LAPACK_S_SELECT2)(const float*, const float*);
+typedef lapack_logical (*LAPACK_S_SELECT3)(const float*, const float*, const float*);
+typedef lapack_logical (*LAPACK_D_SELECT2)(const double*, const double*);
+typedef lapack_logical (*LAPACK_D_SELECT3)(const double*, const double*, const double*);
+
+typedef lapack_logical (*LAPACK_C_SELECT1)(const lapack_complex_float*);
+typedef lapack_logical (*LAPACK_C_SELECT2)(const lapack_complex_float*, const lapack_complex_float*);
+typedef lapack_logical (*LAPACK_Z_SELECT1)(const lapack_complex_double*);
+typedef lapack_logical (*LAPACK_Z_SELECT2)(const lapack_complex_double*, const lapack_complex_double*);
+
+#include "lapacke_mangling.h"
+
+#define LAPACK_lsame LAPACK_GLOBAL(lsame, LSAME)
+lapack_logical LAPACK_lsame(char* ca, char* cb, lapack_int lca, lapack_int lcb);
+
+/* C-LAPACK function prototypes */
+
+lapack_int LAPACKE_sbdsdc(int matrix_order, char uplo, char compq, lapack_int n, float* d, float* e, float* u,
+                          lapack_int ldu, float* vt, lapack_int ldvt, float* q, lapack_int* iq);
+lapack_int LAPACKE_dbdsdc(int matrix_order, char uplo, char compq, lapack_int n, double* d, double* e, double* u,
+                          lapack_int ldu, double* vt, lapack_int ldvt, double* q, lapack_int* iq);
+
+lapack_int LAPACKE_sbdsqr(int matrix_order, char uplo, lapack_int n, lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                          float* d, float* e, float* vt, lapack_int ldvt, float* u, lapack_int ldu, float* c,
+                          lapack_int ldc);
+lapack_int LAPACKE_dbdsqr(int matrix_order, char uplo, lapack_int n, lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                          double* d, double* e, double* vt, lapack_int ldvt, double* u, lapack_int ldu, double* c,
+                          lapack_int ldc);
+lapack_int LAPACKE_cbdsqr(int matrix_order, char uplo, lapack_int n, lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                          float* d, float* e, lapack_complex_float* vt, lapack_int ldvt, lapack_complex_float* u,
+                          lapack_int ldu, lapack_complex_float* c, lapack_int ldc);
+lapack_int LAPACKE_zbdsqr(int matrix_order, char uplo, lapack_int n, lapack_int ncvt, lapack_int nru, lapack_int ncc,
+                          double* d, double* e, lapack_complex_double* vt, lapack_int ldvt, lapack_complex_double* u,
+                          lapack_int ldu, lapack_complex_double* c, lapack_int ldc);
+
+lapack_int LAPACKE_sdisna(char job, lapack_int m, lapack_int n, const float* d, float* sep);
+lapack_int LAPACKE_ddisna(char job, lapack_int m, lapack_int n, const double* d, double* sep);
+
+lapack_int LAPACKE_sgbbrd(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int ncc, lapack_int kl,
+                          lapack_int ku, float* ab, lapack_int ldab, float* d, float* e, float* q, lapack_int ldq,
+                          float* pt, lapack_int ldpt, float* c, lapack_int ldc);
+lapack_int LAPACKE_dgbbrd(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int ncc, lapack_int kl,
+                          lapack_int ku, double* ab, lapack_int ldab, double* d, double* e, double* q, lapack_int ldq,
+                          double* pt, lapack_int ldpt, double* c, lapack_int ldc);
+lapack_int LAPACKE_cgbbrd(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int ncc, lapack_int kl,
+                          lapack_int ku, lapack_complex_float* ab, lapack_int ldab, float* d, float* e,
+                          lapack_complex_float* q, lapack_int ldq, lapack_complex_float* pt, lapack_int ldpt,
+                          lapack_complex_float* c, lapack_int ldc);
+lapack_int LAPACKE_zgbbrd(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int ncc, lapack_int kl,
+                          lapack_int ku, lapack_complex_double* ab, lapack_int ldab, double* d, double* e,
+                          lapack_complex_double* q, lapack_int ldq, lapack_complex_double* pt, lapack_int ldpt,
+                          lapack_complex_double* c, lapack_int ldc);
+
+lapack_int LAPACKE_sgbcon(int matrix_order, char norm, lapack_int n, lapack_int kl, lapack_int ku, const float* ab,
+                          lapack_int ldab, const lapack_int* ipiv, float anorm, float* rcond);
+lapack_int LAPACKE_dgbcon(int matrix_order, char norm, lapack_int n, lapack_int kl, lapack_int ku, const double* ab,
+                          lapack_int ldab, const lapack_int* ipiv, double anorm, double* rcond);
+lapack_int LAPACKE_cgbcon(int matrix_order, char norm, lapack_int n, lapack_int kl, lapack_int ku,
+                          const lapack_complex_float* ab, lapack_int ldab, const lapack_int* ipiv, float anorm,
+                          float* rcond);
+lapack_int LAPACKE_zgbcon(int matrix_order, char norm, lapack_int n, lapack_int kl, lapack_int ku,
+                          const lapack_complex_double* ab, lapack_int ldab, const lapack_int* ipiv, double anorm,
+                          double* rcond);
+
+lapack_int LAPACKE_sgbequ(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, const float* ab,
+                          lapack_int ldab, float* r, float* c, float* rowcnd, float* colcnd, float* amax);
+lapack_int LAPACKE_dgbequ(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, const double* ab,
+                          lapack_int ldab, double* r, double* c, double* rowcnd, double* colcnd, double* amax);
+lapack_int LAPACKE_cgbequ(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                          const lapack_complex_float* ab, lapack_int ldab, float* r, float* c, float* rowcnd,
+                          float* colcnd, float* amax);
+lapack_int LAPACKE_zgbequ(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                          const lapack_complex_double* ab, lapack_int ldab, double* r, double* c, double* rowcnd,
+                          double* colcnd, double* amax);
+
+lapack_int LAPACKE_sgbequb(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, const float* ab,
+                           lapack_int ldab, float* r, float* c, float* rowcnd, float* colcnd, float* amax);
+lapack_int LAPACKE_dgbequb(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, const double* ab,
+                           lapack_int ldab, double* r, double* c, double* rowcnd, double* colcnd, double* amax);
+lapack_int LAPACKE_cgbequb(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                           const lapack_complex_float* ab, lapack_int ldab, float* r, float* c, float* rowcnd,
+                           float* colcnd, float* amax);
+lapack_int LAPACKE_zgbequb(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                           const lapack_complex_double* ab, lapack_int ldab, double* r, double* c, double* rowcnd,
+                           double* colcnd, double* amax);
+
+lapack_int LAPACKE_sgbrfs(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
+                          const float* ab, lapack_int ldab, const float* afb, lapack_int ldafb, const lapack_int* ipiv,
+                          const float* b, lapack_int ldb, float* x, lapack_int ldx, float* ferr, float* berr);
+lapack_int LAPACKE_dgbrfs(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
+                          const double* ab, lapack_int ldab, const double* afb, lapack_int ldafb,
+                          const lapack_int* ipiv, const double* b, lapack_int ldb, double* x, lapack_int ldx,
+                          double* ferr, double* berr);
+lapack_int LAPACKE_cgbrfs(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
+                          const lapack_complex_float* ab, lapack_int ldab, const lapack_complex_float* afb,
+                          lapack_int ldafb, const lapack_int* ipiv, const lapack_complex_float* b, lapack_int ldb,
+                          lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr);
+lapack_int LAPACKE_zgbrfs(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
+                          const lapack_complex_double* ab, lapack_int ldab, const lapack_complex_double* afb,
+                          lapack_int ldafb, const lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb,
+                          lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr);
+
+lapack_int LAPACKE_sgbrfsx(int matrix_order, char trans, char equed, lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, const float* ab, lapack_int ldab, const float* afb, lapack_int ldafb,
+                           const lapack_int* ipiv, const float* r, const float* c, const float* b, lapack_int ldb,
+                           float* x, lapack_int ldx, float* rcond, float* berr, lapack_int n_err_bnds,
+                           float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams, float* params);
+lapack_int LAPACKE_dgbrfsx(int matrix_order, char trans, char equed, lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, const double* ab, lapack_int ldab, const double* afb, lapack_int ldafb,
+                           const lapack_int* ipiv, const double* r, const double* c, const double* b, lapack_int ldb,
+                           double* x, lapack_int ldx, double* rcond, double* berr, lapack_int n_err_bnds,
+                           double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams, double* params);
+lapack_int LAPACKE_cgbrfsx(int matrix_order, char trans, char equed, lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, const lapack_complex_float* ab, lapack_int ldab,
+                           const lapack_complex_float* afb, lapack_int ldafb, const lapack_int* ipiv, const float* r,
+                           const float* c, const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
+                           lapack_int ldx, float* rcond, float* berr, lapack_int n_err_bnds, float* err_bnds_norm,
+                           float* err_bnds_comp, lapack_int nparams, float* params);
+lapack_int LAPACKE_zgbrfsx(int matrix_order, char trans, char equed, lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, const lapack_complex_double* ab, lapack_int ldab,
+                           const lapack_complex_double* afb, lapack_int ldafb, const lapack_int* ipiv, const double* r,
+                           const double* c, const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
+                           lapack_int ldx, double* rcond, double* berr, lapack_int n_err_bnds, double* err_bnds_norm,
+                           double* err_bnds_comp, lapack_int nparams, double* params);
+
+lapack_int LAPACKE_sgbsv(int matrix_order, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs, float* ab,
+                         lapack_int ldab, lapack_int* ipiv, float* b, lapack_int ldb);
+lapack_int LAPACKE_dgbsv(int matrix_order, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs, double* ab,
+                         lapack_int ldab, lapack_int* ipiv, double* b, lapack_int ldb);
+lapack_int LAPACKE_cgbsv(int matrix_order, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
+                         lapack_complex_float* ab, lapack_int ldab, lapack_int* ipiv, lapack_complex_float* b,
+                         lapack_int ldb);
+lapack_int LAPACKE_zgbsv(int matrix_order, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
+                         lapack_complex_double* ab, lapack_int ldab, lapack_int* ipiv, lapack_complex_double* b,
+                         lapack_int ldb);
+
+lapack_int LAPACKE_sgbsvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                          lapack_int nrhs, float* ab, lapack_int ldab, float* afb, lapack_int ldafb, lapack_int* ipiv,
+                          char* equed, float* r, float* c, float* b, lapack_int ldb, float* x, lapack_int ldx,
+                          float* rcond, float* ferr, float* berr, float* rpivot);
+lapack_int LAPACKE_dgbsvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                          lapack_int nrhs, double* ab, lapack_int ldab, double* afb, lapack_int ldafb, lapack_int* ipiv,
+                          char* equed, double* r, double* c, double* b, lapack_int ldb, double* x, lapack_int ldx,
+                          double* rcond, double* ferr, double* berr, double* rpivot);
+lapack_int LAPACKE_cgbsvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                          lapack_int nrhs, lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* afb,
+                          lapack_int ldafb, lapack_int* ipiv, char* equed, float* r, float* c, lapack_complex_float* b,
+                          lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* rcond, float* ferr,
+                          float* berr, float* rpivot);
+lapack_int LAPACKE_zgbsvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                          lapack_int nrhs, lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* afb,
+                          lapack_int ldafb, lapack_int* ipiv, char* equed, double* r, double* c,
+                          lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                          double* rcond, double* ferr, double* berr, double* rpivot);
+
+lapack_int LAPACKE_sgbsvxx(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, float* ab, lapack_int ldab, float* afb, lapack_int ldafb, lapack_int* ipiv,
+                           char* equed, float* r, float* c, float* b, lapack_int ldb, float* x, lapack_int ldx,
+                           float* rcond, float* rpvgrw, float* berr, lapack_int n_err_bnds, float* err_bnds_norm,
+                           float* err_bnds_comp, lapack_int nparams, float* params);
+lapack_int LAPACKE_dgbsvxx(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, double* ab, lapack_int ldab, double* afb, lapack_int ldafb,
+                           lapack_int* ipiv, char* equed, double* r, double* c, double* b, lapack_int ldb, double* x,
+                           lapack_int ldx, double* rcond, double* rpvgrw, double* berr, lapack_int n_err_bnds,
+                           double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams, double* params);
+lapack_int LAPACKE_cgbsvxx(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* afb,
+                           lapack_int ldafb, lapack_int* ipiv, char* equed, float* r, float* c, lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* rcond, float* rpvgrw,
+                           float* berr, lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
+                           lapack_int nparams, float* params);
+lapack_int LAPACKE_zgbsvxx(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                           lapack_int nrhs, lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* afb,
+                           lapack_int ldafb, lapack_int* ipiv, char* equed, double* r, double* c,
+                           lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* rpvgrw, double* berr, lapack_int n_err_bnds, double* err_bnds_norm,
+                           double* err_bnds_comp, lapack_int nparams, double* params);
+
+lapack_int LAPACKE_sgbtrf(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, float* ab,
+                          lapack_int ldab, lapack_int* ipiv);
+lapack_int LAPACKE_dgbtrf(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, double* ab,
+                          lapack_int ldab, lapack_int* ipiv);
+lapack_int LAPACKE_cgbtrf(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                          lapack_complex_float* ab, lapack_int ldab, lapack_int* ipiv);
+lapack_int LAPACKE_zgbtrf(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                          lapack_complex_double* ab, lapack_int ldab, lapack_int* ipiv);
+
+lapack_int LAPACKE_sgbtrs(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
+                          const float* ab, lapack_int ldab, const lapack_int* ipiv, float* b, lapack_int ldb);
+lapack_int LAPACKE_dgbtrs(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
+                          const double* ab, lapack_int ldab, const lapack_int* ipiv, double* b, lapack_int ldb);
+lapack_int LAPACKE_cgbtrs(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
+                          const lapack_complex_float* ab, lapack_int ldab, const lapack_int* ipiv,
+                          lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zgbtrs(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
+                          const lapack_complex_double* ab, lapack_int ldab, const lapack_int* ipiv,
+                          lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_sgebak(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
+                          const float* scale, lapack_int m, float* v, lapack_int ldv);
+lapack_int LAPACKE_dgebak(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
+                          const double* scale, lapack_int m, double* v, lapack_int ldv);
+lapack_int LAPACKE_cgebak(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
+                          const float* scale, lapack_int m, lapack_complex_float* v, lapack_int ldv);
+lapack_int LAPACKE_zgebak(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
+                          const double* scale, lapack_int m, lapack_complex_double* v, lapack_int ldv);
+
+lapack_int LAPACKE_sgebal(int matrix_order, char job, lapack_int n, float* a, lapack_int lda, lapack_int* ilo,
+                          lapack_int* ihi, float* scale);
+lapack_int LAPACKE_dgebal(int matrix_order, char job, lapack_int n, double* a, lapack_int lda, lapack_int* ilo,
+                          lapack_int* ihi, double* scale);
+lapack_int LAPACKE_cgebal(int matrix_order, char job, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_int* ilo, lapack_int* ihi, float* scale);
+lapack_int LAPACKE_zgebal(int matrix_order, char job, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          lapack_int* ilo, lapack_int* ihi, double* scale);
+
+lapack_int LAPACKE_sgebrd(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* d, float* e,
+                          float* tauq, float* taup);
+lapack_int LAPACKE_dgebrd(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* d, double* e,
+                          double* tauq, double* taup);
+lapack_int LAPACKE_cgebrd(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          float* d, float* e, lapack_complex_float* tauq, lapack_complex_float* taup);
+lapack_int LAPACKE_zgebrd(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          double* d, double* e, lapack_complex_double* tauq, lapack_complex_double* taup);
+
+lapack_int LAPACKE_sgecon(int matrix_order, char norm, lapack_int n, const float* a, lapack_int lda, float anorm,
+                          float* rcond);
+lapack_int LAPACKE_dgecon(int matrix_order, char norm, lapack_int n, const double* a, lapack_int lda, double anorm,
+                          double* rcond);
+lapack_int LAPACKE_cgecon(int matrix_order, char norm, lapack_int n, const lapack_complex_float* a, lapack_int lda,
+                          float anorm, float* rcond);
+lapack_int LAPACKE_zgecon(int matrix_order, char norm, lapack_int n, const lapack_complex_double* a, lapack_int lda,
+                          double anorm, double* rcond);
+
+lapack_int LAPACKE_sgeequ(int matrix_order, lapack_int m, lapack_int n, const float* a, lapack_int lda, float* r,
+                          float* c, float* rowcnd, float* colcnd, float* amax);
+lapack_int LAPACKE_dgeequ(int matrix_order, lapack_int m, lapack_int n, const double* a, lapack_int lda, double* r,
+                          double* c, double* rowcnd, double* colcnd, double* amax);
+lapack_int LAPACKE_cgeequ(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_float* a, lapack_int lda,
+                          float* r, float* c, float* rowcnd, float* colcnd, float* amax);
+lapack_int LAPACKE_zgeequ(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_double* a, lapack_int lda,
+                          double* r, double* c, double* rowcnd, double* colcnd, double* amax);
+
+lapack_int LAPACKE_sgeequb(int matrix_order, lapack_int m, lapack_int n, const float* a, lapack_int lda, float* r,
+                           float* c, float* rowcnd, float* colcnd, float* amax);
+lapack_int LAPACKE_dgeequb(int matrix_order, lapack_int m, lapack_int n, const double* a, lapack_int lda, double* r,
+                           double* c, double* rowcnd, double* colcnd, double* amax);
+lapack_int LAPACKE_cgeequb(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_float* a, lapack_int lda,
+                           float* r, float* c, float* rowcnd, float* colcnd, float* amax);
+lapack_int LAPACKE_zgeequb(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_double* a, lapack_int lda,
+                           double* r, double* c, double* rowcnd, double* colcnd, double* amax);
+
+lapack_int LAPACKE_sgees(int matrix_order, char jobvs, char sort, LAPACK_S_SELECT2 select, lapack_int n, float* a,
+                         lapack_int lda, lapack_int* sdim, float* wr, float* wi, float* vs, lapack_int ldvs);
+lapack_int LAPACKE_dgees(int matrix_order, char jobvs, char sort, LAPACK_D_SELECT2 select, lapack_int n, double* a,
+                         lapack_int lda, lapack_int* sdim, double* wr, double* wi, double* vs, lapack_int ldvs);
+lapack_int LAPACKE_cgees(int matrix_order, char jobvs, char sort, LAPACK_C_SELECT1 select, lapack_int n,
+                         lapack_complex_float* a, lapack_int lda, lapack_int* sdim, lapack_complex_float* w,
+                         lapack_complex_float* vs, lapack_int ldvs);
+lapack_int LAPACKE_zgees(int matrix_order, char jobvs, char sort, LAPACK_Z_SELECT1 select, lapack_int n,
+                         lapack_complex_double* a, lapack_int lda, lapack_int* sdim, lapack_complex_double* w,
+                         lapack_complex_double* vs, lapack_int ldvs);
+
+lapack_int LAPACKE_sgeesx(int matrix_order, char jobvs, char sort, LAPACK_S_SELECT2 select, char sense, lapack_int n,
+                          float* a, lapack_int lda, lapack_int* sdim, float* wr, float* wi, float* vs, lapack_int ldvs,
+                          float* rconde, float* rcondv);
+lapack_int LAPACKE_dgeesx(int matrix_order, char jobvs, char sort, LAPACK_D_SELECT2 select, char sense, lapack_int n,
+                          double* a, lapack_int lda, lapack_int* sdim, double* wr, double* wi, double* vs,
+                          lapack_int ldvs, double* rconde, double* rcondv);
+lapack_int LAPACKE_cgeesx(int matrix_order, char jobvs, char sort, LAPACK_C_SELECT1 select, char sense, lapack_int n,
+                          lapack_complex_float* a, lapack_int lda, lapack_int* sdim, lapack_complex_float* w,
+                          lapack_complex_float* vs, lapack_int ldvs, float* rconde, float* rcondv);
+lapack_int LAPACKE_zgeesx(int matrix_order, char jobvs, char sort, LAPACK_Z_SELECT1 select, char sense, lapack_int n,
+                          lapack_complex_double* a, lapack_int lda, lapack_int* sdim, lapack_complex_double* w,
+                          lapack_complex_double* vs, lapack_int ldvs, double* rconde, double* rcondv);
+
+lapack_int LAPACKE_sgeev(int matrix_order, char jobvl, char jobvr, lapack_int n, float* a, lapack_int lda, float* wr,
+                         float* wi, float* vl, lapack_int ldvl, float* vr, lapack_int ldvr);
+lapack_int LAPACKE_dgeev(int matrix_order, char jobvl, char jobvr, lapack_int n, double* a, lapack_int lda, double* wr,
+                         double* wi, double* vl, lapack_int ldvl, double* vr, lapack_int ldvr);
+lapack_int LAPACKE_cgeev(int matrix_order, char jobvl, char jobvr, lapack_int n, lapack_complex_float* a,
+                         lapack_int lda, lapack_complex_float* w, lapack_complex_float* vl, lapack_int ldvl,
+                         lapack_complex_float* vr, lapack_int ldvr);
+lapack_int LAPACKE_zgeev(int matrix_order, char jobvl, char jobvr, lapack_int n, lapack_complex_double* a,
+                         lapack_int lda, lapack_complex_double* w, lapack_complex_double* vl, lapack_int ldvl,
+                         lapack_complex_double* vr, lapack_int ldvr);
+
+lapack_int LAPACKE_sgeevx(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n, float* a,
+                          lapack_int lda, float* wr, float* wi, float* vl, lapack_int ldvl, float* vr, lapack_int ldvr,
+                          lapack_int* ilo, lapack_int* ihi, float* scale, float* abnrm, float* rconde, float* rcondv);
+lapack_int LAPACKE_dgeevx(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n, double* a,
+                          lapack_int lda, double* wr, double* wi, double* vl, lapack_int ldvl, double* vr,
+                          lapack_int ldvr, lapack_int* ilo, lapack_int* ihi, double* scale, double* abnrm,
+                          double* rconde, double* rcondv);
+lapack_int LAPACKE_cgeevx(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
+                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* w, lapack_complex_float* vl,
+                          lapack_int ldvl, lapack_complex_float* vr, lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
+                          float* scale, float* abnrm, float* rconde, float* rcondv);
+lapack_int LAPACKE_zgeevx(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
+                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* w, lapack_complex_double* vl,
+                          lapack_int ldvl, lapack_complex_double* vr, lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
+                          double* scale, double* abnrm, double* rconde, double* rcondv);
+
+lapack_int LAPACKE_sgehrd(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, float* a, lapack_int lda,
+                          float* tau);
+lapack_int LAPACKE_dgehrd(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, double* a, lapack_int lda,
+                          double* tau);
+lapack_int LAPACKE_cgehrd(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, lapack_complex_float* a,
+                          lapack_int lda, lapack_complex_float* tau);
+lapack_int LAPACKE_zgehrd(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* tau);
+
+lapack_int LAPACKE_sgejsv(int matrix_order, char joba, char jobu, char jobv, char jobr, char jobt, char jobp,
+                          lapack_int m, lapack_int n, float* a, lapack_int lda, float* sva, float* u, lapack_int ldu,
+                          float* v, lapack_int ldv, float* stat, lapack_int* istat);
+lapack_int LAPACKE_dgejsv(int matrix_order, char joba, char jobu, char jobv, char jobr, char jobt, char jobp,
+                          lapack_int m, lapack_int n, double* a, lapack_int lda, double* sva, double* u, lapack_int ldu,
+                          double* v, lapack_int ldv, double* stat, lapack_int* istat);
+
+lapack_int LAPACKE_sgelq2(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau);
+lapack_int LAPACKE_dgelq2(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau);
+lapack_int LAPACKE_cgelq2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* tau);
+lapack_int LAPACKE_zgelq2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* tau);
+
+lapack_int LAPACKE_sgelqf(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau);
+lapack_int LAPACKE_dgelqf(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau);
+lapack_int LAPACKE_cgelqf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* tau);
+lapack_int LAPACKE_zgelqf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* tau);
+
+lapack_int LAPACKE_sgels(int matrix_order, char trans, lapack_int m, lapack_int n, lapack_int nrhs, float* a,
+                         lapack_int lda, float* b, lapack_int ldb);
+lapack_int LAPACKE_dgels(int matrix_order, char trans, lapack_int m, lapack_int n, lapack_int nrhs, double* a,
+                         lapack_int lda, double* b, lapack_int ldb);
+lapack_int LAPACKE_cgels(int matrix_order, char trans, lapack_int m, lapack_int n, lapack_int nrhs,
+                         lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zgels(int matrix_order, char trans, lapack_int m, lapack_int n, lapack_int nrhs,
+                         lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_sgelsd(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
+                          float* b, lapack_int ldb, float* s, float rcond, lapack_int* rank);
+lapack_int LAPACKE_dgelsd(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
+                          double* b, lapack_int ldb, double* s, double rcond, lapack_int* rank);
+lapack_int LAPACKE_cgelsd(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
+                          lapack_int lda, lapack_complex_float* b, lapack_int ldb, float* s, float rcond,
+                          lapack_int* rank);
+lapack_int LAPACKE_zgelsd(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* b, lapack_int ldb, double* s, double rcond,
+                          lapack_int* rank);
+
+lapack_int LAPACKE_sgelss(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
+                          float* b, lapack_int ldb, float* s, float rcond, lapack_int* rank);
+lapack_int LAPACKE_dgelss(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
+                          double* b, lapack_int ldb, double* s, double rcond, lapack_int* rank);
+lapack_int LAPACKE_cgelss(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
+                          lapack_int lda, lapack_complex_float* b, lapack_int ldb, float* s, float rcond,
+                          lapack_int* rank);
+lapack_int LAPACKE_zgelss(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* b, lapack_int ldb, double* s, double rcond,
+                          lapack_int* rank);
+
+lapack_int LAPACKE_sgelsy(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
+                          float* b, lapack_int ldb, lapack_int* jpvt, float rcond, lapack_int* rank);
+lapack_int LAPACKE_dgelsy(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
+                          double* b, lapack_int ldb, lapack_int* jpvt, double rcond, lapack_int* rank);
+lapack_int LAPACKE_cgelsy(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
+                          lapack_int lda, lapack_complex_float* b, lapack_int ldb, lapack_int* jpvt, float rcond,
+                          lapack_int* rank);
+lapack_int LAPACKE_zgelsy(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_int* jpvt, double rcond,
+                          lapack_int* rank);
+
+lapack_int LAPACKE_sgeqlf(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau);
+lapack_int LAPACKE_dgeqlf(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau);
+lapack_int LAPACKE_cgeqlf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* tau);
+lapack_int LAPACKE_zgeqlf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* tau);
+
+lapack_int LAPACKE_sgeqp3(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, lapack_int* jpvt,
+                          float* tau);
+lapack_int LAPACKE_dgeqp3(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, lapack_int* jpvt,
+                          double* tau);
+lapack_int LAPACKE_cgeqp3(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_int* jpvt, lapack_complex_float* tau);
+lapack_int LAPACKE_zgeqp3(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          lapack_int* jpvt, lapack_complex_double* tau);
+
+lapack_int LAPACKE_sgeqpf(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, lapack_int* jpvt,
+                          float* tau);
+lapack_int LAPACKE_dgeqpf(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, lapack_int* jpvt,
+                          double* tau);
+lapack_int LAPACKE_cgeqpf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_int* jpvt, lapack_complex_float* tau);
+lapack_int LAPACKE_zgeqpf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          lapack_int* jpvt, lapack_complex_double* tau);
+
+lapack_int LAPACKE_sgeqr2(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau);
+lapack_int LAPACKE_dgeqr2(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau);
+lapack_int LAPACKE_cgeqr2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* tau);
+lapack_int LAPACKE_zgeqr2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* tau);
+
+lapack_int LAPACKE_sgeqrf(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau);
+lapack_int LAPACKE_dgeqrf(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau);
+lapack_int LAPACKE_cgeqrf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* tau);
+lapack_int LAPACKE_zgeqrf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* tau);
+
+lapack_int LAPACKE_sgeqrfp(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau);
+lapack_int LAPACKE_dgeqrfp(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau);
+lapack_int LAPACKE_cgeqrfp(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* tau);
+lapack_int LAPACKE_zgeqrfp(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* tau);
+
+lapack_int LAPACKE_sgerfs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const float* a, lapack_int lda,
+                          const float* af, lapack_int ldaf, const lapack_int* ipiv, const float* b, lapack_int ldb,
+                          float* x, lapack_int ldx, float* ferr, float* berr);
+lapack_int LAPACKE_dgerfs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const double* a, lapack_int lda,
+                          const double* af, lapack_int ldaf, const lapack_int* ipiv, const double* b, lapack_int ldb,
+                          double* x, lapack_int ldx, double* ferr, double* berr);
+lapack_int LAPACKE_cgerfs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
+                          lapack_int lda, const lapack_complex_float* af, lapack_int ldaf, const lapack_int* ipiv,
+                          const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                          float* ferr, float* berr);
+lapack_int LAPACKE_zgerfs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
+                          lapack_int lda, const lapack_complex_double* af, lapack_int ldaf, const lapack_int* ipiv,
+                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                          double* ferr, double* berr);
+
+lapack_int LAPACKE_sgerfsx(int matrix_order, char trans, char equed, lapack_int n, lapack_int nrhs, const float* a,
+                           lapack_int lda, const float* af, lapack_int ldaf, const lapack_int* ipiv, const float* r,
+                           const float* c, const float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond,
+                           float* berr, lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
+                           lapack_int nparams, float* params);
+lapack_int LAPACKE_dgerfsx(int matrix_order, char trans, char equed, lapack_int n, lapack_int nrhs, const double* a,
+                           lapack_int lda, const double* af, lapack_int ldaf, const lapack_int* ipiv, const double* r,
+                           const double* c, const double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond,
+                           double* berr, lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
+                           lapack_int nparams, double* params);
+lapack_int LAPACKE_cgerfsx(int matrix_order, char trans, char equed, lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
+                           lapack_int ldaf, const lapack_int* ipiv, const float* r, const float* c,
+                           const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                           float* rcond, float* berr, lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
+                           lapack_int nparams, float* params);
+lapack_int LAPACKE_zgerfsx(int matrix_order, char trans, char equed, lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
+                           lapack_int ldaf, const lapack_int* ipiv, const double* r, const double* c,
+                           const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                           double* rcond, double* berr, lapack_int n_err_bnds, double* err_bnds_norm,
+                           double* err_bnds_comp, lapack_int nparams, double* params);
+
+lapack_int LAPACKE_sgerqf(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau);
+lapack_int LAPACKE_dgerqf(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau);
+lapack_int LAPACKE_cgerqf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* tau);
+lapack_int LAPACKE_zgerqf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* tau);
+
+lapack_int LAPACKE_sgesdd(int matrix_order, char jobz, lapack_int m, lapack_int n, float* a, lapack_int lda, float* s,
+                          float* u, lapack_int ldu, float* vt, lapack_int ldvt);
+lapack_int LAPACKE_dgesdd(int matrix_order, char jobz, lapack_int m, lapack_int n, double* a, lapack_int lda, double* s,
+                          double* u, lapack_int ldu, double* vt, lapack_int ldvt);
+lapack_int LAPACKE_cgesdd(int matrix_order, char jobz, lapack_int m, lapack_int n, lapack_complex_float* a,
+                          lapack_int lda, float* s, lapack_complex_float* u, lapack_int ldu, lapack_complex_float* vt,
+                          lapack_int ldvt);
+lapack_int LAPACKE_zgesdd(int matrix_order, char jobz, lapack_int m, lapack_int n, lapack_complex_double* a,
+                          lapack_int lda, double* s, lapack_complex_double* u, lapack_int ldu,
+                          lapack_complex_double* vt, lapack_int ldvt);
+
+lapack_int LAPACKE_sgesv(int matrix_order, lapack_int n, lapack_int nrhs, float* a, lapack_int lda, lapack_int* ipiv,
+                         float* b, lapack_int ldb);
+lapack_int LAPACKE_dgesv(int matrix_order, lapack_int n, lapack_int nrhs, double* a, lapack_int lda, lapack_int* ipiv,
+                         double* b, lapack_int ldb);
+lapack_int LAPACKE_cgesv(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_float* a, lapack_int lda,
+                         lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zgesv(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_double* a, lapack_int lda,
+                         lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
+lapack_int LAPACKE_dsgesv(int matrix_order, lapack_int n, lapack_int nrhs, double* a, lapack_int lda, lapack_int* ipiv,
+                          double* b, lapack_int ldb, double* x, lapack_int ldx, lapack_int* iter);
+lapack_int LAPACKE_zcgesv(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_double* a, lapack_int lda,
+                          lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
+                          lapack_int ldx, lapack_int* iter);
+
+lapack_int LAPACKE_sgesvd(int matrix_order, char jobu, char jobvt, lapack_int m, lapack_int n, float* a, lapack_int lda,
+                          float* s, float* u, lapack_int ldu, float* vt, lapack_int ldvt, float* superb);
+lapack_int LAPACKE_dgesvd(int matrix_order, char jobu, char jobvt, lapack_int m, lapack_int n, double* a,
+                          lapack_int lda, double* s, double* u, lapack_int ldu, double* vt, lapack_int ldvt,
+                          double* superb);
+lapack_int LAPACKE_cgesvd(int matrix_order, char jobu, char jobvt, lapack_int m, lapack_int n, lapack_complex_float* a,
+                          lapack_int lda, float* s, lapack_complex_float* u, lapack_int ldu, lapack_complex_float* vt,
+                          lapack_int ldvt, float* superb);
+lapack_int LAPACKE_zgesvd(int matrix_order, char jobu, char jobvt, lapack_int m, lapack_int n, lapack_complex_double* a,
+                          lapack_int lda, double* s, lapack_complex_double* u, lapack_int ldu,
+                          lapack_complex_double* vt, lapack_int ldvt, double* superb);
+
+lapack_int LAPACKE_sgesvj(int matrix_order, char joba, char jobu, char jobv, lapack_int m, lapack_int n, float* a,
+                          lapack_int lda, float* sva, lapack_int mv, float* v, lapack_int ldv, float* stat);
+lapack_int LAPACKE_dgesvj(int matrix_order, char joba, char jobu, char jobv, lapack_int m, lapack_int n, double* a,
+                          lapack_int lda, double* sva, lapack_int mv, double* v, lapack_int ldv, double* stat);
+
+lapack_int LAPACKE_sgesvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, float* a,
+                          lapack_int lda, float* af, lapack_int ldaf, lapack_int* ipiv, char* equed, float* r, float* c,
+                          float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* ferr, float* berr,
+                          float* rpivot);
+lapack_int LAPACKE_dgesvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, double* a,
+                          lapack_int lda, double* af, lapack_int ldaf, lapack_int* ipiv, char* equed, double* r,
+                          double* c, double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* ferr,
+                          double* berr, double* rpivot);
+lapack_int LAPACKE_cgesvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
+                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
+                          lapack_int* ipiv, char* equed, float* r, float* c, lapack_complex_float* b, lapack_int ldb,
+                          lapack_complex_float* x, lapack_int ldx, float* rcond, float* ferr, float* berr,
+                          float* rpivot);
+lapack_int LAPACKE_zgesvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
+                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
+                          lapack_int* ipiv, char* equed, double* r, double* c, lapack_complex_double* b, lapack_int ldb,
+                          lapack_complex_double* x, lapack_int ldx, double* rcond, double* ferr, double* berr,
+                          double* rpivot);
+
+lapack_int LAPACKE_sgesvxx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, float* a,
+                           lapack_int lda, float* af, lapack_int ldaf, lapack_int* ipiv, char* equed, float* r,
+                           float* c, float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* rpvgrw,
+                           float* berr, lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
+                           lapack_int nparams, float* params);
+lapack_int LAPACKE_dgesvxx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, double* a,
+                           lapack_int lda, double* af, lapack_int ldaf, lapack_int* ipiv, char* equed, double* r,
+                           double* c, double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond,
+                           double* rpvgrw, double* berr, lapack_int n_err_bnds, double* err_bnds_norm,
+                           double* err_bnds_comp, lapack_int nparams, double* params);
+lapack_int LAPACKE_cgesvxx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
+                           lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
+                           lapack_int* ipiv, char* equed, float* r, float* c, lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* rcond, float* rpvgrw, float* berr,
+                           lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
+                           float* params);
+lapack_int LAPACKE_zgesvxx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
+                           lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
+                           lapack_int* ipiv, char* equed, double* r, double* c, lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* x, lapack_int ldx, double* rcond, double* rpvgrw,
+                           double* berr, lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
+                           lapack_int nparams, double* params);
+
+lapack_int LAPACKE_sgetf2(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, lapack_int* ipiv);
+lapack_int LAPACKE_dgetf2(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, lapack_int* ipiv);
+lapack_int LAPACKE_cgetf2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_int* ipiv);
+lapack_int LAPACKE_zgetf2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          lapack_int* ipiv);
+
+lapack_int LAPACKE_sgetrf(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, lapack_int* ipiv);
+lapack_int LAPACKE_dgetrf(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, lapack_int* ipiv);
+lapack_int LAPACKE_cgetrf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_int* ipiv);
+lapack_int LAPACKE_zgetrf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          lapack_int* ipiv);
+
+lapack_int LAPACKE_sgetri(int matrix_order, lapack_int n, float* a, lapack_int lda, const lapack_int* ipiv);
+lapack_int LAPACKE_dgetri(int matrix_order, lapack_int n, double* a, lapack_int lda, const lapack_int* ipiv);
+lapack_int LAPACKE_cgetri(int matrix_order, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          const lapack_int* ipiv);
+lapack_int LAPACKE_zgetri(int matrix_order, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          const lapack_int* ipiv);
+
+lapack_int LAPACKE_sgetrs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const float* a, lapack_int lda,
+                          const lapack_int* ipiv, float* b, lapack_int ldb);
+lapack_int LAPACKE_dgetrs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const double* a, lapack_int lda,
+                          const lapack_int* ipiv, double* b, lapack_int ldb);
+lapack_int LAPACKE_cgetrs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
+                          lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zgetrs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
+                          lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_sggbak(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
+                          const float* lscale, const float* rscale, lapack_int m, float* v, lapack_int ldv);
+lapack_int LAPACKE_dggbak(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
+                          const double* lscale, const double* rscale, lapack_int m, double* v, lapack_int ldv);
+lapack_int LAPACKE_cggbak(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
+                          const float* lscale, const float* rscale, lapack_int m, lapack_complex_float* v,
+                          lapack_int ldv);
+lapack_int LAPACKE_zggbak(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
+                          const double* lscale, const double* rscale, lapack_int m, lapack_complex_double* v,
+                          lapack_int ldv);
+
+lapack_int LAPACKE_sggbal(int matrix_order, char job, lapack_int n, float* a, lapack_int lda, float* b, lapack_int ldb,
+                          lapack_int* ilo, lapack_int* ihi, float* lscale, float* rscale);
+lapack_int LAPACKE_dggbal(int matrix_order, char job, lapack_int n, double* a, lapack_int lda, double* b,
+                          lapack_int ldb, lapack_int* ilo, lapack_int* ihi, double* lscale, double* rscale);
+lapack_int LAPACKE_cggbal(int matrix_order, char job, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* b, lapack_int ldb, lapack_int* ilo, lapack_int* ihi, float* lscale,
+                          float* rscale);
+lapack_int LAPACKE_zggbal(int matrix_order, char job, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* b, lapack_int ldb, lapack_int* ilo, lapack_int* ihi, double* lscale,
+                          double* rscale);
+
+lapack_int LAPACKE_sgges(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_S_SELECT3 selctg, lapack_int n,
+                         float* a, lapack_int lda, float* b, lapack_int ldb, lapack_int* sdim, float* alphar,
+                         float* alphai, float* beta, float* vsl, lapack_int ldvsl, float* vsr, lapack_int ldvsr);
+lapack_int LAPACKE_dgges(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_D_SELECT3 selctg, lapack_int n,
+                         double* a, lapack_int lda, double* b, lapack_int ldb, lapack_int* sdim, double* alphar,
+                         double* alphai, double* beta, double* vsl, lapack_int ldvsl, double* vsr, lapack_int ldvsr);
+lapack_int LAPACKE_cgges(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_C_SELECT2 selctg, lapack_int n,
+                         lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
+                         lapack_int* sdim, lapack_complex_float* alpha, lapack_complex_float* beta,
+                         lapack_complex_float* vsl, lapack_int ldvsl, lapack_complex_float* vsr, lapack_int ldvsr);
+lapack_int LAPACKE_zgges(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_Z_SELECT2 selctg, lapack_int n,
+                         lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
+                         lapack_int* sdim, lapack_complex_double* alpha, lapack_complex_double* beta,
+                         lapack_complex_double* vsl, lapack_int ldvsl, lapack_complex_double* vsr, lapack_int ldvsr);
+
+lapack_int LAPACKE_sggesx(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_S_SELECT3 selctg, char sense,
+                          lapack_int n, float* a, lapack_int lda, float* b, lapack_int ldb, lapack_int* sdim,
+                          float* alphar, float* alphai, float* beta, float* vsl, lapack_int ldvsl, float* vsr,
+                          lapack_int ldvsr, float* rconde, float* rcondv);
+lapack_int LAPACKE_dggesx(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_D_SELECT3 selctg, char sense,
+                          lapack_int n, double* a, lapack_int lda, double* b, lapack_int ldb, lapack_int* sdim,
+                          double* alphar, double* alphai, double* beta, double* vsl, lapack_int ldvsl, double* vsr,
+                          lapack_int ldvsr, double* rconde, double* rcondv);
+lapack_int LAPACKE_cggesx(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_C_SELECT2 selctg, char sense,
+                          lapack_int n, lapack_complex_float* a, lapack_int lda, lapack_complex_float* b,
+                          lapack_int ldb, lapack_int* sdim, lapack_complex_float* alpha, lapack_complex_float* beta,
+                          lapack_complex_float* vsl, lapack_int ldvsl, lapack_complex_float* vsr, lapack_int ldvsr,
+                          float* rconde, float* rcondv);
+lapack_int LAPACKE_zggesx(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_Z_SELECT2 selctg, char sense,
+                          lapack_int n, lapack_complex_double* a, lapack_int lda, lapack_complex_double* b,
+                          lapack_int ldb, lapack_int* sdim, lapack_complex_double* alpha, lapack_complex_double* beta,
+                          lapack_complex_double* vsl, lapack_int ldvsl, lapack_complex_double* vsr, lapack_int ldvsr,
+                          double* rconde, double* rcondv);
+
+lapack_int LAPACKE_sggev(int matrix_order, char jobvl, char jobvr, lapack_int n, float* a, lapack_int lda, float* b,
+                         lapack_int ldb, float* alphar, float* alphai, float* beta, float* vl, lapack_int ldvl,
+                         float* vr, lapack_int ldvr);
+lapack_int LAPACKE_dggev(int matrix_order, char jobvl, char jobvr, lapack_int n, double* a, lapack_int lda, double* b,
+                         lapack_int ldb, double* alphar, double* alphai, double* beta, double* vl, lapack_int ldvl,
+                         double* vr, lapack_int ldvr);
+lapack_int LAPACKE_cggev(int matrix_order, char jobvl, char jobvr, lapack_int n, lapack_complex_float* a,
+                         lapack_int lda, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* alpha,
+                         lapack_complex_float* beta, lapack_complex_float* vl, lapack_int ldvl,
+                         lapack_complex_float* vr, lapack_int ldvr);
+lapack_int LAPACKE_zggev(int matrix_order, char jobvl, char jobvr, lapack_int n, lapack_complex_double* a,
+                         lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* alpha,
+                         lapack_complex_double* beta, lapack_complex_double* vl, lapack_int ldvl,
+                         lapack_complex_double* vr, lapack_int ldvr);
+
+lapack_int LAPACKE_sggevx(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n, float* a,
+                          lapack_int lda, float* b, lapack_int ldb, float* alphar, float* alphai, float* beta,
+                          float* vl, lapack_int ldvl, float* vr, lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
+                          float* lscale, float* rscale, float* abnrm, float* bbnrm, float* rconde, float* rcondv);
+lapack_int LAPACKE_dggevx(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n, double* a,
+                          lapack_int lda, double* b, lapack_int ldb, double* alphar, double* alphai, double* beta,
+                          double* vl, lapack_int ldvl, double* vr, lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
+                          double* lscale, double* rscale, double* abnrm, double* bbnrm, double* rconde, double* rcondv);
+lapack_int LAPACKE_cggevx(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
+                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
+                          lapack_complex_float* alpha, lapack_complex_float* beta, lapack_complex_float* vl,
+                          lapack_int ldvl, lapack_complex_float* vr, lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
+                          float* lscale, float* rscale, float* abnrm, float* bbnrm, float* rconde, float* rcondv);
+lapack_int LAPACKE_zggevx(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
+                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
+                          lapack_complex_double* alpha, lapack_complex_double* beta, lapack_complex_double* vl,
+                          lapack_int ldvl, lapack_complex_double* vr, lapack_int ldvr, lapack_int* ilo, lapack_int* ihi,
+                          double* lscale, double* rscale, double* abnrm, double* bbnrm, double* rconde, double* rcondv);
+
+lapack_int LAPACKE_sggglm(int matrix_order, lapack_int n, lapack_int m, lapack_int p, float* a, lapack_int lda,
+                          float* b, lapack_int ldb, float* d, float* x, float* y);
+lapack_int LAPACKE_dggglm(int matrix_order, lapack_int n, lapack_int m, lapack_int p, double* a, lapack_int lda,
+                          double* b, lapack_int ldb, double* d, double* x, double* y);
+lapack_int LAPACKE_cggglm(int matrix_order, lapack_int n, lapack_int m, lapack_int p, lapack_complex_float* a,
+                          lapack_int lda, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* d,
+                          lapack_complex_float* x, lapack_complex_float* y);
+lapack_int LAPACKE_zggglm(int matrix_order, lapack_int n, lapack_int m, lapack_int p, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* d,
+                          lapack_complex_double* x, lapack_complex_double* y);
+
+lapack_int LAPACKE_sgghrd(int matrix_order, char compq, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
+                          float* a, lapack_int lda, float* b, lapack_int ldb, float* q, lapack_int ldq, float* z,
+                          lapack_int ldz);
+lapack_int LAPACKE_dgghrd(int matrix_order, char compq, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
+                          double* a, lapack_int lda, double* b, lapack_int ldb, double* q, lapack_int ldq, double* z,
+                          lapack_int ldz);
+lapack_int LAPACKE_cgghrd(int matrix_order, char compq, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
+                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
+                          lapack_complex_float* q, lapack_int ldq, lapack_complex_float* z, lapack_int ldz);
+lapack_int LAPACKE_zgghrd(int matrix_order, char compq, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
+                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
+                          lapack_complex_double* q, lapack_int ldq, lapack_complex_double* z, lapack_int ldz);
+
+lapack_int LAPACKE_sgglse(int matrix_order, lapack_int m, lapack_int n, lapack_int p, float* a, lapack_int lda,
+                          float* b, lapack_int ldb, float* c, float* d, float* x);
+lapack_int LAPACKE_dgglse(int matrix_order, lapack_int m, lapack_int n, lapack_int p, double* a, lapack_int lda,
+                          double* b, lapack_int ldb, double* c, double* d, double* x);
+lapack_int LAPACKE_cgglse(int matrix_order, lapack_int m, lapack_int n, lapack_int p, lapack_complex_float* a,
+                          lapack_int lda, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* c,
+                          lapack_complex_float* d, lapack_complex_float* x);
+lapack_int LAPACKE_zgglse(int matrix_order, lapack_int m, lapack_int n, lapack_int p, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* c,
+                          lapack_complex_double* d, lapack_complex_double* x);
+
+lapack_int LAPACKE_sggqrf(int matrix_order, lapack_int n, lapack_int m, lapack_int p, float* a, lapack_int lda,
+                          float* taua, float* b, lapack_int ldb, float* taub);
+lapack_int LAPACKE_dggqrf(int matrix_order, lapack_int n, lapack_int m, lapack_int p, double* a, lapack_int lda,
+                          double* taua, double* b, lapack_int ldb, double* taub);
+lapack_int LAPACKE_cggqrf(int matrix_order, lapack_int n, lapack_int m, lapack_int p, lapack_complex_float* a,
+                          lapack_int lda, lapack_complex_float* taua, lapack_complex_float* b, lapack_int ldb,
+                          lapack_complex_float* taub);
+lapack_int LAPACKE_zggqrf(int matrix_order, lapack_int n, lapack_int m, lapack_int p, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* taua, lapack_complex_double* b, lapack_int ldb,
+                          lapack_complex_double* taub);
+
+lapack_int LAPACKE_sggrqf(int matrix_order, lapack_int m, lapack_int p, lapack_int n, float* a, lapack_int lda,
+                          float* taua, float* b, lapack_int ldb, float* taub);
+lapack_int LAPACKE_dggrqf(int matrix_order, lapack_int m, lapack_int p, lapack_int n, double* a, lapack_int lda,
+                          double* taua, double* b, lapack_int ldb, double* taub);
+lapack_int LAPACKE_cggrqf(int matrix_order, lapack_int m, lapack_int p, lapack_int n, lapack_complex_float* a,
+                          lapack_int lda, lapack_complex_float* taua, lapack_complex_float* b, lapack_int ldb,
+                          lapack_complex_float* taub);
+lapack_int LAPACKE_zggrqf(int matrix_order, lapack_int m, lapack_int p, lapack_int n, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* taua, lapack_complex_double* b, lapack_int ldb,
+                          lapack_complex_double* taub);
+
+lapack_int LAPACKE_sggsvd(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int n, lapack_int p,
+                          lapack_int* k, lapack_int* l, float* a, lapack_int lda, float* b, lapack_int ldb,
+                          float* alpha, float* beta, float* u, lapack_int ldu, float* v, lapack_int ldv, float* q,
+                          lapack_int ldq, lapack_int* iwork);
+lapack_int LAPACKE_dggsvd(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int n, lapack_int p,
+                          lapack_int* k, lapack_int* l, double* a, lapack_int lda, double* b, lapack_int ldb,
+                          double* alpha, double* beta, double* u, lapack_int ldu, double* v, lapack_int ldv, double* q,
+                          lapack_int ldq, lapack_int* iwork);
+lapack_int LAPACKE_cggsvd(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int n, lapack_int p,
+                          lapack_int* k, lapack_int* l, lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* b, lapack_int ldb, float* alpha, float* beta, lapack_complex_float* u,
+                          lapack_int ldu, lapack_complex_float* v, lapack_int ldv, lapack_complex_float* q,
+                          lapack_int ldq, lapack_int* iwork);
+lapack_int LAPACKE_zggsvd(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int n, lapack_int p,
+                          lapack_int* k, lapack_int* l, lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* b, lapack_int ldb, double* alpha, double* beta,
+                          lapack_complex_double* u, lapack_int ldu, lapack_complex_double* v, lapack_int ldv,
+                          lapack_complex_double* q, lapack_int ldq, lapack_int* iwork);
+
+lapack_int LAPACKE_sggsvp(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p, lapack_int n,
+                          float* a, lapack_int lda, float* b, lapack_int ldb, float tola, float tolb, lapack_int* k,
+                          lapack_int* l, float* u, lapack_int ldu, float* v, lapack_int ldv, float* q, lapack_int ldq);
+lapack_int LAPACKE_dggsvp(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p, lapack_int n,
+                          double* a, lapack_int lda, double* b, lapack_int ldb, double tola, double tolb, lapack_int* k,
+                          lapack_int* l, double* u, lapack_int ldu, double* v, lapack_int ldv, double* q,
+                          lapack_int ldq);
+lapack_int LAPACKE_cggsvp(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p, lapack_int n,
+                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb, float tola,
+                          float tolb, lapack_int* k, lapack_int* l, lapack_complex_float* u, lapack_int ldu,
+                          lapack_complex_float* v, lapack_int ldv, lapack_complex_float* q, lapack_int ldq);
+lapack_int LAPACKE_zggsvp(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p, lapack_int n,
+                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
+                          double tola, double tolb, lapack_int* k, lapack_int* l, lapack_complex_double* u,
+                          lapack_int ldu, lapack_complex_double* v, lapack_int ldv, lapack_complex_double* q,
+                          lapack_int ldq);
+
+lapack_int LAPACKE_sgtcon(char norm, lapack_int n, const float* dl, const float* d, const float* du, const float* du2,
+                          const lapack_int* ipiv, float anorm, float* rcond);
+lapack_int LAPACKE_dgtcon(char norm, lapack_int n, const double* dl, const double* d, const double* du,
+                          const double* du2, const lapack_int* ipiv, double anorm, double* rcond);
+lapack_int LAPACKE_cgtcon(char norm, lapack_int n, const lapack_complex_float* dl, const lapack_complex_float* d,
+                          const lapack_complex_float* du, const lapack_complex_float* du2, const lapack_int* ipiv,
+                          float anorm, float* rcond);
+lapack_int LAPACKE_zgtcon(char norm, lapack_int n, const lapack_complex_double* dl, const lapack_complex_double* d,
+                          const lapack_complex_double* du, const lapack_complex_double* du2, const lapack_int* ipiv,
+                          double anorm, double* rcond);
+
+lapack_int LAPACKE_sgtrfs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const float* dl, const float* d,
+                          const float* du, const float* dlf, const float* df, const float* duf, const float* du2,
+                          const lapack_int* ipiv, const float* b, lapack_int ldb, float* x, lapack_int ldx, float* ferr,
+                          float* berr);
+lapack_int LAPACKE_dgtrfs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const double* dl,
+                          const double* d, const double* du, const double* dlf, const double* df, const double* duf,
+                          const double* du2, const lapack_int* ipiv, const double* b, lapack_int ldb, double* x,
+                          lapack_int ldx, double* ferr, double* berr);
+lapack_int LAPACKE_cgtrfs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const lapack_complex_float* dl,
+                          const lapack_complex_float* d, const lapack_complex_float* du,
+                          const lapack_complex_float* dlf, const lapack_complex_float* df,
+                          const lapack_complex_float* duf, const lapack_complex_float* du2, const lapack_int* ipiv,
+                          const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                          float* ferr, float* berr);
+lapack_int LAPACKE_zgtrfs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const lapack_complex_double* dl,
+                          const lapack_complex_double* d, const lapack_complex_double* du,
+                          const lapack_complex_double* dlf, const lapack_complex_double* df,
+                          const lapack_complex_double* duf, const lapack_complex_double* du2, const lapack_int* ipiv,
+                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                          double* ferr, double* berr);
+
+lapack_int LAPACKE_sgtsv(int matrix_order, lapack_int n, lapack_int nrhs, float* dl, float* d, float* du, float* b,
+                         lapack_int ldb);
+lapack_int LAPACKE_dgtsv(int matrix_order, lapack_int n, lapack_int nrhs, double* dl, double* d, double* du, double* b,
+                         lapack_int ldb);
+lapack_int LAPACKE_cgtsv(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_float* dl,
+                         lapack_complex_float* d, lapack_complex_float* du, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zgtsv(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_double* dl,
+                         lapack_complex_double* d, lapack_complex_double* du, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_sgtsvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, const float* dl,
+                          const float* d, const float* du, float* dlf, float* df, float* duf, float* du2,
+                          lapack_int* ipiv, const float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond,
+                          float* ferr, float* berr);
+lapack_int LAPACKE_dgtsvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, const double* dl,
+                          const double* d, const double* du, double* dlf, double* df, double* duf, double* du2,
+                          lapack_int* ipiv, const double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond,
+                          double* ferr, double* berr);
+lapack_int LAPACKE_cgtsvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_float* dl, const lapack_complex_float* d, const lapack_complex_float* du,
+                          lapack_complex_float* dlf, lapack_complex_float* df, lapack_complex_float* duf,
+                          lapack_complex_float* du2, lapack_int* ipiv, const lapack_complex_float* b, lapack_int ldb,
+                          lapack_complex_float* x, lapack_int ldx, float* rcond, float* ferr, float* berr);
+lapack_int LAPACKE_zgtsvx(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_double* dl, const lapack_complex_double* d,
+                          const lapack_complex_double* du, lapack_complex_double* dlf, lapack_complex_double* df,
+                          lapack_complex_double* duf, lapack_complex_double* du2, lapack_int* ipiv,
+                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                          double* rcond, double* ferr, double* berr);
+
+lapack_int LAPACKE_sgttrf(lapack_int n, float* dl, float* d, float* du, float* du2, lapack_int* ipiv);
+lapack_int LAPACKE_dgttrf(lapack_int n, double* dl, double* d, double* du, double* du2, lapack_int* ipiv);
+lapack_int LAPACKE_cgttrf(lapack_int n, lapack_complex_float* dl, lapack_complex_float* d, lapack_complex_float* du,
+                          lapack_complex_float* du2, lapack_int* ipiv);
+lapack_int LAPACKE_zgttrf(lapack_int n, lapack_complex_double* dl, lapack_complex_double* d, lapack_complex_double* du,
+                          lapack_complex_double* du2, lapack_int* ipiv);
+
+lapack_int LAPACKE_sgttrs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const float* dl, const float* d,
+                          const float* du, const float* du2, const lapack_int* ipiv, float* b, lapack_int ldb);
+lapack_int LAPACKE_dgttrs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const double* dl,
+                          const double* d, const double* du, const double* du2, const lapack_int* ipiv, double* b,
+                          lapack_int ldb);
+lapack_int LAPACKE_cgttrs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const lapack_complex_float* dl,
+                          const lapack_complex_float* d, const lapack_complex_float* du,
+                          const lapack_complex_float* du2, const lapack_int* ipiv, lapack_complex_float* b,
+                          lapack_int ldb);
+lapack_int LAPACKE_zgttrs(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const lapack_complex_double* dl,
+                          const lapack_complex_double* d, const lapack_complex_double* du,
+                          const lapack_complex_double* du2, const lapack_int* ipiv, lapack_complex_double* b,
+                          lapack_int ldb);
+
+lapack_int LAPACKE_chbev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, lapack_complex_float* ab,
+                         lapack_int ldab, float* w, lapack_complex_float* z, lapack_int ldz);
+lapack_int LAPACKE_zhbev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, lapack_complex_double* ab,
+                         lapack_int ldab, double* w, lapack_complex_double* z, lapack_int ldz);
+
+lapack_int LAPACKE_chbevd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, lapack_complex_float* ab,
+                          lapack_int ldab, float* w, lapack_complex_float* z, lapack_int ldz);
+lapack_int LAPACKE_zhbevd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd,
+                          lapack_complex_double* ab, lapack_int ldab, double* w, lapack_complex_double* z,
+                          lapack_int ldz);
+
+lapack_int LAPACKE_chbevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int kd,
+                          lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* q, lapack_int ldq, float vl,
+                          float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w,
+                          lapack_complex_float* z, lapack_int ldz, lapack_int* ifail);
+lapack_int LAPACKE_zhbevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int kd,
+                          lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* q, lapack_int ldq,
+                          double vl, double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
+                          lapack_complex_double* z, lapack_int ldz, lapack_int* ifail);
+
+lapack_int LAPACKE_chbgst(int matrix_order, char vect, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                          lapack_complex_float* ab, lapack_int ldab, const lapack_complex_float* bb, lapack_int ldbb,
+                          lapack_complex_float* x, lapack_int ldx);
+lapack_int LAPACKE_zhbgst(int matrix_order, char vect, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                          lapack_complex_double* ab, lapack_int ldab, const lapack_complex_double* bb, lapack_int ldbb,
+                          lapack_complex_double* x, lapack_int ldx);
+
+lapack_int LAPACKE_chbgv(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                         lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* bb, lapack_int ldbb, float* w,
+                         lapack_complex_float* z, lapack_int ldz);
+lapack_int LAPACKE_zhbgv(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                         lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* bb, lapack_int ldbb,
+                         double* w, lapack_complex_double* z, lapack_int ldz);
+
+lapack_int LAPACKE_chbgvd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                          lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* bb, lapack_int ldbb,
+                          float* w, lapack_complex_float* z, lapack_int ldz);
+lapack_int LAPACKE_zhbgvd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                          lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* bb, lapack_int ldbb,
+                          double* w, lapack_complex_double* z, lapack_int ldz);
+
+lapack_int LAPACKE_chbgvx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int ka,
+                          lapack_int kb, lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* bb,
+                          lapack_int ldbb, lapack_complex_float* q, lapack_int ldq, float vl, float vu, lapack_int il,
+                          lapack_int iu, float abstol, lapack_int* m, float* w, lapack_complex_float* z, lapack_int ldz,
+                          lapack_int* ifail);
+lapack_int LAPACKE_zhbgvx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int ka,
+                          lapack_int kb, lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* bb,
+                          lapack_int ldbb, lapack_complex_double* q, lapack_int ldq, double vl, double vu,
+                          lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
+                          lapack_complex_double* z, lapack_int ldz, lapack_int* ifail);
+
+lapack_int LAPACKE_chbtrd(int matrix_order, char vect, char uplo, lapack_int n, lapack_int kd, lapack_complex_float* ab,
+                          lapack_int ldab, float* d, float* e, lapack_complex_float* q, lapack_int ldq);
+lapack_int LAPACKE_zhbtrd(int matrix_order, char vect, char uplo, lapack_int n, lapack_int kd,
+                          lapack_complex_double* ab, lapack_int ldab, double* d, double* e, lapack_complex_double* q,
+                          lapack_int ldq);
+
+lapack_int LAPACKE_checon(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
+                          const lapack_int* ipiv, float anorm, float* rcond);
+lapack_int LAPACKE_zhecon(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a, lapack_int lda,
+                          const lapack_int* ipiv, double anorm, double* rcond);
+
+lapack_int LAPACKE_cheequb(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
+                           float* s, float* scond, float* amax);
+lapack_int LAPACKE_zheequb(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a, lapack_int lda,
+                           double* s, double* scond, double* amax);
+
+lapack_int LAPACKE_cheev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                         float* w);
+lapack_int LAPACKE_zheev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                         double* w);
+
+lapack_int LAPACKE_cheevd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          float* w);
+lapack_int LAPACKE_zheevd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_double* a,
+                          lapack_int lda, double* w);
+
+lapack_int LAPACKE_cheevr(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_complex_float* a,
+                          lapack_int lda, float vl, float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m,
+                          float* w, lapack_complex_float* z, lapack_int ldz, lapack_int* isuppz);
+lapack_int LAPACKE_zheevr(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_complex_double* a,
+                          lapack_int lda, double vl, double vu, lapack_int il, lapack_int iu, double abstol,
+                          lapack_int* m, double* w, lapack_complex_double* z, lapack_int ldz, lapack_int* isuppz);
+
+lapack_int LAPACKE_cheevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_complex_float* a,
+                          lapack_int lda, float vl, float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m,
+                          float* w, lapack_complex_float* z, lapack_int ldz, lapack_int* ifail);
+lapack_int LAPACKE_zheevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_complex_double* a,
+                          lapack_int lda, double vl, double vu, lapack_int il, lapack_int iu, double abstol,
+                          lapack_int* m, double* w, lapack_complex_double* z, lapack_int ldz, lapack_int* ifail);
+
+lapack_int LAPACKE_chegst(int matrix_order, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a,
+                          lapack_int lda, const lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zhegst(int matrix_order, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a,
+                          lapack_int lda, const lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_chegv(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
+                         lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb, float* w);
+lapack_int LAPACKE_zhegv(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
+                         lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb, double* w);
+
+lapack_int LAPACKE_chegvd(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
+                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb, float* w);
+lapack_int LAPACKE_zhegvd(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
+                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
+                          double* w);
+
+lapack_int LAPACKE_chegvx(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
+                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb, float vl,
+                          float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w,
+                          lapack_complex_float* z, lapack_int ldz, lapack_int* ifail);
+lapack_int LAPACKE_zhegvx(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
+                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb, double vl,
+                          double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
+                          lapack_complex_double* z, lapack_int ldz, lapack_int* ifail);
+
+lapack_int LAPACKE_cherfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
+                          lapack_int lda, const lapack_complex_float* af, lapack_int ldaf, const lapack_int* ipiv,
+                          const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                          float* ferr, float* berr);
+lapack_int LAPACKE_zherfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
+                          lapack_int lda, const lapack_complex_double* af, lapack_int ldaf, const lapack_int* ipiv,
+                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                          double* ferr, double* berr);
+
+lapack_int LAPACKE_cherfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
+                           lapack_int ldaf, const lapack_int* ipiv, const float* s, const lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* rcond, float* berr,
+                           lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
+                           float* params);
+lapack_int LAPACKE_zherfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
+                           lapack_int ldaf, const lapack_int* ipiv, const double* s, const lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* x, lapack_int ldx, double* rcond, double* berr,
+                           lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
+                           double* params);
+
+lapack_int LAPACKE_chesv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
+                         lapack_int lda, lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zhesv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
+                         lapack_int lda, lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_chesvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
+                          lapack_int* ipiv, const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
+                          lapack_int ldx, float* rcond, float* ferr, float* berr);
+lapack_int LAPACKE_zhesvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
+                          lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
+                          lapack_int ldx, double* rcond, double* ferr, double* berr);
+
+lapack_int LAPACKE_chesvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                           lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
+                           lapack_int* ipiv, char* equed, float* s, lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* rcond, float* rpvgrw, float* berr,
+                           lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
+                           float* params);
+lapack_int LAPACKE_zhesvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                           lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
+                           lapack_int* ipiv, char* equed, double* s, lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx, double* rcond, double* rpvgrw, double* berr,
+                           lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
+                           double* params);
+
+lapack_int LAPACKE_chetrd(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda, float* d,
+                          float* e, lapack_complex_float* tau);
+lapack_int LAPACKE_zhetrd(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          double* d, double* e, lapack_complex_double* tau);
+
+lapack_int LAPACKE_chetrf(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_int* ipiv);
+lapack_int LAPACKE_zhetrf(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          lapack_int* ipiv);
+
+lapack_int LAPACKE_chetri(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          const lapack_int* ipiv);
+lapack_int LAPACKE_zhetri(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          const lapack_int* ipiv);
+
+lapack_int LAPACKE_chetrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
+                          lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zhetrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
+                          lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_chfrk(int matrix_order, char transr, char uplo, char trans, lapack_int n, lapack_int k, float alpha,
+                         const lapack_complex_float* a, lapack_int lda, float beta, lapack_complex_float* c);
+lapack_int LAPACKE_zhfrk(int matrix_order, char transr, char uplo, char trans, lapack_int n, lapack_int k, double alpha,
+                         const lapack_complex_double* a, lapack_int lda, double beta, lapack_complex_double* c);
+
+lapack_int LAPACKE_shgeqz(int matrix_order, char job, char compq, char compz, lapack_int n, lapack_int ilo,
+                          lapack_int ihi, float* h, lapack_int ldh, float* t, lapack_int ldt, float* alphar,
+                          float* alphai, float* beta, float* q, lapack_int ldq, float* z, lapack_int ldz);
+lapack_int LAPACKE_dhgeqz(int matrix_order, char job, char compq, char compz, lapack_int n, lapack_int ilo,
+                          lapack_int ihi, double* h, lapack_int ldh, double* t, lapack_int ldt, double* alphar,
+                          double* alphai, double* beta, double* q, lapack_int ldq, double* z, lapack_int ldz);
+lapack_int LAPACKE_chgeqz(int matrix_order, char job, char compq, char compz, lapack_int n, lapack_int ilo,
+                          lapack_int ihi, lapack_complex_float* h, lapack_int ldh, lapack_complex_float* t,
+                          lapack_int ldt, lapack_complex_float* alpha, lapack_complex_float* beta,
+                          lapack_complex_float* q, lapack_int ldq, lapack_complex_float* z, lapack_int ldz);
+lapack_int LAPACKE_zhgeqz(int matrix_order, char job, char compq, char compz, lapack_int n, lapack_int ilo,
+                          lapack_int ihi, lapack_complex_double* h, lapack_int ldh, lapack_complex_double* t,
+                          lapack_int ldt, lapack_complex_double* alpha, lapack_complex_double* beta,
+                          lapack_complex_double* q, lapack_int ldq, lapack_complex_double* z, lapack_int ldz);
+
+lapack_int LAPACKE_chpcon(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap,
+                          const lapack_int* ipiv, float anorm, float* rcond);
+lapack_int LAPACKE_zhpcon(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap,
+                          const lapack_int* ipiv, double anorm, double* rcond);
+
+lapack_int LAPACKE_chpev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_float* ap, float* w,
+                         lapack_complex_float* z, lapack_int ldz);
+lapack_int LAPACKE_zhpev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_double* ap, double* w,
+                         lapack_complex_double* z, lapack_int ldz);
+
+lapack_int LAPACKE_chpevd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_float* ap, float* w,
+                          lapack_complex_float* z, lapack_int ldz);
+lapack_int LAPACKE_zhpevd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_double* ap, double* w,
+                          lapack_complex_double* z, lapack_int ldz);
+
+lapack_int LAPACKE_chpevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_complex_float* ap,
+                          float vl, float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w,
+                          lapack_complex_float* z, lapack_int ldz, lapack_int* ifail);
+lapack_int LAPACKE_zhpevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_complex_double* ap,
+                          double vl, double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
+                          lapack_complex_double* z, lapack_int ldz, lapack_int* ifail);
+
+lapack_int LAPACKE_chpgst(int matrix_order, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* ap,
+                          const lapack_complex_float* bp);
+lapack_int LAPACKE_zhpgst(int matrix_order, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* ap,
+                          const lapack_complex_double* bp);
+
+lapack_int LAPACKE_chpgv(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
+                         lapack_complex_float* ap, lapack_complex_float* bp, float* w, lapack_complex_float* z,
+                         lapack_int ldz);
+lapack_int LAPACKE_zhpgv(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
+                         lapack_complex_double* ap, lapack_complex_double* bp, double* w, lapack_complex_double* z,
+                         lapack_int ldz);
+
+lapack_int LAPACKE_chpgvd(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
+                          lapack_complex_float* ap, lapack_complex_float* bp, float* w, lapack_complex_float* z,
+                          lapack_int ldz);
+lapack_int LAPACKE_zhpgvd(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
+                          lapack_complex_double* ap, lapack_complex_double* bp, double* w, lapack_complex_double* z,
+                          lapack_int ldz);
+
+lapack_int LAPACKE_chpgvx(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
+                          lapack_complex_float* ap, lapack_complex_float* bp, float vl, float vu, lapack_int il,
+                          lapack_int iu, float abstol, lapack_int* m, float* w, lapack_complex_float* z, lapack_int ldz,
+                          lapack_int* ifail);
+lapack_int LAPACKE_zhpgvx(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
+                          lapack_complex_double* ap, lapack_complex_double* bp, double vl, double vu, lapack_int il,
+                          lapack_int iu, double abstol, lapack_int* m, double* w, lapack_complex_double* z,
+                          lapack_int ldz, lapack_int* ifail);
+
+lapack_int LAPACKE_chprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* ap,
+                          const lapack_complex_float* afp, const lapack_int* ipiv, const lapack_complex_float* b,
+                          lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr);
+lapack_int LAPACKE_zhprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* ap,
+                          const lapack_complex_double* afp, const lapack_int* ipiv, const lapack_complex_double* b,
+                          lapack_int ldb, lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr);
+
+lapack_int LAPACKE_chpsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* ap,
+                         lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zhpsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* ap,
+                         lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_chpsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_float* ap, lapack_complex_float* afp, lapack_int* ipiv,
+                          const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                          float* rcond, float* ferr, float* berr);
+lapack_int LAPACKE_zhpsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_double* ap, lapack_complex_double* afp, lapack_int* ipiv,
+                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                          double* rcond, double* ferr, double* berr);
+
+lapack_int LAPACKE_chptrd(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap, float* d, float* e,
+                          lapack_complex_float* tau);
+lapack_int LAPACKE_zhptrd(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap, double* d, double* e,
+                          lapack_complex_double* tau);
+
+lapack_int LAPACKE_chptrf(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap, lapack_int* ipiv);
+lapack_int LAPACKE_zhptrf(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap, lapack_int* ipiv);
+
+lapack_int LAPACKE_chptri(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap, const lapack_int* ipiv);
+lapack_int LAPACKE_zhptri(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap, const lapack_int* ipiv);
+
+lapack_int LAPACKE_chptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* ap,
+                          const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zhptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* ap,
+                          const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_shsein(int matrix_order, char job, char eigsrc, char initv, lapack_logical* select, lapack_int n,
+                          const float* h, lapack_int ldh, float* wr, const float* wi, float* vl, lapack_int ldvl,
+                          float* vr, lapack_int ldvr, lapack_int mm, lapack_int* m, lapack_int* ifaill,
+                          lapack_int* ifailr);
+lapack_int LAPACKE_dhsein(int matrix_order, char job, char eigsrc, char initv, lapack_logical* select, lapack_int n,
+                          const double* h, lapack_int ldh, double* wr, const double* wi, double* vl, lapack_int ldvl,
+                          double* vr, lapack_int ldvr, lapack_int mm, lapack_int* m, lapack_int* ifaill,
+                          lapack_int* ifailr);
+lapack_int LAPACKE_chsein(int matrix_order, char job, char eigsrc, char initv, const lapack_logical* select,
+                          lapack_int n, const lapack_complex_float* h, lapack_int ldh, lapack_complex_float* w,
+                          lapack_complex_float* vl, lapack_int ldvl, lapack_complex_float* vr, lapack_int ldvr,
+                          lapack_int mm, lapack_int* m, lapack_int* ifaill, lapack_int* ifailr);
+lapack_int LAPACKE_zhsein(int matrix_order, char job, char eigsrc, char initv, const lapack_logical* select,
+                          lapack_int n, const lapack_complex_double* h, lapack_int ldh, lapack_complex_double* w,
+                          lapack_complex_double* vl, lapack_int ldvl, lapack_complex_double* vr, lapack_int ldvr,
+                          lapack_int mm, lapack_int* m, lapack_int* ifaill, lapack_int* ifailr);
+
+lapack_int LAPACKE_shseqr(int matrix_order, char job, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
+                          float* h, lapack_int ldh, float* wr, float* wi, float* z, lapack_int ldz);
+lapack_int LAPACKE_dhseqr(int matrix_order, char job, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
+                          double* h, lapack_int ldh, double* wr, double* wi, double* z, lapack_int ldz);
+lapack_int LAPACKE_chseqr(int matrix_order, char job, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
+                          lapack_complex_float* h, lapack_int ldh, lapack_complex_float* w, lapack_complex_float* z,
+                          lapack_int ldz);
+lapack_int LAPACKE_zhseqr(int matrix_order, char job, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
+                          lapack_complex_double* h, lapack_int ldh, lapack_complex_double* w, lapack_complex_double* z,
+                          lapack_int ldz);
+
+lapack_int LAPACKE_clacgv(lapack_int n, lapack_complex_float* x, lapack_int incx);
+lapack_int LAPACKE_zlacgv(lapack_int n, lapack_complex_double* x, lapack_int incx);
+
+lapack_int LAPACKE_slacpy(int matrix_order, char uplo, lapack_int m, lapack_int n, const float* a, lapack_int lda,
+                          float* b, lapack_int ldb);
+lapack_int LAPACKE_dlacpy(int matrix_order, char uplo, lapack_int m, lapack_int n, const double* a, lapack_int lda,
+                          double* b, lapack_int ldb);
+lapack_int LAPACKE_clacpy(int matrix_order, char uplo, lapack_int m, lapack_int n, const lapack_complex_float* a,
+                          lapack_int lda, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zlacpy(int matrix_order, char uplo, lapack_int m, lapack_int n, const lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_zlag2c(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_float* sa, lapack_int ldsa);
+
+lapack_int LAPACKE_slag2d(int matrix_order, lapack_int m, lapack_int n, const float* sa, lapack_int ldsa, double* a,
+                          lapack_int lda);
+
+lapack_int LAPACKE_dlag2s(int matrix_order, lapack_int m, lapack_int n, const double* a, lapack_int lda, float* sa,
+                          lapack_int ldsa);
+
+lapack_int LAPACKE_clag2z(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_float* sa, lapack_int ldsa,
+                          lapack_complex_double* a, lapack_int lda);
+
+lapack_int LAPACKE_slagge(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, const float* d,
+                          float* a, lapack_int lda, lapack_int* iseed);
+lapack_int LAPACKE_dlagge(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, const double* d,
+                          double* a, lapack_int lda, lapack_int* iseed);
+lapack_int LAPACKE_clagge(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, const float* d,
+                          lapack_complex_float* a, lapack_int lda, lapack_int* iseed);
+lapack_int LAPACKE_zlagge(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, const double* d,
+                          lapack_complex_double* a, lapack_int lda, lapack_int* iseed);
+
+float LAPACKE_slamch(char cmach);
+double LAPACKE_dlamch(char cmach);
+
+float LAPACKE_slange(int matrix_order, char norm, lapack_int m, lapack_int n, const float* a, lapack_int lda);
+double LAPACKE_dlange(int matrix_order, char norm, lapack_int m, lapack_int n, const double* a, lapack_int lda);
+float LAPACKE_clange(int matrix_order, char norm, lapack_int m, lapack_int n, const lapack_complex_float* a,
+                     lapack_int lda);
+double LAPACKE_zlange(int matrix_order, char norm, lapack_int m, lapack_int n, const lapack_complex_double* a,
+                      lapack_int lda);
+
+float LAPACKE_clanhe(int matrix_order, char norm, char uplo, lapack_int n, const lapack_complex_float* a,
+                     lapack_int lda);
+double LAPACKE_zlanhe(int matrix_order, char norm, char uplo, lapack_int n, const lapack_complex_double* a,
+                      lapack_int lda);
+
+float LAPACKE_slansy(int matrix_order, char norm, char uplo, lapack_int n, const float* a, lapack_int lda);
+double LAPACKE_dlansy(int matrix_order, char norm, char uplo, lapack_int n, const double* a, lapack_int lda);
+float LAPACKE_clansy(int matrix_order, char norm, char uplo, lapack_int n, const lapack_complex_float* a,
+                     lapack_int lda);
+double LAPACKE_zlansy(int matrix_order, char norm, char uplo, lapack_int n, const lapack_complex_double* a,
+                      lapack_int lda);
+
+float LAPACKE_slantr(int matrix_order, char norm, char uplo, char diag, lapack_int m, lapack_int n, const float* a,
+                     lapack_int lda);
+double LAPACKE_dlantr(int matrix_order, char norm, char uplo, char diag, lapack_int m, lapack_int n, const double* a,
+                      lapack_int lda);
+float LAPACKE_clantr(int matrix_order, char norm, char uplo, char diag, lapack_int m, lapack_int n,
+                     const lapack_complex_float* a, lapack_int lda);
+double LAPACKE_zlantr(int matrix_order, char norm, char uplo, char diag, lapack_int m, lapack_int n,
+                      const lapack_complex_double* a, lapack_int lda);
+
+lapack_int LAPACKE_slarfb(int matrix_order, char side, char trans, char direct, char storev, lapack_int m, lapack_int n,
+                          lapack_int k, const float* v, lapack_int ldv, const float* t, lapack_int ldt, float* c,
+                          lapack_int ldc);
+lapack_int LAPACKE_dlarfb(int matrix_order, char side, char trans, char direct, char storev, lapack_int m, lapack_int n,
+                          lapack_int k, const double* v, lapack_int ldv, const double* t, lapack_int ldt, double* c,
+                          lapack_int ldc);
+lapack_int LAPACKE_clarfb(int matrix_order, char side, char trans, char direct, char storev, lapack_int m, lapack_int n,
+                          lapack_int k, const lapack_complex_float* v, lapack_int ldv, const lapack_complex_float* t,
+                          lapack_int ldt, lapack_complex_float* c, lapack_int ldc);
+lapack_int LAPACKE_zlarfb(int matrix_order, char side, char trans, char direct, char storev, lapack_int m, lapack_int n,
+                          lapack_int k, const lapack_complex_double* v, lapack_int ldv, const lapack_complex_double* t,
+                          lapack_int ldt, lapack_complex_double* c, lapack_int ldc);
+
+lapack_int LAPACKE_slarfg(lapack_int n, float* alpha, float* x, lapack_int incx, float* tau);
+lapack_int LAPACKE_dlarfg(lapack_int n, double* alpha, double* x, lapack_int incx, double* tau);
+lapack_int LAPACKE_clarfg(lapack_int n, lapack_complex_float* alpha, lapack_complex_float* x, lapack_int incx,
+                          lapack_complex_float* tau);
+lapack_int LAPACKE_zlarfg(lapack_int n, lapack_complex_double* alpha, lapack_complex_double* x, lapack_int incx,
+                          lapack_complex_double* tau);
+
+lapack_int LAPACKE_slarft(int matrix_order, char direct, char storev, lapack_int n, lapack_int k, const float* v,
+                          lapack_int ldv, const float* tau, float* t, lapack_int ldt);
+lapack_int LAPACKE_dlarft(int matrix_order, char direct, char storev, lapack_int n, lapack_int k, const double* v,
+                          lapack_int ldv, const double* tau, double* t, lapack_int ldt);
+lapack_int LAPACKE_clarft(int matrix_order, char direct, char storev, lapack_int n, lapack_int k,
+                          const lapack_complex_float* v, lapack_int ldv, const lapack_complex_float* tau,
+                          lapack_complex_float* t, lapack_int ldt);
+lapack_int LAPACKE_zlarft(int matrix_order, char direct, char storev, lapack_int n, lapack_int k,
+                          const lapack_complex_double* v, lapack_int ldv, const lapack_complex_double* tau,
+                          lapack_complex_double* t, lapack_int ldt);
+
+lapack_int LAPACKE_slarfx(int matrix_order, char side, lapack_int m, lapack_int n, const float* v, float tau, float* c,
+                          lapack_int ldc, float* work);
+lapack_int LAPACKE_dlarfx(int matrix_order, char side, lapack_int m, lapack_int n, const double* v, double tau,
+                          double* c, lapack_int ldc, double* work);
+lapack_int LAPACKE_clarfx(int matrix_order, char side, lapack_int m, lapack_int n, const lapack_complex_float* v,
+                          lapack_complex_float tau, lapack_complex_float* c, lapack_int ldc,
+                          lapack_complex_float* work);
+lapack_int LAPACKE_zlarfx(int matrix_order, char side, lapack_int m, lapack_int n, const lapack_complex_double* v,
+                          lapack_complex_double tau, lapack_complex_double* c, lapack_int ldc,
+                          lapack_complex_double* work);
+
+lapack_int LAPACKE_slarnv(lapack_int idist, lapack_int* iseed, lapack_int n, float* x);
+lapack_int LAPACKE_dlarnv(lapack_int idist, lapack_int* iseed, lapack_int n, double* x);
+lapack_int LAPACKE_clarnv(lapack_int idist, lapack_int* iseed, lapack_int n, lapack_complex_float* x);
+lapack_int LAPACKE_zlarnv(lapack_int idist, lapack_int* iseed, lapack_int n, lapack_complex_double* x);
+
+lapack_int LAPACKE_slaset(int matrix_order, char uplo, lapack_int m, lapack_int n, float alpha, float beta, float* a,
+                          lapack_int lda);
+lapack_int LAPACKE_dlaset(int matrix_order, char uplo, lapack_int m, lapack_int n, double alpha, double beta, double* a,
+                          lapack_int lda);
+lapack_int LAPACKE_claset(int matrix_order, char uplo, lapack_int m, lapack_int n, lapack_complex_float alpha,
+                          lapack_complex_float beta, lapack_complex_float* a, lapack_int lda);
+lapack_int LAPACKE_zlaset(int matrix_order, char uplo, lapack_int m, lapack_int n, lapack_complex_double alpha,
+                          lapack_complex_double beta, lapack_complex_double* a, lapack_int lda);
+
+lapack_int LAPACKE_slasrt(char id, lapack_int n, float* d);
+lapack_int LAPACKE_dlasrt(char id, lapack_int n, double* d);
+
+lapack_int LAPACKE_slaswp(int matrix_order, lapack_int n, float* a, lapack_int lda, lapack_int k1, lapack_int k2,
+                          const lapack_int* ipiv, lapack_int incx);
+lapack_int LAPACKE_dlaswp(int matrix_order, lapack_int n, double* a, lapack_int lda, lapack_int k1, lapack_int k2,
+                          const lapack_int* ipiv, lapack_int incx);
+lapack_int LAPACKE_claswp(int matrix_order, lapack_int n, lapack_complex_float* a, lapack_int lda, lapack_int k1,
+                          lapack_int k2, const lapack_int* ipiv, lapack_int incx);
+lapack_int LAPACKE_zlaswp(int matrix_order, lapack_int n, lapack_complex_double* a, lapack_int lda, lapack_int k1,
+                          lapack_int k2, const lapack_int* ipiv, lapack_int incx);
+
+lapack_int LAPACKE_slatms(int matrix_order, lapack_int m, lapack_int n, char dist, lapack_int* iseed, char sym,
+                          float* d, lapack_int mode, float cond, float dmax, lapack_int kl, lapack_int ku, char pack,
+                          float* a, lapack_int lda);
+lapack_int LAPACKE_dlatms(int matrix_order, lapack_int m, lapack_int n, char dist, lapack_int* iseed, char sym,
+                          double* d, lapack_int mode, double cond, double dmax, lapack_int kl, lapack_int ku, char pack,
+                          double* a, lapack_int lda);
+lapack_int LAPACKE_clatms(int matrix_order, lapack_int m, lapack_int n, char dist, lapack_int* iseed, char sym,
+                          float* d, lapack_int mode, float cond, float dmax, lapack_int kl, lapack_int ku, char pack,
+                          lapack_complex_float* a, lapack_int lda);
+lapack_int LAPACKE_zlatms(int matrix_order, lapack_int m, lapack_int n, char dist, lapack_int* iseed, char sym,
+                          double* d, lapack_int mode, double cond, double dmax, lapack_int kl, lapack_int ku, char pack,
+                          lapack_complex_double* a, lapack_int lda);
+
+lapack_int LAPACKE_slauum(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda);
+lapack_int LAPACKE_dlauum(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda);
+lapack_int LAPACKE_clauum(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda);
+lapack_int LAPACKE_zlauum(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda);
+
+lapack_int LAPACKE_sopgtr(int matrix_order, char uplo, lapack_int n, const float* ap, const float* tau, float* q,
+                          lapack_int ldq);
+lapack_int LAPACKE_dopgtr(int matrix_order, char uplo, lapack_int n, const double* ap, const double* tau, double* q,
+                          lapack_int ldq);
+
+lapack_int LAPACKE_sopmtr(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
+                          const float* ap, const float* tau, float* c, lapack_int ldc);
+lapack_int LAPACKE_dopmtr(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
+                          const double* ap, const double* tau, double* c, lapack_int ldc);
+
+lapack_int LAPACKE_sorgbr(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int k, float* a,
+                          lapack_int lda, const float* tau);
+lapack_int LAPACKE_dorgbr(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int k, double* a,
+                          lapack_int lda, const double* tau);
+
+lapack_int LAPACKE_sorghr(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, float* a, lapack_int lda,
+                          const float* tau);
+lapack_int LAPACKE_dorghr(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, double* a, lapack_int lda,
+                          const double* tau);
+
+lapack_int LAPACKE_sorglq(int matrix_order, lapack_int m, lapack_int n, lapack_int k, float* a, lapack_int lda,
+                          const float* tau);
+lapack_int LAPACKE_dorglq(int matrix_order, lapack_int m, lapack_int n, lapack_int k, double* a, lapack_int lda,
+                          const double* tau);
+
+lapack_int LAPACKE_sorgql(int matrix_order, lapack_int m, lapack_int n, lapack_int k, float* a, lapack_int lda,
+                          const float* tau);
+lapack_int LAPACKE_dorgql(int matrix_order, lapack_int m, lapack_int n, lapack_int k, double* a, lapack_int lda,
+                          const double* tau);
+
+lapack_int LAPACKE_sorgqr(int matrix_order, lapack_int m, lapack_int n, lapack_int k, float* a, lapack_int lda,
+                          const float* tau);
+lapack_int LAPACKE_dorgqr(int matrix_order, lapack_int m, lapack_int n, lapack_int k, double* a, lapack_int lda,
+                          const double* tau);
+
+lapack_int LAPACKE_sorgrq(int matrix_order, lapack_int m, lapack_int n, lapack_int k, float* a, lapack_int lda,
+                          const float* tau);
+lapack_int LAPACKE_dorgrq(int matrix_order, lapack_int m, lapack_int n, lapack_int k, double* a, lapack_int lda,
+                          const double* tau);
+
+lapack_int LAPACKE_sorgtr(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, const float* tau);
+lapack_int LAPACKE_dorgtr(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda, const double* tau);
+
+lapack_int LAPACKE_sormbr(int matrix_order, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc);
+lapack_int LAPACKE_dormbr(int matrix_order, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc);
+
+lapack_int LAPACKE_sormhr(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int ilo,
+                          lapack_int ihi, const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc);
+lapack_int LAPACKE_dormhr(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int ilo,
+                          lapack_int ihi, const double* a, lapack_int lda, const double* tau, double* c,
+                          lapack_int ldc);
+
+lapack_int LAPACKE_sormlq(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc);
+lapack_int LAPACKE_dormlq(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc);
+
+lapack_int LAPACKE_sormql(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc);
+lapack_int LAPACKE_dormql(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc);
+
+lapack_int LAPACKE_sormqr(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc);
+lapack_int LAPACKE_dormqr(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc);
+
+lapack_int LAPACKE_sormrq(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc);
+lapack_int LAPACKE_dormrq(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc);
+
+lapack_int LAPACKE_sormrz(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          lapack_int l, const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc);
+lapack_int LAPACKE_dormrz(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          lapack_int l, const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc);
+
+lapack_int LAPACKE_sormtr(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
+                          const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc);
+lapack_int LAPACKE_dormtr(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
+                          const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc);
+
+lapack_int LAPACKE_spbcon(int matrix_order, char uplo, lapack_int n, lapack_int kd, const float* ab, lapack_int ldab,
+                          float anorm, float* rcond);
+lapack_int LAPACKE_dpbcon(int matrix_order, char uplo, lapack_int n, lapack_int kd, const double* ab, lapack_int ldab,
+                          double anorm, double* rcond);
+lapack_int LAPACKE_cpbcon(int matrix_order, char uplo, lapack_int n, lapack_int kd, const lapack_complex_float* ab,
+                          lapack_int ldab, float anorm, float* rcond);
+lapack_int LAPACKE_zpbcon(int matrix_order, char uplo, lapack_int n, lapack_int kd, const lapack_complex_double* ab,
+                          lapack_int ldab, double anorm, double* rcond);
+
+lapack_int LAPACKE_spbequ(int matrix_order, char uplo, lapack_int n, lapack_int kd, const float* ab, lapack_int ldab,
+                          float* s, float* scond, float* amax);
+lapack_int LAPACKE_dpbequ(int matrix_order, char uplo, lapack_int n, lapack_int kd, const double* ab, lapack_int ldab,
+                          double* s, double* scond, double* amax);
+lapack_int LAPACKE_cpbequ(int matrix_order, char uplo, lapack_int n, lapack_int kd, const lapack_complex_float* ab,
+                          lapack_int ldab, float* s, float* scond, float* amax);
+lapack_int LAPACKE_zpbequ(int matrix_order, char uplo, lapack_int n, lapack_int kd, const lapack_complex_double* ab,
+                          lapack_int ldab, double* s, double* scond, double* amax);
+
+lapack_int LAPACKE_spbrfs(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs, const float* ab,
+                          lapack_int ldab, const float* afb, lapack_int ldafb, const float* b, lapack_int ldb, float* x,
+                          lapack_int ldx, float* ferr, float* berr);
+lapack_int LAPACKE_dpbrfs(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs, const double* ab,
+                          lapack_int ldab, const double* afb, lapack_int ldafb, const double* b, lapack_int ldb,
+                          double* x, lapack_int ldx, double* ferr, double* berr);
+lapack_int LAPACKE_cpbrfs(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                          const lapack_complex_float* ab, lapack_int ldab, const lapack_complex_float* afb,
+                          lapack_int ldafb, const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
+                          lapack_int ldx, float* ferr, float* berr);
+lapack_int LAPACKE_zpbrfs(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                          const lapack_complex_double* ab, lapack_int ldab, const lapack_complex_double* afb,
+                          lapack_int ldafb, const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
+                          lapack_int ldx, double* ferr, double* berr);
+
+lapack_int LAPACKE_spbstf(int matrix_order, char uplo, lapack_int n, lapack_int kb, float* bb, lapack_int ldbb);
+lapack_int LAPACKE_dpbstf(int matrix_order, char uplo, lapack_int n, lapack_int kb, double* bb, lapack_int ldbb);
+lapack_int LAPACKE_cpbstf(int matrix_order, char uplo, lapack_int n, lapack_int kb, lapack_complex_float* bb,
+                          lapack_int ldbb);
+lapack_int LAPACKE_zpbstf(int matrix_order, char uplo, lapack_int n, lapack_int kb, lapack_complex_double* bb,
+                          lapack_int ldbb);
+
+lapack_int LAPACKE_spbsv(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs, float* ab,
+                         lapack_int ldab, float* b, lapack_int ldb);
+lapack_int LAPACKE_dpbsv(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs, double* ab,
+                         lapack_int ldab, double* b, lapack_int ldb);
+lapack_int LAPACKE_cpbsv(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                         lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zpbsv(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                         lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_spbsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                          float* ab, lapack_int ldab, float* afb, lapack_int ldafb, char* equed, float* s, float* b,
+                          lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* ferr, float* berr);
+lapack_int LAPACKE_dpbsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                          double* ab, lapack_int ldab, double* afb, lapack_int ldafb, char* equed, double* s, double* b,
+                          lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* ferr, double* berr);
+lapack_int LAPACKE_cpbsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                          lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* afb, lapack_int ldafb,
+                          char* equed, float* s, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
+                          lapack_int ldx, float* rcond, float* ferr, float* berr);
+lapack_int LAPACKE_zpbsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                          lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* afb, lapack_int ldafb,
+                          char* equed, double* s, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
+                          lapack_int ldx, double* rcond, double* ferr, double* berr);
+
+lapack_int LAPACKE_spbtrf(int matrix_order, char uplo, lapack_int n, lapack_int kd, float* ab, lapack_int ldab);
+lapack_int LAPACKE_dpbtrf(int matrix_order, char uplo, lapack_int n, lapack_int kd, double* ab, lapack_int ldab);
+lapack_int LAPACKE_cpbtrf(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_complex_float* ab,
+                          lapack_int ldab);
+lapack_int LAPACKE_zpbtrf(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_complex_double* ab,
+                          lapack_int ldab);
+
+lapack_int LAPACKE_spbtrs(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs, const float* ab,
+                          lapack_int ldab, float* b, lapack_int ldb);
+lapack_int LAPACKE_dpbtrs(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs, const double* ab,
+                          lapack_int ldab, double* b, lapack_int ldb);
+lapack_int LAPACKE_cpbtrs(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                          const lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zpbtrs(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                          const lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_spftrf(int matrix_order, char transr, char uplo, lapack_int n, float* a);
+lapack_int LAPACKE_dpftrf(int matrix_order, char transr, char uplo, lapack_int n, double* a);
+lapack_int LAPACKE_cpftrf(int matrix_order, char transr, char uplo, lapack_int n, lapack_complex_float* a);
+lapack_int LAPACKE_zpftrf(int matrix_order, char transr, char uplo, lapack_int n, lapack_complex_double* a);
+
+lapack_int LAPACKE_spftri(int matrix_order, char transr, char uplo, lapack_int n, float* a);
+lapack_int LAPACKE_dpftri(int matrix_order, char transr, char uplo, lapack_int n, double* a);
+lapack_int LAPACKE_cpftri(int matrix_order, char transr, char uplo, lapack_int n, lapack_complex_float* a);
+lapack_int LAPACKE_zpftri(int matrix_order, char transr, char uplo, lapack_int n, lapack_complex_double* a);
+
+lapack_int LAPACKE_spftrs(int matrix_order, char transr, char uplo, lapack_int n, lapack_int nrhs, const float* a,
+                          float* b, lapack_int ldb);
+lapack_int LAPACKE_dpftrs(int matrix_order, char transr, char uplo, lapack_int n, lapack_int nrhs, const double* a,
+                          double* b, lapack_int ldb);
+lapack_int LAPACKE_cpftrs(int matrix_order, char transr, char uplo, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_float* a, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zpftrs(int matrix_order, char transr, char uplo, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_double* a, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_spocon(int matrix_order, char uplo, lapack_int n, const float* a, lapack_int lda, float anorm,
+                          float* rcond);
+lapack_int LAPACKE_dpocon(int matrix_order, char uplo, lapack_int n, const double* a, lapack_int lda, double anorm,
+                          double* rcond);
+lapack_int LAPACKE_cpocon(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
+                          float anorm, float* rcond);
+lapack_int LAPACKE_zpocon(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a, lapack_int lda,
+                          double anorm, double* rcond);
+
+lapack_int LAPACKE_spoequ(int matrix_order, lapack_int n, const float* a, lapack_int lda, float* s, float* scond,
+                          float* amax);
+lapack_int LAPACKE_dpoequ(int matrix_order, lapack_int n, const double* a, lapack_int lda, double* s, double* scond,
+                          double* amax);
+lapack_int LAPACKE_cpoequ(int matrix_order, lapack_int n, const lapack_complex_float* a, lapack_int lda, float* s,
+                          float* scond, float* amax);
+lapack_int LAPACKE_zpoequ(int matrix_order, lapack_int n, const lapack_complex_double* a, lapack_int lda, double* s,
+                          double* scond, double* amax);
+
+lapack_int LAPACKE_spoequb(int matrix_order, lapack_int n, const float* a, lapack_int lda, float* s, float* scond,
+                           float* amax);
+lapack_int LAPACKE_dpoequb(int matrix_order, lapack_int n, const double* a, lapack_int lda, double* s, double* scond,
+                           double* amax);
+lapack_int LAPACKE_cpoequb(int matrix_order, lapack_int n, const lapack_complex_float* a, lapack_int lda, float* s,
+                           float* scond, float* amax);
+lapack_int LAPACKE_zpoequb(int matrix_order, lapack_int n, const lapack_complex_double* a, lapack_int lda, double* s,
+                           double* scond, double* amax);
+
+lapack_int LAPACKE_sporfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a, lapack_int lda,
+                          const float* af, lapack_int ldaf, const float* b, lapack_int ldb, float* x, lapack_int ldx,
+                          float* ferr, float* berr);
+lapack_int LAPACKE_dporfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a, lapack_int lda,
+                          const double* af, lapack_int ldaf, const double* b, lapack_int ldb, double* x, lapack_int ldx,
+                          double* ferr, double* berr);
+lapack_int LAPACKE_cporfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
+                          lapack_int lda, const lapack_complex_float* af, lapack_int ldaf,
+                          const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                          float* ferr, float* berr);
+lapack_int LAPACKE_zporfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
+                          lapack_int lda, const lapack_complex_double* af, lapack_int ldaf,
+                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                          double* ferr, double* berr);
+
+lapack_int LAPACKE_sporfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs, const float* a,
+                           lapack_int lda, const float* af, lapack_int ldaf, const float* s, const float* b,
+                           lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* berr, lapack_int n_err_bnds,
+                           float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams, float* params);
+lapack_int LAPACKE_dporfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs, const double* a,
+                           lapack_int lda, const double* af, lapack_int ldaf, const double* s, const double* b,
+                           lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* berr,
+                           lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
+                           double* params);
+lapack_int LAPACKE_cporfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
+                           lapack_int ldaf, const float* s, const lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* rcond, float* berr, lapack_int n_err_bnds,
+                           float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams, float* params);
+lapack_int LAPACKE_zporfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
+                           lapack_int ldaf, const double* s, const lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx, double* rcond, double* berr, lapack_int n_err_bnds,
+                           double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams, double* params);
+
+lapack_int LAPACKE_sposv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, float* a, lapack_int lda, float* b,
+                         lapack_int ldb);
+lapack_int LAPACKE_dposv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
+                         double* b, lapack_int ldb);
+lapack_int LAPACKE_cposv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
+                         lapack_int lda, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zposv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
+                         lapack_int lda, lapack_complex_double* b, lapack_int ldb);
+lapack_int LAPACKE_dsposv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
+                          double* b, lapack_int ldb, double* x, lapack_int ldx, lapack_int* iter);
+lapack_int LAPACKE_zcposv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
+                          lapack_int ldx, lapack_int* iter);
+
+lapack_int LAPACKE_sposvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, float* a,
+                          lapack_int lda, float* af, lapack_int ldaf, char* equed, float* s, float* b, lapack_int ldb,
+                          float* x, lapack_int ldx, float* rcond, float* ferr, float* berr);
+lapack_int LAPACKE_dposvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, double* a,
+                          lapack_int lda, double* af, lapack_int ldaf, char* equed, double* s, double* b,
+                          lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* ferr, double* berr);
+lapack_int LAPACKE_cposvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
+                          char* equed, float* s, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
+                          lapack_int ldx, float* rcond, float* ferr, float* berr);
+lapack_int LAPACKE_zposvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
+                          char* equed, double* s, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
+                          lapack_int ldx, double* rcond, double* ferr, double* berr);
+
+lapack_int LAPACKE_sposvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, float* a,
+                           lapack_int lda, float* af, lapack_int ldaf, char* equed, float* s, float* b, lapack_int ldb,
+                           float* x, lapack_int ldx, float* rcond, float* rpvgrw, float* berr, lapack_int n_err_bnds,
+                           float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams, float* params);
+lapack_int LAPACKE_dposvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, double* a,
+                           lapack_int lda, double* af, lapack_int ldaf, char* equed, double* s, double* b,
+                           lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* rpvgrw, double* berr,
+                           lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
+                           double* params);
+lapack_int LAPACKE_cposvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                           lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
+                           char* equed, float* s, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
+                           lapack_int ldx, float* rcond, float* rpvgrw, float* berr, lapack_int n_err_bnds,
+                           float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams, float* params);
+lapack_int LAPACKE_zposvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                           lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
+                           char* equed, double* s, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
+                           lapack_int ldx, double* rcond, double* rpvgrw, double* berr, lapack_int n_err_bnds,
+                           double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams, double* params);
+
+lapack_int LAPACKE_spotrf(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda);
+lapack_int LAPACKE_dpotrf(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda);
+lapack_int LAPACKE_cpotrf(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda);
+lapack_int LAPACKE_zpotrf(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda);
+
+lapack_int LAPACKE_spotri(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda);
+lapack_int LAPACKE_dpotri(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda);
+lapack_int LAPACKE_cpotri(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda);
+lapack_int LAPACKE_zpotri(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda);
+
+lapack_int LAPACKE_spotrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a, lapack_int lda,
+                          float* b, lapack_int ldb);
+lapack_int LAPACKE_dpotrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a, lapack_int lda,
+                          double* b, lapack_int ldb);
+lapack_int LAPACKE_cpotrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
+                          lapack_int lda, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zpotrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_sppcon(int matrix_order, char uplo, lapack_int n, const float* ap, float anorm, float* rcond);
+lapack_int LAPACKE_dppcon(int matrix_order, char uplo, lapack_int n, const double* ap, double anorm, double* rcond);
+lapack_int LAPACKE_cppcon(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap, float anorm,
+                          float* rcond);
+lapack_int LAPACKE_zppcon(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap, double anorm,
+                          double* rcond);
+
+lapack_int LAPACKE_sppequ(int matrix_order, char uplo, lapack_int n, const float* ap, float* s, float* scond,
+                          float* amax);
+lapack_int LAPACKE_dppequ(int matrix_order, char uplo, lapack_int n, const double* ap, double* s, double* scond,
+                          double* amax);
+lapack_int LAPACKE_cppequ(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap, float* s,
+                          float* scond, float* amax);
+lapack_int LAPACKE_zppequ(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap, double* s,
+                          double* scond, double* amax);
+
+lapack_int LAPACKE_spprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* ap, const float* afp,
+                          const float* b, lapack_int ldb, float* x, lapack_int ldx, float* ferr, float* berr);
+lapack_int LAPACKE_dpprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* ap,
+                          const double* afp, const double* b, lapack_int ldb, double* x, lapack_int ldx, double* ferr,
+                          double* berr);
+lapack_int LAPACKE_cpprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* ap,
+                          const lapack_complex_float* afp, const lapack_complex_float* b, lapack_int ldb,
+                          lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr);
+lapack_int LAPACKE_zpprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* ap,
+                          const lapack_complex_double* afp, const lapack_complex_double* b, lapack_int ldb,
+                          lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr);
+
+lapack_int LAPACKE_sppsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, float* ap, float* b,
+                         lapack_int ldb);
+lapack_int LAPACKE_dppsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* ap, double* b,
+                         lapack_int ldb);
+lapack_int LAPACKE_cppsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* ap,
+                         lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zppsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* ap,
+                         lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_sppsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, float* ap, float* afp,
+                          char* equed, float* s, float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond,
+                          float* ferr, float* berr);
+lapack_int LAPACKE_dppsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, double* ap,
+                          double* afp, char* equed, double* s, double* b, lapack_int ldb, double* x, lapack_int ldx,
+                          double* rcond, double* ferr, double* berr);
+lapack_int LAPACKE_cppsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                          lapack_complex_float* ap, lapack_complex_float* afp, char* equed, float* s,
+                          lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                          float* rcond, float* ferr, float* berr);
+lapack_int LAPACKE_zppsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                          lapack_complex_double* ap, lapack_complex_double* afp, char* equed, double* s,
+                          lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                          double* rcond, double* ferr, double* berr);
+
+lapack_int LAPACKE_spptrf(int matrix_order, char uplo, lapack_int n, float* ap);
+lapack_int LAPACKE_dpptrf(int matrix_order, char uplo, lapack_int n, double* ap);
+lapack_int LAPACKE_cpptrf(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap);
+lapack_int LAPACKE_zpptrf(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap);
+
+lapack_int LAPACKE_spptri(int matrix_order, char uplo, lapack_int n, float* ap);
+lapack_int LAPACKE_dpptri(int matrix_order, char uplo, lapack_int n, double* ap);
+lapack_int LAPACKE_cpptri(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap);
+lapack_int LAPACKE_zpptri(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap);
+
+lapack_int LAPACKE_spptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* ap, float* b,
+                          lapack_int ldb);
+lapack_int LAPACKE_dpptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* ap, double* b,
+                          lapack_int ldb);
+lapack_int LAPACKE_cpptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* ap,
+                          lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zpptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* ap,
+                          lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_spstrf(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, lapack_int* piv,
+                          lapack_int* rank, float tol);
+lapack_int LAPACKE_dpstrf(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda, lapack_int* piv,
+                          lapack_int* rank, double tol);
+lapack_int LAPACKE_cpstrf(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_int* piv, lapack_int* rank, float tol);
+lapack_int LAPACKE_zpstrf(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          lapack_int* piv, lapack_int* rank, double tol);
+
+lapack_int LAPACKE_sptcon(lapack_int n, const float* d, const float* e, float anorm, float* rcond);
+lapack_int LAPACKE_dptcon(lapack_int n, const double* d, const double* e, double anorm, double* rcond);
+lapack_int LAPACKE_cptcon(lapack_int n, const float* d, const lapack_complex_float* e, float anorm, float* rcond);
+lapack_int LAPACKE_zptcon(lapack_int n, const double* d, const lapack_complex_double* e, double anorm, double* rcond);
+
+lapack_int LAPACKE_spteqr(int matrix_order, char compz, lapack_int n, float* d, float* e, float* z, lapack_int ldz);
+lapack_int LAPACKE_dpteqr(int matrix_order, char compz, lapack_int n, double* d, double* e, double* z, lapack_int ldz);
+lapack_int LAPACKE_cpteqr(int matrix_order, char compz, lapack_int n, float* d, float* e, lapack_complex_float* z,
+                          lapack_int ldz);
+lapack_int LAPACKE_zpteqr(int matrix_order, char compz, lapack_int n, double* d, double* e, lapack_complex_double* z,
+                          lapack_int ldz);
+
+lapack_int LAPACKE_sptrfs(int matrix_order, lapack_int n, lapack_int nrhs, const float* d, const float* e,
+                          const float* df, const float* ef, const float* b, lapack_int ldb, float* x, lapack_int ldx,
+                          float* ferr, float* berr);
+lapack_int LAPACKE_dptrfs(int matrix_order, lapack_int n, lapack_int nrhs, const double* d, const double* e,
+                          const double* df, const double* ef, const double* b, lapack_int ldb, double* x,
+                          lapack_int ldx, double* ferr, double* berr);
+lapack_int LAPACKE_cptrfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* d,
+                          const lapack_complex_float* e, const float* df, const lapack_complex_float* ef,
+                          const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                          float* ferr, float* berr);
+lapack_int LAPACKE_zptrfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* d,
+                          const lapack_complex_double* e, const double* df, const lapack_complex_double* ef,
+                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                          double* ferr, double* berr);
+
+lapack_int LAPACKE_sptsv(int matrix_order, lapack_int n, lapack_int nrhs, float* d, float* e, float* b, lapack_int ldb);
+lapack_int LAPACKE_dptsv(int matrix_order, lapack_int n, lapack_int nrhs, double* d, double* e, double* b,
+                         lapack_int ldb);
+lapack_int LAPACKE_cptsv(int matrix_order, lapack_int n, lapack_int nrhs, float* d, lapack_complex_float* e,
+                         lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zptsv(int matrix_order, lapack_int n, lapack_int nrhs, double* d, lapack_complex_double* e,
+                         lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_sptsvx(int matrix_order, char fact, lapack_int n, lapack_int nrhs, const float* d, const float* e,
+                          float* df, float* ef, const float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond,
+                          float* ferr, float* berr);
+lapack_int LAPACKE_dptsvx(int matrix_order, char fact, lapack_int n, lapack_int nrhs, const double* d, const double* e,
+                          double* df, double* ef, const double* b, lapack_int ldb, double* x, lapack_int ldx,
+                          double* rcond, double* ferr, double* berr);
+lapack_int LAPACKE_cptsvx(int matrix_order, char fact, lapack_int n, lapack_int nrhs, const float* d,
+                          const lapack_complex_float* e, float* df, lapack_complex_float* ef,
+                          const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                          float* rcond, float* ferr, float* berr);
+lapack_int LAPACKE_zptsvx(int matrix_order, char fact, lapack_int n, lapack_int nrhs, const double* d,
+                          const lapack_complex_double* e, double* df, lapack_complex_double* ef,
+                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                          double* rcond, double* ferr, double* berr);
+
+lapack_int LAPACKE_spttrf(lapack_int n, float* d, float* e);
+lapack_int LAPACKE_dpttrf(lapack_int n, double* d, double* e);
+lapack_int LAPACKE_cpttrf(lapack_int n, float* d, lapack_complex_float* e);
+lapack_int LAPACKE_zpttrf(lapack_int n, double* d, lapack_complex_double* e);
+
+lapack_int LAPACKE_spttrs(int matrix_order, lapack_int n, lapack_int nrhs, const float* d, const float* e, float* b,
+                          lapack_int ldb);
+lapack_int LAPACKE_dpttrs(int matrix_order, lapack_int n, lapack_int nrhs, const double* d, const double* e, double* b,
+                          lapack_int ldb);
+lapack_int LAPACKE_cpttrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* d,
+                          const lapack_complex_float* e, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zpttrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* d,
+                          const lapack_complex_double* e, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_ssbev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, float* ab,
+                         lapack_int ldab, float* w, float* z, lapack_int ldz);
+lapack_int LAPACKE_dsbev(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, double* ab,
+                         lapack_int ldab, double* w, double* z, lapack_int ldz);
+
+lapack_int LAPACKE_ssbevd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, float* ab,
+                          lapack_int ldab, float* w, float* z, lapack_int ldz);
+lapack_int LAPACKE_dsbevd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, double* ab,
+                          lapack_int ldab, double* w, double* z, lapack_int ldz);
+
+lapack_int LAPACKE_ssbevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int kd, float* ab,
+                          lapack_int ldab, float* q, lapack_int ldq, float vl, float vu, lapack_int il, lapack_int iu,
+                          float abstol, lapack_int* m, float* w, float* z, lapack_int ldz, lapack_int* ifail);
+lapack_int LAPACKE_dsbevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int kd, double* ab,
+                          lapack_int ldab, double* q, lapack_int ldq, double vl, double vu, lapack_int il,
+                          lapack_int iu, double abstol, lapack_int* m, double* w, double* z, lapack_int ldz,
+                          lapack_int* ifail);
+
+lapack_int LAPACKE_ssbgst(int matrix_order, char vect, char uplo, lapack_int n, lapack_int ka, lapack_int kb, float* ab,
+                          lapack_int ldab, const float* bb, lapack_int ldbb, float* x, lapack_int ldx);
+lapack_int LAPACKE_dsbgst(int matrix_order, char vect, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                          double* ab, lapack_int ldab, const double* bb, lapack_int ldbb, double* x, lapack_int ldx);
+
+lapack_int LAPACKE_ssbgv(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb, float* ab,
+                         lapack_int ldab, float* bb, lapack_int ldbb, float* w, float* z, lapack_int ldz);
+lapack_int LAPACKE_dsbgv(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb, double* ab,
+                         lapack_int ldab, double* bb, lapack_int ldbb, double* w, double* z, lapack_int ldz);
+
+lapack_int LAPACKE_ssbgvd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb, float* ab,
+                          lapack_int ldab, float* bb, lapack_int ldbb, float* w, float* z, lapack_int ldz);
+lapack_int LAPACKE_dsbgvd(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                          double* ab, lapack_int ldab, double* bb, lapack_int ldbb, double* w, double* z,
+                          lapack_int ldz);
+
+lapack_int LAPACKE_ssbgvx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int ka,
+                          lapack_int kb, float* ab, lapack_int ldab, float* bb, lapack_int ldbb, float* q,
+                          lapack_int ldq, float vl, float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m,
+                          float* w, float* z, lapack_int ldz, lapack_int* ifail);
+lapack_int LAPACKE_dsbgvx(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int ka,
+                          lapack_int kb, double* ab, lapack_int ldab, double* bb, lapack_int ldbb, double* q,
+                          lapack_int ldq, double vl, double vu, lapack_int il, lapack_int iu, double abstol,
+                          lapack_int* m, double* w, double* z, lapack_int ldz, lapack_int* ifail);
+
+lapack_int LAPACKE_ssbtrd(int matrix_order, char vect, char uplo, lapack_int n, lapack_int kd, float* ab,
+                          lapack_int ldab, float* d, float* e, float* q, lapack_int ldq);
+lapack_int LAPACKE_dsbtrd(int matrix_order, char vect, char uplo, lapack_int n, lapack_int kd, double* ab,
+                          lapack_int ldab, double* d, double* e, double* q, lapack_int ldq);
+
+lapack_int LAPACKE_ssfrk(int matrix_order, char transr, char uplo, char trans, lapack_int n, lapack_int k, float alpha,
+                         const float* a, lapack_int lda, float beta, float* c);
+lapack_int LAPACKE_dsfrk(int matrix_order, char transr, char uplo, char trans, lapack_int n, lapack_int k, double alpha,
+                         const double* a, lapack_int lda, double beta, double* c);
+
+lapack_int LAPACKE_sspcon(int matrix_order, char uplo, lapack_int n, const float* ap, const lapack_int* ipiv,
+                          float anorm, float* rcond);
+lapack_int LAPACKE_dspcon(int matrix_order, char uplo, lapack_int n, const double* ap, const lapack_int* ipiv,
+                          double anorm, double* rcond);
+lapack_int LAPACKE_cspcon(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap,
+                          const lapack_int* ipiv, float anorm, float* rcond);
+lapack_int LAPACKE_zspcon(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap,
+                          const lapack_int* ipiv, double anorm, double* rcond);
+
+lapack_int LAPACKE_sspev(int matrix_order, char jobz, char uplo, lapack_int n, float* ap, float* w, float* z,
+                         lapack_int ldz);
+lapack_int LAPACKE_dspev(int matrix_order, char jobz, char uplo, lapack_int n, double* ap, double* w, double* z,
+                         lapack_int ldz);
+
+lapack_int LAPACKE_sspevd(int matrix_order, char jobz, char uplo, lapack_int n, float* ap, float* w, float* z,
+                          lapack_int ldz);
+lapack_int LAPACKE_dspevd(int matrix_order, char jobz, char uplo, lapack_int n, double* ap, double* w, double* z,
+                          lapack_int ldz);
+
+lapack_int LAPACKE_sspevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, float* ap, float vl,
+                          float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w, float* z,
+                          lapack_int ldz, lapack_int* ifail);
+lapack_int LAPACKE_dspevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, double* ap, double vl,
+                          double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w, double* z,
+                          lapack_int ldz, lapack_int* ifail);
+
+lapack_int LAPACKE_sspgst(int matrix_order, lapack_int itype, char uplo, lapack_int n, float* ap, const float* bp);
+lapack_int LAPACKE_dspgst(int matrix_order, lapack_int itype, char uplo, lapack_int n, double* ap, const double* bp);
+
+lapack_int LAPACKE_sspgv(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, float* ap, float* bp,
+                         float* w, float* z, lapack_int ldz);
+lapack_int LAPACKE_dspgv(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, double* ap, double* bp,
+                         double* w, double* z, lapack_int ldz);
+
+lapack_int LAPACKE_sspgvd(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, float* ap, float* bp,
+                          float* w, float* z, lapack_int ldz);
+lapack_int LAPACKE_dspgvd(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, double* ap,
+                          double* bp, double* w, double* z, lapack_int ldz);
+
+lapack_int LAPACKE_sspgvx(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n, float* ap,
+                          float* bp, float vl, float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m,
+                          float* w, float* z, lapack_int ldz, lapack_int* ifail);
+lapack_int LAPACKE_dspgvx(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
+                          double* ap, double* bp, double vl, double vu, lapack_int il, lapack_int iu, double abstol,
+                          lapack_int* m, double* w, double* z, lapack_int ldz, lapack_int* ifail);
+
+lapack_int LAPACKE_ssprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* ap, const float* afp,
+                          const lapack_int* ipiv, const float* b, lapack_int ldb, float* x, lapack_int ldx, float* ferr,
+                          float* berr);
+lapack_int LAPACKE_dsprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* ap,
+                          const double* afp, const lapack_int* ipiv, const double* b, lapack_int ldb, double* x,
+                          lapack_int ldx, double* ferr, double* berr);
+lapack_int LAPACKE_csprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* ap,
+                          const lapack_complex_float* afp, const lapack_int* ipiv, const lapack_complex_float* b,
+                          lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr);
+lapack_int LAPACKE_zsprfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* ap,
+                          const lapack_complex_double* afp, const lapack_int* ipiv, const lapack_complex_double* b,
+                          lapack_int ldb, lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr);
+
+lapack_int LAPACKE_sspsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, float* ap, lapack_int* ipiv,
+                         float* b, lapack_int ldb);
+lapack_int LAPACKE_dspsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* ap, lapack_int* ipiv,
+                         double* b, lapack_int ldb);
+lapack_int LAPACKE_cspsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* ap,
+                         lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zspsv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* ap,
+                         lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_sspsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, const float* ap,
+                          float* afp, lapack_int* ipiv, const float* b, lapack_int ldb, float* x, lapack_int ldx,
+                          float* rcond, float* ferr, float* berr);
+lapack_int LAPACKE_dspsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, const double* ap,
+                          double* afp, lapack_int* ipiv, const double* b, lapack_int ldb, double* x, lapack_int ldx,
+                          double* rcond, double* ferr, double* berr);
+lapack_int LAPACKE_cspsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_float* ap, lapack_complex_float* afp, lapack_int* ipiv,
+                          const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                          float* rcond, float* ferr, float* berr);
+lapack_int LAPACKE_zspsvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_double* ap, lapack_complex_double* afp, lapack_int* ipiv,
+                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                          double* rcond, double* ferr, double* berr);
+
+lapack_int LAPACKE_ssptrd(int matrix_order, char uplo, lapack_int n, float* ap, float* d, float* e, float* tau);
+lapack_int LAPACKE_dsptrd(int matrix_order, char uplo, lapack_int n, double* ap, double* d, double* e, double* tau);
+
+lapack_int LAPACKE_ssptrf(int matrix_order, char uplo, lapack_int n, float* ap, lapack_int* ipiv);
+lapack_int LAPACKE_dsptrf(int matrix_order, char uplo, lapack_int n, double* ap, lapack_int* ipiv);
+lapack_int LAPACKE_csptrf(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap, lapack_int* ipiv);
+lapack_int LAPACKE_zsptrf(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap, lapack_int* ipiv);
+
+lapack_int LAPACKE_ssptri(int matrix_order, char uplo, lapack_int n, float* ap, const lapack_int* ipiv);
+lapack_int LAPACKE_dsptri(int matrix_order, char uplo, lapack_int n, double* ap, const lapack_int* ipiv);
+lapack_int LAPACKE_csptri(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap, const lapack_int* ipiv);
+lapack_int LAPACKE_zsptri(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap, const lapack_int* ipiv);
+
+lapack_int LAPACKE_ssptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* ap,
+                          const lapack_int* ipiv, float* b, lapack_int ldb);
+lapack_int LAPACKE_dsptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* ap,
+                          const lapack_int* ipiv, double* b, lapack_int ldb);
+lapack_int LAPACKE_csptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* ap,
+                          const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zsptrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* ap,
+                          const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_sstebz(char range, char order, lapack_int n, float vl, float vu, lapack_int il, lapack_int iu,
+                          float abstol, const float* d, const float* e, lapack_int* m, lapack_int* nsplit, float* w,
+                          lapack_int* iblock, lapack_int* isplit);
+lapack_int LAPACKE_dstebz(char range, char order, lapack_int n, double vl, double vu, lapack_int il, lapack_int iu,
+                          double abstol, const double* d, const double* e, lapack_int* m, lapack_int* nsplit, double* w,
+                          lapack_int* iblock, lapack_int* isplit);
+
+lapack_int LAPACKE_sstedc(int matrix_order, char compz, lapack_int n, float* d, float* e, float* z, lapack_int ldz);
+lapack_int LAPACKE_dstedc(int matrix_order, char compz, lapack_int n, double* d, double* e, double* z, lapack_int ldz);
+lapack_int LAPACKE_cstedc(int matrix_order, char compz, lapack_int n, float* d, float* e, lapack_complex_float* z,
+                          lapack_int ldz);
+lapack_int LAPACKE_zstedc(int matrix_order, char compz, lapack_int n, double* d, double* e, lapack_complex_double* z,
+                          lapack_int ldz);
+
+lapack_int LAPACKE_sstegr(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl, float vu,
+                          lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w, float* z, lapack_int ldz,
+                          lapack_int* isuppz);
+lapack_int LAPACKE_dstegr(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
+                          double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w, double* z,
+                          lapack_int ldz, lapack_int* isuppz);
+lapack_int LAPACKE_cstegr(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl, float vu,
+                          lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w, lapack_complex_float* z,
+                          lapack_int ldz, lapack_int* isuppz);
+lapack_int LAPACKE_zstegr(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
+                          double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
+                          lapack_complex_double* z, lapack_int ldz, lapack_int* isuppz);
+
+lapack_int LAPACKE_sstein(int matrix_order, lapack_int n, const float* d, const float* e, lapack_int m, const float* w,
+                          const lapack_int* iblock, const lapack_int* isplit, float* z, lapack_int ldz,
+                          lapack_int* ifailv);
+lapack_int LAPACKE_dstein(int matrix_order, lapack_int n, const double* d, const double* e, lapack_int m,
+                          const double* w, const lapack_int* iblock, const lapack_int* isplit, double* z,
+                          lapack_int ldz, lapack_int* ifailv);
+lapack_int LAPACKE_cstein(int matrix_order, lapack_int n, const float* d, const float* e, lapack_int m, const float* w,
+                          const lapack_int* iblock, const lapack_int* isplit, lapack_complex_float* z, lapack_int ldz,
+                          lapack_int* ifailv);
+lapack_int LAPACKE_zstein(int matrix_order, lapack_int n, const double* d, const double* e, lapack_int m,
+                          const double* w, const lapack_int* iblock, const lapack_int* isplit, lapack_complex_double* z,
+                          lapack_int ldz, lapack_int* ifailv);
+
+lapack_int LAPACKE_sstemr(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl, float vu,
+                          lapack_int il, lapack_int iu, lapack_int* m, float* w, float* z, lapack_int ldz,
+                          lapack_int nzc, lapack_int* isuppz, lapack_logical* tryrac);
+lapack_int LAPACKE_dstemr(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
+                          double vu, lapack_int il, lapack_int iu, lapack_int* m, double* w, double* z, lapack_int ldz,
+                          lapack_int nzc, lapack_int* isuppz, lapack_logical* tryrac);
+lapack_int LAPACKE_cstemr(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl, float vu,
+                          lapack_int il, lapack_int iu, lapack_int* m, float* w, lapack_complex_float* z,
+                          lapack_int ldz, lapack_int nzc, lapack_int* isuppz, lapack_logical* tryrac);
+lapack_int LAPACKE_zstemr(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
+                          double vu, lapack_int il, lapack_int iu, lapack_int* m, double* w, lapack_complex_double* z,
+                          lapack_int ldz, lapack_int nzc, lapack_int* isuppz, lapack_logical* tryrac);
+
+lapack_int LAPACKE_ssteqr(int matrix_order, char compz, lapack_int n, float* d, float* e, float* z, lapack_int ldz);
+lapack_int LAPACKE_dsteqr(int matrix_order, char compz, lapack_int n, double* d, double* e, double* z, lapack_int ldz);
+lapack_int LAPACKE_csteqr(int matrix_order, char compz, lapack_int n, float* d, float* e, lapack_complex_float* z,
+                          lapack_int ldz);
+lapack_int LAPACKE_zsteqr(int matrix_order, char compz, lapack_int n, double* d, double* e, lapack_complex_double* z,
+                          lapack_int ldz);
+
+lapack_int LAPACKE_ssterf(lapack_int n, float* d, float* e);
+lapack_int LAPACKE_dsterf(lapack_int n, double* d, double* e);
+
+lapack_int LAPACKE_sstev(int matrix_order, char jobz, lapack_int n, float* d, float* e, float* z, lapack_int ldz);
+lapack_int LAPACKE_dstev(int matrix_order, char jobz, lapack_int n, double* d, double* e, double* z, lapack_int ldz);
+
+lapack_int LAPACKE_sstevd(int matrix_order, char jobz, lapack_int n, float* d, float* e, float* z, lapack_int ldz);
+lapack_int LAPACKE_dstevd(int matrix_order, char jobz, lapack_int n, double* d, double* e, double* z, lapack_int ldz);
+
+lapack_int LAPACKE_sstevr(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl, float vu,
+                          lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w, float* z, lapack_int ldz,
+                          lapack_int* isuppz);
+lapack_int LAPACKE_dstevr(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
+                          double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w, double* z,
+                          lapack_int ldz, lapack_int* isuppz);
+
+lapack_int LAPACKE_sstevx(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl, float vu,
+                          lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w, float* z, lapack_int ldz,
+                          lapack_int* ifail);
+lapack_int LAPACKE_dstevx(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
+                          double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w, double* z,
+                          lapack_int ldz, lapack_int* ifail);
+
+lapack_int LAPACKE_ssycon(int matrix_order, char uplo, lapack_int n, const float* a, lapack_int lda,
+                          const lapack_int* ipiv, float anorm, float* rcond);
+lapack_int LAPACKE_dsycon(int matrix_order, char uplo, lapack_int n, const double* a, lapack_int lda,
+                          const lapack_int* ipiv, double anorm, double* rcond);
+lapack_int LAPACKE_csycon(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
+                          const lapack_int* ipiv, float anorm, float* rcond);
+lapack_int LAPACKE_zsycon(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a, lapack_int lda,
+                          const lapack_int* ipiv, double anorm, double* rcond);
+
+lapack_int LAPACKE_ssyequb(int matrix_order, char uplo, lapack_int n, const float* a, lapack_int lda, float* s,
+                           float* scond, float* amax);
+lapack_int LAPACKE_dsyequb(int matrix_order, char uplo, lapack_int n, const double* a, lapack_int lda, double* s,
+                           double* scond, double* amax);
+lapack_int LAPACKE_csyequb(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
+                           float* s, float* scond, float* amax);
+lapack_int LAPACKE_zsyequb(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a, lapack_int lda,
+                           double* s, double* scond, double* amax);
+
+lapack_int LAPACKE_ssyev(int matrix_order, char jobz, char uplo, lapack_int n, float* a, lapack_int lda, float* w);
+lapack_int LAPACKE_dsyev(int matrix_order, char jobz, char uplo, lapack_int n, double* a, lapack_int lda, double* w);
+
+lapack_int LAPACKE_ssyevd(int matrix_order, char jobz, char uplo, lapack_int n, float* a, lapack_int lda, float* w);
+lapack_int LAPACKE_dsyevd(int matrix_order, char jobz, char uplo, lapack_int n, double* a, lapack_int lda, double* w);
+
+lapack_int LAPACKE_ssyevr(int matrix_order, char jobz, char range, char uplo, lapack_int n, float* a, lapack_int lda,
+                          float vl, float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w,
+                          float* z, lapack_int ldz, lapack_int* isuppz);
+lapack_int LAPACKE_dsyevr(int matrix_order, char jobz, char range, char uplo, lapack_int n, double* a, lapack_int lda,
+                          double vl, double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
+                          double* z, lapack_int ldz, lapack_int* isuppz);
+
+lapack_int LAPACKE_ssyevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, float* a, lapack_int lda,
+                          float vl, float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w,
+                          float* z, lapack_int ldz, lapack_int* ifail);
+lapack_int LAPACKE_dsyevx(int matrix_order, char jobz, char range, char uplo, lapack_int n, double* a, lapack_int lda,
+                          double vl, double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
+                          double* z, lapack_int ldz, lapack_int* ifail);
+
+lapack_int LAPACKE_ssygst(int matrix_order, lapack_int itype, char uplo, lapack_int n, float* a, lapack_int lda,
+                          const float* b, lapack_int ldb);
+lapack_int LAPACKE_dsygst(int matrix_order, lapack_int itype, char uplo, lapack_int n, double* a, lapack_int lda,
+                          const double* b, lapack_int ldb);
+
+lapack_int LAPACKE_ssygv(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, float* a,
+                         lapack_int lda, float* b, lapack_int ldb, float* w);
+lapack_int LAPACKE_dsygv(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, double* a,
+                         lapack_int lda, double* b, lapack_int ldb, double* w);
+
+lapack_int LAPACKE_ssygvd(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, float* a,
+                          lapack_int lda, float* b, lapack_int ldb, float* w);
+lapack_int LAPACKE_dsygvd(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, double* a,
+                          lapack_int lda, double* b, lapack_int ldb, double* w);
+
+lapack_int LAPACKE_ssygvx(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n, float* a,
+                          lapack_int lda, float* b, lapack_int ldb, float vl, float vu, lapack_int il, lapack_int iu,
+                          float abstol, lapack_int* m, float* w, float* z, lapack_int ldz, lapack_int* ifail);
+lapack_int LAPACKE_dsygvx(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n, double* a,
+                          lapack_int lda, double* b, lapack_int ldb, double vl, double vu, lapack_int il, lapack_int iu,
+                          double abstol, lapack_int* m, double* w, double* z, lapack_int ldz, lapack_int* ifail);
+
+lapack_int LAPACKE_ssyrfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a, lapack_int lda,
+                          const float* af, lapack_int ldaf, const lapack_int* ipiv, const float* b, lapack_int ldb,
+                          float* x, lapack_int ldx, float* ferr, float* berr);
+lapack_int LAPACKE_dsyrfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a, lapack_int lda,
+                          const double* af, lapack_int ldaf, const lapack_int* ipiv, const double* b, lapack_int ldb,
+                          double* x, lapack_int ldx, double* ferr, double* berr);
+lapack_int LAPACKE_csyrfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
+                          lapack_int lda, const lapack_complex_float* af, lapack_int ldaf, const lapack_int* ipiv,
+                          const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                          float* ferr, float* berr);
+lapack_int LAPACKE_zsyrfs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
+                          lapack_int lda, const lapack_complex_double* af, lapack_int ldaf, const lapack_int* ipiv,
+                          const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                          double* ferr, double* berr);
+
+lapack_int LAPACKE_ssyrfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs, const float* a,
+                           lapack_int lda, const float* af, lapack_int ldaf, const lapack_int* ipiv, const float* s,
+                           const float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* berr,
+                           lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
+                           float* params);
+lapack_int LAPACKE_dsyrfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs, const double* a,
+                           lapack_int lda, const double* af, lapack_int ldaf, const lapack_int* ipiv, const double* s,
+                           const double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* berr,
+                           lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
+                           double* params);
+lapack_int LAPACKE_csyrfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
+                           const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
+                           lapack_int ldaf, const lapack_int* ipiv, const float* s, const lapack_complex_float* b,
+                           lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* rcond, float* berr,
+                           lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
+                           float* params);
+lapack_int LAPACKE_zsyrfsx(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
+                           const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
+                           lapack_int ldaf, const lapack_int* ipiv, const double* s, const lapack_complex_double* b,
+                           lapack_int ldb, lapack_complex_double* x, lapack_int ldx, double* rcond, double* berr,
+                           lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
+                           double* params);
+
+lapack_int LAPACKE_ssysv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
+                         lapack_int* ipiv, float* b, lapack_int ldb);
+lapack_int LAPACKE_dsysv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
+                         lapack_int* ipiv, double* b, lapack_int ldb);
+lapack_int LAPACKE_csysv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
+                         lapack_int lda, lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zsysv(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
+                         lapack_int lda, lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_ssysvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, const float* a,
+                          lapack_int lda, float* af, lapack_int ldaf, lapack_int* ipiv, const float* b, lapack_int ldb,
+                          float* x, lapack_int ldx, float* rcond, float* ferr, float* berr);
+lapack_int LAPACKE_dsysvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, const double* a,
+                          lapack_int lda, double* af, lapack_int ldaf, lapack_int* ipiv, const double* b,
+                          lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* ferr, double* berr);
+lapack_int LAPACKE_csysvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
+                          lapack_int* ipiv, const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
+                          lapack_int ldx, float* rcond, float* ferr, float* berr);
+lapack_int LAPACKE_zsysvx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
+                          lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
+                          lapack_int ldx, double* rcond, double* ferr, double* berr);
+
+lapack_int LAPACKE_ssysvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, float* a,
+                           lapack_int lda, float* af, lapack_int ldaf, lapack_int* ipiv, char* equed, float* s,
+                           float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* rpvgrw, float* berr,
+                           lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
+                           float* params);
+lapack_int LAPACKE_dsysvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, double* a,
+                           lapack_int lda, double* af, lapack_int ldaf, lapack_int* ipiv, char* equed, double* s,
+                           double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* rpvgrw,
+                           double* berr, lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
+                           lapack_int nparams, double* params);
+lapack_int LAPACKE_csysvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                           lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
+                           lapack_int* ipiv, char* equed, float* s, lapack_complex_float* b, lapack_int ldb,
+                           lapack_complex_float* x, lapack_int ldx, float* rcond, float* rpvgrw, float* berr,
+                           lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
+                           float* params);
+lapack_int LAPACKE_zsysvxx(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                           lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
+                           lapack_int* ipiv, char* equed, double* s, lapack_complex_double* b, lapack_int ldb,
+                           lapack_complex_double* x, lapack_int ldx, double* rcond, double* rpvgrw, double* berr,
+                           lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
+                           double* params);
+
+lapack_int LAPACKE_ssytrd(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, float* d, float* e,
+                          float* tau);
+lapack_int LAPACKE_dsytrd(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda, double* d, double* e,
+                          double* tau);
+
+lapack_int LAPACKE_ssytrf(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, lapack_int* ipiv);
+lapack_int LAPACKE_dsytrf(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda, lapack_int* ipiv);
+lapack_int LAPACKE_csytrf(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_int* ipiv);
+lapack_int LAPACKE_zsytrf(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          lapack_int* ipiv);
+
+lapack_int LAPACKE_ssytri(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, const lapack_int* ipiv);
+lapack_int LAPACKE_dsytri(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda, const lapack_int* ipiv);
+lapack_int LAPACKE_csytri(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          const lapack_int* ipiv);
+lapack_int LAPACKE_zsytri(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          const lapack_int* ipiv);
+
+lapack_int LAPACKE_ssytrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a, lapack_int lda,
+                          const lapack_int* ipiv, float* b, lapack_int ldb);
+lapack_int LAPACKE_dsytrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a, lapack_int lda,
+                          const lapack_int* ipiv, double* b, lapack_int ldb);
+lapack_int LAPACKE_csytrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
+                          lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zsytrs(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
+                          lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_stbcon(int matrix_order, char norm, char uplo, char diag, lapack_int n, lapack_int kd,
+                          const float* ab, lapack_int ldab, float* rcond);
+lapack_int LAPACKE_dtbcon(int matrix_order, char norm, char uplo, char diag, lapack_int n, lapack_int kd,
+                          const double* ab, lapack_int ldab, double* rcond);
+lapack_int LAPACKE_ctbcon(int matrix_order, char norm, char uplo, char diag, lapack_int n, lapack_int kd,
+                          const lapack_complex_float* ab, lapack_int ldab, float* rcond);
+lapack_int LAPACKE_ztbcon(int matrix_order, char norm, char uplo, char diag, lapack_int n, lapack_int kd,
+                          const lapack_complex_double* ab, lapack_int ldab, double* rcond);
+
+lapack_int LAPACKE_stbrfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
+                          lapack_int nrhs, const float* ab, lapack_int ldab, const float* b, lapack_int ldb,
+                          const float* x, lapack_int ldx, float* ferr, float* berr);
+lapack_int LAPACKE_dtbrfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
+                          lapack_int nrhs, const double* ab, lapack_int ldab, const double* b, lapack_int ldb,
+                          const double* x, lapack_int ldx, double* ferr, double* berr);
+lapack_int LAPACKE_ctbrfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
+                          lapack_int nrhs, const lapack_complex_float* ab, lapack_int ldab,
+                          const lapack_complex_float* b, lapack_int ldb, const lapack_complex_float* x, lapack_int ldx,
+                          float* ferr, float* berr);
+lapack_int LAPACKE_ztbrfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
+                          lapack_int nrhs, const lapack_complex_double* ab, lapack_int ldab,
+                          const lapack_complex_double* b, lapack_int ldb, const lapack_complex_double* x,
+                          lapack_int ldx, double* ferr, double* berr);
+
+lapack_int LAPACKE_stbtrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
+                          lapack_int nrhs, const float* ab, lapack_int ldab, float* b, lapack_int ldb);
+lapack_int LAPACKE_dtbtrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
+                          lapack_int nrhs, const double* ab, lapack_int ldab, double* b, lapack_int ldb);
+lapack_int LAPACKE_ctbtrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
+                          lapack_int nrhs, const lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* b,
+                          lapack_int ldb);
+lapack_int LAPACKE_ztbtrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
+                          lapack_int nrhs, const lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* b,
+                          lapack_int ldb);
+
+lapack_int LAPACKE_stfsm(int matrix_order, char transr, char side, char uplo, char trans, char diag, lapack_int m,
+                         lapack_int n, float alpha, const float* a, float* b, lapack_int ldb);
+lapack_int LAPACKE_dtfsm(int matrix_order, char transr, char side, char uplo, char trans, char diag, lapack_int m,
+                         lapack_int n, double alpha, const double* a, double* b, lapack_int ldb);
+lapack_int LAPACKE_ctfsm(int matrix_order, char transr, char side, char uplo, char trans, char diag, lapack_int m,
+                         lapack_int n, lapack_complex_float alpha, const lapack_complex_float* a,
+                         lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_ztfsm(int matrix_order, char transr, char side, char uplo, char trans, char diag, lapack_int m,
+                         lapack_int n, lapack_complex_double alpha, const lapack_complex_double* a,
+                         lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_stftri(int matrix_order, char transr, char uplo, char diag, lapack_int n, float* a);
+lapack_int LAPACKE_dtftri(int matrix_order, char transr, char uplo, char diag, lapack_int n, double* a);
+lapack_int LAPACKE_ctftri(int matrix_order, char transr, char uplo, char diag, lapack_int n, lapack_complex_float* a);
+lapack_int LAPACKE_ztftri(int matrix_order, char transr, char uplo, char diag, lapack_int n, lapack_complex_double* a);
+
+lapack_int LAPACKE_stfttp(int matrix_order, char transr, char uplo, lapack_int n, const float* arf, float* ap);
+lapack_int LAPACKE_dtfttp(int matrix_order, char transr, char uplo, lapack_int n, const double* arf, double* ap);
+lapack_int LAPACKE_ctfttp(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_float* arf,
+                          lapack_complex_float* ap);
+lapack_int LAPACKE_ztfttp(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_double* arf,
+                          lapack_complex_double* ap);
+
+lapack_int LAPACKE_stfttr(int matrix_order, char transr, char uplo, lapack_int n, const float* arf, float* a,
+                          lapack_int lda);
+lapack_int LAPACKE_dtfttr(int matrix_order, char transr, char uplo, lapack_int n, const double* arf, double* a,
+                          lapack_int lda);
+lapack_int LAPACKE_ctfttr(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_float* arf,
+                          lapack_complex_float* a, lapack_int lda);
+lapack_int LAPACKE_ztfttr(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_double* arf,
+                          lapack_complex_double* a, lapack_int lda);
+
+lapack_int LAPACKE_stgevc(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
+                          const float* s, lapack_int lds, const float* p, lapack_int ldp, float* vl, lapack_int ldvl,
+                          float* vr, lapack_int ldvr, lapack_int mm, lapack_int* m);
+lapack_int LAPACKE_dtgevc(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
+                          const double* s, lapack_int lds, const double* p, lapack_int ldp, double* vl, lapack_int ldvl,
+                          double* vr, lapack_int ldvr, lapack_int mm, lapack_int* m);
+lapack_int LAPACKE_ctgevc(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
+                          const lapack_complex_float* s, lapack_int lds, const lapack_complex_float* p, lapack_int ldp,
+                          lapack_complex_float* vl, lapack_int ldvl, lapack_complex_float* vr, lapack_int ldvr,
+                          lapack_int mm, lapack_int* m);
+lapack_int LAPACKE_ztgevc(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
+                          const lapack_complex_double* s, lapack_int lds, const lapack_complex_double* p,
+                          lapack_int ldp, lapack_complex_double* vl, lapack_int ldvl, lapack_complex_double* vr,
+                          lapack_int ldvr, lapack_int mm, lapack_int* m);
+
+lapack_int LAPACKE_stgexc(int matrix_order, lapack_logical wantq, lapack_logical wantz, lapack_int n, float* a,
+                          lapack_int lda, float* b, lapack_int ldb, float* q, lapack_int ldq, float* z, lapack_int ldz,
+                          lapack_int* ifst, lapack_int* ilst);
+lapack_int LAPACKE_dtgexc(int matrix_order, lapack_logical wantq, lapack_logical wantz, lapack_int n, double* a,
+                          lapack_int lda, double* b, lapack_int ldb, double* q, lapack_int ldq, double* z,
+                          lapack_int ldz, lapack_int* ifst, lapack_int* ilst);
+lapack_int LAPACKE_ctgexc(int matrix_order, lapack_logical wantq, lapack_logical wantz, lapack_int n,
+                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
+                          lapack_complex_float* q, lapack_int ldq, lapack_complex_float* z, lapack_int ldz,
+                          lapack_int ifst, lapack_int ilst);
+lapack_int LAPACKE_ztgexc(int matrix_order, lapack_logical wantq, lapack_logical wantz, lapack_int n,
+                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
+                          lapack_complex_double* q, lapack_int ldq, lapack_complex_double* z, lapack_int ldz,
+                          lapack_int ifst, lapack_int ilst);
+
+lapack_int LAPACKE_stgsen(int matrix_order, lapack_int ijob, lapack_logical wantq, lapack_logical wantz,
+                          const lapack_logical* select, lapack_int n, float* a, lapack_int lda, float* b,
+                          lapack_int ldb, float* alphar, float* alphai, float* beta, float* q, lapack_int ldq, float* z,
+                          lapack_int ldz, lapack_int* m, float* pl, float* pr, float* dif);
+lapack_int LAPACKE_dtgsen(int matrix_order, lapack_int ijob, lapack_logical wantq, lapack_logical wantz,
+                          const lapack_logical* select, lapack_int n, double* a, lapack_int lda, double* b,
+                          lapack_int ldb, double* alphar, double* alphai, double* beta, double* q, lapack_int ldq,
+                          double* z, lapack_int ldz, lapack_int* m, double* pl, double* pr, double* dif);
+lapack_int LAPACKE_ctgsen(int matrix_order, lapack_int ijob, lapack_logical wantq, lapack_logical wantz,
+                          const lapack_logical* select, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* b, lapack_int ldb, lapack_complex_float* alpha,
+                          lapack_complex_float* beta, lapack_complex_float* q, lapack_int ldq, lapack_complex_float* z,
+                          lapack_int ldz, lapack_int* m, float* pl, float* pr, float* dif);
+lapack_int LAPACKE_ztgsen(int matrix_order, lapack_int ijob, lapack_logical wantq, lapack_logical wantz,
+                          const lapack_logical* select, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* b, lapack_int ldb, lapack_complex_double* alpha,
+                          lapack_complex_double* beta, lapack_complex_double* q, lapack_int ldq,
+                          lapack_complex_double* z, lapack_int ldz, lapack_int* m, double* pl, double* pr, double* dif);
+
+lapack_int LAPACKE_stgsja(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p, lapack_int n,
+                          lapack_int k, lapack_int l, float* a, lapack_int lda, float* b, lapack_int ldb, float tola,
+                          float tolb, float* alpha, float* beta, float* u, lapack_int ldu, float* v, lapack_int ldv,
+                          float* q, lapack_int ldq, lapack_int* ncycle);
+lapack_int LAPACKE_dtgsja(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p, lapack_int n,
+                          lapack_int k, lapack_int l, double* a, lapack_int lda, double* b, lapack_int ldb, double tola,
+                          double tolb, double* alpha, double* beta, double* u, lapack_int ldu, double* v,
+                          lapack_int ldv, double* q, lapack_int ldq, lapack_int* ncycle);
+lapack_int LAPACKE_ctgsja(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p, lapack_int n,
+                          lapack_int k, lapack_int l, lapack_complex_float* a, lapack_int lda, lapack_complex_float* b,
+                          lapack_int ldb, float tola, float tolb, float* alpha, float* beta, lapack_complex_float* u,
+                          lapack_int ldu, lapack_complex_float* v, lapack_int ldv, lapack_complex_float* q,
+                          lapack_int ldq, lapack_int* ncycle);
+lapack_int LAPACKE_ztgsja(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p, lapack_int n,
+                          lapack_int k, lapack_int l, lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* b, lapack_int ldb, double tola, double tolb, double* alpha,
+                          double* beta, lapack_complex_double* u, lapack_int ldu, lapack_complex_double* v,
+                          lapack_int ldv, lapack_complex_double* q, lapack_int ldq, lapack_int* ncycle);
+
+lapack_int LAPACKE_stgsna(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
+                          const float* a, lapack_int lda, const float* b, lapack_int ldb, const float* vl,
+                          lapack_int ldvl, const float* vr, lapack_int ldvr, float* s, float* dif, lapack_int mm,
+                          lapack_int* m);
+lapack_int LAPACKE_dtgsna(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
+                          const double* a, lapack_int lda, const double* b, lapack_int ldb, const double* vl,
+                          lapack_int ldvl, const double* vr, lapack_int ldvr, double* s, double* dif, lapack_int mm,
+                          lapack_int* m);
+lapack_int LAPACKE_ctgsna(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
+                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* b, lapack_int ldb,
+                          const lapack_complex_float* vl, lapack_int ldvl, const lapack_complex_float* vr,
+                          lapack_int ldvr, float* s, float* dif, lapack_int mm, lapack_int* m);
+lapack_int LAPACKE_ztgsna(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
+                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* b,
+                          lapack_int ldb, const lapack_complex_double* vl, lapack_int ldvl,
+                          const lapack_complex_double* vr, lapack_int ldvr, double* s, double* dif, lapack_int mm,
+                          lapack_int* m);
+
+lapack_int LAPACKE_stgsyl(int matrix_order, char trans, lapack_int ijob, lapack_int m, lapack_int n, const float* a,
+                          lapack_int lda, const float* b, lapack_int ldb, float* c, lapack_int ldc, const float* d,
+                          lapack_int ldd, const float* e, lapack_int lde, float* f, lapack_int ldf, float* scale,
+                          float* dif);
+lapack_int LAPACKE_dtgsyl(int matrix_order, char trans, lapack_int ijob, lapack_int m, lapack_int n, const double* a,
+                          lapack_int lda, const double* b, lapack_int ldb, double* c, lapack_int ldc, const double* d,
+                          lapack_int ldd, const double* e, lapack_int lde, double* f, lapack_int ldf, double* scale,
+                          double* dif);
+lapack_int LAPACKE_ctgsyl(int matrix_order, char trans, lapack_int ijob, lapack_int m, lapack_int n,
+                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* b, lapack_int ldb,
+                          lapack_complex_float* c, lapack_int ldc, const lapack_complex_float* d, lapack_int ldd,
+                          const lapack_complex_float* e, lapack_int lde, lapack_complex_float* f, lapack_int ldf,
+                          float* scale, float* dif);
+lapack_int LAPACKE_ztgsyl(int matrix_order, char trans, lapack_int ijob, lapack_int m, lapack_int n,
+                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* b,
+                          lapack_int ldb, lapack_complex_double* c, lapack_int ldc, const lapack_complex_double* d,
+                          lapack_int ldd, const lapack_complex_double* e, lapack_int lde, lapack_complex_double* f,
+                          lapack_int ldf, double* scale, double* dif);
+
+lapack_int LAPACKE_stpcon(int matrix_order, char norm, char uplo, char diag, lapack_int n, const float* ap,
+                          float* rcond);
+lapack_int LAPACKE_dtpcon(int matrix_order, char norm, char uplo, char diag, lapack_int n, const double* ap,
+                          double* rcond);
+lapack_int LAPACKE_ctpcon(int matrix_order, char norm, char uplo, char diag, lapack_int n,
+                          const lapack_complex_float* ap, float* rcond);
+lapack_int LAPACKE_ztpcon(int matrix_order, char norm, char uplo, char diag, lapack_int n,
+                          const lapack_complex_double* ap, double* rcond);
+
+lapack_int LAPACKE_stprfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                          const float* ap, const float* b, lapack_int ldb, const float* x, lapack_int ldx, float* ferr,
+                          float* berr);
+lapack_int LAPACKE_dtprfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                          const double* ap, const double* b, lapack_int ldb, const double* x, lapack_int ldx,
+                          double* ferr, double* berr);
+lapack_int LAPACKE_ctprfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_float* ap, const lapack_complex_float* b, lapack_int ldb,
+                          const lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr);
+lapack_int LAPACKE_ztprfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_double* ap, const lapack_complex_double* b, lapack_int ldb,
+                          const lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr);
+
+lapack_int LAPACKE_stptri(int matrix_order, char uplo, char diag, lapack_int n, float* ap);
+lapack_int LAPACKE_dtptri(int matrix_order, char uplo, char diag, lapack_int n, double* ap);
+lapack_int LAPACKE_ctptri(int matrix_order, char uplo, char diag, lapack_int n, lapack_complex_float* ap);
+lapack_int LAPACKE_ztptri(int matrix_order, char uplo, char diag, lapack_int n, lapack_complex_double* ap);
+
+lapack_int LAPACKE_stptrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                          const float* ap, float* b, lapack_int ldb);
+lapack_int LAPACKE_dtptrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                          const double* ap, double* b, lapack_int ldb);
+lapack_int LAPACKE_ctptrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_float* ap, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_ztptrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_double* ap, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_stpttf(int matrix_order, char transr, char uplo, lapack_int n, const float* ap, float* arf);
+lapack_int LAPACKE_dtpttf(int matrix_order, char transr, char uplo, lapack_int n, const double* ap, double* arf);
+lapack_int LAPACKE_ctpttf(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_float* ap,
+                          lapack_complex_float* arf);
+lapack_int LAPACKE_ztpttf(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_double* ap,
+                          lapack_complex_double* arf);
+
+lapack_int LAPACKE_stpttr(int matrix_order, char uplo, lapack_int n, const float* ap, float* a, lapack_int lda);
+lapack_int LAPACKE_dtpttr(int matrix_order, char uplo, lapack_int n, const double* ap, double* a, lapack_int lda);
+lapack_int LAPACKE_ctpttr(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap,
+                          lapack_complex_float* a, lapack_int lda);
+lapack_int LAPACKE_ztpttr(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap,
+                          lapack_complex_double* a, lapack_int lda);
+
+lapack_int LAPACKE_strcon(int matrix_order, char norm, char uplo, char diag, lapack_int n, const float* a,
+                          lapack_int lda, float* rcond);
+lapack_int LAPACKE_dtrcon(int matrix_order, char norm, char uplo, char diag, lapack_int n, const double* a,
+                          lapack_int lda, double* rcond);
+lapack_int LAPACKE_ctrcon(int matrix_order, char norm, char uplo, char diag, lapack_int n,
+                          const lapack_complex_float* a, lapack_int lda, float* rcond);
+lapack_int LAPACKE_ztrcon(int matrix_order, char norm, char uplo, char diag, lapack_int n,
+                          const lapack_complex_double* a, lapack_int lda, double* rcond);
+
+lapack_int LAPACKE_strevc(int matrix_order, char side, char howmny, lapack_logical* select, lapack_int n,
+                          const float* t, lapack_int ldt, float* vl, lapack_int ldvl, float* vr, lapack_int ldvr,
+                          lapack_int mm, lapack_int* m);
+lapack_int LAPACKE_dtrevc(int matrix_order, char side, char howmny, lapack_logical* select, lapack_int n,
+                          const double* t, lapack_int ldt, double* vl, lapack_int ldvl, double* vr, lapack_int ldvr,
+                          lapack_int mm, lapack_int* m);
+lapack_int LAPACKE_ctrevc(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
+                          lapack_complex_float* t, lapack_int ldt, lapack_complex_float* vl, lapack_int ldvl,
+                          lapack_complex_float* vr, lapack_int ldvr, lapack_int mm, lapack_int* m);
+lapack_int LAPACKE_ztrevc(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
+                          lapack_complex_double* t, lapack_int ldt, lapack_complex_double* vl, lapack_int ldvl,
+                          lapack_complex_double* vr, lapack_int ldvr, lapack_int mm, lapack_int* m);
+
+lapack_int LAPACKE_strexc(int matrix_order, char compq, lapack_int n, float* t, lapack_int ldt, float* q,
+                          lapack_int ldq, lapack_int* ifst, lapack_int* ilst);
+lapack_int LAPACKE_dtrexc(int matrix_order, char compq, lapack_int n, double* t, lapack_int ldt, double* q,
+                          lapack_int ldq, lapack_int* ifst, lapack_int* ilst);
+lapack_int LAPACKE_ctrexc(int matrix_order, char compq, lapack_int n, lapack_complex_float* t, lapack_int ldt,
+                          lapack_complex_float* q, lapack_int ldq, lapack_int ifst, lapack_int ilst);
+lapack_int LAPACKE_ztrexc(int matrix_order, char compq, lapack_int n, lapack_complex_double* t, lapack_int ldt,
+                          lapack_complex_double* q, lapack_int ldq, lapack_int ifst, lapack_int ilst);
+
+lapack_int LAPACKE_strrfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                          const float* a, lapack_int lda, const float* b, lapack_int ldb, const float* x,
+                          lapack_int ldx, float* ferr, float* berr);
+lapack_int LAPACKE_dtrrfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                          const double* a, lapack_int lda, const double* b, lapack_int ldb, const double* x,
+                          lapack_int ldx, double* ferr, double* berr);
+lapack_int LAPACKE_ctrrfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* b, lapack_int ldb,
+                          const lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr);
+lapack_int LAPACKE_ztrrfs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* b,
+                          lapack_int ldb, const lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr);
+
+lapack_int LAPACKE_strsen(int matrix_order, char job, char compq, const lapack_logical* select, lapack_int n, float* t,
+                          lapack_int ldt, float* q, lapack_int ldq, float* wr, float* wi, lapack_int* m, float* s,
+                          float* sep);
+lapack_int LAPACKE_dtrsen(int matrix_order, char job, char compq, const lapack_logical* select, lapack_int n, double* t,
+                          lapack_int ldt, double* q, lapack_int ldq, double* wr, double* wi, lapack_int* m, double* s,
+                          double* sep);
+lapack_int LAPACKE_ctrsen(int matrix_order, char job, char compq, const lapack_logical* select, lapack_int n,
+                          lapack_complex_float* t, lapack_int ldt, lapack_complex_float* q, lapack_int ldq,
+                          lapack_complex_float* w, lapack_int* m, float* s, float* sep);
+lapack_int LAPACKE_ztrsen(int matrix_order, char job, char compq, const lapack_logical* select, lapack_int n,
+                          lapack_complex_double* t, lapack_int ldt, lapack_complex_double* q, lapack_int ldq,
+                          lapack_complex_double* w, lapack_int* m, double* s, double* sep);
+
+lapack_int LAPACKE_strsna(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
+                          const float* t, lapack_int ldt, const float* vl, lapack_int ldvl, const float* vr,
+                          lapack_int ldvr, float* s, float* sep, lapack_int mm, lapack_int* m);
+lapack_int LAPACKE_dtrsna(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
+                          const double* t, lapack_int ldt, const double* vl, lapack_int ldvl, const double* vr,
+                          lapack_int ldvr, double* s, double* sep, lapack_int mm, lapack_int* m);
+lapack_int LAPACKE_ctrsna(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
+                          const lapack_complex_float* t, lapack_int ldt, const lapack_complex_float* vl,
+                          lapack_int ldvl, const lapack_complex_float* vr, lapack_int ldvr, float* s, float* sep,
+                          lapack_int mm, lapack_int* m);
+lapack_int LAPACKE_ztrsna(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
+                          const lapack_complex_double* t, lapack_int ldt, const lapack_complex_double* vl,
+                          lapack_int ldvl, const lapack_complex_double* vr, lapack_int ldvr, double* s, double* sep,
+                          lapack_int mm, lapack_int* m);
+
+lapack_int LAPACKE_strsyl(int matrix_order, char trana, char tranb, lapack_int isgn, lapack_int m, lapack_int n,
+                          const float* a, lapack_int lda, const float* b, lapack_int ldb, float* c, lapack_int ldc,
+                          float* scale);
+lapack_int LAPACKE_dtrsyl(int matrix_order, char trana, char tranb, lapack_int isgn, lapack_int m, lapack_int n,
+                          const double* a, lapack_int lda, const double* b, lapack_int ldb, double* c, lapack_int ldc,
+                          double* scale);
+lapack_int LAPACKE_ctrsyl(int matrix_order, char trana, char tranb, lapack_int isgn, lapack_int m, lapack_int n,
+                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* b, lapack_int ldb,
+                          lapack_complex_float* c, lapack_int ldc, float* scale);
+lapack_int LAPACKE_ztrsyl(int matrix_order, char trana, char tranb, lapack_int isgn, lapack_int m, lapack_int n,
+                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* b,
+                          lapack_int ldb, lapack_complex_double* c, lapack_int ldc, double* scale);
+
+lapack_int LAPACKE_strtri(int matrix_order, char uplo, char diag, lapack_int n, float* a, lapack_int lda);
+lapack_int LAPACKE_dtrtri(int matrix_order, char uplo, char diag, lapack_int n, double* a, lapack_int lda);
+lapack_int LAPACKE_ctrtri(int matrix_order, char uplo, char diag, lapack_int n, lapack_complex_float* a,
+                          lapack_int lda);
+lapack_int LAPACKE_ztrtri(int matrix_order, char uplo, char diag, lapack_int n, lapack_complex_double* a,
+                          lapack_int lda);
+
+lapack_int LAPACKE_strtrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                          const float* a, lapack_int lda, float* b, lapack_int ldb);
+lapack_int LAPACKE_dtrtrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                          const double* a, lapack_int lda, double* b, lapack_int ldb);
+lapack_int LAPACKE_ctrtrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_ztrtrs(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                          const lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_strttf(int matrix_order, char transr, char uplo, lapack_int n, const float* a, lapack_int lda,
+                          float* arf);
+lapack_int LAPACKE_dtrttf(int matrix_order, char transr, char uplo, lapack_int n, const double* a, lapack_int lda,
+                          double* arf);
+lapack_int LAPACKE_ctrttf(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_float* a,
+                          lapack_int lda, lapack_complex_float* arf);
+lapack_int LAPACKE_ztrttf(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* arf);
+
+lapack_int LAPACKE_strttp(int matrix_order, char uplo, lapack_int n, const float* a, lapack_int lda, float* ap);
+lapack_int LAPACKE_dtrttp(int matrix_order, char uplo, lapack_int n, const double* a, lapack_int lda, double* ap);
+lapack_int LAPACKE_ctrttp(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* ap);
+lapack_int LAPACKE_ztrttp(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* ap);
+
+lapack_int LAPACKE_stzrzf(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau);
+lapack_int LAPACKE_dtzrzf(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau);
+lapack_int LAPACKE_ctzrzf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* tau);
+lapack_int LAPACKE_ztzrzf(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* tau);
+
+lapack_int LAPACKE_cungbr(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int k,
+                          lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau);
+lapack_int LAPACKE_zungbr(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int k,
+                          lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau);
+
+lapack_int LAPACKE_cunghr(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, lapack_complex_float* a,
+                          lapack_int lda, const lapack_complex_float* tau);
+lapack_int LAPACKE_zunghr(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, lapack_complex_double* a,
+                          lapack_int lda, const lapack_complex_double* tau);
+
+lapack_int LAPACKE_cunglq(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_float* a,
+                          lapack_int lda, const lapack_complex_float* tau);
+lapack_int LAPACKE_zunglq(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_double* a,
+                          lapack_int lda, const lapack_complex_double* tau);
+
+lapack_int LAPACKE_cungql(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_float* a,
+                          lapack_int lda, const lapack_complex_float* tau);
+lapack_int LAPACKE_zungql(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_double* a,
+                          lapack_int lda, const lapack_complex_double* tau);
+
+lapack_int LAPACKE_cungqr(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_float* a,
+                          lapack_int lda, const lapack_complex_float* tau);
+lapack_int LAPACKE_zungqr(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_double* a,
+                          lapack_int lda, const lapack_complex_double* tau);
+
+lapack_int LAPACKE_cungrq(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_float* a,
+                          lapack_int lda, const lapack_complex_float* tau);
+lapack_int LAPACKE_zungrq(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_double* a,
+                          lapack_int lda, const lapack_complex_double* tau);
+
+lapack_int LAPACKE_cungtr(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                          const lapack_complex_float* tau);
+lapack_int LAPACKE_zungtr(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                          const lapack_complex_double* tau);
+
+lapack_int LAPACKE_cunmbr(int matrix_order, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
+                          lapack_complex_float* c, lapack_int ldc);
+lapack_int LAPACKE_zunmbr(int matrix_order, char vect, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
+                          lapack_complex_double* c, lapack_int ldc);
+
+lapack_int LAPACKE_cunmhr(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int ilo,
+                          lapack_int ihi, const lapack_complex_float* a, lapack_int lda,
+                          const lapack_complex_float* tau, lapack_complex_float* c, lapack_int ldc);
+lapack_int LAPACKE_zunmhr(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int ilo,
+                          lapack_int ihi, const lapack_complex_double* a, lapack_int lda,
+                          const lapack_complex_double* tau, lapack_complex_double* c, lapack_int ldc);
+
+lapack_int LAPACKE_cunmlq(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
+                          lapack_complex_float* c, lapack_int ldc);
+lapack_int LAPACKE_zunmlq(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
+                          lapack_complex_double* c, lapack_int ldc);
+
+lapack_int LAPACKE_cunmql(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
+                          lapack_complex_float* c, lapack_int ldc);
+lapack_int LAPACKE_zunmql(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
+                          lapack_complex_double* c, lapack_int ldc);
+
+lapack_int LAPACKE_cunmqr(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
+                          lapack_complex_float* c, lapack_int ldc);
+lapack_int LAPACKE_zunmqr(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
+                          lapack_complex_double* c, lapack_int ldc);
+
+lapack_int LAPACKE_cunmrq(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
+                          lapack_complex_float* c, lapack_int ldc);
+lapack_int LAPACKE_zunmrq(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
+                          lapack_complex_double* c, lapack_int ldc);
+
+lapack_int LAPACKE_cunmrz(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          lapack_int l, const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
+                          lapack_complex_float* c, lapack_int ldc);
+lapack_int LAPACKE_zunmrz(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                          lapack_int l, const lapack_complex_double* a, lapack_int lda,
+                          const lapack_complex_double* tau, lapack_complex_double* c, lapack_int ldc);
+
+lapack_int LAPACKE_cunmtr(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
+                          const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
+                          lapack_complex_float* c, lapack_int ldc);
+lapack_int LAPACKE_zunmtr(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
+                          const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
+                          lapack_complex_double* c, lapack_int ldc);
+
+lapack_int LAPACKE_cupgtr(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap,
+                          const lapack_complex_float* tau, lapack_complex_float* q, lapack_int ldq);
+lapack_int LAPACKE_zupgtr(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap,
+                          const lapack_complex_double* tau, lapack_complex_double* q, lapack_int ldq);
+
+lapack_int LAPACKE_cupmtr(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
+                          const lapack_complex_float* ap, const lapack_complex_float* tau, lapack_complex_float* c,
+                          lapack_int ldc);
+lapack_int LAPACKE_zupmtr(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
+                          const lapack_complex_double* ap, const lapack_complex_double* tau, lapack_complex_double* c,
+                          lapack_int ldc);
+
+lapack_int LAPACKE_sbdsdc_work(int matrix_order, char uplo, char compq, lapack_int n, float* d, float* e, float* u,
+                               lapack_int ldu, float* vt, lapack_int ldvt, float* q, lapack_int* iq, float* work,
+                               lapack_int* iwork);
+lapack_int LAPACKE_dbdsdc_work(int matrix_order, char uplo, char compq, lapack_int n, double* d, double* e, double* u,
+                               lapack_int ldu, double* vt, lapack_int ldvt, double* q, lapack_int* iq, double* work,
+                               lapack_int* iwork);
+
+lapack_int LAPACKE_sbdsqr_work(int matrix_order, char uplo, lapack_int n, lapack_int ncvt, lapack_int nru,
+                               lapack_int ncc, float* d, float* e, float* vt, lapack_int ldvt, float* u, lapack_int ldu,
+                               float* c, lapack_int ldc, float* work);
+lapack_int LAPACKE_dbdsqr_work(int matrix_order, char uplo, lapack_int n, lapack_int ncvt, lapack_int nru,
+                               lapack_int ncc, double* d, double* e, double* vt, lapack_int ldvt, double* u,
+                               lapack_int ldu, double* c, lapack_int ldc, double* work);
+lapack_int LAPACKE_cbdsqr_work(int matrix_order, char uplo, lapack_int n, lapack_int ncvt, lapack_int nru,
+                               lapack_int ncc, float* d, float* e, lapack_complex_float* vt, lapack_int ldvt,
+                               lapack_complex_float* u, lapack_int ldu, lapack_complex_float* c, lapack_int ldc,
+                               float* work);
+lapack_int LAPACKE_zbdsqr_work(int matrix_order, char uplo, lapack_int n, lapack_int ncvt, lapack_int nru,
+                               lapack_int ncc, double* d, double* e, lapack_complex_double* vt, lapack_int ldvt,
+                               lapack_complex_double* u, lapack_int ldu, lapack_complex_double* c, lapack_int ldc,
+                               double* work);
+
+lapack_int LAPACKE_sdisna_work(char job, lapack_int m, lapack_int n, const float* d, float* sep);
+lapack_int LAPACKE_ddisna_work(char job, lapack_int m, lapack_int n, const double* d, double* sep);
+
+lapack_int LAPACKE_sgbbrd_work(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int ncc, lapack_int kl,
+                               lapack_int ku, float* ab, lapack_int ldab, float* d, float* e, float* q, lapack_int ldq,
+                               float* pt, lapack_int ldpt, float* c, lapack_int ldc, float* work);
+lapack_int LAPACKE_dgbbrd_work(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int ncc, lapack_int kl,
+                               lapack_int ku, double* ab, lapack_int ldab, double* d, double* e, double* q,
+                               lapack_int ldq, double* pt, lapack_int ldpt, double* c, lapack_int ldc, double* work);
+lapack_int LAPACKE_cgbbrd_work(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int ncc, lapack_int kl,
+                               lapack_int ku, lapack_complex_float* ab, lapack_int ldab, float* d, float* e,
+                               lapack_complex_float* q, lapack_int ldq, lapack_complex_float* pt, lapack_int ldpt,
+                               lapack_complex_float* c, lapack_int ldc, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zgbbrd_work(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int ncc, lapack_int kl,
+                               lapack_int ku, lapack_complex_double* ab, lapack_int ldab, double* d, double* e,
+                               lapack_complex_double* q, lapack_int ldq, lapack_complex_double* pt, lapack_int ldpt,
+                               lapack_complex_double* c, lapack_int ldc, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sgbcon_work(int matrix_order, char norm, lapack_int n, lapack_int kl, lapack_int ku, const float* ab,
+                               lapack_int ldab, const lapack_int* ipiv, float anorm, float* rcond, float* work,
+                               lapack_int* iwork);
+lapack_int LAPACKE_dgbcon_work(int matrix_order, char norm, lapack_int n, lapack_int kl, lapack_int ku,
+                               const double* ab, lapack_int ldab, const lapack_int* ipiv, double anorm, double* rcond,
+                               double* work, lapack_int* iwork);
+lapack_int LAPACKE_cgbcon_work(int matrix_order, char norm, lapack_int n, lapack_int kl, lapack_int ku,
+                               const lapack_complex_float* ab, lapack_int ldab, const lapack_int* ipiv, float anorm,
+                               float* rcond, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zgbcon_work(int matrix_order, char norm, lapack_int n, lapack_int kl, lapack_int ku,
+                               const lapack_complex_double* ab, lapack_int ldab, const lapack_int* ipiv, double anorm,
+                               double* rcond, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sgbequ_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                               const float* ab, lapack_int ldab, float* r, float* c, float* rowcnd, float* colcnd,
+                               float* amax);
+lapack_int LAPACKE_dgbequ_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                               const double* ab, lapack_int ldab, double* r, double* c, double* rowcnd, double* colcnd,
+                               double* amax);
+lapack_int LAPACKE_cgbequ_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                               const lapack_complex_float* ab, lapack_int ldab, float* r, float* c, float* rowcnd,
+                               float* colcnd, float* amax);
+lapack_int LAPACKE_zgbequ_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                               const lapack_complex_double* ab, lapack_int ldab, double* r, double* c, double* rowcnd,
+                               double* colcnd, double* amax);
+
+lapack_int LAPACKE_sgbequb_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                                const float* ab, lapack_int ldab, float* r, float* c, float* rowcnd, float* colcnd,
+                                float* amax);
+lapack_int LAPACKE_dgbequb_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                                const double* ab, lapack_int ldab, double* r, double* c, double* rowcnd, double* colcnd,
+                                double* amax);
+lapack_int LAPACKE_cgbequb_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                                const lapack_complex_float* ab, lapack_int ldab, float* r, float* c, float* rowcnd,
+                                float* colcnd, float* amax);
+lapack_int LAPACKE_zgbequb_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                                const lapack_complex_double* ab, lapack_int ldab, double* r, double* c, double* rowcnd,
+                                double* colcnd, double* amax);
+
+lapack_int LAPACKE_sgbrfs_work(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                               lapack_int nrhs, const float* ab, lapack_int ldab, const float* afb, lapack_int ldafb,
+                               const lapack_int* ipiv, const float* b, lapack_int ldb, float* x, lapack_int ldx,
+                               float* ferr, float* berr, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dgbrfs_work(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                               lapack_int nrhs, const double* ab, lapack_int ldab, const double* afb, lapack_int ldafb,
+                               const lapack_int* ipiv, const double* b, lapack_int ldb, double* x, lapack_int ldx,
+                               double* ferr, double* berr, double* work, lapack_int* iwork);
+lapack_int LAPACKE_cgbrfs_work(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                               lapack_int nrhs, const lapack_complex_float* ab, lapack_int ldab,
+                               const lapack_complex_float* afb, lapack_int ldafb, const lapack_int* ipiv,
+                               const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                               float* ferr, float* berr, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zgbrfs_work(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                               lapack_int nrhs, const lapack_complex_double* ab, lapack_int ldab,
+                               const lapack_complex_double* afb, lapack_int ldafb, const lapack_int* ipiv,
+                               const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                               double* ferr, double* berr, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sgbrfsx_work(int matrix_order, char trans, char equed, lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, const float* ab, lapack_int ldab, const float* afb, lapack_int ldafb,
+                                const lapack_int* ipiv, const float* r, const float* c, const float* b, lapack_int ldb,
+                                float* x, lapack_int ldx, float* rcond, float* berr, lapack_int n_err_bnds,
+                                float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams, float* params,
+                                float* work, lapack_int* iwork);
+lapack_int LAPACKE_dgbrfsx_work(int matrix_order, char trans, char equed, lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, const double* ab, lapack_int ldab, const double* afb, lapack_int ldafb,
+                                const lapack_int* ipiv, const double* r, const double* c, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* berr,
+                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
+                                double* params, double* work, lapack_int* iwork);
+lapack_int LAPACKE_cgbrfsx_work(int matrix_order, char trans, char equed, lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, const lapack_complex_float* ab, lapack_int ldab,
+                                const lapack_complex_float* afb, lapack_int ldafb, const lapack_int* ipiv,
+                                const float* r, const float* c, const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx, float* rcond, float* berr,
+                                lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
+                                float* params, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zgbrfsx_work(int matrix_order, char trans, char equed, lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, const lapack_complex_double* ab, lapack_int ldab,
+                                const lapack_complex_double* afb, lapack_int ldafb, const lapack_int* ipiv,
+                                const double* r, const double* c, const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx, double* rcond, double* berr,
+                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
+                                double* params, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sgbsv_work(int matrix_order, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs, float* ab,
+                              lapack_int ldab, lapack_int* ipiv, float* b, lapack_int ldb);
+lapack_int LAPACKE_dgbsv_work(int matrix_order, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs, double* ab,
+                              lapack_int ldab, lapack_int* ipiv, double* b, lapack_int ldb);
+lapack_int LAPACKE_cgbsv_work(int matrix_order, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
+                              lapack_complex_float* ab, lapack_int ldab, lapack_int* ipiv, lapack_complex_float* b,
+                              lapack_int ldb);
+lapack_int LAPACKE_zgbsv_work(int matrix_order, lapack_int n, lapack_int kl, lapack_int ku, lapack_int nrhs,
+                              lapack_complex_double* ab, lapack_int ldab, lapack_int* ipiv, lapack_complex_double* b,
+                              lapack_int ldb);
+
+lapack_int LAPACKE_sgbsvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                               lapack_int nrhs, float* ab, lapack_int ldab, float* afb, lapack_int ldafb,
+                               lapack_int* ipiv, char* equed, float* r, float* c, float* b, lapack_int ldb, float* x,
+                               lapack_int ldx, float* rcond, float* ferr, float* berr, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dgbsvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                               lapack_int nrhs, double* ab, lapack_int ldab, double* afb, lapack_int ldafb,
+                               lapack_int* ipiv, char* equed, double* r, double* c, double* b, lapack_int ldb,
+                               double* x, lapack_int ldx, double* rcond, double* ferr, double* berr, double* work,
+                               lapack_int* iwork);
+lapack_int LAPACKE_cgbsvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                               lapack_int nrhs, lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* afb,
+                               lapack_int ldafb, lapack_int* ipiv, char* equed, float* r, float* c,
+                               lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                               float* rcond, float* ferr, float* berr, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zgbsvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                               lapack_int nrhs, lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* afb,
+                               lapack_int ldafb, lapack_int* ipiv, char* equed, double* r, double* c,
+                               lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                               double* rcond, double* ferr, double* berr, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sgbsvxx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, float* ab, lapack_int ldab, float* afb, lapack_int ldafb,
+                                lapack_int* ipiv, char* equed, float* r, float* c, float* b, lapack_int ldb, float* x,
+                                lapack_int ldx, float* rcond, float* rpvgrw, float* berr, lapack_int n_err_bnds,
+                                float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams, float* params,
+                                float* work, lapack_int* iwork);
+lapack_int LAPACKE_dgbsvxx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, double* ab, lapack_int ldab, double* afb, lapack_int ldafb,
+                                lapack_int* ipiv, char* equed, double* r, double* c, double* b, lapack_int ldb,
+                                double* x, lapack_int ldx, double* rcond, double* rpvgrw, double* berr,
+                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
+                                double* params, double* work, lapack_int* iwork);
+lapack_int LAPACKE_cgbsvxx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* afb,
+                                lapack_int ldafb, lapack_int* ipiv, char* equed, float* r, float* c,
+                                lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                                float* rcond, float* rpvgrw, float* berr, lapack_int n_err_bnds, float* err_bnds_norm,
+                                float* err_bnds_comp, lapack_int nparams, float* params, lapack_complex_float* work,
+                                float* rwork);
+lapack_int LAPACKE_zgbsvxx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                                lapack_int nrhs, lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* afb,
+                                lapack_int ldafb, lapack_int* ipiv, char* equed, double* r, double* c,
+                                lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                                double* rcond, double* rpvgrw, double* berr, lapack_int n_err_bnds,
+                                double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams, double* params,
+                                lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sgbtrf_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, float* ab,
+                               lapack_int ldab, lapack_int* ipiv);
+lapack_int LAPACKE_dgbtrf_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku, double* ab,
+                               lapack_int ldab, lapack_int* ipiv);
+lapack_int LAPACKE_cgbtrf_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                               lapack_complex_float* ab, lapack_int ldab, lapack_int* ipiv);
+lapack_int LAPACKE_zgbtrf_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                               lapack_complex_double* ab, lapack_int ldab, lapack_int* ipiv);
+
+lapack_int LAPACKE_sgbtrs_work(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                               lapack_int nrhs, const float* ab, lapack_int ldab, const lapack_int* ipiv, float* b,
+                               lapack_int ldb);
+lapack_int LAPACKE_dgbtrs_work(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                               lapack_int nrhs, const double* ab, lapack_int ldab, const lapack_int* ipiv, double* b,
+                               lapack_int ldb);
+lapack_int LAPACKE_cgbtrs_work(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                               lapack_int nrhs, const lapack_complex_float* ab, lapack_int ldab, const lapack_int* ipiv,
+                               lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zgbtrs_work(int matrix_order, char trans, lapack_int n, lapack_int kl, lapack_int ku,
+                               lapack_int nrhs, const lapack_complex_double* ab, lapack_int ldab,
+                               const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_sgebak_work(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
+                               const float* scale, lapack_int m, float* v, lapack_int ldv);
+lapack_int LAPACKE_dgebak_work(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
+                               const double* scale, lapack_int m, double* v, lapack_int ldv);
+lapack_int LAPACKE_cgebak_work(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
+                               const float* scale, lapack_int m, lapack_complex_float* v, lapack_int ldv);
+lapack_int LAPACKE_zgebak_work(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
+                               const double* scale, lapack_int m, lapack_complex_double* v, lapack_int ldv);
+
+lapack_int LAPACKE_sgebal_work(int matrix_order, char job, lapack_int n, float* a, lapack_int lda, lapack_int* ilo,
+                               lapack_int* ihi, float* scale);
+lapack_int LAPACKE_dgebal_work(int matrix_order, char job, lapack_int n, double* a, lapack_int lda, lapack_int* ilo,
+                               lapack_int* ihi, double* scale);
+lapack_int LAPACKE_cgebal_work(int matrix_order, char job, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               lapack_int* ilo, lapack_int* ihi, float* scale);
+lapack_int LAPACKE_zgebal_work(int matrix_order, char job, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               lapack_int* ilo, lapack_int* ihi, double* scale);
+
+lapack_int LAPACKE_sgebrd_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* d,
+                               float* e, float* tauq, float* taup, float* work, lapack_int lwork);
+lapack_int LAPACKE_dgebrd_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* d,
+                               double* e, double* tauq, double* taup, double* work, lapack_int lwork);
+lapack_int LAPACKE_cgebrd_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               float* d, float* e, lapack_complex_float* tauq, lapack_complex_float* taup,
+                               lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zgebrd_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               double* d, double* e, lapack_complex_double* tauq, lapack_complex_double* taup,
+                               lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sgecon_work(int matrix_order, char norm, lapack_int n, const float* a, lapack_int lda, float anorm,
+                               float* rcond, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dgecon_work(int matrix_order, char norm, lapack_int n, const double* a, lapack_int lda, double anorm,
+                               double* rcond, double* work, lapack_int* iwork);
+lapack_int LAPACKE_cgecon_work(int matrix_order, char norm, lapack_int n, const lapack_complex_float* a, lapack_int lda,
+                               float anorm, float* rcond, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zgecon_work(int matrix_order, char norm, lapack_int n, const lapack_complex_double* a,
+                               lapack_int lda, double anorm, double* rcond, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sgeequ_work(int matrix_order, lapack_int m, lapack_int n, const float* a, lapack_int lda, float* r,
+                               float* c, float* rowcnd, float* colcnd, float* amax);
+lapack_int LAPACKE_dgeequ_work(int matrix_order, lapack_int m, lapack_int n, const double* a, lapack_int lda, double* r,
+                               double* c, double* rowcnd, double* colcnd, double* amax);
+lapack_int LAPACKE_cgeequ_work(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_float* a,
+                               lapack_int lda, float* r, float* c, float* rowcnd, float* colcnd, float* amax);
+lapack_int LAPACKE_zgeequ_work(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_double* a,
+                               lapack_int lda, double* r, double* c, double* rowcnd, double* colcnd, double* amax);
+
+lapack_int LAPACKE_sgeequb_work(int matrix_order, lapack_int m, lapack_int n, const float* a, lapack_int lda, float* r,
+                                float* c, float* rowcnd, float* colcnd, float* amax);
+lapack_int LAPACKE_dgeequb_work(int matrix_order, lapack_int m, lapack_int n, const double* a, lapack_int lda,
+                                double* r, double* c, double* rowcnd, double* colcnd, double* amax);
+lapack_int LAPACKE_cgeequb_work(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_float* a,
+                                lapack_int lda, float* r, float* c, float* rowcnd, float* colcnd, float* amax);
+lapack_int LAPACKE_zgeequb_work(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_double* a,
+                                lapack_int lda, double* r, double* c, double* rowcnd, double* colcnd, double* amax);
+
+lapack_int LAPACKE_sgees_work(int matrix_order, char jobvs, char sort, LAPACK_S_SELECT2 select, lapack_int n, float* a,
+                              lapack_int lda, lapack_int* sdim, float* wr, float* wi, float* vs, lapack_int ldvs,
+                              float* work, lapack_int lwork, lapack_logical* bwork);
+lapack_int LAPACKE_dgees_work(int matrix_order, char jobvs, char sort, LAPACK_D_SELECT2 select, lapack_int n, double* a,
+                              lapack_int lda, lapack_int* sdim, double* wr, double* wi, double* vs, lapack_int ldvs,
+                              double* work, lapack_int lwork, lapack_logical* bwork);
+lapack_int LAPACKE_cgees_work(int matrix_order, char jobvs, char sort, LAPACK_C_SELECT1 select, lapack_int n,
+                              lapack_complex_float* a, lapack_int lda, lapack_int* sdim, lapack_complex_float* w,
+                              lapack_complex_float* vs, lapack_int ldvs, lapack_complex_float* work, lapack_int lwork,
+                              float* rwork, lapack_logical* bwork);
+lapack_int LAPACKE_zgees_work(int matrix_order, char jobvs, char sort, LAPACK_Z_SELECT1 select, lapack_int n,
+                              lapack_complex_double* a, lapack_int lda, lapack_int* sdim, lapack_complex_double* w,
+                              lapack_complex_double* vs, lapack_int ldvs, lapack_complex_double* work, lapack_int lwork,
+                              double* rwork, lapack_logical* bwork);
+
+lapack_int LAPACKE_sgeesx_work(int matrix_order, char jobvs, char sort, LAPACK_S_SELECT2 select, char sense,
+                               lapack_int n, float* a, lapack_int lda, lapack_int* sdim, float* wr, float* wi,
+                               float* vs, lapack_int ldvs, float* rconde, float* rcondv, float* work, lapack_int lwork,
+                               lapack_int* iwork, lapack_int liwork, lapack_logical* bwork);
+lapack_int LAPACKE_dgeesx_work(int matrix_order, char jobvs, char sort, LAPACK_D_SELECT2 select, char sense,
+                               lapack_int n, double* a, lapack_int lda, lapack_int* sdim, double* wr, double* wi,
+                               double* vs, lapack_int ldvs, double* rconde, double* rcondv, double* work,
+                               lapack_int lwork, lapack_int* iwork, lapack_int liwork, lapack_logical* bwork);
+lapack_int LAPACKE_cgeesx_work(int matrix_order, char jobvs, char sort, LAPACK_C_SELECT1 select, char sense,
+                               lapack_int n, lapack_complex_float* a, lapack_int lda, lapack_int* sdim,
+                               lapack_complex_float* w, lapack_complex_float* vs, lapack_int ldvs, float* rconde,
+                               float* rcondv, lapack_complex_float* work, lapack_int lwork, float* rwork,
+                               lapack_logical* bwork);
+lapack_int LAPACKE_zgeesx_work(int matrix_order, char jobvs, char sort, LAPACK_Z_SELECT1 select, char sense,
+                               lapack_int n, lapack_complex_double* a, lapack_int lda, lapack_int* sdim,
+                               lapack_complex_double* w, lapack_complex_double* vs, lapack_int ldvs, double* rconde,
+                               double* rcondv, lapack_complex_double* work, lapack_int lwork, double* rwork,
+                               lapack_logical* bwork);
+
+lapack_int LAPACKE_sgeev_work(int matrix_order, char jobvl, char jobvr, lapack_int n, float* a, lapack_int lda,
+                              float* wr, float* wi, float* vl, lapack_int ldvl, float* vr, lapack_int ldvr, float* work,
+                              lapack_int lwork);
+lapack_int LAPACKE_dgeev_work(int matrix_order, char jobvl, char jobvr, lapack_int n, double* a, lapack_int lda,
+                              double* wr, double* wi, double* vl, lapack_int ldvl, double* vr, lapack_int ldvr,
+                              double* work, lapack_int lwork);
+lapack_int LAPACKE_cgeev_work(int matrix_order, char jobvl, char jobvr, lapack_int n, lapack_complex_float* a,
+                              lapack_int lda, lapack_complex_float* w, lapack_complex_float* vl, lapack_int ldvl,
+                              lapack_complex_float* vr, lapack_int ldvr, lapack_complex_float* work, lapack_int lwork,
+                              float* rwork);
+lapack_int LAPACKE_zgeev_work(int matrix_order, char jobvl, char jobvr, lapack_int n, lapack_complex_double* a,
+                              lapack_int lda, lapack_complex_double* w, lapack_complex_double* vl, lapack_int ldvl,
+                              lapack_complex_double* vr, lapack_int ldvr, lapack_complex_double* work, lapack_int lwork,
+                              double* rwork);
+
+lapack_int LAPACKE_sgeevx_work(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
+                               float* a, lapack_int lda, float* wr, float* wi, float* vl, lapack_int ldvl, float* vr,
+                               lapack_int ldvr, lapack_int* ilo, lapack_int* ihi, float* scale, float* abnrm,
+                               float* rconde, float* rcondv, float* work, lapack_int lwork, lapack_int* iwork);
+lapack_int LAPACKE_dgeevx_work(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
+                               double* a, lapack_int lda, double* wr, double* wi, double* vl, lapack_int ldvl,
+                               double* vr, lapack_int ldvr, lapack_int* ilo, lapack_int* ihi, double* scale,
+                               double* abnrm, double* rconde, double* rcondv, double* work, lapack_int lwork,
+                               lapack_int* iwork);
+lapack_int LAPACKE_cgeevx_work(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
+                               lapack_complex_float* a, lapack_int lda, lapack_complex_float* w,
+                               lapack_complex_float* vl, lapack_int ldvl, lapack_complex_float* vr, lapack_int ldvr,
+                               lapack_int* ilo, lapack_int* ihi, float* scale, float* abnrm, float* rconde,
+                               float* rcondv, lapack_complex_float* work, lapack_int lwork, float* rwork);
+lapack_int LAPACKE_zgeevx_work(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
+                               lapack_complex_double* a, lapack_int lda, lapack_complex_double* w,
+                               lapack_complex_double* vl, lapack_int ldvl, lapack_complex_double* vr, lapack_int ldvr,
+                               lapack_int* ilo, lapack_int* ihi, double* scale, double* abnrm, double* rconde,
+                               double* rcondv, lapack_complex_double* work, lapack_int lwork, double* rwork);
+
+lapack_int LAPACKE_sgehrd_work(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, float* a, lapack_int lda,
+                               float* tau, float* work, lapack_int lwork);
+lapack_int LAPACKE_dgehrd_work(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, double* a,
+                               lapack_int lda, double* tau, double* work, lapack_int lwork);
+lapack_int LAPACKE_cgehrd_work(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* tau, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zgehrd_work(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* tau, lapack_complex_double* work,
+                               lapack_int lwork);
+
+lapack_int LAPACKE_sgejsv_work(int matrix_order, char joba, char jobu, char jobv, char jobr, char jobt, char jobp,
+                               lapack_int m, lapack_int n, float* a, lapack_int lda, float* sva, float* u,
+                               lapack_int ldu, float* v, lapack_int ldv, float* work, lapack_int lwork,
+                               lapack_int* iwork);
+lapack_int LAPACKE_dgejsv_work(int matrix_order, char joba, char jobu, char jobv, char jobr, char jobt, char jobp,
+                               lapack_int m, lapack_int n, double* a, lapack_int lda, double* sva, double* u,
+                               lapack_int ldu, double* v, lapack_int ldv, double* work, lapack_int lwork,
+                               lapack_int* iwork);
+
+lapack_int LAPACKE_sgelq2_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau,
+                               float* work);
+lapack_int LAPACKE_dgelq2_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau,
+                               double* work);
+lapack_int LAPACKE_cgelq2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* tau, lapack_complex_float* work);
+lapack_int LAPACKE_zgelq2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* tau, lapack_complex_double* work);
+
+lapack_int LAPACKE_sgelqf_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau,
+                               float* work, lapack_int lwork);
+lapack_int LAPACKE_dgelqf_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau,
+                               double* work, lapack_int lwork);
+lapack_int LAPACKE_cgelqf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* tau, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zgelqf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* tau, lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sgels_work(int matrix_order, char trans, lapack_int m, lapack_int n, lapack_int nrhs, float* a,
+                              lapack_int lda, float* b, lapack_int ldb, float* work, lapack_int lwork);
+lapack_int LAPACKE_dgels_work(int matrix_order, char trans, lapack_int m, lapack_int n, lapack_int nrhs, double* a,
+                              lapack_int lda, double* b, lapack_int ldb, double* work, lapack_int lwork);
+lapack_int LAPACKE_cgels_work(int matrix_order, char trans, lapack_int m, lapack_int n, lapack_int nrhs,
+                              lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
+                              lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zgels_work(int matrix_order, char trans, lapack_int m, lapack_int n, lapack_int nrhs,
+                              lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
+                              lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sgelsd_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
+                               float* b, lapack_int ldb, float* s, float rcond, lapack_int* rank, float* work,
+                               lapack_int lwork, lapack_int* iwork);
+lapack_int LAPACKE_dgelsd_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
+                               double* b, lapack_int ldb, double* s, double rcond, lapack_int* rank, double* work,
+                               lapack_int lwork, lapack_int* iwork);
+lapack_int LAPACKE_cgelsd_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* b, lapack_int ldb, float* s, float rcond,
+                               lapack_int* rank, lapack_complex_float* work, lapack_int lwork, float* rwork,
+                               lapack_int* iwork);
+lapack_int LAPACKE_zgelsd_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* b, lapack_int ldb, double* s, double rcond,
+                               lapack_int* rank, lapack_complex_double* work, lapack_int lwork, double* rwork,
+                               lapack_int* iwork);
+
+lapack_int LAPACKE_sgelss_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
+                               float* b, lapack_int ldb, float* s, float rcond, lapack_int* rank, float* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_dgelss_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
+                               double* b, lapack_int ldb, double* s, double rcond, lapack_int* rank, double* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_cgelss_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* b, lapack_int ldb, float* s, float rcond,
+                               lapack_int* rank, lapack_complex_float* work, lapack_int lwork, float* rwork);
+lapack_int LAPACKE_zgelss_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* b, lapack_int ldb, double* s, double rcond,
+                               lapack_int* rank, lapack_complex_double* work, lapack_int lwork, double* rwork);
+
+lapack_int LAPACKE_sgelsy_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
+                               float* b, lapack_int ldb, lapack_int* jpvt, float rcond, lapack_int* rank, float* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_dgelsy_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
+                               double* b, lapack_int ldb, lapack_int* jpvt, double rcond, lapack_int* rank,
+                               double* work, lapack_int lwork);
+lapack_int LAPACKE_cgelsy_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* b, lapack_int ldb, lapack_int* jpvt, float rcond,
+                               lapack_int* rank, lapack_complex_float* work, lapack_int lwork, float* rwork);
+lapack_int LAPACKE_zgelsy_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_int* jpvt, double rcond,
+                               lapack_int* rank, lapack_complex_double* work, lapack_int lwork, double* rwork);
+
+lapack_int LAPACKE_sgeqlf_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau,
+                               float* work, lapack_int lwork);
+lapack_int LAPACKE_dgeqlf_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau,
+                               double* work, lapack_int lwork);
+lapack_int LAPACKE_cgeqlf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* tau, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zgeqlf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* tau, lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sgeqp3_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, lapack_int* jpvt,
+                               float* tau, float* work, lapack_int lwork);
+lapack_int LAPACKE_dgeqp3_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda,
+                               lapack_int* jpvt, double* tau, double* work, lapack_int lwork);
+lapack_int LAPACKE_cgeqp3_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               lapack_int* jpvt, lapack_complex_float* tau, lapack_complex_float* work,
+                               lapack_int lwork, float* rwork);
+lapack_int LAPACKE_zgeqp3_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               lapack_int* jpvt, lapack_complex_double* tau, lapack_complex_double* work,
+                               lapack_int lwork, double* rwork);
+
+lapack_int LAPACKE_sgeqpf_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, lapack_int* jpvt,
+                               float* tau, float* work);
+lapack_int LAPACKE_dgeqpf_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda,
+                               lapack_int* jpvt, double* tau, double* work);
+lapack_int LAPACKE_cgeqpf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               lapack_int* jpvt, lapack_complex_float* tau, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zgeqpf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               lapack_int* jpvt, lapack_complex_double* tau, lapack_complex_double* work,
+                               double* rwork);
+
+lapack_int LAPACKE_sgeqr2_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau,
+                               float* work);
+lapack_int LAPACKE_dgeqr2_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau,
+                               double* work);
+lapack_int LAPACKE_cgeqr2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* tau, lapack_complex_float* work);
+lapack_int LAPACKE_zgeqr2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* tau, lapack_complex_double* work);
+
+lapack_int LAPACKE_sgeqrf_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau,
+                               float* work, lapack_int lwork);
+lapack_int LAPACKE_dgeqrf_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau,
+                               double* work, lapack_int lwork);
+lapack_int LAPACKE_cgeqrf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* tau, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zgeqrf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* tau, lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sgeqrfp_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau,
+                                float* work, lapack_int lwork);
+lapack_int LAPACKE_dgeqrfp_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau,
+                                double* work, lapack_int lwork);
+lapack_int LAPACKE_cgeqrfp_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* tau, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zgeqrfp_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* tau, lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sgerfs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const float* a,
+                               lapack_int lda, const float* af, lapack_int ldaf, const lapack_int* ipiv, const float* b,
+                               lapack_int ldb, float* x, lapack_int ldx, float* ferr, float* berr, float* work,
+                               lapack_int* iwork);
+lapack_int LAPACKE_dgerfs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const double* a,
+                               lapack_int lda, const double* af, lapack_int ldaf, const lapack_int* ipiv,
+                               const double* b, lapack_int ldb, double* x, lapack_int ldx, double* ferr, double* berr,
+                               double* work, lapack_int* iwork);
+lapack_int LAPACKE_cgerfs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
+                               lapack_int ldaf, const lapack_int* ipiv, const lapack_complex_float* b, lapack_int ldb,
+                               lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr,
+                               lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zgerfs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
+                               lapack_int ldaf, const lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr,
+                               lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sgerfsx_work(int matrix_order, char trans, char equed, lapack_int n, lapack_int nrhs, const float* a,
+                                lapack_int lda, const float* af, lapack_int ldaf, const lapack_int* ipiv,
+                                const float* r, const float* c, const float* b, lapack_int ldb, float* x,
+                                lapack_int ldx, float* rcond, float* berr, lapack_int n_err_bnds, float* err_bnds_norm,
+                                float* err_bnds_comp, lapack_int nparams, float* params, float* work,
+                                lapack_int* iwork);
+lapack_int LAPACKE_dgerfsx_work(int matrix_order, char trans, char equed, lapack_int n, lapack_int nrhs,
+                                const double* a, lapack_int lda, const double* af, lapack_int ldaf,
+                                const lapack_int* ipiv, const double* r, const double* c, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* berr,
+                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
+                                double* params, double* work, lapack_int* iwork);
+lapack_int LAPACKE_cgerfsx_work(int matrix_order, char trans, char equed, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
+                                lapack_int ldaf, const lapack_int* ipiv, const float* r, const float* c,
+                                const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                                float* rcond, float* berr, lapack_int n_err_bnds, float* err_bnds_norm,
+                                float* err_bnds_comp, lapack_int nparams, float* params, lapack_complex_float* work,
+                                float* rwork);
+lapack_int LAPACKE_zgerfsx_work(int matrix_order, char trans, char equed, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
+                                lapack_int ldaf, const lapack_int* ipiv, const double* r, const double* c,
+                                const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, double* rcond, double* berr, lapack_int n_err_bnds,
+                                double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams, double* params,
+                                lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sgerqf_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau,
+                               float* work, lapack_int lwork);
+lapack_int LAPACKE_dgerqf_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau,
+                               double* work, lapack_int lwork);
+lapack_int LAPACKE_cgerqf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* tau, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zgerqf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* tau, lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sgesdd_work(int matrix_order, char jobz, lapack_int m, lapack_int n, float* a, lapack_int lda,
+                               float* s, float* u, lapack_int ldu, float* vt, lapack_int ldvt, float* work,
+                               lapack_int lwork, lapack_int* iwork);
+lapack_int LAPACKE_dgesdd_work(int matrix_order, char jobz, lapack_int m, lapack_int n, double* a, lapack_int lda,
+                               double* s, double* u, lapack_int ldu, double* vt, lapack_int ldvt, double* work,
+                               lapack_int lwork, lapack_int* iwork);
+lapack_int LAPACKE_cgesdd_work(int matrix_order, char jobz, lapack_int m, lapack_int n, lapack_complex_float* a,
+                               lapack_int lda, float* s, lapack_complex_float* u, lapack_int ldu,
+                               lapack_complex_float* vt, lapack_int ldvt, lapack_complex_float* work, lapack_int lwork,
+                               float* rwork, lapack_int* iwork);
+lapack_int LAPACKE_zgesdd_work(int matrix_order, char jobz, lapack_int m, lapack_int n, lapack_complex_double* a,
+                               lapack_int lda, double* s, lapack_complex_double* u, lapack_int ldu,
+                               lapack_complex_double* vt, lapack_int ldvt, lapack_complex_double* work,
+                               lapack_int lwork, double* rwork, lapack_int* iwork);
+
+lapack_int LAPACKE_sgesv_work(int matrix_order, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
+                              lapack_int* ipiv, float* b, lapack_int ldb);
+lapack_int LAPACKE_dgesv_work(int matrix_order, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
+                              lapack_int* ipiv, double* b, lapack_int ldb);
+lapack_int LAPACKE_cgesv_work(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_float* a, lapack_int lda,
+                              lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zgesv_work(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_double* a, lapack_int lda,
+                              lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
+lapack_int LAPACKE_dsgesv_work(int matrix_order, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
+                               lapack_int* ipiv, double* b, lapack_int ldb, double* x, lapack_int ldx, double* work,
+                               float* swork, lapack_int* iter);
+lapack_int LAPACKE_zcgesv_work(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
+                               lapack_int lda, lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* x, lapack_int ldx, lapack_complex_double* work,
+                               lapack_complex_float* swork, double* rwork, lapack_int* iter);
+
+lapack_int LAPACKE_sgesvd_work(int matrix_order, char jobu, char jobvt, lapack_int m, lapack_int n, float* a,
+                               lapack_int lda, float* s, float* u, lapack_int ldu, float* vt, lapack_int ldvt,
+                               float* work, lapack_int lwork);
+lapack_int LAPACKE_dgesvd_work(int matrix_order, char jobu, char jobvt, lapack_int m, lapack_int n, double* a,
+                               lapack_int lda, double* s, double* u, lapack_int ldu, double* vt, lapack_int ldvt,
+                               double* work, lapack_int lwork);
+lapack_int LAPACKE_cgesvd_work(int matrix_order, char jobu, char jobvt, lapack_int m, lapack_int n,
+                               lapack_complex_float* a, lapack_int lda, float* s, lapack_complex_float* u,
+                               lapack_int ldu, lapack_complex_float* vt, lapack_int ldvt, lapack_complex_float* work,
+                               lapack_int lwork, float* rwork);
+lapack_int LAPACKE_zgesvd_work(int matrix_order, char jobu, char jobvt, lapack_int m, lapack_int n,
+                               lapack_complex_double* a, lapack_int lda, double* s, lapack_complex_double* u,
+                               lapack_int ldu, lapack_complex_double* vt, lapack_int ldvt, lapack_complex_double* work,
+                               lapack_int lwork, double* rwork);
+
+lapack_int LAPACKE_sgesvj_work(int matrix_order, char joba, char jobu, char jobv, lapack_int m, lapack_int n, float* a,
+                               lapack_int lda, float* sva, lapack_int mv, float* v, lapack_int ldv, float* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_dgesvj_work(int matrix_order, char joba, char jobu, char jobv, lapack_int m, lapack_int n, double* a,
+                               lapack_int lda, double* sva, lapack_int mv, double* v, lapack_int ldv, double* work,
+                               lapack_int lwork);
+
+lapack_int LAPACKE_sgesvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, float* a,
+                               lapack_int lda, float* af, lapack_int ldaf, lapack_int* ipiv, char* equed, float* r,
+                               float* c, float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* ferr,
+                               float* berr, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dgesvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, double* a,
+                               lapack_int lda, double* af, lapack_int ldaf, lapack_int* ipiv, char* equed, double* r,
+                               double* c, double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond,
+                               double* ferr, double* berr, double* work, lapack_int* iwork);
+lapack_int LAPACKE_cgesvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
+                               lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
+                               lapack_int* ipiv, char* equed, float* r, float* c, lapack_complex_float* b,
+                               lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* rcond, float* ferr,
+                               float* berr, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zgesvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
+                               lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
+                               lapack_int* ipiv, char* equed, double* r, double* c, lapack_complex_double* b,
+                               lapack_int ldb, lapack_complex_double* x, lapack_int ldx, double* rcond, double* ferr,
+                               double* berr, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sgesvxx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, float* a,
+                                lapack_int lda, float* af, lapack_int ldaf, lapack_int* ipiv, char* equed, float* r,
+                                float* c, float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond,
+                                float* rpvgrw, float* berr, lapack_int n_err_bnds, float* err_bnds_norm,
+                                float* err_bnds_comp, lapack_int nparams, float* params, float* work,
+                                lapack_int* iwork);
+lapack_int LAPACKE_dgesvxx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, double* a,
+                                lapack_int lda, double* af, lapack_int ldaf, lapack_int* ipiv, char* equed, double* r,
+                                double* c, double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond,
+                                double* rpvgrw, double* berr, lapack_int n_err_bnds, double* err_bnds_norm,
+                                double* err_bnds_comp, lapack_int nparams, double* params, double* work,
+                                lapack_int* iwork);
+lapack_int LAPACKE_cgesvxx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
+                                lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
+                                lapack_int* ipiv, char* equed, float* r, float* c, lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* rcond, float* rpvgrw,
+                                float* berr, lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
+                                lapack_int nparams, float* params, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zgesvxx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
+                                lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
+                                lapack_int* ipiv, char* equed, double* r, double* c, lapack_complex_double* b,
+                                lapack_int ldb, lapack_complex_double* x, lapack_int ldx, double* rcond, double* rpvgrw,
+                                double* berr, lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
+                                lapack_int nparams, double* params, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sgetf2_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda,
+                               lapack_int* ipiv);
+lapack_int LAPACKE_dgetf2_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda,
+                               lapack_int* ipiv);
+lapack_int LAPACKE_cgetf2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               lapack_int* ipiv);
+lapack_int LAPACKE_zgetf2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               lapack_int* ipiv);
+
+lapack_int LAPACKE_sgetrf_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda,
+                               lapack_int* ipiv);
+lapack_int LAPACKE_dgetrf_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda,
+                               lapack_int* ipiv);
+lapack_int LAPACKE_cgetrf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               lapack_int* ipiv);
+lapack_int LAPACKE_zgetrf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               lapack_int* ipiv);
+
+lapack_int LAPACKE_sgetri_work(int matrix_order, lapack_int n, float* a, lapack_int lda, const lapack_int* ipiv,
+                               float* work, lapack_int lwork);
+lapack_int LAPACKE_dgetri_work(int matrix_order, lapack_int n, double* a, lapack_int lda, const lapack_int* ipiv,
+                               double* work, lapack_int lwork);
+lapack_int LAPACKE_cgetri_work(int matrix_order, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               const lapack_int* ipiv, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zgetri_work(int matrix_order, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               const lapack_int* ipiv, lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sgetrs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const float* a,
+                               lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb);
+lapack_int LAPACKE_dgetrs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const double* a,
+                               lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb);
+lapack_int LAPACKE_cgetrs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv,
+                               lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zgetrs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv,
+                               lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_sggbak_work(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
+                               const float* lscale, const float* rscale, lapack_int m, float* v, lapack_int ldv);
+lapack_int LAPACKE_dggbak_work(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
+                               const double* lscale, const double* rscale, lapack_int m, double* v, lapack_int ldv);
+lapack_int LAPACKE_cggbak_work(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
+                               const float* lscale, const float* rscale, lapack_int m, lapack_complex_float* v,
+                               lapack_int ldv);
+lapack_int LAPACKE_zggbak_work(int matrix_order, char job, char side, lapack_int n, lapack_int ilo, lapack_int ihi,
+                               const double* lscale, const double* rscale, lapack_int m, lapack_complex_double* v,
+                               lapack_int ldv);
+
+lapack_int LAPACKE_sggbal_work(int matrix_order, char job, lapack_int n, float* a, lapack_int lda, float* b,
+                               lapack_int ldb, lapack_int* ilo, lapack_int* ihi, float* lscale, float* rscale,
+                               float* work);
+lapack_int LAPACKE_dggbal_work(int matrix_order, char job, lapack_int n, double* a, lapack_int lda, double* b,
+                               lapack_int ldb, lapack_int* ilo, lapack_int* ihi, double* lscale, double* rscale,
+                               double* work);
+lapack_int LAPACKE_cggbal_work(int matrix_order, char job, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* b, lapack_int ldb, lapack_int* ilo, lapack_int* ihi, float* lscale,
+                               float* rscale, float* work);
+lapack_int LAPACKE_zggbal_work(int matrix_order, char job, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* b, lapack_int ldb, lapack_int* ilo, lapack_int* ihi,
+                               double* lscale, double* rscale, double* work);
+
+lapack_int LAPACKE_sgges_work(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_S_SELECT3 selctg,
+                              lapack_int n, float* a, lapack_int lda, float* b, lapack_int ldb, lapack_int* sdim,
+                              float* alphar, float* alphai, float* beta, float* vsl, lapack_int ldvsl, float* vsr,
+                              lapack_int ldvsr, float* work, lapack_int lwork, lapack_logical* bwork);
+lapack_int LAPACKE_dgges_work(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_D_SELECT3 selctg,
+                              lapack_int n, double* a, lapack_int lda, double* b, lapack_int ldb, lapack_int* sdim,
+                              double* alphar, double* alphai, double* beta, double* vsl, lapack_int ldvsl, double* vsr,
+                              lapack_int ldvsr, double* work, lapack_int lwork, lapack_logical* bwork);
+lapack_int LAPACKE_cgges_work(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_C_SELECT2 selctg,
+                              lapack_int n, lapack_complex_float* a, lapack_int lda, lapack_complex_float* b,
+                              lapack_int ldb, lapack_int* sdim, lapack_complex_float* alpha, lapack_complex_float* beta,
+                              lapack_complex_float* vsl, lapack_int ldvsl, lapack_complex_float* vsr, lapack_int ldvsr,
+                              lapack_complex_float* work, lapack_int lwork, float* rwork, lapack_logical* bwork);
+lapack_int LAPACKE_zgges_work(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_Z_SELECT2 selctg,
+                              lapack_int n, lapack_complex_double* a, lapack_int lda, lapack_complex_double* b,
+                              lapack_int ldb, lapack_int* sdim, lapack_complex_double* alpha,
+                              lapack_complex_double* beta, lapack_complex_double* vsl, lapack_int ldvsl,
+                              lapack_complex_double* vsr, lapack_int ldvsr, lapack_complex_double* work,
+                              lapack_int lwork, double* rwork, lapack_logical* bwork);
+
+lapack_int LAPACKE_sggesx_work(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_S_SELECT3 selctg,
+                               char sense, lapack_int n, float* a, lapack_int lda, float* b, lapack_int ldb,
+                               lapack_int* sdim, float* alphar, float* alphai, float* beta, float* vsl,
+                               lapack_int ldvsl, float* vsr, lapack_int ldvsr, float* rconde, float* rcondv,
+                               float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork,
+                               lapack_logical* bwork);
+lapack_int LAPACKE_dggesx_work(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_D_SELECT3 selctg,
+                               char sense, lapack_int n, double* a, lapack_int lda, double* b, lapack_int ldb,
+                               lapack_int* sdim, double* alphar, double* alphai, double* beta, double* vsl,
+                               lapack_int ldvsl, double* vsr, lapack_int ldvsr, double* rconde, double* rcondv,
+                               double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork,
+                               lapack_logical* bwork);
+lapack_int LAPACKE_cggesx_work(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_C_SELECT2 selctg,
+                               char sense, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* b, lapack_int ldb, lapack_int* sdim, lapack_complex_float* alpha,
+                               lapack_complex_float* beta, lapack_complex_float* vsl, lapack_int ldvsl,
+                               lapack_complex_float* vsr, lapack_int ldvsr, float* rconde, float* rcondv,
+                               lapack_complex_float* work, lapack_int lwork, float* rwork, lapack_int* iwork,
+                               lapack_int liwork, lapack_logical* bwork);
+lapack_int LAPACKE_zggesx_work(int matrix_order, char jobvsl, char jobvsr, char sort, LAPACK_Z_SELECT2 selctg,
+                               char sense, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* b, lapack_int ldb, lapack_int* sdim, lapack_complex_double* alpha,
+                               lapack_complex_double* beta, lapack_complex_double* vsl, lapack_int ldvsl,
+                               lapack_complex_double* vsr, lapack_int ldvsr, double* rconde, double* rcondv,
+                               lapack_complex_double* work, lapack_int lwork, double* rwork, lapack_int* iwork,
+                               lapack_int liwork, lapack_logical* bwork);
+
+lapack_int LAPACKE_sggev_work(int matrix_order, char jobvl, char jobvr, lapack_int n, float* a, lapack_int lda,
+                              float* b, lapack_int ldb, float* alphar, float* alphai, float* beta, float* vl,
+                              lapack_int ldvl, float* vr, lapack_int ldvr, float* work, lapack_int lwork);
+lapack_int LAPACKE_dggev_work(int matrix_order, char jobvl, char jobvr, lapack_int n, double* a, lapack_int lda,
+                              double* b, lapack_int ldb, double* alphar, double* alphai, double* beta, double* vl,
+                              lapack_int ldvl, double* vr, lapack_int ldvr, double* work, lapack_int lwork);
+lapack_int LAPACKE_cggev_work(int matrix_order, char jobvl, char jobvr, lapack_int n, lapack_complex_float* a,
+                              lapack_int lda, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* alpha,
+                              lapack_complex_float* beta, lapack_complex_float* vl, lapack_int ldvl,
+                              lapack_complex_float* vr, lapack_int ldvr, lapack_complex_float* work, lapack_int lwork,
+                              float* rwork);
+lapack_int LAPACKE_zggev_work(int matrix_order, char jobvl, char jobvr, lapack_int n, lapack_complex_double* a,
+                              lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* alpha,
+                              lapack_complex_double* beta, lapack_complex_double* vl, lapack_int ldvl,
+                              lapack_complex_double* vr, lapack_int ldvr, lapack_complex_double* work, lapack_int lwork,
+                              double* rwork);
+
+lapack_int LAPACKE_sggevx_work(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
+                               float* a, lapack_int lda, float* b, lapack_int ldb, float* alphar, float* alphai,
+                               float* beta, float* vl, lapack_int ldvl, float* vr, lapack_int ldvr, lapack_int* ilo,
+                               lapack_int* ihi, float* lscale, float* rscale, float* abnrm, float* bbnrm, float* rconde,
+                               float* rcondv, float* work, lapack_int lwork, lapack_int* iwork, lapack_logical* bwork);
+lapack_int LAPACKE_dggevx_work(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
+                               double* a, lapack_int lda, double* b, lapack_int ldb, double* alphar, double* alphai,
+                               double* beta, double* vl, lapack_int ldvl, double* vr, lapack_int ldvr, lapack_int* ilo,
+                               lapack_int* ihi, double* lscale, double* rscale, double* abnrm, double* bbnrm,
+                               double* rconde, double* rcondv, double* work, lapack_int lwork, lapack_int* iwork,
+                               lapack_logical* bwork);
+lapack_int LAPACKE_cggevx_work(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
+                               lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
+                               lapack_complex_float* alpha, lapack_complex_float* beta, lapack_complex_float* vl,
+                               lapack_int ldvl, lapack_complex_float* vr, lapack_int ldvr, lapack_int* ilo,
+                               lapack_int* ihi, float* lscale, float* rscale, float* abnrm, float* bbnrm, float* rconde,
+                               float* rcondv, lapack_complex_float* work, lapack_int lwork, float* rwork,
+                               lapack_int* iwork, lapack_logical* bwork);
+lapack_int LAPACKE_zggevx_work(int matrix_order, char balanc, char jobvl, char jobvr, char sense, lapack_int n,
+                               lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* alpha, lapack_complex_double* beta, lapack_complex_double* vl,
+                               lapack_int ldvl, lapack_complex_double* vr, lapack_int ldvr, lapack_int* ilo,
+                               lapack_int* ihi, double* lscale, double* rscale, double* abnrm, double* bbnrm,
+                               double* rconde, double* rcondv, lapack_complex_double* work, lapack_int lwork,
+                               double* rwork, lapack_int* iwork, lapack_logical* bwork);
+
+lapack_int LAPACKE_sggglm_work(int matrix_order, lapack_int n, lapack_int m, lapack_int p, float* a, lapack_int lda,
+                               float* b, lapack_int ldb, float* d, float* x, float* y, float* work, lapack_int lwork);
+lapack_int LAPACKE_dggglm_work(int matrix_order, lapack_int n, lapack_int m, lapack_int p, double* a, lapack_int lda,
+                               double* b, lapack_int ldb, double* d, double* x, double* y, double* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_cggglm_work(int matrix_order, lapack_int n, lapack_int m, lapack_int p, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* d,
+                               lapack_complex_float* x, lapack_complex_float* y, lapack_complex_float* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_zggglm_work(int matrix_order, lapack_int n, lapack_int m, lapack_int p, lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* d,
+                               lapack_complex_double* x, lapack_complex_double* y, lapack_complex_double* work,
+                               lapack_int lwork);
+
+lapack_int LAPACKE_sgghrd_work(int matrix_order, char compq, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
+                               float* a, lapack_int lda, float* b, lapack_int ldb, float* q, lapack_int ldq, float* z,
+                               lapack_int ldz);
+lapack_int LAPACKE_dgghrd_work(int matrix_order, char compq, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
+                               double* a, lapack_int lda, double* b, lapack_int ldb, double* q, lapack_int ldq,
+                               double* z, lapack_int ldz);
+lapack_int LAPACKE_cgghrd_work(int matrix_order, char compq, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
+                               lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
+                               lapack_complex_float* q, lapack_int ldq, lapack_complex_float* z, lapack_int ldz);
+lapack_int LAPACKE_zgghrd_work(int matrix_order, char compq, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
+                               lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* q, lapack_int ldq, lapack_complex_double* z, lapack_int ldz);
+
+lapack_int LAPACKE_sgglse_work(int matrix_order, lapack_int m, lapack_int n, lapack_int p, float* a, lapack_int lda,
+                               float* b, lapack_int ldb, float* c, float* d, float* x, float* work, lapack_int lwork);
+lapack_int LAPACKE_dgglse_work(int matrix_order, lapack_int m, lapack_int n, lapack_int p, double* a, lapack_int lda,
+                               double* b, lapack_int ldb, double* c, double* d, double* x, double* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_cgglse_work(int matrix_order, lapack_int m, lapack_int n, lapack_int p, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* c,
+                               lapack_complex_float* d, lapack_complex_float* x, lapack_complex_float* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_zgglse_work(int matrix_order, lapack_int m, lapack_int n, lapack_int p, lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* c,
+                               lapack_complex_double* d, lapack_complex_double* x, lapack_complex_double* work,
+                               lapack_int lwork);
+
+lapack_int LAPACKE_sggqrf_work(int matrix_order, lapack_int n, lapack_int m, lapack_int p, float* a, lapack_int lda,
+                               float* taua, float* b, lapack_int ldb, float* taub, float* work, lapack_int lwork);
+lapack_int LAPACKE_dggqrf_work(int matrix_order, lapack_int n, lapack_int m, lapack_int p, double* a, lapack_int lda,
+                               double* taua, double* b, lapack_int ldb, double* taub, double* work, lapack_int lwork);
+lapack_int LAPACKE_cggqrf_work(int matrix_order, lapack_int n, lapack_int m, lapack_int p, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* taua, lapack_complex_float* b, lapack_int ldb,
+                               lapack_complex_float* taub, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zggqrf_work(int matrix_order, lapack_int n, lapack_int m, lapack_int p, lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* taua, lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* taub, lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sggrqf_work(int matrix_order, lapack_int m, lapack_int p, lapack_int n, float* a, lapack_int lda,
+                               float* taua, float* b, lapack_int ldb, float* taub, float* work, lapack_int lwork);
+lapack_int LAPACKE_dggrqf_work(int matrix_order, lapack_int m, lapack_int p, lapack_int n, double* a, lapack_int lda,
+                               double* taua, double* b, lapack_int ldb, double* taub, double* work, lapack_int lwork);
+lapack_int LAPACKE_cggrqf_work(int matrix_order, lapack_int m, lapack_int p, lapack_int n, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* taua, lapack_complex_float* b, lapack_int ldb,
+                               lapack_complex_float* taub, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zggrqf_work(int matrix_order, lapack_int m, lapack_int p, lapack_int n, lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* taua, lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* taub, lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sggsvd_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int n,
+                               lapack_int p, lapack_int* k, lapack_int* l, float* a, lapack_int lda, float* b,
+                               lapack_int ldb, float* alpha, float* beta, float* u, lapack_int ldu, float* v,
+                               lapack_int ldv, float* q, lapack_int ldq, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dggsvd_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int n,
+                               lapack_int p, lapack_int* k, lapack_int* l, double* a, lapack_int lda, double* b,
+                               lapack_int ldb, double* alpha, double* beta, double* u, lapack_int ldu, double* v,
+                               lapack_int ldv, double* q, lapack_int ldq, double* work, lapack_int* iwork);
+lapack_int LAPACKE_cggsvd_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int n,
+                               lapack_int p, lapack_int* k, lapack_int* l, lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* b, lapack_int ldb, float* alpha, float* beta,
+                               lapack_complex_float* u, lapack_int ldu, lapack_complex_float* v, lapack_int ldv,
+                               lapack_complex_float* q, lapack_int ldq, lapack_complex_float* work, float* rwork,
+                               lapack_int* iwork);
+lapack_int LAPACKE_zggsvd_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int n,
+                               lapack_int p, lapack_int* k, lapack_int* l, lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* b, lapack_int ldb, double* alpha, double* beta,
+                               lapack_complex_double* u, lapack_int ldu, lapack_complex_double* v, lapack_int ldv,
+                               lapack_complex_double* q, lapack_int ldq, lapack_complex_double* work, double* rwork,
+                               lapack_int* iwork);
+
+lapack_int LAPACKE_sggsvp_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p,
+                               lapack_int n, float* a, lapack_int lda, float* b, lapack_int ldb, float tola, float tolb,
+                               lapack_int* k, lapack_int* l, float* u, lapack_int ldu, float* v, lapack_int ldv,
+                               float* q, lapack_int ldq, lapack_int* iwork, float* tau, float* work);
+lapack_int LAPACKE_dggsvp_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p,
+                               lapack_int n, double* a, lapack_int lda, double* b, lapack_int ldb, double tola,
+                               double tolb, lapack_int* k, lapack_int* l, double* u, lapack_int ldu, double* v,
+                               lapack_int ldv, double* q, lapack_int ldq, lapack_int* iwork, double* tau, double* work);
+lapack_int LAPACKE_cggsvp_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p,
+                               lapack_int n, lapack_complex_float* a, lapack_int lda, lapack_complex_float* b,
+                               lapack_int ldb, float tola, float tolb, lapack_int* k, lapack_int* l,
+                               lapack_complex_float* u, lapack_int ldu, lapack_complex_float* v, lapack_int ldv,
+                               lapack_complex_float* q, lapack_int ldq, lapack_int* iwork, float* rwork,
+                               lapack_complex_float* tau, lapack_complex_float* work);
+lapack_int LAPACKE_zggsvp_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p,
+                               lapack_int n, lapack_complex_double* a, lapack_int lda, lapack_complex_double* b,
+                               lapack_int ldb, double tola, double tolb, lapack_int* k, lapack_int* l,
+                               lapack_complex_double* u, lapack_int ldu, lapack_complex_double* v, lapack_int ldv,
+                               lapack_complex_double* q, lapack_int ldq, lapack_int* iwork, double* rwork,
+                               lapack_complex_double* tau, lapack_complex_double* work);
+
+lapack_int LAPACKE_sgtcon_work(char norm, lapack_int n, const float* dl, const float* d, const float* du,
+                               const float* du2, const lapack_int* ipiv, float anorm, float* rcond, float* work,
+                               lapack_int* iwork);
+lapack_int LAPACKE_dgtcon_work(char norm, lapack_int n, const double* dl, const double* d, const double* du,
+                               const double* du2, const lapack_int* ipiv, double anorm, double* rcond, double* work,
+                               lapack_int* iwork);
+lapack_int LAPACKE_cgtcon_work(char norm, lapack_int n, const lapack_complex_float* dl, const lapack_complex_float* d,
+                               const lapack_complex_float* du, const lapack_complex_float* du2, const lapack_int* ipiv,
+                               float anorm, float* rcond, lapack_complex_float* work);
+lapack_int LAPACKE_zgtcon_work(char norm, lapack_int n, const lapack_complex_double* dl, const lapack_complex_double* d,
+                               const lapack_complex_double* du, const lapack_complex_double* du2,
+                               const lapack_int* ipiv, double anorm, double* rcond, lapack_complex_double* work);
+
+lapack_int LAPACKE_sgtrfs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const float* dl,
+                               const float* d, const float* du, const float* dlf, const float* df, const float* duf,
+                               const float* du2, const lapack_int* ipiv, const float* b, lapack_int ldb, float* x,
+                               lapack_int ldx, float* ferr, float* berr, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dgtrfs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const double* dl,
+                               const double* d, const double* du, const double* dlf, const double* df,
+                               const double* duf, const double* du2, const lapack_int* ipiv, const double* b,
+                               lapack_int ldb, double* x, lapack_int ldx, double* ferr, double* berr, double* work,
+                               lapack_int* iwork);
+lapack_int LAPACKE_cgtrfs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* dl, const lapack_complex_float* d,
+                               const lapack_complex_float* du, const lapack_complex_float* dlf,
+                               const lapack_complex_float* df, const lapack_complex_float* duf,
+                               const lapack_complex_float* du2, const lapack_int* ipiv, const lapack_complex_float* b,
+                               lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr,
+                               lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zgtrfs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* dl, const lapack_complex_double* d,
+                               const lapack_complex_double* du, const lapack_complex_double* dlf,
+                               const lapack_complex_double* df, const lapack_complex_double* duf,
+                               const lapack_complex_double* du2, const lapack_int* ipiv, const lapack_complex_double* b,
+                               lapack_int ldb, lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr,
+                               lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sgtsv_work(int matrix_order, lapack_int n, lapack_int nrhs, float* dl, float* d, float* du, float* b,
+                              lapack_int ldb);
+lapack_int LAPACKE_dgtsv_work(int matrix_order, lapack_int n, lapack_int nrhs, double* dl, double* d, double* du,
+                              double* b, lapack_int ldb);
+lapack_int LAPACKE_cgtsv_work(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_float* dl,
+                              lapack_complex_float* d, lapack_complex_float* du, lapack_complex_float* b,
+                              lapack_int ldb);
+lapack_int LAPACKE_zgtsv_work(int matrix_order, lapack_int n, lapack_int nrhs, lapack_complex_double* dl,
+                              lapack_complex_double* d, lapack_complex_double* du, lapack_complex_double* b,
+                              lapack_int ldb);
+
+lapack_int LAPACKE_sgtsvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, const float* dl,
+                               const float* d, const float* du, float* dlf, float* df, float* duf, float* du2,
+                               lapack_int* ipiv, const float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond,
+                               float* ferr, float* berr, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dgtsvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs, const double* dl,
+                               const double* d, const double* du, double* dlf, double* df, double* duf, double* du2,
+                               lapack_int* ipiv, const double* b, lapack_int ldb, double* x, lapack_int ldx,
+                               double* rcond, double* ferr, double* berr, double* work, lapack_int* iwork);
+lapack_int LAPACKE_cgtsvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* dl, const lapack_complex_float* d,
+                               const lapack_complex_float* du, lapack_complex_float* dlf, lapack_complex_float* df,
+                               lapack_complex_float* duf, lapack_complex_float* du2, lapack_int* ipiv,
+                               const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                               float* rcond, float* ferr, float* berr, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zgtsvx_work(int matrix_order, char fact, char trans, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* dl, const lapack_complex_double* d,
+                               const lapack_complex_double* du, lapack_complex_double* dlf, lapack_complex_double* df,
+                               lapack_complex_double* duf, lapack_complex_double* du2, lapack_int* ipiv,
+                               const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                               double* rcond, double* ferr, double* berr, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sgttrf_work(lapack_int n, float* dl, float* d, float* du, float* du2, lapack_int* ipiv);
+lapack_int LAPACKE_dgttrf_work(lapack_int n, double* dl, double* d, double* du, double* du2, lapack_int* ipiv);
+lapack_int LAPACKE_cgttrf_work(lapack_int n, lapack_complex_float* dl, lapack_complex_float* d,
+                               lapack_complex_float* du, lapack_complex_float* du2, lapack_int* ipiv);
+lapack_int LAPACKE_zgttrf_work(lapack_int n, lapack_complex_double* dl, lapack_complex_double* d,
+                               lapack_complex_double* du, lapack_complex_double* du2, lapack_int* ipiv);
+
+lapack_int LAPACKE_sgttrs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const float* dl,
+                               const float* d, const float* du, const float* du2, const lapack_int* ipiv, float* b,
+                               lapack_int ldb);
+lapack_int LAPACKE_dgttrs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs, const double* dl,
+                               const double* d, const double* du, const double* du2, const lapack_int* ipiv, double* b,
+                               lapack_int ldb);
+lapack_int LAPACKE_cgttrs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* dl, const lapack_complex_float* d,
+                               const lapack_complex_float* du, const lapack_complex_float* du2, const lapack_int* ipiv,
+                               lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zgttrs_work(int matrix_order, char trans, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* dl, const lapack_complex_double* d,
+                               const lapack_complex_double* du, const lapack_complex_double* du2,
+                               const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_chbev_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd,
+                              lapack_complex_float* ab, lapack_int ldab, float* w, lapack_complex_float* z,
+                              lapack_int ldz, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zhbev_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd,
+                              lapack_complex_double* ab, lapack_int ldab, double* w, lapack_complex_double* z,
+                              lapack_int ldz, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_chbevd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd,
+                               lapack_complex_float* ab, lapack_int ldab, float* w, lapack_complex_float* z,
+                               lapack_int ldz, lapack_complex_float* work, lapack_int lwork, float* rwork,
+                               lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_zhbevd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd,
+                               lapack_complex_double* ab, lapack_int ldab, double* w, lapack_complex_double* z,
+                               lapack_int ldz, lapack_complex_double* work, lapack_int lwork, double* rwork,
+                               lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
+
+lapack_int LAPACKE_chbevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int kd,
+                               lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* q, lapack_int ldq,
+                               float vl, float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w,
+                               lapack_complex_float* z, lapack_int ldz, lapack_complex_float* work, float* rwork,
+                               lapack_int* iwork, lapack_int* ifail);
+lapack_int LAPACKE_zhbevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int kd,
+                               lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* q, lapack_int ldq,
+                               double vl, double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m,
+                               double* w, lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work,
+                               double* rwork, lapack_int* iwork, lapack_int* ifail);
+
+lapack_int LAPACKE_chbgst_work(int matrix_order, char vect, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                               lapack_complex_float* ab, lapack_int ldab, const lapack_complex_float* bb,
+                               lapack_int ldbb, lapack_complex_float* x, lapack_int ldx, lapack_complex_float* work,
+                               float* rwork);
+lapack_int LAPACKE_zhbgst_work(int matrix_order, char vect, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                               lapack_complex_double* ab, lapack_int ldab, const lapack_complex_double* bb,
+                               lapack_int ldbb, lapack_complex_double* x, lapack_int ldx, lapack_complex_double* work,
+                               double* rwork);
+
+lapack_int LAPACKE_chbgv_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                              lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* bb, lapack_int ldbb,
+                              float* w, lapack_complex_float* z, lapack_int ldz, lapack_complex_float* work,
+                              float* rwork);
+lapack_int LAPACKE_zhbgv_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                              lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* bb, lapack_int ldbb,
+                              double* w, lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work,
+                              double* rwork);
+
+lapack_int LAPACKE_chbgvd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                               lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* bb, lapack_int ldbb,
+                               float* w, lapack_complex_float* z, lapack_int ldz, lapack_complex_float* work,
+                               lapack_int lwork, float* rwork, lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_zhbgvd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                               lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* bb, lapack_int ldbb,
+                               double* w, lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work,
+                               lapack_int lwork, double* rwork, lapack_int lrwork, lapack_int* iwork,
+                               lapack_int liwork);
+
+lapack_int LAPACKE_chbgvx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int ka,
+                               lapack_int kb, lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* bb,
+                               lapack_int ldbb, lapack_complex_float* q, lapack_int ldq, float vl, float vu,
+                               lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w,
+                               lapack_complex_float* z, lapack_int ldz, lapack_complex_float* work, float* rwork,
+                               lapack_int* iwork, lapack_int* ifail);
+lapack_int LAPACKE_zhbgvx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int ka,
+                               lapack_int kb, lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* bb,
+                               lapack_int ldbb, lapack_complex_double* q, lapack_int ldq, double vl, double vu,
+                               lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
+                               lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work, double* rwork,
+                               lapack_int* iwork, lapack_int* ifail);
+
+lapack_int LAPACKE_chbtrd_work(int matrix_order, char vect, char uplo, lapack_int n, lapack_int kd,
+                               lapack_complex_float* ab, lapack_int ldab, float* d, float* e, lapack_complex_float* q,
+                               lapack_int ldq, lapack_complex_float* work);
+lapack_int LAPACKE_zhbtrd_work(int matrix_order, char vect, char uplo, lapack_int n, lapack_int kd,
+                               lapack_complex_double* ab, lapack_int ldab, double* d, double* e,
+                               lapack_complex_double* q, lapack_int ldq, lapack_complex_double* work);
+
+lapack_int LAPACKE_checon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
+                               const lapack_int* ipiv, float anorm, float* rcond, lapack_complex_float* work);
+lapack_int LAPACKE_zhecon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a,
+                               lapack_int lda, const lapack_int* ipiv, double anorm, double* rcond,
+                               lapack_complex_double* work);
+
+lapack_int LAPACKE_cheequb_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a,
+                                lapack_int lda, float* s, float* scond, float* amax, lapack_complex_float* work);
+lapack_int LAPACKE_zheequb_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a,
+                                lapack_int lda, double* s, double* scond, double* amax, lapack_complex_double* work);
+
+lapack_int LAPACKE_cheev_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_float* a,
+                              lapack_int lda, float* w, lapack_complex_float* work, lapack_int lwork, float* rwork);
+lapack_int LAPACKE_zheev_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_double* a,
+                              lapack_int lda, double* w, lapack_complex_double* work, lapack_int lwork, double* rwork);
+
+lapack_int LAPACKE_cheevd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_float* a,
+                               lapack_int lda, float* w, lapack_complex_float* work, lapack_int lwork, float* rwork,
+                               lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_zheevd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_double* a,
+                               lapack_int lda, double* w, lapack_complex_double* work, lapack_int lwork, double* rwork,
+                               lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
+
+lapack_int LAPACKE_cheevr_work(int matrix_order, char jobz, char range, char uplo, lapack_int n,
+                               lapack_complex_float* a, lapack_int lda, float vl, float vu, lapack_int il,
+                               lapack_int iu, float abstol, lapack_int* m, float* w, lapack_complex_float* z,
+                               lapack_int ldz, lapack_int* isuppz, lapack_complex_float* work, lapack_int lwork,
+                               float* rwork, lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_zheevr_work(int matrix_order, char jobz, char range, char uplo, lapack_int n,
+                               lapack_complex_double* a, lapack_int lda, double vl, double vu, lapack_int il,
+                               lapack_int iu, double abstol, lapack_int* m, double* w, lapack_complex_double* z,
+                               lapack_int ldz, lapack_int* isuppz, lapack_complex_double* work, lapack_int lwork,
+                               double* rwork, lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
+
+lapack_int LAPACKE_cheevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n,
+                               lapack_complex_float* a, lapack_int lda, float vl, float vu, lapack_int il,
+                               lapack_int iu, float abstol, lapack_int* m, float* w, lapack_complex_float* z,
+                               lapack_int ldz, lapack_complex_float* work, lapack_int lwork, float* rwork,
+                               lapack_int* iwork, lapack_int* ifail);
+lapack_int LAPACKE_zheevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n,
+                               lapack_complex_double* a, lapack_int lda, double vl, double vu, lapack_int il,
+                               lapack_int iu, double abstol, lapack_int* m, double* w, lapack_complex_double* z,
+                               lapack_int ldz, lapack_complex_double* work, lapack_int lwork, double* rwork,
+                               lapack_int* iwork, lapack_int* ifail);
+
+lapack_int LAPACKE_chegst_work(int matrix_order, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a,
+                               lapack_int lda, const lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zhegst_work(int matrix_order, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a,
+                               lapack_int lda, const lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_chegv_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
+                              lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
+                              float* w, lapack_complex_float* work, lapack_int lwork, float* rwork);
+lapack_int LAPACKE_zhegv_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
+                              lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
+                              double* w, lapack_complex_double* work, lapack_int lwork, double* rwork);
+
+lapack_int LAPACKE_chegvd_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
+                               lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
+                               float* w, lapack_complex_float* work, lapack_int lwork, float* rwork, lapack_int lrwork,
+                               lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_zhegvd_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
+                               lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
+                               double* w, lapack_complex_double* work, lapack_int lwork, double* rwork,
+                               lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
+
+lapack_int LAPACKE_chegvx_work(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
+                               lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
+                               float vl, float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w,
+                               lapack_complex_float* z, lapack_int ldz, lapack_complex_float* work, lapack_int lwork,
+                               float* rwork, lapack_int* iwork, lapack_int* ifail);
+lapack_int LAPACKE_zhegvx_work(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
+                               lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
+                               double vl, double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m,
+                               double* w, lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work,
+                               lapack_int lwork, double* rwork, lapack_int* iwork, lapack_int* ifail);
+
+lapack_int LAPACKE_cherfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
+                               lapack_int ldaf, const lapack_int* ipiv, const lapack_complex_float* b, lapack_int ldb,
+                               lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr,
+                               lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zherfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
+                               lapack_int ldaf, const lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr,
+                               lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_cherfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
+                                lapack_int ldaf, const lapack_int* ipiv, const float* s, const lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* rcond, float* berr,
+                                lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
+                                float* params, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zherfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
+                                lapack_int ldaf, const lapack_int* ipiv, const double* s,
+                                const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, double* rcond, double* berr, lapack_int n_err_bnds,
+                                double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams, double* params,
+                                lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_chesv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
+                              lapack_int lda, lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb,
+                              lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zhesv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
+                              lapack_int lda, lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb,
+                              lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_chesvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
+                               lapack_int* ipiv, const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
+                               lapack_int ldx, float* rcond, float* ferr, float* berr, lapack_complex_float* work,
+                               lapack_int lwork, float* rwork);
+lapack_int LAPACKE_zhesvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* a, lapack_int lda, lapack_complex_double* af,
+                               lapack_int ldaf, lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* x, lapack_int ldx, double* rcond, double* ferr, double* berr,
+                               lapack_complex_double* work, lapack_int lwork, double* rwork);
+
+lapack_int LAPACKE_chesvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                                lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
+                                lapack_int* ipiv, char* equed, float* s, lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx, float* rcond, float* rpvgrw, float* berr,
+                                lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
+                                float* params, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zhesvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                                lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
+                                lapack_int* ipiv, char* equed, double* s, lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx, double* rcond, double* rpvgrw, double* berr,
+                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
+                                double* params, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_chetrd_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               float* d, float* e, lapack_complex_float* tau, lapack_complex_float* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_zhetrd_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               double* d, double* e, lapack_complex_double* tau, lapack_complex_double* work,
+                               lapack_int lwork);
+
+lapack_int LAPACKE_chetrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               lapack_int* ipiv, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zhetrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               lapack_int* ipiv, lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_chetri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               const lapack_int* ipiv, lapack_complex_float* work);
+lapack_int LAPACKE_zhetri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               const lapack_int* ipiv, lapack_complex_double* work);
+
+lapack_int LAPACKE_chetrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv,
+                               lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zhetrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv,
+                               lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_chfrk_work(int matrix_order, char transr, char uplo, char trans, lapack_int n, lapack_int k,
+                              float alpha, const lapack_complex_float* a, lapack_int lda, float beta,
+                              lapack_complex_float* c);
+lapack_int LAPACKE_zhfrk_work(int matrix_order, char transr, char uplo, char trans, lapack_int n, lapack_int k,
+                              double alpha, const lapack_complex_double* a, lapack_int lda, double beta,
+                              lapack_complex_double* c);
+
+lapack_int LAPACKE_shgeqz_work(int matrix_order, char job, char compq, char compz, lapack_int n, lapack_int ilo,
+                               lapack_int ihi, float* h, lapack_int ldh, float* t, lapack_int ldt, float* alphar,
+                               float* alphai, float* beta, float* q, lapack_int ldq, float* z, lapack_int ldz,
+                               float* work, lapack_int lwork);
+lapack_int LAPACKE_dhgeqz_work(int matrix_order, char job, char compq, char compz, lapack_int n, lapack_int ilo,
+                               lapack_int ihi, double* h, lapack_int ldh, double* t, lapack_int ldt, double* alphar,
+                               double* alphai, double* beta, double* q, lapack_int ldq, double* z, lapack_int ldz,
+                               double* work, lapack_int lwork);
+lapack_int LAPACKE_chgeqz_work(int matrix_order, char job, char compq, char compz, lapack_int n, lapack_int ilo,
+                               lapack_int ihi, lapack_complex_float* h, lapack_int ldh, lapack_complex_float* t,
+                               lapack_int ldt, lapack_complex_float* alpha, lapack_complex_float* beta,
+                               lapack_complex_float* q, lapack_int ldq, lapack_complex_float* z, lapack_int ldz,
+                               lapack_complex_float* work, lapack_int lwork, float* rwork);
+lapack_int LAPACKE_zhgeqz_work(int matrix_order, char job, char compq, char compz, lapack_int n, lapack_int ilo,
+                               lapack_int ihi, lapack_complex_double* h, lapack_int ldh, lapack_complex_double* t,
+                               lapack_int ldt, lapack_complex_double* alpha, lapack_complex_double* beta,
+                               lapack_complex_double* q, lapack_int ldq, lapack_complex_double* z, lapack_int ldz,
+                               lapack_complex_double* work, lapack_int lwork, double* rwork);
+
+lapack_int LAPACKE_chpcon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap,
+                               const lapack_int* ipiv, float anorm, float* rcond, lapack_complex_float* work);
+lapack_int LAPACKE_zhpcon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap,
+                               const lapack_int* ipiv, double anorm, double* rcond, lapack_complex_double* work);
+
+lapack_int LAPACKE_chpev_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_float* ap, float* w,
+                              lapack_complex_float* z, lapack_int ldz, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zhpev_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_double* ap,
+                              double* w, lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work,
+                              double* rwork);
+
+lapack_int LAPACKE_chpevd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_float* ap, float* w,
+                               lapack_complex_float* z, lapack_int ldz, lapack_complex_float* work, lapack_int lwork,
+                               float* rwork, lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_zhpevd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_complex_double* ap,
+                               double* w, lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work,
+                               lapack_int lwork, double* rwork, lapack_int lrwork, lapack_int* iwork,
+                               lapack_int liwork);
+
+lapack_int LAPACKE_chpevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n,
+                               lapack_complex_float* ap, float vl, float vu, lapack_int il, lapack_int iu, float abstol,
+                               lapack_int* m, float* w, lapack_complex_float* z, lapack_int ldz,
+                               lapack_complex_float* work, float* rwork, lapack_int* iwork, lapack_int* ifail);
+lapack_int LAPACKE_zhpevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n,
+                               lapack_complex_double* ap, double vl, double vu, lapack_int il, lapack_int iu,
+                               double abstol, lapack_int* m, double* w, lapack_complex_double* z, lapack_int ldz,
+                               lapack_complex_double* work, double* rwork, lapack_int* iwork, lapack_int* ifail);
+
+lapack_int LAPACKE_chpgst_work(int matrix_order, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* ap,
+                               const lapack_complex_float* bp);
+lapack_int LAPACKE_zhpgst_work(int matrix_order, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* ap,
+                               const lapack_complex_double* bp);
+
+lapack_int LAPACKE_chpgv_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
+                              lapack_complex_float* ap, lapack_complex_float* bp, float* w, lapack_complex_float* z,
+                              lapack_int ldz, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zhpgv_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
+                              lapack_complex_double* ap, lapack_complex_double* bp, double* w, lapack_complex_double* z,
+                              lapack_int ldz, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_chpgvd_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
+                               lapack_complex_float* ap, lapack_complex_float* bp, float* w, lapack_complex_float* z,
+                               lapack_int ldz, lapack_complex_float* work, lapack_int lwork, float* rwork,
+                               lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_zhpgvd_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n,
+                               lapack_complex_double* ap, lapack_complex_double* bp, double* w,
+                               lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work, lapack_int lwork,
+                               double* rwork, lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
+
+lapack_int LAPACKE_chpgvx_work(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
+                               lapack_complex_float* ap, lapack_complex_float* bp, float vl, float vu, lapack_int il,
+                               lapack_int iu, float abstol, lapack_int* m, float* w, lapack_complex_float* z,
+                               lapack_int ldz, lapack_complex_float* work, float* rwork, lapack_int* iwork,
+                               lapack_int* ifail);
+lapack_int LAPACKE_zhpgvx_work(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
+                               lapack_complex_double* ap, lapack_complex_double* bp, double vl, double vu,
+                               lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
+                               lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work, double* rwork,
+                               lapack_int* iwork, lapack_int* ifail);
+
+lapack_int LAPACKE_chprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* ap, const lapack_complex_float* afp, const lapack_int* ipiv,
+                               const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                               float* ferr, float* berr, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zhprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* ap, const lapack_complex_double* afp,
+                               const lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr,
+                               lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_chpsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* ap,
+                              lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zhpsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* ap,
+                              lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_chpsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* ap, lapack_complex_float* afp, lapack_int* ipiv,
+                               const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                               float* rcond, float* ferr, float* berr, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zhpsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* ap, lapack_complex_double* afp, lapack_int* ipiv,
+                               const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                               double* rcond, double* ferr, double* berr, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_chptrd_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap, float* d, float* e,
+                               lapack_complex_float* tau);
+lapack_int LAPACKE_zhptrd_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap, double* d,
+                               double* e, lapack_complex_double* tau);
+
+lapack_int LAPACKE_chptrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap, lapack_int* ipiv);
+lapack_int LAPACKE_zhptrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap, lapack_int* ipiv);
+
+lapack_int LAPACKE_chptri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap,
+                               const lapack_int* ipiv, lapack_complex_float* work);
+lapack_int LAPACKE_zhptri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap,
+                               const lapack_int* ipiv, lapack_complex_double* work);
+
+lapack_int LAPACKE_chptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* ap, const lapack_int* ipiv, lapack_complex_float* b,
+                               lapack_int ldb);
+lapack_int LAPACKE_zhptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* ap, const lapack_int* ipiv, lapack_complex_double* b,
+                               lapack_int ldb);
+
+lapack_int LAPACKE_shsein_work(int matrix_order, char job, char eigsrc, char initv, lapack_logical* select,
+                               lapack_int n, const float* h, lapack_int ldh, float* wr, const float* wi, float* vl,
+                               lapack_int ldvl, float* vr, lapack_int ldvr, lapack_int mm, lapack_int* m, float* work,
+                               lapack_int* ifaill, lapack_int* ifailr);
+lapack_int LAPACKE_dhsein_work(int matrix_order, char job, char eigsrc, char initv, lapack_logical* select,
+                               lapack_int n, const double* h, lapack_int ldh, double* wr, const double* wi, double* vl,
+                               lapack_int ldvl, double* vr, lapack_int ldvr, lapack_int mm, lapack_int* m, double* work,
+                               lapack_int* ifaill, lapack_int* ifailr);
+lapack_int LAPACKE_chsein_work(int matrix_order, char job, char eigsrc, char initv, const lapack_logical* select,
+                               lapack_int n, const lapack_complex_float* h, lapack_int ldh, lapack_complex_float* w,
+                               lapack_complex_float* vl, lapack_int ldvl, lapack_complex_float* vr, lapack_int ldvr,
+                               lapack_int mm, lapack_int* m, lapack_complex_float* work, float* rwork,
+                               lapack_int* ifaill, lapack_int* ifailr);
+lapack_int LAPACKE_zhsein_work(int matrix_order, char job, char eigsrc, char initv, const lapack_logical* select,
+                               lapack_int n, const lapack_complex_double* h, lapack_int ldh, lapack_complex_double* w,
+                               lapack_complex_double* vl, lapack_int ldvl, lapack_complex_double* vr, lapack_int ldvr,
+                               lapack_int mm, lapack_int* m, lapack_complex_double* work, double* rwork,
+                               lapack_int* ifaill, lapack_int* ifailr);
+
+lapack_int LAPACKE_shseqr_work(int matrix_order, char job, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
+                               float* h, lapack_int ldh, float* wr, float* wi, float* z, lapack_int ldz, float* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_dhseqr_work(int matrix_order, char job, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
+                               double* h, lapack_int ldh, double* wr, double* wi, double* z, lapack_int ldz,
+                               double* work, lapack_int lwork);
+lapack_int LAPACKE_chseqr_work(int matrix_order, char job, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
+                               lapack_complex_float* h, lapack_int ldh, lapack_complex_float* w,
+                               lapack_complex_float* z, lapack_int ldz, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zhseqr_work(int matrix_order, char job, char compz, lapack_int n, lapack_int ilo, lapack_int ihi,
+                               lapack_complex_double* h, lapack_int ldh, lapack_complex_double* w,
+                               lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_clacgv_work(lapack_int n, lapack_complex_float* x, lapack_int incx);
+lapack_int LAPACKE_zlacgv_work(lapack_int n, lapack_complex_double* x, lapack_int incx);
+
+lapack_int LAPACKE_slacpy_work(int matrix_order, char uplo, lapack_int m, lapack_int n, const float* a, lapack_int lda,
+                               float* b, lapack_int ldb);
+lapack_int LAPACKE_dlacpy_work(int matrix_order, char uplo, lapack_int m, lapack_int n, const double* a, lapack_int lda,
+                               double* b, lapack_int ldb);
+lapack_int LAPACKE_clacpy_work(int matrix_order, char uplo, lapack_int m, lapack_int n, const lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zlacpy_work(int matrix_order, char uplo, lapack_int m, lapack_int n, const lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_zlag2c_work(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_float* sa, lapack_int ldsa);
+
+lapack_int LAPACKE_slag2d_work(int matrix_order, lapack_int m, lapack_int n, const float* sa, lapack_int ldsa,
+                               double* a, lapack_int lda);
+
+lapack_int LAPACKE_dlag2s_work(int matrix_order, lapack_int m, lapack_int n, const double* a, lapack_int lda, float* sa,
+                               lapack_int ldsa);
+
+lapack_int LAPACKE_clag2z_work(int matrix_order, lapack_int m, lapack_int n, const lapack_complex_float* sa,
+                               lapack_int ldsa, lapack_complex_double* a, lapack_int lda);
+
+lapack_int LAPACKE_slagge_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                               const float* d, float* a, lapack_int lda, lapack_int* iseed, float* work);
+lapack_int LAPACKE_dlagge_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                               const double* d, double* a, lapack_int lda, lapack_int* iseed, double* work);
+lapack_int LAPACKE_clagge_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                               const float* d, lapack_complex_float* a, lapack_int lda, lapack_int* iseed,
+                               lapack_complex_float* work);
+lapack_int LAPACKE_zlagge_work(int matrix_order, lapack_int m, lapack_int n, lapack_int kl, lapack_int ku,
+                               const double* d, lapack_complex_double* a, lapack_int lda, lapack_int* iseed,
+                               lapack_complex_double* work);
+
+lapack_int LAPACKE_claghe_work(int matrix_order, lapack_int n, lapack_int k, const float* d, lapack_complex_float* a,
+                               lapack_int lda, lapack_int* iseed, lapack_complex_float* work);
+lapack_int LAPACKE_zlaghe_work(int matrix_order, lapack_int n, lapack_int k, const double* d, lapack_complex_double* a,
+                               lapack_int lda, lapack_int* iseed, lapack_complex_double* work);
+
+lapack_int LAPACKE_slagsy_work(int matrix_order, lapack_int n, lapack_int k, const float* d, float* a, lapack_int lda,
+                               lapack_int* iseed, float* work);
+lapack_int LAPACKE_dlagsy_work(int matrix_order, lapack_int n, lapack_int k, const double* d, double* a, lapack_int lda,
+                               lapack_int* iseed, double* work);
+lapack_int LAPACKE_clagsy_work(int matrix_order, lapack_int n, lapack_int k, const float* d, lapack_complex_float* a,
+                               lapack_int lda, lapack_int* iseed, lapack_complex_float* work);
+lapack_int LAPACKE_zlagsy_work(int matrix_order, lapack_int n, lapack_int k, const double* d, lapack_complex_double* a,
+                               lapack_int lda, lapack_int* iseed, lapack_complex_double* work);
+
+lapack_int LAPACKE_slapmr_work(int matrix_order, lapack_logical forwrd, lapack_int m, lapack_int n, float* x,
+                               lapack_int ldx, lapack_int* k);
+lapack_int LAPACKE_dlapmr_work(int matrix_order, lapack_logical forwrd, lapack_int m, lapack_int n, double* x,
+                               lapack_int ldx, lapack_int* k);
+lapack_int LAPACKE_clapmr_work(int matrix_order, lapack_logical forwrd, lapack_int m, lapack_int n,
+                               lapack_complex_float* x, lapack_int ldx, lapack_int* k);
+lapack_int LAPACKE_zlapmr_work(int matrix_order, lapack_logical forwrd, lapack_int m, lapack_int n,
+                               lapack_complex_double* x, lapack_int ldx, lapack_int* k);
+
+lapack_int LAPACKE_slartgp_work(float f, float g, float* cs, float* sn, float* r);
+lapack_int LAPACKE_dlartgp_work(double f, double g, double* cs, double* sn, double* r);
+
+lapack_int LAPACKE_slartgs_work(float x, float y, float sigma, float* cs, float* sn);
+lapack_int LAPACKE_dlartgs_work(double x, double y, double sigma, double* cs, double* sn);
+
+float LAPACKE_slapy2_work(float x, float y);
+double LAPACKE_dlapy2_work(double x, double y);
+
+float LAPACKE_slapy3_work(float x, float y, float z);
+double LAPACKE_dlapy3_work(double x, double y, double z);
+
+float LAPACKE_slamch_work(char cmach);
+double LAPACKE_dlamch_work(char cmach);
+
+float LAPACKE_slange_work(int matrix_order, char norm, lapack_int m, lapack_int n, const float* a, lapack_int lda,
+                          float* work);
+double LAPACKE_dlange_work(int matrix_order, char norm, lapack_int m, lapack_int n, const double* a, lapack_int lda,
+                           double* work);
+float LAPACKE_clange_work(int matrix_order, char norm, lapack_int m, lapack_int n, const lapack_complex_float* a,
+                          lapack_int lda, float* work);
+double LAPACKE_zlange_work(int matrix_order, char norm, lapack_int m, lapack_int n, const lapack_complex_double* a,
+                           lapack_int lda, double* work);
+
+float LAPACKE_clanhe_work(int matrix_order, char norm, char uplo, lapack_int n, const lapack_complex_float* a,
+                          lapack_int lda, float* work);
+double LAPACKE_zlanhe_work(int matrix_order, char norm, char uplo, lapack_int n, const lapack_complex_double* a,
+                           lapack_int lda, double* work);
+
+float LAPACKE_slansy_work(int matrix_order, char norm, char uplo, lapack_int n, const float* a, lapack_int lda,
+                          float* work);
+double LAPACKE_dlansy_work(int matrix_order, char norm, char uplo, lapack_int n, const double* a, lapack_int lda,
+                           double* work);
+float LAPACKE_clansy_work(int matrix_order, char norm, char uplo, lapack_int n, const lapack_complex_float* a,
+                          lapack_int lda, float* work);
+double LAPACKE_zlansy_work(int matrix_order, char norm, char uplo, lapack_int n, const lapack_complex_double* a,
+                           lapack_int lda, double* work);
+
+float LAPACKE_slantr_work(int matrix_order, char norm, char uplo, char diag, lapack_int m, lapack_int n, const float* a,
+                          lapack_int lda, float* work);
+double LAPACKE_dlantr_work(int matrix_order, char norm, char uplo, char diag, lapack_int m, lapack_int n,
+                           const double* a, lapack_int lda, double* work);
+float LAPACKE_clantr_work(int matrix_order, char norm, char uplo, char diag, lapack_int m, lapack_int n,
+                          const lapack_complex_float* a, lapack_int lda, float* work);
+double LAPACKE_zlantr_work(int matrix_order, char norm, char uplo, char diag, lapack_int m, lapack_int n,
+                           const lapack_complex_double* a, lapack_int lda, double* work);
+
+lapack_int LAPACKE_slarfb_work(int matrix_order, char side, char trans, char direct, char storev, lapack_int m,
+                               lapack_int n, lapack_int k, const float* v, lapack_int ldv, const float* t,
+                               lapack_int ldt, float* c, lapack_int ldc, float* work, lapack_int ldwork);
+lapack_int LAPACKE_dlarfb_work(int matrix_order, char side, char trans, char direct, char storev, lapack_int m,
+                               lapack_int n, lapack_int k, const double* v, lapack_int ldv, const double* t,
+                               lapack_int ldt, double* c, lapack_int ldc, double* work, lapack_int ldwork);
+lapack_int LAPACKE_clarfb_work(int matrix_order, char side, char trans, char direct, char storev, lapack_int m,
+                               lapack_int n, lapack_int k, const lapack_complex_float* v, lapack_int ldv,
+                               const lapack_complex_float* t, lapack_int ldt, lapack_complex_float* c, lapack_int ldc,
+                               lapack_complex_float* work, lapack_int ldwork);
+lapack_int LAPACKE_zlarfb_work(int matrix_order, char side, char trans, char direct, char storev, lapack_int m,
+                               lapack_int n, lapack_int k, const lapack_complex_double* v, lapack_int ldv,
+                               const lapack_complex_double* t, lapack_int ldt, lapack_complex_double* c, lapack_int ldc,
+                               lapack_complex_double* work, lapack_int ldwork);
+
+lapack_int LAPACKE_slarfg_work(lapack_int n, float* alpha, float* x, lapack_int incx, float* tau);
+lapack_int LAPACKE_dlarfg_work(lapack_int n, double* alpha, double* x, lapack_int incx, double* tau);
+lapack_int LAPACKE_clarfg_work(lapack_int n, lapack_complex_float* alpha, lapack_complex_float* x, lapack_int incx,
+                               lapack_complex_float* tau);
+lapack_int LAPACKE_zlarfg_work(lapack_int n, lapack_complex_double* alpha, lapack_complex_double* x, lapack_int incx,
+                               lapack_complex_double* tau);
+
+lapack_int LAPACKE_slarft_work(int matrix_order, char direct, char storev, lapack_int n, lapack_int k, const float* v,
+                               lapack_int ldv, const float* tau, float* t, lapack_int ldt);
+lapack_int LAPACKE_dlarft_work(int matrix_order, char direct, char storev, lapack_int n, lapack_int k, const double* v,
+                               lapack_int ldv, const double* tau, double* t, lapack_int ldt);
+lapack_int LAPACKE_clarft_work(int matrix_order, char direct, char storev, lapack_int n, lapack_int k,
+                               const lapack_complex_float* v, lapack_int ldv, const lapack_complex_float* tau,
+                               lapack_complex_float* t, lapack_int ldt);
+lapack_int LAPACKE_zlarft_work(int matrix_order, char direct, char storev, lapack_int n, lapack_int k,
+                               const lapack_complex_double* v, lapack_int ldv, const lapack_complex_double* tau,
+                               lapack_complex_double* t, lapack_int ldt);
+
+lapack_int LAPACKE_slarfx_work(int matrix_order, char side, lapack_int m, lapack_int n, const float* v, float tau,
+                               float* c, lapack_int ldc, float* work);
+lapack_int LAPACKE_dlarfx_work(int matrix_order, char side, lapack_int m, lapack_int n, const double* v, double tau,
+                               double* c, lapack_int ldc, double* work);
+lapack_int LAPACKE_clarfx_work(int matrix_order, char side, lapack_int m, lapack_int n, const lapack_complex_float* v,
+                               lapack_complex_float tau, lapack_complex_float* c, lapack_int ldc,
+                               lapack_complex_float* work);
+lapack_int LAPACKE_zlarfx_work(int matrix_order, char side, lapack_int m, lapack_int n, const lapack_complex_double* v,
+                               lapack_complex_double tau, lapack_complex_double* c, lapack_int ldc,
+                               lapack_complex_double* work);
+
+lapack_int LAPACKE_slarnv_work(lapack_int idist, lapack_int* iseed, lapack_int n, float* x);
+lapack_int LAPACKE_dlarnv_work(lapack_int idist, lapack_int* iseed, lapack_int n, double* x);
+lapack_int LAPACKE_clarnv_work(lapack_int idist, lapack_int* iseed, lapack_int n, lapack_complex_float* x);
+lapack_int LAPACKE_zlarnv_work(lapack_int idist, lapack_int* iseed, lapack_int n, lapack_complex_double* x);
+
+lapack_int LAPACKE_slaset_work(int matrix_order, char uplo, lapack_int m, lapack_int n, float alpha, float beta,
+                               float* a, lapack_int lda);
+lapack_int LAPACKE_dlaset_work(int matrix_order, char uplo, lapack_int m, lapack_int n, double alpha, double beta,
+                               double* a, lapack_int lda);
+lapack_int LAPACKE_claset_work(int matrix_order, char uplo, lapack_int m, lapack_int n, lapack_complex_float alpha,
+                               lapack_complex_float beta, lapack_complex_float* a, lapack_int lda);
+lapack_int LAPACKE_zlaset_work(int matrix_order, char uplo, lapack_int m, lapack_int n, lapack_complex_double alpha,
+                               lapack_complex_double beta, lapack_complex_double* a, lapack_int lda);
+
+lapack_int LAPACKE_slasrt_work(char id, lapack_int n, float* d);
+lapack_int LAPACKE_dlasrt_work(char id, lapack_int n, double* d);
+
+lapack_int LAPACKE_slaswp_work(int matrix_order, lapack_int n, float* a, lapack_int lda, lapack_int k1, lapack_int k2,
+                               const lapack_int* ipiv, lapack_int incx);
+lapack_int LAPACKE_dlaswp_work(int matrix_order, lapack_int n, double* a, lapack_int lda, lapack_int k1, lapack_int k2,
+                               const lapack_int* ipiv, lapack_int incx);
+lapack_int LAPACKE_claswp_work(int matrix_order, lapack_int n, lapack_complex_float* a, lapack_int lda, lapack_int k1,
+                               lapack_int k2, const lapack_int* ipiv, lapack_int incx);
+lapack_int LAPACKE_zlaswp_work(int matrix_order, lapack_int n, lapack_complex_double* a, lapack_int lda, lapack_int k1,
+                               lapack_int k2, const lapack_int* ipiv, lapack_int incx);
+
+lapack_int LAPACKE_slatms_work(int matrix_order, lapack_int m, lapack_int n, char dist, lapack_int* iseed, char sym,
+                               float* d, lapack_int mode, float cond, float dmax, lapack_int kl, lapack_int ku,
+                               char pack, float* a, lapack_int lda, float* work);
+lapack_int LAPACKE_dlatms_work(int matrix_order, lapack_int m, lapack_int n, char dist, lapack_int* iseed, char sym,
+                               double* d, lapack_int mode, double cond, double dmax, lapack_int kl, lapack_int ku,
+                               char pack, double* a, lapack_int lda, double* work);
+lapack_int LAPACKE_clatms_work(int matrix_order, lapack_int m, lapack_int n, char dist, lapack_int* iseed, char sym,
+                               float* d, lapack_int mode, float cond, float dmax, lapack_int kl, lapack_int ku,
+                               char pack, lapack_complex_float* a, lapack_int lda, lapack_complex_float* work);
+lapack_int LAPACKE_zlatms_work(int matrix_order, lapack_int m, lapack_int n, char dist, lapack_int* iseed, char sym,
+                               double* d, lapack_int mode, double cond, double dmax, lapack_int kl, lapack_int ku,
+                               char pack, lapack_complex_double* a, lapack_int lda, lapack_complex_double* work);
+
+lapack_int LAPACKE_slauum_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda);
+lapack_int LAPACKE_dlauum_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda);
+lapack_int LAPACKE_clauum_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda);
+lapack_int LAPACKE_zlauum_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda);
+
+lapack_int LAPACKE_sopgtr_work(int matrix_order, char uplo, lapack_int n, const float* ap, const float* tau, float* q,
+                               lapack_int ldq, float* work);
+lapack_int LAPACKE_dopgtr_work(int matrix_order, char uplo, lapack_int n, const double* ap, const double* tau,
+                               double* q, lapack_int ldq, double* work);
+
+lapack_int LAPACKE_sopmtr_work(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
+                               const float* ap, const float* tau, float* c, lapack_int ldc, float* work);
+lapack_int LAPACKE_dopmtr_work(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
+                               const double* ap, const double* tau, double* c, lapack_int ldc, double* work);
+
+lapack_int LAPACKE_sorgbr_work(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int k, float* a,
+                               lapack_int lda, const float* tau, float* work, lapack_int lwork);
+lapack_int LAPACKE_dorgbr_work(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int k, double* a,
+                               lapack_int lda, const double* tau, double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sorghr_work(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, float* a, lapack_int lda,
+                               const float* tau, float* work, lapack_int lwork);
+lapack_int LAPACKE_dorghr_work(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, double* a,
+                               lapack_int lda, const double* tau, double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sorglq_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, float* a, lapack_int lda,
+                               const float* tau, float* work, lapack_int lwork);
+lapack_int LAPACKE_dorglq_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, double* a, lapack_int lda,
+                               const double* tau, double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sorgql_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, float* a, lapack_int lda,
+                               const float* tau, float* work, lapack_int lwork);
+lapack_int LAPACKE_dorgql_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, double* a, lapack_int lda,
+                               const double* tau, double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sorgqr_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, float* a, lapack_int lda,
+                               const float* tau, float* work, lapack_int lwork);
+lapack_int LAPACKE_dorgqr_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, double* a, lapack_int lda,
+                               const double* tau, double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sorgrq_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, float* a, lapack_int lda,
+                               const float* tau, float* work, lapack_int lwork);
+lapack_int LAPACKE_dorgrq_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, double* a, lapack_int lda,
+                               const double* tau, double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sorgtr_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, const float* tau,
+                               float* work, lapack_int lwork);
+lapack_int LAPACKE_dorgtr_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda, const double* tau,
+                               double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sormbr_work(int matrix_order, char vect, char side, char trans, lapack_int m, lapack_int n,
+                               lapack_int k, const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc,
+                               float* work, lapack_int lwork);
+lapack_int LAPACKE_dormbr_work(int matrix_order, char vect, char side, char trans, lapack_int m, lapack_int n,
+                               lapack_int k, const double* a, lapack_int lda, const double* tau, double* c,
+                               lapack_int ldc, double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sormhr_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int ilo,
+                               lapack_int ihi, const float* a, lapack_int lda, const float* tau, float* c,
+                               lapack_int ldc, float* work, lapack_int lwork);
+lapack_int LAPACKE_dormhr_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int ilo,
+                               lapack_int ihi, const double* a, lapack_int lda, const double* tau, double* c,
+                               lapack_int ldc, double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sormlq_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc, float* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_dormlq_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc,
+                               double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sormql_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc, float* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_dormql_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc,
+                               double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sormqr_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc, float* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_dormqr_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc,
+                               double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sormrq_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc, float* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_dormrq_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc,
+                               double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sormrz_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               lapack_int l, const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc,
+                               float* work, lapack_int lwork);
+lapack_int LAPACKE_dormrz_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               lapack_int l, const double* a, lapack_int lda, const double* tau, double* c,
+                               lapack_int ldc, double* work, lapack_int lwork);
+
+lapack_int LAPACKE_sormtr_work(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
+                               const float* a, lapack_int lda, const float* tau, float* c, lapack_int ldc, float* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_dormtr_work(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
+                               const double* a, lapack_int lda, const double* tau, double* c, lapack_int ldc,
+                               double* work, lapack_int lwork);
+
+lapack_int LAPACKE_spbcon_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, const float* ab,
+                               lapack_int ldab, float anorm, float* rcond, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dpbcon_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, const double* ab,
+                               lapack_int ldab, double anorm, double* rcond, double* work, lapack_int* iwork);
+lapack_int LAPACKE_cpbcon_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, const lapack_complex_float* ab,
+                               lapack_int ldab, float anorm, float* rcond, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zpbcon_work(int matrix_order, char uplo, lapack_int n, lapack_int kd,
+                               const lapack_complex_double* ab, lapack_int ldab, double anorm, double* rcond,
+                               lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_spbequ_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, const float* ab,
+                               lapack_int ldab, float* s, float* scond, float* amax);
+lapack_int LAPACKE_dpbequ_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, const double* ab,
+                               lapack_int ldab, double* s, double* scond, double* amax);
+lapack_int LAPACKE_cpbequ_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, const lapack_complex_float* ab,
+                               lapack_int ldab, float* s, float* scond, float* amax);
+lapack_int LAPACKE_zpbequ_work(int matrix_order, char uplo, lapack_int n, lapack_int kd,
+                               const lapack_complex_double* ab, lapack_int ldab, double* s, double* scond,
+                               double* amax);
+
+lapack_int LAPACKE_spbrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                               const float* ab, lapack_int ldab, const float* afb, lapack_int ldafb, const float* b,
+                               lapack_int ldb, float* x, lapack_int ldx, float* ferr, float* berr, float* work,
+                               lapack_int* iwork);
+lapack_int LAPACKE_dpbrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                               const double* ab, lapack_int ldab, const double* afb, lapack_int ldafb, const double* b,
+                               lapack_int ldb, double* x, lapack_int ldx, double* ferr, double* berr, double* work,
+                               lapack_int* iwork);
+lapack_int LAPACKE_cpbrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                               const lapack_complex_float* ab, lapack_int ldab, const lapack_complex_float* afb,
+                               lapack_int ldafb, const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
+                               lapack_int ldx, float* ferr, float* berr, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zpbrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                               const lapack_complex_double* ab, lapack_int ldab, const lapack_complex_double* afb,
+                               lapack_int ldafb, const lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr,
+                               lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_spbstf_work(int matrix_order, char uplo, lapack_int n, lapack_int kb, float* bb, lapack_int ldbb);
+lapack_int LAPACKE_dpbstf_work(int matrix_order, char uplo, lapack_int n, lapack_int kb, double* bb, lapack_int ldbb);
+lapack_int LAPACKE_cpbstf_work(int matrix_order, char uplo, lapack_int n, lapack_int kb, lapack_complex_float* bb,
+                               lapack_int ldbb);
+lapack_int LAPACKE_zpbstf_work(int matrix_order, char uplo, lapack_int n, lapack_int kb, lapack_complex_double* bb,
+                               lapack_int ldbb);
+
+lapack_int LAPACKE_spbsv_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs, float* ab,
+                              lapack_int ldab, float* b, lapack_int ldb);
+lapack_int LAPACKE_dpbsv_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs, double* ab,
+                              lapack_int ldab, double* b, lapack_int ldb);
+lapack_int LAPACKE_cpbsv_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                              lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zpbsv_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                              lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_spbsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                               float* ab, lapack_int ldab, float* afb, lapack_int ldafb, char* equed, float* s,
+                               float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* ferr,
+                               float* berr, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dpbsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                               double* ab, lapack_int ldab, double* afb, lapack_int ldafb, char* equed, double* s,
+                               double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* ferr,
+                               double* berr, double* work, lapack_int* iwork);
+lapack_int LAPACKE_cpbsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                               lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* afb, lapack_int ldafb,
+                               char* equed, float* s, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
+                               lapack_int ldx, float* rcond, float* ferr, float* berr, lapack_complex_float* work,
+                               float* rwork);
+lapack_int LAPACKE_zpbsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                               lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* afb, lapack_int ldafb,
+                               char* equed, double* s, lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* x, lapack_int ldx, double* rcond, double* ferr, double* berr,
+                               lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_spbtrf_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, float* ab, lapack_int ldab);
+lapack_int LAPACKE_dpbtrf_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, double* ab, lapack_int ldab);
+lapack_int LAPACKE_cpbtrf_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_complex_float* ab,
+                               lapack_int ldab);
+lapack_int LAPACKE_zpbtrf_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_complex_double* ab,
+                               lapack_int ldab);
+
+lapack_int LAPACKE_spbtrs_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                               const float* ab, lapack_int ldab, float* b, lapack_int ldb);
+lapack_int LAPACKE_dpbtrs_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                               const double* ab, lapack_int ldab, double* b, lapack_int ldb);
+lapack_int LAPACKE_cpbtrs_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                               const lapack_complex_float* ab, lapack_int ldab, lapack_complex_float* b,
+                               lapack_int ldb);
+lapack_int LAPACKE_zpbtrs_work(int matrix_order, char uplo, lapack_int n, lapack_int kd, lapack_int nrhs,
+                               const lapack_complex_double* ab, lapack_int ldab, lapack_complex_double* b,
+                               lapack_int ldb);
+
+lapack_int LAPACKE_spftrf_work(int matrix_order, char transr, char uplo, lapack_int n, float* a);
+lapack_int LAPACKE_dpftrf_work(int matrix_order, char transr, char uplo, lapack_int n, double* a);
+lapack_int LAPACKE_cpftrf_work(int matrix_order, char transr, char uplo, lapack_int n, lapack_complex_float* a);
+lapack_int LAPACKE_zpftrf_work(int matrix_order, char transr, char uplo, lapack_int n, lapack_complex_double* a);
+
+lapack_int LAPACKE_spftri_work(int matrix_order, char transr, char uplo, lapack_int n, float* a);
+lapack_int LAPACKE_dpftri_work(int matrix_order, char transr, char uplo, lapack_int n, double* a);
+lapack_int LAPACKE_cpftri_work(int matrix_order, char transr, char uplo, lapack_int n, lapack_complex_float* a);
+lapack_int LAPACKE_zpftri_work(int matrix_order, char transr, char uplo, lapack_int n, lapack_complex_double* a);
+
+lapack_int LAPACKE_spftrs_work(int matrix_order, char transr, char uplo, lapack_int n, lapack_int nrhs, const float* a,
+                               float* b, lapack_int ldb);
+lapack_int LAPACKE_dpftrs_work(int matrix_order, char transr, char uplo, lapack_int n, lapack_int nrhs, const double* a,
+                               double* b, lapack_int ldb);
+lapack_int LAPACKE_cpftrs_work(int matrix_order, char transr, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* a, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zpftrs_work(int matrix_order, char transr, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* a, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_spocon_work(int matrix_order, char uplo, lapack_int n, const float* a, lapack_int lda, float anorm,
+                               float* rcond, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dpocon_work(int matrix_order, char uplo, lapack_int n, const double* a, lapack_int lda, double anorm,
+                               double* rcond, double* work, lapack_int* iwork);
+lapack_int LAPACKE_cpocon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
+                               float anorm, float* rcond, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zpocon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a,
+                               lapack_int lda, double anorm, double* rcond, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_spoequ_work(int matrix_order, lapack_int n, const float* a, lapack_int lda, float* s, float* scond,
+                               float* amax);
+lapack_int LAPACKE_dpoequ_work(int matrix_order, lapack_int n, const double* a, lapack_int lda, double* s,
+                               double* scond, double* amax);
+lapack_int LAPACKE_cpoequ_work(int matrix_order, lapack_int n, const lapack_complex_float* a, lapack_int lda, float* s,
+                               float* scond, float* amax);
+lapack_int LAPACKE_zpoequ_work(int matrix_order, lapack_int n, const lapack_complex_double* a, lapack_int lda,
+                               double* s, double* scond, double* amax);
+
+lapack_int LAPACKE_spoequb_work(int matrix_order, lapack_int n, const float* a, lapack_int lda, float* s, float* scond,
+                                float* amax);
+lapack_int LAPACKE_dpoequb_work(int matrix_order, lapack_int n, const double* a, lapack_int lda, double* s,
+                                double* scond, double* amax);
+lapack_int LAPACKE_cpoequb_work(int matrix_order, lapack_int n, const lapack_complex_float* a, lapack_int lda, float* s,
+                                float* scond, float* amax);
+lapack_int LAPACKE_zpoequb_work(int matrix_order, lapack_int n, const lapack_complex_double* a, lapack_int lda,
+                                double* s, double* scond, double* amax);
+
+lapack_int LAPACKE_sporfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a,
+                               lapack_int lda, const float* af, lapack_int ldaf, const float* b, lapack_int ldb,
+                               float* x, lapack_int ldx, float* ferr, float* berr, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dporfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a,
+                               lapack_int lda, const double* af, lapack_int ldaf, const double* b, lapack_int ldb,
+                               double* x, lapack_int ldx, double* ferr, double* berr, double* work, lapack_int* iwork);
+lapack_int LAPACKE_cporfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
+                               lapack_int ldaf, const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
+                               lapack_int ldx, float* ferr, float* berr, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zporfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
+                               lapack_int ldaf, const lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr,
+                               lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sporfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs, const float* a,
+                                lapack_int lda, const float* af, lapack_int ldaf, const float* s, const float* b,
+                                lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* berr,
+                                lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
+                                float* params, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dporfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs, const double* a,
+                                lapack_int lda, const double* af, lapack_int ldaf, const double* s, const double* b,
+                                lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* berr,
+                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
+                                double* params, double* work, lapack_int* iwork);
+lapack_int LAPACKE_cporfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
+                                lapack_int ldaf, const float* s, const lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx, float* rcond, float* berr,
+                                lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
+                                float* params, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zporfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
+                                lapack_int ldaf, const double* s, const lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx, double* rcond, double* berr,
+                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
+                                double* params, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sposv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
+                              float* b, lapack_int ldb);
+lapack_int LAPACKE_dposv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
+                              double* b, lapack_int ldb);
+lapack_int LAPACKE_cposv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
+                              lapack_int lda, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zposv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
+                              lapack_int lda, lapack_complex_double* b, lapack_int ldb);
+lapack_int LAPACKE_dsposv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
+                               double* b, lapack_int ldb, double* x, lapack_int ldx, double* work, float* swork,
+                               lapack_int* iter);
+lapack_int LAPACKE_zcposv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
+                               lapack_int ldx, lapack_complex_double* work, lapack_complex_float* swork, double* rwork,
+                               lapack_int* iter);
+
+lapack_int LAPACKE_sposvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, float* a,
+                               lapack_int lda, float* af, lapack_int ldaf, char* equed, float* s, float* b,
+                               lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* ferr, float* berr,
+                               float* work, lapack_int* iwork);
+lapack_int LAPACKE_dposvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, double* a,
+                               lapack_int lda, double* af, lapack_int ldaf, char* equed, double* s, double* b,
+                               lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* ferr, double* berr,
+                               double* work, lapack_int* iwork);
+lapack_int LAPACKE_cposvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                               lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
+                               char* equed, float* s, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
+                               lapack_int ldx, float* rcond, float* ferr, float* berr, lapack_complex_float* work,
+                               float* rwork);
+lapack_int LAPACKE_zposvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                               lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
+                               char* equed, double* s, lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* x, lapack_int ldx, double* rcond, double* ferr, double* berr,
+                               lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sposvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, float* a,
+                                lapack_int lda, float* af, lapack_int ldaf, char* equed, float* s, float* b,
+                                lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* rpvgrw, float* berr,
+                                lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
+                                float* params, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dposvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, double* a,
+                                lapack_int lda, double* af, lapack_int ldaf, char* equed, double* s, double* b,
+                                lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* rpvgrw, double* berr,
+                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
+                                double* params, double* work, lapack_int* iwork);
+lapack_int LAPACKE_cposvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                                lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
+                                char* equed, float* s, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
+                                lapack_int ldx, float* rcond, float* rpvgrw, float* berr, lapack_int n_err_bnds,
+                                float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams, float* params,
+                                lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zposvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                                lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
+                                char* equed, double* s, lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx, double* rcond, double* rpvgrw, double* berr,
+                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
+                                double* params, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_spotrf_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda);
+lapack_int LAPACKE_dpotrf_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda);
+lapack_int LAPACKE_cpotrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda);
+lapack_int LAPACKE_zpotrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda);
+
+lapack_int LAPACKE_spotri_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda);
+lapack_int LAPACKE_dpotri_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda);
+lapack_int LAPACKE_cpotri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda);
+lapack_int LAPACKE_zpotri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda);
+
+lapack_int LAPACKE_spotrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a,
+                               lapack_int lda, float* b, lapack_int ldb);
+lapack_int LAPACKE_dpotrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a,
+                               lapack_int lda, double* b, lapack_int ldb);
+lapack_int LAPACKE_cpotrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zpotrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* a, lapack_int lda, lapack_complex_double* b,
+                               lapack_int ldb);
+
+lapack_int LAPACKE_sppcon_work(int matrix_order, char uplo, lapack_int n, const float* ap, float anorm, float* rcond,
+                               float* work, lapack_int* iwork);
+lapack_int LAPACKE_dppcon_work(int matrix_order, char uplo, lapack_int n, const double* ap, double anorm, double* rcond,
+                               double* work, lapack_int* iwork);
+lapack_int LAPACKE_cppcon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap, float anorm,
+                               float* rcond, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zppcon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap, double anorm,
+                               double* rcond, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sppequ_work(int matrix_order, char uplo, lapack_int n, const float* ap, float* s, float* scond,
+                               float* amax);
+lapack_int LAPACKE_dppequ_work(int matrix_order, char uplo, lapack_int n, const double* ap, double* s, double* scond,
+                               double* amax);
+lapack_int LAPACKE_cppequ_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap, float* s,
+                               float* scond, float* amax);
+lapack_int LAPACKE_zppequ_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap, double* s,
+                               double* scond, double* amax);
+
+lapack_int LAPACKE_spprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* ap,
+                               const float* afp, const float* b, lapack_int ldb, float* x, lapack_int ldx, float* ferr,
+                               float* berr, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dpprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* ap,
+                               const double* afp, const double* b, lapack_int ldb, double* x, lapack_int ldx,
+                               double* ferr, double* berr, double* work, lapack_int* iwork);
+lapack_int LAPACKE_cpprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* ap, const lapack_complex_float* afp,
+                               const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                               float* ferr, float* berr, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zpprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* ap, const lapack_complex_double* afp,
+                               const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                               double* ferr, double* berr, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sppsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, float* ap, float* b,
+                              lapack_int ldb);
+lapack_int LAPACKE_dppsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* ap, double* b,
+                              lapack_int ldb);
+lapack_int LAPACKE_cppsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* ap,
+                              lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zppsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* ap,
+                              lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_sppsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, float* ap,
+                               float* afp, char* equed, float* s, float* b, lapack_int ldb, float* x, lapack_int ldx,
+                               float* rcond, float* ferr, float* berr, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dppsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, double* ap,
+                               double* afp, char* equed, double* s, double* b, lapack_int ldb, double* x,
+                               lapack_int ldx, double* rcond, double* ferr, double* berr, double* work,
+                               lapack_int* iwork);
+lapack_int LAPACKE_cppsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                               lapack_complex_float* ap, lapack_complex_float* afp, char* equed, float* s,
+                               lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                               float* rcond, float* ferr, float* berr, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zppsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                               lapack_complex_double* ap, lapack_complex_double* afp, char* equed, double* s,
+                               lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                               double* rcond, double* ferr, double* berr, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_spptrf_work(int matrix_order, char uplo, lapack_int n, float* ap);
+lapack_int LAPACKE_dpptrf_work(int matrix_order, char uplo, lapack_int n, double* ap);
+lapack_int LAPACKE_cpptrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap);
+lapack_int LAPACKE_zpptrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap);
+
+lapack_int LAPACKE_spptri_work(int matrix_order, char uplo, lapack_int n, float* ap);
+lapack_int LAPACKE_dpptri_work(int matrix_order, char uplo, lapack_int n, double* ap);
+lapack_int LAPACKE_cpptri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap);
+lapack_int LAPACKE_zpptri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap);
+
+lapack_int LAPACKE_spptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* ap, float* b,
+                               lapack_int ldb);
+lapack_int LAPACKE_dpptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* ap, double* b,
+                               lapack_int ldb);
+lapack_int LAPACKE_cpptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* ap, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zpptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* ap, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_spstrf_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, lapack_int* piv,
+                               lapack_int* rank, float tol, float* work);
+lapack_int LAPACKE_dpstrf_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda, lapack_int* piv,
+                               lapack_int* rank, double tol, double* work);
+lapack_int LAPACKE_cpstrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               lapack_int* piv, lapack_int* rank, float tol, float* work);
+lapack_int LAPACKE_zpstrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               lapack_int* piv, lapack_int* rank, double tol, double* work);
+
+lapack_int LAPACKE_sptcon_work(lapack_int n, const float* d, const float* e, float anorm, float* rcond, float* work);
+lapack_int LAPACKE_dptcon_work(lapack_int n, const double* d, const double* e, double anorm, double* rcond,
+                               double* work);
+lapack_int LAPACKE_cptcon_work(lapack_int n, const float* d, const lapack_complex_float* e, float anorm, float* rcond,
+                               float* work);
+lapack_int LAPACKE_zptcon_work(lapack_int n, const double* d, const lapack_complex_double* e, double anorm,
+                               double* rcond, double* work);
+
+lapack_int LAPACKE_spteqr_work(int matrix_order, char compz, lapack_int n, float* d, float* e, float* z, lapack_int ldz,
+                               float* work);
+lapack_int LAPACKE_dpteqr_work(int matrix_order, char compz, lapack_int n, double* d, double* e, double* z,
+                               lapack_int ldz, double* work);
+lapack_int LAPACKE_cpteqr_work(int matrix_order, char compz, lapack_int n, float* d, float* e, lapack_complex_float* z,
+                               lapack_int ldz, float* work);
+lapack_int LAPACKE_zpteqr_work(int matrix_order, char compz, lapack_int n, double* d, double* e,
+                               lapack_complex_double* z, lapack_int ldz, double* work);
+
+lapack_int LAPACKE_sptrfs_work(int matrix_order, lapack_int n, lapack_int nrhs, const float* d, const float* e,
+                               const float* df, const float* ef, const float* b, lapack_int ldb, float* x,
+                               lapack_int ldx, float* ferr, float* berr, float* work);
+lapack_int LAPACKE_dptrfs_work(int matrix_order, lapack_int n, lapack_int nrhs, const double* d, const double* e,
+                               const double* df, const double* ef, const double* b, lapack_int ldb, double* x,
+                               lapack_int ldx, double* ferr, double* berr, double* work);
+lapack_int LAPACKE_cptrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* d,
+                               const lapack_complex_float* e, const float* df, const lapack_complex_float* ef,
+                               const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                               float* ferr, float* berr, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zptrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* d,
+                               const lapack_complex_double* e, const double* df, const lapack_complex_double* ef,
+                               const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                               double* ferr, double* berr, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sptsv_work(int matrix_order, lapack_int n, lapack_int nrhs, float* d, float* e, float* b,
+                              lapack_int ldb);
+lapack_int LAPACKE_dptsv_work(int matrix_order, lapack_int n, lapack_int nrhs, double* d, double* e, double* b,
+                              lapack_int ldb);
+lapack_int LAPACKE_cptsv_work(int matrix_order, lapack_int n, lapack_int nrhs, float* d, lapack_complex_float* e,
+                              lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zptsv_work(int matrix_order, lapack_int n, lapack_int nrhs, double* d, lapack_complex_double* e,
+                              lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_sptsvx_work(int matrix_order, char fact, lapack_int n, lapack_int nrhs, const float* d,
+                               const float* e, float* df, float* ef, const float* b, lapack_int ldb, float* x,
+                               lapack_int ldx, float* rcond, float* ferr, float* berr, float* work);
+lapack_int LAPACKE_dptsvx_work(int matrix_order, char fact, lapack_int n, lapack_int nrhs, const double* d,
+                               const double* e, double* df, double* ef, const double* b, lapack_int ldb, double* x,
+                               lapack_int ldx, double* rcond, double* ferr, double* berr, double* work);
+lapack_int LAPACKE_cptsvx_work(int matrix_order, char fact, lapack_int n, lapack_int nrhs, const float* d,
+                               const lapack_complex_float* e, float* df, lapack_complex_float* ef,
+                               const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                               float* rcond, float* ferr, float* berr, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zptsvx_work(int matrix_order, char fact, lapack_int n, lapack_int nrhs, const double* d,
+                               const lapack_complex_double* e, double* df, lapack_complex_double* ef,
+                               const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                               double* rcond, double* ferr, double* berr, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_spttrf_work(lapack_int n, float* d, float* e);
+lapack_int LAPACKE_dpttrf_work(lapack_int n, double* d, double* e);
+lapack_int LAPACKE_cpttrf_work(lapack_int n, float* d, lapack_complex_float* e);
+lapack_int LAPACKE_zpttrf_work(lapack_int n, double* d, lapack_complex_double* e);
+
+lapack_int LAPACKE_spttrs_work(int matrix_order, lapack_int n, lapack_int nrhs, const float* d, const float* e,
+                               float* b, lapack_int ldb);
+lapack_int LAPACKE_dpttrs_work(int matrix_order, lapack_int n, lapack_int nrhs, const double* d, const double* e,
+                               double* b, lapack_int ldb);
+lapack_int LAPACKE_cpttrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* d,
+                               const lapack_complex_float* e, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zpttrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* d,
+                               const lapack_complex_double* e, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_ssbev_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, float* ab,
+                              lapack_int ldab, float* w, float* z, lapack_int ldz, float* work);
+lapack_int LAPACKE_dsbev_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, double* ab,
+                              lapack_int ldab, double* w, double* z, lapack_int ldz, double* work);
+
+lapack_int LAPACKE_ssbevd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, float* ab,
+                               lapack_int ldab, float* w, float* z, lapack_int ldz, float* work, lapack_int lwork,
+                               lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_dsbevd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int kd, double* ab,
+                               lapack_int ldab, double* w, double* z, lapack_int ldz, double* work, lapack_int lwork,
+                               lapack_int* iwork, lapack_int liwork);
+
+lapack_int LAPACKE_ssbevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int kd,
+                               float* ab, lapack_int ldab, float* q, lapack_int ldq, float vl, float vu, lapack_int il,
+                               lapack_int iu, float abstol, lapack_int* m, float* w, float* z, lapack_int ldz,
+                               float* work, lapack_int* iwork, lapack_int* ifail);
+lapack_int LAPACKE_dsbevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int kd,
+                               double* ab, lapack_int ldab, double* q, lapack_int ldq, double vl, double vu,
+                               lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w, double* z,
+                               lapack_int ldz, double* work, lapack_int* iwork, lapack_int* ifail);
+
+lapack_int LAPACKE_ssbgst_work(int matrix_order, char vect, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                               float* ab, lapack_int ldab, const float* bb, lapack_int ldbb, float* x, lapack_int ldx,
+                               float* work);
+lapack_int LAPACKE_dsbgst_work(int matrix_order, char vect, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                               double* ab, lapack_int ldab, const double* bb, lapack_int ldbb, double* x,
+                               lapack_int ldx, double* work);
+
+lapack_int LAPACKE_ssbgv_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                              float* ab, lapack_int ldab, float* bb, lapack_int ldbb, float* w, float* z,
+                              lapack_int ldz, float* work);
+lapack_int LAPACKE_dsbgv_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                              double* ab, lapack_int ldab, double* bb, lapack_int ldbb, double* w, double* z,
+                              lapack_int ldz, double* work);
+
+lapack_int LAPACKE_ssbgvd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                               float* ab, lapack_int ldab, float* bb, lapack_int ldbb, float* w, float* z,
+                               lapack_int ldz, float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_dsbgvd_work(int matrix_order, char jobz, char uplo, lapack_int n, lapack_int ka, lapack_int kb,
+                               double* ab, lapack_int ldab, double* bb, lapack_int ldbb, double* w, double* z,
+                               lapack_int ldz, double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+
+lapack_int LAPACKE_ssbgvx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int ka,
+                               lapack_int kb, float* ab, lapack_int ldab, float* bb, lapack_int ldbb, float* q,
+                               lapack_int ldq, float vl, float vu, lapack_int il, lapack_int iu, float abstol,
+                               lapack_int* m, float* w, float* z, lapack_int ldz, float* work, lapack_int* iwork,
+                               lapack_int* ifail);
+lapack_int LAPACKE_dsbgvx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, lapack_int ka,
+                               lapack_int kb, double* ab, lapack_int ldab, double* bb, lapack_int ldbb, double* q,
+                               lapack_int ldq, double vl, double vu, lapack_int il, lapack_int iu, double abstol,
+                               lapack_int* m, double* w, double* z, lapack_int ldz, double* work, lapack_int* iwork,
+                               lapack_int* ifail);
+
+lapack_int LAPACKE_ssbtrd_work(int matrix_order, char vect, char uplo, lapack_int n, lapack_int kd, float* ab,
+                               lapack_int ldab, float* d, float* e, float* q, lapack_int ldq, float* work);
+lapack_int LAPACKE_dsbtrd_work(int matrix_order, char vect, char uplo, lapack_int n, lapack_int kd, double* ab,
+                               lapack_int ldab, double* d, double* e, double* q, lapack_int ldq, double* work);
+
+lapack_int LAPACKE_ssfrk_work(int matrix_order, char transr, char uplo, char trans, lapack_int n, lapack_int k,
+                              float alpha, const float* a, lapack_int lda, float beta, float* c);
+lapack_int LAPACKE_dsfrk_work(int matrix_order, char transr, char uplo, char trans, lapack_int n, lapack_int k,
+                              double alpha, const double* a, lapack_int lda, double beta, double* c);
+
+lapack_int LAPACKE_sspcon_work(int matrix_order, char uplo, lapack_int n, const float* ap, const lapack_int* ipiv,
+                               float anorm, float* rcond, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dspcon_work(int matrix_order, char uplo, lapack_int n, const double* ap, const lapack_int* ipiv,
+                               double anorm, double* rcond, double* work, lapack_int* iwork);
+lapack_int LAPACKE_cspcon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap,
+                               const lapack_int* ipiv, float anorm, float* rcond, lapack_complex_float* work);
+lapack_int LAPACKE_zspcon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap,
+                               const lapack_int* ipiv, double anorm, double* rcond, lapack_complex_double* work);
+
+lapack_int LAPACKE_sspev_work(int matrix_order, char jobz, char uplo, lapack_int n, float* ap, float* w, float* z,
+                              lapack_int ldz, float* work);
+lapack_int LAPACKE_dspev_work(int matrix_order, char jobz, char uplo, lapack_int n, double* ap, double* w, double* z,
+                              lapack_int ldz, double* work);
+
+lapack_int LAPACKE_sspevd_work(int matrix_order, char jobz, char uplo, lapack_int n, float* ap, float* w, float* z,
+                               lapack_int ldz, float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_dspevd_work(int matrix_order, char jobz, char uplo, lapack_int n, double* ap, double* w, double* z,
+                               lapack_int ldz, double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+
+lapack_int LAPACKE_sspevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, float* ap, float vl,
+                               float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w, float* z,
+                               lapack_int ldz, float* work, lapack_int* iwork, lapack_int* ifail);
+lapack_int LAPACKE_dspevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, double* ap, double vl,
+                               double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
+                               double* z, lapack_int ldz, double* work, lapack_int* iwork, lapack_int* ifail);
+
+lapack_int LAPACKE_sspgst_work(int matrix_order, lapack_int itype, char uplo, lapack_int n, float* ap, const float* bp);
+lapack_int LAPACKE_dspgst_work(int matrix_order, lapack_int itype, char uplo, lapack_int n, double* ap,
+                               const double* bp);
+
+lapack_int LAPACKE_sspgv_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, float* ap,
+                              float* bp, float* w, float* z, lapack_int ldz, float* work);
+lapack_int LAPACKE_dspgv_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, double* ap,
+                              double* bp, double* w, double* z, lapack_int ldz, double* work);
+
+lapack_int LAPACKE_sspgvd_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, float* ap,
+                               float* bp, float* w, float* z, lapack_int ldz, float* work, lapack_int lwork,
+                               lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_dspgvd_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, double* ap,
+                               double* bp, double* w, double* z, lapack_int ldz, double* work, lapack_int lwork,
+                               lapack_int* iwork, lapack_int liwork);
+
+lapack_int LAPACKE_sspgvx_work(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
+                               float* ap, float* bp, float vl, float vu, lapack_int il, lapack_int iu, float abstol,
+                               lapack_int* m, float* w, float* z, lapack_int ldz, float* work, lapack_int* iwork,
+                               lapack_int* ifail);
+lapack_int LAPACKE_dspgvx_work(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
+                               double* ap, double* bp, double vl, double vu, lapack_int il, lapack_int iu,
+                               double abstol, lapack_int* m, double* w, double* z, lapack_int ldz, double* work,
+                               lapack_int* iwork, lapack_int* ifail);
+
+lapack_int LAPACKE_ssprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* ap,
+                               const float* afp, const lapack_int* ipiv, const float* b, lapack_int ldb, float* x,
+                               lapack_int ldx, float* ferr, float* berr, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dsprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* ap,
+                               const double* afp, const lapack_int* ipiv, const double* b, lapack_int ldb, double* x,
+                               lapack_int ldx, double* ferr, double* berr, double* work, lapack_int* iwork);
+lapack_int LAPACKE_csprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* ap, const lapack_complex_float* afp, const lapack_int* ipiv,
+                               const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                               float* ferr, float* berr, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zsprfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* ap, const lapack_complex_double* afp,
+                               const lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr,
+                               lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_sspsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, float* ap, lapack_int* ipiv,
+                              float* b, lapack_int ldb);
+lapack_int LAPACKE_dspsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* ap, lapack_int* ipiv,
+                              double* b, lapack_int ldb);
+lapack_int LAPACKE_cspsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* ap,
+                              lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zspsv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* ap,
+                              lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_sspsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, const float* ap,
+                               float* afp, lapack_int* ipiv, const float* b, lapack_int ldb, float* x, lapack_int ldx,
+                               float* rcond, float* ferr, float* berr, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dspsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, const double* ap,
+                               double* afp, lapack_int* ipiv, const double* b, lapack_int ldb, double* x,
+                               lapack_int ldx, double* rcond, double* ferr, double* berr, double* work,
+                               lapack_int* iwork);
+lapack_int LAPACKE_cspsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* ap, lapack_complex_float* afp, lapack_int* ipiv,
+                               const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x, lapack_int ldx,
+                               float* rcond, float* ferr, float* berr, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zspsvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* ap, lapack_complex_double* afp, lapack_int* ipiv,
+                               const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x, lapack_int ldx,
+                               double* rcond, double* ferr, double* berr, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_ssptrd_work(int matrix_order, char uplo, lapack_int n, float* ap, float* d, float* e, float* tau);
+lapack_int LAPACKE_dsptrd_work(int matrix_order, char uplo, lapack_int n, double* ap, double* d, double* e,
+                               double* tau);
+
+lapack_int LAPACKE_ssptrf_work(int matrix_order, char uplo, lapack_int n, float* ap, lapack_int* ipiv);
+lapack_int LAPACKE_dsptrf_work(int matrix_order, char uplo, lapack_int n, double* ap, lapack_int* ipiv);
+lapack_int LAPACKE_csptrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap, lapack_int* ipiv);
+lapack_int LAPACKE_zsptrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap, lapack_int* ipiv);
+
+lapack_int LAPACKE_ssptri_work(int matrix_order, char uplo, lapack_int n, float* ap, const lapack_int* ipiv,
+                               float* work);
+lapack_int LAPACKE_dsptri_work(int matrix_order, char uplo, lapack_int n, double* ap, const lapack_int* ipiv,
+                               double* work);
+lapack_int LAPACKE_csptri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* ap,
+                               const lapack_int* ipiv, lapack_complex_float* work);
+lapack_int LAPACKE_zsptri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* ap,
+                               const lapack_int* ipiv, lapack_complex_double* work);
+
+lapack_int LAPACKE_ssptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* ap,
+                               const lapack_int* ipiv, float* b, lapack_int ldb);
+lapack_int LAPACKE_dsptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* ap,
+                               const lapack_int* ipiv, double* b, lapack_int ldb);
+lapack_int LAPACKE_csptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* ap, const lapack_int* ipiv, lapack_complex_float* b,
+                               lapack_int ldb);
+lapack_int LAPACKE_zsptrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* ap, const lapack_int* ipiv, lapack_complex_double* b,
+                               lapack_int ldb);
+
+lapack_int LAPACKE_sstebz_work(char range, char order, lapack_int n, float vl, float vu, lapack_int il, lapack_int iu,
+                               float abstol, const float* d, const float* e, lapack_int* m, lapack_int* nsplit,
+                               float* w, lapack_int* iblock, lapack_int* isplit, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dstebz_work(char range, char order, lapack_int n, double vl, double vu, lapack_int il, lapack_int iu,
+                               double abstol, const double* d, const double* e, lapack_int* m, lapack_int* nsplit,
+                               double* w, lapack_int* iblock, lapack_int* isplit, double* work, lapack_int* iwork);
+
+lapack_int LAPACKE_sstedc_work(int matrix_order, char compz, lapack_int n, float* d, float* e, float* z, lapack_int ldz,
+                               float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_dstedc_work(int matrix_order, char compz, lapack_int n, double* d, double* e, double* z,
+                               lapack_int ldz, double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_cstedc_work(int matrix_order, char compz, lapack_int n, float* d, float* e, lapack_complex_float* z,
+                               lapack_int ldz, lapack_complex_float* work, lapack_int lwork, float* rwork,
+                               lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_zstedc_work(int matrix_order, char compz, lapack_int n, double* d, double* e,
+                               lapack_complex_double* z, lapack_int ldz, lapack_complex_double* work, lapack_int lwork,
+                               double* rwork, lapack_int lrwork, lapack_int* iwork, lapack_int liwork);
+
+lapack_int LAPACKE_sstegr_work(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl,
+                               float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w, float* z,
+                               lapack_int ldz, lapack_int* isuppz, float* work, lapack_int lwork, lapack_int* iwork,
+                               lapack_int liwork);
+lapack_int LAPACKE_dstegr_work(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
+                               double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
+                               double* z, lapack_int ldz, lapack_int* isuppz, double* work, lapack_int lwork,
+                               lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_cstegr_work(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl,
+                               float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w,
+                               lapack_complex_float* z, lapack_int ldz, lapack_int* isuppz, float* work,
+                               lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_zstegr_work(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
+                               double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
+                               lapack_complex_double* z, lapack_int ldz, lapack_int* isuppz, double* work,
+                               lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+
+lapack_int LAPACKE_sstein_work(int matrix_order, lapack_int n, const float* d, const float* e, lapack_int m,
+                               const float* w, const lapack_int* iblock, const lapack_int* isplit, float* z,
+                               lapack_int ldz, float* work, lapack_int* iwork, lapack_int* ifailv);
+lapack_int LAPACKE_dstein_work(int matrix_order, lapack_int n, const double* d, const double* e, lapack_int m,
+                               const double* w, const lapack_int* iblock, const lapack_int* isplit, double* z,
+                               lapack_int ldz, double* work, lapack_int* iwork, lapack_int* ifailv);
+lapack_int LAPACKE_cstein_work(int matrix_order, lapack_int n, const float* d, const float* e, lapack_int m,
+                               const float* w, const lapack_int* iblock, const lapack_int* isplit,
+                               lapack_complex_float* z, lapack_int ldz, float* work, lapack_int* iwork,
+                               lapack_int* ifailv);
+lapack_int LAPACKE_zstein_work(int matrix_order, lapack_int n, const double* d, const double* e, lapack_int m,
+                               const double* w, const lapack_int* iblock, const lapack_int* isplit,
+                               lapack_complex_double* z, lapack_int ldz, double* work, lapack_int* iwork,
+                               lapack_int* ifailv);
+
+lapack_int LAPACKE_sstemr_work(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl,
+                               float vu, lapack_int il, lapack_int iu, lapack_int* m, float* w, float* z,
+                               lapack_int ldz, lapack_int nzc, lapack_int* isuppz, lapack_logical* tryrac, float* work,
+                               lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_dstemr_work(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
+                               double vu, lapack_int il, lapack_int iu, lapack_int* m, double* w, double* z,
+                               lapack_int ldz, lapack_int nzc, lapack_int* isuppz, lapack_logical* tryrac, double* work,
+                               lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_cstemr_work(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl,
+                               float vu, lapack_int il, lapack_int iu, lapack_int* m, float* w, lapack_complex_float* z,
+                               lapack_int ldz, lapack_int nzc, lapack_int* isuppz, lapack_logical* tryrac, float* work,
+                               lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_zstemr_work(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
+                               double vu, lapack_int il, lapack_int iu, lapack_int* m, double* w,
+                               lapack_complex_double* z, lapack_int ldz, lapack_int nzc, lapack_int* isuppz,
+                               lapack_logical* tryrac, double* work, lapack_int lwork, lapack_int* iwork,
+                               lapack_int liwork);
+
+lapack_int LAPACKE_ssteqr_work(int matrix_order, char compz, lapack_int n, float* d, float* e, float* z, lapack_int ldz,
+                               float* work);
+lapack_int LAPACKE_dsteqr_work(int matrix_order, char compz, lapack_int n, double* d, double* e, double* z,
+                               lapack_int ldz, double* work);
+lapack_int LAPACKE_csteqr_work(int matrix_order, char compz, lapack_int n, float* d, float* e, lapack_complex_float* z,
+                               lapack_int ldz, float* work);
+lapack_int LAPACKE_zsteqr_work(int matrix_order, char compz, lapack_int n, double* d, double* e,
+                               lapack_complex_double* z, lapack_int ldz, double* work);
+
+lapack_int LAPACKE_ssterf_work(lapack_int n, float* d, float* e);
+lapack_int LAPACKE_dsterf_work(lapack_int n, double* d, double* e);
+
+lapack_int LAPACKE_sstev_work(int matrix_order, char jobz, lapack_int n, float* d, float* e, float* z, lapack_int ldz,
+                              float* work);
+lapack_int LAPACKE_dstev_work(int matrix_order, char jobz, lapack_int n, double* d, double* e, double* z,
+                              lapack_int ldz, double* work);
+
+lapack_int LAPACKE_sstevd_work(int matrix_order, char jobz, lapack_int n, float* d, float* e, float* z, lapack_int ldz,
+                               float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_dstevd_work(int matrix_order, char jobz, lapack_int n, double* d, double* e, double* z,
+                               lapack_int ldz, double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+
+lapack_int LAPACKE_sstevr_work(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl,
+                               float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w, float* z,
+                               lapack_int ldz, lapack_int* isuppz, float* work, lapack_int lwork, lapack_int* iwork,
+                               lapack_int liwork);
+lapack_int LAPACKE_dstevr_work(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
+                               double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
+                               double* z, lapack_int ldz, lapack_int* isuppz, double* work, lapack_int lwork,
+                               lapack_int* iwork, lapack_int liwork);
+
+lapack_int LAPACKE_sstevx_work(int matrix_order, char jobz, char range, lapack_int n, float* d, float* e, float vl,
+                               float vu, lapack_int il, lapack_int iu, float abstol, lapack_int* m, float* w, float* z,
+                               lapack_int ldz, float* work, lapack_int* iwork, lapack_int* ifail);
+lapack_int LAPACKE_dstevx_work(int matrix_order, char jobz, char range, lapack_int n, double* d, double* e, double vl,
+                               double vu, lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w,
+                               double* z, lapack_int ldz, double* work, lapack_int* iwork, lapack_int* ifail);
+
+lapack_int LAPACKE_ssycon_work(int matrix_order, char uplo, lapack_int n, const float* a, lapack_int lda,
+                               const lapack_int* ipiv, float anorm, float* rcond, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dsycon_work(int matrix_order, char uplo, lapack_int n, const double* a, lapack_int lda,
+                               const lapack_int* ipiv, double anorm, double* rcond, double* work, lapack_int* iwork);
+lapack_int LAPACKE_csycon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
+                               const lapack_int* ipiv, float anorm, float* rcond, lapack_complex_float* work);
+lapack_int LAPACKE_zsycon_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a,
+                               lapack_int lda, const lapack_int* ipiv, double anorm, double* rcond,
+                               lapack_complex_double* work);
+
+lapack_int LAPACKE_ssyequb_work(int matrix_order, char uplo, lapack_int n, const float* a, lapack_int lda, float* s,
+                                float* scond, float* amax, float* work);
+lapack_int LAPACKE_dsyequb_work(int matrix_order, char uplo, lapack_int n, const double* a, lapack_int lda, double* s,
+                                double* scond, double* amax, double* work);
+lapack_int LAPACKE_csyequb_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a,
+                                lapack_int lda, float* s, float* scond, float* amax, lapack_complex_float* work);
+lapack_int LAPACKE_zsyequb_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a,
+                                lapack_int lda, double* s, double* scond, double* amax, lapack_complex_double* work);
+
+lapack_int LAPACKE_ssyev_work(int matrix_order, char jobz, char uplo, lapack_int n, float* a, lapack_int lda, float* w,
+                              float* work, lapack_int lwork);
+lapack_int LAPACKE_dsyev_work(int matrix_order, char jobz, char uplo, lapack_int n, double* a, lapack_int lda,
+                              double* w, double* work, lapack_int lwork);
+
+lapack_int LAPACKE_ssyevd_work(int matrix_order, char jobz, char uplo, lapack_int n, float* a, lapack_int lda, float* w,
+                               float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_dsyevd_work(int matrix_order, char jobz, char uplo, lapack_int n, double* a, lapack_int lda,
+                               double* w, double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+
+lapack_int LAPACKE_ssyevr_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, float* a,
+                               lapack_int lda, float vl, float vu, lapack_int il, lapack_int iu, float abstol,
+                               lapack_int* m, float* w, float* z, lapack_int ldz, lapack_int* isuppz, float* work,
+                               lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_dsyevr_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, double* a,
+                               lapack_int lda, double vl, double vu, lapack_int il, lapack_int iu, double abstol,
+                               lapack_int* m, double* w, double* z, lapack_int ldz, lapack_int* isuppz, double* work,
+                               lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+
+lapack_int LAPACKE_ssyevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, float* a,
+                               lapack_int lda, float vl, float vu, lapack_int il, lapack_int iu, float abstol,
+                               lapack_int* m, float* w, float* z, lapack_int ldz, float* work, lapack_int lwork,
+                               lapack_int* iwork, lapack_int* ifail);
+lapack_int LAPACKE_dsyevx_work(int matrix_order, char jobz, char range, char uplo, lapack_int n, double* a,
+                               lapack_int lda, double vl, double vu, lapack_int il, lapack_int iu, double abstol,
+                               lapack_int* m, double* w, double* z, lapack_int ldz, double* work, lapack_int lwork,
+                               lapack_int* iwork, lapack_int* ifail);
+
+lapack_int LAPACKE_ssygst_work(int matrix_order, lapack_int itype, char uplo, lapack_int n, float* a, lapack_int lda,
+                               const float* b, lapack_int ldb);
+lapack_int LAPACKE_dsygst_work(int matrix_order, lapack_int itype, char uplo, lapack_int n, double* a, lapack_int lda,
+                               const double* b, lapack_int ldb);
+
+lapack_int LAPACKE_ssygv_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, float* a,
+                              lapack_int lda, float* b, lapack_int ldb, float* w, float* work, lapack_int lwork);
+lapack_int LAPACKE_dsygv_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, double* a,
+                              lapack_int lda, double* b, lapack_int ldb, double* w, double* work, lapack_int lwork);
+
+lapack_int LAPACKE_ssygvd_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, float* a,
+                               lapack_int lda, float* b, lapack_int ldb, float* w, float* work, lapack_int lwork,
+                               lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_dsygvd_work(int matrix_order, lapack_int itype, char jobz, char uplo, lapack_int n, double* a,
+                               lapack_int lda, double* b, lapack_int ldb, double* w, double* work, lapack_int lwork,
+                               lapack_int* iwork, lapack_int liwork);
+
+lapack_int LAPACKE_ssygvx_work(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
+                               float* a, lapack_int lda, float* b, lapack_int ldb, float vl, float vu, lapack_int il,
+                               lapack_int iu, float abstol, lapack_int* m, float* w, float* z, lapack_int ldz,
+                               float* work, lapack_int lwork, lapack_int* iwork, lapack_int* ifail);
+lapack_int LAPACKE_dsygvx_work(int matrix_order, lapack_int itype, char jobz, char range, char uplo, lapack_int n,
+                               double* a, lapack_int lda, double* b, lapack_int ldb, double vl, double vu,
+                               lapack_int il, lapack_int iu, double abstol, lapack_int* m, double* w, double* z,
+                               lapack_int ldz, double* work, lapack_int lwork, lapack_int* iwork, lapack_int* ifail);
+
+lapack_int LAPACKE_ssyrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a,
+                               lapack_int lda, const float* af, lapack_int ldaf, const lapack_int* ipiv, const float* b,
+                               lapack_int ldb, float* x, lapack_int ldx, float* ferr, float* berr, float* work,
+                               lapack_int* iwork);
+lapack_int LAPACKE_dsyrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a,
+                               lapack_int lda, const double* af, lapack_int ldaf, const lapack_int* ipiv,
+                               const double* b, lapack_int ldb, double* x, lapack_int ldx, double* ferr, double* berr,
+                               double* work, lapack_int* iwork);
+lapack_int LAPACKE_csyrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
+                               lapack_int ldaf, const lapack_int* ipiv, const lapack_complex_float* b, lapack_int ldb,
+                               lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr,
+                               lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zsyrfs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
+                               lapack_int ldaf, const lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr,
+                               lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_ssyrfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs, const float* a,
+                                lapack_int lda, const float* af, lapack_int ldaf, const lapack_int* ipiv,
+                                const float* s, const float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond,
+                                float* berr, lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
+                                lapack_int nparams, float* params, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dsyrfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs, const double* a,
+                                lapack_int lda, const double* af, lapack_int ldaf, const lapack_int* ipiv,
+                                const double* s, const double* b, lapack_int ldb, double* x, lapack_int ldx,
+                                double* rcond, double* berr, lapack_int n_err_bnds, double* err_bnds_norm,
+                                double* err_bnds_comp, lapack_int nparams, double* params, double* work,
+                                lapack_int* iwork);
+lapack_int LAPACKE_csyrfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* af,
+                                lapack_int ldaf, const lapack_int* ipiv, const float* s, const lapack_complex_float* b,
+                                lapack_int ldb, lapack_complex_float* x, lapack_int ldx, float* rcond, float* berr,
+                                lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
+                                float* params, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zsyrfsx_work(int matrix_order, char uplo, char equed, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* af,
+                                lapack_int ldaf, const lapack_int* ipiv, const double* s,
+                                const lapack_complex_double* b, lapack_int ldb, lapack_complex_double* x,
+                                lapack_int ldx, double* rcond, double* berr, lapack_int n_err_bnds,
+                                double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams, double* params,
+                                lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_ssysv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, float* a, lapack_int lda,
+                              lapack_int* ipiv, float* b, lapack_int ldb, float* work, lapack_int lwork);
+lapack_int LAPACKE_dsysv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, double* a, lapack_int lda,
+                              lapack_int* ipiv, double* b, lapack_int ldb, double* work, lapack_int lwork);
+lapack_int LAPACKE_csysv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_float* a,
+                              lapack_int lda, lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb,
+                              lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zsysv_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, lapack_complex_double* a,
+                              lapack_int lda, lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb,
+                              lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_ssysvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, const float* a,
+                               lapack_int lda, float* af, lapack_int ldaf, lapack_int* ipiv, const float* b,
+                               lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* ferr, float* berr,
+                               float* work, lapack_int lwork, lapack_int* iwork);
+lapack_int LAPACKE_dsysvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, const double* a,
+                               lapack_int lda, double* af, lapack_int ldaf, lapack_int* ipiv, const double* b,
+                               lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* ferr, double* berr,
+                               double* work, lapack_int lwork, lapack_int* iwork);
+lapack_int LAPACKE_csysvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
+                               lapack_int* ipiv, const lapack_complex_float* b, lapack_int ldb, lapack_complex_float* x,
+                               lapack_int ldx, float* rcond, float* ferr, float* berr, lapack_complex_float* work,
+                               lapack_int lwork, float* rwork);
+lapack_int LAPACKE_zsysvx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* a, lapack_int lda, lapack_complex_double* af,
+                               lapack_int ldaf, lapack_int* ipiv, const lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* x, lapack_int ldx, double* rcond, double* ferr, double* berr,
+                               lapack_complex_double* work, lapack_int lwork, double* rwork);
+
+lapack_int LAPACKE_ssysvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, float* a,
+                                lapack_int lda, float* af, lapack_int ldaf, lapack_int* ipiv, char* equed, float* s,
+                                float* b, lapack_int ldb, float* x, lapack_int ldx, float* rcond, float* rpvgrw,
+                                float* berr, lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
+                                lapack_int nparams, float* params, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dsysvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs, double* a,
+                                lapack_int lda, double* af, lapack_int ldaf, lapack_int* ipiv, char* equed, double* s,
+                                double* b, lapack_int ldb, double* x, lapack_int ldx, double* rcond, double* rpvgrw,
+                                double* berr, lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
+                                lapack_int nparams, double* params, double* work, lapack_int* iwork);
+lapack_int LAPACKE_csysvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                                lapack_complex_float* a, lapack_int lda, lapack_complex_float* af, lapack_int ldaf,
+                                lapack_int* ipiv, char* equed, float* s, lapack_complex_float* b, lapack_int ldb,
+                                lapack_complex_float* x, lapack_int ldx, float* rcond, float* rpvgrw, float* berr,
+                                lapack_int n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int nparams,
+                                float* params, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_zsysvxx_work(int matrix_order, char fact, char uplo, lapack_int n, lapack_int nrhs,
+                                lapack_complex_double* a, lapack_int lda, lapack_complex_double* af, lapack_int ldaf,
+                                lapack_int* ipiv, char* equed, double* s, lapack_complex_double* b, lapack_int ldb,
+                                lapack_complex_double* x, lapack_int ldx, double* rcond, double* rpvgrw, double* berr,
+                                lapack_int n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int nparams,
+                                double* params, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_ssytrd_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, float* d, float* e,
+                               float* tau, float* work, lapack_int lwork);
+lapack_int LAPACKE_dsytrd_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda, double* d,
+                               double* e, double* tau, double* work, lapack_int lwork);
+
+lapack_int LAPACKE_ssytrf_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, lapack_int* ipiv,
+                               float* work, lapack_int lwork);
+lapack_int LAPACKE_dsytrf_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda, lapack_int* ipiv,
+                               double* work, lapack_int lwork);
+lapack_int LAPACKE_csytrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               lapack_int* ipiv, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zsytrf_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               lapack_int* ipiv, lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_ssytri_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda,
+                               const lapack_int* ipiv, float* work);
+lapack_int LAPACKE_dsytri_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda,
+                               const lapack_int* ipiv, double* work);
+lapack_int LAPACKE_csytri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               const lapack_int* ipiv, lapack_complex_float* work);
+lapack_int LAPACKE_zsytri_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               const lapack_int* ipiv, lapack_complex_double* work);
+
+lapack_int LAPACKE_ssytrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a,
+                               lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb);
+lapack_int LAPACKE_dsytrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a,
+                               lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb);
+lapack_int LAPACKE_csytrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv,
+                               lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_zsytrs_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv,
+                               lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_stbcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n, lapack_int kd,
+                               const float* ab, lapack_int ldab, float* rcond, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dtbcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n, lapack_int kd,
+                               const double* ab, lapack_int ldab, double* rcond, double* work, lapack_int* iwork);
+lapack_int LAPACKE_ctbcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n, lapack_int kd,
+                               const lapack_complex_float* ab, lapack_int ldab, float* rcond,
+                               lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_ztbcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n, lapack_int kd,
+                               const lapack_complex_double* ab, lapack_int ldab, double* rcond,
+                               lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_stbrfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
+                               lapack_int nrhs, const float* ab, lapack_int ldab, const float* b, lapack_int ldb,
+                               const float* x, lapack_int ldx, float* ferr, float* berr, float* work,
+                               lapack_int* iwork);
+lapack_int LAPACKE_dtbrfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
+                               lapack_int nrhs, const double* ab, lapack_int ldab, const double* b, lapack_int ldb,
+                               const double* x, lapack_int ldx, double* ferr, double* berr, double* work,
+                               lapack_int* iwork);
+lapack_int LAPACKE_ctbrfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
+                               lapack_int nrhs, const lapack_complex_float* ab, lapack_int ldab,
+                               const lapack_complex_float* b, lapack_int ldb, const lapack_complex_float* x,
+                               lapack_int ldx, float* ferr, float* berr, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_ztbrfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
+                               lapack_int nrhs, const lapack_complex_double* ab, lapack_int ldab,
+                               const lapack_complex_double* b, lapack_int ldb, const lapack_complex_double* x,
+                               lapack_int ldx, double* ferr, double* berr, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_stbtrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
+                               lapack_int nrhs, const float* ab, lapack_int ldab, float* b, lapack_int ldb);
+lapack_int LAPACKE_dtbtrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
+                               lapack_int nrhs, const double* ab, lapack_int ldab, double* b, lapack_int ldb);
+lapack_int LAPACKE_ctbtrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
+                               lapack_int nrhs, const lapack_complex_float* ab, lapack_int ldab,
+                               lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_ztbtrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int kd,
+                               lapack_int nrhs, const lapack_complex_double* ab, lapack_int ldab,
+                               lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_stfsm_work(int matrix_order, char transr, char side, char uplo, char trans, char diag, lapack_int m,
+                              lapack_int n, float alpha, const float* a, float* b, lapack_int ldb);
+lapack_int LAPACKE_dtfsm_work(int matrix_order, char transr, char side, char uplo, char trans, char diag, lapack_int m,
+                              lapack_int n, double alpha, const double* a, double* b, lapack_int ldb);
+lapack_int LAPACKE_ctfsm_work(int matrix_order, char transr, char side, char uplo, char trans, char diag, lapack_int m,
+                              lapack_int n, lapack_complex_float alpha, const lapack_complex_float* a,
+                              lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_ztfsm_work(int matrix_order, char transr, char side, char uplo, char trans, char diag, lapack_int m,
+                              lapack_int n, lapack_complex_double alpha, const lapack_complex_double* a,
+                              lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_stftri_work(int matrix_order, char transr, char uplo, char diag, lapack_int n, float* a);
+lapack_int LAPACKE_dtftri_work(int matrix_order, char transr, char uplo, char diag, lapack_int n, double* a);
+lapack_int LAPACKE_ctftri_work(int matrix_order, char transr, char uplo, char diag, lapack_int n,
+                               lapack_complex_float* a);
+lapack_int LAPACKE_ztftri_work(int matrix_order, char transr, char uplo, char diag, lapack_int n,
+                               lapack_complex_double* a);
+
+lapack_int LAPACKE_stfttp_work(int matrix_order, char transr, char uplo, lapack_int n, const float* arf, float* ap);
+lapack_int LAPACKE_dtfttp_work(int matrix_order, char transr, char uplo, lapack_int n, const double* arf, double* ap);
+lapack_int LAPACKE_ctfttp_work(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_float* arf,
+                               lapack_complex_float* ap);
+lapack_int LAPACKE_ztfttp_work(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_double* arf,
+                               lapack_complex_double* ap);
+
+lapack_int LAPACKE_stfttr_work(int matrix_order, char transr, char uplo, lapack_int n, const float* arf, float* a,
+                               lapack_int lda);
+lapack_int LAPACKE_dtfttr_work(int matrix_order, char transr, char uplo, lapack_int n, const double* arf, double* a,
+                               lapack_int lda);
+lapack_int LAPACKE_ctfttr_work(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_float* arf,
+                               lapack_complex_float* a, lapack_int lda);
+lapack_int LAPACKE_ztfttr_work(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_double* arf,
+                               lapack_complex_double* a, lapack_int lda);
+
+lapack_int LAPACKE_stgevc_work(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
+                               const float* s, lapack_int lds, const float* p, lapack_int ldp, float* vl,
+                               lapack_int ldvl, float* vr, lapack_int ldvr, lapack_int mm, lapack_int* m, float* work);
+lapack_int LAPACKE_dtgevc_work(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
+                               const double* s, lapack_int lds, const double* p, lapack_int ldp, double* vl,
+                               lapack_int ldvl, double* vr, lapack_int ldvr, lapack_int mm, lapack_int* m,
+                               double* work);
+lapack_int LAPACKE_ctgevc_work(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
+                               const lapack_complex_float* s, lapack_int lds, const lapack_complex_float* p,
+                               lapack_int ldp, lapack_complex_float* vl, lapack_int ldvl, lapack_complex_float* vr,
+                               lapack_int ldvr, lapack_int mm, lapack_int* m, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_ztgevc_work(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
+                               const lapack_complex_double* s, lapack_int lds, const lapack_complex_double* p,
+                               lapack_int ldp, lapack_complex_double* vl, lapack_int ldvl, lapack_complex_double* vr,
+                               lapack_int ldvr, lapack_int mm, lapack_int* m, lapack_complex_double* work,
+                               double* rwork);
+
+lapack_int LAPACKE_stgexc_work(int matrix_order, lapack_logical wantq, lapack_logical wantz, lapack_int n, float* a,
+                               lapack_int lda, float* b, lapack_int ldb, float* q, lapack_int ldq, float* z,
+                               lapack_int ldz, lapack_int* ifst, lapack_int* ilst, float* work, lapack_int lwork);
+lapack_int LAPACKE_dtgexc_work(int matrix_order, lapack_logical wantq, lapack_logical wantz, lapack_int n, double* a,
+                               lapack_int lda, double* b, lapack_int ldb, double* q, lapack_int ldq, double* z,
+                               lapack_int ldz, lapack_int* ifst, lapack_int* ilst, double* work, lapack_int lwork);
+lapack_int LAPACKE_ctgexc_work(int matrix_order, lapack_logical wantq, lapack_logical wantz, lapack_int n,
+                               lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb,
+                               lapack_complex_float* q, lapack_int ldq, lapack_complex_float* z, lapack_int ldz,
+                               lapack_int ifst, lapack_int ilst);
+lapack_int LAPACKE_ztgexc_work(int matrix_order, lapack_logical wantq, lapack_logical wantz, lapack_int n,
+                               lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* q, lapack_int ldq, lapack_complex_double* z, lapack_int ldz,
+                               lapack_int ifst, lapack_int ilst);
+
+lapack_int LAPACKE_stgsen_work(int matrix_order, lapack_int ijob, lapack_logical wantq, lapack_logical wantz,
+                               const lapack_logical* select, lapack_int n, float* a, lapack_int lda, float* b,
+                               lapack_int ldb, float* alphar, float* alphai, float* beta, float* q, lapack_int ldq,
+                               float* z, lapack_int ldz, lapack_int* m, float* pl, float* pr, float* dif, float* work,
+                               lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_dtgsen_work(int matrix_order, lapack_int ijob, lapack_logical wantq, lapack_logical wantz,
+                               const lapack_logical* select, lapack_int n, double* a, lapack_int lda, double* b,
+                               lapack_int ldb, double* alphar, double* alphai, double* beta, double* q, lapack_int ldq,
+                               double* z, lapack_int ldz, lapack_int* m, double* pl, double* pr, double* dif,
+                               double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_ctgsen_work(int matrix_order, lapack_int ijob, lapack_logical wantq, lapack_logical wantz,
+                               const lapack_logical* select, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* b, lapack_int ldb, lapack_complex_float* alpha,
+                               lapack_complex_float* beta, lapack_complex_float* q, lapack_int ldq,
+                               lapack_complex_float* z, lapack_int ldz, lapack_int* m, float* pl, float* pr, float* dif,
+                               lapack_complex_float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork);
+lapack_int LAPACKE_ztgsen_work(int matrix_order, lapack_int ijob, lapack_logical wantq, lapack_logical wantz,
+                               const lapack_logical* select, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* b, lapack_int ldb, lapack_complex_double* alpha,
+                               lapack_complex_double* beta, lapack_complex_double* q, lapack_int ldq,
+                               lapack_complex_double* z, lapack_int ldz, lapack_int* m, double* pl, double* pr,
+                               double* dif, lapack_complex_double* work, lapack_int lwork, lapack_int* iwork,
+                               lapack_int liwork);
+
+lapack_int LAPACKE_stgsja_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p,
+                               lapack_int n, lapack_int k, lapack_int l, float* a, lapack_int lda, float* b,
+                               lapack_int ldb, float tola, float tolb, float* alpha, float* beta, float* u,
+                               lapack_int ldu, float* v, lapack_int ldv, float* q, lapack_int ldq, float* work,
+                               lapack_int* ncycle);
+lapack_int LAPACKE_dtgsja_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p,
+                               lapack_int n, lapack_int k, lapack_int l, double* a, lapack_int lda, double* b,
+                               lapack_int ldb, double tola, double tolb, double* alpha, double* beta, double* u,
+                               lapack_int ldu, double* v, lapack_int ldv, double* q, lapack_int ldq, double* work,
+                               lapack_int* ncycle);
+lapack_int LAPACKE_ctgsja_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p,
+                               lapack_int n, lapack_int k, lapack_int l, lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* b, lapack_int ldb, float tola, float tolb, float* alpha,
+                               float* beta, lapack_complex_float* u, lapack_int ldu, lapack_complex_float* v,
+                               lapack_int ldv, lapack_complex_float* q, lapack_int ldq, lapack_complex_float* work,
+                               lapack_int* ncycle);
+lapack_int LAPACKE_ztgsja_work(int matrix_order, char jobu, char jobv, char jobq, lapack_int m, lapack_int p,
+                               lapack_int n, lapack_int k, lapack_int l, lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* b, lapack_int ldb, double tola, double tolb, double* alpha,
+                               double* beta, lapack_complex_double* u, lapack_int ldu, lapack_complex_double* v,
+                               lapack_int ldv, lapack_complex_double* q, lapack_int ldq, lapack_complex_double* work,
+                               lapack_int* ncycle);
+
+lapack_int LAPACKE_stgsna_work(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
+                               const float* a, lapack_int lda, const float* b, lapack_int ldb, const float* vl,
+                               lapack_int ldvl, const float* vr, lapack_int ldvr, float* s, float* dif, lapack_int mm,
+                               lapack_int* m, float* work, lapack_int lwork, lapack_int* iwork);
+lapack_int LAPACKE_dtgsna_work(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
+                               const double* a, lapack_int lda, const double* b, lapack_int ldb, const double* vl,
+                               lapack_int ldvl, const double* vr, lapack_int ldvr, double* s, double* dif,
+                               lapack_int mm, lapack_int* m, double* work, lapack_int lwork, lapack_int* iwork);
+lapack_int LAPACKE_ctgsna_work(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
+                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* b,
+                               lapack_int ldb, const lapack_complex_float* vl, lapack_int ldvl,
+                               const lapack_complex_float* vr, lapack_int ldvr, float* s, float* dif, lapack_int mm,
+                               lapack_int* m, lapack_complex_float* work, lapack_int lwork, lapack_int* iwork);
+lapack_int LAPACKE_ztgsna_work(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
+                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* b,
+                               lapack_int ldb, const lapack_complex_double* vl, lapack_int ldvl,
+                               const lapack_complex_double* vr, lapack_int ldvr, double* s, double* dif, lapack_int mm,
+                               lapack_int* m, lapack_complex_double* work, lapack_int lwork, lapack_int* iwork);
+
+lapack_int LAPACKE_stgsyl_work(int matrix_order, char trans, lapack_int ijob, lapack_int m, lapack_int n,
+                               const float* a, lapack_int lda, const float* b, lapack_int ldb, float* c, lapack_int ldc,
+                               const float* d, lapack_int ldd, const float* e, lapack_int lde, float* f, lapack_int ldf,
+                               float* scale, float* dif, float* work, lapack_int lwork, lapack_int* iwork);
+lapack_int LAPACKE_dtgsyl_work(int matrix_order, char trans, lapack_int ijob, lapack_int m, lapack_int n,
+                               const double* a, lapack_int lda, const double* b, lapack_int ldb, double* c,
+                               lapack_int ldc, const double* d, lapack_int ldd, const double* e, lapack_int lde,
+                               double* f, lapack_int ldf, double* scale, double* dif, double* work, lapack_int lwork,
+                               lapack_int* iwork);
+lapack_int LAPACKE_ctgsyl_work(int matrix_order, char trans, lapack_int ijob, lapack_int m, lapack_int n,
+                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* b,
+                               lapack_int ldb, lapack_complex_float* c, lapack_int ldc, const lapack_complex_float* d,
+                               lapack_int ldd, const lapack_complex_float* e, lapack_int lde, lapack_complex_float* f,
+                               lapack_int ldf, float* scale, float* dif, lapack_complex_float* work, lapack_int lwork,
+                               lapack_int* iwork);
+lapack_int LAPACKE_ztgsyl_work(int matrix_order, char trans, lapack_int ijob, lapack_int m, lapack_int n,
+                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* b,
+                               lapack_int ldb, lapack_complex_double* c, lapack_int ldc, const lapack_complex_double* d,
+                               lapack_int ldd, const lapack_complex_double* e, lapack_int lde, lapack_complex_double* f,
+                               lapack_int ldf, double* scale, double* dif, lapack_complex_double* work,
+                               lapack_int lwork, lapack_int* iwork);
+
+lapack_int LAPACKE_stpcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n, const float* ap,
+                               float* rcond, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dtpcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n, const double* ap,
+                               double* rcond, double* work, lapack_int* iwork);
+lapack_int LAPACKE_ctpcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n,
+                               const lapack_complex_float* ap, float* rcond, lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_ztpcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n,
+                               const lapack_complex_double* ap, double* rcond, lapack_complex_double* work,
+                               double* rwork);
+
+lapack_int LAPACKE_stprfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                               const float* ap, const float* b, lapack_int ldb, const float* x, lapack_int ldx,
+                               float* ferr, float* berr, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dtprfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                               const double* ap, const double* b, lapack_int ldb, const double* x, lapack_int ldx,
+                               double* ferr, double* berr, double* work, lapack_int* iwork);
+lapack_int LAPACKE_ctprfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* ap, const lapack_complex_float* b, lapack_int ldb,
+                               const lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr,
+                               lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_ztprfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* ap, const lapack_complex_double* b, lapack_int ldb,
+                               const lapack_complex_double* x, lapack_int ldx, double* ferr, double* berr,
+                               lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_stptri_work(int matrix_order, char uplo, char diag, lapack_int n, float* ap);
+lapack_int LAPACKE_dtptri_work(int matrix_order, char uplo, char diag, lapack_int n, double* ap);
+lapack_int LAPACKE_ctptri_work(int matrix_order, char uplo, char diag, lapack_int n, lapack_complex_float* ap);
+lapack_int LAPACKE_ztptri_work(int matrix_order, char uplo, char diag, lapack_int n, lapack_complex_double* ap);
+
+lapack_int LAPACKE_stptrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                               const float* ap, float* b, lapack_int ldb);
+lapack_int LAPACKE_dtptrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                               const double* ap, double* b, lapack_int ldb);
+lapack_int LAPACKE_ctptrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* ap, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_ztptrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* ap, lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_stpttf_work(int matrix_order, char transr, char uplo, lapack_int n, const float* ap, float* arf);
+lapack_int LAPACKE_dtpttf_work(int matrix_order, char transr, char uplo, lapack_int n, const double* ap, double* arf);
+lapack_int LAPACKE_ctpttf_work(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_float* ap,
+                               lapack_complex_float* arf);
+lapack_int LAPACKE_ztpttf_work(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_double* ap,
+                               lapack_complex_double* arf);
+
+lapack_int LAPACKE_stpttr_work(int matrix_order, char uplo, lapack_int n, const float* ap, float* a, lapack_int lda);
+lapack_int LAPACKE_dtpttr_work(int matrix_order, char uplo, lapack_int n, const double* ap, double* a, lapack_int lda);
+lapack_int LAPACKE_ctpttr_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap,
+                               lapack_complex_float* a, lapack_int lda);
+lapack_int LAPACKE_ztpttr_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap,
+                               lapack_complex_double* a, lapack_int lda);
+
+lapack_int LAPACKE_strcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n, const float* a,
+                               lapack_int lda, float* rcond, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dtrcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n, const double* a,
+                               lapack_int lda, double* rcond, double* work, lapack_int* iwork);
+lapack_int LAPACKE_ctrcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n,
+                               const lapack_complex_float* a, lapack_int lda, float* rcond, lapack_complex_float* work,
+                               float* rwork);
+lapack_int LAPACKE_ztrcon_work(int matrix_order, char norm, char uplo, char diag, lapack_int n,
+                               const lapack_complex_double* a, lapack_int lda, double* rcond,
+                               lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_strevc_work(int matrix_order, char side, char howmny, lapack_logical* select, lapack_int n,
+                               const float* t, lapack_int ldt, float* vl, lapack_int ldvl, float* vr, lapack_int ldvr,
+                               lapack_int mm, lapack_int* m, float* work);
+lapack_int LAPACKE_dtrevc_work(int matrix_order, char side, char howmny, lapack_logical* select, lapack_int n,
+                               const double* t, lapack_int ldt, double* vl, lapack_int ldvl, double* vr,
+                               lapack_int ldvr, lapack_int mm, lapack_int* m, double* work);
+lapack_int LAPACKE_ctrevc_work(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
+                               lapack_complex_float* t, lapack_int ldt, lapack_complex_float* vl, lapack_int ldvl,
+                               lapack_complex_float* vr, lapack_int ldvr, lapack_int mm, lapack_int* m,
+                               lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_ztrevc_work(int matrix_order, char side, char howmny, const lapack_logical* select, lapack_int n,
+                               lapack_complex_double* t, lapack_int ldt, lapack_complex_double* vl, lapack_int ldvl,
+                               lapack_complex_double* vr, lapack_int ldvr, lapack_int mm, lapack_int* m,
+                               lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_strexc_work(int matrix_order, char compq, lapack_int n, float* t, lapack_int ldt, float* q,
+                               lapack_int ldq, lapack_int* ifst, lapack_int* ilst, float* work);
+lapack_int LAPACKE_dtrexc_work(int matrix_order, char compq, lapack_int n, double* t, lapack_int ldt, double* q,
+                               lapack_int ldq, lapack_int* ifst, lapack_int* ilst, double* work);
+lapack_int LAPACKE_ctrexc_work(int matrix_order, char compq, lapack_int n, lapack_complex_float* t, lapack_int ldt,
+                               lapack_complex_float* q, lapack_int ldq, lapack_int ifst, lapack_int ilst);
+lapack_int LAPACKE_ztrexc_work(int matrix_order, char compq, lapack_int n, lapack_complex_double* t, lapack_int ldt,
+                               lapack_complex_double* q, lapack_int ldq, lapack_int ifst, lapack_int ilst);
+
+lapack_int LAPACKE_strrfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                               const float* a, lapack_int lda, const float* b, lapack_int ldb, const float* x,
+                               lapack_int ldx, float* ferr, float* berr, float* work, lapack_int* iwork);
+lapack_int LAPACKE_dtrrfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                               const double* a, lapack_int lda, const double* b, lapack_int ldb, const double* x,
+                               lapack_int ldx, double* ferr, double* berr, double* work, lapack_int* iwork);
+lapack_int LAPACKE_ctrrfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* b,
+                               lapack_int ldb, const lapack_complex_float* x, lapack_int ldx, float* ferr, float* berr,
+                               lapack_complex_float* work, float* rwork);
+lapack_int LAPACKE_ztrrfs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* b,
+                               lapack_int ldb, const lapack_complex_double* x, lapack_int ldx, double* ferr,
+                               double* berr, lapack_complex_double* work, double* rwork);
+
+lapack_int LAPACKE_strsen_work(int matrix_order, char job, char compq, const lapack_logical* select, lapack_int n,
+                               float* t, lapack_int ldt, float* q, lapack_int ldq, float* wr, float* wi, lapack_int* m,
+                               float* s, float* sep, float* work, lapack_int lwork, lapack_int* iwork,
+                               lapack_int liwork);
+lapack_int LAPACKE_dtrsen_work(int matrix_order, char job, char compq, const lapack_logical* select, lapack_int n,
+                               double* t, lapack_int ldt, double* q, lapack_int ldq, double* wr, double* wi,
+                               lapack_int* m, double* s, double* sep, double* work, lapack_int lwork, lapack_int* iwork,
+                               lapack_int liwork);
+lapack_int LAPACKE_ctrsen_work(int matrix_order, char job, char compq, const lapack_logical* select, lapack_int n,
+                               lapack_complex_float* t, lapack_int ldt, lapack_complex_float* q, lapack_int ldq,
+                               lapack_complex_float* w, lapack_int* m, float* s, float* sep, lapack_complex_float* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_ztrsen_work(int matrix_order, char job, char compq, const lapack_logical* select, lapack_int n,
+                               lapack_complex_double* t, lapack_int ldt, lapack_complex_double* q, lapack_int ldq,
+                               lapack_complex_double* w, lapack_int* m, double* s, double* sep,
+                               lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_strsna_work(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
+                               const float* t, lapack_int ldt, const float* vl, lapack_int ldvl, const float* vr,
+                               lapack_int ldvr, float* s, float* sep, lapack_int mm, lapack_int* m, float* work,
+                               lapack_int ldwork, lapack_int* iwork);
+lapack_int LAPACKE_dtrsna_work(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
+                               const double* t, lapack_int ldt, const double* vl, lapack_int ldvl, const double* vr,
+                               lapack_int ldvr, double* s, double* sep, lapack_int mm, lapack_int* m, double* work,
+                               lapack_int ldwork, lapack_int* iwork);
+lapack_int LAPACKE_ctrsna_work(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
+                               const lapack_complex_float* t, lapack_int ldt, const lapack_complex_float* vl,
+                               lapack_int ldvl, const lapack_complex_float* vr, lapack_int ldvr, float* s, float* sep,
+                               lapack_int mm, lapack_int* m, lapack_complex_float* work, lapack_int ldwork,
+                               float* rwork);
+lapack_int LAPACKE_ztrsna_work(int matrix_order, char job, char howmny, const lapack_logical* select, lapack_int n,
+                               const lapack_complex_double* t, lapack_int ldt, const lapack_complex_double* vl,
+                               lapack_int ldvl, const lapack_complex_double* vr, lapack_int ldvr, double* s,
+                               double* sep, lapack_int mm, lapack_int* m, lapack_complex_double* work,
+                               lapack_int ldwork, double* rwork);
+
+lapack_int LAPACKE_strsyl_work(int matrix_order, char trana, char tranb, lapack_int isgn, lapack_int m, lapack_int n,
+                               const float* a, lapack_int lda, const float* b, lapack_int ldb, float* c, lapack_int ldc,
+                               float* scale);
+lapack_int LAPACKE_dtrsyl_work(int matrix_order, char trana, char tranb, lapack_int isgn, lapack_int m, lapack_int n,
+                               const double* a, lapack_int lda, const double* b, lapack_int ldb, double* c,
+                               lapack_int ldc, double* scale);
+lapack_int LAPACKE_ctrsyl_work(int matrix_order, char trana, char tranb, lapack_int isgn, lapack_int m, lapack_int n,
+                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* b,
+                               lapack_int ldb, lapack_complex_float* c, lapack_int ldc, float* scale);
+lapack_int LAPACKE_ztrsyl_work(int matrix_order, char trana, char tranb, lapack_int isgn, lapack_int m, lapack_int n,
+                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* b,
+                               lapack_int ldb, lapack_complex_double* c, lapack_int ldc, double* scale);
+
+lapack_int LAPACKE_strtri_work(int matrix_order, char uplo, char diag, lapack_int n, float* a, lapack_int lda);
+lapack_int LAPACKE_dtrtri_work(int matrix_order, char uplo, char diag, lapack_int n, double* a, lapack_int lda);
+lapack_int LAPACKE_ctrtri_work(int matrix_order, char uplo, char diag, lapack_int n, lapack_complex_float* a,
+                               lapack_int lda);
+lapack_int LAPACKE_ztrtri_work(int matrix_order, char uplo, char diag, lapack_int n, lapack_complex_double* a,
+                               lapack_int lda);
+
+lapack_int LAPACKE_strtrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                               const float* a, lapack_int lda, float* b, lapack_int ldb);
+lapack_int LAPACKE_dtrtrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                               const double* a, lapack_int lda, double* b, lapack_int ldb);
+lapack_int LAPACKE_ctrtrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_float* a, lapack_int lda, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_ztrtrs_work(int matrix_order, char uplo, char trans, char diag, lapack_int n, lapack_int nrhs,
+                               const lapack_complex_double* a, lapack_int lda, lapack_complex_double* b,
+                               lapack_int ldb);
+
+lapack_int LAPACKE_strttf_work(int matrix_order, char transr, char uplo, lapack_int n, const float* a, lapack_int lda,
+                               float* arf);
+lapack_int LAPACKE_dtrttf_work(int matrix_order, char transr, char uplo, lapack_int n, const double* a, lapack_int lda,
+                               double* arf);
+lapack_int LAPACKE_ctrttf_work(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* arf);
+lapack_int LAPACKE_ztrttf_work(int matrix_order, char transr, char uplo, lapack_int n, const lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* arf);
+
+lapack_int LAPACKE_strttp_work(int matrix_order, char uplo, lapack_int n, const float* a, lapack_int lda, float* ap);
+lapack_int LAPACKE_dtrttp_work(int matrix_order, char uplo, lapack_int n, const double* a, lapack_int lda, double* ap);
+lapack_int LAPACKE_ctrttp_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* ap);
+lapack_int LAPACKE_ztrttp_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* ap);
+
+lapack_int LAPACKE_stzrzf_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* tau,
+                               float* work, lapack_int lwork);
+lapack_int LAPACKE_dtzrzf_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* tau,
+                               double* work, lapack_int lwork);
+lapack_int LAPACKE_ctzrzf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* tau, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_ztzrzf_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* tau, lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_cungbr_work(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int k,
+                               lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
+                               lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zungbr_work(int matrix_order, char vect, lapack_int m, lapack_int n, lapack_int k,
+                               lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
+                               lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_cunghr_work(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, lapack_complex_float* a,
+                               lapack_int lda, const lapack_complex_float* tau, lapack_complex_float* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_zunghr_work(int matrix_order, lapack_int n, lapack_int ilo, lapack_int ihi, lapack_complex_double* a,
+                               lapack_int lda, const lapack_complex_double* tau, lapack_complex_double* work,
+                               lapack_int lwork);
+
+lapack_int LAPACKE_cunglq_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_float* a,
+                               lapack_int lda, const lapack_complex_float* tau, lapack_complex_float* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_zunglq_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_double* a,
+                               lapack_int lda, const lapack_complex_double* tau, lapack_complex_double* work,
+                               lapack_int lwork);
+
+lapack_int LAPACKE_cungql_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_float* a,
+                               lapack_int lda, const lapack_complex_float* tau, lapack_complex_float* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_zungql_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_double* a,
+                               lapack_int lda, const lapack_complex_double* tau, lapack_complex_double* work,
+                               lapack_int lwork);
+
+lapack_int LAPACKE_cungqr_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_float* a,
+                               lapack_int lda, const lapack_complex_float* tau, lapack_complex_float* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_zungqr_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_double* a,
+                               lapack_int lda, const lapack_complex_double* tau, lapack_complex_double* work,
+                               lapack_int lwork);
+
+lapack_int LAPACKE_cungrq_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_float* a,
+                               lapack_int lda, const lapack_complex_float* tau, lapack_complex_float* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_zungrq_work(int matrix_order, lapack_int m, lapack_int n, lapack_int k, lapack_complex_double* a,
+                               lapack_int lda, const lapack_complex_double* tau, lapack_complex_double* work,
+                               lapack_int lwork);
+
+lapack_int LAPACKE_cungtr_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                               const lapack_complex_float* tau, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zungtr_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                               const lapack_complex_double* tau, lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_cunmbr_work(int matrix_order, char vect, char side, char trans, lapack_int m, lapack_int n,
+                               lapack_int k, const lapack_complex_float* a, lapack_int lda,
+                               const lapack_complex_float* tau, lapack_complex_float* c, lapack_int ldc,
+                               lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zunmbr_work(int matrix_order, char vect, char side, char trans, lapack_int m, lapack_int n,
+                               lapack_int k, const lapack_complex_double* a, lapack_int lda,
+                               const lapack_complex_double* tau, lapack_complex_double* c, lapack_int ldc,
+                               lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_cunmhr_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int ilo,
+                               lapack_int ihi, const lapack_complex_float* a, lapack_int lda,
+                               const lapack_complex_float* tau, lapack_complex_float* c, lapack_int ldc,
+                               lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zunmhr_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int ilo,
+                               lapack_int ihi, const lapack_complex_double* a, lapack_int lda,
+                               const lapack_complex_double* tau, lapack_complex_double* c, lapack_int ldc,
+                               lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_cunmlq_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
+                               lapack_complex_float* c, lapack_int ldc, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zunmlq_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
+                               lapack_complex_double* c, lapack_int ldc, lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_cunmql_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
+                               lapack_complex_float* c, lapack_int ldc, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zunmql_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
+                               lapack_complex_double* c, lapack_int ldc, lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_cunmqr_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
+                               lapack_complex_float* c, lapack_int ldc, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zunmqr_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
+                               lapack_complex_double* c, lapack_int ldc, lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_cunmrq_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
+                               lapack_complex_float* c, lapack_int ldc, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zunmrq_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
+                               lapack_complex_double* c, lapack_int ldc, lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_cunmrz_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               lapack_int l, const lapack_complex_float* a, lapack_int lda,
+                               const lapack_complex_float* tau, lapack_complex_float* c, lapack_int ldc,
+                               lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zunmrz_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                               lapack_int l, const lapack_complex_double* a, lapack_int lda,
+                               const lapack_complex_double* tau, lapack_complex_double* c, lapack_int ldc,
+                               lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_cunmtr_work(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
+                               const lapack_complex_float* a, lapack_int lda, const lapack_complex_float* tau,
+                               lapack_complex_float* c, lapack_int ldc, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_zunmtr_work(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
+                               const lapack_complex_double* a, lapack_int lda, const lapack_complex_double* tau,
+                               lapack_complex_double* c, lapack_int ldc, lapack_complex_double* work, lapack_int lwork);
+
+lapack_int LAPACKE_cupgtr_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_float* ap,
+                               const lapack_complex_float* tau, lapack_complex_float* q, lapack_int ldq,
+                               lapack_complex_float* work);
+lapack_int LAPACKE_zupgtr_work(int matrix_order, char uplo, lapack_int n, const lapack_complex_double* ap,
+                               const lapack_complex_double* tau, lapack_complex_double* q, lapack_int ldq,
+                               lapack_complex_double* work);
+
+lapack_int LAPACKE_cupmtr_work(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
+                               const lapack_complex_float* ap, const lapack_complex_float* tau, lapack_complex_float* c,
+                               lapack_int ldc, lapack_complex_float* work);
+lapack_int LAPACKE_zupmtr_work(int matrix_order, char side, char uplo, char trans, lapack_int m, lapack_int n,
+                               const lapack_complex_double* ap, const lapack_complex_double* tau,
+                               lapack_complex_double* c, lapack_int ldc, lapack_complex_double* work);
+
+lapack_int LAPACKE_claghe(int matrix_order, lapack_int n, lapack_int k, const float* d, lapack_complex_float* a,
+                          lapack_int lda, lapack_int* iseed);
+lapack_int LAPACKE_zlaghe(int matrix_order, lapack_int n, lapack_int k, const double* d, lapack_complex_double* a,
+                          lapack_int lda, lapack_int* iseed);
+
+lapack_int LAPACKE_slagsy(int matrix_order, lapack_int n, lapack_int k, const float* d, float* a, lapack_int lda,
+                          lapack_int* iseed);
+lapack_int LAPACKE_dlagsy(int matrix_order, lapack_int n, lapack_int k, const double* d, double* a, lapack_int lda,
+                          lapack_int* iseed);
+lapack_int LAPACKE_clagsy(int matrix_order, lapack_int n, lapack_int k, const float* d, lapack_complex_float* a,
+                          lapack_int lda, lapack_int* iseed);
+lapack_int LAPACKE_zlagsy(int matrix_order, lapack_int n, lapack_int k, const double* d, lapack_complex_double* a,
+                          lapack_int lda, lapack_int* iseed);
+
+lapack_int LAPACKE_slapmr(int matrix_order, lapack_logical forwrd, lapack_int m, lapack_int n, float* x, lapack_int ldx,
+                          lapack_int* k);
+lapack_int LAPACKE_dlapmr(int matrix_order, lapack_logical forwrd, lapack_int m, lapack_int n, double* x,
+                          lapack_int ldx, lapack_int* k);
+lapack_int LAPACKE_clapmr(int matrix_order, lapack_logical forwrd, lapack_int m, lapack_int n, lapack_complex_float* x,
+                          lapack_int ldx, lapack_int* k);
+lapack_int LAPACKE_zlapmr(int matrix_order, lapack_logical forwrd, lapack_int m, lapack_int n, lapack_complex_double* x,
+                          lapack_int ldx, lapack_int* k);
+
+float LAPACKE_slapy2(float x, float y);
+double LAPACKE_dlapy2(double x, double y);
+
+float LAPACKE_slapy3(float x, float y, float z);
+double LAPACKE_dlapy3(double x, double y, double z);
+
+lapack_int LAPACKE_slartgp(float f, float g, float* cs, float* sn, float* r);
+lapack_int LAPACKE_dlartgp(double f, double g, double* cs, double* sn, double* r);
+
+lapack_int LAPACKE_slartgs(float x, float y, float sigma, float* cs, float* sn);
+lapack_int LAPACKE_dlartgs(double x, double y, double sigma, double* cs, double* sn);
+
+// LAPACK 3.3.0
+lapack_int LAPACKE_cbbcsd(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans, lapack_int m,
+                          lapack_int p, lapack_int q, float* theta, float* phi, lapack_complex_float* u1,
+                          lapack_int ldu1, lapack_complex_float* u2, lapack_int ldu2, lapack_complex_float* v1t,
+                          lapack_int ldv1t, lapack_complex_float* v2t, lapack_int ldv2t, float* b11d, float* b11e,
+                          float* b12d, float* b12e, float* b21d, float* b21e, float* b22d, float* b22e);
+lapack_int LAPACKE_cbbcsd_work(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
+                               lapack_int m, lapack_int p, lapack_int q, float* theta, float* phi,
+                               lapack_complex_float* u1, lapack_int ldu1, lapack_complex_float* u2, lapack_int ldu2,
+                               lapack_complex_float* v1t, lapack_int ldv1t, lapack_complex_float* v2t, lapack_int ldv2t,
+                               float* b11d, float* b11e, float* b12d, float* b12e, float* b21d, float* b21e,
+                               float* b22d, float* b22e, float* rwork, lapack_int lrwork);
+lapack_int LAPACKE_cheswapr(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int i1,
+                            lapack_int i2);
+lapack_int LAPACKE_cheswapr_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int i1,
+                                 lapack_int i2);
+lapack_int LAPACKE_chetri2(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                           const lapack_int* ipiv);
+lapack_int LAPACKE_chetri2_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                                const lapack_int* ipiv, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_chetri2x(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                            const lapack_int* ipiv, lapack_int nb);
+lapack_int LAPACKE_chetri2x_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                                 const lapack_int* ipiv, lapack_complex_float* work, lapack_int nb);
+lapack_int LAPACKE_chetrs2(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_chetrs2_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_float* b, lapack_int ldb, lapack_complex_float* work);
+lapack_int LAPACKE_csyconv(int matrix_order, char uplo, char way, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                           const lapack_int* ipiv);
+lapack_int LAPACKE_csyconv_work(int matrix_order, char uplo, char way, lapack_int n, lapack_complex_float* a,
+                                lapack_int lda, const lapack_int* ipiv, lapack_complex_float* work);
+lapack_int LAPACKE_csyswapr(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int i1,
+                            lapack_int i2);
+lapack_int LAPACKE_csyswapr_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int i1,
+                                 lapack_int i2);
+lapack_int LAPACKE_csytri2(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                           const lapack_int* ipiv);
+lapack_int LAPACKE_csytri2_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                                const lapack_int* ipiv, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_csytri2x(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                            const lapack_int* ipiv, lapack_int nb);
+lapack_int LAPACKE_csytri2x_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                                 const lapack_int* ipiv, lapack_complex_float* work, lapack_int nb);
+lapack_int LAPACKE_csytrs2(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_float* a,
+                           lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_csytrs2_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_float* b, lapack_int ldb, lapack_complex_float* work);
+lapack_int LAPACKE_cunbdb(int matrix_order, char trans, char signs, lapack_int m, lapack_int p, lapack_int q,
+                          lapack_complex_float* x11, lapack_int ldx11, lapack_complex_float* x12, lapack_int ldx12,
+                          lapack_complex_float* x21, lapack_int ldx21, lapack_complex_float* x22, lapack_int ldx22,
+                          float* theta, float* phi, lapack_complex_float* taup1, lapack_complex_float* taup2,
+                          lapack_complex_float* tauq1, lapack_complex_float* tauq2);
+lapack_int LAPACKE_cunbdb_work(int matrix_order, char trans, char signs, lapack_int m, lapack_int p, lapack_int q,
+                               lapack_complex_float* x11, lapack_int ldx11, lapack_complex_float* x12, lapack_int ldx12,
+                               lapack_complex_float* x21, lapack_int ldx21, lapack_complex_float* x22, lapack_int ldx22,
+                               float* theta, float* phi, lapack_complex_float* taup1, lapack_complex_float* taup2,
+                               lapack_complex_float* tauq1, lapack_complex_float* tauq2, lapack_complex_float* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_cuncsd(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans, char signs,
+                          lapack_int m, lapack_int p, lapack_int q, lapack_complex_float* x11, lapack_int ldx11,
+                          lapack_complex_float* x12, lapack_int ldx12, lapack_complex_float* x21, lapack_int ldx21,
+                          lapack_complex_float* x22, lapack_int ldx22, float* theta, lapack_complex_float* u1,
+                          lapack_int ldu1, lapack_complex_float* u2, lapack_int ldu2, lapack_complex_float* v1t,
+                          lapack_int ldv1t, lapack_complex_float* v2t, lapack_int ldv2t);
+lapack_int LAPACKE_cuncsd_work(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
+                               char signs, lapack_int m, lapack_int p, lapack_int q, lapack_complex_float* x11,
+                               lapack_int ldx11, lapack_complex_float* x12, lapack_int ldx12, lapack_complex_float* x21,
+                               lapack_int ldx21, lapack_complex_float* x22, lapack_int ldx22, float* theta,
+                               lapack_complex_float* u1, lapack_int ldu1, lapack_complex_float* u2, lapack_int ldu2,
+                               lapack_complex_float* v1t, lapack_int ldv1t, lapack_complex_float* v2t, lapack_int ldv2t,
+                               lapack_complex_float* work, lapack_int lwork, float* rwork, lapack_int lrwork,
+                               lapack_int* iwork);
+lapack_int LAPACKE_dbbcsd(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans, lapack_int m,
+                          lapack_int p, lapack_int q, double* theta, double* phi, double* u1, lapack_int ldu1,
+                          double* u2, lapack_int ldu2, double* v1t, lapack_int ldv1t, double* v2t, lapack_int ldv2t,
+                          double* b11d, double* b11e, double* b12d, double* b12e, double* b21d, double* b21e,
+                          double* b22d, double* b22e);
+lapack_int LAPACKE_dbbcsd_work(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
+                               lapack_int m, lapack_int p, lapack_int q, double* theta, double* phi, double* u1,
+                               lapack_int ldu1, double* u2, lapack_int ldu2, double* v1t, lapack_int ldv1t, double* v2t,
+                               lapack_int ldv2t, double* b11d, double* b11e, double* b12d, double* b12e, double* b21d,
+                               double* b21e, double* b22d, double* b22e, double* work, lapack_int lwork);
+lapack_int LAPACKE_dorbdb(int matrix_order, char trans, char signs, lapack_int m, lapack_int p, lapack_int q,
+                          double* x11, lapack_int ldx11, double* x12, lapack_int ldx12, double* x21, lapack_int ldx21,
+                          double* x22, lapack_int ldx22, double* theta, double* phi, double* taup1, double* taup2,
+                          double* tauq1, double* tauq2);
+lapack_int LAPACKE_dorbdb_work(int matrix_order, char trans, char signs, lapack_int m, lapack_int p, lapack_int q,
+                               double* x11, lapack_int ldx11, double* x12, lapack_int ldx12, double* x21,
+                               lapack_int ldx21, double* x22, lapack_int ldx22, double* theta, double* phi,
+                               double* taup1, double* taup2, double* tauq1, double* tauq2, double* work,
+                               lapack_int lwork);
+lapack_int LAPACKE_dorcsd(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans, char signs,
+                          lapack_int m, lapack_int p, lapack_int q, double* x11, lapack_int ldx11, double* x12,
+                          lapack_int ldx12, double* x21, lapack_int ldx21, double* x22, lapack_int ldx22, double* theta,
+                          double* u1, lapack_int ldu1, double* u2, lapack_int ldu2, double* v1t, lapack_int ldv1t,
+                          double* v2t, lapack_int ldv2t);
+lapack_int LAPACKE_dorcsd_work(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
+                               char signs, lapack_int m, lapack_int p, lapack_int q, double* x11, lapack_int ldx11,
+                               double* x12, lapack_int ldx12, double* x21, lapack_int ldx21, double* x22,
+                               lapack_int ldx22, double* theta, double* u1, lapack_int ldu1, double* u2,
+                               lapack_int ldu2, double* v1t, lapack_int ldv1t, double* v2t, lapack_int ldv2t,
+                               double* work, lapack_int lwork, lapack_int* iwork);
+lapack_int LAPACKE_dsyconv(int matrix_order, char uplo, char way, lapack_int n, double* a, lapack_int lda,
+                           const lapack_int* ipiv);
+lapack_int LAPACKE_dsyconv_work(int matrix_order, char uplo, char way, lapack_int n, double* a, lapack_int lda,
+                                const lapack_int* ipiv, double* work);
+lapack_int LAPACKE_dsyswapr(int matrix_order, char uplo, lapack_int n, double* a, lapack_int i1, lapack_int i2);
+lapack_int LAPACKE_dsyswapr_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int i1, lapack_int i2);
+lapack_int LAPACKE_dsytri2(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda,
+                           const lapack_int* ipiv);
+lapack_int LAPACKE_dsytri2_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda,
+                                const lapack_int* ipiv, lapack_complex_double* work, lapack_int lwork);
+lapack_int LAPACKE_dsytri2x(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda,
+                            const lapack_int* ipiv, lapack_int nb);
+lapack_int LAPACKE_dsytri2x_work(int matrix_order, char uplo, lapack_int n, double* a, lapack_int lda,
+                                 const lapack_int* ipiv, double* work, lapack_int nb);
+lapack_int LAPACKE_dsytrs2(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a, lapack_int lda,
+                           const lapack_int* ipiv, double* b, lapack_int ldb);
+lapack_int LAPACKE_dsytrs2_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const double* a,
+                                lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb, double* work);
+lapack_int LAPACKE_sbbcsd(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans, lapack_int m,
+                          lapack_int p, lapack_int q, float* theta, float* phi, float* u1, lapack_int ldu1, float* u2,
+                          lapack_int ldu2, float* v1t, lapack_int ldv1t, float* v2t, lapack_int ldv2t, float* b11d,
+                          float* b11e, float* b12d, float* b12e, float* b21d, float* b21e, float* b22d, float* b22e);
+lapack_int LAPACKE_sbbcsd_work(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
+                               lapack_int m, lapack_int p, lapack_int q, float* theta, float* phi, float* u1,
+                               lapack_int ldu1, float* u2, lapack_int ldu2, float* v1t, lapack_int ldv1t, float* v2t,
+                               lapack_int ldv2t, float* b11d, float* b11e, float* b12d, float* b12e, float* b21d,
+                               float* b21e, float* b22d, float* b22e, float* work, lapack_int lwork);
+lapack_int LAPACKE_sorbdb(int matrix_order, char trans, char signs, lapack_int m, lapack_int p, lapack_int q,
+                          float* x11, lapack_int ldx11, float* x12, lapack_int ldx12, float* x21, lapack_int ldx21,
+                          float* x22, lapack_int ldx22, float* theta, float* phi, float* taup1, float* taup2,
+                          float* tauq1, float* tauq2);
+lapack_int LAPACKE_sorbdb_work(int matrix_order, char trans, char signs, lapack_int m, lapack_int p, lapack_int q,
+                               float* x11, lapack_int ldx11, float* x12, lapack_int ldx12, float* x21, lapack_int ldx21,
+                               float* x22, lapack_int ldx22, float* theta, float* phi, float* taup1, float* taup2,
+                               float* tauq1, float* tauq2, float* work, lapack_int lwork);
+lapack_int LAPACKE_sorcsd(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans, char signs,
+                          lapack_int m, lapack_int p, lapack_int q, float* x11, lapack_int ldx11, float* x12,
+                          lapack_int ldx12, float* x21, lapack_int ldx21, float* x22, lapack_int ldx22, float* theta,
+                          float* u1, lapack_int ldu1, float* u2, lapack_int ldu2, float* v1t, lapack_int ldv1t,
+                          float* v2t, lapack_int ldv2t);
+lapack_int LAPACKE_sorcsd_work(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
+                               char signs, lapack_int m, lapack_int p, lapack_int q, float* x11, lapack_int ldx11,
+                               float* x12, lapack_int ldx12, float* x21, lapack_int ldx21, float* x22, lapack_int ldx22,
+                               float* theta, float* u1, lapack_int ldu1, float* u2, lapack_int ldu2, float* v1t,
+                               lapack_int ldv1t, float* v2t, lapack_int ldv2t, float* work, lapack_int lwork,
+                               lapack_int* iwork);
+lapack_int LAPACKE_ssyconv(int matrix_order, char uplo, char way, lapack_int n, float* a, lapack_int lda,
+                           const lapack_int* ipiv);
+lapack_int LAPACKE_ssyconv_work(int matrix_order, char uplo, char way, lapack_int n, float* a, lapack_int lda,
+                                const lapack_int* ipiv, float* work);
+lapack_int LAPACKE_ssyswapr(int matrix_order, char uplo, lapack_int n, float* a, lapack_int i1, lapack_int i2);
+lapack_int LAPACKE_ssyswapr_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int i1, lapack_int i2);
+lapack_int LAPACKE_ssytri2(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, const lapack_int* ipiv);
+lapack_int LAPACKE_ssytri2_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda,
+                                const lapack_int* ipiv, lapack_complex_float* work, lapack_int lwork);
+lapack_int LAPACKE_ssytri2x(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda, const lapack_int* ipiv,
+                            lapack_int nb);
+lapack_int LAPACKE_ssytri2x_work(int matrix_order, char uplo, lapack_int n, float* a, lapack_int lda,
+                                 const lapack_int* ipiv, float* work, lapack_int nb);
+lapack_int LAPACKE_ssytrs2(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a, lapack_int lda,
+                           const lapack_int* ipiv, float* b, lapack_int ldb);
+lapack_int LAPACKE_ssytrs2_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const float* a,
+                                lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb, float* work);
+lapack_int LAPACKE_zbbcsd(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans, lapack_int m,
+                          lapack_int p, lapack_int q, double* theta, double* phi, lapack_complex_double* u1,
+                          lapack_int ldu1, lapack_complex_double* u2, lapack_int ldu2, lapack_complex_double* v1t,
+                          lapack_int ldv1t, lapack_complex_double* v2t, lapack_int ldv2t, double* b11d, double* b11e,
+                          double* b12d, double* b12e, double* b21d, double* b21e, double* b22d, double* b22e);
+lapack_int LAPACKE_zbbcsd_work(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
+                               lapack_int m, lapack_int p, lapack_int q, double* theta, double* phi,
+                               lapack_complex_double* u1, lapack_int ldu1, lapack_complex_double* u2, lapack_int ldu2,
+                               lapack_complex_double* v1t, lapack_int ldv1t, lapack_complex_double* v2t,
+                               lapack_int ldv2t, double* b11d, double* b11e, double* b12d, double* b12e, double* b21d,
+                               double* b21e, double* b22d, double* b22e, double* rwork, lapack_int lrwork);
+lapack_int LAPACKE_zheswapr(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int i1,
+                            lapack_int i2);
+lapack_int LAPACKE_zheswapr_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int i1,
+                                 lapack_int i2);
+lapack_int LAPACKE_zhetri2(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                           const lapack_int* ipiv);
+lapack_int LAPACKE_zhetri2_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                                const lapack_int* ipiv, lapack_complex_double* work, lapack_int lwork);
+lapack_int LAPACKE_zhetri2x(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                            const lapack_int* ipiv, lapack_int nb);
+lapack_int LAPACKE_zhetri2x_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                                 const lapack_int* ipiv, lapack_complex_double* work, lapack_int nb);
+lapack_int LAPACKE_zhetrs2(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
+lapack_int LAPACKE_zhetrs2_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work);
+lapack_int LAPACKE_zsyconv(int matrix_order, char uplo, char way, lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, const lapack_int* ipiv);
+lapack_int LAPACKE_zsyconv_work(int matrix_order, char uplo, char way, lapack_int n, lapack_complex_double* a,
+                                lapack_int lda, const lapack_int* ipiv, lapack_complex_double* work);
+lapack_int LAPACKE_zsyswapr(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int i1,
+                            lapack_int i2);
+lapack_int LAPACKE_zsyswapr_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int i1,
+                                 lapack_int i2);
+lapack_int LAPACKE_zsytri2(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                           const lapack_int* ipiv);
+lapack_int LAPACKE_zsytri2_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                                const lapack_int* ipiv, lapack_complex_double* work, lapack_int lwork);
+lapack_int LAPACKE_zsytri2x(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                            const lapack_int* ipiv, lapack_int nb);
+lapack_int LAPACKE_zsytri2x_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                                 const lapack_int* ipiv, lapack_complex_double* work, lapack_int nb);
+lapack_int LAPACKE_zsytrs2(int matrix_order, char uplo, lapack_int n, lapack_int nrhs, const lapack_complex_double* a,
+                           lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb);
+lapack_int LAPACKE_zsytrs2_work(int matrix_order, char uplo, lapack_int n, lapack_int nrhs,
+                                const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv,
+                                lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work);
+lapack_int LAPACKE_zunbdb(int matrix_order, char trans, char signs, lapack_int m, lapack_int p, lapack_int q,
+                          lapack_complex_double* x11, lapack_int ldx11, lapack_complex_double* x12, lapack_int ldx12,
+                          lapack_complex_double* x21, lapack_int ldx21, lapack_complex_double* x22, lapack_int ldx22,
+                          double* theta, double* phi, lapack_complex_double* taup1, lapack_complex_double* taup2,
+                          lapack_complex_double* tauq1, lapack_complex_double* tauq2);
+lapack_int LAPACKE_zunbdb_work(int matrix_order, char trans, char signs, lapack_int m, lapack_int p, lapack_int q,
+                               lapack_complex_double* x11, lapack_int ldx11, lapack_complex_double* x12,
+                               lapack_int ldx12, lapack_complex_double* x21, lapack_int ldx21,
+                               lapack_complex_double* x22, lapack_int ldx22, double* theta, double* phi,
+                               lapack_complex_double* taup1, lapack_complex_double* taup2, lapack_complex_double* tauq1,
+                               lapack_complex_double* tauq2, lapack_complex_double* work, lapack_int lwork);
+lapack_int LAPACKE_zuncsd(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans, char signs,
+                          lapack_int m, lapack_int p, lapack_int q, lapack_complex_double* x11, lapack_int ldx11,
+                          lapack_complex_double* x12, lapack_int ldx12, lapack_complex_double* x21, lapack_int ldx21,
+                          lapack_complex_double* x22, lapack_int ldx22, double* theta, lapack_complex_double* u1,
+                          lapack_int ldu1, lapack_complex_double* u2, lapack_int ldu2, lapack_complex_double* v1t,
+                          lapack_int ldv1t, lapack_complex_double* v2t, lapack_int ldv2t);
+lapack_int LAPACKE_zuncsd_work(int matrix_order, char jobu1, char jobu2, char jobv1t, char jobv2t, char trans,
+                               char signs, lapack_int m, lapack_int p, lapack_int q, lapack_complex_double* x11,
+                               lapack_int ldx11, lapack_complex_double* x12, lapack_int ldx12,
+                               lapack_complex_double* x21, lapack_int ldx21, lapack_complex_double* x22,
+                               lapack_int ldx22, double* theta, lapack_complex_double* u1, lapack_int ldu1,
+                               lapack_complex_double* u2, lapack_int ldu2, lapack_complex_double* v1t, lapack_int ldv1t,
+                               lapack_complex_double* v2t, lapack_int ldv2t, lapack_complex_double* work,
+                               lapack_int lwork, double* rwork, lapack_int lrwork, lapack_int* iwork);
+// LAPACK 3.4.0
+lapack_int LAPACKE_sgemqrt(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int nb, const float* v, lapack_int ldv, const float* t, lapack_int ldt, float* c,
+                           lapack_int ldc);
+lapack_int LAPACKE_dgemqrt(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int nb, const double* v, lapack_int ldv, const double* t, lapack_int ldt, double* c,
+                           lapack_int ldc);
+lapack_int LAPACKE_cgemqrt(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int nb, const lapack_complex_float* v, lapack_int ldv, const lapack_complex_float* t,
+                           lapack_int ldt, lapack_complex_float* c, lapack_int ldc);
+lapack_int LAPACKE_zgemqrt(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int nb, const lapack_complex_double* v, lapack_int ldv,
+                           const lapack_complex_double* t, lapack_int ldt, lapack_complex_double* c, lapack_int ldc);
+
+lapack_int LAPACKE_sgeqrt(int matrix_order, lapack_int m, lapack_int n, lapack_int nb, float* a, lapack_int lda,
+                          float* t, lapack_int ldt);
+lapack_int LAPACKE_dgeqrt(int matrix_order, lapack_int m, lapack_int n, lapack_int nb, double* a, lapack_int lda,
+                          double* t, lapack_int ldt);
+lapack_int LAPACKE_cgeqrt(int matrix_order, lapack_int m, lapack_int n, lapack_int nb, lapack_complex_float* a,
+                          lapack_int lda, lapack_complex_float* t, lapack_int ldt);
+lapack_int LAPACKE_zgeqrt(int matrix_order, lapack_int m, lapack_int n, lapack_int nb, lapack_complex_double* a,
+                          lapack_int lda, lapack_complex_double* t, lapack_int ldt);
+
+lapack_int LAPACKE_sgeqrt2(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* t,
+                           lapack_int ldt);
+lapack_int LAPACKE_dgeqrt2(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* t,
+                           lapack_int ldt);
+lapack_int LAPACKE_cgeqrt2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* t, lapack_int ldt);
+lapack_int LAPACKE_zgeqrt2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* t, lapack_int ldt);
+
+lapack_int LAPACKE_sgeqrt3(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* t,
+                           lapack_int ldt);
+lapack_int LAPACKE_dgeqrt3(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* t,
+                           lapack_int ldt);
+lapack_int LAPACKE_cgeqrt3(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* t, lapack_int ldt);
+lapack_int LAPACKE_zgeqrt3(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* t, lapack_int ldt);
+
+lapack_int LAPACKE_stpmqrt(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int l, lapack_int nb, const float* v, lapack_int ldv, const float* t, lapack_int ldt,
+                           float* a, lapack_int lda, float* b, lapack_int ldb);
+lapack_int LAPACKE_dtpmqrt(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int l, lapack_int nb, const double* v, lapack_int ldv, const double* t,
+                           lapack_int ldt, double* a, lapack_int lda, double* b, lapack_int ldb);
+lapack_int LAPACKE_ctpmqrt(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int l, lapack_int nb, const lapack_complex_float* v, lapack_int ldv,
+                           const lapack_complex_float* t, lapack_int ldt, lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb);
+lapack_int LAPACKE_ztpmqrt(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                           lapack_int l, lapack_int nb, const lapack_complex_double* v, lapack_int ldv,
+                           const lapack_complex_double* t, lapack_int ldt, lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb);
+
+lapack_int LAPACKE_dtpqrt(int matrix_order, lapack_int m, lapack_int n, lapack_int l, lapack_int nb, double* a,
+                          lapack_int lda, double* b, lapack_int ldb, double* t, lapack_int ldt);
+lapack_int LAPACKE_ctpqrt(int matrix_order, lapack_int m, lapack_int n, lapack_int l, lapack_int nb,
+                          lapack_complex_float* a, lapack_int lda, lapack_complex_float* t, lapack_complex_float* b,
+                          lapack_int ldb, lapack_int ldt);
+lapack_int LAPACKE_ztpqrt(int matrix_order, lapack_int m, lapack_int n, lapack_int l, lapack_int nb,
+                          lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
+                          lapack_complex_double* t, lapack_int ldt);
+
+lapack_int LAPACKE_stpqrt2(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* b,
+                           lapack_int ldb, float* t, lapack_int ldt);
+lapack_int LAPACKE_dtpqrt2(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* b,
+                           lapack_int ldb, double* t, lapack_int ldt);
+lapack_int LAPACKE_ctpqrt2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb, lapack_complex_float* t, lapack_int ldt);
+lapack_int LAPACKE_ztpqrt2(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb, lapack_complex_double* t, lapack_int ldt);
+
+lapack_int LAPACKE_stprfb(int matrix_order, char side, char trans, char direct, char storev, lapack_int m, lapack_int n,
+                          lapack_int k, lapack_int l, const float* v, lapack_int ldv, const float* t, lapack_int ldt,
+                          float* a, lapack_int lda, float* b, lapack_int ldb, lapack_int myldwork);
+lapack_int LAPACKE_dtprfb(int matrix_order, char side, char trans, char direct, char storev, lapack_int m, lapack_int n,
+                          lapack_int k, lapack_int l, const double* v, lapack_int ldv, const double* t, lapack_int ldt,
+                          double* a, lapack_int lda, double* b, lapack_int ldb, lapack_int myldwork);
+lapack_int LAPACKE_ctprfb(int matrix_order, char side, char trans, char direct, char storev, lapack_int m, lapack_int n,
+                          lapack_int k, lapack_int l, const lapack_complex_float* v, lapack_int ldv,
+                          const lapack_complex_float* t, lapack_int ldt, lapack_complex_float* a, lapack_int lda,
+                          lapack_complex_float* b, lapack_int ldb, lapack_int myldwork);
+lapack_int LAPACKE_ztprfb(int matrix_order, char side, char trans, char direct, char storev, lapack_int m, lapack_int n,
+                          lapack_int k, lapack_int l, const lapack_complex_double* v, lapack_int ldv,
+                          const lapack_complex_double* t, lapack_int ldt, lapack_complex_double* a, lapack_int lda,
+                          lapack_complex_double* b, lapack_int ldb, lapack_int myldwork);
+
+lapack_int LAPACKE_sgemqrt_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int nb, const float* v, lapack_int ldv, const float* t, lapack_int ldt, float* c,
+                                lapack_int ldc, float* work);
+lapack_int LAPACKE_dgemqrt_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int nb, const double* v, lapack_int ldv, const double* t, lapack_int ldt,
+                                double* c, lapack_int ldc, double* work);
+lapack_int LAPACKE_cgemqrt_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int nb, const lapack_complex_float* v, lapack_int ldv,
+                                const lapack_complex_float* t, lapack_int ldt, lapack_complex_float* c, lapack_int ldc,
+                                lapack_complex_float* work);
+lapack_int LAPACKE_zgemqrt_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int nb, const lapack_complex_double* v, lapack_int ldv,
+                                const lapack_complex_double* t, lapack_int ldt, lapack_complex_double* c,
+                                lapack_int ldc, lapack_complex_double* work);
+
+lapack_int LAPACKE_sgeqrt_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nb, float* a, lapack_int lda,
+                               float* t, lapack_int ldt, float* work);
+lapack_int LAPACKE_dgeqrt_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nb, double* a, lapack_int lda,
+                               double* t, lapack_int ldt, double* work);
+lapack_int LAPACKE_cgeqrt_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nb, lapack_complex_float* a,
+                               lapack_int lda, lapack_complex_float* t, lapack_int ldt, lapack_complex_float* work);
+lapack_int LAPACKE_zgeqrt_work(int matrix_order, lapack_int m, lapack_int n, lapack_int nb, lapack_complex_double* a,
+                               lapack_int lda, lapack_complex_double* t, lapack_int ldt, lapack_complex_double* work);
+
+lapack_int LAPACKE_sgeqrt2_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* t,
+                                lapack_int ldt);
+lapack_int LAPACKE_dgeqrt2_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* t,
+                                lapack_int ldt);
+lapack_int LAPACKE_cgeqrt2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* t, lapack_int ldt);
+lapack_int LAPACKE_zgeqrt2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* t, lapack_int ldt);
+
+lapack_int LAPACKE_sgeqrt3_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* t,
+                                lapack_int ldt);
+lapack_int LAPACKE_dgeqrt3_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* t,
+                                lapack_int ldt);
+lapack_int LAPACKE_cgeqrt3_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* t, lapack_int ldt);
+lapack_int LAPACKE_zgeqrt3_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* t, lapack_int ldt);
+
+lapack_int LAPACKE_stpmqrt_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int l, lapack_int nb, const float* v, lapack_int ldv, const float* t,
+                                lapack_int ldt, float* a, lapack_int lda, float* b, lapack_int ldb, float* work);
+lapack_int LAPACKE_dtpmqrt_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int l, lapack_int nb, const double* v, lapack_int ldv, const double* t,
+                                lapack_int ldt, double* a, lapack_int lda, double* b, lapack_int ldb, double* work);
+lapack_int LAPACKE_ctpmqrt_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int l, lapack_int nb, const lapack_complex_float* v, lapack_int ldv,
+                                const lapack_complex_float* t, lapack_int ldt, lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb, lapack_complex_float* work);
+lapack_int LAPACKE_ztpmqrt_work(int matrix_order, char side, char trans, lapack_int m, lapack_int n, lapack_int k,
+                                lapack_int l, lapack_int nb, const lapack_complex_double* v, lapack_int ldv,
+                                const lapack_complex_double* t, lapack_int ldt, lapack_complex_double* a,
+                                lapack_int lda, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work);
+
+lapack_int LAPACKE_dtpqrt_work(int matrix_order, lapack_int m, lapack_int n, lapack_int l, lapack_int nb, double* a,
+                               lapack_int lda, double* b, lapack_int ldb, double* t, lapack_int ldt, double* work);
+lapack_int LAPACKE_ctpqrt_work(int matrix_order, lapack_int m, lapack_int n, lapack_int l, lapack_int nb,
+                               lapack_complex_float* a, lapack_int lda, lapack_complex_float* t,
+                               lapack_complex_float* b, lapack_int ldb, lapack_int ldt, lapack_complex_float* work);
+lapack_int LAPACKE_ztpqrt_work(int matrix_order, lapack_int m, lapack_int n, lapack_int l, lapack_int nb,
+                               lapack_complex_double* a, lapack_int lda, lapack_complex_double* b, lapack_int ldb,
+                               lapack_complex_double* t, lapack_int ldt, lapack_complex_double* work);
+
+lapack_int LAPACKE_stpqrt2_work(int matrix_order, lapack_int m, lapack_int n, float* a, lapack_int lda, float* b,
+                                lapack_int ldb, float* t, lapack_int ldt);
+lapack_int LAPACKE_dtpqrt2_work(int matrix_order, lapack_int m, lapack_int n, double* a, lapack_int lda, double* b,
+                                lapack_int ldb, double* t, lapack_int ldt);
+lapack_int LAPACKE_ctpqrt2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_float* a, lapack_int lda,
+                                lapack_complex_float* b, lapack_int ldb, lapack_complex_float* t, lapack_int ldt);
+lapack_int LAPACKE_ztpqrt2_work(int matrix_order, lapack_int m, lapack_int n, lapack_complex_double* a, lapack_int lda,
+                                lapack_complex_double* b, lapack_int ldb, lapack_complex_double* t, lapack_int ldt);
+
+lapack_int LAPACKE_stprfb_work(int matrix_order, char side, char trans, char direct, char storev, lapack_int m,
+                               lapack_int n, lapack_int k, lapack_int l, const float* v, lapack_int ldv, const float* t,
+                               lapack_int ldt, float* a, lapack_int lda, float* b, lapack_int ldb, const float* mywork,
+                               lapack_int myldwork);
+lapack_int LAPACKE_dtprfb_work(int matrix_order, char side, char trans, char direct, char storev, lapack_int m,
+                               lapack_int n, lapack_int k, lapack_int l, const double* v, lapack_int ldv,
+                               const double* t, lapack_int ldt, double* a, lapack_int lda, double* b, lapack_int ldb,
+                               const double* mywork, lapack_int myldwork);
+lapack_int LAPACKE_ctprfb_work(int matrix_order, char side, char trans, char direct, char storev, lapack_int m,
+                               lapack_int n, lapack_int k, lapack_int l, const lapack_complex_float* v, lapack_int ldv,
+                               const lapack_complex_float* t, lapack_int ldt, lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* b, lapack_int ldb, const float* mywork, lapack_int myldwork);
+lapack_int LAPACKE_ztprfb_work(int matrix_order, char side, char trans, char direct, char storev, lapack_int m,
+                               lapack_int n, lapack_int k, lapack_int l, const lapack_complex_double* v, lapack_int ldv,
+                               const lapack_complex_double* t, lapack_int ldt, lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* b, lapack_int ldb, const double* mywork, lapack_int myldwork);
+// LAPACK 3.X.X
+lapack_int LAPACKE_csyr(int matrix_order, char uplo, lapack_int n, lapack_complex_float alpha,
+                        const lapack_complex_float* x, lapack_int incx, lapack_complex_float* a, lapack_int lda);
+lapack_int LAPACKE_zsyr(int matrix_order, char uplo, lapack_int n, lapack_complex_double alpha,
+                        const lapack_complex_double* x, lapack_int incx, lapack_complex_double* a, lapack_int lda);
+
+lapack_int LAPACKE_csyr_work(int matrix_order, char uplo, lapack_int n, lapack_complex_float alpha,
+                             const lapack_complex_float* x, lapack_int incx, lapack_complex_float* a, lapack_int lda);
+lapack_int LAPACKE_zsyr_work(int matrix_order, char uplo, lapack_int n, lapack_complex_double alpha,
+                             const lapack_complex_double* x, lapack_int incx, lapack_complex_double* a, lapack_int lda);
+
+#define LAPACK_sgetrf LAPACK_GLOBAL(sgetrf, SGETRF)
+#define LAPACK_dgetrf LAPACK_GLOBAL(dgetrf, DGETRF)
+#define LAPACK_cgetrf LAPACK_GLOBAL(cgetrf, CGETRF)
+#define LAPACK_zgetrf LAPACK_GLOBAL(zgetrf, ZGETRF)
+#define LAPACK_sgbtrf LAPACK_GLOBAL(sgbtrf, SGBTRF)
+#define LAPACK_dgbtrf LAPACK_GLOBAL(dgbtrf, DGBTRF)
+#define LAPACK_cgbtrf LAPACK_GLOBAL(cgbtrf, CGBTRF)
+#define LAPACK_zgbtrf LAPACK_GLOBAL(zgbtrf, ZGBTRF)
+#define LAPACK_sgttrf LAPACK_GLOBAL(sgttrf, SGTTRF)
+#define LAPACK_dgttrf LAPACK_GLOBAL(dgttrf, DGTTRF)
+#define LAPACK_cgttrf LAPACK_GLOBAL(cgttrf, CGTTRF)
+#define LAPACK_zgttrf LAPACK_GLOBAL(zgttrf, ZGTTRF)
+#define LAPACK_spotrf LAPACK_GLOBAL(spotrf, SPOTRF)
+#define LAPACK_dpotrf LAPACK_GLOBAL(dpotrf, DPOTRF)
+#define LAPACK_cpotrf LAPACK_GLOBAL(cpotrf, CPOTRF)
+#define LAPACK_zpotrf LAPACK_GLOBAL(zpotrf, ZPOTRF)
+#define LAPACK_dpstrf LAPACK_GLOBAL(dpstrf, DPSTRF)
+#define LAPACK_spstrf LAPACK_GLOBAL(spstrf, SPSTRF)
+#define LAPACK_zpstrf LAPACK_GLOBAL(zpstrf, ZPSTRF)
+#define LAPACK_cpstrf LAPACK_GLOBAL(cpstrf, CPSTRF)
+#define LAPACK_dpftrf LAPACK_GLOBAL(dpftrf, DPFTRF)
+#define LAPACK_spftrf LAPACK_GLOBAL(spftrf, SPFTRF)
+#define LAPACK_zpftrf LAPACK_GLOBAL(zpftrf, ZPFTRF)
+#define LAPACK_cpftrf LAPACK_GLOBAL(cpftrf, CPFTRF)
+#define LAPACK_spptrf LAPACK_GLOBAL(spptrf, SPPTRF)
+#define LAPACK_dpptrf LAPACK_GLOBAL(dpptrf, DPPTRF)
+#define LAPACK_cpptrf LAPACK_GLOBAL(cpptrf, CPPTRF)
+#define LAPACK_zpptrf LAPACK_GLOBAL(zpptrf, ZPPTRF)
+#define LAPACK_spbtrf LAPACK_GLOBAL(spbtrf, SPBTRF)
+#define LAPACK_dpbtrf LAPACK_GLOBAL(dpbtrf, DPBTRF)
+#define LAPACK_cpbtrf LAPACK_GLOBAL(cpbtrf, CPBTRF)
+#define LAPACK_zpbtrf LAPACK_GLOBAL(zpbtrf, ZPBTRF)
+#define LAPACK_spttrf LAPACK_GLOBAL(spttrf, SPTTRF)
+#define LAPACK_dpttrf LAPACK_GLOBAL(dpttrf, DPTTRF)
+#define LAPACK_cpttrf LAPACK_GLOBAL(cpttrf, CPTTRF)
+#define LAPACK_zpttrf LAPACK_GLOBAL(zpttrf, ZPTTRF)
+#define LAPACK_ssytrf LAPACK_GLOBAL(ssytrf, SSYTRF)
+#define LAPACK_dsytrf LAPACK_GLOBAL(dsytrf, DSYTRF)
+#define LAPACK_csytrf LAPACK_GLOBAL(csytrf, CSYTRF)
+#define LAPACK_zsytrf LAPACK_GLOBAL(zsytrf, ZSYTRF)
+#define LAPACK_chetrf LAPACK_GLOBAL(chetrf, CHETRF)
+#define LAPACK_zhetrf LAPACK_GLOBAL(zhetrf, ZHETRF)
+#define LAPACK_ssptrf LAPACK_GLOBAL(ssptrf, SSPTRF)
+#define LAPACK_dsptrf LAPACK_GLOBAL(dsptrf, DSPTRF)
+#define LAPACK_csptrf LAPACK_GLOBAL(csptrf, CSPTRF)
+#define LAPACK_zsptrf LAPACK_GLOBAL(zsptrf, ZSPTRF)
+#define LAPACK_chptrf LAPACK_GLOBAL(chptrf, CHPTRF)
+#define LAPACK_zhptrf LAPACK_GLOBAL(zhptrf, ZHPTRF)
+#define LAPACK_sgetrs LAPACK_GLOBAL(sgetrs, SGETRS)
+#define LAPACK_dgetrs LAPACK_GLOBAL(dgetrs, DGETRS)
+#define LAPACK_cgetrs LAPACK_GLOBAL(cgetrs, CGETRS)
+#define LAPACK_zgetrs LAPACK_GLOBAL(zgetrs, ZGETRS)
+#define LAPACK_sgbtrs LAPACK_GLOBAL(sgbtrs, SGBTRS)
+#define LAPACK_dgbtrs LAPACK_GLOBAL(dgbtrs, DGBTRS)
+#define LAPACK_cgbtrs LAPACK_GLOBAL(cgbtrs, CGBTRS)
+#define LAPACK_zgbtrs LAPACK_GLOBAL(zgbtrs, ZGBTRS)
+#define LAPACK_sgttrs LAPACK_GLOBAL(sgttrs, SGTTRS)
+#define LAPACK_dgttrs LAPACK_GLOBAL(dgttrs, DGTTRS)
+#define LAPACK_cgttrs LAPACK_GLOBAL(cgttrs, CGTTRS)
+#define LAPACK_zgttrs LAPACK_GLOBAL(zgttrs, ZGTTRS)
+#define LAPACK_spotrs LAPACK_GLOBAL(spotrs, SPOTRS)
+#define LAPACK_dpotrs LAPACK_GLOBAL(dpotrs, DPOTRS)
+#define LAPACK_cpotrs LAPACK_GLOBAL(cpotrs, CPOTRS)
+#define LAPACK_zpotrs LAPACK_GLOBAL(zpotrs, ZPOTRS)
+#define LAPACK_dpftrs LAPACK_GLOBAL(dpftrs, DPFTRS)
+#define LAPACK_spftrs LAPACK_GLOBAL(spftrs, SPFTRS)
+#define LAPACK_zpftrs LAPACK_GLOBAL(zpftrs, ZPFTRS)
+#define LAPACK_cpftrs LAPACK_GLOBAL(cpftrs, CPFTRS)
+#define LAPACK_spptrs LAPACK_GLOBAL(spptrs, SPPTRS)
+#define LAPACK_dpptrs LAPACK_GLOBAL(dpptrs, DPPTRS)
+#define LAPACK_cpptrs LAPACK_GLOBAL(cpptrs, CPPTRS)
+#define LAPACK_zpptrs LAPACK_GLOBAL(zpptrs, ZPPTRS)
+#define LAPACK_spbtrs LAPACK_GLOBAL(spbtrs, SPBTRS)
+#define LAPACK_dpbtrs LAPACK_GLOBAL(dpbtrs, DPBTRS)
+#define LAPACK_cpbtrs LAPACK_GLOBAL(cpbtrs, CPBTRS)
+#define LAPACK_zpbtrs LAPACK_GLOBAL(zpbtrs, ZPBTRS)
+#define LAPACK_spttrs LAPACK_GLOBAL(spttrs, SPTTRS)
+#define LAPACK_dpttrs LAPACK_GLOBAL(dpttrs, DPTTRS)
+#define LAPACK_cpttrs LAPACK_GLOBAL(cpttrs, CPTTRS)
+#define LAPACK_zpttrs LAPACK_GLOBAL(zpttrs, ZPTTRS)
+#define LAPACK_ssytrs LAPACK_GLOBAL(ssytrs, SSYTRS)
+#define LAPACK_dsytrs LAPACK_GLOBAL(dsytrs, DSYTRS)
+#define LAPACK_csytrs LAPACK_GLOBAL(csytrs, CSYTRS)
+#define LAPACK_zsytrs LAPACK_GLOBAL(zsytrs, ZSYTRS)
+#define LAPACK_chetrs LAPACK_GLOBAL(chetrs, CHETRS)
+#define LAPACK_zhetrs LAPACK_GLOBAL(zhetrs, ZHETRS)
+#define LAPACK_ssptrs LAPACK_GLOBAL(ssptrs, SSPTRS)
+#define LAPACK_dsptrs LAPACK_GLOBAL(dsptrs, DSPTRS)
+#define LAPACK_csptrs LAPACK_GLOBAL(csptrs, CSPTRS)
+#define LAPACK_zsptrs LAPACK_GLOBAL(zsptrs, ZSPTRS)
+#define LAPACK_chptrs LAPACK_GLOBAL(chptrs, CHPTRS)
+#define LAPACK_zhptrs LAPACK_GLOBAL(zhptrs, ZHPTRS)
+#define LAPACK_strtrs LAPACK_GLOBAL(strtrs, STRTRS)
+#define LAPACK_dtrtrs LAPACK_GLOBAL(dtrtrs, DTRTRS)
+#define LAPACK_ctrtrs LAPACK_GLOBAL(ctrtrs, CTRTRS)
+#define LAPACK_ztrtrs LAPACK_GLOBAL(ztrtrs, ZTRTRS)
+#define LAPACK_stptrs LAPACK_GLOBAL(stptrs, STPTRS)
+#define LAPACK_dtptrs LAPACK_GLOBAL(dtptrs, DTPTRS)
+#define LAPACK_ctptrs LAPACK_GLOBAL(ctptrs, CTPTRS)
+#define LAPACK_ztptrs LAPACK_GLOBAL(ztptrs, ZTPTRS)
+#define LAPACK_stbtrs LAPACK_GLOBAL(stbtrs, STBTRS)
+#define LAPACK_dtbtrs LAPACK_GLOBAL(dtbtrs, DTBTRS)
+#define LAPACK_ctbtrs LAPACK_GLOBAL(ctbtrs, CTBTRS)
+#define LAPACK_ztbtrs LAPACK_GLOBAL(ztbtrs, ZTBTRS)
+#define LAPACK_sgecon LAPACK_GLOBAL(sgecon, SGECON)
+#define LAPACK_dgecon LAPACK_GLOBAL(dgecon, DGECON)
+#define LAPACK_cgecon LAPACK_GLOBAL(cgecon, CGECON)
+#define LAPACK_zgecon LAPACK_GLOBAL(zgecon, ZGECON)
+#define LAPACK_sgbcon LAPACK_GLOBAL(sgbcon, SGBCON)
+#define LAPACK_dgbcon LAPACK_GLOBAL(dgbcon, DGBCON)
+#define LAPACK_cgbcon LAPACK_GLOBAL(cgbcon, CGBCON)
+#define LAPACK_zgbcon LAPACK_GLOBAL(zgbcon, ZGBCON)
+#define LAPACK_sgtcon LAPACK_GLOBAL(sgtcon, SGTCON)
+#define LAPACK_dgtcon LAPACK_GLOBAL(dgtcon, DGTCON)
+#define LAPACK_cgtcon LAPACK_GLOBAL(cgtcon, CGTCON)
+#define LAPACK_zgtcon LAPACK_GLOBAL(zgtcon, ZGTCON)
+#define LAPACK_spocon LAPACK_GLOBAL(spocon, SPOCON)
+#define LAPACK_dpocon LAPACK_GLOBAL(dpocon, DPOCON)
+#define LAPACK_cpocon LAPACK_GLOBAL(cpocon, CPOCON)
+#define LAPACK_zpocon LAPACK_GLOBAL(zpocon, ZPOCON)
+#define LAPACK_sppcon LAPACK_GLOBAL(sppcon, SPPCON)
+#define LAPACK_dppcon LAPACK_GLOBAL(dppcon, DPPCON)
+#define LAPACK_cppcon LAPACK_GLOBAL(cppcon, CPPCON)
+#define LAPACK_zppcon LAPACK_GLOBAL(zppcon, ZPPCON)
+#define LAPACK_spbcon LAPACK_GLOBAL(spbcon, SPBCON)
+#define LAPACK_dpbcon LAPACK_GLOBAL(dpbcon, DPBCON)
+#define LAPACK_cpbcon LAPACK_GLOBAL(cpbcon, CPBCON)
+#define LAPACK_zpbcon LAPACK_GLOBAL(zpbcon, ZPBCON)
+#define LAPACK_sptcon LAPACK_GLOBAL(sptcon, SPTCON)
+#define LAPACK_dptcon LAPACK_GLOBAL(dptcon, DPTCON)
+#define LAPACK_cptcon LAPACK_GLOBAL(cptcon, CPTCON)
+#define LAPACK_zptcon LAPACK_GLOBAL(zptcon, ZPTCON)
+#define LAPACK_ssycon LAPACK_GLOBAL(ssycon, SSYCON)
+#define LAPACK_dsycon LAPACK_GLOBAL(dsycon, DSYCON)
+#define LAPACK_csycon LAPACK_GLOBAL(csycon, CSYCON)
+#define LAPACK_zsycon LAPACK_GLOBAL(zsycon, ZSYCON)
+#define LAPACK_checon LAPACK_GLOBAL(checon, CHECON)
+#define LAPACK_zhecon LAPACK_GLOBAL(zhecon, ZHECON)
+#define LAPACK_sspcon LAPACK_GLOBAL(sspcon, SSPCON)
+#define LAPACK_dspcon LAPACK_GLOBAL(dspcon, DSPCON)
+#define LAPACK_cspcon LAPACK_GLOBAL(cspcon, CSPCON)
+#define LAPACK_zspcon LAPACK_GLOBAL(zspcon, ZSPCON)
+#define LAPACK_chpcon LAPACK_GLOBAL(chpcon, CHPCON)
+#define LAPACK_zhpcon LAPACK_GLOBAL(zhpcon, ZHPCON)
+#define LAPACK_strcon LAPACK_GLOBAL(strcon, STRCON)
+#define LAPACK_dtrcon LAPACK_GLOBAL(dtrcon, DTRCON)
+#define LAPACK_ctrcon LAPACK_GLOBAL(ctrcon, CTRCON)
+#define LAPACK_ztrcon LAPACK_GLOBAL(ztrcon, ZTRCON)
+#define LAPACK_stpcon LAPACK_GLOBAL(stpcon, STPCON)
+#define LAPACK_dtpcon LAPACK_GLOBAL(dtpcon, DTPCON)
+#define LAPACK_ctpcon LAPACK_GLOBAL(ctpcon, CTPCON)
+#define LAPACK_ztpcon LAPACK_GLOBAL(ztpcon, ZTPCON)
+#define LAPACK_stbcon LAPACK_GLOBAL(stbcon, STBCON)
+#define LAPACK_dtbcon LAPACK_GLOBAL(dtbcon, DTBCON)
+#define LAPACK_ctbcon LAPACK_GLOBAL(ctbcon, CTBCON)
+#define LAPACK_ztbcon LAPACK_GLOBAL(ztbcon, ZTBCON)
+#define LAPACK_sgerfs LAPACK_GLOBAL(sgerfs, SGERFS)
+#define LAPACK_dgerfs LAPACK_GLOBAL(dgerfs, DGERFS)
+#define LAPACK_cgerfs LAPACK_GLOBAL(cgerfs, CGERFS)
+#define LAPACK_zgerfs LAPACK_GLOBAL(zgerfs, ZGERFS)
+#define LAPACK_dgerfsx LAPACK_GLOBAL(dgerfsx, DGERFSX)
+#define LAPACK_sgerfsx LAPACK_GLOBAL(sgerfsx, SGERFSX)
+#define LAPACK_zgerfsx LAPACK_GLOBAL(zgerfsx, ZGERFSX)
+#define LAPACK_cgerfsx LAPACK_GLOBAL(cgerfsx, CGERFSX)
+#define LAPACK_sgbrfs LAPACK_GLOBAL(sgbrfs, SGBRFS)
+#define LAPACK_dgbrfs LAPACK_GLOBAL(dgbrfs, DGBRFS)
+#define LAPACK_cgbrfs LAPACK_GLOBAL(cgbrfs, CGBRFS)
+#define LAPACK_zgbrfs LAPACK_GLOBAL(zgbrfs, ZGBRFS)
+#define LAPACK_dgbrfsx LAPACK_GLOBAL(dgbrfsx, DGBRFSX)
+#define LAPACK_sgbrfsx LAPACK_GLOBAL(sgbrfsx, SGBRFSX)
+#define LAPACK_zgbrfsx LAPACK_GLOBAL(zgbrfsx, ZGBRFSX)
+#define LAPACK_cgbrfsx LAPACK_GLOBAL(cgbrfsx, CGBRFSX)
+#define LAPACK_sgtrfs LAPACK_GLOBAL(sgtrfs, SGTRFS)
+#define LAPACK_dgtrfs LAPACK_GLOBAL(dgtrfs, DGTRFS)
+#define LAPACK_cgtrfs LAPACK_GLOBAL(cgtrfs, CGTRFS)
+#define LAPACK_zgtrfs LAPACK_GLOBAL(zgtrfs, ZGTRFS)
+#define LAPACK_sporfs LAPACK_GLOBAL(sporfs, SPORFS)
+#define LAPACK_dporfs LAPACK_GLOBAL(dporfs, DPORFS)
+#define LAPACK_cporfs LAPACK_GLOBAL(cporfs, CPORFS)
+#define LAPACK_zporfs LAPACK_GLOBAL(zporfs, ZPORFS)
+#define LAPACK_dporfsx LAPACK_GLOBAL(dporfsx, DPORFSX)
+#define LAPACK_sporfsx LAPACK_GLOBAL(sporfsx, SPORFSX)
+#define LAPACK_zporfsx LAPACK_GLOBAL(zporfsx, ZPORFSX)
+#define LAPACK_cporfsx LAPACK_GLOBAL(cporfsx, CPORFSX)
+#define LAPACK_spprfs LAPACK_GLOBAL(spprfs, SPPRFS)
+#define LAPACK_dpprfs LAPACK_GLOBAL(dpprfs, DPPRFS)
+#define LAPACK_cpprfs LAPACK_GLOBAL(cpprfs, CPPRFS)
+#define LAPACK_zpprfs LAPACK_GLOBAL(zpprfs, ZPPRFS)
+#define LAPACK_spbrfs LAPACK_GLOBAL(spbrfs, SPBRFS)
+#define LAPACK_dpbrfs LAPACK_GLOBAL(dpbrfs, DPBRFS)
+#define LAPACK_cpbrfs LAPACK_GLOBAL(cpbrfs, CPBRFS)
+#define LAPACK_zpbrfs LAPACK_GLOBAL(zpbrfs, ZPBRFS)
+#define LAPACK_sptrfs LAPACK_GLOBAL(sptrfs, SPTRFS)
+#define LAPACK_dptrfs LAPACK_GLOBAL(dptrfs, DPTRFS)
+#define LAPACK_cptrfs LAPACK_GLOBAL(cptrfs, CPTRFS)
+#define LAPACK_zptrfs LAPACK_GLOBAL(zptrfs, ZPTRFS)
+#define LAPACK_ssyrfs LAPACK_GLOBAL(ssyrfs, SSYRFS)
+#define LAPACK_dsyrfs LAPACK_GLOBAL(dsyrfs, DSYRFS)
+#define LAPACK_csyrfs LAPACK_GLOBAL(csyrfs, CSYRFS)
+#define LAPACK_zsyrfs LAPACK_GLOBAL(zsyrfs, ZSYRFS)
+#define LAPACK_dsyrfsx LAPACK_GLOBAL(dsyrfsx, DSYRFSX)
+#define LAPACK_ssyrfsx LAPACK_GLOBAL(ssyrfsx, SSYRFSX)
+#define LAPACK_zsyrfsx LAPACK_GLOBAL(zsyrfsx, ZSYRFSX)
+#define LAPACK_csyrfsx LAPACK_GLOBAL(csyrfsx, CSYRFSX)
+#define LAPACK_cherfs LAPACK_GLOBAL(cherfs, CHERFS)
+#define LAPACK_zherfs LAPACK_GLOBAL(zherfs, ZHERFS)
+#define LAPACK_zherfsx LAPACK_GLOBAL(zherfsx, ZHERFSX)
+#define LAPACK_cherfsx LAPACK_GLOBAL(cherfsx, CHERFSX)
+#define LAPACK_ssprfs LAPACK_GLOBAL(ssprfs, SSPRFS)
+#define LAPACK_dsprfs LAPACK_GLOBAL(dsprfs, DSPRFS)
+#define LAPACK_csprfs LAPACK_GLOBAL(csprfs, CSPRFS)
+#define LAPACK_zsprfs LAPACK_GLOBAL(zsprfs, ZSPRFS)
+#define LAPACK_chprfs LAPACK_GLOBAL(chprfs, CHPRFS)
+#define LAPACK_zhprfs LAPACK_GLOBAL(zhprfs, ZHPRFS)
+#define LAPACK_strrfs LAPACK_GLOBAL(strrfs, STRRFS)
+#define LAPACK_dtrrfs LAPACK_GLOBAL(dtrrfs, DTRRFS)
+#define LAPACK_ctrrfs LAPACK_GLOBAL(ctrrfs, CTRRFS)
+#define LAPACK_ztrrfs LAPACK_GLOBAL(ztrrfs, ZTRRFS)
+#define LAPACK_stprfs LAPACK_GLOBAL(stprfs, STPRFS)
+#define LAPACK_dtprfs LAPACK_GLOBAL(dtprfs, DTPRFS)
+#define LAPACK_ctprfs LAPACK_GLOBAL(ctprfs, CTPRFS)
+#define LAPACK_ztprfs LAPACK_GLOBAL(ztprfs, ZTPRFS)
+#define LAPACK_stbrfs LAPACK_GLOBAL(stbrfs, STBRFS)
+#define LAPACK_dtbrfs LAPACK_GLOBAL(dtbrfs, DTBRFS)
+#define LAPACK_ctbrfs LAPACK_GLOBAL(ctbrfs, CTBRFS)
+#define LAPACK_ztbrfs LAPACK_GLOBAL(ztbrfs, ZTBRFS)
+#define LAPACK_sgetri LAPACK_GLOBAL(sgetri, SGETRI)
+#define LAPACK_dgetri LAPACK_GLOBAL(dgetri, DGETRI)
+#define LAPACK_cgetri LAPACK_GLOBAL(cgetri, CGETRI)
+#define LAPACK_zgetri LAPACK_GLOBAL(zgetri, ZGETRI)
+#define LAPACK_spotri LAPACK_GLOBAL(spotri, SPOTRI)
+#define LAPACK_dpotri LAPACK_GLOBAL(dpotri, DPOTRI)
+#define LAPACK_cpotri LAPACK_GLOBAL(cpotri, CPOTRI)
+#define LAPACK_zpotri LAPACK_GLOBAL(zpotri, ZPOTRI)
+#define LAPACK_dpftri LAPACK_GLOBAL(dpftri, DPFTRI)
+#define LAPACK_spftri LAPACK_GLOBAL(spftri, SPFTRI)
+#define LAPACK_zpftri LAPACK_GLOBAL(zpftri, ZPFTRI)
+#define LAPACK_cpftri LAPACK_GLOBAL(cpftri, CPFTRI)
+#define LAPACK_spptri LAPACK_GLOBAL(spptri, SPPTRI)
+#define LAPACK_dpptri LAPACK_GLOBAL(dpptri, DPPTRI)
+#define LAPACK_cpptri LAPACK_GLOBAL(cpptri, CPPTRI)
+#define LAPACK_zpptri LAPACK_GLOBAL(zpptri, ZPPTRI)
+#define LAPACK_ssytri LAPACK_GLOBAL(ssytri, SSYTRI)
+#define LAPACK_dsytri LAPACK_GLOBAL(dsytri, DSYTRI)
+#define LAPACK_csytri LAPACK_GLOBAL(csytri, CSYTRI)
+#define LAPACK_zsytri LAPACK_GLOBAL(zsytri, ZSYTRI)
+#define LAPACK_chetri LAPACK_GLOBAL(chetri, CHETRI)
+#define LAPACK_zhetri LAPACK_GLOBAL(zhetri, ZHETRI)
+#define LAPACK_ssptri LAPACK_GLOBAL(ssptri, SSPTRI)
+#define LAPACK_dsptri LAPACK_GLOBAL(dsptri, DSPTRI)
+#define LAPACK_csptri LAPACK_GLOBAL(csptri, CSPTRI)
+#define LAPACK_zsptri LAPACK_GLOBAL(zsptri, ZSPTRI)
+#define LAPACK_chptri LAPACK_GLOBAL(chptri, CHPTRI)
+#define LAPACK_zhptri LAPACK_GLOBAL(zhptri, ZHPTRI)
+#define LAPACK_strtri LAPACK_GLOBAL(strtri, STRTRI)
+#define LAPACK_dtrtri LAPACK_GLOBAL(dtrtri, DTRTRI)
+#define LAPACK_ctrtri LAPACK_GLOBAL(ctrtri, CTRTRI)
+#define LAPACK_ztrtri LAPACK_GLOBAL(ztrtri, ZTRTRI)
+#define LAPACK_dtftri LAPACK_GLOBAL(dtftri, DTFTRI)
+#define LAPACK_stftri LAPACK_GLOBAL(stftri, STFTRI)
+#define LAPACK_ztftri LAPACK_GLOBAL(ztftri, ZTFTRI)
+#define LAPACK_ctftri LAPACK_GLOBAL(ctftri, CTFTRI)
+#define LAPACK_stptri LAPACK_GLOBAL(stptri, STPTRI)
+#define LAPACK_dtptri LAPACK_GLOBAL(dtptri, DTPTRI)
+#define LAPACK_ctptri LAPACK_GLOBAL(ctptri, CTPTRI)
+#define LAPACK_ztptri LAPACK_GLOBAL(ztptri, ZTPTRI)
+#define LAPACK_sgeequ LAPACK_GLOBAL(sgeequ, SGEEQU)
+#define LAPACK_dgeequ LAPACK_GLOBAL(dgeequ, DGEEQU)
+#define LAPACK_cgeequ LAPACK_GLOBAL(cgeequ, CGEEQU)
+#define LAPACK_zgeequ LAPACK_GLOBAL(zgeequ, ZGEEQU)
+#define LAPACK_dgeequb LAPACK_GLOBAL(dgeequb, DGEEQUB)
+#define LAPACK_sgeequb LAPACK_GLOBAL(sgeequb, SGEEQUB)
+#define LAPACK_zgeequb LAPACK_GLOBAL(zgeequb, ZGEEQUB)
+#define LAPACK_cgeequb LAPACK_GLOBAL(cgeequb, CGEEQUB)
+#define LAPACK_sgbequ LAPACK_GLOBAL(sgbequ, SGBEQU)
+#define LAPACK_dgbequ LAPACK_GLOBAL(dgbequ, DGBEQU)
+#define LAPACK_cgbequ LAPACK_GLOBAL(cgbequ, CGBEQU)
+#define LAPACK_zgbequ LAPACK_GLOBAL(zgbequ, ZGBEQU)
+#define LAPACK_dgbequb LAPACK_GLOBAL(dgbequb, DGBEQUB)
+#define LAPACK_sgbequb LAPACK_GLOBAL(sgbequb, SGBEQUB)
+#define LAPACK_zgbequb LAPACK_GLOBAL(zgbequb, ZGBEQUB)
+#define LAPACK_cgbequb LAPACK_GLOBAL(cgbequb, CGBEQUB)
+#define LAPACK_spoequ LAPACK_GLOBAL(spoequ, SPOEQU)
+#define LAPACK_dpoequ LAPACK_GLOBAL(dpoequ, DPOEQU)
+#define LAPACK_cpoequ LAPACK_GLOBAL(cpoequ, CPOEQU)
+#define LAPACK_zpoequ LAPACK_GLOBAL(zpoequ, ZPOEQU)
+#define LAPACK_dpoequb LAPACK_GLOBAL(dpoequb, DPOEQUB)
+#define LAPACK_spoequb LAPACK_GLOBAL(spoequb, SPOEQUB)
+#define LAPACK_zpoequb LAPACK_GLOBAL(zpoequb, ZPOEQUB)
+#define LAPACK_cpoequb LAPACK_GLOBAL(cpoequb, CPOEQUB)
+#define LAPACK_sppequ LAPACK_GLOBAL(sppequ, SPPEQU)
+#define LAPACK_dppequ LAPACK_GLOBAL(dppequ, DPPEQU)
+#define LAPACK_cppequ LAPACK_GLOBAL(cppequ, CPPEQU)
+#define LAPACK_zppequ LAPACK_GLOBAL(zppequ, ZPPEQU)
+#define LAPACK_spbequ LAPACK_GLOBAL(spbequ, SPBEQU)
+#define LAPACK_dpbequ LAPACK_GLOBAL(dpbequ, DPBEQU)
+#define LAPACK_cpbequ LAPACK_GLOBAL(cpbequ, CPBEQU)
+#define LAPACK_zpbequ LAPACK_GLOBAL(zpbequ, ZPBEQU)
+#define LAPACK_dsyequb LAPACK_GLOBAL(dsyequb, DSYEQUB)
+#define LAPACK_ssyequb LAPACK_GLOBAL(ssyequb, SSYEQUB)
+#define LAPACK_zsyequb LAPACK_GLOBAL(zsyequb, ZSYEQUB)
+#define LAPACK_csyequb LAPACK_GLOBAL(csyequb, CSYEQUB)
+#define LAPACK_zheequb LAPACK_GLOBAL(zheequb, ZHEEQUB)
+#define LAPACK_cheequb LAPACK_GLOBAL(cheequb, CHEEQUB)
+#define LAPACK_sgesv LAPACK_GLOBAL(sgesv, SGESV)
+#define LAPACK_dgesv LAPACK_GLOBAL(dgesv, DGESV)
+#define LAPACK_cgesv LAPACK_GLOBAL(cgesv, CGESV)
+#define LAPACK_zgesv LAPACK_GLOBAL(zgesv, ZGESV)
+#define LAPACK_dsgesv LAPACK_GLOBAL(dsgesv, DSGESV)
+#define LAPACK_zcgesv LAPACK_GLOBAL(zcgesv, ZCGESV)
+#define LAPACK_sgesvx LAPACK_GLOBAL(sgesvx, SGESVX)
+#define LAPACK_dgesvx LAPACK_GLOBAL(dgesvx, DGESVX)
+#define LAPACK_cgesvx LAPACK_GLOBAL(cgesvx, CGESVX)
+#define LAPACK_zgesvx LAPACK_GLOBAL(zgesvx, ZGESVX)
+#define LAPACK_dgesvxx LAPACK_GLOBAL(dgesvxx, DGESVXX)
+#define LAPACK_sgesvxx LAPACK_GLOBAL(sgesvxx, SGESVXX)
+#define LAPACK_zgesvxx LAPACK_GLOBAL(zgesvxx, ZGESVXX)
+#define LAPACK_cgesvxx LAPACK_GLOBAL(cgesvxx, CGESVXX)
+#define LAPACK_sgbsv LAPACK_GLOBAL(sgbsv, SGBSV)
+#define LAPACK_dgbsv LAPACK_GLOBAL(dgbsv, DGBSV)
+#define LAPACK_cgbsv LAPACK_GLOBAL(cgbsv, CGBSV)
+#define LAPACK_zgbsv LAPACK_GLOBAL(zgbsv, ZGBSV)
+#define LAPACK_sgbsvx LAPACK_GLOBAL(sgbsvx, SGBSVX)
+#define LAPACK_dgbsvx LAPACK_GLOBAL(dgbsvx, DGBSVX)
+#define LAPACK_cgbsvx LAPACK_GLOBAL(cgbsvx, CGBSVX)
+#define LAPACK_zgbsvx LAPACK_GLOBAL(zgbsvx, ZGBSVX)
+#define LAPACK_dgbsvxx LAPACK_GLOBAL(dgbsvxx, DGBSVXX)
+#define LAPACK_sgbsvxx LAPACK_GLOBAL(sgbsvxx, SGBSVXX)
+#define LAPACK_zgbsvxx LAPACK_GLOBAL(zgbsvxx, ZGBSVXX)
+#define LAPACK_cgbsvxx LAPACK_GLOBAL(cgbsvxx, CGBSVXX)
+#define LAPACK_sgtsv LAPACK_GLOBAL(sgtsv, SGTSV)
+#define LAPACK_dgtsv LAPACK_GLOBAL(dgtsv, DGTSV)
+#define LAPACK_cgtsv LAPACK_GLOBAL(cgtsv, CGTSV)
+#define LAPACK_zgtsv LAPACK_GLOBAL(zgtsv, ZGTSV)
+#define LAPACK_sgtsvx LAPACK_GLOBAL(sgtsvx, SGTSVX)
+#define LAPACK_dgtsvx LAPACK_GLOBAL(dgtsvx, DGTSVX)
+#define LAPACK_cgtsvx LAPACK_GLOBAL(cgtsvx, CGTSVX)
+#define LAPACK_zgtsvx LAPACK_GLOBAL(zgtsvx, ZGTSVX)
+#define LAPACK_sposv LAPACK_GLOBAL(sposv, SPOSV)
+#define LAPACK_dposv LAPACK_GLOBAL(dposv, DPOSV)
+#define LAPACK_cposv LAPACK_GLOBAL(cposv, CPOSV)
+#define LAPACK_zposv LAPACK_GLOBAL(zposv, ZPOSV)
+#define LAPACK_dsposv LAPACK_GLOBAL(dsposv, DSPOSV)
+#define LAPACK_zcposv LAPACK_GLOBAL(zcposv, ZCPOSV)
+#define LAPACK_sposvx LAPACK_GLOBAL(sposvx, SPOSVX)
+#define LAPACK_dposvx LAPACK_GLOBAL(dposvx, DPOSVX)
+#define LAPACK_cposvx LAPACK_GLOBAL(cposvx, CPOSVX)
+#define LAPACK_zposvx LAPACK_GLOBAL(zposvx, ZPOSVX)
+#define LAPACK_dposvxx LAPACK_GLOBAL(dposvxx, DPOSVXX)
+#define LAPACK_sposvxx LAPACK_GLOBAL(sposvxx, SPOSVXX)
+#define LAPACK_zposvxx LAPACK_GLOBAL(zposvxx, ZPOSVXX)
+#define LAPACK_cposvxx LAPACK_GLOBAL(cposvxx, CPOSVXX)
+#define LAPACK_sppsv LAPACK_GLOBAL(sppsv, SPPSV)
+#define LAPACK_dppsv LAPACK_GLOBAL(dppsv, DPPSV)
+#define LAPACK_cppsv LAPACK_GLOBAL(cppsv, CPPSV)
+#define LAPACK_zppsv LAPACK_GLOBAL(zppsv, ZPPSV)
+#define LAPACK_sppsvx LAPACK_GLOBAL(sppsvx, SPPSVX)
+#define LAPACK_dppsvx LAPACK_GLOBAL(dppsvx, DPPSVX)
+#define LAPACK_cppsvx LAPACK_GLOBAL(cppsvx, CPPSVX)
+#define LAPACK_zppsvx LAPACK_GLOBAL(zppsvx, ZPPSVX)
+#define LAPACK_spbsv LAPACK_GLOBAL(spbsv, SPBSV)
+#define LAPACK_dpbsv LAPACK_GLOBAL(dpbsv, DPBSV)
+#define LAPACK_cpbsv LAPACK_GLOBAL(cpbsv, CPBSV)
+#define LAPACK_zpbsv LAPACK_GLOBAL(zpbsv, ZPBSV)
+#define LAPACK_spbsvx LAPACK_GLOBAL(spbsvx, SPBSVX)
+#define LAPACK_dpbsvx LAPACK_GLOBAL(dpbsvx, DPBSVX)
+#define LAPACK_cpbsvx LAPACK_GLOBAL(cpbsvx, CPBSVX)
+#define LAPACK_zpbsvx LAPACK_GLOBAL(zpbsvx, ZPBSVX)
+#define LAPACK_sptsv LAPACK_GLOBAL(sptsv, SPTSV)
+#define LAPACK_dptsv LAPACK_GLOBAL(dptsv, DPTSV)
+#define LAPACK_cptsv LAPACK_GLOBAL(cptsv, CPTSV)
+#define LAPACK_zptsv LAPACK_GLOBAL(zptsv, ZPTSV)
+#define LAPACK_sptsvx LAPACK_GLOBAL(sptsvx, SPTSVX)
+#define LAPACK_dptsvx LAPACK_GLOBAL(dptsvx, DPTSVX)
+#define LAPACK_cptsvx LAPACK_GLOBAL(cptsvx, CPTSVX)
+#define LAPACK_zptsvx LAPACK_GLOBAL(zptsvx, ZPTSVX)
+#define LAPACK_ssysv LAPACK_GLOBAL(ssysv, SSYSV)
+#define LAPACK_dsysv LAPACK_GLOBAL(dsysv, DSYSV)
+#define LAPACK_csysv LAPACK_GLOBAL(csysv, CSYSV)
+#define LAPACK_zsysv LAPACK_GLOBAL(zsysv, ZSYSV)
+#define LAPACK_ssysvx LAPACK_GLOBAL(ssysvx, SSYSVX)
+#define LAPACK_dsysvx LAPACK_GLOBAL(dsysvx, DSYSVX)
+#define LAPACK_csysvx LAPACK_GLOBAL(csysvx, CSYSVX)
+#define LAPACK_zsysvx LAPACK_GLOBAL(zsysvx, ZSYSVX)
+#define LAPACK_dsysvxx LAPACK_GLOBAL(dsysvxx, DSYSVXX)
+#define LAPACK_ssysvxx LAPACK_GLOBAL(ssysvxx, SSYSVXX)
+#define LAPACK_zsysvxx LAPACK_GLOBAL(zsysvxx, ZSYSVXX)
+#define LAPACK_csysvxx LAPACK_GLOBAL(csysvxx, CSYSVXX)
+#define LAPACK_chesv LAPACK_GLOBAL(chesv, CHESV)
+#define LAPACK_zhesv LAPACK_GLOBAL(zhesv, ZHESV)
+#define LAPACK_chesvx LAPACK_GLOBAL(chesvx, CHESVX)
+#define LAPACK_zhesvx LAPACK_GLOBAL(zhesvx, ZHESVX)
+#define LAPACK_zhesvxx LAPACK_GLOBAL(zhesvxx, ZHESVXX)
+#define LAPACK_chesvxx LAPACK_GLOBAL(chesvxx, CHESVXX)
+#define LAPACK_sspsv LAPACK_GLOBAL(sspsv, SSPSV)
+#define LAPACK_dspsv LAPACK_GLOBAL(dspsv, DSPSV)
+#define LAPACK_cspsv LAPACK_GLOBAL(cspsv, CSPSV)
+#define LAPACK_zspsv LAPACK_GLOBAL(zspsv, ZSPSV)
+#define LAPACK_sspsvx LAPACK_GLOBAL(sspsvx, SSPSVX)
+#define LAPACK_dspsvx LAPACK_GLOBAL(dspsvx, DSPSVX)
+#define LAPACK_cspsvx LAPACK_GLOBAL(cspsvx, CSPSVX)
+#define LAPACK_zspsvx LAPACK_GLOBAL(zspsvx, ZSPSVX)
+#define LAPACK_chpsv LAPACK_GLOBAL(chpsv, CHPSV)
+#define LAPACK_zhpsv LAPACK_GLOBAL(zhpsv, ZHPSV)
+#define LAPACK_chpsvx LAPACK_GLOBAL(chpsvx, CHPSVX)
+#define LAPACK_zhpsvx LAPACK_GLOBAL(zhpsvx, ZHPSVX)
+#define LAPACK_sgeqrf LAPACK_GLOBAL(sgeqrf, SGEQRF)
+#define LAPACK_dgeqrf LAPACK_GLOBAL(dgeqrf, DGEQRF)
+#define LAPACK_cgeqrf LAPACK_GLOBAL(cgeqrf, CGEQRF)
+#define LAPACK_zgeqrf LAPACK_GLOBAL(zgeqrf, ZGEQRF)
+#define LAPACK_sgeqpf LAPACK_GLOBAL(sgeqpf, SGEQPF)
+#define LAPACK_dgeqpf LAPACK_GLOBAL(dgeqpf, DGEQPF)
+#define LAPACK_cgeqpf LAPACK_GLOBAL(cgeqpf, CGEQPF)
+#define LAPACK_zgeqpf LAPACK_GLOBAL(zgeqpf, ZGEQPF)
+#define LAPACK_sgeqp3 LAPACK_GLOBAL(sgeqp3, SGEQP3)
+#define LAPACK_dgeqp3 LAPACK_GLOBAL(dgeqp3, DGEQP3)
+#define LAPACK_cgeqp3 LAPACK_GLOBAL(cgeqp3, CGEQP3)
+#define LAPACK_zgeqp3 LAPACK_GLOBAL(zgeqp3, ZGEQP3)
+#define LAPACK_sorgqr LAPACK_GLOBAL(sorgqr, SORGQR)
+#define LAPACK_dorgqr LAPACK_GLOBAL(dorgqr, DORGQR)
+#define LAPACK_sormqr LAPACK_GLOBAL(sormqr, SORMQR)
+#define LAPACK_dormqr LAPACK_GLOBAL(dormqr, DORMQR)
+#define LAPACK_cungqr LAPACK_GLOBAL(cungqr, CUNGQR)
+#define LAPACK_zungqr LAPACK_GLOBAL(zungqr, ZUNGQR)
+#define LAPACK_cunmqr LAPACK_GLOBAL(cunmqr, CUNMQR)
+#define LAPACK_zunmqr LAPACK_GLOBAL(zunmqr, ZUNMQR)
+#define LAPACK_sgelqf LAPACK_GLOBAL(sgelqf, SGELQF)
+#define LAPACK_dgelqf LAPACK_GLOBAL(dgelqf, DGELQF)
+#define LAPACK_cgelqf LAPACK_GLOBAL(cgelqf, CGELQF)
+#define LAPACK_zgelqf LAPACK_GLOBAL(zgelqf, ZGELQF)
+#define LAPACK_sorglq LAPACK_GLOBAL(sorglq, SORGLQ)
+#define LAPACK_dorglq LAPACK_GLOBAL(dorglq, DORGLQ)
+#define LAPACK_sormlq LAPACK_GLOBAL(sormlq, SORMLQ)
+#define LAPACK_dormlq LAPACK_GLOBAL(dormlq, DORMLQ)
+#define LAPACK_cunglq LAPACK_GLOBAL(cunglq, CUNGLQ)
+#define LAPACK_zunglq LAPACK_GLOBAL(zunglq, ZUNGLQ)
+#define LAPACK_cunmlq LAPACK_GLOBAL(cunmlq, CUNMLQ)
+#define LAPACK_zunmlq LAPACK_GLOBAL(zunmlq, ZUNMLQ)
+#define LAPACK_sgeqlf LAPACK_GLOBAL(sgeqlf, SGEQLF)
+#define LAPACK_dgeqlf LAPACK_GLOBAL(dgeqlf, DGEQLF)
+#define LAPACK_cgeqlf LAPACK_GLOBAL(cgeqlf, CGEQLF)
+#define LAPACK_zgeqlf LAPACK_GLOBAL(zgeqlf, ZGEQLF)
+#define LAPACK_sorgql LAPACK_GLOBAL(sorgql, SORGQL)
+#define LAPACK_dorgql LAPACK_GLOBAL(dorgql, DORGQL)
+#define LAPACK_cungql LAPACK_GLOBAL(cungql, CUNGQL)
+#define LAPACK_zungql LAPACK_GLOBAL(zungql, ZUNGQL)
+#define LAPACK_sormql LAPACK_GLOBAL(sormql, SORMQL)
+#define LAPACK_dormql LAPACK_GLOBAL(dormql, DORMQL)
+#define LAPACK_cunmql LAPACK_GLOBAL(cunmql, CUNMQL)
+#define LAPACK_zunmql LAPACK_GLOBAL(zunmql, ZUNMQL)
+#define LAPACK_sgerqf LAPACK_GLOBAL(sgerqf, SGERQF)
+#define LAPACK_dgerqf LAPACK_GLOBAL(dgerqf, DGERQF)
+#define LAPACK_cgerqf LAPACK_GLOBAL(cgerqf, CGERQF)
+#define LAPACK_zgerqf LAPACK_GLOBAL(zgerqf, ZGERQF)
+#define LAPACK_sorgrq LAPACK_GLOBAL(sorgrq, SORGRQ)
+#define LAPACK_dorgrq LAPACK_GLOBAL(dorgrq, DORGRQ)
+#define LAPACK_cungrq LAPACK_GLOBAL(cungrq, CUNGRQ)
+#define LAPACK_zungrq LAPACK_GLOBAL(zungrq, ZUNGRQ)
+#define LAPACK_sormrq LAPACK_GLOBAL(sormrq, SORMRQ)
+#define LAPACK_dormrq LAPACK_GLOBAL(dormrq, DORMRQ)
+#define LAPACK_cunmrq LAPACK_GLOBAL(cunmrq, CUNMRQ)
+#define LAPACK_zunmrq LAPACK_GLOBAL(zunmrq, ZUNMRQ)
+#define LAPACK_stzrzf LAPACK_GLOBAL(stzrzf, STZRZF)
+#define LAPACK_dtzrzf LAPACK_GLOBAL(dtzrzf, DTZRZF)
+#define LAPACK_ctzrzf LAPACK_GLOBAL(ctzrzf, CTZRZF)
+#define LAPACK_ztzrzf LAPACK_GLOBAL(ztzrzf, ZTZRZF)
+#define LAPACK_sormrz LAPACK_GLOBAL(sormrz, SORMRZ)
+#define LAPACK_dormrz LAPACK_GLOBAL(dormrz, DORMRZ)
+#define LAPACK_cunmrz LAPACK_GLOBAL(cunmrz, CUNMRZ)
+#define LAPACK_zunmrz LAPACK_GLOBAL(zunmrz, ZUNMRZ)
+#define LAPACK_sggqrf LAPACK_GLOBAL(sggqrf, SGGQRF)
+#define LAPACK_dggqrf LAPACK_GLOBAL(dggqrf, DGGQRF)
+#define LAPACK_cggqrf LAPACK_GLOBAL(cggqrf, CGGQRF)
+#define LAPACK_zggqrf LAPACK_GLOBAL(zggqrf, ZGGQRF)
+#define LAPACK_sggrqf LAPACK_GLOBAL(sggrqf, SGGRQF)
+#define LAPACK_dggrqf LAPACK_GLOBAL(dggrqf, DGGRQF)
+#define LAPACK_cggrqf LAPACK_GLOBAL(cggrqf, CGGRQF)
+#define LAPACK_zggrqf LAPACK_GLOBAL(zggrqf, ZGGRQF)
+#define LAPACK_sgebrd LAPACK_GLOBAL(sgebrd, SGEBRD)
+#define LAPACK_dgebrd LAPACK_GLOBAL(dgebrd, DGEBRD)
+#define LAPACK_cgebrd LAPACK_GLOBAL(cgebrd, CGEBRD)
+#define LAPACK_zgebrd LAPACK_GLOBAL(zgebrd, ZGEBRD)
+#define LAPACK_sgbbrd LAPACK_GLOBAL(sgbbrd, SGBBRD)
+#define LAPACK_dgbbrd LAPACK_GLOBAL(dgbbrd, DGBBRD)
+#define LAPACK_cgbbrd LAPACK_GLOBAL(cgbbrd, CGBBRD)
+#define LAPACK_zgbbrd LAPACK_GLOBAL(zgbbrd, ZGBBRD)
+#define LAPACK_sorgbr LAPACK_GLOBAL(sorgbr, SORGBR)
+#define LAPACK_dorgbr LAPACK_GLOBAL(dorgbr, DORGBR)
+#define LAPACK_sormbr LAPACK_GLOBAL(sormbr, SORMBR)
+#define LAPACK_dormbr LAPACK_GLOBAL(dormbr, DORMBR)
+#define LAPACK_cungbr LAPACK_GLOBAL(cungbr, CUNGBR)
+#define LAPACK_zungbr LAPACK_GLOBAL(zungbr, ZUNGBR)
+#define LAPACK_cunmbr LAPACK_GLOBAL(cunmbr, CUNMBR)
+#define LAPACK_zunmbr LAPACK_GLOBAL(zunmbr, ZUNMBR)
+#define LAPACK_sbdsqr LAPACK_GLOBAL(sbdsqr, SBDSQR)
+#define LAPACK_dbdsqr LAPACK_GLOBAL(dbdsqr, DBDSQR)
+#define LAPACK_cbdsqr LAPACK_GLOBAL(cbdsqr, CBDSQR)
+#define LAPACK_zbdsqr LAPACK_GLOBAL(zbdsqr, ZBDSQR)
+#define LAPACK_sbdsdc LAPACK_GLOBAL(sbdsdc, SBDSDC)
+#define LAPACK_dbdsdc LAPACK_GLOBAL(dbdsdc, DBDSDC)
+#define LAPACK_ssytrd LAPACK_GLOBAL(ssytrd, SSYTRD)
+#define LAPACK_dsytrd LAPACK_GLOBAL(dsytrd, DSYTRD)
+#define LAPACK_sorgtr LAPACK_GLOBAL(sorgtr, SORGTR)
+#define LAPACK_dorgtr LAPACK_GLOBAL(dorgtr, DORGTR)
+#define LAPACK_sormtr LAPACK_GLOBAL(sormtr, SORMTR)
+#define LAPACK_dormtr LAPACK_GLOBAL(dormtr, DORMTR)
+#define LAPACK_chetrd LAPACK_GLOBAL(chetrd, CHETRD)
+#define LAPACK_zhetrd LAPACK_GLOBAL(zhetrd, ZHETRD)
+#define LAPACK_cungtr LAPACK_GLOBAL(cungtr, CUNGTR)
+#define LAPACK_zungtr LAPACK_GLOBAL(zungtr, ZUNGTR)
+#define LAPACK_cunmtr LAPACK_GLOBAL(cunmtr, CUNMTR)
+#define LAPACK_zunmtr LAPACK_GLOBAL(zunmtr, ZUNMTR)
+#define LAPACK_ssptrd LAPACK_GLOBAL(ssptrd, SSPTRD)
+#define LAPACK_dsptrd LAPACK_GLOBAL(dsptrd, DSPTRD)
+#define LAPACK_sopgtr LAPACK_GLOBAL(sopgtr, SOPGTR)
+#define LAPACK_dopgtr LAPACK_GLOBAL(dopgtr, DOPGTR)
+#define LAPACK_sopmtr LAPACK_GLOBAL(sopmtr, SOPMTR)
+#define LAPACK_dopmtr LAPACK_GLOBAL(dopmtr, DOPMTR)
+#define LAPACK_chptrd LAPACK_GLOBAL(chptrd, CHPTRD)
+#define LAPACK_zhptrd LAPACK_GLOBAL(zhptrd, ZHPTRD)
+#define LAPACK_cupgtr LAPACK_GLOBAL(cupgtr, CUPGTR)
+#define LAPACK_zupgtr LAPACK_GLOBAL(zupgtr, ZUPGTR)
+#define LAPACK_cupmtr LAPACK_GLOBAL(cupmtr, CUPMTR)
+#define LAPACK_zupmtr LAPACK_GLOBAL(zupmtr, ZUPMTR)
+#define LAPACK_ssbtrd LAPACK_GLOBAL(ssbtrd, SSBTRD)
+#define LAPACK_dsbtrd LAPACK_GLOBAL(dsbtrd, DSBTRD)
+#define LAPACK_chbtrd LAPACK_GLOBAL(chbtrd, CHBTRD)
+#define LAPACK_zhbtrd LAPACK_GLOBAL(zhbtrd, ZHBTRD)
+#define LAPACK_ssterf LAPACK_GLOBAL(ssterf, SSTERF)
+#define LAPACK_dsterf LAPACK_GLOBAL(dsterf, DSTERF)
+#define LAPACK_ssteqr LAPACK_GLOBAL(ssteqr, SSTEQR)
+#define LAPACK_dsteqr LAPACK_GLOBAL(dsteqr, DSTEQR)
+#define LAPACK_csteqr LAPACK_GLOBAL(csteqr, CSTEQR)
+#define LAPACK_zsteqr LAPACK_GLOBAL(zsteqr, ZSTEQR)
+#define LAPACK_sstemr LAPACK_GLOBAL(sstemr, SSTEMR)
+#define LAPACK_dstemr LAPACK_GLOBAL(dstemr, DSTEMR)
+#define LAPACK_cstemr LAPACK_GLOBAL(cstemr, CSTEMR)
+#define LAPACK_zstemr LAPACK_GLOBAL(zstemr, ZSTEMR)
+#define LAPACK_sstedc LAPACK_GLOBAL(sstedc, SSTEDC)
+#define LAPACK_dstedc LAPACK_GLOBAL(dstedc, DSTEDC)
+#define LAPACK_cstedc LAPACK_GLOBAL(cstedc, CSTEDC)
+#define LAPACK_zstedc LAPACK_GLOBAL(zstedc, ZSTEDC)
+#define LAPACK_sstegr LAPACK_GLOBAL(sstegr, SSTEGR)
+#define LAPACK_dstegr LAPACK_GLOBAL(dstegr, DSTEGR)
+#define LAPACK_cstegr LAPACK_GLOBAL(cstegr, CSTEGR)
+#define LAPACK_zstegr LAPACK_GLOBAL(zstegr, ZSTEGR)
+#define LAPACK_spteqr LAPACK_GLOBAL(spteqr, SPTEQR)
+#define LAPACK_dpteqr LAPACK_GLOBAL(dpteqr, DPTEQR)
+#define LAPACK_cpteqr LAPACK_GLOBAL(cpteqr, CPTEQR)
+#define LAPACK_zpteqr LAPACK_GLOBAL(zpteqr, ZPTEQR)
+#define LAPACK_sstebz LAPACK_GLOBAL(sstebz, SSTEBZ)
+#define LAPACK_dstebz LAPACK_GLOBAL(dstebz, DSTEBZ)
+#define LAPACK_sstein LAPACK_GLOBAL(sstein, SSTEIN)
+#define LAPACK_dstein LAPACK_GLOBAL(dstein, DSTEIN)
+#define LAPACK_cstein LAPACK_GLOBAL(cstein, CSTEIN)
+#define LAPACK_zstein LAPACK_GLOBAL(zstein, ZSTEIN)
+#define LAPACK_sdisna LAPACK_GLOBAL(sdisna, SDISNA)
+#define LAPACK_ddisna LAPACK_GLOBAL(ddisna, DDISNA)
+#define LAPACK_ssygst LAPACK_GLOBAL(ssygst, SSYGST)
+#define LAPACK_dsygst LAPACK_GLOBAL(dsygst, DSYGST)
+#define LAPACK_chegst LAPACK_GLOBAL(chegst, CHEGST)
+#define LAPACK_zhegst LAPACK_GLOBAL(zhegst, ZHEGST)
+#define LAPACK_sspgst LAPACK_GLOBAL(sspgst, SSPGST)
+#define LAPACK_dspgst LAPACK_GLOBAL(dspgst, DSPGST)
+#define LAPACK_chpgst LAPACK_GLOBAL(chpgst, CHPGST)
+#define LAPACK_zhpgst LAPACK_GLOBAL(zhpgst, ZHPGST)
+#define LAPACK_ssbgst LAPACK_GLOBAL(ssbgst, SSBGST)
+#define LAPACK_dsbgst LAPACK_GLOBAL(dsbgst, DSBGST)
+#define LAPACK_chbgst LAPACK_GLOBAL(chbgst, CHBGST)
+#define LAPACK_zhbgst LAPACK_GLOBAL(zhbgst, ZHBGST)
+#define LAPACK_spbstf LAPACK_GLOBAL(spbstf, SPBSTF)
+#define LAPACK_dpbstf LAPACK_GLOBAL(dpbstf, DPBSTF)
+#define LAPACK_cpbstf LAPACK_GLOBAL(cpbstf, CPBSTF)
+#define LAPACK_zpbstf LAPACK_GLOBAL(zpbstf, ZPBSTF)
+#define LAPACK_sgehrd LAPACK_GLOBAL(sgehrd, SGEHRD)
+#define LAPACK_dgehrd LAPACK_GLOBAL(dgehrd, DGEHRD)
+#define LAPACK_cgehrd LAPACK_GLOBAL(cgehrd, CGEHRD)
+#define LAPACK_zgehrd LAPACK_GLOBAL(zgehrd, ZGEHRD)
+#define LAPACK_sorghr LAPACK_GLOBAL(sorghr, SORGHR)
+#define LAPACK_dorghr LAPACK_GLOBAL(dorghr, DORGHR)
+#define LAPACK_sormhr LAPACK_GLOBAL(sormhr, SORMHR)
+#define LAPACK_dormhr LAPACK_GLOBAL(dormhr, DORMHR)
+#define LAPACK_cunghr LAPACK_GLOBAL(cunghr, CUNGHR)
+#define LAPACK_zunghr LAPACK_GLOBAL(zunghr, ZUNGHR)
+#define LAPACK_cunmhr LAPACK_GLOBAL(cunmhr, CUNMHR)
+#define LAPACK_zunmhr LAPACK_GLOBAL(zunmhr, ZUNMHR)
+#define LAPACK_sgebal LAPACK_GLOBAL(sgebal, SGEBAL)
+#define LAPACK_dgebal LAPACK_GLOBAL(dgebal, DGEBAL)
+#define LAPACK_cgebal LAPACK_GLOBAL(cgebal, CGEBAL)
+#define LAPACK_zgebal LAPACK_GLOBAL(zgebal, ZGEBAL)
+#define LAPACK_sgebak LAPACK_GLOBAL(sgebak, SGEBAK)
+#define LAPACK_dgebak LAPACK_GLOBAL(dgebak, DGEBAK)
+#define LAPACK_cgebak LAPACK_GLOBAL(cgebak, CGEBAK)
+#define LAPACK_zgebak LAPACK_GLOBAL(zgebak, ZGEBAK)
+#define LAPACK_shseqr LAPACK_GLOBAL(shseqr, SHSEQR)
+#define LAPACK_dhseqr LAPACK_GLOBAL(dhseqr, DHSEQR)
+#define LAPACK_chseqr LAPACK_GLOBAL(chseqr, CHSEQR)
+#define LAPACK_zhseqr LAPACK_GLOBAL(zhseqr, ZHSEQR)
+#define LAPACK_shsein LAPACK_GLOBAL(shsein, SHSEIN)
+#define LAPACK_dhsein LAPACK_GLOBAL(dhsein, DHSEIN)
+#define LAPACK_chsein LAPACK_GLOBAL(chsein, CHSEIN)
+#define LAPACK_zhsein LAPACK_GLOBAL(zhsein, ZHSEIN)
+#define LAPACK_strevc LAPACK_GLOBAL(strevc, STREVC)
+#define LAPACK_dtrevc LAPACK_GLOBAL(dtrevc, DTREVC)
+#define LAPACK_ctrevc LAPACK_GLOBAL(ctrevc, CTREVC)
+#define LAPACK_ztrevc LAPACK_GLOBAL(ztrevc, ZTREVC)
+#define LAPACK_strsna LAPACK_GLOBAL(strsna, STRSNA)
+#define LAPACK_dtrsna LAPACK_GLOBAL(dtrsna, DTRSNA)
+#define LAPACK_ctrsna LAPACK_GLOBAL(ctrsna, CTRSNA)
+#define LAPACK_ztrsna LAPACK_GLOBAL(ztrsna, ZTRSNA)
+#define LAPACK_strexc LAPACK_GLOBAL(strexc, STREXC)
+#define LAPACK_dtrexc LAPACK_GLOBAL(dtrexc, DTREXC)
+#define LAPACK_ctrexc LAPACK_GLOBAL(ctrexc, CTREXC)
+#define LAPACK_ztrexc LAPACK_GLOBAL(ztrexc, ZTREXC)
+#define LAPACK_strsen LAPACK_GLOBAL(strsen, STRSEN)
+#define LAPACK_dtrsen LAPACK_GLOBAL(dtrsen, DTRSEN)
+#define LAPACK_ctrsen LAPACK_GLOBAL(ctrsen, CTRSEN)
+#define LAPACK_ztrsen LAPACK_GLOBAL(ztrsen, ZTRSEN)
+#define LAPACK_strsyl LAPACK_GLOBAL(strsyl, STRSYL)
+#define LAPACK_dtrsyl LAPACK_GLOBAL(dtrsyl, DTRSYL)
+#define LAPACK_ctrsyl LAPACK_GLOBAL(ctrsyl, CTRSYL)
+#define LAPACK_ztrsyl LAPACK_GLOBAL(ztrsyl, ZTRSYL)
+#define LAPACK_sgghrd LAPACK_GLOBAL(sgghrd, SGGHRD)
+#define LAPACK_dgghrd LAPACK_GLOBAL(dgghrd, DGGHRD)
+#define LAPACK_cgghrd LAPACK_GLOBAL(cgghrd, CGGHRD)
+#define LAPACK_zgghrd LAPACK_GLOBAL(zgghrd, ZGGHRD)
+#define LAPACK_sggbal LAPACK_GLOBAL(sggbal, SGGBAL)
+#define LAPACK_dggbal LAPACK_GLOBAL(dggbal, DGGBAL)
+#define LAPACK_cggbal LAPACK_GLOBAL(cggbal, CGGBAL)
+#define LAPACK_zggbal LAPACK_GLOBAL(zggbal, ZGGBAL)
+#define LAPACK_sggbak LAPACK_GLOBAL(sggbak, SGGBAK)
+#define LAPACK_dggbak LAPACK_GLOBAL(dggbak, DGGBAK)
+#define LAPACK_cggbak LAPACK_GLOBAL(cggbak, CGGBAK)
+#define LAPACK_zggbak LAPACK_GLOBAL(zggbak, ZGGBAK)
+#define LAPACK_shgeqz LAPACK_GLOBAL(shgeqz, SHGEQZ)
+#define LAPACK_dhgeqz LAPACK_GLOBAL(dhgeqz, DHGEQZ)
+#define LAPACK_chgeqz LAPACK_GLOBAL(chgeqz, CHGEQZ)
+#define LAPACK_zhgeqz LAPACK_GLOBAL(zhgeqz, ZHGEQZ)
+#define LAPACK_stgevc LAPACK_GLOBAL(stgevc, STGEVC)
+#define LAPACK_dtgevc LAPACK_GLOBAL(dtgevc, DTGEVC)
+#define LAPACK_ctgevc LAPACK_GLOBAL(ctgevc, CTGEVC)
+#define LAPACK_ztgevc LAPACK_GLOBAL(ztgevc, ZTGEVC)
+#define LAPACK_stgexc LAPACK_GLOBAL(stgexc, STGEXC)
+#define LAPACK_dtgexc LAPACK_GLOBAL(dtgexc, DTGEXC)
+#define LAPACK_ctgexc LAPACK_GLOBAL(ctgexc, CTGEXC)
+#define LAPACK_ztgexc LAPACK_GLOBAL(ztgexc, ZTGEXC)
+#define LAPACK_stgsen LAPACK_GLOBAL(stgsen, STGSEN)
+#define LAPACK_dtgsen LAPACK_GLOBAL(dtgsen, DTGSEN)
+#define LAPACK_ctgsen LAPACK_GLOBAL(ctgsen, CTGSEN)
+#define LAPACK_ztgsen LAPACK_GLOBAL(ztgsen, ZTGSEN)
+#define LAPACK_stgsyl LAPACK_GLOBAL(stgsyl, STGSYL)
+#define LAPACK_dtgsyl LAPACK_GLOBAL(dtgsyl, DTGSYL)
+#define LAPACK_ctgsyl LAPACK_GLOBAL(ctgsyl, CTGSYL)
+#define LAPACK_ztgsyl LAPACK_GLOBAL(ztgsyl, ZTGSYL)
+#define LAPACK_stgsna LAPACK_GLOBAL(stgsna, STGSNA)
+#define LAPACK_dtgsna LAPACK_GLOBAL(dtgsna, DTGSNA)
+#define LAPACK_ctgsna LAPACK_GLOBAL(ctgsna, CTGSNA)
+#define LAPACK_ztgsna LAPACK_GLOBAL(ztgsna, ZTGSNA)
+#define LAPACK_sggsvp LAPACK_GLOBAL(sggsvp, SGGSVP)
+#define LAPACK_dggsvp LAPACK_GLOBAL(dggsvp, DGGSVP)
+#define LAPACK_cggsvp LAPACK_GLOBAL(cggsvp, CGGSVP)
+#define LAPACK_zggsvp LAPACK_GLOBAL(zggsvp, ZGGSVP)
+#define LAPACK_stgsja LAPACK_GLOBAL(stgsja, STGSJA)
+#define LAPACK_dtgsja LAPACK_GLOBAL(dtgsja, DTGSJA)
+#define LAPACK_ctgsja LAPACK_GLOBAL(ctgsja, CTGSJA)
+#define LAPACK_ztgsja LAPACK_GLOBAL(ztgsja, ZTGSJA)
+#define LAPACK_sgels LAPACK_GLOBAL(sgels, SGELS)
+#define LAPACK_dgels LAPACK_GLOBAL(dgels, DGELS)
+#define LAPACK_cgels LAPACK_GLOBAL(cgels, CGELS)
+#define LAPACK_zgels LAPACK_GLOBAL(zgels, ZGELS)
+#define LAPACK_sgelsy LAPACK_GLOBAL(sgelsy, SGELSY)
+#define LAPACK_dgelsy LAPACK_GLOBAL(dgelsy, DGELSY)
+#define LAPACK_cgelsy LAPACK_GLOBAL(cgelsy, CGELSY)
+#define LAPACK_zgelsy LAPACK_GLOBAL(zgelsy, ZGELSY)
+#define LAPACK_sgelss LAPACK_GLOBAL(sgelss, SGELSS)
+#define LAPACK_dgelss LAPACK_GLOBAL(dgelss, DGELSS)
+#define LAPACK_cgelss LAPACK_GLOBAL(cgelss, CGELSS)
+#define LAPACK_zgelss LAPACK_GLOBAL(zgelss, ZGELSS)
+#define LAPACK_sgelsd LAPACK_GLOBAL(sgelsd, SGELSD)
+#define LAPACK_dgelsd LAPACK_GLOBAL(dgelsd, DGELSD)
+#define LAPACK_cgelsd LAPACK_GLOBAL(cgelsd, CGELSD)
+#define LAPACK_zgelsd LAPACK_GLOBAL(zgelsd, ZGELSD)
+#define LAPACK_sgglse LAPACK_GLOBAL(sgglse, SGGLSE)
+#define LAPACK_dgglse LAPACK_GLOBAL(dgglse, DGGLSE)
+#define LAPACK_cgglse LAPACK_GLOBAL(cgglse, CGGLSE)
+#define LAPACK_zgglse LAPACK_GLOBAL(zgglse, ZGGLSE)
+#define LAPACK_sggglm LAPACK_GLOBAL(sggglm, SGGGLM)
+#define LAPACK_dggglm LAPACK_GLOBAL(dggglm, DGGGLM)
+#define LAPACK_cggglm LAPACK_GLOBAL(cggglm, CGGGLM)
+#define LAPACK_zggglm LAPACK_GLOBAL(zggglm, ZGGGLM)
+#define LAPACK_ssyev LAPACK_GLOBAL(ssyev, SSYEV)
+#define LAPACK_dsyev LAPACK_GLOBAL(dsyev, DSYEV)
+#define LAPACK_cheev LAPACK_GLOBAL(cheev, CHEEV)
+#define LAPACK_zheev LAPACK_GLOBAL(zheev, ZHEEV)
+#define LAPACK_ssyevd LAPACK_GLOBAL(ssyevd, SSYEVD)
+#define LAPACK_dsyevd LAPACK_GLOBAL(dsyevd, DSYEVD)
+#define LAPACK_cheevd LAPACK_GLOBAL(cheevd, CHEEVD)
+#define LAPACK_zheevd LAPACK_GLOBAL(zheevd, ZHEEVD)
+#define LAPACK_ssyevx LAPACK_GLOBAL(ssyevx, SSYEVX)
+#define LAPACK_dsyevx LAPACK_GLOBAL(dsyevx, DSYEVX)
+#define LAPACK_cheevx LAPACK_GLOBAL(cheevx, CHEEVX)
+#define LAPACK_zheevx LAPACK_GLOBAL(zheevx, ZHEEVX)
+#define LAPACK_ssyevr LAPACK_GLOBAL(ssyevr, SSYEVR)
+#define LAPACK_dsyevr LAPACK_GLOBAL(dsyevr, DSYEVR)
+#define LAPACK_cheevr LAPACK_GLOBAL(cheevr, CHEEVR)
+#define LAPACK_zheevr LAPACK_GLOBAL(zheevr, ZHEEVR)
+#define LAPACK_sspev LAPACK_GLOBAL(sspev, SSPEV)
+#define LAPACK_dspev LAPACK_GLOBAL(dspev, DSPEV)
+#define LAPACK_chpev LAPACK_GLOBAL(chpev, CHPEV)
+#define LAPACK_zhpev LAPACK_GLOBAL(zhpev, ZHPEV)
+#define LAPACK_sspevd LAPACK_GLOBAL(sspevd, SSPEVD)
+#define LAPACK_dspevd LAPACK_GLOBAL(dspevd, DSPEVD)
+#define LAPACK_chpevd LAPACK_GLOBAL(chpevd, CHPEVD)
+#define LAPACK_zhpevd LAPACK_GLOBAL(zhpevd, ZHPEVD)
+#define LAPACK_sspevx LAPACK_GLOBAL(sspevx, SSPEVX)
+#define LAPACK_dspevx LAPACK_GLOBAL(dspevx, DSPEVX)
+#define LAPACK_chpevx LAPACK_GLOBAL(chpevx, CHPEVX)
+#define LAPACK_zhpevx LAPACK_GLOBAL(zhpevx, ZHPEVX)
+#define LAPACK_ssbev LAPACK_GLOBAL(ssbev, SSBEV)
+#define LAPACK_dsbev LAPACK_GLOBAL(dsbev, DSBEV)
+#define LAPACK_chbev LAPACK_GLOBAL(chbev, CHBEV)
+#define LAPACK_zhbev LAPACK_GLOBAL(zhbev, ZHBEV)
+#define LAPACK_ssbevd LAPACK_GLOBAL(ssbevd, SSBEVD)
+#define LAPACK_dsbevd LAPACK_GLOBAL(dsbevd, DSBEVD)
+#define LAPACK_chbevd LAPACK_GLOBAL(chbevd, CHBEVD)
+#define LAPACK_zhbevd LAPACK_GLOBAL(zhbevd, ZHBEVD)
+#define LAPACK_ssbevx LAPACK_GLOBAL(ssbevx, SSBEVX)
+#define LAPACK_dsbevx LAPACK_GLOBAL(dsbevx, DSBEVX)
+#define LAPACK_chbevx LAPACK_GLOBAL(chbevx, CHBEVX)
+#define LAPACK_zhbevx LAPACK_GLOBAL(zhbevx, ZHBEVX)
+#define LAPACK_sstev LAPACK_GLOBAL(sstev, SSTEV)
+#define LAPACK_dstev LAPACK_GLOBAL(dstev, DSTEV)
+#define LAPACK_sstevd LAPACK_GLOBAL(sstevd, SSTEVD)
+#define LAPACK_dstevd LAPACK_GLOBAL(dstevd, DSTEVD)
+#define LAPACK_sstevx LAPACK_GLOBAL(sstevx, SSTEVX)
+#define LAPACK_dstevx LAPACK_GLOBAL(dstevx, DSTEVX)
+#define LAPACK_sstevr LAPACK_GLOBAL(sstevr, SSTEVR)
+#define LAPACK_dstevr LAPACK_GLOBAL(dstevr, DSTEVR)
+#define LAPACK_sgees LAPACK_GLOBAL(sgees, SGEES)
+#define LAPACK_dgees LAPACK_GLOBAL(dgees, DGEES)
+#define LAPACK_cgees LAPACK_GLOBAL(cgees, CGEES)
+#define LAPACK_zgees LAPACK_GLOBAL(zgees, ZGEES)
+#define LAPACK_sgeesx LAPACK_GLOBAL(sgeesx, SGEESX)
+#define LAPACK_dgeesx LAPACK_GLOBAL(dgeesx, DGEESX)
+#define LAPACK_cgeesx LAPACK_GLOBAL(cgeesx, CGEESX)
+#define LAPACK_zgeesx LAPACK_GLOBAL(zgeesx, ZGEESX)
+#define LAPACK_sgeev LAPACK_GLOBAL(sgeev, SGEEV)
+#define LAPACK_dgeev LAPACK_GLOBAL(dgeev, DGEEV)
+#define LAPACK_cgeev LAPACK_GLOBAL(cgeev, CGEEV)
+#define LAPACK_zgeev LAPACK_GLOBAL(zgeev, ZGEEV)
+#define LAPACK_sgeevx LAPACK_GLOBAL(sgeevx, SGEEVX)
+#define LAPACK_dgeevx LAPACK_GLOBAL(dgeevx, DGEEVX)
+#define LAPACK_cgeevx LAPACK_GLOBAL(cgeevx, CGEEVX)
+#define LAPACK_zgeevx LAPACK_GLOBAL(zgeevx, ZGEEVX)
+#define LAPACK_sgesvd LAPACK_GLOBAL(sgesvd, SGESVD)
+#define LAPACK_dgesvd LAPACK_GLOBAL(dgesvd, DGESVD)
+#define LAPACK_cgesvd LAPACK_GLOBAL(cgesvd, CGESVD)
+#define LAPACK_zgesvd LAPACK_GLOBAL(zgesvd, ZGESVD)
+#define LAPACK_sgesdd LAPACK_GLOBAL(sgesdd, SGESDD)
+#define LAPACK_dgesdd LAPACK_GLOBAL(dgesdd, DGESDD)
+#define LAPACK_cgesdd LAPACK_GLOBAL(cgesdd, CGESDD)
+#define LAPACK_zgesdd LAPACK_GLOBAL(zgesdd, ZGESDD)
+#define LAPACK_dgejsv LAPACK_GLOBAL(dgejsv, DGEJSV)
+#define LAPACK_sgejsv LAPACK_GLOBAL(sgejsv, SGEJSV)
+#define LAPACK_dgesvj LAPACK_GLOBAL(dgesvj, DGESVJ)
+#define LAPACK_sgesvj LAPACK_GLOBAL(sgesvj, SGESVJ)
+#define LAPACK_sggsvd LAPACK_GLOBAL(sggsvd, SGGSVD)
+#define LAPACK_dggsvd LAPACK_GLOBAL(dggsvd, DGGSVD)
+#define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd, CGGSVD)
+#define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd, ZGGSVD)
+#define LAPACK_ssygv LAPACK_GLOBAL(ssygv, SSYGV)
+#define LAPACK_dsygv LAPACK_GLOBAL(dsygv, DSYGV)
+#define LAPACK_chegv LAPACK_GLOBAL(chegv, CHEGV)
+#define LAPACK_zhegv LAPACK_GLOBAL(zhegv, ZHEGV)
+#define LAPACK_ssygvd LAPACK_GLOBAL(ssygvd, SSYGVD)
+#define LAPACK_dsygvd LAPACK_GLOBAL(dsygvd, DSYGVD)
+#define LAPACK_chegvd LAPACK_GLOBAL(chegvd, CHEGVD)
+#define LAPACK_zhegvd LAPACK_GLOBAL(zhegvd, ZHEGVD)
+#define LAPACK_ssygvx LAPACK_GLOBAL(ssygvx, SSYGVX)
+#define LAPACK_dsygvx LAPACK_GLOBAL(dsygvx, DSYGVX)
+#define LAPACK_chegvx LAPACK_GLOBAL(chegvx, CHEGVX)
+#define LAPACK_zhegvx LAPACK_GLOBAL(zhegvx, ZHEGVX)
+#define LAPACK_sspgv LAPACK_GLOBAL(sspgv, SSPGV)
+#define LAPACK_dspgv LAPACK_GLOBAL(dspgv, DSPGV)
+#define LAPACK_chpgv LAPACK_GLOBAL(chpgv, CHPGV)
+#define LAPACK_zhpgv LAPACK_GLOBAL(zhpgv, ZHPGV)
+#define LAPACK_sspgvd LAPACK_GLOBAL(sspgvd, SSPGVD)
+#define LAPACK_dspgvd LAPACK_GLOBAL(dspgvd, DSPGVD)
+#define LAPACK_chpgvd LAPACK_GLOBAL(chpgvd, CHPGVD)
+#define LAPACK_zhpgvd LAPACK_GLOBAL(zhpgvd, ZHPGVD)
+#define LAPACK_sspgvx LAPACK_GLOBAL(sspgvx, SSPGVX)
+#define LAPACK_dspgvx LAPACK_GLOBAL(dspgvx, DSPGVX)
+#define LAPACK_chpgvx LAPACK_GLOBAL(chpgvx, CHPGVX)
+#define LAPACK_zhpgvx LAPACK_GLOBAL(zhpgvx, ZHPGVX)
+#define LAPACK_ssbgv LAPACK_GLOBAL(ssbgv, SSBGV)
+#define LAPACK_dsbgv LAPACK_GLOBAL(dsbgv, DSBGV)
+#define LAPACK_chbgv LAPACK_GLOBAL(chbgv, CHBGV)
+#define LAPACK_zhbgv LAPACK_GLOBAL(zhbgv, ZHBGV)
+#define LAPACK_ssbgvd LAPACK_GLOBAL(ssbgvd, SSBGVD)
+#define LAPACK_dsbgvd LAPACK_GLOBAL(dsbgvd, DSBGVD)
+#define LAPACK_chbgvd LAPACK_GLOBAL(chbgvd, CHBGVD)
+#define LAPACK_zhbgvd LAPACK_GLOBAL(zhbgvd, ZHBGVD)
+#define LAPACK_ssbgvx LAPACK_GLOBAL(ssbgvx, SSBGVX)
+#define LAPACK_dsbgvx LAPACK_GLOBAL(dsbgvx, DSBGVX)
+#define LAPACK_chbgvx LAPACK_GLOBAL(chbgvx, CHBGVX)
+#define LAPACK_zhbgvx LAPACK_GLOBAL(zhbgvx, ZHBGVX)
+#define LAPACK_sgges LAPACK_GLOBAL(sgges, SGGES)
+#define LAPACK_dgges LAPACK_GLOBAL(dgges, DGGES)
+#define LAPACK_cgges LAPACK_GLOBAL(cgges, CGGES)
+#define LAPACK_zgges LAPACK_GLOBAL(zgges, ZGGES)
+#define LAPACK_sggesx LAPACK_GLOBAL(sggesx, SGGESX)
+#define LAPACK_dggesx LAPACK_GLOBAL(dggesx, DGGESX)
+#define LAPACK_cggesx LAPACK_GLOBAL(cggesx, CGGESX)
+#define LAPACK_zggesx LAPACK_GLOBAL(zggesx, ZGGESX)
+#define LAPACK_sggev LAPACK_GLOBAL(sggev, SGGEV)
+#define LAPACK_dggev LAPACK_GLOBAL(dggev, DGGEV)
+#define LAPACK_cggev LAPACK_GLOBAL(cggev, CGGEV)
+#define LAPACK_zggev LAPACK_GLOBAL(zggev, ZGGEV)
+#define LAPACK_sggevx LAPACK_GLOBAL(sggevx, SGGEVX)
+#define LAPACK_dggevx LAPACK_GLOBAL(dggevx, DGGEVX)
+#define LAPACK_cggevx LAPACK_GLOBAL(cggevx, CGGEVX)
+#define LAPACK_zggevx LAPACK_GLOBAL(zggevx, ZGGEVX)
+#define LAPACK_dsfrk LAPACK_GLOBAL(dsfrk, DSFRK)
+#define LAPACK_ssfrk LAPACK_GLOBAL(ssfrk, SSFRK)
+#define LAPACK_zhfrk LAPACK_GLOBAL(zhfrk, ZHFRK)
+#define LAPACK_chfrk LAPACK_GLOBAL(chfrk, CHFRK)
+#define LAPACK_dtfsm LAPACK_GLOBAL(dtfsm, DTFSM)
+#define LAPACK_stfsm LAPACK_GLOBAL(stfsm, STFSM)
+#define LAPACK_ztfsm LAPACK_GLOBAL(ztfsm, ZTFSM)
+#define LAPACK_ctfsm LAPACK_GLOBAL(ctfsm, CTFSM)
+#define LAPACK_dtfttp LAPACK_GLOBAL(dtfttp, DTFTTP)
+#define LAPACK_stfttp LAPACK_GLOBAL(stfttp, STFTTP)
+#define LAPACK_ztfttp LAPACK_GLOBAL(ztfttp, ZTFTTP)
+#define LAPACK_ctfttp LAPACK_GLOBAL(ctfttp, CTFTTP)
+#define LAPACK_dtfttr LAPACK_GLOBAL(dtfttr, DTFTTR)
+#define LAPACK_stfttr LAPACK_GLOBAL(stfttr, STFTTR)
+#define LAPACK_ztfttr LAPACK_GLOBAL(ztfttr, ZTFTTR)
+#define LAPACK_ctfttr LAPACK_GLOBAL(ctfttr, CTFTTR)
+#define LAPACK_dtpttf LAPACK_GLOBAL(dtpttf, DTPTTF)
+#define LAPACK_stpttf LAPACK_GLOBAL(stpttf, STPTTF)
+#define LAPACK_ztpttf LAPACK_GLOBAL(ztpttf, ZTPTTF)
+#define LAPACK_ctpttf LAPACK_GLOBAL(ctpttf, CTPTTF)
+#define LAPACK_dtpttr LAPACK_GLOBAL(dtpttr, DTPTTR)
+#define LAPACK_stpttr LAPACK_GLOBAL(stpttr, STPTTR)
+#define LAPACK_ztpttr LAPACK_GLOBAL(ztpttr, ZTPTTR)
+#define LAPACK_ctpttr LAPACK_GLOBAL(ctpttr, CTPTTR)
+#define LAPACK_dtrttf LAPACK_GLOBAL(dtrttf, DTRTTF)
+#define LAPACK_strttf LAPACK_GLOBAL(strttf, STRTTF)
+#define LAPACK_ztrttf LAPACK_GLOBAL(ztrttf, ZTRTTF)
+#define LAPACK_ctrttf LAPACK_GLOBAL(ctrttf, CTRTTF)
+#define LAPACK_dtrttp LAPACK_GLOBAL(dtrttp, DTRTTP)
+#define LAPACK_strttp LAPACK_GLOBAL(strttp, STRTTP)
+#define LAPACK_ztrttp LAPACK_GLOBAL(ztrttp, ZTRTTP)
+#define LAPACK_ctrttp LAPACK_GLOBAL(ctrttp, CTRTTP)
+#define LAPACK_sgeqrfp LAPACK_GLOBAL(sgeqrfp, SGEQRFP)
+#define LAPACK_dgeqrfp LAPACK_GLOBAL(dgeqrfp, DGEQRFP)
+#define LAPACK_cgeqrfp LAPACK_GLOBAL(cgeqrfp, CGEQRFP)
+#define LAPACK_zgeqrfp LAPACK_GLOBAL(zgeqrfp, ZGEQRFP)
+#define LAPACK_clacgv LAPACK_GLOBAL(clacgv, CLACGV)
+#define LAPACK_zlacgv LAPACK_GLOBAL(zlacgv, ZLACGV)
+#define LAPACK_slarnv LAPACK_GLOBAL(slarnv, SLARNV)
+#define LAPACK_dlarnv LAPACK_GLOBAL(dlarnv, DLARNV)
+#define LAPACK_clarnv LAPACK_GLOBAL(clarnv, CLARNV)
+#define LAPACK_zlarnv LAPACK_GLOBAL(zlarnv, ZLARNV)
+#define LAPACK_sgeqr2 LAPACK_GLOBAL(sgeqr2, SGEQR2)
+#define LAPACK_dgeqr2 LAPACK_GLOBAL(dgeqr2, DGEQR2)
+#define LAPACK_cgeqr2 LAPACK_GLOBAL(cgeqr2, CGEQR2)
+#define LAPACK_zgeqr2 LAPACK_GLOBAL(zgeqr2, ZGEQR2)
+#define LAPACK_slacpy LAPACK_GLOBAL(slacpy, SLACPY)
+#define LAPACK_dlacpy LAPACK_GLOBAL(dlacpy, DLACPY)
+#define LAPACK_clacpy LAPACK_GLOBAL(clacpy, CLACPY)
+#define LAPACK_zlacpy LAPACK_GLOBAL(zlacpy, ZLACPY)
+#define LAPACK_sgetf2 LAPACK_GLOBAL(sgetf2, SGETF2)
+#define LAPACK_dgetf2 LAPACK_GLOBAL(dgetf2, DGETF2)
+#define LAPACK_cgetf2 LAPACK_GLOBAL(cgetf2, CGETF2)
+#define LAPACK_zgetf2 LAPACK_GLOBAL(zgetf2, ZGETF2)
+#define LAPACK_slaswp LAPACK_GLOBAL(slaswp, SLASWP)
+#define LAPACK_dlaswp LAPACK_GLOBAL(dlaswp, DLASWP)
+#define LAPACK_claswp LAPACK_GLOBAL(claswp, CLASWP)
+#define LAPACK_zlaswp LAPACK_GLOBAL(zlaswp, ZLASWP)
+#define LAPACK_slange LAPACK_GLOBAL(slange, SLANGE)
+#define LAPACK_dlange LAPACK_GLOBAL(dlange, DLANGE)
+#define LAPACK_clange LAPACK_GLOBAL(clange, CLANGE)
+#define LAPACK_zlange LAPACK_GLOBAL(zlange, ZLANGE)
+#define LAPACK_clanhe LAPACK_GLOBAL(clanhe, CLANHE)
+#define LAPACK_zlanhe LAPACK_GLOBAL(zlanhe, ZLANHE)
+#define LAPACK_slansy LAPACK_GLOBAL(slansy, SLANSY)
+#define LAPACK_dlansy LAPACK_GLOBAL(dlansy, DLANSY)
+#define LAPACK_clansy LAPACK_GLOBAL(clansy, CLANSY)
+#define LAPACK_zlansy LAPACK_GLOBAL(zlansy, ZLANSY)
+#define LAPACK_slantr LAPACK_GLOBAL(slantr, SLANTR)
+#define LAPACK_dlantr LAPACK_GLOBAL(dlantr, DLANTR)
+#define LAPACK_clantr LAPACK_GLOBAL(clantr, CLANTR)
+#define LAPACK_zlantr LAPACK_GLOBAL(zlantr, ZLANTR)
+#define LAPACK_slamch LAPACK_GLOBAL(slamch, SLAMCH)
+#define LAPACK_dlamch LAPACK_GLOBAL(dlamch, DLAMCH)
+#define LAPACK_sgelq2 LAPACK_GLOBAL(sgelq2, SGELQ2)
+#define LAPACK_dgelq2 LAPACK_GLOBAL(dgelq2, DGELQ2)
+#define LAPACK_cgelq2 LAPACK_GLOBAL(cgelq2, CGELQ2)
+#define LAPACK_zgelq2 LAPACK_GLOBAL(zgelq2, ZGELQ2)
+#define LAPACK_slarfb LAPACK_GLOBAL(slarfb, SLARFB)
+#define LAPACK_dlarfb LAPACK_GLOBAL(dlarfb, DLARFB)
+#define LAPACK_clarfb LAPACK_GLOBAL(clarfb, CLARFB)
+#define LAPACK_zlarfb LAPACK_GLOBAL(zlarfb, ZLARFB)
+#define LAPACK_slarfg LAPACK_GLOBAL(slarfg, SLARFG)
+#define LAPACK_dlarfg LAPACK_GLOBAL(dlarfg, DLARFG)
+#define LAPACK_clarfg LAPACK_GLOBAL(clarfg, CLARFG)
+#define LAPACK_zlarfg LAPACK_GLOBAL(zlarfg, ZLARFG)
+#define LAPACK_slarft LAPACK_GLOBAL(slarft, SLARFT)
+#define LAPACK_dlarft LAPACK_GLOBAL(dlarft, DLARFT)
+#define LAPACK_clarft LAPACK_GLOBAL(clarft, CLARFT)
+#define LAPACK_zlarft LAPACK_GLOBAL(zlarft, ZLARFT)
+#define LAPACK_slarfx LAPACK_GLOBAL(slarfx, SLARFX)
+#define LAPACK_dlarfx LAPACK_GLOBAL(dlarfx, DLARFX)
+#define LAPACK_clarfx LAPACK_GLOBAL(clarfx, CLARFX)
+#define LAPACK_zlarfx LAPACK_GLOBAL(zlarfx, ZLARFX)
+#define LAPACK_slatms LAPACK_GLOBAL(slatms, SLATMS)
+#define LAPACK_dlatms LAPACK_GLOBAL(dlatms, DLATMS)
+#define LAPACK_clatms LAPACK_GLOBAL(clatms, CLATMS)
+#define LAPACK_zlatms LAPACK_GLOBAL(zlatms, ZLATMS)
+#define LAPACK_slag2d LAPACK_GLOBAL(slag2d, SLAG2D)
+#define LAPACK_dlag2s LAPACK_GLOBAL(dlag2s, DLAG2S)
+#define LAPACK_clag2z LAPACK_GLOBAL(clag2z, CLAG2Z)
+#define LAPACK_zlag2c LAPACK_GLOBAL(zlag2c, ZLAG2C)
+#define LAPACK_slauum LAPACK_GLOBAL(slauum, SLAUUM)
+#define LAPACK_dlauum LAPACK_GLOBAL(dlauum, DLAUUM)
+#define LAPACK_clauum LAPACK_GLOBAL(clauum, CLAUUM)
+#define LAPACK_zlauum LAPACK_GLOBAL(zlauum, ZLAUUM)
+#define LAPACK_slagge LAPACK_GLOBAL(slagge, SLAGGE)
+#define LAPACK_dlagge LAPACK_GLOBAL(dlagge, DLAGGE)
+#define LAPACK_clagge LAPACK_GLOBAL(clagge, CLAGGE)
+#define LAPACK_zlagge LAPACK_GLOBAL(zlagge, ZLAGGE)
+#define LAPACK_slaset LAPACK_GLOBAL(slaset, SLASET)
+#define LAPACK_dlaset LAPACK_GLOBAL(dlaset, DLASET)
+#define LAPACK_claset LAPACK_GLOBAL(claset, CLASET)
+#define LAPACK_zlaset LAPACK_GLOBAL(zlaset, ZLASET)
+#define LAPACK_slasrt LAPACK_GLOBAL(slasrt, SLASRT)
+#define LAPACK_dlasrt LAPACK_GLOBAL(dlasrt, DLASRT)
+#define LAPACK_slagsy LAPACK_GLOBAL(slagsy, SLAGSY)
+#define LAPACK_dlagsy LAPACK_GLOBAL(dlagsy, DLAGSY)
+#define LAPACK_clagsy LAPACK_GLOBAL(clagsy, CLAGSY)
+#define LAPACK_zlagsy LAPACK_GLOBAL(zlagsy, ZLAGSY)
+#define LAPACK_claghe LAPACK_GLOBAL(claghe, CLAGHE)
+#define LAPACK_zlaghe LAPACK_GLOBAL(zlaghe, ZLAGHE)
+#define LAPACK_slapmr LAPACK_GLOBAL(slapmr, SLAPMR)
+#define LAPACK_dlapmr LAPACK_GLOBAL(dlapmr, DLAPMR)
+#define LAPACK_clapmr LAPACK_GLOBAL(clapmr, CLAPMR)
+#define LAPACK_zlapmr LAPACK_GLOBAL(zlapmr, ZLAPMR)
+#define LAPACK_slapy2 LAPACK_GLOBAL(slapy2, SLAPY2)
+#define LAPACK_dlapy2 LAPACK_GLOBAL(dlapy2, DLAPY2)
+#define LAPACK_slapy3 LAPACK_GLOBAL(slapy3, SLAPY3)
+#define LAPACK_dlapy3 LAPACK_GLOBAL(dlapy3, DLAPY3)
+#define LAPACK_slartgp LAPACK_GLOBAL(slartgp, SLARTGP)
+#define LAPACK_dlartgp LAPACK_GLOBAL(dlartgp, DLARTGP)
+#define LAPACK_slartgs LAPACK_GLOBAL(slartgs, SLARTGS)
+#define LAPACK_dlartgs LAPACK_GLOBAL(dlartgs, DLARTGS)
+// LAPACK 3.3.0
+#define LAPACK_cbbcsd LAPACK_GLOBAL(cbbcsd, CBBCSD)
+#define LAPACK_cheswapr LAPACK_GLOBAL(cheswapr, CHESWAPR)
+#define LAPACK_chetri2 LAPACK_GLOBAL(chetri2, CHETRI2)
+#define LAPACK_chetri2x LAPACK_GLOBAL(chetri2x, CHETRI2X)
+#define LAPACK_chetrs2 LAPACK_GLOBAL(chetrs2, CHETRS2)
+#define LAPACK_csyconv LAPACK_GLOBAL(csyconv, CSYCONV)
+#define LAPACK_csyswapr LAPACK_GLOBAL(csyswapr, CSYSWAPR)
+#define LAPACK_csytri2 LAPACK_GLOBAL(csytri2, CSYTRI2)
+#define LAPACK_csytri2x LAPACK_GLOBAL(csytri2x, CSYTRI2X)
+#define LAPACK_csytrs2 LAPACK_GLOBAL(csytrs2, CSYTRS2)
+#define LAPACK_cunbdb LAPACK_GLOBAL(cunbdb, CUNBDB)
+#define LAPACK_cuncsd LAPACK_GLOBAL(cuncsd, CUNCSD)
+#define LAPACK_dbbcsd LAPACK_GLOBAL(dbbcsd, DBBCSD)
+#define LAPACK_dorbdb LAPACK_GLOBAL(dorbdb, DORBDB)
+#define LAPACK_dorcsd LAPACK_GLOBAL(dorcsd, DORCSD)
+#define LAPACK_dsyconv LAPACK_GLOBAL(dsyconv, DSYCONV)
+#define LAPACK_dsyswapr LAPACK_GLOBAL(dsyswapr, DSYSWAPR)
+#define LAPACK_dsytri2 LAPACK_GLOBAL(dsytri2, DSYTRI2)
+#define LAPACK_dsytri2x LAPACK_GLOBAL(dsytri2x, DSYTRI2X)
+#define LAPACK_dsytrs2 LAPACK_GLOBAL(dsytrs2, DSYTRS2)
+#define LAPACK_sbbcsd LAPACK_GLOBAL(sbbcsd, SBBCSD)
+#define LAPACK_sorbdb LAPACK_GLOBAL(sorbdb, SORBDB)
+#define LAPACK_sorcsd LAPACK_GLOBAL(sorcsd, SORCSD)
+#define LAPACK_ssyconv LAPACK_GLOBAL(ssyconv, SSYCONV)
+#define LAPACK_ssyswapr LAPACK_GLOBAL(ssyswapr, SSYSWAPR)
+#define LAPACK_ssytri2 LAPACK_GLOBAL(ssytri2, SSYTRI2)
+#define LAPACK_ssytri2x LAPACK_GLOBAL(ssytri2x, SSYTRI2X)
+#define LAPACK_ssytrs2 LAPACK_GLOBAL(ssytrs2, SSYTRS2)
+#define LAPACK_zbbcsd LAPACK_GLOBAL(zbbcsd, ZBBCSD)
+#define LAPACK_zheswapr LAPACK_GLOBAL(zheswapr, ZHESWAPR)
+#define LAPACK_zhetri2 LAPACK_GLOBAL(zhetri2, ZHETRI2)
+#define LAPACK_zhetri2x LAPACK_GLOBAL(zhetri2x, ZHETRI2X)
+#define LAPACK_zhetrs2 LAPACK_GLOBAL(zhetrs2, ZHETRS2)
+#define LAPACK_zsyconv LAPACK_GLOBAL(zsyconv, ZSYCONV)
+#define LAPACK_zsyswapr LAPACK_GLOBAL(zsyswapr, ZSYSWAPR)
+#define LAPACK_zsytri2 LAPACK_GLOBAL(zsytri2, ZSYTRI2)
+#define LAPACK_zsytri2x LAPACK_GLOBAL(zsytri2x, ZSYTRI2X)
+#define LAPACK_zsytrs2 LAPACK_GLOBAL(zsytrs2, ZSYTRS2)
+#define LAPACK_zunbdb LAPACK_GLOBAL(zunbdb, ZUNBDB)
+#define LAPACK_zuncsd LAPACK_GLOBAL(zuncsd, ZUNCSD)
+// LAPACK 3.4.0
+#define LAPACK_sgemqrt LAPACK_GLOBAL(sgemqrt, SGEMQRT)
+#define LAPACK_dgemqrt LAPACK_GLOBAL(dgemqrt, DGEMQRT)
+#define LAPACK_cgemqrt LAPACK_GLOBAL(cgemqrt, CGEMQRT)
+#define LAPACK_zgemqrt LAPACK_GLOBAL(zgemqrt, ZGEMQRT)
+#define LAPACK_sgeqrt LAPACK_GLOBAL(sgeqrt, SGEQRT)
+#define LAPACK_dgeqrt LAPACK_GLOBAL(dgeqrt, DGEQRT)
+#define LAPACK_cgeqrt LAPACK_GLOBAL(cgeqrt, CGEQRT)
+#define LAPACK_zgeqrt LAPACK_GLOBAL(zgeqrt, ZGEQRT)
+#define LAPACK_sgeqrt2 LAPACK_GLOBAL(sgeqrt2, SGEQRT2)
+#define LAPACK_dgeqrt2 LAPACK_GLOBAL(dgeqrt2, DGEQRT2)
+#define LAPACK_cgeqrt2 LAPACK_GLOBAL(cgeqrt2, CGEQRT2)
+#define LAPACK_zgeqrt2 LAPACK_GLOBAL(zgeqrt2, ZGEQRT2)
+#define LAPACK_sgeqrt3 LAPACK_GLOBAL(sgeqrt3, SGEQRT3)
+#define LAPACK_dgeqrt3 LAPACK_GLOBAL(dgeqrt3, DGEQRT3)
+#define LAPACK_cgeqrt3 LAPACK_GLOBAL(cgeqrt3, CGEQRT3)
+#define LAPACK_zgeqrt3 LAPACK_GLOBAL(zgeqrt3, ZGEQRT3)
+#define LAPACK_stpmqrt LAPACK_GLOBAL(stpmqrt, STPMQRT)
+#define LAPACK_dtpmqrt LAPACK_GLOBAL(dtpmqrt, DTPMQRT)
+#define LAPACK_ctpmqrt LAPACK_GLOBAL(ctpmqrt, CTPMQRT)
+#define LAPACK_ztpmqrt LAPACK_GLOBAL(ztpmqrt, ZTPMQRT)
+#define LAPACK_dtpqrt LAPACK_GLOBAL(dtpqrt, DTPQRT)
+#define LAPACK_ctpqrt LAPACK_GLOBAL(ctpqrt, CTPQRT)
+#define LAPACK_ztpqrt LAPACK_GLOBAL(ztpqrt, ZTPQRT)
+#define LAPACK_stpqrt2 LAPACK_GLOBAL(stpqrt2, STPQRT2)
+#define LAPACK_dtpqrt2 LAPACK_GLOBAL(dtpqrt2, DTPQRT2)
+#define LAPACK_ctpqrt2 LAPACK_GLOBAL(ctpqrt2, CTPQRT2)
+#define LAPACK_ztpqrt2 LAPACK_GLOBAL(ztpqrt2, ZTPQRT2)
+#define LAPACK_stprfb LAPACK_GLOBAL(stprfb, STPRFB)
+#define LAPACK_dtprfb LAPACK_GLOBAL(dtprfb, DTPRFB)
+#define LAPACK_ctprfb LAPACK_GLOBAL(ctprfb, CTPRFB)
+#define LAPACK_ztprfb LAPACK_GLOBAL(ztprfb, ZTPRFB)
+// LAPACK 3.X.X
+#define LAPACK_csyr LAPACK_GLOBAL(csyr, CSYR)
+#define LAPACK_zsyr LAPACK_GLOBAL(zsyr, ZSYR)
+
+void LAPACK_sgetrf(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, lapack_int* ipiv, lapack_int* info);
+void LAPACK_dgetrf(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, lapack_int* ipiv, lapack_int* info);
+void LAPACK_cgetrf(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_int* info);
+void LAPACK_zgetrf(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_int* info);
+void LAPACK_sgbtrf(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, float* ab, lapack_int* ldab,
+                   lapack_int* ipiv, lapack_int* info);
+void LAPACK_dgbtrf(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, double* ab, lapack_int* ldab,
+                   lapack_int* ipiv, lapack_int* info);
+void LAPACK_cgbtrf(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_complex_float* ab,
+                   lapack_int* ldab, lapack_int* ipiv, lapack_int* info);
+void LAPACK_zgbtrf(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_complex_double* ab,
+                   lapack_int* ldab, lapack_int* ipiv, lapack_int* info);
+void LAPACK_sgttrf(lapack_int* n, float* dl, float* d, float* du, float* du2, lapack_int* ipiv, lapack_int* info);
+void LAPACK_dgttrf(lapack_int* n, double* dl, double* d, double* du, double* du2, lapack_int* ipiv, lapack_int* info);
+void LAPACK_cgttrf(lapack_int* n, lapack_complex_float* dl, lapack_complex_float* d, lapack_complex_float* du,
+                   lapack_complex_float* du2, lapack_int* ipiv, lapack_int* info);
+void LAPACK_zgttrf(lapack_int* n, lapack_complex_double* dl, lapack_complex_double* d, lapack_complex_double* du,
+                   lapack_complex_double* du2, lapack_int* ipiv, lapack_int* info);
+void LAPACK_spotrf(char* uplo, lapack_int* n, float* a, lapack_int* lda, lapack_int* info);
+void LAPACK_dpotrf(char* uplo, lapack_int* n, double* a, lapack_int* lda, lapack_int* info);
+void LAPACK_cpotrf(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* info);
+void LAPACK_zpotrf(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* info);
+void LAPACK_dpstrf(char* uplo, lapack_int* n, double* a, lapack_int* lda, lapack_int* piv, lapack_int* rank,
+                   double* tol, double* work, lapack_int* info);
+void LAPACK_spstrf(char* uplo, lapack_int* n, float* a, lapack_int* lda, lapack_int* piv, lapack_int* rank, float* tol,
+                   float* work, lapack_int* info);
+void LAPACK_zpstrf(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* piv,
+                   lapack_int* rank, double* tol, double* work, lapack_int* info);
+void LAPACK_cpstrf(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* piv,
+                   lapack_int* rank, float* tol, float* work, lapack_int* info);
+void LAPACK_dpftrf(char* transr, char* uplo, lapack_int* n, double* a, lapack_int* info);
+void LAPACK_spftrf(char* transr, char* uplo, lapack_int* n, float* a, lapack_int* info);
+void LAPACK_zpftrf(char* transr, char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* info);
+void LAPACK_cpftrf(char* transr, char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* info);
+void LAPACK_spptrf(char* uplo, lapack_int* n, float* ap, lapack_int* info);
+void LAPACK_dpptrf(char* uplo, lapack_int* n, double* ap, lapack_int* info);
+void LAPACK_cpptrf(char* uplo, lapack_int* n, lapack_complex_float* ap, lapack_int* info);
+void LAPACK_zpptrf(char* uplo, lapack_int* n, lapack_complex_double* ap, lapack_int* info);
+void LAPACK_spbtrf(char* uplo, lapack_int* n, lapack_int* kd, float* ab, lapack_int* ldab, lapack_int* info);
+void LAPACK_dpbtrf(char* uplo, lapack_int* n, lapack_int* kd, double* ab, lapack_int* ldab, lapack_int* info);
+void LAPACK_cpbtrf(char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_float* ab, lapack_int* ldab,
+                   lapack_int* info);
+void LAPACK_zpbtrf(char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_double* ab, lapack_int* ldab,
+                   lapack_int* info);
+void LAPACK_spttrf(lapack_int* n, float* d, float* e, lapack_int* info);
+void LAPACK_dpttrf(lapack_int* n, double* d, double* e, lapack_int* info);
+void LAPACK_cpttrf(lapack_int* n, float* d, lapack_complex_float* e, lapack_int* info);
+void LAPACK_zpttrf(lapack_int* n, double* d, lapack_complex_double* e, lapack_int* info);
+void LAPACK_ssytrf(char* uplo, lapack_int* n, float* a, lapack_int* lda, lapack_int* ipiv, float* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_dsytrf(char* uplo, lapack_int* n, double* a, lapack_int* lda, lapack_int* ipiv, double* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_csytrf(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zsytrf(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_chetrf(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zhetrf(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_ssptrf(char* uplo, lapack_int* n, float* ap, lapack_int* ipiv, lapack_int* info);
+void LAPACK_dsptrf(char* uplo, lapack_int* n, double* ap, lapack_int* ipiv, lapack_int* info);
+void LAPACK_csptrf(char* uplo, lapack_int* n, lapack_complex_float* ap, lapack_int* ipiv, lapack_int* info);
+void LAPACK_zsptrf(char* uplo, lapack_int* n, lapack_complex_double* ap, lapack_int* ipiv, lapack_int* info);
+void LAPACK_chptrf(char* uplo, lapack_int* n, lapack_complex_float* ap, lapack_int* ipiv, lapack_int* info);
+void LAPACK_zhptrf(char* uplo, lapack_int* n, lapack_complex_double* ap, lapack_int* ipiv, lapack_int* info);
+void LAPACK_sgetrs(char* trans, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda,
+                   const lapack_int* ipiv, float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_dgetrs(char* trans, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda,
+                   const lapack_int* ipiv, double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_cgetrs(char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
+                   const lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_zgetrs(char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
+                   const lapack_int* ipiv, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_sgbtrs(char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, const float* ab,
+                   lapack_int* ldab, const lapack_int* ipiv, float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_dgbtrs(char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, const double* ab,
+                   lapack_int* ldab, const lapack_int* ipiv, double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_cgbtrs(char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
+                   const lapack_complex_float* ab, lapack_int* ldab, const lapack_int* ipiv, lapack_complex_float* b,
+                   lapack_int* ldb, lapack_int* info);
+void LAPACK_zgbtrs(char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
+                   const lapack_complex_double* ab, lapack_int* ldab, const lapack_int* ipiv, lapack_complex_double* b,
+                   lapack_int* ldb, lapack_int* info);
+void LAPACK_sgttrs(char* trans, lapack_int* n, lapack_int* nrhs, const float* dl, const float* d, const float* du,
+                   const float* du2, const lapack_int* ipiv, float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_dgttrs(char* trans, lapack_int* n, lapack_int* nrhs, const double* dl, const double* d, const double* du,
+                   const double* du2, const lapack_int* ipiv, double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_cgttrs(char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* dl,
+                   const lapack_complex_float* d, const lapack_complex_float* du, const lapack_complex_float* du2,
+                   const lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_zgttrs(char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* dl,
+                   const lapack_complex_double* d, const lapack_complex_double* du, const lapack_complex_double* du2,
+                   const lapack_int* ipiv, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_spotrs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda, float* b,
+                   lapack_int* ldb, lapack_int* info);
+void LAPACK_dpotrs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda, double* b,
+                   lapack_int* ldb, lapack_int* info);
+void LAPACK_cpotrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_zpotrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_dpftrs(char* transr, char* uplo, lapack_int* n, lapack_int* nrhs, const double* a, double* b,
+                   lapack_int* ldb, lapack_int* info);
+void LAPACK_spftrs(char* transr, char* uplo, lapack_int* n, lapack_int* nrhs, const float* a, float* b, lapack_int* ldb,
+                   lapack_int* info);
+void LAPACK_zpftrs(char* transr, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a,
+                   lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_cpftrs(char* transr, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_spptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* ap, float* b, lapack_int* ldb,
+                   lapack_int* info);
+void LAPACK_dpptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* ap, double* b, lapack_int* ldb,
+                   lapack_int* info);
+void LAPACK_cpptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap, lapack_complex_float* b,
+                   lapack_int* ldb, lapack_int* info);
+void LAPACK_zpptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* ap,
+                   lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_spbtrs(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, const float* ab, lapack_int* ldab,
+                   float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_dpbtrs(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, const double* ab, lapack_int* ldab,
+                   double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_cpbtrs(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, const lapack_complex_float* ab,
+                   lapack_int* ldab, lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_zpbtrs(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, const lapack_complex_double* ab,
+                   lapack_int* ldab, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_spttrs(lapack_int* n, lapack_int* nrhs, const float* d, const float* e, float* b, lapack_int* ldb,
+                   lapack_int* info);
+void LAPACK_dpttrs(lapack_int* n, lapack_int* nrhs, const double* d, const double* e, double* b, lapack_int* ldb,
+                   lapack_int* info);
+void LAPACK_cpttrs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* d, const lapack_complex_float* e,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_zpttrs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* d, const lapack_complex_double* e,
+                   lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_ssytrs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda, const lapack_int* ipiv,
+                   float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_dsytrs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda,
+                   const lapack_int* ipiv, double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_csytrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
+                   const lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_zsytrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
+                   const lapack_int* ipiv, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_chetrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
+                   const lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_zhetrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
+                   const lapack_int* ipiv, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_ssptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* ap, const lapack_int* ipiv, float* b,
+                   lapack_int* ldb, lapack_int* info);
+void LAPACK_dsptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* ap, const lapack_int* ipiv, double* b,
+                   lapack_int* ldb, lapack_int* info);
+void LAPACK_csptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap, const lapack_int* ipiv,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_zsptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* ap, const lapack_int* ipiv,
+                   lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_chptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap, const lapack_int* ipiv,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_zhptrs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* ap, const lapack_int* ipiv,
+                   lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_strtrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const float* a,
+                   lapack_int* lda, float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_dtrtrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const double* a,
+                   lapack_int* lda, double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_ctrtrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a,
+                   lapack_int* lda, lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_ztrtrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a,
+                   lapack_int* lda, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_stptrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const float* ap, float* b,
+                   lapack_int* ldb, lapack_int* info);
+void LAPACK_dtptrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const double* ap, double* b,
+                   lapack_int* ldb, lapack_int* info);
+void LAPACK_ctptrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_ztptrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs,
+                   const lapack_complex_double* ap, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_stbtrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   const float* ab, lapack_int* ldab, float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_dtbtrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   const double* ab, lapack_int* ldab, double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_ctbtrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   const lapack_complex_float* ab, lapack_int* ldab, lapack_complex_float* b, lapack_int* ldb,
+                   lapack_int* info);
+void LAPACK_ztbtrs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   const lapack_complex_double* ab, lapack_int* ldab, lapack_complex_double* b, lapack_int* ldb,
+                   lapack_int* info);
+void LAPACK_sgecon(char* norm, lapack_int* n, const float* a, lapack_int* lda, float* anorm, float* rcond, float* work,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_dgecon(char* norm, lapack_int* n, const double* a, lapack_int* lda, double* anorm, double* rcond,
+                   double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_cgecon(char* norm, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* anorm,
+                   float* rcond, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zgecon(char* norm, lapack_int* n, const lapack_complex_double* a, lapack_int* lda, double* anorm,
+                   double* rcond, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_sgbcon(char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku, const float* ab, lapack_int* ldab,
+                   const lapack_int* ipiv, float* anorm, float* rcond, float* work, lapack_int* iwork,
+                   lapack_int* info);
+void LAPACK_dgbcon(char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku, const double* ab, lapack_int* ldab,
+                   const lapack_int* ipiv, double* anorm, double* rcond, double* work, lapack_int* iwork,
+                   lapack_int* info);
+void LAPACK_cgbcon(char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku, const lapack_complex_float* ab,
+                   lapack_int* ldab, const lapack_int* ipiv, float* anorm, float* rcond, lapack_complex_float* work,
+                   float* rwork, lapack_int* info);
+void LAPACK_zgbcon(char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku, const lapack_complex_double* ab,
+                   lapack_int* ldab, const lapack_int* ipiv, double* anorm, double* rcond, lapack_complex_double* work,
+                   double* rwork, lapack_int* info);
+void LAPACK_sgtcon(char* norm, lapack_int* n, const float* dl, const float* d, const float* du, const float* du2,
+                   const lapack_int* ipiv, float* anorm, float* rcond, float* work, lapack_int* iwork,
+                   lapack_int* info);
+void LAPACK_dgtcon(char* norm, lapack_int* n, const double* dl, const double* d, const double* du, const double* du2,
+                   const lapack_int* ipiv, double* anorm, double* rcond, double* work, lapack_int* iwork,
+                   lapack_int* info);
+void LAPACK_cgtcon(char* norm, lapack_int* n, const lapack_complex_float* dl, const lapack_complex_float* d,
+                   const lapack_complex_float* du, const lapack_complex_float* du2, const lapack_int* ipiv,
+                   float* anorm, float* rcond, lapack_complex_float* work, lapack_int* info);
+void LAPACK_zgtcon(char* norm, lapack_int* n, const lapack_complex_double* dl, const lapack_complex_double* d,
+                   const lapack_complex_double* du, const lapack_complex_double* du2, const lapack_int* ipiv,
+                   double* anorm, double* rcond, lapack_complex_double* work, lapack_int* info);
+void LAPACK_spocon(char* uplo, lapack_int* n, const float* a, lapack_int* lda, float* anorm, float* rcond, float* work,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_dpocon(char* uplo, lapack_int* n, const double* a, lapack_int* lda, double* anorm, double* rcond,
+                   double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_cpocon(char* uplo, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* anorm,
+                   float* rcond, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zpocon(char* uplo, lapack_int* n, const lapack_complex_double* a, lapack_int* lda, double* anorm,
+                   double* rcond, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_sppcon(char* uplo, lapack_int* n, const float* ap, float* anorm, float* rcond, float* work,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_dppcon(char* uplo, lapack_int* n, const double* ap, double* anorm, double* rcond, double* work,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_cppcon(char* uplo, lapack_int* n, const lapack_complex_float* ap, float* anorm, float* rcond,
+                   lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zppcon(char* uplo, lapack_int* n, const lapack_complex_double* ap, double* anorm, double* rcond,
+                   lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_spbcon(char* uplo, lapack_int* n, lapack_int* kd, const float* ab, lapack_int* ldab, float* anorm,
+                   float* rcond, float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_dpbcon(char* uplo, lapack_int* n, lapack_int* kd, const double* ab, lapack_int* ldab, double* anorm,
+                   double* rcond, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_cpbcon(char* uplo, lapack_int* n, lapack_int* kd, const lapack_complex_float* ab, lapack_int* ldab,
+                   float* anorm, float* rcond, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zpbcon(char* uplo, lapack_int* n, lapack_int* kd, const lapack_complex_double* ab, lapack_int* ldab,
+                   double* anorm, double* rcond, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_sptcon(lapack_int* n, const float* d, const float* e, float* anorm, float* rcond, float* work,
+                   lapack_int* info);
+void LAPACK_dptcon(lapack_int* n, const double* d, const double* e, double* anorm, double* rcond, double* work,
+                   lapack_int* info);
+void LAPACK_cptcon(lapack_int* n, const float* d, const lapack_complex_float* e, float* anorm, float* rcond,
+                   float* work, lapack_int* info);
+void LAPACK_zptcon(lapack_int* n, const double* d, const lapack_complex_double* e, double* anorm, double* rcond,
+                   double* work, lapack_int* info);
+void LAPACK_ssycon(char* uplo, lapack_int* n, const float* a, lapack_int* lda, const lapack_int* ipiv, float* anorm,
+                   float* rcond, float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_dsycon(char* uplo, lapack_int* n, const double* a, lapack_int* lda, const lapack_int* ipiv, double* anorm,
+                   double* rcond, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_csycon(char* uplo, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, const lapack_int* ipiv,
+                   float* anorm, float* rcond, lapack_complex_float* work, lapack_int* info);
+void LAPACK_zsycon(char* uplo, lapack_int* n, const lapack_complex_double* a, lapack_int* lda, const lapack_int* ipiv,
+                   double* anorm, double* rcond, lapack_complex_double* work, lapack_int* info);
+void LAPACK_checon(char* uplo, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, const lapack_int* ipiv,
+                   float* anorm, float* rcond, lapack_complex_float* work, lapack_int* info);
+void LAPACK_zhecon(char* uplo, lapack_int* n, const lapack_complex_double* a, lapack_int* lda, const lapack_int* ipiv,
+                   double* anorm, double* rcond, lapack_complex_double* work, lapack_int* info);
+void LAPACK_sspcon(char* uplo, lapack_int* n, const float* ap, const lapack_int* ipiv, float* anorm, float* rcond,
+                   float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_dspcon(char* uplo, lapack_int* n, const double* ap, const lapack_int* ipiv, double* anorm, double* rcond,
+                   double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_cspcon(char* uplo, lapack_int* n, const lapack_complex_float* ap, const lapack_int* ipiv, float* anorm,
+                   float* rcond, lapack_complex_float* work, lapack_int* info);
+void LAPACK_zspcon(char* uplo, lapack_int* n, const lapack_complex_double* ap, const lapack_int* ipiv, double* anorm,
+                   double* rcond, lapack_complex_double* work, lapack_int* info);
+void LAPACK_chpcon(char* uplo, lapack_int* n, const lapack_complex_float* ap, const lapack_int* ipiv, float* anorm,
+                   float* rcond, lapack_complex_float* work, lapack_int* info);
+void LAPACK_zhpcon(char* uplo, lapack_int* n, const lapack_complex_double* ap, const lapack_int* ipiv, double* anorm,
+                   double* rcond, lapack_complex_double* work, lapack_int* info);
+void LAPACK_strcon(char* norm, char* uplo, char* diag, lapack_int* n, const float* a, lapack_int* lda, float* rcond,
+                   float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_dtrcon(char* norm, char* uplo, char* diag, lapack_int* n, const double* a, lapack_int* lda, double* rcond,
+                   double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_ctrcon(char* norm, char* uplo, char* diag, lapack_int* n, const lapack_complex_float* a, lapack_int* lda,
+                   float* rcond, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_ztrcon(char* norm, char* uplo, char* diag, lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
+                   double* rcond, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_stpcon(char* norm, char* uplo, char* diag, lapack_int* n, const float* ap, float* rcond, float* work,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_dtpcon(char* norm, char* uplo, char* diag, lapack_int* n, const double* ap, double* rcond, double* work,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_ctpcon(char* norm, char* uplo, char* diag, lapack_int* n, const lapack_complex_float* ap, float* rcond,
+                   lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_ztpcon(char* norm, char* uplo, char* diag, lapack_int* n, const lapack_complex_double* ap, double* rcond,
+                   lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_stbcon(char* norm, char* uplo, char* diag, lapack_int* n, lapack_int* kd, const float* ab, lapack_int* ldab,
+                   float* rcond, float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_dtbcon(char* norm, char* uplo, char* diag, lapack_int* n, lapack_int* kd, const double* ab,
+                   lapack_int* ldab, double* rcond, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_ctbcon(char* norm, char* uplo, char* diag, lapack_int* n, lapack_int* kd, const lapack_complex_float* ab,
+                   lapack_int* ldab, float* rcond, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_ztbcon(char* norm, char* uplo, char* diag, lapack_int* n, lapack_int* kd, const lapack_complex_double* ab,
+                   lapack_int* ldab, double* rcond, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_sgerfs(char* trans, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda, const float* af,
+                   lapack_int* ldaf, const lapack_int* ipiv, const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                   float* ferr, float* berr, float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_dgerfs(char* trans, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda, const double* af,
+                   lapack_int* ldaf, const lapack_int* ipiv, const double* b, lapack_int* ldb, double* x,
+                   lapack_int* ldx, double* ferr, double* berr, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_cgerfs(char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
+                   const lapack_complex_float* af, lapack_int* ldaf, const lapack_int* ipiv,
+                   const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                   float* ferr, float* berr, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zgerfs(char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
+                   const lapack_complex_double* af, lapack_int* ldaf, const lapack_int* ipiv,
+                   const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                   double* ferr, double* berr, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_dgerfsx(char* trans, char* equed, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda,
+                    const double* af, lapack_int* ldaf, const lapack_int* ipiv, const double* r, const double* c,
+                    const double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* rcond, double* berr,
+                    lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int* nparams,
+                    double* params, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_sgerfsx(char* trans, char* equed, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda,
+                    const float* af, lapack_int* ldaf, const lapack_int* ipiv, const float* r, const float* c,
+                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* rcond, float* berr,
+                    lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int* nparams,
+                    float* params, float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_zgerfsx(char* trans, char* equed, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* af, lapack_int* ldaf, const lapack_int* ipiv,
+                    const double* r, const double* c, const lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* rcond, double* berr, lapack_int* n_err_bnds,
+                    double* err_bnds_norm, double* err_bnds_comp, lapack_int* nparams, double* params,
+                    lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_cgerfsx(char* trans, char* equed, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* af, lapack_int* ldaf, const lapack_int* ipiv,
+                    const float* r, const float* c, const lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* rcond, float* berr, lapack_int* n_err_bnds,
+                    float* err_bnds_norm, float* err_bnds_comp, lapack_int* nparams, float* params,
+                    lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_sgbrfs(char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, const float* ab,
+                   lapack_int* ldab, const float* afb, lapack_int* ldafb, const lapack_int* ipiv, const float* b,
+                   lapack_int* ldb, float* x, lapack_int* ldx, float* ferr, float* berr, float* work, lapack_int* iwork,
+                   lapack_int* info);
+void LAPACK_dgbrfs(char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, const double* ab,
+                   lapack_int* ldab, const double* afb, lapack_int* ldafb, const lapack_int* ipiv, const double* b,
+                   lapack_int* ldb, double* x, lapack_int* ldx, double* ferr, double* berr, double* work,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_cgbrfs(char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
+                   const lapack_complex_float* ab, lapack_int* ldab, const lapack_complex_float* afb, lapack_int* ldafb,
+                   const lapack_int* ipiv, const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x,
+                   lapack_int* ldx, float* ferr, float* berr, lapack_complex_float* work, float* rwork,
+                   lapack_int* info);
+void LAPACK_zgbrfs(char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
+                   const lapack_complex_double* ab, lapack_int* ldab, const lapack_complex_double* afb,
+                   lapack_int* ldafb, const lapack_int* ipiv, const lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* x, lapack_int* ldx, double* ferr, double* berr, lapack_complex_double* work,
+                   double* rwork, lapack_int* info);
+void LAPACK_dgbrfsx(char* trans, char* equed, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
+                    const double* ab, lapack_int* ldab, const double* afb, lapack_int* ldafb, const lapack_int* ipiv,
+                    const double* r, const double* c, const double* b, lapack_int* ldb, double* x, lapack_int* ldx,
+                    double* rcond, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
+                    lapack_int* nparams, double* params, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_sgbrfsx(char* trans, char* equed, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
+                    const float* ab, lapack_int* ldab, const float* afb, lapack_int* ldafb, const lapack_int* ipiv,
+                    const float* r, const float* c, const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                    float* rcond, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
+                    lapack_int* nparams, float* params, float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_zgbrfsx(char* trans, char* equed, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
+                    const lapack_complex_double* ab, lapack_int* ldab, const lapack_complex_double* afb,
+                    lapack_int* ldafb, const lapack_int* ipiv, const double* r, const double* c,
+                    const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
+                    lapack_int* nparams, double* params, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_cgbrfsx(char* trans, char* equed, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
+                    const lapack_complex_float* ab, lapack_int* ldab, const lapack_complex_float* afb,
+                    lapack_int* ldafb, const lapack_int* ipiv, const float* r, const float* c,
+                    const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
+                    lapack_int* nparams, float* params, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_sgtrfs(char* trans, lapack_int* n, lapack_int* nrhs, const float* dl, const float* d, const float* du,
+                   const float* dlf, const float* df, const float* duf, const float* du2, const lapack_int* ipiv,
+                   const float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* ferr, float* berr, float* work,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_dgtrfs(char* trans, lapack_int* n, lapack_int* nrhs, const double* dl, const double* d, const double* du,
+                   const double* dlf, const double* df, const double* duf, const double* du2, const lapack_int* ipiv,
+                   const double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* ferr, double* berr,
+                   double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_cgtrfs(char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* dl,
+                   const lapack_complex_float* d, const lapack_complex_float* du, const lapack_complex_float* dlf,
+                   const lapack_complex_float* df, const lapack_complex_float* duf, const lapack_complex_float* du2,
+                   const lapack_int* ipiv, const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x,
+                   lapack_int* ldx, float* ferr, float* berr, lapack_complex_float* work, float* rwork,
+                   lapack_int* info);
+void LAPACK_zgtrfs(char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* dl,
+                   const lapack_complex_double* d, const lapack_complex_double* du, const lapack_complex_double* dlf,
+                   const lapack_complex_double* df, const lapack_complex_double* duf, const lapack_complex_double* du2,
+                   const lapack_int* ipiv, const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x,
+                   lapack_int* ldx, double* ferr, double* berr, lapack_complex_double* work, double* rwork,
+                   lapack_int* info);
+void LAPACK_sporfs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda, const float* af,
+                   lapack_int* ldaf, const float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* ferr,
+                   float* berr, float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_dporfs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda, const double* af,
+                   lapack_int* ldaf, const double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
+                   double* berr, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_cporfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
+                   const lapack_complex_float* af, lapack_int* ldaf, const lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* x, lapack_int* ldx, float* ferr, float* berr, lapack_complex_float* work,
+                   float* rwork, lapack_int* info);
+void LAPACK_zporfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
+                   const lapack_complex_double* af, lapack_int* ldaf, const lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* x, lapack_int* ldx, double* ferr, double* berr, lapack_complex_double* work,
+                   double* rwork, lapack_int* info);
+void LAPACK_dporfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda,
+                    const double* af, lapack_int* ldaf, const double* s, const double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* rcond, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm,
+                    double* err_bnds_comp, lapack_int* nparams, double* params, double* work, lapack_int* iwork,
+                    lapack_int* info);
+void LAPACK_sporfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda,
+                    const float* af, lapack_int* ldaf, const float* s, const float* b, lapack_int* ldb, float* x,
+                    lapack_int* ldx, float* rcond, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
+                    float* err_bnds_comp, lapack_int* nparams, float* params, float* work, lapack_int* iwork,
+                    lapack_int* info);
+void LAPACK_zporfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* af, lapack_int* ldaf, const double* s,
+                    const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                    double* rcond, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
+                    lapack_int* nparams, double* params, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_cporfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* af, lapack_int* ldaf, const float* s,
+                    const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                    float* rcond, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
+                    lapack_int* nparams, float* params, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_spprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* ap, const float* afp, const float* b,
+                   lapack_int* ldb, float* x, lapack_int* ldx, float* ferr, float* berr, float* work, lapack_int* iwork,
+                   lapack_int* info);
+void LAPACK_dpprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* ap, const double* afp, const double* b,
+                   lapack_int* ldb, double* x, lapack_int* ldx, double* ferr, double* berr, double* work,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_cpprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap,
+                   const lapack_complex_float* afp, const lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* x, lapack_int* ldx, float* ferr, float* berr, lapack_complex_float* work,
+                   float* rwork, lapack_int* info);
+void LAPACK_zpprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* ap,
+                   const lapack_complex_double* afp, const lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* x, lapack_int* ldx, double* ferr, double* berr, lapack_complex_double* work,
+                   double* rwork, lapack_int* info);
+void LAPACK_spbrfs(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, const float* ab, lapack_int* ldab,
+                   const float* afb, lapack_int* ldafb, const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                   float* ferr, float* berr, float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_dpbrfs(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, const double* ab, lapack_int* ldab,
+                   const double* afb, lapack_int* ldafb, const double* b, lapack_int* ldb, double* x, lapack_int* ldx,
+                   double* ferr, double* berr, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_cpbrfs(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, const lapack_complex_float* ab,
+                   lapack_int* ldab, const lapack_complex_float* afb, lapack_int* ldafb, const lapack_complex_float* b,
+                   lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* ferr, float* berr,
+                   lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zpbrfs(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, const lapack_complex_double* ab,
+                   lapack_int* ldab, const lapack_complex_double* afb, lapack_int* ldafb,
+                   const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                   double* ferr, double* berr, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_sptrfs(lapack_int* n, lapack_int* nrhs, const float* d, const float* e, const float* df, const float* ef,
+                   const float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* ferr, float* berr, float* work,
+                   lapack_int* info);
+void LAPACK_dptrfs(lapack_int* n, lapack_int* nrhs, const double* d, const double* e, const double* df,
+                   const double* ef, const double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
+                   double* berr, double* work, lapack_int* info);
+void LAPACK_cptrfs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* d, const lapack_complex_float* e,
+                   const float* df, const lapack_complex_float* ef, const lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* x, lapack_int* ldx, float* ferr, float* berr, lapack_complex_float* work,
+                   float* rwork, lapack_int* info);
+void LAPACK_zptrfs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* d, const lapack_complex_double* e,
+                   const double* df, const lapack_complex_double* ef, const lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* x, lapack_int* ldx, double* ferr, double* berr, lapack_complex_double* work,
+                   double* rwork, lapack_int* info);
+void LAPACK_ssyrfs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda, const float* af,
+                   lapack_int* ldaf, const lapack_int* ipiv, const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                   float* ferr, float* berr, float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_dsyrfs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda, const double* af,
+                   lapack_int* ldaf, const lapack_int* ipiv, const double* b, lapack_int* ldb, double* x,
+                   lapack_int* ldx, double* ferr, double* berr, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_csyrfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
+                   const lapack_complex_float* af, lapack_int* ldaf, const lapack_int* ipiv,
+                   const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                   float* ferr, float* berr, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zsyrfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
+                   const lapack_complex_double* af, lapack_int* ldaf, const lapack_int* ipiv,
+                   const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                   double* ferr, double* berr, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_dsyrfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda,
+                    const double* af, lapack_int* ldaf, const lapack_int* ipiv, const double* s, const double* b,
+                    lapack_int* ldb, double* x, lapack_int* ldx, double* rcond, double* berr, lapack_int* n_err_bnds,
+                    double* err_bnds_norm, double* err_bnds_comp, lapack_int* nparams, double* params, double* work,
+                    lapack_int* iwork, lapack_int* info);
+void LAPACK_ssyrfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda,
+                    const float* af, lapack_int* ldaf, const lapack_int* ipiv, const float* s, const float* b,
+                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond, float* berr, lapack_int* n_err_bnds,
+                    float* err_bnds_norm, float* err_bnds_comp, lapack_int* nparams, float* params, float* work,
+                    lapack_int* iwork, lapack_int* info);
+void LAPACK_zsyrfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* af, lapack_int* ldaf, const lapack_int* ipiv,
+                    const double* s, const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x,
+                    lapack_int* ldx, double* rcond, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm,
+                    double* err_bnds_comp, lapack_int* nparams, double* params, lapack_complex_double* work,
+                    double* rwork, lapack_int* info);
+void LAPACK_csyrfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* af, lapack_int* ldaf, const lapack_int* ipiv,
+                    const float* s, const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x,
+                    lapack_int* ldx, float* rcond, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
+                    float* err_bnds_comp, lapack_int* nparams, float* params, lapack_complex_float* work, float* rwork,
+                    lapack_int* info);
+void LAPACK_cherfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
+                   const lapack_complex_float* af, lapack_int* ldaf, const lapack_int* ipiv,
+                   const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                   float* ferr, float* berr, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zherfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
+                   const lapack_complex_double* af, lapack_int* ldaf, const lapack_int* ipiv,
+                   const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                   double* ferr, double* berr, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_zherfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a,
+                    lapack_int* lda, const lapack_complex_double* af, lapack_int* ldaf, const lapack_int* ipiv,
+                    const double* s, const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x,
+                    lapack_int* ldx, double* rcond, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm,
+                    double* err_bnds_comp, lapack_int* nparams, double* params, lapack_complex_double* work,
+                    double* rwork, lapack_int* info);
+void LAPACK_cherfsx(char* uplo, char* equed, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a,
+                    lapack_int* lda, const lapack_complex_float* af, lapack_int* ldaf, const lapack_int* ipiv,
+                    const float* s, const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x,
+                    lapack_int* ldx, float* rcond, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
+                    float* err_bnds_comp, lapack_int* nparams, float* params, lapack_complex_float* work, float* rwork,
+                    lapack_int* info);
+void LAPACK_ssprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const float* ap, const float* afp,
+                   const lapack_int* ipiv, const float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* ferr,
+                   float* berr, float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_dsprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const double* ap, const double* afp,
+                   const lapack_int* ipiv, const double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
+                   double* berr, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_csprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap,
+                   const lapack_complex_float* afp, const lapack_int* ipiv, const lapack_complex_float* b,
+                   lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* ferr, float* berr,
+                   lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zsprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* ap,
+                   const lapack_complex_double* afp, const lapack_int* ipiv, const lapack_complex_double* b,
+                   lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx, double* ferr, double* berr,
+                   lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_chprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap,
+                   const lapack_complex_float* afp, const lapack_int* ipiv, const lapack_complex_float* b,
+                   lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* ferr, float* berr,
+                   lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zhprfs(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* ap,
+                   const lapack_complex_double* afp, const lapack_int* ipiv, const lapack_complex_double* b,
+                   lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx, double* ferr, double* berr,
+                   lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_strrfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const float* a,
+                   lapack_int* lda, const float* b, lapack_int* ldb, const float* x, lapack_int* ldx, float* ferr,
+                   float* berr, float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_dtrrfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const double* a,
+                   lapack_int* lda, const double* b, lapack_int* ldb, const double* x, lapack_int* ldx, double* ferr,
+                   double* berr, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_ctrrfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a,
+                   lapack_int* lda, const lapack_complex_float* b, lapack_int* ldb, const lapack_complex_float* x,
+                   lapack_int* ldx, float* ferr, float* berr, lapack_complex_float* work, float* rwork,
+                   lapack_int* info);
+void LAPACK_ztrrfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a,
+                   lapack_int* lda, const lapack_complex_double* b, lapack_int* ldb, const lapack_complex_double* x,
+                   lapack_int* ldx, double* ferr, double* berr, lapack_complex_double* work, double* rwork,
+                   lapack_int* info);
+void LAPACK_stprfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const float* ap,
+                   const float* b, lapack_int* ldb, const float* x, lapack_int* ldx, float* ferr, float* berr,
+                   float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_dtprfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const double* ap,
+                   const double* b, lapack_int* ldb, const double* x, lapack_int* ldx, double* ferr, double* berr,
+                   double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_ctprfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap,
+                   const lapack_complex_float* b, lapack_int* ldb, const lapack_complex_float* x, lapack_int* ldx,
+                   float* ferr, float* berr, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_ztprfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* nrhs,
+                   const lapack_complex_double* ap, const lapack_complex_double* b, lapack_int* ldb,
+                   const lapack_complex_double* x, lapack_int* ldx, double* ferr, double* berr,
+                   lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_stbrfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   const float* ab, lapack_int* ldab, const float* b, lapack_int* ldb, const float* x, lapack_int* ldx,
+                   float* ferr, float* berr, float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_dtbrfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   const double* ab, lapack_int* ldab, const double* b, lapack_int* ldb, const double* x,
+                   lapack_int* ldx, double* ferr, double* berr, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_ctbrfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   const lapack_complex_float* ab, lapack_int* ldab, const lapack_complex_float* b, lapack_int* ldb,
+                   const lapack_complex_float* x, lapack_int* ldx, float* ferr, float* berr, lapack_complex_float* work,
+                   float* rwork, lapack_int* info);
+void LAPACK_ztbrfs(char* uplo, char* trans, char* diag, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
+                   const lapack_complex_double* ab, lapack_int* ldab, const lapack_complex_double* b, lapack_int* ldb,
+                   const lapack_complex_double* x, lapack_int* ldx, double* ferr, double* berr,
+                   lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_sgetri(lapack_int* n, float* a, lapack_int* lda, const lapack_int* ipiv, float* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_dgetri(lapack_int* n, double* a, lapack_int* lda, const lapack_int* ipiv, double* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_cgetri(lapack_int* n, lapack_complex_float* a, lapack_int* lda, const lapack_int* ipiv,
+                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zgetri(lapack_int* n, lapack_complex_double* a, lapack_int* lda, const lapack_int* ipiv,
+                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_spotri(char* uplo, lapack_int* n, float* a, lapack_int* lda, lapack_int* info);
+void LAPACK_dpotri(char* uplo, lapack_int* n, double* a, lapack_int* lda, lapack_int* info);
+void LAPACK_cpotri(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* info);
+void LAPACK_zpotri(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* info);
+void LAPACK_dpftri(char* transr, char* uplo, lapack_int* n, double* a, lapack_int* info);
+void LAPACK_spftri(char* transr, char* uplo, lapack_int* n, float* a, lapack_int* info);
+void LAPACK_zpftri(char* transr, char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* info);
+void LAPACK_cpftri(char* transr, char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* info);
+void LAPACK_spptri(char* uplo, lapack_int* n, float* ap, lapack_int* info);
+void LAPACK_dpptri(char* uplo, lapack_int* n, double* ap, lapack_int* info);
+void LAPACK_cpptri(char* uplo, lapack_int* n, lapack_complex_float* ap, lapack_int* info);
+void LAPACK_zpptri(char* uplo, lapack_int* n, lapack_complex_double* ap, lapack_int* info);
+void LAPACK_ssytri(char* uplo, lapack_int* n, float* a, lapack_int* lda, const lapack_int* ipiv, float* work,
+                   lapack_int* info);
+void LAPACK_dsytri(char* uplo, lapack_int* n, double* a, lapack_int* lda, const lapack_int* ipiv, double* work,
+                   lapack_int* info);
+void LAPACK_csytri(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, const lapack_int* ipiv,
+                   lapack_complex_float* work, lapack_int* info);
+void LAPACK_zsytri(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, const lapack_int* ipiv,
+                   lapack_complex_double* work, lapack_int* info);
+void LAPACK_chetri(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, const lapack_int* ipiv,
+                   lapack_complex_float* work, lapack_int* info);
+void LAPACK_zhetri(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, const lapack_int* ipiv,
+                   lapack_complex_double* work, lapack_int* info);
+void LAPACK_ssptri(char* uplo, lapack_int* n, float* ap, const lapack_int* ipiv, float* work, lapack_int* info);
+void LAPACK_dsptri(char* uplo, lapack_int* n, double* ap, const lapack_int* ipiv, double* work, lapack_int* info);
+void LAPACK_csptri(char* uplo, lapack_int* n, lapack_complex_float* ap, const lapack_int* ipiv,
+                   lapack_complex_float* work, lapack_int* info);
+void LAPACK_zsptri(char* uplo, lapack_int* n, lapack_complex_double* ap, const lapack_int* ipiv,
+                   lapack_complex_double* work, lapack_int* info);
+void LAPACK_chptri(char* uplo, lapack_int* n, lapack_complex_float* ap, const lapack_int* ipiv,
+                   lapack_complex_float* work, lapack_int* info);
+void LAPACK_zhptri(char* uplo, lapack_int* n, lapack_complex_double* ap, const lapack_int* ipiv,
+                   lapack_complex_double* work, lapack_int* info);
+void LAPACK_strtri(char* uplo, char* diag, lapack_int* n, float* a, lapack_int* lda, lapack_int* info);
+void LAPACK_dtrtri(char* uplo, char* diag, lapack_int* n, double* a, lapack_int* lda, lapack_int* info);
+void LAPACK_ctrtri(char* uplo, char* diag, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* info);
+void LAPACK_ztrtri(char* uplo, char* diag, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* info);
+void LAPACK_dtftri(char* transr, char* uplo, char* diag, lapack_int* n, double* a, lapack_int* info);
+void LAPACK_stftri(char* transr, char* uplo, char* diag, lapack_int* n, float* a, lapack_int* info);
+void LAPACK_ztftri(char* transr, char* uplo, char* diag, lapack_int* n, lapack_complex_double* a, lapack_int* info);
+void LAPACK_ctftri(char* transr, char* uplo, char* diag, lapack_int* n, lapack_complex_float* a, lapack_int* info);
+void LAPACK_stptri(char* uplo, char* diag, lapack_int* n, float* ap, lapack_int* info);
+void LAPACK_dtptri(char* uplo, char* diag, lapack_int* n, double* ap, lapack_int* info);
+void LAPACK_ctptri(char* uplo, char* diag, lapack_int* n, lapack_complex_float* ap, lapack_int* info);
+void LAPACK_ztptri(char* uplo, char* diag, lapack_int* n, lapack_complex_double* ap, lapack_int* info);
+void LAPACK_sgeequ(lapack_int* m, lapack_int* n, const float* a, lapack_int* lda, float* r, float* c, float* rowcnd,
+                   float* colcnd, float* amax, lapack_int* info);
+void LAPACK_dgeequ(lapack_int* m, lapack_int* n, const double* a, lapack_int* lda, double* r, double* c, double* rowcnd,
+                   double* colcnd, double* amax, lapack_int* info);
+void LAPACK_cgeequ(lapack_int* m, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* r, float* c,
+                   float* rowcnd, float* colcnd, float* amax, lapack_int* info);
+void LAPACK_zgeequ(lapack_int* m, lapack_int* n, const lapack_complex_double* a, lapack_int* lda, double* r, double* c,
+                   double* rowcnd, double* colcnd, double* amax, lapack_int* info);
+void LAPACK_dgeequb(lapack_int* m, lapack_int* n, const double* a, lapack_int* lda, double* r, double* c,
+                    double* rowcnd, double* colcnd, double* amax, lapack_int* info);
+void LAPACK_sgeequb(lapack_int* m, lapack_int* n, const float* a, lapack_int* lda, float* r, float* c, float* rowcnd,
+                    float* colcnd, float* amax, lapack_int* info);
+void LAPACK_zgeequb(lapack_int* m, lapack_int* n, const lapack_complex_double* a, lapack_int* lda, double* r, double* c,
+                    double* rowcnd, double* colcnd, double* amax, lapack_int* info);
+void LAPACK_cgeequb(lapack_int* m, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* r, float* c,
+                    float* rowcnd, float* colcnd, float* amax, lapack_int* info);
+void LAPACK_sgbequ(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const float* ab, lapack_int* ldab,
+                   float* r, float* c, float* rowcnd, float* colcnd, float* amax, lapack_int* info);
+void LAPACK_dgbequ(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const double* ab, lapack_int* ldab,
+                   double* r, double* c, double* rowcnd, double* colcnd, double* amax, lapack_int* info);
+void LAPACK_cgbequ(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const lapack_complex_float* ab,
+                   lapack_int* ldab, float* r, float* c, float* rowcnd, float* colcnd, float* amax, lapack_int* info);
+void LAPACK_zgbequ(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const lapack_complex_double* ab,
+                   lapack_int* ldab, double* r, double* c, double* rowcnd, double* colcnd, double* amax,
+                   lapack_int* info);
+void LAPACK_dgbequb(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const double* ab, lapack_int* ldab,
+                    double* r, double* c, double* rowcnd, double* colcnd, double* amax, lapack_int* info);
+void LAPACK_sgbequb(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const float* ab, lapack_int* ldab,
+                    float* r, float* c, float* rowcnd, float* colcnd, float* amax, lapack_int* info);
+void LAPACK_zgbequb(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const lapack_complex_double* ab,
+                    lapack_int* ldab, double* r, double* c, double* rowcnd, double* colcnd, double* amax,
+                    lapack_int* info);
+void LAPACK_cgbequb(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const lapack_complex_float* ab,
+                    lapack_int* ldab, float* r, float* c, float* rowcnd, float* colcnd, float* amax, lapack_int* info);
+void LAPACK_spoequ(lapack_int* n, const float* a, lapack_int* lda, float* s, float* scond, float* amax,
+                   lapack_int* info);
+void LAPACK_dpoequ(lapack_int* n, const double* a, lapack_int* lda, double* s, double* scond, double* amax,
+                   lapack_int* info);
+void LAPACK_cpoequ(lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* s, float* scond, float* amax,
+                   lapack_int* info);
+void LAPACK_zpoequ(lapack_int* n, const lapack_complex_double* a, lapack_int* lda, double* s, double* scond,
+                   double* amax, lapack_int* info);
+void LAPACK_dpoequb(lapack_int* n, const double* a, lapack_int* lda, double* s, double* scond, double* amax,
+                    lapack_int* info);
+void LAPACK_spoequb(lapack_int* n, const float* a, lapack_int* lda, float* s, float* scond, float* amax,
+                    lapack_int* info);
+void LAPACK_zpoequb(lapack_int* n, const lapack_complex_double* a, lapack_int* lda, double* s, double* scond,
+                    double* amax, lapack_int* info);
+void LAPACK_cpoequb(lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* s, float* scond, float* amax,
+                    lapack_int* info);
+void LAPACK_sppequ(char* uplo, lapack_int* n, const float* ap, float* s, float* scond, float* amax, lapack_int* info);
+void LAPACK_dppequ(char* uplo, lapack_int* n, const double* ap, double* s, double* scond, double* amax,
+                   lapack_int* info);
+void LAPACK_cppequ(char* uplo, lapack_int* n, const lapack_complex_float* ap, float* s, float* scond, float* amax,
+                   lapack_int* info);
+void LAPACK_zppequ(char* uplo, lapack_int* n, const lapack_complex_double* ap, double* s, double* scond, double* amax,
+                   lapack_int* info);
+void LAPACK_spbequ(char* uplo, lapack_int* n, lapack_int* kd, const float* ab, lapack_int* ldab, float* s, float* scond,
+                   float* amax, lapack_int* info);
+void LAPACK_dpbequ(char* uplo, lapack_int* n, lapack_int* kd, const double* ab, lapack_int* ldab, double* s,
+                   double* scond, double* amax, lapack_int* info);
+void LAPACK_cpbequ(char* uplo, lapack_int* n, lapack_int* kd, const lapack_complex_float* ab, lapack_int* ldab,
+                   float* s, float* scond, float* amax, lapack_int* info);
+void LAPACK_zpbequ(char* uplo, lapack_int* n, lapack_int* kd, const lapack_complex_double* ab, lapack_int* ldab,
+                   double* s, double* scond, double* amax, lapack_int* info);
+void LAPACK_dsyequb(char* uplo, lapack_int* n, const double* a, lapack_int* lda, double* s, double* scond, double* amax,
+                    double* work, lapack_int* info);
+void LAPACK_ssyequb(char* uplo, lapack_int* n, const float* a, lapack_int* lda, float* s, float* scond, float* amax,
+                    float* work, lapack_int* info);
+void LAPACK_zsyequb(char* uplo, lapack_int* n, const lapack_complex_double* a, lapack_int* lda, double* s,
+                    double* scond, double* amax, lapack_complex_double* work, lapack_int* info);
+void LAPACK_csyequb(char* uplo, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* s, float* scond,
+                    float* amax, lapack_complex_float* work, lapack_int* info);
+void LAPACK_zheequb(char* uplo, lapack_int* n, const lapack_complex_double* a, lapack_int* lda, double* s,
+                    double* scond, double* amax, lapack_complex_double* work, lapack_int* info);
+void LAPACK_cheequb(char* uplo, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* s, float* scond,
+                    float* amax, lapack_complex_float* work, lapack_int* info);
+void LAPACK_sgesv(lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, lapack_int* ipiv, float* b,
+                  lapack_int* ldb, lapack_int* info);
+void LAPACK_dgesv(lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, lapack_int* ipiv, double* b,
+                  lapack_int* ldb, lapack_int* info);
+void LAPACK_cgesv(lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
+                  lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_zgesv(lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
+                  lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_dsgesv(lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, lapack_int* ipiv, double* b,
+                   lapack_int* ldb, double* x, lapack_int* ldx, double* work, float* swork, lapack_int* iter,
+                   lapack_int* info);
+void LAPACK_zcgesv(lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                   lapack_complex_double* work, lapack_complex_float* swork, double* rwork, lapack_int* iter,
+                   lapack_int* info);
+void LAPACK_sgesvx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* af,
+                   lapack_int* ldaf, lapack_int* ipiv, char* equed, float* r, float* c, float* b, lapack_int* ldb,
+                   float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr, float* work, lapack_int* iwork,
+                   lapack_int* info);
+void LAPACK_dgesvx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* af,
+                   lapack_int* ldaf, lapack_int* ipiv, char* equed, double* r, double* c, double* b, lapack_int* ldb,
+                   double* x, lapack_int* ldx, double* rcond, double* ferr, double* berr, double* work,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_cgesvx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* af, lapack_int* ldaf, lapack_int* ipiv, char* equed, float* r, float* c,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                   float* ferr, float* berr, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zgesvx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* af, lapack_int* ldaf, lapack_int* ipiv, char* equed, double* r, double* c,
+                   lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                   double* ferr, double* berr, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_dgesvxx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* af,
+                    lapack_int* ldaf, lapack_int* ipiv, char* equed, double* r, double* c, double* b, lapack_int* ldb,
+                    double* x, lapack_int* ldx, double* rcond, double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                    double* err_bnds_norm, double* err_bnds_comp, lapack_int* nparams, double* params, double* work,
+                    lapack_int* iwork, lapack_int* info);
+void LAPACK_sgesvxx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* af,
+                    lapack_int* ldaf, lapack_int* ipiv, char* equed, float* r, float* c, float* b, lapack_int* ldb,
+                    float* x, lapack_int* ldx, float* rcond, float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                    float* err_bnds_norm, float* err_bnds_comp, lapack_int* nparams, float* params, float* work,
+                    lapack_int* iwork, lapack_int* info);
+void LAPACK_zgesvxx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* af, lapack_int* ldaf, lapack_int* ipiv, char* equed, double* r, double* c,
+                    lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                    double* rpvgrw, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
+                    lapack_int* nparams, double* params, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_cgesvxx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* af, lapack_int* ldaf, lapack_int* ipiv, char* equed, float* r, float* c,
+                    lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                    float* rpvgrw, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
+                    lapack_int* nparams, float* params, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_sgbsv(lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, float* ab, lapack_int* ldab,
+                  lapack_int* ipiv, float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_dgbsv(lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, double* ab, lapack_int* ldab,
+                  lapack_int* ipiv, double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_cgbsv(lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, lapack_complex_float* ab,
+                  lapack_int* ldab, lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_zgbsv(lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, lapack_complex_double* ab,
+                  lapack_int* ldab, lapack_int* ipiv, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_sgbsvx(char* fact, char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, float* ab,
+                   lapack_int* ldab, float* afb, lapack_int* ldafb, lapack_int* ipiv, char* equed, float* r, float* c,
+                   float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr,
+                   float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_dgbsvx(char* fact, char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, double* ab,
+                   lapack_int* ldab, double* afb, lapack_int* ldafb, lapack_int* ipiv, char* equed, double* r,
+                   double* c, double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* rcond, double* ferr,
+                   double* berr, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_cgbsvx(char* fact, char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
+                   lapack_complex_float* ab, lapack_int* ldab, lapack_complex_float* afb, lapack_int* ldafb,
+                   lapack_int* ipiv, char* equed, float* r, float* c, lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr,
+                   lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zgbsvx(char* fact, char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
+                   lapack_complex_double* ab, lapack_int* ldab, lapack_complex_double* afb, lapack_int* ldafb,
+                   lapack_int* ipiv, char* equed, double* r, double* c, lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* x, lapack_int* ldx, double* rcond, double* ferr, double* berr,
+                   lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_dgbsvxx(char* fact, char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
+                    double* ab, lapack_int* ldab, double* afb, lapack_int* ldafb, lapack_int* ipiv, char* equed,
+                    double* r, double* c, double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
+                    double* rpvgrw, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
+                    lapack_int* nparams, double* params, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_sgbsvxx(char* fact, char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs, float* ab,
+                    lapack_int* ldab, float* afb, lapack_int* ldafb, lapack_int* ipiv, char* equed, float* r, float* c,
+                    float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* rcond, float* rpvgrw, float* berr,
+                    lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int* nparams,
+                    float* params, float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_zgbsvxx(char* fact, char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
+                    lapack_complex_double* ab, lapack_int* ldab, lapack_complex_double* afb, lapack_int* ldafb,
+                    lapack_int* ipiv, char* equed, double* r, double* c, lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* x, lapack_int* ldx, double* rcond, double* rpvgrw, double* berr,
+                    lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp, lapack_int* nparams,
+                    double* params, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_cgbsvxx(char* fact, char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku, lapack_int* nrhs,
+                    lapack_complex_float* ab, lapack_int* ldab, lapack_complex_float* afb, lapack_int* ldafb,
+                    lapack_int* ipiv, char* equed, float* r, float* c, lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* x, lapack_int* ldx, float* rcond, float* rpvgrw, float* berr,
+                    lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int* nparams,
+                    float* params, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_sgtsv(lapack_int* n, lapack_int* nrhs, float* dl, float* d, float* du, float* b, lapack_int* ldb,
+                  lapack_int* info);
+void LAPACK_dgtsv(lapack_int* n, lapack_int* nrhs, double* dl, double* d, double* du, double* b, lapack_int* ldb,
+                  lapack_int* info);
+void LAPACK_cgtsv(lapack_int* n, lapack_int* nrhs, lapack_complex_float* dl, lapack_complex_float* d,
+                  lapack_complex_float* du, lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_zgtsv(lapack_int* n, lapack_int* nrhs, lapack_complex_double* dl, lapack_complex_double* d,
+                  lapack_complex_double* du, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_sgtsvx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, const float* dl, const float* d,
+                   const float* du, float* dlf, float* df, float* duf, float* du2, lapack_int* ipiv, const float* b,
+                   lapack_int* ldb, float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr, float* work,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_dgtsvx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, const double* dl, const double* d,
+                   const double* du, double* dlf, double* df, double* duf, double* du2, lapack_int* ipiv,
+                   const double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* rcond, double* ferr,
+                   double* berr, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_cgtsvx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* dl,
+                   const lapack_complex_float* d, const lapack_complex_float* du, lapack_complex_float* dlf,
+                   lapack_complex_float* df, lapack_complex_float* duf, lapack_complex_float* du2, lapack_int* ipiv,
+                   const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                   float* rcond, float* ferr, float* berr, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zgtsvx(char* fact, char* trans, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* dl,
+                   const lapack_complex_double* d, const lapack_complex_double* du, lapack_complex_double* dlf,
+                   lapack_complex_double* df, lapack_complex_double* duf, lapack_complex_double* du2, lapack_int* ipiv,
+                   const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                   double* rcond, double* ferr, double* berr, lapack_complex_double* work, double* rwork,
+                   lapack_int* info);
+void LAPACK_sposv(char* uplo, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                  lapack_int* info);
+void LAPACK_dposv(char* uplo, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                  lapack_int* info);
+void LAPACK_cposv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
+                  lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_zposv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
+                  lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_dsposv(char* uplo, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                   double* x, lapack_int* ldx, double* work, float* swork, lapack_int* iter, lapack_int* info);
+void LAPACK_zcposv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                   lapack_complex_double* work, lapack_complex_float* swork, double* rwork, lapack_int* iter,
+                   lapack_int* info);
+void LAPACK_sposvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* af,
+                   lapack_int* ldaf, char* equed, float* s, float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                   float* rcond, float* ferr, float* berr, float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_dposvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* af,
+                   lapack_int* ldaf, char* equed, double* s, double* b, lapack_int* ldb, double* x, lapack_int* ldx,
+                   double* rcond, double* ferr, double* berr, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_cposvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* af, lapack_int* ldaf, char* equed, float* s, lapack_complex_float* b,
+                   lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr,
+                   lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zposvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* af, lapack_int* ldaf, char* equed, double* s, lapack_complex_double* b,
+                   lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx, double* rcond, double* ferr,
+                   double* berr, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_dposvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* af,
+                    lapack_int* ldaf, char* equed, double* s, double* b, lapack_int* ldb, double* x, lapack_int* ldx,
+                    double* rcond, double* rpvgrw, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm,
+                    double* err_bnds_comp, lapack_int* nparams, double* params, double* work, lapack_int* iwork,
+                    lapack_int* info);
+void LAPACK_sposvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* af,
+                    lapack_int* ldaf, char* equed, float* s, float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                    float* rcond, float* rpvgrw, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
+                    float* err_bnds_comp, lapack_int* nparams, float* params, float* work, lapack_int* iwork,
+                    lapack_int* info);
+void LAPACK_zposvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* af, lapack_int* ldaf, char* equed, double* s, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx, double* rcond, double* rpvgrw,
+                    double* berr, lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
+                    lapack_int* nparams, double* params, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_cposvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* af, lapack_int* ldaf, char* equed, float* s, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* rcond, float* rpvgrw, float* berr,
+                    lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp, lapack_int* nparams,
+                    float* params, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_sppsv(char* uplo, lapack_int* n, lapack_int* nrhs, float* ap, float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_dppsv(char* uplo, lapack_int* n, lapack_int* nrhs, double* ap, double* b, lapack_int* ldb,
+                  lapack_int* info);
+void LAPACK_cppsv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* ap, lapack_complex_float* b,
+                  lapack_int* ldb, lapack_int* info);
+void LAPACK_zppsv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* ap, lapack_complex_double* b,
+                  lapack_int* ldb, lapack_int* info);
+void LAPACK_sppsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, float* ap, float* afp, char* equed,
+                   float* s, float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* rcond, float* ferr,
+                   float* berr, float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_dppsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, double* ap, double* afp, char* equed,
+                   double* s, double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* rcond, double* ferr,
+                   double* berr, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_cppsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* ap,
+                   lapack_complex_float* afp, char* equed, float* s, lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr,
+                   lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zppsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* ap,
+                   lapack_complex_double* afp, char* equed, double* s, lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* x, lapack_int* ldx, double* rcond, double* ferr, double* berr,
+                   lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_spbsv(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, float* ab, lapack_int* ldab, float* b,
+                  lapack_int* ldb, lapack_int* info);
+void LAPACK_dpbsv(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, double* ab, lapack_int* ldab, double* b,
+                  lapack_int* ldb, lapack_int* info);
+void LAPACK_cpbsv(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, lapack_complex_float* ab,
+                  lapack_int* ldab, lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_zpbsv(char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, lapack_complex_double* ab,
+                  lapack_int* ldab, lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_spbsvx(char* fact, char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, float* ab, lapack_int* ldab,
+                   float* afb, lapack_int* ldafb, char* equed, float* s, float* b, lapack_int* ldb, float* x,
+                   lapack_int* ldx, float* rcond, float* ferr, float* berr, float* work, lapack_int* iwork,
+                   lapack_int* info);
+void LAPACK_dpbsvx(char* fact, char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, double* ab,
+                   lapack_int* ldab, double* afb, lapack_int* ldafb, char* equed, double* s, double* b, lapack_int* ldb,
+                   double* x, lapack_int* ldx, double* rcond, double* ferr, double* berr, double* work,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_cpbsvx(char* fact, char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, lapack_complex_float* ab,
+                   lapack_int* ldab, lapack_complex_float* afb, lapack_int* ldafb, char* equed, float* s,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                   float* ferr, float* berr, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zpbsvx(char* fact, char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs, lapack_complex_double* ab,
+                   lapack_int* ldab, lapack_complex_double* afb, lapack_int* ldafb, char* equed, double* s,
+                   lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                   double* ferr, double* berr, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_sptsv(lapack_int* n, lapack_int* nrhs, float* d, float* e, float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_dptsv(lapack_int* n, lapack_int* nrhs, double* d, double* e, double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_cptsv(lapack_int* n, lapack_int* nrhs, float* d, lapack_complex_float* e, lapack_complex_float* b,
+                  lapack_int* ldb, lapack_int* info);
+void LAPACK_zptsv(lapack_int* n, lapack_int* nrhs, double* d, lapack_complex_double* e, lapack_complex_double* b,
+                  lapack_int* ldb, lapack_int* info);
+void LAPACK_sptsvx(char* fact, lapack_int* n, lapack_int* nrhs, const float* d, const float* e, float* df, float* ef,
+                   const float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr,
+                   float* work, lapack_int* info);
+void LAPACK_dptsvx(char* fact, lapack_int* n, lapack_int* nrhs, const double* d, const double* e, double* df,
+                   double* ef, const double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
+                   double* ferr, double* berr, double* work, lapack_int* info);
+void LAPACK_cptsvx(char* fact, lapack_int* n, lapack_int* nrhs, const float* d, const lapack_complex_float* e,
+                   float* df, lapack_complex_float* ef, const lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr,
+                   lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zptsvx(char* fact, lapack_int* n, lapack_int* nrhs, const double* d, const lapack_complex_double* e,
+                   double* df, lapack_complex_double* ef, const lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* x, lapack_int* ldx, double* rcond, double* ferr, double* berr,
+                   lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_ssysv(char* uplo, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, lapack_int* ipiv, float* b,
+                  lapack_int* ldb, float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dsysv(char* uplo, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, lapack_int* ipiv, double* b,
+                  lapack_int* ldb, double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_csysv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
+                  lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* work,
+                  lapack_int* lwork, lapack_int* info);
+void LAPACK_zsysv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
+                  lapack_int* ipiv, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* work,
+                  lapack_int* lwork, lapack_int* info);
+void LAPACK_ssysvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda, float* af,
+                   lapack_int* ldaf, lapack_int* ipiv, const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
+                   float* rcond, float* ferr, float* berr, float* work, lapack_int* lwork, lapack_int* iwork,
+                   lapack_int* info);
+void LAPACK_dsysvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda,
+                   double* af, lapack_int* ldaf, lapack_int* ipiv, const double* b, lapack_int* ldb, double* x,
+                   lapack_int* ldx, double* rcond, double* ferr, double* berr, double* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_csysvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a,
+                   lapack_int* lda, lapack_complex_float* af, lapack_int* ldaf, lapack_int* ipiv,
+                   const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                   float* rcond, float* ferr, float* berr, lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_int* info);
+void LAPACK_zsysvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a,
+                   lapack_int* lda, lapack_complex_double* af, lapack_int* ldaf, lapack_int* ipiv,
+                   const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                   double* rcond, double* ferr, double* berr, lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int* info);
+void LAPACK_dsysvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* af,
+                    lapack_int* ldaf, lapack_int* ipiv, char* equed, double* s, double* b, lapack_int* ldb, double* x,
+                    lapack_int* ldx, double* rcond, double* rpvgrw, double* berr, lapack_int* n_err_bnds,
+                    double* err_bnds_norm, double* err_bnds_comp, lapack_int* nparams, double* params, double* work,
+                    lapack_int* iwork, lapack_int* info);
+void LAPACK_ssysvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* af,
+                    lapack_int* ldaf, lapack_int* ipiv, char* equed, float* s, float* b, lapack_int* ldb, float* x,
+                    lapack_int* ldx, float* rcond, float* rpvgrw, float* berr, lapack_int* n_err_bnds,
+                    float* err_bnds_norm, float* err_bnds_comp, lapack_int* nparams, float* params, float* work,
+                    lapack_int* iwork, lapack_int* info);
+void LAPACK_zsysvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* af, lapack_int* ldaf, lapack_int* ipiv, char* equed, double* s,
+                    lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                    double* rpvgrw, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
+                    lapack_int* nparams, double* params, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_csysvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* af, lapack_int* ldaf, lapack_int* ipiv, char* equed, float* s,
+                    lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                    float* rpvgrw, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
+                    lapack_int* nparams, float* params, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_chesv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
+                  lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* work,
+                  lapack_int* lwork, lapack_int* info);
+void LAPACK_zhesv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
+                  lapack_int* ipiv, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* work,
+                  lapack_int* lwork, lapack_int* info);
+void LAPACK_chesvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a,
+                   lapack_int* lda, lapack_complex_float* af, lapack_int* ldaf, lapack_int* ipiv,
+                   const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
+                   float* rcond, float* ferr, float* berr, lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_int* info);
+void LAPACK_zhesvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a,
+                   lapack_int* lda, lapack_complex_double* af, lapack_int* ldaf, lapack_int* ipiv,
+                   const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
+                   double* rcond, double* ferr, double* berr, lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int* info);
+void LAPACK_zhesvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
+                    lapack_complex_double* af, lapack_int* ldaf, lapack_int* ipiv, char* equed, double* s,
+                    lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx, double* rcond,
+                    double* rpvgrw, double* berr, lapack_int* n_err_bnds, double* err_bnds_norm, double* err_bnds_comp,
+                    lapack_int* nparams, double* params, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_chesvxx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
+                    lapack_complex_float* af, lapack_int* ldaf, lapack_int* ipiv, char* equed, float* s,
+                    lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx, float* rcond,
+                    float* rpvgrw, float* berr, lapack_int* n_err_bnds, float* err_bnds_norm, float* err_bnds_comp,
+                    lapack_int* nparams, float* params, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_sspsv(char* uplo, lapack_int* n, lapack_int* nrhs, float* ap, lapack_int* ipiv, float* b, lapack_int* ldb,
+                  lapack_int* info);
+void LAPACK_dspsv(char* uplo, lapack_int* n, lapack_int* nrhs, double* ap, lapack_int* ipiv, double* b, lapack_int* ldb,
+                  lapack_int* info);
+void LAPACK_cspsv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* ap, lapack_int* ipiv,
+                  lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_zspsv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* ap, lapack_int* ipiv,
+                  lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_sspsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const float* ap, float* afp,
+                   lapack_int* ipiv, const float* b, lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
+                   float* ferr, float* berr, float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_dspsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const double* ap, double* afp,
+                   lapack_int* ipiv, const double* b, lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
+                   double* ferr, double* berr, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_cspsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap,
+                   lapack_complex_float* afp, lapack_int* ipiv, const lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr,
+                   lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zspsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* ap,
+                   lapack_complex_double* afp, lapack_int* ipiv, const lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* x, lapack_int* ldx, double* rcond, double* ferr, double* berr,
+                   lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_chpsv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_float* ap, lapack_int* ipiv,
+                  lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_zhpsv(char* uplo, lapack_int* n, lapack_int* nrhs, lapack_complex_double* ap, lapack_int* ipiv,
+                  lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_chpsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* ap,
+                   lapack_complex_float* afp, lapack_int* ipiv, const lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* x, lapack_int* ldx, float* rcond, float* ferr, float* berr,
+                   lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zhpsvx(char* fact, char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* ap,
+                   lapack_complex_double* afp, lapack_int* ipiv, const lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* x, lapack_int* ldx, double* rcond, double* ferr, double* berr,
+                   lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_sgeqrf(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* tau, float* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_dgeqrf(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* tau, double* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_cgeqrf(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* tau,
+                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zgeqrf(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* tau,
+                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_sgeqpf(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, lapack_int* jpvt, float* tau, float* work,
+                   lapack_int* info);
+void LAPACK_dgeqpf(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, lapack_int* jpvt, double* tau,
+                   double* work, lapack_int* info);
+void LAPACK_cgeqpf(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* jpvt,
+                   lapack_complex_float* tau, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zgeqpf(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* jpvt,
+                   lapack_complex_double* tau, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_sgeqp3(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, lapack_int* jpvt, float* tau, float* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_dgeqp3(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, lapack_int* jpvt, double* tau,
+                   double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cgeqp3(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* jpvt,
+                   lapack_complex_float* tau, lapack_complex_float* work, lapack_int* lwork, float* rwork,
+                   lapack_int* info);
+void LAPACK_zgeqp3(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* jpvt,
+                   lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, double* rwork,
+                   lapack_int* info);
+void LAPACK_sorgqr(lapack_int* m, lapack_int* n, lapack_int* k, float* a, lapack_int* lda, const float* tau,
+                   float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dorgqr(lapack_int* m, lapack_int* n, lapack_int* k, double* a, lapack_int* lda, const double* tau,
+                   double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_sormqr(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const float* a,
+                   lapack_int* lda, const float* tau, float* c, lapack_int* ldc, float* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_dormqr(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const double* a,
+                   lapack_int* lda, const double* tau, double* c, lapack_int* ldc, double* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_cungqr(lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_float* a, lapack_int* lda,
+                   const lapack_complex_float* tau, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zungqr(lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_double* a, lapack_int* lda,
+                   const lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cunmqr(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const lapack_complex_float* a,
+                   lapack_int* lda, const lapack_complex_float* tau, lapack_complex_float* c, lapack_int* ldc,
+                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zunmqr(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const lapack_complex_double* a,
+                   lapack_int* lda, const lapack_complex_double* tau, lapack_complex_double* c, lapack_int* ldc,
+                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_sgelqf(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* tau, float* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_dgelqf(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* tau, double* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_cgelqf(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* tau,
+                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zgelqf(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* tau,
+                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_sorglq(lapack_int* m, lapack_int* n, lapack_int* k, float* a, lapack_int* lda, const float* tau,
+                   float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dorglq(lapack_int* m, lapack_int* n, lapack_int* k, double* a, lapack_int* lda, const double* tau,
+                   double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_sormlq(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const float* a,
+                   lapack_int* lda, const float* tau, float* c, lapack_int* ldc, float* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_dormlq(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const double* a,
+                   lapack_int* lda, const double* tau, double* c, lapack_int* ldc, double* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_cunglq(lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_float* a, lapack_int* lda,
+                   const lapack_complex_float* tau, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zunglq(lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_double* a, lapack_int* lda,
+                   const lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cunmlq(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const lapack_complex_float* a,
+                   lapack_int* lda, const lapack_complex_float* tau, lapack_complex_float* c, lapack_int* ldc,
+                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zunmlq(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const lapack_complex_double* a,
+                   lapack_int* lda, const lapack_complex_double* tau, lapack_complex_double* c, lapack_int* ldc,
+                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_sgeqlf(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* tau, float* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_dgeqlf(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* tau, double* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_cgeqlf(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* tau,
+                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zgeqlf(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* tau,
+                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_sorgql(lapack_int* m, lapack_int* n, lapack_int* k, float* a, lapack_int* lda, const float* tau,
+                   float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dorgql(lapack_int* m, lapack_int* n, lapack_int* k, double* a, lapack_int* lda, const double* tau,
+                   double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cungql(lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_float* a, lapack_int* lda,
+                   const lapack_complex_float* tau, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zungql(lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_double* a, lapack_int* lda,
+                   const lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_sormql(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const float* a,
+                   lapack_int* lda, const float* tau, float* c, lapack_int* ldc, float* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_dormql(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const double* a,
+                   lapack_int* lda, const double* tau, double* c, lapack_int* ldc, double* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_cunmql(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const lapack_complex_float* a,
+                   lapack_int* lda, const lapack_complex_float* tau, lapack_complex_float* c, lapack_int* ldc,
+                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zunmql(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const lapack_complex_double* a,
+                   lapack_int* lda, const lapack_complex_double* tau, lapack_complex_double* c, lapack_int* ldc,
+                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_sgerqf(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* tau, float* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_dgerqf(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* tau, double* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_cgerqf(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* tau,
+                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zgerqf(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* tau,
+                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_sorgrq(lapack_int* m, lapack_int* n, lapack_int* k, float* a, lapack_int* lda, const float* tau,
+                   float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dorgrq(lapack_int* m, lapack_int* n, lapack_int* k, double* a, lapack_int* lda, const double* tau,
+                   double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cungrq(lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_float* a, lapack_int* lda,
+                   const lapack_complex_float* tau, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zungrq(lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_double* a, lapack_int* lda,
+                   const lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_sormrq(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const float* a,
+                   lapack_int* lda, const float* tau, float* c, lapack_int* ldc, float* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_dormrq(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const double* a,
+                   lapack_int* lda, const double* tau, double* c, lapack_int* ldc, double* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_cunmrq(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const lapack_complex_float* a,
+                   lapack_int* lda, const lapack_complex_float* tau, lapack_complex_float* c, lapack_int* ldc,
+                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zunmrq(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const lapack_complex_double* a,
+                   lapack_int* lda, const lapack_complex_double* tau, lapack_complex_double* c, lapack_int* ldc,
+                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_stzrzf(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* tau, float* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_dtzrzf(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* tau, double* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_ctzrzf(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* tau,
+                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_ztzrzf(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* tau,
+                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_sormrz(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l, const float* a,
+                   lapack_int* lda, const float* tau, float* c, lapack_int* ldc, float* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_dormrz(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l, const double* a,
+                   lapack_int* lda, const double* tau, double* c, lapack_int* ldc, double* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_cunmrz(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
+                   const lapack_complex_float* a, lapack_int* lda, const lapack_complex_float* tau,
+                   lapack_complex_float* c, lapack_int* ldc, lapack_complex_float* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_zunmrz(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
+                   const lapack_complex_double* a, lapack_int* lda, const lapack_complex_double* tau,
+                   lapack_complex_double* c, lapack_int* ldc, lapack_complex_double* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_sggqrf(lapack_int* n, lapack_int* m, lapack_int* p, float* a, lapack_int* lda, float* taua, float* b,
+                   lapack_int* ldb, float* taub, float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dggqrf(lapack_int* n, lapack_int* m, lapack_int* p, double* a, lapack_int* lda, double* taua, double* b,
+                   lapack_int* ldb, double* taub, double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cggqrf(lapack_int* n, lapack_int* m, lapack_int* p, lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* taua, lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* taub,
+                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zggqrf(lapack_int* n, lapack_int* m, lapack_int* p, lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* taua, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* taub,
+                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_sggrqf(lapack_int* m, lapack_int* p, lapack_int* n, float* a, lapack_int* lda, float* taua, float* b,
+                   lapack_int* ldb, float* taub, float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dggrqf(lapack_int* m, lapack_int* p, lapack_int* n, double* a, lapack_int* lda, double* taua, double* b,
+                   lapack_int* ldb, double* taub, double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cggrqf(lapack_int* m, lapack_int* p, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* taua, lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* taub,
+                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zggrqf(lapack_int* m, lapack_int* p, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* taua, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* taub,
+                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_sgebrd(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* d, float* e, float* tauq,
+                   float* taup, float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dgebrd(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* d, double* e, double* tauq,
+                   double* taup, double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cgebrd(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, float* d, float* e,
+                   lapack_complex_float* tauq, lapack_complex_float* taup, lapack_complex_float* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_zgebrd(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, double* d, double* e,
+                   lapack_complex_double* tauq, lapack_complex_double* taup, lapack_complex_double* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_sgbbrd(char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc, lapack_int* kl, lapack_int* ku, float* ab,
+                   lapack_int* ldab, float* d, float* e, float* q, lapack_int* ldq, float* pt, lapack_int* ldpt,
+                   float* c, lapack_int* ldc, float* work, lapack_int* info);
+void LAPACK_dgbbrd(char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc, lapack_int* kl, lapack_int* ku,
+                   double* ab, lapack_int* ldab, double* d, double* e, double* q, lapack_int* ldq, double* pt,
+                   lapack_int* ldpt, double* c, lapack_int* ldc, double* work, lapack_int* info);
+void LAPACK_cgbbrd(char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc, lapack_int* kl, lapack_int* ku,
+                   lapack_complex_float* ab, lapack_int* ldab, float* d, float* e, lapack_complex_float* q,
+                   lapack_int* ldq, lapack_complex_float* pt, lapack_int* ldpt, lapack_complex_float* c,
+                   lapack_int* ldc, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zgbbrd(char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc, lapack_int* kl, lapack_int* ku,
+                   lapack_complex_double* ab, lapack_int* ldab, double* d, double* e, lapack_complex_double* q,
+                   lapack_int* ldq, lapack_complex_double* pt, lapack_int* ldpt, lapack_complex_double* c,
+                   lapack_int* ldc, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_sorgbr(char* vect, lapack_int* m, lapack_int* n, lapack_int* k, float* a, lapack_int* lda, const float* tau,
+                   float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dorgbr(char* vect, lapack_int* m, lapack_int* n, lapack_int* k, double* a, lapack_int* lda,
+                   const double* tau, double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_sormbr(char* vect, char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const float* a,
+                   lapack_int* lda, const float* tau, float* c, lapack_int* ldc, float* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_dormbr(char* vect, char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, const double* a,
+                   lapack_int* lda, const double* tau, double* c, lapack_int* ldc, double* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_cungbr(char* vect, lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_float* a, lapack_int* lda,
+                   const lapack_complex_float* tau, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zungbr(char* vect, lapack_int* m, lapack_int* n, lapack_int* k, lapack_complex_double* a, lapack_int* lda,
+                   const lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cunmbr(char* vect, char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k,
+                   const lapack_complex_float* a, lapack_int* lda, const lapack_complex_float* tau,
+                   lapack_complex_float* c, lapack_int* ldc, lapack_complex_float* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_zunmbr(char* vect, char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k,
+                   const lapack_complex_double* a, lapack_int* lda, const lapack_complex_double* tau,
+                   lapack_complex_double* c, lapack_int* ldc, lapack_complex_double* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_sbdsqr(char* uplo, lapack_int* n, lapack_int* ncvt, lapack_int* nru, lapack_int* ncc, float* d, float* e,
+                   float* vt, lapack_int* ldvt, float* u, lapack_int* ldu, float* c, lapack_int* ldc, float* work,
+                   lapack_int* info);
+void LAPACK_dbdsqr(char* uplo, lapack_int* n, lapack_int* ncvt, lapack_int* nru, lapack_int* ncc, double* d, double* e,
+                   double* vt, lapack_int* ldvt, double* u, lapack_int* ldu, double* c, lapack_int* ldc, double* work,
+                   lapack_int* info);
+void LAPACK_cbdsqr(char* uplo, lapack_int* n, lapack_int* ncvt, lapack_int* nru, lapack_int* ncc, float* d, float* e,
+                   lapack_complex_float* vt, lapack_int* ldvt, lapack_complex_float* u, lapack_int* ldu,
+                   lapack_complex_float* c, lapack_int* ldc, float* work, lapack_int* info);
+void LAPACK_zbdsqr(char* uplo, lapack_int* n, lapack_int* ncvt, lapack_int* nru, lapack_int* ncc, double* d, double* e,
+                   lapack_complex_double* vt, lapack_int* ldvt, lapack_complex_double* u, lapack_int* ldu,
+                   lapack_complex_double* c, lapack_int* ldc, double* work, lapack_int* info);
+void LAPACK_sbdsdc(char* uplo, char* compq, lapack_int* n, float* d, float* e, float* u, lapack_int* ldu, float* vt,
+                   lapack_int* ldvt, float* q, lapack_int* iq, float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_dbdsdc(char* uplo, char* compq, lapack_int* n, double* d, double* e, double* u, lapack_int* ldu, double* vt,
+                   lapack_int* ldvt, double* q, lapack_int* iq, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_ssytrd(char* uplo, lapack_int* n, float* a, lapack_int* lda, float* d, float* e, float* tau, float* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_dsytrd(char* uplo, lapack_int* n, double* a, lapack_int* lda, double* d, double* e, double* tau,
+                   double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_sorgtr(char* uplo, lapack_int* n, float* a, lapack_int* lda, const float* tau, float* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_dorgtr(char* uplo, lapack_int* n, double* a, lapack_int* lda, const double* tau, double* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_sormtr(char* side, char* uplo, char* trans, lapack_int* m, lapack_int* n, const float* a, lapack_int* lda,
+                   const float* tau, float* c, lapack_int* ldc, float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dormtr(char* side, char* uplo, char* trans, lapack_int* m, lapack_int* n, const double* a, lapack_int* lda,
+                   const double* tau, double* c, lapack_int* ldc, double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_chetrd(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, float* d, float* e,
+                   lapack_complex_float* tau, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zhetrd(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, double* d, double* e,
+                   lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cungtr(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, const lapack_complex_float* tau,
+                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zungtr(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                   const lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cunmtr(char* side, char* uplo, char* trans, lapack_int* m, lapack_int* n, const lapack_complex_float* a,
+                   lapack_int* lda, const lapack_complex_float* tau, lapack_complex_float* c, lapack_int* ldc,
+                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zunmtr(char* side, char* uplo, char* trans, lapack_int* m, lapack_int* n, const lapack_complex_double* a,
+                   lapack_int* lda, const lapack_complex_double* tau, lapack_complex_double* c, lapack_int* ldc,
+                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_ssptrd(char* uplo, lapack_int* n, float* ap, float* d, float* e, float* tau, lapack_int* info);
+void LAPACK_dsptrd(char* uplo, lapack_int* n, double* ap, double* d, double* e, double* tau, lapack_int* info);
+void LAPACK_sopgtr(char* uplo, lapack_int* n, const float* ap, const float* tau, float* q, lapack_int* ldq, float* work,
+                   lapack_int* info);
+void LAPACK_dopgtr(char* uplo, lapack_int* n, const double* ap, const double* tau, double* q, lapack_int* ldq,
+                   double* work, lapack_int* info);
+void LAPACK_sopmtr(char* side, char* uplo, char* trans, lapack_int* m, lapack_int* n, const float* ap, const float* tau,
+                   float* c, lapack_int* ldc, float* work, lapack_int* info);
+void LAPACK_dopmtr(char* side, char* uplo, char* trans, lapack_int* m, lapack_int* n, const double* ap,
+                   const double* tau, double* c, lapack_int* ldc, double* work, lapack_int* info);
+void LAPACK_chptrd(char* uplo, lapack_int* n, lapack_complex_float* ap, float* d, float* e, lapack_complex_float* tau,
+                   lapack_int* info);
+void LAPACK_zhptrd(char* uplo, lapack_int* n, lapack_complex_double* ap, double* d, double* e,
+                   lapack_complex_double* tau, lapack_int* info);
+void LAPACK_cupgtr(char* uplo, lapack_int* n, const lapack_complex_float* ap, const lapack_complex_float* tau,
+                   lapack_complex_float* q, lapack_int* ldq, lapack_complex_float* work, lapack_int* info);
+void LAPACK_zupgtr(char* uplo, lapack_int* n, const lapack_complex_double* ap, const lapack_complex_double* tau,
+                   lapack_complex_double* q, lapack_int* ldq, lapack_complex_double* work, lapack_int* info);
+void LAPACK_cupmtr(char* side, char* uplo, char* trans, lapack_int* m, lapack_int* n, const lapack_complex_float* ap,
+                   const lapack_complex_float* tau, lapack_complex_float* c, lapack_int* ldc,
+                   lapack_complex_float* work, lapack_int* info);
+void LAPACK_zupmtr(char* side, char* uplo, char* trans, lapack_int* m, lapack_int* n, const lapack_complex_double* ap,
+                   const lapack_complex_double* tau, lapack_complex_double* c, lapack_int* ldc,
+                   lapack_complex_double* work, lapack_int* info);
+void LAPACK_ssbtrd(char* vect, char* uplo, lapack_int* n, lapack_int* kd, float* ab, lapack_int* ldab, float* d,
+                   float* e, float* q, lapack_int* ldq, float* work, lapack_int* info);
+void LAPACK_dsbtrd(char* vect, char* uplo, lapack_int* n, lapack_int* kd, double* ab, lapack_int* ldab, double* d,
+                   double* e, double* q, lapack_int* ldq, double* work, lapack_int* info);
+void LAPACK_chbtrd(char* vect, char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_float* ab, lapack_int* ldab,
+                   float* d, float* e, lapack_complex_float* q, lapack_int* ldq, lapack_complex_float* work,
+                   lapack_int* info);
+void LAPACK_zhbtrd(char* vect, char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_double* ab, lapack_int* ldab,
+                   double* d, double* e, lapack_complex_double* q, lapack_int* ldq, lapack_complex_double* work,
+                   lapack_int* info);
+void LAPACK_ssterf(lapack_int* n, float* d, float* e, lapack_int* info);
+void LAPACK_dsterf(lapack_int* n, double* d, double* e, lapack_int* info);
+void LAPACK_ssteqr(char* compz, lapack_int* n, float* d, float* e, float* z, lapack_int* ldz, float* work,
+                   lapack_int* info);
+void LAPACK_dsteqr(char* compz, lapack_int* n, double* d, double* e, double* z, lapack_int* ldz, double* work,
+                   lapack_int* info);
+void LAPACK_csteqr(char* compz, lapack_int* n, float* d, float* e, lapack_complex_float* z, lapack_int* ldz,
+                   float* work, lapack_int* info);
+void LAPACK_zsteqr(char* compz, lapack_int* n, double* d, double* e, lapack_complex_double* z, lapack_int* ldz,
+                   double* work, lapack_int* info);
+void LAPACK_sstemr(char* jobz, char* range, lapack_int* n, float* d, float* e, float* vl, float* vu, lapack_int* il,
+                   lapack_int* iu, lapack_int* m, float* w, float* z, lapack_int* ldz, lapack_int* nzc,
+                   lapack_int* isuppz, lapack_logical* tryrac, float* work, lapack_int* lwork, lapack_int* iwork,
+                   lapack_int* liwork, lapack_int* info);
+void LAPACK_dstemr(char* jobz, char* range, lapack_int* n, double* d, double* e, double* vl, double* vu, lapack_int* il,
+                   lapack_int* iu, lapack_int* m, double* w, double* z, lapack_int* ldz, lapack_int* nzc,
+                   lapack_int* isuppz, lapack_logical* tryrac, double* work, lapack_int* lwork, lapack_int* iwork,
+                   lapack_int* liwork, lapack_int* info);
+void LAPACK_cstemr(char* jobz, char* range, lapack_int* n, float* d, float* e, float* vl, float* vu, lapack_int* il,
+                   lapack_int* iu, lapack_int* m, float* w, lapack_complex_float* z, lapack_int* ldz, lapack_int* nzc,
+                   lapack_int* isuppz, lapack_logical* tryrac, float* work, lapack_int* lwork, lapack_int* iwork,
+                   lapack_int* liwork, lapack_int* info);
+void LAPACK_zstemr(char* jobz, char* range, lapack_int* n, double* d, double* e, double* vl, double* vu, lapack_int* il,
+                   lapack_int* iu, lapack_int* m, double* w, lapack_complex_double* z, lapack_int* ldz, lapack_int* nzc,
+                   lapack_int* isuppz, lapack_logical* tryrac, double* work, lapack_int* lwork, lapack_int* iwork,
+                   lapack_int* liwork, lapack_int* info);
+void LAPACK_sstedc(char* compz, lapack_int* n, float* d, float* e, float* z, lapack_int* ldz, float* work,
+                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_dstedc(char* compz, lapack_int* n, double* d, double* e, double* z, lapack_int* ldz, double* work,
+                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_cstedc(char* compz, lapack_int* n, float* d, float* e, lapack_complex_float* z, lapack_int* ldz,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* lrwork, lapack_int* iwork,
+                   lapack_int* liwork, lapack_int* info);
+void LAPACK_zstedc(char* compz, lapack_int* n, double* d, double* e, lapack_complex_double* z, lapack_int* ldz,
+                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                   lapack_int* liwork, lapack_int* info);
+void LAPACK_sstegr(char* jobz, char* range, lapack_int* n, float* d, float* e, float* vl, float* vu, lapack_int* il,
+                   lapack_int* iu, float* abstol, lapack_int* m, float* w, float* z, lapack_int* ldz,
+                   lapack_int* isuppz, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_dstegr(char* jobz, char* range, lapack_int* n, double* d, double* e, double* vl, double* vu, lapack_int* il,
+                   lapack_int* iu, double* abstol, lapack_int* m, double* w, double* z, lapack_int* ldz,
+                   lapack_int* isuppz, double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_cstegr(char* jobz, char* range, lapack_int* n, float* d, float* e, float* vl, float* vu, lapack_int* il,
+                   lapack_int* iu, float* abstol, lapack_int* m, float* w, lapack_complex_float* z, lapack_int* ldz,
+                   lapack_int* isuppz, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_zstegr(char* jobz, char* range, lapack_int* n, double* d, double* e, double* vl, double* vu, lapack_int* il,
+                   lapack_int* iu, double* abstol, lapack_int* m, double* w, lapack_complex_double* z, lapack_int* ldz,
+                   lapack_int* isuppz, double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_spteqr(char* compz, lapack_int* n, float* d, float* e, float* z, lapack_int* ldz, float* work,
+                   lapack_int* info);
+void LAPACK_dpteqr(char* compz, lapack_int* n, double* d, double* e, double* z, lapack_int* ldz, double* work,
+                   lapack_int* info);
+void LAPACK_cpteqr(char* compz, lapack_int* n, float* d, float* e, lapack_complex_float* z, lapack_int* ldz,
+                   float* work, lapack_int* info);
+void LAPACK_zpteqr(char* compz, lapack_int* n, double* d, double* e, lapack_complex_double* z, lapack_int* ldz,
+                   double* work, lapack_int* info);
+void LAPACK_sstebz(char* range, char* order, lapack_int* n, float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                   float* abstol, const float* d, const float* e, lapack_int* m, lapack_int* nsplit, float* w,
+                   lapack_int* iblock, lapack_int* isplit, float* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_dstebz(char* range, char* order, lapack_int* n, double* vl, double* vu, lapack_int* il, lapack_int* iu,
+                   double* abstol, const double* d, const double* e, lapack_int* m, lapack_int* nsplit, double* w,
+                   lapack_int* iblock, lapack_int* isplit, double* work, lapack_int* iwork, lapack_int* info);
+void LAPACK_sstein(lapack_int* n, const float* d, const float* e, lapack_int* m, const float* w,
+                   const lapack_int* iblock, const lapack_int* isplit, float* z, lapack_int* ldz, float* work,
+                   lapack_int* iwork, lapack_int* ifailv, lapack_int* info);
+void LAPACK_dstein(lapack_int* n, const double* d, const double* e, lapack_int* m, const double* w,
+                   const lapack_int* iblock, const lapack_int* isplit, double* z, lapack_int* ldz, double* work,
+                   lapack_int* iwork, lapack_int* ifailv, lapack_int* info);
+void LAPACK_cstein(lapack_int* n, const float* d, const float* e, lapack_int* m, const float* w,
+                   const lapack_int* iblock, const lapack_int* isplit, lapack_complex_float* z, lapack_int* ldz,
+                   float* work, lapack_int* iwork, lapack_int* ifailv, lapack_int* info);
+void LAPACK_zstein(lapack_int* n, const double* d, const double* e, lapack_int* m, const double* w,
+                   const lapack_int* iblock, const lapack_int* isplit, lapack_complex_double* z, lapack_int* ldz,
+                   double* work, lapack_int* iwork, lapack_int* ifailv, lapack_int* info);
+void LAPACK_sdisna(char* job, lapack_int* m, lapack_int* n, const float* d, float* sep, lapack_int* info);
+void LAPACK_ddisna(char* job, lapack_int* m, lapack_int* n, const double* d, double* sep, lapack_int* info);
+void LAPACK_ssygst(lapack_int* itype, char* uplo, lapack_int* n, float* a, lapack_int* lda, const float* b,
+                   lapack_int* ldb, lapack_int* info);
+void LAPACK_dsygst(lapack_int* itype, char* uplo, lapack_int* n, double* a, lapack_int* lda, const double* b,
+                   lapack_int* ldb, lapack_int* info);
+void LAPACK_chegst(lapack_int* itype, char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                   const lapack_complex_float* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_zhegst(lapack_int* itype, char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                   const lapack_complex_double* b, lapack_int* ldb, lapack_int* info);
+void LAPACK_sspgst(lapack_int* itype, char* uplo, lapack_int* n, float* ap, const float* bp, lapack_int* info);
+void LAPACK_dspgst(lapack_int* itype, char* uplo, lapack_int* n, double* ap, const double* bp, lapack_int* info);
+void LAPACK_chpgst(lapack_int* itype, char* uplo, lapack_int* n, lapack_complex_float* ap,
+                   const lapack_complex_float* bp, lapack_int* info);
+void LAPACK_zhpgst(lapack_int* itype, char* uplo, lapack_int* n, lapack_complex_double* ap,
+                   const lapack_complex_double* bp, lapack_int* info);
+void LAPACK_ssbgst(char* vect, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, float* ab, lapack_int* ldab,
+                   const float* bb, lapack_int* ldbb, float* x, lapack_int* ldx, float* work, lapack_int* info);
+void LAPACK_dsbgst(char* vect, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, double* ab, lapack_int* ldab,
+                   const double* bb, lapack_int* ldbb, double* x, lapack_int* ldx, double* work, lapack_int* info);
+void LAPACK_chbgst(char* vect, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, lapack_complex_float* ab,
+                   lapack_int* ldab, const lapack_complex_float* bb, lapack_int* ldbb, lapack_complex_float* x,
+                   lapack_int* ldx, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zhbgst(char* vect, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, lapack_complex_double* ab,
+                   lapack_int* ldab, const lapack_complex_double* bb, lapack_int* ldbb, lapack_complex_double* x,
+                   lapack_int* ldx, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_spbstf(char* uplo, lapack_int* n, lapack_int* kb, float* bb, lapack_int* ldbb, lapack_int* info);
+void LAPACK_dpbstf(char* uplo, lapack_int* n, lapack_int* kb, double* bb, lapack_int* ldbb, lapack_int* info);
+void LAPACK_cpbstf(char* uplo, lapack_int* n, lapack_int* kb, lapack_complex_float* bb, lapack_int* ldbb,
+                   lapack_int* info);
+void LAPACK_zpbstf(char* uplo, lapack_int* n, lapack_int* kb, lapack_complex_double* bb, lapack_int* ldbb,
+                   lapack_int* info);
+void LAPACK_sgehrd(lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* a, lapack_int* lda, float* tau, float* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_dgehrd(lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* a, lapack_int* lda, double* tau,
+                   double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cgehrd(lapack_int* n, lapack_int* ilo, lapack_int* ihi, lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* tau, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zgehrd(lapack_int* n, lapack_int* ilo, lapack_int* ihi, lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_sorghr(lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* a, lapack_int* lda, const float* tau,
+                   float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dorghr(lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* a, lapack_int* lda, const double* tau,
+                   double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_sormhr(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* ilo, lapack_int* ihi,
+                   const float* a, lapack_int* lda, const float* tau, float* c, lapack_int* ldc, float* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_dormhr(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* ilo, lapack_int* ihi,
+                   const double* a, lapack_int* lda, const double* tau, double* c, lapack_int* ldc, double* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_cunghr(lapack_int* n, lapack_int* ilo, lapack_int* ihi, lapack_complex_float* a, lapack_int* lda,
+                   const lapack_complex_float* tau, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zunghr(lapack_int* n, lapack_int* ilo, lapack_int* ihi, lapack_complex_double* a, lapack_int* lda,
+                   const lapack_complex_double* tau, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cunmhr(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* ilo, lapack_int* ihi,
+                   const lapack_complex_float* a, lapack_int* lda, const lapack_complex_float* tau,
+                   lapack_complex_float* c, lapack_int* ldc, lapack_complex_float* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_zunmhr(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* ilo, lapack_int* ihi,
+                   const lapack_complex_double* a, lapack_int* lda, const lapack_complex_double* tau,
+                   lapack_complex_double* c, lapack_int* ldc, lapack_complex_double* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_sgebal(char* job, lapack_int* n, float* a, lapack_int* lda, lapack_int* ilo, lapack_int* ihi, float* scale,
+                   lapack_int* info);
+void LAPACK_dgebal(char* job, lapack_int* n, double* a, lapack_int* lda, lapack_int* ilo, lapack_int* ihi,
+                   double* scale, lapack_int* info);
+void LAPACK_cgebal(char* job, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* ilo, lapack_int* ihi,
+                   float* scale, lapack_int* info);
+void LAPACK_zgebal(char* job, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* ilo,
+                   lapack_int* ihi, double* scale, lapack_int* info);
+void LAPACK_sgebak(char* job, char* side, lapack_int* n, lapack_int* ilo, lapack_int* ihi, const float* scale,
+                   lapack_int* m, float* v, lapack_int* ldv, lapack_int* info);
+void LAPACK_dgebak(char* job, char* side, lapack_int* n, lapack_int* ilo, lapack_int* ihi, const double* scale,
+                   lapack_int* m, double* v, lapack_int* ldv, lapack_int* info);
+void LAPACK_cgebak(char* job, char* side, lapack_int* n, lapack_int* ilo, lapack_int* ihi, const float* scale,
+                   lapack_int* m, lapack_complex_float* v, lapack_int* ldv, lapack_int* info);
+void LAPACK_zgebak(char* job, char* side, lapack_int* n, lapack_int* ilo, lapack_int* ihi, const double* scale,
+                   lapack_int* m, lapack_complex_double* v, lapack_int* ldv, lapack_int* info);
+void LAPACK_shseqr(char* job, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* h, lapack_int* ldh,
+                   float* wr, float* wi, float* z, lapack_int* ldz, float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dhseqr(char* job, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* h, lapack_int* ldh,
+                   double* wr, double* wi, double* z, lapack_int* ldz, double* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_chseqr(char* job, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, lapack_complex_float* h,
+                   lapack_int* ldh, lapack_complex_float* w, lapack_complex_float* z, lapack_int* ldz,
+                   lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zhseqr(char* job, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, lapack_complex_double* h,
+                   lapack_int* ldh, lapack_complex_double* w, lapack_complex_double* z, lapack_int* ldz,
+                   lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_shsein(char* job, char* eigsrc, char* initv, lapack_logical* select, lapack_int* n, const float* h,
+                   lapack_int* ldh, float* wr, const float* wi, float* vl, lapack_int* ldvl, float* vr,
+                   lapack_int* ldvr, lapack_int* mm, lapack_int* m, float* work, lapack_int* ifaill, lapack_int* ifailr,
+                   lapack_int* info);
+void LAPACK_dhsein(char* job, char* eigsrc, char* initv, lapack_logical* select, lapack_int* n, const double* h,
+                   lapack_int* ldh, double* wr, const double* wi, double* vl, lapack_int* ldvl, double* vr,
+                   lapack_int* ldvr, lapack_int* mm, lapack_int* m, double* work, lapack_int* ifaill,
+                   lapack_int* ifailr, lapack_int* info);
+void LAPACK_chsein(char* job, char* eigsrc, char* initv, const lapack_logical* select, lapack_int* n,
+                   const lapack_complex_float* h, lapack_int* ldh, lapack_complex_float* w, lapack_complex_float* vl,
+                   lapack_int* ldvl, lapack_complex_float* vr, lapack_int* ldvr, lapack_int* mm, lapack_int* m,
+                   lapack_complex_float* work, float* rwork, lapack_int* ifaill, lapack_int* ifailr, lapack_int* info);
+void LAPACK_zhsein(char* job, char* eigsrc, char* initv, const lapack_logical* select, lapack_int* n,
+                   const lapack_complex_double* h, lapack_int* ldh, lapack_complex_double* w, lapack_complex_double* vl,
+                   lapack_int* ldvl, lapack_complex_double* vr, lapack_int* ldvr, lapack_int* mm, lapack_int* m,
+                   lapack_complex_double* work, double* rwork, lapack_int* ifaill, lapack_int* ifailr,
+                   lapack_int* info);
+void LAPACK_strevc(char* side, char* howmny, lapack_logical* select, lapack_int* n, const float* t, lapack_int* ldt,
+                   float* vl, lapack_int* ldvl, float* vr, lapack_int* ldvr, lapack_int* mm, lapack_int* m, float* work,
+                   lapack_int* info);
+void LAPACK_dtrevc(char* side, char* howmny, lapack_logical* select, lapack_int* n, const double* t, lapack_int* ldt,
+                   double* vl, lapack_int* ldvl, double* vr, lapack_int* ldvr, lapack_int* mm, lapack_int* m,
+                   double* work, lapack_int* info);
+void LAPACK_ctrevc(char* side, char* howmny, const lapack_logical* select, lapack_int* n, lapack_complex_float* t,
+                   lapack_int* ldt, lapack_complex_float* vl, lapack_int* ldvl, lapack_complex_float* vr,
+                   lapack_int* ldvr, lapack_int* mm, lapack_int* m, lapack_complex_float* work, float* rwork,
+                   lapack_int* info);
+void LAPACK_ztrevc(char* side, char* howmny, const lapack_logical* select, lapack_int* n, lapack_complex_double* t,
+                   lapack_int* ldt, lapack_complex_double* vl, lapack_int* ldvl, lapack_complex_double* vr,
+                   lapack_int* ldvr, lapack_int* mm, lapack_int* m, lapack_complex_double* work, double* rwork,
+                   lapack_int* info);
+void LAPACK_strsna(char* job, char* howmny, const lapack_logical* select, lapack_int* n, const float* t,
+                   lapack_int* ldt, const float* vl, lapack_int* ldvl, const float* vr, lapack_int* ldvr, float* s,
+                   float* sep, lapack_int* mm, lapack_int* m, float* work, lapack_int* ldwork, lapack_int* iwork,
+                   lapack_int* info);
+void LAPACK_dtrsna(char* job, char* howmny, const lapack_logical* select, lapack_int* n, const double* t,
+                   lapack_int* ldt, const double* vl, lapack_int* ldvl, const double* vr, lapack_int* ldvr, double* s,
+                   double* sep, lapack_int* mm, lapack_int* m, double* work, lapack_int* ldwork, lapack_int* iwork,
+                   lapack_int* info);
+void LAPACK_ctrsna(char* job, char* howmny, const lapack_logical* select, lapack_int* n, const lapack_complex_float* t,
+                   lapack_int* ldt, const lapack_complex_float* vl, lapack_int* ldvl, const lapack_complex_float* vr,
+                   lapack_int* ldvr, float* s, float* sep, lapack_int* mm, lapack_int* m, lapack_complex_float* work,
+                   lapack_int* ldwork, float* rwork, lapack_int* info);
+void LAPACK_ztrsna(char* job, char* howmny, const lapack_logical* select, lapack_int* n, const lapack_complex_double* t,
+                   lapack_int* ldt, const lapack_complex_double* vl, lapack_int* ldvl, const lapack_complex_double* vr,
+                   lapack_int* ldvr, double* s, double* sep, lapack_int* mm, lapack_int* m, lapack_complex_double* work,
+                   lapack_int* ldwork, double* rwork, lapack_int* info);
+void LAPACK_strexc(char* compq, lapack_int* n, float* t, lapack_int* ldt, float* q, lapack_int* ldq, lapack_int* ifst,
+                   lapack_int* ilst, float* work, lapack_int* info);
+void LAPACK_dtrexc(char* compq, lapack_int* n, double* t, lapack_int* ldt, double* q, lapack_int* ldq, lapack_int* ifst,
+                   lapack_int* ilst, double* work, lapack_int* info);
+void LAPACK_ctrexc(char* compq, lapack_int* n, lapack_complex_float* t, lapack_int* ldt, lapack_complex_float* q,
+                   lapack_int* ldq, lapack_int* ifst, lapack_int* ilst, lapack_int* info);
+void LAPACK_ztrexc(char* compq, lapack_int* n, lapack_complex_double* t, lapack_int* ldt, lapack_complex_double* q,
+                   lapack_int* ldq, lapack_int* ifst, lapack_int* ilst, lapack_int* info);
+void LAPACK_strsen(char* job, char* compq, const lapack_logical* select, lapack_int* n, float* t, lapack_int* ldt,
+                   float* q, lapack_int* ldq, float* wr, float* wi, lapack_int* m, float* s, float* sep, float* work,
+                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_dtrsen(char* job, char* compq, const lapack_logical* select, lapack_int* n, double* t, lapack_int* ldt,
+                   double* q, lapack_int* ldq, double* wr, double* wi, lapack_int* m, double* s, double* sep,
+                   double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_ctrsen(char* job, char* compq, const lapack_logical* select, lapack_int* n, lapack_complex_float* t,
+                   lapack_int* ldt, lapack_complex_float* q, lapack_int* ldq, lapack_complex_float* w, lapack_int* m,
+                   float* s, float* sep, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_ztrsen(char* job, char* compq, const lapack_logical* select, lapack_int* n, lapack_complex_double* t,
+                   lapack_int* ldt, lapack_complex_double* q, lapack_int* ldq, lapack_complex_double* w, lapack_int* m,
+                   double* s, double* sep, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_strsyl(char* trana, char* tranb, lapack_int* isgn, lapack_int* m, lapack_int* n, const float* a,
+                   lapack_int* lda, const float* b, lapack_int* ldb, float* c, lapack_int* ldc, float* scale,
+                   lapack_int* info);
+void LAPACK_dtrsyl(char* trana, char* tranb, lapack_int* isgn, lapack_int* m, lapack_int* n, const double* a,
+                   lapack_int* lda, const double* b, lapack_int* ldb, double* c, lapack_int* ldc, double* scale,
+                   lapack_int* info);
+void LAPACK_ctrsyl(char* trana, char* tranb, lapack_int* isgn, lapack_int* m, lapack_int* n,
+                   const lapack_complex_float* a, lapack_int* lda, const lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* c, lapack_int* ldc, float* scale, lapack_int* info);
+void LAPACK_ztrsyl(char* trana, char* tranb, lapack_int* isgn, lapack_int* m, lapack_int* n,
+                   const lapack_complex_double* a, lapack_int* lda, const lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* c, lapack_int* ldc, double* scale, lapack_int* info);
+void LAPACK_sgghrd(char* compq, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* a, lapack_int* lda,
+                   float* b, lapack_int* ldb, float* q, lapack_int* ldq, float* z, lapack_int* ldz, lapack_int* info);
+void LAPACK_dgghrd(char* compq, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* a,
+                   lapack_int* lda, double* b, lapack_int* ldb, double* q, lapack_int* ldq, double* z, lapack_int* ldz,
+                   lapack_int* info);
+void LAPACK_cgghrd(char* compq, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, lapack_complex_float* a,
+                   lapack_int* lda, lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* q, lapack_int* ldq,
+                   lapack_complex_float* z, lapack_int* ldz, lapack_int* info);
+void LAPACK_zgghrd(char* compq, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, lapack_complex_double* a,
+                   lapack_int* lda, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* q,
+                   lapack_int* ldq, lapack_complex_double* z, lapack_int* ldz, lapack_int* info);
+void LAPACK_sggbal(char* job, lapack_int* n, float* a, lapack_int* lda, float* b, lapack_int* ldb, lapack_int* ilo,
+                   lapack_int* ihi, float* lscale, float* rscale, float* work, lapack_int* info);
+void LAPACK_dggbal(char* job, lapack_int* n, double* a, lapack_int* lda, double* b, lapack_int* ldb, lapack_int* ilo,
+                   lapack_int* ihi, double* lscale, double* rscale, double* work, lapack_int* info);
+void LAPACK_cggbal(char* job, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b,
+                   lapack_int* ldb, lapack_int* ilo, lapack_int* ihi, float* lscale, float* rscale, float* work,
+                   lapack_int* info);
+void LAPACK_zggbal(char* job, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b,
+                   lapack_int* ldb, lapack_int* ilo, lapack_int* ihi, double* lscale, double* rscale, double* work,
+                   lapack_int* info);
+void LAPACK_sggbak(char* job, char* side, lapack_int* n, lapack_int* ilo, lapack_int* ihi, const float* lscale,
+                   const float* rscale, lapack_int* m, float* v, lapack_int* ldv, lapack_int* info);
+void LAPACK_dggbak(char* job, char* side, lapack_int* n, lapack_int* ilo, lapack_int* ihi, const double* lscale,
+                   const double* rscale, lapack_int* m, double* v, lapack_int* ldv, lapack_int* info);
+void LAPACK_cggbak(char* job, char* side, lapack_int* n, lapack_int* ilo, lapack_int* ihi, const float* lscale,
+                   const float* rscale, lapack_int* m, lapack_complex_float* v, lapack_int* ldv, lapack_int* info);
+void LAPACK_zggbak(char* job, char* side, lapack_int* n, lapack_int* ilo, lapack_int* ihi, const double* lscale,
+                   const double* rscale, lapack_int* m, lapack_complex_double* v, lapack_int* ldv, lapack_int* info);
+void LAPACK_shgeqz(char* job, char* compq, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* h,
+                   lapack_int* ldh, float* t, lapack_int* ldt, float* alphar, float* alphai, float* beta, float* q,
+                   lapack_int* ldq, float* z, lapack_int* ldz, float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dhgeqz(char* job, char* compq, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* h,
+                   lapack_int* ldh, double* t, lapack_int* ldt, double* alphar, double* alphai, double* beta, double* q,
+                   lapack_int* ldq, double* z, lapack_int* ldz, double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_chgeqz(char* job, char* compq, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi,
+                   lapack_complex_float* h, lapack_int* ldh, lapack_complex_float* t, lapack_int* ldt,
+                   lapack_complex_float* alpha, lapack_complex_float* beta, lapack_complex_float* q, lapack_int* ldq,
+                   lapack_complex_float* z, lapack_int* ldz, lapack_complex_float* work, lapack_int* lwork,
+                   float* rwork, lapack_int* info);
+void LAPACK_zhgeqz(char* job, char* compq, char* compz, lapack_int* n, lapack_int* ilo, lapack_int* ihi,
+                   lapack_complex_double* h, lapack_int* ldh, lapack_complex_double* t, lapack_int* ldt,
+                   lapack_complex_double* alpha, lapack_complex_double* beta, lapack_complex_double* q, lapack_int* ldq,
+                   lapack_complex_double* z, lapack_int* ldz, lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int* info);
+void LAPACK_stgevc(char* side, char* howmny, const lapack_logical* select, lapack_int* n, const float* s,
+                   lapack_int* lds, const float* p, lapack_int* ldp, float* vl, lapack_int* ldvl, float* vr,
+                   lapack_int* ldvr, lapack_int* mm, lapack_int* m, float* work, lapack_int* info);
+void LAPACK_dtgevc(char* side, char* howmny, const lapack_logical* select, lapack_int* n, const double* s,
+                   lapack_int* lds, const double* p, lapack_int* ldp, double* vl, lapack_int* ldvl, double* vr,
+                   lapack_int* ldvr, lapack_int* mm, lapack_int* m, double* work, lapack_int* info);
+void LAPACK_ctgevc(char* side, char* howmny, const lapack_logical* select, lapack_int* n, const lapack_complex_float* s,
+                   lapack_int* lds, const lapack_complex_float* p, lapack_int* ldp, lapack_complex_float* vl,
+                   lapack_int* ldvl, lapack_complex_float* vr, lapack_int* ldvr, lapack_int* mm, lapack_int* m,
+                   lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_ztgevc(char* side, char* howmny, const lapack_logical* select, lapack_int* n,
+                   const lapack_complex_double* s, lapack_int* lds, const lapack_complex_double* p, lapack_int* ldp,
+                   lapack_complex_double* vl, lapack_int* ldvl, lapack_complex_double* vr, lapack_int* ldvr,
+                   lapack_int* mm, lapack_int* m, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_stgexc(lapack_logical* wantq, lapack_logical* wantz, lapack_int* n, float* a, lapack_int* lda, float* b,
+                   lapack_int* ldb, float* q, lapack_int* ldq, float* z, lapack_int* ldz, lapack_int* ifst,
+                   lapack_int* ilst, float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dtgexc(lapack_logical* wantq, lapack_logical* wantz, lapack_int* n, double* a, lapack_int* lda, double* b,
+                   lapack_int* ldb, double* q, lapack_int* ldq, double* z, lapack_int* ldz, lapack_int* ifst,
+                   lapack_int* ilst, double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_ctgexc(lapack_logical* wantq, lapack_logical* wantz, lapack_int* n, lapack_complex_float* a,
+                   lapack_int* lda, lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* q, lapack_int* ldq,
+                   lapack_complex_float* z, lapack_int* ldz, lapack_int* ifst, lapack_int* ilst, lapack_int* info);
+void LAPACK_ztgexc(lapack_logical* wantq, lapack_logical* wantz, lapack_int* n, lapack_complex_double* a,
+                   lapack_int* lda, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* q,
+                   lapack_int* ldq, lapack_complex_double* z, lapack_int* ldz, lapack_int* ifst, lapack_int* ilst,
+                   lapack_int* info);
+void LAPACK_stgsen(lapack_int* ijob, lapack_logical* wantq, lapack_logical* wantz, const lapack_logical* select,
+                   lapack_int* n, float* a, lapack_int* lda, float* b, lapack_int* ldb, float* alphar, float* alphai,
+                   float* beta, float* q, lapack_int* ldq, float* z, lapack_int* ldz, lapack_int* m, float* pl,
+                   float* pr, float* dif, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_dtgsen(lapack_int* ijob, lapack_logical* wantq, lapack_logical* wantz, const lapack_logical* select,
+                   lapack_int* n, double* a, lapack_int* lda, double* b, lapack_int* ldb, double* alphar,
+                   double* alphai, double* beta, double* q, lapack_int* ldq, double* z, lapack_int* ldz, lapack_int* m,
+                   double* pl, double* pr, double* dif, double* work, lapack_int* lwork, lapack_int* iwork,
+                   lapack_int* liwork, lapack_int* info);
+void LAPACK_ctgsen(lapack_int* ijob, lapack_logical* wantq, lapack_logical* wantz, const lapack_logical* select,
+                   lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
+                   lapack_complex_float* alpha, lapack_complex_float* beta, lapack_complex_float* q, lapack_int* ldq,
+                   lapack_complex_float* z, lapack_int* ldz, lapack_int* m, float* pl, float* pr, float* dif,
+                   lapack_complex_float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_ztgsen(lapack_int* ijob, lapack_logical* wantq, lapack_logical* wantz, const lapack_logical* select,
+                   lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
+                   lapack_complex_double* alpha, lapack_complex_double* beta, lapack_complex_double* q, lapack_int* ldq,
+                   lapack_complex_double* z, lapack_int* ldz, lapack_int* m, double* pl, double* pr, double* dif,
+                   lapack_complex_double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_stgsyl(char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n, const float* a, lapack_int* lda,
+                   const float* b, lapack_int* ldb, float* c, lapack_int* ldc, const float* d, lapack_int* ldd,
+                   const float* e, lapack_int* lde, float* f, lapack_int* ldf, float* scale, float* dif, float* work,
+                   lapack_int* lwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_dtgsyl(char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n, const double* a, lapack_int* lda,
+                   const double* b, lapack_int* ldb, double* c, lapack_int* ldc, const double* d, lapack_int* ldd,
+                   const double* e, lapack_int* lde, double* f, lapack_int* ldf, double* scale, double* dif,
+                   double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_ctgsyl(char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n, const lapack_complex_float* a,
+                   lapack_int* lda, const lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* c,
+                   lapack_int* ldc, const lapack_complex_float* d, lapack_int* ldd, const lapack_complex_float* e,
+                   lapack_int* lde, lapack_complex_float* f, lapack_int* ldf, float* scale, float* dif,
+                   lapack_complex_float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_ztgsyl(char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n, const lapack_complex_double* a,
+                   lapack_int* lda, const lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* c,
+                   lapack_int* ldc, const lapack_complex_double* d, lapack_int* ldd, const lapack_complex_double* e,
+                   lapack_int* lde, lapack_complex_double* f, lapack_int* ldf, double* scale, double* dif,
+                   lapack_complex_double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_stgsna(char* job, char* howmny, const lapack_logical* select, lapack_int* n, const float* a,
+                   lapack_int* lda, const float* b, lapack_int* ldb, const float* vl, lapack_int* ldvl, const float* vr,
+                   lapack_int* ldvr, float* s, float* dif, lapack_int* mm, lapack_int* m, float* work,
+                   lapack_int* lwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_dtgsna(char* job, char* howmny, const lapack_logical* select, lapack_int* n, const double* a,
+                   lapack_int* lda, const double* b, lapack_int* ldb, const double* vl, lapack_int* ldvl,
+                   const double* vr, lapack_int* ldvr, double* s, double* dif, lapack_int* mm, lapack_int* m,
+                   double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_ctgsna(char* job, char* howmny, const lapack_logical* select, lapack_int* n, const lapack_complex_float* a,
+                   lapack_int* lda, const lapack_complex_float* b, lapack_int* ldb, const lapack_complex_float* vl,
+                   lapack_int* ldvl, const lapack_complex_float* vr, lapack_int* ldvr, float* s, float* dif,
+                   lapack_int* mm, lapack_int* m, lapack_complex_float* work, lapack_int* lwork, lapack_int* iwork,
+                   lapack_int* info);
+void LAPACK_ztgsna(char* job, char* howmny, const lapack_logical* select, lapack_int* n, const lapack_complex_double* a,
+                   lapack_int* lda, const lapack_complex_double* b, lapack_int* ldb, const lapack_complex_double* vl,
+                   lapack_int* ldvl, const lapack_complex_double* vr, lapack_int* ldvr, double* s, double* dif,
+                   lapack_int* mm, lapack_int* m, lapack_complex_double* work, lapack_int* lwork, lapack_int* iwork,
+                   lapack_int* info);
+void LAPACK_sggsvp(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* p, lapack_int* n, float* a,
+                   lapack_int* lda, float* b, lapack_int* ldb, float* tola, float* tolb, lapack_int* k, lapack_int* l,
+                   float* u, lapack_int* ldu, float* v, lapack_int* ldv, float* q, lapack_int* ldq, lapack_int* iwork,
+                   float* tau, float* work, lapack_int* info);
+void LAPACK_dggsvp(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* p, lapack_int* n, double* a,
+                   lapack_int* lda, double* b, lapack_int* ldb, double* tola, double* tolb, lapack_int* k,
+                   lapack_int* l, double* u, lapack_int* ldu, double* v, lapack_int* ldv, double* q, lapack_int* ldq,
+                   lapack_int* iwork, double* tau, double* work, lapack_int* info);
+void LAPACK_cggsvp(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* p, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b, lapack_int* ldb, float* tola,
+                   float* tolb, lapack_int* k, lapack_int* l, lapack_complex_float* u, lapack_int* ldu,
+                   lapack_complex_float* v, lapack_int* ldv, lapack_complex_float* q, lapack_int* ldq,
+                   lapack_int* iwork, float* rwork, lapack_complex_float* tau, lapack_complex_float* work,
+                   lapack_int* info);
+void LAPACK_zggsvp(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* p, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b, lapack_int* ldb, double* tola,
+                   double* tolb, lapack_int* k, lapack_int* l, lapack_complex_double* u, lapack_int* ldu,
+                   lapack_complex_double* v, lapack_int* ldv, lapack_complex_double* q, lapack_int* ldq,
+                   lapack_int* iwork, double* rwork, lapack_complex_double* tau, lapack_complex_double* work,
+                   lapack_int* info);
+void LAPACK_stgsja(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* p, lapack_int* n, lapack_int* k,
+                   lapack_int* l, float* a, lapack_int* lda, float* b, lapack_int* ldb, float* tola, float* tolb,
+                   float* alpha, float* beta, float* u, lapack_int* ldu, float* v, lapack_int* ldv, float* q,
+                   lapack_int* ldq, float* work, lapack_int* ncycle, lapack_int* info);
+void LAPACK_dtgsja(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* p, lapack_int* n, lapack_int* k,
+                   lapack_int* l, double* a, lapack_int* lda, double* b, lapack_int* ldb, double* tola, double* tolb,
+                   double* alpha, double* beta, double* u, lapack_int* ldu, double* v, lapack_int* ldv, double* q,
+                   lapack_int* ldq, double* work, lapack_int* ncycle, lapack_int* info);
+void LAPACK_ctgsja(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* p, lapack_int* n, lapack_int* k,
+                   lapack_int* l, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
+                   float* tola, float* tolb, float* alpha, float* beta, lapack_complex_float* u, lapack_int* ldu,
+                   lapack_complex_float* v, lapack_int* ldv, lapack_complex_float* q, lapack_int* ldq,
+                   lapack_complex_float* work, lapack_int* ncycle, lapack_int* info);
+void LAPACK_ztgsja(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* p, lapack_int* n, lapack_int* k,
+                   lapack_int* l, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
+                   double* tola, double* tolb, double* alpha, double* beta, lapack_complex_double* u, lapack_int* ldu,
+                   lapack_complex_double* v, lapack_int* ldv, lapack_complex_double* q, lapack_int* ldq,
+                   lapack_complex_double* work, lapack_int* ncycle, lapack_int* info);
+void LAPACK_sgels(char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* b,
+                  lapack_int* ldb, float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dgels(char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* b,
+                  lapack_int* ldb, double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cgels(char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
+                  lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* work, lapack_int* lwork,
+                  lapack_int* info);
+void LAPACK_zgels(char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a,
+                  lapack_int* lda, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* work,
+                  lapack_int* lwork, lapack_int* info);
+void LAPACK_sgelsy(lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                   lapack_int* jpvt, float* rcond, lapack_int* rank, float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dgelsy(lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* b,
+                   lapack_int* ldb, lapack_int* jpvt, double* rcond, lapack_int* rank, double* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_cgelsy(lapack_int* m, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_int* jpvt, float* rcond, lapack_int* rank,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* info);
+void LAPACK_zgelsy(lapack_int* m, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb, lapack_int* jpvt, double* rcond, lapack_int* rank,
+                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* info);
+void LAPACK_sgelss(lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                   float* s, float* rcond, lapack_int* rank, float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dgelss(lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* b,
+                   lapack_int* ldb, double* s, double* rcond, lapack_int* rank, double* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_cgelss(lapack_int* m, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, float* s, float* rcond, lapack_int* rank,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* info);
+void LAPACK_zgelss(lapack_int* m, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb, double* s, double* rcond, lapack_int* rank,
+                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* info);
+void LAPACK_sgelsd(lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                   float* s, float* rcond, lapack_int* rank, float* work, lapack_int* lwork, lapack_int* iwork,
+                   lapack_int* info);
+void LAPACK_dgelsd(lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda, double* b,
+                   lapack_int* ldb, double* s, double* rcond, lapack_int* rank, double* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_cgelsd(lapack_int* m, lapack_int* n, lapack_int* nrhs, lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, float* s, float* rcond, lapack_int* rank,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_zgelsd(lapack_int* m, lapack_int* n, lapack_int* nrhs, lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb, double* s, double* rcond, lapack_int* rank,
+                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_sgglse(lapack_int* m, lapack_int* n, lapack_int* p, float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                   float* c, float* d, float* x, float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dgglse(lapack_int* m, lapack_int* n, lapack_int* p, double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                   double* c, double* d, double* x, double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cgglse(lapack_int* m, lapack_int* n, lapack_int* p, lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* c, lapack_complex_float* d,
+                   lapack_complex_float* x, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zgglse(lapack_int* m, lapack_int* n, lapack_int* p, lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* c, lapack_complex_double* d,
+                   lapack_complex_double* x, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_sggglm(lapack_int* n, lapack_int* m, lapack_int* p, float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                   float* d, float* x, float* y, float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dggglm(lapack_int* n, lapack_int* m, lapack_int* p, double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                   double* d, double* x, double* y, double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cggglm(lapack_int* n, lapack_int* m, lapack_int* p, lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* d, lapack_complex_float* x,
+                   lapack_complex_float* y, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zggglm(lapack_int* n, lapack_int* m, lapack_int* p, lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* d, lapack_complex_double* x,
+                   lapack_complex_double* y, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_ssyev(char* jobz, char* uplo, lapack_int* n, float* a, lapack_int* lda, float* w, float* work,
+                  lapack_int* lwork, lapack_int* info);
+void LAPACK_dsyev(char* jobz, char* uplo, lapack_int* n, double* a, lapack_int* lda, double* w, double* work,
+                  lapack_int* lwork, lapack_int* info);
+void LAPACK_cheev(char* jobz, char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, float* w,
+                  lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* info);
+void LAPACK_zheev(char* jobz, char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, double* w,
+                  lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* info);
+void LAPACK_ssyevd(char* jobz, char* uplo, lapack_int* n, float* a, lapack_int* lda, float* w, float* work,
+                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_dsyevd(char* jobz, char* uplo, lapack_int* n, double* a, lapack_int* lda, double* w, double* work,
+                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_cheevd(char* jobz, char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, float* w,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* lrwork, lapack_int* iwork,
+                   lapack_int* liwork, lapack_int* info);
+void LAPACK_zheevd(char* jobz, char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, double* w,
+                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                   lapack_int* liwork, lapack_int* info);
+void LAPACK_ssyevx(char* jobz, char* range, char* uplo, lapack_int* n, float* a, lapack_int* lda, float* vl, float* vu,
+                   lapack_int* il, lapack_int* iu, float* abstol, lapack_int* m, float* w, float* z, lapack_int* ldz,
+                   float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_dsyevx(char* jobz, char* range, char* uplo, lapack_int* n, double* a, lapack_int* lda, double* vl,
+                   double* vu, lapack_int* il, lapack_int* iu, double* abstol, lapack_int* m, double* w, double* z,
+                   lapack_int* ldz, double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* ifail,
+                   lapack_int* info);
+void LAPACK_cheevx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                   float* vl, float* vu, lapack_int* il, lapack_int* iu, float* abstol, lapack_int* m, float* w,
+                   lapack_complex_float* z, lapack_int* ldz, lapack_complex_float* work, lapack_int* lwork,
+                   float* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_zheevx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                   double* vl, double* vu, lapack_int* il, lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                   lapack_complex_double* z, lapack_int* ldz, lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_ssyevr(char* jobz, char* range, char* uplo, lapack_int* n, float* a, lapack_int* lda, float* vl, float* vu,
+                   lapack_int* il, lapack_int* iu, float* abstol, lapack_int* m, float* w, float* z, lapack_int* ldz,
+                   lapack_int* isuppz, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_dsyevr(char* jobz, char* range, char* uplo, lapack_int* n, double* a, lapack_int* lda, double* vl,
+                   double* vu, lapack_int* il, lapack_int* iu, double* abstol, lapack_int* m, double* w, double* z,
+                   lapack_int* ldz, lapack_int* isuppz, double* work, lapack_int* lwork, lapack_int* iwork,
+                   lapack_int* liwork, lapack_int* info);
+void LAPACK_cheevr(char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                   float* vl, float* vu, lapack_int* il, lapack_int* iu, float* abstol, lapack_int* m, float* w,
+                   lapack_complex_float* z, lapack_int* ldz, lapack_int* isuppz, lapack_complex_float* work,
+                   lapack_int* lwork, float* rwork, lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_zheevr(char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                   double* vl, double* vu, lapack_int* il, lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                   lapack_complex_double* z, lapack_int* ldz, lapack_int* isuppz, lapack_complex_double* work,
+                   lapack_int* lwork, double* rwork, lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_sspev(char* jobz, char* uplo, lapack_int* n, float* ap, float* w, float* z, lapack_int* ldz, float* work,
+                  lapack_int* info);
+void LAPACK_dspev(char* jobz, char* uplo, lapack_int* n, double* ap, double* w, double* z, lapack_int* ldz,
+                  double* work, lapack_int* info);
+void LAPACK_chpev(char* jobz, char* uplo, lapack_int* n, lapack_complex_float* ap, float* w, lapack_complex_float* z,
+                  lapack_int* ldz, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zhpev(char* jobz, char* uplo, lapack_int* n, lapack_complex_double* ap, double* w, lapack_complex_double* z,
+                  lapack_int* ldz, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_sspevd(char* jobz, char* uplo, lapack_int* n, float* ap, float* w, float* z, lapack_int* ldz, float* work,
+                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_dspevd(char* jobz, char* uplo, lapack_int* n, double* ap, double* w, double* z, lapack_int* ldz,
+                   double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_chpevd(char* jobz, char* uplo, lapack_int* n, lapack_complex_float* ap, float* w, lapack_complex_float* z,
+                   lapack_int* ldz, lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* lrwork,
+                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_zhpevd(char* jobz, char* uplo, lapack_int* n, lapack_complex_double* ap, double* w,
+                   lapack_complex_double* z, lapack_int* ldz, lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_sspevx(char* jobz, char* range, char* uplo, lapack_int* n, float* ap, float* vl, float* vu, lapack_int* il,
+                   lapack_int* iu, float* abstol, lapack_int* m, float* w, float* z, lapack_int* ldz, float* work,
+                   lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_dspevx(char* jobz, char* range, char* uplo, lapack_int* n, double* ap, double* vl, double* vu,
+                   lapack_int* il, lapack_int* iu, double* abstol, lapack_int* m, double* w, double* z, lapack_int* ldz,
+                   double* work, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_chpevx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_float* ap, float* vl, float* vu,
+                   lapack_int* il, lapack_int* iu, float* abstol, lapack_int* m, float* w, lapack_complex_float* z,
+                   lapack_int* ldz, lapack_complex_float* work, float* rwork, lapack_int* iwork, lapack_int* ifail,
+                   lapack_int* info);
+void LAPACK_zhpevx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_double* ap, double* vl,
+                   double* vu, lapack_int* il, lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                   lapack_complex_double* z, lapack_int* ldz, lapack_complex_double* work, double* rwork,
+                   lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_ssbev(char* jobz, char* uplo, lapack_int* n, lapack_int* kd, float* ab, lapack_int* ldab, float* w,
+                  float* z, lapack_int* ldz, float* work, lapack_int* info);
+void LAPACK_dsbev(char* jobz, char* uplo, lapack_int* n, lapack_int* kd, double* ab, lapack_int* ldab, double* w,
+                  double* z, lapack_int* ldz, double* work, lapack_int* info);
+void LAPACK_chbev(char* jobz, char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_float* ab, lapack_int* ldab,
+                  float* w, lapack_complex_float* z, lapack_int* ldz, lapack_complex_float* work, float* rwork,
+                  lapack_int* info);
+void LAPACK_zhbev(char* jobz, char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_double* ab, lapack_int* ldab,
+                  double* w, lapack_complex_double* z, lapack_int* ldz, lapack_complex_double* work, double* rwork,
+                  lapack_int* info);
+void LAPACK_ssbevd(char* jobz, char* uplo, lapack_int* n, lapack_int* kd, float* ab, lapack_int* ldab, float* w,
+                   float* z, lapack_int* ldz, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_dsbevd(char* jobz, char* uplo, lapack_int* n, lapack_int* kd, double* ab, lapack_int* ldab, double* w,
+                   double* z, lapack_int* ldz, double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_chbevd(char* jobz, char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_float* ab, lapack_int* ldab,
+                   float* w, lapack_complex_float* z, lapack_int* ldz, lapack_complex_float* work, lapack_int* lwork,
+                   float* rwork, lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_zhbevd(char* jobz, char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_double* ab, lapack_int* ldab,
+                   double* w, lapack_complex_double* z, lapack_int* ldz, lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_ssbevx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_int* kd, float* ab, lapack_int* ldab,
+                   float* q, lapack_int* ldq, float* vl, float* vu, lapack_int* il, lapack_int* iu, float* abstol,
+                   lapack_int* m, float* w, float* z, lapack_int* ldz, float* work, lapack_int* iwork,
+                   lapack_int* ifail, lapack_int* info);
+void LAPACK_dsbevx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_int* kd, double* ab, lapack_int* ldab,
+                   double* q, lapack_int* ldq, double* vl, double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                   lapack_int* m, double* w, double* z, lapack_int* ldz, double* work, lapack_int* iwork,
+                   lapack_int* ifail, lapack_int* info);
+void LAPACK_chbevx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_float* ab,
+                   lapack_int* ldab, lapack_complex_float* q, lapack_int* ldq, float* vl, float* vu, lapack_int* il,
+                   lapack_int* iu, float* abstol, lapack_int* m, float* w, lapack_complex_float* z, lapack_int* ldz,
+                   lapack_complex_float* work, float* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_zhbevx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_int* kd, lapack_complex_double* ab,
+                   lapack_int* ldab, lapack_complex_double* q, lapack_int* ldq, double* vl, double* vu, lapack_int* il,
+                   lapack_int* iu, double* abstol, lapack_int* m, double* w, lapack_complex_double* z, lapack_int* ldz,
+                   lapack_complex_double* work, double* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_sstev(char* jobz, lapack_int* n, float* d, float* e, float* z, lapack_int* ldz, float* work,
+                  lapack_int* info);
+void LAPACK_dstev(char* jobz, lapack_int* n, double* d, double* e, double* z, lapack_int* ldz, double* work,
+                  lapack_int* info);
+void LAPACK_sstevd(char* jobz, lapack_int* n, float* d, float* e, float* z, lapack_int* ldz, float* work,
+                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_dstevd(char* jobz, lapack_int* n, double* d, double* e, double* z, lapack_int* ldz, double* work,
+                   lapack_int* lwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_sstevx(char* jobz, char* range, lapack_int* n, float* d, float* e, float* vl, float* vu, lapack_int* il,
+                   lapack_int* iu, float* abstol, lapack_int* m, float* w, float* z, lapack_int* ldz, float* work,
+                   lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_dstevx(char* jobz, char* range, lapack_int* n, double* d, double* e, double* vl, double* vu, lapack_int* il,
+                   lapack_int* iu, double* abstol, lapack_int* m, double* w, double* z, lapack_int* ldz, double* work,
+                   lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_sstevr(char* jobz, char* range, lapack_int* n, float* d, float* e, float* vl, float* vu, lapack_int* il,
+                   lapack_int* iu, float* abstol, lapack_int* m, float* w, float* z, lapack_int* ldz,
+                   lapack_int* isuppz, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_dstevr(char* jobz, char* range, lapack_int* n, double* d, double* e, double* vl, double* vu, lapack_int* il,
+                   lapack_int* iu, double* abstol, lapack_int* m, double* w, double* z, lapack_int* ldz,
+                   lapack_int* isuppz, double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_sgees(char* jobvs, char* sort, LAPACK_S_SELECT2 select, lapack_int* n, float* a, lapack_int* lda,
+                  lapack_int* sdim, float* wr, float* wi, float* vs, lapack_int* ldvs, float* work, lapack_int* lwork,
+                  lapack_logical* bwork, lapack_int* info);
+void LAPACK_dgees(char* jobvs, char* sort, LAPACK_D_SELECT2 select, lapack_int* n, double* a, lapack_int* lda,
+                  lapack_int* sdim, double* wr, double* wi, double* vs, lapack_int* ldvs, double* work,
+                  lapack_int* lwork, lapack_logical* bwork, lapack_int* info);
+void LAPACK_cgees(char* jobvs, char* sort, LAPACK_C_SELECT1 select, lapack_int* n, lapack_complex_float* a,
+                  lapack_int* lda, lapack_int* sdim, lapack_complex_float* w, lapack_complex_float* vs,
+                  lapack_int* ldvs, lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_logical* bwork,
+                  lapack_int* info);
+void LAPACK_zgees(char* jobvs, char* sort, LAPACK_Z_SELECT1 select, lapack_int* n, lapack_complex_double* a,
+                  lapack_int* lda, lapack_int* sdim, lapack_complex_double* w, lapack_complex_double* vs,
+                  lapack_int* ldvs, lapack_complex_double* work, lapack_int* lwork, double* rwork,
+                  lapack_logical* bwork, lapack_int* info);
+void LAPACK_sgeesx(char* jobvs, char* sort, LAPACK_S_SELECT2 select, char* sense, lapack_int* n, float* a,
+                   lapack_int* lda, lapack_int* sdim, float* wr, float* wi, float* vs, lapack_int* ldvs, float* rconde,
+                   float* rcondv, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_logical* bwork, lapack_int* info);
+void LAPACK_dgeesx(char* jobvs, char* sort, LAPACK_D_SELECT2 select, char* sense, lapack_int* n, double* a,
+                   lapack_int* lda, lapack_int* sdim, double* wr, double* wi, double* vs, lapack_int* ldvs,
+                   double* rconde, double* rcondv, double* work, lapack_int* lwork, lapack_int* iwork,
+                   lapack_int* liwork, lapack_logical* bwork, lapack_int* info);
+void LAPACK_cgeesx(char* jobvs, char* sort, LAPACK_C_SELECT1 select, char* sense, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda, lapack_int* sdim, lapack_complex_float* w,
+                   lapack_complex_float* vs, lapack_int* ldvs, float* rconde, float* rcondv, lapack_complex_float* work,
+                   lapack_int* lwork, float* rwork, lapack_logical* bwork, lapack_int* info);
+void LAPACK_zgeesx(char* jobvs, char* sort, LAPACK_Z_SELECT1 select, char* sense, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda, lapack_int* sdim, lapack_complex_double* w,
+                   lapack_complex_double* vs, lapack_int* ldvs, double* rconde, double* rcondv,
+                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_logical* bwork,
+                   lapack_int* info);
+void LAPACK_sgeev(char* jobvl, char* jobvr, lapack_int* n, float* a, lapack_int* lda, float* wr, float* wi, float* vl,
+                  lapack_int* ldvl, float* vr, lapack_int* ldvr, float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dgeev(char* jobvl, char* jobvr, lapack_int* n, double* a, lapack_int* lda, double* wr, double* wi,
+                  double* vl, lapack_int* ldvl, double* vr, lapack_int* ldvr, double* work, lapack_int* lwork,
+                  lapack_int* info);
+void LAPACK_cgeev(char* jobvl, char* jobvr, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                  lapack_complex_float* w, lapack_complex_float* vl, lapack_int* ldvl, lapack_complex_float* vr,
+                  lapack_int* ldvr, lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* info);
+void LAPACK_zgeev(char* jobvl, char* jobvr, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                  lapack_complex_double* w, lapack_complex_double* vl, lapack_int* ldvl, lapack_complex_double* vr,
+                  lapack_int* ldvr, lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* info);
+void LAPACK_sgeevx(char* balanc, char* jobvl, char* jobvr, char* sense, lapack_int* n, float* a, lapack_int* lda,
+                   float* wr, float* wi, float* vl, lapack_int* ldvl, float* vr, lapack_int* ldvr, lapack_int* ilo,
+                   lapack_int* ihi, float* scale, float* abnrm, float* rconde, float* rcondv, float* work,
+                   lapack_int* lwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_dgeevx(char* balanc, char* jobvl, char* jobvr, char* sense, lapack_int* n, double* a, lapack_int* lda,
+                   double* wr, double* wi, double* vl, lapack_int* ldvl, double* vr, lapack_int* ldvr, lapack_int* ilo,
+                   lapack_int* ihi, double* scale, double* abnrm, double* rconde, double* rcondv, double* work,
+                   lapack_int* lwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_cgeevx(char* balanc, char* jobvl, char* jobvr, char* sense, lapack_int* n, lapack_complex_float* a,
+                   lapack_int* lda, lapack_complex_float* w, lapack_complex_float* vl, lapack_int* ldvl,
+                   lapack_complex_float* vr, lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi, float* scale,
+                   float* abnrm, float* rconde, float* rcondv, lapack_complex_float* work, lapack_int* lwork,
+                   float* rwork, lapack_int* info);
+void LAPACK_zgeevx(char* balanc, char* jobvl, char* jobvr, char* sense, lapack_int* n, lapack_complex_double* a,
+                   lapack_int* lda, lapack_complex_double* w, lapack_complex_double* vl, lapack_int* ldvl,
+                   lapack_complex_double* vr, lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi, double* scale,
+                   double* abnrm, double* rconde, double* rcondv, lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int* info);
+void LAPACK_sgesvd(char* jobu, char* jobvt, lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* s, float* u,
+                   lapack_int* ldu, float* vt, lapack_int* ldvt, float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dgesvd(char* jobu, char* jobvt, lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* s,
+                   double* u, lapack_int* ldu, double* vt, lapack_int* ldvt, double* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_cgesvd(char* jobu, char* jobvt, lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                   float* s, lapack_complex_float* u, lapack_int* ldu, lapack_complex_float* vt, lapack_int* ldvt,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* info);
+void LAPACK_zgesvd(char* jobu, char* jobvt, lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                   double* s, lapack_complex_double* u, lapack_int* ldu, lapack_complex_double* vt, lapack_int* ldvt,
+                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* info);
+void LAPACK_sgesdd(char* jobz, lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* s, float* u,
+                   lapack_int* ldu, float* vt, lapack_int* ldvt, float* work, lapack_int* lwork, lapack_int* iwork,
+                   lapack_int* info);
+void LAPACK_dgesdd(char* jobz, lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* s, double* u,
+                   lapack_int* ldu, double* vt, lapack_int* ldvt, double* work, lapack_int* lwork, lapack_int* iwork,
+                   lapack_int* info);
+void LAPACK_cgesdd(char* jobz, lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, float* s,
+                   lapack_complex_float* u, lapack_int* ldu, lapack_complex_float* vt, lapack_int* ldvt,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_zgesdd(char* jobz, lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, double* s,
+                   lapack_complex_double* u, lapack_int* ldu, lapack_complex_double* vt, lapack_int* ldvt,
+                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_dgejsv(char* joba, char* jobu, char* jobv, char* jobr, char* jobt, char* jobp, lapack_int* m, lapack_int* n,
+                   double* a, lapack_int* lda, double* sva, double* u, lapack_int* ldu, double* v, lapack_int* ldv,
+                   double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_sgejsv(char* joba, char* jobu, char* jobv, char* jobr, char* jobt, char* jobp, lapack_int* m, lapack_int* n,
+                   float* a, lapack_int* lda, float* sva, float* u, lapack_int* ldu, float* v, lapack_int* ldv,
+                   float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_dgesvj(char* joba, char* jobu, char* jobv, lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                   double* sva, lapack_int* mv, double* v, lapack_int* ldv, double* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_sgesvj(char* joba, char* jobu, char* jobv, lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                   float* sva, lapack_int* mv, float* v, lapack_int* ldv, float* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_sggsvd(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k,
+                   lapack_int* l, float* a, lapack_int* lda, float* b, lapack_int* ldb, float* alpha, float* beta,
+                   float* u, lapack_int* ldu, float* v, lapack_int* ldv, float* q, lapack_int* ldq, float* work,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_dggsvd(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k,
+                   lapack_int* l, double* a, lapack_int* lda, double* b, lapack_int* ldb, double* alpha, double* beta,
+                   double* u, lapack_int* ldu, double* v, lapack_int* ldv, double* q, lapack_int* ldq, double* work,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_cggsvd(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k,
+                   lapack_int* l, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
+                   float* alpha, float* beta, lapack_complex_float* u, lapack_int* ldu, lapack_complex_float* v,
+                   lapack_int* ldv, lapack_complex_float* q, lapack_int* ldq, lapack_complex_float* work, float* rwork,
+                   lapack_int* iwork, lapack_int* info);
+void LAPACK_zggsvd(char* jobu, char* jobv, char* jobq, lapack_int* m, lapack_int* n, lapack_int* p, lapack_int* k,
+                   lapack_int* l, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
+                   double* alpha, double* beta, lapack_complex_double* u, lapack_int* ldu, lapack_complex_double* v,
+                   lapack_int* ldv, lapack_complex_double* q, lapack_int* ldq, lapack_complex_double* work,
+                   double* rwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_ssygv(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, float* a, lapack_int* lda, float* b,
+                  lapack_int* ldb, float* w, float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dsygv(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, double* a, lapack_int* lda, double* b,
+                  lapack_int* ldb, double* w, double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_chegv(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                  lapack_complex_float* b, lapack_int* ldb, float* w, lapack_complex_float* work, lapack_int* lwork,
+                  float* rwork, lapack_int* info);
+void LAPACK_zhegv(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                  lapack_complex_double* b, lapack_int* ldb, double* w, lapack_complex_double* work, lapack_int* lwork,
+                  double* rwork, lapack_int* info);
+void LAPACK_ssygvd(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, float* a, lapack_int* lda, float* b,
+                   lapack_int* ldb, float* w, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_dsygvd(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, double* a, lapack_int* lda, double* b,
+                   lapack_int* ldb, double* w, double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_chegvd(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb, float* w, lapack_complex_float* work, lapack_int* lwork,
+                   float* rwork, lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_zhegvd(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb, double* w, lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_ssygvx(lapack_int* itype, char* jobz, char* range, char* uplo, lapack_int* n, float* a, lapack_int* lda,
+                   float* b, lapack_int* ldb, float* vl, float* vu, lapack_int* il, lapack_int* iu, float* abstol,
+                   lapack_int* m, float* w, float* z, lapack_int* ldz, float* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_dsygvx(lapack_int* itype, char* jobz, char* range, char* uplo, lapack_int* n, double* a, lapack_int* lda,
+                   double* b, lapack_int* ldb, double* vl, double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                   lapack_int* m, double* w, double* z, lapack_int* ldz, double* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_chegvx(lapack_int* itype, char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_float* a,
+                   lapack_int* lda, lapack_complex_float* b, lapack_int* ldb, float* vl, float* vu, lapack_int* il,
+                   lapack_int* iu, float* abstol, lapack_int* m, float* w, lapack_complex_float* z, lapack_int* ldz,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* iwork, lapack_int* ifail,
+                   lapack_int* info);
+void LAPACK_zhegvx(lapack_int* itype, char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_double* a,
+                   lapack_int* lda, lapack_complex_double* b, lapack_int* ldb, double* vl, double* vu, lapack_int* il,
+                   lapack_int* iu, double* abstol, lapack_int* m, double* w, lapack_complex_double* z, lapack_int* ldz,
+                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* iwork, lapack_int* ifail,
+                   lapack_int* info);
+void LAPACK_sspgv(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, float* ap, float* bp, float* w, float* z,
+                  lapack_int* ldz, float* work, lapack_int* info);
+void LAPACK_dspgv(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, double* ap, double* bp, double* w,
+                  double* z, lapack_int* ldz, double* work, lapack_int* info);
+void LAPACK_chpgv(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, lapack_complex_float* ap,
+                  lapack_complex_float* bp, float* w, lapack_complex_float* z, lapack_int* ldz,
+                  lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zhpgv(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, lapack_complex_double* ap,
+                  lapack_complex_double* bp, double* w, lapack_complex_double* z, lapack_int* ldz,
+                  lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_sspgvd(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, float* ap, float* bp, float* w, float* z,
+                   lapack_int* ldz, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_dspgvd(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, double* ap, double* bp, double* w,
+                   double* z, lapack_int* ldz, double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_int* info);
+void LAPACK_chpgvd(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, lapack_complex_float* ap,
+                   lapack_complex_float* bp, float* w, lapack_complex_float* z, lapack_int* ldz,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* lrwork, lapack_int* iwork,
+                   lapack_int* liwork, lapack_int* info);
+void LAPACK_zhpgvd(lapack_int* itype, char* jobz, char* uplo, lapack_int* n, lapack_complex_double* ap,
+                   lapack_complex_double* bp, double* w, lapack_complex_double* z, lapack_int* ldz,
+                   lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* lrwork, lapack_int* iwork,
+                   lapack_int* liwork, lapack_int* info);
+void LAPACK_sspgvx(lapack_int* itype, char* jobz, char* range, char* uplo, lapack_int* n, float* ap, float* bp,
+                   float* vl, float* vu, lapack_int* il, lapack_int* iu, float* abstol, lapack_int* m, float* w,
+                   float* z, lapack_int* ldz, float* work, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_dspgvx(lapack_int* itype, char* jobz, char* range, char* uplo, lapack_int* n, double* ap, double* bp,
+                   double* vl, double* vu, lapack_int* il, lapack_int* iu, double* abstol, lapack_int* m, double* w,
+                   double* z, lapack_int* ldz, double* work, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_chpgvx(lapack_int* itype, char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_float* ap,
+                   lapack_complex_float* bp, float* vl, float* vu, lapack_int* il, lapack_int* iu, float* abstol,
+                   lapack_int* m, float* w, lapack_complex_float* z, lapack_int* ldz, lapack_complex_float* work,
+                   float* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_zhpgvx(lapack_int* itype, char* jobz, char* range, char* uplo, lapack_int* n, lapack_complex_double* ap,
+                   lapack_complex_double* bp, double* vl, double* vu, lapack_int* il, lapack_int* iu, double* abstol,
+                   lapack_int* m, double* w, lapack_complex_double* z, lapack_int* ldz, lapack_complex_double* work,
+                   double* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_ssbgv(char* jobz, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, float* ab, lapack_int* ldab,
+                  float* bb, lapack_int* ldbb, float* w, float* z, lapack_int* ldz, float* work, lapack_int* info);
+void LAPACK_dsbgv(char* jobz, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, double* ab, lapack_int* ldab,
+                  double* bb, lapack_int* ldbb, double* w, double* z, lapack_int* ldz, double* work, lapack_int* info);
+void LAPACK_chbgv(char* jobz, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, lapack_complex_float* ab,
+                  lapack_int* ldab, lapack_complex_float* bb, lapack_int* ldbb, float* w, lapack_complex_float* z,
+                  lapack_int* ldz, lapack_complex_float* work, float* rwork, lapack_int* info);
+void LAPACK_zhbgv(char* jobz, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, lapack_complex_double* ab,
+                  lapack_int* ldab, lapack_complex_double* bb, lapack_int* ldbb, double* w, lapack_complex_double* z,
+                  lapack_int* ldz, lapack_complex_double* work, double* rwork, lapack_int* info);
+void LAPACK_ssbgvd(char* jobz, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, float* ab, lapack_int* ldab,
+                   float* bb, lapack_int* ldbb, float* w, float* z, lapack_int* ldz, float* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_dsbgvd(char* jobz, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, double* ab, lapack_int* ldab,
+                   double* bb, lapack_int* ldbb, double* w, double* z, lapack_int* ldz, double* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_chbgvd(char* jobz, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, lapack_complex_float* ab,
+                   lapack_int* ldab, lapack_complex_float* bb, lapack_int* ldbb, float* w, lapack_complex_float* z,
+                   lapack_int* ldz, lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* lrwork,
+                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_zhbgvd(char* jobz, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, lapack_complex_double* ab,
+                   lapack_int* ldab, lapack_complex_double* bb, lapack_int* ldbb, double* w, lapack_complex_double* z,
+                   lapack_int* ldz, lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* lrwork,
+                   lapack_int* iwork, lapack_int* liwork, lapack_int* info);
+void LAPACK_ssbgvx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, float* ab,
+                   lapack_int* ldab, float* bb, lapack_int* ldbb, float* q, lapack_int* ldq, float* vl, float* vu,
+                   lapack_int* il, lapack_int* iu, float* abstol, lapack_int* m, float* w, float* z, lapack_int* ldz,
+                   float* work, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_dsbgvx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb, double* ab,
+                   lapack_int* ldab, double* bb, lapack_int* ldbb, double* q, lapack_int* ldq, double* vl, double* vu,
+                   lapack_int* il, lapack_int* iu, double* abstol, lapack_int* m, double* w, double* z, lapack_int* ldz,
+                   double* work, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_chbgvx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb,
+                   lapack_complex_float* ab, lapack_int* ldab, lapack_complex_float* bb, lapack_int* ldbb,
+                   lapack_complex_float* q, lapack_int* ldq, float* vl, float* vu, lapack_int* il, lapack_int* iu,
+                   float* abstol, lapack_int* m, float* w, lapack_complex_float* z, lapack_int* ldz,
+                   lapack_complex_float* work, float* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_zhbgvx(char* jobz, char* range, char* uplo, lapack_int* n, lapack_int* ka, lapack_int* kb,
+                   lapack_complex_double* ab, lapack_int* ldab, lapack_complex_double* bb, lapack_int* ldbb,
+                   lapack_complex_double* q, lapack_int* ldq, double* vl, double* vu, lapack_int* il, lapack_int* iu,
+                   double* abstol, lapack_int* m, double* w, lapack_complex_double* z, lapack_int* ldz,
+                   lapack_complex_double* work, double* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int* info);
+void LAPACK_sgges(char* jobvsl, char* jobvsr, char* sort, LAPACK_S_SELECT3 selctg, lapack_int* n, float* a,
+                  lapack_int* lda, float* b, lapack_int* ldb, lapack_int* sdim, float* alphar, float* alphai,
+                  float* beta, float* vsl, lapack_int* ldvsl, float* vsr, lapack_int* ldvsr, float* work,
+                  lapack_int* lwork, lapack_logical* bwork, lapack_int* info);
+void LAPACK_dgges(char* jobvsl, char* jobvsr, char* sort, LAPACK_D_SELECT3 selctg, lapack_int* n, double* a,
+                  lapack_int* lda, double* b, lapack_int* ldb, lapack_int* sdim, double* alphar, double* alphai,
+                  double* beta, double* vsl, lapack_int* ldvsl, double* vsr, lapack_int* ldvsr, double* work,
+                  lapack_int* lwork, lapack_logical* bwork, lapack_int* info);
+void LAPACK_cgges(char* jobvsl, char* jobvsr, char* sort, LAPACK_C_SELECT2 selctg, lapack_int* n,
+                  lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b, lapack_int* ldb, lapack_int* sdim,
+                  lapack_complex_float* alpha, lapack_complex_float* beta, lapack_complex_float* vsl, lapack_int* ldvsl,
+                  lapack_complex_float* vsr, lapack_int* ldvsr, lapack_complex_float* work, lapack_int* lwork,
+                  float* rwork, lapack_logical* bwork, lapack_int* info);
+void LAPACK_zgges(char* jobvsl, char* jobvsr, char* sort, LAPACK_Z_SELECT2 selctg, lapack_int* n,
+                  lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
+                  lapack_int* sdim, lapack_complex_double* alpha, lapack_complex_double* beta,
+                  lapack_complex_double* vsl, lapack_int* ldvsl, lapack_complex_double* vsr, lapack_int* ldvsr,
+                  lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_logical* bwork,
+                  lapack_int* info);
+void LAPACK_sggesx(char* jobvsl, char* jobvsr, char* sort, LAPACK_S_SELECT3 selctg, char* sense, lapack_int* n,
+                   float* a, lapack_int* lda, float* b, lapack_int* ldb, lapack_int* sdim, float* alphar, float* alphai,
+                   float* beta, float* vsl, lapack_int* ldvsl, float* vsr, lapack_int* ldvsr, float* rconde,
+                   float* rcondv, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_logical* bwork, lapack_int* info);
+void LAPACK_dggesx(char* jobvsl, char* jobvsr, char* sort, LAPACK_D_SELECT3 selctg, char* sense, lapack_int* n,
+                   double* a, lapack_int* lda, double* b, lapack_int* ldb, lapack_int* sdim, double* alphar,
+                   double* alphai, double* beta, double* vsl, lapack_int* ldvsl, double* vsr, lapack_int* ldvsr,
+                   double* rconde, double* rcondv, double* work, lapack_int* lwork, lapack_int* iwork,
+                   lapack_int* liwork, lapack_logical* bwork, lapack_int* info);
+void LAPACK_cggesx(char* jobvsl, char* jobvsr, char* sort, LAPACK_C_SELECT2 selctg, char* sense, lapack_int* n,
+                   lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b, lapack_int* ldb, lapack_int* sdim,
+                   lapack_complex_float* alpha, lapack_complex_float* beta, lapack_complex_float* vsl,
+                   lapack_int* ldvsl, lapack_complex_float* vsr, lapack_int* ldvsr, float* rconde, float* rcondv,
+                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* iwork, lapack_int* liwork,
+                   lapack_logical* bwork, lapack_int* info);
+void LAPACK_zggesx(char* jobvsl, char* jobvsr, char* sort, LAPACK_Z_SELECT2 selctg, char* sense, lapack_int* n,
+                   lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
+                   lapack_int* sdim, lapack_complex_double* alpha, lapack_complex_double* beta,
+                   lapack_complex_double* vsl, lapack_int* ldvsl, lapack_complex_double* vsr, lapack_int* ldvsr,
+                   double* rconde, double* rcondv, lapack_complex_double* work, lapack_int* lwork, double* rwork,
+                   lapack_int* iwork, lapack_int* liwork, lapack_logical* bwork, lapack_int* info);
+void LAPACK_sggev(char* jobvl, char* jobvr, lapack_int* n, float* a, lapack_int* lda, float* b, lapack_int* ldb,
+                  float* alphar, float* alphai, float* beta, float* vl, lapack_int* ldvl, float* vr, lapack_int* ldvr,
+                  float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dggev(char* jobvl, char* jobvr, lapack_int* n, double* a, lapack_int* lda, double* b, lapack_int* ldb,
+                  double* alphar, double* alphai, double* beta, double* vl, lapack_int* ldvl, double* vr,
+                  lapack_int* ldvr, double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cggev(char* jobvl, char* jobvr, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                  lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* alpha, lapack_complex_float* beta,
+                  lapack_complex_float* vl, lapack_int* ldvl, lapack_complex_float* vr, lapack_int* ldvr,
+                  lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int* info);
+void LAPACK_zggev(char* jobvl, char* jobvr, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                  lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* alpha, lapack_complex_double* beta,
+                  lapack_complex_double* vl, lapack_int* ldvl, lapack_complex_double* vr, lapack_int* ldvr,
+                  lapack_complex_double* work, lapack_int* lwork, double* rwork, lapack_int* info);
+void LAPACK_sggevx(char* balanc, char* jobvl, char* jobvr, char* sense, lapack_int* n, float* a, lapack_int* lda,
+                   float* b, lapack_int* ldb, float* alphar, float* alphai, float* beta, float* vl, lapack_int* ldvl,
+                   float* vr, lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi, float* lscale, float* rscale,
+                   float* abnrm, float* bbnrm, float* rconde, float* rcondv, float* work, lapack_int* lwork,
+                   lapack_int* iwork, lapack_logical* bwork, lapack_int* info);
+void LAPACK_dggevx(char* balanc, char* jobvl, char* jobvr, char* sense, lapack_int* n, double* a, lapack_int* lda,
+                   double* b, lapack_int* ldb, double* alphar, double* alphai, double* beta, double* vl,
+                   lapack_int* ldvl, double* vr, lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi, double* lscale,
+                   double* rscale, double* abnrm, double* bbnrm, double* rconde, double* rcondv, double* work,
+                   lapack_int* lwork, lapack_int* iwork, lapack_logical* bwork, lapack_int* info);
+void LAPACK_cggevx(char* balanc, char* jobvl, char* jobvr, char* sense, lapack_int* n, lapack_complex_float* a,
+                   lapack_int* lda, lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* alpha,
+                   lapack_complex_float* beta, lapack_complex_float* vl, lapack_int* ldvl, lapack_complex_float* vr,
+                   lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi, float* lscale, float* rscale, float* abnrm,
+                   float* bbnrm, float* rconde, float* rcondv, lapack_complex_float* work, lapack_int* lwork,
+                   float* rwork, lapack_int* iwork, lapack_logical* bwork, lapack_int* info);
+void LAPACK_zggevx(char* balanc, char* jobvl, char* jobvr, char* sense, lapack_int* n, lapack_complex_double* a,
+                   lapack_int* lda, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* alpha,
+                   lapack_complex_double* beta, lapack_complex_double* vl, lapack_int* ldvl, lapack_complex_double* vr,
+                   lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi, double* lscale, double* rscale, double* abnrm,
+                   double* bbnrm, double* rconde, double* rcondv, lapack_complex_double* work, lapack_int* lwork,
+                   double* rwork, lapack_int* iwork, lapack_logical* bwork, lapack_int* info);
+void LAPACK_dsfrk(char* transr, char* uplo, char* trans, lapack_int* n, lapack_int* k, double* alpha, const double* a,
+                  lapack_int* lda, double* beta, double* c);
+void LAPACK_ssfrk(char* transr, char* uplo, char* trans, lapack_int* n, lapack_int* k, float* alpha, const float* a,
+                  lapack_int* lda, float* beta, float* c);
+void LAPACK_zhfrk(char* transr, char* uplo, char* trans, lapack_int* n, lapack_int* k, double* alpha,
+                  const lapack_complex_double* a, lapack_int* lda, double* beta, lapack_complex_double* c);
+void LAPACK_chfrk(char* transr, char* uplo, char* trans, lapack_int* n, lapack_int* k, float* alpha,
+                  const lapack_complex_float* a, lapack_int* lda, float* beta, lapack_complex_float* c);
+void LAPACK_dtfsm(char* transr, char* side, char* uplo, char* trans, char* diag, lapack_int* m, lapack_int* n,
+                  double* alpha, const double* a, double* b, lapack_int* ldb);
+void LAPACK_stfsm(char* transr, char* side, char* uplo, char* trans, char* diag, lapack_int* m, lapack_int* n,
+                  float* alpha, const float* a, float* b, lapack_int* ldb);
+void LAPACK_ztfsm(char* transr, char* side, char* uplo, char* trans, char* diag, lapack_int* m, lapack_int* n,
+                  lapack_complex_double* alpha, const lapack_complex_double* a, lapack_complex_double* b,
+                  lapack_int* ldb);
+void LAPACK_ctfsm(char* transr, char* side, char* uplo, char* trans, char* diag, lapack_int* m, lapack_int* n,
+                  lapack_complex_float* alpha, const lapack_complex_float* a, lapack_complex_float* b, lapack_int* ldb);
+void LAPACK_dtfttp(char* transr, char* uplo, lapack_int* n, const double* arf, double* ap, lapack_int* info);
+void LAPACK_stfttp(char* transr, char* uplo, lapack_int* n, const float* arf, float* ap, lapack_int* info);
+void LAPACK_ztfttp(char* transr, char* uplo, lapack_int* n, const lapack_complex_double* arf, lapack_complex_double* ap,
+                   lapack_int* info);
+void LAPACK_ctfttp(char* transr, char* uplo, lapack_int* n, const lapack_complex_float* arf, lapack_complex_float* ap,
+                   lapack_int* info);
+void LAPACK_dtfttr(char* transr, char* uplo, lapack_int* n, const double* arf, double* a, lapack_int* lda,
+                   lapack_int* info);
+void LAPACK_stfttr(char* transr, char* uplo, lapack_int* n, const float* arf, float* a, lapack_int* lda,
+                   lapack_int* info);
+void LAPACK_ztfttr(char* transr, char* uplo, lapack_int* n, const lapack_complex_double* arf, lapack_complex_double* a,
+                   lapack_int* lda, lapack_int* info);
+void LAPACK_ctfttr(char* transr, char* uplo, lapack_int* n, const lapack_complex_float* arf, lapack_complex_float* a,
+                   lapack_int* lda, lapack_int* info);
+void LAPACK_dtpttf(char* transr, char* uplo, lapack_int* n, const double* ap, double* arf, lapack_int* info);
+void LAPACK_stpttf(char* transr, char* uplo, lapack_int* n, const float* ap, float* arf, lapack_int* info);
+void LAPACK_ztpttf(char* transr, char* uplo, lapack_int* n, const lapack_complex_double* ap, lapack_complex_double* arf,
+                   lapack_int* info);
+void LAPACK_ctpttf(char* transr, char* uplo, lapack_int* n, const lapack_complex_float* ap, lapack_complex_float* arf,
+                   lapack_int* info);
+void LAPACK_dtpttr(char* uplo, lapack_int* n, const double* ap, double* a, lapack_int* lda, lapack_int* info);
+void LAPACK_stpttr(char* uplo, lapack_int* n, const float* ap, float* a, lapack_int* lda, lapack_int* info);
+void LAPACK_ztpttr(char* uplo, lapack_int* n, const lapack_complex_double* ap, lapack_complex_double* a,
+                   lapack_int* lda, lapack_int* info);
+void LAPACK_ctpttr(char* uplo, lapack_int* n, const lapack_complex_float* ap, lapack_complex_float* a, lapack_int* lda,
+                   lapack_int* info);
+void LAPACK_dtrttf(char* transr, char* uplo, lapack_int* n, const double* a, lapack_int* lda, double* arf,
+                   lapack_int* info);
+void LAPACK_strttf(char* transr, char* uplo, lapack_int* n, const float* a, lapack_int* lda, float* arf,
+                   lapack_int* info);
+void LAPACK_ztrttf(char* transr, char* uplo, lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* arf, lapack_int* info);
+void LAPACK_ctrttf(char* transr, char* uplo, lapack_int* n, const lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* arf, lapack_int* info);
+void LAPACK_dtrttp(char* uplo, lapack_int* n, const double* a, lapack_int* lda, double* ap, lapack_int* info);
+void LAPACK_strttp(char* uplo, lapack_int* n, const float* a, lapack_int* lda, float* ap, lapack_int* info);
+void LAPACK_ztrttp(char* uplo, lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* ap, lapack_int* info);
+void LAPACK_ctrttp(char* uplo, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, lapack_complex_float* ap,
+                   lapack_int* info);
+void LAPACK_sgeqrfp(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* tau, float* work, lapack_int* lwork,
+                    lapack_int* info);
+void LAPACK_dgeqrfp(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* tau, double* work,
+                    lapack_int* lwork, lapack_int* info);
+void LAPACK_cgeqrfp(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* tau,
+                    lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zgeqrfp(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* tau,
+                    lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_clacgv(lapack_int* n, lapack_complex_float* x, lapack_int* incx);
+void LAPACK_zlacgv(lapack_int* n, lapack_complex_double* x, lapack_int* incx);
+void LAPACK_slarnv(lapack_int* idist, lapack_int* iseed, lapack_int* n, float* x);
+void LAPACK_dlarnv(lapack_int* idist, lapack_int* iseed, lapack_int* n, double* x);
+void LAPACK_clarnv(lapack_int* idist, lapack_int* iseed, lapack_int* n, lapack_complex_float* x);
+void LAPACK_zlarnv(lapack_int* idist, lapack_int* iseed, lapack_int* n, lapack_complex_double* x);
+void LAPACK_sgeqr2(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* tau, float* work, lapack_int* info);
+void LAPACK_dgeqr2(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* tau, double* work,
+                   lapack_int* info);
+void LAPACK_cgeqr2(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* tau,
+                   lapack_complex_float* work, lapack_int* info);
+void LAPACK_zgeqr2(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* tau,
+                   lapack_complex_double* work, lapack_int* info);
+void LAPACK_slacpy(char* uplo, lapack_int* m, lapack_int* n, const float* a, lapack_int* lda, float* b,
+                   lapack_int* ldb);
+void LAPACK_dlacpy(char* uplo, lapack_int* m, lapack_int* n, const double* a, lapack_int* lda, double* b,
+                   lapack_int* ldb);
+void LAPACK_clacpy(char* uplo, lapack_int* m, lapack_int* n, const lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* b, lapack_int* ldb);
+void LAPACK_zlacpy(char* uplo, lapack_int* m, lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* b, lapack_int* ldb);
+void LAPACK_sgetf2(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, lapack_int* ipiv, lapack_int* info);
+void LAPACK_dgetf2(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, lapack_int* ipiv, lapack_int* info);
+void LAPACK_cgetf2(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_int* info);
+void LAPACK_zgetf2(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
+                   lapack_int* info);
+void LAPACK_slaswp(lapack_int* n, float* a, lapack_int* lda, lapack_int* k1, lapack_int* k2, const lapack_int* ipiv,
+                   lapack_int* incx);
+void LAPACK_dlaswp(lapack_int* n, double* a, lapack_int* lda, lapack_int* k1, lapack_int* k2, const lapack_int* ipiv,
+                   lapack_int* incx);
+void LAPACK_claswp(lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* k1, lapack_int* k2,
+                   const lapack_int* ipiv, lapack_int* incx);
+void LAPACK_zlaswp(lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* k1, lapack_int* k2,
+                   const lapack_int* ipiv, lapack_int* incx);
+float LAPACK_slange(char* norm, lapack_int* m, lapack_int* n, const float* a, lapack_int* lda, float* work);
+double LAPACK_dlange(char* norm, lapack_int* m, lapack_int* n, const double* a, lapack_int* lda, double* work);
+float LAPACK_clange(char* norm, lapack_int* m, lapack_int* n, const lapack_complex_float* a, lapack_int* lda,
+                    float* work);
+double LAPACK_zlange(char* norm, lapack_int* m, lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
+                     double* work);
+float LAPACK_clanhe(char* norm, char* uplo, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* work);
+double LAPACK_zlanhe(char* norm, char* uplo, lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
+                     double* work);
+float LAPACK_slansy(char* norm, char* uplo, lapack_int* n, const float* a, lapack_int* lda, float* work);
+double LAPACK_dlansy(char* norm, char* uplo, lapack_int* n, const double* a, lapack_int* lda, double* work);
+float LAPACK_clansy(char* norm, char* uplo, lapack_int* n, const lapack_complex_float* a, lapack_int* lda, float* work);
+double LAPACK_zlansy(char* norm, char* uplo, lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
+                     double* work);
+float LAPACK_slantr(char* norm, char* uplo, char* diag, lapack_int* m, lapack_int* n, const float* a, lapack_int* lda,
+                    float* work);
+double LAPACK_dlantr(char* norm, char* uplo, char* diag, lapack_int* m, lapack_int* n, const double* a, lapack_int* lda,
+                     double* work);
+float LAPACK_clantr(char* norm, char* uplo, char* diag, lapack_int* m, lapack_int* n, const lapack_complex_float* a,
+                    lapack_int* lda, float* work);
+double LAPACK_zlantr(char* norm, char* uplo, char* diag, lapack_int* m, lapack_int* n, const lapack_complex_double* a,
+                     lapack_int* lda, double* work);
+float LAPACK_slamch(char* cmach);
+double LAPACK_dlamch(char* cmach);
+void LAPACK_sgelq2(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* tau, float* work, lapack_int* info);
+void LAPACK_dgelq2(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* tau, double* work,
+                   lapack_int* info);
+void LAPACK_cgelq2(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* tau,
+                   lapack_complex_float* work, lapack_int* info);
+void LAPACK_zgelq2(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* tau,
+                   lapack_complex_double* work, lapack_int* info);
+void LAPACK_slarfb(char* side, char* trans, char* direct, char* storev, lapack_int* m, lapack_int* n, lapack_int* k,
+                   const float* v, lapack_int* ldv, const float* t, lapack_int* ldt, float* c, lapack_int* ldc,
+                   float* work, lapack_int* ldwork);
+void LAPACK_dlarfb(char* side, char* trans, char* direct, char* storev, lapack_int* m, lapack_int* n, lapack_int* k,
+                   const double* v, lapack_int* ldv, const double* t, lapack_int* ldt, double* c, lapack_int* ldc,
+                   double* work, lapack_int* ldwork);
+void LAPACK_clarfb(char* side, char* trans, char* direct, char* storev, lapack_int* m, lapack_int* n, lapack_int* k,
+                   const lapack_complex_float* v, lapack_int* ldv, const lapack_complex_float* t, lapack_int* ldt,
+                   lapack_complex_float* c, lapack_int* ldc, lapack_complex_float* work, lapack_int* ldwork);
+void LAPACK_zlarfb(char* side, char* trans, char* direct, char* storev, lapack_int* m, lapack_int* n, lapack_int* k,
+                   const lapack_complex_double* v, lapack_int* ldv, const lapack_complex_double* t, lapack_int* ldt,
+                   lapack_complex_double* c, lapack_int* ldc, lapack_complex_double* work, lapack_int* ldwork);
+void LAPACK_slarfg(lapack_int* n, float* alpha, float* x, lapack_int* incx, float* tau);
+void LAPACK_dlarfg(lapack_int* n, double* alpha, double* x, lapack_int* incx, double* tau);
+void LAPACK_clarfg(lapack_int* n, lapack_complex_float* alpha, lapack_complex_float* x, lapack_int* incx,
+                   lapack_complex_float* tau);
+void LAPACK_zlarfg(lapack_int* n, lapack_complex_double* alpha, lapack_complex_double* x, lapack_int* incx,
+                   lapack_complex_double* tau);
+void LAPACK_slarft(char* direct, char* storev, lapack_int* n, lapack_int* k, const float* v, lapack_int* ldv,
+                   const float* tau, float* t, lapack_int* ldt);
+void LAPACK_dlarft(char* direct, char* storev, lapack_int* n, lapack_int* k, const double* v, lapack_int* ldv,
+                   const double* tau, double* t, lapack_int* ldt);
+void LAPACK_clarft(char* direct, char* storev, lapack_int* n, lapack_int* k, const lapack_complex_float* v,
+                   lapack_int* ldv, const lapack_complex_float* tau, lapack_complex_float* t, lapack_int* ldt);
+void LAPACK_zlarft(char* direct, char* storev, lapack_int* n, lapack_int* k, const lapack_complex_double* v,
+                   lapack_int* ldv, const lapack_complex_double* tau, lapack_complex_double* t, lapack_int* ldt);
+void LAPACK_slarfx(char* side, lapack_int* m, lapack_int* n, const float* v, float* tau, float* c, lapack_int* ldc,
+                   float* work);
+void LAPACK_dlarfx(char* side, lapack_int* m, lapack_int* n, const double* v, double* tau, double* c, lapack_int* ldc,
+                   double* work);
+void LAPACK_clarfx(char* side, lapack_int* m, lapack_int* n, const lapack_complex_float* v, lapack_complex_float* tau,
+                   lapack_complex_float* c, lapack_int* ldc, lapack_complex_float* work);
+void LAPACK_zlarfx(char* side, lapack_int* m, lapack_int* n, const lapack_complex_double* v, lapack_complex_double* tau,
+                   lapack_complex_double* c, lapack_int* ldc, lapack_complex_double* work);
+void LAPACK_slatms(lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed, char* sym, float* d, lapack_int* mode,
+                   float* cond, float* dmax, lapack_int* kl, lapack_int* ku, char* pack, float* a, lapack_int* lda,
+                   float* work, lapack_int* info);
+void LAPACK_dlatms(lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed, char* sym, double* d, lapack_int* mode,
+                   double* cond, double* dmax, lapack_int* kl, lapack_int* ku, char* pack, double* a, lapack_int* lda,
+                   double* work, lapack_int* info);
+void LAPACK_clatms(lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed, char* sym, float* d, lapack_int* mode,
+                   float* cond, float* dmax, lapack_int* kl, lapack_int* ku, char* pack, lapack_complex_float* a,
+                   lapack_int* lda, lapack_complex_float* work, lapack_int* info);
+void LAPACK_zlatms(lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed, char* sym, double* d, lapack_int* mode,
+                   double* cond, double* dmax, lapack_int* kl, lapack_int* ku, char* pack, lapack_complex_double* a,
+                   lapack_int* lda, lapack_complex_double* work, lapack_int* info);
+void LAPACK_slag2d(lapack_int* m, lapack_int* n, const float* sa, lapack_int* ldsa, double* a, lapack_int* lda,
+                   lapack_int* info);
+void LAPACK_dlag2s(lapack_int* m, lapack_int* n, const double* a, lapack_int* lda, float* sa, lapack_int* ldsa,
+                   lapack_int* info);
+void LAPACK_clag2z(lapack_int* m, lapack_int* n, const lapack_complex_float* sa, lapack_int* ldsa,
+                   lapack_complex_double* a, lapack_int* lda, lapack_int* info);
+void LAPACK_zlag2c(lapack_int* m, lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_float* sa, lapack_int* ldsa, lapack_int* info);
+void LAPACK_slauum(char* uplo, lapack_int* n, float* a, lapack_int* lda, lapack_int* info);
+void LAPACK_dlauum(char* uplo, lapack_int* n, double* a, lapack_int* lda, lapack_int* info);
+void LAPACK_clauum(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_int* info);
+void LAPACK_zlauum(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_int* info);
+void LAPACK_slagge(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const float* d, float* a,
+                   lapack_int* lda, lapack_int* iseed, float* work, lapack_int* info);
+void LAPACK_dlagge(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const double* d, double* a,
+                   lapack_int* lda, lapack_int* iseed, double* work, lapack_int* info);
+void LAPACK_clagge(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const float* d,
+                   lapack_complex_float* a, lapack_int* lda, lapack_int* iseed, lapack_complex_float* work,
+                   lapack_int* info);
+void LAPACK_zlagge(lapack_int* m, lapack_int* n, lapack_int* kl, lapack_int* ku, const double* d,
+                   lapack_complex_double* a, lapack_int* lda, lapack_int* iseed, lapack_complex_double* work,
+                   lapack_int* info);
+void LAPACK_slaset(char* uplo, lapack_int* m, lapack_int* n, float* alpha, float* beta, float* a, lapack_int* lda);
+void LAPACK_dlaset(char* uplo, lapack_int* m, lapack_int* n, double* alpha, double* beta, double* a, lapack_int* lda);
+void LAPACK_claset(char* uplo, lapack_int* m, lapack_int* n, lapack_complex_float* alpha, lapack_complex_float* beta,
+                   lapack_complex_float* a, lapack_int* lda);
+void LAPACK_zlaset(char* uplo, lapack_int* m, lapack_int* n, lapack_complex_double* alpha, lapack_complex_double* beta,
+                   lapack_complex_double* a, lapack_int* lda);
+void LAPACK_slasrt(char* id, lapack_int* n, float* d, lapack_int* info);
+void LAPACK_dlasrt(char* id, lapack_int* n, double* d, lapack_int* info);
+void LAPACK_claghe(lapack_int* n, lapack_int* k, const float* d, lapack_complex_float* a, lapack_int* lda,
+                   lapack_int* iseed, lapack_complex_float* work, lapack_int* info);
+void LAPACK_zlaghe(lapack_int* n, lapack_int* k, const double* d, lapack_complex_double* a, lapack_int* lda,
+                   lapack_int* iseed, lapack_complex_double* work, lapack_int* info);
+void LAPACK_slagsy(lapack_int* n, lapack_int* k, const float* d, float* a, lapack_int* lda, lapack_int* iseed,
+                   float* work, lapack_int* info);
+void LAPACK_dlagsy(lapack_int* n, lapack_int* k, const double* d, double* a, lapack_int* lda, lapack_int* iseed,
+                   double* work, lapack_int* info);
+void LAPACK_clagsy(lapack_int* n, lapack_int* k, const float* d, lapack_complex_float* a, lapack_int* lda,
+                   lapack_int* iseed, lapack_complex_float* work, lapack_int* info);
+void LAPACK_zlagsy(lapack_int* n, lapack_int* k, const double* d, lapack_complex_double* a, lapack_int* lda,
+                   lapack_int* iseed, lapack_complex_double* work, lapack_int* info);
+void LAPACK_slapmr(lapack_logical* forwrd, lapack_int* m, lapack_int* n, float* x, lapack_int* ldx, lapack_int* k);
+void LAPACK_dlapmr(lapack_logical* forwrd, lapack_int* m, lapack_int* n, double* x, lapack_int* ldx, lapack_int* k);
+void LAPACK_clapmr(lapack_logical* forwrd, lapack_int* m, lapack_int* n, lapack_complex_float* x, lapack_int* ldx,
+                   lapack_int* k);
+void LAPACK_zlapmr(lapack_logical* forwrd, lapack_int* m, lapack_int* n, lapack_complex_double* x, lapack_int* ldx,
+                   lapack_int* k);
+float LAPACK_slapy2(float* x, float* y);
+double LAPACK_dlapy2(double* x, double* y);
+float LAPACK_slapy3(float* x, float* y, float* z);
+double LAPACK_dlapy3(double* x, double* y, double* z);
+void LAPACK_slartgp(float* f, float* g, float* cs, float* sn, float* r);
+void LAPACK_dlartgp(double* f, double* g, double* cs, double* sn, double* r);
+void LAPACK_slartgs(float* x, float* y, float* sigma, float* cs, float* sn);
+void LAPACK_dlartgs(double* x, double* y, double* sigma, double* cs, double* sn);
+// LAPACK 3.3.0
+void LAPACK_cbbcsd(char* jobu1, char* jobu2, char* jobv1t, char* jobv2t, char* trans, lapack_int* m, lapack_int* p,
+                   lapack_int* q, float* theta, float* phi, lapack_complex_float* u1, lapack_int* ldu1,
+                   lapack_complex_float* u2, lapack_int* ldu2, lapack_complex_float* v1t, lapack_int* ldv1t,
+                   lapack_complex_float* v2t, lapack_int* ldv2t, float* b11d, float* b11e, float* b12d, float* b12e,
+                   float* b21d, float* b21e, float* b22d, float* b22e, float* rwork, lapack_int* lrwork,
+                   lapack_int* info);
+void LAPACK_cheswapr(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* i1, lapack_int* i2);
+void LAPACK_chetri2(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, const lapack_int* ipiv,
+                    lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_chetri2x(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, const lapack_int* ipiv,
+                     lapack_complex_float* work, lapack_int* nb, lapack_int* info);
+void LAPACK_chetrs2(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* work,
+                    lapack_int* info);
+void LAPACK_csyconv(char* uplo, char* way, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_float* work, lapack_int* info);
+void LAPACK_csyswapr(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* i1, lapack_int* i2);
+void LAPACK_csytri2(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, const lapack_int* ipiv,
+                    lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_csytri2x(char* uplo, lapack_int* n, lapack_complex_float* a, lapack_int* lda, const lapack_int* ipiv,
+                     lapack_complex_float* work, lapack_int* nb, lapack_int* info);
+void LAPACK_csytrs2(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_float* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb, lapack_complex_float* work,
+                    lapack_int* info);
+void LAPACK_cunbdb(char* trans, char* signs, lapack_int* m, lapack_int* p, lapack_int* q, lapack_complex_float* x11,
+                   lapack_int* ldx11, lapack_complex_float* x12, lapack_int* ldx12, lapack_complex_float* x21,
+                   lapack_int* ldx21, lapack_complex_float* x22, lapack_int* ldx22, float* theta, float* phi,
+                   lapack_complex_float* taup1, lapack_complex_float* taup2, lapack_complex_float* tauq1,
+                   lapack_complex_float* tauq2, lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_cuncsd(char* jobu1, char* jobu2, char* jobv1t, char* jobv2t, char* trans, char* signs, lapack_int* m,
+                   lapack_int* p, lapack_int* q, lapack_complex_float* x11, lapack_int* ldx11,
+                   lapack_complex_float* x12, lapack_int* ldx12, lapack_complex_float* x21, lapack_int* ldx21,
+                   lapack_complex_float* x22, lapack_int* ldx22, float* theta, lapack_complex_float* u1,
+                   lapack_int* ldu1, lapack_complex_float* u2, lapack_int* ldu2, lapack_complex_float* v1t,
+                   lapack_int* ldv1t, lapack_complex_float* v2t, lapack_int* ldv2t, lapack_complex_float* work,
+                   lapack_int* lwork, float* rwork, lapack_int* lrwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_dbbcsd(char* jobu1, char* jobu2, char* jobv1t, char* jobv2t, char* trans, lapack_int* m, lapack_int* p,
+                   lapack_int* q, double* theta, double* phi, double* u1, lapack_int* ldu1, double* u2,
+                   lapack_int* ldu2, double* v1t, lapack_int* ldv1t, double* v2t, lapack_int* ldv2t, double* b11d,
+                   double* b11e, double* b12d, double* b12e, double* b21d, double* b21e, double* b22d, double* b22e,
+                   double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dorbdb(char* trans, char* signs, lapack_int* m, lapack_int* p, lapack_int* q, double* x11,
+                   lapack_int* ldx11, double* x12, lapack_int* ldx12, double* x21, lapack_int* ldx21, double* x22,
+                   lapack_int* ldx22, double* theta, double* phi, double* taup1, double* taup2, double* tauq1,
+                   double* tauq2, double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dorcsd(char* jobu1, char* jobu2, char* jobv1t, char* jobv2t, char* trans, char* signs, lapack_int* m,
+                   lapack_int* p, lapack_int* q, double* x11, lapack_int* ldx11, double* x12, lapack_int* ldx12,
+                   double* x21, lapack_int* ldx21, double* x22, lapack_int* ldx22, double* theta, double* u1,
+                   lapack_int* ldu1, double* u2, lapack_int* ldu2, double* v1t, lapack_int* ldv1t, double* v2t,
+                   lapack_int* ldv2t, double* work, lapack_int* lwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_dsyconv(char* uplo, char* way, lapack_int* n, double* a, lapack_int* lda, const lapack_int* ipiv,
+                    double* work, lapack_int* info);
+void LAPACK_dsyswapr(char* uplo, lapack_int* n, double* a, lapack_int* i1, lapack_int* i2);
+void LAPACK_dsytri2(char* uplo, lapack_int* n, double* a, lapack_int* lda, const lapack_int* ipiv,
+                    lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_dsytri2x(char* uplo, lapack_int* n, double* a, lapack_int* lda, const lapack_int* ipiv, double* work,
+                     lapack_int* nb, lapack_int* info);
+void LAPACK_dsytrs2(char* uplo, lapack_int* n, lapack_int* nrhs, const double* a, lapack_int* lda,
+                    const lapack_int* ipiv, double* b, lapack_int* ldb, double* work, lapack_int* info);
+void LAPACK_sbbcsd(char* jobu1, char* jobu2, char* jobv1t, char* jobv2t, char* trans, lapack_int* m, lapack_int* p,
+                   lapack_int* q, float* theta, float* phi, float* u1, lapack_int* ldu1, float* u2, lapack_int* ldu2,
+                   float* v1t, lapack_int* ldv1t, float* v2t, lapack_int* ldv2t, float* b11d, float* b11e, float* b12d,
+                   float* b12e, float* b21d, float* b21e, float* b22d, float* b22e, float* work, lapack_int* lwork,
+                   lapack_int* info);
+void LAPACK_sorbdb(char* trans, char* signs, lapack_int* m, lapack_int* p, lapack_int* q, float* x11, lapack_int* ldx11,
+                   float* x12, lapack_int* ldx12, float* x21, lapack_int* ldx21, float* x22, lapack_int* ldx22,
+                   float* theta, float* phi, float* taup1, float* taup2, float* tauq1, float* tauq2, float* work,
+                   lapack_int* lwork, lapack_int* info);
+void LAPACK_sorcsd(char* jobu1, char* jobu2, char* jobv1t, char* jobv2t, char* trans, char* signs, lapack_int* m,
+                   lapack_int* p, lapack_int* q, float* x11, lapack_int* ldx11, float* x12, lapack_int* ldx12,
+                   float* x21, lapack_int* ldx21, float* x22, lapack_int* ldx22, float* theta, float* u1,
+                   lapack_int* ldu1, float* u2, lapack_int* ldu2, float* v1t, lapack_int* ldv1t, float* v2t,
+                   lapack_int* ldv2t, float* work, lapack_int* lwork, lapack_int* iwork, lapack_int* info);
+void LAPACK_ssyconv(char* uplo, char* way, lapack_int* n, float* a, lapack_int* lda, const lapack_int* ipiv,
+                    float* work, lapack_int* info);
+void LAPACK_ssyswapr(char* uplo, lapack_int* n, float* a, lapack_int* i1, lapack_int* i2);
+void LAPACK_ssytri2(char* uplo, lapack_int* n, float* a, lapack_int* lda, const lapack_int* ipiv,
+                    lapack_complex_float* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_ssytri2x(char* uplo, lapack_int* n, float* a, lapack_int* lda, const lapack_int* ipiv, float* work,
+                     lapack_int* nb, lapack_int* info);
+void LAPACK_ssytrs2(char* uplo, lapack_int* n, lapack_int* nrhs, const float* a, lapack_int* lda,
+                    const lapack_int* ipiv, float* b, lapack_int* ldb, float* work, lapack_int* info);
+void LAPACK_zbbcsd(char* jobu1, char* jobu2, char* jobv1t, char* jobv2t, char* trans, lapack_int* m, lapack_int* p,
+                   lapack_int* q, double* theta, double* phi, lapack_complex_double* u1, lapack_int* ldu1,
+                   lapack_complex_double* u2, lapack_int* ldu2, lapack_complex_double* v1t, lapack_int* ldv1t,
+                   lapack_complex_double* v2t, lapack_int* ldv2t, double* b11d, double* b11e, double* b12d,
+                   double* b12e, double* b21d, double* b21e, double* b22d, double* b22e, double* rwork,
+                   lapack_int* lrwork, lapack_int* info);
+void LAPACK_zheswapr(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* i1, lapack_int* i2);
+void LAPACK_zhetri2(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, const lapack_int* ipiv,
+                    lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zhetri2x(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, const lapack_int* ipiv,
+                     lapack_complex_double* work, lapack_int* nb, lapack_int* info);
+void LAPACK_zhetrs2(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* work,
+                    lapack_int* info);
+void LAPACK_zsyconv(char* uplo, char* way, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_double* work, lapack_int* info);
+void LAPACK_zsyswapr(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* i1, lapack_int* i2);
+void LAPACK_zsytri2(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, const lapack_int* ipiv,
+                    lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zsytri2x(char* uplo, lapack_int* n, lapack_complex_double* a, lapack_int* lda, const lapack_int* ipiv,
+                     lapack_complex_double* work, lapack_int* nb, lapack_int* info);
+void LAPACK_zsytrs2(char* uplo, lapack_int* n, lapack_int* nrhs, const lapack_complex_double* a, lapack_int* lda,
+                    const lapack_int* ipiv, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* work,
+                    lapack_int* info);
+void LAPACK_zunbdb(char* trans, char* signs, lapack_int* m, lapack_int* p, lapack_int* q, lapack_complex_double* x11,
+                   lapack_int* ldx11, lapack_complex_double* x12, lapack_int* ldx12, lapack_complex_double* x21,
+                   lapack_int* ldx21, lapack_complex_double* x22, lapack_int* ldx22, double* theta, double* phi,
+                   lapack_complex_double* taup1, lapack_complex_double* taup2, lapack_complex_double* tauq1,
+                   lapack_complex_double* tauq2, lapack_complex_double* work, lapack_int* lwork, lapack_int* info);
+void LAPACK_zuncsd(char* jobu1, char* jobu2, char* jobv1t, char* jobv2t, char* trans, char* signs, lapack_int* m,
+                   lapack_int* p, lapack_int* q, lapack_complex_double* x11, lapack_int* ldx11,
+                   lapack_complex_double* x12, lapack_int* ldx12, lapack_complex_double* x21, lapack_int* ldx21,
+                   lapack_complex_double* x22, lapack_int* ldx22, double* theta, lapack_complex_double* u1,
+                   lapack_int* ldu1, lapack_complex_double* u2, lapack_int* ldu2, lapack_complex_double* v1t,
+                   lapack_int* ldv1t, lapack_complex_double* v2t, lapack_int* ldv2t, lapack_complex_double* work,
+                   lapack_int* lwork, double* rwork, lapack_int* lrwork, lapack_int* iwork, lapack_int* info);
+// LAPACK 3.4.0
+void LAPACK_sgemqrt(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* nb,
+                    const float* v, lapack_int* ldv, const float* t, lapack_int* ldt, float* c, lapack_int* ldc,
+                    float* work, lapack_int* info);
+void LAPACK_dgemqrt(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* nb,
+                    const double* v, lapack_int* ldv, const double* t, lapack_int* ldt, double* c, lapack_int* ldc,
+                    double* work, lapack_int* info);
+void LAPACK_cgemqrt(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* nb,
+                    const lapack_complex_float* v, lapack_int* ldv, const lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* c, lapack_int* ldc, lapack_complex_float* work, lapack_int* info);
+void LAPACK_zgemqrt(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* nb,
+                    const lapack_complex_double* v, lapack_int* ldv, const lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* c, lapack_int* ldc, lapack_complex_double* work, lapack_int* info);
+void LAPACK_sgeqrt(lapack_int* m, lapack_int* n, lapack_int* nb, float* a, lapack_int* lda, float* t, lapack_int* ldt,
+                   float* work, lapack_int* info);
+void LAPACK_dgeqrt(lapack_int* m, lapack_int* n, lapack_int* nb, double* a, lapack_int* lda, double* t, lapack_int* ldt,
+                   double* work, lapack_int* info);
+void LAPACK_cgeqrt(lapack_int* m, lapack_int* n, lapack_int* nb, lapack_complex_float* a, lapack_int* lda,
+                   lapack_complex_float* t, lapack_int* ldt, lapack_complex_float* work, lapack_int* info);
+void LAPACK_zgeqrt(lapack_int* m, lapack_int* n, lapack_int* nb, lapack_complex_double* a, lapack_int* lda,
+                   lapack_complex_double* t, lapack_int* ldt, lapack_complex_double* work, lapack_int* info);
+void LAPACK_sgeqrt2(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* t, lapack_int* ldt,
+                    lapack_int* info);
+void LAPACK_dgeqrt2(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* t, lapack_int* ldt,
+                    lapack_int* info);
+void LAPACK_cgeqrt2(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* t,
+                    lapack_int* ldt, lapack_int* info);
+void LAPACK_zgeqrt2(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* t,
+                    lapack_int* ldt, lapack_int* info);
+void LAPACK_sgeqrt3(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* t, lapack_int* ldt,
+                    lapack_int* info);
+void LAPACK_dgeqrt3(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* t, lapack_int* ldt,
+                    lapack_int* info);
+void LAPACK_cgeqrt3(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* t,
+                    lapack_int* ldt, lapack_int* info);
+void LAPACK_zgeqrt3(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* t,
+                    lapack_int* ldt, lapack_int* info);
+void LAPACK_stpmqrt(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l, lapack_int* nb,
+                    const float* v, lapack_int* ldv, const float* t, lapack_int* ldt, float* a, lapack_int* lda,
+                    float* b, lapack_int* ldb, float* work, lapack_int* info);
+void LAPACK_dtpmqrt(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l, lapack_int* nb,
+                    const double* v, lapack_int* ldv, const double* t, lapack_int* ldt, double* a, lapack_int* lda,
+                    double* b, lapack_int* ldb, double* work, lapack_int* info);
+void LAPACK_ctpmqrt(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l, lapack_int* nb,
+                    const lapack_complex_float* v, lapack_int* ldv, const lapack_complex_float* t, lapack_int* ldt,
+                    lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
+                    lapack_complex_float* work, lapack_int* info);
+void LAPACK_ztpmqrt(char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l, lapack_int* nb,
+                    const lapack_complex_double* v, lapack_int* ldv, const lapack_complex_double* t, lapack_int* ldt,
+                    lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
+                    lapack_complex_double* work, lapack_int* info);
+void LAPACK_dtpqrt(lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb, double* a, lapack_int* lda, double* b,
+                   lapack_int* ldb, double* t, lapack_int* ldt, double* work, lapack_int* info);
+void LAPACK_ctpqrt(lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb, lapack_complex_float* a,
+                   lapack_int* lda, lapack_complex_float* t, lapack_complex_float* b, lapack_int* ldb, lapack_int* ldt,
+                   lapack_complex_float* work, lapack_int* info);
+void LAPACK_ztpqrt(lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb, lapack_complex_double* a,
+                   lapack_int* lda, lapack_complex_double* b, lapack_int* ldb, lapack_complex_double* t,
+                   lapack_int* ldt, lapack_complex_double* work, lapack_int* info);
+void LAPACK_stpqrt2(lapack_int* m, lapack_int* n, float* a, lapack_int* lda, float* b, lapack_int* ldb, float* t,
+                    lapack_int* ldt, lapack_int* info);
+void LAPACK_dtpqrt2(lapack_int* m, lapack_int* n, double* a, lapack_int* lda, double* b, lapack_int* ldb, double* t,
+                    lapack_int* ldt, lapack_int* info);
+void LAPACK_ctpqrt2(lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b,
+                    lapack_int* ldb, lapack_complex_float* t, lapack_int* ldt, lapack_int* info);
+void LAPACK_ztpqrt2(lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b,
+                    lapack_int* ldb, lapack_complex_double* t, lapack_int* ldt, lapack_int* info);
+void LAPACK_stprfb(char* side, char* trans, char* direct, char* storev, lapack_int* m, lapack_int* n, lapack_int* k,
+                   lapack_int* l, const float* v, lapack_int* ldv, const float* t, lapack_int* ldt, float* a,
+                   lapack_int* lda, float* b, lapack_int* ldb, const float* mywork, lapack_int* myldwork);
+void LAPACK_dtprfb(char* side, char* trans, char* direct, char* storev, lapack_int* m, lapack_int* n, lapack_int* k,
+                   lapack_int* l, const double* v, lapack_int* ldv, const double* t, lapack_int* ldt, double* a,
+                   lapack_int* lda, double* b, lapack_int* ldb, const double* mywork, lapack_int* myldwork);
+void LAPACK_ctprfb(char* side, char* trans, char* direct, char* storev, lapack_int* m, lapack_int* n, lapack_int* k,
+                   lapack_int* l, const lapack_complex_float* v, lapack_int* ldv, const lapack_complex_float* t,
+                   lapack_int* ldt, lapack_complex_float* a, lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
+                   const float* mywork, lapack_int* myldwork);
+void LAPACK_ztprfb(char* side, char* trans, char* direct, char* storev, lapack_int* m, lapack_int* n, lapack_int* k,
+                   lapack_int* l, const lapack_complex_double* v, lapack_int* ldv, const lapack_complex_double* t,
+                   lapack_int* ldt, lapack_complex_double* a, lapack_int* lda, lapack_complex_double* b,
+                   lapack_int* ldb, const double* mywork, lapack_int* myldwork);
+// LAPACK 3.X.X
+void LAPACK_csyr(char* uplo, lapack_int* n, lapack_complex_float* alpha, const lapack_complex_float* x,
+                 lapack_int* incx, lapack_complex_float* a, lapack_int* lda);
+void LAPACK_zsyr(char* uplo, lapack_int* n, lapack_complex_double* alpha, const lapack_complex_double* x,
+                 lapack_int* incx, lapack_complex_double* a, lapack_int* lda);
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif /* _LAPACKE_H_ */
+
+#endif /* _MKL_LAPACKE_H_ */
diff --git a/inst/include/Eigen/src/misc/lapacke_helpers.h b/inst/include/Eigen/src/misc/lapacke_helpers.h
new file mode 100644
index 00000000..ff98639c
--- /dev/null
+++ b/inst/include/Eigen/src/misc/lapacke_helpers.h
@@ -0,0 +1,163 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2021 Erik Schultheis <erik.schultheis@aalto.fi>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_LAPACKE_HELPERS_H
+#define EIGEN_LAPACKE_HELPERS_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
+#include "lapacke.h"
+#endif
+
+namespace Eigen {
+namespace internal {
+/**
+ * \internal
+ * \brief Implementation details and helper functions for the lapacke glue code.
+ */
+namespace lapacke_helpers {
+
+// ---------------------------------------------------------------------------------------------------------------------
+//                  Translation from Eigen to Lapacke for types and constants
+// ---------------------------------------------------------------------------------------------------------------------
+
+// For complex numbers, the types in Eigen and Lapacke are different, but layout compatible.
+template <typename Scalar>
+struct translate_type_imp;
+template <>
+struct translate_type_imp<float> {
+  using type = float;
+};
+template <>
+struct translate_type_imp<double> {
+  using type = double;
+};
+template <>
+struct translate_type_imp<std::complex<double>> {
+  using type = lapack_complex_double;
+};
+template <>
+struct translate_type_imp<std::complex<float>> {
+  using type = lapack_complex_float;
+};
+
+/// Given an Eigen types, this is defined to be the corresponding, layout-compatible lapack type
+template <typename Scalar>
+using translated_type = typename translate_type_imp<Scalar>::type;
+
+/// These functions convert their arguments from Eigen to Lapack types
+/// This function performs conversion for any of the translations defined above.
+template <typename Source, typename Target = translated_type<Source>>
+EIGEN_ALWAYS_INLINE auto to_lapack(Source value) {
+  return static_cast<Target>(value);
+}
+
+/// This function performs conversions for pointer types corresponding to the translations abovce.
+/// This is valid because the translations are between layout-compatible types.
+template <typename Source, typename Target = translated_type<Source>>
+EIGEN_ALWAYS_INLINE auto to_lapack(Source *value) {
+  return reinterpret_cast<Target *>(value);
+}
+
+/// This function converts the Eigen Index to a lapack index, with possible range checks
+/// \sa internal::convert_index
+EIGEN_ALWAYS_INLINE lapack_int to_lapack(Index index) { return convert_index<lapack_int>(index); }
+
+/// translates storage order of the given Eigen object to the corresponding lapack constant
+template <typename Derived>
+EIGEN_ALWAYS_INLINE constexpr lapack_int lapack_storage_of(const EigenBase<Derived> &) {
+  return Derived::IsRowMajor ? LAPACK_ROW_MAJOR : LAPACK_COL_MAJOR;
+}
+
+// ---------------------------------------------------------------------------------------------------------------------
+//              Automatic generation of low-level wrappers
+// ---------------------------------------------------------------------------------------------------------------------
+
+/*!
+ * \internal
+ * \brief Helper type to facilitate the wrapping of raw LAPACKE functions for different types into a single, overloaded
+ * C++ function. This is achieved in combination with \r EIGEN_MAKE_LAPACKE_WRAPPER \details This implementation works
+ * by providing an overloaded call function that just forwards its arguments to the underlying lapack function. Each of
+ * these overloads is enabled only if the call is actually well formed. Because these lapack functions take pointers to
+ * the underlying scalar type as arguments, even though the actual Scalars would be implicitly convertible, the pointers
+ * are not and therefore only a single overload can be valid at the same time. Thus, despite all functions taking fully
+ * generic `Args&&... args` as arguments, there is never any ambiguity.
+ */
+template <typename DoubleFn, typename SingleFn, typename DoubleCpxFn, typename SingleCpxFn>
+struct WrappingHelper {
+  // The naming of double, single, double complex and single complex is purely for readability
+  // and doesn't actually affect the workings of this class. In principle, the arguments can
+  // be supplied in any permuted order.
+  DoubleFn double_;
+  SingleFn single_;
+  DoubleCpxFn double_cpx_;
+  SingleCpxFn single_cpx_;
+
+  template <typename... Args>
+  auto call(Args &&...args) -> decltype(double_(std::forward<Args>(args)...)) {
+    return double_(std::forward<Args>(args)...);
+  }
+
+  template <typename... Args>
+  auto call(Args &&...args) -> decltype(single_(std::forward<Args>(args)...)) {
+    return single_(std::forward<Args>(args)...);
+  }
+
+  template <typename... Args>
+  auto call(Args &&...args) -> decltype(double_cpx_(std::forward<Args>(args)...)) {
+    return double_cpx_(std::forward<Args>(args)...);
+  }
+
+  template <typename... Args>
+  auto call(Args &&...args) -> decltype(single_cpx_(std::forward<Args>(args)...)) {
+    return single_cpx_(std::forward<Args>(args)...);
+  }
+};
+
+/** \internal Helper function that generates a `WrappingHelper` object with the given function pointers and
+ * invokes its `call` method, thus selecting one of the overloads.
+ * \sa EIGEN_MAKE_LAPACKE_WRAPPER
+ */
+template <typename DoubleFn, typename SingleFn, typename DoubleCpxFn, typename SingleCpxFn, typename... Args>
+EIGEN_ALWAYS_INLINE auto call_wrapper(DoubleFn df, SingleFn sf, DoubleCpxFn dcf, SingleCpxFn scf, Args &&...args) {
+  WrappingHelper<DoubleFn, SingleFn, DoubleCpxFn, SingleCpxFn> helper{df, sf, dcf, scf};
+  return helper.call(std::forward<Args>(args)...);
+}
+
+/**
+ * \internal
+ * Generates a new function `Function` that dispatches to the corresponding LAPACKE_? prefixed functions.
+ * \sa WrappingHelper
+ */
+#define EIGEN_MAKE_LAPACKE_WRAPPER(FUNCTION)                                                                \
+  template <typename... Args>                                                                               \
+  EIGEN_ALWAYS_INLINE auto FUNCTION(Args &&...args) {                                                       \
+    return call_wrapper(LAPACKE_d##FUNCTION, LAPACKE_s##FUNCTION, LAPACKE_z##FUNCTION, LAPACKE_c##FUNCTION, \
+                        std::forward<Args>(args)...);                                                       \
+  }
+
+// Now with this macro and the helper wrappers, we can generate the dispatch for all the lapacke functions that are
+// used in Eigen.
+// We define these here instead of in the files where they are used because this allows us to #undef the macro again
+// right here
+EIGEN_MAKE_LAPACKE_WRAPPER(potrf)
+EIGEN_MAKE_LAPACKE_WRAPPER(getrf)
+EIGEN_MAKE_LAPACKE_WRAPPER(geqrf)
+EIGEN_MAKE_LAPACKE_WRAPPER(gesdd)
+
+#undef EIGEN_MAKE_LAPACKE_WRAPPER
+}  // namespace lapacke_helpers
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_LAPACKE_HELPERS_H
diff --git a/inst/include/Eigen/src/misc/lapacke_mangling.h b/inst/include/Eigen/src/misc/lapacke_mangling.h
new file mode 100644
index 00000000..d852de7a
--- /dev/null
+++ b/inst/include/Eigen/src/misc/lapacke_mangling.h
@@ -0,0 +1,16 @@
+#ifndef LAPACK_HEADER_INCLUDED
+#define LAPACK_HEADER_INCLUDED
+
+#ifndef LAPACK_GLOBAL
+#if defined(LAPACK_GLOBAL_PATTERN_LC) || defined(ADD_)
+#define LAPACK_GLOBAL(lcname, UCNAME) lcname##_
+#elif defined(LAPACK_GLOBAL_PATTERN_UC) || defined(UPPER)
+#define LAPACK_GLOBAL(lcname, UCNAME) UCNAME
+#elif defined(LAPACK_GLOBAL_PATTERN_MC) || defined(NOCHANGE)
+#define LAPACK_GLOBAL(lcname, UCNAME) lcname
+#else
+#define LAPACK_GLOBAL(lcname, UCNAME) lcname##_
+#endif
+#endif
+
+#endif
diff --git a/inst/include/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/inst/include/Eigen/src/plugins/ArrayCwiseBinaryOps.h
deleted file mode 100644
index 1951286f..00000000
--- a/inst/include/Eigen/src/plugins/ArrayCwiseBinaryOps.h
+++ /dev/null
@@ -1,253 +0,0 @@
-/** \returns an expression of the coefficient wise product of \c *this and \a other
-  *
-  * \sa MatrixBase::cwiseProduct
-  */
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE const EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)
-operator*(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)(derived(), other.derived());
-}
-
-/** \returns an expression of the coefficient wise quotient of \c *this and \a other
-  *
-  * \sa MatrixBase::cwiseQuotient
-  */
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>
-operator/(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
-{
-  return CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
-}
-
-/** \returns an expression of the coefficient-wise min of \c *this and \a other
-  *
-  * Example: \include Cwise_min.cpp
-  * Output: \verbinclude Cwise_min.out
-  *
-  * \sa max()
-  */
-EIGEN_MAKE_CWISE_BINARY_OP(min,internal::scalar_min_op)
-
-/** \returns an expression of the coefficient-wise min of \c *this and scalar \a other
-  *
-  * \sa max()
-  */
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived,
-                                        const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
-#ifdef EIGEN_PARSED_BY_DOXYGEN
-min
-#else
-(min)
-#endif
-(const Scalar &other) const
-{
-  return (min)(Derived::PlainObject::Constant(rows(), cols(), other));
-}
-
-/** \returns an expression of the coefficient-wise max of \c *this and \a other
-  *
-  * Example: \include Cwise_max.cpp
-  * Output: \verbinclude Cwise_max.out
-  *
-  * \sa min()
-  */
-EIGEN_MAKE_CWISE_BINARY_OP(max,internal::scalar_max_op)
-
-/** \returns an expression of the coefficient-wise max of \c *this and scalar \a other
-  *
-  * \sa min()
-  */
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived,
-                                        const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
-#ifdef EIGEN_PARSED_BY_DOXYGEN
-max
-#else
-(max)
-#endif
-(const Scalar &other) const
-{
-  return (max)(Derived::PlainObject::Constant(rows(), cols(), other));
-}
-
-
-#define EIGEN_MAKE_CWISE_COMP_OP(OP, COMPARATOR) \
-template<typename OtherDerived> \
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const OtherDerived> \
-OP(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
-{ \
-  return CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const OtherDerived>(derived(), other.derived()); \
-}\
-typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const Derived, const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> > Cmp ## COMPARATOR ## ReturnType; \
-typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_ ## COMPARATOR>, const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject>, const Derived > RCmp ## COMPARATOR ## ReturnType; \
-EIGEN_STRONG_INLINE const Cmp ## COMPARATOR ## ReturnType \
-OP(const Scalar& s) const { \
-  return this->OP(Derived::PlainObject::Constant(rows(), cols(), s)); \
-} \
-friend EIGEN_STRONG_INLINE const RCmp ## COMPARATOR ## ReturnType \
-OP(const Scalar& s, const Derived& d) { \
-  return Derived::PlainObject::Constant(d.rows(), d.cols(), s).OP(d); \
-}
-
-#define EIGEN_MAKE_CWISE_COMP_R_OP(OP, R_OP, RCOMPARATOR) \
-template<typename OtherDerived> \
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_##RCOMPARATOR>, const OtherDerived, const Derived> \
-OP(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const \
-{ \
-  return CwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_##RCOMPARATOR>, const OtherDerived, const Derived>(other.derived(), derived()); \
-} \
-\
-inline const RCmp ## RCOMPARATOR ## ReturnType \
-OP(const Scalar& s) const { \
-  return Derived::PlainObject::Constant(rows(), cols(), s).R_OP(*this); \
-} \
-friend inline const Cmp ## RCOMPARATOR ## ReturnType \
-OP(const Scalar& s, const Derived& d) { \
-  return d.R_OP(Derived::PlainObject::Constant(d.rows(), d.cols(), s)); \
-}
-
-
-/** \returns an expression of the coefficient-wise \< operator of *this and \a other
-  *
-  * Example: \include Cwise_less.cpp
-  * Output: \verbinclude Cwise_less.out
-  *
-  * \sa all(), any(), operator>(), operator<=()
-  */
-EIGEN_MAKE_CWISE_COMP_OP(operator<, LT)
-
-/** \returns an expression of the coefficient-wise \<= operator of *this and \a other
-  *
-  * Example: \include Cwise_less_equal.cpp
-  * Output: \verbinclude Cwise_less_equal.out
-  *
-  * \sa all(), any(), operator>=(), operator<()
-  */
-EIGEN_MAKE_CWISE_COMP_OP(operator<=, LE)
-
-/** \returns an expression of the coefficient-wise \> operator of *this and \a other
-  *
-  * Example: \include Cwise_greater.cpp
-  * Output: \verbinclude Cwise_greater.out
-  *
-  * \sa all(), any(), operator>=(), operator<()
-  */
-EIGEN_MAKE_CWISE_COMP_R_OP(operator>, operator<, LT)
-
-/** \returns an expression of the coefficient-wise \>= operator of *this and \a other
-  *
-  * Example: \include Cwise_greater_equal.cpp
-  * Output: \verbinclude Cwise_greater_equal.out
-  *
-  * \sa all(), any(), operator>(), operator<=()
-  */
-EIGEN_MAKE_CWISE_COMP_R_OP(operator>=, operator<=, LE)
-
-/** \returns an expression of the coefficient-wise == operator of *this and \a other
-  *
-  * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
-  * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
-  * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
-  * isMuchSmallerThan().
-  *
-  * Example: \include Cwise_equal_equal.cpp
-  * Output: \verbinclude Cwise_equal_equal.out
-  *
-  * \sa all(), any(), isApprox(), isMuchSmallerThan()
-  */
-EIGEN_MAKE_CWISE_COMP_OP(operator==, EQ)
-
-/** \returns an expression of the coefficient-wise != operator of *this and \a other
-  *
-  * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
-  * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
-  * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
-  * isMuchSmallerThan().
-  *
-  * Example: \include Cwise_not_equal.cpp
-  * Output: \verbinclude Cwise_not_equal.out
-  *
-  * \sa all(), any(), isApprox(), isMuchSmallerThan()
-  */
-EIGEN_MAKE_CWISE_COMP_OP(operator!=, NEQ)
-
-#undef EIGEN_MAKE_CWISE_COMP_OP
-#undef EIGEN_MAKE_CWISE_COMP_R_OP
-
-// scalar addition
-
-/** \returns an expression of \c *this with each coeff incremented by the constant \a scalar
-  *
-  * Example: \include Cwise_plus.cpp
-  * Output: \verbinclude Cwise_plus.out
-  *
-  * \sa operator+=(), operator-()
-  */
-inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
-operator+(const Scalar& scalar) const
-{
-  return CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>(derived(), internal::scalar_add_op<Scalar>(scalar));
-}
-
-friend inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
-operator+(const Scalar& scalar,const EIGEN_CURRENT_STORAGE_BASE_CLASS<Derived>& other)
-{
-  return other + scalar;
-}
-
-/** \returns an expression of \c *this with each coeff decremented by the constant \a scalar
-  *
-  * Example: \include Cwise_minus.cpp
-  * Output: \verbinclude Cwise_minus.out
-  *
-  * \sa operator+(), operator-=()
-  */
-inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
-operator-(const Scalar& scalar) const
-{
-  return *this + (-scalar);
-}
-
-friend inline const CwiseUnaryOp<internal::scalar_add_op<Scalar>, const CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> >
-operator-(const Scalar& scalar,const EIGEN_CURRENT_STORAGE_BASE_CLASS<Derived>& other)
-{
-  return (-other) + scalar;
-}
-
-/** \returns an expression of the coefficient-wise && operator of *this and \a other
-  *
-  * \warning this operator is for expression of bool only.
-  *
-  * Example: \include Cwise_boolean_and.cpp
-  * Output: \verbinclude Cwise_boolean_and.out
-  *
-  * \sa operator||(), select()
-  */
-template<typename OtherDerived>
-inline const CwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>
-operator&&(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
-{
-  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
-                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
-  return CwiseBinaryOp<internal::scalar_boolean_and_op, const Derived, const OtherDerived>(derived(),other.derived());
-}
-
-/** \returns an expression of the coefficient-wise || operator of *this and \a other
-  *
-  * \warning this operator is for expression of bool only.
-  *
-  * Example: \include Cwise_boolean_or.cpp
-  * Output: \verbinclude Cwise_boolean_or.out
-  *
-  * \sa operator&&(), select()
-  */
-template<typename OtherDerived>
-inline const CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>
-operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
-{
-  EIGEN_STATIC_ASSERT((internal::is_same<bool,Scalar>::value && internal::is_same<bool,typename OtherDerived::Scalar>::value),
-                      THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL);
-  return CwiseBinaryOp<internal::scalar_boolean_or_op, const Derived, const OtherDerived>(derived(),other.derived());
-}
-
-
diff --git a/inst/include/Eigen/src/plugins/ArrayCwiseBinaryOps.inc b/inst/include/Eigen/src/plugins/ArrayCwiseBinaryOps.inc
new file mode 100644
index 00000000..c8c2434d
--- /dev/null
+++ b/inst/include/Eigen/src/plugins/ArrayCwiseBinaryOps.inc
@@ -0,0 +1,344 @@
+
+/** \returns an expression of the coefficient wise product of \c *this and \a other
+ *
+ * \sa MatrixBase::cwiseProduct
+ */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived, OtherDerived, product) operator*(
+    const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const {
+  return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived, OtherDerived, product)(derived(), other.derived());
+}
+
+/** \returns an expression of the coefficient wise quotient of \c *this and \a other
+ *
+ * \sa MatrixBase::cwiseQuotient
+ */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<
+    internal::scalar_quotient_op<Scalar, typename OtherDerived::Scalar>, const Derived, const OtherDerived>
+operator/(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const {
+  return CwiseBinaryOp<internal::scalar_quotient_op<Scalar, typename OtherDerived::Scalar>, const Derived,
+                       const OtherDerived>(derived(), other.derived());
+}
+
+/** \returns an expression of the coefficient-wise min of \c *this and \a other
+ *
+ * Example: \include Cwise_min.cpp
+ * Output: \verbinclude Cwise_min.out
+ *
+ * \sa max()
+ */
+template <int NaNPropagation = PropagateFast, typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+    CwiseBinaryOp<internal::scalar_min_op<Scalar, Scalar, NaNPropagation>, const Derived, const OtherDerived>
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+    min
+#else
+    (min)
+#endif
+    (const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const {
+  return CwiseBinaryOp<internal::scalar_min_op<Scalar, Scalar, NaNPropagation>, const Derived, const OtherDerived>(
+      derived(), other.derived());
+}
+
+/** \returns an expression of the coefficient-wise min of \c *this and scalar \a other
+ *
+ * \sa max()
+ */
+template <int NaNPropagation = PropagateFast>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+    CwiseBinaryOp<internal::scalar_min_op<Scalar, Scalar, NaNPropagation>, const Derived,
+                  const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+    min
+#else
+    (min)
+#endif
+    (const Scalar &other) const {
+  return (min<NaNPropagation>)(Derived::PlainObject::Constant(rows(), cols(), other));
+}
+
+/** \returns an expression of the coefficient-wise max of \c *this and \a other
+ *
+ * Example: \include Cwise_max.cpp
+ * Output: \verbinclude Cwise_max.out
+ *
+ * \sa min()
+ */
+template <int NaNPropagation = PropagateFast, typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+    CwiseBinaryOp<internal::scalar_max_op<Scalar, Scalar, NaNPropagation>, const Derived, const OtherDerived>
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+    max
+#else
+    (max)
+#endif
+    (const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const {
+  return CwiseBinaryOp<internal::scalar_max_op<Scalar, Scalar, NaNPropagation>, const Derived, const OtherDerived>(
+      derived(), other.derived());
+}
+
+/** \returns an expression of the coefficient-wise max of \c *this and scalar \a other
+ *
+ * \sa min()
+ */
+template <int NaNPropagation = PropagateFast>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+    CwiseBinaryOp<internal::scalar_max_op<Scalar, Scalar, NaNPropagation>, const Derived,
+                  const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+    max
+#else
+    (max)
+#endif
+    (const Scalar &other) const {
+  return (max<NaNPropagation>)(Derived::PlainObject::Constant(rows(), cols(), other));
+}
+
+/** \returns an expression of the coefficient-wise absdiff of \c *this and \a other
+ *
+ * \sa absolute_difference()
+ */
+EIGEN_MAKE_CWISE_BINARY_OP(absolute_difference, absolute_difference)
+
+/** \returns an expression of the coefficient-wise absolute_difference of \c *this and scalar \a other
+ *
+ * \sa absolute_difference()
+ */
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+    CwiseBinaryOp<internal::scalar_absolute_difference_op<Scalar, Scalar>, const Derived,
+                  const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+    absolute_difference
+#else
+    (absolute_difference)
+#endif
+    (const Scalar &other) const {
+  return (absolute_difference)(Derived::PlainObject::Constant(rows(), cols(), other));
+}
+
+/** \returns an expression of the coefficient-wise power of \c *this to the given array of \a exponents.
+ *
+ * This function computes the coefficient-wise power.
+ *
+ * Example: \include Cwise_array_power_array.cpp
+ * Output: \verbinclude Cwise_array_power_array.out
+ */
+EIGEN_MAKE_CWISE_BINARY_OP(pow, pow)
+
+/** \returns an expression of the coefficient-wise atan2(\c *this, \a y), where \a y is the given array argument.
+ *
+ * This function computes the coefficient-wise atan2.
+ *
+ */
+EIGEN_MAKE_CWISE_BINARY_OP(atan2, atan2)
+
+// TODO code generating macros could be moved to Macros.h and could include generation of documentation
+#define EIGEN_MAKE_CWISE_COMP_OP(OP, COMPARATOR)                                                                     \
+  template <typename OtherDerived>                                                                                   \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const                                                                        \
+      CwiseBinaryOp<internal::scalar_cmp_op<Scalar, typename OtherDerived::Scalar, internal::cmp_##COMPARATOR>,      \
+                    const Derived, const OtherDerived>                                                               \
+      OP(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const {                                        \
+    return CwiseBinaryOp<internal::scalar_cmp_op<Scalar, typename OtherDerived::Scalar, internal::cmp_##COMPARATOR>, \
+                         const Derived, const OtherDerived>(derived(), other.derived());                             \
+  }                                                                                                                  \
+  typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_##COMPARATOR>, const Derived,          \
+                        const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject> >                    \
+      Cmp##COMPARATOR##ReturnType;                                                                                   \
+  typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_##COMPARATOR>,                         \
+                        const CwiseNullaryOp<internal::scalar_constant_op<Scalar>, PlainObject>, const Derived>      \
+      RCmp##COMPARATOR##ReturnType;                                                                                  \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Cmp##COMPARATOR##ReturnType OP(const Scalar &s) const {                \
+    return this->OP(Derived::PlainObject::Constant(rows(), cols(), s));                                              \
+  }                                                                                                                  \
+  EIGEN_DEVICE_FUNC friend EIGEN_STRONG_INLINE const RCmp##COMPARATOR##ReturnType OP(                                \
+      const Scalar &s, const EIGEN_CURRENT_STORAGE_BASE_CLASS<Derived> &d) {                                         \
+    return Derived::PlainObject::Constant(d.rows(), d.cols(), s).OP(d);                                              \
+  }
+
+#define EIGEN_MAKE_CWISE_COMP_R_OP(OP, R_OP, RCOMPARATOR)                                                             \
+  template <typename OtherDerived>                                                                                    \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const                                                                         \
+      CwiseBinaryOp<internal::scalar_cmp_op<typename OtherDerived::Scalar, Scalar, internal::cmp_##RCOMPARATOR>,      \
+                    const OtherDerived, const Derived>                                                                \
+      OP(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const {                                         \
+    return CwiseBinaryOp<internal::scalar_cmp_op<typename OtherDerived::Scalar, Scalar, internal::cmp_##RCOMPARATOR>, \
+                         const OtherDerived, const Derived>(other.derived(), derived());                              \
+  }                                                                                                                   \
+  EIGEN_DEVICE_FUNC inline const RCmp##RCOMPARATOR##ReturnType OP(const Scalar &s) const {                            \
+    return Derived::PlainObject::Constant(rows(), cols(), s).R_OP(*this);                                             \
+  }                                                                                                                   \
+  friend inline const Cmp##RCOMPARATOR##ReturnType OP(const Scalar &s, const Derived &d) {                            \
+    return d.R_OP(Derived::PlainObject::Constant(d.rows(), d.cols(), s));                                             \
+  }
+
+/** \returns an expression of the coefficient-wise \< operator of *this and \a other
+ *
+ * Example: \include Cwise_less.cpp
+ * Output: \verbinclude Cwise_less.out
+ *
+ * \sa all(), any(), operator>(), operator<=()
+ */
+EIGEN_MAKE_CWISE_COMP_OP(operator<, LT)
+
+/** \returns an expression of the coefficient-wise \<= operator of *this and \a other
+ *
+ * Example: \include Cwise_less_equal.cpp
+ * Output: \verbinclude Cwise_less_equal.out
+ *
+ * \sa all(), any(), operator>=(), operator<()
+ */
+EIGEN_MAKE_CWISE_COMP_OP(operator<=, LE)
+
+/** \returns an expression of the coefficient-wise \> operator of *this and \a other
+ *
+ * Example: \include Cwise_greater.cpp
+ * Output: \verbinclude Cwise_greater.out
+ *
+ * \sa all(), any(), operator>=(), operator<()
+ */
+EIGEN_MAKE_CWISE_COMP_R_OP(operator>, operator<, LT)
+
+/** \returns an expression of the coefficient-wise \>= operator of *this and \a other
+ *
+ * Example: \include Cwise_greater_equal.cpp
+ * Output: \verbinclude Cwise_greater_equal.out
+ *
+ * \sa all(), any(), operator>(), operator<=()
+ */
+EIGEN_MAKE_CWISE_COMP_R_OP(operator>=, operator<=, LE)
+
+/** \returns an expression of the coefficient-wise == operator of *this and \a other
+ *
+ * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
+ * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
+ * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
+ * isMuchSmallerThan().
+ *
+ * Example: \include Cwise_equal_equal.cpp
+ * Output: \verbinclude Cwise_equal_equal.out
+ *
+ * \sa all(), any(), isApprox(), isMuchSmallerThan()
+ */
+EIGEN_MAKE_CWISE_COMP_OP(operator==, EQ)
+
+/** \returns an expression of the coefficient-wise != operator of *this and \a other
+ *
+ * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
+ * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
+ * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
+ * isMuchSmallerThan().
+ *
+ * Example: \include Cwise_not_equal.cpp
+ * Output: \verbinclude Cwise_not_equal.out
+ *
+ * \sa all(), any(), isApprox(), isMuchSmallerThan()
+ */
+EIGEN_MAKE_CWISE_COMP_OP(operator!=, NEQ)
+
+#undef EIGEN_MAKE_CWISE_COMP_OP
+#undef EIGEN_MAKE_CWISE_COMP_R_OP
+
+// scalar addition
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_MAKE_SCALAR_BINARY_OP(operator+, sum)
+#else
+/** \returns an expression of \c *this with each coeff incremented by the constant \a scalar
+ *
+ * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+ *
+ * Example: \include Cwise_plus.cpp
+ * Output: \verbinclude Cwise_plus.out
+ *
+ * \sa operator+=(), operator-()
+ */
+template <typename T>
+const CwiseBinaryOp<internal::scalar_sum_op<Scalar, T>, Derived, Constant<T> > operator+(const T &scalar) const;
+/** \returns an expression of \a expr with each coeff incremented by the constant \a scalar
+ *
+ * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+ */
+template <typename T>
+friend const CwiseBinaryOp<internal::scalar_sum_op<T, Scalar>, Constant<T>, Derived> operator+(
+    const T &scalar, const StorageBaseType &expr);
+#endif
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_MAKE_SCALAR_BINARY_OP(operator-, difference)
+#else
+/** \returns an expression of \c *this with each coeff decremented by the constant \a scalar
+ *
+ * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+ *
+ * Example: \include Cwise_minus.cpp
+ * Output: \verbinclude Cwise_minus.out
+ *
+ * \sa operator+=(), operator-()
+ */
+template <typename T>
+const CwiseBinaryOp<internal::scalar_difference_op<Scalar, T>, Derived, Constant<T> > operator-(const T &scalar) const;
+/** \returns an expression of the constant matrix of value \a scalar decremented by the coefficients of \a expr
+ *
+ * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+ */
+template <typename T>
+friend const CwiseBinaryOp<internal::scalar_difference_op<T, Scalar>, Constant<T>, Derived> operator-(
+    const T &scalar, const StorageBaseType &expr);
+#endif
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+EIGEN_MAKE_SCALAR_BINARY_OP_ONTHELEFT(operator/, quotient)
+#else
+/**
+ * \brief Component-wise division of the scalar \a s by array elements of \a a.
+ *
+ * \tparam Scalar is the scalar type of \a x. It must be compatible with the scalar type of the given array expression
+ * (\c Derived::Scalar).
+ */
+template <typename T>
+friend inline const CwiseBinaryOp<internal::scalar_quotient_op<T, Scalar>, Constant<T>, Derived> operator/(
+    const T &s, const StorageBaseType &a);
+#endif
+
+// NOTE disabled until we agree on argument order
+#if 0
+/** \cpp11 \returns an expression of the coefficient-wise polygamma function.
+  *
+  * \specialfunctions_module
+  *
+  * It returns the \a n -th derivative of the digamma(psi) evaluated at \c *this.
+  *
+  * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x)
+  *
+  * \sa Eigen::polygamma()
+  */
+template<typename DerivedN>
+inline const CwiseBinaryOp<internal::scalar_polygamma_op<Scalar>, const DerivedN, const Derived>
+polygamma(const EIGEN_CURRENT_STORAGE_BASE_CLASS<DerivedN> &n) const
+{
+  return CwiseBinaryOp<internal::scalar_polygamma_op<Scalar>, const DerivedN, const Derived>(n.derived(), this->derived());
+}
+#endif
+
+/** \returns an expression of the coefficient-wise zeta function.
+ *
+ * \specialfunctions_module
+ *
+ * It returns the Riemann zeta function of two arguments \c *this and \a q:
+ *
+ * \param q is the shift, it must be > 0
+ *
+ * \note *this is the exponent, it must be > 1.
+ * \note This function supports only float and double scalar types. To support other scalar types, the user has
+ * to provide implementations of zeta(T,T) for any scalar type T to be supported.
+ *
+ * This method is an alias for zeta(*this,q);
+ *
+ * \sa Eigen::zeta()
+ */
+template <typename DerivedQ>
+inline const CwiseBinaryOp<internal::scalar_zeta_op<Scalar>, const Derived, const DerivedQ> zeta(
+    const EIGEN_CURRENT_STORAGE_BASE_CLASS<DerivedQ> &q) const {
+  return CwiseBinaryOp<internal::scalar_zeta_op<Scalar>, const Derived, const DerivedQ>(this->derived(), q.derived());
+}
diff --git a/inst/include/Eigen/src/plugins/ArrayCwiseUnaryOps.h b/inst/include/Eigen/src/plugins/ArrayCwiseUnaryOps.h
deleted file mode 100644
index 1c3ed3fc..00000000
--- a/inst/include/Eigen/src/plugins/ArrayCwiseUnaryOps.h
+++ /dev/null
@@ -1,187 +0,0 @@
-
-
-/** \returns an expression of the coefficient-wise absolute value of \c *this
-  *
-  * Example: \include Cwise_abs.cpp
-  * Output: \verbinclude Cwise_abs.out
-  *
-  * \sa abs2()
-  */
-EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
-abs() const
-{
-  return derived();
-}
-
-/** \returns an expression of the coefficient-wise squared absolute value of \c *this
-  *
-  * Example: \include Cwise_abs2.cpp
-  * Output: \verbinclude Cwise_abs2.out
-  *
-  * \sa abs(), square()
-  */
-EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived>
-abs2() const
-{
-  return derived();
-}
-
-/** \returns an expression of the coefficient-wise exponential of *this.
-  *
-  * Example: \include Cwise_exp.cpp
-  * Output: \verbinclude Cwise_exp.out
-  *
-  * \sa pow(), log(), sin(), cos()
-  */
-inline const CwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived>
-exp() const
-{
-  return derived();
-}
-
-/** \returns an expression of the coefficient-wise logarithm of *this.
-  *
-  * Example: \include Cwise_log.cpp
-  * Output: \verbinclude Cwise_log.out
-  *
-  * \sa exp()
-  */
-inline const CwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived>
-log() const
-{
-  return derived();
-}
-
-/** \returns an expression of the coefficient-wise square root of *this.
-  *
-  * Example: \include Cwise_sqrt.cpp
-  * Output: \verbinclude Cwise_sqrt.out
-  *
-  * \sa pow(), square()
-  */
-inline const CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
-sqrt() const
-{
-  return derived();
-}
-
-/** \returns an expression of the coefficient-wise cosine of *this.
-  *
-  * Example: \include Cwise_cos.cpp
-  * Output: \verbinclude Cwise_cos.out
-  *
-  * \sa sin(), acos()
-  */
-inline const CwiseUnaryOp<internal::scalar_cos_op<Scalar>, const Derived>
-cos() const
-{
-  return derived();
-}
-
-
-/** \returns an expression of the coefficient-wise sine of *this.
-  *
-  * Example: \include Cwise_sin.cpp
-  * Output: \verbinclude Cwise_sin.out
-  *
-  * \sa cos(), asin()
-  */
-inline const CwiseUnaryOp<internal::scalar_sin_op<Scalar>, const Derived>
-sin() const
-{
-  return derived();
-}
-
-/** \returns an expression of the coefficient-wise arc cosine of *this.
-  *
-  * Example: \include Cwise_acos.cpp
-  * Output: \verbinclude Cwise_acos.out
-  *
-  * \sa cos(), asin()
-  */
-inline const CwiseUnaryOp<internal::scalar_acos_op<Scalar>, const Derived>
-acos() const
-{
-  return derived();
-}
-
-/** \returns an expression of the coefficient-wise arc sine of *this.
-  *
-  * Example: \include Cwise_asin.cpp
-  * Output: \verbinclude Cwise_asin.out
-  *
-  * \sa sin(), acos()
-  */
-inline const CwiseUnaryOp<internal::scalar_asin_op<Scalar>, const Derived>
-asin() const
-{
-  return derived();
-}
-
-/** \returns an expression of the coefficient-wise tan of *this.
-  *
-  * Example: \include Cwise_tan.cpp
-  * Output: \verbinclude Cwise_tan.out
-  *
-  * \sa cos(), sin()
-  */
-inline const CwiseUnaryOp<internal::scalar_tan_op<Scalar>, Derived>
-tan() const
-{
-  return derived();
-}
-
-
-/** \returns an expression of the coefficient-wise power of *this to the given exponent.
-  *
-  * Example: \include Cwise_pow.cpp
-  * Output: \verbinclude Cwise_pow.out
-  *
-  * \sa exp(), log()
-  */
-inline const CwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived>
-pow(const Scalar& exponent) const
-{
-  return CwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived>
-          (derived(), internal::scalar_pow_op<Scalar>(exponent));
-}
-
-
-/** \returns an expression of the coefficient-wise inverse of *this.
-  *
-  * Example: \include Cwise_inverse.cpp
-  * Output: \verbinclude Cwise_inverse.out
-  *
-  * \sa operator/(), operator*()
-  */
-inline const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived>
-inverse() const
-{
-  return derived();
-}
-
-/** \returns an expression of the coefficient-wise square of *this.
-  *
-  * Example: \include Cwise_square.cpp
-  * Output: \verbinclude Cwise_square.out
-  *
-  * \sa operator/(), operator*(), abs2()
-  */
-inline const CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived>
-square() const
-{
-  return derived();
-}
-
-/** \returns an expression of the coefficient-wise cube of *this.
-  *
-  * Example: \include Cwise_cube.cpp
-  * Output: \verbinclude Cwise_cube.out
-  *
-  * \sa square(), pow()
-  */
-inline const CwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived>
-cube() const
-{
-  return derived();
-}
diff --git a/inst/include/Eigen/src/plugins/ArrayCwiseUnaryOps.inc b/inst/include/Eigen/src/plugins/ArrayCwiseUnaryOps.inc
new file mode 100644
index 00000000..753aeb4f
--- /dev/null
+++ b/inst/include/Eigen/src/plugins/ArrayCwiseUnaryOps.inc
@@ -0,0 +1,544 @@
+typedef CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> AbsReturnType;
+typedef CwiseUnaryOp<internal::scalar_arg_op<Scalar>, const Derived> ArgReturnType;
+typedef CwiseUnaryOp<internal::scalar_carg_op<Scalar>, const Derived> CArgReturnType;
+typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> Abs2ReturnType;
+typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> SqrtReturnType;
+typedef CwiseUnaryOp<internal::scalar_cbrt_op<Scalar>, const Derived> CbrtReturnType;
+typedef CwiseUnaryOp<internal::scalar_rsqrt_op<Scalar>, const Derived> RsqrtReturnType;
+typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> SignReturnType;
+typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> InverseReturnType;
+typedef CwiseUnaryOp<internal::scalar_boolean_not_op<Scalar>, const Derived> BooleanNotReturnType;
+typedef CwiseUnaryOp<internal::scalar_bitwise_not_op<Scalar>, const Derived> BitwiseNotReturnType;
+
+typedef CwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived> ExpReturnType;
+typedef CwiseUnaryOp<internal::scalar_exp2_op<Scalar>, const Derived> Exp2ReturnType;
+typedef CwiseUnaryOp<internal::scalar_expm1_op<Scalar>, const Derived> Expm1ReturnType;
+typedef CwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived> LogReturnType;
+typedef CwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived> Log1pReturnType;
+typedef CwiseUnaryOp<internal::scalar_log10_op<Scalar>, const Derived> Log10ReturnType;
+typedef CwiseUnaryOp<internal::scalar_log2_op<Scalar>, const Derived> Log2ReturnType;
+typedef CwiseUnaryOp<internal::scalar_cos_op<Scalar>, const Derived> CosReturnType;
+typedef CwiseUnaryOp<internal::scalar_sin_op<Scalar>, const Derived> SinReturnType;
+typedef CwiseUnaryOp<internal::scalar_tan_op<Scalar>, const Derived> TanReturnType;
+typedef CwiseUnaryOp<internal::scalar_acos_op<Scalar>, const Derived> AcosReturnType;
+typedef CwiseUnaryOp<internal::scalar_asin_op<Scalar>, const Derived> AsinReturnType;
+typedef CwiseUnaryOp<internal::scalar_atan_op<Scalar>, const Derived> AtanReturnType;
+typedef CwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived> TanhReturnType;
+typedef CwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived> LogisticReturnType;
+typedef CwiseUnaryOp<internal::scalar_sinh_op<Scalar>, const Derived> SinhReturnType;
+typedef CwiseUnaryOp<internal::scalar_atanh_op<Scalar>, const Derived> AtanhReturnType;
+typedef CwiseUnaryOp<internal::scalar_asinh_op<Scalar>, const Derived> AsinhReturnType;
+typedef CwiseUnaryOp<internal::scalar_acosh_op<Scalar>, const Derived> AcoshReturnType;
+typedef CwiseUnaryOp<internal::scalar_cosh_op<Scalar>, const Derived> CoshReturnType;
+typedef CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> SquareReturnType;
+typedef CwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived> CubeReturnType;
+typedef CwiseUnaryOp<internal::scalar_round_op<Scalar>, const Derived> RoundReturnType;
+typedef CwiseUnaryOp<internal::scalar_rint_op<Scalar>, const Derived> RintReturnType;
+typedef CwiseUnaryOp<internal::scalar_floor_op<Scalar>, const Derived> FloorReturnType;
+typedef CwiseUnaryOp<internal::scalar_ceil_op<Scalar>, const Derived> CeilReturnType;
+typedef CwiseUnaryOp<internal::scalar_trunc_op<Scalar>, const Derived> TruncReturnType;
+typedef CwiseUnaryOp<internal::scalar_isnan_op<Scalar>, const Derived> IsNaNReturnType;
+typedef CwiseUnaryOp<internal::scalar_isinf_op<Scalar>, const Derived> IsInfReturnType;
+typedef CwiseUnaryOp<internal::scalar_isfinite_op<Scalar>, const Derived> IsFiniteReturnType;
+typedef CwiseUnaryOp<internal::scalar_isfinite_op<Scalar, true>, const Derived> IsFiniteTypedReturnType;
+
+/** \returns an expression of the coefficient-wise absolute value of \c *this
+ *
+ * Example: \include Cwise_abs.cpp
+ * Output: \verbinclude Cwise_abs.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_abs">Math functions</a>, abs2()
+ */
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const AbsReturnType abs() const { return AbsReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise phase angle of \c *this
+ *
+ * Example: \include Cwise_arg.cpp
+ * Output: \verbinclude Cwise_arg.out
+ *
+ * \sa abs()
+ */
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ArgReturnType arg() const { return ArgReturnType(derived()); }
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CArgReturnType carg() const { return CArgReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise squared absolute value of \c *this
+ *
+ * Example: \include Cwise_abs2.cpp
+ * Output: \verbinclude Cwise_abs2.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_abs2">Math functions</a>, abs(), square()
+ */
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Abs2ReturnType abs2() const { return Abs2ReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise exponential of *this.
+ *
+ * This function computes the coefficient-wise exponential. The function MatrixBase::exp() in the
+ * unsupported module MatrixFunctions computes the matrix exponential.
+ *
+ * Example: \include Cwise_exp.cpp
+ * Output: \verbinclude Cwise_exp.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_exp">Math functions</a>, exp2(), pow(), log(), sin(),
+ * cos()
+ */
+EIGEN_DEVICE_FUNC inline const ExpReturnType exp() const { return ExpReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise exponential of *this.
+ *
+ * This function computes the coefficient-wise base2 exponential, i.e. 2^x.
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_exp">Math functions</a>, exp(), pow(), log(), sin(),
+ * cos()
+ */
+EIGEN_DEVICE_FUNC inline const Exp2ReturnType exp2() const { return Exp2ReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise exponential of *this minus 1.
+ *
+ * In exact arithmetic, \c x.expm1() is equivalent to \c x.exp() - 1,
+ * however, with finite precision, this function is much more accurate when \c x is close to zero.
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_expm1">Math functions</a>, exp()
+ */
+EIGEN_DEVICE_FUNC inline const Expm1ReturnType expm1() const { return Expm1ReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise logarithm of *this.
+ *
+ * This function computes the coefficient-wise logarithm. The function MatrixBase::log() in the
+ * unsupported module MatrixFunctions computes the matrix logarithm.
+ *
+ * Example: \include Cwise_log.cpp
+ * Output: \verbinclude Cwise_log.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log">Math functions</a>, log()
+ */
+EIGEN_DEVICE_FUNC inline const LogReturnType log() const { return LogReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise logarithm of 1 plus \c *this.
+ *
+ * In exact arithmetic, \c x.log() is equivalent to \c (x+1).log(),
+ * however, with finite precision, this function is much more accurate when \c x is close to zero.
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log1p">Math functions</a>, log()
+ */
+EIGEN_DEVICE_FUNC inline const Log1pReturnType log1p() const { return Log1pReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise base-10 logarithm of *this.
+ *
+ * This function computes the coefficient-wise base-10 logarithm.
+ *
+ * Example: \include Cwise_log10.cpp
+ * Output: \verbinclude Cwise_log10.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_log10">Math functions</a>, log()
+ */
+EIGEN_DEVICE_FUNC inline const Log10ReturnType log10() const { return Log10ReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise base-2 logarithm of *this.
+ *
+ * This function computes the coefficient-wise base-2 logarithm.
+ *
+ */
+EIGEN_DEVICE_FUNC inline const Log2ReturnType log2() const { return Log2ReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise square root of *this.
+ *
+ * This function computes the coefficient-wise square root. The function MatrixBase::sqrt() in the
+ * unsupported module MatrixFunctions computes the matrix square root.
+ *
+ * Example: \include Cwise_sqrt.cpp
+ * Output: \verbinclude Cwise_sqrt.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sqrt">Math functions</a>, pow(), square(), cbrt()
+ */
+EIGEN_DEVICE_FUNC inline const SqrtReturnType sqrt() const { return SqrtReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise cube root of *this.
+ *
+ * This function computes the coefficient-wise cube root.
+ *
+ * Example: \include Cwise_cbrt.cpp
+ * Output: \verbinclude Cwise_cbrt.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cbrt">Math functions</a>, sqrt(), pow(), square()
+ */
+EIGEN_DEVICE_FUNC inline const CbrtReturnType cbrt() const { return CbrtReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise inverse square root of *this.
+ *
+ * This function computes the coefficient-wise inverse square root.
+ *
+ * Example: \include Cwise_sqrt.cpp
+ * Output: \verbinclude Cwise_sqrt.out
+ *
+ * \sa pow(), square()
+ */
+EIGEN_DEVICE_FUNC inline const RsqrtReturnType rsqrt() const { return RsqrtReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise signum of *this.
+ *
+ * This function computes the coefficient-wise signum.
+ *
+ * Example: \include Cwise_sign.cpp
+ * Output: \verbinclude Cwise_sign.out
+ *
+ * \sa pow(), square()
+ */
+EIGEN_DEVICE_FUNC inline const SignReturnType sign() const { return SignReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise cosine of *this.
+ *
+ * This function computes the coefficient-wise cosine. The function MatrixBase::cos() in the
+ * unsupported module MatrixFunctions computes the matrix cosine.
+ *
+ * Example: \include Cwise_cos.cpp
+ * Output: \verbinclude Cwise_cos.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cos">Math functions</a>, sin(), acos()
+ */
+EIGEN_DEVICE_FUNC inline const CosReturnType cos() const { return CosReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise sine of *this.
+ *
+ * This function computes the coefficient-wise sine. The function MatrixBase::sin() in the
+ * unsupported module MatrixFunctions computes the matrix sine.
+ *
+ * Example: \include Cwise_sin.cpp
+ * Output: \verbinclude Cwise_sin.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sin">Math functions</a>, cos(), asin()
+ */
+EIGEN_DEVICE_FUNC inline const SinReturnType sin() const { return SinReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise tan of *this.
+ *
+ * Example: \include Cwise_tan.cpp
+ * Output: \verbinclude Cwise_tan.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_tan">Math functions</a>, cos(), sin()
+ */
+EIGEN_DEVICE_FUNC inline const TanReturnType tan() const { return TanReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise arc tan of *this.
+ *
+ * Example: \include Cwise_atan.cpp
+ * Output: \verbinclude Cwise_atan.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_atan">Math functions</a>, tan(), asin(), acos()
+ */
+EIGEN_DEVICE_FUNC inline const AtanReturnType atan() const { return AtanReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise arc cosine of *this.
+ *
+ * Example: \include Cwise_acos.cpp
+ * Output: \verbinclude Cwise_acos.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_acos">Math functions</a>, cos(), asin()
+ */
+EIGEN_DEVICE_FUNC inline const AcosReturnType acos() const { return AcosReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise arc sine of *this.
+ *
+ * Example: \include Cwise_asin.cpp
+ * Output: \verbinclude Cwise_asin.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_asin">Math functions</a>, sin(), acos()
+ */
+EIGEN_DEVICE_FUNC inline const AsinReturnType asin() const { return AsinReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise hyperbolic tan of *this.
+ *
+ * Example: \include Cwise_tanh.cpp
+ * Output: \verbinclude Cwise_tanh.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_tanh">Math functions</a>, tan(), sinh(), cosh()
+ */
+EIGEN_DEVICE_FUNC inline const TanhReturnType tanh() const { return TanhReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise hyperbolic sin of *this.
+ *
+ * Example: \include Cwise_sinh.cpp
+ * Output: \verbinclude Cwise_sinh.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_sinh">Math functions</a>, sin(), tanh(), cosh()
+ */
+EIGEN_DEVICE_FUNC inline const SinhReturnType sinh() const { return SinhReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise hyperbolic cos of *this.
+ *
+ * Example: \include Cwise_cosh.cpp
+ * Output: \verbinclude Cwise_cosh.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cosh">Math functions</a>, tanh(), sinh(), cosh()
+ */
+EIGEN_DEVICE_FUNC inline const CoshReturnType cosh() const { return CoshReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise inverse hyperbolic tan of *this.
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_atanh">Math functions</a>, atanh(), asinh(), acosh()
+ */
+EIGEN_DEVICE_FUNC inline const AtanhReturnType atanh() const { return AtanhReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise inverse hyperbolic sin of *this.
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_asinh">Math functions</a>, atanh(), asinh(), acosh()
+ */
+EIGEN_DEVICE_FUNC inline const AsinhReturnType asinh() const { return AsinhReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise inverse hyperbolic cos of *this.
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_acosh">Math functions</a>, atanh(), asinh(), acosh()
+ */
+EIGEN_DEVICE_FUNC inline const AcoshReturnType acosh() const { return AcoshReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise logistic of *this.
+ */
+EIGEN_DEVICE_FUNC inline const LogisticReturnType logistic() const { return LogisticReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise inverse of *this.
+ *
+ * Example: \include Cwise_inverse.cpp
+ * Output: \verbinclude Cwise_inverse.out
+ *
+ * \sa operator/(), operator*()
+ */
+EIGEN_DEVICE_FUNC inline const InverseReturnType inverse() const { return InverseReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise square of *this.
+ *
+ * Example: \include Cwise_square.cpp
+ * Output: \verbinclude Cwise_square.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_squareE">Math functions</a>, abs2(), cube(), pow()
+ */
+EIGEN_DEVICE_FUNC inline const SquareReturnType square() const { return SquareReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise cube of *this.
+ *
+ * Example: \include Cwise_cube.cpp
+ * Output: \verbinclude Cwise_cube.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_cube">Math functions</a>, square(), pow()
+ */
+EIGEN_DEVICE_FUNC inline const CubeReturnType cube() const { return CubeReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise rint of *this.
+ *
+ * Example: \include Cwise_rint.cpp
+ * Output: \verbinclude Cwise_rint.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_rint">Math functions</a>, ceil(), floor()
+ */
+EIGEN_DEVICE_FUNC inline const RintReturnType rint() const { return RintReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise round of *this.
+ *
+ * Example: \include Cwise_round.cpp
+ * Output: \verbinclude Cwise_round.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_round">Math functions</a>, ceil(), floor()
+ */
+EIGEN_DEVICE_FUNC inline const RoundReturnType round() const { return RoundReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise floor of *this.
+ *
+ * Example: \include Cwise_floor.cpp
+ * Output: \verbinclude Cwise_floor.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_floor">Math functions</a>, ceil(), round()
+ */
+EIGEN_DEVICE_FUNC inline const FloorReturnType floor() const { return FloorReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise ceil of *this.
+ *
+ * Example: \include Cwise_ceil.cpp
+ * Output: \verbinclude Cwise_ceil.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_ceil">Math functions</a>, floor(), round()
+ */
+EIGEN_DEVICE_FUNC inline const CeilReturnType ceil() const { return CeilReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise truncation of *this.
+ *
+ * Example: \include Cwise_trunc.cpp
+ * Output: \verbinclude Cwise_trunc.out
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_trunc">Math functions</a>, floor(), round()
+ */
+EIGEN_DEVICE_FUNC inline const TruncReturnType trunc() const { return TruncReturnType(derived()); }
+
+template <int N>
+struct ShiftRightXpr {
+  typedef CwiseUnaryOp<internal::scalar_shift_right_op<Scalar, N>, const Derived> Type;
+};
+
+/** \returns an expression of \c *this with the \a Scalar type arithmetically
+ * shifted right by \a N bit positions.
+ *
+ * The template parameter \a N specifies the number of bit positions to shift.
+ *
+ * \sa shiftLeft()
+ */
+template <int N>
+EIGEN_DEVICE_FUNC typename ShiftRightXpr<N>::Type shiftRight() const {
+  return typename ShiftRightXpr<N>::Type(derived());
+}
+
+template <int N>
+struct ShiftLeftXpr {
+  typedef CwiseUnaryOp<internal::scalar_shift_left_op<Scalar, N>, const Derived> Type;
+};
+
+/** \returns an expression of \c *this with the \a Scalar type logically
+ * shifted left by \a N bit positions.
+ *
+ * The template parameter \a N specifies the number of bit positions to shift.
+ *
+ * \sa shiftRight()
+ */
+template <int N>
+EIGEN_DEVICE_FUNC typename ShiftLeftXpr<N>::Type shiftLeft() const {
+  return typename ShiftLeftXpr<N>::Type(derived());
+}
+
+/** \returns an expression of the coefficient-wise isnan of *this.
+ *
+ * Example: \include Cwise_isNaN.cpp
+ * Output: \verbinclude Cwise_isNaN.out
+ *
+ * \sa isfinite(), isinf()
+ */
+EIGEN_DEVICE_FUNC inline const IsNaNReturnType isNaN() const { return IsNaNReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise isinf of *this.
+ *
+ * Example: \include Cwise_isInf.cpp
+ * Output: \verbinclude Cwise_isInf.out
+ *
+ * \sa isnan(), isfinite()
+ */
+EIGEN_DEVICE_FUNC inline const IsInfReturnType isInf() const { return IsInfReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise isfinite of *this.
+ *
+ * Example: \include Cwise_isFinite.cpp
+ * Output: \verbinclude Cwise_isFinite.out
+ *
+ * \sa isnan(), isinf()
+ */
+EIGEN_DEVICE_FUNC inline const IsFiniteReturnType isFinite() const { return IsFiniteReturnType(derived()); }
+EIGEN_DEVICE_FUNC inline const IsFiniteTypedReturnType isFiniteTyped() const {
+  return IsFiniteTypedReturnType(derived());
+}
+
+/** \returns an expression of the coefficient-wise ! operator of *this
+ *
+ * Example: \include Cwise_boolean_not.cpp
+ * Output: \verbinclude Cwise_boolean_not.out
+ *
+ * \sa operator!=()
+ */
+EIGEN_DEVICE_FUNC inline const BooleanNotReturnType operator!() const { return BooleanNotReturnType(derived()); }
+
+/** \returns an expression of the bitwise ~ operator of *this
+ */
+EIGEN_DEVICE_FUNC inline const BitwiseNotReturnType operator~() const { return BitwiseNotReturnType(derived()); }
+
+// --- SpecialFunctions module ---
+
+typedef CwiseUnaryOp<internal::scalar_lgamma_op<Scalar>, const Derived> LgammaReturnType;
+typedef CwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived> DigammaReturnType;
+typedef CwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived> ErfReturnType;
+typedef CwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived> ErfcReturnType;
+typedef CwiseUnaryOp<internal::scalar_ndtri_op<Scalar>, const Derived> NdtriReturnType;
+
+/** \cpp11 \returns an expression of the coefficient-wise ln(|gamma(*this)|).
+ *
+ * \specialfunctions_module
+ *
+ * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations of lgamma(T) for any scalar
+ * type T to be supported.
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_lgamma">Math functions</a>, digamma()
+ */
+EIGEN_DEVICE_FUNC inline const LgammaReturnType lgamma() const { return LgammaReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise digamma (psi, derivative of lgamma).
+ *
+ * \specialfunctions_module
+ *
+ * \note This function supports only float and double scalar types. To support other scalar types,
+ * the user has to provide implementations of digamma(T) for any scalar
+ * type T to be supported.
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_digamma">Math functions</a>, Eigen::digamma(),
+ * Eigen::polygamma(), lgamma()
+ */
+EIGEN_DEVICE_FUNC inline const DigammaReturnType digamma() const { return DigammaReturnType(derived()); }
+
+/** \cpp11 \returns an expression of the coefficient-wise Gauss error
+ * function of *this.
+ *
+ * \specialfunctions_module
+ *
+ * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations of erf(T) for any scalar
+ * type T to be supported.
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_erf">Math functions</a>, erfc()
+ */
+EIGEN_DEVICE_FUNC inline const ErfReturnType erf() const { return ErfReturnType(derived()); }
+
+/** \cpp11 \returns an expression of the coefficient-wise Complementary error
+ * function of *this.
+ *
+ * \specialfunctions_module
+ *
+ * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations of erfc(T) for any scalar
+ * type T to be supported.
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_erfc">Math functions</a>, erf()
+ */
+EIGEN_DEVICE_FUNC inline const ErfcReturnType erfc() const { return ErfcReturnType(derived()); }
+
+/** \returns an expression of the coefficient-wise inverse of the CDF of the Normal distribution function
+ * function of *this.
+ *
+ * \specialfunctions_module
+ *
+ * In other words, considering `x = ndtri(y)`, it returns the argument, x, for which the area under the
+ * Gaussian probability density function (integrated from minus infinity to x) is equal to y.
+ *
+ * \note This function supports only float and double scalar types. To support other scalar types,
+ * the user has to provide implementations of ndtri(T) for any scalar type T to be supported.
+ *
+ * \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_ndtri">Math functions</a>
+ */
+EIGEN_DEVICE_FUNC inline const NdtriReturnType ndtri() const { return NdtriReturnType(derived()); }
+
+template <typename ScalarExponent>
+using UnaryPowReturnType =
+    std::enable_if_t<internal::is_arithmetic<typename NumTraits<ScalarExponent>::Real>::value,
+                     CwiseUnaryOp<internal::scalar_unary_pow_op<Scalar, ScalarExponent>, const Derived>>;
+
+/** \returns an expression of the coefficients of \c *this raised to the constant power \a exponent
+ *
+ * \tparam ScalarExponent is the scalar type of \a exponent. It must be compatible with the scalar type
+ *         of the given expression.
+ * \param exponent the scalar exponent value.
+ *
+ * This function computes the coefficient-wise power. The function MatrixBase::pow() in the
+ * unsupported module MatrixFunctions computes the matrix power.
+ *
+ * Example: \include Cwise_pow.cpp
+ * Output: \verbinclude Cwise_pow.out
+ *
+ * \sa ArrayBase::pow(ArrayBase), square(), cube(), exp(), log()
+ */
+template <typename ScalarExponent>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const UnaryPowReturnType<ScalarExponent> pow(
+    const ScalarExponent& exponent) const {
+  return UnaryPowReturnType<ScalarExponent>(derived(), internal::scalar_unary_pow_op<Scalar, ScalarExponent>(exponent));
+}
diff --git a/inst/include/Eigen/src/plugins/BlockMethods.h b/inst/include/Eigen/src/plugins/BlockMethods.h
deleted file mode 100644
index 2788251e..00000000
--- a/inst/include/Eigen/src/plugins/BlockMethods.h
+++ /dev/null
@@ -1,935 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-
-/** \internal expression type of a column */
-typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, 1, !IsRowMajor> ColXpr;
-typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, 1, !IsRowMajor> ConstColXpr;
-/** \internal expression type of a row */
-typedef Block<Derived, 1, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> RowXpr;
-typedef const Block<const Derived, 1, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> ConstRowXpr;
-/** \internal expression type of a block of whole columns */
-typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, Dynamic, !IsRowMajor> ColsBlockXpr;
-typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, Dynamic, !IsRowMajor> ConstColsBlockXpr;
-/** \internal expression type of a block of whole rows */
-typedef Block<Derived, Dynamic, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> RowsBlockXpr;
-typedef const Block<const Derived, Dynamic, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> ConstRowsBlockXpr;
-/** \internal expression type of a block of whole columns */
-template<int N> struct NColsBlockXpr { typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, N, !IsRowMajor> Type; };
-template<int N> struct ConstNColsBlockXpr { typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, N, !IsRowMajor> Type; };
-/** \internal expression type of a block of whole rows */
-template<int N> struct NRowsBlockXpr { typedef Block<Derived, N, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> Type; };
-template<int N> struct ConstNRowsBlockXpr { typedef const Block<const Derived, N, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> Type; };
-
-typedef VectorBlock<Derived> SegmentReturnType;
-typedef const VectorBlock<const Derived> ConstSegmentReturnType;
-template<int Size> struct FixedSegmentReturnType { typedef VectorBlock<Derived, Size> Type; };
-template<int Size> struct ConstFixedSegmentReturnType { typedef const VectorBlock<const Derived, Size> Type; };
-
-#endif // not EIGEN_PARSED_BY_DOXYGEN
-
-/** \returns a dynamic-size expression of a block in *this.
-  *
-  * \param startRow the first row in the block
-  * \param startCol the first column in the block
-  * \param blockRows the number of rows in the block
-  * \param blockCols the number of columns in the block
-  *
-  * Example: \include MatrixBase_block_int_int_int_int.cpp
-  * Output: \verbinclude MatrixBase_block_int_int_int_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size matrix, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, block(Index,Index)
-  */
-inline Block<Derived> block(Index startRow, Index startCol, Index blockRows, Index blockCols)
-{
-  return Block<Derived>(derived(), startRow, startCol, blockRows, blockCols);
-}
-
-/** This is the const version of block(Index,Index,Index,Index). */
-inline const Block<const Derived> block(Index startRow, Index startCol, Index blockRows, Index blockCols) const
-{
-  return Block<const Derived>(derived(), startRow, startCol, blockRows, blockCols);
-}
-
-
-
-
-/** \returns a dynamic-size expression of a top-right corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_topRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_topRightCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-inline Block<Derived> topRightCorner(Index cRows, Index cCols)
-{
-  return Block<Derived>(derived(), 0, cols() - cCols, cRows, cCols);
-}
-
-/** This is the const version of topRightCorner(Index, Index).*/
-inline const Block<const Derived> topRightCorner(Index cRows, Index cCols) const
-{
-  return Block<const Derived>(derived(), 0, cols() - cCols, cRows, cCols);
-}
-
-/** \returns an expression of a fixed-size top-right corner of *this.
-  *
-  * \tparam CRows the number of rows in the corner
-  * \tparam CCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_template_int_int_topRightCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topRightCorner.out
-  *
-  * \sa class Block, block<int,int>(Index,Index)
-  */
-template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> topRightCorner()
-{
-  return Block<Derived, CRows, CCols>(derived(), 0, cols() - CCols);
-}
-
-/** This is the const version of topRightCorner<int, int>().*/
-template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> topRightCorner() const
-{
-  return Block<const Derived, CRows, CCols>(derived(), 0, cols() - CCols);
-}
-
-/** \returns an expression of a top-right corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_topRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topRightCorner_int_int.out
-  *
-  * \sa class Block
-  */
-template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> topRightCorner(Index cRows, Index cCols)
-{
-  return Block<Derived, CRows, CCols>(derived(), 0, cols() - cCols, cRows, cCols);
-}
-
-/** This is the const version of topRightCorner<int, int>(Index, Index).*/
-template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> topRightCorner(Index cRows, Index cCols) const
-{
-  return Block<const Derived, CRows, CCols>(derived(), 0, cols() - cCols, cRows, cCols);
-}
-
-
-
-/** \returns a dynamic-size expression of a top-left corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_topLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_topLeftCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-inline Block<Derived> topLeftCorner(Index cRows, Index cCols)
-{
-  return Block<Derived>(derived(), 0, 0, cRows, cCols);
-}
-
-/** This is the const version of topLeftCorner(Index, Index).*/
-inline const Block<const Derived> topLeftCorner(Index cRows, Index cCols) const
-{
-  return Block<const Derived>(derived(), 0, 0, cRows, cCols);
-}
-
-/** \returns an expression of a fixed-size top-left corner of *this.
-  *
-  * The template parameters CRows and CCols are the number of rows and columns in the corner.
-  *
-  * Example: \include MatrixBase_template_int_int_topLeftCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topLeftCorner.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> topLeftCorner()
-{
-  return Block<Derived, CRows, CCols>(derived(), 0, 0);
-}
-
-/** This is the const version of topLeftCorner<int, int>().*/
-template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> topLeftCorner() const
-{
-  return Block<const Derived, CRows, CCols>(derived(), 0, 0);
-}
-
-/** \returns an expression of a top-left corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_topLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_topLeftCorner_int_int.out
-  *
-  * \sa class Block
-  */
-template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> topLeftCorner(Index cRows, Index cCols)
-{
-  return Block<Derived, CRows, CCols>(derived(), 0, 0, cRows, cCols);
-}
-
-/** This is the const version of topLeftCorner<int, int>(Index, Index).*/
-template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> topLeftCorner(Index cRows, Index cCols) const
-{
-  return Block<const Derived, CRows, CCols>(derived(), 0, 0, cRows, cCols);
-}
-
-
-
-/** \returns a dynamic-size expression of a bottom-right corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_bottomRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_bottomRightCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-inline Block<Derived> bottomRightCorner(Index cRows, Index cCols)
-{
-  return Block<Derived>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
-}
-
-/** This is the const version of bottomRightCorner(Index, Index).*/
-inline const Block<const Derived> bottomRightCorner(Index cRows, Index cCols) const
-{
-  return Block<const Derived>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
-}
-
-/** \returns an expression of a fixed-size bottom-right corner of *this.
-  *
-  * The template parameters CRows and CCols are the number of rows and columns in the corner.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomRightCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> bottomRightCorner()
-{
-  return Block<Derived, CRows, CCols>(derived(), rows() - CRows, cols() - CCols);
-}
-
-/** This is the const version of bottomRightCorner<int, int>().*/
-template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> bottomRightCorner() const
-{
-  return Block<const Derived, CRows, CCols>(derived(), rows() - CRows, cols() - CCols);
-}
-
-/** \returns an expression of a bottom-right corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomRightCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner_int_int.out
-  *
-  * \sa class Block
-  */
-template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> bottomRightCorner(Index cRows, Index cCols)
-{
-  return Block<Derived, CRows, CCols>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
-}
-
-/** This is the const version of bottomRightCorner<int, int>(Index, Index).*/
-template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> bottomRightCorner(Index cRows, Index cCols) const
-{
-  return Block<const Derived, CRows, CCols>(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
-}
-
-
-
-/** \returns a dynamic-size expression of a bottom-left corner of *this.
-  *
-  * \param cRows the number of rows in the corner
-  * \param cCols the number of columns in the corner
-  *
-  * Example: \include MatrixBase_bottomLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_bottomLeftCorner_int_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-inline Block<Derived> bottomLeftCorner(Index cRows, Index cCols)
-{
-  return Block<Derived>(derived(), rows() - cRows, 0, cRows, cCols);
-}
-
-/** This is the const version of bottomLeftCorner(Index, Index).*/
-inline const Block<const Derived> bottomLeftCorner(Index cRows, Index cCols) const
-{
-  return Block<const Derived>(derived(), rows() - cRows, 0, cRows, cCols);
-}
-
-/** \returns an expression of a fixed-size bottom-left corner of *this.
-  *
-  * The template parameters CRows and CCols are the number of rows and columns in the corner.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomLeftCorner.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> bottomLeftCorner()
-{
-  return Block<Derived, CRows, CCols>(derived(), rows() - CRows, 0);
-}
-
-/** This is the const version of bottomLeftCorner<int, int>().*/
-template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> bottomLeftCorner() const
-{
-  return Block<const Derived, CRows, CCols>(derived(), rows() - CRows, 0);
-}
-
-/** \returns an expression of a bottom-left corner of *this.
-  *
-  * \tparam CRows number of rows in corner as specified at compile-time
-  * \tparam CCols number of columns in corner as specified at compile-time
-  * \param  cRows number of rows in corner as specified at run-time
-  * \param  cCols number of columns in corner as specified at run-time
-  *
-  * This function is mainly useful for corners where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a cRows should equal \a CRows unless
-  * \a CRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_bottomLeftCorner_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner_int_int.out
-  *
-  * \sa class Block
-  */
-template<int CRows, int CCols>
-inline Block<Derived, CRows, CCols> bottomLeftCorner(Index cRows, Index cCols)
-{
-  return Block<Derived, CRows, CCols>(derived(), rows() - cRows, 0, cRows, cCols);
-}
-
-/** This is the const version of bottomLeftCorner<int, int>(Index, Index).*/
-template<int CRows, int CCols>
-inline const Block<const Derived, CRows, CCols> bottomLeftCorner(Index cRows, Index cCols) const
-{
-  return Block<const Derived, CRows, CCols>(derived(), rows() - cRows, 0, cRows, cCols);
-}
-
-
-
-/** \returns a block consisting of the top rows of *this.
-  *
-  * \param n the number of rows in the block
-  *
-  * Example: \include MatrixBase_topRows_int.cpp
-  * Output: \verbinclude MatrixBase_topRows_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-inline RowsBlockXpr topRows(Index n)
-{
-  return RowsBlockXpr(derived(), 0, 0, n, cols());
-}
-
-/** This is the const version of topRows(Index).*/
-inline ConstRowsBlockXpr topRows(Index n) const
-{
-  return ConstRowsBlockXpr(derived(), 0, 0, n, cols());
-}
-
-/** \returns a block consisting of the top rows of *this.
-  *
-  * \tparam N the number of rows in the block as specified at compile-time
-  * \param n the number of rows in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_topRows.cpp
-  * Output: \verbinclude MatrixBase_template_int_topRows.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<int N>
-inline typename NRowsBlockXpr<N>::Type topRows(Index n = N)
-{
-  return typename NRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
-}
-
-/** This is the const version of topRows<int>().*/
-template<int N>
-inline typename ConstNRowsBlockXpr<N>::Type topRows(Index n = N) const
-{
-  return typename ConstNRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
-}
-
-
-
-/** \returns a block consisting of the bottom rows of *this.
-  *
-  * \param n the number of rows in the block
-  *
-  * Example: \include MatrixBase_bottomRows_int.cpp
-  * Output: \verbinclude MatrixBase_bottomRows_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-inline RowsBlockXpr bottomRows(Index n)
-{
-  return RowsBlockXpr(derived(), rows() - n, 0, n, cols());
-}
-
-/** This is the const version of bottomRows(Index).*/
-inline ConstRowsBlockXpr bottomRows(Index n) const
-{
-  return ConstRowsBlockXpr(derived(), rows() - n, 0, n, cols());
-}
-
-/** \returns a block consisting of the bottom rows of *this.
-  *
-  * \tparam N the number of rows in the block as specified at compile-time
-  * \param n the number of rows in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_bottomRows.cpp
-  * Output: \verbinclude MatrixBase_template_int_bottomRows.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<int N>
-inline typename NRowsBlockXpr<N>::Type bottomRows(Index n = N)
-{
-  return typename NRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
-}
-
-/** This is the const version of bottomRows<int>().*/
-template<int N>
-inline typename ConstNRowsBlockXpr<N>::Type bottomRows(Index n = N) const
-{
-  return typename ConstNRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
-}
-
-
-
-/** \returns a block consisting of a range of rows of *this.
-  *
-  * \param startRow the index of the first row in the block
-  * \param n the number of rows in the block
-  *
-  * Example: \include DenseBase_middleRows_int.cpp
-  * Output: \verbinclude DenseBase_middleRows_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-inline RowsBlockXpr middleRows(Index startRow, Index n)
-{
-  return RowsBlockXpr(derived(), startRow, 0, n, cols());
-}
-
-/** This is the const version of middleRows(Index,Index).*/
-inline ConstRowsBlockXpr middleRows(Index startRow, Index n) const
-{
-  return ConstRowsBlockXpr(derived(), startRow, 0, n, cols());
-}
-
-/** \returns a block consisting of a range of rows of *this.
-  *
-  * \tparam N the number of rows in the block as specified at compile-time
-  * \param startRow the index of the first row in the block
-  * \param n the number of rows in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include DenseBase_template_int_middleRows.cpp
-  * Output: \verbinclude DenseBase_template_int_middleRows.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<int N>
-inline typename NRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N)
-{
-  return typename NRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
-}
-
-/** This is the const version of middleRows<int>().*/
-template<int N>
-inline typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N) const
-{
-  return typename ConstNRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
-}
-
-
-
-/** \returns a block consisting of the left columns of *this.
-  *
-  * \param n the number of columns in the block
-  *
-  * Example: \include MatrixBase_leftCols_int.cpp
-  * Output: \verbinclude MatrixBase_leftCols_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-inline ColsBlockXpr leftCols(Index n)
-{
-  return ColsBlockXpr(derived(), 0, 0, rows(), n);
-}
-
-/** This is the const version of leftCols(Index).*/
-inline ConstColsBlockXpr leftCols(Index n) const
-{
-  return ConstColsBlockXpr(derived(), 0, 0, rows(), n);
-}
-
-/** \returns a block consisting of the left columns of *this.
-  *
-  * \tparam N the number of columns in the block as specified at compile-time
-  * \param n the number of columns in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_leftCols.cpp
-  * Output: \verbinclude MatrixBase_template_int_leftCols.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<int N>
-inline typename NColsBlockXpr<N>::Type leftCols(Index n = N)
-{
-  return typename NColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
-}
-
-/** This is the const version of leftCols<int>().*/
-template<int N>
-inline typename ConstNColsBlockXpr<N>::Type leftCols(Index n = N) const
-{
-  return typename ConstNColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
-}
-
-
-
-/** \returns a block consisting of the right columns of *this.
-  *
-  * \param n the number of columns in the block
-  *
-  * Example: \include MatrixBase_rightCols_int.cpp
-  * Output: \verbinclude MatrixBase_rightCols_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-inline ColsBlockXpr rightCols(Index n)
-{
-  return ColsBlockXpr(derived(), 0, cols() - n, rows(), n);
-}
-
-/** This is the const version of rightCols(Index).*/
-inline ConstColsBlockXpr rightCols(Index n) const
-{
-  return ConstColsBlockXpr(derived(), 0, cols() - n, rows(), n);
-}
-
-/** \returns a block consisting of the right columns of *this.
-  *
-  * \tparam N the number of columns in the block as specified at compile-time
-  * \param n the number of columns in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_rightCols.cpp
-  * Output: \verbinclude MatrixBase_template_int_rightCols.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<int N>
-inline typename NColsBlockXpr<N>::Type rightCols(Index n = N)
-{
-  return typename NColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
-}
-
-/** This is the const version of rightCols<int>().*/
-template<int N>
-inline typename ConstNColsBlockXpr<N>::Type rightCols(Index n = N) const
-{
-  return typename ConstNColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
-}
-
-
-
-/** \returns a block consisting of a range of columns of *this.
-  *
-  * \param startCol the index of the first column in the block
-  * \param numCols the number of columns in the block
-  *
-  * Example: \include DenseBase_middleCols_int.cpp
-  * Output: \verbinclude DenseBase_middleCols_int.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-inline ColsBlockXpr middleCols(Index startCol, Index numCols)
-{
-  return ColsBlockXpr(derived(), 0, startCol, rows(), numCols);
-}
-
-/** This is the const version of middleCols(Index,Index).*/
-inline ConstColsBlockXpr middleCols(Index startCol, Index numCols) const
-{
-  return ConstColsBlockXpr(derived(), 0, startCol, rows(), numCols);
-}
-
-/** \returns a block consisting of a range of columns of *this.
-  *
-  * \tparam N the number of columns in the block as specified at compile-time
-  * \param startCol the index of the first column in the block
-  * \param n the number of columns in the block as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include DenseBase_template_int_middleCols.cpp
-  * Output: \verbinclude DenseBase_template_int_middleCols.out
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<int N>
-inline typename NColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N)
-{
-  return typename NColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
-}
-
-/** This is the const version of middleCols<int>().*/
-template<int N>
-inline typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N) const
-{
-  return typename ConstNColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
-}
-
-
-
-/** \returns a fixed-size expression of a block in *this.
-  *
-  * The template parameters \a BlockRows and \a BlockCols are the number of
-  * rows and columns in the block.
-  *
-  * \param startRow the first row in the block
-  * \param startCol the first column in the block
-  *
-  * Example: \include MatrixBase_block_int_int.cpp
-  * Output: \verbinclude MatrixBase_block_int_int.out
-  *
-  * \note since block is a templated member, the keyword template has to be used
-  * if the matrix type is also a template parameter: \code m.template block<3,3>(1,1); \endcode
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<int BlockRows, int BlockCols>
-inline Block<Derived, BlockRows, BlockCols> block(Index startRow, Index startCol)
-{
-  return Block<Derived, BlockRows, BlockCols>(derived(), startRow, startCol);
-}
-
-/** This is the const version of block<>(Index, Index). */
-template<int BlockRows, int BlockCols>
-inline const Block<const Derived, BlockRows, BlockCols> block(Index startRow, Index startCol) const
-{
-  return Block<const Derived, BlockRows, BlockCols>(derived(), startRow, startCol);
-}
-
-/** \returns an expression of a block in *this.
-  *
-  * \tparam BlockRows number of rows in block as specified at compile-time
-  * \tparam BlockCols number of columns in block as specified at compile-time
-  * \param  startRow  the first row in the block
-  * \param  startCol  the first column in the block
-  * \param  blockRows number of rows in block as specified at run-time
-  * \param  blockCols number of columns in block as specified at run-time
-  *
-  * This function is mainly useful for blocks where the number of rows is specified at compile-time
-  * and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
-  * information should not contradict. In other words, \a blockRows should equal \a BlockRows unless
-  * \a BlockRows is \a Dynamic, and the same for the number of columns.
-  *
-  * Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp
-  * Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.cpp
-  *
-  * \sa class Block, block(Index,Index,Index,Index)
-  */
-template<int BlockRows, int BlockCols>
-inline Block<Derived, BlockRows, BlockCols> block(Index startRow, Index startCol, 
-                                                  Index blockRows, Index blockCols)
-{
-  return Block<Derived, BlockRows, BlockCols>(derived(), startRow, startCol, blockRows, blockCols);
-}
-
-/** This is the const version of block<>(Index, Index, Index, Index). */
-template<int BlockRows, int BlockCols>
-inline const Block<const Derived, BlockRows, BlockCols> block(Index startRow, Index startCol,
-                                                              Index blockRows, Index blockCols) const
-{
-  return Block<const Derived, BlockRows, BlockCols>(derived(), startRow, startCol, blockRows, blockCols);
-}
-
-/** \returns an expression of the \a i-th column of *this. Note that the numbering starts at 0.
-  *
-  * Example: \include MatrixBase_col.cpp
-  * Output: \verbinclude MatrixBase_col.out
-  *
-  * \sa row(), class Block */
-inline ColXpr col(Index i)
-{
-  return ColXpr(derived(), i);
-}
-
-/** This is the const version of col(). */
-inline ConstColXpr col(Index i) const
-{
-  return ConstColXpr(derived(), i);
-}
-
-/** \returns an expression of the \a i-th row of *this. Note that the numbering starts at 0.
-  *
-  * Example: \include MatrixBase_row.cpp
-  * Output: \verbinclude MatrixBase_row.out
-  *
-  * \sa col(), class Block */
-inline RowXpr row(Index i)
-{
-  return RowXpr(derived(), i);
-}
-
-/** This is the const version of row(). */
-inline ConstRowXpr row(Index i) const
-{
-  return ConstRowXpr(derived(), i);
-}
-
-/** \returns a dynamic-size expression of a segment (i.e. a vector block) in *this.
-  *
-  * \only_for_vectors
-  *
-  * \param start the first coefficient in the segment
-  * \param n the number of coefficients in the segment
-  *
-  * Example: \include MatrixBase_segment_int_int.cpp
-  * Output: \verbinclude MatrixBase_segment_int_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size vector, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, segment(Index)
-  */
-inline SegmentReturnType segment(Index start, Index n)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return SegmentReturnType(derived(), start, n);
-}
-
-
-/** This is the const version of segment(Index,Index).*/
-inline ConstSegmentReturnType segment(Index start, Index n) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return ConstSegmentReturnType(derived(), start, n);
-}
-
-/** \returns a dynamic-size expression of the first coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \param n the number of coefficients in the segment
-  *
-  * Example: \include MatrixBase_start_int.cpp
-  * Output: \verbinclude MatrixBase_start_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size vector, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, block(Index,Index)
-  */
-inline SegmentReturnType head(Index n)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return SegmentReturnType(derived(), 0, n);
-}
-
-/** This is the const version of head(Index).*/
-inline ConstSegmentReturnType head(Index n) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return ConstSegmentReturnType(derived(), 0, n);
-}
-
-/** \returns a dynamic-size expression of the last coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \param n the number of coefficients in the segment
-  *
-  * Example: \include MatrixBase_end_int.cpp
-  * Output: \verbinclude MatrixBase_end_int.out
-  *
-  * \note Even though the returned expression has dynamic size, in the case
-  * when it is applied to a fixed-size vector, it inherits a fixed maximal size,
-  * which means that evaluating it does not cause a dynamic memory allocation.
-  *
-  * \sa class Block, block(Index,Index)
-  */
-inline SegmentReturnType tail(Index n)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return SegmentReturnType(derived(), this->size() - n, n);
-}
-
-/** This is the const version of tail(Index).*/
-inline ConstSegmentReturnType tail(Index n) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return ConstSegmentReturnType(derived(), this->size() - n, n);
-}
-
-/** \returns a fixed-size expression of a segment (i.e. a vector block) in \c *this
-  *
-  * \only_for_vectors
-  *
-  * \tparam N the number of coefficients in the segment as specified at compile-time
-  * \param start the index of the first element in the segment
-  * \param n the number of coefficients in the segment as specified at compile-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_segment.cpp
-  * Output: \verbinclude MatrixBase_template_int_segment.out
-  *
-  * \sa class Block
-  */
-template<int N>
-inline typename FixedSegmentReturnType<N>::Type segment(Index start, Index n = N)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename FixedSegmentReturnType<N>::Type(derived(), start, n);
-}
-
-/** This is the const version of segment<int>(Index).*/
-template<int N>
-inline typename ConstFixedSegmentReturnType<N>::Type segment(Index start, Index n = N) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename ConstFixedSegmentReturnType<N>::Type(derived(), start, n);
-}
-
-/** \returns a fixed-size expression of the first coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \tparam N the number of coefficients in the segment as specified at compile-time
-  * \param  n the number of coefficients in the segment as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_start.cpp
-  * Output: \verbinclude MatrixBase_template_int_start.out
-  *
-  * \sa class Block
-  */
-template<int N>
-inline typename FixedSegmentReturnType<N>::Type head(Index n = N)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename FixedSegmentReturnType<N>::Type(derived(), 0, n);
-}
-
-/** This is the const version of head<int>().*/
-template<int N>
-inline typename ConstFixedSegmentReturnType<N>::Type head(Index n = N) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename ConstFixedSegmentReturnType<N>::Type(derived(), 0, n);
-}
-
-/** \returns a fixed-size expression of the last coefficients of *this.
-  *
-  * \only_for_vectors
-  *
-  * \tparam N the number of coefficients in the segment as specified at compile-time
-  * \param  n the number of coefficients in the segment as specified at run-time
-  *
-  * The compile-time and run-time information should not contradict. In other words,
-  * \a n should equal \a N unless \a N is \a Dynamic.
-  *
-  * Example: \include MatrixBase_template_int_end.cpp
-  * Output: \verbinclude MatrixBase_template_int_end.out
-  *
-  * \sa class Block
-  */
-template<int N>
-inline typename FixedSegmentReturnType<N>::Type tail(Index n = N)
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename FixedSegmentReturnType<N>::Type(derived(), size() - n);
-}
-
-/** This is the const version of tail<int>.*/
-template<int N>
-inline typename ConstFixedSegmentReturnType<N>::Type tail(Index n = N) const
-{
-  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
-  return typename ConstFixedSegmentReturnType<N>::Type(derived(), size() - n);
-}
diff --git a/inst/include/Eigen/src/plugins/BlockMethods.inc b/inst/include/Eigen/src/plugins/BlockMethods.inc
new file mode 100644
index 00000000..0782aa39
--- /dev/null
+++ b/inst/include/Eigen/src/plugins/BlockMethods.inc
@@ -0,0 +1,1370 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2010 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+/// \internal expression type of a column */
+typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, 1, !IsRowMajor> ColXpr;
+typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, 1, !IsRowMajor> ConstColXpr;
+/// \internal expression type of a row */
+typedef Block<Derived, 1, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> RowXpr;
+typedef const Block<const Derived, 1, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> ConstRowXpr;
+/// \internal expression type of a block of whole columns */
+typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, Dynamic, !IsRowMajor> ColsBlockXpr;
+typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, Dynamic, !IsRowMajor>
+    ConstColsBlockXpr;
+/// \internal expression type of a block of whole rows */
+typedef Block<Derived, Dynamic, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> RowsBlockXpr;
+typedef const Block<const Derived, Dynamic, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> ConstRowsBlockXpr;
+/// \internal expression type of a block of whole columns */
+template <int N>
+struct NColsBlockXpr {
+  typedef Block<Derived, internal::traits<Derived>::RowsAtCompileTime, N, !IsRowMajor> Type;
+};
+template <int N>
+struct ConstNColsBlockXpr {
+  typedef const Block<const Derived, internal::traits<Derived>::RowsAtCompileTime, N, !IsRowMajor> Type;
+};
+/// \internal expression type of a block of whole rows */
+template <int N>
+struct NRowsBlockXpr {
+  typedef Block<Derived, N, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> Type;
+};
+template <int N>
+struct ConstNRowsBlockXpr {
+  typedef const Block<const Derived, N, internal::traits<Derived>::ColsAtCompileTime, IsRowMajor> Type;
+};
+/// \internal expression of a block */
+typedef Block<Derived> BlockXpr;
+typedef const Block<const Derived> ConstBlockXpr;
+/// \internal expression of a block of fixed sizes */
+template <int Rows, int Cols>
+struct FixedBlockXpr {
+  typedef Block<Derived, Rows, Cols> Type;
+};
+template <int Rows, int Cols>
+struct ConstFixedBlockXpr {
+  typedef Block<const Derived, Rows, Cols> Type;
+};
+
+typedef VectorBlock<Derived> SegmentReturnType;
+typedef const VectorBlock<const Derived> ConstSegmentReturnType;
+template <int Size>
+struct FixedSegmentReturnType {
+  typedef VectorBlock<Derived, Size> Type;
+};
+template <int Size>
+struct ConstFixedSegmentReturnType {
+  typedef const VectorBlock<const Derived, Size> Type;
+};
+
+/// \internal inner-vector
+typedef Block<Derived, IsRowMajor ? 1 : Dynamic, IsRowMajor ? Dynamic : 1, true> InnerVectorReturnType;
+typedef Block<const Derived, IsRowMajor ? 1 : Dynamic, IsRowMajor ? Dynamic : 1, true> ConstInnerVectorReturnType;
+
+/// \internal set of inner-vectors
+typedef Block<Derived, Dynamic, Dynamic, true> InnerVectorsReturnType;
+typedef Block<const Derived, Dynamic, Dynamic, true> ConstInnerVectorsReturnType;
+
+#endif  // not EIGEN_PARSED_BY_DOXYGEN
+
+/// \returns an expression of a block in \c *this with either dynamic or fixed sizes.
+///
+/// \param  startRow  the first row in the block
+/// \param  startCol  the first column in the block
+/// \param  blockRows number of rows in the block, specified at either run-time or compile-time
+/// \param  blockCols number of columns in the block, specified at either run-time or compile-time
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
+///
+/// Example using runtime (aka dynamic) sizes: \include MatrixBase_block_int_int_int_int.cpp
+/// Output: \verbinclude MatrixBase_block_int_int_int_int.out
+///
+/// \newin{3.4}:
+///
+/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing
+/// Eigen::fix<N>, or Eigen::fix<N>(n) as arguments. In the later case, \c n plays the role of a runtime fallback value
+/// in case \c N equals Eigen::Dynamic. Here is an example with a fixed number of rows \c NRows and dynamic number of
+/// columns \c cols: \code mat.block(i,j,fix<NRows>,cols) \endcode
+///
+/// This function thus fully covers the features offered by the following overloads block<NRows,NCols>(Index, Index),
+/// and block<NRows,NCols>(Index, Index, Index, Index) that are thus obsolete. Indeed, this generic version avoids
+/// redundancy, it preserves the argument order, and prevents the need to rely on the template keyword in templated
+/// code.
+///
+/// but with less redundancy and more consistency as it does not modify the argument order
+/// and seamlessly enable hybrid fixed/dynamic sizes.
+///
+/// \note Even in the case that the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size matrix, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, fix, fix<N>(int)
+///
+template <typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
+                           internal::get_fixed_value<NColsType>::value>::Type
+#else
+    typename FixedBlockXpr<..., ...>::Type
+#endif
+    block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols) {
+  return
+      typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
+                             internal::get_fixed_value<NColsType>::value>::Type(derived(), startRow, startCol,
+                                                                                internal::get_runtime_value(blockRows),
+                                                                                internal::get_runtime_value(blockCols));
+}
+
+/// This is the const version of block(Index,Index,NRowsType,NColsType)
+template <typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
+                                      internal::get_fixed_value<NColsType>::value>::Type
+#else
+    const typename ConstFixedBlockXpr<..., ...>::Type
+#endif
+    block(Index startRow, Index startCol, NRowsType blockRows, NColsType blockCols) const {
+  return typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
+                                     internal::get_fixed_value<NColsType>::value>::Type(derived(), startRow, startCol,
+                                                                                        internal::get_runtime_value(
+                                                                                            blockRows),
+                                                                                        internal::get_runtime_value(
+                                                                                            blockCols));
+}
+
+/// \returns a expression of a top-right corner of \c *this with either dynamic or fixed sizes.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
+///
+/// Example with dynamic sizes: \include MatrixBase_topRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_topRightCorner_int_int.out
+///
+/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing
+/// Eigen::fix<N>, or Eigen::fix<N>(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink
+/// for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
+                           internal::get_fixed_value<NColsType>::value>::Type
+#else
+    typename FixedBlockXpr<..., ...>::Type
+#endif
+    topRightCorner(NRowsType cRows, NColsType cCols) {
+  return typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
+                                internal::get_fixed_value<NColsType>::value>::Type(derived(), 0,
+                                                                                   cols() - internal::get_runtime_value(
+                                                                                                cCols),
+                                                                                   internal::get_runtime_value(cRows),
+                                                                                   internal::get_runtime_value(cCols));
+}
+
+/// This is the const version of topRightCorner(NRowsType, NColsType).
+template <typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
+                                      internal::get_fixed_value<NColsType>::value>::Type
+#else
+    const typename ConstFixedBlockXpr<..., ...>::Type
+#endif
+    topRightCorner(NRowsType cRows, NColsType cCols) const {
+  return
+      typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
+                                  internal::get_fixed_value<NColsType>::value>::Type(derived(), 0,
+                                                                                     cols() -
+                                                                                         internal::get_runtime_value(
+                                                                                             cCols),
+                                                                                     internal::get_runtime_value(cRows),
+                                                                                     internal::get_runtime_value(
+                                                                                         cCols));
+}
+
+/// \returns an expression of a fixed-size top-right corner of \c *this.
+///
+/// \tparam CRows the number of rows in the corner
+/// \tparam CCols the number of columns in the corner
+///
+/// Example: \include MatrixBase_template_int_int_topRightCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topRightCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block, block<int,int>(Index,Index)
+///
+template <int CRows, int CCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type topRightCorner() {
+  return typename FixedBlockXpr<CRows, CCols>::Type(derived(), 0, cols() - CCols);
+}
+
+/// This is the const version of topRightCorner<int, int>().
+template <int CRows, int CCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type topRightCorner() const {
+  return typename ConstFixedBlockXpr<CRows, CCols>::Type(derived(), 0, cols() - CCols);
+}
+
+/// \returns an expression of a top-right corner of \c *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_topRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topRightCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
+template <int CRows, int CCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type topRightCorner(Index cRows,
+                                                                                                Index cCols) {
+  return typename FixedBlockXpr<CRows, CCols>::Type(derived(), 0, cols() - cCols, cRows, cCols);
+}
+
+/// This is the const version of topRightCorner<int, int>(Index, Index).
+template <int CRows, int CCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type topRightCorner(
+    Index cRows, Index cCols) const {
+  return typename ConstFixedBlockXpr<CRows, CCols>::Type(derived(), 0, cols() - cCols, cRows, cCols);
+}
+
+/// \returns an expression of a top-left corner of \c *this  with either dynamic or fixed sizes.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
+///
+/// Example: \include MatrixBase_topLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_topLeftCorner_int_int.out
+///
+/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing
+/// Eigen::fix<N>, or Eigen::fix<N>(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink
+/// for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
+                           internal::get_fixed_value<NColsType>::value>::Type
+#else
+    typename FixedBlockXpr<..., ...>::Type
+#endif
+    topLeftCorner(NRowsType cRows, NColsType cCols) {
+  return typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
+                                internal::get_fixed_value<NColsType>::value>::Type(derived(), 0, 0,
+                                                                                   internal::get_runtime_value(cRows),
+                                                                                   internal::get_runtime_value(cCols));
+}
+
+/// This is the const version of topLeftCorner(Index, Index).
+template <typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
+                                      internal::get_fixed_value<NColsType>::value>::Type
+#else
+    const typename ConstFixedBlockXpr<..., ...>::Type
+#endif
+    topLeftCorner(NRowsType cRows, NColsType cCols) const {
+  return
+      typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
+                                  internal::get_fixed_value<NColsType>::value>::Type(derived(), 0, 0,
+                                                                                     internal::get_runtime_value(cRows),
+                                                                                     internal::get_runtime_value(
+                                                                                         cCols));
+}
+
+/// \returns an expression of a fixed-size top-left corner of \c *this.
+///
+/// The template parameters CRows and CCols are the number of rows and columns in the corner.
+///
+/// Example: \include MatrixBase_template_int_int_topLeftCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topLeftCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <int CRows, int CCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type topLeftCorner() {
+  return typename FixedBlockXpr<CRows, CCols>::Type(derived(), 0, 0);
+}
+
+/// This is the const version of topLeftCorner<int, int>().
+template <int CRows, int CCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type topLeftCorner() const {
+  return typename ConstFixedBlockXpr<CRows, CCols>::Type(derived(), 0, 0);
+}
+
+/// \returns an expression of a top-left corner of \c *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_topLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_topLeftCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
+template <int CRows, int CCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type topLeftCorner(Index cRows,
+                                                                                               Index cCols) {
+  return typename FixedBlockXpr<CRows, CCols>::Type(derived(), 0, 0, cRows, cCols);
+}
+
+/// This is the const version of topLeftCorner<int, int>(Index, Index).
+template <int CRows, int CCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type topLeftCorner(
+    Index cRows, Index cCols) const {
+  return typename ConstFixedBlockXpr<CRows, CCols>::Type(derived(), 0, 0, cRows, cCols);
+}
+
+/// \returns an expression of a bottom-right corner of \c *this  with either dynamic or fixed sizes.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
+///
+/// Example: \include MatrixBase_bottomRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_bottomRightCorner_int_int.out
+///
+/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing
+/// Eigen::fix<N>, or Eigen::fix<N>(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink
+/// for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
+                           internal::get_fixed_value<NColsType>::value>::Type
+#else
+    typename FixedBlockXpr<..., ...>::Type
+#endif
+    bottomRightCorner(NRowsType cRows, NColsType cCols) {
+  return
+      typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value, internal::get_fixed_value<NColsType>::value>::
+          Type(derived(), rows() - internal::get_runtime_value(cRows), cols() - internal::get_runtime_value(cCols),
+               internal::get_runtime_value(cRows), internal::get_runtime_value(cCols));
+}
+
+/// This is the const version of bottomRightCorner(NRowsType, NColsType).
+template <typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    const typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
+                                      internal::get_fixed_value<NColsType>::value>::Type
+#else
+    const typename ConstFixedBlockXpr<..., ...>::Type
+#endif
+    bottomRightCorner(NRowsType cRows, NColsType cCols) const {
+  return typename ConstFixedBlockXpr<
+      internal::get_fixed_value<NRowsType>::value,
+      internal::get_fixed_value<NColsType>::value>::Type(derived(), rows() - internal::get_runtime_value(cRows),
+                                                         cols() - internal::get_runtime_value(cCols),
+                                                         internal::get_runtime_value(cRows),
+                                                         internal::get_runtime_value(cCols));
+}
+
+/// \returns an expression of a fixed-size bottom-right corner of \c *this.
+///
+/// The template parameters CRows and CCols are the number of rows and columns in the corner.
+///
+/// Example: \include MatrixBase_template_int_int_bottomRightCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <int CRows, int CCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type bottomRightCorner() {
+  return typename FixedBlockXpr<CRows, CCols>::Type(derived(), rows() - CRows, cols() - CCols);
+}
+
+/// This is the const version of bottomRightCorner<int, int>().
+template <int CRows, int CCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type bottomRightCorner() const {
+  return typename ConstFixedBlockXpr<CRows, CCols>::Type(derived(), rows() - CRows, cols() - CCols);
+}
+
+/// \returns an expression of a bottom-right corner of \c *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_bottomRightCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomRightCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
+template <int CRows, int CCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type bottomRightCorner(Index cRows,
+                                                                                                   Index cCols) {
+  return typename FixedBlockXpr<CRows, CCols>::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
+}
+
+/// This is the const version of bottomRightCorner<int, int>(Index, Index).
+template <int CRows, int CCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type bottomRightCorner(
+    Index cRows, Index cCols) const {
+  return typename ConstFixedBlockXpr<CRows, CCols>::Type(derived(), rows() - cRows, cols() - cCols, cRows, cCols);
+}
+
+/// \returns an expression of a bottom-left corner of \c *this  with either dynamic or fixed sizes.
+///
+/// \param cRows the number of rows in the corner
+/// \param cCols the number of columns in the corner
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
+///
+/// Example: \include MatrixBase_bottomLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_bottomLeftCorner_int_int.out
+///
+/// The number of rows \a blockRows and columns \a blockCols can also be specified at compile-time by passing
+/// Eigen::fix<N>, or Eigen::fix<N>(n) as arguments. See \link block(Index,Index,NRowsType,NColsType) block() \endlink
+/// for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
+                           internal::get_fixed_value<NColsType>::value>::Type
+#else
+    typename FixedBlockXpr<..., ...>::Type
+#endif
+    bottomLeftCorner(NRowsType cRows, NColsType cCols) {
+  return
+      typename FixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
+                             internal::get_fixed_value<NColsType>::value>::Type(derived(),
+                                                                                rows() -
+                                                                                    internal::get_runtime_value(cRows),
+                                                                                0, internal::get_runtime_value(cRows),
+                                                                                internal::get_runtime_value(cCols));
+}
+
+/// This is the const version of bottomLeftCorner(NRowsType, NColsType).
+template <typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    typename ConstFixedBlockXpr<internal::get_fixed_value<NRowsType>::value,
+                                internal::get_fixed_value<NColsType>::value>::Type
+#else
+    typename ConstFixedBlockXpr<..., ...>::Type
+#endif
+    bottomLeftCorner(NRowsType cRows, NColsType cCols) const {
+  return typename ConstFixedBlockXpr<
+      internal::get_fixed_value<NRowsType>::value,
+      internal::get_fixed_value<NColsType>::value>::Type(derived(), rows() - internal::get_runtime_value(cRows), 0,
+                                                         internal::get_runtime_value(cRows),
+                                                         internal::get_runtime_value(cCols));
+}
+
+/// \returns an expression of a fixed-size bottom-left corner of \c *this.
+///
+/// The template parameters CRows and CCols are the number of rows and columns in the corner.
+///
+/// Example: \include MatrixBase_template_int_int_bottomLeftCorner.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <int CRows, int CCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type bottomLeftCorner() {
+  return typename FixedBlockXpr<CRows, CCols>::Type(derived(), rows() - CRows, 0);
+}
+
+/// This is the const version of bottomLeftCorner<int, int>().
+template <int CRows, int CCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type bottomLeftCorner() const {
+  return typename ConstFixedBlockXpr<CRows, CCols>::Type(derived(), rows() - CRows, 0);
+}
+
+/// \returns an expression of a bottom-left corner of \c *this.
+///
+/// \tparam CRows number of rows in corner as specified at compile-time
+/// \tparam CCols number of columns in corner as specified at compile-time
+/// \param  cRows number of rows in corner as specified at run-time
+/// \param  cCols number of columns in corner as specified at run-time
+///
+/// This function is mainly useful for corners where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a cRows should equal \a CRows unless
+/// \a CRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_bottomLeftCorner_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_bottomLeftCorner_int_int.out
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa class Block
+///
+template <int CRows, int CCols>
+EIGEN_STRONG_INLINE typename FixedBlockXpr<CRows, CCols>::Type bottomLeftCorner(Index cRows, Index cCols) {
+  return typename FixedBlockXpr<CRows, CCols>::Type(derived(), rows() - cRows, 0, cRows, cCols);
+}
+
+/// This is the const version of bottomLeftCorner<int, int>(Index, Index).
+template <int CRows, int CCols>
+EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<CRows, CCols>::Type bottomLeftCorner(Index cRows,
+                                                                                           Index cCols) const {
+  return typename ConstFixedBlockXpr<CRows, CCols>::Type(derived(), rows() - cRows, 0, cRows, cCols);
+}
+
+/// \returns a block consisting of the top rows of \c *this.
+///
+/// \param n the number of rows in the block
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+///
+/// Example: \include MatrixBase_topRows_int.cpp
+/// Output: \verbinclude MatrixBase_topRows_int.out
+///
+/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row - major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <typename NRowsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+#else
+    typename NRowsBlockXpr<...>::Type
+#endif
+    topRows(NRowsType n) {
+  return typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type(
+      derived(), 0, 0, internal::get_runtime_value(n), cols());
+}
+
+/// This is the const version of topRows(NRowsType).
+template <typename NRowsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+#else
+    const typename ConstNRowsBlockXpr<...>::Type
+#endif
+    topRows(NRowsType n) const {
+  return typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type(
+      derived(), 0, 0, internal::get_runtime_value(n), cols());
+}
+
+/// \returns a block consisting of the top rows of \c *this.
+///
+/// \tparam N the number of rows in the block as specified at compile-time
+/// \param n the number of rows in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_topRows.cpp
+/// Output: \verbinclude MatrixBase_template_int_topRows.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row - major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NRowsBlockXpr<N>::Type topRows(Index n = N) {
+  return typename NRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
+}
+
+/// This is the const version of topRows<int>().
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstNRowsBlockXpr<N>::Type topRows(Index n = N) const {
+  return typename ConstNRowsBlockXpr<N>::Type(derived(), 0, 0, n, cols());
+}
+
+/// \returns a block consisting of the bottom rows of \c *this.
+///
+/// \param n the number of rows in the block
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+///
+/// Example: \include MatrixBase_bottomRows_int.cpp
+/// Output: \verbinclude MatrixBase_bottomRows_int.out
+///
+/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row - major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <typename NRowsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+#else
+    typename NRowsBlockXpr<...>::Type
+#endif
+    bottomRows(NRowsType n) {
+  return typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type(
+      derived(), rows() - internal::get_runtime_value(n), 0, internal::get_runtime_value(n), cols());
+}
+
+/// This is the const version of bottomRows(NRowsType).
+template <typename NRowsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+#else
+    const typename ConstNRowsBlockXpr<...>::Type
+#endif
+    bottomRows(NRowsType n) const {
+  return typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type(
+      derived(), rows() - internal::get_runtime_value(n), 0, internal::get_runtime_value(n), cols());
+}
+
+/// \returns a block consisting of the bottom rows of \c *this.
+///
+/// \tparam N the number of rows in the block as specified at compile-time
+/// \param n the number of rows in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_bottomRows.cpp
+/// Output: \verbinclude MatrixBase_template_int_bottomRows.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row - major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NRowsBlockXpr<N>::Type bottomRows(Index n = N) {
+  return typename NRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
+}
+
+/// This is the const version of bottomRows<int>().
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstNRowsBlockXpr<N>::Type bottomRows(Index n = N) const {
+  return typename ConstNRowsBlockXpr<N>::Type(derived(), rows() - n, 0, n, cols());
+}
+
+/// \returns a block consisting of a range of rows of \c *this.
+///
+/// \param startRow the index of the first row in the block
+/// \param n the number of rows in the block
+/// \tparam NRowsType the type of the value handling the number of rows in the block, typically Index.
+///
+/// Example: \include DenseBase_middleRows_int.cpp
+/// Output: \verbinclude DenseBase_middleRows_int.out
+///
+/// The number of rows \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row - major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <typename NRowsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+#else
+    typename NRowsBlockXpr<...>::Type
+#endif
+    middleRows(Index startRow, NRowsType n) {
+  return typename NRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type(
+      derived(), startRow, 0, internal::get_runtime_value(n), cols());
+}
+
+/// This is the const version of middleRows(Index,NRowsType).
+template <typename NRowsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    const typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type
+#else
+    const typename ConstNRowsBlockXpr<...>::Type
+#endif
+    middleRows(Index startRow, NRowsType n) const {
+  return typename ConstNRowsBlockXpr<internal::get_fixed_value<NRowsType>::value>::Type(
+      derived(), startRow, 0, internal::get_runtime_value(n), cols());
+}
+
+/// \returns a block consisting of a range of rows of \c *this.
+///
+/// \tparam N the number of rows in the block as specified at compile-time
+/// \param startRow the index of the first row in the block
+/// \param n the number of rows in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include DenseBase_template_int_middleRows.cpp
+/// Output: \verbinclude DenseBase_template_int_middleRows.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row - major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NRowsBlockXpr<N>::Type middleRows(Index startRow, Index n = N) {
+  return typename NRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
+}
+
+/// This is the const version of middleRows<int>().
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstNRowsBlockXpr<N>::Type middleRows(Index startRow,
+                                                                                      Index n = N) const {
+  return typename ConstNRowsBlockXpr<N>::Type(derived(), startRow, 0, n, cols());
+}
+
+/// \returns a block consisting of the left columns of \c *this.
+///
+/// \param n the number of columns in the block
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
+///
+/// Example: \include MatrixBase_leftCols_int.cpp
+/// Output: \verbinclude MatrixBase_leftCols_int.out
+///
+/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column - major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+#else
+    typename NColsBlockXpr<...>::Type
+#endif
+    leftCols(NColsType n) {
+  return typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type(derived(), 0, 0, rows(),
+                                                                                   internal::get_runtime_value(n));
+}
+
+/// This is the const version of leftCols(NColsType).
+template <typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+#else
+    const typename ConstNColsBlockXpr<...>::Type
+#endif
+    leftCols(NColsType n) const {
+  return typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type(derived(), 0, 0, rows(),
+                                                                                        internal::get_runtime_value(n));
+}
+
+/// \returns a block consisting of the left columns of \c *this.
+///
+/// \tparam N the number of columns in the block as specified at compile-time
+/// \param n the number of columns in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_leftCols.cpp
+/// Output: \verbinclude MatrixBase_template_int_leftCols.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column - major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NColsBlockXpr<N>::Type leftCols(Index n = N) {
+  return typename NColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
+}
+
+/// This is the const version of leftCols<int>().
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstNColsBlockXpr<N>::Type leftCols(Index n = N) const {
+  return typename ConstNColsBlockXpr<N>::Type(derived(), 0, 0, rows(), n);
+}
+
+/// \returns a block consisting of the right columns of \c *this.
+///
+/// \param n the number of columns in the block
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
+///
+/// Example: \include MatrixBase_rightCols_int.cpp
+/// Output: \verbinclude MatrixBase_rightCols_int.out
+///
+/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column - major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+#else
+    typename NColsBlockXpr<...>::Type
+#endif
+    rightCols(NColsType n) {
+  return typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type(
+      derived(), 0, cols() - internal::get_runtime_value(n), rows(), internal::get_runtime_value(n));
+}
+
+/// This is the const version of rightCols(NColsType).
+template <typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+#else
+    const typename ConstNColsBlockXpr<...>::Type
+#endif
+    rightCols(NColsType n) const {
+  return typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type(
+      derived(), 0, cols() - internal::get_runtime_value(n), rows(), internal::get_runtime_value(n));
+}
+
+/// \returns a block consisting of the right columns of \c *this.
+///
+/// \tparam N the number of columns in the block as specified at compile-time
+/// \param n the number of columns in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_rightCols.cpp
+/// Output: \verbinclude MatrixBase_template_int_rightCols.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column - major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NColsBlockXpr<N>::Type rightCols(Index n = N) {
+  return typename NColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
+}
+
+/// This is the const version of rightCols<int>().
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstNColsBlockXpr<N>::Type rightCols(Index n = N) const {
+  return typename ConstNColsBlockXpr<N>::Type(derived(), 0, cols() - n, rows(), n);
+}
+
+/// \returns a block consisting of a range of columns of \c *this.
+///
+/// \param startCol the index of the first column in the block
+/// \param numCols the number of columns in the block
+/// \tparam NColsType the type of the value handling the number of columns in the block, typically Index.
+///
+/// Example: \include DenseBase_middleCols_int.cpp
+/// Output: \verbinclude DenseBase_middleCols_int.out
+///
+/// The number of columns \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column - major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+#else
+    typename NColsBlockXpr<...>::Type
+#endif
+    middleCols(Index startCol, NColsType numCols) {
+  return typename NColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type(
+      derived(), 0, startCol, rows(), internal::get_runtime_value(numCols));
+}
+
+/// This is the const version of middleCols(Index,NColsType).
+template <typename NColsType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    const typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type
+#else
+    const typename ConstNColsBlockXpr<...>::Type
+#endif
+    middleCols(Index startCol, NColsType numCols) const {
+  return typename ConstNColsBlockXpr<internal::get_fixed_value<NColsType>::value>::Type(
+      derived(), 0, startCol, rows(), internal::get_runtime_value(numCols));
+}
+
+/// \returns a block consisting of a range of columns of \c *this.
+///
+/// \tparam N the number of columns in the block as specified at compile-time
+/// \param startCol the index of the first column in the block
+/// \param n the number of columns in the block as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include DenseBase_template_int_middleCols.cpp
+/// Output: \verbinclude DenseBase_template_int_middleCols.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column - major)
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename NColsBlockXpr<N>::Type middleCols(Index startCol, Index n = N) {
+  return typename NColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
+}
+
+/// This is the const version of middleCols<int>().
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstNColsBlockXpr<N>::Type middleCols(Index startCol,
+                                                                                      Index n = N) const {
+  return typename ConstNColsBlockXpr<N>::Type(derived(), 0, startCol, rows(), n);
+}
+
+/// \returns a fixed-size expression of a block of \c *this.
+///
+/// The template parameters \a NRows and \a NCols are the number of
+/// rows and columns in the block.
+///
+/// \param startRow the first row in the block
+/// \param startCol the first column in the block
+///
+/// Example: \include MatrixBase_block_int_int.cpp
+/// Output: \verbinclude MatrixBase_block_int_int.out
+///
+/// \note The usage of of this overload is discouraged from %Eigen 3.4, better used the generic
+/// block(Index,Index,NRowsType,NColsType), here is the one-to-one equivalence:
+/// \code
+/// mat.template block<NRows,NCols>(i,j)  <-->  mat.block(i,j,fix<NRows>,fix<NCols>)
+/// \endcode
+///
+/// \note since block is a templated member, the keyword template has to be used
+/// if the matrix type is also a template parameter: \code m.template block<3,3>(1,1); \endcode
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <int NRows, int NCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedBlockXpr<NRows, NCols>::Type block(Index startRow, Index startCol) {
+  return typename FixedBlockXpr<NRows, NCols>::Type(derived(), startRow, startCol);
+}
+
+/// This is the const version of block<>(Index, Index). */
+template <int NRows, int NCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<NRows, NCols>::Type block(
+    Index startRow, Index startCol) const {
+  return typename ConstFixedBlockXpr<NRows, NCols>::Type(derived(), startRow, startCol);
+}
+
+/// \returns an expression of a block of \c *this.
+///
+/// \tparam NRows number of rows in block as specified at compile-time
+/// \tparam NCols number of columns in block as specified at compile-time
+/// \param  startRow  the first row in the block
+/// \param  startCol  the first column in the block
+/// \param  blockRows number of rows in block as specified at run-time
+/// \param  blockCols number of columns in block as specified at run-time
+///
+/// This function is mainly useful for blocks where the number of rows is specified at compile-time
+/// and the number of columns is specified at run-time, or vice versa. The compile-time and run-time
+/// information should not contradict. In other words, \a blockRows should equal \a NRows unless
+/// \a NRows is \a Dynamic, and the same for the number of columns.
+///
+/// Example: \include MatrixBase_template_int_int_block_int_int_int_int.cpp
+/// Output: \verbinclude MatrixBase_template_int_int_block_int_int_int_int.out
+///
+/// \note The usage of of this overload is discouraged from %Eigen 3.4, better used the generic
+/// block(Index,Index,NRowsType,NColsType), here is the one-to-one complete equivalence:
+/// \code
+/// mat.template block<NRows,NCols>(i,j,rows,cols)     <-->  mat.block(i,j,fix<NRows>(rows),fix<NCols>(cols))
+/// \endcode
+/// If we known that, e.g., NRows==Dynamic and NCols!=Dynamic, then the equivalence becomes:
+/// \code
+/// mat.template block<Dynamic,NCols>(i,j,rows,NCols)  <-->  mat.block(i,j,rows,fix<NCols>)
+/// \endcode
+///
+EIGEN_DOC_BLOCK_ADDONS_NOT_INNER_PANEL
+///
+/// \sa block(Index,Index,NRowsType,NColsType), class Block
+///
+template <int NRows, int NCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedBlockXpr<NRows, NCols>::Type block(Index startRow, Index startCol,
+                                                                                       Index blockRows,
+                                                                                       Index blockCols) {
+  return typename FixedBlockXpr<NRows, NCols>::Type(derived(), startRow, startCol, blockRows, blockCols);
+}
+
+/// This is the const version of block<>(Index, Index, Index, Index).
+template <int NRows, int NCols>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename ConstFixedBlockXpr<NRows, NCols>::Type block(
+    Index startRow, Index startCol, Index blockRows, Index blockCols) const {
+  return typename ConstFixedBlockXpr<NRows, NCols>::Type(derived(), startRow, startCol, blockRows, blockCols);
+}
+
+/// \returns an expression of the \a i-th column of \c *this. Note that the numbering starts at 0.
+///
+/// Example: \include MatrixBase_col.cpp
+/// Output: \verbinclude MatrixBase_col.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(column - major)
+/**
+ * \sa row(), class Block */
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ColXpr col(Index i) { return ColXpr(derived(), i); }
+
+/// This is the const version of col().
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ConstColXpr col(Index i) const { return ConstColXpr(derived(), i); }
+
+/// \returns an expression of the \a i-th row of \c *this. Note that the numbering starts at 0.
+///
+/// Example: \include MatrixBase_row.cpp
+/// Output: \verbinclude MatrixBase_row.out
+///
+EIGEN_DOC_BLOCK_ADDONS_INNER_PANEL_IF(row - major)
+/**
+ * \sa col(), class Block */
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE RowXpr row(Index i) { return RowXpr(derived(), i); }
+
+/// This is the const version of row(). */
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ConstRowXpr row(Index i) const { return ConstRowXpr(derived(), i); }
+
+/// \returns an expression of a segment (i.e. a vector block) in \c *this with either dynamic or fixed sizes.
+///
+/// \only_for_vectors
+///
+/// \param start the first coefficient in the segment
+/// \param n the number of coefficients in the segment
+/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index.
+///
+/// Example: \include MatrixBase_segment_int_int.cpp
+/// Output: \verbinclude MatrixBase_segment_int_int.out
+///
+/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+/// \note Even in the case that the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+/// \sa block(Index,Index,NRowsType,NColsType), fix<N>, fix<N>(int), class Block
+///
+template <typename NType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+#else
+    typename FixedSegmentReturnType<...>::Type
+#endif
+    segment(Index start, NType n) {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type(derived(), start,
+                                                                                        internal::get_runtime_value(n));
+}
+
+/// This is the const version of segment(Index,NType).
+template <typename NType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+#else
+    const typename ConstFixedSegmentReturnType<...>::Type
+#endif
+    segment(Index start, NType n) const {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type(
+      derived(), start, internal::get_runtime_value(n));
+}
+
+/// \returns an expression of the first coefficients of \c *this with either dynamic or fixed sizes.
+///
+/// \only_for_vectors
+///
+/// \param n the number of coefficients in the segment
+/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index.
+///
+/// Example: \include MatrixBase_start_int.cpp
+/// Output: \verbinclude MatrixBase_start_int.out
+///
+/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+/// \note Even in the case that the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+/// \sa class Block, block(Index,Index)
+///
+template <typename NType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+#else
+    typename FixedSegmentReturnType<...>::Type
+#endif
+    head(NType n) {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type(derived(), 0,
+                                                                                        internal::get_runtime_value(n));
+}
+
+/// This is the const version of head(NType).
+template <typename NType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+#else
+    const typename ConstFixedSegmentReturnType<...>::Type
+#endif
+    head(NType n) const {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type(
+      derived(), 0, internal::get_runtime_value(n));
+}
+
+/// \returns an expression of a last coefficients of \c *this with either dynamic or fixed sizes.
+///
+/// \only_for_vectors
+///
+/// \param n the number of coefficients in the segment
+/// \tparam NType the type of the value handling the number of coefficients in the segment, typically Index.
+///
+/// Example: \include MatrixBase_end_int.cpp
+/// Output: \verbinclude MatrixBase_end_int.out
+///
+/// The number of coefficients \a n can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments.
+/// See \link block(Index,Index,NRowsType,NColsType) block() \endlink for the details.
+///
+/// \note Even in the case that the returned expression has dynamic size, in the case
+/// when it is applied to a fixed-size vector, it inherits a fixed maximal size,
+/// which means that evaluating it does not cause a dynamic memory allocation.
+///
+/// \sa class Block, block(Index,Index)
+///
+template <typename NType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+#else
+    typename FixedSegmentReturnType<...>::Type
+#endif
+    tail(NType n) {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return typename FixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type(
+      derived(), this->size() - internal::get_runtime_value(n), internal::get_runtime_value(n));
+}
+
+/// This is the const version of tail(Index).
+template <typename NType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    const typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type
+#else
+    const typename ConstFixedSegmentReturnType<...>::Type
+#endif
+    tail(NType n) const {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return typename ConstFixedSegmentReturnType<internal::get_fixed_value<NType>::value>::Type(
+      derived(), this->size() - internal::get_runtime_value(n), internal::get_runtime_value(n));
+}
+
+/// \returns a fixed-size expression of a segment (i.e. a vector block) in \c *this
+///
+/// \only_for_vectors
+///
+/// \tparam N the number of coefficients in the segment as specified at compile-time
+/// \param start the index of the first element in the segment
+/// \param n the number of coefficients in the segment as specified at compile-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_segment.cpp
+/// Output: \verbinclude MatrixBase_template_int_segment.out
+///
+/// \sa segment(Index,NType), class Block
+///
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedSegmentReturnType<N>::Type segment(Index start, Index n = N) {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return typename FixedSegmentReturnType<N>::Type(derived(), start, n);
+}
+
+/// This is the const version of segment<int>(Index).
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstFixedSegmentReturnType<N>::Type segment(Index start,
+                                                                                            Index n = N) const {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return typename ConstFixedSegmentReturnType<N>::Type(derived(), start, n);
+}
+
+/// \returns a fixed-size expression of the first coefficients of \c *this.
+///
+/// \only_for_vectors
+///
+/// \tparam N the number of coefficients in the segment as specified at compile-time
+/// \param  n the number of coefficients in the segment as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_start.cpp
+/// Output: \verbinclude MatrixBase_template_int_start.out
+///
+/// \sa head(NType), class Block
+///
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedSegmentReturnType<N>::Type head(Index n = N) {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return typename FixedSegmentReturnType<N>::Type(derived(), 0, n);
+}
+
+/// This is the const version of head<int>().
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstFixedSegmentReturnType<N>::Type head(Index n = N) const {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return typename ConstFixedSegmentReturnType<N>::Type(derived(), 0, n);
+}
+
+/// \returns a fixed-size expression of the last coefficients of \c *this.
+///
+/// \only_for_vectors
+///
+/// \tparam N the number of coefficients in the segment as specified at compile-time
+/// \param  n the number of coefficients in the segment as specified at run-time
+///
+/// The compile-time and run-time information should not contradict. In other words,
+/// \a n should equal \a N unless \a N is \a Dynamic.
+///
+/// Example: \include MatrixBase_template_int_end.cpp
+/// Output: \verbinclude MatrixBase_template_int_end.out
+///
+/// \sa tail(NType), class Block
+///
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename FixedSegmentReturnType<N>::Type tail(Index n = N) {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return typename FixedSegmentReturnType<N>::Type(derived(), size() - n, n);
+}
+
+/// This is the const version of tail<int>.
+template <int N>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ConstFixedSegmentReturnType<N>::Type tail(Index n = N) const {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return typename ConstFixedSegmentReturnType<N>::Type(derived(), size() - n, n);
+}
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major).
+///
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE InnerVectorReturnType innerVector(Index outer) {
+  return InnerVectorReturnType(derived(), outer);
+}
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major). Read-only.
+///
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ConstInnerVectorReturnType innerVector(Index outer) const {
+  return ConstInnerVectorReturnType(derived(), outer);
+}
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major).
+///
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE InnerVectorsReturnType innerVectors(Index outerStart, Index outerSize) {
+  return Block<Derived, Dynamic, Dynamic, true>(derived(), IsRowMajor ? outerStart : 0, IsRowMajor ? 0 : outerStart,
+                                                IsRowMajor ? outerSize : rows(), IsRowMajor ? cols() : outerSize);
+}
+
+/// \returns the \a outer -th column (resp. row) of the matrix \c *this if \c *this
+/// is col-major (resp. row-major). Read-only.
+///
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const ConstInnerVectorsReturnType innerVectors(Index outerStart,
+                                                                                     Index outerSize) const {
+  return Block<const Derived, Dynamic, Dynamic, true>(derived(), IsRowMajor ? outerStart : 0,
+                                                      IsRowMajor ? 0 : outerStart, IsRowMajor ? outerSize : rows(),
+                                                      IsRowMajor ? cols() : outerSize);
+}
+
+/** \returns the i-th subvector (column or vector) according to the \c Direction
+ * \sa subVectors()
+ */
+template <DirectionType Direction>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::conditional_t<Direction == Vertical, ColXpr, RowXpr> subVector(Index i) {
+  return std::conditional_t<Direction == Vertical, ColXpr, RowXpr>(derived(), i);
+}
+
+/** This is the const version of subVector(Index) */
+template <DirectionType Direction>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::conditional_t<Direction == Vertical, ConstColXpr, ConstRowXpr> subVector(
+    Index i) const {
+  return std::conditional_t<Direction == Vertical, ConstColXpr, ConstRowXpr>(derived(), i);
+}
+
+/** \returns the number of subvectors (rows or columns) in the direction \c Direction
+ * \sa subVector(Index)
+ */
+template <DirectionType Direction>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE constexpr Index subVectors() const {
+  return (Direction == Vertical) ? cols() : rows();
+}
diff --git a/inst/include/Eigen/src/plugins/CommonCwiseBinaryOps.h b/inst/include/Eigen/src/plugins/CommonCwiseBinaryOps.h
deleted file mode 100644
index 688d2244..00000000
--- a/inst/include/Eigen/src/plugins/CommonCwiseBinaryOps.h
+++ /dev/null
@@ -1,46 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// This file is a base class plugin containing common coefficient wise functions.
-
-/** \returns an expression of the difference of \c *this and \a other
-  *
-  * \note If you want to substract a given scalar from all coefficients, see Cwise::operator-().
-  *
-  * \sa class CwiseBinaryOp, operator-=()
-  */
-EIGEN_MAKE_CWISE_BINARY_OP(operator-,internal::scalar_difference_op)
-
-/** \returns an expression of the sum of \c *this and \a other
-  *
-  * \note If you want to add a given scalar to all coefficients, see Cwise::operator+().
-  *
-  * \sa class CwiseBinaryOp, operator+=()
-  */
-EIGEN_MAKE_CWISE_BINARY_OP(operator+,internal::scalar_sum_op)
-
-/** \returns an expression of a custom coefficient-wise operator \a func of *this and \a other
-  *
-  * The template parameter \a CustomBinaryOp is the type of the functor
-  * of the custom operator (see class CwiseBinaryOp for an example)
-  *
-  * Here is an example illustrating the use of custom functors:
-  * \include class_CwiseBinaryOp.cpp
-  * Output: \verbinclude class_CwiseBinaryOp.out
-  *
-  * \sa class CwiseBinaryOp, operator+(), operator-(), cwiseProduct()
-  */
-template<typename CustomBinaryOp, typename OtherDerived>
-EIGEN_STRONG_INLINE const CwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>
-binaryExpr(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other, const CustomBinaryOp& func = CustomBinaryOp()) const
-{
-  return CwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>(derived(), other.derived(), func);
-}
-
diff --git a/inst/include/Eigen/src/plugins/CommonCwiseBinaryOps.inc b/inst/include/Eigen/src/plugins/CommonCwiseBinaryOps.inc
new file mode 100644
index 00000000..f1ba3010
--- /dev/null
+++ b/inst/include/Eigen/src/plugins/CommonCwiseBinaryOps.inc
@@ -0,0 +1,116 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// This file is a base class plugin containing common coefficient wise functions.
+
+/** \returns an expression of the difference of \c *this and \a other
+ *
+ * \note If you want to subtract a given scalar from all coefficients, see Cwise::operator-().
+ *
+ * \sa class CwiseBinaryOp, operator-=()
+ */
+EIGEN_MAKE_CWISE_BINARY_OP(operator-, difference)
+
+/** \returns an expression of the sum of \c *this and \a other
+ *
+ * \note If you want to add a given scalar to all coefficients, see Cwise::operator+().
+ *
+ * \sa class CwiseBinaryOp, operator+=()
+ */
+EIGEN_MAKE_CWISE_BINARY_OP(operator+, sum)
+
+/** \returns an expression of a custom coefficient-wise operator \a func of *this and \a other
+ *
+ * The template parameter \a CustomBinaryOp is the type of the functor
+ * of the custom operator (see class CwiseBinaryOp for an example)
+ *
+ * Here is an example illustrating the use of custom functors:
+ * \include class_CwiseBinaryOp.cpp
+ * Output: \verbinclude class_CwiseBinaryOp.out
+ *
+ * \sa class CwiseBinaryOp, operator+(), operator-(), cwiseProduct()
+ */
+template <typename CustomBinaryOp, typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived> binaryExpr(
+    const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other, const CustomBinaryOp& func = CustomBinaryOp()) const {
+  return CwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>(derived(), other.derived(), func);
+}
+
+/** \returns an expression of \c *this scaled by the scalar factor \a scalar
+ *
+ * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+ */
+EIGEN_MAKE_SCALAR_BINARY_OP(operator*, product)
+
+/** \returns an expression of \c *this divided by the scalar value \a scalar
+ *
+ * \tparam T is the scalar type of \a scalar. It must be compatible with the scalar type of the given expression.
+ */
+EIGEN_MAKE_SCALAR_BINARY_OP_ONTHERIGHT(operator/, quotient)
+
+/** \returns an expression of the coefficient-wise boolean \b and operator of \c *this and \a other
+ *
+ * Example: \include Cwise_boolean_and.cpp
+ * Output: \verbinclude Cwise_boolean_and.out
+ *
+ * \sa operator||(), select()
+ */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline const CwiseBinaryOp<internal::scalar_boolean_and_op<Scalar>, const Derived, const OtherDerived>
+operator&&(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryOp<internal::scalar_boolean_and_op<Scalar>, const Derived, const OtherDerived>(derived(),
+                                                                                                   other.derived());
+}
+
+/** \returns an expression of the coefficient-wise boolean \b or operator of \c *this and \a other
+ *
+ * Example: \include Cwise_boolean_or.cpp
+ * Output: \verbinclude Cwise_boolean_or.out
+ *
+ * \sa operator&&(), select()
+ */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline const CwiseBinaryOp<internal::scalar_boolean_or_op<Scalar>, const Derived, const OtherDerived>
+operator||(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryOp<internal::scalar_boolean_or_op<Scalar>, const Derived, const OtherDerived>(derived(),
+                                                                                                  other.derived());
+}
+
+/** \returns an expression of the bitwise \b and operator of \c *this and \a other
+ *
+ * \sa operator|(), operator^()
+ */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline const CwiseBinaryOp<internal::scalar_bitwise_and_op<Scalar>, const Derived, const OtherDerived>
+operator&(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryOp<internal::scalar_bitwise_and_op<Scalar>, const Derived, const OtherDerived>(derived(),
+                                                                                                   other.derived());
+}
+
+/** \returns an expression of the bitwise boolean \b or operator of \c *this and \a other
+ *
+ * \sa operator&(), operator^()
+ */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline const CwiseBinaryOp<internal::scalar_bitwise_or_op<Scalar>, const Derived, const OtherDerived>
+operator|(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryOp<internal::scalar_bitwise_or_op<Scalar>, const Derived, const OtherDerived>(derived(),
+                                                                                                  other.derived());
+}
+
+/** \returns an expression of the bitwise xor operator of *this and \a other
+ * \sa operator&(), operator|()
+ */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline const CwiseBinaryOp<internal::scalar_bitwise_xor_op<Scalar>, const Derived, const OtherDerived>
+operator^(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryOp<internal::scalar_bitwise_xor_op<Scalar>, const Derived, const OtherDerived>(derived(),
+                                                                                                   other.derived());
+}
diff --git a/inst/include/Eigen/src/plugins/CommonCwiseUnaryOps.h b/inst/include/Eigen/src/plugins/CommonCwiseUnaryOps.h
deleted file mode 100644
index 08e931aa..00000000
--- a/inst/include/Eigen/src/plugins/CommonCwiseUnaryOps.h
+++ /dev/null
@@ -1,172 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// This file is a base class plugin containing common coefficient wise functions.
-
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-
-/** \internal Represents a scalar multiple of an expression */
-typedef CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived> ScalarMultipleReturnType;
-/** \internal Represents a quotient of an expression by a scalar*/
-typedef CwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived> ScalarQuotient1ReturnType;
-/** \internal the return type of conjugate() */
-typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
-                    const CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>,
-                    const Derived&
-                  >::type ConjugateReturnType;
-/** \internal the return type of real() const */
-typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
-                    const CwiseUnaryOp<internal::scalar_real_op<Scalar>, const Derived>,
-                    const Derived&
-                  >::type RealReturnType;
-/** \internal the return type of real() */
-typedef typename internal::conditional<NumTraits<Scalar>::IsComplex,
-                    CwiseUnaryView<internal::scalar_real_ref_op<Scalar>, Derived>,
-                    Derived&
-                  >::type NonConstRealReturnType;
-/** \internal the return type of imag() const */
-typedef CwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived> ImagReturnType;
-/** \internal the return type of imag() */
-typedef CwiseUnaryView<internal::scalar_imag_ref_op<Scalar>, Derived> NonConstImagReturnType;
-
-#endif // not EIGEN_PARSED_BY_DOXYGEN
-
-/** \returns an expression of the opposite of \c *this
-  */
-inline const CwiseUnaryOp<internal::scalar_opposite_op<typename internal::traits<Derived>::Scalar>, const Derived>
-operator-() const { return derived(); }
-
-
-/** \returns an expression of \c *this scaled by the scalar factor \a scalar */
-inline const ScalarMultipleReturnType
-operator*(const Scalar& scalar) const
-{
-  return CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived>
-    (derived(), internal::scalar_multiple_op<Scalar>(scalar));
-}
-
-#ifdef EIGEN_PARSED_BY_DOXYGEN
-const ScalarMultipleReturnType operator*(const RealScalar& scalar) const;
-#endif
-
-/** \returns an expression of \c *this divided by the scalar value \a scalar */
-inline const CwiseUnaryOp<internal::scalar_quotient1_op<typename internal::traits<Derived>::Scalar>, const Derived>
-operator/(const Scalar& scalar) const
-{
-  return CwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived>
-    (derived(), internal::scalar_quotient1_op<Scalar>(scalar));
-}
-
-/** Overloaded for efficient real matrix times complex scalar value */
-inline const CwiseUnaryOp<internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >, const Derived>
-operator*(const std::complex<Scalar>& scalar) const
-{
-  return CwiseUnaryOp<internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >, const Derived>
-    (*static_cast<const Derived*>(this), internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >(scalar));
-}
-
-inline friend const ScalarMultipleReturnType
-operator*(const Scalar& scalar, const StorageBaseType& matrix)
-{ return matrix*scalar; }
-
-inline friend const CwiseUnaryOp<internal::scalar_multiple2_op<Scalar,std::complex<Scalar> >, const Derived>
-operator*(const std::complex<Scalar>& scalar, const StorageBaseType& matrix)
-{ return matrix*scalar; }
-
-/** \returns an expression of *this with the \a Scalar type casted to
-  * \a NewScalar.
-  *
-  * The template parameter \a NewScalar is the type we are casting the scalars to.
-  *
-  * \sa class CwiseUnaryOp
-  */
-template<typename NewType>
-typename internal::cast_return_type<Derived,const CwiseUnaryOp<internal::scalar_cast_op<typename internal::traits<Derived>::Scalar, NewType>, const Derived> >::type
-cast() const
-{
-  return derived();
-}
-
-/** \returns an expression of the complex conjugate of \c *this.
-  *
-  * \sa adjoint() */
-inline ConjugateReturnType
-conjugate() const
-{
-  return ConjugateReturnType(derived());
-}
-
-/** \returns a read-only expression of the real part of \c *this.
-  *
-  * \sa imag() */
-inline RealReturnType
-real() const { return derived(); }
-
-/** \returns an read-only expression of the imaginary part of \c *this.
-  *
-  * \sa real() */
-inline const ImagReturnType
-imag() const { return derived(); }
-
-/** \brief Apply a unary operator coefficient-wise
-  * \param[in]  func  Functor implementing the unary operator
-  * \tparam  CustomUnaryOp Type of \a func  
-  * \returns An expression of a custom coefficient-wise unary operator \a func of *this
-  *
-  * The function \c ptr_fun() from the C++ standard library can be used to make functors out of normal functions.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp_ptrfun.cpp
-  * Output: \verbinclude class_CwiseUnaryOp_ptrfun.out
-  *
-  * Genuine functors allow for more possibilities, for instance it may contain a state.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp.cpp
-  * Output: \verbinclude class_CwiseUnaryOp.out
-  *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp
-  */
-template<typename CustomUnaryOp>
-inline const CwiseUnaryOp<CustomUnaryOp, const Derived>
-unaryExpr(const CustomUnaryOp& func = CustomUnaryOp()) const
-{
-  return CwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func);
-}
-
-/** \returns an expression of a custom coefficient-wise unary operator \a func of *this
-  *
-  * The template parameter \a CustomUnaryOp is the type of the functor
-  * of the custom unary operator.
-  *
-  * Example:
-  * \include class_CwiseUnaryOp.cpp
-  * Output: \verbinclude class_CwiseUnaryOp.out
-  *
-  * \sa class CwiseUnaryOp, class CwiseBinaryOp
-  */
-template<typename CustomViewOp>
-inline const CwiseUnaryView<CustomViewOp, const Derived>
-unaryViewExpr(const CustomViewOp& func = CustomViewOp()) const
-{
-  return CwiseUnaryView<CustomViewOp, const Derived>(derived(), func);
-}
-
-/** \returns a non const expression of the real part of \c *this.
-  *
-  * \sa imag() */
-inline NonConstRealReturnType
-real() { return derived(); }
-
-/** \returns a non const expression of the imaginary part of \c *this.
-  *
-  * \sa real() */
-inline NonConstImagReturnType
-imag() { return derived(); }
diff --git a/inst/include/Eigen/src/plugins/CommonCwiseUnaryOps.inc b/inst/include/Eigen/src/plugins/CommonCwiseUnaryOps.inc
new file mode 100644
index 00000000..64f36488
--- /dev/null
+++ b/inst/include/Eigen/src/plugins/CommonCwiseUnaryOps.inc
@@ -0,0 +1,167 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// This file is a base class plugin containing common coefficient wise functions.
+
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+
+/** \internal the return type of conjugate() */
+typedef std::conditional_t<NumTraits<Scalar>::IsComplex,
+                           const CwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>, const Derived&>
+    ConjugateReturnType;
+/** \internal the return type of real() const */
+typedef std::conditional_t<NumTraits<Scalar>::IsComplex,
+                           const CwiseUnaryOp<internal::scalar_real_op<Scalar>, const Derived>, const Derived&>
+    RealReturnType;
+/** \internal the return type of real() */
+typedef std::conditional_t<NumTraits<Scalar>::IsComplex, CwiseUnaryView<internal::scalar_real_ref_op<Scalar>, Derived>,
+                           Derived&>
+    NonConstRealReturnType;
+/** \internal the return type of imag() const */
+typedef CwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived> ImagReturnType;
+/** \internal the return type of imag() */
+typedef CwiseUnaryView<internal::scalar_imag_ref_op<Scalar>, Derived> NonConstImagReturnType;
+
+typedef CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived> NegativeReturnType;
+
+#endif  // not EIGEN_PARSED_BY_DOXYGEN
+
+/// \returns an expression of the opposite of \c *this
+///
+EIGEN_DOC_UNARY_ADDONS(operator-, opposite)
+///
+EIGEN_DEVICE_FUNC inline const NegativeReturnType operator-() const { return NegativeReturnType(derived()); }
+
+template <class NewType>
+struct CastXpr {
+  typedef typename internal::cast_return_type<
+      Derived, const CwiseUnaryOp<internal::core_cast_op<Scalar, NewType>, const Derived> >::type Type;
+};
+
+/// \returns an expression of \c *this with the \a Scalar type casted to
+/// \a NewScalar.
+///
+/// The template parameter \a NewScalar is the type we are casting the scalars to.
+///
+EIGEN_DOC_UNARY_ADDONS(cast, conversion function)
+///
+/// \sa class CwiseUnaryOp
+///
+template <typename NewType>
+EIGEN_DEVICE_FUNC typename CastXpr<NewType>::Type cast() const {
+  return typename CastXpr<NewType>::Type(derived());
+}
+
+/// \returns an expression of the complex conjugate of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(conjugate, complex conjugate)
+///
+/// \sa <a href="group__CoeffwiseMathFunctions.html#cwisetable_conj">Math functions</a>, MatrixBase::adjoint()
+EIGEN_DEVICE_FUNC inline ConjugateReturnType conjugate() const { return ConjugateReturnType(derived()); }
+
+/// \returns an expression of the complex conjugate of \c *this if Cond==true, returns derived() otherwise.
+///
+EIGEN_DOC_UNARY_ADDONS(conjugate, complex conjugate)
+///
+/// \sa conjugate()
+template <bool Cond>
+EIGEN_DEVICE_FUNC inline std::conditional_t<Cond, ConjugateReturnType, const Derived&> conjugateIf() const {
+  typedef std::conditional_t<Cond, ConjugateReturnType, const Derived&> ReturnType;
+  return ReturnType(derived());
+}
+
+/// \returns a read-only expression of the real part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(real, real part function)
+///
+/// \sa imag()
+EIGEN_DEVICE_FUNC inline RealReturnType real() const { return RealReturnType(derived()); }
+
+/// \returns an read-only expression of the imaginary part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(imag, imaginary part function)
+///
+/// \sa real()
+EIGEN_DEVICE_FUNC inline const ImagReturnType imag() const { return ImagReturnType(derived()); }
+
+/// \brief Apply a unary operator coefficient-wise
+/// \param[in]  func  Functor implementing the unary operator
+/// \tparam  CustomUnaryOp Type of \a func
+/// \returns An expression of a custom coefficient-wise unary operator \a func of *this
+///
+/// The function \c ptr_fun() from the C++ standard library can be used to make functors out of normal functions.
+///
+/// Example:
+/// \include class_CwiseUnaryOp_ptrfun.cpp
+/// Output: \verbinclude class_CwiseUnaryOp_ptrfun.out
+///
+/// Genuine functors allow for more possibilities, for instance it may contain a state.
+///
+/// Example:
+/// \include class_CwiseUnaryOp.cpp
+/// Output: \verbinclude class_CwiseUnaryOp.out
+///
+EIGEN_DOC_UNARY_ADDONS(unaryExpr, unary function)
+///
+/// \sa unaryViewExpr, binaryExpr, class CwiseUnaryOp
+///
+template <typename CustomUnaryOp>
+EIGEN_DEVICE_FUNC inline const CwiseUnaryOp<CustomUnaryOp, const Derived> unaryExpr(
+    const CustomUnaryOp& func = CustomUnaryOp()) const {
+  return CwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func);
+}
+
+/// \returns a const expression of a custom coefficient-wise unary operator \a func of *this
+///
+/// The template parameter \a CustomUnaryOp is the type of the functor
+/// of the custom unary operator.
+///
+/// Example:
+/// \include class_CwiseUnaryOp.cpp
+/// Output: \verbinclude class_CwiseUnaryOp.out
+///
+EIGEN_DOC_UNARY_ADDONS(unaryViewExpr, unary function)
+///
+/// \sa unaryExpr, binaryExpr class CwiseUnaryOp
+///
+template <typename CustomViewOp>
+EIGEN_DEVICE_FUNC inline const CwiseUnaryView<CustomViewOp, const Derived> unaryViewExpr(
+    const CustomViewOp& func = CustomViewOp()) const {
+  return CwiseUnaryView<CustomViewOp, const Derived>(derived(), func);
+}
+
+/// \returns a non-const expression of a custom coefficient-wise unary view \a func of *this
+///
+/// The template parameter \a CustomUnaryOp is the type of the functor
+/// of the custom unary operator.
+///
+EIGEN_DOC_UNARY_ADDONS(unaryViewExpr, unary function)
+///
+/// \sa unaryExpr, binaryExpr class CwiseUnaryOp
+///
+template <typename CustomViewOp>
+EIGEN_DEVICE_FUNC inline CwiseUnaryView<CustomViewOp, Derived> unaryViewExpr(
+    const CustomViewOp& func = CustomViewOp()) {
+  return CwiseUnaryView<CustomViewOp, Derived>(derived(), func);
+}
+
+/// \returns a non const expression of the real part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(real, real part function)
+///
+/// \sa imag()
+EIGEN_DEVICE_FUNC inline NonConstRealReturnType real() { return NonConstRealReturnType(derived()); }
+
+/// \returns a non const expression of the imaginary part of \c *this.
+///
+EIGEN_DOC_UNARY_ADDONS(imag, imaginary part function)
+///
+/// \sa real()
+EIGEN_DEVICE_FUNC inline NonConstImagReturnType imag() { return NonConstImagReturnType(derived()); }
diff --git a/inst/include/Eigen/src/plugins/IndexedViewMethods.inc b/inst/include/Eigen/src/plugins/IndexedViewMethods.inc
new file mode 100644
index 00000000..a51e3492
--- /dev/null
+++ b/inst/include/Eigen/src/plugins/IndexedViewMethods.inc
@@ -0,0 +1,192 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if !defined(EIGEN_PARSED_BY_DOXYGEN)
+
+public:
+// SFINAE dummy types
+
+template <typename RowIndices, typename ColIndices>
+using EnableOverload = std::enable_if_t<
+    internal::valid_indexed_view_overload<RowIndices, ColIndices>::value && internal::is_lvalue<Derived>::value, bool>;
+
+template <typename RowIndices, typename ColIndices>
+using EnableConstOverload =
+    std::enable_if_t<internal::valid_indexed_view_overload<RowIndices, ColIndices>::value, bool>;
+
+template <typename Indices>
+using EnableVectorOverload =
+    std::enable_if_t<!internal::is_valid_index_type<Indices>::value && internal::is_lvalue<Derived>::value, bool>;
+
+template <typename Indices>
+using EnableConstVectorOverload = std::enable_if_t<!internal::is_valid_index_type<Indices>::value, bool>;
+
+public:
+// Public API for 2D matrices/arrays
+
+// non-const versions
+
+ template <typename RowIndices, typename ColIndices>
+ using IndexedViewType = typename internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::ReturnType;
+
+ template <typename RowIndices, typename ColIndices, EnableOverload<RowIndices, ColIndices> = true>
+ IndexedViewType<RowIndices, ColIndices> operator()(const RowIndices& rowIndices, const ColIndices& colIndices) {
+   return internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::run(derived(), rowIndices, colIndices);
+ }
+
+template <typename RowType, size_t RowSize, typename ColIndices, typename RowIndices = Array<RowType, RowSize, 1>,
+          EnableOverload<RowIndices, ColIndices> = true>
+IndexedViewType<RowIndices, ColIndices> operator()(const RowType (&rowIndices)[RowSize], const ColIndices& colIndices) {
+  return internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::run(derived(), RowIndices{rowIndices},
+                                                                             colIndices);
+}
+
+template <typename RowIndices, typename ColType, size_t ColSize, typename ColIndices = Array<ColType, ColSize, 1>,
+          EnableOverload<RowIndices, ColIndices> = true>
+IndexedViewType<RowIndices, ColIndices> operator()(const RowIndices& rowIndices, const ColType (&colIndices)[ColSize]) {
+  return internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::run(derived(), rowIndices,
+                                                                             ColIndices{colIndices});
+}
+
+template <typename RowType, size_t RowSize, typename ColType, size_t ColSize,
+          typename RowIndices = Array<RowType, RowSize, 1>, typename ColIndices = Array<ColType, ColSize, 1>,
+          EnableOverload<RowIndices, ColIndices> = true>
+IndexedViewType<RowIndices, ColIndices> operator()(const RowType (&rowIndices)[RowSize],
+                                                   const ColType (&colIndices)[ColSize]) {
+  return internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::run(derived(), RowIndices{rowIndices},
+                                                                             ColIndices{colIndices});
+}
+
+// const versions
+
+template <typename RowIndices, typename ColIndices>
+using ConstIndexedViewType = typename internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::ConstReturnType;
+
+template <typename RowIndices, typename ColIndices, EnableConstOverload<RowIndices, ColIndices> = true>
+ConstIndexedViewType<RowIndices, ColIndices> operator()(const RowIndices& rowIndices,
+                                                        const ColIndices& colIndices) const {
+  return internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::run(derived(), rowIndices, colIndices);
+}
+
+template <typename RowType, size_t RowSize, typename ColIndices, typename RowIndices = Array<RowType, RowSize, 1>,
+          EnableConstOverload<RowIndices, ColIndices> = true>
+ConstIndexedViewType<RowIndices, ColIndices> operator()(const RowType (&rowIndices)[RowSize],
+                                                        const ColIndices& colIndices) const {
+  return internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::run(derived(), RowIndices{rowIndices},
+                                                                             colIndices);
+}
+
+template <typename RowIndices, typename ColType, size_t ColSize, typename ColIndices = Array<ColType, ColSize, 1>,
+          EnableConstOverload<RowIndices, ColIndices> = true>
+ConstIndexedViewType<RowIndices, ColIndices> operator()(const RowIndices& rowIndices,
+                                                        const ColType (&colIndices)[ColSize]) const {
+  return internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::run(derived(), rowIndices,
+                                                                             ColIndices{colIndices});
+}
+
+template <typename RowType, size_t RowSize, typename ColType, size_t ColSize,
+          typename RowIndices = Array<RowType, RowSize, 1>, typename ColIndices = Array<ColType, ColSize, 1>,
+          EnableConstOverload<RowIndices, ColIndices> = true>
+ConstIndexedViewType<RowIndices, ColIndices> operator()(const RowType (&rowIndices)[RowSize],
+                                                        const ColType (&colIndices)[ColSize]) const {
+  return internal::IndexedViewSelector<Derived, RowIndices, ColIndices>::run(derived(), RowIndices{rowIndices},
+                                                                             ColIndices{colIndices});
+}
+
+// Public API for 1D vectors/arrays
+
+// non-const versions
+
+template <typename Indices>
+using VectorIndexedViewType = typename internal::VectorIndexedViewSelector<Derived, Indices>::ReturnType;
+
+template <typename Indices, EnableVectorOverload<Indices> = true>
+VectorIndexedViewType<Indices> operator()(const Indices& indices) {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return internal::VectorIndexedViewSelector<Derived, Indices>::run(derived(), indices);
+}
+
+template <typename IndexType, size_t Size, typename Indices = Array<IndexType, Size, 1>,
+          EnableVectorOverload<Indices> = true>
+VectorIndexedViewType<Indices> operator()(const IndexType (&indices)[Size]) {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return internal::VectorIndexedViewSelector<Derived, Indices>::run(derived(), Indices{indices});
+}
+
+// const versions
+
+template <typename Indices>
+using ConstVectorIndexedViewType = typename internal::VectorIndexedViewSelector<Derived, Indices>::ConstReturnType;
+
+template <typename Indices, EnableConstVectorOverload<Indices> = true>
+ConstVectorIndexedViewType<Indices> operator()(const Indices& indices) const {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return internal::VectorIndexedViewSelector<Derived, Indices>::run(derived(), indices);
+}
+
+template <typename IndexType, size_t Size, typename Indices = Array<IndexType, Size, 1>,
+          EnableConstVectorOverload<Indices> = true>
+ConstVectorIndexedViewType<Indices> operator()(const IndexType (&indices)[Size]) const {
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  return internal::VectorIndexedViewSelector<Derived, Indices>::run(derived(), Indices{indices});
+}
+
+#else  // EIGEN_PARSED_BY_DOXYGEN
+
+/**
+ * \returns a generic submatrix view defined by the rows and columns indexed \a rowIndices and \a colIndices
+ * respectively.
+ *
+ * Each parameter must either be:
+ *  - An integer indexing a single row or column
+ *  - Eigen::placeholders::all indexing the full set of respective rows or columns in increasing order
+ *  - An ArithmeticSequence as returned by the Eigen::seq and Eigen::seqN functions
+ *  - Any %Eigen's vector/array of integers or expressions
+ *  - Plain C arrays: \c int[N]
+ *  - And more generally any type exposing the following two member functions:
+ * \code
+ * <integral type> operator[](<integral type>) const;
+ * <integral type> size() const;
+ * \endcode
+ * where \c <integral \c type>  stands for any integer type compatible with Eigen::Index (i.e. \c std::ptrdiff_t).
+ *
+ * The last statement implies compatibility with \c std::vector, \c std::valarray, \c std::array, many of the Range-v3's
+ * ranges, etc.
+ *
+ * If the submatrix can be represented using a starting position \c (i,j) and positive sizes \c (rows,columns), then
+ * this method will returns a Block object after extraction of the relevant information from the passed arguments. This
+ * is the case when all arguments are either:
+ *  - An integer
+ *  - Eigen::placeholders::all
+ *  - An ArithmeticSequence with compile-time increment strictly equal to 1, as returned by Eigen::seq(a,b), and
+ * Eigen::seqN(a,N).
+ *
+ * Otherwise a more general IndexedView<Derived,RowIndices',ColIndices'> object will be returned, after conversion of
+ * the inputs to more suitable types \c RowIndices' and \c ColIndices'.
+ *
+ * For 1D vectors and arrays, you better use the operator()(const Indices&) overload, which behave the same way but
+ * taking a single parameter.
+ *
+ * See also this <a
+ * href="https://stackoverflow.com/questions/46110917/eigen-replicate-items-along-one-dimension-without-useless-allocations">question</a>
+ * and its answer for an example of how to duplicate coefficients.
+ *
+ * \sa operator()(const Indices&), class Block, class IndexedView, DenseBase::block(Index,Index,Index,Index)
+ */
+template <typename RowIndices, typename ColIndices>
+IndexedView_or_Block operator()(const RowIndices& rowIndices, const ColIndices& colIndices);
+
+/** This is an overload of operator()(const RowIndices&, const ColIndices&) for 1D vectors or arrays
+ *
+ * \only_for_vectors
+ */
+template <typename Indices>
+IndexedView_or_VectorBlock operator()(const Indices& indices);
+
+#endif  // EIGEN_PARSED_BY_DOXYGEN
diff --git a/inst/include/Eigen/src/plugins/InternalHeaderCheck.inc b/inst/include/Eigen/src/plugins/InternalHeaderCheck.inc
new file mode 100644
index 00000000..ac6821d1
--- /dev/null
+++ b/inst/include/Eigen/src/plugins/InternalHeaderCheck.inc
@@ -0,0 +1,3 @@
+#ifndef EIGEN_CORE_MODULE_H
+#error "Please include Eigen/plugins instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/Eigen/src/plugins/MatrixCwiseBinaryOps.h b/inst/include/Eigen/src/plugins/MatrixCwiseBinaryOps.h
deleted file mode 100644
index c4a042b7..00000000
--- a/inst/include/Eigen/src/plugins/MatrixCwiseBinaryOps.h
+++ /dev/null
@@ -1,143 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// This file is a base class plugin containing matrix specifics coefficient wise functions.
-
-/** \returns an expression of the Schur product (coefficient wise product) of *this and \a other
-  *
-  * Example: \include MatrixBase_cwiseProduct.cpp
-  * Output: \verbinclude MatrixBase_cwiseProduct.out
-  *
-  * \sa class CwiseBinaryOp, cwiseAbs2
-  */
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE const EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)
-cwiseProduct(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
-{
-  return EIGEN_CWISE_PRODUCT_RETURN_TYPE(Derived,OtherDerived)(derived(), other.derived());
-}
-
-/** \returns an expression of the coefficient-wise == operator of *this and \a other
-  *
-  * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
-  * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
-  * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
-  * isMuchSmallerThan().
-  *
-  * Example: \include MatrixBase_cwiseEqual.cpp
-  * Output: \verbinclude MatrixBase_cwiseEqual.out
-  *
-  * \sa cwiseNotEqual(), isApprox(), isMuchSmallerThan()
-  */
-template<typename OtherDerived>
-inline const CwiseBinaryOp<std::equal_to<Scalar>, const Derived, const OtherDerived>
-cwiseEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
-{
-  return CwiseBinaryOp<std::equal_to<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
-}
-
-/** \returns an expression of the coefficient-wise != operator of *this and \a other
-  *
-  * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
-  * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
-  * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
-  * isMuchSmallerThan().
-  *
-  * Example: \include MatrixBase_cwiseNotEqual.cpp
-  * Output: \verbinclude MatrixBase_cwiseNotEqual.out
-  *
-  * \sa cwiseEqual(), isApprox(), isMuchSmallerThan()
-  */
-template<typename OtherDerived>
-inline const CwiseBinaryOp<std::not_equal_to<Scalar>, const Derived, const OtherDerived>
-cwiseNotEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
-{
-  return CwiseBinaryOp<std::not_equal_to<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
-}
-
-/** \returns an expression of the coefficient-wise min of *this and \a other
-  *
-  * Example: \include MatrixBase_cwiseMin.cpp
-  * Output: \verbinclude MatrixBase_cwiseMin.out
-  *
-  * \sa class CwiseBinaryOp, max()
-  */
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived>
-cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
-{
-  return CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
-}
-
-/** \returns an expression of the coefficient-wise min of *this and scalar \a other
-  *
-  * \sa class CwiseBinaryOp, min()
-  */
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_min_op<Scalar>, const Derived, const ConstantReturnType>
-cwiseMin(const Scalar &other) const
-{
-  return cwiseMin(Derived::Constant(rows(), cols(), other));
-}
-
-/** \returns an expression of the coefficient-wise max of *this and \a other
-  *
-  * Example: \include MatrixBase_cwiseMax.cpp
-  * Output: \verbinclude MatrixBase_cwiseMax.out
-  *
-  * \sa class CwiseBinaryOp, min()
-  */
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived>
-cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
-{
-  return CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
-}
-
-/** \returns an expression of the coefficient-wise max of *this and scalar \a other
-  *
-  * \sa class CwiseBinaryOp, min()
-  */
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_max_op<Scalar>, const Derived, const ConstantReturnType>
-cwiseMax(const Scalar &other) const
-{
-  return cwiseMax(Derived::Constant(rows(), cols(), other));
-}
-
-
-/** \returns an expression of the coefficient-wise quotient of *this and \a other
-  *
-  * Example: \include MatrixBase_cwiseQuotient.cpp
-  * Output: \verbinclude MatrixBase_cwiseQuotient.out
-  *
-  * \sa class CwiseBinaryOp, cwiseProduct(), cwiseInverse()
-  */
-template<typename OtherDerived>
-EIGEN_STRONG_INLINE const CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>
-cwiseQuotient(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived> &other) const
-{
-  return CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>(derived(), other.derived());
-}
-
-typedef CwiseBinaryOp<internal::scalar_cmp_op<Scalar,internal::cmp_EQ>, const Derived, const ConstantReturnType> CwiseScalarEqualReturnType;
-
-/** \returns an expression of the coefficient-wise == operator of \c *this and a scalar \a s
-  *
-  * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
-  * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
-  * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
-  * isMuchSmallerThan().
-  *
-  * \sa cwiseEqual(const MatrixBase<OtherDerived> &) const
-  */
-inline const CwiseScalarEqualReturnType
-cwiseEqual(const Scalar& s) const
-{
-  return CwiseScalarEqualReturnType(derived(), Derived::Constant(rows(), cols(), s), internal::scalar_cmp_op<Scalar,internal::cmp_EQ>());
-}
diff --git a/inst/include/Eigen/src/plugins/MatrixCwiseBinaryOps.inc b/inst/include/Eigen/src/plugins/MatrixCwiseBinaryOps.inc
new file mode 100644
index 00000000..fae92d8d
--- /dev/null
+++ b/inst/include/Eigen/src/plugins/MatrixCwiseBinaryOps.inc
@@ -0,0 +1,331 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// This file is a base class plugin containing matrix specifics coefficient wise functions.
+
+/** \returns an expression of the Schur product (coefficient wise product) of *this and \a other
+ *
+ * Example: \include MatrixBase_cwiseProduct.cpp
+ * Output: \verbinclude MatrixBase_cwiseProduct.out
+ *
+ * \sa class CwiseBinaryOp, cwiseAbs2
+ */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const EIGEN_CWISE_BINARY_RETURN_TYPE(Derived, OtherDerived, product)
+    cwiseProduct(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return EIGEN_CWISE_BINARY_RETURN_TYPE(Derived, OtherDerived, product)(derived(), other.derived());
+}
+
+template <typename OtherDerived>
+using CwiseBinaryEqualReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>;
+template <typename OtherDerived>
+using CwiseBinaryNotEqualReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>;
+template <typename OtherDerived>
+using CwiseBinaryLessReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const OtherDerived>;
+template <typename OtherDerived>
+using CwiseBinaryGreaterReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const OtherDerived>;
+template <typename OtherDerived>
+using CwiseBinaryLessOrEqualReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const OtherDerived>;
+template <typename OtherDerived>
+using CwiseBinaryGreaterOrEqualReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const OtherDerived>;
+
+/** \returns an expression of the coefficient-wise == operator of *this and \a other
+ *
+ * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
+ * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
+ * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
+ * isMuchSmallerThan().
+ *
+ * Example: \include MatrixBase_cwiseEqual.cpp
+ * Output: \verbinclude MatrixBase_cwiseEqual.out
+ *
+ * \sa cwiseNotEqual(), isApprox(), isMuchSmallerThan()
+ */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline const CwiseBinaryEqualReturnType<OtherDerived> cwiseEqual(
+    const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryEqualReturnType<OtherDerived>(derived(), other.derived());
+}
+
+/** \returns an expression of the coefficient-wise != operator of *this and \a other
+ *
+ * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
+ * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
+ * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
+ * isMuchSmallerThan().
+ *
+ * Example: \include MatrixBase_cwiseNotEqual.cpp
+ * Output: \verbinclude MatrixBase_cwiseNotEqual.out
+ *
+ * \sa cwiseEqual(), isApprox(), isMuchSmallerThan()
+ */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline const CwiseBinaryNotEqualReturnType<OtherDerived> cwiseNotEqual(
+    const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryNotEqualReturnType<OtherDerived>(derived(), other.derived());
+}
+
+/** \returns an expression of the coefficient-wise < operator of *this and \a other */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline const CwiseBinaryLessReturnType<OtherDerived> cwiseLess(
+    const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryLessReturnType<OtherDerived>(derived(), other.derived());
+}
+
+/** \returns an expression of the coefficient-wise > operator of *this and \a other */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline const CwiseBinaryGreaterReturnType<OtherDerived> cwiseGreater(
+    const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryGreaterReturnType<OtherDerived>(derived(), other.derived());
+}
+
+/** \returns an expression of the coefficient-wise <= operator of *this and \a other */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline const CwiseBinaryLessOrEqualReturnType<OtherDerived> cwiseLessOrEqual(
+    const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryLessOrEqualReturnType<OtherDerived>(derived(), other.derived());
+}
+
+/** \returns an expression of the coefficient-wise >= operator of *this and \a other */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC inline const CwiseBinaryGreaterOrEqualReturnType<OtherDerived> cwiseGreaterOrEqual(
+    const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryGreaterOrEqualReturnType<OtherDerived>(derived(), other.derived());
+}
+
+/** \returns an expression of the coefficient-wise min of *this and \a other
+ *
+ * Example: \include MatrixBase_cwiseMin.cpp
+ * Output: \verbinclude MatrixBase_cwiseMin.out
+ *
+ * \sa class CwiseBinaryOp, max()
+ */
+template <int NaNPropagation = PropagateFast, typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+    CwiseBinaryOp<internal::scalar_min_op<Scalar, Scalar, NaNPropagation>, const Derived, const OtherDerived>
+    cwiseMin(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryOp<internal::scalar_min_op<Scalar, Scalar, NaNPropagation>, const Derived, const OtherDerived>(
+      derived(), other.derived());
+}
+
+/** \returns an expression of the coefficient-wise min of *this and scalar \a other
+ *
+ * \sa class CwiseBinaryOp, min()
+ */
+template <int NaNPropagation = PropagateFast>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+    CwiseBinaryOp<internal::scalar_min_op<Scalar, Scalar, NaNPropagation>, const Derived, const ConstantReturnType>
+    cwiseMin(const Scalar& other) const {
+  return cwiseMin<NaNPropagation>(Derived::Constant(rows(), cols(), other));
+}
+
+/** \returns an expression of the coefficient-wise max of *this and \a other
+ *
+ * Example: \include MatrixBase_cwiseMax.cpp
+ * Output: \verbinclude MatrixBase_cwiseMax.out
+ *
+ * \sa class CwiseBinaryOp, min()
+ */
+template <int NaNPropagation = PropagateFast, typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+    CwiseBinaryOp<internal::scalar_max_op<Scalar, Scalar, NaNPropagation>, const Derived, const OtherDerived>
+    cwiseMax(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryOp<internal::scalar_max_op<Scalar, Scalar, NaNPropagation>, const Derived, const OtherDerived>(
+      derived(), other.derived());
+}
+
+/** \returns an expression of the coefficient-wise max of *this and scalar \a other
+ *
+ * \sa class CwiseBinaryOp, min()
+ */
+template <int NaNPropagation = PropagateFast>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+    CwiseBinaryOp<internal::scalar_max_op<Scalar, Scalar, NaNPropagation>, const Derived, const ConstantReturnType>
+    cwiseMax(const Scalar& other) const {
+  return cwiseMax<NaNPropagation>(Derived::Constant(rows(), cols(), other));
+}
+
+/** \returns an expression of the coefficient-wise quotient of *this and \a other
+ *
+ * Example: \include MatrixBase_cwiseQuotient.cpp
+ * Output: \verbinclude MatrixBase_cwiseQuotient.out
+ *
+ * \sa class CwiseBinaryOp, cwiseProduct(), cwiseInverse()
+ */
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+    CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>
+    cwiseQuotient(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>(derived(),
+                                                                                                other.derived());
+}
+
+using CwiseScalarEqualReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const ConstantReturnType>;
+using CwiseScalarNotEqualReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const ConstantReturnType>;
+using CwiseScalarLessReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const ConstantReturnType>;
+using CwiseScalarGreaterReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const ConstantReturnType>;
+using CwiseScalarLessOrEqualReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const ConstantReturnType>;
+using CwiseScalarGreaterOrEqualReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const ConstantReturnType>;
+
+/** \returns an expression of the coefficient-wise == operator of \c *this and a scalar \a s
+ *
+ * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
+ * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
+ * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
+ * isMuchSmallerThan().
+ *
+ * \sa cwiseEqual(const MatrixBase<OtherDerived> &) const
+ */
+EIGEN_DEVICE_FUNC inline const CwiseScalarEqualReturnType cwiseEqual(const Scalar& s) const {
+  return CwiseScalarEqualReturnType(derived(), Derived::Constant(rows(), cols(), s));
+}
+
+/** \returns an expression of the coefficient-wise == operator of \c *this and a scalar \a s
+ *
+ * \warning this performs an exact comparison, which is generally a bad idea with floating-point types.
+ * In order to check for equality between two vectors or matrices with floating-point coefficients, it is
+ * generally a far better idea to use a fuzzy comparison as provided by isApprox() and
+ * isMuchSmallerThan().
+ *
+ * \sa cwiseEqual(const MatrixBase<OtherDerived> &) const
+ */
+EIGEN_DEVICE_FUNC inline const CwiseScalarNotEqualReturnType cwiseNotEqual(const Scalar& s) const {
+  return CwiseScalarNotEqualReturnType(derived(), Derived::Constant(rows(), cols(), s));
+}
+
+/** \returns an expression of the coefficient-wise < operator of \c *this and a scalar \a s */
+EIGEN_DEVICE_FUNC inline const CwiseScalarLessReturnType cwiseLess(const Scalar& s) const {
+  return CwiseScalarLessReturnType(derived(), Derived::Constant(rows(), cols(), s));
+}
+
+/** \returns an expression of the coefficient-wise > operator of \c *this and a scalar \a s */
+EIGEN_DEVICE_FUNC inline const CwiseScalarGreaterReturnType cwiseGreater(const Scalar& s) const {
+  return CwiseScalarGreaterReturnType(derived(), Derived::Constant(rows(), cols(), s));
+}
+
+/** \returns an expression of the coefficient-wise <= operator of \c *this and a scalar \a s */
+EIGEN_DEVICE_FUNC inline const CwiseScalarLessOrEqualReturnType cwiseLessOrEqual(const Scalar& s) const {
+  return CwiseScalarLessOrEqualReturnType(derived(), Derived::Constant(rows(), cols(), s));
+}
+
+/** \returns an expression of the coefficient-wise >= operator of \c *this and a scalar \a s */
+EIGEN_DEVICE_FUNC inline const CwiseScalarGreaterOrEqualReturnType cwiseGreaterOrEqual(const Scalar& s) const {
+  return CwiseScalarGreaterOrEqualReturnType(derived(), Derived::Constant(rows(), cols(), s));
+}
+
+template <typename OtherDerived>
+using CwiseBinaryTypedEqualReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ, true>, const Derived, const OtherDerived>;
+template <typename OtherDerived>
+using CwiseBinaryTypedNotEqualReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ, true>, const Derived, const OtherDerived>;
+template <typename OtherDerived>
+using CwiseBinaryTypedLessReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT, true>, const Derived, const OtherDerived>;
+template <typename OtherDerived>
+using CwiseBinaryTypedGreaterReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT, true>, const Derived, const OtherDerived>;
+template <typename OtherDerived>
+using CwiseBinaryTypedLessOrEqualReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE, true>, const Derived, const OtherDerived>;
+template <typename OtherDerived>
+using CwiseBinaryTypedGreaterOrEqualReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE, true>, const Derived, const OtherDerived>;
+
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryTypedEqualReturnType<OtherDerived> cwiseTypedEqual(
+    const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryTypedEqualReturnType<OtherDerived>(derived(), other.derived());
+}
+
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryTypedNotEqualReturnType<OtherDerived> cwiseTypedNotEqual(
+    const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryTypedNotEqualReturnType<OtherDerived>(derived(), other.derived());
+}
+
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryTypedLessReturnType<OtherDerived> cwiseTypedLess(
+    const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryTypedLessReturnType<OtherDerived>(derived(), other.derived());
+}
+
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryTypedGreaterReturnType<OtherDerived> cwiseTypedGreater(
+    const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryTypedGreaterReturnType<OtherDerived>(derived(), other.derived());
+}
+
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryTypedLessOrEqualReturnType<OtherDerived> cwiseTypedLessOrEqual(
+    const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryTypedLessOrEqualReturnType<OtherDerived>(derived(), other.derived());
+}
+
+template <typename OtherDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseBinaryTypedGreaterOrEqualReturnType<OtherDerived>
+cwiseTypedGreaterOrEqual(const EIGEN_CURRENT_STORAGE_BASE_CLASS<OtherDerived>& other) const {
+  return CwiseBinaryTypedGreaterOrEqualReturnType<OtherDerived>(derived(), other.derived());
+}
+
+using CwiseScalarTypedEqualReturnType = CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ, true>,
+                                                      const Derived, const ConstantReturnType>;
+using CwiseScalarTypedNotEqualReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ, true>, const Derived,
+                  const ConstantReturnType>;
+using CwiseScalarTypedLessReturnType = CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT, true>,
+                                                     const Derived, const ConstantReturnType>;
+using CwiseScalarTypedGreaterReturnType = CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT, true>,
+                                                        const Derived, const ConstantReturnType>;
+using CwiseScalarTypedLessOrEqualReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE, true>, const Derived,
+                  const ConstantReturnType>;
+using CwiseScalarTypedGreaterOrEqualReturnType =
+    CwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE, true>, const Derived,
+                  const ConstantReturnType>;
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseScalarTypedEqualReturnType cwiseTypedEqual(const Scalar& s) const {
+  return CwiseScalarTypedEqualReturnType(derived(), ConstantReturnType(rows(), cols(), s));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseScalarTypedNotEqualReturnType
+cwiseTypedNotEqual(const Scalar& s) const {
+  return CwiseScalarTypedNotEqualReturnType(derived(), ConstantReturnType(rows(), cols(), s));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseScalarTypedLessReturnType cwiseTypedLess(const Scalar& s) const {
+  return CwiseScalarTypedLessReturnType(derived(), ConstantReturnType(rows(), cols(), s));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseScalarTypedGreaterReturnType cwiseTypedGreater(const Scalar& s) const {
+  return CwiseScalarTypedGreaterReturnType(derived(), ConstantReturnType(rows(), cols(), s));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseScalarTypedLessOrEqualReturnType
+cwiseTypedLessOrEqual(const Scalar& s) const {
+  return CwiseScalarTypedLessOrEqualReturnType(derived(), ConstantReturnType(rows(), cols(), s));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseScalarTypedGreaterOrEqualReturnType
+cwiseTypedGreaterOrEqual(const Scalar& s) const {
+  return CwiseScalarTypedGreaterOrEqualReturnType(derived(), ConstantReturnType(rows(), cols(), s));
+}
diff --git a/inst/include/Eigen/src/plugins/MatrixCwiseUnaryOps.h b/inst/include/Eigen/src/plugins/MatrixCwiseUnaryOps.h
deleted file mode 100644
index 8de10935..00000000
--- a/inst/include/Eigen/src/plugins/MatrixCwiseUnaryOps.h
+++ /dev/null
@@ -1,52 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-// This file is a base class plugin containing matrix specifics coefficient wise functions.
-
-/** \returns an expression of the coefficient-wise absolute value of \c *this
-  *
-  * Example: \include MatrixBase_cwiseAbs.cpp
-  * Output: \verbinclude MatrixBase_cwiseAbs.out
-  *
-  * \sa cwiseAbs2()
-  */
-EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
-cwiseAbs() const { return derived(); }
-
-/** \returns an expression of the coefficient-wise squared absolute value of \c *this
-  *
-  * Example: \include MatrixBase_cwiseAbs2.cpp
-  * Output: \verbinclude MatrixBase_cwiseAbs2.out
-  *
-  * \sa cwiseAbs()
-  */
-EIGEN_STRONG_INLINE const CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived>
-cwiseAbs2() const { return derived(); }
-
-/** \returns an expression of the coefficient-wise square root of *this.
-  *
-  * Example: \include MatrixBase_cwiseSqrt.cpp
-  * Output: \verbinclude MatrixBase_cwiseSqrt.out
-  *
-  * \sa cwisePow(), cwiseSquare()
-  */
-inline const CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
-cwiseSqrt() const { return derived(); }
-
-/** \returns an expression of the coefficient-wise inverse of *this.
-  *
-  * Example: \include MatrixBase_cwiseInverse.cpp
-  * Output: \verbinclude MatrixBase_cwiseInverse.out
-  *
-  * \sa cwiseProduct()
-  */
-inline const CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived>
-cwiseInverse() const { return derived(); }
-
diff --git a/inst/include/Eigen/src/plugins/MatrixCwiseUnaryOps.inc b/inst/include/Eigen/src/plugins/MatrixCwiseUnaryOps.inc
new file mode 100644
index 00000000..ffaf5aab
--- /dev/null
+++ b/inst/include/Eigen/src/plugins/MatrixCwiseUnaryOps.inc
@@ -0,0 +1,118 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2006-2008 Benoit Jacob <jacob.benoit.1@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// This file is included into the body of the base classes supporting matrix specific coefficient-wise functions.
+// This include MatrixBase and SparseMatrixBase.
+
+typedef CwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> CwiseAbsReturnType;
+typedef CwiseUnaryOp<internal::scalar_abs2_op<Scalar>, const Derived> CwiseAbs2ReturnType;
+typedef CwiseUnaryOp<internal::scalar_arg_op<Scalar>, const Derived> CwiseArgReturnType;
+typedef CwiseUnaryOp<internal::scalar_carg_op<Scalar>, const Derived> CwiseCArgReturnType;
+typedef CwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived> CwiseSqrtReturnType;
+typedef CwiseUnaryOp<internal::scalar_cbrt_op<Scalar>, const Derived> CwiseCbrtReturnType;
+typedef CwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived> CwiseSquareReturnType;
+typedef CwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived> CwiseSignReturnType;
+typedef CwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived> CwiseInverseReturnType;
+
+/// \returns an expression of the coefficient-wise absolute value of \c *this
+///
+/// Example: \include MatrixBase_cwiseAbs.cpp
+/// Output: \verbinclude MatrixBase_cwiseAbs.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseAbs, absolute value)
+///
+/// \sa cwiseAbs2()
+///
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseAbsReturnType cwiseAbs() const {
+  return CwiseAbsReturnType(derived());
+}
+
+/// \returns an expression of the coefficient-wise squared absolute value of \c *this
+///
+/// Example: \include MatrixBase_cwiseAbs2.cpp
+/// Output: \verbinclude MatrixBase_cwiseAbs2.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseAbs2, squared absolute value)
+///
+/// \sa cwiseAbs()
+///
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseAbs2ReturnType cwiseAbs2() const {
+  return CwiseAbs2ReturnType(derived());
+}
+
+/// \returns an expression of the coefficient-wise square root of *this.
+///
+/// Example: \include MatrixBase_cwiseSqrt.cpp
+/// Output: \verbinclude MatrixBase_cwiseSqrt.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseSqrt, square - root)
+///
+/// \sa cwisePow(), cwiseSquare(), cwiseCbrt()
+///
+EIGEN_DEVICE_FUNC inline const CwiseSqrtReturnType cwiseSqrt() const { return CwiseSqrtReturnType(derived()); }
+
+/// \returns an expression of the coefficient-wise cube root of *this.
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseCbrt, cube - root)
+///
+/// \sa cwiseSqrt(), cwiseSquare(), cwisePow()
+///
+EIGEN_DEVICE_FUNC inline const CwiseCbrtReturnType cwiseCbrt() const { return CwiseCbrtReturnType(derived()); }
+
+/// \returns an expression of the coefficient-wise square of *this.
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseSquare, square)
+///
+/// \sa cwisePow(), cwiseSqrt(), cwiseCbrt()
+///
+EIGEN_DEVICE_FUNC inline const CwiseSquareReturnType cwiseSquare() const { return CwiseSquareReturnType(derived()); }
+
+/// \returns an expression of the coefficient-wise signum of *this.
+///
+/// Example: \include MatrixBase_cwiseSign.cpp
+/// Output: \verbinclude MatrixBase_cwiseSign.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseSign, sign function)
+///
+EIGEN_DEVICE_FUNC inline const CwiseSignReturnType cwiseSign() const { return CwiseSignReturnType(derived()); }
+
+/// \returns an expression of the coefficient-wise inverse of *this.
+///
+/// Example: \include MatrixBase_cwiseInverse.cpp
+/// Output: \verbinclude MatrixBase_cwiseInverse.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseInverse, inverse)
+///
+/// \sa cwiseProduct()
+///
+EIGEN_DEVICE_FUNC inline const CwiseInverseReturnType cwiseInverse() const { return CwiseInverseReturnType(derived()); }
+
+/// \returns an expression of the coefficient-wise phase angle of \c *this
+///
+/// Example: \include MatrixBase_cwiseArg.cpp
+/// Output: \verbinclude MatrixBase_cwiseArg.out
+///
+EIGEN_DOC_UNARY_ADDONS(cwiseArg, arg)
+
+EIGEN_DEVICE_FUNC inline const CwiseArgReturnType cwiseArg() const { return CwiseArgReturnType(derived()); }
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseCArgReturnType cwiseCArg() const {
+  return CwiseCArgReturnType(derived());
+}
+
+template <typename ScalarExponent>
+using CwisePowReturnType =
+    std::enable_if_t<internal::is_arithmetic<typename NumTraits<ScalarExponent>::Real>::value,
+                     CwiseUnaryOp<internal::scalar_unary_pow_op<Scalar, ScalarExponent>, const Derived>>;
+
+template <typename ScalarExponent>
+EIGEN_DEVICE_FUNC inline const CwisePowReturnType<ScalarExponent> cwisePow(const ScalarExponent& exponent) const {
+  return CwisePowReturnType<ScalarExponent>(derived(), internal::scalar_unary_pow_op<Scalar, ScalarExponent>(exponent));
+}
diff --git a/inst/include/Eigen/src/plugins/ReshapedMethods.inc b/inst/include/Eigen/src/plugins/ReshapedMethods.inc
new file mode 100644
index 00000000..c1f90e72
--- /dev/null
+++ b/inst/include/Eigen/src/plugins/ReshapedMethods.inc
@@ -0,0 +1,133 @@
+
+#ifdef EIGEN_PARSED_BY_DOXYGEN
+
+/// \returns an expression of \c *this with reshaped sizes.
+///
+/// \param nRows the number of rows in the reshaped expression, specified at either run-time or compile-time, or
+/// AutoSize \param nCols the number of columns in the reshaped expression, specified at either run-time or
+/// compile-time, or AutoSize \tparam Order specifies whether the coefficients should be processed in column-major-order
+/// (ColMajor), in row-major-order (RowMajor),
+///               or follows the \em natural order of the nested expression (AutoOrder). The default is ColMajor.
+/// \tparam NRowsType the type of the value handling the number of rows, typically Index.
+/// \tparam NColsType the type of the value handling the number of columns, typically Index.
+///
+/// Dynamic size example: \include MatrixBase_reshaped_int_int.cpp
+/// Output: \verbinclude MatrixBase_reshaped_int_int.out
+///
+/// The number of rows \a nRows and columns \a nCols can also be specified at compile-time by passing Eigen::fix<N>,
+/// or Eigen::fix<N>(n) as arguments. In the later case, \c n plays the role of a runtime fallback value in case \c N
+/// equals Eigen::Dynamic. Here is an example with a fixed number of rows and columns: \include
+/// MatrixBase_reshaped_fixed.cpp Output: \verbinclude MatrixBase_reshaped_fixed.out
+///
+/// Finally, one of the sizes parameter can be automatically deduced from the other one by passing AutoSize as in the
+/// following example: \include MatrixBase_reshaped_auto.cpp Output: \verbinclude MatrixBase_reshaped_auto.out AutoSize
+/// does preserve compile-time sizes when possible, i.e., when the sizes of the input are known at compile time \b and
+/// that the other size is passed at compile-time using Eigen::fix<N> as above.
+///
+/// \sa class Reshaped, fix, fix<N>(int)
+///
+template <int Order = ColMajor, typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC inline Reshaped<Derived, ...> reshaped(NRowsType nRows, NColsType nCols);
+
+/// This is the const version of reshaped(NRowsType,NColsType).
+template <int Order = ColMajor, typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC inline const Reshaped<const Derived, ...> reshaped(NRowsType nRows, NColsType nCols) const;
+
+/// \returns an expression of \c *this with columns (or rows) stacked to a linear column vector
+///
+/// \tparam Order specifies whether the coefficients should be processed in column-major-order (ColMajor), in
+/// row-major-order (RowMajor),
+///               or follows the \em natural order of the nested expression (AutoOrder). The default is ColMajor.
+///
+/// This overloads is essentially a shortcut for `A.reshaped<Order>(AutoSize,fix<1>)`.
+///
+/// - If `Order==ColMajor` (the default), then it returns a column-vector from the stacked columns of \c *this.
+/// - If `Order==RowMajor`, then it returns a column-vector from the stacked rows of \c *this.
+/// - If `Order==AutoOrder`, then it returns a column-vector with elements stacked following the storage order of \c
+/// *this.
+///   This mode is the recommended one when the particular ordering of the element is not relevant.
+///
+/// Example:
+/// \include MatrixBase_reshaped_to_vector.cpp
+/// Output: \verbinclude MatrixBase_reshaped_to_vector.out
+///
+/// If you want more control, you can still fall back to reshaped(NRowsType,NColsType).
+///
+/// \sa reshaped(NRowsType,NColsType), class Reshaped
+///
+template <int Order = ColMajor>
+EIGEN_DEVICE_FUNC inline Reshaped<Derived, ...> reshaped();
+
+/// This is the const version of reshaped().
+template <int Order = ColMajor>
+EIGEN_DEVICE_FUNC inline const Reshaped<const Derived, ...> reshaped() const;
+
+#else
+
+// This file is automatically included twice to generate const and non-const versions
+
+#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS
+#define EIGEN_RESHAPED_METHOD_CONST const
+#else
+#define EIGEN_RESHAPED_METHOD_CONST
+#endif
+
+#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS
+
+// This part is included once
+
+#endif
+
+template <typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC inline Reshaped<
+    EIGEN_RESHAPED_METHOD_CONST Derived,
+    internal::get_compiletime_reshape_size<NRowsType, NColsType, SizeAtCompileTime>::value,
+    internal::get_compiletime_reshape_size<NColsType, NRowsType, SizeAtCompileTime>::value>
+reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST {
+  return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,
+                  internal::get_compiletime_reshape_size<NRowsType, NColsType, SizeAtCompileTime>::value,
+                  internal::get_compiletime_reshape_size<NColsType, NRowsType, SizeAtCompileTime>::value>(
+      derived(), internal::get_runtime_reshape_size(nRows, internal::get_runtime_value(nCols), size()),
+      internal::get_runtime_reshape_size(nCols, internal::get_runtime_value(nRows), size()));
+}
+
+template <int Order, typename NRowsType, typename NColsType>
+EIGEN_DEVICE_FUNC inline Reshaped<
+    EIGEN_RESHAPED_METHOD_CONST Derived,
+    internal::get_compiletime_reshape_size<NRowsType, NColsType, SizeAtCompileTime>::value,
+    internal::get_compiletime_reshape_size<NColsType, NRowsType, SizeAtCompileTime>::value,
+    internal::get_compiletime_reshape_order(Flags, Order)>
+reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST {
+  return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,
+                  internal::get_compiletime_reshape_size<NRowsType, NColsType, SizeAtCompileTime>::value,
+                  internal::get_compiletime_reshape_size<NColsType, NRowsType, SizeAtCompileTime>::value,
+                  internal::get_compiletime_reshape_order(Flags, Order)>(
+      derived(), internal::get_runtime_reshape_size(nRows, internal::get_runtime_value(nCols), size()),
+      internal::get_runtime_reshape_size(nCols, internal::get_runtime_value(nRows), size()));
+}
+
+// Views as linear vectors
+
+EIGEN_DEVICE_FUNC inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1> reshaped()
+    EIGEN_RESHAPED_METHOD_CONST {
+  return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1>(derived(), size(), 1);
+}
+
+template <int Order>
+EIGEN_DEVICE_FUNC inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1,
+                                  internal::get_compiletime_reshape_order(Flags, Order)>
+reshaped() EIGEN_RESHAPED_METHOD_CONST {
+  EIGEN_STATIC_ASSERT(Order == RowMajor || Order == ColMajor || Order == AutoOrder, INVALID_TEMPLATE_PARAMETER);
+  return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1,
+                  internal::get_compiletime_reshape_order(Flags, Order)>(derived(), size(), 1);
+}
+
+#undef EIGEN_RESHAPED_METHOD_CONST
+
+#ifndef EIGEN_RESHAPED_METHOD_2ND_PASS
+#define EIGEN_RESHAPED_METHOD_2ND_PASS
+#include "ReshapedMethods.inc"
+#undef EIGEN_RESHAPED_METHOD_2ND_PASS
+#endif
+
+#endif  // EIGEN_PARSED_BY_DOXYGEN
diff --git a/inst/include/RcppEigenCholmod.h b/inst/include/RcppEigenCholmod.h
index 73987d1f..147a2553 100644
--- a/inst/include/RcppEigenCholmod.h
+++ b/inst/include/RcppEigenCholmod.h
@@ -1,6 +1,8 @@
 // -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; tab-width: 8 -*-
 //
-// cholmod.h: selected headers from Tim Davis's CHOLMOD package
+// RcppEigenCholmod.h: Provide access to the Matrix API and in turn
+// to Eigen's CholmodSupport module.  Use of this header relies on
+// compilation of ../../src/RcppEigenStubs.cpp and LinkingTo: Matrix.
 //
 // Copyright (C)      2011 Douglas Bates, Martin Maechler, Dirk Eddelbuettel and Romain Francois
 //
@@ -19,1050 +21,15 @@
 // You should have received a copy of the GNU General Public License
 // along with RcppEigen.  If not, see <http://www.gnu.org/licenses/>.
 
-#ifndef RcppEigen_CHOLMOD_H
-#define RcppEigen_CHOLMOD_H
+#ifndef RcppEigen__RcppEigenCholmod__h
+#define RcppEigen__RcppEigenCholmod__h
 
-#ifdef	__cplusplus
-extern "C" {
+#include <Matrix.h>
+#ifndef R_MATRIX_CHOLMOD /* Matrix <= 1.6-1.1 */
+# define R_MATRIX_CHOLMOD(_NAME_) M_cholmod_ ## _NAME_
+# define M_cholmod_start M_R_cholmod_start /* sigh */
 #endif
 
-// from Matrix/src/SuiteSparse_config/SuiteSparse_config.h - line 51 :
-#ifndef SuiteSparse_long
+#include <Eigen/CholmodSupport>
 
-#ifdef _WIN64
-
-#define SuiteSparse_long __int64
-#define SuiteSparse_long_max _I64_MAX
-#define SuiteSparse_long_idd "I64d"
-
-#else
-
-#define SuiteSparse_long long
-#define SuiteSparse_long_max LONG_MAX
-#define SuiteSparse_long_idd "ld"
-
-#endif
-#define SuiteSparse_long_id "%" SuiteSparse_long_idd
 #endif
-
-/* For backward compatibility with prior versions of SuiteSparse.  The UF_*
- * macros are deprecated and will be removed in a future version. */
-#ifndef UF_long
-#define UF_long     SuiteSparse_long
-#define UF_long_max SuiteSparse_long_max
-#define UF_long_idd SuiteSparse_long_idd
-#define UF_long_id  SuiteSparse_long_id
-#endif
-
-#define CHOLMOD_HAS_VERSION_FUNCTION
-
-#define CHOLMOD_DATE "April 25, 2013"
-#define CHOLMOD_VER_CODE(main,sub) ((main) * 1000 + (sub))
-#define CHOLMOD_MAIN_VERSION 2
-#define CHOLMOD_SUB_VERSION 1
-#define CHOLMOD_SUBSUB_VERSION 2
-#define CHOLMOD_VERSION \
-    CHOLMOD_VER_CODE(CHOLMOD_MAIN_VERSION,CHOLMOD_SUB_VERSION)
-// from ../../src/CHOLMOD/Include/cholmod_core.h - line 275 :
-/* Each CHOLMOD object has its own type code. */
-
-#define CHOLMOD_COMMON 0
-#define CHOLMOD_SPARSE 1
-#define CHOLMOD_FACTOR 2
-#define CHOLMOD_DENSE 3
-#define CHOLMOD_TRIPLET 4
-
-/* ========================================================================== */
-/* === CHOLMOD Common ======================================================= */
-/* ========================================================================== */
-
-/* itype defines the types of integer used: */
-#define CHOLMOD_INT 0		/* all integer arrays are int */
-#define CHOLMOD_INTLONG 1	/* most are int, some are SuiteSparse_long */
-#define CHOLMOD_LONG 2		/* all integer arrays are SuiteSparse_long */
-
-/* The itype of all parameters for all CHOLMOD routines must match.
- * FUTURE WORK: CHOLMOD_INTLONG is not yet supported.
- */
-
-/* dtype defines what the numerical type is (double or float): */
-#define CHOLMOD_DOUBLE 0	/* all numerical values are double */
-#define CHOLMOD_SINGLE 1	/* all numerical values are float */
-
-/* The dtype of all parameters for all CHOLMOD routines must match.
- *
- * Scalar floating-point values are always passed as double arrays of size 2
- * (for the real and imaginary parts).  They are typecast to float as needed.
- * FUTURE WORK: the float case is not supported yet.
- */
-
-/* xtype defines the kind of numerical values used: */
-#define CHOLMOD_PATTERN 0	/* pattern only, no numerical values */
-#define CHOLMOD_REAL 1		/* a real matrix */
-#define CHOLMOD_COMPLEX 2	/* a complex matrix (ANSI C99 compatible) */
-#define CHOLMOD_ZOMPLEX 3	/* a complex matrix (MATLAB compatible) */
-
-/* Definitions for cholmod_common: */
-#define CHOLMOD_MAXMETHODS 9	/* maximum number of different methods that */
-				/* cholmod_analyze can try. Must be >= 9. */
-
-/* Common->status values.  zero means success, negative means a fatal error,
- * positive is a warning. */
-#define CHOLMOD_OK 0			/* success */
-#define CHOLMOD_NOT_INSTALLED (-1)	/* failure: method not installed */
-#define CHOLMOD_OUT_OF_MEMORY (-2)	/* failure: out of memory */
-#define CHOLMOD_TOO_LARGE (-3)		/* failure: integer overflow occured */
-#define CHOLMOD_INVALID (-4)		/* failure: invalid input */
-#define CHOLMOD_GPU_PROBLEM (-5)        /* failure: GPU fatal error */
-#define CHOLMOD_NOT_POSDEF (1)		/* warning: matrix not pos. def. */
-#define CHOLMOD_DSMALL (2)		/* warning: D for LDL'  or diag(L) or */
-					/* LL' has tiny absolute value */
-
-/* ordering method (also used for L->ordering) */
-#define CHOLMOD_NATURAL 0	/* use natural ordering */
-#define CHOLMOD_GIVEN 1		/* use given permutation */
-#define CHOLMOD_AMD 2		/* use minimum degree (AMD) */
-#define CHOLMOD_METIS 3		/* use METIS' nested dissection */
-#define CHOLMOD_NESDIS 4	/* use CHOLMOD's version of nested dissection:*/
-				/* node bisector applied recursively, followed
-				 * by constrained minimum degree (CSYMAMD or
-				 * CCOLAMD) */
-#define CHOLMOD_COLAMD 5	/* use AMD for A, COLAMD for A*A' */
-
-/* POSTORDERED is not a method, but a result of natural ordering followed by a
- * weighted postorder.  It is used for L->ordering, not method [ ].ordering. */
-#define CHOLMOD_POSTORDERED 6	/* natural ordering, postordered. */
-
-/* supernodal strategy (for Common->supernodal) */
-#define CHOLMOD_SIMPLICIAL 0	/* always do simplicial */
-#define CHOLMOD_AUTO 1		/* select simpl/super depending on matrix */
-#define CHOLMOD_SUPERNODAL 2	/* always do supernodal */
-
-typedef struct cholmod_common_struct
-{
-    /* ---------------------------------------------------------------------- */
-    /* parameters for symbolic/numeric factorization and update/downdate */
-    /* ---------------------------------------------------------------------- */
-
-    double dbound ;	/* Smallest absolute value of diagonal entries of D
-			 * for LDL' factorization and update/downdate/rowadd/
-	* rowdel, or the diagonal of L for an LL' factorization.
-	* Entries in the range 0 to dbound are replaced with dbound.
-	* Entries in the range -dbound to 0 are replaced with -dbound.  No
-	* changes are made to the diagonal if dbound <= 0.  Default: zero */
-
-    double grow0 ;	/* For a simplicial factorization, L->i and L->x can
-			 * grow if necessary.  grow0 is the factor by which
-	* it grows.  For the initial space, L is of size MAX (1,grow0) times
-	* the required space.  If L runs out of space, the new size of L is
-	* MAX(1.2,grow0) times the new required space.   If you do not plan on
-	* modifying the LDL' factorization in the Modify module, set grow0 to
-	* zero (or set grow2 to 0, see below).  Default: 1.2 */
-
-    double grow1 ;
-
-    size_t grow2 ;	/* For a simplicial factorization, each column j of L
-			 * is initialized with space equal to
-	* grow1*L->ColCount[j] + grow2.  If grow0 < 1, grow1 < 1, or grow2 == 0,
-	* then the space allocated is exactly equal to L->ColCount[j].  If the
-	* column j runs out of space, it increases to grow1*need + grow2 in
-	* size, where need is the total # of nonzeros in that column.  If you do
-	* not plan on modifying the factorization in the Modify module, set
-	* grow2 to zero.  Default: grow1 = 1.2, grow2 = 5. */
-
-    size_t maxrank ;	/* rank of maximum update/downdate.  Valid values:
-			 * 2, 4, or 8.  A value < 2 is set to 2, and a
-	* value > 8 is set to 8.  It is then rounded up to the next highest
-	* power of 2, if not already a power of 2.  Workspace (Xwork, below) of
-	* size nrow-by-maxrank double's is allocated for the update/downdate.
-	* If an update/downdate of rank-k is requested, with k > maxrank,
-	* it is done in steps of maxrank.  Default: 8, which is fastest.
-	* Memory usage can be reduced by setting maxrank to 2 or 4.
-	*/
-
-    double supernodal_switch ;	/* supernodal vs simplicial factorization */
-    int supernodal ;		/* If Common->supernodal <= CHOLMOD_SIMPLICIAL
-				 * (0) then cholmod_analyze performs a
-	* simplicial analysis.  If >= CHOLMOD_SUPERNODAL (2), then a supernodal
-	* analysis is performed.  If == CHOLMOD_AUTO (1) and
-	* flop/nnz(L) < Common->supernodal_switch, then a simplicial analysis
-	* is done.  A supernodal analysis done otherwise.
-	* Default:  CHOLMOD_AUTO.  Default supernodal_switch = 40 */
-
-    int final_asis ;	/* If TRUE, then ignore the other final_* parameters
-			 * (except for final_pack).
-			 * The factor is left as-is when done.  Default: TRUE.*/
-
-    int final_super ;	/* If TRUE, leave a factor in supernodal form when
-			 * supernodal factorization is finished.  If FALSE,
-			 * then convert to a simplicial factor when done.
-			 * Default: TRUE */
-
-    int final_ll ;	/* If TRUE, leave factor in LL' form when done.
-			 * Otherwise, leave in LDL' form.  Default: FALSE */
-
-    int final_pack ;	/* If TRUE, pack the columns when done.  If TRUE, and
-			 * cholmod_factorize is called with a symbolic L, L is
-	* allocated with exactly the space required, using L->ColCount.  If you
-	* plan on modifying the factorization, set Common->final_pack to FALSE,
-	* and each column will be given a little extra slack space for future
-	* growth in fill-in due to updates.  Default: TRUE */
-
-    int final_monotonic ;   /* If TRUE, ensure columns are monotonic when done.
-			 * Default: TRUE */
-
-    int final_resymbol ;/* if cholmod_factorize performed a supernodal
-			 * factorization, final_resymbol is true, and
-	* final_super is FALSE (convert a simplicial numeric factorization),
-	* then numerically zero entries that resulted from relaxed supernodal
-	* amalgamation are removed.  This does not remove entries that are zero
-	* due to exact numeric cancellation, since doing so would break the
-	* update/downdate rowadd/rowdel routines.  Default: FALSE. */
-
-    /* supernodal relaxed amalgamation parameters: */
-    double zrelax [3] ;
-    size_t nrelax [3] ;
-
-	/* Let ns be the total number of columns in two adjacent supernodes.
-	 * Let z be the fraction of zero entries in the two supernodes if they
-	 * are merged (z includes zero entries from prior amalgamations).  The
-	 * two supernodes are merged if:
-	 *    (ns <= nrelax [0]) || (no new zero entries added) ||
-	 *    (ns <= nrelax [1] && z < zrelax [0]) ||
-	 *    (ns <= nrelax [2] && z < zrelax [1]) || (z < zrelax [2])
-	 *
-	 * Default parameters result in the following rule:
-	 *    (ns <= 4) || (no new zero entries added) ||
-	 *    (ns <= 16 && z < 0.8) || (ns <= 48 && z < 0.1) || (z < 0.05)
-	 */
-
-    int prefer_zomplex ;    /* X = cholmod_solve (sys, L, B, Common) computes
-			     * x=A\b or solves a related system.  If L and B are
-	 * both real, then X is real.  Otherwise, X is returned as
-	 * CHOLMOD_COMPLEX if Common->prefer_zomplex is FALSE, or
-	 * CHOLMOD_ZOMPLEX if Common->prefer_zomplex is TRUE.  This parameter
-	 * is needed because there is no supernodal zomplex L.  Suppose the
-	 * caller wants all complex matrices to be stored in zomplex form
-	 * (MATLAB, for example).  A supernodal L is returned in complex form
-	 * if A is zomplex.  B can be real, and thus X = cholmod_solve (L,B)
-	 * should return X as zomplex.  This cannot be inferred from the input
-	 * arguments L and B.  Default: FALSE, since all data types are
-	 * supported in CHOLMOD_COMPLEX form and since this is the native type
-	 * of LAPACK and the BLAS.  Note that the MATLAB/cholmod.c mexFunction
-	 * sets this parameter to TRUE, since MATLAB matrices are in
-	 * CHOLMOD_ZOMPLEX form.
-	 */
-
-    int prefer_upper ;	    /* cholmod_analyze and cholmod_factorize work
-			     * fastest when a symmetric matrix is stored in
-	 * upper triangular form when a fill-reducing ordering is used.  In
-	 * MATLAB, this corresponds to how x=A\b works.  When the matrix is
-	 * ordered as-is, they work fastest when a symmetric matrix is in lower
-	 * triangular form.  In MATLAB, R=chol(A) does the opposite.  This
-	 * parameter affects only how cholmod_read returns a symmetric matrix.
-	 * If TRUE (the default case), a symmetric matrix is always returned in
-	 * upper-triangular form (A->stype = 1).  */
-
-    int quick_return_if_not_posdef ;	/* if TRUE, the supernodal numeric
-					 * factorization will return quickly if
-	* the matrix is not positive definite.  Default: FALSE. */
-
-    /* ---------------------------------------------------------------------- */
-    /* printing and error handling options */
-    /* ---------------------------------------------------------------------- */
-
-    int print ;		/* print level. Default: 3 */
-    int precise ;	/* if TRUE, print 16 digits.  Otherwise print 5 */
-    int (*print_function) (const char *, ...) ;	/* pointer to printf */
-
-    int try_catch ;	/* if TRUE, then ignore errors; CHOLMOD is in the middle
-			 * of a try/catch block.  No error message is printed
-	 * and the Common->error_handler function is not called. */
-
-    void (*error_handler) (int status, const char *file,
-        int line, const char *message) ;
-
-	/* Common->error_handler is the user's error handling routine.  If not
-	 * NULL, this routine is called if an error occurs in CHOLMOD.  status
-	 * can be CHOLMOD_OK (0), negative for a fatal error, and positive for
-	 * a warning. file is a string containing the name of the source code
-	 * file where the error occured, and line is the line number in that
-	 * file.  message is a string describing the error in more detail. */
-
-    /* ---------------------------------------------------------------------- */
-    /* ordering options */
-    /* ---------------------------------------------------------------------- */
-
-    /* The cholmod_analyze routine can try many different orderings and select
-     * the best one.  It can also try one ordering method multiple times, with
-     * different parameter settings.  The default is to use three orderings,
-     * the user's permutation (if provided), AMD which is the fastest ordering
-     * and generally gives good fill-in, and METIS.  CHOLMOD's nested dissection
-     * (METIS with a constrained AMD) usually gives a better ordering than METIS
-     * alone (by about 5% to 10%) but it takes more time.
-     *
-     * If you know the method that is best for your matrix, set Common->nmethods
-     * to 1 and set Common->method [0] to the set of parameters for that method.
-     * If you set it to 1 and do not provide a permutation, then only AMD will
-     * be called.
-     *
-     * If METIS is not available, the default # of methods tried is 2 (the user
-     * permutation, if any, and AMD).
-     *
-     * To try other methods, set Common->nmethods to the number of methods you
-     * want to try.  The suite of default methods and their parameters is
-     * described in the cholmod_defaults routine, and summarized here:
-     *
-     *	    Common->method [i]:
-     *	    i = 0: user-provided ordering (cholmod_analyze_p only)
-     *	    i = 1: AMD (for both A and A*A')
-     *	    i = 2: METIS
-     *	    i = 3: CHOLMOD's nested dissection (NESDIS), default parameters
-     *	    i = 4: natural
-     *	    i = 5: NESDIS with nd_small = 20000
-     *	    i = 6: NESDIS with nd_small = 4, no constrained minimum degree
-     *	    i = 7: NESDIS with no dense node removal
-     *	    i = 8: AMD for A, COLAMD for A*A'
-     *
-     * You can modify the suite of methods you wish to try by modifying
-     * Common.method [...] after calling cholmod_start or cholmod_defaults.
-     *
-     * For example, to use AMD, followed by a weighted postordering:
-     *
-     *	    Common->nmethods = 1 ;
-     *	    Common->method [0].ordering = CHOLMOD_AMD ;
-     *	    Common->postorder = TRUE ;
-     *
-     * To use the natural ordering (with no postordering):
-     *
-     *	    Common->nmethods = 1 ;
-     *	    Common->method [0].ordering = CHOLMOD_NATURAL ;
-     *	    Common->postorder = FALSE ;
-     *
-     * If you are going to factorize hundreds or more matrices with the same
-     * nonzero pattern, you may wish to spend a great deal of time finding a
-     * good permutation.  In this case, try setting Common->nmethods to 9.
-     * The time spent in cholmod_analysis will be very high, but you need to
-     * call it only once.
-     *
-     * cholmod_analyze sets Common->current to a value between 0 and nmethods-1.
-     * Each ordering method uses the set of options defined by this parameter.
-     */
-
-    int nmethods ;	/* The number of ordering methods to try.  Default: 0.
-			 * nmethods = 0 is a special case.  cholmod_analyze
-	* will try the user-provided ordering (if given) and AMD.  Let fl and
-	* lnz be the flop count and nonzeros in L from AMD's ordering.  Let
-	* anz be the number of nonzeros in the upper or lower triangular part
-	* of the symmetric matrix A.  If fl/lnz < 500 or lnz/anz < 5, then this
-	* is a good ordering, and METIS is not attempted.  Otherwise, METIS is
-	* tried.   The best ordering found is used.  If nmethods > 0, the
-	* methods used are given in the method[ ] array, below.  The first
-	* three methods in the default suite of orderings is (1) use the given
-	* permutation (if provided), (2) use AMD, and (3) use METIS.  Maximum
-	* allowed value is CHOLMOD_MAXMETHODS.  */
-
-    int current ;	/* The current method being tried.  Default: 0.  Valid
-			 * range is 0 to nmethods-1. */
-
-    int selected ;	/* The best method found. */
-
-    /* The suite of ordering methods and parameters: */
-
-    struct cholmod_method_struct
-    {
-	/* statistics for this method */
-	double lnz ;	    /* nnz(L) excl. zeros from supernodal amalgamation,
-			     * for a "pure" L */
-
-	double fl ;	    /* flop count for a "pure", real simplicial LL'
-			     * factorization, with no extra work due to
-	    * amalgamation.  Subtract n to get the LDL' flop count.   Multiply
-	    * by about 4 if the matrix is complex or zomplex. */
-
-	/* ordering method parameters */
-	double prune_dense ;/* dense row/col control for AMD, SYMAMD, CSYMAMD,
-			     * and NESDIS (cholmod_nested_dissection).  For a
-	    * symmetric n-by-n matrix, rows/columns with more than
-	    * MAX (16, prune_dense * sqrt (n)) entries are removed prior to
-	    * ordering.  They appear at the end of the re-ordered matrix.
-	    *
-	    * If prune_dense < 0, only completely dense rows/cols are removed.
-	    *
-	    * This paramater is also the dense column control for COLAMD and
-	    * CCOLAMD.  For an m-by-n matrix, columns with more than
-	    * MAX (16, prune_dense * sqrt (MIN (m,n))) entries are removed prior
-	    * to ordering.  They appear at the end of the re-ordered matrix.
-	    * CHOLMOD factorizes A*A', so it calls COLAMD and CCOLAMD with A',
-	    * not A.  Thus, this parameter affects the dense *row* control for
-	    * CHOLMOD's matrix, and the dense *column* control for COLAMD and
-	    * CCOLAMD.
-	    *
-	    * Removing dense rows and columns improves the run-time of the
-	    * ordering methods.  It has some impact on ordering quality
-	    * (usually minimal, sometimes good, sometimes bad).
-	    *
-	    * Default: 10. */
-
-	double prune_dense2 ;/* dense row control for COLAMD and CCOLAMD.
-			    *  Rows with more than MAX (16, dense2 * sqrt (n))
-	    * for an m-by-n matrix are removed prior to ordering.  CHOLMOD's
-	    * matrix is transposed before ordering it with COLAMD or CCOLAMD,
-	    * so this controls the dense *columns* of CHOLMOD's matrix, and
-	    * the dense *rows* of COLAMD's or CCOLAMD's matrix.
-	    *
-	    * If prune_dense2 < 0, only completely dense rows/cols are removed.
-	    *
-	    * Default: -1.  Note that this is not the default for COLAMD and
-	    * CCOLAMD.  -1 is best for Cholesky.  10 is best for LU.  */
-
-	double nd_oksep ;   /* in NESDIS, when a node separator is computed, it
-			     * discarded if nsep >= nd_oksep*n, where nsep is
-	    * the number of nodes in the separator, and n is the size of the
-	    * graph being cut.  Valid range is 0 to 1.  If 1 or greater, the
-	    * separator is discarded if it consists of the entire graph.
-	    * Default: 1 */
-
-	double other1 [4] ; /* future expansion */
-
-	size_t nd_small ;    /* do not partition graphs with fewer nodes than
-			     * nd_small, in NESDIS.  Default: 200 (same as
-			     * METIS) */
-
-	size_t other2 [4] ; /* future expansion */
-
-	int aggressive ;    /* Aggresive absorption in AMD, COLAMD, SYMAMD,
-			     * CCOLAMD, and CSYMAMD.  Default: TRUE */
-
-	int order_for_lu ;  /* CCOLAMD can be optimized to produce an ordering
-			     * for LU or Cholesky factorization.  CHOLMOD only
-	    * performs a Cholesky factorization.  However, you may wish to use
-	    * CHOLMOD as an interface for CCOLAMD but use it for your own LU
-	    * factorization.  In this case, order_for_lu should be set to FALSE.
-	    * When factorizing in CHOLMOD itself, you should *** NEVER *** set
-	    * this parameter FALSE.  Default: TRUE. */
-
-	int nd_compress ;   /* If TRUE, compress the graph and subgraphs before
-			     * partitioning them in NESDIS.  Default: TRUE */
-
-	int nd_camd ;	    /* If 1, follow the nested dissection ordering
-			     * with a constrained minimum degree ordering that
-	    * respects the partitioning just found (using CAMD).  If 2, use
-	    * CSYMAMD instead.  If you set nd_small very small, you may not need
-	    * this ordering, and can save time by setting it to zero (no
-	    * constrained minimum degree ordering).  Default: 1. */
-
-	int nd_components ; /* The nested dissection ordering finds a node
-			     * separator that splits the graph into two parts,
-	    * which may be unconnected.  If nd_components is TRUE, each of
-	    * these connected components is split independently.  If FALSE,
-	    * each part is split as a whole, even if it consists of more than
-	    * one connected component.  Default: FALSE */
-
-	/* fill-reducing ordering to use */
-	int ordering ;
-
-	size_t other3 [4] ; /* future expansion */
-
-    } method [CHOLMOD_MAXMETHODS + 1] ;
-
-    int postorder ;	/* If TRUE, cholmod_analyze follows the ordering with a
-			 * weighted postorder of the elimination tree.  Improves
-	* supernode amalgamation.  Does not affect fundamental nnz(L) and
-	* flop count.  Default: TRUE. */
-
-    /* ---------------------------------------------------------------------- */
-    /* memory management routines */
-    /* ---------------------------------------------------------------------- */
-
-    void *(*malloc_memory) (size_t) ;		/* pointer to malloc */
-    void *(*realloc_memory) (void *, size_t) ;  /* pointer to realloc */
-    void (*free_memory) (void *) ;		/* pointer to free */
-    void *(*calloc_memory) (size_t, size_t) ;	/* pointer to calloc */
-
-    /* ---------------------------------------------------------------------- */
-    /* routines for complex arithmetic */
-    /* ---------------------------------------------------------------------- */
-
-    int (*complex_divide) (double ax, double az, double bx, double bz,
-	    double *cx, double *cz) ;
-
-	/* flag = complex_divide (ax, az, bx, bz, &cx, &cz) computes the complex
-	 * division c = a/b, where ax and az hold the real and imaginary part
-	 * of a, and b and c are stored similarly.  flag is returned as 1 if
-	 * a divide-by-zero occurs, or 0 otherwise.  By default, the function
-	 * pointer Common->complex_divide is set equal to cholmod_divcomplex.
-	 */
-
-    double (*hypotenuse) (double x, double y) ;
-
-	/* s = hypotenuse (x,y) computes s = sqrt (x*x + y*y), but does so more
-	 * accurately.  By default, the function pointer Common->hypotenuse is
-	 * set equal to cholmod_hypot.  See also the hypot function in the C99
-	 * standard, which has an identical syntax and function.  If you have
-	 * a C99-compliant compiler, you can set Common->hypotenuse = hypot.  */
-
-    /* ---------------------------------------------------------------------- */
-    /* METIS workarounds */
-    /* ---------------------------------------------------------------------- */
-
-    double metis_memory ;   /* This is a parameter for CHOLMOD's interface to
-			     * METIS, not a parameter to METIS itself.  METIS
-	* uses an amount of memory that is difficult to estimate precisely
-	* beforehand.  If it runs out of memory, it terminates your program.
-	* All routines in CHOLMOD except for CHOLMOD's interface to METIS
-	* return an error status and safely return to your program if they run
-	* out of memory.  To mitigate this problem, the CHOLMOD interface
-	* can allocate a single block of memory equal in size to an empirical
-	* upper bound of METIS's memory usage times the Common->metis_memory
-	* parameter, and then immediately free it.  It then calls METIS.  If
-	* this pre-allocation fails, it is possible that METIS will fail as
-	* well, and so CHOLMOD returns with an out-of-memory condition without
-	* calling METIS.
-	*
-	* METIS_NodeND (used in the CHOLMOD_METIS ordering option) with its
-	* default parameter settings typically uses about (4*nz+40n+4096)
-	* times sizeof(int) memory, where nz is equal to the number of entries
-	* in A for the symmetric case or AA' if an unsymmetric matrix is
-	* being ordered (where nz includes both the upper and lower parts
-	* of A or AA').  The observed "upper bound" (with 2 exceptions),
-	* measured in an instrumented copy of METIS 4.0.1 on thousands of
-	* matrices, is (10*nz+50*n+4096) * sizeof(int).  Two large matrices
-	* exceeded this bound, one by almost a factor of 2 (Gupta/gupta2).
-	*
-	* If your program is terminated by METIS, try setting metis_memory to
-	* 2.0, or even higher if needed.  By default, CHOLMOD assumes that METIS
-	* does not have this problem (so that CHOLMOD will work correctly when
-	* this issue is fixed in METIS).  Thus, the default value is zero.
-	* This work-around is not guaranteed anyway.
-	*
-	* If a matrix exceeds this predicted memory usage, AMD is attempted
-	* instead.  It, too, may run out of memory, but if it does so it will
-	* not terminate your program.
-	*/
-
-    double metis_dswitch ;	/* METIS_NodeND in METIS 4.0.1 gives a seg */
-    size_t metis_nswitch ;	/* fault with one matrix of order n = 3005 and
-				 * nz = 6,036,025.  This is a very dense graph.
-     * The workaround is to use AMD instead of METIS for matrices of dimension
-     * greater than Common->metis_nswitch (default 3000) or more and with
-     * density of Common->metis_dswitch (default 0.66) or more.
-     * cholmod_nested_dissection has no problems with the same matrix, even
-     * though it uses METIS_NodeComputeSeparator on this matrix.  If this
-     * seg fault does not affect you, set metis_nswitch to zero or less,
-     * and CHOLMOD will not switch to AMD based just on the density of the
-     * matrix (it will still switch to AMD if the metis_memory parameter
-     * causes the switch).
-     */
-
-    /* ---------------------------------------------------------------------- */
-    /* workspace */
-    /* ---------------------------------------------------------------------- */
-
-    /* CHOLMOD has several routines that take less time than the size of
-     * workspace they require.  Allocating and initializing the workspace would
-     * dominate the run time, unless workspace is allocated and initialized
-     * just once.  CHOLMOD allocates this space when needed, and holds it here
-     * between calls to CHOLMOD.  cholmod_start sets these pointers to NULL
-     * (which is why it must be the first routine called in CHOLMOD).
-     * cholmod_finish frees the workspace (which is why it must be the last
-     * call to CHOLMOD).
-     */
-
-    size_t nrow ;	/* size of Flag and Head */
-    UF_long mark ;	/* mark value for Flag array */
-    size_t iworksize ;	/* size of Iwork.  Upper bound: 6*nrow+ncol */
-    size_t xworksize ;	/* size of Xwork,  in bytes.
-			 * maxrank*nrow*sizeof(double) for update/downdate.
-			 * 2*nrow*sizeof(double) otherwise */
-
-    /* initialized workspace: contents needed between calls to CHOLMOD */
-    void *Flag ;	/* size nrow, an integer array.  Kept cleared between
-			 * calls to cholmod rouines (Flag [i] < mark) */
-
-    void *Head ;	/* size nrow+1, an integer array. Kept cleared between
-			 * calls to cholmod routines (Head [i] = EMPTY) */
-
-    void *Xwork ; 	/* a double array.  Its size varies.  It is nrow for
-			 * most routines (cholmod_rowfac, cholmod_add,
-	* cholmod_aat, cholmod_norm, cholmod_ssmult) for the real case, twice
-	* that when the input matrices are complex or zomplex.  It is of size
-	* 2*nrow for cholmod_rowadd and cholmod_rowdel.  For cholmod_updown,
-	* its size is maxrank*nrow where maxrank is 2, 4, or 8.  Kept cleared
-	* between calls to cholmod (set to zero). */
-
-    /* uninitialized workspace, contents not needed between calls to CHOLMOD */
-    void *Iwork ;	/* size iworksize, 2*nrow+ncol for most routines,
-			 * up to 6*nrow+ncol for cholmod_analyze. */
-
-    int itype ;		/* If CHOLMOD_LONG, Flag, Head, and Iwork are UF_long.
-			 * Otherwise all three arrays are int. */
-
-    int dtype ;		/* double or float */
-
-	/* Common->itype and Common->dtype are used to define the types of all
-	 * sparse matrices, triplet matrices, dense matrices, and factors
-	 * created using this Common struct.  The itypes and dtypes of all
-	 * parameters to all CHOLMOD routines must match.  */
-
-    int no_workspace_reallocate ;   /* this is an internal flag, used as a
-	* precaution by cholmod_analyze.  It is normally false.  If true,
-	* cholmod_allocate_work is not allowed to reallocate any workspace;
-	* they must use the existing workspace in Common (Iwork, Flag, Head,
-	* and Xwork).  Added for CHOLMOD v1.1 */
-
-    /* ---------------------------------------------------------------------- */
-    /* statistics */
-    /* ---------------------------------------------------------------------- */
-
-    /* fl and lnz are set only in cholmod_analyze and cholmod_rowcolcounts,
-     * in the Cholesky modudle.  modfl is set only in the Modify module. */
-
-    int status ;	    /* error code */
-    double fl ;		    /* LL' flop count from most recent analysis */
-    double lnz ;	    /* fundamental nz in L */
-    double anz ;	    /* nonzeros in tril(A) if A is symmetric/lower,
-			     * triu(A) if symmetric/upper, or tril(A*A') if
-			     * unsymmetric, in last call to cholmod_analyze. */
-    double modfl ;	    /* flop count from most recent update/downdate/
-			     * rowadd/rowdel (excluding flops to modify the
-			     * solution to Lx=b, if computed) */
-    size_t malloc_count ;   /* # of objects malloc'ed minus the # free'd*/
-    size_t memory_usage ;   /* peak memory usage in bytes */
-    size_t memory_inuse ;   /* current memory usage in bytes */
-
-    double nrealloc_col ;   /* # of column reallocations */
-    double nrealloc_factor ;/* # of factor reallocations due to col. reallocs */
-    double ndbounds_hit ;   /* # of times diagonal modified by dbound */
-
-    double rowfacfl ;	    /* # of flops in last call to cholmod_rowfac */
-    double aatfl ;	    /* # of flops to compute A(:,f)*A(:,f)' */
-
-    /* ---------------------------------------------------------------------- */
-    /* future expansion */
-    /* ---------------------------------------------------------------------- */
-
-    /* To allow CHOLMOD to be updated without recompiling the user application,
-     * additional space is set aside here for future statistics, parameters,
-     * and workspace.  Note:  additional entries were added in v1.1 to the
-     * method array, above, and thus v1.0 and v1.1 are not binary compatible.
-     *
-     * v1.1 to the current version are binary compatible.
-     */
-
-    /* ---------------------------------------------------------------------- */
-    double other1 [10] ;
-
-    double SPQR_xstat [4] ;     /* for SuiteSparseQR statistics */
-
-    /* SuiteSparseQR control parameters: */
-    double SPQR_grain ;         /* task size is >= max (total flops / grain) */
-    double SPQR_small ;         /* task size is >= small */
-
-    /* ---------------------------------------------------------------------- */
-    UF_long SPQR_istat [10] ;   /* for SuiteSparseQR statistics */
-    UF_long other2 [6] ;        /* reduced from size 16 in v1.6 */
-
-    /* ---------------------------------------------------------------------- */
-    int other3 [10] ;       /* reduced from size 16 in v1.1. */
-
-    int prefer_binary ;	    /* cholmod_read_triplet converts a symmetric
-			     * pattern-only matrix into a real matrix.  If
-	* prefer_binary is FALSE, the diagonal entries are set to 1 + the degree
-	* of the row/column, and off-diagonal entries are set to -1 (resulting
-	* in a positive definite matrix if the diagonal is zero-free).  Most
-	* symmetric patterns are the pattern a positive definite matrix.  If
-	* this parameter is TRUE, then the matrix is returned with a 1 in each
-	* entry, instead.  Default: FALSE.  Added in v1.3. */
-
-    /* control parameter (added for v1.2): */
-    int default_nesdis ;    /* Default: FALSE.  If FALSE, then the default
-			     * ordering strategy (when Common->nmethods == 0)
-	* is to try the given ordering (if present), AMD, and then METIS if AMD
-	* reports high fill-in.  If Common->default_nesdis is TRUE then NESDIS
-	* is used instead in the default strategy. */
-
-    /* statistic (added for v1.2): */
-    int called_nd ;	    /* TRUE if the last call to
-			     * cholmod_analyze called NESDIS or METIS. */
-
-    int blas_ok ;           /* FALSE if BLAS int overflow; TRUE otherwise */
-
-    /* SuiteSparseQR control parameters: */
-    int SPQR_shrink ;        /* controls stack realloc method */
-    int SPQR_nthreads ;      /* number of TBB threads, 0 = auto */
-
-    /* ---------------------------------------------------------------------- */
-    size_t  other4 [16] ;
-
-    /* ---------------------------------------------------------------------- */
-    void   *other5 [16] ;
-
-} cholmod_common ;
-
-// in ../../src/CHOLMOD/Include/cholmod_core.h  skip forward to - line 1114 :
-/* A sparse matrix stored in compressed-column form. */
-
-typedef struct cholmod_sparse_struct
-{
-    size_t nrow ;	/* the matrix is nrow-by-ncol */
-    size_t ncol ;
-    size_t nzmax ;	/* maximum number of entries in the matrix */
-
-    /* pointers to int or UF_long: */
-    void *p ;		/* p [0..ncol], the column pointers */
-    void *i ;		/* i [0..nzmax-1], the row indices */
-
-    /* for unpacked matrices only: */
-    void *nz ;		/* nz [0..ncol-1], the # of nonzeros in each col.  In
-			 * packed form, the nonzero pattern of column j is in
-	* A->i [A->p [j] ... A->p [j+1]-1].  In unpacked form, column j is in
-	* A->i [A->p [j] ... A->p [j]+A->nz[j]-1] instead.  In both cases, the
-	* numerical values (if present) are in the corresponding locations in
-	* the array x (or z if A->xtype is CHOLMOD_ZOMPLEX). */
-
-    /* pointers to double or float: */
-    void *x ;		/* size nzmax or 2*nzmax, if present */
-    void *z ;		/* size nzmax, if present */
-
-    int stype ;		/* Describes what parts of the matrix are considered:
-			 *
-	* 0:  matrix is "unsymmetric": use both upper and lower triangular parts
-	*     (the matrix may actually be symmetric in pattern and value, but
-	*     both parts are explicitly stored and used).  May be square or
-	*     rectangular.
-	* >0: matrix is square and symmetric, use upper triangular part.
-	*     Entries in the lower triangular part are ignored.
-	* <0: matrix is square and symmetric, use lower triangular part.
-	*     Entries in the upper triangular part are ignored.
-	*
-	* Note that stype>0 and stype<0 are different for cholmod_sparse and
-	* cholmod_triplet.  See the cholmod_triplet data structure for more
-	* details.
-	*/
-
-    int itype ;		/* CHOLMOD_INT:     p, i, and nz are int.
-			 * CHOLMOD_INTLONG: p is UF_long, i and nz are int.
-			 * CHOLMOD_LONG:    p, i, and nz are UF_long.  */
-
-    int xtype ;		/* pattern, real, complex, or zomplex */
-    int dtype ;		/* x and z are double or float */
-    int sorted ;	/* TRUE if columns are sorted, FALSE otherwise */
-    int packed ;	/* TRUE if packed (nz ignored), FALSE if unpacked
-			 * (nz is required) */
-
-} cholmod_sparse ;
-
-// in ../../src/CHOLMOD/Include/cholmod_core.h  skip forward to - line 1554 :
-/* A symbolic and numeric factorization, either simplicial or supernodal.
- * In all cases, the row indices in the columns of L are kept sorted. */
-
-typedef struct cholmod_factor_struct
-{
-    /* ---------------------------------------------------------------------- */
-    /* for both simplicial and supernodal factorizations */
-    /* ---------------------------------------------------------------------- */
-
-    size_t n ;		/* L is n-by-n */
-
-    size_t minor ;	/* If the factorization failed, L->minor is the column
-			 * at which it failed (in the range 0 to n-1).  A value
-			 * of n means the factorization was successful or
-			 * the matrix has not yet been factorized. */
-
-    /* ---------------------------------------------------------------------- */
-    /* symbolic ordering and analysis */
-    /* ---------------------------------------------------------------------- */
-
-    void *Perm ;	/* size n, permutation used */
-    void *ColCount ;	/* size n, column counts for simplicial L */
-
-    void *IPerm ;       /* size n, inverse permutation.  Only created by
-                         * cholmod_solve2 if Bset is used. */
-
-    /* ---------------------------------------------------------------------- */
-    /* simplicial factorization */
-    /* ---------------------------------------------------------------------- */
-
-    size_t nzmax ;	/* size of i and x */
-
-    void *p ;		/* p [0..ncol], the column pointers */
-    void *i ;		/* i [0..nzmax-1], the row indices */
-    void *x ;		/* x [0..nzmax-1], the numerical values */
-    void *z ;
-    void *nz ;		/* nz [0..ncol-1], the # of nonzeros in each column.
-			 * i [p [j] ... p [j]+nz[j]-1] contains the row indices,
-			 * and the numerical values are in the same locatins
-			 * in x. The value of i [p [k]] is always k. */
-
-    void *next ;	/* size ncol+2. next [j] is the next column in i/x */
-    void *prev ;	/* size ncol+2. prev [j] is the prior column in i/x.
-			 * head of the list is ncol+1, and the tail is ncol. */
-
-    /* ---------------------------------------------------------------------- */
-    /* supernodal factorization */
-    /* ---------------------------------------------------------------------- */
-
-    /* Note that L->x is shared with the simplicial data structure.  L->x has
-     * size L->nzmax for a simplicial factor, and size L->xsize for a supernodal
-     * factor. */
-
-    size_t nsuper ;	/* number of supernodes */
-    size_t ssize ;	/* size of s, integer part of supernodes */
-    size_t xsize ;	/* size of x, real part of supernodes */
-    size_t maxcsize ;	/* size of largest update matrix */
-    size_t maxesize ;	/* max # of rows in supernodes, excl. triangular part */
-
-    void *super ;	/* size nsuper+1, first col in each supernode */
-    void *pi ;		/* size nsuper+1, pointers to integer patterns */
-    void *px ;		/* size nsuper+1, pointers to real parts */
-    void *s ;		/* size ssize, integer part of supernodes */
-
-    /* ---------------------------------------------------------------------- */
-    /* factorization type */
-    /* ---------------------------------------------------------------------- */
-
-    int ordering ;	/* ordering method used */
-
-    int is_ll ;		/* TRUE if LL', FALSE if LDL' */
-    int is_super ;	/* TRUE if supernodal, FALSE if simplicial */
-    int is_monotonic ;	/* TRUE if columns of L appear in order 0..n-1.
-			 * Only applicable to simplicial numeric types. */
-
-    /* There are 8 types of factor objects that cholmod_factor can represent
-     * (only 6 are used):
-     *
-     * Numeric types (xtype is not CHOLMOD_PATTERN)
-     * --------------------------------------------
-     *
-     * simplicial LDL':  (is_ll FALSE, is_super FALSE).  Stored in compressed
-     *	    column form, using the simplicial components above (nzmax, p, i,
-     *	    x, z, nz, next, and prev).  The unit diagonal of L is not stored,
-     *	    and D is stored in its place.  There are no supernodes.
-     *
-     * simplicial LL': (is_ll TRUE, is_super FALSE).  Uses the same storage
-     *	    scheme as the simplicial LDL', except that D does not appear.
-     *	    The first entry of each column of L is the diagonal entry of
-     *	    that column of L.
-     *
-     * supernodal LDL': (is_ll FALSE, is_super TRUE).  Not used.
-     *	    FUTURE WORK:  add support for supernodal LDL'
-     *
-     * supernodal LL': (is_ll TRUE, is_super TRUE).  A supernodal factor,
-     *	    using the supernodal components described above (nsuper, ssize,
-     *	    xsize, maxcsize, maxesize, super, pi, px, s, x, and z).
-     *
-     *
-     * Symbolic types (xtype is CHOLMOD_PATTERN)
-     * -----------------------------------------
-     *
-     * simplicial LDL': (is_ll FALSE, is_super FALSE).  Nothing is present
-     *	    except Perm and ColCount.
-     *
-     * simplicial LL': (is_ll TRUE, is_super FALSE).  Identical to the
-     *	    simplicial LDL', except for the is_ll flag.
-     *
-     * supernodal LDL': (is_ll FALSE, is_super TRUE).  Not used.
-     *	    FUTURE WORK:  add support for supernodal LDL'
-     *
-     * supernodal LL': (is_ll TRUE, is_super TRUE).  A supernodal symbolic
-     *	    factorization.  The simplicial symbolic information is present
-     *	    (Perm and ColCount), as is all of the supernodal factorization
-     *	    except for the numerical values (x and z).
-     */
-
-    int itype ; /* The integer arrays are Perm, ColCount, p, i, nz,
-                 * next, prev, super, pi, px, and s.  If itype is
-		 * CHOLMOD_INT, all of these are int arrays.
-		 * CHOLMOD_INTLONG: p, pi, px are SuiteSparse_long, others int.
-		 * CHOLMOD_LONG:    all integer arrays are SuiteSparse_long. */
-    int xtype ; /* pattern, real, complex, or zomplex */
-    int dtype ; /* x and z double or float */
-
-} cholmod_factor ;
-
-// in ../../src/CHOLMOD/Include/cholmod_core.h  skip forward to - line 1836 :
-/* A dense matrix in column-oriented form.  It has no itype since it contains
- * no integers.  Entry in row i and column j is located in x [i+j*d].
- */
-
-typedef struct cholmod_dense_struct
-{
-    size_t nrow ;	/* the matrix is nrow-by-ncol */
-    size_t ncol ;
-    size_t nzmax ;	/* maximum number of entries in the matrix */
-    size_t d ;		/* leading dimension (d >= nrow must hold) */
-    void *x ;		/* size nzmax or 2*nzmax, if present */
-    void *z ;		/* size nzmax, if present */
-    int xtype ;		/* pattern, real, complex, or zomplex */
-    int dtype ;		/* x and z double or float */
-
-} cholmod_dense ;
-
-// in ../../src/CHOLMOD/Include/cholmod_core.h  skip forward to - line 2033 :
-/* A sparse matrix stored in triplet form. */
-
-typedef struct cholmod_triplet_struct
-{
-    size_t nrow ;	/* the matrix is nrow-by-ncol */
-    size_t ncol ;
-    size_t nzmax ;	/* maximum number of entries in the matrix */
-    size_t nnz ;	/* number of nonzeros in the matrix */
-
-    void *i ;		/* i [0..nzmax-1], the row indices */
-    void *j ;		/* j [0..nzmax-1], the column indices */
-    void *x ;		/* size nzmax or 2*nzmax, if present */
-    void *z ;		/* size nzmax, if present */
-
-    int stype ;		/* Describes what parts of the matrix are considered:
-			 *
-	* 0:  matrix is "unsymmetric": use both upper and lower triangular parts
-	*     (the matrix may actually be symmetric in pattern and value, but
-	*     both parts are explicitly stored and used).  May be square or
-	*     rectangular.
-	* >0: matrix is square and symmetric.  Entries in the lower triangular
-	*     part are transposed and added to the upper triangular part when
-	*     the matrix is converted to cholmod_sparse form.
-	* <0: matrix is square and symmetric.  Entries in the upper triangular
-	*     part are transposed and added to the lower triangular part when
-	*     the matrix is converted to cholmod_sparse form.
-	*
-	* Note that stype>0 and stype<0 are different for cholmod_sparse and
-	* cholmod_triplet.  The reason is simple.  You can permute a symmetric
-	* triplet matrix by simply replacing a row and column index with their
-	* new row and column indices, via an inverse permutation.  Suppose
-	* P = L->Perm is your permutation, and Pinv is an array of size n.
-	* Suppose a symmetric matrix A is represent by a triplet matrix T, with
-	* entries only in the upper triangular part.  Then the following code:
-	*
-	*	Ti = T->i ;
-	*	Tj = T->j ;
-	*	for (k = 0 ; k < n  ; k++) Pinv [P [k]] = k ;
-	*	for (k = 0 ; k < nz ; k++) Ti [k] = Pinv [Ti [k]] ;
-	*	for (k = 0 ; k < nz ; k++) Tj [k] = Pinv [Tj [k]] ;
-	*
-	* creates the triplet form of C=P*A*P'.  However, if T initially
-	* contains just the upper triangular entries (T->stype = 1), after
-	* permutation it has entries in both the upper and lower triangular
-	* parts.  These entries should be transposed when constructing the
-	* cholmod_sparse form of A, which is what cholmod_triplet_to_sparse
-	* does.  Thus:
-	*
-	*	C = cholmod_triplet_to_sparse (T, 0, &Common) ;
-	*
-	* will return the matrix C = P*A*P'.
-	*
-	* Since the triplet matrix T is so simple to generate, it's quite easy
-	* to remove entries that you do not want, prior to converting T to the
-	* cholmod_sparse form.  So if you include these entries in T, CHOLMOD
-	* assumes that there must be a reason (such as the one above).  Thus,
-	* no entry in a triplet matrix is ever ignored.
-	*/
-
-    int itype ; /* CHOLMOD_LONG: i and j are SuiteSparse_long.  Otherwise int */
-    int xtype ; /* pattern, real, complex, or zomplex */
-    int dtype ; /* x and z are double or float */
-
-} cholmod_triplet ;
-
-// -------- our (Matrix)  short and const_ forms of of the pointers :
-typedef       cholmod_common*        CHM_CM;
-typedef       cholmod_dense*         CHM_DN;
-typedef const cholmod_dense*   const_CHM_DN;
-typedef       cholmod_factor*        CHM_FR;
-typedef const cholmod_factor*  const_CHM_FR;
-typedef       cholmod_sparse*        CHM_SP;
-typedef const cholmod_sparse*  const_CHM_SP;
-typedef       cholmod_triplet*       CHM_TR;
-typedef const cholmod_triplet* const_CHM_TR;
-
-
-// --------- Matrix ("M_") R ("R_") pkg  routines "re-exported": ---------------
-
-int  cholmod_start(CHM_CM);
-void R_cholmod_error(int status, const char *file, int line, const char *message);
-int  cholmod_finish(CHM_CM);
-
-CHM_SP cholmod_allocate_sparse(size_t nrow, size_t ncol,
-				 size_t nzmax, int sorted,
-				 int packed, int stype, int xtype,
-				 CHM_CM);
-int      cholmod_free_factor(CHM_FR *L, CHM_CM);
-int      cholmod_free_dense(CHM_DN *A, CHM_CM);
-int      cholmod_free_sparse(CHM_SP *A, CHM_CM);
-int      cholmod_free_triplet(CHM_TR *T, CHM_CM);
-
-long int cholmod_nnz(const_CHM_SP, CHM_CM);
-CHM_SP   cholmod_speye(size_t nrow, size_t ncol, int xtype, CHM_CM);
-CHM_SP   cholmod_transpose(const_CHM_SP, int values, CHM_CM);
-int      cholmod_sort(CHM_SP A, CHM_CM);
-CHM_SP   cholmod_vertcat(const_CHM_SP, const_CHM_SP, int values, CHM_CM);
-CHM_SP   cholmod_copy(const_CHM_SP, int stype, int mode, CHM_CM);
-CHM_SP   cholmod_add(const_CHM_SP, const_CHM_SP, double alpha [2], double beta [2],
-		     int values, int sorted, CHM_CM);
-
-// from ../../src/CHOLMOD/Include/cholmod_cholesky.h - line 178 :
-#define CHOLMOD_A    0		/* solve Ax=b */
-#define CHOLMOD_LDLt 1		/* solve LDL'x=b */
-#define CHOLMOD_LD   2		/* solve LDx=b */
-#define CHOLMOD_DLt  3		/* solve DL'x=b */
-#define CHOLMOD_L    4		/* solve Lx=b */
-#define CHOLMOD_Lt   5		/* solve L'x=b */
-#define CHOLMOD_D    6		/* solve Dx=b */
-#define CHOLMOD_P    7		/* permute x=Px */
-#define CHOLMOD_Pt   8		/* permute x=P'x */
-
-CHM_DN   cholmod_solve(int, const_CHM_FR, const_CHM_DN, CHM_CM);
-CHM_SP   cholmod_spsolve(int, const_CHM_FR, const_CHM_SP, CHM_CM);
-int      cholmod_sdmult(const_CHM_SP, int, const double*, const double*,
-		     const_CHM_DN, CHM_DN Y, CHM_CM);
-CHM_SP   cholmod_ssmult(const_CHM_SP, const_CHM_SP, int, int, int,
-			CHM_CM);
-int      cholmod_factorize(const_CHM_SP, CHM_FR L, CHM_CM);
-int      cholmod_factorize_p(const_CHM_SP, double *beta, int *fset,
-			  size_t fsize, CHM_FR L, CHM_CM);
-CHM_SP   cholmod_copy_sparse(const_CHM_SP, CHM_CM);
-CHM_DN   cholmod_copy_dense(const_CHM_DN, CHM_CM);
-CHM_SP   cholmod_aat(const_CHM_SP, int *fset, size_t fsize, int mode,
-		     CHM_CM);
-int      cholmod_band_inplace(CHM_SP A, int k1, int k2, int mode, CHM_CM);
-CHM_SP   cholmod_add(const_CHM_SP, const_CHM_SP, double alpha[2], double beta[2],
-		     int values, int sorted, CHM_CM);
-CHM_DN   cholmod_allocate_dense(size_t nrow, size_t ncol, size_t d,
-				int xtype, CHM_CM);
-CHM_FR   cholmod_analyze(const_CHM_SP, CHM_CM);
-CHM_FR   cholmod_analyze_p(const_CHM_SP, int *Perm, int *fset,
-				    size_t fsize, CHM_CM);
-int      cholmod_change_factor(int to_xtype, int to_ll, int to_super,
-			    int to_packed, int to_monotonic,
-			    CHM_FR L, CHM_CM);
-CHM_FR   cholmod_copy_factor(const_CHM_FR, CHM_CM);
-CHM_SP   cholmod_factor_to_sparse(const_CHM_FR, CHM_CM);
-CHM_SP   cholmod_dense_to_sparse(const_CHM_DN, int values, CHM_CM);
-int      cholmod_defaults (CHM_CM);
-CHM_SP   cholmod_triplet_to_sparse(const cholmod_triplet*, int nzmax, CHM_CM);
-CHM_SP   cholmod_submatrix(const_CHM_SP, int *rset, int rsize, int *cset,
-			   int csize, int values, int sorted,
-			   CHM_CM);
-CHM_TR   cholmod_sparse_to_triplet(const_CHM_SP, CHM_CM);
-CHM_DN   cholmod_sparse_to_dense(const_CHM_SP, CHM_CM);
-CHM_TR   cholmod_allocate_triplet (size_t nrow, size_t ncol, size_t nzmax,
-				   int stype, int xtype, CHM_CM);
-
-// from ../../src/CHOLMOD/Include/cholmod_matrixops.h - line 107 :
-/* scaling modes, selected by the scale input parameter: */
-#define CHOLMOD_SCALAR 0	/* A = s*A */
-#define CHOLMOD_ROW 1		/* A = diag(s)*A */
-#define CHOLMOD_COL 2		/* A = A*diag(s) */
-#define CHOLMOD_SYM 3		/* A = diag(s)*A*diag(s) */
-
-int      cholmod_scale(const_CHM_DN, int scale, CHM_SP, CHM_CM);
-
-// added in the Matrix package - the log of the determinant of the matrix that was factored
-double   chm_factor_ldetL2(const_CHM_FR);
-
-#ifdef	__cplusplus
-}
-#endif
-
-#endif  /* RcppEigen_CHOLMOD_H */
diff --git a/inst/include/RcppEigenForward.h b/inst/include/RcppEigenForward.h
index 2dcf312c..a41212ce 100644
--- a/inst/include/RcppEigenForward.h
+++ b/inst/include/RcppEigenForward.h
@@ -25,11 +25,8 @@
 #include <iterator>
 #include <RcppCommon.h>
 #include <Rconfig.h>
-#include <RcppEigenCholmod.h>
-#include <RcppEigenStubs.h>
 #include <Eigen/Dense>
 #include <Eigen/Sparse>
-#include <Eigen/CholmodSupport>
 //#include <unsupported/Eigen/AutoDiff>  // causes problems redefining sign
 #include <unsupported/Eigen/IterativeSolvers>
 #include <unsupported/Eigen/KroneckerProduct>
@@ -42,10 +39,6 @@
 
 /* forward declarations */
 namespace Rcpp {
-    /* support for wrap */
-   
-    template<typename T>
-    SEXP wrap(const Eigen::CholmodDecomposition<Eigen::SparseMatrix<T> >& obj);
 
     namespace traits {
 
@@ -60,12 +53,14 @@ namespace Rcpp {
 	template<typename T> class Exporter< Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic> >;
 	template<typename T> class Exporter< Eigen::Array<T, Eigen::Dynamic, 1> >;
 	template<typename T> class Exporter< Eigen::Array<T, 1, Eigen::Dynamic> >;
-	template<typename T> class Exporter< Eigen::MappedSparseMatrix<T> >;
+    template<typename T> class Exporter< Eigen::Map<Eigen::SparseMatrix<T> > >;
+    //template<typename T> class Exporter< Eigen::MappedSparseMatrix<T> >;  // Deprecated
 	template<typename T> class Exporter< Eigen::SparseMatrix<T> >;
-	template<typename T> class Exporter< Eigen::MappedSparseMatrix<T, Eigen::RowMajor> >;
+    template<typename T> class Exporter< Eigen::Map<Eigen::SparseMatrix<T, Eigen::RowMajor> > >;
+    //template<typename T> class Exporter< Eigen::MappedSparseMatrix<T, Eigen::RowMajor> >;  // Deprecated
 	template<typename T> class Exporter< Eigen::SparseMatrix<T, Eigen::RowMajor> >;
 
-    } // namespace traits 
+    } // namespace traits
 
 }
 
diff --git a/inst/include/RcppEigenStubs.cpp b/inst/include/RcppEigenStubs.cpp
new file mode 100644
index 00000000..ee8b20b0
--- /dev/null
+++ b/inst/include/RcppEigenStubs.cpp
@@ -0,0 +1,43 @@
+// RcppEigenStubs.cpp: Definitions for CHOLMOD stubs declared
+// in RcppEigenCholmod.h, which packages including the header
+// must compile
+//
+// Copyright (C)      2011 Douglas Bates and Martin Maechler
+//
+// This file is part of RcppEigen.
+//
+// RcppEigen is free software: you can redistribute it and/or modify it
+// under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 2 of the License, or
+// (at your option) any later version.
+//
+// RcppEigen is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with RcppEigen.  If not, see <http://www.gnu.org/licenses/>.
+
+/* MJ: Packages could do the following themselves, but that would */
+/*     destroy the illusion that they are only linking RcppEigen. */
+
+/* MJ: Matrix <= 1.6-1.1 used 'error' and 'warning' unsafely ... sigh ... */
+#include <R_ext/Error.h>
+#ifndef error
+# define error Rf_error
+# define __ERROR__WAS__UNDEFINED__
+#endif
+#ifndef warning
+# define warning Rf_warning
+# define __WARNING__WAS__UNDEFINED__
+#endif
+#include <Matrix_stubs.c>
+#ifdef __ERROR__WAS__UNDEFINED__
+# undef error
+# undef __ERROR__WAS__UNDEFINED__
+#endif
+#ifdef __WARNING__WAS__UNDEFINED__
+# undef warning
+# undef __WARNING__WAS__UNDEFINED__
+#endif
diff --git a/inst/include/RcppEigenStubs.h b/inst/include/RcppEigenStubs.h
deleted file mode 100644
index 04d54184..00000000
--- a/inst/include/RcppEigenStubs.h
+++ /dev/null
@@ -1,585 +0,0 @@
-// RcppEigenStubs.cpp: Provide access to compiled CHOLMOD functions in
-// the Matrix package.
-//
-// Copyright (C)      2011 Douglas Bates and Martin Maechler
-//
-// This file is part of RcppEigen.
-//
-// RcppEigen is free software: you can redistribute it and/or modify it
-// under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 2 of the License, or
-// (at your option) any later version.
-//
-// RcppEigen is distributed in the hope that it will be useful, but
-// WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with RcppEigen.  If not, see <http://www.gnu.org/licenses/>.
-
-// Yes, this really is a C++ source file in an include directory.  To
-// use the Cholmod support functions in RcppEigen you should create a
-// source file, say MyPackage/src/local_stubs.c that contains the
-// single line #include "RcppEigenStubs.cpp"
-
-#include <Rconfig.h>
-#include <Rinternals.h>
-#include <R_ext/Rdynload.h>
-
-#ifdef	__cplusplus
-extern "C" {
-#endif
-
-inline CHM_DN as_cholmod_dense(CHM_DN ans, SEXP x)
-{
-    static CHM_DN(*fun)(CHM_DN,SEXP) = NULL;
-    if(fun == NULL)
-	fun = (CHM_DN(*)(CHM_DN,SEXP))
-	    R_GetCCallable("Matrix", "as_cholmod_dense");
-    return fun(ans, x);
-}
-
-inline CHM_FR as_cholmod_factor(CHM_FR ans, SEXP x)
-{
-    static CHM_FR(*fun)(CHM_FR,SEXP) = NULL;
-    if(fun == NULL)
-	fun = (CHM_FR(*)(CHM_FR,SEXP))
-	    R_GetCCallable("Matrix", "as_cholmod_factor");
-    return fun(ans, x);
-}
-
-inline CHM_SP as_cholmod_sparse(CHM_SP ans, SEXP x, Rboolean check_Udiag, Rboolean sort_in_place)
-{
-    static CHM_SP(*fun)(CHM_SP,SEXP,Rboolean,Rboolean)= NULL;
-    if(fun == NULL)
-	fun = (CHM_SP(*)(CHM_SP,SEXP,Rboolean,Rboolean))
-	    R_GetCCallable("Matrix", "as_cholmod_sparse");
-    return fun(ans, x, check_Udiag, sort_in_place);
-}
-
-inline CHM_TR as_cholmod_triplet(CHM_TR ans, SEXP x, Rboolean check_Udiag)
-{
-    static CHM_TR(*fun)(CHM_TR,SEXP,Rboolean)= NULL;
-    if(fun == NULL)
-	fun = (CHM_TR(*)(CHM_TR,SEXP,Rboolean))
-	    R_GetCCallable("Matrix", "as_cholmod_triplet");
-    return fun(ans, x, check_Udiag);
-}
-
-inline SEXP Csparse_diagU2N(SEXP x)
-{
-    static SEXP(*fun)(SEXP) = NULL;
-    if(fun == NULL)
-	fun = (SEXP(*)(SEXP))
-	    R_GetCCallable("Matrix", "Csparse_diagU2N");
-    return fun(x);
-}
-
-inline SEXP
-M_chm_factor_to_SEXP(const_CHM_FR f, int dofree)
-{
-    static SEXP(*fun)(const_CHM_FR,int) = NULL;
-    if(fun == NULL)
-	fun = (SEXP(*)(const_CHM_FR,int))
-	    R_GetCCallable("Matrix", "chm_factor_to_SEXP");
-    return fun(f, dofree);
-}
-
-inline double
-M_chm_factor_ldetL2(const_CHM_FR f)
-{
-    static double(*fun)(const_CHM_FR) = NULL;
-    if(fun == NULL)
-	fun = (double(*)(const_CHM_FR))
-	    R_GetCCallable("Matrix", "chm_factor_ldetL2");
-    return fun(f);
-}
-
-inline CHM_FR
-M_chm_factor_update(CHM_FR f, const_CHM_SP A, double mult)
-{
-    static CHM_FR(*fun)(CHM_FR,const_CHM_SP,double) = NULL;
-    if(fun == NULL)
-	fun = (CHM_FR(*)(CHM_FR,const_CHM_SP,double))
-	    R_GetCCallable("Matrix", "chm_factor_update");
-    return fun(f, A, mult);
-}
-
-inline SEXP
-M_chm_sparse_to_SEXP(const_CHM_SP a, int dofree,
-		     int uploT, int Rkind, const char *diag, SEXP dn)
-{
-    static SEXP(*fun)(const_CHM_SP,int,int,int,const char*,SEXP) = NULL;
-    if(fun == NULL)
-	fun = (SEXP(*)(const_CHM_SP,int,int,int,const char*,SEXP))
-	    R_GetCCallable("Matrix", "chm_sparse_to_SEXP");
-    return fun(a, dofree, uploT, Rkind, diag, dn);
-}
-
-inline SEXP
-M_chm_triplet_to_SEXP(const CHM_TR a, int dofree,
-		      int uploT, int Rkind, const char *diag, SEXP dn)
-{
-    static SEXP(*fun)(const CHM_TR,int,int,int,const char*,SEXP) = NULL;
-    if(fun == NULL)
-	fun = (SEXP(*)(const CHM_TR,int,int,int,const char*,SEXP))
-	    R_GetCCallable("Matrix", "chm_triplet_to_SEXP");
-    return fun(a, dofree, uploT, Rkind, diag, dn);
-}
-
-inline CHM_SP
-cholmod_aat(const_CHM_SP A, int *fset, size_t fsize,
-	      int mode, CHM_CM Common)
-{
-    static CHM_SP(*fun)(const_CHM_SP,int*,size_t,
-			int,CHM_CM) = NULL;
-    if(fun == NULL)
-	fun = (CHM_SP(*)(const_CHM_SP,int*,size_t,
-			 int,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_aat");
-    return fun(A, fset, fsize, mode, Common);
-}
-
-inline int
-M_cholmod_band_inplace(CHM_SP A, int k1, int k2, int mode,
-		       CHM_CM Common)
-{
-    static int(*fun)(CHM_SP,int,int,int,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (int(*)(CHM_SP,int,int,int,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_band_inplace");
-    return fun(A, k1, k2, mode, Common);
-}
-
-inline CHM_SP
-cholmod_add(const_CHM_SP A, const_CHM_SP B,
-	      double alpha[2], double beta[2], int values,
-	      int sorted, CHM_CM Common)
-{
-    static CHM_SP(*fun)(const_CHM_SP,const_CHM_SP,
-			double*,double*,int,int,
-			CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_SP(*)(const_CHM_SP,const_CHM_SP,
-			 double*,double*,int,int,
-			 CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_add");
-    return fun(A, B, alpha, beta, values, sorted, Common);
-}
-
-inline CHM_DN
-cholmod_allocate_dense(size_t nrow, size_t ncol, size_t d,
-			 int xtype, CHM_CM Common)
-{
-    static CHM_DN(*fun)(size_t,size_t,size_t,
-			int,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_DN(*)(size_t,size_t,size_t,
-			 int,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_allocate_dense");
-    return fun(nrow, ncol, d, xtype, Common);
-}
-
-inline CHM_SP
-cholmod_allocate_sparse(size_t nrow, size_t ncol, size_t nzmax,
-			  int sorted, int packed, int stype,
-			  int xtype, CHM_CM Common)
-{
-    static CHM_SP(*fun)(size_t,size_t,size_t,int,int,
-			int,int,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_SP(*)
-	       (size_t,size_t,size_t,int,int,int,int,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_allocate_sparse");
-    return fun(nrow,ncol,nzmax,sorted,packed,stype,xtype,Common);
-}
-
-inline CHM_TR
-cholmod_allocate_triplet(size_t nrow, size_t ncol, size_t nzmax,
-			   int stype, int xtype, CHM_CM Common)
-{
-    static CHM_TR(*fun)(size_t,size_t,size_t, int,int,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_TR(*)(size_t,size_t,size_t,int,int,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_allocate_triplet");
-    return fun(nrow,ncol,nzmax,stype,xtype,Common);
-}
-
-inline CHM_SP
-cholmod_triplet_to_sparse(const cholmod_triplet* T, int nzmax,
-			    CHM_CM Common)
-{
-    static CHM_SP(*fun)(const cholmod_triplet*,int,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_SP(*)(const cholmod_triplet*,int,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_triplet_to_sparse");
-    return fun(T, nzmax, Common);
-}
-
-inline CHM_TR
-cholmod_sparse_to_triplet(const_CHM_SP A, CHM_CM Common)
-{
-    static CHM_TR(*fun)(const_CHM_SP,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_TR(*)(const_CHM_SP,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_sparse_to_triplet");
-    return fun(A, Common);
-}
-
-inline CHM_DN
-cholmod_sparse_to_dense(const_CHM_SP A, CHM_CM Common)
-{
-    static CHM_DN(*fun)(const_CHM_SP,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_DN(*)(const_CHM_SP,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_sparse_to_dense");
-    return fun(A, Common);
-}
-
-inline CHM_FR
-cholmod_analyze(const_CHM_SP A, CHM_CM Common)
-{
-    static CHM_FR(*fun)(const_CHM_SP,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_FR(*)(const_CHM_SP,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_analyze");
-    return fun(A, Common);
-}
-
-inline CHM_FR
-cholmod_analyze_p(const_CHM_SP A, int *Perm, int *fset,
-		  size_t fsize, CHM_CM Common) {
-    static CHM_FR(*fun)(const_CHM_SP,int*,int*,size_t,
-			CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_FR(*)(const_CHM_SP,int*,int*,
-			 size_t,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_analyze_p");
-    return fun(A, Perm, fset, fsize, Common);
-}
-
-inline CHM_SP
-cholmod_copy(const_CHM_SP A, int stype,
-	       int mode, CHM_CM Common)
-{
-    static CHM_SP(*fun)(const_CHM_SP,int,int,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_SP(*)(const_CHM_SP,int,int,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_copy");
-    return fun(A, stype, mode, Common);
-}
-
-inline CHM_DN
-cholmod_copy_dense(const_CHM_DN  A, CHM_CM Common)
-{
-    static CHM_DN(*fun)(const_CHM_DN,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_DN(*)(const_CHM_DN,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_copy_dense");
-    return fun(A, Common);
-}
-
-inline CHM_FR
-cholmod_copy_factor(const_CHM_FR L, CHM_CM Common)
-{
-    static CHM_FR(*fun)(const_CHM_FR,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_FR(*)(const_CHM_FR,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_copy_factor");
-    return fun(L, Common);
-}
-
-inline int
-cholmod_change_factor(int to_xtype, int to_ll, int to_super, int to_packed,
-			int to_monotonic, CHM_FR L, CHM_CM Common)
-{
-    static int(*fun)(int,int,int,int,int,CHM_FR,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (int(*)(int,int,int,int,int,CHM_FR,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_change_factor");
-    return fun(to_xtype, to_ll, to_super, to_packed, to_monotonic, L, Common);
-}
-
-inline CHM_SP
-cholmod_copy_sparse(const_CHM_SP A, CHM_CM Common)
-{
-    static CHM_SP(*fun)(const_CHM_SP,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_SP(*)(const_CHM_SP,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_copy_sparse");
-    return fun(A, Common);
-}
-
-inline CHM_SP
-cholmod_factor_to_sparse(const_CHM_FR L, CHM_CM Common)
-{
-    static CHM_SP(*fun)(const_CHM_FR,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_SP(*)(const_CHM_FR,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_factor_to_sparse");
-    return fun(L, Common);
-}
-
-inline CHM_SP
-cholmod_submatrix(const_CHM_SP A, int *rset, int rsize, int *cset,
-		    int csize, int values, int sorted, CHM_CM Common)
-{
-    static CHM_SP(*fun)(const_CHM_SP,int*,int,int*,int,
-			int,int,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_SP(*)(const_CHM_SP,int*,int,int*,
-			 int,int,int,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_submatrix");
-    return fun(A, rset, rsize, cset, csize, values, sorted, Common);
-}
-
-inline CHM_SP
-cholmod_dense_to_sparse(const_CHM_DN  X, int values, CHM_CM Common)
-{
-    static CHM_SP(*fun)(const_CHM_DN,int,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_SP(*)(const_CHM_DN,int,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_dense_to_sparse");
-    return fun(X, values, Common);
-}
-
-inline int
-cholmod_factorize(const_CHM_SP A, CHM_FR L, CHM_CM Common)
-{
-    static int(*fun)(const_CHM_SP,CHM_FR,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (int(*)(const_CHM_SP,CHM_FR,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_factorize");
-    return fun(A, L, Common);
-}
-
-inline int
-cholmod_factorize_p(const_CHM_SP A, double *beta, int *fset,
-		      size_t fsize, CHM_FR L,
-		      CHM_CM Common)
-{
-    static int(*fun)(const_CHM_SP,double*,int*,size_t,
-		     CHM_FR,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (int(*)(const_CHM_SP,double*,int*,size_t,
-		      CHM_FR,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_factorize_p");
-    return fun(A, beta, fset, fsize, L, Common);
-}
-
-inline int
-cholmod_finish(CHM_CM Common)
-{
-
-    static int(*fun)(CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (int(*)(CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_finish");
-    return fun(Common);
-}
-
-inline int
-cholmod_sort(CHM_SP A, CHM_CM Common)
-{
-    static int(*fun)(CHM_SP,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (int(*)(CHM_SP,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_sort");
-    return fun(A, Common);
-}
-
-inline int
-cholmod_free_dense(CHM_DN  *A, CHM_CM Common)
-{
-    static int(*fun)(CHM_DN*,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (int(*)(CHM_DN*,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_free_dense");
-    return fun(A, Common);
-}
-
-inline int
-cholmod_free_factor(CHM_FR *L, CHM_CM Common)
-{
-    static int(*fun)(CHM_FR*,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (int(*)(CHM_FR*,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_free_factor");
-    return fun(L, Common);
-}
-
-inline int
-cholmod_free_sparse(CHM_SP *A, CHM_CM Common)
-{
-    static int(*fun)(CHM_SP*,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (int(*)(CHM_SP*,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_free_sparse");
-    return fun(A, Common);
-}
-
-inline int
-cholmod_free_triplet(cholmod_triplet **T, CHM_CM Common)
-{
-    static int(*fun)(cholmod_triplet**,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (int(*)(cholmod_triplet**,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_free_triplet");
-    return fun(T, Common);
-}
-
-inline long int
-cholmod_nnz(const_CHM_SP A, CHM_CM Common) {
-    static long(*fun)(const_CHM_SP,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (long(*)(const_CHM_SP,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_nnz");
-    return fun(A, Common);
-}
-
-inline int
-cholmod_sdmult(const_CHM_SP A, int transpose,
-		 const double *alpha, const double *beta,
-		 const_CHM_DN X, CHM_DN  Y,
-		 CHM_CM Common)
-{
-    static int(*fun)(const_CHM_SP,int,const double*,
-		     const double*,const_CHM_DN,
-		     CHM_DN,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (int(*)(const_CHM_SP,int,const double*,
-		      const double*, const_CHM_DN,
-		      CHM_DN,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_sdmult");
-    return fun(A, transpose, alpha, beta, X, Y, Common);
-}
-
-inline CHM_SP
-cholmod_ssmult(const_CHM_SP A, const_CHM_SP B, int stype,
-	       int values, int sorted, CHM_CM Common) {
-    static CHM_SP(*fun)(const_CHM_SP,const_CHM_SP,
-			int,int,int,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_SP(*)(const_CHM_SP,const_CHM_SP,
-			 int,int,int,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_ssmult");
-    return fun(A, B, stype, values, sorted, Common);
-}
-
-inline CHM_DN
-cholmod_solve(int sys, const_CHM_FR L, const_CHM_DN B, CHM_CM Common) {
-    static CHM_DN(*fun)(int,const_CHM_FR,const_CHM_DN,
-			CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_DN(*)(int,const_CHM_FR,const_CHM_DN,
-			 CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_solve");
-    return fun(sys, L, B, Common);
-}
-
-inline CHM_SP
-cholmod_speye(size_t nrow, size_t ncol,
-		int xtype, CHM_CM Common)
-{
-    static CHM_SP(*fun)(size_t,size_t,int,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_SP(*)(size_t,size_t,int,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_speye");
-    return fun(nrow, ncol, xtype, Common);
-}
-
-inline CHM_SP
-cholmod_spsolve(int sys, const_CHM_FR L,
-		  const_CHM_SP B, CHM_CM Common)
-{
-    static CHM_SP(*fun)(int,const_CHM_FR,
-			const_CHM_SP, CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_SP(*)(int,const_CHM_FR,
-			 const_CHM_SP, CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_spsolve");
-    return fun(sys, L, B, Common);
-}
-
-inline int
-cholmod_defaults (CHM_CM Common)
-{
-    static int(*fun)(CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (int(*)(CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_defaults");
-    return fun(Common);
-}
-
-/* extern cholmod_common c; */
-
-inline void
-R_cholmod_error(int status, const char *file, int line, const char *message)
-{
-/* NB: keep in sync with R_cholmod_error(), ../../src/chm_common.c */
-
-    if(status < 0) {
-/* Note: Matrix itself uses CHM_set_common_env, CHM_store_common 
- *   and CHM_restore_common to preserve settings through error calls.
- *  Consider defining your own error handler, *and* possibly restoring
- *  *your* version of the cholmod_common that *you* use.
- */
-	Rf_error("Cholmod error '%s' at file:%s, line %d", message, file, line);
-    }
-    else
-	Rf_warning("Cholmod warning '%s' at file:%s, line %d",
-		message, file, line);
-}
-
-inline int
-cholmod_start(CHM_CM Common)
-{
-    int val;
-    static int(*fun)(CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (int(*)(CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_start");
-    val = fun(Common);
-/*-- NB: keep in sync with  R_cholmod_start() --> ../../src/chm_common.c */
-    /* do not allow CHOLMOD printing - currently */
-    Common->print_function = NULL;/* was  R_cholmod_printf; /.* Rprintf gives warning */
-/* Consider using your own error handler: */
-    Common->error_handler = R_cholmod_error;
-    return val;
-}
-
-inline CHM_SP
-cholmod_transpose(const_CHM_SP A, int values, CHM_CM Common)
-{
-    static CHM_SP(*fun)(const_CHM_SP,int,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_SP(*)(const_CHM_SP,int,CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_transpose");
-    return fun(A, values, Common);
-}
-
-inline CHM_SP
-cholmod_vertcat(const_CHM_SP A, const_CHM_SP B, int values, CHM_CM Common)
-{
-    static CHM_SP(*fun)(const_CHM_SP,const_CHM_SP,int,CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (CHM_SP(*)(const_CHM_SP,const_CHM_SP, int, CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_vertcat");
-    return fun(A, B, values, Common);
-}
-
-inline int
-cholmod_scale(const_CHM_DN S, int scale, CHM_SP A,
-		CHM_CM Common)
-{
-    static int(*fun)(const_CHM_DN,int,CHM_SP, CHM_CM) = NULL;
-    if (fun == NULL)
-	fun = (int(*)(const_CHM_DN,int,CHM_SP, CHM_CM))
-	    R_GetCCallable("Matrix", "cholmod_scale");
-    return fun(S, scale, A, Common);
-}
-
-#ifdef	__cplusplus
-}
-#endif
diff --git a/inst/include/RcppEigenWrap.h b/inst/include/RcppEigenWrap.h
index 32422323..4750b619 100644
--- a/inst/include/RcppEigenWrap.h
+++ b/inst/include/RcppEigenWrap.h
@@ -1,8 +1,7 @@
-// -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; tab-width: 4 -*-
-//
+
 // RcppEigenWrap.h: Rcpp wrap methods for Eigen matrices, vectors and arrays
 //
-// Copyright (C) 2011 - 2012   Douglas Bates, Dirk Eddelbuettel and Romain Francois
+// Copyright (C) 2011 - 2025   Douglas Bates, Dirk Eddelbuettel and Romain Francois
 //
 // This file is part of RcppEigen.
 //
@@ -24,51 +23,6 @@
 
 namespace Rcpp{
 
-    namespace RcppEigen{
-
-        template<typename T>
-        SEXP Eigen_cholmod_wrap(const Eigen::CholmodDecomposition<Eigen::SparseMatrix<T> >& obj) {
-			typedef T* Tpt;
-            const cholmod_factor* f = obj.factor();
-            if (f->minor < f->n)
-                throw std::runtime_error("CHOLMOD factorization was unsuccessful");
-
-//FIXME: Should extend this selection according to T
-            S4 ans(std::string(f->is_super ? "dCHMsuper" : "dCHMsimpl"));
-            IntegerVector  dd(2);
-            dd[0] = dd[1] = f->n;
-            ans.slot("Dim") = dd;
-            ans.slot("perm") = ::Rcpp::wrap((int*)f->Perm, (int*)f->Perm + f->n);
-            ans.slot("colcount") = ::Rcpp::wrap((int*)f->ColCount, (int*)f->ColCount + f->n);
-            IntegerVector tt(f->is_super ? 6 : 4);
-            tt[0] = f->ordering; tt[1] = f->is_ll;
-            tt[2] = f->is_super; tt[3] = f->is_monotonic;
-            ans.slot("type") = tt;
-            if (f->is_super) {
-                tt[4] = f->maxcsize; tt[5] = f->maxesize;
-                ans.slot("super") = ::Rcpp::wrap((int*)f->super, ((int*)f->super) + f->nsuper + 1);
-                ans.slot("pi")    = ::Rcpp::wrap((int*)f->pi, ((int*)f->pi) + f->nsuper + 1);
-                ans.slot("px")    = ::Rcpp::wrap((int*)f->px, ((int*)f->px) + f->nsuper + 1);
-                ans.slot("s")     = ::Rcpp::wrap((int*)f->s, ((int*)f->s) + f->ssize);
-                ans.slot("x")     = ::Rcpp::wrap((Tpt)f->x, ((T*)f->x) + f->xsize);
-            } else {
-                ans.slot("i")     = ::Rcpp::wrap((int*)f->i, ((int*)f->i) + f->nzmax);
-                ans.slot("p")     = ::Rcpp::wrap((int*)f->p, ((int*)f->p) + f->n + 1);
-                ans.slot("x")     = ::Rcpp::wrap((Tpt)f->x, ((T*)f->x) + f->nzmax);
-                ans.slot("nz")    = ::Rcpp::wrap((int*)f->nz, ((int*)f->nz) + f->n);
-                ans.slot("nxt")   = ::Rcpp::wrap((int*)f->next, ((int*)f->next) + f->n + 2);
-                ans.slot("prv")   = ::Rcpp::wrap((int*)f->prev, ((int*)f->prev) + f->n + 2);
-            }
-            return ::Rcpp::wrap(ans);
-        }
-
-    } /* namespace RcppEigen */
-
-    template<typename T>
-    SEXP wrap(const Eigen::CholmodDecomposition<Eigen::SparseMatrix<T> >& obj) {
-        return RcppEigen::Eigen_cholmod_wrap(obj);
-    }
-
     namespace RcppEigen{
 
         // helper trait to identify if T is a plain object type
@@ -80,21 +34,28 @@ namespace Rcpp{
 
         // for plain dense objects
         template <typename T>
-        SEXP eigen_wrap_plain_dense( const T& obj, Rcpp::traits::true_type ){
-			typename Eigen::internal::conditional<T::IsRowMajor,
-												  Eigen::Matrix<typename T::Scalar,
-																T::RowsAtCompileTime,
-																T::ColsAtCompileTime>,
-												  const T&>::type objCopy(obj);
-            int m = obj.rows(), n = obj.cols();
-			SEXP ans = PROTECT(::Rcpp::wrap(objCopy.data(), objCopy.data() + m * n));
-            if( T::ColsAtCompileTime != 1 ) {
+        SEXP eigen_wrap_plain_dense( const T& obj, Rcpp::traits::true_type ) {
+            bool needs_dim = T::ColsAtCompileTime != 1;
+            R_xlen_t m = obj.rows(), n = obj.cols();
+            if (needs_dim && (m > INT_MAX || n > INT_MAX)) {
+                Rcpp::stop("array dimensions cannot exceed INT_MAX");   // #nocov
+            }
+            R_xlen_t size = m * n;
+            typename Eigen::internal::conditional<
+                T::IsRowMajor,
+                Eigen::Matrix<typename T::Scalar,
+                              T::RowsAtCompileTime,
+                              T::ColsAtCompileTime>,
+                const T&>::type objCopy(obj);
+            SEXP ans = PROTECT(::Rcpp::wrap(objCopy.data(),
+                                            objCopy.data() + size));
+            if (needs_dim) {    		// #nocov start
                 SEXP dd = PROTECT(::Rf_allocVector(INTSXP, 2));
                 int *d = INTEGER(dd);
                 d[0] = m;
                 d[1] = n;
                 ::Rf_setAttrib(ans, R_DimSymbol, dd);
-                UNPROTECT(1);
+                UNPROTECT(1);			// #nocov end
             }
             UNPROTECT(1);
             return ans;
@@ -159,7 +120,7 @@ namespace Rcpp{
         template <typename T, int RTYPE>
         class Eigen_Matrix_Exporter {
             public:
-            Eigen_Matrix_Exporter(SEXP x) : vec(x), d_ncol(1), d_nrow(Rf_length(x)) {
+            Eigen_Matrix_Exporter(SEXP x) : vec(x), d_ncol(1), d_nrow(Rf_xlength(x)) {
                 if (TYPEOF(x) != RTYPE)
                     throw std::invalid_argument("Wrong R type for mapped vector");
                 if (::Rf_isMatrix(x)) {
@@ -202,7 +163,7 @@ namespace Rcpp{
             SEXP object;
         };
 
-
+        // Provides only Map::VectorX<t> export
         template<typename T>
         class Exporter<Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 1> > > {
             typedef typename Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, 1> > OUT ;
@@ -212,11 +173,26 @@ namespace Rcpp{
         public:
             Exporter(SEXP x) : vec(x) {
                 if (TYPEOF(x) != RTYPE)
-                    throw std::invalid_argument("Wrong R type for mapped vector");
+                    throw std::invalid_argument("Wrong R type for mapped vector"); // #nocov
             }
             OUT get() {return OUT(vec.begin(), vec.size());}
         } ;
 
+        // Provides only Map::RowVectorX<t> export
+        template<typename T>
+        class Exporter<Eigen::Map<Eigen::Matrix<T, 1, Eigen::Dynamic> > > {
+            typedef typename Eigen::Map<Eigen::Matrix<T, 1, Eigen::Dynamic> > OUT ;
+            const static int RTYPE = ::Rcpp::traits::r_sexptype_traits<T>::rtype ;
+            Rcpp::Vector<RTYPE> vec ;
+
+        public:
+            Exporter(SEXP x) : vec(x) {
+                if (TYPEOF(x) != RTYPE)
+                    throw std::invalid_argument("Wrong R type for mapped rowvector");
+            }
+            OUT get() {return OUT(vec.begin(), vec.size());}
+        };
+
         template<typename T>
         class Exporter< Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > > {
             typedef typename Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1> > OUT ;
@@ -239,9 +215,9 @@ namespace Rcpp{
             int d_ncol, d_nrow ;
 
             public:
-            Exporter(SEXP x) : vec(x), d_ncol(1), d_nrow(Rf_length(x)) {
+            Exporter(SEXP x) : vec(x), d_ncol(1), d_nrow(Rf_xlength(x)) {
                 if (TYPEOF(x) != RTYPE)
-                    throw std::invalid_argument("Wrong R type for mapped matrix");
+                    throw std::invalid_argument("Wrong R type for mapped matrix");	// #nocov
                 if (::Rf_isMatrix(x)) {
                     int *dims = INTEGER( ::Rf_getAttrib( x, R_DimSymbol ) ) ;
                     d_nrow = dims[0];
@@ -259,7 +235,7 @@ namespace Rcpp{
             int d_ncol, d_nrow ;
 
             public:
-            Exporter(SEXP x) : vec(x), d_ncol(1), d_nrow(Rf_length(x)) {
+            Exporter(SEXP x) : vec(x), d_ncol(1), d_nrow(Rf_xlength(x)) {
                 if (TYPEOF(x) != RTYPE)
                     throw std::invalid_argument("Wrong R type for mapped 2D array");
                 if (::Rf_isMatrix(x)) {
@@ -315,41 +291,81 @@ namespace Rcpp{
                 MatrixExporterForEigen< Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>, T >(x){}
         };
 
+        // Starting from Eigen 3.3 MappedSparseMatrix was deprecated.
+        // The new type is Map<SparseMatrix>.
         template<typename T>
-        class Exporter<Eigen::MappedSparseMatrix<T> > {
+        class Exporter<Eigen::Map<Eigen::SparseMatrix<T> > > {
         public:
             const static int RTYPE = ::Rcpp::traits::r_sexptype_traits<T>::rtype ;
             Exporter(SEXP x) : d_x(x), d_dims(d_x.slot("Dim")), d_i(d_x.slot("i")), d_p(d_x.slot("p")), xx( d_x.slot("x") ) {
                 if (!d_x.is("dgCMatrix"))
                     throw std::invalid_argument("Need S4 class dgCMatrix for a mapped sparse matrix");
             }
-            Eigen::MappedSparseMatrix<T> get() {
-                return Eigen::MappedSparseMatrix<T>(d_dims[0], d_dims[1], d_p[d_dims[1]],
-                                                    d_p.begin(), d_i.begin(), xx.begin() );
+            Eigen::Map<Eigen::SparseMatrix<T> > get() {
+                return Eigen::Map<Eigen::SparseMatrix<T> >(d_dims[0], d_dims[1], d_p[d_dims[1]],
+                                                           d_p.begin(), d_i.begin(), xx.begin() );
             }
         protected:
             S4            d_x;
             IntegerVector d_dims, d_i, d_p;
             Vector<RTYPE> xx ;
         };
-
+        // // Deprecated
+        // template<typename T>
+        // class Exporter<Eigen::MappedSparseMatrix<T> > {
+        // public:
+        //     const static int RTYPE = ::Rcpp::traits::r_sexptype_traits<T>::rtype ;
+        //     Exporter(SEXP x) : d_x(x), d_dims(d_x.slot("Dim")), d_i(d_x.slot("i")), d_p(d_x.slot("p")), xx( d_x.slot("x") ) {
+        //         if (!d_x.is("dgCMatrix"))
+        //             throw std::invalid_argument("Need S4 class dgCMatrix for a mapped sparse matrix");
+        //     }
+        //     Eigen::MappedSparseMatrix<T> get() {
+        //         return Eigen::MappedSparseMatrix<T>(d_dims[0], d_dims[1], d_p[d_dims[1]],
+        //                                             d_p.begin(), d_i.begin(), xx.begin() );
+        //     }
+        // protected:
+        //     S4            d_x;
+        //     IntegerVector d_dims, d_i, d_p;
+        //     Vector<RTYPE> xx ;
+        // };
+
+        // Starting from Eigen 3.3 MappedSparseMatrix was deprecated.
+        // The new type is Map<SparseMatrix>.
         template<typename T>
-        class Exporter<Eigen::MappedSparseMatrix<T, Eigen::RowMajor> > {
+        class Exporter<Eigen::Map<Eigen::SparseMatrix<T, Eigen::RowMajor> > > {
         public:
             const static int RTYPE = ::Rcpp::traits::r_sexptype_traits<T>::rtype ;
             Exporter(SEXP x) : d_x(x), d_dims(d_x.slot("Dim")), d_j(d_x.slot("j")), d_p(d_x.slot("p")), xx( d_x.slot("x") ) {
                 if (!d_x.is("dgRMatrix"))
                     throw std::invalid_argument("Need S4 class dgRMatrix for a mapped sparse matrix");
             }
-            Eigen::MappedSparseMatrix<T, Eigen::RowMajor> get() {
-                return Eigen::MappedSparseMatrix<T, Eigen::RowMajor>(d_dims[0], d_dims[1], d_p[d_dims[1]],
-                                                                     d_p.begin(), d_j.begin(), xx.begin() );
+            Eigen::Map<Eigen::SparseMatrix<T, Eigen::RowMajor> > get() {
+                return Eigen::Map<Eigen::SparseMatrix<T, Eigen::RowMajor> >(d_dims[0], d_dims[1], d_p[d_dims[1]],
+                                                                            d_p.begin(), d_j.begin(), xx.begin() );
             }
         protected:
             S4            d_x;
             IntegerVector d_dims, d_j, d_p;
             Vector<RTYPE> xx ;
         };
+        // // Deprecated
+        // template<typename T>
+        // class Exporter<Eigen::MappedSparseMatrix<T, Eigen::RowMajor> > {
+        // public:
+        //     const static int RTYPE = ::Rcpp::traits::r_sexptype_traits<T>::rtype ;
+        //     Exporter(SEXP x) : d_x(x), d_dims(d_x.slot("Dim")), d_j(d_x.slot("j")), d_p(d_x.slot("p")), xx( d_x.slot("x") ) {
+        //         if (!d_x.is("dgRMatrix"))
+        //             throw std::invalid_argument("Need S4 class dgRMatrix for a mapped sparse matrix");
+        //     }
+        //     Eigen::MappedSparseMatrix<T, Eigen::RowMajor> get() {
+        //         return Eigen::MappedSparseMatrix<T, Eigen::RowMajor>(d_dims[0], d_dims[1], d_p[d_dims[1]],
+        //                                                              d_p.begin(), d_j.begin(), xx.begin() );
+        //     }
+        // protected:
+        //     S4            d_x;
+        //     IntegerVector d_dims, d_j, d_p;
+        //     Vector<RTYPE> xx ;
+        // };
 
         template<typename T>
         class Exporter<Eigen::SparseMatrix<T> > {
diff --git a/inst/include/unsupported/Eigen/AdolcForward b/inst/include/unsupported/Eigen/AdolcForward
index 2627decd..7da99972 100644
--- a/inst/include/unsupported/Eigen/AdolcForward
+++ b/inst/include/unsupported/Eigen/AdolcForward
@@ -7,8 +7,8 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_ADLOC_FORWARD
-#define EIGEN_ADLOC_FORWARD
+#ifndef EIGEN_ADLOC_FORWARD_MODULE_H
+#define EIGEN_ADLOC_FORWARD_MODULE_H
 
 //--------------------------------------------------------------------------------
 //
@@ -23,64 +23,66 @@
 
 #define ADOLC_TAPELESS
 #ifndef NUMBER_DIRECTIONS
-# define NUMBER_DIRECTIONS 2
+#define NUMBER_DIRECTIONS 2
 #endif
-#include <adolc/adouble.h>
+#include <adolc/adtl.h>
 
 // adolc defines some very stupid macros:
 #if defined(malloc)
-# undef malloc
+#undef malloc
 #endif
 
 #if defined(calloc)
-# undef calloc
+#undef calloc
 #endif
 
 #if defined(realloc)
-# undef realloc
+#undef realloc
 #endif
 
-#include <Eigen/Core>
+#include "../../Eigen/Core"
 
 namespace Eigen {
 
 /**
-  * \defgroup AdolcForward_Module Adolc forward module
-  * This module provides support for adolc's adouble type in forward mode.
-  * ADOL-C is a C++ automatic differentiation library,
-  * see https://projects.coin-or.org/ADOL-C for more information.
-  * It mainly consists in:
-  *  - a struct Eigen::NumTraits<adtl::adouble> specialization
-  *  - overloads of internal::* math function for adtl::adouble type.
-  *
-  * Note that the maximal number of directions is controlled by
-  * the preprocessor token NUMBER_DIRECTIONS. The default is 2.
-  *
-  * \code
-  * #include <unsupported/Eigen/AdolcSupport>
-  * \endcode
-  */
-  //@{
-
-} // namespace Eigen
+ * \defgroup AdolcForward_Module Adolc forward module
+ * This module provides support for adolc's adouble type in forward mode.
+ * ADOL-C is a C++ automatic differentiation library,
+ * see https://projects.coin-or.org/ADOL-C for more information.
+ * It mainly consists in:
+ *  - a struct Eigen::NumTraits<adtl::adouble> specialization
+ *  - overloads of internal::* math function for adtl::adouble type.
+ *
+ * Note that the maximal number of directions is controlled by
+ * the preprocessor token NUMBER_DIRECTIONS. The default is 2.
+ *
+ * \code
+ * #include <unsupported/Eigen/AdolcSupport>
+ * \endcode
+ */
+//@{
+
+}  // namespace Eigen
 
 // Eigen's require a few additional functions which must be defined in the same namespace
 // than the custom scalar type own namespace
 namespace adtl {
 
-inline const adouble& conj(const adouble& x)  { return x; }
-inline const adouble& real(const adouble& x)  { return x; }
-inline adouble imag(const adouble&)    { return 0.; }
-inline adouble abs(const adouble&  x)  { return fabs(x); }
-inline adouble abs2(const adouble& x)  { return x*x; }
+inline const adouble& conj(const adouble& x) { return x; }
+inline const adouble& real(const adouble& x) { return x; }
+inline adouble imag(const adouble&) { return 0.; }
+inline adouble abs(const adouble& x) { return fabs(x); }
+inline adouble abs2(const adouble& x) { return x * x; }
 
-}
+inline bool(isinf)(const adouble& x) { return (Eigen::numext::isinf)(x.getValue()); }
+inline bool(isnan)(const adouble& x) { return (Eigen::numext::isnan)(x.getValue()); }
+
+}  // namespace adtl
 
 namespace Eigen {
 
-template<> struct NumTraits<adtl::adouble>
-    : NumTraits<double>
-{
+template <>
+struct NumTraits<adtl::adouble> : NumTraits<double> {
   typedef adtl::adouble Real;
   typedef adtl::adouble NonInteger;
   typedef adtl::adouble Nested;
@@ -95,20 +97,20 @@ template<> struct NumTraits<adtl::adouble>
   };
 };
 
-template<typename Functor> class AdolcForwardJacobian : public Functor
-{
+template <typename Functor>
+class AdolcForwardJacobian : public Functor {
   typedef adtl::adouble ActiveScalar;
-public:
 
+ public:
   AdolcForwardJacobian() : Functor() {}
   AdolcForwardJacobian(const Functor& f) : Functor(f) {}
 
   // forward constructors
-  template<typename T0>
+  template <typename T0>
   AdolcForwardJacobian(const T0& a0) : Functor(a0) {}
-  template<typename T0, typename T1>
+  template <typename T0, typename T1>
   AdolcForwardJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {}
-  template<typename T0, typename T1, typename T2>
+  template <typename T0, typename T1, typename T2>
   AdolcForwardJacobian(const T0& a0, const T1& a1, const T1& a2) : Functor(a0, a1, a2) {}
 
   typedef typename Functor::InputType InputType;
@@ -118,11 +120,9 @@ public:
   typedef Matrix<ActiveScalar, InputType::SizeAtCompileTime, 1> ActiveInput;
   typedef Matrix<ActiveScalar, ValueType::SizeAtCompileTime, 1> ActiveValue;
 
-  void operator() (const InputType& x, ValueType* v, JacobianType* _jac) const
-  {
-    eigen_assert(v!=0);
-    if (!_jac)
-    {
+  void operator()(const InputType& x, ValueType* v, JacobianType* _jac) const {
+    eigen_assert(v != 0);
+    if (!_jac) {
       Functor::operator()(x, v);
       return;
     }
@@ -132,25 +132,22 @@ public:
     ActiveInput ax = x.template cast<ActiveScalar>();
     ActiveValue av(jac.rows());
 
-    for (int j=0; j<jac.cols(); j++)
-      for (int i=0; i<jac.cols(); i++)
-        ax[i].setADValue(j, i==j ? 1 : 0);
+    for (int j = 0; j < jac.cols(); j++)
+      for (int i = 0; i < jac.cols(); i++) ax[i].setADValue(j, i == j ? 1 : 0);
 
     Functor::operator()(ax, &av);
 
-    for (int i=0; i<jac.rows(); i++)
-    {
+    for (int i = 0; i < jac.rows(); i++) {
       (*v)[i] = av[i].getValue();
-      for (int j=0; j<jac.cols(); j++)
-        jac.coeffRef(i,j) = av[i].getADValue(j);
+      for (int j = 0; j < jac.cols(); j++) jac.coeffRef(i, j) = av[i].getADValue(j);
     }
   }
-protected:
 
+ protected:
 };
 
 //@}
 
-}
+}  // namespace Eigen
 
-#endif // EIGEN_ADLOC_FORWARD
+#endif  // EIGEN_ADLOC_FORWARD_MODULE_H
diff --git a/inst/include/unsupported/Eigen/AlignedVector3 b/inst/include/unsupported/Eigen/AlignedVector3
index 7b45e6cc..8301ef07 100644
--- a/inst/include/unsupported/Eigen/AlignedVector3
+++ b/inst/include/unsupported/Eigen/AlignedVector3
@@ -7,184 +7,205 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_ALIGNED_VECTOR3
-#define EIGEN_ALIGNED_VECTOR3
+#ifndef EIGEN_ALIGNED_VECTOR3_MODULE_H
+#define EIGEN_ALIGNED_VECTOR3_MODULE_H
 
-#include <Eigen/Geometry>
+#include "../../Eigen/Geometry"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
 namespace Eigen {
 
 /**
-  * \defgroup AlignedVector3_Module Aligned vector3 module
-  *
-  * \code
-  * #include <unsupported/Eigen/AlignedVector3>
-  * \endcode
-  */
-  //@{
-
+ * \defgroup AlignedVector3_Module Aligned vector3 module
+ *
+ * \code
+ * #include <unsupported/Eigen/AlignedVector3>
+ * \endcode
+ */
+//@{
 
 /** \class AlignedVector3
-  *
-  * \brief A vectorization friendly 3D vector
-  *
-  * This class represents a 3D vector internally using a 4D vector
-  * such that vectorization can be seamlessly enabled. Of course,
-  * the same result can be achieved by directly using a 4D vector.
-  * This class makes this process simpler.
-  *
-  */
+ *
+ * \brief A vectorization friendly 3D vector
+ *
+ * This class represents a 3D vector internally using a 4D vector
+ * such that vectorization can be seamlessly enabled. Of course,
+ * the same result can be achieved by directly using a 4D vector.
+ * This class makes this process simpler.
+ *
+ */
 // TODO specialize Cwise
-template<typename _Scalar> class AlignedVector3;
+template <typename Scalar_>
+class AlignedVector3;
 
 namespace internal {
-template<typename _Scalar> struct traits<AlignedVector3<_Scalar> >
-  : traits<Matrix<_Scalar,3,1,0,4,1> >
-{
-};
-}
-
-template<typename _Scalar> class AlignedVector3
-  : public MatrixBase<AlignedVector3<_Scalar> >
-{
-    typedef Matrix<_Scalar,4,1> CoeffType;
-    CoeffType m_coeffs;
-  public:
-
-    typedef MatrixBase<AlignedVector3<_Scalar> > Base;	
-    EIGEN_DENSE_PUBLIC_INTERFACE(AlignedVector3)
-    using Base::operator*;
-
-    inline Index rows() const { return 3; }
-    inline Index cols() const { return 1; }
-
-    inline const Scalar& coeff(Index row, Index col) const
-    { return m_coeffs.coeff(row, col); }
-
-    inline Scalar& coeffRef(Index row, Index col)
-    { return m_coeffs.coeffRef(row, col); }
-
-    inline const Scalar& coeff(Index index) const
-    { return m_coeffs.coeff(index); }
-
-    inline Scalar& coeffRef(Index index)
-    { return m_coeffs.coeffRef(index);}
-
-
-    inline AlignedVector3(const Scalar& x, const Scalar& y, const Scalar& z)
-      : m_coeffs(x, y, z, Scalar(0))
-    {}
-
-    inline AlignedVector3(const AlignedVector3& other)
-      : Base(), m_coeffs(other.m_coeffs)
-    {}
-
-    template<typename XprType, int Size=XprType::SizeAtCompileTime>
-    struct generic_assign_selector {};
-
-    template<typename XprType> struct generic_assign_selector<XprType,4>
-    {
-      inline static void run(AlignedVector3& dest, const XprType& src)
-      {
-        dest.m_coeffs = src;
-      }
-    };
-
-    template<typename XprType> struct generic_assign_selector<XprType,3>
-    {
-      inline static void run(AlignedVector3& dest, const XprType& src)
-      {
-        dest.m_coeffs.template head<3>() = src;
-        dest.m_coeffs.w() = Scalar(0);
-      }
-    };
-
-    template<typename Derived>
-    inline explicit AlignedVector3(const MatrixBase<Derived>& other)
-    {
-      generic_assign_selector<Derived>::run(*this,other.derived());
-    }
+template <typename Scalar_>
+struct traits<AlignedVector3<Scalar_> > : traits<Matrix<Scalar_, 3, 1, 0, 4, 1> > {};
+}  // namespace internal
 
-    inline AlignedVector3& operator=(const AlignedVector3& other)
-    { m_coeffs = other.m_coeffs; return *this; }
+template <typename Scalar_>
+class AlignedVector3 : public MatrixBase<AlignedVector3<Scalar_> > {
+  typedef Matrix<Scalar_, 4, 1> CoeffType;
+  CoeffType m_coeffs;
 
+ public:
+  typedef MatrixBase<AlignedVector3<Scalar_> > Base;
+  EIGEN_DENSE_PUBLIC_INTERFACE(AlignedVector3)
+  using Base::operator*;
 
-    inline AlignedVector3 operator+(const AlignedVector3& other) const
-    { return AlignedVector3(m_coeffs + other.m_coeffs); }
+  inline Index rows() const { return 3; }
+  inline Index cols() const { return 1; }
 
-    inline AlignedVector3& operator+=(const AlignedVector3& other)
-    { m_coeffs += other.m_coeffs; return *this; }
+  Scalar* data() { return m_coeffs.data(); }
+  const Scalar* data() const { return m_coeffs.data(); }
+  Index innerStride() const { return 1; }
+  Index outerStride() const { return 3; }
 
-    inline AlignedVector3 operator-(const AlignedVector3& other) const
-    { return AlignedVector3(m_coeffs - other.m_coeffs); }
+  inline const Scalar& coeff(Index row, Index col) const { return m_coeffs.coeff(row, col); }
 
-    inline AlignedVector3 operator-=(const AlignedVector3& other)
-    { m_coeffs -= other.m_coeffs; return *this; }
+  inline Scalar& coeffRef(Index row, Index col) { return m_coeffs.coeffRef(row, col); }
 
-    inline AlignedVector3 operator*(const Scalar& s) const
-    { return AlignedVector3(m_coeffs * s); }
+  inline const Scalar& coeff(Index index) const { return m_coeffs.coeff(index); }
 
-    inline friend AlignedVector3 operator*(const Scalar& s,const AlignedVector3& vec)
-    { return AlignedVector3(s * vec.m_coeffs); }
+  inline Scalar& coeffRef(Index index) { return m_coeffs.coeffRef(index); }
 
-    inline AlignedVector3& operator*=(const Scalar& s)
-    { m_coeffs *= s; return *this; }
+  inline AlignedVector3() {}
 
-    inline AlignedVector3 operator/(const Scalar& s) const
-    { return AlignedVector3(m_coeffs / s); }
+  inline AlignedVector3(const Scalar& x, const Scalar& y, const Scalar& z) : m_coeffs(x, y, z, Scalar(0)) {}
 
-    inline AlignedVector3& operator/=(const Scalar& s)
-    { m_coeffs /= s; return *this; }
+  inline AlignedVector3(const AlignedVector3& other) : Base(), m_coeffs(other.m_coeffs) {}
 
-    inline Scalar dot(const AlignedVector3& other) const
-    {
-      eigen_assert(m_coeffs.w()==Scalar(0));
-      eigen_assert(other.m_coeffs.w()==Scalar(0));
-      return m_coeffs.dot(other.m_coeffs);
-    }
+  template <typename XprType, int Size = XprType::SizeAtCompileTime>
+  struct generic_assign_selector {};
 
-    inline void normalize()
-    {
-      m_coeffs /= norm();
-    }
-
-    inline AlignedVector3 normalized()
-    {
-      return AlignedVector3(m_coeffs / norm());
-    }
+  template <typename XprType>
+  struct generic_assign_selector<XprType, 4> {
+    inline static void run(AlignedVector3& dest, const XprType& src) { dest.m_coeffs = src; }
+  };
 
-    inline Scalar sum() const
-    {
-      eigen_assert(m_coeffs.w()==Scalar(0));
-      return m_coeffs.sum();
+  template <typename XprType>
+  struct generic_assign_selector<XprType, 3> {
+    inline static void run(AlignedVector3& dest, const XprType& src) {
+      dest.m_coeffs.template head<3>() = src;
+      dest.m_coeffs.w() = Scalar(0);
     }
+  };
+
+  template <typename Derived>
+  inline AlignedVector3(const MatrixBase<Derived>& other) {
+    generic_assign_selector<Derived>::run(*this, other.derived());
+  }
+
+  inline AlignedVector3& operator=(const AlignedVector3& other) {
+    m_coeffs = other.m_coeffs;
+    return *this;
+  }
+
+  template <typename Derived>
+  inline AlignedVector3& operator=(const MatrixBase<Derived>& other) {
+    generic_assign_selector<Derived>::run(*this, other.derived());
+    return *this;
+  }
+
+  inline AlignedVector3 operator+(const AlignedVector3& other) const {
+    return AlignedVector3(m_coeffs + other.m_coeffs);
+  }
+
+  inline AlignedVector3& operator+=(const AlignedVector3& other) {
+    m_coeffs += other.m_coeffs;
+    return *this;
+  }
+
+  inline AlignedVector3 operator-(const AlignedVector3& other) const {
+    return AlignedVector3(m_coeffs - other.m_coeffs);
+  }
+
+  inline AlignedVector3 operator-() const { return AlignedVector3(-m_coeffs); }
+
+  inline AlignedVector3 operator-=(const AlignedVector3& other) {
+    m_coeffs -= other.m_coeffs;
+    return *this;
+  }
+
+  inline AlignedVector3 operator*(const Scalar& s) const { return AlignedVector3(m_coeffs * s); }
+
+  inline friend AlignedVector3 operator*(const Scalar& s, const AlignedVector3& vec) {
+    return AlignedVector3(s * vec.m_coeffs);
+  }
+
+  inline AlignedVector3& operator*=(const Scalar& s) {
+    m_coeffs *= s;
+    return *this;
+  }
+
+  inline AlignedVector3 operator/(const Scalar& s) const { return AlignedVector3(m_coeffs / s); }
+
+  inline AlignedVector3& operator/=(const Scalar& s) {
+    m_coeffs /= s;
+    return *this;
+  }
+
+  inline Scalar dot(const AlignedVector3& other) const {
+    eigen_assert(m_coeffs.w() == Scalar(0));
+    eigen_assert(other.m_coeffs.w() == Scalar(0));
+    return m_coeffs.dot(other.m_coeffs);
+  }
+
+  inline void normalize() { m_coeffs /= norm(); }
+
+  inline AlignedVector3 normalized() const { return AlignedVector3(m_coeffs / norm()); }
+
+  inline Scalar sum() const {
+    eigen_assert(m_coeffs.w() == Scalar(0));
+    return m_coeffs.sum();
+  }
+
+  inline Scalar squaredNorm() const {
+    eigen_assert(m_coeffs.w() == Scalar(0));
+    return m_coeffs.squaredNorm();
+  }
+
+  inline Scalar norm() const {
+    using std::sqrt;
+    return sqrt(squaredNorm());
+  }
+
+  inline AlignedVector3 cross(const AlignedVector3& other) const {
+    return AlignedVector3(m_coeffs.cross3(other.m_coeffs));
+  }
+
+  template <typename Derived>
+  inline bool isApprox(const MatrixBase<Derived>& other,
+                       const RealScalar& eps = NumTraits<Scalar>::dummy_precision()) const {
+    return m_coeffs.template head<3>().isApprox(other, eps);
+  }
+
+  CoeffType& coeffs() { return m_coeffs; }
+  const CoeffType& coeffs() const { return m_coeffs; }
+};
 
-    inline Scalar squaredNorm() const
-    {
-      eigen_assert(m_coeffs.w()==Scalar(0));
-      return m_coeffs.squaredNorm();
-    }
+namespace internal {
 
-    inline Scalar norm() const
-    {
-      using std::sqrt;
-      return sqrt(squaredNorm());
-    }
+template <typename Scalar_>
+struct eval<AlignedVector3<Scalar_>, Dense> {
+  typedef const AlignedVector3<Scalar_>& type;
+};
 
-    inline AlignedVector3 cross(const AlignedVector3& other) const
-    {
-      return AlignedVector3(m_coeffs.cross3(other.m_coeffs));
-    }
+template <typename Scalar>
+struct evaluator<AlignedVector3<Scalar> > : evaluator<Matrix<Scalar, 4, 1> > {
+  typedef AlignedVector3<Scalar> XprType;
+  typedef evaluator<Matrix<Scalar, 4, 1> > Base;
 
-    template<typename Derived>
-    inline bool isApprox(const MatrixBase<Derived>& other, RealScalar eps=NumTraits<Scalar>::dummy_precision()) const
-    {
-      return m_coeffs.template head<3>().isApprox(other,eps);
-    }
+  evaluator(const XprType& m) : Base(m.coeffs()) {}
 };
 
+}  // namespace internal
+
 //@}
 
-}
+}  // namespace Eigen
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_ALIGNED_VECTOR3
+#endif  // EIGEN_ALIGNED_VECTOR3_MODULE_H
diff --git a/inst/include/unsupported/Eigen/ArpackSupport b/inst/include/unsupported/Eigen/ArpackSupport
index 37a2799e..3987f7c4 100644
--- a/inst/include/unsupported/Eigen/ArpackSupport
+++ b/inst/include/unsupported/Eigen/ArpackSupport
@@ -9,23 +9,25 @@
 #ifndef EIGEN_ARPACKSUPPORT_MODULE_H
 #define EIGEN_ARPACKSUPPORT_MODULE_H
 
-#include <Eigen/Core>
-
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "../../Eigen/Core"
 
 /** \defgroup ArpackSupport_Module Arpack support module
-  *
-  * This module provides a wrapper to Arpack, a library for sparse eigenvalue decomposition.
-  *
-  * \code
-  * #include <Eigen/ArpackSupport>
-  * \endcode
-  */
-
-#include <Eigen/SparseCholesky>
+ *
+ * This module provides a wrapper to Arpack, a library for sparse eigenvalue decomposition.
+ *
+ * \code
+ * #include <Eigen/ArpackSupport>
+ * \endcode
+ */
+
+#include "../../Eigen/SparseCholesky"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+// IWYU pragma: begin_exports
 #include "src/Eigenvalues/ArpackSelfAdjointEigenSolver.h"
+// IWYU pragma: end_exports
 
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_ARPACKSUPPORT_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
+#endif  // EIGEN_ARPACKSUPPORT_MODULE_H
diff --git a/inst/include/unsupported/Eigen/AutoDiff b/inst/include/unsupported/Eigen/AutoDiff
index abf5b7d6..0480c69e 100644
--- a/inst/include/unsupported/Eigen/AutoDiff
+++ b/inst/include/unsupported/Eigen/AutoDiff
@@ -7,34 +7,42 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_AUTODIFF_MODULE
-#define EIGEN_AUTODIFF_MODULE
+#ifndef EIGEN_AUTODIFF_MODULE_H
+#define EIGEN_AUTODIFF_MODULE_H
+
+#include "../../Eigen/Core"
 
 namespace Eigen {
 
 /**
-  * \defgroup AutoDiff_Module Auto Diff module
-  *
-  * This module features forward automatic differentation via a simple
-  * templated scalar type wrapper AutoDiffScalar.
-  *
-  * Warning : this should NOT be confused with numerical differentiation, which
-  * is a different method and has its own module in Eigen : \ref NumericalDiff_Module.
-  *
-  * \code
-  * #include <unsupported/Eigen/AutoDiff>
-  * \endcode
-  */
+ * \defgroup AutoDiff_Module Auto Diff module
+ *
+ * This module features forward automatic differentation via a simple
+ * templated scalar type wrapper AutoDiffScalar.
+ *
+ * Warning : this should NOT be confused with numerical differentiation, which
+ * is a different method and has its own module in Eigen : \ref NumericalDiff_Module.
+ *
+ * \code
+ * #include <unsupported/Eigen/AutoDiff>
+ * \endcode
+ */
 //@{
 
-}
+}  // namespace Eigen
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
+// IWYU pragma: begin_exports
+#include "src/AutoDiff/CoherentPadOp.h"
 #include "src/AutoDiff/AutoDiffScalar.h"
 // #include "src/AutoDiff/AutoDiffVector.h"
 #include "src/AutoDiff/AutoDiffJacobian.h"
+// IWYU pragma: end_exports
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
 namespace Eigen {
 //@}
 }
 
-#endif // EIGEN_AUTODIFF_MODULE
+#endif  // EIGEN_AUTODIFF_MODULE_H
diff --git a/inst/include/unsupported/Eigen/BVH b/inst/include/unsupported/Eigen/BVH
index 0161a540..58fb72fb 100644
--- a/inst/include/unsupported/Eigen/BVH
+++ b/inst/include/unsupported/Eigen/BVH
@@ -10,9 +10,9 @@
 #ifndef EIGEN_BVH_MODULE_H
 #define EIGEN_BVH_MODULE_H
 
-#include <Eigen/Core>
-#include <Eigen/Geometry>
-#include <Eigen/StdVector>
+#include "../../Eigen/Core"
+#include "../../Eigen/Geometry"
+#include "../../Eigen/StdVector"
 #include <algorithm>
 #include <queue>
 
@@ -28,47 +28,57 @@ namespace Eigen {
   * #include <unsupported/Eigen/BVH>
   * \endcode
   *
-  * A bounding volume hierarchy (BVH) can accelerate many geometric queries.  This module provides a generic implementation
-  * of the two basic algorithms over a BVH: intersection of a query object against all objects in the hierarchy and minimization
-  * of a function over the objects in the hierarchy.  It also provides intersection and minimization over a cartesian product of
-  * two BVH's.  A BVH accelerates intersection by using the fact that if a query object does not intersect a volume, then it cannot
-  * intersect any object contained in that volume.  Similarly, a BVH accelerates minimization because the minimum of a function
+  * A bounding volume hierarchy (BVH) can accelerate many geometric queries.  This module provides a generic
+  implementation
+  * of the two basic algorithms over a BVH: intersection of a query object against all objects in the hierarchy and
+  minimization
+  * of a function over the objects in the hierarchy.  It also provides intersection and minimization over a cartesian
+  product of
+  * two BVH's.  A BVH accelerates intersection by using the fact that if a query object does not intersect a volume,
+  then it cannot
+  * intersect any object contained in that volume.  Similarly, a BVH accelerates minimization because the minimum of a
+  function
   * over a volume is no greater than the minimum of a function over any object contained in it.
   *
   * Some sample queries that can be written in terms of intersection are:
   *   - Determine all points where a ray intersects a triangle mesh
   *   - Given a set of points, determine which are contained in a query sphere
   *   - Given a set of spheres, determine which contain the query point
-  *   - Given a set of disks, determine if any is completely contained in a query rectangle (represent each 2D disk as a point \f$(x,y,r)\f$
-  *     in 3D and represent the rectangle as a pyramid based on the original rectangle and shrinking in the \f$r\f$ direction)
-  *   - Given a set of points, count how many pairs are \f$d\pm\epsilon\f$ apart (done by looking at the cartesian product of the set
+  *   - Given a set of disks, determine if any is completely contained in a query rectangle (represent each 2D disk as a
+  point \f$(x,y,r)\f$
+  *     in 3D and represent the rectangle as a pyramid based on the original rectangle and shrinking in the \f$r\f$
+  direction)
+  *   - Given a set of points, count how many pairs are \f$d\pm\epsilon\f$ apart (done by looking at the cartesian
+  product of the set
   *     of points with itself)
   *
   * Some sample queries that can be written in terms of function minimization over a set of objects are:
-  *   - Find the intersection between a ray and a triangle mesh closest to the ray origin (function is infinite off the ray)
+  *   - Find the intersection between a ray and a triangle mesh closest to the ray origin (function is infinite off the
+  ray)
   *   - Given a polyline and a query point, determine the closest point on the polyline to the query
-  *   - Find the diameter of a point cloud (done by looking at the cartesian product and using negative distance as the function)
+  *   - Find the diameter of a point cloud (done by looking at the cartesian product and using negative distance as the
+  function)
   *   - Determine how far two meshes are from colliding (this is also a cartesian product query)
   *
-  * This implementation decouples the basic algorithms both from the type of hierarchy (and the types of the bounding volumes) and
-  * from the particulars of the query.  To enable abstraction from the BVH, the BVH is required to implement a generic mechanism
+  * This implementation decouples the basic algorithms both from the type of hierarchy (and the types of the bounding
+  volumes) and
+  * from the particulars of the query.  To enable abstraction from the BVH, the BVH is required to implement a generic
+  mechanism
   * for traversal.  To abstract from the query, the query is responsible for keeping track of results.
   *
-  * To be used in the algorithms, a hierarchy must implement the following traversal mechanism (see KdBVH for a sample implementation): \code
-      typedef Volume  //the type of bounding volume
-      typedef Object  //the type of object in the hierarchy
-      typedef Index   //a reference to a node in the hierarchy--typically an int or a pointer
-      typedef VolumeIterator //an iterator type over node children--returns Index
-      typedef ObjectIterator //an iterator over object (leaf) children--returns const Object &
-      Index getRootIndex() const //returns the index of the hierarchy root
-      const Volume &getVolume(Index index) const //returns the bounding volume of the node at given index
-      void getChildren(Index index, VolumeIterator &outVBegin, VolumeIterator &outVEnd,
-                      ObjectIterator &outOBegin, ObjectIterator &outOEnd) const
+  * To be used in the algorithms, a hierarchy must implement the following traversal mechanism (see KdBVH for a sample
+  implementation): \code typedef Volume  //the type of bounding volume typedef Object  //the type of object in the
+  hierarchy typedef Index   //a reference to a node in the hierarchy--typically an int or a pointer typedef
+  VolumeIterator //an iterator type over node children--returns Index typedef ObjectIterator //an iterator over object
+  (leaf) children--returns const Object & Index getRootIndex() const //returns the index of the hierarchy root const
+  Volume &getVolume(Index index) const //returns the bounding volume of the node at given index void getChildren(Index
+  index, VolumeIterator &outVBegin, VolumeIterator &outVEnd, ObjectIterator &outOBegin, ObjectIterator &outOEnd) const
       //getChildren takes a node index and makes [outVBegin, outVEnd) range over its node children
       //and [outOBegin, outOEnd) range over its object children
     \endcode
   *
-  * To use the hierarchy, call BVIntersect or BVMinimize, passing it a BVH (or two, for cartesian product) and a minimizer or intersector.
+  * To use the hierarchy, call BVIntersect or BVMinimize, passing it a BVH (or two, for cartesian product) and a
+  minimizer or intersector.
   * For an intersection query on a single BVH, the intersector encapsulates the query and must provide two functions:
   * \code
       bool intersectVolume(const Volume &volume) //returns true if the query intersects the volume
@@ -79,7 +89,8 @@ namespace Eigen {
   * responsibility of the intersectObject function to keep track of the results in whatever manner is appropriate.
   * The cartesian product intersection and the BVMinimize queries are similar--see their individual documentation.
   *
-  * The following is a simple but complete example for how to use the BVH to accelerate the search for a closest red-blue point pair:
+  * The following is a simple but complete example for how to use the BVH to accelerate the search for a closest
+  red-blue point pair:
   * \include BVH_Example.cpp
   * Output: \verbinclude BVH_Example.out
   */
@@ -87,9 +98,11 @@ namespace Eigen {
 
 //@{
 
+// IWYU pragma: begin_exports
 #include "src/BVH/BVAlgorithms.h"
 #include "src/BVH/KdBVH.h"
+// IWYU pragma: end_exports
 
 //@}
 
-#endif // EIGEN_BVH_MODULE_H
+#endif  // EIGEN_BVH_MODULE_H
diff --git a/inst/include/unsupported/Eigen/CMakeLists.txt b/inst/include/unsupported/Eigen/CMakeLists.txt
new file mode 100644
index 00000000..1517ba95
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CMakeLists.txt
@@ -0,0 +1,31 @@
+set(Eigen_HEADERS
+  AdolcForward
+  AlignedVector3
+  ArpackSupport
+  AutoDiff
+  BVH
+  EulerAngles
+  FFT
+  IterativeSolvers
+  KroneckerProduct
+  LevenbergMarquardt
+  MatrixFunctions
+  MPRealSupport
+  NNLS
+  NonLinearOptimization
+  NumericalDiff
+  OpenGLSupport
+  Polynomials
+  SparseExtra
+  SpecialFunctions
+  Splines
+  )
+
+install(FILES
+  ${Eigen_HEADERS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel
+  )
+
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
+
+add_subdirectory(CXX11)
diff --git a/inst/include/unsupported/Eigen/CXX11/CMakeLists.txt b/inst/include/unsupported/Eigen/CXX11/CMakeLists.txt
new file mode 100644
index 00000000..385ed240
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/CMakeLists.txt
@@ -0,0 +1,8 @@
+set(Eigen_CXX11_HEADERS Tensor TensorSymmetry ThreadPool)
+
+install(FILES
+  ${Eigen_CXX11_HEADERS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel
+  )
+
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h")
diff --git a/inst/include/unsupported/Eigen/CXX11/Tensor b/inst/include/unsupported/Eigen/CXX11/Tensor
new file mode 100644
index 00000000..7375a9b4
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/Tensor
@@ -0,0 +1,142 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// #ifndef EIGEN_CXX11_TENSOR_MODULE_H
+#define EIGEN_CXX11_TENSOR_MODULE_H
+
+#include "../../../Eigen/Core"
+
+#include "../SpecialFunctions"
+
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+// IWYU pragma: begin_exports
+#include "../../../Eigen/src/Core/util/Meta.h"
+#include "../../../Eigen/src/Core/util/MaxSizeVector.h"
+// IWYU pragma: end_exports
+
+/** \defgroup CXX11_Tensor_Module Tensor Module
+ *
+ * This module provides a Tensor class for storing arbitrarily indexed
+ * objects.
+ *
+ * \code
+ * #include <Eigen/CXX11/Tensor>
+ * \endcode
+ *
+ * Much of the documentation can be found \ref eigen_tensors "here".
+ */
+
+#include <atomic>
+#include <chrono>
+#include <cmath>
+#include <cstddef>
+#include <cstring>
+#include <iterator>
+#include <numeric>
+#include <random>
+#include <thread>
+
+#if defined(EIGEN_USE_THREADS) || defined(EIGEN_USE_SYCL)
+#include "../../../Eigen/ThreadPool"
+#endif
+
+#ifdef EIGEN_USE_GPU
+#include <iostream>
+#if defined(EIGEN_USE_HIP)
+#include <hip/hip_runtime.h>
+#else
+#include <cuda_runtime.h>
+#endif
+#endif
+
+// IWYU pragma: begin_exports
+#include "src/Tensor/TensorMacros.h"
+#include "src/Tensor/TensorForwardDeclarations.h"
+#include "src/Tensor/TensorMeta.h"
+#include "src/Tensor/TensorFunctors.h"
+#include "src/Tensor/TensorCostModel.h"
+#include "src/Tensor/TensorDeviceDefault.h"
+#include "src/Tensor/TensorDeviceThreadPool.h"
+#include "src/Tensor/TensorDeviceGpu.h"
+#ifndef gpu_assert
+#define gpu_assert(x)
+#endif
+#include "src/Tensor/TensorDeviceSycl.h"
+#include "src/Tensor/TensorIndexList.h"
+#include "src/Tensor/TensorDimensionList.h"
+#include "src/Tensor/TensorDimensions.h"
+#include "src/Tensor/TensorInitializer.h"
+#include "src/Tensor/TensorTraits.h"
+#include "src/Tensor/TensorRandom.h"
+#include "src/Tensor/TensorUInt128.h"
+#include "src/Tensor/TensorIntDiv.h"
+#include "src/Tensor/TensorGlobalFunctions.h"
+
+#include "src/Tensor/TensorIO.h"
+
+#include "src/Tensor/TensorBase.h"
+#include "src/Tensor/TensorBlock.h"
+
+#include "src/Tensor/TensorEvaluator.h"
+#include "src/Tensor/TensorExpr.h"
+#include "src/Tensor/TensorReduction.h"
+#include "src/Tensor/TensorReductionGpu.h"
+#include "src/Tensor/TensorArgMax.h"
+#include "src/Tensor/TensorConcatenation.h"
+#include "src/Tensor/TensorContractionMapper.h"
+#include "src/Tensor/TensorContractionBlocking.h"
+#include "src/Tensor/TensorContraction.h"
+#include "src/Tensor/TensorContractionThreadPool.h"
+#include "src/Tensor/TensorContractionGpu.h"
+#include "src/Tensor/TensorConversion.h"
+#include "src/Tensor/TensorConvolution.h"
+#include "src/Tensor/TensorFFT.h"
+#include "src/Tensor/TensorPatch.h"
+#include "src/Tensor/TensorImagePatch.h"
+#include "src/Tensor/TensorVolumePatch.h"
+#include "src/Tensor/TensorBroadcasting.h"
+#include "src/Tensor/TensorChipping.h"
+#include "src/Tensor/TensorInflation.h"
+#include "src/Tensor/TensorLayoutSwap.h"
+#include "src/Tensor/TensorMorphing.h"
+#include "src/Tensor/TensorPadding.h"
+#include "src/Tensor/TensorReverse.h"
+#include "src/Tensor/TensorRoll.h"
+#include "src/Tensor/TensorShuffling.h"
+#include "src/Tensor/TensorStriding.h"
+#include "src/Tensor/TensorCustomOp.h"
+#include "src/Tensor/TensorEvalTo.h"
+#include "src/Tensor/TensorForcedEval.h"
+#include "src/Tensor/TensorGenerator.h"
+#include "src/Tensor/TensorAssign.h"
+#include "src/Tensor/TensorScan.h"
+#include "src/Tensor/TensorTrace.h"
+
+#ifdef EIGEN_USE_SYCL
+#include "src/Tensor/TensorReductionSycl.h"
+#include "src/Tensor/TensorConvolutionSycl.h"
+#include "src/Tensor/TensorContractionSycl.h"
+#include "src/Tensor/TensorScanSycl.h"
+#endif
+
+#include "src/Tensor/TensorExecutor.h"
+#include "src/Tensor/TensorDevice.h"
+
+#include "src/Tensor/TensorStorage.h"
+#include "src/Tensor/Tensor.h"
+#include "src/Tensor/TensorFixedSize.h"
+#include "src/Tensor/TensorMap.h"
+#include "src/Tensor/TensorRef.h"
+// IWYU pragma: end_exports
+
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+// #endif // EIGEN_CXX11_TENSOR_MODULE_H
diff --git a/inst/include/unsupported/Eigen/CXX11/TensorSymmetry b/inst/include/unsupported/Eigen/CXX11/TensorSymmetry
new file mode 100644
index 00000000..0bf9a48b
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/TensorSymmetry
@@ -0,0 +1,40 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE_H
+#define EIGEN_CXX11_TENSORSYMMETRY_MODULE_H
+
+#include "Tensor"
+
+#include "../../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+#include "src/util/CXX11Meta.h"
+
+/** \defgroup TensorSymmetry_Module Tensor Symmetry Module
+ *
+ * This module provides a classes that allow for the definition of
+ * symmetries w.r.t. tensor indices.
+ *
+ * Including this module will implicitly include the Tensor module.
+ *
+ * \code
+ * #include <Eigen/TensorSymmetry>
+ * \endcode
+ */
+
+// IWYU pragma: begin_exports
+#include "src/TensorSymmetry/util/TemplateGroupTheory.h"
+#include "src/TensorSymmetry/Symmetry.h"
+#include "src/TensorSymmetry/StaticSymmetry.h"
+#include "src/TensorSymmetry/DynamicSymmetry.h"
+// IWYU pragma: end_exports
+
+#include "../../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif  // EIGEN_CXX11_TENSORSYMMETRY_MODULE_H
diff --git a/inst/include/unsupported/Eigen/CXX11/ThreadPool b/inst/include/unsupported/Eigen/CXX11/ThreadPool
new file mode 100644
index 00000000..d4873337
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/ThreadPool
@@ -0,0 +1 @@
+#include "../../../Eigen/ThreadPool"  // IWYU pragma: export
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/InternalHeaderCheck.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/InternalHeaderCheck.h
new file mode 100644
index 00000000..9e4c1ed9
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_CXX11_TENSOR_MODULE_H
+#error "Please include unsupported/Eigen/CXX11/Tensor instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/README.md b/inst/include/unsupported/Eigen/CXX11/src/Tensor/README.md
new file mode 100644
index 00000000..45a94542
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -0,0 +1,2352 @@
+# Eigen Tensors {#eigen_tensors}
+
+Tensors are multidimensional arrays of elements. Elements are typically scalars,
+but more complex types such as strings are also supported.
+
+## Tensor Classes
+
+You can manipulate a tensor with one of the following classes.  They all are in
+the namespace `::Eigen.`
+
+### Class Tensor<data_type, rank>
+
+This is the class to use to create a tensor and allocate memory for it.  The
+class is templatized with the tensor datatype, such as float or int, and the
+tensor rank.  The rank is the number of dimensions, for example rank 2 is a
+matrix.
+
+Tensors of this class are resizable.  For example, if you assign a tensor of a
+different size to a Tensor, that tensor is resized to match its new value.
+
+#### Constructor Tensor<data_type, rank>(size0, size1, ...)
+
+Constructor for a Tensor.  The constructor must be passed `rank` integers
+indicating the sizes of the instance along each of the the `rank`
+dimensions.
+
+```cpp
+// Create a tensor of rank 3 of sizes 2, 3, 4.  This tensor owns
+// memory to hold 24 floating point values (24 = 2 x 3 x 4).
+Tensor<float, 3> t_3d(2, 3, 4);
+
+// Resize t_3d by assigning a tensor of different sizes, but same rank.
+t_3d = Tensor<float, 3>(3, 4, 3);
+```
+
+#### Constructor Tensor<data_type, rank>(size_array)
+
+Constructor where the sizes for the constructor are specified as an array of
+values instead of an explicitly list of parameters.  The array type to use is
+`Eigen::array<Eigen::Index>`.  The array can be constructed automatically
+from an initializer list.
+
+```cpp
+// Create a tensor of strings of rank 2 with sizes 5, 7.
+Tensor<string, 2> t_2d({5, 7});
+```
+
+### Class TensorFixedSize<data_type, Sizes<size0, size1, ...>>
+
+Class to use for tensors of fixed size, where the size is known at compile
+time.  Fixed sized tensors can provide very fast computations because all their
+dimensions are known by the compiler.  FixedSize tensors are not resizable.
+
+If the total number of elements in a fixed size tensor is small enough the
+tensor data is held onto the stack and does not cause heap allocation and free.
+
+```cpp
+// Create a 4 x 3 tensor of floats.
+TensorFixedSize<float, Sizes<4, 3>> t_4x3;
+```
+
+### Class TensorMap<Tensor<data_type, rank>>
+
+This is the class to use to create a tensor on top of memory allocated and
+owned by another part of your code.  It allows to view any piece of allocated
+memory as a `Tensor`.  Instances of this class do not own the memory where the
+data are stored.
+
+A `TensorMap` is not resizable because it does not own the memory where its data
+are stored.
+
+#### Constructor TensorMap<Tensor<data_type, rank>>(data, size0, size1, ...)
+
+Constructor for a Tensor.  The constructor must be passed a pointer to the
+storage for the data, and "rank" size attributes.  The storage has to be
+large enough to hold all the data.
+
+```cpp
+// Map a tensor of ints on top of stack-allocated storage.
+int storage[128];  // 2 x 4 x 2 x 8 = 128
+TensorMap<Tensor<int, 4>> t_4d(storage, 2, 4, 2, 8);
+
+// The same storage can be viewed as a different tensor.
+// You can also pass the sizes as an array.
+TensorMap<Tensor<int, 2>> t_2d(storage, 16, 8);
+
+// You can also map fixed-size tensors.  Here we get a 1d view of
+// the 2d fixed-size tensor.
+TensorFixedSize<float, Sizes<4, 3>> t_4x3;
+TensorMap<Tensor<float, 1>> t_12(t_4x3.data(), 12);
+```
+
+#### Class TensorRef
+
+See **Assigning to a `TensorRef`**.
+
+## Accessing Tensor Elements
+
+#### data_type tensor(index0, index1...)
+
+Return the element at position `(index0, index1...)` in tensor
+`tensor`.  You must pass as many parameters as the rank of `tensor`.
+The expression can be used as an l-value to set the value of the element at the
+specified position.  The value returned is of the datatype of the tensor.
+
+```cpp
+// Set the value of the element at position (0, 1, 0);
+Tensor<float, 3> t_3d(2, 3, 4);
+t_3d(0, 1, 0) = 12.0f;
+
+// Initialize all elements to random values.
+for (int i = 0; i < 2; ++i) {
+  for (int j = 0; j < 3; ++j) {
+    for (int k = 0; k < 4; ++k) {
+      t_3d(i, j, k) = ...some random value...;
+    }
+  }
+}
+
+// Print elements of a tensor.
+for (int i = 0; i < 2; ++i) {
+  std::cout << t_3d(i, 0, 0);
+}
+```
+
+## TensorLayout
+
+The tensor library supports 2 layouts: `ColMajor` (the default) and
+`RowMajor`.
+
+The layout of a tensor is optionally specified as part of its type. If not
+specified explicitly column major is assumed.
+
+```cpp
+Tensor<float, 3, ColMajor> col_major;  // equivalent to Tensor<float, 3>
+TensorMap<Tensor<float, 3, RowMajor> > row_major(data, ...);
+```
+
+All the arguments to an expression must use the same layout. Attempting to mix
+different layouts will result in a compilation error.
+
+It is possible to change the layout of a tensor or an expression using the
+`swap_layout()` method.  Note that this will also reverse the order of the
+dimensions.
+
+```cpp
+Tensor<float, 2, ColMajor> col_major(2, 4);
+Tensor<float, 2, RowMajor> row_major(2, 4);
+
+Tensor<float, 2> col_major_result = col_major;  // ok, layouts match
+Tensor<float, 2> col_major_result = row_major;  // will not compile
+
+// Simple layout swap
+col_major_result = row_major.swap_layout();
+eigen_assert(col_major_result.dimension(0) == 4);
+eigen_assert(col_major_result.dimension(1) == 2);
+
+// Swap the layout and preserve the order of the dimensions
+array<int, 2> shuffle(1, 0);
+col_major_result = row_major.swap_layout().shuffle(shuffle);
+eigen_assert(col_major_result.dimension(0) == 2);
+eigen_assert(col_major_result.dimension(1) == 4);
+```
+
+## Tensor Operations
+
+The Eigen Tensor library provides a vast library of operations on Tensors:
+numerical operations such as addition and multiplication, geometry operations
+such as slicing and shuffling, etc.  These operations are available as methods
+of the `Tensor` classes, and in some cases as operator overloads.  For example
+the following code computes the elementwise addition of two tensors:
+
+```cpp
+Tensor<float, 3> t1(2, 3, 4);
+t2.setRandom();
+Tensor<float, 3> t2(2, 3, 4);
+t2.setRandom();
+// Set t3 to the element wise sum of t1 and t2
+Tensor<float, 3> t3 = t1 + t2;
+```
+
+While the code above looks easy enough, it is important to understand that the
+expression `t1 + t2` is not actually adding the values of the tensors.  The
+expression instead constructs a "tensor operator" object of the class
+`TensorCwiseBinaryOp<scalar_sum>`, which has references to the tensors
+`t1` and `t2`.  This is a small C++ object that knows how to add
+`t1` and `t2`.  It is only when the value of the expression is assigned
+to the tensor `t3` that the addition is actually performed.  Technically,
+this happens through the overloading of `operator=` in the Tensor class.
+
+This mechanism for computing tensor expressions allows for lazy evaluation and
+optimizations which are what make the tensor library very fast.
+
+Of course, the tensor operators do nest, and the expression `t1 + t2 * 0.3f`
+is actually represented with the (approximate) tree of operators:
+
+```cpp
+TensorCwiseBinaryOp<scalar_sum>(t1, TensorCwiseUnaryOp<scalar_mul>(t2, 0.3f))
+```
+
+### Tensor Operations and C++ "auto"
+
+Because `Tensor` operations create tensor operators, the C++ `auto` keyword
+does not have its intuitive meaning.  Consider these 2 lines of code:
+
+```cpp
+Tensor<float, 3> t3 = t1 + t2;
+auto t4 = t1 + t2;
+```
+
+In the first line we allocate the tensor `t3` and it will contain the
+result of the addition of `t1` and `t2`.  In the second line, `t4`
+is actually the tree of tensor operators that will compute the addition of
+`t1` and `t2`.  In fact, `t4` is *not* a tensor and you cannot get
+the values of its elements:
+
+```cpp
+Tensor<float, 3> t3 = t1 + t2;
+std::cout << t3(0, 0, 0);  // OK prints the value of t1(0, 0, 0) + t2(0, 0, 0)
+
+auto t4 = t1 + t2;
+std::cout << t4(0, 0, 0);  // Compilation error!
+```
+
+When you use `auto` you do not get a `Tensor` as a result but instead a
+non-evaluated expression.
+So only use `auto` to delay evaluation.
+
+Unfortunately, there is no single underlying concrete type for holding
+non-evaluated expressions, hence you have to use `auto` in the case when you do
+want to hold non-evaluated expressions.
+
+When you need the results of set of tensor computations you have to assign the
+result to a `Tensor` that will be capable of holding onto them.  This can be
+either a normal `Tensor`, a `TensorFixedSize`, or a `TensorMap` on an existing
+piece of memory.  All the following will work:
+
+```cpp
+auto t4 = t1 + t2;
+
+Tensor<float, 3> result = t4;  // Could also be: result(t4);
+std::cout << result(0, 0, 0);
+
+TensorMap<float, 4> result(<a float* with enough space>, <size0>, ...) = t4;
+std::cout << result(0, 0, 0);
+
+TensorFixedSize<float, Sizes<size0, ...>> result = t4;
+std::cout << result(0, 0, 0);
+```
+
+Until you need the results, you can keep the operation around, and even reuse
+it for additional operations.  As long as you keep the expression as an
+operation, no computation is performed.
+
+```cpp
+// One way to compute exp((t1 + t2) * 0.2f);
+auto t3 = t1 + t2;
+auto t4 = t3 * 0.2f;
+auto t5 = t4.exp();
+Tensor<float, 3> result = t5;
+
+// Another way, exactly as efficient as the previous one:
+Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp();
+```
+
+### Controlling When Expression are Evaluated
+
+There are several ways to control when expressions are evaluated:
+
+*   Assignment to a `Tensor`, `TensorFixedSize`, or `TensorMap`.
+*   Use of the `eval()` method.
+*   Assignment to a `TensorRef`.
+
+#### Assigning to a Tensor, TensorFixedSize, or TensorMap.
+
+The most common way to evaluate an expression is to assign it to a `Tensor`.
+In the example below, the `auto` declarations make the intermediate values
+"Operations", not Tensors, and do not cause the expressions to be evaluated.
+The assignment to the Tensor `result` causes the evaluation of all the
+operations.
+
+```cpp
+auto t3 = t1 + t2;             // t3 is an Operation.
+auto t4 = t3 * 0.2f;           // t4 is an Operation.
+auto t5 = t4.exp();            // t5 is an Operation.
+Tensor<float, 3> result = t5;  // The operations are evaluated.
+```
+
+If you know the ranks and sizes of the Operation value you can assign the
+Operation to a `TensorFixedSize` instead of a `Tensor`, which is a bit more efficient.
+
+```cpp
+// We know that the result is a 4x4x2 tensor!
+TensorFixedSize<float, Sizes<4, 4, 2>> result = t5;
+```
+
+Similarly, assigning an expression to a `TensorMap` causes its evaluation.
+Like tensors of type `TensorFixedSize`, a `TensorMap` cannot be resized so they have to
+have the rank and sizes of the expression that are assigned to them.
+
+#### Calling eval().
+
+When you compute large composite expressions, you sometimes want to tell Eigen
+that an intermediate value in the expression tree is worth evaluating ahead of
+time.
+This is done by inserting a call to the `eval()` method of the
+expression Operation.
+
+```cpp
+// The previous example could have been written:
+Tensor<float, 3> result = ((t1 + t2) * 0.2f).exp();
+
+// If you want to compute (t1 + t2) once ahead of time you can write:
+Tensor<float, 3> result = ((t1 + t2).eval() * 0.2f).exp();
+```
+
+Semantically, calling `eval()` is equivalent to materializing the value of
+the expression in a temporary `Tensor` of the right size.
+The code above in effect does:
+
+```cpp
+// .eval() knows the size!
+TensorFixedSize<float, Sizes<4, 4, 2>> tmp = t1 + t2;
+Tensor<float, 3> result = (tmp * 0.2f).exp();
+```
+
+Note that the return value of `eval()` is itself an Operation, so the
+following code does not do what you may think:
+
+```cpp
+// Here t3 is an evaluation Operation.  t3 has not been evaluated yet.
+auto t3 = (t1 + t2).eval();
+
+// You can use t3 in another expression.  Still no evaluation.
+auto t4 = (t3 * 0.2f).exp();
+
+// The value is evaluated when you assign the Operation to a Tensor, using
+// an intermediate tensor to represent t3.x
+Tensor<float, 3> result = t4;
+```
+
+While in the examples above calling `eval()` does not make a difference in
+performance, in other cases it can make a huge difference.  In the expression
+below the `broadcast()` expression causes the `X.maximum()` expression
+to be evaluated many times:
+
+```cpp
+Tensor<...> X ...;
+Tensor<...> Y = ((X - X.maximum(depth_dim).reshape(dims2d).broadcast(bcast))
+                 * beta).exp();
+```
+
+Inserting a call to `eval()` between the `maximum()` and
+`reshape()` calls guarantees that `maximum()` is only computed once and
+greatly speeds-up execution:
+
+```cpp
+Tensor<...> Y =
+  ((X - X.maximum(depth_dim).eval().reshape(dims2d).broadcast(bcast))
+    * beta).exp();
+```
+
+In the other example below, the tensor `Y` is both used in the expression and its assignment.
+This is an aliasing problem and if the evaluation is not done in the right order
+Y will be updated incrementally during the evaluation
+resulting in bogus results:
+
+```cpp
+ Tensor<...> Y ...;
+ Y = Y / (Y.sum(depth_dim).reshape(dims2d).broadcast(bcast));
+```
+
+Inserting a call to `eval()` between the `sum()` and `reshape()`
+expressions ensures that the sum is computed before any updates to `Y` are
+done.
+
+```cpp
+ Y = Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast));
+```
+
+Note that an eval around the full right hand side expression is not needed
+because the generated has to compute the `i`-th value of the right hand side
+before assigning it to the left hand side.
+
+However, if you were assigning the expression value to a shuffle of `Y`
+then you would need to force an eval for correctness by adding an `eval()`
+call for the right hand side:
+
+```cpp
+ Y.shuffle(...) =
+    (Y / (Y.sum(depth_dim).eval().reshape(dims2d).broadcast(bcast))).eval();
+```
+
+#### Assigning to a TensorRef.
+
+If you need to access only a few elements from the value of an expression you
+can avoid materializing the value in a full tensor by using a `TensorRef`.
+
+A `TensorRef` is a small wrapper class for any Eigen Operation.  It provides
+overloads for the `()` operator that let you access individual values in
+the expression.
+`TensorRef` is convenient, because the Operation themselves do
+not provide a way to access individual elements.
+
+```cpp
+// Create a TensorRef for the expression.  The expression is not
+// evaluated yet.
+TensorRef<Tensor<float, 3> > ref = ((t1 + t2) * 0.2f).exp();
+
+// Use "ref" to access individual elements.  The expression is evaluated
+// on the fly.
+float at_0 = ref(0, 0, 0);
+std::cout << ref(0, 1, 0);
+```
+
+Only use `TensorRef` when you need a subset of the values of the expression.
+`TensorRef` only computes the values you access.
+However note that if you are going to access all the values it will be much
+ faster to materialize the results in a `Tensor` first.
+
+In some cases, if the full `Tensor` result would be very large, you may save
+memory by accessing it as a `TensorRef`.
+But not always.
+So don't count on it.
+
+
+### Controlling How Expressions Are Evaluated
+
+The tensor library provides several implementations of the various operations
+such as contractions and convolutions.  The implementations are optimized for
+different environments: single threaded on CPU, multi threaded on CPU, or on a GPU using cuda.
+
+You can choose which implementation to use with the `device()` call.  If
+you do not choose an implementation explicitly the default implementation that
+uses a single thread on the CPU is used.
+
+The default implementation has been optimized for recent Intel CPUs, taking
+advantage of SSE, AVX, and FMA instructions.  Work is ongoing to tune the
+library on ARM CPUs.  Note that you need to pass compiler-dependent flags
+to enable the use of SSE, AVX, and other instructions.
+
+For example, the following code adds two tensors using the default
+single-threaded CPU implementation:
+
+```cpp
+Tensor<float, 2> a(30, 40);
+Tensor<float, 2> b(30, 40);
+Tensor<float, 2> c = a + b;
+```
+
+To choose a different implementation you have to insert a `device()` call
+before the assignment of the result.  For technical C++ reasons this requires
+that the `Tensor` for the result be declared on its own.
+This means that you have to know the size of the result.
+
+```cpp
+Eigen::Tensor<float, 2> c(30, 40);
+c.device(...) = a + b;
+```
+
+The call to `device()` must be the last call on the left of the operator=.
+
+You must pass to the `device()` call an Eigen device object.  There are
+presently three devices you can use: `DefaultDevice`, `ThreadPoolDevice` and
+`GpuDevice`.
+
+
+#### Evaluating With the DefaultDevice
+
+This is exactly the same as not inserting a `device()` call.
+
+```cpp
+DefaultDevice my_device;
+c.device(my_device) = a + b;
+```
+
+#### Evaluating with a Thread Pool
+
+```cpp
+// Create the Eigen ThreadPool
+Eigen::ThreadPool pool(8 /* number of threads in pool */)
+
+// Create the Eigen ThreadPoolDevice.
+Eigen::ThreadPoolDevice my_device(&pool, 4 /* number of threads to use */);
+
+// Now just use the device when evaluating expressions.
+Eigen::Tensor<float, 2> c(30, 50);
+c.device(my_device) = a.contract(b, dot_product_dims);
+```
+
+
+#### Evaluating On GPU
+
+This is presently a bit more complicated than just using a thread pool device.
+You need to create a GPU device but you also need to explicitly allocate the
+memory for tensors with cuda.
+
+
+## API Reference
+
+### Datatypes
+
+In the documentation of the tensor methods and Operation we mention datatypes
+that are tensor-type specific:
+
+#### <Tensor-Type>::Dimensions
+
+Acts like an array of `int`. Has an `int size` attribute, and can be
+indexed like an array to access individual values.  Used to represent the
+dimensions of a tensor.  See `dimensions()`.
+
+#### <Tensor-Type>::Index
+
+Acts like an `int`.  Used for indexing tensors along their dimensions.  See
+`operator()`, `dimension()`, and `size()`.
+
+#### <Tensor-Type>::Scalar
+
+Represents the datatype of individual tensor elements.  For example, for a
+`Tensor<float>`, `Scalar` is the type `float`.  See `setConstant()`.
+
+#### (Operation)
+
+We use this pseudo type to indicate that a tensor Operation is returned by a
+method.  We indicate in the text the type and dimensions of the tensor that the
+Operation returns after evaluation.
+
+The Operation will have to be evaluated, for example by assigning it to a
+`Tensor`, before you can access the values of the resulting tensor.  You can also
+access the values through a `TensorRef`.
+
+
+## Built-in Tensor Methods
+
+These are usual C++ methods that act on tensors immediately.  They are not
+Operations which provide delayed evaluation of their results.  Unless specified
+otherwise, all the methods listed below are available on all tensor classes:
+`Tensor`, `TensorFixedSize`, and `TensorMap`.
+
+## Metadata
+
+### int NumDimensions
+
+Constant value indicating the number of dimensions of a `Tensor`.
+This is also known as the tensor rank.
+
+```cpp
+Eigen::Tensor<float, 2> a(3, 4);
+std::cout << "Dims " << a.NumDimensions;
+// Dims 2
+```
+
+### Dimensions dimensions()
+
+Returns an array-like object representing the dimensions of the tensor.
+The actual type of the `dimensions()` result is `<Tensor-Type>::Dimensions`.
+
+```cpp
+Eigen::Tensor<float, 2> a(3, 4);
+const Eigen::Tensor<float, 2>::Dimensions& d = a.dimensions();
+std::cout << "Dim size: " << d.size << ", dim 0: " << d[0]
+          << ", dim 1: " << d[1];
+//  Dim size: 2, dim 0: 3, dim 1: 4
+```
+
+If you use a C++11 compiler, you can use `auto` to simplify the code:
+
+```cpp
+const auto& d = a.dimensions();
+std::cout << "Dim size: " << d.size << ", dim 0: " << d[0]
+        << ", dim 1: " << d[1];
+// Dim size: 2, dim 0: 3, dim 1: 4
+```
+
+### Index dimension(Index n)
+
+Returns the n-th dimension of the tensor.  The actual type of the
+`dimension()` result is `<Tensor-Type>::Index`, but you can
+always use it like an int.
+
+```cpp
+Eigen::Tensor<float, 2> a(3, 4);
+int dim1 = a.dimension(1);
+std::cout << "Dim 1: " << dim1;
+// Dim 1: 4
+```
+
+### Index size()
+
+Returns the total number of elements in the tensor.  This is the product of all
+the tensor dimensions.  The actual type of the `size()` result is
+`<Tensor-Type>::Index`, but you can always use it like an int.
+
+```cpp
+Eigen::Tensor<float, 2> a(3, 4);
+std::cout << "Size: " << a.size();
+/// Size: 12
+```
+
+### Getting Dimensions From An Operation
+
+A few operations provide `dimensions()` directly,
+e.g. `TensorReslicingOp`.  Most operations defer calculating dimensions
+until the operation is being evaluated.  If you need access to the dimensions
+of a deferred operation, you can wrap it in a `TensorRef` (see
+**Assigning to a TensorRef** above), which provides
+`dimensions()` and `dimension()` as above.
+
+`TensorRef` can also wrap the plain `Tensor` types, so this is a useful idiom in
+templated contexts where the underlying object could be either a raw `Tensor`
+or some deferred operation (e.g. a slice of a `Tensor`).  In this case, the
+template code can wrap the object in a TensorRef and reason about its
+dimensionality while remaining agnostic to the underlying type.
+
+
+## Constructors
+
+### Tensor
+
+Creates a tensor of the specified size. The number of arguments must be equal
+to the rank of the tensor. The content of the tensor is not initialized.
+
+```cpp
+Eigen::Tensor<float, 2> a(3, 4);
+std::cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
+// NumRows: 3 NumCols: 4
+```
+### TensorFixedSize
+
+Creates a tensor of the specified size. The number of arguments in the `Sizes<>`
+template parameter determines the rank of the tensor. The content of the tensor
+is not initialized.
+
+```cpp
+Eigen::TensorFixedSize<float, Sizes<3, 4>> a;
+std::cout << "Rank: " << a.rank() << endl;
+// Rank: 2
+std::cout << "NumRows: " << a.dimension(0)
+          << " NumCols: " << a.dimension(1) << endl;
+// NumRows: 3 NumCols: 4
+```
+
+### TensorMap
+
+Creates a tensor mapping an existing array of data. The data must not be freed
+until the `TensorMap` is discarded, and the size of the data must be large enough
+to accommodate the coefficients of the tensor.
+
+```cpp
+float data[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+Eigen::TensorMap<Tensor<float, 2>> a(data, 3, 4);
+std::cout << "NumRows: " << a.dimension(0) << " NumCols: " << a.dimension(1) << endl;
+// NumRows: 3 NumCols: 4
+std::cout << "a(1, 2): " << a(1, 2) << endl;
+// a(1, 2): 7
+```
+
+## Contents Initialization
+
+When a new `Tensor` or a new `TensorFixedSize` are created, memory is allocated to
+hold all the tensor elements, but the memory is not initialized.  Similarly,
+when a new `TensorMap` is created on top of non-initialized memory the memory its
+contents are not initialized.
+
+You can use one of the methods below to initialize the tensor memory.  These
+have an immediate effect on the tensor and return the tensor itself as a
+result.  These are not tensor Operations which delay evaluation.
+
+### <Tensor-Type> setConstant(const Scalar& val)
+
+Sets all elements of the tensor to the constant value `val`.  `Scalar`
+is the type of data stored in the tensor.  You can pass any value that is
+convertible to that type.
+
+Returns the tensor itself in case you want to chain another call.
+
+```cpp
+a.setConstant(12.3f);
+std::cout << "Constant: " << endl << a << endl << endl;
+
+// Constant:
+// 12.3 12.3 12.3 12.3
+// 12.3 12.3 12.3 12.3
+// 12.3 12.3 12.3 12.3
+```
+Note that `setConstant()` can be used on any tensor where the element type
+has a copy constructor and an `operator=()`:
+
+```cpp
+Eigen::Tensor<string, 2> a(2, 3);
+a.setConstant("yolo");
+std::cout << "String tensor: " << endl << a << endl << endl;
+
+// String tensor:
+// yolo yolo yolo
+// yolo yolo yolo
+```
+
+### <Tensor-Type> setZero()
+
+Fills the tensor with zeros.  Equivalent to `setConstant(Scalar(0))`.
+Returns the tensor itself in case you want to chain another call.
+
+```cpp
+a.setZero();
+std::cout << "Zeros: " << endl << a << endl << endl;
+
+// Zeros:
+// 0 0 0 0
+// 0 0 0 0
+// 0 0 0 0
+```
+
+### <Tensor-Type> setValues({..initializer_list})
+
+Fills the tensor with explicit values specified in a std::initializer_list.
+The type of the initializer list depends on the type and rank of the tensor.
+
+If the tensor has rank N, the initializer list must be nested N times.  The
+most deeply nested lists must contains P scalars of the `Tensor` type where P is
+the size of the last dimension of the Tensor.
+
+For example, for a `TensorFixedSize<float, 2, 3>` the initializer list must
+contains 2 lists of 3 floats each.
+
+`setValues()` returns the tensor itself in case you want to chain another
+call.
+
+```cpp
+Eigen::Tensor<float, 2> a(2, 3);
+a.setValues({{0.0f, 1.0f, 2.0f}, {3.0f, 4.0f, 5.0f}});
+std::cout << "a" << endl << a << endl << endl;
+
+// a
+// 0 1 2
+// 3 4 5
+```
+
+If a list is too short, the corresponding elements of the tensor will not be
+changed.  This is valid at each level of nesting.  For example the following
+code only sets the values of the first row of the tensor.
+
+```cpp
+Eigen::Tensor<int, 2> a(2, 3);
+a.setConstant(1000);
+a.setValues({{10, 20, 30}});
+std::cout << "a" << endl << a << endl << endl;
+// a
+// 10   20   30
+// 1000 1000 1000
+```
+
+### <Tensor-Type> setRandom()
+
+Fills the tensor with random values.  Returns the tensor itself in case you
+want to chain another call.
+
+```cpp
+a.setRandom();
+std::cout << "Random: " << endl << a << endl << endl;
+// Random:
+//   0.680375    0.59688  -0.329554    0.10794
+//  -0.211234   0.823295   0.536459 -0.0452059
+//   0.566198  -0.604897  -0.444451   0.257742
+```
+
+You can customize `setRandom()` by providing your own random number
+generator as a template argument:
+
+```cpp
+a.setRandom<MyRandomGenerator>();
+```
+
+Here, `MyRandomGenerator` must be a struct with the following member
+functions, where Scalar and Index are the same as `<Tensor-Type>::Scalar`
+and `<Tensor-Type>::Index`.
+
+See `struct UniformRandomGenerator` in TensorFunctors.h for an example.
+
+```cpp
+// Custom number generator for use with setRandom().
+struct MyRandomGenerator {
+  // Default and copy constructors. Both are needed
+  MyRandomGenerator() { }
+  MyRandomGenerator(const MyRandomGenerator& ) { }
+
+  // Return a random value to be used.  "element_location" is the
+  // location of the entry to set in the tensor, it can typically
+  // be ignored.
+  Scalar operator()(Eigen::DenseIndex element_location,
+                    Eigen::DenseIndex /*unused*/ = 0) const {
+    return <randomly generated value of type T>;
+  }
+
+  // Same as above but generates several numbers at a time.
+  typename internal::packet_traits<Scalar>::type packetOp(
+      Eigen::DenseIndex packet_location, Eigen::DenseIndex /*unused*/ = 0) const {
+    return <a packet of randomly generated values>;
+  }
+};
+```
+
+You can also use one of the 2 random number generators that are part of the
+tensor library:
+*   UniformRandomGenerator
+*   NormalRandomGenerator
+
+## Data Access
+
+The Tensor, TensorFixedSize, and TensorRef classes provide the following
+accessors to access the tensor coefficients:
+
+```cpp
+const Scalar& operator()(const array<Index, NumIndices>& indices)
+const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+Scalar& operator()(const array<Index, NumIndices>& indices)
+Scalar& operator()(Index firstIndex, IndexTypes... otherIndices)
+```
+
+The number of indices must be equal to the rank of the tensor. Moreover, these
+accessors are not available on tensor expressions. In order to access the
+values of a tensor expression, the expression must either be evaluated or
+wrapped in a TensorRef.
+
+### Scalar* data() and const Scalar* data() const
+
+Returns a pointer to the storage for the tensor.  The pointer is const if the
+tensor was const.  This allows direct access to the data.  The layout of the
+data depends on the tensor layout: `RowMajor` or `ColMajor`.
+
+This access is usually only needed for special cases, for example when mixing
+Eigen Tensor code with other libraries.
+
+Scalar is the type of data stored in the tensor.
+
+```cpp
+Eigen::Tensor<float, 2> a(3, 4);
+float* a_data = a.data();
+a_data[0] = 123.45f;
+std::cout << "a(0, 0): " << a(0, 0);
+// a(0, 0): 123.45
+```
+
+## Tensor Operations
+
+All the methods documented below return non evaluated tensor `Operations`.
+These can be chained: you can apply another `Tensor` Operation to the value
+returned by the method.
+
+The chain of Operation is evaluated lazily, typically when it is assigned to a
+tensor.  See **Controlling When Expression are Evaluated** for more details about
+their evaluation.
+
+### (Operation) constant(const Scalar& val)
+
+Returns a tensor of the same type and dimensions as the original tensor but
+where all elements have the value `val`.
+
+This is useful, for example, when you want to add or subtract a constant from a
+tensor, or multiply every element of a tensor by a scalar.
+However, such operations can also be performed using operator overloads (see `operator+`).
+
+
+```cpp
+Eigen::Tensor<float, 2> a(2, 3);
+a.setConstant(1.0f);
+Eigen::Tensor<float, 2> b = a + a.constant(2.0f);
+Eigen::Tensor<float, 2> c = b * b.constant(0.2f);
+std::cout << "a" << endl << a << endl << endl;
+std::cout << "b" << endl << b << endl << endl;
+std::cout << "c" << endl << c << endl << endl;
+// a
+// 1 1 1
+// 1 1 1
+
+// b
+// 3 3 3
+// 3 3 3
+
+// c
+// 0.6 0.6 0.6
+// 0.6 0.6 0.6
+```
+
+### (Operation) random()
+
+Returns a tensor of the same type and dimensions as the current tensor
+but where all elements have random values.
+
+This is for example useful to add random values to an existing tensor.
+The generation of random values can be customized in the same manner
+as for `setRandom()`.
+
+```cpp
+Eigen::Tensor<float, 2> a(2, 3);
+a.setConstant(1.0f);
+Eigen::Tensor<float, 2> b = a + a.random();
+std::cout << "a\n" << a << "\n\n";
+std::cout << "b\n" << b << "\n\n";
+
+// a
+// 1 1 1
+// 1 1 1
+// b
+// 1.68038   1.5662  1.82329
+// 0.788766  1.59688
+```
+
+## Unary Element Wise Operations
+
+All these operations take a single input tensor as argument and return a tensor
+of the same type and dimensions as the tensor to which they are applied.  The
+requested operations are applied to each element independently.
+
+### (Operation) operator-()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the opposite values of the original tensor.
+
+```cpp
+Eigen::Tensor<float, 2> a(2, 3);
+a.setConstant(1.0f);
+Eigen::Tensor<float, 2> b = -a;
+std::cout << "a\n" << a << "\n\n";
+std::cout << "b\n" << b << "\n\n";
+
+// a
+// 1 1 1
+// 1 1 1
+//
+// b
+// -1 -1 -1
+// -1 -1 -1
+```
+
+### (Operation) sqrt()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the square roots of the original tensor.
+
+### (Operation) rsqrt()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the inverse square roots of the original tensor.
+
+### (Operation) square()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the squares of the original tensor values.
+
+### (Operation) inverse()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the inverse of the original tensor values.
+
+### (Operation) exp()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the exponential of the original tensor.
+
+### (Operation) log()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the natural logarithms of the original tensor.
+
+### (Operation) abs()
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the absolute values of the original tensor.
+
+### (Operation) arg()
+
+Returns a tensor with the same dimensions as the original tensor
+containing the complex argument (phase angle) of the values of the
+original tensor.
+
+### (Operation) real()
+
+Returns a tensor with the same dimensions as the original tensor
+containing the real part of the complex values of the original tensor.
+The result has a real-valued scalar type.
+
+### (Operation) imag()
+
+Returns a tensor with the same dimensions as the original tensor
+containing the imaginary part of the complex values of the original
+tensor.
+The result has a real-valued scalar type.
+
+### (Operation) pow(Scalar exponent)
+
+Returns a tensor of the same type and dimensions as the original tensor
+containing the coefficients of the original tensor to the power of the
+exponent.
+
+The type of the exponent, Scalar, is always the same as the type of the
+tensor coefficients.  For example, only integer exponents can be used in
+conjunction with tensors of integer values.
+
+You can use `cast()` to lift this restriction.  For example this computes
+cubic roots of an int Tensor:
+
+```cpp
+Eigen::Tensor<int, 2> a(2, 3);
+a.setValues({{0, 1, 8}, {27, 64, 125}});
+Eigen::Tensor<double, 2> b = a.cast<double>().pow(1.0 / 3.0);
+std::cout << "a" << endl << a << endl << endl;
+std::cout << "b" << endl << b << endl << endl;
+
+// a
+// 0   1   8
+// 27  64 125
+//
+// b
+// 0 1 2
+// 3 4 5
+```
+
+### (Operation)  operator* (Scalar s)
+
+Multiplies every element of the input tensor by the scalar `s`:
+```cpp
+Eigen::Tensor<int, 2> a(2, 3);
+a.setValues({{1, 2, 3},
+                {4, 5, 6}});
+Eigen::Tensor<int,2> scaled_a = a * 2;
+
+std::cout << "a\n" << a << "\n";
+std::cout << "scaled_a\n" << scaled_a << "\n";
+
+// a
+// 1 2 3
+// 4 5 6
+//
+// scaled_a
+// 2  4  6
+// 8 10 12
+```
+### (Operation) operator+ (Scalar s)
+Adds `s` to every element in the tensor.
+
+### (Operation) operator- (Scalar s)
+Subtracts `s` from every element in the tensor.
+
+### (Operation) operator/ (Scalar s)
+Divides every element in the tensor by `s`.
+
+### (Operation) operator% (Scalar s)
+Computes the element-wise modulus (remainder) of each tensor element divided by `s`
+
+**Only integer types are supported.**
+For floating-point tensors, implement a `unaryExpr` using `std::fmod`.
+
+### (Operation)  cwiseMax(Scalar threshold)
+Returns the coefficient-wise maximum between two tensors.
+```cpp
+Eigen::Tensor<int, 2> a(2, 3);
+a.setValues({{0, 100, 200}, {300, 400, 500}});
+
+Eigen::Tensor<int, 2> b(2, 3);
+b.setValues({{-1, -2, 300}, {-4, 555, -6}});
+
+Eigen::Tensor<int, 2> c = a.cwiseMax(b);
+
+std::cout << "a\n" << a << "\n"
+            << "b\n" << b << "\n"
+            << "c\n" << c << "\n";
+
+// a
+//   0 100 200
+// 300 400 500
+
+// b
+// -1  -2 300
+// -4 555  -6
+
+// c
+//   0 100 300
+// 300 555 500
+```
+### (Operation)  cwiseMin(Scalar threshold)
+Returns the coefficient-wise minimum between two tensors.
+
+```cpp
+Eigen::Tensor<int, 2> a(2, 2);
+a.setValues({{0, 100}, {300, -900}});
+
+Eigen::Tensor<int, 2> b(2, 2);
+b.setValues({{-1, -2}, {400, 555}});
+
+Eigen::Tensor<int, 2> c = a.cwiseMin(b);
+
+std::cout << "a\n" << a << "\n"
+          << "b\n" << b << "\n"
+          << "c\n" << c << "\n";
+
+// a
+//   0  100
+// 300 -900
+
+// b
+//  -1  -2
+// 400 555
+
+// c
+//  -1   -2
+// 300 -900
+```
+
+### (Operation)  unaryExpr(const CustomUnaryOp& func)
+Applies a user defined function to each element in the tensor.
+Supports lambdas or functor structs with an operator().
+
+Using lambda:
+```cpp
+Eigen::Tensor<float, 2> a(2, 3);
+a.setValues({{0, -.5, -1}, {.5, 1.5, 2.0}});
+auto my_func = [](float el){ return std::abs(el + 0.5f);};
+Eigen::Tensor<float, 2> b = a.unaryExpr(my_func);
+std::cout << "a\n" << a << "\n"
+        << "b\n" << b << "\n";
+=>
+a
+    0  -0.5   -1
+0.5   1.5    2
+b
+0.5     0  0.5
+    1     2  2.5
+```
+
+Using a functor to normalize and clamp values to `[-1.0, 1.0]`:
+
+```cpp
+template<typename Scalar>
+struct NormalizedClamp {
+NormalizedClamp(Scalar lo, Scalar hi) : _lo(lo), _hi(hi) {}
+Scalar operator()(Scalar x) const {
+    if (x < _lo) return Scalar(0);
+    if (x > _hi) return Scalar(1);
+    return (x - _lo) / (_hi - _lo);
+}
+Scalar _lo, _hi;
+};
+
+Eigen::Tensor<float, 2> c = a.unaryExpr(NormalizedClamp<float>(-1.0f, 1.0f));
+std::cout << "c\n" << c << "\n";
+
+// c
+// 0.5    0.25    0
+// 0.75   1       1
+```
+
+
+## Binary Element Wise Operations
+
+These operations take two input tensors as arguments. The 2 input tensors should
+be of the same type and dimensions. The result is a tensor of the same
+dimensions as the tensors to which they are applied, and unless otherwise
+specified it is also of the same type. The requested operations are applied to
+each pair of elements independently.
+
+### (Operation) operator+(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise sums of the inputs.
+
+### (Operation) operator-(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise differences of the inputs.
+
+### (Operation) operator*(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise products of the inputs.
+
+### (Operation) operator/(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise quotients of the inputs.
+
+This operator is not supported for integer types.
+
+### (Operation) cwiseMax(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise maximums of the inputs.
+
+### (Operation) cwiseMin(const OtherDerived& other)
+
+Returns a tensor of the same type and dimensions as the input tensors
+containing the coefficient wise mimimums of the inputs.
+
+### (Operation) Logical operators
+
+The following boolean operators are supported:
+
+ * `operator&&(const OtherDerived& other)`
+ * `operator||(const OtherDerived& other)`
+ * `operator<(const OtherDerived& other)`
+ * `operator<=(const OtherDerived& other)`
+ * `operator>(const OtherDerived& other)`
+ * `operator>=(const OtherDerived& other)`
+ * `operator==(const OtherDerived& other)`
+ * `operator!=(const OtherDerived& other)`
+
+ as well as bitwise operators:
+
+ * `operator&(const OtherDerived& other)`
+ * `operator|(const OtherDerived& other)`
+ * `operator^(const OtherDerived& other)`
+
+The resulting tensor retains the input scalar type.
+
+## Selection (select(const ThenDerived& thenTensor, const ElseDerived& elseTensor)
+
+Selection is a coefficient-wise ternary operator that is the tensor equivalent
+to the if-then-else operation.
+
+```cpp
+    Tensor<bool, 3> if = ...;
+    Tensor<float, 3> then = ...;
+    Tensor<float, 3> else = ...;
+    Tensor<float, 3> result = if.select(then, else);
+```
+
+The 3 arguments must be of the same dimensions, which will also be the dimension
+of the result.  The 'if' tensor must be of type boolean, the 'then' and the
+'else' tensor must be of the same type, which will also be the type of the
+result.
+
+Each coefficient in the result is equal to the corresponding coefficient in the
+'then' tensor if the corresponding value in the 'if' tensor is true. If not, the
+resulting coefficient will come from the 'else' tensor.
+
+
+## Contraction
+
+Tensor *contractions* are a generalization of the matrix product to the
+multidimensional case.
+
+```cpp
+// Create 2 matrices using tensors of rank 2
+Eigen::Tensor<int, 2> a(2, 3);
+a.setValues({{1, 2, 3}, {6, 5, 4}});
+Eigen::Tensor<int, 2> b(3, 2);
+b.setValues({{1, 2}, {4, 5}, {5, 6}});
+
+// Compute the traditional matrix product
+Eigen::array<Eigen::IndexPair<int>, 1> product_dims = { Eigen::IndexPair<int>(1, 0) };
+Eigen::Tensor<int, 2> AB = a.contract(b, product_dims);
+
+// Compute the product of the transpose of the matrices
+Eigen::array<Eigen::IndexPair<int>, 1> transposed_product_dims = { Eigen::IndexPair<int>(0, 1) };
+Eigen::Tensor<int, 2> AtBt = a.contract(b, transposed_product_dims);
+
+// Contraction to scalar value using a double contraction.
+// First coordinate of both tensors are contracted as well as both second coordinates, i.e., this computes the sum of the squares of the elements.
+Eigen::array<Eigen::IndexPair<int>, 2> double_contraction_product_dims = { Eigen::IndexPair<int>(0, 0), Eigen::IndexPair<int>(1, 1) };
+Eigen::Tensor<int, 0> AdoubleContractedA = a.contract(a, double_contraction_product_dims);
+
+// Extracting the scalar value of the tensor contraction for further usage
+int value = AdoubleContractedA(0);
+```
+
+## Reduction Operations
+
+A *Reduction* operation returns a tensor with fewer dimensions than the
+original tensor.  The values in the returned tensor are computed by applying a
+*reduction operator* to slices of values from the original tensor.  You specify
+the dimensions along which the slices are made.
+
+The Eigen Tensor library provides a set of predefined reduction operators such
+as `maximum()` and `sum()` and lets you define additional operators by
+implementing a few methods from a reductor template.
+
+### Reduction Dimensions
+
+All reduction operations take a single parameter of type
+`<TensorType>::``Dimensions` which can always be specified as an array of
+ints.  These are called the "reduction dimensions."  The values are the indices
+of the dimensions of the input tensor over which the reduction is done.  The
+parameter can have at most as many element as the rank of the input tensor;
+each element must be less than the tensor rank, as it indicates one of the
+dimensions to reduce.
+
+Each dimension of the input tensor should occur at most once in the reduction
+dimensions as the implementation does not remove duplicates.
+
+The order of the values in the reduction dimensions does not affect the
+results, but the code may execute faster if you list the dimensions in
+increasing order.
+
+Example: Reduction along one dimension.
+```cpp
+// Create a tensor of 2 dimensions
+Eigen::Tensor<int, 2> a(2, 3);
+a.setValues({{1, 2, 3}, {6, 5, 4}});
+// Reduce it along the second dimension (1)...
+Eigen::array<int, 1> dims({1 /* dimension to reduce */});
+// ...using the "maximum" operator.
+// The result is a tensor with one dimension.  The size of
+// that dimension is the same as the first (non-reduced) dimension of a.
+Eigen::Tensor<int, 1> b = a.maximum(dims);
+std::cout << "a" << endl << a << endl << endl;
+std::cout << "b" << endl << b << endl << endl;
+
+// a
+// 1 2 3
+// 6 5 4
+
+// b
+// 3
+// 6
+```
+Example: Reduction along two dimensions.
+```cpp
+Eigen::Tensor<float, 3, Eigen::ColMajor> a(2, 3, 4);
+a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
+                {7.0f, 6.0f, 5.0f, 4.0f},
+                {8.0f, 9.0f, 10.0f, 11.0f}},
+                {{12.0f, 13.0f, 14.0f, 15.0f},
+                {19.0f, 18.0f, 17.0f, 16.0f},
+                {20.0f, 21.0f, 22.0f, 23.0f}}});
+// The tensor a has 3 dimensions.  We reduce along the
+// first 2, resulting in a tensor with a single dimension
+// of size 4 (the last dimension of a.)
+// Note that we pass the array of reduction dimensions
+// directly to the maximum() call.
+Eigen::Tensor<float, 1, Eigen::ColMajor> b =
+    a.maximum(Eigen::array<int, 2>({0, 1}));
+std::cout << "b" << endl << b << endl << endl;
+
+// b
+// 20
+// 21
+// 22
+// 23
+```
+#### Reduction along all dimensions
+
+As a special case, if you pass no parameter to a reduction operation the
+original tensor is reduced along *all* its dimensions.  The result is a
+scalar, represented as a zero-dimension tensor.
+
+```cpp
+Eigen::Tensor<float, 3> a(2, 3, 4);
+a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
+              {7.0f, 6.0f, 5.0f, 4.0f},
+              {8.0f, 9.0f, 10.0f, 11.0f}},
+              {{12.0f, 13.0f, 14.0f, 15.0f},
+              {19.0f, 18.0f, 17.0f, 16.0f},
+              {20.0f, 21.0f, 22.0f, 23.0f}}});
+// Reduce along all dimensions using the sum() operator.
+Eigen::Tensor<float, 0> b = a.sum();
+std::cout << "b\n" << b;
+
+// b
+// 276
+```
+You can extract the scalar directly by casting the expression and extract the first and only coefficient:
+```cpp
+float sum = static_cast<Eigen::Tensor<float, 0>>(a.sum())();
+```
+
+### (Operation) sum(const Dimensions& reduction_dims)
+### (Operation) sum()
+
+Reduce a tensor using the `sum()` operator.  The resulting values
+are the sum of the reduced values.
+
+### (Operation) mean(const Dimensions& reduction_dims)
+### (Operation) mean()
+
+Reduce a tensor using the `mean()` operator.  The resulting values
+are the mean of the reduced values.
+
+### (Operation) maximum(const Dimensions& reduction_dims)
+### (Operation) maximum()
+
+Reduce a tensor using the `maximum()` operator.  The resulting values are the
+largest of the reduced values.
+
+### (Operation) minimum(const Dimensions& reduction_dims)
+### (Operation) minimum()
+
+Reduce a tensor using the `minimum()` operator.  The resulting values
+are the smallest of the reduced values.
+
+### (Operation) prod(const Dimensions& reduction_dims)
+### (Operation) prod()
+
+Reduce a tensor using the `prod()` operator.  The resulting values
+are the product of the reduced values.
+
+### (Operation) all(const Dimensions& reduction_dims)
+### (Operation) all()
+Reduce a tensor using the `all()` operator.  Casts tensor to bool and then checks
+whether all elements are true.  Runs through all elements rather than
+short-circuiting, so may be significantly inefficient.
+
+### (Operation) any(const Dimensions& reduction_dims)
+### (Operation) any()
+Reduce a tensor using the `any()` operator.  Casts tensor to bool and then checks
+whether any element is true.  Runs through all elements rather than
+short-circuiting, so may be significantly inefficient.
+
+
+### (Operation) argmax(const Dimensions& reduction_dim)
+### (Operation) argmax()
+
+Reduce a tensor using the `argmax()` operator.
+
+The resulting values are the indices of the largest elements along the specified dimension.
+
+Only a single `reduction_dim` is supported.
+
+If multiple elements share the maximum value, the one with the **lowest index** is returned.
+
+```cpp
+Eigen::Tensor<float, 2> a(2, 3);
+a.setValues({{1, 4, 8}, {3, 4, 2}});
+
+Eigen::Tensor<Eigen::Index, 1> argmax_dim0 = a.argmax(0);
+
+std::cout << "a:\n" << a << "\n";
+for (int i = 0; i < argmax_dim0.size(); ++i) {
+    std::cout << "argmax along dim 0 at index " << i << " = " << argmax_dim0(i) << "\n";
+}
+
+// a:
+// 1 4 8
+// 3 4 2
+// argmax along dim 0 at index 0 = 1
+// argmax along dim 0 at index 1 = 0
+// argmax along dim 0 at index 2 = 0
+```
+
+ To compute the index of the global maximum, use the overload without arguments (which flattens the tensor).
+
+
+```cpp
+Eigen::Tensor<Eigen::Index, 0> argmax_flat = a.argmax();
+std::cout << "Flat argmax index: " << argmax_flat();
+
+// Flat argmax index: 4
+```
+
+### (Operation) argmin(const Dimensions& reduction_dim)
+### (Operation) argmin()
+See `argmax`.
+
+### (Operation) reduce(const Dimensions& reduction_dims, const Reducer& reducer)
+
+Reduce a tensor using a user-defined reduction operator.  See `SumReducer`
+in TensorFunctors.h for information on how to implement a reduction operator.
+
+
+## Trace
+
+A *Trace* operation returns a tensor with fewer dimensions than the original
+tensor. It returns a tensor whose elements are the sum of the elements of the
+original tensor along the main diagonal for a list of specified dimensions, the
+"trace dimensions". Similar to the `Reduction Dimensions`, the trace dimensions
+are passed as an input parameter to the operation, are of type `<TensorType>::``Dimensions`
+, and have the same requirements when passed as an input parameter. In addition,
+the trace dimensions must have the same size.
+
+Example: Trace along 2 dimensions.
+
+```cpp
+// Create a tensor of 3 dimensions
+Eigen::Tensor<int, 3> a(2, 2, 3);
+a.setValues({{{1, 2, 3}, {4, 5, 6}}, {{7, 8, 9}, {10, 11, 12}}});
+// Specify the dimensions along which the trace will be computed.
+// In this example, the trace can only be computed along the dimensions
+// with indices 0 and 1
+Eigen::array<int, 2> dims({0, 1});
+// The output tensor contains all but the trace dimensions.
+Tensor<int, 1> a_trace = a.trace(dims);
+std::cout << "a_trace:" << endl;
+std::cout << a_trace << endl;
+
+// a_trace:
+// 11
+// 13
+// 15
+```
+
+### (Operation) trace(const Dimensions& new_dims)
+### (Operation) trace()
+
+As a special case, if no parameter is passed to the operation, trace is computed
+along *all* dimensions of the input tensor.
+
+Example: Trace along all dimensions.
+
+```cpp
+// Create a tensor of 3 dimensions, with all dimensions having the same size.
+Eigen::Tensor<int, 3> a(3, 3, 3);
+a.setValues({{{1, 2, 3}, {4, 5, 6}, {7, 8, 9}},
+            {{10, 11, 12}, {13, 14, 15}, {16, 17, 18}},
+            {{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}});
+// Result is a zero dimension tensor
+Tensor<int, 0> a_trace = a.trace();
+std::cout<<"a_trace:"<<endl;
+std::cout<<a_trace<<endl;
+
+// a_trace:
+// 42
+```
+
+## Scan Operations
+
+A *Scan* operation returns a tensor with the same dimensions as the original
+tensor. The operation performs an inclusive scan along the specified
+axis, which means it computes a running total along the axis for a given
+reduction operation.
+If the reduction operation corresponds to summation, then this computes the
+prefix sum of the tensor along the given axis.
+
+Example:
+Cumulative sum along the second dimension
+
+```cpp
+// Create a tensor of 2 dimensions
+Eigen::Tensor<int, 2> a(2, 3);
+a.setValues({{1, 2, 3}, {4, 5, 6}});
+// Scan it along the second dimension (1) using summation
+Eigen::Tensor<int, 2> b = a.cumsum(1);
+// The result is a tensor with the same size as the input
+std::cout << "a" << endl << a << endl << endl;
+std::cout << "b" << endl << b << endl << endl;
+
+// a
+// 1 2 3
+// 4 5 6
+
+// b
+// 1  3  6
+// 4  9 15
+```
+
+### (Operation) cumsum(const Index& axis)
+
+Perform a scan by summing consecutive entries.
+
+### (Operation) cumprod(const Index& axis)
+
+Perform a scan by multiplying consecutive entries.
+
+## Convolutions
+
+### (Operation) convolve(const Kernel& kernel, const Dimensions& dims)
+
+Returns a tensor that is the output of the convolution of the input tensor with the kernel,
+along the specified dimensions of the input tensor. The dimension size for dimensions of the output tensor
+which were part of the convolution will be reduced by the formula:
+```cpp
+output_dim_size = input_dim_size - kernel_dim_size + 1 // (requires: input_dim_size >= kernel_dim_size).
+```
+The dimension sizes for dimensions that were not part of the convolution will remain the same.
+Performance of the convolution can depend on the length of the stride(s) of the input tensor dimension(s) along which the
+convolution is computed (the first dimension has the shortest stride for `ColMajor`, whereas `RowMajor`'s shortest stride is
+for the last dimension).
+
+```cpp
+// Compute convolution along the second and third dimension.
+Tensor<float, 4, DataLayout> input(3, 3, 7, 11);
+Tensor<float, 2, DataLayout> kernel(2, 2);
+Tensor<float, 4, DataLayout> output(3, 2, 6, 11);
+input.setRandom();
+kernel.setRandom();
+
+Eigen::array<ptrdiff_t, 2> dims({1, 2});  // Specify second and third dimension for convolution.
+output = input.convolve(kernel, dims);
+
+for (int i = 0; i < 3; ++i) {
+  for (int j = 0; j < 2; ++j) {
+    for (int k = 0; k < 6; ++k) {
+      for (int l = 0; l < 11; ++l) {
+        const float result = output(i,j,k,l);
+        const float expected = input(i,j+0,k+0,l) * kernel(0,0) +
+                               input(i,j+1,k+0,l) * kernel(1,0) +
+                               input(i,j+0,k+1,l) * kernel(0,1) +
+                               input(i,j+1,k+1,l) * kernel(1,1);
+        VERIFY_IS_APPROX(result, expected);
+      }
+    }
+  }
+}
+```
+
+## Geometrical Operations
+
+These operations return a `Tensor` with different dimensions than the original
+`Tensor`.  They can be used to access slices of tensors, see them with different
+dimensions, or pad tensors with additional data.
+
+### (Operation) reshape(const Dimensions& new_dims)
+
+Returns a view of the input tensor that has been reshaped to the specified
+new dimensions.
+
+The argument `new_dims` is an array of Index values.
+
+The rank of the resulting tensor is equal to the number of elements in `new_dims`.
+
+The product of all the sizes in the new dimension array must be equal to
+the number of elements in the input tensor.
+
+```cpp
+// Increase the rank of the input tensor by introducing a new dimension
+// of size 1.
+Tensor<float, 2> input(7, 11);
+array<int, 3> three_dims{{7, 11, 1}};
+Tensor<float, 3> result = input.reshape(three_dims);
+
+// Decrease the rank of the input tensor by merging 2 dimensions;
+array<int, 1> one_dim{{7 * 11}};
+Tensor<float, 1> result = input.reshape(one_dim);
+```
+
+This operation does not move any data in the input tensor, so the resulting
+contents of a reshaped `Tensor` depend on the data layout of the original `Tensor`.
+
+For example this is what happens when you `reshape()` a 2D `ColMajor` tensor
+to one dimension:
+
+```cpp
+Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
+a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2});
+Eigen::Tensor<float, 1, Eigen::ColMajor> b = a.reshape(one_dim);
+std::cout << "b" << endl << b << endl;
+
+// b
+//   0
+// 300
+// 100
+// 400
+// 200
+// 500
+```
+
+This is what happens when the 2D `Tensor` is `RowMajor`:
+
+```cpp
+Eigen::Tensor<float, 2, Eigen::RowMajor> a(2, 3);
+a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+Eigen::array<Eigen::DenseIndex, 1> one_dim({3 * 2});
+Eigen::Tensor<float, 1, Eigen::RowMajor> b = a.reshape(one_dim);
+std::cout << "b" << endl << b << endl;
+
+// b
+//   0
+// 100
+// 200
+// 300
+// 400
+// 500
+```
+
+The reshape operation is a lvalue. In other words, it can be used on the left
+side of the assignment operator.
+
+The previous example can be rewritten as follow:
+
+```cpp
+Eigen::Tensor<float, 2, Eigen::ColMajor> a(2, 3);
+a.setValues({{0.0f, 100.0f, 200.0f}, {300.0f, 400.0f, 500.0f}});
+Eigen::array<Eigen::DenseIndex, 2> two_dim({2, 3});
+Eigen::Tensor<float, 1, Eigen::ColMajor> b(6);
+b.reshape(two_dim) = a;
+std::cout << "b" << endl << b << endl;
+
+// b
+//   0
+// 300
+// 100
+// 400
+// 200
+// 500
+```
+
+Note that "b" itself was not reshaped but that instead the assignment is done to
+the reshape view of b.
+
+### (Operation) shuffle(const Shuffle& shuffle)
+
+Returns a view of the input tensor whose dimensions have been
+reordered according to the specified permutation.
+
+The argument `shuffle` is an array of `Index` values:
+* Its size is the rank of the input tensor.
+* It must contain a permutation of `[0, 1, ..., rank - 1]`.
+* The `i`-th dimension of the output tensor corresponds to the size of the dimension at position `shuffle[i]` in the input tensor. For example:
+
+```cpp
+// Shuffle all dimensions to the left by 1.
+Tensor<float, 3> input(20, 30, 50);
+// ... set some values in input.
+Tensor<float, 3> output = input.shuffle({1, 2, 0});
+
+eigen_assert(output.dimension(0) == 30);
+eigen_assert(output.dimension(1) == 50);
+eigen_assert(output.dimension(2) == 20);
+
+// Indices into the output tensor are shuffled accordingly to formulate
+// indices into the input tensor.
+eigen_assert(output(3, 7, 11) == input(11, 3, 7));
+
+// In general:
+eigen_assert(output(..., indices[shuffle[i]], ...) ==
+             input(..., indices[i], ...));
+```
+
+The shuffle operation results in a lvalue, which means that it can be assigned
+to. In other words, it can be used on the left side of the assignment operator.
+
+Let's rewrite the previous example to take advantage of this feature:
+
+```cpp
+// Shuffle all dimensions to the left by 1.
+Tensor<float, 3> input(20, 30, 50);
+input.setRandom();
+Tensor<float, 3> output(30, 50, 20);
+output.shuffle({2, 0, 1}) = input;
+```
+
+### (Operation) stride(const Strides& strides)
+
+Returns a view of the input tensor that strides (skips stride-1
+elements) along each of the dimensions.
+
+The argument strides is an array of `Index` values:
+* Its size is the rank of the input tensor.
+* Must be >= 1
+
+ The dimensions of the resulting tensor are `ceil(input_dimensions[i] / strides[i])`.
+
+For example this is what happens when you `stride()` a 2D tensor:
+
+```cpp
+Eigen::Tensor<int, 2> a(4, 3);
+a.setValues({{0, 100, 200},
+             {300, 400, 500},
+             {600, 700, 800},
+             {900, 1000, 1100}});
+Eigen::array<Eigen::DenseIndex, 2> strides({3, 2});
+Eigen::Tensor<int, 2> b = a.stride(strides);
+std::cout << "b" << endl << b << endl;
+// b
+//    0   200
+//  900  1100
+```
+
+It is possible to assign a tensor to a stride:
+```cpp
+Tensor<float, 3> input(20, 30, 50);
+input.setRandom();
+Tensor<float, 3> output(40, 90, 200);
+output.stride({2, 3, 4}) = input;
+```
+
+### (Operation) slice(const StartIndices& offsets, const Sizes& extents)
+
+Returns a sub-tensor of the given tensor. For each dimension i, the slice is
+made of the coefficients stored between `offset[i]` and `offset[i] + extents[i]` in
+the input tensor.
+
+```cpp
+Eigen::Tensor<int, 2> a(4, 3);
+a.setValues({{0, 100, 200}, {300, 400, 500},
+             {600, 700, 800}, {900, 1000, 1100}});
+Eigen::array<Eigen::Index, 2> offsets = {1, 0};
+Eigen::array<Eigen::Index, 2> extents = {2, 2};
+Eigen::Tensor<int, 2> slice = a.slice(offsets, extents);
+std::cout << "a" << endl << a << endl;
+// a
+//    0   100   200
+//  300   400   500
+//  600   700   800
+//  900  1000  1100
+
+std::cout << "slice" << endl << slice << endl;
+// slice
+//  300   400
+//  600   700
+```
+
+### (Operation) stridedSlice(const StartIndices& start, const StopIndices& stop, const Strides& strides)
+
+Returns a sub-tensor by selecting elements using `start`, `stop` (exclusive), and `strides` for each dimension.
+
+This is similar to slicing in Python using [start:stop:step].
+
+``` cpp
+Eigen::Tensor<int, 2> a(4, 6);
+a.setValues({{  0,  10,  20,  30,  40,   50},
+             {100, 110, 120, 130, 140,  150},
+             {200, 210, 220, 230, 240,  250},
+             {300, 310, 320, 330, 340,  350}});
+
+Eigen::array<Eigen::Index, 2> start = {1, 1};
+Eigen::array<Eigen::Index, 2> stop  = {4, 6}; // Stop is exclusive
+Eigen::array<Eigen::Index, 2> strides = {2, 2};
+
+Eigen::Tensor<int, 2> sub = a.stridedSlice(start, stop, strides);
+
+std::cout << "a\n" << a << "\n";
+std::cout << "sub\n" << sub << "\n";
+
+// a
+//   0  10  20  30  40  50
+// 100 110 120 130 140 150
+// 200 210 220 230 240 250
+// 300 310 320 330 340 350
+
+// sub
+// 110 130 150
+// 310 330 350
+```
+It is also possible to assign to a strided slice:
+
+``` cpp
+Eigen::Tensor<int, 2> b(sub.dimensions());
+b.setConstant(-1);
+a.stridedSlice(start, stop, strides) = b;
+std::cout << "modified a\n" << a << "\n";
+
+
+// modified a
+//   0  10  20  30  40  50
+// 100  -1 120  -1 140  -1
+// 200 210 220 230 240 250
+// 300  -1 320  -1 340  -1
+
+```
+### (Operation) chip(const Index offset, const Index dim)
+
+A chip is a special kind of slice.
+It is the subtensor at the given offset in the dimension `dim`.
+
+The returned tensor has one fewer dimension than the input tensor: the dimension dim is removed.
+
+For example, a matrix chip would be either a row or a column of the input matrix:
+
+```cpp
+Eigen::Tensor<int, 2> a(4, 3);
+a.setValues({{0, 100, 200}, {300, 400, 500},
+             {600, 700, 800}, {900, 1000, 1100}});
+Eigen::Tensor<int, 1> row_3 = a.chip(2, 0);
+Eigen::Tensor<int, 1> col_2 = a.chip(1, 1);
+std::cout << "a\n" << a << "\n";
+
+// a
+//    0   100   200
+//  300   400   500
+//  600   700   800
+//  900  1000  1100
+
+std::cout << "row_3\n" << row_3 << "\n";
+// row_3
+//    600   700   800
+
+std::cout << "col_2\n" << col_2 << "\n";
+// col_2
+//   100   400   700    1000
+```
+
+It is possible to assign values to a tensor chip since the chip operation is a
+lvalue. For example:
+
+```cpp
+Eigen::Tensor<int, 1> a(3);
+a.setValues({{100, 200, 300}});
+Eigen::Tensor<int, 2> b(2, 3);
+b.setZero();
+b.chip(0, 0) = a;
+std::cout << "a\n" << a << "\n";
+std::cout << "b\n" << b << "\n";
+
+// a
+// 100
+// 200
+// 300
+
+// b
+//   100   200   300
+//     0     0     0
+```
+
+
+The dimension can also be passed as a template parameter:
+
+```cpp
+b.chip<0>(1) = a;  // Equivalent to b.chip(1,0) = a;
+```
+
+Note that only one dimension can be chipped at a time.
+To chip off multiple dimensions, you can chain calls
+
+```cpp
+Eigen::Tensor<int, 3> a(2, 3, 4);
+Eigen::Tensor<int, 1> b = b.chip<2>(0) // Now has shape [2,3]
+                           .chip<1>(0); // Now has shape [2]
+```
+
+Be careful in which order you chip, as each operation affects the shape of the intermediate result.
+For example:
+
+```cpp
+// AVOID THIS
+Eigen::Tensor<int, 1> c = b.chip<1>(0) // Now has shape [2,4]
+                           .chip<1>(0); // Now has shape [2]
+```
+
+In general, it’s more intuitive to chip from the outermost dimension first.
+
+
+### (Operation) reverse(const ReverseDimensions& reverse)
+
+Returns a view of the input tensor that reverses the order of the coefficients
+along a subset of the dimensions.  The argument reverse is an array of boolean
+values that indicates whether or not the order of the coefficients should be
+reversed along each of the dimensions.  This operation preserves the dimensions
+of the input tensor.
+
+For example this is what happens when you `reverse()` the first dimension
+of a 2D tensor:
+
+```cpp
+Eigen::Tensor<int, 2> a(4, 3);
+a.setValues({{0, 100, 200}, {300, 400, 500},
+            {600, 700, 800}, {900, 1000, 1100}});
+Eigen::array<bool, 2> reverse({true, false});
+Eigen::Tensor<int, 2> b = a.reverse(reverse);
+std::cout << "a\n" << a << "\n";
+std::cout << "b\n" << b << "\n";
+
+// a
+//    0   100   200
+//  300   400   500
+//  600   700   800
+//  900  1000  1100
+// b
+//  900  1000  1100
+//  600   700   800
+//  300   400   500
+//    0   100   200
+```
+
+### (Operation) roll(const Rolls& shifts)
+
+Returns a tensor with the elements **circularly shifted** (like bit rotation) along one or more dimensions.
+
+For each dimension `i`, the content is shifted by `shifts[i]` positions:
+
+- A **positive shift** of `+s` moves each value to a **lower index** by `s`.
+- A **negative shift** of `-s` moves each value to a **higher index** by `s`.
+
+```cpp
+Eigen::Tensor<int, 2> a(3, 4);
+a.setValues({{ 1,  2,  3,  4},
+             { 5,  6,  7,  8},
+             { 9, 10, 11, 12}});
+
+Eigen::array<Eigen::Index, 2> shifts = {1, -2};
+
+Eigen::Tensor<int, 2> rolled = a.roll(shifts);
+
+std::cout << "a\n" << a << "\n";
+std::cout << "rolled\n" << rolled << "\n";
+
+// a
+// 1  2  3  4
+// 5  6  7  8
+// 9 10 11 12
+//
+// rolled
+// 7  8  5  6
+// 11 12  9 10
+// 3  4  1  2
+```
+
+### (Operation) broadcast(const Broadcast& broadcast)
+
+Returns a view of the input tensor in which the input is replicated one to many
+times.
+The broadcast argument specifies how many copies of the input tensor need to be
+made in each of the dimensions.
+
+```cpp
+Eigen::Tensor<int, 2> a(2, 3);
+a.setValues({{0, 100, 200}, {300, 400, 500}});
+Eigen::array<int, 2> bcast({3, 2});
+Eigen::Tensor<int, 2> b = a.broadcast(bcast);
+std::cout << "a" << endl << a << endl << "b" << endl << b << endl;
+// a
+//    0   100   200
+//  300   400   500
+// b
+//    0   100   200    0   100   200
+//  300   400   500  300   400   500
+//    0   100   200    0   100   200
+//  300   400   500  300   400   500
+//    0   100   200    0   100   200
+//  300   400   500  300   400   500
+```
+
+Note: Broadcasting does not increase rank.
+To broadcast into higher dimensions, you must first reshape the tensor with singleton (1) dimensions:
+
+```cpp
+Eigen::Tensor<int, 2> a(2, 3);
+a.setValues({{0, 100, 200}, {300, 400, 500}});
+
+Eigen::array<Eigen::Index, 3> new_shape = {1, 2, 3}; //Reshape to [1, 2, 3]
+Eigen::array<int, 3> bcast = {4, 1, 1}; // Broadcast to [4, 2, 3]
+Eigen::Tensor<int, 3> b = a.reshape(new_shape).broadcast(bcast);
+
+std::cout << "b dimensions: " << b.dimensions() << "\n";
+std::cout << b << "\n";
+```
+
+### (Operation) concatenate(const OtherDerived& other, Axis axis)
+
+Returns a view of two tensors joined along a specified axis.
+The dimensions of the two tensors must match on all axes except the concatenation axis.
+The resulting tensor has the same rank as the inputs.
+
+```cpp
+Eigen::Tensor<int, 2> a(2, 3);
+a.setValues({{0, 100, 200}, {300, 400, 500}});
+
+Eigen::Tensor<int, 2> b(2, 3);
+b.setValues({{-1, -2, -3}, {-4, -5, -6}});
+
+// Concatenate along dimension 0: resulting shape is [4, 3]
+Eigen::Tensor<int, 2> c = a.concatenate(b, 0);
+
+// Concatenate along dimension 1: resulting shape is [2, 6]
+Eigen::Tensor<int, 2> d = a.concatenate(b, 1);
+
+std::cout << "a\n" << a << "\n"
+          << "b\n" << b << "\n"
+          << "c (concatenated along dim 0)\n" << c << "\n"
+          << "d (concatenated along dim 1)\n" << d << "\n";
+// a
+//   0 100 200
+// 300 400 500
+// b
+// -1 -2 -3
+// -4 -5 -6
+// c (concatenated along dim 0)
+//   0 100 200
+// 300 400 500
+//  -1  -2  -3
+//  -4  -5  -6
+// d (concatenated along dim 1)
+//   0 100 200  -1  -2  -3
+// 300 400 500  -4  -5  -6
+```
+
+### (Operation)  pad(const PaddingDimensions& padding)
+
+Returns a view of the input tensor in which the input is padded with zeros.
+
+```cpp
+Eigen::Tensor<int, 2> a(2, 3);
+a.setValues({{0, 100, 200}, {300, 400, 500}});
+Eigen::array<pair<int, int>, 2> paddings;
+paddings[0] = make_pair(0, 1);
+paddings[1] = make_pair(2, 3);
+Eigen::Tensor<int, 2> b = a.pad(paddings);
+std::cout << "a" << endl << a << endl << "b" << endl << b << endl;
+// a
+//    0   100   200
+//  300   400   500
+// b
+//    0     0     0    0
+//    0     0     0    0
+//    0   100   200    0
+//  300   400   500    0
+//    0     0     0    0
+//    0     0     0    0
+//    0     0     0    0
+```
+
+### (Operation)  extract_patches(const PatchDims& patch_dims)
+
+Returns a tensor of coefficient patches extracted from the input tensor, where
+each patch is of dimension specified by `patch_dims`. The returned tensor has
+one greater dimension than the input tensor, which is used to index each patch.
+The patch index in the output tensor depends on the data layout of the input
+tensor: the patch index is the last dimension `ColMajor` layout, and the first
+dimension in `RowMajor` layout.
+
+For example, given the following input tensor:
+
+```cpp
+Eigen::Tensor<float, 2, DataLayout> tensor(3,4);
+tensor.setValues({{0.0f, 1.0f, 2.0f, 3.0f},
+                  {4.0f, 5.0f, 6.0f, 7.0f},
+                  {8.0f, 9.0f, 10.0f, 11.0f}});
+
+std::cout << "tensor: " << endl << tensor << endl;
+
+// tensor:
+//  0   1   2   3
+//  4   5   6   7
+//  8   9  10  11
+```
+
+Six 2x2 patches can be extracted and indexed using the following code:
+
+```cpp
+Eigen::Tensor<float, 3, DataLayout> patch;
+Eigen::array<ptrdiff_t, 2> patch_dims;
+patch_dims[0] = 2;
+patch_dims[1] = 2;
+patch = tensor.extract_patches(patch_dims);
+for (int k = 0; k < 6; ++k) {
+  std::cout << "patch index: " << k << endl;
+  for (int i = 0; i < 2; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      if (DataLayout == ColMajor) {
+        std::cout << patch(i, j, k) << " ";
+      } else {
+        std::cout << patch(k, i, j) << " ";
+      }
+    }
+    std::cout << endl;
+  }
+}
+```
+
+This code results in the following output when the data layout is `ColMajor`:
+
+    patch index: 0
+    0 1
+    4 5
+    patch index: 1
+    4 5
+    8 9
+    patch index: 2
+    1 2
+    5 6
+    patch index: 3
+    5 6
+    9 10
+    patch index: 4
+    2 3
+    6 7
+    patch index: 5
+    6 7
+    10 11
+
+This code results in the following output when the data layout is RowMajor:
+
+**NOTE**: the set of patches is the same as in `ColMajor`, but are indexed differently
+
+    patch index: 0
+    0 1
+    4 5
+    patch index: 1
+    1 2
+    5 6
+    patch index: 2
+    2 3
+    6 7
+    patch index: 3
+    4 5
+    8 9
+    patch index: 4
+    5 6
+    9 10
+    patch index: 5
+    6 7
+    10 11
+
+### (Operation)  extract_image_patches(const Index patch_rows, const Index patch_cols, const Index row_stride, const Index col_stride, const PaddingType padding_type)
+
+Returns a tensor of coefficient image patches extracted from the input tensor,
+which is expected to have dimensions ordered as follows (depending on the data
+layout of the input tensor, and the number of additional dimensions 'N'):
+
+- `ColMajor`
+    - 1st dimension: channels (of size d)
+    - 2nd dimension: rows (of size r)
+    - 3rd dimension: columns (of size c)
+    - 4th-Nth dimension: time (for video) or batch (for bulk processing).
+
+* `RowMajor` (reverse order of `ColMajor`)
+    - 1st-Nth dimension: time (for video) or batch (for bulk processing).
+    - N+1'th dimension: columns (of size c)
+    - N+2'th dimension: rows (of size r)
+    - N+3'th dimension: channels (of size d)
+
+The returned tensor has one greater dimension than the input tensor, which is
+used to index each patch. The patch index in the output tensor depends on the
+data layout of the input tensor: the patch index is the 4'th dimension in
+`ColMajor` layout, and the 4'th from the last dimension in `RowMajor` layout.
+
+For example, given the following input tensor with the following dimension
+sizes:
+- depth:   2
+- rows:    3
+- columns: 5
+- batch:   7
+
+```cpp
+Tensor<float, 4> tensor(2,3,5,7);
+Tensor<float, 4, RowMajor> tensor_row_major = tensor.swap_layout();
+```
+
+2x2 image patches can be extracted and indexed using the following code:
+
+#### 2D patch: `ColMajor` (patch indexed by second-to-last dimension)
+
+```cpp
+Tensor<float, 5> twod_patch;
+twod_patch = tensor.extract_image_patches<2, 2>();
+// twod_patch.dimension(0) == 2
+// twod_patch.dimension(1) == 2
+// twod_patch.dimension(2) == 2
+// twod_patch.dimension(3) == 3*5
+// twod_patch.dimension(4) == 7
+```
+
+#### 2D patch: `RowMajor` (patch indexed by the second dimension)
+
+```cpp
+Tensor<float, 5, RowMajor> twod_patch_row_major;
+twod_patch_row_major = tensor_row_major.extract_image_patches<2, 2>();
+// twod_patch_row_major.dimension(0) == 7
+// twod_patch_row_major.dimension(1) == 3*5
+// twod_patch_row_major.dimension(2) == 2
+// twod_patch_row_major.dimension(3) == 2
+// twod_patch_row_major.dimension(4) == 2
+```
+
+## Special Operations
+
+### (Operation) cast<T>()
+
+Returns a tensor of type `T` with the same dimensions as the original tensor.
+The returned tensor contains the values of the original tensor converted to
+type `T`.
+
+```cpp
+Eigen::Tensor<float, 2> a(2, 3);
+Eigen::Tensor<int, 2> b = a.cast<int>();
+```
+
+This can be useful for example if you need to do element-wise division of
+Tensors of integers.
+This is not currently supported by the Tensor library
+but you can easily cast the tensors to floats to do the division:
+
+```cpp
+Eigen::Tensor<int, 2> a(2, 3);
+a.setValues({{0, 1, 2}, {3, 4, 5}});
+Eigen::Tensor<int, 2> b =
+    (a.cast<float>() / a.constant(2).cast<float>()).cast<int>();
+std::cout << "a\n" << a << "\n";
+std::cout << "b\n" << b << "\n";
+
+// a
+// 0 1 2
+// 3 4 5
+//
+// b
+// 0 0 1
+// 1 2 2
+```
+
+### (Operation)     eval()
+See **Calling eval()**.
+
+
+
+## Tensor Printing
+Tensors can be printed into a stream object (e.g. `std::cout`) using different formatting options.
+
+```cpp
+Eigen::Tensor<float, 3> tensor3d = {4, 3, 2};
+tensor3d.setValues( {{{1, 2},
+                      {3, 4},
+                      {5, 6}},
+                     {{7, 8},
+                      {9, 10},
+                      {11, 12}},
+                     {{13, 14},
+                      {15, 16},
+                      {17, 18}},
+                     {{19, 20},
+                      {21, 22},
+                      {23, 24}}} );
+std::cout << tensor3d.format(Eigen::TensorIOFormat::Plain()) << ;
+//  1  2
+//  3  4
+//  5  6
+//
+//  7  8
+//  9 10
+// 11 12
+//
+// 13 14
+// 15 16
+// 17 18
+//
+// 19 20
+// 21 22
+// 23 24
+```
+
+In the example, we used the predefined format `Eigen::TensorIOFormat::Plain`.
+Here is the list of all predefined formats from which you can choose:
+- `Eigen::TensorIOFormat::Plain()` for a plain output without braces. Different submatrices are separated by a blank line.
+- `Eigen::TensorIOFormat::Numpy()` for numpy-like output.
+- `Eigen::TensorIOFormat::Native()` for a `c++` like output which can be directly copy-pasted to `setValues()`.
+- `Eigen::TensorIOFormat::Legacy()` for a backwards compatible printing of tensors.
+
+If you send the tensor directly to the stream the default format is called which is `Eigen::IOFormats::Plain()`.
+
+You can define your own format by explicitly providing a `Eigen::TensorIOFormat` class instance. Here, you can specify:
+- The overall prefix and suffix with `std::string tenPrefix` and `std::string tenSuffix`
+- The prefix, separator and suffix for each new element, row, matrix, 3d subtensor, ... with `std::vector<std::string> prefix`, `std::vector<std::string> separator` and `std::vector<std::string> suffix`. Note that the first entry in each of the vectors refer to the last dimension of the tensor, e.g. `separator[0]` will be printed between adjacent elements,  `separator[1]` will be printed between adjacent matrices, ...
+- `char fill`: character which will be placed if the elements are aligned.
+- `int precision`
+- `int flags`: an OR-ed combination of flags, the default value is 0, the only currently available flag is `Eigen::DontAlignCols` which allows to disable the alignment of columns, resulting in faster code.
+
+## Representation of scalar values
+
+Scalar values are often represented by tensors of size 1 and rank 0.
+
+For example `Tensor<T, N>::maximum()` returns a `Tensor<T, 0>`.
+
+Similarly, the inner product of 2 1d tensors (through contractions) returns a 0d tensor.
+
+The scalar value can be extracted as explained in **Reduction along all dimensions**.
+
+
+## Limitations
+
+*   The number of tensor dimensions is currently limited to 250 when using a
+    compiler that supports cxx11. It is limited to only 5 for older compilers.
+*   The `IndexList` class requires a cxx11 compliant compiler. You can use an
+    array of indices instead if you don't have access to a modern compiler.
+*   On GPUs only floating point values are properly tested and optimized for.
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
new file mode 100644
index 00000000..9dc95916
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/Tensor.h
@@ -0,0 +1,382 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \class Tensor
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor class.
+ *
+ * The %Tensor class is the work-horse for all \em dense tensors within Eigen.
+ *
+ * The %Tensor class encompasses only dynamic-size objects so far.
+ *
+ * The first two template parameters are required:
+ * \tparam Scalar_  Numeric type, e.g. float, double, int or `std::complex<float>`.
+ *                 User defined scalar types are supported as well (see \ref user_defined_scalars "here").
+ * \tparam NumIndices_ Number of indices (i.e. rank of the tensor)
+ *
+ * The remaining template parameters are optional -- in most cases you don't have to worry about them.
+ * \tparam Options_  A combination of either \b #RowMajor or \b #ColMajor, and of either
+ *                 \b #AutoAlign or \b #DontAlign.
+ *                 The former controls \ref TopicStorageOrders "storage order", and defaults to column-major. The latter
+ * controls alignment, which is required for vectorization. It defaults to aligning tensors. Note that tensors currently
+ * do not support any operations that profit from vectorization. Support for such operations (i.e. adding two tensors
+ * etc.) is planned.
+ *
+ * You can access elements of tensors using normal subscripting:
+ *
+ * \code
+ * Eigen::Tensor<double, 4> t(10, 10, 10, 10);
+ * t(0, 1, 2, 3) = 42.0;
+ * \endcode
+ *
+ * This class can be extended with the help of the plugin mechanism described on the page
+ * \ref TopicCustomizing_Plugins by defining the preprocessor symbol \c EIGEN_TENSOR_PLUGIN,
+ * \c EIGEN_TENSORBASE_PLUGIN, and \c EIGEN_READONLY_TENSORBASE_PLUGIN.
+ *
+ * <i><b>Some notes:</b></i>
+ *
+ * <dl>
+ * <dt><b>Relation to other parts of Eigen:</b></dt>
+ * <dd>The midterm development goal for this class is to have a similar hierarchy as Eigen uses for matrices, so that
+ * taking blocks or using tensors in expressions is easily possible, including an interface with the vector/matrix code
+ * by providing .asMatrix() and .asVector() (or similar) methods for rank 2 and 1 tensors. However, currently, the
+ * %Tensor class does not provide any of these features and is only available as a stand-alone class that just allows
+ * for coefficient access. Also, when fixed-size tensors are implemented, the number of template arguments is likely to
+ * change dramatically.</dd>
+ * </dl>
+ *
+ * \ref TopicStorageOrders
+ */
+
+template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
+class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > {
+ public:
+  typedef Tensor<Scalar_, NumIndices_, Options_, IndexType_> Self;
+  typedef TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > Base;
+  typedef typename Eigen::internal::nested<Self>::type Nested;
+  typedef typename internal::traits<Self>::StorageKind StorageKind;
+  typedef typename internal::traits<Self>::Index Index;
+  typedef Scalar_ Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef typename Base::CoeffReturnType CoeffReturnType;
+
+  enum { IsAligned = (EIGEN_MAX_ALIGN_BYTES > 0) && !(Options_ & DontAlign), CoordAccess = true, RawAccess = true };
+
+  static constexpr int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
+  static constexpr int Options = Options_;
+  static constexpr int NumIndices = NumIndices_;
+  typedef DSizes<Index, NumIndices_> Dimensions;
+
+ protected:
+  TensorStorage<Scalar, Dimensions, Options> m_storage;
+
+  template <typename CustomIndices>
+  struct isOfNormalIndex {
+    static const bool is_array = internal::is_base_of<array<Index, NumIndices>, CustomIndices>::value;
+    static const bool is_int = NumTraits<CustomIndices>::IsInteger;
+    static const bool value = is_array | is_int;
+  };
+
+ public:
+  // Metadata
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_storage.dimensions(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() { return m_storage.data(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar* data() const { return m_storage.data(); }
+
+  // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+  // work, because that uses base().coeffRef() - and we don't yet
+  // implement a similar class hierarchy
+  inline Self& base() { return *this; }
+  inline const Self& base() const { return *this; }
+
+  template <typename... IndexTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, Index secondIndex,
+                                                            IndexTypes... otherIndices) const {
+    // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return coeff(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+  }
+
+  // normal indices
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const {
+    eigen_internal_assert(checkIndexRange(indices));
+    return m_storage.data()[linearizedIndex(indices)];
+  }
+
+  // custom indices
+  template <typename CustomIndices, EIGEN_SFINAE_ENABLE_IF(!(isOfNormalIndex<CustomIndices>::value))>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(CustomIndices& indices) const {
+    return coeff(internal::customIndices2Array<Index, NumIndices>(indices));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return m_storage.data()[0];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const {
+    eigen_internal_assert(index >= 0 && index < size());
+    return m_storage.data()[index];
+  }
+
+  template <typename... IndexTypes>
+  inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) {
+    // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return coeffRef(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+  }
+
+  // normal indices
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) {
+    eigen_internal_assert(checkIndexRange(indices));
+    return m_storage.data()[linearizedIndex(indices)];
+  }
+
+  // custom indices
+  template <typename CustomIndices, EIGEN_SFINAE_ENABLE_IF(!(isOfNormalIndex<CustomIndices>::value))>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(CustomIndices& indices) {
+    return coeffRef(internal::customIndices2Array<Index, NumIndices>(indices));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef() {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return m_storage.data()[0];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    eigen_internal_assert(index >= 0 && index < size());
+    return m_storage.data()[index];
+  }
+
+  template <typename... IndexTypes>
+  inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const {
+    // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return this->operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+  }
+
+  // custom indices
+  template <typename CustomIndices, EIGEN_SFINAE_ENABLE_IF(!(isOfNormalIndex<CustomIndices>::value))>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(CustomIndices& indices) const {
+    return coeff(internal::customIndices2Array<Index, NumIndices>(indices));
+  }
+
+  // normal indices
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const {
+    return coeff(indices);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const {
+    eigen_internal_assert(index >= 0 && index < size());
+    return coeff(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return coeff();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const {
+    // The bracket operator is only for vectors, use the parenthesis operator instead.
+    EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return coeff(index);
+  }
+
+  template <typename... IndexTypes>
+  inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) {
+    // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return operator()(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+  }
+
+  // normal indices
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices) {
+    return coeffRef(indices);
+  }
+
+  // custom indices
+  template <typename CustomIndices, EIGEN_SFINAE_ENABLE_IF(!(isOfNormalIndex<CustomIndices>::value))>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(CustomIndices& indices) {
+    return coeffRef(internal::customIndices2Array<Index, NumIndices>(indices));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) {
+    eigen_assert(index >= 0 && index < size());
+    return coeffRef(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()() {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return coeffRef();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) {
+    // The bracket operator is only for vectors, use the parenthesis operator instead
+    EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return coeffRef(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor() : m_storage() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(const Self& other) : Base(other), m_storage(other.m_storage) {}
+
+  template <typename... IndexTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions)
+      : m_storage(firstDimension, otherDimensions...) {
+    // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  }
+
+  /** Normal Dimension */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(const array<Index, NumIndices>& dimensions)
+      : m_storage(internal::array_prod(dimensions), dimensions) {
+    EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+  }
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) {
+    EIGEN_STATIC_ASSERT(OtherDerived::NumDimensions == Base::NumDimensions, Number_of_dimensions_must_match)
+    typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+    Assign assign(*this, other.derived());
+    resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+  }
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(const TensorBase<OtherDerived, WriteAccessors>& other) {
+    EIGEN_STATIC_ASSERT(OtherDerived::NumDimensions == Base::NumDimensions, Number_of_dimensions_must_match)
+    typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+    Assign assign(*this, other.derived());
+    resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Self&& other) : m_storage(std::move(other.m_storage)) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(Self&& other) {
+    m_storage = std::move(other.m_storage);
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const Tensor& other) {
+    typedef TensorAssignOp<Tensor, const Tensor> Assign;
+    Assign assign(*this, other);
+    resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+    return *this;
+  }
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor& operator=(const OtherDerived& other) {
+    typedef TensorAssignOp<Tensor, const OtherDerived> Assign;
+    Assign assign(*this, other);
+    resize(TensorEvaluator<const Assign, DefaultDevice>(assign, DefaultDevice()).dimensions());
+    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+    return *this;
+  }
+
+  template <typename... IndexTypes>
+  EIGEN_DEVICE_FUNC void resize(Index firstDimension, IndexTypes... otherDimensions) {
+    // The number of dimensions used to resize a tensor must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    resize(array<Index, NumIndices>{{firstDimension, otherDimensions...}});
+  }
+
+  /** Normal Dimension */
+  EIGEN_DEVICE_FUNC void resize(const array<Index, NumIndices>& dimensions) {
+#ifndef EIGEN_NO_DEBUG
+    Index size = Index(1);
+    for (int i = 0; i < NumIndices; i++) {
+      internal::check_rows_cols_for_overflow<Dynamic, Dynamic, Dynamic>::run(size, dimensions[i]);
+      size *= dimensions[i];
+    }
+#else
+    Index size = internal::array_prod(dimensions);
+#endif
+
+#ifdef EIGEN_INITIALIZE_COEFFS
+    bool size_changed = size != this->size();
+    m_storage.resize(size, dimensions);
+    if (size_changed) EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+#else
+    m_storage.resize(size, dimensions);
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC void resize() {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    // Nothing to do: rank 0 tensors have fixed size
+  }
+
+  template <typename FirstType, typename... OtherTypes>
+  EIGEN_DEVICE_FUNC void resize(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) {
+    array<Index, NumIndices> dims;
+    for (int i = 0; i < NumIndices; ++i) {
+      dims[i] = static_cast<Index>(dimensions[i]);
+    }
+    resize(dims);
+  }
+
+  /** Custom Dimension */
+  template <typename CustomDimension, EIGEN_SFINAE_ENABLE_IF(!(isOfNormalIndex<CustomDimension>::value))>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void resize(CustomDimension& dimensions) {
+    resize(internal::customIndices2Array<Index, NumIndices>(dimensions));
+  }
+
+  template <typename std::ptrdiff_t... Indices>
+  EIGEN_DEVICE_FUNC void resize(const Sizes<Indices...>& dimensions) {
+    array<Index, NumIndices> dims;
+    for (int i = 0; i < NumIndices; ++i) {
+      dims[i] = static_cast<Index>(dimensions[i]);
+    }
+    resize(dims);
+  }
+
+#ifdef EIGEN_TENSOR_PLUGIN
+#include EIGEN_TENSOR_PLUGIN
+#endif
+
+ protected:
+  bool checkIndexRange(const array<Index, NumIndices>& indices) const {
+    using internal::array_apply_and_reduce;
+    using internal::array_zip_and_reduce;
+    using internal::greater_equal_zero_op;
+    using internal::lesser_op;
+    using internal::logical_and_op;
+
+    return
+        // check whether the indices are all >= 0
+        array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) &&
+        // check whether the indices fit in the dimensions
+        array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const {
+    if (Options & RowMajor) {
+      return m_storage.dimensions().IndexOfRowMajor(indices);
+    } else {
+      return m_storage.dimensions().IndexOfColMajor(indices);
+    }
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
new file mode 100644
index 00000000..3f9866ac
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h
@@ -0,0 +1,282 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com>
+//                    Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
+#define EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+template <typename XprType>
+struct traits<TensorIndexPairOp<XprType>> : public traits<XprType> {
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef Pair<Index, typename XprTraits::Scalar> Scalar;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+};
+
+template <typename XprType>
+struct eval<TensorIndexPairOp<XprType>, Eigen::Dense> {
+  typedef const TensorIndexPairOp<XprType> EIGEN_DEVICE_REF type;
+};
+
+template <typename XprType>
+struct nested<TensorIndexPairOp<XprType>, 1, typename eval<TensorIndexPairOp<XprType>>::type> {
+  typedef TensorIndexPairOp<XprType> type;
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor + Index Pair class.
+ */
+template <typename XprType>
+class TensorIndexPairOp : public TensorBase<TensorIndexPairOp<XprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorIndexPairOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename Eigen::internal::nested<TensorIndexPairOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorIndexPairOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorIndexPairOp>::Index Index;
+  typedef Pair<Index, typename XprType::CoeffReturnType> CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIndexPairOp(const XprType& expr) : m_xpr(expr) {}
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+ protected:
+  typename XprType::Nested m_xpr;
+};
+
+// Eval as rvalue
+template <typename ArgType, typename Device>
+struct TensorEvaluator<const TensorIndexPairOp<ArgType>, Device> {
+  typedef TensorIndexPairOp<ArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  static constexpr int NumDims = internal::array_size<Dimensions>::value;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  enum {
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+    PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return CoeffReturnType(index, m_impl.coeff(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, 1);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ protected:
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+namespace internal {
+
+/** \class TensorPairIndex
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Converts to Tensor<Pair<Index, Scalar> > and reduces to Tensor<Index>.
+ *
+ */
+template <typename ReduceOp, typename Dims, typename XprType>
+struct traits<TensorPairReducerOp<ReduceOp, Dims, XprType>> : public traits<XprType> {
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef Index Scalar;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
+  static constexpr int Layout = XprTraits::Layout;
+};
+
+template <typename ReduceOp, typename Dims, typename XprType>
+struct eval<TensorPairReducerOp<ReduceOp, Dims, XprType>, Eigen::Dense> {
+  typedef const TensorPairReducerOp<ReduceOp, Dims, XprType> EIGEN_DEVICE_REF type;
+};
+
+template <typename ReduceOp, typename Dims, typename XprType>
+struct nested<TensorPairReducerOp<ReduceOp, Dims, XprType>, 1,
+              typename eval<TensorPairReducerOp<ReduceOp, Dims, XprType>>::type> {
+  typedef TensorPairReducerOp<ReduceOp, Dims, XprType> type;
+};
+
+}  // end namespace internal
+
+template <typename ReduceOp, typename Dims, typename XprType>
+class TensorPairReducerOp : public TensorBase<TensorPairReducerOp<ReduceOp, Dims, XprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorPairReducerOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename Eigen::internal::nested<TensorPairReducerOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorPairReducerOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorPairReducerOp>::Index Index;
+  typedef Index CoeffReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPairReducerOp(const XprType& expr, const ReduceOp& reduce_op,
+                                                            const Index return_dim, const Dims& reduce_dims)
+      : m_xpr(expr), m_reduce_op(reduce_op), m_return_dim(return_dim), m_reduce_dims(reduce_dims) {}
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+  EIGEN_DEVICE_FUNC const ReduceOp& reduce_op() const { return m_reduce_op; }
+
+  EIGEN_DEVICE_FUNC const Dims& reduce_dims() const { return m_reduce_dims; }
+
+  EIGEN_DEVICE_FUNC Index return_dim() const { return m_return_dim; }
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const ReduceOp m_reduce_op;
+  const Index m_return_dim;
+  const Dims m_reduce_dims;
+};
+
+// Eval as rvalue
+template <typename ReduceOp, typename Dims, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorPairReducerOp<ReduceOp, Dims, ArgType>, Device> {
+  typedef TensorPairReducerOp<ReduceOp, Dims, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename TensorIndexPairOp<ArgType>::CoeffReturnType PairType;
+  typedef typename TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexPairOp<ArgType>>,
+                                   Device>::Dimensions Dimensions;
+  typedef typename TensorEvaluator<const TensorIndexPairOp<ArgType>, Device>::Dimensions InputDimensions;
+  static constexpr int NumDims = internal::array_size<InputDimensions>::value;
+  typedef array<Index, NumDims> StrideDims;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef StorageMemory<PairType, Device> PairStorageMem;
+
+  enum {
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+    PacketAccess = /*TensorEvaluator<ArgType, Device>::PacketAccess*/ false,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+  static constexpr int Layout =
+      TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexPairOp<ArgType>>, Device>::Layout;
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_orig_impl(op.expression(), device),
+        m_impl(op.expression().index_pairs().reduce(op.reduce_dims(), op.reduce_op()), device),
+        m_return_dim(op.return_dim()) {
+    gen_strides(m_orig_impl.dimensions(), m_strides);
+    if (Layout == static_cast<int>(ColMajor)) {
+      const Index total_size = internal::array_prod(m_orig_impl.dimensions());
+      m_stride_mod = (m_return_dim < NumDims - 1) ? m_strides[m_return_dim + 1] : total_size;
+    } else {
+      const Index total_size = internal::array_prod(m_orig_impl.dimensions());
+      m_stride_mod = (m_return_dim > 0) ? m_strides[m_return_dim - 1] : total_size;
+    }
+    // If m_return_dim is not a valid index, returns 1 or this can crash on Windows.
+    m_stride_div =
+        ((m_return_dim >= 0) && (m_return_dim < static_cast<Index>(m_strides.size()))) ? m_strides[m_return_dim] : 1;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    const PairType v = m_impl.coeff(index);
+    return (m_return_dim < 0) ? v.first : (v.first % m_stride_mod) / m_stride_div;
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    const double compute_cost =
+        1.0 + (m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost<Index>() + TensorOpCost::DivCost<Index>()));
+    return m_orig_impl.costPerCoeff(vectorized) + m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost);
+  }
+
+ private:
+  EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) {
+    if (m_return_dim < 0) {
+      return;  // Won't be using the strides.
+    }
+    eigen_assert(m_return_dim < NumDims && "Asking to convert index to a dimension outside of the rank");
+
+    // Calculate m_stride_div and m_stride_mod, which are used to
+    // calculate the value of an index w.r.t. the m_return_dim.
+    if (Layout == static_cast<int>(ColMajor)) {
+      strides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        strides[i] = strides[i - 1] * dims[i - 1];
+      }
+    } else {
+      strides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        strides[i] = strides[i + 1] * dims[i + 1];
+      }
+    }
+  }
+
+ protected:
+  TensorEvaluator<const TensorIndexPairOp<ArgType>, Device> m_orig_impl;
+  TensorEvaluator<const TensorReductionOp<ReduceOp, Dims, const TensorIndexPairOp<ArgType>>, Device> m_impl;
+  const Index m_return_dim;
+  StrideDims m_strides;
+  Index m_stride_mod;
+  Index m_stride_div;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_ARG_MAX_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
new file mode 100644
index 00000000..37d914e2
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h
@@ -0,0 +1,211 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
+#define EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename LhsXprType, typename RhsXprType>
+struct traits<TensorAssignOp<LhsXprType, RhsXprType> > {
+  typedef typename LhsXprType::Scalar Scalar;
+  typedef typename traits<LhsXprType>::StorageKind StorageKind;
+  typedef
+      typename promote_index_type<typename traits<LhsXprType>::Index, typename traits<RhsXprType>::Index>::type Index;
+  typedef typename LhsXprType::Nested LhsNested;
+  typedef typename RhsXprType::Nested RhsNested;
+  typedef std::remove_reference_t<LhsNested> LhsNested_;
+  typedef std::remove_reference_t<RhsNested> RhsNested_;
+  static constexpr std::size_t NumDimensions = internal::traits<LhsXprType>::NumDimensions;
+  static constexpr int Layout = internal::traits<LhsXprType>::Layout;
+  typedef typename traits<LhsXprType>::PointerType PointerType;
+
+  enum { Flags = 0 };
+};
+
+template <typename LhsXprType, typename RhsXprType>
+struct eval<TensorAssignOp<LhsXprType, RhsXprType>, Eigen::Dense> {
+  typedef const TensorAssignOp<LhsXprType, RhsXprType>& type;
+};
+
+template <typename LhsXprType, typename RhsXprType>
+struct nested<TensorAssignOp<LhsXprType, RhsXprType>, 1, typename eval<TensorAssignOp<LhsXprType, RhsXprType> >::type> {
+  typedef TensorAssignOp<LhsXprType, RhsXprType> type;
+};
+
+}  // end namespace internal
+
+/** The tensor assignment class.
+ * \ingroup CXX11_Tensor_Module
+ *
+ * This class is represents the assignment of the values resulting from the evaluation of
+ * the rhs expression to the memory locations denoted by the lhs expression.
+ */
+template <typename LhsXprType, typename RhsXprType>
+class TensorAssignOp : public TensorBase<TensorAssignOp<LhsXprType, RhsXprType> > {
+ public:
+  typedef typename Eigen::internal::traits<TensorAssignOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename LhsXprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorAssignOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorAssignOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorAssignOp>::Index Index;
+
+  static constexpr int NumDims = Eigen::internal::traits<TensorAssignOp>::NumDimensions;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorAssignOp(LhsXprType& lhs, const RhsXprType& rhs)
+      : m_lhs_xpr(lhs), m_rhs_xpr(rhs) {}
+
+  /** \returns the nested expressions */
+  EIGEN_DEVICE_FUNC internal::remove_all_t<typename LhsXprType::Nested>& lhsExpression() const {
+    return *((internal::remove_all_t<typename LhsXprType::Nested>*)&m_lhs_xpr);
+  }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename RhsXprType::Nested>& rhsExpression() const {
+    return m_rhs_xpr;
+  }
+
+ protected:
+  internal::remove_all_t<typename LhsXprType::Nested>& m_lhs_xpr;
+  const internal::remove_all_t<typename RhsXprType::Nested>& m_rhs_xpr;
+};
+
+template <typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorAssignOp<LeftArgType, RightArgType>, Device> {
+  typedef TensorAssignOp<LeftArgType, RightArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef typename TensorEvaluator<RightArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static constexpr int NumDims = XprType::NumDims;
+  static constexpr int Layout = TensorEvaluator<LeftArgType, Device>::Layout;
+
+  enum {
+    IsAligned =
+        int(TensorEvaluator<LeftArgType, Device>::IsAligned) & int(TensorEvaluator<RightArgType, Device>::IsAligned),
+    PacketAccess = int(TensorEvaluator<LeftArgType, Device>::PacketAccess) &
+                   int(TensorEvaluator<RightArgType, Device>::PacketAccess),
+    BlockAccess = int(TensorEvaluator<LeftArgType, Device>::BlockAccess) &
+                  int(TensorEvaluator<RightArgType, Device>::BlockAccess),
+    PreferBlockAccess = int(TensorEvaluator<LeftArgType, Device>::PreferBlockAccess) |
+                        int(TensorEvaluator<RightArgType, Device>::PreferBlockAccess),
+    RawAccess = TensorEvaluator<LeftArgType, Device>::RawAccess
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock RightTensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  TensorEvaluator(const XprType& op, const Device& device)
+      : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device) {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
+                         static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+  }
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const {
+    // The dimensions of the lhs and the rhs tensors should be equal to prevent
+    // overflows and ensure the result is fully initialized.
+    // TODO: use left impl instead if right impl dimensions are known at compile time.
+    return m_rightImpl.dimensions();
+  }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+    eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
+    m_leftImpl.evalSubExprsIfNeeded(NULL);
+    // If the lhs provides raw access to its storage area (i.e. if m_leftImpl.data() returns a non
+    // null value), attempt to evaluate the rhs expression in place. Returns true iff in place
+    // evaluation isn't supported and the caller still needs to manually assign the values generated
+    // by the rhs to the lhs.
+    return m_rightImpl.evalSubExprsIfNeeded(m_leftImpl.data());
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done](bool) {
+      m_rightImpl.evalSubExprsIfNeededAsync(m_leftImpl.data(), [done](bool need_assign) { done(need_assign); });
+    });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_leftImpl.cleanup();
+    m_rightImpl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const {
+    m_leftImpl.coeffRef(i) = m_rightImpl.coeff(i);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const {
+    const int LhsStoreMode = TensorEvaluator<LeftArgType, Device>::IsAligned ? Aligned : Unaligned;
+    const int RhsLoadMode = TensorEvaluator<RightArgType, Device>::IsAligned ? Aligned : Unaligned;
+    m_leftImpl.template writePacket<LhsStoreMode>(i, m_rightImpl.template packet<RhsLoadMode>(i));
+  }
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_leftImpl.coeff(index); }
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+    return m_leftImpl.template packet<LoadMode>(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // We assume that evalPacket or evalScalar is called to perform the
+    // assignment and account for the cost of the write here, but reduce left
+    // cost by one load because we are using m_leftImpl.coeffRef.
+    TensorOpCost left = m_leftImpl.costPerCoeff(vectorized);
+    return m_rightImpl.costPerCoeff(vectorized) +
+           TensorOpCost(numext::maxi(0.0, left.bytes_loaded() - sizeof(CoeffReturnType)), left.bytes_stored(),
+                        left.compute_cycles()) +
+           TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::merge(m_leftImpl.getResourceRequirements(),
+                                                            m_rightImpl.getResourceRequirements());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+    if (TensorEvaluator<LeftArgType, Device>::RawAccess && m_leftImpl.data() != NULL) {
+      // If destination has raw data access, we pass it as a potential
+      // destination for a block descriptor evaluation.
+      desc.template AddDestinationBuffer<Layout>(
+          /*dst_base=*/m_leftImpl.data() + desc.offset(),
+          /*dst_strides=*/internal::strides<Layout>(m_leftImpl.dimensions()));
+    }
+
+    RightTensorBlock block = m_rightImpl.block(desc, scratch, /*root_of_expr_ast=*/true);
+    // If block was evaluated into a destination, there is no need to do assignment.
+    if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
+      m_leftImpl.writeBlock(desc, block);
+    }
+    block.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_leftImpl.data(); }
+
+ private:
+  TensorEvaluator<LeftArgType, Device> m_leftImpl;
+  TensorEvaluator<RightArgType, Device> m_rightImpl;
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_ASSIGN_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
new file mode 100644
index 00000000..fc3f3b78
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -0,0 +1,1244 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_BASE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_BASE_H
+
+// clang-format off
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \class TensorBase
+  * \ingroup CXX11_Tensor_Module
+  *
+  * \brief The tensor base class.
+  *
+  * This class is the common parent of the Tensor and TensorMap class, thus
+  * making it possible to use either class interchangeably in expressions.
+  */
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+// FIXME Doxygen does not like the inheritance with different template parameters
+// Since there is no doxygen documentation inside, we disable it for now
+template<typename Derived>
+class TensorBase<Derived, ReadOnlyAccessors>
+{
+  public:
+    typedef internal::traits<Derived> DerivedTraits;
+    typedef typename DerivedTraits::Scalar Scalar;
+    typedef typename DerivedTraits::Index Index;
+    typedef std::remove_const_t<Scalar> CoeffReturnType;
+    static constexpr int NumDimensions = DerivedTraits::NumDimensions;
+
+    // Generic nullary operation support.
+    template <typename CustomNullaryOp> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<CustomNullaryOp, const Derived>
+    nullaryExpr(const CustomNullaryOp& func) const {
+      return TensorCwiseNullaryOp<CustomNullaryOp, const Derived>(derived(), func);
+    }
+
+    // Coefficient-wise nullary operators
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived>
+    constant(const Scalar& value) const {
+      return nullaryExpr(internal::scalar_constant_op<Scalar>(value));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<internal::UniformRandomGenerator<Scalar>, const Derived>
+    random() const {
+      return nullaryExpr(internal::UniformRandomGenerator<Scalar>());
+    }
+    template <typename RandomGenerator> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseNullaryOp<RandomGenerator, const Derived>
+    random(const RandomGenerator& gen = RandomGenerator()) const {
+      return nullaryExpr(gen);
+    }
+
+    // Tensor generation
+    template <typename Generator> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorGeneratorOp<Generator, const Derived>
+    generate(const Generator& generator) const {
+      return TensorGeneratorOp<Generator, const Derived>(derived(), generator);
+    }
+
+    // Generic unary operation support.
+    template <typename CustomUnaryOp> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<CustomUnaryOp, const Derived>
+    unaryExpr(const CustomUnaryOp& func) const {
+      return TensorCwiseUnaryOp<CustomUnaryOp, const Derived>(derived(), func);
+    }
+
+    // Coefficient-wise unary operators
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const Derived>
+    operator-() const {
+      return unaryExpr(internal::scalar_opposite_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
+    sqrt() const {
+      return unaryExpr(internal::scalar_sqrt_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived>
+    sign() const {
+      return unaryExpr(internal::scalar_sign_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_rsqrt_op<Scalar>, const Derived>
+    rsqrt() const {
+      return unaryExpr(internal::scalar_rsqrt_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_square_op<Scalar>, const Derived>
+    square() const {
+      return unaryExpr(internal::scalar_square_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_cube_op<Scalar>, const Derived>
+    cube() const {
+      return unaryExpr(internal::scalar_cube_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_inverse_op<Scalar>, const Derived>
+    inverse() const {
+      return unaryExpr(internal::scalar_inverse_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_tanh_op<Scalar>, const Derived>
+    tanh() const {
+      return unaryExpr(internal::scalar_tanh_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_lgamma_op<Scalar>, const Derived>
+    lgamma() const {
+      return unaryExpr(internal::scalar_lgamma_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_digamma_op<Scalar>, const Derived>
+    digamma() const {
+      return unaryExpr(internal::scalar_digamma_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i0_op<Scalar>, const Derived>
+    bessel_i0() const {
+      return unaryExpr(internal::scalar_bessel_i0_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i0e_op<Scalar>, const Derived>
+    bessel_i0e() const {
+      return unaryExpr(internal::scalar_bessel_i0e_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i1_op<Scalar>, const Derived>
+    bessel_i1() const {
+      return unaryExpr(internal::scalar_bessel_i1_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_i1e_op<Scalar>, const Derived>
+    bessel_i1e() const {
+      return unaryExpr(internal::scalar_bessel_i1e_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_j0_op<Scalar>, const Derived>
+    bessel_j0() const {
+      return unaryExpr(internal::scalar_bessel_j0_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_y0_op<Scalar>, const Derived>
+    bessel_y0() const {
+      return unaryExpr(internal::scalar_bessel_y0_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_j1_op<Scalar>, const Derived>
+    bessel_j1() const {
+      return unaryExpr(internal::scalar_bessel_j1_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_y1_op<Scalar>, const Derived>
+    bessel_y1() const {
+      return unaryExpr(internal::scalar_bessel_y1_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k0_op<Scalar>, const Derived>
+    bessel_k0() const {
+      return unaryExpr(internal::scalar_bessel_k0_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k0e_op<Scalar>, const Derived>
+    bessel_k0e() const {
+      return unaryExpr(internal::scalar_bessel_k0e_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k1_op<Scalar>, const Derived>
+    bessel_k1() const {
+      return unaryExpr(internal::scalar_bessel_k1_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_bessel_k1e_op<Scalar>, const Derived>
+    bessel_k1e() const {
+      return unaryExpr(internal::scalar_bessel_k1e_op<Scalar>());
+    }
+
+    // igamma(a = this, x = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_igamma_op<Scalar>, const Derived, const OtherDerived>
+    igamma(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_igamma_op<Scalar>());
+    }
+
+    // igamma_der_a(a = this, x = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_igamma_der_a_op<Scalar>, const Derived, const OtherDerived>
+    igamma_der_a(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_igamma_der_a_op<Scalar>());
+    }
+
+    // gamma_sample_der_alpha(alpha = this, sample = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_gamma_sample_der_alpha_op<Scalar>, const Derived, const OtherDerived>
+    gamma_sample_der_alpha(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_gamma_sample_der_alpha_op<Scalar>());
+    }
+
+    // igammac(a = this, x = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_igammac_op<Scalar>, const Derived, const OtherDerived>
+    igammac(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_igammac_op<Scalar>());
+    }
+
+    // zeta(x = this, q = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_zeta_op<Scalar>, const Derived, const OtherDerived>
+    zeta(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_zeta_op<Scalar>());
+    }
+
+    // polygamma(n = this, x = other)
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_polygamma_op<Scalar>, const Derived, const OtherDerived>
+    polygamma(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_polygamma_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_erf_op<Scalar>, const Derived>
+    erf() const {
+      return unaryExpr(internal::scalar_erf_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_erfc_op<Scalar>, const Derived>
+    erfc() const {
+      return unaryExpr(internal::scalar_erfc_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_ndtri_op<Scalar>, const Derived>
+    ndtri() const {
+      return unaryExpr(internal::scalar_ndtri_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_logistic_op<Scalar>, const Derived>
+    sigmoid() const {
+      return unaryExpr(internal::scalar_logistic_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_exp_op<Scalar>, const Derived>
+    exp() const {
+      return unaryExpr(internal::scalar_exp_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_expm1_op<Scalar>, const Derived>
+    expm1() const {
+      return unaryExpr(internal::scalar_expm1_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log_op<Scalar>, const Derived>
+    log() const {
+      return unaryExpr(internal::scalar_log_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived>
+    log1p() const {
+      return unaryExpr(internal::scalar_log1p_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log2_op<Scalar>, const Derived>
+    log2() const {
+      return unaryExpr(internal::scalar_log2_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
+    abs() const {
+      return unaryExpr(internal::scalar_abs_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_arg_op<Scalar>, const Derived>
+    arg() const {
+      return unaryExpr(internal::scalar_arg_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_clamp_op<Scalar>, const Derived>
+    clip(Scalar min, Scalar max) const {
+      return unaryExpr(internal::scalar_clamp_op<Scalar>(min, max));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const std::conditional_t<NumTraits<CoeffReturnType>::IsComplex,
+                                                      TensorCwiseUnaryOp<internal::scalar_conjugate_op<Scalar>, const Derived>,
+                                                      Derived>
+    conjugate() const {
+      return choose(Cond<NumTraits<CoeffReturnType>::IsComplex>(), unaryExpr(internal::scalar_conjugate_op<Scalar>()), derived());
+    }
+
+    template<typename ScalarExponent>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const std::enable_if_t<internal::is_arithmetic<typename NumTraits<ScalarExponent>::Real>::value,
+        TensorCwiseUnaryOp<internal::scalar_unary_pow_op<Scalar, ScalarExponent>, const Derived>>
+        pow(ScalarExponent exponent) const
+    {
+        return unaryExpr(internal::scalar_unary_pow_op<Scalar, ScalarExponent>(exponent));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_real_op<Scalar>, const Derived>
+    real() const {
+      return unaryExpr(internal::scalar_real_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived>
+    imag() const {
+      return unaryExpr(internal::scalar_imag_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >, const Derived>
+    operator+ (Scalar rhs) const {
+      return unaryExpr(internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >(rhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_sum_op<Scalar> >, const Derived>
+    operator+ (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_sum_op<Scalar> >(lhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >, const Derived>
+    operator- (Scalar rhs) const {
+      EIGEN_STATIC_ASSERT((NumTraits<Scalar>::IsSigned || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
+      return unaryExpr(internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >(rhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_difference_op<Scalar> >, const Derived>
+    operator- (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_difference_op<Scalar> >(lhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >, const Derived>
+    operator* (Scalar rhs) const {
+      return unaryExpr(internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >(rhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_product_op<Scalar> >, const Derived>
+    operator* (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_product_op<Scalar> >(lhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >, const Derived>
+    operator/ (Scalar rhs) const {
+      return unaryExpr(internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >(rhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_quotient_op<Scalar> >, const Derived>
+    operator/ (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_quotient_op<Scalar> >(lhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_mod_op<Scalar>, const Derived>
+    operator% (Scalar rhs) const {
+      EIGEN_STATIC_ASSERT(NumTraits<Scalar>::IsInteger, YOU_MADE_A_PROGRAMMING_MISTAKE_TRY_MOD);
+      return unaryExpr(internal::scalar_mod_op<Scalar>(rhs));
+    }
+
+    template <int NanPropagation=PropagateFast>
+    EIGEN_DEVICE_FUNC
+        EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar,NanPropagation>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    cwiseMax(Scalar threshold) const {
+      return cwiseMax<NanPropagation>(constant(threshold));
+    }
+
+    template <int NanPropagation=PropagateFast>
+    EIGEN_DEVICE_FUNC
+        EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar,NanPropagation>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    cwiseMin(Scalar threshold) const {
+      return cwiseMin<NanPropagation>(constant(threshold));
+    }
+
+    template<typename NewType>
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const std::conditional_t<internal::is_same<NewType, CoeffReturnType>::value,
+                                                      Derived,
+                                                      TensorConversionOp<NewType, const Derived> >
+    cast() const {
+      return choose(Cond<internal::is_same<NewType, CoeffReturnType>::value>(), derived(), TensorConversionOp<NewType, const Derived>(derived()));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_round_op<Scalar>, const Derived>
+    round() const {
+      return unaryExpr(internal::scalar_round_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_rint_op<Scalar>, const Derived>
+    rint() const {
+      return unaryExpr(internal::scalar_rint_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_ceil_op<Scalar>, const Derived>
+    ceil() const {
+      return unaryExpr(internal::scalar_ceil_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_floor_op<Scalar>, const Derived>
+    floor() const {
+      return unaryExpr(internal::scalar_floor_op<Scalar>());
+    }
+
+    // Generic binary operation support.
+    template <typename CustomBinaryOp, typename OtherDerived> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>
+    binaryExpr(const OtherDerived& other, const CustomBinaryOp& func) const {
+      return TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>(derived(), other, func);
+    }
+
+    // Coefficient-wise binary operators.
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const Derived, const OtherDerived>
+    operator+(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_sum_op<Scalar>());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const Derived, const OtherDerived>
+    operator-(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_difference_op<Scalar>());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_product_op<Scalar>, const Derived, const OtherDerived>
+    operator*(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_product_op<Scalar>());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_quotient_op<Scalar>, const Derived, const OtherDerived>
+    operator/(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_quotient_op<Scalar>());
+    }
+
+    template<int NaNPropagation=PropagateFast, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_max_op<Scalar,Scalar, NaNPropagation>, const Derived, const OtherDerived>
+    cwiseMax(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_max_op<Scalar,Scalar, NaNPropagation>());
+    }
+
+    template<int NaNPropagation=PropagateFast, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_min_op<Scalar,Scalar, NaNPropagation>, const Derived, const OtherDerived>
+    cwiseMin(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_min_op<Scalar,Scalar, NaNPropagation>());
+    }
+
+    // logical operators
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_boolean_and_op<Scalar>, const Derived, const OtherDerived>
+    operator&&(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_boolean_and_op<Scalar>());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_boolean_or_op<Scalar>, const Derived, const OtherDerived>
+    operator||(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_boolean_or_op<Scalar>());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_bitwise_and_op<Scalar>, const Derived, const OtherDerived>
+    operator&(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_bitwise_and_op<Scalar>());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_bitwise_or_op<Scalar>, const Derived, const OtherDerived>
+    operator|(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_bitwise_or_op<Scalar>());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_bitwise_xor_op<Scalar>, const Derived, const OtherDerived>
+    operator^(const OtherDerived& other) const {
+      return binaryExpr(other.derived(), internal::scalar_bitwise_xor_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseUnaryOp<internal::scalar_boolean_not_op<Scalar>, const Derived>
+    operator!() const {
+      return unaryExpr(internal::scalar_boolean_not_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseUnaryOp<internal::scalar_bitwise_not_op<Scalar>, const Derived>
+    operator~() const {
+      return unaryExpr(internal::scalar_bitwise_not_op<Scalar>());
+    }
+
+    // Comparisons and tests.
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
+    operator<(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) const {
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>());
+    }
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const OtherDerived>
+    operator<=(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) const {
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>());
+    }
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const OtherDerived>
+    operator>(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) const {
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>());
+    }
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const OtherDerived>
+    operator>=(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) const {
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>
+    operator==(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) const {
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>());
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
+    operator!=(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) const {
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>());
+    }
+
+    // comparisons and tests for Scalars
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    operator<(Scalar threshold) const {
+      return operator<(constant(threshold));
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    operator<=(Scalar threshold) const {
+      return operator<=(constant(threshold));
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    operator>(Scalar threshold) const {
+      return operator>(constant(threshold));
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    operator>=(Scalar threshold) const {
+      return operator>=(constant(threshold));
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    operator==(Scalar threshold) const {
+      return operator==(constant(threshold));
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    operator!=(Scalar threshold) const {
+      return operator!=(constant(threshold));
+    }
+
+    // Predicates.
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorConversionOp<bool, const TensorCwiseUnaryOp<internal::scalar_isnan_op<Scalar, true>, const Derived>>
+    (isnan)() const {
+      return unaryExpr(internal::scalar_isnan_op<Scalar, true>()).template cast<bool>();
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorConversionOp<bool, const TensorCwiseUnaryOp<internal::scalar_isinf_op<Scalar, true>, const Derived>>
+    (isinf)() const {
+      return unaryExpr(internal::scalar_isinf_op<Scalar, true>()).template cast<bool>();
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorConversionOp<bool, const TensorCwiseUnaryOp<internal::scalar_isfinite_op<Scalar, true>, const Derived>>
+    (isfinite)() const {
+      return unaryExpr(internal::scalar_isfinite_op<Scalar, true>()).template cast<bool>();
+    }
+
+    // Coefficient-wise ternary operators.
+    template<typename ThenDerived, typename ElseDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorSelectOp<const Derived, const ThenDerived, const ElseDerived>
+    select(const ThenDerived& thenTensor, const ElseDerived& elseTensor) const {
+      return TensorSelectOp<const Derived, const ThenDerived, const ElseDerived>(derived(), thenTensor.derived(), elseTensor.derived());
+    }
+
+    // Contractions.
+    typedef Eigen::IndexPair<Index> DimensionPair;
+
+    template<typename OtherDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const NoOpOutputKernel>
+    contract(const OtherDerived& other, const Dimensions& dims) const {
+      return TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const NoOpOutputKernel>(derived(), other.derived(), dims);
+    }
+
+    template<typename OtherDerived, typename Dimensions, typename OutputKernel> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const OutputKernel>
+    contract(const OtherDerived& other, const Dimensions& dims, const OutputKernel& output_kernel) const {
+      return TensorContractionOp<const Dimensions, const Derived, const OtherDerived, const OutputKernel>(derived(), other.derived(), dims, output_kernel);
+    }
+
+    // Convolutions.
+    template<typename KernelDerived, typename Dimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived>
+    convolve(const KernelDerived& kernel, const Dimensions& dims) const {
+      return TensorConvolutionOp<const Dimensions, const Derived, const KernelDerived>(derived(), kernel.derived(), dims);
+    }
+
+    // Fourier transforms
+    template <int FFTDataType, int FFTDirection, typename FFT> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>
+    fft(const FFT& dims) const {
+      return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), dims);
+    }
+
+    // Scan.
+    typedef TensorScanOp<internal::SumReducer<CoeffReturnType>, const Derived> TensorScanSumOp;
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorScanSumOp
+    cumsum(const Index& axis, bool exclusive = false) const {
+      return TensorScanSumOp(derived(), axis, exclusive);
+    }
+
+    typedef TensorScanOp<internal::ProdReducer<CoeffReturnType>, const Derived> TensorScanProdOp;
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorScanProdOp
+    cumprod(const Index& axis, bool exclusive = false) const {
+      return TensorScanProdOp(derived(), axis, exclusive);
+    }
+
+    template <typename Reducer>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorScanOp<Reducer, const Derived>
+    scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const {
+      return TensorScanOp<Reducer, const Derived>(derived(), axis, exclusive, reducer);
+    }
+
+    // Reductions.
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>
+    sum(const Dims& dims) const {
+      return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::SumReducer<CoeffReturnType>());
+    }
+
+    const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
+    sum() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorReductionOp<internal::SumReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::SumReducer<CoeffReturnType>());
+    }
+
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived>
+    mean(const Dims& dims) const {
+      return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::MeanReducer<CoeffReturnType>());
+    }
+
+    const TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
+    mean() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorReductionOp<internal::MeanReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MeanReducer<CoeffReturnType>());
+    }
+
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived>
+    prod(const Dims& dims) const {
+      return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const Dims, const Derived>(derived(), dims, internal::ProdReducer<CoeffReturnType>());
+    }
+
+    const TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>
+    prod() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorReductionOp<internal::ProdReducer<CoeffReturnType>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::ProdReducer<CoeffReturnType>());
+    }
+
+    template <typename Dims,int NanPropagation=PropagateFast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>
+    maximum(const Dims& dims) const {
+      return TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>(derived(), dims, internal::MaxReducer<CoeffReturnType,NanPropagation>());
+    }
+
+    template <int NanPropagation=PropagateFast>
+    const TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>
+    maximum() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorReductionOp<internal::MaxReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MaxReducer<CoeffReturnType,NanPropagation>());
+    }
+
+    template <typename Dims,int NanPropagation=PropagateFast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>
+    minimum(const Dims& dims) const {
+      return TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const Dims, const Derived>(derived(), dims, internal::MinReducer<CoeffReturnType,NanPropagation>());
+    }
+
+    template <int NanPropagation=PropagateFast>
+    const TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>
+    minimum() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorReductionOp<internal::MinReducer<CoeffReturnType,NanPropagation>, const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims, internal::MinReducer<CoeffReturnType,NanPropagation>());
+    }
+
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::AndReducer, const Dims, const std::conditional_t<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> > >
+    all(const Dims& dims) const {
+      return cast<bool>().reduce(dims, internal::AndReducer());
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::AndReducer, const DimensionList<Index, NumDimensions>, const std::conditional_t<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> > >
+    all() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return cast<bool>().reduce(in_dims, internal::AndReducer());
+    }
+
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::OrReducer, const Dims, const std::conditional_t<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> > >
+    any(const Dims& dims) const {
+      return cast<bool>().reduce(dims, internal::OrReducer());
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<internal::OrReducer, const DimensionList<Index, NumDimensions>, const std::conditional_t<internal::is_same<bool, CoeffReturnType>::value, Derived, TensorConversionOp<bool, const Derived> > >
+    any() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return cast<bool>().reduce(in_dims, internal::OrReducer());
+    }
+
+   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorPairReducerOp<
+      internal::ArgMaxPairReducer<Pair<Index, CoeffReturnType> >,
+      const array<Index, NumDimensions>, const Derived>
+    argmax() const {
+      array<Index, NumDimensions> in_dims;
+      for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d;
+      return TensorPairReducerOp<
+        internal::ArgMaxPairReducer<Pair<Index, CoeffReturnType> >,
+        const array<Index, NumDimensions>,
+        const Derived>(derived(), internal::ArgMaxPairReducer<Pair<Index, CoeffReturnType> >(), -1, in_dims);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorPairReducerOp<
+      internal::ArgMinPairReducer<Pair<Index, CoeffReturnType> >,
+      const array<Index, NumDimensions>, const Derived>
+    argmin() const {
+      array<Index, NumDimensions> in_dims;
+      for (Index d = 0; d < NumDimensions; ++d) in_dims[d] = d;
+      return TensorPairReducerOp<
+        internal::ArgMinPairReducer<Pair<Index, CoeffReturnType> >,
+        const array<Index, NumDimensions>,
+        const Derived>(derived(), internal::ArgMinPairReducer<Pair<Index, CoeffReturnType> >(), -1, in_dims);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorPairReducerOp<
+      internal::ArgMaxPairReducer<Pair<Index, CoeffReturnType> >,
+      const array<Index, 1>, const Derived>
+    argmax(const Index return_dim) const {
+      array<Index, 1> in_dims;
+      in_dims[0] = return_dim;
+      return TensorPairReducerOp<
+        internal::ArgMaxPairReducer<Pair<Index, CoeffReturnType> >,
+        const array<Index, 1>,
+        const Derived>(derived(), internal::ArgMaxPairReducer<Pair<Index, CoeffReturnType> >(), return_dim, in_dims);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorPairReducerOp<
+      internal::ArgMinPairReducer<Pair<Index, CoeffReturnType> >,
+      const array<Index, 1>, const Derived>
+    argmin(const Index return_dim) const {
+      array<Index, 1> in_dims;
+      in_dims[0] = return_dim;
+      return TensorPairReducerOp<
+        internal::ArgMinPairReducer<Pair<Index, CoeffReturnType> >,
+        const array<Index, 1>,
+        const Derived>(derived(), internal::ArgMinPairReducer<Pair<Index, CoeffReturnType> >(), return_dim, in_dims);
+    }
+
+    template <typename Reducer, typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReductionOp<Reducer, const Dims, const Derived>
+    reduce(const Dims& dims, const Reducer& reducer) const {
+      return TensorReductionOp<Reducer, const Dims, const Derived>(derived(), dims, reducer);
+    }
+
+    template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorTraceOp<const Dims, const Derived>
+    trace(const Dims& dims) const {
+      return TensorTraceOp<const Dims, const Derived>(derived(), dims);
+    }
+
+    const TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived>
+    trace() const {
+      DimensionList<Index, NumDimensions> in_dims;
+      return TensorTraceOp<const DimensionList<Index, NumDimensions>, const Derived>(derived(), in_dims);
+    }
+
+    template <typename Broadcast> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorBroadcastingOp<const Broadcast, const Derived>
+    broadcast(const Broadcast& bcast) const {
+      return TensorBroadcastingOp<const Broadcast, const Derived>(derived(), bcast);
+    }
+
+    template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorConcatenationOp<Axis, const Derived, const OtherDerived>
+    concatenate(const OtherDerived& other, Axis axis) const {
+      return TensorConcatenationOp<Axis, const Derived, const OtherDerived>(derived(), other.derived(), axis);
+    }
+
+    template <typename PatchDims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorPatchOp<const PatchDims, const Derived>
+    extract_patches(const PatchDims& patch_dims) const {
+      return TensorPatchOp<const PatchDims, const Derived>(derived(), patch_dims);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
+    extract_image_patches(const Index patch_rows = 1, const Index patch_cols = 1,
+                          const Index row_stride = 1, const Index col_stride = 1,
+                          const Index in_row_stride = 1, const Index in_col_stride = 1,
+                          const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const {
+      return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
+                                                                 in_row_stride, in_col_stride, 1, 1, padding_type, padding_value);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorImagePatchOp<Dynamic, Dynamic, const Derived>
+    extract_image_patches(const Index patch_rows, const Index patch_cols,
+                          const Index row_stride, const Index col_stride,
+                          const Index in_row_stride, const Index in_col_stride,
+                          const Index row_inflate_stride, const Index col_inflate_stride,
+                          const Index padding_top, const Index padding_bottom,
+                          const Index padding_left,const Index padding_right,
+                          const Scalar padding_value) const {
+      return TensorImagePatchOp<Dynamic, Dynamic, const Derived>(derived(), patch_rows, patch_cols, row_stride, col_stride,
+                                                                 in_row_stride, in_col_stride, row_inflate_stride, col_inflate_stride,
+                                                                 padding_top, padding_bottom, padding_left, padding_right, padding_value);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>
+    extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols,
+                           const Index plane_stride = 1, const Index row_stride = 1, const Index col_stride = 1,
+                           const PaddingType padding_type = PADDING_SAME, const Scalar padding_value = Scalar(0)) const {
+      return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, 1, 1, 1, padding_type, padding_value);
+    }
+
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>
+    extract_volume_patches(const Index patch_planes, const Index patch_rows, const Index patch_cols,
+                           const Index plane_stride, const Index row_stride, const Index col_stride,
+                           const Index plane_inflate_stride, const Index row_inflate_stride, const Index col_inflate_stride,
+                           const Index padding_top_z, const Index padding_bottom_z,
+                           const Index padding_top, const Index padding_bottom,
+                           const Index padding_left, const Index padding_right, const Scalar padding_value = Scalar(0)) const {
+      return TensorVolumePatchOp<Dynamic, Dynamic, Dynamic, const Derived>(derived(), patch_planes, patch_rows, patch_cols, plane_stride, row_stride, col_stride, 1, 1, 1, plane_inflate_stride, row_inflate_stride, col_inflate_stride, padding_top_z, padding_bottom_z, padding_top, padding_bottom, padding_left, padding_right, padding_value);
+    }
+
+    // Morphing operators.
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorLayoutSwapOp<const Derived>
+    swap_layout() const {
+      return TensorLayoutSwapOp<const Derived>(derived());
+    }
+    template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReshapingOp<const NewDimensions, const Derived>
+    reshape(const NewDimensions& newDimensions) const {
+      return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions);
+    }
+    template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorSlicingOp<const StartIndices, const Sizes, const Derived>
+    slice(const StartIndices& startIndices, const Sizes& sizes) const {
+      return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes);
+    }
+    template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived>
+    stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const {
+      return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
+                                const Derived>(derived(), startIndices, stopIndices, strides);
+    }
+    template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorChippingOp<DimId, const Derived>
+    chip(const Index offset) const {
+      EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range)
+      return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId);
+    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorChippingOp<Dynamic, const Derived>
+    chip(const Index offset, const Index dim) const {
+      return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim);
+    }
+    template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReverseOp<const ReverseDimensions, const Derived>
+    reverse(const ReverseDimensions& rev) const {
+      return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev);
+    }
+    template <typename Rolls> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorRollOp<const Rolls, const Derived>
+    roll(const Rolls& rolls) const {
+      return TensorRollOp<const Rolls, const Derived>(derived(), rolls);
+    }
+    template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorPaddingOp<const PaddingDimensions, const Derived>
+    pad(const PaddingDimensions& padding) const {
+      return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding, internal::scalar_cast_op<int, Scalar>()(0));
+    }
+    template <typename PaddingDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorPaddingOp<const PaddingDimensions, const Derived>
+    pad(const PaddingDimensions& padding, const Scalar padding_value) const {
+      return TensorPaddingOp<const PaddingDimensions, const Derived>(derived(), padding, padding_value);
+    }
+    template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorShufflingOp<const Shuffle, const Derived>
+    shuffle(const Shuffle& shfl) const {
+      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shfl);
+    }
+    template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorStridingOp<const Strides, const Derived>
+    stride(const Strides& strides) const {
+      return TensorStridingOp<const Strides, const Derived>(derived(), strides);
+    }
+    template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorInflationOp<const Strides, const Derived>
+    inflate(const Strides& strides) const {
+      return TensorInflationOp<const Strides, const Derived>(derived(), strides);
+    }
+
+    // Returns a tensor containing index/value pairs
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorIndexPairOp<const Derived>
+    index_pairs() const {
+      return TensorIndexPairOp<const Derived>(derived());
+    }
+
+    // Support for custom unary and binary operations
+    template <typename CustomUnaryFunc>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCustomUnaryOp<const CustomUnaryFunc, const Derived> customOp(const CustomUnaryFunc& op) const {
+      return TensorCustomUnaryOp<const CustomUnaryFunc, const Derived>(derived(), op);
+    }
+    template <typename OtherDerived, typename CustomBinaryFunc>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorCustomBinaryOp<const CustomBinaryFunc, const Derived, const OtherDerived> customOp(const OtherDerived& other, const CustomBinaryFunc& op) const {
+      return TensorCustomBinaryOp<const CustomBinaryFunc, const Derived, const OtherDerived>(derived(), other, op);
+    }
+
+    // Force the evaluation of the expression.
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorForcedEvalOp<const Derived> eval() const {
+      return TensorForcedEvalOp<const Derived>(derived());
+    }
+
+    // Returns a formatted tensor ready for printing to a stream
+    template<typename Format>
+    inline const TensorWithFormat<Derived,DerivedTraits::Layout,DerivedTraits::NumDimensions, Format> format(const Format& fmt) const {
+      return TensorWithFormat<Derived,DerivedTraits::Layout,DerivedTraits::NumDimensions, Format>(derived(), fmt);
+    }
+
+    #ifdef EIGEN_READONLY_TENSORBASE_PLUGIN
+    #include EIGEN_READONLY_TENSORBASE_PLUGIN
+    #endif
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
+
+  protected:
+    template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
+    template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
+    // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0
+    template <typename OtherDerived, int AccessLevel> friend class Eigen::TensorBase;
+};
+
+template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value>
+class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
+ public:
+    typedef TensorBase<Derived, ReadOnlyAccessors> Base;
+    typedef internal::traits<Derived> DerivedTraits;
+    typedef typename DerivedTraits::Scalar Scalar;
+    typedef typename DerivedTraits::Index Index;
+    typedef Scalar CoeffReturnType;
+    static constexpr int NumDimensions = DerivedTraits::NumDimensions;
+
+    template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
+    template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
+    // the Eigen:: prefix is required to workaround a compilation issue with nvcc 9.0
+    template <typename OtherDerived, int OtherAccessLevel> friend class Eigen::TensorBase;
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& setZero() {
+      return setConstant(Scalar(0));
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& setConstant(const Scalar& val) {
+      return derived() = this->constant(val);
+    }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& setRandom() {
+      return derived() = this->random();
+    }
+    template <typename RandomGenerator> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& setRandom() {
+      return derived() = this->template random<RandomGenerator>();
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& setValues(
+        const typename internal::Initializer<Derived, NumDimensions>::InitList& vals) {
+      TensorEvaluator<Derived, DefaultDevice> eval(derived(), DefaultDevice());
+      internal::initialize_tensor<Derived, NumDimensions>(eval, vals);
+      return derived();
+    }
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator+=(const OtherDerived& other) {
+      return derived() = derived() + other.derived();
+    }
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator-=(const OtherDerived& other) {
+      return derived() = derived() - other.derived();
+    }
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator*=(const OtherDerived& other) {
+      return derived() = derived() * other.derived();
+    }
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    Derived& operator/=(const OtherDerived& other) {
+      return derived() = derived() / other.derived();
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorLayoutSwapOp<const Derived>
+    swap_layout() const {
+      return TensorLayoutSwapOp<const Derived>(derived());
+    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorLayoutSwapOp<Derived>
+    swap_layout() {
+      return TensorLayoutSwapOp<Derived>(derived());
+    }
+
+    template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorConcatenationOp<const Axis, const Derived, const OtherDerived>
+    concatenate(const OtherDerived& other, const Axis& axis) const {
+      return TensorConcatenationOp<const Axis, const Derived, const OtherDerived>(derived(), other, axis);
+    }
+    template <typename Axis, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorConcatenationOp<const Axis, Derived, OtherDerived>
+    concatenate(const OtherDerived& other, const Axis& axis) {
+      return TensorConcatenationOp<const Axis, Derived, OtherDerived>(derived(), other, axis);
+    }
+
+    template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReshapingOp<const NewDimensions, const Derived>
+    reshape(const NewDimensions& newDimensions) const {
+      return TensorReshapingOp<const NewDimensions, const Derived>(derived(), newDimensions);
+    }
+    template <typename NewDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorReshapingOp<const NewDimensions, Derived>
+    reshape(const NewDimensions& newDimensions) {
+      return TensorReshapingOp<const NewDimensions, Derived>(derived(), newDimensions);
+    }
+
+    template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorSlicingOp<const StartIndices, const Sizes, const Derived>
+    slice(const StartIndices& startIndices, const Sizes& sizes) const {
+      return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes);
+    }
+    template <typename StartIndices, typename Sizes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorSlicingOp<const StartIndices, const Sizes, Derived>
+    slice(const StartIndices& startIndices, const Sizes& sizes) {
+      return TensorSlicingOp<const StartIndices, const Sizes, Derived>(derived(), startIndices, sizes);
+    }
+
+    template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived>
+    stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const {
+      return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
+                                const Derived>(derived(), startIndices, stopIndices, strides);
+    }
+    template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, Derived>
+    stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) {
+      return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides,
+                                Derived>(derived(), startIndices, stopIndices, strides);
+    }
+
+    template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorChippingOp<DimId, const Derived>
+    chip(const Index offset) const {
+      EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range)
+      return TensorChippingOp<DimId, const Derived>(derived(), offset, DimId);
+    }
+    template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorChippingOp<DimId, Derived>
+    chip(const Index offset) {
+      EIGEN_STATIC_ASSERT(DimId < Derived::NumDimensions && DimId >= 0, Chip_Dim_out_of_range)
+      return TensorChippingOp<DimId, Derived>(derived(), offset, DimId);
+    }
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorChippingOp<Dynamic, const Derived>
+    chip(const Index offset, const Index dim) const {
+      return TensorChippingOp<Dynamic, const Derived>(derived(), offset, dim);
+    }
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorChippingOp<Dynamic, Derived>
+    chip(const Index offset, const Index dim) {
+      return TensorChippingOp<Dynamic, Derived>(derived(), offset, dim);
+    }
+
+    template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorReverseOp<const ReverseDimensions, const Derived>
+    reverse(const ReverseDimensions& rev) const {
+      return TensorReverseOp<const ReverseDimensions, const Derived>(derived(), rev);
+    }
+    template <typename ReverseDimensions> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorReverseOp<const ReverseDimensions, Derived>
+    reverse(const ReverseDimensions& rev) {
+      return TensorReverseOp<const ReverseDimensions, Derived>(derived(), rev);
+    }
+
+    template <typename Rolls> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorRollOp<const Rolls, const Derived>
+    roll(const Rolls& roll) const {
+      return TensorRollOp<const Rolls, const Derived>(derived(), roll);
+    }
+    template <typename Rolls> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorRollOp<const Rolls, Derived>
+    roll(const Rolls& roll) {
+      return TensorRollOp<const Rolls, Derived>(derived(), roll);
+    }
+
+    template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorShufflingOp<const Shuffle, const Derived>
+    shuffle(const Shuffle& shfl) const {
+      return TensorShufflingOp<const Shuffle, const Derived>(derived(), shfl);
+    }
+    template <typename Shuffle> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorShufflingOp<const Shuffle, Derived>
+    shuffle(const Shuffle& shfl) {
+      return TensorShufflingOp<const Shuffle, Derived>(derived(), shfl);
+    }
+
+    template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorStridingOp<const Strides, const Derived>
+    stride(const Strides& strides) const {
+      return TensorStridingOp<const Strides, const Derived>(derived(), strides);
+    }
+    template <typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    TensorStridingOp<const Strides, Derived>
+    stride(const Strides& strides) {
+      return TensorStridingOp<const Strides, Derived>(derived(), strides);
+    }
+
+    // Select the device on which to evaluate the expression.
+    template <typename DeviceType>
+    TensorDevice<Derived, DeviceType> device(const DeviceType& dev) {
+      return TensorDevice<Derived, DeviceType>(dev, derived());
+    }
+
+    // Select the async device on which to evaluate the expression.
+    template <typename DeviceType, typename DoneCallback>
+    TensorAsyncDevice<Derived, DeviceType, DoneCallback> device(const DeviceType& dev, DoneCallback done) {
+      return TensorAsyncDevice<Derived, DeviceType, DoneCallback>(dev, derived(), std::move(done));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& derived() { return *static_cast<Derived*>(this); }
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
+
+    #ifdef EIGEN_TENSORBASE_PLUGIN
+    #include EIGEN_TENSORBASE_PLUGIN
+    #endif
+
+ protected:
+    EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TensorBase)
+    EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorBase)
+
+    template<typename OtherDerived> EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other)
+    {
+      typedef TensorAssignOp<Derived, const OtherDerived> Assign;
+      Assign assign(derived(), other.derived());
+      internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+      return derived();
+    }
+};
+#endif // EIGEN_PARSED_BY_DOXYGEN
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_BASE_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
new file mode 100644
index 00000000..0b068a7c
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h
@@ -0,0 +1,1474 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
+#define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+// -------------------------------------------------------------------------- //
+// Forward declarations for templates defined below.
+template <typename Scalar, typename IndexType, int NumDims, int Layout>
+class TensorBlockIO;
+
+// -------------------------------------------------------------------------- //
+// Helper function to compute strides for densely stored buffer of given
+// dimensions.
+
+// TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
+// this function instead everywhere.
+template <int Layout, typename IndexType, int NumDims>
+EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(const DSizes<IndexType, NumDims>& dimensions) {
+  DSizes<IndexType, NumDims> strides;
+  if (NumDims == 0) return strides;
+
+  // TODO(ezhulenev): Use templates to unroll this loop (similar to
+  // h_array_reduce in CXX11meta.h)? Benchmark it.
+  if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+    strides[0] = 1;
+    for (int i = 1; i < NumDims; ++i) {
+      strides[i] = strides[i - 1] * dimensions[i - 1];
+    }
+  } else {
+    strides[NumDims - 1] = 1;
+    for (int i = NumDims - 2; i >= 0; --i) {
+      strides[i] = strides[i + 1] * dimensions[i + 1];
+    }
+  }
+
+  return strides;
+}
+
+template <int Layout, typename IndexType, size_t NumDims>
+EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(const Eigen::array<IndexType, NumDims>& dimensions) {
+  return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
+}
+
+template <int Layout, std::ptrdiff_t... Indices>
+EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(const Sizes<Indices...>& sizes) {
+  return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
+}
+
+// -------------------------------------------------------------------------- //
+
+// Tensor block shape type defines what are the shape preference for the blocks
+// extracted from the larger tensor.
+//
+// Example: blocks of 100 elements from the large 100x100 tensor:
+// - tensor: 100x100
+// - target_block_size: 100
+//
+// TensorBlockShapeType:
+//  - kUniformAllDims: 100 blocks of size 10x10
+//  - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
+//                      or row major layout)
+enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
+
+struct TensorBlockResourceRequirements {
+  TensorBlockShapeType shape_type;  // target block shape
+  size_t size;                      // target block size
+  TensorOpCost cost_per_coeff;      // cost of computing a single block element
+
+#ifdef EIGEN_HIPCC
+  // For HIPCC, we need to explicitly declare as a "device fun", the constructor
+  // which is implicitly invoked in the "merge" / "any" routines. else HIPCC
+  // errors out complaining about the lack of a matching constructor
+  EIGEN_DEVICE_FUNC TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_, TensorOpCost cost_)
+      : shape_type(shape_type_), size(size_), cost_per_coeff(cost_) {}
+#endif
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type,
+                                                                            size_t size_in_bytes, TensorOpCost cost) {
+    const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
+    return {shape_type, size, cost};
+  }
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(TensorBlockShapeType shape_type,
+                                                                            size_t size_in_bytes) {
+    // This default cost per coefficient is valid for most materialized tensor
+    // block evaluation implementations, because they typically just read
+    // coefficients from the underlying tensor storage, and write to the tensor
+    // block buffer (scratch or destination memory, reads and writes have linear
+    // access pattern). We ignore the fixed cost of block evaluation, because in
+    // practice it should negligible.
+    //
+    // Lazy block evaluation adds the cost of calling a functor for each
+    // coefficient.
+    //
+    // All non-trivial block evaluation implementations must provide their own
+    // cost approximation (e.g. shuffling inner dimension has a much higher cost
+    // because it reads memory randomly, although the total number of moved
+    // bytes is the same).
+    return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
+                                    {/*bytes_loaded=*/sizeof(Scalar),
+                                     /*bytes_stored=*/sizeof(Scalar),
+                                     /*compute_cycles=*/0});
+  }
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(size_t size_in_bytes) {
+    return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims, size_in_bytes);
+  }
+
+  template <typename Scalar>
+  EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(size_t size_in_bytes) {
+    return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims, size_in_bytes);
+  }
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
+  merge(const TensorBlockResourceRequirements& lhs, const TensorBlockResourceRequirements& rhs) {
+    return {merge(lhs.shape_type, rhs.shape_type),           // shape_type
+            merge(lhs.size, rhs.size),                       // size
+            merge(lhs.cost_per_coeff, rhs.cost_per_coeff)};  // cost_per_coeff
+  }
+
+  EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(TensorOpCost cost) {
+    cost_per_coeff += cost;
+    return *this;
+  }
+
+  // This is a resource requirement that should be returned from expressions
+  // that do not have any block evaluation preference (e.g. default tensor
+  // expression with raw buffer access).
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
+    return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
+  }
+
+ private:
+  using Requirements = TensorBlockResourceRequirements;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
+    return numext::maxi(lhs_size, rhs_size);
+  }
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorBlockShapeType merge(TensorBlockShapeType lhs,
+                                                                          TensorBlockShapeType rhs) {
+    return (lhs == TensorBlockShapeType::kSkewedInnerDims || rhs == TensorBlockShapeType::kSkewedInnerDims)
+               ? TensorBlockShapeType::kSkewedInnerDims
+               : TensorBlockShapeType::kUniformAllDims;
+  }
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost, TensorOpCost rhs_cost) {
+    return lhs_cost + rhs_cost;
+  }
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockDescriptor specifies a block offset within a tensor and the block
+// sizes along each of the tensor dimensions.
+
+template <int NumDims, typename IndexType = Eigen::Index>
+class TensorBlockDescriptor {
+ public:
+  typedef DSizes<IndexType, NumDims> Dimensions;
+
+  // If we evaluate a Tensor assignment, and expression on the left, already has
+  // a memory buffer, then we might do performance optimization, and evaluate
+  // the root expression directly into the final output memory. Some time it's
+  // possible to reuse it for materializing subexpressions inside an expression
+  // tree, to to avoid dynamic memory allocation.
+  //
+  // The pointer type of the underlying storage is erased, because passing
+  // Scalar type through all the expression evaluation layers is way too many
+  // templates. In practice destination buffer type should always match the
+  // evaluated expression scalar type.
+  class DestinationBuffer {
+   public:
+    enum DestinationBufferKind : int {
+      // The above explicit specification of "int" as the enum basetype is
+      // needed to get around a HIPCC link error ("the field type is not
+      // amp-compatible")
+      // which is issued for class members with the enum type.
+      // TODO(rocm):
+      // remove the "int" basetype once HIPCC has been fixed to not error out
+      // in the above scenario.
+
+      // Destination buffer is not defined (`m_data` == nullptr).
+      kEmpty,
+
+      // Tensor block defined by an owning tensor block descriptor can fit
+      // contiguously into the destination buffer. In this case it's safe to
+      // materialize tensor block in the destination buffer, wrap it in a
+      // TensorMap, and use to build Eigen expression on top of it.
+      kContiguous,
+
+      // Destination buffer strides do not match strides of the contiguously
+      // stored block, and it's impossible to define a TensorMap over this
+      // buffer. However if we are evaluating a root of an expression tree, we
+      // still can materialize an output into this destination, because we can
+      // guarantee that no one will ever access it through block API.
+      //
+      // In theory it is possible to build valid TensorStriding<TensorMap>
+      // expression on top of this destination buffer, however it has
+      // inefficient coeff/packet access, and defeats the purpose of fast block
+      // evaluation API.
+      kStrided
+    };
+
+    template <typename Scalar>
+    Scalar* data() const {
+      eigen_assert(m_data_type_size == sizeof(Scalar));
+      return static_cast<Scalar*>(m_data);
+    }
+
+    const Dimensions& strides() const { return m_strides; }
+    const DestinationBufferKind& kind() const { return m_kind; }
+
+   private:
+    friend class TensorBlockDescriptor<NumDims, IndexType>;
+
+    DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
+
+    template <typename Scalar>
+    DestinationBuffer(Scalar* data, const Dimensions& strides, DestinationBufferKind kind)
+        : m_data(static_cast<void*>(data)), m_data_type_size(sizeof(Scalar)), m_strides(strides), m_kind(kind) {}
+
+    template <int Layout, typename Scalar>
+    static DestinationBuffer make(const TensorBlockDescriptor& desc, Scalar* data, const Dimensions& strides) {
+      return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
+    }
+
+    template <int Layout>
+    static DestinationBufferKind kind(const TensorBlockDescriptor& desc, const Dimensions& strides) {
+      const Dimensions& desc_dims = desc.dimensions();
+      const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
+      for (int i = 0; i < NumDims; ++i) {
+        if (desc_dims[i] == 1) continue;
+        if (desc_strides[i] != strides[i]) return kStrided;
+      }
+      return kContiguous;
+    }
+
+    // Storage pointer is type erased, to reduce template bloat, but we still
+    // keep the size of the underlying element type for error checking.
+    void* m_data;
+    size_t m_data_type_size;
+
+    // Destination buffer dimensions always match the dimensions of a tensor
+    // block descriptor it belongs to, however strides might be different.
+    Dimensions m_strides;
+
+    DestinationBufferKind m_kind;
+  };
+
+  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions, const DestinationBuffer& destination)
+      : m_offset(offset), m_dimensions(dimensions), m_destination(destination) {}
+
+  TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
+      : m_offset(offset), m_dimensions(dimensions), m_destination(DestinationBuffer()) {}
+
+  IndexType offset() const { return m_offset; }
+  const Dimensions& dimensions() const { return m_dimensions; }
+  IndexType dimension(int index) const { return m_dimensions[index]; }
+  IndexType size() const { return array_prod<IndexType>(m_dimensions); }
+
+  const DestinationBuffer& destination() const { return m_destination; }
+
+  template <int Layout, typename Scalar>
+  void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
+    eigen_assert(dst_base != NULL);
+    m_destination = DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
+  }
+
+  template <int Layout, typename Scalar, typename DstStridesIndexType>
+  void AddDestinationBuffer(Scalar* dst_base, const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
+    // DSizes constructor will do index type promotion if it's safe.
+    AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
+  }
+
+  TensorBlockDescriptor& DropDestinationBuffer() {
+    m_destination.m_data = NULL;
+    m_destination.m_kind = DestinationBuffer::kEmpty;
+    return *this;
+  }
+
+  bool HasDestinationBuffer() const { return m_destination.kind() != DestinationBuffer::kEmpty; }
+
+  // Returns a copy of `*this` with updated offset.
+  TensorBlockDescriptor WithOffset(IndexType offset) const {
+    return TensorBlockDescriptor(offset, m_dimensions, m_destination);
+  }
+
+ private:
+  // Offset and dimensions are immutable after construction. Block descriptor
+  // can only be mutated by adding or dropping destination.
+  const IndexType m_offset;
+  const Dimensions m_dimensions;
+  DestinationBuffer m_destination;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockMapper is responsible for iterating over the blocks of a tensor.
+
+template <int NumDims, int Layout, typename IndexType = Eigen::Index>
+class TensorBlockMapper {
+  typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
+
+ public:
+  typedef DSizes<IndexType, NumDims> Dimensions;
+
+  TensorBlockMapper() = default;
+  TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions, const TensorBlockResourceRequirements& requirements)
+      : m_tensor_dimensions(dimensions), m_requirements(requirements) {
+    // Compute block dimensions and the total number of blocks.
+    InitializeBlockDimensions();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const { return m_total_block_count; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const { return m_block_dimensions.TotalSize(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>& blockDimensions() const {
+    return m_block_dimensions;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor blockDescriptor(IndexType block_index) const {
+    static const bool isColMajor = Layout == static_cast<int>(ColMajor);
+
+    IndexType offset = 0;
+    DSizes<IndexType, NumDims> dimensions;
+
+    if (NumDims == 0) return BlockDescriptor(offset, dimensions);
+
+    // Iterate outer -> inner dimensions.
+    for (int i = NumDims - 1; i >= 0; --i) {
+      const int dim = isColMajor ? i : NumDims - i - 1;
+
+      const IndexType idx = block_index / m_block_strides[dim];
+      block_index -= idx * m_block_strides[dim];
+
+      const IndexType coord = idx * m_block_dimensions[dim];
+      dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord, m_block_dimensions[dim]);
+      offset += coord * m_tensor_strides[dim];
+    }
+
+    return {offset, dimensions};
+  }
+
+ private:
+  void InitializeBlockDimensions() {
+    // Requested block shape and size.
+    const TensorBlockShapeType shape_type = m_requirements.shape_type;
+    IndexType target_block_size = numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
+
+    IndexType tensor_size = m_tensor_dimensions.TotalSize();
+
+    // Corner case: one of the dimensions is zero. Logic below is too complex
+    // to handle this case on a general basis, just use unit block size.
+    // Note: we must not yield blocks with zero dimensions (recipe for
+    // overflows/underflows, divisions by zero and NaNs later).
+    if (tensor_size == 0) {
+      for (int i = 0; i < NumDims; ++i) {
+        m_block_dimensions[i] = 1;
+      }
+      m_total_block_count = 0;
+      return;
+    }
+
+    // If tensor fits into a target block size, evaluate it as a single block.
+    if (tensor_size <= target_block_size) {
+      m_block_dimensions = m_tensor_dimensions;
+      m_total_block_count = 1;
+      // The only valid block index is `0`, and in this case we do not need
+      // to compute real strides for tensor or blocks (see blockDescriptor).
+      for (int i = 0; i < NumDims; ++i) {
+        m_tensor_strides[i] = 0;
+        m_block_strides[i] = 1;
+      }
+      return;
+    }
+
+    static const bool isColMajor = Layout == static_cast<int>(ColMajor);
+
+    // Block shape skewed towards inner dimension.
+    if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
+      IndexType coeff_to_allocate = target_block_size;
+
+      for (int i = 0; i < NumDims; ++i) {
+        const int dim = isColMajor ? i : NumDims - i - 1;
+        m_block_dimensions[dim] = numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
+        coeff_to_allocate =
+            numext::div_ceil(coeff_to_allocate, numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
+      }
+      eigen_assert(coeff_to_allocate == 1);
+
+    } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
+      // Tensor will not fit within 'target_block_size' budget: calculate tensor
+      // block dimension sizes based on "square" dimension size target.
+      const IndexType dim_size_target = convert_index<IndexType>(
+          std::pow(static_cast<float>(target_block_size), 1.0f / static_cast<float>(m_block_dimensions.rank())));
+
+      for (int i = 0; i < NumDims; ++i) {
+        // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
+        // a multiple of the packet size. Note that reducing
+        // 'block_dim_size' in this manner can increase the number of
+        // blocks, and so will amplify any per-block overhead.
+        m_block_dimensions[i] = numext::mini(dim_size_target, m_tensor_dimensions[i]);
+      }
+
+      // Add any un-allocated coefficients to inner dimension(s).
+      IndexType total_size = m_block_dimensions.TotalSize();
+      for (int i = 0; i < NumDims; ++i) {
+        const int dim = isColMajor ? i : NumDims - i - 1;
+
+        if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
+          const IndexType total_size_other_dims = total_size / m_block_dimensions[dim];
+          const IndexType alloc_avail = numext::div_ceil<IndexType>(target_block_size, total_size_other_dims);
+          if (alloc_avail == m_block_dimensions[dim]) {
+            // Insufficient excess coefficients to allocate.
+            break;
+          }
+          m_block_dimensions[dim] = numext::mini(m_tensor_dimensions[dim], alloc_avail);
+          total_size = total_size_other_dims * m_block_dimensions[dim];
+        }
+      }
+
+    } else {
+      eigen_assert(false);  // unknown block shape
+    }
+
+    eigen_assert(m_block_dimensions.TotalSize() >=
+                 numext::mini<IndexType>(target_block_size, m_tensor_dimensions.TotalSize()));
+
+    // Calculate block counts by dimension and total block count.
+    DSizes<IndexType, NumDims> block_count;
+    for (int i = 0; i < NumDims; ++i) {
+      block_count[i] = numext::div_ceil(m_tensor_dimensions[i], m_block_dimensions[i]);
+    }
+    m_total_block_count = array_prod(block_count);
+
+    // Calculate block strides (used for enumerating blocks).
+    m_tensor_strides = strides<Layout>(m_tensor_dimensions);
+    m_block_strides = strides<Layout>(block_count);
+  }
+
+  DSizes<IndexType, NumDims> m_tensor_dimensions;
+  TensorBlockResourceRequirements m_requirements;
+
+  DSizes<IndexType, NumDims> m_block_dimensions;
+  IndexType m_total_block_count;
+
+  DSizes<IndexType, NumDims> m_tensor_strides;
+  DSizes<IndexType, NumDims> m_block_strides;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockScratchAllocator is responsible for allocating temporary buffers
+// for block evaluation (output or input block materialization). Given that
+// Eigen expression traversal order is deterministic, all temporary allocations
+// are happening in the same order, and usually have exactly the same size.
+// Scratch allocator keeps a trace of all dynamic allocations, and after the
+// first block evaluation is completed, we should be able to reuse all the
+// temporary buffers for the next block evaluation.
+
+template <typename Device>
+class TensorBlockScratchAllocator {
+ public:
+  explicit TensorBlockScratchAllocator(const Device& device) : m_device(device), m_allocation_index(0) {}
+
+  ~TensorBlockScratchAllocator() {
+    for (size_t i = 0; i < m_allocations.size(); ++i) {
+      m_device.deallocate(m_allocations[i].ptr);
+    }
+  }
+
+  void* allocate(size_t size) {
+    // TODO(ezhulenev): Remove when replaced with inlined vector.
+    if (m_allocations.capacity() == 0) m_allocations.reserve(8);
+
+    // Check if we already have an existing allocation att current index.
+    const int num_allocations = static_cast<int>(m_allocations.size());
+    const bool has_allocation = m_allocation_index < num_allocations;
+
+    // Allocation index can't be larger than the number of allocations.
+    eigen_assert(m_allocation_index <= num_allocations);
+
+    // If we have existing allocation, and its size is larger or equal to
+    // requested size, we do nothing.
+
+    // If current allocation can't fit requested size, we deallocate it, and
+    // replace with a larger allocation.
+    if (has_allocation && m_allocations[m_allocation_index].size < size) {
+      m_device.deallocate(m_allocations[m_allocation_index].ptr);
+      m_allocations[m_allocation_index].ptr = m_device.allocate(size);
+      m_allocations[m_allocation_index].size = size;
+    }
+
+    // Make a new allocation if we don't have and existing one.
+    if (!has_allocation) {
+      Allocation allocation;
+      allocation.ptr = m_device.allocate(size);
+      allocation.size = size;
+      m_allocations.push_back(allocation);
+    }
+
+    eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
+    eigen_assert(m_allocations[m_allocation_index].size >= size);
+
+    return m_allocations[m_allocation_index++].ptr;
+  }
+
+  void reset() { m_allocation_index = 0; }
+
+ private:
+  struct Allocation {
+    void* ptr;
+    size_t size;
+  };
+
+  const Device& m_device;
+  int m_allocation_index;
+  // TODO(ezhulenev): This should be an inlined vector.
+  std::vector<Allocation> m_allocations;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockKind represents all possible block kinds, that can be produced by
+// TensorEvaluator::evalBlock function.
+enum TensorBlockKind {
+  // Tensor block that is a lazy expression that must be assigned to a
+  // destination using TensorBlockAssign.
+  kExpr,
+
+  // Tensor block that is a view into a memory buffer owned by an underlying
+  // Tensor expression (e.g. it can be a view into a Tensor buffer).
+  kView,
+
+  // Tensor block that was materialized in a scratch memory buffer, allocated
+  // with TensorBlockScratchAllocator. This block must be copied to a
+  // destination, similar to a block of `kExpr` type.
+  kMaterializedInScratch,
+
+  // Tensor block that was materialized directly into the final output memory
+  // buffer. For example if the left side of an assignment is a Tensor, we can
+  // directly materialize the block in the destination memory.
+  //
+  // If strides in the output buffer do not match tensor block strides, the
+  // Tensor expression will be invalid, and should not be used by
+  // TensorBlockAssign or for constructing another block expression.
+  kMaterializedInOutput
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockNotImplemented should be used to defined TensorBlock typedef in
+// TensorEvaluators that do not support block evaluation.
+
+class TensorBlockNotImplemented {
+ public:
+  typedef void XprType;
+};
+
+// -------------------------------------------------------------------------- //
+// XprScalar extracts Scalar type from the Eigen expressions (if expression type
+// is not void). It's required to be able to define lazy block expression for
+// argument types, that do not support block evaluation.
+
+template <typename XprType>
+struct XprScalar {
+  typedef typename XprType::Scalar type;
+};
+template <>
+struct XprScalar<void> {
+  typedef void type;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorMaterializedBlock is a fully evaluated block of the original tensor,
+// and XprType is just a TensorMap over the data. This block type is typically
+// used to materialize blocks of tensor expressions, that can't be efficiently
+// represented as lazy Tensor expressions with fast coeff/packet operations,
+// e.g. we materialize all broadcasts into evaluated blocks.
+//
+// TensorMaterializedBlock does not own its memory buffer, it's either a memory
+// buffer that backs the original expression (e.g. block is just a view into a
+// Tensor), or a memory buffer allocated with scratch allocator, and in this
+// case the scratch allocator will deallocate it at the end of block based
+// expression execution.
+//
+// If the block was evaluated directly into the output buffer, and strides in
+// the output buffer do not match block strides, the TensorMap expression will
+// be invalid, and should never be used in block assignment or any other tensor
+// expression.
+
+template <typename Scalar, int NumDims, int Layout, typename IndexType = Eigen::Index>
+class TensorMaterializedBlock {
+ public:
+  typedef DSizes<IndexType, NumDims> Dimensions;
+  typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
+
+  TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data, const Dimensions& dimensions,
+                          bool valid_expr = true)
+      : m_kind(kind), m_data(data), m_dimensions(dimensions), m_expr(m_data, m_dimensions), m_valid_expr(valid_expr) {
+    eigen_assert(m_kind == internal::TensorBlockKind::kView ||
+                 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
+                 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
+  }
+
+  TensorBlockKind kind() const { return m_kind; }
+  // NOTE(ezhulenev): Returning XprType by value like in other block types
+  // causes asan failures. The theory is that XprType::Nested doesn't work
+  // properly for TensorMap.
+  const XprType& expr() const {
+    eigen_assert(m_valid_expr);
+    return m_expr;
+  }
+  const Scalar* data() const { return m_data; }
+  void cleanup() {}
+
+  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
+
+  // TensorMaterializedBlock can be backed by different types of storage:
+  //
+  //   (1) Contiguous block of memory allocated with scratch allocator.
+  //   (2) Contiguous block of memory reused from tensor block descriptor
+  //       destination buffer.
+  //   (3) Strided block of memory reused from tensor block descriptor
+  //       destination buffer.
+  //
+  class Storage {
+   public:
+    Scalar* data() const { return m_data; }
+    const Dimensions& dimensions() const { return m_dimensions; }
+    const Dimensions& strides() const { return m_strides; }
+
+    TensorMaterializedBlock AsTensorMaterializedBlock() const {
+      return TensorMaterializedBlock(m_materialized_in_output ? internal::TensorBlockKind::kMaterializedInOutput
+                                                              : internal::TensorBlockKind::kMaterializedInScratch,
+                                     m_data, m_dimensions, !m_strided_storage);
+    }
+
+   private:
+    friend class TensorMaterializedBlock<Scalar, NumDims, Layout, IndexType>;
+
+    Storage(Scalar* data, const Dimensions& dimensions, const Dimensions& strides, bool materialized_in_output,
+            bool strided_storage)
+        : m_data(data),
+          m_dimensions(dimensions),
+          m_strides(strides),
+          m_materialized_in_output(materialized_in_output),
+          m_strided_storage(strided_storage) {}
+
+    Scalar* m_data;
+    Dimensions m_dimensions;
+    Dimensions m_strides;
+    bool m_materialized_in_output;
+    bool m_strided_storage;
+  };
+
+  // Creates a storage for materialized block either from the block descriptor
+  // destination buffer, or allocates a new buffer with scratch allocator.
+  template <typename TensorBlockScratch>
+  EIGEN_STRONG_INLINE static Storage prepareStorage(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                    bool allow_strided_storage = false) {
+    // Try to reuse destination as an output block buffer.
+    typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
+
+    if (desc.destination().kind() == DestinationBuffer::kContiguous) {
+      Scalar* buffer = desc.destination().template data<Scalar>();
+      desc.DropDestinationBuffer();
+      return Storage(buffer, desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
+                     /*materialized_in_output=*/true,
+                     /*strided_storage=*/false);
+
+    } else if (desc.destination().kind() == DestinationBuffer::kStrided && allow_strided_storage) {
+      Scalar* buffer = desc.destination().template data<Scalar>();
+      desc.DropDestinationBuffer();
+      return Storage(buffer, desc.dimensions(), desc.destination().strides(),
+                     /*materialized_in_output=*/true, /*strided_storage=*/true);
+
+    } else {
+      void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
+      return Storage(static_cast<Scalar*>(mem), desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
+                     /*materialized_in_output=*/false,
+                     /*strided_storage=*/false);
+    }
+  }
+
+  // Creates a materialized block for the given descriptor from a memory buffer.
+  template <typename DataDimensions, typename TensorBlockScratch>
+  EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(const Scalar* data, const DataDimensions& data_dims,
+                                                                 TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+    eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
+
+    // If a tensor block dimensions covers a contiguous block of the underlying
+    // memory, we can skip block buffer memory allocation, and construct a block
+    // from existing `data` memory buffer.
+    //
+    // Example: (RowMajor layout)
+    //   data_dims:          [11, 12, 13, 14]
+    //   desc.dimensions():  [1,   1,  3, 14]
+    //
+    // In this case we can construct a TensorBlock starting at
+    // `data + desc.offset()`, with a `desc.dimensions()` block sizes.
+    static const bool is_col_major = Layout == ColMajor;
+
+    // Find out how many inner dimensions have a matching size.
+    int num_matching_inner_dims = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      int dim = is_col_major ? i : NumDims - i - 1;
+      if (data_dims[dim] != desc.dimensions()[dim]) break;
+      ++num_matching_inner_dims;
+    }
+
+    // All the outer dimensions must be of size `1`, except a single dimension
+    // before the matching inner dimension (`3` in the example above).
+    bool can_use_direct_access = true;
+    for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
+      int dim = is_col_major ? i : NumDims - i - 1;
+      if (desc.dimension(dim) != 1) {
+        can_use_direct_access = false;
+        break;
+      }
+    }
+
+    if (can_use_direct_access) {
+      const Scalar* block_start = data + desc.offset();
+      return TensorMaterializedBlock(internal::TensorBlockKind::kView, block_start, desc.dimensions());
+
+    } else {
+      // Reuse destination buffer or allocate new buffer with scratch allocator.
+      const Storage storage = prepareStorage(desc, scratch);
+
+      typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout> TensorBlockIO;
+      typedef typename TensorBlockIO::Dst TensorBlockIODst;
+      typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+      TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)), data, desc.offset());
+      TensorBlockIODst dst(storage.dimensions(), storage.strides(), storage.data());
+
+      TensorBlockIO::Copy(dst, src);
+      return storage.AsTensorMaterializedBlock();
+    }
+  }
+
+ private:
+  TensorBlockKind m_kind;
+  const Scalar* m_data;
+  Dimensions m_dimensions;
+  XprType m_expr;
+  bool m_valid_expr;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseUnaryBlock is a lazy tensor expression block that applies UnaryOp
+// functor to the blocks produced by the underlying Tensor expression.
+
+template <typename UnaryOp, typename ArgTensorBlock>
+class TensorCwiseUnaryBlock {
+  static constexpr bool NoArgBlockAccess = internal::is_void<typename ArgTensorBlock::XprType>::value;
+
+ public:
+  typedef std::conditional_t<NoArgBlockAccess, void,
+                             TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >
+      XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
+      : m_arg_block(arg_block), m_functor(functor) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+  XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
+  const Scalar* data() const { return NULL; }
+  void cleanup() { m_arg_block.cleanup(); }
+
+ private:
+  ArgTensorBlock m_arg_block;
+  UnaryOp m_functor;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorCwiseUnaryBlock is a lazy tensor expression block that applies BinaryOp
+// functor to the blocks produced by the underlying Tensor expression.
+
+template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
+class TensorCwiseBinaryBlock {
+  static constexpr bool NoArgBlockAccess = internal::is_void<typename LhsTensorBlock::XprType>::value ||
+                                           internal::is_void<typename RhsTensorBlock::XprType>::value;
+
+ public:
+  typedef std::conditional_t<
+      NoArgBlockAccess, void,
+      TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType, const typename RhsTensorBlock::XprType> >
+      XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorCwiseBinaryBlock(const LhsTensorBlock& left_block, const RhsTensorBlock& right_block, const BinaryOp& functor)
+      : m_left_block(left_block), m_right_block(right_block), m_functor(functor) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+
+  XprType expr() const { return XprType(m_left_block.expr(), m_right_block.expr(), m_functor); }
+
+  const Scalar* data() const { return NULL; }
+
+  void cleanup() {
+    m_left_block.cleanup();
+    m_right_block.cleanup();
+  }
+
+ private:
+  LhsTensorBlock m_left_block;
+  RhsTensorBlock m_right_block;
+  BinaryOp m_functor;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorUnaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from a block of the underlying type (this is a
+// generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
+
+template <typename BlockFactory, typename ArgTensorBlock>
+class TensorUnaryExprBlock {
+  typedef typename ArgTensorBlock::XprType ArgXprType;
+  static constexpr bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
+
+ public:
+  typedef std::conditional_t<NoArgBlockAccess, void, typename BlockFactory::template XprType<ArgXprType>::type> XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorUnaryExprBlock(const ArgTensorBlock& arg_block, const BlockFactory& factory)
+      : m_arg_block(arg_block), m_factory(factory) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+  XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
+  const Scalar* data() const { return NULL; }
+  void cleanup() { m_arg_block.cleanup(); }
+
+ private:
+  ArgTensorBlock m_arg_block;
+  BlockFactory m_factory;
+};
+
+// -------------------------------------------------------------------------- //
+// TensorTernaryExprBlock is a lazy tensor expression block that can construct
+// an arbitrary tensor expression from three blocks of the underlying type.
+
+template <typename BlockFactory, typename Arg1TensorBlock, typename Arg2TensorBlock, typename Arg3TensorBlock>
+class TensorTernaryExprBlock {
+  typedef typename Arg1TensorBlock::XprType Arg1XprType;
+  typedef typename Arg2TensorBlock::XprType Arg2XprType;
+  typedef typename Arg3TensorBlock::XprType Arg3XprType;
+
+  static constexpr bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
+                                           internal::is_void<Arg2XprType>::value ||
+                                           internal::is_void<Arg3XprType>::value;
+
+ public:
+  typedef std::conditional_t<NoArgBlockAccess, void,
+                             typename BlockFactory::template XprType<Arg1XprType, Arg2XprType, Arg3XprType>::type>
+      XprType;
+
+  typedef typename XprScalar<XprType>::type Scalar;
+
+  TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block, const Arg2TensorBlock& arg2_block,
+                         const Arg3TensorBlock& arg3_block, const BlockFactory& factory)
+      : m_arg1_block(arg1_block), m_arg2_block(arg2_block), m_arg3_block(arg3_block), m_factory(factory) {}
+
+  TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
+  XprType expr() const { return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(), m_arg3_block.expr()); }
+  const Scalar* data() const { return NULL; }
+  void cleanup() {
+    m_arg1_block.cleanup();
+    m_arg2_block.cleanup();
+    m_arg3_block.cleanup();
+  }
+
+ private:
+  Arg1TensorBlock m_arg1_block;
+  Arg2TensorBlock m_arg2_block;
+  Arg3TensorBlock m_arg3_block;
+  BlockFactory m_factory;
+};
+
+// -------------------------------------------------------------------------- //
+// StridedLinearBufferCopy provides a method to copy data between two linear
+// buffers with different strides, with optimized paths for scatter/gather.
+
+template <typename Scalar, typename IndexType>
+class StridedLinearBufferCopy {
+  typedef typename packet_traits<Scalar>::type Packet;
+  typedef typename unpacket_traits<Packet>::half HalfPacket;
+  enum {
+    Vectorizable = packet_traits<Scalar>::Vectorizable,
+    PacketSize = packet_traits<Scalar>::size,
+    HalfPacketSize = unpacket_traits<HalfPacket>::size,
+    HasHalfPacket = static_cast<int>(HalfPacketSize) < static_cast<int>(PacketSize)
+  };
+
+ public:
+  // Specifying linear copy kind statically gives ~30% speedup for small sizes.
+  enum class Kind {
+    Linear = 0,       // src_stride == 1 && dst_stride == 1
+    Scatter = 1,      // src_stride == 1 && dst_stride != 1
+    FillLinear = 2,   // src_stride == 0 && dst_stride == 1
+    FillScatter = 3,  // src_stride == 0 && dst_stride != 1
+    Gather = 4,       // dst_stride == 1
+    Random = 5        // everything else
+  };
+
+  struct Dst {
+    Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
+
+    IndexType offset;
+    IndexType stride;
+    Scalar* data;
+  };
+
+  struct Src {
+    Src(IndexType o, IndexType s, const Scalar* d) : offset(o), stride(s), data(d) {}
+
+    IndexType offset;
+    IndexType stride;
+    const Scalar* data;
+  };
+
+  template <typename StridedLinearBufferCopy::Kind kind>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst, const Src& src, const size_t count) {
+    Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride, src.data);
+  }
+
+ private:
+  template <typename StridedLinearBufferCopy::Kind kind>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const IndexType count, const IndexType dst_offset,
+                                                        const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
+                                                        const IndexType src_offset, const IndexType src_stride,
+                                                        const Scalar* EIGEN_RESTRICT src_data) {
+    const Scalar* src = &src_data[src_offset];
+    Scalar* dst = &dst_data[dst_offset];
+
+    if (!Vectorizable) {
+      for (Index i = 0; i < count; ++i) {
+        dst[i * dst_stride] = src[i * src_stride];
+      }
+      return;
+    }
+
+    const IndexType vectorized_size = PacketSize * (count / PacketSize);
+    IndexType i = 0;
+
+    if (kind == StridedLinearBufferCopy::Kind::Linear) {
+      // ******************************************************************** //
+      // Linear copy from `src` to `dst`.
+      const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));
+      eigen_assert(src_stride == 1 && dst_stride == 1);
+      for (; i < unrolled_size; i += 4 * PacketSize) {
+        for (int j = 0; j < 4; ++j) {
+          Packet p = ploadu<Packet>(src + i + j * PacketSize);
+          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
+        }
+      }
+      for (; i < vectorized_size; i += PacketSize) {
+        Packet p = ploadu<Packet>(src + i);
+        pstoreu<Scalar, Packet>(dst + i, p);
+      }
+      if (HasHalfPacket) {
+        const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
+        if (i < vectorized_half_size) {
+          HalfPacket p = ploadu<HalfPacket>(src + i);
+          pstoreu<Scalar, HalfPacket>(dst + i, p);
+          i += HalfPacketSize;
+        }
+      }
+      for (; i < count; ++i) {
+        dst[i] = src[i];
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
+      // Scatter from `src` to `dst`.
+      eigen_assert(src_stride == 1 && dst_stride != 1);
+      for (; i < vectorized_size; i += PacketSize) {
+        Packet p = ploadu<Packet>(src + i);
+        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
+      }
+      if (HasHalfPacket) {
+        const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
+        if (i < vectorized_half_size) {
+          HalfPacket p = ploadu<HalfPacket>(src + i);
+          pscatter<Scalar, HalfPacket>(dst + i * dst_stride, p, dst_stride);
+          i += HalfPacketSize;
+        }
+      }
+      for (; i < count; ++i) {
+        dst[i * dst_stride] = src[i];
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
+      // Fill `dst` with value at `*src`.
+      eigen_assert(src_stride == 0 && dst_stride == 1);
+
+      const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));
+      Scalar s = *src;
+      Packet p = pset1<Packet>(s);
+      for (; i < unrolled_size; i += 4 * PacketSize) {
+        for (int j = 0; j < 4; ++j) {
+          pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
+        }
+      }
+      for (; i < vectorized_size; i += PacketSize) {
+        pstoreu<Scalar, Packet>(dst + i, p);
+      }
+      if (HasHalfPacket) {
+        const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
+        if (i < vectorized_half_size) {
+          HalfPacket hp = pset1<HalfPacket>(s);
+          pstoreu<Scalar, HalfPacket>(dst + i, hp);
+          i += HalfPacketSize;
+        }
+      }
+      for (; i < count; ++i) {
+        dst[i] = s;
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
+      // Scatter `*src` into `dst`.
+      eigen_assert(src_stride == 0 && dst_stride != 1);
+      Scalar s = *src;
+      Packet p = pset1<Packet>(s);
+      for (; i < vectorized_size; i += PacketSize) {
+        pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
+      }
+      if (HasHalfPacket) {
+        const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
+        if (i < vectorized_half_size) {
+          HalfPacket hp = pset1<HalfPacket>(s);
+          pscatter<Scalar, HalfPacket>(dst + i * dst_stride, hp, dst_stride);
+          i += HalfPacketSize;
+        }
+      }
+      for (; i < count; ++i) {
+        dst[i * dst_stride] = s;
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
+      // Gather from `src` into `dst`.
+      eigen_assert(dst_stride == 1);
+      for (; i < vectorized_size; i += PacketSize) {
+        Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
+        pstoreu<Scalar, Packet>(dst + i, p);
+      }
+      if (HasHalfPacket) {
+        const IndexType vectorized_half_size = HalfPacketSize * (count / HalfPacketSize);
+        if (i < vectorized_half_size) {
+          HalfPacket p = pgather<Scalar, HalfPacket>(src + i * src_stride, src_stride);
+          pstoreu<Scalar, HalfPacket>(dst + i, p);
+          i += HalfPacketSize;
+        }
+      }
+      for (; i < count; ++i) {
+        dst[i] = src[i * src_stride];
+      }
+      // ******************************************************************** //
+    } else if (kind == StridedLinearBufferCopy::Kind::Random) {
+      // Random.
+      for (; i < count; ++i) {
+        dst[i * dst_stride] = src[i * src_stride];
+      }
+    } else {
+      eigen_assert(false);
+    }
+  }
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block.
+// It's possible to specify src->dst dimension mapping for the copy operation.
+// Dimensions of `dst` specify how many elements have to be copied, for the
+// `src` we need to know only stride to navigate through source memory buffer.
+
+template <typename Scalar, typename IndexType, int NumDims, int Layout>
+class TensorBlockIO {
+  static constexpr bool IsColMajor = (Layout == ColMajor);
+
+  typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
+
+ public:
+  typedef DSizes<IndexType, NumDims> Dimensions;
+  typedef DSizes<int, NumDims> DimensionsMap;
+
+  struct Dst {
+    Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst, IndexType dst_offset = 0)
+        : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
+
+    Dimensions dims;
+    Dimensions strides;
+    Scalar* data;
+    IndexType offset;
+  };
+
+  struct Src {
+    Src(const Dimensions& src_strides, const Scalar* src, IndexType src_offset = 0)
+        : strides(src_strides), data(src), offset(src_offset) {}
+
+    Dimensions strides;
+    const Scalar* data;
+    IndexType offset;
+  };
+
+  // Copies data to `dst` from `src`, using provided dimensions mapping:
+  //
+  //   src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
+  //
+  // Returns the number of copied elements.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(const Dst& dst, const Src& src,
+                                                              const DimensionsMap& dst_to_src_dim_map) {
+    // Copy single scalar value from `src` to `dst`.
+    if (NumDims == 0) {
+      *(dst.data + dst.offset) = *(src.data + src.offset);
+      return 1;
+    }
+
+    // Both `dst` and `src` must have contiguous innermost dimension. We also
+    // accept the special case with stride '0', because it's used as a trick to
+    // implement broadcasting.
+    {
+      int inner_dim = IsColMajor ? 0 : NumDims - 1;
+      EIGEN_UNUSED_VARIABLE(inner_dim);
+      eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
+      eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
+    }
+
+    // Give a shorter name to `dst_to_src_dim_map`.
+    const DimensionsMap& dim_map = dst_to_src_dim_map;
+
+    // Do not squeeze reordered inner dimensions.
+    int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
+
+    // NOTE: We find the innermost dimension (contiguous in memory) in the dst
+    // block, and we write data linearly into that dimension, reading it from
+    // the src. If dimensions are reordered, we might end up reading data from
+    // the src with `stride != 1`.
+    //
+    // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
+    // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
+
+    // Find the innermost dimension in the dst whose size is not 1. This is the
+    // effective inner dim.
+    int num_size_one_inner_dims = 0;
+    for (int i = 0; i < num_squeezable_dims; ++i) {
+      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
+      if (dst.dims[dst_dim] != 1) break;
+      num_size_one_inner_dims++;
+    }
+
+    // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
+    if (num_size_one_inner_dims == NumDims) {
+      *(dst.data + dst.offset) = *(src.data + src.offset);
+      return 1;
+    }
+
+    // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
+    const int dst_stride1_dim = IsColMajor ? num_size_one_inner_dims : NumDims - num_size_one_inner_dims - 1;
+
+    // Dimension in the src that corresponds to the dst innermost dimension.
+    const int src_dim_for_dst_stride1_dim = NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
+
+    // Size of the innermost dimension (length of contiguous blocks of memory).
+    IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
+
+    // Squeeze multiple inner dims into one if they are contiguous in `dst` and
+    // `src` memory, so we can do less linear copy calls.
+    for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
+      const int dst_dim = IsColMajor ? i : NumDims - i - 1;
+      const IndexType dst_stride = dst.strides[dst_dim];
+      const IndexType src_stride = src.strides[dim_map[dst_dim]];
+      if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
+        dst_inner_dim_size *= dst.dims[dst_dim];
+        ++num_size_one_inner_dims;
+      } else {
+        break;
+      }
+    }
+
+    // Setup strides to read data from `src` and write to `dst`.
+    IndexType input_offset = src.offset;
+    IndexType output_offset = dst.offset;
+    IndexType input_stride = NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
+    IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
+
+    const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
+    array<BlockIteratorState, at_least_1_dim> it;
+
+    // Initialize block iterator state. Squeeze away any dimension of size 1.
+    int idx = 0;  // currently initialized iterator state index
+    for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
+      const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
+      if (dst.dims[dst_dim] == 1) continue;
+
+      it[idx].size = dst.dims[dst_dim];
+      it[idx].input_stride = src.strides[dim_map[dst_dim]];
+      it[idx].output_stride = dst.strides[dst_dim];
+
+      it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
+      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
+
+      idx++;
+    }
+
+    // Iterate copying data from src to dst.
+    const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
+
+#define COPY_INNER_DIM(KIND)                                                                                      \
+  IndexType num_copied = 0;                                                                                       \
+  for (num_copied = 0; num_copied < block_total_size; num_copied += dst_inner_dim_size) {                         \
+    LinCopy::template Run<KIND>(typename LinCopy::Dst(output_offset, output_stride, dst.data),                    \
+                                typename LinCopy::Src(input_offset, input_stride, src.data), dst_inner_dim_size); \
+                                                                                                                  \
+    for (int j = 0; j < idx; ++j) {                                                                               \
+      if (++it[j].count < it[j].size) {                                                                           \
+        input_offset += it[j].input_stride;                                                                       \
+        output_offset += it[j].output_stride;                                                                     \
+        break;                                                                                                    \
+      }                                                                                                           \
+      it[j].count = 0;                                                                                            \
+      input_offset -= it[j].input_span;                                                                           \
+      output_offset -= it[j].output_span;                                                                         \
+    }                                                                                                             \
+  }                                                                                                               \
+  return num_copied;
+
+    if (input_stride == 1 && output_stride == 1) {
+      COPY_INNER_DIM(LinCopy::Kind::Linear);
+    } else if (input_stride == 1 && output_stride != 1) {
+      COPY_INNER_DIM(LinCopy::Kind::Scatter);
+    } else if (input_stride == 0 && output_stride == 1) {
+      COPY_INNER_DIM(LinCopy::Kind::FillLinear);
+    } else if (input_stride == 0 && output_stride != 1) {
+      COPY_INNER_DIM(LinCopy::Kind::FillScatter);
+    } else if (output_stride == 1) {
+      COPY_INNER_DIM(LinCopy::Kind::Gather);
+    } else {
+      COPY_INNER_DIM(LinCopy::Kind::Random);
+    }
+
+#undef COPY_INNER_DIM
+  }
+
+  // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
+  // the number of copied elements.
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst, const Src& src) {
+    DimensionsMap dst_to_src_map;
+    for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
+    return Copy(dst, src, dst_to_src_map);
+  }
+
+ private:
+  struct BlockIteratorState {
+    BlockIteratorState() : size(0), count(0), input_stride(0), output_stride(0), input_span(0), output_span(0) {}
+
+    IndexType size;
+    IndexType count;
+    IndexType input_stride;
+    IndexType output_stride;
+    IndexType input_span;
+    IndexType output_span;
+  };
+
+  // Compute how many inner dimensions it's allowed to squeeze when doing IO
+  // between two tensor blocks. It's safe to squeeze inner dimensions, only
+  // if they are not reordered.
+  static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
+    int num_squeezable_dims = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      if (dim_map[dim] != dim) break;
+      num_squeezable_dims++;
+    }
+    return num_squeezable_dims;
+  }
+};
+
+// -------------------------------------------------------------------------- //
+// TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
+// a Tensor block defined by `desc`, backed by a memory buffer at `target`.
+//
+// Currently there is no way to write from a Tensor expression to a block of
+// memory, if dimensions are reordered. If you need to do that, you should
+// materialize a Tensor block expression into a memory buffer, and then use
+// TensorBlockIO to copy data between two memory buffers with a custom
+// `target->src` dimension map (see definition above).
+//
+// Also currently the innermost dimension of `target` must have a stride '1'
+// (contiguous in memory). This restriction could be lifted with a `pscatter`,
+// but in practice it's never needed, and there is a similar TensorBlockIO
+// workaround for that.
+//
+// TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
+// where `src` is a tensor expression. Explore if it is possible to rewrite IO
+// to use expressions instead of pointers, and after that TensorBlockAssignment
+// will become an alias to IO.
+template <typename Scalar, int NumDims, typename TensorBlockExpr, typename IndexType = Eigen::Index>
+class TensorBlockAssignment {
+  // We will use coeff/packet path to evaluate block expressions.
+  typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice> TensorBlockEvaluator;
+
+  typedef DSizes<IndexType, NumDims> Dimensions;
+
+  enum { Vectorizable = packet_traits<Scalar>::Vectorizable, PacketSize = packet_traits<Scalar>::size };
+
+  template <bool Vectorizable, typename Evaluator>
+  struct InnerDimAssign {
+    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) {
+      for (IndexType i = 0; i < count; ++i) {
+        target[i] = eval.coeff(eval_offset + i);
+      }
+    }
+  };
+
+  template <typename Evaluator>
+  struct InnerDimAssign<true, Evaluator> {
+    EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count, const Evaluator& eval, IndexType eval_offset) {
+      typedef typename packet_traits<Scalar>::type Packet;
+
+      const IndexType unrolled_size = (4 * PacketSize) * (count / (4 * PacketSize));
+      const IndexType vectorized_size = PacketSize * (count / PacketSize);
+      IndexType i = 0;
+
+      for (; i < unrolled_size; i += 4 * PacketSize) {
+        for (int j = 0; j < 4; ++j) {
+          const IndexType idx = eval_offset + i + j * PacketSize;
+          Packet p = eval.template packet<Unaligned>(idx);
+          pstoreu<Scalar>(target + i + j * PacketSize, p);
+        }
+      }
+
+      for (; i < vectorized_size; i += PacketSize) {
+        Packet p = eval.template packet<Unaligned>(eval_offset + i);
+        pstoreu<Scalar>(target + i, p);
+      }
+
+      for (; i < count; ++i) {
+        target[i] = eval.coeff(eval_offset + i);
+      }
+    }
+  };
+
+ public:
+  struct Target {
+    Target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data,
+           IndexType target_offset = 0)
+        : dims(target_dims), strides(target_strides), data(target_data), offset(target_offset) {}
+
+    Dimensions dims;
+    Dimensions strides;
+    Scalar* data;
+    IndexType offset;
+  };
+
+  static Target target(const Dimensions& target_dims, const Dimensions& target_strides, Scalar* target_data,
+                       IndexType target_offset = 0) {
+    return Target(target_dims, target_strides, target_data, target_offset);
+  }
+
+  template <typename TargetDimsIndexType, typename TargetStridesIndexType>
+  static Target target(const DSizes<TargetDimsIndexType, NumDims>& target_dims,
+                       const DSizes<TargetStridesIndexType, NumDims>& target_strides, Scalar* target_data,
+                       IndexType target_offset = 0) {
+    // DSizes constructor will do index type promotion if it's safe.
+    return Target(Dimensions(target_dims), Dimensions(target_strides), target_data, target_offset);
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Target& target, const TensorBlockExpr& expr) {
+    // Prepare evaluator for block expression.
+    DefaultDevice default_device;
+    TensorBlockEvaluator eval(expr, default_device);
+
+    // Tensor block expression dimension should match destination dimensions.
+    eigen_assert(dimensions_match(target.dims, eval.dimensions()));
+
+    static const int Layout = TensorBlockEvaluator::Layout;
+    static const bool is_col_major = Layout == ColMajor;
+
+    // Initialize output inner dimension size based on a layout.
+    const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
+    const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
+    IndexType output_inner_dim_size = target.dims[inner_dim_idx];
+
+    // Target inner dimension stride must be '1'.
+    eigen_assert(target.strides[inner_dim_idx] == 1);
+
+    // Squeeze multiple inner dims into one if they are contiguous in `target`.
+    IndexType num_squeezed_dims = 0;
+    for (Index i = 1; i < NumDims; ++i) {
+      const Index dim = is_col_major ? i : NumDims - i - 1;
+      const IndexType target_stride = target.strides[dim];
+
+      if (output_inner_dim_size == target_stride) {
+        output_inner_dim_size *= target.dims[dim];
+        num_squeezed_dims++;
+      } else {
+        break;
+      }
+    }
+
+    // Initialize output block iterator state. Dimension in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims> it;
+
+    int idx = 0;  // currently initialized iterator state index
+    for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
+      const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
+
+      it[idx].count = 0;
+      it[idx].size = target.dims[dim];
+      it[idx].output_stride = target.strides[dim];
+      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
+      idx++;
+    }
+
+    // We read block expression from the beginning, and start writing data to
+    // `target` at given offset.
+    IndexType input_offset = 0;
+    IndexType output_offset = target.offset;
+
+    // Iterate copying data from `eval` to `target`.
+    for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
+      // Assign to `target` at current offset.
+      InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess, TensorBlockEvaluator>::Run(
+          target.data + output_offset, output_inner_dim_size, eval, input_offset);
+
+      // Move input offset forward by the number of assigned coefficients.
+      input_offset += output_inner_dim_size;
+
+      // Update index.
+      for (int j = 0; j < idx; ++j) {
+        if (++it[j].count < it[j].size) {
+          output_offset += it[j].output_stride;
+          break;
+        }
+        it[j].count = 0;
+        output_offset -= it[j].output_span;
+      }
+    }
+  }
+
+ private:
+  struct BlockIteratorState {
+    BlockIteratorState() : count(0), size(0), output_stride(0), output_span(0) {}
+
+    IndexType count;
+    IndexType size;
+    IndexType output_stride;
+    IndexType output_span;
+  };
+};
+
+// -------------------------------------------------------------------------- //
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
new file mode 100644
index 00000000..aad1647c
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h
@@ -0,0 +1,1001 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename Broadcast, typename XprType>
+struct traits<TensorBroadcastingOp<Broadcast, XprType>> : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+  enum {
+    // Broadcast is read-only.
+    Flags = traits<XprType>::Flags & ~LvalueBit
+  };
+};
+
+template <typename Broadcast, typename XprType>
+struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense> {
+  typedef const TensorBroadcastingOp<Broadcast, XprType> EIGEN_DEVICE_REF type;
+};
+
+template <typename Broadcast, typename XprType>
+struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1,
+              typename eval<TensorBroadcastingOp<Broadcast, XprType>>::type> {
+  typedef TensorBroadcastingOp<Broadcast, XprType> type;
+};
+
+template <typename Dims>
+struct is_input_scalar {
+  static const bool value = false;
+};
+template <>
+struct is_input_scalar<Sizes<>> {
+  static const bool value = true;
+};
+template <typename std::ptrdiff_t... Indices>
+struct is_input_scalar<Sizes<Indices...>> {
+  static constexpr bool value = (Sizes<Indices...>::total_size == 1);
+};
+
+}  // end namespace internal
+
+/** Tensor broadcasting class.
+ * \ingroup CXX11_Tensor_Module
+ */
+template <typename Broadcast, typename XprType>
+class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast)
+      : m_xpr(expr), m_broadcast(broadcast) {}
+
+  EIGEN_DEVICE_FUNC const Broadcast& broadcast() const { return m_broadcast; }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const Broadcast m_broadcast;
+};
+
+// Eval as rvalue
+template <typename Broadcast, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> {
+  typedef TensorBroadcastingOp<Broadcast, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+ protected:  //  all the non-static fields must have the same access control, otherwise the TensorEvaluator won't be
+             //  standard layout;
+  bool isCopy, nByOne, oneByN;
+
+ public:
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  enum {
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+    PreferBlockAccess = true,
+    RawAccess = false
+  };
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+
+  // We do block based broadcasting using a trick with 2x tensor rank and 0
+  // strides. See block method implementation for details.
+  typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock ArgTensorBlock;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims, Layout, Index> TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : isCopy(false),
+        nByOne(false),
+        oneByN(false),
+        m_device(device),
+        m_broadcast(op.broadcast()),
+        m_impl(op.expression(), device) {
+    // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
+    // and store the result in a scalar. Instead one should reshape the scalar into a N-D
+    // tensor with N >= 1 of 1 element first and then broadcast.
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    const InputDimensions& input_dims = m_impl.dimensions();
+    isCopy = true;
+    for (int i = 0; i < NumDims; ++i) {
+      eigen_assert(input_dims[i] > 0);
+      m_dimensions[i] = input_dims[i] * m_broadcast[i];
+      if (m_broadcast[i] != 1) {
+        isCopy = false;
+      }
+    }
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_inputStrides[0] = 1;
+      m_outputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
+        m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+      }
+    } else {
+      m_inputStrides[NumDims - 1] = 1;
+      m_outputStrides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
+        m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+      }
+    }
+
+    if (input_dims[0] == 1) {
+      oneByN = true;
+      for (int i = 1; i < NumDims; ++i) {
+        if (m_broadcast[i] != 1) {
+          oneByN = false;
+          break;
+        }
+      }
+    } else if (input_dims[NumDims - 1] == 1) {
+      nByOne = true;
+      for (int i = 0; i < NumDims - 1; ++i) {
+        if (m_broadcast[i] != 1) {
+          nByOne = false;
+          break;
+        }
+      }
+    }
+
+    // Handle special format like NCHW, its input shape is '[1, N..., 1]' and
+    // broadcast shape is '[N, 1..., N]'
+    if (!oneByN && !nByOne) {
+      if (input_dims[0] == 1 && input_dims[NumDims - 1] == 1 && NumDims > 2) {
+        nByOne = true;
+        oneByN = true;
+        for (int i = 1; i < NumDims - 1; ++i) {
+          if (m_broadcast[i] != 1) {
+            nByOne = false;
+            oneByN = false;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const {
+    if (internal::is_input_scalar<internal::remove_all_t<InputDimensions>>::value) {
+      return m_impl.coeff(0);
+    }
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      if (isCopy) {
+        return m_impl.coeff(index);
+      } else {
+        return coeffColMajor(index);
+      }
+    } else {
+      if (isCopy) {
+        return m_impl.coeff(index);
+      } else {
+        return coeffRowMajor(index);
+      }
+    }
+  }
+
+  // TODO: attempt to speed this up. The integer divisions and modulo are slow
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
+    Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx = index / m_outputStrides[i];
+      if (internal::index_statically_eq<Broadcast>(i, 1)) {
+        eigen_assert(idx < m_impl.dimensions()[i]);
+        inputIndex += idx * m_inputStrides[i];
+      } else {
+        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
+          eigen_assert(idx % m_impl.dimensions()[i] == 0);
+        } else {
+          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+        }
+      }
+      index -= idx * m_outputStrides[i];
+    }
+    if (internal::index_statically_eq<Broadcast>(0, 1)) {
+      eigen_assert(index < m_impl.dimensions()[0]);
+      inputIndex += index;
+    } else {
+      if (internal::index_statically_eq<InputDimensions>(0, 1)) {
+        eigen_assert(index % m_impl.dimensions()[0] == 0);
+      } else {
+        inputIndex += (index % m_impl.dimensions()[0]);
+      }
+    }
+    return inputIndex;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const {
+    return m_impl.coeff(indexColMajor(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
+    Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index idx = index / m_outputStrides[i];
+      if (internal::index_statically_eq<Broadcast>(i, 1)) {
+        eigen_assert(idx < m_impl.dimensions()[i]);
+        inputIndex += idx * m_inputStrides[i];
+      } else {
+        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
+          eigen_assert(idx % m_impl.dimensions()[i] == 0);
+        } else {
+          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+        }
+      }
+      index -= idx * m_outputStrides[i];
+    }
+    if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
+      eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
+      inputIndex += index;
+    } else {
+      if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
+        eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
+      } else {
+        inputIndex += (index % m_impl.dimensions()[NumDims - 1]);
+      }
+    }
+    return inputIndex;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const {
+    return m_impl.coeff(indexRowMajor(index));
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const {
+    if (internal::is_input_scalar<internal::remove_all_t<InputDimensions>>::value) {
+      return internal::pset1<PacketReturnType>(m_impl.coeff(0));
+    }
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      if (isCopy) {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+        // See PR 437: on NVIDIA P100 and K20m we observed a x3-4 speed up by enforcing
+        // unaligned loads here. The reason is unclear though.
+        return m_impl.template packet<Unaligned>(index);
+#else
+        return m_impl.template packet<LoadMode>(index);
+#endif
+      } else if (oneByN && !nByOne) {
+        return packetNByOne<LoadMode>(index);
+      } else if (!oneByN && nByOne) {
+        return packetOneByN<LoadMode>(index);
+      } else if (oneByN && nByOne) {
+        return packetOneByNByOne<LoadMode>(index);
+      } else {
+        return packetColMajor<LoadMode>(index);
+      }
+    } else {
+      if (isCopy) {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+        // See above.
+        return m_impl.template packet<Unaligned>(index);
+#else
+        return m_impl.template packet<LoadMode>(index);
+#endif
+      } else if (oneByN && !nByOne) {
+        return packetOneByN<LoadMode>(index);
+      } else if (!oneByN && nByOne) {
+        return packetNByOne<LoadMode>(index);
+      } else if (oneByN && nByOne) {
+        return packetOneByNByOne<LoadMode>(index);
+      } else {
+        return packetRowMajor<LoadMode>(index);
+      }
+    }
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne(Index index) const {
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+    Index startDim, endDim;
+    Index inputIndex, outputOffset, batchedIndex;
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      startDim = NumDims - 1;
+      endDim = 1;
+    } else {
+      startDim = 0;
+      endDim = NumDims - 2;
+    }
+
+    batchedIndex = index % m_outputStrides[startDim];
+    inputIndex = batchedIndex / m_outputStrides[endDim];
+    outputOffset = batchedIndex % m_outputStrides[endDim];
+
+    if (outputOffset + PacketSize <= m_outputStrides[endDim]) {
+      values[0] = m_impl.coeff(inputIndex);
+      return internal::pload1<PacketReturnType>(values);
+    } else {
+      EIGEN_UNROLL_LOOP
+      for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
+        if (outputOffset + cur < m_outputStrides[endDim]) {
+          values[i] = m_impl.coeff(inputIndex);
+        } else {
+          ++inputIndex;
+          inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex);
+          values[i] = m_impl.coeff(inputIndex);
+          outputOffset = 0;
+          cur = 0;
+        }
+      }
+      return internal::pload<PacketReturnType>(values);
+    }
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const {
+    // Consider the flattened tensor [v0, ..., vN],
+    // Concatenates m_broadcast[dim] copies,
+    //    [v0, ..., vN, v0, ..., vN, ... ]
+    // with dim == NumDims - 1 for col-major, dim == 0 for row-major.
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    // Size of flattened tensor.
+    const Index M =
+        (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_inputStrides[NumDims - 1] : m_inputStrides[0];
+    Index inputIndex = index % M;
+    if (inputIndex + PacketSize <= M) {
+      return m_impl.template packet<Unaligned>(inputIndex);
+    } else {
+      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < PacketSize; ++i) {
+        if (inputIndex > M - 1) {
+          inputIndex = 0;
+        }
+        values[i] = m_impl.coeff(inputIndex++);
+      }
+      return internal::pload<PacketReturnType>(values);
+    }
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const {
+    // Consider the flattened tensor [v0, ..., vN],
+    // Interleaves m_broadcast[dim] copies,
+    //    [v0, v0, ..., v1, v1, ..., vN, vN, ... ]
+    // with dim == 0 for col-major, dim == NumDims - 1 for row-major.
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    const Index M =
+        (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_broadcast[0] : m_broadcast[NumDims - 1];
+
+    Index inputIndex = index / M;
+    Index outputOffset = index % M;
+    if (outputOffset + PacketSize <= M) {
+      return internal::pset1<PacketReturnType>(m_impl.coeff(inputIndex));
+    } else {
+      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < PacketSize; ++i) {
+        if (outputOffset < M) {
+          values[i] = m_impl.coeff(inputIndex);
+          ++outputOffset;
+        } else {
+          values[i] = m_impl.coeff(++inputIndex);
+          outputOffset = 1;  // Next offset.
+        }
+      }
+      return internal::pload<PacketReturnType>(values);
+    }
+  }
+
+  // Ignore the LoadMode and always use unaligned loads since we can't guarantee
+  // the alignment at compile time.
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const {
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    const Index originalIndex = index;
+
+    Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index idx = index / m_outputStrides[i];
+      if (internal::index_statically_eq<Broadcast>(i, 1)) {
+        eigen_assert(idx < m_impl.dimensions()[i]);
+        inputIndex += idx * m_inputStrides[i];
+      } else {
+        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
+          eigen_assert(idx % m_impl.dimensions()[i] == 0);
+        } else {
+          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+        }
+      }
+      index -= idx * m_outputStrides[i];
+    }
+    Index innermostLoc;
+    if (internal::index_statically_eq<Broadcast>(0, 1)) {
+      eigen_assert(index < m_impl.dimensions()[0]);
+      innermostLoc = index;
+    } else {
+      if (internal::index_statically_eq<InputDimensions>(0, 1)) {
+        eigen_assert(index % m_impl.dimensions()[0] == 0);
+        innermostLoc = 0;
+      } else {
+        innermostLoc = index % m_impl.dimensions()[0];
+      }
+    }
+    inputIndex += innermostLoc;
+
+    // Todo: this could be extended to the second dimension if we're not
+    // broadcasting alongside the first dimension, and so on.
+    if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) {
+      return m_impl.template packet<Unaligned>(inputIndex);
+    } else {
+      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+      values[0] = m_impl.coeff(inputIndex);
+      EIGEN_UNROLL_LOOP
+      for (int i = 1; i < PacketSize; ++i) {
+        if (innermostLoc + i < m_impl.dimensions()[0]) {
+          values[i] = m_impl.coeff(inputIndex + i);
+        } else {
+          values[i] = coeffColMajor(originalIndex + i);
+        }
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const {
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    const Index originalIndex = index;
+
+    Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index idx = index / m_outputStrides[i];
+      if (internal::index_statically_eq<Broadcast>(i, 1)) {
+        eigen_assert(idx < m_impl.dimensions()[i]);
+        inputIndex += idx * m_inputStrides[i];
+      } else {
+        if (internal::index_statically_eq<InputDimensions>(i, 1)) {
+          eigen_assert(idx % m_impl.dimensions()[i] == 0);
+        } else {
+          inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
+        }
+      }
+      index -= idx * m_outputStrides[i];
+    }
+    Index innermostLoc;
+    if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
+      eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
+      innermostLoc = index;
+    } else {
+      if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
+        eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
+        innermostLoc = 0;
+      } else {
+        innermostLoc = index % m_impl.dimensions()[NumDims - 1];
+      }
+    }
+    inputIndex += innermostLoc;
+
+    // Todo: this could be extended to the second dimension if we're not
+    // broadcasting alongside the first dimension, and so on.
+    if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims - 1]) {
+      return m_impl.template packet<Unaligned>(inputIndex);
+    } else {
+      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+      values[0] = m_impl.coeff(inputIndex);
+      EIGEN_UNROLL_LOOP
+      for (int i = 1; i < PacketSize; ++i) {
+        if (innermostLoc + i < m_impl.dimensions()[NumDims - 1]) {
+          values[i] = m_impl.coeff(inputIndex + i);
+        } else {
+          values[i] = coeffRowMajor(originalIndex + i);
+        }
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    double compute_cost = TensorOpCost::AddCost<Index>();
+    if (!isCopy && NumDims > 0) {
+      EIGEN_UNROLL_LOOP
+      for (int i = NumDims - 1; i > 0; --i) {
+        compute_cost += TensorOpCost::DivCost<Index>();
+        if (internal::index_statically_eq<Broadcast>(i, 1)) {
+          compute_cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
+        } else {
+          if (!internal::index_statically_eq<InputDimensions>(i, 1)) {
+            compute_cost +=
+                TensorOpCost::MulCost<Index>() + TensorOpCost::ModCost<Index>() + TensorOpCost::AddCost<Index>();
+          }
+        }
+        compute_cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
+      }
+    }
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
+    // tensors. But this might need further tuning.
+    const size_t target_size = m_device.firstLevelCacheSize();
+    return internal::TensorBlockResourceRequirements::merge(
+        m_impl.getResourceRequirements(), internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    BlockBroadcastingParams params = blockBroadcastingParams(desc);
+
+    if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) {
+      return emptyBlock();
+    }
+
+    // Prepare storage for the materialized broadcasting result.
+    const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch);
+    ScalarNoConst* materialized_output = block_storage.data();
+
+    // We potentially will need to materialize input blocks.
+    size_t materialized_input_size = 0;
+    ScalarNoConst* materialized_input = NULL;
+
+    // Initialize block broadcating iterator state for outer dimensions (outer
+    // with regard to bcast dimension). Dimension in this array are always in
+    // inner_most -> outer_most order (col major layout).
+    array<BlockBroadcastingIteratorState, NumDims> it;
+    int idx = 0;
+
+    for (int i = params.inner_dim_count + 1; i < NumDims; ++i) {
+      const Index dim = IsColMajor ? i : NumDims - 1 - i;
+      it[idx].size = params.output_dims[dim];
+      it[idx].count = 0;
+      it[idx].output_stride = m_outputStrides[dim];
+      it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
+      idx++;
+    }
+
+    // Write output into the beginning of `materialized_output`.
+    Index output_offset = 0;
+
+    // We will fill output block by broadcasting along the bcast dim, and
+    // iterating over outer dimension.
+    const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize();
+
+    for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) {
+      ScalarNoConst* bcast_output = materialized_output + num_output_coeffs;
+      Index bcast_offset = desc.offset() + output_offset;
+
+      // Broadcast along the bcast dimension.
+      num_output_coeffs += BroadcastBlockAlongBcastDim(params, bcast_offset, scratch, bcast_output, &materialized_input,
+                                                       &materialized_input_size);
+
+      // Switch to the next outer dimension.
+      for (int j = 0; j < idx; ++j) {
+        if (++it[j].count < it[j].size) {
+          output_offset += it[j].output_stride;
+          break;
+        }
+        it[j].count = 0;
+        output_offset -= it[j].output_span;
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+  Broadcast functor() const { return m_broadcast; }
+
+ private:
+  static constexpr bool IsColMajor = static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+  // We will build a general case block broadcasting on top of broadcasting
+  // primitive that will do broadcasting only for the inner dimension(s) along
+  // the first dimension smaller than the input size (it's called `bcast_dim`).
+  //
+  // Example:
+  //           dim:  0  1  2   (ColMajor)
+  //    input size: [9, 3, 6]
+  //    block size: [9, 2, 6]
+  //
+  // We will compute broadcasted block by iterating over the outer dimensions
+  // before `bcast_dim` (only dimension `2` in this example) and computing
+  // broadcasts along the `bcast_dim` (dimension `1` in this example).
+
+  // BlockBroadcastingParams holds precomputed parameters for broadcasting a
+  // single block along the broadcasting dimension. Sizes and strides along the
+  // `bcast_dim` might be invalid, they will be adjusted later in
+  // `BroadcastBlockAlongBcastDim`.
+  struct BlockBroadcastingParams {
+    Dimensions input_dims;      // input expression dimensions
+    Dimensions output_dims;     // output block sizes
+    Dimensions output_strides;  // output block strides
+
+    int inner_dim_count;   // count inner dimensions matching in size
+    int bcast_dim;         // broadcasting dimension index
+    Index bcast_dim_size;  // broadcasting dimension size
+    Index inner_dim_size;  // inner dimensions size
+
+    // Block sizes and strides for the input block where all dimensions before
+    // `bcast_dim` are equal to `1`.
+    Dimensions input_block_sizes;
+    Dimensions input_block_strides;
+
+    // Block sizes and strides for blocks with extra dimensions and strides `0`.
+    BroadcastDimensions bcast_block_sizes;
+    BroadcastDimensions bcast_block_strides;
+    BroadcastDimensions bcast_input_strides;
+  };
+
+  struct BlockBroadcastingIteratorState {
+    Index size;
+    Index count;
+    Index output_stride;
+    Index output_span;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams blockBroadcastingParams(TensorBlockDesc& desc) const {
+    BlockBroadcastingParams params;
+
+    params.input_dims = Dimensions(m_impl.dimensions());
+
+    // Output block sizes and strides.
+    params.output_dims = desc.dimensions();
+    params.output_strides = internal::strides<Layout>(params.output_dims);
+
+    // Find the broadcasting dimension (first dimension with output size smaller
+    // that the input size).
+    params.bcast_dim = 0;
+    params.bcast_dim_size = 1;
+    params.inner_dim_size = 1;
+
+    // Count the number of inner dimensions that have the same size in the block
+    // and in the broadcast expression.
+    params.inner_dim_count = 0;
+
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+
+      if (params.output_dims[dim] == m_dimensions[dim]) {
+        params.inner_dim_size *= params.output_dims[dim];
+        ++params.inner_dim_count;
+        continue;
+      }
+
+      // First non-matching dimension is the broadcasting dimension.
+      eigen_assert(params.output_dims[dim] < m_dimensions[dim]);
+      params.bcast_dim = dim;
+      params.bcast_dim_size = params.output_dims[dim];
+      break;
+    }
+
+    // Calculate the input block size for looking into the input.
+    for (int i = 0; i < params.inner_dim_count; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      params.input_block_sizes[dim] = params.input_dims[dim];
+    }
+    for (int i = params.inner_dim_count; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      params.input_block_sizes[dim] = 1;
+    }
+    params.input_block_strides = internal::strides<Layout>(params.input_block_sizes);
+
+    // Broadcast with the 0-stride trick: Create 1 extra dim for each
+    // broadcast, set the input stride to 0.
+    //
+    // When ColMajor:
+    //
+    // - bcast_block_sizes:
+    //   [d_0, b_0, d_1, b_1, ...]
+    //
+    // - bcast_block_strides:
+    //   [output_block_strides[0], output_block_strides[0] * d_0,
+    //    output_block_strides[1], output_block_strides[1] * d_1,
+    //   ...]
+    //
+    // - bcast_input_strides:
+    //   [input_block_strides[0], 0,
+    //    input_block_strides[1], 0,
+    //   ...].
+    //
+    for (int i = 0; i < params.inner_dim_count; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+
+      const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1;
+      const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1;
+
+      params.bcast_block_sizes[copy_dim] = params.input_dims[dim];
+      params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim];
+      params.bcast_block_strides[copy_dim] = params.output_strides[dim];
+      params.bcast_block_strides[broadcast_dim] = params.output_strides[dim] * params.input_dims[dim];
+      params.bcast_input_strides[copy_dim] = params.input_block_strides[dim];
+      params.bcast_input_strides[broadcast_dim] = 0;
+    }
+
+    for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) {
+      const int dim = IsColMajor ? i : 2 * NumDims - i - 1;
+      params.bcast_block_sizes[dim] = 1;
+      params.bcast_block_strides[dim] = 0;
+      params.bcast_input_strides[dim] = 0;
+    }
+
+    return params;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const {
+    DSizes<Index, NumDims> dimensions;
+    for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
+    return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim(
+      BlockBroadcastingParams params, Index bcast_offset, TensorBlockScratch& scratch,
+      ScalarNoConst* materialized_output, ScalarNoConst** materialized_input, size_t* materialized_input_size) const {
+    if (params.bcast_dim_size == 1) {
+      // We just need one block read using the ready-set values above.
+      return BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes,
+                            params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch,
+                            materialized_output, materialized_input, materialized_input_size);
+
+    } else if (params.input_dims[params.bcast_dim] == 1) {
+      // Broadcast bcast dimension (< NumDims) by bcast_dim_size.
+      const int broadcast_bcast_dim =
+          IsColMajor ? 2 * params.inner_dim_count + 1 : 2 * NumDims - 2 * params.inner_dim_count - 2;
+
+      params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size;
+      params.bcast_input_strides[broadcast_bcast_dim] = 0;
+      params.bcast_block_strides[broadcast_bcast_dim] = params.output_strides[params.bcast_dim];
+
+      return BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes,
+                            params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch,
+                            materialized_output, materialized_input, materialized_input_size);
+
+    } else {
+      // Keep track of the total number of the coefficients written to the
+      // output block.
+      Index num_output_coeffs = 0;
+
+      // The general case. Let's denote the output block as
+      //
+      //   x[..., a:a+bcast_dim_size, :, ..., :]
+      //
+      // where a:a+bcast_dim_size is a slice on the bcast_dim dimension
+      // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3
+      // sub-blocks:
+      //
+      // (1) a:b, where b is the smallest multiple of
+      //     input_dims[bcast_dim_start] in [a, a+bcast_dim_size].
+      //
+      // (2) b:c, where c is the largest multiple of input_dims[bcast_dim_start]
+      //     in [a, a+bcast_dim_size].
+      //
+      // (3) c:a+bcast_dim_size .
+      //
+      // Or, when b and c do not exist, we just need to process the whole block
+      // together.
+
+      // Find a.
+      const Index bcast_dim_left_index = bcast_offset / m_outputStrides[params.bcast_dim];
+
+      // Find b and c.
+      const Index input_bcast_dim_size = params.input_dims[params.bcast_dim];
+
+      // First multiple after a. This is b when <= bcast_dim_left_index +
+      // bcast_dim_size.
+      const Index first_multiple =
+          numext::div_ceil<Index>(bcast_dim_left_index, input_bcast_dim_size) * input_bcast_dim_size;
+
+      if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
+        // b exists, so does c. Find it.
+        const Index last_multiple =
+            (bcast_dim_left_index + params.bcast_dim_size) / input_bcast_dim_size * input_bcast_dim_size;
+        const int copy_bcast_dim =
+            IsColMajor ? 2 * params.inner_dim_count : 2 * NumDims - 2 * params.inner_dim_count - 1;
+        const int broadcast_bcast_dim =
+            IsColMajor ? 2 * params.inner_dim_count + 1 : 2 * NumDims - 2 * params.inner_dim_count - 2;
+
+        if (first_multiple > bcast_dim_left_index) {
+          const Index head_size = first_multiple - bcast_dim_left_index;
+          params.input_block_sizes[params.bcast_dim] = head_size;
+          params.bcast_block_sizes[copy_bcast_dim] = head_size;
+          params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim];
+          params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim];
+          params.bcast_block_sizes[broadcast_bcast_dim] = 1;
+          params.bcast_input_strides[broadcast_bcast_dim] = 0;
+          params.bcast_block_strides[broadcast_bcast_dim] =
+              params.output_strides[params.bcast_dim] * params.input_dims[params.bcast_dim];
+
+          num_output_coeffs +=
+              BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes,
+                             params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch,
+                             materialized_output, materialized_input, materialized_input_size);
+        }
+        if (first_multiple < last_multiple) {
+          params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size;
+          params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size;
+          params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim];
+          params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim];
+          params.bcast_block_sizes[broadcast_bcast_dim] = (last_multiple - first_multiple) / input_bcast_dim_size;
+          params.bcast_input_strides[broadcast_bcast_dim] = 0;
+          params.bcast_block_strides[broadcast_bcast_dim] =
+              params.output_strides[params.bcast_dim] * params.input_dims[params.bcast_dim];
+          const Index offset = (first_multiple - bcast_dim_left_index) * m_outputStrides[params.bcast_dim];
+
+          num_output_coeffs +=
+              BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes,
+                             params.bcast_block_strides, params.bcast_input_strides, bcast_offset, offset, scratch,
+                             materialized_output, materialized_input, materialized_input_size);
+        }
+        if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) {
+          const Index tail_size = bcast_dim_left_index + params.bcast_dim_size - last_multiple;
+          params.input_block_sizes[params.bcast_dim] = tail_size;
+          params.bcast_block_sizes[copy_bcast_dim] = tail_size;
+          params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim];
+          params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim];
+          params.bcast_block_sizes[broadcast_bcast_dim] = 1;
+          params.bcast_input_strides[broadcast_bcast_dim] = 0;
+          params.bcast_block_strides[broadcast_bcast_dim] =
+              params.output_strides[params.bcast_dim] * params.input_dims[params.bcast_dim];
+          const Index offset = (last_multiple - bcast_dim_left_index) * m_outputStrides[params.bcast_dim];
+
+          num_output_coeffs +=
+              BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes,
+                             params.bcast_block_strides, params.bcast_input_strides, bcast_offset, offset, scratch,
+                             materialized_output, materialized_input, materialized_input_size);
+        }
+      } else {
+        // b and c do not exist.
+        const int copy_bcast_dim =
+            IsColMajor ? 2 * params.inner_dim_count : 2 * NumDims - 2 * params.inner_dim_count - 1;
+        params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size;
+        params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size;
+        params.bcast_input_strides[copy_bcast_dim] = params.input_block_strides[params.bcast_dim];
+        params.bcast_block_strides[copy_bcast_dim] = params.output_strides[params.bcast_dim];
+
+        num_output_coeffs +=
+            BroadcastBlock(params.input_block_sizes, params.input_block_strides, params.bcast_block_sizes,
+                           params.bcast_block_strides, params.bcast_input_strides, bcast_offset, 0, scratch,
+                           materialized_output, materialized_input, materialized_input_size);
+      }
+
+      return num_output_coeffs;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock(
+      const Dimensions& input_block_sizes, const Dimensions& input_block_strides,
+      const BroadcastDimensions& bcast_block_sizes, const BroadcastDimensions& bcast_block_strides,
+      const BroadcastDimensions& bcast_input_strides, Index bcast_offset, Index offset, TensorBlockScratch& scratch,
+      ScalarNoConst* materialized_output, ScalarNoConst** materialized_input, size_t* materialized_input_size) const {
+    // ---------------------------------------------------------------------- //
+    // Tensor block descriptor for reading block from the input.
+    const Index input_offset = bcast_offset + offset;
+    TensorBlockDesc input_desc(IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset),
+                               input_block_sizes);
+
+    ArgTensorBlock input_block = m_impl.block(input_desc, scratch);
+
+    // ---------------------------------------------------------------------- //
+    // Materialize input block into a temporary memory buffer only if it's not
+    // already available in the arg block.
+    const ScalarNoConst* input_buffer = NULL;
+
+    if (input_block.data() != NULL) {
+      // Input block already has raw data, there is no need to materialize it.
+      input_buffer = input_block.data();
+
+    } else {
+      // Otherwise we have to do block assignment into a temporary buffer.
+
+      // Maybe reuse previously allocated buffer, or allocate a new one with a
+      // scratch allocator.
+      const size_t input_total_size = input_block_sizes.TotalSize();
+      if (*materialized_input == NULL || *materialized_input_size < input_total_size) {
+        *materialized_input_size = input_total_size;
+        void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
+        *materialized_input = static_cast<ScalarNoConst*>(mem);
+      }
+
+      typedef internal::TensorBlockAssignment<ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
+          TensorBlockAssignment;
+
+      TensorBlockAssignment::Run(
+          TensorBlockAssignment::target(input_block_sizes, input_block_strides, *materialized_input),
+          input_block.expr());
+
+      input_buffer = *materialized_input;
+    }
+
+    // ---------------------------------------------------------------------- //
+    // Copy data from materialized input block to the materialized output, using
+    // given broadcast strides (strides with zeroes).
+    typedef internal::TensorBlockIO<ScalarNoConst, Index, 2 * NumDims, Layout> TensorBlockIO;
+
+    typename TensorBlockIO::Src src(bcast_input_strides, input_buffer);
+    typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides, materialized_output + offset);
+
+    return TensorBlockIO::Copy(dst, src);
+  }
+
+ protected:
+  const Device EIGEN_DEVICE_REF m_device;
+  const std::remove_reference_t<Broadcast> m_broadcast;
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_outputStrides;
+  array<Index, NumDims> m_inputStrides;
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
new file mode 100644
index 00000000..977f96fe
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h
@@ -0,0 +1,469 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <DenseIndex DimId, typename XprType>
+struct traits<TensorChippingOp<DimId, XprType> > : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions - 1;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+};
+
+template <DenseIndex DimId, typename XprType>
+struct eval<TensorChippingOp<DimId, XprType>, Eigen::Dense> {
+  typedef const TensorChippingOp<DimId, XprType> EIGEN_DEVICE_REF type;
+};
+
+template <DenseIndex DimId, typename XprType>
+struct nested<TensorChippingOp<DimId, XprType>, 1, typename eval<TensorChippingOp<DimId, XprType> >::type> {
+  typedef TensorChippingOp<DimId, XprType> type;
+};
+
+template <DenseIndex DimId>
+struct DimensionId {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) {
+    EIGEN_UNUSED_VARIABLE(dim);
+    eigen_assert(dim == DimId);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { return DimId; }
+};
+template <>
+struct DimensionId<Dynamic> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DimensionId(DenseIndex dim) : actual_dim(dim) { eigen_assert(dim >= 0); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex actualDim() const { return actual_dim; }
+
+ private:
+  const DenseIndex actual_dim;
+};
+
+}  // end namespace internal
+
+/** A chip is a thin slice, corresponding to a column or a row in a 2-d tensor.
+ * \ingroup CXX11_Tensor_Module
+ */
+template <DenseIndex DimId, typename XprType>
+class TensorChippingOp : public TensorBase<TensorChippingOp<DimId, XprType> > {
+ public:
+  typedef TensorBase<TensorChippingOp<DimId, XprType> > Base;
+  typedef typename Eigen::internal::traits<TensorChippingOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorChippingOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorChippingOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorChippingOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorChippingOp(const XprType& expr, const Index offset, const Index dim)
+      : m_xpr(expr), m_offset(offset), m_dim(dim) {
+    eigen_assert(dim < XprType::NumDimensions && dim >= 0 && "Chip_Dim_out_of_range");
+  }
+
+  EIGEN_DEVICE_FUNC const Index offset() const { return m_offset; }
+  EIGEN_DEVICE_FUNC const Index dim() const { return m_dim.actualDim(); }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+  EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorChippingOp)
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const Index m_offset;
+  const internal::DimensionId<DimId> m_dim;
+};
+
+// Eval as rvalue
+template <DenseIndex DimId, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> {
+  typedef TensorChippingOp<DimId, ArgType> XprType;
+  static constexpr int NumInputDims =
+      internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  static constexpr int NumDims = NumInputDims - 1;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+
+  enum {
+    // Alignment can't be guaranteed at compile time since it depends on the
+    // slice offsets.
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+    // Chipping of outer-most dimension is a trivial operation, because we can
+    // read and write directly from the underlying tensor using single offset.
+    IsOuterChipping = (Layout == ColMajor && DimId == NumInputDims - 1) || (Layout == RowMajor && DimId == 0),
+    // Chipping inner-most dimension.
+    IsInnerChipping = (Layout == ColMajor && DimId == 0) || (Layout == RowMajor && DimId == NumInputDims - 1),
+    // Prefer block access if the underlying expression prefers it, otherwise
+    // only if chipping is not trivial.
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess || !IsOuterChipping,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef internal::TensorBlockDescriptor<NumInputDims, Index> ArgTensorBlockDesc;
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock ArgTensorBlock;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims, Layout, Index> TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) {
+    EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    eigen_assert(NumInputDims > m_dim.actualDim());
+
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    eigen_assert(op.offset() < input_dims[m_dim.actualDim()]);
+
+    int j = 0;
+    for (int i = 0; i < NumInputDims; ++i) {
+      if (i != m_dim.actualDim()) {
+        m_dimensions[j] = input_dims[i];
+        ++j;
+      }
+    }
+
+    m_stride = 1;
+    m_inputStride = 1;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < m_dim.actualDim(); ++i) {
+        m_stride *= input_dims[i];
+        m_inputStride *= input_dims[i];
+      }
+    } else {
+      for (int i = NumInputDims - 1; i > m_dim.actualDim(); --i) {
+        m_stride *= input_dims[i];
+        m_inputStride *= input_dims[i];
+      }
+    }
+    m_inputStride *= input_dims[m_dim.actualDim()];
+    m_inputOffset = m_stride * op.offset();
+
+    // Check if chipping is effectively inner or outer: products of dimensions
+    // before or after the chipped dimension is `1`.
+    Index after_chipped_dim_product = 1;
+    for (int i = static_cast<int>(m_dim.actualDim()) + 1; i < NumInputDims; ++i) {
+      after_chipped_dim_product *= input_dims[i];
+    }
+
+    Index before_chipped_dim_product = 1;
+    for (int i = 0; i < m_dim.actualDim(); ++i) {
+      before_chipped_dim_product *= input_dims[i];
+    }
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_isEffectivelyInnerChipping = before_chipped_dim_product == 1;
+      m_isEffectivelyOuterChipping = after_chipped_dim_product == 1;
+    } else {
+      m_isEffectivelyInnerChipping = after_chipped_dim_product == 1;
+      m_isEffectivelyOuterChipping = before_chipped_dim_product == 1;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType /*data*/, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return m_impl.coeff(srcCoeff(index));
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    if (isInnerChipping()) {
+      // m_stride is equal to 1, so let's avoid the integer division.
+      eigen_assert(m_stride == 1);
+      Index inputIndex = index * m_inputStride + m_inputOffset;
+      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < PacketSize; ++i) {
+        values[i] = m_impl.coeff(inputIndex);
+        inputIndex += m_inputStride;
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    } else if (isOuterChipping()) {
+      // m_stride is always greater than index, so let's avoid the integer division.
+      eigen_assert(m_stride > index);
+      return m_impl.template packet<LoadMode>(index + m_inputOffset);
+    } else {
+      const Index idx = index / m_stride;
+      const Index rem = index - idx * m_stride;
+      if (rem + PacketSize <= m_stride) {
+        Index inputIndex = idx * m_inputStride + m_inputOffset + rem;
+        return m_impl.template packet<LoadMode>(inputIndex);
+      } else {
+        // Cross the stride boundary. Fallback to slow path.
+        EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+        EIGEN_UNROLL_LOOP
+        for (int i = 0; i < PacketSize; ++i) {
+          values[i] = coeff(index);
+          ++index;
+        }
+        PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+        return rslt;
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    double cost = 0;
+    if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) ||
+        (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == NumInputDims - 1)) {
+      cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
+    } else if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == NumInputDims - 1) ||
+               (static_cast<int>(Layout) == static_cast<int>(RowMajor) && m_dim.actualDim() == 0)) {
+      cost += TensorOpCost::AddCost<Index>();
+    } else {
+      cost += 3 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>() + 3 * TensorOpCost::AddCost<Index>();
+    }
+
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.lastLevelCacheSize();
+    return internal::TensorBlockResourceRequirements::merge(
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size), m_impl.getResourceRequirements());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool root_of_expr_ast = false) const {
+    const Index chip_dim = m_dim.actualDim();
+
+    DSizes<Index, NumInputDims> input_block_dims;
+    for (int i = 0; i < NumInputDims; ++i) {
+      input_block_dims[i] = i < chip_dim ? desc.dimension(i) : i > chip_dim ? desc.dimension(i - 1) : 1;
+    }
+
+    ArgTensorBlockDesc arg_desc(srcCoeff(desc.offset()), input_block_dims);
+
+    // Try to reuse destination buffer for materializing argument block.
+    if (desc.HasDestinationBuffer()) {
+      DSizes<Index, NumInputDims> arg_destination_strides;
+      for (int i = 0; i < NumInputDims; ++i) {
+        arg_destination_strides[i] = i < chip_dim   ? desc.destination().strides()[i]
+                                     : i > chip_dim ? desc.destination().strides()[i - 1]
+                                                    : 0;  // for dimensions of size `1` stride should never be used.
+      }
+
+      arg_desc.template AddDestinationBuffer<Layout>(desc.destination().template data<ScalarNoConst>(),
+                                                     arg_destination_strides);
+    }
+
+    ArgTensorBlock arg_block = m_impl.block(arg_desc, scratch, root_of_expr_ast);
+    if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
+
+    if (arg_block.data() != NULL) {
+      // Forward argument block buffer if possible.
+      return TensorBlock(arg_block.kind(), arg_block.data(), desc.dimensions());
+
+    } else {
+      // Assign argument block expression to a buffer.
+
+      // Prepare storage for the materialized chipping result.
+      const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch);
+
+      typedef internal::TensorBlockAssignment<ScalarNoConst, NumInputDims, typename ArgTensorBlock::XprType, Index>
+          TensorBlockAssignment;
+
+      TensorBlockAssignment::Run(
+          TensorBlockAssignment::target(arg_desc.dimensions(), internal::strides<Layout>(arg_desc.dimensions()),
+                                        block_storage.data()),
+          arg_block.expr());
+
+      return block_storage.AsTensorMaterializedBlock();
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
+    typename Storage::Type result = constCast(m_impl.data());
+    if (isOuterChipping() && result) {
+      return result + m_inputOffset;
+    } else {
+      return NULL;
+    }
+  }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
+    Index inputIndex;
+    if (isInnerChipping()) {
+      // m_stride is equal to 1, so let's avoid the integer division.
+      eigen_assert(m_stride == 1);
+      inputIndex = index * m_inputStride + m_inputOffset;
+    } else if (isOuterChipping()) {
+      // m_stride is always greater than index, so let's avoid the integer
+      // division.
+      eigen_assert(m_stride > index);
+      inputIndex = index + m_inputOffset;
+    } else {
+      const Index idx = index / m_stride;
+      inputIndex = idx * m_inputStride + m_inputOffset;
+      index -= idx * m_stride;
+      inputIndex += index;
+    }
+    return inputIndex;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isInnerChipping() const {
+    return IsInnerChipping || m_isEffectivelyInnerChipping;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool isOuterChipping() const {
+    return IsOuterChipping || m_isEffectivelyOuterChipping;
+  }
+
+  Dimensions m_dimensions;
+  Index m_stride;
+  Index m_inputOffset;
+  Index m_inputStride;
+  TensorEvaluator<ArgType, Device> m_impl;
+  const internal::DimensionId<DimId> m_dim;
+  const Device EIGEN_DEVICE_REF m_device;
+
+  // If product of all dimensions after or before the chipped dimension is `1`,
+  // it is effectively the same as chipping innermost or outermost dimension.
+  bool m_isEffectivelyInnerChipping;
+  bool m_isEffectivelyOuterChipping;
+};
+
+// Eval as lvalue
+template <DenseIndex DimId, typename ArgType, typename Device>
+struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device>
+    : public TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> {
+  typedef TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> Base;
+  typedef TensorChippingOp<DimId, ArgType> XprType;
+  static constexpr int NumInputDims =
+      internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  static constexpr int NumDims = NumInputDims - 1;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
+    Layout = TensorEvaluator<ArgType, Device>::Layout,
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const {
+    return this->m_impl.coeffRef(this->srcCoeff(index));
+  }
+
+  template <int StoreMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const {
+    if (this->isInnerChipping()) {
+      // m_stride is equal to 1, so let's avoid the integer division.
+      eigen_assert(this->m_stride == 1);
+      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+      internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+      Index inputIndex = index * this->m_inputStride + this->m_inputOffset;
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < PacketSize; ++i) {
+        this->m_impl.coeffRef(inputIndex) = values[i];
+        inputIndex += this->m_inputStride;
+      }
+    } else if (this->isOuterChipping()) {
+      // m_stride is always greater than index, so let's avoid the integer division.
+      eigen_assert(this->m_stride > index);
+      this->m_impl.template writePacket<StoreMode>(index + this->m_inputOffset, x);
+    } else {
+      const Index idx = index / this->m_stride;
+      const Index rem = index - idx * this->m_stride;
+      if (rem + PacketSize <= this->m_stride) {
+        const Index inputIndex = idx * this->m_inputStride + this->m_inputOffset + rem;
+        this->m_impl.template writePacket<StoreMode>(inputIndex, x);
+      } else {
+        // Cross stride boundary. Fallback to slow path.
+        EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+        internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+        EIGEN_UNROLL_LOOP
+        for (int i = 0; i < PacketSize; ++i) {
+          this->coeffRef(index) = values[i];
+          ++index;
+        }
+      }
+    }
+  }
+
+  template <typename TensorBlock>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) {
+    eigen_assert(this->m_impl.data() != NULL);
+
+    const Index chip_dim = this->m_dim.actualDim();
+
+    DSizes<Index, NumInputDims> input_block_dims;
+    for (int i = 0; i < NumInputDims; ++i) {
+      input_block_dims[i] = i < chip_dim ? desc.dimension(i) : i > chip_dim ? desc.dimension(i - 1) : 1;
+    }
+
+    typedef TensorReshapingOp<const DSizes<Index, NumInputDims>, const typename TensorBlock::XprType> TensorBlockExpr;
+
+    typedef internal::TensorBlockAssignment<Scalar, NumInputDims, TensorBlockExpr, Index> TensorBlockAssign;
+
+    TensorBlockAssign::Run(
+        TensorBlockAssign::target(input_block_dims, internal::strides<Layout>(this->m_impl.dimensions()),
+                                  this->m_impl.data(), this->srcCoeff(desc.offset())),
+        block.expr().reshape(input_block_dims));
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CHIPPING_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
new file mode 100644
index 00000000..0203f01e
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h
@@ -0,0 +1,352 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename Axis, typename LhsXprType, typename RhsXprType>
+struct traits<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> > {
+  // Type promotion to handle the case where the types of the lhs and the rhs are different.
+  typedef typename promote_storage_type<typename LhsXprType::Scalar, typename RhsXprType::Scalar>::ret Scalar;
+  typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
+                                        typename traits<RhsXprType>::StorageKind>::ret StorageKind;
+  typedef
+      typename promote_index_type<typename traits<LhsXprType>::Index, typename traits<RhsXprType>::Index>::type Index;
+  typedef typename LhsXprType::Nested LhsNested;
+  typedef typename RhsXprType::Nested RhsNested;
+  typedef std::remove_reference_t<LhsNested> LhsNested_;
+  typedef std::remove_reference_t<RhsNested> RhsNested_;
+  static constexpr int NumDimensions = traits<LhsXprType>::NumDimensions;
+  static constexpr int Layout = traits<LhsXprType>::Layout;
+  enum { Flags = 0 };
+  typedef std::conditional_t<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+                             typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>
+      PointerType;
+};
+
+template <typename Axis, typename LhsXprType, typename RhsXprType>
+struct eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, Eigen::Dense> {
+  typedef const TensorConcatenationOp<Axis, LhsXprType, RhsXprType>& type;
+};
+
+template <typename Axis, typename LhsXprType, typename RhsXprType>
+struct nested<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, 1,
+              typename eval<TensorConcatenationOp<Axis, LhsXprType, RhsXprType> >::type> {
+  typedef TensorConcatenationOp<Axis, LhsXprType, RhsXprType> type;
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor concatenation class.
+ */
+template <typename Axis, typename LhsXprType, typename RhsXprType>
+class TensorConcatenationOp : public TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors> {
+ public:
+  typedef TensorBase<TensorConcatenationOp<Axis, LhsXprType, RhsXprType>, WriteAccessors> Base;
+  typedef typename internal::traits<TensorConcatenationOp>::Scalar Scalar;
+  typedef typename internal::traits<TensorConcatenationOp>::StorageKind StorageKind;
+  typedef typename internal::traits<TensorConcatenationOp>::Index Index;
+  typedef typename internal::nested<TensorConcatenationOp>::type Nested;
+  typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
+                                                  typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConcatenationOp(const LhsXprType& lhs, const RhsXprType& rhs, Axis axis)
+      : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_axis(axis) {}
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename LhsXprType::Nested>& lhsExpression() const {
+    return m_lhs_xpr;
+  }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename RhsXprType::Nested>& rhsExpression() const {
+    return m_rhs_xpr;
+  }
+
+  EIGEN_DEVICE_FUNC const Axis& axis() const { return m_axis; }
+
+  EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorConcatenationOp)
+ protected:
+  typename LhsXprType::Nested m_lhs_xpr;
+  typename RhsXprType::Nested m_rhs_xpr;
+  const Axis m_axis;
+};
+
+// Eval as rvalue
+template <typename Axis, typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> {
+  typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType;
+  typedef typename XprType::Index Index;
+  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
+  static constexpr int RightNumDims =
+      internal::array_size<typename TensorEvaluator<RightArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  static constexpr int Layout = TensorEvaluator<LeftArgType, Device>::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess =
+        TensorEvaluator<LeftArgType, Device>::PacketAccess && TensorEvaluator<RightArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
+                        TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
+                             static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) ||
+                         NumDims == 1),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    eigen_assert(0 <= m_axis && m_axis < NumDims);
+    const Dimensions& lhs_dims = m_leftImpl.dimensions();
+    const Dimensions& rhs_dims = m_rightImpl.dimensions();
+    {
+      int i = 0;
+      for (; i < m_axis; ++i) {
+        eigen_assert(lhs_dims[i] > 0);
+        eigen_assert(lhs_dims[i] == rhs_dims[i]);
+        m_dimensions[i] = lhs_dims[i];
+      }
+      eigen_assert(lhs_dims[i] > 0);  // Now i == m_axis.
+      eigen_assert(rhs_dims[i] > 0);
+      m_dimensions[i] = lhs_dims[i] + rhs_dims[i];
+      for (++i; i < NumDims; ++i) {
+        eigen_assert(lhs_dims[i] > 0);
+        eigen_assert(lhs_dims[i] == rhs_dims[i]);
+        m_dimensions[i] = lhs_dims[i];
+      }
+    }
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_leftStrides[0] = 1;
+      m_rightStrides[0] = 1;
+      m_outputStrides[0] = 1;
+
+      for (int j = 1; j < NumDims; ++j) {
+        m_leftStrides[j] = m_leftStrides[j - 1] * lhs_dims[j - 1];
+        m_rightStrides[j] = m_rightStrides[j - 1] * rhs_dims[j - 1];
+        m_outputStrides[j] = m_outputStrides[j - 1] * m_dimensions[j - 1];
+      }
+    } else {
+      m_leftStrides[NumDims - 1] = 1;
+      m_rightStrides[NumDims - 1] = 1;
+      m_outputStrides[NumDims - 1] = 1;
+
+      for (int j = NumDims - 2; j >= 0; --j) {
+        m_leftStrides[j] = m_leftStrides[j + 1] * lhs_dims[j + 1];
+        m_rightStrides[j] = m_rightStrides[j + 1] * rhs_dims[j + 1];
+        m_outputStrides[j] = m_outputStrides[j + 1] * m_dimensions[j + 1];
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  // TODO(phli): Add short-circuit memcpy evaluation if underlying data are linear?
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+    m_leftImpl.evalSubExprsIfNeeded(NULL);
+    m_rightImpl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_leftImpl.cleanup();
+    m_rightImpl.cleanup();
+  }
+
+  // TODO(phli): attempt to speed this up. The integer divisions and modulo are slow.
+  // See CL/76180724 comments for more ideas.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    // Collect dimension-wise indices (subs).
+    array<Index, NumDims> subs;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        subs[i] = index / m_outputStrides[i];
+        index -= subs[i] * m_outputStrides[i];
+      }
+      subs[0] = index;
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        subs[i] = index / m_outputStrides[i];
+        index -= subs[i] * m_outputStrides[i];
+      }
+      subs[NumDims - 1] = index;
+    }
+
+    const Dimensions& left_dims = m_leftImpl.dimensions();
+    if (subs[m_axis] < left_dims[m_axis]) {
+      Index left_index;
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        left_index = subs[0];
+        EIGEN_UNROLL_LOOP
+        for (int i = 1; i < NumDims; ++i) {
+          left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
+        }
+      } else {
+        left_index = subs[NumDims - 1];
+        EIGEN_UNROLL_LOOP
+        for (int i = NumDims - 2; i >= 0; --i) {
+          left_index += (subs[i] % left_dims[i]) * m_leftStrides[i];
+        }
+      }
+      return m_leftImpl.coeff(left_index);
+    } else {
+      subs[m_axis] -= left_dims[m_axis];
+      const Dimensions& right_dims = m_rightImpl.dimensions();
+      Index right_index;
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        right_index = subs[0];
+        EIGEN_UNROLL_LOOP
+        for (int i = 1; i < NumDims; ++i) {
+          right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
+        }
+      } else {
+        right_index = subs[NumDims - 1];
+        EIGEN_UNROLL_LOOP
+        for (int i = NumDims - 2; i >= 0; --i) {
+          right_index += (subs[i] % right_dims[i]) * m_rightStrides[i];
+        }
+      }
+      return m_rightImpl.coeff(right_index);
+    }
+  }
+
+  // TODO(phli): Add a real vectorization.
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
+    EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < packetSize; ++i) {
+      values[i] = coeff(index + i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    const double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+                                           TensorOpCost::DivCost<Index>() + TensorOpCost::ModCost<Index>());
+    const double lhs_size = m_leftImpl.dimensions().TotalSize();
+    const double rhs_size = m_rightImpl.dimensions().TotalSize();
+    return (lhs_size / (lhs_size + rhs_size)) * m_leftImpl.costPerCoeff(vectorized) +
+           (rhs_size / (lhs_size + rhs_size)) * m_rightImpl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ protected:
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_outputStrides;
+  array<Index, NumDims> m_leftStrides;
+  array<Index, NumDims> m_rightStrides;
+  TensorEvaluator<LeftArgType, Device> m_leftImpl;
+  TensorEvaluator<RightArgType, Device> m_rightImpl;
+  const Axis m_axis;
+};
+
+// Eval as lvalue
+template <typename Axis, typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device>
+    : public TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> {
+  typedef TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgType>, Device> Base;
+  typedef TensorConcatenationOp<Axis, LeftArgType, RightArgType> XprType;
+  typedef typename Base::Dimensions Dimensions;
+  static constexpr int Layout = TensorEvaluator<LeftArgType, Device>::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess =
+        TensorEvaluator<LeftArgType, Device>::PacketAccess && TensorEvaluator<RightArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<LeftArgType, Device>::PreferBlockAccess ||
+                        TensorEvaluator<RightArgType, Device>::PreferBlockAccess,
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(XprType& op, const Device& device) : Base(op, device) {
+    EIGEN_STATIC_ASSERT((static_cast<int>(Layout) == static_cast<int>(ColMajor)), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const {
+    // Collect dimension-wise indices (subs).
+    array<Index, Base::NumDims> subs;
+    for (int i = Base::NumDims - 1; i > 0; --i) {
+      subs[i] = index / this->m_outputStrides[i];
+      index -= subs[i] * this->m_outputStrides[i];
+    }
+    subs[0] = index;
+
+    const Dimensions& left_dims = this->m_leftImpl.dimensions();
+    if (subs[this->m_axis] < left_dims[this->m_axis]) {
+      Index left_index = subs[0];
+      for (int i = 1; i < Base::NumDims; ++i) {
+        left_index += (subs[i] % left_dims[i]) * this->m_leftStrides[i];
+      }
+      return this->m_leftImpl.coeffRef(left_index);
+    } else {
+      subs[this->m_axis] -= left_dims[this->m_axis];
+      const Dimensions& right_dims = this->m_rightImpl.dimensions();
+      Index right_index = subs[0];
+      for (int i = 1; i < Base::NumDims; ++i) {
+        right_index += (subs[i] % right_dims[i]) * this->m_rightStrides[i];
+      }
+      return this->m_rightImpl.coeffRef(right_index);
+    }
+  }
+
+  template <int StoreMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const {
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
+    EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
+    internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+    for (int i = 0; i < packetSize; ++i) {
+      coeffRef(index + i) = values[i];
+    }
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONCATENATION_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
new file mode 100644
index 00000000..97e7da3b
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -0,0 +1,958 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
+struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>> {
+  // Type promotion to handle the case where the types of the lhs and the rhs are different.
+  typedef typename gebp_traits<std::remove_const_t<typename LhsXprType::Scalar>,
+                               std::remove_const_t<typename RhsXprType::Scalar>>::ResScalar Scalar;
+
+  typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
+                                        typename traits<RhsXprType>::StorageKind>::ret StorageKind;
+  typedef
+      typename promote_index_type<typename traits<LhsXprType>::Index, typename traits<RhsXprType>::Index>::type Index;
+  typedef typename LhsXprType::Nested LhsNested;
+  typedef typename RhsXprType::Nested RhsNested;
+  typedef std::remove_reference_t<LhsNested> LhsNested_;
+  typedef std::remove_reference_t<RhsNested> RhsNested_;
+
+  // From NumDims below.
+  static constexpr int NumDimensions =
+      traits<LhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value;
+  static constexpr int Layout = traits<LhsXprType>::Layout;
+  typedef std::conditional_t<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+                             typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>
+      PointerType;
+
+  enum { Flags = 0 };
+};
+
+template <typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
+struct eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>, Eigen::Dense> {
+  typedef const TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>& type;
+};
+
+template <typename Dimensions, typename LhsXprType, typename RhsXprType, typename OutputKernelType>
+struct nested<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>, 1,
+              typename eval<TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType>>::type> {
+  typedef TensorContractionOp<Dimensions, LhsXprType, RhsXprType, OutputKernelType> type;
+};
+
+template <typename Indices_, typename LeftArgType_, typename RightArgType_, typename OutputKernelType_,
+          typename Device_>
+struct traits<
+    TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, RightArgType_, OutputKernelType_>, Device_>> {
+  typedef Indices_ Indices;
+  typedef LeftArgType_ LeftArgType;
+  typedef RightArgType_ RightArgType;
+  typedef OutputKernelType_ OutputKernelType;
+  typedef Device_ Device;
+
+  // From NumDims below.
+  static constexpr int NumDimensions =
+      traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value;
+};
+
+// Helper class to allocate and deallocate temporary memory for packed buffers.
+template <typename LhsScalar, typename RhsScalar>
+struct TensorContractionBlockMemAllocator {
+  typedef void* BlockMemHandle;
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC static BlockMemHandle allocate(Device& d, const Index bm, const Index bk, const Index bn,
+                                                   LhsScalar** lhs_block, RhsScalar** rhs_block) {
+    eigen_assert(lhs_block);
+    eigen_assert(rhs_block);
+    BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn);
+    char* block_mem = static_cast<char*>(d.allocate(sz.lhs_size + sz.rhs_size));
+    *lhs_block = static_cast<LhsScalar*>(static_cast<void*>(block_mem));
+    *rhs_block = static_cast<RhsScalar*>(static_cast<void*>(block_mem + sz.lhs_size));
+    return block_mem;
+  }
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC static BlockMemHandle allocateSlices(Device& d, const Index bm, const Index bk, const Index bn,
+                                                         const Index num_lhs, const Index num_rhs,
+                                                         const Index num_slices, std::vector<LhsScalar*>* lhs_blocks,
+                                                         std::vector<RhsScalar*>* rhs_blocks) {
+    eigen_assert(num_slices > 0);
+    eigen_assert(num_lhs >= 0 && num_rhs >= 0);
+    eigen_assert(num_lhs == 0 || lhs_blocks);
+    eigen_assert(num_rhs == 0 || rhs_blocks);
+    BlockSizes sz = ComputeLhsRhsBlockSizes(bm, bk, bn);
+    void* block_mem = d.allocate((num_lhs * sz.lhs_size + num_rhs * sz.rhs_size) * num_slices);
+    eigen_assert(block_mem);
+    char* mem = static_cast<char*>(block_mem);
+
+    for (Index x = 0; x < num_slices; x++) {
+      if (num_lhs > 0) lhs_blocks[x].resize(num_lhs);
+      for (Index m = 0; m < num_lhs; m++) {
+        lhs_blocks[x][m] = static_cast<LhsScalar*>(static_cast<void*>(mem));
+        mem += sz.lhs_size;
+      }
+      if (num_rhs > 0) rhs_blocks[x].resize(num_rhs);
+      for (Index n = 0; n < num_rhs; n++) {
+        rhs_blocks[x][n] = static_cast<RhsScalar*>(static_cast<void*>(mem));
+        mem += sz.rhs_size;
+      }
+    }
+
+    return block_mem;
+  }
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) {
+    d.deallocate(handle);
+  }
+
+ private:
+  struct BlockSizes {
+    Index lhs_size;
+    Index rhs_size;
+  };
+  EIGEN_DEVICE_FUNC static BlockSizes ComputeLhsRhsBlockSizes(const Index bm, const Index bk, const Index bn) {
+    Index align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
+    BlockSizes sz;
+    sz.lhs_size = numext::div_ceil<Index>(bm * bk * sizeof(LhsScalar), align) * align;
+    sz.rhs_size = numext::div_ceil<Index>(bn * bk * sizeof(RhsScalar), align) * align;
+    return sz;
+  }
+};
+
+// WARNING: In this code we assume that Lhs and Rhs tensor expressions are in
+// ColMajor storage order. This property is guaranteed by the
+// TensorContractionOp evaluator. TensorContractionKernel specifies how we pack
+// blocks of Lhs and Rhs tensor expressions, and how we invoke matrix
+// multiplication for these blocks. Default tensor contraction uses
+// gemm_pack_rhs, gemm_pack_lhs and gebp_kernel from Eigen Core (see
+// GeneralBlocPanelKernel.h for details).
+//
+// By specializing contraction kernels we can use other low level libraries to
+// perform matrix multiplication, and still rely on Eigen contraction evaluator.
+// This also includes full support in TensorContractionThreadPool, assuming that
+// underlying gemm do not use it's own threading.
+//
+// - ResScalar/LhsScalar/RhsScalar - scalar type for the result of
+//   multiplication, lhs tensor and rhs tensor respectively.
+//
+// - StorageIndex - index type for the tensor expressions. In practice almost
+//   always is Eigen::Index.
+//
+// - OutputMapper provides access to the memory of the output matrix. In
+//   practice it's always column major blas_data_mapper (it must be of ResScalar
+//   type).
+//
+// - LhsMapper/RhsMapper similarly to blas_data_mapper provide a two dimensional
+//   view into the Lhs/Rhs tensor expressions. In practice it's
+//   TensorContractionInputMapper, or some specialization of it based on the
+//   type of tensor expression (e.g. TensorImagePatchOp has optimized input
+//   mapper).
+template <typename ResScalar, typename LhsScalar, typename RhsScalar, typename StorageIndex, typename OutputMapper,
+          typename LhsMapper, typename RhsMapper>
+struct TensorContractionKernel {
+  // True if `invoke()` supports `beta` in `C <- alpha * A * B + beta * C`
+  // (otherwise beta should be always equal to 1).
+  enum { HasBeta = false };
+
+  EIGEN_DEVICE_FUNC TensorContractionKernel(StorageIndex m_, StorageIndex k_, StorageIndex n_, StorageIndex bm_,
+                                            StorageIndex bk_, StorageIndex bn_)
+      : m(m_), k(k_), n(n_), bm(bm_), bk(bk_), bn(bn_) {}
+
+  // Pack blocks of Lhs and Rhs into contiguous blocks in memory.
+  typedef LhsScalar* LhsBlock;
+  typedef RhsScalar* RhsBlock;
+
+  // Packed Lhs/Rhs block memory allocator.
+  typedef TensorContractionBlockMemAllocator<LhsScalar, RhsScalar> BlockMemAllocator;
+  typedef typename BlockMemAllocator::BlockMemHandle BlockMemHandle;
+
+  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+  typedef internal::gemm_pack_lhs<LhsScalar, StorageIndex, typename LhsMapper::SubMapper, Traits::mr,
+                                  Traits::LhsProgress, typename Traits::LhsPacket4Packing, ColMajor>
+      LhsPacker;
+
+  typedef internal::gemm_pack_rhs<RhsScalar, StorageIndex, typename RhsMapper::SubMapper, Traits::nr, ColMajor>
+      RhsPacker;
+
+  typedef internal::gebp_kernel<LhsScalar, RhsScalar, StorageIndex, OutputMapper, Traits::mr, Traits::nr,
+                                /*ConjugateLhs*/ false, /*ConjugateRhs*/ false>
+      GebpKernel;
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC BlockMemHandle allocate(Device& d, LhsBlock* lhs_block, RhsBlock* rhs_block) {
+    return BlockMemAllocator::allocate(d, bm, bk, bn, lhs_block, rhs_block);
+  }
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC BlockMemHandle allocateSlices(Device& d, const StorageIndex num_lhs, const StorageIndex num_rhs,
+                                                  const StorageIndex num_slices, std::vector<LhsBlock>* lhs_blocks,
+                                                  std::vector<RhsBlock>* rhs_blocks) {
+    return BlockMemAllocator::allocateSlices(d, bm, bk, bn, num_lhs, num_rhs, num_slices, lhs_blocks, rhs_blocks);
+  }
+
+  template <typename Device>
+  EIGEN_DEVICE_FUNC static void deallocate(Device& d, BlockMemHandle handle) {
+    BlockMemAllocator::deallocate(d, handle);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packLhs(LhsBlock* lhsBlock, const typename LhsMapper::SubMapper& data_mapper,
+                                                   const StorageIndex depth, const StorageIndex rows) {
+    LhsPacker()(*lhsBlock, data_mapper, depth, rows, /*stride*/ 0,
+                /*offset*/ 0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void packRhs(RhsBlock* rhsBlock, const typename RhsMapper::SubMapper& data_mapper,
+                                                   const StorageIndex depth, const StorageIndex cols) {
+    RhsPacker()(*rhsBlock, data_mapper, depth, cols);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_DONT_INLINE void invoke(const OutputMapper& output_mapper, const LhsBlock& lhsBlock,
+                                                  const RhsBlock& rhsBlock, const StorageIndex rows,
+                                                  const StorageIndex depth, const StorageIndex cols,
+                                                  const ResScalar alpha, const ResScalar beta) {
+    // Default GEBP kernel does not support beta.
+    eigen_assert(beta == ResScalar(1));
+    static const int kComputeStrideFromBlockDimensions = -1;
+    GebpKernel()(output_mapper, lhsBlock, rhsBlock, rows, depth, cols, alpha,
+                 /*strideA*/ kComputeStrideFromBlockDimensions,
+                 /*strideB*/ kComputeStrideFromBlockDimensions,
+                 /*offsetA*/ 0, /*offsetB*/ 0);
+  }
+
+ private:
+  // These are dimensions of the original Tensors, and selected block sizes. The
+  // actual block sizes passed to all function above might be smaller because of
+  // the partial blocks at the end.
+  const StorageIndex m;
+  const StorageIndex k;
+  const StorageIndex n;
+  const StorageIndex bm;
+  const StorageIndex bk;
+  const StorageIndex bn;
+};
+
+}  // end namespace internal
+
+// Tensor contraction params that should enable to get from output matrix
+// 2-dimensional coordinates to the output tensor dimensions.
+struct TensorContractionParams {
+  // TensorContraction evaluator assumes that both tensors are in ColMajor
+  // layout, if tensors are in RowMajor evaluator swap lhs with rhs.
+  bool swapped_arguments;
+};
+
+// Output kernel allows to fuse operations into the tensor contraction.
+//
+// Examples:
+//   1. Elementwise Relu transformation following Conv2D.
+//   2. AddBias to the Conv2D output channels dimension.
+//
+// The NoOpOutputKernel implements an output kernel that does absolutely nothing.
+struct NoOpOutputKernel {
+  /**
+   * Tensor contraction evaluator calls this kernel after finishing each block
+   * of output matrix. Output blocks belong to the 2-dimensional output tensor.
+   *
+   * TensorContractionParams contains contraction dimensions information
+   * required to map output 2-d space into the expected output tensor space
+   * (potentially higher dimensional).
+   *
+   * \param[in] output_mapper Access to output tensor memory
+   * \param[in] params   Tensor contraction parameters
+   * \param[in] i        Index of a first row available through output_mapper
+   * \param[in] j        Index of a first column available through output_mapper
+   * \param[in] num_rows Number of available rows
+   * \param[in] num_cols Number of available columns
+   */
+  template <typename Index, typename Scalar>
+  EIGEN_ALWAYS_INLINE void operator()(const internal::blas_data_mapper<Scalar, Index, ColMajor>& output_mapper,
+                                      const TensorContractionParams& params, Index i, Index j, Index num_rows,
+                                      Index num_cols) const {
+    EIGEN_UNUSED_VARIABLE(output_mapper);
+    EIGEN_UNUSED_VARIABLE(params);
+    EIGEN_UNUSED_VARIABLE(i);
+    EIGEN_UNUSED_VARIABLE(j);
+    EIGEN_UNUSED_VARIABLE(num_rows);
+    EIGEN_UNUSED_VARIABLE(num_cols);
+  }
+};
+
+/** Tensor contraction class.
+ * \ingroup CXX11_Tensor_Module
+ */
+template <typename Indices, typename LhsXprType, typename RhsXprType,
+          typename OutputKernelType = const NoOpOutputKernel>
+class TensorContractionOp
+    : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType, OutputKernelType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
+  typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType,
+                                         typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionOp(const LhsXprType& lhs, const RhsXprType& rhs,
+                                                            const Indices& dims,
+                                                            const OutputKernelType& output_kernel = OutputKernelType())
+      : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_indices(dims), m_output_kernel(output_kernel) {}
+
+  EIGEN_DEVICE_FUNC const Indices& indices() const { return m_indices; }
+
+  /** \returns the nested expressions */
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename LhsXprType::Nested>& lhsExpression() const {
+    return m_lhs_xpr;
+  }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename RhsXprType::Nested>& rhsExpression() const {
+    return m_rhs_xpr;
+  }
+
+  EIGEN_DEVICE_FUNC const OutputKernelType& outputKernel() const { return m_output_kernel; }
+
+ protected:
+  typename LhsXprType::Nested m_lhs_xpr;
+  typename RhsXprType::Nested m_rhs_xpr;
+  const Indices m_indices;
+  const OutputKernelType m_output_kernel;
+};
+
+template <typename Derived>
+struct TensorContractionEvaluatorBase {
+  typedef typename internal::traits<Derived>::Indices Indices;
+  typedef typename internal::traits<Derived>::LeftArgType LeftArgType;
+  typedef typename internal::traits<Derived>::RightArgType RightArgType;
+  typedef typename internal::traits<Derived>::OutputKernelType OutputKernelType;
+  typedef typename internal::traits<Derived>::Device Device;
+
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+  typedef std::remove_const_t<typename XprType::Scalar> Scalar;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int Layout = TensorEvaluator<LeftArgType, Device>::Layout;
+  enum {
+    IsAligned = true,
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    CoordAccess = false,  // to be implemented
+    RawAccess = true
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  // Most of the code is assuming that both input tensors are ColMajor. If the
+  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+  // will pretend B is LHS and A is RHS.
+  typedef std::conditional_t<static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>
+      EvalLeftArgType;
+  typedef std::conditional_t<static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>
+      EvalRightArgType;
+
+  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluatorType;
+  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluatorType;
+
+  static constexpr int LDims =
+      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+  static constexpr int RDims =
+      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+  static constexpr int ContractDims = internal::array_size<Indices>::value;
+  static constexpr int NumDims = LDims + RDims - 2 * ContractDims;
+
+  typedef array<Index, ContractDims> contract_t;
+  typedef array<Index, LDims - ContractDims> left_nocontract_t;
+  typedef array<Index, RDims - ContractDims> right_nocontract_t;
+
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  EIGEN_STRONG_INLINE TensorContractionEvaluatorBase(const XprType& op, const Device& device)
+      : m_leftImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), op.lhsExpression(),
+                          op.rhsExpression()),
+                   device),
+        m_rightImpl(choose(Cond<static_cast<int>(Layout) == static_cast<int>(ColMajor)>(), op.rhsExpression(),
+                           op.lhsExpression()),
+                    device),
+        m_device(device),
+        m_output_kernel(op.outputKernel()),
+        m_result(NULL) {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
+                         static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout)),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    DSizes<Index, LDims> eval_left_dims;
+    DSizes<Index, RDims> eval_right_dims;
+    array<IndexPair<Index>, ContractDims> eval_op_indices;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      // For ColMajor, we keep using the existing dimensions
+      for (int i = 0; i < LDims; i++) {
+        eval_left_dims[i] = m_leftImpl.dimensions()[i];
+      }
+      for (int i = 0; i < RDims; i++) {
+        eval_right_dims[i] = m_rightImpl.dimensions()[i];
+      }
+      // We keep the pairs of contracting indices.
+      for (int i = 0; i < ContractDims; i++) {
+        eval_op_indices[i].first = op.indices()[i].first;
+        eval_op_indices[i].second = op.indices()[i].second;
+      }
+    } else {
+      // For RowMajor, we need to reverse the existing dimensions
+      for (int i = 0; i < LDims; i++) {
+        eval_left_dims[i] = m_leftImpl.dimensions()[LDims - i - 1];
+      }
+      for (int i = 0; i < RDims; i++) {
+        eval_right_dims[i] = m_rightImpl.dimensions()[RDims - i - 1];
+      }
+      // We need to flip all the pairs of contracting indices as well as
+      // reversing the dimensions.
+      for (int i = 0; i < ContractDims; i++) {
+        eval_op_indices[i].first = LDims - 1 - op.indices()[ContractDims - 1 - i].second;
+        eval_op_indices[i].second = RDims - 1 - op.indices()[ContractDims - 1 - i].first;
+      }
+    }
+
+    // Check for duplicate axes and make sure the first index in eval_op_indices
+    // is increasing. Using O(n^2) sorting is OK since ContractDims is small
+    for (int i = 0; i < ContractDims; i++) {
+      for (int j = i + 1; j < ContractDims; j++) {
+        eigen_assert(eval_op_indices[j].first != eval_op_indices[i].first &&
+                     eval_op_indices[j].second != eval_op_indices[i].second && "contraction axes should be unique");
+        if (eval_op_indices[j].first < eval_op_indices[i].first) {
+          numext::swap(eval_op_indices[j], eval_op_indices[i]);
+        }
+      }
+    }
+
+    array<Index, LDims> lhs_strides;
+    lhs_strides[0] = 1;
+    for (int i = 0; i < LDims - 1; ++i) {
+      lhs_strides[i + 1] = lhs_strides[i] * eval_left_dims[i];
+    }
+
+    array<Index, RDims> rhs_strides;
+    rhs_strides[0] = 1;
+    for (int i = 0; i < RDims - 1; ++i) {
+      rhs_strides[i + 1] = rhs_strides[i] * eval_right_dims[i];
+    }
+
+    if (m_i_strides.size() > 0) m_i_strides[0] = 1;
+    if (m_j_strides.size() > 0) m_j_strides[0] = 1;
+    if (m_k_strides.size() > 0) m_k_strides[0] = 1;
+
+    m_i_size = 1;
+    m_j_size = 1;
+    m_k_size = 1;
+
+    // To compute the dimension, we simply concatenate the non-contracting
+    // dimensions of the left and then the right tensor. Additionally, we also
+    // compute the strides corresponding to the left non-contracting
+    // dimensions and right non-contracting dimensions.
+    m_lhs_inner_dim_contiguous = true;
+    int dim_idx = 0;
+    Index nocontract_idx = 0;
+
+    for (int i = 0; i < LDims; i++) {
+      // find if we are contracting on index i of left tensor
+      bool contracting = false;
+      for (int j = 0; j < ContractDims; j++) {
+        if (eval_op_indices[j].first == i) {
+          contracting = true;
+          break;
+        }
+      }
+      if (!contracting) {
+        // add dimension size to output dimensions
+        m_dimensions[dim_idx] = eval_left_dims[i];
+        m_left_nocontract_strides[nocontract_idx] = lhs_strides[i];
+        if (dim_idx != i) {
+          m_lhs_inner_dim_contiguous = false;
+        }
+        if (nocontract_idx + 1 < internal::array_size<left_nocontract_t>::value) {
+          m_i_strides[nocontract_idx + 1] = m_i_strides[nocontract_idx] * eval_left_dims[i];
+        } else {
+          m_i_size = m_i_strides[nocontract_idx] * eval_left_dims[i];
+        }
+        dim_idx++;
+        nocontract_idx++;
+      }
+    }
+
+    nocontract_idx = 0;
+    for (int i = 0; i < RDims; i++) {
+      bool contracting = false;
+      // find if we are contracting on index i of right tensor
+      for (int j = 0; j < ContractDims; j++) {
+        if (eval_op_indices[j].second == i) {
+          contracting = true;
+          break;
+        }
+      }
+      if (!contracting) {
+        m_dimensions[dim_idx] = eval_right_dims[i];
+        if (nocontract_idx + 1 < internal::array_size<right_nocontract_t>::value) {
+          m_j_strides[nocontract_idx + 1] = m_j_strides[nocontract_idx] * eval_right_dims[i];
+        } else {
+          m_j_size = m_j_strides[nocontract_idx] * eval_right_dims[i];
+        }
+        m_right_nocontract_strides[nocontract_idx] = rhs_strides[i];
+        dim_idx++;
+        nocontract_idx++;
+      }
+    }
+
+    // Now compute the strides corresponding to the contracting dimensions. We
+    // assumed above that non-contracting axes are represented in the same order
+    // in the matrix as they are in the tensor. This is not the case for
+    // contracting axes. As the contracting axes must be of the same size in
+    // each tensor, we'll only look at the first tensor here.
+    m_rhs_inner_dim_contiguous = true;
+    m_rhs_inner_dim_reordered = false;
+    for (int i = 0; i < ContractDims; i++) {
+      Index left = eval_op_indices[i].first;
+      Index right = eval_op_indices[i].second;
+
+      Index size = eval_left_dims[left];
+      eigen_assert(size == eval_right_dims[right] && "Contraction axes must be same size");
+
+      if (i + 1 < static_cast<int>(internal::array_size<contract_t>::value)) {
+        m_k_strides[i + 1] = m_k_strides[i] * size;
+      } else {
+        m_k_size = m_k_strides[i] * size;
+      }
+      m_left_contracting_strides[i] = lhs_strides[left];
+      m_right_contracting_strides[i] = rhs_strides[right];
+
+      if (i > 0 && right < eval_op_indices[i - 1].second) {
+        m_rhs_inner_dim_reordered = true;
+      }
+      if (right != i) {
+        m_rhs_inner_dim_contiguous = false;
+      }
+    }
+
+    // If the layout is RowMajor, we need to reverse the m_dimensions
+    if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) {
+      for (int i = 0, j = NumDims - 1; i < j; i++, j--) {
+        numext::swap(m_dimensions[i], m_dimensions[j]);
+      }
+    }
+
+    // A set of parameters that will allow output kernel to get from output
+    // tensor dimensions (i, j) into the original tensor dimensions.
+    // TODO(ezhulenev): Add parameters required to infer output tensor index for
+    // more complex contractions than 2x2 on internal dimension.
+    m_tensor_contraction_params.swapped_arguments = static_cast<int>(Layout) == RowMajor;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    m_leftImpl.evalSubExprsIfNeeded(NULL);
+    m_rightImpl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      evalTo(data);
+      return false;
+    } else {
+      m_result = static_cast<EvaluatorPointerType>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+      evalTo(m_result);
+      return true;
+    }
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType dest, EvalSubExprsCallback done) {
+    m_leftImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) {
+      m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [this, done, dest](bool) {
+        if (dest) {
+          evalToAsync(dest, [done]() { done(false); });
+        } else {
+          m_result = static_cast<EvaluatorPointerType>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar)));
+          evalToAsync(m_result, [done]() { done(true); });
+        }
+      });
+    });
+  }
+#endif  // EIGEN_USE_THREADS
+
+#ifndef TENSOR_CONTRACTION_DISPATCH
+#define TENSOR_CONTRACTION_DISPATCH(METHOD, ALIGNMENT, ARGS) \
+  if (this->m_lhs_inner_dim_contiguous) {                    \
+    if (this->m_rhs_inner_dim_contiguous) {                  \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<true, true, true, ALIGNMENT> ARGS;            \
+      } else {                                               \
+        METHOD<true, true, false, ALIGNMENT> ARGS;           \
+      }                                                      \
+    } else {                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<true, false, true, ALIGNMENT> ARGS;           \
+      } else {                                               \
+        METHOD<true, false, false, ALIGNMENT> ARGS;          \
+      }                                                      \
+    }                                                        \
+  } else {                                                   \
+    if (this->m_rhs_inner_dim_contiguous) {                  \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<false, true, true, ALIGNMENT> ARGS;           \
+      } else {                                               \
+        METHOD<false, true, false, ALIGNMENT> ARGS;          \
+      }                                                      \
+    } else {                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                 \
+        METHOD<false, false, true, ALIGNMENT> ARGS;          \
+      } else {                                               \
+        METHOD<false, false, false, ALIGNMENT> ARGS;         \
+      }                                                      \
+    }                                                        \
+  }
+#endif
+
+#ifndef TENSOR_CONTRACTION_ASYNC_DISPATCH
+#define TENSOR_CONTRACTION_ASYNC_DISPATCH(METHOD, DONE, ALIGNMENT, ARGS, FN) \
+  if (this->m_lhs_inner_dim_contiguous) {                                    \
+    if (this->m_rhs_inner_dim_contiguous) {                                  \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<DONE, true, true, true, ALIGNMENT> ARGS)->FN;            \
+      } else {                                                               \
+        (new METHOD<DONE, true, true, false, ALIGNMENT> ARGS)->FN;           \
+      }                                                                      \
+    } else {                                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<DONE, true, false, true, ALIGNMENT> ARGS)->FN;           \
+      } else {                                                               \
+        (new METHOD<DONE, true, false, false, ALIGNMENT> ARGS)->FN;          \
+      }                                                                      \
+    }                                                                        \
+  } else {                                                                   \
+    if (this->m_rhs_inner_dim_contiguous) {                                  \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<DONE, false, true, true, ALIGNMENT> ARGS)->FN;           \
+      } else {                                                               \
+        (new METHOD<DONE, false, true, false, ALIGNMENT> ARGS)->FN;          \
+      }                                                                      \
+    } else {                                                                 \
+      if (this->m_rhs_inner_dim_reordered) {                                 \
+        (new METHOD<DONE, false, false, true, ALIGNMENT> ARGS)->FN;          \
+      } else {                                                               \
+        (new METHOD<DONE, false, false, false, ALIGNMENT> ARGS)->FN;         \
+      }                                                                      \
+    }                                                                        \
+  }
+#endif
+
+  EIGEN_DEVICE_FUNC void evalTo(Scalar* buffer) const {
+    static_cast<const Derived*>(this)->template evalProduct<Unaligned>(buffer);
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalToCallback>
+  void evalToAsync(Scalar* buffer, EvalToCallback done) const {
+    static_cast<const Derived*>(this)->template evalProductAsync<EvalToCallback, Unaligned>(buffer, std::move(done));
+  }
+#endif  // EIGEN_USE_THREADS
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  void evalProductSequential(Scalar* buffer) const {
+    if (this->m_j_size == 1) {
+      this->template evalGemv<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(
+          buffer);
+    } else {
+      this->template evalGemm<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment>(
+          buffer);
+    }
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+#if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+#endif
+      void
+      evalGemv(Scalar* buffer) const {
+    const Index rows = m_i_size;
+    const Index cols = m_k_size;
+
+    typedef std::remove_const_t<typename EvalLeftArgType::Scalar> LhsScalar;
+    typedef std::remove_const_t<typename EvalRightArgType::Scalar> RhsScalar;
+    typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+    typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+    const int lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
+    const int rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
+    const int lhs_alignment = LeftEvaluator::IsAligned ? Aligned : Unaligned;
+    const int rhs_alignment = RightEvaluator::IsAligned ? Aligned : Unaligned;
+    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
+                                                   contract_t, lhs_packet_size, lhs_inner_dim_contiguous, false,
+                                                   lhs_alignment>
+        LhsMapper;
+
+    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
+                                                   contract_t, rhs_packet_size, rhs_inner_dim_contiguous,
+                                                   rhs_inner_dim_reordered, rhs_alignment>
+        RhsMapper;
+
+    LhsMapper lhs(m_leftImpl, m_left_nocontract_strides, m_i_strides, m_left_contracting_strides, m_k_strides);
+    RhsMapper rhs(m_rightImpl, m_right_nocontract_strides, m_j_strides, m_right_contracting_strides, m_k_strides);
+
+    const Scalar alpha(1);
+    const Index resIncr(1);
+
+    // zero out the result buffer (which must be of size at least rows * sizeof(Scalar)
+    m_device.fill(buffer, buffer + rows, Scalar(0));
+
+    internal::general_matrix_vector_product<Index, LhsScalar, LhsMapper, ColMajor, false, RhsScalar, RhsMapper,
+                                            false>::run(rows, cols, lhs, rhs, buffer, resIncr, alpha);
+
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+    m_output_kernel(OutputMapper(buffer, rows), m_tensor_contraction_params, static_cast<Index>(0),
+                    static_cast<Index>(0), rows, static_cast<Index>(1));
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+#if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+#endif
+      void
+      evalGemm(Scalar* buffer) const {
+    // columns in left side, rows in right side
+    const Index k = this->m_k_size;
+    this->template evalGemmPartial<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered,
+                                   Alignment, true>(buffer, 0, k, 1);
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  EIGEN_DEVICE_FUNC void evalGemmPartialWithoutOutputKernel(Scalar* buffer, Index k_start, Index k_end,
+                                                            int num_threads) const {
+    evalGemmPartial<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Alignment,
+                    /*use_output_kernel*/ false>(buffer, k_start, k_end, num_threads);
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment,
+            bool use_output_kernel>
+  EIGEN_DEVICE_FUNC void evalGemmPartial(Scalar* buffer, Index k_start, Index k_end, int num_threads) const {
+    eigen_assert(k_end >= k_start && k_start >= 0 && k_end <= this->m_k_size);
+    // columns in slice on left side, rows on right side
+    const Index k_slice = k_end - k_start;
+
+    // rows in left side
+    const Index m = this->m_i_size;
+
+    // columns in right side
+    const Index n = this->m_j_size;
+
+    // define data mappers for Lhs and Rhs
+    typedef std::remove_const_t<typename EvalLeftArgType::Scalar> LhsScalar;
+    typedef std::remove_const_t<typename EvalRightArgType::Scalar> RhsScalar;
+
+    typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+    typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+    const int lhs_packet_size = internal::unpacket_traits<typename LeftEvaluator::PacketReturnType>::size;
+    const int rhs_packet_size = internal::unpacket_traits<typename RightEvaluator::PacketReturnType>::size;
+
+    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
+                                                   contract_t, lhs_packet_size, lhs_inner_dim_contiguous, false,
+                                                   Unaligned>
+        LhsMapper;
+
+    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
+                                                   contract_t, rhs_packet_size, rhs_inner_dim_contiguous,
+                                                   rhs_inner_dim_reordered, Unaligned>
+        RhsMapper;
+
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+    typedef internal::TensorContractionKernel<Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+        TensorContractionKernel;
+
+    // initialize data mappers
+    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+                  this->m_left_contracting_strides, this->m_k_strides);
+
+    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+                  this->m_right_contracting_strides, this->m_k_strides);
+
+    OutputMapper output(buffer, m);
+
+    // Sizes of the blocks to load in cache. See the Goto paper for details.
+    internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index, internal::ShardByCol> blocking(
+        k_slice, m, n, num_threads);
+    const Index kc = blocking.kc();
+    const Index mc = numext::mini(m, blocking.mc());
+    const Index nc = numext::mini(n, blocking.nc());
+
+    typedef typename TensorContractionKernel::LhsBlock LhsBlock;
+    typedef typename TensorContractionKernel::RhsBlock RhsBlock;
+
+    LhsBlock blockA;
+    RhsBlock blockB;
+
+    TensorContractionKernel kernel(m, k_slice, n, mc, kc, nc);
+
+    typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle;
+    const BlockMemHandle packed_mem = kernel.allocate(this->m_device, &blockA, &blockB);
+
+    // If a contraction kernel does not support beta, explicitly initialize
+    // output buffer with zeroes.
+    if (!TensorContractionKernel::HasBeta) {
+      this->m_device.fill(buffer, buffer + m * n, Scalar(0));
+    }
+
+    for (Index i2 = 0; i2 < m; i2 += mc) {
+      const Index actual_mc = numext::mini(i2 + mc, m) - i2;
+      for (Index k2 = k_start; k2 < k_end; k2 += kc) {
+        // make sure we don't overshoot right edge of left matrix, then pack vertical panel
+        const Index actual_kc = numext::mini(k2 + kc, k_end) - k2;
+        kernel.packLhs(&blockA, lhs.getSubMapper(i2, k2), actual_kc, actual_mc);
+
+        // If kernel supports beta, there is no need to initialize output
+        // buffer with zeroes.
+        const Scalar alpha = Scalar(1);
+        const Scalar beta = (TensorContractionKernel::HasBeta && k2 == k_start) ? Scalar(0) : Scalar(1);
+
+        // series of horizontal blocks
+        for (Index j2 = 0; j2 < n; j2 += nc) {
+          // make sure we don't overshoot right edge of right matrix, then pack block
+          const Index actual_nc = numext::mini(j2 + nc, n) - j2;
+          kernel.packRhs(&blockB, rhs.getSubMapper(k2, j2), actual_kc, actual_nc);
+
+          // call gebp (matrix kernel)
+          // The parameters here are copied from Eigen's GEMM implementation
+          const OutputMapper output_mapper = output.getSubMapper(i2, j2);
+          kernel.invoke(output_mapper, blockA, blockB, actual_mc, actual_kc, actual_nc, alpha, beta);
+
+          // We are done with this [i2, j2] output block.
+          if (use_output_kernel && k2 + kc >= k_end) {
+            m_output_kernel(output_mapper, m_tensor_contraction_params, i2, j2, actual_mc, actual_nc);
+          }
+        }
+      }
+    }
+
+    kernel.deallocate(this->m_device, packed_mem);
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_leftImpl.cleanup();
+    m_rightImpl.cleanup();
+
+    if (m_result != NULL) {
+      m_device.deallocate(m_result);
+      m_result = NULL;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_result[index]; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_result; }
+
+ protected:
+  Dimensions m_dimensions;
+
+  contract_t m_k_strides;
+  contract_t m_left_contracting_strides;
+  contract_t m_right_contracting_strides;
+
+  bool m_lhs_inner_dim_contiguous;
+  bool m_rhs_inner_dim_contiguous;
+  bool m_rhs_inner_dim_reordered;
+
+  left_nocontract_t m_i_strides;
+  right_nocontract_t m_j_strides;
+  left_nocontract_t m_left_nocontract_strides;
+  right_nocontract_t m_right_nocontract_strides;
+
+  Index m_i_size;
+  Index m_j_size;
+  Index m_k_size;
+
+  TensorContractionParams m_tensor_contraction_params;
+
+  TensorEvaluator<EvalLeftArgType, Device> m_leftImpl;
+  TensorEvaluator<EvalRightArgType, Device> m_rightImpl;
+  const Device EIGEN_DEVICE_REF m_device;
+  OutputKernelType m_output_kernel;
+  EvaluatorPointerType m_result;
+};
+
+// evaluator for default device
+template <typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType, typename Device>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device>
+    : public TensorContractionEvaluatorBase<
+          TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device>> {
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
+  typedef TensorContractionEvaluatorBase<Self> Base;
+
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+  typedef std::remove_const_t<typename XprType::Scalar> Scalar;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+  static constexpr int Layout = TensorEvaluator<LeftArgType, Device>::Layout;
+
+  // Most of the code is assuming that both input tensors are ColMajor. If the
+  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+  // will pretend B is LHS and A is RHS.
+  typedef std::conditional_t<Layout == static_cast<int>(ColMajor), LeftArgType, RightArgType> EvalLeftArgType;
+  typedef std::conditional_t<Layout == static_cast<int>(ColMajor), RightArgType, LeftArgType> EvalRightArgType;
+
+  static constexpr int LDims =
+      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+  static constexpr int RDims =
+      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+  static constexpr int ContractDims = internal::array_size<Indices>::value;
+
+  typedef array<Index, ContractDims> contract_t;
+  typedef array<Index, LDims - ContractDims> left_nocontract_t;
+  typedef array<Index, RDims - ContractDims> right_nocontract_t;
+
+  static constexpr int NumDims = LDims + RDims - 2 * ContractDims;
+
+  // Could we use NumDimensions here?
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {}
+
+  template <int Alignment>
+  void evalProduct(Scalar* buffer) const {
+    TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Alignment, (buffer));
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
new file mode 100644
index 00000000..7fbe30a9
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionBlocking.h
@@ -0,0 +1,69 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+enum { ShardByRow = 0, ShardByCol = 1 };
+
+// Default Blocking Strategy
+template <typename ResScalar, typename LhsScalar, typename RhsScalar, typename StorageIndex,
+          int ShardingType = ShardByCol>
+class TensorContractionBlocking {
+ public:
+  /*
+    adding EIGEN_DEVICE_FUNC unconditionally to 'TensorContractionBlocking' constructor in `TensorContractionBlocking.h`
+      requires adding EIGEN_DEVICE_FUNC to `computeProductBlockingSizes` in `GeneralBlockPanelKernel.h`
+      which in turn, requires adding EIGEN_DEVICE_FUNC to `evaluateProductBlockingSizesHeuristic` in
+    `GeneralBlockPanelKernel.h` which in turn, requires adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in
+    `GeneralBlockPanelKernel.h` (else HIPCC will error out)
+
+    However adding EIGEN_DEVICE_FUNC to `manage_caching_sizes` in `GeneralBlockPanelKernel.h`
+    results in NVCC erroring out with the following error
+
+    ../Eigen/src/Core/products/GeneralBlockPanelKernel.h(57): error #2901:
+       dynamic initialization is not supported for function-scope static variables within a __device__/__global__
+    function
+  */
+
+#if !defined(EIGEN_HIPCC)
+  EIGEN_DEVICE_FUNC
+#endif
+  TensorContractionBlocking(StorageIndex k, StorageIndex m, StorageIndex n, StorageIndex num_threads = 1)
+      : kc_(k), mc_(m), nc_(n) {
+    if (ShardingType == ShardByCol) {
+      computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, mc_, nc_, num_threads);
+    } else {
+      computeProductBlockingSizes<LhsScalar, RhsScalar, 1>(kc_, nc_, mc_, num_threads);
+    }
+
+    const int rhs_packet_size = internal::packet_traits<RhsScalar>::size;
+    kc_ = (rhs_packet_size <= 8 || kc_ <= rhs_packet_size) ? kc_ : (kc_ / rhs_packet_size) * rhs_packet_size;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex kc() const { return kc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex mc() const { return mc_; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE StorageIndex nc() const { return nc_; }
+
+ private:
+  StorageIndex kc_;
+  StorageIndex mc_;
+  StorageIndex nc_;
+};
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_BLOCKING_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
new file mode 100644
index 00000000..dbea8aa9
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -0,0 +1,7 @@
+
+#if defined(__clang__) || defined(__GNUC__)
+#warning \
+    "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorContractionGpu.h file"
+#endif
+
+#include "TensorContractionGpu.h"
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
new file mode 100644
index 00000000..780e8961
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionGpu.h
@@ -0,0 +1,1387 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2015 Navdeep Jaitly <ndjaitly@google.com>
+// Copyright (C) 2014 Eric Martin <eric@ericmart.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
+
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+template <typename Scalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper,
+          bool needs_edge_check>
+__device__ EIGEN_STRONG_INLINE void EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+                                                                   const OutputMapper output, Scalar* lhs_shmem,
+                                                                   Scalar* rhs_shmem, const Index m_size,
+                                                                   const Index n_size, const Index k_size) {
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 64 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  // declare and initialize 64 registers for output 8x8 block
+
+  // prefetch registers
+  Scalar lhs_pf0;
+  Scalar lhs_pf1;
+  Scalar lhs_pf2;
+  Scalar lhs_pf3;
+  Scalar lhs_pf4;
+  Scalar lhs_pf5;
+  Scalar lhs_pf6;
+  Scalar lhs_pf7;
+
+  Scalar rhs_pf0;
+  Scalar rhs_pf1;
+  Scalar rhs_pf2;
+  Scalar rhs_pf3;
+  Scalar rhs_pf4;
+  Scalar rhs_pf5;
+  Scalar rhs_pf6;
+  Scalar rhs_pf7;
+
+  // shared memory is formatted
+  // (contract idx in block, nocontract idx in block, block idx)
+  // where block idx is column major. This transposition limits the number of
+  // bank conflicts when reading the LHS. The core idea is that since the contracting
+  // index is shared by both sides, then the contracting index should be in threadIdx.x.
+
+  // On the LHS, we pad each row inside of each block with an extra element. This makes
+  // each block 8 rows of 9 elements, which is 72 elements. This gives no bank conflicts
+  // on writes and very few 2-way conflicts on reads. There is an 8x8 grid of these blocks.
+
+  // On the RHS we just add 8 padding elements to the end of each block. This gives no bank
+  // conflicts on writes and also none on reads.
+
+  // storage indices
+  const Index lhs_store_idx_base = threadIdx.y * 72 + threadIdx.x * 9 + threadIdx.z;
+  const Index rhs_store_idx_base = threadIdx.y * 72 + threadIdx.z * 8 + threadIdx.x;
+
+  const Index lhs_store_idx_0 = lhs_store_idx_base + 576 * 0;
+  const Index lhs_store_idx_1 = lhs_store_idx_base + 576 * 1;
+  const Index lhs_store_idx_2 = lhs_store_idx_base + 576 * 2;
+  const Index lhs_store_idx_3 = lhs_store_idx_base + 576 * 3;
+  const Index lhs_store_idx_4 = lhs_store_idx_base + 576 * 4;
+  const Index lhs_store_idx_5 = lhs_store_idx_base + 576 * 5;
+  const Index lhs_store_idx_6 = lhs_store_idx_base + 576 * 6;
+  const Index lhs_store_idx_7 = lhs_store_idx_base + 576 * 7;
+
+  const Index rhs_store_idx_0 = rhs_store_idx_base + 576 * 0;
+  const Index rhs_store_idx_1 = rhs_store_idx_base + 576 * 1;
+  const Index rhs_store_idx_2 = rhs_store_idx_base + 576 * 2;
+  const Index rhs_store_idx_3 = rhs_store_idx_base + 576 * 3;
+  const Index rhs_store_idx_4 = rhs_store_idx_base + 576 * 4;
+  const Index rhs_store_idx_5 = rhs_store_idx_base + 576 * 5;
+  const Index rhs_store_idx_6 = rhs_store_idx_base + 576 * 6;
+  const Index rhs_store_idx_7 = rhs_store_idx_base + 576 * 7;
+
+  // in the loading code, the following variables are important:
+  // threadIdx.x: the vertical position in an 8x8 block
+  // threadIdx.y: the vertical index of the 8x8 block in the grid
+  // threadIdx.z: the horizontal position in an 8x8 block
+  // k: the horizontal index of the 8x8 block in the grid
+  //
+  // The k parameter is implicit (it was the loop counter for a loop that went
+  // from 0 to <8, but now that loop is unrolled in the below code.
+
+  const Index load_idx_vert = threadIdx.x + 8 * threadIdx.y;
+  const Index lhs_vert = base_m + load_idx_vert;
+
+#define prefetchIntoRegisters(base_k)                         \
+  {                                                           \
+    lhs_pf0 = conv(0);                                        \
+    lhs_pf1 = conv(0);                                        \
+    lhs_pf2 = conv(0);                                        \
+    lhs_pf3 = conv(0);                                        \
+    lhs_pf4 = conv(0);                                        \
+    lhs_pf5 = conv(0);                                        \
+    lhs_pf6 = conv(0);                                        \
+    lhs_pf7 = conv(0);                                        \
+                                                              \
+    rhs_pf0 = conv(0);                                        \
+    rhs_pf1 = conv(0);                                        \
+    rhs_pf2 = conv(0);                                        \
+    rhs_pf3 = conv(0);                                        \
+    rhs_pf4 = conv(0);                                        \
+    rhs_pf5 = conv(0);                                        \
+    rhs_pf6 = conv(0);                                        \
+    rhs_pf7 = conv(0);                                        \
+                                                              \
+    if (!needs_edge_check || lhs_vert < m_size) {             \
+      const Index lhs_horiz_0 = base_k + threadIdx.z + 0 * 8; \
+      const Index lhs_horiz_1 = base_k + threadIdx.z + 1 * 8; \
+      const Index lhs_horiz_2 = base_k + threadIdx.z + 2 * 8; \
+      const Index lhs_horiz_3 = base_k + threadIdx.z + 3 * 8; \
+      const Index lhs_horiz_4 = base_k + threadIdx.z + 4 * 8; \
+      const Index lhs_horiz_5 = base_k + threadIdx.z + 5 * 8; \
+      const Index lhs_horiz_6 = base_k + threadIdx.z + 6 * 8; \
+      const Index lhs_horiz_7 = base_k + threadIdx.z + 7 * 8; \
+                                                              \
+      if (!needs_edge_check || lhs_horiz_7 < k_size) {        \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                 \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                 \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                 \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                 \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                 \
+        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                 \
+        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                 \
+        lhs_pf7 = lhs(lhs_vert, lhs_horiz_7);                 \
+      } else if (lhs_horiz_6 < k_size) {                      \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                 \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                 \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                 \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                 \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                 \
+        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                 \
+        lhs_pf6 = lhs(lhs_vert, lhs_horiz_6);                 \
+      } else if (lhs_horiz_5 < k_size) {                      \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                 \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                 \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                 \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                 \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                 \
+        lhs_pf5 = lhs(lhs_vert, lhs_horiz_5);                 \
+      } else if (lhs_horiz_4 < k_size) {                      \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                 \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                 \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                 \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                 \
+        lhs_pf4 = lhs(lhs_vert, lhs_horiz_4);                 \
+      } else if (lhs_horiz_3 < k_size) {                      \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                 \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                 \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                 \
+        lhs_pf3 = lhs(lhs_vert, lhs_horiz_3);                 \
+      } else if (lhs_horiz_2 < k_size) {                      \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                 \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                 \
+        lhs_pf2 = lhs(lhs_vert, lhs_horiz_2);                 \
+      } else if (lhs_horiz_1 < k_size) {                      \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                 \
+        lhs_pf1 = lhs(lhs_vert, lhs_horiz_1);                 \
+      } else if (lhs_horiz_0 < k_size) {                      \
+        lhs_pf0 = lhs(lhs_vert, lhs_horiz_0);                 \
+      }                                                       \
+    }                                                         \
+                                                              \
+    const Index rhs_vert = base_k + load_idx_vert;            \
+    if (!needs_edge_check || rhs_vert < k_size) {             \
+      const Index rhs_horiz_0 = base_n + threadIdx.z + 0 * 8; \
+      const Index rhs_horiz_1 = base_n + threadIdx.z + 1 * 8; \
+      const Index rhs_horiz_2 = base_n + threadIdx.z + 2 * 8; \
+      const Index rhs_horiz_3 = base_n + threadIdx.z + 3 * 8; \
+      const Index rhs_horiz_4 = base_n + threadIdx.z + 4 * 8; \
+      const Index rhs_horiz_5 = base_n + threadIdx.z + 5 * 8; \
+      const Index rhs_horiz_6 = base_n + threadIdx.z + 6 * 8; \
+      const Index rhs_horiz_7 = base_n + threadIdx.z + 7 * 8; \
+                                                              \
+      if (rhs_horiz_7 < n_size) {                             \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                 \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                 \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                 \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                 \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                 \
+        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                 \
+        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                 \
+        rhs_pf7 = rhs(rhs_vert, rhs_horiz_7);                 \
+      } else if (rhs_horiz_6 < n_size) {                      \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                 \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                 \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                 \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                 \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                 \
+        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                 \
+        rhs_pf6 = rhs(rhs_vert, rhs_horiz_6);                 \
+      } else if (rhs_horiz_5 < n_size) {                      \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                 \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                 \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                 \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                 \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                 \
+        rhs_pf5 = rhs(rhs_vert, rhs_horiz_5);                 \
+      } else if (rhs_horiz_4 < n_size) {                      \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                 \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                 \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                 \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                 \
+        rhs_pf4 = rhs(rhs_vert, rhs_horiz_4);                 \
+      } else if (rhs_horiz_3 < n_size) {                      \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                 \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                 \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                 \
+        rhs_pf3 = rhs(rhs_vert, rhs_horiz_3);                 \
+      } else if (rhs_horiz_2 < n_size) {                      \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                 \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                 \
+        rhs_pf2 = rhs(rhs_vert, rhs_horiz_2);                 \
+      } else if (rhs_horiz_1 < n_size) {                      \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                 \
+        rhs_pf1 = rhs(rhs_vert, rhs_horiz_1);                 \
+      } else if (rhs_horiz_0 < n_size) {                      \
+        rhs_pf0 = rhs(rhs_vert, rhs_horiz_0);                 \
+      }                                                       \
+    }                                                         \
+  }
+
+#define writeRegToShmem()               \
+  lhs_shmem[lhs_store_idx_0] = lhs_pf0; \
+  rhs_shmem[rhs_store_idx_0] = rhs_pf0; \
+                                        \
+  lhs_shmem[lhs_store_idx_1] = lhs_pf1; \
+  rhs_shmem[rhs_store_idx_1] = rhs_pf1; \
+                                        \
+  lhs_shmem[lhs_store_idx_2] = lhs_pf2; \
+  rhs_shmem[rhs_store_idx_2] = rhs_pf2; \
+                                        \
+  lhs_shmem[lhs_store_idx_3] = lhs_pf3; \
+  rhs_shmem[rhs_store_idx_3] = rhs_pf3; \
+                                        \
+  lhs_shmem[lhs_store_idx_4] = lhs_pf4; \
+  rhs_shmem[rhs_store_idx_4] = rhs_pf4; \
+                                        \
+  lhs_shmem[lhs_store_idx_5] = lhs_pf5; \
+  rhs_shmem[rhs_store_idx_5] = rhs_pf5; \
+                                        \
+  lhs_shmem[lhs_store_idx_6] = lhs_pf6; \
+  rhs_shmem[rhs_store_idx_6] = rhs_pf6; \
+                                        \
+  lhs_shmem[lhs_store_idx_7] = lhs_pf7; \
+  rhs_shmem[rhs_store_idx_7] = rhs_pf7;
+
+  // declare and initialize result array
+#define res(i, j) _res_##i##j
+#define initResultRow(i)      \
+  Scalar res(i, 0) = conv(0); \
+  Scalar res(i, 1) = conv(0); \
+  Scalar res(i, 2) = conv(0); \
+  Scalar res(i, 3) = conv(0); \
+  Scalar res(i, 4) = conv(0); \
+  Scalar res(i, 5) = conv(0); \
+  Scalar res(i, 6) = conv(0); \
+  Scalar res(i, 7) = conv(0);
+
+  internal::scalar_cast_op<int, Scalar> conv;
+  initResultRow(0);
+  initResultRow(1);
+  initResultRow(2);
+  initResultRow(3);
+  initResultRow(4);
+  initResultRow(5);
+  initResultRow(6);
+  initResultRow(7);
+#undef initResultRow
+
+  for (Index base_k = 0; base_k < k_size; base_k += 64) {
+    // wait for previous iteration to finish with shmem. Despite common sense,
+    // the code is a bit faster with this here then at bottom of loop
+    __syncthreads();
+
+    prefetchIntoRegisters(base_k);
+    writeRegToShmem();
+
+#undef prefetchIntoRegisters
+#undef writeRegToShmem
+
+    // wait for shared mem packing to be done before starting computation
+    __syncthreads();
+
+    // compute 8x8 matrix product by outer product. This involves packing one column
+    // of LHS and one row of RHS into registers (takes 16 registers).
+
+#define lcol(i) _lcol##i
+    Scalar lcol(0);
+    Scalar lcol(1);
+    Scalar lcol(2);
+    Scalar lcol(3);
+    Scalar lcol(4);
+    Scalar lcol(5);
+    Scalar lcol(6);
+    Scalar lcol(7);
+
+#define rrow(j) _rrow##j
+    Scalar rrow(0);
+    Scalar rrow(1);
+    Scalar rrow(2);
+    Scalar rrow(3);
+    Scalar rrow(4);
+    Scalar rrow(5);
+    Scalar rrow(6);
+    Scalar rrow(7);
+
+    // Now x corresponds to k, y to m, and z to n
+    const Scalar* lhs_block = &lhs_shmem[threadIdx.x + 9 * threadIdx.y];
+    const Scalar* rhs_block = &rhs_shmem[threadIdx.x + 8 * threadIdx.z];
+
+#define lhs_element(i, j) lhs_block[72 * ((i) + 8 * (j))]
+#define rhs_element(i, j) rhs_block[72 * ((i) + 8 * (j))]
+
+#define loadData(i, j)         \
+  lcol(0) = lhs_element(0, j); \
+  rrow(0) = rhs_element(i, 0); \
+  lcol(1) = lhs_element(1, j); \
+  rrow(1) = rhs_element(i, 1); \
+  lcol(2) = lhs_element(2, j); \
+  rrow(2) = rhs_element(i, 2); \
+  lcol(3) = lhs_element(3, j); \
+  rrow(3) = rhs_element(i, 3); \
+  lcol(4) = lhs_element(4, j); \
+  rrow(4) = rhs_element(i, 4); \
+  lcol(5) = lhs_element(5, j); \
+  rrow(5) = rhs_element(i, 5); \
+  lcol(6) = lhs_element(6, j); \
+  rrow(6) = rhs_element(i, 6); \
+  lcol(7) = lhs_element(7, j); \
+  rrow(7) = rhs_element(i, 7);
+
+#define computeCol(j)             \
+  res(0, j) += lcol(0) * rrow(j); \
+  res(1, j) += lcol(1) * rrow(j); \
+  res(2, j) += lcol(2) * rrow(j); \
+  res(3, j) += lcol(3) * rrow(j); \
+  res(4, j) += lcol(4) * rrow(j); \
+  res(5, j) += lcol(5) * rrow(j); \
+  res(6, j) += lcol(6) * rrow(j); \
+  res(7, j) += lcol(7) * rrow(j);
+
+#define computePass(i) \
+  loadData(i, i);      \
+                       \
+  computeCol(0);       \
+  computeCol(1);       \
+  computeCol(2);       \
+  computeCol(3);       \
+  computeCol(4);       \
+  computeCol(5);       \
+  computeCol(6);       \
+  computeCol(7);
+
+    computePass(0);
+    computePass(1);
+    computePass(2);
+    computePass(3);
+    computePass(4);
+    computePass(5);
+    computePass(6);
+    computePass(7);
+
+#undef lcol
+#undef rrow
+#undef lhs_element
+#undef rhs_element
+#undef loadData
+#undef computeCol
+#undef computePass
+  }  // end loop over k
+
+  // we've now iterated over all of the large (ie width 64) k blocks and
+  // accumulated results in registers. At this point thread (x, y, z) contains
+  // the sum across all big k blocks of the product of little k block of index (x, y)
+  // with block of index (y, z). To compute the final output, we need to reduce
+  // the 8 threads over y by summation.
+#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor(res(i, j), mask)
+#else
+#define shuffleInc(i, j, mask) res(i, j) += __shfl_xor_sync(0xFFFFFFFF, res(i, j), mask)
+#endif
+
+#define reduceRow(i, mask) \
+  shuffleInc(i, 0, mask);  \
+  shuffleInc(i, 1, mask);  \
+  shuffleInc(i, 2, mask);  \
+  shuffleInc(i, 3, mask);  \
+  shuffleInc(i, 4, mask);  \
+  shuffleInc(i, 5, mask);  \
+  shuffleInc(i, 6, mask);  \
+  shuffleInc(i, 7, mask);
+
+#define reduceMatrix(mask) \
+  reduceRow(0, mask);      \
+  reduceRow(1, mask);      \
+  reduceRow(2, mask);      \
+  reduceRow(3, mask);      \
+  reduceRow(4, mask);      \
+  reduceRow(5, mask);      \
+  reduceRow(6, mask);      \
+  reduceRow(7, mask);
+
+  // actually perform the reduction, now each thread of index (_, y, z)
+  // contains the correct values in its registers that belong in the output
+  // block
+  reduceMatrix(1);
+  reduceMatrix(2);
+  reduceMatrix(4);
+
+#undef shuffleInc
+#undef reduceRow
+#undef reduceMatrix
+
+  // now we need to copy the 64 values into main memory. We can't split work
+  // among threads because all variables are in registers. There's 2 ways
+  // to do this:
+  // (1) have 1 thread do 64 writes from registers into global memory
+  // (2) have 1 thread do 64 writes into shared memory, and then 8 threads
+  //     each do 8 writes into global memory. We can just overwrite the shared
+  //     memory from the problem we just solved.
+  // (2) is slightly faster than (1) due to less branching and more ILP
+
+  // TODO: won't yield much gain, but could just use currently unused shared mem
+  //       and then we won't have to sync
+  // wait for shared mem to be out of use
+  __syncthreads();
+
+#define writeResultShmem(i, j) lhs_shmem[i + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j] = res(i, j);
+
+#define writeRow(i)       \
+  writeResultShmem(i, 0); \
+  writeResultShmem(i, 1); \
+  writeResultShmem(i, 2); \
+  writeResultShmem(i, 3); \
+  writeResultShmem(i, 4); \
+  writeResultShmem(i, 5); \
+  writeResultShmem(i, 6); \
+  writeResultShmem(i, 7);
+
+  if (threadIdx.x == 0) {
+    writeRow(0);
+    writeRow(1);
+    writeRow(2);
+    writeRow(3);
+    writeRow(4);
+    writeRow(5);
+    writeRow(6);
+    writeRow(7);
+  }
+#undef writeResultShmem
+#undef writeRow
+
+  const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
+  const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
+
+  if (threadIdx.x < max_i_write) {
+    if (max_j_write == 8) {
+      // TODO: can i trade bank conflicts for coalesced writes?
+      Scalar val0 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 0];
+      Scalar val1 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 1];
+      Scalar val2 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 2];
+      Scalar val3 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 3];
+      Scalar val4 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 4];
+      Scalar val5 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 5];
+      Scalar val6 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 6];
+      Scalar val7 = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * 7];
+
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 0) = val0;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 1) = val1;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 2) = val2;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 3) = val3;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 4) = val4;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 5) = val5;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 6) = val6;
+      output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * 7) = val7;
+    } else {
+#pragma unroll 7
+      for (int j = 0; j < max_j_write; j++) {
+        Scalar val = lhs_shmem[threadIdx.x + 8 * threadIdx.y + 64 * threadIdx.z + 512 * j];
+        output(base_m + threadIdx.y + 8 * threadIdx.x, base_n + threadIdx.z + 8 * j) = val;
+      }
+    }
+  }
+#undef res
+}
+
+template <typename Scalar, typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper>
+__global__ void
+#if defined(EIGEN_HIPCC)
+__launch_bounds__(512, 1)
+#else
+__launch_bounds__(512)
+#endif
+    EigenContractionKernel(const LhsMapper lhs, const RhsMapper rhs, const OutputMapper output, const Index m_size,
+                           const Index n_size, const Index k_size) {
+  __shared__ Scalar lhs_shmem[72 * 64];
+  __shared__ Scalar rhs_shmem[72 * 64];
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 64 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  if (base_m + 63 < m_size && base_n + 63 < n_size) {
+    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, false>(
+        lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+  } else {
+    EigenContractionKernelInternal<Scalar, Index, LhsMapper, RhsMapper, OutputMapper, true>(
+        lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size);
+  }
+}
+
+template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+          bool CHECK_RHS_BOUNDARY>
+__device__ __forceinline__ void EigenFloatContractionKernelInternal16x16(const LhsMapper lhs, const RhsMapper rhs,
+                                                                         const OutputMapper output,
+                                                                         float2 lhs_shmem2[][16],
+                                                                         float2 rhs_shmem2[][8], const Index m_size,
+                                                                         const Index n_size, const Index k_size,
+                                                                         const Index base_m, const Index base_n) {
+  // prefetch registers
+  float4 lhs_pf0, rhs_pf0;
+
+  float4 results[4];
+  for (int i = 0; i < 4; i++) {
+    results[i].x = results[i].y = results[i].z = results[i].w = 0;
+  }
+
+#define prefetch_lhs(reg, row, col)                                 \
+  if (!CHECK_LHS_BOUNDARY) {                                        \
+    if (col < k_size) {                                             \
+      reg = lhs.template loadPacket<float4, Unaligned>(row, col);   \
+    }                                                               \
+  } else {                                                          \
+    if (col < k_size) {                                             \
+      if (row + 3 < m_size) {                                       \
+        reg = lhs.template loadPacket<float4, Unaligned>(row, col); \
+      } else if (row + 2 < m_size) {                                \
+        reg.x = lhs(row + 0, col);                                  \
+        reg.y = lhs(row + 1, col);                                  \
+        reg.z = lhs(row + 2, col);                                  \
+      } else if (row + 1 < m_size) {                                \
+        reg.x = lhs(row + 0, col);                                  \
+        reg.y = lhs(row + 1, col);                                  \
+      } else if (row < m_size) {                                    \
+        reg.x = lhs(row + 0, col);                                  \
+      }                                                             \
+    }                                                               \
+  }
+
+  Index lhs_vert = base_m + threadIdx.x * 4;
+
+  for (Index k = 0; k < k_size; k += 16) {
+    lhs_pf0 = internal::pset1<float4>(0);
+    rhs_pf0 = internal::pset1<float4>(0);
+
+    Index lhs_horiz = threadIdx.y + k;
+    prefetch_lhs(lhs_pf0, lhs_vert, lhs_horiz)
+
+        Index rhs_vert = k + (threadIdx.x % 4) * 4;
+    Index rhs_horiz0 = (threadIdx.x >> 2) + threadIdx.y * 4 + base_n;
+
+    if (!CHECK_RHS_BOUNDARY) {
+      if ((rhs_vert + 3) < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0 = rhs.template loadPacket<float4, Unaligned>(rhs_vert, rhs_horiz0);
+      } else if (rhs_vert + 2 < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+      } else if (rhs_vert + 1 < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+      } else if (rhs_vert < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+      }
+    } else {
+      if (rhs_horiz0 < n_size) {
+        if ((rhs_vert + 3) < k_size) {
+          rhs_pf0 = rhs.template loadPacket<float4, Unaligned>(rhs_vert, rhs_horiz0);
+        } else if ((rhs_vert + 2) < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+        } else if ((rhs_vert + 1) < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        } else if (rhs_vert < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        }
+      }
+    }
+    float x1, x2;
+    // the following can be a bitwise operation..... some day.
+    if ((threadIdx.x % 8) < 4) {
+      x1 = rhs_pf0.y;
+      x2 = rhs_pf0.w;
+    } else {
+      x1 = rhs_pf0.x;
+      x2 = rhs_pf0.z;
+    }
+#if defined(EIGEN_HIPCC) || (defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000)
+    x1 = __shfl_xor(x1, 4);
+    x2 = __shfl_xor(x2, 4);
+#else
+    x1 = __shfl_xor_sync(0xFFFFFFFF, x1, 4);
+    x2 = __shfl_xor_sync(0xFFFFFFFF, x2, 4);
+#endif
+    if ((threadIdx.x % 8) < 4) {
+      rhs_pf0.y = x1;
+      rhs_pf0.w = x2;
+    } else {
+      rhs_pf0.x = x1;
+      rhs_pf0.z = x2;
+    }
+
+    // We have 64 features.
+    // Row 0 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 0, 1.
+    // Row 1 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 2, 3.
+    // ...
+    // Row 31 -> times (0, 4, 8, 12, 1, 5, 9, 13) for features 62, 63
+    // Row 32 -> times (2, 6, 10, 14, 3, 7, 11, 15) for features 0, 1
+    // ...
+    rhs_shmem2[(threadIdx.x >> 3) + threadIdx.y * 2][threadIdx.x % 8] = make_float2(rhs_pf0.x, rhs_pf0.y);
+    rhs_shmem2[(threadIdx.x >> 3) + threadIdx.y * 2 + 32][threadIdx.x % 8] = make_float2(rhs_pf0.z, rhs_pf0.w);
+
+    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
+    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
+    // ...
+    // Row 15 (time 15) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61)
+    // Row 16 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63)
+    // ...
+
+    lhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(lhs_pf0.x, lhs_pf0.y);
+    lhs_shmem2[threadIdx.y + 16][threadIdx.x] = make_float2(lhs_pf0.z, lhs_pf0.w);
+
+#define add_vals(fl1, fl2, fr1, fr2) \
+  results[0].x += fl1.x * fr1.x;     \
+  results[0].y += fl1.y * fr1.x;     \
+  results[0].z += fl2.x * fr1.x;     \
+  results[0].w += fl2.y * fr1.x;     \
+                                     \
+  results[1].x += fl1.x * fr1.y;     \
+  results[1].y += fl1.y * fr1.y;     \
+  results[1].z += fl2.x * fr1.y;     \
+  results[1].w += fl2.y * fr1.y;     \
+                                     \
+  results[2].x += fl1.x * fr2.x;     \
+  results[2].y += fl1.y * fr2.x;     \
+  results[2].z += fl2.x * fr2.x;     \
+  results[2].w += fl2.y * fr2.x;     \
+                                     \
+  results[3].x += fl1.x * fr2.y;     \
+  results[3].y += fl1.y * fr2.y;     \
+  results[3].z += fl2.x * fr2.y;     \
+  results[3].w += fl2.y * fr2.y;
+
+    __syncthreads();
+
+// Do the multiplies.
+#pragma unroll
+    for (int koff = 0; koff < 16; koff++) {
+      // 32 x threads.
+      float2 fl1 = lhs_shmem2[koff][threadIdx.x];
+      float2 fl2 = lhs_shmem2[koff + 16][threadIdx.x];
+
+      int start_feature = threadIdx.y * 4;
+      float2 fr1 = rhs_shmem2[(start_feature >> 1) + 32 * ((koff % 4) / 2)][koff / 4 + (koff % 2) * 4];
+      float2 fr2 = rhs_shmem2[(start_feature >> 1) + 1 + 32 * ((koff % 4) / 2)][koff / 4 + (koff % 2) * 4];
+
+      add_vals(fl1, fl2, fr1, fr2)
+    }
+    __syncthreads();
+  }
+
+#undef prefetch_lhs
+#undef add_vals
+
+  Index horiz_base = threadIdx.y * 4 + base_n;
+  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+    for (int i = 0; i < 4; i++) {
+      output(lhs_vert, horiz_base + i) = results[i].x;
+      output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      output(lhs_vert + 3, horiz_base + i) = results[i].w;
+    }
+  } else if (!CHECK_RHS_BOUNDARY) {
+    // CHECK LHS
+    if (lhs_vert + 3 < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    } else if (lhs_vert + 2 < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      }
+    } else if (lhs_vert + 1 < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      }
+    } else if (lhs_vert < m_size) {
+      for (int i = 0; i < 4; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+      }
+    }
+  } else if (!CHECK_LHS_BOUNDARY) {
+    // CHECK RHS
+    /*
+    int ncols_rem = fminf(n_size- horiz_base, 4);
+    for (int i = 0; i < ncols_rem; i++) {
+      output(lhs_vert, horiz_base + i) = results[i].x;
+      output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      output(lhs_vert + 3, horiz_base + i) = results[i].w;
+    }*/
+    for (int i = 0; i < 4; i++) {
+      if (horiz_base + i < n_size) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    }
+  } else {
+    // CHECK both boundaries.
+    for (int i = 0; i < 4; i++) {
+      if (horiz_base + i < n_size) {
+        if (lhs_vert < m_size) output(lhs_vert, horiz_base + i) = results[i].x;
+        if (lhs_vert + 1 < m_size) output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        if (lhs_vert + 2 < m_size) output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        if (lhs_vert + 3 < m_size) output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    }
+  }
+}
+
+template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper, bool CHECK_LHS_BOUNDARY,
+          bool CHECK_RHS_BOUNDARY>
+__device__ __forceinline__ void EigenFloatContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
+                                                                    const OutputMapper output, float2 lhs_shmem2[][32],
+                                                                    float2 rhs_shmem2[][8], const Index m_size,
+                                                                    const Index n_size, const Index k_size,
+                                                                    const Index base_m, const Index base_n) {
+  // prefetch registers
+  float4 lhs_pf0, lhs_pf1, lhs_pf2, lhs_pf3;
+  float4 rhs_pf0, rhs_pf1;
+
+  float4 results[8];
+  for (int i = 0; i < 8; i++) {
+    results[i].x = results[i].y = results[i].z = results[i].w = 0;
+  }
+
+  Index lhs_vert = base_m + threadIdx.x * 4 + (threadIdx.y % 4) * 32;
+  for (Index k = 0; k < k_size; k += 32) {
+    lhs_pf0 = internal::pset1<float4>(0);
+    lhs_pf1 = internal::pset1<float4>(0);
+    lhs_pf2 = internal::pset1<float4>(0);
+    lhs_pf3 = internal::pset1<float4>(0);
+
+    rhs_pf0 = internal::pset1<float4>(0);
+    rhs_pf1 = internal::pset1<float4>(0);
+
+    if (!CHECK_LHS_BOUNDARY) {
+      if ((threadIdx.y / 4 + k + 24) < k_size) {
+        lhs_pf0 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k));
+        lhs_pf1 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k + 8));
+        lhs_pf2 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k + 16));
+        lhs_pf3 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k + 24));
+      } else if ((threadIdx.y / 4 + k + 16) < k_size) {
+        lhs_pf0 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k));
+        lhs_pf1 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k + 8));
+        lhs_pf2 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k + 16));
+      } else if ((threadIdx.y / 4 + k + 8) < k_size) {
+        lhs_pf0 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k));
+        lhs_pf1 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k + 8));
+      } else if ((threadIdx.y / 4 + k) < k_size) {
+        lhs_pf0 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k));
+      }
+    } else {
+      // just CHECK_LHS_BOUNDARY
+      if (lhs_vert + 3 < m_size) {
+        if ((threadIdx.y / 4 + k + 24) < k_size) {
+          lhs_pf0 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k));
+          lhs_pf1 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k + 8));
+          lhs_pf2 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k + 16));
+          lhs_pf3 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k + 24));
+        } else if ((threadIdx.y / 4 + k + 16) < k_size) {
+          lhs_pf0 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k));
+          lhs_pf1 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k + 8));
+          lhs_pf2 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k + 16));
+        } else if ((threadIdx.y / 4 + k + 8) < k_size) {
+          lhs_pf0 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k));
+          lhs_pf1 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k + 8));
+        } else if ((threadIdx.y / 4 + k) < k_size) {
+          lhs_pf0 = lhs.template loadPacket<float4, Unaligned>(lhs_vert, (threadIdx.y / 4 + k));
+        }
+      } else if (lhs_vert + 2 < m_size) {
+        if ((threadIdx.y / 4 + k + 24) < k_size) {
+          lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k));
+          lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k));
+          lhs_pf0.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k));
+          lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8));
+          lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8));
+          lhs_pf1.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 8));
+          lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16));
+          lhs_pf2.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 16));
+          lhs_pf2.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 16));
+          lhs_pf3.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 24));
+          lhs_pf3.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 24));
+          lhs_pf3.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 24));
+        } else if ((threadIdx.y / 4 + k + 16) < k_size) {
+          lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k));
+          lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k));
+          lhs_pf0.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k));
+          lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8));
+          lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8));
+          lhs_pf1.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 8));
+          lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16));
+          lhs_pf2.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 16));
+          lhs_pf2.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 16));
+        } else if ((threadIdx.y / 4 + k + 8) < k_size) {
+          lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k));
+          lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k));
+          lhs_pf0.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k));
+          lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8));
+          lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8));
+          lhs_pf1.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k + 8));
+        } else if ((threadIdx.y / 4 + k) < k_size) {
+          lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k));
+          lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k));
+          lhs_pf0.z = lhs(lhs_vert + 2, (threadIdx.y / 4 + k));
+        }
+      } else if (lhs_vert + 1 < m_size) {
+        if ((threadIdx.y / 4 + k + 24) < k_size) {
+          lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k));
+          lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k));
+          lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8));
+          lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8));
+          lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16));
+          lhs_pf2.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 16));
+          lhs_pf3.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 24));
+          lhs_pf3.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 24));
+        } else if ((threadIdx.y / 4 + k + 16) < k_size) {
+          lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k));
+          lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k));
+          lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8));
+          lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8));
+          lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16));
+          lhs_pf2.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 16));
+        } else if ((threadIdx.y / 4 + k + 8) < k_size) {
+          lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k));
+          lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k));
+          lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8));
+          lhs_pf1.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k + 8));
+        } else if ((threadIdx.y / 4 + k) < k_size) {
+          lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k));
+          lhs_pf0.y = lhs(lhs_vert + 1, (threadIdx.y / 4 + k));
+        }
+      } else if (lhs_vert < m_size) {
+        if ((threadIdx.y / 4 + k + 24) < k_size) {
+          lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k));
+          lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8));
+          lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16));
+          lhs_pf3.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 24));
+        } else if ((threadIdx.y / 4 + k + 16) < k_size) {
+          lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k));
+          lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8));
+          lhs_pf2.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 16));
+        } else if ((threadIdx.y / 4 + k + 8) < k_size) {
+          lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k));
+          lhs_pf1.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k + 8));
+        } else if ((threadIdx.y / 4 + k) < k_size) {
+          lhs_pf0.x = lhs(lhs_vert + 0, (threadIdx.y / 4 + k));
+        }
+      }
+    }
+    __syncthreads();
+    Index rhs_vert = k + threadIdx.x * 4;
+    Index rhs_horiz0 = threadIdx.y * 2 + base_n;
+    Index rhs_horiz1 = threadIdx.y * 2 + 1 + base_n;
+    if (!CHECK_RHS_BOUNDARY) {
+      if ((rhs_vert + 3) < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0 = rhs.template loadPacket<float4, Unaligned>(rhs_vert, rhs_horiz0);
+        rhs_pf1 = rhs.template loadPacket<float4, Unaligned>(rhs_vert, rhs_horiz1);
+      } else if (rhs_vert + 2 < k_size) {
+        // just CHECK_RHS_BOUNDARY
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+        rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+      } else if (rhs_vert + 1 < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+        rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+      } else if (rhs_vert < k_size) {
+        rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+      }
+    } else {
+      if (rhs_horiz1 < n_size) {
+        if ((rhs_vert + 3) < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0 = rhs.template loadPacket<float4, Unaligned>(rhs_vert, rhs_horiz0);
+          rhs_pf1 = rhs.template loadPacket<float4, Unaligned>(rhs_vert, rhs_horiz1);
+        } else if (rhs_vert + 2 < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+          rhs_pf1.z = rhs(rhs_vert + 2, rhs_horiz1);
+        } else if (k + threadIdx.x * 4 + 1 < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+          rhs_pf1.y = rhs(rhs_vert + 1, rhs_horiz1);
+        } else if (k + threadIdx.x * 4 < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf1.x = rhs(rhs_vert, rhs_horiz1);
+        }
+      } else if (rhs_horiz0 < n_size) {
+        if ((rhs_vert + 3) < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0 = rhs.template loadPacket<float4, Unaligned>(rhs_vert, rhs_horiz0);
+        } else if ((rhs_vert + 2) < k_size) {
+          // just CHECK_RHS_BOUNDARY
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+          rhs_pf0.z = rhs(rhs_vert + 2, rhs_horiz0);
+        } else if ((rhs_vert + 1) < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+          rhs_pf0.y = rhs(rhs_vert + 1, rhs_horiz0);
+        } else if (rhs_vert < k_size) {
+          rhs_pf0.x = rhs(rhs_vert, rhs_horiz0);
+        }
+      }
+    }
+    __syncthreads();
+    // Loaded. Do computation
+    // Row 0 -> times (0, 4, 8, .. 28) for features 0, 1.
+    // Row 1 -> times (0, 4, 8, .. 28) for features 2, 3.
+    // ..
+    // Row 31 -> times (0, 4, 8, .. 28) for features 62, 63
+    rhs_shmem2[threadIdx.y][threadIdx.x] = make_float2(rhs_pf0.x, rhs_pf1.x);
+    // Row 32 -> times (1, 5, 9, .. 29) for features 0, 1.
+    // Row 33 -> times (1, 5, 9, .. 29) for features 2, 3.
+    // ..
+    rhs_shmem2[threadIdx.y + 32][threadIdx.x] = make_float2(rhs_pf0.y, rhs_pf1.y);
+    // Row 64 -> times (2, 6, 10, .. 30) for features 0, 1.
+    // Row 65 -> times (2, 6, 10, .. 30) for features 2, 3.
+    rhs_shmem2[threadIdx.y + 64][threadIdx.x] = make_float2(rhs_pf0.z, rhs_pf1.z);
+    // Row 96 -> times (3, 7, 11, .. 31) for features 0, 1.
+    // Row 97 -> times (3, 7, 11, .. 31) for features 2, 3.
+    rhs_shmem2[threadIdx.y + 96][threadIdx.x] = make_float2(rhs_pf0.w, rhs_pf1.w);
+
+    // LHS.
+    // Row 0 (time 0) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
+    // Row 1 (time 1) -> features (0, 1), (4, 5), .. (28, 29), (32, 33), ..  (60, 61) .. (124, 125)
+    // ...
+    // Row 8 (time 0) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
+    // Row 15 (time 7) -> features (2, 3), (6, 7), .. (30, 31), (34, 35), ..  (62, 63) .. (126, 127)
+
+#define add_vals(a_feat1, a_feat2, f1, f2, f3, f4) \
+  results[0].x += a_feat1.x * f1.x;                \
+  results[1].x += a_feat1.x * f1.y;                \
+  results[2].x += a_feat1.x * f2.x;                \
+  results[3].x += a_feat1.x * f2.y;                \
+  results[4].x += a_feat1.x * f3.x;                \
+  results[5].x += a_feat1.x * f3.y;                \
+  results[6].x += a_feat1.x * f4.x;                \
+  results[7].x += a_feat1.x * f4.y;                \
+                                                   \
+  results[0].y += a_feat1.y * f1.x;                \
+  results[1].y += a_feat1.y * f1.y;                \
+  results[2].y += a_feat1.y * f2.x;                \
+  results[3].y += a_feat1.y * f2.y;                \
+  results[4].y += a_feat1.y * f3.x;                \
+  results[5].y += a_feat1.y * f3.y;                \
+  results[6].y += a_feat1.y * f4.x;                \
+  results[7].y += a_feat1.y * f4.y;                \
+                                                   \
+  results[0].z += a_feat2.x * f1.x;                \
+  results[1].z += a_feat2.x * f1.y;                \
+  results[2].z += a_feat2.x * f2.x;                \
+  results[3].z += a_feat2.x * f2.y;                \
+  results[4].z += a_feat2.x * f3.x;                \
+  results[5].z += a_feat2.x * f3.y;                \
+  results[6].z += a_feat2.x * f4.x;                \
+  results[7].z += a_feat2.x * f4.y;                \
+                                                   \
+  results[0].w += a_feat2.y * f1.x;                \
+  results[1].w += a_feat2.y * f1.y;                \
+  results[2].w += a_feat2.y * f2.x;                \
+  results[3].w += a_feat2.y * f2.y;                \
+  results[4].w += a_feat2.y * f3.x;                \
+  results[5].w += a_feat2.y * f3.y;                \
+  results[6].w += a_feat2.y * f4.x;                \
+  results[7].w += a_feat2.y * f4.y;
+
+    lhs_shmem2[threadIdx.y / 4][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf0.x, lhs_pf0.y);
+    lhs_shmem2[threadIdx.y / 4 + 8][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf1.x, lhs_pf1.y);
+    lhs_shmem2[threadIdx.y / 4 + 16][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf2.x, lhs_pf2.y);
+    lhs_shmem2[threadIdx.y / 4 + 24][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf3.x, lhs_pf3.y);
+
+    lhs_shmem2[threadIdx.y / 4 + 32][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf0.z, lhs_pf0.w);
+    lhs_shmem2[threadIdx.y / 4 + 40][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf1.z, lhs_pf1.w);
+    lhs_shmem2[threadIdx.y / 4 + 48][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf2.z, lhs_pf2.w);
+    lhs_shmem2[threadIdx.y / 4 + 56][threadIdx.x + (threadIdx.y % 4) * 8] = make_float2(lhs_pf3.z, lhs_pf3.w);
+
+    __syncthreads();
+
+// Do the multiplies.
+#pragma unroll
+    for (int koff = 0; koff < 32; koff++) {
+      float2 a3 = lhs_shmem2[koff][threadIdx.x + (threadIdx.y % 4) * 8];
+      float2 a4 = lhs_shmem2[koff + 32][threadIdx.x + (threadIdx.y % 4) * 8];
+
+      // first feature is at (threadIdx.y/4) * 8 last is at start + 8.
+      int start_feature = (threadIdx.y / 4) * 8;
+
+      float2 br1 = rhs_shmem2[start_feature / 2 + (koff % 4) * 32][koff / 4];
+      float2 br2 = rhs_shmem2[start_feature / 2 + 1 + (koff % 4) * 32][koff / 4];
+      float2 br3 = rhs_shmem2[start_feature / 2 + 2 + (koff % 4) * 32][koff / 4];
+      float2 br4 = rhs_shmem2[start_feature / 2 + 3 + (koff % 4) * 32][koff / 4];
+
+      add_vals(a3, a4, br1, br2, br3, br4)
+    }
+    __syncthreads();
+  }  // end loop over k
+
+#undef add_vals
+
+  __syncthreads();
+  Index horiz_base = (threadIdx.y / 4) * 8 + base_n;
+  if (!CHECK_LHS_BOUNDARY && !CHECK_RHS_BOUNDARY) {
+    for (int i = 0; i < 8; i++) {
+      output(lhs_vert, horiz_base + i) = results[i].x;
+      output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      output(lhs_vert + 3, horiz_base + i) = results[i].w;
+    }
+  } else if (!CHECK_RHS_BOUNDARY) {
+    if (lhs_vert + 3 < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    } else if (lhs_vert + 2 < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+      }
+    } else if (lhs_vert + 1 < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+      }
+    } else if (lhs_vert < m_size) {
+      for (int i = 0; i < 8; i++) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+      }
+    }
+  } else if (!CHECK_LHS_BOUNDARY) {
+    // CHECK BOUNDARY_B
+    for (int i = 0; i < 8; i++) {
+      if (horiz_base + i < n_size) {
+        output(lhs_vert, horiz_base + i) = results[i].x;
+        output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    }
+  } else {
+    // CHECK both boundaries.
+    for (int i = 0; i < 8; i++) {
+      if (horiz_base + i < n_size) {
+        if (lhs_vert < m_size) output(lhs_vert, horiz_base + i) = results[i].x;
+        if (lhs_vert + 1 < m_size) output(lhs_vert + 1, horiz_base + i) = results[i].y;
+        if (lhs_vert + 2 < m_size) output(lhs_vert + 2, horiz_base + i) = results[i].z;
+        if (lhs_vert + 3 < m_size) output(lhs_vert + 3, horiz_base + i) = results[i].w;
+      }
+    }
+  }
+}
+
+template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper>
+__global__ void
+#if defined(EIGEN_HIPCC)
+__launch_bounds__(256, 1)
+#else
+__launch_bounds__(256)
+#endif
+    EigenFloatContractionKernel(const LhsMapper lhs, const RhsMapper rhs, const OutputMapper output, const Index m_size,
+                                const Index n_size, const Index k_size) {
+  __shared__ float2 lhs_shmem[64 * 32];
+  __shared__ float2 rhs_shmem[128 * 8];
+
+  typedef float2 LHS_MEM[64][32];
+  typedef float2 RHS_MEM[128][8];
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 128 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  bool check_rhs = (base_n + 63) >= n_size;
+  bool check_lhs128 = (base_m + 127) >= m_size;
+
+  if (!check_rhs) {
+    if (!check_lhs128) {
+      // >= 128 rows left
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
+          lhs, rhs, output, *((LHS_MEM*)lhs_shmem), *((RHS_MEM*)rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
+          lhs, rhs, output, *((LHS_MEM*)lhs_shmem), *((RHS_MEM*)rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    }
+  } else {
+    if (!check_lhs128) {
+      // >= 128 rows left
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
+          lhs, rhs, output, *((LHS_MEM*)lhs_shmem), *((RHS_MEM*)rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
+          lhs, rhs, output, *((LHS_MEM*)lhs_shmem), *((RHS_MEM*)rhs_shmem), m_size, n_size, k_size, base_m, base_n);
+    }
+  }
+}
+
+template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper>
+__global__ void
+#if defined(EIGEN_HIPCC)
+__launch_bounds__(256, 1)
+#else
+__launch_bounds__(256)
+#endif
+    EigenFloatContractionKernel16x16(const LhsMapper lhs, const RhsMapper rhs, const OutputMapper output,
+                                     const Index m_size, const Index n_size, const Index k_size) {
+  __shared__ float2 lhs_shmem[32][16];
+  __shared__ float2 rhs_shmem[64][8];
+
+  const Index m_block_idx = blockIdx.x;
+  const Index n_block_idx = blockIdx.y;
+
+  const Index base_m = 64 * m_block_idx;
+  const Index base_n = 64 * n_block_idx;
+
+  if (base_m + 63 < m_size) {
+    if (base_n + 63 < n_size) {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, false>(
+          lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, false, true>(
+          lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    }
+  } else {
+    if (base_n + 63 < n_size) {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, false>(
+          lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    } else {
+      EigenFloatContractionKernelInternal16x16<Index, LhsMapper, RhsMapper, OutputMapper, true, true>(
+          lhs, rhs, output, lhs_shmem, rhs_shmem, m_size, n_size, k_size, base_m, base_n);
+    }
+  }
+}
+
+template <typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice>
+    : public TensorContractionEvaluatorBase<TensorEvaluator<
+          const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, GpuDevice> > {
+  typedef GpuDevice Device;
+
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
+  typedef TensorContractionEvaluatorBase<Self> Base;
+
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+  typedef std::remove_const_t<typename XprType::Scalar> Scalar;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
+
+  static constexpr int Layout = TensorEvaluator<LeftArgType, Device>::Layout;
+
+  // Most of the code is assuming that both input tensors are ColMajor. If the
+  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+  // will pretend B is LHS and A is RHS.
+  typedef std::conditional_t<Layout == static_cast<int>(ColMajor), LeftArgType, RightArgType> EvalLeftArgType;
+  typedef std::conditional_t<Layout == static_cast<int>(ColMajor), RightArgType, LeftArgType> EvalRightArgType;
+
+  static constexpr int LDims =
+      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+  static constexpr int RDims =
+      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+  static constexpr int ContractDims = internal::array_size<Indices>::value;
+
+  typedef array<Index, LDims> left_dim_mapper_t;
+  typedef array<Index, RDims> right_dim_mapper_t;
+
+  typedef array<Index, ContractDims> contract_t;
+  typedef array<Index, LDims - ContractDims> left_nocontract_t;
+  typedef array<Index, RDims - ContractDims> right_nocontract_t;
+
+  static constexpr int NumDims = LDims + RDims - 2 * ContractDims;
+
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  // typedefs needed in evalTo
+  typedef std::remove_const_t<typename EvalLeftArgType::Scalar> LhsScalar;
+  typedef std::remove_const_t<typename EvalRightArgType::Scalar> RhsScalar;
+
+  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+  typedef typename LeftEvaluator::Dimensions LeftDimensions;
+  typedef typename RightEvaluator::Dimensions RightDimensions;
+
+  TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {
+    EIGEN_STATIC_ASSERT((internal::is_same<OutputKernelType, const NoOpOutputKernel>::value),
+                        GPU_TENSOR_CONTRACTION_DOES_NOT_SUPPORT_OUTPUT_KERNELS);
+  }
+
+  // We need to redefine this method to make nvcc happy
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+    this->m_leftImpl.evalSubExprsIfNeeded(NULL);
+    this->m_rightImpl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      evalTo(data);
+      return false;
+    } else {
+      this->m_result = static_cast<Scalar*>(this->m_device.allocate(this->dimensions().TotalSize() * sizeof(Scalar)));
+      evalTo(this->m_result);
+      return true;
+    }
+  }
+
+  void evalTo(Scalar* buffer) const {
+    if (this->m_lhs_inner_dim_contiguous) {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, true, true, Unaligned>(buffer);
+        } else {
+          evalTyped<true, true, false, Unaligned>(buffer);
+        }
+      } else {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, false, true, Unaligned>(buffer);
+        } else {
+          evalTyped<true, false, false, Unaligned>(buffer);
+        }
+      }
+    } else {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, true, true, Unaligned>(buffer);
+        } else {
+          evalTyped<false, true, false, Unaligned>(buffer);
+        }
+      } else {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, false, true, Unaligned>(buffer);
+        } else {
+          evalTyped<false, false, false, Unaligned>(buffer);
+        }
+      }
+    }
+  }
+
+  template <typename LhsScalar, typename RhsScalar, typename Index, typename LhsMapper, typename RhsMapper,
+            typename OutputMapper>
+  struct LaunchKernels {
+    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k,
+                    const GpuDevice& device) {
+      const Index m_blocks = (m + 63) / 64;
+      const Index n_blocks = (n + 63) / 64;
+      const dim3 num_blocks(m_blocks, n_blocks, 1);
+      const dim3 block_size(8, 8, 8);
+      LAUNCH_GPU_KERNEL((EigenContractionKernel<Scalar, Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks,
+                        block_size, 0, device, lhs, rhs, output, m, n, k);
+    }
+  };
+
+  template <typename Index, typename LhsMapper, typename RhsMapper, typename OutputMapper>
+  struct LaunchKernels<float, float, Index, LhsMapper, RhsMapper, OutputMapper> {
+    static void Run(const LhsMapper& lhs, const RhsMapper& rhs, const OutputMapper& output, Index m, Index n, Index k,
+                    const GpuDevice& device) {
+      if (m < 768 || n < 768) {
+        const Index m_blocks = (m + 63) / 64;
+        const Index n_blocks = (n + 63) / 64;
+        const dim3 num_blocks(m_blocks, n_blocks, 1);
+        const dim3 block_size(16, 16, 1);
+        LAUNCH_GPU_KERNEL((EigenFloatContractionKernel16x16<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks,
+                          block_size, 0, device, lhs, rhs, output, m, n, k);
+      } else {
+        const Index m_blocks = (m + 127) / 128;
+        const Index n_blocks = (n + 63) / 64;
+        const dim3 num_blocks(m_blocks, n_blocks, 1);
+        const dim3 block_size(8, 32, 1);
+        LAUNCH_GPU_KERNEL((EigenFloatContractionKernel<Index, LhsMapper, RhsMapper, OutputMapper>), num_blocks,
+                          block_size, 0, device, lhs, rhs, output, m, n, k);
+      }
+    }
+  };
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  void evalTyped(Scalar* buffer) const {
+    // columns in left side, rows in right side
+    const Index k = this->m_k_size;
+    EIGEN_UNUSED_VARIABLE(k)
+
+    // rows in left side
+    const Index m = this->m_i_size;
+
+    // columns in right side
+    const Index n = this->m_j_size;
+
+    // zero out the result buffer (which must be of size at least m * n * sizeof(Scalar))
+    this->m_device.fill(buffer, buffer + m * n, Scalar(0));
+
+    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
+                                                   contract_t, 4, lhs_inner_dim_contiguous, false, Unaligned>
+        LhsMapper;
+
+    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
+                                                   contract_t, 4, rhs_inner_dim_contiguous, rhs_inner_dim_reordered,
+                                                   Unaligned>
+        RhsMapper;
+
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+    // initialize data mappers
+    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+                  this->m_left_contracting_strides, this->m_k_strides);
+
+    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+                  this->m_right_contracting_strides, this->m_k_strides);
+
+    OutputMapper output(buffer, m);
+
+#if defined(EIGEN_USE_HIP)
+    setGpuSharedMemConfig(hipSharedMemBankSizeEightByte);
+#else
+    setGpuSharedMemConfig(cudaSharedMemBankSizeEightByte);
+#endif
+
+    LaunchKernels<LhsScalar, RhsScalar, Index, LhsMapper, RhsMapper, OutputMapper>::Run(lhs, rhs, output, m, n, k,
+                                                                                        this->m_device);
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_USE_GPU and EIGEN_GPUCC
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_GPU_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
new file mode 100644
index 00000000..6367db96
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -0,0 +1,533 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+enum { Rhs = 0, Lhs = 1 };
+
+/*
+ * Implementation of the Eigen blas_data_mapper class for tensors.
+ */
+/// The make pointer class is used by sycl in order to build the mapper class on the device. For other platform the
+/// default make pointer is used which is scalar * for CoeffLoader.
+template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_ = MakePointer>
+struct CoeffLoader;
+
+template <typename Scalar, typename Index, int side, typename Tensor, typename nocontract_t, typename contract_t,
+          int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
+          template <class> class MakePointer_ = MakePointer>
+class BaseTensorContractionMapper;
+
+template <typename Tensor, bool HasRawAccess, template <class> class MakePointer_>
+struct CoeffLoader {
+  enum { DirectOffsets = false };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_tensor(tensor) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index) {
+    eigen_assert(false && "unsupported");
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_<const typename Tensor::Scalar>::Type data() const {
+    eigen_assert(false && "unsupported");
+    return NULL;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const {
+    return m_tensor.coeff(index);
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Tensor::PacketReturnType packet(typename Tensor::Index index) const {
+    return m_tensor.template packet<LoadMode>(index);
+  }
+
+ private:
+  const Tensor m_tensor;
+};
+
+template <typename Tensor, template <class> class MakePointer_>
+struct CoeffLoader<Tensor, true, MakePointer_> {
+  enum { DirectOffsets = true };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffLoader(const Tensor& tensor) : m_data(tensor.data()) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) { m_data += offset; }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const typename MakePointer_<const typename Tensor::Scalar>::Type data() const {
+    return m_data;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename Tensor::Scalar coeff(typename Tensor::Index index) const {
+    return loadConstant(m_data + index);
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Tensor::PacketReturnType packet(typename Tensor::Index index) const {
+    return internal::ploadt_ro<typename Tensor::PacketReturnType, LoadMode>(m_data + index);
+  }
+
+ private:
+  typedef typename Tensor::Scalar Scalar;
+
+  typename MakePointer_<const Scalar>::Type m_data;
+};
+
+template <typename Scalar, typename Index, int side, typename Tensor, typename nocontract_t, typename contract_t,
+          int packet_size, bool inner_dim_contiguous, int Alignment, template <class> class MakePointer_ = MakePointer>
+class SimpleTensorContractionMapper {
+ public:
+  EIGEN_DEVICE_FUNC SimpleTensorContractionMapper(const Tensor& tensor, const nocontract_t& nocontract_strides,
+                                                  const nocontract_t& ij_strides, const contract_t& contract_strides,
+                                                  const contract_t& k_strides)
+      : m_tensor(tensor),
+        m_nocontract_strides(nocontract_strides),
+        m_ij_strides(ij_strides),
+        m_contract_strides(contract_strides),
+        m_k_strides(k_strides) {}
+
+  enum { DirectOffsets = CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>::DirectOffsets };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void offsetBuffer(typename Tensor::Index offset) {
+    m_tensor.offsetBuffer(offset);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void prefetch(Index /*i*/) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(Index row) const {
+    // column major assumption
+    return operator()(row, 0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(Index row, Index col) const {
+    return m_tensor.coeff(computeIndex(row, col));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index computeIndex(Index row, Index col) const {
+    const bool left = (side == Lhs);
+    EIGEN_UNUSED_VARIABLE(left);  // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
+    Index nocontract_val = left ? row : col;
+    Index linidx = 0;
+    EIGEN_UNROLL_LOOP
+    for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
+      const Index idx = nocontract_val / m_ij_strides[i];
+      linidx += idx * m_nocontract_strides[i];
+      nocontract_val -= idx * m_ij_strides[i];
+    }
+    if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+      if (side == Lhs && inner_dim_contiguous) {
+        eigen_assert(m_nocontract_strides[0] == 1);
+        linidx += nocontract_val;
+      } else {
+        linidx += nocontract_val * m_nocontract_strides[0];
+      }
+    }
+
+    Index contract_val = left ? col : row;
+    if (array_size<contract_t>::value > 0) {
+      EIGEN_UNROLL_LOOP
+      for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+        const Index idx = contract_val / m_k_strides[i];
+        linidx += idx * m_contract_strides[i];
+        contract_val -= idx * m_k_strides[i];
+      }
+
+      if (side == Rhs && inner_dim_contiguous) {
+        eigen_assert(m_contract_strides[0] == 1);
+        linidx += contract_val;
+      } else {
+        linidx += contract_val * m_contract_strides[0];
+      }
+    }
+
+    return linidx;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexPair<Index> computeIndexPair(Index row, Index col,
+                                                                          const Index distance) const {
+    const bool left = (side == Lhs);
+    EIGEN_UNUSED_VARIABLE(left);  // annoying bug in g++8.1: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85963
+    Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
+    Index linidx[2] = {0, 0};
+    if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+      EIGEN_UNROLL_LOOP
+      for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
+        const Index idx0 = nocontract_val[0] / m_ij_strides[i];
+        const Index idx1 = nocontract_val[1] / m_ij_strides[i];
+        linidx[0] += idx0 * m_nocontract_strides[i];
+        linidx[1] += idx1 * m_nocontract_strides[i];
+        nocontract_val[0] -= idx0 * m_ij_strides[i];
+        nocontract_val[1] -= idx1 * m_ij_strides[i];
+      }
+      if (side == Lhs && inner_dim_contiguous) {
+        eigen_assert(m_nocontract_strides[0] == 1);
+        linidx[0] += nocontract_val[0];
+        linidx[1] += nocontract_val[1];
+      } else {
+        linidx[0] += nocontract_val[0] * m_nocontract_strides[0];
+        linidx[1] += nocontract_val[1] * m_nocontract_strides[0];
+      }
+    }
+
+    Index contract_val[2] = {left ? col : row, left ? col : row + distance};
+    if (array_size<contract_t>::value > 0) {
+      EIGEN_UNROLL_LOOP
+      for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+        const Index idx0 = contract_val[0] / m_k_strides[i];
+        const Index idx1 = contract_val[1] / m_k_strides[i];
+        linidx[0] += idx0 * m_contract_strides[i];
+        linidx[1] += idx1 * m_contract_strides[i];
+        contract_val[0] -= idx0 * m_k_strides[i];
+        contract_val[1] -= idx1 * m_k_strides[i];
+      }
+
+      if (side == Rhs && inner_dim_contiguous) {
+        eigen_assert(m_contract_strides[0] == 1);
+        linidx[0] += contract_val[0];
+        linidx[1] += contract_val[1];
+      } else {
+        linidx[0] += contract_val[0] * m_contract_strides[0];
+        linidx[1] += contract_val[1] * m_contract_strides[0];
+      }
+    }
+    return IndexPair<Index>(linidx[0], linidx[1]);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index firstAligned(Index size) const {
+    // Only claim alignment when we can compute the actual stride (ie when we're
+    // dealing with the lhs with inner_dim_contiguous. This is because the
+    // matrix-vector product relies on the stride when dealing with aligned inputs.
+    return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const {
+    return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
+  }
+
+  const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& tensor() const { return m_tensor; }
+
+  const nocontract_t& nocontract_strides() const { return m_nocontract_strides; }
+  const nocontract_t& ij_strides() const { return m_ij_strides; }
+  const contract_t& contract_strides() const { return m_contract_strides; }
+  const contract_t& k_strides() const { return m_k_strides; }
+
+ protected:
+  CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_> m_tensor;
+  const nocontract_t m_nocontract_strides;
+  const nocontract_t m_ij_strides;
+  const contract_t m_contract_strides;
+  const contract_t m_k_strides;
+};
+
+template <typename Scalar, typename Index, int side, typename Tensor, typename nocontract_t, typename contract_t,
+          int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
+          template <class> class MakePointer_>
+class BaseTensorContractionMapper
+    : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size,
+                                           inner_dim_contiguous, Alignment, MakePointer_> {
+ public:
+  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size,
+                                        inner_dim_contiguous, Alignment, MakePointer_>
+      ParentMapper;
+
+  EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, const nocontract_t& nocontract_strides,
+                                                const nocontract_t& ij_strides, const contract_t& contract_strides,
+                                                const contract_t& k_strides)
+      : ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) {}
+
+  template <typename PacketT, int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      std::enable_if_t<internal::unpacket_traits<PacketT>::size == packet_size, PacketT>
+      load(Index i, Index j) const {
+    // whole method makes column major assumption
+
+    // don't need to add offsets for now (because operator handles that)
+    // current code assumes packet size must be a multiple of 2
+    EIGEN_STATIC_ASSERT(packet_size % 2 == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    if (Tensor::PacketAccess && inner_dim_contiguous && !inner_dim_reordered) {
+      const Index index = this->computeIndex(i, j);
+      eigen_assert(this->computeIndex(i + packet_size - 1, j) == index + packet_size - 1);
+      return this->m_tensor.template packet<AlignmentType>(index);
+    }
+
+    const IndexPair<Index> indexPair = this->computeIndexPair(i, j, packet_size - 1);
+    const Index first = indexPair.first;
+    const Index lastIdx = indexPair.second;
+
+    // We can always do optimized packet reads from left hand side right now, because
+    // the vertical matrix dimension on the left hand side is never contracting.
+    // On the right hand side we need to check if the contracting dimensions may have
+    // been shuffled first.
+    if (Tensor::PacketAccess && (side == Lhs || internal::array_size<contract_t>::value <= 1 || !inner_dim_reordered) &&
+        (lastIdx - first) == (packet_size - 1)) {
+      return this->m_tensor.template packet<AlignmentType>(first);
+    }
+
+    EIGEN_ALIGN_MAX Scalar data[packet_size];
+
+    data[0] = this->m_tensor.coeff(first);
+    EIGEN_UNROLL_LOOP
+    for (Index k = 1; k < packet_size - 1; k += 2) {
+      const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
+      data[k] = this->m_tensor.coeff(internal_pair.first);
+      data[k + 1] = this->m_tensor.coeff(internal_pair.second);
+    }
+    data[packet_size - 1] = this->m_tensor.coeff(lastIdx);
+
+    return pload<PacketT>(data);
+  }
+
+  template <typename PacketT, int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      std::enable_if_t<internal::unpacket_traits<PacketT>::size != packet_size, PacketT>
+      load(Index i, Index j) const {
+    const Index requested_packet_size = internal::unpacket_traits<PacketT>::size;
+    EIGEN_ALIGN_MAX Scalar data[requested_packet_size];
+
+    const IndexPair<Index> indexPair = this->computeIndexPair(i, j, requested_packet_size - 1);
+    const Index first = indexPair.first;
+    const Index lastIdx = indexPair.second;
+
+    data[0] = this->m_tensor.coeff(first);
+    for (Index k = 1; k < requested_packet_size - 1; k += 2) {
+      const IndexPair<Index> internal_pair = this->computeIndexPair(i + k, j, 1);
+      data[k] = this->m_tensor.coeff(internal_pair.first);
+      data[k + 1] = this->m_tensor.coeff(internal_pair.second);
+    }
+    data[requested_packet_size - 1] = this->m_tensor.coeff(lastIdx);
+
+    return pload<PacketT>(data);
+  }
+
+  template <typename PacketT, int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
+    return this->load<PacketT, AlignmentType>(i, j);
+  }
+};
+
+template <typename Scalar, typename Index, int side, typename Tensor, typename nocontract_t, typename contract_t,
+          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment, template <class> class MakePointer_>
+class BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous,
+                                  inner_dim_reordered, Alignment, MakePointer_>
+    : public SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1,
+                                           inner_dim_contiguous, Alignment, MakePointer_> {
+ public:
+  typedef SimpleTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, 1, inner_dim_contiguous,
+                                        Alignment, MakePointer_>
+      ParentMapper;
+
+  EIGEN_DEVICE_FUNC BaseTensorContractionMapper(const Tensor& tensor, const nocontract_t& nocontract_strides,
+                                                const nocontract_t& ij_strides, const contract_t& contract_strides,
+                                                const contract_t& k_strides)
+      : ParentMapper(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) {}
+
+  template <typename PacketT, int>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT loadPacket(Index i, Index j) const {
+    EIGEN_ALIGN_MAX Scalar data[1];
+    data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
+    return pload<PacketT>(data);
+  }
+  template <typename PacketT, int>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketT load(Index i, Index j) const {
+    EIGEN_ALIGN_MAX Scalar data[1];
+    data[0] = this->m_tensor.coeff(this->computeIndex(i, j));
+    return pload<PacketT>(data);
+  }
+};
+
+template <typename Scalar, typename Index, int side, typename Tensor, typename nocontract_t, typename contract_t,
+          int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
+          template <class> class MakePointer_ = MakePointer>
+class TensorContractionSubMapper {
+ public:
+  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size,
+                                      inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_>
+      ParentMapper;
+  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size,
+                                     inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_>
+      Self;
+  typedef Self LinearMapper;
+  typedef Self SubMapper;
+
+  enum {
+    // We can use direct offsets iff the parent mapper supports then and we can compute the strides.
+    // TODO: we should also enable direct offsets for the Rhs case.
+    UseDirectOffsets =
+        ParentMapper::DirectOffsets && (side == Lhs) && inner_dim_contiguous && (array_size<contract_t>::value > 0)
+  };
+
+  EIGEN_DEVICE_FUNC TensorContractionSubMapper(const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
+      : m_base_mapper(base_mapper), m_vert_offset(vert_offset), m_horiz_offset(horiz_offset) {
+    // Bake the offsets into the buffer used by the base mapper whenever possible. This avoids the need to recompute
+    // this offset every time we attempt to access a coefficient.
+    if (UseDirectOffsets) {
+      Index stride = m_base_mapper.stride();
+      m_base_mapper.offsetBuffer(vert_offset + horiz_offset * stride);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper(i, 0);
+    }
+    return m_base_mapper(i + m_vert_offset, m_horiz_offset);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar operator()(Index i, Index j) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper(i, j);
+    }
+    return m_base_mapper(i + m_vert_offset, j + m_horiz_offset);
+  }
+
+  template <typename PacketT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper.template loadPacket<PacketT, Alignment>(i, 0);
+    }
+    return m_base_mapper.template loadPacket<PacketT, Alignment>(i + m_vert_offset, m_horiz_offset);
+  }
+
+  template <typename PacketT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper.template loadPacket<PacketT, Alignment>(i, j);
+    }
+    return m_base_mapper.template loadPacket<PacketT, Alignment>(i + m_vert_offset, j + m_horiz_offset);
+  }
+
+  template <typename PacketT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacketPartial(Index i, Index j, Index, Index = 0) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper.template loadPacket<PacketT, Alignment>(i, j);
+    }
+    return m_base_mapper.template loadPacket<PacketT, Alignment>(i + m_vert_offset, j + m_horiz_offset);
+  }
+
+  template <typename PacketT, int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT loadPacket(Index i, Index j) const {
+    if (UseDirectOffsets) {
+      return m_base_mapper.template load<PacketT, AlignmentType>(i, j);
+    }
+    return m_base_mapper.template loadPacket<PacketT, AlignmentType>(i + m_vert_offset, j + m_horiz_offset);
+  }
+
+  template <typename PacketT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketT& p) const {
+    if (UseDirectOffsets) {
+      m_base_mapper.storePacket(i, 0, p);
+    }
+    m_base_mapper.storePacket(i + m_vert_offset, m_horiz_offset, p);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+    if (UseDirectOffsets) {
+      return LinearMapper(m_base_mapper, i, j);
+    }
+    return LinearMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubMapper getSubMapper(Index i, Index j) const {
+    if (UseDirectOffsets) {
+      return SubMapper(m_base_mapper, i, j);
+    }
+    return SubMapper(m_base_mapper, i + m_vert_offset, j + m_horiz_offset);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const Index stride() const { return m_base_mapper.stride(); }
+
+  template <typename PacketT, int AlignmentType>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i) const {
+    EIGEN_STATIC_ASSERT((internal::is_same<PacketT, PacketT>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    const int ActualAlignment = (AlignmentType == Aligned) && (Alignment == Aligned) ? Aligned : Unaligned;
+    if (UseDirectOffsets) {
+      return m_base_mapper.template loadPacket<PacketT, ActualAlignment>(i, 0);
+    }
+    return m_base_mapper.template loadPacket<PacketT, ActualAlignment>(i + m_vert_offset, m_horiz_offset);
+  }
+
+  template <typename PacketT>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool aligned(Index) const {
+    return false;
+  }
+
+  const ParentMapper& base_mapper() const { return m_base_mapper; }
+  Index vert_offset() const { return m_vert_offset; }
+  Index horiz_offset() const { return m_horiz_offset; }
+
+ private:
+  ParentMapper m_base_mapper;
+  const Index m_vert_offset;
+  const Index m_horiz_offset;
+};
+
+template <typename Scalar_, typename Index, int side, typename Tensor, typename nocontract_t, typename contract_t,
+          int packet_size, bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment,
+          template <class> class MakePointer_ = MakePointer>
+class TensorContractionInputMapper
+    : public BaseTensorContractionMapper<Scalar_, Index, side, Tensor, nocontract_t, contract_t, packet_size,
+                                         inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_> {
+ public:
+  typedef Scalar_ Scalar;
+  typedef BaseTensorContractionMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size,
+                                      inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_>
+      Base;
+  typedef TensorContractionSubMapper<Scalar, Index, side, Tensor, nocontract_t, contract_t, packet_size,
+                                     inner_dim_contiguous, inner_dim_reordered, Alignment, MakePointer_>
+      SubMapper;
+  typedef SubMapper VectorMapper;
+  typedef SubMapper LinearMapper;
+
+  EIGEN_DEVICE_FUNC TensorContractionInputMapper(const Tensor& tensor, const nocontract_t& nocontract_strides,
+                                                 const nocontract_t& ij_strides, const contract_t& contract_strides,
+                                                 const contract_t& k_strides)
+      : Base(tensor, nocontract_strides, ij_strides, contract_strides, k_strides) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SubMapper getSubMapper(Index i, Index j) const {
+    return SubMapper(*this, i, j);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const {
+    return LinearMapper(*this, i, j);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE VectorMapper getVectorMapper(Index i, Index j) const {
+    return VectorMapper(*this, i, j);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const CoeffLoader<Tensor, Tensor::RawAccess, MakePointer_>& get_tensor() const {
+    return Base::m_tensor;
+  }
+};
+
+template <typename T>
+struct TensorContractionInputMapperTrait;
+
+template <typename Scalar_, typename Index_, int side_, typename Tensor_, typename nocontract_t_, typename contract_t_,
+          int packet_size_, bool inner_dim_contiguous_, bool inner_dim_reordered_, int Alignment_,
+          template <class> class MakePointer_>
+struct TensorContractionInputMapperTrait<
+    TensorContractionInputMapper<Scalar_, Index_, side_, Tensor_, nocontract_t_, contract_t_, packet_size_,
+                                 inner_dim_contiguous_, inner_dim_reordered_, Alignment_, MakePointer_> > {
+  typedef Tensor_ XprType;
+  static const bool inner_dim_contiguous = inner_dim_contiguous_;
+  static const bool inner_dim_reordered = inner_dim_reordered_;
+};
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_MAPPER_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
new file mode 100644
index 00000000..ea210a1f
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionSycl.h
@@ -0,0 +1,1653 @@
+// This file is part of Eigen, a lightweight C++ template library for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla Public License v. 2.0. If a copy of the MPL was not
+// distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorContractionSycl.h
+ *
+ * \brief:
+ *  TensorContractionSycl.h, provides various tensor contraction kernel for SYCL backend
+ *
+ *****************************************************************/
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace TensorSycl {
+namespace internal {
+
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+/*!
+ * \brief TVPanelSize, a template class used for setting the panel size required for launching General TensorVector
+ * contraction kernel on various hardware devices.
+ *
+ * \tparam Scalar: determines the element type of the tensor/vector
+ *
+ * \tparam StorageIndex  determines the Index type.
+ *
+ * \tparam NCWindow: determines the number of non-contracting element to be process by each work-group
+ *
+ * \tparam CFactor: determines the number of contracting element to be process by each thread
+ *
+ * \tparam NCFactor: determines the number of non-contracting element to be process by each thread
+ */
+template <typename Scalar, typename StorageIndex, StorageIndex NCWindow, StorageIndex CFactor, StorageIndex NCFactor>
+struct TVPanelSize {
+  // LocalThreadSizeC: determines total number of thread per workgroup for the contracting dimension
+  static constexpr StorageIndex LocalThreadSizeC = EIGEN_SYCL_LOCAL_THREAD_DIM0;
+  // LocalThreadSizeNC: determines total number of thread per workgroup for the non-contracting dimension
+  static constxpr StorageIndex LocalThreadSizeNC = EIGEN_SYCL_LOCAL_THREAD_DIM1;
+  // TileSizeDimNC: determines the tile size for the non-contracting dimension
+  static constexpr StorageIndex TileSizeDimNC = NCWindow / NCFactor;
+  // TileSizeDimC: determines the tile size for the contracting dimension
+  static constexpr StorageIndex TileSizeDimC = CFactor * LocalThreadSizeNC * LocalThreadSizeC;
+  // WorkLoadPerThreadNC : determines workload per thread for loading the non-contracting dimension
+  static constexpr StorageIndex WorkLoadPerThreadNC = TileSizeDimNC / LocalThreadSizeNC;
+  // WorkLoadPerThreadC: determines workload per thread for loading the non-contracting dimension
+  static constexpr StorageIndex WorkLoadPerThreadC = TileSizeDimC / LocalThreadSizeC;
+  // BC : determines if supporting bank conflict is required
+  static constexpr bool BC = false;
+};
+#endif
+
+/*!
+ * \brief TTPanelSize, a template class used for setting the panel size required for launching General Tensor Tensor
+ contraction kernel on various hardware devices.
+ *
+ * \tparam Scalar: determines the element type of the tensor
+ *
+ * \tparam StorageIndex: determines the Index type.
+ *
+ * \tparam REG_SIZE_M: determines workload per thread for loading the M dimension This can be varied based on the
+ available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro).
+ *
+ * \tparam REG_SIZE_N: determines workload per thread for loading the N dimension This can be varied based on the
+ available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro).
+ *
+ * \tparam TSDK: determines Tile size for dimension K. The packet size is assumed to be considered
+ */
+
+template <typename Scalar, typename StorageIndex, StorageIndex REG_SIZE_M, StorageIndex REG_SIZE_N, StorageIndex TSDK>
+struct TTPanelSize {
+  // TileSizeDimK: determines Tile size for dimension K. The packet size is assumed to be considered
+  static constexpr StorageIndex TileSizeDimK = TSDK;
+  // WorkLoadPerThreadM : determines workload per thread for loading the M dimension This can be varied based on the
+  // available register on a chosen device(can be controlled by EIGEN_SYCL_REG_M macro//
+#ifndef EIGEN_SYCL_REG_M
+  static constexpr StorageIndex WorkLoadPerThreadM = REG_SIZE_M;
+#else
+  static constexpr StorageIndex WorkLoadPerThreadM = EIGEN_SYCL_REG_M;
+#endif
+// WorkLoadPerThreadN : determines workload per thread for loading the N dimension This can be varied based on the
+// available register on a chosen device(can be controlled by EIGEN_SYCL_REG_N macro
+#ifndef EIGEN_SYCL_REG_N
+  static constexpr StorageIndex WorkLoadPerThreadN = REG_SIZE_N;
+#else
+  static constexpr StorageIndex WorkLoadPerThreadN = EIGEN_SYCL_REG_N;
+#endif
+  // LocalThreadSizeM: determines total number of thread per workgroup for the m dimension
+  static constexpr StorageIndex LocalThreadSizeM = EIGEN_SYCL_LOCAL_THREAD_DIM0;
+  // LocalThreadSizeN: determines total number of thread per workgroup for the n dimension
+  static constexpr StorageIndex LocalThreadSizeN = EIGEN_SYCL_LOCAL_THREAD_DIM1;
+  // TileSizeDimM: determines the tile size for the m dimension
+  static constexpr StorageIndex TileSizeDimM = LocalThreadSizeM * WorkLoadPerThreadM;
+  // TileSizeDimN: determines the tile size for the n dimension
+  static constexpr StorageIndex TileSizeDimN = LocalThreadSizeN * WorkLoadPerThreadN;
+  // LoadPerThreadLhs: determines workload per thread for loading Lhs Tensor. This must be divisible by packetsize
+  static constexpr StorageIndex LoadPerThreadLhs =
+      ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimN));
+  // LoadPerThreadRhs: determines workload per thread for loading Rhs Tensor. This must be divisible by packetsize
+  static constexpr StorageIndex LoadPerThreadRhs =
+      ((TileSizeDimK * WorkLoadPerThreadM * WorkLoadPerThreadN) / (TileSizeDimM));
+  // BC : determines if supporting bank conflict is required
+  static constexpr bool BC = true;
+  // DoubleBuffer: determines if double buffering technique should be used (This can be disabled by
+  // EIGEN_SYCL_DISABLE_DOUBLE_BUFFER macro when the device does not have sufficient local memory)
+  static constexpr bool DoubleBuffer =
+#ifdef EIGEN_SYCL_DISABLE_DOUBLE_BUFFER
+      false;
+#else
+      true;
+#endif
+};
+
+/* !
+ * \brief contraction_type: an enum class representing the Tensor Contraction implementation algorithm. This is used to
+ * specialize the contraction algorithm based on device support for dedicated local memory.
+ */
+enum class contraction_type { local, no_local };
+/* !
+ * \brief data_source an enum class determining the location of the data in a memory hierarchy (global, local, private).
+ */
+enum class data_source { global_mem, local_mem, private_mem };
+
+/*!
+ * \brief read, a template function used for loading the data from global
+ memory. This function is used to guarantee coalesced and vectorized load whenever possible
+ *
+ * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode
+ *
+ * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and
+ vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
+ contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case
+ when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed.
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam TensorMapper: determines the input tensor mapper type
+ *
+ * \tparam StorageIndex: determines the Index type
+
+ * \param tensorMapper: is the input tensor
+ *
+ * \param NCIndex: is the non-contracting dim index
+ *
+ * \param CIndex is the contracting dim index
+ *
+ * \param ld: is the leading dimension of the flattened tensor
+ */
+template <bool PacketLoad, bool is_coalesced_layout, bool, typename PacketType, typename TensorMapper,
+          typename StorageIndex>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<PacketLoad, PacketType> read(
+    const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &ld) {
+  const StorageIndex row = (is_coalesced_layout) ? NCIndex : CIndex;
+  const StorageIndex col = (is_coalesced_layout) ? CIndex : NCIndex;
+  return tensorMapper.get_tensor().template packet<Unaligned>(row + (col * ld));
+}
+
+/*!
+ * \brief read, special overload of read function, when the read access is not vectorized
+ *
+ * \tparam PacketLoad: determines if the each element of this tensor block should be loaded in a packet mode
+ *
+ * \param is_coalesced_layout: determines whether or not the Tensor data in a memory can be access coalesced and
+  vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
+  contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case
+  when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed.
+ *
+ * \tparam PacketType: determines the type of packet
+ *
+ * \tparam TensorMapper: determines the input tensor mapper type
+ *
+ * \tparam StorageIndex: determines the Index type
+
+ * \param tensorMapper: is the input tensor
+ *
+ * \param NCIndex: is the non-contracting dim index
+ *
+ * \param CIndex: is the contracting dim index
+ */
+template <bool PacketLoad, bool, bool IsRhs, typename PacketType, typename TensorMapper, typename StorageIndex>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!PacketLoad, PacketType> read(
+    const TensorMapper &tensorMapper, const StorageIndex &NCIndex, const StorageIndex &CIndex, const StorageIndex &) {
+  const StorageIndex row = (IsRhs) ? CIndex : NCIndex;
+  const StorageIndex col = (IsRhs) ? NCIndex : CIndex;
+  return tensorMapper(row, col);
+}
+
+/*!
+ * \brief write, a template function used for storing the data to local memory. This function is used to guarantee
+ * coalesced and vectorized store whenever possible.
+ *
+ * \tparam StorageIndex: determines the Index type
+ *
+ * \param ld is the leading dimension of the local memory. ld is a compile time value for the local memory
+ *
+ * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy.
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam DataScalar: determines the output data type
+ *
+ * \param packet_data: the data to be written in the local memory
+ *
+ * \param ptr: a pointer to the local memory
+ *
+ * \param CIndex is the contracting dim index
+ */
+
+template <typename StorageIndex, StorageIndex ld, data_source dt, typename PacketType, typename DataScalar>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<dt != data_source::global_mem, void> write(
+    PacketType &packet_data, DataScalar ptr) {
+  constexpr int PacketSize = Eigen::internal::unpacket_traits<PacketType>::size;
+  EIGEN_UNROLL_LOOP
+  for (int i = 0; i < PacketSize; i++) {
+    *ptr = PacketWrapper<PacketType, PacketSize>::scalarize(i, packet_data);
+    ptr += ld;
+  }
+}
+
+/*!
+ * \brief Overloading the write function for storing the data to global memory, when vectorization enabled This function
+ * is used to guarantee coalesced and vectorized store whenever possible.
+ *
+ * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy.
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam DataScalar: determines the output data type
+ *
+ * \param packet_data: the data to be written in the local memory
+ *
+ * \param ptr: a pointer to the local memory
+ */
+
+template <data_source dt, typename PacketType, typename DataScalar>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    typename std::enable_if_t<Eigen::internal::unpacket_traits<PacketType>::size != 1 && dt == data_source::global_mem,
+                              void>
+    write(PacketType &packet_data, DataScalar *ptr) {
+  ::Eigen::internal::pstoreu<DataScalar, PacketType>(ptr, packet_data);
+}
+
+/*!
+ * \brief Overloading the write function for storing the data to global memory, when vectorization is disabled.
+ *
+ * \tparam data_source: an enum value representing if the location of the data in a memory hierarchy.
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam DataScalar: determines the output data type
+ *
+ * \param packet_data: the data to be written in the local memory
+ *
+ * \param ptr: a pointer to the local memory
+ */
+template <data_source dt, typename PacketType, typename DataScalar>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    typename std::enable_if_t<Eigen::internal::unpacket_traits<PacketType>::size == 1 && dt == data_source::global_mem,
+                              void>
+    write(PacketType &packet_data, DataScalar *ptr) {
+  *ptr = packet_data;
+}
+
+/*!
+ * \brief check_boundary: is used to check the edge condition for non-internal blocks.
+ *
+ * \tparam is_internal: determines if the block is internal
+ */
+template <bool is_internal>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary(bool) {
+  return true;
+}
+
+/*!
+ * \brief check_boundary: specialization of the check_boundary for non-internal blocks.
+ *
+ * \param cond: true when the data is in range. Otherwise false
+ */
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool check_boundary<false>(bool cond) {
+  return cond;
+}
+
+/*!
+ * \brief BlockProperties is a template class that provides different characteristic of a block of each Tensor processed
+ * by each workgroup.
+ *
+ * \tparam is_transposed: iff true, determines whether or not the block of the Tensor is transposed
+ *
+ * \tparam packet_load_: determines if the each element of this tensor block should be loaded in a packet mode
+ *
+ * \tparam PacketType:  determines the type of packet
+ *
+ * \tparam OutType: determines the type of each element for this block of tensor. If packet load is true, it will be
+ * packetType; Otherwise it will be scalar Type
+ *
+ * \param elements_per_access determines the size of each element based on OutType
+ *
+ * \param is_coalesced_layout  determines whether or not the Tensor data in a memory can be access coalesced and
+ * vectorized when possible. Coalesced memory access is a key factor in Kernel performance. When a tensor is 2d and the
+ * contracting dimension is 1, it is always possible to accessed tensor data coalesced and vectorized. This is the case
+ * when RHS(right hand side) Tensor is transposed or when LHS(left hand side) Tensor is not transposed.
+ *
+ * \param nc_stride determines the stride of non-contracting dimension to access the next adjustment element within the
+ * Tensor Block for each workgroup
+ *
+ * \param c_stride  determines the stride of contracting dimension to access the next adjustment element within the
+ * Tensor Block for each workgroup
+ */
+template <bool is_transposed, bool is_rhs_, bool packet_load_, typename PacketType>
+struct BlockProperties {
+  static constexpr bool packet_load = packet_load_;
+  typedef typename Eigen::internal::unpacket_traits<PacketType>::type OutScalar;
+  static constexpr bool is_rhs = is_rhs_;
+  typedef std::conditional_t<packet_load, PacketType, OutScalar> OutType;
+  static constexpr int elements_per_access = Eigen::internal::unpacket_traits<OutType>::size;
+  static constexpr bool is_coalesced_layout = !(is_transposed ^ is_rhs);
+  static constexpr int nc_stride = (is_coalesced_layout ? elements_per_access : 1);
+  static constexpr int c_stride = (is_coalesced_layout ? 1 : elements_per_access);
+};
+
+/*!
+ * \brief ThreadProperties is a template class that provides each thread's properties within a workgroup.  Please see
+ * the sycl-1.2.1 specification (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for the workgroup,
+ * work-items
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \param linearLocalThreadId: determines the linearized location of a thread within a work-group
+ *
+ * \param kGroupId: determines the logical group id in a k dimension of the flattened tensor. It will be > 1 when
+ * tall/skinny algorithm is used
+ *
+ * \param mGroupOffset: determines the logical start position of all thread within a workgroup for the m dimension of
+ * the flattened tensor.
+ *
+ * \param kGroupOffset determines the logical start position of all thread within a workgroup for the k dimension of the
+ * flattened tensor. It will be > 1 when tall/skinny algorithm is used.
+ *
+ * \param mLocalOffset: determines the logical start position of each thread within a workgroup for the m dimension of a
+ * flattened tensor. The position determines the distance of each thread within the workgroup from each other
+ * independent from their global position.
+ *
+ * \param nLocalOffset: determines the logical start position of each thread within a workgroup for the n dimension of a
+ * flattened tensor. The position determines the distance of each thread within the workgroup from each other
+ * independent from their global position.
+ *
+ * \param mGlobalOffset: determines the logical start position of each thread a thread for the m dimension on a
+ * flattened tensor
+ *
+ * \param nGlobalOffset: determines the logical start position of each thread a thread for the n dimension on a
+ * flattened tensor
+ *
+ * \param kSize : determine the number of the k elements of the flattened Tensor to be processed by each thread for the
+ * given tensor block. This is !=K dimension of Flattened Tensor when Tall/Skinny matrix is used.
+ *
+ * \param is_internal : this will determined if the thread within the work-group computes an internal block of tensor or
+ * the edge blocks. When it is internal, there is no need to check the boundaries and all the if stantement can be
+ * resolve by compiler.
+ */
+template <typename StorageIndex>
+struct ThreadProperties {
+  const StorageIndex linearLocalThreadId;
+  const StorageIndex kGroupId;
+  const StorageIndex mGroupOffset;
+  const StorageIndex nGroupOffset;
+  const StorageIndex kGroupOffset;
+  const StorageIndex mLocalOffset;
+  const StorageIndex nLocalOffset;
+  const StorageIndex mGlobalOffset;
+  const StorageIndex nGlobalOffset;
+  StorageIndex kSize;
+  const bool is_internal;
+  // this is used to adjust the last block
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ThreadProperties(
+      const StorageIndex linearLocalThreadId_, const StorageIndex kGroupId_, const StorageIndex mGroupOffset_,
+      const StorageIndex nGroupOffset_, const StorageIndex kGroupOffset_, const StorageIndex mLocalOffset_,
+      const StorageIndex nLocalOffset_, const StorageIndex mGlobalOffset_, const StorageIndex nGlobalOffset_,
+      StorageIndex kSize_, const bool is_internal_)
+      : linearLocalThreadId(linearLocalThreadId_),
+        kGroupId(kGroupId_),
+        mGroupOffset(mGroupOffset_),
+        nGroupOffset(nGroupOffset_),
+        kGroupOffset(kGroupOffset_),
+        mLocalOffset(mLocalOffset_),
+        nLocalOffset(nLocalOffset_),
+        mGlobalOffset(mGlobalOffset_),
+        nGlobalOffset(nGlobalOffset_),
+        kSize(kSize_),
+        is_internal(is_internal_) {}
+};
+
+/*!
+ * \brief TensorContractionKernel is a template class that provides Tensor -Tensor contraction operation.
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam LhsScalar: determines the left-hand-side scalar type
+ *
+ * \tparam RhsScalar: determines the right-hand-side scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification
+ (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam LhsMapper determines the tensor contraction mapper type for left-hand-side matrix
+ *
+ * \tparam RhsMapper determines the tensor contraction mapper type for right-hand-side matrix
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Properties: determines the Contraction Panel properties
+ *
+ * \tparam TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \tparam input_mapper_properties : determine if the input tensors are matrix. If they are matrix, special memory
+ access is used to guarantee that always the memory access are coalesced.
+ *
+ * \tparam IsFinal : determine if this is the final kernel. If so, the result will be written in a final output.
+ Otherwise, the result of contraction will be written iin a temporary buffer. This is the case when Tall/Skinny
+ contraction is used. So in this case, a final reduction step is required to compute final output.
+
+ * \tparam contraction_tp: it is an enum value representing whether the local memory/no local memory implementation of
+ the algorithm to be used
+ *
+ * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group
+ *
+ * \param lhs: determines the left-hand-side flattened tensor (tensor mapper)
+ *
+ * \param rhs: determines the right-hand-side flattened tensor (tensor mapper)
+ *
+ * \param out_res: determines the output tensor containing the contraction result
+ *
+ * \param groupSizeM: a logical number determining the number of work-group for m dimension
+ *
+ * \param groupSizeN: a logical number determining the number of work-group for n dimension
+ *
+ * \param numTiles: determines total number of tiles on the k dimension
+ *
+ * \param TripleDim: determines the M, K, N dimensions for the flatten tensors in order to treat them as a matrix
+ */
+template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
+          typename RhsMapper, typename StorageIndex, typename Properties, typename TripleDim, bool Vectorizable,
+          typename input_mapper_properties, bool IsFinal, contraction_type contraction_tp>
+class TensorContractionKernel {
+ public:
+  typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
+      PacketReturnType;
+  static constexpr int PacketSize =
+      Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
+  static constexpr bool is_lhs_transposed =
+      !::Eigen::internal::TensorContractionInputMapperTrait<LhsMapper>::inner_dim_contiguous;
+  static constexpr bool is_rhs_transposed =
+      !::Eigen::internal::TensorContractionInputMapperTrait<RhsMapper>::inner_dim_contiguous;
+
+  typedef BlockProperties<is_lhs_transposed, false, input_mapper_properties::is_lhs_matrix && Vectorizable,
+                          PacketReturnType>
+      LHSBlockProperties;
+
+  typedef BlockProperties<is_rhs_transposed, true, input_mapper_properties::is_rhs_matrix && Vectorizable,
+                          PacketReturnType>
+      RHSBlockProperties;
+
+  static constexpr StorageIndex NStride =
+      contraction_tp == contraction_type::local ? Properties::WorkLoadPerThreadN : RHSBlockProperties::nc_stride;
+
+  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+  typedef cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::local_space> local_ptr;
+  typedef OutScalar * /*cl::sycl::multi_ptr<OutScalar, cl::sycl::access::address_space::private_space>*/ private_ptr;
+  typedef std::conditional_t<contraction_tp == contraction_type::local, local_ptr, private_ptr> tile_ptr;
+  static constexpr StorageIndex LSDL = contraction_tp == contraction_type::local
+                                           ? Properties::TileSizeDimM + Properties::BC
+                                           : Properties::WorkLoadPerThreadM;
+  static constexpr StorageIndex LSDR = contraction_tp == contraction_type::local
+                                           ? Properties::TileSizeDimN + Properties::BC
+                                           : Properties::WorkLoadPerThreadN;
+  static constexpr StorageIndex LocalOffset = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
+
+  /**
+   * \brief MemHolder this is a place holder struct for creating memory hierarchy in SYCL. Inside SYCL kernel it is not
+   * allowed to have dynamic memory allocation. While the local memory is created outside of the kernel and passed to
+   * the kernel as an accessor, the private memory can only allowed to be allocated statically. Since we are abstracting
+   * the TiledMemory for both local and private memory, the MemHolder structs is used as a helper to abstract out
+   * different type of memory needed when local/no_local memory computation is called.
+   *
+   * \tparam contraction_type: it is an enum value representing whether the local memory/no local memory implementation
+   of the algorithm to be used
+   * \tparam the private memory size
+   * \param ptr the tile memory pointer type
+   */
+  template <contraction_type, StorageIndex>
+  struct MemHolder {
+    tile_ptr ptr;
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MemHolder(local_ptr block_start_ptr) : ptr(block_start_ptr) {}
+  };
+  /**
+   * \brief specialization of memHolder class when no local memory kernel is used.
+   */
+  template <StorageIndex MemSize>
+  struct MemHolder<contraction_type::no_local, MemSize> {
+    OutScalar ptr[MemSize] = {OutScalar{0}};
+  };
+  /**
+   * \brief TiledMemory: contains required memory pointer for loading  each tile of the TensorContraction panel from
+   * global memory to local/private memory when local/no_local algorithm used.
+   *
+   * \param lhs_scratch_extract : determines the LHS tile memory. It is either private or local memory based on the
+   * selected contraction_type.
+   *
+   * \param rhs_scratch_extract : determines the RHS tile memory. It is either private or local memory based on the
+   * selected contraction_type.
+   *
+   * \param lhs_extract_index: determines the position of each thread on a local memory for lhs input. When private
+   * memory is used this is set to zero as this is not applicable in case of private memory.
+   *
+   * \param rhs_extract_index: determines the position of each thread on a local memory for rhs input. When private
+   * memory is used this is set to zero as this is not applicable in case of private memory.
+   *
+   * \param lhs_scratch_compute : determines the  location to load for computation for lhs_local memory. This is the
+   * same as lhs_scratch_extract for private memory.
+   *
+   * \param rhs_scratch_compute : determines the  location to load for computation for rhs_local memory. This is the
+   * same as rhs_scratch_extract for private memory.
+   */
+  struct TiledMemory {
+    MemHolder<contraction_tp, Properties::WorkLoadPerThreadM * Properties::TileSizeDimK> lhs_scratch_extract;
+    MemHolder<contraction_tp, Properties::WorkLoadPerThreadN * Properties::TileSizeDimK> rhs_scratch_extract;
+    tile_ptr lhs_scratch_ptr_compute;
+    tile_ptr rhs_scratch_ptr_compute;
+    const std::pair<StorageIndex, StorageIndex> lhs_extract_index;
+    const std::pair<StorageIndex, StorageIndex> rhs_extract_index;
+    template <contraction_type tp = contraction_tp>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TiledMemory(const ThreadProperties<StorageIndex> &, local_ptr,
+                                                      std::enable_if_t<tp == contraction_type::no_local> * = 0)
+        : lhs_scratch_extract{},
+          rhs_scratch_extract{},
+          lhs_scratch_ptr_compute(lhs_scratch_extract.ptr),
+          rhs_scratch_ptr_compute(rhs_scratch_extract.ptr),
+          lhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})),
+          rhs_extract_index(std::pair<StorageIndex, StorageIndex>(StorageIndex{0}, StorageIndex{0})) {}
+
+    template <contraction_type tp = contraction_tp>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TiledMemory(const ThreadProperties<StorageIndex> &thread_properties,
+                                                      local_ptr block_start_ptr,
+                                                      std::enable_if_t<tp == contraction_type::local> * = 0)
+        : lhs_scratch_extract{block_start_ptr},
+          rhs_scratch_extract{lhs_scratch_extract.ptr +
+                              ((Properties::DoubleBuffer + 1) * LSDL * Properties::TileSizeDimK)},
+          lhs_scratch_ptr_compute(lhs_scratch_extract.ptr + thread_properties.mLocalOffset),
+          rhs_scratch_ptr_compute(rhs_scratch_extract.ptr + thread_properties.nLocalOffset),
+          lhs_extract_index(
+              local_id_extract<LHSBlockProperties, Properties::TileSizeDimM>(thread_properties.linearLocalThreadId)),
+          rhs_extract_index(
+              local_id_extract<RHSBlockProperties, Properties::TileSizeDimN>(thread_properties.linearLocalThreadId)) {}
+  };
+
+  Scratch scratch;
+  const LhsMapper lhs;
+  const RhsMapper rhs;
+  OutAccessor out_res;
+  const StorageIndex groupSizeM;
+  const StorageIndex groupSizeN;
+  const StorageIndex numTiles;
+  const TripleDim triple_dim;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
+                                                                const RhsMapper rhs_, OutAccessor out_res_,
+                                                                const StorageIndex groupSizeM_,
+                                                                const StorageIndex groupSizeN_,
+                                                                const StorageIndex numTiles_,
+                                                                const TripleDim triple_dim_)
+      : scratch(scratch_),
+        lhs(lhs_),
+        rhs(rhs_),
+        out_res(out_res_),
+        groupSizeM(groupSizeM_),
+        groupSizeN(groupSizeN_),
+        numTiles(numTiles_),
+        triple_dim(triple_dim_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionKernel(Scratch scratch_, const LhsMapper lhs_,
+                                                                const RhsMapper rhs_, OutAccessor out_res_,
+                                                                const StorageIndex groupSizeM_,
+                                                                const StorageIndex numTiles_,
+                                                                const TripleDim triple_dim_)
+      : TensorContractionKernel(scratch_, lhs_, rhs_, out_res_, groupSizeM_, 1, numTiles_, triple_dim_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
+    const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
+    const StorageIndex nLocalThreadId = linearLocalThreadId / Properties::LocalThreadSizeM;
+    const StorageIndex mLocalThreadId = linearLocalThreadId % Properties::LocalThreadSizeM;
+    const StorageIndex mGroupId = itemID.get_group(0) % groupSizeM;
+    const StorageIndex tmp = itemID.get_group(0) / groupSizeM;
+    const StorageIndex nGroupId = IsFinal ? tmp : tmp % groupSizeN;
+    const StorageIndex kGroupId = IsFinal ? 0 : tmp / groupSizeN;
+    const StorageIndex mGroupOffset = mGroupId * Properties::TileSizeDimM;
+    const StorageIndex nGroupOffset = nGroupId * Properties::TileSizeDimN;
+    const StorageIndex mLocalOffset = PacketSize * mLocalThreadId;
+    const StorageIndex nLocalOffset = NStride * nLocalThreadId;
+    const StorageIndex mGlobalOffset = mGroupOffset + mLocalOffset;
+    const StorageIndex nGlobalOffset = nGroupOffset + nLocalOffset;
+
+    const StorageIndex kSizePerWG = IsFinal ? triple_dim.K : numTiles * Properties::TileSizeDimK;
+    StorageIndex kGroupOffset = kGroupId * kSizePerWG;
+    const bool is_internal = triple_dim.M - mGroupOffset >= Properties::TileSizeDimM &&
+                             triple_dim.N - nGroupOffset >= Properties::TileSizeDimN &&
+                             triple_dim.K - kGroupOffset >= kSizePerWG;
+    // this is used to adjust the last block
+    StorageIndex kSize = IsFinal ? triple_dim.K : std::min(kSizePerWG, triple_dim.K - kGroupOffset);
+    // This is used to find out the lats K offset so that kGroupOffset -kSize can compute the coffset for loading to
+    // tile
+    kGroupOffset += kSize;
+
+    auto thread_properties =
+        ThreadProperties<StorageIndex>(linearLocalThreadId, kGroupId, mGroupOffset, nGroupOffset, kGroupOffset,
+                                       mLocalOffset, nLocalOffset, mGlobalOffset, nGlobalOffset, kSize, is_internal);
+
+    auto out_ptr = out_res + (IsFinal ? 0 : thread_properties.kGroupId * triple_dim.M * triple_dim.N);
+
+    (thread_properties.is_internal) ? compute_panel<true>(itemID, thread_properties, out_ptr)
+                                    : compute_panel<false>(itemID, thread_properties, out_ptr);
+  }
+  // The compute block computes the contraction operation private block for each thread and store the resutl in the
+  // privateRes memory of Each computation the compute block function is independent of local and no local concepts as
+  // it only compute the block on each thread's private memory space
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_block_per_tile(OutScalar *lhs_block_ptr, OutScalar *rhs_block_ptr,
+                                                                    PacketReturnType *privateRes) const {
+    StorageIndex idx = 0;
+    constexpr StorageIndex lhs_stride =
+        contraction_tp == contraction_type::local ? (PacketSize * Properties::LocalThreadSizeM) : 1;
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN; wLPTN++) {
+      auto rhsPacket = PacketReturnType{*(rhs_block_ptr + wLPTN)};
+      StorageIndex lhs_index = 0;
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
+        PacketReturnType lhsPack{};
+        Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::set_packet(lhsPack,
+                                                                                             lhs_block_ptr + lhs_index);
+        privateRes[idx] = ::Eigen::internal::pmadd(lhsPack, rhsPacket, privateRes[idx]);
+
+        lhs_index += lhs_stride;
+        idx++;
+      }
+    }
+  }
+  // The store function write the computed contraction operation in the private memory of each thread to the global
+  // memory. The store function is independent of local and no local concepts s that it can be abstract out in the base
+  // class.
+  template <bool is_internal_block, StorageIndex PrivateNStride, typename OutPtr>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void store(OutPtr *out_ptr, PacketReturnType *privateRes,
+                                                   StorageIndex mGlobalOffset, StorageIndex nGlobalOffset) const {
+    auto chk_bound = [&](const StorageIndex &mIndex, const StorageIndex &nIndex) EIGEN_DEVICE_FUNC {
+      return (mIndex + PacketSize - 1 < triple_dim.M && nGlobalOffset + nIndex < triple_dim.N);
+    };
+    // when local memory is not used M and N are both accessed in a coalesced way. However, when local memory is
+    // available the k*N is transposed in the local to N*K therefore, each blocks operates on blockId*
+    // WorkLoadPerThreadN slice of N
+    constexpr StorageIndex GlobalNStride = contraction_tp == contraction_type::local ? 1 : Properties::LocalThreadSizeN;
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex wLPTN = 0; wLPTN < Properties::WorkLoadPerThreadN / PrivateNStride; wLPTN++) {
+      // output leading dimension
+      StorageIndex outputLD = 0;
+      // When local memory is used the PrivateNstride is always 1 because the coalesced access on N is loaded into Local
+      // memory and extracting from local to global is the same as no transposed version. However, when local memory is
+      // not used and RHS is transposed we packetize the load for RHS.
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex nId = 0; nId < PrivateNStride; nId++) {
+        StorageIndex globalRow = mGlobalOffset;
+        EIGEN_UNROLL_LOOP
+        for (StorageIndex wLPTM = 0; wLPTM < Properties::WorkLoadPerThreadM / PacketSize; wLPTM++) {
+          PacketReturnType privetOut = privateRes[wLPTM];
+          if (check_boundary<is_internal_block>(chk_bound(globalRow, nId))) {
+            // Store the final results in C. The C matrix has always M as a first StorageIndex and N as a second
+            // StorageIndex Therefore it is always coalesced layout
+            write<data_source::global_mem>(privetOut, out_ptr + outputLD + globalRow);
+          } else {
+            EIGEN_UNROLL_LOOP
+            for (StorageIndex mId = 0; mId < PacketSize; mId++) {
+              StorageIndex mOffset = globalRow + mId;
+              if (mOffset < triple_dim.M && (nGlobalOffset + nId < triple_dim.N)) {
+                out_ptr[mOffset + outputLD] =
+                    Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, PacketSize>::scalarize(mId, privetOut);
+              }
+            }
+          }
+          globalRow += (PacketSize * Properties::LocalThreadSizeM);
+        }
+        outputLD += triple_dim.M;
+        privateRes += Properties::WorkLoadPerThreadM / PacketSize;
+      }
+      out_ptr += (GlobalNStride * outputLD);
+
+      nGlobalOffset += (PrivateNStride * GlobalNStride);
+    }
+  }
+  // when no local memory is used the following extract_block will be enabled
+  template <typename InputBlockProperties, bool is_internal_block, typename Input, typename PrivateReg,
+            contraction_type contract_tp = contraction_tp>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<contract_tp == contraction_type::no_local> extract_block(
+      const Input &inpt, PrivateReg private_ptr, const std::pair<StorageIndex, StorageIndex> &,
+      const StorageIndex &ncOffset, const StorageIndex cOffset) const {
+    constexpr StorageIndex LocalThreadSizeNC =
+        InputBlockProperties::is_rhs ? Properties::LocalThreadSizeN : Properties::LocalThreadSizeM;
+    constexpr StorageIndex WorkLoadPerThreadNC =
+        InputBlockProperties::is_rhs ? Properties::WorkLoadPerThreadN : Properties::WorkLoadPerThreadM;
+    const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
+
+    auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
+      return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
+              (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
+    };
+    const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
+    StorageIndex cIndex = cOffset;
+
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex cId = 0; cId < Properties::TileSizeDimK / InputBlockProperties::c_stride; cId++) {
+      StorageIndex ncIndex = ncOffset;
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex ncId = 0; ncId < WorkLoadPerThreadNC / InputBlockProperties::nc_stride; ncId++) {
+        if (check_boundary<is_internal_block>(chk_bound(cIndex, ncIndex))) {
+          auto val =
+              read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
+                   InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, ncIndex, cIndex, ld);
+
+          write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
+                data_source::private_mem>(val, private_ptr);
+        } else {
+          EIGEN_UNROLL_LOOP
+          for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
+            const StorageIndex ncInd = ncIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
+            const StorageIndex cInd = cIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
+            OutScalar val =
+                (ncInd < NC && cInd < triple_dim.K)
+                    ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
+                          inpt, ncInd, cInd, ld)
+                    : OutScalar(0);
+            write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : WorkLoadPerThreadNC),
+                  data_source::private_mem>(
+                val, private_ptr + (InputBlockProperties::is_coalesced_layout ? i : 0) +
+                         ((InputBlockProperties::is_coalesced_layout ? 0 : i) * WorkLoadPerThreadNC));
+          }
+        }
+
+        // if it is lhs we have to load it packetised when the packet size is > 1, because the output is coalesced. So
+        // even if M is not accessed in a coalesced mode, we have to load packet_size number of m per thread.
+        ncIndex = (!InputBlockProperties::is_rhs && InputBlockProperties::nc_stride == 1 && PacketSize != 1)
+                      ? ncOffset + (ncId + 1) % PacketSize + ((ncId + 1) / PacketSize) * LocalThreadSizeNC
+                      : (ncIndex + InputBlockProperties::nc_stride * LocalThreadSizeNC);
+        private_ptr += InputBlockProperties::nc_stride;
+      }
+      // the previous for loop ( private_ptr += (ncId * nc_stride)) has already moved ptr with one WorkLoadPerThreadNC
+      private_ptr += (InputBlockProperties::c_stride - 1) * WorkLoadPerThreadNC;
+      cIndex += InputBlockProperties::c_stride;
+    }
+  }
+  template <typename InputBlockProperties, StorageIndex TileSizeDimNC>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::pair<StorageIndex, StorageIndex> local_id_extract(
+      const StorageIndex &linearLocalThreadId) {
+    const StorageIndex localThreadNC =
+        (InputBlockProperties::is_coalesced_layout)
+            ? linearLocalThreadId % (TileSizeDimNC / InputBlockProperties::nc_stride)
+            : linearLocalThreadId / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+    const StorageIndex localThreadC =
+        (InputBlockProperties::is_coalesced_layout)
+            ? linearLocalThreadId / (TileSizeDimNC / InputBlockProperties::nc_stride)
+            : linearLocalThreadId % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+    return std::pair<StorageIndex, StorageIndex>(localThreadNC, localThreadC);
+  }
+
+  template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<db && ctp == contraction_type::local> sync_mem(
+      const cl::sycl::nd_item<1> &, bool &db_offset) noexcept {
+    db_offset = !db_offset;
+  }
+
+  template <bool db = Properties::DoubleBuffer, contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!db && ctp == contraction_type::local> sync_mem(
+      const cl::sycl::nd_item<1> &itemID, bool &) noexcept {
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+  }
+
+  template <contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<ctp == contraction_type::no_local> sync_mem(
+      const cl::sycl::nd_item<1> &, bool &) noexcept {
+    return;
+  }
+
+  template <bool need_sync, contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<need_sync && ctp == contraction_type::no_local>
+  sync_thread(const cl::sycl::nd_item<1> &
+#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
+                  itemID
+#endif
+              ) noexcept {
+#ifdef EIGEN_SYCL_ARM_GPU_CACHE_OPTIMISATION
+    itemID.barrier(cl::sycl::access::fence_spacce::local_space);
+#else
+    return;
+#endif
+  }
+  template <bool need_sync, contraction_type ctp = contraction_tp>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<need_sync && ctp == contraction_type::local>
+  sync_thread(const cl::sycl::nd_item<1> &itemID) {
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+  }
+  template <bool need_sync>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!need_sync> sync_thread(const cl::sycl::nd_item<1> &) {
+    return;
+  }
+
+  template <bool is_internal_block>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_tile_per_panel(const cl::sycl::nd_item<1> &itemID,
+                                                                    ThreadProperties<StorageIndex> &thread_properties,
+                                                                    TiledMemory &tiled_input_block,
+                                                                    PacketReturnType *privateRes,
+                                                                    bool &db_offset) const {
+    // Tiling the Rhs block from global to local memory
+    extract_block<RHSBlockProperties, is_internal_block>(
+        rhs, tiled_input_block.rhs_scratch_extract.ptr + (db_offset * Properties::TileSizeDimK * LSDR),
+        tiled_input_block.rhs_extract_index,
+        contraction_tp == contraction_type::local ? thread_properties.nGroupOffset : thread_properties.nGlobalOffset,
+        thread_properties.kGroupOffset - thread_properties.kSize);
+
+    sync_thread<contraction_tp == contraction_type::no_local>(itemID);
+
+    // Tiling the Lhs block from global to local memory
+    extract_block<LHSBlockProperties, is_internal_block>(
+        lhs, tiled_input_block.lhs_scratch_extract.ptr + (db_offset * LSDL * Properties::TileSizeDimK),
+        tiled_input_block.lhs_extract_index,
+        contraction_tp == contraction_type::local ? thread_properties.mGroupOffset : thread_properties.mGlobalOffset,
+        thread_properties.kGroupOffset - thread_properties.kSize);
+
+    // itemID.barrier(cl::sycl::access::fence_space::local_space);
+    sync_thread<contraction_tp == contraction_type::local>(itemID);
+    // switch to compute mede
+    StorageIndex lhs_offset = (db_offset * LSDL * Properties::TileSizeDimK);
+    StorageIndex rhs_offset = (db_offset * Properties::TileSizeDimK * LSDR);
+    // Loop over the values of a single tile
+    for (StorageIndex k = 0; k < Properties::TileSizeDimK; k++) {
+      compute_block_per_tile(tiled_input_block.lhs_scratch_ptr_compute + lhs_offset,
+                             tiled_input_block.rhs_scratch_ptr_compute + rhs_offset, privateRes);
+      lhs_offset += LSDL;
+      rhs_offset += LSDR;
+    }
+    // computing the K index for the next tile
+    thread_properties.kSize -= Properties::TileSizeDimK;
+    sync_mem(itemID, db_offset);
+  }
+
+  // when local memory is available the following compute_panel will be enabled
+  template <bool is_internal_block, typename OutPtr>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(const cl::sycl::nd_item<1> &itemID,
+                                                           ThreadProperties<StorageIndex> &thread_properties,
+                                                           OutPtr out_ptr) const {
+    auto tiled_input_block = TiledMemory{thread_properties, scratch.get_pointer()};
+    // Allocate register space
+    PacketReturnType privateRes[Properties::WorkLoadPerThreadM * Properties::WorkLoadPerThreadN / PacketSize] = {
+        PacketReturnType{0}};
+    bool db_offset = 0;
+
+    while (thread_properties.kSize >= Properties::TileSizeDimK) {
+      compute_tile_per_panel<is_internal_block>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
+    }
+    if (thread_properties.kSize > 0) {
+      compute_tile_per_panel<false>(itemID, thread_properties, tiled_input_block, privateRes, db_offset);
+    }
+
+    // Storing the final results in the output
+    store<is_internal_block,
+          contraction_tp == contraction_type::local ? static_cast<StorageIndex>(1) : RHSBlockProperties::nc_stride>(
+        out_ptr + thread_properties.nGlobalOffset * triple_dim.M, privateRes, thread_properties.mGlobalOffset,
+        thread_properties.nGlobalOffset);
+  }
+  // When local memory is available the following extract_block will be enabled
+  template <typename InputBlockProperties, bool is_internal_block, typename Input, typename Local,
+            contraction_type contract_tp = contraction_tp>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<contract_tp == contraction_type::local> extract_block(
+      const Input &inpt, Local local_ptr, const std::pair<StorageIndex, StorageIndex> &local_index,
+      const StorageIndex &ncOffset, const StorageIndex cOffset) const {
+    constexpr StorageIndex TileSizeDimNC =
+        InputBlockProperties::is_rhs ? Properties::TileSizeDimN : Properties::TileSizeDimM;
+    constexpr StorageIndex LoadPerThread =
+        InputBlockProperties::is_rhs ? Properties::LoadPerThreadRhs : Properties::LoadPerThreadLhs;
+    constexpr StorageIndex LSD = InputBlockProperties::is_rhs ? LSDR : LSDL;
+    static_assert(((LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride) == 0) &&
+                   (LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride) == 0)),
+                  " LocalOffset must be divisible by stride");
+    const StorageIndex &NC = InputBlockProperties::is_rhs ? triple_dim.N : triple_dim.M;
+    StorageIndex localThreadNC = local_index.first;
+    StorageIndex localThreadC = local_index.second;
+    auto chk_bound = [&](const StorageIndex &CIndex, const StorageIndex &NCIndex) EIGEN_DEVICE_FUNC {
+      return ((CIndex + InputBlockProperties::c_stride - 1 < triple_dim.K) &&
+              (NCIndex + InputBlockProperties::nc_stride - 1 < NC));
+    };
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex lPT = 0; lPT < LoadPerThread / InputBlockProperties::elements_per_access; lPT++) {
+      const StorageIndex CIndex = cOffset + (InputBlockProperties::c_stride * localThreadC);
+      const StorageIndex NCIndex = ncOffset + (InputBlockProperties::nc_stride * localThreadNC);
+      const StorageIndex ld = InputBlockProperties::is_coalesced_layout ? NC : triple_dim.K;
+      if (check_boundary<is_internal_block>(chk_bound(CIndex, NCIndex))) {
+        auto val =
+            read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
+                 InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, NCIndex, CIndex, ld);
+        write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
+            val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
+                     (InputBlockProperties::c_stride * localThreadC * LSD));
+      } else {
+        EIGEN_UNROLL_LOOP
+        for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
+          const StorageIndex nCInd = NCIndex + (InputBlockProperties::is_coalesced_layout ? i : 0);
+          const StorageIndex cInd = CIndex + (InputBlockProperties::is_coalesced_layout ? 0 : i);
+          OutScalar val =
+              (nCInd < NC && cInd < triple_dim.K)
+                  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
+                        inpt, nCInd, cInd, ld)
+                  : OutScalar(0);
+
+          write<StorageIndex, (InputBlockProperties::is_coalesced_layout ? 1 : LSD), data_source::local_mem>(
+              val, local_ptr + (InputBlockProperties::nc_stride * localThreadNC) +
+                       (InputBlockProperties::is_coalesced_layout ? i : 0) +
+                       ((InputBlockProperties::c_stride * localThreadC +
+                         (InputBlockProperties::is_coalesced_layout ? 0 : i)) *
+                        LSD));
+        }
+      }
+      localThreadNC += (InputBlockProperties::is_coalesced_layout)
+                           ? LocalOffset % (TileSizeDimNC / InputBlockProperties::nc_stride)
+                           : LocalOffset / (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+      localThreadC += (InputBlockProperties::is_coalesced_layout)
+                          ? LocalOffset / (TileSizeDimNC / InputBlockProperties::nc_stride)
+                          : LocalOffset % (Properties::TileSizeDimK / InputBlockProperties::c_stride);
+    }
+  }
+};
+
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+
+/*!
+ * \brief GeneralVectorTensor is a template class that provides Tensor -vector contraction operation, which is a special
+ * case of Tensor Tensor contraction.
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification
+ * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam VectorMapper: determines the tensor contraction mapper for the vector input (can be lhs or rhs)
+ *
+ * \tparam TensorMapper: determines the tensor contraction mapper for the tensor input (can be lhs or rhs)
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Properties: determines the Contraction Panel properties
+ *
+ * \tparam KFactor: determines the number of elements in K dimension in a Tile
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \tparam is_lhs_vec: determines whether lhs is a vector or rhs is a vector
+ *
+ * \tparam IsFinal: determine if this is the final kernel. If so, the result will be written in a final output.
+ * Otherwise, the result of contraction will be written iin a temporary buffer.
+ *
+ * \param scratch: determines the local memory containing the vector block for each work-group
+ *
+ * \param vec: determines the vector input (tensor mapper)
+ *
+ * \param mat: determines the tensor input (tensor mapper)
+ *
+ * \param out_res: determines the output vector containing the contraction result
+ *
+ * \param nonContractGroupSize: a logical number determining the number of work-group for non-contracting dimension
+ *
+ * \param nonContractDim: determines the size of non contracting dimension for the flattened tensor
+ *
+ * \param contractDim: determines the size of non contracting dimension for the flattened tensor
+ *
+ */
+template <typename OutScalar, typename OutAccessor, typename VectorMapper, typename TensorMapper, typename StorageIndex,
+          typename Properties, StorageIndex KFactor, bool Vectorizable, bool is_lhs_vec, bool IsFinal>
+struct GeneralVectorTensor {
+  typedef typename Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketReturnType
+      PacketReturnType;
+  static constexpr int PacketSize =
+      Eigen::TensorSycl::internal::Vectorise<OutScalar, Eigen::SyclDevice, Vectorizable>::PacketSize;
+  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+
+  static constexpr StorageIndex OutScratchOffset =
+      KFactor * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
+
+  // Since the access layout for a vector can always be coalesced, when LHS is a vector, we pass false and false to make
+  // sure that the !^ is true When RHS is a vector, we pass true and true to make sure that the !^ is true.
+  typedef BlockProperties<is_lhs_vec ? false : true, is_lhs_vec ? false : true, Vectorizable, PacketReturnType>
+      VecBlockProperties;
+
+  Scratch scratch;
+  const VectorMapper vec;
+  const TensorMapper mat;
+  OutAccessor out_res;
+  const StorageIndex nonContractGroupSize;
+  const StorageIndex nonContractDim;
+  const StorageIndex contractDim;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE GeneralVectorTensor(Scratch scratch_, const VectorMapper vec_,
+                                                            const TensorMapper mat_, OutAccessor out_res_,
+                                                            const StorageIndex nonContractGroupSize_,
+                                                            const StorageIndex nonContractDim_,
+                                                            const StorageIndex contractDim_)
+      : scratch(scratch_),
+        vec(vec_),
+        mat(mat_),
+        out_res(out_res_),
+        nonContractGroupSize(nonContractGroupSize_),
+        nonContractDim(nonContractDim_),
+        contractDim(contractDim_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
+    auto scratch_ptr = scratch.get_pointer();
+    const StorageIndex linearLocalThreadId = itemID.get_local_id(0);
+    StorageIndex nonContractId = is_lhs_vec ? linearLocalThreadId / Properties::LocalThreadSizeC
+                                            : linearLocalThreadId % Properties::LocalThreadSizeNC;
+    StorageIndex contractId = is_lhs_vec ? linearLocalThreadId % Properties::LocalThreadSizeC
+                                         : linearLocalThreadId / Properties::LocalThreadSizeNC;
+    const StorageIndex cGroupSize = itemID.get_group_range(0) / nonContractGroupSize;
+    const StorageIndex nonContractGroupId =
+        is_lhs_vec ? itemID.get_group(0) / cGroupSize : itemID.get_group(0) % nonContractGroupSize;
+    const StorageIndex contractGroupId =
+        is_lhs_vec ? itemID.get_group(0) % cGroupSize : itemID.get_group(0) / nonContractGroupSize;
+    auto out_ptr = out_res + (IsFinal ? 0 : contractGroupId * nonContractDim);
+
+    const StorageIndex nonContractGroupOffset = nonContractGroupId * Properties::TileSizeDimNC;
+    const StorageIndex contractGroupOffset = contractGroupId * Properties::TileSizeDimC;
+    auto outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
+    const StorageIndex globalNonContractDimOffset = nonContractGroupOffset + nonContractId;
+    const StorageIndex globalContractDimOffset = contractGroupOffset + contractId;
+    auto local_output = scratch_ptr + OutScratchOffset;
+    const bool is_internal = nonContractDim - nonContractGroupOffset >= Properties::TileSizeDimNC &&
+                             contractDim - contractGroupOffset >= Properties::TileSizeDimC;
+    is_internal
+        ? compute_panel<true>(itemID, vec, mat, local_output, out_ptr,
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+                              scratch_ptr, contractGroupOffset,
+#endif
+                              nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
+                              nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex)
+        : compute_panel<false>(itemID, vec, mat, local_output, out_ptr,
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+                               scratch_ptr, contractGroupOffset,
+#endif
+                               nonContractGroupOffset, linearLocalThreadId, contractDim, nonContractDim, contractId,
+                               nonContractId, globalContractDimOffset, globalNonContractDimOffset, outScratchIndex);
+  }
+  template <bool is_internal_block, typename OutPtr>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_panel(
+      const cl::sycl::nd_item<1> &itemID, const VectorMapper &vec, const TensorMapper &mat, OutScalar *local_output,
+      OutPtr out_ptr,
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+      OutScalar *scratch_ptr, const StorageIndex contractGroupOffset,
+#endif
+      const StorageIndex nonContractGroupOffset, const StorageIndex linearLocalThreadId, StorageIndex contractDim,
+      StorageIndex nonContractDim, StorageIndex contractId, StorageIndex nonContractId,
+      StorageIndex globalContractDimOffset, StorageIndex globalNonContractDimOffset, StorageIndex outScratchIndex) {
+    OutScalar outScalar[Properties::WorkLoadPerThreadNC] = {OutScalar(0)};
+    // Reading the vector
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+    const StorageIndex vectorOffset = contractGroupOffset + linearLocalThreadId;
+    extract_block<VecBlockProperties, is_internal_block, KFactor,
+                  Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC>(vec, scratch_ptr, linearLocalThreadId,
+                                                                                vectorOffset, contractDim);
+
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+    auto in_scratch_ptr = scratch_ptr + contractId;
+#endif
+
+    StorageIndex privateOffsetC = 0;
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex i = 0; i < Properties::WorkLoadPerThreadC; i++) {
+      StorageIndex privateOffsetNC = 0;
+      bool contract_conds = ((globalContractDimOffset + privateOffsetC) < contractDim);
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+      auto vecScalar = *in_scratch_ptr;
+#else
+      auto vecScalar = (check_boundary<is_internal_block>(contract_conds))
+                           ? vec(is_lhs_vec ? StorageIndex(0) : globalContractDimOffset + privateOffsetC,
+                                 is_lhs_vec ? globalContractDimOffset + privateOffsetC : StorageIndex(0))
+                           : OutScalar(0);
+#endif
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+        auto matScalar = (check_boundary<is_internal_block>(
+                             contract_conds && ((globalNonContractDimOffset + privateOffsetNC) < nonContractDim)))
+                             ? mat(is_lhs_vec ? globalContractDimOffset + privateOffsetC
+                                              : globalNonContractDimOffset + privateOffsetNC,
+                                   is_lhs_vec ? globalNonContractDimOffset + privateOffsetNC
+                                              : globalContractDimOffset + privateOffsetC)
+                             : OutScalar(0);
+
+        outScalar[j] = ::Eigen::internal::pmadd(matScalar, vecScalar, outScalar[j]);
+        privateOffsetNC += Properties::LocalThreadSizeNC;
+      }
+      privateOffsetC += Properties::LocalThreadSizeC;
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+      in_scratch_ptr += Properties::LocalThreadSizeC;
+#endif
+    }
+
+    auto out_scratch_ptr = local_output + outScratchIndex;
+    // Each block of 16*16 element in shared memory should reduce to 16*1
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+      *out_scratch_ptr = outScalar[j];
+
+      out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+    }
+    if (is_lhs_vec) {
+      nonContractId = linearLocalThreadId % Properties::LocalThreadSizeNC;
+      contractId = linearLocalThreadId / Properties::LocalThreadSizeNC;
+      outScratchIndex = nonContractId + contractId * Properties::LocalThreadSizeNC;
+    }
+
+    out_scratch_ptr = local_output + outScratchIndex;
+    EIGEN_UNROLL_LOOP
+    for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex offset = Properties::LocalThreadSizeC >> 1; offset > 0; offset >>= 1) {
+        itemID.barrier(cl::sycl::access::fence_space::local_space);
+        if (contractId < offset) {
+          StorageIndex myNeigbourId = (Properties::LocalThreadSizeNC * offset);
+          *out_scratch_ptr += out_scratch_ptr[myNeigbourId];
+        }
+      }
+      // moving to next 16 by 16 block
+      out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+    }
+
+    if (contractId == 0) {
+      out_scratch_ptr = local_output + nonContractId;
+      StorageIndex global_final_offset = nonContractGroupOffset + nonContractId;
+      out_ptr += global_final_offset;
+      EIGEN_UNROLL_LOOP
+      for (StorageIndex j = 0; j < Properties::WorkLoadPerThreadNC; j++) {
+        if (check_boundary<is_internal_block>(global_final_offset < nonContractDim)) {
+          auto res = *out_scratch_ptr;
+
+          *out_ptr = res;
+          out_ptr += Properties::LocalThreadSizeNC;
+        }
+        // moving to next 16 by 16 block to ge the next 16 reduced elements
+        out_scratch_ptr += (Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC);
+        if (!(is_internal_block)) global_final_offset += Properties::LocalThreadSizeNC;
+      }
+    }
+  }
+
+  template <typename InputBlockProperties, bool is_internal_block, int CFactor, int GroupSize, typename Input,
+            typename Local>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_block(const Input &inpt, Local *local_ptr,
+                                                                  const StorageIndex &linearLocalThreadId,
+                                                                  const StorageIndex &cOffset, const StorageIndex &C) {
+    local_ptr += InputBlockProperties::c_stride * linearLocalThreadId;
+    StorageIndex cIndex = cOffset;
+    for (StorageIndex cId = 0; cId < CFactor / InputBlockProperties::c_stride; cId++) {
+      if (check_boundary<is_internal_block>(cIndex + InputBlockProperties::c_stride - 1 < C)) {
+        auto val = read<InputBlockProperties::packet_load, InputBlockProperties::is_coalesced_layout,
+                        InputBlockProperties::is_rhs, typename InputBlockProperties::OutType>(inpt, StorageIndex(0),
+                                                                                              cIndex, StorageIndex(1));
+        write<StorageIndex, 1, data_source::local_mem>(val, local_ptr);
+      } else {
+        EIGEN_UNROLL_LOOP
+        for (StorageIndex i = 0; i < InputBlockProperties::elements_per_access; i++) {
+          OutScalar val =
+              (cIndex + i < C)
+                  ? read<false, InputBlockProperties::is_coalesced_layout, InputBlockProperties::is_rhs, OutScalar>(
+                        inpt, StorageIndex(0), cIndex + i, StorageIndex(1))
+                  : OutScalar(0);
+          write<StorageIndex, 1, data_source::local_mem>(val, local_ptr + i);
+        }
+      }
+      local_ptr += InputBlockProperties::c_stride * GroupSize;
+      cIndex += InputBlockProperties::c_stride * GroupSize;
+    }
+  }
+};
+#endif
+
+#ifndef EIGEN_SYCL_DISABLE_SCALAR
+
+/*!
+ * \brief GeneralScalarContraction is a template class that provides the scalar value of Tensor -Tensor contraction
+ * operation, when all the dimensions are contracting dimensions. This Kernel reduces two tensors to an scalar
+ *
+ * \tparam OutScalar: determines the output scalar type
+ *
+ * \tparam LhsScalar: determines the left-hand-side scalar type
+ *
+ * \tparam RhsScalar: determines the right-hand-side scalar type
+ *
+ * \tparam OutAccessor: determines the sycl accessor type for out put (please see the sycl-1.2.1 specification
+ * (https://www.khronos.org/registry/SYCL/specs/sycl-1.2.1.pdf) for accessor definition)
+ *
+ * \tparam LhsMapper: determines the tensor contraction mapper type for left-hand-side matrix
+ *
+ * \tparam RhsMapper: determines the tensor contraction mapper type for right-hand-side matrix
+ *
+ * \tparam StorageIndex: determines the StorageIndex Type
+ *
+ * \tparam Vectorizable: determines whether or not the vectorization is enabled for the Eigen expression.
+ *
+ * \param scratch: local memory containing tiles of LHS and RHS tensors for each work-group
+ *
+ * \param lhs: determines the left-hand-side flattened tensor (tensor mapper)
+ *
+ * \param rhs: determines the right-hand-side flattened tensor (tensor mapper)
+ *
+ * \param out_res: determines the output tensor containing the contraction result
+ *
+ * \param rng: determines the total input data size
+ */
+template <typename OutScalar, typename LhsScalar, typename RhsScalar, typename OutAccessor, typename LhsMapper,
+          typename RhsMapper, typename StorageIndex, bool Vectorizable>
+struct GeneralScalarContraction {
+  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local> Scratch;
+  Scratch scratch;
+  const LhsMapper lhs;
+  const RhsMapper rhs;
+  OutAccessor out_res;
+  const StorageIndex rng;
+
+  EIGEN_DEVICE_FUNC GeneralScalarContraction(Scratch scratch_, const LhsMapper lhs_, const RhsMapper rhs_,
+                                             OutAccessor out_res_, const StorageIndex rng_)
+      : scratch(scratch_), lhs(lhs_), rhs(rhs_), out_res(out_res_), rng(rng_) {}
+
+  EIGEN_DEVICE_FUNC void operator()(cl::sycl::nd_item<1> itemID) const {
+    auto out_ptr = out_res;
+    OutScalar *scratch_ptr = scratch.get_pointer();
+
+    StorageIndex globalid = itemID.get_global_id(0);
+    StorageIndex localid = itemID.get_local_id(0);
+    OutScalar accumulator = OutScalar(0);
+    for (StorageIndex i = globalid; i < rng; i += itemID.get_global_range(0)) {
+      accumulator = Eigen::internal::pmadd(lhs(0, i), rhs(i, 0), accumulator);
+    }
+    auto out_scratch_ptr = scratch_ptr + localid;
+    *out_scratch_ptr = accumulator;
+    for (StorageIndex offset = itemID.get_local_range(0) >> 1; offset > 0; offset >>= 1) {
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      if (localid < offset) {
+        *out_scratch_ptr = (accumulator += out_scratch_ptr[offset]);
+      }
+    }
+    if (localid == 0) {
+      out_ptr[itemID.get_group(0)] = accumulator;
+    }
+  }
+};
+#endif
+
+}  // namespace internal
+}  // namespace TensorSycl
+
+template <typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>,
+                       Eigen::SyclDevice>
+    : public TensorContractionEvaluatorBase<TensorEvaluator<
+          const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Eigen::SyclDevice>> {
+  static_assert(std::is_same<OutputKernelType, const NoOpOutputKernel>::value,
+                "SYCL tensor contraction does not support output kernels.");
+
+  typedef Eigen::SyclDevice Device;
+
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
+  typedef TensorContractionEvaluatorBase<Self> Base;
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+  typedef std::remove_const_t<typename XprType::Scalar> Scalar;
+  typedef typename XprType::Index StorageIndex;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef typename Base::Storage Storage;
+  typedef typename Base::EvaluatorPointerType EvaluatorPointerType;
+  struct TripleDim {
+    const StorageIndex M;
+    const StorageIndex N;
+    const StorageIndex K;
+    TripleDim(const StorageIndex M_, const StorageIndex N_, const StorageIndex K_) : M(M_), N(N_), K(K_) {}
+  };
+  enum {
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess = false,
+  };
+
+  static constexpr int Layout = TensorEvaluator<LeftArgType, Device>::Layout;
+  static constexpr int LDims = Base::LDims;
+  static constexpr int RDims = Base::RDims;
+  static constexpr int ContractDims = Base::ContractDims;
+
+  typedef array<StorageIndex, LDims> left_dim_mapper_t;
+  typedef array<StorageIndex, RDims> right_dim_mapper_t;
+
+  typedef array<StorageIndex, ContractDims> contract_t;
+  typedef array<StorageIndex, LDims - ContractDims> left_nocontract_t;
+  typedef array<StorageIndex, RDims - ContractDims> right_nocontract_t;
+
+  static constexpr int NumDims = LDims + RDims - 2 * ContractDims;
+
+  typedef DSizes<StorageIndex, NumDims> Dimensions;
+
+  typedef TensorEvaluator<typename Base::EvalLeftArgType, Device> LeftEvaluator;
+  typedef TensorEvaluator<typename Base::EvalRightArgType, Device> RightEvaluator;
+  typedef std::remove_const_t<typename LeftEvaluator::CoeffReturnType> LhsScalar;
+  typedef std::remove_const_t<typename RightEvaluator::CoeffReturnType> RhsScalar;
+
+  typedef typename LeftEvaluator::Dimensions LeftDimensions;
+  typedef typename RightEvaluator::Dimensions RightDimensions;
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered>
+  struct input_mapper_propertis {
+    static constexpr bool is_lhs_matrix = (LDims == 2 && ContractDims == 1) || lhs_inner_dim_contiguous;
+    static constexpr bool is_rhs_matrix =
+        (RDims == 2 && ContractDims == 1) || (rhs_inner_dim_contiguous && !rhs_inner_dim_reordered);
+  };
+
+  TensorEvaluator(const XprType &op, const Device &device) : Base(op, device) {}
+
+  // We need to redefine this method to make nvcc happy
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(typename Base::EvaluatorPointerType data) {
+    this->m_leftImpl.evalSubExprsIfNeeded(NULL);
+    this->m_rightImpl.evalSubExprsIfNeeded(NULL);
+    if (!data) {
+      this->m_result = this->m_device.get(
+          static_cast<Scalar *>(this->m_device.allocate_temp(this->dimensions().TotalSize() * sizeof(Scalar))));
+      data = this->m_result;
+    }
+    evalToSycl(data);
+    return (this->m_result != NULL);
+  }
+  const Eigen::SyclDevice &device() const { return this->m_device; }
+  void evalToSycl(typename Base::EvaluatorPointerType buffer) const {
+    if (this->m_lhs_inner_dim_contiguous) {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, true, true, Unaligned>(buffer);
+        } else {
+          evalTyped<true, true, false, Unaligned>(buffer);
+        }
+      } else {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<true, false, true, Unaligned>(buffer);
+        } else {
+          evalTyped<true, false, false, Unaligned>(buffer);
+        }
+      }
+    } else {
+      if (this->m_rhs_inner_dim_contiguous) {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, true, true, Unaligned>(buffer);
+        } else {
+          evalTyped<false, true, false, Unaligned>(buffer);
+        }
+      } else {
+        if (this->m_rhs_inner_dim_reordered) {
+          evalTyped<false, false, true, Unaligned>(buffer);
+        } else {
+          evalTyped<false, false, false, Unaligned>(buffer);
+        }
+      }
+    }
+  }
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  void evalTyped(typename Base::EvaluatorPointerType buffer) const {
+    const auto triple_dim = TripleDim{this->m_i_size, this->m_j_size, this->m_k_size};
+    typedef internal::TensorContractionInputMapper<
+        LhsScalar, StorageIndex, internal::Lhs, LeftEvaluator, left_nocontract_t, contract_t,
+        PacketType<CoeffReturnType, Device>::size, lhs_inner_dim_contiguous, false, Unaligned, MakePointer>
+        LhsMapper;
+
+    typedef internal::TensorContractionInputMapper<RhsScalar, StorageIndex, internal::Rhs, RightEvaluator,
+                                                   right_nocontract_t, contract_t,
+                                                   PacketType<CoeffReturnType, Device>::size, rhs_inner_dim_contiguous,
+                                                   rhs_inner_dim_reordered, Unaligned, MakePointer>
+        RhsMapper;
+
+    // initialize data mappers
+    LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, this->m_i_strides,
+                  this->m_left_contracting_strides, this->m_k_strides);
+
+    RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, this->m_j_strides,
+                  this->m_right_contracting_strides, this->m_k_strides);
+
+#ifndef EIGEN_SYCL_DISABLE_SCALAR
+    if (triple_dim.M == 1 && triple_dim.N == 1) {
+      launchSC(buffer, lhs, rhs, triple_dim.K);
+    } else
+#endif
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+        if (triple_dim.M != 1 && triple_dim.N == 1) {
+      LaunchVT<false>(buffer, rhs, lhs, triple_dim.M, triple_dim.K);
+    } else if (triple_dim.M == 1 && triple_dim.N != 1) {
+      LaunchVT<true>(buffer, lhs, rhs, triple_dim.N, triple_dim.K);
+    } else  // This is equivalent of if (m!=1 && n!=1)
+#endif
+    {
+      typedef input_mapper_propertis<lhs_inner_dim_contiguous, rhs_inner_dim_contiguous, rhs_inner_dim_reordered>
+          inpt_mapper_properties;
+#ifndef EIGEN_SYCL_DISABLE_SKINNY
+      bool skinny = false;
+      auto platform_name = this->device().getPlatformName();
+      // This is based on empirical calculation for AMD r9-nano and Fiji
+      if (platform_name.find("AMD") == 0) {
+        skinny = (triple_dim.M < triple_dim.K || triple_dim.N < triple_dim.K) &&
+                 ((triple_dim.M < 1024 && triple_dim.N < 1024) ||
+                  (uint64_t(triple_dim.M * triple_dim.N) < uint64_t(triple_dim.K)));
+      } else {
+        skinny = (((std::max(triple_dim.K, triple_dim.N) / std::min(triple_dim.K, triple_dim.N)) > 100) ||
+                  ((std::max(triple_dim.K, triple_dim.M) / std::min(triple_dim.K, triple_dim.M)) > 100) ||
+                  ((std::max(triple_dim.N, triple_dim.M) / std::min(triple_dim.N, triple_dim.M)) > 100));
+      }
+      if (skinny)
+        adjustTT<true, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
+      else
+#endif  // EIGEN_SYCL_DISABLE_SKINNY
+        adjustTT<false, inpt_mapper_properties>(buffer, lhs, rhs, triple_dim);
+    }
+  }
+
+  template <bool skinny, typename input_mapper_properties, typename LhsMapper, typename RhsMapper>
+  void EIGEN_ALWAYS_INLINE adjustTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
+                                    const TripleDim &triple_dim) const {
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON
+    if (device().has_local_memory()) {
+      typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 16> PanelParameters;
+      launchTT<TensorSycl::internal::contraction_type::local, skinny, input_mapper_properties, PanelParameters>(
+          buffer, lhs, rhs, triple_dim);
+    }
+#endif
+#ifdef EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF
+    if (!(device().has_local_memory())) {
+      typedef TensorSycl::internal::TTPanelSize<CoeffReturnType, StorageIndex, 4, 4, 4> PanelParameters;
+      launchTT<TensorSycl::internal::contraction_type::no_local, skinny, input_mapper_properties, PanelParameters>(
+          buffer, lhs, rhs, triple_dim);
+    }
+#endif
+  }
+
+  template <TensorSycl::internal::contraction_type ct, bool skinny, typename input_mapper_properties,
+            typename Properties, typename LhsMapper, typename RhsMapper>
+  void launchTT(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
+                const TripleDim &triple_dim) const {
+    const StorageIndex roundUpM = Eigen::TensorSycl::internal::roundUp(triple_dim.M, Properties::TileSizeDimM);
+    const StorageIndex roundUpN = Eigen::TensorSycl::internal::roundUp(triple_dim.N, Properties::TileSizeDimN);
+    const StorageIndex groupSizeM = roundUpM / Properties::TileSizeDimM;
+    const StorageIndex groupSizeN = roundUpN / Properties::TileSizeDimN;
+
+    const StorageIndex roundUpK = Eigen::TensorSycl::internal::roundUp(triple_dim.K, Properties::TileSizeDimK);
+    StorageIndex totalTilesK = roundUpK / Properties::TileSizeDimK;
+    StorageIndex groupSizeK =
+        skinny
+            ? std::max(std::min(totalTilesK,
+                                (StorageIndex)(device().getPowerOfTwo(device().getNumSyclMultiProcessors(), true) * 4) /
+                                    (groupSizeM * groupSizeN)),
+                       StorageIndex(1))
+            : StorageIndex(1);
+
+    const StorageIndex numTilesPerGroup = Eigen::TensorSycl::internal::roundUp(totalTilesK, groupSizeK) / groupSizeK;
+
+    const StorageIndex totalGroupSize = groupSizeM * groupSizeN * groupSizeK;
+
+    const StorageIndex localRange = Properties::LocalThreadSizeM * Properties::LocalThreadSizeN;
+    const StorageIndex globalRange = totalGroupSize * localRange;
+
+    const StorageIndex scratchSize = (ct == TensorSycl::internal::contraction_type::local)
+                                         ? ((Properties::DoubleBuffer + 1) *
+                                            (Properties::TileSizeDimM + Properties::BC) * (Properties::TileSizeDimK)) +
+                                               ((Properties::DoubleBuffer + 1) * (Properties::TileSizeDimK) *
+                                                (Properties::TileSizeDimN + Properties::BC))
+                                         : StorageIndex(1);
+
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+    if (groupSizeK == 1) {
+      typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
+                                                            LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
+                                                            PacketAccess, input_mapper_properties, true, ct>
+          ContractKernelName;
+      device()
+          .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+              lhs, rhs, buffer, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup, triple_dim)
+          .wait();
+    } else {
+      typedef TensorSycl::internal::TensorContractionKernel<CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType,
+                                                            LhsMapper, RhsMapper, StorageIndex, Properties, TripleDim,
+                                                            PacketAccess, input_mapper_properties, false, ct>
+          ContractKernelName;
+      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
+          device().allocate_temp(triple_dim.M * triple_dim.N * groupSizeK * sizeof(CoeffReturnType)));
+      EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
+
+      device()
+          .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+              lhs, rhs, tmp_global_accessor, thread_range, scratchSize, groupSizeM, groupSizeN, numTilesPerGroup,
+              triple_dim)
+          .wait();
+
+      typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+      auto op = Op();
+      typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
+                                                               EvaluatorPointerType, Op>
+          ReductionKernel;
+
+      device()
+          .template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
+              tmp_global_accessor, buffer,
+              cl::sycl::nd_range<1>(cl::sycl::range<1>(StorageIndex(
+                                        Eigen::TensorSycl::internal::roundUp(triple_dim.M * triple_dim.N, localRange))),
+                                    cl::sycl::range<1>(localRange)),
+              StorageIndex(1), op, StorageIndex(triple_dim.M * triple_dim.N), groupSizeK)
+          .wait();
+      device().deallocate_temp(temp_pointer);
+    }
+  }
+
+#ifndef EIGEN_SYCL_DISABLE_GEMV
+  template <bool is_lhs_vec, typename VectorMapper, typename TensorMapper, typename StorageIndex>
+  void EIGEN_ALWAYS_INLINE LaunchVT(EvaluatorPointerType buffer, const VectorMapper &vec, const TensorMapper &mat,
+                                    StorageIndex NC, StorageIndex C) const {
+    const StorageIndex nonContractDim = NC;
+    constexpr StorageIndex NCFactor = 1;
+    constexpr StorageIndex CFactor = 1;
+    constexpr StorageIndex NCWindow = 16;
+    typedef Eigen::TensorSycl::internal::TVPanelSize<CoeffReturnType, StorageIndex, NCWindow, CFactor, NCFactor>
+        Properties;
+    const StorageIndex roundUpC = Eigen::TensorSycl::internal::roundUp(C, Properties::TileSizeDimC);
+    const StorageIndex cNumGroups = roundUpC / (Properties::LocalThreadSizeC * Properties::WorkLoadPerThreadC);
+    const StorageIndex roundUpNC = Eigen::TensorSycl::internal::roundUp(nonContractDim, Properties::TileSizeDimNC);
+    const StorageIndex nCNumGroups = roundUpNC / (Properties::LocalThreadSizeNC * Properties::WorkLoadPerThreadNC);
+    const StorageIndex globalRange =
+        (roundUpNC / (Properties::WorkLoadPerThreadNC)) * (roundUpC / (Properties::WorkLoadPerThreadC));
+    const StorageIndex localRange = Properties::LocalThreadSizeNC * Properties::LocalThreadSizeC;
+    const StorageIndex scratchSize =
+        (Properties::WorkLoadPerThreadNC + CFactor) * Properties::LocalThreadSizeC * Properties::LocalThreadSizeNC;
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+    if (cNumGroups > 1) {
+      typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
+                                                               TensorMapper, StorageIndex, Properties, CFactor, false,
+                                                               is_lhs_vec, false>
+          ContractKernelName;
+      CoeffReturnType *temp_pointer =
+          static_cast<CoeffReturnType *>(device().allocate_temp(nonContractDim * cNumGroups * sizeof(CoeffReturnType)));
+      EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
+
+      device()
+          .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+              vec, mat, tmp_global_accessor, thread_range, scratchSize, nCNumGroups, nonContractDim, C)
+          .wait();
+
+      typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+      typedef TensorSycl::internal::SecondStepPartialReduction<CoeffReturnType, StorageIndex, EvaluatorPointerType,
+                                                               EvaluatorPointerType, Op>
+          ReductionKernel;
+
+      device()
+          .template unary_kernel_launcher<CoeffReturnType, ReductionKernel>(
+              tmp_global_accessor, buffer,
+              cl::sycl::nd_range<1>(
+                  cl::sycl::range<1>(Eigen::TensorSycl::internal::roundUp(nonContractDim, localRange)),
+                  cl::sycl::range<1>(localRange)),
+              StorageIndex(1), Op(), nonContractDim, cNumGroups)
+          .wait();
+      device().deallocate_temp(temp_pointer);
+    } else {
+      typedef Eigen::TensorSycl::internal::GeneralVectorTensor<CoeffReturnType, EvaluatorPointerType, VectorMapper,
+                                                               TensorMapper, StorageIndex, Properties, CFactor, false,
+                                                               is_lhs_vec, true>
+          ContractKernelName;
+      device()
+          .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(
+              vec, mat, buffer, thread_range, scratchSize, nCNumGroups, nonContractDim, C)
+          .wait();
+    }
+  }
+#endif
+
+#ifndef EIGEN_SYCL_DISABLE_SCALAR
+  template <typename LhsMapper, typename RhsMapper>
+  EIGEN_ALWAYS_INLINE void launchSC(EvaluatorPointerType buffer, const LhsMapper &lhs, const RhsMapper &rhs,
+                                    StorageIndex K) const {
+    EIGEN_STATIC_ASSERT(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
+                          (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
+                        "The Local thread size must be a power of 2 for the reduction "
+                        "operation");
+    constexpr StorageIndex local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
+
+    // Here we force the code not to be more than 2-step reduction: Our empirical research shows that if each thread
+    // reduces at least 512 elementss individually, we get better performance.
+    const StorageIndex num_work_group = ((K + (512 * local_range - 1)) / (512 * local_range) > 1 ? local_range : 1);
+    const StorageIndex global_range = num_work_group * local_range;
+
+    typedef Eigen::TensorSycl::internal::GeneralScalarContraction<
+        CoeffReturnType, LhsScalar, RhsScalar, EvaluatorPointerType, LhsMapper, RhsMapper, StorageIndex, false>
+        ContractKernelName;
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+    if (num_work_group > 1) {
+      CoeffReturnType *temp_pointer =
+          static_cast<CoeffReturnType *>(device().allocate_temp(num_work_group * sizeof(CoeffReturnType)));
+      EvaluatorPointerType tmp_global_accessor = device().get(temp_pointer);
+      device()
+          .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, tmp_global_accessor,
+                                                                                thread_range, local_range, K)
+          .wait();
+      typedef Eigen::internal::SumReducer<CoeffReturnType> Op;
+      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
+                                                          EvaluatorPointerType, StorageIndex, local_range>
+          GenericRKernel;
+      device()
+          .template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
+              tmp_global_accessor, buffer,
+              cl::sycl::nd_range<1>(cl::sycl::range<1>(local_range), cl::sycl::range<1>(local_range)), local_range,
+              Op())
+          .wait();
+      device().deallocate_temp(temp_pointer);
+    } else {
+      device()
+          .template binary_kernel_launcher<CoeffReturnType, ContractKernelName>(lhs, rhs, buffer, thread_range,
+                                                                                local_range, K)
+          .wait();
+    }
+  }
+#endif
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    this->m_leftImpl.cleanup();
+    this->m_rightImpl.cleanup();
+
+    if (this->m_result) {
+      this->m_device.deallocate_temp(this->m_result);
+      this->m_result = NULL;
+    }
+  }
+};
+}  // namespace Eigen
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_SYCL_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
new file mode 100644
index 00000000..99e7304d
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -0,0 +1,1552 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
+
+// evaluator for thread pool device
+#ifdef EIGEN_USE_THREADS
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+template <typename Indices, typename LeftArgType, typename RightArgType, typename OutputKernelType>
+struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>,
+                       ThreadPoolDevice>
+    : public TensorContractionEvaluatorBase<TensorEvaluator<
+          const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, ThreadPoolDevice>> {
+  typedef ThreadPoolDevice Device;
+
+  typedef TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType>, Device> Self;
+  typedef TensorContractionEvaluatorBase<Self> Base;
+
+  typedef TensorContractionOp<Indices, LeftArgType, RightArgType, OutputKernelType> XprType;
+  typedef std::remove_const_t<typename XprType::Scalar> Scalar;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+  static constexpr int Layout = TensorEvaluator<LeftArgType, Device>::Layout;
+
+  // Most of the code is assuming that both input tensors are ColMajor. If the
+  // inputs are RowMajor, we will "cheat" by swapping the LHS and RHS:
+  // If we want to compute A * B = C, where A is LHS and B is RHS, the code
+  // will pretend B is LHS and A is RHS.
+  typedef std::conditional_t<static_cast<int>(Layout) == static_cast<int>(ColMajor), LeftArgType, RightArgType>
+      EvalLeftArgType;
+  typedef std::conditional_t<static_cast<int>(Layout) == static_cast<int>(ColMajor), RightArgType, LeftArgType>
+      EvalRightArgType;
+
+  static constexpr int LDims =
+      internal::array_size<typename TensorEvaluator<EvalLeftArgType, Device>::Dimensions>::value;
+  static constexpr int RDims =
+      internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value;
+  static constexpr int ContractDims = internal::array_size<Indices>::value;
+
+  typedef array<Index, LDims> left_dim_mapper_t;
+  typedef array<Index, RDims> right_dim_mapper_t;
+
+  typedef array<Index, ContractDims> contract_t;
+  typedef array<Index, LDims - ContractDims> left_nocontract_t;
+  typedef array<Index, RDims - ContractDims> right_nocontract_t;
+
+  static constexpr int NumDims = LDims + RDims - 2 * ContractDims;
+
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  // typedefs needed in evalTo
+  typedef std::remove_const_t<typename EvalLeftArgType::Scalar> LhsScalar;
+  typedef std::remove_const_t<typename EvalRightArgType::Scalar> RhsScalar;
+  typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits;
+
+  typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator;
+  typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator;
+
+  TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {}
+
+  template <int Alignment>
+  void evalProduct(Scalar* buffer) const {
+    evalProductImpl<NoCallback, Alignment>(buffer, NoCallback());
+  }
+
+  template <typename EvalToCallback, int Alignment>
+  void evalProductAsync(Scalar* buffer, EvalToCallback done) const {
+    evalProductImpl<EvalToCallback, Alignment>(buffer, std::move(done));
+  }
+
+  template <typename DoneCallback, int Alignment>
+  void evalProductImpl(Scalar* buffer, DoneCallback done) const {
+    // This function computes a lot of heuristics in multiple steps, and it
+    // also has multiple exit points. To keep it sane, readable and all in one
+    // place, sync/async execution decision is made at runtime at the very end.
+    //
+    // (1) In sync mode we allocate Context on the stack, submit computations
+    //     to the device thread pool, and block on a barrier until it is
+    //     completed.
+    //
+    // (2) In async mode we allocate Context on the heap, and after all tasks
+    //     are finished, we call provided the done callback, and delete a
+    //     context from the heap.
+    //
+    // (*) EvalParallelContext & EvalShardedByInnerDimContext owns all the state
+    // and temporary buffers, required for executing the tensor contraction.
+    // They are responsible for cleaning it up after contraction is done.
+    static const bool IsEvalInSyncMode = std::is_same<DoneCallback, NoCallback>::value;
+
+    const Index m = this->m_i_size;
+    const Index n = this->m_j_size;
+    const Index k = this->m_k_size;
+    if (m == 0 || n == 0 || k == 0) return;
+
+    // Compute a set of algorithm parameters:
+    // - kernel block sizes (bm, bn, bk)
+    // - task grain sizes (number of kernels executed per task: gm, gn)
+    // - number of threads
+    // - sharding by row/column
+    // - parallel packing or first lhs then rhs
+    // and some derived parameters:
+    // - number of tasks (nm, nn, nk)
+    // - number of kernels (nm0, nn0)
+    // Unfortunately, all these parameters are tightly interdependent.
+    // So in some cases we first compute approximate values, then compute other
+    // values based on these approximations and then refine the approximations.
+
+    // There are lots of heuristics here. There is some reasoning behind them,
+    // but ultimately they are just tuned on contraction benchmarks for
+    // different input configurations, thread counts and instruction sets.
+    // So feel free to question any of them.
+
+    // Compute whether we want to shard by row or by column.
+    // This is a first approximation, it will be refined later. Since we don't
+    // know number of threads yet we use 2, because what's we are most
+    // interested in at this point is whether it makes sense to use
+    // parallelization at all or not.
+    bool shard_by_col = shardByCol(m, n, 2);
+
+    // First approximation of kernel blocking sizes.
+    // Again, we don't know number of threads yet, so we use 2.
+    Index bm, bn, bk;
+    if (shard_by_col) {
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index, internal::ShardByCol> blocking(k, m, n,
+                                                                                                              2);
+      bm = blocking.mc();
+      bn = blocking.nc();
+      bk = blocking.kc();
+    } else {
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index, internal::ShardByRow> blocking(k, m, n,
+                                                                                                              2);
+      bm = blocking.mc();
+      bn = blocking.nc();
+      bk = blocking.kc();
+    }
+
+    // Compute optimal number of threads.
+    // Note: we use bk instead of k here because we are interested in amount of
+    // _parallelizable_ computations, and computations are not parallelizable
+    // across k dimension.
+    const TensorOpCost cost = contractionCost(m, n, bm, bn, bk, shard_by_col, false);
+    int num_threads =
+        TensorCostModel<ThreadPoolDevice>::numThreads(static_cast<double>(n) * m, cost, this->m_device.numThreads());
+    int num_threads_by_k = numThreadsInnerDim(m, n, k);
+    if (shardByInnerDim(m, n, k, num_threads, num_threads_by_k)) {
+      // We are in the scenario where it is more effective to shard by the
+      // inner dimension.
+      if (IsEvalInSyncMode) {
+        EvalShardedByInnerDimContext<DoneCallback> ctx(this, num_threads_by_k, buffer, m, n, k, std::move(done));
+        ctx.template run<Alignment>();
+      } else {
+        auto* ctx =
+            new EvalShardedByInnerDimContext<DoneCallback>(this, num_threads_by_k, buffer, m, n, k, std::move(done));
+        ctx->template runAsync<Alignment>();
+      }
+
+      return;
+    }
+
+    // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
+    // model is not tuned. Remove this when the cost model is tuned.
+    if (n == 1) num_threads = 1;
+
+    if (num_threads == 1) {
+      TENSOR_CONTRACTION_DISPATCH(this->template evalProductSequential, Unaligned, (buffer));
+      if (!IsEvalInSyncMode) done();
+      return;
+    }
+
+    // Now that we know number of threads, recalculate sharding and blocking.
+    shard_by_col = shardByCol(m, n, num_threads);
+    if (shard_by_col) {
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index, internal::ShardByCol> blocking(
+          k, m, n, num_threads);
+      bm = blocking.mc();
+      bn = blocking.nc();
+      bk = blocking.kc();
+    } else {
+      internal::TensorContractionBlocking<Scalar, LhsScalar, RhsScalar, Index, internal::ShardByRow> blocking(
+          k, m, n, num_threads);
+      bm = blocking.mc();
+      bn = blocking.nc();
+      bk = blocking.kc();
+    }
+
+    // Number of kernels for each dimension.
+    Index nm0 = numext::div_ceil(m, bm);
+    Index nn0 = numext::div_ceil(n, bn);
+    Index nk = numext::div_ceil(k, bk);
+
+    // Calculate task grain size (number of kernels executed per task).
+    // This task size coarsening serves two purposes:
+    // 1. It reduces per-task overheads including synchronization overheads.
+    // 2. It allows to use caches better (reuse the same packed rhs in several
+    // consecutive kernels).
+    Index gm = 1;
+    Index gn = 1;
+    // If we are sharding by column, then we prefer to reduce rows first.
+    if (shard_by_col) {
+      gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
+      gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
+    } else {
+      gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col);
+      gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col);
+    }
+    // Number of tasks in each dimension.
+    Index nm = numext::div_ceil(nm0, gm);
+    Index nn = numext::div_ceil(nn0, gn);
+
+    // If there is enough concurrency in the sharding dimension, we choose not
+    // to paralellize by the other dimension, and execute all kernels in sync
+    // mode. This reduces parallelism from the nm x nn down to nn
+    // (shard_by_col==true) or nm (shard_by_col==false).
+    const Index sharding_dim_tasks = shard_by_col ? nn : nm;
+    const int num_worker_threads = this->m_device.numThreadsInPool();
+
+    // With small number of threads we want to make sure that we do not reduce
+    // parallelism too much. With large number of threads we trade maximum
+    // parallelism for better memory locality.
+    const float oversharding_factor = num_worker_threads <= 4    ? 8.0
+                                      : num_worker_threads <= 8  ? 4.0
+                                      : num_worker_threads <= 16 ? 2.0
+                                      : num_worker_threads <= 32 ? 1.0
+                                      : num_worker_threads <= 64 ? 0.8
+                                                                 : /* num_worker_threads > 64 */ 0.6;
+
+    const bool parallelize_by_sharding_dim_only = sharding_dim_tasks >= oversharding_factor * num_worker_threads;
+
+    // Last by not least, decide whether we want to issue both lhs and rhs
+    // packing in parallel; or issue lhs packing first, and then issue rhs
+    // packing when lhs packing completes (for !shard_by_col lhs and rhs are
+    // swapped). Parallel packing allows more parallelism (for both packing and
+    // kernels), while sequential packing provides better locality (once
+    // a thread finishes rhs packing it proceed to kernels with that rhs).
+    // First, we are interested in parallel packing if there are few tasks.
+    bool parallel_pack = num_threads >= nm * nn;
+    // Also do parallel packing if all data fits into L2$.
+    if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <= l2CacheSize() * num_threads)
+      parallel_pack = true;
+    // But don't do it if we will use each rhs only once. Locality seems to be
+    // more important in this case.
+    if ((shard_by_col ? nm : nn) == 1) parallel_pack = false;
+    // Also don't get in the way of parallelize_by_sharding_dim_only
+    // optimization.
+    if (parallelize_by_sharding_dim_only) parallel_pack = false;
+
+    // TODO(ezhulnev): With if contexpr we don't need SyncEvalParallelContext.
+    if (IsEvalInSyncMode) {
+#define CONTEXT_ARGS                                                                                          \
+  (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, shard_by_col, parallel_pack, \
+   parallelize_by_sharding_dim_only, NoCallback())                                                            \
+      .run()
+      TENSOR_CONTRACTION_DISPATCH(SyncEvalParallelContext, Alignment, CONTEXT_ARGS);
+#undef CONTEXT_ARGS
+
+    } else {
+#define CONTEXT_ARGS                                                                                          \
+  (this, num_threads, buffer, m, n, k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, shard_by_col, parallel_pack, \
+   parallelize_by_sharding_dim_only, std::move(done))
+      TENSOR_CONTRACTION_ASYNC_DISPATCH(EvalParallelContext, DoneCallback, Alignment, CONTEXT_ARGS, run());
+#undef CONTEXT_ARGS
+    }
+  }
+
+  // ------------------------------------------------------------------------ //
+
+  // Dummy struct to represent an empty DoneCallback.
+
+  struct NoCallback {
+    void operator()() { eigen_assert(false && "NoCallback should never be called"); }
+  };
+
+  // ------------------------------------------------------------------------ //
+
+  template <typename DoneCallback, typename Context>
+  class EvalParallelNotification;
+
+  // Synchronous evaluation notification that blocks caller thread in Wait().
+  template <typename Context>
+  class EvalParallelNotification<NoCallback, Context> {
+   public:
+    EvalParallelNotification(Context*, NoCallback) {}
+    void Notify() { done_.Notify(); }
+    void Wait() { done_.Wait(); }
+
+   private:
+    Eigen::Notification done_;
+  };
+
+  // Asynchronous evaluation notification that does not block in Wait().
+  template <typename DoneCallback, typename Context>
+  class EvalParallelNotification {
+   public:
+    EvalParallelNotification(Context* ctx, DoneCallback done) : ctx_(ctx), done_(std::move(done)) {}
+
+    void Notify() {
+      // Make a copy of done callback, because it will be destructed when we
+      // will delete context in the next line (EvalParallelNotification is a
+      // data member of EvalParallelContext class).
+      DoneCallback done_copy = std::move(done_);
+
+      // Delete parallel evaluation context.
+      delete ctx_;
+
+      // Now safely call the done callback.
+      done_copy();
+    }
+
+    void Wait() {}
+
+   private:
+    Context* ctx_;
+    DoneCallback done_;
+  };
+
+  // Context orchestrates sync/async parallel contraction evaluation. When it is
+  // executed in asynchronous mode, it owns all the shared state that might be
+  // accessible by block packing and kernel tasks.
+
+  template <typename DoneCallback, bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
+            bool rhs_inner_dim_reordered, int Alignment>
+  class EvalParallelContext {
+   public:
+    typedef internal::TensorContractionInputMapper<LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t,
+                                                   contract_t, internal::packet_traits<LhsScalar>::size,
+                                                   lhs_inner_dim_contiguous, false, Unaligned>
+        LhsMapper;
+    typedef internal::TensorContractionInputMapper<RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t,
+                                                   contract_t, internal::packet_traits<RhsScalar>::size,
+                                                   rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned>
+        RhsMapper;
+
+    typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+
+    typedef internal::TensorContractionKernel<Scalar, LhsScalar, RhsScalar, Index, OutputMapper, LhsMapper, RhsMapper>
+        TensorContractionKernel;
+
+    typedef typename TensorContractionKernel::LhsBlock LhsBlock;
+    typedef typename TensorContractionKernel::RhsBlock RhsBlock;
+    typedef typename TensorContractionKernel::BlockMemHandle BlockMemHandle;
+
+    EvalParallelContext(const Self* self, int num_threads, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
+                        Index bn, Index bk, Index nm, Index nn, Index nk, Index gm, Index gn, Index nm0, Index nn0,
+                        bool shard_by_col, bool parallel_pack, bool parallelize_by_sharding_dim_only, DoneCallback done)
+        : created_by_thread_id_(std::this_thread::get_id()),
+          done_(this, std::move(done)),
+          device_(self->m_device),
+          lhs_(self->m_leftImpl, self->m_left_nocontract_strides, self->m_i_strides, self->m_left_contracting_strides,
+               self->m_k_strides),
+          rhs_(self->m_rightImpl, self->m_right_nocontract_strides, self->m_j_strides,
+               self->m_right_contracting_strides, self->m_k_strides),
+          buffer_(buffer),
+          output_(buffer, tm),
+          output_kernel_(self->m_output_kernel),
+          tensor_contraction_params_(self->m_tensor_contraction_params),
+          num_threads_(num_threads),
+          shard_by_col_(shard_by_col),
+          parallel_pack_(parallel_pack),
+          parallelize_by_sharding_dim_only_(parallelize_by_sharding_dim_only),
+          m_(tm),
+          n_(tn),
+          k_(tk),
+          bm_(bm),
+          bn_(bn),
+          bk_(bk),
+          nm_(nm),
+          nn_(nn),
+          nk_(nk),
+          gm_(gm),
+          gn_(gn),
+          nm0_(nm0),
+          nn0_(nn0),
+          kernel_(m_, k_, n_, bm_, bk_, bn_),
+          num_thread_local_allocations_(0),
+          // We reserve 2X more capacity for a thread local values, than the
+          // number of threads in the pool to efficiently handle task stealing
+          // by threads that are not managed by the pool.
+          thread_local_capacity(2 * (parallelize_by_sharding_dim_only_ ? device_.numThreadsInPool() : 0)),
+          // We will use only one of the Lhs/Rhs thread local storage depending
+          // on the shard_by_col value and we parallelize by sharding dim ONLY.
+          lhs_thread_local_blocks_(shard_by_col_ ? 0 : thread_local_capacity, {*this}, {*this}),
+          rhs_thread_local_blocks_(shard_by_col_ ? thread_local_capacity : 0, {*this}, {*this}) {
+      // These two options are mutually exclusive.
+      eigen_assert(!(parallel_pack && parallelize_by_sharding_dim_only));
+
+      for (Index x = 0; x < P; x++) {
+        // Normal number of notifications for k slice switch is
+        // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only
+        // nm_ + nn_ notifications, because they will not receive notifications
+        // from preceding kernels.
+        state_switch_[x] =
+            x == 0 ? 1 : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) + (x == P - 1 ? nm_ * nn_ : 0);
+        state_packing_ready_[x] = parallel_pack_ ? 0 : (shard_by_col_ ? nm_ : nn_);
+        state_kernel_[x] = new std::atomic<uint8_t>*[nm_];
+        for (Index m = 0; m < nm_; m++) {
+          state_kernel_[x][m] = new std::atomic<uint8_t>[nn_];
+          // Kernels generally receive 3 notifications (previous kernel + 2
+          // packing), but the first slice won't get notifications from previous
+          // kernels.
+          for (Index n = 0; n < nn_; n++)
+            state_kernel_[x][m][n].store((x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1), std::memory_order_relaxed);
+        }
+      }
+
+      // Allocate memory for packed rhs/lhs matrices.
+      packed_mem_ = kernel_.allocateSlices(            //
+          device_,                                     //
+          /*num_lhs=*/nm0_,                            //
+          /*num_rhs=*/nn0_,                            //
+          /*num_slices=*/std::min<Index>(nk_, P - 1),  //
+          packed_lhs_, packed_rhs_);
+
+      if (parallelize_by_sharding_dim_only_) {
+        const int num_worker_threads = device_.numThreadsInPool();
+
+        if (shard_by_col) {
+          can_use_thread_local_packed_ = new std::atomic<bool>[nn_];
+          for (int i = 0; i < nn_; ++i) can_use_thread_local_packed_[i].store(true, std::memory_order_relaxed);
+
+          Index num_blocks = num_worker_threads * gn_;
+          thread_local_pre_alocated_mem_ = kernel_.allocateSlices(  //
+              device_,                                              //
+              /*num_lhs=*/0,                                        //
+              /*num_rhs=*/num_blocks,                               //
+              /*num_slices=*/1,                                     //
+              /*lhs_blocks=*/nullptr, &rhs_thread_local_pre_allocated_);
+
+        } else {
+          can_use_thread_local_packed_ = new std::atomic<bool>[nm_];
+          for (int i = 0; i < nm_; ++i) can_use_thread_local_packed_[i].store(true, std::memory_order_relaxed);
+
+          Index num_blocks = num_worker_threads * gm_;
+          thread_local_pre_alocated_mem_ = kernel_.allocateSlices(  //
+              device_,                                              //
+              /*num_lhs=*/num_blocks,                               //
+              /*num_rhs=*/0,                                        //
+              /*num_slices=*/1, &lhs_thread_local_pre_allocated_,   //
+              /*rhs_blocks=*/nullptr);
+        }
+      }
+    }
+
+    ~EvalParallelContext() {
+      for (Index x = 0; x < P; x++) {
+        for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m];
+        delete[] state_kernel_[x];
+      }
+      kernel_.deallocate(device_, packed_mem_);
+      if (parallelize_by_sharding_dim_only_) {
+        kernel_.deallocate(device_, thread_local_pre_alocated_mem_);
+        delete[] can_use_thread_local_packed_;
+      }
+    }
+
+    void run() {
+      // Kick off packing of the first slice.
+      signal_switch(0, 1);
+
+      // Wait for overall completion.
+      //
+      // If parallel evaluation is executed in async mode, this is a no-op, and
+      // Wait() will return immediately. In synchronous mode it will block the
+      // caller thread until it will receive notification from last task.
+      //
+      // In async mode, last task when completed will call done callback from
+      // the same thread, and will delete this context.
+      //
+      // TODO(dvyukov): This wait can lead to deadlock if contraction is
+      // evaluated in synchronous mode. If nthreads contractions are
+      // concurrently submitted from worker threads, this wait will block all
+      // worker threads and the system will deadlock.
+      done_.Wait();
+    }
+
+   private:
+    std::thread::id created_by_thread_id_;
+
+    // This notification is specialized on the type of DoneCallback and can be
+    // blocking or non-blocking.
+    EvalParallelNotification<DoneCallback, EvalParallelContext> done_;
+
+    const Device& device_;
+    LhsMapper lhs_;
+    RhsMapper rhs_;
+    Scalar* const buffer_;
+    OutputMapper output_;
+    OutputKernelType output_kernel_;
+    TensorContractionParams tensor_contraction_params_;
+    const int num_threads_;
+    const bool shard_by_col_;
+    const bool parallel_pack_;
+    const bool parallelize_by_sharding_dim_only_;
+    // Matrix sizes.
+    const Index m_;
+    const Index n_;
+    const Index k_;
+    // Block sizes.
+    const Index bm_;
+    const Index bn_;
+    const Index bk_;
+    // Number of tasks.
+    const Index nm_;
+    const Index nn_;
+    const Index nk_;
+    // Task grain sizes (number of kernels executed per task).
+    const Index gm_;
+    const Index gn_;
+    // Number of blocks (this is different from ni_/nn_ because of task size
+    // coarsening).
+    const Index nm0_;
+    const Index nn0_;
+    // Tensor contraction kernel.
+    TensorContractionKernel kernel_;
+
+    // Parallelization strategy.
+    //
+    // Blocks related to the same k block can run in parallel because they write
+    // to different output blocks. So we parallelize within k slices, this
+    // gives us parallelism level of m x n. Before we can start any kernels
+    // related to k-th slice, we need to issue m lhs packing tasks and n rhs
+    // packing tasks.
+    //
+    // However, there is a bottleneck when we are finishing kernels for k-th
+    // slice (at the very end there is only 1 runnable kernel). To mitigate this
+    // bottleneck we allow kernels from k-th and k+1-th slices to run in
+    // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same
+    // output block, so they must not run in parallel.
+    //
+    // This gives us the following dependency graph.
+    // On each k slice we have m x n kernel tasks, m lhs paking tasks and n rhs
+    // packing tasks.
+    // Kernel (m, n, k) can start when:
+    //  - kernel (m, n, k-1) has finished
+    //  - lhs packing (m, k) has finished
+    //  - rhs packing (n, k) has finished
+    // Lhs/rhs packing can start when:
+    //  - all k-1 packing has finished (artificially imposed to limit amount of
+    //  parallel packing)
+    //
+    // On top of that we limit runnable tasks to two consecutive k slices.
+    // This is done to limit amount of memory we need for packed lhs/rhs
+    // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_).
+    //
+    // state_switch_ tracks when we are ready to switch to the next k slice.
+    // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n).
+    // These variable are rolling over 3 consecutive k slices: first two we are
+    // actively executing + one to track completion of kernels in the second
+    // slice.
+    static constexpr Index P = 3;
+
+    // Handle to the allocated temporary storage for Lhs/Rhs blocks.
+    BlockMemHandle packed_mem_;
+    std::vector<LhsBlock> packed_lhs_[P - 1];
+    std::vector<RhsBlock> packed_rhs_[P - 1];
+
+    // If we choose to parallelize only by the sharding dimension, each thread
+    // will have it's own "thead local" (not a c++ thread local storage) memory
+    // for packed_lhs or packed_rhs (shard_by_col = false of true). This memory
+    // can't be passed to a kernel that might execute on a different thread.
+    //
+    // In practice when we are ready to pack memory for the sharding dimension
+    // (rhs if shard_by_col==true) of the K-th slice, all kernels for K-1 slice
+    // already computed (99% of the time), and we can pack data into the thread
+    // local storage, and guarantee that all the kernels will be executed
+    // immediately in the same thread. This significantly increases L1 cache hit
+    // ratio and reduces pressure on the memory bus.
+    //
+    // It's still possible that kernel for the K-th slice will be ready before
+    // completion of the K-1 kernel, so we have to allocate "global" packed_lhs_
+    // and packed_rhs_ to allow kernels to be executed later on a thread
+    // different from the thread that was used for packing.
+
+    // Handle for pre-allocated thread local memory buffers.
+    BlockMemHandle thread_local_pre_alocated_mem_;
+
+    // Only one of these will be initialized depending on shard_by_col value
+    // (the size will be `num_worker_threads * num_grains_in_the_sharding_dim`).
+    std::vector<LhsBlock> lhs_thread_local_pre_allocated_;
+    std::vector<RhsBlock> rhs_thread_local_pre_allocated_;
+
+    // How many thread local blocks were already allocated.
+    std::atomic<int> num_thread_local_allocations_;
+    const int thread_local_capacity;
+
+    // We will use pre-allocated Lhs/Rhs blocks defined above, if the number of
+    // unique threads in a system is below or equal to the number of threads in
+    // a thread pool. We will fallback on dynamic memory allocation after that.
+
+    // ThreadLocalBlocks is a container for Lhs or Rhs thread local buffers. Its
+    // size is equal to the grain size in Lhs/Rhs sharding dimension.
+    template <typename BlockType>
+    class ThreadLocalBlocks {
+     public:
+      ThreadLocalBlocks() = default;
+
+      ThreadLocalBlocks(BlockType* base, size_t grain_size)
+          : is_pre_allocated_(true), thread_local_pre_allocated_base_(base), grain_size_(grain_size) {}
+
+      ThreadLocalBlocks(BlockMemHandle mem_handle, std::vector<BlockType> blocks)
+          : is_pre_allocated_(false), mem_handle_(std::move(mem_handle)), blocks_(std::move(blocks)) {}
+
+      BlockType& block(int grain_index) {
+        eigen_assert(grain_index >= 0);
+        eigen_assert(static_cast<size_t>(grain_index) < size());
+        return is_pre_allocated_ ? thread_local_pre_allocated_base_[grain_index] : blocks_[grain_index];
+      }
+
+      void Release(EvalParallelContext& ctx) const {
+        if (!is_pre_allocated_) {
+          ctx.kernel_.deallocate(ctx.device_, mem_handle_);
+        }
+      }
+
+      size_t size() const { return is_pre_allocated_ ? grain_size_ : blocks_.size(); }
+
+     private:
+      bool is_pre_allocated_;
+
+      // Reuse pre-allocated thread local buffers.
+      BlockType* thread_local_pre_allocated_base_ = nullptr;
+      size_t grain_size_ = 0;
+
+      // These will be initialized only if `is_pre_allocated == false`.
+      BlockMemHandle mem_handle_{};
+      std::vector<BlockType> blocks_;
+    };
+
+    // ThreadLocalBlocksInitialize callable does custom thread local blocks
+    // initialization, and will reuse pre-allocated buffers if possible, or will
+    // dynamically allocate new memory.
+    //
+    // Lhs/Rhs blocks might be of the same type, so we have to pass explicitly
+    // for what side do we plan to do block allocation.
+    template <typename BlockType, bool is_rhs>
+    class ThreadLocalBlocksInitialize {
+      static constexpr bool kIsLhs = !is_rhs && std::is_same<BlockType, LhsBlock>::value;
+      static const bool kIsRhs = is_rhs && std::is_same<BlockType, RhsBlock>::value;
+      static_assert(kIsLhs || kIsRhs, "Unknown block type");
+
+      using Blocks = ThreadLocalBlocks<BlockType>;
+
+     public:
+      ThreadLocalBlocksInitialize(EvalParallelContext& ctx)
+          : ctx_(ctx), num_worker_threads_(ctx_.device_.numThreadsInPool()) {}
+
+      void operator()(Blocks& blocks) {
+        const int n = ctx_.num_thread_local_allocations_.fetch_add(1, std::memory_order_relaxed);
+
+        if (n >= num_worker_threads_) {
+          ThreadLocalBlocksAllocator<is_rhs>::allocate(ctx_, blocks);
+        } else {
+          ThreadLocalBlocksAllocator<is_rhs>::reuse(ctx_, n, blocks);
+        }
+      }
+
+     private:
+      // NOTE(ezhulenev): Without 'if constexpr' we have to put calls to
+      // TensorContractionKernel::allocateSlices into template specializations.
+      // Also explicit specializations are not allowed at class scope in C++03,
+      // EvalCtx type parameter is just a workaround for that limitation.
+      template <bool pack_rhs, typename EvalCtx = EvalParallelContext>
+      struct ThreadLocalBlocksAllocator;
+
+      template <typename EvalCtx>
+      struct ThreadLocalBlocksAllocator</*pack_rhs=*/true, EvalCtx> {
+        static void allocate(EvalCtx& ctx, Blocks& blocks) {
+          std::vector<RhsBlock> rhs_blocks;
+          BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(ctx.device_,
+                                                                 /*num_lhs=*/0,
+                                                                 /*num_rhs=*/ctx.gn_,
+                                                                 /*num_slices=*/1,
+                                                                 /*lhs_blocks=*/nullptr, /*rhs_blocks=*/&rhs_blocks);
+
+          blocks = ThreadLocalBlocks<RhsBlock>(std::move(mem_handle), std::move(rhs_blocks));
+        }
+
+        static void reuse(EvalCtx& ctx, int index, Blocks& blocks) {
+          RhsBlock* ptr = &ctx.rhs_thread_local_pre_allocated_[ctx.gn_ * index];
+          blocks = ThreadLocalBlocks<RhsBlock>(ptr, ctx.gn_);
+        }
+      };
+
+      template <typename EvalCtx>
+      struct ThreadLocalBlocksAllocator</*pack_rhs=*/false, EvalCtx> {
+        static void allocate(EvalCtx& ctx, Blocks& blocks) {
+          std::vector<LhsBlock> lhs_blocks;
+          BlockMemHandle mem_handle = ctx.kernel_.allocateSlices(ctx.device_,
+                                                                 /*num_lhs=*/ctx.gm_,
+                                                                 /*num_rhs=*/0,
+                                                                 /*num_slices=*/1,
+                                                                 /*lhs_blocks=*/&lhs_blocks, /*rhs_blocks=*/nullptr);
+
+          blocks = ThreadLocalBlocks<LhsBlock>(std::move(mem_handle), std::move(lhs_blocks));
+        }
+
+        static void reuse(EvalCtx& ctx, int index, Blocks& blocks) {
+          LhsBlock* ptr = &ctx.lhs_thread_local_pre_allocated_[ctx.gm_ * index];
+          blocks = ThreadLocalBlocks<LhsBlock>(ptr, ctx.gm_);
+        }
+      };
+
+      EvalParallelContext& ctx_;
+      const int num_worker_threads_;
+    };
+
+    template <typename BlockType>
+    class ThreadLocalBlocksRelease {
+     public:
+      using Blocks = ThreadLocalBlocks<BlockType>;
+      ThreadLocalBlocksRelease(EvalParallelContext& ctx) : ctx_(ctx) {}
+      void operator()(Blocks& blocks) { blocks.Release(ctx_); }
+
+     private:
+      EvalParallelContext& ctx_;
+    };
+
+    // ThreadLocalBlocks initialization callables.
+    using ThreadLocalLhsInit = ThreadLocalBlocksInitialize<LhsBlock, /*is_rhs=*/false>;
+    using ThreadLocalRhsInit = ThreadLocalBlocksInitialize<RhsBlock, /*is_rhs=*/true>;
+
+    // ThreadLocalBlocks release callables.
+    using ThreadLocalLhsRelease = ThreadLocalBlocksRelease<LhsBlock>;
+    using ThreadLocalRhsRelease = ThreadLocalBlocksRelease<RhsBlock>;
+
+    // Thread local containers for Lhs/Rhs block packs. In practice only one of
+    // them will be used, depending on the shard_by_col value.
+    Eigen::ThreadLocal<ThreadLocalBlocks<LhsBlock>, ThreadLocalLhsInit, ThreadLocalLhsRelease> lhs_thread_local_blocks_;
+    Eigen::ThreadLocal<ThreadLocalBlocks<RhsBlock>, ThreadLocalRhsInit, ThreadLocalRhsRelease> rhs_thread_local_blocks_;
+
+    // After a particular shard for Kth slice missed thread local execution
+    // opportunity (K-1 slice didn't complete kernels execution), we can no
+    // longer schedule K+1 and following slices in thread local mode, because
+    // there is no more guarantee that previous kernels were executed
+    // sequentially in the same thread (size is nn_ or nm_).
+    std::atomic<bool>* can_use_thread_local_packed_;
+
+    std::atomic<uint8_t>** state_kernel_[P];
+    // state_switch_ is frequently modified by worker threads, while other
+    // fields are read-only after constructor. Let's move it to a separate cache
+    // line to reduce cache-coherency traffic.
+    char pad_[128];
+    std::atomic<Index> state_packing_ready_[P];
+    std::atomic<Index> state_switch_[P];
+
+    LhsBlock& packed_lhs(Index m, Index k, Index m1, bool use_thread_local) {
+      if (use_thread_local) {
+        eigen_assert(!shard_by_col_);
+        ThreadLocalBlocks<LhsBlock>& blocks = lhs_thread_local_blocks_.local();
+
+        Index grain_index = m1 - m * gm_;
+        return blocks.block(
+            internal::convert_index<int>(grain_index));  // FIXME better make ThreadLocalBlocks use Eigen::Index?
+      } else {
+        return packed_lhs_[k % (P - 1)][m1];
+      }
+    }
+
+    RhsBlock& packed_rhs(Index n, Index k, Index n1, bool use_thread_local) {
+      if (use_thread_local) {
+        eigen_assert(shard_by_col_);
+        ThreadLocalBlocks<RhsBlock>& blocks = rhs_thread_local_blocks_.local();
+
+        Index grain_index = n1 - n * gn_;
+        return blocks.block(
+            internal::convert_index<int>(grain_index));  // FIXME better make ThreadLocalBlocks use Eigen::Index?
+      } else {
+        return packed_rhs_[k % (P - 1)][n1];
+      }
+    }
+
+    // In following two methods (pack_lhs and pack_rhs), if we know for sure
+    // that we'll be able to immediately call a kernel with packed data, and do
+    // not submit it to the thread pool, we can use thread local memory for
+    // packed data.
+    //
+    // We can only reliably check it if we are running all kernels in sync mode
+    // (parallelize only by sharding dim). If kernel for m==0 (n==0) is ready to
+    // run, it's guaranteed that all kernels with larger values of m (n) are
+    // also ready, because we execute them in the same order for all K slices.
+
+    void pack_lhs(Index m, Index k) {
+      bool use_thread_local = false;
+
+      if (parallelize_by_sharding_dim_only_ && !shard_by_col_ &&
+          can_use_thread_local_packed_[m].load(std::memory_order_relaxed)) {
+        if (state_kernel_[k % P][m][0].load(std::memory_order_relaxed) == 1) {
+          use_thread_local = true;
+        } else {
+          // If we can't guarantee that all kernels in `k` slice will be
+          // executed sequentially in current thread, it's no longer safe to use
+          // thread local memory in following slices along the k dimensions.
+          eigen_assert(k > 0);
+          can_use_thread_local_packed_[m].store(false, std::memory_order_relaxed);
+        }
+      }
+
+      const Index mend = m * gm_ + gm(m);
+      for (Index m1 = m * gm_; m1 < mend; m1++)
+        kernel_.packLhs(&packed_lhs(m, k, m1, use_thread_local), lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1));
+
+      if (!parallel_pack_ && shard_by_col_) {
+        eigen_assert(!use_thread_local);
+        signal_packing(k);
+      } else {
+        signal_switch(k + 1);
+        for (Index n = nn_ - 1; n >= 0; n--) {
+          bool sync = parallelize_by_sharding_dim_only_ || n == 0;
+          signal_kernel(m, n, k, sync, use_thread_local);
+        }
+      }
+    }
+
+    void pack_rhs(Index n, Index k) {
+      bool use_thread_local = false;
+
+      if (parallelize_by_sharding_dim_only_ && shard_by_col_ &&
+          can_use_thread_local_packed_[n].load(std::memory_order_relaxed)) {
+        if (state_kernel_[k % P][0][n].load(std::memory_order_relaxed) == 1) {
+          use_thread_local = true;
+        } else {
+          // If we can't guarantee that all kernels in `k` slice will be
+          // executed sequentially in current thread, it's no longer safe to use
+          // thread local memory in following slices along the k dimensions.
+          eigen_assert(k > 0);
+          can_use_thread_local_packed_[n].store(false, std::memory_order_relaxed);
+        }
+      }
+
+      const Index nend = n * gn_ + gn(n);
+      for (Index n1 = n * gn_; n1 < nend; n1++) {
+        if (!TensorContractionKernel::HasBeta && k == 0) {
+          // Zero the output memory in parallel, only if contraction kernel does
+          // not support `beta`. Otherwise we will pass beta 0.0 to the first
+          // call to the `TensorContractionKernel::invoke()`.
+          //
+          // On 10000x2x10000 mm zeroing can easily take half of time. Zero (bn
+          // x m) row. Safe to do here because all kernels that will write to
+          // this memory depend on completion of this task. Note: don't call
+          // device_.fill() here. device_.fill() blocks on thread pool
+          // worker thread, which can lead to underutilization and deadlocks.
+          std::fill_n(buffer_ + n1 * bn_ * m_, bn(n1) * m_, Scalar(0));
+        }
+        kernel_.packRhs(&packed_rhs(n, k, n1, use_thread_local), rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1));
+      }
+
+      if (parallel_pack_ || shard_by_col_) {
+        signal_switch(k + 1);
+        for (Index m = nm_ - 1; m >= 0; m--) {
+          bool sync = parallelize_by_sharding_dim_only_ || m == 0;
+          signal_kernel(m, n, k, sync, use_thread_local);
+        }
+      } else {
+        eigen_assert(!use_thread_local);
+        signal_packing(k);
+      }
+    }
+
+    void kernel(Index m, Index n, Index k, bool use_thread_local) {
+      // Note: order of iteration matters here. Iteration over m is innermost
+      // because we want to reuse the same packed rhs in consecutive tasks
+      // (rhs fits into L2$ while lhs only into L3$).
+      const Index nend = n * gn_ + gn(n);
+      const Index mend = m * gm_ + gm(m);
+
+      // NOTE: output = alpha * LHS * RHS + beta * output.
+      const Scalar alpha = Scalar(1);
+      const Scalar beta = (TensorContractionKernel::HasBeta && k == 0) ? Scalar(0) : Scalar(1);
+
+      if (shard_by_col_) {
+        for (Index n1 = n * gn_; n1 < nend; n1++) {
+          for (Index m1 = m * gm_; m1 < mend; m1++) {
+            const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
+            kernel_.invoke(output_mapper, packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local),
+                           packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), bk(k), bn(n1), alpha, beta);
+
+            // We are done with the last task for the [m1, n1] block.
+            if (k + 1 == nk_) {
+              output_kernel_(output_mapper, tensor_contraction_params_, m1 * bm_, n1 * bn_, bm(m1), bn(n1));
+            }
+          }
+        }
+      } else {
+        for (Index m1 = m * gm_; m1 < mend; m1++)
+          for (Index n1 = n * gn_; n1 < nend; n1++) {
+            const auto output_mapper = output_.getSubMapper(m1 * bm_, n1 * bn_);
+            kernel_.invoke(output_mapper, packed_lhs(m, k, m1, !shard_by_col_ && use_thread_local),
+                           packed_rhs(n, k, n1, shard_by_col_ && use_thread_local), bm(m1), bk(k), bn(n1), alpha, beta);
+
+            // We are done with the last task for the [m1, n1] block.
+            if (k + 1 == nk_) {
+              output_kernel_(output_mapper, tensor_contraction_params_, m1 * bm_, n1 * bn_, bm(m1), bn(n1));
+            }
+          }
+      }
+      signal_kernel(m, n, k + 1, /*sync=*/false, /*use_thread_local=*/false);
+      signal_switch(k + 2);
+    }
+
+    void signal_packing(Index k) {
+      eigen_assert(!parallel_pack_);
+      Index s = state_packing_ready_[k % P].fetch_sub(1);
+      eigen_assert(s > 0);
+      if (s != 1) return;
+      state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_;
+      enqueue_packing(k, shard_by_col_);
+    }
+
+    void signal_kernel(Index m, Index n, Index k, bool sync, bool use_thread_local) {
+      std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n];
+      Index s = state->load();
+      eigen_assert(s > 0);
+      if (s != 1 && state->fetch_sub(1) != 1) {
+        eigen_assert(!use_thread_local);
+        return;
+      }
+      state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed);
+      if (sync) {
+        kernel(m, n, k, use_thread_local);
+      } else {
+        eigen_assert(!use_thread_local);
+        device_.enqueue([this, m, n, k, use_thread_local]() { 
+            kernel(m, n, k, use_thread_local); 
+          });
+      }
+    }
+
+    void signal_switch(Index k, Index v = 1) {
+      Index s = state_switch_[k % P].fetch_sub(v);
+      eigen_assert(s >= v);
+      if (s != v) return;
+
+      // Ready to switch to the next k slice.
+      // Reset counter for the next iteration.
+      state_switch_[k % P] = (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) + nm_ * nn_;
+      if (k < nk_) {
+        // Issue lhs/rhs packing. Their completion will in turn kick off
+        // kernels.
+        if (parallel_pack_) {
+          enqueue_packing(k, !shard_by_col_);
+          enqueue_packing(k, shard_by_col_);
+        } else if (shard_by_col_) {
+          enqueue_packing(k, false);
+        } else {
+          enqueue_packing(k, true);
+        }
+
+        // Termination handling.
+        // Because kernel completion signals k + 2 switch, we need to finish nk
+        // + 2 slices without issuing any tasks on nk + 1 slice. So here we
+        // pretend that all nk + 1 packing tasks just finish instantly; so that
+        // nk + 2 switch only waits for completion of nk kernels.
+      } else if (k == nk_) {
+        signal_switch(k + 1, parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_));
+      } else {
+        done_.Notify();
+      }
+    }
+
+    // Enqueue all rhs/lhs packing for k-th slice.
+    void enqueue_packing(Index k, bool rhs) { enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs); }
+
+    void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) {
+      if (end - start == 1) {
+        if (rhs)
+          pack_rhs(start, k);
+        else
+          pack_lhs(start, k);
+      } else {
+        while (end - start > 1) {
+          Index mid = (start + end) / 2;
+          device_.enqueue([this, mid, end, k, rhs]() { 
+              enqueue_packing_helper(mid, end, k, rhs);
+            });
+          end = mid;
+        }
+
+        // Decide if we want to run first packing task (start == 0) in
+        // async mode if we parallelize only by sharding dim:
+        // (1) pack_lhs and pack_rhs call signal_switch before completing
+        //     all calls to signal_kernel, which in sync mode might lead
+        //     to the execution of the first kernel of the k+1 slice, before
+        //     completing a call to the last kernel of the k slice.
+        // (2) all pack tasks for sharded dim must be executed in a thread
+        //     pool to get pre-allocated thead local buffers.
+        bool pack_async = (start == 0) && (parallelize_by_sharding_dim_only_ && shard_by_col_ == rhs) &&
+                          (k > 0 || std::this_thread::get_id() == created_by_thread_id_);
+
+        if (pack_async) {
+          device_.enqueue([this, start, end, k, rhs]() { 
+              enqueue_packing_helper(start, end, k, rhs);
+            });
+        } else {
+          enqueue_packing_helper(start, end, k, rhs);
+        }
+      }
+    }
+
+    // Block sizes with accounting for potentially incomplete last block.
+    Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; }
+    Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; }
+    Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; }
+    // Task grain sizes accounting for potentially incomplete last task.
+    Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; }
+    Index gn(Index n) const { return n + 1 < nn_ ? gn_ : nn0_ + gn_ - gn_ * nn_; }
+
+    EvalParallelContext(const EvalParallelContext&) = delete;
+    void operator=(const EvalParallelContext&) = delete;
+  };
+
+  template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
+  using SyncEvalParallelContext = EvalParallelContext<NoCallback, lhs_inner_dim_contiguous, rhs_inner_dim_contiguous,
+                                                      rhs_inner_dim_reordered, Alignment>;
+
+  // ------------------------------------------------------------------------ //
+
+  // EvalShardedByInnerDimContext orchestrates sync/async contraction
+  // evaluation, when we shard by inner dimension. When it is executed in
+  // asynchronous mode, it owns all the shared state that might be accessible by
+  // block processing tasks.
+
+  template <typename DoneCallback>
+  struct EvalShardedByInnerDimContext {
+    EvalShardedByInnerDimContext(const Self* self, int num_threads, Scalar* result_buffer, Index m_size, Index n_size,
+                                 Index k_size, DoneCallback done_callback)
+        : evaluator(self),
+          m_lhs_inner_dim_contiguous(evaluator->m_lhs_inner_dim_contiguous),
+          m_rhs_inner_dim_contiguous(evaluator->m_rhs_inner_dim_contiguous),
+          m_rhs_inner_dim_reordered(evaluator->m_rhs_inner_dim_reordered),
+          result(result_buffer),
+          m(m_size),
+          n(n_size),
+          k(k_size),
+          done(std::move(done_callback)),
+          buffer_size_bytes(m * n * sizeof(Scalar)),
+          block_size(blockSize(k, num_threads)),
+          num_blocks(numext::div_ceil<Index>(k, block_size)),
+          num_pending_blocks(internal::convert_index<int>(num_blocks)),
+          l0_ranges(numext::div_ceil<Index>(num_blocks, l0_size)),
+          l0_state(l0_ranges),
+          block_buffers(num_blocks) {
+      // Keep count of pending gemm tasks for each l0 range.
+      for (int i = 0; i < l0_ranges; ++i) {
+        const Index num_pending_tasks = actualRangeSize(l0_ranges, l0_size, i);
+        l0_state.emplace_back(internal::convert_index<int>(num_pending_tasks));
+      }
+
+      // Allocate temporary buffers for each block.
+      for (Index block_idx = 0; block_idx < num_blocks; ++block_idx) {
+        Scalar* buf = block_idx == 0 ? result : static_cast<Scalar*>(evaluator->m_device.allocate(buffer_size_bytes));
+        block_buffers.emplace_back(buf);
+      }
+    }
+
+    ~EvalShardedByInnerDimContext() {
+      for (Index i = 1; i < num_blocks; ++i) {
+        evaluator->m_device.deallocate(block_buffers[i]);
+      }
+    }
+
+    template <int Alignment>
+    void run() {
+      Barrier barrier(internal::convert_index<int>(num_blocks));
+      eval<Alignment>(barrier, 0, num_blocks);
+      barrier.Wait();
+
+      // Aggregate partial sums from l0 ranges.
+      aggregateL0Blocks<Alignment>();
+
+      // Apply output kernel.
+      applyOutputKernel();
+    }
+
+    template <int Alignment>
+    void runAsync() {
+      evalAsync<Alignment>(0, num_blocks);
+    }
+
+   private:
+    // The underlying GEMM kernel assumes that k is a multiple of
+    // the packet size and subtle breakage occurs if this is violated.
+    static const Index packet_size = internal::packet_traits<RhsScalar>::size;
+
+    const Self* evaluator;  // TensorContraction evaluator
+
+    // These fields required fromTENSOR_CONTRACTION_DISPATCH macro.
+    bool m_lhs_inner_dim_contiguous;
+    bool m_rhs_inner_dim_contiguous;
+    bool m_rhs_inner_dim_reordered;
+
+    Scalar* result;
+
+    Index m;
+    Index n;
+    Index k;
+
+    DoneCallback done;
+
+    // ----------------------------------------------------------------------//
+    // Algorithm parameters.
+
+    // We will compute partial results into the buffers of this size.
+    Index buffer_size_bytes;
+
+    Index block_size;
+    Index num_blocks;
+
+    // Keep track of pending tasks when evaluate in async mode.
+    std::atomic<int> num_pending_blocks;
+
+    // We compute partial gemm results in parallel, and to get the final result
+    // we need to add them all together. For the large number of threads (>= 48)
+    // this adds a very expensive sequential step at the end.
+    //
+    // We split the [0, num_blocks) into small ranges, and when a task for the
+    // block finishes its partial gemm computation, it checks if it was the last
+    // gemm in the range, and if so, it will add all blocks of the range.
+    //
+    // After all tasks done, we need to add only these pre-aggregated blocks.
+
+    // For now we use just a single level of ranges to compute pre-aggregated
+    // partial sums, but in general we can use more layers to compute tree
+    // aggregation in parallel and reduce the size of the sequential step.
+    //
+    // TODO(ezhulenev): Add multilevel tree aggregation? Probably will make
+    // sense only if number of threads >= ~128?
+    static const Index l0_size = 4;
+    Index l0_ranges;
+
+    // Keep count of pending gemm tasks for each l0 range.
+    MaxSizeVector<std::atomic<int>> l0_state;  // [0, l0_ranges)
+
+    // Buffers allocated for each temporary block computation.
+    MaxSizeVector<Scalar*> block_buffers;  // [0, num_blocks)
+
+    template <int Alignment>
+    void processBlock(Index block_idx, Index begin, Index end) {
+      Scalar* buf = block_buffers[block_idx];
+
+      TENSOR_CONTRACTION_DISPATCH(evaluator->template evalGemmPartialWithoutOutputKernel, Alignment,
+                                  (buf, begin, end,
+                                   /*num_threads=*/internal::convert_index<int>(num_blocks)));
+
+      // Check if it was the last task in l0 range.
+      const Index l0_index = block_idx / l0_size;
+      const int v = l0_state[l0_index].fetch_sub(1);
+      eigen_assert(v >= 1);
+
+      // If we processed the last block of the range, we can aggregate all
+      // partial results into the first block of the range.
+      if (v == 1) {
+        const Index rng_size = actualRangeSize(l0_ranges, l0_size, l0_index);
+        const Index dst_block_idx = l0_index * l0_size;
+
+        if (rng_size == l0_size) {
+          addAllToBuffer<Alignment>(m * n,
+                                    /*src_buf0=*/block_buffers[dst_block_idx + 1],
+                                    /*src_buf1=*/block_buffers[dst_block_idx + 2],
+                                    /*src_buf2=*/block_buffers[dst_block_idx + 3],
+                                    /*dst_buf= */ block_buffers[dst_block_idx]);
+        } else {
+          // Aggregate blocks of potentially incomplete last range.
+          for (int i = 1; i < rng_size; ++i) {
+            addToBuffer<Alignment>(m * n,
+                                   /*src_buf=*/block_buffers[dst_block_idx + i],
+                                   /*dst_buf=*/block_buffers[dst_block_idx]);
+          }
+        }
+      }
+    }
+
+    // Aggregate partial sums from l0 ranges.
+    template <int Alignment>
+    void aggregateL0Blocks() const {
+      Index l0_index = 1;
+
+      for (; l0_index + 2 < l0_ranges; l0_index += 3) {
+        addAllToBuffer<Alignment>(m * n,
+                                  /*src_buf0=*/block_buffers[(l0_index + 0) * l0_size],
+                                  /*src_buf1=*/block_buffers[(l0_index + 1) * l0_size],
+                                  /*src_buf2=*/block_buffers[(l0_index + 2) * l0_size],
+                                  /*dst_buf= */ block_buffers[0]);
+      }
+
+      for (; l0_index < l0_ranges; ++l0_index) {
+        addToBuffer<Alignment>(m * n, block_buffers[l0_index * l0_size], block_buffers[0]);
+      }
+    }
+
+    void applyOutputKernel() const {
+      typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper;
+      evaluator->m_output_kernel(OutputMapper(result, m), evaluator->m_tensor_contraction_params,
+                                 static_cast<Eigen::Index>(0), static_cast<Eigen::Index>(0), m, n);
+    }
+
+    // Compute block size with accounting for potentially incomplete last block.
+    Index actualBlockSize(Index block_idx) const {
+      return block_idx + 1 < num_blocks ? block_size : k + block_size - block_size * num_blocks;
+    };
+
+    // Compute range size with accounting for potentially incomplete last range.
+    Index actualRangeSize(Index num_ranges, Index range_size, Index range_idx) const {
+      eigen_assert(range_idx < num_ranges);
+      return range_idx + 1 < num_ranges ? range_size : num_blocks + range_size - range_size * num_ranges;
+    };
+
+    template <int Alignment>
+    EIGEN_STRONG_INLINE static void addToBuffer(size_t n, const Scalar* src_buf, Scalar* tgt_buf) {
+      const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
+      size_t i = 0;
+      const size_t num_packets = n / output_packet_size;
+      for (; i < output_packet_size * num_packets; i += output_packet_size) {
+        const PacketReturnType src_val = internal::pload<PacketReturnType>(src_buf + i);
+        const PacketReturnType tgt_val = internal::ploadt<PacketReturnType, Alignment>(tgt_buf + i);
+        const PacketReturnType sum = internal::padd(src_val, tgt_val);
+        internal::pstoret<Scalar, PacketReturnType, Alignment>(tgt_buf + i, sum);
+      }
+      for (; i < n; ++i) {
+        tgt_buf[i] += src_buf[i];
+      }
+    }
+
+    template <int Alignment>
+    EIGEN_STRONG_INLINE static void addAllToBuffer(size_t n, const Scalar* src_buf0, const Scalar* src_buf1,
+                                                   const Scalar* src_buf2, Scalar* dst_buf) {
+      using ::Eigen::internal::padd;
+      using ::Eigen::internal::pload;
+      using ::Eigen::internal::ploadt;
+      using ::Eigen::internal::pstoret;
+
+      const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
+
+      size_t i = 0;
+      const size_t num_packets = n / output_packet_size;
+      for (; i < output_packet_size * num_packets; i += output_packet_size) {
+        const auto src_val0 = pload<PacketReturnType>(src_buf0 + i);
+        const auto src_val1 = pload<PacketReturnType>(src_buf1 + i);
+        const auto src_val2 = pload<PacketReturnType>(src_buf2 + i);
+
+        const auto dst_val = ploadt<PacketReturnType, Alignment>(dst_buf + i);
+        const auto sum = padd(padd(dst_val, src_val0), padd(src_val1, src_val2));
+
+        pstoret<Scalar, PacketReturnType, Alignment>(dst_buf + i, sum);
+      }
+      for (; i < n; ++i) {
+        dst_buf[i] += src_buf0[i] + src_buf1[i] + src_buf2[i];
+      }
+    }
+
+    template <int Alignment>
+    void eval(Barrier& barrier, Index start_block_idx, Index end_block_idx) {
+      while (end_block_idx - start_block_idx > 1) {
+        Index mid_block_idx = (start_block_idx + end_block_idx) / 2;
+        evaluator->m_device.enqueue([this, &barrier, mid_block_idx, end_block_idx]() {
+          eval<Alignment>(barrier, mid_block_idx, end_block_idx);
+        });
+        end_block_idx = mid_block_idx;
+      }
+
+      Index block_idx = start_block_idx;
+      Index block_start = block_idx * block_size;
+      Index block_end = block_start + actualBlockSize(block_idx);
+
+      processBlock<Alignment>(block_idx, block_start, block_end);
+      barrier.Notify();
+    }
+
+    template <int Alignment>
+    void evalAsync(Index start_block_idx, Index end_block_idx) {
+      while (end_block_idx - start_block_idx > 1) {
+        Index mid_block_idx = (start_block_idx + end_block_idx) / 2;
+        evaluator->m_device.enqueue(
+            [this, mid_block_idx, end_block_idx]() { 
+              evalAsync<Alignment>(mid_block_idx, end_block_idx);
+            });
+        end_block_idx = mid_block_idx;
+      }
+
+      Index block_idx = start_block_idx;
+
+      Index block_start = block_idx * block_size;
+      Index block_end = block_start + actualBlockSize(block_idx);
+
+      processBlock<Alignment>(block_idx, block_start, block_end);
+
+      int v = num_pending_blocks.fetch_sub(1);
+      eigen_assert(v >= 1);
+
+      if (v == 1) {
+        // Aggregate partial sums from l0 ranges.
+        aggregateL0Blocks<Alignment>();
+
+        // Apply output kernel.
+        applyOutputKernel();
+
+        // NOTE: If we call `done` callback before deleting this (context),
+        // it might deallocate Self* pointer captured by context, and we'll
+        // fail in destructor trying to deallocate temporary buffers.
+
+        // Move done call back from context before it will be destructed.
+        DoneCallback done_copy = std::move(done);
+
+        // We are confident that we are the last one who touches context.
+        delete this;
+
+        // Now safely call the done callback.
+        done_copy();
+      }
+    }
+
+    // Cost model doesn't capture well the cost associated with constructing
+    // tensor contraction mappers and computing loop bounds in gemm_pack_lhs
+    // and gemm_pack_rhs, so we specify minimum desired block size.
+    static Index blockSize(Index k, int num_threads) {
+      const auto round_up = [=](Index index) -> Index {
+        const Index kmultiple = packet_size <= 8 ? 8 : packet_size;
+        return numext::div_ceil<Index>(index, kmultiple) * kmultiple;
+      };
+
+      const Index target_block_size = round_up(numext::div_ceil<Index>(k, num_threads));
+      const Index desired_min_block_size = 12 * packet_size;
+
+      return numext::mini<Index>(k, numext::maxi<Index>(desired_min_block_size, target_block_size));
+    }
+
+    EvalShardedByInnerDimContext(const EvalShardedByInnerDimContext&) = delete;
+    void operator=(const EvalShardedByInnerDimContext&) = delete;
+  };
+
+  // ------------------------------------------------------------------------ //
+
+  // Below are the function used by evalProductImpl heuristics, trying to select
+  // optimcal parameters for parallelization algorithm.
+
+  // Decide whether we want to shard m x n contraction by columns or by rows.
+  static bool shardByCol(Index m, Index n, Index num_threads) {
+    // Note: we are comparing both n and m against Traits::nr, it is not
+    // a mistake. We are trying to figure out how both n and m will fit into
+    // the main sharding dimension.
+
+    // Sharding by column is the default
+    // ... unless there is enough data for vectorization over rows
+    if (m / num_threads >= Traits::nr &&
+        // and not enough data for vectorization over columns
+        (n / num_threads < Traits::nr ||
+         // ... or barely enough data for vectorization over columns,
+         // but it is not evenly dividable across threads
+         (n / num_threads < 4 * Traits::nr && (n % (num_threads * Traits::nr)) != 0 &&
+          // ... and it is evenly dividable across threads for rows
+          ((m % (num_threads * Traits::nr)) == 0 ||
+           // .. or it is not evenly dividable for both dimensions but
+           // there is much more data over rows so that corner effects are
+           // mitigated.
+           (m / n >= 6)))))
+      return false;
+    // Wait, or if matrices are just substantially prolonged over the other
+    // dimension.
+    if (n / num_threads < 16 * Traits::nr && m > n * 32) return false;
+    return true;
+  }
+
+  Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn, int num_threads, bool shard_by_col) const {
+    Index gm = 1;
+    Index gm1 = 1;
+    Index nm0 = numext::div_ceil(m, bm);
+    Index nm1 = nm0;
+    for (;;) {
+      // Find the next candidate for m grain size. It needs to result in
+      // different number of blocks. E.g. if we have 10 kernels, we want to try
+      // 5 and 10, but not 6, 7, 8 and 9.
+      while (gm1 <= nm0 && nm1 == numext::div_ceil(nm0, gm1)) gm1++;
+      if (gm1 > nm0) break;
+      // Check the candidate.
+      int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads, shard_by_col);
+      if (res < 0) break;
+      nm1 = numext::div_ceil(nm0, gm1);
+      if (res == 0) continue;
+      // Commit new grain size.
+      gm = gm1;
+    }
+    return gm;
+  }
+
+  Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm, int num_threads, bool shard_by_col) const {
+    Index gn = 1;
+    Index gn1 = 1;
+    Index nn0 = numext::div_ceil(n, bn);
+    Index nn1 = nn0;
+    for (;;) {
+      while (gn1 <= nn0 && nn1 == numext::div_ceil(nn0, gn1)) gn1++;
+      if (gn1 > nn0) break;
+      int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads, shard_by_col);
+      if (res < 0) break;
+      nn1 = numext::div_ceil(nn0, gn1);
+      if (res == 0) continue;
+      gn = gn1;
+    }
+    return gn;
+  }
+
+  // checkGrain checks whether grain (gm, gn) is suitable and is better than
+  // (oldgm, oldgn).
+  int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm, Index gn, Index oldgm, Index oldgn,
+                 int num_threads, bool shard_by_col) const {
+    const TensorOpCost cost = contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true);
+    double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(static_cast<double>(bm) * gm * bn * gn, cost);
+    // If the task is too small, then we agree on it regardless of anything
+    // else. Otherwise synchronization overheads will dominate.
+    if (taskSize < 1) return 1;
+    // If it is too large, then we reject it and all larger tasks.
+    if (taskSize > 2) return -1;
+    // Now we are in presumably good task size range.
+    // The main deciding factor here is parallelism. Consider that we have 12
+    // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes.
+    // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4
+    // of cores will be busy). While grain size 3 gives us 4 tasks, which gives
+    // us parallelism of 1 (we can load all cores).
+    Index nm0 = numext::div_ceil(m, bm);
+    Index nn0 = numext::div_ceil(n, bn);
+    Index new_tasks = numext::div_ceil(nm0, gm) * numext::div_ceil(nn0, gn);
+    double new_parallelism =
+        static_cast<double>(new_tasks) / (numext::div_ceil<Index>(new_tasks, num_threads) * num_threads);
+    Index old_tasks = numext::div_ceil(nm0, oldgm) * numext::div_ceil(nn0, oldgn);
+    double old_parallelism =
+        static_cast<double>(old_tasks) / (numext::div_ceil<Index>(old_tasks, num_threads) * num_threads);
+    if (new_parallelism > old_parallelism || new_parallelism == 1) return 1;
+    return 0;
+  }
+
+  TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk, bool shard_by_col,
+                               bool prepacked) const {
+    const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size, PacketType<RhsScalar, Device>::size);
+    const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
+    const double kd = static_cast<double>(bk);
+    double compute_bandwidth = computeBandwidth(false, bm, bn, bk);
+    // Computations.
+    TensorOpCost cost = TensorOpCost(0, 0, kd * compute_bandwidth, true, packed_size);
+    // Output stores.
+    cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
+    if (prepacked) {
+      // Packing and kernels are executed in different tasks. When we calculate
+      // task grain size we look only at kernel cost assuming that kernel
+      // is more expensive than packing.
+      return cost;
+    }
+    // Lhs/rhs loads + computations.
+    TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n);
+    TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m);
+    // Lhs packing memory cost does not contribute considerably to overall
+    // execution time because lhs is prefetched early and accessed sequentially.
+    if (shard_by_col)
+      lhsCost.dropMemoryCost();
+    else
+      rhsCost.dropMemoryCost();
+    return cost + lhsCost + rhsCost;
+  }
+
+  // Decide whether we want to shard m x k x n contraction over the inner
+  // (contraction) dimension (k).
+  static bool shardByInnerDim(Index m, Index n, Index k, int num_threads, int num_threads_by_k) {
+    std::ptrdiff_t bufsize = m * n * sizeof(Scalar);
+    bool shard_by_k = false;
+    if (n == 1 ||                                      // If mat*vec or...
+        num_threads_by_k < 2 ||                        // running single threaded or...
+        num_threads_by_k < num_threads ||              // sharding by k gives less parallelism or...
+        bufsize > l3CacheSize() / num_threads_by_k ||  // need more buffer space
+        // than L3 cache or...
+        k / num_threads_by_k < 2 * Traits::nr) {  // k per thread is tiny.
+      shard_by_k = false;
+    } else if (numext::maxi(m, n) / num_threads < Traits::nr ||  // both other dimensions are tiny or...
+                                                                 // k per thread is not small and...
+               (k / num_threads_by_k > 8 * Traits::nr &&
+                // one of the outer dimensions is tiny or sharding by k offers
+                // more parallelism.
+                (numext::mini(m, n) < 2 * Traits::nr || num_threads_by_k > num_threads))) {
+      shard_by_k = true;
+    }
+    return shard_by_k;
+  }
+
+  TensorOpCost contractionCostPerInnerDim(Index m, Index n, Index k) const {
+    // Compute cost.
+    const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
+    TensorOpCost cost(0, 0, (computeBandwidth(true, m, n, k) * m) * n, true, output_packet_size);
+    // Output stores.
+    cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size);
+    TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * m;
+    TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * n;
+    // Since the inner gemm kernel is always sharded by column, the lhs
+    // load cost is negligible.
+    lhsCost.dropMemoryCost();
+    return cost + lhsCost + rhsCost;
+  }
+
+  int numThreadsInnerDim(Index m, Index n, Index k) const {
+    const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size;
+    TensorOpCost cost = contractionCostPerInnerDim(m, n, k);
+    double total_parallel_cost = TensorCostModel<ThreadPoolDevice>::totalCost(k, cost);
+    // Cost of reduction step accumulating the m*n per-thread buffers into the
+    // result.
+    double reduction_cost =
+        TensorCostModel<ThreadPoolDevice>::totalCost(m * n, TensorOpCost(2, 1, 1, true, output_packet_size));
+    int num_threads = 1;
+    double min_cost = total_parallel_cost;
+    double kPerThreadOverHead = 3000;
+    double kFixedOverHead = 100000;
+    for (int nt = 2; nt <= this->m_device.numThreads(); nt += 2) {
+      double sequential_cost = kFixedOverHead + nt * (reduction_cost + kPerThreadOverHead);
+      double parallel_cost = total_parallel_cost / nt + sequential_cost;
+      if (parallel_cost < min_cost) {
+        num_threads = nt;
+        min_cost = parallel_cost;
+      }
+    }
+    return num_threads;
+  }
+
+  double computeBandwidth(bool shard_by_col, Index bm, Index bn, Index bk) const {
+    // Peak VFMA bandwidth is 0.5. However if we have not enough data for
+    // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined
+    // experimentally.
+    double computeBandwidth = bk == 1                                                                          ? 4.0
+                              : (shard_by_col ? bn : bm) < Traits::nr || (shard_by_col ? bm : bn) < Traits::mr ? 2.0
+                                                                                                               : 0.5;
+#ifndef EIGEN_VECTORIZE_FMA
+    // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors.
+    // However for MULPS/ADDPS we have dependent sequence of 2 such
+    // instructions,
+    // so overall bandwidth is 1.0.
+    if (computeBandwidth == 0.5) computeBandwidth = 1.0;
+#endif
+    return computeBandwidth;
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_USE_THREADS
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONTRACTION_THREAD_POOL_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
new file mode 100644
index 00000000..ccf96b70
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h
@@ -0,0 +1,416 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename TargetType, typename XprType>
+struct traits<TensorConversionOp<TargetType, XprType> > {
+  // Type promotion to handle the case where the types of the lhs and the rhs are different.
+  typedef TargetType Scalar;
+  typedef typename traits<XprType>::StorageKind StorageKind;
+  typedef typename traits<XprType>::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = traits<XprType>::NumDimensions;
+  static constexpr int Layout = traits<XprType>::Layout;
+  enum { Flags = 0 };
+  typedef typename TypeConversion<Scalar, typename traits<XprType>::PointerType>::type PointerType;
+};
+
+template <typename TargetType, typename XprType>
+struct eval<TensorConversionOp<TargetType, XprType>, Eigen::Dense> {
+  typedef const TensorConversionOp<TargetType, XprType>& type;
+};
+
+template <typename TargetType, typename XprType>
+struct nested<TensorConversionOp<TargetType, XprType>, 1,
+              typename eval<TensorConversionOp<TargetType, XprType> >::type> {
+  typedef TensorConversionOp<TargetType, XprType> type;
+};
+
+}  // end namespace internal
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int SrcCoeffRatio, int TgtCoeffRatio>
+struct PacketConverter;
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, 1> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {}
+
+  template <int LoadMode, typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+    return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<LoadMode>(index));
+  }
+
+ private:
+  const TensorEvaluator& m_impl;
+};
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 2, 1> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {}
+
+  template <int LoadMode, typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+    const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+
+    SrcPacket src1 = m_impl.template packet<LoadMode>(index);
+    SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize);
+    TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2);
+    return result;
+  }
+
+ private:
+  const TensorEvaluator& m_impl;
+};
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 4, 1> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {}
+
+  template <int LoadMode, typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+    const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+
+    SrcPacket src1 = m_impl.template packet<LoadMode>(index);
+    SrcPacket src2 = m_impl.template packet<LoadMode>(index + SrcPacketSize);
+    SrcPacket src3 = m_impl.template packet<LoadMode>(index + 2 * SrcPacketSize);
+    SrcPacket src4 = m_impl.template packet<LoadMode>(index + 3 * SrcPacketSize);
+    TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2, src3, src4);
+    return result;
+  }
+
+ private:
+  const TensorEvaluator& m_impl;
+};
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 8, 1> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl) : m_impl(impl) {}
+
+  template <int LoadMode, typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+    const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+
+    SrcPacket src1 = m_impl.template packet<LoadMode>(index);
+    SrcPacket src2 = m_impl.template packet<LoadMode>(index + 1 * SrcPacketSize);
+    SrcPacket src3 = m_impl.template packet<LoadMode>(index + 2 * SrcPacketSize);
+    SrcPacket src4 = m_impl.template packet<LoadMode>(index + 3 * SrcPacketSize);
+    SrcPacket src5 = m_impl.template packet<LoadMode>(index + 4 * SrcPacketSize);
+    SrcPacket src6 = m_impl.template packet<LoadMode>(index + 5 * SrcPacketSize);
+    SrcPacket src7 = m_impl.template packet<LoadMode>(index + 6 * SrcPacketSize);
+    SrcPacket src8 = m_impl.template packet<LoadMode>(index + 7 * SrcPacketSize);
+    TgtPacket result = internal::pcast<SrcPacket, TgtPacket>(src1, src2, src3, src4, src5, src6, src7, src8);
+    return result;
+  }
+
+ private:
+  const TensorEvaluator& m_impl;
+};
+
+template <typename TensorEvaluator, typename SrcPacket, typename TgtPacket, int TgtCoeffRatio>
+struct PacketConverter<TensorEvaluator, SrcPacket, TgtPacket, 1, TgtCoeffRatio> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketConverter(const TensorEvaluator& impl)
+      : m_impl(impl), m_maxIndex(impl.dimensions().TotalSize()) {}
+
+  template <int LoadMode, typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TgtPacket packet(Index index) const {
+    const int SrcPacketSize = internal::unpacket_traits<SrcPacket>::size;
+    // Only call m_impl.packet() when we have direct access to the underlying data. This
+    // ensures that we don't compute the subexpression twice. We may however load some
+    // coefficients twice, but in practice this doesn't negatively impact performance.
+    if (m_impl.data() && (index + SrcPacketSize < m_maxIndex)) {
+      // Force unaligned memory loads since we can't ensure alignment anymore
+      return internal::pcast<SrcPacket, TgtPacket>(m_impl.template packet<Unaligned>(index));
+    } else {
+      const int TgtPacketSize = internal::unpacket_traits<TgtPacket>::size;
+      typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
+      typedef typename internal::unpacket_traits<TgtPacket>::type TgtType;
+      internal::scalar_cast_op<SrcType, TgtType> converter;
+      EIGEN_ALIGN_MAX typename internal::unpacket_traits<TgtPacket>::type values[TgtPacketSize];
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < TgtPacketSize; ++i) {
+        values[i] = converter(m_impl.coeff(index + i));
+      }
+      TgtPacket rslt = internal::pload<TgtPacket>(values);
+      return rslt;
+    }
+  }
+
+ private:
+  const TensorEvaluator& m_impl;
+  const typename TensorEvaluator::Index m_maxIndex;
+};
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor conversion class. This class makes it possible to vectorize
+ * type casting operations when the number of scalars per packet in the source
+ * and the destination type differ
+ */
+template <typename TargetType, typename XprType>
+class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename internal::traits<TensorConversionOp>::Scalar Scalar;
+  typedef typename internal::traits<TensorConversionOp>::StorageKind StorageKind;
+  typedef typename internal::traits<TensorConversionOp>::Index Index;
+  typedef typename internal::nested<TensorConversionOp>::type Nested;
+  typedef Scalar CoeffReturnType;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConversionOp(const XprType& xpr) : m_xpr(xpr) {}
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+ protected:
+  typename XprType::Nested m_xpr;
+};
+
+template <bool SameType, typename Eval, typename EvalPointerType>
+struct ConversionSubExprEval {
+  static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType) {
+    impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+};
+
+template <typename Eval, typename EvalPointerType>
+struct ConversionSubExprEval<true, Eval, EvalPointerType> {
+  static EIGEN_STRONG_INLINE bool run(Eval& impl, EvalPointerType data) { return impl.evalSubExprsIfNeeded(data); }
+};
+
+#ifdef EIGEN_USE_THREADS
+template <bool SameType, typename Eval, typename EvalPointerType, typename EvalSubExprsCallback>
+struct ConversionSubExprEvalAsync {
+  static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType, EvalSubExprsCallback done) {
+    impl.evalSubExprsIfNeededAsync(nullptr, std::move(done));
+  }
+};
+
+template <typename Eval, typename EvalPointerType, typename EvalSubExprsCallback>
+struct ConversionSubExprEvalAsync<true, Eval, EvalPointerType, EvalSubExprsCallback> {
+  static EIGEN_STRONG_INLINE void run(Eval& impl, EvalPointerType data, EvalSubExprsCallback done) {
+    impl.evalSubExprsIfNeededAsync(data, std::move(done));
+  }
+};
+#endif
+
+namespace internal {
+
+template <typename SrcType, typename TargetType, bool IsSameT>
+struct CoeffConv {
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl,
+                                                              Index index) {
+    internal::scalar_cast_op<SrcType, TargetType> converter;
+    return converter(impl.coeff(index));
+  }
+};
+
+template <typename SrcType, typename TargetType>
+struct CoeffConv<SrcType, TargetType, true> {
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetType run(const TensorEvaluator<ArgType, Device>& impl,
+                                                              Index index) {
+    return impl.coeff(index);
+  }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode, bool ActuallyVectorize, bool IsSameT>
+struct PacketConv {
+  typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
+  typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
+
+  static constexpr int PacketSize = internal::unpacket_traits<TargetPacket>::size;
+
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl,
+                                                                Index index) {
+    internal::scalar_cast_op<SrcType, TargetType> converter;
+    EIGEN_ALIGN_MAX std::remove_const_t<TargetType> values[PacketSize];
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = converter(impl.coeff(index + i));
+    }
+    TargetPacket rslt = internal::pload<TargetPacket>(values);
+    return rslt;
+  }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode, bool IsSameT>
+struct PacketConv<SrcPacket, TargetPacket, LoadMode, true, IsSameT> {
+  typedef typename internal::unpacket_traits<SrcPacket>::type SrcType;
+  typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
+
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl,
+                                                                Index index) {
+    const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
+    const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
+    PacketConverter<TensorEvaluator<ArgType, Device>, SrcPacket, TargetPacket, SrcCoeffRatio, TgtCoeffRatio> converter(
+        impl);
+    return converter.template packet<LoadMode>(index);
+  }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode>
+struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/false, /*IsSameT=*/true> {
+  typedef typename internal::unpacket_traits<TargetPacket>::type TargetType;
+  static constexpr int PacketSize = internal::unpacket_traits<TargetPacket>::size;
+
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl,
+                                                                Index index) {
+    EIGEN_ALIGN_MAX std::remove_const_t<TargetType> values[PacketSize];
+    for (int i = 0; i < PacketSize; ++i) values[i] = impl.coeff(index + i);
+    return internal::pload<TargetPacket>(values);
+  }
+};
+
+template <typename SrcPacket, typename TargetPacket, int LoadMode>
+struct PacketConv<SrcPacket, TargetPacket, LoadMode, /*ActuallyVectorize=*/true, /*IsSameT=*/true> {
+  template <typename ArgType, typename Device>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TargetPacket run(const TensorEvaluator<ArgType, Device>& impl,
+                                                                Index index) {
+    return impl.template packet<LoadMode>(index);
+  }
+};
+
+}  // namespace internal
+
+// Eval as rvalue
+template <typename TargetType, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> {
+  typedef TensorConversionOp<TargetType, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  typedef TargetType Scalar;
+  typedef TargetType CoeffReturnType;
+  typedef internal::remove_all_t<typename internal::traits<ArgType>::Scalar> SrcType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef typename PacketType<SrcType, Device>::type PacketSourceType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static constexpr bool IsSameType = internal::is_same<TargetType, SrcType>::value;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  enum {
+    IsAligned = false,
+    PacketAccess =
+#ifndef EIGEN_USE_SYCL
+        true,
+#else
+        TensorEvaluator<ArgType, Device>::PacketAccess &
+        internal::type_casting_traits<SrcType, TargetType>::VectorizedCast,
+#endif
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    RawAccess = false
+  };
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  static constexpr int NumDims = internal::array_size<Dimensions>::value;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock ArgTensorBlock;
+
+  struct TensorConversionOpBlockFactory {
+    template <typename ArgXprType>
+    struct XprType {
+      typedef TensorConversionOp<TargetType, const ArgXprType> type;
+    };
+
+    template <typename ArgXprType>
+    typename XprType<ArgXprType>::type expr(const ArgXprType& expr) const {
+      return typename XprType<ArgXprType>::type(expr);
+    }
+  };
+
+  typedef internal::TensorUnaryExprBlock<TensorConversionOpBlockFactory, ArgTensorBlock> TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    return ConversionSubExprEval<IsSameType, TensorEvaluator<ArgType, Device>, EvaluatorPointerType>::run(m_impl, data);
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) {
+    ConversionSubExprEvalAsync<IsSameType, TensorEvaluator<ArgType, Device>, EvaluatorPointerType,
+                               EvalSubExprsCallback>::run(m_impl, data, std::move(done));
+  }
+#endif
+
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return internal::CoeffConv<SrcType, TargetType, IsSameType>::run(m_impl, index);
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    // If we are not going to do the cast, we just need to check that base
+    // TensorEvaluator has packet access. Otherwise we also need to make sure,
+    // that we have an implementation of vectorized cast.
+    const bool Vectorizable = IsSameType ? TensorEvaluator<ArgType, Device>::PacketAccess
+                                         : int(TensorEvaluator<ArgType, Device>::PacketAccess) &
+                                               int(internal::type_casting_traits<SrcType, TargetType>::VectorizedCast);
+
+    return internal::PacketConv<PacketSourceType, PacketReturnType, LoadMode, Vectorizable, IsSameType>::run(m_impl,
+                                                                                                             index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    const double cast_cost = TensorOpCost::CastCost<SrcType, TargetType>();
+    if (vectorized) {
+      const double SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio;
+      const double TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio;
+      return m_impl.costPerCoeff(vectorized) * (SrcCoeffRatio / PacketSize) +
+             TensorOpCost(0, 0, TgtCoeffRatio * (cast_cost / PacketSize));
+    } else {
+      return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, cast_cost);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return m_impl.getResourceRequirements();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    return TensorBlock(m_impl.block(desc, scratch), TensorConversionOpBlockFactory());
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+  /// required by sycl in order to extract the sycl accessor
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+ protected:
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVERSION_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
new file mode 100644
index 00000000..016498f0
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -0,0 +1,1119 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Index, typename InputDims, int NumKernelDims, int Layout>
+class IndexMapper {
+ public:
+  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
+              const array<Index, NumKernelDims>& indices) {
+    array<Index, NumDims> dimensions = input_dims;
+    for (int i = 0; i < NumKernelDims; ++i) {
+      const Index index = indices[i];
+      const Index input_dim = input_dims[index];
+      const Index kernel_dim = kernel_dims[i];
+      const Index result_dim = input_dim - kernel_dim + 1;
+      dimensions[index] = result_dim;
+    }
+
+    array<Index, NumDims> inputStrides;
+    array<Index, NumDims> outputStrides;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      inputStrides[0] = 1;
+      outputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
+        outputStrides[i] = outputStrides[i - 1] * dimensions[i - 1];
+      }
+    } else {
+      inputStrides[NumDims - 1] = 1;
+      outputStrides[NumDims - 1] = 1;
+      for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
+        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
+        outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
+      }
+    }
+
+    array<Index, NumDims> gpuInputDimensions;
+    array<Index, NumDims> gpuOutputDimensions;
+    array<Index, NumDims> tmp = dimensions;
+    array<Index, NumDims> ordering;
+    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
+    for (int i = 0; i < NumKernelDims; ++i) {
+      const Index index = i + offset;
+      ordering[index] = indices[i];
+      tmp[indices[i]] = -1;
+      gpuInputDimensions[index] = input_dims[indices[i]];
+      gpuOutputDimensions[index] = dimensions[indices[i]];
+    }
+
+    int written = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? NumKernelDims : 0;
+    for (int i = 0; i < NumDims; ++i) {
+      if (tmp[i] >= 0) {
+        ordering[written] = i;
+        gpuInputDimensions[written] = input_dims[i];
+        gpuOutputDimensions[written] = dimensions[i];
+        ++written;
+      }
+    }
+
+    for (int i = 0; i < NumDims; ++i) {
+      m_inputStrides[i] = inputStrides[ordering[i]];
+      m_outputStrides[i] = outputStrides[ordering[i]];
+    }
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < NumDims; ++i) {
+        if (i > NumKernelDims) {
+          m_gpuInputStrides[i] = m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
+          m_gpuOutputStrides[i] = m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
+        } else {
+          m_gpuInputStrides[i] = 1;
+          m_gpuOutputStrides[i] = 1;
+        }
+      }
+    } else {
+      for (int i = NumDims - 1; i >= 0; --i) {
+        if (static_cast<size_t>(i + 1) < offset) {
+          m_gpuInputStrides[i] = m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
+          m_gpuOutputStrides[i] = m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
+        } else {
+          m_gpuInputStrides[i] = 1;
+          m_gpuOutputStrides[i] = 1;
+        }
+      }
+    }
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
+    Index inputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int d = NumDims - 1; d > NumKernelDims; --d) {
+        const Index idx = p / m_gpuInputStrides[d];
+        inputIndex += idx * m_inputStrides[d];
+        p -= idx * m_gpuInputStrides[d];
+      }
+      if (NumKernelDims < NumDims) {
+        inputIndex += p * m_inputStrides[NumKernelDims];
+      }
+    } else {
+      std::ptrdiff_t limit = 0;
+      if (NumKernelDims < NumDims) {
+        limit = NumDims - NumKernelDims - 1;
+      }
+      for (int d = 0; d < limit; ++d) {
+        const Index idx = p / m_gpuInputStrides[d];
+        inputIndex += idx * m_inputStrides[d];
+        p -= idx * m_gpuInputStrides[d];
+      }
+      inputIndex += p * m_inputStrides[limit];
+    }
+    return inputIndex;
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
+    Index outputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int d = NumDims - 1; d > NumKernelDims; --d) {
+        const Index idx = p / m_gpuOutputStrides[d];
+        outputIndex += idx * m_outputStrides[d];
+        p -= idx * m_gpuOutputStrides[d];
+      }
+      if (NumKernelDims < NumDims) {
+        outputIndex += p * m_outputStrides[NumKernelDims];
+      }
+    } else {
+      std::ptrdiff_t limit = 0;
+      if (NumKernelDims < NumDims) {
+        limit = NumDims - NumKernelDims - 1;
+      }
+      for (int d = 0; d < limit; ++d) {
+        const Index idx = p / m_gpuOutputStrides[d];
+        outputIndex += idx * m_outputStrides[d];
+        p -= idx * m_gpuOutputStrides[d];
+      }
+      outputIndex += p * m_outputStrides[limit];
+    }
+    return outputIndex;
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
+    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
+    return i * m_inputStrides[offset];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
+    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
+    return i * m_outputStrides[offset];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
+    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
+    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
+    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
+    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
+    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
+    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] + k * m_inputStrides[offset + 2];
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
+    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - NumKernelDims;
+    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] + k * m_outputStrides[offset + 2];
+  }
+
+ private:
+  static constexpr int NumDims = internal::array_size<InputDims>::value;
+  array<Index, NumDims> m_inputStrides;
+  array<Index, NumDims> m_outputStrides;
+  array<Index, NumDims> m_gpuInputStrides;
+  array<Index, NumDims> m_gpuOutputStrides;
+};
+
+template <typename Dimensions, typename InputXprType, typename KernelXprType>
+struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> > {
+  // Type promotion to handle the case where the types of the lhs and the rhs are different.
+  typedef typename promote_storage_type<typename InputXprType::Scalar, typename KernelXprType::Scalar>::ret Scalar;
+  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
+                                        typename traits<KernelXprType>::StorageKind>::ret StorageKind;
+  typedef typename promote_index_type<typename traits<InputXprType>::Index, typename traits<KernelXprType>::Index>::type
+      Index;
+  typedef typename InputXprType::Nested LhsNested;
+  typedef typename KernelXprType::Nested RhsNested;
+  typedef std::remove_reference_t<LhsNested> LhsNested_;
+  typedef std::remove_reference_t<RhsNested> RhsNested_;
+  static constexpr int NumDimensions = traits<InputXprType>::NumDimensions;
+  static constexpr int Layout = traits<InputXprType>::Layout;
+  typedef std::conditional_t<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
+                             typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>
+      PointerType;
+
+  enum { Flags = 0 };
+};
+
+template <typename Dimensions, typename InputXprType, typename KernelXprType>
+struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense> {
+  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
+};
+
+template <typename Dimensions, typename InputXprType, typename KernelXprType>
+struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1,
+              typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type> {
+  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
+};
+
+}  // end namespace internal
+
+/** Tensor convolution class.
+ * \ingroup CXX11_Tensor_Module
+ */
+template <typename Indices, typename InputXprType, typename KernelXprType>
+class TensorConvolutionOp
+    : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
+                                                  typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel,
+                                                            const Indices& dims)
+      : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Indices& indices() const { return m_indices; }
+
+  /** \returns the nested expressions */
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<typename InputXprType::Nested>& inputExpression()
+      const {
+    return m_input_xpr;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<typename KernelXprType::Nested>& kernelExpression()
+      const {
+    return m_kernel_xpr;
+  }
+
+ protected:
+  typename InputXprType::Nested m_input_xpr;
+  typename KernelXprType::Nested m_kernel_xpr;
+  const Indices m_indices;
+};
+
+template <typename Indices, typename InputArgType, typename KernelArgType, typename Device>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device> {
+  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+  static constexpr int NumDims =
+      internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
+  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int Layout = TensorEvaluator<InputArgType, Device>::Layout;
+  enum {
+    IsAligned =
+        int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),
+    PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) &
+                   int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_inputImpl(op.inputExpression(), device),
+        m_kernelImpl(op.kernelExpression(), device),
+        m_kernelArg(op.kernelExpression()),
+        m_kernel(NULL),
+        m_local_kernel(false),
+        m_device(device) {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) ==
+                         static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
+    const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_inputStride[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
+      }
+    } else {
+      m_inputStride[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
+      }
+    }
+
+    m_dimensions = m_inputImpl.dimensions();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < NumKernelDims; ++i) {
+        const Index index = op.indices()[i];
+        const Index input_dim = input_dims[index];
+        const Index kernel_dim = kernel_dims[i];
+        const Index result_dim = input_dim - kernel_dim + 1;
+        m_dimensions[index] = result_dim;
+        if (i > 0) {
+          m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
+        } else {
+          m_kernelStride[0] = 1;
+        }
+        m_indexStride[i] = m_inputStride[index];
+      }
+
+      m_outputStride[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
+      }
+    } else {
+      for (int i = NumKernelDims - 1; i >= 0; --i) {
+        const Index index = op.indices()[i];
+        const Index input_dim = input_dims[index];
+        const Index kernel_dim = kernel_dims[i];
+        const Index result_dim = input_dim - kernel_dim + 1;
+        m_dimensions[index] = result_dim;
+        if (i < NumKernelDims - 1) {
+          m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
+        } else {
+          m_kernelStride[NumKernelDims - 1] = 1;
+        }
+        m_indexStride[i] = m_inputStride[index];
+      }
+
+      m_outputStride[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
+    m_inputImpl.evalSubExprsIfNeeded(NULL);
+    preloadKernel();
+    return true;
+  }
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_inputImpl.cleanup();
+    if (m_local_kernel) {
+      m_device.deallocate((void*)m_kernel);
+      m_local_kernel = false;
+    }
+    m_kernel = NULL;
+  }
+
+  void evalTo(typename XprType::Scalar* buffer) {
+    evalSubExprsIfNeeded(NULL);
+    for (int i = 0; i < dimensions().TotalSize(); ++i) {
+      buffer[i] += coeff(i);
+    }
+    cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    CoeffReturnType result = CoeffReturnType(0);
+    convolve(firstInput(index), 0, NumKernelDims - 1, result);
+    return result;
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const {
+    Index indices[2] = {index, index + PacketSize - 1};
+    Index startInputs[2] = {0, 0};
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx0 = indices[0] / m_outputStride[i];
+        const Index idx1 = indices[1] / m_outputStride[i];
+        startInputs[0] += idx0 * m_inputStride[i];
+        startInputs[1] += idx1 * m_inputStride[i];
+        indices[0] -= idx0 * m_outputStride[i];
+        indices[1] -= idx1 * m_outputStride[i];
+      }
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx0 = indices[0] / m_outputStride[i];
+        const Index idx1 = indices[1] / m_outputStride[i];
+        startInputs[0] += idx0 * m_inputStride[i];
+        startInputs[1] += idx1 * m_inputStride[i];
+        indices[0] -= idx0 * m_outputStride[i];
+        indices[1] -= idx1 * m_outputStride[i];
+      }
+    }
+    startInputs[0] += indices[0];
+    startInputs[1] += indices[1];
+
+    if (startInputs[1] - startInputs[0] == PacketSize - 1) {
+      PacketReturnType result = internal::pset1<PacketReturnType>(0);
+      convolvePacket(startInputs[0], 0, NumKernelDims - 1, result);
+      return result;
+    } else {
+      EIGEN_ALIGN_MAX Scalar data[PacketSize];
+      data[0] = Scalar(0);
+      convolve(startInputs[0], 0, NumKernelDims - 1, data[0]);
+      for (int i = 1; i < PacketSize - 1; ++i) {
+        data[i] = Scalar(0);
+        convolve(firstInput(index + i), 0, NumKernelDims - 1, data[i]);
+      }
+      data[PacketSize - 1] = Scalar(0);
+      convolve(startInputs[1], 0, NumKernelDims - 1, data[PacketSize - 1]);
+      return internal::pload<PacketReturnType>(data);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+    // We ignore the use of fused multiply-add.
+    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+    const double firstIndex_compute_cost =
+        NumDims *
+        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
+    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
+                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ private:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+    Index startInput = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = index / m_outputStride[i];
+        startInput += idx * m_inputStride[i];
+        index -= idx * m_outputStride[i];
+      }
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = index / m_outputStride[i];
+        startInput += idx * m_inputStride[i];
+        index -= idx * m_outputStride[i];
+      }
+    }
+    startInput += index;
+    return startInput;
+  }
+
+  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
+    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
+      const Index input = firstIndex + j * m_indexStride[DimIndex];
+      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
+      if (DimIndex > 0) {
+        convolve(input, kernel, DimIndex - 1, accum);
+      } else {
+        accum += m_inputImpl.coeff(input) * m_kernel[kernel];
+      }
+    }
+  }
+
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
+    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
+      const Index input = firstIndex + j * m_indexStride[DimIndex];
+      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
+      if (DimIndex > 0) {
+        convolvePacket(input, kernel, DimIndex - 1, accum);
+      } else {
+        accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input),
+                                        internal::pset1<Packet>(m_kernel[kernel]), accum);
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
+    // Don't make a local copy of the kernel unless we have to (i.e. it's an
+    // expression that needs to be evaluated)
+    const Scalar* in_place = m_kernelImpl.data();
+    if (in_place) {
+      m_kernel = in_place;
+      m_local_kernel = false;
+    } else {
+      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+      Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
+      typedef TensorEvalToOp<const KernelArgType> EvalTo;
+      EvalTo evalToTmp(local, m_kernelArg);
+      const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
+      internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);
+
+      m_kernel = local;
+      m_local_kernel = true;
+    }
+  }
+
+  array<Index, NumDims> m_inputStride;
+  array<Index, NumDims> m_outputStride;
+
+  array<Index, NumKernelDims> m_indexStride;
+  array<Index, NumKernelDims> m_kernelStride;
+  TensorEvaluator<InputArgType, Device> m_inputImpl;
+  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
+  Dimensions m_dimensions;
+
+  KernelArgType m_kernelArg;
+  const Scalar* m_kernel;
+  bool m_local_kernel;
+  const Device EIGEN_DEVICE_REF m_device;
+};
+
+// Use an optimized implementation of the evaluation code for GPUs whenever possible.
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+
+template <int StaticKernelSize>
+struct GetKernelSize {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int /*kernelSize*/) const { return StaticKernelSize; }
+};
+template <>
+struct GetKernelSize<Dynamic> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(const int kernelSize) const { return kernelSize; }
+};
+
+template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
+    InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> indexMapper,
+    const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int kernelSize,
+    float* buffer) {
+#if defined(EIGEN_HIPCC)
+  HIP_DYNAMIC_SHARED(float, s)
+#else
+  extern __shared__ float s[];
+#endif
+
+  const int first_x = blockIdx.x * maxX;
+  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
+  const int num_x_output = last_x - first_x + 1;
+
+  const int first_plane = blockIdx.y * blockDim.y;
+  const int plane_stride = blockDim.y * gridDim.y;
+
+  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
+    // Load inputs to shared memory
+    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
+    const int plane_kernel_offset = threadIdx.y * num_x_input;
+#pragma unroll
+    for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+      const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x);
+      s[i + plane_kernel_offset] = eval.coeff(tensor_index);
+    }
+
+    __syncthreads();
+
+    // Compute the convolution
+    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
+
+#pragma unroll
+    for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+      const int kernel_offset = plane_kernel_offset + i;
+      float result = 0.0f;
+#pragma unroll
+      for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
+        result += s[k + kernel_offset] * kernel[k];
+      }
+      const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x);
+      buffer[tensor_index] = result;
+    }
+    __syncthreads();
+  }
+};
+
+template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
+    InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> indexMapper,
+    const float* __restrict kernel, const int numPlanes, const int numX, const int maxX, const int numY, const int maxY,
+    const int kernelSizeX, const int kernelSizeY, float* buffer) {
+#if defined(EIGEN_HIPCC)
+  HIP_DYNAMIC_SHARED(float, s)
+#else
+  extern __shared__ float s[];
+#endif
+
+  const int first_x = blockIdx.x * maxX;
+  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
+  const int num_x_output = last_x - first_x + 1;
+
+  const int first_y = blockIdx.y * maxY;
+  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
+  const int num_y_output = last_y - first_y + 1;
+
+  const int first_plane = blockIdx.z * blockDim.z;
+  const int plane_stride = blockDim.z * gridDim.z;
+
+  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
+    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
+    const int plane_kernel_offset = threadIdx.z * num_y_input;
+
+// Load inputs to shared memory
+#pragma unroll
+    for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
+      const int input_offset = num_x_input * (j + plane_kernel_offset);
+#pragma unroll
+      for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+        const int tensor_index =
+            plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + first_x, j + first_y);
+        s[i + input_offset] = eval.coeff(tensor_index);
+      }
+    }
+
+    __syncthreads();
+
+    // Convolution
+    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
+
+#pragma unroll
+    for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
+#pragma unroll
+      for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+        float result = 0.0f;
+#pragma unroll
+        for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
+          const int kernel_offset = kernelSizeX * l;
+          const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
+#pragma unroll
+          for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
+            result += s[k + input_offset] * kernel[k + kernel_offset];
+          }
+        }
+        const int tensor_index =
+            plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i + first_x, j + first_y);
+        buffer[tensor_index] = result;
+      }
+    }
+
+    __syncthreads();
+  }
+};
+
+template <typename InputEvaluator, typename Index, typename InputDims>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
+    InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> indexMapper,
+    const float* __restrict kernel, const size_t numPlanes, const size_t numX, const size_t maxX, const size_t numY,
+    const size_t maxY, const size_t numZ, const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
+    const size_t kernelSizeZ, float* buffer) {
+#if defined(EIGEN_HIPCC)
+  HIP_DYNAMIC_SHARED(float, s)
+#else
+  extern __shared__ float s[];
+#endif
+
+  // Load inputs to shared memory
+  const int first_x = blockIdx.x * maxX;
+  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
+  const int num_x_input = last_x - first_x + kernelSizeX;
+
+  const int first_y = blockIdx.y * maxY;
+  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
+  const int num_y_input = last_y - first_y + kernelSizeY;
+
+  const int first_z = blockIdx.z * maxZ;
+  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
+  const int num_z_input = last_z - first_z + kernelSizeZ;
+
+  for (int p = 0; p < numPlanes; ++p) {
+    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
+    const int plane_kernel_offset = 0;
+
+    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
+      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
+        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
+          const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
+                                                            i + first_x, j + first_y, k + first_z);
+          s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
+        }
+      }
+    }
+
+    __syncthreads();
+
+    // Convolution
+    const int num_z_output = last_z - first_z + 1;
+    const int num_y_output = last_y - first_y + 1;
+    const int num_x_output = last_x - first_x + 1;
+    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
+
+    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
+      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
+        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
+          float result = 0.0f;
+          for (int n = 0; n < kernelSizeZ; ++n) {
+            for (int m = 0; m < kernelSizeY; ++m) {
+              for (int l = 0; l < kernelSizeX; ++l) {
+                result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] *
+                          kernel[l + kernelSizeX * (m + kernelSizeY * n)];
+              }
+            }
+          }
+          const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(
+                                                             i + first_x, j + first_y, k + first_z);
+          buffer[tensor_index] = result;
+        }
+      }
+    }
+    __syncthreads();
+  }
+};
+
+template <typename Indices, typename InputArgType, typename KernelArgType>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice> {
+  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+  static constexpr int NumDims =
+      internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
+  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
+
+  static constexpr int Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout;
+  enum {
+    IsAligned =
+        TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
+    PacketAccess = false,
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  TensorEvaluator(const XprType& op, const GpuDevice& device)
+      : m_inputImpl(op.inputExpression(), device),
+        m_kernelImpl(op.kernelExpression(), device),
+        m_kernelArg(op.kernelExpression()),
+        m_indices(op.indices()),
+        m_buf(NULL),
+        m_kernel(NULL),
+        m_local_kernel(false),
+        m_device(device) {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) ==
+                         static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
+    const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
+
+    m_dimensions = m_inputImpl.dimensions();
+    for (int i = 0; i < NumKernelDims; ++i) {
+      const Index index = op.indices()[i];
+      const Index input_dim = input_dims[index];
+      const Index kernel_dim = kernel_dims[i];
+      const Index result_dim = input_dim - kernel_dim + 1;
+      m_dimensions[index] = result_dim;
+    }
+  }
+
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
+  typedef typename InputArgType::Scalar Scalar;
+  static constexpr int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
+    preloadKernel();
+    m_inputImpl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      executeEval(data);
+      return false;
+    } else {
+      m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
+      executeEval(m_buf);
+      return true;
+    }
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_inputImpl.cleanup();
+    if (m_buf) {
+      m_device.deallocate(m_buf);
+      m_buf = NULL;
+    }
+    if (m_local_kernel) {
+      m_device.deallocate((void*)m_kernel);
+      m_local_kernel = false;
+    }
+    m_kernel = NULL;
+  }
+
+  EIGEN_STRONG_INLINE void preloadKernel() {
+    // Don't make a local copy of the kernel unless we have to (i.e. it's an
+    // expression that needs to be evaluated)
+    const Scalar* in_place = m_kernelImpl.data();
+    if (in_place) {
+      m_kernel = in_place;
+      m_local_kernel = false;
+    } else {
+      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
+      typedef TensorEvalToOp<const KernelArgType> EvalTo;
+      EvalTo evalToTmp(local, m_kernelArg);
+      const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
+      internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
+
+      m_kernel = local;
+      m_local_kernel = true;
+    }
+  }
+
+  static unsigned int ceil(unsigned int num, unsigned int denom) {
+    const unsigned int rounded_toward_zero = num / denom;
+    if (num > rounded_toward_zero * denom) {
+      return rounded_toward_zero + 1;
+    }
+    return rounded_toward_zero;
+  }
+
+  void executeEval(Scalar* data) const {
+    typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
+
+    const int maxSharedMem = m_device.sharedMemPerBlock();
+    const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
+    const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
+    const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
+    const int warpSize = 32;
+
+    switch (NumKernelDims) {
+      case 1: {
+        const int kernel_size = m_kernelImpl.dimensions().TotalSize();
+
+        const int numX = dimensions()[m_indices[0]];
+        const int numP = dimensions().TotalSize() / numX;
+        int maxX;
+        dim3 block_size;
+
+        const int single_stride_dim =
+            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : m_inputImpl.dimensions().rank() - 1;
+        if (m_indices[0] == single_stride_dim) {
+          // Maximum the reuse
+          const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
+          maxX = numext::mini<int>(inner_dim, numX);
+          const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
+          block_size.x = numext::mini(maxThreadsPerBlock, maxX);
+          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
+        } else {
+          // Read as much as possible alongside the inner most dimension, that is the plane
+          const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
+          const int maxP = numext::mini<int>(inner_dim, numP);
+          maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
+
+          block_size.x = numext::mini(warpSize, maxX);
+          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
+        }
+
+        const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
+        gpu_assert(shared_mem <= maxSharedMem);
+
+        const int num_x_blocks = ceil(numX, maxX);
+        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+        const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
+
+        dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
+
+        // cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << "
+        // num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: "
+        // << shared_mem << " in stream " << m_device.stream() << endl;
+
+        const array<Index, 1> indices{m_indices[0]};
+        const array<Index, 1> kernel_dims{m_kernelImpl.dimensions()[0]};
+        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+        switch (kernel_size) {
+          case 4: {
+            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>),
+                              num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
+                              numX, maxX, 4, data);
+            break;
+          }
+          case 7: {
+            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>),
+                              num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
+                              numX, maxX, 7, data);
+            break;
+          }
+          default: {
+            LAUNCH_GPU_KERNEL(
+                (EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>),
+                num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
+                kernel_size, data);
+          }
+        }
+        break;
+      }
+
+      case 2: {
+        const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
+        const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
+        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
+        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
+
+        const int numX = dimensions()[m_indices[idxX]];
+        const int numY = dimensions()[m_indices[idxY]];
+        const int numP = dimensions().TotalSize() / (numX * numY);
+
+        const float scaling_factor =
+            sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
+
+        // Snap maxX to warp size
+        int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
+        const int maxX = numext::mini<int>(inner_dim, numX);
+        const int maxY =
+            numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
+        const int maxP = numext::mini<int>(
+            maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
+
+        dim3 block_size;
+        block_size.x = numext::mini(1024, maxX);
+        block_size.y = numext::mini<int>(1024 / block_size.x, maxY);
+        block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxP);
+
+        const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
+        gpu_assert(shared_mem <= maxSharedMem);
+
+        const int num_x_blocks = ceil(numX, maxX);
+        const int num_y_blocks = ceil(numY, maxY);
+        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
+        const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
+
+        dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
+
+        // cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << "
+        // block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y <<
+        // " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << "
+        // shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
+
+        const array<Index, 2> indices{m_indices[idxX], m_indices[idxY]};
+        const array<Index, 2> kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY]};
+        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+        switch (kernel_size_x) {
+          case 4: {
+            switch (kernel_size_y) {
+              case 7: {
+                LAUNCH_GPU_KERNEL(
+                    (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>),
+                    num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
+                    numY, maxY, 4, 7, data);
+                break;
+              }
+              default: {
+                LAUNCH_GPU_KERNEL(
+                    (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>),
+                    num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
+                    numY, maxY, 4, kernel_size_y, data);
+                break;
+              }
+            }
+            break;
+          }
+          case 7: {
+            switch (kernel_size_y) {
+              case 4: {
+                LAUNCH_GPU_KERNEL(
+                    (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>),
+                    num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
+                    numY, maxY, 7, 4, data);
+                break;
+              }
+              default: {
+                LAUNCH_GPU_KERNEL(
+                    (EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>),
+                    num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX,
+                    numY, maxY, 7, kernel_size_y, data);
+                break;
+              }
+            }
+            break;
+          }
+          default: {
+            LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims,
+                                                        Dynamic, Dynamic>),
+                              num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP,
+                              numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
+            break;
+          }
+        }
+        break;
+      }
+
+      case 3: {
+        const int idxX = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
+        const int idxY = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
+        const int idxZ = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
+
+        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
+        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
+        const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
+
+        const int numX = dimensions()[m_indices[idxX]];
+        const int numY = dimensions()[m_indices[idxY]];
+        const int numZ = dimensions()[m_indices[idxZ]];
+        const int numP = dimensions().TotalSize() / (numX * numY * numZ);
+
+        const int maxX = numext::mini<int>(
+            128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1,
+                                   numX));
+        const int maxY = numext::mini<int>(
+            128, numext::mini<int>(
+                     maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1,
+                     numY));
+        const int maxZ = numext::mini<int>(
+            128, numext::mini<int>(
+                     maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) -
+                         kernel_size_z + 1,
+                     numZ));
+
+        dim3 block_size;
+        block_size.x = numext::mini(32, maxX);
+        block_size.y = numext::mini(32, maxY);
+        block_size.z = numext::mini<int>(1024 / (block_size.x * block_size.y), maxZ);
+        dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
+
+        const int shared_mem =
+            (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
+        gpu_assert(shared_mem <= maxSharedMem);
+
+        // cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << "
+        // block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y <<
+        // " num_blocks.z: " << num_blocks.z  << " shared_mem: " << shared_mem << " in stream " << m_device.stream() <<
+        // endl;
+        const array<Index, 3> indices{m_indices[idxX], m_indices[idxY], m_indices[idxZ]};
+        const array<Index, 3> kernel_dims{m_kernelImpl.dimensions()[idxX], m_kernelImpl.dimensions()[idxY],
+                                          m_kernelImpl.dimensions()[idxZ]};
+        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+
+        LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>),
+                          num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX,
+                          maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
+        break;
+      }
+
+      default: {
+        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
+                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    eigen_assert(m_buf);
+    eigen_assert(index < m_dimensions.TotalSize());
+    return m_buf[index];
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
+    eigen_assert(m_buf);
+    eigen_assert(index < m_dimensions.TotalSize());
+    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
+    // model.
+    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+    // We ignore the use of fused multiply-add.
+    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+    const double firstIndex_compute_cost =
+        NumDims *
+        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
+    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
+                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
+  }
+
+ private:
+  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
+  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
+  KernelArgType m_kernelArg;
+  Indices m_indices;
+  Dimensions m_dimensions;
+  Scalar* m_buf;
+  const Scalar* m_kernel;
+  bool m_local_kernel;
+
+  const GpuDevice& m_device;
+};
+#endif
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
new file mode 100644
index 00000000..915c5de4
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorConvolutionSycl.h
@@ -0,0 +1,538 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+enum class convolution_type { CONV1D, CONV2D, CONV3D };
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
+struct EigenConvolutionKernel;
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+                              Buffer_accessor, convolution_type::CONV1D> {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      Local_accessor;
+  Local_accessor local_acc;
+  Evaluator device_evaluator;
+  Kernel_accessor kernel_filter;
+  Buffer_accessor buffer_acc;
+  internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
+  const size_t kernelSize;
+  const cl::sycl::range<2> input_range;
+  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+                         Buffer_accessor buffer_acc_,
+                         internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
+                         const size_t kernelSize_, const cl::sycl::range<2> input_range_)
+      : local_acc(local_acc_),
+        device_evaluator(device_evaluator_),
+        kernel_filter(kernel_filter_),
+        buffer_acc(buffer_acc_),
+        indexMapper(indexMapper_),
+        kernelSize(kernelSize_),
+        input_range(input_range_) {}
+
+  template <typename BooleanDim2>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) const {
+    return (boolean_check[0] && boolean_check[1]);
+  }
+  void operator()(cl::sycl::nd_item<2> itemID) const {
+    auto buffer_ptr = buffer_acc;
+    auto kernel_ptr = kernel_filter;
+    // the required row to be calculated for the for each plane in shered memory
+    const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
+    const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
+    const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
+    const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
+    /// fill the shared memory
+    for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
+      const size_t local_index = i + plane_kernel_offset;
+      const size_t tensor_index =
+          plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);
+
+      local_acc[local_index] =
+          (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
+              ? device_evaluator.coeff(tensor_index)
+              : CoeffReturnType(0);
+    }
+
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+    // calculate the convolution // output start x
+    const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
+    if (boundary_check(itemID.get_global_id() < input_range)) {
+      CoeffReturnType result = static_cast<CoeffReturnType>(0);
+      const size_t index = plane_kernel_offset + itemID.get_local_id(0);
+      for (size_t k = 0; k < kernelSize; ++k) {
+        result += (local_acc[k + index] * kernel_ptr[k]);
+      }
+      const size_t tensor_index =
+          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
+          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
+      buffer_ptr[tensor_index] = result;
+    }
+  }
+};
+
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+                              Buffer_accessor, convolution_type::CONV2D> {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      Local_accessor;
+  Local_accessor local_acc;
+  Evaluator device_evaluator;
+  Kernel_accessor kernel_filter;
+  Buffer_accessor buffer_acc;
+  internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
+  const cl::sycl::range<2> kernel_size;
+  const cl::sycl::range<3> input_range;
+  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+                         Buffer_accessor buffer_acc_,
+                         internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
+                         const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
+      : local_acc(local_acc_),
+        device_evaluator(device_evaluator_),
+        kernel_filter(kernel_filter_),
+        buffer_acc(buffer_acc_),
+        indexMapper(indexMapper_),
+        kernel_size(kernel_size_),
+        input_range(input_range_) {}
+  template <typename BooleanDim3>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
+    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
+  }
+
+  void operator()(cl::sycl::nd_item<3> itemID) const {
+    auto buffer_ptr = buffer_acc;
+    auto kernel_ptr = kernel_filter;
+    // the required row to be calculated for the for each plane in shered memory
+    const auto num_input = cl::sycl::range<2>{
+        (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
+
+    const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
+    const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];
+
+    const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
+                                                 itemID.get_group(1) * itemID.get_local_range()[1]};
+
+    // fill the local memory
+    bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
+    for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
+      const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
+      bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
+      for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
+        const size_t local_index = i + local_input_offset;
+        const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
+                                                             i + input_offset[0], j + input_offset[1]);
+        local_acc[local_index] =
+            (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) && in_range_dim1 && in_range_dim2)
+                ? device_evaluator.coeff(tensor_index)
+                : CoeffReturnType(0);
+      }
+    }
+
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+    // output offset start for each thread
+    const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
+                                                  itemID.get_group(1) * itemID.get_local_range()[1]};
+
+    if (boundary_check(itemID.get_global_id() < input_range)) {
+      CoeffReturnType result = static_cast<CoeffReturnType>(0);
+
+      for (size_t j = 0; j < kernel_size[1]; j++) {
+        size_t kernel_offset = kernel_size[0] * j;
+        const size_t index =
+            (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
+        for (size_t i = 0; i < kernel_size[0]; i++) {
+          result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
+        }
+      }
+      const size_t tensor_index =
+          indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
+          indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
+                                                             itemID.get_local_id(1) + output_offset[1]);
+
+      buffer_ptr[tensor_index] = result;
+    }
+  }
+};
+
+template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
+          typename Kernel_accessor, typename Buffer_accessor>
+struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
+                              Buffer_accessor, convolution_type::CONV3D> {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      Local_accessor;
+  Local_accessor local_acc;
+  Evaluator device_evaluator;
+  Kernel_accessor kernel_filter;
+  Buffer_accessor buffer_acc;
+  internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
+  const cl::sycl::range<3> kernel_size;
+  const cl::sycl::range<3> input_range;
+  const size_t numP;
+
+  EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
+                         Buffer_accessor buffer_acc_,
+                         internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
+                         const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
+                         const size_t numP_)
+      : local_acc(local_acc_),
+        device_evaluator(device_evaluator_),
+        kernel_filter(kernel_filter_),
+        buffer_acc(buffer_acc_),
+        indexMapper(indexMapper_),
+        kernel_size(kernel_size_),
+        input_range(input_range_),
+        numP(numP_) {}
+  template <typename BooleanDim3>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) const {
+    return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
+  }
+  void operator()(cl::sycl::nd_item<3> itemID) const {
+    auto buffer_ptr = buffer_acc;
+    auto kernel_ptr = kernel_filter;
+    const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
+
+    const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};
+
+    const auto output_offset =
+        cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};
+
+    for (size_t p = 0; p < numP; p++) {
+      /// fill the shared memory
+      const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
+      for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
+        size_t local_index_dim2 = num_input[0] * num_input[1] * k;
+        bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
+        for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
+          bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
+          size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
+          for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
+            bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
+            const size_t local_index = local_index_dim1 + i;
+            const size_t tensor_index =
+                plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
+                                         i + input_offset[0], j + input_offset[1], k + input_offset[2]);
+            local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
+          }
+        }
+      }
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+
+      // calculate the convolution
+
+      if (boundary_check(itemID.get_global_id() < input_range)) {
+        CoeffReturnType result = static_cast<CoeffReturnType>(0);
+        for (size_t k = 0; k < kernel_size[2]; k++) {
+          for (size_t j = 0; j < kernel_size[1]; j++) {
+            for (size_t i = 0; i < kernel_size[0]; i++) {
+              const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
+              const size_t local_index =
+                  ((i + itemID.get_local_id(0)) +
+                   num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));
+
+              result += (local_acc[local_index] * kernel_ptr[kernel_index]);
+            }
+          }
+        }
+        const size_t tensor_index =
+            indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
+            indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
+        buffer_ptr[tensor_index] = result;
+      }
+
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+    }
+  }
+};
+
+template <typename Indices, typename InputArgType, typename KernelArgType>
+struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
+  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
+
+  static constexpr int NumDims =
+      internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
+  static constexpr int NumKernelDims = internal::array_size<Indices>::value;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
+  typedef const Eigen::SyclDevice Device;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
+  typedef typename InputArgType::Scalar Scalar;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;
+
+  static constexpr int Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout;
+  enum {
+    IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
+                TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
+    PacketAccess = false,
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
+      : m_inputImpl(op.inputExpression(), device),
+        m_kernelArg(op.kernelExpression()),
+        m_kernelImpl(op.kernelExpression(), device),
+        m_indices(op.indices()),
+        m_buf(NULL),
+        m_kernel(NULL),
+        m_local_kernel(false),
+        m_device(device) {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
+                         static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
+    const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
+        m_kernelImpl.dimensions();
+
+    m_dimensions = m_inputImpl.dimensions();
+    for (int i = 0; i < NumKernelDims; ++i) {
+      const Index index = op.indices()[i];
+      const Index input_dim = input_dims[index];
+      const Index kernel_dim = kernel_dims[i];
+      const Index result_dim = input_dim - kernel_dim + 1;
+      m_dimensions[index] = result_dim;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    preloadKernel();
+    m_inputImpl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      executeEval(data);
+      return false;
+    } else {
+      m_buf = (EvaluatorPointerType)m_device.get(
+          (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
+      executeEval(m_buf);
+      return true;
+    }
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_inputImpl.cleanup();
+    if (m_buf) {
+      m_device.deallocate_temp(m_buf);
+      m_buf = NULL;
+    }
+    if (m_local_kernel) {
+      m_device.deallocate_temp(m_kernel);
+      m_local_kernel = false;
+    }
+    m_kernel = NULL;
+  }
+  /// used by sycl in order to build the sycl buffer
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
+  /// used by sycl in order to build the sycl buffer
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
+    // Don't make a local copy of the kernel unless we have to (i.e. it's an
+    // expression that needs to be evaluated)
+    typename KernelStorage::Type in_place = m_kernelImpl.data();
+    if (in_place) {
+      m_kernel = in_place;
+      m_local_kernel = false;
+    } else {
+      ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
+      EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
+      typedef TensorEvalToOp<const KernelArgType> EvalTo;
+      EvalTo evalToTmp(m_device.get(local), m_kernelArg);
+      const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
+      internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
+      m_kernel = local;
+      m_local_kernel = true;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
+    typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
+    typedef typename InputEvaluator::Dimensions InputDims;
+    switch (NumKernelDims) {
+      case 1: {
+        const size_t numX = dimensions()[m_indices[0]];
+        const size_t numP = dimensions().TotalSize() / numX;
+        const auto input_dim = std::array<size_t, 2>{numX, numP};
+        auto global_range = cl::sycl::range<2>{1, 1};
+        auto local_range = cl::sycl::range<2>{1, 1};
+        const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
+
+        m_device.parallel_for_setup(input_dim, global_range, local_range);
+        const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
+        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+        const array<Index, 1> indices{{m_indices[0]}};
+        const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
+        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+
+        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
+            ConvKernel;
+
+        m_device
+            .template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+                m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
+                indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]))
+            .wait();
+        break;
+      }
+
+      case 2: {
+        auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
+                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
+        auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
+                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
+        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
+        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
+        const size_t numP = dimensions().TotalSize() / (numX * numY);
+        auto input_dim = std::array<size_t, 3>{numX, numY, numP};
+
+        auto global_range = cl::sycl::range<3>{1, 1, 1};
+        auto local_range = cl::sycl::range<3>{1, 1, 1};
+
+        m_device.parallel_for_setup(input_dim, global_range, local_range);
+
+        const size_t local_memory_size =
+            (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
+        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+        const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
+        const array<Index, 2> kernel_dims{
+            {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
+        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
+            ConvKernel;
+        m_device
+            .template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+                m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
+                indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]})
+            .wait();
+        break;
+      }
+
+      case 3: {
+        auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
+                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
+                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};
+
+        auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
+                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
+                                              (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};
+
+        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
+        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
+        const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
+        auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
+        const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);
+
+        const array<Index, 3> indices{
+            {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
+        const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
+                                           m_kernelImpl.dimensions()[kernel_index[1]],
+                                           m_kernelImpl.dimensions()[kernel_index[2]]}};
+
+        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
+
+        auto global_range = cl::sycl::range<3>{1, 1, 1};
+        auto local_range = cl::sycl::range<3>{1, 1, 1};
+
+        m_device.parallel_for_setup(input_dim, global_range, local_range);
+        auto local_memory_range = (local_range + kernel_size - 1);
+        const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];
+
+        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
+        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
+                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
+            ConvKernel;
+        m_device
+            .template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
+                m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
+                indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP)
+            .wait();
+        break;
+      }
+
+      default: {
+        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
+                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    eigen_assert(m_buf != NULL);
+    eigen_assert(index < m_dimensions.TotalSize());
+    return m_buf[index];
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
+    eigen_assert(m_buf != NULL);
+    eigen_assert(index < m_dimensions.TotalSize());
+    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
+    // model.
+    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
+    // We ignore the use of fused multiply-add.
+    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
+    const double firstIndex_compute_cost =
+        NumDims *
+        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
+    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
+           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
+                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
+  }
+
+ private:
+  // No assignment (copies are needed by the kernels)
+  TensorEvaluator &operator=(const TensorEvaluator &);
+  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
+  KernelArgType m_kernelArg;
+  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
+  Indices m_indices;
+  Dimensions m_dimensions;
+  EvaluatorPointerType m_buf;
+  typename KernelStorage::Type m_kernel;
+  bool m_local_kernel;
+  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
+};  // namespace Eigen
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
new file mode 100644
index 00000000..88e36dc5
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -0,0 +1,189 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Rasmus Munk Larsen <rmlarsen@google.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// Class storing the cost of evaluating a tensor expression in terms of the
+// estimated number of operand bytes loads, bytes stored, and compute cycles.
+class TensorOpCost {
+ public:
+  // TODO(rmlarsen): Fix the scalar op costs in Eigen proper. Even a simple
+  // model based on minimal reciprocal throughput numbers from Intel or
+  // Agner Fog's tables would be better than what is there now.
+  template <typename ArgType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() {
+    return internal::functor_traits<internal::scalar_product_op<ArgType, ArgType> >::Cost;
+  }
+  template <typename ArgType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() {
+    return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost;
+  }
+  template <typename ArgType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() {
+    return internal::functor_traits<internal::scalar_quotient_op<ArgType, ArgType> >::Cost;
+  }
+  template <typename ArgType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() {
+    return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost;
+  }
+  template <typename SrcType, typename TargetType>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() {
+    return internal::functor_traits<internal::scalar_cast_op<SrcType, TargetType> >::Cost;
+  }
+
+  EIGEN_DEVICE_FUNC TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
+  EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles)
+      : bytes_loaded_(bytes_loaded), bytes_stored_(bytes_stored), compute_cycles_(compute_cycles) {}
+
+  EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, bool vectorized,
+                                 double packet_size)
+      : bytes_loaded_(bytes_loaded),
+        bytes_stored_(bytes_stored),
+        compute_cycles_(vectorized ? compute_cycles / packet_size : compute_cycles) {
+    eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
+    eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
+    eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { return bytes_loaded_; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_stored() const { return bytes_stored_; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double compute_cycles() const { return compute_cycles_; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double total_cost(double load_cost, double store_cost,
+                                                          double compute_cost) const {
+    return load_cost * bytes_loaded_ + store_cost * bytes_stored_ + compute_cost * compute_cycles_;
+  }
+
+  // Drop memory access component. Intended for cases when memory accesses are
+  // sequential or are completely masked by computations.
+  EIGEN_DEVICE_FUNC void dropMemoryCost() {
+    bytes_loaded_ = 0;
+    bytes_stored_ = 0;
+  }
+
+  // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
+  }
+
+  // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(const TensorOpCost& rhs) {
+    bytes_loaded_ += rhs.bytes_loaded();
+    bytes_stored_ += rhs.bytes_stored();
+    compute_cycles_ += rhs.compute_cycles();
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(double rhs) {
+    bytes_loaded_ *= rhs;
+    bytes_stored_ *= rhs;
+    compute_cycles_ *= rhs;
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator+(TensorOpCost lhs, const TensorOpCost& rhs) {
+    lhs += rhs;
+    return lhs;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(TensorOpCost lhs, double rhs) {
+    lhs *= rhs;
+    return lhs;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE friend TensorOpCost operator*(double lhs, TensorOpCost rhs) {
+    rhs *= lhs;
+    return rhs;
+  }
+
+  friend std::ostream& operator<<(std::ostream& os, const TensorOpCost& tc) {
+    return os << "[bytes_loaded = " << tc.bytes_loaded() << ", bytes_stored = " << tc.bytes_stored()
+              << ", compute_cycles = " << tc.compute_cycles() << "]";
+  }
+
+ private:
+  double bytes_loaded_;
+  double bytes_stored_;
+  double compute_cycles_;
+};
+
+// TODO(rmlarsen): Implement a policy that chooses an "optimal" number of theads
+// in [1:max_threads] instead of just switching multi-threading off for small
+// work units.
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief A cost model used to limit the number of threads used for evaluating
+ * tensor expression.
+ *
+ */
+template <typename Device>
+class TensorCostModel {
+ public:
+  // Scaling from Eigen compute cost to device cycles.
+  static const int kDeviceCyclesPerComputeCycle = 1;
+
+  // Costs in device cycles.
+  static const int kStartupCycles = 100000;
+  static const int kPerThreadCycles = 100000;
+  static const int kTaskSize = 40000;
+
+  // Returns the number of threads in [1:max_threads] to use for
+  // evaluating an expression with the given output size and cost per
+  // coefficient.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int numThreads(double output_size, const TensorOpCost& cost_per_coeff,
+                                                              int max_threads) {
+    double cost = totalCost(output_size, cost_per_coeff);
+    double threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
+    // Make sure we don't invoke undefined behavior when we convert to an int.
+    threads = numext::mini<double>(threads, GenericNumTraits<int>::highest());
+    return numext::mini(max_threads, numext::maxi<int>(1, static_cast<int>(threads)));
+  }
+
+  // taskSize assesses parallel task size.
+  // Value of 1.0 means ideal parallel task size. Values < 1.0 mean that task
+  // granularity needs to be increased to mitigate parallelization overheads.
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double taskSize(double output_size, const TensorOpCost& cost_per_coeff) {
+    return totalCost(output_size, cost_per_coeff) / kTaskSize;
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double totalCost(double output_size,
+                                                                const TensorOpCost& cost_per_coeff) {
+    // Cost of memory fetches from L2 cache. 64 is typical cache line size.
+    // 11 is L2 cache latency on Haswell.
+    // We don't know whether data is in L1, L2 or L3. But we are most interested
+    // in single-threaded computational time around 100us-10ms (smaller time
+    // is too small for parallelization, larger time is not interesting
+    // either because we are probably using all available threads already).
+    // And for the target time range, L2 seems to be what matters. Data set
+    // fitting into L1 is too small to take noticeable time. Data set fitting
+    // only into L3 presumably will take more than 10ms to load and process.
+    const double kLoadCycles = 1.0 / 64 * 11;
+    const double kStoreCycles = 1.0 / 64 * 11;
+    // Scaling from Eigen compute cost to device cycles.
+    return output_size * cost_per_coeff.total_cost(kLoadCycles, kStoreCycles, kDeviceCyclesPerComputeCycle);
+  }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
new file mode 100644
index 00000000..1cae60f2
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorCustomOp.h
@@ -0,0 +1,307 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
+#define EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename CustomUnaryFunc, typename XprType>
+struct traits<TensorCustomUnaryOp<CustomUnaryFunc, XprType> > {
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::StorageKind StorageKind;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = traits<XprType>::NumDimensions;
+  static constexpr int Layout = traits<XprType>::Layout;
+  typedef typename traits<XprType>::PointerType PointerType;
+};
+
+template <typename CustomUnaryFunc, typename XprType>
+struct eval<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Eigen::Dense> {
+  typedef const TensorCustomUnaryOp<CustomUnaryFunc, XprType> EIGEN_DEVICE_REF type;
+};
+
+template <typename CustomUnaryFunc, typename XprType>
+struct nested<TensorCustomUnaryOp<CustomUnaryFunc, XprType> > {
+  typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> type;
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor custom class.
+ */
+template <typename CustomUnaryFunc, typename XprType>
+class TensorCustomUnaryOp : public TensorBase<TensorCustomUnaryOp<CustomUnaryFunc, XprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename internal::traits<TensorCustomUnaryOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename internal::nested<TensorCustomUnaryOp>::type Nested;
+  typedef typename internal::traits<TensorCustomUnaryOp>::StorageKind StorageKind;
+  typedef typename internal::traits<TensorCustomUnaryOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomUnaryOp(const XprType& expr, const CustomUnaryFunc& func)
+      : m_expr(expr), m_func(func) {}
+
+  EIGEN_DEVICE_FUNC const CustomUnaryFunc& func() const { return m_func; }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_expr; }
+
+ protected:
+  typename XprType::Nested m_expr;
+  const CustomUnaryFunc m_func;
+};
+
+// Eval as rvalue
+template <typename CustomUnaryFunc, typename XprType, typename Device>
+struct TensorEvaluator<const TensorCustomUnaryOp<CustomUnaryFunc, XprType>, Device> {
+  typedef TensorCustomUnaryOp<CustomUnaryFunc, XprType> ArgType;
+  typedef typename internal::traits<ArgType>::Index Index;
+  static constexpr int NumDims = internal::traits<ArgType>::NumDimensions;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef std::remove_const_t<typename ArgType::Scalar> Scalar;
+  typedef std::remove_const_t<typename XprType::CoeffReturnType> CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int Layout = TensorEvaluator<XprType, Device>::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const ArgType& op, const Device& device)
+      : m_op(op), m_device(device), m_result(NULL) {
+    m_dimensions = op.func().dimensions(op.expression());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    if (data) {
+      evalTo(data);
+      return false;
+    } else {
+      m_result = static_cast<EvaluatorPointerType>(
+          m_device.get((CoeffReturnType*)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar))));
+      evalTo(m_result);
+      return true;
+    }
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    if (m_result) {
+      m_device.deallocate_temp(m_result);
+      m_result = NULL;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_result[index]; }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
+
+ protected:
+  void evalTo(EvaluatorPointerType data) {
+    TensorMap<Tensor<CoeffReturnType, NumDims, Layout, Index> > result(m_device.get(data), m_dimensions);
+    m_op.func().eval(m_op.expression(), result, m_device);
+  }
+
+  Dimensions m_dimensions;
+  const ArgType m_op;
+  const Device EIGEN_DEVICE_REF m_device;
+  EvaluatorPointerType m_result;
+};
+
+/** \class TensorCustomBinaryOp
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor custom class.
+ *
+ *
+ */
+namespace internal {
+template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
+struct traits<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> > {
+  typedef typename internal::promote_storage_type<typename LhsXprType::Scalar, typename RhsXprType::Scalar>::ret Scalar;
+  typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
+                                                  typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+  typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
+                                        typename traits<RhsXprType>::StorageKind>::ret StorageKind;
+  typedef
+      typename promote_index_type<typename traits<LhsXprType>::Index, typename traits<RhsXprType>::Index>::type Index;
+  typedef typename LhsXprType::Nested LhsNested;
+  typedef typename RhsXprType::Nested RhsNested;
+  typedef std::remove_reference_t<LhsNested> LhsNested_;
+  typedef std::remove_reference_t<RhsNested> RhsNested_;
+  static constexpr int NumDimensions = traits<LhsXprType>::NumDimensions;
+  static constexpr int Layout = traits<LhsXprType>::Layout;
+  typedef std::conditional_t<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+                             typename traits<LhsXprType>::PointerType, typename traits<RhsXprType>::PointerType>
+      PointerType;
+};
+
+template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
+struct eval<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Eigen::Dense> {
+  typedef const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>& type;
+};
+
+template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
+struct nested<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> > {
+  typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> type;
+};
+
+}  // end namespace internal
+
+template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
+class TensorCustomBinaryOp
+    : public TensorBase<TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename internal::traits<TensorCustomBinaryOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename internal::traits<TensorCustomBinaryOp>::CoeffReturnType CoeffReturnType;
+  typedef typename internal::nested<TensorCustomBinaryOp>::type Nested;
+  typedef typename internal::traits<TensorCustomBinaryOp>::StorageKind StorageKind;
+  typedef typename internal::traits<TensorCustomBinaryOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCustomBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs,
+                                                             const CustomBinaryFunc& func)
+
+      : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_func(func) {}
+
+  EIGEN_DEVICE_FUNC const CustomBinaryFunc& func() const { return m_func; }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename LhsXprType::Nested>& lhsExpression() const {
+    return m_lhs_xpr;
+  }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename RhsXprType::Nested>& rhsExpression() const {
+    return m_rhs_xpr;
+  }
+
+ protected:
+  typename LhsXprType::Nested m_lhs_xpr;
+  typename RhsXprType::Nested m_rhs_xpr;
+  const CustomBinaryFunc m_func;
+};
+
+// Eval as rvalue
+template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType, typename Device>
+struct TensorEvaluator<const TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType>, Device> {
+  typedef TensorCustomBinaryOp<CustomBinaryFunc, LhsXprType, RhsXprType> XprType;
+  typedef typename internal::traits<XprType>::Index Index;
+  static constexpr int NumDims = internal::traits<XprType>::NumDimensions;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef std::remove_const_t<typename XprType::CoeffReturnType> CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int Layout = TensorEvaluator<LhsXprType, Device>::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_op(op), m_device(device), m_result(NULL) {
+    m_dimensions = op.func().dimensions(op.lhsExpression(), op.rhsExpression());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    if (data) {
+      evalTo(data);
+      return false;
+    } else {
+      m_result = static_cast<EvaluatorPointerType>(
+          m_device.get((CoeffReturnType*)m_device.allocate_temp(dimensions().TotalSize() * sizeof(CoeffReturnType))));
+      evalTo(m_result);
+      return true;
+    }
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    if (m_result != NULL) {
+      m_device.deallocate_temp(m_result);
+      m_result = NULL;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_result[index]; }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_result + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // TODO(rmlarsen): Extend CustomOp API to return its cost estimate.
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
+
+ protected:
+  void evalTo(EvaluatorPointerType data) {
+    TensorMap<Tensor<CoeffReturnType, NumDims, Layout> > result(m_device.get(data), m_dimensions);
+    m_op.func().eval(m_op.lhsExpression(), m_op.rhsExpression(), result, m_device);
+  }
+
+  Dimensions m_dimensions;
+  const XprType m_op;
+  const Device EIGEN_DEVICE_REF m_device;
+  EvaluatorPointerType m_result;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_CUSTOM_OP_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
new file mode 100644
index 00000000..1b9b49e1
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h
@@ -0,0 +1,138 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Pseudo expression providing an operator = that will evaluate its argument
+ * on the specified computing 'device' (GPU, thread pool, ...)
+ *
+ * Example:
+ *    C.device(EIGEN_GPU) = A + B;
+ *
+ * Todo: operator *= and /=.
+ */
+template <typename ExpressionType, typename DeviceType>
+class TensorDevice {
+ public:
+  TensorDevice(const DeviceType& device, ExpressionType& expression) : m_device(device), m_expression(expression) {}
+
+  EIGEN_DEFAULT_COPY_CONSTRUCTOR(TensorDevice)
+
+  template <typename OtherDerived>
+  EIGEN_STRONG_INLINE TensorDevice& operator=(const OtherDerived& other) {
+    typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
+    Assign assign(m_expression, other);
+    internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
+    return *this;
+  }
+
+  template <typename OtherDerived>
+  EIGEN_STRONG_INLINE TensorDevice& operator+=(const OtherDerived& other) {
+    typedef typename OtherDerived::Scalar Scalar;
+    typedef TensorCwiseBinaryOp<internal::scalar_sum_op<Scalar>, const ExpressionType, const OtherDerived> Sum;
+    Sum sum(m_expression, other);
+    typedef TensorAssignOp<ExpressionType, const Sum> Assign;
+    Assign assign(m_expression, sum);
+    internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
+    return *this;
+  }
+
+  template <typename OtherDerived>
+  EIGEN_STRONG_INLINE TensorDevice& operator-=(const OtherDerived& other) {
+    typedef typename OtherDerived::Scalar Scalar;
+    typedef TensorCwiseBinaryOp<internal::scalar_difference_op<Scalar>, const ExpressionType, const OtherDerived>
+        Difference;
+    Difference difference(m_expression, other);
+    typedef TensorAssignOp<ExpressionType, const Difference> Assign;
+    Assign assign(m_expression, difference);
+    internal::TensorExecutor<const Assign, DeviceType>::run(assign, m_device);
+    return *this;
+  }
+
+ protected:
+  const DeviceType& m_device;
+  ExpressionType& m_expression;
+};
+
+/** \class TensorAsyncDevice
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Pseudo expression providing an operator = that will evaluate its
+ * argument asynchronously on the specified device. Currently only
+ * ThreadPoolDevice implements proper asynchronous execution, while the default
+ * and GPU devices just run the expression synchronously and call m_done() on
+ * completion..
+ *
+ * Example:
+ *    auto done = []() { ... expression evaluation done ... };
+ *    C.device(thread_pool_device, std::move(done)) = A + B;
+ */
+
+template <typename ExpressionType, typename DeviceType, typename DoneCallback>
+class TensorAsyncDevice {
+ public:
+  TensorAsyncDevice(const DeviceType& device, ExpressionType& expression, DoneCallback done)
+      : m_device(device), m_expression(expression), m_done(std::move(done)) {}
+
+  template <typename OtherDerived>
+  EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) {
+    typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
+    typedef internal::TensorExecutor<const Assign, DeviceType> Executor;
+
+    Assign assign(m_expression, other);
+    Executor::run(assign, m_device);
+    m_done();
+
+    return *this;
+  }
+
+ protected:
+  const DeviceType& m_device;
+  ExpressionType& m_expression;
+  DoneCallback m_done;
+};
+
+#ifdef EIGEN_USE_THREADS
+template <typename ExpressionType, typename DoneCallback>
+class TensorAsyncDevice<ExpressionType, ThreadPoolDevice, DoneCallback> {
+ public:
+  TensorAsyncDevice(const ThreadPoolDevice& device, ExpressionType& expression, DoneCallback done)
+      : m_device(device), m_expression(expression), m_done(std::move(done)) {}
+
+  template <typename OtherDerived>
+  EIGEN_STRONG_INLINE TensorAsyncDevice& operator=(const OtherDerived& other) {
+    typedef TensorAssignOp<ExpressionType, const OtherDerived> Assign;
+    typedef internal::TensorAsyncExecutor<const Assign, ThreadPoolDevice, DoneCallback> Executor;
+
+    // WARNING: After assignment 'm_done' callback will be in undefined state.
+    Assign assign(m_expression, other);
+    Executor::runAsync(assign, m_device, std::move(m_done));
+
+    return *this;
+  }
+
+ protected:
+  const ThreadPoolDevice& m_device;
+  ExpressionType& m_expression;
+  DoneCallback m_done;
+};
+#endif
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
new file mode 100644
index 00000000..c2c8ed00
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -0,0 +1,7 @@
+
+#if defined(__clang__) || defined(__GNUC__)
+#warning \
+    "Deprecated header file, please either include the main Eigen/CXX11/Tensor header or the respective TensorDeviceGpu.h file"
+#endif
+
+#include "TensorDeviceGpu.h"
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
new file mode 100644
index 00000000..eaaf3321
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h
@@ -0,0 +1,113 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// Default device for the machine (typically a single cpu core)
+struct DefaultDevice {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+    return internal::aligned_malloc(num_bytes);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { internal::aligned_free(buffer); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { return allocate(num_bytes); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { deallocate(buffer); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+    ::memcpy(dst, src, n);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+    memcpy(dst, src, n);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { ::memset(buffer, c, n); }
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+    // std::fill is not a device function, so resort to simple loop.
+    for (T* it = begin; it != end; ++it) {
+      *it = value;
+    }
+#else
+    std::fill(begin, end, value);
+#endif
+  }
+  template <typename Type>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
+    return data;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const {
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
+    // Running on the host CPU
+    return 1;
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+    // Running on a HIP device
+    return 64;
+#else
+    // Running on a CUDA device
+    return 32;
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
+    // Running on the host CPU
+    return l1CacheSize();
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+    // Running on a HIP device
+    return 48 * 1024;  // FIXME : update this number for HIP
+#else
+    // Running on a CUDA device, return the amount of shared memory available.
+    return 48 * 1024;
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
+    // Running single threaded on the host CPU
+    return l3CacheSize();
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+    // Running on a HIP device
+    return firstLevelCacheSize();  // FIXME : update this number for HIP
+#else
+    // Running on a CUDA device
+    return firstLevelCacheSize();
+#endif
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+    // Nothing.  Default device operations are synchronous.
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+#if !defined(EIGEN_GPU_COMPILE_PHASE)
+    // Running single threaded on the host CPU
+    // Should return an enum that encodes the ISA supported by the CPU
+    return 1;
+#elif defined(EIGEN_HIP_DEVICE_COMPILE)
+    // Running on a HIP device
+    // return 1 as major for HIP
+    return 1;
+#else
+    // Running on a CUDA device
+    return EIGEN_CUDA_ARCH / 100;
+#endif
+  }
+};
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_DEFAULT_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
new file mode 100644
index 00000000..2a3b0873
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h
@@ -0,0 +1,392 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_GPU) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "../../../../../Eigen/src/Core/util/GpuHipCudaDefines.inc"
+
+namespace Eigen {
+
+static const int kGpuScratchSize = 1024;
+
+// This defines an interface that GPUDevice can take to use
+// HIP / CUDA streams underneath.
+class StreamInterface {
+ public:
+  virtual ~StreamInterface() {}
+
+  virtual const gpuStream_t& stream() const = 0;
+  virtual const gpuDeviceProp_t& deviceProperties() const = 0;
+
+  // Allocate memory on the actual device where the computation will run
+  virtual void* allocate(size_t num_bytes) const = 0;
+  virtual void deallocate(void* buffer) const = 0;
+
+  // Return a scratchpad buffer of size 1k
+  virtual void* scratchpad() const = 0;
+
+  // Return a semaphore. The semaphore is initially initialized to 0, and
+  // each kernel using it is responsible for resetting to 0 upon completion
+  // to maintain the invariant that the semaphore is always equal to 0 upon
+  // each kernel start.
+  virtual unsigned int* semaphore() const = 0;
+};
+
+class GpuDeviceProperties {
+ public:
+  GpuDeviceProperties() : initialized_(false), first_(true), device_properties_(nullptr) {}
+
+  ~GpuDeviceProperties() {
+    if (device_properties_) {
+      delete[] device_properties_;
+    }
+  }
+
+  EIGEN_STRONG_INLINE const gpuDeviceProp_t& get(int device) const { return device_properties_[device]; }
+
+  EIGEN_STRONG_INLINE bool isInitialized() const { return initialized_; }
+
+  void initialize() {
+    if (!initialized_) {
+      // Attempts to ensure proper behavior in the case of multiple threads
+      // calling this function simultaneously. This would be trivial to
+      // implement if we could use std::mutex, but unfortunately mutex don't
+      // compile with nvcc, so we resort to atomics and thread fences instead.
+      // Note that if the caller uses a compiler that doesn't support c++11 we
+      // can't ensure that the initialization is thread safe.
+      if (first_.exchange(false)) {
+        // We're the first thread to reach this point.
+        int num_devices;
+        gpuError_t status = gpuGetDeviceCount(&num_devices);
+        if (status != gpuSuccess) {
+          std::cerr << "Failed to get the number of GPU devices: " << gpuGetErrorString(status) << std::endl;
+          gpu_assert(status == gpuSuccess);
+        }
+        device_properties_ = new gpuDeviceProp_t[num_devices];
+        for (int i = 0; i < num_devices; ++i) {
+          status = gpuGetDeviceProperties(&device_properties_[i], i);
+          if (status != gpuSuccess) {
+            std::cerr << "Failed to initialize GPU device #" << i << ": " << gpuGetErrorString(status) << std::endl;
+            gpu_assert(status == gpuSuccess);
+          }
+        }
+
+        std::atomic_thread_fence(std::memory_order_release);
+        initialized_ = true;
+      } else {
+        // Wait for the other thread to inititialize the properties.
+        while (!initialized_) {
+          std::atomic_thread_fence(std::memory_order_acquire);
+          std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+        }
+      }
+    }
+  }
+
+ private:
+  volatile bool initialized_;
+  std::atomic<bool> first_;
+  gpuDeviceProp_t* device_properties_;
+};
+
+EIGEN_ALWAYS_INLINE const GpuDeviceProperties& GetGpuDeviceProperties() {
+  static GpuDeviceProperties* deviceProperties = new GpuDeviceProperties();
+  if (!deviceProperties->isInitialized()) {
+    deviceProperties->initialize();
+  }
+  return *deviceProperties;
+}
+
+EIGEN_ALWAYS_INLINE const gpuDeviceProp_t& GetGpuDeviceProperties(int device) {
+  return GetGpuDeviceProperties().get(device);
+}
+
+static const gpuStream_t default_stream = gpuStreamDefault;
+
+class GpuStreamDevice : public StreamInterface {
+ public:
+  // Use the default stream on the current device
+  GpuStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
+    gpuError_t status = gpuGetDevice(&device_);
+    if (status != gpuSuccess) {
+      std::cerr << "Failed to get the GPU devices " << gpuGetErrorString(status) << std::endl;
+      gpu_assert(status == gpuSuccess);
+    }
+  }
+  // Use the default stream on the specified device
+  GpuStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {}
+  // Use the specified stream. Note that it's the
+  // caller responsibility to ensure that the stream can run on
+  // the specified device. If no device is specified the code
+  // assumes that the stream is associated to the current gpu device.
+  GpuStreamDevice(const gpuStream_t* stream, int device = -1)
+      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
+    if (device < 0) {
+      gpuError_t status = gpuGetDevice(&device_);
+      if (status != gpuSuccess) {
+        std::cerr << "Failed to get the GPU devices " << gpuGetErrorString(status) << std::endl;
+        gpu_assert(status == gpuSuccess);
+      }
+    } else {
+      int num_devices;
+      gpuError_t err = gpuGetDeviceCount(&num_devices);
+      EIGEN_UNUSED_VARIABLE(err)
+      gpu_assert(err == gpuSuccess);
+      gpu_assert(device < num_devices);
+      device_ = device;
+    }
+  }
+
+  virtual ~GpuStreamDevice() {
+    if (scratch_) {
+      deallocate(scratch_);
+    }
+  }
+
+  const gpuStream_t& stream() const { return *stream_; }
+  const gpuDeviceProp_t& deviceProperties() const { return GetGpuDeviceProperties(device_); }
+  virtual void* allocate(size_t num_bytes) const {
+    gpuError_t err = gpuSetDevice(device_);
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+    void* result;
+    err = gpuMalloc(&result, num_bytes);
+    gpu_assert(err == gpuSuccess);
+    gpu_assert(result != NULL);
+    return result;
+  }
+  virtual void deallocate(void* buffer) const {
+    gpuError_t err = gpuSetDevice(device_);
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+    gpu_assert(buffer != NULL);
+    err = gpuFree(buffer);
+    gpu_assert(err == gpuSuccess);
+  }
+
+  virtual void* scratchpad() const {
+    if (scratch_ == NULL) {
+      scratch_ = allocate(kGpuScratchSize + sizeof(unsigned int));
+    }
+    return scratch_;
+  }
+
+  virtual unsigned int* semaphore() const {
+    if (semaphore_ == NULL) {
+      char* scratch = static_cast<char*>(scratchpad()) + kGpuScratchSize;
+      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+      gpuError_t err = gpuMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+      EIGEN_UNUSED_VARIABLE(err)
+      gpu_assert(err == gpuSuccess);
+    }
+    return semaphore_;
+  }
+
+ private:
+  const gpuStream_t* stream_;
+  int device_;
+  mutable void* scratch_;
+  mutable unsigned int* semaphore_;
+};
+
+struct GpuDevice {
+  // The StreamInterface is not owned: the caller is
+  // responsible for its initialization and eventual destruction.
+  explicit GpuDevice(const StreamInterface* stream) : stream_(stream), max_blocks_(INT_MAX) { eigen_assert(stream); }
+  explicit GpuDevice(const StreamInterface* stream, int num_blocks) : stream_(stream), max_blocks_(num_blocks) {
+    eigen_assert(stream);
+  }
+  // TODO(bsteiner): This is an internal API, we should not expose it.
+  EIGEN_STRONG_INLINE const gpuStream_t& stream() const { return stream_->stream(); }
+
+  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return stream_->allocate(num_bytes); }
+
+  EIGEN_STRONG_INLINE void deallocate(void* buffer) const { stream_->deallocate(buffer); }
+
+  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { return stream_->allocate(num_bytes); }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { stream_->deallocate(buffer); }
+
+  template <typename Type>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
+    return data;
+  }
+
+  EIGEN_STRONG_INLINE void* scratchpad() const { return stream_->scratchpad(); }
+
+  EIGEN_STRONG_INLINE unsigned int* semaphore() const { return stream_->semaphore(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToDevice, stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+#else
+    EIGEN_UNUSED_VARIABLE(dst);
+    EIGEN_UNUSED_VARIABLE(src);
+    EIGEN_UNUSED_VARIABLE(n);
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const {
+    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyHostToDevice, stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+  }
+
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const {
+    gpuError_t err = gpuMemcpyAsync(dst, src, n, gpuMemcpyDeviceToHost, stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+    gpuError_t err = gpuMemsetAsync(buffer, c, n, stream_->stream());
+    EIGEN_UNUSED_VARIABLE(err)
+    gpu_assert(err == gpuSuccess);
+#else
+    EIGEN_UNUSED_VARIABLE(buffer)
+    EIGEN_UNUSED_VARIABLE(c)
+    EIGEN_UNUSED_VARIABLE(n)
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  template <typename T>
+  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+    const size_t count = end - begin;
+    // Split value into bytes and run memset with stride.
+    const int value_size = sizeof(value);
+    char* buffer = (char*)begin;
+    char* value_bytes = (char*)(&value);
+    gpuError_t err;
+    EIGEN_UNUSED_VARIABLE(err)
+
+    // If all value bytes are equal, then a single memset can be much faster.
+    bool use_single_memset = true;
+    for (int i = 1; i < value_size; ++i) {
+      if (value_bytes[i] != value_bytes[0]) {
+        use_single_memset = false;
+      }
+    }
+
+    if (use_single_memset) {
+      err = gpuMemsetAsync(buffer, value_bytes[0], count * sizeof(T), stream_->stream());
+      gpu_assert(err == gpuSuccess);
+    } else {
+      for (int b = 0; b < value_size; ++b) {
+        err = gpuMemset2DAsync(buffer + b, value_size, value_bytes[b], 1, count, stream_->stream());
+        gpu_assert(err == gpuSuccess);
+      }
+    }
+#else
+    EIGEN_UNUSED_VARIABLE(begin)
+    EIGEN_UNUSED_VARIABLE(end)
+    EIGEN_UNUSED_VARIABLE(value)
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_STRONG_INLINE size_t numThreads() const {
+    // FIXME
+    return 32;
+  }
+
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const {
+    // FIXME
+    return 48 * 1024;
+  }
+
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // We won't try to take advantage of the l2 cache for the time being, and
+    // there is no l3 cache on hip/cuda devices.
+    return firstLevelCacheSize();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+    gpuError_t err = gpuStreamSynchronize(stream_->stream());
+    if (err != gpuSuccess) {
+      std::cerr << "Error detected in GPU stream: " << gpuGetErrorString(err) << std::endl;
+      gpu_assert(err == gpuSuccess);
+    }
+#else
+    gpu_assert(false && "The default device should be used instead to generate kernel code");
+#endif
+  }
+
+  EIGEN_STRONG_INLINE int getNumGpuMultiProcessors() const { return stream_->deviceProperties().multiProcessorCount; }
+  EIGEN_STRONG_INLINE int maxGpuThreadsPerBlock() const { return stream_->deviceProperties().maxThreadsPerBlock; }
+  EIGEN_STRONG_INLINE int maxGpuThreadsPerMultiProcessor() const {
+    return stream_->deviceProperties().maxThreadsPerMultiProcessor;
+  }
+  EIGEN_STRONG_INLINE int sharedMemPerBlock() const {
+    return static_cast<int>(stream_->deviceProperties().sharedMemPerBlock);
+  }
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return stream_->deviceProperties().major; }
+  EIGEN_STRONG_INLINE int minorDeviceVersion() const { return stream_->deviceProperties().minor; }
+
+  EIGEN_STRONG_INLINE int maxBlocks() const { return max_blocks_; }
+
+  // This function checks if the GPU runtime recorded an error for the
+  // underlying stream device.
+  inline bool ok() const {
+#ifdef EIGEN_GPUCC
+    gpuError_t error = gpuStreamQuery(stream_->stream());
+    return (error == gpuSuccess) || (error == gpuErrorNotReady);
+#else
+    return false;
+#endif
+  }
+
+ private:
+  const StreamInterface* stream_;
+  int max_blocks_;
+};
+
+#if defined(EIGEN_HIPCC)
+
+#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)                              \
+  hipLaunchKernelGGL(kernel, dim3(gridsize), dim3(blocksize), (sharedmem), (device).stream(), __VA_ARGS__); \
+  gpu_assert(hipGetLastError() == hipSuccess);
+
+#else
+
+#define LAUNCH_GPU_KERNEL(kernel, gridsize, blocksize, sharedmem, device, ...)        \
+  (kernel)<<<(gridsize), (blocksize), (sharedmem), (device).stream()>>>(__VA_ARGS__); \
+  gpu_assert(cudaGetLastError() == cudaSuccess);
+
+#endif
+
+// FIXME: Should be device and kernel specific.
+#ifdef EIGEN_GPUCC
+static EIGEN_DEVICE_FUNC inline void setGpuSharedMemConfig(gpuSharedMemConfig config) {
+#ifndef EIGEN_GPU_COMPILE_PHASE
+  gpuError_t status = gpuDeviceSetSharedMemConfig(config);
+  EIGEN_UNUSED_VARIABLE(status)
+  gpu_assert(status == gpuSuccess);
+#else
+  EIGEN_UNUSED_VARIABLE(config)
+#endif
+}
+#endif
+
+}  // end namespace Eigen
+
+// undefine all the gpu* macros we defined at the beginning of the file
+#include "../../../../../Eigen/src/Core/util/GpuHipCudaUndefines.inc"
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_GPU_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
new file mode 100644
index 00000000..b291fed6
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h
@@ -0,0 +1,567 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_SYCL) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
+#include <unordered_set>
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace TensorSycl {
+namespace internal {
+
+/// Cache all the device information needed
+struct SyclDeviceInfo {
+  SyclDeviceInfo(cl::sycl::queue queue)
+      : local_mem_type(queue.get_device().template get_info<cl::sycl::info::device::local_mem_type>()),
+        max_work_item_sizes(queue.get_device().template get_info<cl::sycl::info::device::max_work_item_sizes<3>>()),
+        max_mem_alloc_size(queue.get_device().template get_info<cl::sycl::info::device::max_mem_alloc_size>()),
+        max_compute_units(queue.get_device().template get_info<cl::sycl::info::device::max_compute_units>()),
+        max_work_group_size(queue.get_device().template get_info<cl::sycl::info::device::max_work_group_size>()),
+        local_mem_size(queue.get_device().template get_info<cl::sycl::info::device::local_mem_size>()),
+        platform_name(queue.get_device().get_platform().template get_info<cl::sycl::info::platform::name>()),
+        device_name(queue.get_device().template get_info<cl::sycl::info::device::name>()),
+        device_vendor(queue.get_device().template get_info<cl::sycl::info::device::vendor>()) {}
+
+  cl::sycl::info::local_mem_type local_mem_type;
+  cl::sycl::id<3> max_work_item_sizes;
+  unsigned long max_mem_alloc_size;
+  unsigned long max_compute_units;
+  unsigned long max_work_group_size;
+  size_t local_mem_size;
+  std::string platform_name;
+  std::string device_name;
+  std::string device_vendor;
+};
+
+}  // end namespace internal
+}  // end namespace TensorSycl
+
+// All devices (even AMD CPU with intel OpenCL runtime) that support OpenCL and
+// can consume SPIR or SPIRV can use the Eigen SYCL backend and consequently
+// TensorFlow via the Eigen SYCL Backend.
+EIGEN_STRONG_INLINE auto get_sycl_supported_devices() -> decltype(cl::sycl::device::get_devices()) {
+#ifdef EIGEN_SYCL_USE_DEFAULT_SELECTOR
+  return {cl::sycl::device(cl::sycl::default_selector())};
+#else
+  std::vector<cl::sycl::device> supported_devices;
+  auto platform_list = cl::sycl::platform::get_platforms();
+  for (const auto &platform : platform_list) {
+    auto device_list = platform.get_devices();
+    auto platform_name = platform.template get_info<cl::sycl::info::platform::name>();
+    std::transform(platform_name.begin(), platform_name.end(), platform_name.begin(), ::tolower);
+    for (const auto &device : device_list) {
+      auto vendor = device.template get_info<cl::sycl::info::device::vendor>();
+      std::transform(vendor.begin(), vendor.end(), vendor.begin(), ::tolower);
+      bool unsupported_condition = (device.is_cpu() && platform_name.find("amd") != std::string::npos &&
+                                    vendor.find("apu") == std::string::npos) ||
+                                   (platform_name.find("experimental") != std::string::npos) || device.is_host();
+      if (!unsupported_condition) {
+        supported_devices.push_back(device);
+      }
+    }
+  }
+  return supported_devices;
+#endif
+}
+
+class QueueInterface {
+ public:
+  /// Creating device by using cl::sycl::selector or cl::sycl::device.
+  template <typename DeviceOrSelector>
+  explicit QueueInterface(const DeviceOrSelector &dev_or_sel, cl::sycl::async_handler handler,
+                          unsigned num_threads = std::thread::hardware_concurrency())
+      : m_queue{dev_or_sel, handler, {sycl::property::queue::in_order()}},
+        m_thread_pool(num_threads),
+        m_device_info(m_queue) {}
+
+  template <typename DeviceOrSelector>
+  explicit QueueInterface(const DeviceOrSelector &dev_or_sel,
+                          unsigned num_threads = std::thread::hardware_concurrency())
+      : QueueInterface(
+            dev_or_sel, [this](cl::sycl::exception_list l) { this->exception_caught_ = this->sycl_async_handler(l); },
+            num_threads) {}
+
+  explicit QueueInterface(const cl::sycl::queue &q, unsigned num_threads = std::thread::hardware_concurrency())
+      : m_queue(q), m_thread_pool(num_threads), m_device_info(m_queue) {}
+
+  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const {
+#if EIGEN_MAX_ALIGN_BYTES > 0
+    return (void *)cl::sycl::aligned_alloc_device(EIGEN_MAX_ALIGN_BYTES, num_bytes, m_queue);
+#else
+    return (void *)cl::sycl::malloc_device(num_bytes, m_queue);
+#endif
+  }
+
+  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const {
+    return (void *)cl::sycl::malloc_device<uint8_t>(num_bytes, m_queue);
+  }
+
+  template <typename data_t>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const {
+    return data;
+  }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(void *p) const { deallocate(p); }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(const void *p) const { deallocate_temp(const_cast<void *>(p)); }
+
+  EIGEN_STRONG_INLINE void deallocate(void *p) const { cl::sycl::free(p, m_queue); }
+
+  /// The memcpyHostToDevice is used to copy the data from host to device
+  /// The destination pointer could be deleted before the copy happened which is
+  /// why a callback function is needed. By default if none is provided, the
+  /// function is blocking.
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(void *dst, const void *src, size_t n,
+                                              std::function<void()> callback) const {
+    auto e = m_queue.memcpy(dst, src, n);
+    synchronize_and_callback(e, callback);
+  }
+
+  /// The memcpyDeviceToHost is used to copy the data from device to host.
+  /// The source pointer could be deleted before the copy happened which is
+  /// why a callback function is needed. By default if none is provided, the
+  /// function is blocking.
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const void *src, size_t n,
+                                              std::function<void()> callback) const {
+    if (n == 0) {
+      if (callback) callback();
+      return;
+    }
+    auto e = m_queue.memcpy(dst, src, n);
+    synchronize_and_callback(e, callback);
+  }
+
+  /// The memcpy function.
+  /// No callback is required here as both arguments are on the device
+  /// and SYCL can handle the dependency.
+  EIGEN_STRONG_INLINE void memcpy(void *dst, const void *src, size_t n) const {
+    if (n == 0) {
+      return;
+    }
+    m_queue.memcpy(dst, src, n).wait();
+  }
+
+  /// the memset function.
+  /// No callback is required here as both arguments are on the device
+  /// and SYCL can handle the dependency.
+  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const {
+    if (n == 0) {
+      return;
+    }
+    m_queue.memset(data, c, n).wait();
+  }
+
+  template <typename T>
+  EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
+    if (begin == end) {
+      return;
+    }
+    const size_t count = end - begin;
+    m_queue.fill(begin, value, count).wait();
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename Lhs, typename Rhs, typename OutPtr, typename Range,
+            typename Index, typename... T>
+  EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(const Lhs &lhs, const Rhs &rhs, OutPtr outptr,
+                                                             Range thread_range, Index scratchSize, T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(thread_range, sycl_kernel(scratch, lhs, rhs, outptr, var...));
+    };
+
+    return m_queue.submit(kernel_functor);
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename InPtr, typename OutPtr, typename Range, typename Index,
+            typename... T>
+  EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(const InPtr &inptr, OutPtr &outptr, Range thread_range,
+                                                            Index scratchSize, T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(thread_range, sycl_kernel(scratch, inptr, outptr, var...));
+    };
+    return m_queue.submit(kernel_functor);
+  }
+
+  template <typename OutScalar, typename sycl_kernel, typename InPtr, typename Range, typename Index, typename... T>
+  EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(const InPtr &inptr, Range thread_range, Index scratchSize,
+                                                              T... var) const {
+    auto kernel_functor = [=](cl::sycl::handler &cgh) {
+      typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+          LocalAccessor;
+
+      LocalAccessor scratch(cl::sycl::range<1>(scratchSize), cgh);
+      cgh.parallel_for(thread_range, sycl_kernel(scratch, inptr, var...));
+    };
+
+    return m_queue.submit(kernel_functor);
+  }
+
+  EIGEN_STRONG_INLINE void synchronize() const {
+#ifdef EIGEN_EXCEPTIONS
+    m_queue.wait_and_throw();
+#else
+    m_queue.wait();
+#endif
+  }
+
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
+    tileSize = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+    tileSize = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1),
+                        static_cast<Index>(tileSize));
+    rng = n;
+    if (rng == 0) rng = static_cast<Index>(1);
+    GRange = rng;
+    if (tileSize > GRange)
+      tileSize = GRange;
+    else if (GRange > tileSize) {
+      Index xMode = static_cast<Index>(GRange % tileSize);
+      if (xMode != 0) GRange += static_cast<Index>(tileSize - xMode);
+    }
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+                                              cl::sycl::range<2> &local_range) const {
+    std::array<Index, 2> input_range = input_dim;
+    Index max_workgroup_Size = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+    max_workgroup_Size = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1),
+                                  static_cast<Index>(max_workgroup_Size));
+    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+    local_range[1] = static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
+    input_range[1] = input_dim[1];
+    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[1] = input_range[1];
+    if (local_range[1] > global_range[1])
+      local_range[1] = global_range[1];
+    else if (global_range[1] > local_range[1]) {
+      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+      if (xMode != 0) global_range[1] += static_cast<Index>(local_range[1] - xMode);
+    }
+    local_range[0] = static_cast<Index>(max_workgroup_Size / local_range[1]);
+    input_range[0] = input_dim[0];
+    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+    global_range[0] = input_range[0];
+    if (local_range[0] > global_range[0])
+      local_range[0] = global_range[0];
+    else if (global_range[0] > local_range[0]) {
+      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+      if (xMode != 0) global_range[0] += static_cast<Index>(local_range[0] - xMode);
+    }
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+                                              cl::sycl::range<3> &local_range) const {
+    std::array<Index, 3> input_range = input_dim;
+    Index max_workgroup_Size = static_cast<Index>(getNearestPowerOfTwoWorkGroupSize());
+    max_workgroup_Size = std::min(static_cast<Index>(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1),
+                                  static_cast<Index>(max_workgroup_Size));
+    Index pow_of_2 = static_cast<Index>(std::log2(max_workgroup_Size));
+    local_range[2] = static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 3)));
+    input_range[2] = input_dim[2];
+    if (input_range[2] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[2] = input_range[2];
+    if (local_range[2] > global_range[2])
+      local_range[2] = global_range[2];
+    else if (global_range[2] > local_range[2]) {
+      Index xMode = static_cast<Index>(global_range[2] % local_range[2]);
+      if (xMode != 0) global_range[2] += static_cast<Index>(local_range[2] - xMode);
+    }
+    pow_of_2 = static_cast<Index>(std::log2(static_cast<Index>(max_workgroup_Size / local_range[2])));
+    local_range[1] = static_cast<Index>(std::pow(2, static_cast<Index>(pow_of_2 / 2)));
+    input_range[1] = input_dim[1];
+    if (input_range[1] == 0) input_range[1] = static_cast<Index>(1);
+    global_range[1] = input_range[1];
+    if (local_range[1] > global_range[1])
+      local_range[1] = global_range[1];
+    else if (global_range[1] > local_range[1]) {
+      Index xMode = static_cast<Index>(global_range[1] % local_range[1]);
+      if (xMode != 0) global_range[1] += static_cast<Index>(local_range[1] - xMode);
+    }
+    local_range[0] = static_cast<Index>(max_workgroup_Size / (local_range[1] * local_range[2]));
+    input_range[0] = input_dim[0];
+    if (input_range[0] == 0) input_range[0] = static_cast<Index>(1);
+    global_range[0] = input_range[0];
+    if (local_range[0] > global_range[0])
+      local_range[0] = global_range[0];
+    else if (global_range[0] > local_range[0]) {
+      Index xMode = static_cast<Index>(global_range[0] % local_range[0]);
+      if (xMode != 0) global_range[0] += static_cast<Index>(local_range[0] - xMode);
+    }
+  }
+
+  EIGEN_STRONG_INLINE bool has_local_memory() const {
+#if !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
+    return false;
+#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
+    return true;
+#else
+    return m_device_info.local_mem_type == cl::sycl::info::local_mem_type::local;
+#endif
+  }
+
+  EIGEN_STRONG_INLINE unsigned long max_buffer_size() const { return m_device_info.max_mem_alloc_size; }
+
+  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const { return m_device_info.max_compute_units; }
+
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { return m_device_info.max_work_group_size; }
+
+  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { return m_device_info.max_work_item_sizes; }
+
+  /// No need for sycl it should act the same as CPU version
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return 1; }
+
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
+    // OpenCL does not have such a concept
+    return 2;
+  }
+
+  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { return m_device_info.local_mem_size; }
+
+  // This function returns the nearest power of 2 Work-group size which is <=
+  // maximum device workgroup size.
+  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
+    return getPowerOfTwo(m_device_info.max_work_group_size, false);
+  }
+
+  EIGEN_STRONG_INLINE std::string getPlatformName() const { return m_device_info.platform_name; }
+
+  EIGEN_STRONG_INLINE std::string getDeviceName() const { return m_device_info.device_name; }
+
+  EIGEN_STRONG_INLINE std::string getDeviceVendor() const { return m_device_info.device_vendor; }
+
+  // This function returns the nearest power of 2
+  // if roundup is true returns result>=wgsize
+  // else it return result <= wgsize
+  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t wGSize, bool roundUp) const {
+    if (roundUp) --wGSize;
+    wGSize |= (wGSize >> 1);
+    wGSize |= (wGSize >> 2);
+    wGSize |= (wGSize >> 4);
+    wGSize |= (wGSize >> 8);
+    wGSize |= (wGSize >> 16);
+#if EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64 || EIGEN_OS_WIN64
+    wGSize |= (wGSize >> 32);
+#endif
+    return ((!roundUp) ? (wGSize - (wGSize >> 1)) : ++wGSize);
+  }
+
+  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return m_queue; }
+
+  // This function checks if the runtime recorded an error for the
+  // underlying stream device.
+  EIGEN_STRONG_INLINE bool ok() const {
+    if (!exception_caught_) {
+      synchronize();
+    }
+    return !exception_caught_;
+  }
+
+ protected:
+  void synchronize_and_callback(cl::sycl::event e, const std::function<void()> &callback) const {
+    if (callback) {
+      auto callback_ = [=]() {
+#ifdef EIGEN_EXCEPTIONS
+        cl::sycl::event(e).wait_and_throw();
+#else
+        cl::sycl::event(e).wait();
+#endif
+        callback();
+      };
+      m_thread_pool.Schedule(std::move(callback_));
+    } else {
+#ifdef EIGEN_EXCEPTIONS
+      m_queue.wait_and_throw();
+#else
+      m_queue.wait();
+#endif
+    }
+  }
+
+  bool sycl_async_handler(cl::sycl::exception_list exceptions) const {
+    bool exception_caught = false;
+    for (const auto &e : exceptions) {
+      if (e) {
+        exception_caught = true;
+        EIGEN_THROW_X(e);
+      }
+    }
+    return exception_caught;
+  }
+
+  /// class members:
+  bool exception_caught_ = false;
+  /// sycl queue
+  mutable cl::sycl::queue m_queue;
+  /// The thread pool is used to wait on events and call callbacks
+  /// asynchronously
+  mutable Eigen::ThreadPool m_thread_pool;
+
+  const TensorSycl::internal::SyclDeviceInfo m_device_info;
+};
+
+struct SyclDeviceBase {
+  /// QueueInterface is not owned. it is the caller's responsibility to destroy
+  /// it
+  const QueueInterface *m_queue_stream;
+  explicit SyclDeviceBase(const QueueInterface *queue_stream) : m_queue_stream(queue_stream) {}
+  EIGEN_STRONG_INLINE const QueueInterface *queue_stream() const { return m_queue_stream; }
+};
+
+// Here is a sycl device struct which accept the sycl queue interface
+// as an input
+struct SyclDevice : public SyclDeviceBase {
+  explicit SyclDevice(const QueueInterface *queue_stream) : SyclDeviceBase(queue_stream) {}
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(Index n, Index &tileSize, Index &rng, Index &GRange) const {
+    queue_stream()->parallel_for_setup(n, tileSize, rng, GRange);
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 2> &input_dim, cl::sycl::range<2> &global_range,
+                                              cl::sycl::range<2> &local_range) const {
+    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
+  }
+
+  /// This is used to prepare the number of threads and also the number of
+  /// threads per block for sycl kernels
+  template <typename Index>
+  EIGEN_STRONG_INLINE void parallel_for_setup(const std::array<Index, 3> &input_dim, cl::sycl::range<3> &global_range,
+                                              cl::sycl::range<3> &local_range) const {
+    queue_stream()->parallel_for_setup(input_dim, global_range, local_range);
+  }
+
+  /// allocate device memory
+  EIGEN_STRONG_INLINE void *allocate(size_t num_bytes) const { return queue_stream()->allocate(num_bytes); }
+
+  EIGEN_STRONG_INLINE void *allocate_temp(size_t num_bytes) const { return queue_stream()->allocate_temp(num_bytes); }
+
+  /// deallocate device memory
+  EIGEN_STRONG_INLINE void deallocate(void *p) const { queue_stream()->deallocate(p); }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(void *buffer) const { queue_stream()->deallocate_temp(buffer); }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(const void *buffer) const { queue_stream()->deallocate_temp(buffer); }
+
+  template <typename data_t>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE data_t *get(data_t *data) const {
+    return data;
+  }
+
+  // some runtime conditions that can be applied here
+  EIGEN_STRONG_INLINE bool isDeviceSuitable() const { return true; }
+
+  /// memcpyHostToDevice
+  template <typename Index>
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(Index *dst, const Index *src, size_t n,
+                                              std::function<void()> callback = {}) const {
+    queue_stream()->memcpyHostToDevice(dst, src, n, callback);
+  }
+  /// memcpyDeviceToHost
+  template <typename Index>
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void *dst, const Index *src, size_t n,
+                                              std::function<void()> callback = {}) const {
+    queue_stream()->memcpyDeviceToHost(dst, src, n, callback);
+  }
+  /// the memcpy function
+  template <typename Index>
+  EIGEN_STRONG_INLINE void memcpy(void *dst, const Index *src, size_t n) const {
+    queue_stream()->memcpy(dst, src, n);
+  }
+  /// the memset function
+  EIGEN_STRONG_INLINE void memset(void *data, int c, size_t n) const { queue_stream()->memset(data, c, n); }
+  /// the fill function
+  template <typename T>
+  EIGEN_STRONG_INLINE void fill(T *begin, T *end, const T &value) const {
+    queue_stream()->fill(begin, end, value);
+  }
+  /// returning the sycl queue
+  EIGEN_STRONG_INLINE cl::sycl::queue &sycl_queue() const { return queue_stream()->sycl_queue(); }
+
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return 48 * 1024; }
+
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // We won't try to take advantage of the l2 cache for the time being, and
+    // there is no l3 cache on sycl devices.
+    return firstLevelCacheSize();
+  }
+  EIGEN_STRONG_INLINE unsigned long getNumSyclMultiProcessors() const {
+    return queue_stream()->getNumSyclMultiProcessors();
+  }
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerBlock() const { return queue_stream()->maxSyclThreadsPerBlock(); }
+  EIGEN_STRONG_INLINE cl::sycl::id<3> maxWorkItemSizes() const { return queue_stream()->maxWorkItemSizes(); }
+  EIGEN_STRONG_INLINE unsigned long maxSyclThreadsPerMultiProcessor() const {
+    // OpenCL does not have such a concept
+    return queue_stream()->maxSyclThreadsPerMultiProcessor();
+  }
+  EIGEN_STRONG_INLINE size_t sharedMemPerBlock() const { return queue_stream()->sharedMemPerBlock(); }
+  EIGEN_STRONG_INLINE size_t getNearestPowerOfTwoWorkGroupSize() const {
+    return queue_stream()->getNearestPowerOfTwoWorkGroupSize();
+  }
+
+  EIGEN_STRONG_INLINE size_t getPowerOfTwo(size_t val, bool roundUp) const {
+    return queue_stream()->getPowerOfTwo(val, roundUp);
+  }
+  /// No need for sycl it should act the same as CPU version
+  EIGEN_STRONG_INLINE int majorDeviceVersion() const { return queue_stream()->majorDeviceVersion(); }
+
+  EIGEN_STRONG_INLINE void synchronize() const { queue_stream()->synchronize(); }
+
+  // This function checks if the runtime recorded an error for the
+  // underlying stream device.
+  EIGEN_STRONG_INLINE bool ok() const { return queue_stream()->ok(); }
+
+  EIGEN_STRONG_INLINE bool has_local_memory() const { return queue_stream()->has_local_memory(); }
+  EIGEN_STRONG_INLINE long max_buffer_size() const { return queue_stream()->max_buffer_size(); }
+  EIGEN_STRONG_INLINE std::string getPlatformName() const { return queue_stream()->getPlatformName(); }
+  EIGEN_STRONG_INLINE std::string getDeviceName() const { return queue_stream()->getDeviceName(); }
+  EIGEN_STRONG_INLINE std::string getDeviceVendor() const { return queue_stream()->getDeviceVendor(); }
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE cl::sycl::event binary_kernel_launcher(T... var) const {
+    return queue_stream()->template binary_kernel_launcher<OutScalar, KernelType>(var...);
+  }
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE cl::sycl::event unary_kernel_launcher(T... var) const {
+    return queue_stream()->template unary_kernel_launcher<OutScalar, KernelType>(var...);
+  }
+
+  template <typename OutScalar, typename KernelType, typename... T>
+  EIGEN_ALWAYS_INLINE cl::sycl::event nullary_kernel_launcher(T... var) const {
+    return queue_stream()->template nullary_kernel_launcher<OutScalar, KernelType>(var...);
+  }
+};
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_SYCL_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
new file mode 100644
index 00000000..3320990a
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -0,0 +1,346 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#if defined(EIGEN_USE_THREADS) && !defined(EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H)
+#define EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// An abstract interface to a device specific memory allocator.
+class Allocator {
+ public:
+  virtual ~Allocator() {}
+  virtual void* allocate(size_t num_bytes) const = 0;
+  virtual void deallocate(void* buffer) const = 0;
+};
+
+// Build a thread pool device on top the an existing pool of threads.
+struct ThreadPoolDevice {
+  // The ownership of the thread pool remains with the caller.
+  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores, Allocator* allocator = nullptr)
+      : pool_(pool), num_threads_(num_cores), allocator_(allocator) {}
+
+  EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
+    return allocator_ ? allocator_->allocate(num_bytes) : internal::aligned_malloc(num_bytes);
+  }
+
+  EIGEN_STRONG_INLINE void deallocate(void* buffer) const {
+    if (allocator_) {
+      allocator_->deallocate(buffer);
+    } else {
+      internal::aligned_free(buffer);
+    }
+  }
+
+  EIGEN_STRONG_INLINE void* allocate_temp(size_t num_bytes) const { return allocate(num_bytes); }
+
+  EIGEN_STRONG_INLINE void deallocate_temp(void* buffer) const { deallocate(buffer); }
+
+  template <typename Type>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Type get(Type data) const {
+    return data;
+  }
+
+  EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
+#ifdef __ANDROID__
+    ::memcpy(dst, src, n);
+#else
+    // TODO(rmlarsen): Align blocks on cache lines.
+    // We have observed that going beyond 4 threads usually just wastes
+    // CPU cycles due to the threads competing for memory bandwidth, so we
+    // statically schedule at most 4 block copies here.
+    const size_t kMinBlockSize = 32768;
+    const size_t num_threads = CostModel::numThreads(n, TensorOpCost(1.0, 1.0, 0), 4);
+    if (n <= kMinBlockSize || num_threads < 2) {
+      ::memcpy(dst, src, n);
+    } else {
+      const char* src_ptr = static_cast<const char*>(src);
+      char* dst_ptr = static_cast<char*>(dst);
+      const size_t blocksize = (n + (num_threads - 1)) / num_threads;
+      Barrier barrier(static_cast<int>(num_threads - 1));
+      // Launch the last 3 blocks on worker threads.
+      for (size_t i = 1; i < num_threads; ++i) {
+        pool_->Schedule([n, i, src_ptr, dst_ptr, blocksize, &barrier] {
+          ::memcpy(dst_ptr + i * blocksize, src_ptr + i * blocksize, numext::mini(blocksize, n - (i * blocksize)));
+          barrier.Notify();
+        });
+      }
+      // Launch the first block on the main thread.
+      ::memcpy(dst_ptr, src_ptr, blocksize);
+      barrier.Wait();
+    }
+#endif
+  }
+  EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { memcpy(dst, src, n); }
+  EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { memcpy(dst, src, n); }
+
+  EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { ::memset(buffer, c, n); }
+
+  template <typename T>
+  EIGEN_STRONG_INLINE void fill(T* begin, T* end, const T& value) const {
+    std::fill(begin, end, value);
+  }
+
+  EIGEN_STRONG_INLINE int numThreads() const { return num_threads_; }
+
+  // Number of theads available in the underlying thread pool. This number can
+  // be different from the value returned by numThreads().
+  EIGEN_STRONG_INLINE int numThreadsInPool() const { return pool_->NumThreads(); }
+
+  EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { return l1CacheSize(); }
+
+  EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const {
+    // The l3 cache size is shared between all the cores.
+    return l3CacheSize() / num_threads_;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void synchronize() const {
+    // Nothing.  Threadpool device operations are synchronous.
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const {
+    // Should return an enum that encodes the ISA supported by the CPU
+    return 1;
+  }
+
+  // TODO(rmlarsen): Remove this deprecated interface when all users have been converted.
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
+    enqueue(std::forward<Function>(f), std::forward<Args>(args)...);
+  }
+
+  template <class Function, class... Args>
+  EIGEN_STRONG_INLINE void enqueue(Function&& f, Args&&... args) const {
+#if EIGEN_COMP_CXXVER >= 20
+    if constexpr (sizeof...(args) > 0) {
+      auto run_f = [f = std::forward<Function>(f), ... args = std::forward<Args>(args)]() { f(args...); };
+#else
+    if (sizeof...(args) > 0) {
+      auto run_f = [f = std::forward<Function>(f), &args...]() { f(args...); };
+#endif
+      pool_->Schedule(std::move(run_f));
+    } else {
+      pool_->Schedule(std::forward<Function>(f));
+    }
+  }
+
+  // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
+  // called from one of the threads in pool_. Returns -1 otherwise.
+  EIGEN_STRONG_INLINE int currentThreadId() const { return pool_->CurrentThreadId(); }
+
+  // WARNING: This function is synchronous and will block the calling thread.
+  //
+  // Synchronous parallelFor executes f with [0, n) arguments in parallel and
+  // waits for completion. F accepts a half-open interval [first, last). Block
+  // size is chosen based on the iteration cost and resulting parallel
+  // efficiency. If block_align is not nullptr, it is called to round up the
+  // block size.
+  void parallelFor(Index n, const TensorOpCost& cost, std::function<Index(Index)> block_align,
+                   std::function<void(Index, Index)> f) const {
+    if (EIGEN_PREDICT_FALSE(n <= 0)) {
+      return;
+      // Compute small problems directly in the caller thread.
+    } else if (n == 1 || numThreads() == 1 || CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
+      f(0, n);
+      return;
+    }
+
+    // Compute block size and total count of blocks.
+    ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
+
+    // Recursively divide size into halves until we reach block_size.
+    // Division code rounds mid to block_size, so we are guaranteed to get
+    // block_count leaves that do actual computations.
+    Barrier barrier(static_cast<unsigned int>(block.count));
+    if (block.count <= numThreads()) {
+      // Avoid a thread hop by running the root of the tree and one block on the
+      // main thread.
+      handleRange(0, n, block.size, &barrier, pool_, f);
+    } else {
+      // Execute the root in the thread pool to avoid running work on more than
+      // numThreads() threads.
+      pool_->Schedule([this, n, &block, &barrier, &f]() { handleRange(0, n, block.size, &barrier, pool_, f); });
+    }
+
+    barrier.Wait();
+  }
+
+  // Convenience wrapper for parallelFor that does not align blocks.
+  void parallelFor(Index n, const TensorOpCost& cost, std::function<void(Index, Index)> f) const {
+    parallelFor(n, cost, nullptr, std::move(f));
+  }
+
+  // WARNING: This function is asynchronous and will not block the calling thread.
+  //
+  // Asynchronous parallelFor executes f with [0, n) arguments in parallel
+  // without waiting for completion. When the last block finished, it will call
+  // 'done' callback. F accepts a half-open interval [first, last). Block size
+  // is chosen based on the iteration cost and resulting parallel efficiency. If
+  // block_align is not nullptr, it is called to round up the block size.
+  void parallelForAsync(Index n, const TensorOpCost& cost, std::function<Index(Index)> block_align,
+                        std::function<void(Index, Index)> f, std::function<void()> done) const {
+    // Compute small problems directly in the caller thread.
+    if (n <= 1 || numThreads() == 1 || CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
+      f(0, n);
+      done();
+      return;
+    }
+
+    // Compute block size and total count of blocks.
+    ParallelForBlock block = CalculateParallelForBlock(n, cost, block_align);
+
+    ParallelForAsyncContext* const ctx = new ParallelForAsyncContext(block.count, std::move(f), std::move(done));
+
+    // Recursively divide size into halves until we reach block_size.
+    // Division code rounds mid to block_size, so we are guaranteed to get
+    // block_count leaves that do actual computations.
+    ctx->handle_range = [this, ctx, block](Index firstIdx, Index lastIdx) {
+      while (lastIdx - firstIdx > block.size) {
+        // Split into halves and schedule the second half on a different thread.
+        const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, block.size) * block.size;
+        pool_->Schedule([ctx, midIdx, lastIdx]() { ctx->handle_range(midIdx, lastIdx); });
+        lastIdx = midIdx;
+      }
+
+      // Single block or less, execute directly.
+      ctx->f(firstIdx, lastIdx);
+
+      // Delete async context if it was the last block.
+      if (ctx->count.fetch_sub(1) == 1) delete ctx;
+    };
+
+    if (block.count <= numThreads()) {
+      // Avoid a thread hop by running the root of the tree and one block on the
+      // main thread.
+      ctx->handle_range(0, n);
+    } else {
+      // Execute the root in the thread pool to avoid running work on more than
+      // numThreads() threads.
+      pool_->Schedule([ctx, n]() { ctx->handle_range(0, n); });
+    }
+  }
+
+  // Convenience wrapper for parallelForAsync that does not align blocks.
+  void parallelForAsync(Index n, const TensorOpCost& cost, std::function<void(Index, Index)> f,
+                        std::function<void()> done) const {
+    parallelForAsync(n, cost, nullptr, std::move(f), std::move(done));
+  }
+
+  // Thread pool accessor.
+  ThreadPoolInterface* getPool() const { return pool_; }
+
+  // Allocator accessor.
+  Allocator* allocator() const { return allocator_; }
+
+ private:
+  typedef TensorCostModel<ThreadPoolDevice> CostModel;
+
+  static void handleRange(Index firstIdx, Index lastIdx, Index granularity, Barrier* barrier, ThreadPoolInterface* pool,
+                          const std::function<void(Index, Index)>& f) {
+    while (lastIdx - firstIdx > granularity) {
+      // Split into halves and schedule the second half on a different thread.
+      const Index midIdx = firstIdx + numext::div_ceil((lastIdx - firstIdx) / 2, granularity) * granularity;
+      pool->Schedule([=, &f]() { handleRange(midIdx, lastIdx, granularity, barrier, pool, f); });
+      lastIdx = midIdx;
+    }
+    // Single block or less, execute directly.
+    f(firstIdx, lastIdx);
+    barrier->Notify();
+  }
+
+  // For parallelForAsync we must keep passed in closures on the heap, and
+  // delete them only after `done` callback finished.
+  struct ParallelForAsyncContext {
+    ParallelForAsyncContext(Index block_count, std::function<void(Index, Index)> block_f,
+                            std::function<void()> done_callback)
+        : count(block_count), f(std::move(block_f)), done(std::move(done_callback)) {}
+    ~ParallelForAsyncContext() { done(); }
+
+    std::atomic<Index> count;
+    std::function<void(Index, Index)> f;
+    std::function<void()> done;
+
+    std::function<void(Index, Index)> handle_range;
+  };
+
+  struct ParallelForBlock {
+    Index size;   // block size
+    Index count;  // number of blocks
+  };
+
+  // Calculates block size based on (1) the iteration cost and (2) parallel
+  // efficiency. We want blocks to be not too small to mitigate parallelization
+  // overheads; not too large to mitigate tail effect and potential load
+  // imbalance and we also want number of blocks to be evenly dividable across
+  // threads.
+  ParallelForBlock CalculateParallelForBlock(const Index n, const TensorOpCost& cost,
+                                             std::function<Index(Index)> block_align) const {
+    const double block_size_f = 1.0 / CostModel::taskSize(1, cost);
+    const Index max_oversharding_factor = 4;
+    Index block_size = numext::mini(
+        n, numext::maxi<Index>(numext::div_ceil<Index>(n, max_oversharding_factor * numThreads()), block_size_f));
+    const Index max_block_size = numext::mini(n, 2 * block_size);
+
+    if (block_align) {
+      Index new_block_size = block_align(block_size);
+      eigen_assert(new_block_size >= block_size);
+      block_size = numext::mini(n, new_block_size);
+    }
+
+    Index block_count = numext::div_ceil(n, block_size);
+
+    // Calculate parallel efficiency as fraction of total CPU time used for
+    // computations:
+    double max_efficiency =
+        static_cast<double>(block_count) / (numext::div_ceil<Index>(block_count, numThreads()) * numThreads());
+
+    // Now try to increase block size up to max_block_size as long as it
+    // doesn't decrease parallel efficiency.
+    for (Index prev_block_count = block_count; max_efficiency < 1.0 && prev_block_count > 1;) {
+      // This is the next block size that divides size into a smaller number
+      // of blocks than the current block_size.
+      Index coarser_block_size = numext::div_ceil(n, prev_block_count - 1);
+      if (block_align) {
+        Index new_block_size = block_align(coarser_block_size);
+        eigen_assert(new_block_size >= coarser_block_size);
+        coarser_block_size = numext::mini(n, new_block_size);
+      }
+      if (coarser_block_size > max_block_size) {
+        break;  // Reached max block size. Stop.
+      }
+      // Recalculate parallel efficiency.
+      const Index coarser_block_count = numext::div_ceil(n, coarser_block_size);
+      eigen_assert(coarser_block_count < prev_block_count);
+      prev_block_count = coarser_block_count;
+      const double coarser_efficiency = static_cast<double>(coarser_block_count) /
+                                        (numext::div_ceil<Index>(coarser_block_count, numThreads()) * numThreads());
+      if (coarser_efficiency + 0.01 >= max_efficiency) {
+        // Taking it.
+        block_size = coarser_block_size;
+        block_count = coarser_block_count;
+        if (max_efficiency < coarser_efficiency) {
+          max_efficiency = coarser_efficiency;
+        }
+      }
+    }
+
+    return {block_size, block_count};
+  }
+
+  ThreadPoolInterface* pool_;
+  int num_threads_;
+  Allocator* allocator_;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_DEVICE_THREAD_POOL_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
new file mode 100644
index 00000000..8f2e5af6
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h
@@ -0,0 +1,117 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \internal
+ *
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Special case of tensor index list used to list all the dimensions of a tensor of rank n.
+ *
+ * \sa Tensor
+ */
+template <typename Index, std::size_t Rank>
+struct DimensionList {
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const Index operator[](const Index i) const { return i; }
+};
+
+namespace internal {
+
+template <typename Index, std::size_t Rank>
+struct array_size<DimensionList<Index, Rank> > {
+  static const size_t value = Rank;
+};
+template <typename Index, std::size_t Rank>
+struct array_size<const DimensionList<Index, Rank> > {
+  static const size_t value = Rank;
+};
+
+template <DenseIndex n, typename Index, std::size_t Rank>
+const Index array_get(DimensionList<Index, Rank>&) {
+  return n;
+}
+template <DenseIndex n, typename Index, std::size_t Rank>
+const Index array_get(const DimensionList<Index, Rank>&) {
+  return n;
+}
+
+template <typename Index, std::size_t Rank>
+struct index_known_statically_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { return true; }
+};
+template <typename Index, std::size_t Rank>
+struct index_known_statically_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { return true; }
+};
+
+template <typename Index, std::size_t Rank>
+struct all_indices_known_statically_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() { return true; }
+};
+template <typename Index, std::size_t Rank>
+struct all_indices_known_statically_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() { return true; }
+};
+
+template <typename Index, std::size_t Rank>
+struct indices_statically_known_to_increase_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() { return true; }
+};
+template <typename Index, std::size_t Rank>
+struct indices_statically_known_to_increase_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run() { return true; }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_eq_impl<DimensionList<Index, Rank> > {
+  static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i == value; }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_eq_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i == value; }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_ne_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i != value; }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_ne_impl<const DimensionList<Index, Rank> > {
+  static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i != value; }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_gt_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i > value; }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_gt_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i > value; }
+};
+
+template <typename Index, std::size_t Rank>
+struct index_statically_lt_impl<DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i < value; }
+};
+template <typename Index, std::size_t Rank>
+struct index_statically_lt_impl<const DimensionList<Index, Rank> > {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { return i < value; }
+};
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_DIMENSION_LIST_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
new file mode 100644
index 00000000..e20052c9
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h
@@ -0,0 +1,334 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// Boilerplate code
+namespace internal {
+
+template <std::ptrdiff_t n, typename Dimension>
+struct dget {
+  static const std::ptrdiff_t value = get<n, Dimension>::value;
+};
+
+template <typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
+struct fixed_size_tensor_index_linearization_helper {
+  template <typename Dimensions>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const& indices,
+                                                         const Dimensions& dimensions) {
+    return array_get < RowMajor                             ? n - 1
+           : (NumIndices - n) > (indices) + dget < RowMajor ? n - 1
+                                                            : (NumIndices - n),
+           Dimensions > ::value * fixed_size_tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(
+                                      indices, dimensions);
+  }
+};
+
+template <typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
+struct fixed_size_tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor> {
+  template <typename Dimensions>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const&, const Dimensions&) {
+    return 0;
+  }
+};
+
+template <typename Index, std::ptrdiff_t n>
+struct fixed_size_tensor_index_extraction_helper {
+  template <typename Dimensions>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index run(const Index index, const Dimensions& dimensions) {
+    const Index mult = (index == n - 1) ? 1 : 0;
+    return array_get<n - 1>(dimensions) * mult +
+           fixed_size_tensor_index_extraction_helper<Index, n - 1>::run(index, dimensions);
+  }
+};
+
+template <typename Index>
+struct fixed_size_tensor_index_extraction_helper<Index, 0> {
+  template <typename Dimensions>
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index run(const Index, const Dimensions&) {
+    return 0;
+  }
+};
+
+}  // end namespace internal
+
+/** \internal
+ *
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Fixed dimensions of a Tensor.
+ *
+ * The Sizes class encodes as part of the type the number of dimensions and the
+ * sizes corresponding to each dimension. It uses no storage space since it is
+ * entirely known at compile time.
+ *
+ * \sa Tensor
+ */
+template <typename std::ptrdiff_t... Indices>
+struct Sizes {
+  typedef internal::numeric_list<std::ptrdiff_t, Indices...> Base;
+  const Base t = Base();
+  static const std::ptrdiff_t total_size = internal::arg_prod(Indices...);
+  static const ptrdiff_t count = Base::count;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t rank() const { return Base::count; }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t TotalSize() { return internal::arg_prod(Indices...); }
+
+  EIGEN_DEVICE_FUNC Sizes() {}
+  template <typename DenseIndex>
+  explicit EIGEN_DEVICE_FUNC Sizes(const array<DenseIndex, Base::count>& /*indices*/) {
+    // todo: add assertion
+  }
+  template <typename... DenseIndex>
+  EIGEN_DEVICE_FUNC Sizes(DenseIndex...) {}
+  explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list<std::ptrdiff_t> /*l*/) {
+    // todo: add assertion
+  }
+
+  template <typename T>
+  Sizes& operator=(const T& /*other*/) {
+    // add assertion failure if the size of other is different
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t operator[](const std::ptrdiff_t index) const {
+    return internal::fixed_size_tensor_index_extraction_helper<std::ptrdiff_t, Base::count>::run(index, t);
+  }
+
+  template <typename DenseIndex>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t IndexOfColMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, false>::run(
+        indices, t);
+  }
+  template <typename DenseIndex>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ptrdiff_t IndexOfRowMajor(const array<DenseIndex, Base::count>& indices) const {
+    return internal::fixed_size_tensor_index_linearization_helper<DenseIndex, Base::count, Base::count, true>::run(
+        indices, t);
+  }
+};
+
+namespace internal {
+template <typename std::ptrdiff_t... Indices>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_prod(const Sizes<Indices...>&) {
+  return Sizes<Indices...>::total_size;
+}
+}  // namespace internal
+
+// Boilerplate
+namespace internal {
+template <typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
+struct tensor_index_linearization_helper {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const& indices,
+                                                         array<Index, NumIndices> const& dimensions) {
+    return array_get < RowMajor ? n
+           : (NumIndices - n - 1) > (indices) + array_get < RowMajor
+               ? n
+               : (NumIndices - n - 1) >
+                     (dimensions)*tensor_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(
+                         indices, dimensions);
+  }
+};
+
+template <typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
+struct tensor_index_linearization_helper<Index, NumIndices, 0, RowMajor> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const& indices,
+                                                         array<Index, NumIndices> const&) {
+    return array_get < RowMajor ? 0 : NumIndices - 1 > (indices);
+  }
+};
+}  // end namespace internal
+
+/** \internal
+ *
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Dynamic dimensions of a Tensor.
+ *
+ * The DSizes class is its dynamic sibling: the number of dimensions is known
+ * at compile time but the sizes are set during execution.
+ *
+ * \sa Tensor
+ */
+template <typename DenseIndex, int NumDims>
+struct DSizes : array<DenseIndex, NumDims> {
+  typedef array<DenseIndex, NumDims> Base;
+  static const int count = NumDims;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumDims; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex TotalSize() const {
+    return (NumDims == 0) ? 1 : internal::array_prod(*static_cast<const Base*>(this));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DSizes() {
+    for (int i = 0; i < NumDims; ++i) {
+      (*this)[i] = 0;
+    }
+  }
+  EIGEN_DEVICE_FUNC explicit DSizes(const array<DenseIndex, NumDims>& a) : Base(a) {}
+
+  EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0) {
+    eigen_assert(NumDims == 1);
+    (*this)[0] = i0;
+  }
+
+  EIGEN_DEVICE_FUNC DSizes(const DimensionList<DenseIndex, NumDims>& a) {
+    for (int i = 0; i < NumDims; ++i) {
+      (*this)[i] = a[i];
+    }
+  }
+
+  // Enable DSizes index type promotion only if we are promoting to the
+  // larger type, e.g. allow to promote dimensions of type int to long.
+  template <typename OtherIndex>
+  EIGEN_DEVICE_FUNC explicit DSizes(
+      const array<OtherIndex, NumDims>& other,
+      // Default template parameters require c++11.
+      std::enable_if_t<
+          internal::is_same<DenseIndex, typename internal::promote_index_type<DenseIndex, OtherIndex>::type>::value,
+          void*> = 0) {
+    for (int i = 0; i < NumDims; ++i) {
+      (*this)[i] = static_cast<DenseIndex>(other[i]);
+    }
+  }
+
+  template <typename FirstType, typename... OtherTypes>
+  EIGEN_DEVICE_FUNC explicit DSizes(const Eigen::IndexList<FirstType, OtherTypes...>& dimensions) {
+    for (int i = 0; i < dimensions.count; ++i) {
+      (*this)[i] = dimensions[i];
+    }
+  }
+
+  template <typename std::ptrdiff_t... Indices>
+  EIGEN_DEVICE_FUNC DSizes(const Sizes<Indices...>& a) {
+    for (int i = 0; i < NumDims; ++i) {
+      (*this)[i] = a[i];
+    }
+  }
+
+  template <typename... IndexTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension,
+                                                        IndexTypes... otherDimensions)
+      : Base({{firstDimension, secondDimension, otherDimensions...}}) {
+    EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE)
+  }
+
+  EIGEN_DEVICE_FUNC DSizes& operator=(const array<DenseIndex, NumDims>& other) {
+    *static_cast<Base*>(this) = other;
+    return *this;
+  }
+
+  // A constexpr would be so much better here
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfColMajor(const array<DenseIndex, NumDims>& indices) const {
+    return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, false>::run(
+        indices, *static_cast<const Base*>(this));
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex IndexOfRowMajor(const array<DenseIndex, NumDims>& indices) const {
+    return internal::tensor_index_linearization_helper<DenseIndex, NumDims, NumDims - 1, true>::run(
+        indices, *static_cast<const Base*>(this));
+  }
+};
+
+template <typename IndexType, int NumDims>
+std::ostream& operator<<(std::ostream& os, const DSizes<IndexType, NumDims>& dims) {
+  os << "[";
+  for (int i = 0; i < NumDims; ++i) {
+    if (i > 0) os << ", ";
+    os << dims[i];
+  }
+  os << "]";
+  return os;
+}
+
+// Boilerplate
+namespace internal {
+template <typename Index, std::ptrdiff_t NumIndices, std::ptrdiff_t n, bool RowMajor>
+struct tensor_vsize_index_linearization_helper {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const& indices,
+                                                         std::vector<DenseIndex> const& dimensions) {
+    return array_get < RowMajor ? n
+           : (NumIndices - n - 1) > (indices) + array_get < RowMajor
+               ? n
+               : (NumIndices - n - 1) >
+                     (dimensions)*tensor_vsize_index_linearization_helper<Index, NumIndices, n - 1, RowMajor>::run(
+                         indices, dimensions);
+  }
+};
+
+template <typename Index, std::ptrdiff_t NumIndices, bool RowMajor>
+struct tensor_vsize_index_linearization_helper<Index, NumIndices, 0, RowMajor> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index run(array<Index, NumIndices> const& indices,
+                                                         std::vector<DenseIndex> const&) {
+    return array_get < RowMajor ? 0 : NumIndices - 1 > (indices);
+  }
+};
+}  // end namespace internal
+
+namespace internal {
+
+template <typename DenseIndex, int NumDims>
+struct array_size<const DSizes<DenseIndex, NumDims> > {
+  static const ptrdiff_t value = NumDims;
+};
+template <typename DenseIndex, int NumDims>
+struct array_size<DSizes<DenseIndex, NumDims> > {
+  static const ptrdiff_t value = NumDims;
+};
+template <typename std::ptrdiff_t... Indices>
+struct array_size<const Sizes<Indices...> > {
+  static const std::ptrdiff_t value = Sizes<Indices...>::count;
+};
+template <typename std::ptrdiff_t... Indices>
+struct array_size<Sizes<Indices...> > {
+  static const std::ptrdiff_t value = Sizes<Indices...>::count;
+};
+template <std::ptrdiff_t n, typename std::ptrdiff_t... Indices>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<Indices...>&) {
+  return get<n, internal::numeric_list<std::ptrdiff_t, Indices...> >::value;
+}
+template <std::ptrdiff_t n>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::ptrdiff_t array_get(const Sizes<>&) {
+  eigen_assert(false && "should never be called");
+  return -1;
+}
+
+template <typename Dims1, typename Dims2, ptrdiff_t n, ptrdiff_t m>
+struct sizes_match_below_dim {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { return false; }
+};
+template <typename Dims1, typename Dims2, ptrdiff_t n>
+struct sizes_match_below_dim<Dims1, Dims2, n, n> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1& dims1, Dims2& dims2) {
+    return numext::equal_strict(array_get<n - 1>(dims1), array_get<n - 1>(dims2)) &&
+           sizes_match_below_dim<Dims1, Dims2, n - 1, n - 1>::run(dims1, dims2);
+  }
+};
+template <typename Dims1, typename Dims2>
+struct sizes_match_below_dim<Dims1, Dims2, 0, 0> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Dims1&, Dims2&) { return true; }
+};
+
+}  // end namespace internal
+
+template <typename Dims1, typename Dims2>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool dimensions_match(Dims1 dims1, Dims2 dims2) {
+  return internal::sizes_match_below_dim<Dims1, Dims2, internal::array_size<Dims1>::value,
+                                         internal::array_size<Dims2>::value>::run(dims1, dims2);
+}
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_DIMENSIONS_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
new file mode 100644
index 00000000..9bc0eac9
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -0,0 +1,189 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename XprType, template <class> class MakePointer_>
+struct traits<TensorEvalToOp<XprType, MakePointer_> > {
+  // Type promotion to handle the case where the types of the lhs and the rhs are different.
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename MakePointer_<Scalar>::Type PointerType;
+
+  enum { Flags = 0 };
+  template <class T>
+  struct MakePointer {
+    // Intermediate typedef to workaround MSVC issue.
+    typedef MakePointer_<T> MakePointerT;
+    typedef typename MakePointerT::Type Type;
+  };
+};
+
+template <typename XprType, template <class> class MakePointer_>
+struct eval<TensorEvalToOp<XprType, MakePointer_>, Eigen::Dense> {
+  typedef const TensorEvalToOp<XprType, MakePointer_>& type;
+};
+
+template <typename XprType, template <class> class MakePointer_>
+struct nested<TensorEvalToOp<XprType, MakePointer_>, 1, typename eval<TensorEvalToOp<XprType, MakePointer_> >::type> {
+  typedef TensorEvalToOp<XprType, MakePointer_> type;
+};
+
+}  // end namespace internal
+
+template <typename XprType, template <class> class MakePointer_>
+class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType, MakePointer_>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef std::remove_const_t<typename XprType::CoeffReturnType> CoeffReturnType;
+  typedef typename MakePointer_<CoeffReturnType>::Type PointerType;
+  typedef typename Eigen::internal::nested<TensorEvalToOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorEvalToOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorEvalToOp>::Index Index;
+
+  static constexpr int NumDims = Eigen::internal::traits<TensorEvalToOp>::NumDimensions;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvalToOp(PointerType buffer, const XprType& expr)
+      : m_xpr(expr), m_buffer(buffer) {}
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+  EIGEN_DEVICE_FUNC PointerType buffer() const { return m_buffer; }
+
+ protected:
+  typename XprType::Nested m_xpr;
+  PointerType m_buffer;
+};
+
+template <typename ArgType, typename Device, template <class> class MakePointer_>
+struct TensorEvaluator<const TensorEvalToOp<ArgType, MakePointer_>, Device> {
+  typedef TensorEvalToOp<ArgType, MakePointer_> XprType;
+  typedef typename ArgType::Scalar Scalar;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  typedef typename XprType::Index Index;
+  typedef std::remove_const_t<typename XprType::CoeffReturnType> CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  enum {
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = true,
+    PreferBlockAccess = false,
+    CoordAccess = false,  // to be implemented
+    RawAccess = true
+  };
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  static constexpr int NumDims = internal::traits<ArgType>::NumDimensions;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock ArgTensorBlock;
+
+  typedef internal::TensorBlockAssignment<CoeffReturnType, NumDims, typename ArgTensorBlock::XprType, Index>
+      TensorBlockAssignment;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_buffer(device.get(op.buffer())), m_expression(op.expression()) {}
+
+  EIGEN_STRONG_INLINE ~TensorEvaluator() {}
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType scalar) {
+    EIGEN_UNUSED_VARIABLE(scalar);
+    eigen_assert(scalar == NULL);
+    return m_impl.evalSubExprsIfNeeded(m_buffer);
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType scalar, EvalSubExprsCallback done) {
+    EIGEN_UNUSED_VARIABLE(scalar);
+    eigen_assert(scalar == NULL);
+    m_impl.evalSubExprsIfNeededAsync(m_buffer, std::move(done));
+  }
+#endif
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalScalar(Index i) const { m_buffer[i] = m_impl.coeff(i); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalPacket(Index i) const {
+    internal::pstoret<CoeffReturnType, PacketReturnType, Aligned>(
+        m_buffer + i, m_impl.template packet < TensorEvaluator<ArgType, Device>::IsAligned ? Aligned : Unaligned > (i));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return m_impl.getResourceRequirements();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalBlock(TensorBlockDesc& desc, TensorBlockScratch& scratch) {
+    // Add `m_buffer` as destination buffer to the block descriptor.
+    desc.template AddDestinationBuffer<Layout>(
+        /*dst_base=*/m_buffer + desc.offset(),
+        /*dst_strides=*/internal::strides<Layout>(m_impl.dimensions()));
+
+    ArgTensorBlock block = m_impl.block(desc, scratch, /*root_of_expr_ast=*/true);
+
+    // If block was evaluated into a destination buffer, there is no need to do
+    // an assignment.
+    if (block.kind() != internal::TensorBlockKind::kMaterializedInOutput) {
+      TensorBlockAssignment::Run(
+          TensorBlockAssignment::target(desc.dimensions(), internal::strides<Layout>(m_impl.dimensions()), m_buffer,
+                                        desc.offset()),
+          block.expr());
+    }
+    block.cleanup();
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_buffer[index]; }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // We assume that evalPacket or evalScalar is called to perform the
+    // assignment and account for the cost of the write here.
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, sizeof(CoeffReturnType), 0, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_buffer; }
+  ArgType expression() const { return m_expression; }
+
+ private:
+  TensorEvaluator<ArgType, Device> m_impl;
+  EvaluatorPointerType m_buffer;
+  const ArgType m_expression;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_EVAL_TO_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
new file mode 100644
index 00000000..5544953e
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -0,0 +1,858 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// Generic evaluator
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor evaluator class.
+ *
+ * These classes are responsible for the evaluation of the tensor expression.
+ *
+ * TODO: add support for more types of expressions, in particular expressions
+ * leading to lvalues (slicing, reshaping, etc...)
+ */
+template <typename Derived, typename Device>
+struct TensorEvaluator {
+  typedef typename Derived::Index Index;
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef typename Derived::Dimensions Dimensions;
+  typedef Derived XprType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename internal::traits<Derived>::template MakePointer<Scalar>::Type TensorPointerType;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  // NumDimensions is -1 for variable dim tensors
+  static constexpr int NumCoords =
+      internal::traits<Derived>::NumDimensions > 0 ? internal::traits<Derived>::NumDimensions : 0;
+  static constexpr int Layout = Derived::Layout;
+
+  enum {
+    IsAligned = Derived::IsAligned,
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess = internal::is_arithmetic<std::remove_const_t<Scalar>>::value,
+    PreferBlockAccess = false,
+    CoordAccess = NumCoords > 0,
+    RawAccess = true
+  };
+
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords, Layout, Index> TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const Derived& m, const Device& device)
+      : m_data(device.get((const_cast<TensorPointerType>(m.data())))), m_dims(m.dimensions()), m_device(device) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType dest) {
+    if (!NumTraits<std::remove_const_t<Scalar>>::RequireInitialization && dest) {
+      m_device.memcpy((void*)(m_device.get(dest)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
+      return false;
+    }
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType dest, EvalSubExprsCallback done) {
+    // TODO(ezhulenev): ThreadPoolDevice memcpy is blockign operation.
+    done(evalSubExprsIfNeeded(dest));
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    eigen_assert(m_data != NULL);
+    return m_data[index];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const {
+    eigen_assert(m_data != NULL);
+    return m_data[index];
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
+  }
+
+  // Return a packet starting at `index` where `umask` specifies which elements
+  // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for
+  // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding
+  // float element will be loaded, otherwise 0 will be loaded.
+  // Function has been templatized to enable Sfinae.
+  template <typename PacketReturnTypeT>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      std::enable_if_t<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>
+      partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const {
+    return internal::ploadu<PacketReturnTypeT>(m_data + index, umask);
+  }
+
+  template <int StoreMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const {
+    return internal::pstoret<Scalar, PacketReturnType, StoreMode>(m_data + index, x);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
+    eigen_assert(m_data != NULL);
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      return m_data[m_dims.IndexOfColMajor(coords)];
+    } else {
+      return m_data[m_dims.IndexOfRowMajor(coords)];
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(const array<DenseIndex, NumCoords>& coords) const {
+    eigen_assert(m_data != NULL);
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      return m_data[m_dims.IndexOfColMajor(coords)];
+    } else {
+      return m_data[m_dims.IndexOfRowMajor(coords)];
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketType<CoeffReturnType, Device>::size);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    eigen_assert(m_data != NULL);
+    return TensorBlock::materialize(m_data, m_dims, desc, scratch);
+  }
+
+  template <typename TensorBlock>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) {
+    eigen_assert(m_data != NULL);
+
+    typedef typename TensorBlock::XprType TensorBlockExpr;
+    typedef internal::TensorBlockAssignment<Scalar, NumCoords, TensorBlockExpr, Index> TensorBlockAssign;
+
+    TensorBlockAssign::Run(
+        TensorBlockAssign::target(desc.dimensions(), internal::strides<Layout>(m_dims), m_data, desc.offset()),
+        block.expr());
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
+
+ protected:
+  EvaluatorPointerType m_data;
+  Dimensions m_dims;
+  const Device EIGEN_DEVICE_REF m_device;
+};
+
+namespace internal {
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T loadConstant(const T* address) {
+  return *address;
+}
+// Use the texture cache on CUDA devices whenever possible
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float loadConstant(const float* address) {
+  return __ldg(address);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double loadConstant(const double* address) {
+  return __ldg(address);
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Eigen::half loadConstant(const Eigen::half* address) {
+  return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
+}
+#endif
+
+}  // namespace internal
+
+// Default evaluator for rvalues
+template <typename Derived, typename Device>
+struct TensorEvaluator<const Derived, Device> {
+  typedef typename Derived::Index Index;
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef typename Derived::Dimensions Dimensions;
+  typedef const Derived XprType;
+  typedef typename internal::traits<Derived>::template MakePointer<const Scalar>::Type TensorPointerType;
+  typedef StorageMemory<const Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+
+  // NumDimensions is -1 for variable dim tensors
+  static constexpr int NumCoords =
+      internal::traits<Derived>::NumDimensions > 0 ? internal::traits<Derived>::NumDimensions : 0;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static constexpr int Layout = Derived::Layout;
+
+  enum {
+    IsAligned = Derived::IsAligned,
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess = internal::is_arithmetic<ScalarNoConst>::value,
+    PreferBlockAccess = false,
+    CoordAccess = NumCoords > 0,
+    RawAccess = true
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumCoords, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumCoords, Layout, Index> TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC TensorEvaluator(const Derived& m, const Device& device)
+      : m_data(device.get(m.data())), m_dims(m.dimensions()), m_device(device) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dims; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    if (!NumTraits<std::remove_const_t<Scalar>>::RequireInitialization && data) {
+      m_device.memcpy((void*)(m_device.get(data)), m_device.get(m_data), m_dims.TotalSize() * sizeof(Scalar));
+      return false;
+    }
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType dest, EvalSubExprsCallback done) {
+    // TODO(ezhulenev): ThreadPoolDevice memcpy is a blockign operation.
+    done(evalSubExprsIfNeeded(dest));
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    eigen_assert(m_data != NULL);
+    return internal::loadConstant(m_data + index);
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    return internal::ploadt_ro<PacketReturnType, LoadMode>(m_data + index);
+  }
+
+  // Return a packet starting at `index` where `umask` specifies which elements
+  // have to be loaded. Type/size of mask depends on PacketReturnType, e.g. for
+  // Packet16f, `umask` is of type uint16_t and if a bit is 1, corresponding
+  // float element will be loaded, otherwise 0 will be loaded.
+  // Function has been templatized to enable Sfinae.
+  template <typename PacketReturnTypeT>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+      std::enable_if_t<internal::unpacket_traits<PacketReturnTypeT>::masked_load_available, PacketReturnTypeT>
+      partialPacket(Index index, typename internal::unpacket_traits<PacketReturnTypeT>::mask_t umask) const {
+    return internal::ploadu<PacketReturnTypeT>(m_data + index, umask);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<DenseIndex, NumCoords>& coords) const {
+    eigen_assert(m_data != NULL);
+    const Index index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? m_dims.IndexOfColMajor(coords)
+                                                                                 : m_dims.IndexOfRowMajor(coords);
+    return internal::loadConstant(m_data + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketType<CoeffReturnType, Device>::size);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    eigen_assert(m_data != NULL);
+    return TensorBlock::materialize(m_data, m_dims, desc, scratch);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
+
+ protected:
+  EvaluatorPointerType m_data;
+  Dimensions m_dims;
+  const Device EIGEN_DEVICE_REF m_device;
+};
+
+// -------------------- CwiseNullaryOp --------------------
+
+template <typename NullaryOp, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> {
+  typedef TensorCwiseNullaryOp<NullaryOp, ArgType> XprType;
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+      : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() {}
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = true,
+    PacketAccess = internal::functor_traits<NullaryOp>::PacketAccess
+#ifdef EIGEN_USE_SYCL
+                   && (PacketType<CoeffReturnType, Device>::size > 1)
+#endif
+        ,
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) {
+    done(true);
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {}
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_wrapper(m_functor, index); }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    return m_wrapper.template packetOp<PacketReturnType, Index>(m_functor, index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketType<CoeffReturnType, Device>::size);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ private:
+  const NullaryOp m_functor;
+  TensorEvaluator<ArgType, Device> m_argImpl;
+  const internal::nullary_wrapper<CoeffReturnType, NullaryOp> m_wrapper;
+};
+
+// -------------------- CwiseUnaryOp --------------------
+
+template <typename UnaryOp, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorCwiseUnaryOp<UnaryOp, ArgType>, Device> {
+  typedef TensorCwiseUnaryOp<UnaryOp, ArgType> XprType;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess =
+        int(TensorEvaluator<ArgType, Device>::PacketAccess) & int(internal::functor_traits<UnaryOp>::PacketAccess),
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+      : m_device(device), m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) {}
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  static constexpr int NumDims = internal::array_size<Dimensions>::value;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock ArgTensorBlock;
+
+  typedef internal::TensorCwiseUnaryBlock<UnaryOp, ArgTensorBlock> TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_argImpl.dimensions(); }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+    m_argImpl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_argImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() { m_argImpl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { return m_functor(m_argImpl.coeff(index)); }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    return m_functor.packetOp(m_argImpl.template packet<LoadMode>(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
+    return m_argImpl.costPerCoeff(vectorized) + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    static const double functor_cost = internal::functor_traits<UnaryOp>::Cost;
+    return m_argImpl.getResourceRequirements().addCostPerCoeff({0, 0, functor_cost / PacketSize});
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    return TensorBlock(m_argImpl.block(desc, scratch), m_functor);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ private:
+  const Device EIGEN_DEVICE_REF m_device;
+  const UnaryOp m_functor;
+  TensorEvaluator<ArgType, Device> m_argImpl;
+};
+
+// -------------------- CwiseBinaryOp --------------------
+
+template <typename BinaryOp, typename LeftArgType, typename RightArgType, typename Device>
+struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device> {
+  typedef TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType> XprType;
+
+  static constexpr int Layout = TensorEvaluator<LeftArgType, Device>::Layout;
+  enum {
+    IsAligned =
+        int(TensorEvaluator<LeftArgType, Device>::IsAligned) & int(TensorEvaluator<RightArgType, Device>::IsAligned),
+    PacketAccess = int(TensorEvaluator<LeftArgType, Device>::PacketAccess) &
+                   int(TensorEvaluator<RightArgType, Device>::PacketAccess) &
+                   int(internal::functor_traits<BinaryOp>::PacketAccess),
+    BlockAccess = int(TensorEvaluator<LeftArgType, Device>::BlockAccess) &
+                  int(TensorEvaluator<RightArgType, Device>::BlockAccess),
+    PreferBlockAccess = int(TensorEvaluator<LeftArgType, Device>::PreferBlockAccess) |
+                        int(TensorEvaluator<RightArgType, Device>::PreferBlockAccess),
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+      : m_device(device),
+        m_functor(op.functor()),
+        m_leftImpl(op.lhsExpression(), device),
+        m_rightImpl(op.rhsExpression(), device) {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) ==
+                             static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) ||
+                         internal::traits<XprType>::NumDimensions <= 1),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+    eigen_assert(dimensions_match(m_leftImpl.dimensions(), m_rightImpl.dimensions()));
+  }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename TensorEvaluator<LeftArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<LeftArgType, Device>::Dimensions>::value;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const LeftArgType, Device>::TensorBlock LeftTensorBlock;
+  typedef typename TensorEvaluator<const RightArgType, Device>::TensorBlock RightTensorBlock;
+
+  typedef internal::TensorCwiseBinaryBlock<BinaryOp, LeftTensorBlock, RightTensorBlock> TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const {
+    // TODO: use right impl instead if right impl dimensions are known at compile time.
+    return m_leftImpl.dimensions();
+  }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+    m_leftImpl.evalSubExprsIfNeeded(NULL);
+    m_rightImpl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) {
+    // TODO(ezhulenev): Evaluate two expression in parallel?
+    m_leftImpl.evalSubExprsIfNeededAsync(
+        nullptr, [this, done](bool) { m_rightImpl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); }); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_leftImpl.cleanup();
+    m_rightImpl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const {
+    return m_functor(m_leftImpl.coeff(index), m_rightImpl.coeff(index));
+  }
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    return m_functor.packetOp(m_leftImpl.template packet<LoadMode>(index),
+                              m_rightImpl.template packet<LoadMode>(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
+    return m_leftImpl.costPerCoeff(vectorized) + m_rightImpl.costPerCoeff(vectorized) +
+           TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    static const double functor_cost = internal::functor_traits<BinaryOp>::Cost;
+    return internal::TensorBlockResourceRequirements::merge(m_leftImpl.getResourceRequirements(),
+                                                            m_rightImpl.getResourceRequirements())
+        .addCostPerCoeff({0, 0, functor_cost / PacketSize});
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    desc.DropDestinationBuffer();
+    return TensorBlock(m_leftImpl.block(desc, scratch), m_rightImpl.block(desc, scratch), m_functor);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ private:
+  const Device EIGEN_DEVICE_REF m_device;
+  const BinaryOp m_functor;
+  TensorEvaluator<LeftArgType, Device> m_leftImpl;
+  TensorEvaluator<RightArgType, Device> m_rightImpl;
+};
+
+// -------------------- CwiseTernaryOp --------------------
+
+template <typename TernaryOp, typename Arg1Type, typename Arg2Type, typename Arg3Type, typename Device>
+struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type>, Device> {
+  typedef TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type> XprType;
+
+  static constexpr int Layout = TensorEvaluator<Arg1Type, Device>::Layout;
+  enum {
+    IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned &
+                TensorEvaluator<Arg3Type, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess && TensorEvaluator<Arg2Type, Device>::PacketAccess &&
+                   TensorEvaluator<Arg3Type, Device>::PacketAccess && internal::functor_traits<TernaryOp>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<Arg1Type, Device>::PreferBlockAccess ||
+                        TensorEvaluator<Arg2Type, Device>::PreferBlockAccess ||
+                        TensorEvaluator<Arg3Type, Device>::PreferBlockAccess,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+      : m_functor(op.functor()),
+        m_arg1Impl(op.arg1Expression(), device),
+        m_arg2Impl(op.arg2Expression(), device),
+        m_arg3Impl(op.arg3Expression(), device) {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<Arg1Type, Device>::Layout) ==
+                             static_cast<int>(TensorEvaluator<Arg3Type, Device>::Layout) ||
+                         internal::traits<XprType>::NumDimensions <= 1),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+                                           typename internal::traits<Arg2Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+                                           typename internal::traits<Arg3Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+                                           typename internal::traits<Arg2Type>::Index>::value),
+                        STORAGE_INDEX_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+                                           typename internal::traits<Arg3Type>::Index>::value),
+                        STORAGE_INDEX_MUST_MATCH)
+
+    eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) &&
+                 dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions()));
+  }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const {
+    // TODO: use arg2 or arg3 dimensions if they are known at compile time.
+    return m_arg1Impl.dimensions();
+  }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+    m_arg1Impl.evalSubExprsIfNeeded(NULL);
+    m_arg2Impl.evalSubExprsIfNeeded(NULL);
+    m_arg3Impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_arg1Impl.cleanup();
+    m_arg2Impl.cleanup();
+    m_arg3Impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const {
+    return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index));
+  }
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    return m_functor.packetOp(m_arg1Impl.template packet<LoadMode>(index), m_arg2Impl.template packet<LoadMode>(index),
+                              m_arg3Impl.template packet<LoadMode>(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    const double functor_cost = internal::functor_traits<TernaryOp>::Cost;
+    return m_arg1Impl.costPerCoeff(vectorized) + m_arg2Impl.costPerCoeff(vectorized) +
+           m_arg3Impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ private:
+  const TernaryOp m_functor;
+  TensorEvaluator<Arg1Type, Device> m_arg1Impl;
+  TensorEvaluator<Arg2Type, Device> m_arg2Impl;
+  TensorEvaluator<Arg3Type, Device> m_arg3Impl;
+};
+
+// -------------------- SelectOp --------------------
+
+template <typename IfArgType, typename ThenArgType, typename ElseArgType, typename Device>
+struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device> {
+  typedef TensorSelectOp<IfArgType, ThenArgType, ElseArgType> XprType;
+  typedef typename XprType::Scalar Scalar;
+
+  using TernarySelectOp = internal::scalar_boolean_select_op<typename internal::traits<ThenArgType>::Scalar,
+                                                             typename internal::traits<ElseArgType>::Scalar,
+                                                             typename internal::traits<IfArgType>::Scalar>;
+  static constexpr bool TernaryPacketAccess =
+      TensorEvaluator<ThenArgType, Device>::PacketAccess && TensorEvaluator<ElseArgType, Device>::PacketAccess &&
+      TensorEvaluator<IfArgType, Device>::PacketAccess && internal::functor_traits<TernarySelectOp>::PacketAccess;
+
+  static constexpr int Layout = TensorEvaluator<IfArgType, Device>::Layout;
+  enum {
+    IsAligned = TensorEvaluator<ThenArgType, Device>::IsAligned & TensorEvaluator<ElseArgType, Device>::IsAligned,
+    PacketAccess = (TensorEvaluator<ThenArgType, Device>::PacketAccess &&
+                    TensorEvaluator<ElseArgType, Device>::PacketAccess && PacketType<Scalar, Device>::HasBlend) ||
+                   TernaryPacketAccess,
+    BlockAccess = TensorEvaluator<IfArgType, Device>::BlockAccess &&
+                  TensorEvaluator<ThenArgType, Device>::BlockAccess &&
+                  TensorEvaluator<ElseArgType, Device>::BlockAccess,
+    PreferBlockAccess = TensorEvaluator<IfArgType, Device>::PreferBlockAccess ||
+                        TensorEvaluator<ThenArgType, Device>::PreferBlockAccess ||
+                        TensorEvaluator<ElseArgType, Device>::PreferBlockAccess,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+      : m_condImpl(op.ifExpression(), device),
+        m_thenImpl(op.thenExpression(), device),
+        m_elseImpl(op.elseExpression(), device) {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) ==
+                         static_cast<int>(TensorEvaluator<ThenArgType, Device>::Layout)),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<IfArgType, Device>::Layout) ==
+                         static_cast<int>(TensorEvaluator<ElseArgType, Device>::Layout)),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+    eigen_assert(dimensions_match(m_condImpl.dimensions(), m_thenImpl.dimensions()));
+    eigen_assert(dimensions_match(m_thenImpl.dimensions(), m_elseImpl.dimensions()));
+  }
+
+  typedef typename XprType::Index Index;
+  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename TensorEvaluator<IfArgType, Device>::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int NumDims = internal::array_size<Dimensions>::value;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const IfArgType, Device>::TensorBlock IfArgTensorBlock;
+  typedef typename TensorEvaluator<const ThenArgType, Device>::TensorBlock ThenArgTensorBlock;
+  typedef typename TensorEvaluator<const ElseArgType, Device>::TensorBlock ElseArgTensorBlock;
+
+  struct TensorSelectOpBlockFactory {
+    template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
+    struct XprType {
+      typedef TensorSelectOp<const IfArgXprType, const ThenArgXprType, const ElseArgXprType> type;
+    };
+
+    template <typename IfArgXprType, typename ThenArgXprType, typename ElseArgXprType>
+    typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type expr(const IfArgXprType& if_expr,
+                                                                              const ThenArgXprType& then_expr,
+                                                                              const ElseArgXprType& else_expr) const {
+      return typename XprType<IfArgXprType, ThenArgXprType, ElseArgXprType>::type(if_expr, then_expr, else_expr);
+    }
+  };
+
+  typedef internal::TensorTernaryExprBlock<TensorSelectOpBlockFactory, IfArgTensorBlock, ThenArgTensorBlock,
+                                           ElseArgTensorBlock>
+      TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const {
+    // TODO: use then or else impl instead if they happen to be known at compile time.
+    return m_condImpl.dimensions();
+  }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+    m_condImpl.evalSubExprsIfNeeded(NULL);
+    m_thenImpl.evalSubExprsIfNeeded(NULL);
+    m_elseImpl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_condImpl.evalSubExprsIfNeeded(nullptr, [this, done](bool) {
+      m_thenImpl.evalSubExprsIfNeeded(
+          nullptr, [this, done](bool) { m_elseImpl.evalSubExprsIfNeeded(nullptr, [done](bool) { done(true); }); });
+    });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_condImpl.cleanup();
+    m_thenImpl.cleanup();
+    m_elseImpl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const {
+    return m_condImpl.coeff(index) ? m_thenImpl.coeff(index) : m_elseImpl.coeff(index);
+  }
+
+  template <int LoadMode, bool UseTernary = TernaryPacketAccess, std::enable_if_t<!UseTernary, bool> = true>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+    internal::Selector<PacketSize> select;
+    EIGEN_UNROLL_LOOP
+    for (Index i = 0; i < PacketSize; ++i) {
+      select.select[i] = m_condImpl.coeff(index + i);
+    }
+    return internal::pblend(select, m_thenImpl.template packet<LoadMode>(index),
+                            m_elseImpl.template packet<LoadMode>(index));
+  }
+
+  template <int LoadMode, bool UseTernary = TernaryPacketAccess, std::enable_if_t<UseTernary, bool> = true>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+    return TernarySelectOp().template packetOp<PacketReturnType>(m_thenImpl.template packet<LoadMode>(index),
+                                                                 m_elseImpl.template packet<LoadMode>(index),
+                                                                 m_condImpl.template packet<LoadMode>(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return m_condImpl.costPerCoeff(vectorized) +
+           m_thenImpl.costPerCoeff(vectorized).cwiseMax(m_elseImpl.costPerCoeff(vectorized));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    auto then_req = m_thenImpl.getResourceRequirements();
+    auto else_req = m_elseImpl.getResourceRequirements();
+
+    auto merged_req = internal::TensorBlockResourceRequirements::merge(then_req, else_req);
+    merged_req.cost_per_coeff = then_req.cost_per_coeff.cwiseMax(else_req.cost_per_coeff);
+
+    return internal::TensorBlockResourceRequirements::merge(m_condImpl.getResourceRequirements(), merged_req);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    // It's unsafe to pass destination buffer to underlying expressions, because
+    // output might be aliased with one of the inputs.
+    desc.DropDestinationBuffer();
+
+    return TensorBlock(m_condImpl.block(desc, scratch), m_thenImpl.block(desc, scratch),
+                       m_elseImpl.block(desc, scratch), TensorSelectOpBlockFactory());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
+
+#ifdef EIGEN_USE_SYCL
+  // binding placeholder accessors to a command group handler for SYCL
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler& cgh) const {
+    m_condImpl.bind(cgh);
+    m_thenImpl.bind(cgh);
+    m_elseImpl.bind(cgh);
+  }
+#endif
+ private:
+  TensorEvaluator<IfArgType, Device> m_condImpl;
+  TensorEvaluator<ThenArgType, Device> m_thenImpl;
+  TensorEvaluator<ElseArgType, Device> m_elseImpl;
+};
+
+}  // end namespace Eigen
+
+#if defined(EIGEN_USE_SYCL) && defined(SYCL_COMPILER_IS_DPCPP)
+template <typename Derived, typename Device>
+struct cl::sycl::is_device_copyable<
+    Eigen::TensorEvaluator<Derived, Device>,
+    std::enable_if_t<!std::is_trivially_copyable<Eigen::TensorEvaluator<Derived, Device>>::value>> : std::true_type {};
+#endif
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_EVALUATOR_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
new file mode 100644
index 00000000..da332107
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -0,0 +1,670 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/**
+ * Evaluating TensorBroadcastingOp via coefficient of packet path is extremely
+ * expensive. If expression has at least one broadcast op in it, and it supports
+ * block based evaluation, we always prefer it, even for the small tensors. For
+ * all other tileable ops, block evaluation overhead for small tensors (fits
+ * into L1) is too large, and we fallback on vectorized evaluation.
+ */
+
+// TODO(ezhulenev): Add specializations for all other types of Tensor ops.
+
+template <typename Expression>
+struct ExpressionHasTensorBroadcastingOp {
+  enum { value = false };
+};
+
+template <typename LhsXprType, typename RhsXprType>
+struct ExpressionHasTensorBroadcastingOp<const TensorAssignOp<LhsXprType, RhsXprType> > {
+  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
+};
+
+template <typename UnaryOp, typename XprType>
+struct ExpressionHasTensorBroadcastingOp<const TensorCwiseUnaryOp<UnaryOp, XprType> > {
+  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
+};
+
+template <typename BinaryOp, typename LhsXprType, typename RhsXprType>
+struct ExpressionHasTensorBroadcastingOp<const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
+  enum {
+    value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value || ExpressionHasTensorBroadcastingOp<RhsXprType>::value
+  };
+};
+
+template <typename Broadcast, typename XprType>
+struct ExpressionHasTensorBroadcastingOp<const TensorBroadcastingOp<Broadcast, XprType> > {
+  enum { value = true };
+};
+
+// -------------------------------------------------------------------------- //
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The tensor executor class.
+ *
+ * This class is responsible for launch the evaluation of the expression on
+ * the specified computing device.
+ *
+ * Default strategy: the expression is evaluated sequentially with a single cpu
+ * thread, without vectorization and block evaluation.
+ *
+ * @tparam Vectorizable can use packet math (SSE/AVX/etc... registers and
+ *                      instructions)
+ * @tparam Tiling       can use block based tensor evaluation
+ *                      (see TensorBlock.h)
+ */
+template <typename Expression, typename Device, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor {
+ public:
+  typedef typename Expression::Index StorageIndex;
+
+  // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
+  // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
+  // violation. If this template is instantiated with a non-default device, it
+  // means that this header file was included without defining
+  // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
+  static_assert(std::is_same<Device, DefaultDevice>::value,
+                "Default executor instantiated with non-default device. "
+                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
+                "EIGEN_USE_SYCL before including Eigen headers.");
+
+  static EIGEN_STRONG_INLINE void run(const Expression& expr, const Device& device = DefaultDevice()) {
+    TensorEvaluator<Expression, Device> evaluator(expr, device);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
+      for (StorageIndex i = 0; i < size; ++i) {
+        evaluator.evalScalar(i);
+      }
+    }
+    evaluator.cleanup();
+  }
+};
+
+/**
+ * Default async execution strategy is not implemented. Currently it's only
+ * available for ThreadPoolDevice (see definition below).
+ */
+template <typename Expression, typename Device, typename DoneCallback, bool Vectorizable, TiledEvaluation Tiling>
+class TensorAsyncExecutor {};
+
+/**
+ * Process all the data with a single cpu thread, using vectorized instructions.
+ */
+template <typename Expression>
+class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
+                     /*Tiling=*/TiledEvaluation::Off> {
+ public:
+  typedef typename Expression::Index StorageIndex;
+
+  static EIGEN_STRONG_INLINE void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
+    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
+      const int PacketSize =
+          unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
+
+      // Give compiler a strong possibility to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive compiler should not
+      // unroll the loop at the expense of inlining.
+      const StorageIndex UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
+      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
+        for (StorageIndex j = 0; j < 4; j++) {
+          evaluator.evalPacket(i + j * PacketSize);
+        }
+      }
+      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
+      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
+        evaluator.evalPacket(i);
+      }
+      for (StorageIndex i = VectorizedSize; i < size; ++i) {
+        evaluator.evalScalar(i);
+      }
+    }
+    evaluator.cleanup();
+  }
+};
+
+/**
+ * Process all the data with a single cpu thread, using blocks of data. By
+ * sizing a block to fit L1 cache we get better cache performance.
+ */
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, DefaultDevice, Vectorizable,
+                     /*Tiling=*/TiledEvaluation::On> {
+ public:
+  typedef typename traits<Expression>::Scalar Scalar;
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+
+  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
+  typedef typename traits<Expression>::Index StorageIndex;
+
+  static constexpr int NumDims = traits<Expression>::NumDimensions;
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(const Expression& expr,
+                                                        const DefaultDevice& device = DefaultDevice()) {
+    typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex> TensorBlockMapper;
+
+    typedef internal::TensorBlockDescriptor<NumDims, StorageIndex> TensorBlockDesc;
+    typedef internal::TensorBlockScratchAllocator<DefaultDevice> TensorBlockScratch;
+
+    Evaluator evaluator(expr, device);
+
+    // TODO(ezhulenev): Do not use tiling for small tensors?
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+
+    if (needs_assign) {
+      // Query expression tree for desired block size/shape.
+      const TensorBlockResourceRequirements requirements = evaluator.getResourceRequirements();
+
+      const TensorBlockMapper block_mapper(typename TensorBlockDesc::Dimensions(evaluator.dimensions()), requirements);
+
+      // Share scratch memory allocator between all blocks.
+      TensorBlockScratch scratch(device);
+
+      const StorageIndex total_block_count = block_mapper.blockCount();
+      for (StorageIndex i = 0; i < total_block_count; ++i) {
+        TensorBlockDesc desc = block_mapper.blockDescriptor(i);
+        evaluator.evalBlock(desc, scratch);
+        scratch.reset();
+      }
+    }
+    evaluator.cleanup();
+  }
+};
+
+/**
+ * Multicore strategy: the index space is partitioned and each partition is
+ * executed on a single core.
+ *
+ * (1) TensorExecutor will submit work to the ThreadPoolDevice managed thread
+ *     pool, and will block the caller thread until all tasks are finished.
+ *
+ * (2) TensorAsyncExecutor is a non-blocking version, that will submit work to
+ *     the ThreadPoolDevice managed thread pool, and will return immediately.
+ *     It will call 'done' callback after all tasks are finished.
+ */
+#ifdef EIGEN_USE_THREADS
+
+template <typename TensorBlockMapper>
+struct TensorExecutorTilingContext {
+  TensorExecutorTilingContext() = default;
+  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper, const TensorOpCost& b_cost, size_t b_aligned_size)
+      : block_mapper(b_mapper), cost(b_cost), aligned_blocksize(b_aligned_size) {}
+
+  TensorBlockMapper block_mapper;  // navigate through blocks
+  TensorOpCost cost;               // cost of computing a single block
+  size_t aligned_blocksize;        // block size after memory alignment
+};
+
+// Computes a block evaluation parameters, and allocates temporary memory buffer
+// for blocks. See TensorExecutor/TensorAsyncExecutor (Tiling=On) below.
+template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
+TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(const Evaluator& evaluator) {
+  // Query expression tree for desired block size/shape.
+  TensorBlockResourceRequirements requirements = evaluator.getResourceRequirements();
+
+  // Update target block size based on cost model.
+  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(1, requirements.cost_per_coeff);
+  requirements.size = static_cast<size_t>(1.0 / taskSize);
+
+  TensorBlockMapper block_mapper(typename TensorBlockMapper::Dimensions(evaluator.dimensions()), requirements);
+
+  size_t block_size = block_mapper.blockTotalSize();
+  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
+  const size_t aligned_blocksize =
+      align * numext::div_ceil<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);
+
+  return {block_mapper, requirements.cost_per_coeff * block_size, aligned_blocksize};
+}
+
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
+struct EvalRange {
+  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, const StorageIndex lastIdx) {
+    Evaluator evaluator = *evaluator_in;
+    eigen_assert(lastIdx >= firstIdx);
+    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
+      evaluator.evalScalar(i);
+    }
+  }
+
+  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
+};
+
+template <typename Evaluator, typename StorageIndex>
+struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
+  static constexpr int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+
+  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx, const StorageIndex lastIdx) {
+    Evaluator evaluator = *evaluator_in;
+    eigen_assert(lastIdx >= firstIdx);
+    StorageIndex i = firstIdx;
+    if (lastIdx - firstIdx >= PacketSize) {
+      eigen_assert(firstIdx % PacketSize == 0);
+      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
+      // Give compiler a strong possibility to unroll the loop. But don't insist
+      // on unrolling, because if the function is expensive compiler should not
+      // unroll the loop at the expense of inlining.
+      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
+        for (StorageIndex j = 0; j < 4; j++) {
+          evaluator.evalPacket(i + j * PacketSize);
+        }
+      }
+      last_chunk_offset = lastIdx - PacketSize;
+      for (; i <= last_chunk_offset; i += PacketSize) {
+        evaluator.evalPacket(i);
+      }
+    }
+    for (; i < lastIdx; ++i) {
+      evaluator.evalScalar(i);
+    }
+  }
+
+  static StorageIndex alignBlockSize(StorageIndex size) {
+    // Align block size to packet size and account for unrolling in run above.
+    if (size >= 16 * PacketSize) {
+      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
+    }
+    // Aligning to 4 * PacketSize would increase block size by more than 25%.
+    return (size + PacketSize - 1) & ~(PacketSize - 1);
+  }
+};
+
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
+ public:
+  typedef typename Expression::Index StorageIndex;
+
+  static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) {
+    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
+
+    Evaluator evaluator(expr, device);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+    if (needs_assign) {
+      const StorageIndex size = array_prod(evaluator.dimensions());
+      device.parallelFor(
+          size, evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize,
+          [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) { EvalRange::run(&evaluator, firstIdx, lastIdx); });
+    }
+    evaluator.cleanup();
+  }
+};
+
+template <typename Expression, bool Vectorizable>
+class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
+                     /*Tiling=*/TiledEvaluation::On> {
+ public:
+  typedef typename traits<Expression>::Index IndexType;
+  typedef typename traits<Expression>::Scalar Scalar;
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+
+  static constexpr int NumDims = traits<Expression>::NumDimensions;
+
+  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
+  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
+
+  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice> TensorBlockScratch;
+
+  static EIGEN_STRONG_INLINE void run(const Expression& expr, const ThreadPoolDevice& device) {
+    Evaluator evaluator(expr, device);
+
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+    if (needs_assign) {
+      const TilingContext tiling =
+          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper, Vectorizable>(evaluator);
+
+      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx, IndexType lastBlockIdx) {
+        TensorBlockScratch scratch(device);
+
+        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) {
+          TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
+          evaluator.evalBlock(desc, scratch);
+          scratch.reset();
+        }
+      };
+
+      // Evaluate small expressions directly as a single block.
+      if (tiling.block_mapper.blockCount() == 1) {
+        TensorBlockScratch scratch(device);
+        TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
+        evaluator.evalBlock(desc, scratch);
+      } else {
+        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost, std::move(eval_block));
+      }
+    }
+    evaluator.cleanup();
+  }
+};
+
+template <typename Expression, typename DoneCallback, bool Vectorizable, TiledEvaluation Tiling>
+class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback, Vectorizable, Tiling> {
+ public:
+  typedef typename Expression::Index StorageIndex;
+  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+
+  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, const ThreadPoolDevice& device, DoneCallback done) {
+    TensorAsyncExecutorContext* const ctx = new TensorAsyncExecutorContext(expr, device, std::move(done));
+
+    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
+      if (!need_assign) {
+        delete ctx;
+        return;
+      }
+
+      typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
+      const StorageIndex size = array_prod(ctx->evaluator.dimensions());
+      device.parallelForAsync(
+          size, ctx->evaluator.costPerCoeff(Vectorizable), EvalRange::alignBlockSize,
+          [ctx](StorageIndex firstIdx, StorageIndex lastIdx) { EvalRange::run(&ctx->evaluator, firstIdx, lastIdx); },
+          [ctx]() { delete ctx; });
+    };
+
+    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
+  }
+
+ private:
+  struct TensorAsyncExecutorContext {
+    TensorAsyncExecutorContext(const Expression& expr, const ThreadPoolDevice& thread_pool, DoneCallback done)
+        : evaluator(expr, thread_pool), on_done(std::move(done)) {}
+
+    ~TensorAsyncExecutorContext() {
+      evaluator.cleanup();
+      on_done();
+    }
+
+    Evaluator evaluator;
+
+   private:
+    DoneCallback on_done;
+  };
+};
+
+template <typename Expression, typename DoneCallback, bool Vectorizable>
+class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback, Vectorizable, /*Tileable*/ TiledEvaluation::On> {
+ public:
+  typedef typename traits<Expression>::Index IndexType;
+  typedef typename traits<Expression>::Scalar Scalar;
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+
+  static constexpr int NumDims = traits<Expression>::NumDimensions;
+
+  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
+  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
+  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;
+
+  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice> TensorBlockScratch;
+
+  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr, const ThreadPoolDevice& device, DoneCallback done) {
+    TensorAsyncExecutorContext* const ctx = new TensorAsyncExecutorContext(expr, device, std::move(done));
+
+    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
+      if (!need_assign) {
+        delete ctx;
+        return;
+      }
+
+      ctx->tiling = internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);
+
+      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
+        TensorBlockScratch scratch(ctx->device);
+
+        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx; ++block_idx) {
+          TensorBlockDesc desc = ctx->tiling.block_mapper.blockDescriptor(block_idx);
+          ctx->evaluator.evalBlock(desc, scratch);
+          scratch.reset();
+        }
+      };
+
+      // Evaluate small expressions directly as a single block.
+      if (ctx->tiling.block_mapper.blockCount() == 1) {
+        TensorBlockScratch scratch(ctx->device);
+        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
+        ctx->evaluator.evalBlock(desc, scratch);
+        delete ctx;
+      } else {
+        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(), ctx->tiling.cost, eval_block,
+                                     [ctx]() { delete ctx; });
+      }
+    };
+
+    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
+  }
+
+ private:
+  struct TensorAsyncExecutorContext {
+    TensorAsyncExecutorContext(const Expression& expr, const ThreadPoolDevice& thread_pool, DoneCallback done)
+        : device(thread_pool), evaluator(expr, thread_pool), on_done(std::move(done)) {}
+
+    ~TensorAsyncExecutorContext() {
+      evaluator.cleanup();
+      on_done();
+    }
+
+    const ThreadPoolDevice& device;
+    Evaluator evaluator;
+    TilingContext tiling;
+
+   private:
+    DoneCallback on_done;
+  };
+};
+
+#endif  // EIGEN_USE_THREADS
+
+// GPU: the evaluation of the expression is offloaded to a GPU.
+#if defined(EIGEN_USE_GPU)
+
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
+ public:
+  typedef typename Expression::Index StorageIndex;
+  static void run(const Expression& expr, const GpuDevice& device);
+};
+
+#if defined(EIGEN_GPUCC)
+// Returns 1 if lhs + rhs would overflow, -1 if it would underflow, otherwise 0.
+template <typename Index>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int sum_will_overflow(Index lhs, Index rhs) {
+  const Index highest = NumTraits<Index>::highest();
+  const Index lowest = NumTraits<Index>::lowest();
+  if (lhs > 0 && rhs > 0) {
+    return lhs > highest - rhs ? 1 : 0;
+  } else if (lhs < 0 && rhs < 0) {
+    return lhs < lowest - rhs ? -1 : 0;
+  } else {
+    return 0;
+  }
+}
+
+// Returns lhs + rhs, saturating to the highest/lowest representable value on
+// overflow/underflow respectively.
+template <typename Index>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index saturate_add(Index lhs, Index rhs) {
+  const Index highest = NumTraits<Index>::highest();
+  const Index lowest = NumTraits<Index>::lowest();
+  int overflow = sum_will_overflow(lhs, rhs);
+  return overflow == 1 ? highest : overflow == -1 ? lowest : lhs + rhs;
+}
+
+// A functor that adds step_size to a given index, saturating to avoid
+// overflow/underflow. If overflow/underflow is not possible, regular addition
+// is used (for efficiency).
+template <typename Index>
+struct SafeStep {
+  // lastIdx is one past the end of the possible indexes.
+  // step_size is the value that will be added to the given index when the
+  // functor is called.
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SafeStep(Index lastIdx, Index step_size)
+      : can_overflow_(sum_will_overflow(lastIdx, step_size)), step_size_(step_size) {}
+
+  // Adds step_size to index, saturating on overflow (if overflow is possible).
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index operator()(Index index) const {
+    return can_overflow_ ? saturate_add(index, step_size_) : index + step_size_;
+  }
+
+ private:
+  const bool can_overflow_;
+  const Index step_size_;
+};
+
+template <typename Evaluator, typename StorageIndex, bool Vectorizable>
+struct EigenMetaKernelEval {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
+                                                        StorageIndex step_size) {
+    SafeStep<StorageIndex> safe_step(lastIdx, step_size);
+    for (StorageIndex i = firstIdx; i < lastIdx; i = safe_step(i)) {
+      eval.evalScalar(i);
+    }
+  }
+};
+
+template <typename Evaluator, typename StorageIndex>
+struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx,
+                                                        StorageIndex step_size) {
+    const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
+    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
+    const StorageIndex vectorized_step_size = step_size * PacketSize;
+
+    SafeStep<StorageIndex> safe_vectorized_step(vectorized_size, vectorized_step_size);
+    // Use the vector path
+    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size; i = safe_vectorized_step(i)) {
+      eval.evalPacket(i);
+    }
+    SafeStep<StorageIndex> safe_step(lastIdx, step_size);
+    for (StorageIndex i = saturate_add(vectorized_size, firstIdx); i < lastIdx; i = safe_step(i)) {
+      eval.evalScalar(i);
+    }
+  }
+};
+
+template <typename Evaluator, typename StorageIndex>
+__global__ void __launch_bounds__(1024) EigenMetaKernel(Evaluator eval, StorageIndex size) {
+  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
+  const StorageIndex step_size = blockDim.x * gridDim.x;
+
+  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
+  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
+}
+
+/*static*/
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(const Expression& expr,
+                                                                                          const GpuDevice& device) {
+  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
+  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
+  if (needs_assign) {
+    const int block_size = device.maxGpuThreadsPerBlock();
+    const int max_blocks = static_cast<int>(
+        numext::mini<int64_t>(device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor(),
+                              NumTraits<StorageIndex>::highest()) /
+        block_size);
+    const StorageIndex size = array_prod(evaluator.dimensions());
+    // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
+    const int num_blocks = numext::maxi<int>(
+        numext::mini<int>(max_blocks, static_cast<int>(numext::div_ceil<StorageIndex>(size, block_size))), 1);
+
+    LAUNCH_GPU_KERNEL((EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>), num_blocks, block_size,
+                      0, device, evaluator, size);
+  }
+  evaluator.cleanup();
+}
+
+#endif  // EIGEN_GPUCC
+#endif  // EIGEN_USE_GPU
+
+// SYCL Executor policy
+#ifdef EIGEN_USE_SYCL
+
+template <typename Evaluator>
+struct ExecExprFunctorKernel {
+  typedef typename Evaluator::Index Index;
+  Evaluator evaluator;
+  const Index range;
+  template <typename Scratch>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(const Scratch, Evaluator evaluator_, const Index range_)
+      : evaluator(evaluator_), range(range_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(cl::sycl::nd_item<1> itemID) const { compute(itemID); }
+  template <bool is_vec = Evaluator::PacketAccess>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<!is_vec> compute(const cl::sycl::nd_item<1>& itemID) const {
+    Index gId = static_cast<Index>(itemID.get_global_linear_id());
+    Index total_threads = itemID.get_global_range(0);
+
+    for (Index i = gId; i < range; i += total_threads) {
+      evaluator.evalScalar(i);
+    }
+  }
+  template <bool is_vec = Evaluator::PacketAccess>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<is_vec> compute(const cl::sycl::nd_item<1>& itemID) const {
+    const Index vectorizedRange = (range / Evaluator::PacketSize) * Evaluator::PacketSize;
+    Index gId = static_cast<Index>(itemID.get_global_linear_id());
+    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
+    const Index start = Evaluator::PacketSize * gId;
+    for (Index i = start; i < vectorizedRange; i += step) {
+      evaluator.evalPacket(i);
+    }
+    gId += vectorizedRange;
+    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
+      evaluator.evalScalar(i);
+    }
+  }
+};
+
+template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
+class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
+ public:
+  typedef typename Expression::Index Index;
+  static EIGEN_STRONG_INLINE void run(const Expression& expr, const Eigen::SyclDevice& dev) {
+    typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
+    Evaluator evaluator(expr, dev);
+    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
+    if (needs_assign) {
+      Index range, GRange, tileSize;
+      Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
+      total_size = (total_size == 0) ? 1 : total_size;
+      const int PacketSize = Eigen::PacketType<typename Evaluator::CoeffReturnType, Eigen::SyclDevice>::size;
+      Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
+      dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
+      range = total_size;
+
+      dev.template nullary_kernel_launcher<typename Evaluator::CoeffReturnType, ExecExprFunctorKernel<Evaluator> >(
+             evaluator, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
+             range)
+          .wait();
+    }
+    evaluator.cleanup();
+  }
+};
+
+#endif
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
new file mode 100644
index 00000000..a0e558ba
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -0,0 +1,338 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename NullaryOp, typename XprType>
+struct traits<TensorCwiseNullaryOp<NullaryOp, XprType> > : traits<XprType> {
+  typedef traits<XprType> XprTraits;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::Nested XprTypeNested;
+  typedef std::remove_reference_t<XprTypeNested> XprTypeNested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+  enum { Flags = 0 };
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor nullary expression.
+ *
+ * The TensorCwiseNullaryOp class applies a nullary operators to an expression.
+ * This is typically used to generate constants.
+ */
+template <typename NullaryOp, typename XprType>
+class TensorCwiseNullaryOp : public TensorBase<TensorCwiseNullaryOp<NullaryOp, XprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef TensorCwiseNullaryOp<NullaryOp, XprType> Nested;
+  typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorCwiseNullaryOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseNullaryOp(const XprType& xpr, const NullaryOp& func = NullaryOp())
+      : m_xpr(xpr), m_functor(func) {}
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& nestedExpression() const { return m_xpr; }
+
+  EIGEN_DEVICE_FUNC const NullaryOp& functor() const { return m_functor; }
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const NullaryOp m_functor;
+};
+
+namespace internal {
+template <typename UnaryOp, typename XprType>
+struct traits<TensorCwiseUnaryOp<UnaryOp, XprType> > : traits<XprType> {
+  // TODO(phli): Add InputScalar, InputPacket.  Check references to
+  // current Scalar/Packet to see if the intent is Input or Output.
+  typedef typename result_of<UnaryOp(typename XprType::Scalar)>::type Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprType::Nested XprTypeNested;
+  typedef std::remove_reference_t<XprTypeNested> XprTypeNested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename TypeConversion<Scalar, typename XprTraits::PointerType>::type PointerType;
+};
+
+template <typename UnaryOp, typename XprType>
+struct eval<TensorCwiseUnaryOp<UnaryOp, XprType>, Eigen::Dense> {
+  typedef const TensorCwiseUnaryOp<UnaryOp, XprType>& type;
+};
+
+template <typename UnaryOp, typename XprType>
+struct nested<TensorCwiseUnaryOp<UnaryOp, XprType>, 1, typename eval<TensorCwiseUnaryOp<UnaryOp, XprType> >::type> {
+  typedef TensorCwiseUnaryOp<UnaryOp, XprType> type;
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor unary expression.
+ *
+ * The TensorCwiseUnaryOp class represents an expression where a unary operator
+ * (e.g. cwiseSqrt) is applied to an expression.
+ */
+template <typename UnaryOp, typename XprType>
+class TensorCwiseUnaryOp : public TensorBase<TensorCwiseUnaryOp<UnaryOp, XprType>, ReadOnlyAccessors> {
+ public:
+  // TODO(phli): Add InputScalar, InputPacket.  Check references to
+  // current Scalar/Packet to see if the intent is Input or Output.
+  typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef Scalar CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorCwiseUnaryOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorCwiseUnaryOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseUnaryOp(const XprType& xpr, const UnaryOp& func = UnaryOp())
+      : m_xpr(xpr), m_functor(func) {}
+
+  EIGEN_DEVICE_FUNC const UnaryOp& functor() const { return m_functor; }
+
+  /** \returns the nested expression */
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& nestedExpression() const { return m_xpr; }
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const UnaryOp m_functor;
+};
+
+namespace internal {
+template <typename BinaryOp, typename LhsXprType, typename RhsXprType>
+struct traits<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
+  // Type promotion to handle the case where the types of the lhs and the rhs
+  // are different.
+  // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket.  Check references to
+  // current Scalar/Packet to see if the intent is Inputs or Output.
+  typedef typename result_of<BinaryOp(typename LhsXprType::Scalar, typename RhsXprType::Scalar)>::type Scalar;
+  typedef traits<LhsXprType> XprTraits;
+  typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
+                                        typename traits<RhsXprType>::StorageKind>::ret StorageKind;
+  typedef
+      typename promote_index_type<typename traits<LhsXprType>::Index, typename traits<RhsXprType>::Index>::type Index;
+  typedef typename LhsXprType::Nested LhsNested;
+  typedef typename RhsXprType::Nested RhsNested;
+  typedef std::remove_reference_t<LhsNested> LhsNested_;
+  typedef std::remove_reference_t<RhsNested> RhsNested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename TypeConversion<Scalar,
+                                  std::conditional_t<Pointer_type_promotion<typename LhsXprType::Scalar, Scalar>::val,
+                                                     typename traits<LhsXprType>::PointerType,
+                                                     typename traits<RhsXprType>::PointerType> >::type PointerType;
+  enum { Flags = 0 };
+};
+
+template <typename BinaryOp, typename LhsXprType, typename RhsXprType>
+struct eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, Eigen::Dense> {
+  typedef const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>& type;
+};
+
+template <typename BinaryOp, typename LhsXprType, typename RhsXprType>
+struct nested<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, 1,
+              typename eval<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> >::type> {
+  typedef TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> type;
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor binary expression.
+ *
+ * The TensorCwiseBinaryOp class represents an expression where a binary
+ * operator (e.g. addition) is applied to a lhs and a rhs expression.
+ */
+template <typename BinaryOp, typename LhsXprType, typename RhsXprType>
+class TensorCwiseBinaryOp
+    : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, ReadOnlyAccessors> {
+ public:
+  // TODO(phli): Add Lhs/RhsScalar, Lhs/RhsPacket.  Check references to
+  // current Scalar/Packet to see if the intent is Inputs or Output.
+  typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef Scalar CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorCwiseBinaryOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorCwiseBinaryOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseBinaryOp(const LhsXprType& lhs, const RhsXprType& rhs,
+                                                            const BinaryOp& func = BinaryOp())
+      : m_lhs_xpr(lhs), m_rhs_xpr(rhs), m_functor(func) {}
+
+  EIGEN_DEVICE_FUNC const BinaryOp& functor() const { return m_functor; }
+
+  /** \returns the nested expressions */
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename LhsXprType::Nested>& lhsExpression() const {
+    return m_lhs_xpr;
+  }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename RhsXprType::Nested>& rhsExpression() const {
+    return m_rhs_xpr;
+  }
+
+ protected:
+  typename LhsXprType::Nested m_lhs_xpr;
+  typename RhsXprType::Nested m_rhs_xpr;
+  const BinaryOp m_functor;
+};
+
+namespace internal {
+template <typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> > {
+  // Type promotion to handle the case where the types of the args are different.
+  typedef typename result_of<TernaryOp(typename Arg1XprType::Scalar, typename Arg2XprType::Scalar,
+                                       typename Arg3XprType::Scalar)>::type Scalar;
+  typedef traits<Arg1XprType> XprTraits;
+  typedef typename traits<Arg1XprType>::StorageKind StorageKind;
+  typedef typename traits<Arg1XprType>::Index Index;
+  typedef typename Arg1XprType::Nested Arg1Nested;
+  typedef typename Arg2XprType::Nested Arg2Nested;
+  typedef typename Arg3XprType::Nested Arg3Nested;
+  typedef std::remove_reference_t<Arg1Nested> Arg1Nested_;
+  typedef std::remove_reference_t<Arg2Nested> Arg2Nested_;
+  typedef std::remove_reference_t<Arg3Nested> Arg3Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename TypeConversion<Scalar,
+                                  std::conditional_t<Pointer_type_promotion<typename Arg2XprType::Scalar, Scalar>::val,
+                                                     typename traits<Arg2XprType>::PointerType,
+                                                     typename traits<Arg3XprType>::PointerType> >::type PointerType;
+  enum { Flags = 0 };
+};
+
+template <typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, Eigen::Dense> {
+  typedef const TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>& type;
+};
+
+template <typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct nested<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, 1,
+              typename eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >::type> {
+  typedef TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> type;
+};
+
+}  // end namespace internal
+
+template <typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+class TensorCwiseTernaryOp
+    : public TensorBase<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef Scalar CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorCwiseTernaryOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2,
+                                                             const Arg3XprType& arg3,
+                                                             const TernaryOp& func = TernaryOp())
+      : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {}
+
+  EIGEN_DEVICE_FUNC const TernaryOp& functor() const { return m_functor; }
+
+  /** \returns the nested expressions */
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename Arg1XprType::Nested>& arg1Expression() const {
+    return m_arg1_xpr;
+  }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename Arg2XprType::Nested>& arg2Expression() const {
+    return m_arg2_xpr;
+  }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename Arg3XprType::Nested>& arg3Expression() const {
+    return m_arg3_xpr;
+  }
+
+ protected:
+  typename Arg1XprType::Nested m_arg1_xpr;
+  typename Arg2XprType::Nested m_arg2_xpr;
+  typename Arg3XprType::Nested m_arg3_xpr;
+  const TernaryOp m_functor;
+};
+
+namespace internal {
+template <typename IfXprType, typename ThenXprType, typename ElseXprType>
+struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> > : traits<ThenXprType> {
+  typedef typename traits<ThenXprType>::Scalar Scalar;
+  typedef traits<ThenXprType> XprTraits;
+  typedef typename promote_storage_type<typename traits<ThenXprType>::StorageKind,
+                                        typename traits<ElseXprType>::StorageKind>::ret StorageKind;
+  typedef
+      typename promote_index_type<typename traits<ElseXprType>::Index, typename traits<ThenXprType>::Index>::type Index;
+  typedef typename IfXprType::Nested IfNested;
+  typedef typename ThenXprType::Nested ThenNested;
+  typedef typename ElseXprType::Nested ElseNested;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef std::conditional_t<Pointer_type_promotion<typename ThenXprType::Scalar, Scalar>::val,
+                             typename traits<ThenXprType>::PointerType, typename traits<ElseXprType>::PointerType>
+      PointerType;
+};
+
+template <typename IfXprType, typename ThenXprType, typename ElseXprType>
+struct eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, Eigen::Dense> {
+  typedef const TensorSelectOp<IfXprType, ThenXprType, ElseXprType>& type;
+};
+
+template <typename IfXprType, typename ThenXprType, typename ElseXprType>
+struct nested<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, 1,
+              typename eval<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >::type> {
+  typedef TensorSelectOp<IfXprType, ThenXprType, ElseXprType> type;
+};
+
+}  // end namespace internal
+
+template <typename IfXprType, typename ThenXprType, typename ElseXprType>
+class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename internal::promote_storage_type<typename ThenXprType::CoeffReturnType,
+                                                  typename ElseXprType::CoeffReturnType>::ret CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorSelectOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorSelectOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorSelectOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC TensorSelectOp(const IfXprType& a_condition, const ThenXprType& a_then, const ElseXprType& a_else)
+      : m_condition(a_condition), m_then(a_then), m_else(a_else) {}
+
+  EIGEN_DEVICE_FUNC const IfXprType& ifExpression() const { return m_condition; }
+
+  EIGEN_DEVICE_FUNC const ThenXprType& thenExpression() const { return m_then; }
+
+  EIGEN_DEVICE_FUNC const ElseXprType& elseExpression() const { return m_else; }
+
+ protected:
+  typename IfXprType::Nested m_condition;
+  typename ThenXprType::Nested m_then;
+  typename ElseXprType::Nested m_else;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_EXPR_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
new file mode 100644
index 00000000..b9d6f376
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -0,0 +1,666 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Jianwei Cui <thucjw@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FFT_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FFT_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+template <bool IsReal>
+struct MakeComplex {
+  template <typename T>
+  EIGEN_DEVICE_FUNC T operator()(const T& val) const {
+    return val;
+  }
+};
+
+template <>
+struct MakeComplex<true> {
+  template <typename T>
+  EIGEN_DEVICE_FUNC internal::make_complex_t<T> operator()(const T& val) const {
+    return internal::make_complex_t<T>(val, T(0));
+  }
+};
+
+template <int ResultType>
+struct PartOf {
+  template <typename T>
+  T operator()(const T& val) const {
+    return val;
+  }
+};
+
+template <>
+struct PartOf<RealPart> {
+  template <typename T, typename EnableIf = std::enable_if_t<NumTraits<T>::IsComplex>>
+  typename NumTraits<T>::Real operator()(const T& val) const {
+    return Eigen::numext::real(val);
+  }
+};
+
+template <>
+struct PartOf<ImagPart> {
+  template <typename T, typename EnableIf = std::enable_if_t<NumTraits<T>::IsComplex>>
+  typename NumTraits<T>::Real operator()(const T& val) const {
+    return Eigen::numext::imag(val);
+  }
+};
+
+namespace internal {
+template <typename FFT, typename XprType, int FFTResultType, int FFTDir>
+struct traits<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir> > : public traits<XprType> {
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef make_complex_t<Scalar> ComplexScalar;
+  typedef typename XprTraits::Scalar InputScalar;
+  typedef std::conditional_t<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>
+      OutputScalar;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename traits<XprType>::PointerType PointerType;
+};
+
+template <typename FFT, typename XprType, int FFTResultType, int FFTDirection>
+struct eval<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>, Eigen::Dense> {
+  typedef const TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>& type;
+};
+
+template <typename FFT, typename XprType, int FFTResultType, int FFTDirection>
+struct nested<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection>, 1,
+              typename eval<TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection> >::type> {
+  typedef TensorFFTOp<FFT, XprType, FFTResultType, FFTDirection> type;
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor FFT class.
+ *
+ * TODO:
+ * Vectorize the Cooley Tukey and the Bluestein algorithm
+ * Add support for multithreaded evaluation
+ * Improve the performance on GPU
+ */
+template <typename FFT, typename XprType, int FFTResultType, int FFTDir>
+class TensorFFTOp : public TensorBase<TensorFFTOp<FFT, XprType, FFTResultType, FFTDir>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorFFTOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef internal::make_complex_t<Scalar> ComplexScalar;
+  typedef std::conditional_t<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>
+      OutputScalar;
+  typedef OutputScalar CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorFFTOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorFFTOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorFFTOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFFTOp(const XprType& expr, const FFT& fft) : m_xpr(expr), m_fft(fft) {}
+
+  EIGEN_DEVICE_FUNC const FFT& fft() const { return m_fft; }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const FFT m_fft;
+};
+
+// Eval as rvalue
+template <typename FFT, typename ArgType, typename Device, int FFTResultType, int FFTDir>
+struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device> {
+  typedef TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir> XprType;
+  typedef typename XprType::Index Index;
+  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef internal::make_complex_t<Scalar> ComplexScalar;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
+  typedef internal::traits<XprType> XprTraits;
+  typedef typename XprTraits::Scalar InputScalar;
+  typedef std::conditional_t<FFTResultType == RealPart || FFTResultType == ImagPart, RealScalar, ComplexScalar>
+      OutputScalar;
+  typedef OutputScalar CoeffReturnType;
+  typedef typename PacketType<OutputScalar, Device>::type PacketReturnType;
+  static constexpr int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess = true,
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    CoordAccess = false,
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_fft(op.fft()), m_impl(op.expression(), device), m_data(NULL), m_device(device) {
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    for (int i = 0; i < NumDims; ++i) {
+      eigen_assert(input_dims[i] > 0);
+      m_dimensions[i] = input_dims[i];
+    }
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_strides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
+      }
+    } else {
+      m_strides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
+      }
+    }
+    m_size = m_dimensions.TotalSize();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    if (data) {
+      evalToBuf(data);
+      return false;
+    } else {
+      m_data = (EvaluatorPointerType)m_device.get(
+          (CoeffReturnType*)(m_device.allocate_temp(sizeof(CoeffReturnType) * m_size)));
+      evalToBuf(m_data);
+      return true;
+    }
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    if (m_data) {
+      m_device.deallocate(m_data);
+      m_data = NULL;
+    }
+    m_impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const { return m_data[index]; }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_data + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_data; }
+
+ private:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalToBuf(EvaluatorPointerType data) {
+    const bool write_to_out = internal::is_same<OutputScalar, ComplexScalar>::value;
+    ComplexScalar* buf =
+        write_to_out ? (ComplexScalar*)data : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * m_size);
+
+    for (Index i = 0; i < m_size; ++i) {
+      buf[i] = MakeComplex<internal::is_same<InputScalar, RealScalar>::value>()(m_impl.coeff(i));
+    }
+
+    for (size_t i = 0; i < m_fft.size(); ++i) {
+      Index dim = m_fft[i];
+      eigen_assert(dim >= 0 && dim < NumDims);
+      Index line_len = m_dimensions[dim];
+      eigen_assert(line_len >= 1);
+      ComplexScalar* line_buf = (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * line_len);
+      const bool is_power_of_two = isPowerOfTwo(line_len);
+      const Index good_composite = is_power_of_two ? 0 : findGoodComposite(line_len);
+      const Index log_len = is_power_of_two ? getLog2(line_len) : getLog2(good_composite);
+
+      ComplexScalar* a =
+          is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite);
+      ComplexScalar* b =
+          is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * good_composite);
+      ComplexScalar* pos_j_base_powered =
+          is_power_of_two ? NULL : (ComplexScalar*)m_device.allocate(sizeof(ComplexScalar) * (line_len + 1));
+      if (!is_power_of_two) {
+        // Compute twiddle factors
+        //   t_n = exp(sqrt(-1) * pi * n^2 / line_len)
+        // for n = 0, 1,..., line_len-1.
+        // For n > 2 we use the recurrence t_n = t_{n-1}^2 / t_{n-2} * t_1^2
+
+        // The recurrence is correct in exact arithmetic, but causes
+        // numerical issues for large transforms, especially in
+        // single-precision floating point.
+        //
+        // pos_j_base_powered[0] = ComplexScalar(1, 0);
+        // if (line_len > 1) {
+        //   const ComplexScalar pos_j_base = ComplexScalar(
+        //       numext::cos(EIGEN_PI / line_len), numext::sin(EIGEN_PI / line_len));
+        //   pos_j_base_powered[1] = pos_j_base;
+        //   if (line_len > 2) {
+        //     const ComplexScalar pos_j_base_sq = pos_j_base * pos_j_base;
+        //     for (int i = 2; i < line_len + 1; ++i) {
+        //       pos_j_base_powered[i] = pos_j_base_powered[i - 1] *
+        //           pos_j_base_powered[i - 1] /
+        //           pos_j_base_powered[i - 2] *
+        //           pos_j_base_sq;
+        //     }
+        //   }
+        // }
+        // TODO(rmlarsen): Find a way to use Eigen's vectorized sin
+        // and cosine functions here.
+        for (int j = 0; j < line_len + 1; ++j) {
+          double arg = ((EIGEN_PI * j) * j) / line_len;
+          std::complex<double> tmp(numext::cos(arg), numext::sin(arg));
+          pos_j_base_powered[j] = static_cast<ComplexScalar>(tmp);
+        }
+      }
+
+      for (Index partial_index = 0; partial_index < m_size / line_len; ++partial_index) {
+        const Index base_offset = getBaseOffsetFromIndex(partial_index, dim);
+
+        // get data into line_buf
+        const Index stride = m_strides[dim];
+        if (stride == 1) {
+          m_device.memcpy(line_buf, &buf[base_offset], line_len * sizeof(ComplexScalar));
+        } else {
+          Index offset = base_offset;
+          for (int j = 0; j < line_len; ++j, offset += stride) {
+            line_buf[j] = buf[offset];
+          }
+        }
+
+        // process the line
+        if (is_power_of_two) {
+          processDataLineCooleyTukey(line_buf, line_len, log_len);
+        } else {
+          processDataLineBluestein(line_buf, line_len, good_composite, log_len, a, b, pos_j_base_powered);
+        }
+
+        // write back
+        if (FFTDir == FFT_FORWARD && stride == 1) {
+          m_device.memcpy(&buf[base_offset], line_buf, line_len * sizeof(ComplexScalar));
+        } else {
+          Index offset = base_offset;
+          const ComplexScalar div_factor = ComplexScalar(1.0 / line_len, 0);
+          for (int j = 0; j < line_len; ++j, offset += stride) {
+            buf[offset] = (FFTDir == FFT_FORWARD) ? line_buf[j] : line_buf[j] * div_factor;
+          }
+        }
+      }
+      m_device.deallocate(line_buf);
+      if (!is_power_of_two) {
+        m_device.deallocate(a);
+        m_device.deallocate(b);
+        m_device.deallocate(pos_j_base_powered);
+      }
+    }
+
+    if (!write_to_out) {
+      for (Index i = 0; i < m_size; ++i) {
+        data[i] = PartOf<FFTResultType>()(buf[i]);
+      }
+      m_device.deallocate(buf);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool isPowerOfTwo(Index x) {
+    eigen_assert(x > 0);
+    return !(x & (x - 1));
+  }
+
+  // The composite number for padding, used in Bluestein's FFT algorithm
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index findGoodComposite(Index n) {
+    Index i = 2;
+    while (i < 2 * n - 1) i *= 2;
+    return i;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static Index getLog2(Index m) {
+    Index log2m = 0;
+    while (m >>= 1) log2m++;
+    return log2m;
+  }
+
+  // Call Cooley Tukey algorithm directly, data length must be power of 2
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineCooleyTukey(ComplexScalar* line_buf, Index line_len,
+                                                                        Index log_len) {
+    eigen_assert(isPowerOfTwo(line_len));
+    scramble_FFT(line_buf, line_len);
+    compute_1D_Butterfly<FFTDir>(line_buf, line_len, log_len);
+  }
+
+  // Call Bluestein's FFT algorithm, m is a good composite number greater than (2 * n - 1), used as the padding length
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void processDataLineBluestein(ComplexScalar* line_buf, Index line_len,
+                                                                      Index good_composite, Index log_len,
+                                                                      ComplexScalar* a, ComplexScalar* b,
+                                                                      const ComplexScalar* pos_j_base_powered) {
+    Index n = line_len;
+    Index m = good_composite;
+    ComplexScalar* data = line_buf;
+
+    for (Index i = 0; i < n; ++i) {
+      if (FFTDir == FFT_FORWARD) {
+        a[i] = data[i] * numext::conj(pos_j_base_powered[i]);
+      } else {
+        a[i] = data[i] * pos_j_base_powered[i];
+      }
+    }
+    for (Index i = n; i < m; ++i) {
+      a[i] = ComplexScalar(0, 0);
+    }
+
+    for (Index i = 0; i < n; ++i) {
+      if (FFTDir == FFT_FORWARD) {
+        b[i] = pos_j_base_powered[i];
+      } else {
+        b[i] = numext::conj(pos_j_base_powered[i]);
+      }
+    }
+    for (Index i = n; i < m - n; ++i) {
+      b[i] = ComplexScalar(0, 0);
+    }
+    for (Index i = m - n; i < m; ++i) {
+      if (FFTDir == FFT_FORWARD) {
+        b[i] = pos_j_base_powered[m - i];
+      } else {
+        b[i] = numext::conj(pos_j_base_powered[m - i]);
+      }
+    }
+
+    scramble_FFT(a, m);
+    compute_1D_Butterfly<FFT_FORWARD>(a, m, log_len);
+
+    scramble_FFT(b, m);
+    compute_1D_Butterfly<FFT_FORWARD>(b, m, log_len);
+
+    for (Index i = 0; i < m; ++i) {
+      a[i] *= b[i];
+    }
+
+    scramble_FFT(a, m);
+    compute_1D_Butterfly<FFT_REVERSE>(a, m, log_len);
+
+    // Do the scaling after ifft
+    for (Index i = 0; i < m; ++i) {
+      a[i] /= m;
+    }
+
+    for (Index i = 0; i < n; ++i) {
+      if (FFTDir == FFT_FORWARD) {
+        data[i] = a[i] * numext::conj(pos_j_base_powered[i]);
+      } else {
+        data[i] = a[i] * pos_j_base_powered[i];
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static void scramble_FFT(ComplexScalar* data, Index n) {
+    eigen_assert(isPowerOfTwo(n));
+    Index j = 1;
+    for (Index i = 1; i < n; ++i) {
+      if (j > i) {
+        std::swap(data[j - 1], data[i - 1]);
+      }
+      Index m = n >> 1;
+      while (m >= 2 && j > m) {
+        j -= m;
+        m >>= 1;
+      }
+      j += m;
+    }
+  }
+
+  template <int Dir>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_2(ComplexScalar* data) {
+    ComplexScalar tmp = data[1];
+    data[1] = data[0] - data[1];
+    data[0] += tmp;
+  }
+
+  template <int Dir>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_4(ComplexScalar* data) {
+    ComplexScalar tmp[4];
+    tmp[0] = data[0] + data[1];
+    tmp[1] = data[0] - data[1];
+    tmp[2] = data[2] + data[3];
+    if (Dir == FFT_FORWARD) {
+      tmp[3] = ComplexScalar(0.0, -1.0) * (data[2] - data[3]);
+    } else {
+      tmp[3] = ComplexScalar(0.0, 1.0) * (data[2] - data[3]);
+    }
+    data[0] = tmp[0] + tmp[2];
+    data[1] = tmp[1] + tmp[3];
+    data[2] = tmp[0] - tmp[2];
+    data[3] = tmp[1] - tmp[3];
+  }
+
+  template <int Dir>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_8(ComplexScalar* data) {
+    ComplexScalar tmp_1[8];
+    ComplexScalar tmp_2[8];
+
+    tmp_1[0] = data[0] + data[1];
+    tmp_1[1] = data[0] - data[1];
+    tmp_1[2] = data[2] + data[3];
+    if (Dir == FFT_FORWARD) {
+      tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, -1);
+    } else {
+      tmp_1[3] = (data[2] - data[3]) * ComplexScalar(0, 1);
+    }
+    tmp_1[4] = data[4] + data[5];
+    tmp_1[5] = data[4] - data[5];
+    tmp_1[6] = data[6] + data[7];
+    if (Dir == FFT_FORWARD) {
+      tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, -1);
+    } else {
+      tmp_1[7] = (data[6] - data[7]) * ComplexScalar(0, 1);
+    }
+    tmp_2[0] = tmp_1[0] + tmp_1[2];
+    tmp_2[1] = tmp_1[1] + tmp_1[3];
+    tmp_2[2] = tmp_1[0] - tmp_1[2];
+    tmp_2[3] = tmp_1[1] - tmp_1[3];
+    tmp_2[4] = tmp_1[4] + tmp_1[6];
+// SQRT2DIV2 = sqrt(2)/2
+#define SQRT2DIV2 0.7071067811865476
+    if (Dir == FFT_FORWARD) {
+      tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, -SQRT2DIV2);
+      tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, -1);
+      tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, -SQRT2DIV2);
+    } else {
+      tmp_2[5] = (tmp_1[5] + tmp_1[7]) * ComplexScalar(SQRT2DIV2, SQRT2DIV2);
+      tmp_2[6] = (tmp_1[4] - tmp_1[6]) * ComplexScalar(0, 1);
+      tmp_2[7] = (tmp_1[5] - tmp_1[7]) * ComplexScalar(-SQRT2DIV2, SQRT2DIV2);
+    }
+    data[0] = tmp_2[0] + tmp_2[4];
+    data[1] = tmp_2[1] + tmp_2[5];
+    data[2] = tmp_2[2] + tmp_2[6];
+    data[3] = tmp_2[3] + tmp_2[7];
+    data[4] = tmp_2[0] - tmp_2[4];
+    data[5] = tmp_2[1] - tmp_2[5];
+    data[6] = tmp_2[2] - tmp_2[6];
+    data[7] = tmp_2[3] - tmp_2[7];
+  }
+
+  template <int Dir>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void butterfly_1D_merge(ComplexScalar* data, Index n, Index n_power_of_2) {
+    // Original code:
+    // RealScalar wtemp = std::sin(EIGEN_PI/n);
+    // RealScalar wpi =  -std::sin(2 * EIGEN_PI/n);
+    const RealScalar wtemp = m_sin_PI_div_n_LUT[n_power_of_2];
+    const RealScalar wpi =
+        (Dir == FFT_FORWARD) ? m_minus_sin_2_PI_div_n_LUT[n_power_of_2] : -m_minus_sin_2_PI_div_n_LUT[n_power_of_2];
+
+    const ComplexScalar wp(wtemp, wpi);
+    const ComplexScalar wp_one = wp + ComplexScalar(1, 0);
+    const ComplexScalar wp_one_2 = wp_one * wp_one;
+    const ComplexScalar wp_one_3 = wp_one_2 * wp_one;
+    const ComplexScalar wp_one_4 = wp_one_3 * wp_one;
+    const Index n2 = n / 2;
+    ComplexScalar w(1.0, 0.0);
+    for (Index i = 0; i < n2; i += 4) {
+      ComplexScalar temp0(data[i + n2] * w);
+      ComplexScalar temp1(data[i + 1 + n2] * w * wp_one);
+      ComplexScalar temp2(data[i + 2 + n2] * w * wp_one_2);
+      ComplexScalar temp3(data[i + 3 + n2] * w * wp_one_3);
+      w = w * wp_one_4;
+
+      data[i + n2] = data[i] - temp0;
+      data[i] += temp0;
+
+      data[i + 1 + n2] = data[i + 1] - temp1;
+      data[i + 1] += temp1;
+
+      data[i + 2 + n2] = data[i + 2] - temp2;
+      data[i + 2] += temp2;
+
+      data[i + 3 + n2] = data[i + 3] - temp3;
+      data[i + 3] += temp3;
+    }
+  }
+
+  template <int Dir>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void compute_1D_Butterfly(ComplexScalar* data, Index n, Index n_power_of_2) {
+    eigen_assert(isPowerOfTwo(n));
+    if (n > 8) {
+      compute_1D_Butterfly<Dir>(data, n / 2, n_power_of_2 - 1);
+      compute_1D_Butterfly<Dir>(data + n / 2, n / 2, n_power_of_2 - 1);
+      butterfly_1D_merge<Dir>(data, n, n_power_of_2);
+    } else if (n == 8) {
+      butterfly_8<Dir>(data);
+    } else if (n == 4) {
+      butterfly_4<Dir>(data);
+    } else if (n == 2) {
+      butterfly_2<Dir>(data);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getBaseOffsetFromIndex(Index index, Index omitted_dim) const {
+    Index result = 0;
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > omitted_dim; --i) {
+        const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim];
+        const Index idx = index / partial_m_stride;
+        index -= idx * partial_m_stride;
+        result += idx * m_strides[i];
+      }
+      result += index;
+    } else {
+      for (Index i = 0; i < omitted_dim; ++i) {
+        const Index partial_m_stride = m_strides[i] / m_dimensions[omitted_dim];
+        const Index idx = index / partial_m_stride;
+        index -= idx * partial_m_stride;
+        result += idx * m_strides[i];
+      }
+      result += index;
+    }
+    // Value of index_coords[omitted_dim] is not determined to this step
+    return result;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index getIndexFromOffset(Index base, Index omitted_dim, Index offset) const {
+    Index result = base + offset * m_strides[omitted_dim];
+    return result;
+  }
+
+ protected:
+  Index m_size;
+  const FFT EIGEN_DEVICE_REF m_fft;
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_strides;
+  TensorEvaluator<ArgType, Device> m_impl;
+  EvaluatorPointerType m_data;
+  const Device EIGEN_DEVICE_REF m_device;
+
+  // This will support a maximum FFT size of 2^32 for each dimension
+  // m_sin_PI_div_n_LUT[i] = (-2) * std::sin(EIGEN_PI / std::pow(2,i)) ^ 2;
+  const RealScalar m_sin_PI_div_n_LUT[32] = {RealScalar(0.0),
+                                             RealScalar(-2),
+                                             RealScalar(-0.999999999999999),
+                                             RealScalar(-0.292893218813453),
+                                             RealScalar(-0.0761204674887130),
+                                             RealScalar(-0.0192147195967696),
+                                             RealScalar(-0.00481527332780311),
+                                             RealScalar(-0.00120454379482761),
+                                             RealScalar(-3.01181303795779e-04),
+                                             RealScalar(-7.52981608554592e-05),
+                                             RealScalar(-1.88247173988574e-05),
+                                             RealScalar(-4.70619042382852e-06),
+                                             RealScalar(-1.17654829809007e-06),
+                                             RealScalar(-2.94137117780840e-07),
+                                             RealScalar(-7.35342821488550e-08),
+                                             RealScalar(-1.83835707061916e-08),
+                                             RealScalar(-4.59589268710903e-09),
+                                             RealScalar(-1.14897317243732e-09),
+                                             RealScalar(-2.87243293150586e-10),
+                                             RealScalar(-7.18108232902250e-11),
+                                             RealScalar(-1.79527058227174e-11),
+                                             RealScalar(-4.48817645568941e-12),
+                                             RealScalar(-1.12204411392298e-12),
+                                             RealScalar(-2.80511028480785e-13),
+                                             RealScalar(-7.01277571201985e-14),
+                                             RealScalar(-1.75319392800498e-14),
+                                             RealScalar(-4.38298482001247e-15),
+                                             RealScalar(-1.09574620500312e-15),
+                                             RealScalar(-2.73936551250781e-16),
+                                             RealScalar(-6.84841378126949e-17),
+                                             RealScalar(-1.71210344531737e-17),
+                                             RealScalar(-4.28025861329343e-18)};
+
+  // m_minus_sin_2_PI_div_n_LUT[i] = -std::sin(2 * EIGEN_PI / std::pow(2,i));
+  const RealScalar m_minus_sin_2_PI_div_n_LUT[32] = {RealScalar(0.0),
+                                                     RealScalar(0.0),
+                                                     RealScalar(-1.00000000000000e+00),
+                                                     RealScalar(-7.07106781186547e-01),
+                                                     RealScalar(-3.82683432365090e-01),
+                                                     RealScalar(-1.95090322016128e-01),
+                                                     RealScalar(-9.80171403295606e-02),
+                                                     RealScalar(-4.90676743274180e-02),
+                                                     RealScalar(-2.45412285229123e-02),
+                                                     RealScalar(-1.22715382857199e-02),
+                                                     RealScalar(-6.13588464915448e-03),
+                                                     RealScalar(-3.06795676296598e-03),
+                                                     RealScalar(-1.53398018628477e-03),
+                                                     RealScalar(-7.66990318742704e-04),
+                                                     RealScalar(-3.83495187571396e-04),
+                                                     RealScalar(-1.91747597310703e-04),
+                                                     RealScalar(-9.58737990959773e-05),
+                                                     RealScalar(-4.79368996030669e-05),
+                                                     RealScalar(-2.39684498084182e-05),
+                                                     RealScalar(-1.19842249050697e-05),
+                                                     RealScalar(-5.99211245264243e-06),
+                                                     RealScalar(-2.99605622633466e-06),
+                                                     RealScalar(-1.49802811316901e-06),
+                                                     RealScalar(-7.49014056584716e-07),
+                                                     RealScalar(-3.74507028292384e-07),
+                                                     RealScalar(-1.87253514146195e-07),
+                                                     RealScalar(-9.36267570730981e-08),
+                                                     RealScalar(-4.68133785365491e-08),
+                                                     RealScalar(-2.34066892682746e-08),
+                                                     RealScalar(-1.17033446341373e-08),
+                                                     RealScalar(-5.85167231706864e-09),
+                                                     RealScalar(-2.92583615853432e-09)};
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_FFT_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
new file mode 100644
index 00000000..753a25a8
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h
@@ -0,0 +1,225 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief The fixed sized version of the tensor class.
+ *
+ * The fixed sized equivalent of
+ * Eigen::Tensor<float, 3> t(3, 5, 7);
+ * is
+ * Eigen::TensorFixedSize<float, Sizes<3,5,7>> t;
+ */
+template <typename Scalar_, typename Dimensions_, int Options_, typename IndexType>
+class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> > {
+ public:
+  typedef TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> Self;
+  typedef TensorBase<TensorFixedSize<Scalar_, Dimensions_, Options_, IndexType> > Base;
+  typedef typename Eigen::internal::nested<Self>::type Nested;
+  typedef typename internal::traits<Self>::StorageKind StorageKind;
+  typedef typename internal::traits<Self>::Index Index;
+  typedef Scalar_ Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef typename Base::CoeffReturnType CoeffReturnType;
+
+  static constexpr int Options = Options_;
+  static constexpr int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
+
+  enum {
+    IsAligned = bool(EIGEN_MAX_ALIGN_BYTES > 0),
+    PacketAccess = (internal::packet_traits<Scalar>::size > 1),
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    CoordAccess = true,
+    RawAccess = true
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  typedef Dimensions_ Dimensions;
+  static constexpr std::size_t NumIndices = Dimensions::count;
+
+ protected:
+  TensorStorage<Scalar, Dimensions, Options> m_storage;
+
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return NumIndices; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(std::size_t n) const { return m_storage.dimensions()[n]; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions dimensions() const { return m_storage.dimensions(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_storage.size(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() { return m_storage.data(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar* data() const { return m_storage.data(); }
+
+  // This makes EIGEN_INITIALIZE_COEFFS_IF_THAT_OPTION_IS_ENABLED
+  // work, because that uses base().coeffRef() - and we don't yet
+  // implement a similar class hierarchy
+  inline Self& base() { return *this; }
+  inline const Self& base() const { return *this; }
+
+  template <typename... IndexTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const {
+    // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return coeff(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(const array<Index, NumIndices>& indices) const {
+    eigen_internal_assert(checkIndexRange(indices));
+    return m_storage.data()[linearizedIndex(indices)];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index index) const {
+    eigen_internal_assert(index >= 0 && index < size());
+    return m_storage.data()[index];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff() const {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return m_storage.data()[0];
+  }
+
+  template <typename... IndexTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) {
+    // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return coeffRef(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) {
+    eigen_internal_assert(checkIndexRange(indices));
+    return m_storage.data()[linearizedIndex(indices)];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) {
+    eigen_internal_assert(index >= 0 && index < size());
+    return m_storage.data()[index];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef() {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return m_storage.data()[0];
+  }
+
+  template <typename... IndexTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const {
+    // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return this->operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(const array<Index, NumIndices>& indices) const {
+    eigen_assert(checkIndexRange(indices));
+    return coeff(indices);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index index) const {
+    eigen_internal_assert(index >= 0 && index < size());
+    return coeff(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()() const {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return coeff();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator[](Index index) const {
+    // The bracket operator is only for vectors, use the parenthesis operator instead.
+    EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return coeff(index);
+  }
+
+  template <typename... IndexTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) {
+    // The number of indices used to access a tensor coefficient must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return operator()(array<Index, NumIndices>{{firstIndex, otherIndices...}});
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(const array<Index, NumIndices>& indices) {
+    eigen_assert(checkIndexRange(indices));
+    return coeffRef(indices);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index index) {
+    eigen_assert(index >= 0 && index < size());
+    return coeffRef(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()() {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return coeffRef();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator[](Index index) {
+    // The bracket operator is only for vectors, use the parenthesis operator instead
+    EIGEN_STATIC_ASSERT(NumIndices == 1, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return coeffRef(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize() : m_storage() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(const Self& other) : Base(other), m_storage(other.m_storage) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other) : m_storage(other.m_storage) {}
+
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase<OtherDerived, ReadOnlyAccessors>& other) {
+    typedef TensorAssignOp<TensorFixedSize, const OtherDerived> Assign;
+    Assign assign(*this, other.derived());
+    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+  }
+  template <typename OtherDerived>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(const TensorBase<OtherDerived, WriteAccessors>& other) {
+    typedef TensorAssignOp<TensorFixedSize, const OtherDerived> Assign;
+    Assign assign(*this, other.derived());
+    internal::TensorExecutor<const Assign, DefaultDevice>::run(assign, DefaultDevice());
+  }
+
+  // FIXME: check that the dimensions of other match the dimensions of *this.
+  // Unfortunately this isn't possible yet when the rhs is an expression.
+  EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(TensorFixedSize)
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool checkIndexRange(const array<Index, NumIndices>& /*indices*/) const {
+    using internal::array_apply_and_reduce;
+    using internal::array_zip_and_reduce;
+    using internal::greater_equal_zero_op;
+    using internal::lesser_op;
+    using internal::logical_and_op;
+
+    return true;
+    // check whether the indices are all >= 0
+    /*       array_apply_and_reduce<logical_and_op, greater_equal_zero_op>(indices) &&
+  // check whether the indices fit in the dimensions
+  array_zip_and_reduce<logical_and_op, lesser_op>(indices, m_storage.dimensions());*/
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index linearizedIndex(const array<Index, NumIndices>& indices) const {
+    if (Options & RowMajor) {
+      return m_storage.dimensions().IndexOfRowMajor(indices);
+    } else {
+      return m_storage.dimensions().IndexOfColMajor(indices);
+    }
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_FIXED_SIZE_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
new file mode 100644
index 00000000..dadccb32
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -0,0 +1,231 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include <memory>
+
+namespace Eigen {
+
+namespace internal {
+template <typename XprType>
+struct traits<TensorForcedEvalOp<XprType>> {
+  // Type promotion to handle the case where the types of the lhs and the rhs are different.
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename traits<XprType>::StorageKind StorageKind;
+  typedef typename traits<XprType>::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+
+  enum { Flags = 0 };
+};
+
+template <typename XprType>
+struct eval<TensorForcedEvalOp<XprType>, Eigen::Dense> {
+  typedef const TensorForcedEvalOp<XprType>& type;
+};
+
+template <typename XprType>
+struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<XprType>>::type> {
+  typedef TensorForcedEvalOp<XprType> type;
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor reshaping class.
+ */
+template <typename XprType>
+class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef std::remove_const_t<typename XprType::CoeffReturnType> CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorForcedEvalOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorForcedEvalOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorForcedEvalOp(const XprType& expr) : m_xpr(expr) {}
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+ protected:
+  typename XprType::Nested m_xpr;
+};
+
+namespace internal {
+template <typename Device, typename CoeffReturnType>
+struct non_integral_type_placement_new {
+  template <typename StorageType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index numValues, StorageType m_buffer) {
+    // Initialize non-trivially constructible types.
+    if (!internal::is_arithmetic<CoeffReturnType>::value) {
+      for (Index i = 0; i < numValues; ++i) new (m_buffer + i) CoeffReturnType();
+    }
+  }
+};
+
+// SYCL does not support non-integral types
+// having new (m_buffer + i) CoeffReturnType() causes the following compiler error for SYCL Devices
+// no matching function for call to 'operator new'
+template <typename CoeffReturnType>
+struct non_integral_type_placement_new<Eigen::SyclDevice, CoeffReturnType> {
+  template <typename StorageType>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(Index, StorageType) {}
+};
+}  // end namespace internal
+
+template <typename Device>
+class DeviceTempPointerHolder {
+ public:
+  DeviceTempPointerHolder(const Device& device, size_t size)
+      : device_(device), size_(size), ptr_(device.allocate_temp(size)) {}
+
+  ~DeviceTempPointerHolder() {
+    device_.deallocate_temp(ptr_);
+    size_ = 0;
+    ptr_ = nullptr;
+  }
+
+  void* ptr() { return ptr_; }
+
+ private:
+  Device device_;
+  size_t size_;
+  void* ptr_;
+};
+
+template <typename ArgType_, typename Device>
+struct TensorEvaluator<const TensorForcedEvalOp<ArgType_>, Device> {
+  typedef const internal::remove_all_t<ArgType_> ArgType;
+  typedef TensorForcedEvalOp<ArgType> XprType;
+  typedef typename ArgType::Scalar Scalar;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  enum {
+    IsAligned = true,
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess = internal::is_arithmetic<CoeffReturnType>::value,
+    PreferBlockAccess = false,
+    RawAccess = true
+  };
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  static constexpr int NumDims = internal::traits<ArgType>::NumDimensions;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims, Layout, Index> TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device),
+        m_op(op.expression()),
+        m_device(device),
+        m_buffer_holder(nullptr),
+        m_buffer(nullptr) {}
+
+  ~TensorEvaluator() { cleanup(); }
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+    const Index numValues = internal::array_prod(m_impl.dimensions());
+    m_buffer_holder = std::make_shared<DeviceTempPointerHolder<Device>>(m_device, numValues * sizeof(CoeffReturnType));
+    m_buffer = static_cast<EvaluatorPointerType>(m_buffer_holder->ptr());
+
+    internal::non_integral_type_placement_new<Device, CoeffReturnType>()(numValues, m_buffer);
+
+    typedef TensorEvalToOp<const std::remove_const_t<ArgType>> EvalTo;
+    EvalTo evalToTmp(m_device.get(m_buffer), m_op);
+
+    internal::TensorExecutor<const EvalTo, std::remove_const_t<Device>,
+                             /*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
+                             /*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::run(evalToTmp, m_device);
+
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) {
+    const Index numValues = internal::array_prod(m_impl.dimensions());
+    m_buffer_holder = std::make_shared<DeviceTempPointerHolder<Device>>(m_device, numValues * sizeof(CoeffReturnType));
+    m_buffer = static_cast<EvaluatorPointerType>(m_buffer_holder->ptr());
+
+    typedef TensorEvalToOp<const std::remove_const_t<ArgType>> EvalTo;
+    EvalTo evalToTmp(m_device.get(m_buffer), m_op);
+
+    auto on_done = std::bind([](EvalSubExprsCallback done_) { done_(true); }, std::move(done));
+    internal::TensorAsyncExecutor<
+        const EvalTo, std::remove_const_t<Device>, decltype(on_done),
+        /*Vectorizable=*/internal::IsVectorizable<Device, const ArgType>::value,
+        /*Tiling=*/internal::IsTileable<Device, const ArgType>::value>::runAsync(evalToTmp, m_device,
+                                                                                 std::move(on_done));
+  }
+#endif
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_buffer_holder = nullptr;
+    m_buffer = nullptr;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_buffer[index]; }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_buffer + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    eigen_assert(m_buffer != nullptr);
+    return TensorBlock::materialize(m_buffer, m_impl.dimensions(), desc, scratch);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE EvaluatorPointerType data() const { return m_buffer; }
+
+ private:
+  TensorEvaluator<ArgType, Device> m_impl;
+  const ArgType m_op;
+  const Device EIGEN_DEVICE_REF m_device;
+  std::shared_ptr<DeviceTempPointerHolder<Device>> m_buffer_holder;
+  EvaluatorPointerType m_buffer;  // Cached copy of the value stored in m_buffer_holder.
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_FORCED_EVAL_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
new file mode 100644
index 00000000..49c20a4e
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -0,0 +1,215 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// MakePointer class is used as a container of the address space of the pointer
+// on the host and on the device. From the host side it generates the T* pointer
+// and when EIGEN_USE_SYCL is used it construct a buffer with a map_allocator to
+// T* m_data on the host. It is always called on the device.
+// Specialisation of MakePointer class for creating the sycl buffer with
+// map_allocator.
+template <typename T>
+struct MakePointer {
+  typedef T* Type;
+  typedef const T* ConstType;
+};
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* constCast(const T* data) {
+  return const_cast<T*>(data);
+}
+
+// The StorageMemory class is a container of the device specific pointer
+// used for referring to a Pointer on TensorEvaluator class. While the TensorExpression
+// is a device-agnostic type and need MakePointer class for type conversion,
+// the TensorEvaluator class can be specialized for a device, hence it is possible
+// to construct different types of temporary storage memory in TensorEvaluator
+// for different devices by specializing the following StorageMemory class.
+template <typename T, typename device>
+struct StorageMemory : MakePointer<T> {};
+
+namespace internal {
+template <typename A, typename B>
+struct Pointer_type_promotion {
+  static const bool val = false;
+};
+template <typename A>
+struct Pointer_type_promotion<A, A> {
+  static const bool val = true;
+};
+template <typename A, typename B>
+struct TypeConversion {
+  typedef A* type;
+};
+}  // namespace internal
+
+template <typename PlainObjectType, int Options_ = Unaligned, template <class> class MakePointer_ = MakePointer>
+class TensorMap;
+template <typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex>
+class Tensor;
+template <typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex>
+class TensorFixedSize;
+template <typename PlainObjectType>
+class TensorRef;
+template <typename Derived, int AccessLevel>
+class TensorBase;
+
+template <typename NullaryOp, typename PlainObjectType>
+class TensorCwiseNullaryOp;
+template <typename UnaryOp, typename XprType>
+class TensorCwiseUnaryOp;
+template <typename BinaryOp, typename LeftXprType, typename RightXprType>
+class TensorCwiseBinaryOp;
+template <typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+class TensorCwiseTernaryOp;
+template <typename IfXprType, typename ThenXprType, typename ElseXprType>
+class TensorSelectOp;
+template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_ = MakePointer>
+class TensorReductionOp;
+template <typename XprType>
+class TensorIndexPairOp;
+template <typename ReduceOp, typename Dims, typename XprType>
+class TensorPairReducerOp;
+template <typename Axis, typename LeftXprType, typename RightXprType>
+class TensorConcatenationOp;
+template <typename Dimensions, typename LeftXprType, typename RightXprType, typename OutputKernelType>
+class TensorContractionOp;
+template <typename TargetType, typename XprType>
+class TensorConversionOp;
+template <typename Dimensions, typename InputXprType, typename KernelXprType>
+class TensorConvolutionOp;
+template <typename FFT, typename XprType, int FFTDataType, int FFTDirection>
+class TensorFFTOp;
+template <typename PatchDim, typename XprType>
+class TensorPatchOp;
+template <DenseIndex Rows, DenseIndex Cols, typename XprType>
+class TensorImagePatchOp;
+template <DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
+class TensorVolumePatchOp;
+template <typename Broadcast, typename XprType>
+class TensorBroadcastingOp;
+template <DenseIndex DimId, typename XprType>
+class TensorChippingOp;
+template <typename NewDimensions, typename XprType>
+class TensorReshapingOp;
+template <typename XprType>
+class TensorLayoutSwapOp;
+template <typename StartIndices, typename Sizes, typename XprType>
+class TensorSlicingOp;
+template <typename ReverseDimensions, typename XprType>
+class TensorReverseOp;
+template <typename Rolls, typename XprType>
+class TensorRollOp;
+template <typename PaddingDimensions, typename XprType>
+class TensorPaddingOp;
+template <typename Shuffle, typename XprType>
+class TensorShufflingOp;
+template <typename Strides, typename XprType>
+class TensorStridingOp;
+template <typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+class TensorStridingSlicingOp;
+template <typename Strides, typename XprType>
+class TensorInflationOp;
+template <typename Generator, typename XprType>
+class TensorGeneratorOp;
+template <typename LeftXprType, typename RightXprType>
+class TensorAssignOp;
+template <typename Op, typename XprType>
+class TensorScanOp;
+template <typename Dims, typename XprType>
+class TensorTraceOp;
+
+template <typename CustomUnaryFunc, typename XprType>
+class TensorCustomUnaryOp;
+template <typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType>
+class TensorCustomBinaryOp;
+
+template <typename XprType, template <class> class MakePointer_ = MakePointer>
+class TensorEvalToOp;
+template <typename XprType>
+class TensorForcedEvalOp;
+
+template <typename ExpressionType, typename DeviceType>
+class TensorDevice;
+template <typename ExpressionType, typename DeviceType, typename DoneCallback>
+class TensorAsyncDevice;
+template <typename Derived, typename Device>
+struct TensorEvaluator;
+
+struct NoOpOutputKernel;
+
+struct DefaultDevice;
+struct ThreadPoolDevice;
+struct GpuDevice;
+struct SyclDevice;
+
+#ifdef EIGEN_USE_SYCL
+namespace TensorSycl {
+namespace internal {
+template <typename Evaluator, typename Op>
+class GenericNondeterministicReducer;
+}
+}  // namespace TensorSycl
+#endif
+
+enum FFTResultType { RealPart = 0, ImagPart = 1, BothParts = 2 };
+
+enum FFTDirection { FFT_FORWARD = 0, FFT_REVERSE = 1 };
+
+namespace internal {
+
+template <typename Device, typename Expression>
+struct IsVectorizable {
+  static const bool value = TensorEvaluator<Expression, Device>::PacketAccess;
+};
+
+template <typename Expression>
+struct IsVectorizable<GpuDevice, Expression> {
+  static const bool value =
+      TensorEvaluator<Expression, GpuDevice>::PacketAccess && TensorEvaluator<Expression, GpuDevice>::IsAligned;
+};
+
+// Tiled evaluation strategy.
+enum TiledEvaluation {
+  Off = 0,  // tiled evaluation is not supported
+  On = 1,   // still work in progress (see TensorBlock.h)
+};
+
+template <typename Device, typename Expression>
+struct IsTileable {
+  // Check that block evaluation is supported and it's a preferred option (at
+  // least one sub-expression has much faster block evaluation, e.g.
+  // broadcasting).
+  static constexpr bool BlockAccess =
+      TensorEvaluator<Expression, Device>::BlockAccess && TensorEvaluator<Expression, Device>::PreferBlockAccess;
+
+  static const TiledEvaluation value = BlockAccess ? TiledEvaluation::On : TiledEvaluation::Off;
+};
+
+template <typename Expression, typename Device, bool Vectorizable = IsVectorizable<Device, Expression>::value,
+          TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
+class TensorExecutor;
+
+template <typename Expression, typename Device, typename DoneCallback,
+          bool Vectorizable = IsVectorizable<Device, Expression>::value,
+          TiledEvaluation Tiling = IsTileable<Device, Expression>::value>
+class TensorAsyncExecutor;
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_FORWARD_DECLARATIONS_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
new file mode 100644
index 00000000..7a87594c
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -0,0 +1,422 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+/** \internal
+ * \brief Template functor to compute the modulo between an array and a scalar.
+ */
+template <typename Scalar>
+struct scalar_mod_op {
+  EIGEN_DEVICE_FUNC scalar_mod_op(const Scalar& divisor) : m_divisor(divisor) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a) const { return a % m_divisor; }
+  const Scalar m_divisor;
+};
+template <typename Scalar>
+struct functor_traits<scalar_mod_op<Scalar> > {
+  enum { Cost = scalar_div_cost<Scalar, false>::value, PacketAccess = false };
+};
+
+/** \internal
+ * \brief Template functor to compute the modulo between 2 arrays.
+ */
+template <typename Scalar>
+struct scalar_mod2_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const { return a % b; }
+};
+template <typename Scalar>
+struct functor_traits<scalar_mod2_op<Scalar> > {
+  enum { Cost = scalar_div_cost<Scalar, false>::value, PacketAccess = false };
+};
+
+template <typename Scalar>
+struct scalar_fmod_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar operator()(const Scalar& a, const Scalar& b) const {
+    return numext::fmod(a, b);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_fmod_op<Scalar> > {
+  enum {
+    Cost = 13,  // Reciprocal throughput of FPREM on Haswell.
+    PacketAccess = false
+  };
+};
+
+template <typename Reducer, typename Device>
+struct reducer_traits {
+  enum { Cost = 1, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true };
+};
+
+// Standard reduction functors
+template <typename T>
+struct SumReducer {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+    internal::scalar_sum_op<T> sum_op;
+    *accum = sum_op(*accum, t);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
+    (*accum) = padd<Packet>(*accum, p);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+    internal::scalar_cast_op<int, T> conv;
+    return conv(0);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+    return pset1<Packet>(initialize());
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+    return vaccum;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+    internal::scalar_sum_op<T> sum_op;
+    return sum_op(saccum, predux(vaccum));
+  }
+};
+
+template <typename T, typename Device>
+struct reducer_traits<SumReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasAdd,
+    IsStateful = false,
+    IsExactlyAssociative = NumTraits<T>::IsInteger
+  };
+};
+
+template <typename T>
+struct MeanReducer {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MeanReducer() : scalarCount_(0), packetCount_(0) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) {
+    internal::scalar_sum_op<T> sum_op;
+    *accum = sum_op(*accum, t);
+    scalarCount_++;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) {
+    (*accum) = padd<Packet>(*accum, p);
+    packetCount_++;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+    internal::scalar_cast_op<int, T> conv;
+    return conv(0);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+    return pset1<Packet>(initialize());
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const {
+    internal::scalar_quotient_op<T> quotient_op;
+    return quotient_op(accum, T(scalarCount_));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+    return pdiv(vaccum, pset1<Packet>(T(packetCount_)));
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+    internal::scalar_sum_op<T> sum_op;
+    internal::scalar_quotient_op<T> quotient_op;
+    return quotient_op(sum_op(saccum, predux(vaccum)), T(scalarCount_ + packetCount_ * unpacket_traits<Packet>::size));
+  }
+
+ protected:
+  DenseIndex scalarCount_;
+  DenseIndex packetCount_;
+};
+
+template <typename T, typename Device>
+struct reducer_traits<MeanReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasAdd && PacketType<T, Device>::HasDiv && !NumTraits<T>::IsInteger,
+    IsStateful = true,
+    IsExactlyAssociative = NumTraits<T>::IsInteger
+  };
+};
+
+template <typename T, bool IsMax = true, bool IsInteger = true>
+struct MinMaxBottomValue {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { return Eigen::NumTraits<T>::lowest(); }
+};
+template <typename T>
+struct MinMaxBottomValue<T, true, false> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { return -Eigen::NumTraits<T>::infinity(); }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, true> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { return Eigen::NumTraits<T>::highest(); }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, false> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { return Eigen::NumTraits<T>::infinity(); }
+};
+
+template <typename T, int NaNPropagation = PropagateFast>
+struct MaxReducer {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+    scalar_max_op<T, T, NaNPropagation> op;
+    *accum = op(t, *accum);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
+    scalar_max_op<T, T, NaNPropagation> op;
+    (*accum) = op.packetOp(*accum, p);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+    return MinMaxBottomValue<T, /*IsMax=*/true, Eigen::NumTraits<T>::IsInteger>::bottom_value();
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+    return pset1<Packet>(initialize());
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+    return vaccum;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+    scalar_max_op<T, T, NaNPropagation> op;
+    return op(saccum, op.predux(vaccum));
+  }
+};
+
+template <typename T, typename Device, int NaNPropagation>
+struct reducer_traits<MaxReducer<T, NaNPropagation>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasMax,
+    IsStateful = false,
+    IsExactlyAssociative = (NaNPropagation != PropagateFast)
+  };
+};
+
+template <typename T, int NaNPropagation = PropagateFast>
+struct MinReducer {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+    scalar_min_op<T, T, NaNPropagation> op;
+    *accum = op(t, *accum);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
+    scalar_min_op<T, T, NaNPropagation> op;
+    (*accum) = op.packetOp(*accum, p);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+    return MinMaxBottomValue<T, /*IsMax=*/false, Eigen::NumTraits<T>::IsInteger>::bottom_value();
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+    return pset1<Packet>(initialize());
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+    return vaccum;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+    scalar_min_op<T, T, NaNPropagation> op;
+    return op(saccum, op.predux(vaccum));
+  }
+};
+
+template <typename T, typename Device, int NaNPropagation>
+struct reducer_traits<MinReducer<T, NaNPropagation>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasMin,
+    IsStateful = false,
+    IsExactlyAssociative = (NaNPropagation != PropagateFast)
+  };
+};
+
+template <typename T>
+struct ProdReducer {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+    internal::scalar_product_op<T> prod_op;
+    (*accum) = prod_op(*accum, t);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
+    (*accum) = pmul<Packet>(*accum, p);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+    internal::scalar_cast_op<int, T> conv;
+    return conv(1);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
+    return pset1<Packet>(initialize());
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T accum) const { return accum; }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet finalizePacket(const Packet& vaccum) const {
+    return vaccum;
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalizeBoth(const T saccum, const Packet& vaccum) const {
+    internal::scalar_product_op<T> prod_op;
+    return prod_op(saccum, predux_mul(vaccum));
+  }
+};
+
+template <typename T, typename Device>
+struct reducer_traits<ProdReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::MulCost,
+    PacketAccess = PacketType<T, Device>::HasMul,
+    IsStateful = false,
+    IsExactlyAssociative = true
+  };
+};
+
+struct AndReducer {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { *accum = *accum && t; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { return true; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { return accum; }
+};
+
+template <typename Device>
+struct reducer_traits<AndReducer, Device> {
+  enum { Cost = 1, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true };
+};
+
+struct OrReducer {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(bool t, bool* accum) const { *accum = *accum || t; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool initialize() const { return false; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool finalize(bool accum) const { return accum; }
+};
+
+template <typename Device>
+struct reducer_traits<OrReducer, Device> {
+  enum { Cost = 1, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true };
+};
+
+// Argmin/Argmax reducers.  Returns the first occurrence if multiple locations
+// contain the same min/max value.
+template <typename T>
+struct ArgMaxPairReducer {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const {
+    if (t.second < accum->second) {
+      return;
+    } else if (t.second > accum->second || accum->first > t.first) {
+      *accum = t;
+    }
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+    return T(0, NumTraits<typename T::second_type>::lowest());
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { return accum; }
+};
+
+template <typename T, typename Device>
+struct reducer_traits<ArgMaxPairReducer<T>, Device> {
+  enum { Cost = NumTraits<T>::AddCost, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true };
+};
+
+template <typename T>
+struct ArgMinPairReducer {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T& t, T* accum) const {
+    if (t.second > accum->second) {
+      return;
+    } else if (t.second < accum->second || accum->first > t.first) {
+      *accum = t;
+    }
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
+    return T(0, NumTraits<typename T::second_type>::highest());
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T finalize(const T& accum) const { return accum; }
+};
+
+template <typename T, typename Device>
+struct reducer_traits<ArgMinPairReducer<T>, Device> {
+  enum { Cost = NumTraits<T>::AddCost, PacketAccess = false, IsStateful = false, IsExactlyAssociative = true };
+};
+
+template <typename T, typename Index, size_t NumDims>
+class GaussianGenerator {
+ public:
+  static constexpr bool PacketAccess = false;
+
+  EIGEN_DEVICE_FUNC GaussianGenerator(const array<T, NumDims>& means, const array<T, NumDims>& std_devs)
+      : m_means(means) {
+    EIGEN_UNROLL_LOOP
+    for (size_t i = 0; i < NumDims; ++i) {
+      m_two_sigmas[i] = std_devs[i] * std_devs[i] * 2;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const {
+    T tmp = T(0);
+    EIGEN_UNROLL_LOOP
+    for (size_t i = 0; i < NumDims; ++i) {
+      T offset = coordinates[i] - m_means[i];
+      tmp += offset * offset / m_two_sigmas[i];
+    }
+    return numext::exp(-tmp);
+  }
+
+ private:
+  array<T, NumDims> m_means;
+  array<T, NumDims> m_two_sigmas;
+};
+
+template <typename T, typename Index, size_t NumDims>
+struct functor_traits<GaussianGenerator<T, Index, NumDims> > {
+  enum {
+    Cost = NumDims *
+               (2 * NumTraits<T>::AddCost + NumTraits<T>::MulCost + functor_traits<scalar_quotient_op<T, T> >::Cost) +
+           functor_traits<scalar_exp_op<T> >::Cost,
+    PacketAccess = GaussianGenerator<T, Index, NumDims>::PacketAccess
+  };
+};
+
+template <typename Scalar>
+struct scalar_clamp_op {
+  EIGEN_DEVICE_FUNC inline scalar_clamp_op(const Scalar& _min, const Scalar& _max) : m_min(_min), m_max(_max) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    return numext::mini(numext::maxi(x, m_min), m_max);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& x) const {
+    return internal::pmin(internal::pmax(x, pset1<Packet>(m_min)), pset1<Packet>(m_max));
+  }
+  const Scalar m_min;
+  const Scalar m_max;
+};
+template <typename Scalar>
+struct functor_traits<scalar_clamp_op<Scalar> > {
+  enum {
+    Cost = 2 * NumTraits<Scalar>::AddCost,
+    PacketAccess = (packet_traits<Scalar>::HasMin && packet_traits<Scalar>::HasMax)
+  };
+};
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_FUNCTORS_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
new file mode 100644
index 00000000..2dac3379
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h
@@ -0,0 +1,269 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
+#define EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename Generator, typename XprType>
+struct traits<TensorGeneratorOp<Generator, XprType> > : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+};
+
+template <typename Generator, typename XprType>
+struct eval<TensorGeneratorOp<Generator, XprType>, Eigen::Dense> {
+  typedef const TensorGeneratorOp<Generator, XprType>& type;
+};
+
+template <typename Generator, typename XprType>
+struct nested<TensorGeneratorOp<Generator, XprType>, 1, typename eval<TensorGeneratorOp<Generator, XprType> >::type> {
+  typedef TensorGeneratorOp<Generator, XprType> type;
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor generator class.
+ */
+template <typename Generator, typename XprType>
+class TensorGeneratorOp : public TensorBase<TensorGeneratorOp<Generator, XprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorGeneratorOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorGeneratorOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorGeneratorOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorGeneratorOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorGeneratorOp(const XprType& expr, const Generator& generator)
+      : m_xpr(expr), m_generator(generator) {}
+
+  EIGEN_DEVICE_FUNC const Generator& generator() const { return m_generator; }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const Generator m_generator;
+};
+
+// Eval as rvalue
+template <typename Generator, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> {
+  typedef TensorGeneratorOp<Generator, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions Dimensions;
+  static constexpr int NumDims = internal::array_size<Dimensions>::value;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess = true,
+    PreferBlockAccess = true,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  typedef internal::TensorIntDivisor<Index> IndexDivisor;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims, Layout, Index> TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_device(device), m_generator(op.generator()) {
+    TensorEvaluator<ArgType, Device> argImpl(op.expression(), device);
+    m_dimensions = argImpl.dimensions();
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_strides[0] = 1;
+      EIGEN_UNROLL_LOOP
+      for (int i = 1; i < NumDims; ++i) {
+        m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
+        if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
+      }
+    } else {
+      m_strides[NumDims - 1] = 1;
+      EIGEN_UNROLL_LOOP
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
+        if (m_strides[i] != 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) { return true; }
+  EIGEN_STRONG_INLINE void cleanup() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    array<Index, NumDims> coords;
+    extract_coordinates(index, coords);
+    return m_generator(coords);
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
+    eigen_assert(index + packetSize - 1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[packetSize];
+    for (int i = 0; i < packetSize; ++i) {
+      values[i] = coeff(index + i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.firstLevelCacheSize();
+    // TODO(ezhulenev): Generator should have a cost.
+    return internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size);
+  }
+
+  struct BlockIteratorState {
+    Index stride;
+    Index span;
+    Index size;
+    Index count;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    static const bool is_col_major = static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+    // Compute spatial coordinates for the first block element.
+    array<Index, NumDims> coords;
+    extract_coordinates(desc.offset(), coords);
+    array<Index, NumDims> initial_coords = coords;
+
+    // Offset in the output block buffer.
+    Index offset = 0;
+
+    // Initialize output block iterator state. Dimension in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims> it;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = is_col_major ? i : NumDims - 1 - i;
+      it[i].size = desc.dimension(dim);
+      it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride);
+      it[i].span = it[i].stride * (it[i].size - 1);
+      it[i].count = 0;
+    }
+    eigen_assert(it[0].stride == 1);
+
+    // Prepare storage for the materialized generator result.
+    const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch);
+
+    CoeffReturnType* block_buffer = block_storage.data();
+
+    static const int packet_size = PacketType<CoeffReturnType, Device>::size;
+
+    static const int inner_dim = is_col_major ? 0 : NumDims - 1;
+    const Index inner_dim_size = it[0].size;
+    const Index inner_dim_vectorized = inner_dim_size - packet_size;
+
+    while (it[NumDims - 1].count < it[NumDims - 1].size) {
+      Index i = 0;
+      // Generate data for the vectorized part of the inner-most dimension.
+      for (; i <= inner_dim_vectorized; i += packet_size) {
+        for (Index j = 0; j < packet_size; ++j) {
+          array<Index, NumDims> j_coords = coords;  // Break loop dependence.
+          j_coords[inner_dim] += j;
+          *(block_buffer + offset + i + j) = m_generator(j_coords);
+        }
+        coords[inner_dim] += packet_size;
+      }
+      // Finalize non-vectorized part of the inner-most dimension.
+      for (; i < inner_dim_size; ++i) {
+        *(block_buffer + offset + i) = m_generator(coords);
+        coords[inner_dim]++;
+      }
+      coords[inner_dim] = initial_coords[inner_dim];
+
+      // For the 1d tensor we need to generate only one inner-most dimension.
+      if (NumDims == 1) break;
+
+      // Update offset.
+      for (i = 1; i < NumDims; ++i) {
+        if (++it[i].count < it[i].size) {
+          offset += it[i].stride;
+          coords[is_col_major ? i : NumDims - 1 - i]++;
+          break;
+        }
+        if (i != NumDims - 1) it[i].count = 0;
+        coords[is_col_major ? i : NumDims - 1 - i] = initial_coords[is_col_major ? i : NumDims - 1 - i];
+        offset -= it[i].span;
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
+    // TODO(rmlarsen): This is just a placeholder. Define interface to make
+    // generators return their cost.
+    return TensorOpCost(0, 0, TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>());
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_coordinates(Index index, array<Index, NumDims>& coords) const {
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = index / m_fast_strides[i];
+        index -= idx * m_strides[i];
+        coords[i] = idx;
+      }
+      coords[0] = index;
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = index / m_fast_strides[i];
+        index -= idx * m_strides[i];
+        coords[i] = idx;
+      }
+      coords[NumDims - 1] = index;
+    }
+  }
+
+  const Device EIGEN_DEVICE_REF m_device;
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_strides;
+  array<IndexDivisor, NumDims> m_fast_strides;
+  Generator m_generator;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_GENERATOR_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
new file mode 100644
index 00000000..6a1240cf
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
@@ -0,0 +1,36 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given tensors.
+ *
+ * This function computes the regularized incomplete beta function (integral).
+ *
+ */
+template <typename ADerived, typename BDerived, typename XDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseTernaryOp<internal::scalar_betainc_op<typename XDerived::Scalar>,
+                                                                 const ADerived, const BDerived, const XDerived>
+betainc(const Eigen::TensorBase<ADerived, ReadOnlyAccessors>& a,
+        const Eigen::TensorBase<BDerived, ReadOnlyAccessors>& b,
+        const Eigen::TensorBase<XDerived, ReadOnlyAccessors>& x) {
+  return TensorCwiseTernaryOp<internal::scalar_betainc_op<typename XDerived::Scalar>, const ADerived, const BDerived,
+                              const XDerived>(a.derived(), b.derived(), x.derived(),
+                                              internal::scalar_betainc_op<typename XDerived::Scalar>());
+}
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
new file mode 100644
index 00000000..0bdb1ab7
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
@@ -0,0 +1,413 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_IO_H
+#define EIGEN_CXX11_TENSOR_TENSOR_IO_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+struct TensorIOFormat;
+
+namespace internal {
+template <typename Tensor, std::size_t rank, typename Format, typename EnableIf = void>
+struct TensorPrinter;
+}
+
+template <typename Derived_>
+struct TensorIOFormatBase {
+  using Derived = Derived_;
+  TensorIOFormatBase(const std::vector<std::string>& separator, const std::vector<std::string>& prefix,
+                     const std::vector<std::string>& suffix, int precision = StreamPrecision, int flags = 0,
+                     const std::string& tenPrefix = "", const std::string& tenSuffix = "", const char fill = ' ')
+      : tenPrefix(tenPrefix),
+        tenSuffix(tenSuffix),
+        prefix(prefix),
+        suffix(suffix),
+        separator(separator),
+        fill(fill),
+        precision(precision),
+        flags(flags) {
+    init_spacer();
+  }
+
+  void init_spacer() {
+    if ((flags & DontAlignCols)) return;
+    spacer.resize(prefix.size());
+    spacer[0] = "";
+    int i = int(tenPrefix.length()) - 1;
+    while (i >= 0 && tenPrefix[i] != '\n') {
+      spacer[0] += ' ';
+      i--;
+    }
+
+    for (std::size_t k = 1; k < prefix.size(); k++) {
+      int j = int(prefix[k].length()) - 1;
+      while (j >= 0 && prefix[k][j] != '\n') {
+        spacer[k] += ' ';
+        j--;
+      }
+    }
+  }
+
+  std::string tenPrefix;
+  std::string tenSuffix;
+  std::vector<std::string> prefix;
+  std::vector<std::string> suffix;
+  std::vector<std::string> separator;
+  char fill;
+  int precision;
+  int flags;
+  std::vector<std::string> spacer{};
+};
+
+struct TensorIOFormatNumpy : public TensorIOFormatBase<TensorIOFormatNumpy> {
+  using Base = TensorIOFormatBase<TensorIOFormatNumpy>;
+  TensorIOFormatNumpy()
+      : Base(/*separator=*/{" ", "\n"}, /*prefix=*/{"", "["}, /*suffix=*/{"", "]"}, /*precision=*/StreamPrecision,
+             /*flags=*/0, /*tenPrefix=*/"[", /*tenSuffix=*/"]") {}
+};
+
+struct TensorIOFormatNative : public TensorIOFormatBase<TensorIOFormatNative> {
+  using Base = TensorIOFormatBase<TensorIOFormatNative>;
+  TensorIOFormatNative()
+      : Base(/*separator=*/{", ", ",\n", "\n"}, /*prefix=*/{"", "{"}, /*suffix=*/{"", "}"},
+             /*precision=*/StreamPrecision, /*flags=*/0, /*tenPrefix=*/"{", /*tenSuffix=*/"}") {}
+};
+
+struct TensorIOFormatPlain : public TensorIOFormatBase<TensorIOFormatPlain> {
+  using Base = TensorIOFormatBase<TensorIOFormatPlain>;
+  TensorIOFormatPlain()
+      : Base(/*separator=*/{" ", "\n", "\n", ""}, /*prefix=*/{""}, /*suffix=*/{""}, /*precision=*/StreamPrecision,
+             /*flags=*/0, /*tenPrefix=*/"", /*tenSuffix=*/"") {}
+};
+
+struct TensorIOFormatLegacy : public TensorIOFormatBase<TensorIOFormatLegacy> {
+  using Base = TensorIOFormatBase<TensorIOFormatLegacy>;
+  TensorIOFormatLegacy()
+      : Base(/*separator=*/{", ", "\n"}, /*prefix=*/{"", "["}, /*suffix=*/{"", "]"}, /*precision=*/StreamPrecision,
+             /*flags=*/0, /*tenPrefix=*/"", /*tenSuffix=*/"") {}
+};
+
+struct TensorIOFormat : public TensorIOFormatBase<TensorIOFormat> {
+  using Base = TensorIOFormatBase<TensorIOFormat>;
+  TensorIOFormat(const std::vector<std::string>& separator, const std::vector<std::string>& prefix,
+                 const std::vector<std::string>& suffix, int precision = StreamPrecision, int flags = 0,
+                 const std::string& tenPrefix = "", const std::string& tenSuffix = "", const char fill = ' ')
+      : Base(separator, prefix, suffix, precision, flags, tenPrefix, tenSuffix, fill) {}
+
+  static inline const TensorIOFormatNumpy Numpy() { return TensorIOFormatNumpy{}; }
+
+  static inline const TensorIOFormatPlain Plain() { return TensorIOFormatPlain{}; }
+
+  static inline const TensorIOFormatNative Native() { return TensorIOFormatNative{}; }
+
+  static inline const TensorIOFormatLegacy Legacy() { return TensorIOFormatLegacy{}; }
+};
+
+template <typename T, int Layout, int rank, typename Format>
+class TensorWithFormat;
+// specialize for Layout=ColMajor, Layout=RowMajor and rank=0.
+template <typename T, int rank, typename Format>
+class TensorWithFormat<T, RowMajor, rank, Format> {
+ public:
+  TensorWithFormat(const T& tensor, const Format& format) : t_tensor(tensor), t_format(format) {}
+
+  friend std::ostream& operator<<(std::ostream& os, const TensorWithFormat<T, RowMajor, rank, Format>& wf) {
+    // Evaluate the expression if needed
+    typedef TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator;
+    TensorForcedEvalOp<const T> eval = wf.t_tensor.eval();
+    Evaluator tensor(eval, DefaultDevice());
+    tensor.evalSubExprsIfNeeded(NULL);
+    internal::TensorPrinter<Evaluator, rank, Format>::run(os, tensor, wf.t_format);
+    // Cleanup.
+    tensor.cleanup();
+    return os;
+  }
+
+ protected:
+  T t_tensor;
+  Format t_format;
+};
+
+template <typename T, int rank, typename Format>
+class TensorWithFormat<T, ColMajor, rank, Format> {
+ public:
+  TensorWithFormat(const T& tensor, const Format& format) : t_tensor(tensor), t_format(format) {}
+
+  friend std::ostream& operator<<(std::ostream& os, const TensorWithFormat<T, ColMajor, rank, Format>& wf) {
+    // Switch to RowMajor storage and print afterwards
+    typedef typename T::Index IndexType;
+    std::array<IndexType, rank> shuffle;
+    std::array<IndexType, rank> id;
+    std::iota(id.begin(), id.end(), IndexType(0));
+    std::copy(id.begin(), id.end(), shuffle.rbegin());
+    auto tensor_row_major = wf.t_tensor.swap_layout().shuffle(shuffle);
+
+    // Evaluate the expression if needed
+    typedef TensorEvaluator<const TensorForcedEvalOp<const decltype(tensor_row_major)>, DefaultDevice> Evaluator;
+    TensorForcedEvalOp<const decltype(tensor_row_major)> eval = tensor_row_major.eval();
+    Evaluator tensor(eval, DefaultDevice());
+    tensor.evalSubExprsIfNeeded(NULL);
+    internal::TensorPrinter<Evaluator, rank, Format>::run(os, tensor, wf.t_format);
+    // Cleanup.
+    tensor.cleanup();
+    return os;
+  }
+
+ protected:
+  T t_tensor;
+  Format t_format;
+};
+
+template <typename T, typename Format>
+class TensorWithFormat<T, ColMajor, 0, Format> {
+ public:
+  TensorWithFormat(const T& tensor, const Format& format) : t_tensor(tensor), t_format(format) {}
+
+  friend std::ostream& operator<<(std::ostream& os, const TensorWithFormat<T, ColMajor, 0, Format>& wf) {
+    // Evaluate the expression if needed
+    typedef TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator;
+    TensorForcedEvalOp<const T> eval = wf.t_tensor.eval();
+    Evaluator tensor(eval, DefaultDevice());
+    tensor.evalSubExprsIfNeeded(NULL);
+    internal::TensorPrinter<Evaluator, 0, Format>::run(os, tensor, wf.t_format);
+    // Cleanup.
+    tensor.cleanup();
+    return os;
+  }
+
+ protected:
+  T t_tensor;
+  Format t_format;
+};
+
+namespace internal {
+
+// Default scalar printer.
+template <typename Scalar, typename Format, typename EnableIf = void>
+struct ScalarPrinter {
+  static void run(std::ostream& stream, const Scalar& scalar, const Format&) { stream << scalar; }
+};
+
+template <typename Scalar>
+struct ScalarPrinter<Scalar, TensorIOFormatNumpy, std::enable_if_t<NumTraits<Scalar>::IsComplex>> {
+  static void run(std::ostream& stream, const Scalar& scalar, const TensorIOFormatNumpy&) {
+    stream << numext::real(scalar) << "+" << numext::imag(scalar) << "j";
+  }
+};
+
+template <typename Scalar>
+struct ScalarPrinter<Scalar, TensorIOFormatNative, std::enable_if_t<NumTraits<Scalar>::IsComplex>> {
+  static void run(std::ostream& stream, const Scalar& scalar, const TensorIOFormatNative&) {
+    stream << "{" << numext::real(scalar) << ", " << numext::imag(scalar) << "}";
+  }
+};
+
+template <typename Tensor, std::size_t rank, typename Format, typename EnableIf>
+struct TensorPrinter {
+  using Scalar = std::remove_const_t<typename Tensor::Scalar>;
+
+  static void run(std::ostream& s, const Tensor& tensor, const Format& fmt) {
+    typedef typename Tensor::Index IndexType;
+
+    eigen_assert(Tensor::Layout == RowMajor);
+    typedef std::conditional_t<is_same<Scalar, char>::value || is_same<Scalar, unsigned char>::value ||
+                                   is_same<Scalar, numext::int8_t>::value || is_same<Scalar, numext::uint8_t>::value,
+                               int,
+                               std::conditional_t<is_same<Scalar, std::complex<char>>::value ||
+                                                      is_same<Scalar, std::complex<unsigned char>>::value ||
+                                                      is_same<Scalar, std::complex<numext::int8_t>>::value ||
+                                                      is_same<Scalar, std::complex<numext::uint8_t>>::value,
+                                                  std::complex<int>, const Scalar&>>
+        PrintType;
+
+    const IndexType total_size = array_prod(tensor.dimensions());
+
+    std::streamsize explicit_precision;
+    if (fmt.precision == StreamPrecision) {
+      explicit_precision = 0;
+    } else if (fmt.precision == FullPrecision) {
+      if (NumTraits<Scalar>::IsInteger) {
+        explicit_precision = 0;
+      } else {
+        explicit_precision = significant_decimals_impl<Scalar>::run();
+      }
+    } else {
+      explicit_precision = fmt.precision;
+    }
+
+    std::streamsize old_precision = 0;
+    if (explicit_precision) old_precision = s.precision(explicit_precision);
+
+    IndexType width = 0;
+    bool align_cols = !(fmt.flags & DontAlignCols);
+    if (align_cols) {
+      // compute the largest width
+      for (IndexType i = 0; i < total_size; i++) {
+        std::stringstream sstr;
+        sstr.copyfmt(s);
+        ScalarPrinter<Scalar, Format>::run(sstr, static_cast<PrintType>(tensor.data()[i]), fmt);
+        width = std::max<IndexType>(width, IndexType(sstr.str().length()));
+      }
+    }
+    s << fmt.tenPrefix;
+    for (IndexType i = 0; i < total_size; i++) {
+      std::array<bool, rank> is_at_end{};
+      std::array<bool, rank> is_at_begin{};
+
+      // is the ith element the end of an coeff (always true), of a row, of a matrix, ...?
+      for (std::size_t k = 0; k < rank; k++) {
+        if ((i + 1) % (std::accumulate(tensor.dimensions().rbegin(), tensor.dimensions().rbegin() + k, 1,
+                                       std::multiplies<IndexType>())) ==
+            0) {
+          is_at_end[k] = true;
+        }
+      }
+
+      // is the ith element the begin of an coeff (always true), of a row, of a matrix, ...?
+      for (std::size_t k = 0; k < rank; k++) {
+        if (i % (std::accumulate(tensor.dimensions().rbegin(), tensor.dimensions().rbegin() + k, 1,
+                                 std::multiplies<IndexType>())) ==
+            0) {
+          is_at_begin[k] = true;
+        }
+      }
+
+      // do we have a line break?
+      bool is_at_begin_after_newline = false;
+      for (std::size_t k = 0; k < rank; k++) {
+        if (is_at_begin[k]) {
+          std::size_t separator_index = (k < fmt.separator.size()) ? k : fmt.separator.size() - 1;
+          if (fmt.separator[separator_index].find('\n') != std::string::npos) {
+            is_at_begin_after_newline = true;
+          }
+        }
+      }
+
+      bool is_at_end_before_newline = false;
+      for (std::size_t k = 0; k < rank; k++) {
+        if (is_at_end[k]) {
+          std::size_t separator_index = (k < fmt.separator.size()) ? k : fmt.separator.size() - 1;
+          if (fmt.separator[separator_index].find('\n') != std::string::npos) {
+            is_at_end_before_newline = true;
+          }
+        }
+      }
+
+      std::stringstream suffix, prefix, separator;
+      for (std::size_t k = 0; k < rank; k++) {
+        std::size_t suffix_index = (k < fmt.suffix.size()) ? k : fmt.suffix.size() - 1;
+        if (is_at_end[k]) {
+          suffix << fmt.suffix[suffix_index];
+        }
+      }
+      for (std::size_t k = 0; k < rank; k++) {
+        std::size_t separator_index = (k < fmt.separator.size()) ? k : fmt.separator.size() - 1;
+        if (is_at_end[k] &&
+            (!is_at_end_before_newline || fmt.separator[separator_index].find('\n') != std::string::npos)) {
+          separator << fmt.separator[separator_index];
+        }
+      }
+      for (std::size_t k = 0; k < rank; k++) {
+        std::size_t spacer_index = (k < fmt.spacer.size()) ? k : fmt.spacer.size() - 1;
+        if (i != 0 && is_at_begin_after_newline && (!is_at_begin[k] || k == 0)) {
+          prefix << fmt.spacer[spacer_index];
+        }
+      }
+      for (int k = rank - 1; k >= 0; k--) {
+        std::size_t prefix_index = (static_cast<std::size_t>(k) < fmt.prefix.size()) ? k : fmt.prefix.size() - 1;
+        if (is_at_begin[k]) {
+          prefix << fmt.prefix[prefix_index];
+        }
+      }
+
+      s << prefix.str();
+      // So we don't mess around with formatting, output scalar to a string stream, and adjust the width/fill manually.
+      std::stringstream sstr;
+      sstr.copyfmt(s);
+      ScalarPrinter<Scalar, Format>::run(sstr, static_cast<PrintType>(tensor.data()[i]), fmt);
+      std::string scalar_str = sstr.str();
+      IndexType scalar_width = scalar_str.length();
+      if (width && scalar_width < width) {
+        std::string filler;
+        for (IndexType j = scalar_width; j < width; ++j) {
+          filler.push_back(fmt.fill);
+        }
+        s << filler;
+      }
+      s << scalar_str;
+      s << suffix.str();
+      if (i < total_size - 1) {
+        s << separator.str();
+      }
+    }
+    s << fmt.tenSuffix;
+    if (explicit_precision) s.precision(old_precision);
+  }
+};
+
+template <typename Tensor, std::size_t rank>
+struct TensorPrinter<Tensor, rank, TensorIOFormatLegacy, std::enable_if_t<rank != 0>> {
+  using Format = TensorIOFormatLegacy;
+  using Scalar = std::remove_const_t<typename Tensor::Scalar>;
+
+  static void run(std::ostream& s, const Tensor& tensor, const Format&) {
+    typedef typename Tensor::Index IndexType;
+    // backwards compatibility case: print tensor after reshaping to matrix of size dim(0) x
+    // (dim(1)*dim(2)*...*dim(rank-1)).
+    const IndexType total_size = internal::array_prod(tensor.dimensions());
+    if (total_size > 0) {
+      const IndexType first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
+      Map<const Array<Scalar, Dynamic, Dynamic, Tensor::Layout>> matrix(tensor.data(), first_dim,
+                                                                        total_size / first_dim);
+      s << matrix;
+      return;
+    }
+  }
+};
+
+template <typename Tensor, typename Format>
+struct TensorPrinter<Tensor, 0, Format> {
+  static void run(std::ostream& s, const Tensor& tensor, const Format& fmt) {
+    using Scalar = std::remove_const_t<typename Tensor::Scalar>;
+
+    std::streamsize explicit_precision;
+    if (fmt.precision == StreamPrecision) {
+      explicit_precision = 0;
+    } else if (fmt.precision == FullPrecision) {
+      if (NumTraits<Scalar>::IsInteger) {
+        explicit_precision = 0;
+      } else {
+        explicit_precision = significant_decimals_impl<Scalar>::run();
+      }
+    } else {
+      explicit_precision = fmt.precision;
+    }
+
+    std::streamsize old_precision = 0;
+    if (explicit_precision) old_precision = s.precision(explicit_precision);
+    s << fmt.tenPrefix;
+    ScalarPrinter<Scalar, Format>::run(s, tensor.coeff(0), fmt);
+    s << fmt.tenSuffix;
+    if (explicit_precision) s.precision(old_precision);
+  }
+};
+
+}  // end namespace internal
+template <typename T>
+std::ostream& operator<<(std::ostream& s, const TensorBase<T, ReadOnlyAccessors>& t) {
+  s << t.format(TensorIOFormat::Plain());
+  return s;
+}
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_IO_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
new file mode 100644
index 00000000..8bd1c43d
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h
@@ -0,0 +1,590 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
+#define EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct traits<TensorImagePatchOp<Rows, Cols, XprType> > : public traits<XprType> {
+  typedef std::remove_const_t<typename XprType::Scalar> Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions + 1;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+};
+
+template <DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct eval<TensorImagePatchOp<Rows, Cols, XprType>, Eigen::Dense> {
+  typedef const TensorImagePatchOp<Rows, Cols, XprType>& type;
+};
+
+template <DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct nested<TensorImagePatchOp<Rows, Cols, XprType>, 1,
+              typename eval<TensorImagePatchOp<Rows, Cols, XprType> >::type> {
+  typedef TensorImagePatchOp<Rows, Cols, XprType> type;
+};
+
+template <typename Self, bool Vectorizable>
+struct ImagePatchCopyOp {
+  typedef typename Self::Index Index;
+  typedef typename Self::Scalar Scalar;
+  typedef typename Self::Impl Impl;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Self& self, const Index num_coeff_to_copy,
+                                                        const Index dst_index, Scalar* dst_data,
+                                                        const Index src_index) {
+    const Impl& impl = self.impl();
+    for (Index i = 0; i < num_coeff_to_copy; ++i) {
+      dst_data[dst_index + i] = impl.coeff(src_index + i);
+    }
+  }
+};
+
+template <typename Self>
+struct ImagePatchCopyOp<Self, true> {
+  typedef typename Self::Index Index;
+  typedef typename Self::Scalar Scalar;
+  typedef typename Self::Impl Impl;
+  typedef typename packet_traits<Scalar>::type Packet;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Self& self, const Index num_coeff_to_copy,
+                                                        const Index dst_index, Scalar* dst_data,
+                                                        const Index src_index) {
+    const Impl& impl = self.impl();
+    const Index packet_size = internal::unpacket_traits<Packet>::size;
+    const Index vectorized_size = (num_coeff_to_copy / packet_size) * packet_size;
+    for (Index i = 0; i < vectorized_size; i += packet_size) {
+      Packet p = impl.template packet<Unaligned>(src_index + i);
+      internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i, p);
+    }
+    for (Index i = vectorized_size; i < num_coeff_to_copy; ++i) {
+      dst_data[dst_index + i] = impl.coeff(src_index + i);
+    }
+  }
+};
+
+template <typename Self>
+struct ImagePatchPaddingOp {
+  typedef typename Self::Index Index;
+  typedef typename Self::Scalar Scalar;
+  typedef typename packet_traits<Scalar>::type Packet;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Index num_coeff_to_pad, const Scalar padding_value,
+                                                        const Index dst_index, Scalar* dst_data) {
+    const Index packet_size = internal::unpacket_traits<Packet>::size;
+    const Packet padded_packet = internal::pset1<Packet>(padding_value);
+    const Index vectorized_size = (num_coeff_to_pad / packet_size) * packet_size;
+    for (Index i = 0; i < vectorized_size; i += packet_size) {
+      internal::pstoret<Scalar, Packet, Unaligned>(dst_data + dst_index + i, padded_packet);
+    }
+    for (Index i = vectorized_size; i < num_coeff_to_pad; ++i) {
+      dst_data[dst_index + i] = padding_value;
+    }
+  }
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Patch extraction specialized for image processing.
+ * This assumes that the input has a least 3 dimensions ordered as follow:
+ *  1st dimension: channels (of size d)
+ *  2nd dimension: rows (of size r)
+ *  3rd dimension: columns (of size c)
+ *  There can be additional dimensions such as time (for video) or batch (for
+ * bulk processing after the first 3.
+ * Calling the image patch code with patch_rows and patch_cols is equivalent
+ * to calling the regular patch extraction code with parameters d, patch_rows,
+ * patch_cols, and 1 for all the additional dimensions.
+ */
+template <DenseIndex Rows, DenseIndex Cols, typename XprType>
+class TensorImagePatchOp : public TensorBase<TensorImagePatchOp<Rows, Cols, XprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorImagePatchOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorImagePatchOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorImagePatchOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorImagePatchOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows,
+                                                           DenseIndex patch_cols, DenseIndex row_strides,
+                                                           DenseIndex col_strides, DenseIndex in_row_strides,
+                                                           DenseIndex in_col_strides, DenseIndex row_inflate_strides,
+                                                           DenseIndex col_inflate_strides, PaddingType padding_type,
+                                                           Scalar padding_value)
+      : m_xpr(expr),
+        m_patch_rows(patch_rows),
+        m_patch_cols(patch_cols),
+        m_row_strides(row_strides),
+        m_col_strides(col_strides),
+        m_in_row_strides(in_row_strides),
+        m_in_col_strides(in_col_strides),
+        m_row_inflate_strides(row_inflate_strides),
+        m_col_inflate_strides(col_inflate_strides),
+        m_padding_explicit(false),
+        m_padding_top(0),
+        m_padding_bottom(0),
+        m_padding_left(0),
+        m_padding_right(0),
+        m_padding_type(padding_type),
+        m_padding_value(padding_value) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorImagePatchOp(const XprType& expr, DenseIndex patch_rows,
+                                                           DenseIndex patch_cols, DenseIndex row_strides,
+                                                           DenseIndex col_strides, DenseIndex in_row_strides,
+                                                           DenseIndex in_col_strides, DenseIndex row_inflate_strides,
+                                                           DenseIndex col_inflate_strides, DenseIndex padding_top,
+                                                           DenseIndex padding_bottom, DenseIndex padding_left,
+                                                           DenseIndex padding_right, Scalar padding_value)
+      : m_xpr(expr),
+        m_patch_rows(patch_rows),
+        m_patch_cols(patch_cols),
+        m_row_strides(row_strides),
+        m_col_strides(col_strides),
+        m_in_row_strides(in_row_strides),
+        m_in_col_strides(in_col_strides),
+        m_row_inflate_strides(row_inflate_strides),
+        m_col_inflate_strides(col_inflate_strides),
+        m_padding_explicit(true),
+        m_padding_top(padding_top),
+        m_padding_bottom(padding_bottom),
+        m_padding_left(padding_left),
+        m_padding_right(padding_right),
+        m_padding_type(PADDING_VALID),
+        m_padding_value(padding_value) {}
+
+  EIGEN_DEVICE_FUNC DenseIndex patch_rows() const { return m_patch_rows; }
+  EIGEN_DEVICE_FUNC DenseIndex patch_cols() const { return m_patch_cols; }
+  EIGEN_DEVICE_FUNC DenseIndex row_strides() const { return m_row_strides; }
+  EIGEN_DEVICE_FUNC DenseIndex col_strides() const { return m_col_strides; }
+  EIGEN_DEVICE_FUNC DenseIndex in_row_strides() const { return m_in_row_strides; }
+  EIGEN_DEVICE_FUNC DenseIndex in_col_strides() const { return m_in_col_strides; }
+  EIGEN_DEVICE_FUNC DenseIndex row_inflate_strides() const { return m_row_inflate_strides; }
+  EIGEN_DEVICE_FUNC DenseIndex col_inflate_strides() const { return m_col_inflate_strides; }
+  EIGEN_DEVICE_FUNC bool padding_explicit() const { return m_padding_explicit; }
+  EIGEN_DEVICE_FUNC DenseIndex padding_top() const { return m_padding_top; }
+  EIGEN_DEVICE_FUNC DenseIndex padding_bottom() const { return m_padding_bottom; }
+  EIGEN_DEVICE_FUNC DenseIndex padding_left() const { return m_padding_left; }
+  EIGEN_DEVICE_FUNC DenseIndex padding_right() const { return m_padding_right; }
+  EIGEN_DEVICE_FUNC PaddingType padding_type() const { return m_padding_type; }
+  EIGEN_DEVICE_FUNC Scalar padding_value() const { return m_padding_value; }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const DenseIndex m_patch_rows;
+  const DenseIndex m_patch_cols;
+  const DenseIndex m_row_strides;
+  const DenseIndex m_col_strides;
+  const DenseIndex m_in_row_strides;
+  const DenseIndex m_in_col_strides;
+  const DenseIndex m_row_inflate_strides;
+  const DenseIndex m_col_inflate_strides;
+  const bool m_padding_explicit;
+  const DenseIndex m_padding_top;
+  const DenseIndex m_padding_bottom;
+  const DenseIndex m_padding_left;
+  const DenseIndex m_padding_right;
+  const PaddingType m_padding_type;
+  const Scalar m_padding_value;
+};
+
+// Eval as rvalue
+template <DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> {
+  typedef TensorImagePatchOp<Rows, Cols, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static constexpr int NumInputDims =
+      internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  static constexpr int NumDims = NumInputDims + 1;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef std::remove_const_t<typename XprType::Scalar> Scalar;
+  typedef TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> Self;
+  typedef TensorEvaluator<ArgType, Device> Impl;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = true,
+    CoordAccess = false,
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_device(device), m_impl(op.expression(), device) {
+    EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    m_paddingValue = op.padding_value();
+
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+
+    // Caches a few variables.
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_inputDepth = input_dims[0];
+      m_inputRows = input_dims[1];
+      m_inputCols = input_dims[2];
+    } else {
+      m_inputDepth = input_dims[NumInputDims - 1];
+      m_inputRows = input_dims[NumInputDims - 2];
+      m_inputCols = input_dims[NumInputDims - 3];
+    }
+
+    m_row_strides = op.row_strides();
+    m_col_strides = op.col_strides();
+
+    // Input strides and effective input/patch size
+    m_in_row_strides = op.in_row_strides();
+    m_in_col_strides = op.in_col_strides();
+    m_row_inflate_strides = op.row_inflate_strides();
+    m_col_inflate_strides = op.col_inflate_strides();
+    // The "effective" input rows and input cols are the input rows and cols
+    // after inflating them with zeros.
+    // For examples, a 2x3 matrix with row_inflate_strides and
+    // col_inflate_strides of 2 comes from:
+    //   A B C
+    //   D E F
+    //
+    // to a matrix is 3 x 5:
+    //
+    //   A . B . C
+    //   . . . . .
+    //   D . E . F
+
+    m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1;
+    m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1;
+    m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1);
+    m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1);
+
+    if (op.padding_explicit()) {
+      m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) /
+                                  static_cast<float>(m_row_strides));
+      m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) /
+                                  static_cast<float>(m_col_strides));
+      m_rowPaddingTop = op.padding_top();
+      m_colPaddingLeft = op.padding_left();
+    } else {
+      // Computing padding from the type
+      switch (op.padding_type()) {
+        case PADDING_VALID:
+          m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
+          m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
+          // Calculate the padding
+          m_rowPaddingTop =
+              numext::maxi<Index>(0, ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2);
+          m_colPaddingLeft =
+              numext::maxi<Index>(0, ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2);
+          break;
+        case PADDING_SAME:
+          m_outputRows = numext::ceil(m_input_rows_eff / static_cast<float>(m_row_strides));
+          m_outputCols = numext::ceil(m_input_cols_eff / static_cast<float>(m_col_strides));
+          // Calculate the padding
+          m_rowPaddingTop = ((m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff) / 2;
+          m_colPaddingLeft = ((m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff) / 2;
+          // The padding size calculation for PADDING_SAME has been updated to
+          // be consistent with how TensorFlow extracts its paddings.
+          m_rowPaddingTop = numext::maxi<Index>(0, m_rowPaddingTop);
+          m_colPaddingLeft = numext::maxi<Index>(0, m_colPaddingLeft);
+          break;
+        default:
+          eigen_assert(false && "unexpected padding");
+          m_outputCols = 0;  // silence the uninitialised warning;
+          m_outputRows = 0;  //// silence the uninitialised warning;
+      }
+    }
+    eigen_assert(m_outputRows > 0);
+    eigen_assert(m_outputCols > 0);
+
+    // Dimensions for result of extraction.
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      // ColMajor
+      // 0: depth
+      // 1: patch_rows
+      // 2: patch_cols
+      // 3: number of patches
+      // 4 and beyond: anything else (such as batch).
+      m_dimensions[0] = input_dims[0];
+      m_dimensions[1] = op.patch_rows();
+      m_dimensions[2] = op.patch_cols();
+      m_dimensions[3] = m_outputRows * m_outputCols;
+      for (int i = 4; i < NumDims; ++i) {
+        m_dimensions[i] = input_dims[i - 1];
+      }
+    } else {
+      // RowMajor
+      // NumDims-1: depth
+      // NumDims-2: patch_rows
+      // NumDims-3: patch_cols
+      // NumDims-4: number of patches
+      // NumDims-5 and beyond: anything else (such as batch).
+      m_dimensions[NumDims - 1] = input_dims[NumInputDims - 1];
+      m_dimensions[NumDims - 2] = op.patch_rows();
+      m_dimensions[NumDims - 3] = op.patch_cols();
+      m_dimensions[NumDims - 4] = m_outputRows * m_outputCols;
+      for (int i = NumDims - 5; i >= 0; --i) {
+        m_dimensions[i] = input_dims[i];
+      }
+    }
+
+    // Strides for moving the patch in various dimensions.
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_colStride = m_dimensions[1];
+      m_patchStride = m_colStride * m_dimensions[2] * m_dimensions[0];
+      m_otherStride = m_patchStride * m_dimensions[3];
+    } else {
+      m_colStride = m_dimensions[NumDims - 2];
+      m_patchStride = m_colStride * m_dimensions[NumDims - 3] * m_dimensions[NumDims - 1];
+      m_otherStride = m_patchStride * m_dimensions[NumDims - 4];
+    }
+
+    // Strides for navigating through the input tensor.
+    m_rowInputStride = m_inputDepth;
+    m_colInputStride = m_inputDepth * m_inputRows;
+    m_patchInputStride = m_inputDepth * m_inputRows * m_inputCols;
+
+    // Fast representations of different variables.
+    m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
+    m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
+    m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
+    m_fastInflateRowStride = internal::TensorIntDivisor<Index>(m_row_inflate_strides);
+    m_fastInflateColStride = internal::TensorIntDivisor<Index>(m_col_inflate_strides);
+    m_fastInputColsEff = internal::TensorIntDivisor<Index>(m_input_cols_eff);
+
+    // Number of patches in the width dimension.
+    m_fastOutputRows = internal::TensorIntDivisor<Index>(m_outputRows);
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[0]);
+    } else {
+      m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims - 1]);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    // Patch index corresponding to the passed in index.
+    const Index patchIndex = index / m_fastPatchStride;
+    // Find the offset of the element wrt the location of the first element.
+    const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth;
+
+    // Other ways to index this element.
+    const Index otherIndex = (NumDims == 4) ? 0 : index / m_fastOtherStride;
+    const Index patch2DIndex = (NumDims == 4) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride;
+
+    // Calculate col index in the input original tensor.
+    const Index colIndex = patch2DIndex / m_fastOutputRows;
+    const Index colOffset = patchOffset / m_fastColStride;
+    const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft;
+    const Index origInputCol =
+        (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInflateColStride) : 0);
+    if (inputCol < 0 || inputCol >= m_input_cols_eff ||
+        ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) {
+      return Scalar(m_paddingValue);
+    }
+
+    // Calculate row index in the original input tensor.
+    const Index rowIndex = patch2DIndex - colIndex * m_outputRows;
+    const Index rowOffset = patchOffset - colOffset * m_colStride;
+    const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop;
+    const Index origInputRow =
+        (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInflateRowStride) : 0);
+    if (inputRow < 0 || inputRow >= m_input_rows_eff ||
+        ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
+      return Scalar(m_paddingValue);
+    }
+
+    const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+    const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
+
+    const Index inputIndex =
+        depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride + otherIndex * m_patchInputStride;
+    return m_impl.coeff(inputIndex);
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) {
+      return packetWithPossibleZero(index);
+    }
+
+    const Index indices[2] = {index, index + PacketSize - 1};
+    const Index patchIndex = indices[0] / m_fastPatchStride;
+    if (patchIndex != indices[1] / m_fastPatchStride) {
+      return packetWithPossibleZero(index);
+    }
+    const Index otherIndex = (NumDims == 4) ? 0 : indices[0] / m_fastOtherStride;
+    eigen_assert(otherIndex == indices[1] / m_fastOtherStride);
+
+    // Find the offset of the element wrt the location of the first element.
+    const Index patchOffsets[2] = {(indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth,
+                                   (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth};
+
+    const Index patch2DIndex =
+        (NumDims == 4) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride;
+    eigen_assert(patch2DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride);
+
+    const Index colIndex = patch2DIndex / m_fastOutputRows;
+    const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride};
+
+    // Calculate col indices in the original input tensor.
+    const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft,
+                                colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft};
+    if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) {
+      return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
+    }
+
+    if (inputCols[0] == inputCols[1]) {
+      const Index rowIndex = patch2DIndex - colIndex * m_outputRows;
+      const Index rowOffsets[2] = {patchOffsets[0] - colOffsets[0] * m_colStride,
+                                   patchOffsets[1] - colOffsets[1] * m_colStride};
+      eigen_assert(rowOffsets[0] <= rowOffsets[1]);
+      // Calculate col indices in the original input tensor.
+      const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop,
+                                  rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop};
+
+      if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) {
+        return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
+      }
+
+      if (inputRows[0] >= 0 && inputRows[1] < m_inputRows) {
+        // no padding
+        const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+        const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
+        const Index inputIndex =
+            depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride + otherIndex * m_patchInputStride;
+        return m_impl.template packet<Unaligned>(inputIndex);
+      }
+    }
+
+    return packetWithPossibleZero(index);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    // We conservatively estimate the cost for the code path where the computed
+    // index is inside the original image and
+    // TensorEvaluator<ArgType, Device>::CoordAccess is false.
+    const double compute_cost =
+        3 * TensorOpCost::DivCost<Index>() + 6 * TensorOpCost::MulCost<Index>() + 8 * TensorOpCost::MulCost<Index>();
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+  }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const {
+    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = coeff(index + i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  Dimensions m_dimensions;
+
+  Index m_otherStride;
+  Index m_patchStride;
+  Index m_colStride;
+  Index m_row_strides;
+  Index m_col_strides;
+
+  Index m_in_row_strides;
+  Index m_in_col_strides;
+  Index m_row_inflate_strides;
+  Index m_col_inflate_strides;
+
+  Index m_input_rows_eff;
+  Index m_input_cols_eff;
+  Index m_patch_rows_eff;
+  Index m_patch_cols_eff;
+
+  internal::TensorIntDivisor<Index> m_fastOtherStride;
+  internal::TensorIntDivisor<Index> m_fastPatchStride;
+  internal::TensorIntDivisor<Index> m_fastColStride;
+  internal::TensorIntDivisor<Index> m_fastInflateRowStride;
+  internal::TensorIntDivisor<Index> m_fastInflateColStride;
+  internal::TensorIntDivisor<Index> m_fastInputColsEff;
+
+  Index m_rowInputStride;
+  Index m_colInputStride;
+  Index m_patchInputStride;
+
+  Index m_inputDepth;
+  Index m_inputRows;
+  Index m_inputCols;
+
+  Index m_outputRows;
+  Index m_outputCols;
+
+  Index m_rowPaddingTop;
+  Index m_colPaddingLeft;
+
+  internal::TensorIntDivisor<Index> m_fastOutputRows;
+  internal::TensorIntDivisor<Index> m_fastOutputDepth;
+
+  Scalar m_paddingValue;
+
+  const Device EIGEN_DEVICE_REF m_device;
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_IMAGE_PATCH_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
new file mode 100644
index 00000000..394c150e
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h
@@ -0,0 +1,619 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
+#define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+template <Index n>
+struct type2index {
+  static constexpr Index value = n;
+  EIGEN_DEVICE_FUNC constexpr operator Index() const { return n; }
+  EIGEN_DEVICE_FUNC void set(Index val) { eigen_assert(val == n); }
+};
+
+// This can be used with IndexPairList to get compile-time constant pairs,
+// such as IndexPairList<type2indexpair<1,2>, type2indexpair<3,4>>().
+template <Index f, Index s>
+struct type2indexpair {
+  static constexpr Index first = f;
+  static constexpr Index second = s;
+
+  constexpr EIGEN_DEVICE_FUNC operator IndexPair<Index>() const { return IndexPair<Index>(f, s); }
+
+  EIGEN_DEVICE_FUNC void set(const IndexPair<Index>& val) {
+    eigen_assert(val.first == f);
+    eigen_assert(val.second == s);
+  }
+};
+
+template <Index n>
+struct NumTraits<type2index<n>> {
+  typedef Index Real;
+  enum { IsComplex = 0, RequireInitialization = false, ReadCost = 1, AddCost = 1, MulCost = 1 };
+
+  EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Real epsilon() { return 0; }
+  EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Real dummy_precision() { return 0; }
+  EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Real highest() { return n; }
+  EIGEN_DEVICE_FUNC static constexpr EIGEN_STRONG_INLINE Real lowest() { return n; }
+};
+
+namespace internal {
+template <typename T>
+EIGEN_DEVICE_FUNC void update_value(T& val, Index new_val) {
+  val = internal::convert_index<T>(new_val);
+}
+template <Index n>
+EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, Index new_val) {
+  val.set(new_val);
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair<Index> new_val) {
+  val = new_val;
+}
+template <Index f, Index s>
+EIGEN_DEVICE_FUNC void update_value(type2indexpair<f, s>& val, IndexPair<Index> new_val) {
+  val.set(new_val);
+}
+
+template <typename T>
+struct is_compile_time_constant {
+  static constexpr bool value = false;
+};
+
+template <Index idx>
+struct is_compile_time_constant<type2index<idx>> {
+  static constexpr bool value = true;
+};
+template <Index idx>
+struct is_compile_time_constant<const type2index<idx>> {
+  static constexpr bool value = true;
+};
+template <Index idx>
+struct is_compile_time_constant<type2index<idx>&> {
+  static constexpr bool value = true;
+};
+template <Index idx>
+struct is_compile_time_constant<const type2index<idx>&> {
+  static constexpr bool value = true;
+};
+
+template <Index f, Index s>
+struct is_compile_time_constant<type2indexpair<f, s>> {
+  static constexpr bool value = true;
+};
+template <Index f, Index s>
+struct is_compile_time_constant<const type2indexpair<f, s>> {
+  static constexpr bool value = true;
+};
+template <Index f, Index s>
+struct is_compile_time_constant<type2indexpair<f, s>&> {
+  static constexpr bool value = true;
+};
+template <Index f, Index s>
+struct is_compile_time_constant<const type2indexpair<f, s>&> {
+  static constexpr bool value = true;
+};
+
+template <typename... T>
+struct IndexTuple;
+
+template <typename T, typename... O>
+struct IndexTuple<T, O...> {
+  EIGEN_DEVICE_FUNC constexpr IndexTuple() : head(), others() {}
+  EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v, const O... o) : head(v), others(o...) {}
+
+  static constexpr int count = 1 + sizeof...(O);
+  T head;
+  IndexTuple<O...> others;
+  typedef T Head;
+  typedef IndexTuple<O...> Other;
+};
+
+template <typename T>
+struct IndexTuple<T> {
+  EIGEN_DEVICE_FUNC constexpr IndexTuple() : head() {}
+  EIGEN_DEVICE_FUNC constexpr IndexTuple(const T& v) : head(v) {}
+
+  constexpr static int count = 1;
+  T head;
+  typedef T Head;
+};
+
+template <int N, typename... T>
+struct IndexTupleExtractor;
+
+template <int N, typename T, typename... O>
+struct IndexTupleExtractor<N, T, O...> {
+  typedef typename IndexTupleExtractor<N - 1, O...>::ValType ValType;
+
+  EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple<T, O...>& val) {
+    return IndexTupleExtractor<N - 1, O...>::get_val(val.others);
+  }
+
+  EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple<T, O...>& val) {
+    return IndexTupleExtractor<N - 1, O...>::get_val(val.others);
+  }
+  template <typename V>
+  EIGEN_DEVICE_FUNC static void set_val(IndexTuple<T, O...>& val, V& new_val) {
+    IndexTupleExtractor<N - 1, O...>::set_val(val.others, new_val);
+  }
+};
+
+template <typename T, typename... O>
+struct IndexTupleExtractor<0, T, O...> {
+  typedef T ValType;
+
+  EIGEN_DEVICE_FUNC static constexpr ValType& get_val(IndexTuple<T, O...>& val) { return val.head; }
+  EIGEN_DEVICE_FUNC static constexpr const ValType& get_val(const IndexTuple<T, O...>& val) { return val.head; }
+  template <typename V>
+  EIGEN_DEVICE_FUNC static void set_val(IndexTuple<T, O...>& val, V& new_val) {
+    val.head = new_val;
+  }
+};
+
+template <int N, typename T, typename... O>
+EIGEN_DEVICE_FUNC constexpr typename IndexTupleExtractor<N, T, O...>::ValType& array_get(IndexTuple<T, O...>& tuple) {
+  return IndexTupleExtractor<N, T, O...>::get_val(tuple);
+}
+template <int N, typename T, typename... O>
+EIGEN_DEVICE_FUNC constexpr const typename IndexTupleExtractor<N, T, O...>::ValType& array_get(
+    const IndexTuple<T, O...>& tuple) {
+  return IndexTupleExtractor<N, T, O...>::get_val(tuple);
+}
+template <typename T, typename... O>
+struct array_size<IndexTuple<T, O...>> {
+  static constexpr size_t value = IndexTuple<T, O...>::count;
+};
+template <typename T, typename... O>
+struct array_size<const IndexTuple<T, O...>> {
+  static constexpr size_t value = IndexTuple<T, O...>::count;
+};
+
+template <Index Idx, typename ValueT>
+struct tuple_coeff {
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index i, const IndexTuple<T...>& t) {
+    //    return array_get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx);
+    return (i == Idx ? array_get<Idx>(t) : tuple_coeff<Idx - 1, ValueT>::get(i, t));
+  }
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple<T...>& t, const ValueT& value) {
+    if (i == Idx) {
+      update_value(array_get<Idx>(t), value);
+    } else {
+      tuple_coeff<Idx - 1, ValueT>::set(i, t, value);
+    }
+  }
+
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple<T...>& t) {
+    return ((i == Idx) && is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value) ||
+           tuple_coeff<Idx - 1, ValueT>::value_known_statically(i, t);
+  }
+
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>& t) {
+    return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
+           tuple_coeff<Idx - 1, ValueT>::values_up_to_known_statically(t);
+  }
+
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple<T...>& t) {
+    return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
+           is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value &&
+           array_get<Idx>(t) > array_get<Idx - 1>(t) &&
+           tuple_coeff<Idx - 1, ValueT>::values_up_to_statically_known_to_increase(t);
+  }
+};
+
+template <typename ValueT>
+struct tuple_coeff<0, ValueT> {
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static constexpr ValueT get(const Index /*i*/, const IndexTuple<T...>& t) {
+    //  eigen_assert (i == 0);  // gcc fails to compile assertions in constexpr
+    return array_get<0>(t) /* * (i == 0)*/;
+  }
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static void set(const Index i, IndexTuple<T...>& t, const ValueT value) {
+    eigen_assert(i == 0);
+    update_value(array_get<0>(t), value);
+  }
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const Index i, const IndexTuple<T...>&) {
+    return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value && (i == 0);
+  }
+
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>&) {
+    return is_compile_time_constant<typename IndexTupleExtractor<0, T...>::ValType>::value;
+  }
+
+  template <typename... T>
+  EIGEN_DEVICE_FUNC static constexpr bool values_up_to_statically_known_to_increase(const IndexTuple<T...>&) {
+    return true;
+  }
+};
+}  // namespace internal
+
+/** \internal
+ *
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Set of classes used to encode a set of Tensor dimensions/indices.
+ *
+ * The indices in the list can be known at compile time or at runtime. A mix
+ * of static and dynamic indices can also be provided if needed. The tensor
+ * code will attempt to take advantage of the indices that are known at
+ * compile time to optimize the code it generates.
+ *
+ * This functionality requires a c++11 compliant compiler. If your compiler
+ * is older you need to use arrays of indices instead.
+ *
+ * Several examples are provided in the cxx11_tensor_index_list.cpp file.
+ *
+ * \sa Tensor
+ */
+
+template <typename FirstType, typename... OtherTypes>
+struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index operator[](const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value - 1,
+                                 Index>::get(i, *this);
+  }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr Index get(const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value - 1,
+                                 Index>::get(i, *this);
+  }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const Index value) {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value - 1,
+                                 Index>::set(i, *this, value);
+  }
+
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr std::size_t size() const { return 1 + sizeof...(OtherTypes); };
+
+  EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple<FirstType, OtherTypes...>& other)
+      : internal::IndexTuple<FirstType, OtherTypes...>(other) {}
+  EIGEN_DEVICE_FUNC constexpr IndexList(FirstType& first, OtherTypes... other)
+      : internal::IndexTuple<FirstType, OtherTypes...>(first, other...) {}
+  EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple<FirstType, OtherTypes...>() {}
+
+  EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value - 1,
+                                 Index>::value_known_statically(i, *this);
+  }
+  EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value - 1,
+                                 Index>::values_up_to_known_statically(*this);
+  }
+
+  EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value - 1,
+                                 Index>::values_up_to_statically_known_to_increase(*this);
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+std::ostream& operator<<(std::ostream& os, const IndexList<FirstType, OtherTypes...>& dims) {
+  os << "[";
+  for (size_t i = 0; i < 1 + sizeof...(OtherTypes); ++i) {
+    if (i > 0) os << ", ";
+    os << dims[i];
+  }
+  os << "]";
+  return os;
+}
+
+template <typename FirstType, typename... OtherTypes>
+constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, OtherTypes... other_vals) {
+  return IndexList<FirstType, OtherTypes...>(val1, other_vals...);
+}
+
+template <typename FirstType, typename... OtherTypes>
+struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> {
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<Index> operator[](const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value - 1,
+                                 IndexPair<Index>>::get(i, *this);
+  }
+  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const Index i, const IndexPair<Index> value) {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value - 1,
+                                 IndexPair<Index>>::set(i, *this, value);
+  }
+
+  EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple<FirstType, OtherTypes...>& other)
+      : internal::IndexTuple<FirstType, OtherTypes...>(other) {}
+  EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple<FirstType, OtherTypes...>() {}
+
+  EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const Index i) const {
+    return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value - 1,
+                                 Index>::value_known_statically(i, *this);
+  }
+};
+
+namespace internal {
+
+template <typename FirstType, typename... OtherTypes>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index array_prod(const IndexList<FirstType, OtherTypes...>& sizes) {
+  Index result = 1;
+  EIGEN_UNROLL_LOOP
+  for (size_t i = 0; i < array_size<IndexList<FirstType, OtherTypes...>>::value; ++i) {
+    result *= sizes[i];
+  }
+  return result;
+}
+
+template <typename FirstType, typename... OtherTypes>
+struct array_size<IndexList<FirstType, OtherTypes...>> {
+  static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...>>::value;
+};
+template <typename FirstType, typename... OtherTypes>
+struct array_size<const IndexList<FirstType, OtherTypes...>> {
+  static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...>>::value;
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct array_size<IndexPairList<FirstType, OtherTypes...>> {
+  static const size_t value = 1 + sizeof...(OtherTypes);
+};
+template <typename FirstType, typename... OtherTypes>
+struct array_size<const IndexPairList<FirstType, OtherTypes...>> {
+  static const size_t value = 1 + sizeof...(OtherTypes);
+};
+
+template <Index N, typename FirstType, typename... OtherTypes>
+EIGEN_DEVICE_FUNC constexpr Index array_get(IndexList<FirstType, OtherTypes...>& a) {
+  return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
+}
+template <Index N, typename FirstType, typename... OtherTypes>
+EIGEN_DEVICE_FUNC constexpr Index array_get(const IndexList<FirstType, OtherTypes...>& a) {
+  return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a);
+}
+
+template <typename T>
+struct index_known_statically_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index) { return false; }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_known_statically_impl<IndexList<FirstType, OtherTypes...>> {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_known_statically_impl<const IndexList<FirstType, OtherTypes...>> {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i);
+  }
+};
+
+template <typename T>
+struct all_indices_known_statically_impl {
+  static constexpr bool run() { return false; }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct all_indices_known_statically_impl<IndexList<FirstType, OtherTypes...>> {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
+    return IndexList<FirstType, OtherTypes...>().all_values_known_statically();
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct all_indices_known_statically_impl<const IndexList<FirstType, OtherTypes...>> {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
+    return IndexList<FirstType, OtherTypes...>().all_values_known_statically();
+  }
+};
+
+template <typename T>
+struct indices_statically_known_to_increase_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run() { return false; }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct indices_statically_known_to_increase_impl<IndexList<FirstType, OtherTypes...>> {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
+    return Eigen::IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct indices_statically_known_to_increase_impl<const IndexList<FirstType, OtherTypes...>> {
+  EIGEN_DEVICE_FUNC static constexpr bool run() {
+    return Eigen::IndexList<FirstType, OtherTypes...>().values_statically_known_to_increase();
+  }
+};
+
+template <typename Tx>
+struct index_statically_eq_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_eq_impl<IndexList<FirstType, OtherTypes...>> {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &&
+           (IndexList<FirstType, OtherTypes...>().get(i) == value);
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_eq_impl<const IndexList<FirstType, OtherTypes...>> {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &&
+           (IndexList<FirstType, OtherTypes...>().get(i) == value);
+  }
+};
+
+template <typename T>
+struct index_statically_ne_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_ne_impl<IndexList<FirstType, OtherTypes...>> {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &&
+           (IndexList<FirstType, OtherTypes...>().get(i) != value);
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_ne_impl<const IndexList<FirstType, OtherTypes...>> {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &&
+           (IndexList<FirstType, OtherTypes...>().get(i) != value);
+  }
+};
+
+template <typename T>
+struct index_statically_gt_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_gt_impl<IndexList<FirstType, OtherTypes...>> {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &&
+           (IndexList<FirstType, OtherTypes...>().get(i) > value);
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_gt_impl<const IndexList<FirstType, OtherTypes...>> {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &&
+           (IndexList<FirstType, OtherTypes...>().get(i) > value);
+  }
+};
+
+template <typename T>
+struct index_statically_lt_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_lt_impl<IndexList<FirstType, OtherTypes...>> {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &&
+           (IndexList<FirstType, OtherTypes...>().get(i) < value);
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_statically_lt_impl<const IndexList<FirstType, OtherTypes...>> {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexList<FirstType, OtherTypes...>().value_known_statically(i) &&
+           (IndexList<FirstType, OtherTypes...>().get(i) < value);
+  }
+};
+
+template <typename Tx>
+struct index_pair_first_statically_eq_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_first_statically_eq_impl<IndexPairList<FirstType, OtherTypes...>> {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &&
+           (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_first_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...>> {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &&
+           (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value);
+  }
+};
+
+template <typename Tx>
+struct index_pair_second_statically_eq_impl {
+  EIGEN_DEVICE_FUNC static constexpr bool run(Index, Index) { return false; }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_second_statically_eq_impl<IndexPairList<FirstType, OtherTypes...>> {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &&
+           (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
+  }
+};
+
+template <typename FirstType, typename... OtherTypes>
+struct index_pair_second_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...>> {
+  EIGEN_DEVICE_FUNC static constexpr bool run(const Index i, const Index value) {
+    return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) &&
+           (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value);
+  }
+};
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+namespace Eigen {
+namespace internal {
+template <typename T>
+static EIGEN_DEVICE_FUNC constexpr bool index_known_statically(Index i) {
+  return index_known_statically_impl<T>::run(i);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC constexpr bool all_indices_known_statically() {
+  return all_indices_known_statically_impl<T>::run();
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC constexpr bool indices_statically_known_to_increase() {
+  return indices_statically_known_to_increase_impl<T>::run();
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC constexpr bool index_statically_eq(Index i, Index value) {
+  return index_statically_eq_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC constexpr bool index_statically_ne(Index i, Index value) {
+  return index_statically_ne_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC constexpr bool index_statically_gt(Index i, Index value) {
+  return index_statically_gt_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC constexpr bool index_statically_lt(Index i, Index value) {
+  return index_statically_lt_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC constexpr bool index_pair_first_statically_eq(Index i, Index value) {
+  return index_pair_first_statically_eq_impl<T>::run(i, value);
+}
+
+template <typename T>
+static EIGEN_DEVICE_FUNC constexpr bool index_pair_second_statically_eq(Index i, Index value) {
+  return index_pair_second_statically_eq_impl<T>::run(i, value);
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
new file mode 100644
index 00000000..74a53be1
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h
@@ -0,0 +1,224 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Ke Yang <yangke@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename Strides, typename XprType>
+struct traits<TensorInflationOp<Strides, XprType> > : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+};
+
+template <typename Strides, typename XprType>
+struct eval<TensorInflationOp<Strides, XprType>, Eigen::Dense> {
+  typedef const TensorInflationOp<Strides, XprType>& type;
+};
+
+template <typename Strides, typename XprType>
+struct nested<TensorInflationOp<Strides, XprType>, 1, typename eval<TensorInflationOp<Strides, XprType> >::type> {
+  typedef TensorInflationOp<Strides, XprType> type;
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor inflation class.
+ */
+template <typename Strides, typename XprType>
+class TensorInflationOp : public TensorBase<TensorInflationOp<Strides, XprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorInflationOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorInflationOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorInflationOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorInflationOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorInflationOp(const XprType& expr, const Strides& strides)
+      : m_xpr(expr), m_strides(strides) {}
+
+  EIGEN_DEVICE_FUNC const Strides& strides() const { return m_strides; }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const Strides m_strides;
+};
+
+// Eval as rvalue
+template <typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> {
+  typedef TensorInflationOp<Strides, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_strides(op.strides()) {
+    m_dimensions = m_impl.dimensions();
+    // Expand each dimension to the inflated dimension.
+    for (int i = 0; i < NumDims; ++i) {
+      m_dimensions[i] = (m_dimensions[i] - 1) * op.strides()[i] + 1;
+    }
+
+    // Remember the strides for fast division.
+    for (int i = 0; i < NumDims; ++i) {
+      m_fastStrides[i] = internal::TensorIntDivisor<Index>(m_strides[i]);
+    }
+
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_outputStrides[0] = 1;
+      m_inputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+        m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
+      }
+    } else {  // RowMajor
+      m_outputStrides[NumDims - 1] = 1;
+      m_inputStrides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+        m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  // Computes the input index given the output index. Returns true if the output
+  // index doesn't fall into a hole.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool getInputIndex(Index index, Index* inputIndex) const {
+    eigen_assert(index < dimensions().TotalSize());
+    *inputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = index / m_outputStrides[i];
+        if (idx != idx / m_fastStrides[i] * m_strides[i]) {
+          return false;
+        }
+        *inputIndex += idx / m_strides[i] * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      if (index != index / m_fastStrides[0] * m_strides[0]) {
+        return false;
+      }
+      *inputIndex += index / m_strides[0];
+      return true;
+    } else {
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = index / m_outputStrides[i];
+        if (idx != idx / m_fastStrides[i] * m_strides[i]) {
+          return false;
+        }
+        *inputIndex += idx / m_strides[i] * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      if (index != index / m_fastStrides[NumDims - 1] * m_strides[NumDims - 1]) {
+        return false;
+      }
+      *inputIndex += index / m_strides[NumDims - 1];
+    }
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    Index inputIndex = 0;
+    if (getInputIndex(index, &inputIndex)) {
+      return m_impl.coeff(inputIndex);
+    } else {
+      return Scalar(0);
+    }
+  }
+
+  // TODO(yangke): optimize this function so that we can detect and produce
+  // all-zero packets
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = coeff(index + i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    const double compute_cost = NumDims * (3 * TensorOpCost::DivCost<Index>() + 3 * TensorOpCost::MulCost<Index>() +
+                                           2 * TensorOpCost::AddCost<Index>());
+    const double input_size = m_impl.dimensions().TotalSize();
+    const double output_size = m_dimensions.TotalSize();
+    if (output_size == 0) return TensorOpCost();
+    return m_impl.costPerCoeff(vectorized) +
+           TensorOpCost(sizeof(CoeffReturnType) * input_size / output_size, 0, compute_cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ protected:
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_outputStrides;
+  array<Index, NumDims> m_inputStrides;
+  TensorEvaluator<ArgType, Device> m_impl;
+  const Strides m_strides;
+  array<internal::TensorIntDivisor<Index>, NumDims> m_fastStrides;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_INFLATION_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
new file mode 100644
index 00000000..26cd50f7
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h
@@ -0,0 +1,78 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
+#define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
+
+#include <initializer_list>
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Helper template to initialize Tensors from std::initializer_lists.
+ */
+template <typename Derived, int N>
+struct Initializer {
+  typedef std::initializer_list<typename Initializer<Derived, N - 1>::InitList> InitList;
+
+  static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
+                  Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
+                  const InitList& vals) {
+    int i = 0;
+    for (const auto& v : vals) {
+      (*indices)[traits<Derived>::NumDimensions - N] = i++;
+      Initializer<Derived, N - 1>::run(tensor, indices, v);
+    }
+  }
+};
+
+template <typename Derived>
+struct Initializer<Derived, 1> {
+  typedef std::initializer_list<typename traits<Derived>::Scalar> InitList;
+
+  static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
+                  Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>* indices,
+                  const InitList& vals) {
+    int i = 0;
+    // There is likely a faster way to do that than iterating.
+    for (const auto& v : vals) {
+      (*indices)[traits<Derived>::NumDimensions - 1] = i++;
+      tensor.coeffRef(*indices) = v;
+    }
+  }
+};
+
+template <typename Derived>
+struct Initializer<Derived, 0> {
+  typedef typename traits<Derived>::Scalar InitList;
+
+  static void run(TensorEvaluator<Derived, DefaultDevice>& tensor,
+                  Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions>*, const InitList& v) {
+    tensor.coeffRef(0) = v;
+  }
+};
+
+template <typename Derived, int N>
+void initialize_tensor(TensorEvaluator<Derived, DefaultDevice>& tensor,
+                       const typename Initializer<Derived, traits<Derived>::NumDimensions>::InitList& vals) {
+  Eigen::array<typename traits<Derived>::Index, traits<Derived>::NumDimensions> indices;
+  Initializer<Derived, traits<Derived>::NumDimensions>::run(tensor, &indices, vals);
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
new file mode 100644
index 00000000..cd046807
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h
@@ -0,0 +1,259 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
+#define EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+// Note: result is undefined if val == 0
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<sizeof(T) == 4, int> count_leading_zeros(const T val) {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+  return __clz(val);
+#elif defined(SYCL_DEVICE_ONLY)
+  return cl::sycl::clz(val);
+#elif EIGEN_COMP_MSVC
+  unsigned long index;
+  _BitScanReverse(&index, val);
+  return 31 - index;
+#else
+  EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  return __builtin_clz(static_cast<uint32_t>(val));
+#endif
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE std::enable_if_t<sizeof(T) == 8, int> count_leading_zeros(const T val) {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+  return __clzll(val);
+#elif defined(SYCL_DEVICE_ONLY)
+  return static_cast<int>(cl::sycl::clz(val));
+#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64
+  unsigned long index;
+  _BitScanReverse64(&index, val);
+  return 63 - index;
+#elif EIGEN_COMP_MSVC
+  // MSVC's _BitScanReverse64 is not available for 32bits builds.
+  unsigned int lo = (unsigned int)(val & 0xffffffff);
+  unsigned int hi = (unsigned int)((val >> 32) & 0xffffffff);
+  int n;
+  if (hi == 0)
+    n = 32 + count_leading_zeros<unsigned int>(lo);
+  else
+    n = count_leading_zeros<unsigned int>(hi);
+  return n;
+#else
+  EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  return __builtin_clzll(static_cast<uint64_t>(val));
+#endif
+}
+
+template <typename T>
+struct UnsignedTraits {
+  typedef std::conditional_t<sizeof(T) == 8, uint64_t, uint32_t> type;
+};
+
+template <typename T>
+struct DividerTraits {
+  typedef typename UnsignedTraits<T>::type type;
+  static constexpr int N = sizeof(T) * 8;
+};
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t muluh(const uint32_t a, const T b) {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+  return __umulhi(a, b);
+#elif defined(SYCL_DEVICE_ONLY)
+  return cl::sycl::mul_hi(a, static_cast<uint32_t>(b));
+#else
+  return (static_cast<uint64_t>(a) * b) >> 32;
+#endif
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+  return __umul64hi(a, b);
+#elif defined(SYCL_DEVICE_ONLY)
+  return cl::sycl::mul_hi(a, static_cast<uint64_t>(b));
+#elif EIGEN_COMP_MSVC && (EIGEN_ARCH_x86_64 || EIGEN_ARCH_ARM64)
+  return __umulh(a, static_cast<uint64_t>(b));
+#elif EIGEN_HAS_BUILTIN_INT128
+  __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
+  return static_cast<uint64_t>(v >> 64);
+#else
+  return (TensorUInt128<static_val<0>, uint64_t>(a) * TensorUInt128<static_val<0>, uint64_t>(b)).upper();
+#endif
+}
+
+template <int N, typename T>
+struct DividerHelper {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) {
+    EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE);
+    return static_cast<uint32_t>((static_cast<uint64_t>(1) << (N + log_div)) / divider -
+                                 (static_cast<uint64_t>(1) << N) + 1);
+  }
+};
+
+template <typename T>
+struct DividerHelper<64, T> {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
+#if EIGEN_HAS_BUILTIN_INT128 && !defined(EIGEN_GPU_COMPILE_PHASE) && !defined(SYCL_DEVICE_ONLY)
+    return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64 + log_div)) / static_cast<__uint128_t>(divider) -
+                                 (static_cast<__uint128_t>(1) << 64) + 1);
+#else
+    const uint64_t shift = 1ULL << log_div;
+    TensorUInt128<uint64_t, uint64_t> result =
+        TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) -
+        TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1);
+    return static_cast<uint64_t>(result);
+#endif
+  }
+};
+
+/** \internal
+ *
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Fast integer division by a constant.
+ *
+ * See the paper from Granlund and Montgomery for explanation.
+ *   (at https://doi.org/10.1145/773473.178249)
+ *
+ * \sa Tensor
+ */
+template <typename T, bool div_gt_one = false>
+struct TensorIntDivisor {
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
+    multiplier = 0;
+    shift1 = 0;
+    shift2 = 0;
+  }
+
+  // Must have 0 < divider < 2^31. This is relaxed to
+  // 0 < divider < 2^63 when using 64-bit indices on platforms that support
+  // the __uint128_t type.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor(const T divider) {
+    const int N = DividerTraits<T>::N;
+    eigen_assert(static_cast<typename UnsignedTraits<T>::type>(divider) < NumTraits<UnsignedType>::highest() / 2);
+    eigen_assert(divider > 0);
+
+    // fast ln2
+    const int leading_zeros = count_leading_zeros(static_cast<UnsignedType>(divider));
+    int log_div = N - leading_zeros;
+    // if divider is a power of two then log_div is 1 more than it should be.
+    if ((static_cast<typename UnsignedTraits<T>::type>(1) << (log_div - 1)) ==
+        static_cast<typename UnsignedTraits<T>::type>(divider))
+      log_div--;
+
+    multiplier = DividerHelper<N, T>::computeMultiplier(log_div, divider);
+    shift1 = log_div > 1 ? 1 : log_div;
+    shift2 = log_div > 1 ? log_div - 1 : 0;
+  }
+
+  // Must have 0 <= numerator. On platforms that don't support the __uint128_t
+  // type numerator should also be less than 2^32-1.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T divide(const T numerator) const {
+    eigen_assert(static_cast<typename UnsignedTraits<T>::type>(numerator) < NumTraits<UnsignedType>::highest() / 2);
+    // eigen_assert(numerator >= 0); // this is implicitly asserted by the line above
+
+    UnsignedType t1 = muluh(multiplier, numerator);
+    UnsignedType t = (static_cast<UnsignedType>(numerator) - t1) >> shift1;
+    return (t1 + t) >> shift2;
+  }
+
+ private:
+  typedef typename DividerTraits<T>::type UnsignedType;
+  UnsignedType multiplier;
+  int32_t shift1;
+  int32_t shift2;
+};
+
+// Optimized version for signed 32 bit integers.
+// Derived from Hacker's Delight.
+// Only works for divisors strictly greater than one
+template <>
+class TensorIntDivisor<int32_t, true> {
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorIntDivisor() {
+    magic = 0;
+    shift = 0;
+  }
+  // Must have 2 <= divider
+  EIGEN_DEVICE_FUNC TensorIntDivisor(int32_t divider) {
+    eigen_assert(divider >= 2);
+    calcMagic(divider);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int divide(const int32_t n) const {
+#ifdef EIGEN_GPU_COMPILE_PHASE
+    return (__umulhi(magic, n) >> shift);
+#elif defined(SYCL_DEVICE_ONLY)
+    return (cl::sycl::mul_hi(magic, static_cast<uint32_t>(n)) >> shift);
+#else
+    uint64_t v = static_cast<uint64_t>(magic) * static_cast<uint64_t>(n);
+    return (static_cast<uint32_t>(v >> 32) >> shift);
+#endif
+  }
+
+ private:
+  // Compute the magic numbers. See Hacker's Delight section 10 for an in
+  // depth explanation.
+  EIGEN_DEVICE_FUNC void calcMagic(int32_t d) {
+    const unsigned two31 = 0x80000000;  // 2**31.
+    unsigned ad = d;
+    unsigned t = two31 + (ad >> 31);
+    unsigned anc = t - 1 - t % ad;   // Absolute value of nc.
+    int p = 31;                      // Init. p.
+    unsigned q1 = two31 / anc;       // Init. q1 = 2**p/|nc|.
+    unsigned r1 = two31 - q1 * anc;  // Init. r1 = rem(2**p, |nc|).
+    unsigned q2 = two31 / ad;        // Init. q2 = 2**p/|d|.
+    unsigned r2 = two31 - q2 * ad;   // Init. r2 = rem(2**p, |d|).
+    unsigned delta = 0;
+    do {
+      p = p + 1;
+      q1 = 2 * q1;      // Update q1 = 2**p/|nc|.
+      r1 = 2 * r1;      // Update r1 = rem(2**p, |nc|).
+      if (r1 >= anc) {  // (Must be an unsigned
+        q1 = q1 + 1;    // comparison here).
+        r1 = r1 - anc;
+      }
+      q2 = 2 * q2;     // Update q2 = 2**p/|d|.
+      r2 = 2 * r2;     // Update r2 = rem(2**p, |d|).
+      if (r2 >= ad) {  // (Must be an unsigned
+        q2 = q2 + 1;   // comparison here).
+        r2 = r2 - ad;
+      }
+      delta = ad - r2;
+    } while (q1 < delta || (q1 == delta && r1 == 0));
+
+    magic = (unsigned)(q2 + 1);
+    shift = p - 32;
+  }
+
+  uint32_t magic;
+  int32_t shift;
+};
+
+template <typename T, bool div_gt_one>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator/(const T& numerator, const TensorIntDivisor<T, div_gt_one>& divisor) {
+  return divisor.divide(numerator);
+}
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_INTDIV_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
new file mode 100644
index 00000000..3ddcc1c5
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorLayoutSwap.h
@@ -0,0 +1,185 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
+#define EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename XprType>
+struct traits<TensorLayoutSwapOp<XprType> > : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = traits<XprType>::NumDimensions;
+  static constexpr int Layout = (traits<XprType>::Layout == ColMajor) ? RowMajor : ColMajor;
+  typedef typename XprTraits::PointerType PointerType;
+};
+
+template <typename XprType>
+struct eval<TensorLayoutSwapOp<XprType>, Eigen::Dense> {
+  typedef const TensorLayoutSwapOp<XprType>& type;
+};
+
+template <typename XprType>
+struct nested<TensorLayoutSwapOp<XprType>, 1, typename eval<TensorLayoutSwapOp<XprType> >::type> {
+  typedef TensorLayoutSwapOp<XprType> type;
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Swap the layout from col-major to row-major, or row-major
+ * to col-major, and invert the order of the dimensions.
+ *
+ * Beware: the dimensions are reversed by this operation. If you want to
+ * preserve the ordering of the dimensions, you need to combine this
+ * operation with a shuffle.
+ *
+ * \example:
+ * Tensor<float, 2, ColMajor> input(2, 4);
+ * Tensor<float, 2, RowMajor> output = input.swap_layout();
+ * eigen_assert(output.dimension(0) == 4);
+ * eigen_assert(output.dimension(1) == 2);
+ *
+ * array<int, 2> shuffle(1, 0);
+ * output = input.swap_layout().shuffle(shuffle);
+ * eigen_assert(output.dimension(0) == 2);
+ * eigen_assert(output.dimension(1) == 4);
+ *
+ */
+template <typename XprType>
+class TensorLayoutSwapOp : public TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors> {
+ public:
+  typedef TensorBase<TensorLayoutSwapOp<XprType>, WriteAccessors> Base;
+  typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef std::remove_const_t<typename XprType::CoeffReturnType> CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorLayoutSwapOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorLayoutSwapOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorLayoutSwapOp(const XprType& expr) : m_xpr(expr) {}
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+  EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorLayoutSwapOp)
+ protected:
+  typename XprType::Nested m_xpr;
+};
+
+// Eval as rvalue
+template <typename ArgType, typename Device>
+struct TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> {
+  typedef TensorLayoutSwapOp<ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  static constexpr int Layout =
+      (TensorEvaluator<ArgType, Device>::Layout == static_cast<int>(ColMajor)) ? RowMajor : ColMajor;
+  enum {
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    CoordAccess = false,  // to be implemented
+    RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) {
+    for (int i = 0; i < NumDims; ++i) {
+      m_dimensions[i] = m_impl.dimensions()[NumDims - 1 - i];
+    }
+  }
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); }
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_impl.coeff(index); }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    return m_impl.template packet<LoadMode>(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return m_impl.costPerCoeff(vectorized);
+  }
+
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return constCast(m_impl.data()); }
+
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+ protected:
+  TensorEvaluator<ArgType, Device> m_impl;
+  Dimensions m_dimensions;
+};
+
+// Eval as lvalue
+template <typename ArgType, typename Device>
+struct TensorEvaluator<TensorLayoutSwapOp<ArgType>, Device>
+    : public TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> {
+  typedef TensorEvaluator<const TensorLayoutSwapOp<ArgType>, Device> Base;
+  typedef TensorLayoutSwapOp<ArgType> XprType;
+
+  static constexpr int Layout =
+      (TensorEvaluator<ArgType, Device>::Layout == static_cast<int>(ColMajor)) ? RowMajor : ColMajor;
+  enum {
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    CoordAccess = false  // to be implemented
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {}
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const {
+    return this->m_impl.coeffRef(index);
+  }
+  template <int StoreMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const {
+    this->m_impl.template writePacket<StoreMode>(index, x);
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_LAYOUT_SWAP_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
new file mode 100644
index 00000000..f8bbcfee
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h
@@ -0,0 +1,85 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_META_MACROS_H
+
+/** use this macro in sfinae selection in templated functions
+ *
+ *   template<typename T,
+ *            std::enable_if_t< isBanana<T>::value , int > = 0
+ *   >
+ *   void foo(){}
+ *
+ *   becomes =>
+ *
+ *   template<typename TopoType,
+ *           SFINAE_ENABLE_IF( isBanana<T>::value )
+ *   >
+ *   void foo(){}
+ */
+
+#define EIGEN_SFINAE_ENABLE_IF(__condition__) std::enable_if_t<(__condition__), int> = 0
+
+// Define a macro to use a reference on the host but a value on the device
+#if defined(SYCL_DEVICE_ONLY)
+#define EIGEN_DEVICE_REF
+#else
+#define EIGEN_DEVICE_REF &
+#endif
+
+// Define a macro for catching SYCL exceptions if exceptions are enabled
+#define EIGEN_SYCL_TRY_CATCH(X)                                                                                        \
+  do {                                                                                                                 \
+    EIGEN_TRY { X; }                                                                                                   \
+    EIGEN_CATCH(const cl::sycl::exception& e) {                                                                        \
+      EIGEN_THROW_X(std::runtime_error("SYCL exception at " + std::string(__FILE__) + ":" + std::to_string(__LINE__) + \
+                                       "\n" + e.what()));                                                              \
+    }                                                                                                                  \
+  } while (false)
+
+// Define a macro if local memory flags are unset or one of them is set
+// Setting both flags is the same as unsetting them
+#if (!defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)) || \
+    (defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM))
+#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
+#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
+#elif defined(EIGEN_SYCL_LOCAL_MEM) && !defined(EIGEN_SYCL_NO_LOCAL_MEM)
+#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_ON 1
+#elif !defined(EIGEN_SYCL_LOCAL_MEM) && defined(EIGEN_SYCL_NO_LOCAL_MEM)
+#define EIGEN_SYCL_LOCAL_MEM_UNSET_OR_OFF 1
+#endif
+
+#if EIGEN_COMP_CLANG  // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653)
+#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)                         \
+  using Base::operator=;                                                                \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const Derived& other) {      \
+    Base::operator=(other);                                                             \
+    return *this;                                                                       \
+  }                                                                                     \
+  template <typename OtherDerived>                                                      \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator=(const OtherDerived& other) { \
+    Base::operator=(other);                                                             \
+    return *this;                                                                       \
+  }
+#else
+#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)
+#endif
+
+/** \internal
+ * \brief Macro to manually inherit assignment operators.
+ * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is
+ * defined. This also inherits template<OtherDerived> operator=(const OtherDerived&) assignments. With C++11 or later
+ * this also default-implements the copy-constructor
+ */
+#define EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
+  EIGEN_TENSOR_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived)  \
+  EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived)
+
+#endif
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
new file mode 100644
index 00000000..9abfddb4
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h
@@ -0,0 +1,191 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_MAP_H
+#define EIGEN_CXX11_TENSOR_TENSOR_MAP_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+// FIXME use proper doxygen documentation (e.g. \tparam MakePointer_)
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief A tensor expression mapping an existing array of data.
+ *
+ */
+/// `template <class> class MakePointer_` is added to convert the host pointer to the device pointer.
+/// It is added due to the fact that for our device compiler `T*` is not allowed.
+/// If we wanted to use the same Evaluator functions we have to convert that type to our pointer `T`.
+/// This is done through our `MakePointer_` class. By default the Type in the `MakePointer_<T>` is `T*` .
+/// Therefore, by adding the default value, we managed to convert the type and it does not break any
+/// existing code as its default value is `T*`.
+template <typename PlainObjectType, int Options_, template <class> class MakePointer_>
+class TensorMap : public TensorBase<TensorMap<PlainObjectType, Options_, MakePointer_> > {
+ public:
+  typedef TensorMap<PlainObjectType, Options_, MakePointer_> Self;
+  typedef TensorBase<TensorMap<PlainObjectType, Options_, MakePointer_> > Base;
+#ifdef EIGEN_USE_SYCL
+  typedef std::remove_reference_t<typename Eigen::internal::nested<Self>::type> Nested;
+#else
+  typedef typename Eigen::internal::nested<Self>::type Nested;
+#endif
+  typedef typename internal::traits<PlainObjectType>::StorageKind StorageKind;
+  typedef typename internal::traits<PlainObjectType>::Index Index;
+  typedef typename internal::traits<PlainObjectType>::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef typename PlainObjectType::Base::CoeffReturnType CoeffReturnType;
+
+  typedef typename MakePointer_<Scalar>::Type PointerType;
+  typedef typename MakePointer_<Scalar>::ConstType PointerConstType;
+
+  // WARN: PointerType still can be a pointer to const (const Scalar*), for
+  // example in TensorMap<Tensor<const Scalar, ...>> expression. This type of
+  // expression should be illegal, but adding this restriction is not possible
+  // in practice (see https://bitbucket.org/eigen/eigen/pull-requests/488).
+  typedef std::conditional_t<bool(internal::is_lvalue<PlainObjectType>::value),
+                             PointerType,      // use simple pointer in lvalue expressions
+                             PointerConstType  // use const pointer in rvalue expressions
+                             >
+      StoragePointerType;
+
+  // If TensorMap was constructed over rvalue expression (e.g. const Tensor),
+  // we should return a reference to const from operator() (and others), even
+  // if TensorMap itself is not const.
+  typedef std::conditional_t<bool(internal::is_lvalue<PlainObjectType>::value), Scalar&, const Scalar&> StorageRefType;
+
+  static constexpr int Options = Options_;
+
+  static constexpr Index NumIndices = PlainObjectType::NumIndices;
+  typedef typename PlainObjectType::Dimensions Dimensions;
+
+  static constexpr int Layout = PlainObjectType::Layout;
+  enum { IsAligned = ((int(Options_) & Aligned) == Aligned), CoordAccess = true, RawAccess = true };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr) : m_data(dataPtr), m_dimensions() {
+    // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE)
+  }
+
+  template <typename... IndexTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, Index firstDimension,
+                                                  IndexTypes... otherDimensions)
+      : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) {
+    // The number of dimensions used to construct a tensor must be equal to the rank of the tensor.
+    EIGEN_STATIC_ASSERT((sizeof...(otherDimensions) + 1 == NumIndices || NumIndices == Dynamic),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE)
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr,
+                                                  const array<Index, NumIndices>& dimensions)
+      : m_data(dataPtr), m_dimensions(dimensions) {}
+
+  template <typename Dimensions>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(StoragePointerType dataPtr, const Dimensions& dimensions)
+      : m_data(dataPtr), m_dimensions(dimensions) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PlainObjectType& tensor)
+      : m_data(tensor.data()), m_dimensions(tensor.dimensions()) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return m_dimensions.rank(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_dimensions[n]; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StoragePointerType data() { return m_data; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StoragePointerType data() const { return m_data; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(const array<Index, NumIndices>& indices) const {
+    //      eigen_assert(checkIndexRange(indices));
+    if (PlainObjectType::Options & RowMajor) {
+      const Index index = m_dimensions.IndexOfRowMajor(indices);
+      return m_data[index];
+    } else {
+      const Index index = m_dimensions.IndexOfColMajor(indices);
+      return m_data[index];
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()() const {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return m_data[0];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(Index index) const {
+    eigen_internal_assert(index >= 0 && index < size());
+    return m_data[index];
+  }
+
+  template <typename... IndexTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex,
+                                                                  IndexTypes... otherIndices) const {
+    EIGEN_STATIC_ASSERT(sizeof...(otherIndices) + 2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...));
+    if (PlainObjectType::Options & RowMajor) {
+      const Index index =
+          m_dimensions.IndexOfRowMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+      return m_data[index];
+    } else {
+      const Index index =
+          m_dimensions.IndexOfColMajor(array<Index, NumIndices>{{firstIndex, secondIndex, otherIndices...}});
+      return m_data[index];
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(const array<Index, NumIndices>& indices) {
+    //      eigen_assert(checkIndexRange(indices));
+    if (PlainObjectType::Options & RowMajor) {
+      const Index index = m_dimensions.IndexOfRowMajor(indices);
+      return m_data[index];
+    } else {
+      const Index index = m_dimensions.IndexOfColMajor(indices);
+      return m_data[index];
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()() {
+    EIGEN_STATIC_ASSERT(NumIndices == 0, YOU_MADE_A_PROGRAMMING_MISTAKE)
+    return m_data[0];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(Index index) {
+    eigen_internal_assert(index >= 0 && index < size());
+    return m_data[index];
+  }
+
+  template <typename... IndexTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE StorageRefType operator()(Index firstIndex, Index secondIndex,
+                                                                  IndexTypes... otherIndices) {
+    static_assert(sizeof...(otherIndices) + 2 == NumIndices || NumIndices == Dynamic,
+                  "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+    eigen_assert(internal::all((Eigen::NumTraits<Index>::highest() >= otherIndices)...));
+    const std::size_t NumDims = sizeof...(otherIndices) + 2;
+    if (PlainObjectType::Options & RowMajor) {
+      const Index index =
+          m_dimensions.IndexOfRowMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}});
+      return m_data[index];
+    } else {
+      const Index index =
+          m_dimensions.IndexOfColMajor(array<Index, NumDims>{{firstIndex, secondIndex, otherIndices...}});
+      return m_data[index];
+    }
+  }
+
+  EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorMap)
+
+ private:
+  StoragePointerType m_data;
+  Dimensions m_dimensions;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_MAP_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
new file mode 100644
index 00000000..8454070a
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -0,0 +1,291 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_META_H
+#define EIGEN_CXX11_TENSOR_TENSOR_META_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+template <bool cond>
+struct Cond {};
+
+template <typename T1, typename T2>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T1& choose(Cond<true>, const T1& first, const T2&) {
+  return first;
+}
+
+template <typename T1, typename T2>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE const T2& choose(Cond<false>, const T1&, const T2& second) {
+  return second;
+}
+
+template <size_t n>
+struct max_n_1 {
+  static const size_t size = n;
+};
+template <>
+struct max_n_1<0> {
+  static const size_t size = 1;
+};
+
+template <typename T>
+EIGEN_DEPRECATED EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE constexpr T divup(const T x, const T y) {
+  return Eigen::numext::div_ceil(x, y);
+}
+
+// Default packet types
+template <typename Scalar, typename Device>
+struct PacketType : internal::packet_traits<Scalar> {
+  typedef typename internal::packet_traits<Scalar>::type type;
+};
+
+// For CUDA packet types when using a GpuDevice
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_HAS_GPU_FP16) && defined(EIGEN_GPU_COMPILE_PHASE)
+
+typedef ulonglong2 Packet4h2;
+template <>
+struct PacketType<half, GpuDevice> {
+  typedef Packet4h2 type;
+  static const int size = 8;
+  enum {
+    HasAdd = 1,
+    HasSub = 1,
+    HasMul = 1,
+    HasNegate = 1,
+    HasAbs = 1,
+    HasArg = 0,
+    HasAbs2 = 0,
+    HasMin = 1,
+    HasMax = 1,
+    HasConj = 0,
+    HasSetLinear = 0,
+    HasBlend = 0,
+
+    HasDiv = 1,
+    HasSqrt = 1,
+    HasRsqrt = 1,
+    HasExp = 1,
+    HasExpm1 = 0,
+    HasLog = 1,
+    HasLog1p = 0,
+    HasLog10 = 0,
+    HasPow = 1,
+  };
+};
+#endif
+
+#if defined(EIGEN_USE_SYCL)
+
+namespace TensorSycl {
+namespace internal {
+
+template <typename Index, Index A, Index B>
+struct PlusOp {
+  static constexpr Index Value = A + B;
+};
+
+template <typename Index, Index A, Index B>
+struct DivOp {
+  static constexpr Index Value = A / B;
+};
+
+template <typename Index, Index start, Index end, Index step, template <class Indx, Indx...> class StepOp>
+struct static_for {
+  template <typename UnaryOperator>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator op) {
+    op(start);
+    static_for<Index, StepOp<Index, start, step>::Value, end, step, StepOp>::loop(op);
+  }
+};
+template <typename Index, Index end, Index step, template <class Indx, Indx...> class StepOp>
+struct static_for<Index, end, end, step, StepOp> {
+  template <typename UnaryOperator>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void loop(UnaryOperator) {}
+};
+
+template <typename OutScalar, typename Device, bool Vectorizable>
+struct Vectorise {
+  static constexpr int PacketSize = 1;
+  typedef OutScalar PacketReturnType;
+};
+
+template <typename OutScalar, typename Device>
+struct Vectorise<OutScalar, Device, true> {
+  static constexpr int PacketSize = Eigen::PacketType<OutScalar, Device>::size;
+  typedef typename Eigen::PacketType<OutScalar, Device>::type PacketReturnType;
+};
+
+static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index roundUp(Index x, Index y) { return ((((x) + (y)-1) / (y)) * (y)); }
+
+}  // namespace internal
+}  // namespace TensorSycl
+
+template <>
+struct PacketType<half, SyclDevice> {
+  typedef half type;
+  static const int size = 1;
+  enum {
+    HasAdd = 0,
+    HasSub = 0,
+    HasMul = 0,
+    HasNegate = 0,
+    HasAbs = 0,
+    HasArg = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasConj = 0,
+    HasSetLinear = 0,
+    HasBlend = 0
+  };
+};
+template <typename Scalar>
+struct PacketType<Scalar, SyclDevice> : internal::default_packet_traits {
+  typedef Scalar type;
+  typedef Scalar half;
+  enum {
+    Vectorizable = 0,
+    size = 1,
+    AlignedOnScalar = 0,
+  };
+  enum {
+    HasAdd = 0,
+    HasSub = 0,
+    HasMul = 0,
+    HasNegate = 0,
+    HasAbs = 0,
+    HasAbs2 = 0,
+    HasMin = 0,
+    HasMax = 0,
+    HasConj = 0,
+    HasSetLinear = 0
+  };
+};
+
+template <typename Scalar>
+struct PacketType<Scalar, const SyclDevice> : PacketType<Scalar, SyclDevice> {};
+
+#ifndef EIGEN_DONT_VECTORIZE_SYCL
+#define PACKET_TYPE(CVQual, Type, val, lengths, DEV)                                 \
+  template <>                                                                        \
+  struct PacketType<CVQual Type, DEV> : internal::sycl_packet_traits<val, lengths> { \
+    typedef typename internal::packet_traits<Type>::type type;                       \
+    typedef typename internal::packet_traits<Type>::half half;                       \
+  };
+
+PACKET_TYPE(const, float, 1, 4, SyclDevice)
+PACKET_TYPE(, float, 1, 4, SyclDevice)
+PACKET_TYPE(const, float, 1, 4, const SyclDevice)
+PACKET_TYPE(, float, 1, 4, const SyclDevice)
+
+PACKET_TYPE(const, double, 0, 2, SyclDevice)
+PACKET_TYPE(, double, 0, 2, SyclDevice)
+PACKET_TYPE(const, double, 0, 2, const SyclDevice)
+PACKET_TYPE(, double, 0, 2, const SyclDevice)
+#undef PACKET_TYPE
+
+template <>
+struct PacketType<half, const SyclDevice> : PacketType<half, SyclDevice> {};
+template <>
+struct PacketType<const half, const SyclDevice> : PacketType<half, SyclDevice> {};
+#endif
+#endif
+
+// Pair mimics std::pair but works on e.g. nvcc.
+template <typename U, typename V>
+struct Pair {
+ public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW
+
+  U first;
+  V second;
+
+  typedef U first_type;
+  typedef V second_type;
+
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair() : first(), second() {}
+
+  constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Pair(const U& f, const V& s) : first(f), second(s) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void swap(Pair& rhs) {
+    using numext::swap;
+    swap(first, rhs.first);
+    swap(second, rhs.second);
+  }
+};
+
+template <typename U, typename V>
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator==(const Pair<U, V>& x, const Pair<U, V>& y) {
+  return (x.first == y.first && x.second == y.second);
+}
+
+template <typename U, typename V>
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool operator!=(const Pair<U, V>& x, const Pair<U, V>& y) {
+  return !(x == y);
+}
+
+// Can't use std::pairs on cuda devices
+template <typename Idx>
+struct IndexPair {
+  constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {}
+  constexpr EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {}
+
+  EIGEN_DEVICE_FUNC void set(IndexPair<Idx> val) {
+    first = val.first;
+    second = val.second;
+  }
+
+  Idx first;
+  Idx second;
+};
+
+namespace internal {
+
+template <typename IndexType, typename Index, Index First, Index... Is>
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, 1 + sizeof...(Is)> customIndices2Array(
+    IndexType& idx, numeric_list<Index, First, Is...>) {
+  return {static_cast<Index>(idx[First]), static_cast<Index>(idx[Is])...};
+}
+template <typename IndexType, typename Index>
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, 0> customIndices2Array(IndexType&, numeric_list<Index>) {
+  return array<Index, 0>();
+}
+
+/** Make an array (for index/dimensions) out of a custom index */
+template <typename Index, std::size_t NumIndices, typename IndexType>
+constexpr EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, NumIndices> customIndices2Array(IndexType& idx) {
+  return customIndices2Array(idx, typename gen_numeric_list<Index, NumIndices>::type{});
+}
+
+template <typename B, typename D>
+struct is_base_of {
+  typedef char (&yes)[1];
+  typedef char (&no)[2];
+
+  template <typename BB, typename DD>
+  struct Host {
+    operator BB*() const;
+    operator DD*();
+  };
+
+  template <typename T>
+  static yes check(D*, T);
+  static no check(B*, int);
+
+  static const bool value = sizeof(check(Host<B, D>(), int())) == sizeof(yes);
+};
+
+}  // namespace internal
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_META_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
new file mode 100644
index 00000000..3a697d3a
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h
@@ -0,0 +1,982 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename NewDimensions, typename XprType>
+struct traits<TensorReshapingOp<NewDimensions, XprType>> : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = array_size<NewDimensions>::value;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+};
+
+template <typename NewDimensions, typename XprType>
+struct eval<TensorReshapingOp<NewDimensions, XprType>, Eigen::Dense> {
+  typedef const TensorReshapingOp<NewDimensions, XprType> EIGEN_DEVICE_REF type;
+};
+
+template <typename NewDimensions, typename XprType>
+struct nested<TensorReshapingOp<NewDimensions, XprType>, 1,
+              typename eval<TensorReshapingOp<NewDimensions, XprType>>::type> {
+  typedef TensorReshapingOp<NewDimensions, XprType> type;
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor reshaping class.
+ */
+template <typename NewDimensions, typename XprType>
+class TensorReshapingOp : public TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors> {
+ public:
+  typedef TensorBase<TensorReshapingOp<NewDimensions, XprType>, WriteAccessors> Base;
+  typedef typename Eigen::internal::traits<TensorReshapingOp>::Scalar Scalar;
+  typedef std::remove_const_t<typename XprType::CoeffReturnType> CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorReshapingOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorReshapingOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorReshapingOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReshapingOp(const XprType& expr, const NewDimensions& dims)
+      : m_xpr(expr), m_dims(dims) {}
+
+  EIGEN_DEVICE_FUNC const NewDimensions& dimensions() const { return m_dims; }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+  EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReshapingOp)
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const NewDimensions m_dims;
+};
+
+// Eval as rvalue
+template <typename NewDimensions, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> {
+  typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
+  typedef NewDimensions Dimensions;
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef StorageMemory<std::remove_const_t<CoeffReturnType>, Device> ConstCastStorage;
+
+  static constexpr int NumOutputDims = internal::array_size<Dimensions>::value;
+  static constexpr int NumInputDims =
+      internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+
+  enum ReshapingKind {
+    // We do not use layout information to determine reshaping kind.
+    // Depending on the layout `N` can be inner or outer dimension.
+    OneByN = 0,  // expr.reshape(1, N)
+    NByOne = 1,  // expr.reshape(N, 1)
+    Runtime = 2  // Reshape dimensions are dynamic (specified at runtime).
+  };
+
+  // clang-format off
+  static const ReshapingKind kind =
+        (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/0, /*value=*/1)) ? OneByN
+      : (NumOutputDims == 2 && internal::index_statically_eq<NewDimensions>(/*index=*/1, /*value=*/1)) ? NByOne
+      : Runtime;
+  // clang-format on
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    // For trivial reshapes with raw access to underlying data we will provide
+    // zero overhead block access.
+    // TODO(ezhulenev): Consider adding block access without raw access?
+    BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess && NumInputDims > 0 && NumOutputDims > 0,
+    PreferBlockAccess = false,
+    CoordAccess = false,  // to be implemented
+    RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+  };
+
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumOutputDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumOutputDims, Layout, Index> TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_dimensions(op.dimensions()) {
+    // The total size of the reshaped tensor must be equal to the total size
+    // of the input tensor.
+    eigen_assert(internal::array_prod(m_impl.dimensions()) == internal::array_prod(op.dimensions()));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(data, std::move(done));
+  }
+#endif
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) { return m_impl.evalSubExprsIfNeeded(data); }
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_impl.coeff(index); }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    return m_impl.template packet<LoadMode>(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return m_impl.costPerCoeff(vectorized);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    return internal::TensorBlockResourceRequirements::any();
+  }
+
+  // required in block(OutputTensorBlock* output_block) const
+  // For C++03 compatibility this must be defined outside the method
+  struct BlockIteratorState {
+    Index stride;
+    Index span;
+    Index size;
+    Index count;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    eigen_assert(m_impl.data() != NULL);
+    eigen_assert((kind == Runtime) || (kind == OneByN && desc.dimensions()[0] == 1) ||
+                 (kind == NByOne && desc.dimensions()[1] == 1));
+
+    if (kind == OneByN || kind == NByOne) {
+      // We can guarantee at compile time that block is just a contiguous slice
+      // of the underlying expression memory buffer.
+      return TensorBlock(internal::TensorBlockKind::kView, m_impl.data() + desc.offset(), desc.dimensions());
+    } else {
+      // This will do additional runtime checks, and in the end it might be also
+      // a view, or it might be a block materialized in the temporary buffer.
+      return TensorBlock::materialize(m_impl.data(), m_dimensions, desc, scratch);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return constCast(m_impl.data()); }
+
+  EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+ protected:
+  TensorEvaluator<ArgType, Device> m_impl;
+  NewDimensions m_dimensions;
+};
+
+// Eval as lvalue
+template <typename NewDimensions, typename ArgType, typename Device>
+struct TensorEvaluator<TensorReshapingOp<NewDimensions, ArgType>, Device>
+    : public TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device>
+
+{
+  typedef TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> Base;
+  typedef TensorReshapingOp<NewDimensions, ArgType> XprType;
+  typedef NewDimensions Dimensions;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
+    PreferBlockAccess = false,
+    CoordAccess = false,  // to be implemented
+    RawAccess = TensorEvaluator<ArgType, Device>::RawAccess
+  };
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {}
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<TensorEvaluator::NumOutputDims, Index> TensorBlockDesc;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const {
+    return this->m_impl.coeffRef(index);
+  }
+
+  template <int StoreMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const {
+    this->m_impl.template writePacket<StoreMode>(index, x);
+  }
+
+  template <typename TensorBlock>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) {
+    eigen_assert(this->m_impl.data() != NULL);
+
+    typedef typename TensorBlock::XprType TensorBlockExpr;
+    typedef internal::TensorBlockAssignment<Scalar, TensorEvaluator::NumOutputDims, TensorBlockExpr, Index>
+        TensorBlockAssign;
+
+    TensorBlockAssign::Run(TensorBlockAssign::target(desc.dimensions(), internal::strides<Layout>(this->dimensions()),
+                                                     this->m_impl.data(), desc.offset()),
+                           block.expr());
+  }
+};
+
+/** \class TensorSlicing
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor slicing class.
+ *
+ *
+ */
+namespace internal {
+template <typename StartIndices, typename Sizes, typename XprType>
+struct traits<TensorSlicingOp<StartIndices, Sizes, XprType>> : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = array_size<StartIndices>::value;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+};
+
+template <typename StartIndices, typename Sizes, typename XprType>
+struct eval<TensorSlicingOp<StartIndices, Sizes, XprType>, Eigen::Dense> {
+  typedef const TensorSlicingOp<StartIndices, Sizes, XprType> EIGEN_DEVICE_REF type;
+};
+
+template <typename StartIndices, typename Sizes, typename XprType>
+struct nested<TensorSlicingOp<StartIndices, Sizes, XprType>, 1,
+              typename eval<TensorSlicingOp<StartIndices, Sizes, XprType>>::type> {
+  typedef TensorSlicingOp<StartIndices, Sizes, XprType> type;
+};
+
+}  // end namespace internal
+
+template <typename StartIndices, typename Sizes, typename XprType>
+class TensorSlicingOp : public TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType>> {
+ public:
+  typedef TensorBase<TensorSlicingOp<StartIndices, Sizes, XprType>> Base;
+  typedef typename Eigen::internal::traits<TensorSlicingOp>::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorSlicingOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorSlicingOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorSlicingOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorSlicingOp(const XprType& expr, const StartIndices& indices,
+                                                        const Sizes& sizes)
+      : m_xpr(expr), m_indices(indices), m_sizes(sizes) {}
+
+  EIGEN_DEVICE_FUNC const StartIndices& startIndices() const { return m_indices; }
+  EIGEN_DEVICE_FUNC const Sizes& sizes() const { return m_sizes; }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+  EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorSlicingOp)
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const StartIndices m_indices;
+  const Sizes m_sizes;
+};
+
+namespace internal {
+
+// Fixme: figure out the exact threshold
+template <typename Index, typename Device, bool BlockAccess>
+struct MemcpyTriggerForSlicing {
+  EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const Device& device) : threshold_(2 * device.numThreads()) {}
+  EIGEN_DEVICE_FUNC bool operator()(Index total, Index contiguous) const {
+    const bool prefer_block_evaluation = BlockAccess && total > 32 * 1024;
+    return !prefer_block_evaluation && contiguous > threshold_;
+  }
+
+ private:
+  Index threshold_;
+};
+
+// It is very expensive to start the memcpy kernel on GPU: we therefore only
+// use it for large copies.
+#ifdef EIGEN_USE_GPU
+template <typename Index, bool BlockAccess>
+struct MemcpyTriggerForSlicing<Index, GpuDevice, BlockAccess> {
+  EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const GpuDevice&) {}
+  EIGEN_DEVICE_FUNC bool operator()(Index, Index contiguous) const { return contiguous > 4 * 1024 * 1024; }
+};
+#endif
+
+// It is very expensive to start the memcpy kernel on GPU: we therefore only
+// use it for large copies.
+#ifdef EIGEN_USE_SYCL
+template <typename Index, bool BlockAccess>
+struct MemcpyTriggerForSlicing<Index, Eigen::SyclDevice, BlockAccess> {
+  EIGEN_DEVICE_FUNC MemcpyTriggerForSlicing(const SyclDevice&) {}
+  EIGEN_DEVICE_FUNC bool operator()(Index, Index contiguous) const { return contiguous > 4 * 1024 * 1024; }
+};
+#endif
+
+}  // namespace internal
+
+// Eval as rvalue
+template <typename StartIndices, typename Sizes, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> {
+  typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
+  static constexpr int NumDims = internal::array_size<Sizes>::value;
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef Sizes Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef StorageMemory<std::remove_const_t<CoeffReturnType>, Device> ConstCastStorage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    // Alignment can't be guaranteed at compile time since it depends on the
+    // slice offsets and sizes.
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess &&
+                  // FIXME: Temporary workaround for bug in slicing of bool tensors.
+                  !internal::is_same<std::remove_const_t<Scalar>, bool>::value,
+    PreferBlockAccess = true,
+    CoordAccess = false,
+    RawAccess = false
+  };
+
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  // Tensor slicing does not change the block type.
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_device(device), m_dimensions(op.sizes()), m_offsets(op.startIndices()) {
+    m_is_identity = true;
+    for (int i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+      eigen_assert(m_impl.dimensions()[i] >= op.sizes()[i] + op.startIndices()[i]);
+      if (m_impl.dimensions()[i] != op.sizes()[i] || op.startIndices()[i] != 0) {
+        m_is_identity = false;
+      }
+    }
+
+    // No strides for scalars.
+    if (NumDims == 0) return;
+
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    const Sizes& output_dims = op.sizes();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_inputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
+      }
+
+      // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed.
+      m_outputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_outputStrides[i] = m_outputStrides[i - 1] * output_dims[i - 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
+      }
+    } else {
+      m_inputStrides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
+      }
+
+      // Don't initialize m_fastOutputStrides[NumDims-1] since it won't ever be accessed.
+      m_outputStrides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_outputStrides[i] = m_outputStrides[i + 1] * output_dims[i + 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    if (!NumTraits<std::remove_const_t<Scalar>>::RequireInitialization && data && m_impl.data()) {
+      Index contiguous_values = 1;
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        for (int i = 0; i < NumDims; ++i) {
+          contiguous_values *= dimensions()[i];
+          if (dimensions()[i] != m_impl.dimensions()[i]) {
+            break;
+          }
+        }
+      } else {
+        for (int i = NumDims - 1; i >= 0; --i) {
+          contiguous_values *= dimensions()[i];
+          if (dimensions()[i] != m_impl.dimensions()[i]) {
+            break;
+          }
+        }
+      }
+      // Use memcpy if it's going to be faster than using the regular evaluation.
+      const internal::MemcpyTriggerForSlicing<Index, Device, BlockAccess> trigger(m_device);
+      if (trigger(internal::array_prod(dimensions()), contiguous_values)) {
+        EvaluatorPointerType src = (EvaluatorPointerType)m_impl.data();
+        for (Index i = 0; i < internal::array_prod(dimensions()); i += contiguous_values) {
+          Index offset = srcCoeff(i);
+          m_device.memcpy((void*)(m_device.get(data + i)), m_device.get(src + offset),
+                          contiguous_values * sizeof(Scalar));
+        }
+        return false;
+      }
+    }
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType /*data*/, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    if (m_is_identity) {
+      return m_impl.coeff(index);
+    } else {
+      return m_impl.coeff(srcCoeff(index));
+    }
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
+    EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index + packetSize - 1 < internal::array_prod(dimensions()));
+
+    if (m_is_identity) {
+      return m_impl.template packet<LoadMode>(index);
+    }
+
+    Index inputIndices[] = {0, 0};
+    Index indices[] = {index, index + packetSize - 1};
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx0 = indices[0] / m_fastOutputStrides[i];
+        const Index idx1 = indices[1] / m_fastOutputStrides[i];
+        inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i];
+        inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i];
+        indices[0] -= idx0 * m_outputStrides[i];
+        indices[1] -= idx1 * m_outputStrides[i];
+      }
+      inputIndices[0] += (indices[0] + m_offsets[0]);
+      inputIndices[1] += (indices[1] + m_offsets[0]);
+    } else {
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx0 = indices[0] / m_fastOutputStrides[i];
+        const Index idx1 = indices[1] / m_fastOutputStrides[i];
+        inputIndices[0] += (idx0 + m_offsets[i]) * m_inputStrides[i];
+        inputIndices[1] += (idx1 + m_offsets[i]) * m_inputStrides[i];
+        indices[0] -= idx0 * m_outputStrides[i];
+        indices[1] -= idx1 * m_outputStrides[i];
+      }
+      inputIndices[0] += (indices[0] + m_offsets[NumDims - 1]);
+      inputIndices[1] += (indices[1] + m_offsets[NumDims - 1]);
+    }
+    if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+      PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
+      return rslt;
+    } else {
+      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[packetSize];
+      values[0] = m_impl.coeff(inputIndices[0]);
+      values[packetSize - 1] = m_impl.coeff(inputIndices[1]);
+      EIGEN_UNROLL_LOOP
+      for (int i = 1; i < packetSize - 1; ++i) {
+        values[i] = coeff(index + i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.lastLevelCacheSize();
+    return internal::TensorBlockResourceRequirements::merge(
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size), m_impl.getResourceRequirements());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    TensorBlockDesc arg_desc = desc.WithOffset(srcCoeff(desc.offset()));
+    TensorBlock block = m_impl.block(arg_desc, scratch);
+    if (!arg_desc.HasDestinationBuffer()) desc.DropDestinationBuffer();
+    return block;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const {
+    typename Storage::Type result = constCast(m_impl.data());
+    if (result) {
+      Index offset = 0;
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        for (int i = 0; i < NumDims; ++i) {
+          if (m_dimensions[i] != m_impl.dimensions()[i]) {
+            offset += m_offsets[i] * m_inputStrides[i];
+            for (int j = i + 1; j < NumDims; ++j) {
+              if (m_dimensions[j] > 1) {
+                return NULL;
+              }
+              offset += m_offsets[j] * m_inputStrides[j];
+            }
+            break;
+          }
+        }
+      } else {
+        for (int i = NumDims - 1; i >= 0; --i) {
+          if (m_dimensions[i] != m_impl.dimensions()[i]) {
+            offset += m_offsets[i] * m_inputStrides[i];
+            for (int j = i - 1; j >= 0; --j) {
+              if (m_dimensions[j] > 1) {
+                return NULL;
+              }
+              offset += m_offsets[j] * m_inputStrides[j];
+            }
+            break;
+          }
+        }
+      }
+      return result + offset;
+    }
+    return NULL;
+  }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
+    Index inputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = index / m_fastOutputStrides[i];
+        inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      inputIndex += (index + m_offsets[0]);
+    } else {
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = index / m_fastOutputStrides[i];
+        inputIndex += (idx + m_offsets[i]) * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      inputIndex += (index + m_offsets[NumDims - 1]);
+    }
+    return inputIndex;
+  }
+
+  array<Index, NumDims> m_outputStrides;
+  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
+  array<Index, NumDims> m_inputStrides;
+  TensorEvaluator<ArgType, Device> m_impl;
+  const Device EIGEN_DEVICE_REF m_device;
+  Dimensions m_dimensions;
+  bool m_is_identity;
+  const StartIndices m_offsets;
+};
+
+// Eval as lvalue
+template <typename StartIndices, typename Sizes, typename ArgType, typename Device>
+struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device>
+    : public TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> {
+  typedef TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> Base;
+  typedef TensorSlicingOp<StartIndices, Sizes, ArgType> XprType;
+  static constexpr int NumDims = internal::array_size<Sizes>::value;
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef Sizes Dimensions;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
+    PreferBlockAccess = true,
+    CoordAccess = false,
+    RawAccess = (NumDims == 1) & TensorEvaluator<ArgType, Device>::RawAccess
+  };
+
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const {
+    if (this->m_is_identity) {
+      return this->m_impl.coeffRef(index);
+    } else {
+      return this->m_impl.coeffRef(this->srcCoeff(index));
+    }
+  }
+
+  template <int StoreMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const {
+    if (this->m_is_identity) {
+      this->m_impl.template writePacket<StoreMode>(index, x);
+      return;
+    }
+
+    const int packetSize = PacketType<CoeffReturnType, Device>::size;
+    Index inputIndices[] = {0, 0};
+    Index indices[] = {index, index + packetSize - 1};
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
+        const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
+        inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i];
+        inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i];
+        indices[0] -= idx0 * this->m_outputStrides[i];
+        indices[1] -= idx1 * this->m_outputStrides[i];
+      }
+      inputIndices[0] += (indices[0] + this->m_offsets[0]);
+      inputIndices[1] += (indices[1] + this->m_offsets[0]);
+    } else {
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx0 = indices[0] / this->m_fastOutputStrides[i];
+        const Index idx1 = indices[1] / this->m_fastOutputStrides[i];
+        inputIndices[0] += (idx0 + this->m_offsets[i]) * this->m_inputStrides[i];
+        inputIndices[1] += (idx1 + this->m_offsets[i]) * this->m_inputStrides[i];
+        indices[0] -= idx0 * this->m_outputStrides[i];
+        indices[1] -= idx1 * this->m_outputStrides[i];
+      }
+      inputIndices[0] += (indices[0] + this->m_offsets[NumDims - 1]);
+      inputIndices[1] += (indices[1] + this->m_offsets[NumDims - 1]);
+    }
+    if (inputIndices[1] - inputIndices[0] == packetSize - 1) {
+      this->m_impl.template writePacket<StoreMode>(inputIndices[0], x);
+    } else {
+      EIGEN_ALIGN_MAX CoeffReturnType values[packetSize];
+      internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+      this->m_impl.coeffRef(inputIndices[0]) = values[0];
+      this->m_impl.coeffRef(inputIndices[1]) = values[packetSize - 1];
+      EIGEN_UNROLL_LOOP
+      for (int i = 1; i < packetSize - 1; ++i) {
+        this->coeffRef(index + i) = values[i];
+      }
+    }
+  }
+
+  template <typename TensorBlock>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) {
+    TensorBlockDesc arg_desc = desc.WithOffset(this->srcCoeff(desc.offset()));
+    this->m_impl.writeBlock(arg_desc, block);
+  }
+};
+
+namespace internal {
+template <typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>> : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = array_size<StartIndices>::value;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+};
+
+template <typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Eigen::Dense> {
+  typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> EIGEN_DEVICE_REF type;
+};
+
+template <typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+struct nested<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, 1,
+              typename eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>>::type> {
+  typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> type;
+};
+
+}  // end namespace internal
+
+template <typename StartIndices, typename StopIndices, typename Strides, typename XprType>
+class TensorStridingSlicingOp
+    : public TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>> {
+ public:
+  typedef TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>> Base;
+  typedef typename internal::traits<TensorStridingSlicingOp>::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename internal::nested<TensorStridingSlicingOp>::type Nested;
+  typedef typename internal::traits<TensorStridingSlicingOp>::StorageKind StorageKind;
+  typedef typename internal::traits<TensorStridingSlicingOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp(const XprType& expr, const StartIndices& startIndices,
+                                                                const StopIndices& stopIndices, const Strides& strides)
+      : m_xpr(expr), m_startIndices(startIndices), m_stopIndices(stopIndices), m_strides(strides) {}
+
+  EIGEN_DEVICE_FUNC const StartIndices& startIndices() const { return m_startIndices; }
+  EIGEN_DEVICE_FUNC const StartIndices& stopIndices() const { return m_stopIndices; }
+  EIGEN_DEVICE_FUNC const StartIndices& strides() const { return m_strides; }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+  EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingSlicingOp)
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const StartIndices m_startIndices;
+  const StopIndices m_stopIndices;
+  const Strides m_strides;
+};
+
+// Eval as rvalue
+template <typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> {
+  typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
+  static constexpr int NumDims = internal::array_size<Strides>::value;
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef Strides Dimensions;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    // Alignment can't be guaranteed at compile time since it depends on the
+    // slice offsets and sizes.
+    IsAligned = false,
+    PacketAccess = false,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()) {
+    // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero
+    DSizes<Index, NumDims> startIndicesClamped, stopIndicesClamped;
+    for (ptrdiff_t i = 0; i < internal::array_size<Dimensions>::value; ++i) {
+      eigen_assert(m_strides[i] != 0 && "0 stride is invalid");
+      if (m_strides[i] > 0) {
+        startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]);
+        stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]);
+      } else {
+        /* implies m_strides[i] < 0 by assert */
+        startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1);
+        stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1);
+      }
+      m_startIndices[i] = startIndicesClamped[i];
+    }
+
+    typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
+    const InputDimensions& input_dims = m_impl.dimensions();
+
+    // compute output tensor shape
+    m_is_identity = true;
+    for (int i = 0; i < NumDims; i++) {
+      Index interval = stopIndicesClamped[i] - startIndicesClamped[i];
+      if (interval == 0 || ((interval < 0) != (m_strides[i] < 0))) {
+        m_dimensions[i] = 0;
+      } else {
+        m_dimensions[i] = (interval / m_strides[i]) + (interval % m_strides[i] != 0 ? 1 : 0);
+        eigen_assert(m_dimensions[i] >= 0);
+      }
+      if (m_strides[i] != 1 || interval != m_impl.dimensions()[i]) {
+        m_is_identity = false;
+      }
+    }
+
+    Strides output_dims = m_dimensions;
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_inputStrides[0] = m_strides[0];
+      m_offsets[0] = startIndicesClamped[0];
+      Index previousDimProduct = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        previousDimProduct *= input_dims[i - 1];
+        m_inputStrides[i] = previousDimProduct * m_strides[i];
+        m_offsets[i] = startIndicesClamped[i] * previousDimProduct;
+      }
+
+      // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed.
+      m_outputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_outputStrides[i] = m_outputStrides[i - 1] * output_dims[i - 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
+      }
+    } else {
+      m_inputStrides[NumDims - 1] = m_strides[NumDims - 1];
+      m_offsets[NumDims - 1] = startIndicesClamped[NumDims - 1];
+      Index previousDimProduct = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        previousDimProduct *= input_dims[i + 1];
+        m_inputStrides[i] = previousDimProduct * m_strides[i];
+        m_offsets[i] = startIndicesClamped[i] * previousDimProduct;
+      }
+
+      m_outputStrides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_outputStrides[i] = m_outputStrides[i + 1] * output_dims[i + 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : 1);
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    if (m_is_identity) {
+      return m_impl.coeff(index);
+    } else {
+      return m_impl.coeff(srcCoeff(index));
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, m_is_identity ? 1 : NumDims);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Storage::Type data() const { return NULL; }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
+    Index inputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
+      for (int i = NumDims - 1; i >= 0; --i) {
+        const Index idx = index / m_fastOutputStrides[i];
+        inputIndex += idx * m_inputStrides[i] + m_offsets[i];
+        index -= idx * m_outputStrides[i];
+      }
+    } else {
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < NumDims; ++i) {
+        const Index idx = index / m_fastOutputStrides[i];
+        inputIndex += idx * m_inputStrides[i] + m_offsets[i];
+        index -= idx * m_outputStrides[i];
+      }
+    }
+    return inputIndex;
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) {
+#ifndef SYCL_DEVICE_ONLY
+    return numext::maxi(min, numext::mini(max, value));
+#else
+    return cl::sycl::clamp(value, min, max);
+#endif
+  }
+
+  array<Index, NumDims> m_outputStrides;
+  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
+  array<Index, NumDims> m_inputStrides;
+  bool m_is_identity;
+  TensorEvaluator<ArgType, Device> m_impl;
+  const Device EIGEN_DEVICE_REF m_device;
+  DSizes<Index, NumDims> m_startIndices;  // clamped startIndices
+  DSizes<Index, NumDims> m_dimensions;
+  DSizes<Index, NumDims> m_offsets;  // offset in a flattened shape
+  const Strides m_strides;
+};
+
+// Eval as lvalue
+template <typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device>
+    : public TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> {
+  typedef TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> Base;
+  typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType;
+  static constexpr int NumDims = internal::array_size<Strides>::value;
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = false,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess,
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {}
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef Strides Dimensions;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const {
+    if (this->m_is_identity) {
+      return this->m_impl.coeffRef(index);
+    } else {
+      return this->m_impl.coeffRef(this->srcCoeff(index));
+    }
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
new file mode 100644
index 00000000..7b2db491
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h
@@ -0,0 +1,620 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename PaddingDimensions, typename XprType>
+struct traits<TensorPaddingOp<PaddingDimensions, XprType> > : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+};
+
+template <typename PaddingDimensions, typename XprType>
+struct eval<TensorPaddingOp<PaddingDimensions, XprType>, Eigen::Dense> {
+  typedef const TensorPaddingOp<PaddingDimensions, XprType>& type;
+};
+
+template <typename PaddingDimensions, typename XprType>
+struct nested<TensorPaddingOp<PaddingDimensions, XprType>, 1,
+              typename eval<TensorPaddingOp<PaddingDimensions, XprType> >::type> {
+  typedef TensorPaddingOp<PaddingDimensions, XprType> type;
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor padding class.
+ * At the moment only padding with a constant value is supported.
+ *
+ */
+template <typename PaddingDimensions, typename XprType>
+class TensorPaddingOp : public TensorBase<TensorPaddingOp<PaddingDimensions, XprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorPaddingOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorPaddingOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorPaddingOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorPaddingOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPaddingOp(const XprType& expr, const PaddingDimensions& padding_dims,
+                                                        const Scalar padding_value)
+      : m_xpr(expr), m_padding_dims(padding_dims), m_padding_value(padding_value) {}
+
+  EIGEN_DEVICE_FUNC const PaddingDimensions& padding() const { return m_padding_dims; }
+  EIGEN_DEVICE_FUNC Scalar padding_value() const { return m_padding_value; }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const PaddingDimensions m_padding_dims;
+  const Scalar m_padding_value;
+};
+
+// Eval as rvalue
+template <typename PaddingDimensions, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device> {
+  typedef TensorPaddingOp<PaddingDimensions, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static constexpr int NumDims = internal::array_size<PaddingDimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = true,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
+    PreferBlockAccess = true,
+    CoordAccess = true,
+    RawAccess = false
+  };
+
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims, Layout, Index> TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_padding(op.padding()), m_paddingValue(op.padding_value()), m_device(device) {
+    // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead
+    // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector
+    // of 1 element first and then pad.
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    // Compute dimensions
+    m_dimensions = m_impl.dimensions();
+    for (int i = 0; i < NumDims; ++i) {
+      m_dimensions[i] += m_padding[i].first + m_padding[i].second;
+    }
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_inputStrides[0] = 1;
+      m_outputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
+        m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+      }
+      m_outputStrides[NumDims] = m_outputStrides[NumDims - 1] * m_dimensions[NumDims - 1];
+    } else {
+      m_inputStrides[NumDims - 1] = 1;
+      m_outputStrides[NumDims] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
+        m_outputStrides[i + 1] = m_outputStrides[i + 2] * m_dimensions[i + 1];
+      }
+      m_outputStrides[0] = m_outputStrides[1] * m_dimensions[0];
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    eigen_assert(index < dimensions().TotalSize());
+    Index inputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = index / m_outputStrides[i];
+        if (isPaddingAtIndexForDim(idx, i)) {
+          return m_paddingValue;
+        }
+        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      if (isPaddingAtIndexForDim(index, 0)) {
+        return m_paddingValue;
+      }
+      inputIndex += (index - m_padding[0].first);
+    } else {
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = index / m_outputStrides[i + 1];
+        if (isPaddingAtIndexForDim(idx, i)) {
+          return m_paddingValue;
+        }
+        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+        index -= idx * m_outputStrides[i + 1];
+      }
+      if (isPaddingAtIndexForDim(index, NumDims - 1)) {
+        return m_paddingValue;
+      }
+      inputIndex += (index - m_padding[NumDims - 1].first);
+    }
+    return m_impl.coeff(inputIndex);
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      return packetColMajor(index);
+    }
+    return packetRowMajor(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    TensorOpCost cost = m_impl.costPerCoeff(vectorized);
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < NumDims; ++i) updateCostPerDimension(cost, i, i == 0);
+    } else {
+      EIGEN_UNROLL_LOOP
+      for (int i = NumDims - 1; i >= 0; --i) updateCostPerDimension(cost, i, i == NumDims - 1);
+    }
+    return cost;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.lastLevelCacheSize();
+    return internal::TensorBlockResourceRequirements::merge(
+        internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size), m_impl.getResourceRequirements());
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    // If one of the dimensions is zero, return empty block view.
+    if (desc.size() == 0) {
+      return TensorBlock(internal::TensorBlockKind::kView, NULL, desc.dimensions());
+    }
+
+    static const bool IsColMajor = Layout == static_cast<int>(ColMajor);
+    const int inner_dim_idx = IsColMajor ? 0 : NumDims - 1;
+
+    Index offset = desc.offset();
+
+    // Compute offsets in the output tensor corresponding to the desc.offset().
+    DSizes<Index, NumDims> output_offsets;
+    for (int i = NumDims - 1; i > 0; --i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      const int stride_dim = IsColMajor ? dim : dim + 1;
+      output_offsets[dim] = offset / m_outputStrides[stride_dim];
+      offset -= output_offsets[dim] * m_outputStrides[stride_dim];
+    }
+    output_offsets[inner_dim_idx] = offset;
+
+    // Offsets in the input corresponding to output offsets.
+    DSizes<Index, NumDims> input_offsets = output_offsets;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      input_offsets[dim] = input_offsets[dim] - m_padding[dim].first;
+    }
+
+    // Compute offset in the input buffer (at this point it might be illegal and
+    // point outside of the input buffer, because we don't check for negative
+    // offsets, it will be autocorrected in the block iteration loop below).
+    Index input_offset = 0;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      input_offset += input_offsets[dim] * m_inputStrides[dim];
+    }
+
+    // Destination buffer and scratch buffer both indexed from 0 and have the
+    // same dimensions as the requested block (for destination buffer this
+    // property is guaranteed by `desc.destination()`).
+    Index output_offset = 0;
+    const DSizes<Index, NumDims> output_strides = internal::strides<Layout>(desc.dimensions());
+
+    // NOTE(ezhulenev): We initialize bock iteration state for `NumDims - 1`
+    // dimensions, skipping innermost dimension. In theory it should be possible
+    // to squeeze matching innermost dimensions, however in practice that did
+    // not show any improvements in benchmarks. Also in practice first outer
+    // dimension usually has padding, and will prevent squeezing.
+
+    // Initialize output block iterator state. Dimension in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims - 1> it;
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const int dim = IsColMajor ? i + 1 : NumDims - i - 2;
+      it[i].count = 0;
+      it[i].size = desc.dimension(dim);
+
+      it[i].input_stride = m_inputStrides[dim];
+      it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+      it[i].output_stride = output_strides[dim];
+      it[i].output_span = it[i].output_stride * (it[i].size - 1);
+    }
+
+    const Index input_inner_dim_size = static_cast<Index>(m_impl.dimensions()[inner_dim_idx]);
+
+    // Total output size.
+    const Index output_size = desc.size();
+
+    // We will fill inner dimension of this size in the output. It might be
+    // larger than the inner dimension in the input, so we might have to pad
+    // before/after we copy values from the input inner dimension.
+    const Index output_inner_dim_size = desc.dimension(inner_dim_idx);
+
+    // How many values to fill with padding BEFORE reading from the input inner
+    // dimension.
+    const Index output_inner_pad_before_size =
+        input_offsets[inner_dim_idx] < 0
+            ? numext::mini(numext::abs(input_offsets[inner_dim_idx]), output_inner_dim_size)
+            : 0;
+
+    // How many values we can actually copy from the input inner dimension.
+    const Index output_inner_copy_size = numext::mini(
+        // Want to copy from input.
+        (output_inner_dim_size - output_inner_pad_before_size),
+        // Can copy from input.
+        numext::maxi(input_inner_dim_size - (input_offsets[inner_dim_idx] + output_inner_pad_before_size), Index(0)));
+
+    eigen_assert(output_inner_copy_size >= 0);
+
+    // How many values to fill with padding AFTER reading from the input inner
+    // dimension.
+    const Index output_inner_pad_after_size =
+        (output_inner_dim_size - output_inner_copy_size - output_inner_pad_before_size);
+
+    // Sanity check, sum of all sizes must be equal to the output size.
+    eigen_assert(output_inner_dim_size ==
+                 (output_inner_pad_before_size + output_inner_copy_size + output_inner_pad_after_size));
+
+    // Keep track of current coordinates and padding in the output.
+    DSizes<Index, NumDims> output_coord = output_offsets;
+    DSizes<Index, NumDims> output_padded;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = IsColMajor ? i : NumDims - i - 1;
+      output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+    }
+
+    typedef internal::StridedLinearBufferCopy<ScalarNoConst, Index> LinCopy;
+
+    // Prepare storage for the materialized padding result.
+    const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch);
+
+    // TODO(ezhulenev): Squeeze multiple non-padded inner dimensions into a
+    // single logical inner dimension.
+
+    // When possible we squeeze writes for the innermost (only if non-padded)
+    // dimension with the first padded dimension. This allows to reduce the
+    // number of calls to LinCopy and better utilize vector instructions.
+    const bool squeeze_writes = NumDims > 1 &&
+                                // inner dimension is not padded
+                                (input_inner_dim_size == m_dimensions[inner_dim_idx]) &&
+                                // and equal to the block inner dimension
+                                (input_inner_dim_size == output_inner_dim_size);
+
+    const int squeeze_dim = IsColMajor ? inner_dim_idx + 1 : inner_dim_idx - 1;
+
+    // Maximum coordinate on a squeeze dimension that we can write to.
+    const Index squeeze_max_coord =
+        squeeze_writes ? numext::mini(
+                             // max non-padded element in the input
+                             static_cast<Index>(m_dimensions[squeeze_dim] - m_padding[squeeze_dim].second),
+                             // max element in the output buffer
+                             static_cast<Index>(output_offsets[squeeze_dim] + desc.dimension(squeeze_dim)))
+                       : static_cast<Index>(0);
+
+    // Iterate copying data from `m_impl.data()` to the output buffer.
+    for (Index size = 0; size < output_size;) {
+      // Detect if we are in the padded region (exclude innermost dimension).
+      bool is_padded = false;
+      for (int j = 1; j < NumDims; ++j) {
+        const int dim = IsColMajor ? j : NumDims - j - 1;
+        is_padded = output_padded[dim];
+        if (is_padded) break;
+      }
+
+      if (is_padded) {
+        // Fill single innermost dimension with padding value.
+        size += output_inner_dim_size;
+
+        LinCopy::template Run<LinCopy::Kind::FillLinear>(typename LinCopy::Dst(output_offset, 1, block_storage.data()),
+                                                         typename LinCopy::Src(0, 0, &m_paddingValue),
+                                                         output_inner_dim_size);
+
+      } else if (squeeze_writes) {
+        // Squeeze multiple reads from innermost dimensions.
+        const Index squeeze_num = squeeze_max_coord - output_coord[squeeze_dim];
+        size += output_inner_dim_size * squeeze_num;
+
+        // Copy `squeeze_num` inner dimensions from input to output.
+        LinCopy::template Run<LinCopy::Kind::Linear>(typename LinCopy::Dst(output_offset, 1, block_storage.data()),
+                                                     typename LinCopy::Src(input_offset, 1, m_impl.data()),
+                                                     output_inner_dim_size * squeeze_num);
+
+        // Update iteration state for only `squeeze_num - 1` processed inner
+        // dimensions, because we have another iteration state update at the end
+        // of the loop that will update iteration state for the last inner
+        // processed dimension.
+        it[0].count += (squeeze_num - 1);
+        input_offset += it[0].input_stride * (squeeze_num - 1);
+        output_offset += it[0].output_stride * (squeeze_num - 1);
+        output_coord[squeeze_dim] += (squeeze_num - 1);
+
+      } else {
+        // Single read from innermost dimension.
+        size += output_inner_dim_size;
+
+        {  // Fill with padding before copying from input inner dimension.
+          const Index out = output_offset;
+
+          LinCopy::template Run<LinCopy::Kind::FillLinear>(typename LinCopy::Dst(out, 1, block_storage.data()),
+                                                           typename LinCopy::Src(0, 0, &m_paddingValue),
+                                                           output_inner_pad_before_size);
+        }
+
+        {  // Copy data from input inner dimension.
+          const Index out = output_offset + output_inner_pad_before_size;
+          const Index in = input_offset + output_inner_pad_before_size;
+
+          eigen_assert(output_inner_copy_size == 0 || m_impl.data() != NULL);
+
+          LinCopy::template Run<LinCopy::Kind::Linear>(typename LinCopy::Dst(out, 1, block_storage.data()),
+                                                       typename LinCopy::Src(in, 1, m_impl.data()),
+                                                       output_inner_copy_size);
+        }
+
+        {  // Fill with padding after copying from input inner dimension.
+          const Index out = output_offset + output_inner_pad_before_size + output_inner_copy_size;
+
+          LinCopy::template Run<LinCopy::Kind::FillLinear>(typename LinCopy::Dst(out, 1, block_storage.data()),
+                                                           typename LinCopy::Src(0, 0, &m_paddingValue),
+                                                           output_inner_pad_after_size);
+        }
+      }
+
+      for (int j = 0; j < NumDims - 1; ++j) {
+        const int dim = IsColMajor ? j + 1 : NumDims - j - 2;
+
+        if (++it[j].count < it[j].size) {
+          input_offset += it[j].input_stride;
+          output_offset += it[j].output_stride;
+          output_coord[dim] += 1;
+          output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+          break;
+        }
+        it[j].count = 0;
+        input_offset -= it[j].input_span;
+        output_offset -= it[j].output_span;
+        output_coord[dim] -= it[j].size - 1;
+        output_padded[dim] = isPaddingAtIndexForDim(output_coord[dim], dim);
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return NULL; }
+
+ private:
+  struct BlockIteratorState {
+    BlockIteratorState() : count(0), size(0), input_stride(0), input_span(0), output_stride(0), output_span(0) {}
+
+    Index count;
+    Index size;
+    Index input_stride;
+    Index input_span;
+    Index output_stride;
+    Index output_span;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim(Index index, int dim_index) const {
+    return (!internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0) &&
+            index < m_padding[dim_index].first) ||
+           (!internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0) &&
+            index >= m_dimensions[dim_index] - m_padding[dim_index].second);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero(int dim_index) const {
+    return internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero(int dim_index) const {
+    return internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0);
+  }
+
+  void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const {
+    const double in = static_cast<double>(m_impl.dimensions()[i]);
+    const double out = in + m_padding[i].first + m_padding[i].second;
+    if (out == 0) return;
+    const double reduction = in / out;
+    cost *= reduction;
+    if (first) {
+      cost += TensorOpCost(0, 0, 2 * TensorOpCost::AddCost<Index>() + reduction * (1 * TensorOpCost::AddCost<Index>()));
+    } else {
+      cost += TensorOpCost(0, 0,
+                           2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+                               reduction * (2 * TensorOpCost::MulCost<Index>() + 1 * TensorOpCost::DivCost<Index>()));
+    }
+  }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const {
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    const Index initialIndex = index;
+    Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
+    for (int i = NumDims - 1; i > 0; --i) {
+      const Index firstIdx = index;
+      const Index lastIdx = index + PacketSize - 1;
+      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i];
+      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i];
+      const Index lastPaddedRight = m_outputStrides[i + 1];
+
+      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
+        // all the coefficient are in the padding zone.
+        return internal::pset1<PacketReturnType>(m_paddingValue);
+      } else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
+        // all the coefficient are in the padding zone.
+        return internal::pset1<PacketReturnType>(m_paddingValue);
+      } else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) ||
+                 (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+        // all the coefficient are between the 2 padding zones.
+        const Index idx = index / m_outputStrides[i];
+        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      } else {
+        // Every other case
+        return packetWithPossibleZero(initialIndex);
+      }
+    }
+
+    const Index lastIdx = index + PacketSize - 1;
+    const Index firstIdx = index;
+    const Index lastPaddedLeft = m_padding[0].first;
+    const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second);
+    const Index lastPaddedRight = m_outputStrides[1];
+
+    if (!isLeftPaddingCompileTimeZero(0) && lastIdx < lastPaddedLeft) {
+      // all the coefficient are in the padding zone.
+      return internal::pset1<PacketReturnType>(m_paddingValue);
+    } else if (!isRightPaddingCompileTimeZero(0) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
+      // all the coefficient are in the padding zone.
+      return internal::pset1<PacketReturnType>(m_paddingValue);
+    } else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) ||
+               (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+      // all the coefficient are between the 2 padding zones.
+      inputIndex += (index - m_padding[0].first);
+      return m_impl.template packet<Unaligned>(inputIndex);
+    }
+    // Every other case
+    return packetWithPossibleZero(initialIndex);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const {
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    const Index initialIndex = index;
+    Index inputIndex = 0;
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < NumDims - 1; ++i) {
+      const Index firstIdx = index;
+      const Index lastIdx = index + PacketSize - 1;
+      const Index lastPaddedLeft = m_padding[i].first * m_outputStrides[i + 1];
+      const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i + 1];
+      const Index lastPaddedRight = m_outputStrides[i];
+
+      if (!isLeftPaddingCompileTimeZero(i) && lastIdx < lastPaddedLeft) {
+        // all the coefficient are in the padding zone.
+        return internal::pset1<PacketReturnType>(m_paddingValue);
+      } else if (!isRightPaddingCompileTimeZero(i) && firstIdx >= firstPaddedRight && lastIdx < lastPaddedRight) {
+        // all the coefficient are in the padding zone.
+        return internal::pset1<PacketReturnType>(m_paddingValue);
+      } else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) ||
+                 (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+        // all the coefficient are between the 2 padding zones.
+        const Index idx = index / m_outputStrides[i + 1];
+        inputIndex += (idx - m_padding[i].first) * m_inputStrides[i];
+        index -= idx * m_outputStrides[i + 1];
+      } else {
+        // Every other case
+        return packetWithPossibleZero(initialIndex);
+      }
+    }
+
+    const Index lastIdx = index + PacketSize - 1;
+    const Index firstIdx = index;
+    const Index lastPaddedLeft = m_padding[NumDims - 1].first;
+    const Index firstPaddedRight = (m_dimensions[NumDims - 1] - m_padding[NumDims - 1].second);
+    const Index lastPaddedRight = m_outputStrides[NumDims - 1];
+
+    if (!isLeftPaddingCompileTimeZero(NumDims - 1) && lastIdx < lastPaddedLeft) {
+      // all the coefficient are in the padding zone.
+      return internal::pset1<PacketReturnType>(m_paddingValue);
+    } else if (!isRightPaddingCompileTimeZero(NumDims - 1) && firstIdx >= firstPaddedRight &&
+               lastIdx < lastPaddedRight) {
+      // all the coefficient are in the padding zone.
+      return internal::pset1<PacketReturnType>(m_paddingValue);
+    } else if ((isLeftPaddingCompileTimeZero(NumDims - 1) && isRightPaddingCompileTimeZero(NumDims - 1)) ||
+               (firstIdx >= lastPaddedLeft && lastIdx < firstPaddedRight)) {
+      // all the coefficient are between the 2 padding zones.
+      inputIndex += (index - m_padding[NumDims - 1].first);
+      return m_impl.template packet<Unaligned>(inputIndex);
+    }
+    // Every other case
+    return packetWithPossibleZero(initialIndex);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const {
+    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = coeff(index + i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  Dimensions m_dimensions;
+  array<Index, NumDims + 1> m_outputStrides;
+  array<Index, NumDims> m_inputStrides;
+  TensorEvaluator<ArgType, Device> m_impl;
+  PaddingDimensions m_padding;
+
+  Scalar m_paddingValue;
+
+  const Device EIGEN_DEVICE_REF m_device;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_PADDING_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
new file mode 100644
index 00000000..6e767a7f
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h
@@ -0,0 +1,258 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
+#define EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename PatchDim, typename XprType>
+struct traits<TensorPatchOp<PatchDim, XprType> > : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions + 1;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+};
+
+template <typename PatchDim, typename XprType>
+struct eval<TensorPatchOp<PatchDim, XprType>, Eigen::Dense> {
+  typedef const TensorPatchOp<PatchDim, XprType>& type;
+};
+
+template <typename PatchDim, typename XprType>
+struct nested<TensorPatchOp<PatchDim, XprType>, 1, typename eval<TensorPatchOp<PatchDim, XprType> >::type> {
+  typedef TensorPatchOp<PatchDim, XprType> type;
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor patch class.
+ */
+template <typename PatchDim, typename XprType>
+class TensorPatchOp : public TensorBase<TensorPatchOp<PatchDim, XprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorPatchOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorPatchOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorPatchOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorPatchOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorPatchOp(const XprType& expr, const PatchDim& patch_dims)
+      : m_xpr(expr), m_patch_dims(patch_dims) {}
+
+  EIGEN_DEVICE_FUNC const PatchDim& patch_dims() const { return m_patch_dims; }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const PatchDim m_patch_dims;
+};
+
+// Eval as rvalue
+template <typename PatchDim, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> {
+  typedef TensorPatchOp<PatchDim, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value + 1;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    CoordAccess = false,
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) {
+    Index num_patches = 1;
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    const PatchDim& patch_dims = op.patch_dims();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        m_dimensions[i] = patch_dims[i];
+        num_patches *= (input_dims[i] - patch_dims[i] + 1);
+      }
+      m_dimensions[NumDims - 1] = num_patches;
+
+      m_inputStrides[0] = 1;
+      m_patchStrides[0] = 1;
+      for (int i = 1; i < NumDims - 1; ++i) {
+        m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
+        m_patchStrides[i] = m_patchStrides[i - 1] * (input_dims[i - 1] - patch_dims[i - 1] + 1);
+      }
+      m_outputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+      }
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        m_dimensions[i + 1] = patch_dims[i];
+        num_patches *= (input_dims[i] - patch_dims[i] + 1);
+      }
+      m_dimensions[0] = num_patches;
+
+      m_inputStrides[NumDims - 2] = 1;
+      m_patchStrides[NumDims - 2] = 1;
+      for (int i = NumDims - 3; i >= 0; --i) {
+        m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
+        m_patchStrides[i] = m_patchStrides[i + 1] * (input_dims[i + 1] - patch_dims[i + 1] + 1);
+      }
+      m_outputStrides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
+    // Find the location of the first element of the patch.
+    Index patchIndex = index / m_outputStrides[output_stride_index];
+    // Find the offset of the element wrt the location of the first element.
+    Index patchOffset = index - patchIndex * m_outputStrides[output_stride_index];
+    Index inputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
+      for (int i = NumDims - 2; i > 0; --i) {
+        const Index patchIdx = patchIndex / m_patchStrides[i];
+        patchIndex -= patchIdx * m_patchStrides[i];
+        const Index offsetIdx = patchOffset / m_outputStrides[i];
+        patchOffset -= offsetIdx * m_outputStrides[i];
+        inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
+      }
+    } else {
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < NumDims - 2; ++i) {
+        const Index patchIdx = patchIndex / m_patchStrides[i];
+        patchIndex -= patchIdx * m_patchStrides[i];
+        const Index offsetIdx = patchOffset / m_outputStrides[i + 1];
+        patchOffset -= offsetIdx * m_outputStrides[i + 1];
+        inputIndex += (patchIdx + offsetIdx) * m_inputStrides[i];
+      }
+    }
+    inputIndex += (patchIndex + patchOffset);
+    return m_impl.coeff(inputIndex);
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0;
+    Index indices[2] = {index, index + PacketSize - 1};
+    Index patchIndices[2] = {indices[0] / m_outputStrides[output_stride_index],
+                             indices[1] / m_outputStrides[output_stride_index]};
+    Index patchOffsets[2] = {indices[0] - patchIndices[0] * m_outputStrides[output_stride_index],
+                             indices[1] - patchIndices[1] * m_outputStrides[output_stride_index]};
+
+    Index inputIndices[2] = {0, 0};
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
+      for (int i = NumDims - 2; i > 0; --i) {
+        const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], patchIndices[1] / m_patchStrides[i]};
+        patchIndices[0] -= patchIdx[0] * m_patchStrides[i];
+        patchIndices[1] -= patchIdx[1] * m_patchStrides[i];
+
+        const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i], patchOffsets[1] / m_outputStrides[i]};
+        patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i];
+        patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i];
+
+        inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i];
+        inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
+      }
+    } else {
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < NumDims - 2; ++i) {
+        const Index patchIdx[2] = {patchIndices[0] / m_patchStrides[i], patchIndices[1] / m_patchStrides[i]};
+        patchIndices[0] -= patchIdx[0] * m_patchStrides[i];
+        patchIndices[1] -= patchIdx[1] * m_patchStrides[i];
+
+        const Index offsetIdx[2] = {patchOffsets[0] / m_outputStrides[i + 1], patchOffsets[1] / m_outputStrides[i + 1]};
+        patchOffsets[0] -= offsetIdx[0] * m_outputStrides[i + 1];
+        patchOffsets[1] -= offsetIdx[1] * m_outputStrides[i + 1];
+
+        inputIndices[0] += (patchIdx[0] + offsetIdx[0]) * m_inputStrides[i];
+        inputIndices[1] += (patchIdx[1] + offsetIdx[1]) * m_inputStrides[i];
+      }
+    }
+    inputIndices[0] += (patchIndices[0] + patchOffsets[0]);
+    inputIndices[1] += (patchIndices[1] + patchOffsets[1]);
+
+    if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
+      PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
+      return rslt;
+    } else {
+      EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
+      values[0] = m_impl.coeff(inputIndices[0]);
+      values[PacketSize - 1] = m_impl.coeff(inputIndices[1]);
+      EIGEN_UNROLL_LOOP
+      for (int i = 1; i < PacketSize - 1; ++i) {
+        values[i] = coeff(index + i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    const double compute_cost = NumDims * (TensorOpCost::DivCost<Index>() + TensorOpCost::MulCost<Index>() +
+                                           2 * TensorOpCost::AddCost<Index>());
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+ protected:
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_outputStrides;
+  array<Index, NumDims - 1> m_inputStrides;
+  array<Index, NumDims - 1> m_patchStrides;
+
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_PATCH_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
new file mode 100644
index 00000000..e9de9884
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h
@@ -0,0 +1,315 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2018 Mehdi Goli <eigen@codeplay.com> Codeplay Software Ltd.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
+#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t get_random_seed() {
+#if defined(EIGEN_GPU_COMPILE_PHASE)
+  // We don't support 3d kernels since we currently only use 1 and
+  // 2d kernels.
+  gpu_assert(threadIdx.z == 0);
+  return blockIdx.x * blockDim.x + threadIdx.x + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y);
+#else
+  // Rely on Eigen's random implementation.
+  return random<uint64_t>();
+#endif
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state, uint64_t stream) {
+  // TODO: Unify with the implementation in the non blocking thread pool.
+  uint64_t current = *state;
+  // Update the internal state
+  *state = current * 6364136223846793005ULL + (stream << 1 | 1);
+  // Generate the random output (using the PCG-XSH-RS scheme)
+  return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61)));
+}
+
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) {
+  seed = seed ? seed : get_random_seed();
+  return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
+}
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeUniform(uint64_t* state, uint64_t stream) {
+  unsigned rnd = PCG_XSH_RS_generator(state, stream);
+  return static_cast<T>(rnd);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool RandomToTypeUniform<bool>(uint64_t* state, uint64_t stream) {
+  unsigned rnd = PCG_XSH_RS_generator(state, stream);
+  return (rnd & 0x1) != 0;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state, uint64_t stream) {
+  // Generate 10 random bits for the mantissa, merge with exponent.
+  unsigned rnd = PCG_XSH_RS_generator(state, stream);
+  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x3ffu) | (static_cast<uint16_t>(15) << 10);
+  Eigen::half result = Eigen::numext::bit_cast<Eigen::half>(half_bits);
+  // Return the final result
+  return result - Eigen::half(1.0f);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::bfloat16 RandomToTypeUniform<Eigen::bfloat16>(uint64_t* state,
+                                                                                           uint64_t stream) {
+  // Generate 7 random bits for the mantissa, merge with exponent.
+  unsigned rnd = PCG_XSH_RS_generator(state, stream);
+  const uint16_t half_bits = static_cast<uint16_t>(rnd & 0x7fu) | (static_cast<uint16_t>(127) << 7);
+  Eigen::bfloat16 result = Eigen::numext::bit_cast<Eigen::bfloat16>(half_bits);
+  // Return the final result
+  return result - Eigen::bfloat16(1.0f);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float RandomToTypeUniform<float>(uint64_t* state, uint64_t stream) {
+  typedef union {
+    uint32_t raw;
+    float fp;
+  } internal;
+  internal result;
+  // Generate 23 random bits for the mantissa mantissa
+  const unsigned rnd = PCG_XSH_RS_generator(state, stream);
+  result.raw = rnd & 0x7fffffu;
+  // Set the exponent
+  result.raw |= (static_cast<uint32_t>(127) << 23);
+  // Return the final result
+  return result.fp - 1.0f;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double RandomToTypeUniform<double>(uint64_t* state, uint64_t stream) {
+  typedef union {
+    uint64_t raw;
+    double dp;
+  } internal;
+  internal result;
+  result.raw = 0;
+  // Generate 52 random bits for the mantissa
+  // First generate the upper 20 bits
+  unsigned rnd1 = PCG_XSH_RS_generator(state, stream) & 0xfffffu;
+  // The generate the lower 32 bits
+  unsigned rnd2 = PCG_XSH_RS_generator(state, stream);
+  result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2;
+  // Set the exponent
+  result.raw |= (static_cast<uint64_t>(1023) << 52);
+  // Return the final result
+  return result.dp - 1.0;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state,
+                                                                                                    uint64_t stream) {
+  return std::complex<float>(RandomToTypeUniform<float>(state, stream), RandomToTypeUniform<float>(state, stream));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state,
+                                                                                                      uint64_t stream) {
+  return std::complex<double>(RandomToTypeUniform<double>(state, stream), RandomToTypeUniform<double>(state, stream));
+}
+
+template <typename T>
+class UniformRandomGenerator {
+ public:
+  static constexpr bool PacketAccess = true;
+
+  // Uses the given "seed" if non-zero, otherwise uses a random seed.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(uint64_t seed = 0) {
+    m_state = PCG_XSH_RS_state(seed);
+#ifdef EIGEN_USE_SYCL
+    // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
+    // Therefore, we need two steps to initializate the m_state.
+    // IN SYCL, the constructor of the functor is s called on the CPU
+    // and we get the clock seed here from the CPU. However, This seed is
+    // the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
+    // and only  available on the Operator() function (which is called on the GPU).
+    // Thus for CUDA (((CLOCK  + global_thread_id)* 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each
+    // thread but for SYCL ((CLOCK * 6364136223846793005ULL) + 0xda3e39cb94b95bdbULL) is passed to each thread and each
+    // thread adds the  (global_thread_id* 6364136223846793005ULL) for itself only once, in order to complete the
+    // construction similar to CUDA Therefore, the thread Id injection is not available at this stage.
+    // However when the operator() is called the thread ID will be available. So inside the operator,
+    // we add the thrreadID, BlockId,... (which is equivalent of i)
+    // to the seed and construct the unique m_state per thead similar to cuda.
+    m_exec_once = false;
+#endif
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator(const UniformRandomGenerator& other) {
+    m_state = other.m_state;
+#ifdef EIGEN_USE_SYCL
+    m_exec_once = other.m_exec_once;
+#endif
+  }
+
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const {
+#ifdef EIGEN_USE_SYCL
+    if (!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+      // The (i * 6364136223846793005ULL) is the remaining part of the PCG_XSH_RS_state on the GPU side
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once = true;
+    }
+#endif
+    T result = RandomToTypeUniform<T>(&m_state, i);
+    return result;
+  }
+
+  template <typename Packet, typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const {
+    const int packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_MAX T values[packetSize];
+#ifdef EIGEN_USE_SYCL
+    if (!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once = true;
+    }
+#endif
+    EIGEN_UNROLL_LOOP
+    for (int j = 0; j < packetSize; ++j) {
+      values[j] = RandomToTypeUniform<T>(&m_state, i);
+    }
+    return internal::pload<Packet>(values);
+  }
+
+ private:
+  mutable uint64_t m_state;
+#ifdef EIGEN_USE_SYCL
+  mutable bool m_exec_once;
+#endif
+};
+
+template <typename Scalar>
+struct functor_traits<UniformRandomGenerator<Scalar> > {
+  enum {
+    // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)).
+    Cost = 12 * NumTraits<Scalar>::AddCost * ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)),
+    PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess
+  };
+};
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T RandomToTypeNormal(uint64_t* state, uint64_t stream) {
+  // Use the ratio of uniform method to generate numbers following a normal
+  // distribution. See for example Numerical Recipes chapter 7.3.9 for the
+  // details.
+  T u, v, q;
+  do {
+    u = RandomToTypeUniform<T>(state, stream);
+    v = T(1.7156) * (RandomToTypeUniform<T>(state, stream) - T(0.5));
+    const T x = u - T(0.449871);
+    const T y = numext::abs(v) + T(0.386595);
+    q = x * x + y * (T(0.196) * y - T(0.25472) * x);
+  } while (q > T(0.27597) && (q > T(0.27846) || v * v > T(-4) * numext::log(u) * u * u));
+
+  return v / u;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state,
+                                                                                                   uint64_t stream) {
+  return std::complex<float>(RandomToTypeNormal<float>(state, stream), RandomToTypeNormal<float>(state, stream));
+}
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state,
+                                                                                                     uint64_t stream) {
+  return std::complex<double>(RandomToTypeNormal<double>(state, stream), RandomToTypeNormal<double>(state, stream));
+}
+
+template <typename T>
+class NormalRandomGenerator {
+ public:
+  static constexpr bool PacketAccess = true;
+
+  // Uses the given "seed" if non-zero, otherwise uses a random seed.
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) {
+    m_state = PCG_XSH_RS_state(seed);
+#ifdef EIGEN_USE_SYCL
+    // In SYCL it is not possible to build PCG_XSH_RS_state in one step.
+    // Therefore, we need two steps to initializate the m_state.
+    // IN SYCL, the constructor of the functor is s called on the CPU
+    // and we get the clock seed here from the CPU. However, This seed is
+    // the same for all the thread. As unlike CUDA, the thread.ID, BlockID, etc is not a global function.
+    // and only  available on the Operator() function (which is called on the GPU).
+    // Therefore, the thread Id injection is not available at this stage. However when the operator()
+    // is called the thread ID will be available. So inside the operator,
+    // we add the thrreadID, BlockId,... (which is equivalent of i)
+    // to the seed and construct the unique m_state per thead similar to cuda.
+    m_exec_once = false;
+#endif
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(const NormalRandomGenerator& other) {
+    m_state = other.m_state;
+#ifdef EIGEN_USE_SYCL
+    m_exec_once = other.m_exec_once;
+#endif
+  }
+
+  template <typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T operator()(Index i) const {
+#ifdef EIGEN_USE_SYCL
+    if (!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once = true;
+    }
+#endif
+    T result = RandomToTypeNormal<T>(&m_state, i);
+    return result;
+  }
+
+  template <typename Packet, typename Index>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(Index i) const {
+    const int packetSize = internal::unpacket_traits<Packet>::size;
+    EIGEN_ALIGN_MAX T values[packetSize];
+#ifdef EIGEN_USE_SYCL
+    if (!m_exec_once) {
+      // This is the second stage of adding thread Id to the CPU clock seed and build unique seed per thread
+      m_state += (i * 6364136223846793005ULL);
+      m_exec_once = true;
+    }
+#endif
+    EIGEN_UNROLL_LOOP
+    for (int j = 0; j < packetSize; ++j) {
+      values[j] = RandomToTypeNormal<T>(&m_state, i);
+    }
+    return internal::pload<Packet>(values);
+  }
+
+ private:
+  mutable uint64_t m_state;
+#ifdef EIGEN_USE_SYCL
+  mutable bool m_exec_once;
+#endif
+};
+
+template <typename Scalar>
+struct functor_traits<NormalRandomGenerator<Scalar> > {
+  enum {
+    // On average, we need to generate about 3 random numbers
+    // 15 mul, 8 add, 1.5 logs
+    Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost + 15 * NumTraits<Scalar>::AddCost +
+           8 * NumTraits<Scalar>::AddCost + 3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2,
+    PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess
+  };
+};
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
new file mode 100644
index 00000000..9bbf945f
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -0,0 +1,1025 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+// Copyright (C) 2016 Mehdi Goli, Codeplay Software Ltd <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
+
+// clang is incompatible with the CUDA syntax wrt making a kernel a class friend,
+// so we'll use a macro to make clang happy.
+#ifndef KERNEL_FRIEND
+#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
+#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
+#else
+#define KERNEL_FRIEND friend
+#endif
+#endif
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
+struct traits<TensorReductionOp<Op, Dims, XprType, MakePointer_> > : traits<XprType> {
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::Scalar Scalar;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  static constexpr int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+
+  template <class T>
+  struct MakePointer {
+    // Intermediate typedef to workaround MSVC issue.
+    typedef MakePointer_<T> MakePointerT;
+    typedef typename MakePointerT::Type Type;
+  };
+};
+
+template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
+struct eval<TensorReductionOp<Op, Dims, XprType, MakePointer_>, Eigen::Dense> {
+  typedef const TensorReductionOp<Op, Dims, XprType, MakePointer_>& type;
+};
+
+template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
+struct nested<TensorReductionOp<Op, Dims, XprType, MakePointer_>, 1,
+              typename eval<TensorReductionOp<Op, Dims, XprType, MakePointer_> >::type> {
+  typedef TensorReductionOp<Op, Dims, XprType, MakePointer_> type;
+};
+
+template <typename OutputDims>
+struct DimInitializer {
+  template <typename InputDims, typename ReducedDims>
+  EIGEN_DEVICE_FUNC static void run(const InputDims& input_dims,
+                                    const array<bool, internal::array_size<InputDims>::value>& reduced,
+                                    OutputDims* output_dims, ReducedDims* reduced_dims) {
+    const int NumInputDims = internal::array_size<InputDims>::value;
+    int outputIndex = 0;
+    int reduceIndex = 0;
+    for (int i = 0; i < NumInputDims; ++i) {
+      if (reduced[i]) {
+        (*reduced_dims)[reduceIndex] = input_dims[i];
+        ++reduceIndex;
+      } else {
+        (*output_dims)[outputIndex] = input_dims[i];
+        ++outputIndex;
+      }
+    }
+  }
+};
+
+template <>
+struct DimInitializer<Sizes<> > {
+  template <typename InputDims, typename Index, size_t Rank>
+  EIGEN_DEVICE_FUNC static void run(const InputDims& input_dims, const array<bool, Rank>&, Sizes<>*,
+                                    array<Index, Rank>* reduced_dims) {
+    const int NumInputDims = internal::array_size<InputDims>::value;
+    for (int i = 0; i < NumInputDims; ++i) {
+      (*reduced_dims)[i] = input_dims[i];
+    }
+  }
+};
+
+template <typename ReducedDims, int NumTensorDims, int Layout>
+struct are_inner_most_dims {
+  static const bool value = false;
+};
+template <typename ReducedDims, int NumTensorDims, int Layout>
+struct preserve_inner_most_dims {
+  static const bool value = false;
+};
+
+template <typename ReducedDims, int NumTensorDims>
+struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor> {
+  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+  static const bool tmp2 = index_statically_eq<ReducedDims>(0, 0);
+  static const bool tmp3 =
+      index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, array_size<ReducedDims>::value - 1);
+  static const bool value = tmp1 & tmp2 & tmp3;
+};
+template <typename ReducedDims, int NumTensorDims>
+struct are_inner_most_dims<ReducedDims, NumTensorDims, RowMajor> {
+  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+  static const bool tmp2 = index_statically_eq<ReducedDims>(0, NumTensorDims - array_size<ReducedDims>::value);
+  static const bool tmp3 = index_statically_eq<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
+  static const bool value = tmp1 & tmp2 & tmp3;
+};
+template <typename ReducedDims, int NumTensorDims>
+struct preserve_inner_most_dims<ReducedDims, NumTensorDims, ColMajor> {
+  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+  static const bool tmp2 = index_statically_gt<ReducedDims>(0, 0);
+  static const bool value = tmp1 & tmp2;
+};
+template <typename ReducedDims, int NumTensorDims>
+struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor> {
+  static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>();
+  static const bool tmp2 = index_statically_lt<ReducedDims>(array_size<ReducedDims>::value - 1, NumTensorDims - 1);
+  static const bool value = tmp1 & tmp2;
+};
+
+template <int DimIndex, typename Self, typename Op>
+struct GenericDimReducer {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
+                                                           Op& reducer, typename Self::CoeffReturnType* accum) {
+    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
+      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
+      GenericDimReducer<DimIndex - 1, Self, Op>::reduce(self, input, reducer, accum);
+    }
+  }
+};
+template <typename Self, typename Op>
+struct GenericDimReducer<0, Self, Op> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
+                                                           Op& reducer, typename Self::CoeffReturnType* accum) {
+    for (int j = 0; j < self.m_reducedDims[0]; ++j) {
+      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[0];
+      reducer.reduce(self.m_impl.coeff(input), accum);
+    }
+  }
+};
+template <typename Self, typename Op>
+struct GenericDimReducer<-1, Self, Op> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index index, Op& reducer,
+                                                           typename Self::CoeffReturnType* accum) {
+    reducer.reduce(self.m_impl.coeff(index), accum);
+  }
+};
+
+template <typename Self, typename Op,
+          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess),
+          bool UseTreeReduction = (!Self::ReducerTraits::IsStateful && !Self::ReducerTraits::IsExactlyAssociative &&
+                                   // GPU threads can quickly run out of stack space
+                                   // for moderately sized inputs.
+                                   !Self::RunningOnGPU)>
+struct InnerMostDimReducer {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
+      const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
+    typename Self::CoeffReturnType accum = reducer.initialize();
+    for (typename Self::Index j = 0; j < numValuesToReduce; ++j) {
+      reducer.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+    }
+    return reducer.finalize(accum);
+  }
+};
+
+template <typename Self, typename Op>
+struct InnerMostDimReducer<Self, Op, true, false> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
+      const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer0) {
+    using Index = typename Self::Index;
+    constexpr Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
+    Index start = 0;
+    typename Self::PacketReturnType paccum0 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+    if (!Self::ReducerTraits::IsStateful && numValuesToReduce >= 4 * packetSize) {
+      const Index VectorizedSize4 = (numValuesToReduce / (4 * packetSize)) * (4 * packetSize);
+      typename Self::PacketReturnType paccum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      typename Self::PacketReturnType paccum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      typename Self::PacketReturnType paccum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      const Index offset0 = firstIndex;
+      const Index offset1 = firstIndex + packetSize;
+      const Index offset2 = firstIndex + 2 * packetSize;
+      const Index offset3 = firstIndex + 3 * packetSize;
+      for (Index j = 0; j < VectorizedSize4; j += 4 * packetSize) {
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset0 + j), &paccum0);
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset1 + j), &paccum1);
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset2 + j), &paccum2);
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(offset3 + j), &paccum3);
+      }
+      reducer0.reducePacket(paccum1, &paccum0);
+      reducer0.reducePacket(paccum2, &paccum0);
+      reducer0.reducePacket(paccum3, &paccum0);
+      start = VectorizedSize4;
+    }
+    if (start <= (numValuesToReduce - packetSize)) {
+      const Index VectorizedSize = (numValuesToReduce / packetSize) * packetSize;
+      for (Index j = start; j < VectorizedSize; j += packetSize) {
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(firstIndex + j), &paccum0);
+      }
+      start = VectorizedSize;
+    }
+    typename Self::CoeffReturnType accum = reducer0.initialize();
+    for (Index j = start; j < numValuesToReduce; ++j) {
+      reducer0.reduce(self.m_impl.coeff(firstIndex + j), &accum);
+    }
+    return reducer0.finalizeBoth(accum, paccum0);
+  }
+};
+
+#if !defined(EIGEN_HIPCC)
+
+// The following implements tree-based reduction, which improves the accuracy
+// of sum and mean reductions, since each of the n inputs only participates in
+// O(log n) additions.
+template <typename T>
+EIGEN_DEVICE_FUNC inline Index LeafSize() {
+  return 1024;
+}
+template <>
+EIGEN_DEVICE_FUNC inline Index LeafSize<half>() {
+  return 200;
+}
+template <>
+EIGEN_DEVICE_FUNC inline Index LeafSize<bfloat16>() {
+  return 128;
+}
+
+template <typename Self, typename Op>
+struct InnerMostDimReducer<Self, Op, false, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
+      const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
+    const Index kLeafSize = LeafSize<typename Self::CoeffReturnType>();
+    typename Self::CoeffReturnType accum = reducer.initialize();
+    if (numValuesToReduce > kLeafSize) {
+      const typename Self::Index half = numValuesToReduce / 2;
+      // Recursively reduce the two halves.
+      reducer.reduce(reduce(self, firstIndex, half, reducer), &accum);
+      reducer.reduce(reduce(self, firstIndex + half, numValuesToReduce - half, reducer), &accum);
+      return reducer.finalize(accum);
+    } else {
+      return InnerMostDimReducer<Self, Op, false, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
+    }
+  }
+};
+
+template <typename Self, typename Op>
+struct InnerMostDimReducer<Self, Op, true, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Self::CoeffReturnType reduce(
+      const Self& self, typename Self::Index firstIndex, typename Self::Index numValuesToReduce, Op& reducer) {
+    const Index kLeafSize = LeafSize<typename Self::CoeffReturnType>();
+    const typename Self::Index packetSize = internal::unpacket_traits<typename Self::PacketReturnType>::size;
+    typename Self::CoeffReturnType accum = reducer.initialize();
+    if (numValuesToReduce > packetSize * kLeafSize) {
+      // Make sure the split point is aligned on a packet boundary.
+      const typename Self::Index split =
+          packetSize *
+          numext::div_ceil(firstIndex + numext::div_ceil(numValuesToReduce, typename Self::Index(2)), packetSize);
+      const typename Self::Index num_left = numext::mini(split - firstIndex, numValuesToReduce);
+      reducer.reduce(reduce(self, firstIndex, num_left, reducer), &accum);
+      if (num_left < numValuesToReduce) {
+        reducer.reduce(reduce(self, split, numValuesToReduce - num_left, reducer), &accum);
+      }
+      return reducer.finalize(accum);
+    } else {
+      return InnerMostDimReducer<Self, Op, true, false>::reduce(self, firstIndex, numValuesToReduce, reducer);
+    }
+  }
+};
+#endif
+
+template <int DimIndex, typename Self, typename Op,
+          bool vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
+struct InnerMostDimPreserver {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&,
+                                                           typename Self::PacketReturnType*) {
+    eigen_assert(false && "should never be called");
+  }
+};
+
+template <int DimIndex, typename Self, typename Op>
+struct InnerMostDimPreserver<DimIndex, Self, Op, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
+                                                           Op& reducer, typename Self::PacketReturnType* accum) {
+    EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) {
+      const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex];
+      InnerMostDimPreserver<DimIndex - 1, Self, Op>::reduce(self, input, reducer, accum);
+    }
+  }
+};
+
+template <typename Self, typename Op>
+struct InnerMostDimPreserver<0, Self, Op, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex,
+                                                           Op& reducer0, typename Self::PacketReturnType* accum0) {
+    using Index = typename Self::Index;
+    const Index stride = self.m_reducedStrides[0];
+    const Index size = self.m_reducedDims[0];
+    if (!Self::ReducerTraits::IsStateful && size >= 16) {
+      const Index unrolled_size4 = (size / 4) * 4;
+      typename Self::PacketReturnType accum1 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      typename Self::PacketReturnType accum2 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      typename Self::PacketReturnType accum3 = reducer0.template initializePacket<typename Self::PacketReturnType>();
+      for (Index j = 0; j < unrolled_size4; j += 4) {
+        const Index input0 = firstIndex + j * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input0), accum0);
+        const Index input1 = firstIndex + (j + 1) * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input1), &accum1);
+        const Index input2 = firstIndex + (j + 2) * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input2), &accum2);
+        const Index input3 = firstIndex + (j + 3) * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input3), &accum3);
+      }
+      reducer0.reducePacket(accum1, accum0);
+      reducer0.reducePacket(accum2, accum0);
+      reducer0.reducePacket(accum3, accum0);
+      for (Index j = unrolled_size4; j < size; ++j) {
+        Index input = firstIndex + j * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
+      }
+    } else {
+      for (Index j = 0; j < size; ++j) {
+        Index input = firstIndex + j * stride;
+        reducer0.reducePacket(self.m_impl.template packet<Unaligned>(input), accum0);
+      }
+    }
+  }
+};
+template <typename Self, typename Op>
+struct InnerMostDimPreserver<-1, Self, Op, true> {
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self&, typename Self::Index, Op&,
+                                                           typename Self::PacketReturnType*) {
+    eigen_assert(false && "should never be called");
+  }
+};
+
+// Default full reducer
+template <typename Self, typename Op, typename Device,
+          bool Vectorizable = (Self::InputPacketAccess && Self::ReducerTraits::PacketAccess)>
+struct FullReducer {
+  static constexpr bool HasOptimizedImplementation = false;
+
+  static EIGEN_DEVICE_FUNC void run(const Self& self, Op& reducer, const Device&,
+                                    typename Self::EvaluatorPointerType output) {
+    const typename Self::Index num_coeffs = array_prod(self.m_impl.dimensions());
+    *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
+  }
+};
+
+#ifdef EIGEN_USE_THREADS
+// Multithreaded full reducer
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> {
+  static constexpr bool HasOptimizedImplementation = !Self::ReducerTraits::IsStateful;
+  static constexpr Index PacketSize = unpacket_traits<typename Self::PacketReturnType>::size;
+
+  // launch one reducer per thread and accumulate the result.
+  static void run(const Self& self, Op& reducer, const ThreadPoolDevice& device,
+                  typename Self::CoeffReturnType* output) {
+    typedef typename Self::Index Index;
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    if (num_coeffs == 0) {
+      *output = reducer.finalize(reducer.initialize());
+      return;
+    }
+    const TensorOpCost cost = self.m_impl.costPerCoeff(Vectorizable) +
+                              TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable, PacketSize);
+    const Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(num_coeffs, cost, device.numThreads());
+    if (num_threads == 1) {
+      *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer);
+      return;
+    }
+    const Index blocksize = num_coeffs / num_threads;
+    const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0;
+    eigen_assert(num_coeffs >= numblocks * blocksize);
+
+    Barrier barrier(internal::convert_index<unsigned int>(numblocks));
+    MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize());
+    for (Index i = 0; i < numblocks; ++i) {
+      auto run_shard = [i, blocksize, &self, &barrier, &shards, &reducer](){
+        shards[i] = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, i * blocksize, blocksize, reducer);
+        barrier.Notify();
+      };
+      device.enqueue(std::move(run_shard));
+    }
+    typename Self::CoeffReturnType finalShard;
+    if (numblocks * blocksize < num_coeffs) {
+      finalShard = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, numblocks * blocksize,
+                                                                       num_coeffs - numblocks * blocksize, reducer);
+    } else {
+      finalShard = reducer.initialize();
+    }
+    barrier.Wait();
+
+    for (Index i = 0; i < numblocks; ++i) {
+      reducer.reduce(shards[i], &finalShard);
+    }
+    *output = reducer.finalize(finalShard);
+  }
+};
+
+#endif
+
+// Default inner reducer
+template <typename Self, typename Op, typename Device>
+struct InnerReducer {
+  static constexpr bool HasOptimizedImplementation = false;
+
+  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
+                                    typename Self::Index, typename Self::Index) {
+    eigen_assert(false && "Not implemented");
+    return true;
+  }
+};
+
+// Default outer reducer
+template <typename Self, typename Op, typename Device>
+struct OuterReducer {
+  static constexpr bool HasOptimizedImplementation = false;
+
+  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
+                                    typename Self::Index, typename Self::Index) {
+    eigen_assert(false && "Not implemented");
+    return true;
+  }
+};
+
+#ifdef EIGEN_USE_SYCL
+// Default Generic reducer
+template <typename Self, typename Op, typename Device>
+struct GenericReducer {
+  static constexpr bool HasOptimizedImplementation = false;
+
+  EIGEN_DEVICE_FUNC static bool run(const Self&, Op&, const Device&, typename Self::CoeffReturnType*,
+                                    typename Self::Index, typename Self::Index) {
+    eigen_assert(false && "Not implemented");
+    return true;
+  }
+};
+#endif
+
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
+template <int B, int N, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*,
+                                                                 unsigned int*);
+
+#if defined(EIGEN_HAS_GPU_FP16)
+template <typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(
+    R, const S, I_, internal::packet_traits<half>::type*);
+template <int B, int N, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*,
+                                                                          internal::packet_traits<half>::type*);
+template <int NPT, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
+
+#endif
+
+template <int NPT, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+
+template <int NPT, typename S, typename R, typename I_>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+#endif
+
+/**
+ * For SYCL, the return type of the reduction is deduced from the initialize method of the given Op.
+ * This allows the reduction to have a different type for the accumulator than the input data type.
+ * If this is the case, the functor needs to have two reduce method: one for reducing an element of the input
+ * with the accumulator and the other for reducing two accumulators.
+ * Such a reducer can be useful for instance when the accumulator is a boolean or a bitset that checks for
+ * some properties of the input.
+ */
+template <typename Op, typename CoeffReturnType>
+struct ReductionReturnType {
+#if defined(EIGEN_USE_SYCL)
+  typedef std::remove_const_t<decltype(std::declval<Op>().initialize())> type;
+#else
+  typedef std::remove_const_t<CoeffReturnType> type;
+#endif
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor reduction class.
+ *
+ */
+template <typename Op, typename Dims, typename XprType, template <class> class MakePointer_>
+class TensorReductionOp : public TensorBase<TensorReductionOp<Op, Dims, XprType, MakePointer_>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorReductionOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef std::remove_const_t<typename XprType::CoeffReturnType> CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorReductionOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorReductionOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorReductionOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims)
+      : m_expr(expr), m_dims(dims) {}
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReductionOp(const XprType& expr, const Dims& dims, const Op& reducer)
+      : m_expr(expr), m_dims(dims), m_reducer(reducer) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& expression() const { return m_expr; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dims& dims() const { return m_dims; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& reducer() const { return m_reducer; }
+
+ protected:
+  typename XprType::Nested m_expr;
+  const Dims m_dims;
+  const Op m_reducer;
+};
+
+template <typename ArgType, typename Device>
+struct TensorReductionEvaluatorBase;
+
+// Eval as rvalue
+template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
+  typedef internal::reducer_traits<Op, Device> ReducerTraits;
+  typedef Dims ReducedDims;
+  typedef TensorReductionOp<Op, Dims, ArgType, MakePointer_> XprType;
+  typedef typename XprType::Index Index;
+  typedef ArgType ChildType;
+  typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
+  static constexpr int NumInputDims = internal::array_size<InputDimensions>::value;
+  static constexpr int NumReducedDims = internal::array_size<Dims>::value;
+  static constexpr int NumOutputDims = NumInputDims - NumReducedDims;
+  typedef std::conditional_t<NumOutputDims == 0, Sizes<>, DSizes<Index, NumOutputDims> > Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Self;
+  static constexpr bool InputPacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess;
+  typedef typename internal::ReductionReturnType<Op, typename XprType::CoeffReturnType>::type CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr Index PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+  typedef typename Eigen::internal::traits<XprType>::PointerType TensorPointerType;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  // Subset of strides of the input tensor for the non-reduced dimensions.
+  // Indexed by output dimensions.
+  static constexpr int NumPreservedStrides = max_n_1<NumOutputDims>::size;
+
+  // For full reductions
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
+  static constexpr bool RunningOnGPU = internal::is_same<Device, Eigen::GpuDevice>::value;
+  static constexpr bool RunningOnSycl = false;
+#elif defined(EIGEN_USE_SYCL)
+  static constexpr bool RunningOnSycl = internal::is_same<internal::remove_all_t<Device>, Eigen::SyclDevice>::value;
+  static constexpr bool RunningOnGPU = false;
+#else
+  static constexpr bool RunningOnGPU = false;
+  static constexpr bool RunningOnSycl = false;
+#endif
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess = Self::InputPacketAccess && ReducerTraits::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = true,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  static constexpr bool ReducingInnerMostDims = internal::are_inner_most_dims<Dims, NumInputDims, Layout>::value;
+  static constexpr bool PreservingInnerMostDims = internal::preserve_inner_most_dims<Dims, NumInputDims, Layout>::value;
+  static constexpr bool RunningFullReduction = (NumOutputDims == 0);
+
+  EIGEN_STRONG_INLINE TensorReductionEvaluatorBase(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) {
+    EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    // Build the bitmap indicating if an input dimension is reduced or not.
+    for (int i = 0; i < NumInputDims; ++i) {
+      m_reduced[i] = false;
+    }
+    for (int i = 0; i < NumReducedDims; ++i) {
+      eigen_assert(op.dims()[i] >= 0);
+      eigen_assert(op.dims()[i] < NumInputDims);
+      m_reduced[op.dims()[i]] = true;
+    }
+
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    internal::DimInitializer<Dimensions>::run(input_dims, m_reduced, &m_dimensions, &m_reducedDims);
+
+    // Precompute output strides.
+    if (NumOutputDims > 0) {
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        m_outputStrides[0] = 1;
+        for (int i = 1; i < NumOutputDims; ++i) {
+          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+          m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
+        }
+      } else {
+        m_outputStrides[static_cast<size_t>(NumOutputDims - 1)] = 1;
+        for (int i = NumOutputDims - 2; i >= 0; --i) {
+          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+          m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
+        }
+      }
+    }
+
+    // Precompute input strides.
+    if (NumInputDims > 0) {
+      array<Index, NumInputDims> input_strides;
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        input_strides[0] = 1;
+        for (int i = 1; i < NumInputDims; ++i) {
+          input_strides[i] = input_strides[i - 1] * input_dims[i - 1];
+        }
+      } else {
+        input_strides.back() = 1;
+        for (int i = NumInputDims - 2; i >= 0; --i) {
+          input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
+        }
+      }
+
+      int outputIndex = 0;
+      int reduceIndex = 0;
+      for (int i = 0; i < NumInputDims; ++i) {
+        if (m_reduced[i]) {
+          m_reducedStrides[reduceIndex] = input_strides[i];
+          ++reduceIndex;
+        } else {
+          m_preservedStrides[outputIndex] = input_strides[i];
+          m_output_to_input_dim_map[outputIndex] = i;
+          ++outputIndex;
+        }
+      }
+    }
+
+    // Special case for full reductions
+    if (NumOutputDims == 0) {
+      m_preservedStrides[0] = internal::array_prod(input_dims);
+    }
+
+    m_numValuesToReduce = NumOutputDims == 0 ? internal::array_prod(input_dims)
+                          : (static_cast<int>(Layout) == static_cast<int>(ColMajor))
+                              ? m_preservedStrides[0]
+                              : m_preservedStrides[static_cast<size_t>(NumOutputDims - 1)];
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeededCommon(EvaluatorPointerType data) {
+    // Use the FullReducer if possible.
+    if ((RunningFullReduction && RunningOnSycl) ||
+        (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation &&
+         ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || !RunningOnGPU))) {
+      bool need_assign = false;
+      if (!data) {
+        m_result = static_cast<EvaluatorPointerType>(
+            m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType))));
+        data = m_result;
+        need_assign = true;
+      }
+      Op reducer(m_reducer);
+      internal::FullReducer<Self, Op, Device>::run(*this, reducer, m_device, data);
+      return need_assign;
+    }
+
+    // Attempt to use an optimized reduction.
+    else if ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || (RunningOnSycl)) {
+      bool reducing_inner_dims = true;
+      for (int i = 0; i < NumReducedDims; ++i) {
+        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+          reducing_inner_dims &= m_reduced[i];
+        } else {
+          reducing_inner_dims &= m_reduced[NumInputDims - 1 - i];
+        }
+      }
+      if (internal::InnerReducer<Self, Op, Device>::HasOptimizedImplementation &&
+          (reducing_inner_dims || ReducingInnerMostDims)) {
+        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+        if (!data) {
+          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve &&
+               num_values_to_reduce > 128) ||
+              (RunningOnSycl)) {
+            data = static_cast<EvaluatorPointerType>(m_device.get(
+                (CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
+            m_result = data;
+          } else {
+            return true;
+          }
+        }
+        Op reducer(m_reducer);
+        // For SYCL this if always return false
+        if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce,
+                                                          num_coeffs_to_preserve)) {
+          if (m_result) {
+            m_device.deallocate_temp(m_result);
+            m_result = NULL;
+          }
+          return true;
+        } else {
+          return (m_result != NULL);
+        }
+      }
+
+      bool preserving_inner_dims = true;
+      for (int i = 0; i < NumReducedDims; ++i) {
+        if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+          preserving_inner_dims &= m_reduced[NumInputDims - 1 - i];
+        } else {
+          preserving_inner_dims &= m_reduced[i];
+        }
+      }
+      if (internal::OuterReducer<Self, Op, Device>::HasOptimizedImplementation && preserving_inner_dims) {
+        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+        if (!data) {
+          if ((num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve &&
+               num_values_to_reduce > 32) ||
+              (RunningOnSycl)) {
+            data = static_cast<EvaluatorPointerType>(m_device.get(
+                (CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
+            m_result = data;
+          } else {
+            return true;
+          }
+        }
+        Op reducer(m_reducer);
+        // For SYCL this if always return false
+        if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce,
+                                                          num_coeffs_to_preserve)) {
+          if (m_result) {
+            m_device.deallocate_temp(m_result);
+            m_result = NULL;
+          }
+          return true;
+        } else {
+          return (m_result != NULL);
+        }
+      }
+#if defined(EIGEN_USE_SYCL)
+      // If there is no Optimised version for SYCL, the reduction expression
+      // must break into two subexpression and use the SYCL generic Reducer on the device.
+      if (RunningOnSycl) {
+        const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+        const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
+        if (!data) {
+          data = static_cast<EvaluatorPointerType>(
+              m_device.get((CoeffReturnType*)m_device.allocate_temp(sizeof(CoeffReturnType) * num_coeffs_to_preserve)));
+          m_result = data;
+        }
+        Op reducer(m_reducer);
+        internal::GenericReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce,
+                                                        num_coeffs_to_preserve);
+        return (m_result != NULL);
+      }
+#endif
+    }
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType data, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(NULL, [this, data, done](bool) { done(evalSubExprsIfNeededCommon(data)); });
+  }
+#endif
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return evalSubExprsIfNeededCommon(data);
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    m_impl.cleanup();
+    if (m_result) {
+      m_device.deallocate_temp(m_result);
+      m_result = NULL;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    if ((RunningFullReduction || RunningOnGPU) && m_result) {
+      return *(m_result + index);
+    }
+    Op reducer(m_reducer);
+    if (ReducingInnerMostDims || RunningFullReduction) {
+      const Index num_values_to_reduce = (static_cast<int>(Layout) == static_cast<int>(ColMajor))
+                                             ? m_preservedStrides[0]
+                                             : m_preservedStrides[NumPreservedStrides - 1];
+      return internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstInput(index), num_values_to_reduce, reducer);
+    } else {
+      typename Self::CoeffReturnType accum = reducer.initialize();
+      internal::GenericDimReducer<NumReducedDims - 1, Self, Op>::reduce(*this, firstInput(index), reducer, &accum);
+      return reducer.finalize(accum);
+    }
+  }
+
+  // TODO(bsteiner): provide a more efficient implementation.
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions())));
+
+    if (RunningOnGPU && m_result) {
+      return internal::pload<PacketReturnType>(m_result + index);
+    }
+
+    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+    if (ReducingInnerMostDims) {
+      const Index num_values_to_reduce = (static_cast<int>(Layout) == static_cast<int>(ColMajor))
+                                             ? m_preservedStrides[0]
+                                             : m_preservedStrides[NumPreservedStrides - 1];
+      const Index firstIndex = firstInput(index);
+      for (Index i = 0; i < PacketSize; ++i) {
+        Op reducer(m_reducer);
+        values[i] = internal::InnerMostDimReducer<Self, Op>::reduce(*this, firstIndex + i * num_values_to_reduce,
+                                                                    num_values_to_reduce, reducer);
+      }
+    } else if (PreservingInnerMostDims) {
+      const Index firstIndex = firstInput(index);
+      const int innermost_dim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : NumOutputDims - 1;
+      // TBD: extend this the the n innermost dimensions that we preserve.
+      if (((firstIndex % m_dimensions[innermost_dim]) + PacketSize - 1) < m_dimensions[innermost_dim]) {
+        Op reducer(m_reducer);
+        typename Self::PacketReturnType accum = reducer.template initializePacket<typename Self::PacketReturnType>();
+        internal::InnerMostDimPreserver<NumReducedDims - 1, Self, Op>::reduce(*this, firstIndex, reducer, &accum);
+        return reducer.finalizePacket(accum);
+      } else {
+        for (int i = 0; i < PacketSize; ++i) {
+          values[i] = coeff(index + i);
+        }
+      }
+    } else {
+      for (int i = 0; i < PacketSize; ++i) {
+        values[i] = coeff(index + i);
+      }
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  // Must be called after evalSubExprsIfNeeded().
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    if (RunningFullReduction && m_result) {
+      return TensorOpCost(sizeof(CoeffReturnType), 0, 0, vectorized, PacketSize);
+    } else {
+      const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
+      const double compute_cost = num_values_to_reduce * internal::functor_traits<Op>::Cost;
+      return m_impl.costPerCoeff(vectorized) * num_values_to_reduce +
+             TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return m_result; }
+  EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+  EIGEN_DEVICE_FUNC const Device& device() const { return m_device; }
+
+ private:
+  template <int, typename, typename>
+  friend struct internal::GenericDimReducer;
+  template <typename, typename, bool, bool>
+  friend struct internal::InnerMostDimReducer;
+  template <int, typename, typename, bool>
+  friend struct internal::InnerMostDimPreserver;
+  template <typename S, typename O, typename D, bool V>
+  friend struct internal::FullReducer;
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
+  template <int B, int N, typename S, typename R, typename I_>
+  KERNEL_FRIEND void internal::FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
+#if defined(EIGEN_HAS_GPU_FP16)
+  template <typename S, typename R, typename I_>
+  KERNEL_FRIEND void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I_,
+                                                                     internal::packet_traits<Eigen::half>::type*);
+  template <int B, int N, typename S, typename R, typename I_>
+  KERNEL_FRIEND void internal::FullReductionKernelHalfFloat(R, const S, I_, half*,
+                                                            internal::packet_traits<Eigen::half>::type*);
+  template <int NPT, typename S, typename R, typename I_>
+  KERNEL_FRIEND void internal::InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
+#endif
+  template <int NPT, typename S, typename R, typename I_>
+  KERNEL_FRIEND void internal::InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+
+  template <int NPT, typename S, typename R, typename I_>
+  KERNEL_FRIEND void internal::OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+#endif
+
+#if defined(EIGEN_USE_SYCL)
+  template <typename Evaluator_, typename Op__>
+  friend class TensorSycl::internal::GenericNondeterministicReducer;
+  // SYCL need the Generic reducer for the case the recution algorithm is neither inner, outer, and full reducer
+  template <typename, typename, typename>
+  friend struct internal::GenericReducer;
+#endif
+
+  template <typename S, typename O, typename D>
+  friend struct internal::InnerReducer;
+
+  struct BlockIteratorState {
+    Index input_dim;
+    Index output_size;
+    Index output_count;
+  };
+
+  // Returns the Index in the input tensor of the first value that needs to be
+  // used to compute the reduction at output index "index".
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+    if (ReducingInnerMostDims) {
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        return index * m_preservedStrides[0];
+      } else {
+        return index * m_preservedStrides[NumPreservedStrides - 1];
+      }
+    }
+    // TBD: optimize the case where we preserve the innermost dimensions.
+    Index startInput = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumOutputDims - 1; i > 0; --i) {
+        // This is index_i in the output tensor.
+        const Index idx = index / m_outputStrides[i];
+        startInput += idx * m_preservedStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      if (PreservingInnerMostDims) {
+        eigen_assert(m_preservedStrides[0] == 1);
+        startInput += index;
+      } else {
+        startInput += index * m_preservedStrides[0];
+      }
+    } else {
+      for (int i = 0; i < NumOutputDims - 1; ++i) {
+        // This is index_i in the output tensor.
+        const Index idx = index / m_outputStrides[i];
+        startInput += idx * m_preservedStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      if (PreservingInnerMostDims) {
+        eigen_assert(m_preservedStrides[NumPreservedStrides - 1] == 1);
+        startInput += index;
+      } else {
+        startInput += index * m_preservedStrides[NumPreservedStrides - 1];
+      }
+    }
+    return startInput;
+  }
+
+  // Bitmap indicating if an input dimension is reduced or not.
+  array<bool, NumInputDims> m_reduced;
+  // Dimensions of the output of the operation.
+  Dimensions m_dimensions;
+  // Precomputed strides for the output tensor.
+  // Avoid zero-sized arrays, since element access fails to compile on GPU.
+  array<Index, (std::max)(NumOutputDims, 1)> m_outputStrides;
+  array<internal::TensorIntDivisor<Index>, (std::max)(NumOutputDims, 1)> m_fastOutputStrides;
+  array<Index, (std::max)(NumPreservedStrides, 1)> m_preservedStrides;
+  // Map from output to input dimension index.
+  array<Index, (std::max)(NumOutputDims, 1)> m_output_to_input_dim_map;
+  // How many values go into each reduction
+  Index m_numValuesToReduce;
+
+  // Subset of strides of the input tensor for the reduced dimensions.
+  // Indexed by reduced dimensions.
+  array<Index, NumReducedDims> m_reducedStrides;
+  // Size of the input dimensions that are reduced.
+  // Indexed by reduced dimensions.
+  array<Index, NumReducedDims> m_reducedDims;
+
+  // Evaluator for the input expression.
+  TensorEvaluator<ArgType, Device> m_impl;
+
+  // Operation to apply for computing the reduction.
+  Op m_reducer;
+
+  EvaluatorPointerType m_result;
+
+  const Device EIGEN_DEVICE_REF m_device;
+};
+
+template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_, typename Device>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device>
+    : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> {
+  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Device> Base;
+  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Device& device) : Base(op, device) {}
+};
+
+template <typename Op, typename Dims, typename ArgType, template <class> class MakePointer_>
+struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
+    : public TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice> {
+  typedef TensorReductionEvaluatorBase<const TensorReductionOp<Op, Dims, ArgType, MakePointer_>, Eigen::SyclDevice>
+      Base;
+  EIGEN_STRONG_INLINE TensorEvaluator(const typename Base::XprType& op, const Eigen::SyclDevice& device)
+      : Base(op, device) {}
+  // The coeff function in the base the recursive method which is not an standard layout and cannot be used in the SYCL
+  // kernel
+  // Therefore the coeff function should be overridden by for SYCL kernel
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::CoeffReturnType coeff(typename Base::Index index) const {
+    return *(this->data() + index);
+  }
+  // The packet function in the base the recursive method which is not an standard layout and cannot be used in the SYCL
+  // kernel
+  // Therefore the packet function should be overridden by for SYCL kernel
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename Base::PacketReturnType packet(typename Base::Index index) const {
+    return internal::pload<typename Base::PacketReturnType>(this->data() + index);
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
new file mode 100644
index 00000000..c5273e9b
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -0,0 +1,958 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+// Full reducers for GPU, don't vectorize for now
+
+// Reducer function that enables multiple gpu thread to safely accumulate at the same
+// output address. It basically reads the current value of the output variable, and
+// attempts to update it with the new value. If in the meantime another gpu thread
+// updated the content of the output address it will try again.
+template <typename T, typename R>
+__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  if (sizeof(T) == 4) {
+    unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+    unsigned int newval = oldval;
+    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+    unsigned int readback;
+    while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+      oldval = readback;
+      newval = oldval;
+      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+      if (newval == oldval) {
+        return;
+      }
+    }
+  } else if (sizeof(T) == 8) {
+    unsigned long long oldval = *reinterpret_cast<unsigned long long*>(output);
+    unsigned long long newval = oldval;
+    reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+    unsigned long long readback;
+    while ((readback = atomicCAS(reinterpret_cast<unsigned long long*>(output), oldval, newval)) != oldval) {
+      oldval = readback;
+      newval = oldval;
+      reducer.reduce(accum, reinterpret_cast<T*>(&newval));
+      if (newval == oldval) {
+        return;
+      }
+    }
+  } else {
+    gpu_assert(0 && "Wordsize not supported");
+  }
+#else   // EIGEN_CUDA_ARCH >= 300
+  EIGEN_UNUSED_VARIABLE(output);
+  EIGEN_UNUSED_VARIABLE(accum);
+  EIGEN_UNUSED_VARIABLE(reducer);
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif  // EIGEN_CUDA_ARCH >= 300
+}
+
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+  return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {
+  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
+  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
+}
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename R>
+__device__ inline void atomicReduce(half2* output, half2 accum, R& reducer) {
+  unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
+  unsigned int newval = oldval;
+  reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+  if (newval == oldval) {
+    return;
+  }
+  unsigned int readback;
+  while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) {
+    oldval = readback;
+    newval = oldval;
+    reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
+    if (newval == oldval) {
+      return;
+    }
+  }
+}
+#ifdef EIGEN_GPU_COMPILE_PHASE
+// reduction should be associative since reduction is not atomic in wide vector but atomic in half2 operations
+template <typename R>
+__device__ inline void atomicReduce(Packet4h2* output, Packet4h2 accum, R& reducer) {
+  half2* houtput = reinterpret_cast<half2*>(output);
+  half2* haccum = reinterpret_cast<half2*>(&accum);
+  for (int i = 0; i < 4; ++i) {
+    atomicReduce(houtput + i, *(haccum + i), reducer);
+  }
+}
+#endif  // EIGEN_GPU_COMPILE_PHASE
+#endif  // EIGEN_HAS_GPU_FP16
+
+template <>
+__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  atomicAdd(output, accum);
+#else   // EIGEN_CUDA_ARCH >= 300
+  EIGEN_UNUSED_VARIABLE(output);
+  EIGEN_UNUSED_VARIABLE(accum);
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif  // EIGEN_CUDA_ARCH >= 300
+}
+
+template <typename CoeffType, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs,
+                                                                 CoeffType* output) {
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index num_threads = blockDim.x * gridDim.x;
+  for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+    output[i] = val;
+  }
+}
+
+template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
+                                                                 typename Self::CoeffReturnType* output,
+                                                                 unsigned int* semaphore) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  // Initialize the output value
+  const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x;
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      *output = reducer.initialize();
+    }
+  } else {
+    if (threadIdx.x == 0) {
+      unsigned int block = atomicCAS(semaphore, 0u, 1u);
+      if (block == 0) {
+        // We're the first block to run, initialize the output value
+        atomicExchCustom(output, reducer.initialize());
+        __threadfence();
+        atomicExch(semaphore, 2u);
+      } else {
+        // Wait for the first block to initialize the output value.
+        // Use atomicCAS here to ensure that the reads aren't cached
+        unsigned int val;
+        do {
+          val = atomicCAS(semaphore, 2u, 2u);
+        } while (val < 2u);
+      }
+    }
+  }
+
+  __syncthreads();
+
+  eigen_assert(gridDim.x == 1 || *semaphore >= 2u);
+
+  typename Self::CoeffReturnType accum = reducer.initialize();
+  Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread * BlockSize);
+  for (Index i = 0; i < max_iter; i += BlockSize) {
+    const Index index = first_index + i;
+    eigen_assert(index < num_coeffs);
+    typename Self::CoeffReturnType val = input.m_impl.coeff(index);
+    reducer.reduce(val, &accum);
+  }
+
+#pragma unroll
+  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+#if defined(EIGEN_HIPCC)
+    // use std::is_floating_point to determine the type of reduced_val
+    // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error
+    // and list the float and int versions of __shfl_down as the candidate functions.
+    if (std::is_floating_point<typename Self::CoeffReturnType>::value) {
+      reducer.reduce(__shfl_down(static_cast<float>(accum), offset, warpSize), &accum);
+    } else {
+      reducer.reduce(__shfl_down(static_cast<int>(accum), offset, warpSize), &accum);
+    }
+#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+    reducer.reduce(__shfl_down(accum, offset, warpSize), &accum);
+#else
+    reducer.reduce(__shfl_down_sync(0xFFFFFFFF, accum, offset, warpSize), &accum);
+#endif
+  }
+
+  if ((threadIdx.x & (warpSize - 1)) == 0) {
+    atomicReduce(output, accum, reducer);
+  }
+
+  if (gridDim.x > 1 && threadIdx.x == 0) {
+    // Let the last block reset the semaphore
+    atomicInc(semaphore, gridDim.x + 1);
+#if defined(EIGEN_HIPCC)
+    __threadfence_system();
+#endif
+  }
+#else   // EIGEN_CUDA_ARCH >= 300
+  EIGEN_UNUSED_VARIABLE(reducer);
+  EIGEN_UNUSED_VARIABLE(input);
+  EIGEN_UNUSED_VARIABLE(num_coeffs);
+  EIGEN_UNUSED_VARIABLE(output);
+  EIGEN_UNUSED_VARIABLE(semaphore);
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif  // EIGEN_CUDA_ARCH >= 300
+}
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self, typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input,
+                                                                                   Index num_coeffs, half* scratch) {
+  eigen_assert(blockDim.x == 1);
+  eigen_assert(gridDim.x == 1);
+  typedef packet_traits<Eigen::half>::type packet_type;
+  Index packet_remainder = num_coeffs % Index(unpacket_traits<packet_type>::size);
+  if (packet_remainder != 0) {
+    half2* h2scratch = reinterpret_cast<half2*>(scratch);
+    for (Index i = num_coeffs - packet_remainder; i + 2 <= num_coeffs; i += 2) {
+      *h2scratch = __halves2half2(input.coeff(i), input.coeff(i + 1));
+      h2scratch++;
+    }
+    if ((num_coeffs & 1) != 0) {
+      half lastCoeff = input.coeff(num_coeffs - 1);
+      *h2scratch = __halves2half2(lastCoeff, reducer.initialize());
+    }
+  } else {
+    packet_type reduce = reducer.template initializePacket<packet_type>();
+    internal::pstoreu(scratch, reduce);
+  }
+}
+
+template <typename Self, typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self /*input*/,
+                                                                          Index num_coeffs, half* output) {
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  const Index num_threads = blockDim.x * gridDim.x;
+  typedef typename packet_traits<Eigen::half>::type PacketType;
+
+  const Index num_packets = num_coeffs / Index(unpacket_traits<PacketType>::size);
+  PacketType* p_output = reinterpret_cast<PacketType*>(output);
+  for (Index i = thread_id; i < num_packets; i += num_threads) {
+    p_output[i] = reducer.template initializePacket<PacketType>();
+  }
+  Index packet_remainder = num_coeffs % Index(unpacket_traits<PacketType>::size);
+  if (thread_id < packet_remainder) {
+    output[num_coeffs - packet_remainder + thread_id] = reducer.initialize();
+  }
+}
+
+template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
+                                                                          Index num_coeffs, half* output,
+                                                                          half* scratch) {
+  typedef typename packet_traits<Eigen::half>::type PacketType;
+  const int packet_width = unpacket_traits<PacketType>::size;
+  eigen_assert(NumPerThread % packet_width == 0);
+  const Index first_index = blockIdx.x * BlockSize * NumPerThread + packet_width * threadIdx.x;
+
+  // Initialize the output value if it wasn't initialized by the ReductionInitKernel
+
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      int rem = num_coeffs % packet_width;
+      if (rem != 0) {
+        half2* p_scratch = reinterpret_cast<half2*>(scratch);
+        pstoreu(scratch, reducer.template initializePacket<PacketType>());
+        for (int i = 0; i < rem / 2; i++) {
+          *p_scratch = __halves2half2(input.coeff(num_coeffs - packet_width + 2 * i),
+                                      input.coeff(num_coeffs - packet_width + 2 * i + 1));
+          p_scratch++;
+        }
+        if ((num_coeffs & 1) != 0) {
+          half last = input.coeff(num_coeffs - 1);
+          *p_scratch = __halves2half2(last, reducer.initialize());
+        }
+      } else {
+        PacketType reduce = reducer.template initializePacket<PacketType>();
+        pstoreu(scratch, reduce);
+      }
+    }
+    __syncthreads();
+  }
+
+  PacketType accum = reducer.template initializePacket<PacketType>();
+  const Index max_iter =
+      numext::mini<Index>((num_coeffs - first_index) / packet_width, NumPerThread * BlockSize / packet_width);
+  for (Index i = 0; i < max_iter; i += BlockSize) {
+    const Index index = first_index + packet_width * i;
+    eigen_assert(index + packet_width < num_coeffs);
+    PacketType val = input.template packet<Unaligned>(index);
+    reducer.reducePacket(val, &accum);
+  }
+
+#pragma unroll
+  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+#if defined(EIGEN_HIPCC)
+    PacketType r1;
+    half2* hr = reinterpret_cast<half2*>(&r1);
+    half2* hacc = reinterpret_cast<half2*>(&accum);
+    for (int i = 0; i < packet_width / 2; i++) {
+      // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+      union {
+        int i;
+        half2 h;
+      } wka_in, wka_out;
+      wka_in.h = hacc[i];
+      wka_out.i = __shfl_down(wka_in.i, offset, warpSize);
+      hr[i] = wka_out.h;
+    }
+    reducer.reducePacket(r1, &accum);
+#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+    PacketType r1;
+    half2* hr = reinterpret_cast<half2*>(&r1);
+    half2* hacc = reinterpret_cast<half2*>(&accum);
+    for (int i = 0; i < packet_width / 2; i++) {
+      hr[i] = __shfl_down(hacc[i], offset, warpSize);
+    }
+    reducer.reducePacket(r1, &accum);
+#else
+    PacketType r1;
+    half2* hr = reinterpret_cast<half2*>(&r1);
+    half2* hacc = reinterpret_cast<half2*>(&accum);
+    for (int i = 0; i < packet_width / 2; i++) {
+      hr[i] = __shfl_down_sync(0xFFFFFFFF, hacc[i], (unsigned)offset, warpSize);
+    }
+    reducer.reducePacket(r1, &accum);
+
+#endif
+  }
+
+  if ((threadIdx.x & (warpSize - 1)) == 0) {
+    atomicReduce(reinterpret_cast<PacketType*>(scratch), accum, reducer);
+  }
+
+  __syncthreads();
+  half2* rv1 = reinterpret_cast<half2*>(scratch);
+  if (packet_width > 2) {
+    reducer.reducePacket(rv1[2], rv1);
+    reducer.reducePacket(rv1[3], rv1 + 1);
+    reducer.reducePacket(rv1[1], rv1);
+  }
+  if (gridDim.x == 1) {
+    if (first_index == 0) {
+      half tmp = __low2half(*rv1);
+      reducer.reduce(__high2half(*rv1), &tmp);
+      *output = tmp;
+    }
+  }
+}
+
+template <typename Op>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, half* scratch) {
+  eigen_assert(threadIdx.x == 1);
+  typedef packet_traits<Eigen::half>::type packet_type;
+  if (unpacket_traits<packet_type>::size == 1) {
+    *output = *scratch;
+  } else {
+    half2* pscratch = reinterpret_cast<half2*>(scratch);
+    half tmp = __float2half(0.f);
+    for (int i = 0; i < unpacket_traits<packet_type>::size; i += 2) {
+      reducer.reduce(__low2half(*pscratch), &tmp);
+      reducer.reduce(__high2half(*pscratch), &tmp);
+      pscratch++;
+    }
+    *output = tmp;
+  }
+}
+
+#endif  // EIGEN_HAS_GPU_FP16
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct FullReductionLauncher {
+  static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
+    gpu_assert(false && "Should only be called on doubles, floats and half floats");
+  }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+    Self, Op, OutputType, PacketAccess,
+    std::enable_if_t<internal::is_same<float, OutputType>::value || internal::is_same<double, OutputType>::value,
+                     void>> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
+                  typename Self::Index num_coeffs) {
+    typedef typename Self::Index Index;
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
+
+    unsigned int* semaphore = NULL;
+    if (num_blocks > 1) {
+      semaphore = device.semaphore();
+    }
+
+    LAUNCH_GPU_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>), num_blocks, block_size, 0,
+                      device, reducer, self, num_coeffs, output, semaphore);
+  }
+};
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, false> {
+  static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) {
+    gpu_assert(false && "Should not be called since there is no packet accessor");
+  }
+};
+
+template <typename Self, typename Op>
+struct FullReductionLauncher<Self, Op, Eigen::half, true> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
+                  typename Self::Index num_coeffs) {
+    typedef typename Self::Index Index;
+
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int num_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
+    half* scratch = static_cast<half*>(device.scratchpad());
+
+    if (num_blocks > 1) {
+      // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there
+      // won't be a race conditions between multiple thread blocks.
+      LAUNCH_GPU_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>), 1, 1, 0, device, reducer, self,
+                        num_coeffs, scratch);
+    }
+
+    LAUNCH_GPU_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>), num_blocks,
+                      block_size, 0, device, reducer, self, num_coeffs, output, scratch);
+
+    if (num_blocks > 1) {
+      LAUNCH_GPU_KERNEL((ReductionCleanupKernelHalfFloat<Op>), 1, 1, 0, device, reducer, output, scratch);
+    }
+  }
+};
+#endif  // EIGEN_HAS_GPU_FP16
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple cases
+  // of doubles, floats and half floats
+#ifdef EIGEN_HAS_GPU_FP16
+  static constexpr bool HasOptimizedImplementation =
+      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                           internal::is_same<typename Self::CoeffReturnType, double>::value ||
+                                           (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
+                                            reducer_traits<Op, GpuDevice>::PacketAccess));
+#else   // EIGEN_HAS_GPU_FP16
+  static constexpr bool HasOptimizedImplementation =
+      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif  // EIGEN_HAS_GPU_FP16
+
+  template <typename OutputType>
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
+    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    // Don't crash when we're called with an input tensor of size 0.
+    if (num_coeffs == 0) {
+      return;
+    }
+
+    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device,
+                                                                                                  output, num_coeffs);
+  }
+};
+
+template <int NumPerThread, typename Self, typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input,
+                                                                  Index num_coeffs_to_reduce,
+                                                                  Index num_preserved_coeffs,
+                                                                  typename Self::CoeffReturnType* output) {
+#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
+  typedef typename Self::CoeffReturnType Type;
+  eigen_assert(blockDim.y == 1);
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.y == 1);
+  eigen_assert(gridDim.z == 1);
+
+  const int unroll_times = 16;
+  eigen_assert(NumPerThread % unroll_times == 0);
+
+  const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
+  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
+
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+    const Index row = i / input_col_blocks;
+
+    if (row < num_preserved_coeffs) {
+      const Index col_block = i % input_col_blocks;
+      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
+
+      Type reduced_val = reducer.initialize();
+
+      for (Index j = 0; j < NumPerThread; j += unroll_times) {
+        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
+        if (last_col >= num_coeffs_to_reduce) {
+          for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
+            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+            reducer.reduce(val, &reduced_val);
+          }
+          break;
+        } else {
+          // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+          for (int k = 0; k < unroll_times; ++k) {
+            const Index col = col_begin + blockDim.x * (j + k);
+            reducer.reduce(input.m_impl.coeff(row * num_coeffs_to_reduce + col), &reduced_val);
+          }
+        }
+      }
+
+#pragma unroll
+      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+#if defined(EIGEN_HIPCC)
+        // use std::is_floating_point to determine the type of reduced_val
+        // This is needed because when Type == double, hipcc will give a "call to __shfl_down is ambguous" error
+        // and list the float and int versions of __shfl_down as the candidate functions.
+        if (std::is_floating_point<Type>::value) {
+          reducer.reduce(__shfl_down(static_cast<float>(reduced_val), offset), &reduced_val);
+        } else {
+          reducer.reduce(__shfl_down(static_cast<int>(reduced_val), offset), &reduced_val);
+        }
+#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+        reducer.reduce(__shfl_down(reduced_val, offset), &reduced_val);
+#else
+        reducer.reduce(__shfl_down_sync(0xFFFFFFFF, reduced_val, offset), &reduced_val);
+#endif
+      }
+
+      if ((threadIdx.x & (warpSize - 1)) == 0) {
+        atomicReduce(&(output[row]), reduced_val, reducer);
+      }
+    }
+  }
+#else   // EIGEN_CUDA_ARCH >= 300
+  gpu_assert(0 && "Shouldn't be called on unsupported device");
+#endif  // EIGEN_CUDA_ARCH >= 300
+}
+
+#ifdef EIGEN_HAS_GPU_FP16
+
+template <int NumPerThread, typename Self, typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
+                                                                           Index num_coeffs_to_reduce,
+                                                                           Index num_preserved_coeffs, half* output) {
+  eigen_assert(blockDim.y == 1);
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.y == 1);
+  eigen_assert(gridDim.z == 1);
+
+  typedef typename packet_traits<Eigen::half>::type PacketType;
+  const int packet_width = unpacket_traits<PacketType>::size;
+  const int unroll_times = 16 / packet_width;
+  eigen_assert(NumPerThread % unroll_times == 0);
+  eigen_assert(unroll_times % 2 == 0);
+
+  const Index input_col_blocks = numext::div_ceil<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2);
+  const Index num_input_blocks = numext::div_ceil<Index>(input_col_blocks * num_preserved_coeffs, 2);
+
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    Index i = packet_width * thread_id;
+    for (; i + packet_width <= num_preserved_coeffs; i += packet_width * num_threads) {
+      PacketType* poutput = reinterpret_cast<PacketType*>(output + i);
+      *poutput = reducer.template initializePacket<PacketType>();
+    }
+    if (i < num_preserved_coeffs) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+    const Index row = 2 * (i / input_col_blocks);  // everybody takes 2 rows
+
+    if (row + 1 < num_preserved_coeffs) {
+      const Index col_block = i % input_col_blocks;
+      const Index col_begin = packet_width * (col_block * blockDim.x * NumPerThread + threadIdx.x);
+
+      PacketType reduced_val1 = reducer.template initializePacket<PacketType>();
+      PacketType reduced_val2 = reducer.template initializePacket<PacketType>();
+
+      for (Index j = 0; j < NumPerThread; j += unroll_times) {
+        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * packet_width;
+        if (last_col >= num_coeffs_to_reduce) {
+          Index col = col_begin + blockDim.x * j;
+          for (; col + packet_width <= num_coeffs_to_reduce; col += blockDim.x) {
+            const PacketType val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col);
+            reducer.reducePacket(val1, &reduced_val1);
+            const PacketType val2 = input.m_impl.template packet<Unaligned>((row + 1) * num_coeffs_to_reduce + col);
+            reducer.reducePacket(val2, &reduced_val2);
+          }
+          if (col < num_coeffs_to_reduce) {
+            PacketType r1 = reducer.template initializePacket<PacketType>();
+            PacketType r2 = reducer.template initializePacket<PacketType>();
+            half2* hr1 = reinterpret_cast<half2*>(&r1);
+            half2* hr2 = reinterpret_cast<half2*>(&r2);
+            while (col + 1 < num_coeffs_to_reduce) {
+              *hr1 = __halves2half2(input.m_impl.coeff(row * num_coeffs_to_reduce + col),
+                                    input.m_impl.coeff(row * num_coeffs_to_reduce + col + 1));
+              *hr2 = __halves2half2(input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col),
+                                    input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col + 1));
+              hr1++;
+              hr2++;
+              col += 2;
+            }
+            if (col < num_coeffs_to_reduce) {
+              // Peel;
+              const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+              *hr1 = __halves2half2(last1, reducer.initialize());
+              const half last2 = input.m_impl.coeff((row + 1) * num_coeffs_to_reduce + col);
+              *hr2 = __halves2half2(last2, reducer.initialize());
+            }
+            reducer.reducePacket(r1, &reduced_val1);
+            reducer.reducePacket(r2, &reduced_val2);
+          }
+          break;
+        } else {
+          // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+          for (int k = 0; k < unroll_times; ++k) {
+            const Index col = col_begin + blockDim.x * (j + k) * packet_width;
+            reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col),
+                                 &reduced_val1);
+            reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1) * num_coeffs_to_reduce + col),
+                                 &reduced_val2);
+          }
+        }
+      }
+
+#pragma unroll
+      for (int offset = warpSize / 2; offset > 0; offset /= 2) {
+#if defined(EIGEN_HIPCC)
+        PacketType r1;
+        PacketType r2;
+        half2* hr1 = reinterpret_cast<half2*>(&r1);
+        half2* hr2 = reinterpret_cast<half2*>(&r2);
+        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+        for (int i = 0; i < packet_width / 2; i++) {
+          // FIXME : remove this workaround once we have native half/half2 support for __shfl_down
+          union {
+            int i;
+            half2 h;
+          } wka_in1, wka_out1;
+          wka_in1.h = rv1[i];
+          wka_out1.i = __shfl_down(wka_in1.i, offset, warpSize);
+          hr1[i] = wka_out1.h;
+
+          union {
+            int i;
+            half2 h;
+          } wka_in2, wka_out2;
+          wka_in2.h = rv2[i];
+          wka_out2.i = __shfl_down(wka_in2.i, offset, warpSize);
+          hr2[i] = wka_out2.h;
+        }
+        reducer.reducePacket(r1, &reduced_val1);
+        reducer.reducePacket(r2, &reduced_val2);
+#elif defined(EIGEN_CUDA_SDK_VER) && EIGEN_CUDA_SDK_VER < 90000
+        PacketType r1;
+        PacketType r2;
+        half2* hr1 = reinterpret_cast<half2*>(&r1);
+        half2* hr2 = reinterpret_cast<half2*>(&r2);
+        half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+        half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+        for (int i = 0; i < packet_width / 2; i++) {
+          hr1[i] = __shfl_down(rv1[i], offset, warpSize);
+          hr2[i] = __shfl_down(rv2[i], offset, warpSize);
+        }
+        reducer.reducePacket(r1, &reduced_val1);
+        reducer.reducePacket(r2, &reduced_val2);
+#else
+        PacketType r1;
+        PacketType r2;
+        half2* hr1 = reinterpret_cast<half2*>(&r1);
+        half2* hr2 = reinterpret_cast<half2*>(&r2);
+        half2* rr1 = reinterpret_cast<half2*>(&reduced_val1);
+        half2* rr2 = reinterpret_cast<half2*>(&reduced_val2);
+        for (int j = 0; j < packet_width / 2; j++) {
+          hr1[j] = __shfl_down_sync(0xFFFFFFFF, rr1[j], (unsigned)offset, warpSize);
+          hr2[j] = __shfl_down_sync(0xFFFFFFFF, rr2[j], (unsigned)offset, warpSize);
+        }
+        reducer.reducePacket(r1, &reduced_val1);
+        reducer.reducePacket(r2, &reduced_val2);
+
+#endif
+      }
+      half2* rv1 = reinterpret_cast<half2*>(&reduced_val1);
+      half2* rv2 = reinterpret_cast<half2*>(&reduced_val2);
+      half2 val;
+      if (packet_width > 2) {
+        reducer.reducePacket(rv1[2], rv1);
+        reducer.reducePacket(rv1[3], rv1 + 1);
+        reducer.reducePacket(rv1[1], rv1);
+        reducer.reducePacket(rv2[2], rv2);
+        reducer.reducePacket(rv2[3], rv2 + 1);
+        reducer.reducePacket(rv2[1], rv2);
+      }
+      half val1 = __low2half(*rv1);
+      reducer.reduce(__high2half(*rv1), &val1);
+      half val2 = __low2half(*rv2);
+      reducer.reduce(__high2half(*rv2), &val2);
+      val = __halves2half2(val1, val2);
+      if ((threadIdx.x & (warpSize - 1)) == 0) {
+        half* loc = output + row;
+        atomicReduce(reinterpret_cast<half2*>(loc), val, reducer);
+      }
+    }
+  }
+}
+
+#endif  // EIGEN_HAS_GPU_FP16
+
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
+struct InnerReductionLauncher {
+  static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index,
+                                    typename Self::Index) {
+    gpu_assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
+    return true;
+  }
+};
+
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+    Self, Op, OutputType, PacketAccess,
+    std::enable_if_t<internal::is_same<float, OutputType>::value || internal::is_same<double, OutputType>::value,
+                     void>> {
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
+                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    typedef typename Self::Index Index;
+
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+    const int block_size = 256;
+    const int num_per_thread = 128;
+    const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
+    const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size;
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+    if (num_blocks > 1) {
+      // We initialize the outputs outside the reduction kernel when we can't be sure that there
+      // won't be a race conditions between multiple thread blocks.
+      const int dyn_blocks2 = numext::div_ceil<int>(num_preserved_vals, 1024);
+      const int max_blocks2 = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / 1024;
+      const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
+      LAUNCH_GPU_KERNEL((ReductionInitKernel<OutputType, Index>), num_blocks2, 1024, 0, device, reducer.initialize(),
+                        num_preserved_vals, output);
+    }
+
+    LAUNCH_GPU_KERNEL((InnerReductionKernel<num_per_thread, Self, Op, Index>), num_blocks, block_size, 0, device,
+                      reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;
+  }
+};
+
+#ifdef EIGEN_HAS_GPU_FP16
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, false> {
+  static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) {
+    gpu_assert(false && "Should not be called since there is no packet accessor");
+    return true;
+  }
+};
+
+template <typename Self, typename Op>
+struct InnerReductionLauncher<Self, Op, Eigen::half, true> {
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output,
+                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    typedef typename Self::Index Index;
+
+    if (num_preserved_vals % 2 != 0) {
+      // Not supported yet, revert to the slower code path
+      return true;
+    }
+
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+    const int block_size = /*256*/ 128;
+    const int num_per_thread = /*128*/ 64;
+    const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
+    const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size;
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+    if (num_blocks > 1) {
+      // We initialize the outputs outside the reduction kernel when we can't be sure that there
+      // won't be a race conditions between multiple thread blocks.
+      LAUNCH_GPU_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>), 1, 1, 0, device, reducer, self,
+                        num_preserved_vals, output);
+    }
+
+    LAUNCH_GPU_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>), num_blocks, block_size, 0,
+                      device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;
+  }
+};
+#endif  // EIGEN_HAS_GPU_FP16
+
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, GpuDevice> {
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple case
+  // of floats and half floats.
+#ifdef EIGEN_HAS_GPU_FP16
+  static constexpr bool HasOptimizedImplementation =
+      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                           internal::is_same<typename Self::CoeffReturnType, double>::value ||
+                                           (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value &&
+                                            reducer_traits<Op, GpuDevice>::PacketAccess));
+#else   // EIGEN_HAS_GPU_FP16
+  static constexpr bool HasOptimizedImplementation =
+      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
+#endif  // EIGEN_HAS_GPU_FP16
+
+  template <typename OutputType>
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output,
+                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    gpu_assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
+    const Index num_coeffs = array_prod(self.m_impl.dimensions());
+    // Don't crash when we're called with an input tensor of size 0.
+    if (num_coeffs == 0) {
+      return true;
+    }
+    // It's faster to use the usual code.
+    if (num_coeffs_to_reduce <= 128) {
+      return true;
+    }
+
+    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(
+        self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+  }
+};
+
+template <int NumPerThread, typename Self, typename Reducer, typename Index>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input,
+                                                                  Index num_coeffs_to_reduce,
+                                                                  Index num_preserved_coeffs,
+                                                                  typename Self::CoeffReturnType* output) {
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  // Do the reduction.
+  const Index max_iter = num_preserved_coeffs * numext::div_ceil<Index>(num_coeffs_to_reduce, NumPerThread);
+  for (Index i = thread_id; i < max_iter; i += num_threads) {
+    const Index input_col = i % num_preserved_coeffs;
+    const Index input_row = (i / num_preserved_coeffs) * NumPerThread;
+    typename Self::CoeffReturnType reduced_val = reducer.initialize();
+    const Index max_row = numext::mini(input_row + NumPerThread, num_coeffs_to_reduce);
+    for (Index j = input_row; j < max_row; j++) {
+      typename Self::CoeffReturnType val = input.m_impl.coeff(j * num_preserved_coeffs + input_col);
+      reducer.reduce(val, &reduced_val);
+    }
+    atomicReduce(&(output[input_col]), reduced_val, reducer);
+  }
+}
+
+template <typename Self, typename Op>
+struct OuterReducer<Self, Op, GpuDevice> {
+  // Unfortunately nvidia doesn't support well exotic types such as complex,
+  // so reduce the scope of the optimized version of the code to the simple case
+  // of floats.
+  static constexpr bool HasOptimizedImplementation =
+      !Self::ReducerTraits::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                           internal::is_same<typename Self::CoeffReturnType, double>::value);
+  template <typename Device, typename OutputType>
+  static
+#if !defined(EIGEN_HIPCC)
+      // FIXME :  leaving this EIGEN_DEVICE_FUNC in, results in the following runtime error
+      //          (in the cxx11_tensor_reduction_gpu test)
+      //
+      // terminate called after throwing an instance of 'std::runtime_error'
+      //   what():  No device code available for function: _ZN5Eigen8internal20OuterReductionKernelIL...
+      //
+      // don't know why this happens (and why is it a runtime error instead of a compile time error)
+      //
+      // this will be fixed by HIP PR#457
+      EIGEN_DEVICE_FUNC
+#endif
+      bool
+      run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
+    gpu_assert(false && "Should only be called to reduce doubles or floats on a gpu device");
+    return true;
+  }
+
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output,
+                  typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+    typedef typename Self::Index Index;
+
+    // It's faster to use the usual code.
+    if (num_coeffs_to_reduce <= 32) {
+      return true;
+    }
+
+    const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
+    const int block_size = 256;
+    const int num_per_thread = 16;
+    const int dyn_blocks = numext::div_ceil<int>(num_coeffs, block_size * num_per_thread);
+    const int max_blocks = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / block_size;
+    const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
+
+    if (num_blocks > 1) {
+      // We initialize the outputs in the reduction kernel itself when we don't have to worry
+      // about race conditions between multiple thread blocks.
+      const int dyn_blocks2 = numext::div_ceil<int>(num_preserved_vals, 1024);
+      const int max_blocks2 = device.getNumGpuMultiProcessors() * device.maxGpuThreadsPerMultiProcessor() / 1024;
+      const int num_blocks2 = numext::mini<int>(max_blocks2, dyn_blocks2);
+      LAUNCH_GPU_KERNEL((ReductionInitKernel<float, Index>), num_blocks2, 1024, 0, device, reducer.initialize(),
+                        num_preserved_vals, output);
+    }
+
+    LAUNCH_GPU_KERNEL((OuterReductionKernel<num_per_thread, Self, Op, Index>), num_blocks, block_size, 0, device,
+                      reducer, self, num_coeffs_to_reduce, num_preserved_vals, output);
+
+    return false;
+  }
+};
+
+#endif  // defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
new file mode 100644
index 00000000..b4749b41
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReductionSycl.h
@@ -0,0 +1,588 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorReductionSycl.h
+ *
+ * \brief:
+ *  This is the specialization of the reduction operation. Two phase reduction approach
+ * is used since the GPU does not have Global Synchronization for global memory among
+ * different work-group/thread block. To solve the problem, we need to create two kernels
+ * to reduce the data, where the first kernel reduce the data locally and each local
+ * workgroup/thread-block save the input data into global memory. In the second phase (global reduction)
+ * one work-group uses one work-group/thread-block to reduces the intermediate data into one single element.
+ * Here is an NVIDIA presentation explaining the optimized two phase reduction algorithm on GPU:
+ * https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf
+ *
+ *****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
+struct OpDefiner {
+  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
+  typedef Op type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
+                                                                            const Index &) {
+    return accumulator;
+  }
+};
+
+template <typename CoeffReturnType, typename Index>
+struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
+  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
+    return type();
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
+                                                                           const Index &scale) {
+    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
+    return quotient_op(accumulator, CoeffReturnType(scale));
+  }
+};
+
+template <typename CoeffReturnType, typename Index>
+struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
+  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
+  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
+    return type();
+  }
+
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
+                                                                            const Index &scale) {
+    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
+  }
+};
+
+template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
+          Index local_range>
+struct SecondStepFullReducer {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
+  typedef typename OpDef::type Op;
+  LocalAccessor scratch;
+  InputAccessor aI;
+  OutputAccessor outAcc;
+  Op op;
+  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
+      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}
+
+  void operator()(cl::sycl::nd_item<1> itemID) const {
+    // Our empirical research shows that the best performance will be achieved
+    // when there is only one element per thread to reduce in the second step.
+    // in this step the second step reduction time is almost negligible.
+    // Hence, in the second step of reduction the input size is fixed to the
+    // local size, thus, there is only one element read per thread. The
+    // algorithm must be changed if the number of reduce per thread in the
+    // second step is greater than 1. Otherwise, the result will be wrong.
+    const Index localid = itemID.get_local_id(0);
+    auto aInPtr = aI + localid;
+    auto aOutPtr = outAcc;
+    CoeffReturnType *scratchptr = scratch.get_pointer();
+    CoeffReturnType accumulator = *aInPtr;
+
+    scratchptr[localid] = op.finalize(accumulator);
+    for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      if (localid < offset) {
+        op.reduce(scratchptr[localid + offset], &accumulator);
+        scratchptr[localid] = op.finalize(accumulator);
+      }
+    }
+    if (localid == 0) *aOutPtr = op.finalize(accumulator);
+  }
+};
+
+// Full reduction first phase. In this version the vectorization is true and the reduction accept
+// any generic reducerOp  e.g( max, min, sum, mean, iamax, iamin, etc ).
+template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
+class FullReductionKernelFunctor {
+ public:
+  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+  typedef typename Evaluator::Index Index;
+  typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
+                    (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+      OpDef;
+
+  typedef typename OpDef::type Op;
+  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+  typedef typename Evaluator::PacketReturnType PacketReturnType;
+  typedef std::conditional_t<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess), PacketReturnType,
+                             CoeffReturnType>
+      OutType;
+  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  LocalAccessor scratch;
+  Evaluator evaluator;
+  EvaluatorPointerType final_output;
+  Index rng;
+  Op op;
+
+  FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
+                             Index rng_, OpType op_)
+      : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}
+
+  void operator()(cl::sycl::nd_item<1> itemID) const { compute_reduction(itemID); }
+
+  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<Vect> compute_reduction(
+      const cl::sycl::nd_item<1> &itemID) const {
+    auto output_ptr = final_output;
+    Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
+    Index globalid = itemID.get_global_id(0);
+    Index localid = itemID.get_local_id(0);
+    Index step = Evaluator::PacketSize * itemID.get_global_range(0);
+    Index start = Evaluator::PacketSize * globalid;
+    // vectorizable parts
+    PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
+    for (Index i = start; i < VectorizedRange; i += step) {
+      op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
+    }
+    globalid += VectorizedRange;
+    // non vectorizable parts
+    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
+      op.template reducePacket<PacketReturnType>(
+          ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
+              evaluator.impl().coeff(i), op.initialize()),
+          &packetAccumulator);
+    }
+    scratch[localid] = packetAccumulator =
+        OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
+    // reduction parts // Local size is always power of 2
+    EIGEN_UNROLL_LOOP
+    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      if (localid < offset) {
+        op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
+        scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
+      }
+    }
+    if (localid == 0) {
+      output_ptr[itemID.get_group(0)] =
+          op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
+    }
+  }
+
+  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE std::enable_if_t<!Vect> compute_reduction(
+      const cl::sycl::nd_item<1> &itemID) const {
+    auto output_ptr = final_output;
+    Index globalid = itemID.get_global_id(0);
+    Index localid = itemID.get_local_id(0);
+    // vectorizable parts
+    CoeffReturnType accumulator = op.initialize();
+    // non vectorizable parts
+    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
+      op.reduce(evaluator.impl().coeff(i), &accumulator);
+    }
+    scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);
+
+    // reduction parts. the local size is always power of 2
+    EIGEN_UNROLL_LOOP
+    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      if (localid < offset) {
+        op.reduce(scratch[localid + offset], &accumulator);
+        scratch[localid] = op.finalize(accumulator);
+      }
+    }
+    if (localid == 0) {
+      output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
+    }
+  }
+};
+
+template <typename Evaluator, typename OpType>
+class GenericNondeterministicReducer {
+ public:
+  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+  typedef typename Evaluator::Index Index;
+  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
+  typedef typename OpDef::type Op;
+  template <typename Scratch>
+  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
+                                 Index range_, Index num_values_to_reduce_)
+      : evaluator(evaluator_),
+        output_accessor(output_accessor_),
+        functor(OpDef::get_op(functor_)),
+        range(range_),
+        num_values_to_reduce(num_values_to_reduce_) {}
+
+  void operator()(cl::sycl::nd_item<1> itemID) const {
+    // This is to bypass the statefull condition in Eigen meanReducer
+    Op non_const_functor;
+    std::memcpy(&non_const_functor, &functor, sizeof(Op));
+    auto output_accessor_ptr = output_accessor;
+    Index globalid = static_cast<Index>(itemID.get_global_linear_id());
+    if (globalid < range) {
+      CoeffReturnType accum = functor.initialize();
+      Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
+          evaluator, evaluator.firstInput(globalid), non_const_functor, &accum);
+      output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
+    }
+  }
+
+ private:
+  Evaluator evaluator;
+  EvaluatorPointerType output_accessor;
+  Op functor;
+  Index range;
+  Index num_values_to_reduce;
+};
+
+enum class reduction_dim { inner_most, outer_most };
+// default is preserver
+template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
+struct PartialReductionKernel {
+  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
+  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
+  typedef typename Evaluator::Index Index;
+  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
+  typedef typename OpDef::type Op;
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      ScratchAcc;
+  ScratchAcc scratch;
+  Evaluator evaluator;
+  EvaluatorPointerType output_accessor;
+  Op op;
+  const Index preserve_elements_num_groups;
+  const Index reduce_elements_num_groups;
+  const Index num_coeffs_to_preserve;
+  const Index num_coeffs_to_reduce;
+
+  PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
+                         const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
+                         const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
+      : scratch(scratch_),
+        evaluator(evaluator_),
+        output_accessor(output_accessor_),
+        op(OpDef::get_op(op_)),
+        preserve_elements_num_groups(preserve_elements_num_groups_),
+        reduce_elements_num_groups(reduce_elements_num_groups_),
+        num_coeffs_to_preserve(num_coeffs_to_preserve_),
+        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
+                                                                 CoeffReturnType &accumulator) const {
+    if (globalPId >= num_coeffs_to_preserve) {
+      return;
+    }
+    Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
+                                                          : globalRId + (globalPId * num_coeffs_to_reduce);
+    Index localOffset = globalRId;
+
+    const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
+    const Index per_thread_global_stride =
+        rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
+    for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
+      op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
+      localOffset += per_thread_local_stride;
+      global_offset += per_thread_global_stride;
+    }
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
+    const Index linearLocalThreadId = itemID.get_local_id(0);
+    Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
+                                                           : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
+    Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
+                                                           : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
+    const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
+                                                           : itemID.get_group(0) / reduce_elements_num_groups;
+    const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
+                                                           : itemID.get_group(0) % reduce_elements_num_groups;
+
+    Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
+    const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
+    CoeffReturnType *scratchPtr = scratch.get_pointer();
+    auto outPtr = output_accessor + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
+    CoeffReturnType accumulator = op.initialize();
+
+    element_wise_reduce(globalRId, globalPId, accumulator);
+
+    accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
+    scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
+        accumulator;
+    if (rt == reduction_dim::inner_most) {
+      pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
+      rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
+      globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
+    }
+
+    /* Apply the reduction operation between the current local
+     * id and the one on the other half of the vector. */
+    auto out_scratch_ptr =
+        scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
+    itemID.barrier(cl::sycl::access::fence_space::local_space);
+    if (rt == reduction_dim::inner_most) {
+      accumulator = *out_scratch_ptr;
+    }
+    // The Local LocalThreadSizeR is always power of 2
+    EIGEN_UNROLL_LOOP
+    for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
+      if (rLocalThreadId < offset) {
+        op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
+        // The result has already been divided for mean reducer in the
+        // previous reduction so no need to divide furthermore
+        *out_scratch_ptr = op.finalize(accumulator);
+      }
+      /* All threads collectively read from global memory into local.
+       * The barrier ensures all threads' IO is resolved before
+       * execution continues (strictly speaking, all threads within
+       * a single work-group - there is no co-ordination between
+       * work-groups, only work-items). */
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+    }
+
+    if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
+      outPtr[globalPId] = op.finalize(accumulator);
+    }
+  }
+};
+
+template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
+struct SecondStepPartialReduction {
+  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
+  typedef typename OpDef::type Op;
+  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      ScratchAccessor;
+  InputAccessor input_accessor;
+  OutputAccessor output_accessor;
+  Op op;
+  const Index num_coeffs_to_preserve;
+  const Index num_coeffs_to_reduce;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
+                                                                   OutputAccessor output_accessor_, OpType op_,
+                                                                   const Index num_coeffs_to_preserve_,
+                                                                   const Index num_coeffs_to_reduce_)
+      : input_accessor(input_accessor_),
+        output_accessor(output_accessor_),
+        op(OpDef::get_op(op_)),
+        num_coeffs_to_preserve(num_coeffs_to_preserve_),
+        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
+    const Index globalId = itemID.get_global_id(0);
+
+    if (globalId >= num_coeffs_to_preserve) return;
+
+    auto in_ptr = input_accessor + globalId;
+
+    OutScalar accumulator = op.initialize();
+    // num_coeffs_to_reduce is not bigger that 256
+    for (Index i = 0; i < num_coeffs_to_reduce; i++) {
+      op.reduce(*in_ptr, &accumulator);
+      in_ptr += num_coeffs_to_preserve;
+    }
+    output_accessor[globalId] = op.finalize(accumulator);
+  }
+};  // namespace internal
+
+template <typename Index, Index LTP, Index LTR, bool BC_>
+struct ReductionPannel {
+  static constexpr Index LocalThreadSizeP = LTP;
+  static constexpr Index LocalThreadSizeR = LTR;
+  static constexpr bool BC = BC_;
+};
+
+template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
+struct PartialReducerLauncher {
+  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  typedef typename Self::Storage Storage;
+  typedef typename Self::Index Index;
+  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
+      PannelParameters;
+
+  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;
+
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
+                  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
+    Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);
+
+    // getPowerOfTwo makes sure local range is power of 2 and <=
+    // maxSyclThreadPerBlock this will help us to avoid extra check on the
+    // kernel
+    static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
+                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
+                  "The Local thread size must be a power of 2 for the reduction "
+                  "operation");
+
+    constexpr Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;
+    // In this step, we force the code not to be more than 2-step reduction:
+    // Our empirical research shows that if each thread reduces at least 64
+    // elements individually, we get better performance. However, this can change
+    // on different platforms. In this step we force the code not to be
+    // morthan step reduction: Our empirical research shows that for inner_most
+    // dim reducer, it is better to have 8 group in a reduce dimension for sizes
+    // > 1024 to achieve the best performance.
+    const Index reductionPerThread = 64;
+    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
+    const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
+    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
+    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
+    const Index globalRange = pNumGroups * rNumGroups * localRange;
+
+    constexpr Index scratchSize =
+        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
+    if (rNumGroups > 1) {
+      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
+          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
+      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
+      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
+             self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
+             num_coeffs_to_reduce)
+          .wait();
+      typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
+          SecondStepPartialReductionKernel;
+      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
+             temp_accessor, output,
+             cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)),
+             Index(1), reducer, num_coeffs_to_preserve, rNumGroups)
+          .wait();
+      self.device().deallocate_temp(temp_pointer);
+    } else {
+      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
+             self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
+             num_coeffs_to_reduce)
+          .wait();
+    }
+    return false;
+  }
+};
+}  // namespace internal
+}  // namespace TensorSycl
+
+namespace internal {
+
+template <typename Self, typename Op, bool Vectorizable>
+struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
+  static constexpr bool HasOptimizedImplementation = true;
+  static constexpr int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
+  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
+    typedef std::conditional_t<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType> OutType;
+    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
+                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
+                  "The Local thread size must be a power of 2 for the reduction "
+                  "operation");
+    constexpr Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;
+
+    typename Self::Index inputSize = self.impl().dimensions().TotalSize();
+    // In this step we force the code not to be more than 2-step reduction:
+    // Our empirical research shows that if each thread reduces at least 512
+    // elements individually, we get better performance.
+    const Index reductionPerThread = 2048;
+    // const Index num_work_group =
+    Index reductionGroup = dev.getPowerOfTwo(
+        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
+    const Index num_work_group = std::min(reductionGroup, local_range);
+    // 1
+    // ? local_range
+    // : 1);
+    const Index global_range = num_work_group * local_range;
+
+    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
+    if (num_work_group > 1) {
+      CoeffReturnType *temp_pointer =
+          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
+      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
+      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
+                                                                      local_range, inputSize, reducer)
+          .wait();
+      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
+                                                          EvaluatorPointerType, Index, local_range>
+          GenericRKernel;
+      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
+             tmp_global_accessor, data,
+             cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)),
+             num_work_group, reducer)
+          .wait();
+      dev.deallocate_temp(temp_pointer);
+    } else {
+      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
+                                                                      reducer)
+          .wait();
+    }
+  }
+};
+// vectorizable inner_most most dim preserver
+// col reduction
+template <typename Self, typename Op>
+struct OuterReducer<Self, Op, Eigen::SyclDevice> {
+  static constexpr bool HasOptimizedImplementation = true;
+
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
+                  typename Self::Index num_coeffs_to_preserve) {
+    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
+        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
+                                                                                 num_coeffs_to_reduce,
+                                                                                 num_coeffs_to_preserve);
+  }
+};
+// row reduction
+template <typename Self, typename Op>
+struct InnerReducer<Self, Op, Eigen::SyclDevice> {
+  static constexpr bool HasOptimizedImplementation = true;
+
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
+                  typename Self::Index num_coeffs_to_preserve) {
+    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
+        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
+                                                                                 num_coeffs_to_reduce,
+                                                                                 num_coeffs_to_preserve);
+  }
+};
+
+// ArmgMax uses this kernel for partial reduction//
+// TODO(@mehdi.goli) come up with a better kernel
+// generic partial reduction
+template <typename Self, typename Op>
+struct GenericReducer<Self, Op, Eigen::SyclDevice> {
+  static constexpr bool HasOptimizedImplementation = false;
+  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
+                  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
+                  typename Self::Index num_coeffs_to_preserve) {
+    typename Self::Index range, GRange, tileSize;
+    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);
+
+    dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
+                                       TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
+           self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
+           reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1))
+        .wait();
+    return false;
+  }
+};
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
new file mode 100644
index 00000000..98223fe7
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h
@@ -0,0 +1,391 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REF_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REF_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Dimensions, typename Scalar>
+class TensorLazyBaseEvaluator {
+ public:
+  TensorLazyBaseEvaluator() : m_refcount(0) {}
+  virtual ~TensorLazyBaseEvaluator() {}
+
+  EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const = 0;
+  EIGEN_DEVICE_FUNC virtual const Scalar* data() const = 0;
+
+  EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const = 0;
+  EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) = 0;
+
+  void incrRefCount() { ++m_refcount; }
+  void decrRefCount() { --m_refcount; }
+  int refCount() const { return m_refcount; }
+
+ private:
+  // No copy, no assignment;
+  TensorLazyBaseEvaluator(const TensorLazyBaseEvaluator& other);
+  TensorLazyBaseEvaluator& operator=(const TensorLazyBaseEvaluator& other);
+
+  int m_refcount;
+};
+
+template <typename Dimensions, typename Expr, typename Device>
+class TensorLazyEvaluatorReadOnly
+    : public TensorLazyBaseEvaluator<Dimensions, typename TensorEvaluator<Expr, Device>::Scalar> {
+ public:
+  //  typedef typename TensorEvaluator<Expr, Device>::Dimensions Dimensions;
+  typedef typename TensorEvaluator<Expr, Device>::Scalar Scalar;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+  typedef TensorEvaluator<Expr, Device> EvalType;
+
+  TensorLazyEvaluatorReadOnly(const Expr& expr, const Device& device) : m_impl(expr, device), m_dummy(Scalar(0)) {
+    EIGEN_STATIC_ASSERT(
+        internal::array_size<Dimensions>::value == internal::array_size<typename EvalType::Dimensions>::value,
+        "Dimension sizes must match.");
+    const auto& other_dims = m_impl.dimensions();
+    for (std::size_t i = 0; i < m_dims.size(); ++i) {
+      m_dims[i] = other_dims[i];
+    }
+    m_impl.evalSubExprsIfNeeded(NULL);
+  }
+  virtual ~TensorLazyEvaluatorReadOnly() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC virtual const Dimensions& dimensions() const { return m_dims; }
+  EIGEN_DEVICE_FUNC virtual const Scalar* data() const { return m_impl.data(); }
+
+  EIGEN_DEVICE_FUNC virtual const Scalar coeff(DenseIndex index) const { return m_impl.coeff(index); }
+  EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex /*index*/) {
+    eigen_assert(false && "can't reference the coefficient of a rvalue");
+    return m_dummy;
+  };
+
+ protected:
+  TensorEvaluator<Expr, Device> m_impl;
+  Dimensions m_dims;
+  Scalar m_dummy;
+};
+
+template <typename Dimensions, typename Expr, typename Device>
+class TensorLazyEvaluatorWritable : public TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> {
+ public:
+  typedef TensorLazyEvaluatorReadOnly<Dimensions, Expr, Device> Base;
+  typedef typename Base::Scalar Scalar;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  TensorLazyEvaluatorWritable(const Expr& expr, const Device& device) : Base(expr, device) {}
+  virtual ~TensorLazyEvaluatorWritable() {}
+
+  EIGEN_DEVICE_FUNC virtual Scalar& coeffRef(DenseIndex index) { return this->m_impl.coeffRef(index); }
+};
+
+template <typename Dimensions, typename Expr, typename Device, bool IsWritable>
+class TensorLazyEvaluator : public std::conditional_t<IsWritable, TensorLazyEvaluatorWritable<Dimensions, Expr, Device>,
+                                                      TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device>> {
+ public:
+  typedef std::conditional_t<IsWritable, TensorLazyEvaluatorWritable<Dimensions, Expr, Device>,
+                             TensorLazyEvaluatorReadOnly<Dimensions, const Expr, Device>>
+      Base;
+  typedef typename Base::Scalar Scalar;
+
+  TensorLazyEvaluator(const Expr& expr, const Device& device) : Base(expr, device) {}
+  virtual ~TensorLazyEvaluator() {}
+};
+
+template <typename Derived>
+class TensorRefBase : public TensorBase<Derived> {
+ public:
+  typedef typename traits<Derived>::PlainObjectType PlainObjectType;
+  typedef typename PlainObjectType::Base Base;
+  typedef typename Eigen::internal::nested<Derived>::type Nested;
+  typedef typename traits<PlainObjectType>::StorageKind StorageKind;
+  typedef typename traits<PlainObjectType>::Index Index;
+  typedef typename traits<PlainObjectType>::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef typename Base::CoeffReturnType CoeffReturnType;
+  typedef Scalar* PointerType;
+  typedef PointerType PointerArgType;
+
+  static constexpr Index NumIndices = PlainObjectType::NumIndices;
+  typedef typename PlainObjectType::Dimensions Dimensions;
+
+  static constexpr int Layout = PlainObjectType::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess = false,
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -----------===//
+  typedef TensorBlockNotImplemented TensorBlock;
+  //===------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorRefBase() : m_evaluator(NULL) {}
+
+  TensorRefBase(const TensorRefBase& other) : TensorBase<Derived>(other), m_evaluator(other.m_evaluator) {
+    eigen_assert(m_evaluator->refCount() > 0);
+    m_evaluator->incrRefCount();
+  }
+
+  TensorRefBase& operator=(const TensorRefBase& other) {
+    if (this != &other) {
+      unrefEvaluator();
+      m_evaluator = other.m_evaluator;
+      eigen_assert(m_evaluator->refCount() > 0);
+      m_evaluator->incrRefCount();
+    }
+    return *this;
+  }
+
+  template <typename Expression,
+            typename EnableIf = std::enable_if_t<!std::is_same<std::decay_t<Expression>, Derived>::value>>
+  EIGEN_STRONG_INLINE TensorRefBase(const Expression& expr)
+      : m_evaluator(new TensorLazyEvaluator<Dimensions, Expression, DefaultDevice,
+                                            /*IsWritable=*/!std::is_const<PlainObjectType>::value &&
+                                                bool(is_lvalue<Expression>::value)>(expr, DefaultDevice())) {
+    m_evaluator->incrRefCount();
+  }
+
+  template <typename Expression,
+            typename EnableIf = std::enable_if_t<!std::is_same<std::decay_t<Expression>, Derived>::value>>
+  EIGEN_STRONG_INLINE TensorRefBase& operator=(const Expression& expr) {
+    unrefEvaluator();
+    m_evaluator = new TensorLazyEvaluator < Dimensions, Expression, DefaultDevice,
+    /*IsWritable=*/!std::is_const<PlainObjectType>::value&& bool(is_lvalue<Expression>::value) >
+        (expr, DefaultDevice());
+    m_evaluator->incrRefCount();
+    return *this;
+  }
+
+  ~TensorRefBase() { unrefEvaluator(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rank() const { return m_evaluator->dimensions().size(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index dimension(Index n) const { return m_evaluator->dimensions()[n]; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_evaluator->dimensions(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_evaluator->dimensions().TotalSize(); }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar* data() const { return m_evaluator->data(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(Index index) const { return m_evaluator->coeff(index); }
+
+  template <typename... IndexTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const {
+    const std::size_t num_indices = (sizeof...(otherIndices) + 1);
+    const array<Index, num_indices> indices{{firstIndex, otherIndices...}};
+    return coeff(indices);
+  }
+
+  template <std::size_t NumIndices>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(const array<Index, NumIndices>& indices) const {
+    const Dimensions& dims = this->dimensions();
+    Index index = 0;
+    if (PlainObjectType::Options & RowMajor) {
+      index += indices[0];
+      for (size_t i = 1; i < NumIndices; ++i) {
+        index = index * dims[i] + indices[i];
+      }
+    } else {
+      index += indices[NumIndices - 1];
+      for (int i = NumIndices - 2; i >= 0; --i) {
+        index = index * dims[i] + indices[i];
+      }
+    }
+    return m_evaluator->coeff(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index index) const { return m_evaluator->coeff(index); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return m_evaluator->coeffRef(index); }
+
+ protected:
+  TensorLazyBaseEvaluator<Dimensions, Scalar>* evaluator() { return m_evaluator; }
+
+ private:
+  EIGEN_STRONG_INLINE void unrefEvaluator() {
+    if (m_evaluator) {
+      m_evaluator->decrRefCount();
+      if (m_evaluator->refCount() == 0) {
+        delete m_evaluator;
+      }
+    }
+  }
+
+  TensorLazyBaseEvaluator<Dimensions, Scalar>* m_evaluator;
+};
+
+}  // namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief A reference to a tensor expression
+ * The expression will be evaluated lazily (as much as possible).
+ *
+ */
+template <typename PlainObjectType>
+class TensorRef : public internal::TensorRefBase<TensorRef<PlainObjectType>> {
+  typedef internal::TensorRefBase<TensorRef<PlainObjectType>> Base;
+
+ public:
+  using Scalar = typename Base::Scalar;
+  using Dimensions = typename Base::Dimensions;
+
+  EIGEN_STRONG_INLINE TensorRef() : Base() {}
+
+  EIGEN_STRONG_INLINE TensorRef(const TensorRef& other) : Base(other) {}
+
+  template <typename Expression>
+  EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : Base(expr) {
+    EIGEN_STATIC_ASSERT(internal::is_lvalue<Expression>::value,
+                        "Expression must be mutable to create a mutable TensorRef<Expression>.  Did you mean "
+                        "TensorRef<const Expression>?)");
+  }
+
+  TensorRef& operator=(const TensorRef& other) { return Base::operator=(other).derived(); }
+
+  template <typename Expression>
+  EIGEN_STRONG_INLINE TensorRef& operator=(const Expression& expr) {
+    EIGEN_STATIC_ASSERT(internal::is_lvalue<Expression>::value,
+                        "Expression must be mutable to create a mutable TensorRef<Expression>.  Did you mean "
+                        "TensorRef<const Expression>?)");
+    return Base::operator=(expr).derived();
+  }
+
+  template <typename... IndexTypes>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) {
+    const std::size_t num_indices = (sizeof...(otherIndices) + 1);
+    const array<Index, num_indices> indices{{firstIndex, otherIndices...}};
+    return coeffRef(indices);
+  }
+
+  template <std::size_t NumIndices>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(const array<Index, NumIndices>& indices) {
+    const Dimensions& dims = this->dimensions();
+    Index index = 0;
+    if (PlainObjectType::Options & RowMajor) {
+      index += indices[0];
+      for (size_t i = 1; i < NumIndices; ++i) {
+        index = index * dims[i] + indices[i];
+      }
+    } else {
+      index += indices[NumIndices - 1];
+      for (int i = NumIndices - 2; i >= 0; --i) {
+        index = index * dims[i] + indices[i];
+      }
+    }
+    return Base::evaluator()->coeffRef(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return Base::evaluator()->coeffRef(index); }
+};
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief A reference to a constant tensor expression
+ * The expression will be evaluated lazily (as much as possible).
+ *
+ */
+template <typename PlainObjectType>
+class TensorRef<const PlainObjectType> : public internal::TensorRefBase<TensorRef<const PlainObjectType>> {
+  typedef internal::TensorRefBase<TensorRef<const PlainObjectType>> Base;
+
+ public:
+  EIGEN_STRONG_INLINE TensorRef() : Base() {}
+
+  EIGEN_STRONG_INLINE TensorRef(const TensorRef& other) : Base(other) {}
+
+  template <typename Expression>
+  EIGEN_STRONG_INLINE TensorRef(const Expression& expr) : Base(expr) {}
+
+  TensorRef& operator=(const TensorRef& other) { return Base::operator=(other).derived(); }
+
+  template <typename Expression>
+  EIGEN_STRONG_INLINE TensorRef& operator=(const Expression& expr) {
+    return Base::operator=(expr).derived();
+  }
+};
+
+// evaluator for rvalues
+template <typename Derived, typename Device>
+struct TensorEvaluator<const TensorRef<Derived>, Device> {
+  typedef typename Derived::Index Index;
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef typename Derived::Dimensions Dimensions;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int Layout = TensorRef<Derived>::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess = false,
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const TensorRef<Derived>& m, const Device&) : m_ref(m) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_ref.dimensions(); }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) { return true; }
+
+  EIGEN_STRONG_INLINE void cleanup() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_ref.coeff(index); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return m_ref.coeffRef(index); }
+
+  EIGEN_DEVICE_FUNC const Scalar* data() const { return m_ref.data(); }
+
+ protected:
+  TensorRef<Derived> m_ref;
+};
+
+// evaluator for lvalues
+template <typename Derived, typename Device>
+struct TensorEvaluator<TensorRef<Derived>, Device> : public TensorEvaluator<const TensorRef<Derived>, Device> {
+  typedef typename Derived::Index Index;
+  typedef typename Derived::Scalar Scalar;
+  typedef typename Derived::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef typename Derived::Dimensions Dimensions;
+
+  typedef TensorEvaluator<const TensorRef<Derived>, Device> Base;
+
+  enum { IsAligned = false, PacketAccess = false, BlockAccess = false, PreferBlockAccess = false, RawAccess = false };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(TensorRef<Derived>& m, const Device& d) : Base(m, d) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { return this->m_ref.coeffRef(index); }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_REF_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
new file mode 100644
index 00000000..4f167e7c
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h
@@ -0,0 +1,410 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Navdeep Jaitly <ndjaitly@google.com>
+//                    Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename ReverseDimensions, typename XprType>
+struct traits<TensorReverseOp<ReverseDimensions, XprType> > : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+};
+
+template <typename ReverseDimensions, typename XprType>
+struct eval<TensorReverseOp<ReverseDimensions, XprType>, Eigen::Dense> {
+  typedef const TensorReverseOp<ReverseDimensions, XprType>& type;
+};
+
+template <typename ReverseDimensions, typename XprType>
+struct nested<TensorReverseOp<ReverseDimensions, XprType>, 1,
+              typename eval<TensorReverseOp<ReverseDimensions, XprType> >::type> {
+  typedef TensorReverseOp<ReverseDimensions, XprType> type;
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor reverse elements class.
+ *
+ */
+template <typename ReverseDimensions, typename XprType>
+class TensorReverseOp : public TensorBase<TensorReverseOp<ReverseDimensions, XprType>, WriteAccessors> {
+ public:
+  typedef TensorBase<TensorReverseOp<ReverseDimensions, XprType>, WriteAccessors> Base;
+  typedef typename Eigen::internal::traits<TensorReverseOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorReverseOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorReverseOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorReverseOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorReverseOp(const XprType& expr, const ReverseDimensions& reverse_dims)
+      : m_xpr(expr), m_reverse_dims(reverse_dims) {}
+
+  EIGEN_DEVICE_FUNC const ReverseDimensions& reverse() const { return m_reverse_dims; }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+  EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorReverseOp)
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const ReverseDimensions m_reverse_dims;
+};
+
+// Eval as rvalue
+template <typename ReverseDimensions, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device> {
+  typedef TensorReverseOp<ReverseDimensions, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static constexpr int NumDims = internal::array_size<ReverseDimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = NumDims > 0,
+    PreferBlockAccess = true,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  typedef internal::TensorIntDivisor<Index> IndexDivisor;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock ArgTensorBlock;
+
+  typedef typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims, Layout, Index> TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_reverse(op.reverse()), m_device(device) {
+    // Reversing a scalar isn't supported yet. It would be a no-op anyway.
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    // Compute strides
+    m_dimensions = m_impl.dimensions();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_strides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
+        if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]);
+      }
+    } else {
+      m_strides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
+        if (m_strides[i] > 0) m_fastStrides[i] = IndexDivisor(m_strides[i]);
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index reverseIndex(Index index) const {
+    eigen_assert(index < dimensions().TotalSize());
+    Index inputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
+      for (int i = NumDims - 1; i > 0; --i) {
+        Index idx = index / m_fastStrides[i];
+        index -= idx * m_strides[i];
+        if (m_reverse[i]) {
+          idx = m_dimensions[i] - idx - 1;
+        }
+        inputIndex += idx * m_strides[i];
+      }
+      if (m_reverse[0]) {
+        inputIndex += (m_dimensions[0] - index - 1);
+      } else {
+        inputIndex += index;
+      }
+    } else {
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < NumDims - 1; ++i) {
+        Index idx = index / m_fastStrides[i];
+        index -= idx * m_strides[i];
+        if (m_reverse[i]) {
+          idx = m_dimensions[i] - idx - 1;
+        }
+        inputIndex += idx * m_strides[i];
+      }
+      if (m_reverse[NumDims - 1]) {
+        inputIndex += (m_dimensions[NumDims - 1] - index - 1);
+      } else {
+        inputIndex += index;
+      }
+    }
+    return inputIndex;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return m_impl.coeff(reverseIndex(index));
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    // TODO(ndjaitly): write a better packing routine that uses
+    // local structure.
+    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = coeff(index + i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.lastLevelCacheSize();
+    // Block evaluation reads underlying memory in reverse order, and default
+    // cost model does not properly catch this in bytes stored/loaded.
+    return internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size).addCostPerCoeff({0, 0, 24});
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    // TODO(ezhulenev): If underlying tensor expression supports and prefers
+    // block evaluation we must use it. Currently we use coeff and packet
+    // access into the underlying tensor expression.
+    // static const bool useBlockAccessForArgType =
+    //     TensorEvaluator<ArgType, Device>::BlockAccess &&
+    //     TensorEvaluator<ArgType, Device>::PreferBlockAccess;
+
+    static const bool isColMajor = static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+    static const Index inner_dim_idx = isColMajor ? 0 : NumDims - 1;
+    const bool inner_dim_reversed = m_reverse[inner_dim_idx];
+
+    // Offset in the output block.
+    Index block_offset = 0;
+
+    // Offset in the input Tensor.
+    Index input_offset = reverseIndex(desc.offset());
+
+    // Initialize output block iterator state. Dimension in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims> it;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = isColMajor ? i : NumDims - 1 - i;
+      it[i].size = desc.dimension(dim);
+      it[i].count = 0;
+      it[i].reverse = m_reverse[dim];
+
+      it[i].block_stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].block_stride);
+      it[i].block_span = it[i].block_stride * (it[i].size - 1);
+
+      it[i].input_stride = m_strides[dim];
+      it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+      if (it[i].reverse) {
+        it[i].input_stride = -1 * it[i].input_stride;
+        it[i].input_span = -1 * it[i].input_span;
+      }
+    }
+
+    // If multiple inner dimensions have the same reverse flag, check if we can
+    // merge them into a single virtual inner dimension.
+    int effective_inner_dim = 0;
+    for (int i = 1; i < NumDims; ++i) {
+      if (it[i].reverse != it[effective_inner_dim].reverse) break;
+      if (it[i].block_stride != it[effective_inner_dim].size) break;
+      if (it[i].block_stride != numext::abs(it[i].input_stride)) break;
+
+      it[i].size = it[effective_inner_dim].size * it[i].size;
+
+      it[i].block_stride = 1;
+      it[i].input_stride = (inner_dim_reversed ? -1 : 1);
+
+      it[i].block_span = it[i].block_stride * (it[i].size - 1);
+      it[i].input_span = it[i].input_stride * (it[i].size - 1);
+
+      effective_inner_dim = i;
+    }
+
+    eigen_assert(it[effective_inner_dim].block_stride == 1);
+    eigen_assert(it[effective_inner_dim].input_stride == (inner_dim_reversed ? -1 : 1));
+
+    const Index inner_dim_size = it[effective_inner_dim].size;
+
+    // Prepare storage for the materialized reverse result.
+    const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch);
+    CoeffReturnType* block_buffer = block_storage.data();
+
+    while (it[NumDims - 1].count < it[NumDims - 1].size) {
+      // Copy inner-most dimension data from reversed location in input.
+      Index dst = block_offset;
+      Index src = input_offset;
+
+      // NOTE(ezhulenev): Adding vectorized path with internal::preverse showed
+      // worse results in benchmarks than a simple coefficient loop.
+      if (inner_dim_reversed) {
+        for (Index i = 0; i < inner_dim_size; ++i) {
+          block_buffer[dst] = m_impl.coeff(src);
+          ++dst;
+          --src;
+        }
+      } else {
+        for (Index i = 0; i < inner_dim_size; ++i) {
+          block_buffer[dst] = m_impl.coeff(src);
+          ++dst;
+          ++src;
+        }
+      }
+
+      // For the 1d tensor we need to generate only one inner-most dimension.
+      if ((NumDims - effective_inner_dim) == 1) break;
+
+      // Update offset.
+      for (Index i = effective_inner_dim + 1; i < NumDims; ++i) {
+        if (++it[i].count < it[i].size) {
+          block_offset += it[i].block_stride;
+          input_offset += it[i].input_stride;
+          break;
+        }
+        if (i != NumDims - 1) it[i].count = 0;
+        block_offset -= it[i].block_span;
+        input_offset -= it[i].input_span;
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+                                     TensorOpCost::DivCost<Index>());
+    for (int i = 0; i < NumDims; ++i) {
+      if (m_reverse[i]) {
+        compute_cost += 2 * TensorOpCost::AddCost<Index>();
+      }
+    }
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
+
+ protected:
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_strides;
+  array<IndexDivisor, NumDims> m_fastStrides;
+  TensorEvaluator<ArgType, Device> m_impl;
+  ReverseDimensions m_reverse;
+  const Device EIGEN_DEVICE_REF m_device;
+
+ private:
+  struct BlockIteratorState {
+    BlockIteratorState()
+        : size(0), count(0), reverse(false), block_stride(0), block_span(0), input_stride(0), input_span(0) {}
+
+    Index size;
+    Index count;
+    bool reverse;
+    Index block_stride;
+    Index block_span;
+    Index input_stride;
+    Index input_span;
+  };
+};
+
+// Eval as lvalue
+
+template <typename ReverseDimensions, typename ArgType, typename Device>
+struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device>
+    : public TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device> {
+  typedef TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device> Base;
+  typedef TensorReverseOp<ReverseDimensions, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static constexpr int NumDims = internal::array_size<ReverseDimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {}
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return this->m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const {
+    return this->m_impl.coeffRef(this->reverseIndex(index));
+  }
+
+  template <int StoreMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const {
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    // This code is pilfered from TensorMorphing.h
+    EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
+    internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < PacketSize; ++i) {
+      this->coeffRef(index + i) = values[i];
+    }
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_REVERSE_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorRoll.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorRoll.h
new file mode 100644
index 00000000..e4b5181c
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorRoll.h
@@ -0,0 +1,361 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2024 Tobias Wood tobias@spinicist.org.uk
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_ROLL_H
+#define EIGEN_CXX11_TENSOR_TENSOR_ROLL_H
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename RollDimensions, typename XprType>
+struct traits<TensorRollOp<RollDimensions, XprType> > : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+};
+
+template <typename RollDimensions, typename XprType>
+struct eval<TensorRollOp<RollDimensions, XprType>, Eigen::Dense> {
+  typedef const TensorRollOp<RollDimensions, XprType>& type;
+};
+
+template <typename RollDimensions, typename XprType>
+struct nested<TensorRollOp<RollDimensions, XprType>, 1, typename eval<TensorRollOp<RollDimensions, XprType> >::type> {
+  typedef TensorRollOp<RollDimensions, XprType> type;
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor roll (circular shift) elements class.
+ *
+ */
+template <typename RollDimensions, typename XprType>
+class TensorRollOp : public TensorBase<TensorRollOp<RollDimensions, XprType>, WriteAccessors> {
+ public:
+  typedef TensorBase<TensorRollOp<RollDimensions, XprType>, WriteAccessors> Base;
+  typedef typename Eigen::internal::traits<TensorRollOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorRollOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorRollOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorRollOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorRollOp(const XprType& expr, const RollDimensions& roll_dims)
+      : m_xpr(expr), m_roll_dims(roll_dims) {}
+
+  EIGEN_DEVICE_FUNC const RollDimensions& roll() const { return m_roll_dims; }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+  EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorRollOp)
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const RollDimensions m_roll_dims;
+};
+
+// Eval as rvalue
+template <typename RollDimensions, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorRollOp<RollDimensions, ArgType>, Device> {
+  typedef TensorRollOp<RollDimensions, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static constexpr int NumDims = internal::array_size<RollDimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = NumDims > 0,
+    PreferBlockAccess = true,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  typedef internal::TensorIntDivisor<Index> IndexDivisor;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  using TensorBlockDesc = internal::TensorBlockDescriptor<NumDims, Index>;
+  using TensorBlockScratch = internal::TensorBlockScratchAllocator<Device>;
+  using ArgTensorBlock = typename TensorEvaluator<const ArgType, Device>::TensorBlock;
+  using TensorBlock = typename internal::TensorMaterializedBlock<CoeffReturnType, NumDims, Layout, Index>;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_rolls(op.roll()), m_device(device) {
+    EIGEN_STATIC_ASSERT((NumDims > 0), Must_Have_At_Least_One_Dimension_To_Roll);
+
+    // Compute strides
+    m_dimensions = m_impl.dimensions();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_strides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_strides[i] = m_strides[i - 1] * m_dimensions[i - 1];
+        if (m_strides[i] > 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
+      }
+    } else {
+      m_strides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_strides[i] = m_strides[i + 1] * m_dimensions[i + 1];
+        if (m_strides[i] > 0) m_fast_strides[i] = IndexDivisor(m_strides[i]);
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
+    m_impl.evalSubExprsIfNeeded(nullptr);
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index roll(Index const i, Index const r, Index const n) const {
+    auto const tmp = (i + r) % n;
+    if (tmp < 0) {
+      return tmp + n;
+    } else {
+      return tmp;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array<Index, NumDims> rollCoords(array<Index, NumDims> const& coords) const {
+    array<Index, NumDims> rolledCoords;
+    for (int id = 0; id < NumDims; id++) {
+      eigen_assert(coords[id] < m_dimensions[id]);
+      rolledCoords[id] = roll(coords[id], m_rolls[id], m_dimensions[id]);
+    }
+    return rolledCoords;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rollIndex(Index index) const {
+    eigen_assert(index < dimensions().TotalSize());
+    Index rolledIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
+      for (int i = NumDims - 1; i > 0; --i) {
+        Index idx = index / m_fast_strides[i];
+        index -= idx * m_strides[i];
+        rolledIndex += roll(idx, m_rolls[i], m_dimensions[i]) * m_strides[i];
+      }
+      rolledIndex += roll(index, m_rolls[0], m_dimensions[0]);
+    } else {
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < NumDims - 1; ++i) {
+        Index idx = index / m_fast_strides[i];
+        index -= idx * m_strides[i];
+        rolledIndex += roll(idx, m_rolls[i], m_dimensions[i]) * m_strides[i];
+      }
+      rolledIndex += roll(index, m_rolls[NumDims - 1], m_dimensions[NumDims - 1]);
+    }
+    return rolledIndex;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return m_impl.coeff(rollIndex(index));
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = coeff(index + i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    const size_t target_size = m_device.lastLevelCacheSize();
+    return internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size).addCostPerCoeff({0, 0, 24});
+  }
+
+  struct BlockIteratorState {
+    Index stride;
+    Index span;
+    Index size;
+    Index count;
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool /*root_of_expr_ast*/ = false) const {
+    static const bool is_col_major = static_cast<int>(Layout) == static_cast<int>(ColMajor);
+
+    // Compute spatial coordinates for the first block element.
+    array<Index, NumDims> coords;
+    extract_coordinates(desc.offset(), coords);
+    array<Index, NumDims> initial_coords = coords;
+    Index offset = 0;  // Offset in the output block buffer.
+
+    // Initialize output block iterator state. Dimension in this array are
+    // always in inner_most -> outer_most order (col major layout).
+    array<BlockIteratorState, NumDims> it;
+    for (int i = 0; i < NumDims; ++i) {
+      const int dim = is_col_major ? i : NumDims - 1 - i;
+      it[i].size = desc.dimension(dim);
+      it[i].stride = i == 0 ? 1 : (it[i - 1].size * it[i - 1].stride);
+      it[i].span = it[i].stride * (it[i].size - 1);
+      it[i].count = 0;
+    }
+    eigen_assert(it[0].stride == 1);
+
+    // Prepare storage for the materialized generator result.
+    const typename TensorBlock::Storage block_storage = TensorBlock::prepareStorage(desc, scratch);
+    CoeffReturnType* block_buffer = block_storage.data();
+
+    static const int inner_dim = is_col_major ? 0 : NumDims - 1;
+    const Index inner_dim_size = it[0].size;
+
+    while (it[NumDims - 1].count < it[NumDims - 1].size) {
+      Index i = 0;
+      for (; i < inner_dim_size; ++i) {
+        auto const rolled = rollCoords(coords);
+        auto const index = is_col_major ? m_dimensions.IndexOfColMajor(rolled) : m_dimensions.IndexOfRowMajor(rolled);
+        *(block_buffer + offset + i) = m_impl.coeff(index);
+        coords[inner_dim]++;
+      }
+      coords[inner_dim] = initial_coords[inner_dim];
+
+      if (NumDims == 1) break;  // For the 1d tensor we need to generate only one inner-most dimension.
+
+      // Update offset.
+      for (i = 1; i < NumDims; ++i) {
+        if (++it[i].count < it[i].size) {
+          offset += it[i].stride;
+          coords[is_col_major ? i : NumDims - 1 - i]++;
+          break;
+        }
+        if (i != NumDims - 1) it[i].count = 0;
+        coords[is_col_major ? i : NumDims - 1 - i] = initial_coords[is_col_major ? i : NumDims - 1 - i];
+        offset -= it[i].span;
+      }
+    }
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    double compute_cost = NumDims * (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
+                                     TensorOpCost::DivCost<Index>());
+    for (int i = 0; i < NumDims; ++i) {
+      compute_cost += 2 * TensorOpCost::AddCost<Index>();
+    }
+    return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost, false /* vectorized */, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return nullptr; }
+
+ protected:
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_strides;
+  array<IndexDivisor, NumDims> m_fast_strides;
+  TensorEvaluator<ArgType, Device> m_impl;
+  RollDimensions m_rolls;
+  const Device EIGEN_DEVICE_REF m_device;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void extract_coordinates(Index index, array<Index, NumDims>& coords) const {
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = index / m_fast_strides[i];
+        index -= idx * m_strides[i];
+        coords[i] = idx;
+      }
+      coords[0] = index;
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = index / m_fast_strides[i];
+        index -= idx * m_strides[i];
+        coords[i] = idx;
+      }
+      coords[NumDims - 1] = index;
+    }
+  }
+
+ private:
+};
+
+// Eval as lvalue
+
+template <typename RollDimensions, typename ArgType, typename Device>
+struct TensorEvaluator<TensorRollOp<RollDimensions, ArgType>, Device>
+    : public TensorEvaluator<const TensorRollOp<RollDimensions, ArgType>, Device> {
+  typedef TensorEvaluator<const TensorRollOp<RollDimensions, ArgType>, Device> Base;
+  typedef TensorRollOp<RollDimensions, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static constexpr int NumDims = internal::array_size<RollDimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    CoordAccess = false,
+    RawAccess = false
+  };
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {}
+
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return this->m_dimensions; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const {
+    return this->m_impl.coeffRef(this->rollIndex(index));
+  }
+
+  template <int StoreMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const {
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+    EIGEN_ALIGN_MAX CoeffReturnType values[PacketSize];
+    internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < PacketSize; ++i) {
+      this->coeffRef(index + i) = values[i];
+    }
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_ROLL_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
new file mode 100644
index 00000000..6de08679
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -0,0 +1,474 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
+#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename Op, typename XprType>
+struct traits<TensorScanOp<Op, XprType> > : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+};
+
+template <typename Op, typename XprType>
+struct eval<TensorScanOp<Op, XprType>, Eigen::Dense> {
+  typedef const TensorScanOp<Op, XprType>& type;
+};
+
+template <typename Op, typename XprType>
+struct nested<TensorScanOp<Op, XprType>, 1, typename eval<TensorScanOp<Op, XprType> >::type> {
+  typedef TensorScanOp<Op, XprType> type;
+};
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor scan class.
+ */
+template <typename Op, typename XprType>
+class TensorScanOp : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorScanOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorScanOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp(const XprType& expr, const Index& axis, bool exclusive = false,
+                                                     const Op& op = Op())
+      : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index axis() const { return m_axis; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprType& expression() const { return m_expr; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op accumulator() const { return m_accumulator; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { return m_exclusive; }
+
+ protected:
+  typename XprType::Nested m_expr;
+  const Index m_axis;
+  const Op m_accumulator;
+  const bool m_exclusive;
+};
+
+namespace internal {
+
+template <typename Self>
+EIGEN_STRONG_INLINE void ReduceScalar(Self& self, Index offset, typename Self::CoeffReturnType* data) {
+  // Compute the scan along the axis, starting at the given offset
+  typename Self::CoeffReturnType accum = self.accumulator().initialize();
+  if (self.stride() == 1) {
+    if (self.exclusive()) {
+      for (Index curr = offset; curr < offset + self.size(); ++curr) {
+        data[curr] = self.accumulator().finalize(accum);
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+      }
+    } else {
+      for (Index curr = offset; curr < offset + self.size(); ++curr) {
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+        data[curr] = self.accumulator().finalize(accum);
+      }
+    }
+  } else {
+    if (self.exclusive()) {
+      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+        Index curr = offset + idx3 * self.stride();
+        data[curr] = self.accumulator().finalize(accum);
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+      }
+    } else {
+      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+        Index curr = offset + idx3 * self.stride();
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+        data[curr] = self.accumulator().finalize(accum);
+      }
+    }
+  }
+}
+
+template <typename Self>
+EIGEN_STRONG_INLINE void ReducePacket(Self& self, Index offset, typename Self::CoeffReturnType* data) {
+  using Scalar = typename Self::CoeffReturnType;
+  using Packet = typename Self::PacketReturnType;
+  // Compute the scan along the axis, starting at the calculated offset
+  Packet accum = self.accumulator().template initializePacket<Packet>();
+  if (self.stride() == 1) {
+    if (self.exclusive()) {
+      for (Index curr = offset; curr < offset + self.size(); ++curr) {
+        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+      }
+    } else {
+      for (Index curr = offset; curr < offset + self.size(); ++curr) {
+        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+      }
+    }
+  } else {
+    if (self.exclusive()) {
+      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+        const Index curr = offset + idx3 * self.stride();
+        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+      }
+    } else {
+      for (Index idx3 = 0; idx3 < self.size(); idx3++) {
+        const Index curr = offset + idx3 * self.stride();
+        self.accumulator().reducePacket(self.inner().template packet<Unaligned>(curr), &accum);
+        internal::pstoreu<Scalar, Packet>(data + curr, self.accumulator().finalizePacket(accum));
+      }
+    }
+  }
+}
+
+template <typename Self, bool Vectorize, bool Parallel>
+struct ReduceBlock {
+  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) {
+    for (Index idx2 = 0; idx2 < self.stride(); idx2++) {
+      // Calculate the starting offset for the scan
+      Index offset = idx1 + idx2;
+      ReduceScalar(self, offset, data);
+    }
+  }
+};
+
+// Specialization for vectorized reduction.
+template <typename Self>
+struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/false> {
+  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) {
+    using Packet = typename Self::PacketReturnType;
+    const int PacketSize = internal::unpacket_traits<Packet>::size;
+    Index idx2 = 0;
+    for (; idx2 + PacketSize <= self.stride(); idx2 += PacketSize) {
+      // Calculate the starting offset for the packet scan
+      Index offset = idx1 + idx2;
+      ReducePacket(self, offset, data);
+    }
+    for (; idx2 < self.stride(); idx2++) {
+      // Calculate the starting offset for the scan
+      Index offset = idx1 + idx2;
+      ReduceScalar(self, offset, data);
+    }
+  }
+};
+
+// Single-threaded CPU implementation of scan
+template <typename Self, typename Reducer, typename Device,
+          bool Vectorize = (TensorEvaluator<typename Self::ChildTypeNoConst, Device>::PacketAccess &&
+                            internal::reducer_traits<Reducer, Device>::PacketAccess)>
+struct ScanLauncher {
+  void operator()(Self& self, typename Self::CoeffReturnType* data) const {
+    Index total_size = internal::array_prod(self.dimensions());
+
+    // We fix the index along the scan axis to 0 and perform a
+    // scan per remaining entry. The iteration is split into two nested
+    // loops to avoid an integer division by keeping track of each idx1 and
+    // idx2.
+    for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
+      ReduceBlock<Self, Vectorize, /*Parallel=*/false> block_reducer;
+      block_reducer(self, idx1, data);
+    }
+  }
+};
+
+#ifdef EIGEN_USE_THREADS
+
+// Adjust block_size to avoid false sharing of cachelines among
+// threads. Currently set to twice the cache line size on Intel and ARM
+// processors.
+EIGEN_STRONG_INLINE Index AdjustBlockSize(Index item_size, Index block_size) {
+  constexpr Index kBlockAlignment = 128;
+  const Index items_per_cacheline = numext::maxi<Index>(1, kBlockAlignment / item_size);
+  return items_per_cacheline * numext::div_ceil(block_size, items_per_cacheline);
+}
+
+template <typename Self>
+struct ReduceBlock<Self, /*Vectorize=*/true, /*Parallel=*/true> {
+  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) {
+    using Scalar = typename Self::CoeffReturnType;
+    using Packet = typename Self::PacketReturnType;
+    const int PacketSize = internal::unpacket_traits<Packet>::size;
+    Index num_scalars = self.stride();
+    Index num_packets = 0;
+    if (self.stride() >= PacketSize) {
+      num_packets = self.stride() / PacketSize;
+      self.device().parallelFor(
+          num_packets,
+          TensorOpCost(PacketSize * self.size(), PacketSize * self.size(), 16 * PacketSize * self.size(), true,
+                       PacketSize),
+          // Make the shard size large enough that two neighboring threads
+          // won't write to the same cacheline of `data`.
+          [=](Index blk_size) { return AdjustBlockSize(PacketSize * sizeof(Scalar), blk_size); },
+          [&](Index first, Index last) {
+            for (Index packet = first; packet < last; ++packet) {
+              const Index idx2 = packet * PacketSize;
+              ReducePacket(self, idx1 + idx2, data);
+            }
+          });
+      num_scalars -= num_packets * PacketSize;
+    }
+    self.device().parallelFor(
+        num_scalars, TensorOpCost(self.size(), self.size(), 16 * self.size()),
+        // Make the shard size large enough that two neighboring threads
+        // won't write to the same cacheline of `data`.
+        [=](Index blk_size) { return AdjustBlockSize(sizeof(Scalar), blk_size); },
+        [&](Index first, Index last) {
+          for (Index scalar = first; scalar < last; ++scalar) {
+            const Index idx2 = num_packets * PacketSize + scalar;
+            ReduceScalar(self, idx1 + idx2, data);
+          }
+        });
+  }
+};
+
+template <typename Self>
+struct ReduceBlock<Self, /*Vectorize=*/false, /*Parallel=*/true> {
+  EIGEN_STRONG_INLINE void operator()(Self& self, Index idx1, typename Self::CoeffReturnType* data) {
+    using Scalar = typename Self::CoeffReturnType;
+    self.device().parallelFor(
+        self.stride(), TensorOpCost(self.size(), self.size(), 16 * self.size()),
+        // Make the shard size large enough that two neighboring threads
+        // won't write to the same cacheline of `data`.
+        [=](Index blk_size) { return AdjustBlockSize(sizeof(Scalar), blk_size); },
+        [&](Index first, Index last) {
+          for (Index idx2 = first; idx2 < last; ++idx2) {
+            ReduceScalar(self, idx1 + idx2, data);
+          }
+        });
+  }
+};
+
+// Specialization for multi-threaded execution.
+template <typename Self, typename Reducer, bool Vectorize>
+struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
+  void operator()(Self& self, typename Self::CoeffReturnType* data) {
+    using Scalar = typename Self::CoeffReturnType;
+    using Packet = typename Self::PacketReturnType;
+    const int PacketSize = internal::unpacket_traits<Packet>::size;
+    const Index total_size = internal::array_prod(self.dimensions());
+    const Index inner_block_size = self.stride() * self.size();
+    bool parallelize_by_outer_blocks = (total_size >= (self.stride() * inner_block_size));
+
+    if ((parallelize_by_outer_blocks && total_size <= 4096) ||
+        (!parallelize_by_outer_blocks && self.stride() < PacketSize)) {
+      ScanLauncher<Self, Reducer, DefaultDevice, Vectorize> launcher;
+      launcher(self, data);
+      return;
+    }
+
+    if (parallelize_by_outer_blocks) {
+      // Parallelize over outer blocks.
+      const Index num_outer_blocks = total_size / inner_block_size;
+      self.device().parallelFor(
+          num_outer_blocks,
+          TensorOpCost(inner_block_size, inner_block_size, 16 * PacketSize * inner_block_size, Vectorize, PacketSize),
+          [=](Index blk_size) { return AdjustBlockSize(inner_block_size * sizeof(Scalar), blk_size); },
+          [&](Index first, Index last) {
+            for (Index idx1 = first; idx1 < last; ++idx1) {
+              ReduceBlock<Self, Vectorize, /*Parallelize=*/false> block_reducer;
+              block_reducer(self, idx1 * inner_block_size, data);
+            }
+          });
+    } else {
+      // Parallelize over inner packets/scalars dimensions when the reduction
+      // axis is not an inner dimension.
+      ReduceBlock<Self, Vectorize, /*Parallelize=*/true> block_reducer;
+      for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) {
+        block_reducer(self, idx1, data);
+      }
+    }
+  }
+};
+#endif  // EIGEN_USE_THREADS
+
+#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
+
+// GPU implementation of scan
+// TODO(ibab) This placeholder implementation performs multiple scans in
+// parallel, but it would be better to use a parallel scan algorithm and
+// optimize memory access.
+template <typename Self, typename Reducer>
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size,
+                                                        typename Self::CoeffReturnType* data) {
+  // Compute offset as in the CPU version
+  Index val = threadIdx.x + blockIdx.x * blockDim.x;
+  Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
+
+  if (offset + (self.size() - 1) * self.stride() < total_size) {
+    // Compute the scan along the axis, starting at the calculated offset
+    typename Self::CoeffReturnType accum = self.accumulator().initialize();
+    for (Index idx = 0; idx < self.size(); idx++) {
+      Index curr = offset + idx * self.stride();
+      if (self.exclusive()) {
+        data[curr] = self.accumulator().finalize(accum);
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+      } else {
+        self.accumulator().reduce(self.inner().coeff(curr), &accum);
+        data[curr] = self.accumulator().finalize(accum);
+      }
+    }
+  }
+  __syncthreads();
+}
+
+template <typename Self, typename Reducer, bool Vectorize>
+struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> {
+  void operator()(const Self& self, typename Self::CoeffReturnType* data) {
+    Index total_size = internal::array_prod(self.dimensions());
+    Index num_blocks = (total_size / self.size() + 63) / 64;
+    Index block_size = 64;
+
+    LAUNCH_GPU_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data);
+  }
+};
+#endif  // EIGEN_USE_GPU && (EIGEN_GPUCC)
+
+}  // namespace internal
+
+// Eval as rvalue
+template <typename Op, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
+  typedef TensorScanOp<Op, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  typedef const ArgType ChildTypeNoConst;
+  typedef const ArgType ChildType;
+  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef std::remove_const_t<typename XprType::Scalar> Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self;
+  typedef StorageMemory<Scalar, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess = false,
+    PreferBlockAccess = false,
+    CoordAccess = false,
+    RawAccess = true
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device),
+        m_device(device),
+        m_exclusive(op.exclusive()),
+        m_accumulator(op.accumulator()),
+        m_size(m_impl.dimensions()[op.axis()]),
+        m_stride(1),
+        m_consume_dim(op.axis()),
+        m_output(NULL) {
+    // Accumulating a scalar isn't supported.
+    EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    eigen_assert(op.axis() >= 0 && op.axis() < NumDims);
+
+    // Compute stride of scan axis
+    const Dimensions& dims = m_impl.dimensions();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < op.axis(); ++i) {
+        m_stride = m_stride * dims[i];
+      }
+    } else {
+      // dims can only be indexed through unsigned integers,
+      // so let's use an unsigned type to let the compiler knows.
+      // This prevents stupid warnings: ""'*((void*)(& evaluator)+64)[18446744073709551615]' may be used uninitialized
+      // in this function"
+      unsigned int axis = internal::convert_index<unsigned int>(op.axis());
+      for (unsigned int i = NumDims - 1; i > axis; --i) {
+        m_stride = m_stride * dims[i];
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_impl.dimensions(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const { return m_stride; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& consume_dim() const { return m_consume_dim; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const { return m_size; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const { return m_accumulator; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { return m_exclusive; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const { return m_impl; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { return m_device; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    internal::ScanLauncher<Self, Op, Device> launcher;
+    if (data) {
+      launcher(*this, data);
+      return false;
+    }
+
+    const Index total_size = internal::array_prod(dimensions());
+    m_output =
+        static_cast<EvaluatorPointerType>(m_device.get((Scalar*)m_device.allocate_temp(total_size * sizeof(Scalar))));
+    launcher(*this, m_output);
+    return true;
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const {
+    return internal::ploadt<PacketReturnType, LoadMode>(m_output + index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_output; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { return m_output[index]; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const {
+    return TensorOpCost(sizeof(CoeffReturnType), 0, 0);
+  }
+
+  EIGEN_STRONG_INLINE void cleanup() {
+    if (m_output) {
+      m_device.deallocate_temp(m_output);
+      m_output = NULL;
+    }
+    m_impl.cleanup();
+  }
+
+ protected:
+  TensorEvaluator<ArgType, Device> m_impl;
+  const Device EIGEN_DEVICE_REF m_device;
+  const bool m_exclusive;
+  Op m_accumulator;
+  const Index m_size;
+  Index m_stride;
+  Index m_consume_dim;
+  EvaluatorPointerType m_output;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
new file mode 100644
index 00000000..3636788c
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorScanSycl.h
@@ -0,0 +1,506 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Mehdi Goli    Codeplay Software Ltd.
+// Ralph Potter  Codeplay Software Ltd.
+// Luke Iwanski  Codeplay Software Ltd.
+// Contact: <eigen@codeplay.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*****************************************************************
+ * TensorScanSycl.h
+ *
+ * \brief:
+ *  Tensor Scan Sycl implement the extend  version of
+ * "Efficient parallel scan algorithms for GPUs." .for Tensor operations.
+ * The algorithm requires up to 3 stage (consequently 3 kernels) depending on
+ * the size of the tensor. In the first kernel (ScanKernelFunctor), each
+ * threads within the work-group individually reduces the allocated elements per
+ * thread in order to reduces the total number of blocks. In the next step all
+ * thread within the work-group will reduce the associated blocks into the
+ * temporary buffers. In the next kernel(ScanBlockKernelFunctor), the temporary
+ * buffer is given as an input and all the threads within a work-group scan and
+ * reduces the boundaries between the blocks (generated from the previous
+ * kernel). and write the data on the temporary buffer. If the second kernel is
+ * required, the third and final kernel (ScanAdjustmentKernelFunctor) will
+ * adjust the final result into the output buffer.
+ * The original algorithm for the parallel prefix sum can be found here:
+ *
+ * Sengupta, Shubhabrata, Mark Harris, and Michael Garland. "Efficient parallel
+ * scan algorithms for GPUs." NVIDIA, Santa Clara, CA, Tech. Rep. NVR-2008-003
+ *1, no. 1 (2008): 1-17.
+ *****************************************************************/
+
+#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
+#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace TensorSycl {
+namespace internal {
+
+#ifndef EIGEN_SYCL_MAX_GLOBAL_RANGE
+#define EIGEN_SYCL_MAX_GLOBAL_RANGE (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 * 4)
+#endif
+
+template <typename index_t>
+struct ScanParameters {
+  // must be power of 2
+  static constexpr index_t ScanPerThread = 8;
+  const index_t total_size;
+  const index_t non_scan_size;
+  const index_t scan_size;
+  const index_t non_scan_stride;
+  const index_t scan_stride;
+  const index_t panel_threads;
+  const index_t group_threads;
+  const index_t block_threads;
+  const index_t elements_per_group;
+  const index_t elements_per_block;
+  const index_t loop_range;
+
+  ScanParameters(index_t total_size_, index_t non_scan_size_, index_t scan_size_, index_t non_scan_stride_,
+                 index_t scan_stride_, index_t panel_threads_, index_t group_threads_, index_t block_threads_,
+                 index_t elements_per_group_, index_t elements_per_block_, index_t loop_range_)
+      : total_size(total_size_),
+        non_scan_size(non_scan_size_),
+        scan_size(scan_size_),
+        non_scan_stride(non_scan_stride_),
+        scan_stride(scan_stride_),
+        panel_threads(panel_threads_),
+        group_threads(group_threads_),
+        block_threads(block_threads_),
+        elements_per_group(elements_per_group_),
+        elements_per_block(elements_per_block_),
+        loop_range(loop_range_) {}
+};
+
+enum class scan_step { first, second };
+template <typename Evaluator, typename CoeffReturnType, typename OutAccessor, typename Op, typename Index,
+          scan_step stp>
+struct ScanKernelFunctor {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  static constexpr int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
+
+  LocalAccessor scratch;
+  Evaluator dev_eval;
+  OutAccessor out_ptr;
+  OutAccessor tmp_ptr;
+  const ScanParameters<Index> scanParameters;
+  Op accumulator;
+  const bool inclusive;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanKernelFunctor(LocalAccessor scratch_, const Evaluator dev_eval_,
+                                                          OutAccessor out_accessor_, OutAccessor temp_accessor_,
+                                                          const ScanParameters<Index> scanParameters_, Op accumulator_,
+                                                          const bool inclusive_)
+      : scratch(scratch_),
+        dev_eval(dev_eval_),
+        out_ptr(out_accessor_),
+        tmp_ptr(temp_accessor_),
+        scanParameters(scanParameters_),
+        accumulator(accumulator_),
+        inclusive(inclusive_) {}
+
+  template <scan_step sst = stp, typename Input>
+  std::enable_if_t<sst == scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read(
+      const Input &inpt, Index global_id) const {
+    return inpt.coeff(global_id);
+  }
+
+  template <scan_step sst = stp, typename Input>
+  std::enable_if_t<sst != scan_step::first, CoeffReturnType> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE read(
+      const Input &inpt, Index global_id) const {
+    return inpt[global_id];
+  }
+
+  template <scan_step sst = stp, typename InclusiveOp>
+  std::enable_if_t<sst == scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation(
+      InclusiveOp inclusive_op) const {
+    inclusive_op();
+  }
+
+  template <scan_step sst = stp, typename InclusiveOp>
+  std::enable_if_t<sst != scan_step::first> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE first_step_inclusive_Operation(
+      InclusiveOp) const {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
+    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
+      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
+      Index tmp = data_offset % scanParameters.panel_threads;
+      const Index panel_id = data_offset / scanParameters.panel_threads;
+      const Index group_id = tmp / scanParameters.group_threads;
+      tmp = tmp % scanParameters.group_threads;
+      const Index block_id = tmp / scanParameters.block_threads;
+      const Index local_id = tmp % scanParameters.block_threads;
+      // we put one element per packet in scratch_mem
+      const Index scratch_stride = scanParameters.elements_per_block / PacketSize;
+      const Index scratch_offset = (itemID.get_local_id(0) / scanParameters.block_threads) * scratch_stride;
+      CoeffReturnType private_scan[ScanParameters<Index>::ScanPerThread];
+      CoeffReturnType inclusive_scan;
+      // the actual panel size is scan_size * non_scan_size.
+      // elements_per_panel is roundup to power of 2 for binary tree
+      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
+      const Index group_offset = group_id * scanParameters.non_scan_stride;
+      // This will be effective when the size is bigger than elements_per_block
+      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
+      const Index thread_offset = (ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride);
+      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
+      Index next_elements = 0;
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+        Index global_id = global_offset + next_elements;
+        private_scan[i] = ((((block_id * scanParameters.elements_per_block) +
+                             (ScanParameters<Index>::ScanPerThread * local_id) + i) < scanParameters.scan_size) &&
+                           (global_id < scanParameters.total_size))
+                              ? read(dev_eval, global_id)
+                              : accumulator.initialize();
+        next_elements += scanParameters.scan_stride;
+      }
+      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
+        if (inclusive) {
+          inclusive_scan = private_scan[ScanParameters<Index>::ScanPerThread - 1];
+        }
+      });
+      // This for loop must be 2
+      EIGEN_UNROLL_LOOP
+      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
+        Index private_offset = 1;
+        // build sum in place up the tree
+        EIGEN_UNROLL_LOOP
+        for (Index d = PacketSize >> 1; d > 0; d >>= 1) {
+          EIGEN_UNROLL_LOOP
+          for (Index l = 0; l < d; l++) {
+            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
+            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
+            CoeffReturnType accum = accumulator.initialize();
+            accumulator.reduce(private_scan[ai], &accum);
+            accumulator.reduce(private_scan[bi], &accum);
+            private_scan[bi] = accumulator.finalize(accum);
+          }
+          private_offset *= 2;
+        }
+        scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset] =
+            private_scan[PacketSize - 1 + packetIndex];
+        private_scan[PacketSize - 1 + packetIndex] = accumulator.initialize();
+        // traverse down tree & build scan
+        EIGEN_UNROLL_LOOP
+        for (Index d = 1; d < PacketSize; d *= 2) {
+          private_offset >>= 1;
+          EIGEN_UNROLL_LOOP
+          for (Index l = 0; l < d; l++) {
+            Index ai = private_offset * (2 * l + 1) - 1 + packetIndex;
+            Index bi = private_offset * (2 * l + 2) - 1 + packetIndex;
+            CoeffReturnType accum = accumulator.initialize();
+            accumulator.reduce(private_scan[ai], &accum);
+            accumulator.reduce(private_scan[bi], &accum);
+            private_scan[ai] = private_scan[bi];
+            private_scan[bi] = accumulator.finalize(accum);
+          }
+        }
+      }
+
+      Index offset = 1;
+      // build sum in place up the tree
+      for (Index d = scratch_stride >> 1; d > 0; d >>= 1) {
+        // Synchronise
+        itemID.barrier(cl::sycl::access::fence_space::local_space);
+        if (local_id < d) {
+          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
+          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
+          CoeffReturnType accum = accumulator.initialize();
+          accumulator.reduce(scratch[ai], &accum);
+          accumulator.reduce(scratch[bi], &accum);
+          scratch[bi] = accumulator.finalize(accum);
+        }
+        offset *= 2;
+      }
+      // Synchronise
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      // next step optimisation
+      if (local_id == 0) {
+        if (((scanParameters.elements_per_group / scanParameters.elements_per_block) > 1)) {
+          const Index temp_id = panel_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) *
+                                    scanParameters.non_scan_size +
+                                group_id * (scanParameters.elements_per_group / scanParameters.elements_per_block) +
+                                block_id;
+          tmp_ptr[temp_id] = scratch[scratch_stride - 1 + scratch_offset];
+        }
+        // clear the last element
+        scratch[scratch_stride - 1 + scratch_offset] = accumulator.initialize();
+      }
+      // traverse down tree & build scan
+      for (Index d = 1; d < scratch_stride; d *= 2) {
+        offset >>= 1;
+        // Synchronise
+        itemID.barrier(cl::sycl::access::fence_space::local_space);
+        if (local_id < d) {
+          Index ai = offset * (2 * local_id + 1) - 1 + scratch_offset;
+          Index bi = offset * (2 * local_id + 2) - 1 + scratch_offset;
+          CoeffReturnType accum = accumulator.initialize();
+          accumulator.reduce(scratch[ai], &accum);
+          accumulator.reduce(scratch[bi], &accum);
+          scratch[ai] = scratch[bi];
+          scratch[bi] = accumulator.finalize(accum);
+        }
+      }
+      // Synchronise
+      itemID.barrier(cl::sycl::access::fence_space::local_space);
+      // This for loop must be 2
+      EIGEN_UNROLL_LOOP
+      for (int packetIndex = 0; packetIndex < ScanParameters<Index>::ScanPerThread; packetIndex += PacketSize) {
+        EIGEN_UNROLL_LOOP
+        for (Index i = 0; i < PacketSize; i++) {
+          CoeffReturnType accum = private_scan[packetIndex + i];
+          accumulator.reduce(scratch[2 * local_id + (packetIndex / PacketSize) + scratch_offset], &accum);
+          private_scan[packetIndex + i] = accumulator.finalize(accum);
+        }
+      }
+      first_step_inclusive_Operation([&]() EIGEN_DEVICE_FUNC {
+        if (inclusive) {
+          accumulator.reduce(private_scan[ScanParameters<Index>::ScanPerThread - 1], &inclusive_scan);
+          private_scan[0] = accumulator.finalize(inclusive_scan);
+        }
+      });
+      next_elements = 0;
+      // right the first set of private param
+      EIGEN_UNROLL_LOOP
+      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+        Index global_id = global_offset + next_elements;
+        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
+             scanParameters.scan_size) &&
+            (global_id < scanParameters.total_size)) {
+          Index private_id = (i * !inclusive) + (((i + 1) % ScanParameters<Index>::ScanPerThread) * (inclusive));
+          out_ptr[global_id] = private_scan[private_id];
+        }
+        next_elements += scanParameters.scan_stride;
+      }
+    }  // end for loop
+  }
+};
+
+template <typename CoeffReturnType, typename InAccessor, typename OutAccessor, typename Op, typename Index>
+struct ScanAdjustmentKernelFunctor {
+  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
+      LocalAccessor;
+  static constexpr int PacketSize = ScanParameters<Index>::ScanPerThread / 2;
+  InAccessor in_ptr;
+  OutAccessor out_ptr;
+  const ScanParameters<Index> scanParameters;
+  Op accumulator;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE ScanAdjustmentKernelFunctor(LocalAccessor, InAccessor in_accessor_,
+                                                                    OutAccessor out_accessor_,
+                                                                    const ScanParameters<Index> scanParameters_,
+                                                                    Op accumulator_)
+      : in_ptr(in_accessor_), out_ptr(out_accessor_), scanParameters(scanParameters_), accumulator(accumulator_) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) const {
+    for (Index loop_offset = 0; loop_offset < scanParameters.loop_range; loop_offset++) {
+      Index data_offset = (itemID.get_global_id(0) + (itemID.get_global_range(0) * loop_offset));
+      Index tmp = data_offset % scanParameters.panel_threads;
+      const Index panel_id = data_offset / scanParameters.panel_threads;
+      const Index group_id = tmp / scanParameters.group_threads;
+      tmp = tmp % scanParameters.group_threads;
+      const Index block_id = tmp / scanParameters.block_threads;
+      const Index local_id = tmp % scanParameters.block_threads;
+
+      // the actual panel size is scan_size * non_scan_size.
+      // elements_per_panel is roundup to power of 2 for binary tree
+      const Index panel_offset = panel_id * scanParameters.scan_size * scanParameters.non_scan_size;
+      const Index group_offset = group_id * scanParameters.non_scan_stride;
+      // This will be effective when the size is bigger than elements_per_block
+      const Index block_offset = block_id * scanParameters.elements_per_block * scanParameters.scan_stride;
+      const Index thread_offset = ScanParameters<Index>::ScanPerThread * local_id * scanParameters.scan_stride;
+
+      const Index global_offset = panel_offset + group_offset + block_offset + thread_offset;
+      const Index block_size = scanParameters.elements_per_group / scanParameters.elements_per_block;
+      const Index in_id = (panel_id * block_size * scanParameters.non_scan_size) + (group_id * block_size) + block_id;
+      CoeffReturnType adjust_val = in_ptr[in_id];
+
+      Index next_elements = 0;
+      EIGEN_UNROLL_LOOP
+      for (Index i = 0; i < ScanParameters<Index>::ScanPerThread; i++) {
+        Index global_id = global_offset + next_elements;
+        if ((((block_id * scanParameters.elements_per_block) + (ScanParameters<Index>::ScanPerThread * local_id) + i) <
+             scanParameters.scan_size) &&
+            (global_id < scanParameters.total_size)) {
+          CoeffReturnType accum = adjust_val;
+          accumulator.reduce(out_ptr[global_id], &accum);
+          out_ptr[global_id] = accumulator.finalize(accum);
+        }
+        next_elements += scanParameters.scan_stride;
+      }
+    }
+  }
+};
+
+template <typename Index>
+struct ScanInfo {
+  const Index &total_size;
+  const Index &scan_size;
+  const Index &panel_size;
+  const Index &non_scan_size;
+  const Index &scan_stride;
+  const Index &non_scan_stride;
+
+  Index max_elements_per_block;
+  Index block_size;
+  Index panel_threads;
+  Index group_threads;
+  Index block_threads;
+  Index elements_per_group;
+  Index elements_per_block;
+  Index loop_range;
+  Index global_range;
+  Index local_range;
+  const Eigen::SyclDevice &dev;
+  EIGEN_STRONG_INLINE ScanInfo(const Index &total_size_, const Index &scan_size_, const Index &panel_size_,
+                               const Index &non_scan_size_, const Index &scan_stride_, const Index &non_scan_stride_,
+                               const Eigen::SyclDevice &dev_)
+      : total_size(total_size_),
+        scan_size(scan_size_),
+        panel_size(panel_size_),
+        non_scan_size(non_scan_size_),
+        scan_stride(scan_stride_),
+        non_scan_stride(non_scan_stride_),
+        dev(dev_) {
+    // must be power of 2
+    local_range = std::min(Index(dev.getNearestPowerOfTwoWorkGroupSize()),
+                           Index(EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1));
+
+    max_elements_per_block = local_range * ScanParameters<Index>::ScanPerThread;
+
+    elements_per_group =
+        dev.getPowerOfTwo(Index(roundUp(Index(scan_size), ScanParameters<Index>::ScanPerThread)), true);
+    const Index elements_per_panel = elements_per_group * non_scan_size;
+    elements_per_block = std::min(Index(elements_per_group), Index(max_elements_per_block));
+    panel_threads = elements_per_panel / ScanParameters<Index>::ScanPerThread;
+    group_threads = elements_per_group / ScanParameters<Index>::ScanPerThread;
+    block_threads = elements_per_block / ScanParameters<Index>::ScanPerThread;
+    block_size = elements_per_group / elements_per_block;
+#ifdef EIGEN_SYCL_MAX_GLOBAL_RANGE
+    const Index max_threads = std::min(Index(panel_threads * panel_size), Index(EIGEN_SYCL_MAX_GLOBAL_RANGE));
+#else
+    const Index max_threads = panel_threads * panel_size;
+#endif
+    global_range = roundUp(max_threads, local_range);
+    loop_range = Index(
+        std::ceil(double(elements_per_panel * panel_size) / (global_range * ScanParameters<Index>::ScanPerThread)));
+  }
+  inline ScanParameters<Index> get_scan_parameter() {
+    return ScanParameters<Index>(total_size, non_scan_size, scan_size, non_scan_stride, scan_stride, panel_threads,
+                                 group_threads, block_threads, elements_per_group, elements_per_block, loop_range);
+  }
+  inline cl::sycl::nd_range<1> get_thread_range() {
+    return cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
+  }
+};
+
+template <typename EvaluatorPointerType, typename CoeffReturnType, typename Reducer, typename Index>
+struct SYCLAdjustBlockOffset {
+  EIGEN_STRONG_INLINE static void adjust_scan_block_offset(EvaluatorPointerType in_ptr, EvaluatorPointerType out_ptr,
+                                                           Reducer &accumulator, const Index total_size,
+                                                           const Index scan_size, const Index panel_size,
+                                                           const Index non_scan_size, const Index scan_stride,
+                                                           const Index non_scan_stride, const Eigen::SyclDevice &dev) {
+    auto scan_info =
+        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
+
+    typedef ScanAdjustmentKernelFunctor<CoeffReturnType, EvaluatorPointerType, EvaluatorPointerType, Reducer, Index>
+        AdjustFuctor;
+    dev.template unary_kernel_launcher<CoeffReturnType, AdjustFuctor>(in_ptr, out_ptr, scan_info.get_thread_range(),
+                                                                      scan_info.max_elements_per_block,
+                                                                      scan_info.get_scan_parameter(), accumulator)
+        .wait();
+  }
+};
+
+template <typename CoeffReturnType, scan_step stp>
+struct ScanLauncher_impl {
+  template <typename Input, typename EvaluatorPointerType, typename Reducer, typename Index>
+  EIGEN_STRONG_INLINE static void scan_block(Input in_ptr, EvaluatorPointerType out_ptr, Reducer &accumulator,
+                                             const Index total_size, const Index scan_size, const Index panel_size,
+                                             const Index non_scan_size, const Index scan_stride,
+                                             const Index non_scan_stride, const bool inclusive,
+                                             const Eigen::SyclDevice &dev) {
+    auto scan_info =
+        ScanInfo<Index>(total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride, dev);
+    const Index temp_pointer_size = scan_info.block_size * non_scan_size * panel_size;
+    const Index scratch_size = scan_info.max_elements_per_block / (ScanParameters<Index>::ScanPerThread / 2);
+    CoeffReturnType *temp_pointer =
+        static_cast<CoeffReturnType *>(dev.allocate_temp(temp_pointer_size * sizeof(CoeffReturnType)));
+    EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
+
+    typedef ScanKernelFunctor<Input, CoeffReturnType, EvaluatorPointerType, Reducer, Index, stp> ScanFunctor;
+    dev.template binary_kernel_launcher<CoeffReturnType, ScanFunctor>(
+           in_ptr, out_ptr, tmp_global_accessor, scan_info.get_thread_range(), scratch_size,
+           scan_info.get_scan_parameter(), accumulator, inclusive)
+        .wait();
+
+    if (scan_info.block_size > 1) {
+      ScanLauncher_impl<CoeffReturnType, scan_step::second>::scan_block(
+          tmp_global_accessor, tmp_global_accessor, accumulator, temp_pointer_size, scan_info.block_size, panel_size,
+          non_scan_size, Index(1), scan_info.block_size, false, dev);
+
+      SYCLAdjustBlockOffset<EvaluatorPointerType, CoeffReturnType, Reducer, Index>::adjust_scan_block_offset(
+          tmp_global_accessor, out_ptr, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride,
+          non_scan_stride, dev);
+    }
+    dev.deallocate_temp(temp_pointer);
+  }
+};
+
+}  // namespace internal
+}  // namespace TensorSycl
+namespace internal {
+template <typename Self, typename Reducer, bool vectorize>
+struct ScanLauncher<Self, Reducer, Eigen::SyclDevice, vectorize> {
+  typedef typename Self::Index Index;
+  typedef typename Self::CoeffReturnType CoeffReturnType;
+  typedef typename Self::Storage Storage;
+  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
+  void operator()(Self &self, EvaluatorPointerType data) const {
+    const Index total_size = internal::array_prod(self.dimensions());
+    const Index scan_size = self.size();
+    const Index scan_stride = self.stride();
+    // this is the scan op (can be sum or ...)
+    auto accumulator = self.accumulator();
+    auto inclusive = !self.exclusive();
+    auto consume_dim = self.consume_dim();
+    auto dev = self.device();
+
+    auto dims = self.inner().dimensions();
+
+    Index non_scan_size = 1;
+    Index panel_size = 1;
+    if (static_cast<int>(Self::Layout) == static_cast<int>(ColMajor)) {
+      for (int i = 0; i < consume_dim; i++) {
+        non_scan_size *= dims[i];
+      }
+      for (int i = consume_dim + 1; i < Self::NumDims; i++) {
+        panel_size *= dims[i];
+      }
+    } else {
+      for (int i = Self::NumDims - 1; i > consume_dim; i--) {
+        non_scan_size *= dims[i];
+      }
+      for (int i = consume_dim - 1; i >= 0; i--) {
+        panel_size *= dims[i];
+      }
+    }
+    const Index non_scan_stride = (scan_stride > 1) ? 1 : scan_size;
+    auto eval_impl = self.inner();
+    TensorSycl::internal::ScanLauncher_impl<CoeffReturnType, TensorSycl::internal::scan_step::first>::scan_block(
+        eval_impl, data, accumulator, total_size, scan_size, panel_size, non_scan_size, scan_stride, non_scan_stride,
+        inclusive, dev);
+  }
+};
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_SYCL_SYCL_HPP
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
new file mode 100644
index 00000000..51f424e8
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h
@@ -0,0 +1,413 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename Shuffle, typename XprType>
+struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+};
+
+template <typename Shuffle, typename XprType>
+struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense> {
+  typedef const TensorShufflingOp<Shuffle, XprType>& type;
+};
+
+template <typename Shuffle, typename XprType>
+struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type> {
+  typedef TensorShufflingOp<Shuffle, XprType> type;
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor shuffling class.
+ */
+template <typename Shuffle, typename XprType>
+class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> > {
+ public:
+  typedef TensorBase<TensorShufflingOp<Shuffle, XprType> > Base;
+  typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl)
+      : m_xpr(expr), m_shuffle(shfl) {}
+
+  EIGEN_DEVICE_FUNC const Shuffle& shufflePermutation() const { return m_shuffle; }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+  EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorShufflingOp)
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const Shuffle m_shuffle;
+};
+
+// Eval as rvalue
+template <typename Shuffle, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> {
+  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self;
+  typedef TensorShufflingOp<Shuffle, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
+    PreferBlockAccess = true,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
+
+  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims, Layout, Index> TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_device(device), m_impl(op.expression(), device) {
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    const Shuffle& shuffle = op.shufflePermutation();
+    m_is_identity = true;
+    for (int i = 0; i < NumDims; ++i) {
+      m_shuffle[i] = static_cast<int>(shuffle[i]);
+      m_dimensions[i] = input_dims[shuffle[i]];
+      m_inverseShuffle[shuffle[i]] = i;
+      if (m_is_identity && shuffle[i] != i) {
+        m_is_identity = false;
+      }
+    }
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_unshuffledInputStrides[0] = 1;
+      m_outputStrides[0] = 1;
+
+      for (int i = 1; i < NumDims; ++i) {
+        m_unshuffledInputStrides[i] = m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
+        m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+        m_fastOutputStrides[i] =
+            internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
+      }
+    } else {
+      m_unshuffledInputStrides[NumDims - 1] = 1;
+      m_outputStrides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_unshuffledInputStrides[i] = m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
+        m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+        m_fastOutputStrides[i] =
+            internal::TensorIntDivisor<Index>(m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
+      }
+    }
+
+    for (int i = 0; i < NumDims; ++i) {
+      m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]];
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    if (m_is_identity) {
+      return m_impl.coeff(index);
+    } else {
+      return m_impl.coeff(srcCoeff(index));
+    }
+  }
+
+  template <int LoadMode, typename Self, bool ImplPacketAccess>
+  struct PacketLoader {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType Run(const Self& self, Index index) {
+      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < PacketSize; ++i) {
+        values[i] = self.coeff(index + i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  };
+
+  template <int LoadMode, typename Self>
+  struct PacketLoader<LoadMode, Self, true> {
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static PacketReturnType Run(const Self& self, Index index) {
+      if (self.m_is_identity) {
+        return self.m_impl.template packet<LoadMode>(index);
+      } else {
+        EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+        EIGEN_UNROLL_LOOP
+        for (int i = 0; i < PacketSize; ++i) {
+          values[i] = self.coeff(index + i);
+        }
+        PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+        return rslt;
+      }
+    }
+  };
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+    return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE internal::TensorBlockResourceRequirements getResourceRequirements() const {
+    static const int inner_dim = Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+
+    const size_t target_size = m_device.firstLevelCacheSize();
+    const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim;
+
+    // Shuffled inner dimensions leads to a random memory access, which is not
+    // captured by default cost model bytes loaded/stored. We add this cost
+    // explicitly. The number of cycles picked based on the benchmarks.
+    // TODO(ezhulenev): This number was picked based on a very questionable
+    // benchmarks, add benchmarks that are representative of real workloads.
+    using BlockRequirements = internal::TensorBlockResourceRequirements;
+    if (inner_dim_shuffled) {
+      return BlockRequirements::uniform<Scalar>(target_size).addCostPerCoeff({0, 0, NumDims * 28});
+    } else {
+      return BlockRequirements::skewed<Scalar>(target_size);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
+                                                          bool root_of_expr_ast = false) const {
+    eigen_assert(m_impl.data() != NULL);
+
+    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout> TensorBlockIO;
+    typedef typename TensorBlockIO::Dst TensorBlockIODst;
+    typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+    const typename TensorBlock::Storage block_storage =
+        TensorBlock::prepareStorage(desc, scratch, /*allow_strided_storage=*/root_of_expr_ast);
+
+    typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
+    TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));
+
+    TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(), block_storage.data());
+
+    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
+    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
+
+    return block_storage.AsTensorMaterializedBlock();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    const double compute_cost = m_is_identity
+                                    ? TensorOpCost::AddCost<Index>()
+                                    : NumDims * (2 * TensorOpCost::AddCost<Index>() +
+                                                 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
+    return m_impl.costPerCoeff(vectorized) +
+           TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index
+  GetBlockOutputIndex(Index input_index, const DSizes<Index, NumDims>& input_block_strides,
+                      const DSizes<Index, NumDims>& output_block_strides,
+                      const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
+    Index output_index = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = input_index / fast_input_block_strides[i];
+        output_index += idx * output_block_strides[m_inverseShuffle[i]];
+        input_index -= idx * input_block_strides[i];
+      }
+      return output_index + input_index * output_block_strides[m_inverseShuffle[0]];
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = input_index / fast_input_block_strides[i];
+        output_index += idx * output_block_strides[m_inverseShuffle[i]];
+        input_index -= idx * input_block_strides[i];
+      }
+      return output_index + input_index * output_block_strides[m_inverseShuffle[NumDims - 1]];
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
+    Index inputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = index / m_fastOutputStrides[i];
+        inputIndex += idx * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      return inputIndex + index * m_inputStrides[0];
+    } else {
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = index / m_fastOutputStrides[i];
+        inputIndex += idx * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      return inputIndex + index * m_inputStrides[NumDims - 1];
+    }
+  }
+
+  Dimensions m_dimensions;
+  bool m_is_identity;
+  array<int, NumDims> m_shuffle;
+  array<Index, NumDims> m_inverseShuffle;  // TODO(ezhulenev): Make it int type.
+  array<Index, NumDims> m_outputStrides;
+  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
+  array<Index, NumDims> m_inputStrides;
+  array<Index, NumDims> m_unshuffledInputStrides;
+
+  const Device EIGEN_DEVICE_REF m_device;
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+// Eval as lvalue
+template <typename Shuffle, typename ArgType, typename Device>
+struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
+    : public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> {
+  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base;
+
+  typedef TensorShufflingOp<Shuffle, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+
+  enum {
+    IsAligned = false,
+    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
+    BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
+    PreferBlockAccess = true,
+    RawAccess = false
+  };
+
+  typedef std::remove_const_t<Scalar> ScalarNoConst;
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) const {
+    return this->m_impl.coeffRef(this->srcCoeff(index));
+  }
+
+  template <int StoreMode>
+  EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const {
+    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+    internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < PacketSize; ++i) {
+      this->coeffRef(index + i) = values[i];
+    }
+  }
+
+  template <typename TensorBlock>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(const TensorBlockDesc& desc, const TensorBlock& block) {
+    eigen_assert(this->m_impl.data() != NULL);
+
+    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout> TensorBlockIO;
+    typedef typename TensorBlockIO::Dst TensorBlockIODst;
+    typedef typename TensorBlockIO::Src TensorBlockIOSrc;
+
+    const Scalar* block_buffer = block.data();
+
+    // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen
+    // expression with coefficient and packet access as `src`.
+    void* mem = NULL;
+    if (block_buffer == NULL) {
+      mem = this->m_device.allocate(desc.size() * sizeof(Scalar));
+      ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);
+
+      typedef internal::TensorBlockAssignment<ScalarNoConst, NumDims, typename TensorBlock::XprType, Index>
+          TensorBlockAssignment;
+
+      TensorBlockAssignment::Run(
+          TensorBlockAssignment::target(desc.dimensions(), internal::strides<Layout>(desc.dimensions()), buf),
+          block.expr());
+
+      block_buffer = buf;
+    }
+
+    // Read from block.
+    TensorBlockIOSrc src(internal::strides<Layout>(desc.dimensions()), block_buffer);
+
+    // Write to the output buffer.
+    typename TensorBlockIO::Dimensions output_strides(this->m_unshuffledInputStrides);
+    typename TensorBlockIO::Dimensions output_dimensions;
+    for (int i = 0; i < NumDims; ++i) {
+      output_dimensions[this->m_shuffle[i]] = desc.dimension(i);
+    }
+    TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(), this->srcCoeff(desc.offset()));
+
+    // Reorder dimensions according to the shuffle.
+    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map;
+    for (int i = 0; i < NumDims; ++i) {
+      dst_to_src_dim_map[i] = static_cast<int>(this->m_inverseShuffle[i]);
+    }
+    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);
+
+    // Deallocate temporary buffer used for the block materialization.
+    if (mem != NULL) this->m_device.deallocate(mem);
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
new file mode 100644
index 00000000..62686ce6
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h
@@ -0,0 +1,143 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+// Copyright (C) 2014-2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
+#define EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
+
+#ifdef EIGEN_TENSOR_STORAGE_CTOR_PLUGIN
+#define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN EIGEN_TENSOR_STORAGE_CTOR_PLUGIN;
+#else
+#define EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN
+#endif
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \internal
+ *
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Stores the data of a tensor
+ *
+ * This class stores the data of fixed-size, dynamic-size or mixed tensors
+ * in a way as compact as possible.
+ *
+ * \sa Tensor
+ */
+template <typename T, typename Dimensions, int Options>
+class TensorStorage;
+
+// Pure fixed-size storage
+template <typename T, typename FixedDimensions, int Options_>
+class TensorStorage {
+ private:
+  static constexpr std::size_t Size = FixedDimensions::total_size;
+
+  // Allocate an array of size at least one to prevent compiler warnings.
+  static constexpr std::size_t MinSize = max_n_1<Size>::size;
+  EIGEN_ALIGN_MAX T m_data[MinSize];
+
+ public:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStorage() {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* data() { return m_data; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T* data() const { return m_data; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const FixedDimensions dimensions() const { return FixedDimensions(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE DenseIndex size() const { return Size; }
+};
+
+// pure dynamic
+template <typename T, typename IndexType, int NumIndices_, int Options_>
+class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> {
+ public:
+  typedef IndexType Index;
+  typedef DSizes<IndexType, NumIndices_> Dimensions;
+  typedef TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> Self;
+
+  EIGEN_DEVICE_FUNC TensorStorage() : m_data(0), m_dimensions() {
+    if (NumIndices_ == 0) {
+      m_data = internal::conditional_aligned_new_auto<T, (Options_ & DontAlign) == 0>(1);
+    }
+  }
+  EIGEN_DEVICE_FUNC TensorStorage(Index size, const array<Index, NumIndices_>& dimensions)
+      : m_data(internal::conditional_aligned_new_auto<T, (Options_ & DontAlign) == 0>(size)), m_dimensions(dimensions) {
+    EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN
+  }
+
+  template <typename... DenseIndex>
+  EIGEN_DEVICE_FUNC TensorStorage(DenseIndex... indices) : m_dimensions(indices...) {
+    m_data = internal::conditional_aligned_new_auto<T, (Options_ & DontAlign) == 0>(internal::array_prod(m_dimensions));
+  }
+
+  EIGEN_DEVICE_FUNC TensorStorage(const Self& other)
+      : m_data(internal::conditional_aligned_new_auto<T, (Options_ & DontAlign) == 0>(
+            internal::array_prod(other.m_dimensions))),
+        m_dimensions(other.m_dimensions) {
+    internal::smart_copy(other.m_data, other.m_data + internal::array_prod(other.m_dimensions), m_data);
+  }
+  EIGEN_DEVICE_FUNC Self& operator=(const Self& other) {
+    if (this != &other) {
+      Self tmp(other);
+      this->swap(tmp);
+    }
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC TensorStorage(Self&& other) : TensorStorage() { *this = std::move(other); }
+
+  EIGEN_DEVICE_FUNC Self& operator=(Self&& other) {
+    numext::swap(m_data, other.m_data);
+    numext::swap(m_dimensions, other.m_dimensions);
+    return *this;
+  }
+
+  EIGEN_DEVICE_FUNC ~TensorStorage() {
+    internal::conditional_aligned_delete_auto<T, (Options_ & DontAlign) == 0>(m_data,
+                                                                              internal::array_prod(m_dimensions));
+  }
+  EIGEN_DEVICE_FUNC void swap(Self& other) {
+    numext::swap(m_data, other.m_data);
+    numext::swap(m_dimensions, other.m_dimensions);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_DEVICE_FUNC void resize(Index size, const array<Index, NumIndices_>& nbDimensions) {
+    const Index currentSz = internal::array_prod(m_dimensions);
+    if (size != currentSz) {
+      internal::conditional_aligned_delete_auto<T, (Options_ & DontAlign) == 0>(m_data, currentSz);
+      if (size)
+        m_data = internal::conditional_aligned_new_auto<T, (Options_ & DontAlign) == 0>(size);
+      else if (NumIndices_ == 0) {
+        m_data = internal::conditional_aligned_new_auto<T, (Options_ & DontAlign) == 0>(1);
+      } else
+        m_data = 0;
+      EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({})
+    }
+    m_dimensions = nbDimensions;
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T* data() { return m_data; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const T* data() const { return m_data; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return m_dimensions.TotalSize(); }
+
+ private:
+  T* m_data;
+  Dimensions m_dimensions;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSORSTORAGE_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
new file mode 100644
index 00000000..04ade37b
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h
@@ -0,0 +1,314 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
+#define EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename Strides, typename XprType>
+struct traits<TensorStridingOp<Strides, XprType> > : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+};
+
+template <typename Strides, typename XprType>
+struct eval<TensorStridingOp<Strides, XprType>, Eigen::Dense> {
+  typedef const TensorStridingOp<Strides, XprType> EIGEN_DEVICE_REF type;
+};
+
+template <typename Strides, typename XprType>
+struct nested<TensorStridingOp<Strides, XprType>, 1, typename eval<TensorStridingOp<Strides, XprType> >::type> {
+  typedef TensorStridingOp<Strides, XprType> type;
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor striding class.
+ */
+template <typename Strides, typename XprType>
+class TensorStridingOp : public TensorBase<TensorStridingOp<Strides, XprType> > {
+ public:
+  typedef TensorBase<TensorStridingOp<Strides, XprType> > Base;
+  typedef typename Eigen::internal::traits<TensorStridingOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorStridingOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorStridingOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorStridingOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingOp(const XprType& expr, const Strides& dims)
+      : m_xpr(expr), m_dims(dims) {}
+
+  EIGEN_DEVICE_FUNC const Strides& strides() const { return m_dims; }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+  EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorStridingOp)
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const Strides m_dims;
+};
+
+// Eval as rvalue
+template <typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> {
+  typedef TensorStridingOp<Strides, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) {
+    m_dimensions = m_impl.dimensions();
+    for (int i = 0; i < NumDims; ++i) {
+      m_dimensions[i] = Eigen::numext::ceil(static_cast<float>(m_dimensions[i]) / op.strides()[i]);
+    }
+
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_outputStrides[0] = 1;
+      m_inputStrides[0] = 1;
+      for (int i = 1; i < NumDims; ++i) {
+        m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+        m_inputStrides[i] = m_inputStrides[i - 1] * input_dims[i - 1];
+        m_inputStrides[i - 1] *= op.strides()[i - 1];
+      }
+      m_inputStrides[NumDims - 1] *= op.strides()[NumDims - 1];
+    } else {  // RowMajor
+      m_outputStrides[NumDims - 1] = 1;
+      m_inputStrides[NumDims - 1] = 1;
+      for (int i = NumDims - 2; i >= 0; --i) {
+        m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+        m_inputStrides[i] = m_inputStrides[i + 1] * input_dims[i + 1];
+        m_inputStrides[i + 1] *= op.strides()[i + 1];
+      }
+      m_inputStrides[0] *= op.strides()[0];
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    return m_impl.coeff(srcCoeff(index));
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    Index inputIndices[] = {0, 0};
+    Index indices[] = {index, index + PacketSize - 1};
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx0 = indices[0] / m_outputStrides[i];
+        const Index idx1 = indices[1] / m_outputStrides[i];
+        inputIndices[0] += idx0 * m_inputStrides[i];
+        inputIndices[1] += idx1 * m_inputStrides[i];
+        indices[0] -= idx0 * m_outputStrides[i];
+        indices[1] -= idx1 * m_outputStrides[i];
+      }
+      inputIndices[0] += indices[0] * m_inputStrides[0];
+      inputIndices[1] += indices[1] * m_inputStrides[0];
+    } else {  // RowMajor
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx0 = indices[0] / m_outputStrides[i];
+        const Index idx1 = indices[1] / m_outputStrides[i];
+        inputIndices[0] += idx0 * m_inputStrides[i];
+        inputIndices[1] += idx1 * m_inputStrides[i];
+        indices[0] -= idx0 * m_outputStrides[i];
+        indices[1] -= idx1 * m_outputStrides[i];
+      }
+      inputIndices[0] += indices[0] * m_inputStrides[NumDims - 1];
+      inputIndices[1] += indices[1] * m_inputStrides[NumDims - 1];
+    }
+    if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
+      PacketReturnType rslt = m_impl.template packet<Unaligned>(inputIndices[0]);
+      return rslt;
+    } else {
+      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+      values[0] = m_impl.coeff(inputIndices[0]);
+      values[PacketSize - 1] = m_impl.coeff(inputIndices[1]);
+      EIGEN_UNROLL_LOOP
+      for (int i = 1; i < PacketSize - 1; ++i) {
+        values[i] = coeff(index + i);
+      }
+      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+      return rslt;
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    double compute_cost = (NumDims - 1) * (TensorOpCost::AddCost<Index>() + TensorOpCost::MulCost<Index>() +
+                                           TensorOpCost::DivCost<Index>()) +
+                          TensorOpCost::MulCost<Index>();
+    if (vectorized) {
+      compute_cost *= 2;  // packet() computes two indices
+    }
+    const int innerDim = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? 0 : (NumDims - 1);
+    return m_impl.costPerCoeff(vectorized && m_inputStrides[innerDim] == 1) +
+           // Computation is not vectorized per se, but it is done once per packet.
+           TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
+    Index inputIndex = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx = index / m_outputStrides[i];
+        inputIndex += idx * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      inputIndex += index * m_inputStrides[0];
+    } else {  // RowMajor
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx = index / m_outputStrides[i];
+        inputIndex += idx * m_inputStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      inputIndex += index * m_inputStrides[NumDims - 1];
+    }
+    return inputIndex;
+  }
+
+  Dimensions m_dimensions;
+  array<Index, NumDims> m_outputStrides;
+  array<Index, NumDims> m_inputStrides;
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+// Eval as lvalue
+template <typename Strides, typename ArgType, typename Device>
+struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device>
+    : public TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> {
+  typedef TensorStridingOp<Strides, ArgType> XprType;
+  typedef TensorEvaluator<const XprType, Device> Base;
+  //  typedef typename XprType::Index Index;
+  static constexpr int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  //  typedef DSizes<Index, NumDims> Dimensions;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = /*TensorEvaluator<ArgType, Device>::IsAligned*/ false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    PreferBlockAccess = false,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {}
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) const {
+    return this->m_impl.coeffRef(this->srcCoeff(index));
+  }
+
+  template <int StoreMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) const {
+    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
+    eigen_assert(index + PacketSize - 1 < this->dimensions().TotalSize());
+
+    Index inputIndices[] = {0, 0};
+    Index indices[] = {index, index + PacketSize - 1};
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      EIGEN_UNROLL_LOOP
+      for (int i = NumDims - 1; i > 0; --i) {
+        const Index idx0 = indices[0] / this->m_outputStrides[i];
+        const Index idx1 = indices[1] / this->m_outputStrides[i];
+        inputIndices[0] += idx0 * this->m_inputStrides[i];
+        inputIndices[1] += idx1 * this->m_inputStrides[i];
+        indices[0] -= idx0 * this->m_outputStrides[i];
+        indices[1] -= idx1 * this->m_outputStrides[i];
+      }
+      inputIndices[0] += indices[0] * this->m_inputStrides[0];
+      inputIndices[1] += indices[1] * this->m_inputStrides[0];
+    } else {  // RowMajor
+      EIGEN_UNROLL_LOOP
+      for (int i = 0; i < NumDims - 1; ++i) {
+        const Index idx0 = indices[0] / this->m_outputStrides[i];
+        const Index idx1 = indices[1] / this->m_outputStrides[i];
+        inputIndices[0] += idx0 * this->m_inputStrides[i];
+        inputIndices[1] += idx1 * this->m_inputStrides[i];
+        indices[0] -= idx0 * this->m_outputStrides[i];
+        indices[1] -= idx1 * this->m_outputStrides[i];
+      }
+      inputIndices[0] += indices[0] * this->m_inputStrides[NumDims - 1];
+      inputIndices[1] += indices[1] * this->m_inputStrides[NumDims - 1];
+    }
+    if (inputIndices[1] - inputIndices[0] == PacketSize - 1) {
+      this->m_impl.template writePacket<Unaligned>(inputIndices[0], x);
+    } else {
+      EIGEN_ALIGN_MAX Scalar values[PacketSize];
+      internal::pstore<Scalar, PacketReturnType>(values, x);
+      this->m_impl.coeffRef(inputIndices[0]) = values[0];
+      this->m_impl.coeffRef(inputIndices[1]) = values[PacketSize - 1];
+      EIGEN_UNROLL_LOOP
+      for (int i = 1; i < PacketSize - 1; ++i) {
+        this->coeffRef(index + i) = values[i];
+      }
+    }
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_STRIDING_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
new file mode 100644
index 00000000..5357a482
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h
@@ -0,0 +1,281 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2017 Gagan Goel <gagan.nith@gmail.com>
+// Copyright (C) 2017 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
+#define EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+template <typename Dims, typename XprType>
+struct traits<TensorTraceOp<Dims, XprType> > : public traits<XprType> {
+  typedef typename XprType::Scalar Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions - array_size<Dims>::value;
+  static constexpr int Layout = XprTraits::Layout;
+  enum {
+    // Trace is read-only.
+    Flags = traits<XprType>::Flags & ~LvalueBit
+  };
+};
+
+template <typename Dims, typename XprType>
+struct eval<TensorTraceOp<Dims, XprType>, Eigen::Dense> {
+  typedef const TensorTraceOp<Dims, XprType>& type;
+};
+
+template <typename Dims, typename XprType>
+struct nested<TensorTraceOp<Dims, XprType>, 1, typename eval<TensorTraceOp<Dims, XprType> >::type> {
+  typedef TensorTraceOp<Dims, XprType> type;
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Tensor Trace class.
+ */
+template <typename Dims, typename XprType>
+class TensorTraceOp : public TensorBase<TensorTraceOp<Dims, XprType> > {
+ public:
+  typedef typename Eigen::internal::traits<TensorTraceOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorTraceOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorTraceOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorTraceOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorTraceOp(const XprType& expr, const Dims& dims)
+      : m_xpr(expr), m_dims(dims) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dims& dims() const { return m_dims; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const internal::remove_all_t<typename XprType::Nested>& expression() const {
+    return m_xpr;
+  }
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const Dims m_dims;
+};
+
+// Eval as rvalue
+template <typename Dims, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorTraceOp<Dims, ArgType>, Device> {
+  typedef TensorTraceOp<Dims, ArgType> XprType;
+  static constexpr int NumInputDims =
+      internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  static constexpr int NumReducedDims = internal::array_size<Dims>::value;
+  static constexpr int NumOutputDims = NumInputDims - NumReducedDims;
+  typedef typename XprType::Index Index;
+  typedef DSizes<Index, NumOutputDims> Dimensions;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    CoordAccess = false,
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
+      : m_impl(op.expression(), device), m_traceDim(1), m_device(device) {
+    EIGEN_STATIC_ASSERT((NumOutputDims >= 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT((NumReducedDims >= 2) || ((NumReducedDims == 0) && (NumInputDims == 0)),
+                        YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    for (int i = 0; i < NumInputDims; ++i) {
+      m_reduced[i] = false;
+    }
+
+    const Dims& op_dims = op.dims();
+    for (int i = 0; i < NumReducedDims; ++i) {
+      eigen_assert(op_dims[i] >= 0);
+      eigen_assert(op_dims[i] < NumInputDims);
+      m_reduced[op_dims[i]] = true;
+    }
+
+    // All the dimensions should be distinct to compute the trace
+    int num_distinct_reduce_dims = 0;
+    for (int i = 0; i < NumInputDims; ++i) {
+      if (m_reduced[i]) {
+        ++num_distinct_reduce_dims;
+      }
+    }
+
+    EIGEN_ONLY_USED_FOR_DEBUG(num_distinct_reduce_dims);
+    eigen_assert(num_distinct_reduce_dims == NumReducedDims);
+
+    // Compute the dimensions of the result.
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+
+    int output_index = 0;
+    int reduced_index = 0;
+    for (int i = 0; i < NumInputDims; ++i) {
+      if (m_reduced[i]) {
+        m_reducedDims[reduced_index] = input_dims[i];
+        if (reduced_index > 0) {
+          // All the trace dimensions must have the same size
+          eigen_assert(m_reducedDims[0] == m_reducedDims[reduced_index]);
+        }
+        ++reduced_index;
+      } else {
+        m_dimensions[output_index] = input_dims[i];
+        ++output_index;
+      }
+    }
+
+    if (NumReducedDims != 0) {
+      m_traceDim = m_reducedDims[0];
+    }
+
+    // Compute the output strides
+    if (NumOutputDims > 0) {
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        m_outputStrides[0] = 1;
+        for (int i = 1; i < NumOutputDims; ++i) {
+          m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+        }
+      } else {
+        m_outputStrides.back() = 1;
+        for (int i = NumOutputDims - 2; i >= 0; --i) {
+          m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+        }
+      }
+    }
+
+    // Compute the input strides
+    if (NumInputDims > 0) {
+      array<Index, NumInputDims> input_strides;
+      if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+        input_strides[0] = 1;
+        for (int i = 1; i < NumInputDims; ++i) {
+          input_strides[i] = input_strides[i - 1] * input_dims[i - 1];
+        }
+      } else {
+        input_strides.back() = 1;
+        for (int i = NumInputDims - 2; i >= 0; --i) {
+          input_strides[i] = input_strides[i + 1] * input_dims[i + 1];
+        }
+      }
+
+      output_index = 0;
+      reduced_index = 0;
+      for (int i = 0; i < NumInputDims; ++i) {
+        if (m_reduced[i]) {
+          m_reducedStrides[reduced_index] = input_strides[i];
+          ++reduced_index;
+        } else {
+          m_preservedStrides[output_index] = input_strides[i];
+          ++output_index;
+        }
+      }
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return nullptr; }
+
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    // Initialize the result
+    CoeffReturnType result = internal::cast<int, CoeffReturnType>(0);
+    Index index_stride = 0;
+    for (int i = 0; i < NumReducedDims; ++i) {
+      index_stride += m_reducedStrides[i];
+    }
+
+    // If trace is requested along all dimensions, starting index would be 0
+    Index cur_index = 0;
+    if (NumOutputDims != 0) cur_index = firstInput(index);
+    for (Index i = 0; i < m_traceDim; ++i) {
+      result += m_impl.coeff(cur_index);
+      cur_index += index_stride;
+    }
+
+    return result;
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = coeff(index + i);
+    }
+    PacketReturnType result = internal::ploadt<PacketReturnType, LoadMode>(values);
+    return result;
+  }
+
+ protected:
+  // Given the output index, finds the first index in the input tensor used to compute the trace
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
+    Index startInput = 0;
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      for (int i = NumOutputDims - 1; i > 0; --i) {
+        const Index idx = index / m_outputStrides[i];
+        startInput += idx * m_preservedStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      startInput += index * m_preservedStrides[0];
+    } else {
+      for (int i = 0; i < NumOutputDims - 1; ++i) {
+        const Index idx = index / m_outputStrides[i];
+        startInput += idx * m_preservedStrides[i];
+        index -= idx * m_outputStrides[i];
+      }
+      startInput += index * m_preservedStrides[NumOutputDims - 1];
+    }
+    return startInput;
+  }
+
+  Dimensions m_dimensions;
+  TensorEvaluator<ArgType, Device> m_impl;
+  // Initialize the size of the trace dimension
+  Index m_traceDim;
+  const Device EIGEN_DEVICE_REF m_device;
+  array<bool, NumInputDims> m_reduced;
+  array<Index, NumReducedDims> m_reducedDims;
+  array<Index, NumOutputDims> m_outputStrides;
+  array<Index, NumReducedDims> m_reducedStrides;
+  array<Index, NumOutputDims> m_preservedStrides;
+};
+
+}  // End namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_TRACE_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
new file mode 100644
index 00000000..f5954d6f
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h
@@ -0,0 +1,232 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+template <typename Scalar, int Options>
+class compute_tensor_flags {
+  enum {
+    is_dynamic_size_storage = 1,
+
+    is_aligned = (((Options & DontAlign) == 0) && (
+#if EIGEN_MAX_STATIC_ALIGN_BYTES > 0
+                                                      (!is_dynamic_size_storage)
+#else
+                                                      0
+#endif
+                                                      |
+#if EIGEN_MAX_ALIGN_BYTES > 0
+                                                      is_dynamic_size_storage
+#else
+                                                      0
+#endif
+                                                      )),
+    packet_access_bit = packet_traits<Scalar>::Vectorizable && is_aligned ? PacketAccessBit : 0
+  };
+
+ public:
+  enum { ret = packet_access_bit };
+};
+
+template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
+struct traits<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > {
+  typedef Scalar_ Scalar;
+  typedef Dense StorageKind;
+  typedef IndexType_ Index;
+  static constexpr int NumDimensions = NumIndices_;
+  static constexpr int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
+  enum {
+    Options = Options_,
+    Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0 : LvalueBit)
+  };
+  template <typename T>
+  struct MakePointer {
+    typedef T* Type;
+  };
+  typedef typename MakePointer<Scalar>::Type PointerType;
+};
+
+template <typename Scalar_, typename Dimensions, int Options_, typename IndexType_>
+struct traits<TensorFixedSize<Scalar_, Dimensions, Options_, IndexType_> > {
+  typedef Scalar_ Scalar;
+  typedef Dense StorageKind;
+  typedef IndexType_ Index;
+  static constexpr int NumDimensions = array_size<Dimensions>::value;
+  static constexpr int Layout = Options_ & RowMajor ? RowMajor : ColMajor;
+  enum {
+    Options = Options_,
+    Flags = compute_tensor_flags<Scalar_, Options_>::ret | (is_const<Scalar_>::value ? 0 : LvalueBit)
+  };
+  template <typename T>
+  struct MakePointer {
+    typedef T* Type;
+  };
+  typedef typename MakePointer<Scalar>::Type PointerType;
+};
+
+template <typename PlainObjectType, int Options_, template <class> class MakePointer_>
+struct traits<TensorMap<PlainObjectType, Options_, MakePointer_> > : public traits<PlainObjectType> {
+  typedef traits<PlainObjectType> BaseTraits;
+  typedef typename BaseTraits::Scalar Scalar;
+  typedef typename BaseTraits::StorageKind StorageKind;
+  typedef typename BaseTraits::Index Index;
+  static constexpr int NumDimensions = BaseTraits::NumDimensions;
+  static constexpr int Layout = BaseTraits::Layout;
+  enum { Options = Options_, Flags = BaseTraits::Flags };
+  template <class T>
+  struct MakePointer {
+    // Intermediate typedef to workaround MSVC issue.
+    typedef MakePointer_<T> MakePointerT;
+    typedef typename MakePointerT::Type Type;
+  };
+  typedef typename MakePointer<Scalar>::Type PointerType;
+};
+
+template <typename PlainObjectType_>
+struct traits<TensorRef<PlainObjectType_> > : public traits<PlainObjectType_> {
+  typedef PlainObjectType_ PlainObjectType;
+  typedef traits<PlainObjectType> BaseTraits;
+  typedef typename BaseTraits::Scalar Scalar;
+  typedef typename BaseTraits::StorageKind StorageKind;
+  typedef typename BaseTraits::Index Index;
+  static constexpr int NumDimensions = BaseTraits::NumDimensions;
+  static constexpr int Layout = BaseTraits::Layout;
+  enum { Options = BaseTraits::Options, Flags = BaseTraits::Flags };
+  typedef typename BaseTraits::PointerType PointerType;
+};
+
+template <typename Scalar_, int NumIndices_, int Options, typename IndexType_>
+struct eval<Tensor<Scalar_, NumIndices_, Options, IndexType_>, Eigen::Dense> {
+  typedef const Tensor<Scalar_, NumIndices_, Options, IndexType_> EIGEN_DEVICE_REF type;
+};
+
+template <typename Scalar_, int NumIndices_, int Options, typename IndexType_>
+struct eval<const Tensor<Scalar_, NumIndices_, Options, IndexType_>, Eigen::Dense> {
+  typedef const Tensor<Scalar_, NumIndices_, Options, IndexType_> EIGEN_DEVICE_REF type;
+};
+
+template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
+struct eval<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense> {
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> EIGEN_DEVICE_REF type;
+};
+
+template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
+struct eval<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_>, Eigen::Dense> {
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> EIGEN_DEVICE_REF type;
+};
+
+template <typename PlainObjectType, int Options, template <class> class MakePointer>
+struct eval<TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense> {
+  typedef const TensorMap<PlainObjectType, Options, MakePointer> EIGEN_DEVICE_REF type;
+};
+
+template <typename PlainObjectType, int Options, template <class> class MakePointer>
+struct eval<const TensorMap<PlainObjectType, Options, MakePointer>, Eigen::Dense> {
+  typedef const TensorMap<PlainObjectType, Options, MakePointer> EIGEN_DEVICE_REF type;
+};
+
+template <typename PlainObjectType>
+struct eval<TensorRef<PlainObjectType>, Eigen::Dense> {
+  typedef const TensorRef<PlainObjectType> EIGEN_DEVICE_REF type;
+};
+
+template <typename PlainObjectType>
+struct eval<const TensorRef<PlainObjectType>, Eigen::Dense> {
+  typedef const TensorRef<PlainObjectType> EIGEN_DEVICE_REF type;
+};
+
+// TODO nested<> does not exist anymore in Eigen/Core, and it thus has to be removed in favor of ref_selector.
+template <typename T, int n = 1, typename PlainObject = void>
+struct nested {
+  typedef typename ref_selector<T>::type type;
+};
+
+template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
+struct nested<Tensor<Scalar_, NumIndices_, Options_, IndexType_> > {
+  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_> EIGEN_DEVICE_REF type;
+};
+
+template <typename Scalar_, int NumIndices_, int Options_, typename IndexType_>
+struct nested<const Tensor<Scalar_, NumIndices_, Options_, IndexType_> > {
+  typedef const Tensor<Scalar_, NumIndices_, Options_, IndexType_> EIGEN_DEVICE_REF type;
+};
+
+template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
+struct nested<TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> > {
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> EIGEN_DEVICE_REF type;
+};
+
+template <typename Scalar_, typename Dimensions, int Options, typename IndexType_>
+struct nested<const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> > {
+  typedef const TensorFixedSize<Scalar_, Dimensions, Options, IndexType_> EIGEN_DEVICE_REF type;
+};
+
+template <typename PlainObjectType>
+struct nested<TensorRef<PlainObjectType> > {
+  typedef const TensorRef<PlainObjectType> EIGEN_DEVICE_REF type;
+};
+
+template <typename PlainObjectType>
+struct nested<const TensorRef<PlainObjectType> > {
+  typedef const TensorRef<PlainObjectType> EIGEN_DEVICE_REF type;
+};
+
+}  // end namespace internal
+
+// Convolutional layers take in an input tensor of shape (D, R, C, B), or (D, C,
+// R, B), and convolve it with a set of filters, which can also be presented as
+// a tensor (D, K, K, M), where M is the number of filters, K is the filter
+// size, and each 3-dimensional tensor of size (D, K, K) is a filter. For
+// simplicity we assume that we always use square filters (which is usually the
+// case in images), hence the two Ks in the tensor dimension.  It also takes in
+// a few additional parameters:
+// Stride (S): The convolution stride is the offset between locations where we
+//             apply the filters.  A larger stride means that the output will be
+//             spatially smaller.
+// Padding (P): The padding we apply to the input tensor along the R and C
+//              dimensions.  This is usually used to make sure that the spatial
+//              dimensions of the output matches our intention.
+//
+// Two types of padding are often used:
+//   SAME: The pad value is computed so that the output will have size
+//         R/S and C/S.
+//   VALID: no padding is carried out.
+// When we do padding, the padded values at the padded locations are usually
+// zero.
+//
+// The output dimensions for convolution, when given all the parameters above,
+// are as follows:
+// When Padding = SAME: the output size is (B, R', C', M), where
+//   R' = ceil(float(R) / float(S))
+//   C' = ceil(float(C) / float(S))
+// where ceil is the ceiling function.  The input tensor is padded with 0 as
+// needed.  The number of padded rows and columns are computed as:
+//   Pr = ((R' - 1) * S + K - R) / 2
+//   Pc = ((C' - 1) * S + K - C) / 2
+// when the stride is 1, we have the simplified case R'=R, C'=C, Pr=Pc=(K-1)/2.
+// This is where SAME comes from - the output has the same size as the input has.
+// When Padding = VALID: the output size is computed as
+//   R' = ceil(float(R - K + 1) / float(S))
+//   C' = ceil(float(C - K + 1) / float(S))
+// and the number of padded rows and columns are computed in the same way as in
+// the SAME case.
+// When the stride is 1, we have the simplified case R'=R-K+1, C'=C-K+1, Pr=0,
+// Pc=0.
+enum PaddingType { PADDING_VALID = 1, PADDING_SAME = 2 };
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_TRAITS_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
new file mode 100644
index 00000000..99e51c57
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h
@@ -0,0 +1,229 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
+#define EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+template <uint64_t n>
+struct static_val {
+  static const uint64_t value = n;
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() {}
+
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
+    EIGEN_UNUSED_VARIABLE(v);
+    eigen_assert(v == n);
+  }
+};
+
+template <typename HIGH = uint64_t, typename LOW = uint64_t>
+struct TensorUInt128 {
+  HIGH high;
+  LOW low;
+
+  template <typename OTHER_HIGH, typename OTHER_LOW>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(const TensorUInt128<OTHER_HIGH, OTHER_LOW>& other)
+      : high(other.high), low(other.low) {
+    EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE);
+  }
+
+  template <typename OTHER_HIGH, typename OTHER_LOW>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128& operator=(const TensorUInt128<OTHER_HIGH, OTHER_LOW>& other) {
+    EIGEN_STATIC_ASSERT(sizeof(OTHER_HIGH) <= sizeof(HIGH), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    EIGEN_STATIC_ASSERT(sizeof(OTHER_LOW) <= sizeof(LOW), YOU_MADE_A_PROGRAMMING_MISTAKE);
+    high = other.high;
+    low = other.low;
+    return *this;
+  }
+
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE explicit TensorUInt128(const T& x) : high(0), low(x) {
+    eigen_assert(
+        (static_cast<std::conditional_t<sizeof(T) == 8, uint64_t, uint32_t>>(x) <= NumTraits<uint64_t>::highest()));
+    eigen_assert(x >= 0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128(HIGH y, LOW x) : high(y), low(x) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator LOW() const { return low; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LOW lower() const { return low; }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HIGH upper() const { return high; }
+};
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator==(const TensorUInt128<HL, LL>& lhs,
+                                                      const TensorUInt128<HR, LR>& rhs) {
+  return (lhs.high == rhs.high) && (lhs.low == rhs.low);
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator!=(const TensorUInt128<HL, LL>& lhs,
+                                                      const TensorUInt128<HR, LR>& rhs) {
+  return (lhs.high != rhs.high) || (lhs.low != rhs.low);
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator>=(const TensorUInt128<HL, LL>& lhs,
+                                                      const TensorUInt128<HR, LR>& rhs) {
+  if (lhs.high != rhs.high) {
+    return lhs.high > rhs.high;
+  }
+  return lhs.low >= rhs.low;
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool operator<(const TensorUInt128<HL, LL>& lhs,
+                                                     const TensorUInt128<HR, LR>& rhs) {
+  if (lhs.high != rhs.high) {
+    return lhs.high < rhs.high;
+  }
+  return lhs.low < rhs.low;
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128<uint64_t, uint64_t> operator+(const TensorUInt128<HL, LL>& lhs,
+                                                                                  const TensorUInt128<HR, LR>& rhs) {
+  TensorUInt128<uint64_t, uint64_t> result(lhs.high + rhs.high, lhs.low + rhs.low);
+  if (result.low < rhs.low) {
+    result.high += 1;
+  }
+  return result;
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE TensorUInt128<uint64_t, uint64_t> operator-(const TensorUInt128<HL, LL>& lhs,
+                                                                                  const TensorUInt128<HR, LR>& rhs) {
+  TensorUInt128<uint64_t, uint64_t> result(lhs.high - rhs.high, lhs.low - rhs.low);
+  if (result.low > lhs.low) {
+    result.high -= 1;
+  }
+  return result;
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorUInt128<uint64_t, uint64_t> operator*(
+    const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) {
+  // Split each 128-bit integer into 4 32-bit integers, and then do the
+  // multiplications by hand as follow:
+  //   lhs      a  b  c  d
+  //   rhs      e  f  g  h
+  //           -----------
+  //           ah bh ch dh
+  //           bg cg dg
+  //           cf df
+  //           de
+  // The result is stored in 2 64bit integers, high and low.
+
+  const uint64_t LOW = 0x00000000FFFFFFFFLL;
+  const uint64_t HIGH = 0xFFFFFFFF00000000LL;
+
+  uint64_t d = lhs.low & LOW;
+  uint64_t c = (lhs.low & HIGH) >> 32LL;
+  uint64_t b = lhs.high & LOW;
+  uint64_t a = (lhs.high & HIGH) >> 32LL;
+
+  uint64_t h = rhs.low & LOW;
+  uint64_t g = (rhs.low & HIGH) >> 32LL;
+  uint64_t f = rhs.high & LOW;
+  uint64_t e = (rhs.high & HIGH) >> 32LL;
+
+  // Compute the low 32 bits of low
+  uint64_t acc = d * h;
+  uint64_t low = acc & LOW;
+  //  Compute the high 32 bits of low. Add a carry every time we wrap around
+  acc >>= 32LL;
+  uint64_t carry = 0;
+  uint64_t acc2 = acc + c * h;
+  if (acc2 < acc) {
+    carry++;
+  }
+  acc = acc2 + d * g;
+  if (acc < acc2) {
+    carry++;
+  }
+  low |= (acc << 32LL);
+
+  // Carry forward the high bits of acc to initiate the computation of the
+  // low 32 bits of high
+  acc2 = (acc >> 32LL) | (carry << 32LL);
+  carry = 0;
+
+  acc = acc2 + b * h;
+  if (acc < acc2) {
+    carry++;
+  }
+  acc2 = acc + c * g;
+  if (acc2 < acc) {
+    carry++;
+  }
+  acc = acc2 + d * f;
+  if (acc < acc2) {
+    carry++;
+  }
+  uint64_t high = acc & LOW;
+
+  // Start to compute the high 32 bits of high.
+  acc2 = (acc >> 32LL) | (carry << 32LL);
+
+  acc = acc2 + a * h;
+  acc2 = acc + b * g;
+  acc = acc2 + c * f;
+  acc2 = acc + d * e;
+  high |= (acc2 << 32LL);
+
+  return TensorUInt128<uint64_t, uint64_t>(high, low);
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorUInt128<uint64_t, uint64_t> operator/(
+    const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) {
+  if (rhs == TensorUInt128<static_val<0>, static_val<1>>(1)) {
+    return TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
+  } else if (lhs < rhs) {
+    return TensorUInt128<uint64_t, uint64_t>(0);
+  } else {
+    // calculate the biggest power of 2 times rhs that's less than or equal to lhs
+    TensorUInt128<uint64_t, uint64_t> power2(1);
+    TensorUInt128<uint64_t, uint64_t> d(rhs);
+    TensorUInt128<uint64_t, uint64_t> tmp(lhs - d);
+    while (lhs >= d) {
+      tmp = tmp - d;
+      d = d + d;
+      power2 = power2 + power2;
+    }
+
+    tmp = TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
+    TensorUInt128<uint64_t, uint64_t> result(0);
+    while (power2 != TensorUInt128<static_val<0>, static_val<0>>(0)) {
+      if (tmp >= d) {
+        tmp = tmp - d;
+        result = result + power2;
+      }
+      // Shift right
+      power2 = TensorUInt128<uint64_t, uint64_t>(power2.high >> 1, (power2.low >> 1) | (power2.high << 63));
+      d = TensorUInt128<uint64_t, uint64_t>(d.high >> 1, (d.low >> 1) | (d.high << 63));
+    }
+
+    return result;
+  }
+}
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
new file mode 100644
index 00000000..cf69fef6
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h
@@ -0,0 +1,622 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
+#define EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct traits<TensorVolumePatchOp<Planes, Rows, Cols, XprType> > : public traits<XprType> {
+  typedef std::remove_const_t<typename XprType::Scalar> Scalar;
+  typedef traits<XprType> XprTraits;
+  typedef typename XprTraits::StorageKind StorageKind;
+  typedef typename XprTraits::Index Index;
+  typedef typename XprType::Nested Nested;
+  typedef std::remove_reference_t<Nested> Nested_;
+  static constexpr int NumDimensions = XprTraits::NumDimensions + 1;
+  static constexpr int Layout = XprTraits::Layout;
+  typedef typename XprTraits::PointerType PointerType;
+};
+
+template <DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, Eigen::Dense> {
+  typedef const TensorVolumePatchOp<Planes, Rows, Cols, XprType>& type;
+};
+
+template <DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
+struct nested<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, 1,
+              typename eval<TensorVolumePatchOp<Planes, Rows, Cols, XprType> >::type> {
+  typedef TensorVolumePatchOp<Planes, Rows, Cols, XprType> type;
+};
+
+}  // end namespace internal
+
+/**
+ * \ingroup CXX11_Tensor_Module
+ *
+ * \brief Patch extraction specialized for processing of volumetric data.
+ * This assumes that the input has a least 4 dimensions ordered as follows:
+ *  - channels
+ *  - planes
+ *  - rows
+ *  - columns
+ *  - (optional) additional dimensions such as time or batch size.
+ * Calling the volume patch code with patch_planes, patch_rows, and patch_cols
+ * is equivalent to calling the regular patch extraction code with parameters
+ * d, patch_planes, patch_rows, patch_cols, and 1 for all the additional
+ * dimensions.
+ */
+template <DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename XprType>
+class TensorVolumePatchOp : public TensorBase<TensorVolumePatchOp<Planes, Rows, Cols, XprType>, ReadOnlyAccessors> {
+ public:
+  typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Scalar Scalar;
+  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename Eigen::internal::nested<TensorVolumePatchOp>::type Nested;
+  typedef typename Eigen::internal::traits<TensorVolumePatchOp>::StorageKind StorageKind;
+  typedef typename Eigen::internal::traits<TensorVolumePatchOp>::Index Index;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(
+      const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
+      DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, DenseIndex in_plane_strides,
+      DenseIndex in_row_strides, DenseIndex in_col_strides, DenseIndex plane_inflate_strides,
+      DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, PaddingType padding_type, Scalar padding_value)
+      : m_xpr(expr),
+        m_patch_planes(patch_planes),
+        m_patch_rows(patch_rows),
+        m_patch_cols(patch_cols),
+        m_plane_strides(plane_strides),
+        m_row_strides(row_strides),
+        m_col_strides(col_strides),
+        m_in_plane_strides(in_plane_strides),
+        m_in_row_strides(in_row_strides),
+        m_in_col_strides(in_col_strides),
+        m_plane_inflate_strides(plane_inflate_strides),
+        m_row_inflate_strides(row_inflate_strides),
+        m_col_inflate_strides(col_inflate_strides),
+        m_padding_explicit(false),
+        m_padding_top_z(0),
+        m_padding_bottom_z(0),
+        m_padding_top(0),
+        m_padding_bottom(0),
+        m_padding_left(0),
+        m_padding_right(0),
+        m_padding_type(padding_type),
+        m_padding_value(padding_value) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorVolumePatchOp(
+      const XprType& expr, DenseIndex patch_planes, DenseIndex patch_rows, DenseIndex patch_cols,
+      DenseIndex plane_strides, DenseIndex row_strides, DenseIndex col_strides, DenseIndex in_plane_strides,
+      DenseIndex in_row_strides, DenseIndex in_col_strides, DenseIndex plane_inflate_strides,
+      DenseIndex row_inflate_strides, DenseIndex col_inflate_strides, DenseIndex padding_top_z,
+      DenseIndex padding_bottom_z, DenseIndex padding_top, DenseIndex padding_bottom, DenseIndex padding_left,
+      DenseIndex padding_right, Scalar padding_value)
+      : m_xpr(expr),
+        m_patch_planes(patch_planes),
+        m_patch_rows(patch_rows),
+        m_patch_cols(patch_cols),
+        m_plane_strides(plane_strides),
+        m_row_strides(row_strides),
+        m_col_strides(col_strides),
+        m_in_plane_strides(in_plane_strides),
+        m_in_row_strides(in_row_strides),
+        m_in_col_strides(in_col_strides),
+        m_plane_inflate_strides(plane_inflate_strides),
+        m_row_inflate_strides(row_inflate_strides),
+        m_col_inflate_strides(col_inflate_strides),
+        m_padding_explicit(true),
+        m_padding_top_z(padding_top_z),
+        m_padding_bottom_z(padding_bottom_z),
+        m_padding_top(padding_top),
+        m_padding_bottom(padding_bottom),
+        m_padding_left(padding_left),
+        m_padding_right(padding_right),
+        m_padding_type(PADDING_VALID),
+        m_padding_value(padding_value) {}
+
+  EIGEN_DEVICE_FUNC DenseIndex patch_planes() const { return m_patch_planes; }
+  EIGEN_DEVICE_FUNC DenseIndex patch_rows() const { return m_patch_rows; }
+  EIGEN_DEVICE_FUNC DenseIndex patch_cols() const { return m_patch_cols; }
+  EIGEN_DEVICE_FUNC DenseIndex plane_strides() const { return m_plane_strides; }
+  EIGEN_DEVICE_FUNC DenseIndex row_strides() const { return m_row_strides; }
+  EIGEN_DEVICE_FUNC DenseIndex col_strides() const { return m_col_strides; }
+  EIGEN_DEVICE_FUNC DenseIndex in_plane_strides() const { return m_in_plane_strides; }
+  EIGEN_DEVICE_FUNC DenseIndex in_row_strides() const { return m_in_row_strides; }
+  EIGEN_DEVICE_FUNC DenseIndex in_col_strides() const { return m_in_col_strides; }
+  EIGEN_DEVICE_FUNC DenseIndex plane_inflate_strides() const { return m_plane_inflate_strides; }
+  EIGEN_DEVICE_FUNC DenseIndex row_inflate_strides() const { return m_row_inflate_strides; }
+  EIGEN_DEVICE_FUNC DenseIndex col_inflate_strides() const { return m_col_inflate_strides; }
+  EIGEN_DEVICE_FUNC bool padding_explicit() const { return m_padding_explicit; }
+  EIGEN_DEVICE_FUNC DenseIndex padding_top_z() const { return m_padding_top_z; }
+  EIGEN_DEVICE_FUNC DenseIndex padding_bottom_z() const { return m_padding_bottom_z; }
+  EIGEN_DEVICE_FUNC DenseIndex padding_top() const { return m_padding_top; }
+  EIGEN_DEVICE_FUNC DenseIndex padding_bottom() const { return m_padding_bottom; }
+  EIGEN_DEVICE_FUNC DenseIndex padding_left() const { return m_padding_left; }
+  EIGEN_DEVICE_FUNC DenseIndex padding_right() const { return m_padding_right; }
+  EIGEN_DEVICE_FUNC PaddingType padding_type() const { return m_padding_type; }
+  EIGEN_DEVICE_FUNC Scalar padding_value() const { return m_padding_value; }
+
+  EIGEN_DEVICE_FUNC const internal::remove_all_t<typename XprType::Nested>& expression() const { return m_xpr; }
+
+ protected:
+  typename XprType::Nested m_xpr;
+  const DenseIndex m_patch_planes;
+  const DenseIndex m_patch_rows;
+  const DenseIndex m_patch_cols;
+  const DenseIndex m_plane_strides;
+  const DenseIndex m_row_strides;
+  const DenseIndex m_col_strides;
+  const DenseIndex m_in_plane_strides;
+  const DenseIndex m_in_row_strides;
+  const DenseIndex m_in_col_strides;
+  const DenseIndex m_plane_inflate_strides;
+  const DenseIndex m_row_inflate_strides;
+  const DenseIndex m_col_inflate_strides;
+  const bool m_padding_explicit;
+  const DenseIndex m_padding_top_z;
+  const DenseIndex m_padding_bottom_z;
+  const DenseIndex m_padding_top;
+  const DenseIndex m_padding_bottom;
+  const DenseIndex m_padding_left;
+  const DenseIndex m_padding_right;
+  const PaddingType m_padding_type;
+  const Scalar m_padding_value;
+};
+
+// Eval as rvalue
+template <DenseIndex Planes, DenseIndex Rows, DenseIndex Cols, typename ArgType, typename Device>
+struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, Device> {
+  typedef TensorVolumePatchOp<Planes, Rows, Cols, ArgType> XprType;
+  typedef typename XprType::Index Index;
+  static constexpr int NumInputDims =
+      internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
+  static constexpr int NumDims = NumInputDims + 1;
+  typedef DSizes<Index, NumDims> Dimensions;
+  typedef std::remove_const_t<typename XprType::Scalar> Scalar;
+  typedef typename XprType::CoeffReturnType CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static constexpr int PacketSize = PacketType<CoeffReturnType, Device>::size;
+  typedef StorageMemory<CoeffReturnType, Device> Storage;
+  typedef typename Storage::Type EvaluatorPointerType;
+
+  static constexpr int Layout = TensorEvaluator<ArgType, Device>::Layout;
+  enum {
+    IsAligned = false,
+    PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
+    BlockAccess = false,
+    PreferBlockAccess = TensorEvaluator<ArgType, Device>::PreferBlockAccess,
+    CoordAccess = false,
+    RawAccess = false
+  };
+
+  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
+  typedef internal::TensorBlockNotImplemented TensorBlock;
+  //===--------------------------------------------------------------------===//
+
+  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) {
+    EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    m_paddingValue = op.padding_value();
+
+    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
+
+    // Cache a few variables.
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_inputDepth = input_dims[0];
+      m_inputPlanes = input_dims[1];
+      m_inputRows = input_dims[2];
+      m_inputCols = input_dims[3];
+    } else {
+      m_inputDepth = input_dims[NumInputDims - 1];
+      m_inputPlanes = input_dims[NumInputDims - 2];
+      m_inputRows = input_dims[NumInputDims - 3];
+      m_inputCols = input_dims[NumInputDims - 4];
+    }
+
+    m_plane_strides = op.plane_strides();
+    m_row_strides = op.row_strides();
+    m_col_strides = op.col_strides();
+
+    // Input strides and effective input/patch size
+    m_in_plane_strides = op.in_plane_strides();
+    m_in_row_strides = op.in_row_strides();
+    m_in_col_strides = op.in_col_strides();
+    m_plane_inflate_strides = op.plane_inflate_strides();
+    m_row_inflate_strides = op.row_inflate_strides();
+    m_col_inflate_strides = op.col_inflate_strides();
+
+    // The "effective" spatial size after inflating data with zeros.
+    m_input_planes_eff = (m_inputPlanes - 1) * m_plane_inflate_strides + 1;
+    m_input_rows_eff = (m_inputRows - 1) * m_row_inflate_strides + 1;
+    m_input_cols_eff = (m_inputCols - 1) * m_col_inflate_strides + 1;
+    m_patch_planes_eff = op.patch_planes() + (op.patch_planes() - 1) * (m_in_plane_strides - 1);
+    m_patch_rows_eff = op.patch_rows() + (op.patch_rows() - 1) * (m_in_row_strides - 1);
+    m_patch_cols_eff = op.patch_cols() + (op.patch_cols() - 1) * (m_in_col_strides - 1);
+
+    if (op.padding_explicit()) {
+      m_outputPlanes =
+          numext::ceil((m_input_planes_eff + op.padding_top_z() + op.padding_bottom_z() - m_patch_planes_eff + 1.f) /
+                       static_cast<float>(m_plane_strides));
+      m_outputRows = numext::ceil((m_input_rows_eff + op.padding_top() + op.padding_bottom() - m_patch_rows_eff + 1.f) /
+                                  static_cast<float>(m_row_strides));
+      m_outputCols = numext::ceil((m_input_cols_eff + op.padding_left() + op.padding_right() - m_patch_cols_eff + 1.f) /
+                                  static_cast<float>(m_col_strides));
+      m_planePaddingTop = op.padding_top_z();
+      m_rowPaddingTop = op.padding_top();
+      m_colPaddingLeft = op.padding_left();
+    } else {
+      // Computing padding from the type
+      switch (op.padding_type()) {
+        case PADDING_VALID:
+          m_outputPlanes =
+              numext::ceil((m_input_planes_eff - m_patch_planes_eff + 1.f) / static_cast<float>(m_plane_strides));
+          m_outputRows = numext::ceil((m_input_rows_eff - m_patch_rows_eff + 1.f) / static_cast<float>(m_row_strides));
+          m_outputCols = numext::ceil((m_input_cols_eff - m_patch_cols_eff + 1.f) / static_cast<float>(m_col_strides));
+          m_planePaddingTop = 0;
+          m_rowPaddingTop = 0;
+          m_colPaddingLeft = 0;
+          break;
+        case PADDING_SAME: {
+          m_outputPlanes = numext::ceil(m_input_planes_eff / static_cast<float>(m_plane_strides));
+          m_outputRows = numext::ceil(m_input_rows_eff / static_cast<float>(m_row_strides));
+          m_outputCols = numext::ceil(m_input_cols_eff / static_cast<float>(m_col_strides));
+          const Index dz = (m_outputPlanes - 1) * m_plane_strides + m_patch_planes_eff - m_input_planes_eff;
+          const Index dy = (m_outputRows - 1) * m_row_strides + m_patch_rows_eff - m_input_rows_eff;
+          const Index dx = (m_outputCols - 1) * m_col_strides + m_patch_cols_eff - m_input_cols_eff;
+          m_planePaddingTop = dz / 2;
+          m_rowPaddingTop = dy / 2;
+          m_colPaddingLeft = dx / 2;
+          break;
+        }
+        default: {
+          eigen_assert(false && "unexpected padding");
+          return;
+        }
+      }
+    }
+    eigen_assert(m_outputRows > 0);
+    eigen_assert(m_outputCols > 0);
+    eigen_assert(m_outputPlanes > 0);
+
+    // Dimensions for result of extraction.
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      // ColMajor
+      // 0: depth
+      // 1: patch_planes
+      // 2: patch_rows
+      // 3: patch_cols
+      // 4: number of patches
+      // 5 and beyond: anything else (such as batch).
+      m_dimensions[0] = input_dims[0];
+      m_dimensions[1] = op.patch_planes();
+      m_dimensions[2] = op.patch_rows();
+      m_dimensions[3] = op.patch_cols();
+      m_dimensions[4] = m_outputPlanes * m_outputRows * m_outputCols;
+      for (int i = 5; i < NumDims; ++i) {
+        m_dimensions[i] = input_dims[i - 1];
+      }
+    } else {
+      // RowMajor
+      // NumDims-1: depth
+      // NumDims-2: patch_planes
+      // NumDims-3: patch_rows
+      // NumDims-4: patch_cols
+      // NumDims-5: number of patches
+      // NumDims-6 and beyond: anything else (such as batch).
+      m_dimensions[NumDims - 1] = input_dims[NumInputDims - 1];
+      m_dimensions[NumDims - 2] = op.patch_planes();
+      m_dimensions[NumDims - 3] = op.patch_rows();
+      m_dimensions[NumDims - 4] = op.patch_cols();
+      m_dimensions[NumDims - 5] = m_outputPlanes * m_outputRows * m_outputCols;
+      for (int i = NumDims - 6; i >= 0; --i) {
+        m_dimensions[i] = input_dims[i];
+      }
+    }
+
+    // Strides for the output tensor.
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_rowStride = m_dimensions[1];
+      m_colStride = m_dimensions[2] * m_rowStride;
+      m_patchStride = m_colStride * m_dimensions[3] * m_dimensions[0];
+      m_otherStride = m_patchStride * m_dimensions[4];
+    } else {
+      m_rowStride = m_dimensions[NumDims - 2];
+      m_colStride = m_dimensions[NumDims - 3] * m_rowStride;
+      m_patchStride = m_colStride * m_dimensions[NumDims - 4] * m_dimensions[NumDims - 1];
+      m_otherStride = m_patchStride * m_dimensions[NumDims - 5];
+    }
+
+    // Strides for navigating through the input tensor.
+    m_planeInputStride = m_inputDepth;
+    m_rowInputStride = m_inputDepth * m_inputPlanes;
+    m_colInputStride = m_inputDepth * m_inputRows * m_inputPlanes;
+    m_otherInputStride = m_inputDepth * m_inputRows * m_inputCols * m_inputPlanes;
+
+    m_outputPlanesRows = m_outputPlanes * m_outputRows;
+
+    // Fast representations of different variables.
+    m_fastOtherStride = internal::TensorIntDivisor<Index>(m_otherStride);
+
+    m_fastPatchStride = internal::TensorIntDivisor<Index>(m_patchStride);
+    m_fastColStride = internal::TensorIntDivisor<Index>(m_colStride);
+    m_fastRowStride = internal::TensorIntDivisor<Index>(m_rowStride);
+    m_fastInputRowStride = internal::TensorIntDivisor<Index>(m_row_inflate_strides);
+    m_fastInputColStride = internal::TensorIntDivisor<Index>(m_col_inflate_strides);
+    m_fastInputPlaneStride = internal::TensorIntDivisor<Index>(m_plane_inflate_strides);
+    m_fastInputColsEff = internal::TensorIntDivisor<Index>(m_input_cols_eff);
+    m_fastOutputPlanes = internal::TensorIntDivisor<Index>(m_outputPlanes);
+    m_fastOutputPlanesRows = internal::TensorIntDivisor<Index>(m_outputPlanesRows);
+
+    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
+      m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[0]);
+    } else {
+      m_fastOutputDepth = internal::TensorIntDivisor<Index>(m_dimensions[NumDims - 1]);
+    }
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
+
+  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
+    m_impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+
+#ifdef EIGEN_USE_THREADS
+  template <typename EvalSubExprsCallback>
+  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(EvaluatorPointerType /*data*/, EvalSubExprsCallback done) {
+    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
+  }
+#endif  // EIGEN_USE_THREADS
+
+  EIGEN_STRONG_INLINE void cleanup() { m_impl.cleanup(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    // Patch index corresponding to the passed in index.
+    const Index patchIndex = index / m_fastPatchStride;
+
+    // Spatial offset within the patch. This has to be translated into 3D
+    // coordinates within the patch.
+    const Index patchOffset = (index - patchIndex * m_patchStride) / m_fastOutputDepth;
+
+    // Batch, etc.
+    const Index otherIndex = (NumDims == 5) ? 0 : index / m_fastOtherStride;
+    const Index patch3DIndex = (NumDims == 5) ? patchIndex : (index - otherIndex * m_otherStride) / m_fastPatchStride;
+
+    // Calculate column index in the input original tensor.
+    const Index colIndex = patch3DIndex / m_fastOutputPlanesRows;
+    const Index colOffset = patchOffset / m_fastColStride;
+    const Index inputCol = colIndex * m_col_strides + colOffset * m_in_col_strides - m_colPaddingLeft;
+    const Index origInputCol =
+        (m_col_inflate_strides == 1) ? inputCol : ((inputCol >= 0) ? (inputCol / m_fastInputColStride) : 0);
+    if (inputCol < 0 || inputCol >= m_input_cols_eff ||
+        ((m_col_inflate_strides != 1) && (inputCol != origInputCol * m_col_inflate_strides))) {
+      return Scalar(m_paddingValue);
+    }
+
+    // Calculate row index in the original input tensor.
+    const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes;
+    const Index rowOffset = (patchOffset - colOffset * m_colStride) / m_fastRowStride;
+    const Index inputRow = rowIndex * m_row_strides + rowOffset * m_in_row_strides - m_rowPaddingTop;
+    const Index origInputRow =
+        (m_row_inflate_strides == 1) ? inputRow : ((inputRow >= 0) ? (inputRow / m_fastInputRowStride) : 0);
+    if (inputRow < 0 || inputRow >= m_input_rows_eff ||
+        ((m_row_inflate_strides != 1) && (inputRow != origInputRow * m_row_inflate_strides))) {
+      return Scalar(m_paddingValue);
+    }
+
+    // Calculate plane index in the original input tensor.
+    const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex));
+    const Index planeOffset = patchOffset - colOffset * m_colStride - rowOffset * m_rowStride;
+    const Index inputPlane = planeIndex * m_plane_strides + planeOffset * m_in_plane_strides - m_planePaddingTop;
+    const Index origInputPlane =
+        (m_plane_inflate_strides == 1) ? inputPlane : ((inputPlane >= 0) ? (inputPlane / m_fastInputPlaneStride) : 0);
+    if (inputPlane < 0 || inputPlane >= m_input_planes_eff ||
+        ((m_plane_inflate_strides != 1) && (inputPlane != origInputPlane * m_plane_inflate_strides))) {
+      return Scalar(m_paddingValue);
+    }
+
+    const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+    const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
+
+    const Index inputIndex = depth + origInputRow * m_rowInputStride + origInputCol * m_colInputStride +
+                             origInputPlane * m_planeInputStride + otherIndex * m_otherInputStride;
+
+    return m_impl.coeff(inputIndex);
+  }
+
+  template <int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const {
+    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
+
+    if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 ||
+        m_in_plane_strides != 1 || m_plane_inflate_strides != 1) {
+      return packetWithPossibleZero(index);
+    }
+
+    const Index indices[2] = {index, index + PacketSize - 1};
+    const Index patchIndex = indices[0] / m_fastPatchStride;
+    if (patchIndex != indices[1] / m_fastPatchStride) {
+      return packetWithPossibleZero(index);
+    }
+    const Index otherIndex = (NumDims == 5) ? 0 : indices[0] / m_fastOtherStride;
+    eigen_assert(otherIndex == indices[1] / m_fastOtherStride);
+
+    // Find the offset of the element wrt the location of the first element.
+    Index first_entry = (indices[0] - patchIndex * m_patchStride) / m_fastOutputDepth;
+    Index second_entry = PacketSize == 1 ? first_entry : 
+                        (indices[1] - patchIndex * m_patchStride) / m_fastOutputDepth;
+
+    const Index patchOffsets[2] = {first_entry, second_entry};
+
+    const Index patch3DIndex =
+        (NumDims == 5) ? patchIndex : (indices[0] - otherIndex * m_otherStride) / m_fastPatchStride;
+    eigen_assert(patch3DIndex == (indices[1] - otherIndex * m_otherStride) / m_fastPatchStride);
+
+    const Index colIndex = patch3DIndex / m_fastOutputPlanesRows;
+    const Index colOffsets[2] = {patchOffsets[0] / m_fastColStride, patchOffsets[1] / m_fastColStride};
+
+    // Calculate col indices in the original input tensor.
+    const Index inputCols[2] = {colIndex * m_col_strides + colOffsets[0] - m_colPaddingLeft,
+                                colIndex * m_col_strides + colOffsets[1] - m_colPaddingLeft};
+    if (inputCols[1] < 0 || inputCols[0] >= m_inputCols) {
+      return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
+    }
+
+    if (inputCols[0] != inputCols[1]) {
+      return packetWithPossibleZero(index);
+    }
+
+    const Index rowIndex = (patch3DIndex - colIndex * m_outputPlanesRows) / m_fastOutputPlanes;
+    const Index rowOffsets[2] = {(patchOffsets[0] - colOffsets[0] * m_colStride) / m_fastRowStride,
+                                 (patchOffsets[1] - colOffsets[1] * m_colStride) / m_fastRowStride};
+    eigen_assert(rowOffsets[0] <= rowOffsets[1]);
+    // Calculate col indices in the original input tensor.
+    const Index inputRows[2] = {rowIndex * m_row_strides + rowOffsets[0] - m_rowPaddingTop,
+                                rowIndex * m_row_strides + rowOffsets[1] - m_rowPaddingTop};
+
+    if (inputRows[1] < 0 || inputRows[0] >= m_inputRows) {
+      return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
+    }
+
+    if (inputRows[0] != inputRows[1]) {
+      return packetWithPossibleZero(index);
+    }
+
+    const Index planeIndex = (patch3DIndex - m_outputPlanes * (colIndex * m_outputRows + rowIndex));
+    const Index planeOffsets[2] = {patchOffsets[0] - colOffsets[0] * m_colStride - rowOffsets[0] * m_rowStride,
+                                   patchOffsets[1] - colOffsets[1] * m_colStride - rowOffsets[1] * m_rowStride};
+    eigen_assert(planeOffsets[0] <= planeOffsets[1]);
+    const Index inputPlanes[2] = {planeIndex * m_plane_strides + planeOffsets[0] - m_planePaddingTop,
+                                  planeIndex * m_plane_strides + planeOffsets[1] - m_planePaddingTop};
+
+    if (inputPlanes[1] < 0 || inputPlanes[0] >= m_inputPlanes) {
+      return internal::pset1<PacketReturnType>(Scalar(m_paddingValue));
+    }
+
+    if (inputPlanes[0] >= 0 && inputPlanes[1] < m_inputPlanes) {
+      // no padding
+      const int depth_index = static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : NumDims - 1;
+      const Index depth = index - (index / m_fastOutputDepth) * m_dimensions[depth_index];
+      const Index inputIndex = depth + inputRows[0] * m_rowInputStride + inputCols[0] * m_colInputStride +
+                               m_planeInputStride * inputPlanes[0] + otherIndex * m_otherInputStride;
+      return m_impl.template packet<Unaligned>(inputIndex);
+    }
+
+    return packetWithPossibleZero(index);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
+    const double compute_cost =
+        10 * TensorOpCost::DivCost<Index>() + 21 * TensorOpCost::MulCost<Index>() + 8 * TensorOpCost::AddCost<Index>();
+    return TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
+
+  const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planePaddingTop() const { return m_planePaddingTop; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowPaddingTop() const { return m_rowPaddingTop; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colPaddingLeft() const { return m_colPaddingLeft; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputPlanes() const { return m_outputPlanes; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputRows() const { return m_outputRows; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index outputCols() const { return m_outputCols; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userPlaneStride() const { return m_plane_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userRowStride() const { return m_row_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userColStride() const { return m_col_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInPlaneStride() const { return m_in_plane_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInRowStride() const { return m_in_row_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index userInColStride() const { return m_in_col_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index planeInflateStride() const { return m_plane_inflate_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rowInflateStride() const { return m_row_inflate_strides; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index colInflateStride() const { return m_col_inflate_strides; }
+
+ protected:
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetWithPossibleZero(Index index) const {
+    EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[PacketSize];
+    EIGEN_UNROLL_LOOP
+    for (int i = 0; i < PacketSize; ++i) {
+      values[i] = coeff(index + i);
+    }
+    PacketReturnType rslt = internal::pload<PacketReturnType>(values);
+    return rslt;
+  }
+
+  Dimensions m_dimensions;
+
+  // Parameters passed to the constructor.
+  Index m_plane_strides;
+  Index m_row_strides;
+  Index m_col_strides;
+
+  Index m_outputPlanes;
+  Index m_outputRows;
+  Index m_outputCols;
+
+  Index m_planePaddingTop;
+  Index m_rowPaddingTop;
+  Index m_colPaddingLeft;
+
+  Index m_in_plane_strides;
+  Index m_in_row_strides;
+  Index m_in_col_strides;
+
+  Index m_plane_inflate_strides;
+  Index m_row_inflate_strides;
+  Index m_col_inflate_strides;
+
+  // Cached input size.
+  Index m_inputDepth;
+  Index m_inputPlanes;
+  Index m_inputRows;
+  Index m_inputCols;
+
+  // Other cached variables.
+  Index m_outputPlanesRows;
+
+  // Effective input/patch post-inflation size.
+  Index m_input_planes_eff;
+  Index m_input_rows_eff;
+  Index m_input_cols_eff;
+  Index m_patch_planes_eff;
+  Index m_patch_rows_eff;
+  Index m_patch_cols_eff;
+
+  // Strides for the output tensor.
+  Index m_otherStride;
+  Index m_patchStride;
+  Index m_rowStride;
+  Index m_colStride;
+
+  // Strides for the input tensor.
+  Index m_planeInputStride;
+  Index m_rowInputStride;
+  Index m_colInputStride;
+  Index m_otherInputStride;
+
+  internal::TensorIntDivisor<Index> m_fastOtherStride;
+  internal::TensorIntDivisor<Index> m_fastPatchStride;
+  internal::TensorIntDivisor<Index> m_fastColStride;
+  internal::TensorIntDivisor<Index> m_fastRowStride;
+  internal::TensorIntDivisor<Index> m_fastInputPlaneStride;
+  internal::TensorIntDivisor<Index> m_fastInputRowStride;
+  internal::TensorIntDivisor<Index> m_fastInputColStride;
+  internal::TensorIntDivisor<Index> m_fastInputColsEff;
+  internal::TensorIntDivisor<Index> m_fastOutputPlanesRows;
+  internal::TensorIntDivisor<Index> m_fastOutputPlanes;
+  internal::TensorIntDivisor<Index> m_fastOutputDepth;
+
+  Scalar m_paddingValue;
+
+  TensorEvaluator<ArgType, Device> m_impl;
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSOR_TENSOR_VOLUME_PATCH_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h b/inst/include/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h
new file mode 100644
index 00000000..ae5c4f4c
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/TensorSymmetry/DynamicSymmetry.h
@@ -0,0 +1,296 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H
+#define EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+class DynamicSGroup {
+ public:
+  inline explicit DynamicSGroup() : m_numIndices(1), m_elements(), m_generators(), m_globalFlags(0) {
+    m_elements.push_back(ge(Generator(0, 0, 0)));
+  }
+  inline DynamicSGroup(const DynamicSGroup& o)
+      : m_numIndices(o.m_numIndices),
+        m_elements(o.m_elements),
+        m_generators(o.m_generators),
+        m_globalFlags(o.m_globalFlags) {}
+  inline DynamicSGroup(DynamicSGroup&& o)
+      : m_numIndices(o.m_numIndices), m_elements(), m_generators(o.m_generators), m_globalFlags(o.m_globalFlags) {
+    std::swap(m_elements, o.m_elements);
+  }
+  inline DynamicSGroup& operator=(const DynamicSGroup& o) {
+    m_numIndices = o.m_numIndices;
+    m_elements = o.m_elements;
+    m_generators = o.m_generators;
+    m_globalFlags = o.m_globalFlags;
+    return *this;
+  }
+  inline DynamicSGroup& operator=(DynamicSGroup&& o) {
+    m_numIndices = o.m_numIndices;
+    std::swap(m_elements, o.m_elements);
+    m_generators = o.m_generators;
+    m_globalFlags = o.m_globalFlags;
+    return *this;
+  }
+
+  void add(int one, int two, int flags = 0);
+
+  template <typename Gen_>
+  inline void add(Gen_) {
+    add(Gen_::One, Gen_::Two, Gen_::Flags);
+  }
+  inline void addSymmetry(int one, int two) { add(one, two, 0); }
+  inline void addAntiSymmetry(int one, int two) { add(one, two, NegationFlag); }
+  inline void addHermiticity(int one, int two) { add(one, two, ConjugationFlag); }
+  inline void addAntiHermiticity(int one, int two) { add(one, two, NegationFlag | ConjugationFlag); }
+
+  template <typename Op, typename RV, typename Index, std::size_t N, typename... Args>
+  inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args) const {
+    eigen_assert(N >= m_numIndices &&
+                 "Can only apply symmetry group to objects that have at least the required amount of indices.");
+    for (std::size_t i = 0; i < size(); i++)
+      initial = Op::run(h_permute(i, idx, typename internal::gen_numeric_list<int, N>::type()), m_elements[i].flags,
+                        initial, std::forward<Args>(args)...);
+    return initial;
+  }
+
+  template <typename Op, typename RV, typename Index, typename... Args>
+  inline RV apply(const std::vector<Index>& idx, RV initial, Args&&... args) const {
+    eigen_assert(idx.size() >= m_numIndices &&
+                 "Can only apply symmetry group to objects that have at least the required amount of indices.");
+    for (std::size_t i = 0; i < size(); i++)
+      initial = Op::run(h_permute(i, idx), m_elements[i].flags, initial, std::forward<Args>(args)...);
+    return initial;
+  }
+
+  inline int globalFlags() const { return m_globalFlags; }
+  inline std::size_t size() const { return m_elements.size(); }
+
+  template <typename Tensor_, typename... IndexTypes>
+  inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(Tensor_& tensor,
+                                                                                   typename Tensor_::Index firstIndex,
+                                                                                   IndexTypes... otherIndices) const {
+    static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices,
+                  "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+    return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}});
+  }
+
+  template <typename Tensor_>
+  inline internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup> operator()(
+      Tensor_& tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices> const& indices) const {
+    return internal::tensor_symmetry_value_setter<Tensor_, DynamicSGroup>(tensor, *this, indices);
+  }
+
+ private:
+  struct GroupElement {
+    std::vector<int> representation;
+    int flags;
+    bool isId() const {
+      for (std::size_t i = 0; i < representation.size(); i++)
+        if (i != (size_t)representation[i]) return false;
+      return true;
+    }
+  };
+  struct Generator {
+    int one;
+    int two;
+    int flags;
+    constexpr Generator(int one_, int two_, int flags_) : one(one_), two(two_), flags(flags_) {}
+  };
+
+  std::size_t m_numIndices;
+  std::vector<GroupElement> m_elements;
+  std::vector<Generator> m_generators;
+  int m_globalFlags;
+
+  template <typename Index, std::size_t N, int... n>
+  inline std::array<Index, N> h_permute(std::size_t which, const std::array<Index, N>& idx,
+                                        internal::numeric_list<int, n...>) const {
+    return std::array<Index, N>{{idx[n >= m_numIndices ? n : m_elements[which].representation[n]]...}};
+  }
+
+  template <typename Index>
+  inline std::vector<Index> h_permute(std::size_t which, std::vector<Index> idx) const {
+    std::vector<Index> result;
+    result.reserve(idx.size());
+    for (auto k : m_elements[which].representation) result.push_back(idx[k]);
+    for (std::size_t i = m_numIndices; i < idx.size(); i++) result.push_back(idx[i]);
+    return result;
+  }
+
+  inline GroupElement ge(Generator const& g) const {
+    GroupElement result;
+    result.representation.reserve(m_numIndices);
+    result.flags = g.flags;
+    for (std::size_t k = 0; k < m_numIndices; k++) {
+      if (k == (std::size_t)g.one)
+        result.representation.push_back(g.two);
+      else if (k == (std::size_t)g.two)
+        result.representation.push_back(g.one);
+      else
+        result.representation.push_back(int(k));
+    }
+    return result;
+  }
+
+  GroupElement mul(GroupElement, GroupElement) const;
+  inline GroupElement mul(Generator g1, GroupElement g2) const { return mul(ge(g1), g2); }
+
+  inline GroupElement mul(GroupElement g1, Generator g2) const { return mul(g1, ge(g2)); }
+
+  inline GroupElement mul(Generator g1, Generator g2) const { return mul(ge(g1), ge(g2)); }
+
+  inline int findElement(GroupElement e) const {
+    for (auto ee : m_elements) {
+      if (ee.representation == e.representation) return ee.flags ^ e.flags;
+    }
+    return -1;
+  }
+
+  void updateGlobalFlags(int flagDiffOfSameGenerator);
+};
+
+// dynamic symmetry group that auto-adds the template parameters in the constructor
+template <typename... Gen>
+class DynamicSGroupFromTemplateArgs : public DynamicSGroup {
+ public:
+  inline DynamicSGroupFromTemplateArgs() : DynamicSGroup() { add_all(internal::type_list<Gen...>()); }
+  inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs const& other) : DynamicSGroup(other) {}
+  inline DynamicSGroupFromTemplateArgs(DynamicSGroupFromTemplateArgs&& other) : DynamicSGroup(other) {}
+  inline DynamicSGroupFromTemplateArgs<Gen...>& operator=(const DynamicSGroupFromTemplateArgs<Gen...>& o) {
+    DynamicSGroup::operator=(o);
+    return *this;
+  }
+  inline DynamicSGroupFromTemplateArgs<Gen...>& operator=(DynamicSGroupFromTemplateArgs<Gen...>&& o) {
+    DynamicSGroup::operator=(o);
+    return *this;
+  }
+
+ private:
+  template <typename Gen1, typename... GenNext>
+  inline void add_all(internal::type_list<Gen1, GenNext...>) {
+    add(Gen1());
+    add_all(internal::type_list<GenNext...>());
+  }
+
+  inline void add_all(internal::type_list<>) {}
+};
+
+inline DynamicSGroup::GroupElement DynamicSGroup::mul(GroupElement g1, GroupElement g2) const {
+  eigen_internal_assert(g1.representation.size() == m_numIndices);
+  eigen_internal_assert(g2.representation.size() == m_numIndices);
+
+  GroupElement result;
+  result.representation.reserve(m_numIndices);
+  for (std::size_t i = 0; i < m_numIndices; i++) {
+    int v = g2.representation[g1.representation[i]];
+    eigen_assert(v >= 0);
+    result.representation.push_back(v);
+  }
+  result.flags = g1.flags ^ g2.flags;
+  return result;
+}
+
+inline void DynamicSGroup::add(int one, int two, int flags) {
+  eigen_assert(one >= 0);
+  eigen_assert(two >= 0);
+  eigen_assert(one != two);
+
+  if ((std::size_t)one >= m_numIndices || (std::size_t)two >= m_numIndices) {
+    std::size_t newNumIndices = (one > two) ? one : two + 1;
+    for (auto& gelem : m_elements) {
+      gelem.representation.reserve(newNumIndices);
+      for (std::size_t i = m_numIndices; i < newNumIndices; i++) gelem.representation.push_back(i);
+    }
+    m_numIndices = newNumIndices;
+  }
+
+  Generator g{one, two, flags};
+  GroupElement e = ge(g);
+
+  /* special case for first generator */
+  if (m_elements.size() == 1) {
+    while (!e.isId()) {
+      m_elements.push_back(e);
+      e = mul(e, g);
+    }
+
+    if (e.flags > 0) updateGlobalFlags(e.flags);
+
+    // only add in case we didn't have identity
+    if (m_elements.size() > 1) m_generators.push_back(g);
+    return;
+  }
+
+  int p = findElement(e);
+  if (p >= 0) {
+    updateGlobalFlags(p);
+    return;
+  }
+
+  std::size_t coset_order = m_elements.size();
+  m_elements.push_back(e);
+  for (std::size_t i = 1; i < coset_order; i++) m_elements.push_back(mul(m_elements[i], e));
+  m_generators.push_back(g);
+
+  std::size_t coset_rep = coset_order;
+  do {
+    for (auto g : m_generators) {
+      e = mul(m_elements[coset_rep], g);
+      p = findElement(e);
+      if (p < 0) {
+        // element not yet in group
+        m_elements.push_back(e);
+        for (std::size_t i = 1; i < coset_order; i++) m_elements.push_back(mul(m_elements[i], e));
+      } else if (p > 0) {
+        updateGlobalFlags(p);
+      }
+    }
+    coset_rep += coset_order;
+  } while (coset_rep < m_elements.size());
+}
+
+inline void DynamicSGroup::updateGlobalFlags(int flagDiffOfSameGenerator) {
+  switch (flagDiffOfSameGenerator) {
+    case 0:
+    default:
+      // nothing happened
+      break;
+    case NegationFlag:
+      // every element is it's own negative => whole tensor is zero
+      m_globalFlags |= GlobalZeroFlag;
+      break;
+    case ConjugationFlag:
+      // every element is it's own conjugate => whole tensor is real
+      m_globalFlags |= GlobalRealFlag;
+      break;
+    case (NegationFlag | ConjugationFlag):
+      // every element is it's own negative conjugate => whole tensor is imaginary
+      m_globalFlags |= GlobalImagFlag;
+      break;
+      /* NOTE:
+       *   since GlobalZeroFlag == GlobalRealFlag | GlobalImagFlag, if one generator
+       *   causes the tensor to be real and the next one to be imaginary, this will
+       *   trivially give the correct result
+       */
+  }
+}
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSORSYMMETRY_DYNAMICSYMMETRY_H
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/inst/include/unsupported/Eigen/CXX11/src/TensorSymmetry/InternalHeaderCheck.h b/inst/include/unsupported/Eigen/CXX11/src/TensorSymmetry/InternalHeaderCheck.h
new file mode 100644
index 00000000..b1b2e14c
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/TensorSymmetry/InternalHeaderCheck.h
@@ -0,0 +1,4 @@
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_MODULE_H
+#error \
+    "Please include unsupported/Eigen/CXX11/TensorSymmetry instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h b/inst/include/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h
new file mode 100644
index 00000000..66a982bd
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/TensorSymmetry/StaticSymmetry.h
@@ -0,0 +1,223 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H
+#define EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename list>
+struct tensor_static_symgroup_permutate;
+
+template <int... nn>
+struct tensor_static_symgroup_permutate<numeric_list<int, nn...>> {
+  constexpr static std::size_t N = sizeof...(nn);
+
+  template <typename T>
+  constexpr static std::array<T, N> run(const std::array<T, N>& indices) {
+    return {{indices[nn]...}};
+  }
+};
+
+template <typename indices_, int flags_>
+struct tensor_static_symgroup_element {
+  typedef indices_ indices;
+  constexpr static int flags = flags_;
+};
+
+template <typename Gen, int N>
+struct tensor_static_symgroup_element_ctor {
+  typedef tensor_static_symgroup_element<typename gen_numeric_list_swapped_pair<int, N, Gen::One, Gen::Two>::type,
+                                         Gen::Flags>
+      type;
+};
+
+template <int N>
+struct tensor_static_symgroup_identity_ctor {
+  typedef tensor_static_symgroup_element<typename gen_numeric_list<int, N>::type, 0> type;
+};
+
+template <typename iib>
+struct tensor_static_symgroup_multiply_helper {
+  template <int... iia>
+  constexpr static numeric_list<int, get<iia, iib>::value...> helper(numeric_list<int, iia...>) {
+    return numeric_list<int, get<iia, iib>::value...>();
+  }
+};
+
+template <typename A, typename B>
+struct tensor_static_symgroup_multiply {
+ private:
+  typedef typename A::indices iia;
+  typedef typename B::indices iib;
+  constexpr static int ffa = A::flags;
+  constexpr static int ffb = B::flags;
+
+ public:
+  static_assert(iia::count == iib::count, "Cannot multiply symmetry elements with different number of indices.");
+
+  typedef tensor_static_symgroup_element<decltype(tensor_static_symgroup_multiply_helper<iib>::helper(iia())),
+                                         ffa ^ ffb>
+      type;
+};
+
+template <typename A, typename B>
+struct tensor_static_symgroup_equality {
+  typedef typename A::indices iia;
+  typedef typename B::indices iib;
+  constexpr static int ffa = A::flags;
+  constexpr static int ffb = B::flags;
+  static_assert(iia::count == iib::count, "Cannot compare symmetry elements with different number of indices.");
+
+  constexpr static bool value = is_same<iia, iib>::value;
+
+ private:
+  /* this should be zero if they are identical, or else the tensor
+   * will be forced to be pure real, pure imaginary or even pure zero
+   */
+  constexpr static int flags_cmp_ = ffa ^ ffb;
+
+  /* either they are not equal, then we don't care whether the flags
+   * match, or they are equal, and then we have to check
+   */
+  constexpr static bool is_zero = value && flags_cmp_ == NegationFlag;
+  constexpr static bool is_real = value && flags_cmp_ == ConjugationFlag;
+  constexpr static bool is_imag = value && flags_cmp_ == (NegationFlag | ConjugationFlag);
+
+ public:
+  constexpr static int global_flags =
+      (is_real ? GlobalRealFlag : 0) | (is_imag ? GlobalImagFlag : 0) | (is_zero ? GlobalZeroFlag : 0);
+};
+
+template <std::size_t NumIndices, typename... Gen>
+struct tensor_static_symgroup {
+  typedef StaticSGroup<Gen...> type;
+  constexpr static std::size_t size = type::static_size;
+};
+
+template <typename Index, std::size_t N, int... ii, int... jj>
+constexpr static std::array<Index, N> tensor_static_symgroup_index_permute(std::array<Index, N> idx,
+                                                                           internal::numeric_list<int, ii...>,
+                                                                           internal::numeric_list<int, jj...>) {
+  return {{idx[ii]..., idx[jj]...}};
+}
+
+template <typename Index, int... ii>
+static inline std::vector<Index> tensor_static_symgroup_index_permute(std::vector<Index> idx,
+                                                                      internal::numeric_list<int, ii...>) {
+  std::vector<Index> result{{idx[ii]...}};
+  std::size_t target_size = idx.size();
+  for (std::size_t i = result.size(); i < target_size; i++) result.push_back(idx[i]);
+  return result;
+}
+
+template <typename T>
+struct tensor_static_symgroup_do_apply;
+
+template <typename first, typename... next>
+struct tensor_static_symgroup_do_apply<internal::type_list<first, next...>> {
+  template <typename Op, typename RV, std::size_t SGNumIndices, typename Index, std::size_t NumIndices,
+            typename... Args>
+  static inline RV run(const std::array<Index, NumIndices>& idx, RV initial, Args&&... args) {
+    static_assert(NumIndices >= SGNumIndices,
+                  "Can only apply symmetry group to objects that have at least the required amount of indices.");
+    typedef typename internal::gen_numeric_list<int, NumIndices - SGNumIndices, SGNumIndices>::type remaining_indices;
+    initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices(), remaining_indices()),
+                      first::flags, initial, std::forward<Args>(args)...);
+    return tensor_static_symgroup_do_apply<internal::type_list<next...>>::template run<Op, RV, SGNumIndices>(
+        idx, initial, args...);
+  }
+
+  template <typename Op, typename RV, std::size_t SGNumIndices, typename Index, typename... Args>
+  static inline RV run(const std::vector<Index>& idx, RV initial, Args&&... args) {
+    eigen_assert(idx.size() >= SGNumIndices &&
+                 "Can only apply symmetry group to objects that have at least the required amount of indices.");
+    initial = Op::run(tensor_static_symgroup_index_permute(idx, typename first::indices()), first::flags, initial,
+                      std::forward<Args>(args)...);
+    return tensor_static_symgroup_do_apply<internal::type_list<next...>>::template run<Op, RV, SGNumIndices>(
+        idx, initial, args...);
+  }
+};
+
+template <EIGEN_TPL_PP_SPEC_HACK_DEF(typename, empty)>
+struct tensor_static_symgroup_do_apply<internal::type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>> {
+  template <typename Op, typename RV, std::size_t SGNumIndices, typename Index, std::size_t NumIndices,
+            typename... Args>
+  static inline RV run(const std::array<Index, NumIndices>&, RV initial, Args&&...) {
+    // do nothing
+    return initial;
+  }
+
+  template <typename Op, typename RV, std::size_t SGNumIndices, typename Index, typename... Args>
+  static inline RV run(const std::vector<Index>&, RV initial, Args&&...) {
+    // do nothing
+    return initial;
+  }
+};
+
+}  // end namespace internal
+
+template <typename... Gen>
+class StaticSGroup {
+  constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices<Gen...>::value;
+  typedef internal::group_theory::enumerate_group_elements<
+      internal::tensor_static_symgroup_multiply, internal::tensor_static_symgroup_equality,
+      typename internal::tensor_static_symgroup_identity_ctor<NumIndices>::type,
+      internal::type_list<typename internal::tensor_static_symgroup_element_ctor<Gen, NumIndices>::type...>>
+      group_elements;
+  typedef typename group_elements::type ge;
+
+ public:
+  constexpr StaticSGroup() {}
+  constexpr StaticSGroup(const StaticSGroup<Gen...>&) {}
+  constexpr StaticSGroup(StaticSGroup<Gen...>&&) {}
+
+  template <typename Op, typename RV, typename Index, std::size_t N, typename... Args>
+  static inline RV apply(const std::array<Index, N>& idx, RV initial, Args&&... args) {
+    return internal::tensor_static_symgroup_do_apply<ge>::template run<Op, RV, NumIndices>(idx, initial, args...);
+  }
+
+  template <typename Op, typename RV, typename Index, typename... Args>
+  static inline RV apply(const std::vector<Index>& idx, RV initial, Args&&... args) {
+    eigen_assert(idx.size() == NumIndices);
+    return internal::tensor_static_symgroup_do_apply<ge>::template run<Op, RV, NumIndices>(idx, initial, args...);
+  }
+
+  constexpr static std::size_t static_size = ge::count;
+
+  constexpr static std::size_t size() { return ge::count; }
+  constexpr static int globalFlags() { return group_elements::global_flags; }
+
+  template <typename Tensor_, typename... IndexTypes>
+  inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(
+      Tensor_& tensor, typename Tensor_::Index firstIndex, IndexTypes... otherIndices) const {
+    static_assert(sizeof...(otherIndices) + 1 == Tensor_::NumIndices,
+                  "Number of indices used to access a tensor coefficient must be equal to the rank of the tensor.");
+    return operator()(tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices>{{firstIndex, otherIndices...}});
+  }
+
+  template <typename Tensor_>
+  inline internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>> operator()(
+      Tensor_& tensor, std::array<typename Tensor_::Index, Tensor_::NumIndices> const& indices) const {
+    return internal::tensor_symmetry_value_setter<Tensor_, StaticSGroup<Gen...>>(tensor, *this, indices);
+  }
+};
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSORSYMMETRY_STATICSYMMETRY_H
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/inst/include/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h b/inst/include/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h
new file mode 100644
index 00000000..2d3ff466
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/TensorSymmetry/Symmetry.h
@@ -0,0 +1,335 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
+#define EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+enum { NegationFlag = 0x01, ConjugationFlag = 0x02 };
+
+enum { GlobalRealFlag = 0x01, GlobalImagFlag = 0x02, GlobalZeroFlag = 0x03 };
+
+namespace internal {
+
+template <std::size_t NumIndices, typename... Sym>
+struct tensor_symmetry_pre_analysis;
+template <std::size_t NumIndices, typename... Sym>
+struct tensor_static_symgroup;
+template <bool instantiate, std::size_t NumIndices, typename... Sym>
+struct tensor_static_symgroup_if;
+template <typename Tensor_>
+struct tensor_symmetry_calculate_flags;
+template <typename Tensor_>
+struct tensor_symmetry_assign_value;
+template <typename... Sym>
+struct tensor_symmetry_num_indices;
+
+}  // end namespace internal
+
+template <int One_, int Two_>
+struct Symmetry {
+  static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
+  constexpr static int One = One_;
+  constexpr static int Two = Two_;
+  constexpr static int Flags = 0;
+};
+
+template <int One_, int Two_>
+struct AntiSymmetry {
+  static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
+  constexpr static int One = One_;
+  constexpr static int Two = Two_;
+  constexpr static int Flags = NegationFlag;
+};
+
+template <int One_, int Two_>
+struct Hermiticity {
+  static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
+  constexpr static int One = One_;
+  constexpr static int Two = Two_;
+  constexpr static int Flags = ConjugationFlag;
+};
+
+template <int One_, int Two_>
+struct AntiHermiticity {
+  static_assert(One_ != Two_, "Symmetries must cover distinct indices.");
+  constexpr static int One = One_;
+  constexpr static int Two = Two_;
+  constexpr static int Flags = ConjugationFlag | NegationFlag;
+};
+
+/** \class DynamicSGroup
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Dynamic symmetry group
+ *
+ * The %DynamicSGroup class represents a symmetry group that need not be known at
+ * compile time. It is useful if one wants to support arbitrary run-time defineable
+ * symmetries for tensors, but it is also instantiated if a symmetry group is defined
+ * at compile time that would be either too large for the compiler to reasonably
+ * generate (using templates to calculate this at compile time is very inefficient)
+ * or that the compiler could generate the group but that it wouldn't make sense to
+ * unroll the loop for setting coefficients anymore.
+ */
+class DynamicSGroup;
+
+/** \internal
+ *
+ * \class DynamicSGroupFromTemplateArgs
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Dynamic symmetry group, initialized from template arguments
+ *
+ * This class is a child class of DynamicSGroup. It uses the template arguments
+ * specified to initialize itself.
+ */
+template <typename... Gen>
+class DynamicSGroupFromTemplateArgs;
+
+/** \class StaticSGroup
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Static symmetry group
+ *
+ * This class represents a symmetry group that is known and resolved completely
+ * at compile time. Ideally, no run-time penalty is incurred compared to the
+ * manual unrolling of the symmetry.
+ *
+ * <b><i>CAUTION:</i></b>
+ *
+ * Do not use this class directly for large symmetry groups. The compiler
+ * may run into a limit, or segfault or in the very least will take a very,
+ * very, very long time to compile the code. Use the SGroup class instead
+ * if you want a static group. That class contains logic that will
+ * automatically select the DynamicSGroup class instead if the symmetry
+ * group becomes too large. (In that case, unrolling may not even be
+ * beneficial.)
+ */
+template <typename... Gen>
+class StaticSGroup;
+
+/** \class SGroup
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Symmetry group, initialized from template arguments
+ *
+ * This class represents a symmetry group whose generators are already
+ * known at compile time. It may or may not be resolved at compile time,
+ * depending on the estimated size of the group.
+ *
+ * \sa StaticSGroup
+ * \sa DynamicSGroup
+ */
+template <typename... Gen>
+class SGroup : public internal::tensor_symmetry_pre_analysis<internal::tensor_symmetry_num_indices<Gen...>::value,
+                                                             Gen...>::root_type {
+ public:
+  constexpr static std::size_t NumIndices = internal::tensor_symmetry_num_indices<Gen...>::value;
+  typedef typename internal::tensor_symmetry_pre_analysis<NumIndices, Gen...>::root_type Base;
+
+  // make standard constructors + assignment operators public
+  inline SGroup() : Base() {}
+  inline SGroup(const SGroup<Gen...>& other) : Base(other) {}
+  inline SGroup(SGroup<Gen...>&& other) : Base(other) {}
+  inline SGroup<Gen...>& operator=(const SGroup<Gen...>& other) {
+    Base::operator=(other);
+    return *this;
+  }
+  inline SGroup<Gen...>& operator=(SGroup<Gen...>&& other) {
+    Base::operator=(other);
+    return *this;
+  }
+
+  // all else is defined in the base class
+};
+
+namespace internal {
+
+template <typename... Sym>
+struct tensor_symmetry_num_indices {
+  constexpr static std::size_t value = 1;
+};
+
+template <int One_, int Two_, typename... Sym>
+struct tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {
+ private:
+  constexpr static std::size_t One = static_cast<std::size_t>(One_);
+  constexpr static std::size_t Two = static_cast<std::size_t>(Two_);
+  constexpr static std::size_t Three = tensor_symmetry_num_indices<Sym...>::value;
+
+  // don't use std::max, since it's not constexpr until C++14...
+  constexpr static std::size_t maxOneTwoPlusOne = ((One > Two) ? One : Two) + 1;
+
+ public:
+  constexpr static std::size_t value = (maxOneTwoPlusOne > Three) ? maxOneTwoPlusOne : Three;
+};
+
+template <int One_, int Two_, typename... Sym>
+struct tensor_symmetry_num_indices<AntiSymmetry<One_, Two_>, Sym...>
+    : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {};
+template <int One_, int Two_, typename... Sym>
+struct tensor_symmetry_num_indices<Hermiticity<One_, Two_>, Sym...>
+    : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {};
+template <int One_, int Two_, typename... Sym>
+struct tensor_symmetry_num_indices<AntiHermiticity<One_, Two_>, Sym...>
+    : public tensor_symmetry_num_indices<Symmetry<One_, Two_>, Sym...> {};
+
+/** \internal
+ *
+ * \class tensor_symmetry_pre_analysis
+ * \ingroup TensorSymmetry_Module
+ *
+ * \brief Pre-select whether to use a static or dynamic symmetry group
+ *
+ * When a symmetry group could in principle be determined at compile time,
+ * this template implements the logic whether to actually do that or whether
+ * to rather defer that to runtime.
+ *
+ * The logic is as follows:
+ * <dl>
+ * <dt><b>No generators (trivial symmetry):</b></dt>
+ * <dd>Use a trivial static group. Ideally, this has no performance impact
+ *     compared to not using symmetry at all. In practice, this might not
+ *     be the case.</dd>
+ * <dt><b>More than 4 generators:</b></dt>
+ * <dd>Calculate the group at run time, it is likely far too large for the
+ *     compiler to be able to properly generate it in a realistic time.</dd>
+ * <dt><b>Up to and including 4 generators:</b></dt>
+ * <dd>Actually enumerate all group elements, but then check how many there
+ *     are. If there are more than 16, it is unlikely that unrolling the
+ *     loop (as is done in the static compile-time case) is sensible, so
+ *     use a dynamic group instead. If there are at most 16 elements, actually
+ *     use that static group. Note that the largest group with 4 generators
+ *     still compiles with reasonable resources.</dd>
+ * </dl>
+ *
+ * Note: Example compile time performance with g++-4.6 on an Intenl Core i5-3470
+ *       with 16 GiB RAM (all generators non-redundant and the subgroups don't
+ *       factorize):
+ *
+ *          # Generators          -O0 -ggdb               -O2
+ *          -------------------------------------------------------------------
+ *          1                 0.5 s  /   250 MiB     0.45s /   230 MiB
+ *          2                 0.5 s  /   260 MiB     0.5 s /   250 MiB
+ *          3                 0.65s  /   310 MiB     0.62s /   310 MiB
+ *          4                 2.2 s  /   860 MiB     1.7 s /   770 MiB
+ *          5               130   s  / 13000 MiB   120   s / 11000 MiB
+ *
+ * It is clear that everything is still very efficient up to 4 generators, then
+ * the memory and CPU requirements become unreasonable. Thus we only instantiate
+ * the template group theory logic if the number of generators supplied is 4 or
+ * lower, otherwise this will be forced to be done during runtime, where the
+ * algorithm is reasonably fast.
+ */
+template <std::size_t NumIndices>
+struct tensor_symmetry_pre_analysis<NumIndices> {
+  typedef StaticSGroup<> root_type;
+};
+
+template <std::size_t NumIndices, typename Gen_, typename... Gens_>
+struct tensor_symmetry_pre_analysis<NumIndices, Gen_, Gens_...> {
+  constexpr static std::size_t max_static_generators = 4;
+  constexpr static std::size_t max_static_elements = 16;
+  typedef tensor_static_symgroup_if<(sizeof...(Gens_) + 1 <= max_static_generators), NumIndices, Gen_, Gens_...> helper;
+  constexpr static std::size_t possible_size = helper::size;
+
+  typedef std::conditional_t<possible_size == 0 || possible_size >= max_static_elements,
+                             DynamicSGroupFromTemplateArgs<Gen_, Gens_...>, typename helper::type>
+      root_type;
+};
+
+template <bool instantiate, std::size_t NumIndices, typename... Gens>
+struct tensor_static_symgroup_if {
+  constexpr static std::size_t size = 0;
+  typedef void type;
+};
+
+template <std::size_t NumIndices, typename... Gens>
+struct tensor_static_symgroup_if<true, NumIndices, Gens...> : tensor_static_symgroup<NumIndices, Gens...> {};
+
+template <typename Tensor_>
+struct tensor_symmetry_assign_value {
+  typedef typename Tensor_::Index Index;
+  typedef typename Tensor_::Scalar Scalar;
+  constexpr static std::size_t NumIndices = Tensor_::NumIndices;
+
+  static inline int run(const std::array<Index, NumIndices>& transformed_indices, int transformation_flags, int dummy,
+                        Tensor_& tensor, const Scalar& value_) {
+    Scalar value(value_);
+    if (transformation_flags & ConjugationFlag) value = numext::conj(value);
+    if (transformation_flags & NegationFlag) value = -value;
+    tensor.coeffRef(transformed_indices) = value;
+    return dummy;
+  }
+};
+
+template <typename Tensor_>
+struct tensor_symmetry_calculate_flags {
+  typedef typename Tensor_::Index Index;
+  constexpr static std::size_t NumIndices = Tensor_::NumIndices;
+
+  static inline int run(const std::array<Index, NumIndices>& transformed_indices, int transform_flags,
+                        int current_flags, const std::array<Index, NumIndices>& orig_indices) {
+    if (transformed_indices == orig_indices) {
+      if (transform_flags & (ConjugationFlag | NegationFlag))
+        return current_flags | GlobalImagFlag;  // anti-hermitian diagonal
+      else if (transform_flags & ConjugationFlag)
+        return current_flags | GlobalRealFlag;  // hermitian diagonal
+      else if (transform_flags & NegationFlag)
+        return current_flags | GlobalZeroFlag;  // anti-symmetric diagonal
+    }
+    return current_flags;
+  }
+};
+
+template <typename Tensor_, typename Symmetry_, int Flags = 0>
+class tensor_symmetry_value_setter {
+ public:
+  typedef typename Tensor_::Index Index;
+  typedef typename Tensor_::Scalar Scalar;
+  constexpr static std::size_t NumIndices = Tensor_::NumIndices;
+
+  inline tensor_symmetry_value_setter(Tensor_& tensor, Symmetry_ const& symmetry,
+                                      std::array<Index, NumIndices> const& indices)
+      : m_tensor(tensor), m_symmetry(symmetry), m_indices(indices) {}
+
+  inline tensor_symmetry_value_setter<Tensor_, Symmetry_, Flags>& operator=(Scalar const& value) {
+    doAssign(value);
+    return *this;
+  }
+
+ private:
+  Tensor_& m_tensor;
+  Symmetry_ m_symmetry;
+  std::array<Index, NumIndices> m_indices;
+
+  inline void doAssign(Scalar const& value) {
+#ifdef EIGEN_TENSOR_SYMMETRY_CHECK_VALUES
+    int value_flags = m_symmetry.template apply<internal::tensor_symmetry_calculate_flags<Tensor_>, int>(
+        m_indices, m_symmetry.globalFlags(), m_indices);
+    if (value_flags & GlobalRealFlag) eigen_assert(numext::imag(value) == 0);
+    if (value_flags & GlobalImagFlag) eigen_assert(numext::real(value) == 0);
+#endif
+    m_symmetry.template apply<internal::tensor_symmetry_assign_value<Tensor_>, int>(m_indices, 0, m_tensor, value);
+  }
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSORSYMMETRY_SYMMETRY_H
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/inst/include/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h b/inst/include/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
new file mode 100644
index 00000000..aa16f3cd
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
@@ -0,0 +1,492 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
+#define EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
+
+// IWYU pragma: private
+#include "../InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+namespace group_theory {
+
+/** \internal
+ * \file CXX11/src/TensorSymmetry/util/TemplateGroupTheory.h
+ * This file contains C++ templates that implement group theory algorithms.
+ *
+ * The algorithms allow for a compile-time analysis of finite groups.
+ *
+ * Currently only Dimino's algorithm is implemented, which returns a list
+ * of all elements in a group given a set of (possibly redundant) generators.
+ * (One could also do that with the so-called orbital algorithm, but that
+ * is much more expensive and usually has no advantages.)
+ */
+
+/**********************************************************************
+ *                "Ok kid, here is where it gets complicated."
+ *                         - Amelia Pond in the "Doctor Who" episode
+ *                           "The Big Bang"
+ *
+ * Dimino's algorithm
+ * ==================
+ *
+ * The following is Dimino's algorithm in sequential form:
+ *
+ * Input: identity element, list of generators, equality check,
+ *        multiplication operation
+ * Output: list of group elements
+ *
+ * 1. add identity element
+ * 2. remove identities from list of generators
+ * 3. add all powers of first generator that aren't the
+ *    identity element
+ * 4. go through all remaining generators:
+ *        a. if generator is already in the list of elements
+ *                -> do nothing
+ *        b. otherwise
+ *                i.   remember current # of elements
+ *                     (i.e. the size of the current subgroup)
+ *                ii.  add all current elements (which includes
+ *                     the identity) each multiplied from right
+ *                     with the current generator to the group
+ *                iii. add all remaining cosets that are generated
+ *                     by products of the new generator with itself
+ *                     and all other generators seen so far
+ *
+ * In functional form, this is implemented as a long set of recursive
+ * templates that have a complicated relationship.
+ *
+ * The main interface for Dimino's algorithm is the template
+ * enumerate_group_elements. All lists are implemented as variadic
+ * type_list<typename...> and numeric_list<typename = int, int...>
+ * templates.
+ *
+ * 'Calling' templates is usually done via typedefs.
+ *
+ * This algorithm is an extended version of the basic version. The
+ * extension consists in the fact that each group element has a set
+ * of flags associated with it. Multiplication of two group elements
+ * with each other results in a group element whose flags are the
+ * XOR of the flags of the previous elements. Each time the algorithm
+ * notices that a group element it just calculated is already in the
+ * list of current elements, the flags of both will be compared and
+ * added to the so-called 'global flags' of the group.
+ *
+ * The rationale behind this extension is that this allows not only
+ * for the description of symmetries between tensor indices, but
+ * also allows for the description of hermiticity, antisymmetry and
+ * antihermiticity. Negation and conjugation each are specific bit
+ * in the flags value and if two different ways to reach a group
+ * element lead to two different flags, this poses a constraint on
+ * the allowed values of the resulting tensor. For example, if a
+ * group element is reach both with and without the conjugation
+ * flags, it is clear that the resulting tensor has to be real.
+ *
+ * Note that this flag mechanism is quite generic and may have other
+ * uses beyond tensor properties.
+ *
+ * IMPORTANT:
+ *     This algorithm assumes the group to be finite. If you try to
+ *     run it with a group that's infinite, the algorithm will only
+ *     terminate once you hit a compiler limit (max template depth).
+ *     Also note that trying to use this implementation to create a
+ *     very large group will probably either make you hit the same
+ *     limit, cause the compiler to segfault or at the very least
+ *     take a *really* long time (hours, days, weeks - sic!) to
+ *     compile. It is not recommended to plug in more than 4
+ *     generators, unless they are independent of each other.
+ */
+
+/** \internal
+ *
+ * \class strip_identities
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Cleanse a list of group elements of the identity element
+ *
+ * This template is used to make a first pass through all initial
+ * generators of Dimino's algorithm and remove the identity
+ * elements.
+ *
+ * \sa enumerate_group_elements
+ */
+template <template <typename, typename> class Equality, typename id, typename L>
+struct strip_identities;
+
+template <template <typename, typename> class Equality, typename id, typename t, typename... ts>
+struct strip_identities<Equality, id, type_list<t, ts...>> {
+  typedef std::conditional_t<
+      Equality<id, t>::value, typename strip_identities<Equality, id, type_list<ts...>>::type,
+      typename concat<type_list<t>, typename strip_identities<Equality, id, type_list<ts...>>::type>::type>
+      type;
+  constexpr static int global_flags =
+      Equality<id, t>::global_flags | strip_identities<Equality, id, type_list<ts...>>::global_flags;
+};
+
+template <template <typename, typename> class Equality, typename id EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, ts)>
+struct strip_identities<Equality, id, type_list<EIGEN_TPL_PP_SPEC_HACK_USE(ts)>> {
+  typedef type_list<> type;
+  constexpr static int global_flags = 0;
+};
+
+/** \internal
+ *
+ * \class dimino_first_step_elements_helper
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Recursive template that adds powers of the first generator to the list of group elements
+ *
+ * This template calls itself recursively to add powers of the first
+ * generator to the list of group elements. It stops if it reaches
+ * the identity element again.
+ *
+ * \sa enumerate_group_elements, dimino_first_step_elements
+ */
+template <template <typename, typename> class Multiply, template <typename, typename> class Equality, typename id,
+          typename g, typename current_element, typename elements,
+          bool dont_add_current_element  // = false
+          >
+struct dimino_first_step_elements_helper
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+    :  // recursive inheritance is too difficult for Doxygen
+       public dimino_first_step_elements_helper<Multiply, Equality, id, g, typename Multiply<current_element, g>::type,
+                                                typename concat<elements, type_list<current_element>>::type,
+                                                Equality<typename Multiply<current_element, g>::type, id>::value> {
+};
+
+template <template <typename, typename> class Multiply, template <typename, typename> class Equality, typename id,
+          typename g, typename current_element, typename elements>
+struct dimino_first_step_elements_helper<Multiply, Equality, id, g, current_element, elements, true>
+#endif  // EIGEN_PARSED_BY_DOXYGEN
+{
+  typedef elements type;
+  constexpr static int global_flags = Equality<current_element, id>::global_flags;
+};
+
+/** \internal
+ *
+ * \class dimino_first_step_elements
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Add all powers of the first generator to the list of group elements
+ *
+ * This template takes the first non-identity generator and generates the initial
+ * list of elements which consists of all powers of that generator. For a group
+ * with just one generated, it would be enumerated after this.
+ *
+ * \sa enumerate_group_elements
+ */
+template <template <typename, typename> class Multiply, template <typename, typename> class Equality, typename id,
+          typename generators>
+struct dimino_first_step_elements {
+  typedef typename get<0, generators>::type first_generator;
+  typedef typename skip<1, generators>::type next_generators;
+  typedef type_list<first_generator> generators_done;
+
+  typedef dimino_first_step_elements_helper<Multiply, Equality, id, first_generator, first_generator, type_list<id>,
+                                            false>
+      helper;
+  typedef typename helper::type type;
+  constexpr static int global_flags = helper::global_flags;
+};
+
+/** \internal
+ *
+ * \class dimino_get_coset_elements
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Generate all elements of a specific coset
+ *
+ * This template generates all the elements of a specific coset by
+ * multiplying all elements in the given subgroup with the new
+ * coset representative. Note that the first element of the
+ * subgroup is always the identity element, so the first element of
+ * the result of this template is going to be the coset
+ * representative itself.
+ *
+ * Note that this template accepts an additional boolean parameter
+ * that specifies whether to actually generate the coset (true) or
+ * just return an empty list (false).
+ *
+ * \sa enumerate_group_elements, dimino_add_cosets_for_rep
+ */
+template <template <typename, typename> class Multiply, typename sub_group_elements, typename new_coset_rep,
+          bool generate_coset  // = true
+          >
+struct dimino_get_coset_elements {
+  typedef typename apply_op_from_right<Multiply, new_coset_rep, sub_group_elements>::type type;
+};
+
+template <template <typename, typename> class Multiply, typename sub_group_elements, typename new_coset_rep>
+struct dimino_get_coset_elements<Multiply, sub_group_elements, new_coset_rep, false> {
+  typedef type_list<> type;
+};
+
+/** \internal
+ *
+ * \class dimino_add_cosets_for_rep
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Recursive template for adding coset spaces
+ *
+ * This template multiplies the coset representative with a generator
+ * from the list of previous generators. If the new element is not in
+ * the group already, it adds the corresponding coset. Finally it
+ * proceeds to call itself with the next generator from the list.
+ *
+ * \sa enumerate_group_elements, dimino_add_all_coset_spaces
+ */
+template <template <typename, typename> class Multiply, template <typename, typename> class Equality, typename id,
+          typename sub_group_elements, typename elements, typename generators, typename rep_element, int sub_group_size>
+struct dimino_add_cosets_for_rep;
+
+template <template <typename, typename> class Multiply, template <typename, typename> class Equality, typename id,
+          typename sub_group_elements, typename elements, typename g, typename... gs, typename rep_element,
+          int sub_group_size>
+struct dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements, elements, type_list<g, gs...>, rep_element,
+                                 sub_group_size> {
+  typedef typename Multiply<rep_element, g>::type new_coset_rep;
+  typedef contained_in_list_gf<Equality, new_coset_rep, elements> _cil;
+  constexpr static bool add_coset = !_cil::value;
+
+  typedef
+      typename dimino_get_coset_elements<Multiply, sub_group_elements, new_coset_rep, add_coset>::type coset_elements;
+
+  typedef dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements,
+                                    typename concat<elements, coset_elements>::type, type_list<gs...>, rep_element,
+                                    sub_group_size>
+      _helper;
+
+  typedef typename _helper::type type;
+  constexpr static int global_flags = _cil::global_flags | _helper::global_flags;
+
+  /* Note that we don't have to update global flags here, since
+   * we will only add these elements if they are not part of
+   * the group already. But that only happens if the coset rep
+   * is not already in the group, so the check for the coset rep
+   * will catch this.
+   */
+};
+
+template <template <typename, typename> class Multiply, template <typename, typename> class Equality, typename id,
+          typename sub_group_elements, typename elements EIGEN_TPL_PP_SPEC_HACK_DEFC(typename, empty),
+          typename rep_element, int sub_group_size>
+struct dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements, elements,
+                                 type_list<EIGEN_TPL_PP_SPEC_HACK_USE(empty)>, rep_element, sub_group_size> {
+  typedef elements type;
+  constexpr static int global_flags = 0;
+};
+
+/** \internal
+ *
+ * \class dimino_add_all_coset_spaces
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Recursive template for adding all coset spaces for a new generator
+ *
+ * This template tries to go through the list of generators (with
+ * the help of the dimino_add_cosets_for_rep template) as long as
+ * it still finds elements that are not part of the group and add
+ * the corresponding cosets.
+ *
+ * \sa enumerate_group_elements, dimino_add_cosets_for_rep
+ */
+template <template <typename, typename> class Multiply, template <typename, typename> class Equality, typename id,
+          typename sub_group_elements, typename elements, typename generators, int sub_group_size, int rep_pos,
+          bool stop_condition  // = false
+          >
+struct dimino_add_all_coset_spaces {
+  typedef typename get<rep_pos, elements>::type rep_element;
+  typedef dimino_add_cosets_for_rep<Multiply, Equality, id, sub_group_elements, elements, generators, rep_element,
+                                    sub_group_elements::count>
+      _ac4r;
+  typedef typename _ac4r::type new_elements;
+
+  constexpr static int new_rep_pos = rep_pos + sub_group_elements::count;
+  constexpr static bool new_stop_condition = new_rep_pos >= new_elements::count;
+
+  typedef dimino_add_all_coset_spaces<Multiply, Equality, id, sub_group_elements, new_elements, generators,
+                                      sub_group_size, new_rep_pos, new_stop_condition>
+      _helper;
+
+  typedef typename _helper::type type;
+  constexpr static int global_flags = _helper::global_flags | _ac4r::global_flags;
+};
+
+template <template <typename, typename> class Multiply, template <typename, typename> class Equality, typename id,
+          typename sub_group_elements, typename elements, typename generators, int sub_group_size, int rep_pos>
+struct dimino_add_all_coset_spaces<Multiply, Equality, id, sub_group_elements, elements, generators, sub_group_size,
+                                   rep_pos, true> {
+  typedef elements type;
+  constexpr static int global_flags = 0;
+};
+
+/** \internal
+ *
+ * \class dimino_add_generator
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Enlarge the group by adding a new generator.
+ *
+ * It accepts a boolean parameter that determines if the generator is redundant,
+ * i.e. was already seen in the group. In that case, it reduces to a no-op.
+ *
+ * \sa enumerate_group_elements, dimino_add_all_coset_spaces
+ */
+template <template <typename, typename> class Multiply, template <typename, typename> class Equality, typename id,
+          typename elements, typename generators_done, typename current_generator,
+          bool redundant  // = false
+          >
+struct dimino_add_generator {
+  /* this template is only called if the generator is not redundant
+   * => all elements of the group multiplied with the new generator
+   *    are going to be new elements of the most trivial coset space
+   */
+  typedef typename apply_op_from_right<Multiply, current_generator, elements>::type multiplied_elements;
+  typedef typename concat<elements, multiplied_elements>::type new_elements;
+
+  constexpr static int rep_pos = elements::count;
+
+  typedef dimino_add_all_coset_spaces<
+      Multiply, Equality, id,
+      elements,  // elements of previous subgroup
+      new_elements, typename concat<generators_done, type_list<current_generator>>::type,
+      elements::count,  // size of previous subgroup
+      rep_pos,
+      false  // don't stop (because rep_pos >= new_elements::count is always false at this point)
+      >
+      _helper;
+  typedef typename _helper::type type;
+  constexpr static int global_flags = _helper::global_flags;
+};
+
+template <template <typename, typename> class Multiply, template <typename, typename> class Equality, typename id,
+          typename elements, typename generators_done, typename current_generator>
+struct dimino_add_generator<Multiply, Equality, id, elements, generators_done, current_generator, true> {
+  // redundant case
+  typedef elements type;
+  constexpr static int global_flags = 0;
+};
+
+/** \internal
+ *
+ * \class dimino_add_remaining_generators
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Recursive template that adds all remaining generators to a group
+ *
+ * Loop through the list of generators that remain and successively
+ * add them to the group.
+ *
+ * \sa enumerate_group_elements, dimino_add_generator
+ */
+template <template <typename, typename> class Multiply, template <typename, typename> class Equality, typename id,
+          typename generators_done, typename remaining_generators, typename elements>
+struct dimino_add_remaining_generators {
+  typedef typename get<0, remaining_generators>::type first_generator;
+  typedef typename skip<1, remaining_generators>::type next_generators;
+
+  typedef contained_in_list_gf<Equality, first_generator, elements> _cil;
+
+  typedef dimino_add_generator<Multiply, Equality, id, elements, generators_done, first_generator, _cil::value> _helper;
+
+  typedef typename _helper::type new_elements;
+
+  typedef dimino_add_remaining_generators<Multiply, Equality, id,
+                                          typename concat<generators_done, type_list<first_generator>>::type,
+                                          next_generators, new_elements>
+      _next_iter;
+
+  typedef typename _next_iter::type type;
+  constexpr static int global_flags = _cil::global_flags | _helper::global_flags | _next_iter::global_flags;
+};
+
+template <template <typename, typename> class Multiply, template <typename, typename> class Equality, typename id,
+          typename generators_done, typename elements>
+struct dimino_add_remaining_generators<Multiply, Equality, id, generators_done, type_list<>, elements> {
+  typedef elements type;
+  constexpr static int global_flags = 0;
+};
+
+/** \internal
+ *
+ * \class enumerate_group_elements_noid
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Helper template that implements group element enumeration
+ *
+ * This is a helper template that implements the actual enumeration
+ * of group elements. This has been split so that the list of
+ * generators can be cleansed of the identity element before
+ * performing the actual operation.
+ *
+ * \sa enumerate_group_elements
+ */
+template <template <typename, typename> class Multiply, template <typename, typename> class Equality, typename id,
+          typename generators, int initial_global_flags = 0>
+struct enumerate_group_elements_noid {
+  typedef dimino_first_step_elements<Multiply, Equality, id, generators> first_step;
+  typedef typename first_step::type first_step_elements;
+
+  typedef dimino_add_remaining_generators<Multiply, Equality, id, typename first_step::generators_done,
+                                          typename first_step::next_generators,  // remaining_generators
+                                          typename first_step::type              // first_step elements
+                                          >
+      _helper;
+
+  typedef typename _helper::type type;
+  constexpr static int global_flags = initial_global_flags | first_step::global_flags | _helper::global_flags;
+};
+
+// in case when no generators are specified
+template <template <typename, typename> class Multiply, template <typename, typename> class Equality, typename id,
+          int initial_global_flags>
+struct enumerate_group_elements_noid<Multiply, Equality, id, type_list<>, initial_global_flags> {
+  typedef type_list<id> type;
+  constexpr static int global_flags = initial_global_flags;
+};
+
+/** \internal
+ *
+ * \class enumerate_group_elements
+ * \ingroup CXX11_TensorSymmetry_Module
+ *
+ * \brief Enumerate all elements in a finite group
+ *
+ * This template enumerates all elements in a finite group. It accepts
+ * the following template parameters:
+ *
+ * \tparam Multiply      The multiplication operation that multiplies two group elements
+ *                       with each other.
+ * \tparam Equality      The equality check operation that checks if two group elements
+ *                       are equal to another.
+ * \tparam id            The identity element
+ * \tparam Generators_   A list of (possibly redundant) generators of the group
+ */
+template <template <typename, typename> class Multiply, template <typename, typename> class Equality, typename id,
+          typename Generators_>
+struct enumerate_group_elements
+    : public enumerate_group_elements_noid<Multiply, Equality, id,
+                                           typename strip_identities<Equality, id, Generators_>::type,
+                                           strip_identities<Equality, id, Generators_>::global_flags> {};
+
+}  // end namespace group_theory
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11_TENSORSYMMETRY_TEMPLATEGROUPTHEORY_H
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/inst/include/unsupported/Eigen/MoreVectorization b/inst/include/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
similarity index 51%
rename from inst/include/unsupported/Eigen/MoreVectorization
rename to inst/include/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
index 470e7243..74b47ce1 100644
--- a/inst/include/unsupported/Eigen/MoreVectorization
+++ b/inst/include/unsupported/Eigen/CXX11/src/util/CXX11Meta.h
@@ -1,24 +1,18 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_MOREVECTORIZATION_MODULE_H
-#define EIGEN_MOREVECTORIZATION_MODULE_H
+#ifndef EIGEN_CXX11META_H
+#define EIGEN_CXX11META_H
 
-#include <Eigen/Core>
+#include <vector>
+#include "../../../../../Eigen/src/Core/util/EmulateArray.h"
 
-namespace Eigen {
+#include "CXX11Workarounds.h"
 
-/**
-  * \defgroup MoreVectorization More vectorization module
-  */
-
-}
-
-#include "src/MoreVectorization/MathFunctions.h"
-
-#endif // EIGEN_MOREVECTORIZATION_MODULE_H
+#endif  // EIGEN_CXX11META_H
diff --git a/inst/include/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h b/inst/include/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
new file mode 100644
index 00000000..632f4370
--- /dev/null
+++ b/inst/include/unsupported/Eigen/CXX11/src/util/CXX11Workarounds.h
@@ -0,0 +1,85 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Christian Seiler <christian@iwakd.de>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11WORKAROUNDS_H
+#define EIGEN_CXX11WORKAROUNDS_H
+
+/* COMPATIBILITY CHECKS
+ * (so users of compilers that are too old get some realistic error messages)
+ */
+#if defined(__INTEL_COMPILER) && (__INTEL_COMPILER < 1310)
+#error Intel Compiler only supports required C++ features since version 13.1.
+// note that most stuff in principle works with 13.0 but when combining
+// some features, at some point 13.0 will just fail with an internal assertion
+#elif defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER) && \
+    (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
+// G++ < 4.6 by default will continue processing the source files - even if we use #error to make
+// it error out. For this reason, we use the pragma to make sure G++ aborts at the first error
+// it sees. Unfortunately, that is still not our #error directive, but at least the output is
+// short enough the user has a chance to see that the compiler version is not sufficient for
+// the funky template mojo we use.
+#pragma GCC diagnostic error "-Wfatal-errors"
+#error GNU C++ Compiler (g++) only supports required C++ features since version 4.6.
+#endif
+
+namespace Eigen {
+
+namespace internal {
+
+/* std::get is only constexpr in C++14, not yet in C++11
+ */
+
+template <std::size_t I_, class T>
+constexpr T& array_get(std::vector<T>& a) {
+  return a[I_];
+}
+template <std::size_t I_, class T>
+constexpr T&& array_get(std::vector<T>&& a) {
+  return a[I_];
+}
+template <std::size_t I_, class T>
+constexpr T const& array_get(std::vector<T> const& a) {
+  return a[I_];
+}
+
+/* Suppose you have a template of the form
+ * template<typename T> struct X;
+ * And you want to specialize it in such a way:
+ *    template<typename S1, typename... SN> struct X<Foo<S1, SN...>> { ::: };
+ *    template<>                            struct X<Foo<>>          { ::: };
+ * This will work in Intel's compiler 13.0, but only to some extent in g++ 4.6, since
+ * g++ can only match templates called with parameter packs if the number of template
+ * arguments is not a fixed size (so inside the first specialization, referencing
+ * X<Foo<Sn...>> will fail in g++). On the other hand, g++ will accept the following:
+ *    template<typename S...> struct X<Foo<S...>> { ::: }:
+ * as an additional (!) specialization, which will then only match the empty case.
+ * But Intel's compiler 13.0 won't accept that, it will only accept the empty syntax,
+ * so we have to create a workaround for this.
+ */
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
+#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n) mt... n
+#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n) , EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n)
+#define EIGEN_TPL_PP_SPEC_HACK_USE(n) n...
+#define EIGEN_TPL_PP_SPEC_HACK_USEC(n) , n...
+#else
+#define EIGEN_TPL_PP_SPEC_HACK_DEF(mt, n)
+#define EIGEN_TPL_PP_SPEC_HACK_DEFC(mt, n)
+#define EIGEN_TPL_PP_SPEC_HACK_USE(n)
+#define EIGEN_TPL_PP_SPEC_HACK_USEC(n)
+#endif
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_CXX11WORKAROUNDS_H
+
+/*
+ * kate: space-indent on; indent-width 2; mixedindent off; indent-mode cstyle;
+ */
diff --git a/inst/include/unsupported/Eigen/EulerAngles b/inst/include/unsupported/Eigen/EulerAngles
new file mode 100644
index 00000000..e95dd81a
--- /dev/null
+++ b/inst/include/unsupported/Eigen/EulerAngles
@@ -0,0 +1,44 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERANGLES_MODULE_H
+#define EIGEN_EULERANGLES_MODULE_H
+
+#include "../../Eigen/Core"
+#include "../../Eigen/Geometry"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+namespace Eigen {
+
+/**
+ * \defgroup EulerAngles_Module EulerAngles module
+ * \brief This module provides generic euler angles rotation.
+ *
+ * Euler angles are a way to represent 3D rotation.
+ *
+ * In order to use this module in your code, include this header:
+ * \code
+ * #include <unsupported/Eigen/EulerAngles>
+ * \endcode
+ *
+ * See \ref EulerAngles for more information.
+ *
+ */
+
+}
+
+// IWYU pragma: begin_exports
+#include "src/EulerAngles/EulerSystem.h"
+#include "src/EulerAngles/EulerAngles.h"
+// IWYU pragma: end_exports
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif  // EIGEN_EULERANGLES_MODULE_H
diff --git a/inst/include/unsupported/Eigen/FFT b/inst/include/unsupported/Eigen/FFT
index 2c45b399..bca5a3ee 100644
--- a/inst/include/unsupported/Eigen/FFT
+++ b/inst/include/unsupported/Eigen/FFT
@@ -1,5 +1,5 @@
 // This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. 
+// for linear algebra.
 //
 // Copyright (C) 2009 Mark Borgerding mark a borgerding net
 //
@@ -7,412 +7,424 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_FFT_H
-#define EIGEN_FFT_H
+#ifndef EIGEN_FFT_MODULE_H
+#define EIGEN_FFT_MODULE_H
 
 #include <complex>
 #include <vector>
 #include <map>
-#include <Eigen/Core>
-
+#include "../../Eigen/Core"
 
 /**
-  * \defgroup FFT_Module Fast Fourier Transform module
-  *
-  * \code
-  * #include <unsupported/Eigen/FFT>
-  * \endcode
-  *
-  * This module provides Fast Fourier transformation, with a configurable backend
-  * implementation.
-  *
-  * The default implementation is based on kissfft. It is a small, free, and
-  * reasonably efficient default.
-  *
-  * There are currently two implementation backend:
-  *
-  * - fftw (http://www.fftw.org) : faster, GPL -- incompatible with Eigen in LGPL form, bigger code size.
-  * - MKL (http://en.wikipedia.org/wiki/Math_Kernel_Library) : fastest, commercial -- may be incompatible with Eigen in GPL form.
-  *
-  * \section FFTDesign Design
-  *
-  * The following design decisions were made concerning scaling and
-  * half-spectrum for real FFT.
-  *
-  * The intent is to facilitate generic programming and ease migrating code
-  * from  Matlab/octave.
-  * We think the default behavior of Eigen/FFT should favor correctness and
-  * generality over speed. Of course, the caller should be able to "opt-out" from this
-  * behavior and get the speed increase if they want it.
-  *
-  * 1) %Scaling:
-  * Other libraries (FFTW,IMKL,KISSFFT)  do not perform scaling, so there
-  * is a constant gain incurred after the forward&inverse transforms , so 
-  * IFFT(FFT(x)) = Kx;  this is done to avoid a vector-by-value multiply.  
-  * The downside is that algorithms that worked correctly in Matlab/octave 
-  * don't behave the same way once implemented in C++.
-  *
-  * How Eigen/FFT differs: invertible scaling is performed so IFFT( FFT(x) ) = x. 
-  *
-  * 2) Real FFT half-spectrum
-  * Other libraries use only half the frequency spectrum (plus one extra 
-  * sample for the Nyquist bin) for a real FFT, the other half is the 
-  * conjugate-symmetric of the first half.  This saves them a copy and some 
-  * memory.  The downside is the caller needs to have special logic for the 
-  * number of bins in complex vs real.
-  *
-  * How Eigen/FFT differs: The full spectrum is returned from the forward 
-  * transform.  This facilitates generic template programming by obviating 
-  * separate specializations for real vs complex.  On the inverse
-  * transform, only half the spectrum is actually used if the output type is real.
-  */
- 
+ * \defgroup FFT_Module Fast Fourier Transform module
+ *
+ * \code
+ * #include <unsupported/Eigen/FFT>
+ * \endcode
+ *
+ * This module provides Fast Fourier transformation, with a configurable backend
+ * implementation.
+ *
+ * The default implementation is based on kissfft. It is a small, free, and
+ * reasonably efficient default.
+ *
+ * There are currently four implementation backend:
+ *
+ * - kissfft(https://github.com/mborgerding/kissfft) : Simple and not so fast, BSD-3-Clause.
+ *   It is a mixed-radix Fast Fourier Transform based up on the principle, "Keep It Simple, Stupid."
+ *   Notice that:kissfft fails to handle "atypically-sized" inputs(i.e., sizes with large factors),a workaround is using
+ * fftw or pocketfft.
+ * - fftw (http://www.fftw.org) : faster, GPL -- incompatible with Eigen in LGPL form, bigger code size.
+ * - MKL (https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl-download.html) : fastest, free -- may be
+ * incompatible with Eigen in GPL form.
+ * - PocketFFT/DUCC (https://gitlab.mpcdf.mpg.de/mtr/pocketfft, https://gitlab.mpcdf.mpg.de/mtr/ducc) : faster than kissfft, BSD 3-clause.
+ *   It is a heavily modified implementation of FFTPack, with the following advantages:
+ *   1.strictly C++11 compliant
+ *   2.more accurate twiddle factor computation
+ *   3.very fast plan generation
+ *   4.worst case complexity for transform sizes with large prime factors is N*log(N), because Bluestein's algorithm is
+ *   According to the author, DUCC contains the "evolution" of pocketfft, though the interface is very similar.
+ * used for these cases
+ *
+ * \section FFTDesign Design
+ *
+ * The following design decisions were made concerning scaling and
+ * half-spectrum for real FFT.
+ *
+ * The intent is to facilitate generic programming and ease migrating code
+ * from  Matlab/octave.
+ * We think the default behavior of Eigen/FFT should favor correctness and
+ * generality over speed. Of course, the caller should be able to "opt-out" from this
+ * behavior and get the speed increase if they want it.
+ *
+ * 1) %Scaling:
+ * Other libraries (FFTW,IMKL,KISSFFT)  do not perform scaling, so there
+ * is a constant gain incurred after the forward&inverse transforms , so
+ * IFFT(FFT(x)) = Kx;  this is done to avoid a vector-by-value multiply.
+ * The downside is that algorithms that worked correctly in Matlab/octave
+ * don't behave the same way once implemented in C++.
+ *
+ * How Eigen/FFT differs: invertible scaling is performed so IFFT( FFT(x) ) = x.
+ *
+ * 2) Real FFT half-spectrum
+ * Other libraries use only half the frequency spectrum (plus one extra
+ * sample for the Nyquist bin) for a real FFT, the other half is the
+ * conjugate-symmetric of the first half.  This saves them a copy and some
+ * memory.  The downside is the caller needs to have special logic for the
+ * number of bins in complex vs real.
+ *
+ * How Eigen/FFT differs: The full spectrum is returned from the forward
+ * transform.  This facilitates generic template programming by obviating
+ * separate specializations for real vs complex.  On the inverse
+ * transform, only half the spectrum is actually used if the output type is real.
+ */
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+// IWYU pragma: begin_exports
 
 #ifdef EIGEN_FFTW_DEFAULT
 // FFTW: faster, GPL -- incompatible with Eigen in LGPL form, bigger code size
-#  include <fftw3.h>
-#  include "src/FFT/ei_fftw_impl.h"
-   namespace Eigen {
-     //template <typename T> typedef struct internal::fftw_impl  default_fft_impl; this does not work
-     template <typename T> struct default_fft_impl : public internal::fftw_impl<T> {};
-   }
+#include <fftw3.h>
+#include "src/FFT/fftw_impl.h"
+namespace Eigen {
+// template <typename T> typedef struct internal::fftw_impl  default_fft_impl; this does not work
+template <typename T>
+struct default_fft_impl : public internal::fftw_impl<T> {};
+}  // namespace Eigen
 #elif defined EIGEN_MKL_DEFAULT
-// TODO 
-// intel Math Kernel Library: fastest, commercial -- may be incompatible with Eigen in GPL form
-#  include "src/FFT/ei_imklfft_impl.h"
-   namespace Eigen {
-     template <typename T> struct default_fft_impl : public internal::imklfft_impl {};
-   }
+// intel Math Kernel Library: fastest, free -- may be incompatible with Eigen in GPL form
+#include "src/FFT/imklfft_impl.h"
+namespace Eigen {
+template <typename T>
+struct default_fft_impl : public internal::imklfft::imklfft_impl<T> {};
+}  // namespace Eigen
+#elif defined EIGEN_POCKETFFT_DEFAULT
+// internal::pocketfft_impl: a heavily modified implementation of FFTPack, with many advantages.
+#include <pocketfft_hdronly.h>
+#include "src/FFT/pocketfft_impl.h"
+namespace Eigen {
+template <typename T>
+struct default_fft_impl : public internal::pocketfft_impl<T> {};
+}  // namespace Eigen
+#elif defined EIGEN_DUCCFFT_DEFAULT
+#include <ducc0/fft/fft.h>
+#include <ducc0/infra/string_utils.h>
+#include <ducc0/fft/fft.h>
+#include <ducc0/fft/fftnd_impl.h>
+#include "src/FFT/duccfft_impl.h"
+namespace Eigen {
+template <typename T>
+struct default_fft_impl : public internal::duccfft_impl<T> {};
+}  // namespace Eigen
 #else
 // internal::kissfft_impl:  small, free, reasonably efficient default, derived from kissfft
-//
-# include "src/FFT/ei_kissfft_impl.h"
-  namespace Eigen {
-     template <typename T> 
-       struct default_fft_impl : public internal::kissfft_impl<T> {};
-  }
+#include "src/FFT/kissfft_impl.h"
+namespace Eigen {
+template <typename T>
+struct default_fft_impl : public internal::kissfft_impl<T> {};
+}  // namespace Eigen
 #endif
 
+// IWYU pragma: end_exports
+
 namespace Eigen {
 
- 
-// 
-template<typename T_SrcMat,typename T_FftIfc> struct fft_fwd_proxy;
-template<typename T_SrcMat,typename T_FftIfc> struct fft_inv_proxy;
+//
+template <typename T_SrcMat, typename T_FftIfc>
+struct fft_fwd_proxy;
+template <typename T_SrcMat, typename T_FftIfc>
+struct fft_inv_proxy;
 
 namespace internal {
-template<typename T_SrcMat,typename T_FftIfc>
-struct traits< fft_fwd_proxy<T_SrcMat,T_FftIfc> >
-{
+template <typename T_SrcMat, typename T_FftIfc>
+struct traits<fft_fwd_proxy<T_SrcMat, T_FftIfc> > {
   typedef typename T_SrcMat::PlainObject ReturnType;
 };
-template<typename T_SrcMat,typename T_FftIfc>
-struct traits< fft_inv_proxy<T_SrcMat,T_FftIfc> >
-{
+template <typename T_SrcMat, typename T_FftIfc>
+struct traits<fft_inv_proxy<T_SrcMat, T_FftIfc> > {
   typedef typename T_SrcMat::PlainObject ReturnType;
 };
-}
+}  // namespace internal
 
-template<typename T_SrcMat,typename T_FftIfc> 
-struct fft_fwd_proxy
- : public ReturnByValue<fft_fwd_proxy<T_SrcMat,T_FftIfc> >
-{
+template <typename T_SrcMat, typename T_FftIfc>
+struct fft_fwd_proxy : public ReturnByValue<fft_fwd_proxy<T_SrcMat, T_FftIfc> > {
   typedef DenseIndex Index;
 
-  fft_fwd_proxy(const T_SrcMat& src,T_FftIfc & fft, Index nfft) : m_src(src),m_ifc(fft), m_nfft(nfft) {}
+  fft_fwd_proxy(const T_SrcMat& src, T_FftIfc& fft, Index nfft) : m_src(src), m_ifc(fft), m_nfft(nfft) {}
 
-  template<typename T_DestMat> void evalTo(T_DestMat& dst) const;
+  template <typename T_DestMat>
+  void evalTo(T_DestMat& dst) const;
 
   Index rows() const { return m_src.rows(); }
   Index cols() const { return m_src.cols(); }
-protected:
-  const T_SrcMat & m_src;
-  T_FftIfc & m_ifc;
+
+ protected:
+  const T_SrcMat& m_src;
+  T_FftIfc& m_ifc;
   Index m_nfft;
-private:
-  fft_fwd_proxy& operator=(const fft_fwd_proxy&);
 };
 
-template<typename T_SrcMat,typename T_FftIfc> 
-struct fft_inv_proxy
- : public ReturnByValue<fft_inv_proxy<T_SrcMat,T_FftIfc> >
-{
+template <typename T_SrcMat, typename T_FftIfc>
+struct fft_inv_proxy : public ReturnByValue<fft_inv_proxy<T_SrcMat, T_FftIfc> > {
   typedef DenseIndex Index;
 
-  fft_inv_proxy(const T_SrcMat& src,T_FftIfc & fft, Index nfft) : m_src(src),m_ifc(fft), m_nfft(nfft) {}
+  fft_inv_proxy(const T_SrcMat& src, T_FftIfc& fft, Index nfft) : m_src(src), m_ifc(fft), m_nfft(nfft) {}
 
-  template<typename T_DestMat> void evalTo(T_DestMat& dst) const;
+  template <typename T_DestMat>
+  void evalTo(T_DestMat& dst) const;
 
   Index rows() const { return m_src.rows(); }
   Index cols() const { return m_src.cols(); }
-protected:
-  const T_SrcMat & m_src;
-  T_FftIfc & m_ifc;
+
+ protected:
+  const T_SrcMat& m_src;
+  T_FftIfc& m_ifc;
   Index m_nfft;
-private:
-  fft_inv_proxy& operator=(const fft_inv_proxy&);
 };
 
+template <typename T_Scalar, typename T_Impl = default_fft_impl<T_Scalar> >
+class FFT {
+ public:
+  typedef T_Impl impl_type;
+  typedef DenseIndex Index;
+  typedef typename impl_type::Scalar Scalar;
+  typedef typename impl_type::Complex Complex;
+
+  using Flag = int;
+  static constexpr Flag Default = 0;
+  static constexpr Flag Unscaled = 1;
+  static constexpr Flag HalfSpectrum = 2;
+  static constexpr Flag Speedy = 32767;
+
+  FFT(const impl_type& impl = impl_type(), Flag flags = Default) : m_impl(impl), m_flag(flags) {
+    eigen_assert((flags == Default || flags == Unscaled || flags == HalfSpectrum || flags == Speedy) &&
+                 "invalid flags argument");
+  }
 
-template <typename T_Scalar,
-         typename T_Impl=default_fft_impl<T_Scalar> >
-class FFT
-{
-  public:
-    typedef T_Impl impl_type;
-    typedef DenseIndex Index;
-    typedef typename impl_type::Scalar Scalar;
-    typedef typename impl_type::Complex Complex;
-
-    enum Flag {
-      Default=0, // goof proof
-      Unscaled=1,
-      HalfSpectrum=2,
-      // SomeOtherSpeedOptimization=4
-      Speedy=32767
-    };
-
-    FFT( const impl_type & impl=impl_type() , Flag flags=Default ) :m_impl(impl),m_flag(flags) { }
-
-    inline
-    bool HasFlag(Flag f) const { return (m_flag & (int)f) == f;}
-
-    inline
-    void SetFlag(Flag f) { m_flag |= (int)f;}
-
-    inline
-    void ClearFlag(Flag f) { m_flag &= (~(int)f);}
-
-    inline
-    void fwd( Complex * dst, const Scalar * src, Index nfft)
-    {
-        m_impl.fwd(dst,src,static_cast<int>(nfft));
-        if ( HasFlag(HalfSpectrum) == false)
-          ReflectSpectrum(dst,nfft);
-    }
+  inline bool HasFlag(Flag f) const { return (m_flag & (int)f) == f; }
 
-    inline
-    void fwd( Complex * dst, const Complex * src, Index nfft)
-    {
-        m_impl.fwd(dst,src,static_cast<int>(nfft));
-    }
+  inline void SetFlag(Flag f) { m_flag |= (int)f; }
 
-    /*
-    inline 
-    void fwd2(Complex * dst, const Complex * src, int n0,int n1)
-    {
-      m_impl.fwd2(dst,src,n0,n1);
-    }
-    */
-
-    template <typename _Input>
-    inline
-    void fwd( std::vector<Complex> & dst, const std::vector<_Input> & src) 
-    {
-      if ( NumTraits<_Input>::IsComplex == 0 && HasFlag(HalfSpectrum) )
-        dst.resize( (src.size()>>1)+1); // half the bins + Nyquist bin
-      else
-        dst.resize(src.size());
-      fwd(&dst[0],&src[0],src.size());
-    }
+  inline void ClearFlag(Flag f) { m_flag &= (~(int)f); }
 
-    template<typename InputDerived, typename ComplexDerived>
-    inline
-    void fwd( MatrixBase<ComplexDerived> & dst, const MatrixBase<InputDerived> & src, Index nfft=-1)
-    {
-      typedef typename ComplexDerived::Scalar dst_type;
-      typedef typename InputDerived::Scalar src_type;
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(InputDerived)
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(ComplexDerived)
-      EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(ComplexDerived,InputDerived) // size at compile-time
-      EIGEN_STATIC_ASSERT((internal::is_same<dst_type, Complex>::value),
-            YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-      EIGEN_STATIC_ASSERT(int(InputDerived::Flags)&int(ComplexDerived::Flags)&DirectAccessBit,
-            THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES)
-
-      if (nfft<1)
-        nfft = src.size();
+  inline void fwd(Complex* dst, const Scalar* src, Index nfft) {
+    m_impl.fwd(dst, src, static_cast<int>(nfft));
+    if (HasFlag(HalfSpectrum) == false) ReflectSpectrum(dst, nfft);
+  }
 
-      if ( NumTraits< src_type >::IsComplex == 0 && HasFlag(HalfSpectrum) )
-        dst.derived().resize( (nfft>>1)+1);
-      else
-        dst.derived().resize(nfft);
-
-      if ( src.innerStride() != 1 || src.size() < nfft ) {
-        Matrix<src_type,1,Dynamic> tmp;
-        if (src.size()<nfft) {
-          tmp.setZero(nfft);
-          tmp.block(0,0,src.size(),1 ) = src;
-        }else{
-          tmp = src;
-        }
-        fwd( &dst[0],&tmp[0],nfft );
-      }else{
-        fwd( &dst[0],&src[0],nfft );
-      }
+  inline void fwd(Complex* dst, const Complex* src, Index nfft) { m_impl.fwd(dst, src, static_cast<int>(nfft)); }
+
+#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_DUCCFFT_DEFAULT || \
+    defined EIGEN_MKL_DEFAULT
+  inline void fwd2(Complex* dst, const Complex* src, int n0, int n1) { m_impl.fwd2(dst, src, n0, n1); }
+#endif
+
+  template <typename Input_>
+  inline void fwd(std::vector<Complex>& dst, const std::vector<Input_>& src) {
+    if (NumTraits<Input_>::IsComplex == 0 && HasFlag(HalfSpectrum))
+      dst.resize((src.size() >> 1) + 1);  // half the bins + Nyquist bin
+    else
+      dst.resize(src.size());
+    fwd(&dst[0], &src[0], src.size());
+  }
+
+  template <typename InputDerived, typename ComplexDerived>
+  inline void fwd(MatrixBase<ComplexDerived>& dst, const MatrixBase<InputDerived>& src, Index nfft = -1) {
+    typedef typename ComplexDerived::Scalar dst_type;
+    typedef typename InputDerived::Scalar src_type;
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(InputDerived)
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(ComplexDerived)
+    EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(ComplexDerived, InputDerived)  // size at compile-time
+    EIGEN_STATIC_ASSERT(
+        (internal::is_same<dst_type, Complex>::value),
+        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+    EIGEN_STATIC_ASSERT(int(InputDerived::Flags) & int(ComplexDerived::Flags) & DirectAccessBit,
+                        THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES)
+
+    if (nfft < 1) nfft = src.size();
+
+    Index dst_size = nfft;
+    if (NumTraits<src_type>::IsComplex == 0 && HasFlag(HalfSpectrum)) {
+      dst_size = (nfft >> 1) + 1;
     }
- 
-    template<typename InputDerived>
-    inline
-    fft_fwd_proxy< MatrixBase<InputDerived>, FFT<T_Scalar,T_Impl> >
-    fwd( const MatrixBase<InputDerived> & src, Index nfft=-1)
-    {
-      return fft_fwd_proxy< MatrixBase<InputDerived> ,FFT<T_Scalar,T_Impl> >( src, *this,nfft );
+    dst.derived().resize(dst_size);
+
+    if (src.innerStride() != 1 || src.size() < nfft) {
+      Matrix<src_type, 1, Dynamic> tmp;
+      if (src.size() < nfft) {
+        tmp.setZero(nfft);
+        tmp.block(0, 0, src.size(), 1) = src;
+      } else {
+        tmp = src;
+      }
+      if (dst.innerStride() != 1) {
+        Matrix<dst_type, 1, Dynamic> out(1, dst_size);
+        fwd(&out[0], &tmp[0], nfft);
+        dst.derived() = out;
+      } else {
+        fwd(&dst[0], &tmp[0], nfft);
+      }
+    } else {
+      if (dst.innerStride() != 1) {
+        Matrix<dst_type, 1, Dynamic> out(1, dst_size);
+        fwd(&out[0], &src[0], nfft);
+        dst.derived() = out;
+      } else {
+        fwd(&dst[0], &src[0], nfft);
+      }
     }
+  }
 
-    template<typename InputDerived>
-    inline
-    fft_inv_proxy< MatrixBase<InputDerived>, FFT<T_Scalar,T_Impl> >
-    inv( const MatrixBase<InputDerived> & src, Index nfft=-1)
-    {
-      return  fft_inv_proxy< MatrixBase<InputDerived> ,FFT<T_Scalar,T_Impl> >( src, *this,nfft );
-    }
+  template <typename InputDerived>
+  inline fft_fwd_proxy<MatrixBase<InputDerived>, FFT<T_Scalar, T_Impl> > fwd(const MatrixBase<InputDerived>& src,
+                                                                             Index nfft = -1) {
+    return fft_fwd_proxy<MatrixBase<InputDerived>, FFT<T_Scalar, T_Impl> >(src, *this, nfft);
+  }
 
-    inline
-    void inv( Complex * dst, const Complex * src, Index nfft)
-    {
-      m_impl.inv( dst,src,static_cast<int>(nfft) );
-      if ( HasFlag( Unscaled ) == false)
-        scale(dst,Scalar(1./nfft),nfft); // scale the time series
-    }
+  template <typename InputDerived>
+  inline fft_inv_proxy<MatrixBase<InputDerived>, FFT<T_Scalar, T_Impl> > inv(const MatrixBase<InputDerived>& src,
+                                                                             Index nfft = -1) {
+    return fft_inv_proxy<MatrixBase<InputDerived>, FFT<T_Scalar, T_Impl> >(src, *this, nfft);
+  }
 
-    inline
-    void inv( Scalar * dst, const Complex * src, Index nfft)
-    {
-      m_impl.inv( dst,src,static_cast<int>(nfft) );
-      if ( HasFlag( Unscaled ) == false)
-        scale(dst,Scalar(1./nfft),nfft); // scale the time series
-    }
+  inline void inv(Complex* dst, const Complex* src, Index nfft) {
+    m_impl.inv(dst, src, static_cast<int>(nfft));
+    if (HasFlag(Unscaled) == false) scale(dst, Scalar(1. / nfft), nfft);  // scale the time series
+  }
 
-    template<typename OutputDerived, typename ComplexDerived>
-    inline
-    void inv( MatrixBase<OutputDerived> & dst, const MatrixBase<ComplexDerived> & src, Index nfft=-1)
-    {
-      typedef typename ComplexDerived::Scalar src_type;
-      typedef typename OutputDerived::Scalar dst_type;
-      const bool realfft= (NumTraits<dst_type>::IsComplex == 0);
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(OutputDerived)
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(ComplexDerived)
-      EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(ComplexDerived,OutputDerived) // size at compile-time
-      EIGEN_STATIC_ASSERT((internal::is_same<src_type, Complex>::value),
-            YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
-      EIGEN_STATIC_ASSERT(int(OutputDerived::Flags)&int(ComplexDerived::Flags)&DirectAccessBit,
-            THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES)
-
-      if (nfft<1) { //automatic FFT size determination
-        if ( realfft && HasFlag(HalfSpectrum) ) 
-          nfft = 2*(src.size()-1); //assume even fft size
-        else
-          nfft = src.size();
-      }
-      dst.derived().resize( nfft );
-
-      // check for nfft that does not fit the input data size
-      Index resize_input= ( realfft && HasFlag(HalfSpectrum) )
-        ? ( (nfft/2+1) - src.size() )
-        : ( nfft - src.size() );
-
-      if ( src.innerStride() != 1 || resize_input ) {
-        // if the vector is strided, then we need to copy it to a packed temporary
-        Matrix<src_type,1,Dynamic> tmp;
-        if ( resize_input ) {
-          size_t ncopy = (std::min)(src.size(),src.size() + resize_input);
-          tmp.setZero(src.size() + resize_input);
-          if ( realfft && HasFlag(HalfSpectrum) ) {
-            // pad at the Nyquist bin
-            tmp.head(ncopy) = src.head(ncopy);
-            tmp(ncopy-1) = real(tmp(ncopy-1)); // enforce real-only Nyquist bin
-          }else{
-            size_t nhead,ntail;
-            nhead = 1+ncopy/2-1; // range  [0:pi)
-            ntail = ncopy/2-1;   // range (-pi:0)
-            tmp.head(nhead) = src.head(nhead);
-            tmp.tail(ntail) = src.tail(ntail);
-            if (resize_input<0) { //shrinking -- create the Nyquist bin as the average of the two bins that fold into it
-              tmp(nhead) = ( src(nfft/2) + src( src.size() - nfft/2 ) )*src_type(.5);
-            }else{ // expanding -- split the old Nyquist bin into two halves
-              tmp(nhead) = src(nhead) * src_type(.5);
-              tmp(tmp.size()-nhead) = tmp(nhead);
-            }
+  inline void inv(Scalar* dst, const Complex* src, Index nfft) {
+    m_impl.inv(dst, src, static_cast<int>(nfft));
+    if (HasFlag(Unscaled) == false) scale(dst, Scalar(1. / nfft), nfft);  // scale the time series
+  }
+
+  template <typename OutputDerived, typename ComplexDerived>
+  inline void inv(MatrixBase<OutputDerived>& dst, const MatrixBase<ComplexDerived>& src, Index nfft = -1) {
+    typedef typename ComplexDerived::Scalar src_type;
+    typedef typename ComplexDerived::RealScalar real_type;
+    typedef typename OutputDerived::Scalar dst_type;
+    const bool realfft = (NumTraits<dst_type>::IsComplex == 0);
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(OutputDerived)
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(ComplexDerived)
+    EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(ComplexDerived, OutputDerived)  // size at compile-time
+    EIGEN_STATIC_ASSERT(
+        (internal::is_same<src_type, Complex>::value),
+        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+    EIGEN_STATIC_ASSERT(int(OutputDerived::Flags) & int(ComplexDerived::Flags) & DirectAccessBit,
+                        THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES)
+
+    if (nfft < 1) {  // automatic FFT size determination
+      if (realfft && HasFlag(HalfSpectrum))
+        nfft = 2 * (src.size() - 1);  // assume even fft size
+      else
+        nfft = src.size();
+    }
+    dst.derived().resize(nfft);
+
+    // check for nfft that does not fit the input data size
+    Index resize_input = (realfft && HasFlag(HalfSpectrum)) ? ((nfft / 2 + 1) - src.size()) : (nfft - src.size());
+
+    if (src.innerStride() != 1 || resize_input) {
+      // if the vector is strided, then we need to copy it to a packed temporary
+      Matrix<src_type, 1, Dynamic> tmp;
+      if (resize_input) {
+        size_t ncopy = (std::min)(src.size(), src.size() + resize_input);
+        tmp.setZero(src.size() + resize_input);
+        if (realfft && HasFlag(HalfSpectrum)) {
+          // pad at the Nyquist bin
+          tmp.head(ncopy) = src.head(ncopy);
+          tmp(ncopy - 1) = real(tmp(ncopy - 1));  // enforce real-only Nyquist bin
+        } else {
+          size_t nhead, ntail;
+          nhead = 1 + ncopy / 2 - 1;  // range  [0:pi)
+          ntail = ncopy / 2 - 1;      // range (-pi:0)
+          tmp.head(nhead) = src.head(nhead);
+          tmp.tail(ntail) = src.tail(ntail);
+          if (resize_input <
+              0) {  // shrinking -- create the Nyquist bin as the average of the two bins that fold into it
+            tmp(nhead) = (src(nfft / 2) + src(src.size() - nfft / 2)) * real_type(.5);
+          } else {  // expanding -- split the old Nyquist bin into two halves
+            tmp(nhead) = src(nhead) * real_type(.5);
+            tmp(tmp.size() - nhead) = tmp(nhead);
           }
-        }else{
-          tmp = src;
         }
-        inv( &dst[0],&tmp[0], nfft);
-      }else{
-        inv( &dst[0],&src[0], nfft);
+      } else {
+        tmp = src;
       }
-    }
 
-    template <typename _Output>
-    inline
-    void inv( std::vector<_Output> & dst, const std::vector<Complex> & src,Index nfft=-1)
-    {
-      if (nfft<1)
-        nfft = ( NumTraits<_Output>::IsComplex == 0 && HasFlag(HalfSpectrum) ) ? 2*(src.size()-1) : src.size();
-      dst.resize( nfft );
-      inv( &dst[0],&src[0],nfft);
+      if (dst.innerStride() != 1) {
+        Matrix<dst_type, 1, Dynamic> out(1, nfft);
+        inv(&out[0], &tmp[0], nfft);
+        dst.derived() = out;
+      } else {
+        inv(&dst[0], &tmp[0], nfft);
+      }
+    } else {
+      if (dst.innerStride() != 1) {
+        Matrix<dst_type, 1, Dynamic> out(1, nfft);
+        inv(&out[0], &src[0], nfft);
+        dst.derived() = out;
+      } else {
+        inv(&dst[0], &src[0], nfft);
+      }
     }
+  }
 
+  template <typename Output_>
+  inline void inv(std::vector<Output_>& dst, const std::vector<Complex>& src, Index nfft = -1) {
+    if (nfft < 1)
+      nfft = (NumTraits<Output_>::IsComplex == 0 && HasFlag(HalfSpectrum)) ? 2 * (src.size() - 1) : src.size();
+    dst.resize(nfft);
+    inv(&dst[0], &src[0], nfft);
+  }
 
-    /*
-    // TODO: multi-dimensional FFTs
-    inline 
-    void inv2(Complex * dst, const Complex * src, int n0,int n1)
-    {
-      m_impl.inv2(dst,src,n0,n1);
-      if ( HasFlag( Unscaled ) == false)
-          scale(dst,1./(n0*n1),n0*n1);
-    }
-  */
+#if defined EIGEN_FFTW_DEFAULT || defined EIGEN_POCKETFFT_DEFAULT || defined EIGEN_DUCCFFT_DEFAULT || \
+    defined EIGEN_MKL_DEFAULT
+  inline void inv2(Complex* dst, const Complex* src, int n0, int n1) {
+    m_impl.inv2(dst, src, n0, n1);
+    if (HasFlag(Unscaled) == false) scale(dst, 1. / (n0 * n1), n0 * n1);
+  }
+#endif
 
-    inline
-    impl_type & impl() {return m_impl;}
-  private:
+  inline impl_type& impl() { return m_impl; }
 
-    template <typename T_Data>
-    inline
-    void scale(T_Data * x,Scalar s,Index nx)
-    {
+ private:
+  template <typename T_Data>
+  inline void scale(T_Data* x, Scalar s, Index nx) {
 #if 1
-      for (int k=0;k<nx;++k)
-        *x++ *= s;
+    for (int k = 0; k < nx; ++k) *x++ *= s;
 #else
-      if ( ((ptrdiff_t)x) & 15 )
-        Matrix<T_Data, Dynamic, 1>::Map(x,nx) *= s;
-      else
-        Matrix<T_Data, Dynamic, 1>::MapAligned(x,nx) *= s;
-         //Matrix<T_Data, Dynamic, Dynamic>::Map(x,nx) * s;
-#endif  
-    }
+    if (((ptrdiff_t)x) & 15)
+      Matrix<T_Data, Dynamic, 1>::Map(x, nx) *= s;
+    else
+      Matrix<T_Data, Dynamic, 1>::MapAligned(x, nx) *= s;
+#endif
+  }
 
-    inline
-    void ReflectSpectrum(Complex * freq, Index nfft)
-    {
-      // create the implicit right-half spectrum (conjugate-mirror of the left-half)
-      Index nhbins=(nfft>>1)+1;
-      for (Index k=nhbins;k < nfft; ++k )
-        freq[k] = conj(freq[nfft-k]);
-    }
+  inline void ReflectSpectrum(Complex* freq, Index nfft) {
+    // create the implicit right-half spectrum (conjugate-mirror of the left-half)
+    Index nhbins = (nfft >> 1) + 1;
+    for (Index k = nhbins; k < nfft; ++k) freq[k] = conj(freq[nfft - k]);
+  }
 
-    impl_type m_impl;
-    int m_flag;
+  impl_type m_impl;
+  int m_flag;
 };
 
-template<typename T_SrcMat,typename T_FftIfc> 
-template<typename T_DestMat> inline 
-void fft_fwd_proxy<T_SrcMat,T_FftIfc>::evalTo(T_DestMat& dst) const
-{
-    m_ifc.fwd( dst, m_src, m_nfft);
+template <typename T_SrcMat, typename T_FftIfc>
+template <typename T_DestMat>
+inline void fft_fwd_proxy<T_SrcMat, T_FftIfc>::evalTo(T_DestMat& dst) const {
+  m_ifc.fwd(dst, m_src, m_nfft);
 }
 
-template<typename T_SrcMat,typename T_FftIfc> 
-template<typename T_DestMat> inline 
-void fft_inv_proxy<T_SrcMat,T_FftIfc>::evalTo(T_DestMat& dst) const
-{
-    m_ifc.inv( dst, m_src, m_nfft);
+template <typename T_SrcMat, typename T_FftIfc>
+template <typename T_DestMat>
+inline void fft_inv_proxy<T_SrcMat, T_FftIfc>::evalTo(T_DestMat& dst) const {
+  m_ifc.inv(dst, m_src, m_nfft);
 }
 
-}
+}  // namespace Eigen
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
 #endif
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/inst/include/unsupported/Eigen/IterativeSolvers b/inst/include/unsupported/Eigen/IterativeSolvers
index aa15403d..4ae0975d 100644
--- a/inst/include/unsupported/Eigen/IterativeSolvers
+++ b/inst/include/unsupported/Eigen/IterativeSolvers
@@ -10,36 +10,83 @@
 #ifndef EIGEN_ITERATIVE_SOLVERS_MODULE_H
 #define EIGEN_ITERATIVE_SOLVERS_MODULE_H
 
-#include <Eigen/Sparse>
+#include "../../Eigen/Sparse"
+#include "../../Eigen/Jacobi"
+#include "../../Eigen/Householder"
 
 /**
-  * \defgroup IterativeSolvers_Module Iterative solvers module
-  * This module aims to provide various iterative linear and non linear solver algorithms.
-  * It currently provides:
-  *  - a constrained conjugate gradient
-  *  - a Householder GMRES implementation
-  * \code
-  * #include <unsupported/Eigen/IterativeSolvers>
-  * \endcode
-  */
-//@{
-
-#include "../../Eigen/src/misc/Solve.h"
-#include "../../Eigen/src/misc/SparseSolve.h"
+ * \defgroup IterativeLinearSolvers_Module IterativeLinearSolvers module
+ * This module aims to provide various iterative linear and non linear solver algorithms.
+ * It currently provides:
+ *  - a Householder GMRES implementation
+ *  - an IDR(s) implementation
+ *  - a BiCGSTAB(L) implementation
+ *  - a DGMRES implementation
+ *  - a MINRES implementation
+ *  - a IDRSTABL implementation
+ *
+ * Choosing the best solver for solving \c A \c x = \c b depends a lot on the preconditioner chosen as well as the
+ *properties of \c A. The following flowchart might help you.
+ * \dot width=50% 
+ * digraph g {
+ *   node [ fontname=Arial, fontsize=11];
+ *   edge [ fontname=Helvetica, fontsize=10 ];
+ *   A1[label="hermitian", shape="box"];
+ *   A2[label="positive definite", shape="box"];
+ *   CG[shape="plaintext"];
+ *   A3[label="ill conditioned", shape="box"];
+ *   A4[label="good preconditioner", shape="box"];
+ *   A5[label="flexible preconditioner", shape="box"];
+ *   A6[label="strongly indefinite", shape="box"];
+ *   A8[label="large imaginary eigenvalue", shape="box"];
+ *   A7[label="large imaginary eigenvalue",shape="box"];
+ *
+ *   SYMMLQ[shape="plaintext"];
+ *   MINRES[shape="plaintext"];
+ *   GCR[shape="plaintext"];
+ *   GMRES[shape="plaintext"];
+ *   IDRSTABL[shape="plaintext"];
+ *   IDRS[shape="plaintext"];
+ *   BICGSTABL[shape="plaintext"];
+ *   BICGSTAB[shape="plaintext"];
+ *
+ *	 A1 -> A2 [label="yes"];
+ *	 A2 -> CG [label="yes"];
+ *	 A2 -> A3 [label="no"];
+ *	 A3 -> SYMMLQ [label="yes"];
+ *	 A3 -> MINRES [label="no"];
+ *
+ *	 A1 -> A4 [label="no"];
+ *	 A4 -> A5 [label="yes"];
+ *	 A5 -> GCR [label="yes"];
+ *	 A5 -> GMRES [label="no"];
+ *
+ *	 A4 -> A6 [label="no"];
+ *	 A6 -> A8 [label="yes"];
+ *	 A6 -> A7 [label="no"];
+ *	 A7 -> BICGSTABL [label="yes"];
+ *	 A7 -> BICGSTAB [label="no"];
+ *	 A8 -> IDRSTABL [label="yes"];
+ *	 A8 -> IDRS [label="no"];
+ * }
+ * \enddot
+ * \code
+ * #include <unsupported/Eigen/IterativeSolvers>
+ * \endcode
+ */
 
-#ifndef EIGEN_MPL2_ONLY
-#include "src/IterativeSolvers/IterationController.h"
-#include "src/IterativeSolvers/ConstrainedConjGrad.h"
-#endif
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
+// IWYU pragma: begin_exports
 #include "src/IterativeSolvers/IncompleteLU.h"
-#include "../../Eigen/Jacobi"
-#include "../../Eigen/Householder"
 #include "src/IterativeSolvers/GMRES.h"
-#include "src/IterativeSolvers/IncompleteCholesky.h"
-//#include "src/IterativeSolvers/SSORPreconditioner.h"
+#include "src/IterativeSolvers/DGMRES.h"
 #include "src/IterativeSolvers/MINRES.h"
+#include "src/IterativeSolvers/IDRS.h"
+#include "src/IterativeSolvers/BiCGSTABL.h"
+#include "src/IterativeSolvers/IDRSTABL.h"
+// IWYU pragma: end_exports
 
-//@}
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_ITERATIVE_SOLVERS_MODULE_H
+#endif  // EIGEN_ITERATIVE_SOLVERS_MODULE_H
diff --git a/inst/include/unsupported/Eigen/KroneckerProduct b/inst/include/unsupported/Eigen/KroneckerProduct
index c932c06a..80432ea6 100644
--- a/inst/include/unsupported/Eigen/KroneckerProduct
+++ b/inst/include/unsupported/Eigen/KroneckerProduct
@@ -10,25 +10,26 @@
 #define EIGEN_KRONECKER_PRODUCT_MODULE_H
 
 #include "../../Eigen/Core"
-
-#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+#include "../../Eigen/SparseCore"
 
 namespace Eigen {
 
 /**
-  * \defgroup KroneckerProduct_Module KroneckerProduct module
-  *
-  * This module contains an experimental Kronecker product implementation.
-  *
-  * \code
-  * #include <Eigen/KroneckerProduct>
-  * \endcode
-  */
-
-} // namespace Eigen
-
+ * \defgroup KroneckerProduct_Module KroneckerProduct module
+ *
+ * This module contains an experimental Kronecker product implementation.
+ *
+ * \code
+ * #include <Eigen/KroneckerProduct>
+ * \endcode
+ */
+
+}  // namespace Eigen
+
+// IWYU pragma: begin_exports
 #include "src/KroneckerProduct/KroneckerTensorProduct.h"
+// IWYU pragma: end_exports
 
 #include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_KRONECKER_PRODUCT_MODULE_H
+#endif  // EIGEN_KRONECKER_PRODUCT_MODULE_H
diff --git a/inst/include/unsupported/Eigen/LevenbergMarquardt b/inst/include/unsupported/Eigen/LevenbergMarquardt
index 0fe2680b..72c6da2e 100644
--- a/inst/include/unsupported/Eigen/LevenbergMarquardt
+++ b/inst/include/unsupported/Eigen/LevenbergMarquardt
@@ -7,39 +7,43 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_LEVENBERGMARQUARDT_MODULE
-#define EIGEN_LEVENBERGMARQUARDT_MODULE
+#ifndef EIGEN_LEVENBERGMARQUARDT_MODULE_H
+#define EIGEN_LEVENBERGMARQUARDT_MODULE_H
 
 // #include <vector>
 
-#include <Eigen/Core>
-#include <Eigen/Jacobi>
-#include <Eigen/QR>
-#include <unsupported/Eigen/NumericalDiff> 
+#include "../../Eigen/Core"
+#include "../../Eigen/Jacobi"
+#include "../../Eigen/QR"
+#include "NumericalDiff"
 
-#include <Eigen/SparseQR>
+#include "../../Eigen/SparseQR"
 
 /**
-  * \defgroup LevenbergMarquardt_Module Levenberg-Marquardt module
-  *
-  * \code
-  * #include </Eigen/LevenbergMarquardt>
-  * \endcode
-  *
-  * 
-  */
-
-#include "Eigen/SparseCore"
-#ifndef EIGEN_PARSED_BY_DOXYGEN
+ * \defgroup LevenbergMarquardt_Module Levenberg-Marquardt module
+ *
+ * \code
+ * #include </Eigen/LevenbergMarquardt>
+ * \endcode
+ *
+ *
+ */
+
+#include "../../Eigen/SparseCore"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
+// IWYU pragma: begin_exports
+#ifndef EIGEN_PARSED_BY_DOXYGEN
 #include "src/LevenbergMarquardt/LMqrsolv.h"
 #include "src/LevenbergMarquardt/LMcovar.h"
 #include "src/LevenbergMarquardt/LMpar.h"
-
 #endif
 
 #include "src/LevenbergMarquardt/LevenbergMarquardt.h"
 #include "src/LevenbergMarquardt/LMonestep.h"
+// IWYU pragma: end_exports
 
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_LEVENBERGMARQUARDT_MODULE
+#endif  // EIGEN_LEVENBERGMARQUARDT_MODULE_H
diff --git a/inst/include/unsupported/Eigen/MPRealSupport b/inst/include/unsupported/Eigen/MPRealSupport
index d4b03647..322ff45f 100644
--- a/inst/include/unsupported/Eigen/MPRealSupport
+++ b/inst/include/unsupported/Eigen/MPRealSupport
@@ -12,11 +12,11 @@
 #ifndef EIGEN_MPREALSUPPORT_MODULE_H
 #define EIGEN_MPREALSUPPORT_MODULE_H
 
-#include <Eigen/Core>
+#include "../../Eigen/Core"
 #include <mpreal.h>
 
 namespace Eigen {
-  
+
 /**
   * \defgroup MPRealSupport_Module MPFRC++ Support module
   * \code
@@ -27,6 +27,8 @@ namespace Eigen {
   * via the <a href="http://www.holoborodko.com/pavel/mpfr">MPFR C++</a>
   * library which itself is built upon <a href="http://www.mpfr.org/">MPFR</a>/<a href="http://gmplib.org/">GMP</a>.
   *
+  * \warning MPFR C++ is licensed under the GPL.
+  *
   * You can find a copy of MPFR C++ that is known to be compatible in the unsupported/test/mpreal folder.
   *
   * Here is an example:
@@ -56,148 +58,157 @@ int main()
 \endcode
   *
   */
-	
-  template<> struct NumTraits<mpfr::mpreal>
-    : GenericNumTraits<mpfr::mpreal>
-  {
-    enum {
-      IsInteger = 0,
-      IsSigned = 1,
-      IsComplex = 0,
-      RequireInitialization = 1,
-      ReadCost = 10,
-      AddCost = 10,
-      MulCost = 40
-    };
-
-    typedef mpfr::mpreal Real;
-    typedef mpfr::mpreal NonInteger;
-    
-    inline static Real highest   (long Precision = mpfr::mpreal::get_default_prec())  { return  mpfr::maxval(Precision); }
-    inline static Real lowest    (long Precision = mpfr::mpreal::get_default_prec())  { return -mpfr::maxval(Precision); }
-
-    // Constants
-    inline static Real Pi       (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::const_pi(Precision);        }
-    inline static Real Euler    (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::const_euler(Precision);     }
-    inline static Real Log2     (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::const_log2(Precision);      }
-    inline static Real Catalan  (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::const_catalan(Precision);   }
-
-    inline static Real epsilon  (long Precision = mpfr::mpreal::get_default_prec())     {    return mpfr::machine_epsilon(Precision); }
-    inline static Real epsilon  (const Real& x)                                         {    return mpfr::machine_epsilon(x); }
-
-    inline static Real dummy_precision()   
-    { 
-        unsigned int weak_prec = ((mpfr::mpreal::get_default_prec()-1) * 90) / 100;
-        return mpfr::machine_epsilon(weak_prec);
-    }
+
+template <>
+struct NumTraits<mpfr::mpreal> : GenericNumTraits<mpfr::mpreal> {
+  enum {
+    IsInteger = 0,
+    IsSigned = 1,
+    IsComplex = 0,
+    RequireInitialization = 1,
+    ReadCost = HugeCost,
+    AddCost = HugeCost,
+    MulCost = HugeCost
   };
 
-  namespace internal {
+  typedef mpfr::mpreal Real;
+  typedef mpfr::mpreal NonInteger;
 
-  template<> inline mpfr::mpreal random<mpfr::mpreal>()
-  {
-    return mpfr::random();
-  }
+  static inline Real highest(long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::maxval(Precision); }
+  static inline Real lowest(long Precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(Precision); }
 
-  template<> inline mpfr::mpreal random<mpfr::mpreal>(const mpfr::mpreal& a, const mpfr::mpreal& b)
-  {
-    return a + (b-a) * random<mpfr::mpreal>();
+  // Constants
+  static inline Real Pi(long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_pi(Precision); }
+  static inline Real Euler(long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_euler(Precision); }
+  static inline Real Log2(long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_log2(Precision); }
+  static inline Real Catalan(long Precision = mpfr::mpreal::get_default_prec()) {
+    return mpfr::const_catalan(Precision);
   }
 
-  inline bool isMuchSmallerThan(const mpfr::mpreal& a, const mpfr::mpreal& b, const mpfr::mpreal& eps)
-  {
-    return mpfr::abs(a) <= mpfr::abs(b) * eps;
+  static inline Real epsilon(long Precision = mpfr::mpreal::get_default_prec()) {
+    return mpfr::machine_epsilon(Precision);
   }
+  static inline Real epsilon(const Real& x) { return mpfr::machine_epsilon(x); }
 
-  inline bool isApprox(const mpfr::mpreal& a, const mpfr::mpreal& b, const mpfr::mpreal& eps)
-  {
-    return mpfr::isEqualFuzzy(a,b,eps);
+#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS
+  static inline int digits10(long Precision = mpfr::mpreal::get_default_prec()) {
+    return std::numeric_limits<Real>::digits10(Precision);
+  }
+  static inline int digits10(const Real& x) { return std::numeric_limits<Real>::digits10(x); }
+ 
+  static inline int max_digits10(long Precision = mpfr::mpreal::get_default_prec()) {
+    return std::numeric_limits<Real>::max_digits10(Precision);
   }
 
-  inline bool isApproxOrLessThan(const mpfr::mpreal& a, const mpfr::mpreal& b, const mpfr::mpreal& eps)
-  {
-    return a <= b || mpfr::isEqualFuzzy(a,b,eps);
+  static inline int digits() { return std::numeric_limits<Real>::digits(); }
+  static inline int digits(const Real& x) { return std::numeric_limits<Real>::digits(x); }
+#endif
+
+  static inline Real dummy_precision() {
+    mpfr_prec_t weak_prec = ((mpfr::mpreal::get_default_prec() - 1) * 90) / 100;
+    return mpfr::machine_epsilon(weak_prec);
   }
+};
 
-  template<> inline long double cast<mpfr::mpreal,long double>(const mpfr::mpreal& x)
-  { return x.toLDouble(); }
-
-  template<> inline double cast<mpfr::mpreal,double>(const mpfr::mpreal& x)
-  { return x.toDouble(); }
-
-  template<> inline long cast<mpfr::mpreal,long>(const mpfr::mpreal& x)
-  { return x.toLong(); }
-
-  template<> inline int cast<mpfr::mpreal,int>(const mpfr::mpreal& x)
-  { return int(x.toLong()); }
-
-  // Specialize GEBP kernel and traits for mpreal (no need for peeling, nor complicated stuff)
-  // This also permits to directly call mpfr's routines and avoid many temporaries produced by mpreal
-    template<>
-    class gebp_traits<mpfr::mpreal, mpfr::mpreal, false, false>
-    {
-    public:
-      typedef mpfr::mpreal ResScalar;
-      enum {
-        nr = 2, // must be 2 for proper packing...
-        mr = 1,
-        WorkSpaceFactor = nr,
-        LhsProgress = 1,
-        RhsProgress = 1
-      };
-    };
-
-    template<typename Index, int mr, int nr, bool ConjugateLhs, bool ConjugateRhs>
-    struct gebp_kernel<mpfr::mpreal,mpfr::mpreal,Index,mr,nr,ConjugateLhs,ConjugateRhs>
-    {
-      typedef mpfr::mpreal mpreal;
-
-      EIGEN_DONT_INLINE
-      void operator()(mpreal* res, Index resStride, const mpreal* blockA, const mpreal* blockB, Index rows, Index depth, Index cols, mpreal alpha,
-                      Index strideA=-1, Index strideB=-1, Index offsetA=0, Index offsetB=0, mpreal* /*unpackedB*/ = 0)
-      {
-        mpreal acc1, acc2, tmp;
-        
-        if(strideA==-1) strideA = depth;
-        if(strideB==-1) strideB = depth;
-
-        for(Index j=0; j<cols; j+=nr)
-        {
-          Index actual_nr = (std::min<Index>)(nr,cols-j);
-          mpreal *C1 = res + j*resStride;
-          mpreal *C2 = res + (j+1)*resStride;
-          for(Index i=0; i<rows; i++)
-          {
-            mpreal *B = const_cast<mpreal*>(blockB) + j*strideB + offsetB*actual_nr;
-            mpreal *A = const_cast<mpreal*>(blockA) + i*strideA + offsetA;
-            acc1 = 0;
-            acc2 = 0;
-            for(Index k=0; k<depth; k++)
-            {
-              mpfr_mul(tmp.mpfr_ptr(), A[k].mpfr_ptr(), B[0].mpfr_ptr(), mpreal::get_default_rnd());
-              mpfr_add(acc1.mpfr_ptr(), acc1.mpfr_ptr(), tmp.mpfr_ptr(),  mpreal::get_default_rnd());
-              
-              if(actual_nr==2) {
-                mpfr_mul(tmp.mpfr_ptr(), A[k].mpfr_ptr(), B[1].mpfr_ptr(), mpreal::get_default_rnd());
-                mpfr_add(acc2.mpfr_ptr(), acc2.mpfr_ptr(), tmp.mpfr_ptr(),  mpreal::get_default_rnd());
-              }
-              
-              B+=actual_nr;
-            }
-            
-            mpfr_mul(acc1.mpfr_ptr(), acc1.mpfr_ptr(), alpha.mpfr_ptr(), mpreal::get_default_rnd());
-            mpfr_add(C1[i].mpfr_ptr(), C1[i].mpfr_ptr(), acc1.mpfr_ptr(),  mpreal::get_default_rnd());
-            
-            if(actual_nr==2) {
-              mpfr_mul(acc2.mpfr_ptr(), acc2.mpfr_ptr(), alpha.mpfr_ptr(), mpreal::get_default_rnd());
-              mpfr_add(C2[i].mpfr_ptr(), C2[i].mpfr_ptr(), acc2.mpfr_ptr(),  mpreal::get_default_rnd());
-            }
-          }
-        }
-      }
-    };
+namespace internal {
+
+template <>
+inline mpfr::mpreal random<mpfr::mpreal>() {
+  return mpfr::random();
+}
+
+template <>
+inline mpfr::mpreal random<mpfr::mpreal>(const mpfr::mpreal& a, const mpfr::mpreal& b) {
+  return a + (b - a) * random<mpfr::mpreal>();
+}
+
+inline bool isMuchSmallerThan(const mpfr::mpreal& a, const mpfr::mpreal& b, const mpfr::mpreal& eps) {
+  return mpfr::abs(a) <= mpfr::abs(b) * eps;
+}
+
+inline bool isApprox(const mpfr::mpreal& a, const mpfr::mpreal& b, const mpfr::mpreal& eps) {
+  return mpfr::isEqualFuzzy(a, b, eps);
+}
+
+inline bool isApproxOrLessThan(const mpfr::mpreal& a, const mpfr::mpreal& b, const mpfr::mpreal& eps) {
+  return a <= b || mpfr::isEqualFuzzy(a, b, eps);
+}
+
+template <>
+inline long double cast<mpfr::mpreal, long double>(const mpfr::mpreal& x) {
+  return x.toLDouble();
+}
+
+template <>
+inline double cast<mpfr::mpreal, double>(const mpfr::mpreal& x) {
+  return x.toDouble();
+}
+
+template <>
+inline long cast<mpfr::mpreal, long>(const mpfr::mpreal& x) {
+  return x.toLong();
+}
 
-  } // end namespace internal
+template <>
+inline int cast<mpfr::mpreal, int>(const mpfr::mpreal& x) {
+  return int(x.toLong());
 }
 
-#endif // EIGEN_MPREALSUPPORT_MODULE_H
+// Specialize GEBP kernel and traits for mpreal (no need for peeling, nor complicated stuff)
+// This also permits to directly call mpfr's routines and avoid many temporaries produced by mpreal
+template <>
+class gebp_traits<mpfr::mpreal, mpfr::mpreal, false, false> {
+ public:
+  typedef mpfr::mpreal ResScalar;
+  enum {
+    Vectorizable = false,
+    LhsPacketSize = 1,
+    RhsPacketSize = 1,
+    ResPacketSize = 1,
+    NumberOfRegisters = 1,
+    nr = 1,
+    mr = 1,
+    LhsProgress = 1,
+    RhsProgress = 1
+  };
+  typedef ResScalar LhsPacket;
+  typedef ResScalar RhsPacket;
+  typedef ResScalar ResPacket;
+  typedef LhsPacket LhsPacket4Packing;
+};
+
+template <typename Index, typename DataMapper, bool ConjugateLhs, bool ConjugateRhs>
+struct gebp_kernel<mpfr::mpreal, mpfr::mpreal, Index, DataMapper, 1, 1, ConjugateLhs, ConjugateRhs> {
+  typedef mpfr::mpreal mpreal;
+
+  EIGEN_DONT_INLINE void operator()(const DataMapper& res, const mpreal* blockA, const mpreal* blockB, Index rows,
+                                    Index depth, Index cols, const mpreal& alpha, Index strideA = -1,
+                                    Index strideB = -1, Index offsetA = 0, Index offsetB = 0) {
+    if (rows == 0 || cols == 0 || depth == 0) return;
+
+    mpreal acc1(0, mpfr_get_prec(blockA[0].mpfr_srcptr())), tmp(0, mpfr_get_prec(blockA[0].mpfr_srcptr()));
+
+    if (strideA == -1) strideA = depth;
+    if (strideB == -1) strideB = depth;
+
+    for (Index i = 0; i < rows; ++i) {
+      for (Index j = 0; j < cols; ++j) {
+        const mpreal* A = blockA + i * strideA + offsetA;
+        const mpreal* B = blockB + j * strideB + offsetB;
+
+        acc1 = 0;
+        for (Index k = 0; k < depth; k++) {
+          mpfr_mul(tmp.mpfr_ptr(), A[k].mpfr_srcptr(), B[k].mpfr_srcptr(), mpreal::get_default_rnd());
+          mpfr_add(acc1.mpfr_ptr(), acc1.mpfr_ptr(), tmp.mpfr_ptr(), mpreal::get_default_rnd());
+        }
+
+        mpfr_mul(acc1.mpfr_ptr(), acc1.mpfr_srcptr(), alpha.mpfr_srcptr(), mpreal::get_default_rnd());
+        mpfr_add(res(i, j).mpfr_ptr(), res(i, j).mpfr_srcptr(), acc1.mpfr_srcptr(), mpreal::get_default_rnd());
+      }
+    }
+  }
+};
+}  // end namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_MPREALSUPPORT_MODULE_H
diff --git a/inst/include/unsupported/Eigen/MatrixFunctions b/inst/include/unsupported/Eigen/MatrixFunctions
index 0991817d..845ee0bb 100644
--- a/inst/include/unsupported/Eigen/MatrixFunctions
+++ b/inst/include/unsupported/Eigen/MatrixFunctions
@@ -8,61 +8,64 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_MATRIX_FUNCTIONS
-#define EIGEN_MATRIX_FUNCTIONS
+#ifndef EIGEN_MATRIX_FUNCTIONS_MODULE_H
+#define EIGEN_MATRIX_FUNCTIONS_MODULE_H
 
 #include <cfloat>
 #include <list>
-#include <functional>
-#include <iterator>
 
-#include <Eigen/Core>
-#include <Eigen/LU>
-#include <Eigen/Eigenvalues>
+#include "../../Eigen/Core"
+#include "../../Eigen/LU"
+#include "../../Eigen/Eigenvalues"
 
 /**
-  * \defgroup MatrixFunctions_Module Matrix functions module
-  * \brief This module aims to provide various methods for the computation of
-  * matrix functions. 
-  *
-  * To use this module, add 
-  * \code
-  * #include <unsupported/Eigen/MatrixFunctions>
-  * \endcode
-  * at the start of your source file.
-  *
-  * This module defines the following MatrixBase methods.
-  *  - \ref matrixbase_cos "MatrixBase::cos()", for computing the matrix cosine
-  *  - \ref matrixbase_cosh "MatrixBase::cosh()", for computing the matrix hyperbolic cosine
-  *  - \ref matrixbase_exp "MatrixBase::exp()", for computing the matrix exponential
-  *  - \ref matrixbase_log "MatrixBase::log()", for computing the matrix logarithm
-  *  - \ref matrixbase_pow "MatrixBase::pow()", for computing the matrix power
-  *  - \ref matrixbase_matrixfunction "MatrixBase::matrixFunction()", for computing general matrix functions
-  *  - \ref matrixbase_sin "MatrixBase::sin()", for computing the matrix sine
-  *  - \ref matrixbase_sinh "MatrixBase::sinh()", for computing the matrix hyperbolic sine
-  *  - \ref matrixbase_sqrt "MatrixBase::sqrt()", for computing the matrix square root
-  *
-  * These methods are the main entry points to this module. 
-  *
-  * %Matrix functions are defined as follows.  Suppose that \f$ f \f$
-  * is an entire function (that is, a function on the complex plane
-  * that is everywhere complex differentiable).  Then its Taylor
-  * series
-  * \f[ f(0) + f'(0) x + \frac{f''(0)}{2} x^2 + \frac{f'''(0)}{3!} x^3 + \cdots \f]
-  * converges to \f$ f(x) \f$. In this case, we can define the matrix
-  * function by the same series:
-  * \f[ f(M) = f(0) + f'(0) M + \frac{f''(0)}{2} M^2 + \frac{f'''(0)}{3!} M^3 + \cdots \f]
-  *
-  */
-
+ * \defgroup MatrixFunctions_Module Matrix functions module
+ * \brief This module aims to provide various methods for the computation of
+ * matrix functions.
+ *
+ * To use this module, add
+ * \code
+ * #include <unsupported/Eigen/MatrixFunctions>
+ * \endcode
+ * at the start of your source file.
+ *
+ * This module defines the following MatrixBase methods.
+ *  - \ref matrixbase_cos "MatrixBase::cos()", for computing the matrix cosine
+ *  - \ref matrixbase_cosh "MatrixBase::cosh()", for computing the matrix hyperbolic cosine
+ *  - \ref matrixbase_exp "MatrixBase::exp()", for computing the matrix exponential
+ *  - \ref matrixbase_log "MatrixBase::log()", for computing the matrix logarithm
+ *  - \ref matrixbase_pow "MatrixBase::pow()", for computing the matrix power
+ *  - \ref matrixbase_matrixfunction "MatrixBase::matrixFunction()", for computing general matrix functions
+ *  - \ref matrixbase_sin "MatrixBase::sin()", for computing the matrix sine
+ *  - \ref matrixbase_sinh "MatrixBase::sinh()", for computing the matrix hyperbolic sine
+ *  - \ref matrixbase_sqrt "MatrixBase::sqrt()", for computing the matrix square root
+ *
+ * These methods are the main entry points to this module.
+ *
+ * %Matrix functions are defined as follows.  Suppose that \f$ f \f$
+ * is an entire function (that is, a function on the complex plane
+ * that is everywhere complex differentiable).  Then its Taylor
+ * series
+ * \f[ f(0) + f'(0) x + \frac{f''(0)}{2} x^2 + \frac{f'''(0)}{3!} x^3 + \cdots \f]
+ * converges to \f$ f(x) \f$. In this case, we can define the matrix
+ * function by the same series:
+ * \f[ f(M) = f(0) + f'(0) M + \frac{f''(0)}{2} M^2 + \frac{f'''(0)}{3!} M^3 + \cdots \f]
+ *
+ */
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+// IWYU pragma: begin_exports
 #include "src/MatrixFunctions/MatrixExponential.h"
 #include "src/MatrixFunctions/MatrixFunction.h"
 #include "src/MatrixFunctions/MatrixSquareRoot.h"
 #include "src/MatrixFunctions/MatrixLogarithm.h"
 #include "src/MatrixFunctions/MatrixPower.h"
+// IWYU pragma: end_exports
 
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
-/** 
+/**
 \page matrixbaseextra_page
 \ingroup MatrixFunctions_Module
 
@@ -84,7 +87,9 @@ const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cos() const
 \param[in]  M  a square matrix.
 \returns  expression representing \f$ \cos(M) \f$.
 
-This function calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::cos().
+This function computes the matrix cosine. Use ArrayBase::cos() for computing the entry-wise cosine.
+
+The implementation calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::cos().
 
 \sa \ref matrixbase_sin "sin()" for an example.
 
@@ -125,6 +130,9 @@ differential equations: the solution of \f$ y' = My \f$ with the
 initial condition \f$ y(0) = y_0 \f$ is given by
 \f$ y(t) = \exp(M) y_0 \f$.
 
+The matrix exponential is different from applying the exp function to all the entries in the matrix.
+Use ArrayBase::exp() if you want to do the latter.
+
 The cost of the computation is approximately \f$ 20 n^3 \f$ for
 matrices of size \f$ n \f$. The number 20 depends weakly on the
 norm of the matrix.
@@ -158,8 +166,8 @@ the z-axis.
 \include MatrixExponential.cpp
 Output: \verbinclude MatrixExponential.out
 
-\note \p M has to be a matrix of \c float, \c double, \c long double
-\c complex<float>, \c complex<double>, or \c complex<long double> .
+\note \p M has to be a matrix of \c float, \c double, `long double`
+\c complex<float>, \c complex<double>, or `complex<long double>` .
 
 
 \subsection matrixbase_log MatrixBase::log()
@@ -173,12 +181,15 @@ const MatrixLogarithmReturnValue<Derived> MatrixBase<Derived>::log() const
 \param[in]  M  invertible matrix whose logarithm is to be computed.
 \returns    expression representing the matrix logarithm root of \p M.
 
-The matrix logarithm of \f$ M \f$ is a matrix \f$ X \f$ such that 
+The matrix logarithm of \f$ M \f$ is a matrix \f$ X \f$ such that
 \f$ \exp(X) = M \f$ where exp denotes the matrix exponential. As for
 the scalar logarithm, the equation \f$ \exp(X) = M \f$ may have
 multiple solutions; this function returns a matrix whose eigenvalues
 have imaginary part in the interval \f$ (-\pi,\pi] \f$.
 
+The matrix logarithm is different from applying the log function to all the entries in the matrix.
+Use ArrayBase::log() if you want to do the latter.
+
 In the real case, the matrix \f$ M \f$ should be invertible and
 it should have no eigenvalues which are real and negative (pairs of
 complex conjugate eigenvalues are allowed). In the complex case, it
@@ -197,14 +208,14 @@ Nicholas J. Higham,
 SIAM 2008. ISBN 978-0-898716-46-7.
 
 Example: The following program checks that
-\f[ \log \left[ \begin{array}{ccc} 
+\f[ \log \left[ \begin{array}{ccc}
       \frac12\sqrt2 & -\frac12\sqrt2 & 0 \\
       \frac12\sqrt2 & \frac12\sqrt2 & 0 \\
       0 & 0 & 1
     \end{array} \right] = \left[ \begin{array}{ccc}
-      0 & \frac14\pi & 0 \\ 
+      0 & \frac14\pi & 0 \\
       -\frac14\pi & 0 & 0 \\
-      0 & 0 & 0 
+      0 & 0 & 0
     \end{array} \right]. \f]
 This corresponds to a rotation of \f$ \frac14\pi \f$ radians around
 the z-axis. This is the inverse of the example used in the
@@ -213,11 +224,10 @@ documentation of \ref matrixbase_exp "exp()".
 \include MatrixLogarithm.cpp
 Output: \verbinclude MatrixLogarithm.out
 
-\note \p M has to be a matrix of \c float, \c double, <tt>long
-double</tt>, \c complex<float>, \c complex<double>, or \c complex<long
-double> .
+\note \p M has to be a matrix of \c float, \c double, `long
+double`, \c complex<float>, \c complex<double>, or `complex<long double>`.
 
-\sa MatrixBase::exp(), MatrixBase::matrixFunction(), 
+\sa MatrixBase::exp(), MatrixBase::matrixFunction(),
     class MatrixLogarithmAtomic, MatrixBase::sqrt().
 
 
@@ -230,22 +240,66 @@ const MatrixPowerReturnValue<Derived> MatrixBase<Derived>::pow(RealScalar p) con
 \endcode
 
 \param[in]  M  base of the matrix power, should be a square matrix.
-\param[in]  p  exponent of the matrix power, should be real.
+\param[in]  p  exponent of the matrix power.
 
 The matrix power \f$ M^p \f$ is defined as \f$ \exp(p \log(M)) \f$,
 where exp denotes the matrix exponential, and log denotes the matrix
-logarithm.
+logarithm. This is different from raising all the entries in the matrix
+to the p-th power. Use ArrayBase::pow() if you want to do the latter.
 
-The matrix \f$ M \f$ should meet the conditions to be an argument of
-matrix logarithm. If \p p is not of the real scalar type of \p M, it
-is casted into the real scalar type of \p M.
+If \p p is complex, the scalar type of \p M should be the type of \p
+p . \f$ M^p \f$ simply evaluates into \f$ \exp(p \log(M)) \f$.
+Therefore, the matrix \f$ M \f$ should meet the conditions to be an
+argument of matrix logarithm.
 
-This function computes the matrix power using the Schur-Pad&eacute;
+If \p p is real, it is casted into the real scalar type of \p M. Then
+this function computes the matrix power using the Schur-Pad&eacute;
 algorithm as implemented by class MatrixPower. The exponent is split
 into integral part and fractional part, where the fractional part is
 in the interval \f$ (-1, 1) \f$. The main diagonal and the first
 super-diagonal is directly computed.
 
+If \p M is singular with a semisimple zero eigenvalue and \p p is
+positive, the Schur factor \f$ T \f$ is reordered with Givens
+rotations, i.e.
+
+\f[ T = \left[ \begin{array}{cc}
+      T_1 & T_2 \\
+      0   & 0
+    \end{array} \right] \f]
+
+where \f$ T_1 \f$ is invertible. Then \f$ T^p \f$ is given by
+
+\f[ T^p = \left[ \begin{array}{cc}
+      T_1^p & T_1^{-1} T_1^p T_2 \\
+      0     & 0
+    \end{array}. \right] \f]
+
+\warning Fractional power of a matrix with a non-semisimple zero
+eigenvalue is not well-defined. We introduce an assertion failure
+against inaccurate result, e.g. \code
+#include <unsupported/Eigen/MatrixFunctions>
+#include <iostream>
+
+int main()
+{
+  Eigen::Matrix4d A;
+  A << 0, 0, 2, 3,
+       0, 0, 4, 5,
+       0, 0, 6, 7,
+       0, 0, 8, 9;
+  std::cout << A.pow(0.37) << std::endl;
+
+  // The 1 makes eigenvalue 0 non-semisimple.
+  A.coeffRef(0, 1) = 1;
+
+  // This fails if EIGEN_NO_DEBUG is undefined.
+  std::cout << A.pow(0.37) << std::endl;
+
+  return 0;
+}
+\endcode
+
 Details of the algorithm can be found in: Nicholas J. Higham and
 Lijing Lin, "A Schur-Pad&eacute; algorithm for fractional powers of a
 matrix," <em>SIAM J. %Matrix Anal. Applic.</em>,
@@ -276,9 +330,9 @@ Example:
 \include MatrixPower_optimal.cpp
 Output: \verbinclude MatrixPower_optimal.out
 
-\note \p M has to be a matrix of \c float, \c double, <tt>long
-double</tt>, \c complex<float>, \c complex<double>, or \c complex<long
-double> .
+\note \p M has to be a matrix of \c float, \c double, `long
+double`, \c complex<float>, \c complex<double>, or
+\c complex<long double> .
 
 \sa MatrixBase::exp(), MatrixBase::log(), class MatrixPower.
 
@@ -288,18 +342,18 @@ double> .
 Compute a matrix function.
 
 \code
-const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::matrixFunction(typename internal::stem_function<typename internal::traits<Derived>::Scalar>::type f) const
-\endcode
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::matrixFunction(typename internal::stem_function<typename
+internal::traits<Derived>::Scalar>::type f) const \endcode
 
 \param[in]  M  argument of matrix function, should be a square matrix.
 \param[in]  f  an entire function; \c f(x,n) should compute the n-th
 derivative of f at x.
 \returns  expression representing \p f applied to \p M.
 
-Suppose that \p M is a matrix whose entries have type \c Scalar. 
+Suppose that \p M is a matrix whose entries have type \c Scalar.
 Then, the second argument, \p f, should be a function with prototype
-\code 
-ComplexScalar f(ComplexScalar, int) 
+\code
+ComplexScalar f(ComplexScalar, int)
 \endcode
 where \c ComplexScalar = \c std::complex<Scalar> if \c Scalar is
 real (e.g., \c float or \c double) and \c ComplexScalar =
@@ -307,17 +361,17 @@ real (e.g., \c float or \c double) and \c ComplexScalar =
 should be \f$ f^{(n)}(x) \f$, the n-th derivative of f at x.
 
 This routine uses the algorithm described in:
-Philip Davies and Nicholas J. Higham, 
-"A Schur-Parlett algorithm for computing matrix functions", 
+Philip Davies and Nicholas J. Higham,
+"A Schur-Parlett algorithm for computing matrix functions",
 <em>SIAM J. %Matrix Anal. Applic.</em>, <b>25</b>:464&ndash;485, 2003.
 
 The actual work is done by the MatrixFunction class.
 
 Example: The following program checks that
-\f[ \exp \left[ \begin{array}{ccc} 
-      0 & \frac14\pi & 0 \\ 
+\f[ \exp \left[ \begin{array}{ccc}
+      0 & \frac14\pi & 0 \\
       -\frac14\pi & 0 & 0 \\
-      0 & 0 & 0 
+      0 & 0 & 0
     \end{array} \right] = \left[ \begin{array}{ccc}
       \frac12\sqrt2 & -\frac12\sqrt2 & 0 \\
       \frac12\sqrt2 & \frac12\sqrt2 & 0 \\
@@ -330,7 +384,7 @@ of \ref matrixbase_exp "exp()".
 \include MatrixFunction.cpp
 Output: \verbinclude MatrixFunction.out
 
-Note that the function \c expfn is defined for complex numbers 
+Note that the function \c expfn is defined for complex numbers
 \c x, even though the matrix \c A is over the reals. Instead of
 \c expfn, we could also have used StdStemFunctions::exp:
 \code
@@ -350,7 +404,9 @@ const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::sin() const
 \param[in]  M  a square matrix.
 \returns  expression representing \f$ \sin(M) \f$.
 
-This function calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::sin().
+This function computes the matrix sine. Use ArrayBase::sin() for computing the entry-wise sine.
+
+The implementation calls \ref matrixbase_matrixfunction "matrixFunction()" with StdStemFunctions::sin().
 
 Example: \include MatrixSine.cpp
 Output: \verbinclude MatrixSine.out
@@ -387,13 +443,15 @@ const MatrixSquareRootReturnValue<Derived> MatrixBase<Derived>::sqrt() const
 
 The matrix square root of \f$ M \f$ is the matrix \f$ M^{1/2} \f$
 whose square is the original matrix; so if \f$ S = M^{1/2} \f$ then
-\f$ S^2 = M \f$. 
+\f$ S^2 = M \f$. This is different from taking the square root of all
+the entries in the matrix; use ArrayBase::sqrt() if you want to do the
+latter.
 
 In the <b>real case</b>, the matrix \f$ M \f$ should be invertible and
 it should have no eigenvalues which are real and negative (pairs of
 complex conjugate eigenvalues are allowed). In that case, the matrix
 has a square root which is also real, and this is the square root
-computed by this function. 
+computed by this function.
 
 The matrix square root is computed by first reducing the matrix to
 quasi-triangular form with the real Schur decomposition. The square
@@ -425,12 +483,12 @@ square root of a matrix", <em>Linear Algebra Appl.</em>,
 52/53:127&ndash;140, 1983.
 
 Example: The following program checks that the square root of
-\f[ \left[ \begin{array}{cc} 
+\f[ \left[ \begin{array}{cc}
               \cos(\frac13\pi) & -\sin(\frac13\pi) \\
               \sin(\frac13\pi) & \cos(\frac13\pi)
     \end{array} \right], \f]
 corresponding to a rotation over 60 degrees, is a rotation over 30 degrees:
-\f[ \left[ \begin{array}{cc} 
+\f[ \left[ \begin{array}{cc}
               \cos(\frac16\pi) & -\sin(\frac16\pi) \\
               \sin(\frac16\pi) & \cos(\frac16\pi)
     \end{array} \right]. \f]
@@ -443,5 +501,4 @@ Output: \verbinclude MatrixSquareRoot.out
 
 */
 
-#endif // EIGEN_MATRIX_FUNCTIONS
-
+#endif  // EIGEN_MATRIX_FUNCTIONS_MODULE_H
diff --git a/inst/include/unsupported/Eigen/NNLS b/inst/include/unsupported/Eigen/NNLS
new file mode 100644
index 00000000..2923f596
--- /dev/null
+++ b/inst/include/unsupported/Eigen/NNLS
@@ -0,0 +1,388 @@
+/* Non-Negagive Least Squares Algorithm for Eigen.
+ *
+ * Copyright (C) 2021 Essex Edwards, <essex.edwards@gmail.com>
+ * Copyright (C) 2013 Hannes Matuschek, hannes.matuschek at uni-potsdam.de
+ *
+ * This Source Code Form is subject to the terms of the Mozilla
+ * Public License v. 2.0. If a copy of the MPL was not distributed
+ * with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+/** \defgroup nnls Non-Negative Least Squares (NNLS) Module
+ * This module provides a single class @c Eigen::NNLS implementing the NNLS algorithm.
+ * The algorithm is described in "SOLVING LEAST SQUARES PROBLEMS", by Charles L. Lawson and
+ * Richard J. Hanson, Prentice-Hall, 1974 and solves optimization problems of the form
+ *
+ * \f[ \min \left\Vert Ax-b\right\Vert_2^2\quad s.t.\, x\ge 0\,.\f]
+ *
+ * The algorithm solves the constrained least-squares problem above by iteratively improving
+ * an estimate of which constraints are active (elements of \f$x\f$ equal to zero)
+ * and which constraints are inactive (elements of \f$x\f$ greater than zero).
+ * Each iteration, an unconstrained linear least-squares problem solves for the
+ * components of \f$x\f$ in the (estimated) inactive set and the sets are updated.
+ * The unconstrained problem minimizes \f$\left\Vert A^Nx^N-b\right\Vert_2^2\f$,
+ * where \f$A^N\f$ is a matrix formed by selecting all columns of A which are
+ * in the inactive set \f$N\f$.
+ *
+ */
+
+#ifndef EIGEN_NNLS_H
+#define EIGEN_NNLS_H
+
+#include "../../Eigen/Core"
+#include "../../Eigen/QR"
+
+#include <limits>
+
+namespace Eigen {
+
+/** \ingroup nnls
+ * \class NNLS
+ * \brief Implementation of the Non-Negative Least Squares (NNLS) algorithm.
+ * \tparam MatrixType The type of the system matrix \f$A\f$.
+ *
+ * This class implements the NNLS algorithm as described in "SOLVING LEAST SQUARES PROBLEMS",
+ * Charles L. Lawson and Richard J. Hanson, Prentice-Hall, 1974. This algorithm solves a least
+ * squares problem iteratively and ensures that the solution is non-negative. I.e.
+ *
+ * \f[ \min \left\Vert Ax-b\right\Vert_2^2\quad s.t.\, x\ge 0 \f]
+ *
+ * The algorithm solves the constrained least-squares problem above by iteratively improving
+ * an estimate of which constraints are active (elements of \f$x\f$ equal to zero)
+ * and which constraints are inactive (elements of \f$x\f$ greater than zero).
+ * Each iteration, an unconstrained linear least-squares problem solves for the
+ * components of \f$x\f$ in the (estimated) inactive set and the sets are updated.
+ * The unconstrained problem minimizes \f$\left\Vert A^Nx^N-b\right\Vert_2^2\f$,
+ * where \f$A^N\f$ is a matrix formed by selecting all columns of A which are
+ * in the inactive set \f$N\f$.
+ *
+ * See <a href="https://en.wikipedia.org/wiki/Non-negative_least_squares">the
+ * wikipedia page on non-negative least squares</a> for more background information.
+ *
+ * \note Please note that it is possible to construct an NNLS problem for which the
+ *       algorithm does not converge. In practice these cases are extremely rare.
+ */
+template <class MatrixType_>
+class NNLS {
+ public:
+  typedef MatrixType_ MatrixType;
+
+  enum {
+    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
+    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
+    Options = MatrixType::Options,
+    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
+    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
+  };
+
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef typename MatrixType::Index Index;
+
+  /** Type of a row vector of the system matrix \f$A\f$. */
+  typedef Matrix<Scalar, ColsAtCompileTime, 1> SolutionVectorType;
+  /** Type of a column vector of the system matrix \f$A\f$. */
+  typedef Matrix<Scalar, RowsAtCompileTime, 1> RhsVectorType;
+  typedef Matrix<Index, ColsAtCompileTime, 1> IndicesType;
+
+  /** */
+  NNLS();
+
+  /** \brief Constructs a NNLS sovler and initializes it with the given system matrix @c A.
+   * \param A Specifies the system matrix.
+   * \param max_iter Specifies the maximum number of iterations to solve the system.
+   * \param tol Specifies the precision of the optimum.
+   *        This is an absolute tolerance on the gradient of the Lagrangian, \f$A^T(Ax-b)-\lambda\f$
+   *        (with Lagrange multipliers \f$\lambda\f$).
+   */
+  NNLS(const MatrixType &A, Index max_iter = -1, Scalar tol = NumTraits<Scalar>::dummy_precision());
+
+  /** Initializes the solver with the matrix \a A for further solving NNLS problems.
+   *
+   * This function mostly initializes/computes the preconditioner. In the future
+   * we might, for instance, implement column reordering for faster matrix vector products.
+   */
+  template <typename MatrixDerived>
+  NNLS<MatrixType> &compute(const EigenBase<MatrixDerived> &A);
+
+  /** \brief Solves the NNLS problem.
+   *
+   * The dimension of @c b must be equal to the number of rows of @c A, given to the constructor.
+   *
+   * \returns The approximate solution vector \f$ x \f$. Use info() to determine if the solve was a success or not.
+   * \sa info()
+   */
+  const SolutionVectorType &solve(const RhsVectorType &b);
+
+  /** \brief Returns the solution if a problem was solved.
+   * If not, an uninitialized vector may be returned. */
+  const SolutionVectorType &x() const { return x_; }
+
+  /** \returns the tolerance threshold used by the stopping criteria.
+   * \sa setTolerance()
+   */
+  Scalar tolerance() const { return tolerance_; }
+
+  /** Sets the tolerance threshold used by the stopping criteria.
+   *
+   * This is an absolute tolerance on the gradient of the Lagrangian, \f$A^T(Ax-b)-\lambda\f$
+   * (with Lagrange multipliers \f$\lambda\f$).
+   */
+  NNLS<MatrixType> &setTolerance(const Scalar &tolerance) {
+    tolerance_ = tolerance;
+    return *this;
+  }
+
+  /** \returns the max number of iterations.
+   * It is either the value set by setMaxIterations or, by default, twice the number of columns of the matrix.
+   */
+  Index maxIterations() const { return max_iter_ < 0 ? 2 * A_.cols() : max_iter_; }
+
+  /** Sets the max number of iterations.
+   * Default is twice the number of columns of the matrix.
+   * The algorithm requires at least k iterations to produce a solution vector with k non-zero entries.
+   */
+  NNLS<MatrixType> &setMaxIterations(Index maxIters) {
+    max_iter_ = maxIters;
+    return *this;
+  }
+
+  /** \returns the number of iterations (least-squares solves) performed during the last solve */
+  Index iterations() const { return iterations_; }
+
+  /** \returns Success if the iterations converged, and an error values otherwise. */
+  ComputationInfo info() const { return info_; }
+
+ private:
+  /** \internal Adds the given index @c idx to the inactive set N and updates the QR decomposition of \f$A^N\f$. */
+  void moveToInactiveSet_(Index idx);
+
+  /** \internal Removes the given index idx from the inactive set N and updates the QR decomposition of \f$A^N\f$. */
+  void moveToActiveSet_(Index idx);
+
+  /** \internal Solves the least-squares problem \f$\left\Vert y-A^Nx\right\Vert_2^2\f$. */
+  void solveInactiveSet_(const RhsVectorType &b);
+
+ private:
+  typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime> MatrixAtAType;
+
+  /** \internal Holds the maximum number of iterations for the NNLS algorithm.
+   *  @c -1 means to use the default value. */
+  Index max_iter_;
+  /** \internal Holds the number of iterations. */
+  Index iterations_;
+  /** \internal Holds success/fail of the last solve. */
+  ComputationInfo info_;
+  /** \internal Size of the inactive set. */
+  Index numInactive_;
+  /** \internal Accuracy of the algorithm w.r.t the optimality of the solution (gradient). */
+  Scalar tolerance_;
+  /** \internal The system matrix, a copy of the one given to the constructor. */
+  MatrixType A_;
+  /** \internal Precomputed product \f$A^TA\f$. */
+  MatrixAtAType AtA_;
+  /** \internal Will hold the solution. */
+  SolutionVectorType x_;
+  /** \internal Will hold the current gradient.\f$A^Tb - A^TAx\f$ */
+  SolutionVectorType gradient_;
+  /** \internal Will hold the partial solution. */
+  SolutionVectorType y_;
+  /** \internal Precomputed product \f$A^Tb\f$. */
+  SolutionVectorType Atb_;
+  /** \internal Holds the current permutation partitioning the active and inactive sets.
+   * The first @c numInactive_ elements form the inactive set and the rest the active set. */
+  IndicesType index_sets_;
+  /** \internal QR decomposition to solve the (inactive) sub system (together with @c qrCoeffs_). */
+  MatrixType QR_;
+  /** \internal QR decomposition to solve the (inactive) sub system (together with @c QR_). */
+  SolutionVectorType qrCoeffs_;
+  /** \internal Some workspace for QR decomposition. */
+  SolutionVectorType tempSolutionVector_;
+  RhsVectorType tempRhsVector_;
+};
+
+/* ********************************************************************************************
+ * Implementation
+ * ******************************************************************************************** */
+
+template <typename MatrixType>
+NNLS<MatrixType>::NNLS()
+    : max_iter_(-1),
+      iterations_(0),
+      info_(ComputationInfo::InvalidInput),
+      numInactive_(0),
+      tolerance_(NumTraits<Scalar>::dummy_precision()) {}
+
+template <typename MatrixType>
+NNLS<MatrixType>::NNLS(const MatrixType &A, Index max_iter, Scalar tol) : max_iter_(max_iter), tolerance_(tol) {
+  compute(A);
+}
+
+template <typename MatrixType>
+template <typename MatrixDerived>
+NNLS<MatrixType> &NNLS<MatrixType>::compute(const EigenBase<MatrixDerived> &A) {
+  // Ensure Scalar type is real. The non-negativity constraint doesn't obviously extend to complex numbers.
+  EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL);
+
+  // max_iter_: unchanged
+  iterations_ = 0;
+  info_ = ComputationInfo::Success;
+  numInactive_ = 0;
+  // tolerance: unchanged
+  A_ = A.derived();
+  AtA_.noalias() = A_.transpose() * A_;
+  x_.resize(A_.cols());
+  gradient_.resize(A_.cols());
+  y_.resize(A_.cols());
+  Atb_.resize(A_.cols());
+  index_sets_.resize(A_.cols());
+  QR_.resize(A_.rows(), A_.cols());
+  qrCoeffs_.resize(A_.cols());
+  tempSolutionVector_.resize(A_.cols());
+  tempRhsVector_.resize(A_.rows());
+
+  return *this;
+}
+
+template <typename MatrixType>
+const typename NNLS<MatrixType>::SolutionVectorType &NNLS<MatrixType>::solve(const RhsVectorType &b) {
+  // Initialize solver
+  iterations_ = 0;
+  info_ = ComputationInfo::NumericalIssue;
+  x_.setZero();
+
+  index_sets_ = IndicesType::LinSpaced(A_.cols(), 0, A_.cols() - 1);  // Identity permutation.
+  numInactive_ = 0;
+
+  // Precompute A^T*b
+  Atb_.noalias() = A_.transpose() * b;
+
+  const Index maxIterations = this->maxIterations();
+
+  // OUTER LOOP
+  while (true) {
+    // Early exit if all variables are inactive, which breaks 'maxCoeff' below.
+    if (A_.cols() == numInactive_) {
+      info_ = ComputationInfo::Success;
+      return x_;
+    }
+
+    // Find the maximum element of the gradient in the active set.
+    // If it is small or negative, then we have converged.
+    // Else, we move that variable to the inactive set.
+    gradient_.noalias() = Atb_ - AtA_ * x_;
+
+    const Index numActive = A_.cols() - numInactive_;
+    Index argmaxGradient = -1;
+    const Scalar maxGradient = gradient_(index_sets_.tail(numActive)).maxCoeff(&argmaxGradient);
+    argmaxGradient += numInactive_;  // because tail() skipped the first numInactive_ elements
+
+    if (maxGradient < tolerance_) {
+      info_ = ComputationInfo::Success;
+      return x_;
+    }
+
+    moveToInactiveSet_(argmaxGradient);
+
+    // INNER LOOP
+    while (true) {
+      // Check if max. number of iterations is reached
+      if (iterations_ >= maxIterations) {
+        info_ = ComputationInfo::NoConvergence;
+        return x_;
+      }
+
+      // Solve least-squares problem in inactive set only,
+      // this step is rather trivial as moveToInactiveSet_ & moveToActiveSet_
+      // updates the QR decomposition of inactive columns A^N.
+      // solveInactiveSet_ puts the solution in y_
+      solveInactiveSet_(b);
+      ++iterations_;  // The solve is expensive, so that is what we count as an iteration.
+
+      // Check feasibility...
+      bool feasible = true;
+      Scalar alpha = NumTraits<Scalar>::highest();
+      Index infeasibleIdx = -1;  // Which variable became infeasible first.
+      for (Index i = 0; i < numInactive_; i++) {
+        Index idx = index_sets_[i];
+        if (y_(idx) < 0) {
+          // t should always be in [0,1].
+          Scalar t = -x_(idx) / (y_(idx) - x_(idx));
+          if (alpha > t) {
+            alpha = t;
+            infeasibleIdx = i;
+            feasible = false;
+          }
+        }
+      }
+      eigen_assert(feasible || 0 <= infeasibleIdx);
+
+      // If solution is feasible, exit to outer loop
+      if (feasible) {
+        x_ = y_;
+        break;
+      }
+
+      // Infeasible solution -> interpolate to feasible one
+      for (Index i = 0; i < numInactive_; i++) {
+        Index idx = index_sets_[i];
+        x_(idx) += alpha * (y_(idx) - x_(idx));
+      }
+
+      // Remove these indices from the inactive set and update QR decomposition
+      moveToActiveSet_(infeasibleIdx);
+    }
+  }
+}
+
+template <typename MatrixType>
+void NNLS<MatrixType>::moveToInactiveSet_(Index idx) {
+  // Update permutation matrix:
+  std::swap(index_sets_(idx), index_sets_(numInactive_));
+  numInactive_++;
+
+  // Perform rank-1 update of the QR decomposition stored in QR_ & qrCoeff_
+  internal::householder_qr_inplace_update(QR_, qrCoeffs_, A_.col(index_sets_(numInactive_ - 1)), numInactive_ - 1,
+                                          tempSolutionVector_.data());
+}
+
+template <typename MatrixType>
+void NNLS<MatrixType>::moveToActiveSet_(Index idx) {
+  // swap index with last inactive one & reduce number of inactive columns
+  std::swap(index_sets_(idx), index_sets_(numInactive_ - 1));
+  numInactive_--;
+  // Update QR decomposition starting from the removed index up to the end [idx, ..., numInactive_]
+  for (Index i = idx; i < numInactive_; i++) {
+    Index col = index_sets_(i);
+    internal::householder_qr_inplace_update(QR_, qrCoeffs_, A_.col(col), i, tempSolutionVector_.data());
+  }
+}
+
+template <typename MatrixType>
+void NNLS<MatrixType>::solveInactiveSet_(const RhsVectorType &b) {
+  eigen_assert(numInactive_ > 0);
+
+  tempRhsVector_ = b;
+
+  // tmpRHS(0:numInactive_-1) := Q'*b
+  // tmpRHS(numInactive_:end) := useless stuff we would rather not compute at all.
+  tempRhsVector_.applyOnTheLeft(
+      householderSequence(QR_.leftCols(numInactive_), qrCoeffs_.head(numInactive_)).transpose());
+
+  // tempSol(0:numInactive_-1) := inv(R) * Q' * b
+  //  = the least-squares solution for the inactive variables.
+  tempSolutionVector_.head(numInactive_) =            //
+      QR_.topLeftCorner(numInactive_, numInactive_)   //
+          .template triangularView<Upper>()           //
+          .solve(tempRhsVector_.head(numInactive_));  //
+
+  // tempSol(numInactive_:end) := 0 = the value for the constrained variables.
+  tempSolutionVector_.tail(y_.size() - numInactive_).setZero();
+
+  // Back permute into original column order of A
+  y_.noalias() = index_sets_.asPermutation() * tempSolutionVector_.head(y_.size());
+}
+
+}  // namespace Eigen
+
+#endif  // EIGEN_NNLS_H
diff --git a/inst/include/unsupported/Eigen/NonLinearOptimization b/inst/include/unsupported/Eigen/NonLinearOptimization
index 600ab4c1..486dd4a5 100644
--- a/inst/include/unsupported/Eigen/NonLinearOptimization
+++ b/inst/include/unsupported/Eigen/NonLinearOptimization
@@ -7,111 +7,118 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_NONLINEAROPTIMIZATION_MODULE
-#define EIGEN_NONLINEAROPTIMIZATION_MODULE
+#ifndef EIGEN_NONLINEAROPTIMIZATION_MODULE_H
+#define EIGEN_NONLINEAROPTIMIZATION_MODULE_H
 
 #include <vector>
 
-#include <Eigen/Core>
-#include <Eigen/Jacobi>
-#include <Eigen/QR>
-#include <unsupported/Eigen/NumericalDiff>
+#include "../../Eigen/Core"
+#include "../../Eigen/Jacobi"
+#include "../../Eigen/QR"
+#include "NumericalDiff"
 
 /**
-  * \defgroup NonLinearOptimization_Module Non linear optimization module
-  *
-  * \code
-  * #include <unsupported/Eigen/NonLinearOptimization>
-  * \endcode
-  *
-  * This module provides implementation of two important algorithms in non linear
-  * optimization. In both cases, we consider a system of non linear functions. Of
-  * course, this should work, and even work very well if those functions are
-  * actually linear. But if this is so, you should probably better use other
-  * methods more fitted to this special case.
-  *
-  * One algorithm allows to find an extremum of such a system (Levenberg
-  * Marquardt algorithm) and the second one is used to find 
-  * a zero for the system (Powell hybrid "dogleg" method).
-  *
-  * This code is a port of minpack (http://en.wikipedia.org/wiki/MINPACK).
-  * Minpack is a very famous, old, robust and well-reknown package, written in 
-  * fortran. Those implementations have been carefully tuned, tested, and used
-  * for several decades.
-  *
-  * The original fortran code was automatically translated using f2c (http://en.wikipedia.org/wiki/F2c) in C,
-  * then c++, and then cleaned by several different authors.
-  * The last one of those cleanings being our starting point : 
-  * http://devernay.free.fr/hacks/cminpack.html
-  * 
-  * Finally, we ported this code to Eigen, creating classes and API
-  * coherent with Eigen. When possible, we switched to Eigen
-  * implementation, such as most linear algebra (vectors, matrices, stable norms).
-  *
-  * Doing so, we were very careful to check the tests we setup at the very
-  * beginning, which ensure that the same results are found.
-  *
-  * \section Tests Tests
-  * 
-  * The tests are placed in the file unsupported/test/NonLinear.cpp.
-  * 
-  * There are two kinds of tests : those that come from examples bundled with cminpack.
-  * They guaranty we get the same results as the original algorithms (value for 'x',
-  * for the number of evaluations of the function, and for the number of evaluations
-  * of the jacobian if ever).
-  * 
-  * Other tests were added by myself at the very beginning of the 
-  * process and check the results for levenberg-marquardt using the reference data 
-  * on http://www.itl.nist.gov/div898/strd/nls/nls_main.shtml. Since then i've 
-  * carefully checked that the same results were obtained when modifiying the 
-  * code. Please note that we do not always get the exact same decimals as they do,
-  * but this is ok : they use 128bits float, and we do the tests using the C type 'double',
-  * which is 64 bits on most platforms (x86 and amd64, at least).
-  * I've performed those tests on several other implementations of levenberg-marquardt, and
-  * (c)minpack performs VERY well compared to those, both in accuracy and speed.
-  * 
-  * The documentation for running the tests is on the wiki
-  * http://eigen.tuxfamily.org/index.php?title=Tests
-  * 
-  * \section API API : overview of methods
-  * 
-  * Both algorithms can use either the jacobian (provided by the user) or compute 
-  * an approximation by themselves (actually using Eigen \ref NumericalDiff_Module).
-  * The part of API referring to the latter use 'NumericalDiff' in the method names
-  * (exemple: LevenbergMarquardt.minimizeNumericalDiff() ) 
-  * 
-  * The methods LevenbergMarquardt.lmder1()/lmdif1()/lmstr1() and 
-  * HybridNonLinearSolver.hybrj1()/hybrd1() are specific methods from the original 
-  * minpack package that you probably should NOT use until you are porting a code that
-  *  was previously using minpack. They just define a 'simple' API with default values 
-  * for some parameters.
-  * 
-  * All algorithms are provided using Two APIs :
-  *     - one where the user inits the algorithm, and uses '*OneStep()' as much as he wants : 
-  * this way the caller have control over the steps
-  *     - one where the user just calls a method (optimize() or solve()) which will 
-  * handle the loop: init + loop until a stop condition is met. Those are provided for
-  *  convenience.
-  * 
-  * As an example, the method LevenbergMarquardt::minimize() is 
-  * implemented as follow : 
-  * \code
-  * Status LevenbergMarquardt<FunctorType,Scalar>::minimize(FVectorType  &x, const int mode)
-  * {
-  *     Status status = minimizeInit(x, mode);
-  *     do {
-  *         status = minimizeOneStep(x, mode);
-  *     } while (status==Running);
-  *     return status;
-  * }
-  * \endcode
-  * 
-  * \section examples Examples
-  * 
-  * The easiest way to understand how to use this module is by looking at the many examples in the file
-  * unsupported/test/NonLinearOptimization.cpp.
-  */
+ * \defgroup NonLinearOptimization_Module Non linear optimization module
+ *
+ * \code
+ * #include <unsupported/Eigen/NonLinearOptimization>
+ * \endcode
+ *
+ * This module provides implementation of two important algorithms in non linear
+ * optimization. In both cases, we consider a system of non linear functions. Of
+ * course, this should work, and even work very well if those functions are
+ * actually linear. But if this is so, you should probably better use other
+ * methods more fitted to this special case.
+ *
+ * One algorithm allows to find a least-squares solution of such a system
+ * (Levenberg-Marquardt algorithm) and the second one is used to find
+ * a zero for the system (Powell hybrid "dogleg" method).
+ *
+ * This code is a port of minpack (http://en.wikipedia.org/wiki/MINPACK).
+ * Minpack is a very famous, old, robust and well renowned package, written in
+ * fortran. Those implementations have been carefully tuned, tested, and used
+ * for several decades.
+ *
+ * The original fortran code was automatically translated using f2c (http://en.wikipedia.org/wiki/F2c) in C,
+ * then c++, and then cleaned by several different authors.
+ * The last one of those cleanings being our starting point :
+ * http://devernay.free.fr/hacks/cminpack.html
+ *
+ * Finally, we ported this code to Eigen, creating classes and API
+ * coherent with Eigen. When possible, we switched to Eigen
+ * implementation, such as most linear algebra (vectors, matrices, stable norms).
+ *
+ * Doing so, we were very careful to check the tests we setup at the very
+ * beginning, which ensure that the same results are found.
+ *
+ * \section Tests Tests
+ *
+ * The tests are placed in the file unsupported/test/NonLinear.cpp.
+ *
+ * There are two kinds of tests : those that come from examples bundled with cminpack.
+ * They guaranty we get the same results as the original algorithms (value for 'x',
+ * for the number of evaluations of the function, and for the number of evaluations
+ * of the Jacobian if ever).
+ *
+ * Other tests were added by myself at the very beginning of the
+ * process and check the results for Levenberg-Marquardt using the reference data
+ * on http://www.itl.nist.gov/div898/strd/nls/nls_main.shtml. Since then i've
+ * carefully checked that the same results were obtained when modifying the
+ * code. Please note that we do not always get the exact same decimals as they do,
+ * but this is ok : they use 128bits float, and we do the tests using the C type 'double',
+ * which is 64 bits on most platforms (x86 and amd64, at least).
+ * I've performed those tests on several other implementations of Levenberg-Marquardt, and
+ * (c)minpack performs VERY well compared to those, both in accuracy and speed.
+ *
+ * The documentation for running the tests is on the wiki
+ * http://eigen.tuxfamily.org/index.php?title=Tests
+ *
+ * \section API API: overview of methods
+ *
+ * Both algorithms needs a functor computing the Jacobian. It can be computed by
+ * hand, using auto-differentiation (see \ref AutoDiff_Module), or using numerical
+ * differences (see \ref NumericalDiff_Module). For instance:
+ *\code
+ * MyFunc func;
+ * NumericalDiff<MyFunc> func_with_num_diff(func);
+ * LevenbergMarquardt<NumericalDiff<MyFunc> > lm(func_with_num_diff);
+ * \endcode
+ * For HybridNonLinearSolver, the method solveNumericalDiff() does the above wrapping for
+ * you.
+ *
+ * The methods LevenbergMarquardt.lmder1()/lmdif1()/lmstr1() and
+ * HybridNonLinearSolver.hybrj1()/hybrd1() are specific methods from the original
+ * minpack package that you probably should NOT use until you are porting a code that
+ * was previously using minpack. They just define a 'simple' API with default values
+ * for some parameters.
+ *
+ * All algorithms are provided using two APIs :
+ *     - one where the user inits the algorithm, and uses '*OneStep()' as much as he wants :
+ * this way the caller have control over the steps
+ *     - one where the user just calls a method (optimize() or solve()) which will
+ * handle the loop: init + loop until a stop condition is met. Those are provided for
+ *  convenience.
+ *
+ * As an example, the method LevenbergMarquardt::minimize() is
+ * implemented as follow:
+ * \code
+ * Status LevenbergMarquardt<FunctorType,Scalar>::minimize(FVectorType  &x, const int mode)
+ * {
+ *     Status status = minimizeInit(x, mode);
+ *     do {
+ *         status = minimizeOneStep(x, mode);
+ *     } while (status==Running);
+ *     return status;
+ * }
+ * \endcode
+ *
+ * \section examples Examples
+ *
+ * The easiest way to understand how to use this module is by looking at the many examples in the file
+ * unsupported/test/NonLinearOptimization.cpp.
+ */
 
+// IWYU pragma: begin_exports
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 
 #include "src/NonLinearOptimization/qrsolv.h"
@@ -129,6 +136,6 @@
 
 #include "src/NonLinearOptimization/HybridNonLinearSolver.h"
 #include "src/NonLinearOptimization/LevenbergMarquardt.h"
+// IWYU pragma: end_exports
 
-
-#endif // EIGEN_NONLINEAROPTIMIZATION_MODULE
+#endif  // EIGEN_NONLINEAROPTIMIZATION_MODULE_H
diff --git a/inst/include/unsupported/Eigen/NumericalDiff b/inst/include/unsupported/Eigen/NumericalDiff
index 433334ca..ed236c86 100644
--- a/inst/include/unsupported/Eigen/NumericalDiff
+++ b/inst/include/unsupported/Eigen/NumericalDiff
@@ -7,50 +7,51 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_NUMERICALDIFF_MODULE
-#define EIGEN_NUMERICALDIFF_MODULE
+#ifndef EIGEN_NUMERICALDIFF_MODULE_H
+#define EIGEN_NUMERICALDIFF_MODULE_H
 
-#include <Eigen/Core>
+#include "../../Eigen/Core"
 
 namespace Eigen {
 
 /**
-  * \defgroup NumericalDiff_Module Numerical differentiation module
-  *
-  * \code
-  * #include <unsupported/Eigen/NumericalDiff>
-  * \endcode
-  *
-  * See http://en.wikipedia.org/wiki/Numerical_differentiation
-  *
-  * Warning : this should NOT be confused with automatic differentiation, which
-  * is a different method and has its own module in Eigen : \ref
-  * AutoDiff_Module.
-  *
-  * Currently only "Forward" and "Central" schemes are implemented. Those
-  * are basic methods, and there exist some more elaborated way of
-  * computing such approximates. They are implemented using both
-  * proprietary and free software, and usually requires linking to an
-  * external library. It is very easy for you to write a functor
-  * using such software, and the purpose is quite orthogonal to what we
-  * want to achieve with Eigen.
-  *
-  * This is why we will not provide wrappers for every great numerical
-  * differentiation software that exist, but should rather stick with those
-  * basic ones, that still are useful for testing.
-  *
-  * Also, the \ref NonLinearOptimization_Module needs this in order to
-  * provide full features compatibility with the original (c)minpack
-  * package.
-  *
-  */
+ * \defgroup NumericalDiff_Module Numerical differentiation module
+ *
+ * \code
+ * #include <unsupported/Eigen/NumericalDiff>
+ * \endcode
+ *
+ * See http://en.wikipedia.org/wiki/Numerical_differentiation
+ *
+ * Warning : this should NOT be confused with automatic differentiation, which
+ * is a different method and has its own module in Eigen : \ref
+ * AutoDiff_Module.
+ *
+ * Currently only "Forward" and "Central" schemes are implemented. Those
+ * are basic methods, and there exist some more elaborated way of
+ * computing such approximates. They are implemented using both
+ * proprietary and free software, and usually requires linking to an
+ * external library. It is very easy for you to write a functor
+ * using such software, and the purpose is quite orthogonal to what we
+ * want to achieve with Eigen.
+ *
+ * This is why we will not provide wrappers for every great numerical
+ * differentiation software that exist, but should rather stick with those
+ * basic ones, that still are useful for testing.
+ *
+ * Also, the \ref NonLinearOptimization_Module needs this in order to
+ * provide full features compatibility with the original (c)minpack
+ * package.
+ *
+ */
 }
 
 //@{
 
+// IWYU pragma: begin_exports
 #include "src/NumericalDiff/NumericalDiff.h"
+// IWYU pragma: end_exports
 
 //@}
 
-
-#endif // EIGEN_NUMERICALDIFF_MODULE
+#endif  // EIGEN_NUMERICALDIFF_MODULE_H
diff --git a/inst/include/unsupported/Eigen/OpenGLSupport b/inst/include/unsupported/Eigen/OpenGLSupport
index e2769449..f22ae565 100644
--- a/inst/include/unsupported/Eigen/OpenGLSupport
+++ b/inst/include/unsupported/Eigen/OpenGLSupport
@@ -7,316 +7,344 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_OPENGL_MODULE
-#define EIGEN_OPENGL_MODULE
+#ifndef EIGEN_OPENGL_MODULE_H
+#define EIGEN_OPENGL_MODULE_H
 
-#include <Eigen/Geometry>
+#include "../../Eigen/Geometry"
 
 #if defined(__APPLE_CC__)
-  #include <OpenGL/gl.h>
+#include <OpenGL/gl.h>
 #else
-  #include <GL/gl.h>
+#include <GL/gl.h>
 #endif
 
 namespace Eigen {
 
 /**
-  * \defgroup OpenGLSUpport_Module OpenGL Support module
-  *
-  * This module provides wrapper functions for a couple of OpenGL functions
-  * which simplify the way to pass Eigen's object to openGL.
-  * Here is an exmaple:
-  * 
-  * \code
-  * // You need to add path_to_eigen/unsupported to your include path.
-  * #include <Eigen/OpenGLSupport>
-  * // ...
-  * Vector3f x, y;
-  * Matrix3f rot;
-  * 
-  * glVertex(y + x * rot);
-  * 
-  * Quaternion q;
-  * glRotate(q);
-  * 
-  * // ...
-  * \endcode
-  *
-  */
+ * \defgroup OpenGLSUpport_Module OpenGL Support module
+ *
+ * This module provides wrapper functions for a couple of OpenGL functions
+ * which simplify the way to pass Eigen's object to openGL.
+ * Here is an example:
+ *
+ * \code
+ * // You need to add path_to_eigen/unsupported to your include path.
+ * #include <Eigen/OpenGLSupport>
+ * // ...
+ * Vector3f x, y;
+ * Matrix3f rot;
+ *
+ * glVertex(y + x * rot);
+ *
+ * Quaternion q;
+ * glRotate(q);
+ *
+ * // ...
+ * \endcode
+ *
+ */
 //@{
 
-#define EIGEN_GL_FUNC_DECLARATION(FUNC)                                                                             \
-namespace internal {                                                                                                \
-  template< typename XprType,                                                                                       \
-            typename Scalar = typename XprType::Scalar,                                                             \
-            int Rows = XprType::RowsAtCompileTime,                                                                  \
-            int Cols = XprType::ColsAtCompileTime,                                                                  \
-            bool IsGLCompatible = bool(XprType::Flags&LinearAccessBit)                                              \
-                              && bool(XprType::Flags&DirectAccessBit)                                               \
-                              && (XprType::IsVectorAtCompileTime || (XprType::Flags&RowMajorBit)==0)>               \
-  struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl);                                                                      \
-                                                                                                                    \
-  template<typename XprType, typename Scalar, int Rows, int Cols>                                                   \
-  struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType,Scalar,Rows,Cols,false> {                                     \
-    inline static void run(const XprType& p) {                                                                      \
-      EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<typename plain_matrix_type_column_major<XprType>::type>::run(p); }       \
-  };                                                                                                                \
-}                                                                                                                   \
-                                                                                                                    \
-template<typename Derived> inline void FUNC(const Eigen::DenseBase<Derived>& p) {                                   \
-  EIGEN_CAT(EIGEN_CAT(internal::gl_,FUNC),_impl)<Derived>::run(p.derived());                                        \
-}
-
-
-#define EIGEN_GL_FUNC_SPECIALIZATION_MAT(FUNC,SCALAR,ROWS,COLS,SUFFIX)                                              \
-namespace internal {                                                                                                \
-  template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, ROWS, COLS, true> {      \
-    inline static void run(const XprType& p) { FUNC##SUFFIX(p.data()); }                                            \
-  };                                                                                                                \
-}
-
-  
-#define EIGEN_GL_FUNC_SPECIALIZATION_VEC(FUNC,SCALAR,SIZE,SUFFIX)                                                   \
-namespace internal {                                                                                                \
-  template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, SIZE, 1, true> {         \
-    inline static void run(const XprType& p) { FUNC##SUFFIX(p.data()); }                                            \
-  };                                                                                                                \
-  template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, 1, SIZE, true> {         \
-    inline static void run(const XprType& p) { FUNC##SUFFIX(p.data()); }                                            \
-  };                                                                                                                \
-}
-
-  
-EIGEN_GL_FUNC_DECLARATION       (glVertex)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,int,    2,2iv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,short,  2,2sv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,float,  2,2fv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,double, 2,2dv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,int,    3,3iv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,short,  3,3sv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,float,  3,3fv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,double, 3,3dv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,int,    4,4iv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,short,  4,4sv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,float,  4,4fv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex,double, 4,4dv)
-
-EIGEN_GL_FUNC_DECLARATION       (glTexCoord)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,int,    2,2iv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,short,  2,2sv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,float,  2,2fv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,double, 2,2dv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,int,    3,3iv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,short,  3,3sv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,float,  3,3fv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,double, 3,3dv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,int,    4,4iv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,short,  4,4sv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,float,  4,4fv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord,double, 4,4dv)
-
-EIGEN_GL_FUNC_DECLARATION       (glColor)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,int,    2,2iv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,short,  2,2sv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,float,  2,2fv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,double, 2,2dv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,int,    3,3iv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,short,  3,3sv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,float,  3,3fv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,double, 3,3dv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,int,    4,4iv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,short,  4,4sv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,float,  4,4fv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor,double, 4,4dv)
-
-EIGEN_GL_FUNC_DECLARATION       (glNormal)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,int,    3,3iv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,short,  3,3sv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,float,  3,3fv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal,double, 3,3dv)
-
-inline void glScale2fv(const float*  v) { glScalef(v[0], v[1], 1.f);  }
-inline void glScale2dv(const double* v) { glScaled(v[0], v[1], 1.0);  }
-inline void glScale3fv(const float*  v) { glScalef(v[0], v[1], v[2]); }
+#define EIGEN_GL_FUNC_DECLARATION(FUNC)                                                                          \
+  namespace internal {                                                                                           \
+  template <typename XprType, typename Scalar = typename XprType::Scalar, int Rows = XprType::RowsAtCompileTime, \
+            int Cols = XprType::ColsAtCompileTime,                                                               \
+            bool IsGLCompatible = bool(internal::evaluator<XprType>::Flags & LinearAccessBit) &&                 \
+                                  bool(XprType::Flags & DirectAccessBit) &&                                      \
+                                  (XprType::IsVectorAtCompileTime || (XprType::Flags & RowMajorBit) == 0)>       \
+  struct EIGEN_CAT(EIGEN_CAT(gl_, FUNC), _impl);                                                                 \
+                                                                                                                 \
+  template <typename XprType, typename Scalar, int Rows, int Cols>                                               \
+  struct EIGEN_CAT(EIGEN_CAT(gl_, FUNC), _impl)<XprType, Scalar, Rows, Cols, false> {                            \
+    inline static void run(const XprType& p) {                                                                   \
+      EIGEN_CAT(EIGEN_CAT(gl_, FUNC), _impl)<typename plain_matrix_type_column_major<XprType>::type>::run(p);    \
+    }                                                                                                            \
+  };                                                                                                             \
+  }                                                                                                              \
+                                                                                                                 \
+  template <typename Derived>                                                                                    \
+  inline void FUNC(const Eigen::DenseBase<Derived>& p) {                                                         \
+    EIGEN_CAT(EIGEN_CAT(internal::gl_, FUNC), _impl)<Derived>::run(p.derived());                                 \
+  }
+
+#define EIGEN_GL_FUNC_SPECIALIZATION_MAT(FUNC, SCALAR, ROWS, COLS, SUFFIX)           \
+  namespace internal {                                                               \
+  template <typename XprType>                                                        \
+  struct EIGEN_CAT(EIGEN_CAT(gl_, FUNC), _impl)<XprType, SCALAR, ROWS, COLS, true> { \
+    inline static void run(const XprType& p) { FUNC##SUFFIX(p.data()); }             \
+  };                                                                                 \
+  }
+
+#define EIGEN_GL_FUNC_SPECIALIZATION_VEC(FUNC, SCALAR, SIZE, SUFFIX)              \
+  namespace internal {                                                            \
+  template <typename XprType>                                                     \
+  struct EIGEN_CAT(EIGEN_CAT(gl_, FUNC), _impl)<XprType, SCALAR, SIZE, 1, true> { \
+    inline static void run(const XprType& p) { FUNC##SUFFIX(p.data()); }          \
+  };                                                                              \
+  template <typename XprType>                                                     \
+  struct EIGEN_CAT(EIGEN_CAT(gl_, FUNC), _impl)<XprType, SCALAR, 1, SIZE, true> { \
+    inline static void run(const XprType& p) { FUNC##SUFFIX(p.data()); }          \
+  };                                                                              \
+  }
+
+EIGEN_GL_FUNC_DECLARATION(glVertex)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex, int, 2, 2iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex, short, 2, 2sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex, float, 2, 2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex, double, 2, 2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex, int, 3, 3iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex, short, 3, 3sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex, float, 3, 3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex, double, 3, 3dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex, int, 4, 4iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex, short, 4, 4sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex, float, 4, 4fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glVertex, double, 4, 4dv)
+
+EIGEN_GL_FUNC_DECLARATION(glTexCoord)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord, int, 2, 2iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord, short, 2, 2sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord, float, 2, 2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord, double, 2, 2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord, int, 3, 3iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord, short, 3, 3sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord, float, 3, 3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord, double, 3, 3dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord, int, 4, 4iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord, short, 4, 4sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord, float, 4, 4fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTexCoord, double, 4, 4dv)
+
+EIGEN_GL_FUNC_DECLARATION(glColor)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor, int, 2, 2iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor, short, 2, 2sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor, float, 2, 2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor, double, 2, 2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor, int, 3, 3iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor, short, 3, 3sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor, float, 3, 3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor, double, 3, 3dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor, int, 4, 4iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor, short, 4, 4sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor, float, 4, 4fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glColor, double, 4, 4dv)
+
+EIGEN_GL_FUNC_DECLARATION(glNormal)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal, int, 3, 3iv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal, short, 3, 3sv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal, float, 3, 3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glNormal, double, 3, 3dv)
+
+inline void glScale2fv(const float* v) { glScalef(v[0], v[1], 1.f); }
+inline void glScale2dv(const double* v) { glScaled(v[0], v[1], 1.0); }
+inline void glScale3fv(const float* v) { glScalef(v[0], v[1], v[2]); }
 inline void glScale3dv(const double* v) { glScaled(v[0], v[1], v[2]); }
 
-EIGEN_GL_FUNC_DECLARATION       (glScale)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,float,  2,2fv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,double, 2,2dv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,float,  3,3fv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale,double, 3,3dv)
+EIGEN_GL_FUNC_DECLARATION(glScale)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale, float, 2, 2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale, double, 2, 2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale, float, 3, 3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glScale, double, 3, 3dv)
 
-template<typename Scalar> void glScale(const UniformScaling<Scalar>& s)  { glScale(Matrix<Scalar,3,1>::Constant(s.factor())); }
+template <typename Scalar>
+void glScale(const UniformScaling<Scalar>& s) {
+  glScale(Matrix<Scalar, 3, 1>::Constant(s.factor()));
+}
 
-inline void glTranslate2fv(const float*  v) { glTranslatef(v[0], v[1], 0.f);  }
-inline void glTranslate2dv(const double* v) { glTranslated(v[0], v[1], 0.0);  }
-inline void glTranslate3fv(const float*  v) { glTranslatef(v[0], v[1], v[2]); }
+inline void glTranslate2fv(const float* v) { glTranslatef(v[0], v[1], 0.f); }
+inline void glTranslate2dv(const double* v) { glTranslated(v[0], v[1], 0.0); }
+inline void glTranslate3fv(const float* v) { glTranslatef(v[0], v[1], v[2]); }
 inline void glTranslate3dv(const double* v) { glTranslated(v[0], v[1], v[2]); }
 
-EIGEN_GL_FUNC_DECLARATION       (glTranslate)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,float,  2,2fv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,double, 2,2dv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,float,  3,3fv)
-EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate,double, 3,3dv)
-
-template<typename Scalar> void glTranslate(const Translation<Scalar,2>& t)  { glTranslate(t.vector()); }
-template<typename Scalar> void glTranslate(const Translation<Scalar,3>& t)  { glTranslate(t.vector()); }
-
-EIGEN_GL_FUNC_DECLARATION       (glMultMatrix)
-EIGEN_GL_FUNC_SPECIALIZATION_MAT(glMultMatrix,float,  4,4,f)
-EIGEN_GL_FUNC_SPECIALIZATION_MAT(glMultMatrix,double, 4,4,d)
-
-template<typename Scalar> void glMultMatrix(const Transform<Scalar,3,Affine>& t)        { glMultMatrix(t.matrix()); }
-template<typename Scalar> void glMultMatrix(const Transform<Scalar,3,Projective>& t)    { glMultMatrix(t.matrix()); }
-template<typename Scalar> void glMultMatrix(const Transform<Scalar,3,AffineCompact>& t) { glMultMatrix(Transform<Scalar,3,Affine>(t).matrix()); }
+EIGEN_GL_FUNC_DECLARATION(glTranslate)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate, float, 2, 2fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate, double, 2, 2dv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate, float, 3, 3fv)
+EIGEN_GL_FUNC_SPECIALIZATION_VEC(glTranslate, double, 3, 3dv)
 
-EIGEN_GL_FUNC_DECLARATION       (glLoadMatrix)
-EIGEN_GL_FUNC_SPECIALIZATION_MAT(glLoadMatrix,float,  4,4,f)
-EIGEN_GL_FUNC_SPECIALIZATION_MAT(glLoadMatrix,double, 4,4,d)
+template <typename Scalar>
+void glTranslate(const Translation<Scalar, 2>& t) {
+  glTranslate(t.vector());
+}
+template <typename Scalar>
+void glTranslate(const Translation<Scalar, 3>& t) {
+  glTranslate(t.vector());
+}
 
-template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,Affine>& t)        { glLoadMatrix(t.matrix()); }
-template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,Projective>& t)    { glLoadMatrix(t.matrix()); }
-template<typename Scalar> void glLoadMatrix(const Transform<Scalar,3,AffineCompact>& t) { glLoadMatrix(Transform<Scalar,3,Affine>(t).matrix()); }
+EIGEN_GL_FUNC_DECLARATION(glMultMatrix)
+EIGEN_GL_FUNC_SPECIALIZATION_MAT(glMultMatrix, float, 4, 4, f)
+EIGEN_GL_FUNC_SPECIALIZATION_MAT(glMultMatrix, double, 4, 4, d)
 
-inline void glRotate(const Rotation2D<float>& rot)
-{
-  glRotatef(rot.angle()*180.f/float(M_PI), 0.f, 0.f, 1.f);
+template <typename Scalar>
+void glMultMatrix(const Transform<Scalar, 3, Affine>& t) {
+  glMultMatrix(t.matrix());
 }
-inline void glRotate(const Rotation2D<double>& rot)
-{
-  glRotated(rot.angle()*180.0/M_PI, 0.0, 0.0, 1.0);
+template <typename Scalar>
+void glMultMatrix(const Transform<Scalar, 3, Projective>& t) {
+  glMultMatrix(t.matrix());
 }
-
-template<typename Derived> void glRotate(const RotationBase<Derived,3>& rot)
-{  
-  Transform<typename Derived::Scalar,3,Projective> tr(rot);
-  glMultMatrix(tr.matrix());
+template <typename Scalar>
+void glMultMatrix(const Transform<Scalar, 3, AffineCompact>& t) {
+  glMultMatrix(Transform<Scalar, 3, Affine>(t).matrix());
 }
 
-#define EIGEN_GL_MAKE_CONST_const const
-#define EIGEN_GL_MAKE_CONST__ 
-#define EIGEN_GL_EVAL(X) X
+EIGEN_GL_FUNC_DECLARATION(glLoadMatrix)
+EIGEN_GL_FUNC_SPECIALIZATION_MAT(glLoadMatrix, float, 4, 4, f)
+EIGEN_GL_FUNC_SPECIALIZATION_MAT(glLoadMatrix, double, 4, 4, d)
 
-#define EIGEN_GL_FUNC1_DECLARATION(FUNC,ARG1,CONST)                                                                             \
-namespace internal {                                                                                                            \
-  template< typename XprType,                                                                                                   \
-            typename Scalar = typename XprType::Scalar,                                                                         \
-            int Rows = XprType::RowsAtCompileTime,                                                                              \
-            int Cols = XprType::ColsAtCompileTime,                                                                              \
-            bool IsGLCompatible = bool(XprType::Flags&LinearAccessBit)                                                          \
-                              && bool(XprType::Flags&DirectAccessBit)                                                           \
-                              && (XprType::IsVectorAtCompileTime || (XprType::Flags&RowMajorBit)==0)>                           \
-  struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl);                                                                                  \
-                                                                                                                                \
-  template<typename XprType, typename Scalar, int Rows, int Cols>                                                               \
-  struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType,Scalar,Rows,Cols,false> {                                                 \
-    inline static void run(ARG1 a,EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) {                                      \
-      EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<typename plain_matrix_type_column_major<XprType>::type>::run(a,p); }                 \
-  };                                                                                                                            \
-}                                                                                                                               \
-                                                                                                                                \
-template<typename Derived> inline void FUNC(ARG1 a,EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) Eigen::DenseBase<Derived>& p) {   \
-  EIGEN_CAT(EIGEN_CAT(internal::gl_,FUNC),_impl)<Derived>::run(a,p.derived());                                                  \
+template <typename Scalar>
+void glLoadMatrix(const Transform<Scalar, 3, Affine>& t) {
+  glLoadMatrix(t.matrix());
+}
+template <typename Scalar>
+void glLoadMatrix(const Transform<Scalar, 3, Projective>& t) {
+  glLoadMatrix(t.matrix());
+}
+template <typename Scalar>
+void glLoadMatrix(const Transform<Scalar, 3, AffineCompact>& t) {
+  glLoadMatrix(Transform<Scalar, 3, Affine>(t).matrix());
 }
 
-
-#define EIGEN_GL_FUNC1_SPECIALIZATION_MAT(FUNC,ARG1,CONST,SCALAR,ROWS,COLS,SUFFIX)                                              \
-namespace internal {                                                                                                            \
-  template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, ROWS, COLS, true> {                  \
-    inline static void run(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { FUNC##SUFFIX(a,p.data()); }         \
-  }; \
+inline void glRotate(const Rotation2D<float>& rot) { glRotatef(rot.angle() * 180.f / float(EIGEN_PI), 0.f, 0.f, 1.f); }
+inline void glRotate(const Rotation2D<double>& rot) {
+  glRotated(rot.angle() * 180.0 / double(EIGEN_PI), 0.0, 0.0, 1.0);
 }
 
-  
-#define EIGEN_GL_FUNC1_SPECIALIZATION_VEC(FUNC,ARG1,CONST,SCALAR,SIZE,SUFFIX)                                                   \
-namespace internal {                                                                                                            \
-  template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, SIZE, 1, true> {                     \
-    inline static void run(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { FUNC##SUFFIX(a,p.data()); }         \
-  };                                                                                                                            \
-  template< typename XprType> struct EIGEN_CAT(EIGEN_CAT(gl_,FUNC),_impl)<XprType, SCALAR, 1, SIZE, true> {                     \
-    inline static void run(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { FUNC##SUFFIX(a,p.data()); }         \
-  };                                                                                                                            \
+template <typename Derived>
+void glRotate(const RotationBase<Derived, 3>& rot) {
+  Transform<typename Derived::Scalar, 3, Projective> tr(rot);
+  glMultMatrix(tr.matrix());
 }
 
-EIGEN_GL_FUNC1_DECLARATION       (glGet,GLenum,_)
-EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glGet,GLenum,_,float,  4,4,Floatv)
-EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glGet,GLenum,_,double, 4,4,Doublev)
+#define EIGEN_GL_MAKE_CONST_const const
+#define EIGEN_GL_MAKE_CONST__
+#define EIGEN_GL_EVAL(X) X
+
+#define EIGEN_GL_FUNC1_DECLARATION(FUNC, ARG1, CONST)                                                            \
+  namespace internal {                                                                                           \
+  template <typename XprType, typename Scalar = typename XprType::Scalar, int Rows = XprType::RowsAtCompileTime, \
+            int Cols = XprType::ColsAtCompileTime,                                                               \
+            bool IsGLCompatible = bool(internal::evaluator<XprType>::Flags & LinearAccessBit) &&                 \
+                                  bool(XprType::Flags & DirectAccessBit) &&                                      \
+                                  (XprType::IsVectorAtCompileTime || (XprType::Flags & RowMajorBit) == 0)>       \
+  struct EIGEN_CAT(EIGEN_CAT(gl_, FUNC), _impl);                                                                 \
+                                                                                                                 \
+  template <typename XprType, typename Scalar, int Rows, int Cols>                                               \
+  struct EIGEN_CAT(EIGEN_CAT(gl_, FUNC), _impl)<XprType, Scalar, Rows, Cols, false> {                            \
+    inline static void run(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) {                      \
+      EIGEN_CAT(EIGEN_CAT(gl_, FUNC), _impl)<typename plain_matrix_type_column_major<XprType>::type>::run(a, p); \
+    }                                                                                                            \
+  };                                                                                                             \
+  }                                                                                                              \
+                                                                                                                 \
+  template <typename Derived>                                                                                    \
+  inline void FUNC(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) Eigen::DenseBase<Derived>& p) {            \
+    EIGEN_CAT(EIGEN_CAT(internal::gl_, FUNC), _impl)<Derived>::run(a, p.derived());                              \
+  }
+
+#define EIGEN_GL_FUNC1_SPECIALIZATION_MAT(FUNC, ARG1, CONST, SCALAR, ROWS, COLS, SUFFIX)    \
+  namespace internal {                                                                      \
+  template <typename XprType>                                                               \
+  struct EIGEN_CAT(EIGEN_CAT(gl_, FUNC), _impl)<XprType, SCALAR, ROWS, COLS, true> {        \
+    inline static void run(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { \
+      FUNC##SUFFIX(a, p.data());                                                            \
+    }                                                                                       \
+  };                                                                                        \
+  }
+
+#define EIGEN_GL_FUNC1_SPECIALIZATION_VEC(FUNC, ARG1, CONST, SCALAR, SIZE, SUFFIX)          \
+  namespace internal {                                                                      \
+  template <typename XprType>                                                               \
+  struct EIGEN_CAT(EIGEN_CAT(gl_, FUNC), _impl)<XprType, SCALAR, SIZE, 1, true> {           \
+    inline static void run(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { \
+      FUNC##SUFFIX(a, p.data());                                                            \
+    }                                                                                       \
+  };                                                                                        \
+  template <typename XprType>                                                               \
+  struct EIGEN_CAT(EIGEN_CAT(gl_, FUNC), _impl)<XprType, SCALAR, 1, SIZE, true> {           \
+    inline static void run(ARG1 a, EIGEN_GL_EVAL(EIGEN_GL_MAKE_CONST_##CONST) XprType& p) { \
+      FUNC##SUFFIX(a, p.data());                                                            \
+    }                                                                                       \
+  };                                                                                        \
+  }
+
+EIGEN_GL_FUNC1_DECLARATION(glGet, GLenum, _)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glGet, GLenum, _, float, 4, 4, Floatv)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glGet, GLenum, _, double, 4, 4, Doublev)
 
 // glUniform API
 
 #ifdef GL_VERSION_2_0
 
-inline void glUniform2fv_ei  (GLint loc, const float* v)         { glUniform2fv(loc,1,v); }
-inline void glUniform2iv_ei  (GLint loc, const int* v)           { glUniform2iv(loc,1,v); }
+inline void glUniform2fv_ei(GLint loc, const float* v) { glUniform2fv(loc, 1, v); }
+inline void glUniform2iv_ei(GLint loc, const int* v) { glUniform2iv(loc, 1, v); }
 
-inline void glUniform3fv_ei  (GLint loc, const float* v)         { glUniform3fv(loc,1,v); }
-inline void glUniform3iv_ei  (GLint loc, const int* v)           { glUniform3iv(loc,1,v); }
+inline void glUniform3fv_ei(GLint loc, const float* v) { glUniform3fv(loc, 1, v); }
+inline void glUniform3iv_ei(GLint loc, const int* v) { glUniform3iv(loc, 1, v); }
 
-inline void glUniform4fv_ei  (GLint loc, const float* v)         { glUniform4fv(loc,1,v); }
-inline void glUniform4iv_ei  (GLint loc, const int* v)           { glUniform4iv(loc,1,v); }
+inline void glUniform4fv_ei(GLint loc, const float* v) { glUniform4fv(loc, 1, v); }
+inline void glUniform4iv_ei(GLint loc, const int* v) { glUniform4iv(loc, 1, v); }
 
-inline void glUniformMatrix2fv_ei  (GLint loc, const float* v)         { glUniformMatrix2fv(loc,1,false,v); }
-inline void glUniformMatrix3fv_ei  (GLint loc, const float* v)         { glUniformMatrix3fv(loc,1,false,v); }
-inline void glUniformMatrix4fv_ei  (GLint loc, const float* v)         { glUniformMatrix4fv(loc,1,false,v); }
+inline void glUniformMatrix2fv_ei(GLint loc, const float* v) { glUniformMatrix2fv(loc, 1, false, v); }
+inline void glUniformMatrix3fv_ei(GLint loc, const float* v) { glUniformMatrix3fv(loc, 1, false, v); }
+inline void glUniformMatrix4fv_ei(GLint loc, const float* v) { glUniformMatrix4fv(loc, 1, false, v); }
 
+EIGEN_GL_FUNC1_DECLARATION(glUniform, GLint, const)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform, GLint, const, float, 2, 2fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform, GLint, const, int, 2, 2iv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform, GLint, const, float, 3, 3fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform, GLint, const, int, 3, 3iv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform, GLint, const, float, 4, 4fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform, GLint, const, int, 4, 4iv_ei)
 
-EIGEN_GL_FUNC1_DECLARATION       (glUniform,GLint,const)
-EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,float,        2,2fv_ei)
-EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,int,          2,2iv_ei)
-EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,float,        3,3fv_ei)
-EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,int,          3,3iv_ei)
-EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,float,        4,4fv_ei)
-EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,int,          4,4iv_ei)
-
-EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        2,2,Matrix2fv_ei)
-EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        3,3,Matrix3fv_ei)
-EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        4,4,Matrix4fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform, GLint, const, float, 2, 2, Matrix2fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform, GLint, const, float, 3, 3, Matrix3fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform, GLint, const, float, 4, 4, Matrix4fv_ei)
 
 #endif
 
 #ifdef GL_VERSION_2_1
 
-static void glUniformMatrix2x3fv_ei(GLint loc, const float* v)         { glUniformMatrix2x3fv(loc,1,false,v); }
-static void glUniformMatrix3x2fv_ei(GLint loc, const float* v)         { glUniformMatrix3x2fv(loc,1,false,v); }
-static void glUniformMatrix2x4fv_ei(GLint loc, const float* v)         { glUniformMatrix2x4fv(loc,1,false,v); }
-static void glUniformMatrix4x2fv_ei(GLint loc, const float* v)         { glUniformMatrix4x2fv(loc,1,false,v); }
-static void glUniformMatrix3x4fv_ei(GLint loc, const float* v)         { glUniformMatrix3x4fv(loc,1,false,v); }
-static void glUniformMatrix4x3fv_ei(GLint loc, const float* v)         { glUniformMatrix4x3fv(loc,1,false,v); }
+inline void glUniformMatrix2x3fv_ei(GLint loc, const float* v) { glUniformMatrix2x3fv(loc, 1, false, v); }
+inline void glUniformMatrix3x2fv_ei(GLint loc, const float* v) { glUniformMatrix3x2fv(loc, 1, false, v); }
+inline void glUniformMatrix2x4fv_ei(GLint loc, const float* v) { glUniformMatrix2x4fv(loc, 1, false, v); }
+inline void glUniformMatrix4x2fv_ei(GLint loc, const float* v) { glUniformMatrix4x2fv(loc, 1, false, v); }
+inline void glUniformMatrix3x4fv_ei(GLint loc, const float* v) { glUniformMatrix3x4fv(loc, 1, false, v); }
+inline void glUniformMatrix4x3fv_ei(GLint loc, const float* v) { glUniformMatrix4x3fv(loc, 1, false, v); }
 
-EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        2,3,Matrix2x3fv_ei)
-EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        3,2,Matrix3x2fv_ei)
-EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        2,4,Matrix2x4fv_ei)
-EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        4,2,Matrix4x2fv_ei)
-EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        3,4,Matrix3x4fv_ei)
-EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform,GLint,const,float,        4,3,Matrix4x3fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform, GLint, const, float, 2, 3, Matrix2x3fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform, GLint, const, float, 3, 2, Matrix3x2fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform, GLint, const, float, 2, 4, Matrix2x4fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform, GLint, const, float, 4, 2, Matrix4x2fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform, GLint, const, float, 3, 4, Matrix3x4fv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_MAT(glUniform, GLint, const, float, 4, 3, Matrix4x3fv_ei)
 
 #endif
 
 #ifdef GL_VERSION_3_0
 
-inline void glUniform2uiv_ei (GLint loc, const unsigned int* v)  { glUniform2uiv(loc,1,v); }
-inline void glUniform3uiv_ei (GLint loc, const unsigned int* v)  { glUniform3uiv(loc,1,v); }
-inline void glUniform4uiv_ei (GLint loc, const unsigned int* v)  { glUniform4uiv(loc,1,v); }
+inline void glUniform2uiv_ei(GLint loc, const unsigned int* v) { glUniform2uiv(loc, 1, v); }
+inline void glUniform3uiv_ei(GLint loc, const unsigned int* v) { glUniform3uiv(loc, 1, v); }
+inline void glUniform4uiv_ei(GLint loc, const unsigned int* v) { glUniform4uiv(loc, 1, v); }
 
-EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 2,2uiv_ei)
-EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 3,3uiv_ei)
-EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,unsigned int, 4,4uiv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform, GLint, const, unsigned int, 2, 2uiv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform, GLint, const, unsigned int, 3, 3uiv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform, GLint, const, unsigned int, 4, 4uiv_ei)
 
 #endif
 
 #ifdef GL_ARB_gpu_shader_fp64
-inline void glUniform2dv_ei  (GLint loc, const double* v)        { glUniform2dv(loc,1,v); }
-inline void glUniform3dv_ei  (GLint loc, const double* v)        { glUniform3dv(loc,1,v); }
-inline void glUniform4dv_ei  (GLint loc, const double* v)        { glUniform4dv(loc,1,v); }
+inline void glUniform2dv_ei(GLint loc, const double* v) { glUniform2dv(loc, 1, v); }
+inline void glUniform3dv_ei(GLint loc, const double* v) { glUniform3dv(loc, 1, v); }
+inline void glUniform4dv_ei(GLint loc, const double* v) { glUniform4dv(loc, 1, v); }
 
-EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double,       2,2dv_ei)
-EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double,       3,3dv_ei)
-EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform,GLint,const,double,       4,4dv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform, GLint, const, double, 2, 2dv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform, GLint, const, double, 3, 3dv_ei)
+EIGEN_GL_FUNC1_SPECIALIZATION_VEC(glUniform, GLint, const, double, 4, 4dv_ei)
 #endif
 
-
 //@}
 
-}
+}  // namespace Eigen
 
-#endif // EIGEN_OPENGL_MODULE
+#endif  // EIGEN_OPENGL_MODULE_H
diff --git a/inst/include/unsupported/Eigen/Polynomials b/inst/include/unsupported/Eigen/Polynomials
index cece5633..aa9aa8eb 100644
--- a/inst/include/unsupported/Eigen/Polynomials
+++ b/inst/include/unsupported/Eigen/Polynomials
@@ -9,130 +9,132 @@
 #ifndef EIGEN_POLYNOMIALS_MODULE_H
 #define EIGEN_POLYNOMIALS_MODULE_H
 
-#include <Eigen/Core>
+#include "../../Eigen/Core"
 
-#include <Eigen/src/Core/util/DisableStupidWarnings.h>
+#include "../../Eigen/Eigenvalues"
 
-#include <Eigen/Eigenvalues>
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
 // Note that EIGEN_HIDE_HEAVY_CODE has to be defined per module
-#if (defined EIGEN_EXTERN_INSTANTIATIONS) && (EIGEN_EXTERN_INSTANTIATIONS>=2)
-  #ifndef EIGEN_HIDE_HEAVY_CODE
-  #define EIGEN_HIDE_HEAVY_CODE
-  #endif
+#if (defined EIGEN_EXTERN_INSTANTIATIONS) && (EIGEN_EXTERN_INSTANTIATIONS >= 2)
+#ifndef EIGEN_HIDE_HEAVY_CODE
+#define EIGEN_HIDE_HEAVY_CODE
+#endif
 #elif defined EIGEN_HIDE_HEAVY_CODE
-  #undef EIGEN_HIDE_HEAVY_CODE
+#undef EIGEN_HIDE_HEAVY_CODE
 #endif
 
 /**
-  * \defgroup Polynomials_Module Polynomials module
-  * \brief This module provides a QR based polynomial solver.
-	*
-  * To use this module, add
-  * \code
-  * #include <unsupported/Eigen/Polynomials>
-  * \endcode
-	* at the start of your source file.
-  */
-
+ * \defgroup Polynomials_Module Polynomials module
+ * \brief This module provides a QR based polynomial solver.
+ *
+ * To use this module, add
+ * \code
+ * #include <unsupported/Eigen/Polynomials>
+ * \endcode
+ * at the start of your source file.
+ */
+
+// IWYU pragma: begin_exports
 #include "src/Polynomials/PolynomialUtils.h"
 #include "src/Polynomials/Companion.h"
 #include "src/Polynomials/PolynomialSolver.h"
+// IWYU pragma: end_exports
 
 /**
-	\page polynomials Polynomials defines functions for dealing with polynomials
-	and a QR based polynomial solver.
-	\ingroup Polynomials_Module
+        \page polynomials Polynomials defines functions for dealing with polynomials
+        and a QR based polynomial solver.
+        \ingroup Polynomials_Module
 
-	The remainder of the page documents first the functions for evaluating, computing
-	polynomials, computing estimates about polynomials and next the QR based polynomial
-	solver.
+        The remainder of the page documents first the functions for evaluating, computing
+        polynomials, computing estimates about polynomials and next the QR based polynomial
+        solver.
 
-	\section polynomialUtils convenient functions to deal with polynomials
-	\subsection roots_to_monicPolynomial
-	The function
-	\code
-	void roots_to_monicPolynomial( const RootVector& rv, Polynomial& poly )
-	\endcode
-	computes the coefficients \f$ a_i \f$ of
+        \section polynomialUtils convenient functions to deal with polynomials
+        \subsection roots_to_monicPolynomial
+        The function
+        \code
+        void roots_to_monicPolynomial( const RootVector& rv, Polynomial& poly )
+        \endcode
+        computes the coefficients \f$ a_i \f$ of
 
-	\f$ p(x) = a_0 + a_{1}x + ... + a_{n-1}x^{n-1} + x^n \f$
+        \f$ p(x) = a_0 + a_{1}x + ... + a_{n-1}x^{n-1} + x^n \f$
 
-	where \f$ p \f$ is known through its roots i.e. \f$ p(x) = (x-r_1)(x-r_2)...(x-r_n) \f$.
+        where \f$ p \f$ is known through its roots i.e. \f$ p(x) = (x-r_1)(x-r_2)...(x-r_n) \f$.
 
-	\subsection poly_eval
-	The function
-	\code
-	T poly_eval( const Polynomials& poly, const T& x )
-	\endcode
-	evaluates a polynomial at a given point using stabilized H&ouml;rner method.
+        \subsection poly_eval
+        The function
+        \code
+        T poly_eval( const Polynomials& poly, const T& x )
+        \endcode
+        evaluates a polynomial at a given point using stabilized H&ouml;rner method.
 
-	The following code: first computes the coefficients in the monomial basis of the monic polynomial that has the provided roots;
-	then, it evaluates the computed polynomial, using a stabilized H&ouml;rner method.
+        The following code: first computes the coefficients in the monomial basis of the monic polynomial that has the
+  provided roots; then, it evaluates the computed polynomial, using a stabilized H&ouml;rner method.
 
-	\include PolynomialUtils1.cpp
+        \include PolynomialUtils1.cpp
   Output: \verbinclude PolynomialUtils1.out
 
-	\subsection Cauchy bounds
-	The function
-	\code
-	Real cauchy_max_bound( const Polynomial& poly )
-	\endcode
-	provides a maximum bound (the Cauchy one: \f$C(p)\f$) for the absolute value of a root of the given polynomial i.e.
-	\f$ \forall r_i \f$ root of \f$ p(x) = \sum_{k=0}^d a_k x^k \f$,
-	\f$ |r_i| \le C(p) = \sum_{k=0}^{d} \left | \frac{a_k}{a_d} \right | \f$
-	The leading coefficient \f$ p \f$: should be non zero \f$a_d \neq 0\f$.
-
-
-	The function
-	\code
-	Real cauchy_min_bound( const Polynomial& poly )
-	\endcode
-	provides a minimum bound (the Cauchy one: \f$c(p)\f$) for the absolute value of a non zero root of the given polynomial i.e.
-	\f$ \forall r_i \neq 0 \f$ root of \f$ p(x) = \sum_{k=0}^d a_k x^k \f$,
-	\f$ |r_i| \ge c(p) = \left( \sum_{k=0}^{d} \left | \frac{a_k}{a_0} \right | \right)^{-1} \f$
-
-
-
-
-	\section QR polynomial solver class
-	Computes the complex roots of a polynomial by computing the eigenvalues of the associated companion matrix with the QR algorithm.
-	
-	The roots of \f$ p(x) = a_0 + a_1 x + a_2 x^2 + a_{3} x^3 + x^4 \f$ are the eigenvalues of
-	\f$
-	\left [
-	\begin{array}{cccc}
-	0 & 0 &  0 & a_0 \\
-	1 & 0 &  0 & a_1 \\
-	0 & 1 &  0 & a_2 \\
-	0 & 0 &  1 & a_3
-	\end{array} \right ]
-	\f$
-
-	However, the QR algorithm is not guaranteed to converge when there are several eigenvalues with same modulus.
-
-	Therefore the current polynomial solver is guaranteed to provide a correct result only when the complex roots \f$r_1,r_2,...,r_d\f$ have distinct moduli i.e.
-	
-	\f$ \forall i,j \in [1;d],~ \| r_i \| \neq \| r_j \| \f$.
-
-	With 32bit (float) floating types this problem shows up frequently.
+        \subsection Cauchy bounds
+        The function
+        \code
+        Real cauchy_max_bound( const Polynomial& poly )
+        \endcode
+        provides a maximum bound (the Cauchy one: \f$C(p)\f$) for the absolute value of a root of the given polynomial
+  i.e. \f$ \forall r_i \f$ root of \f$ p(x) = \sum_{k=0}^d a_k x^k \f$, \f$ |r_i| \le C(p) = \sum_{k=0}^{d} \left |
+  \frac{a_k}{a_d} \right | \f$ The leading coefficient \f$ p \f$: should be non zero \f$a_d \neq 0\f$.
+
+
+        The function
+        \code
+        Real cauchy_min_bound( const Polynomial& poly )
+        \endcode
+        provides a minimum bound (the Cauchy one: \f$c(p)\f$) for the absolute value of a non zero root of the given
+  polynomial i.e. \f$ \forall r_i \neq 0 \f$ root of \f$ p(x) = \sum_{k=0}^d a_k x^k \f$, \f$ |r_i| \ge c(p) = \left(
+  \sum_{k=0}^{d} \left | \frac{a_k}{a_0} \right | \right)^{-1} \f$
+
+
+
+
+        \section QR polynomial solver class
+        Computes the complex roots of a polynomial by computing the eigenvalues of the associated companion matrix with
+  the QR algorithm.
+
+        The roots of \f$ p(x) = a_0 + a_1 x + a_2 x^2 + a_{3} x^3 + x^4 \f$ are the eigenvalues of
+        \f$
+        \left [
+        \begin{array}{cccc}
+        0 & 0 &  0 & a_0 \\
+        1 & 0 &  0 & a_1 \\
+        0 & 1 &  0 & a_2 \\
+        0 & 0 &  1 & a_3
+        \end{array} \right ]
+        \f$
+
+        However, the QR algorithm is not guaranteed to converge when there are several eigenvalues with same modulus.
+
+        Therefore the current polynomial solver is guaranteed to provide a correct result only when the complex roots
+  \f$r_1,r_2,...,r_d\f$ have distinct moduli i.e.
+
+        \f$ \forall i,j \in [1;d],~ \| r_i \| \neq \| r_j \| \f$.
+
+        With 32bit (float) floating types this problem shows up frequently.
   However, almost always, correct accuracy is reached even in these cases for 64bit
   (double) floating types and small polynomial degree (<20).
 
-	\include PolynomialSolver1.cpp
-	
-	In the above example:
-	
-	-# a simple use of the polynomial solver is shown;
-	-# the accuracy problem with the QR algorithm is presented: a polynomial with almost conjugate roots is provided to the solver.
-	Those roots have almost same module therefore the QR algorithm failed to converge: the accuracy
-	of the last root is bad;
-	-# a simple way to circumvent the problem is shown: use doubles instead of floats.
+        \include PolynomialSolver1.cpp
+
+        In the above example:
+
+        -# a simple use of the polynomial solver is shown;
+        -# the accuracy problem with the QR algorithm is presented: a polynomial with almost conjugate roots is provided
+  to the solver. Those roots have almost same module therefore the QR algorithm failed to converge: the accuracy of the
+  last root is bad;
+        -# a simple way to circumvent the problem is shown: use doubles instead of floats.
 
   Output: \verbinclude PolynomialSolver1.out
 */
 
-#include <Eigen/src/Core/util/ReenableStupidWarnings.h>
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_POLYNOMIALS_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
+#endif  // EIGEN_POLYNOMIALS_MODULE_H
diff --git a/inst/include/unsupported/Eigen/SVD b/inst/include/unsupported/Eigen/SVD
deleted file mode 100644
index 7cc05928..00000000
--- a/inst/include/unsupported/Eigen/SVD
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef EIGEN_SVD_MODULE_H
-#define EIGEN_SVD_MODULE_H
-
-#include <Eigen/QR>
-#include <Eigen/Householder>
-#include <Eigen/Jacobi>
-
-#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
-
-/** \defgroup SVD_Module SVD module
-  *
-  *
-  *
-  * This module provides SVD decomposition for matrices (both real and complex).
-  * This decomposition is accessible via the following MatrixBase method:
-  *  - MatrixBase::jacobiSvd()
-  *
-  * \code
-  * #include <Eigen/SVD>
-  * \endcode
-  */
-
-#include "../../Eigen/src/misc/Solve.h"
-#include "../../Eigen/src/SVD/UpperBidiagonalization.h"
-#include "src/SVD/SVDBase.h"
-#include "src/SVD/JacobiSVD.h"
-#include "src/SVD/BDCSVD.h"
-#if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT)
-#include "../../Eigen/src/SVD/JacobiSVD_MKL.h"
-#endif
-
-#ifdef EIGEN2_SUPPORT
-#include "../../Eigen/src/Eigen2Support/SVD.h"
-#endif
-
-#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
-
-#endif // EIGEN_SVD_MODULE_H
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/inst/include/unsupported/Eigen/Skyline b/inst/include/unsupported/Eigen/Skyline
deleted file mode 100644
index 71a68cb4..00000000
--- a/inst/include/unsupported/Eigen/Skyline
+++ /dev/null
@@ -1,39 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_SKYLINE_MODULE_H
-#define EIGEN_SKYLINE_MODULE_H
-
-
-#include "Eigen/Core"
-
-#include "Eigen/src/Core/util/DisableStupidWarnings.h"
-
-#include <map>
-#include <cstdlib>
-#include <cstring>
-#include <algorithm>
-
-/**
- *  \defgroup Skyline_Module Skyline module
- *
- *
- *
- *
- */
-
-#include "src/Skyline/SkylineUtil.h"
-#include "src/Skyline/SkylineMatrixBase.h"
-#include "src/Skyline/SkylineStorage.h"
-#include "src/Skyline/SkylineMatrix.h"
-#include "src/Skyline/SkylineInplaceLU.h"
-#include "src/Skyline/SkylineProduct.h"
-
-#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
-
-#endif // EIGEN_SKYLINE_MODULE_H
diff --git a/inst/include/unsupported/Eigen/SparseExtra b/inst/include/unsupported/Eigen/SparseExtra
index b5597902..00a87c56 100644
--- a/inst/include/unsupported/Eigen/SparseExtra
+++ b/inst/include/unsupported/Eigen/SparseExtra
@@ -16,6 +16,7 @@
 
 #include <vector>
 #include <map>
+#include <unordered_map>
 #include <cstdlib>
 #include <cstring>
 #include <algorithm>
@@ -23,26 +24,28 @@
 #include <sstream>
 
 #ifdef EIGEN_GOOGLEHASH_SUPPORT
-  #include <google/dense_hash_map>
+#include <google/dense_hash_map>
+#include <google/sparse_hash_map>
 #endif
 
 /**
-  * \defgroup SparseExtra_Module SparseExtra module
-  *
-  * This module contains some experimental features extending the sparse module.
-  *
-  * \code
-  * #include <Eigen/SparseExtra>
-  * \endcode
-  */
-
-
-#include "../../Eigen/src/misc/Solve.h"
-#include "../../Eigen/src/misc/SparseSolve.h"
-
-#include "src/SparseExtra/DynamicSparseMatrix.h"
-#include "src/SparseExtra/BlockOfDynamicSparseMatrix.h"
+ * \defgroup SparseExtra_Module SparseExtra module
+ *
+ * This module contains some experimental features extending the sparse module:
+ * - A RandomSetter which is a wrapper object allowing to set/update a sparse matrix with random access.
+ * - A SparseInverse which calculates a sparse subset of the inverse of a sparse matrix corresponding to nonzeros of the
+ * input
+ * - MatrixMarket format(https://math.nist.gov/MatrixMarket/formats.html) readers and writers for sparse and dense
+ * matrices.
+ *
+ * \code
+ * #include <unsupported/Eigen/SparseExtra>
+ * \endcode
+ */
+
+// IWYU pragma: begin_exports
 #include "src/SparseExtra/RandomSetter.h"
+#include "src/SparseExtra/SparseInverse.h"
 
 #include "src/SparseExtra/MarketIO.h"
 
@@ -50,7 +53,8 @@
 #include <dirent.h>
 #include "src/SparseExtra/MatrixMarketIterator.h"
 #endif
+// IWYU pragma: end_exports
 
 #include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_SPARSE_EXTRA_MODULE_H
+#endif  // EIGEN_SPARSE_EXTRA_MODULE_H
diff --git a/inst/include/unsupported/Eigen/SpecialFunctions b/inst/include/unsupported/Eigen/SpecialFunctions
new file mode 100644
index 00000000..4f7e5993
--- /dev/null
+++ b/inst/include/unsupported/Eigen/SpecialFunctions
@@ -0,0 +1,104 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <g.gael@free.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_MODULE_H
+#define EIGEN_SPECIALFUNCTIONS_MODULE_H
+
+#include <math.h>
+
+#include "../../Eigen/Core"
+
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+namespace Eigen {
+
+/**
+ * \defgroup SpecialFunctions_Module Special math functions module
+ *
+ * This module features additional coefficient-wise math functions available
+ * within the numext:: namespace for the scalar version, and as method and/or free
+ * functions of Array. Those include:
+ *
+ * - erf
+ * - erfc
+ * - lgamma
+ * - igamma
+ * - igamma_der_a
+ * - gamma_sample_der_alpha
+ * - igammac
+ * - digamma
+ * - ndtri
+ * - polygamma
+ * - zeta
+ * - betainc
+ *
+ * Bessel Functions
+ * - bessel_i0
+ * - bessel_i0e
+ * - bessel_i1
+ * - bessel_i1e
+ * - bessel_j0
+ * - bessel_j1
+ * - bessel_k0
+ * - bessel_k0e
+ * - bessel_k1
+ * - bessel_k1e
+ * - bessel_y0
+ * - bessel_y1
+ *
+ * \code
+ * #include <unsupported/Eigen/SpecialFunctions>
+ * \endcode
+ */
+//@{
+
+}  // namespace Eigen
+
+// IWYU pragma: begin_exports
+#include "src/SpecialFunctions/BesselFunctionsImpl.h"
+#include "src/SpecialFunctions/BesselFunctionsBFloat16.h"
+#include "src/SpecialFunctions/BesselFunctionsHalf.h"
+#include "src/SpecialFunctions/BesselFunctionsPacketMath.h"
+#include "src/SpecialFunctions/BesselFunctionsFunctors.h"
+#include "src/SpecialFunctions/BesselFunctionsArrayAPI.h"
+#include "src/SpecialFunctions/SpecialFunctionsImpl.h"
+#if defined(EIGEN_HIPCC)
+#include "src/SpecialFunctions/HipVectorCompatibility.h"
+#endif
+#include "src/SpecialFunctions/SpecialFunctionsBFloat16.h"
+#include "src/SpecialFunctions/SpecialFunctionsHalf.h"
+#include "src/SpecialFunctions/SpecialFunctionsPacketMath.h"
+#include "src/SpecialFunctions/SpecialFunctionsFunctors.h"
+#include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h"
+
+#if defined EIGEN_VECTORIZE_AVX512
+#include "src/SpecialFunctions/arch/AVX/BesselFunctions.h"
+#include "src/SpecialFunctions/arch/AVX/SpecialFunctions.h"
+#include "src/SpecialFunctions/arch/AVX512/BesselFunctions.h"
+#include "src/SpecialFunctions/arch/AVX512/SpecialFunctions.h"
+#elif defined EIGEN_VECTORIZE_AVX
+#include "src/SpecialFunctions/arch/AVX/BesselFunctions.h"
+#include "src/SpecialFunctions/arch/AVX/SpecialFunctions.h"
+#elif defined EIGEN_VECTORIZE_NEON
+#include "src/SpecialFunctions/arch/NEON/BesselFunctions.h"
+#include "src/SpecialFunctions/arch/NEON/SpecialFunctions.h"
+#endif
+
+#if defined EIGEN_VECTORIZE_GPU
+#include "src/SpecialFunctions/arch/GPU/SpecialFunctions.h"
+#endif
+// IWYU pragma: end_exports
+
+namespace Eigen {
+//@}
+}
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif  // EIGEN_SPECIALFUNCTIONS_MODULE_H
diff --git a/inst/include/unsupported/Eigen/Splines b/inst/include/unsupported/Eigen/Splines
index 322e6b9f..632095de 100644
--- a/inst/include/unsupported/Eigen/Splines
+++ b/inst/include/unsupported/Eigen/Splines
@@ -10,22 +10,27 @@
 #ifndef EIGEN_SPLINES_MODULE_H
 #define EIGEN_SPLINES_MODULE_H
 
-namespace Eigen 
-{
+namespace Eigen {
 /**
-  * \defgroup Splines_Module Spline and spline fitting module
-  *
-  * This module provides a simple multi-dimensional spline class while
-  * offering most basic functionality to fit a spline to point sets.
-  *
-  * \code
-  * #include <unsupported/Eigen/Splines>
-  * \endcode
-  */
+ * \defgroup Splines_Module Spline and spline fitting module
+ *
+ * This module provides a simple multi-dimensional spline class while
+ * offering most basic functionality to fit a spline to point sets.
+ *
+ * \code
+ * #include <unsupported/Eigen/Splines>
+ * \endcode
+ */
 }
 
+#include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
+
+// IWYU pragma: begin_exports
 #include "src/Splines/SplineFwd.h"
 #include "src/Splines/Spline.h"
 #include "src/Splines/SplineFitting.h"
+// IWYU pragma: end_exports
+
+#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h"
 
-#endif // EIGEN_SPLINES_MODULE_H
+#endif  // EIGEN_SPLINES_MODULE_H
diff --git a/inst/include/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h b/inst/include/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
index 1a61e336..a83bdc31 100644
--- a/inst/include/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
+++ b/inst/include/unsupported/Eigen/src/AutoDiff/AutoDiffJacobian.h
@@ -10,47 +10,45 @@
 #ifndef EIGEN_AUTODIFF_JACOBIAN_H
 #define EIGEN_AUTODIFF_JACOBIAN_H
 
-namespace Eigen
-{
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-template<typename Functor> class AutoDiffJacobian : public Functor
-{
-public:
+namespace Eigen {
+
+template <typename Functor>
+class AutoDiffJacobian : public Functor {
+ public:
   AutoDiffJacobian() : Functor() {}
   AutoDiffJacobian(const Functor& f) : Functor(f) {}
 
   // forward constructors
-  template<typename T0>
-  AutoDiffJacobian(const T0& a0) : Functor(a0) {}
-  template<typename T0, typename T1>
-  AutoDiffJacobian(const T0& a0, const T1& a1) : Functor(a0, a1) {}
-  template<typename T0, typename T1, typename T2>
-  AutoDiffJacobian(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2) {}
-
-  enum {
-    InputsAtCompileTime = Functor::InputsAtCompileTime,
-    ValuesAtCompileTime = Functor::ValuesAtCompileTime
-  };
+  template <typename... T>
+  AutoDiffJacobian(const T&... Values) : Functor(Values...) {}
 
   typedef typename Functor::InputType InputType;
   typedef typename Functor::ValueType ValueType;
-  typedef typename Functor::JacobianType JacobianType;
-  typedef typename JacobianType::Scalar Scalar;
+  typedef typename ValueType::Scalar Scalar;
+
+  enum { InputsAtCompileTime = InputType::RowsAtCompileTime, ValuesAtCompileTime = ValueType::RowsAtCompileTime };
+
+  typedef Matrix<Scalar, ValuesAtCompileTime, InputsAtCompileTime> JacobianType;
   typedef typename JacobianType::Index Index;
 
-  typedef Matrix<Scalar,InputsAtCompileTime,1> DerivativeType;
+  typedef Matrix<Scalar, InputsAtCompileTime, 1> DerivativeType;
   typedef AutoDiffScalar<DerivativeType> ActiveScalar;
 
-
   typedef Matrix<ActiveScalar, InputsAtCompileTime, 1> ActiveInput;
   typedef Matrix<ActiveScalar, ValuesAtCompileTime, 1> ActiveValue;
 
-  void operator() (const InputType& x, ValueType* v, JacobianType* _jac=0) const
-  {
-    eigen_assert(v!=0);
-    if (!_jac)
-    {
-      Functor::operator()(x, v);
+  // Some compilers don't accept variadic parameters after a default parameter,
+  // i.e., we can't just write _jac=0 but we need to overload operator():
+  EIGEN_STRONG_INLINE void operator()(const InputType& x, ValueType* v) const { this->operator()(x, v, 0); }
+  template <typename... ParamsType>
+  void operator()(const InputType& x, ValueType* v, JacobianType* _jac, const ParamsType&... Params) const {
+    eigen_assert(v != 0);
+
+    if (!_jac) {
+      Functor::operator()(x, v, Params...);
       return;
     }
 
@@ -59,25 +57,20 @@ template<typename Functor> class AutoDiffJacobian : public Functor
     ActiveInput ax = x.template cast<ActiveScalar>();
     ActiveValue av(jac.rows());
 
-    if(InputsAtCompileTime==Dynamic)
-      for (Index j=0; j<jac.rows(); j++)
-        av[j].derivatives().resize(this->inputs());
+    if (InputsAtCompileTime == Dynamic)
+      for (Index j = 0; j < jac.rows(); j++) av[j].derivatives().resize(x.rows());
 
-    for (Index i=0; i<jac.cols(); i++)
-      ax[i].derivatives() = DerivativeType::Unit(this->inputs(),i);
+    for (Index i = 0; i < jac.cols(); i++) ax[i].derivatives() = DerivativeType::Unit(x.rows(), i);
 
-    Functor::operator()(ax, &av);
+    Functor::operator()(ax, &av, Params...);
 
-    for (Index i=0; i<jac.rows(); i++)
-    {
+    for (Index i = 0; i < jac.rows(); i++) {
       (*v)[i] = av[i].value();
       jac.row(i) = av[i].derivatives();
     }
   }
-protected:
-
 };
 
-}
+}  // namespace Eigen
 
-#endif // EIGEN_AUTODIFF_JACOBIAN_H
+#endif  // EIGEN_AUTODIFF_JACOBIAN_H
diff --git a/inst/include/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/inst/include/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
index fde3ff61..785cd4a5 100644
--- a/inst/include/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
+++ b/inst/include/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h
@@ -10,633 +10,630 @@
 #ifndef EIGEN_AUTODIFF_SCALAR_H
 #define EIGEN_AUTODIFF_SCALAR_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
 
-template<typename A, typename B>
-struct make_coherent_impl {
-  static void run(A&, B&) {}
+template <typename DerivativeType, bool Enable>
+struct auto_diff_special_op;
+
+template <typename DerivativeType, typename OtherDerivativeType, typename EnableIf = void>
+struct maybe_coherent_pad_helper {
+  static constexpr int SizeAtCompileTime =
+      max_size_prefer_dynamic(DerivativeType::SizeAtCompileTime, OtherDerivativeType::SizeAtCompileTime);
+  using type = CoherentPadOp<DerivativeType, SizeAtCompileTime>;
+  static type pad(const DerivativeType& x, const OtherDerivativeType& y) {
+    // CoherentPadOp uses variable_if_dynamic<SizeAtCompileTime>.  In this case, `SizeAtCompileTime` might
+    // by Dynamic, so we need to take the runtime maximum of x, y.
+    return CoherentPadOp<DerivativeType, SizeAtCompileTime>(x, numext::maxi(x.size(), y.size()));
+  }
 };
 
-// resize a to match b is a.size()==0, and conversely.
-template<typename A, typename B>
-void make_coherent(const A& a, const B&b)
-{
-  make_coherent_impl<A,B>::run(a.const_cast_derived(), b.const_cast_derived());
+// Both are fixed-sized and equal, don't need to pad.
+// Both are fixed-size and this is larger than other, don't need to pad.
+template <typename DerivativeType, typename OtherDerivativeType>
+struct maybe_coherent_pad_helper<
+    DerivativeType, OtherDerivativeType,
+    std::enable_if_t<enum_ge_not_dynamic(DerivativeType::SizeAtCompileTime, OtherDerivativeType::SizeAtCompileTime)>> {
+  using type = const DerivativeType&;
+  static const DerivativeType& pad(const DerivativeType& x, const OtherDerivativeType& /*y*/) { return x; }
+};
+
+template <typename DerivativeType, typename OtherDerivativeType>
+typename maybe_coherent_pad_helper<DerivativeType, OtherDerivativeType>::type MaybeCoherentPad(
+    const DerivativeType& x, const OtherDerivativeType& y) {
+  return maybe_coherent_pad_helper<DerivativeType, OtherDerivativeType>::pad(x, y);
 }
 
-template<typename _DerType, bool Enable> struct auto_diff_special_op;
+template <typename Op, typename LhsDerivativeType, typename RhsDerivativeType>
+auto MakeCoherentCwiseBinaryOp(const LhsDerivativeType& x, const RhsDerivativeType& y, Op op = Op()) {
+  const auto& lhs = MaybeCoherentPad(x, y);
+  const auto& rhs = MaybeCoherentPad(y, x);
+  return CwiseBinaryOp<Op, remove_all_t<decltype(lhs)>, remove_all_t<decltype(rhs)>>(lhs, rhs, op);
+}
+
+}  // namespace internal
 
-} // end namespace internal
+template <typename DerivativeType>
+class AutoDiffScalar;
+
+template <typename NewDerType>
+inline AutoDiffScalar<NewDerType> MakeAutoDiffScalar(const typename NewDerType::Scalar& value, const NewDerType& der) {
+  return AutoDiffScalar<NewDerType>(value, der);
+}
 
 /** \class AutoDiffScalar
-  * \brief A scalar type replacement with automatic differentation capability
-  *
-  * \param _DerType the vector type used to store/represent the derivatives. The base scalar type
-  *                 as well as the number of derivatives to compute are determined from this type.
-  *                 Typical choices include, e.g., \c Vector4f for 4 derivatives, or \c VectorXf
-  *                 if the number of derivatives is not known at compile time, and/or, the number
-  *                 of derivatives is large.
-  *                 Note that _DerType can also be a reference (e.g., \c VectorXf&) to wrap a
-  *                 existing vector into an AutoDiffScalar.
-  *                 Finally, _DerType can also be any Eigen compatible expression.
-  *
-  * This class represents a scalar value while tracking its respective derivatives using Eigen's expression
-  * template mechanism.
-  *
-  * It supports the following list of global math function:
-  *  - std::abs, std::sqrt, std::pow, std::exp, std::log, std::sin, std::cos,
-  *  - internal::abs, internal::sqrt, numext::pow, internal::exp, internal::log, internal::sin, internal::cos,
-  *  - internal::conj, internal::real, internal::imag, numext::abs2.
-  *
-  * AutoDiffScalar can be used as the scalar type of an Eigen::Matrix object. However,
-  * in that case, the expression template mechanism only occurs at the top Matrix level,
-  * while derivatives are computed right away.
-  *
-  */
-
-template<typename _DerType>
+ * \brief A scalar type replacement with automatic differentiation capability
+ *
+ * \param DerivativeType the vector type used to store/represent the derivatives. The base scalar type
+ *                 as well as the number of derivatives to compute are determined from this type.
+ *                 Typical choices include, e.g., \c Vector4f for 4 derivatives, or \c VectorXf
+ *                 if the number of derivatives is not known at compile time, and/or, the number
+ *                 of derivatives is large.
+ *                 Note that DerivativeType can also be a reference (e.g., \c VectorXf&) to wrap a
+ *                 existing vector into an AutoDiffScalar.
+ *                 Finally, DerivativeType can also be any Eigen compatible expression.
+ *
+ * This class represents a scalar value while tracking its respective derivatives using Eigen's expression
+ * template mechanism.
+ *
+ * It supports the following list of global math function:
+ *  - std::abs, std::sqrt, std::pow, std::exp, std::log, std::sin, std::cos,
+ *  - internal::abs, internal::sqrt, numext::pow, internal::exp, internal::log, internal::sin, internal::cos,
+ *  - internal::conj, internal::real, internal::imag, numext::abs2.
+ *
+ * AutoDiffScalar can be used as the scalar type of an Eigen::Matrix object. However,
+ * in that case, the expression template mechanism only occurs at the top Matrix level,
+ * while derivatives are computed right away.
+ *
+ */
+
+template <typename DerivativeType>
 class AutoDiffScalar
-  : public internal::auto_diff_special_op
-            <_DerType, !internal::is_same<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar,
-                                        typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value>
-{
-  public:
-    typedef internal::auto_diff_special_op
-            <_DerType, !internal::is_same<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar,
-                       typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value> Base;
-    typedef typename internal::remove_all<_DerType>::type DerType;
-    typedef typename internal::traits<DerType>::Scalar Scalar;
-    typedef typename NumTraits<Scalar>::Real Real;
-
-    using Base::operator+;
-    using Base::operator*;
-
-    /** Default constructor without any initialization. */
-    AutoDiffScalar() {}
-
-    /** Constructs an active scalar from its \a value,
-        and initializes the \a nbDer derivatives such that it corresponds to the \a derNumber -th variable */
-    AutoDiffScalar(const Scalar& value, int nbDer, int derNumber)
-      : m_value(value), m_derivatives(DerType::Zero(nbDer))
-    {
-      m_derivatives.coeffRef(derNumber) = Scalar(1);
-    }
+    : public internal::auto_diff_special_op<
+          DerivativeType, !internal::is_same<typename internal::traits<internal::remove_all_t<DerivativeType>>::Scalar,
+                                             typename NumTraits<typename internal::traits<
+                                                 internal::remove_all_t<DerivativeType>>::Scalar>::Real>::value> {
+ public:
+  typedef internal::auto_diff_special_op<
+      DerivativeType,
+      !internal::is_same<
+          typename internal::traits<internal::remove_all_t<DerivativeType>>::Scalar,
+          typename NumTraits<typename internal::traits<internal::remove_all_t<DerivativeType>>::Scalar>::Real>::value>
+      Base;
+  typedef internal::remove_all_t<DerivativeType> DerType;
+  typedef typename internal::traits<DerType>::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real Real;
 
-    /** Conversion from a scalar constant to an active scalar.
-      * The derivatives are set to zero. */
-    /*explicit*/ AutoDiffScalar(const Real& value)
-      : m_value(value)
-    {
-      if(m_derivatives.size()>0)
-        m_derivatives.setZero();
-    }
+  using Base::operator+;
+  using Base::operator*;
 
-    /** Constructs an active scalar from its \a value and derivatives \a der */
-    AutoDiffScalar(const Scalar& value, const DerType& der)
-      : m_value(value), m_derivatives(der)
-    {}
+  /** Default constructor without any initialization. */
+  AutoDiffScalar() {}
 
-    template<typename OtherDerType>
-    AutoDiffScalar(const AutoDiffScalar<OtherDerType>& other)
-      : m_value(other.value()), m_derivatives(other.derivatives())
-    {}
+  /** Constructs an active scalar from its \a value,
+      and initializes the \a nbDer derivatives such that it corresponds to the \a derNumber -th variable */
+  AutoDiffScalar(const Scalar& value, int nbDer, int derNumber) : m_value(value), m_derivatives(DerType::Zero(nbDer)) {
+    m_derivatives.coeffRef(derNumber) = Scalar(1);
+  }
 
-    friend  std::ostream & operator << (std::ostream & s, const AutoDiffScalar& a)
-    {
-      return s << a.value();
-    }
+  /** Conversion from a scalar constant to an active scalar.
+   * The derivatives are set to zero. */
+  /*explicit*/ AutoDiffScalar(const Real& value) : m_value(value) {
+    if (m_derivatives.size() > 0) m_derivatives.setZero();
+  }
 
-    AutoDiffScalar(const AutoDiffScalar& other)
-      : m_value(other.value()), m_derivatives(other.derivatives())
-    {}
+  /** Constructs an active scalar from its \a value and derivatives \a der */
+  AutoDiffScalar(const Scalar& value, const DerType& der) : m_value(value), m_derivatives(der) {}
+
+  template <typename OtherDerType>
+  AutoDiffScalar(
+      const AutoDiffScalar<OtherDerType>& other
+#ifndef EIGEN_PARSED_BY_DOXYGEN
+      ,
+      std::enable_if_t<
+          internal::is_same<Scalar, typename internal::traits<internal::remove_all_t<OtherDerType>>::Scalar>::value &&
+              internal::is_convertible<OtherDerType, DerType>::value,
+          void*> = 0
+#endif
+      )
+      : m_value(other.value()), m_derivatives(other.derivatives()) {
+  }
 
-    template<typename OtherDerType>
-    inline AutoDiffScalar& operator=(const AutoDiffScalar<OtherDerType>& other)
-    {
-      m_value = other.value();
-      m_derivatives = other.derivatives();
-      return *this;
-    }
+  friend std::ostream& operator<<(std::ostream& s, const AutoDiffScalar& a) { return s << a.value(); }
 
-    inline AutoDiffScalar& operator=(const AutoDiffScalar& other)
-    {
-      m_value = other.value();
-      m_derivatives = other.derivatives();
-      return *this;
-    }
+  AutoDiffScalar(const AutoDiffScalar& other) : m_value(other.value()), m_derivatives(other.derivatives()) {}
 
-//     inline operator const Scalar& () const { return m_value; }
-//     inline operator Scalar& () { return m_value; }
-
-    inline const Scalar& value() const { return m_value; }
-    inline Scalar& value() { return m_value; }
-
-    inline const DerType& derivatives() const { return m_derivatives; }
-    inline DerType& derivatives() { return m_derivatives; }
-
-    inline bool operator< (const Scalar& other) const  { return m_value <  other; }
-    inline bool operator<=(const Scalar& other) const  { return m_value <= other; }
-    inline bool operator> (const Scalar& other) const  { return m_value >  other; }
-    inline bool operator>=(const Scalar& other) const  { return m_value >= other; }
-    inline bool operator==(const Scalar& other) const  { return m_value == other; }
-    inline bool operator!=(const Scalar& other) const  { return m_value != other; }
-
-    friend inline bool operator< (const Scalar& a, const AutoDiffScalar& b) { return a <  b.value(); }
-    friend inline bool operator<=(const Scalar& a, const AutoDiffScalar& b) { return a <= b.value(); }
-    friend inline bool operator> (const Scalar& a, const AutoDiffScalar& b) { return a >  b.value(); }
-    friend inline bool operator>=(const Scalar& a, const AutoDiffScalar& b) { return a >= b.value(); }
-    friend inline bool operator==(const Scalar& a, const AutoDiffScalar& b) { return a == b.value(); }
-    friend inline bool operator!=(const Scalar& a, const AutoDiffScalar& b) { return a != b.value(); }
-
-    template<typename OtherDerType> inline bool operator< (const AutoDiffScalar<OtherDerType>& b) const  { return m_value <  b.value(); }
-    template<typename OtherDerType> inline bool operator<=(const AutoDiffScalar<OtherDerType>& b) const  { return m_value <= b.value(); }
-    template<typename OtherDerType> inline bool operator> (const AutoDiffScalar<OtherDerType>& b) const  { return m_value >  b.value(); }
-    template<typename OtherDerType> inline bool operator>=(const AutoDiffScalar<OtherDerType>& b) const  { return m_value >= b.value(); }
-    template<typename OtherDerType> inline bool operator==(const AutoDiffScalar<OtherDerType>& b) const  { return m_value == b.value(); }
-    template<typename OtherDerType> inline bool operator!=(const AutoDiffScalar<OtherDerType>& b) const  { return m_value != b.value(); }
-
-    inline const AutoDiffScalar<DerType&> operator+(const Scalar& other) const
-    {
-      return AutoDiffScalar<DerType&>(m_value + other, m_derivatives);
-    }
+  template <typename OtherDerType>
+  inline AutoDiffScalar& operator=(const AutoDiffScalar<OtherDerType>& other) {
+    m_value = other.value();
+    m_derivatives = other.derivatives();
+    return *this;
+  }
 
-    friend inline const AutoDiffScalar<DerType&> operator+(const Scalar& a, const AutoDiffScalar& b)
-    {
-      return AutoDiffScalar<DerType&>(a + b.value(), b.derivatives());
-    }
+  inline AutoDiffScalar& operator=(const AutoDiffScalar& other) {
+    m_value = other.value();
+    m_derivatives = other.derivatives();
+    return *this;
+  }
 
-//     inline const AutoDiffScalar<DerType&> operator+(const Real& other) const
-//     {
-//       return AutoDiffScalar<DerType&>(m_value + other, m_derivatives);
-//     }
+  inline AutoDiffScalar& operator=(const Scalar& other) {
+    m_value = other;
+    if (m_derivatives.size() > 0) m_derivatives.setZero();
+    return *this;
+  }
 
-//     friend inline const AutoDiffScalar<DerType&> operator+(const Real& a, const AutoDiffScalar& b)
-//     {
-//       return AutoDiffScalar<DerType&>(a + b.value(), b.derivatives());
-//     }
+  //     inline operator const Scalar& () const { return m_value; }
+  //     inline operator Scalar& () { return m_value; }
 
-    inline AutoDiffScalar& operator+=(const Scalar& other)
-    {
-      value() += other;
-      return *this;
-    }
+  inline const Scalar& value() const { return m_value; }
+  inline Scalar& value() { return m_value; }
 
-    template<typename OtherDerType>
-    inline const AutoDiffScalar<CwiseBinaryOp<internal::scalar_sum_op<Scalar>,const DerType,const typename internal::remove_all<OtherDerType>::type> >
-    operator+(const AutoDiffScalar<OtherDerType>& other) const
-    {
-      internal::make_coherent(m_derivatives, other.derivatives());
-      return AutoDiffScalar<CwiseBinaryOp<internal::scalar_sum_op<Scalar>,const DerType,const typename internal::remove_all<OtherDerType>::type> >(
-        m_value + other.value(),
-        m_derivatives + other.derivatives());
-    }
+  inline const DerType& derivatives() const { return m_derivatives; }
+  inline DerType& derivatives() { return m_derivatives; }
 
-    template<typename OtherDerType>
-    inline AutoDiffScalar&
-    operator+=(const AutoDiffScalar<OtherDerType>& other)
-    {
-      (*this) = (*this) + other;
-      return *this;
-    }
+  inline bool operator<(const Scalar& other) const { return m_value < other; }
+  inline bool operator<=(const Scalar& other) const { return m_value <= other; }
+  inline bool operator>(const Scalar& other) const { return m_value > other; }
+  inline bool operator>=(const Scalar& other) const { return m_value >= other; }
+  inline bool operator==(const Scalar& other) const { return m_value == other; }
+  inline bool operator!=(const Scalar& other) const { return m_value != other; }
 
-    inline const AutoDiffScalar<DerType&> operator-(const Scalar& b) const
-    {
-      return AutoDiffScalar<DerType&>(m_value - b, m_derivatives);
-    }
+  friend inline bool operator<(const Scalar& a, const AutoDiffScalar& b) { return a < b.value(); }
+  friend inline bool operator<=(const Scalar& a, const AutoDiffScalar& b) { return a <= b.value(); }
+  friend inline bool operator>(const Scalar& a, const AutoDiffScalar& b) { return a > b.value(); }
+  friend inline bool operator>=(const Scalar& a, const AutoDiffScalar& b) { return a >= b.value(); }
+  friend inline bool operator==(const Scalar& a, const AutoDiffScalar& b) { return a == b.value(); }
+  friend inline bool operator!=(const Scalar& a, const AutoDiffScalar& b) { return a != b.value(); }
 
-    friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType> >
-    operator-(const Scalar& a, const AutoDiffScalar& b)
-    {
-      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType> >
-            (a - b.value(), -b.derivatives());
-    }
+  template <typename OtherDerType>
+  inline bool operator<(const AutoDiffScalar<OtherDerType>& b) const {
+    return m_value < b.value();
+  }
+  template <typename OtherDerType>
+  inline bool operator<=(const AutoDiffScalar<OtherDerType>& b) const {
+    return m_value <= b.value();
+  }
+  template <typename OtherDerType>
+  inline bool operator>(const AutoDiffScalar<OtherDerType>& b) const {
+    return m_value > b.value();
+  }
+  template <typename OtherDerType>
+  inline bool operator>=(const AutoDiffScalar<OtherDerType>& b) const {
+    return m_value >= b.value();
+  }
+  template <typename OtherDerType>
+  inline bool operator==(const AutoDiffScalar<OtherDerType>& b) const {
+    return m_value == b.value();
+  }
+  template <typename OtherDerType>
+  inline bool operator!=(const AutoDiffScalar<OtherDerType>& b) const {
+    return m_value != b.value();
+  }
 
-    inline AutoDiffScalar& operator-=(const Scalar& other)
-    {
-      value() -= other;
-      return *this;
-    }
+  inline AutoDiffScalar<DerType&> operator+(const Scalar& other) const {
+    return AutoDiffScalar<DerType&>(m_value + other, m_derivatives);
+  }
 
-    template<typename OtherDerType>
-    inline const AutoDiffScalar<CwiseBinaryOp<internal::scalar_difference_op<Scalar>, const DerType,const typename internal::remove_all<OtherDerType>::type> >
-    operator-(const AutoDiffScalar<OtherDerType>& other) const
-    {
-      internal::make_coherent(m_derivatives, other.derivatives());
-      return AutoDiffScalar<CwiseBinaryOp<internal::scalar_difference_op<Scalar>, const DerType,const typename internal::remove_all<OtherDerType>::type> >(
-        m_value - other.value(),
-        m_derivatives - other.derivatives());
-    }
+  friend inline AutoDiffScalar<DerType&> operator+(const Scalar& a, const AutoDiffScalar& b) {
+    return AutoDiffScalar<DerType&>(a + b.value(), b.derivatives());
+  }
 
-    template<typename OtherDerType>
-    inline AutoDiffScalar&
-    operator-=(const AutoDiffScalar<OtherDerType>& other)
-    {
-      *this = *this - other;
-      return *this;
-    }
+  //     inline const AutoDiffScalar<DerType&> operator+(const Real& other) const
+  //     {
+  //       return AutoDiffScalar<DerType&>(m_value + other, m_derivatives);
+  //     }
 
-    inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType> >
-    operator-() const
-    {
-      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType> >(
-        -m_value,
-        -m_derivatives);
-    }
+  //     friend inline const AutoDiffScalar<DerType&> operator+(const Real& a, const AutoDiffScalar& b)
+  //     {
+  //       return AutoDiffScalar<DerType&>(a + b.value(), b.derivatives());
+  //     }
 
-    inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
-    operator*(const Scalar& other) const
-    {
-      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
-        m_value * other,
-        (m_derivatives * other));
-    }
+  inline AutoDiffScalar& operator+=(const Scalar& other) {
+    value() += other;
+    return *this;
+  }
 
-    friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
-    operator*(const Scalar& other, const AutoDiffScalar& a)
-    {
-      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
-        a.value() * other,
-        a.derivatives() * other);
-    }
+  template <typename OtherDerType>
+  inline auto operator+(const AutoDiffScalar<OtherDerType>& other) const {
+    return MakeAutoDiffScalar(
+        m_value + other.value(),
+        internal::MakeCoherentCwiseBinaryOp<internal::scalar_sum_op<Scalar>>(m_derivatives, other.derivatives()));
+  }
 
-//     inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
-//     operator*(const Real& other) const
-//     {
-//       return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
-//         m_value * other,
-//         (m_derivatives * other));
-//     }
-//
-//     friend inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
-//     operator*(const Real& other, const AutoDiffScalar& a)
-//     {
-//       return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
-//         a.value() * other,
-//         a.derivatives() * other);
-//     }
-
-    inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
-    operator/(const Scalar& other) const
-    {
-      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
-        m_value / other,
-        (m_derivatives * (Scalar(1)/other)));
-    }
+  template <typename OtherDerType>
+  inline AutoDiffScalar& operator+=(const AutoDiffScalar<OtherDerType>& other) {
+    (*this) = (*this) + other;
+    return *this;
+  }
 
-    friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >
-    operator/(const Scalar& other, const AutoDiffScalar& a)
-    {
-      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >(
-        other / a.value(),
-        a.derivatives() * (Scalar(-other) / (a.value()*a.value())));
-    }
+  inline AutoDiffScalar<DerType&> operator-(const Scalar& b) const {
+    return AutoDiffScalar<DerType&>(m_value - b, m_derivatives);
+  }
 
-//     inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
-//     operator/(const Real& other) const
-//     {
-//       return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
-//         m_value / other,
-//         (m_derivatives * (Real(1)/other)));
-//     }
-//
-//     friend inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
-//     operator/(const Real& other, const AutoDiffScalar& a)
-//     {
-//       return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
-//         other / a.value(),
-//         a.derivatives() * (-Real(1)/other));
-//     }
-
-    template<typename OtherDerType>
-    inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,
-        const CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
-          const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
-          const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > > >
-    operator/(const AutoDiffScalar<OtherDerType>& other) const
-    {
-      internal::make_coherent(m_derivatives, other.derivatives());
-      return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>,
-        const CwiseBinaryOp<internal::scalar_difference_op<Scalar>,
-          const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
-          const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > > >(
-        m_value / other.value(),
-          ((m_derivatives * other.value()) - (m_value * other.derivatives()))
-        * (Scalar(1)/(other.value()*other.value())));
-    }
+  friend inline AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType>> operator-(
+      const Scalar& a, const AutoDiffScalar& b) {
+    return AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType>>(a - b.value(),
+                                                                                             -b.derivatives());
+  }
 
-    template<typename OtherDerType>
-    inline const AutoDiffScalar<CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
-        const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
-        const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type> > >
-    operator*(const AutoDiffScalar<OtherDerType>& other) const
-    {
-      internal::make_coherent(m_derivatives, other.derivatives());
-      return AutoDiffScalar<const CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
-        const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>,
-        const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > >(
-        m_value * other.value(),
-        (m_derivatives * other.value()) + (m_value * other.derivatives()));
-    }
+  inline AutoDiffScalar& operator-=(const Scalar& other) {
+    value() -= other;
+    return *this;
+  }
 
-    inline AutoDiffScalar& operator*=(const Scalar& other)
-    {
-      *this = *this * other;
-      return *this;
-    }
+  template <typename OtherDerType>
+  inline auto operator-(const AutoDiffScalar<OtherDerType>& other) const {
+    return MakeAutoDiffScalar(m_value - other.value(),
+                              internal::MakeCoherentCwiseBinaryOp<internal::scalar_difference_op<Scalar>>(
+                                  m_derivatives, other.derivatives()));
+  }
 
-    template<typename OtherDerType>
-    inline AutoDiffScalar& operator*=(const AutoDiffScalar<OtherDerType>& other)
-    {
-      *this = *this * other;
-      return *this;
-    }
+  template <typename OtherDerType>
+  inline AutoDiffScalar& operator-=(const AutoDiffScalar<OtherDerType>& other) {
+    *this = *this - other;
+    return *this;
+  }
 
-    inline AutoDiffScalar& operator/=(const Scalar& other)
-    {
-      *this = *this / other;
-      return *this;
-    }
+  inline AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType>> operator-() const {
+    return AutoDiffScalar<CwiseUnaryOp<internal::scalar_opposite_op<Scalar>, const DerType>>(-m_value, -m_derivatives);
+  }
 
-    template<typename OtherDerType>
-    inline AutoDiffScalar& operator/=(const AutoDiffScalar<OtherDerType>& other)
-    {
-      *this = *this / other;
-      return *this;
-    }
+  inline auto operator*(const Scalar& other) const {
+    return MakeAutoDiffScalar(m_value * other, m_derivatives * other);
+  }
+
+  friend inline auto operator*(const Scalar& other, const AutoDiffScalar& a) {
+    return MakeAutoDiffScalar(a.value() * other, a.derivatives() * other);
+  }
+
+  //     inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
+  //     operator*(const Real& other) const
+  //     {
+  //       return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
+  //         m_value * other,
+  //         (m_derivatives * other));
+  //     }
+  //
+  //     friend inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
+  //     operator*(const Real& other, const AutoDiffScalar& a)
+  //     {
+  //       return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
+  //         a.value() * other,
+  //         a.derivatives() * other);
+  //     }
+
+  inline auto operator/(const Scalar& other) const {
+    return MakeAutoDiffScalar(m_value / other, (m_derivatives * (Scalar(1) / other)));
+  }
+
+  friend inline auto operator/(const Scalar& other, const AutoDiffScalar& a) {
+    return MakeAutoDiffScalar(other / a.value(), a.derivatives() * (Scalar(-other) / (a.value() * a.value())));
+  }
 
-  protected:
-    Scalar m_value;
-    DerType m_derivatives;
+  //     inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
+  //     operator/(const Real& other) const
+  //     {
+  //       return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
+  //         m_value / other,
+  //         (m_derivatives * (Real(1)/other)));
+  //     }
+  //
+  //     friend inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >
+  //     operator/(const Real& other, const AutoDiffScalar& a)
+  //     {
+  //       return AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type >(
+  //         other / a.value(),
+  //         a.derivatives() * (-Real(1)/other));
+  //     }
+
+  template <typename OtherDerType>
+  inline auto operator/(const AutoDiffScalar<OtherDerType>& other) const {
+    return MakeAutoDiffScalar(m_value / other.value(),
+                              internal::MakeCoherentCwiseBinaryOp<internal::scalar_difference_op<Scalar>>(
+                                  m_derivatives * other.value(), (other.derivatives() * m_value)) *
+                                  (Scalar(1) / (other.value() * other.value())));
+  }
+
+  template <typename OtherDerType>
+  inline auto operator*(const AutoDiffScalar<OtherDerType>& other) const {
+    return MakeAutoDiffScalar(m_value * other.value(),
+                              internal::MakeCoherentCwiseBinaryOp<internal::scalar_sum_op<Scalar>>(
+                                  m_derivatives * other.value(), other.derivatives() * m_value));
+  }
+
+  inline AutoDiffScalar& operator*=(const Scalar& other) {
+    *this = *this * other;
+    return *this;
+  }
+
+  template <typename OtherDerType>
+  inline AutoDiffScalar& operator*=(const AutoDiffScalar<OtherDerType>& other) {
+    *this = *this * other;
+    return *this;
+  }
 
+  inline AutoDiffScalar& operator/=(const Scalar& other) {
+    *this = *this / other;
+    return *this;
+  }
+
+  template <typename OtherDerType>
+  inline AutoDiffScalar& operator/=(const AutoDiffScalar<OtherDerType>& other) {
+    *this = *this / other;
+    return *this;
+  }
+
+ protected:
+  Scalar m_value;
+  DerType m_derivatives;
 };
 
 namespace internal {
 
-template<typename _DerType>
-struct auto_diff_special_op<_DerType, true>
-//   : auto_diff_scalar_op<_DerType, typename NumTraits<Scalar>::Real,
+template <typename DerivativeType>
+struct auto_diff_special_op<DerivativeType, true>
+//   : auto_diff_scalar_op<DerivativeType, typename NumTraits<Scalar>::Real,
 //                            is_same<Scalar,typename NumTraits<Scalar>::Real>::value>
 {
-  typedef typename remove_all<_DerType>::type DerType;
+  typedef remove_all_t<DerivativeType> DerType;
   typedef typename traits<DerType>::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real Real;
 
-//   typedef auto_diff_scalar_op<_DerType, typename NumTraits<Scalar>::Real,
-//                            is_same<Scalar,typename NumTraits<Scalar>::Real>::value> Base;
-
-//   using Base::operator+;
-//   using Base::operator+=;
-//   using Base::operator-;
-//   using Base::operator-=;
-//   using Base::operator*;
-//   using Base::operator*=;
+  //   typedef auto_diff_scalar_op<DerivativeType, typename NumTraits<Scalar>::Real,
+  //                            is_same<Scalar,typename NumTraits<Scalar>::Real>::value> Base;
 
-  const AutoDiffScalar<_DerType>& derived() const { return *static_cast<const AutoDiffScalar<_DerType>*>(this); }
-  AutoDiffScalar<_DerType>& derived() { return *static_cast<AutoDiffScalar<_DerType>*>(this); }
+  //   using Base::operator+;
+  //   using Base::operator+=;
+  //   using Base::operator-;
+  //   using Base::operator-=;
+  //   using Base::operator*;
+  //   using Base::operator*=;
 
+  const AutoDiffScalar<DerivativeType>& derived() const {
+    return *static_cast<const AutoDiffScalar<DerivativeType>*>(this);
+  }
+  AutoDiffScalar<DerivativeType>& derived() { return *static_cast<AutoDiffScalar<DerivativeType>*>(this); }
 
-  inline const AutoDiffScalar<DerType&> operator+(const Real& other) const
-  {
+  inline AutoDiffScalar<DerType&> operator+(const Real& other) const {
     return AutoDiffScalar<DerType&>(derived().value() + other, derived().derivatives());
   }
 
-  friend inline const AutoDiffScalar<DerType&> operator+(const Real& a, const AutoDiffScalar<_DerType>& b)
-  {
+  friend inline AutoDiffScalar<DerType&> operator+(const Real& a, const AutoDiffScalar<DerivativeType>& b) {
     return AutoDiffScalar<DerType&>(a + b.value(), b.derivatives());
   }
 
-  inline AutoDiffScalar<_DerType>& operator+=(const Real& other)
-  {
+  inline AutoDiffScalar<DerivativeType>& operator+=(const Real& other) {
     derived().value() += other;
     return derived();
   }
 
-
-  inline const AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >
-  operator*(const Real& other) const
-  {
-    return AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >(
-      derived().value() * other,
-      derived().derivatives() * other);
+  inline AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar, Real>>, DerType>::Type> operator*(
+      const Real& other) const {
+    return AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar, Real>>, DerType>::Type>(
+        derived().value() * other, derived().derivatives() * other);
   }
 
-  friend inline const AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >
-  operator*(const Real& other, const AutoDiffScalar<_DerType>& a)
-  {
-    return AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >(
-      a.value() * other,
-      a.derivatives() * other);
+  friend inline AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real, Scalar>>, DerType>::Type>
+  operator*(const Real& other, const AutoDiffScalar<DerivativeType>& a) {
+    return AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real, Scalar>>, DerType>::Type>(
+        a.value() * other, a.derivatives() * other);
   }
 
-  inline AutoDiffScalar<_DerType>& operator*=(const Scalar& other)
-  {
+  inline AutoDiffScalar<DerivativeType>& operator*=(const Scalar& other) {
     *this = *this * other;
     return derived();
   }
 };
 
-template<typename _DerType>
-struct auto_diff_special_op<_DerType, false>
-{
+template <typename DerivativeType>
+struct auto_diff_special_op<DerivativeType, false> {
   void operator*() const;
   void operator-() const;
   void operator+() const;
 };
 
-template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols, typename B>
-struct make_coherent_impl<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols>, B> {
-  typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> A;
-  static void run(A& a, B& b) {
-    if((A_Rows==Dynamic || A_Cols==Dynamic) && (a.size()==0))
-    {
-      a.resize(b.size());
-      a.setZero();
-    }
-  }
+}  // end namespace internal
+
+template <typename DerType, typename BinOp>
+struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>, typename DerType::Scalar, BinOp> {
+  typedef AutoDiffScalar<DerType> ReturnType;
 };
 
-template<typename A, typename B_Scalar, int B_Rows, int B_Cols, int B_Options, int B_MaxRows, int B_MaxCols>
-struct make_coherent_impl<A, Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRows, B_MaxCols> > {
-  typedef Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRows, B_MaxCols> B;
-  static void run(A& a, B& b) {
-    if((B_Rows==Dynamic || B_Cols==Dynamic) && (b.size()==0))
-    {
-      b.resize(a.size());
-      b.setZero();
-    }
-  }
+template <typename DerType, typename BinOp>
+struct ScalarBinaryOpTraits<typename DerType::Scalar, AutoDiffScalar<DerType>, BinOp> {
+  typedef AutoDiffScalar<DerType> ReturnType;
 };
 
-template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols,
-         typename B_Scalar, int B_Rows, int B_Cols, int B_Options, int B_MaxRows, int B_MaxCols>
-struct make_coherent_impl<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols>,
-                             Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRows, B_MaxCols> > {
-  typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> A;
-  typedef Matrix<B_Scalar, B_Rows, B_Cols, B_Options, B_MaxRows, B_MaxCols> B;
-  static void run(A& a, B& b) {
-    if((A_Rows==Dynamic || A_Cols==Dynamic) && (a.size()==0))
-    {
-      a.resize(b.size());
-      a.setZero();
-    }
-    else if((B_Rows==Dynamic || B_Cols==Dynamic) && (b.size()==0))
-    {
-      b.resize(a.size());
-      b.setZero();
-    }
+// The following is an attempt to let Eigen's known about expression template, but that's more tricky!
+
+// template<typename DerType, typename BinOp>
+// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>,AutoDiffScalar<DerType>, BinOp>
+// {
+//   enum { Defined = 1 };
+//   typedef AutoDiffScalar<typename DerType::PlainObject> ReturnType;
+// };
+//
+// template<typename DerType1,typename DerType2, typename BinOp>
+// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType1>,AutoDiffScalar<DerType2>, BinOp>
+// {
+//   enum { Defined = 1 };//internal::is_same<typename DerType1::Scalar,typename DerType2::Scalar>::value };
+//   typedef AutoDiffScalar<typename DerType1::PlainObject> ReturnType;
+// };
+
+#define EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(FUNC, CODE)                                              \
+  template <typename DerType>                                                                        \
+  inline auto FUNC(const Eigen::AutoDiffScalar<DerType>& x) {                                        \
+    using namespace Eigen;                                                                           \
+    typedef typename Eigen::internal::traits<Eigen::internal::remove_all_t<DerType>>::Scalar Scalar; \
+    EIGEN_UNUSED_VARIABLE(sizeof(Scalar));                                                           \
+    CODE;                                                                                            \
   }
-};
 
-template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols>
-struct scalar_product_traits<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols>,A_Scalar>
-{
-  enum { Defined = 1 };
-  typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> ReturnType;
+template <typename DerType>
+struct CleanedUpDerType {
+  typedef AutoDiffScalar<typename Eigen::internal::remove_all_t<DerType>::PlainObject> type;
 };
 
-template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols>
-struct scalar_product_traits<A_Scalar, Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> >
-{
-  enum { Defined = 1 };
-  typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> ReturnType;
-};
+template <typename DerType>
+inline const AutoDiffScalar<DerType>& conj(const AutoDiffScalar<DerType>& x) {
+  return x;
+}
+template <typename DerType>
+inline const AutoDiffScalar<DerType>& real(const AutoDiffScalar<DerType>& x) {
+  return x;
+}
+template <typename DerType>
+inline typename DerType::Scalar imag(const AutoDiffScalar<DerType>&) {
+  return 0.;
+}
+template <typename DerType, typename T>
+inline typename CleanedUpDerType<DerType>::type(min)(const AutoDiffScalar<DerType>& x, const T& y) {
+  typedef typename CleanedUpDerType<DerType>::type ADS;
+  return (x <= y ? ADS(x) : ADS(y));
+}
+template <typename DerType, typename T>
+inline typename CleanedUpDerType<DerType>::type(max)(const AutoDiffScalar<DerType>& x, const T& y) {
+  typedef typename CleanedUpDerType<DerType>::type ADS;
+  return (x >= y ? ADS(x) : ADS(y));
+}
+template <typename DerType, typename T>
+inline typename CleanedUpDerType<DerType>::type(min)(const T& x, const AutoDiffScalar<DerType>& y) {
+  typedef typename CleanedUpDerType<DerType>::type ADS;
+  return (x < y ? ADS(x) : ADS(y));
+}
+template <typename DerType, typename T>
+inline typename CleanedUpDerType<DerType>::type(max)(const T& x, const AutoDiffScalar<DerType>& y) {
+  typedef typename CleanedUpDerType<DerType>::type ADS;
+  return (x > y ? ADS(x) : ADS(y));
+}
+template <typename DerType>
+inline
+    typename CleanedUpDerType<DerType>::type(min)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+  return (x.value() < y.value() ? x : y);
+}
+template <typename DerType>
+inline
+    typename CleanedUpDerType<DerType>::type(max)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) {
+  return (x.value() >= y.value() ? x : y);
+}
 
-template<typename DerType>
-struct scalar_product_traits<AutoDiffScalar<DerType>,typename DerType::Scalar>
-{
-  enum { Defined = 1 };
-  typedef AutoDiffScalar<DerType> ReturnType;
-};
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs, using std::abs;
+                                    return Eigen::MakeAutoDiffScalar(abs(x.value()),
+                                                                     x.derivatives() * (x.value() < 0 ? -1 : 1));)
 
-template<typename DerType>
-struct scalar_product_traits<typename DerType::Scalar,AutoDiffScalar<DerType> >
-{
-  enum { Defined = 1 };
-  typedef AutoDiffScalar<DerType> ReturnType;
-};
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs2, using numext::abs2;
+                                    return Eigen::MakeAutoDiffScalar(abs2(x.value()),
+                                                                     x.derivatives() * (Scalar(2) * x.value()));)
 
-} // end namespace internal
-
-#define EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(FUNC,CODE) \
-  template<typename DerType> \
-  inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::internal::scalar_multiple_op<typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar>, const typename Eigen::internal::remove_all<DerType>::type> > \
-  FUNC(const Eigen::AutoDiffScalar<DerType>& x) { \
-    using namespace Eigen; \
-    typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \
-    typedef AutoDiffScalar<CwiseUnaryOp<Eigen::internal::scalar_multiple_op<Scalar>, const typename Eigen::internal::remove_all<DerType>::type> > ReturnType; \
-    CODE; \
-  }
-
-template<typename DerType>
-inline const AutoDiffScalar<DerType>& conj(const AutoDiffScalar<DerType>& x)  { return x; }
-template<typename DerType>
-inline const AutoDiffScalar<DerType>& real(const AutoDiffScalar<DerType>& x)  { return x; }
-template<typename DerType>
-inline typename DerType::Scalar imag(const AutoDiffScalar<DerType>&)    { return 0.; }
-template<typename DerType, typename T>
-inline AutoDiffScalar<DerType> (min)(const AutoDiffScalar<DerType>& x, const T& y)    { return (x <= y ? x : y); }
-template<typename DerType, typename T>
-inline AutoDiffScalar<DerType> (max)(const AutoDiffScalar<DerType>& x, const T& y)    { return (x >= y ? x : y); }
-template<typename DerType, typename T>
-inline AutoDiffScalar<DerType> (min)(const T& x, const AutoDiffScalar<DerType>& y)    { return (x < y ? x : y); }
-template<typename DerType, typename T>
-inline AutoDiffScalar<DerType> (max)(const T& x, const AutoDiffScalar<DerType>& y)    { return (x > y ? x : y); }
-
-EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs,
-  using std::abs;
-  return ReturnType(abs(x.value()), x.derivatives() * (x.value()<0 ? -1 : 1) );)
-
-EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs2,
-  using numext::abs2;
-  return ReturnType(abs2(x.value()), x.derivatives() * (Scalar(2)*x.value()));)
-
-EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sqrt,
-  using std::sqrt;
-  Scalar sqrtx = sqrt(x.value());
-  return ReturnType(sqrtx,x.derivatives() * (Scalar(0.5) / sqrtx));)
-
-EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cos,
-  using std::cos;
-  using std::sin;
-  return ReturnType(cos(x.value()), x.derivatives() * (-sin(x.value())));)
-
-EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sin,
-  using std::sin;
-  using std::cos;
-  return ReturnType(sin(x.value()),x.derivatives() * cos(x.value()));)
-
-EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(exp,
-  using std::exp;
-  Scalar expx = exp(x.value());
-  return ReturnType(expx,x.derivatives() * expx);)
-
-EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(log,
-  using std::log;
-  return ReturnType(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));)
-
-template<typename DerType>
-inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::internal::scalar_multiple_op<typename Eigen::internal::traits<DerType>::Scalar>, const DerType> >
-pow(const Eigen::AutoDiffScalar<DerType>& x, typename Eigen::internal::traits<DerType>::Scalar y)
-{
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sqrt, using std::sqrt; Scalar sqrtx = sqrt(x.value());
+                                    return Eigen::MakeAutoDiffScalar(sqrtx, x.derivatives() * (Scalar(0.5) / sqrtx));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cos, using std::cos; using std::sin;
+                                    return Eigen::MakeAutoDiffScalar(cos(x.value()),
+                                                                     x.derivatives() * (-sin(x.value())));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sin, using std::sin; using std::cos;
+                                    return Eigen::MakeAutoDiffScalar(sin(x.value()), x.derivatives() * cos(x.value()));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(exp, using std::exp; Scalar expx = exp(x.value());
+                                    return Eigen::MakeAutoDiffScalar(expx, x.derivatives() * expx);)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(log, using std::log;
+                                    return Eigen::MakeAutoDiffScalar(log(x.value()),
+                                                                     x.derivatives() * (Scalar(1) / x.value()));)
+
+template <typename DerType>
+inline auto pow(const Eigen::AutoDiffScalar<DerType>& x,
+                const typename internal::traits<internal::remove_all_t<DerType>>::Scalar& y) {
   using namespace Eigen;
-  typedef typename Eigen::internal::traits<DerType>::Scalar Scalar;
-  return AutoDiffScalar<CwiseUnaryOp<Eigen::internal::scalar_multiple_op<Scalar>, const DerType> >(
-    std::pow(x.value(),y),
-    x.derivatives() * (y * std::pow(x.value(),y-1)));
+  using std::pow;
+  return Eigen::MakeAutoDiffScalar(pow(x.value(), y), x.derivatives() * (y * pow(x.value(), y - 1)));
 }
 
-
-template<typename DerTypeA,typename DerTypeB>
-inline const AutoDiffScalar<Matrix<typename internal::traits<DerTypeA>::Scalar,Dynamic,1> >
-atan2(const AutoDiffScalar<DerTypeA>& a, const AutoDiffScalar<DerTypeB>& b)
-{
+template <typename DerTypeA, typename DerTypeB>
+inline AutoDiffScalar<Matrix<typename internal::traits<internal::remove_all_t<DerTypeA>>::Scalar, Dynamic, 1>> atan2(
+    const AutoDiffScalar<DerTypeA>& a, const AutoDiffScalar<DerTypeB>& b) {
   using std::atan2;
-  using std::max;
-  typedef typename internal::traits<DerTypeA>::Scalar Scalar;
-  typedef AutoDiffScalar<Matrix<Scalar,Dynamic,1> > PlainADS;
+  typedef typename internal::traits<internal::remove_all_t<DerTypeA>>::Scalar Scalar;
+  typedef AutoDiffScalar<Matrix<Scalar, Dynamic, 1>> PlainADS;
   PlainADS ret;
   ret.value() = atan2(a.value(), b.value());
-  
-  Scalar tmp2 = a.value() * a.value();
-  Scalar tmp3 = b.value() * b.value();
-  Scalar tmp4 = tmp3/(tmp2+tmp3);
-  
-  if (tmp4!=0)
-    ret.derivatives() = (a.derivatives() * b.value() - a.value() * b.derivatives()) * (tmp2+tmp3);
+
+  Scalar squared_hypot = a.value() * a.value() + b.value() * b.value();
+
+  // if (squared_hypot==0) the derivation is undefined and the following results in a NaN:
+  ret.derivatives() = (a.derivatives() * b.value() - a.value() * b.derivatives()) / squared_hypot;
 
   return ret;
 }
 
-EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tan,
-  using std::tan;
-  using std::cos;
-  return ReturnType(tan(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cos(x.value()))));)
-
-EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(asin,
-  using std::sqrt;
-  using std::asin;
-  return ReturnType(asin(x.value()),x.derivatives() * (Scalar(1)/sqrt(1-numext::abs2(x.value()))));)
-  
-EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(acos,
-  using std::sqrt;
-  using std::acos;
-  return ReturnType(acos(x.value()),x.derivatives() * (Scalar(-1)/sqrt(1-numext::abs2(x.value()))));)
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tan, using std::tan; using std::cos; return Eigen::MakeAutoDiffScalar(
+                                        tan(x.value()), x.derivatives() * (Scalar(1) / numext::abs2(cos(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(asin, using std::sqrt; using std::asin; return Eigen::MakeAutoDiffScalar(
+                                        asin(x.value()),
+                                        x.derivatives() * (Scalar(1) / sqrt(1 - numext::abs2(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(acos, using std::sqrt; using std::acos; return Eigen::MakeAutoDiffScalar(
+                                        acos(x.value()),
+                                        x.derivatives() * (Scalar(-1) / sqrt(1 - numext::abs2(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(
+    tanh, using std::cosh; using std::tanh;
+    return Eigen::MakeAutoDiffScalar(tanh(x.value()), x.derivatives() * (Scalar(1) / numext::abs2(cosh(x.value()))));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sinh, using std::sinh; using std::cosh;
+                                    return Eigen::MakeAutoDiffScalar(sinh(x.value()),
+                                                                     x.derivatives() * cosh(x.value()));)
+
+EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cosh, using std::sinh; using std::cosh;
+                                    return Eigen::MakeAutoDiffScalar(cosh(x.value()),
+                                                                     x.derivatives() * sinh(x.value()));)
 
 #undef EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY
 
-template<typename DerType> struct NumTraits<AutoDiffScalar<DerType> >
-  : NumTraits< typename NumTraits<typename DerType::Scalar>::Real >
-{
-  typedef AutoDiffScalar<Matrix<typename NumTraits<typename DerType::Scalar>::Real,DerType::RowsAtCompileTime,DerType::ColsAtCompileTime> > Real;
+template <typename DerType>
+struct NumTraits<AutoDiffScalar<DerType>>
+    : NumTraits<typename NumTraits<typename internal::remove_all_t<DerType>::Scalar>::Real> {
+  typedef internal::remove_all_t<DerType> DerTypeCleaned;
+  typedef AutoDiffScalar<Matrix<typename NumTraits<typename DerTypeCleaned::Scalar>::Real,
+                                DerTypeCleaned::RowsAtCompileTime, DerTypeCleaned::ColsAtCompileTime, 0,
+                                DerTypeCleaned::MaxRowsAtCompileTime, DerTypeCleaned::MaxColsAtCompileTime>>
+      Real;
   typedef AutoDiffScalar<DerType> NonInteger;
   typedef AutoDiffScalar<DerType> Nested;
-  enum{
-    RequireInitialization = 1
-  };
+  typedef typename NumTraits<typename DerTypeCleaned::Scalar>::Literal Literal;
+  enum { RequireInitialization = 1 };
 };
 
-}
+namespace internal {
+template <typename DerivativeType>
+struct is_identically_zero_impl<AutoDiffScalar<DerivativeType>> {
+  static inline bool run(const AutoDiffScalar<DerivativeType>& s) {
+    const DerivativeType& derivatives = s.derivatives();
+    for (int i = 0; i < derivatives.size(); ++i) {
+      if (!numext::is_exactly_zero(derivatives[i])) {
+        return false;
+      }
+    }
+    return numext::is_exactly_zero(s.value());
+  }
+};
+}  // namespace internal
+}  // namespace Eigen
+
+namespace std {
+
+template <typename T>
+class numeric_limits<Eigen::AutoDiffScalar<T>> : public numeric_limits<typename T::Scalar> {};
+
+template <typename T>
+class numeric_limits<Eigen::AutoDiffScalar<T&>> : public numeric_limits<typename T::Scalar> {};
+
+}  // namespace std
 
-#endif // EIGEN_AUTODIFF_SCALAR_H
+#endif  // EIGEN_AUTODIFF_SCALAR_H
diff --git a/inst/include/unsupported/Eigen/src/AutoDiff/AutoDiffVector.h b/inst/include/unsupported/Eigen/src/AutoDiff/AutoDiffVector.h
index 8c2d0483..62314567 100644
--- a/inst/include/unsupported/Eigen/src/AutoDiff/AutoDiffVector.h
+++ b/inst/include/unsupported/Eigen/src/AutoDiff/AutoDiffVector.h
@@ -10,211 +10,180 @@
 #ifndef EIGEN_AUTODIFF_VECTOR_H
 #define EIGEN_AUTODIFF_VECTOR_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
 /* \class AutoDiffScalar
-  * \brief A scalar type replacement with automatic differentation capability
-  *
-  * \param DerType the vector type used to store/represent the derivatives (e.g. Vector3f)
-  *
-  * This class represents a scalar value while tracking its respective derivatives.
-  *
-  * It supports the following list of global math function:
-  *  - std::abs, std::sqrt, std::pow, std::exp, std::log, std::sin, std::cos,
-  *  - internal::abs, internal::sqrt, numext::pow, internal::exp, internal::log, internal::sin, internal::cos,
-  *  - internal::conj, internal::real, internal::imag, numext::abs2.
-  *
-  * AutoDiffScalar can be used as the scalar type of an Eigen::Matrix object. However,
-  * in that case, the expression template mechanism only occurs at the top Matrix level,
-  * while derivatives are computed right away.
-  *
-  */
-template<typename ValueType, typename JacobianType>
-class AutoDiffVector
-{
-  public:
-    //typedef typename internal::traits<ValueType>::Scalar Scalar;
-    typedef typename internal::traits<ValueType>::Scalar BaseScalar;
-    typedef AutoDiffScalar<Matrix<BaseScalar,JacobianType::RowsAtCompileTime,1> > ActiveScalar;
-    typedef ActiveScalar Scalar;
-    typedef AutoDiffScalar<typename JacobianType::ColXpr> CoeffType;
-    typedef typename JacobianType::Index Index;
-
-    inline AutoDiffVector() {}
-
-    inline AutoDiffVector(const ValueType& values)
-      : m_values(values)
-    {
-      m_jacobian.setZero();
-    }
-
-
-    CoeffType operator[] (Index i) { return CoeffType(m_values[i], m_jacobian.col(i)); }
-    const CoeffType operator[] (Index i) const { return CoeffType(m_values[i], m_jacobian.col(i)); }
-
-    CoeffType operator() (Index i) { return CoeffType(m_values[i], m_jacobian.col(i)); }
-    const CoeffType operator() (Index i) const { return CoeffType(m_values[i], m_jacobian.col(i)); }
-
-    CoeffType coeffRef(Index i) { return CoeffType(m_values[i], m_jacobian.col(i)); }
-    const CoeffType coeffRef(Index i) const { return CoeffType(m_values[i], m_jacobian.col(i)); }
-
-    Index size() const { return m_values.size(); }
-
-    // FIXME here we could return an expression of the sum
-    Scalar sum() const { /*std::cerr << "sum \n\n";*/ /*std::cerr << m_jacobian.rowwise().sum() << "\n\n";*/ return Scalar(m_values.sum(), m_jacobian.rowwise().sum()); }
-
-
-    inline AutoDiffVector(const ValueType& values, const JacobianType& jac)
-      : m_values(values), m_jacobian(jac)
-    {}
-
-    template<typename OtherValueType, typename OtherJacobianType>
-    inline AutoDiffVector(const AutoDiffVector<OtherValueType, OtherJacobianType>& other)
-      : m_values(other.values()), m_jacobian(other.jacobian())
-    {}
-
-    inline AutoDiffVector(const AutoDiffVector& other)
-      : m_values(other.values()), m_jacobian(other.jacobian())
-    {}
-
-    template<typename OtherValueType, typename OtherJacobianType>
-    inline AutoDiffVector& operator=(const AutoDiffVector<OtherValueType, OtherJacobianType>& other)
-    {
-      m_values = other.values();
-      m_jacobian = other.jacobian();
-      return *this;
-    }
-
-    inline AutoDiffVector& operator=(const AutoDiffVector& other)
-    {
-      m_values = other.values();
-      m_jacobian = other.jacobian();
-      return *this;
-    }
-
-    inline const ValueType& values() const { return m_values; }
-    inline ValueType& values() { return m_values; }
-
-    inline const JacobianType& jacobian() const { return m_jacobian; }
-    inline JacobianType& jacobian() { return m_jacobian; }
-
-    template<typename OtherValueType,typename OtherJacobianType>
-    inline const AutoDiffVector<
-      typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>,ValueType,OtherValueType>::Type,
-      typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>,JacobianType,OtherJacobianType>::Type >
-    operator+(const AutoDiffVector<OtherValueType,OtherJacobianType>& other) const
-    {
-      return AutoDiffVector<
-      typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>,ValueType,OtherValueType>::Type,
-      typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>,JacobianType,OtherJacobianType>::Type >(
-        m_values + other.values(),
-        m_jacobian + other.jacobian());
-    }
-
-    template<typename OtherValueType, typename OtherJacobianType>
-    inline AutoDiffVector&
-    operator+=(const AutoDiffVector<OtherValueType,OtherJacobianType>& other)
-    {
-      m_values += other.values();
-      m_jacobian += other.jacobian();
-      return *this;
-    }
-
-    template<typename OtherValueType,typename OtherJacobianType>
-    inline const AutoDiffVector<
-      typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>,ValueType,OtherValueType>::Type,
-      typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>,JacobianType,OtherJacobianType>::Type >
-    operator-(const AutoDiffVector<OtherValueType,OtherJacobianType>& other) const
-    {
-      return AutoDiffVector<
-        typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>,ValueType,OtherValueType>::Type,
-        typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>,JacobianType,OtherJacobianType>::Type >(
-          m_values - other.values(),
-          m_jacobian - other.jacobian());
-    }
-
-    template<typename OtherValueType, typename OtherJacobianType>
-    inline AutoDiffVector&
-    operator-=(const AutoDiffVector<OtherValueType,OtherJacobianType>& other)
-    {
-      m_values -= other.values();
-      m_jacobian -= other.jacobian();
-      return *this;
-    }
-
-    inline const AutoDiffVector<
-      typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, ValueType>::Type,
-      typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, JacobianType>::Type >
-    operator-() const
-    {
-      return AutoDiffVector<
-        typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, ValueType>::Type,
-        typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, JacobianType>::Type >(
-          -m_values,
-          -m_jacobian);
-    }
-
-    inline const AutoDiffVector<
+ * \brief A scalar type replacement with automatic differentation capability
+ *
+ * \param DerType the vector type used to store/represent the derivatives (e.g. Vector3f)
+ *
+ * This class represents a scalar value while tracking its respective derivatives.
+ *
+ * It supports the following list of global math function:
+ *  - std::abs, std::sqrt, std::pow, std::exp, std::log, std::sin, std::cos,
+ *  - internal::abs, internal::sqrt, numext::pow, internal::exp, internal::log, internal::sin, internal::cos,
+ *  - internal::conj, internal::real, internal::imag, numext::abs2.
+ *
+ * AutoDiffScalar can be used as the scalar type of an Eigen::Matrix object. However,
+ * in that case, the expression template mechanism only occurs at the top Matrix level,
+ * while derivatives are computed right away.
+ *
+ */
+template <typename ValueType, typename JacobianType>
+class AutoDiffVector {
+ public:
+  // typedef typename internal::traits<ValueType>::Scalar Scalar;
+  typedef typename internal::traits<ValueType>::Scalar BaseScalar;
+  typedef AutoDiffScalar<Matrix<BaseScalar, JacobianType::RowsAtCompileTime, 1> > ActiveScalar;
+  typedef ActiveScalar Scalar;
+  typedef AutoDiffScalar<typename JacobianType::ColXpr> CoeffType;
+  typedef typename JacobianType::Index Index;
+
+  inline AutoDiffVector() {}
+
+  inline AutoDiffVector(const ValueType& values) : m_values(values) { m_jacobian.setZero(); }
+
+  CoeffType operator[](Index i) { return CoeffType(m_values[i], m_jacobian.col(i)); }
+  const CoeffType operator[](Index i) const { return CoeffType(m_values[i], m_jacobian.col(i)); }
+
+  CoeffType operator()(Index i) { return CoeffType(m_values[i], m_jacobian.col(i)); }
+  const CoeffType operator()(Index i) const { return CoeffType(m_values[i], m_jacobian.col(i)); }
+
+  CoeffType coeffRef(Index i) { return CoeffType(m_values[i], m_jacobian.col(i)); }
+  const CoeffType coeffRef(Index i) const { return CoeffType(m_values[i], m_jacobian.col(i)); }
+
+  Index size() const { return m_values.size(); }
+
+  // FIXME here we could return an expression of the sum
+  Scalar sum() const { /*std::cerr << "sum \n\n";*/ /*std::cerr << m_jacobian.rowwise().sum() << "\n\n";*/
+    return Scalar(m_values.sum(), m_jacobian.rowwise().sum());
+  }
+
+  inline AutoDiffVector(const ValueType& values, const JacobianType& jac) : m_values(values), m_jacobian(jac) {}
+
+  template <typename OtherValueType, typename OtherJacobianType>
+  inline AutoDiffVector(const AutoDiffVector<OtherValueType, OtherJacobianType>& other)
+      : m_values(other.values()), m_jacobian(other.jacobian()) {}
+
+  inline AutoDiffVector(const AutoDiffVector& other) : m_values(other.values()), m_jacobian(other.jacobian()) {}
+
+  template <typename OtherValueType, typename OtherJacobianType>
+  inline AutoDiffVector& operator=(const AutoDiffVector<OtherValueType, OtherJacobianType>& other) {
+    m_values = other.values();
+    m_jacobian = other.jacobian();
+    return *this;
+  }
+
+  inline AutoDiffVector& operator=(const AutoDiffVector& other) {
+    m_values = other.values();
+    m_jacobian = other.jacobian();
+    return *this;
+  }
+
+  inline const ValueType& values() const { return m_values; }
+  inline ValueType& values() { return m_values; }
+
+  inline const JacobianType& jacobian() const { return m_jacobian; }
+  inline JacobianType& jacobian() { return m_jacobian; }
+
+  template <typename OtherValueType, typename OtherJacobianType>
+  inline const AutoDiffVector<
+      typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>, ValueType, OtherValueType>::Type,
+      typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>, JacobianType, OtherJacobianType>::Type>
+  operator+(const AutoDiffVector<OtherValueType, OtherJacobianType>& other) const {
+    return AutoDiffVector<
+        typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>, ValueType, OtherValueType>::Type,
+        typename MakeCwiseBinaryOp<internal::scalar_sum_op<BaseScalar>, JacobianType, OtherJacobianType>::Type>(
+        m_values + other.values(), m_jacobian + other.jacobian());
+  }
+
+  template <typename OtherValueType, typename OtherJacobianType>
+  inline AutoDiffVector& operator+=(const AutoDiffVector<OtherValueType, OtherJacobianType>& other) {
+    m_values += other.values();
+    m_jacobian += other.jacobian();
+    return *this;
+  }
+
+  template <typename OtherValueType, typename OtherJacobianType>
+  inline const AutoDiffVector<
+      typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>, ValueType, OtherValueType>::Type,
+      typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>, JacobianType, OtherJacobianType>::Type>
+  operator-(const AutoDiffVector<OtherValueType, OtherJacobianType>& other) const {
+    return AutoDiffVector<
+        typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>, ValueType, OtherValueType>::Type,
+        typename MakeCwiseBinaryOp<internal::scalar_difference_op<Scalar>, JacobianType, OtherJacobianType>::Type>(
+        m_values - other.values(), m_jacobian - other.jacobian());
+  }
+
+  template <typename OtherValueType, typename OtherJacobianType>
+  inline AutoDiffVector& operator-=(const AutoDiffVector<OtherValueType, OtherJacobianType>& other) {
+    m_values -= other.values();
+    m_jacobian -= other.jacobian();
+    return *this;
+  }
+
+  inline const AutoDiffVector<typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, ValueType>::Type,
+                              typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, JacobianType>::Type>
+  operator-() const {
+    return AutoDiffVector<typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, ValueType>::Type,
+                          typename MakeCwiseUnaryOp<internal::scalar_opposite_op<Scalar>, JacobianType>::Type>(
+        -m_values, -m_jacobian);
+  }
+
+  inline const AutoDiffVector<typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, ValueType>::Type,
+                              typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>::Type>
+  operator*(const BaseScalar& other) const {
+    return AutoDiffVector<typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, ValueType>::Type,
+                          typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>::Type>(
+        m_values * other, m_jacobian * other);
+  }
+
+  friend inline const AutoDiffVector<
       typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, ValueType>::Type,
       typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>::Type>
-    operator*(const BaseScalar& other) const
-    {
-      return AutoDiffVector<
-        typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, ValueType>::Type,
-        typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>::Type >(
-          m_values * other,
-          m_jacobian * other);
-    }
-
-    friend inline const AutoDiffVector<
-      typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, ValueType>::Type,
-      typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>::Type >
-    operator*(const Scalar& other, const AutoDiffVector& v)
-    {
-      return AutoDiffVector<
-        typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, ValueType>::Type,
-        typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>::Type >(
-          v.values() * other,
-          v.jacobian() * other);
-    }
-
-//     template<typename OtherValueType,typename OtherJacobianType>
-//     inline const AutoDiffVector<
-//       CwiseBinaryOp<internal::scalar_multiple_op<Scalar>, ValueType, OtherValueType>
-//       CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
-//         CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>,
-//         CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, OtherJacobianType> > >
-//     operator*(const AutoDiffVector<OtherValueType,OtherJacobianType>& other) const
-//     {
-//       return AutoDiffVector<
-//         CwiseBinaryOp<internal::scalar_multiple_op<Scalar>, ValueType, OtherValueType>
-//         CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
-//           CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>,
-//           CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, OtherJacobianType> > >(
-//             m_values.cwise() * other.values(),
-//             (m_jacobian * other.values()) + (m_values * other.jacobian()));
-//     }
-
-    inline AutoDiffVector& operator*=(const Scalar& other)
-    {
-      m_values *= other;
-      m_jacobian *= other;
-      return *this;
-    }
-
-    template<typename OtherValueType,typename OtherJacobianType>
-    inline AutoDiffVector& operator*=(const AutoDiffVector<OtherValueType,OtherJacobianType>& other)
-    {
-      *this = *this * other;
-      return *this;
-    }
-
-  protected:
-    ValueType m_values;
-    JacobianType m_jacobian;
-
+  operator*(const Scalar& other, const AutoDiffVector& v) {
+    return AutoDiffVector<typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, ValueType>::Type,
+                          typename MakeCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>::Type>(
+        v.values() * other, v.jacobian() * other);
+  }
+
+  //     template<typename OtherValueType,typename OtherJacobianType>
+  //     inline const AutoDiffVector<
+  //       CwiseBinaryOp<internal::scalar_multiple_op<Scalar>, ValueType, OtherValueType>
+  //       CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
+  //         CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>,
+  //         CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, OtherJacobianType> > >
+  //     operator*(const AutoDiffVector<OtherValueType,OtherJacobianType>& other) const
+  //     {
+  //       return AutoDiffVector<
+  //         CwiseBinaryOp<internal::scalar_multiple_op<Scalar>, ValueType, OtherValueType>
+  //         CwiseBinaryOp<internal::scalar_sum_op<Scalar>,
+  //           CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, JacobianType>,
+  //           CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, OtherJacobianType> > >(
+  //             m_values.cwise() * other.values(),
+  //             (m_jacobian * other.values()) + (m_values * other.jacobian()));
+  //     }
+
+  inline AutoDiffVector& operator*=(const Scalar& other) {
+    m_values *= other;
+    m_jacobian *= other;
+    return *this;
+  }
+
+  template <typename OtherValueType, typename OtherJacobianType>
+  inline AutoDiffVector& operator*=(const AutoDiffVector<OtherValueType, OtherJacobianType>& other) {
+    *this = *this * other;
+    return *this;
+  }
+
+ protected:
+  ValueType m_values;
+  JacobianType m_jacobian;
 };
 
-}
+}  // namespace Eigen
 
-#endif // EIGEN_AUTODIFF_VECTOR_H
+#endif  // EIGEN_AUTODIFF_VECTOR_H
diff --git a/inst/include/unsupported/Eigen/src/AutoDiff/CoherentPadOp.h b/inst/include/unsupported/Eigen/src/AutoDiff/CoherentPadOp.h
new file mode 100644
index 00000000..7d3a3fb3
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/AutoDiff/CoherentPadOp.h
@@ -0,0 +1,152 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 The Eigen Team.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_COHERENT_PAD_OP_H
+#define EIGEN_COHERENT_PAD_OP_H
+
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+// Pads a vector with zeros to a given size.
+template <typename XprType, int SizeAtCompileTime_>
+struct CoherentPadOp;
+
+template <typename XprType, int SizeAtCompileTime_>
+struct traits<CoherentPadOp<XprType, SizeAtCompileTime_>> : public traits<XprType> {
+  typedef typename internal::remove_all<XprType>::type PlainXprType;
+  typedef typename internal::ref_selector<XprType>::type XprNested;
+  typedef typename std::remove_reference_t<XprNested> XprNested_;
+  enum : int {
+    IsRowMajor = traits<PlainXprType>::Flags & RowMajorBit,
+    SizeAtCompileTime = SizeAtCompileTime_,
+    RowsAtCompileTime = IsRowMajor ? 1 : SizeAtCompileTime,
+    ColsAtCompileTime = IsRowMajor ? SizeAtCompileTime : 1,
+    MaxRowsAtCompileTime = RowsAtCompileTime,
+    MaxColsAtCompileTime = ColsAtCompileTime,
+    Flags = traits<XprType>::Flags & ~NestByRefBit,
+  };
+};
+
+// Pads a vector with zeros to a given size.
+template <typename XprType, int SizeAtCompileTime_>
+struct CoherentPadOp : public dense_xpr_base<CoherentPadOp<XprType, SizeAtCompileTime_>>::type {
+  typedef typename internal::generic_xpr_base<CoherentPadOp<XprType, SizeAtCompileTime_>>::type Base;
+  EIGEN_GENERIC_PUBLIC_INTERFACE(CoherentPadOp)
+
+  using XprNested = typename traits<CoherentPadOp>::XprNested;
+  using XprNested_ = typename traits<CoherentPadOp>::XprNested_;
+  using NestedExpression = XprNested_;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoherentPadOp() = delete;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoherentPadOp(const CoherentPadOp&) = default;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoherentPadOp(CoherentPadOp&& other) = default;
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoherentPadOp(const XprType& xpr, Index size) : xpr_(xpr), size_(size) {
+    static_assert(XprNested_::IsVectorAtCompileTime, "input type must be a vector");
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const XprNested_& nestedExpression() const { return xpr_; }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index size() const { return size_.value(); }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const {
+    return traits<CoherentPadOp>::IsRowMajor ? Index(1) : size();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const {
+    return traits<CoherentPadOp>::IsRowMajor ? size() : Index(1);
+  }
+
+ private:
+  XprNested xpr_;
+  const internal::variable_if_dynamic<Index, SizeAtCompileTime> size_;
+};
+
+// Adapted from the Replicate evaluator.
+template <typename ArgType, int SizeAtCompileTime>
+struct unary_evaluator<CoherentPadOp<ArgType, SizeAtCompileTime>>
+    : evaluator_base<CoherentPadOp<ArgType, SizeAtCompileTime>> {
+  typedef CoherentPadOp<ArgType, SizeAtCompileTime> XprType;
+  typedef typename internal::remove_all_t<typename XprType::CoeffReturnType> CoeffReturnType;
+  typedef typename internal::nested_eval<ArgType, 1>::type ArgTypeNested;
+  typedef internal::remove_all_t<ArgTypeNested> ArgTypeNestedCleaned;
+
+  enum {
+    CoeffReadCost = evaluator<ArgTypeNestedCleaned>::CoeffReadCost,
+    LinearAccessMask = XprType::IsVectorAtCompileTime ? LinearAccessBit : 0,
+    Flags = evaluator<ArgTypeNestedCleaned>::Flags & (HereditaryBits | LinearAccessMask | RowMajorBit),
+    Alignment = evaluator<ArgTypeNestedCleaned>::Alignment
+  };
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit unary_evaluator(const XprType& pad)
+      : m_arg(pad.nestedExpression()), m_argImpl(m_arg), m_size(pad.nestedExpression().size()) {}
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index col) const {
+    EIGEN_IF_CONSTEXPR(XprType::IsRowMajor) {
+      if (col < m_size.value()) {
+        return m_argImpl.coeff(1, col);
+      }
+    }
+    else {
+      if (row < m_size.value()) {
+        return m_argImpl.coeff(row, 1);
+      }
+    }
+    return CoeffReturnType(0);
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
+    if (index < m_size.value()) {
+      return m_argImpl.coeff(index);
+    }
+    return CoeffReturnType(0);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index row, Index col) const {
+    // AutoDiff scalar's derivative must be a vector, which is enforced by static assert.
+    // Defer to linear access for simplicity.
+    EIGEN_IF_CONSTEXPR(XprType::IsRowMajor) { return packet(col); }
+    return packet(row);
+  }
+
+  template <int LoadMode, typename PacketType>
+  EIGEN_STRONG_INLINE PacketType packet(Index index) const {
+    constexpr int kPacketSize = unpacket_traits<PacketType>::size;
+    if (index + kPacketSize <= m_size.value()) {
+      return m_argImpl.template packet<LoadMode, PacketType>(index);
+    } else if (index < m_size.value()) {
+      // Partial packet.
+      EIGEN_ALIGN_MAX std::remove_const_t<CoeffReturnType> values[kPacketSize];
+      const int partial = m_size.value() - index;
+      for (int i = 0; i < partial && i < kPacketSize; ++i) {
+        values[i] = m_argImpl.coeff(index + i);
+      }
+      for (int i = partial; i < kPacketSize; ++i) {
+        values[i] = CoeffReturnType(0);
+      }
+      return pload<PacketType>(values);
+    }
+    return pset1<PacketType>(CoeffReturnType(0));
+  }
+
+ protected:
+  const ArgTypeNested m_arg;
+  evaluator<ArgTypeNestedCleaned> m_argImpl;
+  const variable_if_dynamic<Index, ArgTypeNestedCleaned::SizeAtCompileTime> m_size;
+};
+
+}  // namespace internal
+
+}  // namespace Eigen
+
+#endif  // EIGEN_CWISE_BINARY_OP_H
diff --git a/inst/include/unsupported/Eigen/src/AutoDiff/InternalHeaderCheck.h b/inst/include/unsupported/Eigen/src/AutoDiff/InternalHeaderCheck.h
new file mode 100644
index 00000000..1584337c
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/AutoDiff/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_AUTODIFF_MODULE_H
+#error "Please include unsupported/Eigen/AutoDiff instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/unsupported/Eigen/src/BVH/BVAlgorithms.h b/inst/include/unsupported/Eigen/src/BVH/BVAlgorithms.h
index 994c8af5..d9e41da4 100644
--- a/inst/include/unsupported/Eigen/src/BVH/BVAlgorithms.h
+++ b/inst/include/unsupported/Eigen/src/BVH/BVAlgorithms.h
@@ -10,14 +10,16 @@
 #ifndef EIGEN_BVALGORITHMS_H
 #define EIGEN_BVALGORITHMS_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-template<typename BVH, typename Intersector>
-bool intersect_helper(const BVH &tree, Intersector &intersector, typename BVH::Index root)
-{
+template <typename BVH, typename Intersector>
+bool intersect_helper(const BVH &tree, Intersector &intersector, typename BVH::Index root) {
   typedef typename BVH::Index Index;
   typedef typename BVH::VolumeIterator VolIter;
   typedef typename BVH::ObjectIterator ObjIter;
@@ -27,47 +29,45 @@ bool intersect_helper(const BVH &tree, Intersector &intersector, typename BVH::I
 
   std::vector<Index> todo(1, root);
 
-  while(!todo.empty()) {
+  while (!todo.empty()) {
     tree.getChildren(todo.back(), vBegin, vEnd, oBegin, oEnd);
     todo.pop_back();
 
-    for(; vBegin != vEnd; ++vBegin) //go through child volumes
-      if(intersector.intersectVolume(tree.getVolume(*vBegin)))
-        todo.push_back(*vBegin);
+    for (; vBegin != vEnd; ++vBegin)  // go through child volumes
+      if (intersector.intersectVolume(tree.getVolume(*vBegin))) todo.push_back(*vBegin);
 
-    for(; oBegin != oEnd; ++oBegin) //go through child objects
-      if(intersector.intersectObject(*oBegin))
-        return true; //intersector said to stop query
+    for (; oBegin != oEnd; ++oBegin)                          // go through child objects
+      if (intersector.intersectObject(*oBegin)) return true;  // intersector said to stop query
   }
   return false;
 }
-#endif //not EIGEN_PARSED_BY_DOXYGEN
+#endif  // not EIGEN_PARSED_BY_DOXYGEN
 
-template<typename Volume1, typename Object1, typename Object2, typename Intersector>
-struct intersector_helper1
-{
+template <typename Volume1, typename Object1, typename Object2, typename Intersector>
+struct intersector_helper1 {
   intersector_helper1(const Object2 &inStored, Intersector &in) : stored(inStored), intersector(in) {}
   bool intersectVolume(const Volume1 &vol) { return intersector.intersectVolumeObject(vol, stored); }
   bool intersectObject(const Object1 &obj) { return intersector.intersectObjectObject(obj, stored); }
   Object2 stored;
   Intersector &intersector;
-private:
-  intersector_helper1& operator=(const intersector_helper1&);
+
+ private:
+  intersector_helper1 &operator=(const intersector_helper1 &);
 };
 
-template<typename Volume2, typename Object2, typename Object1, typename Intersector>
-struct intersector_helper2
-{
+template <typename Volume2, typename Object2, typename Object1, typename Intersector>
+struct intersector_helper2 {
   intersector_helper2(const Object1 &inStored, Intersector &in) : stored(inStored), intersector(in) {}
   bool intersectVolume(const Volume2 &vol) { return intersector.intersectObjectVolume(stored, vol); }
   bool intersectObject(const Object2 &obj) { return intersector.intersectObjectObject(stored, obj); }
   Object1 stored;
   Intersector &intersector;
-private:
-  intersector_helper2& operator=(const intersector_helper2&);
+
+ private:
+  intersector_helper2 &operator=(const intersector_helper2 &);
 };
 
-} // end namespace internal
+}  // end namespace internal
 
 /**  Given a BVH, runs the query encapsulated by \a intersector.
   *  The Intersector type must provide the following members: \code
@@ -75,27 +75,31 @@ struct intersector_helper2
      bool intersectObject(const BVH::Object &object) //returns true if the search should terminate immediately
   \endcode
   */
-template<typename BVH, typename Intersector>
-void BVIntersect(const BVH &tree, Intersector &intersector)
-{
+template <typename BVH, typename Intersector>
+void BVIntersect(const BVH &tree, Intersector &intersector) {
   internal::intersect_helper(tree, intersector, tree.getRootIndex());
 }
 
 /**  Given two BVH's, runs the query on their Cartesian product encapsulated by \a intersector.
   *  The Intersector type must provide the following members: \code
-     bool intersectVolumeVolume(const BVH1::Volume &v1, const BVH2::Volume &v2) //returns true if product of volumes intersects the query
-     bool intersectVolumeObject(const BVH1::Volume &v1, const BVH2::Object &o2) //returns true if the volume-object product intersects the query
-     bool intersectObjectVolume(const BVH1::Object &o1, const BVH2::Volume &v2) //returns true if the volume-object product intersects the query
-     bool intersectObjectObject(const BVH1::Object &o1, const BVH2::Object &o2) //returns true if the search should terminate immediately
-  \endcode
+     bool intersectVolumeVolume(const BVH1::Volume &v1, const BVH2::Volume &v2) //returns true if product of volumes
+  intersects the query bool intersectVolumeObject(const BVH1::Volume &v1, const BVH2::Object &o2) //returns true if the
+  volume-object product intersects the query bool intersectObjectVolume(const BVH1::Object &o1, const BVH2::Volume &v2)
+  //returns true if the volume-object product intersects the query bool intersectObjectObject(const BVH1::Object &o1,
+  const BVH2::Object &o2) //returns true if the search should terminate immediately \endcode
   */
-template<typename BVH1, typename BVH2, typename Intersector>
-void BVIntersect(const BVH1 &tree1, const BVH2 &tree2, Intersector &intersector) //TODO: tandem descent when it makes sense
+template <typename BVH1, typename BVH2, typename Intersector>
+void BVIntersect(const BVH1 &tree1, const BVH2 &tree2,
+                 Intersector &intersector)  // TODO: tandem descent when it makes sense
 {
   typedef typename BVH1::Index Index1;
   typedef typename BVH2::Index Index2;
-  typedef internal::intersector_helper1<typename BVH1::Volume, typename BVH1::Object, typename BVH2::Object, Intersector> Helper1;
-  typedef internal::intersector_helper2<typename BVH2::Volume, typename BVH2::Object, typename BVH1::Object, Intersector> Helper2;
+  typedef internal::intersector_helper1<typename BVH1::Volume, typename BVH1::Object, typename BVH2::Object,
+                                        Intersector>
+      Helper1;
+  typedef internal::intersector_helper2<typename BVH2::Volume, typename BVH2::Object, typename BVH1::Object,
+                                        Intersector>
+      Helper2;
   typedef typename BVH1::VolumeIterator VolIter1;
   typedef typename BVH1::ObjectIterator ObjIter1;
   typedef typename BVH2::VolumeIterator VolIter2;
@@ -108,35 +112,32 @@ void BVIntersect(const BVH1 &tree1, const BVH2 &tree2, Intersector &intersector)
 
   std::vector<std::pair<Index1, Index2> > todo(1, std::make_pair(tree1.getRootIndex(), tree2.getRootIndex()));
 
-  while(!todo.empty()) {
+  while (!todo.empty()) {
     tree1.getChildren(todo.back().first, vBegin1, vEnd1, oBegin1, oEnd1);
     tree2.getChildren(todo.back().second, vBegin2, vEnd2, oBegin2, oEnd2);
     todo.pop_back();
 
-    for(; vBegin1 != vEnd1; ++vBegin1) { //go through child volumes of first tree
+    for (; vBegin1 != vEnd1; ++vBegin1) {  // go through child volumes of first tree
       const typename BVH1::Volume &vol1 = tree1.getVolume(*vBegin1);
-      for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree
-        if(intersector.intersectVolumeVolume(vol1, tree2.getVolume(*vCur2)))
+      for (vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) {  // go through child volumes of second tree
+        if (intersector.intersectVolumeVolume(vol1, tree2.getVolume(*vCur2)))
           todo.push_back(std::make_pair(*vBegin1, *vCur2));
       }
 
-      for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree
+      for (oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {  // go through child objects of second tree
         Helper1 helper(*oCur2, intersector);
-        if(internal::intersect_helper(tree1, helper, *vBegin1))
-          return; //intersector said to stop query
+        if (internal::intersect_helper(tree1, helper, *vBegin1)) return;  // intersector said to stop query
       }
     }
 
-    for(; oBegin1 != oEnd1; ++oBegin1) { //go through child objects of first tree
-      for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree
+    for (; oBegin1 != oEnd1; ++oBegin1) {               // go through child objects of first tree
+      for (vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) {  // go through child volumes of second tree
         Helper2 helper(*oBegin1, intersector);
-        if(internal::intersect_helper(tree2, helper, *vCur2))
-          return; //intersector said to stop query
+        if (internal::intersect_helper(tree2, helper, *vCur2)) return;  // intersector said to stop query
       }
 
-      for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree
-        if(intersector.intersectObjectObject(*oBegin1, *oCur2))
-          return; //intersector said to stop query
+      for (oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {                    // go through child objects of second tree
+        if (intersector.intersectObjectObject(*oBegin1, *oCur2)) return;  // intersector said to stop query
       }
     }
   }
@@ -145,101 +146,98 @@ void BVIntersect(const BVH1 &tree1, const BVH2 &tree2, Intersector &intersector)
 namespace internal {
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
-template<typename BVH, typename Minimizer>
-typename Minimizer::Scalar minimize_helper(const BVH &tree, Minimizer &minimizer, typename BVH::Index root, typename Minimizer::Scalar minimum)
-{
+template <typename BVH, typename Minimizer>
+typename Minimizer::Scalar minimize_helper(const BVH &tree, Minimizer &minimizer, typename BVH::Index root,
+                                           typename Minimizer::Scalar minimum) {
   typedef typename Minimizer::Scalar Scalar;
   typedef typename BVH::Index Index;
-  typedef std::pair<Scalar, Index> QueueElement; //first element is priority
+  typedef std::pair<Scalar, Index> QueueElement;  // first element is priority
   typedef typename BVH::VolumeIterator VolIter;
   typedef typename BVH::ObjectIterator ObjIter;
 
   VolIter vBegin = VolIter(), vEnd = VolIter();
   ObjIter oBegin = ObjIter(), oEnd = ObjIter();
-  std::priority_queue<QueueElement, std::vector<QueueElement>, std::greater<QueueElement> > todo; //smallest is at the top
+  std::priority_queue<QueueElement, std::vector<QueueElement>, std::greater<QueueElement> >
+      todo;  // smallest is at the top
 
   todo.push(std::make_pair(Scalar(), root));
 
-  while(!todo.empty()) {
+  while (!todo.empty()) {
     tree.getChildren(todo.top().second, vBegin, vEnd, oBegin, oEnd);
     todo.pop();
 
-    for(; oBegin != oEnd; ++oBegin) //go through child objects
+    for (; oBegin != oEnd; ++oBegin)  // go through child objects
       minimum = (std::min)(minimum, minimizer.minimumOnObject(*oBegin));
 
-    for(; vBegin != vEnd; ++vBegin) { //go through child volumes
+    for (; vBegin != vEnd; ++vBegin) {  // go through child volumes
       Scalar val = minimizer.minimumOnVolume(tree.getVolume(*vBegin));
-      if(val < minimum)
-        todo.push(std::make_pair(val, *vBegin));
+      if (val < minimum) todo.push(std::make_pair(val, *vBegin));
     }
   }
 
   return minimum;
 }
-#endif //not EIGEN_PARSED_BY_DOXYGEN
+#endif  // not EIGEN_PARSED_BY_DOXYGEN
 
-
-template<typename Volume1, typename Object1, typename Object2, typename Minimizer>
-struct minimizer_helper1
-{
+template <typename Volume1, typename Object1, typename Object2, typename Minimizer>
+struct minimizer_helper1 {
   typedef typename Minimizer::Scalar Scalar;
   minimizer_helper1(const Object2 &inStored, Minimizer &m) : stored(inStored), minimizer(m) {}
   Scalar minimumOnVolume(const Volume1 &vol) { return minimizer.minimumOnVolumeObject(vol, stored); }
   Scalar minimumOnObject(const Object1 &obj) { return minimizer.minimumOnObjectObject(obj, stored); }
   Object2 stored;
   Minimizer &minimizer;
-private:
-  minimizer_helper1& operator=(const minimizer_helper1&);
+
+ private:
+  minimizer_helper1 &operator=(const minimizer_helper1 &);
 };
 
-template<typename Volume2, typename Object2, typename Object1, typename Minimizer>
-struct minimizer_helper2
-{
+template <typename Volume2, typename Object2, typename Object1, typename Minimizer>
+struct minimizer_helper2 {
   typedef typename Minimizer::Scalar Scalar;
   minimizer_helper2(const Object1 &inStored, Minimizer &m) : stored(inStored), minimizer(m) {}
   Scalar minimumOnVolume(const Volume2 &vol) { return minimizer.minimumOnObjectVolume(stored, vol); }
   Scalar minimumOnObject(const Object2 &obj) { return minimizer.minimumOnObjectObject(stored, obj); }
   Object1 stored;
   Minimizer &minimizer;
-private:
-  minimizer_helper2& operator=(const minimizer_helper2&);
+
+ private:
+  minimizer_helper2 &operator=(const minimizer_helper2 &);
 };
 
-} // end namespace internal
+}  // end namespace internal
 
 /**  Given a BVH, runs the query encapsulated by \a minimizer.
   *  \returns the minimum value.
   *  The Minimizer type must provide the following members: \code
-     typedef Scalar //the numeric type of what is being minimized--not necessarily the Scalar type of the BVH (if it has one)
-     Scalar minimumOnVolume(const BVH::Volume &volume)
-     Scalar minimumOnObject(const BVH::Object &object)
-  \endcode
+     typedef Scalar //the numeric type of what is being minimized--not necessarily the Scalar type of the BVH (if it has
+  one) Scalar minimumOnVolume(const BVH::Volume &volume) Scalar minimumOnObject(const BVH::Object &object) \endcode
   */
-template<typename BVH, typename Minimizer>
-typename Minimizer::Scalar BVMinimize(const BVH &tree, Minimizer &minimizer)
-{
-  return internal::minimize_helper(tree, minimizer, tree.getRootIndex(), (std::numeric_limits<typename Minimizer::Scalar>::max)());
+template <typename BVH, typename Minimizer>
+typename Minimizer::Scalar BVMinimize(const BVH &tree, Minimizer &minimizer) {
+  return internal::minimize_helper(tree, minimizer, tree.getRootIndex(),
+                                   (std::numeric_limits<typename Minimizer::Scalar>::max)());
 }
 
 /**  Given two BVH's, runs the query on their cartesian product encapsulated by \a minimizer.
   *  \returns the minimum value.
   *  The Minimizer type must provide the following members: \code
-     typedef Scalar //the numeric type of what is being minimized--not necessarily the Scalar type of the BVH (if it has one)
-     Scalar minimumOnVolumeVolume(const BVH1::Volume &v1, const BVH2::Volume &v2)
-     Scalar minimumOnVolumeObject(const BVH1::Volume &v1, const BVH2::Object &o2)
-     Scalar minimumOnObjectVolume(const BVH1::Object &o1, const BVH2::Volume &v2)
+     typedef Scalar //the numeric type of what is being minimized--not necessarily the Scalar type of the BVH (if it has
+  one) Scalar minimumOnVolumeVolume(const BVH1::Volume &v1, const BVH2::Volume &v2) Scalar minimumOnVolumeObject(const
+  BVH1::Volume &v1, const BVH2::Object &o2) Scalar minimumOnObjectVolume(const BVH1::Object &o1, const BVH2::Volume &v2)
      Scalar minimumOnObjectObject(const BVH1::Object &o1, const BVH2::Object &o2)
   \endcode
   */
-template<typename BVH1, typename BVH2, typename Minimizer>
-typename Minimizer::Scalar BVMinimize(const BVH1 &tree1, const BVH2 &tree2, Minimizer &minimizer)
-{
+template <typename BVH1, typename BVH2, typename Minimizer>
+typename Minimizer::Scalar BVMinimize(const BVH1 &tree1, const BVH2 &tree2, Minimizer &minimizer) {
   typedef typename Minimizer::Scalar Scalar;
   typedef typename BVH1::Index Index1;
   typedef typename BVH2::Index Index2;
-  typedef internal::minimizer_helper1<typename BVH1::Volume, typename BVH1::Object, typename BVH2::Object, Minimizer> Helper1;
-  typedef internal::minimizer_helper2<typename BVH2::Volume, typename BVH2::Object, typename BVH1::Object, Minimizer> Helper2;
-  typedef std::pair<Scalar, std::pair<Index1, Index2> > QueueElement; //first element is priority
+  typedef internal::minimizer_helper1<typename BVH1::Volume, typename BVH1::Object, typename BVH2::Object, Minimizer>
+      Helper1;
+  typedef internal::minimizer_helper2<typename BVH2::Volume, typename BVH2::Object, typename BVH1::Object, Minimizer>
+      Helper2;
+  typedef std::pair<Scalar, std::pair<Index1, Index2> > QueueElement;  // first element is priority
   typedef typename BVH1::VolumeIterator VolIter1;
   typedef typename BVH1::ObjectIterator ObjIter1;
   typedef typename BVH2::VolumeIterator VolIter2;
@@ -249,45 +247,45 @@ typename Minimizer::Scalar BVMinimize(const BVH1 &tree1, const BVH2 &tree2, Mini
   ObjIter1 oBegin1 = ObjIter1(), oEnd1 = ObjIter1();
   VolIter2 vBegin2 = VolIter2(), vEnd2 = VolIter2(), vCur2 = VolIter2();
   ObjIter2 oBegin2 = ObjIter2(), oEnd2 = ObjIter2(), oCur2 = ObjIter2();
-  std::priority_queue<QueueElement, std::vector<QueueElement>, std::greater<QueueElement> > todo; //smallest is at the top
+  std::priority_queue<QueueElement, std::vector<QueueElement>, std::greater<QueueElement> >
+      todo;  // smallest is at the top
 
   Scalar minimum = (std::numeric_limits<Scalar>::max)();
   todo.push(std::make_pair(Scalar(), std::make_pair(tree1.getRootIndex(), tree2.getRootIndex())));
 
-  while(!todo.empty()) {
+  while (!todo.empty()) {
     tree1.getChildren(todo.top().second.first, vBegin1, vEnd1, oBegin1, oEnd1);
     tree2.getChildren(todo.top().second.second, vBegin2, vEnd2, oBegin2, oEnd2);
     todo.pop();
 
-    for(; oBegin1 != oEnd1; ++oBegin1) { //go through child objects of first tree
-      for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree
+    for (; oBegin1 != oEnd1; ++oBegin1) {               // go through child objects of first tree
+      for (oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {  // go through child objects of second tree
         minimum = (std::min)(minimum, minimizer.minimumOnObjectObject(*oBegin1, *oCur2));
       }
 
-      for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree
+      for (vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) {  // go through child volumes of second tree
         Helper2 helper(*oBegin1, minimizer);
         minimum = (std::min)(minimum, internal::minimize_helper(tree2, helper, *vCur2, minimum));
       }
     }
 
-    for(; vBegin1 != vEnd1; ++vBegin1) { //go through child volumes of first tree
+    for (; vBegin1 != vEnd1; ++vBegin1) {  // go through child volumes of first tree
       const typename BVH1::Volume &vol1 = tree1.getVolume(*vBegin1);
 
-      for(oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {//go through child objects of second tree
+      for (oCur2 = oBegin2; oCur2 != oEnd2; ++oCur2) {  // go through child objects of second tree
         Helper1 helper(*oCur2, minimizer);
         minimum = (std::min)(minimum, internal::minimize_helper(tree1, helper, *vBegin1, minimum));
       }
 
-      for(vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) { //go through child volumes of second tree
+      for (vCur2 = vBegin2; vCur2 != vEnd2; ++vCur2) {  // go through child volumes of second tree
         Scalar val = minimizer.minimumOnVolumeVolume(vol1, tree2.getVolume(*vCur2));
-        if(val < minimum)
-          todo.push(std::make_pair(val, std::make_pair(*vBegin1, *vCur2)));
+        if (val < minimum) todo.push(std::make_pair(val, std::make_pair(*vBegin1, *vCur2)));
       }
     }
   }
   return minimum;
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_BVALGORITHMS_H
+#endif  // EIGEN_BVALGORITHMS_H
diff --git a/inst/include/unsupported/Eigen/src/BVH/InternalHeaderCheck.h b/inst/include/unsupported/Eigen/src/BVH/InternalHeaderCheck.h
new file mode 100644
index 00000000..7aade9b0
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/BVH/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_BVH_MODULE_H
+#error "Please include unsupported/Eigen/BVH instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/unsupported/Eigen/src/BVH/KdBVH.h b/inst/include/unsupported/Eigen/src/BVH/KdBVH.h
index 1b8d7586..d421e6f6 100644
--- a/inst/include/unsupported/Eigen/src/BVH/KdBVH.h
+++ b/inst/include/unsupported/Eigen/src/BVH/KdBVH.h
@@ -10,15 +10,17 @@
 #ifndef KDBVH_H_INCLUDED
 #define KDBVH_H_INCLUDED
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
-//internal pair class for the BVH--used instead of std::pair because of alignment
-template<typename Scalar, int Dim>
-struct vector_int_pair
-{
-EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar, Dim)
+// internal pair class for the BVH--used instead of std::pair because of alignment
+template <typename Scalar, int Dim>
+struct vector_int_pair {
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar, Dim)
   typedef Matrix<Scalar, Dim, 1> VectorType;
 
   vector_int_pair(const VectorType &v, int i) : first(v), second(i) {}
@@ -27,72 +29,81 @@ EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar, Dim)
   int second;
 };
 
-//these templates help the tree initializer get the bounding boxes either from a provided
-//iterator range or using bounding_box in a unified way
-template<typename ObjectList, typename VolumeList, typename BoxIter>
+// these templates help the tree initializer get the bounding boxes either from a provided
+// iterator range or using bounding_box in a unified way
+template <typename ObjectList, typename VolumeList, typename BoxIter>
 struct get_boxes_helper {
-  void operator()(const ObjectList &objects, BoxIter boxBegin, BoxIter boxEnd, VolumeList &outBoxes)
-  {
+  void operator()(const ObjectList &objects, BoxIter boxBegin, BoxIter boxEnd, VolumeList &outBoxes) {
     outBoxes.insert(outBoxes.end(), boxBegin, boxEnd);
     eigen_assert(outBoxes.size() == objects.size());
+    EIGEN_ONLY_USED_FOR_DEBUG(objects);
   }
 };
 
-template<typename ObjectList, typename VolumeList>
+template <typename ObjectList, typename VolumeList>
 struct get_boxes_helper<ObjectList, VolumeList, int> {
-  void operator()(const ObjectList &objects, int, int, VolumeList &outBoxes)
-  {
+  void operator()(const ObjectList &objects, int, int, VolumeList &outBoxes) {
     outBoxes.reserve(objects.size());
-    for(int i = 0; i < (int)objects.size(); ++i)
-      outBoxes.push_back(bounding_box(objects[i]));
+    for (int i = 0; i < (int)objects.size(); ++i) outBoxes.push_back(bounding_box(objects[i]));
   }
 };
 
-} // end namespace internal
-
+}  // end namespace internal
 
 /** \class KdBVH
  *  \brief A simple bounding volume hierarchy based on AlignedBox
  *
- *  \param _Scalar The underlying scalar type of the bounding boxes
- *  \param _Dim The dimension of the space in which the hierarchy lives
- *  \param _Object The object type that lives in the hierarchy.  It must have value semantics.  Either bounding_box(_Object) must
- *                 be defined and return an AlignedBox<_Scalar, _Dim> or bounding boxes must be provided to the tree initializer.
+ *  \param Scalar_ The underlying scalar type of the bounding boxes
+ *  \param Dim_ The dimension of the space in which the hierarchy lives
+ *  \param Object_ The object type that lives in the hierarchy.  It must have value semantics.  Either
+ *                 `bounding_box(Object_)` must be defined and return an `AlignedBox<Scalar_, Dim_>` or bounding boxes
+ *                  must be provided to the tree initializer.
  *
- *  This class provides a simple (as opposed to optimized) implementation of a bounding volume hierarchy analogous to a Kd-tree.
- *  Given a sequence of objects, it computes their bounding boxes, constructs a Kd-tree of their centers
- *  and builds a BVH with the structure of that Kd-tree.  When the elements of the tree are too expensive to be copied around,
- *  it is useful for _Object to be a pointer.
+ * This class provides a simple (as opposed to optimized) implementation of a bounding volume hierarchy analogous to a
+ * Kd-tree. Given a sequence of objects, it computes their bounding boxes, constructs a Kd-tree of their centers and
+ * builds a BVH with the structure of that Kd-tree.  When the elements of the tree are too expensive to be copied
+ * around, it is useful for `Object_` to be a pointer.
  */
-template<typename _Scalar, int _Dim, typename _Object> class KdBVH
-{
-public:
-  enum { Dim = _Dim };
-  typedef _Object Object;
+template <typename Scalar_, int Dim_, typename Object_>
+class KdBVH {
+ public:
+  enum { Dim = Dim_ };
+  typedef Object_ Object;
   typedef std::vector<Object, aligned_allocator<Object> > ObjectList;
-  typedef _Scalar Scalar;
+  typedef Scalar_ Scalar;
   typedef AlignedBox<Scalar, Dim> Volume;
   typedef std::vector<Volume, aligned_allocator<Volume> > VolumeList;
   typedef int Index;
-  typedef const int *VolumeIterator; //the iterators are just pointers into the tree's vectors
+  typedef const int *VolumeIterator;  // the iterators are just pointers into the tree's vectors
   typedef const Object *ObjectIterator;
 
   KdBVH() {}
 
-  /** Given an iterator range over \a Object references, constructs the BVH.  Requires that bounding_box(Object) return a Volume. */
-  template<typename Iter> KdBVH(Iter begin, Iter end) { init(begin, end, 0, 0); } //int is recognized by init as not being an iterator type
-
-  /** Given an iterator range over \a Object references and an iterator range over their bounding boxes, constructs the BVH */
-  template<typename OIter, typename BIter> KdBVH(OIter begin, OIter end, BIter boxBegin, BIter boxEnd) { init(begin, end, boxBegin, boxEnd); }
+  /** Given an iterator range over \a Object references, constructs the BVH.  Requires that bounding_box(Object) return
+   * a Volume. */
+  template <typename Iter>
+  KdBVH(Iter begin, Iter end) {
+    init(begin, end, 0, 0);
+  }  // int is recognized by init as not being an iterator type
+
+  /** Given an iterator range over \a Object references and an iterator range over their bounding boxes, constructs the
+   * BVH */
+  template <typename OIter, typename BIter>
+  KdBVH(OIter begin, OIter end, BIter boxBegin, BIter boxEnd) {
+    init(begin, end, boxBegin, boxEnd);
+  }
 
   /** Given an iterator range over \a Object references, constructs the BVH, overwriting whatever is in there currently.
-    * Requires that bounding_box(Object) return a Volume. */
-  template<typename Iter> void init(Iter begin, Iter end) { init(begin, end, 0, 0); }
+   * Requires that bounding_box(Object) return a Volume. */
+  template <typename Iter>
+  void init(Iter begin, Iter end) {
+    init(begin, end, 0, 0);
+  }
 
   /** Given an iterator range over \a Object references and an iterator range over their bounding boxes,
-    * constructs the BVH, overwriting whatever is in there currently. */
-  template<typename OIter, typename BIter> void init(OIter begin, OIter end, BIter boxBegin, BIter boxEnd)
-  {
+   * constructs the BVH, overwriting whatever is in there currently. */
+  template <typename OIter, typename BIter>
+  void init(OIter begin, OIter end, BIter boxBegin, BIter boxEnd) {
     objects.clear();
     boxes.clear();
     children.clear();
@@ -100,59 +111,54 @@ template<typename _Scalar, int _Dim, typename _Object> class KdBVH
     objects.insert(objects.end(), begin, end);
     int n = static_cast<int>(objects.size());
 
-    if(n < 2)
-      return; //if we have at most one object, we don't need any internal nodes
+    if (n < 2) return;  // if we have at most one object, we don't need any internal nodes
 
     VolumeList objBoxes;
     VIPairList objCenters;
 
-    //compute the bounding boxes depending on BIter type
+    // compute the bounding boxes depending on BIter type
     internal::get_boxes_helper<ObjectList, VolumeList, BIter>()(objects, boxBegin, boxEnd, objBoxes);
 
     objCenters.reserve(n);
     boxes.reserve(n - 1);
     children.reserve(2 * n - 2);
 
-    for(int i = 0; i < n; ++i)
-      objCenters.push_back(VIPair(objBoxes[i].center(), i));
+    for (int i = 0; i < n; ++i) objCenters.push_back(VIPair(objBoxes[i].center(), i));
 
-    build(objCenters, 0, n, objBoxes, 0); //the recursive part of the algorithm
+    build(objCenters, 0, n, objBoxes, 0);  // the recursive part of the algorithm
 
     ObjectList tmp(n);
     tmp.swap(objects);
-    for(int i = 0; i < n; ++i)
-      objects[i] = tmp[objCenters[i].second];
+    for (int i = 0; i < n; ++i) objects[i] = tmp[objCenters[i].second];
   }
 
   /** \returns the index of the root of the hierarchy */
   inline Index getRootIndex() const { return (int)boxes.size() - 1; }
 
-  /** Given an \a index of a node, on exit, \a outVBegin and \a outVEnd range over the indices of the volume children of the node
-    * and \a outOBegin and \a outOEnd range over the object children of the node */
+  /** Given an \a index of a node, on exit, \a outVBegin and \a outVEnd range over the indices of the volume children of
+   * the node and \a outOBegin and \a outOEnd range over the object children of the node */
   EIGEN_STRONG_INLINE void getChildren(Index index, VolumeIterator &outVBegin, VolumeIterator &outVEnd,
-                                       ObjectIterator &outOBegin, ObjectIterator &outOEnd) const
-  { //inlining this function should open lots of optimization opportunities to the compiler
-    if(index < 0) {
+                                       ObjectIterator &outOBegin, ObjectIterator &outOEnd)
+      const {  // inlining this function should open lots of optimization opportunities to the compiler
+    if (index < 0) {
       outVBegin = outVEnd;
-      if(!objects.empty())
-        outOBegin = &(objects[0]);
-      outOEnd = outOBegin + objects.size(); //output all objects--necessary when the tree has only one object
+      if (!objects.empty()) outOBegin = &(objects[0]);
+      outOEnd = outOBegin + objects.size();  // output all objects--necessary when the tree has only one object
       return;
     }
 
     int numBoxes = static_cast<int>(boxes.size());
 
     int idx = index * 2;
-    if(children[idx + 1] < numBoxes) { //second index is always bigger
+    if (children[idx + 1] < numBoxes) {  // second index is always bigger
       outVBegin = &(children[idx]);
       outVEnd = outVBegin + 2;
       outOBegin = outOEnd;
-    }
-    else if(children[idx] >= numBoxes) { //if both children are objects
+    } else if (children[idx] >= numBoxes) {  // if both children are objects
       outVBegin = outVEnd;
       outOBegin = &(objects[children[idx] - numBoxes]);
       outOEnd = outOBegin + 2;
-    } else { //if the first child is a volume and the second is an object
+    } else {  // if the first child is a volume and the second is an object
       outVBegin = &(children[idx]);
       outVEnd = outVBegin + 1;
       outOBegin = &(objects[children[idx + 1] - numBoxes]);
@@ -161,47 +167,41 @@ template<typename _Scalar, int _Dim, typename _Object> class KdBVH
   }
 
   /** \returns the bounding box of the node at \a index */
-  inline const Volume &getVolume(Index index) const
-  {
-    return boxes[index];
-  }
+  inline const Volume &getVolume(Index index) const { return boxes[index]; }
 
-private:
+ private:
   typedef internal::vector_int_pair<Scalar, Dim> VIPair;
   typedef std::vector<VIPair, aligned_allocator<VIPair> > VIPairList;
   typedef Matrix<Scalar, Dim, 1> VectorType;
-  struct VectorComparator //compares vectors, or, more specificall, VIPairs along a particular dimension
+  struct VectorComparator  // compares vectors, or more specifically, VIPairs along a particular dimension
   {
     VectorComparator(int inDim) : dim(inDim) {}
     inline bool operator()(const VIPair &v1, const VIPair &v2) const { return v1.first[dim] < v2.first[dim]; }
     int dim;
   };
 
-  //Build the part of the tree between objects[from] and objects[to] (not including objects[to]).
-  //This routine partitions the objCenters in [from, to) along the dimension dim, recursively constructs
-  //the two halves, and adds their parent node.  TODO: a cache-friendlier layout
-  void build(VIPairList &objCenters, int from, int to, const VolumeList &objBoxes, int dim)
-  {
+  // Build the part of the tree between objects[from] and objects[to] (not including objects[to]).
+  // This routine partitions the objCenters in [from, to) along the dimension dim, recursively constructs
+  // the two halves, and adds their parent node.  TODO: a cache-friendlier layout
+  void build(VIPairList &objCenters, int from, int to, const VolumeList &objBoxes, int dim) {
     eigen_assert(to - from > 1);
-    if(to - from == 2) {
+    if (to - from == 2) {
       boxes.push_back(objBoxes[objCenters[from].second].merged(objBoxes[objCenters[from + 1].second]));
-      children.push_back(from + (int)objects.size() - 1); //there are objects.size() - 1 tree nodes
+      children.push_back(from + (int)objects.size() - 1);  // there are objects.size() - 1 tree nodes
       children.push_back(from + (int)objects.size());
-    }
-    else if(to - from == 3) {
+    } else if (to - from == 3) {
       int mid = from + 2;
-      std::nth_element(objCenters.begin() + from, objCenters.begin() + mid,
-                        objCenters.begin() + to, VectorComparator(dim)); //partition
+      std::nth_element(objCenters.begin() + from, objCenters.begin() + mid, objCenters.begin() + to,
+                       VectorComparator(dim));  // partition
       build(objCenters, from, mid, objBoxes, (dim + 1) % Dim);
       int idx1 = (int)boxes.size() - 1;
       boxes.push_back(boxes[idx1].merged(objBoxes[objCenters[mid].second]));
       children.push_back(idx1);
       children.push_back(mid + (int)objects.size() - 1);
-    }
-    else {
+    } else {
       int mid = from + (to - from) / 2;
-      nth_element(objCenters.begin() + from, objCenters.begin() + mid,
-                  objCenters.begin() + to, VectorComparator(dim)); //partition
+      nth_element(objCenters.begin() + from, objCenters.begin() + mid, objCenters.begin() + to,
+                  VectorComparator(dim));  // partition
       build(objCenters, from, mid, objBoxes, (dim + 1) % Dim);
       int idx1 = (int)boxes.size() - 1;
       build(objCenters, mid, to, objBoxes, (dim + 1) % Dim);
@@ -212,11 +212,12 @@ template<typename _Scalar, int _Dim, typename _Object> class KdBVH
     }
   }
 
-  std::vector<int> children; //children of x are children[2x] and children[2x+1], indices bigger than boxes.size() index into objects.
+  std::vector<int> children;  // children of x are children[2x] and children[2x+1], indices bigger than boxes.size()
+                              // index into objects.
   VolumeList boxes;
   ObjectList objects;
 };
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif //KDBVH_H_INCLUDED
+#endif  // KDBVH_H_INCLUDED
diff --git a/inst/include/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h b/inst/include/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
index 3b6a69af..bc21d94c 100644
--- a/inst/include/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
+++ b/inst/include/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
@@ -3,44 +3,31 @@
 //
 // Copyright (C) 2012 David Harmon <dharmon@gmail.com>
 //
-// Eigen is free software; you can redistribute it and/or
-// modify it under the terms of the GNU Lesser General Public
-// License as published by the Free Software Foundation; either
-// version 3 of the License, or (at your option) any later version.
-//
-// Alternatively, you can redistribute it and/or
-// modify it under the terms of the GNU General Public License as
-// published by the Free Software Foundation; either version 2 of
-// the License, or (at your option) any later version.
-//
-// Eigen is distributed in the hope that it will be useful, but WITHOUT ANY
-// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
-// FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License or the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU Lesser General Public
-// License and a copy of the GNU General Public License along with
-// Eigen. If not, see <http://www.gnu.org/licenses/>.
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 #ifndef EIGEN_ARPACKGENERALIZEDSELFADJOINTEIGENSOLVER_H
 #define EIGEN_ARPACKGENERALIZEDSELFADJOINTEIGENSOLVER_H
 
-#include <Eigen/Dense>
-
-namespace Eigen { 
+#include "../../../../Eigen/Dense"
 
-namespace internal {
-  template<typename Scalar, typename RealScalar> struct arpack_wrapper;
-  template<typename MatrixSolver, typename MatrixType, typename Scalar, bool BisSPD> struct OP;
-}
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
+namespace Eigen {
 
+namespace internal {
+template <typename Scalar, typename RealScalar>
+struct arpack_wrapper;
+template <typename MatrixSolver, typename MatrixType, typename Scalar, bool BisSPD>
+struct OP;
+}  // namespace internal
 
-template<typename MatrixType, typename MatrixSolver=SimplicialLLT<MatrixType>, bool BisSPD=false>
-class ArpackGeneralizedSelfAdjointEigenSolver
-{
-public:
-  //typedef typename MatrixSolver::MatrixType MatrixType;
+template <typename MatrixType, typename MatrixSolver = SimplicialLLT<MatrixType>, bool BisSPD = false>
+class ArpackGeneralizedSelfAdjointEigenSolver {
+ public:
+  // typedef typename MatrixSolver::MatrixType MatrixType;
 
   /** \brief Scalar type for matrices of type \p MatrixType. */
   typedef typename MatrixType::Scalar Scalar;
@@ -68,13 +55,12 @@ class ArpackGeneralizedSelfAdjointEigenSolver
    *
    */
   ArpackGeneralizedSelfAdjointEigenSolver()
-   : m_eivec(),
-     m_eivalues(),
-     m_isInitialized(false),
-     m_eigenvectorsOk(false),
-     m_nbrConverged(0),
-     m_nbrIterations(0)
-  { }
+      : m_eivec(),
+        m_eivalues(),
+        m_isInitialized(false),
+        m_eigenvectorsOk(false),
+        m_nbrConverged(0),
+        m_nbrIterations(0) {}
 
   /** \brief Constructor; computes generalized eigenvalues of given matrix with respect to another matrix.
    *
@@ -98,16 +84,15 @@ class ArpackGeneralizedSelfAdjointEigenSolver
    * \p options equals #ComputeEigenvectors.
    *
    */
-  ArpackGeneralizedSelfAdjointEigenSolver(const MatrixType& A, const MatrixType& B,
-                                          Index nbrEigenvalues, std::string eigs_sigma="LM",
-                               int options=ComputeEigenvectors, RealScalar tol=0.0)
-    : m_eivec(),
-      m_eivalues(),
-      m_isInitialized(false),
-      m_eigenvectorsOk(false),
-      m_nbrConverged(0),
-      m_nbrIterations(0)
-  {
+  ArpackGeneralizedSelfAdjointEigenSolver(const MatrixType &A, const MatrixType &B, Index nbrEigenvalues,
+                                          std::string eigs_sigma = "LM", int options = ComputeEigenvectors,
+                                          RealScalar tol = 0.0)
+      : m_eivec(),
+        m_eivalues(),
+        m_isInitialized(false),
+        m_eigenvectorsOk(false),
+        m_nbrConverged(0),
+        m_nbrIterations(0) {
     compute(A, B, nbrEigenvalues, eigs_sigma, options, tol);
   }
 
@@ -133,20 +118,17 @@ class ArpackGeneralizedSelfAdjointEigenSolver
    *
    */
 
-  ArpackGeneralizedSelfAdjointEigenSolver(const MatrixType& A,
-                                          Index nbrEigenvalues, std::string eigs_sigma="LM",
-                               int options=ComputeEigenvectors, RealScalar tol=0.0)
-    : m_eivec(),
-      m_eivalues(),
-      m_isInitialized(false),
-      m_eigenvectorsOk(false),
-      m_nbrConverged(0),
-      m_nbrIterations(0)
-  {
+  ArpackGeneralizedSelfAdjointEigenSolver(const MatrixType &A, Index nbrEigenvalues, std::string eigs_sigma = "LM",
+                                          int options = ComputeEigenvectors, RealScalar tol = 0.0)
+      : m_eivec(),
+        m_eivalues(),
+        m_isInitialized(false),
+        m_eigenvectorsOk(false),
+        m_nbrConverged(0),
+        m_nbrIterations(0) {
     compute(A, nbrEigenvalues, eigs_sigma, options, tol);
   }
 
-
   /** \brief Computes generalized eigenvalues / eigenvectors of given matrix using the external ARPACK library.
    *
    * \param[in]  A  Selfadjoint matrix whose eigendecomposition is to be computed.
@@ -170,10 +152,10 @@ class ArpackGeneralizedSelfAdjointEigenSolver
    * calling eigenvectors().
    *
    */
-  ArpackGeneralizedSelfAdjointEigenSolver& compute(const MatrixType& A, const MatrixType& B,
-                                                   Index nbrEigenvalues, std::string eigs_sigma="LM",
-                                        int options=ComputeEigenvectors, RealScalar tol=0.0);
-  
+  ArpackGeneralizedSelfAdjointEigenSolver &compute(const MatrixType &A, const MatrixType &B, Index nbrEigenvalues,
+                                                   std::string eigs_sigma = "LM", int options = ComputeEigenvectors,
+                                                   RealScalar tol = 0.0);
+
   /** \brief Computes eigenvalues / eigenvectors of given matrix using the external ARPACK library.
    *
    * \param[in]  A  Selfadjoint matrix whose eigendecomposition is to be computed.
@@ -196,10 +178,9 @@ class ArpackGeneralizedSelfAdjointEigenSolver
    * calling eigenvectors().
    *
    */
-  ArpackGeneralizedSelfAdjointEigenSolver& compute(const MatrixType& A,
-                                                   Index nbrEigenvalues, std::string eigs_sigma="LM",
-                                        int options=ComputeEigenvectors, RealScalar tol=0.0);
-
+  ArpackGeneralizedSelfAdjointEigenSolver &compute(const MatrixType &A, Index nbrEigenvalues,
+                                                   std::string eigs_sigma = "LM", int options = ComputeEigenvectors,
+                                                   RealScalar tol = 0.0);
 
   /** \brief Returns the eigenvectors of given matrix.
    *
@@ -220,8 +201,7 @@ class ArpackGeneralizedSelfAdjointEigenSolver
    *
    * \sa eigenvalues()
    */
-  const Matrix<Scalar, Dynamic, Dynamic>& eigenvectors() const
-  {
+  const Matrix<Scalar, Dynamic, Dynamic> &eigenvectors() const {
     eigen_assert(m_isInitialized && "ArpackGeneralizedSelfAdjointEigenSolver is not initialized.");
     eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
     return m_eivec;
@@ -242,8 +222,7 @@ class ArpackGeneralizedSelfAdjointEigenSolver
    *
    * \sa eigenvectors(), MatrixBase::eigenvalues()
    */
-  const Matrix<Scalar, Dynamic, 1>& eigenvalues() const
-  {
+  const Matrix<Scalar, Dynamic, 1> &eigenvalues() const {
     eigen_assert(m_isInitialized && "ArpackGeneralizedSelfAdjointEigenSolver is not initialized.");
     return m_eivalues;
   }
@@ -266,8 +245,7 @@ class ArpackGeneralizedSelfAdjointEigenSolver
    * \sa operatorInverseSqrt(),
    *     \ref MatrixFunctions_Module "MatrixFunctions Module"
    */
-  Matrix<Scalar, Dynamic, Dynamic> operatorSqrt() const
-  {
+  Matrix<Scalar, Dynamic, Dynamic> operatorSqrt() const {
     eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
     eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
     return m_eivec * m_eivalues.cwiseSqrt().asDiagonal() * m_eivec.adjoint();
@@ -291,8 +269,7 @@ class ArpackGeneralizedSelfAdjointEigenSolver
    * \sa operatorSqrt(), MatrixBase::inverse(),
    *     \ref MatrixFunctions_Module "MatrixFunctions Module"
    */
-  Matrix<Scalar, Dynamic, Dynamic> operatorInverseSqrt() const
-  {
+  Matrix<Scalar, Dynamic, Dynamic> operatorInverseSqrt() const {
     eigen_assert(m_isInitialized && "SelfAdjointEigenSolver is not initialized.");
     eigen_assert(m_eigenvectorsOk && "The eigenvectors have not been computed together with the eigenvalues.");
     return m_eivec * m_eivalues.cwiseInverse().cwiseSqrt().asDiagonal() * m_eivec.adjoint();
@@ -300,21 +277,18 @@ class ArpackGeneralizedSelfAdjointEigenSolver
 
   /** \brief Reports whether previous computation was successful.
    *
-   * \returns \c Success if computation was succesful, \c NoConvergence otherwise.
+   * \returns \c Success if computation was successful, \c NoConvergence otherwise.
    */
-  ComputationInfo info() const
-  {
+  ComputationInfo info() const {
     eigen_assert(m_isInitialized && "ArpackGeneralizedSelfAdjointEigenSolver is not initialized.");
     return m_info;
   }
 
-  size_t getNbrConvergedEigenValues() const
-  { return m_nbrConverged; }
+  size_t getNbrConvergedEigenValues() const { return m_nbrConverged; }
 
-  size_t getNbrIterations() const
-  { return m_nbrIterations; }
+  size_t getNbrIterations() const { return m_nbrIterations; }
 
-protected:
+ protected:
   Matrix<Scalar, Dynamic, Dynamic> m_eivec;
   Matrix<Scalar, Dynamic, 1> m_eivalues;
   ComputationInfo m_info;
@@ -325,35 +299,30 @@ class ArpackGeneralizedSelfAdjointEigenSolver
   size_t m_nbrIterations;
 };
 
+template <typename MatrixType, typename MatrixSolver, bool BisSPD>
+ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD> &
+ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD>::compute(const MatrixType &A,
+                                                                                   Index nbrEigenvalues,
+                                                                                   std::string eigs_sigma, int options,
+                                                                                   RealScalar tol) {
+  MatrixType B(0, 0);
+  compute(A, B, nbrEigenvalues, eigs_sigma, options, tol);
 
-
-
-
-template<typename MatrixType, typename MatrixSolver, bool BisSPD>
-ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD>&
-    ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD>
-::compute(const MatrixType& A, Index nbrEigenvalues,
-          std::string eigs_sigma, int options, RealScalar tol)
-{
-    MatrixType B(0,0);
-    compute(A, B, nbrEigenvalues, eigs_sigma, options, tol);
-    
-    return *this;
+  return *this;
 }
 
-
-template<typename MatrixType, typename MatrixSolver, bool BisSPD>
-ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD>&
-    ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD>
-::compute(const MatrixType& A, const MatrixType& B, Index nbrEigenvalues,
-          std::string eigs_sigma, int options, RealScalar tol)
-{
+template <typename MatrixType, typename MatrixSolver, bool BisSPD>
+ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD> &
+ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD>::compute(const MatrixType &A,
+                                                                                   const MatrixType &B,
+                                                                                   Index nbrEigenvalues,
+                                                                                   std::string eigs_sigma, int options,
+                                                                                   RealScalar tol) {
   eigen_assert(A.cols() == A.rows());
   eigen_assert(B.cols() == B.rows());
   eigen_assert(B.rows() == 0 || A.cols() == B.rows());
-  eigen_assert((options &~ (EigVecMask | GenEigMask)) == 0
-            && (options & EigVecMask) != EigVecMask
-            && "invalid option parameter");
+  eigen_assert((options & ~(EigVecMask | GenEigMask)) == 0 && (options & EigVecMask) != EigVecMask &&
+               "invalid option parameter");
 
   bool isBempty = (B.rows() == 0) || (B.cols() == 0);
 
@@ -368,54 +337,49 @@ ::compute(const MatrixType& A, const MatrixType& B, Index nbrEigenvalues,
   // User options: "LA", "SA", "SM", "LM", "BE"
   //
   char whch[3] = "LM";
-    
+
   // Specifies the shift if iparam[6] = { 3, 4, 5 }, not used if iparam[6] = { 1, 2 }
   //
   RealScalar sigma = 0.0;
 
-  if (eigs_sigma.length() >= 2 && isalpha(eigs_sigma[0]) && isalpha(eigs_sigma[1]))
-  {
-      eigs_sigma[0] = toupper(eigs_sigma[0]);
-      eigs_sigma[1] = toupper(eigs_sigma[1]);
+  if (eigs_sigma.length() >= 2 && isalpha(eigs_sigma[0]) && isalpha(eigs_sigma[1])) {
+    eigs_sigma[0] = toupper(eigs_sigma[0]);
+    eigs_sigma[1] = toupper(eigs_sigma[1]);
 
-      // In the following special case we're going to invert the problem, since solving
-      // for larger magnitude is much much faster
-      // i.e., if 'SM' is specified, we're going to really use 'LM', the default
-      //
-      if (eigs_sigma.substr(0,2) != "SM")
-      {
-          whch[0] = eigs_sigma[0];
-          whch[1] = eigs_sigma[1];
-      }
-  }
-  else
-  {
-      eigen_assert(false && "Specifying clustered eigenvalues is not yet supported!");
+    // In the following special case we're going to invert the problem, since solving
+    // for larger magnitude is much much faster
+    // i.e., if 'SM' is specified, we're going to really use 'LM', the default
+    //
+    if (eigs_sigma.substr(0, 2) != "SM") {
+      whch[0] = eigs_sigma[0];
+      whch[1] = eigs_sigma[1];
+    }
+  } else {
+    eigen_assert(false && "Specifying clustered eigenvalues is not yet supported!");
 
-      // If it's not scalar values, then the user may be explicitly
-      // specifying the sigma value to cluster the evs around
-      //
-      sigma = atof(eigs_sigma.c_str());
+    // If it's not scalar values, then the user may be explicitly
+    // specifying the sigma value to cluster the evs around
+    //
+    sigma = atof(eigs_sigma.c_str());
 
-      // If atof fails, it returns 0.0, which is a fine default
-      //
+    // If atof fails, it returns 0.0, which is a fine default
+    //
   }
 
   // "I" means normal eigenvalue problem, "G" means generalized
   //
   char bmat[2] = "I";
-  if (eigs_sigma.substr(0,2) == "SM" || !(isalpha(eigs_sigma[0]) && isalpha(eigs_sigma[1])) || (!isBempty && !BisSPD))
-      bmat[0] = 'G';
+  if (eigs_sigma.substr(0, 2) == "SM" || !(isalpha(eigs_sigma[0]) && isalpha(eigs_sigma[1])) || (!isBempty && !BisSPD))
+    bmat[0] = 'G';
 
   // Now we determine the mode to use
   //
   int mode = (bmat[0] == 'G') + 1;
-  if (eigs_sigma.substr(0,2) == "SM" || !(isalpha(eigs_sigma[0]) && isalpha(eigs_sigma[1])))
-  {
-      // We're going to use shift-and-invert mode, and basically find
-      // the largest eigenvalues of the inverse operator
-      //
-      mode = 3;
+  if (eigs_sigma.substr(0, 2) == "SM" || !(isalpha(eigs_sigma[0]) && isalpha(eigs_sigma[1]))) {
+    // We're going to use shift-and-invert mode, and basically find
+    // the largest eigenvalues of the inverse operator
+    //
+    mode = 3;
   }
 
   // The user-specified number of eigenvalues/vectors to compute
@@ -430,27 +394,27 @@ ::compute(const MatrixType& A, const MatrixType& B, Index nbrEigenvalues,
   // Note that this indicates that nev != n, and we cannot compute
   // all eigenvalues of a mtrix
   //
-  int ncv = std::min(std::max(2*nev, 20), n);
+  int ncv = std::min(std::max(2 * nev, 20), n);
 
   // The working n x ncv matrix, also store the final eigenvectors (if computed)
   //
-  Scalar *v = new Scalar[n*ncv];
+  Scalar *v = new Scalar[n * ncv];
   int ldv = n;
 
   // Working space
   //
-  Scalar *workd = new Scalar[3*n];
-  int lworkl = ncv*ncv+8*ncv; // Must be at least this length
+  Scalar *workd = new Scalar[3 * n];
+  int lworkl = ncv * ncv + 8 * ncv;  // Must be at least this length
   Scalar *workl = new Scalar[lworkl];
 
-  int *iparam= new int[11];
-  iparam[0] = 1; // 1 means we let ARPACK perform the shifts, 0 means we'd have to do it
-  iparam[2] = std::max(300, (int)std::ceil(2*n/std::max(ncv,1)));
-  iparam[6] = mode; // The mode, 1 is standard ev problem, 2 for generalized ev, 3 for shift-and-invert
+  int *iparam = new int[11];
+  iparam[0] = 1;  // 1 means we let ARPACK perform the shifts, 0 means we'd have to do it
+  iparam[2] = std::max(300, (int)std::ceil(2 * n / std::max(ncv, 1)));
+  iparam[6] = mode;  // The mode, 1 is standard ev problem, 2 for generalized ev, 3 for shift-and-invert
 
   // Used during reverse communicate to notify where arrays start
   //
-  int *ipntr = new int[11]; 
+  int *ipntr = new int[11];
 
   // Error codes are returned in here, initial value of 0 indicates a random initial
   // residual vector is used, any other values means resid contains the initial residual
@@ -459,99 +423,83 @@ ::compute(const MatrixType& A, const MatrixType& B, Index nbrEigenvalues,
   int info = 0;
 
   Scalar scale = 1.0;
-  //if (!isBempty)
+  // if (!isBempty)
   //{
-  //Scalar scale = B.norm() / std::sqrt(n);
-  //scale = std::pow(2, std::floor(std::log(scale+1)));
+  // Scalar scale = B.norm() / std::sqrt(n);
+  // scale = std::pow(2, std::floor(std::log(scale+1)));
   ////M /= scale;
-  //for (size_t i=0; i<(size_t)B.outerSize(); i++)
-  //    for (typename MatrixType::InnerIterator it(B, i); it; ++it)
-  //        it.valueRef() /= scale;
-  //}
+  // for (size_t i=0; i<(size_t)B.outerSize(); i++)
+  //     for (typename MatrixType::InnerIterator it(B, i); it; ++it)
+  //         it.valueRef() /= scale;
+  // }
 
   MatrixSolver OP;
-  if (mode == 1 || mode == 2)
-  {
-      if (!isBempty)
-          OP.compute(B);
-  }
-  else if (mode == 3)
-  {
-      if (sigma == 0.0)
-      {
-          OP.compute(A);
-      }
-      else
-      {
-          // Note: We will never enter here because sigma must be 0.0
-          //
-          if (isBempty)
-          {
-            MatrixType AminusSigmaB(A);
-            for (Index i=0; i<A.rows(); ++i)
-                AminusSigmaB.coeffRef(i,i) -= sigma;
-            
-            OP.compute(AminusSigmaB);
-          }
-          else
-          {
-              MatrixType AminusSigmaB = A - sigma * B;
-              OP.compute(AminusSigmaB);
-          }
+  if (mode == 1 || mode == 2) {
+    if (!isBempty) OP.compute(B);
+  } else if (mode == 3) {
+    if (sigma == 0.0) {
+      OP.compute(A);
+    } else {
+      // Note: We will never enter here because sigma must be 0.0
+      //
+      if (isBempty) {
+        MatrixType AminusSigmaB(A);
+        for (Index i = 0; i < A.rows(); ++i) AminusSigmaB.coeffRef(i, i) -= sigma;
+
+        OP.compute(AminusSigmaB);
+      } else {
+        MatrixType AminusSigmaB = A - sigma * B;
+        OP.compute(AminusSigmaB);
       }
+    }
+  }
+
+  if (!(mode == 1 && isBempty) && !(mode == 2 && isBempty) && OP.info() != Success) {
+    m_info = OP.info() delete[] v;
+    delete[] iparam;
+    delete[] ipntr;
+    delete[] workd;
+    delete[] workl;
+    delete[] resid;
+    m_isInitialized = false;
+    return *this;
   }
- 
-  if (!(mode == 1 && isBempty) && !(mode == 2 && isBempty) && OP.info() != Success)
-      std::cout << "Error factoring matrix" << std::endl;
-
-  do
-  {
-    internal::arpack_wrapper<Scalar, RealScalar>::saupd(&ido, bmat, &n, whch, &nev, &tol, resid, 
-                                                        &ncv, v, &ldv, iparam, ipntr, workd, workl,
-                                                        &lworkl, &info);
-
-    if (ido == -1 || ido == 1)
-    {
-      Scalar *in  = workd + ipntr[0] - 1;
+
+  do {
+    internal::arpack_wrapper<Scalar, RealScalar>::saupd(&ido, bmat, &n, whch, &nev, &tol, resid, &ncv, v, &ldv, iparam,
+                                                        ipntr, workd, workl, &lworkl, &info);
+
+    if (ido == -1 || ido == 1) {
+      Scalar *in = workd + ipntr[0] - 1;
       Scalar *out = workd + ipntr[1] - 1;
 
-      if (ido == 1 && mode != 2)
-      {
-          Scalar *out2 = workd + ipntr[2] - 1;
-          if (isBempty || mode == 1)
-            Matrix<Scalar, Dynamic, 1>::Map(out2, n) = Matrix<Scalar, Dynamic, 1>::Map(in, n);
-          else
-            Matrix<Scalar, Dynamic, 1>::Map(out2, n) = B * Matrix<Scalar, Dynamic, 1>::Map(in, n);
-          
-          in = workd + ipntr[2] - 1;
+      if (ido == 1 && mode != 2) {
+        Scalar *out2 = workd + ipntr[2] - 1;
+        if (isBempty || mode == 1)
+          Matrix<Scalar, Dynamic, 1>::Map(out2, n) = Matrix<Scalar, Dynamic, 1>::Map(in, n);
+        else
+          Matrix<Scalar, Dynamic, 1>::Map(out2, n) = B * Matrix<Scalar, Dynamic, 1>::Map(in, n);
+
+        in = workd + ipntr[2] - 1;
       }
 
-      if (mode == 1)
-      {
-        if (isBempty)
-        {
+      if (mode == 1) {
+        if (isBempty) {
           // OP = A
           //
           Matrix<Scalar, Dynamic, 1>::Map(out, n) = A * Matrix<Scalar, Dynamic, 1>::Map(in, n);
-        }
-        else
-        {
+        } else {
           // OP = L^{-1}AL^{-T}
           //
           internal::OP<MatrixSolver, MatrixType, Scalar, BisSPD>::applyOP(OP, A, n, in, out);
         }
-      }
-      else if (mode == 2)
-      {
-        if (ido == 1)
-          Matrix<Scalar, Dynamic, 1>::Map(in, n)  = A * Matrix<Scalar, Dynamic, 1>::Map(in, n);
-        
+      } else if (mode == 2) {
+        if (ido == 1) Matrix<Scalar, Dynamic, 1>::Map(in, n) = A * Matrix<Scalar, Dynamic, 1>::Map(in, n);
+
         // OP = B^{-1} A
         //
         Matrix<Scalar, Dynamic, 1>::Map(out, n) = OP.solve(Matrix<Scalar, Dynamic, 1>::Map(in, n));
-      }
-      else if (mode == 3)
-      {
+      } else if (mode == 3) {
         // OP = (A-\sigmaB)B (\sigma could be 0, and B could be I)
         // The B * in is already computed and stored at in if ido == 1
         //
@@ -560,10 +508,8 @@ ::compute(const MatrixType& A, const MatrixType& B, Index nbrEigenvalues,
         else
           Matrix<Scalar, Dynamic, 1>::Map(out, n) = OP.solve(B * Matrix<Scalar, Dynamic, 1>::Map(in, n));
       }
-    }
-    else if (ido == 2)
-    {
-      Scalar *in  = workd + ipntr[0] - 1;
+    } else if (ido == 2) {
+      Scalar *in = workd + ipntr[0] - 1;
       Scalar *out = workd + ipntr[1] - 1;
 
       if (isBempty || mode == 1)
@@ -581,15 +527,14 @@ ::compute(const MatrixType& A, const MatrixType& B, Index nbrEigenvalues,
     m_info = InvalidInput;
   else if (info != 0)
     eigen_assert(false && "Unknown ARPACK return value!");
-  else
-  {
+  else {
     // Do we compute eigenvectors or not?
     //
     int rvec = (options & ComputeEigenvectors) == ComputeEigenvectors;
 
     // "A" means "All", use "S" to choose specific eigenvalues (not yet supported in ARPACK))
     //
-    char howmny[2] = "A"; 
+    char howmny[2] = "A";
 
     // if howmny == "S", specifies the eigenvalues to compute (not implemented in ARPACK)
     //
@@ -599,23 +544,20 @@ ::compute(const MatrixType& A, const MatrixType& B, Index nbrEigenvalues,
     //
     m_eivalues.resize(nev, 1);
 
-    internal::arpack_wrapper<Scalar, RealScalar>::seupd(&rvec, howmny, select, m_eivalues.data(), v, &ldv,
-                                                        &sigma, bmat, &n, whch, &nev, &tol, resid, &ncv,
-                                                        v, &ldv, iparam, ipntr, workd, workl, &lworkl, &info);
+    internal::arpack_wrapper<Scalar, RealScalar>::seupd(&rvec, howmny, select, m_eivalues.data(), v, &ldv, &sigma, bmat,
+                                                        &n, whch, &nev, &tol, resid, &ncv, v, &ldv, iparam, ipntr,
+                                                        workd, workl, &lworkl, &info);
 
     if (info == -14)
       m_info = NoConvergence;
     else if (info != 0)
       m_info = InvalidInput;
-    else
-    {
-      if (rvec)
-      {
+    else {
+      if (rvec) {
         m_eivec.resize(A.rows(), nev);
-        for (int i=0; i<nev; i++)
-          for (int j=0; j<n; j++)
-            m_eivec(j,i) = v[i*n+j] / scale;
-      
+        for (int i = 0; i < nev; i++)
+          for (int j = 0; j < n; j++) m_eivec(j, i) = v[i * n + j] / scale;
+
         if (mode == 1 && !isBempty && BisSPD)
           internal::OP<MatrixSolver, MatrixType, Scalar, BisSPD>::project(OP, n, nev, m_eivec.data());
 
@@ -623,138 +565,105 @@ ::compute(const MatrixType& A, const MatrixType& B, Index nbrEigenvalues,
       }
 
       m_nbrIterations = iparam[2];
-      m_nbrConverged  = iparam[4];
+      m_nbrConverged = iparam[4];
 
       m_info = Success;
     }
 
-    delete select;
+    delete[] select;
   }
 
-  delete v;
-  delete iparam;
-  delete ipntr;
-  delete workd;
-  delete workl;
-  delete resid;
+  delete[] v;
+  delete[] iparam;
+  delete[] ipntr;
+  delete[] workd;
+  delete[] workl;
+  delete[] resid;
 
-  m_isInitialized = true;
+  m_isInitialized = (m_info == Success);
 
   return *this;
 }
 
-
 // Single precision
 //
-extern "C" void ssaupd_(int *ido, char *bmat, int *n, char *which,
-    int *nev, float *tol, float *resid, int *ncv,
-    float *v, int *ldv, int *iparam, int *ipntr,
-    float *workd, float *workl, int *lworkl,
-    int *info);
-
-extern "C" void sseupd_(int *rvec, char *All, int *select, float *d,
-    float *z, int *ldz, float *sigma, 
-    char *bmat, int *n, char *which, int *nev,
-    float *tol, float *resid, int *ncv, float *v,
-    int *ldv, int *iparam, int *ipntr, float *workd,
-    float *workl, int *lworkl, int *ierr);
+extern "C" void ssaupd_(int *ido, char *bmat, int *n, char *which, int *nev, float *tol, float *resid, int *ncv,
+                        float *v, int *ldv, int *iparam, int *ipntr, float *workd, float *workl, int *lworkl,
+                        int *info);
+
+extern "C" void sseupd_(int *rvec, char *All, int *select, float *d, float *z, int *ldz, float *sigma, char *bmat,
+                        int *n, char *which, int *nev, float *tol, float *resid, int *ncv, float *v, int *ldv,
+                        int *iparam, int *ipntr, float *workd, float *workl, int *lworkl, int *ierr);
 
 // Double precision
 //
-extern "C" void dsaupd_(int *ido, char *bmat, int *n, char *which,
-    int *nev, double *tol, double *resid, int *ncv,
-    double *v, int *ldv, int *iparam, int *ipntr,
-    double *workd, double *workl, int *lworkl,
-    int *info);
-
-extern "C" void dseupd_(int *rvec, char *All, int *select, double *d,
-    double *z, int *ldz, double *sigma, 
-    char *bmat, int *n, char *which, int *nev,
-    double *tol, double *resid, int *ncv, double *v,
-    int *ldv, int *iparam, int *ipntr, double *workd,
-    double *workl, int *lworkl, int *ierr);
+extern "C" void dsaupd_(int *ido, char *bmat, int *n, char *which, int *nev, double *tol, double *resid, int *ncv,
+                        double *v, int *ldv, int *iparam, int *ipntr, double *workd, double *workl, int *lworkl,
+                        int *info);
 
+extern "C" void dseupd_(int *rvec, char *All, int *select, double *d, double *z, int *ldz, double *sigma, char *bmat,
+                        int *n, char *which, int *nev, double *tol, double *resid, int *ncv, double *v, int *ldv,
+                        int *iparam, int *ipntr, double *workd, double *workl, int *lworkl, int *ierr);
 
 namespace internal {
 
-template<typename Scalar, typename RealScalar> struct arpack_wrapper
-{
-  static inline void saupd(int *ido, char *bmat, int *n, char *which,
-      int *nev, RealScalar *tol, Scalar *resid, int *ncv,
-      Scalar *v, int *ldv, int *iparam, int *ipntr,
-      Scalar *workd, Scalar *workl, int *lworkl, int *info)
-  { 
+template <typename Scalar, typename RealScalar>
+struct arpack_wrapper {
+  static inline void saupd(int *ido, char *bmat, int *n, char *which, int *nev, RealScalar *tol, Scalar *resid,
+                           int *ncv, Scalar *v, int *ldv, int *iparam, int *ipntr, Scalar *workd, Scalar *workl,
+                           int *lworkl, int *info) {
     EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL)
   }
 
-  static inline void seupd(int *rvec, char *All, int *select, Scalar *d,
-      Scalar *z, int *ldz, RealScalar *sigma,
-      char *bmat, int *n, char *which, int *nev,
-      RealScalar *tol, Scalar *resid, int *ncv, Scalar *v,
-      int *ldv, int *iparam, int *ipntr, Scalar *workd,
-      Scalar *workl, int *lworkl, int *ierr)
-  {
+  static inline void seupd(int *rvec, char *All, int *select, Scalar *d, Scalar *z, int *ldz, RealScalar *sigma,
+                           char *bmat, int *n, char *which, int *nev, RealScalar *tol, Scalar *resid, int *ncv,
+                           Scalar *v, int *ldv, int *iparam, int *ipntr, Scalar *workd, Scalar *workl, int *lworkl,
+                           int *ierr) {
     EIGEN_STATIC_ASSERT(!NumTraits<Scalar>::IsComplex, NUMERIC_TYPE_MUST_BE_REAL)
   }
 };
 
-template <> struct arpack_wrapper<float, float>
-{
-  static inline void saupd(int *ido, char *bmat, int *n, char *which,
-      int *nev, float *tol, float *resid, int *ncv,
-      float *v, int *ldv, int *iparam, int *ipntr,
-      float *workd, float *workl, int *lworkl, int *info)
-  {
+template <>
+struct arpack_wrapper<float, float> {
+  static inline void saupd(int *ido, char *bmat, int *n, char *which, int *nev, float *tol, float *resid, int *ncv,
+                           float *v, int *ldv, int *iparam, int *ipntr, float *workd, float *workl, int *lworkl,
+                           int *info) {
     ssaupd_(ido, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info);
   }
 
-  static inline void seupd(int *rvec, char *All, int *select, float *d,
-      float *z, int *ldz, float *sigma,
-      char *bmat, int *n, char *which, int *nev,
-      float *tol, float *resid, int *ncv, float *v,
-      int *ldv, int *iparam, int *ipntr, float *workd,
-      float *workl, int *lworkl, int *ierr)
-  {
-    sseupd_(rvec, All, select, d, z, ldz, sigma, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr,
-        workd, workl, lworkl, ierr);
+  static inline void seupd(int *rvec, char *All, int *select, float *d, float *z, int *ldz, float *sigma, char *bmat,
+                           int *n, char *which, int *nev, float *tol, float *resid, int *ncv, float *v, int *ldv,
+                           int *iparam, int *ipntr, float *workd, float *workl, int *lworkl, int *ierr) {
+    sseupd_(rvec, All, select, d, z, ldz, sigma, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd,
+            workl, lworkl, ierr);
   }
 };
 
-template <> struct arpack_wrapper<double, double>
-{
-  static inline void saupd(int *ido, char *bmat, int *n, char *which,
-      int *nev, double *tol, double *resid, int *ncv,
-      double *v, int *ldv, int *iparam, int *ipntr,
-      double *workd, double *workl, int *lworkl, int *info)
-  {
+template <>
+struct arpack_wrapper<double, double> {
+  static inline void saupd(int *ido, char *bmat, int *n, char *which, int *nev, double *tol, double *resid, int *ncv,
+                           double *v, int *ldv, int *iparam, int *ipntr, double *workd, double *workl, int *lworkl,
+                           int *info) {
     dsaupd_(ido, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd, workl, lworkl, info);
   }
 
-  static inline void seupd(int *rvec, char *All, int *select, double *d,
-      double *z, int *ldz, double *sigma,
-      char *bmat, int *n, char *which, int *nev,
-      double *tol, double *resid, int *ncv, double *v,
-      int *ldv, int *iparam, int *ipntr, double *workd,
-      double *workl, int *lworkl, int *ierr)
-  {
-    dseupd_(rvec, All, select, d, v, ldv, sigma, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr,
-        workd, workl, lworkl, ierr);
+  static inline void seupd(int *rvec, char *All, int *select, double *d, double *z, int *ldz, double *sigma, char *bmat,
+                           int *n, char *which, int *nev, double *tol, double *resid, int *ncv, double *v, int *ldv,
+                           int *iparam, int *ipntr, double *workd, double *workl, int *lworkl, int *ierr) {
+    dseupd_(rvec, All, select, d, v, ldv, sigma, bmat, n, which, nev, tol, resid, ncv, v, ldv, iparam, ipntr, workd,
+            workl, lworkl, ierr);
   }
 };
 
-
-template<typename MatrixSolver, typename MatrixType, typename Scalar, bool BisSPD>
-struct OP
-{
-    static inline void applyOP(MatrixSolver &OP, const MatrixType &A, int n, Scalar *in, Scalar *out);
-    static inline void project(MatrixSolver &OP, int n, int k, Scalar *vecs);
+template <typename MatrixSolver, typename MatrixType, typename Scalar, bool BisSPD>
+struct OP {
+  static inline void applyOP(MatrixSolver &OP, const MatrixType &A, int n, Scalar *in, Scalar *out);
+  static inline void project(MatrixSolver &OP, int n, int k, Scalar *vecs);
 };
 
-template<typename MatrixSolver, typename MatrixType, typename Scalar>
-struct OP<MatrixSolver, MatrixType, Scalar, true>
-{
-  static inline void applyOP(MatrixSolver &OP, const MatrixType &A, int n, Scalar *in, Scalar *out)
-{
+template <typename MatrixSolver, typename MatrixType, typename Scalar>
+struct OP<MatrixSolver, MatrixType, Scalar, true> {
+  static inline void applyOP(MatrixSolver &OP, const MatrixType &A, int n, Scalar *in, Scalar *out) {
     // OP = L^{-1} A L^{-T}  (B = LL^T)
     //
     // First solve L^T out = in
@@ -770,36 +679,31 @@ struct OP<MatrixSolver, MatrixType, Scalar, true>
     //
     Matrix<Scalar, Dynamic, 1>::Map(out, n) = OP.permutationP() * Matrix<Scalar, Dynamic, 1>::Map(out, n);
     Matrix<Scalar, Dynamic, 1>::Map(out, n) = OP.matrixL().solve(Matrix<Scalar, Dynamic, 1>::Map(out, n));
-}
+  }
 
-  static inline void project(MatrixSolver &OP, int n, int k, Scalar *vecs)
-{
+  static inline void project(MatrixSolver &OP, int n, int k, Scalar *vecs) {
     // Solve L^T out = in
     //
-    Matrix<Scalar, Dynamic, Dynamic>::Map(vecs, n, k) = OP.matrixU().solve(Matrix<Scalar, Dynamic, Dynamic>::Map(vecs, n, k));
-    Matrix<Scalar, Dynamic, Dynamic>::Map(vecs, n, k) = OP.permutationPinv() * Matrix<Scalar, Dynamic, Dynamic>::Map(vecs, n, k);
-}
-
+    Matrix<Scalar, Dynamic, Dynamic>::Map(vecs, n, k) =
+        OP.matrixU().solve(Matrix<Scalar, Dynamic, Dynamic>::Map(vecs, n, k));
+    Matrix<Scalar, Dynamic, Dynamic>::Map(vecs, n, k) =
+        OP.permutationPinv() * Matrix<Scalar, Dynamic, Dynamic>::Map(vecs, n, k);
+  }
 };
 
-template<typename MatrixSolver, typename MatrixType, typename Scalar>
-struct OP<MatrixSolver, MatrixType, Scalar, false>
-{
-  static inline void applyOP(MatrixSolver &OP, const MatrixType &A, int n, Scalar *in, Scalar *out)
-{
+template <typename MatrixSolver, typename MatrixType, typename Scalar>
+struct OP<MatrixSolver, MatrixType, Scalar, false> {
+  static inline void applyOP(MatrixSolver &OP, const MatrixType &A, int n, Scalar *in, Scalar *out) {
     eigen_assert(false && "Should never be in here...");
-}
+  }
 
-  static inline void project(MatrixSolver &OP, int n, int k, Scalar *vecs)
-{
+  static inline void project(MatrixSolver &OP, int n, int k, Scalar *vecs) {
     eigen_assert(false && "Should never be in here...");
-}
-
+  }
 };
 
-} // end namespace internal
-
-} // end namespace Eigen
+}  // end namespace internal
 
-#endif // EIGEN_ARPACKSELFADJOINTEIGENSOLVER_H
+}  // end namespace Eigen
 
+#endif  // EIGEN_ARPACKSELFADJOINTEIGENSOLVER_H
diff --git a/inst/include/unsupported/Eigen/src/Eigenvalues/InternalHeaderCheck.h b/inst/include/unsupported/Eigen/src/Eigenvalues/InternalHeaderCheck.h
new file mode 100644
index 00000000..c00cb576
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/Eigenvalues/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_EIGENVALUES_MODULE_H
+#error "Please include unsupported/Eigen/Eigenvalues instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/unsupported/Eigen/src/EulerAngles/CMakeLists.txt b/inst/include/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
new file mode 100644
index 00000000..22088eb3
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(GLOB Eigen_EulerAngles_SRCS "*.h")
+
+install(FILES
+  ${Eigen_EulerAngles_SRCS}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/EulerAngles COMPONENT Devel
+  )
diff --git a/inst/include/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/inst/include/unsupported/Eigen/src/EulerAngles/EulerAngles.h
new file mode 100644
index 00000000..b0b36171
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/EulerAngles/EulerAngles.h
@@ -0,0 +1,350 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERANGLESCLASS_H  // TODO: Fix previous "EIGEN_EULERANGLES_H" definition?
+#define EIGEN_EULERANGLESCLASS_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+/** \class EulerAngles
+ *
+ * \ingroup EulerAngles_Module
+ *
+ * \brief Represents a rotation in a 3 dimensional space as three Euler angles.
+ *
+ * Euler rotation is a set of three rotation of three angles over three fixed axes, defined by the EulerSystem given as
+ * a template parameter.
+ *
+ * Here is how intrinsic Euler angles works:
+ *  - first, rotate the axes system over the alpha axis in angle alpha
+ *  - then, rotate the axes system over the beta axis(which was rotated in the first stage) in angle beta
+ *  - then, rotate the axes system over the gamma axis(which was rotated in the two stages above) in angle gamma
+ *
+ * \note This class support only intrinsic Euler angles for simplicity,
+ *  see EulerSystem how to easily overcome this for extrinsic systems.
+ *
+ * ### Rotation representation and conversions ###
+ *
+ * It has been proved(see Wikipedia link below) that every rotation can be represented
+ *  by Euler angles, but there is no single representation (e.g. unlike rotation matrices).
+ * Therefore, you can convert from Eigen rotation and to them
+ *  (including rotation matrices, which is not called "rotations" by Eigen design).
+ *
+ * Euler angles usually used for:
+ *  - convenient human representation of rotation, especially in interactive GUI.
+ *  - gimbal systems and robotics
+ *  - efficient encoding(i.e. 3 floats only) of rotation for network protocols.
+ *
+ * However, Euler angles are slow comparing to quaternion or matrices,
+ *  because their unnatural math definition, although it's simple for human.
+ * To overcome this, this class provide easy movement from the math friendly representation
+ *  to the human friendly representation, and vise-versa.
+ *
+ * All the user need to do is a safe simple C++ type conversion,
+ *  and this class take care for the math.
+ * Additionally, some axes related computation is done in compile time.
+ *
+ * #### Euler angles ranges in conversions ####
+ * Rotations representation as EulerAngles are not single (unlike matrices),
+ *  and even have infinite EulerAngles representations.<BR>
+ * For example, add or subtract 2*PI from either angle of EulerAngles
+ *  and you'll get the same rotation.
+ * This is the general reason for infinite representation,
+ *  but it's not the only general reason for not having a single representation.
+ *
+ * When converting rotation to EulerAngles, this class convert it to specific ranges
+ * When converting some rotation to EulerAngles, the rules for ranges are as follow:
+ * - If the rotation we converting from is an EulerAngles
+ *  (even when it represented as RotationBase explicitly), angles ranges are __undefined__.
+ * - otherwise, alpha and gamma angles will be in the range [-PI, PI].<BR>
+ *   As for Beta angle:
+ *    - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
+ *    - otherwise:
+ *      - If the beta axis is positive, the beta angle will be in the range [0, PI]
+ *      - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
+ *
+ * \sa EulerAngles(const MatrixBase<Derived>&)
+ * \sa EulerAngles(const RotationBase<Derived, 3>&)
+ *
+ * ### Convenient user typedefs ###
+ *
+ * Convenient typedefs for EulerAngles exist for float and double scalar,
+ *  in a form of EulerAngles{A}{B}{C}{scalar},
+ *  e.g. \ref EulerAnglesXYZd, \ref EulerAnglesZYZf.
+ *
+ * Only for positive axes{+x,+y,+z} Euler systems are have convenient typedef.
+ * If you need negative axes{-x,-y,-z}, it is recommended to create you own typedef with
+ *  a word that represent what you need.
+ *
+ * ### Example ###
+ *
+ * \include EulerAngles.cpp
+ * Output: \verbinclude EulerAngles.out
+ *
+ * ### Additional reading ###
+ *
+ * If you're want to get more idea about how Euler system work in Eigen see EulerSystem.
+ *
+ * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles
+ *
+ * \tparam Scalar_ the scalar type, i.e. the type of the angles.
+ *
+ * \tparam _System the EulerSystem to use, which represents the axes of rotation.
+ */
+template <typename Scalar_, class _System>
+class EulerAngles : public RotationBase<EulerAngles<Scalar_, _System>, 3> {
+ public:
+  typedef RotationBase<EulerAngles<Scalar_, _System>, 3> Base;
+
+  /** the scalar type of the angles */
+  typedef Scalar_ Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+  /** the EulerSystem to use, which represents the axes of rotation. */
+  typedef _System System;
+
+  typedef Matrix<Scalar, 3, 3> Matrix3;      /*!< the equivalent rotation matrix type */
+  typedef Matrix<Scalar, 3, 1> Vector3;      /*!< the equivalent 3 dimension vector type */
+  typedef Quaternion<Scalar> QuaternionType; /*!< the equivalent quaternion type */
+  typedef AngleAxis<Scalar> AngleAxisType;   /*!< the equivalent angle-axis type */
+
+  /** \returns the axis vector of the first (alpha) rotation */
+  static Vector3 AlphaAxisVector() {
+    const Vector3& u = Vector3::Unit(System::AlphaAxisAbs - 1);
+    return System::IsAlphaOpposite ? -u : u;
+  }
+
+  /** \returns the axis vector of the second (beta) rotation */
+  static Vector3 BetaAxisVector() {
+    const Vector3& u = Vector3::Unit(System::BetaAxisAbs - 1);
+    return System::IsBetaOpposite ? -u : u;
+  }
+
+  /** \returns the axis vector of the third (gamma) rotation */
+  static Vector3 GammaAxisVector() {
+    const Vector3& u = Vector3::Unit(System::GammaAxisAbs - 1);
+    return System::IsGammaOpposite ? -u : u;
+  }
+
+ private:
+  Vector3 m_angles;
+
+ public:
+  /** Default constructor without initialization. */
+  EulerAngles() {}
+  /** Constructs and initialize an EulerAngles (\p alpha, \p beta, \p gamma). */
+  EulerAngles(const Scalar& alpha, const Scalar& beta, const Scalar& gamma) : m_angles(alpha, beta, gamma) {}
+
+  // TODO: Test this constructor
+  /** Constructs and initialize an EulerAngles from the array data {alpha, beta, gamma} */
+  explicit EulerAngles(const Scalar* data) : m_angles(data) {}
+
+  /** Constructs and initializes an EulerAngles from either:
+   *  - a 3x3 rotation matrix expression(i.e. pure orthogonal matrix with determinant of +1),
+   *  - a 3D vector expression representing Euler angles.
+   *
+   * \note If \p other is a 3x3 rotation matrix, the angles range rules will be as follow:<BR>
+   *  Alpha and gamma angles will be in the range [-PI, PI].<BR>
+   *  As for Beta angle:
+   *   - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
+   *   - otherwise:
+   *     - If the beta axis is positive, the beta angle will be in the range [0, PI]
+   *     - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
+   */
+  template <typename Derived>
+  explicit EulerAngles(const MatrixBase<Derived>& other) {
+    *this = other;
+  }
+
+  /** Constructs and initialize Euler angles from a rotation \p rot.
+   *
+   * \note If \p rot is an EulerAngles (even when it represented as RotationBase explicitly),
+   *  angles ranges are __undefined__.
+   *  Otherwise, alpha and gamma angles will be in the range [-PI, PI].<BR>
+   *  As for Beta angle:
+   *   - If the system is Tait-Bryan, the beta angle will be in the range [-PI/2, PI/2].
+   *   - otherwise:
+   *     - If the beta axis is positive, the beta angle will be in the range [0, PI]
+   *     - If the beta axis is negative, the beta angle will be in the range [-PI, 0]
+   */
+  template <typename Derived>
+  EulerAngles(const RotationBase<Derived, 3>& rot) {
+    System::CalcEulerAngles(*this, rot.toRotationMatrix());
+  }
+
+  /*EulerAngles(const QuaternionType& q)
+  {
+    // TODO: Implement it in a faster way for quaternions
+    // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/
+    //  we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below)
+    // Currently we compute all matrix cells from quaternion.
+
+    // Special case only for ZYX
+    //Scalar y2 = q.y() * q.y();
+    //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z())));
+    //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x()));
+    //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2)));
+  }*/
+
+  /** \returns The angle values stored in a vector (alpha, beta, gamma). */
+  const Vector3& angles() const { return m_angles; }
+  /** \returns A read-write reference to the angle values stored in a vector (alpha, beta, gamma). */
+  Vector3& angles() { return m_angles; }
+
+  /** \returns The value of the first angle. */
+  Scalar alpha() const { return m_angles[0]; }
+  /** \returns A read-write reference to the angle of the first angle. */
+  Scalar& alpha() { return m_angles[0]; }
+
+  /** \returns The value of the second angle. */
+  Scalar beta() const { return m_angles[1]; }
+  /** \returns A read-write reference to the angle of the second angle. */
+  Scalar& beta() { return m_angles[1]; }
+
+  /** \returns The value of the third angle. */
+  Scalar gamma() const { return m_angles[2]; }
+  /** \returns A read-write reference to the angle of the third angle. */
+  Scalar& gamma() { return m_angles[2]; }
+
+  /** \returns The Euler angles rotation inverse (which is as same as the negative),
+   *  (-alpha, -beta, -gamma).
+   */
+  EulerAngles inverse() const {
+    EulerAngles res;
+    res.m_angles = -m_angles;
+    return res;
+  }
+
+  /** \returns The Euler angles rotation negative (which is as same as the inverse),
+   *  (-alpha, -beta, -gamma).
+   */
+  EulerAngles operator-() const { return inverse(); }
+
+  /** Set \c *this from either:
+   *  - a 3x3 rotation matrix expression(i.e. pure orthogonal matrix with determinant of +1),
+   *  - a 3D vector expression representing Euler angles.
+   *
+   * See EulerAngles(const MatrixBase<Derived, 3>&) for more information about
+   *  angles ranges output.
+   */
+  template <class Derived>
+  EulerAngles& operator=(const MatrixBase<Derived>& other) {
+    EIGEN_STATIC_ASSERT(
+        (internal::is_same<Scalar, typename Derived::Scalar>::value),
+        YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY)
+
+    internal::eulerangles_assign_impl<System, Derived>::run(*this, other.derived());
+    return *this;
+  }
+
+  // TODO: Assign and construct from another EulerAngles (with different system)
+
+  /** Set \c *this from a rotation.
+   *
+   * See EulerAngles(const RotationBase<Derived, 3>&) for more information about
+   *  angles ranges output.
+   */
+  template <typename Derived>
+  EulerAngles& operator=(const RotationBase<Derived, 3>& rot) {
+    System::CalcEulerAngles(*this, rot.toRotationMatrix());
+    return *this;
+  }
+
+  /** \returns \c true if \c *this is approximately equal to \a other, within the precision
+   * determined by \a prec.
+   *
+   * \sa MatrixBase::isApprox() */
+  bool isApprox(const EulerAngles& other, const RealScalar& prec = NumTraits<Scalar>::dummy_precision()) const {
+    return angles().isApprox(other.angles(), prec);
+  }
+
+  /** \returns an equivalent 3x3 rotation matrix. */
+  Matrix3 toRotationMatrix() const {
+    // TODO: Calc it faster
+    return static_cast<QuaternionType>(*this).toRotationMatrix();
+  }
+
+  /** Convert the Euler angles to quaternion. */
+  operator QuaternionType() const {
+    return AngleAxisType(alpha(), AlphaAxisVector()) * AngleAxisType(beta(), BetaAxisVector()) *
+           AngleAxisType(gamma(), GammaAxisVector());
+  }
+
+  friend std::ostream& operator<<(std::ostream& s, const EulerAngles<Scalar, System>& eulerAngles) {
+    s << eulerAngles.angles().transpose();
+    return s;
+  }
+
+  /** \returns \c *this with scalar type casted to \a NewScalarType */
+  template <typename NewScalarType>
+  EulerAngles<NewScalarType, System> cast() const {
+    EulerAngles<NewScalarType, System> e;
+    e.angles() = angles().template cast<NewScalarType>();
+    return e;
+  }
+};
+
+#define EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(AXES, SCALAR_TYPE, SCALAR_POSTFIX) \
+  /** \ingroup EulerAngles_Module */                                         \
+  typedef EulerAngles<SCALAR_TYPE, EulerSystem##AXES> EulerAngles##AXES##SCALAR_POSTFIX;
+
+#define EIGEN_EULER_ANGLES_TYPEDEFS(SCALAR_TYPE, SCALAR_POSTFIX)      \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XYZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XYX, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XZY, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XZX, SCALAR_TYPE, SCALAR_POSTFIX) \
+                                                                      \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YZX, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YZY, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YXZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YXY, SCALAR_TYPE, SCALAR_POSTFIX) \
+                                                                      \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZXY, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZXZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZYX, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZYZ, SCALAR_TYPE, SCALAR_POSTFIX)
+
+EIGEN_EULER_ANGLES_TYPEDEFS(float, f)
+EIGEN_EULER_ANGLES_TYPEDEFS(double, d)
+
+// Specifically-referenced instantiations.
+/** \typedef EulerAnglesXYZd
+ * \ingroup EulerAngles_Module
+ * Euler XYZ system with type double entries.
+ */
+/** \typedef EulerAnglesZYZf
+ * \ingroup EulerAngles_Module
+ * Euler ZYZ system with type float entries.
+ */
+
+namespace internal {
+template <typename Scalar_, class _System>
+struct traits<EulerAngles<Scalar_, _System> > {
+  typedef Scalar_ Scalar;
+};
+
+// set from a rotation matrix
+template <class System, class Other>
+struct eulerangles_assign_impl<System, Other, 3, 3> {
+  typedef typename Other::Scalar Scalar;
+  static void run(EulerAngles<Scalar, System>& e, const Other& m) { System::CalcEulerAngles(e, m); }
+};
+
+// set from a vector of Euler angles
+template <class System, class Other>
+struct eulerangles_assign_impl<System, Other, 3, 1> {
+  typedef typename Other::Scalar Scalar;
+  static void run(EulerAngles<Scalar, System>& e, const Other& vec) { e.angles() = vec; }
+};
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_EULERANGLESCLASS_H
diff --git a/inst/include/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/inst/include/unsupported/Eigen/src/EulerAngles/EulerSystem.h
new file mode 100644
index 00000000..bf8a3345
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/EulerAngles/EulerSystem.h
@@ -0,0 +1,284 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERSYSTEM_H
+#define EIGEN_EULERSYSTEM_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+// Forward declarations
+template <typename Scalar_, class _System>
+class EulerAngles;
+
+namespace internal {
+// TODO: Add this trait to the Eigen internal API?
+template <int Num, bool IsPositive = (Num > 0)>
+struct Abs {
+  enum { value = Num };
+};
+
+template <int Num>
+struct Abs<Num, false> {
+  enum { value = -Num };
+};
+
+template <int Axis>
+struct IsValidAxis {
+  enum { value = Axis != 0 && Abs<Axis>::value <= 3 };
+};
+
+template <typename System, typename Other, int OtherRows = Other::RowsAtCompileTime,
+          int OtherCols = Other::ColsAtCompileTime>
+struct eulerangles_assign_impl;
+}  // namespace internal
+
+#define EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(COND, MSG) typedef char static_assertion_##MSG[(COND) ? 1 : -1]
+
+/** \brief Representation of a fixed signed rotation axis for EulerSystem.
+ *
+ * \ingroup EulerAngles_Module
+ *
+ * Values here represent:
+ *  - The axis of the rotation: X, Y or Z.
+ *  - The sign (i.e. direction of the rotation along the axis): positive(+) or negative(-)
+ *
+ * Therefore, this could express all the axes {+X,+Y,+Z,-X,-Y,-Z}
+ *
+ * For positive axis, use +EULER_{axis}, and for negative axis use -EULER_{axis}.
+ */
+enum EulerAxis {
+  EULER_X = 1, /*!< the X axis */
+  EULER_Y = 2, /*!< the Y axis */
+  EULER_Z = 3  /*!< the Z axis */
+};
+
+/** \class EulerSystem
+ *
+ * \ingroup EulerAngles_Module
+ *
+ * \brief Represents a fixed Euler rotation system.
+ *
+ * This meta-class goal is to represent the Euler system in compilation time, for EulerAngles.
+ *
+ * You can use this class to get two things:
+ *  - Build an Euler system, and then pass it as a template parameter to EulerAngles.
+ *  - Query some compile time data about an Euler system. (e.g. Whether it's Tait-Bryan)
+ *
+ * Euler rotation is a set of three rotation on fixed axes. (see \ref EulerAngles)
+ * This meta-class store constantly those signed axes. (see \ref EulerAxis)
+ *
+ * ### Types of Euler systems ###
+ *
+ * All and only valid 3 dimension Euler rotation over standard
+ *  signed axes{+X,+Y,+Z,-X,-Y,-Z} are supported:
+ *  - all axes X, Y, Z in each valid order (see below what order is valid)
+ *  - rotation over the axis is supported both over the positive and negative directions.
+ *  - both Tait-Bryan and proper/classic Euler angles (i.e. the opposite).
+ *
+ * Since EulerSystem support both positive and negative directions,
+ *  you may call this rotation distinction in other names:
+ *  - _right handed_ or _left handed_
+ *  - _counterclockwise_ or _clockwise_
+ *
+ * Notice all axed combination are valid, and would trigger a static assertion.
+ * Same unsigned axes can't be neighbors, e.g. {X,X,Y} is invalid.
+ * This yield two and only two classes:
+ *  - _Tait-Bryan_ - all unsigned axes are distinct, e.g. {X,Y,Z}
+ *  - _proper/classic Euler angles_ - The first and the third unsigned axes is equal,
+ *     and the second is different, e.g. {X,Y,X}
+ *
+ * ### Intrinsic vs extrinsic Euler systems ###
+ *
+ * Only intrinsic Euler systems are supported for simplicity.
+ *  If you want to use extrinsic Euler systems,
+ *   just use the equal intrinsic opposite order for axes and angles.
+ *  I.e axes (A,B,C) becomes (C,B,A), and angles (a,b,c) becomes (c,b,a).
+ *
+ * ### Convenient user typedefs ###
+ *
+ * Convenient typedefs for EulerSystem exist (only for positive axes Euler systems),
+ *  in a form of EulerSystem{A}{B}{C}, e.g. \ref EulerSystemXYZ.
+ *
+ * ### Additional reading ###
+ *
+ * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles
+ *
+ * \tparam _AlphaAxis the first fixed EulerAxis
+ *
+ * \tparam _BetaAxis the second fixed EulerAxis
+ *
+ * \tparam _GammaAxis the third fixed EulerAxis
+ */
+template <int _AlphaAxis, int _BetaAxis, int _GammaAxis>
+class EulerSystem {
+ public:
+  // It's defined this way and not as enum, because I think
+  //  that enum is not guerantee to support negative numbers
+
+  /** The first rotation axis */
+  static constexpr int AlphaAxis = _AlphaAxis;
+
+  /** The second rotation axis */
+  static constexpr int BetaAxis = _BetaAxis;
+
+  /** The third rotation axis */
+  static constexpr int GammaAxis = _GammaAxis;
+
+  enum {
+    AlphaAxisAbs = internal::Abs<AlphaAxis>::value, /*!< the first rotation axis unsigned */
+    BetaAxisAbs = internal::Abs<BetaAxis>::value,   /*!< the second rotation axis unsigned */
+    GammaAxisAbs = internal::Abs<GammaAxis>::value, /*!< the third rotation axis unsigned */
+
+    IsAlphaOpposite = (AlphaAxis < 0) ? 1 : 0, /*!< whether alpha axis is negative */
+    IsBetaOpposite = (BetaAxis < 0) ? 1 : 0,   /*!< whether beta axis is negative */
+    IsGammaOpposite = (GammaAxis < 0) ? 1 : 0, /*!< whether gamma axis is negative */
+
+    // Parity is even if alpha axis X is followed by beta axis Y, or Y is followed
+    // by Z, or Z is followed by X; otherwise it is odd.
+    IsOdd = ((AlphaAxisAbs) % 3 == (BetaAxisAbs - 1) % 3) ? 0 : 1, /*!< whether the Euler system is odd */
+    IsEven = IsOdd ? 0 : 1,                                        /*!< whether the Euler system is even */
+
+    IsTaitBryan =
+        ((unsigned)AlphaAxisAbs != (unsigned)GammaAxisAbs) ? 1 : 0 /*!< whether the Euler system is Tait-Bryan */
+  };
+
+ private:
+  EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<AlphaAxis>::value, ALPHA_AXIS_IS_INVALID);
+
+  EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<BetaAxis>::value, BETA_AXIS_IS_INVALID);
+
+  EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<GammaAxis>::value, GAMMA_AXIS_IS_INVALID);
+
+  EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)AlphaAxisAbs != (unsigned)BetaAxisAbs,
+                                         ALPHA_AXIS_CANT_BE_EQUAL_TO_BETA_AXIS);
+
+  EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)BetaAxisAbs != (unsigned)GammaAxisAbs,
+                                         BETA_AXIS_CANT_BE_EQUAL_TO_GAMMA_AXIS);
+
+  static const int
+      // I, J, K are the pivot indexes permutation for the rotation matrix, that match this Euler system.
+      // They are used in this class converters.
+      // They are always different from each other, and their possible values are: 0, 1, or 2.
+      I_ = AlphaAxisAbs - 1,
+      J_ = (AlphaAxisAbs - 1 + 1 + IsOdd) % 3, K_ = (AlphaAxisAbs - 1 + 2 - IsOdd) % 3;
+
+  // TODO: Get @mat parameter in form that avoids double evaluation.
+  template <typename Derived>
+  static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar, 3, 1>& res,
+                                  const MatrixBase<Derived>& mat, internal::true_type /*isTaitBryan*/) {
+    using std::atan2;
+    using std::sqrt;
+
+    typedef typename Derived::Scalar Scalar;
+
+    const Scalar plusMinus = IsEven ? 1 : -1;
+    const Scalar minusPlus = IsOdd ? 1 : -1;
+
+    const Scalar Rsum = sqrt((mat(I_, I_) * mat(I_, I_) + mat(I_, J_) * mat(I_, J_) + mat(J_, K_) * mat(J_, K_) +
+                              mat(K_, K_) * mat(K_, K_)) /
+                             2);
+    res[1] = atan2(plusMinus * mat(I_, K_), Rsum);
+
+    // There is a singularity when cos(beta) == 0
+    if (Rsum > 4 * NumTraits<Scalar>::epsilon()) {  // cos(beta) != 0
+      res[0] = atan2(minusPlus * mat(J_, K_), mat(K_, K_));
+      res[2] = atan2(minusPlus * mat(I_, J_), mat(I_, I_));
+    } else if (plusMinus * mat(I_, K_) > 0) {               // cos(beta) == 0 and sin(beta) == 1
+      Scalar spos = mat(J_, I_) + plusMinus * mat(K_, J_);  // 2*sin(alpha + plusMinus * gamma
+      Scalar cpos = mat(J_, J_) + minusPlus * mat(K_, I_);  // 2*cos(alpha + plusMinus * gamma)
+      Scalar alphaPlusMinusGamma = atan2(spos, cpos);
+      res[0] = alphaPlusMinusGamma;
+      res[2] = 0;
+    } else {                                                              // cos(beta) == 0 and sin(beta) == -1
+      Scalar sneg = plusMinus * (mat(K_, J_) + minusPlus * mat(J_, I_));  // 2*sin(alpha + minusPlus*gamma)
+      Scalar cneg = mat(J_, J_) + plusMinus * mat(K_, I_);                // 2*cos(alpha + minusPlus*gamma)
+      Scalar alphaMinusPlusBeta = atan2(sneg, cneg);
+      res[0] = alphaMinusPlusBeta;
+      res[2] = 0;
+    }
+  }
+
+  template <typename Derived>
+  static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar, 3, 1>& res,
+                                  const MatrixBase<Derived>& mat, internal::false_type /*isTaitBryan*/) {
+    using std::atan2;
+    using std::sqrt;
+
+    typedef typename Derived::Scalar Scalar;
+
+    const Scalar plusMinus = IsEven ? 1 : -1;
+    const Scalar minusPlus = IsOdd ? 1 : -1;
+
+    const Scalar Rsum = sqrt((mat(I_, J_) * mat(I_, J_) + mat(I_, K_) * mat(I_, K_) + mat(J_, I_) * mat(J_, I_) +
+                              mat(K_, I_) * mat(K_, I_)) /
+                             2);
+
+    res[1] = atan2(Rsum, mat(I_, I_));
+
+    // There is a singularity when sin(beta) == 0
+    if (Rsum > 4 * NumTraits<Scalar>::epsilon()) {  // sin(beta) != 0
+      res[0] = atan2(mat(J_, I_), minusPlus * mat(K_, I_));
+      res[2] = atan2(mat(I_, J_), plusMinus * mat(I_, K_));
+    } else if (mat(I_, I_) > 0) {                                       // sin(beta) == 0 and cos(beta) == 1
+      Scalar spos = plusMinus * mat(K_, J_) + minusPlus * mat(J_, K_);  // 2*sin(alpha + gamma)
+      Scalar cpos = mat(J_, J_) + mat(K_, K_);                          // 2*cos(alpha + gamma)
+      res[0] = atan2(spos, cpos);
+      res[2] = 0;
+    } else {                                                            // sin(beta) == 0 and cos(beta) == -1
+      Scalar sneg = plusMinus * mat(K_, J_) + plusMinus * mat(J_, K_);  // 2*sin(alpha - gamma)
+      Scalar cneg = mat(J_, J_) - mat(K_, K_);                          // 2*cos(alpha - gamma)
+      res[0] = atan2(sneg, cneg);
+      res[2] = 0;
+    }
+  }
+
+  template <typename Scalar>
+  static void CalcEulerAngles(EulerAngles<Scalar, EulerSystem>& res,
+                              const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat) {
+    CalcEulerAngles_imp(res.angles(), mat,
+                        std::conditional_t<IsTaitBryan, internal::true_type, internal::false_type>());
+
+    if (IsAlphaOpposite) res.alpha() = -res.alpha();
+
+    if (IsBetaOpposite) res.beta() = -res.beta();
+
+    if (IsGammaOpposite) res.gamma() = -res.gamma();
+  }
+
+  template <typename Scalar_, class _System>
+  friend class Eigen::EulerAngles;
+
+  template <typename System, typename Other, int OtherRows, int OtherCols>
+  friend struct internal::eulerangles_assign_impl;
+};
+
+#define EIGEN_EULER_SYSTEM_TYPEDEF(A, B, C) typedef EulerSystem<EULER_##A, EULER_##B, EULER_##C> EulerSystem##A##B##C;
+
+/** Default XYZ Euler coordinate system.
+ * \ingroup EulerAngles_Module
+ */
+EIGEN_EULER_SYSTEM_TYPEDEF(X, Y, Z)
+EIGEN_EULER_SYSTEM_TYPEDEF(X, Y, X)
+EIGEN_EULER_SYSTEM_TYPEDEF(X, Z, Y)
+EIGEN_EULER_SYSTEM_TYPEDEF(X, Z, X)
+
+EIGEN_EULER_SYSTEM_TYPEDEF(Y, Z, X)
+EIGEN_EULER_SYSTEM_TYPEDEF(Y, Z, Y)
+EIGEN_EULER_SYSTEM_TYPEDEF(Y, X, Z)
+EIGEN_EULER_SYSTEM_TYPEDEF(Y, X, Y)
+
+EIGEN_EULER_SYSTEM_TYPEDEF(Z, X, Y)
+EIGEN_EULER_SYSTEM_TYPEDEF(Z, X, Z)
+EIGEN_EULER_SYSTEM_TYPEDEF(Z, Y, X)
+EIGEN_EULER_SYSTEM_TYPEDEF(Z, Y, Z)
+}  // namespace Eigen
+
+#endif  // EIGEN_EULERSYSTEM_H
diff --git a/inst/include/unsupported/Eigen/src/EulerAngles/InternalHeaderCheck.h b/inst/include/unsupported/Eigen/src/EulerAngles/InternalHeaderCheck.h
new file mode 100644
index 00000000..0c00a306
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/EulerAngles/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_EULERANGLES_MODULE_H
+#error "Please include unsupported/Eigen/EulerAngles instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/unsupported/Eigen/src/FFT/InternalHeaderCheck.h b/inst/include/unsupported/Eigen/src/FFT/InternalHeaderCheck.h
new file mode 100644
index 00000000..801e2454
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/FFT/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_FFT_MODULE_H
+#error "Please include unsupported/Eigen/FFT instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/unsupported/Eigen/src/FFT/duccfft_impl.h b/inst/include/unsupported/Eigen/src/FFT/duccfft_impl.h
new file mode 100644
index 00000000..781716dd
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/FFT/duccfft_impl.h
@@ -0,0 +1,71 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename _Scalar>
+struct duccfft_impl {
+  using Scalar = _Scalar;
+  using Complex = std::complex<Scalar>;
+  using shape_t = ducc0::fmav_info::shape_t;
+  using stride_t = ducc0::fmav_info::stride_t;
+
+  inline void clear() {}
+
+  inline void fwd(Complex* dst, const Scalar* src, int nfft) {
+    const shape_t axes{0};
+    ducc0::cfmav<Scalar> m_in(src, shape_t{static_cast<size_t>(nfft)});
+    ducc0::vfmav<Complex> m_out(dst, shape_t{static_cast<size_t>(nfft) / 2 + 1});
+    ducc0::r2c(m_in, m_out, axes, /*forward=*/true, /*scale=*/static_cast<Scalar>(1));
+  }
+
+  inline void fwd(Complex* dst, const Complex* src, int nfft) {
+    const shape_t axes{0};
+    ducc0::cfmav<Complex> m_in(src, shape_t{static_cast<size_t>(nfft)});
+    ducc0::vfmav<Complex> m_out(dst, shape_t{static_cast<size_t>(nfft)});
+    ducc0::c2c(m_in, m_out, axes, /*forward=*/true, /*scale=*/static_cast<Scalar>(1));
+  }
+
+  inline void inv(Scalar* dst, const Complex* src, int nfft) {
+    const shape_t axes{0};
+    ducc0::cfmav<Complex> m_in(src, shape_t{static_cast<size_t>(nfft) / 2 + 1});
+    ducc0::vfmav<Scalar> m_out(dst, shape_t{static_cast<size_t>(nfft)});
+    ducc0::c2r(m_in, m_out, axes, /*forward=*/false, /*scale=*/static_cast<Scalar>(1));
+  }
+
+  inline void inv(Complex* dst, const Complex* src, int nfft) {
+    const shape_t axes{0};
+    ducc0::cfmav<Complex> m_in(src, shape_t{static_cast<size_t>(nfft)});
+    ducc0::vfmav<Complex> m_out(dst, shape_t{static_cast<size_t>(nfft)});
+    ducc0::c2c(m_in, m_out, axes, /*forward=*/false, /*scale=*/static_cast<Scalar>(1));
+  }
+
+  inline void fwd2(Complex* dst, const Complex* src, int nfft0, int nfft1) {
+    const shape_t axes{0, 1};
+    const shape_t in_shape{static_cast<size_t>(nfft0), static_cast<size_t>(nfft1)};
+    const shape_t out_shape{static_cast<size_t>(nfft0), static_cast<size_t>(nfft1)};
+    const stride_t stride{static_cast<ptrdiff_t>(nfft1), static_cast<ptrdiff_t>(1)};
+    ducc0::cfmav<Complex> m_in(src, in_shape, stride);
+    ducc0::vfmav<Complex> m_out(dst, out_shape, stride);
+    ducc0::c2c(m_in, m_out, axes, /*forward=*/true, /*scale=*/static_cast<Scalar>(1));
+  }
+
+  inline void inv2(Complex* dst, const Complex* src, int nfft0, int nfft1) {
+    const shape_t axes{0, 1};
+    const shape_t in_shape{static_cast<size_t>(nfft0), static_cast<size_t>(nfft1)};
+    const shape_t out_shape{static_cast<size_t>(nfft0), static_cast<size_t>(nfft1)};
+    const stride_t stride{static_cast<ptrdiff_t>(nfft1), static_cast<ptrdiff_t>(1)};
+    ducc0::cfmav<Complex> m_in(src, in_shape, stride);
+    ducc0::vfmav<Complex> m_out(dst, out_shape, stride);
+    ducc0::c2c(m_in, m_out, axes, /*forward=*/false, /*scale=*/static_cast<Scalar>(1));
+  }
+};
+
+}  // namespace internal
+}  // namespace Eigen
diff --git a/inst/include/unsupported/Eigen/src/FFT/ei_fftw_impl.h b/inst/include/unsupported/Eigen/src/FFT/ei_fftw_impl.h
deleted file mode 100644
index d49aa17f..00000000
--- a/inst/include/unsupported/Eigen/src/FFT/ei_fftw_impl.h
+++ /dev/null
@@ -1,261 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra. 
-//
-// Copyright (C) 2009 Mark Borgerding mark a borgerding net
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-namespace Eigen { 
-
-namespace internal {
-
-  // FFTW uses non-const arguments
-  // so we must use ugly const_cast calls for all the args it uses
-  //
-  // This should be safe as long as 
-  // 1. we use FFTW_ESTIMATE for all our planning
-  //       see the FFTW docs section 4.3.2 "Planner Flags"
-  // 2. fftw_complex is compatible with std::complex
-  //    This assumes std::complex<T> layout is array of size 2 with real,imag
-  template <typename T> 
-  inline 
-  T * fftw_cast(const T* p)
-  { 
-      return const_cast<T*>( p); 
-  }
-
-  inline 
-  fftw_complex * fftw_cast( const std::complex<double> * p)
-  {
-      return const_cast<fftw_complex*>( reinterpret_cast<const fftw_complex*>(p) ); 
-  }
-
-  inline 
-  fftwf_complex * fftw_cast( const std::complex<float> * p)
-  { 
-      return const_cast<fftwf_complex*>( reinterpret_cast<const fftwf_complex*>(p) ); 
-  }
-
-  inline 
-  fftwl_complex * fftw_cast( const std::complex<long double> * p)
-  { 
-      return const_cast<fftwl_complex*>( reinterpret_cast<const fftwl_complex*>(p) ); 
-  }
-
-  template <typename T> 
-  struct fftw_plan {};
-
-  template <> 
-  struct fftw_plan<float>
-  {
-      typedef float scalar_type;
-      typedef fftwf_complex complex_type;
-      fftwf_plan m_plan;
-      fftw_plan() :m_plan(NULL) {}
-      ~fftw_plan() {if (m_plan) fftwf_destroy_plan(m_plan);}
-
-      inline
-      void fwd(complex_type * dst,complex_type * src,int nfft) {
-          if (m_plan==NULL) m_plan = fftwf_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
-          fftwf_execute_dft( m_plan, src,dst);
-      }
-      inline
-      void inv(complex_type * dst,complex_type * src,int nfft) {
-          if (m_plan==NULL) m_plan = fftwf_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
-          fftwf_execute_dft( m_plan, src,dst);
-      }
-      inline
-      void fwd(complex_type * dst,scalar_type * src,int nfft) {
-          if (m_plan==NULL) m_plan = fftwf_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
-          fftwf_execute_dft_r2c( m_plan,src,dst);
-      }
-      inline
-      void inv(scalar_type * dst,complex_type * src,int nfft) {
-          if (m_plan==NULL)
-              m_plan = fftwf_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
-          fftwf_execute_dft_c2r( m_plan, src,dst);
-      }
-
-      inline 
-      void fwd2( complex_type * dst,complex_type * src,int n0,int n1) {
-          if (m_plan==NULL) m_plan = fftwf_plan_dft_2d(n0,n1,src,dst,FFTW_FORWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
-          fftwf_execute_dft( m_plan, src,dst);
-      }
-      inline 
-      void inv2( complex_type * dst,complex_type * src,int n0,int n1) {
-          if (m_plan==NULL) m_plan = fftwf_plan_dft_2d(n0,n1,src,dst,FFTW_BACKWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
-          fftwf_execute_dft( m_plan, src,dst);
-      }
-
-  };
-  template <> 
-  struct fftw_plan<double>
-  {
-      typedef double scalar_type;
-      typedef fftw_complex complex_type;
-      ::fftw_plan m_plan;
-      fftw_plan() :m_plan(NULL) {}
-      ~fftw_plan() {if (m_plan) fftw_destroy_plan(m_plan);}
-
-      inline
-      void fwd(complex_type * dst,complex_type * src,int nfft) {
-          if (m_plan==NULL) m_plan = fftw_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
-          fftw_execute_dft( m_plan, src,dst);
-      }
-      inline
-      void inv(complex_type * dst,complex_type * src,int nfft) {
-          if (m_plan==NULL) m_plan = fftw_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
-          fftw_execute_dft( m_plan, src,dst);
-      }
-      inline
-      void fwd(complex_type * dst,scalar_type * src,int nfft) {
-          if (m_plan==NULL) m_plan = fftw_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
-          fftw_execute_dft_r2c( m_plan,src,dst);
-      }
-      inline
-      void inv(scalar_type * dst,complex_type * src,int nfft) {
-          if (m_plan==NULL)
-              m_plan = fftw_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
-          fftw_execute_dft_c2r( m_plan, src,dst);
-      }
-      inline 
-      void fwd2( complex_type * dst,complex_type * src,int n0,int n1) {
-          if (m_plan==NULL) m_plan = fftw_plan_dft_2d(n0,n1,src,dst,FFTW_FORWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
-          fftw_execute_dft( m_plan, src,dst);
-      }
-      inline 
-      void inv2( complex_type * dst,complex_type * src,int n0,int n1) {
-          if (m_plan==NULL) m_plan = fftw_plan_dft_2d(n0,n1,src,dst,FFTW_BACKWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
-          fftw_execute_dft( m_plan, src,dst);
-      }
-  };
-  template <> 
-  struct fftw_plan<long double>
-  {
-      typedef long double scalar_type;
-      typedef fftwl_complex complex_type;
-      fftwl_plan m_plan;
-      fftw_plan() :m_plan(NULL) {}
-      ~fftw_plan() {if (m_plan) fftwl_destroy_plan(m_plan);}
-
-      inline
-      void fwd(complex_type * dst,complex_type * src,int nfft) {
-          if (m_plan==NULL) m_plan = fftwl_plan_dft_1d(nfft,src,dst, FFTW_FORWARD, FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
-          fftwl_execute_dft( m_plan, src,dst);
-      }
-      inline
-      void inv(complex_type * dst,complex_type * src,int nfft) {
-          if (m_plan==NULL) m_plan = fftwl_plan_dft_1d(nfft,src,dst, FFTW_BACKWARD , FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
-          fftwl_execute_dft( m_plan, src,dst);
-      }
-      inline
-      void fwd(complex_type * dst,scalar_type * src,int nfft) {
-          if (m_plan==NULL) m_plan = fftwl_plan_dft_r2c_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
-          fftwl_execute_dft_r2c( m_plan,src,dst);
-      }
-      inline
-      void inv(scalar_type * dst,complex_type * src,int nfft) {
-          if (m_plan==NULL)
-              m_plan = fftwl_plan_dft_c2r_1d(nfft,src,dst,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
-          fftwl_execute_dft_c2r( m_plan, src,dst);
-      }
-      inline 
-      void fwd2( complex_type * dst,complex_type * src,int n0,int n1) {
-          if (m_plan==NULL) m_plan = fftwl_plan_dft_2d(n0,n1,src,dst,FFTW_FORWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
-          fftwl_execute_dft( m_plan, src,dst);
-      }
-      inline 
-      void inv2( complex_type * dst,complex_type * src,int n0,int n1) {
-          if (m_plan==NULL) m_plan = fftwl_plan_dft_2d(n0,n1,src,dst,FFTW_BACKWARD,FFTW_ESTIMATE|FFTW_PRESERVE_INPUT);
-          fftwl_execute_dft( m_plan, src,dst);
-      }
-  };
-
-  template <typename _Scalar>
-  struct fftw_impl
-  {
-      typedef _Scalar Scalar;
-      typedef std::complex<Scalar> Complex;
-
-      inline
-      void clear() 
-      {
-        m_plans.clear();
-      }
-
-      // complex-to-complex forward FFT
-      inline
-      void fwd( Complex * dst,const Complex *src,int nfft)
-      {
-        get_plan(nfft,false,dst,src).fwd(fftw_cast(dst), fftw_cast(src),nfft );
-      }
-
-      // real-to-complex forward FFT
-      inline
-      void fwd( Complex * dst,const Scalar * src,int nfft) 
-      {
-          get_plan(nfft,false,dst,src).fwd(fftw_cast(dst), fftw_cast(src) ,nfft);
-      }
-
-      // 2-d complex-to-complex
-      inline
-      void fwd2(Complex * dst, const Complex * src, int n0,int n1)
-      {
-          get_plan(n0,n1,false,dst,src).fwd2(fftw_cast(dst), fftw_cast(src) ,n0,n1);
-      }
-
-      // inverse complex-to-complex
-      inline
-      void inv(Complex * dst,const Complex  *src,int nfft)
-      {
-        get_plan(nfft,true,dst,src).inv(fftw_cast(dst), fftw_cast(src),nfft );
-      }
-
-      // half-complex to scalar
-      inline
-      void inv( Scalar * dst,const Complex * src,int nfft) 
-      {
-        get_plan(nfft,true,dst,src).inv(fftw_cast(dst), fftw_cast(src),nfft );
-      }
-
-      // 2-d complex-to-complex
-      inline
-      void inv2(Complex * dst, const Complex * src, int n0,int n1)
-      {
-        get_plan(n0,n1,true,dst,src).inv2(fftw_cast(dst), fftw_cast(src) ,n0,n1);
-      }
-
-
-  protected:
-      typedef fftw_plan<Scalar> PlanData;
-
-      typedef std::map<int64_t,PlanData> PlanMap;
-
-      PlanMap m_plans;
-
-      inline
-      PlanData & get_plan(int nfft,bool inverse,void * dst,const void * src)
-      {
-          bool inplace = (dst==src);
-          bool aligned = ( (reinterpret_cast<size_t>(src)&15) | (reinterpret_cast<size_t>(dst)&15) ) == 0;
-          int64_t key = ( (nfft<<3 ) | (inverse<<2) | (inplace<<1) | aligned ) << 1;
-          return m_plans[key];
-      }
-
-      inline
-      PlanData & get_plan(int n0,int n1,bool inverse,void * dst,const void * src)
-      {
-          bool inplace = (dst==src);
-          bool aligned = ( (reinterpret_cast<size_t>(src)&15) | (reinterpret_cast<size_t>(dst)&15) ) == 0;
-          int64_t key = ( ( (((int64_t)n0) << 30)|(n1<<3 ) | (inverse<<2) | (inplace<<1) | aligned ) << 1 ) + 1;
-          return m_plans[key];
-      }
-  };
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/inst/include/unsupported/Eigen/src/FFT/ei_kissfft_impl.h b/inst/include/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
deleted file mode 100644
index be51b4e6..00000000
--- a/inst/include/unsupported/Eigen/src/FFT/ei_kissfft_impl.h
+++ /dev/null
@@ -1,420 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009 Mark Borgerding mark a borgerding net
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-namespace Eigen { 
-
-namespace internal {
-
-  // This FFT implementation was derived from kissfft http:sourceforge.net/projects/kissfft
-  // Copyright 2003-2009 Mark Borgerding
-
-template <typename _Scalar>
-struct kiss_cpx_fft
-{
-  typedef _Scalar Scalar;
-  typedef std::complex<Scalar> Complex;
-  std::vector<Complex> m_twiddles;
-  std::vector<int> m_stageRadix;
-  std::vector<int> m_stageRemainder;
-  std::vector<Complex> m_scratchBuf;
-  bool m_inverse;
-
-  inline
-    void make_twiddles(int nfft,bool inverse)
-    {
-      using std::acos;
-      m_inverse = inverse;
-      m_twiddles.resize(nfft);
-      Scalar phinc =  (inverse?2:-2)* acos( (Scalar) -1)  / nfft;
-      for (int i=0;i<nfft;++i)
-        m_twiddles[i] = exp( Complex(0,i*phinc) );
-    }
-
-  void factorize(int nfft)
-  {
-    //start factoring out 4's, then 2's, then 3,5,7,9,...
-    int n= nfft;
-    int p=4;
-    do {
-      while (n % p) {
-        switch (p) {
-          case 4: p = 2; break;
-          case 2: p = 3; break;
-          default: p += 2; break;
-        }
-        if (p*p>n)
-          p=n;// impossible to have a factor > sqrt(n)
-      }
-      n /= p;
-      m_stageRadix.push_back(p);
-      m_stageRemainder.push_back(n);
-      if ( p > 5 )
-        m_scratchBuf.resize(p); // scratchbuf will be needed in bfly_generic
-    }while(n>1);
-  }
-
-  template <typename _Src>
-    inline
-    void work( int stage,Complex * xout, const _Src * xin, size_t fstride,size_t in_stride)
-    {
-      int p = m_stageRadix[stage];
-      int m = m_stageRemainder[stage];
-      Complex * Fout_beg = xout;
-      Complex * Fout_end = xout + p*m;
-
-      if (m>1) {
-        do{
-          // recursive call:
-          // DFT of size m*p performed by doing
-          // p instances of smaller DFTs of size m, 
-          // each one takes a decimated version of the input
-          work(stage+1, xout , xin, fstride*p,in_stride);
-          xin += fstride*in_stride;
-        }while( (xout += m) != Fout_end );
-      }else{
-        do{
-          *xout = *xin;
-          xin += fstride*in_stride;
-        }while(++xout != Fout_end );
-      }
-      xout=Fout_beg;
-
-      // recombine the p smaller DFTs 
-      switch (p) {
-        case 2: bfly2(xout,fstride,m); break;
-        case 3: bfly3(xout,fstride,m); break;
-        case 4: bfly4(xout,fstride,m); break;
-        case 5: bfly5(xout,fstride,m); break;
-        default: bfly_generic(xout,fstride,m,p); break;
-      }
-    }
-
-  inline
-    void bfly2( Complex * Fout, const size_t fstride, int m)
-    {
-      for (int k=0;k<m;++k) {
-        Complex t = Fout[m+k] * m_twiddles[k*fstride];
-        Fout[m+k] = Fout[k] - t;
-        Fout[k] += t;
-      }
-    }
-
-  inline
-    void bfly4( Complex * Fout, const size_t fstride, const size_t m)
-    {
-      Complex scratch[6];
-      int negative_if_inverse = m_inverse * -2 +1;
-      for (size_t k=0;k<m;++k) {
-        scratch[0] = Fout[k+m] * m_twiddles[k*fstride];
-        scratch[1] = Fout[k+2*m] * m_twiddles[k*fstride*2];
-        scratch[2] = Fout[k+3*m] * m_twiddles[k*fstride*3];
-        scratch[5] = Fout[k] - scratch[1];
-
-        Fout[k] += scratch[1];
-        scratch[3] = scratch[0] + scratch[2];
-        scratch[4] = scratch[0] - scratch[2];
-        scratch[4] = Complex( scratch[4].imag()*negative_if_inverse , -scratch[4].real()* negative_if_inverse );
-
-        Fout[k+2*m]  = Fout[k] - scratch[3];
-        Fout[k] += scratch[3];
-        Fout[k+m] = scratch[5] + scratch[4];
-        Fout[k+3*m] = scratch[5] - scratch[4];
-      }
-    }
-
-  inline
-    void bfly3( Complex * Fout, const size_t fstride, const size_t m)
-    {
-      size_t k=m;
-      const size_t m2 = 2*m;
-      Complex *tw1,*tw2;
-      Complex scratch[5];
-      Complex epi3;
-      epi3 = m_twiddles[fstride*m];
-
-      tw1=tw2=&m_twiddles[0];
-
-      do{
-        scratch[1]=Fout[m] * *tw1;
-        scratch[2]=Fout[m2] * *tw2;
-
-        scratch[3]=scratch[1]+scratch[2];
-        scratch[0]=scratch[1]-scratch[2];
-        tw1 += fstride;
-        tw2 += fstride*2;
-        Fout[m] = Complex( Fout->real() - Scalar(.5)*scratch[3].real() , Fout->imag() - Scalar(.5)*scratch[3].imag() );
-        scratch[0] *= epi3.imag();
-        *Fout += scratch[3];
-        Fout[m2] = Complex(  Fout[m].real() + scratch[0].imag() , Fout[m].imag() - scratch[0].real() );
-        Fout[m] += Complex( -scratch[0].imag(),scratch[0].real() );
-        ++Fout;
-      }while(--k);
-    }
-
-  inline
-    void bfly5( Complex * Fout, const size_t fstride, const size_t m)
-    {
-      Complex *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
-      size_t u;
-      Complex scratch[13];
-      Complex * twiddles = &m_twiddles[0];
-      Complex *tw;
-      Complex ya,yb;
-      ya = twiddles[fstride*m];
-      yb = twiddles[fstride*2*m];
-
-      Fout0=Fout;
-      Fout1=Fout0+m;
-      Fout2=Fout0+2*m;
-      Fout3=Fout0+3*m;
-      Fout4=Fout0+4*m;
-
-      tw=twiddles;
-      for ( u=0; u<m; ++u ) {
-        scratch[0] = *Fout0;
-
-        scratch[1]  = *Fout1 * tw[u*fstride];
-        scratch[2]  = *Fout2 * tw[2*u*fstride];
-        scratch[3]  = *Fout3 * tw[3*u*fstride];
-        scratch[4]  = *Fout4 * tw[4*u*fstride];
-
-        scratch[7] = scratch[1] + scratch[4];
-        scratch[10] = scratch[1] - scratch[4];
-        scratch[8] = scratch[2] + scratch[3];
-        scratch[9] = scratch[2] - scratch[3];
-
-        *Fout0 +=  scratch[7];
-        *Fout0 +=  scratch[8];
-
-        scratch[5] = scratch[0] + Complex(
-            (scratch[7].real()*ya.real() ) + (scratch[8].real() *yb.real() ),
-            (scratch[7].imag()*ya.real()) + (scratch[8].imag()*yb.real())
-            );
-
-        scratch[6] = Complex(
-            (scratch[10].imag()*ya.imag()) + (scratch[9].imag()*yb.imag()),
-            -(scratch[10].real()*ya.imag()) - (scratch[9].real()*yb.imag())
-            );
-
-        *Fout1 = scratch[5] - scratch[6];
-        *Fout4 = scratch[5] + scratch[6];
-
-        scratch[11] = scratch[0] +
-          Complex(
-              (scratch[7].real()*yb.real()) + (scratch[8].real()*ya.real()),
-              (scratch[7].imag()*yb.real()) + (scratch[8].imag()*ya.real())
-              );
-
-        scratch[12] = Complex(
-            -(scratch[10].imag()*yb.imag()) + (scratch[9].imag()*ya.imag()),
-            (scratch[10].real()*yb.imag()) - (scratch[9].real()*ya.imag())
-            );
-
-        *Fout2=scratch[11]+scratch[12];
-        *Fout3=scratch[11]-scratch[12];
-
-        ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
-      }
-    }
-
-  /* perform the butterfly for one stage of a mixed radix FFT */
-  inline
-    void bfly_generic(
-        Complex * Fout,
-        const size_t fstride,
-        int m,
-        int p
-        )
-    {
-      int u,k,q1,q;
-      Complex * twiddles = &m_twiddles[0];
-      Complex t;
-      int Norig = static_cast<int>(m_twiddles.size());
-      Complex * scratchbuf = &m_scratchBuf[0];
-
-      for ( u=0; u<m; ++u ) {
-        k=u;
-        for ( q1=0 ; q1<p ; ++q1 ) {
-          scratchbuf[q1] = Fout[ k  ];
-          k += m;
-        }
-
-        k=u;
-        for ( q1=0 ; q1<p ; ++q1 ) {
-          int twidx=0;
-          Fout[ k ] = scratchbuf[0];
-          for (q=1;q<p;++q ) {
-            twidx += static_cast<int>(fstride) * k;
-            if (twidx>=Norig) twidx-=Norig;
-            t=scratchbuf[q] * twiddles[twidx];
-            Fout[ k ] += t;
-          }
-          k += m;
-        }
-      }
-    }
-};
-
-template <typename _Scalar>
-struct kissfft_impl
-{
-  typedef _Scalar Scalar;
-  typedef std::complex<Scalar> Complex;
-
-  void clear() 
-  {
-    m_plans.clear();
-    m_realTwiddles.clear();
-  }
-
-  inline
-    void fwd( Complex * dst,const Complex *src,int nfft)
-    {
-      get_plan(nfft,false).work(0, dst, src, 1,1);
-    }
-
-  inline
-    void fwd2( Complex * dst,const Complex *src,int n0,int n1)
-    {
-        EIGEN_UNUSED_VARIABLE(dst);
-        EIGEN_UNUSED_VARIABLE(src);
-        EIGEN_UNUSED_VARIABLE(n0);
-        EIGEN_UNUSED_VARIABLE(n1);
-    }
-
-  inline
-    void inv2( Complex * dst,const Complex *src,int n0,int n1)
-    {
-        EIGEN_UNUSED_VARIABLE(dst);
-        EIGEN_UNUSED_VARIABLE(src);
-        EIGEN_UNUSED_VARIABLE(n0);
-        EIGEN_UNUSED_VARIABLE(n1);
-    }
-
-  // real-to-complex forward FFT
-  // perform two FFTs of src even and src odd
-  // then twiddle to recombine them into the half-spectrum format
-  // then fill in the conjugate symmetric half
-  inline
-    void fwd( Complex * dst,const Scalar * src,int nfft) 
-    {
-      if ( nfft&3  ) {
-        // use generic mode for odd
-        m_tmpBuf1.resize(nfft);
-        get_plan(nfft,false).work(0, &m_tmpBuf1[0], src, 1,1);
-        std::copy(m_tmpBuf1.begin(),m_tmpBuf1.begin()+(nfft>>1)+1,dst );
-      }else{
-        int ncfft = nfft>>1;
-        int ncfft2 = nfft>>2;
-        Complex * rtw = real_twiddles(ncfft2);
-
-        // use optimized mode for even real
-        fwd( dst, reinterpret_cast<const Complex*> (src), ncfft);
-        Complex dc = dst[0].real() +  dst[0].imag();
-        Complex nyquist = dst[0].real() -  dst[0].imag();
-        int k;
-        for ( k=1;k <= ncfft2 ; ++k ) {
-          Complex fpk = dst[k];
-          Complex fpnk = conj(dst[ncfft-k]);
-          Complex f1k = fpk + fpnk;
-          Complex f2k = fpk - fpnk;
-          Complex tw= f2k * rtw[k-1];
-          dst[k] =  (f1k + tw) * Scalar(.5);
-          dst[ncfft-k] =  conj(f1k -tw)*Scalar(.5);
-        }
-        dst[0] = dc;
-        dst[ncfft] = nyquist;
-      }
-    }
-
-  // inverse complex-to-complex
-  inline
-    void inv(Complex * dst,const Complex  *src,int nfft)
-    {
-      get_plan(nfft,true).work(0, dst, src, 1,1);
-    }
-
-  // half-complex to scalar
-  inline
-    void inv( Scalar * dst,const Complex * src,int nfft) 
-    {
-      if (nfft&3) {
-        m_tmpBuf1.resize(nfft);
-        m_tmpBuf2.resize(nfft);
-        std::copy(src,src+(nfft>>1)+1,m_tmpBuf1.begin() );
-        for (int k=1;k<(nfft>>1)+1;++k)
-          m_tmpBuf1[nfft-k] = conj(m_tmpBuf1[k]);
-        inv(&m_tmpBuf2[0],&m_tmpBuf1[0],nfft);
-        for (int k=0;k<nfft;++k)
-          dst[k] = m_tmpBuf2[k].real();
-      }else{
-        // optimized version for multiple of 4
-        int ncfft = nfft>>1;
-        int ncfft2 = nfft>>2;
-        Complex * rtw = real_twiddles(ncfft2);
-        m_tmpBuf1.resize(ncfft);
-        m_tmpBuf1[0] = Complex( src[0].real() + src[ncfft].real(), src[0].real() - src[ncfft].real() );
-        for (int k = 1; k <= ncfft / 2; ++k) {
-          Complex fk = src[k];
-          Complex fnkc = conj(src[ncfft-k]);
-          Complex fek = fk + fnkc;
-          Complex tmp = fk - fnkc;
-          Complex fok = tmp * conj(rtw[k-1]);
-          m_tmpBuf1[k] = fek + fok;
-          m_tmpBuf1[ncfft-k] = conj(fek - fok);
-        }
-        get_plan(ncfft,true).work(0, reinterpret_cast<Complex*>(dst), &m_tmpBuf1[0], 1,1);
-      }
-    }
-
-  protected:
-  typedef kiss_cpx_fft<Scalar> PlanData;
-  typedef std::map<int,PlanData> PlanMap;
-
-  PlanMap m_plans;
-  std::map<int, std::vector<Complex> > m_realTwiddles;
-  std::vector<Complex> m_tmpBuf1;
-  std::vector<Complex> m_tmpBuf2;
-
-  inline
-    int PlanKey(int nfft, bool isinverse) const { return (nfft<<1) | int(isinverse); }
-
-  inline
-    PlanData & get_plan(int nfft, bool inverse)
-    {
-      // TODO look for PlanKey(nfft, ! inverse) and conjugate the twiddles
-      PlanData & pd = m_plans[ PlanKey(nfft,inverse) ];
-      if ( pd.m_twiddles.size() == 0 ) {
-        pd.make_twiddles(nfft,inverse);
-        pd.factorize(nfft);
-      }
-      return pd;
-    }
-
-  inline
-    Complex * real_twiddles(int ncfft2)
-    {
-      using std::acos;
-      std::vector<Complex> & twidref = m_realTwiddles[ncfft2];// creates new if not there
-      if ( (int)twidref.size() != ncfft2 ) {
-        twidref.resize(ncfft2);
-        int ncfft= ncfft2<<1;
-        Scalar pi =  acos( Scalar(-1) );
-        for (int k=1;k<=ncfft2;++k) 
-          twidref[k-1] = exp( Complex(0,-pi * (Scalar(k) / ncfft + Scalar(.5)) ) );
-      }
-      return &twidref[0];
-    }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-/* vim: set filetype=cpp et sw=2 ts=2 ai: */
diff --git a/inst/include/unsupported/Eigen/src/FFT/fftw_impl.h b/inst/include/unsupported/Eigen/src/FFT/fftw_impl.h
new file mode 100644
index 00000000..0b9ad3da
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/FFT/fftw_impl.h
@@ -0,0 +1,216 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include <memory>
+
+namespace Eigen {
+
+namespace internal {
+
+// FFTW uses non-const arguments
+// so we must use ugly const_cast calls for all the args it uses
+//
+// This should be safe as long as
+// 1. we use FFTW_ESTIMATE for all our planning
+//       see the FFTW docs section 4.3.2 "Planner Flags"
+// 2. fftw_complex is compatible with std::complex
+//    This assumes std::complex<T> layout is array of size 2 with real,imag
+template <typename T>
+inline T *fftw_cast(const T *p) {
+  return const_cast<T *>(p);
+}
+
+inline fftw_complex *fftw_cast(const std::complex<double> *p) {
+  return const_cast<fftw_complex *>(reinterpret_cast<const fftw_complex *>(p));
+}
+
+inline fftwf_complex *fftw_cast(const std::complex<float> *p) {
+  return const_cast<fftwf_complex *>(reinterpret_cast<const fftwf_complex *>(p));
+}
+
+inline fftwl_complex *fftw_cast(const std::complex<long double> *p) {
+  return const_cast<fftwl_complex *>(reinterpret_cast<const fftwl_complex *>(p));
+}
+
+template <typename T>
+struct fftw_plan {};
+
+template <>
+struct fftw_plan<float> {
+  typedef float scalar_type;
+  typedef fftwf_complex complex_type;
+  std::shared_ptr<fftwf_plan_s> m_plan;
+  fftw_plan() = default;
+
+  void set_plan(fftwf_plan p) { m_plan.reset(p, fftwf_destroy_plan); }
+  inline void fwd(complex_type *dst, complex_type *src, int nfft) {
+    if (m_plan == NULL) set_plan(fftwf_plan_dft_1d(nfft, src, dst, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT));
+    fftwf_execute_dft(m_plan.get(), src, dst);
+  }
+  inline void inv(complex_type *dst, complex_type *src, int nfft) {
+    if (m_plan == NULL) set_plan(fftwf_plan_dft_1d(nfft, src, dst, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT));
+    fftwf_execute_dft(m_plan.get(), src, dst);
+  }
+  inline void fwd(complex_type *dst, scalar_type *src, int nfft) {
+    if (m_plan == NULL) set_plan(fftwf_plan_dft_r2c_1d(nfft, src, dst, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT));
+    fftwf_execute_dft_r2c(m_plan.get(), src, dst);
+  }
+  inline void inv(scalar_type *dst, complex_type *src, int nfft) {
+    if (m_plan == NULL) set_plan(fftwf_plan_dft_c2r_1d(nfft, src, dst, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT));
+    fftwf_execute_dft_c2r(m_plan.get(), src, dst);
+  }
+
+  inline void fwd2(complex_type *dst, complex_type *src, int n0, int n1) {
+    if (m_plan == NULL)
+      set_plan(fftwf_plan_dft_2d(n0, n1, src, dst, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT));
+    fftwf_execute_dft(m_plan.get(), src, dst);
+  }
+  inline void inv2(complex_type *dst, complex_type *src, int n0, int n1) {
+    if (m_plan == NULL)
+      set_plan(fftwf_plan_dft_2d(n0, n1, src, dst, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT));
+    fftwf_execute_dft(m_plan.get(), src, dst);
+  }
+};
+template <>
+struct fftw_plan<double> {
+  typedef double scalar_type;
+  typedef fftw_complex complex_type;
+  std::shared_ptr<fftw_plan_s> m_plan;
+  fftw_plan() = default;
+
+  void set_plan(::fftw_plan p) { m_plan.reset(p, fftw_destroy_plan); }
+  inline void fwd(complex_type *dst, complex_type *src, int nfft) {
+    if (m_plan == NULL) set_plan(fftw_plan_dft_1d(nfft, src, dst, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT));
+    fftw_execute_dft(m_plan.get(), src, dst);
+  }
+  inline void inv(complex_type *dst, complex_type *src, int nfft) {
+    if (m_plan == NULL) set_plan(fftw_plan_dft_1d(nfft, src, dst, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT));
+    fftw_execute_dft(m_plan.get(), src, dst);
+  }
+  inline void fwd(complex_type *dst, scalar_type *src, int nfft) {
+    if (m_plan == NULL) set_plan(fftw_plan_dft_r2c_1d(nfft, src, dst, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT));
+    fftw_execute_dft_r2c(m_plan.get(), src, dst);
+  }
+  inline void inv(scalar_type *dst, complex_type *src, int nfft) {
+    if (m_plan == NULL) set_plan(fftw_plan_dft_c2r_1d(nfft, src, dst, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT));
+    fftw_execute_dft_c2r(m_plan.get(), src, dst);
+  }
+  inline void fwd2(complex_type *dst, complex_type *src, int n0, int n1) {
+    if (m_plan == NULL) set_plan(fftw_plan_dft_2d(n0, n1, src, dst, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT));
+    fftw_execute_dft(m_plan.get(), src, dst);
+  }
+  inline void inv2(complex_type *dst, complex_type *src, int n0, int n1) {
+    if (m_plan == NULL)
+      set_plan(fftw_plan_dft_2d(n0, n1, src, dst, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT));
+    fftw_execute_dft(m_plan.get(), src, dst);
+  }
+};
+template <>
+struct fftw_plan<long double> {
+  typedef long double scalar_type;
+  typedef fftwl_complex complex_type;
+  std::shared_ptr<fftwl_plan_s> m_plan;
+  fftw_plan() = default;
+
+  void set_plan(fftwl_plan p) { m_plan.reset(p, fftwl_destroy_plan); }
+  inline void fwd(complex_type *dst, complex_type *src, int nfft) {
+    if (m_plan == NULL) set_plan(fftwl_plan_dft_1d(nfft, src, dst, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT));
+    fftwl_execute_dft(m_plan.get(), src, dst);
+  }
+  inline void inv(complex_type *dst, complex_type *src, int nfft) {
+    if (m_plan == NULL) set_plan(fftwl_plan_dft_1d(nfft, src, dst, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT));
+    fftwl_execute_dft(m_plan.get(), src, dst);
+  }
+  inline void fwd(complex_type *dst, scalar_type *src, int nfft) {
+    if (m_plan == NULL) set_plan(fftwl_plan_dft_r2c_1d(nfft, src, dst, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT));
+    fftwl_execute_dft_r2c(m_plan.get(), src, dst);
+  }
+  inline void inv(scalar_type *dst, complex_type *src, int nfft) {
+    if (m_plan == NULL) set_plan(fftwl_plan_dft_c2r_1d(nfft, src, dst, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT));
+    fftwl_execute_dft_c2r(m_plan.get(), src, dst);
+  }
+  inline void fwd2(complex_type *dst, complex_type *src, int n0, int n1) {
+    if (m_plan == NULL)
+      set_plan(fftwl_plan_dft_2d(n0, n1, src, dst, FFTW_FORWARD, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT));
+    fftwl_execute_dft(m_plan.get(), src, dst);
+  }
+  inline void inv2(complex_type *dst, complex_type *src, int n0, int n1) {
+    if (m_plan == NULL)
+      set_plan(fftwl_plan_dft_2d(n0, n1, src, dst, FFTW_BACKWARD, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT));
+    fftwl_execute_dft(m_plan.get(), src, dst);
+  }
+};
+
+template <typename Scalar_>
+struct fftw_impl {
+  typedef Scalar_ Scalar;
+  typedef std::complex<Scalar> Complex;
+
+  inline void clear() { m_plans.clear(); }
+
+  // complex-to-complex forward FFT
+  inline void fwd(Complex *dst, const Complex *src, int nfft) {
+    get_plan(nfft, false, dst, src).fwd(fftw_cast(dst), fftw_cast(src), nfft);
+  }
+
+  // real-to-complex forward FFT
+  inline void fwd(Complex *dst, const Scalar *src, int nfft) {
+    get_plan(nfft, false, dst, src).fwd(fftw_cast(dst), fftw_cast(src), nfft);
+  }
+
+  // 2-d complex-to-complex
+  inline void fwd2(Complex *dst, const Complex *src, int n0, int n1) {
+    get_plan(n0, n1, false, dst, src).fwd2(fftw_cast(dst), fftw_cast(src), n0, n1);
+  }
+
+  // inverse complex-to-complex
+  inline void inv(Complex *dst, const Complex *src, int nfft) {
+    get_plan(nfft, true, dst, src).inv(fftw_cast(dst), fftw_cast(src), nfft);
+  }
+
+  // half-complex to scalar
+  inline void inv(Scalar *dst, const Complex *src, int nfft) {
+    get_plan(nfft, true, dst, src).inv(fftw_cast(dst), fftw_cast(src), nfft);
+  }
+
+  // 2-d complex-to-complex
+  inline void inv2(Complex *dst, const Complex *src, int n0, int n1) {
+    get_plan(n0, n1, true, dst, src).inv2(fftw_cast(dst), fftw_cast(src), n0, n1);
+  }
+
+ protected:
+  typedef fftw_plan<Scalar> PlanData;
+
+  typedef Eigen::numext::int64_t int64_t;
+
+  typedef std::map<int64_t, PlanData> PlanMap;
+
+  PlanMap m_plans;
+
+  inline PlanData &get_plan(int nfft, bool inverse, void *dst, const void *src) {
+    bool inplace = (dst == src);
+    bool aligned = ((reinterpret_cast<size_t>(src) & 15) | (reinterpret_cast<size_t>(dst) & 15)) == 0;
+    int64_t key = ((nfft << 3) | (inverse << 2) | (inplace << 1) | aligned) << 1;
+    return m_plans[key];
+  }
+
+  inline PlanData &get_plan(int n0, int n1, bool inverse, void *dst, const void *src) {
+    bool inplace = (dst == src);
+    bool aligned = ((reinterpret_cast<size_t>(src) & 15) | (reinterpret_cast<size_t>(dst) & 15)) == 0;
+    int64_t key = (((((int64_t)n0) << 30) | (n1 << 3) | (inverse << 2) | (inplace << 1) | aligned) << 1) + 1;
+    return m_plans[key];
+  }
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
diff --git a/inst/include/unsupported/Eigen/src/FFT/imklfft_impl.h b/inst/include/unsupported/Eigen/src/FFT/imklfft_impl.h
new file mode 100644
index 00000000..0c5bb269
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/FFT/imklfft_impl.h
@@ -0,0 +1,248 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include <mkl_dfti.h>
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include <complex>
+#include <memory>
+
+namespace Eigen {
+namespace internal {
+namespace imklfft {
+
+#define RUN_OR_ASSERT(EXPR, ERROR_MSG)                    \
+  {                                                       \
+    MKL_LONG status = (EXPR);                             \
+    eigen_assert(status == DFTI_NO_ERROR && (ERROR_MSG)); \
+  };
+
+inline MKL_Complex16* complex_cast(const std::complex<double>* p) {
+  return const_cast<MKL_Complex16*>(reinterpret_cast<const MKL_Complex16*>(p));
+}
+
+inline MKL_Complex8* complex_cast(const std::complex<float>* p) {
+  return const_cast<MKL_Complex8*>(reinterpret_cast<const MKL_Complex8*>(p));
+}
+
+/*
+ * Parameters:
+ * precision: enum, Precision of the transform: DFTI_SINGLE or DFTI_DOUBLE.
+ * forward_domain: enum, Forward domain of the transform: DFTI_COMPLEX or
+ * DFTI_REAL. dimension: MKL_LONG Dimension of the transform. sizes: MKL_LONG if
+ * dimension = 1.Length of the transform for a one-dimensional transform. sizes:
+ * Array of type MKL_LONG otherwise. Lengths of each dimension for a
+ * multi-dimensional transform.
+ */
+inline void configure_descriptor(std::shared_ptr<DFTI_DESCRIPTOR>& handl, enum DFTI_CONFIG_VALUE precision,
+                                 enum DFTI_CONFIG_VALUE forward_domain, MKL_LONG dimension, MKL_LONG* sizes) {
+  eigen_assert(dimension == 1 || dimension == 2 && "Transformation dimension must be less than 3.");
+
+  DFTI_DESCRIPTOR_HANDLE res = nullptr;
+  if (dimension == 1) {
+    RUN_OR_ASSERT(DftiCreateDescriptor(&res, precision, forward_domain, dimension, *sizes),
+                  "DftiCreateDescriptor failed.")
+    handl.reset(res, [](DFTI_DESCRIPTOR_HANDLE handle) { DftiFreeDescriptor(&handle); });
+    if (forward_domain == DFTI_REAL) {
+      // Set CCE storage
+      RUN_OR_ASSERT(DftiSetValue(handl.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX),
+                    "DftiSetValue failed.")
+    }
+  } else {
+    RUN_OR_ASSERT(DftiCreateDescriptor(&res, precision, DFTI_COMPLEX, dimension, sizes), "DftiCreateDescriptor failed.")
+    handl.reset(res, [](DFTI_DESCRIPTOR_HANDLE handle) { DftiFreeDescriptor(&handle); });
+  }
+
+  RUN_OR_ASSERT(DftiSetValue(handl.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE), "DftiSetValue failed.")
+  RUN_OR_ASSERT(DftiCommitDescriptor(handl.get()), "DftiCommitDescriptor failed.")
+}
+
+template <typename T>
+struct plan {};
+
+template <>
+struct plan<float> {
+  typedef float scalar_type;
+  typedef MKL_Complex8 complex_type;
+
+  std::shared_ptr<DFTI_DESCRIPTOR> m_plan;
+
+  plan() = default;
+
+  enum DFTI_CONFIG_VALUE precision = DFTI_SINGLE;
+
+  inline void forward(complex_type* dst, complex_type* src, MKL_LONG nfft) {
+    if (m_plan == 0) {
+      configure_descriptor(m_plan, precision, DFTI_COMPLEX, 1, &nfft);
+    }
+    RUN_OR_ASSERT(DftiComputeForward(m_plan.get(), src, dst), "DftiComputeForward failed.")
+  }
+
+  inline void inverse(complex_type* dst, complex_type* src, MKL_LONG nfft) {
+    if (m_plan == 0) {
+      configure_descriptor(m_plan, precision, DFTI_COMPLEX, 1, &nfft);
+    }
+    RUN_OR_ASSERT(DftiComputeBackward(m_plan.get(), src, dst), "DftiComputeBackward failed.")
+  }
+
+  inline void forward(complex_type* dst, scalar_type* src, MKL_LONG nfft) {
+    if (m_plan == 0) {
+      configure_descriptor(m_plan, precision, DFTI_REAL, 1, &nfft);
+    }
+    RUN_OR_ASSERT(DftiComputeForward(m_plan.get(), src, dst), "DftiComputeForward failed.")
+  }
+
+  inline void inverse(scalar_type* dst, complex_type* src, MKL_LONG nfft) {
+    if (m_plan == 0) {
+      configure_descriptor(m_plan, precision, DFTI_REAL, 1, &nfft);
+    }
+    RUN_OR_ASSERT(DftiComputeBackward(m_plan.get(), src, dst), "DftiComputeBackward failed.")
+  }
+
+  inline void forward2(complex_type* dst, complex_type* src, int n0, int n1) {
+    if (m_plan == 0) {
+      MKL_LONG sizes[2] = {n0, n1};
+      configure_descriptor(m_plan, precision, DFTI_COMPLEX, 2, sizes);
+    }
+    RUN_OR_ASSERT(DftiComputeForward(m_plan.get(), src, dst), "DftiComputeForward failed.")
+  }
+
+  inline void inverse2(complex_type* dst, complex_type* src, int n0, int n1) {
+    if (m_plan == 0) {
+      MKL_LONG sizes[2] = {n0, n1};
+      configure_descriptor(m_plan, precision, DFTI_COMPLEX, 2, sizes);
+    }
+    RUN_OR_ASSERT(DftiComputeBackward(m_plan.get(), src, dst), "DftiComputeBackward failed.")
+  }
+};
+
+template <>
+struct plan<double> {
+  typedef double scalar_type;
+  typedef MKL_Complex16 complex_type;
+
+  std::shared_ptr<DFTI_DESCRIPTOR> m_plan;
+
+  plan() = default;
+
+  enum DFTI_CONFIG_VALUE precision = DFTI_DOUBLE;
+
+  inline void forward(complex_type* dst, complex_type* src, MKL_LONG nfft) {
+    if (m_plan == 0) {
+      configure_descriptor(m_plan, precision, DFTI_COMPLEX, 1, &nfft);
+    }
+    RUN_OR_ASSERT(DftiComputeForward(m_plan.get(), src, dst), "DftiComputeForward failed.")
+  }
+
+  inline void inverse(complex_type* dst, complex_type* src, MKL_LONG nfft) {
+    if (m_plan == 0) {
+      configure_descriptor(m_plan, precision, DFTI_COMPLEX, 1, &nfft);
+    }
+    RUN_OR_ASSERT(DftiComputeBackward(m_plan.get(), src, dst), "DftiComputeBackward failed.")
+  }
+
+  inline void forward(complex_type* dst, scalar_type* src, MKL_LONG nfft) {
+    if (m_plan == 0) {
+      configure_descriptor(m_plan, precision, DFTI_REAL, 1, &nfft);
+    }
+    RUN_OR_ASSERT(DftiComputeForward(m_plan.get(), src, dst), "DftiComputeForward failed.")
+  }
+
+  inline void inverse(scalar_type* dst, complex_type* src, MKL_LONG nfft) {
+    if (m_plan == 0) {
+      configure_descriptor(m_plan, precision, DFTI_REAL, 1, &nfft);
+    }
+    RUN_OR_ASSERT(DftiComputeBackward(m_plan.get(), src, dst), "DftiComputeBackward failed.")
+  }
+
+  inline void forward2(complex_type* dst, complex_type* src, int n0, int n1) {
+    if (m_plan == 0) {
+      MKL_LONG sizes[2] = {n0, n1};
+      configure_descriptor(m_plan, precision, DFTI_COMPLEX, 2, sizes);
+    }
+    RUN_OR_ASSERT(DftiComputeForward(m_plan.get(), src, dst), "DftiComputeForward failed.")
+  }
+
+  inline void inverse2(complex_type* dst, complex_type* src, int n0, int n1) {
+    if (m_plan == 0) {
+      MKL_LONG sizes[2] = {n0, n1};
+      configure_descriptor(m_plan, precision, DFTI_COMPLEX, 2, sizes);
+    }
+    RUN_OR_ASSERT(DftiComputeBackward(m_plan.get(), src, dst), "DftiComputeBackward failed.")
+  }
+};
+
+template <typename Scalar_>
+struct imklfft_impl {
+  typedef Scalar_ Scalar;
+  typedef std::complex<Scalar> Complex;
+
+  inline void clear() { m_plans.clear(); }
+
+  // complex-to-complex forward FFT
+  inline void fwd(Complex* dst, const Complex* src, int nfft) {
+    MKL_LONG size = nfft;
+    get_plan(nfft, dst, src).forward(complex_cast(dst), complex_cast(src), size);
+  }
+
+  // real-to-complex forward FFT
+  inline void fwd(Complex* dst, const Scalar* src, int nfft) {
+    MKL_LONG size = nfft;
+    get_plan(nfft, dst, src).forward(complex_cast(dst), const_cast<Scalar*>(src), nfft);
+  }
+
+  // 2-d complex-to-complex
+  inline void fwd2(Complex* dst, const Complex* src, int n0, int n1) {
+    get_plan(n0, n1, dst, src).forward2(complex_cast(dst), complex_cast(src), n0, n1);
+  }
+
+  // inverse complex-to-complex
+  inline void inv(Complex* dst, const Complex* src, int nfft) {
+    MKL_LONG size = nfft;
+    get_plan(nfft, dst, src).inverse(complex_cast(dst), complex_cast(src), nfft);
+  }
+
+  // half-complex to scalar
+  inline void inv(Scalar* dst, const Complex* src, int nfft) {
+    MKL_LONG size = nfft;
+    get_plan(nfft, dst, src).inverse(const_cast<Scalar*>(dst), complex_cast(src), nfft);
+  }
+
+  // 2-d complex-to-complex
+  inline void inv2(Complex* dst, const Complex* src, int n0, int n1) {
+    get_plan(n0, n1, dst, src).inverse2(complex_cast(dst), complex_cast(src), n0, n1);
+  }
+
+ private:
+  std::map<int64_t, plan<Scalar>> m_plans;
+
+  inline plan<Scalar>& get_plan(int nfft, void* dst, const void* src) {
+    int inplace = dst == src ? 1 : 0;
+    int aligned = ((reinterpret_cast<size_t>(src) & 15) | (reinterpret_cast<size_t>(dst) & 15)) == 0 ? 1 : 0;
+    int64_t key = ((nfft << 2) | (inplace << 1) | aligned) << 1;
+
+    // Create element if key does not exist.
+    return m_plans[key];
+  }
+
+  inline plan<Scalar>& get_plan(int n0, int n1, void* dst, const void* src) {
+    int inplace = (dst == src) ? 1 : 0;
+    int aligned = ((reinterpret_cast<size_t>(src) & 15) | (reinterpret_cast<size_t>(dst) & 15)) == 0 ? 1 : 0;
+    int64_t key = (((((int64_t)n0) << 31) | (n1 << 2) | (inplace << 1) | aligned) << 1) + 1;
+
+    // Create element if key does not exist.
+    return m_plans[key];
+  }
+};
+
+#undef RUN_OR_ASSERT
+
+}  // namespace imklfft
+}  // namespace internal
+}  // namespace Eigen
diff --git a/inst/include/unsupported/Eigen/src/FFT/kissfft_impl.h b/inst/include/unsupported/Eigen/src/FFT/kissfft_impl.h
new file mode 100644
index 00000000..c201d804
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/FFT/kissfft_impl.h
@@ -0,0 +1,416 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2009 Mark Borgerding mark a borgerding net
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+// This FFT implementation was derived from kissfft http:sourceforge.net/projects/kissfft
+// Copyright 2003-2009 Mark Borgerding
+
+template <typename Scalar_>
+struct kiss_cpx_fft {
+  typedef Scalar_ Scalar;
+  typedef std::complex<Scalar> Complex;
+  std::vector<Complex> m_twiddles;
+  std::vector<int> m_stageRadix;
+  std::vector<int> m_stageRemainder;
+  std::vector<Complex> m_scratchBuf;
+  bool m_inverse;
+
+  static const Scalar m_pi4;  // constant pi / 4
+
+  inline void make_twiddles(int nfft, bool inverse) {
+    using numext::cos;
+    using numext::sin;
+    m_inverse = inverse;
+    m_twiddles.resize(nfft);
+    Scalar phinc = m_pi4 / nfft;
+    Scalar flip = inverse ? Scalar(1) : Scalar(-1);
+    m_twiddles[0] = Complex(Scalar(1), Scalar(0));
+    if ((nfft & 1) == 0) m_twiddles[nfft / 2] = Complex(Scalar(-1), Scalar(0));
+    int i = 1;
+    for (; i * 8 < nfft; ++i) {
+      Scalar c = Scalar(cos(i * 8 * phinc));
+      Scalar s = Scalar(sin(i * 8 * phinc));
+      m_twiddles[i] = Complex(c, s * flip);
+      m_twiddles[nfft - i] = Complex(c, -s * flip);
+    }
+    for (; i * 4 < nfft; ++i) {
+      Scalar c = Scalar(cos((2 * nfft - 8 * i) * phinc));
+      Scalar s = Scalar(sin((2 * nfft - 8 * i) * phinc));
+      m_twiddles[i] = Complex(s, c * flip);
+      m_twiddles[nfft - i] = Complex(s, -c * flip);
+    }
+    for (; i * 8 < 3 * nfft; ++i) {
+      Scalar c = Scalar(cos((8 * i - 2 * nfft) * phinc));
+      Scalar s = Scalar(sin((8 * i - 2 * nfft) * phinc));
+      m_twiddles[i] = Complex(-s, c * flip);
+      m_twiddles[nfft - i] = Complex(-s, -c * flip);
+    }
+    for (; i * 2 < nfft; ++i) {
+      Scalar c = Scalar(cos((4 * nfft - 8 * i) * phinc));
+      Scalar s = Scalar(sin((4 * nfft - 8 * i) * phinc));
+      m_twiddles[i] = Complex(-c, s * flip);
+      m_twiddles[nfft - i] = Complex(-c, -s * flip);
+    }
+  }
+
+  void factorize(int nfft) {
+    // start factoring out 4's, then 2's, then 3,5,7,9,...
+    int n = nfft;
+    int p = 4;
+    do {
+      while (n % p) {
+        switch (p) {
+          case 4:
+            p = 2;
+            break;
+          case 2:
+            p = 3;
+            break;
+          default:
+            p += 2;
+            break;
+        }
+        if (p * p > n) p = n;  // impossible to have a factor > sqrt(n)
+      }
+      n /= p;
+      m_stageRadix.push_back(p);
+      m_stageRemainder.push_back(n);
+      if (p > 5) m_scratchBuf.resize(p);  // scratchbuf will be needed in bfly_generic
+    } while (n > 1);
+  }
+
+  template <typename Src_>
+  inline void work(int stage, Complex *xout, const Src_ *xin, size_t fstride, size_t in_stride) {
+    int p = m_stageRadix[stage];
+    int m = m_stageRemainder[stage];
+    Complex *Fout_beg = xout;
+    Complex *Fout_end = xout + p * m;
+
+    if (m > 1) {
+      do {
+        // recursive call:
+        // DFT of size m*p performed by doing
+        // p instances of smaller DFTs of size m,
+        // each one takes a decimated version of the input
+        work(stage + 1, xout, xin, fstride * p, in_stride);
+        xin += fstride * in_stride;
+      } while ((xout += m) != Fout_end);
+    } else {
+      do {
+        *xout = *xin;
+        xin += fstride * in_stride;
+      } while (++xout != Fout_end);
+    }
+    xout = Fout_beg;
+
+    // recombine the p smaller DFTs
+    switch (p) {
+      case 2:
+        bfly2(xout, fstride, m);
+        break;
+      case 3:
+        bfly3(xout, fstride, m);
+        break;
+      case 4:
+        bfly4(xout, fstride, m);
+        break;
+      case 5:
+        bfly5(xout, fstride, m);
+        break;
+      default:
+        bfly_generic(xout, fstride, m, p);
+        break;
+    }
+  }
+
+  inline void bfly2(Complex *Fout, const size_t fstride, int m) {
+    for (int k = 0; k < m; ++k) {
+      Complex t = Fout[m + k] * m_twiddles[k * fstride];
+      Fout[m + k] = Fout[k] - t;
+      Fout[k] += t;
+    }
+  }
+
+  inline void bfly4(Complex *Fout, const size_t fstride, const size_t m) {
+    Complex scratch[6];
+    int negative_if_inverse = m_inverse * -2 + 1;
+    for (size_t k = 0; k < m; ++k) {
+      scratch[0] = Fout[k + m] * m_twiddles[k * fstride];
+      scratch[1] = Fout[k + 2 * m] * m_twiddles[k * fstride * 2];
+      scratch[2] = Fout[k + 3 * m] * m_twiddles[k * fstride * 3];
+      scratch[5] = Fout[k] - scratch[1];
+
+      Fout[k] += scratch[1];
+      scratch[3] = scratch[0] + scratch[2];
+      scratch[4] = scratch[0] - scratch[2];
+      scratch[4] = Complex(scratch[4].imag() * negative_if_inverse, -scratch[4].real() * negative_if_inverse);
+
+      Fout[k + 2 * m] = Fout[k] - scratch[3];
+      Fout[k] += scratch[3];
+      Fout[k + m] = scratch[5] + scratch[4];
+      Fout[k + 3 * m] = scratch[5] - scratch[4];
+    }
+  }
+
+  inline void bfly3(Complex *Fout, const size_t fstride, const size_t m) {
+    size_t k = m;
+    const size_t m2 = 2 * m;
+    Complex *tw1, *tw2;
+    Complex scratch[5];
+    Complex epi3;
+    epi3 = m_twiddles[fstride * m];
+
+    tw1 = tw2 = &m_twiddles[0];
+
+    do {
+      scratch[1] = Fout[m] * *tw1;
+      scratch[2] = Fout[m2] * *tw2;
+
+      scratch[3] = scratch[1] + scratch[2];
+      scratch[0] = scratch[1] - scratch[2];
+      tw1 += fstride;
+      tw2 += fstride * 2;
+      Fout[m] = Complex(Fout->real() - Scalar(.5) * scratch[3].real(), Fout->imag() - Scalar(.5) * scratch[3].imag());
+      scratch[0] *= epi3.imag();
+      *Fout += scratch[3];
+      Fout[m2] = Complex(Fout[m].real() + scratch[0].imag(), Fout[m].imag() - scratch[0].real());
+      Fout[m] += Complex(-scratch[0].imag(), scratch[0].real());
+      ++Fout;
+    } while (--k);
+  }
+
+  inline void bfly5(Complex *Fout, const size_t fstride, const size_t m) {
+    Complex *Fout0, *Fout1, *Fout2, *Fout3, *Fout4;
+    size_t u;
+    Complex scratch[13];
+    Complex *twiddles = &m_twiddles[0];
+    Complex *tw;
+    Complex ya, yb;
+    ya = twiddles[fstride * m];
+    yb = twiddles[fstride * 2 * m];
+
+    Fout0 = Fout;
+    Fout1 = Fout0 + m;
+    Fout2 = Fout0 + 2 * m;
+    Fout3 = Fout0 + 3 * m;
+    Fout4 = Fout0 + 4 * m;
+
+    tw = twiddles;
+    for (u = 0; u < m; ++u) {
+      scratch[0] = *Fout0;
+
+      scratch[1] = *Fout1 * tw[u * fstride];
+      scratch[2] = *Fout2 * tw[2 * u * fstride];
+      scratch[3] = *Fout3 * tw[3 * u * fstride];
+      scratch[4] = *Fout4 * tw[4 * u * fstride];
+
+      scratch[7] = scratch[1] + scratch[4];
+      scratch[10] = scratch[1] - scratch[4];
+      scratch[8] = scratch[2] + scratch[3];
+      scratch[9] = scratch[2] - scratch[3];
+
+      *Fout0 += scratch[7];
+      *Fout0 += scratch[8];
+
+      scratch[5] = scratch[0] + Complex((scratch[7].real() * ya.real()) + (scratch[8].real() * yb.real()),
+                                        (scratch[7].imag() * ya.real()) + (scratch[8].imag() * yb.real()));
+
+      scratch[6] = Complex((scratch[10].imag() * ya.imag()) + (scratch[9].imag() * yb.imag()),
+                           -(scratch[10].real() * ya.imag()) - (scratch[9].real() * yb.imag()));
+
+      *Fout1 = scratch[5] - scratch[6];
+      *Fout4 = scratch[5] + scratch[6];
+
+      scratch[11] = scratch[0] + Complex((scratch[7].real() * yb.real()) + (scratch[8].real() * ya.real()),
+                                         (scratch[7].imag() * yb.real()) + (scratch[8].imag() * ya.real()));
+
+      scratch[12] = Complex(-(scratch[10].imag() * yb.imag()) + (scratch[9].imag() * ya.imag()),
+                            (scratch[10].real() * yb.imag()) - (scratch[9].real() * ya.imag()));
+
+      *Fout2 = scratch[11] + scratch[12];
+      *Fout3 = scratch[11] - scratch[12];
+
+      ++Fout0;
+      ++Fout1;
+      ++Fout2;
+      ++Fout3;
+      ++Fout4;
+    }
+  }
+
+  /* perform the butterfly for one stage of a mixed radix FFT */
+  inline void bfly_generic(Complex *Fout, const size_t fstride, int m, int p) {
+    int u, k, q1, q;
+    Complex *twiddles = &m_twiddles[0];
+    Complex t;
+    int Norig = static_cast<int>(m_twiddles.size());
+    Complex *scratchbuf = &m_scratchBuf[0];
+
+    for (u = 0; u < m; ++u) {
+      k = u;
+      for (q1 = 0; q1 < p; ++q1) {
+        scratchbuf[q1] = Fout[k];
+        k += m;
+      }
+
+      k = u;
+      for (q1 = 0; q1 < p; ++q1) {
+        int twidx = 0;
+        Fout[k] = scratchbuf[0];
+        for (q = 1; q < p; ++q) {
+          twidx += static_cast<int>(fstride) * k;
+          if (twidx >= Norig) twidx -= Norig;
+          t = scratchbuf[q] * twiddles[twidx];
+          Fout[k] += t;
+        }
+        k += m;
+      }
+    }
+  }
+};
+
+template <typename _Scalar>
+const typename kiss_cpx_fft<_Scalar>::Scalar kiss_cpx_fft<_Scalar>::m_pi4 =
+    numext::atan(kiss_cpx_fft<_Scalar>::Scalar(1));
+
+template <typename Scalar_>
+struct kissfft_impl {
+  typedef Scalar_ Scalar;
+  typedef std::complex<Scalar> Complex;
+
+  void clear() {
+    m_plans.clear();
+    m_realTwiddles.clear();
+  }
+
+  inline void fwd(Complex *dst, const Complex *src, int nfft) { get_plan(nfft, false).work(0, dst, src, 1, 1); }
+
+  inline void fwd2(Complex *dst, const Complex *src, int n0, int n1) {
+    EIGEN_UNUSED_VARIABLE(dst);
+    EIGEN_UNUSED_VARIABLE(src);
+    EIGEN_UNUSED_VARIABLE(n0);
+    EIGEN_UNUSED_VARIABLE(n1);
+  }
+
+  inline void inv2(Complex *dst, const Complex *src, int n0, int n1) {
+    EIGEN_UNUSED_VARIABLE(dst);
+    EIGEN_UNUSED_VARIABLE(src);
+    EIGEN_UNUSED_VARIABLE(n0);
+    EIGEN_UNUSED_VARIABLE(n1);
+  }
+
+  // real-to-complex forward FFT
+  // perform two FFTs of src even and src odd
+  // then twiddle to recombine them into the half-spectrum format
+  // then fill in the conjugate symmetric half
+  inline void fwd(Complex *dst, const Scalar *src, int nfft) {
+    if (nfft & 3) {
+      // use generic mode for odd
+      m_tmpBuf1.resize(nfft);
+      get_plan(nfft, false).work(0, &m_tmpBuf1[0], src, 1, 1);
+      std::copy(m_tmpBuf1.begin(), m_tmpBuf1.begin() + (nfft >> 1) + 1, dst);
+    } else {
+      int ncfft = nfft >> 1;
+      int ncfft2 = nfft >> 2;
+      Complex *rtw = real_twiddles(ncfft2);
+
+      // use optimized mode for even real
+      fwd(dst, reinterpret_cast<const Complex *>(src), ncfft);
+      Complex dc(dst[0].real() + dst[0].imag());
+      Complex nyquist(dst[0].real() - dst[0].imag());
+      int k;
+      for (k = 1; k <= ncfft2; ++k) {
+        Complex fpk = dst[k];
+        Complex fpnk = conj(dst[ncfft - k]);
+        Complex f1k = fpk + fpnk;
+        Complex f2k = fpk - fpnk;
+        Complex tw = f2k * rtw[k - 1];
+        dst[k] = (f1k + tw) * Scalar(.5);
+        dst[ncfft - k] = conj(f1k - tw) * Scalar(.5);
+      }
+      dst[0] = dc;
+      dst[ncfft] = nyquist;
+    }
+  }
+
+  // inverse complex-to-complex
+  inline void inv(Complex *dst, const Complex *src, int nfft) { get_plan(nfft, true).work(0, dst, src, 1, 1); }
+
+  // half-complex to scalar
+  inline void inv(Scalar *dst, const Complex *src, int nfft) {
+    if (nfft & 3) {
+      m_tmpBuf1.resize(nfft);
+      m_tmpBuf2.resize(nfft);
+      std::copy(src, src + (nfft >> 1) + 1, m_tmpBuf1.begin());
+      for (int k = 1; k < (nfft >> 1) + 1; ++k) m_tmpBuf1[nfft - k] = conj(m_tmpBuf1[k]);
+      inv(&m_tmpBuf2[0], &m_tmpBuf1[0], nfft);
+      for (int k = 0; k < nfft; ++k) dst[k] = m_tmpBuf2[k].real();
+    } else {
+      // optimized version for multiple of 4
+      int ncfft = nfft >> 1;
+      int ncfft2 = nfft >> 2;
+      Complex *rtw = real_twiddles(ncfft2);
+      m_tmpBuf1.resize(ncfft);
+      m_tmpBuf1[0] = Complex(src[0].real() + src[ncfft].real(), src[0].real() - src[ncfft].real());
+      for (int k = 1; k <= ncfft / 2; ++k) {
+        Complex fk = src[k];
+        Complex fnkc = conj(src[ncfft - k]);
+        Complex fek = fk + fnkc;
+        Complex tmp = fk - fnkc;
+        Complex fok = tmp * conj(rtw[k - 1]);
+        m_tmpBuf1[k] = fek + fok;
+        m_tmpBuf1[ncfft - k] = conj(fek - fok);
+      }
+      get_plan(ncfft, true).work(0, reinterpret_cast<Complex *>(dst), &m_tmpBuf1[0], 1, 1);
+    }
+  }
+
+ protected:
+  typedef kiss_cpx_fft<Scalar> PlanData;
+  typedef std::map<int, PlanData> PlanMap;
+
+  PlanMap m_plans;
+  std::map<int, std::vector<Complex> > m_realTwiddles;
+  std::vector<Complex> m_tmpBuf1;
+  std::vector<Complex> m_tmpBuf2;
+
+  inline int PlanKey(int nfft, bool isinverse) const { return (nfft << 1) | int(isinverse); }
+
+  inline PlanData &get_plan(int nfft, bool inverse) {
+    // TODO look for PlanKey(nfft, ! inverse) and conjugate the twiddles
+    PlanData &pd = m_plans[PlanKey(nfft, inverse)];
+    if (pd.m_twiddles.size() == 0) {
+      pd.make_twiddles(nfft, inverse);
+      pd.factorize(nfft);
+    }
+    return pd;
+  }
+
+  inline Complex *real_twiddles(int ncfft2) {
+    using std::acos;
+    std::vector<Complex> &twidref = m_realTwiddles[ncfft2];  // creates new if not there
+    if ((int)twidref.size() != ncfft2) {
+      twidref.resize(ncfft2);
+      int ncfft = ncfft2 << 1;
+      Scalar pi = acos(Scalar(-1));
+      for (int k = 1; k <= ncfft2; ++k) twidref[k - 1] = exp(Complex(0, -pi * (Scalar(k) / ncfft + Scalar(.5))));
+    }
+    return &twidref[0];
+  }
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
diff --git a/inst/include/unsupported/Eigen/src/FFT/pocketfft_impl.h b/inst/include/unsupported/Eigen/src/FFT/pocketfft_impl.h
new file mode 100644
index 00000000..fce105bf
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/FFT/pocketfft_impl.h
@@ -0,0 +1,67 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename _Scalar>
+struct pocketfft_impl {
+  using Scalar = _Scalar;
+  using Complex = std::complex<Scalar>;
+  using shape_t = pocketfft::shape_t;
+  using stride_t = pocketfft::stride_t;
+
+  inline void clear() {}
+
+  inline void fwd(Complex* dst, const Scalar* src, int nfft) {
+    const shape_t shape_{static_cast<size_t>(nfft)};
+    const shape_t axes_{0};
+    const stride_t stride_in{sizeof(Scalar)};
+    const stride_t stride_out{sizeof(Complex)};
+    pocketfft::r2c(shape_, stride_in, stride_out, axes_, pocketfft::FORWARD, src, dst, static_cast<Scalar>(1));
+  }
+
+  inline void fwd(Complex* dst, const Complex* src, int nfft) {
+    const shape_t shape_{static_cast<size_t>(nfft)};
+    const shape_t axes_{0};
+    const stride_t stride_{sizeof(Complex)};
+    pocketfft::c2c(shape_, stride_, stride_, axes_, pocketfft::FORWARD, src, dst, static_cast<Scalar>(1));
+  }
+
+  inline void inv(Scalar* dst, const Complex* src, int nfft) {
+    const shape_t shape_{static_cast<size_t>(nfft)};
+    const shape_t axes_{0};
+    const stride_t stride_in{sizeof(Complex)};
+    const stride_t stride_out{sizeof(Scalar)};
+    pocketfft::c2r(shape_, stride_in, stride_out, axes_, pocketfft::BACKWARD, src, dst, static_cast<Scalar>(1));
+  }
+
+  inline void inv(Complex* dst, const Complex* src, int nfft) {
+    const shape_t shape_{static_cast<size_t>(nfft)};
+    const shape_t axes_{0};
+    const stride_t stride_{sizeof(Complex)};
+    pocketfft::c2c(shape_, stride_, stride_, axes_, pocketfft::BACKWARD, src, dst, static_cast<Scalar>(1));
+  }
+
+  inline void fwd2(Complex* dst, const Complex* src, int nfft0, int nfft1) {
+    const shape_t shape_{static_cast<size_t>(nfft0), static_cast<size_t>(nfft1)};
+    const shape_t axes_{0, 1};
+    const stride_t stride_{static_cast<ptrdiff_t>(sizeof(Complex) * nfft1), static_cast<ptrdiff_t>(sizeof(Complex))};
+    pocketfft::c2c(shape_, stride_, stride_, axes_, pocketfft::FORWARD, src, dst, static_cast<Scalar>(1));
+  }
+
+  inline void inv2(Complex* dst, const Complex* src, int nfft0, int nfft1) {
+    const shape_t shape_{static_cast<size_t>(nfft0), static_cast<size_t>(nfft1)};
+    const shape_t axes_{0, 1};
+    const stride_t stride_{static_cast<ptrdiff_t>(sizeof(Complex) * nfft1), static_cast<ptrdiff_t>(sizeof(Complex))};
+    pocketfft::c2c(shape_, stride_, stride_, axes_, pocketfft::BACKWARD, src, dst, static_cast<Scalar>(1));
+  }
+};
+
+}  // namespace internal
+}  // namespace Eigen
diff --git a/inst/include/unsupported/Eigen/src/IterativeSolvers/BiCGSTABL.h b/inst/include/unsupported/Eigen/src/IterativeSolvers/BiCGSTABL.h
new file mode 100644
index 00000000..0469a5a9
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/IterativeSolvers/BiCGSTABL.h
@@ -0,0 +1,339 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Chris Schoutrop <c.e.m.schoutrop@tue.nl>
+// Copyright (C) 2020 Jens Wehner <j.wehner@esciencecenter.nl>
+// Copyright (C) 2020 Jan van Dijk <j.v.dijk@tue.nl>
+// Copyright (C) 2020 Adithya Vijaykumar
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+/*
+
+  This implementation of BiCGStab(L) is based on the papers
+      General algorithm:
+      1. G.L.G. Sleijpen, D.R. Fokkema. (1993). BiCGstab(l) for linear equations
+  involving unsymmetric matrices with complex spectrum. Electronic Transactions
+  on Numerical Analysis. Polynomial step update:
+      2. G.L.G. Sleijpen, M.B. Van Gijzen. (2010) Exploiting BiCGstab(l)
+  strategies to induce dimension reduction SIAM Journal on Scientific Computing.
+      3. Fokkema, Diederik R. Enhanced implementation of BiCGstab (l) for
+  solving linear systems of equations. Universiteit Utrecht. Mathematisch
+  Instituut, 1996
+      4. Sleijpen, G. L., & van der Vorst, H. A. (1996). Reliable updated
+  residuals in hybrid Bi-CG methods. Computing, 56(2), 141-163.
+*/
+
+#ifndef EIGEN_BICGSTABL_H
+#define EIGEN_BICGSTABL_H
+
+namespace Eigen {
+
+namespace internal {
+/**     \internal Low-level bi conjugate gradient stabilized algorithm with L
+   additional residual minimization steps \param mat The matrix A \param rhs The
+   right hand side vector b \param x On input and initial solution, on output
+   the computed solution. \param precond A preconditioner being able to
+   efficiently solve for an approximation of Ax=b (regardless of b) \param iters
+   On input the max number of iteration, on output the number of performed
+   iterations. \param tol_error On input the tolerance error, on output an
+   estimation of the relative error. \param L On input Number of additional
+   GMRES steps to take. If L is too large (~20) instabilities occur. \return
+   false in the case of numerical issue, for example a break down of BiCGSTABL.
+*/
+template <typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+bool bicgstabl(const MatrixType &mat, const Rhs &rhs, Dest &x, const Preconditioner &precond, Index &iters,
+               typename Dest::RealScalar &tol_error, Index L) {
+  using numext::abs;
+  using numext::sqrt;
+  typedef typename Dest::RealScalar RealScalar;
+  typedef typename Dest::Scalar Scalar;
+  const Index N = rhs.size();
+  L = L < x.rows() ? L : x.rows();
+
+  Index k = 0;
+
+  const RealScalar tol = tol_error;
+  const Index maxIters = iters;
+
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+  typedef Matrix<Scalar, Dynamic, Dynamic, ColMajor> DenseMatrixType;
+
+  DenseMatrixType rHat(N, L + 1);
+  DenseMatrixType uHat(N, L + 1);
+
+  // We start with an initial guess x_0 and let us set r_0 as (residual
+  // calculated from x_0)
+  VectorType x0 = x;
+  rHat.col(0) = rhs - mat * x0;  // r_0
+
+  x.setZero();  // This will contain the updates to the solution.
+  // rShadow is arbitrary, but must never be orthogonal to any residual.
+  VectorType rShadow = VectorType::Random(N);
+
+  VectorType x_prime = x;
+
+  // Redundant: x is already set to 0
+  // x.setZero();
+  VectorType b_prime = rHat.col(0);
+
+  // Other vectors and scalars initialization
+  Scalar rho0 = 1.0;
+  Scalar alpha = 0.0;
+  Scalar omega = 1.0;
+
+  uHat.col(0).setZero();
+
+  bool bicg_convergence = false;
+
+  const RealScalar normb = rhs.stableNorm();
+  if (internal::isApprox(normb, RealScalar(0))) {
+    x.setZero();
+    iters = 0;
+    return true;
+  }
+  RealScalar normr = rHat.col(0).stableNorm();
+  RealScalar Mx = normr;
+  RealScalar Mr = normr;
+
+  // Keep track of the solution with the lowest residual
+  RealScalar normr_min = normr;
+  VectorType x_min = x_prime + x;
+
+  // Criterion for when to apply the group-wise update, conform ref 3.
+  const RealScalar delta = 0.01;
+
+  bool compute_res = false;
+  bool update_app = false;
+
+  while (normr > tol * normb && k < maxIters) {
+    rho0 *= -omega;
+
+    for (Index j = 0; j < L; ++j) {
+      const Scalar rho1 = rShadow.dot(rHat.col(j));
+
+      if (!(numext::isfinite)(rho1) || rho0 == RealScalar(0.0)) {
+        // We cannot continue computing, return the best solution found.
+        x += x_prime;
+
+        // Check if x is better than the best stored solution thus far.
+        normr = (rhs - mat * (precond.solve(x) + x0)).stableNorm();
+
+        if (normr > normr_min || !(numext::isfinite)(normr)) {
+          // x_min is a better solution than x, return x_min
+          x = x_min;
+          normr = normr_min;
+        }
+        tol_error = normr / normb;
+        iters = k;
+        // x contains the updates to x0, add those back to obtain the solution
+        x = precond.solve(x);
+        x += x0;
+        return (normr < tol * normb);
+      }
+
+      const Scalar beta = alpha * (rho1 / rho0);
+      rho0 = rho1;
+      // Update search directions
+      uHat.leftCols(j + 1) = rHat.leftCols(j + 1) - beta * uHat.leftCols(j + 1);
+      uHat.col(j + 1) = mat * precond.solve(uHat.col(j));
+      const Scalar sigma = rShadow.dot(uHat.col(j + 1));
+      alpha = rho1 / sigma;
+      // Update residuals
+      rHat.leftCols(j + 1) -= alpha * uHat.middleCols(1, j + 1);
+      rHat.col(j + 1) = mat * precond.solve(rHat.col(j));
+      // Complete BiCG iteration by updating x
+      x += alpha * uHat.col(0);
+      normr = rHat.col(0).stableNorm();
+      // Check for early exit
+      if (normr < tol * normb) {
+        /*
+          Convergence was achieved during BiCG step.
+          Without this check BiCGStab(L) fails for trivial matrices, such as
+          when the preconditioner already is the inverse, or the input matrix is
+          identity.
+        */
+        bicg_convergence = true;
+        break;
+      } else if (normr < normr_min) {
+        // We found an x with lower residual, keep this one.
+        x_min = x + x_prime;
+        normr_min = normr;
+      }
+    }
+    if (!bicg_convergence) {
+      /*
+        The polynomial/minimize residual step.
+
+        QR Householder method for argmin is more stable than (modified)
+        Gram-Schmidt, in the sense that there is less loss of orthogonality. It
+        is more accurate than solving the normal equations, since the normal
+        equations scale with condition number squared.
+      */
+      const VectorType gamma = rHat.rightCols(L).householderQr().solve(rHat.col(0));
+      x += rHat.leftCols(L) * gamma;
+      rHat.col(0) -= rHat.rightCols(L) * gamma;
+      uHat.col(0) -= uHat.rightCols(L) * gamma;
+      normr = rHat.col(0).stableNorm();
+      omega = gamma(L - 1);
+    }
+    if (normr < normr_min) {
+      // We found an x with lower residual, keep this one.
+      x_min = x + x_prime;
+      normr_min = normr;
+    }
+
+    k++;
+
+    /*
+      Reliable update part
+
+      The recursively computed residual can deviate from the actual residual
+      after several iterations. However, computing the residual from the
+      definition costs extra MVs and should not be done at each iteration. The
+      reliable update strategy computes the true residual from the definition:
+      r=b-A*x at strategic intervals. Furthermore a "group wise update" strategy
+      is used to combine updates, which improves accuracy.
+    */
+
+    // Maximum norm of residuals since last update of x.
+    Mx = numext::maxi(Mx, normr);
+    // Maximum norm of residuals since last computation of the true residual.
+    Mr = numext::maxi(Mr, normr);
+
+    if (normr < delta * normb && normb <= Mx) {
+      update_app = true;
+    }
+
+    if (update_app || (normr < delta * Mr && normb <= Mr)) {
+      compute_res = true;
+    }
+
+    if (bicg_convergence) {
+      update_app = true;
+      compute_res = true;
+      bicg_convergence = false;
+    }
+
+    if (compute_res) {
+      // Explicitly compute residual from the definition
+
+      // This is equivalent to the shifted version of rhs - mat *
+      // (precond.solve(x)+x0)
+      rHat.col(0) = b_prime - mat * precond.solve(x);
+      normr = rHat.col(0).stableNorm();
+      Mr = normr;
+
+      if (update_app) {
+        // After the group wise update, the original problem is translated to a
+        // shifted one.
+        x_prime += x;
+        x.setZero();
+        b_prime = rHat.col(0);
+        Mx = normr;
+      }
+    }
+    if (normr < normr_min) {
+      // We found an x with lower residual, keep this one.
+      x_min = x + x_prime;
+      normr_min = normr;
+    }
+
+    compute_res = false;
+    update_app = false;
+  }
+
+  // Convert internal variable to the true solution vector x
+  x += x_prime;
+
+  normr = (rhs - mat * (precond.solve(x) + x0)).stableNorm();
+  if (normr > normr_min || !(numext::isfinite)(normr)) {
+    // x_min is a better solution than x, return x_min
+    x = x_min;
+    normr = normr_min;
+  }
+  tol_error = normr / normb;
+  iters = k;
+
+  // x contains the updates to x0, add those back to obtain the solution
+  x = precond.solve(x);
+  x += x0;
+  return true;
+}
+
+}  // namespace internal
+
+template <typename MatrixType_, typename Preconditioner_ = DiagonalPreconditioner<typename MatrixType_::Scalar>>
+class BiCGSTABL;
+
+namespace internal {
+
+template <typename MatrixType_, typename Preconditioner_>
+struct traits<Eigen::BiCGSTABL<MatrixType_, Preconditioner_>> {
+  typedef MatrixType_ MatrixType;
+  typedef Preconditioner_ Preconditioner;
+};
+
+}  // namespace internal
+
+template <typename MatrixType_, typename Preconditioner_>
+class BiCGSTABL : public IterativeSolverBase<BiCGSTABL<MatrixType_, Preconditioner_>> {
+  typedef IterativeSolverBase<BiCGSTABL> Base;
+  using Base::m_error;
+  using Base::m_info;
+  using Base::m_isInitialized;
+  using Base::m_iterations;
+  using Base::matrix;
+  Index m_L;
+
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef Preconditioner_ Preconditioner;
+
+  /** Default constructor. */
+  BiCGSTABL() : m_L(2) {}
+
+  /**
+  Initialize the solver with matrix \a A for further \c Ax=b solving.
+
+  This constructor is a shortcut for the default constructor followed
+  by a call to compute().
+
+  \warning this class stores a reference to the matrix A as well as some
+  precomputed values that depend on it. Therefore, if \a A is changed
+  this class becomes invalid. Call compute() to update it with the new
+  matrix A, or modify a copy of A.
+  */
+  template <typename MatrixDerived>
+  explicit BiCGSTABL(const EigenBase<MatrixDerived> &A) : Base(A.derived()), m_L(2) {}
+
+  /** \internal */
+  /** Loops over the number of columns of b and does the following:
+    1. sets the tolerance and maxIterations
+    2. Calls the function that has the core solver routine
+  */
+  template <typename Rhs, typename Dest>
+  void _solve_vector_with_guess_impl(const Rhs &b, Dest &x) const {
+    m_iterations = Base::maxIterations();
+
+    m_error = Base::m_tolerance;
+
+    bool ret = internal::bicgstabl(matrix(), b, x, Base::m_preconditioner, m_iterations, m_error, m_L);
+    m_info = (!ret) ? NumericalIssue : m_error <= Base::m_tolerance ? Success : NoConvergence;
+  }
+
+  /** Sets the parameter L, indicating how many minimize residual steps are
+   * used. Default: 2 */
+  void setL(Index L) {
+    eigen_assert(L >= 1 && "L needs to be positive");
+    m_L = L;
+  }
+};
+
+}  // namespace Eigen
+
+#endif /* EIGEN_BICGSTABL_H */
diff --git a/inst/include/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h b/inst/include/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
deleted file mode 100644
index dc0093eb..00000000
--- a/inst/include/unsupported/Eigen/src/IterativeSolvers/ConstrainedConjGrad.h
+++ /dev/null
@@ -1,189 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Gael Guennebaud <gael.guennebaud@inria.fr>
-
-/* NOTE The functions of this file have been adapted from the GMM++ library */
-
-//========================================================================
-//
-// Copyright (C) 2002-2007 Yves Renard
-//
-// This file is a part of GETFEM++
-//
-// Getfem++ is free software; you can redistribute it and/or modify
-// it under the terms of the GNU Lesser General Public License as
-// published by the Free Software Foundation; version 2.1 of the License.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU Lesser General Public License for more details.
-// You should have received a copy of the GNU Lesser General Public
-// License along with this program; if not, write to the Free Software
-// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301,
-// USA.
-//
-//========================================================================
-
-#include "../../../../Eigen/src/Core/util/NonMPL2.h"
-
-#ifndef EIGEN_CONSTRAINEDCG_H
-#define EIGEN_CONSTRAINEDCG_H
-
-#include <Eigen/Core>
-
-namespace Eigen { 
-
-namespace internal {
-
-/** \ingroup IterativeSolvers_Module
-  * Compute the pseudo inverse of the non-square matrix C such that
-  * \f$ CINV = (C * C^T)^{-1} * C \f$ based on a conjugate gradient method.
-  *
-  * This function is internally used by constrained_cg.
-  */
-template <typename CMatrix, typename CINVMatrix>
-void pseudo_inverse(const CMatrix &C, CINVMatrix &CINV)
-{
-  // optimisable : copie de la ligne, precalcul de C * trans(C).
-  typedef typename CMatrix::Scalar Scalar;
-  typedef typename CMatrix::Index Index;
-  // FIXME use sparse vectors ?
-  typedef Matrix<Scalar,Dynamic,1> TmpVec;
-
-  Index rows = C.rows(), cols = C.cols();
-
-  TmpVec d(rows), e(rows), l(cols), p(rows), q(rows), r(rows);
-  Scalar rho, rho_1, alpha;
-  d.setZero();
-
-  typedef Triplet<double> T;
-  std::vector<T> tripletList;
-    
-  for (Index i = 0; i < rows; ++i)
-  {
-    d[i] = 1.0;
-    rho = 1.0;
-    e.setZero();
-    r = d;
-    p = d;
-
-    while (rho >= 1e-38)
-    { /* conjugate gradient to compute e             */
-      /* which is the i-th row of inv(C * trans(C))  */
-      l = C.transpose() * p;
-      q = C * l;
-      alpha = rho / p.dot(q);
-      e +=  alpha * p;
-      r += -alpha * q;
-      rho_1 = rho;
-      rho = r.dot(r);
-      p = (rho/rho_1) * p + r;
-    }
-
-    l = C.transpose() * e; // l is the i-th row of CINV
-    // FIXME add a generic "prune/filter" expression for both dense and sparse object to sparse
-    for (Index j=0; j<l.size(); ++j)
-      if (l[j]<1e-15)
-	tripletList.push_back(T(i,j,l(j)));
-
-	
-    d[i] = 0.0;
-  }
-  CINV.setFromTriplets(tripletList.begin(), tripletList.end());
-}
-
-
-
-/** \ingroup IterativeSolvers_Module
-  * Constrained conjugate gradient
-  *
-  * Computes the minimum of \f$ 1/2((Ax).x) - bx \f$ under the contraint \f$ Cx \le f \f$
-  */
-template<typename TMatrix, typename CMatrix,
-         typename VectorX, typename VectorB, typename VectorF>
-void constrained_cg(const TMatrix& A, const CMatrix& C, VectorX& x,
-                       const VectorB& b, const VectorF& f, IterationController &iter)
-{
-  using std::sqrt;
-  typedef typename TMatrix::Scalar Scalar;
-  typedef typename TMatrix::Index Index;
-  typedef Matrix<Scalar,Dynamic,1>  TmpVec;
-
-  Scalar rho = 1.0, rho_1, lambda, gamma;
-  Index xSize = x.size();
-  TmpVec  p(xSize), q(xSize), q2(xSize),
-          r(xSize), old_z(xSize), z(xSize),
-          memox(xSize);
-  std::vector<bool> satured(C.rows());
-  p.setZero();
-  iter.setRhsNorm(sqrt(b.dot(b))); // gael vect_sp(PS, b, b)
-  if (iter.rhsNorm() == 0.0) iter.setRhsNorm(1.0);
-
-  SparseMatrix<Scalar,RowMajor> CINV(C.rows(), C.cols());
-  pseudo_inverse(C, CINV);
-
-  while(true)
-  {
-    // computation of residual
-    old_z = z;
-    memox = x;
-    r = b;
-    r += A * -x;
-    z = r;
-    bool transition = false;
-    for (Index i = 0; i < C.rows(); ++i)
-    {
-      Scalar al = C.row(i).dot(x) - f.coeff(i);
-      if (al >= -1.0E-15)
-      {
-        if (!satured[i])
-        {
-          satured[i] = true;
-          transition = true;
-        }
-        Scalar bb = CINV.row(i).dot(z);
-        if (bb > 0.0)
-          // FIXME: we should allow that: z += -bb * C.row(i);
-          for (typename CMatrix::InnerIterator it(C,i); it; ++it)
-            z.coeffRef(it.index()) -= bb*it.value();
-      }
-      else
-        satured[i] = false;
-    }
-
-    // descent direction
-    rho_1 = rho;
-    rho = r.dot(z);
-
-    if (iter.finished(rho)) break;
-
-    if (iter.noiseLevel() > 0 && transition) std::cerr << "CCG: transition\n";
-    if (transition || iter.first()) gamma = 0.0;
-    else gamma = (std::max)(0.0, (rho - old_z.dot(z)) / rho_1);
-    p = z + gamma*p;
-
-    ++iter;
-    // one dimensionnal optimization
-    q = A * p;
-    lambda = rho / q.dot(p);
-    for (Index i = 0; i < C.rows(); ++i)
-    {
-      if (!satured[i])
-      {
-        Scalar bb = C.row(i).dot(p) - f[i];
-        if (bb > 0.0)
-          lambda = (std::min)(lambda, (f.coeff(i)-C.row(i).dot(x)) / bb);
-      }
-    }
-    x += lambda * p;
-    memox -= x;
-  }
-}
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_CONSTRAINEDCG_H
diff --git a/inst/include/unsupported/Eigen/src/IterativeSolvers/DGMRES.h b/inst/include/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
index 68fc997f..6f6df3ed 100644
--- a/inst/include/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
+++ b/inst/include/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
@@ -10,72 +10,67 @@
 #ifndef EIGEN_DGMRES_H
 #define EIGEN_DGMRES_H
 
-#include <Eigen/Eigenvalues>
+#include "../../../../Eigen/Eigenvalues"
 
-namespace Eigen { 
-  
-template< typename _MatrixType,
-          typename _Preconditioner = DiagonalPreconditioner<typename _MatrixType::Scalar> >
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+template <typename MatrixType_, typename Preconditioner_ = DiagonalPreconditioner<typename MatrixType_::Scalar> >
 class DGMRES;
 
 namespace internal {
 
-template< typename _MatrixType, typename _Preconditioner>
-struct traits<DGMRES<_MatrixType,_Preconditioner> >
-{
-  typedef _MatrixType MatrixType;
-  typedef _Preconditioner Preconditioner;
+template <typename MatrixType_, typename Preconditioner_>
+struct traits<DGMRES<MatrixType_, Preconditioner_> > {
+  typedef MatrixType_ MatrixType;
+  typedef Preconditioner_ Preconditioner;
 };
 
 /** \brief Computes a permutation vector to have a sorted sequence
-  * \param vec The vector to reorder.
-  * \param perm gives the sorted sequence on output. Must be initialized with 0..n-1
-  * \param ncut Put  the ncut smallest elements at the end of the vector
-  * WARNING This is an expensive sort, so should be used only 
-  * for small size vectors
-  * TODO Use modified QuickSplit or std::nth_element to get the smallest values 
-  */
+ * \param vec The vector to reorder.
+ * \param perm gives the sorted sequence on output. Must be initialized with 0..n-1
+ * \param ncut Put  the ncut smallest elements at the end of the vector
+ * WARNING This is an expensive sort, so should be used only
+ * for small size vectors
+ * TODO Use modified QuickSplit or std::nth_element to get the smallest values
+ */
 template <typename VectorType, typename IndexType>
-void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType::Scalar& ncut)
-{
+void sortWithPermutation(VectorType& vec, IndexType& perm, typename IndexType::Scalar& ncut) {
   eigen_assert(vec.size() == perm.size());
-  typedef typename IndexType::Scalar Index; 
-  typedef typename VectorType::Scalar Scalar; 
-  bool flag; 
-  for (Index k  = 0; k < ncut; k++)
-  {
+  bool flag;
+  for (Index k = 0; k < ncut; k++) {
     flag = false;
-    for (Index j = 0; j < vec.size()-1; j++)
-    {
-      if ( vec(perm(j)) < vec(perm(j+1)) )
-      {
-        std::swap(perm(j),perm(j+1)); 
+    for (Index j = 0; j < vec.size() - 1; j++) {
+      if (vec(perm(j)) < vec(perm(j + 1))) {
+        std::swap(perm(j), perm(j + 1));
         flag = true;
       }
-      if (!flag) break; // The vector is in sorted order
+      if (!flag) break;  // The vector is in sorted order
     }
   }
 }
 
-}
+}  // namespace internal
 /**
- * \ingroup IterativeLInearSolvers_Module
+ * \ingroup IterativeLinearSolvers_Module
  * \brief A Restarted GMRES with deflation.
  * This class implements a modification of the GMRES solver for
- * sparse linear systems. The basis is built with modified 
+ * sparse linear systems. The basis is built with modified
  * Gram-Schmidt. At each restart, a few approximated eigenvectors
  * corresponding to the smallest eigenvalues are used to build a
- * preconditioner for the next cycle. This preconditioner 
- * for deflation can be combined with any other preconditioner, 
- * the IncompleteLUT for instance. The preconditioner is applied 
+ * preconditioner for the next cycle. This preconditioner
+ * for deflation can be combined with any other preconditioner,
+ * the IncompleteLUT for instance. The preconditioner is applied
  * at right of the matrix and the combination is multiplicative.
- * 
- * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
- * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, can be a dense or a sparse matrix.
+ * \tparam Preconditioner_ the type of the preconditioner. Default is DiagonalPreconditioner
  * Typical usage :
  * \code
  * SparseMatrix<double> A;
- * VectorXd x, b; 
+ * VectorXd x, b;
  * //Fill A and b ...
  * DGMRES<SparseMatrix<double> > solver;
  * solver.set_restart(30); // Set restarting value
@@ -83,207 +78,199 @@ void sortWithPermutation (VectorType& vec, IndexType& perm, typename IndexType::
  * solver.compute(A);
  * x = solver.solve(b);
  * \endcode
- * 
+ *
+ * DGMRES can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+ *
  * References :
  * [1] D. NUENTSA WAKAM and F. PACULL, Memory Efficient Hybrid
  *  Algebraic Solvers for Linear Systems Arising from Compressible
  *  Flows, Computers and Fluids, In Press,
- *  http://dx.doi.org/10.1016/j.compfluid.2012.03.023   
- * [2] K. Burrage and J. Erhel, On the performance of various 
+ *  https://doi.org/10.1016/j.compfluid.2012.03.023
+ * [2] K. Burrage and J. Erhel, On the performance of various
  * adaptive preconditioned GMRES strategies, 5(1998), 101-121.
- * [3] J. Erhel, K. Burrage and B. Pohl, Restarted GMRES 
+ * [3] J. Erhel, K. Burrage and B. Pohl, Restarted GMRES
  *  preconditioned by deflation,J. Computational and Applied
- *  Mathematics, 69(1996), 303-318. 
+ *  Mathematics, 69(1996), 303-318.
 
- * 
+ *
  */
-template< typename _MatrixType, typename _Preconditioner>
-class DGMRES : public IterativeSolverBase<DGMRES<_MatrixType,_Preconditioner> >
-{
-    typedef IterativeSolverBase<DGMRES> Base;
-    using Base::mp_matrix;
-    using Base::m_error;
-    using Base::m_iterations;
-    using Base::m_info;
-    using Base::m_isInitialized;
-    using Base::m_tolerance; 
-  public:
-    typedef _MatrixType MatrixType;
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef _Preconditioner Preconditioner;
-    typedef Matrix<Scalar,Dynamic,Dynamic> DenseMatrix; 
-    typedef Matrix<RealScalar,Dynamic,Dynamic> DenseRealMatrix; 
-    typedef Matrix<Scalar,Dynamic,1> DenseVector;
-    typedef Matrix<RealScalar,Dynamic,1> DenseRealVector; 
-    typedef Matrix<std::complex<RealScalar>, Dynamic, 1> ComplexVector;
- 
-    
+template <typename MatrixType_, typename Preconditioner_>
+class DGMRES : public IterativeSolverBase<DGMRES<MatrixType_, Preconditioner_> > {
+  typedef IterativeSolverBase<DGMRES> Base;
+  using Base::m_error;
+  using Base::m_info;
+  using Base::m_isInitialized;
+  using Base::m_iterations;
+  using Base::m_tolerance;
+  using Base::matrix;
+
+ public:
+  using Base::_solve_impl;
+  using Base::_solve_with_guess_impl;
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::StorageIndex StorageIndex;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef internal::make_complex_t<Scalar> ComplexScalar;
+  typedef Preconditioner_ Preconditioner;
+  typedef Matrix<Scalar, Dynamic, Dynamic> DenseMatrix;
+  typedef Matrix<RealScalar, Dynamic, Dynamic> DenseRealMatrix;
+  typedef Matrix<Scalar, Dynamic, 1> DenseVector;
+  typedef Matrix<RealScalar, Dynamic, 1> DenseRealVector;
+  typedef Matrix<ComplexScalar, Dynamic, 1> ComplexVector;
+
   /** Default constructor. */
-  DGMRES() : Base(),m_restart(30),m_neig(0),m_r(0),m_maxNeig(5),m_isDeflAllocated(false),m_isDeflInitialized(false) {}
+  DGMRES()
+      : Base(), m_restart(30), m_neig(0), m_r(0), m_maxNeig(5), m_isDeflAllocated(false), m_isDeflInitialized(false) {}
 
   /** Initialize the solver with matrix \a A for further \c Ax=b solving.
-    * 
-    * This constructor is a shortcut for the default constructor followed
-    * by a call to compute().
-    * 
-    * \warning this class stores a reference to the matrix A as well as some
-    * precomputed values that depend on it. Therefore, if \a A is changed
-    * this class becomes invalid. Call compute() to update it with the new
-    * matrix A, or modify a copy of A.
-    */
-  template<typename MatrixDerived>
-  explicit DGMRES(const EigenBase<MatrixDerived>& A) : Base(A.derived()), m_restart(30),m_neig(0),m_r(0),m_maxNeig(5),m_isDeflAllocated(false),m_isDeflInitialized(false) {}
+   *
+   * This constructor is a shortcut for the default constructor followed
+   * by a call to compute().
+   *
+   * \warning this class stores a reference to the matrix A as well as some
+   * precomputed values that depend on it. Therefore, if \a A is changed
+   * this class becomes invalid. Call compute() to update it with the new
+   * matrix A, or modify a copy of A.
+   */
+  template <typename MatrixDerived>
+  explicit DGMRES(const EigenBase<MatrixDerived>& A)
+      : Base(A.derived()),
+        m_restart(30),
+        m_neig(0),
+        m_r(0),
+        m_maxNeig(5),
+        m_isDeflAllocated(false),
+        m_isDeflInitialized(false) {}
 
   ~DGMRES() {}
-  
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
-    * \a x0 as an initial solution.
-    *
-    * \sa compute()
-    */
-  template<typename Rhs,typename Guess>
-  inline const internal::solve_retval_with_guess<DGMRES, Rhs, Guess>
-  solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
-  {
-    eigen_assert(m_isInitialized && "DGMRES is not initialized.");
-    eigen_assert(Base::rows()==b.rows()
-              && "DGMRES::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::solve_retval_with_guess
-            <DGMRES, Rhs, Guess>(*this, b.derived(), x0);
-  }
-  
-  /** \internal */
-  template<typename Rhs,typename Dest>
-  void _solveWithGuess(const Rhs& b, Dest& x) const
-  {    
-    bool failed = false;
-    for(int j=0; j<b.cols(); ++j)
-    {
-      m_iterations = Base::maxIterations();
-      m_error = Base::m_tolerance;
-      
-      typename Dest::ColXpr xj(x,j);
-      dgmres(*mp_matrix, b.col(j), xj, Base::m_preconditioner);
-    }
-    m_info = failed ? NumericalIssue
-           : m_error <= Base::m_tolerance ? Success
-           : NoConvergence;
-    m_isInitialized = true;
-  }
 
   /** \internal */
-  template<typename Rhs,typename Dest>
-  void _solve(const Rhs& b, Dest& x) const
-  {
-    x = b;
-    _solveWithGuess(b,x);
+  template <typename Rhs, typename Dest>
+  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const {
+    EIGEN_STATIC_ASSERT(Rhs::ColsAtCompileTime == 1 || Dest::ColsAtCompileTime == 1,
+                        YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX);
+
+    m_iterations = Base::maxIterations();
+    m_error = Base::m_tolerance;
+
+    dgmres(matrix(), b, x, Base::m_preconditioner);
   }
-  /** 
+
+  /**
    * Get the restart value
-    */
-  int restart() { return m_restart; }
-  
-  /** 
-   * Set the restart value (default is 30)  
    */
-  void set_restart(const int restart) { m_restart=restart; }
-  
-  /** 
-   * Set the number of eigenvalues to deflate at each restart 
+  Index restart() { return m_restart; }
+
+  /**
+   * Set the restart value (default is 30)
    */
-  void setEigenv(const int neig) 
-  {
+  void set_restart(const Index restart) { m_restart = restart; }
+
+  /**
+   * Set the number of eigenvalues to deflate at each restart
+   */
+  void setEigenv(const Index neig) {
     m_neig = neig;
-    if (neig+1 > m_maxNeig) m_maxNeig = neig+1; // To allow for complex conjugates
+    if (neig + 1 > m_maxNeig) m_maxNeig = neig + 1;  // To allow for complex conjugates
   }
-  
-  /** 
+
+  /**
    * Get the size of the deflation subspace size
-   */ 
-  int deflSize() {return m_r; }
-  
+   */
+  Index deflSize() { return m_r; }
+
   /**
    * Set the maximum size of the deflation subspace
    */
-  void setMaxEigenv(const int maxNeig) { m_maxNeig = maxNeig; }
-  
-  protected:
-    // DGMRES algorithm 
-    template<typename Rhs, typename Dest>
-    void dgmres(const MatrixType& mat,const Rhs& rhs, Dest& x, const Preconditioner& precond) const;
-    // Perform one cycle of GMRES
-    template<typename Dest>
-    int dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, int& nbIts) const; 
-    // Compute data to use for deflation 
-    int dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, Index& neig) const;
-    // Apply deflation to a vector
-    template<typename RhsType, typename DestType>
-    int dgmresApplyDeflation(const RhsType& In, DestType& Out) const; 
-    ComplexVector schurValues(const ComplexSchur<DenseMatrix>& schurofH) const;
-    ComplexVector schurValues(const RealSchur<DenseMatrix>& schurofH) const;
-    // Init data for deflation
-    void dgmresInitDeflation(Index& rows) const; 
-    mutable DenseMatrix m_V; // Krylov basis vectors
-    mutable DenseMatrix m_H; // Hessenberg matrix 
-    mutable DenseMatrix m_Hes; // Initial hessenberg matrix wihout Givens rotations applied
-    mutable Index m_restart; // Maximum size of the Krylov subspace
-    mutable DenseMatrix m_U; // Vectors that form the basis of the invariant subspace 
-    mutable DenseMatrix m_MU; // matrix operator applied to m_U (for next cycles)
-    mutable DenseMatrix m_T; /* T=U^T*M^{-1}*A*U */
-    mutable PartialPivLU<DenseMatrix> m_luT; // LU factorization of m_T
-    mutable int m_neig; //Number of eigenvalues to extract at each restart
-    mutable int m_r; // Current number of deflated eigenvalues, size of m_U
-    mutable int m_maxNeig; // Maximum number of eigenvalues to deflate
-    mutable RealScalar m_lambdaN; //Modulus of the largest eigenvalue of A
-    mutable bool m_isDeflAllocated;
-    mutable bool m_isDeflInitialized;
-    
-    //Adaptive strategy 
-    mutable RealScalar m_smv; // Smaller multiple of the remaining number of steps allowed
-    mutable bool m_force; // Force the use of deflation at each restart
-    
-}; 
-/** 
- * \brief Perform several cycles of restarted GMRES with modified Gram Schmidt, 
- * 
+  void setMaxEigenv(const Index maxNeig) { m_maxNeig = maxNeig; }
+
+ protected:
+  // DGMRES algorithm
+  template <typename Rhs, typename Dest>
+  void dgmres(const MatrixType& mat, const Rhs& rhs, Dest& x, const Preconditioner& precond) const;
+  // Perform one cycle of GMRES
+  template <typename Dest>
+  Index dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta,
+                    const RealScalar& normRhs, Index& nbIts) const;
+  // Compute data to use for deflation
+  Index dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it,
+                                   StorageIndex& neig) const;
+  // Apply deflation to a vector
+  template <typename RhsType, typename DestType>
+  Index dgmresApplyDeflation(const RhsType& In, DestType& Out) const;
+  ComplexVector schurValues(const ComplexSchur<DenseMatrix>& schurofH) const;
+  ComplexVector schurValues(const RealSchur<DenseMatrix>& schurofH) const;
+  // Init data for deflation
+  void dgmresInitDeflation(Index& rows) const;
+  mutable DenseMatrix m_V;                  // Krylov basis vectors
+  mutable DenseMatrix m_H;                  // Hessenberg matrix
+  mutable DenseMatrix m_Hes;                // Initial hessenberg matrix without Givens rotations applied
+  mutable Index m_restart;                  // Maximum size of the Krylov subspace
+  mutable DenseMatrix m_U;                  // Vectors that form the basis of the invariant subspace
+  mutable DenseMatrix m_MU;                 // matrix operator applied to m_U (for next cycles)
+  mutable DenseMatrix m_T;                  /* T=U^T*M^{-1}*A*U */
+  mutable PartialPivLU<DenseMatrix> m_luT;  // LU factorization of m_T
+  mutable StorageIndex m_neig;              // Number of eigenvalues to extract at each restart
+  mutable Index m_r;                        // Current number of deflated eigenvalues, size of m_U
+  mutable Index m_maxNeig;                  // Maximum number of eigenvalues to deflate
+  mutable RealScalar m_lambdaN;             // Modulus of the largest eigenvalue of A
+  mutable bool m_isDeflAllocated;
+  mutable bool m_isDeflInitialized;
+
+  // Adaptive strategy
+  mutable RealScalar m_smv;  // Smaller multiple of the remaining number of steps allowed
+  mutable bool m_force;      // Force the use of deflation at each restart
+};
+/**
+ * \brief Perform several cycles of restarted GMRES with modified Gram Schmidt,
+ *
  * A right preconditioner is used combined with deflation.
- * 
+ *
  */
-template< typename _MatrixType, typename _Preconditioner>
-template<typename Rhs, typename Dest>
-void DGMRES<_MatrixType, _Preconditioner>::dgmres(const MatrixType& mat,const Rhs& rhs, Dest& x,
-              const Preconditioner& precond) const
-{
-  //Initialization
-  int n = mat.rows(); 
-  DenseVector r0(n); 
-  int nbIts = 0; 
-  m_H.resize(m_restart+1, m_restart);
-  m_Hes.resize(m_restart, m_restart);
-  m_V.resize(n,m_restart+1);
-  //Initial residual vector and intial norm
-  x = precond.solve(x);
-  r0 = rhs - mat * x; 
-  RealScalar beta = r0.norm(); 
+template <typename MatrixType_, typename Preconditioner_>
+template <typename Rhs, typename Dest>
+void DGMRES<MatrixType_, Preconditioner_>::dgmres(const MatrixType& mat, const Rhs& rhs, Dest& x,
+                                                  const Preconditioner& precond) const {
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+
   RealScalar normRhs = rhs.norm();
-  m_error = beta/normRhs; 
-  if(m_error < m_tolerance)
-    m_info = Success; 
+  if (normRhs <= considerAsZero) {
+    x.setZero();
+    m_error = 0;
+    return;
+  }
+
+  // Initialization
+  m_isDeflInitialized = false;
+  Index n = mat.rows();
+  DenseVector r0(n);
+  Index nbIts = 0;
+  m_H.resize(m_restart + 1, m_restart);
+  m_Hes.resize(m_restart, m_restart);
+  m_V.resize(n, m_restart + 1);
+  // Initial residual vector and initial norm
+  if (x.squaredNorm() == 0) x = precond.solve(rhs);
+  r0 = rhs - mat * x;
+  RealScalar beta = r0.norm();
+
+  m_error = beta / normRhs;
+  if (m_error < m_tolerance)
+    m_info = Success;
   else
     m_info = NoConvergence;
-  
+
   // Iterative process
-  while (nbIts < m_iterations && m_info == NoConvergence)
-  {
-    dgmresCycle(mat, precond, x, r0, beta, normRhs, nbIts); 
-    
-    // Compute the new residual vector for the restart 
-    if (nbIts < m_iterations && m_info == NoConvergence)
-      r0 = rhs - mat * x; 
+  while (nbIts < m_iterations && m_info == NoConvergence) {
+    dgmresCycle(mat, precond, x, r0, beta, normRhs, nbIts);
+
+    // Compute the new residual vector for the restart
+    if (nbIts < m_iterations && m_info == NoConvergence) {
+      r0 = rhs - mat * x;
+      beta = r0.norm();
+    }
   }
-} 
+}
 
 /**
  * \brief Perform one restart cycle of DGMRES
@@ -295,248 +282,211 @@ void DGMRES<_MatrixType, _Preconditioner>::dgmres(const MatrixType& mat,const Rh
  * \param normRhs The norm of the right hand side vector
  * \param nbIts The number of iterations
  */
-template< typename _MatrixType, typename _Preconditioner>
-template<typename Dest>
-int DGMRES<_MatrixType, _Preconditioner>::dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x, DenseVector& r0, RealScalar& beta, const RealScalar& normRhs, int& nbIts) const
-{
-  //Initialization 
-  DenseVector g(m_restart+1); // Right hand side of the least square problem
-  g.setZero();  
-  g(0) = Scalar(beta); 
-  m_V.col(0) = r0/beta; 
-  m_info = NoConvergence; 
-  std::vector<JacobiRotation<Scalar> >gr(m_restart); // Givens rotations
-  int it = 0; // Number of inner iterations 
-  int n = mat.rows();
-  DenseVector tv1(n), tv2(n);  //Temporary vectors
-  while (m_info == NoConvergence && it < m_restart && nbIts < m_iterations)
-  {    
+template <typename MatrixType_, typename Preconditioner_>
+template <typename Dest>
+Index DGMRES<MatrixType_, Preconditioner_>::dgmresCycle(const MatrixType& mat, const Preconditioner& precond, Dest& x,
+                                                        DenseVector& r0, RealScalar& beta, const RealScalar& normRhs,
+                                                        Index& nbIts) const {
+  // Initialization
+  DenseVector g(m_restart + 1);  // Right hand side of the least square problem
+  g.setZero();
+  g(0) = Scalar(beta);
+  m_V.col(0) = r0 / beta;
+  m_info = NoConvergence;
+  std::vector<JacobiRotation<Scalar> > gr(m_restart);  // Givens rotations
+  Index it = 0;                                        // Number of inner iterations
+  Index n = mat.rows();
+  DenseVector tv1(n), tv2(n);  // Temporary vectors
+  while (m_info == NoConvergence && it < m_restart && nbIts < m_iterations) {
     // Apply preconditioner(s) at right
-    if (m_isDeflInitialized )
-    {
-      dgmresApplyDeflation(m_V.col(it), tv1); // Deflation
-      tv2 = precond.solve(tv1); 
-    }
-    else
-    {
-      tv2 = precond.solve(m_V.col(it)); // User's selected preconditioner
+    if (m_isDeflInitialized) {
+      dgmresApplyDeflation(m_V.col(it), tv1);  // Deflation
+      tv2 = precond.solve(tv1);
+    } else {
+      tv2 = precond.solve(m_V.col(it));  // User's selected preconditioner
     }
-    tv1 = mat * tv2; 
-   
+    tv1 = mat * tv2;
+
     // Orthogonalize it with the previous basis in the basis using modified Gram-Schmidt
-    Scalar coef; 
-    for (int i = 0; i <= it; ++i)
-    { 
+    Scalar coef;
+    for (Index i = 0; i <= it; ++i) {
       coef = tv1.dot(m_V.col(i));
-      tv1 = tv1 - coef * m_V.col(i); 
-      m_H(i,it) = coef; 
-      m_Hes(i,it) = coef; 
+      tv1 = tv1 - coef * m_V.col(i);
+      m_H(i, it) = coef;
+      m_Hes(i, it) = coef;
     }
-    // Normalize the vector 
-    coef = tv1.norm(); 
-    m_V.col(it+1) = tv1/coef;
-    m_H(it+1, it) = coef;
-//     m_Hes(it+1,it) = coef; 
-    
-    // FIXME Check for happy breakdown 
-    
+    // Normalize the vector
+    coef = tv1.norm();
+    m_V.col(it + 1) = tv1 / coef;
+    m_H(it + 1, it) = coef;
+    //     m_Hes(it+1,it) = coef;
+
+    // FIXME Check for happy breakdown
+
     // Update Hessenberg matrix with Givens rotations
-    for (int i = 1; i <= it; ++i) 
-    {
-      m_H.col(it).applyOnTheLeft(i-1,i,gr[i-1].adjoint());
+    for (Index i = 1; i <= it; ++i) {
+      m_H.col(it).applyOnTheLeft(i - 1, i, gr[i - 1].adjoint());
     }
-    // Compute the new plane rotation 
-    gr[it].makeGivens(m_H(it, it), m_H(it+1,it)); 
+    // Compute the new plane rotation
+    gr[it].makeGivens(m_H(it, it), m_H(it + 1, it));
     // Apply the new rotation
-    m_H.col(it).applyOnTheLeft(it,it+1,gr[it].adjoint());
-    g.applyOnTheLeft(it,it+1, gr[it].adjoint()); 
-    
-    beta = std::abs(g(it+1));
-    m_error = beta/normRhs; 
-    std::cerr << nbIts << " Relative Residual Norm " << m_error << std::endl;
-    it++; nbIts++; 
-    
-    if (m_error < m_tolerance)
-    {
+    m_H.col(it).applyOnTheLeft(it, it + 1, gr[it].adjoint());
+    g.applyOnTheLeft(it, it + 1, gr[it].adjoint());
+
+    beta = std::abs(g(it + 1));
+    m_error = beta / normRhs;
+    // std::cerr << nbIts << " Relative Residual Norm " << m_error << std::endl;
+    it++;
+    nbIts++;
+
+    if (m_error < m_tolerance) {
       // The method has converged
       m_info = Success;
       break;
     }
   }
-  
+
   // Compute the new coefficients by solving the least square problem
-//   it++;
-  //FIXME  Check first if the matrix is singular ... zero diagonal
-  DenseVector nrs(m_restart); 
-  nrs = m_H.topLeftCorner(it,it).template triangularView<Upper>().solve(g.head(it)); 
-  
+  //   it++;
+  // FIXME  Check first if the matrix is singular ... zero diagonal
+  DenseVector nrs(m_restart);
+  nrs = m_H.topLeftCorner(it, it).template triangularView<Upper>().solve(g.head(it));
+
   // Form the new solution
-  if (m_isDeflInitialized)
-  {
-    tv1 = m_V.leftCols(it) * nrs; 
-    dgmresApplyDeflation(tv1, tv2); 
+  if (m_isDeflInitialized) {
+    tv1 = m_V.leftCols(it) * nrs;
+    dgmresApplyDeflation(tv1, tv2);
     x = x + precond.solve(tv2);
-  }
-  else
-    x = x + precond.solve(m_V.leftCols(it) * nrs); 
-  
+  } else
+    x = x + precond.solve(m_V.leftCols(it) * nrs);
+
   // Go for a new cycle and compute data for deflation
-  if(nbIts < m_iterations && m_info == NoConvergence && m_neig > 0 && (m_r+m_neig) < m_maxNeig)
-    dgmresComputeDeflationData(mat, precond, it, m_neig); 
-  return 0; 
-  
+  if (nbIts < m_iterations && m_info == NoConvergence && m_neig > 0 && (m_r + m_neig) < m_maxNeig)
+    dgmresComputeDeflationData(mat, precond, it, m_neig);
+  return 0;
 }
 
-
-template< typename _MatrixType, typename _Preconditioner>
-void DGMRES<_MatrixType, _Preconditioner>::dgmresInitDeflation(Index& rows) const
-{
+template <typename MatrixType_, typename Preconditioner_>
+void DGMRES<MatrixType_, Preconditioner_>::dgmresInitDeflation(Index& rows) const {
   m_U.resize(rows, m_maxNeig);
-  m_MU.resize(rows, m_maxNeig); 
+  m_MU.resize(rows, m_maxNeig);
   m_T.resize(m_maxNeig, m_maxNeig);
-  m_lambdaN = 0.0; 
-  m_isDeflAllocated = true; 
+  m_lambdaN = 0.0;
+  m_isDeflAllocated = true;
 }
 
-template< typename _MatrixType, typename _Preconditioner>
-inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_MatrixType, _Preconditioner>::schurValues(const ComplexSchur<DenseMatrix>& schurofH) const
-{
+template <typename MatrixType_, typename Preconditioner_>
+inline typename DGMRES<MatrixType_, Preconditioner_>::ComplexVector DGMRES<MatrixType_, Preconditioner_>::schurValues(
+    const ComplexSchur<DenseMatrix>& schurofH) const {
   return schurofH.matrixT().diagonal();
 }
 
-template< typename _MatrixType, typename _Preconditioner>
-inline typename DGMRES<_MatrixType, _Preconditioner>::ComplexVector DGMRES<_MatrixType, _Preconditioner>::schurValues(const RealSchur<DenseMatrix>& schurofH) const
-{
-  typedef typename MatrixType::Index Index;
+template <typename MatrixType_, typename Preconditioner_>
+inline typename DGMRES<MatrixType_, Preconditioner_>::ComplexVector DGMRES<MatrixType_, Preconditioner_>::schurValues(
+    const RealSchur<DenseMatrix>& schurofH) const {
   const DenseMatrix& T = schurofH.matrixT();
   Index it = T.rows();
   ComplexVector eig(it);
   Index j = 0;
-  while (j < it-1)
-  {
-    if (T(j+1,j) ==Scalar(0))
-    {
-      eig(j) = std::complex<RealScalar>(T(j,j),RealScalar(0)); 
-      j++; 
-    }
-    else
-    {
-      eig(j) = std::complex<RealScalar>(T(j,j),T(j+1,j)); 
-      eig(j+1) = std::complex<RealScalar>(T(j,j+1),T(j+1,j+1));
+  while (j < it - 1) {
+    if (T(j + 1, j) == Scalar(0)) {
+      eig(j) = ComplexScalar(T(j, j), RealScalar(0));
+      j++;
+    } else {
+      eig(j) = ComplexScalar(T(j, j), T(j + 1, j));
+      eig(j + 1) = ComplexScalar(T(j, j + 1), T(j + 1, j + 1));
       j++;
     }
   }
-  if (j < it-1) eig(j) = std::complex<RealScalar>(T(j,j),RealScalar(0));
+  if (j < it - 1) eig(j) = ComplexScalar(T(j, j), RealScalar(0));
   return eig;
 }
 
-template< typename _MatrixType, typename _Preconditioner>
-int DGMRES<_MatrixType, _Preconditioner>::dgmresComputeDeflationData(const MatrixType& mat, const Preconditioner& precond, const Index& it, Index& neig) const
-{
+template <typename MatrixType_, typename Preconditioner_>
+Index DGMRES<MatrixType_, Preconditioner_>::dgmresComputeDeflationData(const MatrixType& mat,
+                                                                       const Preconditioner& precond, const Index& it,
+                                                                       StorageIndex& neig) const {
   // First, find the Schur form of the Hessenberg matrix H
-  typename internal::conditional<NumTraits<Scalar>::IsComplex, ComplexSchur<DenseMatrix>, RealSchur<DenseMatrix> >::type schurofH; 
+  std::conditional_t<NumTraits<Scalar>::IsComplex, ComplexSchur<DenseMatrix>, RealSchur<DenseMatrix> > schurofH;
   bool computeU = true;
-  DenseMatrix matrixQ(it,it); 
+  DenseMatrix matrixQ(it, it);
   matrixQ.setIdentity();
-  schurofH.computeFromHessenberg(m_Hes.topLeftCorner(it,it), matrixQ, computeU); 
-  
+  schurofH.computeFromHessenberg(m_Hes.topLeftCorner(it, it), matrixQ, computeU);
+
   ComplexVector eig(it);
-  Matrix<Index,Dynamic,1>perm(it); 
+  Matrix<StorageIndex, Dynamic, 1> perm(it);
   eig = this->schurValues(schurofH);
-  
+
   // Reorder the absolute values of Schur values
-  DenseRealVector modulEig(it); 
-  for (int j=0; j<it; ++j) modulEig(j) = std::abs(eig(j)); 
-  perm.setLinSpaced(it,0,it-1);
+  DenseRealVector modulEig(it);
+  for (Index j = 0; j < it; ++j) modulEig(j) = std::abs(eig(j));
+  perm.setLinSpaced(it, 0, internal::convert_index<StorageIndex>(it - 1));
   internal::sortWithPermutation(modulEig, perm, neig);
-  
-  if (!m_lambdaN)
-  {
+
+  if (!m_lambdaN) {
     m_lambdaN = (std::max)(modulEig.maxCoeff(), m_lambdaN);
   }
-  //Count the real number of extracted eigenvalues (with complex conjugates)
-  int nbrEig = 0; 
-  while (nbrEig < neig)
-  {
-    if(eig(perm(it-nbrEig-1)).imag() == RealScalar(0)) nbrEig++; 
-    else nbrEig += 2; 
+  // Count the real number of extracted eigenvalues (with complex conjugates)
+  Index nbrEig = 0;
+  while (nbrEig < neig) {
+    if (eig(perm(it - nbrEig - 1)).imag() == RealScalar(0))
+      nbrEig++;
+    else
+      nbrEig += 2;
   }
   // Extract the  Schur vectors corresponding to the smallest Ritz values
-  DenseMatrix Sr(it, nbrEig); 
+  DenseMatrix Sr(it, nbrEig);
   Sr.setZero();
-  for (int j = 0; j < nbrEig; j++)
-  {
-    Sr.col(j) = schurofH.matrixU().col(perm(it-j-1));
+  for (Index j = 0; j < nbrEig; j++) {
+    Sr.col(j) = schurofH.matrixU().col(perm(it - j - 1));
   }
-  
+
   // Form the Schur vectors of the initial matrix using the Krylov basis
-  DenseMatrix X; 
+  DenseMatrix X;
   X = m_V.leftCols(it) * Sr;
-  if (m_r)
-  {
-   // Orthogonalize X against m_U using modified Gram-Schmidt
-   for (int j = 0; j < nbrEig; j++)
-     for (int k =0; k < m_r; k++)
-      X.col(j) = X.col(j) - (m_U.col(k).dot(X.col(j)))*m_U.col(k); 
+  if (m_r) {
+    // Orthogonalize X against m_U using modified Gram-Schmidt
+    for (Index j = 0; j < nbrEig; j++)
+      for (Index k = 0; k < m_r; k++) X.col(j) = X.col(j) - (m_U.col(k).dot(X.col(j))) * m_U.col(k);
   }
-  
+
   // Compute m_MX = A * M^-1 * X
   Index m = m_V.rows();
-  if (!m_isDeflAllocated) 
-    dgmresInitDeflation(m); 
+  if (!m_isDeflAllocated) dgmresInitDeflation(m);
   DenseMatrix MX(m, nbrEig);
   DenseVector tv1(m);
-  for (int j = 0; j < nbrEig; j++)
-  {
+  for (Index j = 0; j < nbrEig; j++) {
     tv1 = mat * X.col(j);
     MX.col(j) = precond.solve(tv1);
   }
-  
-  //Update m_T = [U'MU U'MX; X'MU X'MX]
-  m_T.block(m_r, m_r, nbrEig, nbrEig) = X.transpose() * MX; 
-  if(m_r)
-  {
-    m_T.block(0, m_r, m_r, nbrEig) = m_U.leftCols(m_r).transpose() * MX; 
+
+  // Update m_T = [U'MU U'MX; X'MU X'MX]
+  m_T.block(m_r, m_r, nbrEig, nbrEig) = X.transpose() * MX;
+  if (m_r) {
+    m_T.block(0, m_r, m_r, nbrEig) = m_U.leftCols(m_r).transpose() * MX;
     m_T.block(m_r, 0, nbrEig, m_r) = X.transpose() * m_MU.leftCols(m_r);
   }
-  
+
   // Save X into m_U and m_MX in m_MU
-  for (int j = 0; j < nbrEig; j++) m_U.col(m_r+j) = X.col(j);
-  for (int j = 0; j < nbrEig; j++) m_MU.col(m_r+j) = MX.col(j);
+  for (Index j = 0; j < nbrEig; j++) m_U.col(m_r + j) = X.col(j);
+  for (Index j = 0; j < nbrEig; j++) m_MU.col(m_r + j) = MX.col(j);
   // Increase the size of the invariant subspace
-  m_r += nbrEig; 
-  
+  m_r += nbrEig;
+
   // Factorize m_T into m_luT
   m_luT.compute(m_T.topLeftCorner(m_r, m_r));
-  
-  //FIXME CHeck if the factorization was correctly done (nonsingular matrix)
+
+  // FIXME CHeck if the factorization was correctly done (nonsingular matrix)
   m_isDeflInitialized = true;
-  return 0; 
+  return 0;
 }
-template<typename _MatrixType, typename _Preconditioner>
-template<typename RhsType, typename DestType>
-int DGMRES<_MatrixType, _Preconditioner>::dgmresApplyDeflation(const RhsType &x, DestType &y) const
-{
-  DenseVector x1 = m_U.leftCols(m_r).transpose() * x; 
-  y = x + m_U.leftCols(m_r) * ( m_lambdaN * m_luT.solve(x1) - x1);
-  return 0; 
+template <typename MatrixType_, typename Preconditioner_>
+template <typename RhsType, typename DestType>
+Index DGMRES<MatrixType_, Preconditioner_>::dgmresApplyDeflation(const RhsType& x, DestType& y) const {
+  DenseVector x1 = m_U.leftCols(m_r).transpose() * x;
+  y = x + m_U.leftCols(m_r) * (m_lambdaN * m_luT.solve(x1) - x1);
+  return 0;
 }
 
-namespace internal {
-
-  template<typename _MatrixType, typename _Preconditioner, typename Rhs>
-struct solve_retval<DGMRES<_MatrixType, _Preconditioner>, Rhs>
-  : solve_retval_base<DGMRES<_MatrixType, _Preconditioner>, Rhs>
-{
-  typedef DGMRES<_MatrixType, _Preconditioner> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-} // end namespace internal
-
-} // end namespace Eigen
-#endif 
+}  // end namespace Eigen
+#endif
diff --git a/inst/include/unsupported/Eigen/src/IterativeSolvers/GMRES.h b/inst/include/unsupported/Eigen/src/IterativeSolvers/GMRES.h
index ea5deb26..238d0ec6 100644
--- a/inst/include/unsupported/Eigen/src/IterativeSolvers/GMRES.h
+++ b/inst/include/unsupported/Eigen/src/IterativeSolvers/GMRES.h
@@ -11,7 +11,10 @@
 #ifndef EIGEN_GMRES_H
 #define EIGEN_GMRES_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
@@ -21,17 +24,17 @@ namespace internal {
  *
  * Parameters:
  *  \param mat       matrix of linear system of equations
- *  \param Rhs       right hand side vector of linear system of equations
+ *  \param rhs       right hand side vector of linear system of equations
  *  \param x         on input: initial guess, on output: solution
  *  \param precond   preconditioner used
  *  \param iters     on input: maximum number of iterations to perform
  *                   on output: number of iterations performed
  *  \param restart   number of iterations for a restart
- *  \param tol_error on input: residual tolerance
+ *  \param tol_error on input: relative residual tolerance
  *                   on output: residuum achieved
  *
- * \sa IterativeMethods::bicgstab() 
- *  
+ * \sa IterativeMethods::bicgstab()
+ *
  *
  * For references, please see:
  *
@@ -52,320 +55,261 @@ namespace internal {
  * SIAM J.Sci.Stat.Comp. 9, 1988, pp. 152 - 163.
  *
  */
-template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
-bool gmres(const MatrixType & mat, const Rhs & rhs, Dest & x, const Preconditioner & precond,
-		int &iters, const int &restart, typename Dest::RealScalar & tol_error) {
-
-	using std::sqrt;
-	using std::abs;
-
-	typedef typename Dest::RealScalar RealScalar;
-	typedef typename Dest::Scalar Scalar;
-	typedef Matrix < Scalar, Dynamic, 1 > VectorType;
-	typedef Matrix < Scalar, Dynamic, Dynamic > FMatrixType;
-
-	RealScalar tol = tol_error;
-	const int maxIters = iters;
-	iters = 0;
-
-	const int m = mat.rows();
-
-	VectorType p0 = rhs - mat*x;
-	VectorType r0 = precond.solve(p0);
- 
-	// is initial guess already good enough?
-	if(abs(r0.norm()) < tol) {
-		return true; 
-	}
-
-	VectorType w = VectorType::Zero(restart + 1);
-
-	FMatrixType H = FMatrixType::Zero(m, restart + 1); // Hessenberg matrix
-	VectorType tau = VectorType::Zero(restart + 1);
-	std::vector < JacobiRotation < Scalar > > G(restart);
-
-	// generate first Householder vector
-	VectorType e(m-1);
-	RealScalar beta;
-	r0.makeHouseholder(e, tau.coeffRef(0), beta);
-	w(0)=(Scalar) beta;
-	H.bottomLeftCorner(m - 1, 1) = e;
-
-	for (int k = 1; k <= restart; ++k) {
-
-		++iters;
-
-		VectorType v = VectorType::Unit(m, k - 1), workspace(m);
-
-		// apply Householder reflections H_{1} ... H_{k-1} to v
-		for (int i = k - 1; i >= 0; --i) {
-			v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
-		}
-
-		// apply matrix M to v:  v = mat * v;
-		VectorType t=mat*v;
-		v=precond.solve(t);
-
-		// apply Householder reflections H_{k-1} ... H_{1} to v
-		for (int i = 0; i < k; ++i) {
-			v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
-		}
-
-		if (v.tail(m - k).norm() != 0.0) {
-
-			if (k <= restart) {
-
-				// generate new Householder vector
-                                  VectorType e(m - k - 1);
-				RealScalar beta;
-				v.tail(m - k).makeHouseholder(e, tau.coeffRef(k), beta);
-				H.col(k).tail(m - k - 1) = e;
-
-				// apply Householder reflection H_{k} to v
-				v.tail(m - k).applyHouseholderOnTheLeft(H.col(k).tail(m - k - 1), tau.coeffRef(k), workspace.data());
-
-			}
-                }
+template <typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+bool gmres(const MatrixType& mat, const Rhs& rhs, Dest& x, const Preconditioner& precond, Index& iters,
+           const Index& restart, typename Dest::RealScalar& tol_error) {
+  using std::abs;
+  using std::sqrt;
+
+  typedef typename Dest::RealScalar RealScalar;
+  typedef typename Dest::Scalar Scalar;
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+  typedef Matrix<Scalar, Dynamic, Dynamic, ColMajor> FMatrixType;
+
+  const RealScalar considerAsZero = (std::numeric_limits<RealScalar>::min)();
+
+  if (rhs.norm() <= considerAsZero) {
+    x.setZero();
+    tol_error = 0;
+    return true;
+  }
 
-                if (k > 1) {
-                        for (int i = 0; i < k - 1; ++i) {
-                                // apply old Givens rotations to v
-                                v.applyOnTheLeft(i, i + 1, G[i].adjoint());
-                        }
-                }
+  RealScalar tol = tol_error;
+  const Index maxIters = iters;
+  iters = 0;
 
-                if (k<m && v(k) != (Scalar) 0) {
-                        // determine next Givens rotation
-                        G[k - 1].makeGivens(v(k - 1), v(k));
+  const Index m = mat.rows();
 
-                        // apply Givens rotation to v and w
-                        v.applyOnTheLeft(k - 1, k, G[k - 1].adjoint());
-                        w.applyOnTheLeft(k - 1, k, G[k - 1].adjoint());
+  // residual and preconditioned residual
+  VectorType p0 = rhs - mat * x;
+  VectorType r0 = precond.solve(p0);
 
-                }
+  const RealScalar r0Norm = r0.norm();
 
-                // insert coefficients into upper matrix triangle
-                H.col(k - 1).head(k) = v.head(k);
+  // is initial guess already good enough?
+  if (r0Norm == 0) {
+    tol_error = 0;
+    return true;
+  }
 
-                bool stop=(k==m || abs(w(k)) < tol || iters == maxIters);
+  // storage for Hessenberg matrix and Householder data
+  FMatrixType H = FMatrixType::Zero(m, restart + 1);
+  VectorType w = VectorType::Zero(restart + 1);
+  VectorType tau = VectorType::Zero(restart + 1);
 
-                if (stop || k == restart) {
+  // storage for Jacobi rotations
+  std::vector<JacobiRotation<Scalar> > G(restart);
 
-                        // solve upper triangular system
-                        VectorType y = w.head(k);
-                        H.topLeftCorner(k, k).template triangularView < Eigen::Upper > ().solveInPlace(y);
+  // storage for temporaries
+  VectorType t(m), v(m), workspace(m), x_new(m);
 
-                        // use Horner-like scheme to calculate solution vector
-                        VectorType x_new = y(k - 1) * VectorType::Unit(m, k - 1);
+  // generate first Householder vector
+  Ref<VectorType> H0_tail = H.col(0).tail(m - 1);
+  RealScalar beta;
+  r0.makeHouseholder(H0_tail, tau.coeffRef(0), beta);
+  w(0) = Scalar(beta);
 
-                        // apply Householder reflection H_{k} to x_new
-                        x_new.tail(m - k + 1).applyHouseholderOnTheLeft(H.col(k - 1).tail(m - k), tau.coeffRef(k - 1), workspace.data());
+  for (Index k = 1; k <= restart; ++k) {
+    ++iters;
 
-                        for (int i = k - 2; i >= 0; --i) {
-                                x_new += y(i) * VectorType::Unit(m, i);
-                                // apply Householder reflection H_{i} to x_new
-                                x_new.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
-                        }
+    v = VectorType::Unit(m, k - 1);
 
-                        x += x_new;
+    // apply Householder reflections H_{1} ... H_{k-1} to v
+    // TODO: use a HouseholderSequence
+    for (Index i = k - 1; i >= 0; --i) {
+      v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
+    }
 
-                        if (stop) {
-                                return true;
-                        } else {
-                                k=0;
+    // apply matrix M to v:  v = mat * v;
+    t.noalias() = mat * v;
+    v = precond.solve(t);
 
-                                // reset data for a restart  r0 = rhs - mat * x;
-                                VectorType p0=mat*x;
-                                VectorType p1=precond.solve(p0);
-                                r0 = rhs - p1;
-//                                 r0_sqnorm = r0.squaredNorm();
-                                w = VectorType::Zero(restart + 1);
-                                H = FMatrixType::Zero(m, restart + 1);
-                                tau = VectorType::Zero(restart + 1);
+    // apply Householder reflections H_{k-1} ... H_{1} to v
+    // TODO: use a HouseholderSequence
+    for (Index i = 0; i < k; ++i) {
+      v.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
+    }
 
-                                // generate first Householder vector
-                                RealScalar beta;
-                                r0.makeHouseholder(e, tau.coeffRef(0), beta);
-                                w(0)=(Scalar) beta;
-                                H.bottomLeftCorner(m - 1, 1) = e;
+    if (v.tail(m - k).norm() != 0.0) {
+      if (k <= restart) {
+        // generate new Householder vector
+        Ref<VectorType> Hk_tail = H.col(k).tail(m - k - 1);
+        v.tail(m - k).makeHouseholder(Hk_tail, tau.coeffRef(k), beta);
 
-                        }
+        // apply Householder reflection H_{k} to v
+        v.tail(m - k).applyHouseholderOnTheLeft(Hk_tail, tau.coeffRef(k), workspace.data());
+      }
+    }
 
-                }
+    if (k > 1) {
+      for (Index i = 0; i < k - 1; ++i) {
+        // apply old Givens rotations to v
+        v.applyOnTheLeft(i, i + 1, G[i].adjoint());
+      }
+    }
 
+    if (k < m && v(k) != (Scalar)0) {
+      // determine next Givens rotation
+      G[k - 1].makeGivens(v(k - 1), v(k));
 
+      // apply Givens rotation to v and w
+      v.applyOnTheLeft(k - 1, k, G[k - 1].adjoint());
+      w.applyOnTheLeft(k - 1, k, G[k - 1].adjoint());
+    }
 
-	}
-	
-	return false;
+    // insert coefficients into upper matrix triangle
+    H.col(k - 1).head(k) = v.head(k);
+
+    tol_error = abs(w(k)) / r0Norm;
+    bool stop = (k == m || tol_error < tol || iters == maxIters);
+
+    if (stop || k == restart) {
+      // solve upper triangular system
+      Ref<VectorType> y = w.head(k);
+      H.topLeftCorner(k, k).template triangularView<Upper>().solveInPlace(y);
+
+      // use Horner-like scheme to calculate solution vector
+      x_new.setZero();
+      for (Index i = k - 1; i >= 0; --i) {
+        x_new(i) += y(i);
+        // apply Householder reflection H_{i} to x_new
+        x_new.tail(m - i).applyHouseholderOnTheLeft(H.col(i).tail(m - i - 1), tau.coeffRef(i), workspace.data());
+      }
+
+      x += x_new;
+
+      if (stop) {
+        return true;
+      } else {
+        k = 0;
+
+        // reset data for restart
+        p0.noalias() = rhs - mat * x;
+        r0 = precond.solve(p0);
+
+        // clear Hessenberg matrix and Householder data
+        H.setZero();
+        w.setZero();
+        tau.setZero();
+
+        // generate first Householder vector
+        r0.makeHouseholder(H0_tail, tau.coeffRef(0), beta);
+        w(0) = Scalar(beta);
+      }
+    }
+  }
 
+  return false;
 }
 
-}
+}  // namespace internal
 
-template< typename _MatrixType,
-          typename _Preconditioner = DiagonalPreconditioner<typename _MatrixType::Scalar> >
+template <typename MatrixType_, typename Preconditioner_ = DiagonalPreconditioner<typename MatrixType_::Scalar> >
 class GMRES;
 
 namespace internal {
 
-template< typename _MatrixType, typename _Preconditioner>
-struct traits<GMRES<_MatrixType,_Preconditioner> >
-{
-  typedef _MatrixType MatrixType;
-  typedef _Preconditioner Preconditioner;
+template <typename MatrixType_, typename Preconditioner_>
+struct traits<GMRES<MatrixType_, Preconditioner_> > {
+  typedef MatrixType_ MatrixType;
+  typedef Preconditioner_ Preconditioner;
 };
 
-}
+}  // namespace internal
 
 /** \ingroup IterativeLinearSolvers_Module
-  * \brief A GMRES solver for sparse square problems
-  *
-  * This class allows to solve for A.x = b sparse linear problems using a generalized minimal
-  * residual method. The vectors x and b can be either dense or sparse.
-  *
-  * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
-  * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
-  *
-  * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
-  * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
-  * and NumTraits<Scalar>::epsilon() for the tolerance.
-  * 
-  * This class can be used as the direct solver classes. Here is a typical usage example:
-  * \code
-  * int n = 10000;
-  * VectorXd x(n), b(n);
-  * SparseMatrix<double> A(n,n);
-  * // fill A and b
-  * GMRES<SparseMatrix<double> > solver(A);
-  * x = solver.solve(b);
-  * std::cout << "#iterations:     " << solver.iterations() << std::endl;
-  * std::cout << "estimated error: " << solver.error()      << std::endl;
-  * // update b, and solve again
-  * x = solver.solve(b);
-  * \endcode
-  * 
-  * By default the iterations start with x=0 as an initial guess of the solution.
-  * One can control the start using the solveWithGuess() method.
-  * 
-  * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
-  */
-template< typename _MatrixType, typename _Preconditioner>
-class GMRES : public IterativeSolverBase<GMRES<_MatrixType,_Preconditioner> >
-{
+ * \brief A GMRES solver for sparse square problems
+ *
+ * This class allows to solve for A.x = b sparse linear problems using a generalized minimal
+ * residual method. The vectors x and b can be either dense or sparse.
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, can be a dense or a sparse matrix.
+ * \tparam Preconditioner_ the type of the preconditioner. Default is DiagonalPreconditioner
+ *
+ * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+ * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+ * and NumTraits<Scalar>::epsilon() for the tolerance.
+ *
+ * This class can be used as the direct solver classes. Here is a typical usage example:
+ * \code
+ * int n = 10000;
+ * VectorXd x(n), b(n);
+ * SparseMatrix<double> A(n,n);
+ * // fill A and b
+ * GMRES<SparseMatrix<double> > solver(A);
+ * x = solver.solve(b);
+ * std::cout << "#iterations:     " << solver.iterations() << std::endl;
+ * std::cout << "estimated error: " << solver.error()      << std::endl;
+ * // update b, and solve again
+ * x = solver.solve(b);
+ * \endcode
+ *
+ * By default the iterations start with x=0 as an initial guess of the solution.
+ * One can control the start using the solveWithGuess() method.
+ *
+ * GMRES can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+ *
+ * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+ */
+template <typename MatrixType_, typename Preconditioner_>
+class GMRES : public IterativeSolverBase<GMRES<MatrixType_, Preconditioner_> > {
   typedef IterativeSolverBase<GMRES> Base;
-  using Base::mp_matrix;
   using Base::m_error;
-  using Base::m_iterations;
   using Base::m_info;
   using Base::m_isInitialized;
- 
-private:
-  int m_restart;
-  
-public:
-  typedef _MatrixType MatrixType;
+  using Base::m_iterations;
+  using Base::matrix;
+
+ private:
+  Index m_restart;
+
+ public:
+  using Base::_solve_impl;
+  typedef MatrixType_ MatrixType;
   typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::Index Index;
   typedef typename MatrixType::RealScalar RealScalar;
-  typedef _Preconditioner Preconditioner;
-
-public:
+  typedef Preconditioner_ Preconditioner;
 
+ public:
   /** Default constructor. */
   GMRES() : Base(), m_restart(30) {}
 
   /** Initialize the solver with matrix \a A for further \c Ax=b solving.
-    * 
-    * This constructor is a shortcut for the default constructor followed
-    * by a call to compute().
-    * 
-    * \warning this class stores a reference to the matrix A as well as some
-    * precomputed values that depend on it. Therefore, if \a A is changed
-    * this class becomes invalid. Call compute() to update it with the new
-    * matrix A, or modify a copy of A.
-    */
-  template<typename MatrixDerived>
+   *
+   * This constructor is a shortcut for the default constructor followed
+   * by a call to compute().
+   *
+   * \warning this class stores a reference to the matrix A as well as some
+   * precomputed values that depend on it. Therefore, if \a A is changed
+   * this class becomes invalid. Call compute() to update it with the new
+   * matrix A, or modify a copy of A.
+   */
+  template <typename MatrixDerived>
   explicit GMRES(const EigenBase<MatrixDerived>& A) : Base(A.derived()), m_restart(30) {}
 
   ~GMRES() {}
-  
+
   /** Get the number of iterations after that a restart is performed.
-    */
-  int get_restart() { return m_restart; }
-  
+   */
+  Index get_restart() { return m_restart; }
+
   /** Set the number of iterations after that a restart is performed.
-    *  \param restart   number of iterations for a restarti, default is 30.
-    */
-  void set_restart(const int restart) { m_restart=restart; }
-  
-  /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
-    * \a x0 as an initial solution.
-    *
-    * \sa compute()
-    */
-  template<typename Rhs,typename Guess>
-  inline const internal::solve_retval_with_guess<GMRES, Rhs, Guess>
-  solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
-  {
-    eigen_assert(m_isInitialized && "GMRES is not initialized.");
-    eigen_assert(Base::rows()==b.rows()
-              && "GMRES::solve(): invalid number of rows of the right hand side matrix b");
-    return internal::solve_retval_with_guess
-            <GMRES, Rhs, Guess>(*this, b.derived(), x0);
-  }
-  
-  /** \internal */
-  template<typename Rhs,typename Dest>
-  void _solveWithGuess(const Rhs& b, Dest& x) const
-  {    
-    bool failed = false;
-    for(int j=0; j<b.cols(); ++j)
-    {
-      m_iterations = Base::maxIterations();
-      m_error = Base::m_tolerance;
-      
-      typename Dest::ColXpr xj(x,j);
-      if(!internal::gmres(*mp_matrix, b.col(j), xj, Base::m_preconditioner, m_iterations, m_restart, m_error))
-        failed = true;
-    }
-    m_info = failed ? NumericalIssue
-           : m_error <= Base::m_tolerance ? Success
-           : NoConvergence;
-    m_isInitialized = true;
-  }
+   *  \param restart   number of iterations for a restarti, default is 30.
+   */
+  void set_restart(const Index restart) { m_restart = restart; }
 
   /** \internal */
-  template<typename Rhs,typename Dest>
-  void _solve(const Rhs& b, Dest& x) const
-  {
-    x = b;
-    if(x.squaredNorm() == 0) return; // Check Zero right hand side
-    _solveWithGuess(b,x);
+  template <typename Rhs, typename Dest>
+  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const {
+    m_iterations = Base::maxIterations();
+    m_error = Base::m_tolerance;
+    bool ret = internal::gmres(matrix(), b, x, Base::m_preconditioner, m_iterations, m_restart, m_error);
+    m_info = (!ret) ? NumericalIssue : m_error <= Base::m_tolerance ? Success : NoConvergence;
   }
 
-protected:
-
+ protected:
 };
 
+}  // end namespace Eigen
 
-namespace internal {
-
-  template<typename _MatrixType, typename _Preconditioner, typename Rhs>
-struct solve_retval<GMRES<_MatrixType, _Preconditioner>, Rhs>
-  : solve_retval_base<GMRES<_MatrixType, _Preconditioner>, Rhs>
-{
-  typedef GMRES<_MatrixType, _Preconditioner> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_GMRES_H
+#endif  // EIGEN_GMRES_H
diff --git a/inst/include/unsupported/Eigen/src/IterativeSolvers/IDRS.h b/inst/include/unsupported/Eigen/src/IterativeSolvers/IDRS.h
new file mode 100644
index 00000000..09568e11
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/IterativeSolvers/IDRS.h
@@ -0,0 +1,395 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Chris Schoutrop <c.e.m.schoutrop@tue.nl>
+// Copyright (C) 2020 Jens Wehner <j.wehner@esciencecenter.nl>
+// Copyright (C) 2020 Jan van Dijk <j.v.dijk@tue.nl>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_IDRS_H
+#define EIGEN_IDRS_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+/**     \internal Low-level Induced Dimension Reduction algorithm
+        \param A The matrix A
+        \param b The right hand side vector b
+        \param x On input and initial solution, on output the computed solution.
+        \param precond A preconditioner being able to efficiently solve for an
+                  approximation of Ax=b (regardless of b)
+        \param iter On input the max number of iteration, on output the number of performed iterations.
+        \param relres On input the tolerance error, on output an estimation of the relative error.
+        \param S On input Number of the dimension of the shadow space.
+                \param smoothing switches residual smoothing on.
+                \param angle small omega lead to faster convergence at the expense of numerical stability
+                \param replacement switches on a residual replacement strategy to increase accuracy of residual at the
+   expense of more Mat*vec products \return false in the case of numerical issue, for example a break down of IDRS.
+*/
+template <typename Vector, typename RealScalar>
+typename Vector::Scalar omega(const Vector& t, const Vector& s, RealScalar angle) {
+  using numext::abs;
+  typedef typename Vector::Scalar Scalar;
+  const RealScalar ns = s.stableNorm();
+  const RealScalar nt = t.stableNorm();
+  const Scalar ts = t.dot(s);
+  const RealScalar rho = abs(ts / (nt * ns));
+
+  if (rho < angle) {
+    if (ts == Scalar(0)) {
+      return Scalar(0);
+    }
+    // Original relation for om is given by
+    // om = om * angle / rho;
+    // To alleviate potential (near) division by zero this can be rewritten as
+    // om = angle * (ns / nt) * (ts / abs(ts)) = angle * (ns / nt) * sgn(ts)
+    return angle * (ns / nt) * (ts / abs(ts));
+  }
+  return ts / (nt * nt);
+}
+
+template <typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+bool idrs(const MatrixType& A, const Rhs& b, Dest& x, const Preconditioner& precond, Index& iter,
+          typename Dest::RealScalar& relres, Index S, bool smoothing, typename Dest::RealScalar angle,
+          bool replacement) {
+  typedef typename Dest::RealScalar RealScalar;
+  typedef typename Dest::Scalar Scalar;
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+  typedef Matrix<Scalar, Dynamic, Dynamic, ColMajor> DenseMatrixType;
+  const Index N = b.size();
+  S = S < x.rows() ? S : x.rows();
+  const RealScalar tol = relres;
+  const Index maxit = iter;
+
+  bool trueres = false;
+
+  FullPivLU<DenseMatrixType> lu_solver;
+
+  DenseMatrixType P;
+  {
+    HouseholderQR<DenseMatrixType> qr(DenseMatrixType::Random(N, S));
+    P = (qr.householderQ() * DenseMatrixType::Identity(N, S));
+  }
+
+  const RealScalar normb = b.stableNorm();
+
+  if (internal::isApprox(normb, RealScalar(0))) {
+    // Solution is the zero vector
+    x.setZero();
+    iter = 0;
+    relres = 0;
+    return true;
+  }
+  // from http://homepage.tudelft.nl/1w5b5/IDRS/manual.pdf
+  // A peak in the residual is considered dangerously high if‖ri‖/‖b‖> C(tol/epsilon).
+  // With epsilon the relative machine precision. The factor tol/epsilon corresponds
+  // to the size of a finite precision number that is so large that the absolute
+  // round-off error in this number, when propagated through the process, makes it
+  // impossible to achieve the required accuracy. The factor C accounts for the
+  // accumulation of round-off errors. This parameter has been set to 10^{-3}.
+  // mp is epsilon/C 10^3 * eps is very conservative, so normally no residual
+  // replacements will take place. It only happens if things go very wrong. Too many
+  // restarts may ruin the convergence.
+  const RealScalar mp = RealScalar(1e3) * NumTraits<Scalar>::epsilon();
+
+  // Compute initial residual
+  const RealScalar tolb = tol * normb;  // Relative tolerance
+  VectorType r = b - A * x;
+
+  VectorType x_s, r_s;
+
+  if (smoothing) {
+    x_s = x;
+    r_s = r;
+  }
+
+  RealScalar normr = r.stableNorm();
+
+  if (normr <= tolb) {
+    // Initial guess is a good enough solution
+    iter = 0;
+    relres = normr / normb;
+    return true;
+  }
+
+  DenseMatrixType G = DenseMatrixType::Zero(N, S);
+  DenseMatrixType U = DenseMatrixType::Zero(N, S);
+  DenseMatrixType M = DenseMatrixType::Identity(S, S);
+  VectorType t(N), v(N);
+  Scalar om = 1.;
+
+  // Main iteration loop, guild G-spaces:
+  iter = 0;
+
+  while (normr > tolb && iter < maxit) {
+    // New right hand size for small system:
+    VectorType f = (r.adjoint() * P).adjoint();
+
+    for (Index k = 0; k < S; ++k) {
+      // Solve small system and make v orthogonal to P:
+      // c = M(k:s,k:s)\f(k:s);
+      lu_solver.compute(M.block(k, k, S - k, S - k));
+      VectorType c = lu_solver.solve(f.segment(k, S - k));
+      // v = r - G(:,k:s)*c;
+      v = r - G.rightCols(S - k) * c;
+      // Preconditioning
+      v = precond.solve(v);
+
+      // Compute new U(:,k) and G(:,k), G(:,k) is in space G_j
+      U.col(k) = U.rightCols(S - k) * c + om * v;
+      G.col(k) = A * U.col(k);
+
+      // Bi-Orthogonalise the new basis vectors:
+      for (Index i = 0; i < k - 1; ++i) {
+        // alpha =  ( P(:,i)'*G(:,k) )/M(i,i);
+        Scalar alpha = P.col(i).dot(G.col(k)) / M(i, i);
+        G.col(k) = G.col(k) - alpha * G.col(i);
+        U.col(k) = U.col(k) - alpha * U.col(i);
+      }
+
+      // New column of M = P'*G  (first k-1 entries are zero)
+      // M(k:s,k) = (G(:,k)'*P(:,k:s))';
+      M.block(k, k, S - k, 1) = (G.col(k).adjoint() * P.rightCols(S - k)).adjoint();
+
+      if (internal::isApprox(M(k, k), Scalar(0))) {
+        return false;
+      }
+
+      // Make r orthogonal to q_i, i = 0..k-1
+      Scalar beta = f(k) / M(k, k);
+      r = r - beta * G.col(k);
+      x = x + beta * U.col(k);
+      normr = r.stableNorm();
+
+      if (replacement && normr > tolb / mp) {
+        trueres = true;
+      }
+
+      // Smoothing:
+      if (smoothing) {
+        t = r_s - r;
+        // gamma is a Scalar, but the conversion is not allowed
+        Scalar gamma = t.dot(r_s) / t.stableNorm();
+        r_s = r_s - gamma * t;
+        x_s = x_s - gamma * (x_s - x);
+        normr = r_s.stableNorm();
+      }
+
+      if (normr < tolb || iter == maxit) {
+        break;
+      }
+
+      // New f = P'*r (first k  components are zero)
+      if (k < S - 1) {
+        f.segment(k + 1, S - (k + 1)) = f.segment(k + 1, S - (k + 1)) - beta * M.block(k + 1, k, S - (k + 1), 1);
+      }
+    }  // end for
+
+    if (normr < tolb || iter == maxit) {
+      break;
+    }
+
+    // Now we have sufficient vectors in G_j to compute residual in G_j+1
+    // Note: r is already perpendicular to P so v = r
+    // Preconditioning
+    v = r;
+    v = precond.solve(v);
+
+    // Matrix-vector multiplication:
+    t = A * v;
+
+    // Computation of a new omega
+    om = internal::omega(t, r, angle);
+
+    if (om == RealScalar(0.0)) {
+      return false;
+    }
+
+    r = r - om * t;
+    x = x + om * v;
+    normr = r.stableNorm();
+
+    if (replacement && normr > tolb / mp) {
+      trueres = true;
+    }
+
+    // Residual replacement?
+    if (trueres && normr < normb) {
+      r = b - A * x;
+      trueres = false;
+    }
+
+    // Smoothing:
+    if (smoothing) {
+      t = r_s - r;
+      Scalar gamma = t.dot(r_s) / t.stableNorm();
+      r_s = r_s - gamma * t;
+      x_s = x_s - gamma * (x_s - x);
+      normr = r_s.stableNorm();
+    }
+
+    iter++;
+
+  }  // end while
+
+  if (smoothing) {
+    x = x_s;
+  }
+  relres = normr / normb;
+  return true;
+}
+
+}  // namespace internal
+
+template <typename MatrixType_, typename Preconditioner_ = DiagonalPreconditioner<typename MatrixType_::Scalar> >
+class IDRS;
+
+namespace internal {
+
+template <typename MatrixType_, typename Preconditioner_>
+struct traits<Eigen::IDRS<MatrixType_, Preconditioner_> > {
+  typedef MatrixType_ MatrixType;
+  typedef Preconditioner_ Preconditioner;
+};
+
+}  // namespace internal
+
+/** \ingroup IterativeLinearSolvers_Module
+ * \brief The Induced Dimension Reduction method (IDR(s)) is a short-recurrences Krylov method for sparse square
+ * problems.
+ *
+ * This class allows to solve for A.x = b sparse linear problems. The vectors x and b can be either dense or sparse.
+ * he Induced Dimension Reduction method, IDR(), is a robust and efficient short-recurrence Krylov subspace method for
+ * solving large nonsymmetric systems of linear equations.
+ *
+ * For indefinite systems IDR(S) outperforms both BiCGStab and BiCGStab(L). Additionally, IDR(S) can handle matrices
+ * with complex eigenvalues more efficiently than BiCGStab.
+ *
+ * Many problems that do not converge for BiCGSTAB converge for IDR(s) (for larger values of s). And if both methods
+ * converge the convergence for IDR(s) is typically much faster for difficult systems (for example indefinite problems).
+ *
+ * IDR(s) is a limited memory finite termination method. In exact arithmetic it converges in at most N+N/s iterations,
+ * with N the system size.  It uses a fixed number of 4+3s vector. In comparison, BiCGSTAB terminates in 2N iterations
+ * and uses 7 vectors. GMRES terminates in at most N iterations, and uses I+3 vectors, with I the number of iterations.
+ * Restarting GMRES limits the memory consumption, but destroys the finite termination property.
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, can be a dense or a sparse matrix.
+ * \tparam Preconditioner_ the type of the preconditioner. Default is DiagonalPreconditioner
+ *
+ * \implsparsesolverconcept
+ *
+ * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+ * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+ * and NumTraits<Scalar>::epsilon() for the tolerance.
+ *
+ * The tolerance corresponds to the relative residual error: |Ax-b|/|b|
+ *
+ * \b Performance: when using sparse matrices, best performance is achied for a row-major sparse matrix format.
+ * Moreover, in this case multi-threading can be exploited if the user code is compiled with OpenMP enabled.
+ * See \ref TopicMultiThreading for details.
+ *
+ * By default the iterations start with x=0 as an initial guess of the solution.
+ * One can control the start using the solveWithGuess() method.
+ *
+ * IDR(s) can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+ *
+ * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+ */
+template <typename MatrixType_, typename Preconditioner_>
+class IDRS : public IterativeSolverBase<IDRS<MatrixType_, Preconditioner_> > {
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef Preconditioner_ Preconditioner;
+
+ private:
+  typedef IterativeSolverBase<IDRS> Base;
+  using Base::m_error;
+  using Base::m_info;
+  using Base::m_isInitialized;
+  using Base::m_iterations;
+  using Base::matrix;
+  Index m_S;
+  bool m_smoothing;
+  RealScalar m_angle;
+  bool m_residual;
+
+ public:
+  /** Default constructor. */
+  IDRS() : m_S(4), m_smoothing(false), m_angle(RealScalar(0.7)), m_residual(false) {}
+
+  /**     Initialize the solver with matrix \a A for further \c Ax=b solving.
+
+          This constructor is a shortcut for the default constructor followed
+          by a call to compute().
+
+          \warning this class stores a reference to the matrix A as well as some
+          precomputed values that depend on it. Therefore, if \a A is changed
+          this class becomes invalid. Call compute() to update it with the new
+          matrix A, or modify a copy of A.
+  */
+  template <typename MatrixDerived>
+  explicit IDRS(const EigenBase<MatrixDerived>& A)
+      : Base(A.derived()), m_S(4), m_smoothing(false), m_angle(RealScalar(0.7)), m_residual(false) {}
+
+  /** \internal */
+  /**     Loops over the number of columns of b and does the following:
+                  1. sets the tolerance and maxIterations
+                  2. Calls the function that has the core solver routine
+  */
+  template <typename Rhs, typename Dest>
+  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const {
+    m_iterations = Base::maxIterations();
+    m_error = Base::m_tolerance;
+
+    bool ret = internal::idrs(matrix(), b, x, Base::m_preconditioner, m_iterations, m_error, m_S, m_smoothing, m_angle,
+                              m_residual);
+
+    m_info = (!ret) ? NumericalIssue : m_error <= Base::m_tolerance ? Success : NoConvergence;
+  }
+
+  /** Sets the parameter S, indicating the dimension of the shadow space. Default is 4*/
+  void setS(Index S) {
+    if (S < 1) {
+      S = 4;
+    }
+
+    m_S = S;
+  }
+
+  /** Switches off and on smoothing.
+  Residual smoothing results in monotonically decreasing residual norms at
+  the expense of two extra vectors of storage and a few extra vector
+  operations. Although monotonic decrease of the residual norms is a
+  desirable property, the rate of convergence of the unsmoothed process and
+  the smoothed process is basically the same. Default is off */
+  void setSmoothing(bool smoothing) { m_smoothing = smoothing; }
+
+  /** The angle must be a real scalar. In IDR(s), a value for the
+  iteration parameter omega must be chosen in every s+1th step. The most
+  natural choice is to select a value to minimize the norm of the next residual.
+  This corresponds to the parameter omega = 0. In practice, this may lead to
+  values of omega that are so small that the other iteration parameters
+  cannot be computed with sufficient accuracy. In such cases it is better to
+  increase the value of omega sufficiently such that a compromise is reached
+  between accurate computations and reduction of the residual norm. The
+  parameter angle =0.7 (”maintaining the convergence strategy”)
+  results in such a compromise. */
+  void setAngle(RealScalar angle) { m_angle = angle; }
+
+  /** The parameter replace is a logical that determines whether a
+  residual replacement strategy is employed to increase the accuracy of the
+  solution. */
+  void setResidualUpdate(bool update) { m_residual = update; }
+};
+
+}  // namespace Eigen
+
+#endif /* EIGEN_IDRS_H */
diff --git a/inst/include/unsupported/Eigen/src/IterativeSolvers/IDRSTABL.h b/inst/include/unsupported/Eigen/src/IterativeSolvers/IDRSTABL.h
new file mode 100644
index 00000000..e55c3b3c
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/IterativeSolvers/IDRSTABL.h
@@ -0,0 +1,476 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2020 Chris Schoutrop <c.e.m.schoutrop@tue.nl>
+// Copyright (C) 2020 Mischa Senders <m.j.senders@student.tue.nl>
+// Copyright (C) 2020 Lex Kuijpers <l.kuijpers@student.tue.nl>
+// Copyright (C) 2020 Jens Wehner <j.wehner@esciencecenter.nl>
+// Copyright (C) 2020 Jan van Dijk <j.v.dijk@tue.nl>
+// Copyright (C) 2020 Adithya Vijaykumar
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+/*
+
+The IDR(S)Stab(L) method is a combination of IDR(S) and BiCGStab(L)
+
+This implementation of IDRSTABL is based on
+1. Aihara, K., Abe, K., & Ishiwata, E. (2014). A variant of IDRstab with
+reliable update strategies for solving sparse linear systems. Journal of
+Computational and Applied Mathematics, 259, 244-258.
+   doi:10.1016/j.cam.2013.08.028
+                2. Aihara, K., Abe, K., & Ishiwata, E. (2015). Preconditioned
+IDRSTABL Algorithms for Solving Nonsymmetric Linear Systems. International
+Journal of Applied Mathematics, 45(3).
+                3. Saad, Y. (2003). Iterative Methods for Sparse Linear Systems:
+Second Edition. Philadelphia, PA: SIAM.
+                4. Sonneveld, P., & Van Gijzen, M. B. (2009). IDR(s): A Family
+of Simple and Fast Algorithms for Solving Large Nonsymmetric Systems of Linear
+Equations. SIAM Journal on Scientific Computing, 31(2), 1035-1062.
+   doi:10.1137/070685804
+                5. Sonneveld, P. (2012). On the convergence behavior of IDR (s)
+and related methods. SIAM Journal on Scientific Computing, 34(5), A2576-A2598.
+
+    Right-preconditioning based on Ref. 3 is implemented here.
+*/
+
+#ifndef EIGEN_IDRSTABL_H
+#define EIGEN_IDRSTABL_H
+
+namespace Eigen {
+
+namespace internal {
+
+template <typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+bool idrstabl(const MatrixType &mat, const Rhs &rhs, Dest &x, const Preconditioner &precond, Index &iters,
+              typename Dest::RealScalar &tol_error, Index L, Index S) {
+  /*
+    Setup and type definitions.
+  */
+  using numext::abs;
+  using numext::sqrt;
+  typedef typename Dest::Scalar Scalar;
+  typedef typename Dest::RealScalar RealScalar;
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+  typedef Matrix<Scalar, Dynamic, Dynamic, ColMajor> DenseMatrixType;
+
+  const Index N = x.rows();
+
+  Index k = 0;  // Iteration counter
+  const Index maxIters = iters;
+
+  const RealScalar rhs_norm = rhs.stableNorm();
+  const RealScalar tol = tol_error * rhs_norm;
+
+  if (rhs_norm == 0) {
+    /*
+      If b==0, then the exact solution is x=0.
+      rhs_norm is needed for other calculations anyways, this exit is a freebie.
+    */
+    x.setZero();
+    tol_error = 0.0;
+    return true;
+  }
+  // Construct decomposition objects beforehand.
+  FullPivLU<DenseMatrixType> lu_solver;
+
+  if (S >= N || L >= N) {
+    /*
+      The matrix is very small, or the choice of L and S is very poor
+      in that case solving directly will be best.
+    */
+    lu_solver.compute(DenseMatrixType(mat));
+    x = lu_solver.solve(rhs);
+    tol_error = (rhs - mat * x).stableNorm() / rhs_norm;
+    return true;
+  }
+
+  // Define maximum sizes to prevent any reallocation later on.
+  DenseMatrixType u(N, L + 1);
+  DenseMatrixType r(N, L + 1);
+
+  DenseMatrixType V(N * (L + 1), S);
+
+  VectorType alpha(S);
+  VectorType gamma(L);
+  VectorType update(N);
+
+  /*
+    Main IDRSTABL algorithm
+  */
+  // Set up the initial residual
+  VectorType x0 = x;
+  r.col(0) = rhs - mat * x;
+  x.setZero();  // The final solution will be x0+x
+
+  tol_error = r.col(0).stableNorm();
+
+  // FOM = Full orthogonalisation method
+  DenseMatrixType h_FOM = DenseMatrixType::Zero(S, S - 1);
+
+  // Construct an initial U matrix of size N x S
+  DenseMatrixType U(N * (L + 1), S);
+  for (Index col_index = 0; col_index < S; ++col_index) {
+    // Arnoldi-like process to generate a set of orthogonal vectors spanning
+    // {u,A*u,A*A*u,...,A^(S-1)*u}. This construction can be combined with the
+    // Full Orthogonalization Method (FOM) from Ref.3 to provide a possible
+    // early exit with no additional MV.
+    if (col_index != 0) {
+      /*
+      Modified Gram-Schmidt strategy:
+      */
+      VectorType w = mat * precond.solve(u.col(0));
+      for (Index i = 0; i < col_index; ++i) {
+        auto v = U.col(i).head(N);
+        h_FOM(i, col_index - 1) = v.dot(w);
+        w -= h_FOM(i, col_index - 1) * v;
+      }
+      u.col(0) = w;
+      h_FOM(col_index, col_index - 1) = u.col(0).stableNorm();
+
+      if (abs(h_FOM(col_index, col_index - 1)) != RealScalar(0)) {
+        /*
+        This only happens if u is NOT exactly zero. In case it is exactly zero
+        it would imply that that this u has no component in the direction of the
+        current residual.
+
+        By then setting u to zero it will not contribute any further (as it
+        should). Whereas attempting to normalize results in division by zero.
+
+        Such cases occur if:
+        1. The basis of dimension <S is sufficient to exactly solve the linear
+        system. I.e. the current residual is in span{r,Ar,...A^{m-1}r}, where
+        (m-1)<=S.
+        2. Two vectors vectors generated from r, Ar,... are (numerically)
+        parallel.
+
+        In case 1, the exact solution to the system can be obtained from the
+        "Full Orthogonalization Method" (Algorithm 6.4 in the book of Saad),
+        without any additional MV.
+
+        Contrary to what one would suspect, the comparison with ==0.0 for
+        floating-point types is intended here. Any arbitrary non-zero u is fine
+        to continue, however if u contains either NaN or Inf the algorithm will
+        break down.
+        */
+        u.col(0) /= h_FOM(col_index, col_index - 1);
+      }
+    } else {
+      u.col(0) = r.col(0);
+      u.col(0).normalize();
+    }
+
+    U.col(col_index).head(N) = u.col(0);
+  }
+
+  if (S > 1) {
+    // Check for early FOM exit.
+    Scalar beta = r.col(0).stableNorm();
+    VectorType e1 = VectorType::Zero(S - 1);
+    e1(0) = beta;
+    lu_solver.compute(h_FOM.topLeftCorner(S - 1, S - 1));
+    VectorType y = lu_solver.solve(e1);
+    VectorType x2 = x + U.topLeftCorner(N, S - 1) * y;
+
+    // Using proposition 6.7 in Saad, one MV can be saved to calculate the
+    // residual
+    RealScalar FOM_residual = (h_FOM(S - 1, S - 2) * y(S - 2) * U.col(S - 1).head(N)).stableNorm();
+
+    if (FOM_residual < tol) {
+      // Exit, the FOM algorithm was already accurate enough
+      iters = k;
+      // Convert back to the unpreconditioned solution
+      x = precond.solve(x2);
+      // x contains the updates to x0, add those back to obtain the solution
+      x += x0;
+      tol_error = FOM_residual / rhs_norm;
+      return true;
+    }
+  }
+
+  /*
+    Select an initial (N x S) matrix R0.
+    1. Generate random R0, orthonormalize the result.
+    2. This results in R0, however to save memory and compute we only need the
+    adjoint of R0. This is given by the matrix R_T.\ Additionally, the matrix
+    (mat.adjoint()*R_tilde).adjoint()=R_tilde.adjoint()*mat by the
+    anti-distributivity property of the adjoint. This results in AR_T, which is
+    constant if R_T does not have to be regenerated and can be precomputed.
+    Based on reference 4, this has zero probability in exact arithmetic.
+  */
+
+  // Original IDRSTABL and Kensuke choose S random vectors:
+  const HouseholderQR<DenseMatrixType> qr(DenseMatrixType::Random(N, S));
+  DenseMatrixType R_T = (qr.householderQ() * DenseMatrixType::Identity(N, S)).adjoint();
+  DenseMatrixType AR_T = DenseMatrixType(R_T * mat);
+
+  // Pre-allocate sigma.
+  DenseMatrixType sigma(S, S);
+
+  bool reset_while = false;  // Should the while loop be reset for some reason?
+
+  while (k < maxIters) {
+    for (Index j = 1; j <= L; ++j) {
+      /*
+        The IDR Step
+      */
+      // Construction of the sigma-matrix, and the decomposition of sigma.
+      for (Index i = 0; i < S; ++i) {
+        sigma.col(i).noalias() = AR_T * precond.solve(U.block(N * (j - 1), i, N, 1));
+      }
+
+      lu_solver.compute(sigma);
+      // Obtain the update coefficients alpha
+      if (j == 1) {
+        // alpha=inverse(sigma)*(R_T*r_0);
+        alpha.noalias() = lu_solver.solve(R_T * r.col(0));
+      } else {
+        // alpha=inverse(sigma)*(AR_T*r_{j-2})
+        alpha.noalias() = lu_solver.solve(AR_T * precond.solve(r.col(j - 2)));
+      }
+
+      // Obtain new solution and residual from this update
+      update.noalias() = U.topRows(N) * alpha;
+      r.col(0) -= mat * precond.solve(update);
+      x += update;
+
+      for (Index i = 1; i <= j - 2; ++i) {
+        // This only affects the case L>2
+        r.col(i) -= U.block(N * (i + 1), 0, N, S) * alpha;
+      }
+      if (j > 1) {
+        // r=[r;A*r_{j-2}]
+        r.col(j - 1).noalias() = mat * precond.solve(r.col(j - 2));
+      }
+      tol_error = r.col(0).stableNorm();
+
+      if (tol_error < tol) {
+        // If at this point the algorithm has converged, exit.
+        reset_while = true;
+        break;
+      }
+
+      bool break_normalization = false;
+      for (Index q = 1; q <= S; ++q) {
+        if (q == 1) {
+          // u = r;
+          u.leftCols(j + 1) = r.leftCols(j + 1);
+        } else {
+          // u=[u_1;u_2;...;u_j]
+          u.leftCols(j) = u.middleCols(1, j);
+        }
+
+        // Obtain the update coefficients beta implicitly
+        // beta=lu_sigma.solve(AR_T * u.block(N * (j - 1), 0, N, 1)
+        u.reshaped().head(u.rows() * j) -= U.topRows(N * j) * lu_solver.solve(AR_T * precond.solve(u.col(j - 1)));
+
+        // u=[u;Au_{j-1}]
+        u.col(j).noalias() = mat * precond.solve(u.col(j - 1));
+
+        // Orthonormalize u_j to the columns of V_j(:,1:q-1)
+        if (q > 1) {
+          /*
+          Modified Gram-Schmidt-like procedure to make u orthogonal to the
+          columns of V from Ref. 1.
+
+          The vector mu from Ref. 1 is obtained implicitly:
+          mu=V.block(N * j, 0, N, q - 1).adjoint() * u.block(N * j, 0, N, 1).
+          */
+          for (Index i = 0; i <= q - 2; ++i) {
+            auto v = V.col(i).segment(N * j, N);
+            Scalar h = v.squaredNorm();
+            h = v.dot(u.col(j)) / h;
+            u.reshaped().head(u.rows() * (j + 1)) -= h * V.block(0, i, N * (j + 1), 1);
+          }
+        }
+        // Normalize u and assign to a column of V
+        Scalar normalization_constant = u.col(j).stableNorm();
+        //  If u is exactly zero, this will lead to a NaN. Small, non-zero u is
+        //  fine.
+        if (normalization_constant == RealScalar(0.0)) {
+          break_normalization = true;
+          break;
+        } else {
+          u.leftCols(j + 1) /= normalization_constant;
+        }
+
+        V.block(0, q - 1, N * (j + 1), 1).noalias() = u.reshaped().head(u.rows() * (j + 1));
+      }
+
+      if (break_normalization == false) {
+        U = V;
+      }
+    }
+    if (reset_while) {
+      break;
+    }
+
+    // r=[r;mat*r_{L-1}]
+    r.col(L).noalias() = mat * precond.solve(r.col(L - 1));
+
+    /*
+            The polynomial step
+    */
+    ColPivHouseholderQR<DenseMatrixType> qr_solver(r.rightCols(L));
+    gamma.noalias() = qr_solver.solve(r.col(0));
+
+    // Update solution and residual using the "minimized residual coefficients"
+    update.noalias() = r.leftCols(L) * gamma;
+    x += update;
+    r.col(0) -= mat * precond.solve(update);
+
+    // Update iteration info
+    ++k;
+    tol_error = r.col(0).stableNorm();
+
+    if (tol_error < tol) {
+      // Slightly early exit by moving the criterion before the update of U,
+      // after the main while loop the result of that calculation would not be
+      // needed.
+      break;
+    }
+
+    /*
+    U=U0-sum(gamma_j*U_j)
+    Consider the first iteration. Then U only contains U0, so at the start of
+    the while-loop U should be U0. Therefore only the first N rows of U have to
+    be updated.
+    */
+    for (Index i = 1; i <= L; ++i) {
+      U.topRows(N) -= U.block(N * i, 0, N, S) * gamma(i - 1);
+    }
+  }
+
+  /*
+          Exit after the while loop terminated.
+  */
+  iters = k;
+  // Convert back to the unpreconditioned solution
+  x = precond.solve(x);
+  // x contains the updates to x0, add those back to obtain the solution
+  x += x0;
+  tol_error = tol_error / rhs_norm;
+  return true;
+}
+
+}  // namespace internal
+
+template <typename MatrixType_, typename Preconditioner_ = DiagonalPreconditioner<typename MatrixType_::Scalar>>
+class IDRSTABL;
+
+namespace internal {
+
+template <typename MatrixType_, typename Preconditioner_>
+struct traits<IDRSTABL<MatrixType_, Preconditioner_>> {
+  typedef MatrixType_ MatrixType;
+  typedef Preconditioner_ Preconditioner;
+};
+
+}  // namespace internal
+
+/** \ingroup IterativeLinearSolvers_Module
+ * \brief The IDR(s)STAB(l) is a combination of IDR(s) and BiCGSTAB(l). It is a
+ * short-recurrences Krylov method for sparse square problems. It can outperform
+ * both IDR(s) and BiCGSTAB(l). IDR(s)STAB(l) generally closely follows the
+ * optimal GMRES convergence in terms of the number of Matrix-Vector products.
+ * However, without the increasing cost per iteration of GMRES. IDR(s)STAB(l) is
+ * suitable for both indefinite systems and systems with complex eigenvalues.
+ *
+ * This class allows solving for A.x = b sparse linear problems. The vectors x
+ * and b can be either dense or sparse.
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, can be a dense or a
+ * sparse matrix. \tparam Preconditioner_ the type of the preconditioner.
+ * Default is DiagonalPreconditioner
+ *
+ * \implsparsesolverconcept
+ *
+ * The maximum number of iterations and tolerance value can be controlled via
+ * the setMaxIterations() and setTolerance() methods. The defaults are the size
+ * of the problem for the maximum number of iterations and
+ * NumTraits<Scalar>::epsilon() for the tolerance.
+ *
+ * The tolerance is the maximum relative residual error: |Ax-b|/|b| for which
+ * the linear system is considered solved.
+ *
+ * \b Performance: When using sparse matrices, best performance is achieved for
+ * a row-major sparse matrix format. Moreover, in this case multi-threading can
+ * be exploited if the user code is compiled with OpenMP enabled. See \ref
+ * TopicMultiThreading for details.
+ *
+ * By default the iterations start with x=0 as an initial guess of the solution.
+ * One can control the start using the solveWithGuess() method.
+ *
+ * IDR(s)STAB(l) can also be used in a matrix-free context, see the following
+ * \link MatrixfreeSolverExample example \endlink.
+ *
+ * \sa class SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+ */
+
+template <typename MatrixType_, typename Preconditioner_>
+class IDRSTABL : public IterativeSolverBase<IDRSTABL<MatrixType_, Preconditioner_>> {
+  typedef IterativeSolverBase<IDRSTABL> Base;
+  using Base::m_error;
+  using Base::m_info;
+  using Base::m_isInitialized;
+  using Base::m_iterations;
+  using Base::matrix;
+  Index m_L;
+  Index m_S;
+
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef Preconditioner_ Preconditioner;
+
+ public:
+  /** Default constructor. */
+  IDRSTABL() : m_L(2), m_S(4) {}
+
+  /**   Initialize the solver with matrix \a A for further \c Ax=b solving.
+
+  This constructor is a shortcut for the default constructor followed
+  by a call to compute().
+
+  \warning this class stores a reference to the matrix A as well as some
+  precomputed values that depend on it. Therefore, if \a A is changed
+  this class becomes invalid. Call compute() to update it with the new
+  matrix A, or modify a copy of A.
+          */
+  template <typename MatrixDerived>
+  explicit IDRSTABL(const EigenBase<MatrixDerived> &A) : Base(A.derived()), m_L(2), m_S(4) {}
+
+  /** \internal */
+  /**     Loops over the number of columns of b and does the following:
+                                  1. sets the tolerance and maxIterations
+                                  2. Calls the function that has the core solver
+     routine
+  */
+  template <typename Rhs, typename Dest>
+  void _solve_vector_with_guess_impl(const Rhs &b, Dest &x) const {
+    m_iterations = Base::maxIterations();
+    m_error = Base::m_tolerance;
+    bool ret = internal::idrstabl(matrix(), b, x, Base::m_preconditioner, m_iterations, m_error, m_L, m_S);
+
+    m_info = (!ret) ? NumericalIssue : m_error <= 10 * Base::m_tolerance ? Success : NoConvergence;
+  }
+
+  /** Sets the parameter L, indicating the amount of minimize residual steps are
+   * used. */
+  void setL(Index L) {
+    eigen_assert(L >= 1 && "L needs to be positive");
+    m_L = L;
+  }
+  /** Sets the parameter S, indicating the dimension of the shadow residual
+   * space.. */
+  void setS(Index S) {
+    eigen_assert(S >= 1 && "S needs to be positive");
+    m_S = S;
+  }
+};
+
+}  // namespace Eigen
+
+#endif /* EIGEN_IDRSTABL_H */
diff --git a/inst/include/unsupported/Eigen/src/IterativeSolvers/IncompleteCholesky.h b/inst/include/unsupported/Eigen/src/IterativeSolvers/IncompleteCholesky.h
deleted file mode 100644
index 661c1f2e..00000000
--- a/inst/include/unsupported/Eigen/src/IterativeSolvers/IncompleteCholesky.h
+++ /dev/null
@@ -1,278 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2012 Désiré Nuentsa-Wakam <desire.nuentsa_wakam@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_INCOMPLETE_CHOlESKY_H
-#define EIGEN_INCOMPLETE_CHOlESKY_H
-#include "Eigen/src/IterativeLinearSolvers/IncompleteLUT.h" 
-#include <Eigen/OrderingMethods>
-#include <list>
-
-namespace Eigen {  
-/** 
- * \brief Modified Incomplete Cholesky with dual threshold
- * 
- * References : C-J. Lin and J. J. Moré, Incomplete Cholesky Factorizations with
- *              Limited memory, SIAM J. Sci. Comput.  21(1), pp. 24-45, 1999
- * 
- * \tparam _MatrixType The type of the sparse matrix. It should be a symmetric 
- *                     matrix. It is advised to give  a row-oriented sparse matrix 
- * \tparam _UpLo The triangular part of the matrix to reference. 
- * \tparam _OrderingType 
- */
-
-template <typename Scalar, int _UpLo = Lower, typename _OrderingType = NaturalOrdering<int> >
-class IncompleteCholesky : internal::noncopyable
-{
-  public:
-    typedef SparseMatrix<Scalar,ColMajor> MatrixType;
-    typedef _OrderingType OrderingType;
-    typedef typename MatrixType::RealScalar RealScalar; 
-    typedef typename MatrixType::Index Index; 
-    typedef PermutationMatrix<Dynamic, Dynamic, Index> PermutationType;
-    typedef Matrix<Scalar,Dynamic,1> ScalarType; 
-    typedef Matrix<Index,Dynamic, 1> IndexType;
-    typedef std::vector<std::list<Index> > VectorList; 
-    enum { UpLo = _UpLo };
-  public:
-    IncompleteCholesky() : m_shift(1),m_factorizationIsOk(false) {}
-    IncompleteCholesky(const MatrixType& matrix) : m_shift(1),m_factorizationIsOk(false)
-    {
-      compute(matrix);
-    }
-    
-    Index rows() const { return m_L.rows(); }
-    
-    Index cols() const { return m_L.cols(); }
-    
-
-    /** \brief Reports whether previous computation was successful.
-      *
-      * \returns \c Success if computation was succesful,
-      *          \c NumericalIssue if the matrix appears to be negative.
-      */
-    ComputationInfo info() const
-    {
-      eigen_assert(m_isInitialized && "IncompleteLLT is not initialized.");
-      return m_info;
-    }
-    
-    /** 
-     * \brief Set the initial shift parameter
-     */
-    void setShift( Scalar shift) { m_shift = shift; }
-    
-    /**
-    * \brief Computes the fill reducing permutation vector. 
-    */
-    template<typename MatrixType>
-    void analyzePattern(const MatrixType& mat)
-    {
-      OrderingType ord; 
-      ord(mat.template selfadjointView<UpLo>(), m_perm); 
-      m_analysisIsOk = true; 
-    }
-    
-    template<typename MatrixType>
-    void factorize(const MatrixType& amat);
-    
-    template<typename MatrixType>
-    void compute (const MatrixType& matrix)
-    {
-      analyzePattern(matrix); 
-      factorize(matrix);
-    }
-    
-    template<typename Rhs, typename Dest>
-    void _solve(const Rhs& b, Dest& x) const
-    {
-      eigen_assert(m_factorizationIsOk && "factorize() should be called first");
-      if (m_perm.rows() == b.rows())
-        x = m_perm.inverse() * b; 
-      else 
-        x = b; 
-      x = m_scal.asDiagonal() * x;
-      x = m_L.template triangularView<UnitLower>().solve(x); 
-      x = m_L.adjoint().template triangularView<Upper>().solve(x); 
-      if (m_perm.rows() == b.rows())
-        x = m_perm * x;
-      x = m_scal.asDiagonal() * x;
-    }
-    template<typename Rhs> inline const internal::solve_retval<IncompleteCholesky, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_factorizationIsOk && "IncompleteLLT did not succeed");
-      eigen_assert(m_isInitialized && "IncompleteLLT is not initialized.");
-      eigen_assert(cols()==b.rows()
-                && "IncompleteLLT::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<IncompleteCholesky, Rhs>(*this, b.derived());
-    }
-  protected:
-    SparseMatrix<Scalar,ColMajor> m_L;  // The lower part stored in CSC
-    ScalarType m_scal; // The vector for scaling the matrix 
-    Scalar m_shift; //The initial shift parameter
-    bool m_analysisIsOk; 
-    bool m_factorizationIsOk; 
-    bool m_isInitialized;
-    ComputationInfo m_info;
-    PermutationType m_perm; 
-    
-  private:
-    template <typename IdxType, typename SclType>
-    inline void updateList(const IdxType& colPtr, IdxType& rowIdx, SclType& vals, const Index& col, const Index& jk, IndexType& firstElt, VectorList& listCol); 
-}; 
-
-template<typename Scalar, int _UpLo, typename OrderingType>
-template<typename _MatrixType>
-void IncompleteCholesky<Scalar,_UpLo, OrderingType>::factorize(const _MatrixType& mat)
-{
-  using std::sqrt;
-  using std::min;
-  eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); 
-    
-  // Dropping strategies : Keep only the p largest elements per column, where p is the number of elements in the column of the original matrix. Other strategies will be added
-  
-  // Apply the fill-reducing permutation computed in analyzePattern()
-  if (m_perm.rows() == mat.rows() ) // To detect the null permutation
-    m_L.template selfadjointView<Lower>() = mat.template selfadjointView<_UpLo>().twistedBy(m_perm);
-  else
-    m_L.template selfadjointView<Lower>() = mat.template selfadjointView<_UpLo>();
-  
-  Index n = m_L.cols(); 
-  Index nnz = m_L.nonZeros();
-  Map<ScalarType> vals(m_L.valuePtr(), nnz); //values
-  Map<IndexType> rowIdx(m_L.innerIndexPtr(), nnz);  //Row indices
-  Map<IndexType> colPtr( m_L.outerIndexPtr(), n+1); // Pointer to the beginning of each row
-  IndexType firstElt(n-1); // for each j, points to the next entry in vals that will be used in the factorization
-  VectorList listCol(n); // listCol(j) is a linked list of columns to update column j
-  ScalarType curCol(n); // Store a  nonzero values in each column
-  IndexType irow(n); // Row indices of nonzero elements in each column
-  
-  
-  // Computes the scaling factors 
-  m_scal.resize(n);
-  for (int j = 0; j < n; j++)
-  {
-    m_scal(j) = m_L.col(j).norm();
-    m_scal(j) = sqrt(m_scal(j));
-  }
-  // Scale and compute the shift for the matrix 
-  Scalar mindiag = vals[0];
-  for (int j = 0; j < n; j++){
-    for (int k = colPtr[j]; k < colPtr[j+1]; k++)
-     vals[k] /= (m_scal(j) * m_scal(rowIdx[k]));
-    mindiag = (min)(vals[colPtr[j]], mindiag);
-  }
-  
-  if(mindiag < Scalar(0.)) m_shift = m_shift - mindiag;
-  // Apply the shift to the diagonal elements of the matrix
-  for (int j = 0; j < n; j++)
-    vals[colPtr[j]] += m_shift;
-  // jki version of the Cholesky factorization 
-  for (int j=0; j < n; ++j)
-  {  
-    //Left-looking factorize the column j 
-    // First, load the jth column into curCol 
-    Scalar diag = vals[colPtr[j]];  // It is assumed that only the lower part is stored
-    curCol.setZero();
-    irow.setLinSpaced(n,0,n-1); 
-    for (int i = colPtr[j] + 1; i < colPtr[j+1]; i++)
-    {
-      curCol(rowIdx[i]) = vals[i]; 
-      irow(rowIdx[i]) = rowIdx[i]; 
-    }
-    std::list<int>::iterator k; 
-    // Browse all previous columns that will update column j
-    for(k = listCol[j].begin(); k != listCol[j].end(); k++) 
-    {
-      int jk = firstElt(*k); // First element to use in the column 
-      jk += 1; 
-      for (int i = jk; i < colPtr[*k+1]; i++)
-      {
-        curCol(rowIdx[i]) -= vals[i] * vals[jk] ;
-      }
-      updateList(colPtr,rowIdx,vals, *k, jk, firstElt, listCol);
-    }
-    
-    // Scale the current column
-    if(RealScalar(diag) <= 0) 
-    {
-      std::cerr << "\nNegative diagonal during Incomplete factorization... "<< j << "\n";
-      m_info = NumericalIssue; 
-      return; 
-    }
-    RealScalar rdiag = sqrt(RealScalar(diag));
-    vals[colPtr[j]] = rdiag;
-    for (int i = j+1; i < n; i++)
-    {
-      //Scale 
-      curCol(i) /= rdiag;
-      //Update the remaining diagonals with curCol
-      vals[colPtr[i]] -= curCol(i) * curCol(i);
-    }
-    // Select the largest p elements
-    //  p is the original number of elements in the column (without the diagonal)
-    int p = colPtr[j+1] - colPtr[j] - 1 ; 
-    internal::QuickSplit(curCol, irow, p); 
-    // Insert the largest p elements in the matrix
-    int cpt = 0; 
-    for (int i = colPtr[j]+1; i < colPtr[j+1]; i++)
-    {
-      vals[i] = curCol(cpt); 
-      rowIdx[i] = irow(cpt); 
-      cpt ++; 
-    }
-    // Get the first smallest row index and put it after the diagonal element
-    Index jk = colPtr(j)+1;
-    updateList(colPtr,rowIdx,vals,j,jk,firstElt,listCol); 
-  }
-  m_factorizationIsOk = true; 
-  m_isInitialized = true;
-  m_info = Success; 
-}
-
-template<typename Scalar, int _UpLo, typename OrderingType>
-template <typename IdxType, typename SclType>
-inline void IncompleteCholesky<Scalar,_UpLo, OrderingType>::updateList(const IdxType& colPtr, IdxType& rowIdx, SclType& vals, const Index& col, const Index& jk, IndexType& firstElt, VectorList& listCol)
-{
-  if (jk < colPtr(col+1) )
-  {
-    Index p = colPtr(col+1) - jk;
-    Index minpos; 
-    rowIdx.segment(jk,p).minCoeff(&minpos);
-    minpos += jk;
-    if (rowIdx(minpos) != rowIdx(jk))
-    {
-      //Swap
-      std::swap(rowIdx(jk),rowIdx(minpos));
-      std::swap(vals(jk),vals(minpos));
-    }
-    firstElt(col) = jk;
-    listCol[rowIdx(jk)].push_back(col);
-  }
-}
-namespace internal {
-
-template<typename _Scalar, int _UpLo, typename OrderingType, typename Rhs>
-struct solve_retval<IncompleteCholesky<_Scalar,  _UpLo, OrderingType>, Rhs>
-  : solve_retval_base<IncompleteCholesky<_Scalar, _UpLo, OrderingType>, Rhs>
-{
-  typedef IncompleteCholesky<_Scalar, _UpLo, OrderingType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
-  }
-};
-
-} // end namespace internal
-
-} // end namespace Eigen 
-
-#endif
diff --git a/inst/include/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h b/inst/include/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h
index 67e78018..1a2edcab 100644
--- a/inst/include/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h
+++ b/inst/include/unsupported/Eigen/src/IterativeSolvers/IncompleteLU.h
@@ -10,104 +10,79 @@
 #ifndef EIGEN_INCOMPLETE_LU_H
 #define EIGEN_INCOMPLETE_LU_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-template <typename _Scalar>
-class IncompleteLU
-{
-    typedef _Scalar Scalar;
-    typedef Matrix<Scalar,Dynamic,1> Vector;
-    typedef typename Vector::Index Index;
-    typedef SparseMatrix<Scalar,RowMajor> FactorType;
+namespace Eigen {
 
-  public:
-    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
+template <typename Scalar_>
+class IncompleteLU : public SparseSolverBase<IncompleteLU<Scalar_> > {
+ protected:
+  typedef SparseSolverBase<IncompleteLU<Scalar_> > Base;
+  using Base::m_isInitialized;
 
-    IncompleteLU() : m_isInitialized(false) {}
+  typedef Scalar_ Scalar;
+  typedef Matrix<Scalar, Dynamic, 1> Vector;
+  typedef typename Vector::Index Index;
+  typedef SparseMatrix<Scalar, RowMajor> FactorType;
 
-    template<typename MatrixType>
-    IncompleteLU(const MatrixType& mat) : m_isInitialized(false)
-    {
-      compute(mat);
-    }
+ public:
+  typedef Matrix<Scalar, Dynamic, Dynamic> MatrixType;
 
-    Index rows() const { return m_lu.rows(); }
-    Index cols() const { return m_lu.cols(); }
+  IncompleteLU() {}
 
-    template<typename MatrixType>
-    IncompleteLU& compute(const MatrixType& mat)
-    {
-      m_lu = mat;
-      int size = mat.cols();
-      Vector diag(size);
-      for(int i=0; i<size; ++i)
-      {
-        typename FactorType::InnerIterator k_it(m_lu,i);
-        for(; k_it && k_it.index()<i; ++k_it)
-        {
-          int k = k_it.index();
-          k_it.valueRef() /= diag(k);
+  template <typename MatrixType>
+  IncompleteLU(const MatrixType& mat) {
+    compute(mat);
+  }
 
-          typename FactorType::InnerIterator j_it(k_it);
-          typename FactorType::InnerIterator kj_it(m_lu, k);
-          while(kj_it && kj_it.index()<=k) ++kj_it;
-          for(++j_it; j_it; )
-          {
-            if(kj_it.index()==j_it.index())
-            {
-              j_it.valueRef() -= k_it.value() * kj_it.value();
-              ++j_it;
-              ++kj_it;
-            }
-            else if(kj_it.index()<j_it.index()) ++kj_it;
-            else                                ++j_it;
-          }
+  Index rows() const { return m_lu.rows(); }
+  Index cols() const { return m_lu.cols(); }
+
+  template <typename MatrixType>
+  IncompleteLU& compute(const MatrixType& mat) {
+    m_lu = mat;
+    int size = mat.cols();
+    Vector diag(size);
+    for (int i = 0; i < size; ++i) {
+      typename FactorType::InnerIterator k_it(m_lu, i);
+      for (; k_it && k_it.index() < i; ++k_it) {
+        int k = k_it.index();
+        k_it.valueRef() /= diag(k);
+
+        typename FactorType::InnerIterator j_it(k_it);
+        typename FactorType::InnerIterator kj_it(m_lu, k);
+        while (kj_it && kj_it.index() <= k) ++kj_it;
+        for (++j_it; j_it;) {
+          if (kj_it.index() == j_it.index()) {
+            j_it.valueRef() -= k_it.value() * kj_it.value();
+            ++j_it;
+            ++kj_it;
+          } else if (kj_it.index() < j_it.index())
+            ++kj_it;
+          else
+            ++j_it;
         }
-        if(k_it && k_it.index()==i) diag(i) = k_it.value();
-        else                        diag(i) = 1;
       }
-      m_isInitialized = true;
-      return *this;
-    }
-
-    template<typename Rhs, typename Dest>
-    void _solve(const Rhs& b, Dest& x) const
-    {
-      x = m_lu.template triangularView<UnitLower>().solve(b);
-      x = m_lu.template triangularView<Upper>().solve(x);
+      if (k_it && k_it.index() == i)
+        diag(i) = k_it.value();
+      else
+        diag(i) = 1;
     }
+    m_isInitialized = true;
+    return *this;
+  }
 
-    template<typename Rhs> inline const internal::solve_retval<IncompleteLU, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(m_isInitialized && "IncompleteLU is not initialized.");
-      eigen_assert(cols()==b.rows()
-                && "IncompleteLU::solve(): invalid number of rows of the right hand side matrix b");
-      return internal::solve_retval<IncompleteLU, Rhs>(*this, b.derived());
-    }
-
-  protected:
-    FactorType m_lu;
-    bool m_isInitialized;
-};
-
-namespace internal {
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<IncompleteLU<_MatrixType>, Rhs>
-  : solve_retval_base<IncompleteLU<_MatrixType>, Rhs>
-{
-  typedef IncompleteLU<_MatrixType> Dec;
-  EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    dec()._solve(rhs(),dst);
+  template <typename Rhs, typename Dest>
+  void _solve_impl(const Rhs& b, Dest& x) const {
+    x = m_lu.template triangularView<UnitLower>().solve(b);
+    x = m_lu.template triangularView<Upper>().solve(x);
   }
-};
 
-} // end namespace internal
+ protected:
+  FactorType m_lu;
+};
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_INCOMPLETE_LU_H
+#endif  // EIGEN_INCOMPLETE_LU_H
diff --git a/inst/include/unsupported/Eigen/src/IterativeSolvers/InternalHeaderCheck.h b/inst/include/unsupported/Eigen/src/IterativeSolvers/InternalHeaderCheck.h
new file mode 100644
index 00000000..3d6ee41c
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/IterativeSolvers/InternalHeaderCheck.h
@@ -0,0 +1,4 @@
+#ifndef EIGEN_ITERATIVE_SOLVERS_MODULE_H
+#error \
+    "Please include unsupported/Eigen/IterativeSolvers instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/unsupported/Eigen/src/IterativeSolvers/IterationController.h b/inst/include/unsupported/Eigen/src/IterativeSolvers/IterationController.h
deleted file mode 100644
index c9c1a4be..00000000
--- a/inst/include/unsupported/Eigen/src/IterativeSolvers/IterationController.h
+++ /dev/null
@@ -1,154 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-
-/* NOTE The class IterationController has been adapted from the iteration
- *      class of the GMM++ and ITL libraries.
- */
-
-//=======================================================================
-// Copyright (C) 1997-2001
-// Authors: Andrew Lumsdaine <lums@osl.iu.edu> 
-//          Lie-Quan Lee     <llee@osl.iu.edu>
-//
-// This file is part of the Iterative Template Library
-//
-// You should have received a copy of the License Agreement for the
-// Iterative Template Library along with the software;  see the
-// file LICENSE.  
-//
-// Permission to modify the code and to distribute modified code is
-// granted, provided the text of this NOTICE is retained, a notice that
-// the code was modified is included with the above COPYRIGHT NOTICE and
-// with the COPYRIGHT NOTICE in the LICENSE file, and that the LICENSE
-// file is distributed with the modified code.
-//
-// LICENSOR MAKES NO REPRESENTATIONS OR WARRANTIES, EXPRESS OR IMPLIED.
-// By way of example, but not limitation, Licensor MAKES NO
-// REPRESENTATIONS OR WARRANTIES OF MERCHANTABILITY OR FITNESS FOR ANY
-// PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE COMPONENTS
-// OR DOCUMENTATION WILL NOT INFRINGE ANY PATENTS, COPYRIGHTS, TRADEMARKS
-// OR OTHER RIGHTS.
-//=======================================================================
-
-//========================================================================
-//
-// Copyright (C) 2002-2007 Yves Renard
-//
-// This file is a part of GETFEM++
-//
-// Getfem++ is free software; you can redistribute it and/or modify
-// it under the terms of the GNU Lesser General Public License as
-// published by the Free Software Foundation; version 2.1 of the License.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU Lesser General Public License for more details.
-// You should have received a copy of the GNU Lesser General Public
-// License along with this program; if not, write to the Free Software
-// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301,
-// USA.
-//
-//========================================================================
-
-#include "../../../../Eigen/src/Core/util/NonMPL2.h"
-
-#ifndef EIGEN_ITERATION_CONTROLLER_H
-#define EIGEN_ITERATION_CONTROLLER_H
-
-namespace Eigen { 
-
-/** \ingroup IterativeSolvers_Module
-  * \class IterationController
-  *
-  * \brief Controls the iterations of the iterative solvers
-  *
-  * This class has been adapted from the iteration class of GMM++ and ITL libraries.
-  *
-  */
-class IterationController
-{
-  protected :
-    double m_rhsn;        ///< Right hand side norm
-    size_t m_maxiter;     ///< Max. number of iterations
-    int m_noise;          ///< if noise > 0 iterations are printed
-    double m_resmax;      ///< maximum residual
-    double m_resminreach, m_resadd;
-    size_t m_nit;         ///< iteration number
-    double m_res;         ///< last computed residual
-    bool m_written;
-    void (*m_callback)(const IterationController&);
-  public :
-
-    void init()
-    {
-      m_nit = 0; m_res = 0.0; m_written = false;
-      m_resminreach = 1E50; m_resadd = 0.0;
-      m_callback = 0;
-    }
-
-    IterationController(double r = 1.0E-8, int noi = 0, size_t mit = size_t(-1))
-      : m_rhsn(1.0), m_maxiter(mit), m_noise(noi), m_resmax(r) { init(); }
-
-    void operator ++(int) { m_nit++; m_written = false; m_resadd += m_res; }
-    void operator ++() { (*this)++; }
-
-    bool first() { return m_nit == 0; }
-
-    /* get/set the "noisyness" (verbosity) of the solvers */
-    int noiseLevel() const { return m_noise; }
-    void setNoiseLevel(int n) { m_noise = n; }
-    void reduceNoiseLevel() { if (m_noise > 0) m_noise--; }
-
-    double maxResidual() const { return m_resmax; }
-    void setMaxResidual(double r) { m_resmax = r; }
-
-    double residual() const { return m_res; }
-
-    /* change the user-definable callback, called after each iteration */
-    void setCallback(void (*t)(const IterationController&))
-    {
-      m_callback = t;
-    }
-
-    size_t iteration() const { return m_nit; }
-    void setIteration(size_t i) { m_nit = i; }
-
-    size_t maxIterarions() const { return m_maxiter; }
-    void setMaxIterations(size_t i) { m_maxiter = i; }
-
-    double rhsNorm() const { return m_rhsn; }
-    void setRhsNorm(double r) { m_rhsn = r; }
-
-    bool converged() const { return m_res <= m_rhsn * m_resmax; }
-    bool converged(double nr)
-    {
-      using std::abs;
-      m_res = abs(nr); 
-      m_resminreach = (std::min)(m_resminreach, m_res);
-      return converged();
-    }
-    template<typename VectorType> bool converged(const VectorType &v)
-    { return converged(v.squaredNorm()); }
-
-    bool finished(double nr)
-    {
-      if (m_callback) m_callback(*this);
-      if (m_noise > 0 && !m_written)
-      {
-        converged(nr);
-        m_written = true;
-      }
-      return (m_nit >= m_maxiter || converged(nr));
-    }
-    template <typename VectorType>
-    bool finished(const MatrixBase<VectorType> &v)
-    { return finished(double(v.squaredNorm())); }
-
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_ITERATION_CONTROLLER_H
diff --git a/inst/include/unsupported/Eigen/src/IterativeSolvers/MINRES.h b/inst/include/unsupported/Eigen/src/IterativeSolvers/MINRES.h
index 670f274b..9daf5686 100644
--- a/inst/include/unsupported/Eigen/src/IterativeSolvers/MINRES.h
+++ b/inst/include/unsupported/Eigen/src/IterativeSolvers/MINRES.h
@@ -2,310 +2,253 @@
 // for linear algebra.
 //
 // Copyright (C) 2012 Giacomo Po <gpo@ucla.edu>
-// Copyright (C) 2011 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
+// Copyright (C) 2018 David Hyde <dabh@stanford.edu>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#ifndef EIGEN_MINRES_H
+#define EIGEN_MINRES_H
 
-#ifndef EIGEN_MINRES_H_
-#define EIGEN_MINRES_H_
-
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
 namespace Eigen {
-    
-    namespace internal {
-        
-        /** \internal Low-level MINRES algorithm
-         * \param mat The matrix A
-         * \param rhs The right hand side vector b
-         * \param x On input and initial solution, on output the computed solution.
-         * \param precond A right preconditioner being able to efficiently solve for an
-         *                approximation of Ax=b (regardless of b)
-         * \param iters On input the max number of iteration, on output the number of performed iterations.
-         * \param tol_error On input the tolerance error, on output an estimation of the relative error.
-         */
-        template<typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
-        EIGEN_DONT_INLINE
-        void minres(const MatrixType& mat, const Rhs& rhs, Dest& x,
-                    const Preconditioner& precond, int& iters,
-                    typename Dest::RealScalar& tol_error)
-        {
-            using std::sqrt;
-            typedef typename Dest::RealScalar RealScalar;
-            typedef typename Dest::Scalar Scalar;
-            typedef Matrix<Scalar,Dynamic,1> VectorType;
 
-            // Check for zero rhs
-            const RealScalar rhsNorm2(rhs.squaredNorm());
-            if(rhsNorm2 == 0)
-            {
-                x.setZero();
-                iters = 0;
-                tol_error = 0;
-                return;
-            }
-            
-            // initialize
-            const int maxIters(iters);  // initialize maxIters to iters
-            const int N(mat.cols());    // the size of the matrix
-            const RealScalar threshold2(tol_error*tol_error*rhsNorm2); // convergence threshold (compared to residualNorm2)
-            
-            // Initialize preconditioned Lanczos
-            VectorType v_old(N); // will be initialized inside loop
-            VectorType v( VectorType::Zero(N) ); //initialize v
-            VectorType v_new(rhs-mat*x); //initialize v_new
-            RealScalar residualNorm2(v_new.squaredNorm());
-            VectorType w(N); // will be initialized inside loop
-            VectorType w_new(precond.solve(v_new)); // initialize w_new
-//            RealScalar beta; // will be initialized inside loop
-            RealScalar beta_new2(v_new.dot(w_new));
-            eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
-            RealScalar beta_new(sqrt(beta_new2));
-            const RealScalar beta_one(beta_new);
-            v_new /= beta_new;
-            w_new /= beta_new;
-            // Initialize other variables
-            RealScalar c(1.0); // the cosine of the Givens rotation
-            RealScalar c_old(1.0);
-            RealScalar s(0.0); // the sine of the Givens rotation
-            RealScalar s_old(0.0); // the sine of the Givens rotation
-            VectorType p_oold(N); // will be initialized in loop
-            VectorType p_old(VectorType::Zero(N)); // initialize p_old=0
-            VectorType p(p_old); // initialize p=0
-            RealScalar eta(1.0);
-                        
-            iters = 0; // reset iters
-            while ( iters < maxIters )
-            {
-                // Preconditioned Lanczos
-                /* Note that there are 4 variants on the Lanczos algorithm. These are
-                 * described in Paige, C. C. (1972). Computational variants of
-                 * the Lanczos method for the eigenproblem. IMA Journal of Applied
-                 * Mathematics, 10(3), 373–381. The current implementation corresponds 
-                 * to the case A(2,7) in the paper. It also corresponds to 
-                 * algorithm 6.14 in Y. Saad, Iterative Methods ￼￼￼for Sparse Linear
-                 * Systems, 2003 p.173. For the preconditioned version see 
-                 * A. Greenbaum, Iterative Methods for Solving Linear Systems, SIAM (1987).
-                 */
-                const RealScalar beta(beta_new);
-                v_old = v; // update: at first time step, this makes v_old = 0 so value of beta doesn't matter
-//                const VectorType v_old(v); // NOT SURE IF CREATING v_old EVERY ITERATION IS EFFICIENT
-                v = v_new; // update
-                w = w_new; // update
-//                const VectorType w(w_new); // NOT SURE IF CREATING w EVERY ITERATION IS EFFICIENT
-                v_new.noalias() = mat*w - beta*v_old; // compute v_new
-                const RealScalar alpha = v_new.dot(w);
-                v_new -= alpha*v; // overwrite v_new
-                w_new = precond.solve(v_new); // overwrite w_new
-                beta_new2 = v_new.dot(w_new); // compute beta_new
-                eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
-                beta_new = sqrt(beta_new2); // compute beta_new
-                v_new /= beta_new; // overwrite v_new for next iteration
-                w_new /= beta_new; // overwrite w_new for next iteration
-                
-                // Givens rotation
-                const RealScalar r2 =s*alpha+c*c_old*beta; // s, s_old, c and c_old are still from previous iteration
-                const RealScalar r3 =s_old*beta; // s, s_old, c and c_old are still from previous iteration
-                const RealScalar r1_hat=c*alpha-c_old*s*beta;
-                const RealScalar r1 =sqrt( std::pow(r1_hat,2) + std::pow(beta_new,2) );
-                c_old = c; // store for next iteration
-                s_old = s; // store for next iteration
-                c=r1_hat/r1; // new cosine
-                s=beta_new/r1; // new sine
-                
-                // Update solution
-                p_oold = p_old;
-//                const VectorType p_oold(p_old); // NOT SURE IF CREATING p_oold EVERY ITERATION IS EFFICIENT
-                p_old = p;
-                p.noalias()=(w-r2*p_old-r3*p_oold) /r1; // IS NOALIAS REQUIRED?
-                x += beta_one*c*eta*p;
-                
-                /* Update the squared residual. Note that this is the estimated residual.
-                The real residual |Ax-b|^2 may be slightly larger */
-                residualNorm2 *= s*s;
-                
-                if ( residualNorm2 < threshold2)
-                {
-                    break;
-                }
-                
-                eta=-s*eta; // update eta
-                iters++; // increment iteration number (for output purposes)
-            }
-            
-            /* Compute error. Note that this is the estimated error. The real 
-             error |Ax-b|/|b| may be slightly larger */
-            tol_error = std::sqrt(residualNorm2 / rhsNorm2);
-        }
-        
-    }
-    
-    template< typename _MatrixType, int _UpLo=Lower,
-    typename _Preconditioner = IdentityPreconditioner>
-//    typename _Preconditioner = IdentityPreconditioner<typename _MatrixType::Scalar> > // preconditioner must be positive definite
-    class MINRES;
-    
-    namespace internal {
-        
-        template< typename _MatrixType, int _UpLo, typename _Preconditioner>
-        struct traits<MINRES<_MatrixType,_UpLo,_Preconditioner> >
-        {
-            typedef _MatrixType MatrixType;
-            typedef _Preconditioner Preconditioner;
-        };
-        
-    }
-    
-    /** \ingroup IterativeLinearSolvers_Module
-     * \brief A minimal residual solver for sparse symmetric problems
-     *
-     * This class allows to solve for A.x = b sparse linear problems using the MINRES algorithm
-     * of Paige and Saunders (1975). The sparse matrix A must be symmetric (possibly indefinite).
-     * The vectors x and b can be either dense or sparse.
-     *
-     * \tparam _MatrixType the type of the sparse matrix A, can be a dense or a sparse matrix.
-     * \tparam _UpLo the triangular part that will be used for the computations. It can be Lower
-     *               or Upper. Default is Lower.
-     * \tparam _Preconditioner the type of the preconditioner. Default is DiagonalPreconditioner
-     *
-     * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
-     * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
-     * and NumTraits<Scalar>::epsilon() for the tolerance.
-     *
-     * This class can be used as the direct solver classes. Here is a typical usage example:
-     * \code
-     * int n = 10000;
-     * VectorXd x(n), b(n);
-     * SparseMatrix<double> A(n,n);
-     * // fill A and b
-     * MINRES<SparseMatrix<double> > mr;
-     * mr.compute(A);
-     * x = mr.solve(b);
-     * std::cout << "#iterations:     " << mr.iterations() << std::endl;
-     * std::cout << "estimated error: " << mr.error()      << std::endl;
-     * // update b, and solve again
-     * x = mr.solve(b);
-     * \endcode
-     *
-     * By default the iterations start with x=0 as an initial guess of the solution.
-     * One can control the start using the solveWithGuess() method.
-     *
-     * \sa class ConjugateGradient, BiCGSTAB, SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+namespace internal {
+
+/** \internal Low-level MINRES algorithm
+ * \param mat The matrix A
+ * \param rhs The right hand side vector b
+ * \param x On input and initial solution, on output the computed solution.
+ * \param precond A right preconditioner being able to efficiently solve for an
+ *                approximation of Ax=b (regardless of b)
+ * \param iters On input the max number of iteration, on output the number of performed iterations.
+ * \param tol_error On input the tolerance error, on output an estimation of the relative error.
+ */
+template <typename MatrixType, typename Rhs, typename Dest, typename Preconditioner>
+EIGEN_DONT_INLINE void minres(const MatrixType& mat, const Rhs& rhs, Dest& x, const Preconditioner& precond,
+                              Index& iters, typename Dest::RealScalar& tol_error) {
+  using std::sqrt;
+  typedef typename Dest::RealScalar RealScalar;
+  typedef typename Dest::Scalar Scalar;
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+
+  // Check for zero rhs
+  const RealScalar rhsNorm2(rhs.squaredNorm());
+  if (rhsNorm2 == 0) {
+    x.setZero();
+    iters = 0;
+    tol_error = 0;
+    return;
+  }
+
+  // initialize
+  const Index maxIters(iters);                                    // initialize maxIters to iters
+  const Index N(mat.cols());                                      // the size of the matrix
+  const RealScalar threshold2(tol_error * tol_error * rhsNorm2);  // convergence threshold (compared to residualNorm2)
+
+  // Initialize preconditioned Lanczos
+  VectorType v_old(N);                // will be initialized inside loop
+  VectorType v(VectorType::Zero(N));  // initialize v
+  VectorType v_new(rhs - mat * x);    // initialize v_new
+  RealScalar residualNorm2(v_new.squaredNorm());
+  VectorType w(N);                         // will be initialized inside loop
+  VectorType w_new(precond.solve(v_new));  // initialize w_new
+                                           //            RealScalar beta; // will be initialized inside loop
+  RealScalar beta_new2(v_new.dot(w_new));
+  eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
+  RealScalar beta_new(sqrt(beta_new2));
+  const RealScalar beta_one(beta_new);
+  // Initialize other variables
+  RealScalar c(1.0);  // the cosine of the Givens rotation
+  RealScalar c_old(1.0);
+  RealScalar s(0.0);                      // the sine of the Givens rotation
+  RealScalar s_old(0.0);                  // the sine of the Givens rotation
+  VectorType p_oold(N);                   // will be initialized in loop
+  VectorType p_old(VectorType::Zero(N));  // initialize p_old=0
+  VectorType p(p_old);                    // initialize p=0
+  RealScalar eta(1.0);
+
+  iters = 0;  // reset iters
+  while (iters < maxIters) {
+    // Preconditioned Lanczos
+    /* Note that there are 4 variants on the Lanczos algorithm. These are
+     * described in Paige, C. C. (1972). Computational variants of
+     * the Lanczos method for the eigenproblem. IMA Journal of Applied
+     * Mathematics, 10(3), 373-381. The current implementation corresponds
+     * to the case A(2,7) in the paper. It also corresponds to
+     * algorithm 6.14 in Y. Saad, Iterative Methods for Sparse Linear
+     * Systems, 2003 p.173. For the preconditioned version see
+     * A. Greenbaum, Iterative Methods for Solving Linear Systems, SIAM (1987).
      */
-    template< typename _MatrixType, int _UpLo, typename _Preconditioner>
-    class MINRES : public IterativeSolverBase<MINRES<_MatrixType,_UpLo,_Preconditioner> >
-    {
-        
-        typedef IterativeSolverBase<MINRES> Base;
-        using Base::mp_matrix;
-        using Base::m_error;
-        using Base::m_iterations;
-        using Base::m_info;
-        using Base::m_isInitialized;
-    public:
-        typedef _MatrixType MatrixType;
-        typedef typename MatrixType::Scalar Scalar;
-        typedef typename MatrixType::Index Index;
-        typedef typename MatrixType::RealScalar RealScalar;
-        typedef _Preconditioner Preconditioner;
-        
-        enum {UpLo = _UpLo};
-        
-    public:
-        
-        /** Default constructor. */
-        MINRES() : Base() {}
-        
-        /** Initialize the solver with matrix \a A for further \c Ax=b solving.
-         *
-         * This constructor is a shortcut for the default constructor followed
-         * by a call to compute().
-         *
-         * \warning this class stores a reference to the matrix A as well as some
-         * precomputed values that depend on it. Therefore, if \a A is changed
-         * this class becomes invalid. Call compute() to update it with the new
-         * matrix A, or modify a copy of A.
-         */
-        template<typename MatrixDerived>
-        explicit MINRES(const EigenBase<MatrixDerived>& A) : Base(A.derived()) {}
-        
-        /** Destructor. */
-        ~MINRES(){}
-		
-        /** \returns the solution x of \f$ A x = b \f$ using the current decomposition of A
-         * \a x0 as an initial solution.
-         *
-         * \sa compute()
-         */
-        template<typename Rhs,typename Guess>
-        inline const internal::solve_retval_with_guess<MINRES, Rhs, Guess>
-        solveWithGuess(const MatrixBase<Rhs>& b, const Guess& x0) const
-        {
-            eigen_assert(m_isInitialized && "MINRES is not initialized.");
-            eigen_assert(Base::rows()==b.rows()
-                         && "MINRES::solve(): invalid number of rows of the right hand side matrix b");
-            return internal::solve_retval_with_guess
-            <MINRES, Rhs, Guess>(*this, b.derived(), x0);
-        }
-        
-        /** \internal */
-        template<typename Rhs,typename Dest>
-        void _solveWithGuess(const Rhs& b, Dest& x) const
-        {
-            typedef typename internal::conditional<UpLo==(Lower|Upper),
-                                                   const MatrixType&,
-                                                   SparseSelfAdjointView<const MatrixType, UpLo>
-                                                  >::type MatrixWrapperType;
-                                          
-            m_iterations = Base::maxIterations();
-            m_error = Base::m_tolerance;
-            
-            for(int j=0; j<b.cols(); ++j)
-            {
-                m_iterations = Base::maxIterations();
-                m_error = Base::m_tolerance;
-                
-                typename Dest::ColXpr xj(x,j);
-                internal::minres(MatrixWrapperType(*mp_matrix), b.col(j), xj,
-                                 Base::m_preconditioner, m_iterations, m_error);
-            }
-            
-            m_isInitialized = true;
-            m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;
-        }
-        
-        /** \internal */
-        template<typename Rhs,typename Dest>
-        void _solve(const Rhs& b, Dest& x) const
-        {
-            x.setZero();
-            _solveWithGuess(b,x);
-        }
-        
-    protected:
-        
+    const RealScalar beta(beta_new);
+    v_old = v;          // update: at first time step, this makes v_old = 0 so value of beta doesn't matter
+    v_new /= beta_new;  // overwrite v_new for next iteration
+    w_new /= beta_new;  // overwrite w_new for next iteration
+    v = v_new;          // update
+    w = w_new;          // update
+    v_new.noalias() = mat * w - beta * v_old;  // compute v_new
+    const RealScalar alpha = v_new.dot(w);
+    v_new -= alpha * v;            // overwrite v_new
+    w_new = precond.solve(v_new);  // overwrite w_new
+    beta_new2 = v_new.dot(w_new);  // compute beta_new
+    eigen_assert(beta_new2 >= 0.0 && "PRECONDITIONER IS NOT POSITIVE DEFINITE");
+    beta_new = sqrt(beta_new2);  // compute beta_new
+
+    // Givens rotation
+    const RealScalar r2 = s * alpha + c * c_old * beta;  // s, s_old, c and c_old are still from previous iteration
+    const RealScalar r3 = s_old * beta;                  // s, s_old, c and c_old are still from previous iteration
+    const RealScalar r1_hat = c * alpha - c_old * s * beta;
+    const RealScalar r1 = sqrt(std::pow(r1_hat, 2) + std::pow(beta_new, 2));
+    c_old = c;          // store for next iteration
+    s_old = s;          // store for next iteration
+    c = r1_hat / r1;    // new cosine
+    s = beta_new / r1;  // new sine
+
+    // Update solution
+    p_oold = p_old;
+    p_old = p;
+    p.noalias() = (w - r2 * p_old - r3 * p_oold) / r1;  // IS NOALIAS REQUIRED?
+    x += beta_one * c * eta * p;
+
+    /* Update the squared residual. Note that this is the estimated residual.
+    The real residual |Ax-b|^2 may be slightly larger */
+    residualNorm2 *= s * s;
+
+    if (residualNorm2 < threshold2) {
+      break;
+    }
+
+    eta = -s * eta;  // update eta
+    iters++;         // increment iteration number (for output purposes)
+  }
+
+  /* Compute error. Note that this is the estimated error. The real
+   error |Ax-b|/|b| may be slightly larger */
+  tol_error = std::sqrt(residualNorm2 / rhsNorm2);
+}
+
+}  // namespace internal
+
+template <typename MatrixType_, int UpLo_ = Lower, typename Preconditioner_ = IdentityPreconditioner>
+class MINRES;
+
+namespace internal {
+
+template <typename MatrixType_, int UpLo_, typename Preconditioner_>
+struct traits<MINRES<MatrixType_, UpLo_, Preconditioner_> > {
+  typedef MatrixType_ MatrixType;
+  typedef Preconditioner_ Preconditioner;
+};
+
+}  // namespace internal
+
+/** \ingroup IterativeLinearSolvers_Module
+ * \brief A minimal residual solver for sparse symmetric problems
+ *
+ * This class allows to solve for A.x = b sparse linear problems using the MINRES algorithm
+ * of Paige and Saunders (1975). The sparse matrix A must be symmetric (possibly indefinite).
+ * The vectors x and b can be either dense or sparse.
+ *
+ * \tparam MatrixType_ the type of the sparse matrix A, can be a dense or a sparse matrix.
+ * \tparam UpLo_ the triangular part that will be used for the computations. It can be Lower,
+ *               Upper, or Lower|Upper in which the full matrix entries will be considered. Default is Lower.
+ * \tparam Preconditioner_ the type of the preconditioner. Default is DiagonalPreconditioner
+ *
+ * The maximal number of iterations and tolerance value can be controlled via the setMaxIterations()
+ * and setTolerance() methods. The defaults are the size of the problem for the maximal number of iterations
+ * and NumTraits<Scalar>::epsilon() for the tolerance.
+ *
+ * This class can be used as the direct solver classes. Here is a typical usage example:
+ * \code
+ * int n = 10000;
+ * VectorXd x(n), b(n);
+ * SparseMatrix<double> A(n,n);
+ * // fill A and b
+ * MINRES<SparseMatrix<double> > mr;
+ * mr.compute(A);
+ * x = mr.solve(b);
+ * std::cout << "#iterations:     " << mr.iterations() << std::endl;
+ * std::cout << "estimated error: " << mr.error()      << std::endl;
+ * // update b, and solve again
+ * x = mr.solve(b);
+ * \endcode
+ *
+ * By default the iterations start with x=0 as an initial guess of the solution.
+ * One can control the start using the solveWithGuess() method.
+ *
+ * MINRES can also be used in a matrix-free context, see the following \link MatrixfreeSolverExample example \endlink.
+ *
+ * \sa class ConjugateGradient, BiCGSTAB, SimplicialCholesky, DiagonalPreconditioner, IdentityPreconditioner
+ */
+template <typename MatrixType_, int UpLo_, typename Preconditioner_>
+class MINRES : public IterativeSolverBase<MINRES<MatrixType_, UpLo_, Preconditioner_> > {
+  typedef IterativeSolverBase<MINRES> Base;
+  using Base::m_error;
+  using Base::m_info;
+  using Base::m_isInitialized;
+  using Base::m_iterations;
+  using Base::matrix;
+
+ public:
+  using Base::_solve_impl;
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef Preconditioner_ Preconditioner;
+
+  enum { UpLo = UpLo_ };
+
+ public:
+  /** Default constructor. */
+  MINRES() : Base() {}
+
+  /** Initialize the solver with matrix \a A for further \c Ax=b solving.
+   *
+   * This constructor is a shortcut for the default constructor followed
+   * by a call to compute().
+   *
+   * \warning this class stores a reference to the matrix A as well as some
+   * precomputed values that depend on it. Therefore, if \a A is changed
+   * this class becomes invalid. Call compute() to update it with the new
+   * matrix A, or modify a copy of A.
+   */
+  template <typename MatrixDerived>
+  explicit MINRES(const EigenBase<MatrixDerived>& A) : Base(A.derived()) {}
+
+  /** Destructor. */
+  ~MINRES() {}
+
+  /** \internal */
+  template <typename Rhs, typename Dest>
+  void _solve_vector_with_guess_impl(const Rhs& b, Dest& x) const {
+    typedef typename Base::MatrixWrapper MatrixWrapper;
+    typedef typename Base::ActualMatrixType ActualMatrixType;
+    enum {
+      TransposeInput = (!MatrixWrapper::MatrixFree) && (UpLo == (Lower | Upper)) && (!MatrixType::IsRowMajor) &&
+                       (!NumTraits<Scalar>::IsComplex)
     };
-    
-    namespace internal {
-        
-        template<typename _MatrixType, int _UpLo, typename _Preconditioner, typename Rhs>
-        struct solve_retval<MINRES<_MatrixType,_UpLo,_Preconditioner>, Rhs>
-        : solve_retval_base<MINRES<_MatrixType,_UpLo,_Preconditioner>, Rhs>
-        {
-            typedef MINRES<_MatrixType,_UpLo,_Preconditioner> Dec;
-            EIGEN_MAKE_SOLVE_HELPERS(Dec,Rhs)
-            
-            template<typename Dest> void evalTo(Dest& dst) const
-            {
-                dec()._solve(rhs(),dst);
-            }
-        };
-        
-    } // end namespace internal
-    
-} // end namespace Eigen
+    typedef std::conditional_t<TransposeInput, Transpose<const ActualMatrixType>, ActualMatrixType const&>
+        RowMajorWrapper;
+    EIGEN_STATIC_ASSERT(internal::check_implication(MatrixWrapper::MatrixFree, UpLo == (Lower | Upper)),
+                        MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY);
+    typedef std::conditional_t<UpLo == (Lower | Upper), RowMajorWrapper,
+                               typename MatrixWrapper::template ConstSelfAdjointViewReturnType<UpLo>::Type>
+        SelfAdjointWrapper;
+
+    m_iterations = Base::maxIterations();
+    m_error = Base::m_tolerance;
+    RowMajorWrapper row_mat(matrix());
+    internal::minres(SelfAdjointWrapper(row_mat), b, x, Base::m_preconditioner, m_iterations, m_error);
+    m_info = m_error <= Base::m_tolerance ? Success : NoConvergence;
+  }
+
+ protected:
+};
 
-#endif // EIGEN_MINRES_H
+}  // end namespace Eigen
 
+#endif  // EIGEN_MINRES_H
diff --git a/inst/include/unsupported/Eigen/src/IterativeSolvers/Scaling.h b/inst/include/unsupported/Eigen/src/IterativeSolvers/Scaling.h
index 4fd43920..248c7b80 100644
--- a/inst/include/unsupported/Eigen/src/IterativeSolvers/Scaling.h
+++ b/inst/include/unsupported/Eigen/src/IterativeSolvers/Scaling.h
@@ -9,177 +9,164 @@
 
 #ifndef EIGEN_ITERSCALING_H
 #define EIGEN_ITERSCALING_H
-/**
-  * \ingroup IterativeSolvers_Module
-  * \brief iterative scaling algorithm to equilibrate rows and column norms in matrices
-  * 
-  * This class can be used as a preprocessing tool to accelerate the convergence of iterative methods 
-  * 
-  * This feature is  useful to limit the pivoting amount during LU/ILU factorization
-  * The  scaling strategy as presented here preserves the symmetry of the problem
-  * NOTE It is assumed that the matrix does not have empty row or column, 
-  * 
-  * Example with key steps 
-  * \code
-  * VectorXd x(n), b(n);
-  * SparseMatrix<double> A;
-  * // fill A and b;
-  * IterScaling<SparseMatrix<double> > scal; 
-  * // Compute the left and right scaling vectors. The matrix is equilibrated at output
-  * scal.computeRef(A); 
-  * // Scale the right hand side
-  * b = scal.LeftScaling().cwiseProduct(b); 
-  * // Now, solve the equilibrated linear system with any available solver
-  * 
-  * // Scale back the computed solution
-  * x = scal.RightScaling().cwiseProduct(x); 
-  * \endcode
-  * 
-  * \tparam _MatrixType the type of the matrix. It should be a real square sparsematrix
-  * 
-  * References : D. Ruiz and B. Ucar, A Symmetry Preserving Algorithm for Matrix Scaling, INRIA Research report RR-7552
-  * 
-  * \sa \ref IncompleteLUT 
-  */
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
-using std::abs; 
-template<typename _MatrixType>
-class IterScaling
-{
-  public:
-    typedef _MatrixType MatrixType; 
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
-    
-  public:
-    IterScaling() { init(); }
-    
-    IterScaling(const MatrixType& matrix)
-    {
-      init();
-      compute(matrix);
-    }
-    
-    ~IterScaling() { }
-    
-    /** 
-     * Compute the left and right diagonal matrices to scale the input matrix @p mat
-     * 
-     * FIXME This algorithm will be modified such that the diagonal elements are permuted on the diagonal. 
-     * 
-     * \sa LeftScaling() RightScaling()
-     */
-    void compute (const MatrixType& mat)
-    {
-      int m = mat.rows(); 
-      int n = mat.cols();
-      eigen_assert((m>0 && m == n) && "Please give a non - empty matrix");
-      m_left.resize(m); 
-      m_right.resize(n);
-      m_left.setOnes();
-      m_right.setOnes();
-      m_matrix = mat;
-      VectorXd Dr, Dc, DrRes, DcRes; // Temporary Left and right scaling vectors
-      Dr.resize(m); Dc.resize(n);
-      DrRes.resize(m); DcRes.resize(n);
-      double EpsRow = 1.0, EpsCol = 1.0;
-      int its = 0; 
-      do
-      { // Iterate until the infinite norm of each row and column is approximately 1
-        // Get the maximum value in each row and column
-        Dr.setZero(); Dc.setZero();
-        for (int k=0; k<m_matrix.outerSize(); ++k)
-        {
-          for (typename MatrixType::InnerIterator it(m_matrix, k); it; ++it)
-          {
-            if ( Dr(it.row()) < abs(it.value()) )
-              Dr(it.row()) = abs(it.value());
-            
-            if ( Dc(it.col()) < abs(it.value()) )
-              Dc(it.col()) = abs(it.value());
-          }
-        }
-        for (int i = 0; i < m; ++i) 
-        {
-          Dr(i) = std::sqrt(Dr(i));
-          Dc(i) = std::sqrt(Dc(i));
+
+/**
+ * \ingroup IterativeLinearSolvers_Module
+ * \brief iterative scaling algorithm to equilibrate rows and column norms in matrices
+ *
+ * This class can be used as a preprocessing tool to accelerate the convergence of iterative methods
+ *
+ * This feature is  useful to limit the pivoting amount during LU/ILU factorization
+ * The  scaling strategy as presented here preserves the symmetry of the problem
+ * NOTE It is assumed that the matrix does not have empty row or column,
+ *
+ * Example with key steps
+ * \code
+ * VectorXd x(n), b(n);
+ * SparseMatrix<double> A;
+ * // fill A and b;
+ * IterScaling<SparseMatrix<double> > scal;
+ * // Compute the left and right scaling vectors. The matrix is equilibrated at output
+ * scal.computeRef(A);
+ * // Scale the right hand side
+ * b = scal.LeftScaling().cwiseProduct(b);
+ * // Now, solve the equilibrated linear system with any available solver
+ *
+ * // Scale back the computed solution
+ * x = scal.RightScaling().cwiseProduct(x);
+ * \endcode
+ *
+ * \tparam MatrixType_ the type of the matrix. It should be a real square sparsematrix
+ *
+ * References : D. Ruiz and B. Ucar, A Symmetry Preserving Algorithm for Matrix Scaling, INRIA Research report RR-7552
+ *
+ * \sa \ref IncompleteLUT
+ */
+template <typename MatrixType_>
+class IterScaling {
+ public:
+  typedef MatrixType_ MatrixType;
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::Index Index;
+
+ public:
+  IterScaling() { init(); }
+
+  IterScaling(const MatrixType& matrix) {
+    init();
+    compute(matrix);
+  }
+
+  ~IterScaling() {}
+
+  /**
+   * Compute the left and right diagonal matrices to scale the input matrix @p mat
+   *
+   * FIXME This algorithm will be modified such that the diagonal elements are permuted on the diagonal.
+   *
+   * \sa LeftScaling() RightScaling()
+   */
+  void compute(const MatrixType& mat) {
+    using std::abs;
+    int m = mat.rows();
+    int n = mat.cols();
+    eigen_assert((m > 0 && m == n) && "Please give a non - empty matrix");
+    m_left.resize(m);
+    m_right.resize(n);
+    m_left.setOnes();
+    m_right.setOnes();
+    m_matrix = mat;
+    VectorXd Dr, Dc, DrRes, DcRes;  // Temporary Left and right scaling vectors
+    Dr.resize(m);
+    Dc.resize(n);
+    DrRes.resize(m);
+    DcRes.resize(n);
+    double EpsRow = 1.0, EpsCol = 1.0;
+    int its = 0;
+    do {  // Iterate until the infinite norm of each row and column is approximately 1
+      // Get the maximum value in each row and column
+      Dr.setZero();
+      Dc.setZero();
+      for (int k = 0; k < m_matrix.outerSize(); ++k) {
+        for (typename MatrixType::InnerIterator it(m_matrix, k); it; ++it) {
+          if (Dr(it.row()) < abs(it.value())) Dr(it.row()) = abs(it.value());
+
+          if (Dc(it.col()) < abs(it.value())) Dc(it.col()) = abs(it.value());
         }
-        // Save the scaling factors 
-        for (int i = 0; i < m; ++i) 
-        {
-          m_left(i) /= Dr(i);
-          m_right(i) /= Dc(i);
+      }
+      for (int i = 0; i < m; ++i) {
+        Dr(i) = std::sqrt(Dr(i));
+      }
+      for (int i = 0; i < n; ++i) {
+        Dc(i) = std::sqrt(Dc(i));
+      }
+      // Save the scaling factors
+      for (int i = 0; i < m; ++i) {
+        m_left(i) /= Dr(i);
+      }
+      for (int i = 0; i < n; ++i) {
+        m_right(i) /= Dc(i);
+      }
+      // Scale the rows and the columns of the matrix
+      DrRes.setZero();
+      DcRes.setZero();
+      for (int k = 0; k < m_matrix.outerSize(); ++k) {
+        for (typename MatrixType::InnerIterator it(m_matrix, k); it; ++it) {
+          it.valueRef() = it.value() / (Dr(it.row()) * Dc(it.col()));
+          // Accumulate the norms of the row and column vectors
+          if (DrRes(it.row()) < abs(it.value())) DrRes(it.row()) = abs(it.value());
+
+          if (DcRes(it.col()) < abs(it.value())) DcRes(it.col()) = abs(it.value());
         }
-        // Scale the rows and the columns of the matrix
-        DrRes.setZero(); DcRes.setZero(); 
-        for (int k=0; k<m_matrix.outerSize(); ++k)
-        {
-          for (typename MatrixType::InnerIterator it(m_matrix, k); it; ++it)
-          {
-            it.valueRef() = it.value()/( Dr(it.row()) * Dc(it.col()) );
-            // Accumulate the norms of the row and column vectors   
-            if ( DrRes(it.row()) < abs(it.value()) )
-              DrRes(it.row()) = abs(it.value());
-            
-            if ( DcRes(it.col()) < abs(it.value()) )
-              DcRes(it.col()) = abs(it.value());
-          }
-        }  
-        DrRes.array() = (1-DrRes.array()).abs();
-        EpsRow = DrRes.maxCoeff();
-        DcRes.array() = (1-DcRes.array()).abs();
-        EpsCol = DcRes.maxCoeff();
-        its++;
-      }while ( (EpsRow >m_tol || EpsCol > m_tol) && (its < m_maxits) );
-      m_isInitialized = true;
-    }
-    /** Compute the left and right vectors to scale the vectors
-     * the input matrix is scaled with the computed vectors at output
-     * 
-     * \sa compute()
-     */
-    void computeRef (MatrixType& mat)
-    {
-      compute (mat);
-      mat = m_matrix;
-    }
-    /** Get the vector to scale the rows of the matrix 
-     */
-    VectorXd& LeftScaling()
-    {
-      return m_left;
-    }
-    
-    /** Get the vector to scale the columns of the matrix 
-     */
-    VectorXd& RightScaling()
-    {
-      return m_right;
-    }
-    
-    /** Set the tolerance for the convergence of the iterative scaling algorithm
-     */
-    void setTolerance(double tol)
-    {
-      m_tol = tol; 
-    }
-      
-  protected:
-    
-    void init()
-    {
-      m_tol = 1e-10;
-      m_maxits = 5;
-      m_isInitialized = false;
-    }
-    
-    MatrixType m_matrix;
-    mutable ComputationInfo m_info; 
-    bool m_isInitialized; 
-    VectorXd m_left; // Left scaling vector
-    VectorXd m_right; // m_right scaling vector
-    double m_tol; 
-    int m_maxits; // Maximum number of iterations allowed
+      }
+      DrRes.array() = (1 - DrRes.array()).abs();
+      EpsRow = DrRes.maxCoeff();
+      DcRes.array() = (1 - DcRes.array()).abs();
+      EpsCol = DcRes.maxCoeff();
+      its++;
+    } while ((EpsRow > m_tol || EpsCol > m_tol) && (its < m_maxits));
+    m_isInitialized = true;
+  }
+  /** Compute the left and right vectors to scale the vectors
+   * the input matrix is scaled with the computed vectors at output
+   *
+   * \sa compute()
+   */
+  void computeRef(MatrixType& mat) {
+    compute(mat);
+    mat = m_matrix;
+  }
+  /** Get the vector to scale the rows of the matrix
+   */
+  VectorXd& LeftScaling() { return m_left; }
+
+  /** Get the vector to scale the columns of the matrix
+   */
+  VectorXd& RightScaling() { return m_right; }
+
+  /** Set the tolerance for the convergence of the iterative scaling algorithm
+   */
+  void setTolerance(double tol) { m_tol = tol; }
+
+ protected:
+  void init() {
+    m_tol = 1e-10;
+    m_maxits = 5;
+    m_isInitialized = false;
+  }
+
+  MatrixType m_matrix;
+  mutable ComputationInfo m_info;
+  bool m_isInitialized;
+  VectorXd m_left;   // Left scaling vector
+  VectorXd m_right;  // m_right scaling vector
+  double m_tol;
+  int m_maxits;  // Maximum number of iterations allowed
 };
-}
+}  // namespace Eigen
 #endif
diff --git a/inst/include/unsupported/Eigen/src/KroneckerProduct/InternalHeaderCheck.h b/inst/include/unsupported/Eigen/src/KroneckerProduct/InternalHeaderCheck.h
new file mode 100644
index 00000000..3cf30d9f
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/KroneckerProduct/InternalHeaderCheck.h
@@ -0,0 +1,4 @@
+#ifndef EIGEN_KRONECKER_PRODUCT_MODULE_H
+#error \
+    "Please include unsupported/Eigen/KroneckerProduct instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h b/inst/include/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
index 532896c3..1d29f2fc 100644
--- a/inst/include/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
+++ b/inst/include/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h
@@ -12,11 +12,60 @@
 #ifndef KRONECKER_TENSOR_PRODUCT_H
 #define KRONECKER_TENSOR_PRODUCT_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-template<typename Scalar, int Options, typename Index> class SparseMatrix;
+namespace Eigen {
 
 /*!
+ * \ingroup KroneckerProduct_Module
+ *
+ * \brief The base class of dense and sparse Kronecker product.
+ *
+ * \tparam Derived is the derived type.
+ */
+template <typename Derived>
+class KroneckerProductBase : public ReturnByValue<Derived> {
+ private:
+  typedef typename internal::traits<Derived> Traits;
+  typedef typename Traits::Scalar Scalar;
+
+ protected:
+  typedef typename Traits::Lhs Lhs;
+  typedef typename Traits::Rhs Rhs;
+
+ public:
+  /*! \brief Constructor. */
+  KroneckerProductBase(const Lhs& A, const Rhs& B) : m_A(A), m_B(B) {}
+
+  inline Index rows() const { return m_A.rows() * m_B.rows(); }
+  inline Index cols() const { return m_A.cols() * m_B.cols(); }
+
+  /*!
+   * This overrides ReturnByValue::coeff because this function is
+   * efficient enough.
+   */
+  Scalar coeff(Index row, Index col) const {
+    return m_A.coeff(row / m_B.rows(), col / m_B.cols()) * m_B.coeff(row % m_B.rows(), col % m_B.cols());
+  }
+
+  /*!
+   * This overrides ReturnByValue::coeff because this function is
+   * efficient enough.
+   */
+  Scalar coeff(Index i) const {
+    EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived);
+    return m_A.coeff(i / m_A.size()) * m_B.coeff(i % m_A.size());
+  }
+
+ protected:
+  typename Lhs::Nested m_A;
+  typename Rhs::Nested m_B;
+};
+
+/*!
+ * \ingroup KroneckerProduct_Module
+ *
  * \brief Kronecker tensor product helper class for dense matrices
  *
  * This class is the return value of kroneckerProduct(MatrixBase,
@@ -26,44 +75,25 @@ template<typename Scalar, int Options, typename Index> class SparseMatrix;
  * \tparam Lhs  Type of the left-hand side, a matrix expression.
  * \tparam Rhs  Type of the rignt-hand side, a matrix expression.
  */
-template<typename Lhs, typename Rhs>
-class KroneckerProduct : public ReturnByValue<KroneckerProduct<Lhs,Rhs> >
-{
-  private:
-    typedef ReturnByValue<KroneckerProduct> Base;
-    typedef typename Base::Scalar Scalar;
-    typedef typename Base::Index Index;
-
-  public:
-    /*! \brief Constructor. */
-    KroneckerProduct(const Lhs& A, const Rhs& B)
-      : m_A(A), m_B(B)
-    {}
-
-    /*! \brief Evaluate the Kronecker tensor product. */
-    template<typename Dest> void evalTo(Dest& dst) const;
-    
-    inline Index rows() const { return m_A.rows() * m_B.rows(); }
-    inline Index cols() const { return m_A.cols() * m_B.cols(); }
-
-    Scalar coeff(Index row, Index col) const
-    {
-      return m_A.coeff(row / m_B.rows(), col / m_B.cols()) *
-             m_B.coeff(row % m_B.rows(), col % m_B.cols());
-    }
+template <typename Lhs, typename Rhs>
+class KroneckerProduct : public KroneckerProductBase<KroneckerProduct<Lhs, Rhs> > {
+ private:
+  typedef KroneckerProductBase<KroneckerProduct> Base;
+  using Base::m_A;
+  using Base::m_B;
 
-    Scalar coeff(Index i) const
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(KroneckerProduct);
-      return m_A.coeff(i / m_A.size()) * m_B.coeff(i % m_A.size());
-    }
+ public:
+  /*! \brief Constructor. */
+  KroneckerProduct(const Lhs& A, const Rhs& B) : Base(A, B) {}
 
-  private:
-    typename Lhs::Nested m_A;
-    typename Rhs::Nested m_B;
+  /*! \brief Evaluate the Kronecker tensor product. */
+  template <typename Dest>
+  void evalTo(Dest& dst) const;
 };
 
 /*!
+ * \ingroup KroneckerProduct_Module
+ *
  * \brief Kronecker tensor product helper class for sparse matrices
  *
  * If at least one of the operands is a sparse matrix expression,
@@ -76,71 +106,72 @@ class KroneckerProduct : public ReturnByValue<KroneckerProduct<Lhs,Rhs> >
  * \tparam Lhs  Type of the left-hand side, a matrix expression.
  * \tparam Rhs  Type of the rignt-hand side, a matrix expression.
  */
-template<typename Lhs, typename Rhs>
-class KroneckerProductSparse : public EigenBase<KroneckerProductSparse<Lhs,Rhs> >
-{
-  private:
-    typedef typename internal::traits<KroneckerProductSparse>::Index Index;
-
-  public:
-    /*! \brief Constructor. */
-    KroneckerProductSparse(const Lhs& A, const Rhs& B)
-      : m_A(A), m_B(B)
-    {}
-
-    /*! \brief Evaluate the Kronecker tensor product. */
-    template<typename Dest> void evalTo(Dest& dst) const;
-    
-    inline Index rows() const { return m_A.rows() * m_B.rows(); }
-    inline Index cols() const { return m_A.cols() * m_B.cols(); }
-
-    template<typename Scalar, int Options, typename Index>
-    operator SparseMatrix<Scalar, Options, Index>()
-    {
-      SparseMatrix<Scalar, Options, Index> result;
-      evalTo(result.derived());
-      return result;
-    }
+template <typename Lhs, typename Rhs>
+class KroneckerProductSparse : public KroneckerProductBase<KroneckerProductSparse<Lhs, Rhs> > {
+ private:
+  typedef KroneckerProductBase<KroneckerProductSparse> Base;
+  using Base::m_A;
+  using Base::m_B;
+
+ public:
+  /*! \brief Constructor. */
+  KroneckerProductSparse(const Lhs& A, const Rhs& B) : Base(A, B) {}
 
-  private:
-    typename Lhs::Nested m_A;
-    typename Rhs::Nested m_B;
+  /*! \brief Evaluate the Kronecker tensor product. */
+  template <typename Dest>
+  void evalTo(Dest& dst) const;
 };
 
-template<typename Lhs, typename Rhs>
-template<typename Dest>
-void KroneckerProduct<Lhs,Rhs>::evalTo(Dest& dst) const
-{
-  const int BlockRows = Rhs::RowsAtCompileTime,
-            BlockCols = Rhs::ColsAtCompileTime;
-  const Index Br = m_B.rows(),
-              Bc = m_B.cols();
-  for (Index i=0; i < m_A.rows(); ++i)
-    for (Index j=0; j < m_A.cols(); ++j)
-      Block<Dest,BlockRows,BlockCols>(dst,i*Br,j*Bc,Br,Bc) = m_A.coeff(i,j) * m_B;
+template <typename Lhs, typename Rhs>
+template <typename Dest>
+void KroneckerProduct<Lhs, Rhs>::evalTo(Dest& dst) const {
+  const int BlockRows = Rhs::RowsAtCompileTime, BlockCols = Rhs::ColsAtCompileTime;
+  const Index Br = m_B.rows(), Bc = m_B.cols();
+  for (Index i = 0; i < m_A.rows(); ++i)
+    for (Index j = 0; j < m_A.cols(); ++j)
+      Block<Dest, BlockRows, BlockCols>(dst, i * Br, j * Bc, Br, Bc) = m_A.coeff(i, j) * m_B;
 }
 
-template<typename Lhs, typename Rhs>
-template<typename Dest>
-void KroneckerProductSparse<Lhs,Rhs>::evalTo(Dest& dst) const
-{
-  const Index Br = m_B.rows(),
-              Bc = m_B.cols();
-  dst.resize(rows(),cols());
+template <typename Lhs, typename Rhs>
+template <typename Dest>
+void KroneckerProductSparse<Lhs, Rhs>::evalTo(Dest& dst) const {
+  Index Br = m_B.rows(), Bc = m_B.cols();
+  dst.resize(this->rows(), this->cols());
   dst.resizeNonZeros(0);
-  dst.reserve(m_A.nonZeros() * m_B.nonZeros());
 
-  for (Index kA=0; kA < m_A.outerSize(); ++kA)
+  // 1 - evaluate the operands if needed:
+  typedef typename internal::nested_eval<Lhs, Dynamic>::type Lhs1;
+  typedef internal::remove_all_t<Lhs1> Lhs1Cleaned;
+  const Lhs1 lhs1(m_A);
+  typedef typename internal::nested_eval<Rhs, Dynamic>::type Rhs1;
+  typedef internal::remove_all_t<Rhs1> Rhs1Cleaned;
+  const Rhs1 rhs1(m_B);
+
+  // 2 - construct respective iterators
+  typedef Eigen::InnerIterator<Lhs1Cleaned> LhsInnerIterator;
+  typedef Eigen::InnerIterator<Rhs1Cleaned> RhsInnerIterator;
+
+  // compute number of non-zeros per innervectors of dst
   {
-    for (Index kB=0; kB < m_B.outerSize(); ++kB)
-    {
-      for (typename Lhs::InnerIterator itA(m_A,kA); itA; ++itA)
-      {
-        for (typename Rhs::InnerIterator itB(m_B,kB); itB; ++itB)
-        {
-          const Index i = itA.row() * Br + itB.row(),
-                      j = itA.col() * Bc + itB.col();
-          dst.insert(i,j) = itA.value() * itB.value();
+    // TODO VectorXi is not necessarily big enough!
+    VectorXi nnzA = VectorXi::Zero(Dest::IsRowMajor ? m_A.rows() : m_A.cols());
+    for (Index kA = 0; kA < m_A.outerSize(); ++kA)
+      for (LhsInnerIterator itA(lhs1, kA); itA; ++itA) nnzA(Dest::IsRowMajor ? itA.row() : itA.col())++;
+
+    VectorXi nnzB = VectorXi::Zero(Dest::IsRowMajor ? m_B.rows() : m_B.cols());
+    for (Index kB = 0; kB < m_B.outerSize(); ++kB)
+      for (RhsInnerIterator itB(rhs1, kB); itB; ++itB) nnzB(Dest::IsRowMajor ? itB.row() : itB.col())++;
+
+    Matrix<int, Dynamic, Dynamic, ColMajor> nnzAB = nnzB * nnzA.transpose();
+    dst.reserve(VectorXi::Map(nnzAB.data(), nnzAB.size()));
+  }
+
+  for (Index kA = 0; kA < m_A.outerSize(); ++kA) {
+    for (Index kB = 0; kB < m_B.outerSize(); ++kB) {
+      for (LhsInnerIterator itA(lhs1, kA); itA; ++itA) {
+        for (RhsInnerIterator itB(rhs1, kB); itB; ++itB) {
+          Index i = itA.row() * Br + itB.row(), j = itA.col() * Bc + itB.col();
+          dst.insert(i, j) = itA.value() * itB.value();
         }
       }
     }
@@ -149,53 +180,54 @@ void KroneckerProductSparse<Lhs,Rhs>::evalTo(Dest& dst) const
 
 namespace internal {
 
-template<typename _Lhs, typename _Rhs>
-struct traits<KroneckerProduct<_Lhs,_Rhs> >
-{
-  typedef typename remove_all<_Lhs>::type Lhs;
-  typedef typename remove_all<_Rhs>::type Rhs;
-  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+template <typename Lhs_, typename Rhs_>
+struct traits<KroneckerProduct<Lhs_, Rhs_> > {
+  typedef remove_all_t<Lhs_> Lhs;
+  typedef remove_all_t<Rhs_> Rhs;
+  typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+  typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex;
 
   enum {
-    Rows = size_at_compile_time<traits<Lhs>::RowsAtCompileTime, traits<Rhs>::RowsAtCompileTime>::ret,
-    Cols = size_at_compile_time<traits<Lhs>::ColsAtCompileTime, traits<Rhs>::ColsAtCompileTime>::ret,
-    MaxRows = size_at_compile_time<traits<Lhs>::MaxRowsAtCompileTime, traits<Rhs>::MaxRowsAtCompileTime>::ret,
-    MaxCols = size_at_compile_time<traits<Lhs>::MaxColsAtCompileTime, traits<Rhs>::MaxColsAtCompileTime>::ret,
-    CoeffReadCost = Lhs::CoeffReadCost + Rhs::CoeffReadCost + NumTraits<Scalar>::MulCost
+    Rows = size_at_compile_time(traits<Lhs>::RowsAtCompileTime, traits<Rhs>::RowsAtCompileTime),
+    Cols = size_at_compile_time(traits<Lhs>::ColsAtCompileTime, traits<Rhs>::ColsAtCompileTime),
+    MaxRows = size_at_compile_time(traits<Lhs>::MaxRowsAtCompileTime, traits<Rhs>::MaxRowsAtCompileTime),
+    MaxCols = size_at_compile_time(traits<Lhs>::MaxColsAtCompileTime, traits<Rhs>::MaxColsAtCompileTime)
   };
 
-  typedef Matrix<Scalar,Rows,Cols> ReturnType;
+  typedef Matrix<Scalar, Rows, Cols> ReturnType;
 };
 
-template<typename _Lhs, typename _Rhs>
-struct traits<KroneckerProductSparse<_Lhs,_Rhs> >
-{
+template <typename Lhs_, typename Rhs_>
+struct traits<KroneckerProductSparse<Lhs_, Rhs_> > {
   typedef MatrixXpr XprKind;
-  typedef typename remove_all<_Lhs>::type Lhs;
-  typedef typename remove_all<_Rhs>::type Rhs;
-  typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
-  typedef typename promote_storage_type<typename traits<Lhs>::StorageKind, typename traits<Rhs>::StorageKind>::ret StorageKind;
-  typedef typename promote_index_type<typename Lhs::Index, typename Rhs::Index>::type Index;
+  typedef remove_all_t<Lhs_> Lhs;
+  typedef remove_all_t<Rhs_> Rhs;
+  typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar;
+  typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind, typename traits<Rhs>::StorageKind,
+                                              scalar_product_op<typename Lhs::Scalar, typename Rhs::Scalar> >::ret
+      StorageKind;
+  typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex;
 
   enum {
     LhsFlags = Lhs::Flags,
     RhsFlags = Rhs::Flags,
 
-    RowsAtCompileTime = size_at_compile_time<traits<Lhs>::RowsAtCompileTime, traits<Rhs>::RowsAtCompileTime>::ret,
-    ColsAtCompileTime = size_at_compile_time<traits<Lhs>::ColsAtCompileTime, traits<Rhs>::ColsAtCompileTime>::ret,
-    MaxRowsAtCompileTime = size_at_compile_time<traits<Lhs>::MaxRowsAtCompileTime, traits<Rhs>::MaxRowsAtCompileTime>::ret,
-    MaxColsAtCompileTime = size_at_compile_time<traits<Lhs>::MaxColsAtCompileTime, traits<Rhs>::MaxColsAtCompileTime>::ret,
+    RowsAtCompileTime = size_at_compile_time(traits<Lhs>::RowsAtCompileTime, traits<Rhs>::RowsAtCompileTime),
+    ColsAtCompileTime = size_at_compile_time(traits<Lhs>::ColsAtCompileTime, traits<Rhs>::ColsAtCompileTime),
+    MaxRowsAtCompileTime = size_at_compile_time(traits<Lhs>::MaxRowsAtCompileTime, traits<Rhs>::MaxRowsAtCompileTime),
+    MaxColsAtCompileTime = size_at_compile_time(traits<Lhs>::MaxColsAtCompileTime, traits<Rhs>::MaxColsAtCompileTime),
 
-    EvalToRowMajor = (LhsFlags & RhsFlags & RowMajorBit),
+    EvalToRowMajor = (int(LhsFlags) & int(RhsFlags) & RowMajorBit),
     RemovedBits = ~(EvalToRowMajor ? 0 : RowMajorBit),
 
-    Flags = ((LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
-          | EvalBeforeNestingBit | EvalBeforeAssigningBit,
-    CoeffReadCost = Dynamic
+    Flags = ((int(LhsFlags) | int(RhsFlags)) & HereditaryBits & RemovedBits) | EvalBeforeNestingBit,
+    CoeffReadCost = HugeCost
   };
+
+  typedef SparseMatrix<Scalar, 0, StorageIndex> ReturnType;
 };
 
-} // end namespace internal
+}  // end namespace internal
 
 /*!
  * \ingroup KroneckerProduct_Module
@@ -216,9 +248,8 @@ struct traits<KroneckerProductSparse<_Lhs,_Rhs> >
  * \param b  Dense matrix b
  * \return   Kronecker tensor product of a and b
  */
-template<typename A, typename B>
-KroneckerProduct<A,B> kroneckerProduct(const MatrixBase<A>& a, const MatrixBase<B>& b)
-{
+template <typename A, typename B>
+KroneckerProduct<A, B> kroneckerProduct(const MatrixBase<A>& a, const MatrixBase<B>& b) {
   return KroneckerProduct<A, B>(a.derived(), b.derived());
 }
 
@@ -228,17 +259,26 @@ KroneckerProduct<A,B> kroneckerProduct(const MatrixBase<A>& a, const MatrixBase<
  * Computes Kronecker tensor product of two matrices, at least one of
  * which is sparse
  *
+ * \warning If you want to replace a matrix by its Kronecker product
+ *          with some matrix, do \b NOT do this:
+ * \code
+ * A = kroneckerProduct(A,B); // bug!!! caused by aliasing effect
+ * \endcode
+ * instead, use eval() to work around this:
+ * \code
+ * A = kroneckerProduct(A,B).eval();
+ * \endcode
+ *
  * \param a  Dense/sparse matrix a
  * \param b  Dense/sparse matrix b
  * \return   Kronecker tensor product of a and b, stored in a sparse
  *           matrix
  */
-template<typename A, typename B>
-KroneckerProductSparse<A,B> kroneckerProduct(const EigenBase<A>& a, const EigenBase<B>& b)
-{
-  return KroneckerProductSparse<A,B>(a.derived(), b.derived());
+template <typename A, typename B>
+KroneckerProductSparse<A, B> kroneckerProduct(const EigenBase<A>& a, const EigenBase<B>& b) {
+  return KroneckerProductSparse<A, B>(a.derived(), b.derived());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // KRONECKER_TENSOR_PRODUCT_H
+#endif  // KRONECKER_TENSOR_PRODUCT_H
diff --git a/inst/include/unsupported/Eigen/src/LevenbergMarquardt/InternalHeaderCheck.h b/inst/include/unsupported/Eigen/src/LevenbergMarquardt/InternalHeaderCheck.h
new file mode 100644
index 00000000..a9a646f9
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/LevenbergMarquardt/InternalHeaderCheck.h
@@ -0,0 +1,4 @@
+#ifndef EIGEN_LEVENBERGMARQUARDT_MODULE_H
+#error \
+    "Please include unsupported/Eigen/LevenbergMarquardt instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h b/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h
index 32d3ad51..b81d5a3e 100644
--- a/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h
+++ b/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LMcovar.h
@@ -12,74 +12,69 @@
 #ifndef EIGEN_LMCOVAR_H
 #define EIGEN_LMCOVAR_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 template <typename Scalar>
-void covar(
-        Matrix< Scalar, Dynamic, Dynamic > &r,
-        const VectorXi& ipvt,
-        Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon()) )
-{
-    using std::abs;
-    typedef DenseIndex Index;
-    /* Local variables */
-    Index i, j, k, l, ii, jj;
-    bool sing;
-    Scalar temp;
-
-    /* Function Body */
-    const Index n = r.cols();
-    const Scalar tolr = tol * abs(r(0,0));
-    Matrix< Scalar, Dynamic, 1 > wa(n);
-    eigen_assert(ipvt.size()==n);
+void covar(Matrix<Scalar, Dynamic, Dynamic>& r, const VectorXi& ipvt,
+           Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())) {
+  using std::abs;
+  /* Local variables */
+  Index i, j, k, l, ii, jj;
+  bool sing;
+  Scalar temp;
 
-    /* form the inverse of r in the full upper triangle of r. */
-    l = -1;
-    for (k = 0; k < n; ++k)
-        if (abs(r(k,k)) > tolr) {
-            r(k,k) = 1. / r(k,k);
-            for (j = 0; j <= k-1; ++j) {
-                temp = r(k,k) * r(j,k);
-                r(j,k) = 0.;
-                r.col(k).head(j+1) -= r.col(j).head(j+1) * temp;
-            }
-            l = k;
-        }
+  /* Function Body */
+  const Index n = r.cols();
+  const Scalar tolr = tol * abs(r(0, 0));
+  Matrix<Scalar, Dynamic, 1> wa(n);
+  eigen_assert(ipvt.size() == n);
 
-    /* form the full upper triangle of the inverse of (r transpose)*r */
-    /* in the full upper triangle of r. */
-    for (k = 0; k <= l; ++k) {
-        for (j = 0; j <= k-1; ++j)
-            r.col(j).head(j+1) += r.col(k).head(j+1) * r(j,k);
-        r.col(k).head(k+1) *= r(k,k);
+  /* form the inverse of r in the full upper triangle of r. */
+  l = -1;
+  for (k = 0; k < n; ++k)
+    if (abs(r(k, k)) > tolr) {
+      r(k, k) = 1. / r(k, k);
+      for (j = 0; j <= k - 1; ++j) {
+        temp = r(k, k) * r(j, k);
+        r(j, k) = 0.;
+        r.col(k).head(j + 1) -= r.col(j).head(j + 1) * temp;
+      }
+      l = k;
     }
 
-    /* form the full lower triangle of the covariance matrix */
-    /* in the strict lower triangle of r and in wa. */
-    for (j = 0; j < n; ++j) {
-        jj = ipvt[j];
-        sing = j > l;
-        for (i = 0; i <= j; ++i) {
-            if (sing)
-                r(i,j) = 0.;
-            ii = ipvt[i];
-            if (ii > jj)
-                r(ii,jj) = r(i,j);
-            if (ii < jj)
-                r(jj,ii) = r(i,j);
-        }
-        wa[jj] = r(j,j);
+  /* form the full upper triangle of the inverse of (r transpose)*r */
+  /* in the full upper triangle of r. */
+  for (k = 0; k <= l; ++k) {
+    for (j = 0; j <= k - 1; ++j) r.col(j).head(j + 1) += r.col(k).head(j + 1) * r(j, k);
+    r.col(k).head(k + 1) *= r(k, k);
+  }
+
+  /* form the full lower triangle of the covariance matrix */
+  /* in the strict lower triangle of r and in wa. */
+  for (j = 0; j < n; ++j) {
+    jj = ipvt[j];
+    sing = j > l;
+    for (i = 0; i <= j; ++i) {
+      if (sing) r(i, j) = 0.;
+      ii = ipvt[i];
+      if (ii > jj) r(ii, jj) = r(i, j);
+      if (ii < jj) r(jj, ii) = r(i, j);
     }
+    wa[jj] = r(j, j);
+  }
 
-    /* symmetrize the covariance matrix in r. */
-    r.topLeftCorner(n,n).template triangularView<StrictlyUpper>() = r.topLeftCorner(n,n).transpose();
-    r.diagonal() = wa;
+  /* symmetrize the covariance matrix in r. */
+  r.topLeftCorner(n, n).template triangularView<StrictlyUpper>() = r.topLeftCorner(n, n).transpose();
+  r.diagonal() = wa;
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_LMCOVAR_H
+#endif  // EIGEN_LMCOVAR_H
diff --git a/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LMonestep.h b/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LMonestep.h
index 25b32ec5..f0697cee 100644
--- a/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LMonestep.h
+++ b/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LMonestep.h
@@ -14,34 +14,35 @@
 #ifndef EIGEN_LMONESTEP_H
 #define EIGEN_LMONESTEP_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
-template<typename FunctorType>
-LevenbergMarquardtSpace::Status
-LevenbergMarquardt<FunctorType>::minimizeOneStep(FVectorType  &x)
-{
+template <typename FunctorType>
+LevenbergMarquardtSpace::Status LevenbergMarquardt<FunctorType>::minimizeOneStep(FVectorType &x) {
   using std::abs;
   using std::sqrt;
-  RealScalar temp, temp1,temp2; 
-  RealScalar ratio; 
+  RealScalar temp, temp1, temp2;
+  RealScalar ratio;
   RealScalar pnorm, xnorm, fnorm1, actred, dirder, prered;
-  eigen_assert(x.size()==n); // check the caller is not cheating us
+  eigen_assert(x.size() == n);  // check the caller is not cheating us
 
-  temp = 0.0; xnorm = 0.0;
+  temp = 0.0;
+  xnorm = 0.0;
   /* calculate the jacobian matrix. */
   Index df_ret = m_functor.df(x, m_fjac);
-  if (df_ret<0)
-      return LevenbergMarquardtSpace::UserAsked;
-  if (df_ret>0)
-      // numerical diff, we evaluated the function df_ret times
-      m_nfev += df_ret;
-  else m_njev++;
+  if (df_ret < 0) return LevenbergMarquardtSpace::UserAsked;
+  if (df_ret > 0)
+    // numerical diff, we evaluated the function df_ret times
+    m_nfev += df_ret;
+  else
+    m_njev++;
 
   /* compute the qr factorization of the jacobian. */
-  for (int j = 0; j < x.size(); ++j)
-    m_wa2(j) = m_fjac.col(j).blueNorm();
+  for (int j = 0; j < x.size(); ++j) m_wa2(j) = m_fjac.col(j).blueNorm();
   QRSolver qrfac(m_fjac);
-  if(qrfac.info() != Success) {
+  if (qrfac.info() != Success) {
     m_info = NumericalIssue;
     return LevenbergMarquardtSpace::ImproperInputParameters;
   }
@@ -52,30 +53,29 @@ LevenbergMarquardt<FunctorType>::minimizeOneStep(FVectorType  &x)
   /* on the first iteration and if external scaling is not used, scale according */
   /* to the norms of the columns of the initial jacobian. */
   if (m_iter == 1) {
-      if (!m_useExternalScaling)
-          for (Index j = 0; j < n; ++j)
-              m_diag[j] = (m_wa2[j]==0.)? 1. : m_wa2[j];
-
-      /* on the first iteration, calculate the norm of the scaled x */
-      /* and initialize the step bound m_delta. */
-      xnorm = m_diag.cwiseProduct(x).stableNorm();
-      m_delta = m_factor * xnorm;
-      if (m_delta == 0.)
-          m_delta = m_factor;
+    if (!m_useExternalScaling)
+      for (Index j = 0; j < n; ++j) m_diag[j] = (m_wa2[j] == 0.) ? 1. : m_wa2[j];
+
+    /* on the first iteration, calculate the norm of the scaled x */
+    /* and initialize the step bound m_delta. */
+    xnorm = m_diag.cwiseProduct(x).stableNorm();
+    m_delta = m_factor * xnorm;
+    if (m_delta == 0.) m_delta = m_factor;
   }
 
   /* form (q transpose)*m_fvec and store the first n components in */
   /* m_qtf. */
   m_wa4 = m_fvec;
-  m_wa4 = qrfac.matrixQ().adjoint() * m_fvec; 
+  m_wa4 = qrfac.matrixQ().adjoint() * m_fvec;
   m_qtf = m_wa4.head(n);
 
   /* compute the norm of the scaled gradient. */
   m_gnorm = 0.;
   if (m_fnorm != 0.)
-      for (Index j = 0; j < n; ++j)
-          if (m_wa2[m_permutation.indices()[j]] != 0.)
-              m_gnorm = (std::max)(m_gnorm, abs( m_rfactor.col(j).head(j+1).dot(m_qtf.head(j+1)/m_fnorm) / m_wa2[m_permutation.indices()[j]]));
+    for (Index j = 0; j < n; ++j)
+      if (m_wa2[m_permutation.indices()[j]] != 0.)
+        m_gnorm = (std::max)(m_gnorm, abs(m_rfactor.col(j).head(j + 1).dot(m_qtf.head(j + 1) / m_fnorm) /
+                                          m_wa2[m_permutation.indices()[j]]));
 
   /* test for convergence of the gradient norm. */
   if (m_gnorm <= m_gtol) {
@@ -84,8 +84,7 @@ LevenbergMarquardt<FunctorType>::minimizeOneStep(FVectorType  &x)
   }
 
   /* rescale if necessary. */
-  if (!m_useExternalScaling)
-      m_diag = m_diag.cwiseMax(m_wa2);
+  if (!m_useExternalScaling) m_diag = m_diag.cwiseMax(m_wa2);
 
   do {
     /* determine the levenberg-marquardt parameter. */
@@ -97,23 +96,20 @@ LevenbergMarquardt<FunctorType>::minimizeOneStep(FVectorType  &x)
     pnorm = m_diag.cwiseProduct(m_wa1).stableNorm();
 
     /* on the first iteration, adjust the initial step bound. */
-    if (m_iter == 1)
-        m_delta = (std::min)(m_delta,pnorm);
+    if (m_iter == 1) m_delta = (std::min)(m_delta, pnorm);
 
     /* evaluate the function at x + p and calculate its norm. */
-    if ( m_functor(m_wa2, m_wa4) < 0)
-        return LevenbergMarquardtSpace::UserAsked;
+    if (m_functor(m_wa2, m_wa4) < 0) return LevenbergMarquardtSpace::UserAsked;
     ++m_nfev;
     fnorm1 = m_wa4.stableNorm();
 
     /* compute the scaled actual reduction. */
     actred = -1.;
-    if (Scalar(.1) * fnorm1 < m_fnorm)
-        actred = 1. - numext::abs2(fnorm1 / m_fnorm);
+    if (Scalar(.1) * fnorm1 < m_fnorm) actred = 1. - numext::abs2(fnorm1 / m_fnorm);
 
     /* compute the scaled predicted reduction and */
     /* the scaled directional derivative. */
-    m_wa3 = m_rfactor.template triangularView<Upper>() * (m_permutation.inverse() *m_wa1);
+    m_wa3 = m_rfactor.template triangularView<Upper>() * (m_permutation.inverse() * m_wa1);
     temp1 = numext::abs2(m_wa3.stableNorm() / m_fnorm);
     temp2 = numext::abs2(sqrt(m_par) * pnorm / m_fnorm);
     prered = temp1 + temp2 / Scalar(.5);
@@ -122,71 +118,61 @@ LevenbergMarquardt<FunctorType>::minimizeOneStep(FVectorType  &x)
     /* compute the ratio of the actual to the predicted */
     /* reduction. */
     ratio = 0.;
-    if (prered != 0.)
-        ratio = actred / prered;
+    if (prered != 0.) ratio = actred / prered;
 
     /* update the step bound. */
     if (ratio <= Scalar(.25)) {
-        if (actred >= 0.)
-            temp = RealScalar(.5);
-        if (actred < 0.)
-            temp = RealScalar(.5) * dirder / (dirder + RealScalar(.5) * actred);
-        if (RealScalar(.1) * fnorm1 >= m_fnorm || temp < RealScalar(.1))
-            temp = Scalar(.1);
-        /* Computing MIN */
-        m_delta = temp * (std::min)(m_delta, pnorm / RealScalar(.1));
-        m_par /= temp;
+      if (actred >= 0.) temp = RealScalar(.5);
+      if (actred < 0.) temp = RealScalar(.5) * dirder / (dirder + RealScalar(.5) * actred);
+      if (RealScalar(.1) * fnorm1 >= m_fnorm || temp < RealScalar(.1)) temp = Scalar(.1);
+      /* Computing MIN */
+      m_delta = temp * (std::min)(m_delta, pnorm / RealScalar(.1));
+      m_par /= temp;
     } else if (!(m_par != 0. && ratio < RealScalar(.75))) {
-        m_delta = pnorm / RealScalar(.5);
-        m_par = RealScalar(.5) * m_par;
+      m_delta = pnorm / RealScalar(.5);
+      m_par = RealScalar(.5) * m_par;
     }
 
     /* test for successful iteration. */
     if (ratio >= RealScalar(1e-4)) {
-        /* successful iteration. update x, m_fvec, and their norms. */
-        x = m_wa2;
-        m_wa2 = m_diag.cwiseProduct(x);
-        m_fvec = m_wa4;
-        xnorm = m_wa2.stableNorm();
-        m_fnorm = fnorm1;
-        ++m_iter;
+      /* successful iteration. update x, m_fvec, and their norms. */
+      x = m_wa2;
+      m_wa2 = m_diag.cwiseProduct(x);
+      m_fvec = m_wa4;
+      xnorm = m_wa2.stableNorm();
+      m_fnorm = fnorm1;
+      ++m_iter;
     }
 
     /* tests for convergence. */
-    if (abs(actred) <= m_ftol && prered <= m_ftol && Scalar(.5) * ratio <= 1. && m_delta <= m_xtol * xnorm)
-    {
-       m_info = Success;
+    if (abs(actred) <= m_ftol && prered <= m_ftol && Scalar(.5) * ratio <= 1. && m_delta <= m_xtol * xnorm) {
+      m_info = Success;
       return LevenbergMarquardtSpace::RelativeErrorAndReductionTooSmall;
     }
-    if (abs(actred) <= m_ftol && prered <= m_ftol && Scalar(.5) * ratio <= 1.) 
-    {
+    if (abs(actred) <= m_ftol && prered <= m_ftol && Scalar(.5) * ratio <= 1.) {
       m_info = Success;
       return LevenbergMarquardtSpace::RelativeReductionTooSmall;
     }
-    if (m_delta <= m_xtol * xnorm)
-    {
+    if (m_delta <= m_xtol * xnorm) {
       m_info = Success;
       return LevenbergMarquardtSpace::RelativeErrorTooSmall;
     }
 
     /* tests for termination and stringent tolerances. */
-    if (m_nfev >= m_maxfev) 
-    {
+    if (m_nfev >= m_maxfev) {
       m_info = NoConvergence;
       return LevenbergMarquardtSpace::TooManyFunctionEvaluation;
     }
-    if (abs(actred) <= NumTraits<Scalar>::epsilon() && prered <= NumTraits<Scalar>::epsilon() && Scalar(.5) * ratio <= 1.)
-    {
+    if (abs(actred) <= NumTraits<Scalar>::epsilon() && prered <= NumTraits<Scalar>::epsilon() &&
+        Scalar(.5) * ratio <= 1.) {
       m_info = Success;
       return LevenbergMarquardtSpace::FtolTooSmall;
     }
-    if (m_delta <= NumTraits<Scalar>::epsilon() * xnorm) 
-    {
+    if (m_delta <= NumTraits<Scalar>::epsilon() * xnorm) {
       m_info = Success;
       return LevenbergMarquardtSpace::XtolTooSmall;
     }
-    if (m_gnorm <= NumTraits<Scalar>::epsilon())
-    {
+    if (m_gnorm <= NumTraits<Scalar>::epsilon()) {
       m_info = Success;
       return LevenbergMarquardtSpace::GtolTooSmall;
     }
@@ -196,7 +182,6 @@ LevenbergMarquardt<FunctorType>::minimizeOneStep(FVectorType  &x)
   return LevenbergMarquardtSpace::Running;
 }
 
-  
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_LMONESTEP_H
+#endif  // EIGEN_LMONESTEP_H
diff --git a/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h b/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h
index 9532042d..01fcfdc2 100644
--- a/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h
+++ b/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LMpar.h
@@ -12,149 +12,138 @@
 #ifndef EIGEN_LMPAR_H
 #define EIGEN_LMPAR_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
 namespace internal {
-  
-  template <typename QRSolver, typename VectorType>
-    void lmpar2(
-    const QRSolver &qr,
-    const VectorType  &diag,
-    const VectorType  &qtb,
-    typename VectorType::Scalar m_delta,
-    typename VectorType::Scalar &par,
-    VectorType  &x)
-
-  {
-    using std::sqrt;
-    using std::abs;
-    typedef typename QRSolver::MatrixType MatrixType;
-    typedef typename QRSolver::Scalar Scalar;
-    typedef typename QRSolver::Index Index;
-
-    /* Local variables */
-    Index j;
-    Scalar fp;
-    Scalar parc, parl;
-    Index iter;
-    Scalar temp, paru;
-    Scalar gnorm;
-    Scalar dxnorm;
-    
-    // Make a copy of the triangular factor. 
-    // This copy is modified during call the qrsolv
-    MatrixType s;
-    s = qr.matrixR();
-
-    /* Function Body */
-    const Scalar dwarf = (std::numeric_limits<Scalar>::min)();
-    const Index n = qr.matrixR().cols();
-    eigen_assert(n==diag.size());
-    eigen_assert(n==qtb.size());
-
-    VectorType  wa1, wa2;
-
-    /* compute and store in x the gauss-newton direction. if the */
-    /* jacobian is rank-deficient, obtain a least squares solution. */
-
-    //    const Index rank = qr.nonzeroPivots(); // exactly double(0.)
-    const Index rank = qr.rank(); // use a threshold
-    wa1 = qtb;
-    wa1.tail(n-rank).setZero();
-    //FIXME There is no solve in place for sparse triangularView
-    wa1.head(rank) = s.topLeftCorner(rank,rank).template triangularView<Upper>().solve(qtb.head(rank));
-
-    x = qr.colsPermutation()*wa1;
-
-    /* initialize the iteration counter. */
-    /* evaluate the function at the origin, and test */
-    /* for acceptance of the gauss-newton direction. */
-    iter = 0;
+
+template <typename QRSolver, typename VectorType>
+void lmpar2(const QRSolver &qr, const VectorType &diag, const VectorType &qtb, typename VectorType::Scalar m_delta,
+            typename VectorType::Scalar &par, VectorType &x)
+
+{
+  using std::abs;
+  using std::sqrt;
+  typedef typename QRSolver::MatrixType MatrixType;
+  typedef typename QRSolver::Scalar Scalar;
+  //    typedef typename QRSolver::StorageIndex StorageIndex;
+
+  /* Local variables */
+  Index j;
+  Scalar fp;
+  Scalar parc, parl;
+  Index iter;
+  Scalar temp, paru;
+  Scalar gnorm;
+  Scalar dxnorm;
+
+  // Make a copy of the triangular factor.
+  // This copy is modified during call the qrsolv
+  MatrixType s;
+  s = qr.matrixR();
+
+  /* Function Body */
+  const Scalar dwarf = (std::numeric_limits<Scalar>::min)();
+  const Index n = qr.matrixR().cols();
+  eigen_assert(n == diag.size());
+  eigen_assert(n == qtb.size());
+
+  VectorType wa1, wa2;
+
+  /* compute and store in x the gauss-newton direction. if the */
+  /* jacobian is rank-deficient, obtain a least squares solution. */
+
+  //    const Index rank = qr.nonzeroPivots(); // exactly double(0.)
+  const Index rank = qr.rank();  // use a threshold
+  wa1 = qtb;
+  wa1.tail(n - rank).setZero();
+  // FIXME There is no solve in place for sparse triangularView
+  wa1.head(rank) = s.topLeftCorner(rank, rank).template triangularView<Upper>().solve(qtb.head(rank));
+
+  x = qr.colsPermutation() * wa1;
+
+  /* initialize the iteration counter. */
+  /* evaluate the function at the origin, and test */
+  /* for acceptance of the gauss-newton direction. */
+  iter = 0;
+  wa2 = diag.cwiseProduct(x);
+  dxnorm = wa2.blueNorm();
+  fp = dxnorm - m_delta;
+  if (fp <= Scalar(0.1) * m_delta) {
+    par = 0;
+    return;
+  }
+
+  /* if the jacobian is not rank deficient, the newton */
+  /* step provides a lower bound, parl, for the zero of */
+  /* the function. otherwise set this bound to zero. */
+  parl = 0.;
+  if (rank == n) {
+    wa1 = qr.colsPermutation().inverse() * diag.cwiseProduct(wa2) / dxnorm;
+    s.topLeftCorner(n, n).transpose().template triangularView<Lower>().solveInPlace(wa1);
+    temp = wa1.blueNorm();
+    parl = fp / m_delta / temp / temp;
+  }
+
+  /* calculate an upper bound, paru, for the zero of the function. */
+  for (j = 0; j < n; ++j) wa1[j] = s.col(j).head(j + 1).dot(qtb.head(j + 1)) / diag[qr.colsPermutation().indices()(j)];
+
+  gnorm = wa1.stableNorm();
+  paru = gnorm / m_delta;
+  if (paru == 0.) paru = dwarf / (std::min)(m_delta, Scalar(0.1));
+
+  /* if the input par lies outside of the interval (parl,paru), */
+  /* set par to the closer endpoint. */
+  par = (std::max)(par, parl);
+  par = (std::min)(par, paru);
+  if (par == 0.) par = gnorm / dxnorm;
+
+  /* beginning of an iteration. */
+  while (true) {
+    ++iter;
+
+    /* evaluate the function at the current value of par. */
+    if (par == 0.) par = (std::max)(dwarf, Scalar(.001) * paru); /* Computing MAX */
+    wa1 = sqrt(par) * diag;
+
+    VectorType sdiag(n);
+    lmqrsolv(s, qr.colsPermutation(), wa1, qtb, x, sdiag);
+
     wa2 = diag.cwiseProduct(x);
     dxnorm = wa2.blueNorm();
+    temp = fp;
     fp = dxnorm - m_delta;
-    if (fp <= Scalar(0.1) * m_delta) {
-      par = 0;
-      return;
-    }
 
-    /* if the jacobian is not rank deficient, the newton */
-    /* step provides a lower bound, parl, for the zero of */
-    /* the function. otherwise set this bound to zero. */
-    parl = 0.;
-    if (rank==n) {
-      wa1 = qr.colsPermutation().inverse() *  diag.cwiseProduct(wa2)/dxnorm;
-      s.topLeftCorner(n,n).transpose().template triangularView<Lower>().solveInPlace(wa1);
-      temp = wa1.blueNorm();
-      parl = fp / m_delta / temp / temp;
+    /* if the function is small enough, accept the current value */
+    /* of par. also test for the exceptional cases where parl */
+    /* is zero or the number of iterations has reached 10. */
+    if (abs(fp) <= Scalar(0.1) * m_delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10) break;
+
+    /* compute the newton correction. */
+    wa1 = qr.colsPermutation().inverse() * diag.cwiseProduct(wa2 / dxnorm);
+    // we could almost use this here, but the diagonal is outside qr, in sdiag[]
+    for (j = 0; j < n; ++j) {
+      wa1[j] /= sdiag[j];
+      temp = wa1[j];
+      for (Index i = j + 1; i < n; ++i) wa1[i] -= s.coeff(i, j) * temp;
     }
+    temp = wa1.blueNorm();
+    parc = fp / m_delta / temp / temp;
 
-    /* calculate an upper bound, paru, for the zero of the function. */
-    for (j = 0; j < n; ++j)
-      wa1[j] = s.col(j).head(j+1).dot(qtb.head(j+1)) / diag[qr.colsPermutation().indices()(j)];
-
-    gnorm = wa1.stableNorm();
-    paru = gnorm / m_delta;
-    if (paru == 0.)
-      paru = dwarf / (std::min)(m_delta,Scalar(0.1));
-
-    /* if the input par lies outside of the interval (parl,paru), */
-    /* set par to the closer endpoint. */
-    par = (std::max)(par,parl);
-    par = (std::min)(par,paru);
-    if (par == 0.)
-      par = gnorm / dxnorm;
-
-    /* beginning of an iteration. */
-    while (true) {
-      ++iter;
-
-      /* evaluate the function at the current value of par. */
-      if (par == 0.)
-        par = (std::max)(dwarf,Scalar(.001) * paru); /* Computing MAX */
-      wa1 = sqrt(par)* diag;
-
-      VectorType sdiag(n);
-      lmqrsolv(s, qr.colsPermutation(), wa1, qtb, x, sdiag);
-
-      wa2 = diag.cwiseProduct(x);
-      dxnorm = wa2.blueNorm();
-      temp = fp;
-      fp = dxnorm - m_delta;
-
-      /* if the function is small enough, accept the current value */
-      /* of par. also test for the exceptional cases where parl */
-      /* is zero or the number of iterations has reached 10. */
-      if (abs(fp) <= Scalar(0.1) * m_delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10)
-        break;
-
-      /* compute the newton correction. */
-      wa1 = qr.colsPermutation().inverse() * diag.cwiseProduct(wa2/dxnorm);
-      // we could almost use this here, but the diagonal is outside qr, in sdiag[]
-      for (j = 0; j < n; ++j) {
-        wa1[j] /= sdiag[j];
-        temp = wa1[j];
-        for (Index i = j+1; i < n; ++i)
-          wa1[i] -= s.coeff(i,j) * temp;
-      }
-      temp = wa1.blueNorm();
-      parc = fp / m_delta / temp / temp;
-
-      /* depending on the sign of the function, update parl or paru. */
-      if (fp > 0.)
-        parl = (std::max)(parl,par);
-      if (fp < 0.)
-        paru = (std::min)(paru,par);
-
-      /* compute an improved estimate for par. */
-      par = (std::max)(parl,par+parc);
-    }
-    if (iter == 0)
-      par = 0.;
-    return;
+    /* depending on the sign of the function, update parl or paru. */
+    if (fp > 0.) parl = (std::max)(parl, par);
+    if (fp < 0.) paru = (std::min)(paru, par);
+
+    /* compute an improved estimate for par. */
+    par = (std::max)(parl, par + parc);
   }
-} // end namespace internal
+  if (iter == 0) par = 0.;
+  return;
+}
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_LMPAR_H
+#endif  // EIGEN_LMPAR_H
diff --git a/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h b/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
index f5290dee..49bf6e1a 100644
--- a/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
+++ b/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LMqrsolv.h
@@ -15,175 +15,164 @@
 #ifndef EIGEN_LMQRSOLV_H
 #define EIGEN_LMQRSOLV_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
-template <typename Scalar,int Rows, int Cols, typename Index>
-void lmqrsolv(
-  Matrix<Scalar,Rows,Cols> &s,
-  const PermutationMatrix<Dynamic,Dynamic,Index> &iPerm,
-  const Matrix<Scalar,Dynamic,1> &diag,
-  const Matrix<Scalar,Dynamic,1> &qtb,
-  Matrix<Scalar,Dynamic,1> &x,
-  Matrix<Scalar,Dynamic,1> &sdiag)
-{
-
-    /* Local variables */
-    Index i, j, k, l;
-    Scalar temp;
-    Index n = s.cols();
-    Matrix<Scalar,Dynamic,1>  wa(n);
-    JacobiRotation<Scalar> givens;
-
-    /* Function Body */
-    // the following will only change the lower triangular part of s, including
-    // the diagonal, though the diagonal is restored afterward
-
-    /*     copy r and (q transpose)*b to preserve input and initialize s. */
-    /*     in particular, save the diagonal elements of r in x. */
-    x = s.diagonal();
-    wa = qtb;
-    
-   
-    s.topLeftCorner(n,n).template triangularView<StrictlyLower>() = s.topLeftCorner(n,n).transpose();
-    /*     eliminate the diagonal matrix d using a givens rotation. */
-    for (j = 0; j < n; ++j) {
-
-        /*        prepare the row of d to be eliminated, locating the */
-        /*        diagonal element using p from the qr factorization. */
-        l = iPerm.indices()(j);
-        if (diag[l] == 0.)
-            break;
-        sdiag.tail(n-j).setZero();
-        sdiag[j] = diag[l];
-
-        /*        the transformations to eliminate the row of d */
-        /*        modify only a single element of (q transpose)*b */
-        /*        beyond the first n, which is initially zero. */
-        Scalar qtbpj = 0.;
-        for (k = j; k < n; ++k) {
-            /*           determine a givens rotation which eliminates the */
-            /*           appropriate element in the current row of d. */
-            givens.makeGivens(-s(k,k), sdiag[k]);
-
-            /*           compute the modified diagonal element of r and */
-            /*           the modified element of ((q transpose)*b,0). */
-            s(k,k) = givens.c() * s(k,k) + givens.s() * sdiag[k];
-            temp = givens.c() * wa[k] + givens.s() * qtbpj;
-            qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj;
-            wa[k] = temp;
-
-            /*           accumulate the tranformation in the row of s. */
-            for (i = k+1; i<n; ++i) {
-                temp = givens.c() * s(i,k) + givens.s() * sdiag[i];
-                sdiag[i] = -givens.s() * s(i,k) + givens.c() * sdiag[i];
-                s(i,k) = temp;
-            }
-        }
+template <typename Scalar, int Rows, int Cols, typename PermIndex>
+void lmqrsolv(Matrix<Scalar, Rows, Cols> &s, const PermutationMatrix<Dynamic, Dynamic, PermIndex> &iPerm,
+              const Matrix<Scalar, Dynamic, 1> &diag, const Matrix<Scalar, Dynamic, 1> &qtb,
+              Matrix<Scalar, Dynamic, 1> &x, Matrix<Scalar, Dynamic, 1> &sdiag) {
+  /* Local variables */
+  Index i, j, k;
+  Scalar temp;
+  Index n = s.cols();
+  Matrix<Scalar, Dynamic, 1> wa(n);
+  JacobiRotation<Scalar> givens;
+
+  /* Function Body */
+  // the following will only change the lower triangular part of s, including
+  // the diagonal, though the diagonal is restored afterward
+
+  /*     copy r and (q transpose)*b to preserve input and initialize s. */
+  /*     in particular, save the diagonal elements of r in x. */
+  x = s.diagonal();
+  wa = qtb;
+
+  s.topLeftCorner(n, n).template triangularView<StrictlyLower>() = s.topLeftCorner(n, n).transpose();
+  /*     eliminate the diagonal matrix d using a givens rotation. */
+  for (j = 0; j < n; ++j) {
+    /*        prepare the row of d to be eliminated, locating the */
+    /*        diagonal element using p from the qr factorization. */
+    const PermIndex l = iPerm.indices()(j);
+    if (diag[l] == 0.) break;
+    sdiag.tail(n - j).setZero();
+    sdiag[j] = diag[l];
+
+    /*        the transformations to eliminate the row of d */
+    /*        modify only a single element of (q transpose)*b */
+    /*        beyond the first n, which is initially zero. */
+    Scalar qtbpj = 0.;
+    for (k = j; k < n; ++k) {
+      /*           determine a givens rotation which eliminates the */
+      /*           appropriate element in the current row of d. */
+      givens.makeGivens(-s(k, k), sdiag[k]);
+
+      /*           compute the modified diagonal element of r and */
+      /*           the modified element of ((q transpose)*b,0). */
+      s(k, k) = givens.c() * s(k, k) + givens.s() * sdiag[k];
+      temp = givens.c() * wa[k] + givens.s() * qtbpj;
+      qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj;
+      wa[k] = temp;
+
+      /*           accumulate the transformation in the row of s. */
+      for (i = k + 1; i < n; ++i) {
+        temp = givens.c() * s(i, k) + givens.s() * sdiag[i];
+        sdiag[i] = -givens.s() * s(i, k) + givens.c() * sdiag[i];
+        s(i, k) = temp;
+      }
     }
-  
-    /*     solve the triangular system for z. if the system is */
-    /*     singular, then obtain a least squares solution. */
-    Index nsing;
-    for(nsing=0; nsing<n && sdiag[nsing]!=0; nsing++) {}
-
-    wa.tail(n-nsing).setZero();
-    s.topLeftCorner(nsing, nsing).transpose().template triangularView<Upper>().solveInPlace(wa.head(nsing));
-  
-    // restore
-    sdiag = s.diagonal();
-    s.diagonal() = x;
-
-    /* permute the components of z back to components of x. */
-    x = iPerm * wa; 
+  }
+
+  /*     solve the triangular system for z. if the system is */
+  /*     singular, then obtain a least squares solution. */
+  Index nsing;
+  for (nsing = 0; nsing < n && sdiag[nsing] != 0; nsing++) {
+  }
+
+  wa.tail(n - nsing).setZero();
+  s.topLeftCorner(nsing, nsing).transpose().template triangularView<Upper>().solveInPlace(wa.head(nsing));
+
+  // restore
+  sdiag = s.diagonal();
+  s.diagonal() = x;
+
+  /* permute the components of z back to components of x. */
+  x = iPerm * wa;
 }
 
-template <typename Scalar, int _Options, typename Index>
-void lmqrsolv(
-  SparseMatrix<Scalar,_Options,Index> &s,
-  const PermutationMatrix<Dynamic,Dynamic> &iPerm,
-  const Matrix<Scalar,Dynamic,1> &diag,
-  const Matrix<Scalar,Dynamic,1> &qtb,
-  Matrix<Scalar,Dynamic,1> &x,
-  Matrix<Scalar,Dynamic,1> &sdiag)
-{
+template <typename Scalar, int Options_, typename Index>
+void lmqrsolv(SparseMatrix<Scalar, Options_, Index> &s, const PermutationMatrix<Dynamic, Dynamic> &iPerm,
+              const Matrix<Scalar, Dynamic, 1> &diag, const Matrix<Scalar, Dynamic, 1> &qtb,
+              Matrix<Scalar, Dynamic, 1> &x, Matrix<Scalar, Dynamic, 1> &sdiag) {
   /* Local variables */
-  typedef SparseMatrix<Scalar,RowMajor,Index> FactorType;
-    Index i, j, k, l;
-    Scalar temp;
-    Index n = s.cols();
-    Matrix<Scalar,Dynamic,1>  wa(n);
-    JacobiRotation<Scalar> givens;
-
-    /* Function Body */
-    // the following will only change the lower triangular part of s, including
-    // the diagonal, though the diagonal is restored afterward
-
-    /*     copy r and (q transpose)*b to preserve input and initialize R. */
-    wa = qtb;
-    FactorType R(s);
-    // Eliminate the diagonal matrix d using a givens rotation
-    for (j = 0; j < n; ++j)
-    {
-      // Prepare the row of d to be eliminated, locating the 
-      // diagonal element using p from the qr factorization
-      l = iPerm.indices()(j);
-      if (diag(l) == Scalar(0)) 
-        break; 
-      sdiag.tail(n-j).setZero();
-      sdiag[j] = diag[l];
-      // the transformations to eliminate the row of d
-      // modify only a single element of (q transpose)*b
-      // beyond the first n, which is initially zero. 
-      
-      Scalar qtbpj = 0; 
-      // Browse the nonzero elements of row j of the upper triangular s
-      for (k = j; k < n; ++k)
-      {
-        typename FactorType::InnerIterator itk(R,k);
-        for (; itk; ++itk){
-          if (itk.index() < k) continue;
-          else break;
-        }
-        //At this point, we have the diagonal element R(k,k)
-        // Determine a givens rotation which eliminates 
-        // the appropriate element in the current row of d
-        givens.makeGivens(-itk.value(), sdiag(k));
-        
-        // Compute the modified diagonal element of r and 
-        // the modified element of ((q transpose)*b,0).
-        itk.valueRef() = givens.c() * itk.value() + givens.s() * sdiag(k);
-        temp = givens.c() * wa(k) + givens.s() * qtbpj; 
-        qtbpj = -givens.s() * wa(k) + givens.c() * qtbpj;
-        wa(k) = temp;
-        
-        // Accumulate the transformation in the remaining k row/column of R
-        for (++itk; itk; ++itk)
-        {
-          i = itk.index();
-          temp = givens.c() *  itk.value() + givens.s() * sdiag(i);
-          sdiag(i) = -givens.s() * itk.value() + givens.c() * sdiag(i);
-          itk.valueRef() = temp;
-        }
+  typedef SparseMatrix<Scalar, RowMajor, Index> FactorType;
+  Index i, j, k, l;
+  Scalar temp;
+  Index n = s.cols();
+  Matrix<Scalar, Dynamic, 1> wa(n);
+  JacobiRotation<Scalar> givens;
+
+  /* Function Body */
+  // the following will only change the lower triangular part of s, including
+  // the diagonal, though the diagonal is restored afterward
+
+  /*     copy r and (q transpose)*b to preserve input and initialize R. */
+  wa = qtb;
+  FactorType R(s);
+  // Eliminate the diagonal matrix d using a givens rotation
+  for (j = 0; j < n; ++j) {
+    // Prepare the row of d to be eliminated, locating the
+    // diagonal element using p from the qr factorization
+    l = iPerm.indices()(j);
+    if (diag(l) == Scalar(0)) break;
+    sdiag.tail(n - j).setZero();
+    sdiag[j] = diag[l];
+    // the transformations to eliminate the row of d
+    // modify only a single element of (q transpose)*b
+    // beyond the first n, which is initially zero.
+
+    Scalar qtbpj = 0;
+    // Browse the nonzero elements of row j of the upper triangular s
+    for (k = j; k < n; ++k) {
+      typename FactorType::InnerIterator itk(R, k);
+      for (; itk; ++itk) {
+        if (itk.index() < k)
+          continue;
+        else
+          break;
+      }
+      // At this point, we have the diagonal element R(k,k)
+      //  Determine a givens rotation which eliminates
+      //  the appropriate element in the current row of d
+      givens.makeGivens(-itk.value(), sdiag(k));
+
+      // Compute the modified diagonal element of r and
+      // the modified element of ((q transpose)*b,0).
+      itk.valueRef() = givens.c() * itk.value() + givens.s() * sdiag(k);
+      temp = givens.c() * wa(k) + givens.s() * qtbpj;
+      qtbpj = -givens.s() * wa(k) + givens.c() * qtbpj;
+      wa(k) = temp;
+
+      // Accumulate the transformation in the remaining k row/column of R
+      for (++itk; itk; ++itk) {
+        i = itk.index();
+        temp = givens.c() * itk.value() + givens.s() * sdiag(i);
+        sdiag(i) = -givens.s() * itk.value() + givens.c() * sdiag(i);
+        itk.valueRef() = temp;
       }
     }
-    
-    // Solve the triangular system for z. If the system is 
-    // singular, then obtain a least squares solution
-    Index nsing;
-    for(nsing = 0; nsing<n && sdiag(nsing) !=0; nsing++) {}
-    
-    wa.tail(n-nsing).setZero();
-//     x = wa; 
-    wa.head(nsing) = R.topLeftCorner(nsing,nsing).template triangularView<Upper>().solve/*InPlace*/(wa.head(nsing));
-    
-    sdiag = R.diagonal();
-    // Permute the components of z back to components of x
-    x = iPerm * wa; 
+  }
+
+  // Solve the triangular system for z. If the system is
+  // singular, then obtain a least squares solution
+  Index nsing;
+  for (nsing = 0; nsing < n && sdiag(nsing) != 0; nsing++) {
+  }
+
+  wa.tail(n - nsing).setZero();
+  //     x = wa;
+  wa.head(nsing) = R.topLeftCorner(nsing, nsing).template triangularView<Upper>().solve /*InPlace*/ (wa.head(nsing));
+
+  sdiag = R.diagonal();
+  // Permute the components of z back to components of x
+  x = iPerm * wa;
 }
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_LMQRSOLV_H
+#endif  // EIGEN_LMQRSOLV_H
diff --git a/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h b/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
index 51dd1d3c..b8a6ddae 100644
--- a/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
+++ b/inst/include/unsupported/Eigen/src/LevenbergMarquardt/LevenbergMarquardt.h
@@ -19,36 +19,34 @@
 #ifndef EIGEN_LEVENBERGMARQUARDT_H
 #define EIGEN_LEVENBERGMARQUARDT_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
 namespace Eigen {
 namespace LevenbergMarquardtSpace {
-    enum Status {
-        NotStarted = -2,
-        Running = -1,
-        ImproperInputParameters = 0,
-        RelativeReductionTooSmall = 1,
-        RelativeErrorTooSmall = 2,
-        RelativeErrorAndReductionTooSmall = 3,
-        CosinusTooSmall = 4,
-        TooManyFunctionEvaluation = 5,
-        FtolTooSmall = 6,
-        XtolTooSmall = 7,
-        GtolTooSmall = 8,
-        UserAsked = 9
-    };
+enum Status {
+  NotStarted = -2,
+  Running = -1,
+  ImproperInputParameters = 0,
+  RelativeReductionTooSmall = 1,
+  RelativeErrorTooSmall = 2,
+  RelativeErrorAndReductionTooSmall = 3,
+  CosinusTooSmall = 4,
+  TooManyFunctionEvaluation = 5,
+  FtolTooSmall = 6,
+  XtolTooSmall = 7,
+  GtolTooSmall = 8,
+  UserAsked = 9
+};
 }
 
-template <typename _Scalar, int NX=Dynamic, int NY=Dynamic>
-struct DenseFunctor
-{
-  typedef _Scalar Scalar;
-  enum {
-    InputsAtCompileTime = NX,
-    ValuesAtCompileTime = NY
-  };
-  typedef Matrix<Scalar,InputsAtCompileTime,1> InputType;
-  typedef Matrix<Scalar,ValuesAtCompileTime,1> ValueType;
-  typedef Matrix<Scalar,ValuesAtCompileTime,InputsAtCompileTime> JacobianType;
+template <typename Scalar_, int NX = Dynamic, int NY = Dynamic>
+struct DenseFunctor {
+  typedef Scalar_ Scalar;
+  enum { InputsAtCompileTime = NX, ValuesAtCompileTime = NY };
+  typedef Matrix<Scalar, InputsAtCompileTime, 1> InputType;
+  typedef Matrix<Scalar, ValuesAtCompileTime, 1> ValueType;
+  typedef Matrix<Scalar, ValuesAtCompileTime, InputsAtCompileTime> JacobianType;
   typedef ColPivHouseholderQR<JacobianType> QRSolver;
   const int m_inputs, m_values;
 
@@ -58,320 +56,307 @@ struct DenseFunctor
   int inputs() const { return m_inputs; }
   int values() const { return m_values; }
 
-  //int operator()(const InputType &x, ValueType& fvec) { }
-  // should be defined in derived classes
-  
-  //int df(const InputType &x, JacobianType& fjac) { }
-  // should be defined in derived classes
+  // int operator()(const InputType &x, ValueType& fvec) { }
+  //  should be defined in derived classes
+
+  // int df(const InputType &x, JacobianType& fjac) { }
+  //  should be defined in derived classes
 };
 
-template <typename _Scalar, typename _Index>
-struct SparseFunctor
-{
-  typedef _Scalar Scalar;
-  typedef _Index Index;
-  typedef Matrix<Scalar,Dynamic,1> InputType;
-  typedef Matrix<Scalar,Dynamic,1> ValueType;
+template <typename Scalar_, typename Index_>
+struct SparseFunctor {
+  typedef Scalar_ Scalar;
+  typedef Index_ Index;
+  typedef Matrix<Scalar, Dynamic, 1> InputType;
+  typedef Matrix<Scalar, Dynamic, 1> ValueType;
   typedef SparseMatrix<Scalar, ColMajor, Index> JacobianType;
   typedef SparseQR<JacobianType, COLAMDOrdering<int> > QRSolver;
-  enum {
-    InputsAtCompileTime = Dynamic,
-    ValuesAtCompileTime = Dynamic
-  };
-  
+  enum { InputsAtCompileTime = Dynamic, ValuesAtCompileTime = Dynamic };
+
   SparseFunctor(int inputs, int values) : m_inputs(inputs), m_values(values) {}
 
   int inputs() const { return m_inputs; }
   int values() const { return m_values; }
-  
+
   const int m_inputs, m_values;
-  //int operator()(const InputType &x, ValueType& fvec) { }
-  // to be defined in the functor
-  
-  //int df(const InputType &x, JacobianType& fjac) { }
-  // to be defined in the functor if no automatic differentiation
-  
+  // int operator()(const InputType &x, ValueType& fvec) { }
+  //  to be defined in the functor
+
+  // int df(const InputType &x, JacobianType& fjac) { }
+  //  to be defined in the functor if no automatic differentiation
 };
 namespace internal {
 template <typename QRSolver, typename VectorType>
-void lmpar2(const QRSolver &qr, const VectorType  &diag, const VectorType  &qtb,
-	    typename VectorType::Scalar m_delta, typename VectorType::Scalar &par,
-	    VectorType  &x);
-    }
+void lmpar2(const QRSolver &qr, const VectorType &diag, const VectorType &qtb, typename VectorType::Scalar m_delta,
+            typename VectorType::Scalar &par, VectorType &x);
+}
 /**
-  * \ingroup NonLinearOptimization_Module
-  * \brief Performs non linear optimization over a non-linear function,
-  * using a variant of the Levenberg Marquardt algorithm.
-  *
-  * Check wikipedia for more information.
-  * http://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm
-  */
-template<typename _FunctorType>
-class LevenbergMarquardt : internal::no_assignment_operator
-{
-  public:
-    typedef _FunctorType FunctorType;
-    typedef typename FunctorType::QRSolver QRSolver;
-    typedef typename FunctorType::JacobianType JacobianType;
-    typedef typename JacobianType::Scalar Scalar;
-    typedef typename JacobianType::RealScalar RealScalar; 
-    typedef typename JacobianType::Index Index;
-    typedef typename QRSolver::Index PermIndex;
-    typedef Matrix<Scalar,Dynamic,1> FVectorType;
-    typedef PermutationMatrix<Dynamic,Dynamic> PermutationType;
-  public:
-    LevenbergMarquardt(FunctorType& functor) 
-    : m_functor(functor),m_nfev(0),m_njev(0),m_fnorm(0.0),m_gnorm(0),
-      m_isInitialized(false),m_info(InvalidInput)
-    {
-      resetParameters();
-      m_useExternalScaling=false; 
-    }
-    
-    LevenbergMarquardtSpace::Status minimize(FVectorType &x);
-    LevenbergMarquardtSpace::Status minimizeInit(FVectorType &x);
-    LevenbergMarquardtSpace::Status minimizeOneStep(FVectorType &x);
-    LevenbergMarquardtSpace::Status lmder1(
-      FVectorType  &x, 
-      const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
-    );
-    static LevenbergMarquardtSpace::Status lmdif1(
-            FunctorType &functor,
-            FVectorType  &x,
-            Index *nfev,
-            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
-            );
-    
-    /** Sets the default parameters */
-    void resetParameters() 
-    { 
-      m_factor = 100.; 
-      m_maxfev = 400; 
-      m_ftol = std::sqrt(NumTraits<RealScalar>::epsilon());
-      m_xtol = std::sqrt(NumTraits<RealScalar>::epsilon());
-      m_gtol = 0. ; 
-      m_epsfcn = 0. ;
-    }
-    
-    /** Sets the tolerance for the norm of the solution vector*/
-    void setXtol(RealScalar xtol) { m_xtol = xtol; }
-    
-    /** Sets the tolerance for the norm of the vector function*/
-    void setFtol(RealScalar ftol) { m_ftol = ftol; }
-    
-    /** Sets the tolerance for the norm of the gradient of the error vector*/
-    void setGtol(RealScalar gtol) { m_gtol = gtol; }
-    
-    /** Sets the step bound for the diagonal shift */
-    void setFactor(RealScalar factor) { m_factor = factor; }    
-    
-    /** Sets the error precision  */
-    void setEpsilon (RealScalar epsfcn) { m_epsfcn = epsfcn; }
-    
-    /** Sets the maximum number of function evaluation */
-    void setMaxfev(Index maxfev) {m_maxfev = maxfev; }
-    
-    /** Use an external Scaling. If set to true, pass a nonzero diagonal to diag() */
-    void setExternalScaling(bool value) {m_useExternalScaling  = value; }
-    
-    /** \returns a reference to the diagonal of the jacobian */
-    FVectorType& diag() {return m_diag; }
-    
-    /** \returns the number of iterations performed */
-    Index iterations() { return m_iter; }
-    
-    /** \returns the number of functions evaluation */
-    Index nfev() { return m_nfev; }
-    
-    /** \returns the number of jacobian evaluation */
-    Index njev() { return m_njev; }
-    
-    /** \returns the norm of current vector function */
-    RealScalar fnorm() {return m_fnorm; }
-    
-    /** \returns the norm of the gradient of the error */
-    RealScalar gnorm() {return m_gnorm; }
-    
-    /** \returns the LevenbergMarquardt parameter */
-    RealScalar lm_param(void) { return m_par; }
-    
-    /** \returns a reference to the  current vector function 
-     */
-    FVectorType& fvec() {return m_fvec; }
-    
-    /** \returns a reference to the matrix where the current Jacobian matrix is stored
-     */
-    JacobianType& jacobian() {return m_fjac; }
-    
-    /** \returns a reference to the triangular matrix R from the QR of the jacobian matrix.
-     * \sa jacobian()
-     */
-    JacobianType& matrixR() {return m_rfactor; }
-    
-    /** the permutation used in the QR factorization
-     */
-    PermutationType permutation() {return m_permutation; }
-    
-    /** 
-     * \brief Reports whether the minimization was successful
-     * \returns \c Success if the minimization was succesful,
-     *         \c NumericalIssue if a numerical problem arises during the 
-     *          minimization process, for exemple during the QR factorization
-     *         \c NoConvergence if the minimization did not converge after 
-     *          the maximum number of function evaluation allowed
-     *          \c InvalidInput if the input matrix is invalid
-     */
-    ComputationInfo info() const
-    {
-      
-      return m_info;
-    }
-  private:
-    JacobianType m_fjac; 
-    JacobianType m_rfactor; // The triangular matrix R from the QR of the jacobian matrix m_fjac
-    FunctorType &m_functor;
-    FVectorType m_fvec, m_qtf, m_diag; 
-    Index n;
-    Index m; 
-    Index m_nfev;
-    Index m_njev; 
-    RealScalar m_fnorm; // Norm of the current vector function
-    RealScalar m_gnorm; //Norm of the gradient of the error 
-    RealScalar m_factor; //
-    Index m_maxfev; // Maximum number of function evaluation
-    RealScalar m_ftol; //Tolerance in the norm of the vector function
-    RealScalar m_xtol; // 
-    RealScalar m_gtol; //tolerance of the norm of the error gradient
-    RealScalar m_epsfcn; //
-    Index m_iter; // Number of iterations performed
-    RealScalar m_delta;
-    bool m_useExternalScaling;
-    PermutationType m_permutation;
-    FVectorType m_wa1, m_wa2, m_wa3, m_wa4; //Temporary vectors
-    RealScalar m_par;
-    bool m_isInitialized; // Check whether the minimization step has been called
-    ComputationInfo m_info; 
+ * \ingroup NonLinearOptimization_Module
+ * \brief Performs non linear optimization over a non-linear function,
+ * using a variant of the Levenberg Marquardt algorithm.
+ *
+ * Check wikipedia for more information.
+ * http://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm
+ */
+template <typename FunctorType_>
+class LevenbergMarquardt : internal::no_assignment_operator {
+ public:
+  typedef FunctorType_ FunctorType;
+  typedef typename FunctorType::QRSolver QRSolver;
+  typedef typename FunctorType::JacobianType JacobianType;
+  typedef typename JacobianType::Scalar Scalar;
+  typedef typename JacobianType::RealScalar RealScalar;
+  typedef typename QRSolver::StorageIndex PermIndex;
+  typedef Matrix<Scalar, Dynamic, 1> FVectorType;
+  typedef PermutationMatrix<Dynamic, Dynamic, int> PermutationType;
+
+ public:
+  LevenbergMarquardt(FunctorType &functor)
+      : m_functor(functor),
+        m_nfev(0),
+        m_njev(0),
+        m_fnorm(0.0),
+        m_gnorm(0),
+        m_isInitialized(false),
+        m_info(InvalidInput) {
+    resetParameters();
+    m_useExternalScaling = false;
+  }
+
+  LevenbergMarquardtSpace::Status minimize(FVectorType &x);
+  LevenbergMarquardtSpace::Status minimizeInit(FVectorType &x);
+  LevenbergMarquardtSpace::Status minimizeOneStep(FVectorType &x);
+  LevenbergMarquardtSpace::Status lmder1(FVectorType &x, const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon()));
+  static LevenbergMarquardtSpace::Status lmdif1(FunctorType &functor, FVectorType &x, Index *nfev,
+                                                const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon()));
+
+  /** Sets the default parameters */
+  void resetParameters() {
+    using std::sqrt;
+
+    m_factor = 100.;
+    m_maxfev = 400;
+    m_ftol = sqrt(NumTraits<RealScalar>::epsilon());
+    m_xtol = sqrt(NumTraits<RealScalar>::epsilon());
+    m_gtol = 0.;
+    m_epsfcn = 0.;
+  }
+
+  /** Sets the tolerance for the norm of the solution vector*/
+  void setXtol(RealScalar xtol) { m_xtol = xtol; }
+
+  /** Sets the tolerance for the norm of the vector function*/
+  void setFtol(RealScalar ftol) { m_ftol = ftol; }
+
+  /** Sets the tolerance for the norm of the gradient of the error vector*/
+  void setGtol(RealScalar gtol) { m_gtol = gtol; }
+
+  /** Sets the step bound for the diagonal shift */
+  void setFactor(RealScalar factor) { m_factor = factor; }
+
+  /** Sets the error precision  */
+  void setEpsilon(RealScalar epsfcn) { m_epsfcn = epsfcn; }
+
+  /** Sets the maximum number of function evaluation */
+  void setMaxfev(Index maxfev) { m_maxfev = maxfev; }
+
+  /** Use an external Scaling. If set to true, pass a nonzero diagonal to diag() */
+  void setExternalScaling(bool value) { m_useExternalScaling = value; }
+
+  /** \returns the tolerance for the norm of the solution vector */
+  RealScalar xtol() const { return m_xtol; }
+
+  /** \returns the tolerance for the norm of the vector function */
+  RealScalar ftol() const { return m_ftol; }
+
+  /** \returns the tolerance for the norm of the gradient of the error vector */
+  RealScalar gtol() const { return m_gtol; }
+
+  /** \returns the step bound for the diagonal shift */
+  RealScalar factor() const { return m_factor; }
+
+  /** \returns the error precision */
+  RealScalar epsilon() const { return m_epsfcn; }
+
+  /** \returns the maximum number of function evaluation */
+  Index maxfev() const { return m_maxfev; }
+
+  /** \returns a reference to the diagonal of the jacobian */
+  FVectorType &diag() { return m_diag; }
+
+  /** \returns the number of iterations performed */
+  Index iterations() { return m_iter; }
+
+  /** \returns the number of functions evaluation */
+  Index nfev() { return m_nfev; }
+
+  /** \returns the number of jacobian evaluation */
+  Index njev() { return m_njev; }
+
+  /** \returns the norm of current vector function */
+  RealScalar fnorm() { return m_fnorm; }
+
+  /** \returns the norm of the gradient of the error */
+  RealScalar gnorm() { return m_gnorm; }
+
+  /** \returns the LevenbergMarquardt parameter */
+  RealScalar lm_param(void) { return m_par; }
+
+  /** \returns a reference to the  current vector function
+   */
+  FVectorType &fvec() { return m_fvec; }
+
+  /** \returns a reference to the matrix where the current Jacobian matrix is stored
+   */
+  JacobianType &jacobian() { return m_fjac; }
+
+  /** \returns a reference to the triangular matrix R from the QR of the jacobian matrix.
+   * \sa jacobian()
+   */
+  JacobianType &matrixR() { return m_rfactor; }
+
+  /** the permutation used in the QR factorization
+   */
+  PermutationType permutation() { return m_permutation; }
+
+  /**
+   * \brief Reports whether the minimization was successful
+   * \returns \c Success if the minimization was successful,
+   *         \c NumericalIssue if a numerical problem arises during the
+   *          minimization process, for example during the QR factorization
+   *         \c NoConvergence if the minimization did not converge after
+   *          the maximum number of function evaluation allowed
+   *          \c InvalidInput if the input matrix is invalid
+   */
+  ComputationInfo info() const { return m_info; }
+
+ private:
+  JacobianType m_fjac;
+  JacobianType m_rfactor;  // The triangular matrix R from the QR of the jacobian matrix m_fjac
+  FunctorType &m_functor;
+  FVectorType m_fvec, m_qtf, m_diag;
+  Index n;
+  Index m;
+  Index m_nfev;
+  Index m_njev;
+  RealScalar m_fnorm;   // Norm of the current vector function
+  RealScalar m_gnorm;   // Norm of the gradient of the error
+  RealScalar m_factor;  //
+  Index m_maxfev;       // Maximum number of function evaluation
+  RealScalar m_ftol;    // Tolerance in the norm of the vector function
+  RealScalar m_xtol;    //
+  RealScalar m_gtol;    // tolerance of the norm of the error gradient
+  RealScalar m_epsfcn;  //
+  Index m_iter;         // Number of iterations performed
+  RealScalar m_delta;
+  bool m_useExternalScaling;
+  PermutationType m_permutation;
+  FVectorType m_wa1, m_wa2, m_wa3, m_wa4;  // Temporary vectors
+  RealScalar m_par;
+  bool m_isInitialized;  // Check whether the minimization step has been called
+  ComputationInfo m_info;
 };
 
-template<typename FunctorType>
-LevenbergMarquardtSpace::Status
-LevenbergMarquardt<FunctorType>::minimize(FVectorType  &x)
-{
-    LevenbergMarquardtSpace::Status status = minimizeInit(x);
-    if (status==LevenbergMarquardtSpace::ImproperInputParameters) {
-      m_isInitialized = true;
-      return status;
-    }
-    do {
-//       std::cout << " uv " << x.transpose() << "\n";
-        status = minimizeOneStep(x);
-    } while (status==LevenbergMarquardtSpace::Running);
-     m_isInitialized = true;
-     return status;
+template <typename FunctorType>
+LevenbergMarquardtSpace::Status LevenbergMarquardt<FunctorType>::minimize(FVectorType &x) {
+  LevenbergMarquardtSpace::Status status = minimizeInit(x);
+  if (status == LevenbergMarquardtSpace::ImproperInputParameters) {
+    m_isInitialized = true;
+    return status;
+  }
+  do {
+    //       std::cout << " uv " << x.transpose() << "\n";
+    status = minimizeOneStep(x);
+  } while (status == LevenbergMarquardtSpace::Running);
+  m_isInitialized = true;
+  return status;
 }
 
-template<typename FunctorType>
-LevenbergMarquardtSpace::Status
-LevenbergMarquardt<FunctorType>::minimizeInit(FVectorType  &x)
-{
-    n = x.size();
-    m = m_functor.values();
-
-    m_wa1.resize(n); m_wa2.resize(n); m_wa3.resize(n);
-    m_wa4.resize(m);
-    m_fvec.resize(m);
-    //FIXME Sparse Case : Allocate space for the jacobian
-    m_fjac.resize(m, n);
-//     m_fjac.reserve(VectorXi::Constant(n,5)); // FIXME Find a better alternative
-    if (!m_useExternalScaling)
-        m_diag.resize(n);
-    eigen_assert( (!m_useExternalScaling || m_diag.size()==n) || "When m_useExternalScaling is set, the caller must provide a valid 'm_diag'");
-    m_qtf.resize(n);
-
-    /* Function Body */
-    m_nfev = 0;
-    m_njev = 0;
-
-    /*     check the input parameters for errors. */
-    if (n <= 0 || m < n || m_ftol < 0. || m_xtol < 0. || m_gtol < 0. || m_maxfev <= 0 || m_factor <= 0.){
-      m_info = InvalidInput;
-      return LevenbergMarquardtSpace::ImproperInputParameters;
-    }
-
-    if (m_useExternalScaling)
-        for (Index j = 0; j < n; ++j)
-            if (m_diag[j] <= 0.) 
-            {
-              m_info = InvalidInput;
-              return LevenbergMarquardtSpace::ImproperInputParameters;
-            }
-
-    /*     evaluate the function at the starting point */
-    /*     and calculate its norm. */
-    m_nfev = 1;
-    if ( m_functor(x, m_fvec) < 0)
-        return LevenbergMarquardtSpace::UserAsked;
-    m_fnorm = m_fvec.stableNorm();
-
-    /*     initialize levenberg-marquardt parameter and iteration counter. */
-    m_par = 0.;
-    m_iter = 1;
-
-    return LevenbergMarquardtSpace::NotStarted;
-}
+template <typename FunctorType>
+LevenbergMarquardtSpace::Status LevenbergMarquardt<FunctorType>::minimizeInit(FVectorType &x) {
+  n = x.size();
+  m = m_functor.values();
 
-template<typename FunctorType>
-LevenbergMarquardtSpace::Status
-LevenbergMarquardt<FunctorType>::lmder1(
-        FVectorType  &x,
-        const Scalar tol
-        )
-{
-    n = x.size();
-    m = m_functor.values();
-
-    /* check the input parameters for errors. */
-    if (n <= 0 || m < n || tol < 0.)
+  m_wa1.resize(n);
+  m_wa2.resize(n);
+  m_wa3.resize(n);
+  m_wa4.resize(m);
+  m_fvec.resize(m);
+  // FIXME Sparse Case : Allocate space for the jacobian
+  m_fjac.resize(m, n);
+  //     m_fjac.reserve(VectorXi::Constant(n,5)); // FIXME Find a better alternative
+  if (!m_useExternalScaling) m_diag.resize(n);
+  eigen_assert((!m_useExternalScaling || m_diag.size() == n) &&
+               "When m_useExternalScaling is set, the caller must provide a valid 'm_diag'");
+  m_qtf.resize(n);
+
+  /* Function Body */
+  m_nfev = 0;
+  m_njev = 0;
+
+  /*     check the input parameters for errors. */
+  if (n <= 0 || m < n || m_ftol < 0. || m_xtol < 0. || m_gtol < 0. || m_maxfev <= 0 || m_factor <= 0.) {
+    m_info = InvalidInput;
+    return LevenbergMarquardtSpace::ImproperInputParameters;
+  }
+
+  if (m_useExternalScaling)
+    for (Index j = 0; j < n; ++j)
+      if (m_diag[j] <= 0.) {
+        m_info = InvalidInput;
         return LevenbergMarquardtSpace::ImproperInputParameters;
+      }
 
-    resetParameters();
-    m_ftol = tol;
-    m_xtol = tol;
-    m_maxfev = 100*(n+1);
+  /*     evaluate the function at the starting point */
+  /*     and calculate its norm. */
+  m_nfev = 1;
+  if (m_functor(x, m_fvec) < 0) return LevenbergMarquardtSpace::UserAsked;
+  m_fnorm = m_fvec.stableNorm();
+
+  /*     initialize levenberg-marquardt parameter and iteration counter. */
+  m_par = 0.;
+  m_iter = 1;
 
-    return minimize(x);
+  return LevenbergMarquardtSpace::NotStarted;
 }
 
+template <typename FunctorType>
+LevenbergMarquardtSpace::Status LevenbergMarquardt<FunctorType>::lmder1(FVectorType &x, const Scalar tol) {
+  n = x.size();
+  m = m_functor.values();
 
-template<typename FunctorType>
-LevenbergMarquardtSpace::Status
-LevenbergMarquardt<FunctorType>::lmdif1(
-        FunctorType &functor,
-        FVectorType  &x,
-        Index *nfev,
-        const Scalar tol
-        )
-{
-    Index n = x.size();
-    Index m = functor.values();
-
-    /* check the input parameters for errors. */
-    if (n <= 0 || m < n || tol < 0.)
-        return LevenbergMarquardtSpace::ImproperInputParameters;
+  /* check the input parameters for errors. */
+  if (n <= 0 || m < n || tol < 0.) return LevenbergMarquardtSpace::ImproperInputParameters;
+
+  resetParameters();
+  m_ftol = tol;
+  m_xtol = tol;
+  m_maxfev = 100 * (n + 1);
+
+  return minimize(x);
+}
+
+template <typename FunctorType>
+LevenbergMarquardtSpace::Status LevenbergMarquardt<FunctorType>::lmdif1(FunctorType &functor, FVectorType &x,
+                                                                        Index *nfev, const Scalar tol) {
+  Index n = x.size();
+  Index m = functor.values();
+
+  /* check the input parameters for errors. */
+  if (n <= 0 || m < n || tol < 0.) return LevenbergMarquardtSpace::ImproperInputParameters;
+
+  NumericalDiff<FunctorType> numDiff(functor);
+  // embedded LevenbergMarquardt
+  LevenbergMarquardt<NumericalDiff<FunctorType> > lm(numDiff);
+  lm.setFtol(tol);
+  lm.setXtol(tol);
+  lm.setMaxfev(200 * (n + 1));
 
-    NumericalDiff<FunctorType> numDiff(functor);
-    // embedded LevenbergMarquardt
-    LevenbergMarquardt<NumericalDiff<FunctorType> > lm(numDiff);
-    lm.setFtol(tol);
-    lm.setXtol(tol);
-    lm.setMaxfev(200*(n+1));
-
-    LevenbergMarquardtSpace::Status info = LevenbergMarquardtSpace::Status(lm.minimize(x));
-    if (nfev)
-        * nfev = lm.nfev();
-    return info;
+  LevenbergMarquardtSpace::Status info = LevenbergMarquardtSpace::Status(lm.minimize(x));
+  if (nfev) *nfev = lm.nfev();
+  return info;
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_LEVENBERGMARQUARDT_H
+#endif  // EIGEN_LEVENBERGMARQUARDT_H
diff --git a/inst/include/unsupported/Eigen/src/MatrixFunctions/InternalHeaderCheck.h b/inst/include/unsupported/Eigen/src/MatrixFunctions/InternalHeaderCheck.h
new file mode 100644
index 00000000..4fc840ef
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/MatrixFunctions/InternalHeaderCheck.h
@@ -0,0 +1,4 @@
+#ifndef EIGEN_MATRIX_FUNCTIONS_MODULE_H
+#error \
+    "Please include unsupported/Eigen/MatrixFunctions instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h b/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
index 6825a788..f4e0428e 100644
--- a/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
+++ b/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixExponential.h
@@ -1,8 +1,8 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009, 2010 Jitse Niesen <jitse@maths.leeds.ac.uk>
-// Copyright (C) 2011 Chen-Pang He <jdh8@ms63.hinet.net>
+// Copyright (C) 2009, 2010, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2011, 2013 Chen-Pang He <jdh8@ms63.hinet.net>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -13,439 +13,445 @@
 
 #include "StemFunction.h"
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
+namespace internal {
 
-/** \ingroup MatrixFunctions_Module
-  * \brief Class for computing the matrix exponential.
-  * \tparam MatrixType type of the argument of the exponential,
-  * expected to be an instantiation of the Matrix class template.
-  */
-template <typename MatrixType>
-class MatrixExponential {
-
-  public:
-
-    /** \brief Constructor.
-      * 
-      * The class stores a reference to \p M, so it should not be
-      * changed (or destroyed) before compute() is called.
-      *
-      * \param[in] M  matrix whose exponential is to be computed.
-      */
-    MatrixExponential(const MatrixType &M);
-
-    /** \brief Computes the matrix exponential.
-      *
-      * \param[out] result  the matrix exponential of \p M in the constructor.
-      */
-    template <typename ResultType> 
-    void compute(ResultType &result);
-
-  private:
-
-    // Prevent copying
-    MatrixExponential(const MatrixExponential&);
-    MatrixExponential& operator=(const MatrixExponential&);
-
-    /** \brief Compute the (3,3)-Pad&eacute; approximant to the exponential.
-     *
-     *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
-     *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
-     *
-     *  \param[in] A   Argument of matrix exponential
-     */
-    void pade3(const MatrixType &A);
-
-    /** \brief Compute the (5,5)-Pad&eacute; approximant to the exponential.
-     *
-     *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
-     *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
-     *
-     *  \param[in] A   Argument of matrix exponential
-     */
-    void pade5(const MatrixType &A);
-
-    /** \brief Compute the (7,7)-Pad&eacute; approximant to the exponential.
-     *
-     *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
-     *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
-     *
-     *  \param[in] A   Argument of matrix exponential
-     */
-    void pade7(const MatrixType &A);
-
-    /** \brief Compute the (9,9)-Pad&eacute; approximant to the exponential.
-     *
-     *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
-     *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
-     *
-     *  \param[in] A   Argument of matrix exponential
-     */
-    void pade9(const MatrixType &A);
-
-    /** \brief Compute the (13,13)-Pad&eacute; approximant to the exponential.
-     *
-     *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
-     *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
-     *
-     *  \param[in] A   Argument of matrix exponential
-     */
-    void pade13(const MatrixType &A);
-
-    /** \brief Compute the (17,17)-Pad&eacute; approximant to the exponential.
-     *
-     *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
-     *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
-     *
-     *  This function activates only if your long double is double-double or quadruple.
-     *
-     *  \param[in] A   Argument of matrix exponential
-     */
-    void pade17(const MatrixType &A);
-
-    /** \brief Compute Pad&eacute; approximant to the exponential.
-     *
-     * Computes \c m_U, \c m_V and \c m_squarings such that
-     * \f$ (V+U)(V-U)^{-1} \f$ is a Pad&eacute; of
-     * \f$ \exp(2^{-\mbox{squarings}}M) \f$ around \f$ M = 0 \f$. The
-     * degree of the Pad&eacute; approximant and the value of
-     * squarings are chosen such that the approximation error is no
-     * more than the round-off error.
-     *
-     * The argument of this function should correspond with the (real
-     * part of) the entries of \c m_M.  It is used to select the
-     * correct implementation using overloading.
-     */
-    void computeUV(double);
-
-    /** \brief Compute Pad&eacute; approximant to the exponential.
-     *
-     *  \sa computeUV(double);
-     */
-    void computeUV(float);
-    
-    /** \brief Compute Pad&eacute; approximant to the exponential.
-     *
-     *  \sa computeUV(double);
-     */
-    void computeUV(long double);
-
-    typedef typename internal::traits<MatrixType>::Scalar Scalar;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename std::complex<RealScalar> ComplexScalar;
-
-    /** \brief Reference to matrix whose exponential is to be computed. */
-    typename internal::nested<MatrixType>::type m_M;
-
-    /** \brief Odd-degree terms in numerator of Pad&eacute; approximant. */
-    MatrixType m_U;
-
-    /** \brief Even-degree terms in numerator of Pad&eacute; approximant. */
-    MatrixType m_V;
-
-    /** \brief Used for temporary storage. */
-    MatrixType m_tmp1;
-
-    /** \brief Used for temporary storage. */
-    MatrixType m_tmp2;
-
-    /** \brief Identity matrix of the same size as \c m_M. */
-    MatrixType m_Id;
-
-    /** \brief Number of squarings required in the last step. */
-    int m_squarings;
-
-    /** \brief L1 norm of m_M. */
-    RealScalar m_l1norm;
-};
+/** \brief Scaling operator.
+ *
+ * This struct is used by CwiseUnaryOp to scale a matrix by \f$ 2^{-s} \f$.
+ */
+template <typename Scalar, bool IsComplex = NumTraits<Scalar>::IsComplex>
+struct MatrixExponentialScalingOp {
+  using RealScalar = typename NumTraits<Scalar>::Real;
+
+  /** \brief Constructor.
+   *
+   * \param[in] squarings  The integer \f$ s \f$ in this document.
+   */
+  MatrixExponentialScalingOp(int squarings) : m_squarings(squarings) {}
+
+  /** \brief Scale a matrix coefficient.
+   *
+   * \param[in,out] x  The scalar to be scaled, becoming \f$ 2^{-s} x \f$.
+   */
+  inline const Scalar operator()(const Scalar& x) const {
+    using std::ldexp;
+    return Scalar(ldexp(Eigen::numext::real(x), -m_squarings), ldexp(Eigen::numext::imag(x), -m_squarings));
+  }
 
-template <typename MatrixType>
-MatrixExponential<MatrixType>::MatrixExponential(const MatrixType &M) :
-  m_M(M),
-  m_U(M.rows(),M.cols()),
-  m_V(M.rows(),M.cols()),
-  m_tmp1(M.rows(),M.cols()),
-  m_tmp2(M.rows(),M.cols()),
-  m_Id(MatrixType::Identity(M.rows(), M.cols())),
-  m_squarings(0),
-  m_l1norm(M.cwiseAbs().colwise().sum().maxCoeff())
-{
-  /* empty body */
-}
+ private:
+  int m_squarings;
+};
 
-template <typename MatrixType>
-template <typename ResultType> 
-void MatrixExponential<MatrixType>::compute(ResultType &result)
-{
-#if LDBL_MANT_DIG > 112 // rarely happens
-  if(sizeof(RealScalar) > 14) {
-    result = m_M.matrixFunction(StdStemFunctions<ComplexScalar>::exp);
-    return;
+template <typename Scalar>
+struct MatrixExponentialScalingOp<Scalar, /*IsComplex=*/false> {
+  /** \brief Constructor.
+   *
+   * \param[in] squarings  The integer \f$ s \f$ in this document.
+   */
+  MatrixExponentialScalingOp(int squarings) : m_squarings(squarings) {}
+
+  /** \brief Scale a matrix coefficient.
+   *
+   * \param[in,out] x  The scalar to be scaled, becoming \f$ 2^{-s} x \f$.
+   */
+  inline const Scalar operator()(const Scalar& x) const {
+    using std::ldexp;
+    return ldexp(x, -m_squarings);
   }
-#endif
-  computeUV(RealScalar());
-  m_tmp1 = m_U + m_V;   // numerator of Pade approximant
-  m_tmp2 = -m_U + m_V;  // denominator of Pade approximant
-  result = m_tmp2.partialPivLu().solve(m_tmp1);
-  for (int i=0; i<m_squarings; i++)
-    result *= result;   // undo scaling by repeated squaring
-}
 
-template <typename MatrixType>
-EIGEN_STRONG_INLINE void MatrixExponential<MatrixType>::pade3(const MatrixType &A)
-{
-  const RealScalar b[] = {120., 60., 12., 1.};
-  m_tmp1.noalias() = A * A;
-  m_tmp2 = b[3]*m_tmp1 + b[1]*m_Id;
-  m_U.noalias() = A * m_tmp2;
-  m_V = b[2]*m_tmp1 + b[0]*m_Id;
+ private:
+  int m_squarings;
+};
+
+/** \brief Compute the (3,3)-Pad&eacute; approximant to the exponential.
+ *
+ *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade3(const MatA& A, MatU& U, MatV& V) {
+  typedef typename MatA::PlainObject MatrixType;
+  typedef typename NumTraits<typename traits<MatA>::Scalar>::Real RealScalar;
+  const RealScalar b[] = {120.L, 60.L, 12.L, 1.L};
+  const MatrixType A2 = A * A;
+  const MatrixType tmp = b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
+  U.noalias() = A * tmp;
+  V = b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
 }
 
-template <typename MatrixType>
-EIGEN_STRONG_INLINE void MatrixExponential<MatrixType>::pade5(const MatrixType &A)
-{
-  const RealScalar b[] = {30240., 15120., 3360., 420., 30., 1.};
-  MatrixType A2 = A * A;
-  m_tmp1.noalias() = A2 * A2;
-  m_tmp2 = b[5]*m_tmp1 + b[3]*A2 + b[1]*m_Id;
-  m_U.noalias() = A * m_tmp2;
-  m_V = b[4]*m_tmp1 + b[2]*A2 + b[0]*m_Id;
+/** \brief Compute the (5,5)-Pad&eacute; approximant to the exponential.
+ *
+ *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade5(const MatA& A, MatU& U, MatV& V) {
+  typedef typename MatA::PlainObject MatrixType;
+  typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+  const RealScalar b[] = {30240.L, 15120.L, 3360.L, 420.L, 30.L, 1.L};
+  const MatrixType A2 = A * A;
+  const MatrixType A4 = A2 * A2;
+  const MatrixType tmp = b[5] * A4 + b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
+  U.noalias() = A * tmp;
+  V = b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
 }
 
-template <typename MatrixType>
-EIGEN_STRONG_INLINE void MatrixExponential<MatrixType>::pade7(const MatrixType &A)
-{
-  const RealScalar b[] = {17297280., 8648640., 1995840., 277200., 25200., 1512., 56., 1.};
-  MatrixType A2 = A * A;
-  MatrixType A4 = A2 * A2;
-  m_tmp1.noalias() = A4 * A2;
-  m_tmp2 = b[7]*m_tmp1 + b[5]*A4 + b[3]*A2 + b[1]*m_Id;
-  m_U.noalias() = A * m_tmp2;
-  m_V = b[6]*m_tmp1 + b[4]*A4 + b[2]*A2 + b[0]*m_Id;
+/** \brief Compute the (7,7)-Pad&eacute; approximant to the exponential.
+ *
+ *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade7(const MatA& A, MatU& U, MatV& V) {
+  typedef typename MatA::PlainObject MatrixType;
+  typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+  const RealScalar b[] = {17297280.L, 8648640.L, 1995840.L, 277200.L, 25200.L, 1512.L, 56.L, 1.L};
+  const MatrixType A2 = A * A;
+  const MatrixType A4 = A2 * A2;
+  const MatrixType A6 = A4 * A2;
+  const MatrixType tmp = b[7] * A6 + b[5] * A4 + b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
+  U.noalias() = A * tmp;
+  V = b[6] * A6 + b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
 }
 
-template <typename MatrixType>
-EIGEN_STRONG_INLINE void MatrixExponential<MatrixType>::pade9(const MatrixType &A)
-{
-  const RealScalar b[] = {17643225600., 8821612800., 2075673600., 302702400., 30270240.,
-		      2162160., 110880., 3960., 90., 1.};
-  MatrixType A2 = A * A;
-  MatrixType A4 = A2 * A2;
-  MatrixType A6 = A4 * A2;
-  m_tmp1.noalias() = A6 * A2;
-  m_tmp2 = b[9]*m_tmp1 + b[7]*A6 + b[5]*A4 + b[3]*A2 + b[1]*m_Id;
-  m_U.noalias() = A * m_tmp2;
-  m_V = b[8]*m_tmp1 + b[6]*A6 + b[4]*A4 + b[2]*A2 + b[0]*m_Id;
+/** \brief Compute the (9,9)-Pad&eacute; approximant to the exponential.
+ *
+ *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade9(const MatA& A, MatU& U, MatV& V) {
+  typedef typename MatA::PlainObject MatrixType;
+  typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+  const RealScalar b[] = {17643225600.L, 8821612800.L, 2075673600.L, 302702400.L, 30270240.L,
+                          2162160.L,     110880.L,     3960.L,       90.L,        1.L};
+  const MatrixType A2 = A * A;
+  const MatrixType A4 = A2 * A2;
+  const MatrixType A6 = A4 * A2;
+  const MatrixType A8 = A6 * A2;
+  const MatrixType tmp =
+      b[9] * A8 + b[7] * A6 + b[5] * A4 + b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
+  U.noalias() = A * tmp;
+  V = b[8] * A8 + b[6] * A6 + b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
 }
 
-template <typename MatrixType>
-EIGEN_STRONG_INLINE void MatrixExponential<MatrixType>::pade13(const MatrixType &A)
-{
-  const RealScalar b[] = {64764752532480000., 32382376266240000., 7771770303897600.,
-		      1187353796428800., 129060195264000., 10559470521600., 670442572800.,
-		      33522128640., 1323241920., 40840800., 960960., 16380., 182., 1.};
-  MatrixType A2 = A * A;
-  MatrixType A4 = A2 * A2;
-  m_tmp1.noalias() = A4 * A2;
-  m_V = b[13]*m_tmp1 + b[11]*A4 + b[9]*A2; // used for temporary storage
-  m_tmp2.noalias() = m_tmp1 * m_V;
-  m_tmp2 += b[7]*m_tmp1 + b[5]*A4 + b[3]*A2 + b[1]*m_Id;
-  m_U.noalias() = A * m_tmp2;
-  m_tmp2 = b[12]*m_tmp1 + b[10]*A4 + b[8]*A2;
-  m_V.noalias() = m_tmp1 * m_tmp2;
-  m_V += b[6]*m_tmp1 + b[4]*A4 + b[2]*A2 + b[0]*m_Id;
+/** \brief Compute the (13,13)-Pad&eacute; approximant to the exponential.
+ *
+ *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ */
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade13(const MatA& A, MatU& U, MatV& V) {
+  typedef typename MatA::PlainObject MatrixType;
+  typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+  const RealScalar b[] = {64764752532480000.L,
+                          32382376266240000.L,
+                          7771770303897600.L,
+                          1187353796428800.L,
+                          129060195264000.L,
+                          10559470521600.L,
+                          670442572800.L,
+                          33522128640.L,
+                          1323241920.L,
+                          40840800.L,
+                          960960.L,
+                          16380.L,
+                          182.L,
+                          1.L};
+  const MatrixType A2 = A * A;
+  const MatrixType A4 = A2 * A2;
+  const MatrixType A6 = A4 * A2;
+  V = b[13] * A6 + b[11] * A4 + b[9] * A2;  // used for temporary storage
+  MatrixType tmp = A6 * V;
+  tmp += b[7] * A6 + b[5] * A4 + b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
+  U.noalias() = A * tmp;
+  tmp = b[12] * A6 + b[10] * A4 + b[8] * A2;
+  V.noalias() = A6 * tmp;
+  V += b[6] * A6 + b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
 }
 
+/** \brief Compute the (17,17)-Pad&eacute; approximant to the exponential.
+ *
+ *  After exit, \f$ (V+U)(V-U)^{-1} \f$ is the Pad&eacute;
+ *  approximant of \f$ \exp(A) \f$ around \f$ A = 0 \f$.
+ *
+ *  This function activates only if your long double is double-double or quadruple.
+ */
 #if LDBL_MANT_DIG > 64
-template <typename MatrixType>
-EIGEN_STRONG_INLINE void MatrixExponential<MatrixType>::pade17(const MatrixType &A)
-{
-  const RealScalar b[] = {830034394580628357120000.L, 415017197290314178560000.L,
-		      100610229646136770560000.L, 15720348382208870400000.L,
-		      1774878043152614400000.L, 153822763739893248000.L, 10608466464820224000.L,
-		      595373117923584000.L, 27563570274240000.L, 1060137318240000.L,
-		      33924394183680.L, 899510451840.L, 19554575040.L, 341863200.L, 4651200.L,
-		      46512.L, 306.L, 1.L};
-  MatrixType A2 = A * A;
-  MatrixType A4 = A2 * A2;
-  MatrixType A6 = A4 * A2;
-  m_tmp1.noalias() = A4 * A4;
-  m_V = b[17]*m_tmp1 + b[15]*A6 + b[13]*A4 + b[11]*A2; // used for temporary storage
-  m_tmp2.noalias() = m_tmp1 * m_V;
-  m_tmp2 += b[9]*m_tmp1 + b[7]*A6 + b[5]*A4 + b[3]*A2 + b[1]*m_Id;
-  m_U.noalias() = A * m_tmp2;
-  m_tmp2 = b[16]*m_tmp1 + b[14]*A6 + b[12]*A4 + b[10]*A2;
-  m_V.noalias() = m_tmp1 * m_tmp2;
-  m_V += b[8]*m_tmp1 + b[6]*A6 + b[4]*A4 + b[2]*A2 + b[0]*m_Id;
+template <typename MatA, typename MatU, typename MatV>
+void matrix_exp_pade17(const MatA& A, MatU& U, MatV& V) {
+  typedef typename MatA::PlainObject MatrixType;
+  typedef typename NumTraits<typename traits<MatrixType>::Scalar>::Real RealScalar;
+  const RealScalar b[] = {830034394580628357120000.L,
+                          415017197290314178560000.L,
+                          100610229646136770560000.L,
+                          15720348382208870400000.L,
+                          1774878043152614400000.L,
+                          153822763739893248000.L,
+                          10608466464820224000.L,
+                          595373117923584000.L,
+                          27563570274240000.L,
+                          1060137318240000.L,
+                          33924394183680.L,
+                          899510451840.L,
+                          19554575040.L,
+                          341863200.L,
+                          4651200.L,
+                          46512.L,
+                          306.L,
+                          1.L};
+  const MatrixType A2 = A * A;
+  const MatrixType A4 = A2 * A2;
+  const MatrixType A6 = A4 * A2;
+  const MatrixType A8 = A4 * A4;
+  V = b[17] * A8 + b[15] * A6 + b[13] * A4 + b[11] * A2;  // used for temporary storage
+  MatrixType tmp = A8 * V;
+  tmp += b[9] * A8 + b[7] * A6 + b[5] * A4 + b[3] * A2 + b[1] * MatrixType::Identity(A.rows(), A.cols());
+  U.noalias() = A * tmp;
+  tmp = b[16] * A8 + b[14] * A6 + b[12] * A4 + b[10] * A2;
+  V.noalias() = tmp * A8;
+  V += b[8] * A8 + b[6] * A6 + b[4] * A4 + b[2] * A2 + b[0] * MatrixType::Identity(A.rows(), A.cols());
 }
 #endif
 
+template <typename MatrixType, typename RealScalar = typename NumTraits<typename traits<MatrixType>::Scalar>::Real>
+struct matrix_exp_computeUV {
+  /** \brief Compute Pad&eacute; approximant to the exponential.
+   *
+   * Computes \c U, \c V and \c squarings such that \f$ (V+U)(V-U)^{-1} \f$ is a Pad&eacute;
+   * approximant of \f$ \exp(2^{-\mbox{squarings}}M) \f$ around \f$ M = 0 \f$, where \f$ M \f$
+   * denotes the matrix \c arg. The degree of the Pad&eacute; approximant and the value of squarings
+   * are chosen such that the approximation error is no more than the round-off error.
+   */
+  static void run(const MatrixType& arg, MatrixType& U, MatrixType& V, int& squarings);
+};
+
 template <typename MatrixType>
-void MatrixExponential<MatrixType>::computeUV(float)
-{
-  using std::frexp;
-  using std::pow;
-  if (m_l1norm < 4.258730016922831e-001) {
-    pade3(m_M);
-  } else if (m_l1norm < 1.880152677804762e+000) {
-    pade5(m_M);
-  } else {
-    const float maxnorm = 3.925724783138660f;
-    frexp(m_l1norm / maxnorm, &m_squarings);
-    if (m_squarings < 0) m_squarings = 0;
-    MatrixType A = m_M / pow(Scalar(2), m_squarings);
-    pade7(A);
+struct matrix_exp_computeUV<MatrixType, float> {
+  using Scalar = typename traits<MatrixType>::Scalar;
+  template <typename ArgType>
+  static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings) {
+    using std::frexp;
+    using std::pow;
+    const float l1norm = arg.cwiseAbs().colwise().sum().maxCoeff();
+    squarings = 0;
+    if (l1norm < 4.258730016922831e-001f) {
+      matrix_exp_pade3(arg, U, V);
+    } else if (l1norm < 1.880152677804762e+000f) {
+      matrix_exp_pade5(arg, U, V);
+    } else {
+      const float maxnorm = 3.925724783138660f;
+      frexp(l1norm / maxnorm, &squarings);
+      if (squarings < 0) squarings = 0;
+      MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<Scalar>(squarings));
+      matrix_exp_pade7(A, U, V);
+    }
   }
-}
+};
 
 template <typename MatrixType>
-void MatrixExponential<MatrixType>::computeUV(double)
-{
-  using std::frexp;
-  using std::pow;
-  if (m_l1norm < 1.495585217958292e-002) {
-    pade3(m_M);
-  } else if (m_l1norm < 2.539398330063230e-001) {
-    pade5(m_M);
-  } else if (m_l1norm < 9.504178996162932e-001) {
-    pade7(m_M);
-  } else if (m_l1norm < 2.097847961257068e+000) {
-    pade9(m_M);
-  } else {
-    const double maxnorm = 5.371920351148152;
-    frexp(m_l1norm / maxnorm, &m_squarings);
-    if (m_squarings < 0) m_squarings = 0;
-    MatrixType A = m_M / pow(Scalar(2), m_squarings);
-    pade13(A);
+struct matrix_exp_computeUV<MatrixType, double> {
+  using Scalar = typename traits<MatrixType>::Scalar;
+  template <typename ArgType>
+  static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings) {
+    using std::frexp;
+    using std::pow;
+    const double l1norm = arg.cwiseAbs().colwise().sum().maxCoeff();
+    squarings = 0;
+    if (l1norm < 1.495585217958292e-002) {
+      matrix_exp_pade3(arg, U, V);
+    } else if (l1norm < 2.539398330063230e-001) {
+      matrix_exp_pade5(arg, U, V);
+    } else if (l1norm < 9.504178996162932e-001) {
+      matrix_exp_pade7(arg, U, V);
+    } else if (l1norm < 2.097847961257068e+000) {
+      matrix_exp_pade9(arg, U, V);
+    } else {
+      const double maxnorm = 5.371920351148152;
+      frexp(l1norm / maxnorm, &squarings);
+      if (squarings < 0) squarings = 0;
+      MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<Scalar>(squarings));
+      matrix_exp_pade13(A, U, V);
+    }
   }
-}
+};
 
 template <typename MatrixType>
-void MatrixExponential<MatrixType>::computeUV(long double)
-{
-  using std::frexp;
-  using std::pow;
-#if   LDBL_MANT_DIG == 53   // double precision
-  computeUV(double());
-#elif LDBL_MANT_DIG <= 64   // extended precision
-  if (m_l1norm < 4.1968497232266989671e-003L) {
-    pade3(m_M);
-  } else if (m_l1norm < 1.1848116734693823091e-001L) {
-    pade5(m_M);
-  } else if (m_l1norm < 5.5170388480686700274e-001L) {
-    pade7(m_M);
-  } else if (m_l1norm < 1.3759868875587845383e+000L) {
-    pade9(m_M);
-  } else {
-    const long double maxnorm = 4.0246098906697353063L;
-    frexp(m_l1norm / maxnorm, &m_squarings);
-    if (m_squarings < 0) m_squarings = 0;
-    MatrixType A = m_M / pow(Scalar(2), m_squarings);
-    pade13(A);
-  }
+struct matrix_exp_computeUV<MatrixType, long double> {
+  template <typename ArgType>
+  static void run(const ArgType& arg, MatrixType& U, MatrixType& V, int& squarings) {
+#if LDBL_MANT_DIG == 53  // double precision
+    matrix_exp_computeUV<MatrixType, double>::run(arg, U, V, squarings);
+
+#else
+
+    using Scalar = typename traits<MatrixType>::Scalar;
+
+    using std::frexp;
+    using std::pow;
+    const long double l1norm = arg.cwiseAbs().colwise().sum().maxCoeff();
+    squarings = 0;
+
+#if LDBL_MANT_DIG <= 64  // extended precision
+
+    if (l1norm < 4.1968497232266989671e-003L) {
+      matrix_exp_pade3(arg, U, V);
+    } else if (l1norm < 1.1848116734693823091e-001L) {
+      matrix_exp_pade5(arg, U, V);
+    } else if (l1norm < 5.5170388480686700274e-001L) {
+      matrix_exp_pade7(arg, U, V);
+    } else if (l1norm < 1.3759868875587845383e+000L) {
+      matrix_exp_pade9(arg, U, V);
+    } else {
+      const long double maxnorm = 4.0246098906697353063L;
+      frexp(l1norm / maxnorm, &squarings);
+      if (squarings < 0) squarings = 0;
+      MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<Scalar>(squarings));
+      matrix_exp_pade13(A, U, V);
+    }
+
 #elif LDBL_MANT_DIG <= 106  // double-double
-  if (m_l1norm < 3.2787892205607026992947488108213e-005L) {
-    pade3(m_M);
-  } else if (m_l1norm < 6.4467025060072760084130906076332e-003L) {
-    pade5(m_M);
-  } else if (m_l1norm < 6.8988028496595374751374122881143e-002L) {
-    pade7(m_M);
-  } else if (m_l1norm < 2.7339737518502231741495857201670e-001L) {
-    pade9(m_M);
-  } else if (m_l1norm < 1.3203382096514474905666448850278e+000L) {
-    pade13(m_M);
-  } else {
-    const long double maxnorm = 3.2579440895405400856599663723517L;
-    frexp(m_l1norm / maxnorm, &m_squarings);
-    if (m_squarings < 0) m_squarings = 0;
-    MatrixType A = m_M / pow(Scalar(2), m_squarings);
-    pade17(A);
-  }
-#elif LDBL_MANT_DIG <= 112  // quadruple precison
-  if (m_l1norm < 1.639394610288918690547467954466970e-005L) {
-    pade3(m_M);
-  } else if (m_l1norm < 4.253237712165275566025884344433009e-003L) {
-    pade5(m_M);
-  } else if (m_l1norm < 5.125804063165764409885122032933142e-002L) {
-    pade7(m_M);
-  } else if (m_l1norm < 2.170000765161155195453205651889853e-001L) {
-    pade9(m_M);
-  } else if (m_l1norm < 1.125358383453143065081397882891878e+000L) {
-    pade13(m_M);
-  } else {
-    const long double maxnorm = 2.884233277829519311757165057717815L;
-    frexp(m_l1norm / maxnorm, &m_squarings);
-    if (m_squarings < 0) m_squarings = 0;
-    MatrixType A = m_M / pow(Scalar(2), m_squarings);
-    pade17(A);
-  }
+
+    if (l1norm < 3.2787892205607026992947488108213e-005L) {
+      matrix_exp_pade3(arg, U, V);
+    } else if (l1norm < 6.4467025060072760084130906076332e-003L) {
+      matrix_exp_pade5(arg, U, V);
+    } else if (l1norm < 6.8988028496595374751374122881143e-002L) {
+      matrix_exp_pade7(arg, U, V);
+    } else if (l1norm < 2.7339737518502231741495857201670e-001L) {
+      matrix_exp_pade9(arg, U, V);
+    } else if (l1norm < 1.3203382096514474905666448850278e+000L) {
+      matrix_exp_pade13(arg, U, V);
+    } else {
+      const long double maxnorm = 3.2579440895405400856599663723517L;
+      frexp(l1norm / maxnorm, &squarings);
+      if (squarings < 0) squarings = 0;
+      MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<Scalar>(squarings));
+      matrix_exp_pade17(A, U, V);
+    }
+
+#elif LDBL_MANT_DIG <= 113  // quadruple precision
+
+    if (l1norm < 1.639394610288918690547467954466970e-005L) {
+      matrix_exp_pade3(arg, U, V);
+    } else if (l1norm < 4.253237712165275566025884344433009e-003L) {
+      matrix_exp_pade5(arg, U, V);
+    } else if (l1norm < 5.125804063165764409885122032933142e-002L) {
+      matrix_exp_pade7(arg, U, V);
+    } else if (l1norm < 2.170000765161155195453205651889853e-001L) {
+      matrix_exp_pade9(arg, U, V);
+    } else if (l1norm < 1.125358383453143065081397882891878e+000L) {
+      matrix_exp_pade13(arg, U, V);
+    } else {
+      const long double maxnorm = 2.884233277829519311757165057717815L;
+      frexp(l1norm / maxnorm, &squarings);
+      if (squarings < 0) squarings = 0;
+      MatrixType A = arg.unaryExpr(MatrixExponentialScalingOp<Scalar>(squarings));
+      matrix_exp_pade17(A, U, V);
+    }
+
 #else
-  // this case should be handled in compute()
-  eigen_assert(false && "Bug in MatrixExponential"); 
+
+    // this case should be handled in compute()
+    eigen_assert(false && "Bug in MatrixExponential");
+
+#endif
 #endif  // LDBL_MANT_DIG
+  }
+};
+
+template <typename T>
+struct is_exp_known_type : false_type {};
+template <>
+struct is_exp_known_type<float> : true_type {};
+template <>
+struct is_exp_known_type<double> : true_type {};
+#if LDBL_MANT_DIG <= 113
+template <>
+struct is_exp_known_type<long double> : true_type {};
+#endif
+
+template <typename ArgType, typename ResultType>
+void matrix_exp_compute(const ArgType& arg, ResultType& result, true_type)  // natively supported scalar type
+{
+  typedef typename ArgType::PlainObject MatrixType;
+  MatrixType U, V;
+  int squarings;
+  matrix_exp_computeUV<MatrixType>::run(arg, U, V, squarings);  // Pade approximant is (U+V) / (-U+V)
+  MatrixType numer = U + V;
+  MatrixType denom = -U + V;
+  result = denom.partialPivLu().solve(numer);
+  for (int i = 0; i < squarings; i++) result *= result;  // undo scaling by repeated squaring
 }
 
-/** \ingroup MatrixFunctions_Module
-  *
-  * \brief Proxy for the matrix exponential of some matrix (expression).
-  *
-  * \tparam Derived  Type of the argument to the matrix exponential.
-  *
-  * This class holds the argument to the matrix exponential until it
-  * is assigned or evaluated for some other reason (so the argument
-  * should not be changed in the meantime). It is the return type of
-  * MatrixBase::exp() and most of the time this is the only way it is
-  * used.
-  */
-template<typename Derived> struct MatrixExponentialReturnValue
-: public ReturnByValue<MatrixExponentialReturnValue<Derived> >
+/* Computes the matrix exponential
+ *
+ * \param arg    argument of matrix exponential (should be plain object)
+ * \param result variable in which result will be stored
+ */
+template <typename ArgType, typename ResultType>
+void matrix_exp_compute(const ArgType& arg, ResultType& result, false_type)  // default
 {
-    typedef typename Derived::Index Index;
-  public:
-    /** \brief Constructor.
-      *
-      * \param[in] src %Matrix (expression) forming the argument of the
-      * matrix exponential.
-      */
-    MatrixExponentialReturnValue(const Derived& src) : m_src(src) { }
-
-    /** \brief Compute the matrix exponential.
-      *
-      * \param[out] result the matrix exponential of \p src in the
-      * constructor.
-      */
-    template <typename ResultType>
-    inline void evalTo(ResultType& result) const
-    {
-      const typename Derived::PlainObject srcEvaluated = m_src.eval();
-      MatrixExponential<typename Derived::PlainObject> me(srcEvaluated);
-      me.compute(result);
-    }
+  typedef typename ArgType::PlainObject MatrixType;
+  typedef make_complex_t<typename traits<MatrixType>::Scalar> ComplexScalar;
+  result = arg.matrixFunction(internal::stem_function_exp<ComplexScalar>);
+}
 
-    Index rows() const { return m_src.rows(); }
-    Index cols() const { return m_src.cols(); }
+}  // namespace internal
 
-  protected:
-    const Derived& m_src;
-  private:
-    MatrixExponentialReturnValue& operator=(const MatrixExponentialReturnValue&);
+/** \ingroup MatrixFunctions_Module
+ *
+ * \brief Proxy for the matrix exponential of some matrix (expression).
+ *
+ * \tparam Derived  Type of the argument to the matrix exponential.
+ *
+ * This class holds the argument to the matrix exponential until it is assigned or evaluated for
+ * some other reason (so the argument should not be changed in the meantime). It is the return type
+ * of MatrixBase::exp() and most of the time this is the only way it is used.
+ */
+template <typename Derived>
+struct MatrixExponentialReturnValue : public ReturnByValue<MatrixExponentialReturnValue<Derived> > {
+ public:
+  /** \brief Constructor.
+   *
+   * \param src %Matrix (expression) forming the argument of the matrix exponential.
+   */
+  MatrixExponentialReturnValue(const Derived& src) : m_src(src) {}
+
+  /** \brief Compute the matrix exponential.
+   *
+   * \param result the matrix exponential of \p src in the constructor.
+   */
+  template <typename ResultType>
+  inline void evalTo(ResultType& result) const {
+    const typename internal::nested_eval<Derived, 10>::type tmp(m_src);
+    internal::matrix_exp_compute(tmp, result, internal::is_exp_known_type<typename Derived::RealScalar>());
+  }
+
+  Index rows() const { return m_src.rows(); }
+  Index cols() const { return m_src.cols(); }
+
+ protected:
+  const typename internal::ref_selector<Derived>::type m_src;
 };
 
 namespace internal {
-template<typename Derived>
-struct traits<MatrixExponentialReturnValue<Derived> >
-{
+template <typename Derived>
+struct traits<MatrixExponentialReturnValue<Derived> > {
   typedef typename Derived::PlainObject ReturnType;
 };
-}
+}  // namespace internal
 
 template <typename Derived>
-const MatrixExponentialReturnValue<Derived> MatrixBase<Derived>::exp() const
-{
+const MatrixExponentialReturnValue<Derived> MatrixBase<Derived>::exp() const {
   eigen_assert(rows() == cols());
   return MatrixExponentialReturnValue<Derived>(derived());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_MATRIX_EXPONENTIAL
+#endif  // EIGEN_MATRIX_EXPONENTIAL
diff --git a/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h b/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
index 7d426640..0c18ad66 100644
--- a/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
+++ b/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixFunction.h
@@ -1,440 +1,267 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2009-2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2009-2011, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
-#ifndef EIGEN_MATRIX_FUNCTION
-#define EIGEN_MATRIX_FUNCTION
+#ifndef EIGEN_MATRIX_FUNCTION_H
+#define EIGEN_MATRIX_FUNCTION_H
 
 #include "StemFunction.h"
-#include "MatrixFunctionAtomic.h"
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-namespace Eigen { 
+namespace Eigen {
 
-/** \ingroup MatrixFunctions_Module
-  * \brief Class for computing matrix functions.
-  * \tparam  MatrixType  type of the argument of the matrix function,
-  *                      expected to be an instantiation of the Matrix class template.
-  * \tparam  AtomicType  type for computing matrix function of atomic blocks.
-  * \tparam  IsComplex   used internally to select correct specialization.
-  *
-  * This class implements the Schur-Parlett algorithm for computing matrix functions. The spectrum of the
-  * matrix is divided in clustered of eigenvalues that lies close together. This class delegates the
-  * computation of the matrix function on every block corresponding to these clusters to an object of type
-  * \p AtomicType and uses these results to compute the matrix function of the whole matrix. The class
-  * \p AtomicType should have a \p compute() member function for computing the matrix function of a block.
-  *
-  * \sa class MatrixFunctionAtomic, class MatrixLogarithmAtomic
-  */
-template <typename MatrixType, 
-	  typename AtomicType,  
-          int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>
-class MatrixFunction
-{  
-  public:
-
-    /** \brief Constructor. 
-      *
-      * \param[in]  A       argument of matrix function, should be a square matrix.
-      * \param[in]  atomic  class for computing matrix function of atomic blocks.
-      *
-      * The class stores references to \p A and \p atomic, so they should not be
-      * changed (or destroyed) before compute() is called.
-      */
-    MatrixFunction(const MatrixType& A, AtomicType& atomic);
-
-    /** \brief Compute the matrix function.
-      *
-      * \param[out] result  the function \p f applied to \p A, as
-      * specified in the constructor.
-      *
-      * See MatrixBase::matrixFunction() for details on how this computation
-      * is implemented.
-      */
-    template <typename ResultType> 
-    void compute(ResultType &result);    
-};
-
-
-/** \internal \ingroup MatrixFunctions_Module 
-  * \brief Partial specialization of MatrixFunction for real matrices
-  */
-template <typename MatrixType, typename AtomicType>
-class MatrixFunction<MatrixType, AtomicType, 0>
-{  
-  private:
-
-    typedef internal::traits<MatrixType> Traits;
-    typedef typename Traits::Scalar Scalar;
-    static const int Rows = Traits::RowsAtCompileTime;
-    static const int Cols = Traits::ColsAtCompileTime;
-    static const int Options = MatrixType::Options;
-    static const int MaxRows = Traits::MaxRowsAtCompileTime;
-    static const int MaxCols = Traits::MaxColsAtCompileTime;
-
-    typedef std::complex<Scalar> ComplexScalar;
-    typedef Matrix<ComplexScalar, Rows, Cols, Options, MaxRows, MaxCols> ComplexMatrix;
-
-  public:
-
-    /** \brief Constructor. 
-      *
-      * \param[in]  A       argument of matrix function, should be a square matrix.
-      * \param[in]  atomic  class for computing matrix function of atomic blocks.
-      */
-    MatrixFunction(const MatrixType& A, AtomicType& atomic) : m_A(A), m_atomic(atomic) { }
-
-    /** \brief Compute the matrix function.
-      *
-      * \param[out] result  the function \p f applied to \p A, as
-      * specified in the constructor.
-      *
-      * This function converts the real matrix \c A to a complex matrix,
-      * uses MatrixFunction<MatrixType,1> and then converts the result back to
-      * a real matrix.
-      */
-    template <typename ResultType>
-    void compute(ResultType& result) 
-    {
-      ComplexMatrix CA = m_A.template cast<ComplexScalar>();
-      ComplexMatrix Cresult;
-      MatrixFunction<ComplexMatrix, AtomicType> mf(CA, m_atomic);
-      mf.compute(Cresult);
-      result = Cresult.real();
-    }
-
-  private:
-    typename internal::nested<MatrixType>::type m_A; /**< \brief Reference to argument of matrix function. */
-    AtomicType& m_atomic; /**< \brief Class for computing matrix function of atomic blocks. */
-
-    MatrixFunction& operator=(const MatrixFunction&);
-};
-
-      
-/** \internal \ingroup MatrixFunctions_Module 
-  * \brief Partial specialization of MatrixFunction for complex matrices
-  */
-template <typename MatrixType, typename AtomicType>
-class MatrixFunction<MatrixType, AtomicType, 1>
-{
-  private:
+namespace internal {
 
-    typedef internal::traits<MatrixType> Traits;
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
-    static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
-    static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
-    static const int Options = MatrixType::Options;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef Matrix<Scalar, Traits::RowsAtCompileTime, 1> VectorType;
-    typedef Matrix<Index, Traits::RowsAtCompileTime, 1> IntVectorType;
-    typedef Matrix<Index, Dynamic, 1> DynamicIntVectorType;
-    typedef std::list<Scalar> Cluster;
-    typedef std::list<Cluster> ListOfClusters;
-    typedef Matrix<Scalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
-
-  public:
-
-    MatrixFunction(const MatrixType& A, AtomicType& atomic);
-    template <typename ResultType> void compute(ResultType& result);
-
-  private:
-
-    void computeSchurDecomposition();
-    void partitionEigenvalues();
-    typename ListOfClusters::iterator findCluster(Scalar key);
-    void computeClusterSize();
-    void computeBlockStart();
-    void constructPermutation();
-    void permuteSchur();
-    void swapEntriesInSchur(Index index);
-    void computeBlockAtomic();
-    Block<MatrixType> block(MatrixType& A, Index i, Index j);
-    void computeOffDiagonal();
-    DynMatrixType solveTriangularSylvester(const DynMatrixType& A, const DynMatrixType& B, const DynMatrixType& C);
-
-    typename internal::nested<MatrixType>::type m_A; /**< \brief Reference to argument of matrix function. */
-    AtomicType& m_atomic; /**< \brief Class for computing matrix function of atomic blocks. */
-    MatrixType m_T; /**< \brief Triangular part of Schur decomposition */
-    MatrixType m_U; /**< \brief Unitary part of Schur decomposition */
-    MatrixType m_fT; /**< \brief %Matrix function applied to #m_T */
-    ListOfClusters m_clusters; /**< \brief Partition of eigenvalues into clusters of ei'vals "close" to each other */
-    DynamicIntVectorType m_eivalToCluster; /**< \brief m_eivalToCluster[i] = j means i-th ei'val is in j-th cluster */
-    DynamicIntVectorType m_clusterSize; /**< \brief Number of eigenvalues in each clusters  */
-    DynamicIntVectorType m_blockStart; /**< \brief Row index at which block corresponding to i-th cluster starts */
-    IntVectorType m_permutation; /**< \brief Permutation which groups ei'vals in the same cluster together */
-
-    /** \brief Maximum distance allowed between eigenvalues to be considered "close".
-      *
-      * This is morally a \c static \c const \c Scalar, but only
-      * integers can be static constant class members in C++. The
-      * separation constant is set to 0.1, a value taken from the
-      * paper by Davies and Higham. */
-    static const RealScalar separation() { return static_cast<RealScalar>(0.1); }
-
-    MatrixFunction& operator=(const MatrixFunction&);
-};
+/** \brief Maximum distance allowed between eigenvalues to be considered "close". */
+static const float matrix_function_separation = 0.1f;
 
-/** \brief Constructor. 
+/** \ingroup MatrixFunctions_Module
+ * \class MatrixFunctionAtomic
+ * \brief Helper class for computing matrix functions of atomic matrices.
  *
- * \param[in]  A       argument of matrix function, should be a square matrix.
- * \param[in]  atomic  class for computing matrix function of atomic blocks.
+ * Here, an atomic matrix is a triangular matrix whose diagonal entries are close to each other.
  */
-template <typename MatrixType, typename AtomicType>
-MatrixFunction<MatrixType,AtomicType,1>::MatrixFunction(const MatrixType& A, AtomicType& atomic)
-  : m_A(A), m_atomic(atomic)
-{
-  /* empty body */
+template <typename MatrixType>
+class MatrixFunctionAtomic {
+ public:
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename stem_function<Scalar>::type StemFunction;
+
+  /** \brief Constructor
+   * \param[in]  f  matrix function to compute.
+   */
+  MatrixFunctionAtomic(StemFunction f) : m_f(f) {}
+
+  /** \brief Compute matrix function of atomic matrix
+   * \param[in]  A  argument of matrix function, should be upper triangular and atomic
+   * \returns  f(A), the matrix function evaluated at the given matrix
+   */
+  MatrixType compute(const MatrixType& A);
+
+ private:
+  StemFunction* m_f;
+};
+
+template <typename MatrixType>
+typename NumTraits<typename MatrixType::Scalar>::Real matrix_function_compute_mu(const MatrixType& A) {
+  typedef typename plain_col_type<MatrixType>::type VectorType;
+  Index rows = A.rows();
+  const MatrixType N = MatrixType::Identity(rows, rows) - A;
+  VectorType e = VectorType::Ones(rows);
+  N.template triangularView<Upper>().solveInPlace(e);
+  return e.cwiseAbs().maxCoeff();
 }
 
-/** \brief Compute the matrix function.
-  *
-  * \param[out] result  the function \p f applied to \p A, as
-  * specified in the constructor.
-  */
-template <typename MatrixType, typename AtomicType>
-template <typename ResultType>
-void MatrixFunction<MatrixType,AtomicType,1>::compute(ResultType& result) 
-{
-  computeSchurDecomposition();
-  partitionEigenvalues();
-  computeClusterSize();
-  computeBlockStart();
-  constructPermutation();
-  permuteSchur();
-  computeBlockAtomic();
-  computeOffDiagonal();
-  result = m_U * (m_fT.template triangularView<Upper>() * m_U.adjoint());
+template <typename MatrixType>
+MatrixType MatrixFunctionAtomic<MatrixType>::compute(const MatrixType& A) {
+  // TODO: Use that A is upper triangular
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  Index rows = A.rows();
+  Scalar avgEival = A.trace() / Scalar(RealScalar(rows));
+  MatrixType Ashifted = A - avgEival * MatrixType::Identity(rows, rows);
+  RealScalar mu = matrix_function_compute_mu(Ashifted);
+  MatrixType F = m_f(avgEival, 0) * MatrixType::Identity(rows, rows);
+  MatrixType P = Ashifted;
+  MatrixType Fincr;
+  for (Index s = 1; double(s) < 1.1 * double(rows) + 10.0; s++) {  // upper limit is fairly arbitrary
+    Fincr = m_f(avgEival, static_cast<int>(s)) * P;
+    F += Fincr;
+    P = Scalar(RealScalar(1) / RealScalar(s + 1)) * P * Ashifted;
+
+    // test whether Taylor series converged
+    const RealScalar F_norm = F.cwiseAbs().rowwise().sum().maxCoeff();
+    const RealScalar Fincr_norm = Fincr.cwiseAbs().rowwise().sum().maxCoeff();
+    if (Fincr_norm < NumTraits<Scalar>::epsilon() * F_norm) {
+      RealScalar delta = 0;
+      RealScalar rfactorial = 1;
+      for (Index r = 0; r < rows; r++) {
+        RealScalar mx = 0;
+        for (Index i = 0; i < rows; i++)
+          mx = (std::max)(mx, std::abs(m_f(Ashifted(i, i) + avgEival, static_cast<int>(s + r))));
+        if (r != 0) rfactorial *= RealScalar(r);
+        delta = (std::max)(delta, mx / rfactorial);
+      }
+      const RealScalar P_norm = P.cwiseAbs().rowwise().sum().maxCoeff();
+      if (mu * delta * P_norm < NumTraits<Scalar>::epsilon() * F_norm)  // series converged
+        break;
+    }
+  }
+  return F;
 }
 
-/** \brief Store the Schur decomposition of #m_A in #m_T and #m_U */
-template <typename MatrixType, typename AtomicType>
-void MatrixFunction<MatrixType,AtomicType,1>::computeSchurDecomposition()
-{
-  const ComplexSchur<MatrixType> schurOfA(m_A);  
-  m_T = schurOfA.matrixT();
-  m_U = schurOfA.matrixU();
+/** \brief Find cluster in \p clusters containing some value
+ * \param[in] key Value to find
+ * \returns Iterator to cluster containing \p key, or \c clusters.end() if no cluster in \p m_clusters
+ * contains \p key.
+ */
+template <typename Index, typename ListOfClusters>
+typename ListOfClusters::iterator matrix_function_find_cluster(Index key, ListOfClusters& clusters) {
+  typename std::list<Index>::iterator j;
+  for (typename ListOfClusters::iterator i = clusters.begin(); i != clusters.end(); ++i) {
+    j = std::find(i->begin(), i->end(), key);
+    if (j != i->end()) return i;
+  }
+  return clusters.end();
 }
 
 /** \brief Partition eigenvalues in clusters of ei'vals close to each other
-  * 
-  * This function computes #m_clusters. This is a partition of the
-  * eigenvalues of #m_T in clusters, such that
-  * # Any eigenvalue in a certain cluster is at most separation() away
-  *   from another eigenvalue in the same cluster.
-  * # The distance between two eigenvalues in different clusters is
-  *   more than separation().
-  * The implementation follows Algorithm 4.1 in the paper of Davies
-  * and Higham. 
-  */
-template <typename MatrixType, typename AtomicType>
-void MatrixFunction<MatrixType,AtomicType,1>::partitionEigenvalues()
-{
-  using std::abs;
-  const Index rows = m_T.rows();
-  VectorType diag = m_T.diagonal(); // contains eigenvalues of A
-
-  for (Index i=0; i<rows; ++i) {
-    // Find set containing diag(i), adding a new set if necessary
-    typename ListOfClusters::iterator qi = findCluster(diag(i));
-    if (qi == m_clusters.end()) {
+ *
+ * \param[in]  eivals    Eigenvalues
+ * \param[out] clusters  Resulting partition of eigenvalues
+ *
+ * The partition satisfies the following two properties:
+ * # Any eigenvalue in a certain cluster is at most matrix_function_separation() away from another eigenvalue
+ *   in the same cluster.
+ * # The distance between two eigenvalues in different clusters is more than matrix_function_separation().
+ * The implementation follows Algorithm 4.1 in the paper of Davies and Higham.
+ */
+template <typename EivalsType, typename Cluster>
+void matrix_function_partition_eigenvalues(const EivalsType& eivals, std::list<Cluster>& clusters) {
+  typedef typename EivalsType::RealScalar RealScalar;
+  for (Index i = 0; i < eivals.rows(); ++i) {
+    // Find cluster containing i-th ei'val, adding a new cluster if necessary
+    typename std::list<Cluster>::iterator qi = matrix_function_find_cluster(i, clusters);
+    if (qi == clusters.end()) {
       Cluster l;
-      l.push_back(diag(i));
-      m_clusters.push_back(l);
-      qi = m_clusters.end();
+      l.push_back(i);
+      clusters.push_back(l);
+      qi = clusters.end();
       --qi;
     }
 
     // Look for other element to add to the set
-    for (Index j=i+1; j<rows; ++j) {
-      if (abs(diag(j) - diag(i)) <= separation() && std::find(qi->begin(), qi->end(), diag(j)) == qi->end()) {
-        typename ListOfClusters::iterator qj = findCluster(diag(j));
-        if (qj == m_clusters.end()) {
-          qi->push_back(diag(j));
+    for (Index j = i + 1; j < eivals.rows(); ++j) {
+      if (abs(eivals(j) - eivals(i)) <= RealScalar(matrix_function_separation) &&
+          std::find(qi->begin(), qi->end(), j) == qi->end()) {
+        typename std::list<Cluster>::iterator qj = matrix_function_find_cluster(j, clusters);
+        if (qj == clusters.end()) {
+          qi->push_back(j);
         } else {
           qi->insert(qi->end(), qj->begin(), qj->end());
-          m_clusters.erase(qj);
+          clusters.erase(qj);
         }
       }
     }
   }
 }
 
-/** \brief Find cluster in #m_clusters containing some value 
-  * \param[in] key Value to find
-  * \returns Iterator to cluster containing \c key, or
-  * \c m_clusters.end() if no cluster in m_clusters contains \c key.
-  */
-template <typename MatrixType, typename AtomicType>
-typename MatrixFunction<MatrixType,AtomicType,1>::ListOfClusters::iterator MatrixFunction<MatrixType,AtomicType,1>::findCluster(Scalar key)
-{
-  typename Cluster::iterator j;
-  for (typename ListOfClusters::iterator i = m_clusters.begin(); i != m_clusters.end(); ++i) {
-    j = std::find(i->begin(), i->end(), key);
-    if (j != i->end())
-      return i;
+/** \brief Compute size of each cluster given a partitioning */
+template <typename ListOfClusters, typename Index>
+void matrix_function_compute_cluster_size(const ListOfClusters& clusters, Matrix<Index, Dynamic, 1>& clusterSize) {
+  const Index numClusters = static_cast<Index>(clusters.size());
+  clusterSize.setZero(numClusters);
+  Index clusterIndex = 0;
+  for (typename ListOfClusters::const_iterator cluster = clusters.begin(); cluster != clusters.end(); ++cluster) {
+    clusterSize[clusterIndex] = cluster->size();
+    ++clusterIndex;
   }
-  return m_clusters.end();
 }
 
-/** \brief Compute #m_clusterSize and #m_eivalToCluster using #m_clusters */
-template <typename MatrixType, typename AtomicType>
-void MatrixFunction<MatrixType,AtomicType,1>::computeClusterSize()
-{
-  const Index rows = m_T.rows();
-  VectorType diag = m_T.diagonal(); 
-  const Index numClusters = static_cast<Index>(m_clusters.size());
+/** \brief Compute start of each block using clusterSize */
+template <typename VectorType>
+void matrix_function_compute_block_start(const VectorType& clusterSize, VectorType& blockStart) {
+  blockStart.resize(clusterSize.rows());
+  blockStart(0) = 0;
+  for (Index i = 1; i < clusterSize.rows(); i++) {
+    blockStart(i) = blockStart(i - 1) + clusterSize(i - 1);
+  }
+}
 
-  m_clusterSize.setZero(numClusters);
-  m_eivalToCluster.resize(rows);
+/** \brief Compute mapping of eigenvalue indices to cluster indices */
+template <typename EivalsType, typename ListOfClusters, typename VectorType>
+void matrix_function_compute_map(const EivalsType& eivals, const ListOfClusters& clusters, VectorType& eivalToCluster) {
+  eivalToCluster.resize(eivals.rows());
   Index clusterIndex = 0;
-  for (typename ListOfClusters::const_iterator cluster = m_clusters.begin(); cluster != m_clusters.end(); ++cluster) {
-    for (Index i = 0; i < diag.rows(); ++i) {
-      if (std::find(cluster->begin(), cluster->end(), diag(i)) != cluster->end()) {
-        ++m_clusterSize[clusterIndex];
-        m_eivalToCluster[i] = clusterIndex;
+  for (typename ListOfClusters::const_iterator cluster = clusters.begin(); cluster != clusters.end(); ++cluster) {
+    for (Index i = 0; i < eivals.rows(); ++i) {
+      if (std::find(cluster->begin(), cluster->end(), i) != cluster->end()) {
+        eivalToCluster[i] = clusterIndex;
       }
     }
     ++clusterIndex;
   }
 }
 
-/** \brief Compute #m_blockStart using #m_clusterSize */
-template <typename MatrixType, typename AtomicType>
-void MatrixFunction<MatrixType,AtomicType,1>::computeBlockStart()
-{
-  m_blockStart.resize(m_clusterSize.rows());
-  m_blockStart(0) = 0;
-  for (Index i = 1; i < m_clusterSize.rows(); i++) {
-    m_blockStart(i) = m_blockStart(i-1) + m_clusterSize(i-1);
+/** \brief Compute permutation which groups ei'vals in same cluster together */
+template <typename DynVectorType, typename VectorType>
+void matrix_function_compute_permutation(const DynVectorType& blockStart, const DynVectorType& eivalToCluster,
+                                         VectorType& permutation) {
+  DynVectorType indexNextEntry = blockStart;
+  permutation.resize(eivalToCluster.rows());
+  for (Index i = 0; i < eivalToCluster.rows(); i++) {
+    Index cluster = eivalToCluster[i];
+    permutation[i] = indexNextEntry[cluster];
+    ++indexNextEntry[cluster];
   }
 }
 
-/** \brief Compute #m_permutation using #m_eivalToCluster and #m_blockStart */
-template <typename MatrixType, typename AtomicType>
-void MatrixFunction<MatrixType,AtomicType,1>::constructPermutation()
-{
-  DynamicIntVectorType indexNextEntry = m_blockStart;
-  m_permutation.resize(m_T.rows());
-  for (Index i = 0; i < m_T.rows(); i++) {
-    Index cluster = m_eivalToCluster[i];
-    m_permutation[i] = indexNextEntry[cluster];
-    ++indexNextEntry[cluster];
-  }
-}  
-
-/** \brief Permute Schur decomposition in #m_U and #m_T according to #m_permutation */
-template <typename MatrixType, typename AtomicType>
-void MatrixFunction<MatrixType,AtomicType,1>::permuteSchur()
-{
-  IntVectorType p = m_permutation;
-  for (Index i = 0; i < p.rows() - 1; i++) {
+/** \brief Permute Schur decomposition in U and T according to permutation */
+template <typename VectorType, typename MatrixType>
+void matrix_function_permute_schur(VectorType& permutation, MatrixType& U, MatrixType& T) {
+  for (Index i = 0; i < permutation.rows() - 1; i++) {
     Index j;
-    for (j = i; j < p.rows(); j++) {
-      if (p(j) == i) break;
+    for (j = i; j < permutation.rows(); j++) {
+      if (permutation(j) == i) break;
     }
-    eigen_assert(p(j) == i);
-    for (Index k = j-1; k >= i; k--) {
-      swapEntriesInSchur(k);
-      std::swap(p.coeffRef(k), p.coeffRef(k+1));
+    eigen_assert(permutation(j) == i);
+    for (Index k = j - 1; k >= i; k--) {
+      JacobiRotation<typename MatrixType::Scalar> rotation;
+      rotation.makeGivens(T(k, k + 1), T(k + 1, k + 1) - T(k, k));
+      T.applyOnTheLeft(k, k + 1, rotation.adjoint());
+      T.applyOnTheRight(k, k + 1, rotation);
+      U.applyOnTheRight(k, k + 1, rotation);
+      std::swap(permutation.coeffRef(k), permutation.coeffRef(k + 1));
     }
   }
 }
 
-/** \brief Swap rows \a index and \a index+1 in Schur decomposition in #m_U and #m_T */
-template <typename MatrixType, typename AtomicType>
-void MatrixFunction<MatrixType,AtomicType,1>::swapEntriesInSchur(Index index)
-{
-  JacobiRotation<Scalar> rotation;
-  rotation.makeGivens(m_T(index, index+1), m_T(index+1, index+1) - m_T(index, index));
-  m_T.applyOnTheLeft(index, index+1, rotation.adjoint());
-  m_T.applyOnTheRight(index, index+1, rotation);
-  m_U.applyOnTheRight(index, index+1, rotation);
-}  
-
-/** \brief Compute block diagonal part of #m_fT.
-  *
-  * This routine computes the matrix function applied to the block diagonal part of #m_T, with the blocking
-  * given by #m_blockStart. The matrix function of each diagonal block is computed by #m_atomic. The
-  * off-diagonal parts of #m_fT are set to zero.
-  */
-template <typename MatrixType, typename AtomicType>
-void MatrixFunction<MatrixType,AtomicType,1>::computeBlockAtomic()
-{ 
-  m_fT.resize(m_T.rows(), m_T.cols());
-  m_fT.setZero();
-  for (Index i = 0; i < m_clusterSize.rows(); ++i) {
-    block(m_fT, i, i) = m_atomic.compute(block(m_T, i, i));
-  }
-}
-
-/** \brief Return block of matrix according to blocking given by #m_blockStart */
-template <typename MatrixType, typename AtomicType>
-Block<MatrixType> MatrixFunction<MatrixType,AtomicType,1>::block(MatrixType& A, Index i, Index j)
-{
-  return A.block(m_blockStart(i), m_blockStart(j), m_clusterSize(i), m_clusterSize(j));
-}
-
-/** \brief Compute part of #m_fT above block diagonal.
-  *
-  * This routine assumes that the block diagonal part of #m_fT (which
-  * equals the matrix function applied to #m_T) has already been computed and computes
-  * the part above the block diagonal. The part below the diagonal is
-  * zero, because #m_T is upper triangular.
-  */
-template <typename MatrixType, typename AtomicType>
-void MatrixFunction<MatrixType,AtomicType,1>::computeOffDiagonal()
-{ 
-  for (Index diagIndex = 1; diagIndex < m_clusterSize.rows(); diagIndex++) {
-    for (Index blockIndex = 0; blockIndex < m_clusterSize.rows() - diagIndex; blockIndex++) {
-      // compute (blockIndex, blockIndex+diagIndex) block
-      DynMatrixType A = block(m_T, blockIndex, blockIndex);
-      DynMatrixType B = -block(m_T, blockIndex+diagIndex, blockIndex+diagIndex);
-      DynMatrixType C = block(m_fT, blockIndex, blockIndex) * block(m_T, blockIndex, blockIndex+diagIndex);
-      C -= block(m_T, blockIndex, blockIndex+diagIndex) * block(m_fT, blockIndex+diagIndex, blockIndex+diagIndex);
-      for (Index k = blockIndex + 1; k < blockIndex + diagIndex; k++) {
-	C += block(m_fT, blockIndex, k) * block(m_T, k, blockIndex+diagIndex);
-	C -= block(m_T, blockIndex, k) * block(m_fT, k, blockIndex+diagIndex);
-      }
-      block(m_fT, blockIndex, blockIndex+diagIndex) = solveTriangularSylvester(A, B, C);
-    }
+/** \brief Compute block diagonal part of matrix function.
+ *
+ * This routine computes the matrix function applied to the block diagonal part of \p T (which should be
+ * upper triangular), with the blocking given by \p blockStart and \p clusterSize. The matrix function of
+ * each diagonal block is computed by \p atomic. The off-diagonal parts of \p fT are set to zero.
+ */
+template <typename MatrixType, typename AtomicType, typename VectorType>
+void matrix_function_compute_block_atomic(const MatrixType& T, AtomicType& atomic, const VectorType& blockStart,
+                                          const VectorType& clusterSize, MatrixType& fT) {
+  fT.setZero(T.rows(), T.cols());
+  for (Index i = 0; i < clusterSize.rows(); ++i) {
+    fT.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i)) =
+        atomic.compute(T.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i)));
   }
 }
 
-/** \brief Solve a triangular Sylvester equation AX + XB = C 
-  *
-  * \param[in]  A  the matrix A; should be square and upper triangular
-  * \param[in]  B  the matrix B; should be square and upper triangular
-  * \param[in]  C  the matrix C; should have correct size.
-  *
-  * \returns the solution X.
-  *
-  * If A is m-by-m and B is n-by-n, then both C and X are m-by-n. 
-  * The (i,j)-th component of the Sylvester equation is
-  * \f[ 
-  *     \sum_{k=i}^m A_{ik} X_{kj} + \sum_{k=1}^j X_{ik} B_{kj} = C_{ij}. 
-  * \f]
-  * This can be re-arranged to yield:
-  * \f[ 
-  *     X_{ij} = \frac{1}{A_{ii} + B_{jj}} \Bigl( C_{ij}
-  *     - \sum_{k=i+1}^m A_{ik} X_{kj} - \sum_{k=1}^{j-1} X_{ik} B_{kj} \Bigr).
-  * \f]
-  * It is assumed that A and B are such that the numerator is never
-  * zero (otherwise the Sylvester equation does not have a unique
-  * solution). In that case, these equations can be evaluated in the
-  * order \f$ i=m,\ldots,1 \f$ and \f$ j=1,\ldots,n \f$.
-  */
-template <typename MatrixType, typename AtomicType>
-typename MatrixFunction<MatrixType,AtomicType,1>::DynMatrixType MatrixFunction<MatrixType,AtomicType,1>::solveTriangularSylvester(
-  const DynMatrixType& A, 
-  const DynMatrixType& B, 
-  const DynMatrixType& C)
-{
+/** \brief Solve a triangular Sylvester equation AX + XB = C
+ *
+ * \param[in]  A  the matrix A; should be square and upper triangular
+ * \param[in]  B  the matrix B; should be square and upper triangular
+ * \param[in]  C  the matrix C; should have correct size.
+ *
+ * \returns the solution X.
+ *
+ * If A is m-by-m and B is n-by-n, then both C and X are m-by-n.  The (i,j)-th component of the Sylvester
+ * equation is
+ * \f[
+ *     \sum_{k=i}^m A_{ik} X_{kj} + \sum_{k=1}^j X_{ik} B_{kj} = C_{ij}.
+ * \f]
+ * This can be re-arranged to yield:
+ * \f[
+ *     X_{ij} = \frac{1}{A_{ii} + B_{jj}} \Bigl( C_{ij}
+ *     - \sum_{k=i+1}^m A_{ik} X_{kj} - \sum_{k=1}^{j-1} X_{ik} B_{kj} \Bigr).
+ * \f]
+ * It is assumed that A and B are such that the numerator is never zero (otherwise the Sylvester equation
+ * does not have a unique solution). In that case, these equations can be evaluated in the order
+ * \f$ i=m,\ldots,1 \f$ and \f$ j=1,\ldots,n \f$.
+ */
+template <typename MatrixType>
+MatrixType matrix_function_solve_triangular_sylvester(const MatrixType& A, const MatrixType& B, const MatrixType& C) {
   eigen_assert(A.rows() == A.cols());
   eigen_assert(A.isUpperTriangular());
   eigen_assert(B.rows() == B.cols());
@@ -442,150 +269,275 @@ typename MatrixFunction<MatrixType,AtomicType,1>::DynMatrixType MatrixFunction<M
   eigen_assert(C.rows() == A.rows());
   eigen_assert(C.cols() == B.rows());
 
+  typedef typename MatrixType::Scalar Scalar;
+
   Index m = A.rows();
   Index n = B.rows();
-  DynMatrixType X(m, n);
+  MatrixType X(m, n);
 
   for (Index i = m - 1; i >= 0; --i) {
     for (Index j = 0; j < n; ++j) {
-
       // Compute AX = \sum_{k=i+1}^m A_{ik} X_{kj}
       Scalar AX;
       if (i == m - 1) {
-	AX = 0; 
+        AX = 0;
       } else {
-	Matrix<Scalar,1,1> AXmatrix = A.row(i).tail(m-1-i) * X.col(j).tail(m-1-i);
-	AX = AXmatrix(0,0);
+        Matrix<Scalar, 1, 1> AXmatrix = A.row(i).tail(m - 1 - i) * X.col(j).tail(m - 1 - i);
+        AX = AXmatrix(0, 0);
       }
 
       // Compute XB = \sum_{k=1}^{j-1} X_{ik} B_{kj}
       Scalar XB;
       if (j == 0) {
-	XB = 0; 
+        XB = 0;
       } else {
-	Matrix<Scalar,1,1> XBmatrix = X.row(i).head(j) * B.col(j).head(j);
-	XB = XBmatrix(0,0);
+        Matrix<Scalar, 1, 1> XBmatrix = X.row(i).head(j) * B.col(j).head(j);
+        XB = XBmatrix(0, 0);
       }
 
-      X(i,j) = (C(i,j) - AX - XB) / (A(i,i) + B(j,j));
+      X(i, j) = (C(i, j) - AX - XB) / (A(i, i) + B(j, j));
     }
   }
   return X;
 }
 
-/** \ingroup MatrixFunctions_Module
-  *
-  * \brief Proxy for the matrix function of some matrix (expression).
-  *
-  * \tparam Derived  Type of the argument to the matrix function.
-  *
-  * This class holds the argument to the matrix function until it is
-  * assigned or evaluated for some other reason (so the argument
-  * should not be changed in the meantime). It is the return type of
-  * matrixBase::matrixFunction() and related functions and most of the
-  * time this is the only way it is used.
-  */
-template<typename Derived> class MatrixFunctionReturnValue
-: public ReturnByValue<MatrixFunctionReturnValue<Derived> >
-{
-  public:
-
-    typedef typename Derived::Scalar Scalar;
-    typedef typename Derived::Index Index;
-    typedef typename internal::stem_function<Scalar>::type StemFunction;
-
-   /** \brief Constructor.
-      *
-      * \param[in] A  %Matrix (expression) forming the argument of the
-      * matrix function.
-      * \param[in] f  Stem function for matrix function under consideration.
-      */
-    MatrixFunctionReturnValue(const Derived& A, StemFunction f) : m_A(A), m_f(f) { }
-
-    /** \brief Compute the matrix function.
-      *
-      * \param[out] result \p f applied to \p A, where \p f and \p A
-      * are as in the constructor.
-      */
-    template <typename ResultType>
-    inline void evalTo(ResultType& result) const
-    {
-      typedef typename Derived::PlainObject PlainObject;
-      typedef internal::traits<PlainObject> Traits;
-      static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
-      static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
-      static const int Options = PlainObject::Options;
-      typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-      typedef Matrix<ComplexScalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
-      typedef MatrixFunctionAtomic<DynMatrixType> AtomicType;
-      AtomicType atomic(m_f);
-
-      const PlainObject Aevaluated = m_A.eval();
-      MatrixFunction<PlainObject, AtomicType> mf(Aevaluated, atomic);
-      mf.compute(result);
+/** \brief Compute part of matrix function above block diagonal.
+ *
+ * This routine completes the computation of \p fT, denoting a matrix function applied to the triangular
+ * matrix \p T. It assumes that the block diagonal part of \p fT has already been computed. The part below
+ * the diagonal is zero, because \p T is upper triangular.
+ */
+template <typename MatrixType, typename VectorType>
+void matrix_function_compute_above_diagonal(const MatrixType& T, const VectorType& blockStart,
+                                            const VectorType& clusterSize, MatrixType& fT) {
+  typedef internal::traits<MatrixType> Traits;
+  typedef typename MatrixType::Scalar Scalar;
+  static const int Options = MatrixType::Options;
+  typedef Matrix<Scalar, Dynamic, Dynamic, Options, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime> DynMatrixType;
+
+  for (Index k = 1; k < clusterSize.rows(); k++) {
+    for (Index i = 0; i < clusterSize.rows() - k; i++) {
+      // compute (i, i+k) block
+      DynMatrixType A = T.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i));
+      DynMatrixType B = -T.block(blockStart(i + k), blockStart(i + k), clusterSize(i + k), clusterSize(i + k));
+      DynMatrixType C = fT.block(blockStart(i), blockStart(i), clusterSize(i), clusterSize(i)) *
+                        T.block(blockStart(i), blockStart(i + k), clusterSize(i), clusterSize(i + k));
+      C -= T.block(blockStart(i), blockStart(i + k), clusterSize(i), clusterSize(i + k)) *
+           fT.block(blockStart(i + k), blockStart(i + k), clusterSize(i + k), clusterSize(i + k));
+      for (Index m = i + 1; m < i + k; m++) {
+        C += fT.block(blockStart(i), blockStart(m), clusterSize(i), clusterSize(m)) *
+             T.block(blockStart(m), blockStart(i + k), clusterSize(m), clusterSize(i + k));
+        C -= T.block(blockStart(i), blockStart(m), clusterSize(i), clusterSize(m)) *
+             fT.block(blockStart(m), blockStart(i + k), clusterSize(m), clusterSize(i + k));
+      }
+      fT.block(blockStart(i), blockStart(i + k), clusterSize(i), clusterSize(i + k)) =
+          matrix_function_solve_triangular_sylvester(A, B, C);
     }
+  }
+}
+
+/** \ingroup MatrixFunctions_Module
+ * \brief Class for computing matrix functions.
+ * \tparam  MatrixType  type of the argument of the matrix function,
+ *                      expected to be an instantiation of the Matrix class template.
+ * \tparam  AtomicType  type for computing matrix function of atomic blocks.
+ * \tparam  IsComplex   used internally to select correct specialization.
+ *
+ * This class implements the Schur-Parlett algorithm for computing matrix functions. The spectrum of the
+ * matrix is divided in clustered of eigenvalues that lies close together. This class delegates the
+ * computation of the matrix function on every block corresponding to these clusters to an object of type
+ * \p AtomicType and uses these results to compute the matrix function of the whole matrix. The class
+ * \p AtomicType should have a \p compute() member function for computing the matrix function of a block.
+ *
+ * \sa class MatrixFunctionAtomic, class MatrixLogarithmAtomic
+ */
+template <typename MatrixType, int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>
+struct matrix_function_compute {
+  /** \brief Compute the matrix function.
+   *
+   * \param[in]  A       argument of matrix function, should be a square matrix.
+   * \param[in]  atomic  class for computing matrix function of atomic blocks.
+   * \param[out] result  the function \p f applied to \p A, as
+   * specified in the constructor.
+   *
+   * See MatrixBase::matrixFunction() for details on how this computation
+   * is implemented.
+   */
+  template <typename AtomicType, typename ResultType>
+  static void run(const MatrixType& A, AtomicType& atomic, ResultType& result);
+};
+
+/** \internal \ingroup MatrixFunctions_Module
+ * \brief Partial specialization of MatrixFunction for real matrices
+ *
+ * This converts the real matrix to a complex matrix, compute the matrix function of that matrix, and then
+ * converts the result back to a real matrix.
+ */
+template <typename MatrixType>
+struct matrix_function_compute<MatrixType, 0> {
+  template <typename MatA, typename AtomicType, typename ResultType>
+  static void run(const MatA& A, AtomicType& atomic, ResultType& result) {
+    typedef internal::traits<MatrixType> Traits;
+    typedef typename Traits::Scalar Scalar;
+    static const int Rows = Traits::RowsAtCompileTime, Cols = Traits::ColsAtCompileTime;
+    static const int MaxRows = Traits::MaxRowsAtCompileTime, MaxCols = Traits::MaxColsAtCompileTime;
+
+    typedef internal::make_complex_t<Scalar> ComplexScalar;
+    typedef Matrix<ComplexScalar, Rows, Cols, 0, MaxRows, MaxCols> ComplexMatrix;
+
+    ComplexMatrix CA = A.template cast<ComplexScalar>();
+    ComplexMatrix Cresult;
+    matrix_function_compute<ComplexMatrix>::run(CA, atomic, Cresult);
+    result = Cresult.real();
+  }
+};
+
+/** \internal \ingroup MatrixFunctions_Module
+ * \brief Partial specialization of MatrixFunction for complex matrices
+ */
+template <typename MatrixType>
+struct matrix_function_compute<MatrixType, 1> {
+  template <typename MatA, typename AtomicType, typename ResultType>
+  static void run(const MatA& A, AtomicType& atomic, ResultType& result) {
+    typedef internal::traits<MatrixType> Traits;
+
+    // compute Schur decomposition of A
+    const ComplexSchur<MatrixType> schurOfA(A);
+    eigen_assert(schurOfA.info() == Success);
+    MatrixType T = schurOfA.matrixT();
+    MatrixType U = schurOfA.matrixU();
+
+    // partition eigenvalues into clusters of ei'vals "close" to each other
+    std::list<std::list<Index> > clusters;
+    matrix_function_partition_eigenvalues(T.diagonal(), clusters);
+
+    // compute size of each cluster
+    Matrix<Index, Dynamic, 1> clusterSize;
+    matrix_function_compute_cluster_size(clusters, clusterSize);
+
+    // blockStart[i] is row index at which block corresponding to i-th cluster starts
+    Matrix<Index, Dynamic, 1> blockStart;
+    matrix_function_compute_block_start(clusterSize, blockStart);
+
+    // compute map so that eivalToCluster[i] = j means that i-th ei'val is in j-th cluster
+    Matrix<Index, Dynamic, 1> eivalToCluster;
+    matrix_function_compute_map(T.diagonal(), clusters, eivalToCluster);
+
+    // compute permutation which groups ei'vals in same cluster together
+    Matrix<Index, Traits::RowsAtCompileTime, 1> permutation;
+    matrix_function_compute_permutation(blockStart, eivalToCluster, permutation);
+
+    // permute Schur decomposition
+    matrix_function_permute_schur(permutation, U, T);
+
+    // compute result
+    MatrixType fT;  // matrix function applied to T
+    matrix_function_compute_block_atomic(T, atomic, blockStart, clusterSize, fT);
+    matrix_function_compute_above_diagonal(T, blockStart, clusterSize, fT);
+    result = U * (fT.template triangularView<Upper>() * U.adjoint());
+  }
+};
 
-    Index rows() const { return m_A.rows(); }
-    Index cols() const { return m_A.cols(); }
+}  // end of namespace internal
 
-  private:
-    typename internal::nested<Derived>::type m_A;
-    StemFunction *m_f;
+/** \ingroup MatrixFunctions_Module
+ *
+ * \brief Proxy for the matrix function of some matrix (expression).
+ *
+ * \tparam Derived  Type of the argument to the matrix function.
+ *
+ * This class holds the argument to the matrix function until it is assigned or evaluated for some other
+ * reason (so the argument should not be changed in the meantime). It is the return type of
+ * matrixBase::matrixFunction() and related functions and most of the time this is the only way it is used.
+ */
+template <typename Derived>
+class MatrixFunctionReturnValue : public ReturnByValue<MatrixFunctionReturnValue<Derived> > {
+ public:
+  typedef typename Derived::Scalar Scalar;
+  typedef typename internal::stem_function<Scalar>::type StemFunction;
+
+ protected:
+  typedef typename internal::ref_selector<Derived>::type DerivedNested;
+
+ public:
+  /** \brief Constructor.
+   *
+   * \param[in] A  %Matrix (expression) forming the argument of the matrix function.
+   * \param[in] f  Stem function for matrix function under consideration.
+   */
+  MatrixFunctionReturnValue(const Derived& A, StemFunction f) : m_A(A), m_f(f) {}
+
+  /** \brief Compute the matrix function.
+   *
+   * \param[out] result \p f applied to \p A, where \p f and \p A are as in the constructor.
+   */
+  template <typename ResultType>
+  inline void evalTo(ResultType& result) const {
+    typedef typename internal::nested_eval<Derived, 10>::type NestedEvalType;
+    typedef internal::remove_all_t<NestedEvalType> NestedEvalTypeClean;
+    typedef internal::traits<NestedEvalTypeClean> Traits;
+    typedef internal::make_complex_t<Scalar> ComplexScalar;
+    typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime>
+        DynMatrixType;
+
+    typedef internal::MatrixFunctionAtomic<DynMatrixType> AtomicType;
+    AtomicType atomic(m_f);
+
+    internal::matrix_function_compute<typename NestedEvalTypeClean::PlainObject>::run(m_A, atomic, result);
+  }
+
+  Index rows() const { return m_A.rows(); }
+  Index cols() const { return m_A.cols(); }
 
-    MatrixFunctionReturnValue& operator=(const MatrixFunctionReturnValue&);
+ private:
+  const DerivedNested m_A;
+  StemFunction* m_f;
 };
 
 namespace internal {
-template<typename Derived>
-struct traits<MatrixFunctionReturnValue<Derived> >
-{
+template <typename Derived>
+struct traits<MatrixFunctionReturnValue<Derived> > {
   typedef typename Derived::PlainObject ReturnType;
 };
-}
-
+}  // namespace internal
 
 /********** MatrixBase methods **********/
 
-
 template <typename Derived>
-const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::matrixFunction(typename internal::stem_function<typename internal::traits<Derived>::Scalar>::type f) const
-{
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::matrixFunction(
+    typename internal::stem_function<typename internal::traits<Derived>::Scalar>::type f) const {
   eigen_assert(rows() == cols());
   return MatrixFunctionReturnValue<Derived>(derived(), f);
 }
 
 template <typename Derived>
-const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::sin() const
-{
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::sin() const {
   eigen_assert(rows() == cols());
   typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
-  return MatrixFunctionReturnValue<Derived>(derived(), StdStemFunctions<ComplexScalar>::sin);
+  return MatrixFunctionReturnValue<Derived>(derived(), internal::stem_function_sin<ComplexScalar>);
 }
 
 template <typename Derived>
-const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cos() const
-{
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cos() const {
   eigen_assert(rows() == cols());
   typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
-  return MatrixFunctionReturnValue<Derived>(derived(), StdStemFunctions<ComplexScalar>::cos);
+  return MatrixFunctionReturnValue<Derived>(derived(), internal::stem_function_cos<ComplexScalar>);
 }
 
 template <typename Derived>
-const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::sinh() const
-{
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::sinh() const {
   eigen_assert(rows() == cols());
   typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
-  return MatrixFunctionReturnValue<Derived>(derived(), StdStemFunctions<ComplexScalar>::sinh);
+  return MatrixFunctionReturnValue<Derived>(derived(), internal::stem_function_sinh<ComplexScalar>);
 }
 
 template <typename Derived>
-const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cosh() const
-{
+const MatrixFunctionReturnValue<Derived> MatrixBase<Derived>::cosh() const {
   eigen_assert(rows() == cols());
   typedef typename internal::stem_function<Scalar>::ComplexScalar ComplexScalar;
-  return MatrixFunctionReturnValue<Derived>(derived(), StdStemFunctions<ComplexScalar>::cosh);
+  return MatrixFunctionReturnValue<Derived>(derived(), internal::stem_function_cosh<ComplexScalar>);
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_MATRIX_FUNCTION
+#endif  // EIGEN_MATRIX_FUNCTION_H
diff --git a/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixFunctionAtomic.h b/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixFunctionAtomic.h
deleted file mode 100644
index efe332c4..00000000
--- a/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixFunctionAtomic.h
+++ /dev/null
@@ -1,131 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009 Jitse Niesen <jitse@maths.leeds.ac.uk>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_MATRIX_FUNCTION_ATOMIC
-#define EIGEN_MATRIX_FUNCTION_ATOMIC
-
-namespace Eigen { 
-
-/** \ingroup MatrixFunctions_Module
-  * \class MatrixFunctionAtomic
-  * \brief Helper class for computing matrix functions of atomic matrices.
-  *
-  * \internal
-  * Here, an atomic matrix is a triangular matrix whose diagonal
-  * entries are close to each other.
-  */
-template <typename MatrixType>
-class MatrixFunctionAtomic
-{
-  public:
-
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef typename internal::stem_function<Scalar>::type StemFunction;
-    typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
-
-    /** \brief Constructor
-      * \param[in]  f  matrix function to compute.
-      */
-    MatrixFunctionAtomic(StemFunction f) : m_f(f) { }
-
-    /** \brief Compute matrix function of atomic matrix
-      * \param[in]  A  argument of matrix function, should be upper triangular and atomic
-      * \returns  f(A), the matrix function evaluated at the given matrix
-      */
-    MatrixType compute(const MatrixType& A);
-
-  private:
-
-    // Prevent copying
-    MatrixFunctionAtomic(const MatrixFunctionAtomic&);
-    MatrixFunctionAtomic& operator=(const MatrixFunctionAtomic&);
-
-    void computeMu();
-    bool taylorConverged(Index s, const MatrixType& F, const MatrixType& Fincr, const MatrixType& P);
-
-    /** \brief Pointer to scalar function */
-    StemFunction* m_f;
-
-    /** \brief Size of matrix function */
-    Index m_Arows;
-
-    /** \brief Mean of eigenvalues */
-    Scalar m_avgEival;
-
-    /** \brief Argument shifted by mean of eigenvalues */
-    MatrixType m_Ashifted;
-
-    /** \brief Constant used to determine whether Taylor series has converged */
-    RealScalar m_mu;
-};
-
-template <typename MatrixType>
-MatrixType MatrixFunctionAtomic<MatrixType>::compute(const MatrixType& A)
-{
-  // TODO: Use that A is upper triangular
-  m_Arows = A.rows();
-  m_avgEival = A.trace() / Scalar(RealScalar(m_Arows));
-  m_Ashifted = A - m_avgEival * MatrixType::Identity(m_Arows, m_Arows);
-  computeMu();
-  MatrixType F = m_f(m_avgEival, 0) * MatrixType::Identity(m_Arows, m_Arows);
-  MatrixType P = m_Ashifted;
-  MatrixType Fincr;
-  for (Index s = 1; s < 1.1 * m_Arows + 10; s++) { // upper limit is fairly arbitrary
-    Fincr = m_f(m_avgEival, static_cast<int>(s)) * P;
-    F += Fincr;
-    P = Scalar(RealScalar(1.0/(s + 1))) * P * m_Ashifted;
-    if (taylorConverged(s, F, Fincr, P)) {
-      return F;
-    }
-  }
-  eigen_assert("Taylor series does not converge" && 0);
-  return F;
-}
-
-/** \brief Compute \c m_mu. */
-template <typename MatrixType>
-void MatrixFunctionAtomic<MatrixType>::computeMu()
-{
-  const MatrixType N = MatrixType::Identity(m_Arows, m_Arows) - m_Ashifted;
-  VectorType e = VectorType::Ones(m_Arows);
-  N.template triangularView<Upper>().solveInPlace(e);
-  m_mu = e.cwiseAbs().maxCoeff();
-}
-
-/** \brief Determine whether Taylor series has converged */
-template <typename MatrixType>
-bool MatrixFunctionAtomic<MatrixType>::taylorConverged(Index s, const MatrixType& F,
-						       const MatrixType& Fincr, const MatrixType& P)
-{
-  const Index n = F.rows();
-  const RealScalar F_norm = F.cwiseAbs().rowwise().sum().maxCoeff();
-  const RealScalar Fincr_norm = Fincr.cwiseAbs().rowwise().sum().maxCoeff();
-  if (Fincr_norm < NumTraits<Scalar>::epsilon() * F_norm) {
-    RealScalar delta = 0;
-    RealScalar rfactorial = 1;
-    for (Index r = 0; r < n; r++) {
-      RealScalar mx = 0;
-      for (Index i = 0; i < n; i++)
-        mx = (std::max)(mx, std::abs(m_f(m_Ashifted(i, i) + m_avgEival, static_cast<int>(s+r))));
-      if (r != 0)
-        rfactorial *= RealScalar(r);
-      delta = (std::max)(delta, mx / rfactorial);
-    }
-    const RealScalar P_norm = P.cwiseAbs().rowwise().sum().maxCoeff();
-    if (m_mu * delta * P_norm < NumTraits<Scalar>::epsilon() * F_norm)
-      return true;
-  }
-  return false;
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_MATRIX_FUNCTION_ATOMIC
diff --git a/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h b/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
index c744fc05..398971eb 100644
--- a/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
+++ b/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixLogarithm.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2011, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
 // Copyright (C) 2011 Chen-Pang He <jdh8@ms63.hinet.net>
 //
 // This Source Code Form is subject to the terms of the Mozilla
@@ -11,476 +11,356 @@
 #ifndef EIGEN_MATRIX_LOGARITHM
 #define EIGEN_MATRIX_LOGARITHM
 
-#ifndef M_PI
-#define M_PI 3.141592653589793238462643383279503L
-#endif
-
-namespace Eigen { 
-
-/** \ingroup MatrixFunctions_Module
-  * \class MatrixLogarithmAtomic
-  * \brief Helper class for computing matrix logarithm of atomic matrices.
-  *
-  * \internal
-  * Here, an atomic matrix is a triangular matrix whose diagonal
-  * entries are close to each other.
-  *
-  * \sa class MatrixFunctionAtomic, MatrixBase::log()
-  */
-template <typename MatrixType>
-class MatrixLogarithmAtomic
-{
-public:
-
-  typedef typename MatrixType::Scalar Scalar;
-  // typedef typename MatrixType::Index Index;
-  typedef typename NumTraits<Scalar>::Real RealScalar;
-  // typedef typename internal::stem_function<Scalar>::type StemFunction;
-  // typedef Matrix<Scalar, MatrixType::RowsAtCompileTime, 1> VectorType;
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-  /** \brief Constructor. */
-  MatrixLogarithmAtomic() { }
+namespace Eigen {
 
-  /** \brief Compute matrix logarithm of atomic matrix
-    * \param[in]  A  argument of matrix logarithm, should be upper triangular and atomic
-    * \returns  The logarithm of \p A.
-    */
-  MatrixType compute(const MatrixType& A);
+namespace internal {
 
-private:
-
-  void compute2x2(const MatrixType& A, MatrixType& result);
-  void computeBig(const MatrixType& A, MatrixType& result);
-  int getPadeDegree(float normTminusI);
-  int getPadeDegree(double normTminusI);
-  int getPadeDegree(long double normTminusI);
-  void computePade(MatrixType& result, const MatrixType& T, int degree);
-  void computePade3(MatrixType& result, const MatrixType& T);
-  void computePade4(MatrixType& result, const MatrixType& T);
-  void computePade5(MatrixType& result, const MatrixType& T);
-  void computePade6(MatrixType& result, const MatrixType& T);
-  void computePade7(MatrixType& result, const MatrixType& T);
-  void computePade8(MatrixType& result, const MatrixType& T);
-  void computePade9(MatrixType& result, const MatrixType& T);
-  void computePade10(MatrixType& result, const MatrixType& T);
-  void computePade11(MatrixType& result, const MatrixType& T);
-
-  static const int minPadeDegree = 3;
-  static const int maxPadeDegree = std::numeric_limits<RealScalar>::digits<= 24?  5:  // single precision
-                                   std::numeric_limits<RealScalar>::digits<= 53?  7:  // double precision
-                                   std::numeric_limits<RealScalar>::digits<= 64?  8:  // extended precision
-                                   std::numeric_limits<RealScalar>::digits<=106? 10:  // double-double
-                                                                                 11;  // quadruple precision
-
-  // Prevent copying
-  MatrixLogarithmAtomic(const MatrixLogarithmAtomic&);
-  MatrixLogarithmAtomic& operator=(const MatrixLogarithmAtomic&);
+template <typename Scalar>
+struct matrix_log_min_pade_degree {
+  static const int value = 3;
 };
 
-/** \brief Compute logarithm of triangular matrix with clustered eigenvalues. */
-template <typename MatrixType>
-MatrixType MatrixLogarithmAtomic<MatrixType>::compute(const MatrixType& A)
-{
-  using std::log;
-  MatrixType result(A.rows(), A.rows());
-  if (A.rows() == 1)
-    result(0,0) = log(A(0,0));
-  else if (A.rows() == 2)
-    compute2x2(A, result);
-  else
-    computeBig(A, result);
-  return result;
-}
+template <typename Scalar>
+struct matrix_log_max_pade_degree {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  static const int value = std::numeric_limits<RealScalar>::digits <= 24 ? 5 :  // single precision
+                               std::numeric_limits<RealScalar>::digits <= 53 ? 7
+                                                                             :  // double precision
+                               std::numeric_limits<RealScalar>::digits <= 64 ? 8
+                                                                             :  // extended precision
+                               std::numeric_limits<RealScalar>::digits <= 106 ? 10
+                                                                              :  // double-double
+                               11;                                               // quadruple precision
+};
 
 /** \brief Compute logarithm of 2x2 triangular matrix. */
 template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::compute2x2(const MatrixType& A, MatrixType& result)
-{
+void matrix_log_compute_2x2(const MatrixType& A, MatrixType& result) {
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
   using std::abs;
   using std::ceil;
   using std::imag;
   using std::log;
 
-  Scalar logA00 = log(A(0,0));
-  Scalar logA11 = log(A(1,1));
+  Scalar logA00 = log(A(0, 0));
+  Scalar logA11 = log(A(1, 1));
 
-  result(0,0) = logA00;
-  result(1,0) = Scalar(0);
-  result(1,1) = logA11;
+  result(0, 0) = logA00;
+  result(1, 0) = Scalar(0);
+  result(1, 1) = logA11;
 
-  if (A(0,0) == A(1,1)) {
-    result(0,1) = A(0,1) / A(0,0);
-  } else if ((abs(A(0,0)) < 0.5*abs(A(1,1))) || (abs(A(0,0)) > 2*abs(A(1,1)))) {
-    result(0,1) = A(0,1) * (logA11 - logA00) / (A(1,1) - A(0,0));
+  Scalar y = A(1, 1) - A(0, 0);
+  if (y == Scalar(0)) {
+    result(0, 1) = A(0, 1) / A(0, 0);
+  } else if ((abs(A(0, 0)) < RealScalar(0.5) * abs(A(1, 1))) || (abs(A(0, 0)) > 2 * abs(A(1, 1)))) {
+    result(0, 1) = A(0, 1) * (logA11 - logA00) / y;
   } else {
     // computation in previous branch is inaccurate if A(1,1) \approx A(0,0)
-    int unwindingNumber = static_cast<int>(ceil((imag(logA11 - logA00) - M_PI) / (2*M_PI)));
-    Scalar y = A(1,1) - A(0,0), x = A(1,1) + A(0,0);
-    result(0,1) = A(0,1) * (Scalar(2) * numext::atanh2(y,x) + Scalar(0,2*M_PI*unwindingNumber)) / y;
+    RealScalar unwindingNumber = ceil((imag(logA11 - logA00) - RealScalar(EIGEN_PI)) / RealScalar(2 * EIGEN_PI));
+    result(0, 1) = A(0, 1) * (numext::log1p(y / A(0, 0)) + Scalar(0, RealScalar(2 * EIGEN_PI) * unwindingNumber)) / y;
   }
 }
 
-/** \brief Compute logarithm of triangular matrices with size > 2. 
-  * \details This uses a inverse scale-and-square algorithm. */
-template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computeBig(const MatrixType& A, MatrixType& result)
-{
-  using std::pow;
-  int numberOfSquareRoots = 0;
-  int numberOfExtraSquareRoots = 0;
-  int degree;
-  MatrixType T = A, sqrtT;
-  const RealScalar maxNormForPade = maxPadeDegree<= 5? 5.3149729967117310e-1:                     // single precision
-                                    maxPadeDegree<= 7? 2.6429608311114350e-1:                     // double precision
-                                    maxPadeDegree<= 8? 2.32777776523703892094e-1L:                // extended precision
-                                    maxPadeDegree<=10? 1.05026503471351080481093652651105e-1L:    // double-double
-                                                       1.1880960220216759245467951592883642e-1L;  // quadruple precision
-
-  while (true) {
-    RealScalar normTminusI = (T - MatrixType::Identity(T.rows(), T.rows())).cwiseAbs().colwise().sum().maxCoeff();
-    if (normTminusI < maxNormForPade) {
-      degree = getPadeDegree(normTminusI);
-      int degree2 = getPadeDegree(normTminusI / RealScalar(2));
-      if ((degree - degree2 <= 1) || (numberOfExtraSquareRoots == 1)) 
-        break;
-      ++numberOfExtraSquareRoots;
-    }
-    MatrixSquareRootTriangular<MatrixType>(T).compute(sqrtT);
-    T = sqrtT.template triangularView<Upper>();
-    ++numberOfSquareRoots;
-  }
-
-  computePade(result, T, degree);
-  result *= pow(RealScalar(2), numberOfSquareRoots);
-}
-
 /* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = float) */
-template <typename MatrixType>
-int MatrixLogarithmAtomic<MatrixType>::getPadeDegree(float normTminusI)
-{
-  const float maxNormForPade[] = { 2.5111573934555054e-1 /* degree = 3 */ , 4.0535837411880493e-1,
-            5.3149729967117310e-1 };
-  int degree = 3;
-  for (; degree <= maxPadeDegree; ++degree) 
-    if (normTminusI <= maxNormForPade[degree - minPadeDegree])
-      break;
+inline int matrix_log_get_pade_degree(float normTminusI) {
+  const float maxNormForPade[] = {2.5111573934555054e-1 /* degree = 3 */, 4.0535837411880493e-1, 5.3149729967117310e-1};
+  const int minPadeDegree = matrix_log_min_pade_degree<float>::value;
+  const int maxPadeDegree = matrix_log_max_pade_degree<float>::value;
+  int degree = minPadeDegree;
+  for (; degree <= maxPadeDegree; ++degree)
+    if (normTminusI <= maxNormForPade[degree - minPadeDegree]) break;
   return degree;
 }
 
 /* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = double) */
-template <typename MatrixType>
-int MatrixLogarithmAtomic<MatrixType>::getPadeDegree(double normTminusI)
-{
-  const double maxNormForPade[] = { 1.6206284795015624e-2 /* degree = 3 */ , 5.3873532631381171e-2,
-            1.1352802267628681e-1, 1.8662860613541288e-1, 2.642960831111435e-1 };
-  int degree = 3;
+inline int matrix_log_get_pade_degree(double normTminusI) {
+  const double maxNormForPade[] = {1.6206284795015624e-2 /* degree = 3 */, 5.3873532631381171e-2, 1.1352802267628681e-1,
+                                   1.8662860613541288e-1, 2.642960831111435e-1};
+  const int minPadeDegree = matrix_log_min_pade_degree<double>::value;
+  const int maxPadeDegree = matrix_log_max_pade_degree<double>::value;
+  int degree = minPadeDegree;
   for (; degree <= maxPadeDegree; ++degree)
-    if (normTminusI <= maxNormForPade[degree - minPadeDegree])
-      break;
+    if (normTminusI <= maxNormForPade[degree - minPadeDegree]) break;
   return degree;
 }
 
 /* \brief Get suitable degree for Pade approximation. (specialized for RealScalar = long double) */
-template <typename MatrixType>
-int MatrixLogarithmAtomic<MatrixType>::getPadeDegree(long double normTminusI)
-{
-#if   LDBL_MANT_DIG == 53         // double precision
-  const long double maxNormForPade[] = { 1.6206284795015624e-2L /* degree = 3 */ , 5.3873532631381171e-2L,
-            1.1352802267628681e-1L, 1.8662860613541288e-1L, 2.642960831111435e-1L };
-#elif LDBL_MANT_DIG <= 64         // extended precision
-  const long double maxNormForPade[] = { 5.48256690357782863103e-3L /* degree = 3 */, 2.34559162387971167321e-2L,
-            5.84603923897347449857e-2L, 1.08486423756725170223e-1L, 1.68385767881294446649e-1L,
-            2.32777776523703892094e-1L };
-#elif LDBL_MANT_DIG <= 106        // double-double
-  const long double maxNormForPade[] = { 8.58970550342939562202529664318890e-5L /* degree = 3 */,
-            9.34074328446359654039446552677759e-4L, 4.26117194647672175773064114582860e-3L,
-            1.21546224740281848743149666560464e-2L, 2.61100544998339436713088248557444e-2L,
-            4.66170074627052749243018566390567e-2L, 7.32585144444135027565872014932387e-2L,
-            1.05026503471351080481093652651105e-1L };
-#else                             // quadruple precision
-  const long double maxNormForPade[] = { 4.7419931187193005048501568167858103e-5L /* degree = 3 */,
-            5.8853168473544560470387769480192666e-4L, 2.9216120366601315391789493628113520e-3L,
-            8.8415758124319434347116734705174308e-3L, 1.9850836029449446668518049562565291e-2L,
-            3.6688019729653446926585242192447447e-2L, 5.9290962294020186998954055264528393e-2L,
-            8.6998436081634343903250580992127677e-2L, 1.1880960220216759245467951592883642e-1L };
+inline int matrix_log_get_pade_degree(long double normTminusI) {
+#if LDBL_MANT_DIG == 53  // double precision
+  const long double maxNormForPade[] = {1.6206284795015624e-2L /* degree = 3 */, 5.3873532631381171e-2L,
+                                        1.1352802267628681e-1L, 1.8662860613541288e-1L, 2.642960831111435e-1L};
+#elif LDBL_MANT_DIG <= 64   // extended precision
+  const long double maxNormForPade[] = {5.48256690357782863103e-3L /* degree = 3 */,
+                                        2.34559162387971167321e-2L,
+                                        5.84603923897347449857e-2L,
+                                        1.08486423756725170223e-1L,
+                                        1.68385767881294446649e-1L,
+                                        2.32777776523703892094e-1L};
+#elif LDBL_MANT_DIG <= 106  // double-double
+  const long double maxNormForPade[] = {8.58970550342939562202529664318890e-5L /* degree = 3 */,
+                                        9.34074328446359654039446552677759e-4L,
+                                        4.26117194647672175773064114582860e-3L,
+                                        1.21546224740281848743149666560464e-2L,
+                                        2.61100544998339436713088248557444e-2L,
+                                        4.66170074627052749243018566390567e-2L,
+                                        7.32585144444135027565872014932387e-2L,
+                                        1.05026503471351080481093652651105e-1L};
+#else                       // quadruple precision
+  const long double maxNormForPade[] = {4.7419931187193005048501568167858103e-5L /* degree = 3 */,
+                                        5.8853168473544560470387769480192666e-4L,
+                                        2.9216120366601315391789493628113520e-3L,
+                                        8.8415758124319434347116734705174308e-3L,
+                                        1.9850836029449446668518049562565291e-2L,
+                                        3.6688019729653446926585242192447447e-2L,
+                                        5.9290962294020186998954055264528393e-2L,
+                                        8.6998436081634343903250580992127677e-2L,
+                                        1.1880960220216759245467951592883642e-1L};
 #endif
-  int degree = 3;
+  const int minPadeDegree = matrix_log_min_pade_degree<long double>::value;
+  const int maxPadeDegree = matrix_log_max_pade_degree<long double>::value;
+  int degree = minPadeDegree;
   for (; degree <= maxPadeDegree; ++degree)
-    if (normTminusI <= maxNormForPade[degree - minPadeDegree])
-      break;
+    if (normTminusI <= maxNormForPade[degree - minPadeDegree]) break;
   return degree;
 }
 
 /* \brief Compute Pade approximation to matrix logarithm */
 template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade(MatrixType& result, const MatrixType& T, int degree)
-{
-  switch (degree) {
-    case 3:  computePade3(result, T);  break;
-    case 4:  computePade4(result, T);  break;
-    case 5:  computePade5(result, T);  break;
-    case 6:  computePade6(result, T);  break;
-    case 7:  computePade7(result, T);  break;
-    case 8:  computePade8(result, T);  break;
-    case 9:  computePade9(result, T);  break;
-    case 10: computePade10(result, T); break;
-    case 11: computePade11(result, T); break;
-    default: assert(false); // should never happen
-  }
-} 
+void matrix_log_compute_pade(MatrixType& result, const MatrixType& T, int degree) {
+  typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
+  const int minPadeDegree = 3;
+  const int maxPadeDegree = 11;
+  eigen_assert(degree >= minPadeDegree && degree <= maxPadeDegree);
+  // FIXME this creates float-conversion-warnings if these are enabled.
+  // Either manually convert each value, or disable the warning locally
+  const RealScalar nodes[][maxPadeDegree] = {
+      {0.1127016653792583114820734600217600L, 0.5000000000000000000000000000000000L,  // degree 3
+       0.8872983346207416885179265399782400L},
+      {0.0694318442029737123880267555535953L, 0.3300094782075718675986671204483777L,  // degree 4
+       0.6699905217924281324013328795516223L, 0.9305681557970262876119732444464048L},
+      {0.0469100770306680036011865608503035L, 0.2307653449471584544818427896498956L,  // degree 5
+       0.5000000000000000000000000000000000L, 0.7692346550528415455181572103501044L,
+       0.9530899229693319963988134391496965L},
+      {0.0337652428984239860938492227530027L, 0.1693953067668677431693002024900473L,  // degree 6
+       0.3806904069584015456847491391596440L, 0.6193095930415984543152508608403560L,
+       0.8306046932331322568306997975099527L, 0.9662347571015760139061507772469973L},
+      {0.0254460438286207377369051579760744L, 0.1292344072003027800680676133596058L,  // degree 7
+       0.2970774243113014165466967939615193L, 0.5000000000000000000000000000000000L,
+       0.7029225756886985834533032060384807L, 0.8707655927996972199319323866403942L,
+       0.9745539561713792622630948420239256L},
+      {0.0198550717512318841582195657152635L, 0.1016667612931866302042230317620848L,  // degree 8
+       0.2372337950418355070911304754053768L, 0.4082826787521750975302619288199080L,
+       0.5917173212478249024697380711800920L, 0.7627662049581644929088695245946232L,
+       0.8983332387068133697957769682379152L, 0.9801449282487681158417804342847365L},
+      {0.0159198802461869550822118985481636L, 0.0819844463366821028502851059651326L,  // degree 9
+       0.1933142836497048013456489803292629L, 0.3378732882980955354807309926783317L,
+       0.5000000000000000000000000000000000L, 0.6621267117019044645192690073216683L,
+       0.8066857163502951986543510196707371L, 0.9180155536633178971497148940348674L,
+       0.9840801197538130449177881014518364L},
+      {0.0130467357414141399610179939577740L, 0.0674683166555077446339516557882535L,  // degree 10
+       0.1602952158504877968828363174425632L, 0.2833023029353764046003670284171079L,
+       0.4255628305091843945575869994351400L, 0.5744371694908156054424130005648600L,
+       0.7166976970646235953996329715828921L, 0.8397047841495122031171636825574368L,
+       0.9325316833444922553660483442117465L, 0.9869532642585858600389820060422260L},
+      {0.0108856709269715035980309994385713L, 0.0564687001159523504624211153480364L,  // degree 11
+       0.1349239972129753379532918739844233L, 0.2404519353965940920371371652706952L,
+       0.3652284220238275138342340072995692L, 0.5000000000000000000000000000000000L,
+       0.6347715779761724861657659927004308L, 0.7595480646034059079628628347293048L,
+       0.8650760027870246620467081260155767L, 0.9435312998840476495375788846519636L,
+       0.9891143290730284964019690005614287L}};
+
+  const RealScalar weights[][maxPadeDegree] = {
+      {0.2777777777777777777777777777777778L, 0.4444444444444444444444444444444444L,  // degree 3
+       0.2777777777777777777777777777777778L},
+      {0.1739274225687269286865319746109997L, 0.3260725774312730713134680253890003L,  // degree 4
+       0.3260725774312730713134680253890003L, 0.1739274225687269286865319746109997L},
+      {0.1184634425280945437571320203599587L, 0.2393143352496832340206457574178191L,  // degree 5
+       0.2844444444444444444444444444444444L, 0.2393143352496832340206457574178191L,
+       0.1184634425280945437571320203599587L},
+      {0.0856622461895851725201480710863665L, 0.1803807865240693037849167569188581L,  // degree 6
+       0.2339569672863455236949351719947755L, 0.2339569672863455236949351719947755L,
+       0.1803807865240693037849167569188581L, 0.0856622461895851725201480710863665L},
+      {0.0647424830844348466353057163395410L, 0.1398526957446383339507338857118898L,  // degree 7
+       0.1909150252525594724751848877444876L, 0.2089795918367346938775510204081633L,
+       0.1909150252525594724751848877444876L, 0.1398526957446383339507338857118898L,
+       0.0647424830844348466353057163395410L},
+      {0.0506142681451881295762656771549811L, 0.1111905172266872352721779972131204L,  // degree 8
+       0.1568533229389436436689811009933007L, 0.1813418916891809914825752246385978L,
+       0.1813418916891809914825752246385978L, 0.1568533229389436436689811009933007L,
+       0.1111905172266872352721779972131204L, 0.0506142681451881295762656771549811L},
+      {0.0406371941807872059859460790552618L, 0.0903240803474287020292360156214564L,  // degree 9
+       0.1303053482014677311593714347093164L, 0.1561735385200014200343152032922218L,
+       0.1651196775006298815822625346434870L, 0.1561735385200014200343152032922218L,
+       0.1303053482014677311593714347093164L, 0.0903240803474287020292360156214564L,
+       0.0406371941807872059859460790552618L},
+      {0.0333356721543440687967844049466659L, 0.0747256745752902965728881698288487L,  // degree 10
+       0.1095431812579910219977674671140816L, 0.1346333596549981775456134607847347L,
+       0.1477621123573764350869464973256692L, 0.1477621123573764350869464973256692L,
+       0.1346333596549981775456134607847347L, 0.1095431812579910219977674671140816L,
+       0.0747256745752902965728881698288487L, 0.0333356721543440687967844049466659L},
+      {0.0278342835580868332413768602212743L, 0.0627901847324523123173471496119701L,  // degree 11
+       0.0931451054638671257130488207158280L, 0.1165968822959952399592618524215876L,
+       0.1314022722551233310903444349452546L, 0.1364625433889503153572417641681711L,
+       0.1314022722551233310903444349452546L, 0.1165968822959952399592618524215876L,
+       0.0931451054638671257130488207158280L, 0.0627901847324523123173471496119701L,
+       0.0278342835580868332413768602212743L}};
 
-template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade3(MatrixType& result, const MatrixType& T)
-{
-  const int degree = 3;
-  const RealScalar nodes[]   = { 0.1127016653792583114820734600217600L, 0.5000000000000000000000000000000000L,
-            0.8872983346207416885179265399782400L };
-  const RealScalar weights[] = { 0.2777777777777777777777777777777778L, 0.4444444444444444444444444444444444L,
-            0.2777777777777777777777777777777778L };
-  eigen_assert(degree <= maxPadeDegree);
   MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
   result.setZero(T.rows(), T.rows());
-  for (int k = 0; k < degree; ++k)
-    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
-                           .template triangularView<Upper>().solve(TminusI);
+  for (int k = 0; k < degree; ++k) {
+    RealScalar weight = weights[degree - minPadeDegree][k];
+    RealScalar node = nodes[degree - minPadeDegree][k];
+    result +=
+        weight *
+        (MatrixType::Identity(T.rows(), T.rows()) + node * TminusI).template triangularView<Upper>().solve(TminusI);
+  }
 }
 
+/** \brief Compute logarithm of triangular matrices with size > 2.
+ * \details This uses a inverse scale-and-square algorithm. */
 template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade4(MatrixType& result, const MatrixType& T)
-{
-  const int degree = 4;
-  const RealScalar nodes[]   = { 0.0694318442029737123880267555535953L, 0.3300094782075718675986671204483777L,
-            0.6699905217924281324013328795516223L, 0.9305681557970262876119732444464048L };
-  const RealScalar weights[] = { 0.1739274225687269286865319746109997L, 0.3260725774312730713134680253890003L,
-            0.3260725774312730713134680253890003L, 0.1739274225687269286865319746109997L };
-  eigen_assert(degree <= maxPadeDegree);
-  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
-  result.setZero(T.rows(), T.rows());
-  for (int k = 0; k < degree; ++k)
-    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
-                           .template triangularView<Upper>().solve(TminusI);
-}
+void matrix_log_compute_big(const MatrixType& A, MatrixType& result) {
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  using std::pow;
 
-template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade5(MatrixType& result, const MatrixType& T)
-{
-  const int degree = 5;
-  const RealScalar nodes[]   = { 0.0469100770306680036011865608503035L, 0.2307653449471584544818427896498956L,
-            0.5000000000000000000000000000000000L, 0.7692346550528415455181572103501044L,
-            0.9530899229693319963988134391496965L };
-  const RealScalar weights[] = { 0.1184634425280945437571320203599587L, 0.2393143352496832340206457574178191L,
-            0.2844444444444444444444444444444444L, 0.2393143352496832340206457574178191L,
-            0.1184634425280945437571320203599587L };
-  eigen_assert(degree <= maxPadeDegree);
-  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
-  result.setZero(T.rows(), T.rows());
-  for (int k = 0; k < degree; ++k)
-    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
-                           .template triangularView<Upper>().solve(TminusI);
-}
+  int numberOfSquareRoots = 0;
+  int numberOfExtraSquareRoots = 0;
+  int degree;
+  MatrixType T = A, sqrtT;
 
-template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade6(MatrixType& result, const MatrixType& T)
-{
-  const int degree = 6;
-  const RealScalar nodes[]   = { 0.0337652428984239860938492227530027L, 0.1693953067668677431693002024900473L,
-            0.3806904069584015456847491391596440L, 0.6193095930415984543152508608403560L,
-            0.8306046932331322568306997975099527L, 0.9662347571015760139061507772469973L };
-  const RealScalar weights[] = { 0.0856622461895851725201480710863665L, 0.1803807865240693037849167569188581L,
-            0.2339569672863455236949351719947755L, 0.2339569672863455236949351719947755L,
-            0.1803807865240693037849167569188581L, 0.0856622461895851725201480710863665L };
-  eigen_assert(degree <= maxPadeDegree);
-  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
-  result.setZero(T.rows(), T.rows());
-  for (int k = 0; k < degree; ++k)
-    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
-                           .template triangularView<Upper>().solve(TminusI);
-}
+  const int maxPadeDegree = matrix_log_max_pade_degree<Scalar>::value;
+  const RealScalar maxNormForPade = RealScalar(maxPadeDegree <= 5 ? 5.3149729967117310e-1L :  // single precision
+                                                   maxPadeDegree <= 7 ? 2.6429608311114350e-1L
+                                                                      :  // double precision
+                                                   maxPadeDegree <= 8 ? 2.32777776523703892094e-1L
+                                                                      :  // extended precision
+                                                   maxPadeDegree <= 10 ? 1.05026503471351080481093652651105e-1L
+                                                                       :                       // double-double
+                                                   1.1880960220216759245467951592883642e-1L);  // quadruple precision
 
-template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade7(MatrixType& result, const MatrixType& T)
-{
-  const int degree = 7;
-  const RealScalar nodes[]   = { 0.0254460438286207377369051579760744L, 0.1292344072003027800680676133596058L,
-            0.2970774243113014165466967939615193L, 0.5000000000000000000000000000000000L,
-            0.7029225756886985834533032060384807L, 0.8707655927996972199319323866403942L,
-            0.9745539561713792622630948420239256L };
-  const RealScalar weights[] = { 0.0647424830844348466353057163395410L, 0.1398526957446383339507338857118898L,
-            0.1909150252525594724751848877444876L, 0.2089795918367346938775510204081633L,
-            0.1909150252525594724751848877444876L, 0.1398526957446383339507338857118898L,
-            0.0647424830844348466353057163395410L };
-  eigen_assert(degree <= maxPadeDegree);
-  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
-  result.setZero(T.rows(), T.rows());
-  for (int k = 0; k < degree; ++k)
-    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
-                           .template triangularView<Upper>().solve(TminusI);
-}
+  while (true) {
+    RealScalar normTminusI = (T - MatrixType::Identity(T.rows(), T.rows())).cwiseAbs().colwise().sum().maxCoeff();
+    if (normTminusI < maxNormForPade) {
+      degree = matrix_log_get_pade_degree(normTminusI);
+      int degree2 = matrix_log_get_pade_degree(normTminusI / RealScalar(2));
+      if ((degree - degree2 <= 1) || (numberOfExtraSquareRoots == 1)) break;
+      ++numberOfExtraSquareRoots;
+    }
+    matrix_sqrt_triangular(T, sqrtT);
+    T = sqrtT.template triangularView<Upper>();
+    ++numberOfSquareRoots;
+  }
 
-template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade8(MatrixType& result, const MatrixType& T)
-{
-  const int degree = 8;
-  const RealScalar nodes[]   = { 0.0198550717512318841582195657152635L, 0.1016667612931866302042230317620848L,
-            0.2372337950418355070911304754053768L, 0.4082826787521750975302619288199080L,
-            0.5917173212478249024697380711800920L, 0.7627662049581644929088695245946232L,
-            0.8983332387068133697957769682379152L, 0.9801449282487681158417804342847365L };
-  const RealScalar weights[] = { 0.0506142681451881295762656771549811L, 0.1111905172266872352721779972131204L,
-            0.1568533229389436436689811009933007L, 0.1813418916891809914825752246385978L,
-            0.1813418916891809914825752246385978L, 0.1568533229389436436689811009933007L,
-            0.1111905172266872352721779972131204L, 0.0506142681451881295762656771549811L };
-  eigen_assert(degree <= maxPadeDegree);
-  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
-  result.setZero(T.rows(), T.rows());
-  for (int k = 0; k < degree; ++k)
-    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
-                           .template triangularView<Upper>().solve(TminusI);
+  matrix_log_compute_pade(result, T, degree);
+  result *= pow(RealScalar(2), RealScalar(numberOfSquareRoots));  // TODO replace by bitshift if possible
 }
 
+/** \ingroup MatrixFunctions_Module
+ * \class MatrixLogarithmAtomic
+ * \brief Helper class for computing matrix logarithm of atomic matrices.
+ *
+ * Here, an atomic matrix is a triangular matrix whose diagonal entries are close to each other.
+ *
+ * \sa class MatrixFunctionAtomic, MatrixBase::log()
+ */
 template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade9(MatrixType& result, const MatrixType& T)
-{
-  const int degree = 9;
-  const RealScalar nodes[]   = { 0.0159198802461869550822118985481636L, 0.0819844463366821028502851059651326L,
-            0.1933142836497048013456489803292629L, 0.3378732882980955354807309926783317L,
-            0.5000000000000000000000000000000000L, 0.6621267117019044645192690073216683L,
-            0.8066857163502951986543510196707371L, 0.9180155536633178971497148940348674L,
-            0.9840801197538130449177881014518364L };
-  const RealScalar weights[] = { 0.0406371941807872059859460790552618L, 0.0903240803474287020292360156214564L,
-            0.1303053482014677311593714347093164L, 0.1561735385200014200343152032922218L,
-            0.1651196775006298815822625346434870L, 0.1561735385200014200343152032922218L,
-            0.1303053482014677311593714347093164L, 0.0903240803474287020292360156214564L,
-            0.0406371941807872059859460790552618L };
-  eigen_assert(degree <= maxPadeDegree);
-  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
-  result.setZero(T.rows(), T.rows());
-  for (int k = 0; k < degree; ++k)
-    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
-                           .template triangularView<Upper>().solve(TminusI);
-}
+class MatrixLogarithmAtomic {
+ public:
+  /** \brief Compute matrix logarithm of atomic matrix
+   * \param[in]  A  argument of matrix logarithm, should be upper triangular and atomic
+   * \returns  The logarithm of \p A.
+   */
+  MatrixType compute(const MatrixType& A);
+};
 
 template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade10(MatrixType& result, const MatrixType& T)
-{
-  const int degree = 10;
-  const RealScalar nodes[]   = { 0.0130467357414141399610179939577740L, 0.0674683166555077446339516557882535L,
-            0.1602952158504877968828363174425632L, 0.2833023029353764046003670284171079L,
-            0.4255628305091843945575869994351400L, 0.5744371694908156054424130005648600L,
-            0.7166976970646235953996329715828921L, 0.8397047841495122031171636825574368L,
-            0.9325316833444922553660483442117465L, 0.9869532642585858600389820060422260L };
-  const RealScalar weights[] = { 0.0333356721543440687967844049466659L, 0.0747256745752902965728881698288487L,
-            0.1095431812579910219977674671140816L, 0.1346333596549981775456134607847347L,
-            0.1477621123573764350869464973256692L, 0.1477621123573764350869464973256692L,
-            0.1346333596549981775456134607847347L, 0.1095431812579910219977674671140816L,
-            0.0747256745752902965728881698288487L, 0.0333356721543440687967844049466659L };
-  eigen_assert(degree <= maxPadeDegree);
-  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
-  result.setZero(T.rows(), T.rows());
-  for (int k = 0; k < degree; ++k)
-    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
-                           .template triangularView<Upper>().solve(TminusI);
+MatrixType MatrixLogarithmAtomic<MatrixType>::compute(const MatrixType& A) {
+  using std::log;
+  MatrixType result(A.rows(), A.rows());
+  if (A.rows() == 1)
+    result(0, 0) = log(A(0, 0));
+  else if (A.rows() == 2)
+    matrix_log_compute_2x2(A, result);
+  else
+    matrix_log_compute_big(A, result);
+  return result;
 }
 
-template <typename MatrixType>
-void MatrixLogarithmAtomic<MatrixType>::computePade11(MatrixType& result, const MatrixType& T)
-{
-  const int degree = 11;
-  const RealScalar nodes[]   = { 0.0108856709269715035980309994385713L, 0.0564687001159523504624211153480364L,
-            0.1349239972129753379532918739844233L, 0.2404519353965940920371371652706952L,
-            0.3652284220238275138342340072995692L, 0.5000000000000000000000000000000000L,
-            0.6347715779761724861657659927004308L, 0.7595480646034059079628628347293048L,
-            0.8650760027870246620467081260155767L, 0.9435312998840476495375788846519636L,
-            0.9891143290730284964019690005614287L };
-  const RealScalar weights[] = { 0.0278342835580868332413768602212743L, 0.0627901847324523123173471496119701L,
-            0.0931451054638671257130488207158280L, 0.1165968822959952399592618524215876L,
-            0.1314022722551233310903444349452546L, 0.1364625433889503153572417641681711L,
-            0.1314022722551233310903444349452546L, 0.1165968822959952399592618524215876L,
-            0.0931451054638671257130488207158280L, 0.0627901847324523123173471496119701L,
-            0.0278342835580868332413768602212743L };
-  eigen_assert(degree <= maxPadeDegree);
-  MatrixType TminusI = T - MatrixType::Identity(T.rows(), T.rows());
-  result.setZero(T.rows(), T.rows());
-  for (int k = 0; k < degree; ++k)
-    result += weights[k] * (MatrixType::Identity(T.rows(), T.rows()) + nodes[k] * TminusI)
-                           .template triangularView<Upper>().solve(TminusI);
-}
+}  // end of namespace internal
 
 /** \ingroup MatrixFunctions_Module
-  *
-  * \brief Proxy for the matrix logarithm of some matrix (expression).
-  *
-  * \tparam Derived  Type of the argument to the matrix function.
-  *
-  * This class holds the argument to the matrix function until it is
-  * assigned or evaluated for some other reason (so the argument
-  * should not be changed in the meantime). It is the return type of
-  * MatrixBase::log() and most of the time this is the only way it
-  * is used.
-  */
-template<typename Derived> class MatrixLogarithmReturnValue
-: public ReturnByValue<MatrixLogarithmReturnValue<Derived> >
-{
-public:
-
+ *
+ * \brief Proxy for the matrix logarithm of some matrix (expression).
+ *
+ * \tparam Derived  Type of the argument to the matrix function.
+ *
+ * This class holds the argument to the matrix function until it is
+ * assigned or evaluated for some other reason (so the argument
+ * should not be changed in the meantime). It is the return type of
+ * MatrixBase::log() and most of the time this is the only way it
+ * is used.
+ */
+template <typename Derived>
+class MatrixLogarithmReturnValue : public ReturnByValue<MatrixLogarithmReturnValue<Derived> > {
+ public:
   typedef typename Derived::Scalar Scalar;
   typedef typename Derived::Index Index;
 
+ protected:
+  typedef typename internal::ref_selector<Derived>::type DerivedNested;
+
+ public:
   /** \brief Constructor.
-    *
-    * \param[in]  A  %Matrix (expression) forming the argument of the matrix logarithm.
-    */
-  MatrixLogarithmReturnValue(const Derived& A) : m_A(A) { }
-  
+   *
+   * \param[in]  A  %Matrix (expression) forming the argument of the matrix logarithm.
+   */
+  explicit MatrixLogarithmReturnValue(const Derived& A) : m_A(A) {}
+
   /** \brief Compute the matrix logarithm.
-    *
-    * \param[out]  result  Logarithm of \p A, where \A is as specified in the constructor.
-    */
+   *
+   * \param[out]  result  Logarithm of \c A, where \c A is as specified in the constructor.
+   */
   template <typename ResultType>
-  inline void evalTo(ResultType& result) const
-  {
-    typedef typename Derived::PlainObject PlainObject;
-    typedef internal::traits<PlainObject> Traits;
-    static const int RowsAtCompileTime = Traits::RowsAtCompileTime;
-    static const int ColsAtCompileTime = Traits::ColsAtCompileTime;
-    static const int Options = PlainObject::Options;
-    typedef std::complex<typename NumTraits<Scalar>::Real> ComplexScalar;
-    typedef Matrix<ComplexScalar, Dynamic, Dynamic, Options, RowsAtCompileTime, ColsAtCompileTime> DynMatrixType;
-    typedef MatrixLogarithmAtomic<DynMatrixType> AtomicType;
+  inline void evalTo(ResultType& result) const {
+    typedef typename internal::nested_eval<Derived, 10>::type DerivedEvalType;
+    typedef internal::remove_all_t<DerivedEvalType> DerivedEvalTypeClean;
+    typedef internal::traits<DerivedEvalTypeClean> Traits;
+    typedef internal::make_complex_t<Scalar> ComplexScalar;
+    typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, Traits::RowsAtCompileTime, Traits::ColsAtCompileTime>
+        DynMatrixType;
+    typedef internal::MatrixLogarithmAtomic<DynMatrixType> AtomicType;
     AtomicType atomic;
-    
-    const PlainObject Aevaluated = m_A.eval();
-    MatrixFunction<PlainObject, AtomicType> mf(Aevaluated, atomic);
-    mf.compute(result);
+
+    internal::matrix_function_compute<typename DerivedEvalTypeClean::PlainObject>::run(m_A, atomic, result);
   }
 
   Index rows() const { return m_A.rows(); }
   Index cols() const { return m_A.cols(); }
-  
-private:
-  typename internal::nested<Derived>::type m_A;
-  
-  MatrixLogarithmReturnValue& operator=(const MatrixLogarithmReturnValue&);
+
+ private:
+  const DerivedNested m_A;
 };
 
 namespace internal {
-  template<typename Derived>
-  struct traits<MatrixLogarithmReturnValue<Derived> >
-  {
-    typedef typename Derived::PlainObject ReturnType;
-  };
-}
-
+template <typename Derived>
+struct traits<MatrixLogarithmReturnValue<Derived> > {
+  typedef typename Derived::PlainObject ReturnType;
+};
+}  // namespace internal
 
 /********** MatrixBase method **********/
 
-
 template <typename Derived>
-const MatrixLogarithmReturnValue<Derived> MatrixBase<Derived>::log() const
-{
+const MatrixLogarithmReturnValue<Derived> MatrixBase<Derived>::log() const {
   eigen_assert(rows() == cols());
   return MatrixLogarithmReturnValue<Derived>(derived());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_MATRIX_LOGARITHM
+#endif  // EIGEN_MATRIX_LOGARITHM
diff --git a/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h b/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
index 78a307e9..a420ee70 100644
--- a/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
+++ b/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixPower.h
@@ -10,78 +10,139 @@
 #ifndef EIGEN_MATRIX_POWER
 #define EIGEN_MATRIX_POWER
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
-template<typename MatrixType> class MatrixPower;
+template <typename MatrixType>
+class MatrixPower;
 
-template<typename MatrixType>
-class MatrixPowerRetval : public ReturnByValue< MatrixPowerRetval<MatrixType> >
-{
-  public:
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
+/**
+ * \ingroup MatrixFunctions_Module
+ *
+ * \brief Proxy for the matrix power of some matrix.
+ *
+ * \tparam MatrixType  type of the base, a matrix.
+ *
+ * This class holds the arguments to the matrix power until it is
+ * assigned or evaluated for some other reason (so the argument
+ * should not be changed in the meantime). It is the return type of
+ * MatrixPower::operator() and related functions and most of the
+ * time this is the only way it is used.
+ */
+/* TODO This class is only used by MatrixPower, so it should be nested
+ * into MatrixPower, like MatrixPower::ReturnValue. However, my
+ * compiler complained about unused template parameter in the
+ * following declaration in namespace internal.
+ *
+ * template<typename MatrixType>
+ * struct traits<MatrixPower<MatrixType>::ReturnValue>;
+ */
+template <typename MatrixType>
+class MatrixPowerParenthesesReturnValue : public ReturnByValue<MatrixPowerParenthesesReturnValue<MatrixType> > {
+ public:
+  typedef typename MatrixType::RealScalar RealScalar;
 
-    MatrixPowerRetval(MatrixPower<MatrixType>& pow, RealScalar p) : m_pow(pow), m_p(p)
-    { }
+  /**
+   * \brief Constructor.
+   *
+   * \param[in] pow  %MatrixPower storing the base.
+   * \param[in] p    scalar, the exponent of the matrix power.
+   */
+  MatrixPowerParenthesesReturnValue(MatrixPower<MatrixType>& pow, RealScalar p) : m_pow(pow), m_p(p) {}
 
-    template<typename ResultType>
-    inline void evalTo(ResultType& res) const
-    { m_pow.compute(res, m_p); }
+  /**
+   * \brief Compute the matrix power.
+   *
+   * \param[out] result
+   */
+  template <typename ResultType>
+  inline void evalTo(ResultType& result) const {
+    m_pow.compute(result, m_p);
+  }
 
-    Index rows() const { return m_pow.rows(); }
-    Index cols() const { return m_pow.cols(); }
+  Index rows() const { return m_pow.rows(); }
+  Index cols() const { return m_pow.cols(); }
 
-  private:
-    MatrixPower<MatrixType>& m_pow;
-    const RealScalar m_p;
-    MatrixPowerRetval& operator=(const MatrixPowerRetval&);
+ private:
+  MatrixPower<MatrixType>& m_pow;
+  const RealScalar m_p;
 };
 
-template<typename MatrixType>
-class MatrixPowerAtomic
-{
-  private:
-    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime
-    };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef std::complex<RealScalar> ComplexScalar;
-    typedef typename MatrixType::Index Index;
-    typedef Array<Scalar, RowsAtCompileTime, 1, ColMajor, MaxRowsAtCompileTime> ArrayType;
-
-    const MatrixType& m_A;
-    RealScalar m_p;
-
-    void computePade(int degree, const MatrixType& IminusT, MatrixType& res) const;
-    void compute2x2(MatrixType& res, RealScalar p) const;
-    void computeBig(MatrixType& res) const;
-    static int getPadeDegree(float normIminusT);
-    static int getPadeDegree(double normIminusT);
-    static int getPadeDegree(long double normIminusT);
-    static ComplexScalar computeSuperDiag(const ComplexScalar&, const ComplexScalar&, RealScalar p);
-    static RealScalar computeSuperDiag(RealScalar, RealScalar, RealScalar p);
-
-  public:
-    MatrixPowerAtomic(const MatrixType& T, RealScalar p);
-    void compute(MatrixType& res) const;
+/**
+ * \ingroup MatrixFunctions_Module
+ *
+ * \brief Class for computing matrix powers.
+ *
+ * \tparam MatrixType  type of the base, expected to be an instantiation
+ * of the Matrix class template.
+ *
+ * This class is capable of computing triangular real/complex matrices
+ * raised to a power in the interval \f$ (-1, 1) \f$.
+ *
+ * \note Currently this class is only used by MatrixPower. One may
+ * insist that this be nested into MatrixPower. This class is here to
+ * facilitate future development of triangular matrix functions.
+ */
+template <typename MatrixType>
+class MatrixPowerAtomic : internal::noncopyable {
+ private:
+  enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime };
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+  typedef internal::make_complex_t<Scalar> ComplexScalar;
+  typedef Block<MatrixType, Dynamic, Dynamic> ResultType;
+
+  const MatrixType& m_A;
+  RealScalar m_p;
+
+  void computePade(int degree, const MatrixType& IminusT, ResultType& res) const;
+  void compute2x2(ResultType& res, RealScalar p) const;
+  void computeBig(ResultType& res) const;
+  static int getPadeDegree(float normIminusT);
+  static int getPadeDegree(double normIminusT);
+  static int getPadeDegree(long double normIminusT);
+  static ComplexScalar computeSuperDiag(const ComplexScalar&, const ComplexScalar&, RealScalar p);
+  static RealScalar computeSuperDiag(RealScalar, RealScalar, RealScalar p);
+
+ public:
+  /**
+   * \brief Constructor.
+   *
+   * \param[in] T  the base of the matrix power.
+   * \param[in] p  the exponent of the matrix power, should be in
+   * \f$ (-1, 1) \f$.
+   *
+   * The class stores a reference to T, so it should not be changed
+   * (or destroyed) before evaluation. Only the upper triangular
+   * part of T is read.
+   */
+  MatrixPowerAtomic(const MatrixType& T, RealScalar p);
+
+  /**
+   * \brief Compute the matrix power.
+   *
+   * \param[out] res  \f$ A^p \f$ where A and p are specified in the
+   * constructor.
+   */
+  void compute(ResultType& res) const;
 };
 
-template<typename MatrixType>
-MatrixPowerAtomic<MatrixType>::MatrixPowerAtomic(const MatrixType& T, RealScalar p) :
-  m_A(T), m_p(p)
-{ eigen_assert(T.rows() == T.cols()); }
+template <typename MatrixType>
+MatrixPowerAtomic<MatrixType>::MatrixPowerAtomic(const MatrixType& T, RealScalar p) : m_A(T), m_p(p) {
+  eigen_assert(T.rows() == T.cols());
+  eigen_assert(p > -1 && p < 1);
+}
 
-template<typename MatrixType>
-void MatrixPowerAtomic<MatrixType>::compute(MatrixType& res) const
-{
-  res.resizeLike(m_A);
+template <typename MatrixType>
+void MatrixPowerAtomic<MatrixType>::compute(ResultType& res) const {
+  using std::pow;
   switch (m_A.rows()) {
     case 0:
       break;
     case 1:
-      res(0,0) = std::pow(m_A(0,0), m_p);
+      res(0, 0) = pow(m_A(0, 0), m_p);
       break;
     case 2:
       compute2x2(res, m_p);
@@ -91,164 +152,169 @@ void MatrixPowerAtomic<MatrixType>::compute(MatrixType& res) const
   }
 }
 
-template<typename MatrixType>
-void MatrixPowerAtomic<MatrixType>::computePade(int degree, const MatrixType& IminusT, MatrixType& res) const
-{
-  int i = degree<<1;
-  res = (m_p-degree) / ((i-1)<<1) * IminusT;
+template <typename MatrixType>
+void MatrixPowerAtomic<MatrixType>::computePade(int degree, const MatrixType& IminusT, ResultType& res) const {
+  int i = 2 * degree;
+  res = (m_p - RealScalar(degree)) / RealScalar(2 * i - 2) * IminusT;
+
   for (--i; i; --i) {
-    res = (MatrixType::Identity(IminusT.rows(), IminusT.cols()) + res).template triangularView<Upper>()
-	.solve((i==1 ? -m_p : i&1 ? (-m_p-(i>>1))/(i<<1) : (m_p-(i>>1))/((i-1)<<1)) * IminusT).eval();
+    res = (MatrixType::Identity(IminusT.rows(), IminusT.cols()) + res)
+              .template triangularView<Upper>()
+              .solve((i == 1  ? -m_p
+                      : i & 1 ? (-m_p - RealScalar(i / 2)) / RealScalar(2 * i)
+                              : (m_p - RealScalar(i / 2)) / RealScalar(2 * i - 2)) *
+                     IminusT)
+              .eval();
   }
   res += MatrixType::Identity(IminusT.rows(), IminusT.cols());
 }
 
 // This function assumes that res has the correct size (see bug 614)
-template<typename MatrixType>
-void MatrixPowerAtomic<MatrixType>::compute2x2(MatrixType& res, RealScalar p) const
-{
+template <typename MatrixType>
+void MatrixPowerAtomic<MatrixType>::compute2x2(ResultType& res, RealScalar p) const {
   using std::abs;
   using std::pow;
-  
-  res.coeffRef(0,0) = pow(m_A.coeff(0,0), p);
-
-  for (Index i=1; i < m_A.cols(); ++i) {
-    res.coeffRef(i,i) = pow(m_A.coeff(i,i), p);
-    if (m_A.coeff(i-1,i-1) == m_A.coeff(i,i))
-      res.coeffRef(i-1,i) = p * pow(m_A.coeff(i,i), p-1);
-    else if (2*abs(m_A.coeff(i-1,i-1)) < abs(m_A.coeff(i,i)) || 2*abs(m_A.coeff(i,i)) < abs(m_A.coeff(i-1,i-1)))
-      res.coeffRef(i-1,i) = (res.coeff(i,i)-res.coeff(i-1,i-1)) / (m_A.coeff(i,i)-m_A.coeff(i-1,i-1));
+  res.coeffRef(0, 0) = pow(m_A.coeff(0, 0), p);
+
+  for (Index i = 1; i < m_A.cols(); ++i) {
+    res.coeffRef(i, i) = pow(m_A.coeff(i, i), p);
+    if (m_A.coeff(i - 1, i - 1) == m_A.coeff(i, i))
+      res.coeffRef(i - 1, i) = p * pow(m_A.coeff(i, i), p - 1);
+    else if (2 * abs(m_A.coeff(i - 1, i - 1)) < abs(m_A.coeff(i, i)) ||
+             2 * abs(m_A.coeff(i, i)) < abs(m_A.coeff(i - 1, i - 1)))
+      res.coeffRef(i - 1, i) =
+          (res.coeff(i, i) - res.coeff(i - 1, i - 1)) / (m_A.coeff(i, i) - m_A.coeff(i - 1, i - 1));
     else
-      res.coeffRef(i-1,i) = computeSuperDiag(m_A.coeff(i,i), m_A.coeff(i-1,i-1), p);
-    res.coeffRef(i-1,i) *= m_A.coeff(i-1,i);
+      res.coeffRef(i - 1, i) = computeSuperDiag(m_A.coeff(i, i), m_A.coeff(i - 1, i - 1), p);
+    res.coeffRef(i - 1, i) *= m_A.coeff(i - 1, i);
   }
 }
 
-template<typename MatrixType>
-void MatrixPowerAtomic<MatrixType>::computeBig(MatrixType& res) const
-{
+template <typename MatrixType>
+void MatrixPowerAtomic<MatrixType>::computeBig(ResultType& res) const {
+  using std::ldexp;
   const int digits = std::numeric_limits<RealScalar>::digits;
-  const RealScalar maxNormForPade = digits <=  24? 4.3386528e-1f:                           // sigle precision
-				    digits <=  53? 2.789358995219730e-1:                    // double precision
-				    digits <=  64? 2.4471944416607995472e-1L:               // extended precision
-				    digits <= 106? 1.1016843812851143391275867258512e-1L:   // double-double
-						   9.134603732914548552537150753385375e-2L; // quadruple precision
+  const RealScalar maxNormForPade =
+      RealScalar(digits <= 24    ? 4.3386528e-1L                              // single precision
+                 : digits <= 53  ? 2.789358995219730e-1L                      // double precision
+                 : digits <= 64  ? 2.4471944416607995472e-1L                  // extended precision
+                 : digits <= 106 ? 1.1016843812851143391275867258512e-1L      // double-double
+                                 : 9.134603732914548552537150753385375e-2L);  // quadruple precision
   MatrixType IminusT, sqrtT, T = m_A.template triangularView<Upper>();
   RealScalar normIminusT;
   int degree, degree2, numberOfSquareRoots = 0;
   bool hasExtraSquareRoot = false;
 
-  /* FIXME
-   * For singular T, norm(I - T) >= 1 but maxNormForPade < 1, leads to infinite
-   * loop.  We should move 0 eigenvalues to bottom right corner.  We need not
-   * worry about tiny values (e.g. 1e-300) because they will reach 1 if
-   * repetitively sqrt'ed.
-   *
-   * If the 0 eigenvalues are semisimple, they can form a 0 matrix at the
-   * bottom right corner.
-   *
-   * [ T  A ]^p   [ T^p  (T^-1 T^p A) ]
-   * [      ]   = [                   ]
-   * [ 0  0 ]     [  0         0      ]
-   */
-  for (Index i=0; i < m_A.cols(); ++i)
-    eigen_assert(m_A(i,i) != RealScalar(0));
+  for (Index i = 0; i < m_A.cols(); ++i) eigen_assert(m_A(i, i) != RealScalar(0));
 
   while (true) {
     IminusT = MatrixType::Identity(m_A.rows(), m_A.cols()) - T;
     normIminusT = IminusT.cwiseAbs().colwise().sum().maxCoeff();
     if (normIminusT < maxNormForPade) {
       degree = getPadeDegree(normIminusT);
-      degree2 = getPadeDegree(normIminusT/2);
-      if (degree - degree2 <= 1 || hasExtraSquareRoot)
-	break;
+      degree2 = getPadeDegree(normIminusT / 2);
+      if (degree - degree2 <= 1 || hasExtraSquareRoot) break;
       hasExtraSquareRoot = true;
     }
-    MatrixSquareRootTriangular<MatrixType>(T).compute(sqrtT);
+    matrix_sqrt_triangular(T, sqrtT);
     T = sqrtT.template triangularView<Upper>();
     ++numberOfSquareRoots;
   }
   computePade(degree, IminusT, res);
 
   for (; numberOfSquareRoots; --numberOfSquareRoots) {
-    compute2x2(res, std::ldexp(m_p, -numberOfSquareRoots));
+    compute2x2(res, ldexp(m_p, -numberOfSquareRoots));
     res = res.template triangularView<Upper>() * res;
   }
   compute2x2(res, m_p);
 }
-  
-template<typename MatrixType>
-inline int MatrixPowerAtomic<MatrixType>::getPadeDegree(float normIminusT)
-{
-  const float maxNormForPade[] = { 2.8064004e-1f /* degree = 3 */ , 4.3386528e-1f };
+
+template <typename MatrixType>
+inline int MatrixPowerAtomic<MatrixType>::getPadeDegree(float normIminusT) {
+  const float maxNormForPade[] = {2.8064004e-1f /* degree = 3 */, 4.3386528e-1f};
   int degree = 3;
   for (; degree <= 4; ++degree)
-    if (normIminusT <= maxNormForPade[degree - 3])
-      break;
+    if (normIminusT <= maxNormForPade[degree - 3]) break;
   return degree;
 }
 
-template<typename MatrixType>
-inline int MatrixPowerAtomic<MatrixType>::getPadeDegree(double normIminusT)
-{
-  const double maxNormForPade[] = { 1.884160592658218e-2 /* degree = 3 */ , 6.038881904059573e-2, 1.239917516308172e-1,
-      1.999045567181744e-1, 2.789358995219730e-1 };
+template <typename MatrixType>
+inline int MatrixPowerAtomic<MatrixType>::getPadeDegree(double normIminusT) {
+  const double maxNormForPade[] = {1.884160592658218e-2 /* degree = 3 */, 6.038881904059573e-2, 1.239917516308172e-1,
+                                   1.999045567181744e-1, 2.789358995219730e-1};
   int degree = 3;
   for (; degree <= 7; ++degree)
-    if (normIminusT <= maxNormForPade[degree - 3])
-      break;
+    if (normIminusT <= maxNormForPade[degree - 3]) break;
   return degree;
 }
 
-template<typename MatrixType>
-inline int MatrixPowerAtomic<MatrixType>::getPadeDegree(long double normIminusT)
-{
-#if   LDBL_MANT_DIG == 53
+template <typename MatrixType>
+inline int MatrixPowerAtomic<MatrixType>::getPadeDegree(long double normIminusT) {
+#if LDBL_MANT_DIG == 53
   const int maxPadeDegree = 7;
-  const double maxNormForPade[] = { 1.884160592658218e-2L /* degree = 3 */ , 6.038881904059573e-2L, 1.239917516308172e-1L,
-      1.999045567181744e-1L, 2.789358995219730e-1L };
+  const double maxNormForPade[] = {1.884160592658218e-2L /* degree = 3 */, 6.038881904059573e-2L, 1.239917516308172e-1L,
+                                   1.999045567181744e-1L, 2.789358995219730e-1L};
 #elif LDBL_MANT_DIG <= 64
   const int maxPadeDegree = 8;
-  const double maxNormForPade[] = { 6.3854693117491799460e-3L /* degree = 3 */ , 2.6394893435456973676e-2L,
-      6.4216043030404063729e-2L, 1.1701165502926694307e-1L, 1.7904284231268670284e-1L, 2.4471944416607995472e-1L };
+  const long double maxNormForPade[] = {6.3854693117491799460e-3L /* degree = 3 */,
+                                        2.6394893435456973676e-2L,
+                                        6.4216043030404063729e-2L,
+                                        1.1701165502926694307e-1L,
+                                        1.7904284231268670284e-1L,
+                                        2.4471944416607995472e-1L};
 #elif LDBL_MANT_DIG <= 106
   const int maxPadeDegree = 10;
-  const double maxNormForPade[] = { 1.0007161601787493236741409687186e-4L /* degree = 3 */ ,
-      1.0007161601787493236741409687186e-3L, 4.7069769360887572939882574746264e-3L, 1.3220386624169159689406653101695e-2L,
-      2.8063482381631737920612944054906e-2L, 4.9625993951953473052385361085058e-2L, 7.7367040706027886224557538328171e-2L,
-      1.1016843812851143391275867258512e-1L };
+  const double maxNormForPade[] = {1.0007161601787493236741409687186e-4L /* degree = 3 */,
+                                   1.0007161601787493236741409687186e-3L,
+                                   4.7069769360887572939882574746264e-3L,
+                                   1.3220386624169159689406653101695e-2L,
+                                   2.8063482381631737920612944054906e-2L,
+                                   4.9625993951953473052385361085058e-2L,
+                                   7.7367040706027886224557538328171e-2L,
+                                   1.1016843812851143391275867258512e-1L};
 #else
   const int maxPadeDegree = 10;
-  const double maxNormForPade[] = { 5.524506147036624377378713555116378e-5L /* degree = 3 */ ,
-      6.640600568157479679823602193345995e-4L, 3.227716520106894279249709728084626e-3L,
-      9.619593944683432960546978734646284e-3L, 2.134595382433742403911124458161147e-2L,
-      3.908166513900489428442993794761185e-2L, 6.266780814639442865832535460550138e-2L,
-      9.134603732914548552537150753385375e-2L };
+  const double maxNormForPade[] = {5.524506147036624377378713555116378e-5L /* degree = 3 */,
+                                   6.640600568157479679823602193345995e-4L,
+                                   3.227716520106894279249709728084626e-3L,
+                                   9.619593944683432960546978734646284e-3L,
+                                   2.134595382433742403911124458161147e-2L,
+                                   3.908166513900489428442993794761185e-2L,
+                                   6.266780814639442865832535460550138e-2L,
+                                   9.134603732914548552537150753385375e-2L};
 #endif
   int degree = 3;
   for (; degree <= maxPadeDegree; ++degree)
-    if (normIminusT <= maxNormForPade[degree - 3])
-      break;
+    if (normIminusT <= static_cast<long double>(maxNormForPade[degree - 3])) break;
   return degree;
 }
 
-template<typename MatrixType>
-inline typename MatrixPowerAtomic<MatrixType>::ComplexScalar
-MatrixPowerAtomic<MatrixType>::computeSuperDiag(const ComplexScalar& curr, const ComplexScalar& prev, RealScalar p)
-{
-  ComplexScalar logCurr = std::log(curr);
-  ComplexScalar logPrev = std::log(prev);
-  int unwindingNumber = std::ceil((numext::imag(logCurr - logPrev) - M_PI) / (2*M_PI));
-  ComplexScalar w = numext::atanh2(curr - prev, curr + prev) + ComplexScalar(0, M_PI*unwindingNumber);
-  return RealScalar(2) * std::exp(RealScalar(0.5) * p * (logCurr + logPrev)) * std::sinh(p * w) / (curr - prev);
+template <typename MatrixType>
+inline typename MatrixPowerAtomic<MatrixType>::ComplexScalar MatrixPowerAtomic<MatrixType>::computeSuperDiag(
+    const ComplexScalar& curr, const ComplexScalar& prev, RealScalar p) {
+  using std::ceil;
+  using std::exp;
+  using std::log;
+  using std::sinh;
+
+  ComplexScalar logCurr = log(curr);
+  ComplexScalar logPrev = log(prev);
+  RealScalar unwindingNumber =
+      ceil((numext::imag(logCurr - logPrev) - RealScalar(EIGEN_PI)) / RealScalar(2 * EIGEN_PI));
+  ComplexScalar w =
+      numext::log1p((curr - prev) / prev) / RealScalar(2) + ComplexScalar(0, RealScalar(EIGEN_PI) * unwindingNumber);
+  return RealScalar(2) * exp(RealScalar(0.5) * p * (logCurr + logPrev)) * sinh(p * w) / (curr - prev);
 }
 
-template<typename MatrixType>
-inline typename MatrixPowerAtomic<MatrixType>::RealScalar
-MatrixPowerAtomic<MatrixType>::computeSuperDiag(RealScalar curr, RealScalar prev, RealScalar p)
-{
-  RealScalar w = numext::atanh2(curr - prev, curr + prev);
-  return 2 * std::exp(p * (std::log(curr) + std::log(prev)) / 2) * std::sinh(p * w) / (curr - prev);
+template <typename MatrixType>
+inline typename MatrixPowerAtomic<MatrixType>::RealScalar MatrixPowerAtomic<MatrixType>::computeSuperDiag(
+    RealScalar curr, RealScalar prev, RealScalar p) {
+  using std::exp;
+  using std::log;
+  using std::sinh;
+
+  RealScalar w = numext::log1p((curr - prev) / prev) / RealScalar(2);
+  return 2 * exp(p * (log(curr) + log(prev)) / 2) * sinh(p * w) / (curr - prev);
 }
 
 /**
@@ -270,173 +336,234 @@ MatrixPowerAtomic<MatrixType>::computeSuperDiag(RealScalar curr, RealScalar prev
  * \include MatrixPower_optimal.cpp
  * Output: \verbinclude MatrixPower_optimal.out
  */
-template<typename MatrixType>
-class MatrixPower
-{
-  private:
-    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime
-    };
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::RealScalar RealScalar;
-    typedef typename MatrixType::Index Index;
-
-  public:
-    /**
-     * \brief Constructor.
-     *
-     * \param[in] A  the base of the matrix power.
-     *
-     * The class stores a reference to A, so it should not be changed
-     * (or destroyed) before evaluation.
-     */
-    explicit MatrixPower(const MatrixType& A) : m_A(A), m_conditionNumber(0)
-    { eigen_assert(A.rows() == A.cols()); }
-
-    /**
-     * \brief Returns the matrix power.
-     *
-     * \param[in] p  exponent, a real scalar.
-     * \return The expression \f$ A^p \f$, where A is specified in the
-     * constructor.
-     */
-    const MatrixPowerRetval<MatrixType> operator()(RealScalar p)
-    { return MatrixPowerRetval<MatrixType>(*this, p); }
-
-    /**
-     * \brief Compute the matrix power.
-     *
-     * \param[in]  p    exponent, a real scalar.
-     * \param[out] res  \f$ A^p \f$ where A is specified in the
-     * constructor.
-     */
-    template<typename ResultType>
-    void compute(ResultType& res, RealScalar p);
-    
-    Index rows() const { return m_A.rows(); }
-    Index cols() const { return m_A.cols(); }
-
-  private:
-    typedef std::complex<RealScalar> ComplexScalar;
-    typedef Matrix<ComplexScalar, RowsAtCompileTime, ColsAtCompileTime, MatrixType::Options,
-              MaxRowsAtCompileTime, MaxColsAtCompileTime> ComplexMatrix;
-
-    typename MatrixType::Nested m_A;
-    MatrixType m_tmp;
-    ComplexMatrix m_T, m_U, m_fT;
-    RealScalar m_conditionNumber;
-
-    RealScalar modfAndInit(RealScalar, RealScalar*);
-
-    template<typename ResultType>
-    void computeIntPower(ResultType&, RealScalar);
-
-    template<typename ResultType>
-    void computeFracPower(ResultType&, RealScalar);
-
-    template<int Rows, int Cols, int Options, int MaxRows, int MaxCols>
-    static void revertSchur(
-        Matrix<ComplexScalar, Rows, Cols, Options, MaxRows, MaxCols>& res,
-        const ComplexMatrix& T,
-        const ComplexMatrix& U);
-
-    template<int Rows, int Cols, int Options, int MaxRows, int MaxCols>
-    static void revertSchur(
-        Matrix<RealScalar, Rows, Cols, Options, MaxRows, MaxCols>& res,
-        const ComplexMatrix& T,
-        const ComplexMatrix& U);
+template <typename MatrixType>
+class MatrixPower : internal::noncopyable {
+ private:
+  typedef typename MatrixType::Scalar Scalar;
+  typedef typename MatrixType::RealScalar RealScalar;
+
+ public:
+  /**
+   * \brief Constructor.
+   *
+   * \param[in] A  the base of the matrix power.
+   *
+   * The class stores a reference to A, so it should not be changed
+   * (or destroyed) before evaluation.
+   */
+  explicit MatrixPower(const MatrixType& A) : m_A(A), m_conditionNumber(0), m_rank(A.cols()), m_nulls(0) {
+    eigen_assert(A.rows() == A.cols());
+  }
+
+  /**
+   * \brief Returns the matrix power.
+   *
+   * \param[in] p  exponent, a real scalar.
+   * \return The expression \f$ A^p \f$, where A is specified in the
+   * constructor.
+   */
+  const MatrixPowerParenthesesReturnValue<MatrixType> operator()(RealScalar p) {
+    return MatrixPowerParenthesesReturnValue<MatrixType>(*this, p);
+  }
+
+  /**
+   * \brief Compute the matrix power.
+   *
+   * \param[in]  p    exponent, a real scalar.
+   * \param[out] res  \f$ A^p \f$ where A is specified in the
+   * constructor.
+   */
+  template <typename ResultType>
+  void compute(ResultType& res, RealScalar p);
+
+  Index rows() const { return m_A.rows(); }
+  Index cols() const { return m_A.cols(); }
+
+ private:
+  typedef internal::make_complex_t<Scalar> ComplexScalar;
+  typedef Matrix<ComplexScalar, Dynamic, Dynamic, 0, MatrixType::RowsAtCompileTime, MatrixType::ColsAtCompileTime>
+      ComplexMatrix;
+
+  /** \brief Reference to the base of matrix power. */
+  typename MatrixType::Nested m_A;
+
+  /** \brief Temporary storage. */
+  MatrixType m_tmp;
+
+  /** \brief Store the result of Schur decomposition. */
+  ComplexMatrix m_T, m_U;
+
+  /** \brief Store fractional power of m_T. */
+  ComplexMatrix m_fT;
+
+  /**
+   * \brief Condition number of m_A.
+   *
+   * It is initialized as 0 to avoid performing unnecessary Schur
+   * decomposition, which is the bottleneck.
+   */
+  RealScalar m_conditionNumber;
+
+  /** \brief Rank of m_A. */
+  Index m_rank;
+
+  /** \brief Rank deficiency of m_A. */
+  Index m_nulls;
+
+  /**
+   * \brief Split p into integral part and fractional part.
+   *
+   * \param[in]  p        The exponent.
+   * \param[out] p        The fractional part ranging in \f$ (-1, 1) \f$.
+   * \param[out] intpart  The integral part.
+   *
+   * Only if the fractional part is nonzero, it calls initialize().
+   */
+  void split(RealScalar& p, RealScalar& intpart);
+
+  /** \brief Perform Schur decomposition for fractional power. */
+  void initialize();
+
+  template <typename ResultType>
+  void computeIntPower(ResultType& res, RealScalar p);
+
+  template <typename ResultType>
+  void computeFracPower(ResultType& res, RealScalar p);
+
+  template <int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+  static void revertSchur(Matrix<ComplexScalar, Rows, Cols, Options, MaxRows, MaxCols>& res, const ComplexMatrix& T,
+                          const ComplexMatrix& U);
+
+  template <int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+  static void revertSchur(Matrix<RealScalar, Rows, Cols, Options, MaxRows, MaxCols>& res, const ComplexMatrix& T,
+                          const ComplexMatrix& U);
 };
 
-template<typename MatrixType>
-template<typename ResultType>
-void MatrixPower<MatrixType>::compute(ResultType& res, RealScalar p)
-{
+template <typename MatrixType>
+template <typename ResultType>
+void MatrixPower<MatrixType>::compute(ResultType& res, RealScalar p) {
+  using std::pow;
   switch (cols()) {
     case 0:
       break;
     case 1:
-      res(0,0) = std::pow(m_A.coeff(0,0), p);
+      res(0, 0) = pow(m_A.coeff(0, 0), p);
       break;
     default:
-      RealScalar intpart, x = modfAndInit(p, &intpart);
+      RealScalar intpart;
+      split(p, intpart);
+
+      res = MatrixType::Identity(rows(), cols());
       computeIntPower(res, intpart);
-      computeFracPower(res, x);
+      if (p) computeFracPower(res, p);
   }
 }
 
-template<typename MatrixType>
-typename MatrixPower<MatrixType>::RealScalar
-MatrixPower<MatrixType>::modfAndInit(RealScalar x, RealScalar* intpart)
-{
-  typedef Array<RealScalar, RowsAtCompileTime, 1, ColMajor, MaxRowsAtCompileTime> RealArray;
-
-  *intpart = std::floor(x);
-  RealScalar res = x - *intpart;
-
-  if (!m_conditionNumber && res) {
-    const ComplexSchur<MatrixType> schurOfA(m_A);
-    m_T = schurOfA.matrixT();
-    m_U = schurOfA.matrixU();
-    
-    const RealArray absTdiag = m_T.diagonal().array().abs();
-    m_conditionNumber = absTdiag.maxCoeff() / absTdiag.minCoeff();
+template <typename MatrixType>
+void MatrixPower<MatrixType>::split(RealScalar& p, RealScalar& intpart) {
+  using std::floor;
+  using std::pow;
+
+  intpart = floor(p);
+  p -= intpart;
+
+  // Perform Schur decomposition if it is not yet performed and the power is
+  // not an integer.
+  if (!m_conditionNumber && p) initialize();
+
+  // Choose the more stable of intpart = floor(p) and intpart = ceil(p).
+  if (p > RealScalar(0.5) && p > (1 - p) * pow(m_conditionNumber, p)) {
+    --p;
+    ++intpart;
   }
+}
 
-  if (res>RealScalar(0.5) && res>(1-res)*std::pow(m_conditionNumber, res)) {
-    --res;
-    ++*intpart;
+template <typename MatrixType>
+void MatrixPower<MatrixType>::initialize() {
+  const ComplexSchur<MatrixType> schurOfA(m_A);
+  JacobiRotation<ComplexScalar> rot;
+  ComplexScalar eigenvalue;
+
+  m_fT.resizeLike(m_A);
+  m_T = schurOfA.matrixT();
+  m_U = schurOfA.matrixU();
+  m_conditionNumber = m_T.diagonal().array().abs().maxCoeff() / m_T.diagonal().array().abs().minCoeff();
+
+  // Move zero eigenvalues to the bottom right corner.
+  for (Index i = cols() - 1; i >= 0; --i) {
+    if (m_rank <= 2) return;
+    if (m_T.coeff(i, i) == RealScalar(0)) {
+      for (Index j = i + 1; j < m_rank; ++j) {
+        eigenvalue = m_T.coeff(j, j);
+        rot.makeGivens(m_T.coeff(j - 1, j), eigenvalue);
+        m_T.applyOnTheRight(j - 1, j, rot);
+        m_T.applyOnTheLeft(j - 1, j, rot.adjoint());
+        m_T.coeffRef(j - 1, j - 1) = eigenvalue;
+        m_T.coeffRef(j, j) = RealScalar(0);
+        m_U.applyOnTheRight(j - 1, j, rot);
+      }
+      --m_rank;
+    }
+  }
+
+  m_nulls = rows() - m_rank;
+  if (m_nulls) {
+    eigen_assert(m_T.bottomRightCorner(m_nulls, m_nulls).isZero() &&
+                 "Base of matrix power should be invertible or with a semisimple zero eigenvalue.");
+    m_fT.bottomRows(m_nulls).fill(RealScalar(0));
   }
-  return res;
 }
 
-template<typename MatrixType>
-template<typename ResultType>
-void MatrixPower<MatrixType>::computeIntPower(ResultType& res, RealScalar p)
-{
-  RealScalar pp = std::abs(p);
+template <typename MatrixType>
+template <typename ResultType>
+void MatrixPower<MatrixType>::computeIntPower(ResultType& res, RealScalar p) {
+  using std::abs;
+  using std::fmod;
+  RealScalar pp = abs(p);
 
-  if (p<0)  m_tmp = m_A.inverse();
-  else      m_tmp = m_A;
+  if (p < 0)
+    m_tmp = m_A.inverse();
+  else
+    m_tmp = m_A;
 
-  res = MatrixType::Identity(rows(), cols());
-  while (pp >= 1) {
-    if (std::fmod(pp, 2) >= 1)
-      res = m_tmp * res;
-    m_tmp *= m_tmp;
+  while (true) {
+    if (fmod(pp, 2) >= 1) res = m_tmp * res;
     pp /= 2;
+    if (pp < 1) break;
+    m_tmp *= m_tmp;
   }
 }
 
-template<typename MatrixType>
-template<typename ResultType>
-void MatrixPower<MatrixType>::computeFracPower(ResultType& res, RealScalar p)
-{
-  if (p) {
-    eigen_assert(m_conditionNumber);
-    MatrixPowerAtomic<ComplexMatrix>(m_T, p).compute(m_fT);
-    revertSchur(m_tmp, m_fT, m_U);
-    res = m_tmp * res;
+template <typename MatrixType>
+template <typename ResultType>
+void MatrixPower<MatrixType>::computeFracPower(ResultType& res, RealScalar p) {
+  Block<ComplexMatrix, Dynamic, Dynamic> blockTp(m_fT, 0, 0, m_rank, m_rank);
+  eigen_assert(m_conditionNumber);
+  eigen_assert(m_rank + m_nulls == rows());
+
+  MatrixPowerAtomic<ComplexMatrix>(m_T.topLeftCorner(m_rank, m_rank), p).compute(blockTp);
+  if (m_nulls) {
+    m_fT.topRightCorner(m_rank, m_nulls) = m_T.topLeftCorner(m_rank, m_rank)
+                                               .template triangularView<Upper>()
+                                               .solve(blockTp * m_T.topRightCorner(m_rank, m_nulls));
   }
+  revertSchur(m_tmp, m_fT, m_U);
+  res = m_tmp * res;
+}
+
+template <typename MatrixType>
+template <int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+inline void MatrixPower<MatrixType>::revertSchur(Matrix<ComplexScalar, Rows, Cols, Options, MaxRows, MaxCols>& res,
+                                                 const ComplexMatrix& T, const ComplexMatrix& U) {
+  res.noalias() = U * (T.template triangularView<Upper>() * U.adjoint());
 }
 
-template<typename MatrixType>
-template<int Rows, int Cols, int Options, int MaxRows, int MaxCols>
-inline void MatrixPower<MatrixType>::revertSchur(
-    Matrix<ComplexScalar, Rows, Cols, Options, MaxRows, MaxCols>& res,
-    const ComplexMatrix& T,
-    const ComplexMatrix& U)
-{ res.noalias() = U * (T.template triangularView<Upper>() * U.adjoint()); }
-
-template<typename MatrixType>
-template<int Rows, int Cols, int Options, int MaxRows, int MaxCols>
-inline void MatrixPower<MatrixType>::revertSchur(
-    Matrix<RealScalar, Rows, Cols, Options, MaxRows, MaxCols>& res,
-    const ComplexMatrix& T,
-    const ComplexMatrix& U)
-{ res.noalias() = (U * (T.template triangularView<Upper>() * U.adjoint())).real(); }
+template <typename MatrixType>
+template <int Rows, int Cols, int Options, int MaxRows, int MaxCols>
+inline void MatrixPower<MatrixType>::revertSchur(Matrix<RealScalar, Rows, Cols, Options, MaxRows, MaxCols>& res,
+                                                 const ComplexMatrix& T, const ComplexMatrix& U) {
+  res.noalias() = (U * (T.template triangularView<Upper>() * U.adjoint())).real();
+}
 
 /**
  * \ingroup MatrixFunctions_Module
@@ -451,58 +578,117 @@ inline void MatrixPower<MatrixType>::revertSchur(
  * MatrixBase::pow() and related functions and most of the
  * time this is the only way it is used.
  */
-template<typename Derived>
-class MatrixPowerReturnValue : public ReturnByValue< MatrixPowerReturnValue<Derived> >
-{
-  public:
-    typedef typename Derived::PlainObject PlainObject;
-    typedef typename Derived::RealScalar RealScalar;
-    typedef typename Derived::Index Index;
-
-    /**
-     * \brief Constructor.
-     *
-     * \param[in] A  %Matrix (expression), the base of the matrix power.
-     * \param[in] p  scalar, the exponent of the matrix power.
-     */
-    MatrixPowerReturnValue(const Derived& A, RealScalar p) : m_A(A), m_p(p)
-    { }
-
-    /**
-     * \brief Compute the matrix power.
-     *
-     * \param[out] result  \f$ A^p \f$ where \p A and \p p are as in the
-     * constructor.
-     */
-    template<typename ResultType>
-    inline void evalTo(ResultType& res) const
-    { MatrixPower<PlainObject>(m_A.eval()).compute(res, m_p); }
-
-    Index rows() const { return m_A.rows(); }
-    Index cols() const { return m_A.cols(); }
-
-  private:
-    const Derived& m_A;
-    const RealScalar m_p;
-    MatrixPowerReturnValue& operator=(const MatrixPowerReturnValue&);
+template <typename Derived>
+class MatrixPowerReturnValue : public ReturnByValue<MatrixPowerReturnValue<Derived> > {
+ public:
+  typedef typename Derived::PlainObject PlainObject;
+  typedef typename Derived::RealScalar RealScalar;
+
+  /**
+   * \brief Constructor.
+   *
+   * \param[in] A  %Matrix (expression), the base of the matrix power.
+   * \param[in] p  real scalar, the exponent of the matrix power.
+   */
+  MatrixPowerReturnValue(const Derived& A, RealScalar p) : m_A(A), m_p(p) {}
+
+  /**
+   * \brief Compute the matrix power.
+   *
+   * \param[out] result  \f$ A^p \f$ where \p A and \p p are as in the
+   * constructor.
+   */
+  template <typename ResultType>
+  inline void evalTo(ResultType& result) const {
+    MatrixPower<PlainObject>(m_A.eval()).compute(result, m_p);
+  }
+
+  Index rows() const { return m_A.rows(); }
+  Index cols() const { return m_A.cols(); }
+
+ private:
+  const Derived& m_A;
+  const RealScalar m_p;
+};
+
+/**
+ * \ingroup MatrixFunctions_Module
+ *
+ * \brief Proxy for the matrix power of some matrix (expression).
+ *
+ * \tparam Derived  type of the base, a matrix (expression).
+ *
+ * This class holds the arguments to the matrix power until it is
+ * assigned or evaluated for some other reason (so the argument
+ * should not be changed in the meantime). It is the return type of
+ * MatrixBase::pow() and related functions and most of the
+ * time this is the only way it is used.
+ */
+template <typename Derived>
+class MatrixComplexPowerReturnValue : public ReturnByValue<MatrixComplexPowerReturnValue<Derived> > {
+ public:
+  typedef typename Derived::PlainObject PlainObject;
+  typedef internal::make_complex_t<typename Derived::Scalar> ComplexScalar;
+
+  /**
+   * \brief Constructor.
+   *
+   * \param[in] A  %Matrix (expression), the base of the matrix power.
+   * \param[in] p  complex scalar, the exponent of the matrix power.
+   */
+  MatrixComplexPowerReturnValue(const Derived& A, const ComplexScalar& p) : m_A(A), m_p(p) {}
+
+  /**
+   * \brief Compute the matrix power.
+   *
+   * Because \p p is complex, \f$ A^p \f$ is simply evaluated as \f$
+   * \exp(p \log(A)) \f$.
+   *
+   * \param[out] result  \f$ A^p \f$ where \p A and \p p are as in the
+   * constructor.
+   */
+  template <typename ResultType>
+  inline void evalTo(ResultType& result) const {
+    result = (m_p * m_A.log()).exp();
+  }
+
+  Index rows() const { return m_A.rows(); }
+  Index cols() const { return m_A.cols(); }
+
+ private:
+  const Derived& m_A;
+  const ComplexScalar m_p;
 };
 
 namespace internal {
 
-template<typename MatrixPowerType>
-struct traits< MatrixPowerRetval<MatrixPowerType> >
-{ typedef typename MatrixPowerType::PlainObject ReturnType; };
+template <typename MatrixPowerType>
+struct traits<MatrixPowerParenthesesReturnValue<MatrixPowerType> > {
+  typedef typename MatrixPowerType::PlainObject ReturnType;
+};
+
+template <typename Derived>
+struct traits<MatrixPowerReturnValue<Derived> > {
+  typedef typename Derived::PlainObject ReturnType;
+};
+
+template <typename Derived>
+struct traits<MatrixComplexPowerReturnValue<Derived> > {
+  typedef typename Derived::PlainObject ReturnType;
+};
 
-template<typename Derived>
-struct traits< MatrixPowerReturnValue<Derived> >
-{ typedef typename Derived::PlainObject ReturnType; };
+}  // namespace internal
 
+template <typename Derived>
+const MatrixPowerReturnValue<Derived> MatrixBase<Derived>::pow(const RealScalar& p) const {
+  return MatrixPowerReturnValue<Derived>(derived(), p);
 }
 
-template<typename Derived>
-const MatrixPowerReturnValue<Derived> MatrixBase<Derived>::pow(const RealScalar& p) const
-{ return MatrixPowerReturnValue<Derived>(derived(), p); }
+template <typename Derived>
+const MatrixComplexPowerReturnValue<Derived> MatrixBase<Derived>::pow(const internal::make_complex_t<Scalar>& p) const {
+  return MatrixComplexPowerReturnValue<Derived>(derived(), p);
+}
 
-} // namespace Eigen
+}  // namespace Eigen
 
-#endif // EIGEN_MATRIX_POWER
+#endif  // EIGEN_MATRIX_POWER
diff --git a/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h b/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
index b48ea9d4..b11eb741 100644
--- a/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
+++ b/inst/include/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2011 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2011, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,457 +10,337 @@
 #ifndef EIGEN_MATRIX_SQUARE_ROOT
 #define EIGEN_MATRIX_SQUARE_ROOT
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-/** \ingroup MatrixFunctions_Module
-  * \brief Class for computing matrix square roots of upper quasi-triangular matrices.
-  * \tparam  MatrixType  type of the argument of the matrix square root,
-  *                      expected to be an instantiation of the Matrix class template.
-  *
-  * This class computes the square root of the upper quasi-triangular
-  * matrix stored in the upper Hessenberg part of the matrix passed to
-  * the constructor.
-  *
-  * \sa MatrixSquareRoot, MatrixSquareRootTriangular
-  */
-template <typename MatrixType>
-class MatrixSquareRootQuasiTriangular
-{
-  public:
-
-    /** \brief Constructor. 
-      *
-      * \param[in]  A  upper quasi-triangular matrix whose square root 
-      *                is to be computed.
-      *
-      * The class stores a reference to \p A, so it should not be
-      * changed (or destroyed) before compute() is called.
-      */
-    MatrixSquareRootQuasiTriangular(const MatrixType& A) 
-      : m_A(A) 
-    {
-      eigen_assert(A.rows() == A.cols());
-    }
-    
-    /** \brief Compute the matrix square root
-      *
-      * \param[out] result  square root of \p A, as specified in the constructor.
-      *
-      * Only the upper Hessenberg part of \p result is updated, the
-      * rest is not touched.  See MatrixBase::sqrt() for details on
-      * how this computation is implemented.
-      */
-    template <typename ResultType> void compute(ResultType &result);    
-    
-  private:
-    typedef typename MatrixType::Index Index;
-    typedef typename MatrixType::Scalar Scalar;
-    
-    void computeDiagonalPartOfSqrt(MatrixType& sqrtT, const MatrixType& T);
-    void computeOffDiagonalPartOfSqrt(MatrixType& sqrtT, const MatrixType& T);
-    void compute2x2diagonalBlock(MatrixType& sqrtT, const MatrixType& T, typename MatrixType::Index i);
-    void compute1x1offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
-				  typename MatrixType::Index i, typename MatrixType::Index j);
-    void compute1x2offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
-				  typename MatrixType::Index i, typename MatrixType::Index j);
-    void compute2x1offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
-				  typename MatrixType::Index i, typename MatrixType::Index j);
-    void compute2x2offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
-				  typename MatrixType::Index i, typename MatrixType::Index j);
-  
-    template <typename SmallMatrixType>
-    static void solveAuxiliaryEquation(SmallMatrixType& X, const SmallMatrixType& A, 
-				     const SmallMatrixType& B, const SmallMatrixType& C);
-  
-    const MatrixType& m_A;
-};
-
-template <typename MatrixType>
-template <typename ResultType> 
-void MatrixSquareRootQuasiTriangular<MatrixType>::compute(ResultType &result)
-{
-  result.resize(m_A.rows(), m_A.cols());
-  computeDiagonalPartOfSqrt(result, m_A);
-  computeOffDiagonalPartOfSqrt(result, m_A);
-}
+namespace Eigen {
 
-// pre:  T is quasi-upper-triangular and sqrtT is a zero matrix of the same size
-// post: the diagonal blocks of sqrtT are the square roots of the diagonal blocks of T
-template <typename MatrixType>
-void MatrixSquareRootQuasiTriangular<MatrixType>::computeDiagonalPartOfSqrt(MatrixType& sqrtT, 
-									  const MatrixType& T)
-{
-  using std::sqrt;
-  const Index size = m_A.rows();
-  for (Index i = 0; i < size; i++) {
-    if (i == size - 1 || T.coeff(i+1, i) == 0) {
-      eigen_assert(T(i,i) >= 0);
-      sqrtT.coeffRef(i,i) = sqrt(T.coeff(i,i));
-    }
-    else {
-      compute2x2diagonalBlock(sqrtT, T, i);
-      ++i;
-    }
-  }
-}
-
-// pre:  T is quasi-upper-triangular and diagonal blocks of sqrtT are square root of diagonal blocks of T.
-// post: sqrtT is the square root of T.
-template <typename MatrixType>
-void MatrixSquareRootQuasiTriangular<MatrixType>::computeOffDiagonalPartOfSqrt(MatrixType& sqrtT, 
-									     const MatrixType& T)
-{
-  const Index size = m_A.rows();
-  for (Index j = 1; j < size; j++) {
-      if (T.coeff(j, j-1) != 0)  // if T(j-1:j, j-1:j) is a 2-by-2 block
-	continue;
-    for (Index i = j-1; i >= 0; i--) {
-      if (i > 0 && T.coeff(i, i-1) != 0)  // if T(i-1:i, i-1:i) is a 2-by-2 block
-	continue;
-      bool iBlockIs2x2 = (i < size - 1) && (T.coeff(i+1, i) != 0);
-      bool jBlockIs2x2 = (j < size - 1) && (T.coeff(j+1, j) != 0);
-      if (iBlockIs2x2 && jBlockIs2x2) 
-	compute2x2offDiagonalBlock(sqrtT, T, i, j);
-      else if (iBlockIs2x2 && !jBlockIs2x2) 
-	compute2x1offDiagonalBlock(sqrtT, T, i, j);
-      else if (!iBlockIs2x2 && jBlockIs2x2) 
-	compute1x2offDiagonalBlock(sqrtT, T, i, j);
-      else if (!iBlockIs2x2 && !jBlockIs2x2) 
-	compute1x1offDiagonalBlock(sqrtT, T, i, j);
-    }
-  }
-}
+namespace internal {
 
 // pre:  T.block(i,i,2,2) has complex conjugate eigenvalues
 // post: sqrtT.block(i,i,2,2) is square root of T.block(i,i,2,2)
-template <typename MatrixType>
-void MatrixSquareRootQuasiTriangular<MatrixType>
-     ::compute2x2diagonalBlock(MatrixType& sqrtT, const MatrixType& T, typename MatrixType::Index i)
-{
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_2x2_diagonal_block(const MatrixType& T, Index i, ResultType& sqrtT) {
   // TODO: This case (2-by-2 blocks with complex conjugate eigenvalues) is probably hidden somewhere
   //       in EigenSolver. If we expose it, we could call it directly from here.
-  Matrix<Scalar,2,2> block = T.template block<2,2>(i,i);
-  EigenSolver<Matrix<Scalar,2,2> > es(block);
-  sqrtT.template block<2,2>(i,i)
-    = (es.eigenvectors() * es.eigenvalues().cwiseSqrt().asDiagonal() * es.eigenvectors().inverse()).real();
+  typedef typename traits<MatrixType>::Scalar Scalar;
+  Matrix<Scalar, 2, 2> block = T.template block<2, 2>(i, i);
+  EigenSolver<Matrix<Scalar, 2, 2> > es(block);
+  sqrtT.template block<2, 2>(i, i) =
+      (es.eigenvectors() * es.eigenvalues().cwiseSqrt().asDiagonal() * es.eigenvectors().inverse()).real();
 }
 
 // pre:  block structure of T is such that (i,j) is a 1x1 block,
 //       all blocks of sqrtT to left of and below (i,j) are correct
 // post: sqrtT(i,j) has the correct value
-template <typename MatrixType>
-void MatrixSquareRootQuasiTriangular<MatrixType>
-     ::compute1x1offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
-				  typename MatrixType::Index i, typename MatrixType::Index j)
-{
-  Scalar tmp = (sqrtT.row(i).segment(i+1,j-i-1) * sqrtT.col(j).segment(i+1,j-i-1)).value();
-  sqrtT.coeffRef(i,j) = (T.coeff(i,j) - tmp) / (sqrtT.coeff(i,i) + sqrtT.coeff(j,j));
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT) {
+  typedef typename traits<MatrixType>::Scalar Scalar;
+  Scalar tmp = (sqrtT.row(i).segment(i + 1, j - i - 1) * sqrtT.col(j).segment(i + 1, j - i - 1)).value();
+  sqrtT.coeffRef(i, j) = (T.coeff(i, j) - tmp) / (sqrtT.coeff(i, i) + sqrtT.coeff(j, j));
 }
 
 // similar to compute1x1offDiagonalBlock()
-template <typename MatrixType>
-void MatrixSquareRootQuasiTriangular<MatrixType>
-     ::compute1x2offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
-				  typename MatrixType::Index i, typename MatrixType::Index j)
-{
-  Matrix<Scalar,1,2> rhs = T.template block<1,2>(i,j);
-  if (j-i > 1)
-    rhs -= sqrtT.block(i, i+1, 1, j-i-1) * sqrtT.block(i+1, j, j-i-1, 2);
-  Matrix<Scalar,2,2> A = sqrtT.coeff(i,i) * Matrix<Scalar,2,2>::Identity();
-  A += sqrtT.template block<2,2>(j,j).transpose();
-  sqrtT.template block<1,2>(i,j).transpose() = A.fullPivLu().solve(rhs.transpose());
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT) {
+  typedef typename traits<MatrixType>::Scalar Scalar;
+  Matrix<Scalar, 1, 2> rhs = T.template block<1, 2>(i, j);
+  if (j - i > 1) rhs -= sqrtT.block(i, i + 1, 1, j - i - 1) * sqrtT.block(i + 1, j, j - i - 1, 2);
+  Matrix<Scalar, 2, 2> A = sqrtT.coeff(i, i) * Matrix<Scalar, 2, 2>::Identity();
+  A += sqrtT.template block<2, 2>(j, j).transpose();
+  sqrtT.template block<1, 2>(i, j).transpose() = A.fullPivLu().solve(rhs.transpose());
 }
 
 // similar to compute1x1offDiagonalBlock()
-template <typename MatrixType>
-void MatrixSquareRootQuasiTriangular<MatrixType>
-     ::compute2x1offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
-				  typename MatrixType::Index i, typename MatrixType::Index j)
-{
-  Matrix<Scalar,2,1> rhs = T.template block<2,1>(i,j);
-  if (j-i > 2)
-    rhs -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 1);
-  Matrix<Scalar,2,2> A = sqrtT.coeff(j,j) * Matrix<Scalar,2,2>::Identity();
-  A += sqrtT.template block<2,2>(i,i);
-  sqrtT.template block<2,1>(i,j) = A.fullPivLu().solve(rhs);
-}
-
-// similar to compute1x1offDiagonalBlock()
-template <typename MatrixType>
-void MatrixSquareRootQuasiTriangular<MatrixType>
-     ::compute2x2offDiagonalBlock(MatrixType& sqrtT, const MatrixType& T, 
-				  typename MatrixType::Index i, typename MatrixType::Index j)
-{
-  Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i);
-  Matrix<Scalar,2,2> B = sqrtT.template block<2,2>(j,j);
-  Matrix<Scalar,2,2> C = T.template block<2,2>(i,j);
-  if (j-i > 2)
-    C -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 2);
-  Matrix<Scalar,2,2> X;
-  solveAuxiliaryEquation(X, A, B, C);
-  sqrtT.template block<2,2>(i,j) = X;
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT) {
+  typedef typename traits<MatrixType>::Scalar Scalar;
+  Matrix<Scalar, 2, 1> rhs = T.template block<2, 1>(i, j);
+  if (j - i > 2) rhs -= sqrtT.block(i, i + 2, 2, j - i - 2) * sqrtT.block(i + 2, j, j - i - 2, 1);
+  Matrix<Scalar, 2, 2> A = sqrtT.coeff(j, j) * Matrix<Scalar, 2, 2>::Identity();
+  A += sqrtT.template block<2, 2>(i, i);
+  sqrtT.template block<2, 1>(i, j) = A.fullPivLu().solve(rhs);
 }
 
 // solves the equation A X + X B = C where all matrices are 2-by-2
 template <typename MatrixType>
-template <typename SmallMatrixType>
-void MatrixSquareRootQuasiTriangular<MatrixType>
-     ::solveAuxiliaryEquation(SmallMatrixType& X, const SmallMatrixType& A,
-			      const SmallMatrixType& B, const SmallMatrixType& C)
-{
-  EIGEN_STATIC_ASSERT((internal::is_same<SmallMatrixType, Matrix<Scalar,2,2> >::value),
-		      EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT);
-
-  Matrix<Scalar,4,4> coeffMatrix = Matrix<Scalar,4,4>::Zero();
-  coeffMatrix.coeffRef(0,0) = A.coeff(0,0) + B.coeff(0,0);
-  coeffMatrix.coeffRef(1,1) = A.coeff(0,0) + B.coeff(1,1);
-  coeffMatrix.coeffRef(2,2) = A.coeff(1,1) + B.coeff(0,0);
-  coeffMatrix.coeffRef(3,3) = A.coeff(1,1) + B.coeff(1,1);
-  coeffMatrix.coeffRef(0,1) = B.coeff(1,0);
-  coeffMatrix.coeffRef(0,2) = A.coeff(0,1);
-  coeffMatrix.coeffRef(1,0) = B.coeff(0,1);
-  coeffMatrix.coeffRef(1,3) = A.coeff(0,1);
-  coeffMatrix.coeffRef(2,0) = A.coeff(1,0);
-  coeffMatrix.coeffRef(2,3) = B.coeff(1,0);
-  coeffMatrix.coeffRef(3,1) = A.coeff(1,0);
-  coeffMatrix.coeffRef(3,2) = B.coeff(0,1);
-  
-  Matrix<Scalar,4,1> rhs;
-  rhs.coeffRef(0) = C.coeff(0,0);
-  rhs.coeffRef(1) = C.coeff(0,1);
-  rhs.coeffRef(2) = C.coeff(1,0);
-  rhs.coeffRef(3) = C.coeff(1,1);
-  
-  Matrix<Scalar,4,1> result;
+void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const MatrixType& A, const MatrixType& B,
+                                                           const MatrixType& C) {
+  typedef typename traits<MatrixType>::Scalar Scalar;
+  Matrix<Scalar, 4, 4> coeffMatrix = Matrix<Scalar, 4, 4>::Zero();
+  coeffMatrix.coeffRef(0, 0) = A.coeff(0, 0) + B.coeff(0, 0);
+  coeffMatrix.coeffRef(1, 1) = A.coeff(0, 0) + B.coeff(1, 1);
+  coeffMatrix.coeffRef(2, 2) = A.coeff(1, 1) + B.coeff(0, 0);
+  coeffMatrix.coeffRef(3, 3) = A.coeff(1, 1) + B.coeff(1, 1);
+  coeffMatrix.coeffRef(0, 1) = B.coeff(1, 0);
+  coeffMatrix.coeffRef(0, 2) = A.coeff(0, 1);
+  coeffMatrix.coeffRef(1, 0) = B.coeff(0, 1);
+  coeffMatrix.coeffRef(1, 3) = A.coeff(0, 1);
+  coeffMatrix.coeffRef(2, 0) = A.coeff(1, 0);
+  coeffMatrix.coeffRef(2, 3) = B.coeff(1, 0);
+  coeffMatrix.coeffRef(3, 1) = A.coeff(1, 0);
+  coeffMatrix.coeffRef(3, 2) = B.coeff(0, 1);
+
+  Matrix<Scalar, 4, 1> rhs;
+  rhs.coeffRef(0) = C.coeff(0, 0);
+  rhs.coeffRef(1) = C.coeff(0, 1);
+  rhs.coeffRef(2) = C.coeff(1, 0);
+  rhs.coeffRef(3) = C.coeff(1, 1);
+
+  Matrix<Scalar, 4, 1> result;
   result = coeffMatrix.fullPivLu().solve(rhs);
 
-  X.coeffRef(0,0) = result.coeff(0);
-  X.coeffRef(0,1) = result.coeff(1);
-  X.coeffRef(1,0) = result.coeff(2);
-  X.coeffRef(1,1) = result.coeff(3);
+  X.coeffRef(0, 0) = result.coeff(0);
+  X.coeffRef(0, 1) = result.coeff(1);
+  X.coeffRef(1, 0) = result.coeff(2);
+  X.coeffRef(1, 1) = result.coeff(3);
 }
 
+// similar to compute1x1offDiagonalBlock()
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, Index i, Index j, ResultType& sqrtT) {
+  typedef typename traits<MatrixType>::Scalar Scalar;
+  Matrix<Scalar, 2, 2> A = sqrtT.template block<2, 2>(i, i);
+  Matrix<Scalar, 2, 2> B = sqrtT.template block<2, 2>(j, j);
+  Matrix<Scalar, 2, 2> C = T.template block<2, 2>(i, j);
+  if (j - i > 2) C -= sqrtT.block(i, i + 2, 2, j - i - 2) * sqrtT.block(i + 2, j, j - i - 2, 2);
+  Matrix<Scalar, 2, 2> X;
+  matrix_sqrt_quasi_triangular_solve_auxiliary_equation(X, A, B, C);
+  sqrtT.template block<2, 2>(i, j) = X;
+}
 
-/** \ingroup MatrixFunctions_Module
-  * \brief Class for computing matrix square roots of upper triangular matrices.
-  * \tparam  MatrixType  type of the argument of the matrix square root,
-  *                      expected to be an instantiation of the Matrix class template.
-  *
-  * This class computes the square root of the upper triangular matrix
-  * stored in the upper triangular part (including the diagonal) of
-  * the matrix passed to the constructor.
-  *
-  * \sa MatrixSquareRoot, MatrixSquareRootQuasiTriangular
-  */
-template <typename MatrixType>
-class MatrixSquareRootTriangular
-{
-  public:
-    MatrixSquareRootTriangular(const MatrixType& A) 
-      : m_A(A) 
-    {
-      eigen_assert(A.rows() == A.cols());
+// pre:  T is quasi-upper-triangular and sqrtT is a zero matrix of the same size
+// post: the diagonal blocks of sqrtT are the square roots of the diagonal blocks of T
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_diagonal(const MatrixType& T, ResultType& sqrtT) {
+  using std::sqrt;
+  const Index size = T.rows();
+  for (Index i = 0; i < size; i++) {
+    if (i == size - 1 || T.coeff(i + 1, i) == 0) {
+      eigen_assert(T(i, i) >= 0);
+      sqrtT.coeffRef(i, i) = sqrt(T.coeff(i, i));
+    } else {
+      matrix_sqrt_quasi_triangular_2x2_diagonal_block(T, i, sqrtT);
+      ++i;
     }
+  }
+}
 
-    /** \brief Compute the matrix square root
-      *
-      * \param[out] result  square root of \p A, as specified in the constructor.
-      *
-      * Only the upper triangular part (including the diagonal) of 
-      * \p result is updated, the rest is not touched.  See
-      * MatrixBase::sqrt() for details on how this computation is
-      * implemented.
-      */
-    template <typename ResultType> void compute(ResultType &result);    
-
- private:
-    const MatrixType& m_A;
-};
+// pre:  T is quasi-upper-triangular and diagonal blocks of sqrtT are square root of diagonal blocks of T.
+// post: sqrtT is the square root of T.
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular_off_diagonal(const MatrixType& T, ResultType& sqrtT) {
+  const Index size = T.rows();
+  for (Index j = 1; j < size; j++) {
+    if (T.coeff(j, j - 1) != 0)  // if T(j-1:j, j-1:j) is a 2-by-2 block
+      continue;
+    for (Index i = j - 1; i >= 0; i--) {
+      if (i > 0 && T.coeff(i, i - 1) != 0)  // if T(i-1:i, i-1:i) is a 2-by-2 block
+        continue;
+      bool iBlockIs2x2 = (i < size - 1) && (T.coeff(i + 1, i) != 0);
+      bool jBlockIs2x2 = (j < size - 1) && (T.coeff(j + 1, j) != 0);
+      if (iBlockIs2x2 && jBlockIs2x2)
+        matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(T, i, j, sqrtT);
+      else if (iBlockIs2x2 && !jBlockIs2x2)
+        matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(T, i, j, sqrtT);
+      else if (!iBlockIs2x2 && jBlockIs2x2)
+        matrix_sqrt_quasi_triangular_1x2_off_diagonal_block(T, i, j, sqrtT);
+      else if (!iBlockIs2x2 && !jBlockIs2x2)
+        matrix_sqrt_quasi_triangular_1x1_off_diagonal_block(T, i, j, sqrtT);
+    }
+  }
+}
 
-template <typename MatrixType>
-template <typename ResultType> 
-void MatrixSquareRootTriangular<MatrixType>::compute(ResultType &result)
-{
+}  // end of namespace internal
+
+/** \ingroup MatrixFunctions_Module
+ * \brief Compute matrix square root of quasi-triangular matrix.
+ *
+ * \tparam  MatrixType  type of \p arg, the argument of matrix square root,
+ *                      expected to be an instantiation of the Matrix class template.
+ * \tparam  ResultType  type of \p result, where result is to be stored.
+ * \param[in]  arg      argument of matrix square root.
+ * \param[out] result   matrix square root of upper Hessenberg part of \p arg.
+ *
+ * This function computes the square root of the upper quasi-triangular matrix stored in the upper
+ * Hessenberg part of \p arg.  Only the upper Hessenberg part of \p result is updated, the rest is
+ * not touched.  See MatrixBase::sqrt() for details on how this computation is implemented.
+ *
+ * \sa MatrixSquareRoot, MatrixSquareRootQuasiTriangular
+ */
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_quasi_triangular(const MatrixType& arg, ResultType& result) {
+  eigen_assert(arg.rows() == arg.cols());
+  result.resize(arg.rows(), arg.cols());
+  internal::matrix_sqrt_quasi_triangular_diagonal(arg, result);
+  internal::matrix_sqrt_quasi_triangular_off_diagonal(arg, result);
+}
+
+/** \ingroup MatrixFunctions_Module
+ * \brief Compute matrix square root of triangular matrix.
+ *
+ * \tparam  MatrixType  type of \p arg, the argument of matrix square root,
+ *                      expected to be an instantiation of the Matrix class template.
+ * \tparam  ResultType  type of \p result, where result is to be stored.
+ * \param[in]  arg      argument of matrix square root.
+ * \param[out] result   matrix square root of upper triangular part of \p arg.
+ *
+ * Only the upper triangular part (including the diagonal) of \p result is updated, the rest is not
+ * touched.  See MatrixBase::sqrt() for details on how this computation is implemented.
+ *
+ * \sa MatrixSquareRoot, MatrixSquareRootQuasiTriangular
+ */
+template <typename MatrixType, typename ResultType>
+void matrix_sqrt_triangular(const MatrixType& arg, ResultType& result) {
   using std::sqrt;
+  typedef typename MatrixType::Scalar Scalar;
 
-  // Compute square root of m_A and store it in upper triangular part of result
+  eigen_assert(arg.rows() == arg.cols());
+
+  // Compute square root of arg and store it in upper triangular part of result
   // This uses that the square root of triangular matrices can be computed directly.
-  result.resize(m_A.rows(), m_A.cols());
-  typedef typename MatrixType::Index Index;
-  for (Index i = 0; i < m_A.rows(); i++) {
-    result.coeffRef(i,i) = sqrt(m_A.coeff(i,i));
+  result.resize(arg.rows(), arg.cols());
+  for (Index i = 0; i < arg.rows(); i++) {
+    result.coeffRef(i, i) = sqrt(arg.coeff(i, i));
   }
-  for (Index j = 1; j < m_A.cols(); j++) {
-    for (Index i = j-1; i >= 0; i--) {
-      typedef typename MatrixType::Scalar Scalar;
+  for (Index j = 1; j < arg.cols(); j++) {
+    for (Index i = j - 1; i >= 0; i--) {
       // if i = j-1, then segment has length 0 so tmp = 0
-      Scalar tmp = (result.row(i).segment(i+1,j-i-1) * result.col(j).segment(i+1,j-i-1)).value();
+      Scalar tmp = (result.row(i).segment(i + 1, j - i - 1) * result.col(j).segment(i + 1, j - i - 1)).value();
       // denominator may be zero if original matrix is singular
-      result.coeffRef(i,j) = (m_A.coeff(i,j) - tmp) / (result.coeff(i,i) + result.coeff(j,j));
+      result.coeffRef(i, j) = (arg.coeff(i, j) - tmp) / (result.coeff(i, i) + result.coeff(j, j));
     }
   }
 }
 
+namespace internal {
 
 /** \ingroup MatrixFunctions_Module
-  * \brief Class for computing matrix square roots of general matrices.
-  * \tparam  MatrixType  type of the argument of the matrix square root,
-  *                      expected to be an instantiation of the Matrix class template.
-  *
-  * \sa MatrixSquareRootTriangular, MatrixSquareRootQuasiTriangular, MatrixBase::sqrt()
-  */
+ * \brief Helper struct for computing matrix square roots of general matrices.
+ * \tparam  MatrixType  type of the argument of the matrix square root,
+ *                      expected to be an instantiation of the Matrix class template.
+ *
+ * \sa MatrixSquareRootTriangular, MatrixSquareRootQuasiTriangular, MatrixBase::sqrt()
+ */
 template <typename MatrixType, int IsComplex = NumTraits<typename internal::traits<MatrixType>::Scalar>::IsComplex>
-class MatrixSquareRoot
-{
-  public:
-
-    /** \brief Constructor. 
-      *
-      * \param[in]  A  matrix whose square root is to be computed.
-      *
-      * The class stores a reference to \p A, so it should not be
-      * changed (or destroyed) before compute() is called.
-      */
-    MatrixSquareRoot(const MatrixType& A); 
-    
-    /** \brief Compute the matrix square root
-      *
-      * \param[out] result  square root of \p A, as specified in the constructor.
-      *
-      * See MatrixBase::sqrt() for details on how this computation is
-      * implemented.
-      */
-    template <typename ResultType> void compute(ResultType &result);    
+struct matrix_sqrt_compute {
+  /** \brief Compute the matrix square root
+   *
+   * \param[in]  arg     matrix whose square root is to be computed.
+   * \param[out] result  square root of \p arg.
+   *
+   * See MatrixBase::sqrt() for details on how this computation is implemented.
+   */
+  template <typename ResultType>
+  static void run(const MatrixType& arg, ResultType& result);
 };
 
-
 // ********** Partial specialization for real matrices **********
 
 template <typename MatrixType>
-class MatrixSquareRoot<MatrixType, 0>
-{
-  public:
-
-    MatrixSquareRoot(const MatrixType& A) 
-      : m_A(A) 
-    {  
-      eigen_assert(A.rows() == A.cols());
-    }
-  
-    template <typename ResultType> void compute(ResultType &result)
-    {
-      // Compute Schur decomposition of m_A
-      const RealSchur<MatrixType> schurOfA(m_A);  
-      const MatrixType& T = schurOfA.matrixT();
-      const MatrixType& U = schurOfA.matrixU();
-    
-      // Compute square root of T
-      MatrixType sqrtT = MatrixType::Zero(m_A.rows(), m_A.cols());
-      MatrixSquareRootQuasiTriangular<MatrixType>(T).compute(sqrtT);
-    
-      // Compute square root of m_A
-      result = U * sqrtT * U.adjoint();
-    }
-    
-  private:
-    const MatrixType& m_A;
+struct matrix_sqrt_compute<MatrixType, 0> {
+  typedef typename MatrixType::PlainObject PlainType;
+  template <typename ResultType>
+  static void run(const MatrixType& arg, ResultType& result) {
+    eigen_assert(arg.rows() == arg.cols());
+
+    // Compute Schur decomposition of arg
+    const RealSchur<PlainType> schurOfA(arg);
+    const PlainType& T = schurOfA.matrixT();
+    const PlainType& U = schurOfA.matrixU();
+
+    // Compute square root of T
+    PlainType sqrtT = PlainType::Zero(arg.rows(), arg.cols());
+    matrix_sqrt_quasi_triangular(T, sqrtT);
+
+    // Compute square root of arg
+    result = U * sqrtT * U.adjoint();
+  }
 };
 
-
 // ********** Partial specialization for complex matrices **********
 
 template <typename MatrixType>
-class MatrixSquareRoot<MatrixType, 1>
-{
-  public:
-
-    MatrixSquareRoot(const MatrixType& A) 
-      : m_A(A) 
-    {  
-      eigen_assert(A.rows() == A.cols());
-    }
-  
-    template <typename ResultType> void compute(ResultType &result)
-    {
-      // Compute Schur decomposition of m_A
-      const ComplexSchur<MatrixType> schurOfA(m_A);  
-      const MatrixType& T = schurOfA.matrixT();
-      const MatrixType& U = schurOfA.matrixU();
-    
-      // Compute square root of T
-      MatrixType sqrtT;
-      MatrixSquareRootTriangular<MatrixType>(T).compute(sqrtT);
-    
-      // Compute square root of m_A
-      result = U * (sqrtT.template triangularView<Upper>() * U.adjoint());
-    }
-    
-  private:
-    const MatrixType& m_A;
+struct matrix_sqrt_compute<MatrixType, 1> {
+  typedef typename MatrixType::PlainObject PlainType;
+  template <typename ResultType>
+  static void run(const MatrixType& arg, ResultType& result) {
+    eigen_assert(arg.rows() == arg.cols());
+
+    // Compute Schur decomposition of arg
+    const ComplexSchur<PlainType> schurOfA(arg);
+    const PlainType& T = schurOfA.matrixT();
+    const PlainType& U = schurOfA.matrixU();
+
+    // Compute square root of T
+    PlainType sqrtT;
+    matrix_sqrt_triangular(T, sqrtT);
+
+    // Compute square root of arg
+    result = U * (sqrtT.template triangularView<Upper>() * U.adjoint());
+  }
 };
 
+}  // end namespace internal
 
 /** \ingroup MatrixFunctions_Module
-  *
-  * \brief Proxy for the matrix square root of some matrix (expression).
-  *
-  * \tparam Derived  Type of the argument to the matrix square root.
-  *
-  * This class holds the argument to the matrix square root until it
-  * is assigned or evaluated for some other reason (so the argument
-  * should not be changed in the meantime). It is the return type of
-  * MatrixBase::sqrt() and most of the time this is the only way it is
-  * used.
-  */
-template<typename Derived> class MatrixSquareRootReturnValue
-: public ReturnByValue<MatrixSquareRootReturnValue<Derived> >
-{
-    typedef typename Derived::Index Index;
-  public:
-    /** \brief Constructor.
-      *
-      * \param[in]  src  %Matrix (expression) forming the argument of the
-      * matrix square root.
-      */
-    MatrixSquareRootReturnValue(const Derived& src) : m_src(src) { }
-
-    /** \brief Compute the matrix square root.
-      *
-      * \param[out]  result  the matrix square root of \p src in the
-      * constructor.
-      */
-    template <typename ResultType>
-    inline void evalTo(ResultType& result) const
-    {
-      const typename Derived::PlainObject srcEvaluated = m_src.eval();
-      MatrixSquareRoot<typename Derived::PlainObject> me(srcEvaluated);
-      me.compute(result);
-    }
+ *
+ * \brief Proxy for the matrix square root of some matrix (expression).
+ *
+ * \tparam Derived  Type of the argument to the matrix square root.
+ *
+ * This class holds the argument to the matrix square root until it
+ * is assigned or evaluated for some other reason (so the argument
+ * should not be changed in the meantime). It is the return type of
+ * MatrixBase::sqrt() and most of the time this is the only way it is
+ * used.
+ */
+template <typename Derived>
+class MatrixSquareRootReturnValue : public ReturnByValue<MatrixSquareRootReturnValue<Derived> > {
+ protected:
+  typedef typename internal::ref_selector<Derived>::type DerivedNested;
+
+ public:
+  /** \brief Constructor.
+   *
+   * \param[in]  src  %Matrix (expression) forming the argument of the
+   * matrix square root.
+   */
+  explicit MatrixSquareRootReturnValue(const Derived& src) : m_src(src) {}
+
+  /** \brief Compute the matrix square root.
+   *
+   * \param[out]  result  the matrix square root of \p src in the
+   * constructor.
+   */
+  template <typename ResultType>
+  inline void evalTo(ResultType& result) const {
+    typedef typename internal::nested_eval<Derived, 10>::type DerivedEvalType;
+    typedef internal::remove_all_t<DerivedEvalType> DerivedEvalTypeClean;
+    DerivedEvalType tmp(m_src);
+    internal::matrix_sqrt_compute<DerivedEvalTypeClean>::run(tmp, result);
+  }
 
-    Index rows() const { return m_src.rows(); }
-    Index cols() const { return m_src.cols(); }
+  Index rows() const { return m_src.rows(); }
+  Index cols() const { return m_src.cols(); }
 
-  protected:
-    const Derived& m_src;
-  private:
-    MatrixSquareRootReturnValue& operator=(const MatrixSquareRootReturnValue&);
+ protected:
+  const DerivedNested m_src;
 };
 
 namespace internal {
-template<typename Derived>
-struct traits<MatrixSquareRootReturnValue<Derived> >
-{
+template <typename Derived>
+struct traits<MatrixSquareRootReturnValue<Derived> > {
   typedef typename Derived::PlainObject ReturnType;
 };
-}
+}  // namespace internal
 
 template <typename Derived>
-const MatrixSquareRootReturnValue<Derived> MatrixBase<Derived>::sqrt() const
-{
+const MatrixSquareRootReturnValue<Derived> MatrixBase<Derived>::sqrt() const {
   eigen_assert(rows() == cols());
   return MatrixSquareRootReturnValue<Derived>(derived());
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_MATRIX_FUNCTION
+#endif  // EIGEN_MATRIX_FUNCTION
diff --git a/inst/include/unsupported/Eigen/src/MatrixFunctions/StemFunction.h b/inst/include/unsupported/Eigen/src/MatrixFunctions/StemFunction.h
index 724e55c1..8050c37c 100644
--- a/inst/include/unsupported/Eigen/src/MatrixFunctions/StemFunction.h
+++ b/inst/include/unsupported/Eigen/src/MatrixFunctions/StemFunction.h
@@ -1,7 +1,7 @@
 // This file is part of Eigen, a lightweight C++ template library
 // for linear algebra.
 //
-// Copyright (C) 2010 Jitse Niesen <jitse@maths.leeds.ac.uk>
+// Copyright (C) 2010, 2013 Jitse Niesen <jitse@maths.leeds.ac.uk>
 //
 // This Source Code Form is subject to the terms of the Mozilla
 // Public License v. 2.0. If a copy of the MPL was not distributed
@@ -10,96 +10,106 @@
 #ifndef EIGEN_STEM_FUNCTION
 #define EIGEN_STEM_FUNCTION
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-/** \ingroup MatrixFunctions_Module 
-  * \brief Stem functions corresponding to standard mathematical functions.
-  */
+namespace Eigen {
+
+namespace internal {
+
+/** \brief The exponential function (and its derivatives). */
 template <typename Scalar>
-class StdStemFunctions
-{
-  public:
-
-    /** \brief The exponential function (and its derivatives). */
-    static Scalar exp(Scalar x, int)
-    {
-      return std::exp(x);
-    }
-
-    /** \brief Cosine (and its derivatives). */
-    static Scalar cos(Scalar x, int n)
-    {
-      Scalar res;
-      switch (n % 4) {
-      case 0: 
-	res = std::cos(x);
-	break;
-      case 1:
-	res = -std::sin(x);
-	break;
-      case 2:
-	res = -std::cos(x);
-	break;
-      case 3:
-	res = std::sin(x);
-	break;
-      }
-      return res;
-    }
-
-    /** \brief Sine (and its derivatives). */
-    static Scalar sin(Scalar x, int n)
-    {
-      Scalar res;
-      switch (n % 4) {
-      case 0:
-	res = std::sin(x);
-	break;
-      case 1:
-	res = std::cos(x);
-	break;
-      case 2:
-	res = -std::sin(x);
-	break;
-      case 3:
-	res = -std::cos(x);
-	break;
-      }
-      return res;
-    }
-
-    /** \brief Hyperbolic cosine (and its derivatives). */
-    static Scalar cosh(Scalar x, int n)
-    {
-      Scalar res;
-      switch (n % 2) {
-      case 0:
-	res = std::cosh(x);
-	break;
-      case 1:
-	res = std::sinh(x);
-	break;
-      }
-      return res;
-    }
-	
-    /** \brief Hyperbolic sine (and its derivatives). */
-    static Scalar sinh(Scalar x, int n)
-    {
-      Scalar res;
-      switch (n % 2) {
-      case 0:
-	res = std::sinh(x);
-	break;
-      case 1:
-	res = std::cosh(x);
-	break;
-      }
-      return res;
-    }
-
-}; // end of class StdStemFunctions
-
-} // end namespace Eigen
-
-#endif // EIGEN_STEM_FUNCTION
+Scalar stem_function_exp(Scalar x, int) {
+  using std::exp;
+  return exp(x);
+}
+
+/** \brief Cosine (and its derivatives). */
+template <typename Scalar>
+Scalar stem_function_cos(Scalar x, int n) {
+  using std::cos;
+  using std::sin;
+  Scalar res;
+
+  switch (n % 4) {
+    case 0:
+      res = std::cos(x);
+      break;
+    case 1:
+      res = -std::sin(x);
+      break;
+    case 2:
+      res = -std::cos(x);
+      break;
+    case 3:
+      res = std::sin(x);
+      break;
+  }
+  return res;
+}
+
+/** \brief Sine (and its derivatives). */
+template <typename Scalar>
+Scalar stem_function_sin(Scalar x, int n) {
+  using std::cos;
+  using std::sin;
+  Scalar res;
+
+  switch (n % 4) {
+    case 0:
+      res = std::sin(x);
+      break;
+    case 1:
+      res = std::cos(x);
+      break;
+    case 2:
+      res = -std::sin(x);
+      break;
+    case 3:
+      res = -std::cos(x);
+      break;
+  }
+  return res;
+}
+
+/** \brief Hyperbolic cosine (and its derivatives). */
+template <typename Scalar>
+Scalar stem_function_cosh(Scalar x, int n) {
+  using std::cosh;
+  using std::sinh;
+  Scalar res;
+
+  switch (n % 2) {
+    case 0:
+      res = std::cosh(x);
+      break;
+    case 1:
+      res = std::sinh(x);
+      break;
+  }
+  return res;
+}
+
+/** \brief Hyperbolic sine (and its derivatives). */
+template <typename Scalar>
+Scalar stem_function_sinh(Scalar x, int n) {
+  using std::cosh;
+  using std::sinh;
+  Scalar res;
+
+  switch (n % 2) {
+    case 0:
+      res = std::sinh(x);
+      break;
+    case 1:
+      res = std::cosh(x);
+      break;
+  }
+  return res;
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_STEM_FUNCTION
diff --git a/inst/include/unsupported/Eigen/src/MoreVectorization/MathFunctions.h b/inst/include/unsupported/Eigen/src/MoreVectorization/MathFunctions.h
deleted file mode 100644
index 63cb28de..00000000
--- a/inst/include/unsupported/Eigen/src/MoreVectorization/MathFunctions.h
+++ /dev/null
@@ -1,95 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009 Rohit Garg <rpg.314@gmail.com>
-// Copyright (C) 2009 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_MOREVECTORIZATION_MATHFUNCTIONS_H
-#define EIGEN_MOREVECTORIZATION_MATHFUNCTIONS_H
-
-namespace Eigen { 
-
-namespace internal {
-
-/** \internal \returns the arcsin of \a a (coeff-wise) */
-template<typename Packet> inline static Packet pasin(Packet a) { return std::asin(a); }
-
-#ifdef EIGEN_VECTORIZE_SSE
-
-template<> EIGEN_DONT_INLINE Packet4f pasin(Packet4f x)
-{
-  _EIGEN_DECLARE_CONST_Packet4f(half, 0.5);
-  _EIGEN_DECLARE_CONST_Packet4f(minus_half, -0.5);
-  _EIGEN_DECLARE_CONST_Packet4f(3half, 1.5);
-
-  _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(sign_mask, 0x80000000);
-
-  _EIGEN_DECLARE_CONST_Packet4f(pi, 3.141592654);
-  _EIGEN_DECLARE_CONST_Packet4f(pi_over_2, 3.141592654*0.5);
-
-  _EIGEN_DECLARE_CONST_Packet4f(asin1, 4.2163199048E-2);
-  _EIGEN_DECLARE_CONST_Packet4f(asin2, 2.4181311049E-2);
-  _EIGEN_DECLARE_CONST_Packet4f(asin3, 4.5470025998E-2);
-  _EIGEN_DECLARE_CONST_Packet4f(asin4, 7.4953002686E-2);
-  _EIGEN_DECLARE_CONST_Packet4f(asin5, 1.6666752422E-1);
-
-  Packet4f a = pabs(x);//got the absolute value
-
-  Packet4f sign_bit= _mm_and_ps(x, p4f_sign_mask);//extracted the sign bit
-
-  Packet4f z1,z2;//will need them during computation    
-
-
-//will compute the two branches for asin
-//so first compare with half
-
-  Packet4f branch_mask= _mm_cmpgt_ps(a, p4f_half);//this is to select which branch to take
-//both will be taken, and finally results will be merged
-//the branch for values >0.5
-
-    {
-//the core series expansion 
-    z1=pmadd(p4f_minus_half,a,p4f_half);
-    Packet4f x1=psqrt(z1);
-    Packet4f s1=pmadd(p4f_asin1, z1, p4f_asin2);
-    Packet4f s2=pmadd(s1, z1, p4f_asin3);
-    Packet4f s3=pmadd(s2,z1, p4f_asin4);
-    Packet4f s4=pmadd(s3,z1, p4f_asin5);
-    Packet4f temp=pmul(s4,z1);//not really a madd but a mul by z so that the next term can be a madd
-    z1=pmadd(temp,x1,x1);
-    z1=padd(z1,z1);
-    z1=psub(p4f_pi_over_2,z1);
-    }
-
-    {
-//the core series expansion 
-    Packet4f x2=a;
-    z2=pmul(x2,x2);
-    Packet4f s1=pmadd(p4f_asin1, z2, p4f_asin2);
-    Packet4f s2=pmadd(s1, z2, p4f_asin3);
-    Packet4f s3=pmadd(s2,z2, p4f_asin4);
-    Packet4f s4=pmadd(s3,z2, p4f_asin5);
-    Packet4f temp=pmul(s4,z2);//not really a madd but a mul by z so that the next term can be a madd
-    z2=pmadd(temp,x2,x2);
-    }
-
-/* select the correct result from the two branch evaluations */
-  z1  = _mm_and_ps(branch_mask, z1);
-  z2  = _mm_andnot_ps(branch_mask, z2);
-  Packet4f z  = _mm_or_ps(z1,z2);
-
-/* update the sign */
-  return _mm_xor_ps(z, sign_bit);
-}
-
-#endif // EIGEN_VECTORIZE_SSE
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_MOREVECTORIZATION_MATHFUNCTIONS_H
diff --git a/inst/include/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h b/inst/include/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
index b8ba6ddc..079772e6 100644
--- a/inst/include/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
+++ b/inst/include/unsupported/Eigen/src/NonLinearOptimization/HybridNonLinearSolver.h
@@ -13,589 +13,533 @@
 #ifndef EIGEN_HYBRIDNONLINEARSOLVER_H
 #define EIGEN_HYBRIDNONLINEARSOLVER_H
 
-namespace Eigen { 
-
-namespace HybridNonLinearSolverSpace { 
-    enum Status {
-        Running = -1,
-        ImproperInputParameters = 0,
-        RelativeErrorTooSmall = 1,
-        TooManyFunctionEvaluation = 2,
-        TolTooSmall = 3,
-        NotMakingProgressJacobian = 4,
-        NotMakingProgressIterations = 5,
-        UserAsked = 6
-    };
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace HybridNonLinearSolverSpace {
+enum Status {
+  Running = -1,
+  ImproperInputParameters = 0,
+  RelativeErrorTooSmall = 1,
+  TooManyFunctionEvaluation = 2,
+  TolTooSmall = 3,
+  NotMakingProgressJacobian = 4,
+  NotMakingProgressIterations = 5,
+  UserAsked = 6
+};
 }
 
 /**
-  * \ingroup NonLinearOptimization_Module
-  * \brief Finds a zero of a system of n
-  * nonlinear functions in n variables by a modification of the Powell
-  * hybrid method ("dogleg").
-  *
-  * The user must provide a subroutine which calculates the
-  * functions. The Jacobian is either provided by the user, or approximated
-  * using a forward-difference method.
-  *
-  */
-template<typename FunctorType, typename Scalar=double>
-class HybridNonLinearSolver
-{
-public:
-    typedef DenseIndex Index;
-
-    HybridNonLinearSolver(FunctorType &_functor)
-        : functor(_functor) { nfev=njev=iter = 0;  fnorm= 0.; useExternalScaling=false;}
-
-    struct Parameters {
-        Parameters()
-            : factor(Scalar(100.))
-            , maxfev(1000)
-            , xtol(std::sqrt(NumTraits<Scalar>::epsilon()))
-            , nb_of_subdiagonals(-1)
-            , nb_of_superdiagonals(-1)
-            , epsfcn(Scalar(0.)) {}
-        Scalar factor;
-        Index maxfev;   // maximum number of function evaluation
-        Scalar xtol;
-        Index nb_of_subdiagonals;
-        Index nb_of_superdiagonals;
-        Scalar epsfcn;
-    };
-    typedef Matrix< Scalar, Dynamic, 1 > FVectorType;
-    typedef Matrix< Scalar, Dynamic, Dynamic > JacobianType;
-    /* TODO: if eigen provides a triangular storage, use it here */
-    typedef Matrix< Scalar, Dynamic, Dynamic > UpperTriangularType;
-
-    HybridNonLinearSolverSpace::Status hybrj1(
-            FVectorType  &x,
-            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
-            );
-
-    HybridNonLinearSolverSpace::Status solveInit(FVectorType  &x);
-    HybridNonLinearSolverSpace::Status solveOneStep(FVectorType  &x);
-    HybridNonLinearSolverSpace::Status solve(FVectorType  &x);
-
-    HybridNonLinearSolverSpace::Status hybrd1(
-            FVectorType  &x,
-            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
-            );
-
-    HybridNonLinearSolverSpace::Status solveNumericalDiffInit(FVectorType  &x);
-    HybridNonLinearSolverSpace::Status solveNumericalDiffOneStep(FVectorType  &x);
-    HybridNonLinearSolverSpace::Status solveNumericalDiff(FVectorType  &x);
-
-    void resetParameters(void) { parameters = Parameters(); }
-    Parameters parameters;
-    FVectorType  fvec, qtf, diag;
-    JacobianType fjac;
-    UpperTriangularType R;
-    Index nfev;
-    Index njev;
-    Index iter;
-    Scalar fnorm;
-    bool useExternalScaling; 
-private:
-    FunctorType &functor;
-    Index n;
-    Scalar sum;
-    bool sing;
-    Scalar temp;
-    Scalar delta;
-    bool jeval;
-    Index ncsuc;
-    Scalar ratio;
-    Scalar pnorm, xnorm, fnorm1;
-    Index nslow1, nslow2;
-    Index ncfail;
-    Scalar actred, prered;
-    FVectorType wa1, wa2, wa3, wa4;
-
-    HybridNonLinearSolver& operator=(const HybridNonLinearSolver&);
+ * \ingroup NonLinearOptimization_Module
+ * \brief Finds a zero of a system of n
+ * nonlinear functions in n variables by a modification of the Powell
+ * hybrid method ("dogleg").
+ *
+ * The user must provide a subroutine which calculates the
+ * functions. The Jacobian is either provided by the user, or approximated
+ * using a forward-difference method.
+ *
+ */
+template <typename FunctorType, typename Scalar = double>
+class HybridNonLinearSolver {
+ public:
+  typedef DenseIndex Index;
+
+  HybridNonLinearSolver(FunctorType &_functor) : functor(_functor) {
+    nfev = njev = iter = 0;
+    fnorm = 0.;
+    useExternalScaling = false;
+  }
+
+  struct Parameters {
+    Parameters()
+        : factor(Scalar(100.)),
+          maxfev(1000),
+          xtol(numext::sqrt(NumTraits<Scalar>::epsilon())),
+          nb_of_subdiagonals(-1),
+          nb_of_superdiagonals(-1),
+          epsfcn(Scalar(0.)) {}
+    Scalar factor;
+    Index maxfev;  // maximum number of function evaluation
+    Scalar xtol;
+    Index nb_of_subdiagonals;
+    Index nb_of_superdiagonals;
+    Scalar epsfcn;
+  };
+  typedef Matrix<Scalar, Dynamic, 1> FVectorType;
+  typedef Matrix<Scalar, Dynamic, Dynamic> JacobianType;
+  /* TODO: if eigen provides a triangular storage, use it here */
+  typedef Matrix<Scalar, Dynamic, Dynamic> UpperTriangularType;
+
+  HybridNonLinearSolverSpace::Status hybrj1(FVectorType &x,
+                                            const Scalar tol = numext::sqrt(NumTraits<Scalar>::epsilon()));
+
+  HybridNonLinearSolverSpace::Status solveInit(FVectorType &x);
+  HybridNonLinearSolverSpace::Status solveOneStep(FVectorType &x);
+  HybridNonLinearSolverSpace::Status solve(FVectorType &x);
+
+  HybridNonLinearSolverSpace::Status hybrd1(FVectorType &x,
+                                            const Scalar tol = numext::sqrt(NumTraits<Scalar>::epsilon()));
+
+  HybridNonLinearSolverSpace::Status solveNumericalDiffInit(FVectorType &x);
+  HybridNonLinearSolverSpace::Status solveNumericalDiffOneStep(FVectorType &x);
+  HybridNonLinearSolverSpace::Status solveNumericalDiff(FVectorType &x);
+
+  void resetParameters(void) { parameters = Parameters(); }
+  Parameters parameters;
+  FVectorType fvec, qtf, diag;
+  JacobianType fjac;
+  UpperTriangularType R;
+  Index nfev;
+  Index njev;
+  Index iter;
+  Scalar fnorm;
+  bool useExternalScaling;
+
+ private:
+  FunctorType &functor;
+  Index n;
+  Scalar sum;
+  bool sing;
+  Scalar temp;
+  Scalar delta;
+  bool jeval;
+  Index ncsuc;
+  Scalar ratio;
+  Scalar pnorm, xnorm, fnorm1;
+  Index nslow1, nslow2;
+  Index ncfail;
+  Scalar actred, prered;
+  FVectorType wa1, wa2, wa3, wa4;
+
+  HybridNonLinearSolver &operator=(const HybridNonLinearSolver &);
 };
 
+template <typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status HybridNonLinearSolver<FunctorType, Scalar>::hybrj1(FVectorType &x,
+                                                                                      const Scalar tol) {
+  n = x.size();
 
+  /* check the input parameters for errors. */
+  if (n <= 0 || tol < 0.) return HybridNonLinearSolverSpace::ImproperInputParameters;
 
-template<typename FunctorType, typename Scalar>
-HybridNonLinearSolverSpace::Status
-HybridNonLinearSolver<FunctorType,Scalar>::hybrj1(
-        FVectorType  &x,
-        const Scalar tol
-        )
-{
-    n = x.size();
-
-    /* check the input parameters for errors. */
-    if (n <= 0 || tol < 0.)
-        return HybridNonLinearSolverSpace::ImproperInputParameters;
-
-    resetParameters();
-    parameters.maxfev = 100*(n+1);
-    parameters.xtol = tol;
-    diag.setConstant(n, 1.);
-    useExternalScaling = true;
-    return solve(x);
+  resetParameters();
+  parameters.maxfev = 100 * (n + 1);
+  parameters.xtol = tol;
+  diag.setConstant(n, 1.);
+  useExternalScaling = true;
+  return solve(x);
 }
 
-template<typename FunctorType, typename Scalar>
-HybridNonLinearSolverSpace::Status
-HybridNonLinearSolver<FunctorType,Scalar>::solveInit(FVectorType  &x)
-{
-    n = x.size();
-
-    wa1.resize(n); wa2.resize(n); wa3.resize(n); wa4.resize(n);
-    fvec.resize(n);
-    qtf.resize(n);
-    fjac.resize(n, n);
-    if (!useExternalScaling)
-        diag.resize(n);
-    eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
-
-    /* Function Body */
-    nfev = 0;
-    njev = 0;
-
-    /*     check the input parameters for errors. */
-    if (n <= 0 || parameters.xtol < 0. || parameters.maxfev <= 0 || parameters.factor <= 0. )
-        return HybridNonLinearSolverSpace::ImproperInputParameters;
-    if (useExternalScaling)
-        for (Index j = 0; j < n; ++j)
-            if (diag[j] <= 0.)
-                return HybridNonLinearSolverSpace::ImproperInputParameters;
-
-    /*     evaluate the function at the starting point */
-    /*     and calculate its norm. */
-    nfev = 1;
-    if ( functor(x, fvec) < 0)
-        return HybridNonLinearSolverSpace::UserAsked;
-    fnorm = fvec.stableNorm();
-
-    /*     initialize iteration counter and monitors. */
-    iter = 1;
-    ncsuc = 0;
-    ncfail = 0;
-    nslow1 = 0;
-    nslow2 = 0;
-
-    return HybridNonLinearSolverSpace::Running;
+template <typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status HybridNonLinearSolver<FunctorType, Scalar>::solveInit(FVectorType &x) {
+  n = x.size();
+
+  wa1.resize(n);
+  wa2.resize(n);
+  wa3.resize(n);
+  wa4.resize(n);
+  fvec.resize(n);
+  qtf.resize(n);
+  fjac.resize(n, n);
+  if (!useExternalScaling) diag.resize(n);
+  eigen_assert((!useExternalScaling || diag.size() == n) &&
+               "When useExternalScaling is set, the caller must provide a valid 'diag'");
+
+  /* Function Body */
+  nfev = 0;
+  njev = 0;
+
+  /*     check the input parameters for errors. */
+  if (n <= 0 || parameters.xtol < 0. || parameters.maxfev <= 0 || parameters.factor <= 0.)
+    return HybridNonLinearSolverSpace::ImproperInputParameters;
+  if (useExternalScaling)
+    for (Index j = 0; j < n; ++j)
+      if (diag[j] <= 0.) return HybridNonLinearSolverSpace::ImproperInputParameters;
+
+  /*     evaluate the function at the starting point */
+  /*     and calculate its norm. */
+  nfev = 1;
+  if (functor(x, fvec) < 0) return HybridNonLinearSolverSpace::UserAsked;
+  fnorm = fvec.stableNorm();
+
+  /*     initialize iteration counter and monitors. */
+  iter = 1;
+  ncsuc = 0;
+  ncfail = 0;
+  nslow1 = 0;
+  nslow2 = 0;
+
+  return HybridNonLinearSolverSpace::Running;
 }
 
-template<typename FunctorType, typename Scalar>
-HybridNonLinearSolverSpace::Status
-HybridNonLinearSolver<FunctorType,Scalar>::solveOneStep(FVectorType  &x)
-{
-    using std::abs;
-    
-    eigen_assert(x.size()==n); // check the caller is not cheating us
-
-    Index j;
-    std::vector<JacobiRotation<Scalar> > v_givens(n), w_givens(n);
-
-    jeval = true;
-
-    /* calculate the jacobian matrix. */
-    if ( functor.df(x, fjac) < 0)
-        return HybridNonLinearSolverSpace::UserAsked;
-    ++njev;
-
-    wa2 = fjac.colwise().blueNorm();
-
-    /* on the first iteration and if external scaling is not used, scale according */
-    /* to the norms of the columns of the initial jacobian. */
-    if (iter == 1) {
-        if (!useExternalScaling)
-            for (j = 0; j < n; ++j)
-                diag[j] = (wa2[j]==0.) ? 1. : wa2[j];
-
-        /* on the first iteration, calculate the norm of the scaled x */
-        /* and initialize the step bound delta. */
-        xnorm = diag.cwiseProduct(x).stableNorm();
-        delta = parameters.factor * xnorm;
-        if (delta == 0.)
-            delta = parameters.factor;
-    }
+template <typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status HybridNonLinearSolver<FunctorType, Scalar>::solveOneStep(FVectorType &x) {
+  using std::abs;
+
+  eigen_assert(x.size() == n);  // check the caller is not cheating us
 
-    /* compute the qr factorization of the jacobian. */
-    HouseholderQR<JacobianType> qrfac(fjac); // no pivoting:
+  Index j;
+  std::vector<JacobiRotation<Scalar> > v_givens(n), w_givens(n);
 
-    /* copy the triangular factor of the qr factorization into r. */
-    R = qrfac.matrixQR();
+  jeval = true;
 
-    /* accumulate the orthogonal factor in fjac. */
-    fjac = qrfac.householderQ();
+  /* calculate the jacobian matrix. */
+  if (functor.df(x, fjac) < 0) return HybridNonLinearSolverSpace::UserAsked;
+  ++njev;
 
-    /* form (q transpose)*fvec and store in qtf. */
-    qtf = fjac.transpose() * fvec;
+  wa2 = fjac.colwise().blueNorm();
 
-    /* rescale if necessary. */
+  /* on the first iteration and if external scaling is not used, scale according */
+  /* to the norms of the columns of the initial jacobian. */
+  if (iter == 1) {
     if (!useExternalScaling)
-        diag = diag.cwiseMax(wa2);
-
-    while (true) {
-        /* determine the direction p. */
-        internal::dogleg<Scalar>(R, diag, qtf, delta, wa1);
-
-        /* store the direction p and x + p. calculate the norm of p. */
-        wa1 = -wa1;
-        wa2 = x + wa1;
-        pnorm = diag.cwiseProduct(wa1).stableNorm();
-
-        /* on the first iteration, adjust the initial step bound. */
-        if (iter == 1)
-            delta = (std::min)(delta,pnorm);
-
-        /* evaluate the function at x + p and calculate its norm. */
-        if ( functor(wa2, wa4) < 0)
-            return HybridNonLinearSolverSpace::UserAsked;
-        ++nfev;
-        fnorm1 = wa4.stableNorm();
-
-        /* compute the scaled actual reduction. */
-        actred = -1.;
-        if (fnorm1 < fnorm) /* Computing 2nd power */
-            actred = 1. - numext::abs2(fnorm1 / fnorm);
-
-        /* compute the scaled predicted reduction. */
-        wa3 = R.template triangularView<Upper>()*wa1 + qtf;
-        temp = wa3.stableNorm();
-        prered = 0.;
-        if (temp < fnorm) /* Computing 2nd power */
-            prered = 1. - numext::abs2(temp / fnorm);
-
-        /* compute the ratio of the actual to the predicted reduction. */
-        ratio = 0.;
-        if (prered > 0.)
-            ratio = actred / prered;
-
-        /* update the step bound. */
-        if (ratio < Scalar(.1)) {
-            ncsuc = 0;
-            ++ncfail;
-            delta = Scalar(.5) * delta;
-        } else {
-            ncfail = 0;
-            ++ncsuc;
-            if (ratio >= Scalar(.5) || ncsuc > 1)
-                delta = (std::max)(delta, pnorm / Scalar(.5));
-            if (abs(ratio - 1.) <= Scalar(.1)) {
-                delta = pnorm / Scalar(.5);
-            }
-        }
-
-        /* test for successful iteration. */
-        if (ratio >= Scalar(1e-4)) {
-            /* successful iteration. update x, fvec, and their norms. */
-            x = wa2;
-            wa2 = diag.cwiseProduct(x);
-            fvec = wa4;
-            xnorm = wa2.stableNorm();
-            fnorm = fnorm1;
-            ++iter;
-        }
-
-        /* determine the progress of the iteration. */
-        ++nslow1;
-        if (actred >= Scalar(.001))
-            nslow1 = 0;
-        if (jeval)
-            ++nslow2;
-        if (actred >= Scalar(.1))
-            nslow2 = 0;
-
-        /* test for convergence. */
-        if (delta <= parameters.xtol * xnorm || fnorm == 0.)
-            return HybridNonLinearSolverSpace::RelativeErrorTooSmall;
-
-        /* tests for termination and stringent tolerances. */
-        if (nfev >= parameters.maxfev)
-            return HybridNonLinearSolverSpace::TooManyFunctionEvaluation;
-        if (Scalar(.1) * (std::max)(Scalar(.1) * delta, pnorm) <= NumTraits<Scalar>::epsilon() * xnorm)
-            return HybridNonLinearSolverSpace::TolTooSmall;
-        if (nslow2 == 5)
-            return HybridNonLinearSolverSpace::NotMakingProgressJacobian;
-        if (nslow1 == 10)
-            return HybridNonLinearSolverSpace::NotMakingProgressIterations;
-
-        /* criterion for recalculating jacobian. */
-        if (ncfail == 2)
-            break; // leave inner loop and go for the next outer loop iteration
-
-        /* calculate the rank one modification to the jacobian */
-        /* and update qtf if necessary. */
-        wa1 = diag.cwiseProduct( diag.cwiseProduct(wa1)/pnorm );
-        wa2 = fjac.transpose() * wa4;
-        if (ratio >= Scalar(1e-4))
-            qtf = wa2;
-        wa2 = (wa2-wa3)/pnorm;
-
-        /* compute the qr factorization of the updated jacobian. */
-        internal::r1updt<Scalar>(R, wa1, v_givens, w_givens, wa2, wa3, &sing);
-        internal::r1mpyq<Scalar>(n, n, fjac.data(), v_givens, w_givens);
-        internal::r1mpyq<Scalar>(1, n, qtf.data(), v_givens, w_givens);
-
-        jeval = false;
+      for (j = 0; j < n; ++j) diag[j] = (wa2[j] == 0.) ? 1. : wa2[j];
+
+    /* on the first iteration, calculate the norm of the scaled x */
+    /* and initialize the step bound delta. */
+    xnorm = diag.cwiseProduct(x).stableNorm();
+    delta = parameters.factor * xnorm;
+    if (delta == 0.) delta = parameters.factor;
+  }
+
+  /* compute the qr factorization of the jacobian. */
+  HouseholderQR<JacobianType> qrfac(fjac);  // no pivoting:
+
+  /* copy the triangular factor of the qr factorization into r. */
+  R = qrfac.matrixQR();
+
+  /* accumulate the orthogonal factor in fjac. */
+  fjac = qrfac.householderQ();
+
+  /* form (q transpose)*fvec and store in qtf. */
+  qtf = fjac.transpose() * fvec;
+
+  /* rescale if necessary. */
+  if (!useExternalScaling) diag = diag.cwiseMax(wa2);
+
+  while (true) {
+    /* determine the direction p. */
+    internal::dogleg<Scalar>(R, diag, qtf, delta, wa1);
+
+    /* store the direction p and x + p. calculate the norm of p. */
+    wa1 = -wa1;
+    wa2 = x + wa1;
+    pnorm = diag.cwiseProduct(wa1).stableNorm();
+
+    /* on the first iteration, adjust the initial step bound. */
+    if (iter == 1) delta = (std::min)(delta, pnorm);
+
+    /* evaluate the function at x + p and calculate its norm. */
+    if (functor(wa2, wa4) < 0) return HybridNonLinearSolverSpace::UserAsked;
+    ++nfev;
+    fnorm1 = wa4.stableNorm();
+
+    /* compute the scaled actual reduction. */
+    actred = -1.;
+    if (fnorm1 < fnorm) /* Computing 2nd power */
+      actred = 1. - numext::abs2(fnorm1 / fnorm);
+
+    /* compute the scaled predicted reduction. */
+    wa3 = R.template triangularView<Upper>() * wa1 + qtf;
+    temp = wa3.stableNorm();
+    prered = 0.;
+    if (temp < fnorm) /* Computing 2nd power */
+      prered = 1. - numext::abs2(temp / fnorm);
+
+    /* compute the ratio of the actual to the predicted reduction. */
+    ratio = 0.;
+    if (prered > 0.) ratio = actred / prered;
+
+    /* update the step bound. */
+    if (ratio < Scalar(.1)) {
+      ncsuc = 0;
+      ++ncfail;
+      delta = Scalar(.5) * delta;
+    } else {
+      ncfail = 0;
+      ++ncsuc;
+      if (ratio >= Scalar(.5) || ncsuc > 1) delta = (std::max)(delta, pnorm / Scalar(.5));
+      if (abs(ratio - 1.) <= Scalar(.1)) {
+        delta = pnorm / Scalar(.5);
+      }
     }
-    return HybridNonLinearSolverSpace::Running;
-}
 
-template<typename FunctorType, typename Scalar>
-HybridNonLinearSolverSpace::Status
-HybridNonLinearSolver<FunctorType,Scalar>::solve(FVectorType  &x)
-{
-    HybridNonLinearSolverSpace::Status status = solveInit(x);
-    if (status==HybridNonLinearSolverSpace::ImproperInputParameters)
-        return status;
-    while (status==HybridNonLinearSolverSpace::Running)
-        status = solveOneStep(x);
-    return status;
-}
+    /* test for successful iteration. */
+    if (ratio >= Scalar(1e-4)) {
+      /* successful iteration. update x, fvec, and their norms. */
+      x = wa2;
+      wa2 = diag.cwiseProduct(x);
+      fvec = wa4;
+      xnorm = wa2.stableNorm();
+      fnorm = fnorm1;
+      ++iter;
+    }
 
+    /* determine the progress of the iteration. */
+    ++nslow1;
+    if (actred >= Scalar(.001)) nslow1 = 0;
+    if (jeval) ++nslow2;
+    if (actred >= Scalar(.1)) nslow2 = 0;
+
+    /* test for convergence. */
+    if (delta <= parameters.xtol * xnorm || fnorm == 0.) return HybridNonLinearSolverSpace::RelativeErrorTooSmall;
+
+    /* tests for termination and stringent tolerances. */
+    if (nfev >= parameters.maxfev) return HybridNonLinearSolverSpace::TooManyFunctionEvaluation;
+    if (Scalar(.1) * (std::max)(Scalar(.1) * delta, pnorm) <= NumTraits<Scalar>::epsilon() * xnorm)
+      return HybridNonLinearSolverSpace::TolTooSmall;
+    if (nslow2 == 5) return HybridNonLinearSolverSpace::NotMakingProgressJacobian;
+    if (nslow1 == 10) return HybridNonLinearSolverSpace::NotMakingProgressIterations;
+
+    /* criterion for recalculating jacobian. */
+    if (ncfail == 2) break;  // leave inner loop and go for the next outer loop iteration
+
+    /* calculate the rank one modification to the jacobian */
+    /* and update qtf if necessary. */
+    wa1 = diag.cwiseProduct(diag.cwiseProduct(wa1) / pnorm);
+    wa2 = fjac.transpose() * wa4;
+    if (ratio >= Scalar(1e-4)) qtf = wa2;
+    wa2 = (wa2 - wa3) / pnorm;
+
+    /* compute the qr factorization of the updated jacobian. */
+    internal::r1updt<Scalar>(R, wa1, v_givens, w_givens, wa2, wa3, &sing);
+    internal::r1mpyq<Scalar>(n, n, fjac.data(), v_givens, w_givens);
+    internal::r1mpyq<Scalar>(1, n, qtf.data(), v_givens, w_givens);
+
+    jeval = false;
+  }
+  return HybridNonLinearSolverSpace::Running;
+}
 
+template <typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status HybridNonLinearSolver<FunctorType, Scalar>::solve(FVectorType &x) {
+  HybridNonLinearSolverSpace::Status status = solveInit(x);
+  if (status == HybridNonLinearSolverSpace::ImproperInputParameters) return status;
+  while (status == HybridNonLinearSolverSpace::Running) status = solveOneStep(x);
+  return status;
+}
 
-template<typename FunctorType, typename Scalar>
-HybridNonLinearSolverSpace::Status
-HybridNonLinearSolver<FunctorType,Scalar>::hybrd1(
-        FVectorType  &x,
-        const Scalar tol
-        )
-{
-    n = x.size();
+template <typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status HybridNonLinearSolver<FunctorType, Scalar>::hybrd1(FVectorType &x,
+                                                                                      const Scalar tol) {
+  n = x.size();
 
-    /* check the input parameters for errors. */
-    if (n <= 0 || tol < 0.)
-        return HybridNonLinearSolverSpace::ImproperInputParameters;
+  /* check the input parameters for errors. */
+  if (n <= 0 || tol < 0.) return HybridNonLinearSolverSpace::ImproperInputParameters;
 
-    resetParameters();
-    parameters.maxfev = 200*(n+1);
-    parameters.xtol = tol;
+  resetParameters();
+  parameters.maxfev = 200 * (n + 1);
+  parameters.xtol = tol;
 
-    diag.setConstant(n, 1.);
-    useExternalScaling = true;
-    return solveNumericalDiff(x);
+  diag.setConstant(n, 1.);
+  useExternalScaling = true;
+  return solveNumericalDiff(x);
 }
 
-template<typename FunctorType, typename Scalar>
-HybridNonLinearSolverSpace::Status
-HybridNonLinearSolver<FunctorType,Scalar>::solveNumericalDiffInit(FVectorType  &x)
-{
-    n = x.size();
-
-    if (parameters.nb_of_subdiagonals<0) parameters.nb_of_subdiagonals= n-1;
-    if (parameters.nb_of_superdiagonals<0) parameters.nb_of_superdiagonals= n-1;
-
-    wa1.resize(n); wa2.resize(n); wa3.resize(n); wa4.resize(n);
-    qtf.resize(n);
-    fjac.resize(n, n);
-    fvec.resize(n);
-    if (!useExternalScaling)
-        diag.resize(n);
-    eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
-
-    /* Function Body */
-    nfev = 0;
-    njev = 0;
-
-    /*     check the input parameters for errors. */
-    if (n <= 0 || parameters.xtol < 0. || parameters.maxfev <= 0 || parameters.nb_of_subdiagonals< 0 || parameters.nb_of_superdiagonals< 0 || parameters.factor <= 0. )
-        return HybridNonLinearSolverSpace::ImproperInputParameters;
-    if (useExternalScaling)
-        for (Index j = 0; j < n; ++j)
-            if (diag[j] <= 0.)
-                return HybridNonLinearSolverSpace::ImproperInputParameters;
-
-    /*     evaluate the function at the starting point */
-    /*     and calculate its norm. */
-    nfev = 1;
-    if ( functor(x, fvec) < 0)
-        return HybridNonLinearSolverSpace::UserAsked;
-    fnorm = fvec.stableNorm();
-
-    /*     initialize iteration counter and monitors. */
-    iter = 1;
-    ncsuc = 0;
-    ncfail = 0;
-    nslow1 = 0;
-    nslow2 = 0;
-
-    return HybridNonLinearSolverSpace::Running;
+template <typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status HybridNonLinearSolver<FunctorType, Scalar>::solveNumericalDiffInit(FVectorType &x) {
+  n = x.size();
+
+  if (parameters.nb_of_subdiagonals < 0) parameters.nb_of_subdiagonals = n - 1;
+  if (parameters.nb_of_superdiagonals < 0) parameters.nb_of_superdiagonals = n - 1;
+
+  wa1.resize(n);
+  wa2.resize(n);
+  wa3.resize(n);
+  wa4.resize(n);
+  qtf.resize(n);
+  fjac.resize(n, n);
+  fvec.resize(n);
+  if (!useExternalScaling) diag.resize(n);
+  eigen_assert((!useExternalScaling || diag.size() == n) &&
+               "When useExternalScaling is set, the caller must provide a valid 'diag'");
+
+  /* Function Body */
+  nfev = 0;
+  njev = 0;
+
+  /*     check the input parameters for errors. */
+  if (n <= 0 || parameters.xtol < 0. || parameters.maxfev <= 0 || parameters.nb_of_subdiagonals < 0 ||
+      parameters.nb_of_superdiagonals < 0 || parameters.factor <= 0.)
+    return HybridNonLinearSolverSpace::ImproperInputParameters;
+  if (useExternalScaling)
+    for (Index j = 0; j < n; ++j)
+      if (diag[j] <= 0.) return HybridNonLinearSolverSpace::ImproperInputParameters;
+
+  /*     evaluate the function at the starting point */
+  /*     and calculate its norm. */
+  nfev = 1;
+  if (functor(x, fvec) < 0) return HybridNonLinearSolverSpace::UserAsked;
+  fnorm = fvec.stableNorm();
+
+  /*     initialize iteration counter and monitors. */
+  iter = 1;
+  ncsuc = 0;
+  ncfail = 0;
+  nslow1 = 0;
+  nslow2 = 0;
+
+  return HybridNonLinearSolverSpace::Running;
 }
 
-template<typename FunctorType, typename Scalar>
-HybridNonLinearSolverSpace::Status
-HybridNonLinearSolver<FunctorType,Scalar>::solveNumericalDiffOneStep(FVectorType  &x)
-{
-    using std::sqrt;
-    using std::abs;
-    
-    assert(x.size()==n); // check the caller is not cheating us
-
-    Index j;
-    std::vector<JacobiRotation<Scalar> > v_givens(n), w_givens(n);
-
-    jeval = true;
-    if (parameters.nb_of_subdiagonals<0) parameters.nb_of_subdiagonals= n-1;
-    if (parameters.nb_of_superdiagonals<0) parameters.nb_of_superdiagonals= n-1;
-
-    /* calculate the jacobian matrix. */
-    if (internal::fdjac1(functor, x, fvec, fjac, parameters.nb_of_subdiagonals, parameters.nb_of_superdiagonals, parameters.epsfcn) <0)
-        return HybridNonLinearSolverSpace::UserAsked;
-    nfev += (std::min)(parameters.nb_of_subdiagonals+parameters.nb_of_superdiagonals+ 1, n);
-
-    wa2 = fjac.colwise().blueNorm();
-
-    /* on the first iteration and if external scaling is not used, scale according */
-    /* to the norms of the columns of the initial jacobian. */
-    if (iter == 1) {
-        if (!useExternalScaling)
-            for (j = 0; j < n; ++j)
-                diag[j] = (wa2[j]==0.) ? 1. : wa2[j];
-
-        /* on the first iteration, calculate the norm of the scaled x */
-        /* and initialize the step bound delta. */
-        xnorm = diag.cwiseProduct(x).stableNorm();
-        delta = parameters.factor * xnorm;
-        if (delta == 0.)
-            delta = parameters.factor;
-    }
+template <typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status HybridNonLinearSolver<FunctorType, Scalar>::solveNumericalDiffOneStep(
+    FVectorType &x) {
+  using std::abs;
+  using std::sqrt;
+
+  eigen_assert(x.size() == n);  // check the caller is not cheating us
 
-    /* compute the qr factorization of the jacobian. */
-    HouseholderQR<JacobianType> qrfac(fjac); // no pivoting:
+  Index j;
+  std::vector<JacobiRotation<Scalar> > v_givens(n), w_givens(n);
 
-    /* copy the triangular factor of the qr factorization into r. */
-    R = qrfac.matrixQR();
+  jeval = true;
+  if (parameters.nb_of_subdiagonals < 0) parameters.nb_of_subdiagonals = n - 1;
+  if (parameters.nb_of_superdiagonals < 0) parameters.nb_of_superdiagonals = n - 1;
 
-    /* accumulate the orthogonal factor in fjac. */
-    fjac = qrfac.householderQ();
+  /* calculate the jacobian matrix. */
+  if (internal::fdjac1(functor, x, fvec, fjac, parameters.nb_of_subdiagonals, parameters.nb_of_superdiagonals,
+                       parameters.epsfcn) < 0)
+    return HybridNonLinearSolverSpace::UserAsked;
+  nfev += (std::min)(parameters.nb_of_subdiagonals + parameters.nb_of_superdiagonals + 1, n);
 
-    /* form (q transpose)*fvec and store in qtf. */
-    qtf = fjac.transpose() * fvec;
+  wa2 = fjac.colwise().blueNorm();
 
-    /* rescale if necessary. */
+  /* on the first iteration and if external scaling is not used, scale according */
+  /* to the norms of the columns of the initial jacobian. */
+  if (iter == 1) {
     if (!useExternalScaling)
-        diag = diag.cwiseMax(wa2);
-
-    while (true) {
-        /* determine the direction p. */
-        internal::dogleg<Scalar>(R, diag, qtf, delta, wa1);
-
-        /* store the direction p and x + p. calculate the norm of p. */
-        wa1 = -wa1;
-        wa2 = x + wa1;
-        pnorm = diag.cwiseProduct(wa1).stableNorm();
-
-        /* on the first iteration, adjust the initial step bound. */
-        if (iter == 1)
-            delta = (std::min)(delta,pnorm);
-
-        /* evaluate the function at x + p and calculate its norm. */
-        if ( functor(wa2, wa4) < 0)
-            return HybridNonLinearSolverSpace::UserAsked;
-        ++nfev;
-        fnorm1 = wa4.stableNorm();
-
-        /* compute the scaled actual reduction. */
-        actred = -1.;
-        if (fnorm1 < fnorm) /* Computing 2nd power */
-            actred = 1. - numext::abs2(fnorm1 / fnorm);
-
-        /* compute the scaled predicted reduction. */
-        wa3 = R.template triangularView<Upper>()*wa1 + qtf;
-        temp = wa3.stableNorm();
-        prered = 0.;
-        if (temp < fnorm) /* Computing 2nd power */
-            prered = 1. - numext::abs2(temp / fnorm);
-
-        /* compute the ratio of the actual to the predicted reduction. */
-        ratio = 0.;
-        if (prered > 0.)
-            ratio = actred / prered;
-
-        /* update the step bound. */
-        if (ratio < Scalar(.1)) {
-            ncsuc = 0;
-            ++ncfail;
-            delta = Scalar(.5) * delta;
-        } else {
-            ncfail = 0;
-            ++ncsuc;
-            if (ratio >= Scalar(.5) || ncsuc > 1)
-                delta = (std::max)(delta, pnorm / Scalar(.5));
-            if (abs(ratio - 1.) <= Scalar(.1)) {
-                delta = pnorm / Scalar(.5);
-            }
-        }
-
-        /* test for successful iteration. */
-        if (ratio >= Scalar(1e-4)) {
-            /* successful iteration. update x, fvec, and their norms. */
-            x = wa2;
-            wa2 = diag.cwiseProduct(x);
-            fvec = wa4;
-            xnorm = wa2.stableNorm();
-            fnorm = fnorm1;
-            ++iter;
-        }
-
-        /* determine the progress of the iteration. */
-        ++nslow1;
-        if (actred >= Scalar(.001))
-            nslow1 = 0;
-        if (jeval)
-            ++nslow2;
-        if (actred >= Scalar(.1))
-            nslow2 = 0;
-
-        /* test for convergence. */
-        if (delta <= parameters.xtol * xnorm || fnorm == 0.)
-            return HybridNonLinearSolverSpace::RelativeErrorTooSmall;
-
-        /* tests for termination and stringent tolerances. */
-        if (nfev >= parameters.maxfev)
-            return HybridNonLinearSolverSpace::TooManyFunctionEvaluation;
-        if (Scalar(.1) * (std::max)(Scalar(.1) * delta, pnorm) <= NumTraits<Scalar>::epsilon() * xnorm)
-            return HybridNonLinearSolverSpace::TolTooSmall;
-        if (nslow2 == 5)
-            return HybridNonLinearSolverSpace::NotMakingProgressJacobian;
-        if (nslow1 == 10)
-            return HybridNonLinearSolverSpace::NotMakingProgressIterations;
-
-        /* criterion for recalculating jacobian. */
-        if (ncfail == 2)
-            break; // leave inner loop and go for the next outer loop iteration
-
-        /* calculate the rank one modification to the jacobian */
-        /* and update qtf if necessary. */
-        wa1 = diag.cwiseProduct( diag.cwiseProduct(wa1)/pnorm );
-        wa2 = fjac.transpose() * wa4;
-        if (ratio >= Scalar(1e-4))
-            qtf = wa2;
-        wa2 = (wa2-wa3)/pnorm;
-
-        /* compute the qr factorization of the updated jacobian. */
-        internal::r1updt<Scalar>(R, wa1, v_givens, w_givens, wa2, wa3, &sing);
-        internal::r1mpyq<Scalar>(n, n, fjac.data(), v_givens, w_givens);
-        internal::r1mpyq<Scalar>(1, n, qtf.data(), v_givens, w_givens);
-
-        jeval = false;
+      for (j = 0; j < n; ++j) diag[j] = (wa2[j] == 0.) ? 1. : wa2[j];
+
+    /* on the first iteration, calculate the norm of the scaled x */
+    /* and initialize the step bound delta. */
+    xnorm = diag.cwiseProduct(x).stableNorm();
+    delta = parameters.factor * xnorm;
+    if (delta == 0.) delta = parameters.factor;
+  }
+
+  /* compute the qr factorization of the jacobian. */
+  HouseholderQR<JacobianType> qrfac(fjac);  // no pivoting:
+
+  /* copy the triangular factor of the qr factorization into r. */
+  R = qrfac.matrixQR();
+
+  /* accumulate the orthogonal factor in fjac. */
+  fjac = qrfac.householderQ();
+
+  /* form (q transpose)*fvec and store in qtf. */
+  qtf = fjac.transpose() * fvec;
+
+  /* rescale if necessary. */
+  if (!useExternalScaling) diag = diag.cwiseMax(wa2);
+
+  while (true) {
+    /* determine the direction p. */
+    internal::dogleg<Scalar>(R, diag, qtf, delta, wa1);
+
+    /* store the direction p and x + p. calculate the norm of p. */
+    wa1 = -wa1;
+    wa2 = x + wa1;
+    pnorm = diag.cwiseProduct(wa1).stableNorm();
+
+    /* on the first iteration, adjust the initial step bound. */
+    if (iter == 1) delta = (std::min)(delta, pnorm);
+
+    /* evaluate the function at x + p and calculate its norm. */
+    if (functor(wa2, wa4) < 0) return HybridNonLinearSolverSpace::UserAsked;
+    ++nfev;
+    fnorm1 = wa4.stableNorm();
+
+    /* compute the scaled actual reduction. */
+    actred = -1.;
+    if (fnorm1 < fnorm) /* Computing 2nd power */
+      actred = 1. - numext::abs2(fnorm1 / fnorm);
+
+    /* compute the scaled predicted reduction. */
+    wa3 = R.template triangularView<Upper>() * wa1 + qtf;
+    temp = wa3.stableNorm();
+    prered = 0.;
+    if (temp < fnorm) /* Computing 2nd power */
+      prered = 1. - numext::abs2(temp / fnorm);
+
+    /* compute the ratio of the actual to the predicted reduction. */
+    ratio = 0.;
+    if (prered > 0.) ratio = actred / prered;
+
+    /* update the step bound. */
+    if (ratio < Scalar(.1)) {
+      ncsuc = 0;
+      ++ncfail;
+      delta = Scalar(.5) * delta;
+    } else {
+      ncfail = 0;
+      ++ncsuc;
+      if (ratio >= Scalar(.5) || ncsuc > 1) delta = (std::max)(delta, pnorm / Scalar(.5));
+      if (abs(ratio - 1.) <= Scalar(.1)) {
+        delta = pnorm / Scalar(.5);
+      }
     }
-    return HybridNonLinearSolverSpace::Running;
+
+    /* test for successful iteration. */
+    if (ratio >= Scalar(1e-4)) {
+      /* successful iteration. update x, fvec, and their norms. */
+      x = wa2;
+      wa2 = diag.cwiseProduct(x);
+      fvec = wa4;
+      xnorm = wa2.stableNorm();
+      fnorm = fnorm1;
+      ++iter;
+    }
+
+    /* determine the progress of the iteration. */
+    ++nslow1;
+    if (actred >= Scalar(.001)) nslow1 = 0;
+    if (jeval) ++nslow2;
+    if (actred >= Scalar(.1)) nslow2 = 0;
+
+    /* test for convergence. */
+    if (delta <= parameters.xtol * xnorm || fnorm == 0.) return HybridNonLinearSolverSpace::RelativeErrorTooSmall;
+
+    /* tests for termination and stringent tolerances. */
+    if (nfev >= parameters.maxfev) return HybridNonLinearSolverSpace::TooManyFunctionEvaluation;
+    if (Scalar(.1) * (std::max)(Scalar(.1) * delta, pnorm) <= NumTraits<Scalar>::epsilon() * xnorm)
+      return HybridNonLinearSolverSpace::TolTooSmall;
+    if (nslow2 == 5) return HybridNonLinearSolverSpace::NotMakingProgressJacobian;
+    if (nslow1 == 10) return HybridNonLinearSolverSpace::NotMakingProgressIterations;
+
+    /* criterion for recalculating jacobian. */
+    if (ncfail == 2) break;  // leave inner loop and go for the next outer loop iteration
+
+    /* calculate the rank one modification to the jacobian */
+    /* and update qtf if necessary. */
+    wa1 = diag.cwiseProduct(diag.cwiseProduct(wa1) / pnorm);
+    wa2 = fjac.transpose() * wa4;
+    if (ratio >= Scalar(1e-4)) qtf = wa2;
+    wa2 = (wa2 - wa3) / pnorm;
+
+    /* compute the qr factorization of the updated jacobian. */
+    internal::r1updt<Scalar>(R, wa1, v_givens, w_givens, wa2, wa3, &sing);
+    internal::r1mpyq<Scalar>(n, n, fjac.data(), v_givens, w_givens);
+    internal::r1mpyq<Scalar>(1, n, qtf.data(), v_givens, w_givens);
+
+    jeval = false;
+  }
+  return HybridNonLinearSolverSpace::Running;
 }
 
-template<typename FunctorType, typename Scalar>
-HybridNonLinearSolverSpace::Status
-HybridNonLinearSolver<FunctorType,Scalar>::solveNumericalDiff(FVectorType  &x)
-{
-    HybridNonLinearSolverSpace::Status status = solveNumericalDiffInit(x);
-    if (status==HybridNonLinearSolverSpace::ImproperInputParameters)
-        return status;
-    while (status==HybridNonLinearSolverSpace::Running)
-        status = solveNumericalDiffOneStep(x);
-    return status;
+template <typename FunctorType, typename Scalar>
+HybridNonLinearSolverSpace::Status HybridNonLinearSolver<FunctorType, Scalar>::solveNumericalDiff(FVectorType &x) {
+  HybridNonLinearSolverSpace::Status status = solveNumericalDiffInit(x);
+  if (status == HybridNonLinearSolverSpace::ImproperInputParameters) return status;
+  while (status == HybridNonLinearSolverSpace::Running) status = solveNumericalDiffOneStep(x);
+  return status;
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_HYBRIDNONLINEARSOLVER_H
+#endif  // EIGEN_HYBRIDNONLINEARSOLVER_H
 
-//vim: ai ts=4 sts=4 et sw=4
+// vim: ai ts=4 sts=4 et sw=4
diff --git a/inst/include/unsupported/Eigen/src/NonLinearOptimization/InternalHeaderCheck.h b/inst/include/unsupported/Eigen/src/NonLinearOptimization/InternalHeaderCheck.h
new file mode 100644
index 00000000..d06970f5
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/NonLinearOptimization/InternalHeaderCheck.h
@@ -0,0 +1,4 @@
+#ifndef EIGEN_NONLINEAROPTIMIZATION_MODULE_H
+#error \
+    "Please include unsupported/Eigen/NonLinearOptimization instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h b/inst/include/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
index bfeb26fc..19ec8ea3 100644
--- a/inst/include/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
+++ b/inst/include/unsupported/Eigen/src/NonLinearOptimization/LevenbergMarquardt.h
@@ -13,638 +13,575 @@
 #ifndef EIGEN_LEVENBERGMARQUARDT__H
 #define EIGEN_LEVENBERGMARQUARDT__H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace LevenbergMarquardtSpace {
-    enum Status {
-        NotStarted = -2,
-        Running = -1,
-        ImproperInputParameters = 0,
-        RelativeReductionTooSmall = 1,
-        RelativeErrorTooSmall = 2,
-        RelativeErrorAndReductionTooSmall = 3,
-        CosinusTooSmall = 4,
-        TooManyFunctionEvaluation = 5,
-        FtolTooSmall = 6,
-        XtolTooSmall = 7,
-        GtolTooSmall = 8,
-        UserAsked = 9
-    };
+enum Status {
+  NotStarted = -2,
+  Running = -1,
+  ImproperInputParameters = 0,
+  RelativeReductionTooSmall = 1,
+  RelativeErrorTooSmall = 2,
+  RelativeErrorAndReductionTooSmall = 3,
+  CosinusTooSmall = 4,
+  TooManyFunctionEvaluation = 5,
+  FtolTooSmall = 6,
+  XtolTooSmall = 7,
+  GtolTooSmall = 8,
+  UserAsked = 9
+};
 }
 
-
-
 /**
-  * \ingroup NonLinearOptimization_Module
-  * \brief Performs non linear optimization over a non-linear function,
-  * using a variant of the Levenberg Marquardt algorithm.
-  *
-  * Check wikipedia for more information.
-  * http://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm
-  */
-template<typename FunctorType, typename Scalar=double>
-class LevenbergMarquardt
-{
-public:
-    LevenbergMarquardt(FunctorType &_functor)
-        : functor(_functor) { nfev = njev = iter = 0;  fnorm = gnorm = 0.; useExternalScaling=false; }
-
-    typedef DenseIndex Index;
-
-    struct Parameters {
-        Parameters()
-            : factor(Scalar(100.))
-            , maxfev(400)
-            , ftol(std::sqrt(NumTraits<Scalar>::epsilon()))
-            , xtol(std::sqrt(NumTraits<Scalar>::epsilon()))
-            , gtol(Scalar(0.))
-            , epsfcn(Scalar(0.)) {}
-        Scalar factor;
-        Index maxfev;   // maximum number of function evaluation
-        Scalar ftol;
-        Scalar xtol;
-        Scalar gtol;
-        Scalar epsfcn;
-    };
-
-    typedef Matrix< Scalar, Dynamic, 1 > FVectorType;
-    typedef Matrix< Scalar, Dynamic, Dynamic > JacobianType;
-
-    LevenbergMarquardtSpace::Status lmder1(
-            FVectorType &x,
-            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
-            );
-
-    LevenbergMarquardtSpace::Status minimize(FVectorType &x);
-    LevenbergMarquardtSpace::Status minimizeInit(FVectorType &x);
-    LevenbergMarquardtSpace::Status minimizeOneStep(FVectorType &x);
-
-    static LevenbergMarquardtSpace::Status lmdif1(
-            FunctorType &functor,
-            FVectorType &x,
-            Index *nfev,
-            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
-            );
-
-    LevenbergMarquardtSpace::Status lmstr1(
-            FVectorType  &x,
-            const Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())
-            );
-
-    LevenbergMarquardtSpace::Status minimizeOptimumStorage(FVectorType  &x);
-    LevenbergMarquardtSpace::Status minimizeOptimumStorageInit(FVectorType  &x);
-    LevenbergMarquardtSpace::Status minimizeOptimumStorageOneStep(FVectorType  &x);
-
-    void resetParameters(void) { parameters = Parameters(); }
-
-    Parameters parameters;
-    FVectorType  fvec, qtf, diag;
-    JacobianType fjac;
-    PermutationMatrix<Dynamic,Dynamic> permutation;
-    Index nfev;
-    Index njev;
-    Index iter;
-    Scalar fnorm, gnorm;
-    bool useExternalScaling; 
-
-    Scalar lm_param(void) { return par; }
-private:
-    FunctorType &functor;
-    Index n;
-    Index m;
-    FVectorType wa1, wa2, wa3, wa4;
-
-    Scalar par, sum;
-    Scalar temp, temp1, temp2;
-    Scalar delta;
-    Scalar ratio;
-    Scalar pnorm, xnorm, fnorm1, actred, dirder, prered;
-
-    LevenbergMarquardt& operator=(const LevenbergMarquardt&);
+ * \ingroup NonLinearOptimization_Module
+ * \brief Performs non linear optimization over a non-linear function,
+ * using a variant of the Levenberg Marquardt algorithm.
+ *
+ * Check wikipedia for more information.
+ * http://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm
+ */
+template <typename FunctorType, typename Scalar = double>
+class LevenbergMarquardt {
+  static Scalar sqrt_epsilon() {
+    using std::sqrt;
+    return sqrt(NumTraits<Scalar>::epsilon());
+  }
+
+ public:
+  LevenbergMarquardt(FunctorType &_functor) : functor(_functor) {
+    nfev = njev = iter = 0;
+    fnorm = gnorm = 0.;
+    useExternalScaling = false;
+  }
+
+  typedef DenseIndex Index;
+
+  struct Parameters {
+    Parameters()
+        : factor(Scalar(100.)),
+          maxfev(400),
+          ftol(sqrt_epsilon()),
+          xtol(sqrt_epsilon()),
+          gtol(Scalar(0.)),
+          epsfcn(Scalar(0.)) {}
+    Scalar factor;
+    Index maxfev;  // maximum number of function evaluation
+    Scalar ftol;
+    Scalar xtol;
+    Scalar gtol;
+    Scalar epsfcn;
+  };
+
+  typedef Matrix<Scalar, Dynamic, 1> FVectorType;
+  typedef Matrix<Scalar, Dynamic, Dynamic> JacobianType;
+
+  LevenbergMarquardtSpace::Status lmder1(FVectorType &x, const Scalar tol = sqrt_epsilon());
+
+  LevenbergMarquardtSpace::Status minimize(FVectorType &x);
+  LevenbergMarquardtSpace::Status minimizeInit(FVectorType &x);
+  LevenbergMarquardtSpace::Status minimizeOneStep(FVectorType &x);
+
+  static LevenbergMarquardtSpace::Status lmdif1(FunctorType &functor, FVectorType &x, Index *nfev,
+                                                const Scalar tol = sqrt_epsilon());
+
+  LevenbergMarquardtSpace::Status lmstr1(FVectorType &x, const Scalar tol = sqrt_epsilon());
+
+  LevenbergMarquardtSpace::Status minimizeOptimumStorage(FVectorType &x);
+  LevenbergMarquardtSpace::Status minimizeOptimumStorageInit(FVectorType &x);
+  LevenbergMarquardtSpace::Status minimizeOptimumStorageOneStep(FVectorType &x);
+
+  void resetParameters(void) { parameters = Parameters(); }
+
+  Parameters parameters;
+  FVectorType fvec, qtf, diag;
+  JacobianType fjac;
+  PermutationMatrix<Dynamic, Dynamic> permutation;
+  Index nfev;
+  Index njev;
+  Index iter;
+  Scalar fnorm, gnorm;
+  bool useExternalScaling;
+
+  Scalar lm_param(void) { return par; }
+
+ private:
+  FunctorType &functor;
+  Index n;
+  Index m;
+  FVectorType wa1, wa2, wa3, wa4;
+
+  Scalar par, sum;
+  Scalar temp, temp1, temp2;
+  Scalar delta;
+  Scalar ratio;
+  Scalar pnorm, xnorm, fnorm1, actred, dirder, prered;
+
+  LevenbergMarquardt &operator=(const LevenbergMarquardt &);
 };
 
-template<typename FunctorType, typename Scalar>
-LevenbergMarquardtSpace::Status
-LevenbergMarquardt<FunctorType,Scalar>::lmder1(
-        FVectorType  &x,
-        const Scalar tol
-        )
-{
-    n = x.size();
-    m = functor.values();
-
-    /* check the input parameters for errors. */
-    if (n <= 0 || m < n || tol < 0.)
-        return LevenbergMarquardtSpace::ImproperInputParameters;
-
-    resetParameters();
-    parameters.ftol = tol;
-    parameters.xtol = tol;
-    parameters.maxfev = 100*(n+1);
-
-    return minimize(x);
-}
+template <typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status LevenbergMarquardt<FunctorType, Scalar>::lmder1(FVectorType &x, const Scalar tol) {
+  n = x.size();
+  m = functor.values();
+
+  /* check the input parameters for errors. */
+  if (n <= 0 || m < n || tol < 0.) return LevenbergMarquardtSpace::ImproperInputParameters;
 
+  resetParameters();
+  parameters.ftol = tol;
+  parameters.xtol = tol;
+  parameters.maxfev = 100 * (n + 1);
 
-template<typename FunctorType, typename Scalar>
-LevenbergMarquardtSpace::Status
-LevenbergMarquardt<FunctorType,Scalar>::minimize(FVectorType  &x)
-{
-    LevenbergMarquardtSpace::Status status = minimizeInit(x);
-    if (status==LevenbergMarquardtSpace::ImproperInputParameters)
-        return status;
-    do {
-        status = minimizeOneStep(x);
-    } while (status==LevenbergMarquardtSpace::Running);
-    return status;
+  return minimize(x);
 }
 
-template<typename FunctorType, typename Scalar>
-LevenbergMarquardtSpace::Status
-LevenbergMarquardt<FunctorType,Scalar>::minimizeInit(FVectorType  &x)
-{
-    n = x.size();
-    m = functor.values();
-
-    wa1.resize(n); wa2.resize(n); wa3.resize(n);
-    wa4.resize(m);
-    fvec.resize(m);
-    fjac.resize(m, n);
-    if (!useExternalScaling)
-        diag.resize(n);
-    eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
-    qtf.resize(n);
-
-    /* Function Body */
-    nfev = 0;
-    njev = 0;
-
-    /*     check the input parameters for errors. */
-    if (n <= 0 || m < n || parameters.ftol < 0. || parameters.xtol < 0. || parameters.gtol < 0. || parameters.maxfev <= 0 || parameters.factor <= 0.)
-        return LevenbergMarquardtSpace::ImproperInputParameters;
-
-    if (useExternalScaling)
-        for (Index j = 0; j < n; ++j)
-            if (diag[j] <= 0.)
-                return LevenbergMarquardtSpace::ImproperInputParameters;
-
-    /*     evaluate the function at the starting point */
-    /*     and calculate its norm. */
-    nfev = 1;
-    if ( functor(x, fvec) < 0)
-        return LevenbergMarquardtSpace::UserAsked;
-    fnorm = fvec.stableNorm();
-
-    /*     initialize levenberg-marquardt parameter and iteration counter. */
-    par = 0.;
-    iter = 1;
-
-    return LevenbergMarquardtSpace::NotStarted;
+template <typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status LevenbergMarquardt<FunctorType, Scalar>::minimize(FVectorType &x) {
+  LevenbergMarquardtSpace::Status status = minimizeInit(x);
+  if (status == LevenbergMarquardtSpace::ImproperInputParameters) return status;
+  do {
+    status = minimizeOneStep(x);
+  } while (status == LevenbergMarquardtSpace::Running);
+  return status;
 }
 
-template<typename FunctorType, typename Scalar>
-LevenbergMarquardtSpace::Status
-LevenbergMarquardt<FunctorType,Scalar>::minimizeOneStep(FVectorType  &x)
-{
-    using std::abs;
-    using std::sqrt;
-    
-    eigen_assert(x.size()==n); // check the caller is not cheating us
-
-    /* calculate the jacobian matrix. */
-    Index df_ret = functor.df(x, fjac);
-    if (df_ret<0)
-        return LevenbergMarquardtSpace::UserAsked;
-    if (df_ret>0)
-        // numerical diff, we evaluated the function df_ret times
-        nfev += df_ret;
-    else njev++;
-
-    /* compute the qr factorization of the jacobian. */
-    wa2 = fjac.colwise().blueNorm();
-    ColPivHouseholderQR<JacobianType> qrfac(fjac);
-    fjac = qrfac.matrixQR();
-    permutation = qrfac.colsPermutation();
+template <typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status LevenbergMarquardt<FunctorType, Scalar>::minimizeInit(FVectorType &x) {
+  n = x.size();
+  m = functor.values();
+
+  wa1.resize(n);
+  wa2.resize(n);
+  wa3.resize(n);
+  wa4.resize(m);
+  fvec.resize(m);
+  fjac.resize(m, n);
+  if (!useExternalScaling) diag.resize(n);
+  eigen_assert((!useExternalScaling || diag.size() == n) &&
+               "When useExternalScaling is set, the caller must provide a valid 'diag'");
+  qtf.resize(n);
+
+  /* Function Body */
+  nfev = 0;
+  njev = 0;
+
+  /*     check the input parameters for errors. */
+  if (n <= 0 || m < n || parameters.ftol < 0. || parameters.xtol < 0. || parameters.gtol < 0. ||
+      parameters.maxfev <= 0 || parameters.factor <= 0.)
+    return LevenbergMarquardtSpace::ImproperInputParameters;
+
+  if (useExternalScaling)
+    for (Index j = 0; j < n; ++j)
+      if (diag[j] <= 0.) return LevenbergMarquardtSpace::ImproperInputParameters;
+
+  /*     evaluate the function at the starting point */
+  /*     and calculate its norm. */
+  nfev = 1;
+  if (functor(x, fvec) < 0) return LevenbergMarquardtSpace::UserAsked;
+  fnorm = fvec.stableNorm();
+
+  /*     initialize levenberg-marquardt parameter and iteration counter. */
+  par = 0.;
+  iter = 1;
+
+  return LevenbergMarquardtSpace::NotStarted;
+}
 
-    /* on the first iteration and if external scaling is not used, scale according */
-    /* to the norms of the columns of the initial jacobian. */
-    if (iter == 1) {
-        if (!useExternalScaling)
-            for (Index j = 0; j < n; ++j)
-                diag[j] = (wa2[j]==0.)? 1. : wa2[j];
-
-        /* on the first iteration, calculate the norm of the scaled x */
-        /* and initialize the step bound delta. */
-        xnorm = diag.cwiseProduct(x).stableNorm();
-        delta = parameters.factor * xnorm;
-        if (delta == 0.)
-            delta = parameters.factor;
+template <typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status LevenbergMarquardt<FunctorType, Scalar>::minimizeOneStep(FVectorType &x) {
+  using std::abs;
+  using std::sqrt;
+
+  eigen_assert(x.size() == n);  // check the caller is not cheating us
+
+  /* calculate the jacobian matrix. */
+  Index df_ret = functor.df(x, fjac);
+  if (df_ret < 0) return LevenbergMarquardtSpace::UserAsked;
+  if (df_ret > 0)
+    // numerical diff, we evaluated the function df_ret times
+    nfev += df_ret;
+  else
+    njev++;
+
+  /* compute the qr factorization of the jacobian. */
+  wa2 = fjac.colwise().blueNorm();
+  ColPivHouseholderQR<JacobianType> qrfac(fjac);
+  fjac = qrfac.matrixQR();
+  permutation = qrfac.colsPermutation();
+
+  /* on the first iteration and if external scaling is not used, scale according */
+  /* to the norms of the columns of the initial jacobian. */
+  if (iter == 1) {
+    if (!useExternalScaling)
+      for (Index j = 0; j < n; ++j) diag[j] = (wa2[j] == 0.) ? 1. : wa2[j];
+
+    /* on the first iteration, calculate the norm of the scaled x */
+    /* and initialize the step bound delta. */
+    xnorm = diag.cwiseProduct(x).stableNorm();
+    delta = parameters.factor * xnorm;
+    if (delta == 0.) delta = parameters.factor;
+  }
+
+  /* form (q transpose)*fvec and store the first n components in */
+  /* qtf. */
+  wa4 = fvec;
+  wa4.applyOnTheLeft(qrfac.householderQ().adjoint());
+  qtf = wa4.head(n);
+
+  /* compute the norm of the scaled gradient. */
+  gnorm = 0.;
+  if (fnorm != 0.)
+    for (Index j = 0; j < n; ++j)
+      if (wa2[permutation.indices()[j]] != 0.)
+        gnorm = (std::max)(gnorm,
+                           abs(fjac.col(j).head(j + 1).dot(qtf.head(j + 1) / fnorm) / wa2[permutation.indices()[j]]));
+
+  /* test for convergence of the gradient norm. */
+  if (gnorm <= parameters.gtol) return LevenbergMarquardtSpace::CosinusTooSmall;
+
+  /* rescale if necessary. */
+  if (!useExternalScaling) diag = diag.cwiseMax(wa2);
+
+  do {
+    /* determine the levenberg-marquardt parameter. */
+    internal::lmpar2<Scalar>(qrfac, diag, qtf, delta, par, wa1);
+
+    /* store the direction p and x + p. calculate the norm of p. */
+    wa1 = -wa1;
+    wa2 = x + wa1;
+    pnorm = diag.cwiseProduct(wa1).stableNorm();
+
+    /* on the first iteration, adjust the initial step bound. */
+    if (iter == 1) delta = (std::min)(delta, pnorm);
+
+    /* evaluate the function at x + p and calculate its norm. */
+    if (functor(wa2, wa4) < 0) return LevenbergMarquardtSpace::UserAsked;
+    ++nfev;
+    fnorm1 = wa4.stableNorm();
+
+    /* compute the scaled actual reduction. */
+    actred = -1.;
+    if (Scalar(.1) * fnorm1 < fnorm) actred = 1. - numext::abs2(fnorm1 / fnorm);
+
+    /* compute the scaled predicted reduction and */
+    /* the scaled directional derivative. */
+    wa3.noalias() = fjac.template triangularView<Upper>() * (qrfac.colsPermutation().inverse() * wa1);
+    temp1 = numext::abs2(wa3.stableNorm() / fnorm);
+    temp2 = numext::abs2(sqrt(par) * pnorm / fnorm);
+    prered = temp1 + temp2 / Scalar(.5);
+    dirder = -(temp1 + temp2);
+
+    /* compute the ratio of the actual to the predicted */
+    /* reduction. */
+    ratio = 0.;
+    if (prered != 0.) ratio = actred / prered;
+
+    /* update the step bound. */
+    if (ratio <= Scalar(.25)) {
+      if (actred >= 0.) temp = Scalar(.5);
+      if (actred < 0.) temp = Scalar(.5) * dirder / (dirder + Scalar(.5) * actred);
+      if (Scalar(.1) * fnorm1 >= fnorm || temp < Scalar(.1)) temp = Scalar(.1);
+      /* Computing MIN */
+      delta = temp * (std::min)(delta, pnorm / Scalar(.1));
+      par /= temp;
+    } else if (!(par != 0. && ratio < Scalar(.75))) {
+      delta = pnorm / Scalar(.5);
+      par = Scalar(.5) * par;
     }
 
-    /* form (q transpose)*fvec and store the first n components in */
-    /* qtf. */
-    wa4 = fvec;
-    wa4.applyOnTheLeft(qrfac.householderQ().adjoint());
-    qtf = wa4.head(n);
+    /* test for successful iteration. */
+    if (ratio >= Scalar(1e-4)) {
+      /* successful iteration. update x, fvec, and their norms. */
+      x = wa2;
+      wa2 = diag.cwiseProduct(x);
+      fvec = wa4;
+      xnorm = wa2.stableNorm();
+      fnorm = fnorm1;
+      ++iter;
+    }
 
-    /* compute the norm of the scaled gradient. */
-    gnorm = 0.;
-    if (fnorm != 0.)
-        for (Index j = 0; j < n; ++j)
-            if (wa2[permutation.indices()[j]] != 0.)
-                gnorm = (std::max)(gnorm, abs( fjac.col(j).head(j+1).dot(qtf.head(j+1)/fnorm) / wa2[permutation.indices()[j]]));
+    /* tests for convergence. */
+    if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1. &&
+        delta <= parameters.xtol * xnorm)
+      return LevenbergMarquardtSpace::RelativeErrorAndReductionTooSmall;
+    if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1.)
+      return LevenbergMarquardtSpace::RelativeReductionTooSmall;
+    if (delta <= parameters.xtol * xnorm) return LevenbergMarquardtSpace::RelativeErrorTooSmall;
+
+    /* tests for termination and stringent tolerances. */
+    if (nfev >= parameters.maxfev) return LevenbergMarquardtSpace::TooManyFunctionEvaluation;
+    if (abs(actred) <= NumTraits<Scalar>::epsilon() && prered <= NumTraits<Scalar>::epsilon() &&
+        Scalar(.5) * ratio <= 1.)
+      return LevenbergMarquardtSpace::FtolTooSmall;
+    if (delta <= NumTraits<Scalar>::epsilon() * xnorm) return LevenbergMarquardtSpace::XtolTooSmall;
+    if (gnorm <= NumTraits<Scalar>::epsilon()) return LevenbergMarquardtSpace::GtolTooSmall;
+
+  } while (ratio < Scalar(1e-4));
+
+  return LevenbergMarquardtSpace::Running;
+}
 
-    /* test for convergence of the gradient norm. */
-    if (gnorm <= parameters.gtol)
-        return LevenbergMarquardtSpace::CosinusTooSmall;
+template <typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status LevenbergMarquardt<FunctorType, Scalar>::lmstr1(FVectorType &x, const Scalar tol) {
+  n = x.size();
+  m = functor.values();
 
-    /* rescale if necessary. */
-    if (!useExternalScaling)
-        diag = diag.cwiseMax(wa2);
-
-    do {
-
-        /* determine the levenberg-marquardt parameter. */
-        internal::lmpar2<Scalar>(qrfac, diag, qtf, delta, par, wa1);
-
-        /* store the direction p and x + p. calculate the norm of p. */
-        wa1 = -wa1;
-        wa2 = x + wa1;
-        pnorm = diag.cwiseProduct(wa1).stableNorm();
-
-        /* on the first iteration, adjust the initial step bound. */
-        if (iter == 1)
-            delta = (std::min)(delta,pnorm);
-
-        /* evaluate the function at x + p and calculate its norm. */
-        if ( functor(wa2, wa4) < 0)
-            return LevenbergMarquardtSpace::UserAsked;
-        ++nfev;
-        fnorm1 = wa4.stableNorm();
-
-        /* compute the scaled actual reduction. */
-        actred = -1.;
-        if (Scalar(.1) * fnorm1 < fnorm)
-            actred = 1. - numext::abs2(fnorm1 / fnorm);
-
-        /* compute the scaled predicted reduction and */
-        /* the scaled directional derivative. */
-        wa3 = fjac.template triangularView<Upper>() * (qrfac.colsPermutation().inverse() *wa1);
-        temp1 = numext::abs2(wa3.stableNorm() / fnorm);
-        temp2 = numext::abs2(sqrt(par) * pnorm / fnorm);
-        prered = temp1 + temp2 / Scalar(.5);
-        dirder = -(temp1 + temp2);
-
-        /* compute the ratio of the actual to the predicted */
-        /* reduction. */
-        ratio = 0.;
-        if (prered != 0.)
-            ratio = actred / prered;
-
-        /* update the step bound. */
-        if (ratio <= Scalar(.25)) {
-            if (actred >= 0.)
-                temp = Scalar(.5);
-            if (actred < 0.)
-                temp = Scalar(.5) * dirder / (dirder + Scalar(.5) * actred);
-            if (Scalar(.1) * fnorm1 >= fnorm || temp < Scalar(.1))
-                temp = Scalar(.1);
-            /* Computing MIN */
-            delta = temp * (std::min)(delta, pnorm / Scalar(.1));
-            par /= temp;
-        } else if (!(par != 0. && ratio < Scalar(.75))) {
-            delta = pnorm / Scalar(.5);
-            par = Scalar(.5) * par;
-        }
-
-        /* test for successful iteration. */
-        if (ratio >= Scalar(1e-4)) {
-            /* successful iteration. update x, fvec, and their norms. */
-            x = wa2;
-            wa2 = diag.cwiseProduct(x);
-            fvec = wa4;
-            xnorm = wa2.stableNorm();
-            fnorm = fnorm1;
-            ++iter;
-        }
-
-        /* tests for convergence. */
-        if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1. && delta <= parameters.xtol * xnorm)
-            return LevenbergMarquardtSpace::RelativeErrorAndReductionTooSmall;
-        if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1.)
-            return LevenbergMarquardtSpace::RelativeReductionTooSmall;
-        if (delta <= parameters.xtol * xnorm)
-            return LevenbergMarquardtSpace::RelativeErrorTooSmall;
-
-        /* tests for termination and stringent tolerances. */
-        if (nfev >= parameters.maxfev)
-            return LevenbergMarquardtSpace::TooManyFunctionEvaluation;
-        if (abs(actred) <= NumTraits<Scalar>::epsilon() && prered <= NumTraits<Scalar>::epsilon() && Scalar(.5) * ratio <= 1.)
-            return LevenbergMarquardtSpace::FtolTooSmall;
-        if (delta <= NumTraits<Scalar>::epsilon() * xnorm)
-            return LevenbergMarquardtSpace::XtolTooSmall;
-        if (gnorm <= NumTraits<Scalar>::epsilon())
-            return LevenbergMarquardtSpace::GtolTooSmall;
-
-    } while (ratio < Scalar(1e-4));
-
-    return LevenbergMarquardtSpace::Running;
-}
+  /* check the input parameters for errors. */
+  if (n <= 0 || m < n || tol < 0.) return LevenbergMarquardtSpace::ImproperInputParameters;
 
-template<typename FunctorType, typename Scalar>
-LevenbergMarquardtSpace::Status
-LevenbergMarquardt<FunctorType,Scalar>::lmstr1(
-        FVectorType  &x,
-        const Scalar tol
-        )
-{
-    n = x.size();
-    m = functor.values();
-
-    /* check the input parameters for errors. */
-    if (n <= 0 || m < n || tol < 0.)
-        return LevenbergMarquardtSpace::ImproperInputParameters;
-
-    resetParameters();
-    parameters.ftol = tol;
-    parameters.xtol = tol;
-    parameters.maxfev = 100*(n+1);
-
-    return minimizeOptimumStorage(x);
-}
+  resetParameters();
+  parameters.ftol = tol;
+  parameters.xtol = tol;
+  parameters.maxfev = 100 * (n + 1);
 
-template<typename FunctorType, typename Scalar>
-LevenbergMarquardtSpace::Status
-LevenbergMarquardt<FunctorType,Scalar>::minimizeOptimumStorageInit(FVectorType  &x)
-{
-    n = x.size();
-    m = functor.values();
-
-    wa1.resize(n); wa2.resize(n); wa3.resize(n);
-    wa4.resize(m);
-    fvec.resize(m);
-    // Only R is stored in fjac. Q is only used to compute 'qtf', which is
-    // Q.transpose()*rhs. qtf will be updated using givens rotation,
-    // instead of storing them in Q.
-    // The purpose it to only use a nxn matrix, instead of mxn here, so
-    // that we can handle cases where m>>n :
-    fjac.resize(n, n);
-    if (!useExternalScaling)
-        diag.resize(n);
-    eigen_assert( (!useExternalScaling || diag.size()==n) || "When useExternalScaling is set, the caller must provide a valid 'diag'");
-    qtf.resize(n);
-
-    /* Function Body */
-    nfev = 0;
-    njev = 0;
-
-    /*     check the input parameters for errors. */
-    if (n <= 0 || m < n || parameters.ftol < 0. || parameters.xtol < 0. || parameters.gtol < 0. || parameters.maxfev <= 0 || parameters.factor <= 0.)
-        return LevenbergMarquardtSpace::ImproperInputParameters;
-
-    if (useExternalScaling)
-        for (Index j = 0; j < n; ++j)
-            if (diag[j] <= 0.)
-                return LevenbergMarquardtSpace::ImproperInputParameters;
-
-    /*     evaluate the function at the starting point */
-    /*     and calculate its norm. */
-    nfev = 1;
-    if ( functor(x, fvec) < 0)
-        return LevenbergMarquardtSpace::UserAsked;
-    fnorm = fvec.stableNorm();
-
-    /*     initialize levenberg-marquardt parameter and iteration counter. */
-    par = 0.;
-    iter = 1;
-
-    return LevenbergMarquardtSpace::NotStarted;
+  return minimizeOptimumStorage(x);
 }
 
+template <typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status LevenbergMarquardt<FunctorType, Scalar>::minimizeOptimumStorageInit(FVectorType &x) {
+  n = x.size();
+  m = functor.values();
+
+  wa1.resize(n);
+  wa2.resize(n);
+  wa3.resize(n);
+  wa4.resize(m);
+  fvec.resize(m);
+  // Only R is stored in fjac. Q is only used to compute 'qtf', which is
+  // Q.transpose()*rhs. qtf will be updated using givens rotation,
+  // instead of storing them in Q.
+  // The purpose it to only use a nxn matrix, instead of mxn here, so
+  // that we can handle cases where m>>n :
+  fjac.resize(n, n);
+  if (!useExternalScaling) diag.resize(n);
+  eigen_assert((!useExternalScaling || diag.size() == n) &&
+               "When useExternalScaling is set, the caller must provide a valid 'diag'");
+  qtf.resize(n);
+
+  /* Function Body */
+  nfev = 0;
+  njev = 0;
+
+  /*     check the input parameters for errors. */
+  if (n <= 0 || m < n || parameters.ftol < 0. || parameters.xtol < 0. || parameters.gtol < 0. ||
+      parameters.maxfev <= 0 || parameters.factor <= 0.)
+    return LevenbergMarquardtSpace::ImproperInputParameters;
+
+  if (useExternalScaling)
+    for (Index j = 0; j < n; ++j)
+      if (diag[j] <= 0.) return LevenbergMarquardtSpace::ImproperInputParameters;
+
+  /*     evaluate the function at the starting point */
+  /*     and calculate its norm. */
+  nfev = 1;
+  if (functor(x, fvec) < 0) return LevenbergMarquardtSpace::UserAsked;
+  fnorm = fvec.stableNorm();
+
+  /*     initialize levenberg-marquardt parameter and iteration counter. */
+  par = 0.;
+  iter = 1;
+
+  return LevenbergMarquardtSpace::NotStarted;
+}
 
-template<typename FunctorType, typename Scalar>
-LevenbergMarquardtSpace::Status
-LevenbergMarquardt<FunctorType,Scalar>::minimizeOptimumStorageOneStep(FVectorType  &x)
-{
-    using std::abs;
-    using std::sqrt;
-    
-    eigen_assert(x.size()==n); // check the caller is not cheating us
-
-    Index i, j;
-    bool sing;
-
-    /* compute the qr factorization of the jacobian matrix */
-    /* calculated one row at a time, while simultaneously */
-    /* forming (q transpose)*fvec and storing the first */
-    /* n components in qtf. */
-    qtf.fill(0.);
-    fjac.fill(0.);
-    Index rownb = 2;
-    for (i = 0; i < m; ++i) {
-        if (functor.df(x, wa3, rownb) < 0) return LevenbergMarquardtSpace::UserAsked;
-        internal::rwupdt<Scalar>(fjac, wa3, qtf, fvec[i]);
-        ++rownb;
-    }
-    ++njev;
+template <typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status LevenbergMarquardt<FunctorType, Scalar>::minimizeOptimumStorageOneStep(FVectorType &x) {
+  using std::abs;
+  using std::sqrt;
+
+  eigen_assert(x.size() == n);  // check the caller is not cheating us
+
+  Index i, j;
+  bool sing;
+
+  /* compute the qr factorization of the jacobian matrix */
+  /* calculated one row at a time, while simultaneously */
+  /* forming (q transpose)*fvec and storing the first */
+  /* n components in qtf. */
+  qtf.fill(0.);
+  fjac.fill(0.);
+  Index rownb = 2;
+  for (i = 0; i < m; ++i) {
+    if (functor.df(x, wa3, rownb) < 0) return LevenbergMarquardtSpace::UserAsked;
+    internal::rwupdt<Scalar>(fjac, wa3, qtf, fvec[i]);
+    ++rownb;
+  }
+  ++njev;
+
+  /* if the jacobian is rank deficient, call qrfac to */
+  /* reorder its columns and update the components of qtf. */
+  sing = false;
+  for (j = 0; j < n; ++j) {
+    if (fjac(j, j) == 0.) sing = true;
+    wa2[j] = fjac.col(j).head(j).stableNorm();
+  }
+  permutation.setIdentity(n);
+  if (sing) {
+    wa2 = fjac.colwise().blueNorm();
+    // TODO We have no unit test covering this code path, do not modify
+    // until it is carefully tested
+    ColPivHouseholderQR<JacobianType> qrfac(fjac);
+    fjac = qrfac.matrixQR();
+    wa1 = fjac.diagonal();
+    fjac.diagonal() = qrfac.hCoeffs();
+    permutation = qrfac.colsPermutation();
+    // TODO : avoid this:
+    for (Index ii = 0; ii < fjac.cols(); ii++)
+      fjac.col(ii).segment(ii + 1, fjac.rows() - ii - 1) *= fjac(ii, ii);  // rescale vectors
 
-    /* if the jacobian is rank deficient, call qrfac to */
-    /* reorder its columns and update the components of qtf. */
-    sing = false;
     for (j = 0; j < n; ++j) {
-        if (fjac(j,j) == 0.)
-            sing = true;
-        wa2[j] = fjac.col(j).head(j).stableNorm();
-    }
-    permutation.setIdentity(n);
-    if (sing) {
-        wa2 = fjac.colwise().blueNorm();
-        // TODO We have no unit test covering this code path, do not modify
-        // until it is carefully tested
-        ColPivHouseholderQR<JacobianType> qrfac(fjac);
-        fjac = qrfac.matrixQR();
-        wa1 = fjac.diagonal();
-        fjac.diagonal() = qrfac.hCoeffs();
-        permutation = qrfac.colsPermutation();
-        // TODO : avoid this:
-        for(Index ii=0; ii< fjac.cols(); ii++) fjac.col(ii).segment(ii+1, fjac.rows()-ii-1) *= fjac(ii,ii); // rescale vectors
-
-        for (j = 0; j < n; ++j) {
-            if (fjac(j,j) != 0.) {
-                sum = 0.;
-                for (i = j; i < n; ++i)
-                    sum += fjac(i,j) * qtf[i];
-                temp = -sum / fjac(j,j);
-                for (i = j; i < n; ++i)
-                    qtf[i] += fjac(i,j) * temp;
-            }
-            fjac(j,j) = wa1[j];
-        }
+      if (fjac(j, j) != 0.) {
+        sum = 0.;
+        for (i = j; i < n; ++i) sum += fjac(i, j) * qtf[i];
+        temp = -sum / fjac(j, j);
+        for (i = j; i < n; ++i) qtf[i] += fjac(i, j) * temp;
+      }
+      fjac(j, j) = wa1[j];
     }
+  }
 
-    /* on the first iteration and if external scaling is not used, scale according */
-    /* to the norms of the columns of the initial jacobian. */
-    if (iter == 1) {
-        if (!useExternalScaling)
-            for (j = 0; j < n; ++j)
-                diag[j] = (wa2[j]==0.)? 1. : wa2[j];
-
-        /* on the first iteration, calculate the norm of the scaled x */
-        /* and initialize the step bound delta. */
-        xnorm = diag.cwiseProduct(x).stableNorm();
-        delta = parameters.factor * xnorm;
-        if (delta == 0.)
-            delta = parameters.factor;
+  /* on the first iteration and if external scaling is not used, scale according */
+  /* to the norms of the columns of the initial jacobian. */
+  if (iter == 1) {
+    if (!useExternalScaling)
+      for (j = 0; j < n; ++j) diag[j] = (wa2[j] == 0.) ? 1. : wa2[j];
+
+    /* on the first iteration, calculate the norm of the scaled x */
+    /* and initialize the step bound delta. */
+    xnorm = diag.cwiseProduct(x).stableNorm();
+    delta = parameters.factor * xnorm;
+    if (delta == 0.) delta = parameters.factor;
+  }
+
+  /* compute the norm of the scaled gradient. */
+  gnorm = 0.;
+  if (fnorm != 0.)
+    for (j = 0; j < n; ++j)
+      if (wa2[permutation.indices()[j]] != 0.)
+        gnorm = (std::max)(gnorm,
+                           abs(fjac.col(j).head(j + 1).dot(qtf.head(j + 1) / fnorm) / wa2[permutation.indices()[j]]));
+
+  /* test for convergence of the gradient norm. */
+  if (gnorm <= parameters.gtol) return LevenbergMarquardtSpace::CosinusTooSmall;
+
+  /* rescale if necessary. */
+  if (!useExternalScaling) diag = diag.cwiseMax(wa2);
+
+  do {
+    /* determine the levenberg-marquardt parameter. */
+    internal::lmpar<Scalar>(fjac, permutation.indices(), diag, qtf, delta, par, wa1);
+
+    /* store the direction p and x + p. calculate the norm of p. */
+    wa1 = -wa1;
+    wa2 = x + wa1;
+    pnorm = diag.cwiseProduct(wa1).stableNorm();
+
+    /* on the first iteration, adjust the initial step bound. */
+    if (iter == 1) delta = (std::min)(delta, pnorm);
+
+    /* evaluate the function at x + p and calculate its norm. */
+    if (functor(wa2, wa4) < 0) return LevenbergMarquardtSpace::UserAsked;
+    ++nfev;
+    fnorm1 = wa4.stableNorm();
+
+    /* compute the scaled actual reduction. */
+    actred = -1.;
+    if (Scalar(.1) * fnorm1 < fnorm) actred = 1. - numext::abs2(fnorm1 / fnorm);
+
+    /* compute the scaled predicted reduction and */
+    /* the scaled directional derivative. */
+    wa3 = fjac.topLeftCorner(n, n).template triangularView<Upper>() * (permutation.inverse() * wa1);
+    temp1 = numext::abs2(wa3.stableNorm() / fnorm);
+    temp2 = numext::abs2(sqrt(par) * pnorm / fnorm);
+    prered = temp1 + temp2 / Scalar(.5);
+    dirder = -(temp1 + temp2);
+
+    /* compute the ratio of the actual to the predicted */
+    /* reduction. */
+    ratio = 0.;
+    if (prered != 0.) ratio = actred / prered;
+
+    /* update the step bound. */
+    if (ratio <= Scalar(.25)) {
+      if (actred >= 0.) temp = Scalar(.5);
+      if (actred < 0.) temp = Scalar(.5) * dirder / (dirder + Scalar(.5) * actred);
+      if (Scalar(.1) * fnorm1 >= fnorm || temp < Scalar(.1)) temp = Scalar(.1);
+      /* Computing MIN */
+      delta = temp * (std::min)(delta, pnorm / Scalar(.1));
+      par /= temp;
+    } else if (!(par != 0. && ratio < Scalar(.75))) {
+      delta = pnorm / Scalar(.5);
+      par = Scalar(.5) * par;
     }
 
-    /* compute the norm of the scaled gradient. */
-    gnorm = 0.;
-    if (fnorm != 0.)
-        for (j = 0; j < n; ++j)
-            if (wa2[permutation.indices()[j]] != 0.)
-                gnorm = (std::max)(gnorm, abs( fjac.col(j).head(j+1).dot(qtf.head(j+1)/fnorm) / wa2[permutation.indices()[j]]));
-
-    /* test for convergence of the gradient norm. */
-    if (gnorm <= parameters.gtol)
-        return LevenbergMarquardtSpace::CosinusTooSmall;
+    /* test for successful iteration. */
+    if (ratio >= Scalar(1e-4)) {
+      /* successful iteration. update x, fvec, and their norms. */
+      x = wa2;
+      wa2 = diag.cwiseProduct(x);
+      fvec = wa4;
+      xnorm = wa2.stableNorm();
+      fnorm = fnorm1;
+      ++iter;
+    }
 
-    /* rescale if necessary. */
-    if (!useExternalScaling)
-        diag = diag.cwiseMax(wa2);
-
-    do {
-
-        /* determine the levenberg-marquardt parameter. */
-        internal::lmpar<Scalar>(fjac, permutation.indices(), diag, qtf, delta, par, wa1);
-
-        /* store the direction p and x + p. calculate the norm of p. */
-        wa1 = -wa1;
-        wa2 = x + wa1;
-        pnorm = diag.cwiseProduct(wa1).stableNorm();
-
-        /* on the first iteration, adjust the initial step bound. */
-        if (iter == 1)
-            delta = (std::min)(delta,pnorm);
-
-        /* evaluate the function at x + p and calculate its norm. */
-        if ( functor(wa2, wa4) < 0)
-            return LevenbergMarquardtSpace::UserAsked;
-        ++nfev;
-        fnorm1 = wa4.stableNorm();
-
-        /* compute the scaled actual reduction. */
-        actred = -1.;
-        if (Scalar(.1) * fnorm1 < fnorm)
-            actred = 1. - numext::abs2(fnorm1 / fnorm);
-
-        /* compute the scaled predicted reduction and */
-        /* the scaled directional derivative. */
-        wa3 = fjac.topLeftCorner(n,n).template triangularView<Upper>() * (permutation.inverse() * wa1);
-        temp1 = numext::abs2(wa3.stableNorm() / fnorm);
-        temp2 = numext::abs2(sqrt(par) * pnorm / fnorm);
-        prered = temp1 + temp2 / Scalar(.5);
-        dirder = -(temp1 + temp2);
-
-        /* compute the ratio of the actual to the predicted */
-        /* reduction. */
-        ratio = 0.;
-        if (prered != 0.)
-            ratio = actred / prered;
-
-        /* update the step bound. */
-        if (ratio <= Scalar(.25)) {
-            if (actred >= 0.)
-                temp = Scalar(.5);
-            if (actred < 0.)
-                temp = Scalar(.5) * dirder / (dirder + Scalar(.5) * actred);
-            if (Scalar(.1) * fnorm1 >= fnorm || temp < Scalar(.1))
-                temp = Scalar(.1);
-            /* Computing MIN */
-            delta = temp * (std::min)(delta, pnorm / Scalar(.1));
-            par /= temp;
-        } else if (!(par != 0. && ratio < Scalar(.75))) {
-            delta = pnorm / Scalar(.5);
-            par = Scalar(.5) * par;
-        }
-
-        /* test for successful iteration. */
-        if (ratio >= Scalar(1e-4)) {
-            /* successful iteration. update x, fvec, and their norms. */
-            x = wa2;
-            wa2 = diag.cwiseProduct(x);
-            fvec = wa4;
-            xnorm = wa2.stableNorm();
-            fnorm = fnorm1;
-            ++iter;
-        }
-
-        /* tests for convergence. */
-        if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1. && delta <= parameters.xtol * xnorm)
-            return LevenbergMarquardtSpace::RelativeErrorAndReductionTooSmall;
-        if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1.)
-            return LevenbergMarquardtSpace::RelativeReductionTooSmall;
-        if (delta <= parameters.xtol * xnorm)
-            return LevenbergMarquardtSpace::RelativeErrorTooSmall;
-
-        /* tests for termination and stringent tolerances. */
-        if (nfev >= parameters.maxfev)
-            return LevenbergMarquardtSpace::TooManyFunctionEvaluation;
-        if (abs(actred) <= NumTraits<Scalar>::epsilon() && prered <= NumTraits<Scalar>::epsilon() && Scalar(.5) * ratio <= 1.)
-            return LevenbergMarquardtSpace::FtolTooSmall;
-        if (delta <= NumTraits<Scalar>::epsilon() * xnorm)
-            return LevenbergMarquardtSpace::XtolTooSmall;
-        if (gnorm <= NumTraits<Scalar>::epsilon())
-            return LevenbergMarquardtSpace::GtolTooSmall;
-
-    } while (ratio < Scalar(1e-4));
-
-    return LevenbergMarquardtSpace::Running;
+    /* tests for convergence. */
+    if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1. &&
+        delta <= parameters.xtol * xnorm)
+      return LevenbergMarquardtSpace::RelativeErrorAndReductionTooSmall;
+    if (abs(actred) <= parameters.ftol && prered <= parameters.ftol && Scalar(.5) * ratio <= 1.)
+      return LevenbergMarquardtSpace::RelativeReductionTooSmall;
+    if (delta <= parameters.xtol * xnorm) return LevenbergMarquardtSpace::RelativeErrorTooSmall;
+
+    /* tests for termination and stringent tolerances. */
+    if (nfev >= parameters.maxfev) return LevenbergMarquardtSpace::TooManyFunctionEvaluation;
+    if (abs(actred) <= NumTraits<Scalar>::epsilon() && prered <= NumTraits<Scalar>::epsilon() &&
+        Scalar(.5) * ratio <= 1.)
+      return LevenbergMarquardtSpace::FtolTooSmall;
+    if (delta <= NumTraits<Scalar>::epsilon() * xnorm) return LevenbergMarquardtSpace::XtolTooSmall;
+    if (gnorm <= NumTraits<Scalar>::epsilon()) return LevenbergMarquardtSpace::GtolTooSmall;
+
+  } while (ratio < Scalar(1e-4));
+
+  return LevenbergMarquardtSpace::Running;
 }
 
-template<typename FunctorType, typename Scalar>
-LevenbergMarquardtSpace::Status
-LevenbergMarquardt<FunctorType,Scalar>::minimizeOptimumStorage(FVectorType  &x)
-{
-    LevenbergMarquardtSpace::Status status = minimizeOptimumStorageInit(x);
-    if (status==LevenbergMarquardtSpace::ImproperInputParameters)
-        return status;
-    do {
-        status = minimizeOptimumStorageOneStep(x);
-    } while (status==LevenbergMarquardtSpace::Running);
-    return status;
+template <typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status LevenbergMarquardt<FunctorType, Scalar>::minimizeOptimumStorage(FVectorType &x) {
+  LevenbergMarquardtSpace::Status status = minimizeOptimumStorageInit(x);
+  if (status == LevenbergMarquardtSpace::ImproperInputParameters) return status;
+  do {
+    status = minimizeOptimumStorageOneStep(x);
+  } while (status == LevenbergMarquardtSpace::Running);
+  return status;
 }
 
-template<typename FunctorType, typename Scalar>
-LevenbergMarquardtSpace::Status
-LevenbergMarquardt<FunctorType,Scalar>::lmdif1(
-        FunctorType &functor,
-        FVectorType  &x,
-        Index *nfev,
-        const Scalar tol
-        )
-{
-    Index n = x.size();
-    Index m = functor.values();
-
-    /* check the input parameters for errors. */
-    if (n <= 0 || m < n || tol < 0.)
-        return LevenbergMarquardtSpace::ImproperInputParameters;
-
-    NumericalDiff<FunctorType> numDiff(functor);
-    // embedded LevenbergMarquardt
-    LevenbergMarquardt<NumericalDiff<FunctorType>, Scalar > lm(numDiff);
-    lm.parameters.ftol = tol;
-    lm.parameters.xtol = tol;
-    lm.parameters.maxfev = 200*(n+1);
-
-    LevenbergMarquardtSpace::Status info = LevenbergMarquardtSpace::Status(lm.minimize(x));
-    if (nfev)
-        * nfev = lm.nfev;
-    return info;
+template <typename FunctorType, typename Scalar>
+LevenbergMarquardtSpace::Status LevenbergMarquardt<FunctorType, Scalar>::lmdif1(FunctorType &functor, FVectorType &x,
+                                                                                Index *nfev, const Scalar tol) {
+  Index n = x.size();
+  Index m = functor.values();
+
+  /* check the input parameters for errors. */
+  if (n <= 0 || m < n || tol < 0.) return LevenbergMarquardtSpace::ImproperInputParameters;
+
+  NumericalDiff<FunctorType> numDiff(functor);
+  // embedded LevenbergMarquardt
+  LevenbergMarquardt<NumericalDiff<FunctorType>, Scalar> lm(numDiff);
+  lm.parameters.ftol = tol;
+  lm.parameters.xtol = tol;
+  lm.parameters.maxfev = 200 * (n + 1);
+
+  LevenbergMarquardtSpace::Status info = LevenbergMarquardtSpace::Status(lm.minimize(x));
+  if (nfev) *nfev = lm.nfev;
+  return info;
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_LEVENBERGMARQUARDT__H
+#endif  // EIGEN_LEVENBERGMARQUARDT__H
 
-//vim: ai ts=4 sts=4 et sw=4
+// vim: ai ts=4 sts=4 et sw=4
diff --git a/inst/include/unsupported/Eigen/src/NonLinearOptimization/chkder.h b/inst/include/unsupported/Eigen/src/NonLinearOptimization/chkder.h
index db8ff7d6..a9311ff4 100644
--- a/inst/include/unsupported/Eigen/src/NonLinearOptimization/chkder.h
+++ b/inst/include/unsupported/Eigen/src/NonLinearOptimization/chkder.h
@@ -1,66 +1,57 @@
 #define chkder_log10e 0.43429448190325182765
 #define chkder_factor 100.
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-namespace internal {
-
-template<typename Scalar>
-void chkder(
-        const Matrix< Scalar, Dynamic, 1 >  &x,
-        const Matrix< Scalar, Dynamic, 1 >  &fvec,
-        const Matrix< Scalar, Dynamic, Dynamic > &fjac,
-        Matrix< Scalar, Dynamic, 1 >  &xp,
-        const Matrix< Scalar, Dynamic, 1 >  &fvecp,
-        int mode,
-        Matrix< Scalar, Dynamic, 1 >  &err
-        )
-{
-    using std::sqrt;
-    using std::abs;
-    using std::log;
-    
-    typedef DenseIndex Index;
-
-    const Scalar eps = sqrt(NumTraits<Scalar>::epsilon());
-    const Scalar epsf = chkder_factor * NumTraits<Scalar>::epsilon();
-    const Scalar epslog = chkder_log10e * log(eps);
-    Scalar temp;
+namespace Eigen {
 
-    const Index m = fvec.size(), n = x.size();
+namespace internal {
 
-    if (mode != 2) {
-        /* mode = 1. */
-        xp.resize(n);
-        for (Index j = 0; j < n; ++j) {
-            temp = eps * abs(x[j]);
-            if (temp == 0.)
-                temp = eps;
-            xp[j] = x[j] + temp;
-        }
+template <typename Scalar>
+void chkder(const Matrix<Scalar, Dynamic, 1> &x, const Matrix<Scalar, Dynamic, 1> &fvec,
+            const Matrix<Scalar, Dynamic, Dynamic> &fjac, Matrix<Scalar, Dynamic, 1> &xp,
+            const Matrix<Scalar, Dynamic, 1> &fvecp, int mode, Matrix<Scalar, Dynamic, 1> &err) {
+  using std::abs;
+  using std::log;
+  using std::sqrt;
+
+  typedef DenseIndex Index;
+
+  const Scalar eps = sqrt(NumTraits<Scalar>::epsilon());
+  const Scalar epsf = chkder_factor * NumTraits<Scalar>::epsilon();
+  const Scalar epslog = chkder_log10e * log(eps);
+  Scalar temp;
+
+  const Index m = fvec.size(), n = x.size();
+
+  if (mode != 2) {
+    /* mode = 1. */
+    xp.resize(n);
+    for (Index j = 0; j < n; ++j) {
+      temp = eps * abs(x[j]);
+      if (temp == 0.) temp = eps;
+      xp[j] = x[j] + temp;
+    }
+  } else {
+    /* mode = 2. */
+    err.setZero(m);
+    for (Index j = 0; j < n; ++j) {
+      temp = abs(x[j]);
+      if (temp == 0.) temp = 1.;
+      err += temp * fjac.col(j);
     }
-    else {
-        /* mode = 2. */
-        err.setZero(m); 
-        for (Index j = 0; j < n; ++j) {
-            temp = abs(x[j]);
-            if (temp == 0.)
-                temp = 1.;
-            err += temp * fjac.col(j);
-        }
-        for (Index i = 0; i < m; ++i) {
-            temp = 1.;
-            if (fvec[i] != 0. && fvecp[i] != 0. && abs(fvecp[i] - fvec[i]) >= epsf * abs(fvec[i]))
-                temp = eps * abs((fvecp[i] - fvec[i]) / eps - err[i]) / (abs(fvec[i]) + abs(fvecp[i]));
-            err[i] = 1.;
-            if (temp > NumTraits<Scalar>::epsilon() && temp < eps)
-                err[i] = (chkder_log10e * log(temp) - epslog) / epslog;
-            if (temp >= eps)
-                err[i] = 0.;
-        }
+    for (Index i = 0; i < m; ++i) {
+      temp = 1.;
+      if (fvec[i] != 0. && fvecp[i] != 0. && abs(fvecp[i] - fvec[i]) >= epsf * abs(fvec[i]))
+        temp = eps * abs((fvecp[i] - fvec[i]) / eps - err[i]) / (abs(fvec[i]) + abs(fvecp[i]));
+      err[i] = 1.;
+      if (temp > NumTraits<Scalar>::epsilon() && temp < eps) err[i] = (chkder_log10e * log(temp) - epslog) / epslog;
+      if (temp >= eps) err[i] = 0.;
     }
+  }
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
diff --git a/inst/include/unsupported/Eigen/src/NonLinearOptimization/covar.h b/inst/include/unsupported/Eigen/src/NonLinearOptimization/covar.h
index 68260d19..311d9982 100644
--- a/inst/include/unsupported/Eigen/src/NonLinearOptimization/covar.h
+++ b/inst/include/unsupported/Eigen/src/NonLinearOptimization/covar.h
@@ -1,70 +1,66 @@
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 template <typename Scalar>
-void covar(
-        Matrix< Scalar, Dynamic, Dynamic > &r,
-        const VectorXi &ipvt,
-        Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon()) )
-{
-    using std::abs;
-    typedef DenseIndex Index;
-
-    /* Local variables */
-    Index i, j, k, l, ii, jj;
-    bool sing;
-    Scalar temp;
+void covar(Matrix<Scalar, Dynamic, Dynamic> &r, const VectorXi &ipvt,
+           Scalar tol = std::sqrt(NumTraits<Scalar>::epsilon())) {
+  using std::abs;
+  typedef DenseIndex Index;
 
-    /* Function Body */
-    const Index n = r.cols();
-    const Scalar tolr = tol * abs(r(0,0));
-    Matrix< Scalar, Dynamic, 1 > wa(n);
-    eigen_assert(ipvt.size()==n);
+  /* Local variables */
+  Index i, j, k, l, ii, jj;
+  bool sing;
+  Scalar temp;
 
-    /* form the inverse of r in the full upper triangle of r. */
-    l = -1;
-    for (k = 0; k < n; ++k)
-        if (abs(r(k,k)) > tolr) {
-            r(k,k) = 1. / r(k,k);
-            for (j = 0; j <= k-1; ++j) {
-                temp = r(k,k) * r(j,k);
-                r(j,k) = 0.;
-                r.col(k).head(j+1) -= r.col(j).head(j+1) * temp;
-            }
-            l = k;
-        }
+  /* Function Body */
+  const Index n = r.cols();
+  const Scalar tolr = tol * abs(r(0, 0));
+  Matrix<Scalar, Dynamic, 1> wa(n);
+  eigen_assert(ipvt.size() == n);
 
-    /* form the full upper triangle of the inverse of (r transpose)*r */
-    /* in the full upper triangle of r. */
-    for (k = 0; k <= l; ++k) {
-        for (j = 0; j <= k-1; ++j)
-            r.col(j).head(j+1) += r.col(k).head(j+1) * r(j,k);
-        r.col(k).head(k+1) *= r(k,k);
+  /* form the inverse of r in the full upper triangle of r. */
+  l = -1;
+  for (k = 0; k < n; ++k)
+    if (abs(r(k, k)) > tolr) {
+      r(k, k) = 1. / r(k, k);
+      for (j = 0; j <= k - 1; ++j) {
+        temp = r(k, k) * r(j, k);
+        r(j, k) = 0.;
+        r.col(k).head(j + 1) -= r.col(j).head(j + 1) * temp;
+      }
+      l = k;
     }
 
-    /* form the full lower triangle of the covariance matrix */
-    /* in the strict lower triangle of r and in wa. */
-    for (j = 0; j < n; ++j) {
-        jj = ipvt[j];
-        sing = j > l;
-        for (i = 0; i <= j; ++i) {
-            if (sing)
-                r(i,j) = 0.;
-            ii = ipvt[i];
-            if (ii > jj)
-                r(ii,jj) = r(i,j);
-            if (ii < jj)
-                r(jj,ii) = r(i,j);
-        }
-        wa[jj] = r(j,j);
+  /* form the full upper triangle of the inverse of (r transpose)*r */
+  /* in the full upper triangle of r. */
+  for (k = 0; k <= l; ++k) {
+    for (j = 0; j <= k - 1; ++j) r.col(j).head(j + 1) += r.col(k).head(j + 1) * r(j, k);
+    r.col(k).head(k + 1) *= r(k, k);
+  }
+
+  /* form the full lower triangle of the covariance matrix */
+  /* in the strict lower triangle of r and in wa. */
+  for (j = 0; j < n; ++j) {
+    jj = ipvt[j];
+    sing = j > l;
+    for (i = 0; i <= j; ++i) {
+      if (sing) r(i, j) = 0.;
+      ii = ipvt[i];
+      if (ii > jj) r(ii, jj) = r(i, j);
+      if (ii < jj) r(jj, ii) = r(i, j);
     }
+    wa[jj] = r(j, j);
+  }
 
-    /* symmetrize the covariance matrix in r. */
-    r.topLeftCorner(n,n).template triangularView<StrictlyUpper>() = r.topLeftCorner(n,n).transpose();
-    r.diagonal() = wa;
+  /* symmetrize the covariance matrix in r. */
+  r.topLeftCorner(n, n).template triangularView<StrictlyUpper>() = r.topLeftCorner(n, n).transpose();
+  r.diagonal() = wa;
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
diff --git a/inst/include/unsupported/Eigen/src/NonLinearOptimization/dogleg.h b/inst/include/unsupported/Eigen/src/NonLinearOptimization/dogleg.h
index 80c5d277..d1abb9e3 100644
--- a/inst/include/unsupported/Eigen/src/NonLinearOptimization/dogleg.h
+++ b/inst/include/unsupported/Eigen/src/NonLinearOptimization/dogleg.h
@@ -1,107 +1,103 @@
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 template <typename Scalar>
-void dogleg(
-        const Matrix< Scalar, Dynamic, Dynamic >  &qrfac,
-        const Matrix< Scalar, Dynamic, 1 >  &diag,
-        const Matrix< Scalar, Dynamic, 1 >  &qtb,
-        Scalar delta,
-        Matrix< Scalar, Dynamic, 1 >  &x)
-{
-    using std::abs;
-    using std::sqrt;
-    
-    typedef DenseIndex Index;
-
-    /* Local variables */
-    Index i, j;
-    Scalar sum, temp, alpha, bnorm;
-    Scalar gnorm, qnorm;
-    Scalar sgnorm;
-
-    /* Function Body */
-    const Scalar epsmch = NumTraits<Scalar>::epsilon();
-    const Index n = qrfac.cols();
-    eigen_assert(n==qtb.size());
-    eigen_assert(n==x.size());
-    eigen_assert(n==diag.size());
-    Matrix< Scalar, Dynamic, 1 >  wa1(n), wa2(n);
-
-    /* first, calculate the gauss-newton direction. */
-    for (j = n-1; j >=0; --j) {
-        temp = qrfac(j,j);
-        if (temp == 0.) {
-            temp = epsmch * qrfac.col(j).head(j+1).maxCoeff();
-            if (temp == 0.)
-                temp = epsmch;
-        }
-        if (j==n-1)
-            x[j] = qtb[j] / temp;
-        else
-            x[j] = (qtb[j] - qrfac.row(j).tail(n-j-1).dot(x.tail(n-j-1))) / temp;
+void dogleg(const Matrix<Scalar, Dynamic, Dynamic> &qrfac, const Matrix<Scalar, Dynamic, 1> &diag,
+            const Matrix<Scalar, Dynamic, 1> &qtb, Scalar delta, Matrix<Scalar, Dynamic, 1> &x) {
+  using std::abs;
+  using std::sqrt;
+
+  typedef DenseIndex Index;
+
+  /* Local variables */
+  Index i, j;
+  Scalar sum, temp, alpha, bnorm;
+  Scalar gnorm, qnorm;
+  Scalar sgnorm;
+
+  /* Function Body */
+  const Scalar epsmch = NumTraits<Scalar>::epsilon();
+  const Index n = qrfac.cols();
+  eigen_assert(n == qtb.size());
+  eigen_assert(n == x.size());
+  eigen_assert(n == diag.size());
+  Matrix<Scalar, Dynamic, 1> wa1(n), wa2(n);
+
+  /* first, calculate the gauss-newton direction. */
+  for (j = n - 1; j >= 0; --j) {
+    temp = qrfac(j, j);
+    if (temp == 0.) {
+      temp = epsmch * qrfac.col(j).head(j + 1).maxCoeff();
+      if (temp == 0.) temp = epsmch;
     }
-
-    /* test whether the gauss-newton direction is acceptable. */
-    qnorm = diag.cwiseProduct(x).stableNorm();
-    if (qnorm <= delta)
-        return;
-
-    // TODO : this path is not tested by Eigen unit tests
-
-    /* the gauss-newton direction is not acceptable. */
-    /* next, calculate the scaled gradient direction. */
-
-    wa1.fill(0.);
-    for (j = 0; j < n; ++j) {
-        wa1.tail(n-j) += qrfac.row(j).tail(n-j) * qtb[j];
-        wa1[j] /= diag[j];
-    }
-
-    /* calculate the norm of the scaled gradient and test for */
-    /* the special case in which the scaled gradient is zero. */
-    gnorm = wa1.stableNorm();
-    sgnorm = 0.;
-    alpha = delta / qnorm;
-    if (gnorm == 0.)
-        goto algo_end;
-
-    /* calculate the point along the scaled gradient */
-    /* at which the quadratic is minimized. */
-    wa1.array() /= (diag*gnorm).array();
-    // TODO : once unit tests cover this part,:
-    // wa2 = qrfac.template triangularView<Upper>() * wa1;
-    for (j = 0; j < n; ++j) {
-        sum = 0.;
-        for (i = j; i < n; ++i) {
-            sum += qrfac(j,i) * wa1[i];
-        }
-        wa2[j] = sum;
+    if (j == n - 1)
+      x[j] = qtb[j] / temp;
+    else
+      x[j] = (qtb[j] - qrfac.row(j).tail(n - j - 1).dot(x.tail(n - j - 1))) / temp;
+  }
+
+  /* test whether the gauss-newton direction is acceptable. */
+  qnorm = diag.cwiseProduct(x).stableNorm();
+  if (qnorm <= delta) return;
+
+  // TODO : this path is not tested by Eigen unit tests
+
+  /* the gauss-newton direction is not acceptable. */
+  /* next, calculate the scaled gradient direction. */
+
+  wa1.fill(0.);
+  for (j = 0; j < n; ++j) {
+    wa1.tail(n - j) += qrfac.row(j).tail(n - j) * qtb[j];
+    wa1[j] /= diag[j];
+  }
+
+  /* calculate the norm of the scaled gradient and test for */
+  /* the special case in which the scaled gradient is zero. */
+  gnorm = wa1.stableNorm();
+  sgnorm = 0.;
+  alpha = delta / qnorm;
+  if (gnorm == 0.) goto algo_end;
+
+  /* calculate the point along the scaled gradient */
+  /* at which the quadratic is minimized. */
+  wa1.array() /= (diag * gnorm).array();
+  // TODO : once unit tests cover this part,:
+  // wa2 = qrfac.template triangularView<Upper>() * wa1;
+  for (j = 0; j < n; ++j) {
+    sum = 0.;
+    for (i = j; i < n; ++i) {
+      sum += qrfac(j, i) * wa1[i];
     }
-    temp = wa2.stableNorm();
-    sgnorm = gnorm / temp / temp;
-
-    /* test whether the scaled gradient direction is acceptable. */
-    alpha = 0.;
-    if (sgnorm >= delta)
-        goto algo_end;
-
-    /* the scaled gradient direction is not acceptable. */
-    /* finally, calculate the point along the dogleg */
-    /* at which the quadratic is minimized. */
-    bnorm = qtb.stableNorm();
-    temp = bnorm / gnorm * (bnorm / qnorm) * (sgnorm / delta);
-    temp = temp - delta / qnorm * numext::abs2(sgnorm / delta) + sqrt(numext::abs2(temp - delta / qnorm) + (1.-numext::abs2(delta / qnorm)) * (1.-numext::abs2(sgnorm / delta)));
-    alpha = delta / qnorm * (1. - numext::abs2(sgnorm / delta)) / temp;
+    wa2[j] = sum;
+  }
+  temp = wa2.stableNorm();
+  sgnorm = gnorm / temp / temp;
+
+  /* test whether the scaled gradient direction is acceptable. */
+  alpha = 0.;
+  if (sgnorm >= delta) goto algo_end;
+
+  /* the scaled gradient direction is not acceptable. */
+  /* finally, calculate the point along the dogleg */
+  /* at which the quadratic is minimized. */
+  bnorm = qtb.stableNorm();
+  temp = bnorm / gnorm * (bnorm / qnorm) * (sgnorm / delta);
+  temp = temp - delta / qnorm * numext::abs2(sgnorm / delta) +
+         sqrt(numext::abs2(temp - delta / qnorm) +
+              (1. - numext::abs2(delta / qnorm)) * (1. - numext::abs2(sgnorm / delta)));
+  alpha = delta / qnorm * (1. - numext::abs2(sgnorm / delta)) / temp;
 algo_end:
 
-    /* form appropriate convex combination of the gauss-newton */
-    /* direction and the scaled gradient direction. */
-    temp = (1.-alpha) * (std::min)(sgnorm,delta);
-    x = temp * wa1 + alpha * x;
+  /* form appropriate convex combination of the gauss-newton */
+  /* direction and the scaled gradient direction. */
+  temp = (1. - alpha) * (std::min)(sgnorm, delta);
+  x = temp * wa1 + alpha * x;
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
diff --git a/inst/include/unsupported/Eigen/src/NonLinearOptimization/fdjac1.h b/inst/include/unsupported/Eigen/src/NonLinearOptimization/fdjac1.h
index bb7cf267..70da3f96 100644
--- a/inst/include/unsupported/Eigen/src/NonLinearOptimization/fdjac1.h
+++ b/inst/include/unsupported/Eigen/src/NonLinearOptimization/fdjac1.h
@@ -1,79 +1,73 @@
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
-template<typename FunctorType, typename Scalar>
-DenseIndex fdjac1(
-        const FunctorType &Functor,
-        Matrix< Scalar, Dynamic, 1 >  &x,
-        Matrix< Scalar, Dynamic, 1 >  &fvec,
-        Matrix< Scalar, Dynamic, Dynamic > &fjac,
-        DenseIndex ml, DenseIndex mu,
-        Scalar epsfcn)
-{
-    using std::sqrt;
-    using std::abs;
-    
-    typedef DenseIndex Index;
+template <typename FunctorType, typename Scalar>
+DenseIndex fdjac1(const FunctorType &Functor, Matrix<Scalar, Dynamic, 1> &x, Matrix<Scalar, Dynamic, 1> &fvec,
+                  Matrix<Scalar, Dynamic, Dynamic> &fjac, DenseIndex ml, DenseIndex mu, Scalar epsfcn) {
+  using std::abs;
+  using std::sqrt;
+
+  typedef DenseIndex Index;
 
-    /* Local variables */
-    Scalar h;
-    Index j, k;
-    Scalar eps, temp;
-    Index msum;
-    int iflag;
-    Index start, length;
+  /* Local variables */
+  Scalar h;
+  Index j, k;
+  Scalar eps, temp;
+  Index msum;
+  int iflag;
+  Index start, length;
 
-    /* Function Body */
-    const Scalar epsmch = NumTraits<Scalar>::epsilon();
-    const Index n = x.size();
-    eigen_assert(fvec.size()==n);
-    Matrix< Scalar, Dynamic, 1 >  wa1(n);
-    Matrix< Scalar, Dynamic, 1 >  wa2(n);
+  /* Function Body */
+  const Scalar epsmch = NumTraits<Scalar>::epsilon();
+  const Index n = x.size();
+  eigen_assert(fvec.size() == n);
+  Matrix<Scalar, Dynamic, 1> wa1(n);
+  Matrix<Scalar, Dynamic, 1> wa2(n);
 
-    eps = sqrt((std::max)(epsfcn,epsmch));
-    msum = ml + mu + 1;
-    if (msum >= n) {
-        /* computation of dense approximate jacobian. */
-        for (j = 0; j < n; ++j) {
-            temp = x[j];
-            h = eps * abs(temp);
-            if (h == 0.)
-                h = eps;
-            x[j] = temp + h;
-            iflag = Functor(x, wa1);
-            if (iflag < 0)
-                return iflag;
-            x[j] = temp;
-            fjac.col(j) = (wa1-fvec)/h;
-        }
+  eps = sqrt((std::max)(epsfcn, epsmch));
+  msum = ml + mu + 1;
+  if (msum >= n) {
+    /* computation of dense approximate jacobian. */
+    for (j = 0; j < n; ++j) {
+      temp = x[j];
+      h = eps * abs(temp);
+      if (h == 0.) h = eps;
+      x[j] = temp + h;
+      iflag = Functor(x, wa1);
+      if (iflag < 0) return iflag;
+      x[j] = temp;
+      fjac.col(j) = (wa1 - fvec) / h;
+    }
 
-    }else {
-        /* computation of banded approximate jacobian. */
-        for (k = 0; k < msum; ++k) {
-            for (j = k; (msum<0) ? (j>n): (j<n); j += msum) {
-                wa2[j] = x[j];
-                h = eps * abs(wa2[j]);
-                if (h == 0.) h = eps;
-                x[j] = wa2[j] + h;
-            }
-            iflag = Functor(x, wa1);
-            if (iflag < 0)
-                return iflag;
-            for (j = k; (msum<0) ? (j>n): (j<n); j += msum) {
-                x[j] = wa2[j];
-                h = eps * abs(wa2[j]);
-                if (h == 0.) h = eps;
-                fjac.col(j).setZero();
-                start = std::max<Index>(0,j-mu);
-                length = (std::min)(n-1, j+ml) - start + 1;
-                fjac.col(j).segment(start, length) = ( wa1.segment(start, length)-fvec.segment(start, length))/h;
-            }
-        }
+  } else {
+    /* computation of banded approximate jacobian. */
+    for (k = 0; k < msum; ++k) {
+      for (j = k; (msum < 0) ? (j > n) : (j < n); j += msum) {
+        wa2[j] = x[j];
+        h = eps * abs(wa2[j]);
+        if (h == 0.) h = eps;
+        x[j] = wa2[j] + h;
+      }
+      iflag = Functor(x, wa1);
+      if (iflag < 0) return iflag;
+      for (j = k; (msum < 0) ? (j > n) : (j < n); j += msum) {
+        x[j] = wa2[j];
+        h = eps * abs(wa2[j]);
+        if (h == 0.) h = eps;
+        fjac.col(j).setZero();
+        start = std::max<Index>(0, j - mu);
+        length = (std::min)(n - 1, j + ml) - start + 1;
+        fjac.col(j).segment(start, length) = (wa1.segment(start, length) - fvec.segment(start, length)) / h;
+      }
     }
-    return 0;
+  }
+  return 0;
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
diff --git a/inst/include/unsupported/Eigen/src/NonLinearOptimization/lmpar.h b/inst/include/unsupported/Eigen/src/NonLinearOptimization/lmpar.h
index 4c17d4cd..14202012 100644
--- a/inst/include/unsupported/Eigen/src/NonLinearOptimization/lmpar.h
+++ b/inst/include/unsupported/Eigen/src/NonLinearOptimization/lmpar.h
@@ -1,298 +1,265 @@
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 template <typename Scalar>
-void lmpar(
-        Matrix< Scalar, Dynamic, Dynamic > &r,
-        const VectorXi &ipvt,
-        const Matrix< Scalar, Dynamic, 1 >  &diag,
-        const Matrix< Scalar, Dynamic, 1 >  &qtb,
-        Scalar delta,
-        Scalar &par,
-        Matrix< Scalar, Dynamic, 1 >  &x)
-{
-    using std::abs;
-    using std::sqrt;
-    typedef DenseIndex Index;
-
-    /* Local variables */
-    Index i, j, l;
-    Scalar fp;
-    Scalar parc, parl;
-    Index iter;
-    Scalar temp, paru;
-    Scalar gnorm;
-    Scalar dxnorm;
-
-
-    /* Function Body */
-    const Scalar dwarf = (std::numeric_limits<Scalar>::min)();
-    const Index n = r.cols();
-    eigen_assert(n==diag.size());
-    eigen_assert(n==qtb.size());
-    eigen_assert(n==x.size());
-
-    Matrix< Scalar, Dynamic, 1 >  wa1, wa2;
-
-    /* compute and store in x the gauss-newton direction. if the */
-    /* jacobian is rank-deficient, obtain a least squares solution. */
-    Index nsing = n-1;
-    wa1 = qtb;
+void lmpar(Matrix<Scalar, Dynamic, Dynamic> &r, const VectorXi &ipvt, const Matrix<Scalar, Dynamic, 1> &diag,
+           const Matrix<Scalar, Dynamic, 1> &qtb, Scalar delta, Scalar &par, Matrix<Scalar, Dynamic, 1> &x) {
+  using std::abs;
+  using std::sqrt;
+  typedef DenseIndex Index;
+
+  /* Local variables */
+  Index i, j, l;
+  Scalar fp;
+  Scalar parc, parl;
+  Index iter;
+  Scalar temp, paru;
+  Scalar gnorm;
+  Scalar dxnorm;
+
+  /* Function Body */
+  const Scalar dwarf = (std::numeric_limits<Scalar>::min)();
+  const Index n = r.cols();
+  eigen_assert(n == diag.size());
+  eigen_assert(n == qtb.size());
+  eigen_assert(n == x.size());
+
+  Matrix<Scalar, Dynamic, 1> wa1, wa2;
+
+  /* compute and store in x the gauss-newton direction. if the */
+  /* jacobian is rank-deficient, obtain a least squares solution. */
+  Index nsing = n - 1;
+  wa1 = qtb;
+  for (j = 0; j < n; ++j) {
+    if (r(j, j) == 0. && nsing == n - 1) nsing = j - 1;
+    if (nsing < n - 1) wa1[j] = 0.;
+  }
+  for (j = nsing; j >= 0; --j) {
+    wa1[j] /= r(j, j);
+    temp = wa1[j];
+    for (i = 0; i < j; ++i) wa1[i] -= r(i, j) * temp;
+  }
+
+  for (j = 0; j < n; ++j) x[ipvt[j]] = wa1[j];
+
+  /* initialize the iteration counter. */
+  /* evaluate the function at the origin, and test */
+  /* for acceptance of the gauss-newton direction. */
+  iter = 0;
+  wa2 = diag.cwiseProduct(x);
+  dxnorm = wa2.blueNorm();
+  fp = dxnorm - delta;
+  if (fp <= Scalar(0.1) * delta) {
+    par = 0;
+    return;
+  }
+
+  /* if the jacobian is not rank deficient, the newton */
+  /* step provides a lower bound, parl, for the zero of */
+  /* the function. otherwise set this bound to zero. */
+  parl = 0.;
+  if (nsing >= n - 1) {
     for (j = 0; j < n; ++j) {
-        if (r(j,j) == 0. && nsing == n-1)
-            nsing = j - 1;
-        if (nsing < n-1)
-            wa1[j] = 0.;
+      l = ipvt[j];
+      wa1[j] = diag[l] * (wa2[l] / dxnorm);
     }
-    for (j = nsing; j>=0; --j) {
-        wa1[j] /= r(j,j);
-        temp = wa1[j];
-        for (i = 0; i < j ; ++i)
-            wa1[i] -= r(i,j) * temp;
+    // it's actually a triangularView.solveInplace(), though in a weird
+    // way:
+    for (j = 0; j < n; ++j) {
+      Scalar sum = 0.;
+      for (i = 0; i < j; ++i) sum += r(i, j) * wa1[i];
+      wa1[j] = (wa1[j] - sum) / r(j, j);
     }
+    temp = wa1.blueNorm();
+    parl = fp / delta / temp / temp;
+  }
+
+  /* calculate an upper bound, paru, for the zero of the function. */
+  for (j = 0; j < n; ++j) wa1[j] = r.col(j).head(j + 1).dot(qtb.head(j + 1)) / diag[ipvt[j]];
+
+  gnorm = wa1.stableNorm();
+  paru = gnorm / delta;
+  if (paru == 0.) paru = dwarf / (std::min)(delta, Scalar(0.1));
 
-    for (j = 0; j < n; ++j)
-        x[ipvt[j]] = wa1[j];
+  /* if the input par lies outside of the interval (parl,paru), */
+  /* set par to the closer endpoint. */
+  par = (std::max)(par, parl);
+  par = (std::min)(par, paru);
+  if (par == 0.) par = gnorm / dxnorm;
+
+  /* beginning of an iteration. */
+  while (true) {
+    ++iter;
+
+    /* evaluate the function at the current value of par. */
+    if (par == 0.) par = (std::max)(dwarf, Scalar(.001) * paru); /* Computing MAX */
+    wa1 = sqrt(par) * diag;
+
+    Matrix<Scalar, Dynamic, 1> sdiag(n);
+    qrsolv<Scalar>(r, ipvt, wa1, qtb, x, sdiag);
 
-    /* initialize the iteration counter. */
-    /* evaluate the function at the origin, and test */
-    /* for acceptance of the gauss-newton direction. */
-    iter = 0;
     wa2 = diag.cwiseProduct(x);
     dxnorm = wa2.blueNorm();
+    temp = fp;
     fp = dxnorm - delta;
-    if (fp <= Scalar(0.1) * delta) {
-        par = 0;
-        return;
-    }
 
-    /* if the jacobian is not rank deficient, the newton */
-    /* step provides a lower bound, parl, for the zero of */
-    /* the function. otherwise set this bound to zero. */
-    parl = 0.;
-    if (nsing >= n-1) {
-        for (j = 0; j < n; ++j) {
-            l = ipvt[j];
-            wa1[j] = diag[l] * (wa2[l] / dxnorm);
-        }
-        // it's actually a triangularView.solveInplace(), though in a weird
-        // way:
-        for (j = 0; j < n; ++j) {
-            Scalar sum = 0.;
-            for (i = 0; i < j; ++i)
-                sum += r(i,j) * wa1[i];
-            wa1[j] = (wa1[j] - sum) / r(j,j);
-        }
-        temp = wa1.blueNorm();
-        parl = fp / delta / temp / temp;
-    }
+    /* if the function is small enough, accept the current value */
+    /* of par. also test for the exceptional cases where parl */
+    /* is zero or the number of iterations has reached 10. */
+    if (abs(fp) <= Scalar(0.1) * delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10) break;
 
-    /* calculate an upper bound, paru, for the zero of the function. */
-    for (j = 0; j < n; ++j)
-        wa1[j] = r.col(j).head(j+1).dot(qtb.head(j+1)) / diag[ipvt[j]];
-
-    gnorm = wa1.stableNorm();
-    paru = gnorm / delta;
-    if (paru == 0.)
-        paru = dwarf / (std::min)(delta,Scalar(0.1));
-
-    /* if the input par lies outside of the interval (parl,paru), */
-    /* set par to the closer endpoint. */
-    par = (std::max)(par,parl);
-    par = (std::min)(par,paru);
-    if (par == 0.)
-        par = gnorm / dxnorm;
-
-    /* beginning of an iteration. */
-    while (true) {
-        ++iter;
-
-        /* evaluate the function at the current value of par. */
-        if (par == 0.)
-            par = (std::max)(dwarf,Scalar(.001) * paru); /* Computing MAX */
-        wa1 = sqrt(par)* diag;
-
-        Matrix< Scalar, Dynamic, 1 > sdiag(n);
-        qrsolv<Scalar>(r, ipvt, wa1, qtb, x, sdiag);
-
-        wa2 = diag.cwiseProduct(x);
-        dxnorm = wa2.blueNorm();
-        temp = fp;
-        fp = dxnorm - delta;
-
-        /* if the function is small enough, accept the current value */
-        /* of par. also test for the exceptional cases where parl */
-        /* is zero or the number of iterations has reached 10. */
-        if (abs(fp) <= Scalar(0.1) * delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10)
-            break;
-
-        /* compute the newton correction. */
-        for (j = 0; j < n; ++j) {
-            l = ipvt[j];
-            wa1[j] = diag[l] * (wa2[l] / dxnorm);
-        }
-        for (j = 0; j < n; ++j) {
-            wa1[j] /= sdiag[j];
-            temp = wa1[j];
-            for (i = j+1; i < n; ++i)
-                wa1[i] -= r(i,j) * temp;
-        }
-        temp = wa1.blueNorm();
-        parc = fp / delta / temp / temp;
-
-        /* depending on the sign of the function, update parl or paru. */
-        if (fp > 0.)
-            parl = (std::max)(parl,par);
-        if (fp < 0.)
-            paru = (std::min)(paru,par);
-
-        /* compute an improved estimate for par. */
-        /* Computing MAX */
-        par = (std::max)(parl,par+parc);
-
-        /* end of an iteration. */
+    /* compute the newton correction. */
+    for (j = 0; j < n; ++j) {
+      l = ipvt[j];
+      wa1[j] = diag[l] * (wa2[l] / dxnorm);
     }
+    for (j = 0; j < n; ++j) {
+      wa1[j] /= sdiag[j];
+      temp = wa1[j];
+      for (i = j + 1; i < n; ++i) wa1[i] -= r(i, j) * temp;
+    }
+    temp = wa1.blueNorm();
+    parc = fp / delta / temp / temp;
 
-    /* termination. */
-    if (iter == 0)
-        par = 0.;
-    return;
+    /* depending on the sign of the function, update parl or paru. */
+    if (fp > 0.) parl = (std::max)(parl, par);
+    if (fp < 0.) paru = (std::min)(paru, par);
+
+    /* compute an improved estimate for par. */
+    /* Computing MAX */
+    par = (std::max)(parl, par + parc);
+
+    /* end of an iteration. */
+  }
+
+  /* termination. */
+  if (iter == 0) par = 0.;
+  return;
 }
 
 template <typename Scalar>
-void lmpar2(
-        const ColPivHouseholderQR<Matrix< Scalar, Dynamic, Dynamic> > &qr,
-        const Matrix< Scalar, Dynamic, 1 >  &diag,
-        const Matrix< Scalar, Dynamic, 1 >  &qtb,
-        Scalar delta,
-        Scalar &par,
-        Matrix< Scalar, Dynamic, 1 >  &x)
+void lmpar2(const ColPivHouseholderQR<Matrix<Scalar, Dynamic, Dynamic> > &qr, const Matrix<Scalar, Dynamic, 1> &diag,
+            const Matrix<Scalar, Dynamic, 1> &qtb, Scalar delta, Scalar &par, Matrix<Scalar, Dynamic, 1> &x)
 
 {
-    using std::sqrt;
-    using std::abs;
-    typedef DenseIndex Index;
-
-    /* Local variables */
-    Index j;
-    Scalar fp;
-    Scalar parc, parl;
-    Index iter;
-    Scalar temp, paru;
-    Scalar gnorm;
-    Scalar dxnorm;
-
-
-    /* Function Body */
-    const Scalar dwarf = (std::numeric_limits<Scalar>::min)();
-    const Index n = qr.matrixQR().cols();
-    eigen_assert(n==diag.size());
-    eigen_assert(n==qtb.size());
-
-    Matrix< Scalar, Dynamic, 1 >  wa1, wa2;
-
-    /* compute and store in x the gauss-newton direction. if the */
-    /* jacobian is rank-deficient, obtain a least squares solution. */
-
-//    const Index rank = qr.nonzeroPivots(); // exactly double(0.)
-    const Index rank = qr.rank(); // use a threshold
-    wa1 = qtb;
-    wa1.tail(n-rank).setZero();
-    qr.matrixQR().topLeftCorner(rank, rank).template triangularView<Upper>().solveInPlace(wa1.head(rank));
-
-    x = qr.colsPermutation()*wa1;
-
-    /* initialize the iteration counter. */
-    /* evaluate the function at the origin, and test */
-    /* for acceptance of the gauss-newton direction. */
-    iter = 0;
+  using std::abs;
+  using std::sqrt;
+  typedef DenseIndex Index;
+
+  /* Local variables */
+  Index j;
+  Scalar fp;
+  Scalar parc, parl;
+  Index iter;
+  Scalar temp, paru;
+  Scalar gnorm;
+  Scalar dxnorm;
+
+  /* Function Body */
+  const Scalar dwarf = (std::numeric_limits<Scalar>::min)();
+  const Index n = qr.matrixQR().cols();
+  eigen_assert(n == diag.size());
+  eigen_assert(n == qtb.size());
+
+  Matrix<Scalar, Dynamic, 1> wa1, wa2;
+
+  /* compute and store in x the gauss-newton direction. if the */
+  /* jacobian is rank-deficient, obtain a least squares solution. */
+
+  //    const Index rank = qr.nonzeroPivots(); // exactly double(0.)
+  const Index rank = qr.rank();  // use a threshold
+  wa1 = qtb;
+  wa1.tail(n - rank).setZero();
+  qr.matrixQR().topLeftCorner(rank, rank).template triangularView<Upper>().solveInPlace(wa1.head(rank));
+
+  x = qr.colsPermutation() * wa1;
+
+  /* initialize the iteration counter. */
+  /* evaluate the function at the origin, and test */
+  /* for acceptance of the gauss-newton direction. */
+  iter = 0;
+  wa2 = diag.cwiseProduct(x);
+  dxnorm = wa2.blueNorm();
+  fp = dxnorm - delta;
+  if (fp <= Scalar(0.1) * delta) {
+    par = 0;
+    return;
+  }
+
+  /* if the jacobian is not rank deficient, the newton */
+  /* step provides a lower bound, parl, for the zero of */
+  /* the function. otherwise set this bound to zero. */
+  parl = 0.;
+  if (rank == n) {
+    wa1 = qr.colsPermutation().inverse() * diag.cwiseProduct(wa2) / dxnorm;
+    qr.matrixQR().topLeftCorner(n, n).transpose().template triangularView<Lower>().solveInPlace(wa1);
+    temp = wa1.blueNorm();
+    parl = fp / delta / temp / temp;
+  }
+
+  /* calculate an upper bound, paru, for the zero of the function. */
+  for (j = 0; j < n; ++j)
+    wa1[j] = qr.matrixQR().col(j).head(j + 1).dot(qtb.head(j + 1)) / diag[qr.colsPermutation().indices()(j)];
+
+  gnorm = wa1.stableNorm();
+  paru = gnorm / delta;
+  if (paru == 0.) paru = dwarf / (std::min)(delta, Scalar(0.1));
+
+  /* if the input par lies outside of the interval (parl,paru), */
+  /* set par to the closer endpoint. */
+  par = (std::max)(par, parl);
+  par = (std::min)(par, paru);
+  if (par == 0.) par = gnorm / dxnorm;
+
+  /* beginning of an iteration. */
+  Matrix<Scalar, Dynamic, Dynamic> s = qr.matrixQR();
+  while (true) {
+    ++iter;
+
+    /* evaluate the function at the current value of par. */
+    if (par == 0.) par = (std::max)(dwarf, Scalar(.001) * paru); /* Computing MAX */
+    wa1 = sqrt(par) * diag;
+
+    Matrix<Scalar, Dynamic, 1> sdiag(n);
+    qrsolv<Scalar>(s, qr.colsPermutation().indices(), wa1, qtb, x, sdiag);
+
     wa2 = diag.cwiseProduct(x);
     dxnorm = wa2.blueNorm();
+    temp = fp;
     fp = dxnorm - delta;
-    if (fp <= Scalar(0.1) * delta) {
-        par = 0;
-        return;
-    }
 
-    /* if the jacobian is not rank deficient, the newton */
-    /* step provides a lower bound, parl, for the zero of */
-    /* the function. otherwise set this bound to zero. */
-    parl = 0.;
-    if (rank==n) {
-        wa1 = qr.colsPermutation().inverse() *  diag.cwiseProduct(wa2)/dxnorm;
-        qr.matrixQR().topLeftCorner(n, n).transpose().template triangularView<Lower>().solveInPlace(wa1);
-        temp = wa1.blueNorm();
-        parl = fp / delta / temp / temp;
-    }
+    /* if the function is small enough, accept the current value */
+    /* of par. also test for the exceptional cases where parl */
+    /* is zero or the number of iterations has reached 10. */
+    if (abs(fp) <= Scalar(0.1) * delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10) break;
 
-    /* calculate an upper bound, paru, for the zero of the function. */
-    for (j = 0; j < n; ++j)
-        wa1[j] = qr.matrixQR().col(j).head(j+1).dot(qtb.head(j+1)) / diag[qr.colsPermutation().indices()(j)];
-
-    gnorm = wa1.stableNorm();
-    paru = gnorm / delta;
-    if (paru == 0.)
-        paru = dwarf / (std::min)(delta,Scalar(0.1));
-
-    /* if the input par lies outside of the interval (parl,paru), */
-    /* set par to the closer endpoint. */
-    par = (std::max)(par,parl);
-    par = (std::min)(par,paru);
-    if (par == 0.)
-        par = gnorm / dxnorm;
-
-    /* beginning of an iteration. */
-    Matrix< Scalar, Dynamic, Dynamic > s = qr.matrixQR();
-    while (true) {
-        ++iter;
-
-        /* evaluate the function at the current value of par. */
-        if (par == 0.)
-            par = (std::max)(dwarf,Scalar(.001) * paru); /* Computing MAX */
-        wa1 = sqrt(par)* diag;
-
-        Matrix< Scalar, Dynamic, 1 > sdiag(n);
-        qrsolv<Scalar>(s, qr.colsPermutation().indices(), wa1, qtb, x, sdiag);
-
-        wa2 = diag.cwiseProduct(x);
-        dxnorm = wa2.blueNorm();
-        temp = fp;
-        fp = dxnorm - delta;
-
-        /* if the function is small enough, accept the current value */
-        /* of par. also test for the exceptional cases where parl */
-        /* is zero or the number of iterations has reached 10. */
-        if (abs(fp) <= Scalar(0.1) * delta || (parl == 0. && fp <= temp && temp < 0.) || iter == 10)
-            break;
-
-        /* compute the newton correction. */
-        wa1 = qr.colsPermutation().inverse() * diag.cwiseProduct(wa2/dxnorm);
-        // we could almost use this here, but the diagonal is outside qr, in sdiag[]
-        // qr.matrixQR().topLeftCorner(n, n).transpose().template triangularView<Lower>().solveInPlace(wa1);
-        for (j = 0; j < n; ++j) {
-            wa1[j] /= sdiag[j];
-            temp = wa1[j];
-            for (Index i = j+1; i < n; ++i)
-                wa1[i] -= s(i,j) * temp;
-        }
-        temp = wa1.blueNorm();
-        parc = fp / delta / temp / temp;
-
-        /* depending on the sign of the function, update parl or paru. */
-        if (fp > 0.)
-            parl = (std::max)(parl,par);
-        if (fp < 0.)
-            paru = (std::min)(paru,par);
-
-        /* compute an improved estimate for par. */
-        par = (std::max)(parl,par+parc);
+    /* compute the newton correction. */
+    wa1 = qr.colsPermutation().inverse() * diag.cwiseProduct(wa2 / dxnorm);
+    // we could almost use this here, but the diagonal is outside qr, in sdiag[]
+    // qr.matrixQR().topLeftCorner(n, n).transpose().template triangularView<Lower>().solveInPlace(wa1);
+    for (j = 0; j < n; ++j) {
+      wa1[j] /= sdiag[j];
+      temp = wa1[j];
+      for (Index i = j + 1; i < n; ++i) wa1[i] -= s(i, j) * temp;
     }
-    if (iter == 0)
-        par = 0.;
-    return;
+    temp = wa1.blueNorm();
+    parc = fp / delta / temp / temp;
+
+    /* depending on the sign of the function, update parl or paru. */
+    if (fp > 0.) parl = (std::max)(parl, par);
+    if (fp < 0.) paru = (std::min)(paru, par);
+
+    /* compute an improved estimate for par. */
+    par = (std::max)(parl, par + parc);
+  }
+  if (iter == 0) par = 0.;
+  return;
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
diff --git a/inst/include/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h b/inst/include/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
index feafd62a..2e4d0363 100644
--- a/inst/include/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
+++ b/inst/include/unsupported/Eigen/src/NonLinearOptimization/qrsolv.h
@@ -1,91 +1,89 @@
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 // TODO : once qrsolv2 is removed, use ColPivHouseholderQR or PermutationMatrix instead of ipvt
 template <typename Scalar>
-void qrsolv(
-        Matrix< Scalar, Dynamic, Dynamic > &s,
-        // TODO : use a PermutationMatrix once lmpar is no more:
-        const VectorXi &ipvt,
-        const Matrix< Scalar, Dynamic, 1 >  &diag,
-        const Matrix< Scalar, Dynamic, 1 >  &qtb,
-        Matrix< Scalar, Dynamic, 1 >  &x,
-        Matrix< Scalar, Dynamic, 1 >  &sdiag)
+void qrsolv(Matrix<Scalar, Dynamic, Dynamic> &s,
+            // TODO : use a PermutationMatrix once lmpar is no more:
+            const VectorXi &ipvt, const Matrix<Scalar, Dynamic, 1> &diag, const Matrix<Scalar, Dynamic, 1> &qtb,
+            Matrix<Scalar, Dynamic, 1> &x, Matrix<Scalar, Dynamic, 1> &sdiag)
 
 {
-    typedef DenseIndex Index;
-
-    /* Local variables */
-    Index i, j, k, l;
-    Scalar temp;
-    Index n = s.cols();
-    Matrix< Scalar, Dynamic, 1 >  wa(n);
-    JacobiRotation<Scalar> givens;
-
-    /* Function Body */
-    // the following will only change the lower triangular part of s, including
-    // the diagonal, though the diagonal is restored afterward
-
-    /*     copy r and (q transpose)*b to preserve input and initialize s. */
-    /*     in particular, save the diagonal elements of r in x. */
-    x = s.diagonal();
-    wa = qtb;
-
-    s.topLeftCorner(n,n).template triangularView<StrictlyLower>() = s.topLeftCorner(n,n).transpose();
-
-    /*     eliminate the diagonal matrix d using a givens rotation. */
-    for (j = 0; j < n; ++j) {
-
-        /*        prepare the row of d to be eliminated, locating the */
-        /*        diagonal element using p from the qr factorization. */
-        l = ipvt[j];
-        if (diag[l] == 0.)
-            break;
-        sdiag.tail(n-j).setZero();
-        sdiag[j] = diag[l];
-
-        /*        the transformations to eliminate the row of d */
-        /*        modify only a single element of (q transpose)*b */
-        /*        beyond the first n, which is initially zero. */
-        Scalar qtbpj = 0.;
-        for (k = j; k < n; ++k) {
-            /*           determine a givens rotation which eliminates the */
-            /*           appropriate element in the current row of d. */
-            givens.makeGivens(-s(k,k), sdiag[k]);
-
-            /*           compute the modified diagonal element of r and */
-            /*           the modified element of ((q transpose)*b,0). */
-            s(k,k) = givens.c() * s(k,k) + givens.s() * sdiag[k];
-            temp = givens.c() * wa[k] + givens.s() * qtbpj;
-            qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj;
-            wa[k] = temp;
-
-            /*           accumulate the tranformation in the row of s. */
-            for (i = k+1; i<n; ++i) {
-                temp = givens.c() * s(i,k) + givens.s() * sdiag[i];
-                sdiag[i] = -givens.s() * s(i,k) + givens.c() * sdiag[i];
-                s(i,k) = temp;
-            }
-        }
+  typedef DenseIndex Index;
+
+  /* Local variables */
+  Index i, j, k, l;
+  Scalar temp;
+  Index n = s.cols();
+  Matrix<Scalar, Dynamic, 1> wa(n);
+  JacobiRotation<Scalar> givens;
+
+  /* Function Body */
+  // the following will only change the lower triangular part of s, including
+  // the diagonal, though the diagonal is restored afterward
+
+  /*     copy r and (q transpose)*b to preserve input and initialize s. */
+  /*     in particular, save the diagonal elements of r in x. */
+  x = s.diagonal();
+  wa = qtb;
+
+  s.topLeftCorner(n, n).template triangularView<StrictlyLower>() = s.topLeftCorner(n, n).transpose();
+
+  /*     eliminate the diagonal matrix d using a givens rotation. */
+  for (j = 0; j < n; ++j) {
+    /*        prepare the row of d to be eliminated, locating the */
+    /*        diagonal element using p from the qr factorization. */
+    l = ipvt[j];
+    if (diag[l] == 0.) break;
+    sdiag.tail(n - j).setZero();
+    sdiag[j] = diag[l];
+
+    /*        the transformations to eliminate the row of d */
+    /*        modify only a single element of (q transpose)*b */
+    /*        beyond the first n, which is initially zero. */
+    Scalar qtbpj = 0.;
+    for (k = j; k < n; ++k) {
+      /*           determine a givens rotation which eliminates the */
+      /*           appropriate element in the current row of d. */
+      givens.makeGivens(-s(k, k), sdiag[k]);
+
+      /*           compute the modified diagonal element of r and */
+      /*           the modified element of ((q transpose)*b,0). */
+      s(k, k) = givens.c() * s(k, k) + givens.s() * sdiag[k];
+      temp = givens.c() * wa[k] + givens.s() * qtbpj;
+      qtbpj = -givens.s() * wa[k] + givens.c() * qtbpj;
+      wa[k] = temp;
+
+      /*           accumulate the transformation in the row of s. */
+      for (i = k + 1; i < n; ++i) {
+        temp = givens.c() * s(i, k) + givens.s() * sdiag[i];
+        sdiag[i] = -givens.s() * s(i, k) + givens.c() * sdiag[i];
+        s(i, k) = temp;
+      }
     }
+  }
 
-    /*     solve the triangular system for z. if the system is */
-    /*     singular, then obtain a least squares solution. */
-    Index nsing;
-    for(nsing=0; nsing<n && sdiag[nsing]!=0; nsing++) {}
+  /*     solve the triangular system for z. if the system is */
+  /*     singular, then obtain a least squares solution. */
+  Index nsing;
+  for (nsing = 0; nsing < n && sdiag[nsing] != 0; nsing++) {
+  }
 
-    wa.tail(n-nsing).setZero();
-    s.topLeftCorner(nsing, nsing).transpose().template triangularView<Upper>().solveInPlace(wa.head(nsing));
+  wa.tail(n - nsing).setZero();
+  s.topLeftCorner(nsing, nsing).transpose().template triangularView<Upper>().solveInPlace(wa.head(nsing));
 
-    // restore
-    sdiag = s.diagonal();
-    s.diagonal() = x;
+  // restore
+  sdiag = s.diagonal();
+  s.diagonal() = x;
 
-    /*     permute the components of z back to components of x. */
-    for (j = 0; j < n; ++j) x[ipvt[j]] = wa[j];
+  /*     permute the components of z back to components of x. */
+  for (j = 0; j < n; ++j) x[ipvt[j]] = wa[j];
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
diff --git a/inst/include/unsupported/Eigen/src/NonLinearOptimization/r1mpyq.h b/inst/include/unsupported/Eigen/src/NonLinearOptimization/r1mpyq.h
index 36ff700e..ea413159 100644
--- a/inst/include/unsupported/Eigen/src/NonLinearOptimization/r1mpyq.h
+++ b/inst/include/unsupported/Eigen/src/NonLinearOptimization/r1mpyq.h
@@ -1,30 +1,33 @@
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 // TODO : move this to GivensQR once there's such a thing in Eigen
 
 template <typename Scalar>
-void r1mpyq(DenseIndex m, DenseIndex n, Scalar *a, const std::vector<JacobiRotation<Scalar> > &v_givens, const std::vector<JacobiRotation<Scalar> > &w_givens)
-{
-    typedef DenseIndex Index;
+void r1mpyq(DenseIndex m, DenseIndex n, Scalar *a, const std::vector<JacobiRotation<Scalar> > &v_givens,
+            const std::vector<JacobiRotation<Scalar> > &w_givens) {
+  typedef DenseIndex Index;
 
-    /*     apply the first set of givens rotations to a. */
-    for (Index j = n-2; j>=0; --j)
-        for (Index i = 0; i<m; ++i) {
-            Scalar temp = v_givens[j].c() * a[i+m*j] - v_givens[j].s() * a[i+m*(n-1)];
-            a[i+m*(n-1)] = v_givens[j].s() * a[i+m*j] + v_givens[j].c() * a[i+m*(n-1)];
-            a[i+m*j] = temp;
-        }
-    /*     apply the second set of givens rotations to a. */
-    for (Index j = 0; j<n-1; ++j)
-        for (Index i = 0; i<m; ++i) {
-            Scalar temp = w_givens[j].c() * a[i+m*j] + w_givens[j].s() * a[i+m*(n-1)];
-            a[i+m*(n-1)] = -w_givens[j].s() * a[i+m*j] + w_givens[j].c() * a[i+m*(n-1)];
-            a[i+m*j] = temp;
-        }
+  /*     apply the first set of givens rotations to a. */
+  for (Index j = n - 2; j >= 0; --j)
+    for (Index i = 0; i < m; ++i) {
+      Scalar temp = v_givens[j].c() * a[i + m * j] - v_givens[j].s() * a[i + m * (n - 1)];
+      a[i + m * (n - 1)] = v_givens[j].s() * a[i + m * j] + v_givens[j].c() * a[i + m * (n - 1)];
+      a[i + m * j] = temp;
+    }
+  /*     apply the second set of givens rotations to a. */
+  for (Index j = 0; j < n - 1; ++j)
+    for (Index i = 0; i < m; ++i) {
+      Scalar temp = w_givens[j].c() * a[i + m * j] + w_givens[j].s() * a[i + m * (n - 1)];
+      a[i + m * (n - 1)] = -w_givens[j].s() * a[i + m * j] + w_givens[j].c() * a[i + m * (n - 1)];
+      a[i + m * j] = temp;
+    }
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
diff --git a/inst/include/unsupported/Eigen/src/NonLinearOptimization/r1updt.h b/inst/include/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
index f2876606..201fba35 100644
--- a/inst/include/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
+++ b/inst/include/unsupported/Eigen/src/NonLinearOptimization/r1updt.h
@@ -1,99 +1,96 @@
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 template <typename Scalar>
-void r1updt(
-        Matrix< Scalar, Dynamic, Dynamic > &s,
-        const Matrix< Scalar, Dynamic, 1> &u,
-        std::vector<JacobiRotation<Scalar> > &v_givens,
-        std::vector<JacobiRotation<Scalar> > &w_givens,
-        Matrix< Scalar, Dynamic, 1> &v,
-        Matrix< Scalar, Dynamic, 1> &w,
-        bool *sing)
-{
-    typedef DenseIndex Index;
-    const JacobiRotation<Scalar> IdentityRotation = JacobiRotation<Scalar>(1,0);
-
-    /* Local variables */
-    const Index m = s.rows();
-    const Index n = s.cols();
-    Index i, j=1;
-    Scalar temp;
-    JacobiRotation<Scalar> givens;
-
-    // r1updt had a broader usecase, but we dont use it here. And, more
-    // importantly, we can not test it.
-    eigen_assert(m==n);
-    eigen_assert(u.size()==m);
-    eigen_assert(v.size()==n);
-    eigen_assert(w.size()==n);
-
-    /* move the nontrivial part of the last column of s into w. */
-    w[n-1] = s(n-1,n-1);
-
-    /* rotate the vector v into a multiple of the n-th unit vector */
-    /* in such a way that a spike is introduced into w. */
-    for (j=n-2; j>=0; --j) {
-        w[j] = 0.;
-        if (v[j] != 0.) {
-            /* determine a givens rotation which eliminates the */
-            /* j-th element of v. */
-            givens.makeGivens(-v[n-1], v[j]);
-
-            /* apply the transformation to v and store the information */
-            /* necessary to recover the givens rotation. */
-            v[n-1] = givens.s() * v[j] + givens.c() * v[n-1];
-            v_givens[j] = givens;
-
-            /* apply the transformation to s and extend the spike in w. */
-            for (i = j; i < m; ++i) {
-                temp = givens.c() * s(j,i) - givens.s() * w[i];
-                w[i] = givens.s() * s(j,i) + givens.c() * w[i];
-                s(j,i) = temp;
-            }
-        } else
-            v_givens[j] = IdentityRotation;
-    }
-
-    /* add the spike from the rank 1 update to w. */
-    w += v[n-1] * u;
-
-    /* eliminate the spike. */
-    *sing = false;
-    for (j = 0; j < n-1; ++j) {
-        if (w[j] != 0.) {
-            /* determine a givens rotation which eliminates the */
-            /* j-th element of the spike. */
-            givens.makeGivens(-s(j,j), w[j]);
-
-            /* apply the transformation to s and reduce the spike in w. */
-            for (i = j; i < m; ++i) {
-                temp = givens.c() * s(j,i) + givens.s() * w[i];
-                w[i] = -givens.s() * s(j,i) + givens.c() * w[i];
-                s(j,i) = temp;
-            }
-
-            /* store the information necessary to recover the */
-            /* givens rotation. */
-            w_givens[j] = givens;
-        } else
-            v_givens[j] = IdentityRotation;
-
-        /* test for zero diagonal elements in the output s. */
-        if (s(j,j) == 0.) {
-            *sing = true;
-        }
-    }
-    /* move w back into the last column of the output s. */
-    s(n-1,n-1) = w[n-1];
-
-    if (s(j,j) == 0.) {
-        *sing = true;
+void r1updt(Matrix<Scalar, Dynamic, Dynamic> &s, const Matrix<Scalar, Dynamic, 1> &u,
+            std::vector<JacobiRotation<Scalar> > &v_givens, std::vector<JacobiRotation<Scalar> > &w_givens,
+            Matrix<Scalar, Dynamic, 1> &v, Matrix<Scalar, Dynamic, 1> &w, bool *sing) {
+  typedef DenseIndex Index;
+  const JacobiRotation<Scalar> IdentityRotation = JacobiRotation<Scalar>(1, 0);
+
+  /* Local variables */
+  const Index m = s.rows();
+  const Index n = s.cols();
+  Index i, j = 1;
+  Scalar temp;
+  JacobiRotation<Scalar> givens;
+
+  // r1updt had a broader usecase, but we don't use it here. And, more
+  // importantly, we can not test it.
+  eigen_assert(m == n);
+  eigen_assert(u.size() == m);
+  eigen_assert(v.size() == n);
+  eigen_assert(w.size() == n);
+
+  /* move the nontrivial part of the last column of s into w. */
+  w[n - 1] = s(n - 1, n - 1);
+
+  /* rotate the vector v into a multiple of the n-th unit vector */
+  /* in such a way that a spike is introduced into w. */
+  for (j = n - 2; j >= 0; --j) {
+    w[j] = 0.;
+    if (v[j] != 0.) {
+      /* determine a givens rotation which eliminates the */
+      /* j-th element of v. */
+      givens.makeGivens(-v[n - 1], v[j]);
+
+      /* apply the transformation to v and store the information */
+      /* necessary to recover the givens rotation. */
+      v[n - 1] = givens.s() * v[j] + givens.c() * v[n - 1];
+      v_givens[j] = givens;
+
+      /* apply the transformation to s and extend the spike in w. */
+      for (i = j; i < m; ++i) {
+        temp = givens.c() * s(j, i) - givens.s() * w[i];
+        w[i] = givens.s() * s(j, i) + givens.c() * w[i];
+        s(j, i) = temp;
+      }
+    } else
+      v_givens[j] = IdentityRotation;
+  }
+
+  /* add the spike from the rank 1 update to w. */
+  w += v[n - 1] * u;
+
+  /* eliminate the spike. */
+  *sing = false;
+  for (j = 0; j < n - 1; ++j) {
+    if (w[j] != 0.) {
+      /* determine a givens rotation which eliminates the */
+      /* j-th element of the spike. */
+      givens.makeGivens(-s(j, j), w[j]);
+
+      /* apply the transformation to s and reduce the spike in w. */
+      for (i = j; i < m; ++i) {
+        temp = givens.c() * s(j, i) + givens.s() * w[i];
+        w[i] = -givens.s() * s(j, i) + givens.c() * w[i];
+        s(j, i) = temp;
+      }
+
+      /* store the information necessary to recover the */
+      /* givens rotation. */
+      w_givens[j] = givens;
+    } else
+      w_givens[j] = IdentityRotation;
+
+    /* test for zero diagonal elements in the output s. */
+    if (s(j, j) == 0.) {
+      *sing = true;
     }
-    return;
+  }
+  /* move w back into the last column of the output s. */
+  s(n - 1, n - 1) = w[n - 1];
+
+  if (s(j, j) == 0.) {
+    *sing = true;
+  }
+  return;
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
diff --git a/inst/include/unsupported/Eigen/src/NonLinearOptimization/rwupdt.h b/inst/include/unsupported/Eigen/src/NonLinearOptimization/rwupdt.h
index 6ebf8563..8ae58487 100644
--- a/inst/include/unsupported/Eigen/src/NonLinearOptimization/rwupdt.h
+++ b/inst/include/unsupported/Eigen/src/NonLinearOptimization/rwupdt.h
@@ -1,49 +1,47 @@
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 template <typename Scalar>
-void rwupdt(
-        Matrix< Scalar, Dynamic, Dynamic >  &r,
-        const Matrix< Scalar, Dynamic, 1>  &w,
-        Matrix< Scalar, Dynamic, 1>  &b,
-        Scalar alpha)
-{
-    typedef DenseIndex Index;
-
-    const Index n = r.cols();
-    eigen_assert(r.rows()>=n);
-    std::vector<JacobiRotation<Scalar> > givens(n);
-
-    /* Local variables */
-    Scalar temp, rowj;
-
-    /* Function Body */
-    for (Index j = 0; j < n; ++j) {
-        rowj = w[j];
-
-        /* apply the previous transformations to */
-        /* r(i,j), i=0,1,...,j-1, and to w(j). */
-        for (Index i = 0; i < j; ++i) {
-            temp = givens[i].c() * r(i,j) + givens[i].s() * rowj;
-            rowj = -givens[i].s() * r(i,j) + givens[i].c() * rowj;
-            r(i,j) = temp;
-        }
-
-        /* determine a givens rotation which eliminates w(j). */
-        givens[j].makeGivens(-r(j,j), rowj);
-
-        if (rowj == 0.)
-            continue; // givens[j] is identity
-
-        /* apply the current transformation to r(j,j), b(j), and alpha. */
-        r(j,j) = givens[j].c() * r(j,j) + givens[j].s() * rowj;
-        temp = givens[j].c() * b[j] + givens[j].s() * alpha;
-        alpha = -givens[j].s() * b[j] + givens[j].c() * alpha;
-        b[j] = temp;
+void rwupdt(Matrix<Scalar, Dynamic, Dynamic> &r, const Matrix<Scalar, Dynamic, 1> &w, Matrix<Scalar, Dynamic, 1> &b,
+            Scalar alpha) {
+  typedef DenseIndex Index;
+
+  const Index n = r.cols();
+  eigen_assert(r.rows() >= n);
+  std::vector<JacobiRotation<Scalar> > givens(n);
+
+  /* Local variables */
+  Scalar temp, rowj;
+
+  /* Function Body */
+  for (Index j = 0; j < n; ++j) {
+    rowj = w[j];
+
+    /* apply the previous transformations to */
+    /* r(i,j), i=0,1,...,j-1, and to w(j). */
+    for (Index i = 0; i < j; ++i) {
+      temp = givens[i].c() * r(i, j) + givens[i].s() * rowj;
+      rowj = -givens[i].s() * r(i, j) + givens[i].c() * rowj;
+      r(i, j) = temp;
     }
+
+    /* determine a givens rotation which eliminates w(j). */
+    givens[j].makeGivens(-r(j, j), rowj);
+
+    if (rowj == 0.) continue;  // givens[j] is identity
+
+    /* apply the current transformation to r(j,j), b(j), and alpha. */
+    r(j, j) = givens[j].c() * r(j, j) + givens[j].s() * rowj;
+    temp = givens[j].c() * b[j] + givens[j].s() * alpha;
+    alpha = -givens[j].s() * b[j] + givens[j].c() * alpha;
+    b[j] = temp;
+  }
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
diff --git a/inst/include/unsupported/Eigen/src/NumericalDiff/InternalHeaderCheck.h b/inst/include/unsupported/Eigen/src/NumericalDiff/InternalHeaderCheck.h
new file mode 100644
index 00000000..8c513d20
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/NumericalDiff/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_NUMERICALDIFF_MODULE_H
+#error "Please include unsupported/Eigen/NumericalDiff instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/unsupported/Eigen/src/NumericalDiff/NumericalDiff.h b/inst/include/unsupported/Eigen/src/NumericalDiff/NumericalDiff.h
index ea5d8bc2..1f552636 100644
--- a/inst/include/unsupported/Eigen/src/NumericalDiff/NumericalDiff.h
+++ b/inst/include/unsupported/Eigen/src/NumericalDiff/NumericalDiff.h
@@ -13,118 +13,115 @@
 #ifndef EIGEN_NUMERICAL_DIFF_H
 #define EIGEN_NUMERICAL_DIFF_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-enum NumericalDiffMode {
-    Forward,
-    Central
-};
+namespace Eigen {
 
+enum NumericalDiffMode { Forward, Central };
 
 /**
-  * This class allows you to add a method df() to your functor, which will 
-  * use numerical differentiation to compute an approximate of the
-  * derivative for the functor. Of course, if you have an analytical form
-  * for the derivative, you should rather implement df() by yourself.
-  *
-  * More information on
-  * http://en.wikipedia.org/wiki/Numerical_differentiation
-  *
-  * Currently only "Forward" and "Central" scheme are implemented.
-  */
-template<typename _Functor, NumericalDiffMode mode=Forward>
-class NumericalDiff : public _Functor
-{
-public:
-    typedef _Functor Functor;
-    typedef typename Functor::Scalar Scalar;
-    typedef typename Functor::InputType InputType;
-    typedef typename Functor::ValueType ValueType;
-    typedef typename Functor::JacobianType JacobianType;
-
-    NumericalDiff(Scalar _epsfcn=0.) : Functor(), epsfcn(_epsfcn) {}
-    NumericalDiff(const Functor& f, Scalar _epsfcn=0.) : Functor(f), epsfcn(_epsfcn) {}
-
-    // forward constructors
-    template<typename T0>
-        NumericalDiff(const T0& a0) : Functor(a0), epsfcn(0) {}
-    template<typename T0, typename T1>
-        NumericalDiff(const T0& a0, const T1& a1) : Functor(a0, a1), epsfcn(0) {}
-    template<typename T0, typename T1, typename T2>
-        NumericalDiff(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2), epsfcn(0) {}
-
-    enum {
-        InputsAtCompileTime = Functor::InputsAtCompileTime,
-        ValuesAtCompileTime = Functor::ValuesAtCompileTime
+ * This class allows you to add a method df() to your functor, which will
+ * use numerical differentiation to compute an approximate of the
+ * derivative for the functor. Of course, if you have an analytical form
+ * for the derivative, you should rather implement df() by yourself.
+ *
+ * More information on
+ * http://en.wikipedia.org/wiki/Numerical_differentiation
+ *
+ * Currently only "Forward" and "Central" scheme are implemented.
+ */
+template <typename Functor_, NumericalDiffMode mode = Forward>
+class NumericalDiff : public Functor_ {
+ public:
+  typedef Functor_ Functor;
+  typedef typename Functor::Scalar Scalar;
+  typedef typename Functor::InputType InputType;
+  typedef typename Functor::ValueType ValueType;
+  typedef typename Functor::JacobianType JacobianType;
+
+  NumericalDiff(Scalar _epsfcn = 0.) : Functor(), epsfcn(_epsfcn) {}
+  NumericalDiff(const Functor& f, Scalar _epsfcn = 0.) : Functor(f), epsfcn(_epsfcn) {}
+
+  // forward constructors
+  template <typename T0>
+  NumericalDiff(const T0& a0) : Functor(a0), epsfcn(0) {}
+  template <typename T0, typename T1>
+  NumericalDiff(const T0& a0, const T1& a1) : Functor(a0, a1), epsfcn(0) {}
+  template <typename T0, typename T1, typename T2>
+  NumericalDiff(const T0& a0, const T1& a1, const T2& a2) : Functor(a0, a1, a2), epsfcn(0) {}
+
+  enum { InputsAtCompileTime = Functor::InputsAtCompileTime, ValuesAtCompileTime = Functor::ValuesAtCompileTime };
+
+  /**
+   * return the number of evaluation of functor
+   */
+  int df(const InputType& _x, JacobianType& jac) const {
+    using std::abs;
+    using std::sqrt;
+    /* Local variables */
+    Scalar h;
+    int nfev = 0;
+    const typename InputType::Index n = _x.size();
+    const Scalar eps = sqrt(((std::max)(epsfcn, NumTraits<Scalar>::epsilon())));
+    ValueType val1, val2;
+    InputType x = _x;
+    // TODO : we should do this only if the size is not already known
+    val1.resize(Functor::values());
+    val2.resize(Functor::values());
+
+    // initialization
+    switch (mode) {
+      case Forward:
+        // compute f(x)
+        Functor::operator()(x, val1);
+        nfev++;
+        break;
+      case Central:
+        // do nothing
+        break;
+      default:
+        eigen_assert(false);
     };
 
-    /**
-      * return the number of evaluation of functor
-     */
-    int df(const InputType& _x, JacobianType &jac) const
-    {
-        using std::sqrt;
-        using std::abs;
-        /* Local variables */
-        Scalar h;
-        int nfev=0;
-        const typename InputType::Index n = _x.size();
-        const Scalar eps = sqrt(((std::max)(epsfcn,NumTraits<Scalar>::epsilon() )));
-        ValueType val1, val2;
-        InputType x = _x;
-        // TODO : we should do this only if the size is not already known
-        val1.resize(Functor::values());
-        val2.resize(Functor::values());
-
-        // initialization
-        switch(mode) {
-            case Forward:
-                // compute f(x)
-                Functor::operator()(x, val1); nfev++;
-                break;
-            case Central:
-                // do nothing
-                break;
-            default:
-                eigen_assert(false);
-        };
-
-        // Function Body
-        for (int j = 0; j < n; ++j) {
-            h = eps * abs(x[j]);
-            if (h == 0.) {
-                h = eps;
-            }
-            switch(mode) {
-                case Forward:
-                    x[j] += h;
-                    Functor::operator()(x, val2);
-                    nfev++;
-                    x[j] = _x[j];
-                    jac.col(j) = (val2-val1)/h;
-                    break;
-                case Central:
-                    x[j] += h;
-                    Functor::operator()(x, val2); nfev++;
-                    x[j] -= 2*h;
-                    Functor::operator()(x, val1); nfev++;
-                    x[j] = _x[j];
-                    jac.col(j) = (val2-val1)/(2*h);
-                    break;
-                default:
-                    eigen_assert(false);
-            };
-        }
-        return nfev;
+    // Function Body
+    for (int j = 0; j < n; ++j) {
+      h = eps * abs(x[j]);
+      if (h == 0.) {
+        h = eps;
+      }
+      switch (mode) {
+        case Forward:
+          x[j] += h;
+          Functor::operator()(x, val2);
+          nfev++;
+          x[j] = _x[j];
+          jac.col(j) = (val2 - val1) / h;
+          break;
+        case Central:
+          x[j] += h;
+          Functor::operator()(x, val2);
+          nfev++;
+          x[j] -= 2 * h;
+          Functor::operator()(x, val1);
+          nfev++;
+          x[j] = _x[j];
+          jac.col(j) = (val2 - val1) / (2 * h);
+          break;
+        default:
+          eigen_assert(false);
+      };
     }
-private:
-    Scalar epsfcn;
+    return nfev;
+  }
 
-    NumericalDiff& operator=(const NumericalDiff&);
-};
+ private:
+  Scalar epsfcn;
 
-} // end namespace Eigen
+  NumericalDiff& operator=(const NumericalDiff&);
+};
 
-//vim: ai ts=4 sts=4 et sw=4
-#endif // EIGEN_NUMERICAL_DIFF_H
+}  // end namespace Eigen
 
+// vim: ai ts=4 sts=4 et sw=4
+#endif  // EIGEN_NUMERICAL_DIFF_H
diff --git a/inst/include/unsupported/Eigen/src/Polynomials/Companion.h b/inst/include/unsupported/Eigen/src/Polynomials/Companion.h
index b515c292..f0809966 100644
--- a/inst/include/unsupported/Eigen/src/Polynomials/Companion.h
+++ b/inst/include/unsupported/Eigen/src/Polynomials/Companion.h
@@ -14,263 +14,239 @@
 // * Eigen/Core
 // * Eigen/src/PolynomialSolver.h
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 namespace internal {
 
 #ifndef EIGEN_PARSED_BY_DOXYGEN
 
-template <typename T>
-T radix(){ return 2; }
-
-template <typename T>
-T radix2(){ return radix<T>()*radix<T>(); }
-
-template<int Size>
-struct decrement_if_fixed_size
-{
-  enum {
-    ret = (Size == Dynamic) ? Dynamic : Size-1 };
+template <int Size>
+struct decrement_if_fixed_size {
+  enum { ret = (Size == Dynamic) ? Dynamic : Size - 1 };
 };
 
 #endif
 
-template< typename _Scalar, int _Deg >
-class companion
-{
-  public:
-    EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Deg==Dynamic ? Dynamic : _Deg)
-
-    enum {
-      Deg = _Deg,
-      Deg_1=decrement_if_fixed_size<Deg>::ret
-    };
-
-    typedef _Scalar                                Scalar;
-    typedef typename NumTraits<Scalar>::Real       RealScalar;
-    typedef Matrix<Scalar, Deg, 1>                 RightColumn;
-    //typedef DiagonalMatrix< Scalar, Deg_1, Deg_1 > BottomLeftDiagonal;
-    typedef Matrix<Scalar, Deg_1, 1>               BottomLeftDiagonal;
-
-    typedef Matrix<Scalar, Deg, Deg>               DenseCompanionMatrixType;
-    typedef Matrix< Scalar, _Deg, Deg_1 >          LeftBlock;
-    typedef Matrix< Scalar, Deg_1, Deg_1 >         BottomLeftBlock;
-    typedef Matrix< Scalar, 1, Deg_1 >             LeftBlockFirstRow;
-
-    typedef DenseIndex Index;
-
-  public:
-    EIGEN_STRONG_INLINE const _Scalar operator()(Index row, Index col ) const
-    {
-      if( m_bl_diag.rows() > col )
-      {
-        if( 0 < row ){ return m_bl_diag[col]; }
-        else{ return 0; }
+template <typename Scalar_, int Deg_>
+class companion {
+ public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar_, Deg_ == Dynamic ? Dynamic : Deg_)
+
+  enum { Deg = Deg_, Deg_1 = decrement_if_fixed_size<Deg>::ret };
+
+  typedef Scalar_ Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef Matrix<Scalar, Deg, 1> RightColumn;
+  // typedef DiagonalMatrix< Scalar, Deg_1, Deg_1 > BottomLeftDiagonal;
+  typedef Matrix<Scalar, Deg_1, 1> BottomLeftDiagonal;
+
+  typedef Matrix<Scalar, Deg, Deg> DenseCompanionMatrixType;
+  typedef Matrix<Scalar, Deg_, Deg_1> LeftBlock;
+  typedef Matrix<Scalar, Deg_1, Deg_1> BottomLeftBlock;
+  typedef Matrix<Scalar, 1, Deg_1> LeftBlockFirstRow;
+
+  typedef DenseIndex Index;
+
+ public:
+  EIGEN_STRONG_INLINE const Scalar_ operator()(Index row, Index col) const {
+    if (m_bl_diag.rows() > col) {
+      if (0 < row) {
+        return m_bl_diag[col];
+      } else {
+        return 0;
       }
-      else{ return m_monic[row]; }
-    }
-
-  public:
-    template<typename VectorType>
-    void setPolynomial( const VectorType& poly )
-    {
-      const Index deg = poly.size()-1;
-      m_monic = -1/poly[deg] * poly.head(deg);
-      //m_bl_diag.setIdentity( deg-1 );
-      m_bl_diag.setOnes(deg-1);
+    } else {
+      return m_monic[row];
     }
+  }
 
-    template<typename VectorType>
-    companion( const VectorType& poly ){
-      setPolynomial( poly ); }
-
-  public:
-    DenseCompanionMatrixType denseMatrix() const
-    {
-      const Index deg   = m_monic.size();
-      const Index deg_1 = deg-1;
-      DenseCompanionMatrixType companion(deg,deg);
-      companion <<
-        ( LeftBlock(deg,deg_1)
-          << LeftBlockFirstRow::Zero(1,deg_1),
-          BottomLeftBlock::Identity(deg-1,deg-1)*m_bl_diag.asDiagonal() ).finished()
-        , m_monic;
-      return companion;
-    }
-
-
-
-  protected:
-    /** Helper function for the balancing algorithm.
-     * \returns true if the row and the column, having colNorm and rowNorm
-     * as norms, are balanced, false otherwise.
-     * colB and rowB are repectively the multipliers for
-     * the column and the row in order to balance them.
-     * */
-    bool balanced( Scalar colNorm, Scalar rowNorm,
-        bool& isBalanced, Scalar& colB, Scalar& rowB );
+ public:
+  template <typename VectorType>
+  void setPolynomial(const VectorType& poly) {
+    const Index deg = poly.size() - 1;
+    m_monic = -poly.head(deg) / poly[deg];
+    m_bl_diag.setOnes(deg - 1);
+  }
 
-    /** Helper function for the balancing algorithm.
-     * \returns true if the row and the column, having colNorm and rowNorm
-     * as norms, are balanced, false otherwise.
-     * colB and rowB are repectively the multipliers for
-     * the column and the row in order to balance them.
-     * */
-    bool balancedR( Scalar colNorm, Scalar rowNorm,
-        bool& isBalanced, Scalar& colB, Scalar& rowB );
+  template <typename VectorType>
+  companion(const VectorType& poly) {
+    setPolynomial(poly);
+  }
 
-  public:
-    /**
-     * Balancing algorithm from B. N. PARLETT and C. REINSCH (1969)
-     * "Balancing a matrix for calculation of eigenvalues and eigenvectors"
-     * adapted to the case of companion matrices.
-     * A matrix with non zero row and non zero column is balanced
-     * for a certain norm if the i-th row and the i-th column
-     * have same norm for all i.
-     */
-    void balance();
+ public:
+  DenseCompanionMatrixType denseMatrix() const {
+    const Index deg = m_monic.size();
+    const Index deg_1 = deg - 1;
+    DenseCompanionMatrixType companMat(deg, deg);
+    companMat << (LeftBlock(deg, deg_1) << LeftBlockFirstRow::Zero(1, deg_1),
+                  BottomLeftBlock::Identity(deg - 1, deg - 1) * m_bl_diag.asDiagonal())
+                     .finished(),
+        m_monic;
+    return companMat;
+  }
 
-  protected:
-      RightColumn                m_monic;
-      BottomLeftDiagonal         m_bl_diag;
+ protected:
+  /** Helper function for the balancing algorithm.
+   * \returns true if the row and the column, having colNorm and rowNorm
+   * as norms, are balanced, false otherwise.
+   * colB and rowB are respectively the multipliers for
+   * the column and the row in order to balance them.
+   * */
+  bool balanced(RealScalar colNorm, RealScalar rowNorm, bool& isBalanced, RealScalar& colB, RealScalar& rowB);
+
+  /** Helper function for the balancing algorithm.
+   * \returns true if the row and the column, having colNorm and rowNorm
+   * as norms, are balanced, false otherwise.
+   * colB and rowB are respectively the multipliers for
+   * the column and the row in order to balance them.
+   * */
+  bool balancedR(RealScalar colNorm, RealScalar rowNorm, bool& isBalanced, RealScalar& colB, RealScalar& rowB);
+
+ public:
+  /**
+   * Balancing algorithm from B. N. PARLETT and C. REINSCH (1969)
+   * "Balancing a matrix for calculation of eigenvalues and eigenvectors"
+   * adapted to the case of companion matrices.
+   * A matrix with non zero row and non zero column is balanced
+   * for a certain norm if the i-th row and the i-th column
+   * have same norm for all i.
+   */
+  void balance();
+
+ protected:
+  RightColumn m_monic;
+  BottomLeftDiagonal m_bl_diag;
 };
 
-
-
-template< typename _Scalar, int _Deg >
-inline
-bool companion<_Scalar,_Deg>::balanced( Scalar colNorm, Scalar rowNorm,
-    bool& isBalanced, Scalar& colB, Scalar& rowB )
-{
-  if( Scalar(0) == colNorm || Scalar(0) == rowNorm ){ return true; }
-  else
-  {
-    //To find the balancing coefficients, if the radix is 2,
-    //one finds \f$ \sigma \f$ such that
-    // \f$ 2^{2\sigma-1} < rowNorm / colNorm \le 2^{2\sigma+1} \f$
-    // then the balancing coefficient for the row is \f$ 1/2^{\sigma} \f$
-    // and the balancing coefficient for the column is \f$ 2^{\sigma} \f$
-    rowB = rowNorm / radix<Scalar>();
-    colB = Scalar(1);
-    const Scalar s = colNorm + rowNorm;
-
-    while (colNorm < rowB)
-    {
-      colB *= radix<Scalar>();
-      colNorm *= radix2<Scalar>();
+template <typename Scalar_, int Deg_>
+inline bool companion<Scalar_, Deg_>::balanced(RealScalar colNorm, RealScalar rowNorm, bool& isBalanced,
+                                               RealScalar& colB, RealScalar& rowB) {
+  if (RealScalar(0) == colNorm || RealScalar(0) == rowNorm || !(numext::isfinite)(colNorm) ||
+      !(numext::isfinite)(rowNorm)) {
+    return true;
+  } else {
+    // To find the balancing coefficients, if the radix is 2,
+    // one finds \f$ \sigma \f$ such that
+    //  \f$ 2^{2\sigma-1} < rowNorm / colNorm \le 2^{2\sigma+1} \f$
+    //  then the balancing coefficient for the row is \f$ 1/2^{\sigma} \f$
+    //  and the balancing coefficient for the column is \f$ 2^{\sigma} \f$
+    const RealScalar radix = RealScalar(2);
+    const RealScalar radix2 = RealScalar(4);
+
+    rowB = rowNorm / radix;
+    colB = RealScalar(1);
+    const RealScalar s = colNorm + rowNorm;
+
+    // Find sigma s.t. rowNorm / 2 <= 2^(2*sigma) * colNorm
+    RealScalar scout = colNorm;
+    while (scout < rowB) {
+      colB *= radix;
+      scout *= radix2;
     }
 
-    rowB = rowNorm * radix<Scalar>();
-
-    while (colNorm >= rowB)
-    {
-      colB /= radix<Scalar>();
-      colNorm /= radix2<Scalar>();
+    // We now have an upper-bound for sigma, try to lower it.
+    // Find sigma s.t. 2^(2*sigma) * colNorm / 2 < rowNorm
+    scout = colNorm * (colB / radix) * colB;  // Avoid overflow.
+    while (scout >= rowNorm) {
+      colB /= radix;
+      scout /= radix2;
     }
 
-    //This line is used to avoid insubstantial balancing
-    if ((rowNorm + colNorm) < Scalar(0.95) * s * colB)
-    {
+    // This line is used to avoid insubstantial balancing.
+    if ((rowNorm + radix * scout) < RealScalar(0.95) * s * colB) {
       isBalanced = false;
-      rowB = Scalar(1) / colB;
+      rowB = RealScalar(1) / colB;
       return false;
+    } else {
+      return true;
     }
-    else{
-      return true; }
   }
 }
 
-template< typename _Scalar, int _Deg >
-inline
-bool companion<_Scalar,_Deg>::balancedR( Scalar colNorm, Scalar rowNorm,
-    bool& isBalanced, Scalar& colB, Scalar& rowB )
-{
-  if( Scalar(0) == colNorm || Scalar(0) == rowNorm ){ return true; }
-  else
-  {
+template <typename Scalar_, int Deg_>
+inline bool companion<Scalar_, Deg_>::balancedR(RealScalar colNorm, RealScalar rowNorm, bool& isBalanced,
+                                                RealScalar& colB, RealScalar& rowB) {
+  if (RealScalar(0) == colNorm || RealScalar(0) == rowNorm) {
+    return true;
+  } else {
     /**
      * Set the norm of the column and the row to the geometric mean
      * of the row and column norm
      */
-    const _Scalar q = colNorm/rowNorm;
-    if( !isApprox( q, _Scalar(1) ) )
-    {
-      rowB = sqrt( colNorm/rowNorm );
-      colB = Scalar(1)/rowB;
+    const RealScalar q = colNorm / rowNorm;
+    if (!isApprox(q, Scalar_(1))) {
+      rowB = sqrt(colNorm / rowNorm);
+      colB = RealScalar(1) / rowB;
 
       isBalanced = false;
       return false;
+    } else {
+      return true;
     }
-    else{
-      return true; }
   }
 }
 
-
-template< typename _Scalar, int _Deg >
-void companion<_Scalar,_Deg>::balance()
-{
+template <typename Scalar_, int Deg_>
+void companion<Scalar_, Deg_>::balance() {
   using std::abs;
-  EIGEN_STATIC_ASSERT( Deg == Dynamic || 1 < Deg, YOU_MADE_A_PROGRAMMING_MISTAKE );
-  const Index deg   = m_monic.size();
-  const Index deg_1 = deg-1;
+  EIGEN_STATIC_ASSERT(Deg == Dynamic || 1 < Deg, YOU_MADE_A_PROGRAMMING_MISTAKE);
+  const Index deg = m_monic.size();
+  const Index deg_1 = deg - 1;
 
-  bool hasConverged=false;
-  while( !hasConverged )
-  {
+  bool hasConverged = false;
+  while (!hasConverged) {
     hasConverged = true;
-    Scalar colNorm,rowNorm;
-    Scalar colB,rowB;
+    RealScalar colNorm, rowNorm;
+    RealScalar colB, rowB;
 
-    //First row, first column excluding the diagonal
+    // First row, first column excluding the diagonal
     //==============================================
     colNorm = abs(m_bl_diag[0]);
     rowNorm = abs(m_monic[0]);
 
-    //Compute balancing of the row and the column
-    if( !balanced( colNorm, rowNorm, hasConverged, colB, rowB ) )
-    {
+    // Compute balancing of the row and the column
+    if (!balanced(colNorm, rowNorm, hasConverged, colB, rowB)) {
       m_bl_diag[0] *= colB;
       m_monic[0] *= rowB;
     }
 
-    //Middle rows and columns excluding the diagonal
+    // Middle rows and columns excluding the diagonal
     //==============================================
-    for( Index i=1; i<deg_1; ++i )
-    {
+    for (Index i = 1; i < deg_1; ++i) {
       // column norm, excluding the diagonal
       colNorm = abs(m_bl_diag[i]);
 
       // row norm, excluding the diagonal
-      rowNorm = abs(m_bl_diag[i-1]) + abs(m_monic[i]);
-
-      //Compute balancing of the row and the column
-      if( !balanced( colNorm, rowNorm, hasConverged, colB, rowB ) )
-      {
-        m_bl_diag[i]   *= colB;
-        m_bl_diag[i-1] *= rowB;
-        m_monic[i]     *= rowB;
+      rowNorm = abs(m_bl_diag[i - 1]) + abs(m_monic[i]);
+
+      // Compute balancing of the row and the column
+      if (!balanced(colNorm, rowNorm, hasConverged, colB, rowB)) {
+        m_bl_diag[i] *= colB;
+        m_bl_diag[i - 1] *= rowB;
+        m_monic[i] *= rowB;
       }
     }
 
-    //Last row, last column excluding the diagonal
+    // Last row, last column excluding the diagonal
     //============================================
-    const Index ebl = m_bl_diag.size()-1;
-    VectorBlock<RightColumn,Deg_1> headMonic( m_monic, 0, deg_1 );
+    const Index ebl = m_bl_diag.size() - 1;
+    VectorBlock<RightColumn, Deg_1> headMonic(m_monic, 0, deg_1);
     colNorm = headMonic.array().abs().sum();
-    rowNorm = abs( m_bl_diag[ebl] );
+    rowNorm = abs(m_bl_diag[ebl]);
 
-    //Compute balancing of the row and the column
-    if( !balanced( colNorm, rowNorm, hasConverged, colB, rowB ) )
-    {
-      headMonic      *= colB;
+    // Compute balancing of the row and the column
+    if (!balanced(colNorm, rowNorm, hasConverged, colB, rowB)) {
+      headMonic *= colB;
       m_bl_diag[ebl] *= rowB;
     }
   }
 }
 
-} // end namespace internal
+}  // end namespace internal
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_COMPANION_H
+#endif  // EIGEN_COMPANION_H
diff --git a/inst/include/unsupported/Eigen/src/Polynomials/InternalHeaderCheck.h b/inst/include/unsupported/Eigen/src/Polynomials/InternalHeaderCheck.h
new file mode 100644
index 00000000..b3aa50ca
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/Polynomials/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_POLYNOMIALS_MODULE_H
+#error "Please include unsupported/Eigen/Polynomials instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/unsupported/Eigen/src/Polynomials/PolynomialSolver.h b/inst/include/unsupported/Eigen/src/Polynomials/PolynomialSolver.h
index cd5c04bb..aa357a41 100644
--- a/inst/include/unsupported/Eigen/src/Polynomials/PolynomialSolver.h
+++ b/inst/include/unsupported/Eigen/src/Polynomials/PolynomialSolver.h
@@ -10,7 +10,10 @@
 #ifndef EIGEN_POLYNOMIAL_SOLVER_H
 #define EIGEN_POLYNOMIAL_SOLVER_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \ingroup Polynomials_Module
  *  \class PolynomialSolverBase.
@@ -25,365 +28,360 @@ namespace Eigen {
  * It stores the set of roots as a vector of complexes.
  *
  */
-template< typename _Scalar, int _Deg >
-class PolynomialSolverBase
-{
-  public:
-    EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Deg==Dynamic ? Dynamic : _Deg)
-
-    typedef _Scalar                             Scalar;
-    typedef typename NumTraits<Scalar>::Real    RealScalar;
-    typedef std::complex<RealScalar>            RootType;
-    typedef Matrix<RootType,_Deg,1>             RootsType;
-
-    typedef DenseIndex Index;
-
-  protected:
-    template< typename OtherPolynomial >
-    inline void setPolynomial( const OtherPolynomial& poly ){
-      m_roots.resize(poly.size()); }
-
-  public:
-    template< typename OtherPolynomial >
-    inline PolynomialSolverBase( const OtherPolynomial& poly ){
-      setPolynomial( poly() ); }
-
-    inline PolynomialSolverBase(){}
-
-  public:
-    /** \returns the complex roots of the polynomial */
-    inline const RootsType& roots() const { return m_roots; }
-
-  public:
-    /** Clear and fills the back insertion sequence with the real roots of the polynomial
-     * i.e. the real part of the complex roots that have an imaginary part which
-     * absolute value is smaller than absImaginaryThreshold.
-     * absImaginaryThreshold takes the dummy_precision associated
-     * with the _Scalar template parameter of the PolynomialSolver class as the default value.
-     *
-     * \param[out] bi_seq : the back insertion sequence (stl concept)
-     * \param[in]  absImaginaryThreshold : the maximum bound of the imaginary part of a complex
-     *  number that is considered as real.
-     * */
-    template<typename Stl_back_insertion_sequence>
-    inline void realRoots( Stl_back_insertion_sequence& bi_seq,
-        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
-    {
-      using std::abs;
-      bi_seq.clear();
-      for(Index i=0; i<m_roots.size(); ++i )
-      {
-        if( abs( m_roots[i].imag() ) < absImaginaryThreshold ){
-          bi_seq.push_back( m_roots[i].real() ); }
+template <typename Scalar_, int Deg_>
+class PolynomialSolverBase {
+ public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar_, Deg_ == Dynamic ? Dynamic : Deg_)
+
+  typedef Scalar_ Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef internal::make_complex_t<Scalar> RootType;
+  typedef Matrix<RootType, Deg_, 1> RootsType;
+
+  typedef DenseIndex Index;
+
+ protected:
+  template <typename OtherPolynomial>
+  inline void setPolynomial(const OtherPolynomial& poly) {
+    m_roots.resize(poly.size() - 1);
+  }
+
+ public:
+  template <typename OtherPolynomial>
+  inline PolynomialSolverBase(const OtherPolynomial& poly) {
+    setPolynomial(poly());
+  }
+
+  inline PolynomialSolverBase() {}
+
+ public:
+  /** \returns the complex roots of the polynomial */
+  inline const RootsType& roots() const { return m_roots; }
+
+ public:
+  /** Clear and fills the back insertion sequence with the real roots of the polynomial
+   * i.e. the real part of the complex roots that have an imaginary part which
+   * absolute value is smaller than absImaginaryThreshold.
+   * absImaginaryThreshold takes the dummy_precision associated
+   * with the Scalar_ template parameter of the PolynomialSolver class as the default value.
+   *
+   * \param[out] bi_seq : the back insertion sequence (stl concept)
+   * \param[in]  absImaginaryThreshold : the maximum bound of the imaginary part of a complex
+   *  number that is considered as real.
+   * */
+  template <typename Stl_back_insertion_sequence>
+  inline void realRoots(Stl_back_insertion_sequence& bi_seq,
+                        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision()) const {
+    using std::abs;
+    bi_seq.clear();
+    for (Index i = 0; i < m_roots.size(); ++i) {
+      if (abs(m_roots[i].imag()) < absImaginaryThreshold) {
+        bi_seq.push_back(m_roots[i].real());
       }
     }
-
-  protected:
-    template<typename squaredNormBinaryPredicate>
-    inline const RootType& selectComplexRoot_withRespectToNorm( squaredNormBinaryPredicate& pred ) const
-    {
-      Index res=0;
-      RealScalar norm2 = numext::abs2( m_roots[0] );
-      for( Index i=1; i<m_roots.size(); ++i )
-      {
-        const RealScalar currNorm2 = numext::abs2( m_roots[i] );
-        if( pred( currNorm2, norm2 ) ){
-          res=i; norm2=currNorm2; }
+  }
+
+ protected:
+  template <typename squaredNormBinaryPredicate>
+  inline const RootType& selectComplexRoot_withRespectToNorm(squaredNormBinaryPredicate& pred) const {
+    Index res = 0;
+    RealScalar norm2 = numext::abs2(m_roots[0]);
+    for (Index i = 1; i < m_roots.size(); ++i) {
+      const RealScalar currNorm2 = numext::abs2(m_roots[i]);
+      if (pred(currNorm2, norm2)) {
+        res = i;
+        norm2 = currNorm2;
       }
-      return m_roots[res];
-    }
-
-  public:
-    /**
-     * \returns the complex root with greatest norm.
-     */
-    inline const RootType& greatestRoot() const
-    {
-      std::greater<Scalar> greater;
-      return selectComplexRoot_withRespectToNorm( greater );
-    }
-
-    /**
-     * \returns the complex root with smallest norm.
-     */
-    inline const RootType& smallestRoot() const
-    {
-      std::less<Scalar> less;
-      return selectComplexRoot_withRespectToNorm( less );
     }
-
-  protected:
-    template<typename squaredRealPartBinaryPredicate>
-    inline const RealScalar& selectRealRoot_withRespectToAbsRealPart(
-        squaredRealPartBinaryPredicate& pred,
-        bool& hasArealRoot,
-        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
-    {
-      using std::abs;
-      hasArealRoot = false;
-      Index res=0;
-      RealScalar abs2(0);
-
-      for( Index i=0; i<m_roots.size(); ++i )
-      {
-        if( abs( m_roots[i].imag() ) < absImaginaryThreshold )
-        {
-          if( !hasArealRoot )
-          {
-            hasArealRoot = true;
+    return m_roots[res];
+  }
+
+ public:
+  /**
+   * \returns the complex root with greatest norm.
+   */
+  inline const RootType& greatestRoot() const {
+    std::greater<RealScalar> greater;
+    return selectComplexRoot_withRespectToNorm(greater);
+  }
+
+  /**
+   * \returns the complex root with smallest norm.
+   */
+  inline const RootType& smallestRoot() const {
+    std::less<RealScalar> less;
+    return selectComplexRoot_withRespectToNorm(less);
+  }
+
+ protected:
+  template <typename squaredRealPartBinaryPredicate>
+  inline const RealScalar& selectRealRoot_withRespectToAbsRealPart(
+      squaredRealPartBinaryPredicate& pred, bool& hasArealRoot,
+      const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision()) const {
+    using std::abs;
+    hasArealRoot = false;
+    Index res = 0;
+    RealScalar abs2(0);
+
+    for (Index i = 0; i < m_roots.size(); ++i) {
+      if (abs(m_roots[i].imag()) <= absImaginaryThreshold) {
+        if (!hasArealRoot) {
+          hasArealRoot = true;
+          res = i;
+          abs2 = m_roots[i].real() * m_roots[i].real();
+        } else {
+          const RealScalar currAbs2 = m_roots[i].real() * m_roots[i].real();
+          if (pred(currAbs2, abs2)) {
+            abs2 = currAbs2;
             res = i;
-            abs2 = m_roots[i].real() * m_roots[i].real();
-          }
-          else
-          {
-            const RealScalar currAbs2 = m_roots[i].real() * m_roots[i].real();
-            if( pred( currAbs2, abs2 ) )
-            {
-              abs2 = currAbs2;
-              res = i;
-            }
           }
         }
-        else
-        {
-          if( abs( m_roots[i].imag() ) < abs( m_roots[res].imag() ) ){
-            res = i; }
+      } else if (!hasArealRoot) {
+        if (abs(m_roots[i].imag()) < abs(m_roots[res].imag())) {
+          res = i;
         }
       }
-      return numext::real_ref(m_roots[res]);
     }
-
-
-    template<typename RealPartBinaryPredicate>
-    inline const RealScalar& selectRealRoot_withRespectToRealPart(
-        RealPartBinaryPredicate& pred,
-        bool& hasArealRoot,
-        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
-    {
-      using std::abs;
-      hasArealRoot = false;
-      Index res=0;
-      RealScalar val(0);
-
-      for( Index i=0; i<m_roots.size(); ++i )
-      {
-        if( abs( m_roots[i].imag() ) < absImaginaryThreshold )
-        {
-          if( !hasArealRoot )
-          {
-            hasArealRoot = true;
+    return numext::real_ref(m_roots[res]);
+  }
+
+  template <typename RealPartBinaryPredicate>
+  inline const RealScalar& selectRealRoot_withRespectToRealPart(
+      RealPartBinaryPredicate& pred, bool& hasArealRoot,
+      const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision()) const {
+    using std::abs;
+    hasArealRoot = false;
+    Index res = 0;
+    RealScalar val(0);
+
+    for (Index i = 0; i < m_roots.size(); ++i) {
+      if (abs(m_roots[i].imag()) <= absImaginaryThreshold) {
+        if (!hasArealRoot) {
+          hasArealRoot = true;
+          res = i;
+          val = m_roots[i].real();
+        } else {
+          const RealScalar curr = m_roots[i].real();
+          if (pred(curr, val)) {
+            val = curr;
             res = i;
-            val = m_roots[i].real();
-          }
-          else
-          {
-            const RealScalar curr = m_roots[i].real();
-            if( pred( curr, val ) )
-            {
-              val = curr;
-              res = i;
-            }
           }
         }
-        else
-        {
-          if( abs( m_roots[i].imag() ) < abs( m_roots[res].imag() ) ){
-            res = i; }
+      } else {
+        if (abs(m_roots[i].imag()) < abs(m_roots[res].imag())) {
+          res = i;
         }
       }
-      return numext::real_ref(m_roots[res]);
-    }
-
-  public:
-    /**
-     * \returns a real root with greatest absolute magnitude.
-     * A real root is defined as the real part of a complex root with absolute imaginary
-     * part smallest than absImaginaryThreshold.
-     * absImaginaryThreshold takes the dummy_precision associated
-     * with the _Scalar template parameter of the PolynomialSolver class as the default value.
-     * If no real root is found the boolean hasArealRoot is set to false and the real part of
-     * the root with smallest absolute imaginary part is returned instead.
-     *
-     * \param[out] hasArealRoot : boolean true if a real root is found according to the
-     *  absImaginaryThreshold criterion, false otherwise.
-     * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
-     *  whether or not a root is real.
-     */
-    inline const RealScalar& absGreatestRealRoot(
-        bool& hasArealRoot,
-        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
-    {
-      std::greater<Scalar> greater;
-      return selectRealRoot_withRespectToAbsRealPart( greater, hasArealRoot, absImaginaryThreshold );
-    }
-
-
-    /**
-     * \returns a real root with smallest absolute magnitude.
-     * A real root is defined as the real part of a complex root with absolute imaginary
-     * part smallest than absImaginaryThreshold.
-     * absImaginaryThreshold takes the dummy_precision associated
-     * with the _Scalar template parameter of the PolynomialSolver class as the default value.
-     * If no real root is found the boolean hasArealRoot is set to false and the real part of
-     * the root with smallest absolute imaginary part is returned instead.
-     *
-     * \param[out] hasArealRoot : boolean true if a real root is found according to the
-     *  absImaginaryThreshold criterion, false otherwise.
-     * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
-     *  whether or not a root is real.
-     */
-    inline const RealScalar& absSmallestRealRoot(
-        bool& hasArealRoot,
-        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
-    {
-      std::less<Scalar> less;
-      return selectRealRoot_withRespectToAbsRealPart( less, hasArealRoot, absImaginaryThreshold );
-    }
-
-
-    /**
-     * \returns the real root with greatest value.
-     * A real root is defined as the real part of a complex root with absolute imaginary
-     * part smallest than absImaginaryThreshold.
-     * absImaginaryThreshold takes the dummy_precision associated
-     * with the _Scalar template parameter of the PolynomialSolver class as the default value.
-     * If no real root is found the boolean hasArealRoot is set to false and the real part of
-     * the root with smallest absolute imaginary part is returned instead.
-     *
-     * \param[out] hasArealRoot : boolean true if a real root is found according to the
-     *  absImaginaryThreshold criterion, false otherwise.
-     * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
-     *  whether or not a root is real.
-     */
-    inline const RealScalar& greatestRealRoot(
-        bool& hasArealRoot,
-        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
-    {
-      std::greater<Scalar> greater;
-      return selectRealRoot_withRespectToRealPart( greater, hasArealRoot, absImaginaryThreshold );
-    }
-
-
-    /**
-     * \returns the real root with smallest value.
-     * A real root is defined as the real part of a complex root with absolute imaginary
-     * part smallest than absImaginaryThreshold.
-     * absImaginaryThreshold takes the dummy_precision associated
-     * with the _Scalar template parameter of the PolynomialSolver class as the default value.
-     * If no real root is found the boolean hasArealRoot is set to false and the real part of
-     * the root with smallest absolute imaginary part is returned instead.
-     *
-     * \param[out] hasArealRoot : boolean true if a real root is found according to the
-     *  absImaginaryThreshold criterion, false otherwise.
-     * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
-     *  whether or not a root is real.
-     */
-    inline const RealScalar& smallestRealRoot(
-        bool& hasArealRoot,
-        const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision() ) const
-    {
-      std::less<Scalar> less;
-      return selectRealRoot_withRespectToRealPart( less, hasArealRoot, absImaginaryThreshold );
     }
-
-  protected:
-    RootsType               m_roots;
+    return numext::real_ref(m_roots[res]);
+  }
+
+ public:
+  /**
+   * \returns a real root with greatest absolute magnitude.
+   * A real root is defined as the real part of a complex root with absolute imaginary
+   * part smallest than absImaginaryThreshold.
+   * absImaginaryThreshold takes the dummy_precision associated
+   * with the Scalar_ template parameter of the PolynomialSolver class as the default value.
+   * If no real root is found the boolean hasArealRoot is set to false and the real part of
+   * the root with smallest absolute imaginary part is returned instead.
+   *
+   * \param[out] hasArealRoot : boolean true if a real root is found according to the
+   *  absImaginaryThreshold criterion, false otherwise.
+   * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
+   *  whether or not a root is real.
+   */
+  inline const RealScalar& absGreatestRealRoot(
+      bool& hasArealRoot, const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision()) const {
+    std::greater<RealScalar> greater;
+    return selectRealRoot_withRespectToAbsRealPart(greater, hasArealRoot, absImaginaryThreshold);
+  }
+
+  /**
+   * \returns a real root with smallest absolute magnitude.
+   * A real root is defined as the real part of a complex root with absolute imaginary
+   * part smallest than absImaginaryThreshold.
+   * absImaginaryThreshold takes the dummy_precision associated
+   * with the Scalar_ template parameter of the PolynomialSolver class as the default value.
+   * If no real root is found the boolean hasArealRoot is set to false and the real part of
+   * the root with smallest absolute imaginary part is returned instead.
+   *
+   * \param[out] hasArealRoot : boolean true if a real root is found according to the
+   *  absImaginaryThreshold criterion, false otherwise.
+   * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
+   *  whether or not a root is real.
+   */
+  inline const RealScalar& absSmallestRealRoot(
+      bool& hasArealRoot, const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision()) const {
+    std::less<RealScalar> less;
+    return selectRealRoot_withRespectToAbsRealPart(less, hasArealRoot, absImaginaryThreshold);
+  }
+
+  /**
+   * \returns the real root with greatest value.
+   * A real root is defined as the real part of a complex root with absolute imaginary
+   * part smallest than absImaginaryThreshold.
+   * absImaginaryThreshold takes the dummy_precision associated
+   * with the Scalar_ template parameter of the PolynomialSolver class as the default value.
+   * If no real root is found the boolean hasArealRoot is set to false and the real part of
+   * the root with smallest absolute imaginary part is returned instead.
+   *
+   * \param[out] hasArealRoot : boolean true if a real root is found according to the
+   *  absImaginaryThreshold criterion, false otherwise.
+   * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
+   *  whether or not a root is real.
+   */
+  inline const RealScalar& greatestRealRoot(
+      bool& hasArealRoot, const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision()) const {
+    std::greater<RealScalar> greater;
+    return selectRealRoot_withRespectToRealPart(greater, hasArealRoot, absImaginaryThreshold);
+  }
+
+  /**
+   * \returns the real root with smallest value.
+   * A real root is defined as the real part of a complex root with absolute imaginary
+   * part smallest than absImaginaryThreshold.
+   * absImaginaryThreshold takes the dummy_precision associated
+   * with the Scalar_ template parameter of the PolynomialSolver class as the default value.
+   * If no real root is found the boolean hasArealRoot is set to false and the real part of
+   * the root with smallest absolute imaginary part is returned instead.
+   *
+   * \param[out] hasArealRoot : boolean true if a real root is found according to the
+   *  absImaginaryThreshold criterion, false otherwise.
+   * \param[in] absImaginaryThreshold : threshold on the absolute imaginary part to decide
+   *  whether or not a root is real.
+   */
+  inline const RealScalar& smallestRealRoot(
+      bool& hasArealRoot, const RealScalar& absImaginaryThreshold = NumTraits<Scalar>::dummy_precision()) const {
+    std::less<RealScalar> less;
+    return selectRealRoot_withRespectToRealPart(less, hasArealRoot, absImaginaryThreshold);
+  }
+
+ protected:
+  RootsType m_roots;
 };
 
-#define EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES( BASE )  \
-  typedef typename BASE::Scalar                 Scalar;       \
-  typedef typename BASE::RealScalar             RealScalar;   \
-  typedef typename BASE::RootType               RootType;     \
-  typedef typename BASE::RootsType              RootsType;
-
-
+#define EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES(BASE) \
+  typedef typename BASE::Scalar Scalar;                    \
+  typedef typename BASE::RealScalar RealScalar;            \
+  typedef typename BASE::RootType RootType;                \
+  typedef typename BASE::RootsType RootsType;
 
 /** \ingroup Polynomials_Module
-  *
-  * \class PolynomialSolver
-  *
-  * \brief A polynomial solver
-  *
-  * Computes the complex roots of a real polynomial.
-  *
-  * \param _Scalar the scalar type, i.e., the type of the polynomial coefficients
-  * \param _Deg the degree of the polynomial, can be a compile time value or Dynamic.
-  *             Notice that the number of polynomial coefficients is _Deg+1.
-  *
-  * This class implements a polynomial solver and provides convenient methods such as
-  * - real roots,
-  * - greatest, smallest complex roots,
-  * - real roots with greatest, smallest absolute real value.
-  * - greatest, smallest real roots.
-  *
-  * WARNING: this polynomial solver is experimental, part of the unsuported Eigen modules.
-  *
-  *
-  * Currently a QR algorithm is used to compute the eigenvalues of the companion matrix of
-  * the polynomial to compute its roots.
-  * This supposes that the complex moduli of the roots are all distinct: e.g. there should
-  * be no multiple roots or conjugate roots for instance.
-  * With 32bit (float) floating types this problem shows up frequently.
-  * However, almost always, correct accuracy is reached even in these cases for 64bit
-  * (double) floating types and small polynomial degree (<20).
-  */
-template< typename _Scalar, int _Deg >
-class PolynomialSolver : public PolynomialSolverBase<_Scalar,_Deg>
-{
-  public:
-    EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(_Scalar,_Deg==Dynamic ? Dynamic : _Deg)
-
-    typedef PolynomialSolverBase<_Scalar,_Deg>    PS_Base;
-    EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES( PS_Base )
-
-    typedef Matrix<Scalar,_Deg,_Deg>                 CompanionMatrixType;
-    typedef EigenSolver<CompanionMatrixType>         EigenSolverType;
-
-  public:
-    /** Computes the complex roots of a new polynomial. */
-    template< typename OtherPolynomial >
-    void compute( const OtherPolynomial& poly )
-    {
-      eigen_assert( Scalar(0) != poly[poly.size()-1] );
-      internal::companion<Scalar,_Deg> companion( poly );
+ *
+ * \class PolynomialSolver
+ *
+ * \brief A polynomial solver
+ *
+ * Computes the complex roots of a real polynomial.
+ *
+ * \param Scalar_ the scalar type, i.e., the type of the polynomial coefficients
+ * \param Deg_ the degree of the polynomial, can be a compile time value or Dynamic.
+ *             Notice that the number of polynomial coefficients is Deg_+1.
+ *
+ * This class implements a polynomial solver and provides convenient methods such as
+ * - real roots,
+ * - greatest, smallest complex roots,
+ * - real roots with greatest, smallest absolute real value.
+ * - greatest, smallest real roots.
+ *
+ * WARNING: this polynomial solver is experimental, part of the unsupported Eigen modules.
+ *
+ *
+ * Currently a QR algorithm is used to compute the eigenvalues of the companion matrix of
+ * the polynomial to compute its roots.
+ * This supposes that the complex moduli of the roots are all distinct: e.g. there should
+ * be no multiple roots or conjugate roots for instance.
+ * With 32bit (float) floating types this problem shows up frequently.
+ * However, almost always, correct accuracy is reached even in these cases for 64bit
+ * (double) floating types and small polynomial degree (<20).
+ */
+template <typename Scalar_, int Deg_>
+class PolynomialSolver : public PolynomialSolverBase<Scalar_, Deg_> {
+ public:
+  EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF_VECTORIZABLE_FIXED_SIZE(Scalar_, Deg_ == Dynamic ? Dynamic : Deg_)
+
+  typedef PolynomialSolverBase<Scalar_, Deg_> PS_Base;
+  EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES(PS_Base)
+
+  typedef Matrix<Scalar, Deg_, Deg_> CompanionMatrixType;
+  typedef std::conditional_t<NumTraits<Scalar>::IsComplex, ComplexEigenSolver<CompanionMatrixType>,
+                             EigenSolver<CompanionMatrixType> >
+      EigenSolverType;
+  typedef internal::make_complex_t<Scalar_> ComplexScalar;
+
+ public:
+  /** Computes the complex roots of a new polynomial. */
+  template <typename OtherPolynomial>
+  void compute(const OtherPolynomial& poly) {
+    eigen_assert(Scalar(0) != poly[poly.size() - 1]);
+    eigen_assert(poly.size() > 1);
+    if (poly.size() > 2) {
+      internal::companion<Scalar, Deg_> companion(poly);
       companion.balance();
-      m_eigenSolver.compute( companion.denseMatrix() );
+      m_eigenSolver.compute(companion.denseMatrix());
+      eigen_assert(m_eigenSolver.info() == Eigen::Success);
       m_roots = m_eigenSolver.eigenvalues();
+      // cleanup noise in imaginary part of real roots:
+      // if the imaginary part is rather small compared to the real part
+      // and that cancelling the imaginary part yield a smaller evaluation,
+      // then it's safe to keep the real part only.
+      RealScalar coarse_prec = RealScalar(std::pow(4, poly.size() + 1)) * NumTraits<RealScalar>::epsilon();
+      for (Index i = 0; i < m_roots.size(); ++i) {
+        if (internal::isMuchSmallerThan(numext::abs(numext::imag(m_roots[i])), numext::abs(numext::real(m_roots[i])),
+                                        coarse_prec)) {
+          ComplexScalar as_real_root = ComplexScalar(numext::real(m_roots[i]));
+          if (numext::abs(poly_eval(poly, as_real_root)) <= numext::abs(poly_eval(poly, m_roots[i]))) {
+            m_roots[i] = as_real_root;
+          }
+        }
+      }
+    } else if (poly.size() == 2) {
+      m_roots.resize(1);
+      m_roots[0] = -poly[0] / poly[1];
     }
+  }
 
-  public:
-    template< typename OtherPolynomial >
-    inline PolynomialSolver( const OtherPolynomial& poly ){
-      compute( poly ); }
+ public:
+  template <typename OtherPolynomial>
+  inline PolynomialSolver(const OtherPolynomial& poly) {
+    compute(poly);
+  }
 
-    inline PolynomialSolver(){}
+  inline PolynomialSolver() {}
 
-  protected:
-    using                   PS_Base::m_roots;
-    EigenSolverType         m_eigenSolver;
+ protected:
+  using PS_Base::m_roots;
+  EigenSolverType m_eigenSolver;
 };
 
-
-template< typename _Scalar >
-class PolynomialSolver<_Scalar,1> : public PolynomialSolverBase<_Scalar,1>
-{
-  public:
-    typedef PolynomialSolverBase<_Scalar,1>    PS_Base;
-    EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES( PS_Base )
-
-  public:
-    /** Computes the complex roots of a new polynomial. */
-    template< typename OtherPolynomial >
-    void compute( const OtherPolynomial& poly )
-    {
-      eigen_assert( Scalar(0) != poly[poly.size()-1] );
-      m_roots[0] = -poly[0]/poly[poly.size()-1];
-    }
-
-  protected:
-    using                   PS_Base::m_roots;
+template <typename Scalar_>
+class PolynomialSolver<Scalar_, 1> : public PolynomialSolverBase<Scalar_, 1> {
+ public:
+  typedef PolynomialSolverBase<Scalar_, 1> PS_Base;
+  EIGEN_POLYNOMIAL_SOLVER_BASE_INHERITED_TYPES(PS_Base)
+
+ public:
+  /** Computes the complex roots of a new polynomial. */
+  template <typename OtherPolynomial>
+  void compute(const OtherPolynomial& poly) {
+    eigen_assert(poly.size() == 2);
+    eigen_assert(Scalar(0) != poly[1]);
+    m_roots[0] = -poly[0] / poly[1];
+  }
+
+ public:
+  template <typename OtherPolynomial>
+  inline PolynomialSolver(const OtherPolynomial& poly) {
+    compute(poly);
+  }
+
+  inline PolynomialSolver() {}
+
+ protected:
+  using PS_Base::m_roots;
 };
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_POLYNOMIAL_SOLVER_H
+#endif  // EIGEN_POLYNOMIAL_SOLVER_H
diff --git a/inst/include/unsupported/Eigen/src/Polynomials/PolynomialUtils.h b/inst/include/unsupported/Eigen/src/Polynomials/PolynomialUtils.h
index 2bb8bc84..342ca9a0 100644
--- a/inst/include/unsupported/Eigen/src/Polynomials/PolynomialUtils.h
+++ b/inst/include/unsupported/Eigen/src/Polynomials/PolynomialUtils.h
@@ -10,7 +10,10 @@
 #ifndef EIGEN_POLYNOMIAL_UTILS_H
 #define EIGEN_POLYNOMIAL_UTILS_H
 
-namespace Eigen { 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** \ingroup Polynomials_Module
  * \returns the evaluation of the polynomial at x using Horner algorithm.
@@ -20,16 +23,15 @@ namespace Eigen {
  *  e.g. \f$ 1 + 3x^2 \f$ is stored as a vector \f$ [ 1, 0, 3 ] \f$.
  * \param[in] x : the value to evaluate the polynomial at.
  *
- * <i><b>Note for stability:</b></i>
- *  <dd> \f$ |x| \le 1 \f$ </dd>
+ * \note for stability:
+ *   \f$ |x| \le 1 \f$
  */
 template <typename Polynomials, typename T>
-inline
-T poly_eval_horner( const Polynomials& poly, const T& x )
-{
-  T val=poly[poly.size()-1];
-  for(DenseIndex i=poly.size()-2; i>=0; --i ){
-    val = val*x + poly[i]; }
+inline T poly_eval_horner(const Polynomials& poly, const T& x) {
+  T val = poly[poly.size() - 1];
+  for (DenseIndex i = poly.size() - 2; i >= 0; --i) {
+    val = val * x + poly[i];
+  }
   return val;
 }
 
@@ -42,21 +44,19 @@ T poly_eval_horner( const Polynomials& poly, const T& x )
  * \param[in] x : the value to evaluate the polynomial at.
  */
 template <typename Polynomials, typename T>
-inline
-T poly_eval( const Polynomials& poly, const T& x )
-{
+inline T poly_eval(const Polynomials& poly, const T& x) {
   typedef typename NumTraits<T>::Real Real;
 
-  if( numext::abs2( x ) <= Real(1) ){
-    return poly_eval_horner( poly, x ); }
-  else
-  {
-    T val=poly[0];
-    T inv_x = T(1)/x;
-    for( DenseIndex i=1; i<poly.size(); ++i ){
-      val = val*inv_x + poly[i]; }
+  if (numext::abs2(x) <= Real(1)) {
+    return poly_eval_horner(poly, x);
+  } else {
+    T val = poly[0];
+    T inv_x = T(1) / x;
+    for (DenseIndex i = 1; i < poly.size(); ++i) {
+      val = val * inv_x + poly[i];
+    }
 
-    return std::pow(x,(T)(poly.size()-1)) * val;
+    return numext::pow(x, (T)(poly.size() - 1)) * val;
   }
 }
 
@@ -67,23 +67,22 @@ T poly_eval( const Polynomials& poly, const T& x )
  *  by degrees i.e. poly[i] is the coefficient of degree i of the polynomial
  *  e.g. \f$ 1 + 3x^2 \f$ is stored as a vector \f$ [ 1, 0, 3 ] \f$.
  *
- *  <i><b>Precondition:</b></i>
- *  <dd> the leading coefficient of the input polynomial poly must be non zero </dd>
+ *  \pre
+ *   the leading coefficient of the input polynomial poly must be non zero
  */
 template <typename Polynomial>
-inline
-typename NumTraits<typename Polynomial::Scalar>::Real cauchy_max_bound( const Polynomial& poly )
-{
+inline typename NumTraits<typename Polynomial::Scalar>::Real cauchy_max_bound(const Polynomial& poly) {
   using std::abs;
   typedef typename Polynomial::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real Real;
 
-  eigen_assert( Scalar(0) != poly[poly.size()-1] );
-  const Scalar inv_leading_coeff = Scalar(1)/poly[poly.size()-1];
+  eigen_assert(Scalar(0) != poly[poly.size() - 1]);
+  const Scalar inv_leading_coeff = Scalar(1) / poly[poly.size() - 1];
   Real cb(0);
 
-  for( DenseIndex i=0; i<poly.size()-1; ++i ){
-    cb += abs(poly[i]*inv_leading_coeff); }
+  for (DenseIndex i = 0; i < poly.size() - 1; ++i) {
+    cb += abs(poly[i] * inv_leading_coeff);
+  }
   return cb + Real(1);
 }
 
@@ -94,23 +93,25 @@ typename NumTraits<typename Polynomial::Scalar>::Real cauchy_max_bound( const Po
  *  e.g. \f$ 1 + 3x^2 \f$ is stored as a vector \f$ [ 1, 0, 3 ] \f$.
  */
 template <typename Polynomial>
-inline
-typename NumTraits<typename Polynomial::Scalar>::Real cauchy_min_bound( const Polynomial& poly )
-{
+inline typename NumTraits<typename Polynomial::Scalar>::Real cauchy_min_bound(const Polynomial& poly) {
   using std::abs;
   typedef typename Polynomial::Scalar Scalar;
   typedef typename NumTraits<Scalar>::Real Real;
 
-  DenseIndex i=0;
-  while( i<poly.size()-1 && Scalar(0) == poly(i) ){ ++i; }
-  if( poly.size()-1 == i ){
-    return Real(1); }
+  DenseIndex i = 0;
+  while (i < poly.size() - 1 && Scalar(0) == poly(i)) {
+    ++i;
+  }
+  if (poly.size() - 1 == i) {
+    return Real(1);
+  }
 
-  const Scalar inv_min_coeff = Scalar(1)/poly[i];
+  const Scalar inv_min_coeff = Scalar(1) / poly[i];
   Real cb(1);
-  for( DenseIndex j=i+1; j<poly.size(); ++j ){
-    cb += abs(poly[j]*inv_min_coeff); }
-  return Real(1)/cb;
+  for (DenseIndex j = i + 1; j < poly.size(); ++j) {
+    cb += abs(poly[j] * inv_min_coeff);
+  }
+  return Real(1) / cb;
 }
 
 /** \ingroup Polynomials_Module
@@ -124,20 +125,20 @@ typename NumTraits<typename Polynomial::Scalar>::Real cauchy_min_bound( const Po
  *  e.g. \f$ 3 + x^2 \f$ is stored as a vector \f$ [ 3, 0, 1 ] \f$.
  */
 template <typename RootVector, typename Polynomial>
-void roots_to_monicPolynomial( const RootVector& rv, Polynomial& poly )
-{
-
+void roots_to_monicPolynomial(const RootVector& rv, Polynomial& poly) {
   typedef typename Polynomial::Scalar Scalar;
 
-  poly.setZero( rv.size()+1 );
-  poly[0] = -rv[0]; poly[1] = Scalar(1);
-  for( DenseIndex i=1; i< rv.size(); ++i )
-  {
-    for( DenseIndex j=i+1; j>0; --j ){ poly[j] = poly[j-1] - rv[i]*poly[j]; }
-    poly[0] = -rv[i]*poly[0];
+  poly.setZero(rv.size() + 1);
+  poly[0] = -rv[0];
+  poly[1] = Scalar(1);
+  for (DenseIndex i = 1; i < rv.size(); ++i) {
+    for (DenseIndex j = i + 1; j > 0; --j) {
+      poly[j] = poly[j - 1] - rv[i] * poly[j];
+    }
+    poly[0] = -rv[i] * poly[0];
   }
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_POLYNOMIAL_UTILS_H
+#endif  // EIGEN_POLYNOMIAL_UTILS_H
diff --git a/inst/include/unsupported/Eigen/src/SVD/BDCSVD.h b/inst/include/unsupported/Eigen/src/SVD/BDCSVD.h
deleted file mode 100644
index 11d4882e..00000000
--- a/inst/include/unsupported/Eigen/src/SVD/BDCSVD.h
+++ /dev/null
@@ -1,748 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-// 
-// We used the "A Divide-And-Conquer Algorithm for the Bidiagonal SVD"
-// research report written by Ming Gu and Stanley C.Eisenstat
-// The code variable names correspond to the names they used in their 
-// report
-//
-// Copyright (C) 2013 Gauthier Brun <brun.gauthier@gmail.com>
-// Copyright (C) 2013 Nicolas Carre <nicolas.carre@ensimag.fr>
-// Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
-// Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
-//
-// Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_BDCSVD_H
-#define EIGEN_BDCSVD_H
-
-#define EPSILON 0.0000000000000001
-
-#define ALGOSWAP 32
-
-namespace Eigen {
-/** \ingroup SVD_Module
- *
- *
- * \class BDCSVD
- *
- * \brief class Bidiagonal Divide and Conquer SVD
- *
- * \param MatrixType the type of the matrix of which we are computing the SVD decomposition
- * We plan to have a very similar interface to JacobiSVD on this class.
- * It should be used to speed up the calcul of SVD for big matrices. 
- */
-template<typename _MatrixType> 
-class BDCSVD : public SVDBase<_MatrixType>
-{
-  typedef SVDBase<_MatrixType> Base;
-    
-public:
-  using Base::rows;
-  using Base::cols;
-  
-  typedef _MatrixType MatrixType;
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-  typedef typename MatrixType::Index Index;
-  enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime, 
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime, 
-    DiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime, ColsAtCompileTime), 
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, 
-    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, 
-    MaxDiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(MaxRowsAtCompileTime, MaxColsAtCompileTime), 
-    MatrixOptions = MatrixType::Options
-  };
-
-  typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime, 
-		 MatrixOptions, MaxRowsAtCompileTime, MaxRowsAtCompileTime>
-  MatrixUType;
-  typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime, 
-		 MatrixOptions, MaxColsAtCompileTime, MaxColsAtCompileTime>
-  MatrixVType;
-  typedef typename internal::plain_diag_type<MatrixType, RealScalar>::type SingularValuesType;
-  typedef typename internal::plain_row_type<MatrixType>::type RowType;
-  typedef typename internal::plain_col_type<MatrixType>::type ColType;
-  typedef Matrix<Scalar, Dynamic, Dynamic> MatrixX;
-  typedef Matrix<RealScalar, Dynamic, Dynamic> MatrixXr;
-  typedef Matrix<RealScalar, Dynamic, 1> VectorType;
-
-  /** \brief Default Constructor.
-   *
-   * The default constructor is useful in cases in which the user intends to
-   * perform decompositions via BDCSVD::compute(const MatrixType&).
-   */
-  BDCSVD()
-    : SVDBase<_MatrixType>::SVDBase(), 
-      algoswap(ALGOSWAP)
-  {}
-
-
-  /** \brief Default Constructor with memory preallocation
-   *
-   * Like the default constructor but with preallocation of the internal data
-   * according to the specified problem size.
-   * \sa BDCSVD()
-   */
-  BDCSVD(Index rows, Index cols, unsigned int computationOptions = 0)
-    : SVDBase<_MatrixType>::SVDBase(), 
-      algoswap(ALGOSWAP)
-  {
-    allocate(rows, cols, computationOptions);
-  }
-
-  /** \brief Constructor performing the decomposition of given matrix.
-   *
-   * \param matrix the matrix to decompose
-   * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
-   *                           By default, none is computed. This is a bit - field, the possible bits are #ComputeFullU, #ComputeThinU, 
-   *                           #ComputeFullV, #ComputeThinV.
-   *
-   * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
-   * available with the (non - default) FullPivHouseholderQR preconditioner.
-   */
-  BDCSVD(const MatrixType& matrix, unsigned int computationOptions = 0)
-    : SVDBase<_MatrixType>::SVDBase(), 
-      algoswap(ALGOSWAP)
-  {
-    compute(matrix, computationOptions);
-  }
-
-  ~BDCSVD() 
-  {
-  }
-  /** \brief Method performing the decomposition of given matrix using custom options.
-   *
-   * \param matrix the matrix to decompose
-   * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
-   *                           By default, none is computed. This is a bit - field, the possible bits are #ComputeFullU, #ComputeThinU, 
-   *                           #ComputeFullV, #ComputeThinV.
-   *
-   * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
-   * available with the (non - default) FullPivHouseholderQR preconditioner.
-   */
-  SVDBase<MatrixType>& compute(const MatrixType& matrix, unsigned int computationOptions);
-
-  /** \brief Method performing the decomposition of given matrix using current options.
-   *
-   * \param matrix the matrix to decompose
-   *
-   * This method uses the current \a computationOptions, as already passed to the constructor or to compute(const MatrixType&, unsigned int).
-   */
-  SVDBase<MatrixType>& compute(const MatrixType& matrix)
-  {
-    return compute(matrix, this->m_computationOptions);
-  }
-
-  void setSwitchSize(int s) 
-  {
-    eigen_assert(s>3 && "BDCSVD the size of the algo switch has to be greater than 4");
-    algoswap = s;
-  }
-
-
-  /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A.
-   *
-   * \param b the right - hand - side of the equation to solve.
-   *
-   * \note Solving requires both U and V to be computed. Thin U and V are enough, there is no need for full U or V.
-   *
-   * \note SVD solving is implicitly least - squares. Thus, this method serves both purposes of exact solving and least - squares solving.
-   * In other words, the returned solution is guaranteed to minimize the Euclidean norm \f$ \Vert A x - b \Vert \f$.
-   */
-  template<typename Rhs>
-  inline const internal::solve_retval<BDCSVD, Rhs>
-  solve(const MatrixBase<Rhs>& b) const
-  {
-    eigen_assert(this->m_isInitialized && "BDCSVD is not initialized.");
-    eigen_assert(SVDBase<_MatrixType>::computeU() && SVDBase<_MatrixType>::computeV() && 
-		 "BDCSVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice).");
-    return internal::solve_retval<BDCSVD, Rhs>(*this, b.derived());
-  }
-
- 
-  const MatrixUType& matrixU() const
-  {
-    eigen_assert(this->m_isInitialized && "SVD is not initialized.");
-    if (isTranspose){
-      eigen_assert(this->computeV() && "This SVD decomposition didn't compute U. Did you ask for it?");
-      return this->m_matrixV;
-    }
-    else 
-    {
-      eigen_assert(this->computeU() && "This SVD decomposition didn't compute U. Did you ask for it?");
-      return this->m_matrixU;
-    }
-     
-  }
-
-
-  const MatrixVType& matrixV() const
-  {
-    eigen_assert(this->m_isInitialized && "SVD is not initialized.");
-    if (isTranspose){
-      eigen_assert(this->computeU() && "This SVD decomposition didn't compute V. Did you ask for it?");
-      return this->m_matrixU;
-    }
-    else
-    {
-      eigen_assert(this->computeV() && "This SVD decomposition didn't compute V. Did you ask for it?");
-      return this->m_matrixV;
-    }
-  }
- 
-private:
-  void allocate(Index rows, Index cols, unsigned int computationOptions);
-  void divide (Index firstCol, Index lastCol, Index firstRowW, 
-	       Index firstColW, Index shift);
-  void deflation43(Index firstCol, Index shift, Index i, Index size);
-  void deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size);
-  void deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift);
-  void copyUV(MatrixXr naiveU, MatrixXr naiveV, MatrixX householderU, MatrixX houseHolderV);
-
-protected:
-  MatrixXr m_naiveU, m_naiveV;
-  MatrixXr m_computed;
-  Index nRec;
-  int algoswap;
-  bool isTranspose, compU, compV;
-  
-}; //end class BDCSVD
-
-
-// Methode to allocate ans initialize matrix and attributs
-template<typename MatrixType>
-void BDCSVD<MatrixType>::allocate(Index rows, Index cols, unsigned int computationOptions)
-{
-  isTranspose = (cols > rows);
-  if (SVDBase<MatrixType>::allocate(rows, cols, computationOptions)) return;
-  m_computed = MatrixXr::Zero(this->m_diagSize + 1, this->m_diagSize );
-  if (isTranspose){
-    compU = this->computeU();
-    compV = this->computeV();    
-  } 
-  else
-  {
-    compV = this->computeU();
-    compU = this->computeV();   
-  }
-  if (compU) m_naiveU = MatrixXr::Zero(this->m_diagSize + 1, this->m_diagSize + 1 );
-  else m_naiveU = MatrixXr::Zero(2, this->m_diagSize + 1 );
-  
-  if (compV) m_naiveV = MatrixXr::Zero(this->m_diagSize, this->m_diagSize);
-  
-
-  //should be changed for a cleaner implementation
-  if (isTranspose){
-    bool aux;
-    if (this->computeU()||this->computeV()){
-      aux = this->m_computeFullU;
-      this->m_computeFullU = this->m_computeFullV;
-      this->m_computeFullV = aux;
-      aux = this->m_computeThinU;
-      this->m_computeThinU = this->m_computeThinV;
-      this->m_computeThinV = aux;
-    } 
-  }
-}// end allocate
-
-// Methode which compute the BDCSVD for the int
-template<>
-SVDBase<Matrix<int, Dynamic, Dynamic> >&
-BDCSVD<Matrix<int, Dynamic, Dynamic> >::compute(const MatrixType& matrix, unsigned int computationOptions) {
-  allocate(matrix.rows(), matrix.cols(), computationOptions);
-  this->m_nonzeroSingularValues = 0;
-  m_computed = Matrix<int, Dynamic, Dynamic>::Zero(rows(), cols());
-  for (int i=0; i<this->m_diagSize; i++)   {
-    this->m_singularValues.coeffRef(i) = 0;
-  }
-  if (this->m_computeFullU) this->m_matrixU = Matrix<int, Dynamic, Dynamic>::Zero(rows(), rows());
-  if (this->m_computeFullV) this->m_matrixV = Matrix<int, Dynamic, Dynamic>::Zero(cols(), cols()); 
-  this->m_isInitialized = true;
-  return *this;
-}
-
-
-// Methode which compute the BDCSVD
-template<typename MatrixType>
-SVDBase<MatrixType>&
-BDCSVD<MatrixType>::compute(const MatrixType& matrix, unsigned int computationOptions) 
-{
-  allocate(matrix.rows(), matrix.cols(), computationOptions);
-  using std::abs;
-
-  //**** step 1 Bidiagonalization  isTranspose = (matrix.cols()>matrix.rows()) ;
-  MatrixType copy;
-  if (isTranspose) copy = matrix.adjoint();
-  else copy = matrix;
-  
-  internal::UpperBidiagonalization<MatrixX > bid(copy);
-
-  //**** step 2 Divide
-  // this is ugly and has to be redone (care of complex cast)
-  MatrixXr temp;
-  temp = bid.bidiagonal().toDenseMatrix().transpose();
-  m_computed.setZero();
-  for (int i=0; i<this->m_diagSize - 1; i++)   {
-    m_computed(i, i) = temp(i, i);
-    m_computed(i + 1, i) = temp(i + 1, i);
-  }
-  m_computed(this->m_diagSize - 1, this->m_diagSize - 1) = temp(this->m_diagSize - 1, this->m_diagSize - 1);
-  divide(0, this->m_diagSize - 1, 0, 0, 0);
-
-  //**** step 3 copy
-  for (int i=0; i<this->m_diagSize; i++)   {
-    RealScalar a = abs(m_computed.coeff(i, i));
-    this->m_singularValues.coeffRef(i) = a;
-    if (a == 0){
-      this->m_nonzeroSingularValues = i;
-      break;
-    }
-    else  if (i == this->m_diagSize - 1)
-    {
-      this->m_nonzeroSingularValues = i + 1;
-      break;
-    }
-  }
-  copyUV(m_naiveV, m_naiveU, bid.householderU(), bid.householderV());
-  this->m_isInitialized = true;
-  return *this;
-}// end compute
-
-
-template<typename MatrixType>
-void BDCSVD<MatrixType>::copyUV(MatrixXr naiveU, MatrixXr naiveV, MatrixX householderU, MatrixX householderV){
-  if (this->computeU()){
-    MatrixX temp = MatrixX::Zero(naiveU.rows(), naiveU.cols());
-    temp.real() = naiveU;
-    if (this->m_computeThinU){
-      this->m_matrixU = MatrixX::Identity(householderU.cols(), this->m_nonzeroSingularValues );
-      this->m_matrixU.block(0, 0, this->m_diagSize, this->m_nonzeroSingularValues) = 
-	temp.block(0, 0, this->m_diagSize, this->m_nonzeroSingularValues);
-      this->m_matrixU = householderU * this->m_matrixU ;
-    }
-    else
-    {
-      this->m_matrixU = MatrixX::Identity(householderU.cols(), householderU.cols());
-      this->m_matrixU.block(0, 0, this->m_diagSize, this->m_diagSize) = temp.block(0, 0, this->m_diagSize, this->m_diagSize);
-      this->m_matrixU = householderU * this->m_matrixU ;
-    }
-  }
-  if (this->computeV()){
-    MatrixX temp = MatrixX::Zero(naiveV.rows(), naiveV.cols());
-    temp.real() = naiveV;
-    if (this->m_computeThinV){
-      this->m_matrixV = MatrixX::Identity(householderV.cols(),this->m_nonzeroSingularValues );
-      this->m_matrixV.block(0, 0, this->m_nonzeroSingularValues, this->m_nonzeroSingularValues) = 
-	temp.block(0, 0, this->m_nonzeroSingularValues, this->m_nonzeroSingularValues);
-      this->m_matrixV = householderV * this->m_matrixV ;
-    }
-    else  
-    {
-      this->m_matrixV = MatrixX::Identity(householderV.cols(), householderV.cols());
-      this->m_matrixV.block(0, 0, this->m_diagSize, this->m_diagSize) = temp.block(0, 0, this->m_diagSize, this->m_diagSize);
-      this->m_matrixV = householderV * this->m_matrixV;
-    }
-  }
-}
-
-// The divide algorithm is done "in place", we are always working on subsets of the same matrix. The divide methods takes as argument the 
-// place of the submatrix we are currently working on.
-
-//@param firstCol : The Index of the first column of the submatrix of m_computed and for m_naiveU;
-//@param lastCol : The Index of the last column of the submatrix of m_computed and for m_naiveU; 
-// lastCol + 1 - firstCol is the size of the submatrix.
-//@param firstRowW : The Index of the first row of the matrix W that we are to change. (see the reference paper section 1 for more information on W)
-//@param firstRowW : Same as firstRowW with the column.
-//@param shift : Each time one takes the left submatrix, one must add 1 to the shift. Why? Because! We actually want the last column of the U submatrix 
-// to become the first column (*coeff) and to shift all the other columns to the right. There are more details on the reference paper.
-template<typename MatrixType>
-void BDCSVD<MatrixType>::divide (Index firstCol, Index lastCol, Index firstRowW, 
-				 Index firstColW, Index shift)
-{
-  // requires nbRows = nbCols + 1;
-  using std::pow;
-  using std::sqrt;
-  using std::abs;
-  const Index n = lastCol - firstCol + 1;
-  const Index k = n/2;
-  RealScalar alphaK;
-  RealScalar betaK; 
-  RealScalar r0; 
-  RealScalar lambda, phi, c0, s0;
-  MatrixXr l, f;
-  // We use the other algorithm which is more efficient for small 
-  // matrices.
-  if (n < algoswap){
-    JacobiSVD<MatrixXr> b(m_computed.block(firstCol, firstCol, n + 1, n), 
-			  ComputeFullU | (ComputeFullV * compV)) ;
-    if (compU) m_naiveU.block(firstCol, firstCol, n + 1, n + 1).real() << b.matrixU();
-    else 
-    {
-      m_naiveU.row(0).segment(firstCol, n + 1).real() << b.matrixU().row(0);
-      m_naiveU.row(1).segment(firstCol, n + 1).real() << b.matrixU().row(n);
-    }
-    if (compV) m_naiveV.block(firstRowW, firstColW, n, n).real() << b.matrixV();
-    m_computed.block(firstCol + shift, firstCol + shift, n + 1, n).setZero();
-    for (int i=0; i<n; i++)
-    {
-      m_computed(firstCol + shift + i, firstCol + shift +i) = b.singularValues().coeffRef(i);
-    }
-    return;
-  }
-  // We use the divide and conquer algorithm
-  alphaK =  m_computed(firstCol + k, firstCol + k);
-  betaK = m_computed(firstCol + k + 1, firstCol + k);
-  // The divide must be done in that order in order to have good results. Divide change the data inside the submatrices
-  // and the divide of the right submatrice reads one column of the left submatrice. That's why we need to treat the 
-  // right submatrix before the left one. 
-  divide(k + 1 + firstCol, lastCol, k + 1 + firstRowW, k + 1 + firstColW, shift);
-  divide(firstCol, k - 1 + firstCol, firstRowW, firstColW + 1, shift + 1);
-  if (compU)
-  {
-    lambda = m_naiveU(firstCol + k, firstCol + k);
-    phi = m_naiveU(firstCol + k + 1, lastCol + 1);
-  } 
-  else 
-  {
-    lambda = m_naiveU(1, firstCol + k);
-    phi = m_naiveU(0, lastCol + 1);
-  }
-  r0 = sqrt((abs(alphaK * lambda) * abs(alphaK * lambda))
-	    + abs(betaK * phi) * abs(betaK * phi));
-  if (compU)
-  {
-    l = m_naiveU.row(firstCol + k).segment(firstCol, k);
-    f = m_naiveU.row(firstCol + k + 1).segment(firstCol + k + 1, n - k - 1);
-  } 
-  else 
-  {
-    l = m_naiveU.row(1).segment(firstCol, k);
-    f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1);
-  }
-  if (compV) m_naiveV(firstRowW+k, firstColW) = 1;
-  if (r0 == 0)
-  {
-    c0 = 1;
-    s0 = 0;
-  }
-  else
-  {
-    c0 = alphaK * lambda / r0;
-    s0 = betaK * phi / r0;
-  }
-  if (compU)
-  {
-    MatrixXr q1 (m_naiveU.col(firstCol + k).segment(firstCol, k + 1));     
-    // we shiftW Q1 to the right
-    for (Index i = firstCol + k - 1; i >= firstCol; i--) 
-    {
-      m_naiveU.col(i + 1).segment(firstCol, k + 1) << m_naiveU.col(i).segment(firstCol, k + 1);
-    }
-    // we shift q1 at the left with a factor c0
-    m_naiveU.col(firstCol).segment( firstCol, k + 1) << (q1 * c0);
-    // last column = q1 * - s0
-    m_naiveU.col(lastCol + 1).segment(firstCol, k + 1) << (q1 * ( - s0));
-    // first column = q2 * s0
-    m_naiveU.col(firstCol).segment(firstCol + k + 1, n - k) << 
-      m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) *s0; 
-    // q2 *= c0
-    m_naiveU.col(lastCol + 1).segment(firstCol + k + 1, n - k) *= c0; 
-  } 
-  else 
-  {
-    RealScalar q1 = (m_naiveU(0, firstCol + k));
-    // we shift Q1 to the right
-    for (Index i = firstCol + k - 1; i >= firstCol; i--) 
-    {
-      m_naiveU(0, i + 1) = m_naiveU(0, i);
-    }
-    // we shift q1 at the left with a factor c0
-    m_naiveU(0, firstCol) = (q1 * c0);
-    // last column = q1 * - s0
-    m_naiveU(0, lastCol + 1) = (q1 * ( - s0));
-    // first column = q2 * s0
-    m_naiveU(1, firstCol) = m_naiveU(1, lastCol + 1) *s0; 
-    // q2 *= c0
-    m_naiveU(1, lastCol + 1) *= c0;
-    m_naiveU.row(1).segment(firstCol + 1, k).setZero();
-    m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1).setZero();
-  }
-  m_computed(firstCol + shift, firstCol + shift) = r0;
-  m_computed.col(firstCol + shift).segment(firstCol + shift + 1, k) << alphaK * l.transpose().real();
-  m_computed.col(firstCol + shift).segment(firstCol + shift + k + 1, n - k - 1) << betaK * f.transpose().real();
-
-
-  // the line below do the deflation of the matrix for the third part of the algorithm
-  // Here the deflation is commented because the third part of the algorithm is not implemented
-  // the third part of the algorithm is a fast SVD on the matrix m_computed which works thanks to the deflation
-
-  deflation(firstCol, lastCol, k, firstRowW, firstColW, shift);
-
-  // Third part of the algorithm, since the real third part of the algorithm is not implemeted we use a JacobiSVD
-  JacobiSVD<MatrixXr> res= JacobiSVD<MatrixXr>(m_computed.block(firstCol + shift, firstCol +shift, n + 1, n), 
-					       ComputeFullU | (ComputeFullV * compV)) ;
-  if (compU) m_naiveU.block(firstCol, firstCol, n + 1, n + 1) *= res.matrixU();
-  else m_naiveU.block(0, firstCol, 2, n + 1) *= res.matrixU();
-  
-  if (compV) m_naiveV.block(firstRowW, firstColW, n, n) *= res.matrixV();
-  m_computed.block(firstCol + shift, firstCol + shift, n, n) << MatrixXr::Zero(n, n);
-  for (int i=0; i<n; i++)
-    m_computed(firstCol + shift + i, firstCol + shift +i) = res.singularValues().coeffRef(i);
-  // end of the third part
-
-
-}// end divide
-
-
-// page 12_13
-// i >= 1, di almost null and zi non null.
-// We use a rotation to zero out zi applied to the left of M
-template <typename MatrixType>
-void BDCSVD<MatrixType>::deflation43(Index firstCol, Index shift, Index i, Index size){
-  using std::abs;
-  using std::sqrt;
-  using std::pow;
-  RealScalar c = m_computed(firstCol + shift, firstCol + shift);
-  RealScalar s = m_computed(i, firstCol + shift);
-  RealScalar r = sqrt(pow(abs(c), 2) + pow(abs(s), 2));
-  if (r == 0){
-    m_computed(i, i)=0;
-    return;
-  }
-  c/=r;
-  s/=r;
-  m_computed(firstCol + shift, firstCol + shift) = r;  
-  m_computed(i, firstCol + shift) = 0;
-  m_computed(i, i) = 0;
-  if (compU){
-    m_naiveU.col(firstCol).segment(firstCol,size) = 
-      c * m_naiveU.col(firstCol).segment(firstCol, size) - 
-      s * m_naiveU.col(i).segment(firstCol, size) ;
-
-    m_naiveU.col(i).segment(firstCol, size) = 
-      (c + s*s/c) * m_naiveU.col(i).segment(firstCol, size) + 
-      (s/c) * m_naiveU.col(firstCol).segment(firstCol,size);
-  }
-}// end deflation 43
-
-
-// page 13
-// i,j >= 1, i != j and |di - dj| < epsilon * norm2(M)
-// We apply two rotations to have zj = 0;
-template <typename MatrixType>
-void BDCSVD<MatrixType>::deflation44(Index firstColu , Index firstColm, Index firstRowW, Index firstColW, Index i, Index j, Index size){
-  using std::abs;
-  using std::sqrt;
-  using std::conj;
-  using std::pow;
-  RealScalar c = m_computed(firstColm, firstColm + j - 1);
-  RealScalar s = m_computed(firstColm, firstColm + i - 1);
-  RealScalar r = sqrt(pow(abs(c), 2) + pow(abs(s), 2));
-  if (r==0){
-    m_computed(firstColm + i, firstColm + i) = m_computed(firstColm + j, firstColm + j);
-    return;
-  }
-  c/=r;
-  s/=r;
-  m_computed(firstColm + i, firstColm) = r;  
-  m_computed(firstColm + i, firstColm + i) = m_computed(firstColm + j, firstColm + j);
-  m_computed(firstColm + j, firstColm) = 0;
-  if (compU){
-    m_naiveU.col(firstColu + i).segment(firstColu, size) = 
-      c * m_naiveU.col(firstColu + i).segment(firstColu, size) - 
-      s * m_naiveU.col(firstColu + j).segment(firstColu, size) ;
-
-    m_naiveU.col(firstColu + j).segment(firstColu, size) = 
-      (c + s*s/c) *  m_naiveU.col(firstColu + j).segment(firstColu, size) + 
-      (s/c) * m_naiveU.col(firstColu + i).segment(firstColu, size);
-  } 
-  if (compV){
-    m_naiveV.col(firstColW + i).segment(firstRowW, size - 1) = 
-      c * m_naiveV.col(firstColW + i).segment(firstRowW, size - 1) + 
-      s * m_naiveV.col(firstColW + j).segment(firstRowW, size - 1) ;
-
-    m_naiveV.col(firstColW + j).segment(firstRowW, size - 1)  = 
-      (c + s*s/c) * m_naiveV.col(firstColW + j).segment(firstRowW, size - 1) - 
-      (s/c) * m_naiveV.col(firstColW + i).segment(firstRowW, size - 1);
-  }
-}// end deflation 44
-
-
-
-template <typename MatrixType>
-void BDCSVD<MatrixType>::deflation(Index firstCol, Index lastCol, Index k, Index firstRowW, Index firstColW, Index shift){
-  //condition 4.1
-  RealScalar EPS = EPSILON * (std::max<RealScalar>(m_computed(firstCol + shift + 1, firstCol + shift + 1), m_computed(firstCol + k, firstCol + k)));
-  const Index length = lastCol + 1 - firstCol;
-  if (m_computed(firstCol + shift, firstCol + shift) < EPS){
-    m_computed(firstCol + shift, firstCol + shift) = EPS;
-  }
-  //condition 4.2
-  for (Index i=firstCol + shift + 1;i<=lastCol + shift;i++){
-    if (std::abs(m_computed(i, firstCol + shift)) < EPS){
-      m_computed(i, firstCol + shift) = 0;
-    }
-  }
-
-  //condition 4.3
-  for (Index i=firstCol + shift + 1;i<=lastCol + shift; i++){
-    if (m_computed(i, i) < EPS){
-      deflation43(firstCol, shift, i, length);
-    }
-  }
-
-  //condition 4.4
- 
-  Index i=firstCol + shift + 1, j=firstCol + shift + k + 1;
-  //we stock the final place of each line
-  Index *permutation = new Index[length];
-
-  for (Index p =1; p < length; p++) {
-    if (i> firstCol + shift + k){
-      permutation[p] = j;
-      j++;
-    } else if (j> lastCol + shift) 
-    {
-      permutation[p] = i;
-      i++;
-    }
-    else 
-    {
-      if (m_computed(i, i) < m_computed(j, j)){
-        permutation[p] = j;
-        j++;
-      } 
-      else
-      {
-        permutation[p] = i;
-        i++;
-      }
-    }
-  }
-  //we do the permutation
-  RealScalar aux;
-  //we stock the current index of each col
-  //and the column of each index
-  Index *realInd = new Index[length];
-  Index *realCol = new Index[length];
-  for (int pos = 0; pos< length; pos++){
-    realCol[pos] = pos + firstCol + shift;
-    realInd[pos] = pos;
-  }
-  const Index Zero = firstCol + shift;
-  VectorType temp;
-  for (int i = 1; i < length - 1; i++){
-    const Index I = i + Zero;
-    const Index realI = realInd[i];
-    const Index j  = permutation[length - i] - Zero;
-    const Index J = realCol[j];
-    
-    //diag displace
-    aux = m_computed(I, I); 
-    m_computed(I, I) = m_computed(J, J);
-    m_computed(J, J) = aux;
-    
-    //firstrow displace
-    aux = m_computed(I, Zero); 
-    m_computed(I, Zero) = m_computed(J, Zero);
-    m_computed(J, Zero) = aux;
-
-    // change columns
-    if (compU) {
-      temp = m_naiveU.col(I - shift).segment(firstCol, length + 1);
-      m_naiveU.col(I - shift).segment(firstCol, length + 1) << 
-        m_naiveU.col(J - shift).segment(firstCol, length + 1);
-      m_naiveU.col(J - shift).segment(firstCol, length + 1) << temp;
-    } 
-    else
-    {
-      temp = m_naiveU.col(I - shift).segment(0, 2);
-      m_naiveU.col(I - shift).segment(0, 2) << 
-        m_naiveU.col(J - shift).segment(0, 2);
-      m_naiveU.col(J - shift).segment(0, 2) << temp;      
-    }
-    if (compV) {
-      const Index CWI = I + firstColW - Zero;
-      const Index CWJ = J + firstColW - Zero;
-      temp = m_naiveV.col(CWI).segment(firstRowW, length);
-      m_naiveV.col(CWI).segment(firstRowW, length) << m_naiveV.col(CWJ).segment(firstRowW, length);
-      m_naiveV.col(CWJ).segment(firstRowW, length) << temp;
-    }
-
-    //update real pos
-    realCol[realI] = J;
-    realCol[j] = I;
-    realInd[J - Zero] = realI;
-    realInd[I - Zero] = j;
-  }
-  for (Index i = firstCol + shift + 1; i<lastCol + shift;i++){
-    if ((m_computed(i + 1, i + 1) - m_computed(i, i)) < EPS){
-      deflation44(firstCol , 
-		  firstCol + shift, 
-		  firstRowW, 
-		  firstColW, 
-		  i - Zero, 
-		  i + 1 - Zero, 
-		  length);
-    }
-  }
-  delete [] permutation;
-  delete [] realInd;
-  delete [] realCol;
-
-}//end deflation
-
-
-namespace internal{
-
-template<typename _MatrixType, typename Rhs>
-struct solve_retval<BDCSVD<_MatrixType>, Rhs>
-  : solve_retval_base<BDCSVD<_MatrixType>, Rhs>
-{
-  typedef BDCSVD<_MatrixType> BDCSVDType;
-  EIGEN_MAKE_SOLVE_HELPERS(BDCSVDType, Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    eigen_assert(rhs().rows() == dec().rows());
-    // A = U S V^*
-    // So A^{ - 1} = V S^{ - 1} U^*    
-    Index diagSize = (std::min)(dec().rows(), dec().cols());
-    typename BDCSVDType::SingularValuesType invertedSingVals(diagSize);
-    Index nonzeroSingVals = dec().nonzeroSingularValues();
-    invertedSingVals.head(nonzeroSingVals) = dec().singularValues().head(nonzeroSingVals).array().inverse();
-    invertedSingVals.tail(diagSize - nonzeroSingVals).setZero();
-    
-    dst = dec().matrixV().leftCols(diagSize)
-      * invertedSingVals.asDiagonal()
-      * dec().matrixU().leftCols(diagSize).adjoint()
-      * rhs();	
-    return;
-  }
-};
-
-} //end namespace internal
-
-  /** \svd_module
-   *
-   * \return the singular value decomposition of \c *this computed by 
-   *  BDC Algorithm
-   *
-   * \sa class BDCSVD
-   */
-/*
-template<typename Derived>
-BDCSVD<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::bdcSvd(unsigned int computationOptions) const
-{
-  return BDCSVD<PlainObject>(*this, computationOptions);
-}
-*/
-
-} // end namespace Eigen
-
-#endif
diff --git a/inst/include/unsupported/Eigen/src/SVD/JacobiSVD.h b/inst/include/unsupported/Eigen/src/SVD/JacobiSVD.h
deleted file mode 100644
index 02fac409..00000000
--- a/inst/include/unsupported/Eigen/src/SVD/JacobiSVD.h
+++ /dev/null
@@ -1,782 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_JACOBISVD_H
-#define EIGEN_JACOBISVD_H
-
-namespace Eigen { 
-
-namespace internal {
-// forward declaration (needed by ICC)
-// the empty body is required by MSVC
-template<typename MatrixType, int QRPreconditioner,
-         bool IsComplex = NumTraits<typename MatrixType::Scalar>::IsComplex>
-struct svd_precondition_2x2_block_to_be_real {};
-
-/*** QR preconditioners (R-SVD)
- ***
- *** Their role is to reduce the problem of computing the SVD to the case of a square matrix.
- *** This approach, known as R-SVD, is an optimization for rectangular-enough matrices, and is a requirement for
- *** JacobiSVD which by itself is only able to work on square matrices.
- ***/
-
-enum { PreconditionIfMoreColsThanRows, PreconditionIfMoreRowsThanCols };
-
-template<typename MatrixType, int QRPreconditioner, int Case>
-struct qr_preconditioner_should_do_anything
-{
-  enum { a = MatrixType::RowsAtCompileTime != Dynamic &&
-             MatrixType::ColsAtCompileTime != Dynamic &&
-             MatrixType::ColsAtCompileTime <= MatrixType::RowsAtCompileTime,
-         b = MatrixType::RowsAtCompileTime != Dynamic &&
-             MatrixType::ColsAtCompileTime != Dynamic &&
-             MatrixType::RowsAtCompileTime <= MatrixType::ColsAtCompileTime,
-         ret = !( (QRPreconditioner == NoQRPreconditioner) ||
-                  (Case == PreconditionIfMoreColsThanRows && bool(a)) ||
-                  (Case == PreconditionIfMoreRowsThanCols && bool(b)) )
-  };
-};
-
-template<typename MatrixType, int QRPreconditioner, int Case,
-         bool DoAnything = qr_preconditioner_should_do_anything<MatrixType, QRPreconditioner, Case>::ret
-> struct qr_preconditioner_impl {};
-
-template<typename MatrixType, int QRPreconditioner, int Case>
-class qr_preconditioner_impl<MatrixType, QRPreconditioner, Case, false>
-{
-public:
-  typedef typename MatrixType::Index Index;
-  void allocate(const JacobiSVD<MatrixType, QRPreconditioner>&) {}
-  bool run(JacobiSVD<MatrixType, QRPreconditioner>&, const MatrixType&)
-  {
-    return false;
-  }
-};
-
-/*** preconditioner using FullPivHouseholderQR ***/
-
-template<typename MatrixType>
-class qr_preconditioner_impl<MatrixType, FullPivHouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols, true>
-{
-public:
-  typedef typename MatrixType::Index Index;
-  typedef typename MatrixType::Scalar Scalar;
-  enum
-  {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime
-  };
-  typedef Matrix<Scalar, 1, RowsAtCompileTime, RowMajor, 1, MaxRowsAtCompileTime> WorkspaceType;
-
-  void allocate(const JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd)
-  {
-    if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols())
-    {
-      m_qr.~QRType();
-      ::new (&m_qr) QRType(svd.rows(), svd.cols());
-    }
-    if (svd.m_computeFullU) m_workspace.resize(svd.rows());
-  }
-
-  bool run(JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd, const MatrixType& matrix)
-  {
-    if(matrix.rows() > matrix.cols())
-    {
-      m_qr.compute(matrix);
-      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.cols(),matrix.cols()).template triangularView<Upper>();
-      if(svd.m_computeFullU) m_qr.matrixQ().evalTo(svd.m_matrixU, m_workspace);
-      if(svd.computeV()) svd.m_matrixV = m_qr.colsPermutation();
-      return true;
-    }
-    return false;
-  }
-private:
-  typedef FullPivHouseholderQR<MatrixType> QRType;
-  QRType m_qr;
-  WorkspaceType m_workspace;
-};
-
-template<typename MatrixType>
-class qr_preconditioner_impl<MatrixType, FullPivHouseholderQRPreconditioner, PreconditionIfMoreColsThanRows, true>
-{
-public:
-  typedef typename MatrixType::Index Index;
-  typedef typename MatrixType::Scalar Scalar;
-  enum
-  {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Options = MatrixType::Options
-  };
-  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
-          TransposeTypeWithSameStorageOrder;
-
-  void allocate(const JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd)
-  {
-    if (svd.cols() != m_qr.rows() || svd.rows() != m_qr.cols())
-    {
-      m_qr.~QRType();
-      ::new (&m_qr) QRType(svd.cols(), svd.rows());
-    }
-    m_adjoint.resize(svd.cols(), svd.rows());
-    if (svd.m_computeFullV) m_workspace.resize(svd.cols());
-  }
-
-  bool run(JacobiSVD<MatrixType, FullPivHouseholderQRPreconditioner>& svd, const MatrixType& matrix)
-  {
-    if(matrix.cols() > matrix.rows())
-    {
-      m_adjoint = matrix.adjoint();
-      m_qr.compute(m_adjoint);
-      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.rows(),matrix.rows()).template triangularView<Upper>().adjoint();
-      if(svd.m_computeFullV) m_qr.matrixQ().evalTo(svd.m_matrixV, m_workspace);
-      if(svd.computeU()) svd.m_matrixU = m_qr.colsPermutation();
-      return true;
-    }
-    else return false;
-  }
-private:
-  typedef FullPivHouseholderQR<TransposeTypeWithSameStorageOrder> QRType;
-  QRType m_qr;
-  TransposeTypeWithSameStorageOrder m_adjoint;
-  typename internal::plain_row_type<MatrixType>::type m_workspace;
-};
-
-/*** preconditioner using ColPivHouseholderQR ***/
-
-template<typename MatrixType>
-class qr_preconditioner_impl<MatrixType, ColPivHouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols, true>
-{
-public:
-  typedef typename MatrixType::Index Index;
-
-  void allocate(const JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd)
-  {
-    if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols())
-    {
-      m_qr.~QRType();
-      ::new (&m_qr) QRType(svd.rows(), svd.cols());
-    }
-    if (svd.m_computeFullU) m_workspace.resize(svd.rows());
-    else if (svd.m_computeThinU) m_workspace.resize(svd.cols());
-  }
-
-  bool run(JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd, const MatrixType& matrix)
-  {
-    if(matrix.rows() > matrix.cols())
-    {
-      m_qr.compute(matrix);
-      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.cols(),matrix.cols()).template triangularView<Upper>();
-      if(svd.m_computeFullU) m_qr.householderQ().evalTo(svd.m_matrixU, m_workspace);
-      else if(svd.m_computeThinU)
-      {
-        svd.m_matrixU.setIdentity(matrix.rows(), matrix.cols());
-        m_qr.householderQ().applyThisOnTheLeft(svd.m_matrixU, m_workspace);
-      }
-      if(svd.computeV()) svd.m_matrixV = m_qr.colsPermutation();
-      return true;
-    }
-    return false;
-  }
-
-private:
-  typedef ColPivHouseholderQR<MatrixType> QRType;
-  QRType m_qr;
-  typename internal::plain_col_type<MatrixType>::type m_workspace;
-};
-
-template<typename MatrixType>
-class qr_preconditioner_impl<MatrixType, ColPivHouseholderQRPreconditioner, PreconditionIfMoreColsThanRows, true>
-{
-public:
-  typedef typename MatrixType::Index Index;
-  typedef typename MatrixType::Scalar Scalar;
-  enum
-  {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Options = MatrixType::Options
-  };
-
-  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
-          TransposeTypeWithSameStorageOrder;
-
-  void allocate(const JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd)
-  {
-    if (svd.cols() != m_qr.rows() || svd.rows() != m_qr.cols())
-    {
-      m_qr.~QRType();
-      ::new (&m_qr) QRType(svd.cols(), svd.rows());
-    }
-    if (svd.m_computeFullV) m_workspace.resize(svd.cols());
-    else if (svd.m_computeThinV) m_workspace.resize(svd.rows());
-    m_adjoint.resize(svd.cols(), svd.rows());
-  }
-
-  bool run(JacobiSVD<MatrixType, ColPivHouseholderQRPreconditioner>& svd, const MatrixType& matrix)
-  {
-    if(matrix.cols() > matrix.rows())
-    {
-      m_adjoint = matrix.adjoint();
-      m_qr.compute(m_adjoint);
-
-      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.rows(),matrix.rows()).template triangularView<Upper>().adjoint();
-      if(svd.m_computeFullV) m_qr.householderQ().evalTo(svd.m_matrixV, m_workspace);
-      else if(svd.m_computeThinV)
-      {
-        svd.m_matrixV.setIdentity(matrix.cols(), matrix.rows());
-        m_qr.householderQ().applyThisOnTheLeft(svd.m_matrixV, m_workspace);
-      }
-      if(svd.computeU()) svd.m_matrixU = m_qr.colsPermutation();
-      return true;
-    }
-    else return false;
-  }
-
-private:
-  typedef ColPivHouseholderQR<TransposeTypeWithSameStorageOrder> QRType;
-  QRType m_qr;
-  TransposeTypeWithSameStorageOrder m_adjoint;
-  typename internal::plain_row_type<MatrixType>::type m_workspace;
-};
-
-/*** preconditioner using HouseholderQR ***/
-
-template<typename MatrixType>
-class qr_preconditioner_impl<MatrixType, HouseholderQRPreconditioner, PreconditionIfMoreRowsThanCols, true>
-{
-public:
-  typedef typename MatrixType::Index Index;
-
-  void allocate(const JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd)
-  {
-    if (svd.rows() != m_qr.rows() || svd.cols() != m_qr.cols())
-    {
-      m_qr.~QRType();
-      ::new (&m_qr) QRType(svd.rows(), svd.cols());
-    }
-    if (svd.m_computeFullU) m_workspace.resize(svd.rows());
-    else if (svd.m_computeThinU) m_workspace.resize(svd.cols());
-  }
-
-  bool run(JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd, const MatrixType& matrix)
-  {
-    if(matrix.rows() > matrix.cols())
-    {
-      m_qr.compute(matrix);
-      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.cols(),matrix.cols()).template triangularView<Upper>();
-      if(svd.m_computeFullU) m_qr.householderQ().evalTo(svd.m_matrixU, m_workspace);
-      else if(svd.m_computeThinU)
-      {
-        svd.m_matrixU.setIdentity(matrix.rows(), matrix.cols());
-        m_qr.householderQ().applyThisOnTheLeft(svd.m_matrixU, m_workspace);
-      }
-      if(svd.computeV()) svd.m_matrixV.setIdentity(matrix.cols(), matrix.cols());
-      return true;
-    }
-    return false;
-  }
-private:
-  typedef HouseholderQR<MatrixType> QRType;
-  QRType m_qr;
-  typename internal::plain_col_type<MatrixType>::type m_workspace;
-};
-
-template<typename MatrixType>
-class qr_preconditioner_impl<MatrixType, HouseholderQRPreconditioner, PreconditionIfMoreColsThanRows, true>
-{
-public:
-  typedef typename MatrixType::Index Index;
-  typedef typename MatrixType::Scalar Scalar;
-  enum
-  {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    Options = MatrixType::Options
-  };
-
-  typedef Matrix<Scalar, ColsAtCompileTime, RowsAtCompileTime, Options, MaxColsAtCompileTime, MaxRowsAtCompileTime>
-          TransposeTypeWithSameStorageOrder;
-
-  void allocate(const JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd)
-  {
-    if (svd.cols() != m_qr.rows() || svd.rows() != m_qr.cols())
-    {
-      m_qr.~QRType();
-      ::new (&m_qr) QRType(svd.cols(), svd.rows());
-    }
-    if (svd.m_computeFullV) m_workspace.resize(svd.cols());
-    else if (svd.m_computeThinV) m_workspace.resize(svd.rows());
-    m_adjoint.resize(svd.cols(), svd.rows());
-  }
-
-  bool run(JacobiSVD<MatrixType, HouseholderQRPreconditioner>& svd, const MatrixType& matrix)
-  {
-    if(matrix.cols() > matrix.rows())
-    {
-      m_adjoint = matrix.adjoint();
-      m_qr.compute(m_adjoint);
-
-      svd.m_workMatrix = m_qr.matrixQR().block(0,0,matrix.rows(),matrix.rows()).template triangularView<Upper>().adjoint();
-      if(svd.m_computeFullV) m_qr.householderQ().evalTo(svd.m_matrixV, m_workspace);
-      else if(svd.m_computeThinV)
-      {
-        svd.m_matrixV.setIdentity(matrix.cols(), matrix.rows());
-        m_qr.householderQ().applyThisOnTheLeft(svd.m_matrixV, m_workspace);
-      }
-      if(svd.computeU()) svd.m_matrixU.setIdentity(matrix.rows(), matrix.rows());
-      return true;
-    }
-    else return false;
-  }
-
-private:
-  typedef HouseholderQR<TransposeTypeWithSameStorageOrder> QRType;
-  QRType m_qr;
-  TransposeTypeWithSameStorageOrder m_adjoint;
-  typename internal::plain_row_type<MatrixType>::type m_workspace;
-};
-
-/*** 2x2 SVD implementation
- ***
- *** JacobiSVD consists in performing a series of 2x2 SVD subproblems
- ***/
-
-template<typename MatrixType, int QRPreconditioner>
-struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, false>
-{
-  typedef JacobiSVD<MatrixType, QRPreconditioner> SVD;
-  typedef typename SVD::Index Index;
-  static void run(typename SVD::WorkMatrixType&, SVD&, Index, Index) {}
-};
-
-template<typename MatrixType, int QRPreconditioner>
-struct svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner, true>
-{
-  typedef JacobiSVD<MatrixType, QRPreconditioner> SVD;
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename MatrixType::RealScalar RealScalar;
-  typedef typename SVD::Index Index;
-  static void run(typename SVD::WorkMatrixType& work_matrix, SVD& svd, Index p, Index q)
-  {
-    using std::sqrt;
-    Scalar z;
-    JacobiRotation<Scalar> rot;
-    RealScalar n = sqrt(numext::abs2(work_matrix.coeff(p,p)) + numext::abs2(work_matrix.coeff(q,p)));
-    if(n==0)
-    {
-      z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
-      work_matrix.row(p) *= z;
-      if(svd.computeU()) svd.m_matrixU.col(p) *= conj(z);
-      z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q);
-      work_matrix.row(q) *= z;
-      if(svd.computeU()) svd.m_matrixU.col(q) *= conj(z);
-    }
-    else
-    {
-      rot.c() = conj(work_matrix.coeff(p,p)) / n;
-      rot.s() = work_matrix.coeff(q,p) / n;
-      work_matrix.applyOnTheLeft(p,q,rot);
-      if(svd.computeU()) svd.m_matrixU.applyOnTheRight(p,q,rot.adjoint());
-      if(work_matrix.coeff(p,q) != Scalar(0))
-      {
-        Scalar z = abs(work_matrix.coeff(p,q)) / work_matrix.coeff(p,q);
-        work_matrix.col(q) *= z;
-        if(svd.computeV()) svd.m_matrixV.col(q) *= z;
-      }
-      if(work_matrix.coeff(q,q) != Scalar(0))
-      {
-        z = abs(work_matrix.coeff(q,q)) / work_matrix.coeff(q,q);
-        work_matrix.row(q) *= z;
-        if(svd.computeU()) svd.m_matrixU.col(q) *= conj(z);
-      }
-    }
-  }
-};
-
-template<typename MatrixType, typename RealScalar, typename Index>
-void real_2x2_jacobi_svd(const MatrixType& matrix, Index p, Index q,
-                            JacobiRotation<RealScalar> *j_left,
-                            JacobiRotation<RealScalar> *j_right)
-{
-  using std::sqrt;
-  Matrix<RealScalar,2,2> m;
-  m << numext::real(matrix.coeff(p,p)), numext::real(matrix.coeff(p,q)),
-       numext::real(matrix.coeff(q,p)), numext::real(matrix.coeff(q,q));
-  JacobiRotation<RealScalar> rot1;
-  RealScalar t = m.coeff(0,0) + m.coeff(1,1);
-  RealScalar d = m.coeff(1,0) - m.coeff(0,1);
-  if(t == RealScalar(0))
-  {
-    rot1.c() = RealScalar(0);
-    rot1.s() = d > RealScalar(0) ? RealScalar(1) : RealScalar(-1);
-  }
-  else
-  {
-    RealScalar u = d / t;
-    rot1.c() = RealScalar(1) / sqrt(RealScalar(1) + numext::abs2(u));
-    rot1.s() = rot1.c() * u;
-  }
-  m.applyOnTheLeft(0,1,rot1);
-  j_right->makeJacobi(m,0,1);
-  *j_left  = rot1 * j_right->transpose();
-}
-
-} // end namespace internal
-
-/** \ingroup SVD_Module
-  *
-  *
-  * \class JacobiSVD
-  *
-  * \brief Two-sided Jacobi SVD decomposition of a rectangular matrix
-  *
-  * \param MatrixType the type of the matrix of which we are computing the SVD decomposition
-  * \param QRPreconditioner this optional parameter allows to specify the type of QR decomposition that will be used internally
-  *                        for the R-SVD step for non-square matrices. See discussion of possible values below.
-  *
-  * SVD decomposition consists in decomposing any n-by-p matrix \a A as a product
-  *   \f[ A = U S V^* \f]
-  * where \a U is a n-by-n unitary, \a V is a p-by-p unitary, and \a S is a n-by-p real positive matrix which is zero outside of its main diagonal;
-  * the diagonal entries of S are known as the \em singular \em values of \a A and the columns of \a U and \a V are known as the left
-  * and right \em singular \em vectors of \a A respectively.
-  *
-  * Singular values are always sorted in decreasing order.
-  *
-  * This JacobiSVD decomposition computes only the singular values by default. If you want \a U or \a V, you need to ask for them explicitly.
-  *
-  * You can ask for only \em thin \a U or \a V to be computed, meaning the following. In case of a rectangular n-by-p matrix, letting \a m be the
-  * smaller value among \a n and \a p, there are only \a m singular vectors; the remaining columns of \a U and \a V do not correspond to actual
-  * singular vectors. Asking for \em thin \a U or \a V means asking for only their \a m first columns to be formed. So \a U is then a n-by-m matrix,
-  * and \a V is then a p-by-m matrix. Notice that thin \a U and \a V are all you need for (least squares) solving.
-  *
-  * Here's an example demonstrating basic usage:
-  * \include JacobiSVD_basic.cpp
-  * Output: \verbinclude JacobiSVD_basic.out
-  *
-  * This JacobiSVD class is a two-sided Jacobi R-SVD decomposition, ensuring optimal reliability and accuracy. The downside is that it's slower than
-  * bidiagonalizing SVD algorithms for large square matrices; however its complexity is still \f$ O(n^2p) \f$ where \a n is the smaller dimension and
-  * \a p is the greater dimension, meaning that it is still of the same order of complexity as the faster bidiagonalizing R-SVD algorithms.
-  * In particular, like any R-SVD, it takes advantage of non-squareness in that its complexity is only linear in the greater dimension.
-  *
-  * If the input matrix has inf or nan coefficients, the result of the computation is undefined, but the computation is guaranteed to
-  * terminate in finite (and reasonable) time.
-  *
-  * The possible values for QRPreconditioner are:
-  * \li ColPivHouseholderQRPreconditioner is the default. In practice it's very safe. It uses column-pivoting QR.
-  * \li FullPivHouseholderQRPreconditioner, is the safest and slowest. It uses full-pivoting QR.
-  *     Contrary to other QRs, it doesn't allow computing thin unitaries.
-  * \li HouseholderQRPreconditioner is the fastest, and less safe and accurate than the pivoting variants. It uses non-pivoting QR.
-  *     This is very similar in safety and accuracy to the bidiagonalization process used by bidiagonalizing SVD algorithms (since bidiagonalization
-  *     is inherently non-pivoting). However the resulting SVD is still more reliable than bidiagonalizing SVDs because the Jacobi-based iterarive
-  *     process is more reliable than the optimized bidiagonal SVD iterations.
-  * \li NoQRPreconditioner allows not to use a QR preconditioner at all. This is useful if you know that you will only be computing
-  *     JacobiSVD decompositions of square matrices. Non-square matrices require a QR preconditioner. Using this option will result in
-  *     faster compilation and smaller executable code. It won't significantly speed up computation, since JacobiSVD is always checking
-  *     if QR preconditioning is needed before applying it anyway.
-  *
-  * \sa MatrixBase::jacobiSvd()
-  */
-template<typename _MatrixType, int QRPreconditioner> 
-class JacobiSVD : public SVDBase<_MatrixType>
-{
-  public:
-
-    typedef _MatrixType MatrixType;
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-    typedef typename MatrixType::Index Index;
-    enum {
-      RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-      ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-      DiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime,ColsAtCompileTime),
-      MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-      MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-      MaxDiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(MaxRowsAtCompileTime,MaxColsAtCompileTime),
-      MatrixOptions = MatrixType::Options
-    };
-
-    typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime,
-                   MatrixOptions, MaxRowsAtCompileTime, MaxRowsAtCompileTime>
-            MatrixUType;
-    typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime,
-                   MatrixOptions, MaxColsAtCompileTime, MaxColsAtCompileTime>
-            MatrixVType;
-    typedef typename internal::plain_diag_type<MatrixType, RealScalar>::type SingularValuesType;
-    typedef typename internal::plain_row_type<MatrixType>::type RowType;
-    typedef typename internal::plain_col_type<MatrixType>::type ColType;
-    typedef Matrix<Scalar, DiagSizeAtCompileTime, DiagSizeAtCompileTime,
-                   MatrixOptions, MaxDiagSizeAtCompileTime, MaxDiagSizeAtCompileTime>
-            WorkMatrixType;
-
-    /** \brief Default Constructor.
-      *
-      * The default constructor is useful in cases in which the user intends to
-      * perform decompositions via JacobiSVD::compute(const MatrixType&).
-      */
-    JacobiSVD()
-      : SVDBase<_MatrixType>::SVDBase()
-    {}
-
-
-    /** \brief Default Constructor with memory preallocation
-      *
-      * Like the default constructor but with preallocation of the internal data
-      * according to the specified problem size.
-      * \sa JacobiSVD()
-      */
-    JacobiSVD(Index rows, Index cols, unsigned int computationOptions = 0)
-      : SVDBase<_MatrixType>::SVDBase() 
-    {
-      allocate(rows, cols, computationOptions);
-    }
-
-    /** \brief Constructor performing the decomposition of given matrix.
-     *
-     * \param matrix the matrix to decompose
-     * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
-     *                           By default, none is computed. This is a bit-field, the possible bits are #ComputeFullU, #ComputeThinU,
-     *                           #ComputeFullV, #ComputeThinV.
-     *
-     * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
-     * available with the (non-default) FullPivHouseholderQR preconditioner.
-     */
-    JacobiSVD(const MatrixType& matrix, unsigned int computationOptions = 0)
-      : SVDBase<_MatrixType>::SVDBase()
-    {
-      compute(matrix, computationOptions);
-    }
-
-    /** \brief Method performing the decomposition of given matrix using custom options.
-     *
-     * \param matrix the matrix to decompose
-     * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
-     *                           By default, none is computed. This is a bit-field, the possible bits are #ComputeFullU, #ComputeThinU,
-     *                           #ComputeFullV, #ComputeThinV.
-     *
-     * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
-     * available with the (non-default) FullPivHouseholderQR preconditioner.
-     */
-    SVDBase<MatrixType>& compute(const MatrixType& matrix, unsigned int computationOptions);
-
-    /** \brief Method performing the decomposition of given matrix using current options.
-     *
-     * \param matrix the matrix to decompose
-     *
-     * This method uses the current \a computationOptions, as already passed to the constructor or to compute(const MatrixType&, unsigned int).
-     */
-    SVDBase<MatrixType>& compute(const MatrixType& matrix)
-    {
-      return compute(matrix, this->m_computationOptions);
-    }
-    
-    /** \returns a (least squares) solution of \f$ A x = b \f$ using the current SVD decomposition of A.
-      *
-      * \param b the right-hand-side of the equation to solve.
-      *
-      * \note Solving requires both U and V to be computed. Thin U and V are enough, there is no need for full U or V.
-      *
-      * \note SVD solving is implicitly least-squares. Thus, this method serves both purposes of exact solving and least-squares solving.
-      * In other words, the returned solution is guaranteed to minimize the Euclidean norm \f$ \Vert A x - b \Vert \f$.
-      */
-    template<typename Rhs>
-    inline const internal::solve_retval<JacobiSVD, Rhs>
-    solve(const MatrixBase<Rhs>& b) const
-    {
-      eigen_assert(this->m_isInitialized && "JacobiSVD is not initialized.");
-      eigen_assert(SVDBase<MatrixType>::computeU() && SVDBase<MatrixType>::computeV() && "JacobiSVD::solve() requires both unitaries U and V to be computed (thin unitaries suffice).");
-      return internal::solve_retval<JacobiSVD, Rhs>(*this, b.derived());
-    }
-
-    
-
-  private:
-    void allocate(Index rows, Index cols, unsigned int computationOptions);
-
-  protected:
-    WorkMatrixType m_workMatrix;
-   
-    template<typename __MatrixType, int _QRPreconditioner, bool _IsComplex>
-    friend struct internal::svd_precondition_2x2_block_to_be_real;
-    template<typename __MatrixType, int _QRPreconditioner, int _Case, bool _DoAnything>
-    friend struct internal::qr_preconditioner_impl;
-
-    internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreColsThanRows> m_qr_precond_morecols;
-    internal::qr_preconditioner_impl<MatrixType, QRPreconditioner, internal::PreconditionIfMoreRowsThanCols> m_qr_precond_morerows;
-};
-
-template<typename MatrixType, int QRPreconditioner>
-void JacobiSVD<MatrixType, QRPreconditioner>::allocate(Index rows, Index cols, unsigned int computationOptions)
-{
-  if (SVDBase<MatrixType>::allocate(rows, cols, computationOptions)) return;
-
-  if (QRPreconditioner == FullPivHouseholderQRPreconditioner)
-  {
-      eigen_assert(!(this->m_computeThinU || this->m_computeThinV) &&
-              "JacobiSVD: can't compute thin U or thin V with the FullPivHouseholderQR preconditioner. "
-              "Use the ColPivHouseholderQR preconditioner instead.");
-  }
-
-  m_workMatrix.resize(this->m_diagSize, this->m_diagSize);
-  
-  if(this->m_cols>this->m_rows) m_qr_precond_morecols.allocate(*this);
-  if(this->m_rows>this->m_cols) m_qr_precond_morerows.allocate(*this);
-}
-
-template<typename MatrixType, int QRPreconditioner>
-SVDBase<MatrixType>&
-JacobiSVD<MatrixType, QRPreconditioner>::compute(const MatrixType& matrix, unsigned int computationOptions)
-{
-  using std::abs;
-  allocate(matrix.rows(), matrix.cols(), computationOptions);
-
-  // currently we stop when we reach precision 2*epsilon as the last bit of precision can require an unreasonable number of iterations,
-  // only worsening the precision of U and V as we accumulate more rotations
-  const RealScalar precision = RealScalar(2) * NumTraits<Scalar>::epsilon();
-
-  // limit for very small denormal numbers to be considered zero in order to avoid infinite loops (see bug 286)
-  const RealScalar considerAsZero = RealScalar(2) * std::numeric_limits<RealScalar>::denorm_min();
-
-  /*** step 1. The R-SVD step: we use a QR decomposition to reduce to the case of a square matrix */
-
-  if(!m_qr_precond_morecols.run(*this, matrix) && !m_qr_precond_morerows.run(*this, matrix))
-  {
-    m_workMatrix = matrix.block(0,0,this->m_diagSize,this->m_diagSize);
-    if(this->m_computeFullU) this->m_matrixU.setIdentity(this->m_rows,this->m_rows);
-    if(this->m_computeThinU) this->m_matrixU.setIdentity(this->m_rows,this->m_diagSize);
-    if(this->m_computeFullV) this->m_matrixV.setIdentity(this->m_cols,this->m_cols);
-    if(this->m_computeThinV) this->m_matrixV.setIdentity(this->m_cols, this->m_diagSize);
-  }
-
-  /*** step 2. The main Jacobi SVD iteration. ***/
-
-  bool finished = false;
-  while(!finished)
-  {
-    finished = true;
-
-    // do a sweep: for all index pairs (p,q), perform SVD of the corresponding 2x2 sub-matrix
-
-    for(Index p = 1; p < this->m_diagSize; ++p)
-    {
-      for(Index q = 0; q < p; ++q)
-      {
-        // if this 2x2 sub-matrix is not diagonal already...
-        // notice that this comparison will evaluate to false if any NaN is involved, ensuring that NaN's don't
-        // keep us iterating forever. Similarly, small denormal numbers are considered zero.
-        using std::max;
-        RealScalar threshold = (max)(considerAsZero, precision * (max)(abs(m_workMatrix.coeff(p,p)),
-                                                                       abs(m_workMatrix.coeff(q,q))));
-        if((max)(abs(m_workMatrix.coeff(p,q)),abs(m_workMatrix.coeff(q,p))) > threshold)
-        {
-          finished = false;
-
-          // perform SVD decomposition of 2x2 sub-matrix corresponding to indices p,q to make it diagonal
-          internal::svd_precondition_2x2_block_to_be_real<MatrixType, QRPreconditioner>::run(m_workMatrix, *this, p, q);
-          JacobiRotation<RealScalar> j_left, j_right;
-          internal::real_2x2_jacobi_svd(m_workMatrix, p, q, &j_left, &j_right);
-
-          // accumulate resulting Jacobi rotations
-          m_workMatrix.applyOnTheLeft(p,q,j_left);
-          if(SVDBase<MatrixType>::computeU()) this->m_matrixU.applyOnTheRight(p,q,j_left.transpose());
-
-          m_workMatrix.applyOnTheRight(p,q,j_right);
-          if(SVDBase<MatrixType>::computeV()) this->m_matrixV.applyOnTheRight(p,q,j_right);
-        }
-      }
-    }
-  }
-
-  /*** step 3. The work matrix is now diagonal, so ensure it's positive so its diagonal entries are the singular values ***/
-
-  for(Index i = 0; i < this->m_diagSize; ++i)
-  {
-    RealScalar a = abs(m_workMatrix.coeff(i,i));
-    this->m_singularValues.coeffRef(i) = a;
-    if(SVDBase<MatrixType>::computeU() && (a!=RealScalar(0))) this->m_matrixU.col(i) *= this->m_workMatrix.coeff(i,i)/a;
-  }
-
-  /*** step 4. Sort singular values in descending order and compute the number of nonzero singular values ***/
-
-  this->m_nonzeroSingularValues = this->m_diagSize;
-  for(Index i = 0; i < this->m_diagSize; i++)
-  {
-    Index pos;
-    RealScalar maxRemainingSingularValue = this->m_singularValues.tail(this->m_diagSize-i).maxCoeff(&pos);
-    if(maxRemainingSingularValue == RealScalar(0))
-    {
-      this->m_nonzeroSingularValues = i;
-      break;
-    }
-    if(pos)
-    {
-      pos += i;
-      std::swap(this->m_singularValues.coeffRef(i), this->m_singularValues.coeffRef(pos));
-      if(SVDBase<MatrixType>::computeU()) this->m_matrixU.col(pos).swap(this->m_matrixU.col(i));
-      if(SVDBase<MatrixType>::computeV()) this->m_matrixV.col(pos).swap(this->m_matrixV.col(i));
-    }
-  }
-
-  this->m_isInitialized = true;
-  return *this;
-}
-
-namespace internal {
-template<typename _MatrixType, int QRPreconditioner, typename Rhs>
-struct solve_retval<JacobiSVD<_MatrixType, QRPreconditioner>, Rhs>
-  : solve_retval_base<JacobiSVD<_MatrixType, QRPreconditioner>, Rhs>
-{
-  typedef JacobiSVD<_MatrixType, QRPreconditioner> JacobiSVDType;
-  EIGEN_MAKE_SOLVE_HELPERS(JacobiSVDType,Rhs)
-
-  template<typename Dest> void evalTo(Dest& dst) const
-  {
-    eigen_assert(rhs().rows() == dec().rows());
-
-    // A = U S V^*
-    // So A^{-1} = V S^{-1} U^*
-
-    Index diagSize = (std::min)(dec().rows(), dec().cols());
-    typename JacobiSVDType::SingularValuesType invertedSingVals(diagSize);
-
-    Index nonzeroSingVals = dec().nonzeroSingularValues();
-    invertedSingVals.head(nonzeroSingVals) = dec().singularValues().head(nonzeroSingVals).array().inverse();
-    invertedSingVals.tail(diagSize - nonzeroSingVals).setZero();
-
-    dst = dec().matrixV().leftCols(diagSize)
-        * invertedSingVals.asDiagonal()
-        * dec().matrixU().leftCols(diagSize).adjoint()
-        * rhs();
-  }
-};
-} // end namespace internal
-
-/** \svd_module
-  *
-  * \return the singular value decomposition of \c *this computed by two-sided
-  * Jacobi transformations.
-  *
-  * \sa class JacobiSVD
-  */
-template<typename Derived>
-JacobiSVD<typename MatrixBase<Derived>::PlainObject>
-MatrixBase<Derived>::jacobiSvd(unsigned int computationOptions) const
-{
-  return JacobiSVD<PlainObject>(*this, computationOptions);
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_JACOBISVD_H
diff --git a/inst/include/unsupported/Eigen/src/SVD/SVDBase.h b/inst/include/unsupported/Eigen/src/SVD/SVDBase.h
deleted file mode 100644
index fd8af3b8..00000000
--- a/inst/include/unsupported/Eigen/src/SVD/SVDBase.h
+++ /dev/null
@@ -1,236 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009-2010 Benoit Jacob <jacob.benoit.1@gmail.com>
-//
-// Copyright (C) 2013 Gauthier Brun <brun.gauthier@gmail.com>
-// Copyright (C) 2013 Nicolas Carre <nicolas.carre@ensimag.fr>
-// Copyright (C) 2013 Jean Ceccato <jean.ceccato@ensimag.fr>
-// Copyright (C) 2013 Pierre Zoppitelli <pierre.zoppitelli@ensimag.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_SVD_H
-#define EIGEN_SVD_H
-
-namespace Eigen {
-/** \ingroup SVD_Module
- *
- *
- * \class SVDBase
- *
- * \brief Mother class of SVD classes algorithms
- *
- * \param MatrixType the type of the matrix of which we are computing the SVD decomposition
- * SVD decomposition consists in decomposing any n-by-p matrix \a A as a product
- *   \f[ A = U S V^* \f]
- * where \a U is a n-by-n unitary, \a V is a p-by-p unitary, and \a S is a n-by-p real positive matrix which is zero outside of its main diagonal;
- * the diagonal entries of S are known as the \em singular \em values of \a A and the columns of \a U and \a V are known as the left
- * and right \em singular \em vectors of \a A respectively.
- *
- * Singular values are always sorted in decreasing order.
- *
- * 
- * You can ask for only \em thin \a U or \a V to be computed, meaning the following. In case of a rectangular n-by-p matrix, letting \a m be the
- * smaller value among \a n and \a p, there are only \a m singular vectors; the remaining columns of \a U and \a V do not correspond to actual
- * singular vectors. Asking for \em thin \a U or \a V means asking for only their \a m first columns to be formed. So \a U is then a n-by-m matrix,
- * and \a V is then a p-by-m matrix. Notice that thin \a U and \a V are all you need for (least squares) solving.
- *  
- * If the input matrix has inf or nan coefficients, the result of the computation is undefined, but the computation is guaranteed to
- * terminate in finite (and reasonable) time.
- * \sa MatrixBase::genericSvd()
- */
-template<typename _MatrixType> 
-class SVDBase
-{
-
-public:
-  typedef _MatrixType MatrixType;
-  typedef typename MatrixType::Scalar Scalar;
-  typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-  typedef typename MatrixType::Index Index;
-  enum {
-    RowsAtCompileTime = MatrixType::RowsAtCompileTime,
-    ColsAtCompileTime = MatrixType::ColsAtCompileTime,
-    DiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_DYNAMIC(RowsAtCompileTime,ColsAtCompileTime),
-    MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime,
-    MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime,
-    MaxDiagSizeAtCompileTime = EIGEN_SIZE_MIN_PREFER_FIXED(MaxRowsAtCompileTime,MaxColsAtCompileTime),
-    MatrixOptions = MatrixType::Options
-  };
-
-  typedef Matrix<Scalar, RowsAtCompileTime, RowsAtCompileTime,
-		 MatrixOptions, MaxRowsAtCompileTime, MaxRowsAtCompileTime>
-  MatrixUType;
-  typedef Matrix<Scalar, ColsAtCompileTime, ColsAtCompileTime,
-		 MatrixOptions, MaxColsAtCompileTime, MaxColsAtCompileTime>
-  MatrixVType;
-  typedef typename internal::plain_diag_type<MatrixType, RealScalar>::type SingularValuesType;
-  typedef typename internal::plain_row_type<MatrixType>::type RowType;
-  typedef typename internal::plain_col_type<MatrixType>::type ColType;
-  typedef Matrix<Scalar, DiagSizeAtCompileTime, DiagSizeAtCompileTime,
-		 MatrixOptions, MaxDiagSizeAtCompileTime, MaxDiagSizeAtCompileTime>
-  WorkMatrixType;
-	
-
-
-
-  /** \brief Method performing the decomposition of given matrix using custom options.
-   *
-   * \param matrix the matrix to decompose
-   * \param computationOptions optional parameter allowing to specify if you want full or thin U or V unitaries to be computed.
-   *                           By default, none is computed. This is a bit-field, the possible bits are #ComputeFullU, #ComputeThinU,
-   *                           #ComputeFullV, #ComputeThinV.
-   *
-   * Thin unitaries are only available if your matrix type has a Dynamic number of columns (for example MatrixXf). They also are not
-   * available with the (non-default) FullPivHouseholderQR preconditioner.
-   */
-  SVDBase& compute(const MatrixType& matrix, unsigned int computationOptions);
-
-  /** \brief Method performing the decomposition of given matrix using current options.
-   *
-   * \param matrix the matrix to decompose
-   *
-   * This method uses the current \a computationOptions, as already passed to the constructor or to compute(const MatrixType&, unsigned int).
-   */
-  //virtual SVDBase& compute(const MatrixType& matrix) = 0;
-  SVDBase& compute(const MatrixType& matrix);
-
-  /** \returns the \a U matrix.
-   *
-   * For the SVDBase decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p,
-   * the U matrix is n-by-n if you asked for #ComputeFullU, and is n-by-m if you asked for #ComputeThinU.
-   *
-   * The \a m first columns of \a U are the left singular vectors of the matrix being decomposed.
-   *
-   * This method asserts that you asked for \a U to be computed.
-   */
-  const MatrixUType& matrixU() const
-  {
-    eigen_assert(m_isInitialized && "SVD is not initialized.");
-    eigen_assert(computeU() && "This SVD decomposition didn't compute U. Did you ask for it?");
-    return m_matrixU;
-  }
-
-  /** \returns the \a V matrix.
-   *
-   * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p,
-   * the V matrix is p-by-p if you asked for #ComputeFullV, and is p-by-m if you asked for ComputeThinV.
-   *
-   * The \a m first columns of \a V are the right singular vectors of the matrix being decomposed.
-   *
-   * This method asserts that you asked for \a V to be computed.
-   */
-  const MatrixVType& matrixV() const
-  {
-    eigen_assert(m_isInitialized && "SVD is not initialized.");
-    eigen_assert(computeV() && "This SVD decomposition didn't compute V. Did you ask for it?");
-    return m_matrixV;
-  }
-
-  /** \returns the vector of singular values.
-   *
-   * For the SVD decomposition of a n-by-p matrix, letting \a m be the minimum of \a n and \a p, the
-   * returned vector has size \a m.  Singular values are always sorted in decreasing order.
-   */
-  const SingularValuesType& singularValues() const
-  {
-    eigen_assert(m_isInitialized && "SVD is not initialized.");
-    return m_singularValues;
-  }
-
-  
-
-  /** \returns the number of singular values that are not exactly 0 */
-  Index nonzeroSingularValues() const
-  {
-    eigen_assert(m_isInitialized && "SVD is not initialized.");
-    return m_nonzeroSingularValues;
-  }
-
-
-  /** \returns true if \a U (full or thin) is asked for in this SVD decomposition */
-  inline bool computeU() const { return m_computeFullU || m_computeThinU; }
-  /** \returns true if \a V (full or thin) is asked for in this SVD decomposition */
-  inline bool computeV() const { return m_computeFullV || m_computeThinV; }
-
-
-  inline Index rows() const { return m_rows; }
-  inline Index cols() const { return m_cols; }
-
-
-protected:
-  // return true if already allocated
-  bool allocate(Index rows, Index cols, unsigned int computationOptions) ;
-
-  MatrixUType m_matrixU;
-  MatrixVType m_matrixV;
-  SingularValuesType m_singularValues;
-  bool m_isInitialized, m_isAllocated;
-  bool m_computeFullU, m_computeThinU;
-  bool m_computeFullV, m_computeThinV;
-  unsigned int m_computationOptions;
-  Index m_nonzeroSingularValues, m_rows, m_cols, m_diagSize;
-
-
-  /** \brief Default Constructor.
-   *
-   * Default constructor of SVDBase
-   */
-  SVDBase()
-    : m_isInitialized(false),
-      m_isAllocated(false),
-      m_computationOptions(0),
-      m_rows(-1), m_cols(-1)
-  {}
-
-
-};
-
-
-template<typename MatrixType>
-bool SVDBase<MatrixType>::allocate(Index rows, Index cols, unsigned int computationOptions)
-{
-  eigen_assert(rows >= 0 && cols >= 0);
-
-  if (m_isAllocated &&
-      rows == m_rows &&
-      cols == m_cols &&
-      computationOptions == m_computationOptions)
-  {
-    return true;
-  }
-
-  m_rows = rows;
-  m_cols = cols;
-  m_isInitialized = false;
-  m_isAllocated = true;
-  m_computationOptions = computationOptions;
-  m_computeFullU = (computationOptions & ComputeFullU) != 0;
-  m_computeThinU = (computationOptions & ComputeThinU) != 0;
-  m_computeFullV = (computationOptions & ComputeFullV) != 0;
-  m_computeThinV = (computationOptions & ComputeThinV) != 0;
-  eigen_assert(!(m_computeFullU && m_computeThinU) && "SVDBase: you can't ask for both full and thin U");
-  eigen_assert(!(m_computeFullV && m_computeThinV) && "SVDBase: you can't ask for both full and thin V");
-  eigen_assert(EIGEN_IMPLIES(m_computeThinU || m_computeThinV, MatrixType::ColsAtCompileTime==Dynamic) &&
-	       "SVDBase: thin U and V are only available when your matrix has a dynamic number of columns.");
-
-  m_diagSize = (std::min)(m_rows, m_cols);
-  m_singularValues.resize(m_diagSize);
-  if(RowsAtCompileTime==Dynamic)
-    m_matrixU.resize(m_rows, m_computeFullU ? m_rows
-		     : m_computeThinU ? m_diagSize
-		     : 0);
-  if(ColsAtCompileTime==Dynamic)
-    m_matrixV.resize(m_cols, m_computeFullV ? m_cols
-		     : m_computeThinV ? m_diagSize
-		     : 0);
-
-  return false;
-}
-
-}// end namespace
-
-#endif // EIGEN_SVD_H
diff --git a/inst/include/unsupported/Eigen/src/SVD/TODOBdcsvd.txt b/inst/include/unsupported/Eigen/src/SVD/TODOBdcsvd.txt
deleted file mode 100644
index 0bc9a46e..00000000
--- a/inst/include/unsupported/Eigen/src/SVD/TODOBdcsvd.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-TO DO LIST
-
-
-
-(optional optimization) - do all the allocations in the allocate part 
-                        - support static matrices
-                        - return a error at compilation time when using integer matrices (int, long, std::complex<int>, ...)
-
-to finish the algorithm :
-			-implement the last part of the algorithm as described on the reference paper. 
-			    You may find more information on that part on this paper
-
-			-to replace the call to JacobiSVD at the end of the divide algorithm, just after the call to 
-			    deflation.
-
-(suggested step by step resolution)
-                       0) comment the call to Jacobi in the last part of the divide method and everything right after
-                               until the end of the method. What is commented can be a guideline to steps 3) 4) and 6)
-                       1) solve the secular equation (Characteristic equation) on the values that are not null (zi!=0 and di!=0), after the deflation
-                               wich should be uncommented in the divide method
-                       2) remember the values of the singular values that are already computed (zi=0)
-                       3) assign the singular values found in m_computed at the right places (with the ones found in step 2) )
-                               in decreasing order
-                       4) set the firstcol to zero (except the first element) in m_computed
-                       5) compute all the singular vectors when CompV is set to true and only the left vectors when
-                               CompV is set to false
-                       6) multiply naiveU and naiveV to the right by the matrices found, only naiveU when CompV is set to
-                               false, /!\ if CompU is false NaiveU has only 2 rows
-                       7) delete everything commented in step 0)
diff --git a/inst/include/unsupported/Eigen/src/SVD/doneInBDCSVD.txt b/inst/include/unsupported/Eigen/src/SVD/doneInBDCSVD.txt
deleted file mode 100644
index 8563ddab..00000000
--- a/inst/include/unsupported/Eigen/src/SVD/doneInBDCSVD.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-This unsupported package is about a divide and conquer algorithm to compute SVD.
-
-The implementation follows as closely as possible the following reference paper : 
-http://www.cs.yale.edu/publications/techreports/tr933.pdf
-
-The code documentation uses the same names for variables as the reference paper. The code, deflation included, is
-working  but there are a few things that could be optimised as explained in the TODOBdsvd. 
-
-In the code comments were put at the line where would be the third step of the algorithm so one could simply add the call 
-of a function doing the last part of the algorithm and that would not require any knowledge of the part we implemented.
-
-In the TODOBdcsvd we explain what is the main difficulty of the last part and suggest a reference paper to help solve it.
-
-The implemented has trouble with fixed size matrices. 
-
-In the actual implementation, it returns matrices of zero when ask to do a svd on an int matrix. 
-
-
-Paper for the third part:
-http://www.stat.uchicago.edu/~lekheng/courses/302/classics/greengard-rokhlin.pdf
-
diff --git a/inst/include/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h b/inst/include/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
deleted file mode 100644
index a1f54ed3..00000000
--- a/inst/include/unsupported/Eigen/src/Skyline/SkylineInplaceLU.h
+++ /dev/null
@@ -1,352 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008 Guillaume Saupin <guillaume.saupin@cea.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_SKYLINEINPLACELU_H
-#define EIGEN_SKYLINEINPLACELU_H
-
-namespace Eigen { 
-
-/** \ingroup Skyline_Module
- *
- * \class SkylineInplaceLU
- *
- * \brief Inplace LU decomposition of a skyline matrix and associated features
- *
- * \param MatrixType the type of the matrix of which we are computing the LU factorization
- *
- */
-template<typename MatrixType>
-class SkylineInplaceLU {
-protected:
-    typedef typename MatrixType::Scalar Scalar;
-    typedef typename MatrixType::Index Index;
-    
-    typedef typename NumTraits<typename MatrixType::Scalar>::Real RealScalar;
-
-public:
-
-    /** Creates a LU object and compute the respective factorization of \a matrix using
-     * flags \a flags. */
-    SkylineInplaceLU(MatrixType& matrix, int flags = 0)
-    : /*m_matrix(matrix.rows(), matrix.cols()),*/ m_flags(flags), m_status(0), m_lu(matrix) {
-        m_precision = RealScalar(0.1) * Eigen::dummy_precision<RealScalar > ();
-        m_lu.IsRowMajor ? computeRowMajor() : compute();
-    }
-
-    /** Sets the relative threshold value used to prune zero coefficients during the decomposition.
-     *
-     * Setting a value greater than zero speeds up computation, and yields to an imcomplete
-     * factorization with fewer non zero coefficients. Such approximate factors are especially
-     * useful to initialize an iterative solver.
-     *
-     * Note that the exact meaning of this parameter might depends on the actual
-     * backend. Moreover, not all backends support this feature.
-     *
-     * \sa precision() */
-    void setPrecision(RealScalar v) {
-        m_precision = v;
-    }
-
-    /** \returns the current precision.
-     *
-     * \sa setPrecision() */
-    RealScalar precision() const {
-        return m_precision;
-    }
-
-    /** Sets the flags. Possible values are:
-     *  - CompleteFactorization
-     *  - IncompleteFactorization
-     *  - MemoryEfficient
-     *  - one of the ordering methods
-     *  - etc...
-     *
-     * \sa flags() */
-    void setFlags(int f) {
-        m_flags = f;
-    }
-
-    /** \returns the current flags */
-    int flags() const {
-        return m_flags;
-    }
-
-    void setOrderingMethod(int m) {
-        m_flags = m;
-    }
-
-    int orderingMethod() const {
-        return m_flags;
-    }
-
-    /** Computes/re-computes the LU factorization */
-    void compute();
-    void computeRowMajor();
-
-    /** \returns the lower triangular matrix L */
-    //inline const MatrixType& matrixL() const { return m_matrixL; }
-
-    /** \returns the upper triangular matrix U */
-    //inline const MatrixType& matrixU() const { return m_matrixU; }
-
-    template<typename BDerived, typename XDerived>
-    bool solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>* x,
-            const int transposed = 0) const;
-
-    /** \returns true if the factorization succeeded */
-    inline bool succeeded(void) const {
-        return m_succeeded;
-    }
-
-protected:
-    RealScalar m_precision;
-    int m_flags;
-    mutable int m_status;
-    bool m_succeeded;
-    MatrixType& m_lu;
-};
-
-/** Computes / recomputes the in place LU decomposition of the SkylineInplaceLU.
- * using the default algorithm.
- */
-template<typename MatrixType>
-//template<typename _Scalar>
-void SkylineInplaceLU<MatrixType>::compute() {
-    const size_t rows = m_lu.rows();
-    const size_t cols = m_lu.cols();
-
-    eigen_assert(rows == cols && "We do not (yet) support rectangular LU.");
-    eigen_assert(!m_lu.IsRowMajor && "LU decomposition does not work with rowMajor Storage");
-
-    for (Index row = 0; row < rows; row++) {
-        const double pivot = m_lu.coeffDiag(row);
-
-        //Lower matrix Columns update
-        const Index& col = row;
-        for (typename MatrixType::InnerLowerIterator lIt(m_lu, col); lIt; ++lIt) {
-            lIt.valueRef() /= pivot;
-        }
-
-        //Upper matrix update -> contiguous memory access
-        typename MatrixType::InnerLowerIterator lIt(m_lu, col);
-        for (Index rrow = row + 1; rrow < m_lu.rows(); rrow++) {
-            typename MatrixType::InnerUpperIterator uItPivot(m_lu, row);
-            typename MatrixType::InnerUpperIterator uIt(m_lu, rrow);
-            const double coef = lIt.value();
-
-            uItPivot += (rrow - row - 1);
-
-            //update upper part  -> contiguous memory access
-            for (++uItPivot; uIt && uItPivot;) {
-                uIt.valueRef() -= uItPivot.value() * coef;
-
-                ++uIt;
-                ++uItPivot;
-            }
-            ++lIt;
-        }
-
-        //Upper matrix update -> non contiguous memory access
-        typename MatrixType::InnerLowerIterator lIt3(m_lu, col);
-        for (Index rrow = row + 1; rrow < m_lu.rows(); rrow++) {
-            typename MatrixType::InnerUpperIterator uItPivot(m_lu, row);
-            const double coef = lIt3.value();
-
-            //update lower part ->  non contiguous memory access
-            for (Index i = 0; i < rrow - row - 1; i++) {
-                m_lu.coeffRefLower(rrow, row + i + 1) -= uItPivot.value() * coef;
-                ++uItPivot;
-            }
-            ++lIt3;
-        }
-        //update diag -> contiguous
-        typename MatrixType::InnerLowerIterator lIt2(m_lu, col);
-        for (Index rrow = row + 1; rrow < m_lu.rows(); rrow++) {
-
-            typename MatrixType::InnerUpperIterator uItPivot(m_lu, row);
-            typename MatrixType::InnerUpperIterator uIt(m_lu, rrow);
-            const double coef = lIt2.value();
-
-            uItPivot += (rrow - row - 1);
-            m_lu.coeffRefDiag(rrow) -= uItPivot.value() * coef;
-            ++lIt2;
-        }
-    }
-}
-
-template<typename MatrixType>
-void SkylineInplaceLU<MatrixType>::computeRowMajor() {
-    const size_t rows = m_lu.rows();
-    const size_t cols = m_lu.cols();
-
-    eigen_assert(rows == cols && "We do not (yet) support rectangular LU.");
-    eigen_assert(m_lu.IsRowMajor && "You're trying to apply rowMajor decomposition on a ColMajor matrix !");
-
-    for (Index row = 0; row < rows; row++) {
-        typename MatrixType::InnerLowerIterator llIt(m_lu, row);
-
-
-        for (Index col = llIt.col(); col < row; col++) {
-            if (m_lu.coeffExistLower(row, col)) {
-                const double diag = m_lu.coeffDiag(col);
-
-                typename MatrixType::InnerLowerIterator lIt(m_lu, row);
-                typename MatrixType::InnerUpperIterator uIt(m_lu, col);
-
-
-                const Index offset = lIt.col() - uIt.row();
-
-
-                Index stop = offset > 0 ? col - lIt.col() : col - uIt.row();
-
-                //#define VECTORIZE
-#ifdef VECTORIZE
-                Map<VectorXd > rowVal(lIt.valuePtr() + (offset > 0 ? 0 : -offset), stop);
-                Map<VectorXd > colVal(uIt.valuePtr() + (offset > 0 ? offset : 0), stop);
-
-
-                Scalar newCoeff = m_lu.coeffLower(row, col) - rowVal.dot(colVal);
-#else
-                if (offset > 0) //Skip zero value of lIt
-                    uIt += offset;
-                else //Skip zero values of uIt
-                    lIt += -offset;
-                Scalar newCoeff = m_lu.coeffLower(row, col);
-
-                for (Index k = 0; k < stop; ++k) {
-                    const Scalar tmp = newCoeff;
-                    newCoeff = tmp - lIt.value() * uIt.value();
-                    ++lIt;
-                    ++uIt;
-                }
-#endif
-
-                m_lu.coeffRefLower(row, col) = newCoeff / diag;
-            }
-        }
-
-        //Upper matrix update
-        const Index col = row;
-        typename MatrixType::InnerUpperIterator uuIt(m_lu, col);
-        for (Index rrow = uuIt.row(); rrow < col; rrow++) {
-
-            typename MatrixType::InnerLowerIterator lIt(m_lu, rrow);
-            typename MatrixType::InnerUpperIterator uIt(m_lu, col);
-            const Index offset = lIt.col() - uIt.row();
-
-            Index stop = offset > 0 ? rrow - lIt.col() : rrow - uIt.row();
-
-#ifdef VECTORIZE
-            Map<VectorXd > rowVal(lIt.valuePtr() + (offset > 0 ? 0 : -offset), stop);
-            Map<VectorXd > colVal(uIt.valuePtr() + (offset > 0 ? offset : 0), stop);
-
-            Scalar newCoeff = m_lu.coeffUpper(rrow, col) - rowVal.dot(colVal);
-#else
-            if (offset > 0) //Skip zero value of lIt
-                uIt += offset;
-            else //Skip zero values of uIt
-                lIt += -offset;
-            Scalar newCoeff = m_lu.coeffUpper(rrow, col);
-            for (Index k = 0; k < stop; ++k) {
-                const Scalar tmp = newCoeff;
-                newCoeff = tmp - lIt.value() * uIt.value();
-
-                ++lIt;
-                ++uIt;
-            }
-#endif
-            m_lu.coeffRefUpper(rrow, col) = newCoeff;
-        }
-
-
-        //Diag matrix update
-        typename MatrixType::InnerLowerIterator lIt(m_lu, row);
-        typename MatrixType::InnerUpperIterator uIt(m_lu, row);
-
-        const Index offset = lIt.col() - uIt.row();
-
-
-        Index stop = offset > 0 ? lIt.size() : uIt.size();
-#ifdef VECTORIZE
-        Map<VectorXd > rowVal(lIt.valuePtr() + (offset > 0 ? 0 : -offset), stop);
-        Map<VectorXd > colVal(uIt.valuePtr() + (offset > 0 ? offset : 0), stop);
-        Scalar newCoeff = m_lu.coeffDiag(row) - rowVal.dot(colVal);
-#else
-        if (offset > 0) //Skip zero value of lIt
-            uIt += offset;
-        else //Skip zero values of uIt
-            lIt += -offset;
-        Scalar newCoeff = m_lu.coeffDiag(row);
-        for (Index k = 0; k < stop; ++k) {
-            const Scalar tmp = newCoeff;
-            newCoeff = tmp - lIt.value() * uIt.value();
-            ++lIt;
-            ++uIt;
-        }
-#endif
-        m_lu.coeffRefDiag(row) = newCoeff;
-    }
-}
-
-/** Computes *x = U^-1 L^-1 b
- *
- * If \a transpose is set to SvTranspose or SvAdjoint, the solution
- * of the transposed/adjoint system is computed instead.
- *
- * Not all backends implement the solution of the transposed or
- * adjoint system.
- */
-template<typename MatrixType>
-template<typename BDerived, typename XDerived>
-bool SkylineInplaceLU<MatrixType>::solve(const MatrixBase<BDerived> &b, MatrixBase<XDerived>* x, const int transposed) const {
-    const size_t rows = m_lu.rows();
-    const size_t cols = m_lu.cols();
-
-
-    for (Index row = 0; row < rows; row++) {
-        x->coeffRef(row) = b.coeff(row);
-        Scalar newVal = x->coeff(row);
-        typename MatrixType::InnerLowerIterator lIt(m_lu, row);
-
-        Index col = lIt.col();
-        while (lIt.col() < row) {
-
-            newVal -= x->coeff(col++) * lIt.value();
-            ++lIt;
-        }
-
-        x->coeffRef(row) = newVal;
-    }
-
-
-    for (Index col = rows - 1; col > 0; col--) {
-        x->coeffRef(col) = x->coeff(col) / m_lu.coeffDiag(col);
-
-        const Scalar x_col = x->coeff(col);
-
-        typename MatrixType::InnerUpperIterator uIt(m_lu, col);
-        uIt += uIt.size()-1;
-
-
-        while (uIt) {
-            x->coeffRef(uIt.row()) -= x_col * uIt.value();
-            //TODO : introduce --operator
-            uIt += -1;
-        }
-
-
-    }
-    x->coeffRef(0) = x->coeff(0) / m_lu.coeffDiag(0);
-
-    return true;
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_SKYLINELU_H
diff --git a/inst/include/unsupported/Eigen/src/Skyline/SkylineMatrix.h b/inst/include/unsupported/Eigen/src/Skyline/SkylineMatrix.h
deleted file mode 100644
index a2a8933c..00000000
--- a/inst/include/unsupported/Eigen/src/Skyline/SkylineMatrix.h
+++ /dev/null
@@ -1,862 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Guillaume Saupin <guillaume.saupin@cea.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_SKYLINEMATRIX_H
-#define EIGEN_SKYLINEMATRIX_H
-
-#include "SkylineStorage.h"
-#include "SkylineMatrixBase.h"
-
-namespace Eigen { 
-
-/** \ingroup Skyline_Module
- *
- * \class SkylineMatrix
- *
- * \brief The main skyline matrix class
- *
- * This class implements a skyline matrix using the very uncommon storage
- * scheme.
- *
- * \param _Scalar the scalar type, i.e. the type of the coefficients
- * \param _Options Union of bit flags controlling the storage scheme. Currently the only possibility
- *                 is RowMajor. The default is 0 which means column-major.
- *
- *
- */
-namespace internal {
-template<typename _Scalar, int _Options>
-struct traits<SkylineMatrix<_Scalar, _Options> > {
-    typedef _Scalar Scalar;
-    typedef Sparse StorageKind;
-
-    enum {
-        RowsAtCompileTime = Dynamic,
-        ColsAtCompileTime = Dynamic,
-        MaxRowsAtCompileTime = Dynamic,
-        MaxColsAtCompileTime = Dynamic,
-        Flags = SkylineBit | _Options,
-        CoeffReadCost = NumTraits<Scalar>::ReadCost,
-    };
-};
-}
-
-template<typename _Scalar, int _Options>
-class SkylineMatrix
-: public SkylineMatrixBase<SkylineMatrix<_Scalar, _Options> > {
-public:
-    EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(SkylineMatrix)
-    EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(SkylineMatrix, +=)
-    EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(SkylineMatrix, -=)
-
-    using Base::IsRowMajor;
-
-protected:
-
-    typedef SkylineMatrix<Scalar, (Flags&~RowMajorBit) | (IsRowMajor ? RowMajorBit : 0) > TransposedSkylineMatrix;
-
-    Index m_outerSize;
-    Index m_innerSize;
-
-public:
-    Index* m_colStartIndex;
-    Index* m_rowStartIndex;
-    SkylineStorage<Scalar> m_data;
-
-public:
-
-    inline Index rows() const {
-        return IsRowMajor ? m_outerSize : m_innerSize;
-    }
-
-    inline Index cols() const {
-        return IsRowMajor ? m_innerSize : m_outerSize;
-    }
-
-    inline Index innerSize() const {
-        return m_innerSize;
-    }
-
-    inline Index outerSize() const {
-        return m_outerSize;
-    }
-
-    inline Index upperNonZeros() const {
-        return m_data.upperSize();
-    }
-
-    inline Index lowerNonZeros() const {
-        return m_data.lowerSize();
-    }
-
-    inline Index upperNonZeros(Index j) const {
-        return m_colStartIndex[j + 1] - m_colStartIndex[j];
-    }
-
-    inline Index lowerNonZeros(Index j) const {
-        return m_rowStartIndex[j + 1] - m_rowStartIndex[j];
-    }
-
-    inline const Scalar* _diagPtr() const {
-        return &m_data.diag(0);
-    }
-
-    inline Scalar* _diagPtr() {
-        return &m_data.diag(0);
-    }
-
-    inline const Scalar* _upperPtr() const {
-        return &m_data.upper(0);
-    }
-
-    inline Scalar* _upperPtr() {
-        return &m_data.upper(0);
-    }
-
-    inline const Scalar* _lowerPtr() const {
-        return &m_data.lower(0);
-    }
-
-    inline Scalar* _lowerPtr() {
-        return &m_data.lower(0);
-    }
-
-    inline const Index* _upperProfilePtr() const {
-        return &m_data.upperProfile(0);
-    }
-
-    inline Index* _upperProfilePtr() {
-        return &m_data.upperProfile(0);
-    }
-
-    inline const Index* _lowerProfilePtr() const {
-        return &m_data.lowerProfile(0);
-    }
-
-    inline Index* _lowerProfilePtr() {
-        return &m_data.lowerProfile(0);
-    }
-
-    inline Scalar coeff(Index row, Index col) const {
-        const Index outer = IsRowMajor ? row : col;
-        const Index inner = IsRowMajor ? col : row;
-
-        eigen_assert(outer < outerSize());
-        eigen_assert(inner < innerSize());
-
-        if (outer == inner)
-            return this->m_data.diag(outer);
-
-        if (IsRowMajor) {
-            if (inner > outer) //upper matrix
-            {
-                const Index minOuterIndex = inner - m_data.upperProfile(inner);
-                if (outer >= minOuterIndex)
-                    return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
-                else
-                    return Scalar(0);
-            }
-            if (inner < outer) //lower matrix
-            {
-                const Index minInnerIndex = outer - m_data.lowerProfile(outer);
-                if (inner >= minInnerIndex)
-                    return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
-                else
-                    return Scalar(0);
-            }
-            return m_data.upper(m_colStartIndex[inner] + outer - inner);
-        } else {
-            if (outer > inner) //upper matrix
-            {
-                const Index maxOuterIndex = inner + m_data.upperProfile(inner);
-                if (outer <= maxOuterIndex)
-                    return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
-                else
-                    return Scalar(0);
-            }
-            if (outer < inner) //lower matrix
-            {
-                const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
-
-                if (inner <= maxInnerIndex)
-                    return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
-                else
-                    return Scalar(0);
-            }
-        }
-    }
-
-    inline Scalar& coeffRef(Index row, Index col) {
-        const Index outer = IsRowMajor ? row : col;
-        const Index inner = IsRowMajor ? col : row;
-
-        eigen_assert(outer < outerSize());
-        eigen_assert(inner < innerSize());
-
-        if (outer == inner)
-            return this->m_data.diag(outer);
-
-        if (IsRowMajor) {
-            if (col > row) //upper matrix
-            {
-                const Index minOuterIndex = inner - m_data.upperProfile(inner);
-                eigen_assert(outer >= minOuterIndex && "you try to acces a coeff that do not exist in the storage");
-                return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
-            }
-            if (col < row) //lower matrix
-            {
-                const Index minInnerIndex = outer - m_data.lowerProfile(outer);
-                eigen_assert(inner >= minInnerIndex && "you try to acces a coeff that do not exist in the storage");
-                return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
-            }
-        } else {
-            if (outer > inner) //upper matrix
-            {
-                const Index maxOuterIndex = inner + m_data.upperProfile(inner);
-                eigen_assert(outer <= maxOuterIndex && "you try to acces a coeff that do not exist in the storage");
-                return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
-            }
-            if (outer < inner) //lower matrix
-            {
-                const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
-                eigen_assert(inner <= maxInnerIndex && "you try to acces a coeff that do not exist in the storage");
-                return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
-            }
-        }
-    }
-
-    inline Scalar coeffDiag(Index idx) const {
-        eigen_assert(idx < outerSize());
-        eigen_assert(idx < innerSize());
-        return this->m_data.diag(idx);
-    }
-
-    inline Scalar coeffLower(Index row, Index col) const {
-        const Index outer = IsRowMajor ? row : col;
-        const Index inner = IsRowMajor ? col : row;
-
-        eigen_assert(outer < outerSize());
-        eigen_assert(inner < innerSize());
-        eigen_assert(inner != outer);
-
-        if (IsRowMajor) {
-            const Index minInnerIndex = outer - m_data.lowerProfile(outer);
-            if (inner >= minInnerIndex)
-                return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
-            else
-                return Scalar(0);
-
-        } else {
-            const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
-            if (inner <= maxInnerIndex)
-                return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
-            else
-                return Scalar(0);
-        }
-    }
-
-    inline Scalar coeffUpper(Index row, Index col) const {
-        const Index outer = IsRowMajor ? row : col;
-        const Index inner = IsRowMajor ? col : row;
-
-        eigen_assert(outer < outerSize());
-        eigen_assert(inner < innerSize());
-        eigen_assert(inner != outer);
-
-        if (IsRowMajor) {
-            const Index minOuterIndex = inner - m_data.upperProfile(inner);
-            if (outer >= minOuterIndex)
-                return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
-            else
-                return Scalar(0);
-        } else {
-            const Index maxOuterIndex = inner + m_data.upperProfile(inner);
-            if (outer <= maxOuterIndex)
-                return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
-            else
-                return Scalar(0);
-        }
-    }
-
-    inline Scalar& coeffRefDiag(Index idx) {
-        eigen_assert(idx < outerSize());
-        eigen_assert(idx < innerSize());
-        return this->m_data.diag(idx);
-    }
-
-    inline Scalar& coeffRefLower(Index row, Index col) {
-        const Index outer = IsRowMajor ? row : col;
-        const Index inner = IsRowMajor ? col : row;
-
-        eigen_assert(outer < outerSize());
-        eigen_assert(inner < innerSize());
-        eigen_assert(inner != outer);
-
-        if (IsRowMajor) {
-            const Index minInnerIndex = outer - m_data.lowerProfile(outer);
-            eigen_assert(inner >= minInnerIndex && "you try to acces a coeff that do not exist in the storage");
-            return this->m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
-        } else {
-            const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
-            eigen_assert(inner <= maxInnerIndex && "you try to acces a coeff that do not exist in the storage");
-            return this->m_data.lower(m_rowStartIndex[outer] + (inner - outer));
-        }
-    }
-
-    inline bool coeffExistLower(Index row, Index col) {
-        const Index outer = IsRowMajor ? row : col;
-        const Index inner = IsRowMajor ? col : row;
-
-        eigen_assert(outer < outerSize());
-        eigen_assert(inner < innerSize());
-        eigen_assert(inner != outer);
-
-        if (IsRowMajor) {
-            const Index minInnerIndex = outer - m_data.lowerProfile(outer);
-            return inner >= minInnerIndex;
-        } else {
-            const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
-            return inner <= maxInnerIndex;
-        }
-    }
-
-    inline Scalar& coeffRefUpper(Index row, Index col) {
-        const Index outer = IsRowMajor ? row : col;
-        const Index inner = IsRowMajor ? col : row;
-
-        eigen_assert(outer < outerSize());
-        eigen_assert(inner < innerSize());
-        eigen_assert(inner != outer);
-
-        if (IsRowMajor) {
-            const Index minOuterIndex = inner - m_data.upperProfile(inner);
-            eigen_assert(outer >= minOuterIndex && "you try to acces a coeff that do not exist in the storage");
-            return this->m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
-        } else {
-            const Index maxOuterIndex = inner + m_data.upperProfile(inner);
-            eigen_assert(outer <= maxOuterIndex && "you try to acces a coeff that do not exist in the storage");
-            return this->m_data.upper(m_colStartIndex[inner] + (outer - inner));
-        }
-    }
-
-    inline bool coeffExistUpper(Index row, Index col) {
-        const Index outer = IsRowMajor ? row : col;
-        const Index inner = IsRowMajor ? col : row;
-
-        eigen_assert(outer < outerSize());
-        eigen_assert(inner < innerSize());
-        eigen_assert(inner != outer);
-
-        if (IsRowMajor) {
-            const Index minOuterIndex = inner - m_data.upperProfile(inner);
-            return outer >= minOuterIndex;
-        } else {
-            const Index maxOuterIndex = inner + m_data.upperProfile(inner);
-            return outer <= maxOuterIndex;
-        }
-    }
-
-
-protected:
-
-public:
-    class InnerUpperIterator;
-    class InnerLowerIterator;
-
-    class OuterUpperIterator;
-    class OuterLowerIterator;
-
-    /** Removes all non zeros */
-    inline void setZero() {
-        m_data.clear();
-        memset(m_colStartIndex, 0, (m_outerSize + 1) * sizeof (Index));
-        memset(m_rowStartIndex, 0, (m_outerSize + 1) * sizeof (Index));
-    }
-
-    /** \returns the number of non zero coefficients */
-    inline Index nonZeros() const {
-        return m_data.diagSize() + m_data.upperSize() + m_data.lowerSize();
-    }
-
-    /** Preallocates \a reserveSize non zeros */
-    inline void reserve(Index reserveSize, Index reserveUpperSize, Index reserveLowerSize) {
-        m_data.reserve(reserveSize, reserveUpperSize, reserveLowerSize);
-    }
-
-    /** \returns a reference to a novel non zero coefficient with coordinates \a row x \a col.
-
-     *
-     * \warning This function can be extremely slow if the non zero coefficients
-     * are not inserted in a coherent order.
-     *
-     * After an insertion session, you should call the finalize() function.
-     */
-    EIGEN_DONT_INLINE Scalar & insert(Index row, Index col) {
-        const Index outer = IsRowMajor ? row : col;
-        const Index inner = IsRowMajor ? col : row;
-
-        eigen_assert(outer < outerSize());
-        eigen_assert(inner < innerSize());
-
-        if (outer == inner)
-            return m_data.diag(col);
-
-        if (IsRowMajor) {
-            if (outer < inner) //upper matrix
-            {
-                Index minOuterIndex = 0;
-                minOuterIndex = inner - m_data.upperProfile(inner);
-
-                if (outer < minOuterIndex) //The value does not yet exist
-                {
-                    const Index previousProfile = m_data.upperProfile(inner);
-
-                    m_data.upperProfile(inner) = inner - outer;
-
-
-                    const Index bandIncrement = m_data.upperProfile(inner) - previousProfile;
-                    //shift data stored after this new one
-                    const Index stop = m_colStartIndex[cols()];
-                    const Index start = m_colStartIndex[inner];
-
-
-                    for (Index innerIdx = stop; innerIdx >= start; innerIdx--) {
-                        m_data.upper(innerIdx + bandIncrement) = m_data.upper(innerIdx);
-                    }
-
-                    for (Index innerIdx = cols(); innerIdx > inner; innerIdx--) {
-                        m_colStartIndex[innerIdx] += bandIncrement;
-                    }
-
-                    //zeros new data
-                    memset(this->_upperPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar));
-
-                    return m_data.upper(m_colStartIndex[inner]);
-                } else {
-                    return m_data.upper(m_colStartIndex[inner] + outer - (inner - m_data.upperProfile(inner)));
-                }
-            }
-
-            if (outer > inner) //lower matrix
-            {
-                const Index minInnerIndex = outer - m_data.lowerProfile(outer);
-                if (inner < minInnerIndex) //The value does not yet exist
-                {
-                    const Index previousProfile = m_data.lowerProfile(outer);
-                    m_data.lowerProfile(outer) = outer - inner;
-
-                    const Index bandIncrement = m_data.lowerProfile(outer) - previousProfile;
-                    //shift data stored after this new one
-                    const Index stop = m_rowStartIndex[rows()];
-                    const Index start = m_rowStartIndex[outer];
-
-
-                    for (Index innerIdx = stop; innerIdx >= start; innerIdx--) {
-                        m_data.lower(innerIdx + bandIncrement) = m_data.lower(innerIdx);
-                    }
-
-                    for (Index innerIdx = rows(); innerIdx > outer; innerIdx--) {
-                        m_rowStartIndex[innerIdx] += bandIncrement;
-                    }
-
-                    //zeros new data
-                    memset(this->_lowerPtr() + start, 0, (bandIncrement - 1) * sizeof (Scalar));
-                    return m_data.lower(m_rowStartIndex[outer]);
-                } else {
-                    return m_data.lower(m_rowStartIndex[outer] + inner - (outer - m_data.lowerProfile(outer)));
-                }
-            }
-        } else {
-            if (outer > inner) //upper matrix
-            {
-                const Index maxOuterIndex = inner + m_data.upperProfile(inner);
-                if (outer > maxOuterIndex) //The value does not yet exist
-                {
-                    const Index previousProfile = m_data.upperProfile(inner);
-                    m_data.upperProfile(inner) = outer - inner;
-
-                    const Index bandIncrement = m_data.upperProfile(inner) - previousProfile;
-                    //shift data stored after this new one
-                    const Index stop = m_rowStartIndex[rows()];
-                    const Index start = m_rowStartIndex[inner + 1];
-
-                    for (Index innerIdx = stop; innerIdx >= start; innerIdx--) {
-                        m_data.upper(innerIdx + bandIncrement) = m_data.upper(innerIdx);
-                    }
-
-                    for (Index innerIdx = inner + 1; innerIdx < outerSize() + 1; innerIdx++) {
-                        m_rowStartIndex[innerIdx] += bandIncrement;
-                    }
-                    memset(this->_upperPtr() + m_rowStartIndex[inner] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar));
-                    return m_data.upper(m_rowStartIndex[inner] + m_data.upperProfile(inner));
-                } else {
-                    return m_data.upper(m_rowStartIndex[inner] + (outer - inner));
-                }
-            }
-
-            if (outer < inner) //lower matrix
-            {
-                const Index maxInnerIndex = outer + m_data.lowerProfile(outer);
-                if (inner > maxInnerIndex) //The value does not yet exist
-                {
-                    const Index previousProfile = m_data.lowerProfile(outer);
-                    m_data.lowerProfile(outer) = inner - outer;
-
-                    const Index bandIncrement = m_data.lowerProfile(outer) - previousProfile;
-                    //shift data stored after this new one
-                    const Index stop = m_colStartIndex[cols()];
-                    const Index start = m_colStartIndex[outer + 1];
-
-                    for (Index innerIdx = stop; innerIdx >= start; innerIdx--) {
-                        m_data.lower(innerIdx + bandIncrement) = m_data.lower(innerIdx);
-                    }
-
-                    for (Index innerIdx = outer + 1; innerIdx < outerSize() + 1; innerIdx++) {
-                        m_colStartIndex[innerIdx] += bandIncrement;
-                    }
-                    memset(this->_lowerPtr() + m_colStartIndex[outer] + previousProfile + 1, 0, (bandIncrement - 1) * sizeof (Scalar));
-                    return m_data.lower(m_colStartIndex[outer] + m_data.lowerProfile(outer));
-                } else {
-                    return m_data.lower(m_colStartIndex[outer] + (inner - outer));
-                }
-            }
-        }
-    }
-
-    /** Must be called after inserting a set of non zero entries.
-     */
-    inline void finalize() {
-        if (IsRowMajor) {
-            if (rows() > cols())
-                m_data.resize(cols(), cols(), rows(), m_colStartIndex[cols()] + 1, m_rowStartIndex[rows()] + 1);
-            else
-                m_data.resize(rows(), cols(), rows(), m_colStartIndex[cols()] + 1, m_rowStartIndex[rows()] + 1);
-
-            //            eigen_assert(rows() == cols() && "memory reorganisatrion only works with suare matrix");
-            //
-            //            Scalar* newArray = new Scalar[m_colStartIndex[cols()] + 1 + m_rowStartIndex[rows()] + 1];
-            //            Index dataIdx = 0;
-            //            for (Index row = 0; row < rows(); row++) {
-            //
-            //                const Index nbLowerElts = m_rowStartIndex[row + 1] - m_rowStartIndex[row];
-            //                //                std::cout << "nbLowerElts" << nbLowerElts << std::endl;
-            //                memcpy(newArray + dataIdx, m_data.m_lower + m_rowStartIndex[row], nbLowerElts * sizeof (Scalar));
-            //                m_rowStartIndex[row] = dataIdx;
-            //                dataIdx += nbLowerElts;
-            //
-            //                const Index nbUpperElts = m_colStartIndex[row + 1] - m_colStartIndex[row];
-            //                memcpy(newArray + dataIdx, m_data.m_upper + m_colStartIndex[row], nbUpperElts * sizeof (Scalar));
-            //                m_colStartIndex[row] = dataIdx;
-            //                dataIdx += nbUpperElts;
-            //
-            //
-            //            }
-            //            //todo : don't access m_data profile directly : add an accessor from SkylineMatrix
-            //            m_rowStartIndex[rows()] = m_rowStartIndex[rows()-1] + m_data.lowerProfile(rows()-1);
-            //            m_colStartIndex[cols()] = m_colStartIndex[cols()-1] + m_data.upperProfile(cols()-1);
-            //
-            //            delete[] m_data.m_lower;
-            //            delete[] m_data.m_upper;
-            //
-            //            m_data.m_lower = newArray;
-            //            m_data.m_upper = newArray;
-        } else {
-            if (rows() > cols())
-                m_data.resize(cols(), rows(), cols(), m_rowStartIndex[cols()] + 1, m_colStartIndex[cols()] + 1);
-            else
-                m_data.resize(rows(), rows(), cols(), m_rowStartIndex[rows()] + 1, m_colStartIndex[rows()] + 1);
-        }
-    }
-
-    inline void squeeze() {
-        finalize();
-        m_data.squeeze();
-    }
-
-    void prune(Scalar reference, RealScalar epsilon = dummy_precision<RealScalar > ()) {
-        //TODO
-    }
-
-    /** Resizes the matrix to a \a rows x \a cols matrix and initializes it to zero
-     * \sa resizeNonZeros(Index), reserve(), setZero()
-     */
-    void resize(size_t rows, size_t cols) {
-        const Index diagSize = rows > cols ? cols : rows;
-        m_innerSize = IsRowMajor ? cols : rows;
-
-        eigen_assert(rows == cols && "Skyline matrix must be square matrix");
-
-        if (diagSize % 2) { // diagSize is odd
-            const Index k = (diagSize - 1) / 2;
-
-            m_data.resize(diagSize, IsRowMajor ? cols : rows, IsRowMajor ? rows : cols,
-                    2 * k * k + k + 1,
-                    2 * k * k + k + 1);
-
-        } else // diagSize is even
-        {
-            const Index k = diagSize / 2;
-            m_data.resize(diagSize, IsRowMajor ? cols : rows, IsRowMajor ? rows : cols,
-                    2 * k * k - k + 1,
-                    2 * k * k - k + 1);
-        }
-
-        if (m_colStartIndex && m_rowStartIndex) {
-            delete[] m_colStartIndex;
-            delete[] m_rowStartIndex;
-        }
-        m_colStartIndex = new Index [cols + 1];
-        m_rowStartIndex = new Index [rows + 1];
-        m_outerSize = diagSize;
-
-        m_data.reset();
-        m_data.clear();
-
-        m_outerSize = diagSize;
-        memset(m_colStartIndex, 0, (cols + 1) * sizeof (Index));
-        memset(m_rowStartIndex, 0, (rows + 1) * sizeof (Index));
-    }
-
-    void resizeNonZeros(Index size) {
-        m_data.resize(size);
-    }
-
-    inline SkylineMatrix()
-    : m_outerSize(-1), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) {
-        resize(0, 0);
-    }
-
-    inline SkylineMatrix(size_t rows, size_t cols)
-    : m_outerSize(0), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) {
-        resize(rows, cols);
-    }
-
-    template<typename OtherDerived>
-    inline SkylineMatrix(const SkylineMatrixBase<OtherDerived>& other)
-    : m_outerSize(0), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) {
-        *this = other.derived();
-    }
-
-    inline SkylineMatrix(const SkylineMatrix & other)
-    : Base(), m_outerSize(0), m_innerSize(0), m_colStartIndex(0), m_rowStartIndex(0) {
-        *this = other.derived();
-    }
-
-    inline void swap(SkylineMatrix & other) {
-        //EIGEN_DBG_SKYLINE(std::cout << "SkylineMatrix:: swap\n");
-        std::swap(m_colStartIndex, other.m_colStartIndex);
-        std::swap(m_rowStartIndex, other.m_rowStartIndex);
-        std::swap(m_innerSize, other.m_innerSize);
-        std::swap(m_outerSize, other.m_outerSize);
-        m_data.swap(other.m_data);
-    }
-
-    inline SkylineMatrix & operator=(const SkylineMatrix & other) {
-        std::cout << "SkylineMatrix& operator=(const SkylineMatrix& other)\n";
-        if (other.isRValue()) {
-            swap(other.const_cast_derived());
-        } else {
-            resize(other.rows(), other.cols());
-            memcpy(m_colStartIndex, other.m_colStartIndex, (m_outerSize + 1) * sizeof (Index));
-            memcpy(m_rowStartIndex, other.m_rowStartIndex, (m_outerSize + 1) * sizeof (Index));
-            m_data = other.m_data;
-        }
-        return *this;
-    }
-
-    template<typename OtherDerived>
-            inline SkylineMatrix & operator=(const SkylineMatrixBase<OtherDerived>& other) {
-        const bool needToTranspose = (Flags & RowMajorBit) != (OtherDerived::Flags & RowMajorBit);
-        if (needToTranspose) {
-            //         TODO
-            //            return *this;
-        } else {
-            // there is no special optimization
-            return SkylineMatrixBase<SkylineMatrix>::operator=(other.derived());
-        }
-    }
-
-    friend std::ostream & operator <<(std::ostream & s, const SkylineMatrix & m) {
-
-        EIGEN_DBG_SKYLINE(
-        std::cout << "upper elements : " << std::endl;
-        for (Index i = 0; i < m.m_data.upperSize(); i++)
-            std::cout << m.m_data.upper(i) << "\t";
-        std::cout << std::endl;
-        std::cout << "upper profile : " << std::endl;
-        for (Index i = 0; i < m.m_data.upperProfileSize(); i++)
-            std::cout << m.m_data.upperProfile(i) << "\t";
-        std::cout << std::endl;
-        std::cout << "lower startIdx : " << std::endl;
-        for (Index i = 0; i < m.m_data.upperProfileSize(); i++)
-            std::cout << (IsRowMajor ? m.m_colStartIndex[i] : m.m_rowStartIndex[i]) << "\t";
-        std::cout << std::endl;
-
-
-        std::cout << "lower elements : " << std::endl;
-        for (Index i = 0; i < m.m_data.lowerSize(); i++)
-            std::cout << m.m_data.lower(i) << "\t";
-        std::cout << std::endl;
-        std::cout << "lower profile : " << std::endl;
-        for (Index i = 0; i < m.m_data.lowerProfileSize(); i++)
-            std::cout << m.m_data.lowerProfile(i) << "\t";
-        std::cout << std::endl;
-        std::cout << "lower startIdx : " << std::endl;
-        for (Index i = 0; i < m.m_data.lowerProfileSize(); i++)
-            std::cout << (IsRowMajor ? m.m_rowStartIndex[i] : m.m_colStartIndex[i]) << "\t";
-        std::cout << std::endl;
-        );
-        for (Index rowIdx = 0; rowIdx < m.rows(); rowIdx++) {
-            for (Index colIdx = 0; colIdx < m.cols(); colIdx++) {
-                s << m.coeff(rowIdx, colIdx) << "\t";
-            }
-            s << std::endl;
-        }
-        return s;
-    }
-
-    /** Destructor */
-    inline ~SkylineMatrix() {
-        delete[] m_colStartIndex;
-        delete[] m_rowStartIndex;
-    }
-
-    /** Overloaded for performance */
-    Scalar sum() const;
-};
-
-template<typename Scalar, int _Options>
-class SkylineMatrix<Scalar, _Options>::InnerUpperIterator {
-public:
-
-    InnerUpperIterator(const SkylineMatrix& mat, Index outer)
-    : m_matrix(mat), m_outer(outer),
-    m_id(_Options == RowMajor ? mat.m_colStartIndex[outer] : mat.m_rowStartIndex[outer] + 1),
-    m_start(m_id),
-    m_end(_Options == RowMajor ? mat.m_colStartIndex[outer + 1] : mat.m_rowStartIndex[outer + 1] + 1) {
-    }
-
-    inline InnerUpperIterator & operator++() {
-        m_id++;
-        return *this;
-    }
-
-    inline InnerUpperIterator & operator+=(Index shift) {
-        m_id += shift;
-        return *this;
-    }
-
-    inline Scalar value() const {
-        return m_matrix.m_data.upper(m_id);
-    }
-
-    inline Scalar* valuePtr() {
-        return const_cast<Scalar*> (&(m_matrix.m_data.upper(m_id)));
-    }
-
-    inline Scalar& valueRef() {
-        return const_cast<Scalar&> (m_matrix.m_data.upper(m_id));
-    }
-
-    inline Index index() const {
-        return IsRowMajor ? m_outer - m_matrix.m_data.upperProfile(m_outer) + (m_id - m_start) :
-                m_outer + (m_id - m_start) + 1;
-    }
-
-    inline Index row() const {
-        return IsRowMajor ? index() : m_outer;
-    }
-
-    inline Index col() const {
-        return IsRowMajor ? m_outer : index();
-    }
-
-    inline size_t size() const {
-        return m_matrix.m_data.upperProfile(m_outer);
-    }
-
-    inline operator bool() const {
-        return (m_id < m_end) && (m_id >= m_start);
-    }
-
-protected:
-    const SkylineMatrix& m_matrix;
-    const Index m_outer;
-    Index m_id;
-    const Index m_start;
-    const Index m_end;
-};
-
-template<typename Scalar, int _Options>
-class SkylineMatrix<Scalar, _Options>::InnerLowerIterator {
-public:
-
-    InnerLowerIterator(const SkylineMatrix& mat, Index outer)
-    : m_matrix(mat),
-    m_outer(outer),
-    m_id(_Options == RowMajor ? mat.m_rowStartIndex[outer] : mat.m_colStartIndex[outer] + 1),
-    m_start(m_id),
-    m_end(_Options == RowMajor ? mat.m_rowStartIndex[outer + 1] : mat.m_colStartIndex[outer + 1] + 1) {
-    }
-
-    inline InnerLowerIterator & operator++() {
-        m_id++;
-        return *this;
-    }
-
-    inline InnerLowerIterator & operator+=(Index shift) {
-        m_id += shift;
-        return *this;
-    }
-
-    inline Scalar value() const {
-        return m_matrix.m_data.lower(m_id);
-    }
-
-    inline Scalar* valuePtr() {
-        return const_cast<Scalar*> (&(m_matrix.m_data.lower(m_id)));
-    }
-
-    inline Scalar& valueRef() {
-        return const_cast<Scalar&> (m_matrix.m_data.lower(m_id));
-    }
-
-    inline Index index() const {
-        return IsRowMajor ? m_outer - m_matrix.m_data.lowerProfile(m_outer) + (m_id - m_start) :
-                m_outer + (m_id - m_start) + 1;
-        ;
-    }
-
-    inline Index row() const {
-        return IsRowMajor ? m_outer : index();
-    }
-
-    inline Index col() const {
-        return IsRowMajor ? index() : m_outer;
-    }
-
-    inline size_t size() const {
-        return m_matrix.m_data.lowerProfile(m_outer);
-    }
-
-    inline operator bool() const {
-        return (m_id < m_end) && (m_id >= m_start);
-    }
-
-protected:
-    const SkylineMatrix& m_matrix;
-    const Index m_outer;
-    Index m_id;
-    const Index m_start;
-    const Index m_end;
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_SkylineMatrix_H
diff --git a/inst/include/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h b/inst/include/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h
deleted file mode 100644
index b3a23723..00000000
--- a/inst/include/unsupported/Eigen/src/Skyline/SkylineMatrixBase.h
+++ /dev/null
@@ -1,212 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Guillaume Saupin <guillaume.saupin@cea.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_SKYLINEMATRIXBASE_H
-#define EIGEN_SKYLINEMATRIXBASE_H
-
-#include "SkylineUtil.h"
-
-namespace Eigen { 
-
-/** \ingroup Skyline_Module
- *
- * \class SkylineMatrixBase
- *
- * \brief Base class of any skyline matrices or skyline expressions
- *
- * \param Derived
- *
- */
-template<typename Derived> class SkylineMatrixBase : public EigenBase<Derived> {
-public:
-
-    typedef typename internal::traits<Derived>::Scalar Scalar;
-    typedef typename internal::traits<Derived>::StorageKind StorageKind;
-    typedef typename internal::index<StorageKind>::type Index;
-
-    enum {
-        RowsAtCompileTime = internal::traits<Derived>::RowsAtCompileTime,
-        /**< The number of rows at compile-time. This is just a copy of the value provided
-         * by the \a Derived type. If a value is not known at compile-time,
-         * it is set to the \a Dynamic constant.
-         * \sa MatrixBase::rows(), MatrixBase::cols(), ColsAtCompileTime, SizeAtCompileTime */
-
-        ColsAtCompileTime = internal::traits<Derived>::ColsAtCompileTime,
-        /**< The number of columns at compile-time. This is just a copy of the value provided
-         * by the \a Derived type. If a value is not known at compile-time,
-         * it is set to the \a Dynamic constant.
-         * \sa MatrixBase::rows(), MatrixBase::cols(), RowsAtCompileTime, SizeAtCompileTime */
-
-
-        SizeAtCompileTime = (internal::size_at_compile_time<internal::traits<Derived>::RowsAtCompileTime,
-        internal::traits<Derived>::ColsAtCompileTime>::ret),
-        /**< This is equal to the number of coefficients, i.e. the number of
-         * rows times the number of columns, or to \a Dynamic if this is not
-         * known at compile-time. \sa RowsAtCompileTime, ColsAtCompileTime */
-
-        MaxRowsAtCompileTime = RowsAtCompileTime,
-        MaxColsAtCompileTime = ColsAtCompileTime,
-
-        MaxSizeAtCompileTime = (internal::size_at_compile_time<MaxRowsAtCompileTime,
-        MaxColsAtCompileTime>::ret),
-
-        IsVectorAtCompileTime = RowsAtCompileTime == 1 || ColsAtCompileTime == 1,
-        /**< This is set to true if either the number of rows or the number of
-         * columns is known at compile-time to be equal to 1. Indeed, in that case,
-         * we are dealing with a column-vector (if there is only one column) or with
-         * a row-vector (if there is only one row). */
-
-        Flags = internal::traits<Derived>::Flags,
-        /**< This stores expression \ref flags flags which may or may not be inherited by new expressions
-         * constructed from this one. See the \ref flags "list of flags".
-         */
-
-        CoeffReadCost = internal::traits<Derived>::CoeffReadCost,
-        /**< This is a rough measure of how expensive it is to read one coefficient from
-         * this expression.
-         */
-
-        IsRowMajor = Flags & RowMajorBit ? 1 : 0
-    };
-
-#ifndef EIGEN_PARSED_BY_DOXYGEN
-    /** This is the "real scalar" type; if the \a Scalar type is already real numbers
-     * (e.g. int, float or double) then \a RealScalar is just the same as \a Scalar. If
-     * \a Scalar is \a std::complex<T> then RealScalar is \a T.
-     *
-     * \sa class NumTraits
-     */
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-
-    /** type of the equivalent square matrix */
-    typedef Matrix<Scalar, EIGEN_SIZE_MAX(RowsAtCompileTime, ColsAtCompileTime),
-                           EIGEN_SIZE_MAX(RowsAtCompileTime, ColsAtCompileTime) > SquareMatrixType;
-
-    inline const Derived& derived() const {
-        return *static_cast<const Derived*> (this);
-    }
-
-    inline Derived& derived() {
-        return *static_cast<Derived*> (this);
-    }
-
-    inline Derived& const_cast_derived() const {
-        return *static_cast<Derived*> (const_cast<SkylineMatrixBase*> (this));
-    }
-#endif // not EIGEN_PARSED_BY_DOXYGEN
-
-    /** \returns the number of rows. \sa cols(), RowsAtCompileTime */
-    inline Index rows() const {
-        return derived().rows();
-    }
-
-    /** \returns the number of columns. \sa rows(), ColsAtCompileTime*/
-    inline Index cols() const {
-        return derived().cols();
-    }
-
-    /** \returns the number of coefficients, which is \a rows()*cols().
-     * \sa rows(), cols(), SizeAtCompileTime. */
-    inline Index size() const {
-        return rows() * cols();
-    }
-
-    /** \returns the number of nonzero coefficients which is in practice the number
-     * of stored coefficients. */
-    inline Index nonZeros() const {
-        return derived().nonZeros();
-    }
-
-    /** \returns the size of the storage major dimension,
-     * i.e., the number of columns for a columns major matrix, and the number of rows otherwise */
-    Index outerSize() const {
-        return (int(Flags) & RowMajorBit) ? this->rows() : this->cols();
-    }
-
-    /** \returns the size of the inner dimension according to the storage order,
-     * i.e., the number of rows for a columns major matrix, and the number of cols otherwise */
-    Index innerSize() const {
-        return (int(Flags) & RowMajorBit) ? this->cols() : this->rows();
-    }
-
-    bool isRValue() const {
-        return m_isRValue;
-    }
-
-    Derived& markAsRValue() {
-        m_isRValue = true;
-        return derived();
-    }
-
-    SkylineMatrixBase() : m_isRValue(false) {
-        /* TODO check flags */
-    }
-
-    inline Derived & operator=(const Derived& other) {
-        this->operator=<Derived > (other);
-        return derived();
-    }
-
-    template<typename OtherDerived>
-    inline void assignGeneric(const OtherDerived& other) {
-        derived().resize(other.rows(), other.cols());
-        for (Index row = 0; row < rows(); row++)
-            for (Index col = 0; col < cols(); col++) {
-                if (other.coeff(row, col) != Scalar(0))
-                    derived().insert(row, col) = other.coeff(row, col);
-            }
-        derived().finalize();
-    }
-
-    template<typename OtherDerived>
-            inline Derived & operator=(const SkylineMatrixBase<OtherDerived>& other) {
-        //TODO
-    }
-
-    template<typename Lhs, typename Rhs>
-            inline Derived & operator=(const SkylineProduct<Lhs, Rhs, SkylineTimeSkylineProduct>& product);
-
-    friend std::ostream & operator <<(std::ostream & s, const SkylineMatrixBase& m) {
-        s << m.derived();
-        return s;
-    }
-
-    template<typename OtherDerived>
-    const typename SkylineProductReturnType<Derived, OtherDerived>::Type
-    operator*(const MatrixBase<OtherDerived> &other) const;
-
-    /** \internal use operator= */
-    template<typename DenseDerived>
-    void evalTo(MatrixBase<DenseDerived>& dst) const {
-        dst.setZero();
-        for (Index i = 0; i < rows(); i++)
-            for (Index j = 0; j < rows(); j++)
-                dst(i, j) = derived().coeff(i, j);
-    }
-
-    Matrix<Scalar, RowsAtCompileTime, ColsAtCompileTime> toDense() const {
-        return derived();
-    }
-
-    /** \returns the matrix or vector obtained by evaluating this expression.
-     *
-     * Notice that in the case of a plain matrix or vector (not an expression) this function just returns
-     * a const reference, in order to avoid a useless copy.
-     */
-    EIGEN_STRONG_INLINE const typename internal::eval<Derived, IsSkyline>::type eval() const {
-        return typename internal::eval<Derived>::type(derived());
-    }
-
-protected:
-    bool m_isRValue;
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_SkylineMatrixBase_H
diff --git a/inst/include/unsupported/Eigen/src/Skyline/SkylineProduct.h b/inst/include/unsupported/Eigen/src/Skyline/SkylineProduct.h
deleted file mode 100644
index 1ddf455e..00000000
--- a/inst/include/unsupported/Eigen/src/Skyline/SkylineProduct.h
+++ /dev/null
@@ -1,295 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Guillaume Saupin <guillaume.saupin@cea.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_SKYLINEPRODUCT_H
-#define EIGEN_SKYLINEPRODUCT_H
-
-namespace Eigen { 
-
-template<typename Lhs, typename Rhs, int ProductMode>
-struct SkylineProductReturnType {
-    typedef const typename internal::nested<Lhs, Rhs::RowsAtCompileTime>::type LhsNested;
-    typedef const typename internal::nested<Rhs, Lhs::RowsAtCompileTime>::type RhsNested;
-
-    typedef SkylineProduct<LhsNested, RhsNested, ProductMode> Type;
-};
-
-template<typename LhsNested, typename RhsNested, int ProductMode>
-struct internal::traits<SkylineProduct<LhsNested, RhsNested, ProductMode> > {
-    // clean the nested types:
-    typedef typename internal::remove_all<LhsNested>::type _LhsNested;
-    typedef typename internal::remove_all<RhsNested>::type _RhsNested;
-    typedef typename _LhsNested::Scalar Scalar;
-
-    enum {
-        LhsCoeffReadCost = _LhsNested::CoeffReadCost,
-        RhsCoeffReadCost = _RhsNested::CoeffReadCost,
-        LhsFlags = _LhsNested::Flags,
-        RhsFlags = _RhsNested::Flags,
-
-        RowsAtCompileTime = _LhsNested::RowsAtCompileTime,
-        ColsAtCompileTime = _RhsNested::ColsAtCompileTime,
-        InnerSize = EIGEN_SIZE_MIN_PREFER_FIXED(_LhsNested::ColsAtCompileTime, _RhsNested::RowsAtCompileTime),
-
-        MaxRowsAtCompileTime = _LhsNested::MaxRowsAtCompileTime,
-        MaxColsAtCompileTime = _RhsNested::MaxColsAtCompileTime,
-
-        EvalToRowMajor = (RhsFlags & LhsFlags & RowMajorBit),
-        ResultIsSkyline = ProductMode == SkylineTimeSkylineProduct,
-
-        RemovedBits = ~((EvalToRowMajor ? 0 : RowMajorBit) | (ResultIsSkyline ? 0 : SkylineBit)),
-
-        Flags = (int(LhsFlags | RhsFlags) & HereditaryBits & RemovedBits)
-        | EvalBeforeAssigningBit
-        | EvalBeforeNestingBit,
-
-        CoeffReadCost = Dynamic
-    };
-
-    typedef typename internal::conditional<ResultIsSkyline,
-            SkylineMatrixBase<SkylineProduct<LhsNested, RhsNested, ProductMode> >,
-            MatrixBase<SkylineProduct<LhsNested, RhsNested, ProductMode> > >::type Base;
-};
-
-namespace internal {
-template<typename LhsNested, typename RhsNested, int ProductMode>
-class SkylineProduct : no_assignment_operator,
-public traits<SkylineProduct<LhsNested, RhsNested, ProductMode> >::Base {
-public:
-
-    EIGEN_GENERIC_PUBLIC_INTERFACE(SkylineProduct)
-
-private:
-
-    typedef typename traits<SkylineProduct>::_LhsNested _LhsNested;
-    typedef typename traits<SkylineProduct>::_RhsNested _RhsNested;
-
-public:
-
-    template<typename Lhs, typename Rhs>
-    EIGEN_STRONG_INLINE SkylineProduct(const Lhs& lhs, const Rhs& rhs)
-    : m_lhs(lhs), m_rhs(rhs) {
-        eigen_assert(lhs.cols() == rhs.rows());
-
-        enum {
-            ProductIsValid = _LhsNested::ColsAtCompileTime == Dynamic
-            || _RhsNested::RowsAtCompileTime == Dynamic
-            || int(_LhsNested::ColsAtCompileTime) == int(_RhsNested::RowsAtCompileTime),
-            AreVectors = _LhsNested::IsVectorAtCompileTime && _RhsNested::IsVectorAtCompileTime,
-            SameSizes = EIGEN_PREDICATE_SAME_MATRIX_SIZE(_LhsNested, _RhsNested)
-        };
-        // note to the lost user:
-        //    * for a dot product use: v1.dot(v2)
-        //    * for a coeff-wise product use: v1.cwise()*v2
-        EIGEN_STATIC_ASSERT(ProductIsValid || !(AreVectors && SameSizes),
-                INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS)
-                EIGEN_STATIC_ASSERT(ProductIsValid || !(SameSizes && !AreVectors),
-                INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION)
-                EIGEN_STATIC_ASSERT(ProductIsValid || SameSizes, INVALID_MATRIX_PRODUCT)
-    }
-
-    EIGEN_STRONG_INLINE Index rows() const {
-        return m_lhs.rows();
-    }
-
-    EIGEN_STRONG_INLINE Index cols() const {
-        return m_rhs.cols();
-    }
-
-    EIGEN_STRONG_INLINE const _LhsNested& lhs() const {
-        return m_lhs;
-    }
-
-    EIGEN_STRONG_INLINE const _RhsNested& rhs() const {
-        return m_rhs;
-    }
-
-protected:
-    LhsNested m_lhs;
-    RhsNested m_rhs;
-};
-
-// dense = skyline * dense
-// Note that here we force no inlining and separate the setZero() because GCC messes up otherwise
-
-template<typename Lhs, typename Rhs, typename Dest>
-EIGEN_DONT_INLINE void skyline_row_major_time_dense_product(const Lhs& lhs, const Rhs& rhs, Dest& dst) {
-    typedef typename remove_all<Lhs>::type _Lhs;
-    typedef typename remove_all<Rhs>::type _Rhs;
-    typedef typename traits<Lhs>::Scalar Scalar;
-
-    enum {
-        LhsIsRowMajor = (_Lhs::Flags & RowMajorBit) == RowMajorBit,
-        LhsIsSelfAdjoint = (_Lhs::Flags & SelfAdjointBit) == SelfAdjointBit,
-        ProcessFirstHalf = LhsIsSelfAdjoint
-        && (((_Lhs::Flags & (UpperTriangularBit | LowerTriangularBit)) == 0)
-        || ((_Lhs::Flags & UpperTriangularBit) && !LhsIsRowMajor)
-        || ((_Lhs::Flags & LowerTriangularBit) && LhsIsRowMajor)),
-        ProcessSecondHalf = LhsIsSelfAdjoint && (!ProcessFirstHalf)
-    };
-
-    //Use matrix diagonal part <- Improvement : use inner iterator on dense matrix.
-    for (Index col = 0; col < rhs.cols(); col++) {
-        for (Index row = 0; row < lhs.rows(); row++) {
-            dst(row, col) = lhs.coeffDiag(row) * rhs(row, col);
-        }
-    }
-    //Use matrix lower triangular part
-    for (Index row = 0; row < lhs.rows(); row++) {
-        typename _Lhs::InnerLowerIterator lIt(lhs, row);
-        const Index stop = lIt.col() + lIt.size();
-        for (Index col = 0; col < rhs.cols(); col++) {
-
-            Index k = lIt.col();
-            Scalar tmp = 0;
-            while (k < stop) {
-                tmp +=
-                        lIt.value() *
-                        rhs(k++, col);
-                ++lIt;
-            }
-            dst(row, col) += tmp;
-            lIt += -lIt.size();
-        }
-
-    }
-
-    //Use matrix upper triangular part
-    for (Index lhscol = 0; lhscol < lhs.cols(); lhscol++) {
-        typename _Lhs::InnerUpperIterator uIt(lhs, lhscol);
-        const Index stop = uIt.size() + uIt.row();
-        for (Index rhscol = 0; rhscol < rhs.cols(); rhscol++) {
-
-
-            const Scalar rhsCoeff = rhs.coeff(lhscol, rhscol);
-            Index k = uIt.row();
-            while (k < stop) {
-                dst(k++, rhscol) +=
-                        uIt.value() *
-                        rhsCoeff;
-                ++uIt;
-            }
-            uIt += -uIt.size();
-        }
-    }
-
-}
-
-template<typename Lhs, typename Rhs, typename Dest>
-EIGEN_DONT_INLINE void skyline_col_major_time_dense_product(const Lhs& lhs, const Rhs& rhs, Dest& dst) {
-    typedef typename remove_all<Lhs>::type _Lhs;
-    typedef typename remove_all<Rhs>::type _Rhs;
-    typedef typename traits<Lhs>::Scalar Scalar;
-
-    enum {
-        LhsIsRowMajor = (_Lhs::Flags & RowMajorBit) == RowMajorBit,
-        LhsIsSelfAdjoint = (_Lhs::Flags & SelfAdjointBit) == SelfAdjointBit,
-        ProcessFirstHalf = LhsIsSelfAdjoint
-        && (((_Lhs::Flags & (UpperTriangularBit | LowerTriangularBit)) == 0)
-        || ((_Lhs::Flags & UpperTriangularBit) && !LhsIsRowMajor)
-        || ((_Lhs::Flags & LowerTriangularBit) && LhsIsRowMajor)),
-        ProcessSecondHalf = LhsIsSelfAdjoint && (!ProcessFirstHalf)
-    };
-
-    //Use matrix diagonal part <- Improvement : use inner iterator on dense matrix.
-    for (Index col = 0; col < rhs.cols(); col++) {
-        for (Index row = 0; row < lhs.rows(); row++) {
-            dst(row, col) = lhs.coeffDiag(row) * rhs(row, col);
-        }
-    }
-
-    //Use matrix upper triangular part
-    for (Index row = 0; row < lhs.rows(); row++) {
-        typename _Lhs::InnerUpperIterator uIt(lhs, row);
-        const Index stop = uIt.col() + uIt.size();
-        for (Index col = 0; col < rhs.cols(); col++) {
-
-            Index k = uIt.col();
-            Scalar tmp = 0;
-            while (k < stop) {
-                tmp +=
-                        uIt.value() *
-                        rhs(k++, col);
-                ++uIt;
-            }
-
-
-            dst(row, col) += tmp;
-            uIt += -uIt.size();
-        }
-    }
-
-    //Use matrix lower triangular part
-    for (Index lhscol = 0; lhscol < lhs.cols(); lhscol++) {
-        typename _Lhs::InnerLowerIterator lIt(lhs, lhscol);
-        const Index stop = lIt.size() + lIt.row();
-        for (Index rhscol = 0; rhscol < rhs.cols(); rhscol++) {
-
-            const Scalar rhsCoeff = rhs.coeff(lhscol, rhscol);
-            Index k = lIt.row();
-            while (k < stop) {
-                dst(k++, rhscol) +=
-                        lIt.value() *
-                        rhsCoeff;
-                ++lIt;
-            }
-            lIt += -lIt.size();
-        }
-    }
-
-}
-
-template<typename Lhs, typename Rhs, typename ResultType,
-        int LhsStorageOrder = traits<Lhs>::Flags&RowMajorBit>
-        struct skyline_product_selector;
-
-template<typename Lhs, typename Rhs, typename ResultType>
-struct skyline_product_selector<Lhs, Rhs, ResultType, RowMajor> {
-    typedef typename traits<typename remove_all<Lhs>::type>::Scalar Scalar;
-
-    static void run(const Lhs& lhs, const Rhs& rhs, ResultType & res) {
-        skyline_row_major_time_dense_product<Lhs, Rhs, ResultType > (lhs, rhs, res);
-    }
-};
-
-template<typename Lhs, typename Rhs, typename ResultType>
-struct skyline_product_selector<Lhs, Rhs, ResultType, ColMajor> {
-    typedef typename traits<typename remove_all<Lhs>::type>::Scalar Scalar;
-
-    static void run(const Lhs& lhs, const Rhs& rhs, ResultType & res) {
-        skyline_col_major_time_dense_product<Lhs, Rhs, ResultType > (lhs, rhs, res);
-    }
-};
-
-} // end namespace internal
-
-// template<typename Derived>
-// template<typename Lhs, typename Rhs >
-// Derived & MatrixBase<Derived>::lazyAssign(const SkylineProduct<Lhs, Rhs, SkylineTimeDenseProduct>& product) {
-//     typedef typename internal::remove_all<Lhs>::type _Lhs;
-//     internal::skyline_product_selector<typename internal::remove_all<Lhs>::type,
-//             typename internal::remove_all<Rhs>::type,
-//             Derived>::run(product.lhs(), product.rhs(), derived());
-// 
-//     return derived();
-// }
-
-// skyline * dense
-
-template<typename Derived>
-template<typename OtherDerived >
-EIGEN_STRONG_INLINE const typename SkylineProductReturnType<Derived, OtherDerived>::Type
-SkylineMatrixBase<Derived>::operator*(const MatrixBase<OtherDerived> &other) const {
-
-    return typename SkylineProductReturnType<Derived, OtherDerived>::Type(derived(), other.derived());
-}
-
-} // end namespace Eigen
-
-#endif // EIGEN_SKYLINEPRODUCT_H
diff --git a/inst/include/unsupported/Eigen/src/Skyline/SkylineStorage.h b/inst/include/unsupported/Eigen/src/Skyline/SkylineStorage.h
deleted file mode 100644
index 378a8deb..00000000
--- a/inst/include/unsupported/Eigen/src/Skyline/SkylineStorage.h
+++ /dev/null
@@ -1,259 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Guillaume Saupin <guillaume.saupin@cea.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_SKYLINE_STORAGE_H
-#define EIGEN_SKYLINE_STORAGE_H
-
-namespace Eigen { 
-
-/** Stores a skyline set of values in three structures :
- * The diagonal elements
- * The upper elements
- * The lower elements
- *
- */
-template<typename Scalar>
-class SkylineStorage {
-    typedef typename NumTraits<Scalar>::Real RealScalar;
-    typedef SparseIndex Index;
-public:
-
-    SkylineStorage()
-    : m_diag(0),
-    m_lower(0),
-    m_upper(0),
-    m_lowerProfile(0),
-    m_upperProfile(0),
-    m_diagSize(0),
-    m_upperSize(0),
-    m_lowerSize(0),
-    m_upperProfileSize(0),
-    m_lowerProfileSize(0),
-    m_allocatedSize(0) {
-    }
-
-    SkylineStorage(const SkylineStorage& other)
-    : m_diag(0),
-    m_lower(0),
-    m_upper(0),
-    m_lowerProfile(0),
-    m_upperProfile(0),
-    m_diagSize(0),
-    m_upperSize(0),
-    m_lowerSize(0),
-    m_upperProfileSize(0),
-    m_lowerProfileSize(0),
-    m_allocatedSize(0) {
-        *this = other;
-    }
-
-    SkylineStorage & operator=(const SkylineStorage& other) {
-        resize(other.diagSize(), other.m_upperProfileSize, other.m_lowerProfileSize, other.upperSize(), other.lowerSize());
-        memcpy(m_diag, other.m_diag, m_diagSize * sizeof (Scalar));
-        memcpy(m_upper, other.m_upper, other.upperSize() * sizeof (Scalar));
-        memcpy(m_lower, other.m_lower, other.lowerSize() * sizeof (Scalar));
-        memcpy(m_upperProfile, other.m_upperProfile, m_upperProfileSize * sizeof (Index));
-        memcpy(m_lowerProfile, other.m_lowerProfile, m_lowerProfileSize * sizeof (Index));
-        return *this;
-    }
-
-    void swap(SkylineStorage& other) {
-        std::swap(m_diag, other.m_diag);
-        std::swap(m_upper, other.m_upper);
-        std::swap(m_lower, other.m_lower);
-        std::swap(m_upperProfile, other.m_upperProfile);
-        std::swap(m_lowerProfile, other.m_lowerProfile);
-        std::swap(m_diagSize, other.m_diagSize);
-        std::swap(m_upperSize, other.m_upperSize);
-        std::swap(m_lowerSize, other.m_lowerSize);
-        std::swap(m_allocatedSize, other.m_allocatedSize);
-    }
-
-    ~SkylineStorage() {
-        delete[] m_diag;
-        delete[] m_upper;
-        if (m_upper != m_lower)
-            delete[] m_lower;
-        delete[] m_upperProfile;
-        delete[] m_lowerProfile;
-    }
-
-    void reserve(Index size, Index upperProfileSize, Index lowerProfileSize, Index upperSize, Index lowerSize) {
-        Index newAllocatedSize = size + upperSize + lowerSize;
-        if (newAllocatedSize > m_allocatedSize)
-            reallocate(size, upperProfileSize, lowerProfileSize, upperSize, lowerSize);
-    }
-
-    void squeeze() {
-        if (m_allocatedSize > m_diagSize + m_upperSize + m_lowerSize)
-            reallocate(m_diagSize, m_upperProfileSize, m_lowerProfileSize, m_upperSize, m_lowerSize);
-    }
-
-    void resize(Index diagSize, Index upperProfileSize, Index lowerProfileSize, Index upperSize, Index lowerSize, float reserveSizeFactor = 0) {
-        if (m_allocatedSize < diagSize + upperSize + lowerSize)
-            reallocate(diagSize, upperProfileSize, lowerProfileSize, upperSize + Index(reserveSizeFactor * upperSize), lowerSize + Index(reserveSizeFactor * lowerSize));
-        m_diagSize = diagSize;
-        m_upperSize = upperSize;
-        m_lowerSize = lowerSize;
-        m_upperProfileSize = upperProfileSize;
-        m_lowerProfileSize = lowerProfileSize;
-    }
-
-    inline Index diagSize() const {
-        return m_diagSize;
-    }
-
-    inline Index upperSize() const {
-        return m_upperSize;
-    }
-
-    inline Index lowerSize() const {
-        return m_lowerSize;
-    }
-
-    inline Index upperProfileSize() const {
-        return m_upperProfileSize;
-    }
-
-    inline Index lowerProfileSize() const {
-        return m_lowerProfileSize;
-    }
-
-    inline Index allocatedSize() const {
-        return m_allocatedSize;
-    }
-
-    inline void clear() {
-        m_diagSize = 0;
-    }
-
-    inline Scalar& diag(Index i) {
-        return m_diag[i];
-    }
-
-    inline const Scalar& diag(Index i) const {
-        return m_diag[i];
-    }
-
-    inline Scalar& upper(Index i) {
-        return m_upper[i];
-    }
-
-    inline const Scalar& upper(Index i) const {
-        return m_upper[i];
-    }
-
-    inline Scalar& lower(Index i) {
-        return m_lower[i];
-    }
-
-    inline const Scalar& lower(Index i) const {
-        return m_lower[i];
-    }
-
-    inline Index& upperProfile(Index i) {
-        return m_upperProfile[i];
-    }
-
-    inline const Index& upperProfile(Index i) const {
-        return m_upperProfile[i];
-    }
-
-    inline Index& lowerProfile(Index i) {
-        return m_lowerProfile[i];
-    }
-
-    inline const Index& lowerProfile(Index i) const {
-        return m_lowerProfile[i];
-    }
-
-    static SkylineStorage Map(Index* upperProfile, Index* lowerProfile, Scalar* diag, Scalar* upper, Scalar* lower, Index size, Index upperSize, Index lowerSize) {
-        SkylineStorage res;
-        res.m_upperProfile = upperProfile;
-        res.m_lowerProfile = lowerProfile;
-        res.m_diag = diag;
-        res.m_upper = upper;
-        res.m_lower = lower;
-        res.m_allocatedSize = res.m_diagSize = size;
-        res.m_upperSize = upperSize;
-        res.m_lowerSize = lowerSize;
-        return res;
-    }
-
-    inline void reset() {
-        memset(m_diag, 0, m_diagSize * sizeof (Scalar));
-        memset(m_upper, 0, m_upperSize * sizeof (Scalar));
-        memset(m_lower, 0, m_lowerSize * sizeof (Scalar));
-        memset(m_upperProfile, 0, m_diagSize * sizeof (Index));
-        memset(m_lowerProfile, 0, m_diagSize * sizeof (Index));
-    }
-
-    void prune(Scalar reference, RealScalar epsilon = dummy_precision<RealScalar>()) {
-        //TODO
-    }
-
-protected:
-
-    inline void reallocate(Index diagSize, Index upperProfileSize, Index lowerProfileSize, Index upperSize, Index lowerSize) {
-
-        Scalar* diag = new Scalar[diagSize];
-        Scalar* upper = new Scalar[upperSize];
-        Scalar* lower = new Scalar[lowerSize];
-        Index* upperProfile = new Index[upperProfileSize];
-        Index* lowerProfile = new Index[lowerProfileSize];
-
-        Index copyDiagSize = (std::min)(diagSize, m_diagSize);
-        Index copyUpperSize = (std::min)(upperSize, m_upperSize);
-        Index copyLowerSize = (std::min)(lowerSize, m_lowerSize);
-        Index copyUpperProfileSize = (std::min)(upperProfileSize, m_upperProfileSize);
-        Index copyLowerProfileSize = (std::min)(lowerProfileSize, m_lowerProfileSize);
-
-        // copy
-        memcpy(diag, m_diag, copyDiagSize * sizeof (Scalar));
-        memcpy(upper, m_upper, copyUpperSize * sizeof (Scalar));
-        memcpy(lower, m_lower, copyLowerSize * sizeof (Scalar));
-        memcpy(upperProfile, m_upperProfile, copyUpperProfileSize * sizeof (Index));
-        memcpy(lowerProfile, m_lowerProfile, copyLowerProfileSize * sizeof (Index));
-
-
-
-        // delete old stuff
-        delete[] m_diag;
-        delete[] m_upper;
-        delete[] m_lower;
-        delete[] m_upperProfile;
-        delete[] m_lowerProfile;
-        m_diag = diag;
-        m_upper = upper;
-        m_lower = lower;
-        m_upperProfile = upperProfile;
-        m_lowerProfile = lowerProfile;
-        m_allocatedSize = diagSize + upperSize + lowerSize;
-        m_upperSize = upperSize;
-        m_lowerSize = lowerSize;
-    }
-
-public:
-    Scalar* m_diag;
-    Scalar* m_upper;
-    Scalar* m_lower;
-    Index* m_upperProfile;
-    Index* m_lowerProfile;
-    Index m_diagSize;
-    Index m_upperSize;
-    Index m_lowerSize;
-    Index m_upperProfileSize;
-    Index m_lowerProfileSize;
-    Index m_allocatedSize;
-
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_COMPRESSED_STORAGE_H
diff --git a/inst/include/unsupported/Eigen/src/Skyline/SkylineUtil.h b/inst/include/unsupported/Eigen/src/Skyline/SkylineUtil.h
deleted file mode 100644
index 75eb612f..00000000
--- a/inst/include/unsupported/Eigen/src/Skyline/SkylineUtil.h
+++ /dev/null
@@ -1,89 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2009 Guillaume Saupin <guillaume.saupin@cea.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_SKYLINEUTIL_H
-#define EIGEN_SKYLINEUTIL_H
-
-namespace Eigen { 
-
-#ifdef NDEBUG
-#define EIGEN_DBG_SKYLINE(X)
-#else
-#define EIGEN_DBG_SKYLINE(X) X
-#endif
-
-const unsigned int SkylineBit = 0x1200;
-template<typename Lhs, typename Rhs, int ProductMode> class SkylineProduct;
-enum AdditionalProductEvaluationMode {SkylineTimeDenseProduct, SkylineTimeSkylineProduct, DenseTimeSkylineProduct};
-enum {IsSkyline = SkylineBit};
-
-
-#define EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, Op) \
-template<typename OtherDerived> \
-EIGEN_STRONG_INLINE Derived& operator Op(const Eigen::SkylineMatrixBase<OtherDerived>& other) \
-{ \
-  return Base::operator Op(other.derived()); \
-} \
-EIGEN_STRONG_INLINE Derived& operator Op(const Derived& other) \
-{ \
-  return Base::operator Op(other); \
-}
-
-#define EIGEN_SKYLINE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, Op) \
-template<typename Other> \
-EIGEN_STRONG_INLINE Derived& operator Op(const Other& scalar) \
-{ \
-  return Base::operator Op(scalar); \
-}
-
-#define EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATORS(Derived) \
-  EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, =) \
-  EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, +=) \
-  EIGEN_SKYLINE_INHERIT_ASSIGNMENT_OPERATOR(Derived, -=) \
-  EIGEN_SKYLINE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, *=) \
-  EIGEN_SKYLINE_INHERIT_SCALAR_ASSIGNMENT_OPERATOR(Derived, /=)
-
-#define _EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(Derived, BaseClass) \
-  typedef BaseClass Base; \
-  typedef typename Eigen::internal::traits<Derived>::Scalar Scalar; \
-  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; \
-  typedef typename Eigen::internal::traits<Derived>::StorageKind StorageKind; \
-  typedef typename Eigen::internal::index<StorageKind>::type Index; \
-  enum {  Flags = Eigen::internal::traits<Derived>::Flags, };
-
-#define EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(Derived) \
-  _EIGEN_SKYLINE_GENERIC_PUBLIC_INTERFACE(Derived, Eigen::SkylineMatrixBase<Derived>)
-
-template<typename Derived> class SkylineMatrixBase;
-template<typename _Scalar, int _Flags = 0> class SkylineMatrix;
-template<typename _Scalar, int _Flags = 0> class DynamicSkylineMatrix;
-template<typename _Scalar, int _Flags = 0> class SkylineVector;
-template<typename _Scalar, int _Flags = 0> class MappedSkylineMatrix;
-
-namespace internal {
-
-template<typename Lhs, typename Rhs> struct skyline_product_mode;
-template<typename Lhs, typename Rhs, int ProductMode = skyline_product_mode<Lhs,Rhs>::value> struct SkylineProductReturnType;
-
-template<typename T> class eval<T,IsSkyline>
-{
-    typedef typename traits<T>::Scalar _Scalar;
-    enum {
-          _Flags = traits<T>::Flags
-    };
-
-  public:
-    typedef SkylineMatrix<_Scalar, _Flags> type;
-};
-
-} // end namespace internal
-
-} // end namespace Eigen
-
-#endif // EIGEN_SKYLINEUTIL_H
diff --git a/inst/include/unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h b/inst/include/unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h
deleted file mode 100644
index e9ec746e..00000000
--- a/inst/include/unsupported/Eigen/src/SparseExtra/BlockOfDynamicSparseMatrix.h
+++ /dev/null
@@ -1,122 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_SPARSE_BLOCKFORDYNAMICMATRIX_H
-#define EIGEN_SPARSE_BLOCKFORDYNAMICMATRIX_H
-
-namespace Eigen { 
-
-#if 0
-
-// NOTE Have to be reimplemented as a specialization of BlockImpl< DynamicSparseMatrix<_Scalar, _Options, _Index>, ... >
-// See SparseBlock.h for an example
-
-
-/***************************************************************************
-* specialisation for DynamicSparseMatrix
-***************************************************************************/
-
-template<typename _Scalar, int _Options, typename _Index, int Size>
-class SparseInnerVectorSet<DynamicSparseMatrix<_Scalar, _Options, _Index>, Size>
-  : public SparseMatrixBase<SparseInnerVectorSet<DynamicSparseMatrix<_Scalar, _Options, _Index>, Size> >
-{
-    typedef DynamicSparseMatrix<_Scalar, _Options, _Index> MatrixType;
-  public:
-
-    enum { IsRowMajor = internal::traits<SparseInnerVectorSet>::IsRowMajor };
-
-    EIGEN_SPARSE_PUBLIC_INTERFACE(SparseInnerVectorSet)
-    class InnerIterator: public MatrixType::InnerIterator
-    {
-      public:
-        inline InnerIterator(const SparseInnerVectorSet& xpr, Index outer)
-          : MatrixType::InnerIterator(xpr.m_matrix, xpr.m_outerStart + outer), m_outer(outer)
-        {}
-        inline Index row() const { return IsRowMajor ? m_outer : this->index(); }
-        inline Index col() const { return IsRowMajor ? this->index() : m_outer; }
-      protected:
-        Index m_outer;
-    };
-
-    inline SparseInnerVectorSet(const MatrixType& matrix, Index outerStart, Index outerSize)
-      : m_matrix(matrix), m_outerStart(outerStart), m_outerSize(outerSize)
-    {
-      eigen_assert( (outerStart>=0) && ((outerStart+outerSize)<=matrix.outerSize()) );
-    }
-
-    inline SparseInnerVectorSet(const MatrixType& matrix, Index outer)
-      : m_matrix(matrix), m_outerStart(outer), m_outerSize(Size)
-    {
-      eigen_assert(Size!=Dynamic);
-      eigen_assert( (outer>=0) && (outer<matrix.outerSize()) );
-    }
-
-    template<typename OtherDerived>
-    inline SparseInnerVectorSet& operator=(const SparseMatrixBase<OtherDerived>& other)
-    {
-      if (IsRowMajor != ((OtherDerived::Flags&RowMajorBit)==RowMajorBit))
-      {
-        // need to transpose => perform a block evaluation followed by a big swap
-        DynamicSparseMatrix<Scalar,IsRowMajor?RowMajorBit:0> aux(other);
-        *this = aux.markAsRValue();
-      }
-      else
-      {
-        // evaluate/copy vector per vector
-        for (Index j=0; j<m_outerSize.value(); ++j)
-        {
-          SparseVector<Scalar,IsRowMajor ? RowMajorBit : 0> aux(other.innerVector(j));
-          m_matrix.const_cast_derived()._data()[m_outerStart+j].swap(aux._data());
-        }
-      }
-      return *this;
-    }
-
-    inline SparseInnerVectorSet& operator=(const SparseInnerVectorSet& other)
-    {
-      return operator=<SparseInnerVectorSet>(other);
-    }
-
-    Index nonZeros() const
-    {
-      Index count = 0;
-      for (Index j=0; j<m_outerSize.value(); ++j)
-        count += m_matrix._data()[m_outerStart+j].size();
-      return count;
-    }
-
-    const Scalar& lastCoeff() const
-    {
-      EIGEN_STATIC_ASSERT_VECTOR_ONLY(SparseInnerVectorSet);
-      eigen_assert(m_matrix.data()[m_outerStart].size()>0);
-      return m_matrix.data()[m_outerStart].vale(m_matrix.data()[m_outerStart].size()-1);
-    }
-
-//     template<typename Sparse>
-//     inline SparseInnerVectorSet& operator=(const SparseMatrixBase<OtherDerived>& other)
-//     {
-//       return *this;
-//     }
-
-    EIGEN_STRONG_INLINE Index rows() const { return IsRowMajor ? m_outerSize.value() : m_matrix.rows(); }
-    EIGEN_STRONG_INLINE Index cols() const { return IsRowMajor ? m_matrix.cols() : m_outerSize.value(); }
-
-  protected:
-
-    const typename MatrixType::Nested m_matrix;
-    Index m_outerStart;
-    const internal::variable_if_dynamic<Index, Size> m_outerSize;
-
-};
-
-#endif
-
-} // end namespace Eigen
-
-#endif // EIGEN_SPARSE_BLOCKFORDYNAMICMATRIX_H
diff --git a/inst/include/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h b/inst/include/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h
new file mode 100644
index 00000000..6e8be849
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SparseExtra/BlockSparseMatrix.h
@@ -0,0 +1,974 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2013 Desire Nuentsa <desire.nuentsa_wakam@inria.fr>
+// Copyright (C) 2013 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSEBLOCKMATRIX_H
+#define EIGEN_SPARSEBLOCKMATRIX_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+/** \ingroup SparseCore_Module
+ *
+ * \class BlockSparseMatrix
+ *
+ * \brief A versatile sparse matrix representation where each element is a block
+ *
+ * This class provides routines to manipulate block sparse matrices stored in a
+ * BSR-like representation. There are two main types :
+ *
+ * 1. All blocks have the same number of rows and columns, called block size
+ * in the following. In this case, if this block size is known at compile time,
+ * it can be given as a template parameter like
+ * \code
+ * BlockSparseMatrix<Scalar, 3, ColMajor> bmat(b_rows, b_cols);
+ * \endcode
+ * Here, bmat is a b_rows x b_cols block sparse matrix
+ * where each coefficient is a 3x3 dense matrix.
+ * If the block size is fixed but will be given at runtime,
+ * \code
+ * BlockSparseMatrix<Scalar, Dynamic, ColMajor> bmat(b_rows, b_cols);
+ * bmat.setBlockSize(block_size);
+ * \endcode
+ *
+ * 2. The second case is for variable-block sparse matrices.
+ * Here each block has its own dimensions. The only restriction is that all the blocks
+ * in a row (resp. a column) should have the same number of rows (resp. of columns).
+ * It is thus required in this case to describe the layout of the matrix by calling
+ * setBlockLayout(rowBlocks, colBlocks).
+ *
+ * In any of the previous case, the matrix can be filled by calling setFromTriplets().
+ * A regular sparse matrix can be converted to a block sparse matrix and vice versa.
+ * It is obviously required to describe the block layout beforehand by calling either
+ * setBlockSize() for fixed-size blocks or setBlockLayout for variable-size blocks.
+ *
+ * \tparam Scalar_ The Scalar type
+ * \tparam _BlockAtCompileTime The block layout option. It takes the following values
+ * Dynamic : block size known at runtime
+ * a numeric number : fixed-size block known at compile time
+ */
+template <typename Scalar_, int _BlockAtCompileTime = Dynamic, int Options_ = ColMajor, typename StorageIndex_ = int>
+class BlockSparseMatrix;
+
+template <typename BlockSparseMatrixT>
+class BlockSparseMatrixView;
+
+namespace internal {
+template <typename Scalar_, int _BlockAtCompileTime, int Options_, typename Index_>
+struct traits<BlockSparseMatrix<Scalar_, _BlockAtCompileTime, Options_, Index_> > {
+  typedef Scalar_ Scalar;
+  typedef Index_ Index;
+  typedef Sparse StorageKind;  // FIXME Where is it used ??
+  typedef MatrixXpr XprKind;
+  enum {
+    RowsAtCompileTime = Dynamic,
+    ColsAtCompileTime = Dynamic,
+    MaxRowsAtCompileTime = Dynamic,
+    MaxColsAtCompileTime = Dynamic,
+    BlockSize = _BlockAtCompileTime,
+    Flags = Options_ | NestByRefBit | LvalueBit,
+    CoeffReadCost = NumTraits<Scalar>::ReadCost,
+    SupportedAccessPatterns = InnerRandomAccessPattern
+  };
+};
+template <typename BlockSparseMatrixT>
+struct traits<BlockSparseMatrixView<BlockSparseMatrixT> > {
+  typedef Ref<
+      Matrix<typename BlockSparseMatrixT::Scalar, BlockSparseMatrixT::BlockSize, BlockSparseMatrixT::BlockSize> >
+      Scalar;
+  typedef Ref<
+      Matrix<typename BlockSparseMatrixT::RealScalar, BlockSparseMatrixT::BlockSize, BlockSparseMatrixT::BlockSize> >
+      RealScalar;
+};
+
+// Function object to sort a triplet list
+template <typename Iterator, bool IsColMajor>
+struct TripletComp {
+  typedef typename Iterator::value_type Triplet;
+  bool operator()(const Triplet& a, const Triplet& b) {
+    if (IsColMajor)
+      return ((a.col() == b.col() && a.row() < b.row()) || (a.col() < b.col()));
+    else
+      return ((a.row() == b.row() && a.col() < b.col()) || (a.row() < b.row()));
+  }
+};
+}  // end namespace internal
+
+/* Proxy to view the block sparse matrix as a regular sparse matrix */
+template <typename BlockSparseMatrixT>
+class BlockSparseMatrixView : public SparseMatrixBase<BlockSparseMatrixT> {
+ public:
+  typedef Ref<typename BlockSparseMatrixT::BlockScalar> Scalar;
+  typedef Ref<typename BlockSparseMatrixT::BlockRealScalar> RealScalar;
+  typedef typename BlockSparseMatrixT::Index Index;
+  typedef BlockSparseMatrixT Nested;
+  enum {
+    Flags = BlockSparseMatrixT::Options,
+    Options = BlockSparseMatrixT::Options,
+    RowsAtCompileTime = BlockSparseMatrixT::RowsAtCompileTime,
+    ColsAtCompileTime = BlockSparseMatrixT::ColsAtCompileTime,
+    MaxColsAtCompileTime = BlockSparseMatrixT::MaxColsAtCompileTime,
+    MaxRowsAtCompileTime = BlockSparseMatrixT::MaxRowsAtCompileTime
+  };
+
+ public:
+  BlockSparseMatrixView(const BlockSparseMatrixT& spblockmat) : m_spblockmat(spblockmat) {}
+
+  Index outerSize() const { return (Flags & RowMajorBit) == 1 ? this->rows() : this->cols(); }
+  Index cols() const { return m_spblockmat.blockCols(); }
+  Index rows() const { return m_spblockmat.blockRows(); }
+  Scalar coeff(Index row, Index col) { return m_spblockmat.coeff(row, col); }
+  Scalar coeffRef(Index row, Index col) { return m_spblockmat.coeffRef(row, col); }
+  // Wrapper to iterate over all blocks
+  class InnerIterator : public BlockSparseMatrixT::BlockInnerIterator {
+   public:
+    InnerIterator(const BlockSparseMatrixView& mat, Index outer)
+        : BlockSparseMatrixT::BlockInnerIterator(mat.m_spblockmat, outer) {}
+  };
+
+ protected:
+  const BlockSparseMatrixT& m_spblockmat;
+};
+
+// Proxy to view a regular vector as a block vector
+template <typename BlockSparseMatrixT, typename VectorType>
+class BlockVectorView {
+ public:
+  enum {
+    BlockSize = BlockSparseMatrixT::BlockSize,
+    ColsAtCompileTime = VectorType::ColsAtCompileTime,
+    RowsAtCompileTime = VectorType::RowsAtCompileTime,
+    Flags = VectorType::Flags
+  };
+  typedef Ref<const Matrix<typename BlockSparseMatrixT::Scalar, (RowsAtCompileTime == 1) ? 1 : BlockSize,
+                           (ColsAtCompileTime == 1) ? 1 : BlockSize> >
+      Scalar;
+  typedef typename BlockSparseMatrixT::Index Index;
+
+ public:
+  BlockVectorView(const BlockSparseMatrixT& spblockmat, const VectorType& vec) : m_spblockmat(spblockmat), m_vec(vec) {}
+  inline Index cols() const { return m_vec.cols(); }
+  inline Index size() const { return m_spblockmat.blockRows(); }
+  inline Scalar coeff(Index bi) const {
+    Index startRow = m_spblockmat.blockRowsIndex(bi);
+    Index rowSize = m_spblockmat.blockRowsIndex(bi + 1) - startRow;
+    return m_vec.middleRows(startRow, rowSize);
+  }
+  inline Scalar coeff(Index bi, Index j) const {
+    Index startRow = m_spblockmat.blockRowsIndex(bi);
+    Index rowSize = m_spblockmat.blockRowsIndex(bi + 1) - startRow;
+    return m_vec.block(startRow, j, rowSize, 1);
+  }
+
+ protected:
+  const BlockSparseMatrixT& m_spblockmat;
+  const VectorType& m_vec;
+};
+
+template <typename VectorType, typename Index>
+class BlockVectorReturn;
+
+// Proxy to view a regular vector as a block vector
+template <typename BlockSparseMatrixT, typename VectorType>
+class BlockVectorReturn {
+ public:
+  enum {
+    ColsAtCompileTime = VectorType::ColsAtCompileTime,
+    RowsAtCompileTime = VectorType::RowsAtCompileTime,
+    Flags = VectorType::Flags
+  };
+  typedef Ref<Matrix<typename VectorType::Scalar, RowsAtCompileTime, ColsAtCompileTime> > Scalar;
+  typedef typename BlockSparseMatrixT::Index Index;
+
+ public:
+  BlockVectorReturn(const BlockSparseMatrixT& spblockmat, VectorType& vec) : m_spblockmat(spblockmat), m_vec(vec) {}
+  inline Index size() const { return m_spblockmat.blockRows(); }
+  inline Scalar coeffRef(Index bi) {
+    Index startRow = m_spblockmat.blockRowsIndex(bi);
+    Index rowSize = m_spblockmat.blockRowsIndex(bi + 1) - startRow;
+    return m_vec.middleRows(startRow, rowSize);
+  }
+  inline Scalar coeffRef(Index bi, Index j) {
+    Index startRow = m_spblockmat.blockRowsIndex(bi);
+    Index rowSize = m_spblockmat.blockRowsIndex(bi + 1) - startRow;
+    return m_vec.block(startRow, j, rowSize, 1);
+  }
+
+ protected:
+  const BlockSparseMatrixT& m_spblockmat;
+  VectorType& m_vec;
+};
+
+// Block version of the sparse dense product
+template <typename Lhs, typename Rhs>
+class BlockSparseTimeDenseProduct;
+
+namespace internal {
+
+template <typename BlockSparseMatrixT, typename VecType>
+struct traits<BlockSparseTimeDenseProduct<BlockSparseMatrixT, VecType> > {
+  typedef Dense StorageKind;
+  typedef MatrixXpr XprKind;
+  typedef typename BlockSparseMatrixT::Scalar Scalar;
+  typedef typename BlockSparseMatrixT::Index Index;
+  enum {
+    RowsAtCompileTime = Dynamic,
+    ColsAtCompileTime = Dynamic,
+    MaxRowsAtCompileTime = Dynamic,
+    MaxColsAtCompileTime = Dynamic,
+    Flags = 0,
+    CoeffReadCost = internal::traits<BlockSparseMatrixT>::CoeffReadCost
+  };
+};
+}  // end namespace internal
+
+template <typename Lhs, typename Rhs>
+class BlockSparseTimeDenseProduct : public ProductBase<BlockSparseTimeDenseProduct<Lhs, Rhs>, Lhs, Rhs> {
+ public:
+  EIGEN_PRODUCT_PUBLIC_INTERFACE(BlockSparseTimeDenseProduct)
+
+  BlockSparseTimeDenseProduct(const Lhs& lhs, const Rhs& rhs) : Base(lhs, rhs) {}
+
+  template <typename Dest>
+  void scaleAndAddTo(Dest& dest, const typename Rhs::Scalar& alpha) const {
+    BlockVectorReturn<Lhs, Dest> tmpDest(m_lhs, dest);
+    internal::sparse_time_dense_product(BlockSparseMatrixView<Lhs>(m_lhs), BlockVectorView<Lhs, Rhs>(m_lhs, m_rhs),
+                                        tmpDest, alpha);
+  }
+
+ private:
+  BlockSparseTimeDenseProduct& operator=(const BlockSparseTimeDenseProduct&);
+};
+
+template <typename Scalar_, int _BlockAtCompileTime, int Options_, typename StorageIndex_>
+class BlockSparseMatrix
+    : public SparseMatrixBase<BlockSparseMatrix<Scalar_, _BlockAtCompileTime, Options_, StorageIndex_> > {
+ public:
+  typedef Scalar_ Scalar;
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+  typedef StorageIndex_ StorageIndex;
+  typedef
+      typename internal::ref_selector<BlockSparseMatrix<Scalar_, _BlockAtCompileTime, Options_, StorageIndex_> >::type
+          Nested;
+
+  enum {
+    Options = Options_,
+    Flags = Options,
+    BlockSize = _BlockAtCompileTime,
+    RowsAtCompileTime = Dynamic,
+    ColsAtCompileTime = Dynamic,
+    MaxRowsAtCompileTime = Dynamic,
+    MaxColsAtCompileTime = Dynamic,
+    IsVectorAtCompileTime = 0,
+    IsColMajor = Flags & RowMajorBit ? 0 : 1
+  };
+  typedef Matrix<Scalar, _BlockAtCompileTime, _BlockAtCompileTime, IsColMajor ? ColMajor : RowMajor> BlockScalar;
+  typedef Matrix<RealScalar, _BlockAtCompileTime, _BlockAtCompileTime, IsColMajor ? ColMajor : RowMajor>
+      BlockRealScalar;
+  typedef std::conditional_t<_BlockAtCompileTime == Dynamic, Scalar, BlockScalar> BlockScalarReturnType;
+  typedef BlockSparseMatrix<Scalar, BlockSize, IsColMajor ? ColMajor : RowMajor, StorageIndex> PlainObject;
+
+ public:
+  // Default constructor
+  BlockSparseMatrix()
+      : m_innerBSize(0),
+        m_outerBSize(0),
+        m_innerOffset(0),
+        m_outerOffset(0),
+        m_nonzerosblocks(0),
+        m_values(0),
+        m_blockPtr(0),
+        m_indices(0),
+        m_outerIndex(0),
+        m_blockSize(BlockSize) {}
+
+  /**
+   * \brief Construct and resize
+   *
+   */
+  BlockSparseMatrix(Index brow, Index bcol)
+      : m_innerBSize(IsColMajor ? brow : bcol),
+        m_outerBSize(IsColMajor ? bcol : brow),
+        m_innerOffset(0),
+        m_outerOffset(0),
+        m_nonzerosblocks(0),
+        m_values(0),
+        m_blockPtr(0),
+        m_indices(0),
+        m_outerIndex(0),
+        m_blockSize(BlockSize) {}
+
+  /**
+   * \brief Copy-constructor
+   */
+  BlockSparseMatrix(const BlockSparseMatrix& other)
+      : m_innerBSize(other.m_innerBSize),
+        m_outerBSize(other.m_outerBSize),
+        m_nonzerosblocks(other.m_nonzerosblocks),
+        m_nonzeros(other.m_nonzeros),
+        m_blockPtr(0),
+        m_blockSize(other.m_blockSize) {
+    // should we allow copying between variable-size blocks and fixed-size blocks ??
+    eigen_assert(m_blockSize == BlockSize && " CAN NOT COPY BETWEEN FIXED-SIZE AND VARIABLE-SIZE BLOCKS");
+
+    std::copy(other.m_innerOffset, other.m_innerOffset + m_innerBSize + 1, m_innerOffset);
+    std::copy(other.m_outerOffset, other.m_outerOffset + m_outerBSize + 1, m_outerOffset);
+    std::copy(other.m_values, other.m_values + m_nonzeros, m_values);
+
+    if (m_blockSize != Dynamic) std::copy(other.m_blockPtr, other.m_blockPtr + m_nonzerosblocks, m_blockPtr);
+
+    std::copy(other.m_indices, other.m_indices + m_nonzerosblocks, m_indices);
+    std::copy(other.m_outerIndex, other.m_outerIndex + m_outerBSize, m_outerIndex);
+  }
+
+  friend void swap(BlockSparseMatrix& first, BlockSparseMatrix& second) {
+    std::swap(first.m_innerBSize, second.m_innerBSize);
+    std::swap(first.m_outerBSize, second.m_outerBSize);
+    std::swap(first.m_innerOffset, second.m_innerOffset);
+    std::swap(first.m_outerOffset, second.m_outerOffset);
+    std::swap(first.m_nonzerosblocks, second.m_nonzerosblocks);
+    std::swap(first.m_nonzeros, second.m_nonzeros);
+    std::swap(first.m_values, second.m_values);
+    std::swap(first.m_blockPtr, second.m_blockPtr);
+    std::swap(first.m_indices, second.m_indices);
+    std::swap(first.m_outerIndex, second.m_outerIndex);
+    std::swap(first.m_BlockSize, second.m_blockSize);
+  }
+
+  BlockSparseMatrix& operator=(BlockSparseMatrix other) {
+    // Copy-and-swap paradigm ... avoid leaked data if thrown
+    swap(*this, other);
+    return *this;
+  }
+
+  // Destructor
+  ~BlockSparseMatrix() {
+    delete[] m_outerIndex;
+    delete[] m_innerOffset;
+    delete[] m_outerOffset;
+    delete[] m_indices;
+    delete[] m_blockPtr;
+    delete[] m_values;
+  }
+
+  /**
+   * \brief Constructor from a sparse matrix
+   *
+   */
+  template <typename MatrixType>
+  inline BlockSparseMatrix(const MatrixType& spmat) : m_blockSize(BlockSize) {
+    EIGEN_STATIC_ASSERT((m_blockSize != Dynamic), THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE);
+
+    *this = spmat;
+  }
+
+  /**
+   * \brief Assignment from a sparse matrix with the same storage order
+   *
+   * Convert from a sparse matrix to block sparse matrix.
+   * \warning Before calling this function, tt is necessary to call
+   * either setBlockLayout() (matrices with variable-size blocks)
+   * or setBlockSize() (for fixed-size blocks).
+   */
+  template <typename MatrixType>
+  inline BlockSparseMatrix& operator=(const MatrixType& spmat) {
+    eigen_assert((m_innerBSize != 0 && m_outerBSize != 0) &&
+                 "Trying to assign to a zero-size matrix, call resize() first");
+    eigen_assert(((MatrixType::Options & RowMajorBit) != IsColMajor) && "Wrong storage order");
+    typedef SparseMatrix<bool, MatrixType::Options, typename MatrixType::Index> MatrixPatternType;
+    MatrixPatternType blockPattern(blockRows(), blockCols());
+    m_nonzeros = 0;
+
+    // First, compute the number of nonzero blocks and their locations
+    for (StorageIndex bj = 0; bj < m_outerBSize; ++bj) {
+      // Browse each outer block and compute the structure
+      std::vector<bool> nzblocksFlag(m_innerBSize, false);  // Record the existing blocks
+      blockPattern.startVec(bj);
+      for (StorageIndex j = blockOuterIndex(bj); j < blockOuterIndex(bj + 1); ++j) {
+        typename MatrixType::InnerIterator it_spmat(spmat, j);
+        for (; it_spmat; ++it_spmat) {
+          StorageIndex bi = innerToBlock(it_spmat.index());  // Index of the current nonzero block
+          if (!nzblocksFlag[bi]) {
+            // Save the index of this nonzero block
+            nzblocksFlag[bi] = true;
+            blockPattern.insertBackByOuterInnerUnordered(bj, bi) = true;
+            // Compute the total number of nonzeros (including explicit zeros in blocks)
+            m_nonzeros += blockOuterSize(bj) * blockInnerSize(bi);
+          }
+        }
+      }  // end current outer block
+    }
+    blockPattern.finalize();
+
+    // Allocate the internal arrays
+    setBlockStructure(blockPattern);
+
+    for (StorageIndex nz = 0; nz < m_nonzeros; ++nz) m_values[nz] = Scalar(0);
+    for (StorageIndex bj = 0; bj < m_outerBSize; ++bj) {
+      // Now copy the values
+      for (StorageIndex j = blockOuterIndex(bj); j < blockOuterIndex(bj + 1); ++j) {
+        // Browse the outer block column by column (for column-major matrices)
+        typename MatrixType::InnerIterator it_spmat(spmat, j);
+        for (; it_spmat; ++it_spmat) {
+          StorageIndex idx = 0;                              // Position of this block in the column block
+          StorageIndex bi = innerToBlock(it_spmat.index());  // Index of the current nonzero block
+          // Go to the inner block where this element belongs to
+          while (bi > m_indices[m_outerIndex[bj] + idx]) ++idx;  // Not expensive for ordered blocks
+          StorageIndex idxVal;  // Get the right position in the array of values for this element
+          if (m_blockSize == Dynamic) {
+            // Offset from all blocks before ...
+            idxVal = m_blockPtr[m_outerIndex[bj] + idx];
+            // ... and offset inside the block
+            idxVal += (j - blockOuterIndex(bj)) * blockOuterSize(bj) + it_spmat.index() - m_innerOffset[bi];
+          } else {
+            // All blocks before
+            idxVal = (m_outerIndex[bj] + idx) * m_blockSize * m_blockSize;
+            // inside the block
+            idxVal += (j - blockOuterIndex(bj)) * m_blockSize + (it_spmat.index() % m_blockSize);
+          }
+          // Insert the value
+          m_values[idxVal] = it_spmat.value();
+        }  // end of this column
+      }    // end of this block
+    }      // end of this outer block
+
+    return *this;
+  }
+
+  /**
+   * \brief Set the nonzero block pattern of the matrix
+   *
+   * Given a sparse matrix describing the nonzero block pattern,
+   * this function prepares the internal pointers for values.
+   * After calling this function, any *nonzero* block (bi, bj) can be set
+   * with a simple call to coeffRef(bi,bj).
+   *
+   *
+   * \warning Before calling this function, tt is necessary to call
+   * either setBlockLayout() (matrices with variable-size blocks)
+   * or setBlockSize() (for fixed-size blocks).
+   *
+   * \param blockPattern Sparse matrix of boolean elements describing the block structure
+   *
+   * \sa setBlockLayout() \sa setBlockSize()
+   */
+  template <typename MatrixType>
+  void setBlockStructure(const MatrixType& blockPattern) {
+    resize(blockPattern.rows(), blockPattern.cols());
+    reserve(blockPattern.nonZeros());
+
+    // Browse the block pattern and set up the various pointers
+    m_outerIndex[0] = 0;
+    if (m_blockSize == Dynamic) m_blockPtr[0] = 0;
+    for (StorageIndex nz = 0; nz < m_nonzeros; ++nz) m_values[nz] = Scalar(0);
+    for (StorageIndex bj = 0; bj < m_outerBSize; ++bj) {
+      // Browse each outer block
+
+      // First, copy and save the indices of nonzero blocks
+      // FIXME : find a way to avoid this ...
+      std::vector<int> nzBlockIdx;
+      typename MatrixType::InnerIterator it(blockPattern, bj);
+      for (; it; ++it) {
+        nzBlockIdx.push_back(it.index());
+      }
+      std::sort(nzBlockIdx.begin(), nzBlockIdx.end());
+
+      // Now, fill block indices and (eventually) pointers to blocks
+      for (StorageIndex idx = 0; idx < nzBlockIdx.size(); ++idx) {
+        StorageIndex offset = m_outerIndex[bj] + idx;  // offset in m_indices
+        m_indices[offset] = nzBlockIdx[idx];
+        if (m_blockSize == Dynamic)
+          m_blockPtr[offset] = m_blockPtr[offset - 1] + blockInnerSize(nzBlockIdx[idx]) * blockOuterSize(bj);
+        // There is no blockPtr for fixed-size blocks... not needed !???
+      }
+      // Save the pointer to the next outer block
+      m_outerIndex[bj + 1] = m_outerIndex[bj] + nzBlockIdx.size();
+    }
+  }
+
+  /**
+   * \brief Set the number of rows and columns blocks
+   */
+  inline void resize(Index brow, Index bcol) {
+    m_innerBSize = IsColMajor ? brow : bcol;
+    m_outerBSize = IsColMajor ? bcol : brow;
+  }
+
+  /**
+   * \brief set the block size at runtime for fixed-size block layout
+   *
+   * Call this only for fixed-size blocks
+   */
+  inline void setBlockSize(Index blockSize) { m_blockSize = blockSize; }
+
+  /**
+   * \brief Set the row and column block layouts,
+   *
+   * This function set the size of each row and column block.
+   * So this function should be used only for blocks with variable size.
+   * \param rowBlocks : Number of rows per row block
+   * \param colBlocks : Number of columns per column block
+   * \sa resize(), setBlockSize()
+   */
+  inline void setBlockLayout(const VectorXi& rowBlocks, const VectorXi& colBlocks) {
+    const VectorXi& innerBlocks = IsColMajor ? rowBlocks : colBlocks;
+    const VectorXi& outerBlocks = IsColMajor ? colBlocks : rowBlocks;
+    eigen_assert(m_innerBSize == innerBlocks.size() && "CHECK THE NUMBER OF ROW OR COLUMN BLOCKS");
+    eigen_assert(m_outerBSize == outerBlocks.size() && "CHECK THE NUMBER OF ROW OR COLUMN BLOCKS");
+    m_outerBSize = outerBlocks.size();
+    //  starting index of blocks... cumulative sums
+    m_innerOffset = new StorageIndex[m_innerBSize + 1];
+    m_outerOffset = new StorageIndex[m_outerBSize + 1];
+    m_innerOffset[0] = 0;
+    m_outerOffset[0] = 0;
+    std::partial_sum(&innerBlocks[0], &innerBlocks[m_innerBSize - 1] + 1, &m_innerOffset[1]);
+    std::partial_sum(&outerBlocks[0], &outerBlocks[m_outerBSize - 1] + 1, &m_outerOffset[1]);
+
+    // Compute the total number of nonzeros
+    m_nonzeros = 0;
+    for (StorageIndex bj = 0; bj < m_outerBSize; ++bj)
+      for (StorageIndex bi = 0; bi < m_innerBSize; ++bi) m_nonzeros += outerBlocks[bj] * innerBlocks[bi];
+  }
+
+  /**
+   * \brief Allocate the internal array of pointers to blocks and their inner indices
+   *
+   * \note For fixed-size blocks, call setBlockSize() to set the block.
+   * And For variable-size blocks, call setBlockLayout() before using this function
+   *
+   * \param nonzerosblocks Number of nonzero blocks. The total number of nonzeros is
+   * is computed in setBlockLayout() for variable-size blocks
+   * \sa setBlockSize()
+   */
+  inline void reserve(const Index nonzerosblocks) {
+    eigen_assert((m_innerBSize != 0 && m_outerBSize != 0) &&
+                 "TRYING TO RESERVE ZERO-SIZE MATRICES, CALL resize() first");
+
+    // FIXME Should free if already allocated
+    m_outerIndex = new StorageIndex[m_outerBSize + 1];
+
+    m_nonzerosblocks = nonzerosblocks;
+    if (m_blockSize != Dynamic) {
+      m_nonzeros = nonzerosblocks * (m_blockSize * m_blockSize);
+      m_blockPtr = 0;
+    } else {
+      // m_nonzeros  is already computed in setBlockLayout()
+      m_blockPtr = new StorageIndex[m_nonzerosblocks + 1];
+    }
+    m_indices = new StorageIndex[m_nonzerosblocks + 1];
+    m_values = new Scalar[m_nonzeros];
+  }
+
+  /**
+   * \brief Fill values in a matrix  from a triplet list.
+   *
+   * Each triplet item has a block stored in an Eigen dense matrix.
+   * The InputIterator class should provide the functions row(), col() and value()
+   *
+   * \note For fixed-size blocks, call setBlockSize() before this function.
+   *
+   * FIXME Do not accept duplicates
+   */
+  template <typename InputIterator>
+  void setFromTriplets(const InputIterator& begin, const InputIterator& end) {
+    eigen_assert((m_innerBSize != 0 && m_outerBSize != 0) && "ZERO BLOCKS, PLEASE CALL resize() before");
+
+    /* First, sort the triplet list
+     * FIXME This can be unnecessarily expensive since only the inner indices have to be sorted
+     * The best approach is like in SparseMatrix::setFromTriplets()
+     */
+    internal::TripletComp<InputIterator, IsColMajor> tripletcomp;
+    std::sort(begin, end, tripletcomp);
+
+    /* Count the number of rows and column blocks,
+     * and the number of nonzero blocks per outer dimension
+     */
+    VectorXi rowBlocks(m_innerBSize);  // Size of each block row
+    VectorXi colBlocks(m_outerBSize);  // Size of each block column
+    rowBlocks.setZero();
+    colBlocks.setZero();
+    VectorXi nzblock_outer(m_outerBSize);  // Number of nz blocks per outer vector
+    VectorXi nz_outer(m_outerBSize);       // Number of nz per outer vector...for variable-size blocks
+    nzblock_outer.setZero();
+    nz_outer.setZero();
+    for (InputIterator it(begin); it != end; ++it) {
+      eigen_assert(it->row() >= 0 && it->row() < this->blockRows() && it->col() >= 0 && it->col() < this->blockCols());
+      eigen_assert((it->value().rows() == it->value().cols() && (it->value().rows() == m_blockSize)) ||
+                   (m_blockSize == Dynamic));
+
+      if (m_blockSize == Dynamic) {
+        eigen_assert((rowBlocks[it->row()] == 0 || rowBlocks[it->row()] == it->value().rows()) &&
+                     "NON CORRESPONDING SIZES FOR ROW BLOCKS");
+        eigen_assert((colBlocks[it->col()] == 0 || colBlocks[it->col()] == it->value().cols()) &&
+                     "NON CORRESPONDING SIZES FOR COLUMN BLOCKS");
+        rowBlocks[it->row()] = it->value().rows();
+        colBlocks[it->col()] = it->value().cols();
+      }
+      nz_outer(IsColMajor ? it->col() : it->row()) += it->value().rows() * it->value().cols();
+      nzblock_outer(IsColMajor ? it->col() : it->row())++;
+    }
+    // Allocate member arrays
+    if (m_blockSize == Dynamic) setBlockLayout(rowBlocks, colBlocks);
+    StorageIndex nzblocks = nzblock_outer.sum();
+    reserve(nzblocks);
+
+    // Temporary markers
+    VectorXi block_id(m_outerBSize);  // To be used as a block marker during insertion
+
+    // Setup outer index pointers and markers
+    m_outerIndex[0] = 0;
+    if (m_blockSize == Dynamic) m_blockPtr[0] = 0;
+    for (StorageIndex bj = 0; bj < m_outerBSize; ++bj) {
+      m_outerIndex[bj + 1] = m_outerIndex[bj] + nzblock_outer(bj);
+      block_id(bj) = m_outerIndex[bj];
+      if (m_blockSize == Dynamic) {
+        m_blockPtr[m_outerIndex[bj + 1]] = m_blockPtr[m_outerIndex[bj]] + nz_outer(bj);
+      }
+    }
+
+    // Fill the matrix
+    for (InputIterator it(begin); it != end; ++it) {
+      StorageIndex outer = IsColMajor ? it->col() : it->row();
+      StorageIndex inner = IsColMajor ? it->row() : it->col();
+      m_indices[block_id(outer)] = inner;
+      StorageIndex block_size = it->value().rows() * it->value().cols();
+      StorageIndex nz_marker = blockPtr(block_id[outer]);
+      memcpy(&(m_values[nz_marker]), it->value().data(), block_size * sizeof(Scalar));
+      if (m_blockSize == Dynamic) {
+        m_blockPtr[block_id(outer) + 1] = m_blockPtr[block_id(outer)] + block_size;
+      }
+      block_id(outer)++;
+    }
+
+    // An alternative when the outer indices are sorted...no need to use an array of markers
+    //      for(Index bcol = 0; bcol < m_outerBSize; ++bcol)
+    //      {
+    //      Index id = 0, id_nz = 0, id_nzblock = 0;
+    //      for(InputIterator it(begin); it!=end; ++it)
+    //      {
+    //        while (id<bcol) // one pass should do the job unless there are empty columns
+    //        {
+    //          id++;
+    //          m_outerIndex[id+1]=m_outerIndex[id];
+    //        }
+    //        m_outerIndex[id+1] += 1;
+    //        m_indices[id_nzblock]=brow;
+    //        Index block_size = it->value().rows()*it->value().cols();
+    //        m_blockPtr[id_nzblock+1] = m_blockPtr[id_nzblock] + block_size;
+    //        id_nzblock++;
+    //        memcpy(&(m_values[id_nz]),it->value().data(), block_size*sizeof(Scalar));
+    //        id_nz += block_size;
+    //      }
+    //      while(id < m_outerBSize-1) // Empty columns at the end
+    //      {
+    //        id++;
+    //        m_outerIndex[id+1]=m_outerIndex[id];
+    //      }
+    //      }
+  }
+
+  /**
+   * \returns the number of rows
+   */
+  inline Index rows() const {
+    //      return blockRows();
+    return (IsColMajor ? innerSize() : outerSize());
+  }
+
+  /**
+   * \returns the number of cols
+   */
+  inline Index cols() const {
+    //      return blockCols();
+    return (IsColMajor ? outerSize() : innerSize());
+  }
+
+  inline Index innerSize() const {
+    if (m_blockSize == Dynamic)
+      return m_innerOffset[m_innerBSize];
+    else
+      return (m_innerBSize * m_blockSize);
+  }
+
+  inline Index outerSize() const {
+    if (m_blockSize == Dynamic)
+      return m_outerOffset[m_outerBSize];
+    else
+      return (m_outerBSize * m_blockSize);
+  }
+  /** \returns the number of rows grouped by blocks */
+  inline Index blockRows() const { return (IsColMajor ? m_innerBSize : m_outerBSize); }
+  /** \returns the number of columns grouped by blocks */
+  inline Index blockCols() const { return (IsColMajor ? m_outerBSize : m_innerBSize); }
+
+  inline Index outerBlocks() const { return m_outerBSize; }
+  inline Index innerBlocks() const { return m_innerBSize; }
+
+  /** \returns the block index where outer belongs to */
+  inline Index outerToBlock(Index outer) const {
+    eigen_assert(outer < outerSize() && "OUTER INDEX OUT OF BOUNDS");
+
+    if (m_blockSize != Dynamic) return (outer / m_blockSize);  // Integer division
+
+    StorageIndex b_outer = 0;
+    while (m_outerOffset[b_outer] <= outer) ++b_outer;
+    return b_outer - 1;
+  }
+  /** \returns  the block index where inner belongs to */
+  inline Index innerToBlock(Index inner) const {
+    eigen_assert(inner < innerSize() && "OUTER INDEX OUT OF BOUNDS");
+
+    if (m_blockSize != Dynamic) return (inner / m_blockSize);  // Integer division
+
+    StorageIndex b_inner = 0;
+    while (m_innerOffset[b_inner] <= inner) ++b_inner;
+    return b_inner - 1;
+  }
+
+  /**
+   *\returns a reference to the (i,j) block as an Eigen Dense Matrix
+   */
+  Ref<BlockScalar> coeffRef(Index brow, Index bcol) {
+    eigen_assert(brow < blockRows() && "BLOCK ROW INDEX OUT OF BOUNDS");
+    eigen_assert(bcol < blockCols() && "BLOCK nzblocksFlagCOLUMN OUT OF BOUNDS");
+
+    StorageIndex rsize = IsColMajor ? blockInnerSize(brow) : blockOuterSize(bcol);
+    StorageIndex csize = IsColMajor ? blockOuterSize(bcol) : blockInnerSize(brow);
+    StorageIndex inner = IsColMajor ? brow : bcol;
+    StorageIndex outer = IsColMajor ? bcol : brow;
+    StorageIndex offset = m_outerIndex[outer];
+    while (offset < m_outerIndex[outer + 1] && m_indices[offset] != inner) offset++;
+    if (m_indices[offset] == inner) {
+      return Map<BlockScalar>(&(m_values[blockPtr(offset)]), rsize, csize);
+    } else {
+      // FIXME the block does not exist, Insert it !!!!!!!!!
+      eigen_assert("DYNAMIC INSERTION IS NOT YET SUPPORTED");
+    }
+  }
+
+  /**
+   * \returns the value of the (i,j) block as an Eigen Dense Matrix
+   */
+  Map<const BlockScalar> coeff(Index brow, Index bcol) const {
+    eigen_assert(brow < blockRows() && "BLOCK ROW INDEX OUT OF BOUNDS");
+    eigen_assert(bcol < blockCols() && "BLOCK COLUMN OUT OF BOUNDS");
+
+    StorageIndex rsize = IsColMajor ? blockInnerSize(brow) : blockOuterSize(bcol);
+    StorageIndex csize = IsColMajor ? blockOuterSize(bcol) : blockInnerSize(brow);
+    StorageIndex inner = IsColMajor ? brow : bcol;
+    StorageIndex outer = IsColMajor ? bcol : brow;
+    StorageIndex offset = m_outerIndex[outer];
+    while (offset < m_outerIndex[outer + 1] && m_indices[offset] != inner) offset++;
+    if (m_indices[offset] == inner) {
+      return Map<const BlockScalar>(&(m_values[blockPtr(offset)]), rsize, csize);
+    } else
+      //        return BlockScalar::Zero(rsize, csize);
+      eigen_assert("NOT YET SUPPORTED");
+  }
+
+  // Block Matrix times vector product
+  template <typename VecType>
+  BlockSparseTimeDenseProduct<BlockSparseMatrix, VecType> operator*(const VecType& lhs) const {
+    return BlockSparseTimeDenseProduct<BlockSparseMatrix, VecType>(*this, lhs);
+  }
+
+  /** \returns the number of nonzero blocks */
+  inline Index nonZerosBlocks() const { return m_nonzerosblocks; }
+  /** \returns the total number of nonzero elements, including eventual explicit zeros in blocks */
+  inline Index nonZeros() const { return m_nonzeros; }
+
+  inline BlockScalarReturnType* valuePtr() { return static_cast<BlockScalarReturnType*>(m_values); }
+  //    inline Scalar *valuePtr(){ return m_values; }
+  inline StorageIndex* innerIndexPtr() { return m_indices; }
+  inline const StorageIndex* innerIndexPtr() const { return m_indices; }
+  inline StorageIndex* outerIndexPtr() { return m_outerIndex; }
+  inline const StorageIndex* outerIndexPtr() const { return m_outerIndex; }
+
+  /** \brief for compatibility purposes with the SparseMatrix class */
+  inline bool isCompressed() const { return true; }
+  /**
+   * \returns the starting index of the bi row block
+   */
+  inline Index blockRowsIndex(Index bi) const { return IsColMajor ? blockInnerIndex(bi) : blockOuterIndex(bi); }
+
+  /**
+   * \returns the starting index of the bj col block
+   */
+  inline Index blockColsIndex(Index bj) const { return IsColMajor ? blockOuterIndex(bj) : blockInnerIndex(bj); }
+
+  inline Index blockOuterIndex(Index bj) const {
+    return (m_blockSize == Dynamic) ? m_outerOffset[bj] : (bj * m_blockSize);
+  }
+  inline Index blockInnerIndex(Index bi) const {
+    return (m_blockSize == Dynamic) ? m_innerOffset[bi] : (bi * m_blockSize);
+  }
+
+  // Not needed ???
+  inline Index blockInnerSize(Index bi) const {
+    return (m_blockSize == Dynamic) ? (m_innerOffset[bi + 1] - m_innerOffset[bi]) : m_blockSize;
+  }
+  inline Index blockOuterSize(Index bj) const {
+    return (m_blockSize == Dynamic) ? (m_outerOffset[bj + 1] - m_outerOffset[bj]) : m_blockSize;
+  }
+
+  /**
+   * \brief Browse the matrix by outer index
+   */
+  class InnerIterator;  // Browse column by column
+
+  /**
+   * \brief Browse the matrix by block outer index
+   */
+  class BlockInnerIterator;  // Browse block by block
+
+  friend std::ostream& operator<<(std::ostream& s, const BlockSparseMatrix& m) {
+    for (StorageIndex j = 0; j < m.outerBlocks(); ++j) {
+      BlockInnerIterator itb(m, j);
+      for (; itb; ++itb) {
+        s << "(" << itb.row() << ", " << itb.col() << ")\n";
+        s << itb.value() << "\n";
+      }
+    }
+    s << std::endl;
+    return s;
+  }
+
+  /**
+   * \returns the starting position of the block \p id in the array of values
+   */
+  Index blockPtr(Index id) const {
+    if (m_blockSize == Dynamic)
+      return m_blockPtr[id];
+    else
+      return id * m_blockSize * m_blockSize;
+    // return blockDynIdx(id, std::conditional_t<(BlockSize==Dynamic), internal::true_type, internal::false_type>());
+  }
+
+ protected:
+  //    inline Index blockDynIdx(Index id, internal::true_type) const
+  //    {
+  //      return m_blockPtr[id];
+  //    }
+  //    inline Index blockDynIdx(Index id, internal::false_type) const
+  //    {
+  //      return id * BlockSize * BlockSize;
+  //    }
+
+  // To be implemented
+  // Insert a block at a particular location... need to make a room for that
+  Map<BlockScalar> insert(Index brow, Index bcol);
+
+  Index m_innerBSize;           // Number of block rows
+  Index m_outerBSize;           // Number of block columns
+  StorageIndex* m_innerOffset;  // Starting index of each inner block (size m_innerBSize+1)
+  StorageIndex* m_outerOffset;  // Starting index of each outer block (size m_outerBSize+1)
+  Index m_nonzerosblocks;       // Total nonzeros blocks (lower than  m_innerBSize x m_outerBSize)
+  Index m_nonzeros;             // Total nonzeros elements
+  Scalar* m_values;             // Values stored block column after block column (size m_nonzeros)
+  StorageIndex* m_blockPtr;     // Pointer to the beginning of each block in m_values, size m_nonzeroblocks ... null for
+                                // fixed-size blocks
+  StorageIndex* m_indices;      // Inner block indices, size m_nonzerosblocks ... OK
+  StorageIndex* m_outerIndex;   // Starting pointer of each block column in m_indices (size m_outerBSize)... OK
+  Index m_blockSize;            // Size of a block for fixed-size blocks, otherwise -1
+};
+
+template <typename Scalar_, int _BlockAtCompileTime, int Options_, typename StorageIndex_>
+class BlockSparseMatrix<Scalar_, _BlockAtCompileTime, Options_, StorageIndex_>::BlockInnerIterator {
+ public:
+  enum { Flags = Options_ };
+
+  BlockInnerIterator(const BlockSparseMatrix& mat, const Index outer)
+      : m_mat(mat), m_outer(outer), m_id(mat.m_outerIndex[outer]), m_end(mat.m_outerIndex[outer + 1]) {}
+
+  inline BlockInnerIterator& operator++() {
+    m_id++;
+    return *this;
+  }
+
+  inline const Map<const BlockScalar> value() const {
+    return Map<const BlockScalar>(&(m_mat.m_values[m_mat.blockPtr(m_id)]), rows(), cols());
+  }
+  inline Map<BlockScalar> valueRef() {
+    return Map<BlockScalar>(&(m_mat.m_values[m_mat.blockPtr(m_id)]), rows(), cols());
+  }
+  // Block inner index
+  inline Index index() const { return m_mat.m_indices[m_id]; }
+  inline Index outer() const { return m_outer; }
+  // block row index
+  inline Index row() const { return index(); }
+  // block column index
+  inline Index col() const { return outer(); }
+  // FIXME Number of rows in the current block
+  inline Index rows() const {
+    return (m_mat.m_blockSize == Dynamic) ? (m_mat.m_innerOffset[index() + 1] - m_mat.m_innerOffset[index()])
+                                          : m_mat.m_blockSize;
+  }
+  // Number of columns in the current block ...
+  inline Index cols() const {
+    return (m_mat.m_blockSize == Dynamic) ? (m_mat.m_outerOffset[m_outer + 1] - m_mat.m_outerOffset[m_outer])
+                                          : m_mat.m_blockSize;
+  }
+  inline operator bool() const { return (m_id < m_end); }
+
+ protected:
+  const BlockSparseMatrix<Scalar_, _BlockAtCompileTime, Options_, StorageIndex>& m_mat;
+  const Index m_outer;
+  Index m_id;
+  Index m_end;
+};
+
+template <typename Scalar_, int _BlockAtCompileTime, int Options_, typename StorageIndex_>
+class BlockSparseMatrix<Scalar_, _BlockAtCompileTime, Options_, StorageIndex_>::InnerIterator {
+ public:
+  InnerIterator(const BlockSparseMatrix& mat, Index outer)
+      : m_mat(mat),
+        m_outerB(mat.outerToBlock(outer)),
+        m_outer(outer),
+        itb(mat, mat.outerToBlock(outer)),
+        m_offset(outer - mat.blockOuterIndex(m_outerB)) {
+    if (itb) {
+      m_id = m_mat.blockInnerIndex(itb.index());
+      m_start = m_id;
+      m_end = m_mat.blockInnerIndex(itb.index() + 1);
+    }
+  }
+  inline InnerIterator& operator++() {
+    m_id++;
+    if (m_id >= m_end) {
+      ++itb;
+      if (itb) {
+        m_id = m_mat.blockInnerIndex(itb.index());
+        m_start = m_id;
+        m_end = m_mat.blockInnerIndex(itb.index() + 1);
+      }
+    }
+    return *this;
+  }
+  inline const Scalar& value() const { return itb.value().coeff(m_id - m_start, m_offset); }
+  inline Scalar& valueRef() { return itb.valueRef().coeff(m_id - m_start, m_offset); }
+  inline Index index() const { return m_id; }
+  inline Index outer() const { return m_outer; }
+  inline Index col() const { return outer(); }
+  inline Index row() const { return index(); }
+  inline operator bool() const { return itb; }
+
+ protected:
+  const BlockSparseMatrix& m_mat;
+  const Index m_outer;
+  const Index m_outerB;
+  BlockInnerIterator itb;  // Iterator through the blocks
+  const Index m_offset;    // Position of this column in the block
+  Index m_start;           // starting inner index of this block
+  Index m_id;              // current inner index in the block
+  Index m_end;             // starting inner index of the next block
+};
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPARSEBLOCKMATRIX_H
diff --git a/inst/include/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h b/inst/include/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
deleted file mode 100644
index dec16df2..00000000
--- a/inst/include/unsupported/Eigen/src/SparseExtra/DynamicSparseMatrix.h
+++ /dev/null
@@ -1,357 +0,0 @@
-// This file is part of Eigen, a lightweight C++ template library
-// for linear algebra.
-//
-// Copyright (C) 2008-2009 Gael Guennebaud <gael.guennebaud@inria.fr>
-//
-// This Source Code Form is subject to the terms of the Mozilla
-// Public License v. 2.0. If a copy of the MPL was not distributed
-// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-#ifndef EIGEN_DYNAMIC_SPARSEMATRIX_H
-#define EIGEN_DYNAMIC_SPARSEMATRIX_H
-
-namespace Eigen { 
-
-/** \deprecated use a SparseMatrix in an uncompressed mode
-  *
-  * \class DynamicSparseMatrix
-  *
-  * \brief A sparse matrix class designed for matrix assembly purpose
-  *
-  * \param _Scalar the scalar type, i.e. the type of the coefficients
-  *
-  * Unlike SparseMatrix, this class provides a much higher degree of flexibility. In particular, it allows
-  * random read/write accesses in log(rho*outer_size) where \c rho is the probability that a coefficient is
-  * nonzero and outer_size is the number of columns if the matrix is column-major and the number of rows
-  * otherwise.
-  *
-  * Internally, the data are stored as a std::vector of compressed vector. The performances of random writes might
-  * decrease as the number of nonzeros per inner-vector increase. In practice, we observed very good performance
-  * till about 100 nonzeros/vector, and the performance remains relatively good till 500 nonzeros/vectors.
-  *
-  * \see SparseMatrix
-  */
-
-namespace internal {
-template<typename _Scalar, int _Options, typename _Index>
-struct traits<DynamicSparseMatrix<_Scalar, _Options, _Index> >
-{
-  typedef _Scalar Scalar;
-  typedef _Index Index;
-  typedef Sparse StorageKind;
-  typedef MatrixXpr XprKind;
-  enum {
-    RowsAtCompileTime = Dynamic,
-    ColsAtCompileTime = Dynamic,
-    MaxRowsAtCompileTime = Dynamic,
-    MaxColsAtCompileTime = Dynamic,
-    Flags = _Options | NestByRefBit | LvalueBit,
-    CoeffReadCost = NumTraits<Scalar>::ReadCost,
-    SupportedAccessPatterns = OuterRandomAccessPattern
-  };
-};
-}
-
-template<typename _Scalar, int _Options, typename _Index>
- class  DynamicSparseMatrix
-  : public SparseMatrixBase<DynamicSparseMatrix<_Scalar, _Options, _Index> >
-{
-  public:
-    EIGEN_SPARSE_PUBLIC_INTERFACE(DynamicSparseMatrix)
-    // FIXME: why are these operator already alvailable ???
-    // EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(DynamicSparseMatrix, +=)
-    // EIGEN_SPARSE_INHERIT_ASSIGNMENT_OPERATOR(DynamicSparseMatrix, -=)
-    typedef MappedSparseMatrix<Scalar,Flags> Map;
-    using Base::IsRowMajor;
-    using Base::operator=;
-    enum {
-      Options = _Options
-    };
-
-  protected:
-
-    typedef DynamicSparseMatrix<Scalar,(Flags&~RowMajorBit)|(IsRowMajor?RowMajorBit:0)> TransposedSparseMatrix;
-
-    Index m_innerSize;
-    std::vector<internal::CompressedStorage<Scalar,Index> > m_data;
-
-  public:
-
-    inline Index rows() const { return IsRowMajor ? outerSize() : m_innerSize; }
-    inline Index cols() const { return IsRowMajor ? m_innerSize : outerSize(); }
-    inline Index innerSize() const { return m_innerSize; }
-    inline Index outerSize() const { return static_cast<Index>(m_data.size()); }
-    inline Index innerNonZeros(Index j) const { return m_data[j].size(); }
-
-    std::vector<internal::CompressedStorage<Scalar,Index> >& _data() { return m_data; }
-    const std::vector<internal::CompressedStorage<Scalar,Index> >& _data() const { return m_data; }
-
-    /** \returns the coefficient value at given position \a row, \a col
-      * This operation involes a log(rho*outer_size) binary search.
-      */
-    inline Scalar coeff(Index row, Index col) const
-    {
-      const Index outer = IsRowMajor ? row : col;
-      const Index inner = IsRowMajor ? col : row;
-      return m_data[outer].at(inner);
-    }
-
-    /** \returns a reference to the coefficient value at given position \a row, \a col
-      * This operation involes a log(rho*outer_size) binary search. If the coefficient does not
-      * exist yet, then a sorted insertion into a sequential buffer is performed.
-      */
-    inline Scalar& coeffRef(Index row, Index col)
-    {
-      const Index outer = IsRowMajor ? row : col;
-      const Index inner = IsRowMajor ? col : row;
-      return m_data[outer].atWithInsertion(inner);
-    }
-
-    class InnerIterator;
-    class ReverseInnerIterator;
-
-    void setZero()
-    {
-      for (Index j=0; j<outerSize(); ++j)
-        m_data[j].clear();
-    }
-
-    /** \returns the number of non zero coefficients */
-    Index nonZeros() const
-    {
-      Index res = 0;
-      for (Index j=0; j<outerSize(); ++j)
-        res += static_cast<Index>(m_data[j].size());
-      return res;
-    }
-
-
-
-    void reserve(Index reserveSize = 1000)
-    {
-      if (outerSize()>0)
-      {
-        Index reserveSizePerVector = (std::max)(reserveSize/outerSize(),Index(4));
-        for (Index j=0; j<outerSize(); ++j)
-        {
-          m_data[j].reserve(reserveSizePerVector);
-        }
-      }
-    }
-
-    /** Does nothing: provided for compatibility with SparseMatrix */
-    inline void startVec(Index /*outer*/) {}
-
-    /** \returns a reference to the non zero coefficient at position \a row, \a col assuming that:
-      * - the nonzero does not already exist
-      * - the new coefficient is the last one of the given inner vector.
-      *
-      * \sa insert, insertBackByOuterInner */
-    inline Scalar& insertBack(Index row, Index col)
-    {
-      return insertBackByOuterInner(IsRowMajor?row:col, IsRowMajor?col:row);
-    }
-
-    /** \sa insertBack */
-    inline Scalar& insertBackByOuterInner(Index outer, Index inner)
-    {
-      eigen_assert(outer<Index(m_data.size()) && inner<m_innerSize && "out of range");
-      eigen_assert(((m_data[outer].size()==0) || (m_data[outer].index(m_data[outer].size()-1)<inner))
-                && "wrong sorted insertion");
-      m_data[outer].append(0, inner);
-      return m_data[outer].value(m_data[outer].size()-1);
-    }
-
-    inline Scalar& insert(Index row, Index col)
-    {
-      const Index outer = IsRowMajor ? row : col;
-      const Index inner = IsRowMajor ? col : row;
-
-      Index startId = 0;
-      Index id = static_cast<Index>(m_data[outer].size()) - 1;
-      m_data[outer].resize(id+2,1);
-
-      while ( (id >= startId) && (m_data[outer].index(id) > inner) )
-      {
-        m_data[outer].index(id+1) = m_data[outer].index(id);
-        m_data[outer].value(id+1) = m_data[outer].value(id);
-        --id;
-      }
-      m_data[outer].index(id+1) = inner;
-      m_data[outer].value(id+1) = 0;
-      return m_data[outer].value(id+1);
-    }
-
-    /** Does nothing: provided for compatibility with SparseMatrix */
-    inline void finalize() {}
-
-    /** Suppress all nonzeros which are smaller than \a reference under the tolerence \a epsilon */
-    void prune(Scalar reference, RealScalar epsilon = NumTraits<RealScalar>::dummy_precision())
-    {
-      for (Index j=0; j<outerSize(); ++j)
-        m_data[j].prune(reference,epsilon);
-    }
-
-    /** Resize the matrix without preserving the data (the matrix is set to zero)
-      */
-    void resize(Index rows, Index cols)
-    {
-      const Index outerSize = IsRowMajor ? rows : cols;
-      m_innerSize = IsRowMajor ? cols : rows;
-      setZero();
-      if (Index(m_data.size()) != outerSize)
-      {
-        m_data.resize(outerSize);
-      }
-    }
-
-    void resizeAndKeepData(Index rows, Index cols)
-    {
-      const Index outerSize = IsRowMajor ? rows : cols;
-      const Index innerSize = IsRowMajor ? cols : rows;
-      if (m_innerSize>innerSize)
-      {
-        // remove all coefficients with innerCoord>=innerSize
-        // TODO
-        //std::cerr << "not implemented yet\n";
-        exit(2);
-      }
-      if (m_data.size() != outerSize)
-      {
-        m_data.resize(outerSize);
-      }
-    }
-
-    /** The class DynamicSparseMatrix is deprectaed */
-    EIGEN_DEPRECATED inline DynamicSparseMatrix()
-      : m_innerSize(0), m_data(0)
-    {
-      eigen_assert(innerSize()==0 && outerSize()==0);
-    }
-
-    /** The class DynamicSparseMatrix is deprectaed */
-    EIGEN_DEPRECATED inline DynamicSparseMatrix(Index rows, Index cols)
-      : m_innerSize(0)
-    {
-      resize(rows, cols);
-    }
-
-    /** The class DynamicSparseMatrix is deprectaed */
-    template<typename OtherDerived>
-    EIGEN_DEPRECATED explicit inline DynamicSparseMatrix(const SparseMatrixBase<OtherDerived>& other)
-      : m_innerSize(0)
-    {
-    Base::operator=(other.derived());
-    }
-
-    inline DynamicSparseMatrix(const DynamicSparseMatrix& other)
-      : Base(), m_innerSize(0)
-    {
-      *this = other.derived();
-    }
-
-    inline void swap(DynamicSparseMatrix& other)
-    {
-      //EIGEN_DBG_SPARSE(std::cout << "SparseMatrix:: swap\n");
-      std::swap(m_innerSize, other.m_innerSize);
-      //std::swap(m_outerSize, other.m_outerSize);
-      m_data.swap(other.m_data);
-    }
-
-    inline DynamicSparseMatrix& operator=(const DynamicSparseMatrix& other)
-    {
-      if (other.isRValue())
-      {
-        swap(other.const_cast_derived());
-      }
-      else
-      {
-        resize(other.rows(), other.cols());
-        m_data = other.m_data;
-      }
-      return *this;
-    }
-
-    /** Destructor */
-    inline ~DynamicSparseMatrix() {}
-
-  public:
-
-    /** \deprecated
-      * Set the matrix to zero and reserve the memory for \a reserveSize nonzero coefficients. */
-    EIGEN_DEPRECATED void startFill(Index reserveSize = 1000)
-    {
-      setZero();
-      reserve(reserveSize);
-    }
-
-    /** \deprecated use insert()
-      * inserts a nonzero coefficient at given coordinates \a row, \a col and returns its reference assuming that:
-      *  1 - the coefficient does not exist yet
-      *  2 - this the coefficient with greater inner coordinate for the given outer coordinate.
-      * In other words, assuming \c *this is column-major, then there must not exists any nonzero coefficient of coordinates
-      * \c i \c x \a col such that \c i >= \a row. Otherwise the matrix is invalid.
-      *
-      * \see fillrand(), coeffRef()
-      */
-    EIGEN_DEPRECATED Scalar& fill(Index row, Index col)
-    {
-      const Index outer = IsRowMajor ? row : col;
-      const Index inner = IsRowMajor ? col : row;
-      return insertBack(outer,inner);
-    }
-
-    /** \deprecated use insert()
-      * Like fill() but with random inner coordinates.
-      * Compared to the generic coeffRef(), the unique limitation is that we assume
-      * the coefficient does not exist yet.
-      */
-    EIGEN_DEPRECATED Scalar& fillrand(Index row, Index col)
-    {
-      return insert(row,col);
-    }
-
-    /** \deprecated use finalize()
-      * Does nothing. Provided for compatibility with SparseMatrix. */
-    EIGEN_DEPRECATED void endFill() {}
-    
-#   ifdef EIGEN_DYNAMICSPARSEMATRIX_PLUGIN
-#     include EIGEN_DYNAMICSPARSEMATRIX_PLUGIN
-#   endif
- };
-
-template<typename Scalar, int _Options, typename _Index>
-class DynamicSparseMatrix<Scalar,_Options,_Index>::InnerIterator : public SparseVector<Scalar,_Options,_Index>::InnerIterator
-{
-    typedef typename SparseVector<Scalar,_Options,_Index>::InnerIterator Base;
-  public:
-    InnerIterator(const DynamicSparseMatrix& mat, Index outer)
-      : Base(mat.m_data[outer]), m_outer(outer)
-    {}
-
-    inline Index row() const { return IsRowMajor ? m_outer : Base::index(); }
-    inline Index col() const { return IsRowMajor ? Base::index() : m_outer; }
-
-  protected:
-    const Index m_outer;
-};
-
-template<typename Scalar, int _Options, typename _Index>
-class DynamicSparseMatrix<Scalar,_Options,_Index>::ReverseInnerIterator : public SparseVector<Scalar,_Options,_Index>::ReverseInnerIterator
-{
-    typedef typename SparseVector<Scalar,_Options,_Index>::ReverseInnerIterator Base;
-  public:
-    ReverseInnerIterator(const DynamicSparseMatrix& mat, Index outer)
-      : Base(mat.m_data[outer]), m_outer(outer)
-    {}
-
-    inline Index row() const { return IsRowMajor ? m_outer : Base::index(); }
-    inline Index col() const { return IsRowMajor ? Base::index() : m_outer; }
-
-  protected:
-    const Index m_outer;
-};
-
-} // end namespace Eigen
-
-#endif // EIGEN_DYNAMIC_SPARSEMATRIX_H
diff --git a/inst/include/unsupported/Eigen/src/SparseExtra/InternalHeaderCheck.h b/inst/include/unsupported/Eigen/src/SparseExtra/InternalHeaderCheck.h
new file mode 100644
index 00000000..0e55251b
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SparseExtra/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_SPARSE_EXTRA_MODULE_H
+#error "Please include unsupported/Eigen/SparseExtra instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/unsupported/Eigen/src/SparseExtra/MarketIO.h b/inst/include/unsupported/Eigen/src/SparseExtra/MarketIO.h
index 7aafce92..f92622dc 100644
--- a/inst/include/unsupported/Eigen/src/SparseExtra/MarketIO.h
+++ b/inst/include/unsupported/Eigen/src/SparseExtra/MarketIO.h
@@ -12,262 +12,355 @@
 #define EIGEN_SPARSE_MARKET_IO_H
 
 #include <iostream>
+#include <vector>
 
-namespace Eigen { 
-
-namespace internal 
-{
-  template <typename Scalar>
-  inline bool GetMarketLine (std::stringstream& line, int& M, int& N, int& i, int& j, Scalar& value)
-  {
-    line >> i >> j >> value;
-    i--;
-    j--;
-    if(i>=0 && j>=0 && i<M && j<N)
-    {
-      return true; 
-    }
-    else
-      return false;
-  }
-  template <typename Scalar>
-  inline bool GetMarketLine (std::stringstream& line, int& M, int& N, int& i, int& j, std::complex<Scalar>& value)
-  {
-    Scalar valR, valI;
-    line >> i >> j >> valR >> valI;
-    i--;
-    j--;
-    if(i>=0 && j>=0 && i<M && j<N)
-    {
-      value = std::complex<Scalar>(valR, valI);
-      return true; 
-    }
-    else
-      return false;
-  }
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-  template <typename RealScalar>
-  inline void  GetVectorElt (const std::string& line, RealScalar& val)
-  {
-    std::istringstream newline(line);
-    newline >> val;  
-  }
+namespace Eigen {
 
-  template <typename RealScalar>
-  inline void GetVectorElt (const std::string& line, std::complex<RealScalar>& val)
-  {
-    RealScalar valR, valI; 
-    std::istringstream newline(line);
-    newline >> valR >> valI; 
-    val = std::complex<RealScalar>(valR, valI);
-  }
-  
-  template<typename Scalar>
-  inline void putMarketHeader(std::string& header,int sym)
-  {
-    header= "%%MatrixMarket matrix coordinate ";
-    if(internal::is_same<Scalar, std::complex<float> >::value || internal::is_same<Scalar, std::complex<double> >::value)
-    {
-      header += " complex"; 
-      if(sym == Symmetric) header += " symmetric";
-      else if (sym == SelfAdjoint) header += " Hermitian";
-      else header += " general";
-    }
+namespace internal {
+template <typename Scalar, typename StorageIndex>
+inline void GetMarketLine(const char* line, StorageIndex& i, StorageIndex& j, Scalar& value) {
+  std::stringstream sline(line);
+  sline >> i >> j >> value;
+}
+
+template <>
+inline void GetMarketLine(const char* line, int& i, int& j, float& value) {
+  std::sscanf(line, "%d %d %g", &i, &j, &value);
+}
+
+template <>
+inline void GetMarketLine(const char* line, int& i, int& j, double& value) {
+  std::sscanf(line, "%d %d %lg", &i, &j, &value);
+}
+
+template <>
+inline void GetMarketLine(const char* line, int& i, int& j, std::complex<float>& value) {
+  std::sscanf(line, "%d %d %g %g", &i, &j, &numext::real_ref(value), &numext::imag_ref(value));
+}
+
+template <>
+inline void GetMarketLine(const char* line, int& i, int& j, std::complex<double>& value) {
+  std::sscanf(line, "%d %d %lg %lg", &i, &j, &numext::real_ref(value), &numext::imag_ref(value));
+}
+
+template <typename Scalar, typename StorageIndex>
+inline void GetMarketLine(const char* line, StorageIndex& i, StorageIndex& j, std::complex<Scalar>& value) {
+  std::stringstream sline(line);
+  Scalar valR, valI;
+  sline >> i >> j >> valR >> valI;
+  value = std::complex<Scalar>(valR, valI);
+}
+
+template <typename RealScalar>
+inline void GetDenseElt(const std::string& line, RealScalar& val) {
+  std::istringstream newline(line);
+  newline >> val;
+}
+
+template <typename RealScalar>
+inline void GetDenseElt(const std::string& line, std::complex<RealScalar>& val) {
+  RealScalar valR, valI;
+  std::istringstream newline(line);
+  newline >> valR >> valI;
+  val = std::complex<RealScalar>(valR, valI);
+}
+
+template <typename Scalar>
+inline void putMarketHeader(std::string& header, int sym) {
+  header = "%%MatrixMarket matrix coordinate ";
+  if (internal::is_same<Scalar, std::complex<float> >::value ||
+      internal::is_same<Scalar, std::complex<double> >::value) {
+    header += " complex";
+    if (sym == Symmetric)
+      header += " symmetric";
+    else if (sym == SelfAdjoint)
+      header += " Hermitian";
     else
-    {
-      header += " real"; 
-      if(sym == Symmetric) header += " symmetric";
-      else header += " general";
-    }
+      header += " general";
+  } else {
+    header += " real";
+    if (sym == Symmetric)
+      header += " symmetric";
+    else
+      header += " general";
   }
+}
 
-  template<typename Scalar>
-  inline void PutMatrixElt(Scalar value, int row, int col, std::ofstream& out)
-  {
-    out << row << " "<< col << " " << value << "\n";
-  }
-  template<typename Scalar>
-  inline void PutMatrixElt(std::complex<Scalar> value, int row, int col, std::ofstream& out)
-  {
-    out << row << " " << col << " " << value.real() << " " << value.imag() << "\n";
-  }
+template <typename Scalar, typename StorageIndex>
+inline void PutMatrixElt(Scalar value, StorageIndex row, StorageIndex col, std::ofstream& out) {
+  out << row << " " << col << " " << value << "\n";
+}
+template <typename Scalar, typename StorageIndex>
+inline void PutMatrixElt(std::complex<Scalar> value, StorageIndex row, StorageIndex col, std::ofstream& out) {
+  out << row << " " << col << " " << value.real() << " " << value.imag() << "\n";
+}
 
+template <typename Scalar>
+inline void putDenseElt(Scalar value, std::ofstream& out) {
+  out << value << "\n";
+}
+template <typename Scalar>
+inline void putDenseElt(std::complex<Scalar> value, std::ofstream& out) {
+  out << value.real() << " " << value.imag() << "\n";
+}
 
-  template<typename Scalar>
-  inline void putVectorElt(Scalar value, std::ofstream& out)
-  {
-    out << value << "\n"; 
-  }
-  template<typename Scalar>
-  inline void putVectorElt(std::complex<Scalar> value, std::ofstream& out)
-  {
-    out << value.real << " " << value.imag()<< "\n"; 
-  }
+}  // end namespace internal
 
-} // end namepsace internal
+/**
+ * \ingroup SparseExtra_Module
+ * @brief Reads the header of a matrixmarket file and determines the properties of a matrix
+ *
+ * @param filename of the file
+ * @param sym if the matrix is hermitian,symmetric or none of the latter (sym=0)
+ * @param iscomplex if the matrix has complex or real coefficients
+ * @param isdense if the matrix is dense or sparse
+ * @return true if the file was found
+ */
+inline bool getMarketHeader(const std::string& filename, int& sym, bool& iscomplex, bool& isdense) {
+  sym = 0;
+  iscomplex = false;
+  isdense = false;
+  std::ifstream in(filename.c_str(), std::ios::in);
+  if (!in) return false;
 
-inline bool getMarketHeader(const std::string& filename, int& sym, bool& iscomplex, bool& isvector)
-{
-  sym = 0; 
-  isvector = false;
-  std::ifstream in(filename.c_str(),std::ios::in);
-  if(!in)
-    return false;
-  
-  std::string line; 
-  // The matrix header is always the first line in the file 
-  std::getline(in, line); eigen_assert(in.good());
-  
-  std::stringstream fmtline(line); 
+  std::string line;
+  // The matrix header is always the first line in the file
+  std::getline(in, line);
+  eigen_assert(in.good());
+
+  std::stringstream fmtline(line);
   std::string substr[5];
-  fmtline>> substr[0] >> substr[1] >> substr[2] >> substr[3] >> substr[4];
-  if(substr[2].compare("array") == 0) isvector = true;
-  if(substr[3].compare("complex") == 0) iscomplex = true;
-  if(substr[4].compare("symmetric") == 0) sym = Symmetric;
-  else if (substr[4].compare("Hermitian") == 0) sym = SelfAdjoint;
-  
+  fmtline >> substr[0] >> substr[1] >> substr[2] >> substr[3] >> substr[4];
+  if (substr[2].compare("array") == 0) isdense = true;
+  if (substr[3].compare("complex") == 0) iscomplex = true;
+  if (substr[4].compare("symmetric") == 0)
+    sym = Symmetric;
+  else if (substr[4].compare("Hermitian") == 0)
+    sym = SelfAdjoint;
+
   return true;
 }
-  
-template<typename SparseMatrixType>
-bool loadMarket(SparseMatrixType& mat, const std::string& filename)
-{
+/**
+ * \ingroup SparseExtra_Module
+ * @brief Loads a sparse matrix from a matrixmarket format file.
+ *
+ * @tparam SparseMatrixType to read into, symmetries are not supported
+ * @param mat SparseMatrix to read into, current values are overwritten
+ * @param filename to parse matrix from
+ * @return returns true if file exists. Returns false if the parsing did not succeed.
+ */
+template <typename SparseMatrixType>
+bool loadMarket(SparseMatrixType& mat, const std::string& filename) {
   typedef typename SparseMatrixType::Scalar Scalar;
-  std::ifstream input(filename.c_str(),std::ios::in);
-  if(!input)
-    return false;
-  
+  typedef typename SparseMatrixType::StorageIndex StorageIndex;
+  std::ifstream input(filename.c_str(), std::ios::in);
+  if (!input) return false;
+
+  char rdbuffer[4096];
+  input.rdbuf()->pubsetbuf(rdbuffer, 4096);
+
   const int maxBuffersize = 2048;
   char buffer[maxBuffersize];
-  
+
   bool readsizes = false;
 
-  typedef Triplet<Scalar,int> T;
+  typedef Triplet<Scalar, StorageIndex> T;
   std::vector<T> elements;
-  
-  int M(-1), N(-1), NNZ(-1);
-  int count = 0;
-  while(input.getline(buffer, maxBuffersize))
-  {
-    // skip comments   
-    //NOTE An appropriate test should be done on the header to get the  symmetry
-    if(buffer[0]=='%')
-      continue;
-    
-    std::stringstream line(buffer);
-    
-    if(!readsizes)
-    {
+
+  Index M(-1), N(-1), NNZ(-1);
+  Index count = 0;
+  while (input.getline(buffer, maxBuffersize)) {
+    // skip comments
+    // NOTE An appropriate test should be done on the header to get the  symmetry
+    if (buffer[0] == '%') continue;
+
+    if (!readsizes) {
+      std::stringstream line(buffer);
       line >> M >> N >> NNZ;
-      if(M > 0 && N > 0 && NNZ > 0) 
-      {
+      if (M > 0 && N > 0) {
         readsizes = true;
-        std::cout << "sizes: " << M << "," << N << "," << NNZ << "\n";
-        mat.resize(M,N);
+        mat.resize(M, N);
         mat.reserve(NNZ);
+        elements.reserve(NNZ);
       }
-    }
-    else
-    { 
-      int i(-1), j(-1);
-      Scalar value; 
-      if( internal::GetMarketLine(line, M, N, i, j, value) ) 
-      {
-        ++ count;
-        elements.push_back(T(i,j,value));
+    } else {
+      StorageIndex i(-1), j(-1);
+      Scalar value;
+      internal::GetMarketLine(buffer, i, j, value);
+
+      i--;
+      j--;
+      if (i >= 0 && j >= 0 && i < M && j < N) {
+        ++count;
+        elements.push_back(T(i, j, value));
+      } else {
+        std::cerr << "Invalid read: " << i << "," << j << "\n";
+        return false;
       }
-      else 
-        std::cerr << "Invalid read: " << i << "," << j << "\n";        
     }
   }
+
   mat.setFromTriplets(elements.begin(), elements.end());
-  if(count!=NNZ)
+  if (count != NNZ) {
     std::cerr << count << "!=" << NNZ << "\n";
-  
+    return false;
+  }
   input.close();
   return true;
 }
 
-template<typename VectorType>
-bool loadMarketVector(VectorType& vec, const std::string& filename)
-{
-   typedef typename VectorType::Scalar Scalar;
+/**
+ * \ingroup SparseExtra_Module
+ * @brief Loads a dense Matrix or Vector from a matrixmarket file. If a statically sized matrix has to be parsed and the
+ * file contains the wrong dimensions it is undefined behaviour.
+ *
+ * @tparam DenseMatrixType to read into
+ * @param mat DenseMatrix to read into, current values are overwritten, symmetries are not supported
+ * @param filename to parse matrix from
+ * @return true if parsing was successful. Returns false if the parsing did not succeed.
+ */
+template <typename DenseType>
+bool loadMarketDense(DenseType& mat, const std::string& filename) {
+  typedef typename DenseType::Scalar Scalar;
   std::ifstream in(filename.c_str(), std::ios::in);
-  if(!in)
-    return false;
-  
-  std::string line; 
-  int n(0), col(0); 
-  do 
-  { // Skip comments
-    std::getline(in, line); eigen_assert(in.good());
+  if (!in) return false;
+
+  std::string line;
+  Index rows(0), cols(0);
+  do {  // Skip comments
+    std::getline(in, line);
+    eigen_assert(in.good());
   } while (line[0] == '%');
   std::istringstream newline(line);
-  newline  >> n >> col; 
-  eigen_assert(n>0 && col>0);
-  vec.resize(n);
-  int i = 0; 
-  Scalar value; 
-  while ( std::getline(in, line) && (i < n) ){
-    internal::GetVectorElt(line, value); 
-    vec(i++) = value; 
+  newline >> rows >> cols;
+
+  bool sizes_not_positive = (rows < 1 || cols < 1);
+  bool wrong_input_rows = (DenseType::MaxRowsAtCompileTime != Dynamic && rows > DenseType::MaxRowsAtCompileTime) ||
+                          (DenseType::RowsAtCompileTime != Dynamic && rows != DenseType::RowsAtCompileTime);
+  bool wrong_input_cols = (DenseType::MaxColsAtCompileTime != Dynamic && cols > DenseType::MaxColsAtCompileTime) ||
+                          (DenseType::ColsAtCompileTime != Dynamic && cols != DenseType::ColsAtCompileTime);
+
+  if (sizes_not_positive || wrong_input_rows || wrong_input_cols) {
+    if (sizes_not_positive) {
+      std::cerr << "non-positive row or column size in file" << filename << "\n";
+    } else {
+      std::cerr << "Input matrix can not be resized to" << rows << " x " << cols << "as given in " << filename << "\n";
+    }
+    in.close();
+    return false;
+  }
+
+  mat.resize(rows, cols);
+  Index row = 0;
+  Index col = 0;
+  Index n = 0;
+  Scalar value;
+  while (std::getline(in, line) && (row < rows) && (col < cols)) {
+    internal::GetDenseElt(line, value);
+    // matrixmarket format is column major
+    mat(row, col) = value;
+    row++;
+    if (row == rows) {
+      row = 0;
+      col++;
+    }
+    n++;
   }
   in.close();
-  if (i!=n){
-    std::cerr<< "Unable to read all elements from file " << filename << "\n";
+  if (n != mat.size()) {
+    std::cerr << "Unable to read all elements from file " << filename << "\n";
     return false;
   }
   return true;
 }
+/**
+ * \ingroup SparseExtra_Module
+ * @brief Same functionality as loadMarketDense, deprecated
+ */
+template <typename VectorType>
+bool loadMarketVector(VectorType& vec, const std::string& filename) {
+  return loadMarketDense(vec, filename);
+}
 
-template<typename SparseMatrixType>
-bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sym = 0)
-{
+/**
+ * \ingroup SparseExtra_Module
+ * @brief writes a sparse Matrix to a marketmarket format file
+ *
+ * @tparam SparseMatrixType to write to file
+ * @param mat matrix to write to file
+ * @param filename filename to write to
+ * @param sym at the moment no symmetry operations are supported
+ * @return true if writing succeeded
+ */
+template <typename SparseMatrixType>
+bool saveMarket(const SparseMatrixType& mat, const std::string& filename, int sym = 0) {
   typedef typename SparseMatrixType::Scalar Scalar;
-  std::ofstream out(filename.c_str(),std::ios::out);
-  if(!out)
-    return false;
-  
+  typedef typename SparseMatrixType::RealScalar RealScalar;
+  std::ofstream out(filename.c_str(), std::ios::out);
+  if (!out) return false;
+
   out.flags(std::ios_base::scientific);
-  out.precision(64);
-  std::string header; 
-  internal::putMarketHeader<Scalar>(header, sym); 
-  out << header << std::endl; 
+  out.precision(std::numeric_limits<RealScalar>::digits10 + 2);
+  std::string header;
+  internal::putMarketHeader<Scalar>(header, sym);
+  out << header << std::endl;
   out << mat.rows() << " " << mat.cols() << " " << mat.nonZeros() << "\n";
   int count = 0;
-  for(int j=0; j<mat.outerSize(); ++j)
-    for(typename SparseMatrixType::InnerIterator it(mat,j); it; ++it)
-    {
-	++ count;
-	internal::PutMatrixElt(it.value(), it.row()+1, it.col()+1, out);
-	// out << it.row()+1 << " " << it.col()+1 << " " << it.value() << "\n";
+  EIGEN_UNUSED_VARIABLE(count);
+  for (int j = 0; j < mat.outerSize(); ++j)
+    for (typename SparseMatrixType::InnerIterator it(mat, j); it; ++it) {
+      ++count;
+      internal::PutMatrixElt(it.value(), it.row() + 1, it.col() + 1, out);
     }
   out.close();
   return true;
 }
 
-template<typename VectorType>
-bool saveMarketVector (const VectorType& vec, const std::string& filename)
-{
- typedef typename VectorType::Scalar Scalar; 
- std::ofstream out(filename.c_str(),std::ios::out);
-  if(!out)
-    return false;
-  
+/**
+ * \ingroup SparseExtra_Module
+ * @brief writes a dense Matrix or vector to a marketmarket format file
+ *
+ * @tparam DenseMatrixType to write to file
+ * @param mat matrix to write to file
+ * @param filename filename to write to
+ * @return true if writing succeeded
+ */
+
+template <typename DenseType>
+bool saveMarketDense(const DenseType& mat, const std::string& filename) {
+  typedef typename DenseType::Scalar Scalar;
+  typedef typename DenseType::RealScalar RealScalar;
+  std::ofstream out(filename.c_str(), std::ios::out);
+  if (!out) return false;
+
   out.flags(std::ios_base::scientific);
-  out.precision(64);
-  if(internal::is_same<Scalar, std::complex<float> >::value || internal::is_same<Scalar, std::complex<double> >::value)
-      out << "%%MatrixMarket matrix array complex general\n"; 
+  out.precision(std::numeric_limits<RealScalar>::digits10 + 2);
+  if (internal::is_same<Scalar, std::complex<float> >::value || internal::is_same<Scalar, std::complex<double> >::value)
+    out << "%%MatrixMarket matrix array complex general\n";
   else
-    out << "%%MatrixMarket matrix array real general\n"; 
-  out << vec.size() << " "<< 1 << "\n";
-  for (int i=0; i < vec.size(); i++){
-    internal::putVectorElt(vec(i), out); 
+    out << "%%MatrixMarket matrix array real general\n";
+  out << mat.rows() << " " << mat.cols() << "\n";
+  for (Index i = 0; i < mat.cols(); i++) {
+    for (Index j = 0; j < mat.rows(); j++) {
+      internal::putDenseElt(mat(j, i), out);
+    }
   }
   out.close();
-  return true; 
+  return true;
+}
+
+/**
+ * \ingroup SparseExtra_Module
+ * @brief Same functionality as saveMarketDense, deprecated
+ */
+template <typename VectorType>
+bool saveMarketVector(const VectorType& vec, const std::string& filename) {
+  return saveMarketDense(vec, filename);
 }
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_SPARSE_MARKET_IO_H
+#endif  // EIGEN_SPARSE_MARKET_IO_H
diff --git a/inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h b/inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
index 510130c0..21b62f60 100644
--- a/inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
+++ b/inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
@@ -11,224 +11,212 @@
 #ifndef EIGEN_BROWSE_MATRICES_H
 #define EIGEN_BROWSE_MATRICES_H
 
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
 namespace Eigen {
 
-enum {
-  SPD = 0x100,
-  NonSymmetric = 0x0
-}; 
+enum { SPD = 0x100, NonSymmetric = 0x0 };
 
-/** 
+/**
  * @brief Iterator to browse matrices from a specified folder
- * 
- * This is used to load all the matrices from a folder. 
+ *
+ * This is used to load all the matrices from a folder.
  * The matrices should be in Matrix Market format
  * It is assumed that the matrices are named as matname.mtx
  * and matname_SPD.mtx if the matrix is Symmetric and positive definite (or Hermitian)
  * The right hand side vectors are loaded as well, if they exist.
- * They should be named as matname_b.mtx. 
+ * They should be named as matname_b.mtx.
  * Note that the right hand side for a SPD matrix is named as matname_SPD_b.mtx
- * 
+ *
  * Sometimes a reference solution is available. In this case, it should be named as matname_x.mtx
- * 
+ *
  * Sample code
  * \code
- * 
+ *
  * \endcode
- * 
- * \tparam Scalar The scalar type 
+ *
+ * \tparam Scalar The scalar type
  */
 template <typename Scalar>
-class MatrixMarketIterator 
-{
-  public:
-    typedef Matrix<Scalar,Dynamic,1> VectorType; 
-    typedef SparseMatrix<Scalar,ColMajor> MatrixType; 
-  
-  public:
-    MatrixMarketIterator(const std::string folder):m_sym(0),m_isvalid(false),m_matIsLoaded(false),m_hasRhs(false),m_hasrefX(false),m_folder(folder)
-    {
-      m_folder_id = opendir(folder.c_str());
-      if (!m_folder_id){
-        m_isvalid = false;
-        std::cerr << "The provided Matrix folder could not be opened \n\n";
-        abort();
-      }
-      Getnextvalidmatrix();
-    }
-    
-    ~MatrixMarketIterator()
-    {
-      if (m_folder_id) closedir(m_folder_id); 
-    }
-    
-    inline MatrixMarketIterator& operator++()
-    {
+class MatrixMarketIterator {
+  typedef typename NumTraits<Scalar>::Real RealScalar;
+
+ public:
+  typedef Matrix<Scalar, Dynamic, 1> VectorType;
+  typedef SparseMatrix<Scalar, ColMajor> MatrixType;
+
+ public:
+  MatrixMarketIterator(const std::string& folder)
+      : m_sym(0), m_isvalid(false), m_matIsLoaded(false), m_hasRhs(false), m_hasrefX(false), m_folder(folder) {
+    m_folder_id = opendir(folder.c_str());
+    if (m_folder_id) Getnextvalidmatrix();
+  }
+
+  ~MatrixMarketIterator() {
+    if (m_folder_id) closedir(m_folder_id);
+  }
+
+  inline MatrixMarketIterator& operator++() {
+    m_matIsLoaded = false;
+    m_hasrefX = false;
+    m_hasRhs = false;
+    Getnextvalidmatrix();
+    return *this;
+  }
+  inline operator bool() const { return m_isvalid; }
+
+  /** Return the sparse matrix corresponding to the current file */
+  inline MatrixType& matrix() {
+    // Read the matrix
+    if (m_matIsLoaded) return m_mat;
+
+    std::string matrix_file = m_folder + "/" + m_matname + ".mtx";
+    if (!loadMarket(m_mat, matrix_file)) {
+      std::cerr << "Warning loadMarket failed when loading \"" << matrix_file << "\"" << std::endl;
       m_matIsLoaded = false;
-      m_hasrefX = false;
-      m_hasRhs = false;
-      Getnextvalidmatrix();
-      return *this;
+      return m_mat;
     }
-    inline operator bool() const { return m_isvalid;}
-    
-    /** Return the sparse matrix corresponding to the current file */
-    inline MatrixType& matrix() 
-    { 
-      // Read the matrix
-      if (m_matIsLoaded) return m_mat;
-      
-      std::string matrix_file = m_folder + "/" + m_matname + ".mtx";
-      if ( !loadMarket(m_mat, matrix_file)) 
-      {
-        m_matIsLoaded = false;
-        return m_mat;
-      }
-      m_matIsLoaded = true; 
-      
-      if (m_sym != NonSymmetric) 
-      { // Store the upper part of the matrix. It is needed by the solvers dealing with nonsymmetric matrices ??
-        MatrixType B; 
-        B = m_mat;
-        m_mat = B.template selfadjointView<Lower>();
+    m_matIsLoaded = true;
+
+    if (m_sym != NonSymmetric) {
+      // Check whether we need to restore a full matrix:
+      RealScalar diag_norm = m_mat.diagonal().norm();
+      RealScalar lower_norm = m_mat.template triangularView<Lower>().norm();
+      RealScalar upper_norm = m_mat.template triangularView<Upper>().norm();
+      if (lower_norm > diag_norm && upper_norm == diag_norm) {
+        // only the lower part is stored
+        MatrixType tmp(m_mat);
+        m_mat = tmp.template selfadjointView<Lower>();
+      } else if (upper_norm > diag_norm && lower_norm == diag_norm) {
+        // only the upper part is stored
+        MatrixType tmp(m_mat);
+        m_mat = tmp.template selfadjointView<Upper>();
       }
-      return m_mat; 
     }
-    
-    /** Return the right hand side corresponding to the current matrix. 
-     * If the rhs file is not provided, a random rhs is generated
-     */
-    inline VectorType& rhs() 
-    { 
-       // Get the right hand side
-      if (m_hasRhs) return m_rhs;
-      
-      std::string rhs_file;
-      rhs_file = m_folder + "/" + m_matname + "_b.mtx"; // The pattern is matname_b.mtx
-      m_hasRhs = Fileexists(rhs_file);
-      if (m_hasRhs)
-      {
-        m_rhs.resize(m_mat.cols());
-        m_hasRhs = loadMarketVector(m_rhs, rhs_file);
-      }
-      if (!m_hasRhs)
-      {
-        // Generate a random right hand side
-        if (!m_matIsLoaded) this->matrix(); 
-        m_refX.resize(m_mat.cols());
-        m_refX.setRandom();
-        m_rhs = m_mat * m_refX;
-        m_hasrefX = true;
-        m_hasRhs = true;
-      }
-      return m_rhs; 
+    return m_mat;
+  }
+
+  /** Return the right hand side corresponding to the current matrix.
+   * If the rhs file is not provided, a random rhs is generated
+   */
+  inline VectorType& rhs() {
+    // Get the right hand side
+    if (m_hasRhs) return m_rhs;
+
+    std::string rhs_file;
+    rhs_file = m_folder + "/" + m_matname + "_b.mtx";  // The pattern is matname_b.mtx
+    m_hasRhs = Fileexists(rhs_file);
+    if (m_hasRhs) {
+      m_rhs.resize(m_mat.cols());
+      m_hasRhs = loadMarketVector(m_rhs, rhs_file);
     }
-    
-    /** Return a reference solution
-     * If it is not provided and if the right hand side is not available
-     * then refX is randomly generated such that A*refX = b 
-     * where A and b are the matrix and the rhs. 
-     * Note that when a rhs is provided, refX is not available 
-     */
-    inline VectorType& refX() 
-    { 
-      // Check if a reference solution is provided
-      if (m_hasrefX) return m_refX;
-      
-      std::string lhs_file;
-      lhs_file = m_folder + "/" + m_matname + "_x.mtx"; 
-      m_hasrefX = Fileexists(lhs_file);
-      if (m_hasrefX)
-      {
-        m_refX.resize(m_mat.cols());
-        m_hasrefX = loadMarketVector(m_refX, lhs_file);
-      }
-      return m_refX; 
+    if (!m_hasRhs) {
+      // Generate a random right hand side
+      if (!m_matIsLoaded) this->matrix();
+      m_refX.resize(m_mat.cols());
+      m_refX.setRandom();
+      m_rhs = m_mat * m_refX;
+      m_hasrefX = true;
+      m_hasRhs = true;
     }
-    
-    inline std::string& matname() { return m_matname; }
-    
-    inline int sym() { return m_sym; }
-    
-    inline bool hasRhs() {return m_hasRhs; }
-    inline bool hasrefX() {return m_hasrefX; }
-    
-  protected:
-    
-    inline bool Fileexists(std::string file)
-    {
-      std::ifstream file_id(file.c_str());
-      if (!file_id.good() ) 
-      {
-        return false;
-      }
-      else 
-      {
-        file_id.close();
-        return true;
-      }
+    return m_rhs;
+  }
+
+  /** Return a reference solution
+   * If it is not provided and if the right hand side is not available
+   * then refX is randomly generated such that A*refX = b
+   * where A and b are the matrix and the rhs.
+   * Note that when a rhs is provided, refX is not available
+   */
+  inline VectorType& refX() {
+    // Check if a reference solution is provided
+    if (m_hasrefX) return m_refX;
+
+    std::string lhs_file;
+    lhs_file = m_folder + "/" + m_matname + "_x.mtx";
+    m_hasrefX = Fileexists(lhs_file);
+    if (m_hasrefX) {
+      m_refX.resize(m_mat.cols());
+      m_hasrefX = loadMarketVector(m_refX, lhs_file);
+    } else
+      m_refX.resize(0);
+    return m_refX;
+  }
+
+  inline std::string& matname() { return m_matname; }
+
+  inline int sym() { return m_sym; }
+
+  bool hasRhs() { return m_hasRhs; }
+  bool hasrefX() { return m_hasrefX; }
+  bool isFolderValid() { return bool(m_folder_id); }
+
+ protected:
+  inline bool Fileexists(std::string file) {
+    std::ifstream file_id(file.c_str());
+    if (!file_id.good()) {
+      return false;
+    } else {
+      file_id.close();
+      return true;
     }
-    
-    void Getnextvalidmatrix( )
-    {
+  }
+
+  void Getnextvalidmatrix() {
+    m_isvalid = false;
+    // Here, we return with the next valid matrix in the folder
+    while ((m_curs_id = readdir(m_folder_id)) != NULL) {
       m_isvalid = false;
-      // Here, we return with the next valid matrix in the folder
-      while ( (m_curs_id = readdir(m_folder_id)) != NULL) {
-        m_isvalid = false;
-        std::string curfile;
-        curfile = m_folder + "/" + m_curs_id->d_name;
-        // Discard if it is a folder
-#if !(defined(__sun) || defined(_AIX) || defined(__hpux) || defined(__sgi))
-        if (m_curs_id->d_type == DT_DIR) continue; //FIXME This may not be available on non BSD systems
+      std::string curfile;
+      curfile = m_folder + "/" + m_curs_id->d_name;
+      // Discard if it is a folder
+#if !(defined(__sun) || defined(_AIX) || defined(__hpux) || defined(__sgi) || defined(__HAIKU__))
+      if (m_curs_id->d_type == DT_DIR) continue;  // FIXME This may not be available on non BSD systems
 #endif
-//         struct stat st_buf; 
-//         stat (curfile.c_str(), &st_buf);
-//         if (S_ISDIR(st_buf.st_mode)) continue;
-        
-        // Determine from the header if it is a matrix or a right hand side 
-        bool isvector,iscomplex=false;
-        if(!getMarketHeader(curfile,m_sym,iscomplex,isvector)) continue;
-        if(isvector) continue;
-        if (!iscomplex)
-        {
-          if(internal::is_same<Scalar, std::complex<float> >::value || internal::is_same<Scalar, std::complex<double> >::value)
-            continue; 
-        }
-        if (iscomplex)
-        {
-          if(internal::is_same<Scalar, float>::value || internal::is_same<Scalar, double>::value)
-            continue; 
-        }
-        
-        
-        // Get the matrix name
-        std::string filename = m_curs_id->d_name;
-        m_matname = filename.substr(0, filename.length()-4); 
-        
-        // Find if the matrix is SPD 
-        size_t found = m_matname.find("SPD");
-        if( (found!=std::string::npos) && (m_sym != NonSymmetric) )
-          m_sym = SPD;
-       
-        m_isvalid = true;
-        break; 
+      //         struct stat st_buf;
+      //         stat (curfile.c_str(), &st_buf);
+      //         if (S_ISDIR(st_buf.st_mode)) continue;
+
+      // Determine from the header if it is a matrix or a right hand side
+      bool isvector, iscomplex = false;
+      if (!getMarketHeader(curfile, m_sym, iscomplex, isvector)) continue;
+      if (isvector) continue;
+      if (!iscomplex) {
+        if (internal::is_same<Scalar, std::complex<float> >::value ||
+            internal::is_same<Scalar, std::complex<double> >::value)
+          continue;
+      }
+      if (iscomplex) {
+        if (internal::is_same<Scalar, float>::value || internal::is_same<Scalar, double>::value) continue;
       }
+
+      // Get the matrix name
+      std::string filename = m_curs_id->d_name;
+      m_matname = filename.substr(0, filename.length() - 4);
+
+      // Find if the matrix is SPD
+      size_t found = m_matname.find("SPD");
+      if ((found != std::string::npos) && (m_sym != NonSymmetric)) m_sym = SPD;
+
+      m_isvalid = true;
+      break;
     }
-    int m_sym; // Symmetry of the matrix
-    MatrixType m_mat; // Current matrix  
-    VectorType m_rhs;  // Current vector
-    VectorType m_refX; // The reference solution, if exists
-    std::string m_matname; // Matrix Name
-    bool m_isvalid; 
-    bool m_matIsLoaded; // Determine if the matrix has already been loaded from the file
-    bool m_hasRhs; // The right hand side exists
-    bool m_hasrefX; // A reference solution is provided
-    std::string m_folder;
-    DIR * m_folder_id;
-    struct dirent *m_curs_id; 
-    
+  }
+  int m_sym;              // Symmetry of the matrix
+  MatrixType m_mat;       // Current matrix
+  VectorType m_rhs;       // Current vector
+  VectorType m_refX;      // The reference solution, if exists
+  std::string m_matname;  // Matrix Name
+  bool m_isvalid;
+  bool m_matIsLoaded;  // Determine if the matrix has already been loaded from the file
+  bool m_hasRhs;       // The right hand side exists
+  bool m_hasrefX;      // A reference solution is provided
+  std::string m_folder;
+  DIR* m_folder_id;
+  struct dirent* m_curs_id;
 };
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
 #endif
diff --git a/inst/include/unsupported/Eigen/src/SparseExtra/RandomSetter.h b/inst/include/unsupported/Eigen/src/SparseExtra/RandomSetter.h
index dee1708e..2889d82a 100644
--- a/inst/include/unsupported/Eigen/src/SparseExtra/RandomSetter.h
+++ b/inst/include/unsupported/Eigen/src/SparseExtra/RandomSetter.h
@@ -10,318 +10,294 @@
 #ifndef EIGEN_RANDOMSETTER_H
 #define EIGEN_RANDOMSETTER_H
 
-namespace Eigen { 
+#if defined(EIGEN_GOOGLEHASH_SUPPORT)
+// Ensure the ::google namespace exists, required for checking existence of
+// ::google::dense_hash_map and ::google::sparse_hash_map.
+namespace google {}
+#endif
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
 
 /** Represents a std::map
-  *
-  * \see RandomSetter
-  */
-template<typename Scalar> struct StdMapTraits
-{
+ *
+ * \see RandomSetter
+ */
+template <typename Scalar>
+struct StdMapTraits {
   typedef int KeyType;
-  typedef std::map<KeyType,Scalar> Type;
-  enum {
-    IsSorted = 1
-  };
+  typedef std::map<KeyType, Scalar> Type;
+  enum { IsSorted = 1 };
 
   static void setInvalidKey(Type&, const KeyType&) {}
 };
 
-#ifdef EIGEN_UNORDERED_MAP_SUPPORT
 /** Represents a std::unordered_map
-  *
-  * To use it you need to both define EIGEN_UNORDERED_MAP_SUPPORT and include the unordered_map header file
-  * yourself making sure that unordered_map is defined in the std namespace.
-  *
-  * For instance, with current version of gcc you can either enable C++0x standard (-std=c++0x) or do:
-  * \code
-  * #include <tr1/unordered_map>
-  * #define EIGEN_UNORDERED_MAP_SUPPORT
-  * namespace std {
-  *   using std::tr1::unordered_map;
-  * }
-  * \endcode
-  *
-  * \see RandomSetter
-  */
-template<typename Scalar> struct StdUnorderedMapTraits
-{
+ * \see RandomSetter
+ */
+template <typename Scalar>
+struct StdUnorderedMapTraits {
   typedef int KeyType;
-  typedef std::unordered_map<KeyType,Scalar> Type;
-  enum {
-    IsSorted = 0
-  };
+  typedef std::unordered_map<KeyType, Scalar> Type;
+  enum { IsSorted = 0 };
 
   static void setInvalidKey(Type&, const KeyType&) {}
 };
-#endif // EIGEN_UNORDERED_MAP_SUPPORT
 
-#ifdef _DENSE_HASH_MAP_H_
+#if defined(EIGEN_GOOGLEHASH_SUPPORT)
+
+namespace google {
+
+// Namespace work-around, since sometimes dense_hash_map and sparse_hash_map
+// are in the global namespace, and other times they are under ::google.
+using namespace ::google;
+
+template <typename KeyType, typename Scalar>
+struct DenseHashMap {
+  typedef dense_hash_map<KeyType, Scalar> type;
+};
+
+template <typename KeyType, typename Scalar>
+struct SparseHashMap {
+  typedef sparse_hash_map<KeyType, Scalar> type;
+};
+
+}  // namespace google
+
 /** Represents a google::dense_hash_map
-  *
-  * \see RandomSetter
-  */
-template<typename Scalar> struct GoogleDenseHashMapTraits
-{
+ *
+ * \see RandomSetter
+ */
+template <typename Scalar>
+struct GoogleDenseHashMapTraits {
   typedef int KeyType;
-  typedef google::dense_hash_map<KeyType,Scalar> Type;
-  enum {
-    IsSorted = 0
-  };
+  typedef typename google::DenseHashMap<KeyType, Scalar>::type Type;
+  enum { IsSorted = 0 };
 
-  static void setInvalidKey(Type& map, const KeyType& k)
-  { map.set_empty_key(k); }
+  static void setInvalidKey(Type& map, const KeyType& k) { map.set_empty_key(k); }
 };
-#endif
 
-#ifdef _SPARSE_HASH_MAP_H_
 /** Represents a google::sparse_hash_map
-  *
-  * \see RandomSetter
-  */
-template<typename Scalar> struct GoogleSparseHashMapTraits
-{
+ *
+ * \see RandomSetter
+ */
+template <typename Scalar>
+struct GoogleSparseHashMapTraits {
   typedef int KeyType;
-  typedef google::sparse_hash_map<KeyType,Scalar> Type;
-  enum {
-    IsSorted = 0
-  };
+  typedef typename google::SparseHashMap<KeyType, Scalar>::type Type;
+  enum { IsSorted = 0 };
 
   static void setInvalidKey(Type&, const KeyType&) {}
 };
 #endif
 
 /** \class RandomSetter
-  *
-  * \brief The RandomSetter is a wrapper object allowing to set/update a sparse matrix with random access
-  *
-  * \param SparseMatrixType the type of the sparse matrix we are updating
-  * \param MapTraits a traits class representing the map implementation used for the temporary sparse storage.
-  *                  Its default value depends on the system.
-  * \param OuterPacketBits defines the number of rows (or columns) manage by a single map object
-  *                        as a power of two exponent.
-  *
-  * This class temporarily represents a sparse matrix object using a generic map implementation allowing for
-  * efficient random access. The conversion from the compressed representation to a hash_map object is performed
-  * in the RandomSetter constructor, while the sparse matrix is updated back at destruction time. This strategy
-  * suggest the use of nested blocks as in this example:
-  *
-  * \code
-  * SparseMatrix<double> m(rows,cols);
-  * {
-  *   RandomSetter<SparseMatrix<double> > w(m);
-  *   // don't use m but w instead with read/write random access to the coefficients:
-  *   for(;;)
-  *     w(rand(),rand()) = rand;
-  * }
-  * // when w is deleted, the data are copied back to m
-  * // and m is ready to use.
-  * \endcode
-  *
-  * Since hash_map objects are not fully sorted, representing a full matrix as a single hash_map would
-  * involve a big and costly sort to update the compressed matrix back. To overcome this issue, a RandomSetter
-  * use multiple hash_map, each representing 2^OuterPacketBits columns or rows according to the storage order.
-  * To reach optimal performance, this value should be adjusted according to the average number of nonzeros
-  * per rows/columns.
-  *
-  * The possible values for the template parameter MapTraits are:
-  *  - \b StdMapTraits: corresponds to std::map. (does not perform very well)
-  *  - \b GnuHashMapTraits: corresponds to __gnu_cxx::hash_map (available only with GCC)
-  *  - \b GoogleDenseHashMapTraits: corresponds to google::dense_hash_map (best efficiency, reasonable memory consumption)
-  *  - \b GoogleSparseHashMapTraits: corresponds to google::sparse_hash_map (best memory consumption, relatively good performance)
-  *
-  * The default map implementation depends on the availability, and the preferred order is:
-  * GoogleSparseHashMapTraits, GnuHashMapTraits, and finally StdMapTraits.
-  *
-  * For performance and memory consumption reasons it is highly recommended to use one of
-  * the Google's hash_map implementation. To enable the support for them, you have two options:
-  *  - \#include <google/dense_hash_map> yourself \b before Eigen/Sparse header
-  *  - define EIGEN_GOOGLEHASH_SUPPORT
-  * In the later case the inclusion of <google/dense_hash_map> is made for you.
-  *
-  * \see http://code.google.com/p/google-sparsehash/
-  */
-template<typename SparseMatrixType,
-         template <typename T> class MapTraits =
-#if defined _DENSE_HASH_MAP_H_
-          GoogleDenseHashMapTraits
-#elif defined _HASH_MAP
-          GnuHashMapTraits
+ * \ingroup SparseExtra_Module
+ * \brief The RandomSetter is a wrapper object allowing to set/update a sparse matrix with random access
+ *
+ * \tparam SparseMatrixType the type of the sparse matrix we are updating
+ * \tparam MapTraits a traits class representing the map implementation used for the temporary sparse storage.
+ *                  Its default value depends on the system.
+ * \tparam OuterPacketBits defines the number of rows (or columns) manage by a single map object
+ *                        as a power of two exponent.
+ *
+ * This class temporarily represents a sparse matrix object using a generic map implementation allowing for
+ * efficient random access. The conversion from the compressed representation to a hash_map object is performed
+ * in the RandomSetter constructor, while the sparse matrix is updated back at destruction time. This strategy
+ * suggest the use of nested blocks as in this example:
+ *
+ * \code
+ * SparseMatrix<double> m(rows,cols);
+ * {
+ *   RandomSetter<SparseMatrix<double> > w(m);
+ *   // don't use m but w instead with read/write random access to the coefficients:
+ *   for(;;)
+ *     w(rand(),rand()) = rand;
+ * }
+ * // when w is deleted, the data are copied back to m
+ * // and m is ready to use.
+ * \endcode
+ *
+ * Since hash_map objects are not fully sorted, representing a full matrix as a single hash_map would
+ * involve a big and costly sort to update the compressed matrix back. To overcome this issue, a RandomSetter
+ * use multiple hash_map, each representing 2^OuterPacketBits columns or rows according to the storage order.
+ * To reach optimal performance, this value should be adjusted according to the average number of nonzeros
+ * per rows/columns.
+ *
+ * The possible values for the template parameter MapTraits are:
+ *  - \b StdMapTraits: corresponds to std::map. (does not perform very well)
+ *  - \b StdUnorderedMapTraits: corresponds to std::unordered_map
+ *  - \b GoogleDenseHashMapTraits: corresponds to google::dense_hash_map (best efficiency, reasonable memory
+ * consumption)
+ *  - \b GoogleSparseHashMapTraits: corresponds to google::sparse_hash_map (best memory consumption, relatively good
+ * performance)
+ *
+ * The default map implementation depends on the availability, and the preferred order is:
+ * GoogleSparseHashMapTraits, StdUnorderedMapTraits, and finally StdMapTraits.
+ *
+ * For performance and memory consumption reasons it is highly recommended to use one of
+ * Google's hash_map implementations. To enable the support for them, you must define
+ * EIGEN_GOOGLEHASH_SUPPORT. This will include both <google/dense_hash_map> and
+ * <google/sparse_hash_map> for you.
+ *
+ * \see https://github.com/sparsehash/sparsehash
+ */
+template <typename SparseMatrixType,
+          template <typename T> class MapTraits =
+#if defined(EIGEN_GOOGLEHASH_SUPPORT)
+              GoogleDenseHashMapTraits
 #else
-          StdMapTraits
+              StdUnorderedMapTraits
 #endif
-         ,int OuterPacketBits = 6>
-class RandomSetter
-{
-    typedef typename SparseMatrixType::Scalar Scalar;
-    typedef typename SparseMatrixType::Index Index;
+          ,
+          int OuterPacketBits = 6>
+class RandomSetter {
+  typedef typename SparseMatrixType::Scalar Scalar;
+  typedef typename SparseMatrixType::StorageIndex StorageIndex;
 
-    struct ScalarWrapper
-    {
-      ScalarWrapper() : value(0) {}
-      Scalar value;
-    };
-    typedef typename MapTraits<ScalarWrapper>::KeyType KeyType;
-    typedef typename MapTraits<ScalarWrapper>::Type HashMapType;
-    static const int OuterPacketMask = (1 << OuterPacketBits) - 1;
-    enum {
-      SwapStorage = 1 - MapTraits<ScalarWrapper>::IsSorted,
-      TargetRowMajor = (SparseMatrixType::Flags & RowMajorBit) ? 1 : 0,
-      SetterRowMajor = SwapStorage ? 1-TargetRowMajor : TargetRowMajor
-    };
-
-  public:
-
-    /** Constructs a random setter object from the sparse matrix \a target
-      *
-      * Note that the initial value of \a target are imported. If you want to re-set
-      * a sparse matrix from scratch, then you must set it to zero first using the
-      * setZero() function.
-      */
-    inline RandomSetter(SparseMatrixType& target)
-      : mp_target(&target)
-    {
-      const Index outerSize = SwapStorage ? target.innerSize() : target.outerSize();
-      const Index innerSize = SwapStorage ? target.outerSize() : target.innerSize();
-      m_outerPackets = outerSize >> OuterPacketBits;
-      if (outerSize&OuterPacketMask)
-        m_outerPackets += 1;
-      m_hashmaps = new HashMapType[m_outerPackets];
-      // compute number of bits needed to store inner indices
-      Index aux = innerSize - 1;
-      m_keyBitsOffset = 0;
-      while (aux)
-      {
-        ++m_keyBitsOffset;
-        aux = aux >> 1;
-      }
-      KeyType ik = (1<<(OuterPacketBits+m_keyBitsOffset));
-      for (Index k=0; k<m_outerPackets; ++k)
-        MapTraits<ScalarWrapper>::setInvalidKey(m_hashmaps[k],ik);
+  struct ScalarWrapper {
+    ScalarWrapper() : value(0) {}
+    Scalar value;
+  };
+  typedef typename MapTraits<ScalarWrapper>::KeyType KeyType;
+  typedef typename MapTraits<ScalarWrapper>::Type HashMapType;
+  static constexpr int OuterPacketMask = (1 << OuterPacketBits) - 1;
+  enum {
+    SwapStorage = 1 - MapTraits<ScalarWrapper>::IsSorted,
+    TargetRowMajor = (SparseMatrixType::Flags & RowMajorBit) ? 1 : 0,
+    SetterRowMajor = SwapStorage ? 1 - TargetRowMajor : TargetRowMajor
+  };
 
-      // insert current coeffs
-      for (Index j=0; j<mp_target->outerSize(); ++j)
-        for (typename SparseMatrixType::InnerIterator it(*mp_target,j); it; ++it)
-          (*this)(TargetRowMajor?j:it.index(), TargetRowMajor?it.index():j) = it.value();
+ public:
+  /** Constructs a random setter object from the sparse matrix \a target
+   *
+   * Note that the initial value of \a target are imported. If you want to re-set
+   * a sparse matrix from scratch, then you must set it to zero first using the
+   * setZero() function.
+   */
+  inline RandomSetter(SparseMatrixType& target) : mp_target(&target) {
+    const Index outerSize = SwapStorage ? target.innerSize() : target.outerSize();
+    const Index innerSize = SwapStorage ? target.outerSize() : target.innerSize();
+    m_outerPackets = outerSize >> OuterPacketBits;
+    if (outerSize & OuterPacketMask) m_outerPackets += 1;
+    m_hashmaps = new HashMapType[m_outerPackets];
+    // compute number of bits needed to store inner indices
+    Index aux = innerSize - 1;
+    m_keyBitsOffset = 0;
+    while (aux) {
+      ++m_keyBitsOffset;
+      aux = aux >> 1;
     }
+    KeyType ik = (1 << (OuterPacketBits + m_keyBitsOffset));
+    for (Index k = 0; k < m_outerPackets; ++k) MapTraits<ScalarWrapper>::setInvalidKey(m_hashmaps[k], ik);
 
-    /** Destructor updating back the sparse matrix target */
-    ~RandomSetter()
+    // insert current coeffs
+    for (Index j = 0; j < mp_target->outerSize(); ++j)
+      for (typename SparseMatrixType::InnerIterator it(*mp_target, j); it; ++it)
+        (*this)(TargetRowMajor ? j : it.index(), TargetRowMajor ? it.index() : j) = it.value();
+  }
+
+  /** Destructor updating back the sparse matrix target */
+  ~RandomSetter() {
+    KeyType keyBitsMask = (1 << m_keyBitsOffset) - 1;
+    if (!SwapStorage)  // also means the map is sorted
     {
-      KeyType keyBitsMask = (1<<m_keyBitsOffset)-1;
-      if (!SwapStorage) // also means the map is sorted
-      {
-        mp_target->setZero();
-        mp_target->makeCompressed();
-        mp_target->reserve(nonZeros());
-        Index prevOuter = -1;
-        for (Index k=0; k<m_outerPackets; ++k)
-        {
-          const Index outerOffset = (1<<OuterPacketBits) * k;
-          typename HashMapType::iterator end = m_hashmaps[k].end();
-          for (typename HashMapType::iterator it = m_hashmaps[k].begin(); it!=end; ++it)
-          {
-            const Index outer = (it->first >> m_keyBitsOffset) + outerOffset;
-            const Index inner = it->first & keyBitsMask;
-            if (prevOuter!=outer)
-            {
-              for (Index j=prevOuter+1;j<=outer;++j)
-                mp_target->startVec(j);
-              prevOuter = outer;
-            }
-            mp_target->insertBackByOuterInner(outer, inner) = it->second.value;
+      mp_target->setZero();
+      mp_target->makeCompressed();
+      mp_target->reserve(nonZeros());
+      Index prevOuter = -1;
+      for (Index k = 0; k < m_outerPackets; ++k) {
+        const Index outerOffset = (1 << OuterPacketBits) * k;
+        typename HashMapType::iterator end = m_hashmaps[k].end();
+        for (typename HashMapType::iterator it = m_hashmaps[k].begin(); it != end; ++it) {
+          const Index outer = (it->first >> m_keyBitsOffset) + outerOffset;
+          const Index inner = it->first & keyBitsMask;
+          if (prevOuter != outer) {
+            for (Index j = prevOuter + 1; j <= outer; ++j) mp_target->startVec(j);
+            prevOuter = outer;
           }
+          mp_target->insertBackByOuterInner(outer, inner) = it->second.value;
         }
-        mp_target->finalize();
       }
-      else
-      {
-        VectorXi positions(mp_target->outerSize());
-        positions.setZero();
-        // pass 1
-        for (Index k=0; k<m_outerPackets; ++k)
-        {
-          typename HashMapType::iterator end = m_hashmaps[k].end();
-          for (typename HashMapType::iterator it = m_hashmaps[k].begin(); it!=end; ++it)
-          {
-            const Index outer = it->first & keyBitsMask;
-            ++positions[outer];
-          }
+      mp_target->finalize();
+    } else {
+      VectorXi positions(mp_target->outerSize());
+      positions.setZero();
+      // pass 1
+      for (Index k = 0; k < m_outerPackets; ++k) {
+        typename HashMapType::iterator end = m_hashmaps[k].end();
+        for (typename HashMapType::iterator it = m_hashmaps[k].begin(); it != end; ++it) {
+          const Index outer = it->first & keyBitsMask;
+          ++positions[outer];
         }
-        // prefix sum
-        Index count = 0;
-        for (Index j=0; j<mp_target->outerSize(); ++j)
-        {
-          Index tmp = positions[j];
-          mp_target->outerIndexPtr()[j] = count;
-          positions[j] = count;
-          count += tmp;
-        }
-        mp_target->makeCompressed();
-        mp_target->outerIndexPtr()[mp_target->outerSize()] = count;
-        mp_target->resizeNonZeros(count);
-        // pass 2
-        for (Index k=0; k<m_outerPackets; ++k)
-        {
-          const Index outerOffset = (1<<OuterPacketBits) * k;
-          typename HashMapType::iterator end = m_hashmaps[k].end();
-          for (typename HashMapType::iterator it = m_hashmaps[k].begin(); it!=end; ++it)
-          {
-            const Index inner = (it->first >> m_keyBitsOffset) + outerOffset;
-            const Index outer = it->first & keyBitsMask;
-            // sorted insertion
-            // Note that we have to deal with at most 2^OuterPacketBits unsorted coefficients,
-            // moreover those 2^OuterPacketBits coeffs are likely to be sparse, an so only a
-            // small fraction of them have to be sorted, whence the following simple procedure:
-            Index posStart = mp_target->outerIndexPtr()[outer];
-            Index i = (positions[outer]++) - 1;
-            while ( (i >= posStart) && (mp_target->innerIndexPtr()[i] > inner) )
-            {
-              mp_target->valuePtr()[i+1] = mp_target->valuePtr()[i];
-              mp_target->innerIndexPtr()[i+1] = mp_target->innerIndexPtr()[i];
-              --i;
-            }
-            mp_target->innerIndexPtr()[i+1] = inner;
-            mp_target->valuePtr()[i+1] = it->second.value;
+      }
+      // prefix sum
+      StorageIndex count = 0;
+      for (Index j = 0; j < mp_target->outerSize(); ++j) {
+        StorageIndex tmp = positions[j];
+        mp_target->outerIndexPtr()[j] = count;
+        positions[j] = count;
+        count += tmp;
+      }
+      mp_target->makeCompressed();
+      mp_target->outerIndexPtr()[mp_target->outerSize()] = count;
+      mp_target->resizeNonZeros(count);
+      // pass 2
+      for (Index k = 0; k < m_outerPackets; ++k) {
+        const Index outerOffset = (1 << OuterPacketBits) * k;
+        typename HashMapType::iterator end = m_hashmaps[k].end();
+        for (typename HashMapType::iterator it = m_hashmaps[k].begin(); it != end; ++it) {
+          const Index inner = (it->first >> m_keyBitsOffset) + outerOffset;
+          const Index outer = it->first & keyBitsMask;
+          // sorted insertion
+          // Note that we have to deal with at most 2^OuterPacketBits unsorted coefficients,
+          // moreover those 2^OuterPacketBits coeffs are likely to be sparse, an so only a
+          // small fraction of them have to be sorted, whence the following simple procedure:
+          Index posStart = mp_target->outerIndexPtr()[outer];
+          Index i = (positions[outer]++) - 1;
+          while ((i >= posStart) && (mp_target->innerIndexPtr()[i] > inner)) {
+            mp_target->valuePtr()[i + 1] = mp_target->valuePtr()[i];
+            mp_target->innerIndexPtr()[i + 1] = mp_target->innerIndexPtr()[i];
+            --i;
           }
+          mp_target->innerIndexPtr()[i + 1] = internal::convert_index<StorageIndex>(inner);
+          mp_target->valuePtr()[i + 1] = it->second.value;
         }
       }
-      delete[] m_hashmaps;
-    }
-
-    /** \returns a reference to the coefficient at given coordinates \a row, \a col */
-    Scalar& operator() (Index row, Index col)
-    {
-      const Index outer = SetterRowMajor ? row : col;
-      const Index inner = SetterRowMajor ? col : row;
-      const Index outerMajor = outer >> OuterPacketBits; // index of the packet/map
-      const Index outerMinor = outer & OuterPacketMask;  // index of the inner vector in the packet
-      const KeyType key = (KeyType(outerMinor)<<m_keyBitsOffset) | inner;
-      return m_hashmaps[outerMajor][key].value;
-    }
-
-    /** \returns the number of non zero coefficients
-      *
-      * \note According to the underlying map/hash_map implementation,
-      * this function might be quite expensive.
-      */
-    Index nonZeros() const
-    {
-      Index nz = 0;
-      for (Index k=0; k<m_outerPackets; ++k)
-        nz += static_cast<Index>(m_hashmaps[k].size());
-      return nz;
     }
+    delete[] m_hashmaps;
+  }
 
+  /** \returns a reference to the coefficient at given coordinates \a row, \a col */
+  Scalar& operator()(Index row, Index col) {
+    const Index outer = SetterRowMajor ? row : col;
+    const Index inner = SetterRowMajor ? col : row;
+    const Index outerMajor = outer >> OuterPacketBits;  // index of the packet/map
+    const Index outerMinor = outer & OuterPacketMask;   // index of the inner vector in the packet
+    const KeyType key = internal::convert_index<KeyType>((outerMinor << m_keyBitsOffset) | inner);
+    return m_hashmaps[outerMajor][key].value;
+  }
 
-  protected:
+  /** \returns the number of non zero coefficients
+   *
+   * \note According to the underlying map/hash_map implementation,
+   * this function might be quite expensive.
+   */
+  Index nonZeros() const {
+    Index nz = 0;
+    for (Index k = 0; k < m_outerPackets; ++k) nz += static_cast<Index>(m_hashmaps[k].size());
+    return nz;
+  }
 
-    HashMapType* m_hashmaps;
-    SparseMatrixType* mp_target;
-    Index m_outerPackets;
-    unsigned char m_keyBitsOffset;
+ protected:
+  HashMapType* m_hashmaps;
+  SparseMatrixType* mp_target;
+  Index m_outerPackets;
+  unsigned char m_keyBitsOffset;
 };
 
-} // end namespace Eigen
+}  // end namespace Eigen
 
-#endif // EIGEN_RANDOMSETTER_H
+#endif  // EIGEN_RANDOMSETTER_H
diff --git a/inst/include/unsupported/Eigen/src/SparseExtra/SparseInverse.h b/inst/include/unsupported/Eigen/src/SparseExtra/SparseInverse.h
new file mode 100644
index 00000000..142cc8f1
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SparseExtra/SparseInverse.h
@@ -0,0 +1,232 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2022 Julian Kent <jkflying@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPARSEINVERSE_H
+#define EIGEN_SPARSEINVERSE_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+#include "../../../../Eigen/Sparse"
+#include "../../../../Eigen/SparseLU"
+
+namespace Eigen {
+
+/**
+ * @brief Kahan algorithm based accumulator
+ *
+ * The Kahan sum algorithm guarantees to bound the error from floating point
+ * accumulation to a fixed value, regardless of the number of accumulations
+ * performed. Naive accumulation accumulates errors O(N), and pairwise O(logN).
+ * However pairwise also requires O(logN) memory while Kahan summation requires
+ * O(1) memory, but 4x the operations / latency.
+ *
+ * NB! Do not enable associative math optimizations, they may cause the Kahan
+ * summation to be optimized out leaving you with naive summation again.
+ *
+ */
+template <typename Scalar>
+class KahanSum {
+  // Straightforward Kahan summation for accurate accumulation of a sum of numbers
+  Scalar _sum{};
+  Scalar _correction{};
+
+ public:
+  Scalar value() { return _sum; }
+
+  void operator+=(Scalar increment) {
+    const Scalar correctedIncrement = increment + _correction;
+    const Scalar previousSum = _sum;
+    _sum += correctedIncrement;
+    _correction = correctedIncrement - (_sum - previousSum);
+  }
+};
+template <typename Scalar, Index Width = 16>
+class FABSum {
+  // https://epubs.siam.org/doi/pdf/10.1137/19M1257780
+  // Fast and Accurate Blocked Summation
+  // Uses naive summation for the fast sum, and Kahan summation for the accurate sum
+  // Theoretically SIMD sum could be changed to a tree sum which would improve accuracy
+  // over naive summation
+  KahanSum<Scalar> _totalSum;
+  Matrix<Scalar, Width, 1> _block;
+  Index _blockUsed{};
+
+ public:
+  Scalar value() { return _block.topRows(_blockUsed).sum() + _totalSum.value(); }
+
+  void operator+=(Scalar increment) {
+    _block(_blockUsed++, 0) = increment;
+    if (_blockUsed == Width) {
+      _totalSum += _block.sum();
+      _blockUsed = 0;
+    }
+  }
+};
+
+/**
+ * @brief computes an accurate dot product on two sparse vectors
+ *
+ * Uses an accurate summation algorithm for the accumulator in order to
+ * compute an accurate dot product for two sparse vectors.
+ *
+ */
+template <typename Derived, typename OtherDerived>
+typename Derived::Scalar accurateDot(const SparseMatrixBase<Derived>& A, const SparseMatrixBase<OtherDerived>& other) {
+  typedef typename Derived::Scalar Scalar;
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived)
+  EIGEN_STATIC_ASSERT_VECTOR_ONLY(OtherDerived)
+  EIGEN_STATIC_ASSERT_SAME_VECTOR_SIZE(Derived, OtherDerived)
+  static_assert(internal::is_same<Scalar, typename OtherDerived::Scalar>::value, "mismatched types");
+
+  internal::evaluator<Derived> thisEval(A.derived());
+  typename Derived::ReverseInnerIterator i(thisEval, 0);
+
+  internal::evaluator<OtherDerived> otherEval(other.derived());
+  typename OtherDerived::ReverseInnerIterator j(otherEval, 0);
+
+  FABSum<Scalar> res;
+  while (i && j) {
+    if (i.index() == j.index()) {
+      res += numext::conj(i.value()) * j.value();
+      --i;
+      --j;
+    } else if (i.index() > j.index())
+      --i;
+    else
+      --j;
+  }
+  return res.value();
+}
+
+/**
+ * @brief calculate sparse subset of inverse of sparse matrix
+ *
+ * This class returns a sparse subset of the inverse of the input matrix.
+ * The nonzeros correspond to the nonzeros of the input, plus any additional
+ * elements required due to fill-in of the internal LU factorization. This is
+ * is minimized via a applying a fill-reducing permutation as part of the LU
+ * factorization.
+ *
+ * If there are specific entries of the input matrix which you need inverse
+ * values for, which are zero for the input, you need to insert entries into
+ * the input sparse matrix for them to be calculated.
+ *
+ * Due to the sensitive nature of matrix inversion, particularly on large
+ * matrices which are made possible via sparsity, high accuracy dot products
+ * based on Kahan summation are used to reduce numerical error. If you still
+ * encounter numerical errors you may with to equilibrate your matrix before
+ * calculating the inverse, as well as making sure it is actually full rank.
+ */
+template <typename Scalar>
+class SparseInverse {
+ public:
+  typedef SparseMatrix<Scalar, ColMajor> MatrixType;
+  typedef SparseMatrix<Scalar, RowMajor> RowMatrixType;
+
+  SparseInverse() {}
+
+  /**
+   * @brief This Constructor is for if you already have a factored SparseLU and would like to use it to calculate a
+   * sparse inverse.
+   *
+   * Just call this constructor with your already factored SparseLU class and you can directly call the .inverse()
+   * method to get the result.
+   */
+  SparseInverse(const SparseLU<MatrixType>& slu) { _result = computeInverse(slu); }
+
+  /**
+   * @brief Calculate the sparse inverse from a given sparse input
+   */
+  SparseInverse& compute(const SparseMatrix<Scalar>& A) {
+    SparseLU<MatrixType> slu;
+    slu.compute(A);
+    _result = computeInverse(slu);
+    return *this;
+  }
+
+  /**
+   * @brief return the already-calculated sparse inverse, or a 0x0 matrix if it could not be computed
+   */
+  const MatrixType& inverse() const { return _result; }
+
+  /**
+   * @brief Internal function to calculate the sparse inverse in a functional way
+   * @return A sparse inverse representation, or, if the decomposition didn't complete, a 0x0 matrix.
+   */
+  static MatrixType computeInverse(const SparseLU<MatrixType>& slu) {
+    if (slu.info() != Success) {
+      return MatrixType(0, 0);
+    }
+
+    // Extract from SparseLU and decompose into L, inverse D and U terms
+    Matrix<Scalar, Dynamic, 1> invD;
+    RowMatrixType Upper;
+    {
+      RowMatrixType DU = slu.matrixU().toSparse();
+      invD = DU.diagonal().cwiseInverse();
+      Upper = (invD.asDiagonal() * DU).template triangularView<StrictlyUpper>();
+    }
+    MatrixType Lower = slu.matrixL().toSparse().template triangularView<StrictlyLower>();
+
+    // Compute the inverse and reapply the permutation matrix from the LU decomposition
+    return slu.colsPermutation().transpose() * computeInverse(Upper, invD, Lower) * slu.rowsPermutation();
+  }
+
+  /**
+   * @brief Internal function to calculate the inverse from strictly upper, diagonal and strictly lower components
+   */
+  static MatrixType computeInverse(const RowMatrixType& Upper, const Matrix<Scalar, Dynamic, 1>& inverseDiagonal,
+                                   const MatrixType& Lower) {
+    // Calculate the 'minimal set', which is the nonzeros of (L+U).transpose()
+    // It could be zeroed, but we will overwrite all non-zeros anyways.
+    MatrixType colInv = Lower.transpose().template triangularView<UnitUpper>();
+    colInv += Upper.transpose();
+
+    // We also need rowmajor representation in order to do efficient row-wise dot products
+    RowMatrixType rowInv = Upper.transpose().template triangularView<UnitLower>();
+    rowInv += Lower.transpose();
+
+    // Use the Takahashi algorithm to build the supporting elements of the inverse
+    // upwards and to the left, from the bottom right element, 1 col/row at a time
+    for (Index recurseLevel = Upper.cols() - 1; recurseLevel >= 0; recurseLevel--) {
+      const auto& col = Lower.col(recurseLevel);
+      const auto& row = Upper.row(recurseLevel);
+
+      // Calculate the inverse values for the nonzeros in this column
+      typename MatrixType::ReverseInnerIterator colIter(colInv, recurseLevel);
+      for (; recurseLevel < colIter.index(); --colIter) {
+        const Scalar element = -accurateDot(col, rowInv.row(colIter.index()));
+        colIter.valueRef() = element;
+        rowInv.coeffRef(colIter.index(), recurseLevel) = element;
+      }
+
+      // Calculate the inverse values for the nonzeros in this row
+      typename RowMatrixType::ReverseInnerIterator rowIter(rowInv, recurseLevel);
+      for (; recurseLevel < rowIter.index(); --rowIter) {
+        const Scalar element = -accurateDot(row, colInv.col(rowIter.index()));
+        rowIter.valueRef() = element;
+        colInv.coeffRef(recurseLevel, rowIter.index()) = element;
+      }
+
+      // And finally the diagonal, which corresponds to both row and col iterator now
+      const Scalar diag = inverseDiagonal(recurseLevel) - accurateDot(row, colInv.col(recurseLevel));
+      rowIter.valueRef() = diag;
+      colIter.valueRef() = diag;
+    }
+
+    return colInv;
+  }
+
+ private:
+  MatrixType _result;
+};
+
+}  // namespace Eigen
+#endif
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h
new file mode 100644
index 00000000..25b91b50
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsArrayAPI.h
@@ -0,0 +1,276 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_ARRAYAPI_H
+#define EIGEN_BESSELFUNCTIONS_ARRAYAPI_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \returns an expression of the coefficient-wise i0(\a x) to the given
+ * arrays.
+ *
+ * It returns the modified Bessel function of the first kind of order zero.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of i0(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_i0()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const
+    Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_i0_op<typename Derived::Scalar>, const Derived>
+    bessel_i0(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_i0_op<typename Derived::Scalar>, const Derived>(
+      x.derived());
+}
+
+/** \returns an expression of the coefficient-wise i0e(\a x) to the given
+ * arrays.
+ *
+ * It returns the exponentially scaled modified Bessel
+ * function of the first kind of order zero.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of i0e(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_i0e()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const
+    Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_i0e_op<typename Derived::Scalar>, const Derived>
+    bessel_i0e(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_i0e_op<typename Derived::Scalar>, const Derived>(
+      x.derived());
+}
+
+/** \returns an expression of the coefficient-wise i1(\a x) to the given
+ * arrays.
+ *
+ * It returns the modified Bessel function of the first kind of order one.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of i1(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_i1()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const
+    Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_i1_op<typename Derived::Scalar>, const Derived>
+    bessel_i1(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_i1_op<typename Derived::Scalar>, const Derived>(
+      x.derived());
+}
+
+/** \returns an expression of the coefficient-wise i1e(\a x) to the given
+ * arrays.
+ *
+ * It returns the exponentially scaled modified Bessel
+ * function of the first kind of order one.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of i1e(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_i1e()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const
+    Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_i1e_op<typename Derived::Scalar>, const Derived>
+    bessel_i1e(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_i1e_op<typename Derived::Scalar>, const Derived>(
+      x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k0(\a x) to the given
+ * arrays.
+ *
+ * It returns the modified Bessel function of the second kind of order zero.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of k0(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_k0()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const
+    Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_k0_op<typename Derived::Scalar>, const Derived>
+    bessel_k0(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_k0_op<typename Derived::Scalar>, const Derived>(
+      x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k0e(\a x) to the given
+ * arrays.
+ *
+ * It returns the exponentially scaled modified Bessel
+ * function of the second kind of order zero.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of k0e(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_k0e()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const
+    Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_k0e_op<typename Derived::Scalar>, const Derived>
+    bessel_k0e(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_k0e_op<typename Derived::Scalar>, const Derived>(
+      x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k1(\a x) to the given
+ * arrays.
+ *
+ * It returns the modified Bessel function of the second kind of order one.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of k1(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_k1()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const
+    Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_k1_op<typename Derived::Scalar>, const Derived>
+    bessel_k1(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_k1_op<typename Derived::Scalar>, const Derived>(
+      x.derived());
+}
+
+/** \returns an expression of the coefficient-wise k1e(\a x) to the given
+ * arrays.
+ *
+ * It returns the exponentially scaled modified Bessel
+ * function of the second kind of order one.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of k1e(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_k1e()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const
+    Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_k1e_op<typename Derived::Scalar>, const Derived>
+    bessel_k1e(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_k1e_op<typename Derived::Scalar>, const Derived>(
+      x.derived());
+}
+
+/** \returns an expression of the coefficient-wise j0(\a x) to the given
+ * arrays.
+ *
+ * It returns the Bessel function of the first kind of order zero.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of j0(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_j0()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const
+    Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_j0_op<typename Derived::Scalar>, const Derived>
+    bessel_j0(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_j0_op<typename Derived::Scalar>, const Derived>(
+      x.derived());
+}
+
+/** \returns an expression of the coefficient-wise y0(\a x) to the given
+ * arrays.
+ *
+ * It returns the Bessel function of the second kind of order zero.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of y0(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_y0()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const
+    Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_y0_op<typename Derived::Scalar>, const Derived>
+    bessel_y0(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_y0_op<typename Derived::Scalar>, const Derived>(
+      x.derived());
+}
+
+/** \returns an expression of the coefficient-wise j1(\a x) to the given
+ * arrays.
+ *
+ * It returns the modified Bessel function of the first kind of order one.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of j1(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_j1()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const
+    Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_j1_op<typename Derived::Scalar>, const Derived>
+    bessel_j1(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_j1_op<typename Derived::Scalar>, const Derived>(
+      x.derived());
+}
+
+/** \returns an expression of the coefficient-wise y1(\a x) to the given
+ * arrays.
+ *
+ * It returns the Bessel function of the second kind of order one.
+ *
+ * \param x is the argument
+ *
+ * \note This function supports only float and double scalar types. To support
+ * other scalar types, the user has to provide implementations of y1(T) for
+ * any scalar type T to be supported.
+ *
+ * \sa ArrayBase::bessel_y1()
+ */
+template <typename Derived>
+EIGEN_STRONG_INLINE const
+    Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_y1_op<typename Derived::Scalar>, const Derived>
+    bessel_y1(const Eigen::ArrayBase<Derived>& x) {
+  return Eigen::CwiseUnaryOp<Eigen::internal::scalar_bessel_y1_op<typename Derived::Scalar>, const Derived>(
+      x.derived());
+}
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_BESSELFUNCTIONS_ARRAYAPI_H
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h
new file mode 100644
index 00000000..45b70b45
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsBFloat16.h
@@ -0,0 +1,71 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_BFLOAT16_H
+#define EIGEN_BESSELFUNCTIONS_BFLOAT16_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i0(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_i0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i0e(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_i0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i1(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_i1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_i1e(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_i1e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_j0(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_j0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_j1(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_j1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_y0(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_y0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_y1(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_y1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k0(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_k0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k0e(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_k0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k1(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_k1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 bessel_k1e(const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::bessel_k1e(static_cast<float>(x)));
+}
+#endif
+
+}  // end namespace numext
+}  // end namespace Eigen
+
+#endif  // EIGEN_BESSELFUNCTIONS_BFLOAT16_H
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h
new file mode 100644
index 00000000..d8774176
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsFunctors.h
@@ -0,0 +1,323 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_FUNCTORS_H
+#define EIGEN_BESSELFUNCTIONS_FUNCTORS_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the first
+ * kind of order zero.
+ * \sa class CwiseUnaryOp, Cwise::bessel_i0()
+ */
+template <typename Scalar>
+struct scalar_bessel_i0_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_i0;
+    return bessel_i0(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { return internal::pbessel_i0(x); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i0_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=20 is computed.
+    // The cost is N multiplications and 2N additions. We also add
+    // the cost of an additional exp over i0e.
+    Cost = 28 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the first kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_i0e()
+ */
+template <typename Scalar>
+struct scalar_bessel_i0e_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_i0e;
+    return bessel_i0e(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { return internal::pbessel_i0e(x); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i0e_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=20 is computed.
+    // The cost is N multiplications and 2N additions.
+    Cost = 20 * NumTraits<Scalar>::MulCost + 40 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the first
+ * kind of order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_i1()
+ */
+template <typename Scalar>
+struct scalar_bessel_i1_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_i1;
+    return bessel_i1(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { return internal::pbessel_i1(x); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i1_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=20 is computed.
+    // The cost is N multiplications and 2N additions. We also add
+    // the cost of an additional exp over i1e.
+    Cost = 28 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the first kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_i1e()
+ */
+template <typename Scalar>
+struct scalar_bessel_i1e_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_i1e;
+    return bessel_i1e(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { return internal::pbessel_i1e(x); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_i1e_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=20 is computed.
+    // The cost is N multiplications and 2N additions.
+    Cost = 20 * NumTraits<Scalar>::MulCost + 40 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the second kind of
+ * order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_j0()
+ */
+template <typename Scalar>
+struct scalar_bessel_j0_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_j0;
+    return bessel_j0(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { return internal::pbessel_j0(x); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_j0_op<Scalar> > {
+  enum {
+    // 6 polynomial of order ~N=8 is computed.
+    // The cost is N multiplications and N additions each, along with a
+    // sine, cosine and rsqrt cost.
+    Cost = 63 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the second kind of
+ * order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_y0()
+ */
+template <typename Scalar>
+struct scalar_bessel_y0_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_y0;
+    return bessel_y0(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { return internal::pbessel_y0(x); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_y0_op<Scalar> > {
+  enum {
+    // 6 polynomial of order ~N=8 is computed.
+    // The cost is N multiplications and N additions each, along with a
+    // sine, cosine, rsqrt and j0 cost.
+    Cost = 126 * NumTraits<Scalar>::MulCost + 96 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the first kind of
+ * order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_j1()
+ */
+template <typename Scalar>
+struct scalar_bessel_j1_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_j1;
+    return bessel_j1(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { return internal::pbessel_j1(x); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_j1_op<Scalar> > {
+  enum {
+    // 6 polynomial of order ~N=8 is computed.
+    // The cost is N multiplications and N additions each, along with a
+    // sine, cosine and rsqrt cost.
+    Cost = 63 * NumTraits<Scalar>::MulCost + 48 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Bessel function of the second kind of
+ * order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_j1e()
+ */
+template <typename Scalar>
+struct scalar_bessel_y1_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_y1;
+    return bessel_y1(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { return internal::pbessel_y1(x); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_y1_op<Scalar> > {
+  enum {
+    // 6 polynomial of order ~N=8 is computed.
+    // The cost is N multiplications and N additions each, along with a
+    // sine, cosine, rsqrt and j1 cost.
+    Cost = 126 * NumTraits<Scalar>::MulCost + 96 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the second
+ * kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_k0()
+ */
+template <typename Scalar>
+struct scalar_bessel_k0_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_k0;
+    return bessel_k0(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { return internal::pbessel_k0(x); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k0_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=10 is computed.
+    // The cost is N multiplications and 2N additions. In addition we compute
+    // i0, a log, exp and prsqrt and sin and cos.
+    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the second kind of order zero
+ * \sa class CwiseUnaryOp, Cwise::bessel_k0e()
+ */
+template <typename Scalar>
+struct scalar_bessel_k0e_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_k0e;
+    return bessel_k0e(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { return internal::pbessel_k0e(x); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k0e_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=10 is computed.
+    // The cost is N multiplications and 2N additions. In addition we compute
+    // i0, a log, exp and prsqrt and sin and cos.
+    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the modified Bessel function of the
+ * second kind of order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_k1()
+ */
+template <typename Scalar>
+struct scalar_bessel_k1_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_k1;
+    return bessel_k1(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { return internal::pbessel_k1(x); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k1_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=10 is computed.
+    // The cost is N multiplications and 2N additions. In addition we compute
+    // i1, a log, exp and prsqrt and sin and cos.
+    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the exponentially scaled modified Bessel
+ * function of the second kind of order one
+ * \sa class CwiseUnaryOp, Cwise::bessel_k1e()
+ */
+template <typename Scalar>
+struct scalar_bessel_k1e_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x) const {
+    using numext::bessel_k1e;
+    return bessel_k1e(x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { return internal::pbessel_k1e(x); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_bessel_k1e_op<Scalar> > {
+  enum {
+    // On average, a Chebyshev polynomial of order N=10 is computed.
+    // The cost is N multiplications and 2N additions. In addition we compute
+    // i1, a log, exp and prsqrt and sin and cos.
+    Cost = 68 * NumTraits<Scalar>::MulCost + 88 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBessel
+  };
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_BESSELFUNCTIONS_FUNCTORS_H
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h
new file mode 100644
index 00000000..ac1ea28f
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsHalf.h
@@ -0,0 +1,69 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_HALF_H
+#define EIGEN_BESSELFUNCTIONS_HALF_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i0(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_i0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i0e(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_i0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i1(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_i1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_i1e(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_i1e(static_cast<float>(x)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_j0(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_j0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_j1(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_j1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_y0(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_y0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_y1(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_y1(static_cast<float>(x)));
+}
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k0(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_k0(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k0e(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_k0e(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k1(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_k1(static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half bessel_k1e(const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::bessel_k1e(static_cast<float>(x)));
+}
+#endif
+
+}  // end namespace numext
+}  // end namespace Eigen
+
+#endif  // EIGEN_BESSELFUNCTIONS_HALF_H
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h
new file mode 100644
index 00000000..59bdaf1b
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsImpl.h
@@ -0,0 +1,1638 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSEL_FUNCTIONS_H
+#define EIGEN_BESSEL_FUNCTIONS_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+//  Parts of this code are based on the Cephes Math Library.
+//
+//  Cephes Math Library Release 2.8:  June, 2000
+//  Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier
+//
+//  Permission has been kindly provided by the original author
+//  to incorporate the Cephes software into the Eigen codebase:
+//
+//    From: Stephen Moshier
+//    To: Eugene Brevdo
+//    Subject: Re: Permission to wrap several cephes functions in Eigen
+//
+//    Hello Eugene,
+//
+//    Thank you for writing.
+//
+//    If your licensing is similar to BSD, the formal way that has been
+//    handled is simply to add a statement to the effect that you are incorporating
+//    the Cephes software by permission of the author.
+//
+//    Good luck with your project,
+//    Steve
+
+/****************************************************************************
+ * Implementation of Bessel function, based on Cephes                       *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct bessel_i0e_retval {
+  typedef Scalar type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_i0e {
+  EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T&) { return ScalarType(0); }
+};
+
+template <typename T>
+struct generic_i0e<T, float> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  i0ef.c
+     *
+     *  Modified Bessel function of order zero,
+     *  exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, i0ef();
+     *
+     * y = i0ef( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of order zero of the argument.
+     *
+     * The function is defined as i0e(x) = exp(-|x|) j0( ix ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,30        100000      3.7e-7      7.0e-8
+     * See i0f().
+     *
+     */
+
+    const float A[] = {-1.30002500998624804212E-8f, 6.04699502254191894932E-8f,  -2.67079385394061173391E-7f,
+                       1.11738753912010371815E-6f,  -4.41673835845875056359E-6f, 1.64484480707288970893E-5f,
+                       -5.75419501008210370398E-5f, 1.88502885095841655729E-4f,  -5.76375574538582365885E-4f,
+                       1.63947561694133579842E-3f,  -4.32430999505057594430E-3f, 1.05464603945949983183E-2f,
+                       -2.37374148058994688156E-2f, 4.93052842396707084878E-2f,  -9.49010970480476444210E-2f,
+                       1.71620901522208775349E-1f,  -3.04682672343198398683E-1f, 6.76795274409476084995E-1f};
+
+    const float B[] = {3.39623202570838634515E-9f, 2.26666899049817806459E-8f, 2.04891858946906374183E-7f,
+                       2.89137052083475648297E-6f, 6.88975834691682398426E-5f, 3.36911647825569408990E-3f,
+                       8.04490411014108831608E-1f};
+    T y = pabs(x);
+    T y_le_eight = internal::pchebevl<T, 18>::run(pmadd(pset1<T>(0.5f), y, pset1<T>(-2.0f)), A);
+    T y_gt_eight = pmul(internal::pchebevl<T, 7>::run(psub(pdiv(pset1<T>(32.0f), y), pset1<T>(2.0f)), B), prsqrt(y));
+    // TODO: Perhaps instead check whether all packet elements are in
+    // [-8, 8] and evaluate a branch based off of that. It's possible
+    // in practice most elements are in this region.
+    return pselect(pcmp_le(y, pset1<T>(8.0f)), y_le_eight, y_gt_eight);
+  }
+};
+
+template <typename T>
+struct generic_i0e<T, double> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  i0e.c
+     *
+     *  Modified Bessel function of order zero,
+     *  exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, i0e();
+     *
+     * y = i0e( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of order zero of the argument.
+     *
+     * The function is defined as i0e(x) = exp(-|x|) j0( ix ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,30        30000       5.4e-16     1.2e-16
+     * See i0().
+     *
+     */
+
+    const double A[] = {-4.41534164647933937950E-18, 3.33079451882223809783E-17,  -2.43127984654795469359E-16,
+                        1.71539128555513303061E-15,  -1.16853328779934516808E-14, 7.67618549860493561688E-14,
+                        -4.85644678311192946090E-13, 2.95505266312963983461E-12,  -1.72682629144155570723E-11,
+                        9.67580903537323691224E-11,  -5.18979560163526290666E-10, 2.65982372468238665035E-9,
+                        -1.30002500998624804212E-8,  6.04699502254191894932E-8,   -2.67079385394061173391E-7,
+                        1.11738753912010371815E-6,   -4.41673835845875056359E-6,  1.64484480707288970893E-5,
+                        -5.75419501008210370398E-5,  1.88502885095841655729E-4,   -5.76375574538582365885E-4,
+                        1.63947561694133579842E-3,   -4.32430999505057594430E-3,  1.05464603945949983183E-2,
+                        -2.37374148058994688156E-2,  4.93052842396707084878E-2,   -9.49010970480476444210E-2,
+                        1.71620901522208775349E-1,   -3.04682672343198398683E-1,  6.76795274409476084995E-1};
+    const double B[] = {-7.23318048787475395456E-18, -4.83050448594418207126E-18, 4.46562142029675999901E-17,
+                        3.46122286769746109310E-17,  -2.82762398051658348494E-16, -3.42548561967721913462E-16,
+                        1.77256013305652638360E-15,  3.81168066935262242075E-15,  -9.55484669882830764870E-15,
+                        -4.15056934728722208663E-14, 1.54008621752140982691E-14,  3.85277838274214270114E-13,
+                        7.18012445138366623367E-13,  -1.79417853150680611778E-12, -1.32158118404477131188E-11,
+                        -3.14991652796324136454E-11, 1.18891471078464383424E-11,  4.94060238822496958910E-10,
+                        3.39623202570838634515E-9,   2.26666899049817806459E-8,   2.04891858946906374183E-7,
+                        2.89137052083475648297E-6,   6.88975834691682398426E-5,   3.36911647825569408990E-3,
+                        8.04490411014108831608E-1};
+    T y = pabs(x);
+    T y_le_eight = internal::pchebevl<T, 30>::run(pmadd(pset1<T>(0.5), y, pset1<T>(-2.0)), A);
+    T y_gt_eight = pmul(internal::pchebevl<T, 25>::run(psub(pdiv(pset1<T>(32.0), y), pset1<T>(2.0)), B), prsqrt(y));
+    // TODO: Perhaps instead check whether all packet elements are in
+    // [-8, 8] and evaluate a branch based off of that. It's possible
+    // in practice most elements are in this region.
+    return pselect(pcmp_le(y, pset1<T>(8.0)), y_le_eight, y_gt_eight);
+  }
+};
+
+template <typename T>
+struct bessel_i0e_impl {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T x) { return generic_i0e<T>::run(x); }
+};
+
+template <typename Scalar>
+struct bessel_i0_retval {
+  typedef Scalar type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_i0 {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    return pmul(pexp(pabs(x)), generic_i0e<T, ScalarType>::run(x));
+  }
+};
+
+template <typename T>
+struct bessel_i0_impl {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T x) { return generic_i0<T>::run(x); }
+};
+
+template <typename Scalar>
+struct bessel_i1e_retval {
+  typedef Scalar type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_i1e {
+  EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T&) { return ScalarType(0); }
+};
+
+template <typename T>
+struct generic_i1e<T, float> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* i1ef.c
+     *
+     *  Modified Bessel function of order one,
+     *  exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, i1ef();
+     *
+     * y = i1ef( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of order one of the argument.
+     *
+     * The function is defined as i1(x) = -i exp(-|x|) j1( ix ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       1.5e-6      1.5e-7
+     * See i1().
+     *
+     */
+    const float A[] = {9.38153738649577178388E-9f,  -4.44505912879632808065E-8f, 2.00329475355213526229E-7f,
+                       -8.56872026469545474066E-7f, 3.47025130813767847674E-6f,  -1.32731636560394358279E-5f,
+                       4.78156510755005422638E-5f,  -1.61760815825896745588E-4f, 5.12285956168575772895E-4f,
+                       -1.51357245063125314899E-3f, 4.15642294431288815669E-3f,  -1.05640848946261981558E-2f,
+                       2.47264490306265168283E-2f,  -5.29459812080949914269E-2f, 1.02643658689847095384E-1f,
+                       -1.76416518357834055153E-1f, 2.52587186443633654823E-1f};
+
+    const float B[] = {-3.83538038596423702205E-9f, -2.63146884688951950684E-8f, -2.51223623787020892529E-7f,
+                       -3.88256480887769039346E-6f, -1.10588938762623716291E-4f, -9.76109749136146840777E-3f,
+                       7.78576235018280120474E-1f};
+
+    T y = pabs(x);
+    T y_le_eight = pmul(y, internal::pchebevl<T, 17>::run(pmadd(pset1<T>(0.5f), y, pset1<T>(-2.0f)), A));
+    T y_gt_eight = pmul(internal::pchebevl<T, 7>::run(psub(pdiv(pset1<T>(32.0f), y), pset1<T>(2.0f)), B), prsqrt(y));
+    // TODO: Perhaps instead check whether all packet elements are in
+    // [-8, 8] and evaluate a branch based off of that. It's possible
+    // in practice most elements are in this region.
+    y = pselect(pcmp_le(y, pset1<T>(8.0f)), y_le_eight, y_gt_eight);
+    return pselect(pcmp_lt(x, pset1<T>(0.0f)), pnegate(y), y);
+  }
+};
+
+template <typename T>
+struct generic_i1e<T, double> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  i1e.c
+     *
+     *  Modified Bessel function of order one,
+     *  exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, i1e();
+     *
+     * y = i1e( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of order one of the argument.
+     *
+     * The function is defined as i1(x) = -i exp(-|x|) j1( ix ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       2.0e-15     2.0e-16
+     * See i1().
+     *
+     */
+    const double A[] = {2.77791411276104639959E-18,  -2.11142121435816608115E-17, 1.55363195773620046921E-16,
+                        -1.10559694773538630805E-15, 7.60068429473540693410E-15,  -5.04218550472791168711E-14,
+                        3.22379336594557470981E-13,  -1.98397439776494371520E-12, 1.17361862988909016308E-11,
+                        -6.66348972350202774223E-11, 3.62559028155211703701E-10,  -1.88724975172282928790E-9,
+                        9.38153738649577178388E-9,   -4.44505912879632808065E-8,  2.00329475355213526229E-7,
+                        -8.56872026469545474066E-7,  3.47025130813767847674E-6,   -1.32731636560394358279E-5,
+                        4.78156510755005422638E-5,   -1.61760815825896745588E-4,  5.12285956168575772895E-4,
+                        -1.51357245063125314899E-3,  4.15642294431288815669E-3,   -1.05640848946261981558E-2,
+                        2.47264490306265168283E-2,   -5.29459812080949914269E-2,  1.02643658689847095384E-1,
+                        -1.76416518357834055153E-1,  2.52587186443633654823E-1};
+    const double B[] = {7.51729631084210481353E-18,  4.41434832307170791151E-18,  -4.65030536848935832153E-17,
+                        -3.20952592199342395980E-17, 2.96262899764595013876E-16,  3.30820231092092828324E-16,
+                        -1.88035477551078244854E-15, -3.81440307243700780478E-15, 1.04202769841288027642E-14,
+                        4.27244001671195135429E-14,  -2.10154184277266431302E-14, -4.08355111109219731823E-13,
+                        -7.19855177624590851209E-13, 2.03562854414708950722E-12,  1.41258074366137813316E-11,
+                        3.25260358301548823856E-11,  -1.89749581235054123450E-11, -5.58974346219658380687E-10,
+                        -3.83538038596423702205E-9,  -2.63146884688951950684E-8,  -2.51223623787020892529E-7,
+                        -3.88256480887769039346E-6,  -1.10588938762623716291E-4,  -9.76109749136146840777E-3,
+                        7.78576235018280120474E-1};
+    T y = pabs(x);
+    T y_le_eight = pmul(y, internal::pchebevl<T, 29>::run(pmadd(pset1<T>(0.5), y, pset1<T>(-2.0)), A));
+    T y_gt_eight = pmul(internal::pchebevl<T, 25>::run(psub(pdiv(pset1<T>(32.0), y), pset1<T>(2.0)), B), prsqrt(y));
+    // TODO: Perhaps instead check whether all packet elements are in
+    // [-8, 8] and evaluate a branch based off of that. It's possible
+    // in practice most elements are in this region.
+    y = pselect(pcmp_le(y, pset1<T>(8.0)), y_le_eight, y_gt_eight);
+    return pselect(pcmp_lt(x, pset1<T>(0.0)), pnegate(y), y);
+  }
+};
+
+template <typename T>
+struct bessel_i1e_impl {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T x) { return generic_i1e<T>::run(x); }
+};
+
+template <typename T>
+struct bessel_i1_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_i1 {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    return pmul(pexp(pabs(x)), generic_i1e<T, ScalarType>::run(x));
+  }
+};
+
+template <typename T>
+struct bessel_i1_impl {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T x) { return generic_i1<T>::run(x); }
+};
+
+template <typename T>
+struct bessel_k0e_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k0e {
+  EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T&) { return ScalarType(0); }
+};
+
+template <typename T>
+struct generic_k0e<T, float> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k0ef.c
+     *	Modified Bessel function, third kind, order zero,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k0ef();
+     *
+     * y = k0ef( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order zero of the argument.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       8.1e-7      7.8e-8
+     * See k0().
+     *
+     */
+
+    const float A[] = {1.90451637722020886025E-9f, 2.53479107902614945675E-7f, 2.28621210311945178607E-5f,
+                       1.26461541144692592338E-3f, 3.59799365153615016266E-2f, 3.44289899924628486886E-1f,
+                       -5.35327393233902768720E-1f};
+
+    const float B[] = {-1.69753450938905987466E-9f, 8.57403401741422608519E-9f,  -4.66048989768794782956E-8f,
+                       2.76681363944501510342E-7f,  -1.83175552271911948767E-6f, 1.39498137188764993662E-5f,
+                       -1.28495495816278026384E-4f, 1.56988388573005337491E-3f,  -3.14481013119645005427E-2f,
+                       2.44030308206595545468E0f};
+    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = internal::pchebevl<T, 7>::run(pmadd(x, x, pset1<T>(-2.0)), A);
+    x_le_two = pmadd(generic_i0<T, float>::run(x), pnegate(plog(pmul(pset1<T>(0.5), x))), x_le_two);
+    x_le_two = pmul(pexp(x), x_le_two);
+    T x_gt_two = pmul(internal::pchebevl<T, 10>::run(psub(pdiv(pset1<T>(8.0), x), two), B), prsqrt(x));
+    return pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, pselect(pcmp_le(x, two), x_le_two, x_gt_two));
+  }
+};
+
+template <typename T>
+struct generic_k0e<T, double> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k0e.c
+     *	Modified Bessel function, third kind, order zero,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, k0e();
+     *
+     * y = k0e( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order zero of the argument.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       1.4e-15     1.4e-16
+     * See k0().
+     *
+     */
+
+    const double A[] = {1.37446543561352307156E-16, 4.25981614279661018399E-14, 1.03496952576338420167E-11,
+                        1.90451637722020886025E-9,  2.53479107902614945675E-7,  2.28621210311945178607E-5,
+                        1.26461541144692592338E-3,  3.59799365153615016266E-2,  3.44289899924628486886E-1,
+                        -5.35327393233902768720E-1};
+    const double B[] = {5.30043377268626276149E-18,  -1.64758043015242134646E-17, 5.21039150503902756861E-17,
+                        -1.67823109680541210385E-16, 5.51205597852431940784E-16,  -1.84859337734377901440E-15,
+                        6.34007647740507060557E-15,  -2.22751332699166985548E-14, 8.03289077536357521100E-14,
+                        -2.98009692317273043925E-13, 1.14034058820847496303E-12,  -4.51459788337394416547E-12,
+                        1.85594911495471785253E-11,  -7.95748924447710747776E-11, 3.57739728140030116597E-10,
+                        -1.69753450938905987466E-9,  8.57403401741422608519E-9,   -4.66048989768794782956E-8,
+                        2.76681363944501510342E-7,   -1.83175552271911948767E-6,  1.39498137188764993662E-5,
+                        -1.28495495816278026384E-4,  1.56988388573005337491E-3,   -3.14481013119645005427E-2,
+                        2.44030308206595545468E0};
+    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = internal::pchebevl<T, 10>::run(pmadd(x, x, pset1<T>(-2.0)), A);
+    x_le_two = pmadd(generic_i0<T, double>::run(x), pmul(pset1<T>(-1.0), plog(pmul(pset1<T>(0.5), x))), x_le_two);
+    x_le_two = pmul(pexp(x), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(internal::pchebevl<T, 25>::run(psub(pdiv(pset1<T>(8.0), x), two), B), prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct bessel_k0e_impl {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T x) { return generic_k0e<T>::run(x); }
+};
+
+template <typename T>
+struct bessel_k0_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k0 {
+  EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T&) { return ScalarType(0); }
+};
+
+template <typename T>
+struct generic_k0<T, float> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k0f.c
+     *	Modified Bessel function, third kind, order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k0f();
+     *
+     * y = k0f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns modified Bessel function of the third kind
+     * of order zero of the argument.
+     *
+     * The range is partitioned into the two intervals [0,8] and
+     * (8, infinity).  Chebyshev polynomial expansions are employed
+     * in each interval.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     * Tested at 2000 random points between 0 and 8.  Peak absolute
+     * error (relative when K0 > 1) was 1.46e-14; rms, 4.26e-15.
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       7.8e-7      8.5e-8
+     *
+     * ERROR MESSAGES:
+     *
+     *   message         condition      value returned
+     *  K0 domain          x <= 0          MAXNUM
+     *
+     */
+
+    const float A[] = {1.90451637722020886025E-9f, 2.53479107902614945675E-7f, 2.28621210311945178607E-5f,
+                       1.26461541144692592338E-3f, 3.59799365153615016266E-2f, 3.44289899924628486886E-1f,
+                       -5.35327393233902768720E-1f};
+
+    const float B[] = {-1.69753450938905987466E-9f, 8.57403401741422608519E-9f,  -4.66048989768794782956E-8f,
+                       2.76681363944501510342E-7f,  -1.83175552271911948767E-6f, 1.39498137188764993662E-5f,
+                       -1.28495495816278026384E-4f, 1.56988388573005337491E-3f,  -3.14481013119645005427E-2f,
+                       2.44030308206595545468E0f};
+    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = internal::pchebevl<T, 7>::run(pmadd(x, x, pset1<T>(-2.0)), A);
+    x_le_two = pmadd(generic_i0<T, float>::run(x), pnegate(plog(pmul(pset1<T>(0.5), x))), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two =
+        pmul(pmul(pexp(pnegate(x)), internal::pchebevl<T, 10>::run(psub(pdiv(pset1<T>(8.0), x), two), B)), prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_k0<T, double> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*
+     *
+     *	Modified Bessel function, third kind, order zero,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, k0();
+     *
+     * y = k0( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order zero of the argument.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       1.4e-15     1.4e-16
+     * See k0().
+     *
+     */
+    const double A[] = {1.37446543561352307156E-16, 4.25981614279661018399E-14, 1.03496952576338420167E-11,
+                        1.90451637722020886025E-9,  2.53479107902614945675E-7,  2.28621210311945178607E-5,
+                        1.26461541144692592338E-3,  3.59799365153615016266E-2,  3.44289899924628486886E-1,
+                        -5.35327393233902768720E-1};
+    const double B[] = {5.30043377268626276149E-18,  -1.64758043015242134646E-17, 5.21039150503902756861E-17,
+                        -1.67823109680541210385E-16, 5.51205597852431940784E-16,  -1.84859337734377901440E-15,
+                        6.34007647740507060557E-15,  -2.22751332699166985548E-14, 8.03289077536357521100E-14,
+                        -2.98009692317273043925E-13, 1.14034058820847496303E-12,  -4.51459788337394416547E-12,
+                        1.85594911495471785253E-11,  -7.95748924447710747776E-11, 3.57739728140030116597E-10,
+                        -1.69753450938905987466E-9,  8.57403401741422608519E-9,   -4.66048989768794782956E-8,
+                        2.76681363944501510342E-7,   -1.83175552271911948767E-6,  1.39498137188764993662E-5,
+                        -1.28495495816278026384E-4,  1.56988388573005337491E-3,   -3.14481013119645005427E-2,
+                        2.44030308206595545468E0};
+    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = internal::pchebevl<T, 10>::run(pmadd(x, x, pset1<T>(-2.0)), A);
+    x_le_two = pmadd(generic_i0<T, double>::run(x), pnegate(plog(pmul(pset1<T>(0.5), x))), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(pmul(pexp(-x), internal::pchebevl<T, 25>::run(psub(pdiv(pset1<T>(8.0), x), two), B)), prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct bessel_k0_impl {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T x) { return generic_k0<T>::run(x); }
+};
+
+template <typename T>
+struct bessel_k1e_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k1e {
+  EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T&) { return ScalarType(0); }
+};
+
+template <typename T>
+struct generic_k1e<T, float> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* k1ef.c
+     *
+     *	Modified Bessel function, third kind, order one,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k1ef();
+     *
+     * y = k1ef( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order one of the argument:
+     *
+     *      k1e(x) = exp(x) * k1(x).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       4.9e-7      6.7e-8
+     * See k1().
+     *
+     */
+
+    const float A[] = {-2.21338763073472585583E-8f, -2.43340614156596823496E-6f, -1.73028895751305206302E-4f,
+                       -6.97572385963986435018E-3f, -1.22611180822657148235E-1f, -3.53155960776544875667E-1f,
+                       1.52530022733894777053E0f};
+    const float B[] = {2.01504975519703286596E-9f,  -1.03457624656780970260E-8f, 5.74108412545004946722E-8f,
+                       -3.50196060308781257119E-7f, 2.40648494783721712015E-6f,  -1.93619797416608296024E-5f,
+                       1.95215518471351631108E-4f,  -2.85781685962277938680E-3f, 1.03923736576817238437E-1f,
+                       2.72062619048444266945E0f};
+    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = pdiv(internal::pchebevl<T, 7>::run(pmadd(x, x, pset1<T>(-2.0)), A), x);
+    x_le_two = pmadd(generic_i1<T, float>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+    x_le_two = pmul(x_le_two, pexp(x));
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(internal::pchebevl<T, 10>::run(psub(pdiv(pset1<T>(8.0), x), two), B), prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_k1e<T, double> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k1e.c
+     *
+     *	Modified Bessel function, third kind, order one,
+     *	exponentially scaled
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, k1e();
+     *
+     * y = k1e( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns exponentially scaled modified Bessel function
+     * of the third kind of order one of the argument:
+     *
+     *      k1e(x) = exp(x) * k1(x).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       7.8e-16     1.2e-16
+     * See k1().
+     *
+     */
+    const double A[] = {-7.02386347938628759343E-18, -2.42744985051936593393E-15, -6.66690169419932900609E-13,
+                        -1.41148839263352776110E-10, -2.21338763073472585583E-8,  -2.43340614156596823496E-6,
+                        -1.73028895751305206302E-4,  -6.97572385963986435018E-3,  -1.22611180822657148235E-1,
+                        -3.53155960776544875667E-1,  1.52530022733894777053E0};
+    const double B[] = {-5.75674448366501715755E-18, 1.79405087314755922667E-17,  -5.68946255844285935196E-17,
+                        1.83809354436663880070E-16,  -6.05704724837331885336E-16, 2.03870316562433424052E-15,
+                        -7.01983709041831346144E-15, 2.47715442448130437068E-14,  -8.97670518232499435011E-14,
+                        3.34841966607842919884E-13,  -1.28917396095102890680E-12, 5.13963967348173025100E-12,
+                        -2.12996783842756842877E-11, 9.21831518760500529508E-11,  -4.19035475934189648750E-10,
+                        2.01504975519703286596E-9,   -1.03457624656780970260E-8,  5.74108412545004946722E-8,
+                        -3.50196060308781257119E-7,  2.40648494783721712015E-6,   -1.93619797416608296024E-5,
+                        1.95215518471351631108E-4,   -2.85781685962277938680E-3,  1.03923736576817238437E-1,
+                        2.72062619048444266945E0};
+    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = pdiv(internal::pchebevl<T, 11>::run(pmadd(x, x, pset1<T>(-2.0)), A), x);
+    x_le_two = pmadd(generic_i1<T, double>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+    x_le_two = pmul(x_le_two, pexp(x));
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(internal::pchebevl<T, 25>::run(psub(pdiv(pset1<T>(8.0), x), two), B), prsqrt(x));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct bessel_k1e_impl {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T x) { return generic_k1e<T>::run(x); }
+};
+
+template <typename T>
+struct bessel_k1_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_k1 {
+  EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T&) { return ScalarType(0); }
+};
+
+template <typename T>
+struct generic_k1<T, float> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* k1f.c
+     *	Modified Bessel function, third kind, order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k1f();
+     *
+     * y = k1f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Computes the modified Bessel function of the third kind
+     * of order one of the argument.
+     *
+     * The range is partitioned into the two intervals [0,2] and
+     * (2, infinity).  Chebyshev polynomial expansions are employed
+     * in each interval.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       4.6e-7      7.6e-8
+     *
+     * ERROR MESSAGES:
+     *
+     *   message         condition      value returned
+     * k1 domain          x <= 0          MAXNUM
+     *
+     */
+
+    const float A[] = {-2.21338763073472585583E-8f, -2.43340614156596823496E-6f, -1.73028895751305206302E-4f,
+                       -6.97572385963986435018E-3f, -1.22611180822657148235E-1f, -3.53155960776544875667E-1f,
+                       1.52530022733894777053E0f};
+    const float B[] = {2.01504975519703286596E-9f,  -1.03457624656780970260E-8f, 5.74108412545004946722E-8f,
+                       -3.50196060308781257119E-7f, 2.40648494783721712015E-6f,  -1.93619797416608296024E-5f,
+                       1.95215518471351631108E-4f,  -2.85781685962277938680E-3f, 1.03923736576817238437E-1f,
+                       2.72062619048444266945E0f};
+    const T MAXNUM = pset1<T>(NumTraits<float>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = pdiv(internal::pchebevl<T, 7>::run(pmadd(x, x, pset1<T>(-2.0)), A), x);
+    x_le_two = pmadd(generic_i1<T, float>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two =
+        pmul(pexp(pnegate(x)), pmul(internal::pchebevl<T, 10>::run(psub(pdiv(pset1<T>(8.0), x), two), B), prsqrt(x)));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_k1<T, double> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  k1.c
+     *	Modified Bessel function, third kind, order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, k1f();
+     *
+     * y = k1f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Computes the modified Bessel function of the third kind
+     * of order one of the argument.
+     *
+     * The range is partitioned into the two intervals [0,2] and
+     * (2, infinity).  Chebyshev polynomial expansions are employed
+     * in each interval.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 30       30000       4.6e-7      7.6e-8
+     *
+     * ERROR MESSAGES:
+     *
+     *   message         condition      value returned
+     * k1 domain          x <= 0          MAXNUM
+     *
+     */
+    const double A[] = {-7.02386347938628759343E-18, -2.42744985051936593393E-15, -6.66690169419932900609E-13,
+                        -1.41148839263352776110E-10, -2.21338763073472585583E-8,  -2.43340614156596823496E-6,
+                        -1.73028895751305206302E-4,  -6.97572385963986435018E-3,  -1.22611180822657148235E-1,
+                        -3.53155960776544875667E-1,  1.52530022733894777053E0};
+    const double B[] = {-5.75674448366501715755E-18, 1.79405087314755922667E-17,  -5.68946255844285935196E-17,
+                        1.83809354436663880070E-16,  -6.05704724837331885336E-16, 2.03870316562433424052E-15,
+                        -7.01983709041831346144E-15, 2.47715442448130437068E-14,  -8.97670518232499435011E-14,
+                        3.34841966607842919884E-13,  -1.28917396095102890680E-12, 5.13963967348173025100E-12,
+                        -2.12996783842756842877E-11, 9.21831518760500529508E-11,  -4.19035475934189648750E-10,
+                        2.01504975519703286596E-9,   -1.03457624656780970260E-8,  5.74108412545004946722E-8,
+                        -3.50196060308781257119E-7,  2.40648494783721712015E-6,   -1.93619797416608296024E-5,
+                        1.95215518471351631108E-4,   -2.85781685962277938680E-3,  1.03923736576817238437E-1,
+                        2.72062619048444266945E0};
+    const T MAXNUM = pset1<T>(NumTraits<double>::infinity());
+    const T two = pset1<T>(2.0);
+    T x_le_two = pdiv(internal::pchebevl<T, 11>::run(pmadd(x, x, pset1<T>(-2.0)), A), x);
+    x_le_two = pmadd(generic_i1<T, double>::run(x), plog(pmul(pset1<T>(0.5), x)), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), MAXNUM, x_le_two);
+    T x_gt_two = pmul(pexp(-x), pmul(internal::pchebevl<T, 25>::run(psub(pdiv(pset1<T>(8.0), x), two), B), prsqrt(x)));
+    return pselect(pcmp_le(x, two), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct bessel_k1_impl {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T x) { return generic_k1<T>::run(x); }
+};
+
+template <typename T>
+struct bessel_j0_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_j0 {
+  EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T&) { return ScalarType(0); }
+};
+
+template <typename T>
+struct generic_j0<T, float> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* j0f.c
+     *	Bessel function of order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, j0f();
+     *
+     * y = j0f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of order zero of the argument.
+     *
+     * The domain is divided into the intervals [0, 2] and
+     * (2, infinity). In the first interval the following polynomial
+     * approximation is used:
+     *
+     *
+     *        2         2         2
+     * (w - r  ) (w - r  ) (w - r  ) P(w)
+     *       1         2         3
+     *
+     *            2
+     * where w = x  and the three r's are zeros of the function.
+     *
+     * In the second interval, the modulus and phase are approximated
+     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+     * and Phase(x) = x + 1/x R(1/x^2) - pi/4.  The function is
+     *
+     *   j0(x) = Modulus(x) cos( Phase(x) ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0, 2        100000      1.3e-7      3.6e-8
+     *    IEEE      2, 32       100000      1.9e-7      5.4e-8
+     *
+     */
+
+    const float JP[] = {-6.068350350393235E-008f, 6.388945720783375E-006f, -3.969646342510940E-004f,
+                        1.332913422519003E-002f, -1.729150680240724E-001f};
+    const float MO[] = {-6.838999669318810E-002f, 1.864949361379502E-001f,  -2.145007480346739E-001f,
+                        1.197549369473540E-001f,  -3.560281861530129E-003f, -4.969382655296620E-002f,
+                        -3.355424622293709E-006f, 7.978845717621440E-001f};
+    const float PH[] = {3.242077816988247E+001f,  -3.630592630518434E+001f, 1.756221482109099E+001f,
+                        -4.974978466280903E+000f, 1.001973420681837E+000f,  -1.939906941791308E-001f,
+                        6.490598792654666E-002f,  -1.249992184872738E-001f};
+    const T DR1 = pset1<T>(5.78318596294678452118f);
+    const T NEG_PIO4F = pset1<T>(-0.7853981633974483096f); /* -pi / 4 */
+    T y = pabs(x);
+    T z = pmul(y, y);
+    T y_le_two = pselect(pcmp_lt(y, pset1<T>(1.0e-3f)), pmadd(z, pset1<T>(-0.25f), pset1<T>(1.0f)),
+                         pmul(psub(z, DR1), internal::ppolevl<T, 4>::run(z, JP)));
+    T q = pdiv(pset1<T>(1.0f), y);
+    T w = prsqrt(y);
+    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO));
+    w = pmul(q, q);
+    T yn = pmadd(q, internal::ppolevl<T, 7>::run(w, PH), NEG_PIO4F);
+    T y_gt_two = pmul(p, pcos(padd(yn, y)));
+    return pselect(pcmp_le(y, pset1<T>(2.0)), y_le_two, y_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_j0<T, double> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  j0.c
+     *	Bessel function of order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, j0();
+     *
+     * y = j0( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of order zero of the argument.
+     *
+     * The domain is divided into the intervals [0, 5] and
+     * (5, infinity). In the first interval the following rational
+     * approximation is used:
+     *
+     *
+     *        2         2
+     * (w - r  ) (w - r  ) P (w) / Q (w)
+     *       1         2    3       8
+     *
+     *            2
+     * where w = x  and the two r's are zeros of the function.
+     *
+     * In the second interval, the Hankel asymptotic expansion
+     * is employed with two rational functions of degree 6/6
+     * and 7/7.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain     # trials      peak         rms
+     *    DEC       0, 30       10000       4.4e-17     6.3e-18
+     *    IEEE      0, 30       60000       4.2e-16     1.1e-16
+     *
+     */
+    const double PP[] = {7.96936729297347051624E-4, 8.28352392107440799803E-2, 1.23953371646414299388E0,
+                         5.44725003058768775090E0,  8.74716500199817011941E0,  5.30324038235394892183E0,
+                         9.99999999999999997821E-1};
+    const double PQ[] = {9.24408810558863637013E-4, 8.56288474354474431428E-2, 1.25352743901058953537E0,
+                         5.47097740330417105182E0,  8.76190883237069594232E0,  5.30605288235394617618E0,
+                         1.00000000000000000218E0};
+    const double QP[] = {-1.13663838898469149931E-2, -1.28252718670509318512E0, -1.95539544257735972385E1,
+                         -9.32060152123768231369E1,  -1.77681167980488050595E2, -1.47077505154951170175E2,
+                         -5.14105326766599330220E1,  -6.05014350600728481186E0};
+    const double QQ[] = {1.00000000000000000000E0, 6.43178256118178023184E1, 8.56430025976980587198E2,
+                         3.88240183605401609683E3, 7.24046774195652478189E3, 5.93072701187316984827E3,
+                         2.06209331660327847417E3, 2.42005740240291393179E2};
+    const double RP[] = {-4.79443220978201773821E9, 1.95617491946556577543E12, -2.49248344360967716204E14,
+                         9.70862251047306323952E15};
+    const double RQ[] = {1.00000000000000000000E0,  4.99563147152651017219E2,  1.73785401676374683123E5,
+                         4.84409658339962045305E7,  1.11855537045356834862E10, 2.11277520115489217587E12,
+                         3.10518229857422583814E14, 3.18121955943204943306E16, 1.71086294081043136091E18};
+    const T DR1 = pset1<T>(5.78318596294678452118E0);
+    const T DR2 = pset1<T>(3.04712623436620863991E1);
+    const T SQ2OPI = pset1<T>(7.9788456080286535587989E-1); /* sqrt(2 / pi) */
+    const T NEG_PIO4 = pset1<T>(-0.7853981633974483096);    /* pi / 4 */
+
+    T y = pabs(x);
+    T z = pmul(y, y);
+    T y_le_five = pselect(pcmp_lt(y, pset1<T>(1.0e-5)), pmadd(z, pset1<T>(-0.25), pset1<T>(1.0)),
+                          pmul(pmul(psub(z, DR1), psub(z, DR2)),
+                               pdiv(internal::ppolevl<T, 3>::run(z, RP), internal::ppolevl<T, 8>::run(z, RQ))));
+    T s = pdiv(pset1<T>(25.0), z);
+    T p = pdiv(internal::ppolevl<T, 6>::run(s, PP), internal::ppolevl<T, 6>::run(s, PQ));
+    T q = pdiv(internal::ppolevl<T, 7>::run(s, QP), internal::ppolevl<T, 7>::run(s, QQ));
+    T yn = padd(y, NEG_PIO4);
+    T w = pdiv(pset1<T>(-5.0), y);
+    p = pmadd(p, pcos(yn), pmul(w, pmul(q, psin(yn))));
+    T y_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(y)));
+    return pselect(pcmp_le(y, pset1<T>(5.0)), y_le_five, y_gt_five);
+  }
+};
+
+template <typename T>
+struct bessel_j0_impl {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T x) { return generic_j0<T>::run(x); }
+};
+
+template <typename T>
+struct bessel_y0_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_y0 {
+  EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T&) { return ScalarType(0); }
+};
+
+template <typename T>
+struct generic_y0<T, float> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* j0f.c
+     * 	Bessel function of the second kind, order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, y0f();
+     *
+     * y = y0f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of the second kind, of order
+     * zero, of the argument.
+     *
+     * The domain is divided into the intervals [0, 2] and
+     * (2, infinity). In the first interval a rational approximation
+     * R(x) is employed to compute
+     *
+     *                  2         2         2
+     * y0(x)  =  (w - r  ) (w - r  ) (w - r  ) R(x)  +  2/pi ln(x) j0(x).
+     *                 1         2         3
+     *
+     * Thus a call to j0() is required.  The three zeros are removed
+     * from R(x) to improve its numerical stability.
+     *
+     * In the second interval, the modulus and phase are approximated
+     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+     * and Phase(x) = x + 1/x S(1/x^2) - pi/4.  Then the function is
+     *
+     *   y0(x) = Modulus(x) sin( Phase(x) ).
+     *
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *  Absolute error, when y0(x) < 1; else relative error:
+     *
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,  2       100000      2.4e-7      3.4e-8
+     *    IEEE      2, 32       100000      1.8e-7      5.3e-8
+     *
+     */
+
+    const float YP[] = {9.454583683980369E-008f, -9.413212653797057E-006f, 5.344486707214273E-004f,
+                        -1.584289289821316E-002f, 1.707584643733568E-001f};
+    const float MO[] = {-6.838999669318810E-002f, 1.864949361379502E-001f,  -2.145007480346739E-001f,
+                        1.197549369473540E-001f,  -3.560281861530129E-003f, -4.969382655296620E-002f,
+                        -3.355424622293709E-006f, 7.978845717621440E-001f};
+    const float PH[] = {3.242077816988247E+001f,  -3.630592630518434E+001f, 1.756221482109099E+001f,
+                        -4.974978466280903E+000f, 1.001973420681837E+000f,  -1.939906941791308E-001f,
+                        6.490598792654666E-002f,  -1.249992184872738E-001f};
+    const T YZ1 = pset1<T>(0.43221455686510834878f);
+    const T TWOOPI = pset1<T>(0.636619772367581343075535f); /* 2 / pi */
+    const T NEG_PIO4F = pset1<T>(-0.7853981633974483096f);  /* -pi / 4 */
+    const T NEG_MAXNUM = pset1<T>(-NumTraits<float>::infinity());
+    T z = pmul(x, x);
+    T x_le_two = pmul(TWOOPI, pmul(plog(x), generic_j0<T, float>::run(x)));
+    x_le_two = pmadd(psub(z, YZ1), internal::ppolevl<T, 4>::run(z, YP), x_le_two);
+    x_le_two = pselect(pcmp_le(x, pset1<T>(0.0)), NEG_MAXNUM, x_le_two);
+    T q = pdiv(pset1<T>(1.0), x);
+    T w = prsqrt(x);
+    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO));
+    T u = pmul(q, q);
+    T xn = pmadd(q, internal::ppolevl<T, 7>::run(u, PH), NEG_PIO4F);
+    T x_gt_two = pmul(p, psin(padd(xn, x)));
+    return pselect(pcmp_le(x, pset1<T>(2.0)), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_y0<T, double> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  j0.c
+     *	Bessel function of the second kind, order zero
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, y0();
+     *
+     * y = y0( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of the second kind, of order
+     * zero, of the argument.
+     *
+     * The domain is divided into the intervals [0, 5] and
+     * (5, infinity). In the first interval a rational approximation
+     * R(x) is employed to compute
+     *   y0(x)  = R(x)  +   2 * log(x) * j0(x) / PI.
+     * Thus a call to j0() is required.
+     *
+     * In the second interval, the Hankel asymptotic expansion
+     * is employed with two rational functions of degree 6/6
+     * and 7/7.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *  Absolute error, when y0(x) < 1; else relative error:
+     *
+     * arithmetic   domain     # trials      peak         rms
+     *    DEC       0, 30        9400       7.0e-17     7.9e-18
+     *    IEEE      0, 30       30000       1.3e-15     1.6e-16
+     *
+     */
+    const double PP[] = {7.96936729297347051624E-4, 8.28352392107440799803E-2, 1.23953371646414299388E0,
+                         5.44725003058768775090E0,  8.74716500199817011941E0,  5.30324038235394892183E0,
+                         9.99999999999999997821E-1};
+    const double PQ[] = {9.24408810558863637013E-4, 8.56288474354474431428E-2, 1.25352743901058953537E0,
+                         5.47097740330417105182E0,  8.76190883237069594232E0,  5.30605288235394617618E0,
+                         1.00000000000000000218E0};
+    const double QP[] = {-1.13663838898469149931E-2, -1.28252718670509318512E0, -1.95539544257735972385E1,
+                         -9.32060152123768231369E1,  -1.77681167980488050595E2, -1.47077505154951170175E2,
+                         -5.14105326766599330220E1,  -6.05014350600728481186E0};
+    const double QQ[] = {1.00000000000000000000E0, 6.43178256118178023184E1, 8.56430025976980587198E2,
+                         3.88240183605401609683E3, 7.24046774195652478189E3, 5.93072701187316984827E3,
+                         2.06209331660327847417E3, 2.42005740240291393179E2};
+    const double YP[] = {1.55924367855235737965E4,   -1.46639295903971606143E7, 5.43526477051876500413E9,
+                         -9.82136065717911466409E11, 8.75906394395366999549E13, -3.46628303384729719441E15,
+                         4.42733268572569800351E16,  -1.84950800436986690637E16};
+    const double YQ[] = {1.00000000000000000000E0,  1.04128353664259848412E3,  6.26107330137134956842E5,
+                         2.68919633393814121987E8,  8.64002487103935000337E10, 2.02979612750105546709E13,
+                         3.17157752842975028269E15, 2.50596256172653059228E17};
+    const T SQ2OPI = pset1<T>(7.9788456080286535587989E-1); /* sqrt(2 / pi) */
+    const T TWOOPI = pset1<T>(0.636619772367581343075535);  /* 2 / pi */
+    const T NEG_PIO4 = pset1<T>(-0.7853981633974483096);    /* -pi / 4 */
+    const T NEG_MAXNUM = pset1<T>(-NumTraits<double>::infinity());
+
+    T z = pmul(x, x);
+    T x_le_five = pdiv(internal::ppolevl<T, 7>::run(z, YP), internal::ppolevl<T, 7>::run(z, YQ));
+    x_le_five = pmadd(pmul(TWOOPI, plog(x)), generic_j0<T, double>::run(x), x_le_five);
+    x_le_five = pselect(pcmp_le(x, pset1<T>(0.0)), NEG_MAXNUM, x_le_five);
+    T s = pdiv(pset1<T>(25.0), z);
+    T p = pdiv(internal::ppolevl<T, 6>::run(s, PP), internal::ppolevl<T, 6>::run(s, PQ));
+    T q = pdiv(internal::ppolevl<T, 7>::run(s, QP), internal::ppolevl<T, 7>::run(s, QQ));
+    T xn = padd(x, NEG_PIO4);
+    T w = pdiv(pset1<T>(5.0), x);
+    p = pmadd(p, psin(xn), pmul(w, pmul(q, pcos(xn))));
+    T x_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(x)));
+    return pselect(pcmp_le(x, pset1<T>(5.0)), x_le_five, x_gt_five);
+  }
+};
+
+template <typename T>
+struct bessel_y0_impl {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T x) { return generic_y0<T>::run(x); }
+};
+
+template <typename T>
+struct bessel_j1_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_j1 {
+  EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T&) { return ScalarType(0); }
+};
+
+template <typename T>
+struct generic_j1<T, float> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* j1f.c
+     *	Bessel function of order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float x, y, j1f();
+     *
+     * y = j1f( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of order one of the argument.
+     *
+     * The domain is divided into the intervals [0, 2] and
+     * (2, infinity). In the first interval a polynomial approximation
+     *        2
+     * (w - r  ) x P(w)
+     *       1
+     *                     2
+     * is used, where w = x  and r is the first zero of the function.
+     *
+     * In the second interval, the modulus and phase are approximated
+     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+     * and Phase(x) = x + 1/x R(1/x^2) - 3pi/4.  The function is
+     *
+     *   j0(x) = Modulus(x) cos( Phase(x) ).
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain      # trials      peak       rms
+     *    IEEE      0,  2       100000       1.2e-7     2.5e-8
+     *    IEEE      2, 32       100000       2.0e-7     5.3e-8
+     *
+     *
+     */
+
+    const float JP[] = {-4.878788132172128E-009f, 6.009061827883699E-007f, -4.541343896997497E-005f,
+                        1.937383947804541E-003f, -3.405537384615824E-002f};
+    const float MO1[] = {6.913942741265801E-002f,  -2.284801500053359E-001f, 3.138238455499697E-001f,
+                         -2.102302420403875E-001f, 5.435364690523026E-003f,  1.493389585089498E-001f,
+                         4.976029650847191E-006f,  7.978845453073848E-001f};
+    const float PH1[] = {-4.497014141919556E+001f, 5.073465654089319E+001f,  -2.485774108720340E+001f,
+                         7.222973196770240E+000f,  -1.544842782180211E+000f, 3.503787691653334E-001f,
+                         -1.637986776941202E-001f, 3.749989509080821E-001f};
+    const T Z1 = pset1<T>(1.46819706421238932572E1f);
+    const T NEG_THPIO4F = pset1<T>(-2.35619449019234492885f); /* -3*pi/4 */
+
+    T y = pabs(x);
+    T z = pmul(y, y);
+    T y_le_two = pmul(psub(z, Z1), pmul(x, internal::ppolevl<T, 4>::run(z, JP)));
+    T q = pdiv(pset1<T>(1.0f), y);
+    T w = prsqrt(y);
+    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO1));
+    w = pmul(q, q);
+    T yn = pmadd(q, internal::ppolevl<T, 7>::run(w, PH1), NEG_THPIO4F);
+    T y_gt_two = pmul(p, pcos(padd(yn, y)));
+    // j1 is an odd function. This implementation differs from cephes to
+    // take this fact in to account. Cephes returns -j1(x) for y > 2 range.
+    y_gt_two = pselect(pcmp_lt(x, pset1<T>(0.0f)), pnegate(y_gt_two), y_gt_two);
+    return pselect(pcmp_le(y, pset1<T>(2.0f)), y_le_two, y_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_j1<T, double> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  j1.c
+     *	Bessel function of order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, j1();
+     *
+     * y = j1( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of order one of the argument.
+     *
+     * The domain is divided into the intervals [0, 8] and
+     * (8, infinity). In the first interval a 24 term Chebyshev
+     * expansion is used. In the second, the asymptotic
+     * trigonometric representation is employed using two
+     * rational functions of degree 5/5.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain      # trials      peak         rms
+     *    DEC       0, 30       10000       4.0e-17     1.1e-17
+     *    IEEE      0, 30       30000       2.6e-16     1.1e-16
+     *
+     */
+    const double PP[] = {7.62125616208173112003E-4, 7.31397056940917570436E-2, 1.12719608129684925192E0,
+                         5.11207951146807644818E0,  8.42404590141772420927E0,  5.21451598682361504063E0,
+                         1.00000000000000000254E0};
+    const double PQ[] = {5.71323128072548699714E-4, 6.88455908754495404082E-2, 1.10514232634061696926E0,
+                         5.07386386128601488557E0,  8.39985554327604159757E0,  5.20982848682361821619E0,
+                         9.99999999999999997461E-1};
+    const double QP[] = {5.10862594750176621635E-2, 4.98213872951233449420E0, 7.58238284132545283818E1,
+                         3.66779609360150777800E2,  7.10856304998926107277E2, 5.97489612400613639965E2,
+                         2.11688757100572135698E2,  2.52070205858023719784E1};
+    const double QQ[] = {1.00000000000000000000E0, 7.42373277035675149943E1, 1.05644886038262816351E3,
+                         4.98641058337653607651E3, 9.56231892404756170795E3, 7.99704160447350683650E3,
+                         2.82619278517639096600E3, 3.36093607810698293419E2};
+    const double RP[] = {-8.99971225705559398224E8, 4.52228297998194034323E11, -7.27494245221818276015E13,
+                         3.68295732863852883286E15};
+    const double RQ[] = {1.00000000000000000000E0,  6.20836478118054335476E2,  2.56987256757748830383E5,
+                         8.35146791431949253037E7,  2.21511595479792499675E10, 4.74914122079991414898E12,
+                         7.84369607876235854894E14, 8.95222336184627338078E16, 5.32278620332680085395E18};
+    const T Z1 = pset1<T>(1.46819706421238932572E1);
+    const T Z2 = pset1<T>(4.92184563216946036703E1);
+    const T NEG_THPIO4 = pset1<T>(-2.35619449019234492885); /* -3*pi/4 */
+    const T SQ2OPI = pset1<T>(7.9788456080286535587989E-1); /* sqrt(2 / pi) */
+    T y = pabs(x);
+    T z = pmul(y, y);
+    T y_le_five = pdiv(internal::ppolevl<T, 3>::run(z, RP), internal::ppolevl<T, 8>::run(z, RQ));
+    y_le_five = pmul(pmul(pmul(y_le_five, x), psub(z, Z1)), psub(z, Z2));
+    T s = pdiv(pset1<T>(25.0), z);
+    T p = pdiv(internal::ppolevl<T, 6>::run(s, PP), internal::ppolevl<T, 6>::run(s, PQ));
+    T q = pdiv(internal::ppolevl<T, 7>::run(s, QP), internal::ppolevl<T, 7>::run(s, QQ));
+    T yn = padd(y, NEG_THPIO4);
+    T w = pdiv(pset1<T>(-5.0), y);
+    p = pmadd(p, pcos(yn), pmul(w, pmul(q, psin(yn))));
+    T y_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(y)));
+    // j1 is an odd function. This implementation differs from cephes to
+    // take this fact in to account. Cephes returns -j1(x) for y > 5 range.
+    y_gt_five = pselect(pcmp_lt(x, pset1<T>(0.0)), pnegate(y_gt_five), y_gt_five);
+    return pselect(pcmp_le(y, pset1<T>(5.0)), y_le_five, y_gt_five);
+  }
+};
+
+template <typename T>
+struct bessel_j1_impl {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T x) { return generic_j1<T>::run(x); }
+};
+
+template <typename T>
+struct bessel_y1_retval {
+  typedef T type;
+};
+
+template <typename T, typename ScalarType = typename unpacket_traits<T>::type>
+struct generic_y1 {
+  EIGEN_STATIC_ASSERT((internal::is_same<T, T>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T&) { return ScalarType(0); }
+};
+
+template <typename T>
+struct generic_y1<T, float> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /* j1f.c
+     *	Bessel function of second kind of order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, y1();
+     *
+     * y = y1( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of the second kind of order one
+     * of the argument.
+     *
+     * The domain is divided into the intervals [0, 2] and
+     * (2, infinity). In the first interval a rational approximation
+     * R(x) is employed to compute
+     *
+     *                  2
+     * y0(x)  =  (w - r  ) x R(x^2)  +  2/pi (ln(x) j1(x) - 1/x) .
+     *                 1
+     *
+     * Thus a call to j1() is required.
+     *
+     * In the second interval, the modulus and phase are approximated
+     * by polynomials of the form Modulus(x) = sqrt(1/x) Q(1/x)
+     * and Phase(x) = x + 1/x S(1/x^2) - 3pi/4.  Then the function is
+     *
+     *   y0(x) = Modulus(x) sin( Phase(x) ).
+     *
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain      # trials      peak         rms
+     *    IEEE      0,  2       100000       2.2e-7     4.6e-8
+     *    IEEE      2, 32       100000       1.9e-7     5.3e-8
+     *
+     * (error criterion relative when |y1| > 1).
+     *
+     */
+
+    const float YP[] = {8.061978323326852E-009f, -9.496460629917016E-007f, 6.719543806674249E-005f,
+                        -2.641785726447862E-003f, 4.202369946500099E-002f};
+    const float MO1[] = {6.913942741265801E-002f,  -2.284801500053359E-001f, 3.138238455499697E-001f,
+                         -2.102302420403875E-001f, 5.435364690523026E-003f,  1.493389585089498E-001f,
+                         4.976029650847191E-006f,  7.978845453073848E-001f};
+    const float PH1[] = {-4.497014141919556E+001f, 5.073465654089319E+001f,  -2.485774108720340E+001f,
+                         7.222973196770240E+000f,  -1.544842782180211E+000f, 3.503787691653334E-001f,
+                         -1.637986776941202E-001f, 3.749989509080821E-001f};
+    const T YO1 = pset1<T>(4.66539330185668857532f);
+    const T NEG_THPIO4F = pset1<T>(-2.35619449019234492885f); /* -3*pi/4 */
+    const T TWOOPI = pset1<T>(0.636619772367581343075535f);   /* 2/pi */
+    const T NEG_MAXNUM = pset1<T>(-NumTraits<float>::infinity());
+
+    T z = pmul(x, x);
+    T x_le_two = pmul(psub(z, YO1), internal::ppolevl<T, 4>::run(z, YP));
+    x_le_two = pmadd(x_le_two, x, pmul(TWOOPI, pmadd(generic_j1<T, float>::run(x), plog(x), pdiv(pset1<T>(-1.0f), x))));
+    x_le_two = pselect(pcmp_lt(x, pset1<T>(0.0f)), NEG_MAXNUM, x_le_two);
+
+    T q = pdiv(pset1<T>(1.0), x);
+    T w = prsqrt(x);
+    T p = pmul(w, internal::ppolevl<T, 7>::run(q, MO1));
+    w = pmul(q, q);
+    T xn = pmadd(q, internal::ppolevl<T, 7>::run(w, PH1), NEG_THPIO4F);
+    T x_gt_two = pmul(p, psin(padd(xn, x)));
+    return pselect(pcmp_le(x, pset1<T>(2.0)), x_le_two, x_gt_two);
+  }
+};
+
+template <typename T>
+struct generic_y1<T, double> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) {
+    /*  j1.c
+     *	Bessel function of second kind of order one
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, y1();
+     *
+     * y = y1( x );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns Bessel function of the second kind of order one
+     * of the argument.
+     *
+     * The domain is divided into the intervals [0, 8] and
+     * (8, infinity). In the first interval a 25 term Chebyshev
+     * expansion is used, and a call to j1() is required.
+     * In the second, the asymptotic trigonometric representation
+     * is employed using two rational functions of degree 5/5.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     *                      Absolute error:
+     * arithmetic   domain      # trials      peak         rms
+     *    DEC       0, 30       10000       8.6e-17     1.3e-17
+     *    IEEE      0, 30       30000       1.0e-15     1.3e-16
+     *
+     * (error criterion relative when |y1| > 1).
+     *
+     */
+    const double PP[] = {7.62125616208173112003E-4, 7.31397056940917570436E-2, 1.12719608129684925192E0,
+                         5.11207951146807644818E0,  8.42404590141772420927E0,  5.21451598682361504063E0,
+                         1.00000000000000000254E0};
+    const double PQ[] = {5.71323128072548699714E-4, 6.88455908754495404082E-2, 1.10514232634061696926E0,
+                         5.07386386128601488557E0,  8.39985554327604159757E0,  5.20982848682361821619E0,
+                         9.99999999999999997461E-1};
+    const double QP[] = {5.10862594750176621635E-2, 4.98213872951233449420E0, 7.58238284132545283818E1,
+                         3.66779609360150777800E2,  7.10856304998926107277E2, 5.97489612400613639965E2,
+                         2.11688757100572135698E2,  2.52070205858023719784E1};
+    const double QQ[] = {1.00000000000000000000E0, 7.42373277035675149943E1, 1.05644886038262816351E3,
+                         4.98641058337653607651E3, 9.56231892404756170795E3, 7.99704160447350683650E3,
+                         2.82619278517639096600E3, 3.36093607810698293419E2};
+    const double YP[] = {1.26320474790178026440E9,   -6.47355876379160291031E11, 1.14509511541823727583E14,
+                         -8.12770255501325109621E15, 2.02439475713594898196E17,  -7.78877196265950026825E17};
+    const double YQ[] = {1.00000000000000000000E0,  5.94301592346128195359E2,  2.35564092943068577943E5,
+                         7.34811944459721705660E7,  1.87601316108706159478E10, 3.88231277496238566008E12,
+                         6.20557727146953693363E14, 6.87141087355300489866E16, 3.97270608116560655612E18};
+    const T SQ2OPI = pset1<T>(.79788456080286535588);
+    const T NEG_THPIO4 = pset1<T>(-2.35619449019234492885); /* -3*pi/4 */
+    const T TWOOPI = pset1<T>(0.636619772367581343075535);  /* 2/pi */
+    const T NEG_MAXNUM = pset1<T>(-NumTraits<double>::infinity());
+
+    T z = pmul(x, x);
+    T x_le_five = pdiv(internal::ppolevl<T, 5>::run(z, YP), internal::ppolevl<T, 8>::run(z, YQ));
+    x_le_five =
+        pmadd(x_le_five, x, pmul(TWOOPI, pmadd(generic_j1<T, double>::run(x), plog(x), pdiv(pset1<T>(-1.0), x))));
+
+    x_le_five = pselect(pcmp_le(x, pset1<T>(0.0)), NEG_MAXNUM, x_le_five);
+    T s = pdiv(pset1<T>(25.0), z);
+    T p = pdiv(internal::ppolevl<T, 6>::run(s, PP), internal::ppolevl<T, 6>::run(s, PQ));
+    T q = pdiv(internal::ppolevl<T, 7>::run(s, QP), internal::ppolevl<T, 7>::run(s, QQ));
+    T xn = padd(x, NEG_THPIO4);
+    T w = pdiv(pset1<T>(5.0), x);
+    p = pmadd(p, psin(xn), pmul(w, pmul(q, pcos(xn))));
+    T x_gt_five = pmul(p, pmul(SQ2OPI, prsqrt(x)));
+    return pselect(pcmp_le(x, pset1<T>(5.0)), x_le_five, x_gt_five);
+  }
+};
+
+template <typename T>
+struct bessel_y1_impl {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T x) { return generic_y1<T>::run(x); }
+};
+
+}  // end namespace internal
+
+namespace numext {
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i0, Scalar) bessel_i0(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_i0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i0e, Scalar) bessel_i0e(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_i0e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i1, Scalar) bessel_i1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_i1, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_i1e, Scalar) bessel_i1e(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_i1e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k0, Scalar) bessel_k0(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_k0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k0e, Scalar) bessel_k0e(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_k0e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k1, Scalar) bessel_k1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_k1, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_k1e, Scalar) bessel_k1e(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_k1e, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_j0, Scalar) bessel_j0(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_j0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_y0, Scalar) bessel_y0(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_y0, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_j1, Scalar) bessel_j1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_j1, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(bessel_y1, Scalar) bessel_y1(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(bessel_y1, Scalar)::run(x);
+}
+
+}  // end namespace numext
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_BESSEL_FUNCTIONS_H
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h
new file mode 100644
index 00000000..1c325fc4
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/BesselFunctionsPacketMath.h
@@ -0,0 +1,108 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_BESSELFUNCTIONS_PACKETMATH_H
+#define EIGEN_BESSELFUNCTIONS_PACKETMATH_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero i0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pbessel_i0(const Packet& x) {
+  return numext::bessel_i0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero i0e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pbessel_i0e(const Packet& x) {
+  return numext::bessel_i0e(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one i1(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pbessel_i1(const Packet& x) {
+  return numext::bessel_i1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one i1e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pbessel_i1e(const Packet& x) {
+  return numext::bessel_i1e(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero j0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pbessel_j0(const Packet& x) {
+  return numext::bessel_j0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero j1(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pbessel_j1(const Packet& x) {
+  return numext::bessel_j1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one y0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pbessel_y0(const Packet& x) {
+  return numext::bessel_y0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one y1(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pbessel_y1(const Packet& x) {
+  return numext::bessel_y1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero k0(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pbessel_k0(const Packet& x) {
+  return numext::bessel_k0(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order zero k0e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pbessel_k0e(const Packet& x) {
+  return numext::bessel_k0e(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one k1e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pbessel_k1(const Packet& x) {
+  return numext::bessel_k1(x);
+}
+
+/** \internal \returns the exponentially scaled modified Bessel function of
+ * order one k1e(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pbessel_k1e(const Packet& x) {
+  return numext::bessel_k1e(x);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_BESSELFUNCTIONS_PACKETMATH_H
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h
new file mode 100644
index 00000000..9119335a
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/HipVectorCompatibility.h
@@ -0,0 +1,71 @@
+#ifndef HIP_VECTOR_COMPATIBILITY_H
+#define HIP_VECTOR_COMPATIBILITY_H
+
+namespace hip_impl {
+template <typename, typename, unsigned int>
+struct Scalar_accessor;
+}  // end namespace hip_impl
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+#define HIP_SCALAR_ACCESSOR_BUILDER(NAME)           \
+  template <typename T, typename U, unsigned int n> \
+  struct NAME<hip_impl::Scalar_accessor<T, U, n>> : NAME<T> {};
+
+#define HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(NAME)                              \
+  template <typename T, typename U, unsigned int n>                           \
+  struct NAME##_impl<hip_impl::Scalar_accessor<T, U, n>> : NAME##_impl<T> {}; \
+  template <typename T, typename U, unsigned int n>                           \
+  struct NAME##_retval<hip_impl::Scalar_accessor<T, U, n>> : NAME##_retval<T> {};
+
+#define HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(NAME)                                \
+  template <typename T, typename U, unsigned int n, IgammaComputationMode mode> \
+  struct NAME<hip_impl::Scalar_accessor<T, U, n>, mode> : NAME<T, mode> {};
+
+#if EIGEN_HAS_C99_MATH
+HIP_SCALAR_ACCESSOR_BUILDER(betainc_helper)
+HIP_SCALAR_ACCESSOR_BUILDER(incbeta_cfe)
+
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(erf)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(erfc)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igammac)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(lgamma)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(ndtri)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(polygamma)
+
+HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igamma_generic_impl)
+#endif
+
+HIP_SCALAR_ACCESSOR_BUILDER(digamma_impl_maybe_poly)
+HIP_SCALAR_ACCESSOR_BUILDER(zeta_impl_series)
+
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i0e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_i1e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_j0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_j1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k0e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_k1e)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_y0)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(bessel_y1)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(betainc)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(digamma)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(gamma_sample_der_alpha)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igamma_der_a)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(igamma)
+HIP_SCALAR_ACCESSOR_BUILDER_RETVAL(zeta)
+
+HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igamma_series_impl)
+HIP_SCALAR_ACCESSOR_BUILDER_IGAMMA(igammac_cf_impl)
+
+}  // end namespace internal
+}  // end namespace Eigen
+
+#endif  // HIP_VECTOR_COMPATIBILITY_H
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/InternalHeaderCheck.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/InternalHeaderCheck.h
new file mode 100644
index 00000000..a5ef51a8
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/InternalHeaderCheck.h
@@ -0,0 +1,4 @@
+#ifndef EIGEN_SPECIALFUNCTIONS_MODULE_H
+#error \
+    "Please include unsupported/Eigen/SpecialFunctions instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
new file mode 100644
index 00000000..0920d274
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h
@@ -0,0 +1,159 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
+#define EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+/** \cpp11 \returns an expression of the coefficient-wise igamma(\a a, \a x) to the given arrays.
+ *
+ * This function computes the coefficient-wise incomplete gamma function.
+ *
+ * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations of igammac(T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::igammac(), Eigen::lgamma()
+ */
+template <typename Derived, typename ExponentDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>,
+                                               const Derived, const ExponentDerived>
+igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) {
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived,
+                              const ExponentDerived>(a.derived(), x.derived());
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise igamma_der_a(\a a, \a x) to the given arrays.
+ *
+ * This function computes the coefficient-wise derivative of the incomplete
+ * gamma function with respect to the parameter a.
+ *
+ * \note This function supports only float and double scalar types in c++11
+ * mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations
+ * of igamma_der_a(T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::igamma(), Eigen::lgamma()
+ */
+template <typename Derived, typename ExponentDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_der_a_op<typename Derived::Scalar>,
+                                               const Derived, const ExponentDerived>
+igamma_der_a(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) {
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_der_a_op<typename Derived::Scalar>, const Derived,
+                              const ExponentDerived>(a.derived(), x.derived());
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise gamma_sample_der_alpha(\a alpha, \a sample) to the given
+ * arrays.
+ *
+ * This function computes the coefficient-wise derivative of the sample
+ * of a Gamma(alpha, 1) random variable with respect to the parameter alpha.
+ *
+ * \note This function supports only float and double scalar types in c++11
+ * mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations
+ * of gamma_sample_der_alpha(T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::igamma(), Eigen::lgamma()
+ */
+template <typename AlphaDerived, typename SampleDerived>
+EIGEN_STRONG_INLINE const
+    Eigen::CwiseBinaryOp<Eigen::internal::scalar_gamma_sample_der_alpha_op<typename AlphaDerived::Scalar>,
+                         const AlphaDerived, const SampleDerived>
+    gamma_sample_der_alpha(const Eigen::ArrayBase<AlphaDerived>& alpha, const Eigen::ArrayBase<SampleDerived>& sample) {
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_gamma_sample_der_alpha_op<typename AlphaDerived::Scalar>,
+                              const AlphaDerived, const SampleDerived>(alpha.derived(), sample.derived());
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise igammac(\a a, \a x) to the given arrays.
+ *
+ * This function computes the coefficient-wise complementary incomplete gamma function.
+ *
+ * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations of igammac(T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::igamma(), Eigen::lgamma()
+ */
+template <typename Derived, typename ExponentDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>,
+                                               const Derived, const ExponentDerived>
+igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) {
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived,
+                              const ExponentDerived>(a.derived(), x.derived());
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise polygamma(\a n, \a x) to the given arrays.
+ *
+ * It returns the \a n -th derivative of the digamma(psi) evaluated at \c x.
+ *
+ * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations of polygamma(T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::digamma()
+ */
+// * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x)
+// * \sa ArrayBase::polygamma()
+template <typename DerivedN, typename DerivedX>
+EIGEN_STRONG_INLINE const Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>,
+                                               const DerivedN, const DerivedX>
+polygamma(const Eigen::ArrayBase<DerivedN>& n, const Eigen::ArrayBase<DerivedX>& x) {
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN,
+                              const DerivedX>(n.derived(), x.derived());
+}
+
+/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given arrays.
+ *
+ * This function computes the regularized incomplete beta function (integral).
+ *
+ * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types,
+ * or float/double in non c++11 mode, the user has to provide implementations of betainc(T,T,T) for any scalar
+ * type T to be supported.
+ *
+ * \sa Eigen::betainc(), Eigen::lgamma()
+ */
+template <typename ArgADerived, typename ArgBDerived, typename ArgXDerived>
+EIGEN_STRONG_INLINE const Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>,
+                                                const ArgADerived, const ArgBDerived, const ArgXDerived>
+betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDerived>& b,
+        const Eigen::ArrayBase<ArgXDerived>& x) {
+  return Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived,
+                               const ArgBDerived, const ArgXDerived>(a.derived(), b.derived(), x.derived());
+}
+
+/** \returns an expression of the coefficient-wise zeta(\a x, \a q) to the given arrays.
+ *
+ * It returns the Riemann zeta function of two arguments \a x and \a q:
+ *
+ * \param x is the exponent, it must be > 1
+ * \param q is the shift, it must be > 0
+ *
+ * \note This function supports only float and double scalar types. To support other scalar types, the user has
+ * to provide implementations of zeta(T,T) for any scalar type T to be supported.
+ *
+ * \sa ArrayBase::zeta()
+ */
+template <typename DerivedX, typename DerivedQ>
+EIGEN_STRONG_INLINE const
+    Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>
+    zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q) {
+  return Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX,
+                              const DerivedQ>(x.derived(), q.derived());
+}
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h
new file mode 100644
index 00000000..90babaea
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsBFloat16.h
@@ -0,0 +1,73 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_BFLOAT16_H
+#define EIGEN_SPECIALFUNCTIONS_BFLOAT16_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 lgamma(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::lgamma(static_cast<float>(a)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 digamma(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::digamma(static_cast<float>(a)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 zeta(const Eigen::bfloat16& x, const Eigen::bfloat16& q) {
+  return Eigen::bfloat16(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 polygamma(const Eigen::bfloat16& n, const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 erf(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::erf(static_cast<float>(a)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 erfc(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::erfc(static_cast<float>(a)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 ndtri(const Eigen::bfloat16& a) {
+  return Eigen::bfloat16(Eigen::numext::ndtri(static_cast<float>(a)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 igamma(const Eigen::bfloat16& a, const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 igamma_der_a(const Eigen::bfloat16& a, const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::igamma_der_a(static_cast<float>(a), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 gamma_sample_der_alpha(const Eigen::bfloat16& alpha,
+                                                                             const Eigen::bfloat16& sample) {
+  return Eigen::bfloat16(Eigen::numext::gamma_sample_der_alpha(static_cast<float>(alpha), static_cast<float>(sample)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 igammac(const Eigen::bfloat16& a, const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::bfloat16 betainc(const Eigen::bfloat16& a, const Eigen::bfloat16& b,
+                                                              const Eigen::bfloat16& x) {
+  return Eigen::bfloat16(Eigen::numext::betainc(static_cast<float>(a), static_cast<float>(b), static_cast<float>(x)));
+}
+#endif
+
+}  // end namespace numext
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPECIALFUNCTIONS_BFLOAT16_H
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
new file mode 100644
index 00000000..483c9ae0
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h
@@ -0,0 +1,326 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_FUNCTORS_H
+#define EIGEN_SPECIALFUNCTIONS_FUNCTORS_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal
+ * \brief Template functor to compute the incomplete gamma function igamma(a, x)
+ *
+ * \sa class CwiseBinaryOp, Cwise::igamma
+ */
+template <typename Scalar>
+struct scalar_igamma_op : binary_op_base<Scalar, Scalar> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a, const Scalar& x) const {
+    using numext::igamma;
+    return igamma(a, x);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const {
+    return internal::pigamma(a, x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_igamma_op<Scalar> > {
+  enum {
+    // Guesstimate
+    Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasIGamma
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the derivative of the incomplete gamma
+ * function igamma_der_a(a, x)
+ *
+ * \sa class CwiseBinaryOp, Cwise::igamma_der_a
+ */
+template <typename Scalar>
+struct scalar_igamma_der_a_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a, const Scalar& x) const {
+    using numext::igamma_der_a;
+    return igamma_der_a(a, x);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const {
+    return internal::pigamma_der_a(a, x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_igamma_der_a_op<Scalar> > {
+  enum {
+    // 2x the cost of igamma
+    Cost = 40 * NumTraits<Scalar>::MulCost + 20 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasIGammaDerA
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the derivative of the sample
+ * of a Gamma(alpha, 1) random variable with respect to the parameter alpha
+ * gamma_sample_der_alpha(alpha, sample)
+ *
+ * \sa class CwiseBinaryOp, Cwise::gamma_sample_der_alpha
+ */
+template <typename Scalar>
+struct scalar_gamma_sample_der_alpha_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& alpha, const Scalar& sample) const {
+    using numext::gamma_sample_der_alpha;
+    return gamma_sample_der_alpha(alpha, sample);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& alpha, const Packet& sample) const {
+    return internal::pgamma_sample_der_alpha(alpha, sample);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_gamma_sample_der_alpha_op<Scalar> > {
+  enum {
+    // 2x the cost of igamma, minus the lgamma cost (the lgamma cancels out)
+    Cost = 30 * NumTraits<Scalar>::MulCost + 15 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasGammaSampleDerAlpha
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x)
+ *
+ * \sa class CwiseBinaryOp, Cwise::igammac
+ */
+template <typename Scalar>
+struct scalar_igammac_op : binary_op_base<Scalar, Scalar> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a, const Scalar& x) const {
+    using numext::igammac;
+    return igammac(a, x);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const {
+    return internal::pigammac(a, x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_igammac_op<Scalar> > {
+  enum {
+    // Guesstimate
+    Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasIGammac
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the incomplete beta integral betainc(a, b, x)
+ *
+ */
+template <typename Scalar>
+struct scalar_betainc_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x, const Scalar& a,
+                                                                const Scalar& b) const {
+    using numext::betainc;
+    return betainc(x, a, b);
+  }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& x, const Packet& a, const Packet& b) const {
+    return internal::pbetainc(x, a, b);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_betainc_op<Scalar> > {
+  enum {
+    // Guesstimate
+    Cost = 400 * NumTraits<Scalar>::MulCost + 400 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasBetaInc
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the natural log of the absolute
+ * value of Gamma of a scalar
+ * \sa class CwiseUnaryOp, Cwise::lgamma()
+ */
+template <typename Scalar>
+struct scalar_lgamma_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const {
+    using numext::lgamma;
+    return lgamma(a);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::plgamma(a); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_lgamma_op<Scalar> > {
+  enum {
+    // Guesstimate
+    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasLGamma
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute psi, the derivative of lgamma of a scalar.
+ * \sa class CwiseUnaryOp, Cwise::digamma()
+ */
+template <typename Scalar>
+struct scalar_digamma_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const {
+    using numext::digamma;
+    return digamma(a);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::pdigamma(a); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_digamma_op<Scalar> > {
+  enum {
+    // Guesstimate
+    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasDiGamma
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Riemann Zeta function of two arguments.
+ * \sa class CwiseUnaryOp, Cwise::zeta()
+ */
+template <typename Scalar>
+struct scalar_zeta_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& x, const Scalar& q) const {
+    using numext::zeta;
+    return zeta(x, q);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x, const Packet& q) const {
+    return internal::pzeta(x, q);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_zeta_op<Scalar> > {
+  enum {
+    // Guesstimate
+    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasZeta
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the polygamma function.
+ * \sa class CwiseUnaryOp, Cwise::polygamma()
+ */
+template <typename Scalar>
+struct scalar_polygamma_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& n, const Scalar& x) const {
+    using numext::polygamma;
+    return polygamma(n, x);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& n, const Packet& x) const {
+    return internal::ppolygamma(n, x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_polygamma_op<Scalar> > {
+  enum {
+    // Guesstimate
+    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasPolygamma
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the error function of a scalar
+ * \sa class CwiseUnaryOp, ArrayBase::erf()
+ */
+template <typename Scalar>
+struct scalar_erf_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const { return numext::erf(a); }
+  template <typename Packet>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const {
+    return perf(x);
+  }
+};
+template <typename Scalar>
+struct functor_traits<scalar_erf_op<Scalar> > {
+  enum {
+    PacketAccess = packet_traits<Scalar>::HasErf,
+    Cost = (PacketAccess
+#ifdef EIGEN_VECTORIZE_FMA
+                // TODO(rmlarsen): Move the FMA cost model to a central location.
+                // Haswell can issue 2 add/mul/madd per cycle.
+                // 10 pmadd, 2 pmul, 1 div, 2 other
+                ? (2 * NumTraits<Scalar>::AddCost + 7 * NumTraits<Scalar>::MulCost +
+                   scalar_div_cost<Scalar, packet_traits<Scalar>::HasDiv>::value)
+#else
+                ? (12 * NumTraits<Scalar>::AddCost + 12 * NumTraits<Scalar>::MulCost +
+                   scalar_div_cost<Scalar, packet_traits<Scalar>::HasDiv>::value)
+#endif
+                // Assume for simplicity that this is as expensive as an exp().
+                : (functor_traits<scalar_exp_op<Scalar> >::Cost))
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Complementary Error Function
+ * of a scalar
+ * \sa class CwiseUnaryOp, Cwise::erfc()
+ */
+template <typename Scalar>
+struct scalar_erfc_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const {
+    using numext::erfc;
+    return erfc(a);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::perfc(a); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_erfc_op<Scalar> > {
+  enum {
+    // Guesstimate
+    Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasErfc
+  };
+};
+
+/** \internal
+ * \brief Template functor to compute the Inverse of the normal distribution
+ * function of a scalar
+ * \sa class CwiseUnaryOp, Cwise::ndtri()
+ */
+template <typename Scalar>
+struct scalar_ndtri_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(const Scalar& a) const {
+    using numext::ndtri;
+    return ndtri(a);
+  }
+  typedef typename packet_traits<Scalar>::type Packet;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& a) const { return internal::pndtri(a); }
+};
+template <typename Scalar>
+struct functor_traits<scalar_ndtri_op<Scalar> > {
+  enum {
+    // On average, We are evaluating rational functions with degree N=9 in the
+    // numerator and denominator. This results in 2*N additions and 2*N
+    // multiplications.
+    Cost = 18 * NumTraits<Scalar>::MulCost + 18 * NumTraits<Scalar>::AddCost,
+    PacketAccess = packet_traits<Scalar>::HasNdtri
+  };
+};
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPECIALFUNCTIONS_FUNCTORS_H
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
new file mode 100644
index 00000000..baba8481
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h
@@ -0,0 +1,73 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_HALF_H
+#define EIGEN_SPECIALFUNCTIONS_HALF_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace numext {
+
+#if EIGEN_HAS_C99_MATH
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) {
+  return Eigen::half(Eigen::numext::lgamma(static_cast<float>(a)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(const Eigen::half& a) {
+  return Eigen::half(Eigen::numext::digamma(static_cast<float>(a)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x, const Eigen::half& q) {
+  return Eigen::half(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(const Eigen::half& n, const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) {
+  return Eigen::half(Eigen::numext::erf(static_cast<float>(a)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) {
+  return Eigen::half(Eigen::numext::erfc(static_cast<float>(a)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half ndtri(const Eigen::half& a) {
+  return Eigen::half(Eigen::numext::ndtri(static_cast<float>(a)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma_der_a(const Eigen::half& a, const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::igamma_der_a(static_cast<float>(a), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half gamma_sample_der_alpha(const Eigen::half& alpha,
+                                                                         const Eigen::half& sample) {
+  return Eigen::half(Eigen::numext::gamma_sample_der_alpha(static_cast<float>(alpha), static_cast<float>(sample)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x)));
+}
+template <>
+EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half betainc(const Eigen::half& a, const Eigen::half& b,
+                                                          const Eigen::half& x) {
+  return Eigen::half(Eigen::numext::betainc(static_cast<float>(a), static_cast<float>(b), static_cast<float>(x)));
+}
+#endif
+
+}  // end namespace numext
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPECIALFUNCTIONS_HALF_H
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
new file mode 100644
index 00000000..387836b7
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h
@@ -0,0 +1,2073 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIAL_FUNCTIONS_H
+#define EIGEN_SPECIAL_FUNCTIONS_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+namespace internal {
+
+//  Parts of this code are based on the Cephes Math Library.
+//
+//  Cephes Math Library Release 2.8:  June, 2000
+//  Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier
+//
+//  Permission has been kindly provided by the original author
+//  to incorporate the Cephes software into the Eigen codebase:
+//
+//    From: Stephen Moshier
+//    To: Eugene Brevdo
+//    Subject: Re: Permission to wrap several cephes functions in Eigen
+//
+//    Hello Eugene,
+//
+//    Thank you for writing.
+//
+//    If your licensing is similar to BSD, the formal way that has been
+//    handled is simply to add a statement to the effect that you are incorporating
+//    the Cephes software by permission of the author.
+//
+//    Good luck with your project,
+//    Steve
+
+/****************************************************************************
+ * Implementation of lgamma, requires C++11/C99                             *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct lgamma_impl {
+  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Scalar) { return Scalar(0); }
+};
+
+template <typename Scalar>
+struct lgamma_retval {
+  typedef Scalar type;
+};
+
+#if EIGEN_HAS_C99_MATH
+// Since glibc 2.19
+#if defined(__GLIBC__) && ((__GLIBC__ >= 2 && __GLIBC_MINOR__ >= 19) || __GLIBC__ > 2) && \
+    (defined(_DEFAULT_SOURCE) || defined(_BSD_SOURCE) || defined(_SVID_SOURCE))
+#define EIGEN_HAS_LGAMMA_R
+#endif
+
+// Glibc versions before 2.19
+#if defined(__GLIBC__) && ((__GLIBC__ == 2 && __GLIBC_MINOR__ < 19) || __GLIBC__ < 2) && \
+    (defined(_BSD_SOURCE) || defined(_SVID_SOURCE))
+#define EIGEN_HAS_LGAMMA_R
+#endif
+
+template <>
+struct lgamma_impl<float> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(float x) {
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && defined(EIGEN_HAS_LGAMMA_R) && !defined(__APPLE__)
+    int dummy;
+    return ::lgammaf_r(x, &dummy);
+#elif defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::lgamma(x);
+#else
+    return ::lgammaf(x);
+#endif
+  }
+};
+
+template <>
+struct lgamma_impl<double> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(double x) {
+#if !defined(EIGEN_GPU_COMPILE_PHASE) && defined(EIGEN_HAS_LGAMMA_R) && !defined(__APPLE__)
+    int dummy;
+    return ::lgamma_r(x, &dummy);
+#elif defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::lgamma(x);
+#else
+    return ::lgamma(x);
+#endif
+  }
+};
+
+#undef EIGEN_HAS_LGAMMA_R
+#endif
+
+/****************************************************************************
+ * Implementation of digamma (psi), based on Cephes                         *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct digamma_retval {
+  typedef Scalar type;
+};
+
+/*
+ *
+ * Polynomial evaluation helper for the Psi (digamma) function.
+ *
+ * digamma_impl_maybe_poly::run(s) evaluates the asymptotic Psi expansion for
+ * input Scalar s, assuming s is above 10.0.
+ *
+ * If s is above a certain threshold for the given Scalar type, zero
+ * is returned.  Otherwise the polynomial is evaluated with enough
+ * coefficients for results matching Scalar machine precision.
+ *
+ *
+ */
+template <typename Scalar>
+struct digamma_impl_maybe_poly {
+  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Scalar) { return Scalar(0); }
+};
+
+template <>
+struct digamma_impl_maybe_poly<float> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(const float s) {
+    constexpr float A[] = {-4.16666666666666666667E-3f, 3.96825396825396825397E-3f, -8.33333333333333333333E-3f,
+                           8.33333333333333333333E-2f};
+
+    float z;
+    if (s < 1.0e8f) {
+      z = 1.0f / (s * s);
+      return z * internal::ppolevl<float, 3>::run(z, A);
+    } else
+      return 0.0f;
+  }
+};
+
+template <>
+struct digamma_impl_maybe_poly<double> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(const double s) {
+    constexpr double A[] = {8.33333333333333333333E-2,  -2.10927960927960927961E-2, 7.57575757575757575758E-3,
+                            -4.16666666666666666667E-3, 3.96825396825396825397E-3,  -8.33333333333333333333E-3,
+                            8.33333333333333333333E-2};
+
+    double z;
+    if (s < 1.0e17) {
+      z = 1.0 / (s * s);
+      return z * internal::ppolevl<double, 6>::run(z, A);
+    } else
+      return 0.0;
+  }
+};
+
+template <typename Scalar>
+struct digamma_impl {
+  EIGEN_DEVICE_FUNC static Scalar run(Scalar x) {
+    /*
+     *
+     *     Psi (digamma) function (modified for Eigen)
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, y, psi();
+     *
+     * y = psi( x );
+     *
+     *
+     * DESCRIPTION:
+     *
+     *              d      -
+     *   psi(x)  =  -- ln | (x)
+     *              dx
+     *
+     * is the logarithmic derivative of the gamma function.
+     * For integer x,
+     *                   n-1
+     *                    -
+     * psi(n) = -EUL  +   >  1/k.
+     *                    -
+     *                   k=1
+     *
+     * If x is negative, it is transformed to a positive argument by the
+     * reflection formula  psi(1-x) = psi(x) + pi cot(pi x).
+     * For general positive x, the argument is made greater than 10
+     * using the recurrence  psi(x+1) = psi(x) + 1/x.
+     * Then the following asymptotic expansion is applied:
+     *
+     *                           inf.   B
+     *                            -      2k
+     * psi(x) = log(x) - 1/2x -   >   -------
+     *                            -        2k
+     *                           k=1   2k x
+     *
+     * where the B2k are Bernoulli numbers.
+     *
+     * ACCURACY (float):
+     *    Relative error (except absolute when |psi| < 1):
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,30        30000       1.3e-15     1.4e-16
+     *    IEEE      -30,0       40000       1.5e-15     2.2e-16
+     *
+     * ACCURACY (double):
+     *    Absolute error,  relative when |psi| > 1 :
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      -33,0        30000      8.2e-7      1.2e-7
+     *    IEEE      0,33        100000      7.3e-7      7.7e-8
+     *
+     * ERROR MESSAGES:
+     *     message         condition      value returned
+     * psi singularity    x integer <=0      INFINITY
+     */
+
+    Scalar p, q, nz, s, w, y;
+    bool negative = false;
+
+    const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+    const Scalar m_pi = Scalar(EIGEN_PI);
+
+    const Scalar zero = Scalar(0);
+    const Scalar one = Scalar(1);
+    const Scalar half = Scalar(0.5);
+    nz = zero;
+
+    if (x <= zero) {
+      negative = true;
+      q = x;
+      p = numext::floor(q);
+      if (p == q) {
+        return nan;
+      }
+      /* Remove the zeros of tan(m_pi x)
+       * by subtracting the nearest integer from x
+       */
+      nz = q - p;
+      if (nz != half) {
+        if (nz > half) {
+          p += one;
+          nz = q - p;
+        }
+        nz = m_pi / numext::tan(m_pi * nz);
+      } else {
+        nz = zero;
+      }
+      x = one - x;
+    }
+
+    /* use the recurrence psi(x+1) = psi(x) + 1/x. */
+    s = x;
+    w = zero;
+    while (s < Scalar(10)) {
+      w += one / s;
+      s += one;
+    }
+
+    y = digamma_impl_maybe_poly<Scalar>::run(s);
+
+    y = numext::log(s) - (half / s) - y - w;
+
+    return (negative) ? y - nz : y;
+  }
+};
+
+/***************************************************************************
+ * Implementation of erfc.
+ ****************************************************************************/
+template <typename Scalar>
+struct generic_fast_erfc {
+  template <typename T>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T run(const T& x_in);
+};
+
+template <>
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc<float>::run(const T& x_in) {
+  constexpr float kClamp = 11.0f;
+  const T x = pmin(pmax(x_in, pset1<T>(-kClamp)), pset1<T>(kClamp));
+
+  // erfc(x) = 1 + x * S(x^2), |x| <= 1.
+  //
+  // Coefficients for S and T generated with Rminimax command:
+  // ./ratapprox --function="erfc(x)-1" --dom='[-1,1]' --type=[11,0] --num="odd"
+  //   --numF="[SG]" --denF="[SG]" --log --dispCoeff="dec"
+  constexpr float alpha[] = {5.61802298761904239654541015625e-04, -4.91381669417023658752441406250e-03,
+                             2.67075151205062866210937500000e-02, -1.12800106406211853027343750000e-01,
+                             3.76122951507568359375000000000e-01, -1.12837910652160644531250000000e+00};
+  const T x2 = pmul(x, x);
+  const T one = pset1<T>(1.0f);
+  const T erfc_small = pmadd(x, ppolevl<T, 5>::run(x2, alpha), one);
+
+  // Return early if we don't need the more expensive approximation for any
+  // entry in a.
+  const T x_abs_gt_one_mask = pcmp_lt(one, x2);
+  if (!predux_any(x_abs_gt_one_mask)) return erfc_small;
+
+  // erfc(x) = exp(-x^2) * 1/x * P(1/x^2) / Q(1/x^2), 1 < x < 9.
+  //
+  // Coefficients for P and Q generated with Rminimax command:
+  //   ./ratapprox --function="erfc(1/sqrt(x))*exp(1/x)/sqrt(x)"
+  //     --dom='[0.01,1]' --type=[3,4] --numF="[SG]" --denF="[SG]" --log
+  //     --dispCoeff="dec"
+  constexpr float gamma[] = {1.0208116471767425537109375e-01f, 4.2920666933059692382812500e-01f,
+                             3.2379078865051269531250000e-01f, 5.3971976041793823242187500e-02f};
+  constexpr float delta[] = {1.7251677811145782470703125e-02f, 3.9137163758277893066406250e-01f,
+                             1.0000000000000000000000000e+00f, 6.2173241376876831054687500e-01f,
+                             9.5662862062454223632812500e-02f};
+  const T x2_lo = twoprod_low(x, x, x2);
+  // Here we use that
+  //   exp(-x^2) = exp(-(x2+x2_lo)^2) ~= exp(-x2)*exp(-x2_lo) ~= exp(-x2)*(1-x2_lo)
+  // since x2_lo < kClamp * eps << 1 in the region we care about. This trick reduces the max error
+  // from 34 ulps to below 5 ulps.
+  const T exp2_hi = pexp(pnegate(x2));
+  const T z = pnmadd(exp2_hi, x2_lo, exp2_hi);
+  const T q2 = preciprocal(x2);
+  const T num = ppolevl<T, 3>::run(q2, gamma);
+  const T denom = pmul(x, ppolevl<T, 4>::run(q2, delta));
+  const T r = pdiv(num, denom);
+  const T maybe_two = pselect(pcmp_lt(x, pset1<T>(0.0f)), pset1<T>(2.0f), pset1<T>(0.0f));
+  const T erfc_large = pmadd(z, r, maybe_two);
+  return pselect(x_abs_gt_one_mask, erfc_large, erfc_small);
+}
+
+// Computes erf(x)/x for |x| <= 1. Used by both erf and erfc implementations.
+// Takes x2 = x^2 as input.
+//
+// PRECONDITION: x2 <= 1.
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T erf_over_x_double_small(const T& x2) {
+  // erf(x)/x =  S(x^2) / T(x^2), x^2 <= 1.
+  //
+  // Coefficients for S and T generated with Rminimax command:
+  //  ./ratapprox --function="erf(x)" --dom='[-1,1]' --type=[9,10]
+  //  --num="odd" --numF="[D]" --den="even" --denF="[D]" --log --dispCoeff="dec"
+  constexpr double alpha[] = {1.9493725660006057018823477644531294572516344487667083740234375e-04,
+                              1.8272566210022942682217328425053892715368419885635375976562500e-03,
+                              4.5303363351690106863856044583371840417385101318359375000000000e-02,
+                              1.4215015503619179981775744181504705920815467834472656250000000e-01,
+                              1.1283791670955125585606992899556644260883331298828125000000000e+00};
+  constexpr double beta[] = {2.0294484101083099089526257108317963684385176748037338256835938e-05,
+                             6.8117805899186819641732970609382391558028757572174072265625000e-04,
+                             1.0582026056098614921752165685120417037978768348693847656250000e-02,
+                             9.3252603143757495374188692949246615171432495117187500000000000e-02,
+                             4.5931062818368939559832142549566924571990966796875000000000000e-01,
+                             1.0};
+  const T num_small = ppolevl<T, 4>::run(x2, alpha);
+  const T denom_small = ppolevl<T, 5>::run(x2, beta);
+  return pdiv(num_small, denom_small);
+}
+
+// erfc(x) = exp(-x^2) * 1/x * P(1/x^2) / Q(1/x^2), 1 < x < 28.
+//
+// Coefficients for P and Q generated with Rminimax command:
+//  ./ratapprox --function="erfc(1/sqrt(x))*exp(1/x)/sqrt(x)"  --dom='[0.0013717,1]' --type=[9,9] --numF="[D]"
+//  --denF="[D]" --log --dispCoeff="dec"
+//
+// PRECONDITION: 1 < x < 28.
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T erfc_double_large(const T& x, const T& x2) {
+  constexpr double gamma[] = {1.5252844933226974316088642158462107545346952974796295166015625e-04,
+                              1.0909912393738931124520519233556115068495273590087890625000000e-02,
+                              1.0628604636755033252537572252549580298364162445068359375000000e-01,
+                              3.3492472973137982217295416376146022230386734008789062500000000e-01,
+                              4.5065776215933289750026347064704168587923049926757812500000000e-01,
+                              2.9433039130294824659017649537418037652969360351562500000000000e-01,
+                              9.8792676360600226170838311645638896152377128601074218750000000e-02,
+                              1.7095935395503719655962981960328761488199234008789062500000000e-02,
+                              1.4249109729504577659398023570247460156679153442382812500000000e-03,
+                              4.4567378313647954771875570045835956989321857690811157226562500e-05};
+  constexpr double delta[] = {2.041985103115789845773520028160419315099716186523437500000000e-03,
+                              5.316030659946043707142493417450168635696172714233398437500000e-02,
+                              3.426242193784684864077405563875799998641014099121093750000000e-01,
+                              8.565637124308049799026321124983951449394226074218750000000000e-01,
+                              1.000000000000000000000000000000000000000000000000000000000000e+00,
+                              5.968805280570776972126623149961233139038085937500000000000000e-01,
+                              1.890922854723317836356244470152887515723705291748046875000000e-01,
+                              3.152505418656005586885981983868987299501895904541015625000000e-02,
+                              2.565085751861882583380047861965067568235099315643310546875000e-03,
+                              7.899362131678837697403017248376499992446042597293853759765625e-05};
+  // Compute exp(-x^2).
+  const T x2_lo = twoprod_low(x, x, x2);
+  // Here we use that
+  //   exp(-x^2) = exp(-(x2+x2_lo)^2) ~= exp(-x2)*exp(-x2_lo) ~= exp(-x2)*(1-x2_lo)
+  // since x2_lo < kClamp *eps << 1 in the region we care about. This trick reduces the max error
+  // from 258 ulps to below 7 ulps.
+  const T exp2_hi = pexp(pnegate(x2));
+  const T z = pnmadd(exp2_hi, x2_lo, exp2_hi);
+  // Compute r = P / Q.
+  const T q2 = preciprocal(x2);
+  const T num_large = ppolevl<T, 9>::run(q2, gamma);
+  const T denom_large = pmul(x, ppolevl<T, 9>::run(q2, delta));
+  const T r = pdiv(num_large, denom_large);
+  const T maybe_two = pselect(pcmp_lt(x, pset1<T>(0.0)), pset1<T>(2.0), pset1<T>(0.0));
+  return pmadd(z, r, maybe_two);
+}
+
+template <>
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erfc<double>::run(const T& x_in) {
+  // Clamp x to [-28:28] beyond which erfc(x) is either two or zero (below the underflow threshold).
+  // This avoids having to deal with twoprod(x,x) producing NaN for sufficiently large x.
+  constexpr double kClamp = 28.0;
+  const T x = pmin(pmax(x_in, pset1<T>(-kClamp)), pset1<T>(kClamp));
+
+  // For |x| < 1, we use erfc(x) = 1 - erf(x).
+  const T x2 = pmul(x, x);
+  const T one = pset1<T>(1.0);
+  const T erfc_small = pnmadd(x, erf_over_x_double_small(x2), one);
+
+  // Return early if we don't need the more expensive approximation for any
+  // entry in a.
+  const T x_abs_gt_one_mask = pcmp_lt(one, x2);
+  if (!predux_any(x_abs_gt_one_mask)) return erfc_small;
+
+  const T erfc_large = erfc_double_large(x, x2);
+  return pselect(x_abs_gt_one_mask, erfc_large, erfc_small);
+}
+
+template <typename T>
+struct erfc_impl {
+  typedef typename unpacket_traits<T>::type Scalar;
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) { return generic_fast_erfc<Scalar>::run(x); }
+};
+
+template <typename Scalar>
+struct erfc_retval {
+  typedef Scalar type;
+};
+
+#if EIGEN_HAS_C99_MATH
+template <>
+struct erfc_impl<float> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(const float x) {
+#if defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::erfc(x);
+#else
+    return generic_fast_erfc<float>::run(x);
+#endif
+  }
+};
+
+template <>
+struct erfc_impl<double> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(const double x) {
+#if defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::erfc(x);
+#else
+    return generic_fast_erfc<double>::run(x);
+#endif
+  }
+};
+#endif  // EIGEN_HAS_C99_MATH
+
+/****************************************************************************
+ * Implementation of erf.
+ ****************************************************************************/
+
+template <typename Scalar>
+struct generic_fast_erf {
+  template <typename T>
+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T run(const T& x_in);
+};
+
+/** \internal \returns the error function of \a a (coeff-wise)
+    This uses a 11/10-degree rational interpolantand is accurate to 3 ulp for
+    normalized floats.
+
+    This implementation works on both scalars and SIMD "packets".
+*/
+template <>
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf<float>::run(const T& x) {
+  // The monomial coefficients of the numerator polynomial (odd).
+  constexpr float alpha[] = {2.123732201653183437883853912353515625e-06f, 2.861979592125862836837768554687500000e-04f,
+                             3.658048342913389205932617187500000000e-03f, 5.243302136659622192382812500000000000e-02f,
+                             1.874160766601562500000000000000000000e-01f, 1.128379106521606445312500000000000000e+00f};
+
+  // The monomial coefficients of the denominator polynomial (even).
+  constexpr float beta[] = {3.89185734093189239501953125000e-05f, 1.14329601638019084930419921875e-03f,
+                            1.47520881146192550659179687500e-02f, 1.12945675849914550781250000000e-01f,
+                            4.99425798654556274414062500000e-01f, 1.0f};
+
+  // Since the polynomials are odd/even, we need x^2.
+  // Since erf(4) == 1 in float, we clamp x^2 to 16 to avoid
+  // computing Inf/Inf below.
+  const T x2 = pmin(pset1<T>(16.0f), pmul(x, x));
+
+  // Evaluate the numerator polynomial p.
+  T p = ppolevl<T, 5>::run(x2, alpha);
+  p = pmul(x, p);
+
+  // Evaluate the denominator polynomial p.
+  T q = ppolevl<T, 5>::run(x2, beta);
+  const T r = pdiv(p, q);
+
+  // Clamp to [-1:1].
+  return pmax(pmin(r, pset1<T>(1.0f)), pset1<T>(-1.0f));
+}
+
+template <>
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_fast_erf<double>::run(const T& x) {
+  T x2 = pmul(x, x);
+  T erf_small = pmul(x, erf_over_x_double_small(x2));
+
+  // Return early if we don't need the more expensive approximation for any
+  // entry in a.
+  const T one = pset1<T>(1.0);
+  const T x_abs_gt_one_mask = pcmp_lt(one, x2);
+  if (!predux_any(x_abs_gt_one_mask)) return erf_small;
+
+  // For |x| > 1, use erf(x) = 1 - erfc(x).
+  const T erf_large = psub(one, erfc_double_large(x, x2));
+  return pselect(x_abs_gt_one_mask, erf_large, erf_small);
+}
+
+template <typename T>
+struct erf_impl {
+  typedef typename unpacket_traits<T>::type Scalar;
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T run(const T& x) { return generic_fast_erf<Scalar>::run(x); }
+};
+
+template <typename Scalar>
+struct erf_retval {
+  typedef Scalar type;
+};
+
+#if EIGEN_HAS_C99_MATH
+template <>
+struct erf_impl<float> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float run(const float x) {
+#if defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::erf(x);
+#else
+    return generic_fast_erf<float>::run(x);
+#endif
+  }
+};
+
+template <>
+struct erf_impl<double> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double run(const double x) {
+#if defined(SYCL_DEVICE_ONLY)
+    return cl::sycl::erf(x);
+#else
+    return generic_fast_erf<double>::run(x);
+#endif
+  }
+};
+#endif  // EIGEN_HAS_C99_MATH
+
+/***************************************************************************
+ * Implementation of ndtri.                                                 *
+ ****************************************************************************/
+
+/* Inverse of Normal distribution function (modified for Eigen).
+ *
+ *
+ * SYNOPSIS:
+ *
+ * double x, y, ndtri();
+ *
+ * x = ndtri( y );
+ *
+ *
+ *
+ * DESCRIPTION:
+ *
+ * Returns the argument, x, for which the area under the
+ * Gaussian probability density function (integrated from
+ * minus infinity to x) is equal to y.
+ *
+ *
+ * For small arguments 0 < y < exp(-2), the program computes
+ * z = sqrt( -2.0 * log(y) );  then the approximation is
+ * x = z - log(z)/z  - (1/z) P(1/z) / Q(1/z).
+ * There are two rational functions P/Q, one for 0 < y < exp(-32)
+ * and the other for y up to exp(-2).  For larger arguments,
+ * w = y - 0.5, and  x/sqrt(2pi) = w + w**3 R(w**2)/S(w**2)).
+ *
+ *
+ * ACCURACY:
+ *
+ *                      Relative error:
+ * arithmetic   domain        # trials      peak         rms
+ *    DEC      0.125, 1         5500       9.5e-17     2.1e-17
+ *    DEC      6e-39, 0.135     3500       5.7e-17     1.3e-17
+ *    IEEE     0.125, 1        20000       7.2e-16     1.3e-16
+ *    IEEE     3e-308, 0.135   50000       4.6e-16     9.8e-17
+ *
+ *
+ * ERROR MESSAGES:
+ *
+ *   message         condition    value returned
+ * ndtri domain       x == 0        -INF
+ * ndtri domain       x == 1         INF
+ * ndtri domain       x < 0, x > 1   NAN
+ */
+/*
+  Cephes Math Library Release 2.2: June, 1992
+  Copyright 1985, 1987, 1992 by Stephen L. Moshier
+  Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+*/
+
+// TODO: Add a cheaper approximation for float.
+
+template <typename T>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T flipsign(const T& should_flipsign, const T& x) {
+  typedef typename unpacket_traits<T>::type Scalar;
+  const T sign_mask = pset1<T>(Scalar(-0.0));
+  T sign_bit = pand<T>(should_flipsign, sign_mask);
+  return pxor<T>(sign_bit, x);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double flipsign<double>(const double& should_flipsign, const double& x) {
+  return should_flipsign == 0 ? x : -x;
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float flipsign<float>(const float& should_flipsign, const float& x) {
+  return should_flipsign == 0 ? x : -x;
+}
+
+// We split this computation in to two so that in the scalar path
+// only one branch is evaluated (due to our template specialization of pselect
+// being an if statement.)
+
+template <typename T, typename ScalarType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_gt_exp_neg_two(const T& b) {
+  const ScalarType p0[] = {ScalarType(-5.99633501014107895267e1), ScalarType(9.80010754185999661536e1),
+                           ScalarType(-5.66762857469070293439e1), ScalarType(1.39312609387279679503e1),
+                           ScalarType(-1.23916583867381258016e0)};
+  const ScalarType q0[] = {ScalarType(1.0),
+                           ScalarType(1.95448858338141759834e0),
+                           ScalarType(4.67627912898881538453e0),
+                           ScalarType(8.63602421390890590575e1),
+                           ScalarType(-2.25462687854119370527e2),
+                           ScalarType(2.00260212380060660359e2),
+                           ScalarType(-8.20372256168333339912e1),
+                           ScalarType(1.59056225126211695515e1),
+                           ScalarType(-1.18331621121330003142e0)};
+  const T sqrt2pi = pset1<T>(ScalarType(2.50662827463100050242e0));
+  const T half = pset1<T>(ScalarType(0.5));
+  T c, c2, ndtri_gt_exp_neg_two;
+
+  c = psub(b, half);
+  c2 = pmul(c, c);
+  ndtri_gt_exp_neg_two =
+      pmadd(c, pmul(c2, pdiv(internal::ppolevl<T, 4>::run(c2, p0), internal::ppolevl<T, 8>::run(c2, q0))), c);
+  return pmul(ndtri_gt_exp_neg_two, sqrt2pi);
+}
+
+template <typename T, typename ScalarType>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T generic_ndtri_lt_exp_neg_two(const T& b, const T& should_flipsign) {
+  /* Approximation for interval z = sqrt(-2 log a ) between 2 and 8
+   * i.e., a between exp(-2) = .135 and exp(-32) = 1.27e-14.
+   */
+  const ScalarType p1[] = {ScalarType(4.05544892305962419923e0),   ScalarType(3.15251094599893866154e1),
+                           ScalarType(5.71628192246421288162e1),   ScalarType(4.40805073893200834700e1),
+                           ScalarType(1.46849561928858024014e1),   ScalarType(2.18663306850790267539e0),
+                           ScalarType(-1.40256079171354495875e-1), ScalarType(-3.50424626827848203418e-2),
+                           ScalarType(-8.57456785154685413611e-4)};
+  const ScalarType q1[] = {ScalarType(1.0),
+                           ScalarType(1.57799883256466749731e1),
+                           ScalarType(4.53907635128879210584e1),
+                           ScalarType(4.13172038254672030440e1),
+                           ScalarType(1.50425385692907503408e1),
+                           ScalarType(2.50464946208309415979e0),
+                           ScalarType(-1.42182922854787788574e-1),
+                           ScalarType(-3.80806407691578277194e-2),
+                           ScalarType(-9.33259480895457427372e-4)};
+  /* Approximation for interval z = sqrt(-2 log a ) between 8 and 64
+   * i.e., a between exp(-32) = 1.27e-14 and exp(-2048) = 3.67e-890.
+   */
+  const ScalarType p2[] = {ScalarType(3.23774891776946035970e0),  ScalarType(6.91522889068984211695e0),
+                           ScalarType(3.93881025292474443415e0),  ScalarType(1.33303460815807542389e0),
+                           ScalarType(2.01485389549179081538e-1), ScalarType(1.23716634817820021358e-2),
+                           ScalarType(3.01581553508235416007e-4), ScalarType(2.65806974686737550832e-6),
+                           ScalarType(6.23974539184983293730e-9)};
+  const ScalarType q2[] = {ScalarType(1.0),
+                           ScalarType(6.02427039364742014255e0),
+                           ScalarType(3.67983563856160859403e0),
+                           ScalarType(1.37702099489081330271e0),
+                           ScalarType(2.16236993594496635890e-1),
+                           ScalarType(1.34204006088543189037e-2),
+                           ScalarType(3.28014464682127739104e-4),
+                           ScalarType(2.89247864745380683936e-6),
+                           ScalarType(6.79019408009981274425e-9)};
+  const T eight = pset1<T>(ScalarType(8.0));
+  const T neg_two = pset1<T>(ScalarType(-2));
+  T x, x0, x1, z;
+
+  x = psqrt(pmul(neg_two, plog(b)));
+  x0 = psub(x, pdiv(plog(x), x));
+  z = preciprocal(x);
+  x1 =
+      pmul(z, pselect(pcmp_lt(x, eight), pdiv(internal::ppolevl<T, 8>::run(z, p1), internal::ppolevl<T, 8>::run(z, q1)),
+                      pdiv(internal::ppolevl<T, 8>::run(z, p2), internal::ppolevl<T, 8>::run(z, q2))));
+  return flipsign(should_flipsign, psub(x0, x1));
+}
+
+template <typename T, typename ScalarType>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE T generic_ndtri(const T& a) {
+  const T maxnum = pset1<T>(NumTraits<ScalarType>::infinity());
+  const T neg_maxnum = pset1<T>(-NumTraits<ScalarType>::infinity());
+
+  const T zero = pset1<T>(ScalarType(0));
+  const T one = pset1<T>(ScalarType(1));
+  // exp(-2)
+  const T exp_neg_two = pset1<T>(ScalarType(0.13533528323661269189));
+  T b, ndtri, should_flipsign;
+
+  should_flipsign = pcmp_le(a, psub(one, exp_neg_two));
+  b = pselect(should_flipsign, a, psub(one, a));
+
+  ndtri = pselect(pcmp_lt(exp_neg_two, b), generic_ndtri_gt_exp_neg_two<T, ScalarType>(b),
+                  generic_ndtri_lt_exp_neg_two<T, ScalarType>(b, should_flipsign));
+
+  return pselect(pcmp_eq(a, zero), neg_maxnum, pselect(pcmp_eq(one, a), maxnum, ndtri));
+}
+
+template <typename Scalar>
+struct ndtri_retval {
+  typedef Scalar type;
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct ndtri_impl {
+  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Scalar) { return Scalar(0); }
+};
+
+#else
+
+template <typename Scalar>
+struct ndtri_impl {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Scalar x) { return generic_ndtri<Scalar, Scalar>(x); }
+};
+
+#endif  // EIGEN_HAS_C99_MATH
+
+/**************************************************************************************************************
+ * Implementation of igammac (complemented incomplete gamma integral), based on Cephes but requires C++11/C99 *
+ **************************************************************************************************************/
+
+template <typename Scalar>
+struct igammac_retval {
+  typedef Scalar type;
+};
+
+// NOTE: cephes_helper is also used to implement zeta
+template <typename Scalar>
+struct cephes_helper {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar machep() {
+    eigen_assert(false && "machep not supported for this type");
+    return 0.0;
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar big() {
+    eigen_assert(false && "big not supported for this type");
+    return 0.0;
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar biginv() {
+    eigen_assert(false && "biginv not supported for this type");
+    return 0.0;
+  }
+};
+
+template <>
+struct cephes_helper<float> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float machep() {
+    return NumTraits<float>::epsilon() / 2;  // 1.0 - machep == 1.0
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float big() {
+    // use epsneg (1.0 - epsneg == 1.0)
+    return 1.0f / (NumTraits<float>::epsilon() / 2);
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float biginv() {
+    // epsneg
+    return machep();
+  }
+};
+
+template <>
+struct cephes_helper<double> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double machep() {
+    return NumTraits<double>::epsilon() / 2;  // 1.0 - machep == 1.0
+  }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double big() { return 1.0 / NumTraits<double>::epsilon(); }
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double biginv() {
+    // inverse of eps
+    return NumTraits<double>::epsilon();
+  }
+};
+
+enum IgammaComputationMode { VALUE, DERIVATIVE, SAMPLE_DERIVATIVE };
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar main_igamma_term(Scalar a, Scalar x) {
+  /* Compute  x**a * exp(-x) / gamma(a)  */
+  Scalar logax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a);
+  if (logax < -numext::log(NumTraits<Scalar>::highest()) ||
+      // Assuming x and a aren't Nan.
+      (numext::isnan)(logax)) {
+    return Scalar(0);
+  }
+  return numext::exp(logax);
+}
+
+template <typename Scalar, IgammaComputationMode mode>
+EIGEN_DEVICE_FUNC int igamma_num_iterations() {
+  /* Returns the maximum number of internal iterations for igamma computation.
+   */
+  if (mode == VALUE) {
+    return 2000;
+  }
+
+  if (internal::is_same<Scalar, float>::value) {
+    return 200;
+  } else if (internal::is_same<Scalar, double>::value) {
+    return 500;
+  } else {
+    return 2000;
+  }
+}
+
+template <typename Scalar, IgammaComputationMode mode>
+struct igammac_cf_impl {
+  /* Computes igamc(a, x) or derivative (depending on the mode)
+   * using the continued fraction expansion of the complementary
+   * incomplete Gamma function.
+   *
+   * Preconditions:
+   *   a > 0
+   *   x >= 1
+   *   x >= a
+   */
+  EIGEN_DEVICE_FUNC static Scalar run(Scalar a, Scalar x) {
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar two = 2;
+    const Scalar machep = cephes_helper<Scalar>::machep();
+    const Scalar big = cephes_helper<Scalar>::big();
+    const Scalar biginv = cephes_helper<Scalar>::biginv();
+
+    if ((numext::isinf)(x)) {
+      return zero;
+    }
+
+    Scalar ax = main_igamma_term<Scalar>(a, x);
+    // This is independent of mode. If this value is zero,
+    // then the function value is zero. If the function value is zero,
+    // then we are in a neighborhood where the function value evaluates to zero,
+    // so the derivative is zero.
+    if (ax == zero) {
+      return zero;
+    }
+
+    // continued fraction
+    Scalar y = one - a;
+    Scalar z = x + y + one;
+    Scalar c = zero;
+    Scalar pkm2 = one;
+    Scalar qkm2 = x;
+    Scalar pkm1 = x + one;
+    Scalar qkm1 = z * x;
+    Scalar ans = pkm1 / qkm1;
+
+    Scalar dpkm2_da = zero;
+    Scalar dqkm2_da = zero;
+    Scalar dpkm1_da = zero;
+    Scalar dqkm1_da = -x;
+    Scalar dans_da = (dpkm1_da - ans * dqkm1_da) / qkm1;
+
+    for (int i = 0; i < igamma_num_iterations<Scalar, mode>(); i++) {
+      c += one;
+      y += one;
+      z += two;
+
+      Scalar yc = y * c;
+      Scalar pk = pkm1 * z - pkm2 * yc;
+      Scalar qk = qkm1 * z - qkm2 * yc;
+
+      Scalar dpk_da = dpkm1_da * z - pkm1 - dpkm2_da * yc + pkm2 * c;
+      Scalar dqk_da = dqkm1_da * z - qkm1 - dqkm2_da * yc + qkm2 * c;
+
+      if (qk != zero) {
+        Scalar ans_prev = ans;
+        ans = pk / qk;
+
+        Scalar dans_da_prev = dans_da;
+        dans_da = (dpk_da - ans * dqk_da) / qk;
+
+        if (mode == VALUE) {
+          if (numext::abs(ans_prev - ans) <= machep * numext::abs(ans)) {
+            break;
+          }
+        } else {
+          if (numext::abs(dans_da - dans_da_prev) <= machep) {
+            break;
+          }
+        }
+      }
+
+      pkm2 = pkm1;
+      pkm1 = pk;
+      qkm2 = qkm1;
+      qkm1 = qk;
+
+      dpkm2_da = dpkm1_da;
+      dpkm1_da = dpk_da;
+      dqkm2_da = dqkm1_da;
+      dqkm1_da = dqk_da;
+
+      if (numext::abs(pk) > big) {
+        pkm2 *= biginv;
+        pkm1 *= biginv;
+        qkm2 *= biginv;
+        qkm1 *= biginv;
+
+        dpkm2_da *= biginv;
+        dpkm1_da *= biginv;
+        dqkm2_da *= biginv;
+        dqkm1_da *= biginv;
+      }
+    }
+
+    /* Compute  x**a * exp(-x) / gamma(a)  */
+    Scalar dlogax_da = numext::log(x) - digamma_impl<Scalar>::run(a);
+    Scalar dax_da = ax * dlogax_da;
+
+    switch (mode) {
+      case VALUE:
+        return ans * ax;
+      case DERIVATIVE:
+        return ans * dax_da + dans_da * ax;
+      case SAMPLE_DERIVATIVE:
+      default:  // this is needed to suppress clang warning
+        return -(dans_da + ans * dlogax_da) * x;
+    }
+  }
+};
+
+template <typename Scalar, IgammaComputationMode mode>
+struct igamma_series_impl {
+  /* Computes igam(a, x) or its derivative (depending on the mode)
+   * using the series expansion of the incomplete Gamma function.
+   *
+   * Preconditions:
+   *   x > 0
+   *   a > 0
+   *   !(x > 1 && x > a)
+   */
+  EIGEN_DEVICE_FUNC static Scalar run(Scalar a, Scalar x) {
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar machep = cephes_helper<Scalar>::machep();
+
+    Scalar ax = main_igamma_term<Scalar>(a, x);
+
+    // This is independent of mode. If this value is zero,
+    // then the function value is zero. If the function value is zero,
+    // then we are in a neighborhood where the function value evaluates to zero,
+    // so the derivative is zero.
+    if (ax == zero) {
+      return zero;
+    }
+
+    ax /= a;
+
+    /* power series */
+    Scalar r = a;
+    Scalar c = one;
+    Scalar ans = one;
+
+    Scalar dc_da = zero;
+    Scalar dans_da = zero;
+
+    for (int i = 0; i < igamma_num_iterations<Scalar, mode>(); i++) {
+      r += one;
+      Scalar term = x / r;
+      Scalar dterm_da = -x / (r * r);
+      dc_da = term * dc_da + dterm_da * c;
+      dans_da += dc_da;
+      c *= term;
+      ans += c;
+
+      if (mode == VALUE) {
+        if (c <= machep * ans) {
+          break;
+        }
+      } else {
+        if (numext::abs(dc_da) <= machep * numext::abs(dans_da)) {
+          break;
+        }
+      }
+    }
+
+    Scalar dlogax_da = numext::log(x) - digamma_impl<Scalar>::run(a + one);
+    Scalar dax_da = ax * dlogax_da;
+
+    switch (mode) {
+      case VALUE:
+        return ans * ax;
+      case DERIVATIVE:
+        return ans * dax_da + dans_da * ax;
+      case SAMPLE_DERIVATIVE:
+      default:  // this is needed to suppress clang warning
+        return -(dans_da + ans * dlogax_da) * x / a;
+    }
+  }
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct igammac_impl {
+  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static Scalar run(Scalar a, Scalar x) { return Scalar(0); }
+};
+
+#else
+
+template <typename Scalar>
+struct igammac_impl {
+  EIGEN_DEVICE_FUNC static Scalar run(Scalar a, Scalar x) {
+    /*  igamc()
+     *
+     *	Incomplete gamma integral (modified for Eigen)
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double a, x, y, igamc();
+     *
+     * y = igamc( a, x );
+     *
+     * DESCRIPTION:
+     *
+     * The function is defined by
+     *
+     *
+     *  igamc(a,x)   =   1 - igam(a,x)
+     *
+     *                            inf.
+     *                              -
+     *                     1       | |  -t  a-1
+     *               =   -----     |   e   t   dt.
+     *                    -      | |
+     *                   | (a)    -
+     *                             x
+     *
+     *
+     * In this implementation both arguments must be positive.
+     * The integral is evaluated by either a power series or
+     * continued fraction expansion, depending on the relative
+     * values of a and x.
+     *
+     * ACCURACY (float):
+     *
+     *                      Relative error:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,30        30000       7.8e-6      5.9e-7
+     *
+     *
+     * ACCURACY (double):
+     *
+     * Tested at random a, x.
+     *                a         x                      Relative error:
+     * arithmetic   domain   domain     # trials      peak         rms
+     *    IEEE     0.5,100   0,100      200000       1.9e-14     1.7e-15
+     *    IEEE     0.01,0.5  0,100      200000       1.4e-13     1.6e-15
+     *
+     */
+    /*
+      Cephes Math Library Release 2.2: June, 1992
+      Copyright 1985, 1987, 1992 by Stephen L. Moshier
+      Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+    */
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+    if ((x < zero) || (a <= zero)) {
+      // domain error
+      return nan;
+    }
+
+    if ((numext::isnan)(a) || (numext::isnan)(x)) {  // propagate nans
+      return nan;
+    }
+
+    if ((x < one) || (x < a)) {
+      return (one - igamma_series_impl<Scalar, VALUE>::run(a, x));
+    }
+
+    return igammac_cf_impl<Scalar, VALUE>::run(a, x);
+  }
+};
+
+#endif  // EIGEN_HAS_C99_MATH
+
+/************************************************************************************************
+ * Implementation of igamma (incomplete gamma integral), based on Cephes but requires C++11/C99 *
+ ************************************************************************************************/
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar, IgammaComputationMode mode>
+struct igamma_generic_impl {
+  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar x) { return Scalar(0); }
+};
+
+#else
+
+template <typename Scalar, IgammaComputationMode mode>
+struct igamma_generic_impl {
+  EIGEN_DEVICE_FUNC static Scalar run(Scalar a, Scalar x) {
+    /* Depending on the mode, returns
+     * - VALUE: incomplete Gamma function igamma(a, x)
+     * - DERIVATIVE: derivative of incomplete Gamma function d/da igamma(a, x)
+     * - SAMPLE_DERIVATIVE: implicit derivative of a Gamma random variable
+     * x ~ Gamma(x | a, 1), dx/da = -1 / Gamma(x | a, 1) * d igamma(a, x) / dx
+     *
+     * Derivatives are implemented by forward-mode differentiation.
+     */
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+    if (x == zero) return zero;
+
+    if ((x < zero) || (a <= zero)) {  // domain error
+      return nan;
+    }
+
+    if ((numext::isnan)(a) || (numext::isnan)(x)) {  // propagate nans
+      return nan;
+    }
+
+    if ((x > one) && (x > a)) {
+      Scalar ret = igammac_cf_impl<Scalar, mode>::run(a, x);
+      if (mode == VALUE) {
+        return one - ret;
+      } else {
+        return -ret;
+      }
+    }
+
+    return igamma_series_impl<Scalar, mode>::run(a, x);
+  }
+};
+
+#endif  // EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct igamma_retval {
+  typedef Scalar type;
+};
+
+template <typename Scalar>
+struct igamma_impl : igamma_generic_impl<Scalar, VALUE> {
+  /* igam()
+   * Incomplete gamma integral.
+   *
+   * The CDF of Gamma(a, 1) random variable at the point x.
+   *
+   * Accuracy estimation. For each a in [10^-2, 10^-1...10^3] we sample
+   * 50 Gamma random variables x ~ Gamma(x | a, 1), a total of 300 points.
+   * The ground truth is computed by mpmath. Mean absolute error:
+   * float: 1.26713e-05
+   * double: 2.33606e-12
+   *
+   * Cephes documentation below.
+   *
+   * SYNOPSIS:
+   *
+   * double a, x, y, igam();
+   *
+   * y = igam( a, x );
+   *
+   * DESCRIPTION:
+   *
+   * The function is defined by
+   *
+   *                           x
+   *                            -
+   *                   1       | |  -t  a-1
+   *  igam(a,x)  =   -----     |   e   t   dt.
+   *                  -      | |
+   *                 | (a)    -
+   *                           0
+   *
+   *
+   * In this implementation both arguments must be positive.
+   * The integral is evaluated by either a power series or
+   * continued fraction expansion, depending on the relative
+   * values of a and x.
+   *
+   * ACCURACY (double):
+   *
+   *                      Relative error:
+   * arithmetic   domain     # trials      peak         rms
+   *    IEEE      0,30       200000       3.6e-14     2.9e-15
+   *    IEEE      0,100      300000       9.9e-14     1.5e-14
+   *
+   *
+   * ACCURACY (float):
+   *
+   *                      Relative error:
+   * arithmetic   domain     # trials      peak         rms
+   *    IEEE      0,30        20000       7.8e-6      5.9e-7
+   *
+   */
+  /*
+    Cephes Math Library Release 2.2: June, 1992
+    Copyright 1985, 1987, 1992 by Stephen L. Moshier
+    Direct inquiries to 30 Frost Street, Cambridge, MA 02140
+  */
+
+  /* left tail of incomplete gamma function:
+   *
+   *          inf.      k
+   *   a  -x   -       x
+   *  x  e     >   ----------
+   *           -     -
+   *          k=0   | (a+k+1)
+   *
+   */
+};
+
+template <typename Scalar>
+struct igamma_der_a_retval : igamma_retval<Scalar> {};
+
+template <typename Scalar>
+struct igamma_der_a_impl : igamma_generic_impl<Scalar, DERIVATIVE> {
+  /* Derivative of the incomplete Gamma function with respect to a.
+   *
+   * Computes d/da igamma(a, x) by forward differentiation of the igamma code.
+   *
+   * Accuracy estimation. For each a in [10^-2, 10^-1...10^3] we sample
+   * 50 Gamma random variables x ~ Gamma(x | a, 1), a total of 300 points.
+   * The ground truth is computed by mpmath. Mean absolute error:
+   * float: 6.17992e-07
+   * double: 4.60453e-12
+   *
+   * Reference:
+   * R. Moore. "Algorithm AS 187: Derivatives of the incomplete gamma
+   * integral". Journal of the Royal Statistical Society. 1982
+   */
+};
+
+template <typename Scalar>
+struct gamma_sample_der_alpha_retval : igamma_retval<Scalar> {};
+
+template <typename Scalar>
+struct gamma_sample_der_alpha_impl : igamma_generic_impl<Scalar, SAMPLE_DERIVATIVE> {
+  /* Derivative of a Gamma random variable sample with respect to alpha.
+   *
+   * Consider a sample of a Gamma random variable with the concentration
+   * parameter alpha: sample ~ Gamma(alpha, 1). The reparameterization
+   * derivative that we want to compute is dsample / dalpha =
+   * d igammainv(alpha, u) / dalpha, where u = igamma(alpha, sample).
+   * However, this formula is numerically unstable and expensive, so instead
+   * we use implicit differentiation:
+   *
+   * igamma(alpha, sample) = u, where u ~ Uniform(0, 1).
+   * Apply d / dalpha to both sides:
+   * d igamma(alpha, sample) / dalpha
+   *     + d igamma(alpha, sample) / dsample * dsample/dalpha  = 0
+   * d igamma(alpha, sample) / dalpha
+   *     + Gamma(sample | alpha, 1) dsample / dalpha = 0
+   * dsample/dalpha = - (d igamma(alpha, sample) / dalpha)
+   *                   / Gamma(sample | alpha, 1)
+   *
+   * Here Gamma(sample | alpha, 1) is the PDF of the Gamma distribution
+   * (note that the derivative of the CDF w.r.t. sample is the PDF).
+   * See the reference below for more details.
+   *
+   * The derivative of igamma(alpha, sample) is computed by forward
+   * differentiation of the igamma code. Division by the Gamma PDF is performed
+   * in the same code, increasing the accuracy and speed due to cancellation
+   * of some terms.
+   *
+   * Accuracy estimation. For each alpha in [10^-2, 10^-1...10^3] we sample
+   * 50 Gamma random variables sample ~ Gamma(sample | alpha, 1), a total of 300
+   * points. The ground truth is computed by mpmath. Mean absolute error:
+   * float: 2.1686e-06
+   * double: 1.4774e-12
+   *
+   * Reference:
+   * M. Figurnov, S. Mohamed, A. Mnih "Implicit Reparameterization Gradients".
+   * 2018
+   */
+};
+
+/*****************************************************************************
+ * Implementation of Riemann zeta function of two arguments, based on Cephes *
+ *****************************************************************************/
+
+template <typename Scalar>
+struct zeta_retval {
+  typedef Scalar type;
+};
+
+template <typename Scalar>
+struct zeta_impl_series {
+  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(const Scalar) { return Scalar(0); }
+};
+
+template <>
+struct zeta_impl_series<float> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE bool run(float& a, float& b, float& s, const float x,
+                                                        const float machep) {
+    int i = 0;
+    while (i < 9) {
+      i += 1;
+      a += 1.0f;
+      b = numext::pow(a, -x);
+      s += b;
+      if (numext::abs(b / s) < machep) return true;
+    }
+
+    // Return whether we are done
+    return false;
+  }
+};
+
+template <>
+struct zeta_impl_series<double> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE bool run(double& a, double& b, double& s, const double x,
+                                                        const double machep) {
+    int i = 0;
+    while ((i < 9) || (a <= 9.0)) {
+      i += 1;
+      a += 1.0;
+      b = numext::pow(a, -x);
+      s += b;
+      if (numext::abs(b / s) < machep) return true;
+    }
+
+    // Return whether we are done
+    return false;
+  }
+};
+
+template <typename Scalar>
+struct zeta_impl {
+  EIGEN_DEVICE_FUNC static Scalar run(Scalar x, Scalar q) {
+    /*							zeta.c
+     *
+     *	Riemann zeta function of two arguments
+     *
+     *
+     *
+     * SYNOPSIS:
+     *
+     * double x, q, y, zeta();
+     *
+     * y = zeta( x, q );
+     *
+     *
+     *
+     * DESCRIPTION:
+     *
+     *
+     *
+     *                 inf.
+     *                  -        -x
+     *   zeta(x,q)  =   >   (k+q)
+     *                  -
+     *                 k=0
+     *
+     * where x > 1 and q is not a negative integer or zero.
+     * The Euler-Maclaurin summation formula is used to obtain
+     * the expansion
+     *
+     *                n
+     *                -       -x
+     * zeta(x,q)  =   >  (k+q)
+     *                -
+     *               k=1
+     *
+     *           1-x                 inf.  B   x(x+1)...(x+2j)
+     *      (n+q)           1         -     2j
+     *  +  ---------  -  -------  +   >    --------------------
+     *        x-1              x      -                   x+2j+1
+     *                   2(n+q)      j=1       (2j)! (n+q)
+     *
+     * where the B2j are Bernoulli numbers.  Note that (see zetac.c)
+     * zeta(x,1) = zetac(x) + 1.
+     *
+     *
+     *
+     * ACCURACY:
+     *
+     * Relative error for single precision:
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,25        10000       6.9e-7      1.0e-7
+     *
+     * Large arguments may produce underflow in powf(), in which
+     * case the results are inaccurate.
+     *
+     * REFERENCE:
+     *
+     * Gradshteyn, I. S., and I. M. Ryzhik, Tables of Integrals,
+     * Series, and Products, p. 1073; Academic Press, 1980.
+     *
+     */
+
+    int i;
+    Scalar p, r, a, b, k, s, t, w;
+
+    const Scalar A[] = {
+        Scalar(12.0),
+        Scalar(-720.0),
+        Scalar(30240.0),
+        Scalar(-1209600.0),
+        Scalar(47900160.0),
+        Scalar(-1.8924375803183791606e9), /*1.307674368e12/691*/
+        Scalar(7.47242496e10),
+        Scalar(-2.950130727918164224e12),  /*1.067062284288e16/3617*/
+        Scalar(1.1646782814350067249e14),  /*5.109094217170944e18/43867*/
+        Scalar(-4.5979787224074726105e15), /*8.028576626982912e20/174611*/
+        Scalar(1.8152105401943546773e17),  /*1.5511210043330985984e23/854513*/
+        Scalar(-7.1661652561756670113e18)  /*1.6938241367317436694528e27/236364091*/
+    };
+
+    const Scalar maxnum = NumTraits<Scalar>::infinity();
+    const Scalar zero = Scalar(0.0), half = Scalar(0.5), one = Scalar(1.0);
+    const Scalar machep = cephes_helper<Scalar>::machep();
+    const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+    if (x == one) return maxnum;
+
+    if (x < one) {
+      return nan;
+    }
+
+    if (q <= zero) {
+      if (q == numext::floor(q)) {
+        if (numext::rint(Scalar(0.5) * x) == Scalar(0.5) * x) {
+          return maxnum;
+        } else {
+          return nan;
+        }
+      }
+      p = x;
+      r = numext::floor(p);
+      if (p != r) return nan;
+    }
+
+    /* Permit negative q but continue sum until n+q > +9 .
+     * This case should be handled by a reflection formula.
+     * If q<0 and x is an integer, there is a relation to
+     * the polygamma function.
+     */
+    s = numext::pow(q, -x);
+    a = q;
+    b = zero;
+    // Run the summation in a helper function that is specific to the floating precision
+    if (zeta_impl_series<Scalar>::run(a, b, s, x, machep)) {
+      return s;
+    }
+
+    // If b is zero, then the tail sum will also end up being zero.
+    // Exiting early here can prevent NaNs for some large inputs, where
+    // the tail sum computed below has term `a` which can overflow to `inf`.
+    if (numext::equal_strict(b, zero)) {
+      return s;
+    }
+
+    w = a;
+    s += b * w / (x - one);
+    s -= half * b;
+    a = one;
+    k = zero;
+
+    for (i = 0; i < 12; i++) {
+      a *= x + k;
+      b /= w;
+      t = a * b / A[i];
+      s = s + t;
+      t = numext::abs(t / s);
+      if (t < machep) {
+        break;
+      }
+      k += one;
+      a *= x + k;
+      b /= w;
+      k += one;
+    }
+    return s;
+  }
+};
+
+/****************************************************************************
+ * Implementation of polygamma function, requires C++11/C99                 *
+ ****************************************************************************/
+
+template <typename Scalar>
+struct polygamma_retval {
+  typedef Scalar type;
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct polygamma_impl {
+  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(Scalar n, Scalar x) { return Scalar(0); }
+};
+
+#else
+
+template <typename Scalar>
+struct polygamma_impl {
+  EIGEN_DEVICE_FUNC static Scalar run(Scalar n, Scalar x) {
+    Scalar zero = 0.0, one = 1.0;
+    Scalar nplus = n + one;
+    const Scalar nan = NumTraits<Scalar>::quiet_NaN();
+
+    // Check that n is a non-negative integer
+    if (numext::floor(n) != n || n < zero) {
+      return nan;
+    }
+    // Just return the digamma function for n = 0
+    else if (n == zero) {
+      return digamma_impl<Scalar>::run(x);
+    }
+    // Use the same implementation as scipy
+    else {
+      Scalar factorial = numext::exp(lgamma_impl<Scalar>::run(nplus));
+      return numext::pow(-one, nplus) * factorial * zeta_impl<Scalar>::run(nplus, x);
+    }
+  }
+};
+
+#endif  // EIGEN_HAS_C99_MATH
+
+/************************************************************************************************
+ * Implementation of betainc (incomplete beta integral), based on Cephes but requires C++11/C99 *
+ ************************************************************************************************/
+
+template <typename Scalar>
+struct betainc_retval {
+  typedef Scalar type;
+};
+
+#if !EIGEN_HAS_C99_MATH
+
+template <typename Scalar>
+struct betainc_impl {
+  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x) { return Scalar(0); }
+};
+
+#else
+
+template <typename Scalar>
+struct betainc_impl {
+  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(Scalar, Scalar, Scalar) {
+    /*	betaincf.c
+     *
+     *	Incomplete beta integral
+     *
+     *
+     * SYNOPSIS:
+     *
+     * float a, b, x, y, betaincf();
+     *
+     * y = betaincf( a, b, x );
+     *
+     *
+     * DESCRIPTION:
+     *
+     * Returns incomplete beta integral of the arguments, evaluated
+     * from zero to x.  The function is defined as
+     *
+     *                  x
+     *     -            -
+     *    | (a+b)      | |  a-1     b-1
+     *  -----------    |   t   (1-t)   dt.
+     *   -     -     | |
+     *  | (a) | (b)   -
+     *                 0
+     *
+     * The domain of definition is 0 <= x <= 1.  In this
+     * implementation a and b are restricted to positive values.
+     * The integral from x to 1 may be obtained by the symmetry
+     * relation
+     *
+     *    1 - betainc( a, b, x )  =  betainc( b, a, 1-x ).
+     *
+     * The integral is evaluated by a continued fraction expansion.
+     * If a < 1, the function calls itself recursively after a
+     * transformation to increase a to a+1.
+     *
+     * ACCURACY (float):
+     *
+     * Tested at random points (a,b,x) with a and b in the indicated
+     * interval and x between 0 and 1.
+     *
+     * arithmetic   domain     # trials      peak         rms
+     * Relative error:
+     *    IEEE       0,30       10000       3.7e-5      5.1e-6
+     *    IEEE       0,100      10000       1.7e-4      2.5e-5
+     * The useful domain for relative error is limited by underflow
+     * of the single precision exponential function.
+     * Absolute error:
+     *    IEEE       0,30      100000       2.2e-5      9.6e-7
+     *    IEEE       0,100      10000       6.5e-5      3.7e-6
+     *
+     * Larger errors may occur for extreme ratios of a and b.
+     *
+     * ACCURACY (double):
+     * arithmetic   domain     # trials      peak         rms
+     *    IEEE      0,5         10000       6.9e-15     4.5e-16
+     *    IEEE      0,85       250000       2.2e-13     1.7e-14
+     *    IEEE      0,1000      30000       5.3e-12     6.3e-13
+     *    IEEE      0,10000    250000       9.3e-11     7.1e-12
+     *    IEEE      0,100000    10000       8.7e-10     4.8e-11
+     * Outputs smaller than the IEEE gradual underflow threshold
+     * were excluded from these statistics.
+     *
+     * ERROR MESSAGES:
+     *   message         condition      value returned
+     * incbet domain      x<0, x>1          nan
+     * incbet underflow                     nan
+     */
+    return Scalar(0);
+  }
+};
+
+/* Continued fraction expansion #1 for incomplete beta integral (small_branch = True)
+ * Continued fraction expansion #2 for incomplete beta integral (small_branch = False)
+ */
+template <typename Scalar>
+struct incbeta_cfe {
+  EIGEN_STATIC_ASSERT((internal::is_same<Scalar, float>::value || internal::is_same<Scalar, double>::value),
+                      THIS_TYPE_IS_NOT_SUPPORTED)
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x, bool small_branch) {
+    const Scalar big = cephes_helper<Scalar>::big();
+    const Scalar machep = cephes_helper<Scalar>::machep();
+    const Scalar biginv = cephes_helper<Scalar>::biginv();
+
+    const Scalar zero = 0;
+    const Scalar one = 1;
+    const Scalar two = 2;
+
+    Scalar xk, pk, pkm1, pkm2, qk, qkm1, qkm2;
+    Scalar k1, k2, k3, k4, k5, k6, k7, k8, k26update;
+    Scalar ans;
+    int n;
+
+    const int num_iters = (internal::is_same<Scalar, float>::value) ? 100 : 300;
+    const Scalar thresh = (internal::is_same<Scalar, float>::value) ? machep : Scalar(3) * machep;
+    Scalar r = (internal::is_same<Scalar, float>::value) ? zero : one;
+
+    if (small_branch) {
+      k1 = a;
+      k2 = a + b;
+      k3 = a;
+      k4 = a + one;
+      k5 = one;
+      k6 = b - one;
+      k7 = k4;
+      k8 = a + two;
+      k26update = one;
+    } else {
+      k1 = a;
+      k2 = b - one;
+      k3 = a;
+      k4 = a + one;
+      k5 = one;
+      k6 = a + b;
+      k7 = a + one;
+      k8 = a + two;
+      k26update = -one;
+      x = x / (one - x);
+    }
+
+    pkm2 = zero;
+    qkm2 = one;
+    pkm1 = one;
+    qkm1 = one;
+    ans = one;
+    n = 0;
+
+    do {
+      xk = -(x * k1 * k2) / (k3 * k4);
+      pk = pkm1 + pkm2 * xk;
+      qk = qkm1 + qkm2 * xk;
+      pkm2 = pkm1;
+      pkm1 = pk;
+      qkm2 = qkm1;
+      qkm1 = qk;
+
+      xk = (x * k5 * k6) / (k7 * k8);
+      pk = pkm1 + pkm2 * xk;
+      qk = qkm1 + qkm2 * xk;
+      pkm2 = pkm1;
+      pkm1 = pk;
+      qkm2 = qkm1;
+      qkm1 = qk;
+
+      if (qk != zero) {
+        r = pk / qk;
+        if (numext::abs(ans - r) < numext::abs(r) * thresh) {
+          return r;
+        }
+        ans = r;
+      }
+
+      k1 += one;
+      k2 += k26update;
+      k3 += two;
+      k4 += two;
+      k5 += one;
+      k6 -= k26update;
+      k7 += two;
+      k8 += two;
+
+      if ((numext::abs(qk) + numext::abs(pk)) > big) {
+        pkm2 *= biginv;
+        pkm1 *= biginv;
+        qkm2 *= biginv;
+        qkm1 *= biginv;
+      }
+      if ((numext::abs(qk) < biginv) || (numext::abs(pk) < biginv)) {
+        pkm2 *= big;
+        pkm1 *= big;
+        qkm2 *= big;
+        qkm1 *= big;
+      }
+    } while (++n < num_iters);
+
+    return ans;
+  }
+};
+
+/* Helper functions depending on the Scalar type */
+template <typename Scalar>
+struct betainc_helper {};
+
+template <>
+struct betainc_helper<float> {
+  /* Core implementation, assumes a large (> 1.0) */
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float incbsa(float aa, float bb, float xx) {
+    float ans, a, b, t, x, onemx;
+    bool reversed_a_b = false;
+
+    onemx = 1.0f - xx;
+
+    /* see if x is greater than the mean */
+    if (xx > (aa / (aa + bb))) {
+      reversed_a_b = true;
+      a = bb;
+      b = aa;
+      t = xx;
+      x = onemx;
+    } else {
+      a = aa;
+      b = bb;
+      t = onemx;
+      x = xx;
+    }
+
+    /* Choose expansion for optimal convergence */
+    if (b > 10.0f) {
+      if (numext::abs(b * x / a) < 0.3f) {
+        t = betainc_helper<float>::incbps(a, b, x);
+        if (reversed_a_b) t = 1.0f - t;
+        return t;
+      }
+    }
+
+    ans = x * (a + b - 2.0f) / (a - 1.0f);
+    if (ans < 1.0f) {
+      ans = incbeta_cfe<float>::run(a, b, x, true /* small_branch */);
+      t = b * numext::log(t);
+    } else {
+      ans = incbeta_cfe<float>::run(a, b, x, false /* small_branch */);
+      t = (b - 1.0f) * numext::log(t);
+    }
+
+    t += a * numext::log(x) + lgamma_impl<float>::run(a + b) - lgamma_impl<float>::run(a) - lgamma_impl<float>::run(b);
+    t += numext::log(ans / a);
+    t = numext::exp(t);
+
+    if (reversed_a_b) t = 1.0f - t;
+    return t;
+  }
+
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float incbps(float a, float b, float x) {
+    float t, u, y, s;
+    const float machep = cephes_helper<float>::machep();
+
+    y = a * numext::log(x) + (b - 1.0f) * numext::log1p(-x) - numext::log(a);
+    y -= lgamma_impl<float>::run(a) + lgamma_impl<float>::run(b);
+    y += lgamma_impl<float>::run(a + b);
+
+    t = x / (1.0f - x);
+    s = 0.0f;
+    u = 1.0f;
+    do {
+      b -= 1.0f;
+      if (b == 0.0f) {
+        break;
+      }
+      a += 1.0f;
+      u *= t * b / a;
+      s += u;
+    } while (numext::abs(u) > machep);
+
+    return numext::exp(y) * (1.0f + s);
+  }
+};
+
+template <>
+struct betainc_impl<float> {
+  EIGEN_DEVICE_FUNC static float run(float a, float b, float x) {
+    const float nan = NumTraits<float>::quiet_NaN();
+    float ans, t;
+
+    if (a <= 0.0f) return nan;
+    if (b <= 0.0f) return nan;
+    if ((x <= 0.0f) || (x >= 1.0f)) {
+      if (x == 0.0f) return 0.0f;
+      if (x == 1.0f) return 1.0f;
+      // mtherr("betaincf", DOMAIN);
+      return nan;
+    }
+
+    /* transformation for small aa */
+    if (a <= 1.0f) {
+      ans = betainc_helper<float>::incbsa(a + 1.0f, b, x);
+      t = a * numext::log(x) + b * numext::log1p(-x) + lgamma_impl<float>::run(a + b) -
+          lgamma_impl<float>::run(a + 1.0f) - lgamma_impl<float>::run(b);
+      return (ans + numext::exp(t));
+    } else {
+      return betainc_helper<float>::incbsa(a, b, x);
+    }
+  }
+};
+
+template <>
+struct betainc_helper<double> {
+  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE double incbps(double a, double b, double x) {
+    const double machep = cephes_helper<double>::machep();
+
+    double s, t, u, v, n, t1, z, ai;
+
+    ai = 1.0 / a;
+    u = (1.0 - b) * x;
+    v = u / (a + 1.0);
+    t1 = v;
+    t = u;
+    n = 2.0;
+    s = 0.0;
+    z = machep * ai;
+    while (numext::abs(v) > z) {
+      u = (n - b) * x / n;
+      t *= u;
+      v = t / (a + n);
+      s += v;
+      n += 1.0;
+    }
+    s += t1;
+    s += ai;
+
+    u = a * numext::log(x);
+    // TODO: gamma() is not directly implemented in Eigen.
+    /*
+    if ((a + b) < maxgam && numext::abs(u) < maxlog) {
+      t = gamma(a + b) / (gamma(a) * gamma(b));
+      s = s * t * pow(x, a);
+    }
+    */
+    t = lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) - lgamma_impl<double>::run(b) + u +
+        numext::log(s);
+    return s = numext::exp(t);
+  }
+};
+
+template <>
+struct betainc_impl<double> {
+  EIGEN_DEVICE_FUNC static double run(double aa, double bb, double xx) {
+    const double nan = NumTraits<double>::quiet_NaN();
+    const double machep = cephes_helper<double>::machep();
+    // const double maxgam = 171.624376956302725;
+
+    double a, b, t, x, xc, w, y;
+    bool reversed_a_b = false;
+
+    if (aa <= 0.0 || bb <= 0.0) {
+      return nan;  // goto domerr;
+    }
+
+    if ((xx <= 0.0) || (xx >= 1.0)) {
+      if (xx == 0.0) return (0.0);
+      if (xx == 1.0) return (1.0);
+      // mtherr("incbet", DOMAIN);
+      return nan;
+    }
+
+    if ((bb * xx) <= 1.0 && xx <= 0.95) {
+      return betainc_helper<double>::incbps(aa, bb, xx);
+    }
+
+    w = 1.0 - xx;
+
+    /* Reverse a and b if x is greater than the mean. */
+    if (xx > (aa / (aa + bb))) {
+      reversed_a_b = true;
+      a = bb;
+      b = aa;
+      xc = xx;
+      x = w;
+    } else {
+      a = aa;
+      b = bb;
+      xc = w;
+      x = xx;
+    }
+
+    if (reversed_a_b && (b * x) <= 1.0 && x <= 0.95) {
+      t = betainc_helper<double>::incbps(a, b, x);
+      if (t <= machep) {
+        t = 1.0 - machep;
+      } else {
+        t = 1.0 - t;
+      }
+      return t;
+    }
+
+    /* Choose expansion for better convergence. */
+    y = x * (a + b - 2.0) - (a - 1.0);
+    if (y < 0.0) {
+      w = incbeta_cfe<double>::run(a, b, x, true /* small_branch */);
+    } else {
+      w = incbeta_cfe<double>::run(a, b, x, false /* small_branch */) / xc;
+    }
+
+    /* Multiply w by the factor
+         a      b   _             _     _
+        x  (1-x)   | (a+b) / ( a | (a) | (b) ) .   */
+
+    y = a * numext::log(x);
+    t = b * numext::log(xc);
+    // TODO: gamma is not directly implemented in Eigen.
+    /*
+    if ((a + b) < maxgam && numext::abs(y) < maxlog && numext::abs(t) < maxlog)
+    {
+      t = pow(xc, b);
+      t *= pow(x, a);
+      t /= a;
+      t *= w;
+      t *= gamma(a + b) / (gamma(a) * gamma(b));
+    } else {
+    */
+    /* Resort to logarithms.  */
+    y += t + lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) - lgamma_impl<double>::run(b);
+    y += numext::log(w / a);
+    t = numext::exp(y);
+
+    /* } */
+    // done:
+
+    if (reversed_a_b) {
+      if (t <= machep) {
+        t = 1.0 - machep;
+      } else {
+        t = 1.0 - t;
+      }
+    }
+    return t;
+  }
+};
+
+#endif  // EIGEN_HAS_C99_MATH
+
+}  // end namespace internal
+
+namespace numext {
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar) lgamma(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(lgamma, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(digamma, Scalar) digamma(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(digamma, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(zeta, Scalar) zeta(const Scalar& x, const Scalar& q) {
+  return EIGEN_MATHFUNC_IMPL(zeta, Scalar)::run(x, q);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(polygamma, Scalar) polygamma(const Scalar& n, const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(polygamma, Scalar)::run(n, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erf, Scalar) erf(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(erf, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar) erfc(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(ndtri, Scalar) ndtri(const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(ndtri, Scalar)::run(x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar) igamma(const Scalar& a, const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(igamma, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma_der_a, Scalar) igamma_der_a(const Scalar& a, const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(igamma_der_a, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(gamma_sample_der_alpha, Scalar)
+    gamma_sample_der_alpha(const Scalar& a, const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(gamma_sample_der_alpha, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar) igammac(const Scalar& a, const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(igammac, Scalar)::run(a, x);
+}
+
+template <typename Scalar>
+EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(betainc, Scalar)
+    betainc(const Scalar& a, const Scalar& b, const Scalar& x) {
+  return EIGEN_MATHFUNC_IMPL(betainc, Scalar)::run(a, b, x);
+}
+
+}  // end namespace numext
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPECIAL_FUNCTIONS_H
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
new file mode 100644
index 00000000..d1470db2
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h
@@ -0,0 +1,112 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+#define EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal \returns the ln(|gamma(\a a)|) (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet plgamma(const Packet& a) {
+  using numext::lgamma;
+  return lgamma(a);
+}
+
+/** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pdigamma(const Packet& a) {
+  using numext::digamma;
+  return digamma(a);
+}
+
+/** \internal \returns the zeta function of two arguments (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pzeta(const Packet& x, const Packet& q) {
+  using numext::zeta;
+  return zeta(x, q);
+}
+
+/** \internal \returns the polygamma function (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet ppolygamma(const Packet& n, const Packet& x) {
+  using numext::polygamma;
+  return polygamma(n, x);
+}
+
+/** \internal \returns the erf(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet perf(const Packet& a) {
+  using numext::erf;
+  return erf(a);
+}
+
+/** \internal \returns the erfc(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet perfc(const Packet& a) {
+  using numext::erfc;
+  return erfc(a);
+}
+
+/** \internal \returns the ndtri(\a a) (coeff-wise) */
+template <typename Packet>
+EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS Packet pndtri(const Packet& a) {
+  typedef typename unpacket_traits<Packet>::type ScalarType;
+  using internal::generic_ndtri;
+  return generic_ndtri<Packet, ScalarType>(a);
+}
+
+/** \internal \returns the incomplete gamma function igamma(\a a, \a x) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pigamma(const Packet& a, const Packet& x) {
+  using numext::igamma;
+  return igamma(a, x);
+}
+
+/** \internal \returns the derivative of the incomplete gamma function
+ * igamma_der_a(\a a, \a x) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pigamma_der_a(const Packet& a, const Packet& x) {
+  using numext::igamma_der_a;
+  return igamma_der_a(a, x);
+}
+
+/** \internal \returns compute the derivative of the sample
+ * of Gamma(alpha, 1) random variable with respect to the parameter a
+ * gamma_sample_der_alpha(\a alpha, \a sample) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pgamma_sample_der_alpha(const Packet& alpha, const Packet& sample) {
+  using numext::gamma_sample_der_alpha;
+  return gamma_sample_der_alpha(alpha, sample);
+}
+
+/** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pigammac(const Packet& a, const Packet& x) {
+  using numext::igammac;
+  return igammac(a, x);
+}
+
+/** \internal \returns the complementary incomplete gamma function betainc(\a a, \a b, \a x) */
+template <typename Packet>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet pbetainc(const Packet& a, const Packet& b, const Packet& x) {
+  using numext::betainc;
+  return betainc(a, b, x);
+}
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h
new file mode 100644
index 00000000..2d766920
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX/BesselFunctions.h
@@ -0,0 +1,46 @@
+#ifndef EIGEN_AVX_BESSELFUNCTIONS_H
+#define EIGEN_AVX_BESSELFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i0e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i0e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i1)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_i1e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_i1e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_j0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_j0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_j1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_j1)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k0e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k0e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k1)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_k1e)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_k1e)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_y0)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_y0)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pbessel_y1)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pbessel_y1)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_AVX_BESSELFUNCTIONS_H
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h
new file mode 100644
index 00000000..35e62a8a
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX/SpecialFunctions.h
@@ -0,0 +1,16 @@
+#ifndef EIGEN_AVX_SPECIALFUNCTIONS_H
+#define EIGEN_AVX_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, perf)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, perf)
+
+F16_PACKET_FUNCTION(Packet8f, Packet8h, pndtri)
+BF16_PACKET_FUNCTION(Packet8f, Packet8bf, pndtri)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_AVX_SPECIAL_FUNCTIONS_H
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
new file mode 100644
index 00000000..7dd3c3e5
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/BesselFunctions.h
@@ -0,0 +1,46 @@
+#ifndef EIGEN_AVX512_BESSELFUNCTIONS_H
+#define EIGEN_AVX512_BESSELFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i0e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i0e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i1)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_i1e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_i1e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_j0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_j0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_j1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_j1)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k0e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k0e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k1)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_k1e)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_k1e)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_y0)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y0)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pbessel_y1)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pbessel_y1)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_AVX512_BESSELFUNCTIONS_H
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h
new file mode 100644
index 00000000..79878f2b
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/AVX512/SpecialFunctions.h
@@ -0,0 +1,16 @@
+#ifndef EIGEN_AVX512_SPECIALFUNCTIONS_H
+#define EIGEN_AVX512_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, perf)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, perf)
+
+F16_PACKET_FUNCTION(Packet16f, Packet16h, pndtri)
+BF16_PACKET_FUNCTION(Packet16f, Packet16bf, pndtri)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_AVX512_SPECIAL_FUNCTIONS_H
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h
new file mode 100644
index 00000000..8f3468b8
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/GPU/SpecialFunctions.h
@@ -0,0 +1,317 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_GPU_SPECIALFUNCTIONS_H
+#define EIGEN_GPU_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
+#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plgamma<float4>(const float4& a) {
+  return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plgamma<double2>(const double2& a) {
+  using numext::lgamma;
+  return make_double2(lgamma(a.x), lgamma(a.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdigamma<float4>(const float4& a) {
+  using numext::digamma;
+  return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdigamma<double2>(const double2& a) {
+  using numext::digamma;
+  return make_double2(digamma(a.x), digamma(a.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pzeta<float4>(const float4& x, const float4& q) {
+  using numext::zeta;
+  return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pzeta<double2>(const double2& x, const double2& q) {
+  using numext::zeta;
+  return make_double2(zeta(x.x, q.x), zeta(x.y, q.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ppolygamma<float4>(const float4& n, const float4& x) {
+  using numext::polygamma;
+  return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ppolygamma<double2>(const double2& n, const double2& x) {
+  using numext::polygamma;
+  return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 perf<float4>(const float4& a) {
+  return make_float4(erff(a.x), erff(a.y), erff(a.z), erff(a.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 perf<double2>(const double2& a) {
+  using numext::erf;
+  return make_double2(erf(a.x), erf(a.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 perfc<float4>(const float4& a) {
+  using numext::erfc;
+  return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 perfc<double2>(const double2& a) {
+  using numext::erfc;
+  return make_double2(erfc(a.x), erfc(a.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pndtri<float4>(const float4& a) {
+  using numext::ndtri;
+  return make_float4(ndtri(a.x), ndtri(a.y), ndtri(a.z), ndtri(a.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pndtri<double2>(const double2& a) {
+  using numext::ndtri;
+  return make_double2(ndtri(a.x), ndtri(a.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pigamma<float4>(const float4& a, const float4& x) {
+  using numext::igamma;
+  return make_float4(igamma(a.x, x.x), igamma(a.y, x.y), igamma(a.z, x.z), igamma(a.w, x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pigamma<double2>(const double2& a, const double2& x) {
+  using numext::igamma;
+  return make_double2(igamma(a.x, x.x), igamma(a.y, x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pigamma_der_a<float4>(const float4& a, const float4& x) {
+  using numext::igamma_der_a;
+  return make_float4(igamma_der_a(a.x, x.x), igamma_der_a(a.y, x.y), igamma_der_a(a.z, x.z), igamma_der_a(a.w, x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pigamma_der_a<double2>(const double2& a, const double2& x) {
+  using numext::igamma_der_a;
+  return make_double2(igamma_der_a(a.x, x.x), igamma_der_a(a.y, x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pgamma_sample_der_alpha<float4>(const float4& alpha,
+                                                                             const float4& sample) {
+  using numext::gamma_sample_der_alpha;
+  return make_float4(gamma_sample_der_alpha(alpha.x, sample.x), gamma_sample_der_alpha(alpha.y, sample.y),
+                     gamma_sample_der_alpha(alpha.z, sample.z), gamma_sample_der_alpha(alpha.w, sample.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pgamma_sample_der_alpha<double2>(const double2& alpha,
+                                                                               const double2& sample) {
+  using numext::gamma_sample_der_alpha;
+  return make_double2(gamma_sample_der_alpha(alpha.x, sample.x), gamma_sample_der_alpha(alpha.y, sample.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pigammac<float4>(const float4& a, const float4& x) {
+  using numext::igammac;
+  return make_float4(igammac(a.x, x.x), igammac(a.y, x.y), igammac(a.z, x.z), igammac(a.w, x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pigammac<double2>(const double2& a, const double2& x) {
+  using numext::igammac;
+  return make_double2(igammac(a.x, x.x), igammac(a.y, x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbetainc<float4>(const float4& a, const float4& b, const float4& x) {
+  using numext::betainc;
+  return make_float4(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y), betainc(a.z, b.z, x.z), betainc(a.w, b.w, x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pbetainc<double2>(const double2& a, const double2& b, const double2& x) {
+  using numext::betainc;
+  return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i0e<float4>(const float4& x) {
+  using numext::bessel_i0e;
+  return make_float4(bessel_i0e(x.x), bessel_i0e(x.y), bessel_i0e(x.z), bessel_i0e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pbessel_i0e<double2>(const double2& x) {
+  using numext::bessel_i0e;
+  return make_double2(bessel_i0e(x.x), bessel_i0e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i0<float4>(const float4& x) {
+  using numext::bessel_i0;
+  return make_float4(bessel_i0(x.x), bessel_i0(x.y), bessel_i0(x.z), bessel_i0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pbessel_i0<double2>(const double2& x) {
+  using numext::bessel_i0;
+  return make_double2(bessel_i0(x.x), bessel_i0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i1e<float4>(const float4& x) {
+  using numext::bessel_i1e;
+  return make_float4(bessel_i1e(x.x), bessel_i1e(x.y), bessel_i1e(x.z), bessel_i1e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pbessel_i1e<double2>(const double2& x) {
+  using numext::bessel_i1e;
+  return make_double2(bessel_i1e(x.x), bessel_i1e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_i1<float4>(const float4& x) {
+  using numext::bessel_i1;
+  return make_float4(bessel_i1(x.x), bessel_i1(x.y), bessel_i1(x.z), bessel_i1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pbessel_i1<double2>(const double2& x) {
+  using numext::bessel_i1;
+  return make_double2(bessel_i1(x.x), bessel_i1(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k0e<float4>(const float4& x) {
+  using numext::bessel_k0e;
+  return make_float4(bessel_k0e(x.x), bessel_k0e(x.y), bessel_k0e(x.z), bessel_k0e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pbessel_k0e<double2>(const double2& x) {
+  using numext::bessel_k0e;
+  return make_double2(bessel_k0e(x.x), bessel_k0e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k0<float4>(const float4& x) {
+  using numext::bessel_k0;
+  return make_float4(bessel_k0(x.x), bessel_k0(x.y), bessel_k0(x.z), bessel_k0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pbessel_k0<double2>(const double2& x) {
+  using numext::bessel_k0;
+  return make_double2(bessel_k0(x.x), bessel_k0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k1e<float4>(const float4& x) {
+  using numext::bessel_k1e;
+  return make_float4(bessel_k1e(x.x), bessel_k1e(x.y), bessel_k1e(x.z), bessel_k1e(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pbessel_k1e<double2>(const double2& x) {
+  using numext::bessel_k1e;
+  return make_double2(bessel_k1e(x.x), bessel_k1e(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_k1<float4>(const float4& x) {
+  using numext::bessel_k1;
+  return make_float4(bessel_k1(x.x), bessel_k1(x.y), bessel_k1(x.z), bessel_k1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pbessel_k1<double2>(const double2& x) {
+  using numext::bessel_k1;
+  return make_double2(bessel_k1(x.x), bessel_k1(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_j0<float4>(const float4& x) {
+  using numext::bessel_j0;
+  return make_float4(bessel_j0(x.x), bessel_j0(x.y), bessel_j0(x.z), bessel_j0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pbessel_j0<double2>(const double2& x) {
+  using numext::bessel_j0;
+  return make_double2(bessel_j0(x.x), bessel_j0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_j1<float4>(const float4& x) {
+  using numext::bessel_j1;
+  return make_float4(bessel_j1(x.x), bessel_j1(x.y), bessel_j1(x.z), bessel_j1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pbessel_j1<double2>(const double2& x) {
+  using numext::bessel_j1;
+  return make_double2(bessel_j1(x.x), bessel_j1(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_y0<float4>(const float4& x) {
+  using numext::bessel_y0;
+  return make_float4(bessel_y0(x.x), bessel_y0(x.y), bessel_y0(x.z), bessel_y0(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pbessel_y0<double2>(const double2& x) {
+  using numext::bessel_y0;
+  return make_double2(bessel_y0(x.x), bessel_y0(x.y));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pbessel_y1<float4>(const float4& x) {
+  using numext::bessel_y1;
+  return make_float4(bessel_y1(x.x), bessel_y1(x.y), bessel_y1(x.z), bessel_y1(x.w));
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pbessel_y1<double2>(const double2& x) {
+  using numext::bessel_y1;
+  return make_double2(bessel_y1(x.x), bessel_y1(x.y));
+}
+
+#endif
+
+}  // end namespace internal
+
+}  // end namespace Eigen
+
+#endif  // EIGEN_GPU_SPECIALFUNCTIONS_H
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h
new file mode 100644
index 00000000..70d90566
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/NEON/BesselFunctions.h
@@ -0,0 +1,54 @@
+#ifndef EIGEN_NEON_BESSELFUNCTIONS_H
+#define EIGEN_NEON_BESSELFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+
+#define NEON_HALF_TO_FLOAT_FUNCTIONS(METHOD)                                              \
+  template <>                                                                             \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf METHOD<Packet8hf>(const Packet8hf& x) { \
+    const Packet4f lo = METHOD<Packet4f>(vcvt_f32_f16(vget_low_f16(x)));                  \
+    const Packet4f hi = METHOD<Packet4f>(vcvt_f32_f16(vget_high_f16(x)));                 \
+    return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi));                              \
+  }                                                                                       \
+                                                                                          \
+  template <>                                                                             \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf METHOD<Packet4hf>(const Packet4hf& x) { \
+    return vcvt_f16_f32(METHOD<Packet4f>(vcvt_f32_f16(x)));                               \
+  }
+
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i0e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i1)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_i1e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_j0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_j1)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k0e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k1)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_k1e)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_y0)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pbessel_y1)
+
+#undef NEON_HALF_TO_FLOAT_FUNCTIONS
+#endif
+
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i0e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i1)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_i1e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_j0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_j1)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k0e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k1)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_k1e)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_y0)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pbessel_y1)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_NEON_BESSELFUNCTIONS_H
diff --git a/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h b/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h
new file mode 100644
index 00000000..5590d2bf
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/SpecialFunctions/arch/NEON/SpecialFunctions.h
@@ -0,0 +1,34 @@
+#ifndef EIGEN_NEON_SPECIALFUNCTIONS_H
+#define EIGEN_NEON_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+namespace internal {
+
+#if EIGEN_HAS_ARM64_FP16_VECTOR_ARITHMETIC
+
+#define NEON_HALF_TO_FLOAT_FUNCTIONS(METHOD)                                              \
+  template <>                                                                             \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet8hf METHOD<Packet8hf>(const Packet8hf& x) { \
+    const Packet4f lo = METHOD<Packet4f>(vcvt_f32_f16(vget_low_f16(x)));                  \
+    const Packet4f hi = METHOD<Packet4f>(vcvt_f32_f16(vget_high_f16(x)));                 \
+    return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi));                              \
+  }                                                                                       \
+                                                                                          \
+  template <>                                                                             \
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4hf METHOD<Packet4hf>(const Packet4hf& x) { \
+    return vcvt_f16_f32(METHOD<Packet4f>(vcvt_f32_f16(x)));                               \
+  }
+
+NEON_HALF_TO_FLOAT_FUNCTIONS(perf)
+NEON_HALF_TO_FLOAT_FUNCTIONS(pndtri)
+
+#undef NEON_HALF_TO_FLOAT_FUNCTIONS
+#endif
+
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, perf)
+BF16_PACKET_FUNCTION(Packet4f, Packet4bf, pndtri)
+
+}  // namespace internal
+}  // namespace Eigen
+
+#endif  // EIGEN_NEON_SPECIALFUNCTIONS_H
diff --git a/inst/include/unsupported/Eigen/src/Splines/InternalHeaderCheck.h b/inst/include/unsupported/Eigen/src/Splines/InternalHeaderCheck.h
new file mode 100644
index 00000000..4a6087e7
--- /dev/null
+++ b/inst/include/unsupported/Eigen/src/Splines/InternalHeaderCheck.h
@@ -0,0 +1,3 @@
+#ifndef EIGEN_SPLINES_MODULE_H
+#error "Please include unsupported/Eigen/Splines instead of including headers inside the src directory directly."
+#endif
diff --git a/inst/include/unsupported/Eigen/src/Splines/Spline.h b/inst/include/unsupported/Eigen/src/Splines/Spline.h
index 771f1043..6ff1eea6 100644
--- a/inst/include/unsupported/Eigen/src/Splines/Spline.h
+++ b/inst/include/unsupported/Eigen/src/Splines/Spline.h
@@ -10,465 +10,470 @@
 #ifndef EIGEN_SPLINE_H
 #define EIGEN_SPLINE_H
 
-#include "SplineFwd.h"
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
-namespace Eigen
-{
-    /**
-     * \ingroup Splines_Module
-     * \class Spline
-     * \brief A class representing multi-dimensional spline curves.
-     *
-     * The class represents B-splines with non-uniform knot vectors. Each control
-     * point of the B-spline is associated with a basis function
-     * \f{align*}
-     *   C(u) & = \sum_{i=0}^{n}N_{i,p}(u)P_i
-     * \f}
-     *
-     * \tparam _Scalar The underlying data type (typically float or double)
-     * \tparam _Dim The curve dimension (e.g. 2 or 3)
-     * \tparam _Degree Per default set to Dynamic; could be set to the actual desired
-     *                degree for optimization purposes (would result in stack allocation
-     *                of several temporary variables).
-     **/
-  template <typename _Scalar, int _Dim, int _Degree>
-  class Spline
-  {
-  public:
-    typedef _Scalar Scalar; /*!< The spline curve's scalar type. */
-    enum { Dimension = _Dim /*!< The spline curve's dimension. */ };
-    enum { Degree = _Degree /*!< The spline curve's degree. */ };
-
-    /** \brief The point type the spline is representing. */
-    typedef typename SplineTraits<Spline>::PointType PointType;
-    
-    /** \brief The data type used to store knot vectors. */
-    typedef typename SplineTraits<Spline>::KnotVectorType KnotVectorType;
-    
-    /** \brief The data type used to store non-zero basis functions. */
-    typedef typename SplineTraits<Spline>::BasisVectorType BasisVectorType;
-    
-    /** \brief The data type representing the spline's control points. */
-    typedef typename SplineTraits<Spline>::ControlPointVectorType ControlPointVectorType;
-    
-    /**
-    * \brief Creates a (constant) zero spline.
-    * For Splines with dynamic degree, the resulting degree will be 0.
-    **/
-    Spline() 
-    : m_knots(1, (Degree==Dynamic ? 2 : 2*Degree+2))
-    , m_ctrls(ControlPointVectorType::Zero(2,(Degree==Dynamic ? 1 : Degree+1))) 
-    {
-      // in theory this code can go to the initializer list but it will get pretty
-      // much unreadable ...
-      enum { MinDegree = (Degree==Dynamic ? 0 : Degree) };
-      m_knots.template segment<MinDegree+1>(0) = Array<Scalar,1,MinDegree+1>::Zero();
-      m_knots.template segment<MinDegree+1>(MinDegree+1) = Array<Scalar,1,MinDegree+1>::Ones();
-    }
+#include "SplineFwd.h"
 
-    /**
-    * \brief Creates a spline from a knot vector and control points.
-    * \param knots The spline's knot vector.
-    * \param ctrls The spline's control point vector.
-    **/
-    template <typename OtherVectorType, typename OtherArrayType>
-    Spline(const OtherVectorType& knots, const OtherArrayType& ctrls) : m_knots(knots), m_ctrls(ctrls) {}
-
-    /**
-    * \brief Copy constructor for splines.
-    * \param spline The input spline.
-    **/
-    template <int OtherDegree>
-    Spline(const Spline<Scalar, Dimension, OtherDegree>& spline) : 
-    m_knots(spline.knots()), m_ctrls(spline.ctrls()) {}
-
-    /**
-     * \brief Returns the knots of the underlying spline.
-     **/
-    const KnotVectorType& knots() const { return m_knots; }
-    
-    /**
-     * \brief Returns the knots of the underlying spline.
-     **/    
-    const ControlPointVectorType& ctrls() const { return m_ctrls; }
-
-    /**
-     * \brief Returns the spline value at a given site \f$u\f$.
-     *
-     * The function returns
-     * \f{align*}
-     *   C(u) & = \sum_{i=0}^{n}N_{i,p}P_i
-     * \f}
-     *
-     * \param u Parameter \f$u \in [0;1]\f$ at which the spline is evaluated.
-     * \return The spline value at the given location \f$u\f$.
-     **/
-    PointType operator()(Scalar u) const;
-
-    /**
-     * \brief Evaluation of spline derivatives of up-to given order.
-     *
-     * The function returns
-     * \f{align*}
-     *   \frac{d^i}{du^i}C(u) & = \sum_{i=0}^{n} \frac{d^i}{du^i} N_{i,p}(u)P_i
-     * \f}
-     * for i ranging between 0 and order.
-     *
-     * \param u Parameter \f$u \in [0;1]\f$ at which the spline derivative is evaluated.
-     * \param order The order up to which the derivatives are computed.
-     **/
-    typename SplineTraits<Spline>::DerivativeType
-      derivatives(Scalar u, DenseIndex order) const;
-
-    /**
-     * \copydoc Spline::derivatives
-     * Using the template version of this function is more efficieent since
-     * temporary objects are allocated on the stack whenever this is possible.
-     **/    
-    template <int DerivativeOrder>
-    typename SplineTraits<Spline,DerivativeOrder>::DerivativeType
-      derivatives(Scalar u, DenseIndex order = DerivativeOrder) const;
-
-    /**
-     * \brief Computes the non-zero basis functions at the given site.
-     *
-     * Splines have local support and a point from their image is defined
-     * by exactly \f$p+1\f$ control points \f$P_i\f$ where \f$p\f$ is the
-     * spline degree.
-     *
-     * This function computes the \f$p+1\f$ non-zero basis function values
-     * for a given parameter value \f$u\f$. It returns
-     * \f{align*}{
-     *   N_{i,p}(u), \hdots, N_{i+p+1,p}(u)
-     * \f}
-     *
-     * \param u Parameter \f$u \in [0;1]\f$ at which the non-zero basis functions 
-     *          are computed.
-     **/
-    typename SplineTraits<Spline>::BasisVectorType
-      basisFunctions(Scalar u) const;
-
-    /**
-     * \brief Computes the non-zero spline basis function derivatives up to given order.
-     *
-     * The function computes
-     * \f{align*}{
-     *   \frac{d^i}{du^i} N_{i,p}(u), \hdots, \frac{d^i}{du^i} N_{i+p+1,p}(u)
-     * \f}
-     * with i ranging from 0 up to the specified order.
-     *
-     * \param u Parameter \f$u \in [0;1]\f$ at which the non-zero basis function
-     *          derivatives are computed.
-     * \param order The order up to which the basis function derivatives are computes.
-     **/
-    typename SplineTraits<Spline>::BasisDerivativeType
-      basisFunctionDerivatives(Scalar u, DenseIndex order) const;
-
-    /**
-     * \copydoc Spline::basisFunctionDerivatives
-     * Using the template version of this function is more efficieent since
-     * temporary objects are allocated on the stack whenever this is possible.
-     **/    
-    template <int DerivativeOrder>
-    typename SplineTraits<Spline,DerivativeOrder>::BasisDerivativeType
-      basisFunctionDerivatives(Scalar u, DenseIndex order = DerivativeOrder) const;
-
-    /**
-     * \brief Returns the spline degree.
-     **/ 
-    DenseIndex degree() const;
-
-    /** 
-     * \brief Returns the span within the knot vector in which u is falling.
-     * \param u The site for which the span is determined.
-     **/
-    DenseIndex span(Scalar u) const;
-
-    /**
-     * \brief Computes the spang within the provided knot vector in which u is falling.
-     **/
-    static DenseIndex Span(typename SplineTraits<Spline>::Scalar u, DenseIndex degree, const typename SplineTraits<Spline>::KnotVectorType& knots);
-    
-    /**
-     * \brief Returns the spline's non-zero basis functions.
-     *
-     * The function computes and returns
-     * \f{align*}{
-     *   N_{i,p}(u), \hdots, N_{i+p+1,p}(u)
-     * \f}
-     *
-     * \param u The site at which the basis functions are computed.
-     * \param degree The degree of the underlying spline.
-     * \param knots The underlying spline's knot vector.
-     **/
-    static BasisVectorType BasisFunctions(Scalar u, DenseIndex degree, const KnotVectorType& knots);
-
-
-  private:
-    KnotVectorType m_knots; /*!< Knot vector. */
-    ControlPointVectorType  m_ctrls; /*!< Control points. */
-  };
-
-  template <typename _Scalar, int _Dim, int _Degree>
-  DenseIndex Spline<_Scalar, _Dim, _Degree>::Span(
-    typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::Scalar u,
-    DenseIndex degree,
-    const typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::KnotVectorType& knots)
-  {
-    // Piegl & Tiller, "The NURBS Book", A2.1 (p. 68)
-    if (u <= knots(0)) return degree;
-    const Scalar* pos = std::upper_bound(knots.data()+degree-1, knots.data()+knots.size()-degree-1, u);
-    return static_cast<DenseIndex>( std::distance(knots.data(), pos) - 1 );
+namespace Eigen {
+/**
+ * \ingroup Splines_Module
+ * \class Spline
+ * \brief A class representing multi-dimensional spline curves.
+ *
+ * The class represents B-splines with non-uniform knot vectors. Each control
+ * point of the B-spline is associated with a basis function
+ * \f{align*}
+ *   C(u) & = \sum_{i=0}^{n}N_{i,p}(u)P_i
+ * \f}
+ *
+ * \tparam Scalar_ The underlying data type (typically float or double)
+ * \tparam Dim_ The curve dimension (e.g. 2 or 3)
+ * \tparam Degree_ Per default set to Dynamic; could be set to the actual desired
+ *                degree for optimization purposes (would result in stack allocation
+ *                of several temporary variables).
+ **/
+template <typename Scalar_, int Dim_, int Degree_>
+class Spline {
+ public:
+  typedef Scalar_ Scalar; /*!< The spline curve's scalar type. */
+  enum { Dimension = Dim_ /*!< The spline curve's dimension. */ };
+  enum { Degree = Degree_ /*!< The spline curve's degree. */ };
+
+  /** \brief The point type the spline is representing. */
+  typedef typename SplineTraits<Spline>::PointType PointType;
+
+  /** \brief The data type used to store knot vectors. */
+  typedef typename SplineTraits<Spline>::KnotVectorType KnotVectorType;
+
+  /** \brief The data type used to store parameter vectors. */
+  typedef typename SplineTraits<Spline>::ParameterVectorType ParameterVectorType;
+
+  /** \brief The data type used to store non-zero basis functions. */
+  typedef typename SplineTraits<Spline>::BasisVectorType BasisVectorType;
+
+  /** \brief The data type used to store the values of the basis function derivatives. */
+  typedef typename SplineTraits<Spline>::BasisDerivativeType BasisDerivativeType;
+
+  /** \brief The data type representing the spline's control points. */
+  typedef typename SplineTraits<Spline>::ControlPointVectorType ControlPointVectorType;
+
+  /**
+   * \brief Creates a (constant) zero spline.
+   * For Splines with dynamic degree, the resulting degree will be 0.
+   **/
+  Spline()
+      : m_knots(1, (Degree == Dynamic ? 2 : 2 * Degree + 2)),
+        m_ctrls(ControlPointVectorType::Zero(Dimension, (Degree == Dynamic ? 1 : Degree + 1))) {
+    // in theory this code can go to the initializer list but it will get pretty
+    // much unreadable ...
+    enum { MinDegree = (Degree == Dynamic ? 0 : Degree) };
+    m_knots.template segment<MinDegree + 1>(0) = Array<Scalar, 1, MinDegree + 1>::Zero();
+    m_knots.template segment<MinDegree + 1>(MinDegree + 1) = Array<Scalar, 1, MinDegree + 1>::Ones();
   }
 
-  template <typename _Scalar, int _Dim, int _Degree>
-  typename Spline<_Scalar, _Dim, _Degree>::BasisVectorType
-    Spline<_Scalar, _Dim, _Degree>::BasisFunctions(
-    typename Spline<_Scalar, _Dim, _Degree>::Scalar u,
-    DenseIndex degree,
-    const typename Spline<_Scalar, _Dim, _Degree>::KnotVectorType& knots)
-  {
-    typedef typename Spline<_Scalar, _Dim, _Degree>::BasisVectorType BasisVectorType;
-
-    const DenseIndex p = degree;
-    const DenseIndex i = Spline::Span(u, degree, knots);
-
-    const KnotVectorType& U = knots;
-
-    BasisVectorType left(p+1); left(0) = Scalar(0);
-    BasisVectorType right(p+1); right(0) = Scalar(0);        
-
-    VectorBlock<BasisVectorType,Degree>(left,1,p) = u - VectorBlock<const KnotVectorType,Degree>(U,i+1-p,p).reverse();
-    VectorBlock<BasisVectorType,Degree>(right,1,p) = VectorBlock<const KnotVectorType,Degree>(U,i+1,p) - u;
-
-    BasisVectorType N(1,p+1);
-    N(0) = Scalar(1);
-    for (DenseIndex j=1; j<=p; ++j)
-    {
-      Scalar saved = Scalar(0);
-      for (DenseIndex r=0; r<j; r++)
-      {
-        const Scalar tmp = N(r)/(right(r+1)+left(j-r));
-        N[r] = saved + right(r+1)*tmp;
-        saved = left(j-r)*tmp;
-      }
-      N(j) = saved;
+  /**
+   * \brief Creates a spline from a knot vector and control points.
+   * \param knots The spline's knot vector.
+   * \param ctrls The spline's control point vector.
+   **/
+  template <typename OtherVectorType, typename OtherArrayType>
+  Spline(const OtherVectorType& knots, const OtherArrayType& ctrls) : m_knots(knots), m_ctrls(ctrls) {}
+
+  /**
+   * \brief Copy constructor for splines.
+   * \param spline The input spline.
+   **/
+  template <int OtherDegree>
+  Spline(const Spline<Scalar, Dimension, OtherDegree>& spline) : m_knots(spline.knots()), m_ctrls(spline.ctrls()) {}
+
+  /**
+   * \brief Returns the knots of the underlying spline.
+   **/
+  const KnotVectorType& knots() const { return m_knots; }
+
+  /**
+   * \brief Returns the ctrls of the underlying spline.
+   **/
+  const ControlPointVectorType& ctrls() const { return m_ctrls; }
+
+  /**
+   * \brief Returns the spline value at a given site \f$u\f$.
+   *
+   * The function returns
+   * \f{align*}
+   *   C(u) & = \sum_{i=0}^{n}N_{i,p}P_i
+   * \f}
+   *
+   * \param u Parameter \f$u \in [0;1]\f$ at which the spline is evaluated.
+   * \return The spline value at the given location \f$u\f$.
+   **/
+  PointType operator()(Scalar u) const;
+
+  /**
+   * \brief Evaluation of spline derivatives of up-to given order.
+   *
+   * The function returns
+   * \f{align*}
+   *   \frac{d^i}{du^i}C(u) & = \sum_{i=0}^{n} \frac{d^i}{du^i} N_{i,p}(u)P_i
+   * \f}
+   * for i ranging between 0 and order.
+   *
+   * \param u Parameter \f$u \in [0;1]\f$ at which the spline derivative is evaluated.
+   * \param order The order up to which the derivatives are computed.
+   **/
+  typename SplineTraits<Spline>::DerivativeType derivatives(Scalar u, DenseIndex order) const;
+
+  /**
+   * \copydoc Spline::derivatives
+   * Using the template version of this function is more efficieent since
+   * temporary objects are allocated on the stack whenever this is possible.
+   **/
+  template <int DerivativeOrder>
+  typename SplineTraits<Spline, DerivativeOrder>::DerivativeType derivatives(Scalar u,
+                                                                             DenseIndex order = DerivativeOrder) const;
+
+  /**
+   * \brief Computes the non-zero basis functions at the given site.
+   *
+   * Splines have local support and a point from their image is defined
+   * by exactly \f$p+1\f$ control points \f$P_i\f$ where \f$p\f$ is the
+   * spline degree.
+   *
+   * This function computes the \f$p+1\f$ non-zero basis function values
+   * for a given parameter value \f$u\f$. It returns
+   * \f{align*}{
+   *   N_{i,p}(u), \hdots, N_{i+p+1,p}(u)
+   * \f}
+   *
+   * \param u Parameter \f$u \in [0;1]\f$ at which the non-zero basis functions
+   *          are computed.
+   **/
+  typename SplineTraits<Spline>::BasisVectorType basisFunctions(Scalar u) const;
+
+  /**
+   * \brief Computes the non-zero spline basis function derivatives up to given order.
+   *
+   * The function computes
+   * \f{align*}{
+   *   \frac{d^i}{du^i} N_{i,p}(u), \hdots, \frac{d^i}{du^i} N_{i+p+1,p}(u)
+   * \f}
+   * with i ranging from 0 up to the specified order.
+   *
+   * \param u Parameter \f$u \in [0;1]\f$ at which the non-zero basis function
+   *          derivatives are computed.
+   * \param order The order up to which the basis function derivatives are computes.
+   **/
+  typename SplineTraits<Spline>::BasisDerivativeType basisFunctionDerivatives(Scalar u, DenseIndex order) const;
+
+  /**
+   * \copydoc Spline::basisFunctionDerivatives
+   * Using the template version of this function is more efficieent since
+   * temporary objects are allocated on the stack whenever this is possible.
+   **/
+  template <int DerivativeOrder>
+  typename SplineTraits<Spline, DerivativeOrder>::BasisDerivativeType basisFunctionDerivatives(
+      Scalar u, DenseIndex order = DerivativeOrder) const;
+
+  /**
+   * \brief Returns the spline degree.
+   **/
+  DenseIndex degree() const;
+
+  /**
+   * \brief Returns the span within the knot vector in which u is falling.
+   * \param u The site for which the span is determined.
+   **/
+  DenseIndex span(Scalar u) const;
+
+  /**
+   * \brief Computes the span within the provided knot vector in which u is falling.
+   **/
+  static DenseIndex Span(typename SplineTraits<Spline>::Scalar u, DenseIndex degree,
+                         const typename SplineTraits<Spline>::KnotVectorType& knots);
+
+  /**
+   * \brief Returns the spline's non-zero basis functions.
+   *
+   * The function computes and returns
+   * \f{align*}{
+   *   N_{i,p}(u), \hdots, N_{i+p+1,p}(u)
+   * \f}
+   *
+   * \param u The site at which the basis functions are computed.
+   * \param degree The degree of the underlying spline.
+   * \param knots The underlying spline's knot vector.
+   **/
+  static BasisVectorType BasisFunctions(Scalar u, DenseIndex degree, const KnotVectorType& knots);
+
+  /**
+   * \copydoc Spline::basisFunctionDerivatives
+   * \param degree The degree of the underlying spline
+   * \param knots The underlying spline's knot vector.
+   **/
+  static BasisDerivativeType BasisFunctionDerivatives(const Scalar u, const DenseIndex order, const DenseIndex degree,
+                                                      const KnotVectorType& knots);
+
+ private:
+  KnotVectorType m_knots;         /*!< Knot vector. */
+  ControlPointVectorType m_ctrls; /*!< Control points. */
+
+  template <typename DerivativeType>
+  static void BasisFunctionDerivativesImpl(const typename Spline<Scalar_, Dim_, Degree_>::Scalar u,
+                                           const DenseIndex order, const DenseIndex p,
+                                           const typename Spline<Scalar_, Dim_, Degree_>::KnotVectorType& U,
+                                           DerivativeType& N_);
+};
+
+template <typename Scalar_, int Dim_, int Degree_>
+DenseIndex Spline<Scalar_, Dim_, Degree_>::Span(
+    typename SplineTraits<Spline<Scalar_, Dim_, Degree_> >::Scalar u, DenseIndex degree,
+    const typename SplineTraits<Spline<Scalar_, Dim_, Degree_> >::KnotVectorType& knots) {
+  // Piegl & Tiller, "The NURBS Book", A2.1 (p. 68)
+  if (u <= knots(0)) return degree;
+  const Scalar* pos = std::upper_bound(knots.data() + degree - 1, knots.data() + knots.size() - degree - 1, u);
+  return static_cast<DenseIndex>(std::distance(knots.data(), pos) - 1);
+}
+
+template <typename Scalar_, int Dim_, int Degree_>
+typename Spline<Scalar_, Dim_, Degree_>::BasisVectorType Spline<Scalar_, Dim_, Degree_>::BasisFunctions(
+    typename Spline<Scalar_, Dim_, Degree_>::Scalar u, DenseIndex degree,
+    const typename Spline<Scalar_, Dim_, Degree_>::KnotVectorType& knots) {
+  const DenseIndex p = degree;
+  const DenseIndex i = Spline::Span(u, degree, knots);
+
+  const KnotVectorType& U = knots;
+
+  BasisVectorType left(p + 1);
+  left(0) = Scalar(0);
+  BasisVectorType right(p + 1);
+  right(0) = Scalar(0);
+
+  VectorBlock<BasisVectorType, Degree>(left, 1, p) =
+      u - VectorBlock<const KnotVectorType, Degree>(U, i + 1 - p, p).reverse();
+  VectorBlock<BasisVectorType, Degree>(right, 1, p) = VectorBlock<const KnotVectorType, Degree>(U, i + 1, p) - u;
+
+  BasisVectorType N(1, p + 1);
+  N(0) = Scalar(1);
+  for (DenseIndex j = 1; j <= p; ++j) {
+    Scalar saved = Scalar(0);
+    for (DenseIndex r = 0; r < j; r++) {
+      const Scalar tmp = N(r) / (right(r + 1) + left(j - r));
+      N[r] = saved + right(r + 1) * tmp;
+      saved = left(j - r) * tmp;
     }
-    return N;
+    N(j) = saved;
   }
+  return N;
+}
 
-  template <typename _Scalar, int _Dim, int _Degree>
-  DenseIndex Spline<_Scalar, _Dim, _Degree>::degree() const
-  {
-    if (_Degree == Dynamic)
-      return m_knots.size() - m_ctrls.cols() - 1;
-    else
-      return _Degree;
-  }
+template <typename Scalar_, int Dim_, int Degree_>
+DenseIndex Spline<Scalar_, Dim_, Degree_>::degree() const {
+  if (Degree_ == Dynamic)
+    return m_knots.size() - m_ctrls.cols() - 1;
+  else
+    return Degree_;
+}
 
-  template <typename _Scalar, int _Dim, int _Degree>
-  DenseIndex Spline<_Scalar, _Dim, _Degree>::span(Scalar u) const
-  {
-    return Spline::Span(u, degree(), knots());
-  }
+template <typename Scalar_, int Dim_, int Degree_>
+DenseIndex Spline<Scalar_, Dim_, Degree_>::span(Scalar u) const {
+  return Spline::Span(u, degree(), knots());
+}
 
-  template <typename _Scalar, int _Dim, int _Degree>
-  typename Spline<_Scalar, _Dim, _Degree>::PointType Spline<_Scalar, _Dim, _Degree>::operator()(Scalar u) const
-  {
-    enum { Order = SplineTraits<Spline>::OrderAtCompileTime };
+template <typename Scalar_, int Dim_, int Degree_>
+typename Spline<Scalar_, Dim_, Degree_>::PointType Spline<Scalar_, Dim_, Degree_>::operator()(Scalar u) const {
+  enum { Order = SplineTraits<Spline>::OrderAtCompileTime };
 
-    const DenseIndex span = this->span(u);
-    const DenseIndex p = degree();
-    const BasisVectorType basis_funcs = basisFunctions(u);
+  const DenseIndex span = this->span(u);
+  const DenseIndex p = degree();
+  const BasisVectorType basis_funcs = basisFunctions(u);
 
-    const Replicate<BasisVectorType,Dimension,1> ctrl_weights(basis_funcs);
-    const Block<const ControlPointVectorType,Dimension,Order> ctrl_pts(ctrls(),0,span-p,Dimension,p+1);
-    return (ctrl_weights * ctrl_pts).rowwise().sum();
-  }
+  const Replicate<BasisVectorType, Dimension, 1> ctrl_weights(basis_funcs);
+  const Block<const ControlPointVectorType, Dimension, Order> ctrl_pts(ctrls(), 0, span - p, Dimension, p + 1);
+  return (ctrl_weights * ctrl_pts).rowwise().sum();
+}
 
-  /* --------------------------------------------------------------------------------------------- */
+/* --------------------------------------------------------------------------------------------- */
 
-  template <typename SplineType, typename DerivativeType>
-  void derivativesImpl(const SplineType& spline, typename SplineType::Scalar u, DenseIndex order, DerivativeType& der)
-  {    
-    enum { Dimension = SplineTraits<SplineType>::Dimension };
-    enum { Order = SplineTraits<SplineType>::OrderAtCompileTime };
-    enum { DerivativeOrder = DerivativeType::ColsAtCompileTime };
+template <typename SplineType, typename DerivativeType>
+void derivativesImpl(const SplineType& spline, typename SplineType::Scalar u, DenseIndex order, DerivativeType& der) {
+  enum { Dimension = SplineTraits<SplineType>::Dimension };
+  enum { Order = SplineTraits<SplineType>::OrderAtCompileTime };
+  enum { DerivativeOrder = DerivativeType::ColsAtCompileTime };
 
-    typedef typename SplineTraits<SplineType>::ControlPointVectorType ControlPointVectorType;
-    typedef typename SplineTraits<SplineType,DerivativeOrder>::BasisDerivativeType BasisDerivativeType;
-    typedef typename BasisDerivativeType::ConstRowXpr BasisDerivativeRowXpr;    
+  typedef typename SplineTraits<SplineType>::ControlPointVectorType ControlPointVectorType;
+  typedef typename SplineTraits<SplineType, DerivativeOrder>::BasisDerivativeType BasisDerivativeType;
+  typedef typename BasisDerivativeType::ConstRowXpr BasisDerivativeRowXpr;
 
-    const DenseIndex p = spline.degree();
-    const DenseIndex span = spline.span(u);
+  const DenseIndex p = spline.degree();
+  const DenseIndex span = spline.span(u);
 
-    const DenseIndex n = (std::min)(p, order);
+  const DenseIndex n = (std::min)(p, order);
 
-    der.resize(Dimension,n+1);
+  der.resize(Dimension, n + 1);
 
-    // Retrieve the basis function derivatives up to the desired order...    
-    const BasisDerivativeType basis_func_ders = spline.template basisFunctionDerivatives<DerivativeOrder>(u, n+1);
+  // Retrieve the basis function derivatives up to the desired order...
+  const BasisDerivativeType basis_func_ders = spline.template basisFunctionDerivatives<DerivativeOrder>(u, n + 1);
 
-    // ... and perform the linear combinations of the control points.
-    for (DenseIndex der_order=0; der_order<n+1; ++der_order)
-    {
-      const Replicate<BasisDerivativeRowXpr,Dimension,1> ctrl_weights( basis_func_ders.row(der_order) );
-      const Block<const ControlPointVectorType,Dimension,Order> ctrl_pts(spline.ctrls(),0,span-p,Dimension,p+1);
-      der.col(der_order) = (ctrl_weights * ctrl_pts).rowwise().sum();
-    }
+  // ... and perform the linear combinations of the control points.
+  for (DenseIndex der_order = 0; der_order < n + 1; ++der_order) {
+    const Replicate<BasisDerivativeRowXpr, Dimension, 1> ctrl_weights(basis_func_ders.row(der_order));
+    const Block<const ControlPointVectorType, Dimension, Order> ctrl_pts(spline.ctrls(), 0, span - p, Dimension, p + 1);
+    der.col(der_order) = (ctrl_weights * ctrl_pts).rowwise().sum();
   }
+}
 
-  template <typename _Scalar, int _Dim, int _Degree>
-  typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::DerivativeType
-    Spline<_Scalar, _Dim, _Degree>::derivatives(Scalar u, DenseIndex order) const
-  {
-    typename SplineTraits< Spline >::DerivativeType res;
-    derivativesImpl(*this, u, order, res);
-    return res;
-  }
+template <typename Scalar_, int Dim_, int Degree_>
+typename SplineTraits<Spline<Scalar_, Dim_, Degree_> >::DerivativeType Spline<Scalar_, Dim_, Degree_>::derivatives(
+    Scalar u, DenseIndex order) const {
+  typename SplineTraits<Spline>::DerivativeType res;
+  derivativesImpl(*this, u, order, res);
+  return res;
+}
 
-  template <typename _Scalar, int _Dim, int _Degree>
-  template <int DerivativeOrder>
-  typename SplineTraits< Spline<_Scalar, _Dim, _Degree>, DerivativeOrder >::DerivativeType
-    Spline<_Scalar, _Dim, _Degree>::derivatives(Scalar u, DenseIndex order) const
-  {
-    typename SplineTraits< Spline, DerivativeOrder >::DerivativeType res;
-    derivativesImpl(*this, u, order, res);
-    return res;
-  }
+template <typename Scalar_, int Dim_, int Degree_>
+template <int DerivativeOrder>
+typename SplineTraits<Spline<Scalar_, Dim_, Degree_>, DerivativeOrder>::DerivativeType
+Spline<Scalar_, Dim_, Degree_>::derivatives(Scalar u, DenseIndex order) const {
+  typename SplineTraits<Spline, DerivativeOrder>::DerivativeType res;
+  derivativesImpl(*this, u, order, res);
+  return res;
+}
 
-  template <typename _Scalar, int _Dim, int _Degree>
-  typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::BasisVectorType
-    Spline<_Scalar, _Dim, _Degree>::basisFunctions(Scalar u) const
-  {
-    return Spline::BasisFunctions(u, degree(), knots());
-  }
+template <typename Scalar_, int Dim_, int Degree_>
+typename SplineTraits<Spline<Scalar_, Dim_, Degree_> >::BasisVectorType Spline<Scalar_, Dim_, Degree_>::basisFunctions(
+    Scalar u) const {
+  return Spline::BasisFunctions(u, degree(), knots());
+}
 
-  /* --------------------------------------------------------------------------------------------- */
+/* --------------------------------------------------------------------------------------------- */
 
-  template <typename SplineType, typename DerivativeType>
-  void basisFunctionDerivativesImpl(const SplineType& spline, typename SplineType::Scalar u, DenseIndex order, DerivativeType& N_)
-  {
-    enum { Order = SplineTraits<SplineType>::OrderAtCompileTime };
+template <typename Scalar_, int Dim_, int Degree_>
+template <typename DerivativeType>
+void Spline<Scalar_, Dim_, Degree_>::BasisFunctionDerivativesImpl(
+    const typename Spline<Scalar_, Dim_, Degree_>::Scalar u, const DenseIndex order, const DenseIndex p,
+    const typename Spline<Scalar_, Dim_, Degree_>::KnotVectorType& U, DerivativeType& N_) {
+  typedef Spline<Scalar_, Dim_, Degree_> SplineType;
+  enum { Order = SplineTraits<SplineType>::OrderAtCompileTime };
 
-    typedef typename SplineTraits<SplineType>::Scalar Scalar;
-    typedef typename SplineTraits<SplineType>::BasisVectorType BasisVectorType;
-    typedef typename SplineTraits<SplineType>::KnotVectorType KnotVectorType;
+  const DenseIndex span = SplineType::Span(u, p, U);
 
-    const KnotVectorType& U = spline.knots();
+  const DenseIndex n = (std::min)(p, order);
 
-    const DenseIndex p = spline.degree();
-    const DenseIndex span = spline.span(u);
+  N_.resize(n + 1, p + 1);
 
-    const DenseIndex n = (std::min)(p, order);
+  BasisVectorType left = BasisVectorType::Zero(p + 1);
+  BasisVectorType right = BasisVectorType::Zero(p + 1);
 
-    N_.resize(n+1, p+1);
+  Matrix<Scalar, Order, Order> ndu(p + 1, p + 1);
 
-    BasisVectorType left = BasisVectorType::Zero(p+1);
-    BasisVectorType right = BasisVectorType::Zero(p+1);
+  Scalar saved, temp;  // FIXME These were double instead of Scalar. Was there a reason for that?
 
-    Matrix<Scalar,Order,Order> ndu(p+1,p+1);
+  ndu(0, 0) = 1.0;
 
-    double saved, temp;
+  DenseIndex j;
+  for (j = 1; j <= p; ++j) {
+    left[j] = u - U[span + 1 - j];
+    right[j] = U[span + j] - u;
+    saved = 0.0;
 
-    ndu(0,0) = 1.0;
+    for (DenseIndex r = 0; r < j; ++r) {
+      /* Lower triangle */
+      ndu(j, r) = right[r + 1] + left[j - r];
+      temp = ndu(r, j - 1) / ndu(j, r);
+      /* Upper triangle */
+      ndu(r, j) = static_cast<Scalar>(saved + right[r + 1] * temp);
+      saved = left[j - r] * temp;
+    }
 
-    DenseIndex j;
-    for (j=1; j<=p; ++j)
-    {
-      left[j] = u-U[span+1-j];
-      right[j] = U[span+j]-u;
-      saved = 0.0;
+    ndu(j, j) = static_cast<Scalar>(saved);
+  }
 
-      for (DenseIndex r=0; r<j; ++r)
-      {
-        /* Lower triangle */
-        ndu(j,r) = right[r+1]+left[j-r];
-        temp = ndu(r,j-1)/ndu(j,r);
-        /* Upper triangle */
-        ndu(r,j) = static_cast<Scalar>(saved+right[r+1] * temp);
-        saved = left[j-r] * temp;
+  for (j = p; j >= 0; --j) N_(0, j) = ndu(j, p);
+
+  // Compute the derivatives
+  DerivativeType a(n + 1, p + 1);
+  DenseIndex r = 0;
+  for (; r <= p; ++r) {
+    DenseIndex s1, s2;
+    s1 = 0;
+    s2 = 1;  // alternate rows in array a
+    a(0, 0) = 1.0;
+
+    // Compute the k-th derivative
+    for (DenseIndex k = 1; k <= static_cast<DenseIndex>(n); ++k) {
+      Scalar d = 0.0;
+      DenseIndex rk, pk, j1, j2;
+      rk = r - k;
+      pk = p - k;
+
+      if (r >= k) {
+        a(s2, 0) = a(s1, 0) / ndu(pk + 1, rk);
+        d = a(s2, 0) * ndu(rk, pk);
       }
 
-      ndu(j,j) = static_cast<Scalar>(saved);
-    }
+      if (rk >= -1)
+        j1 = 1;
+      else
+        j1 = -rk;
 
-    for (j = p; j>=0; --j) 
-      N_(0,j) = ndu(j,p);
-
-    // Compute the derivatives
-    DerivativeType a(n+1,p+1);
-    DenseIndex r=0;
-    for (; r<=p; ++r)
-    {
-      DenseIndex s1,s2;
-      s1 = 0; s2 = 1; // alternate rows in array a
-      a(0,0) = 1.0;
-
-      // Compute the k-th derivative
-      for (DenseIndex k=1; k<=static_cast<DenseIndex>(n); ++k)
-      {
-        double d = 0.0;
-        DenseIndex rk,pk,j1,j2;
-        rk = r-k; pk = p-k;
-
-        if (r>=k)
-        {
-          a(s2,0) = a(s1,0)/ndu(pk+1,rk);
-          d = a(s2,0)*ndu(rk,pk);
-        }
-
-        if (rk>=-1) j1 = 1;
-        else        j1 = -rk;
-
-        if (r-1 <= pk) j2 = k-1;
-        else           j2 = p-r;
-
-        for (j=j1; j<=j2; ++j)
-        {
-          a(s2,j) = (a(s1,j)-a(s1,j-1))/ndu(pk+1,rk+j);
-          d += a(s2,j)*ndu(rk+j,pk);
-        }
-
-        if (r<=pk)
-        {
-          a(s2,k) = -a(s1,k-1)/ndu(pk+1,r);
-          d += a(s2,k)*ndu(r,pk);
-        }
-
-        N_(k,r) = static_cast<Scalar>(d);
-        j = s1; s1 = s2; s2 = j; // Switch rows
+      if (r - 1 <= pk)
+        j2 = k - 1;
+      else
+        j2 = p - r;
+
+      for (j = j1; j <= j2; ++j) {
+        a(s2, j) = (a(s1, j) - a(s1, j - 1)) / ndu(pk + 1, rk + j);
+        d += a(s2, j) * ndu(rk + j, pk);
       }
-    }
 
-    /* Multiply through by the correct factors */
-    /* (Eq. [2.9])                             */
-    r = p;
-    for (DenseIndex k=1; k<=static_cast<DenseIndex>(n); ++k)
-    {
-      for (DenseIndex j=p; j>=0; --j) N_(k,j) *= r;
-      r *= p-k;
+      if (r <= pk) {
+        a(s2, k) = -a(s1, k - 1) / ndu(pk + 1, r);
+        d += a(s2, k) * ndu(r, pk);
+      }
+
+      N_(k, r) = static_cast<Scalar>(d);
+      j = s1;
+      s1 = s2;
+      s2 = j;  // Switch rows
     }
   }
 
-  template <typename _Scalar, int _Dim, int _Degree>
-  typename SplineTraits< Spline<_Scalar, _Dim, _Degree> >::BasisDerivativeType
-    Spline<_Scalar, _Dim, _Degree>::basisFunctionDerivatives(Scalar u, DenseIndex order) const
-  {
-    typename SplineTraits< Spline >::BasisDerivativeType der;
-    basisFunctionDerivativesImpl(*this, u, order, der);
-    return der;
+  /* Multiply through by the correct factors */
+  /* (Eq. [2.9])                             */
+  r = p;
+  for (DenseIndex k = 1; k <= static_cast<DenseIndex>(n); ++k) {
+    for (j = p; j >= 0; --j) N_(k, j) *= r;
+    r *= p - k;
   }
+}
 
-  template <typename _Scalar, int _Dim, int _Degree>
-  template <int DerivativeOrder>
-  typename SplineTraits< Spline<_Scalar, _Dim, _Degree>, DerivativeOrder >::BasisDerivativeType
-    Spline<_Scalar, _Dim, _Degree>::basisFunctionDerivatives(Scalar u, DenseIndex order) const
-  {
-    typename SplineTraits< Spline, DerivativeOrder >::BasisDerivativeType der;
-    basisFunctionDerivativesImpl(*this, u, order, der);
-    return der;
-  }
+template <typename Scalar_, int Dim_, int Degree_>
+typename SplineTraits<Spline<Scalar_, Dim_, Degree_> >::BasisDerivativeType
+Spline<Scalar_, Dim_, Degree_>::basisFunctionDerivatives(Scalar u, DenseIndex order) const {
+  typename SplineTraits<Spline<Scalar_, Dim_, Degree_> >::BasisDerivativeType der;
+  BasisFunctionDerivativesImpl(u, order, degree(), knots(), der);
+  return der;
+}
+
+template <typename Scalar_, int Dim_, int Degree_>
+template <int DerivativeOrder>
+typename SplineTraits<Spline<Scalar_, Dim_, Degree_>, DerivativeOrder>::BasisDerivativeType
+Spline<Scalar_, Dim_, Degree_>::basisFunctionDerivatives(Scalar u, DenseIndex order) const {
+  typename SplineTraits<Spline<Scalar_, Dim_, Degree_>, DerivativeOrder>::BasisDerivativeType der;
+  BasisFunctionDerivativesImpl(u, order, degree(), knots(), der);
+  return der;
+}
+
+template <typename Scalar_, int Dim_, int Degree_>
+typename SplineTraits<Spline<Scalar_, Dim_, Degree_> >::BasisDerivativeType
+Spline<Scalar_, Dim_, Degree_>::BasisFunctionDerivatives(
+    const typename Spline<Scalar_, Dim_, Degree_>::Scalar u, const DenseIndex order, const DenseIndex degree,
+    const typename Spline<Scalar_, Dim_, Degree_>::KnotVectorType& knots) {
+  typename SplineTraits<Spline>::BasisDerivativeType der;
+  BasisFunctionDerivativesImpl(u, order, degree, knots, der);
+  return der;
 }
+}  // namespace Eigen
 
-#endif // EIGEN_SPLINE_H
+#endif  // EIGEN_SPLINE_H
diff --git a/inst/include/unsupported/Eigen/src/Splines/SplineFitting.h b/inst/include/unsupported/Eigen/src/Splines/SplineFitting.h
index 0265d532..f6a1111e 100644
--- a/inst/include/unsupported/Eigen/src/Splines/SplineFitting.h
+++ b/inst/include/unsupported/Eigen/src/Splines/SplineFitting.h
@@ -10,147 +10,384 @@
 #ifndef EIGEN_SPLINE_FITTING_H
 #define EIGEN_SPLINE_FITTING_H
 
+#include <algorithm>
+#include <functional>
 #include <numeric>
+#include <vector>
+
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
 
 #include "SplineFwd.h"
 
-#include <Eigen/QR>
+#include "../../../../Eigen/LU"
+#include "../../../../Eigen/QR"
+
+namespace Eigen {
+/**
+ * \brief Computes knot averages.
+ * \ingroup Splines_Module
+ *
+ * The knots are computed as
+ * \f{align*}
+ *  u_0 & = \hdots = u_p = 0 \\
+ *  u_{m-p} & = \hdots = u_{m} = 1 \\
+ *  u_{j+p} & = \frac{1}{p}\sum_{i=j}^{j+p-1}\bar{u}_i \quad\quad j=1,\hdots,n-p
+ * \f}
+ * where \f$p\f$ is the degree and \f$m+1\f$ the number knots
+ * of the desired interpolating spline.
+ *
+ * \param[in] parameters The input parameters. During interpolation one for each data point.
+ * \param[in] degree The spline degree which is used during the interpolation.
+ * \param[out] knots The output knot vector.
+ *
+ * \sa Les Piegl and Wayne Tiller, The NURBS book (2nd ed.), 1997, 9.2.1 Global Curve Interpolation to Point Data
+ **/
+template <typename KnotVectorType>
+void KnotAveraging(const KnotVectorType& parameters, DenseIndex degree, KnotVectorType& knots) {
+  knots.resize(parameters.size() + degree + 1);
+
+  for (DenseIndex j = 1; j < parameters.size() - degree; ++j) knots(j + degree) = parameters.segment(j, degree).mean();
+
+  knots.segment(0, degree + 1) = KnotVectorType::Zero(degree + 1);
+  knots.segment(knots.size() - degree - 1, degree + 1) = KnotVectorType::Ones(degree + 1);
+}
+
+/**
+ * \brief Computes knot averages when derivative constraints are present.
+ * Note that this is a technical interpretation of the referenced article
+ * since the algorithm contained therein is incorrect as written.
+ * \ingroup Splines_Module
+ *
+ * \param[in] parameters The parameters at which the interpolation B-Spline
+ *            will intersect the given interpolation points. The parameters
+ *            are assumed to be a non-decreasing sequence.
+ * \param[in] degree The degree of the interpolating B-Spline. This must be
+ *            greater than zero.
+ * \param[in] derivativeIndices The indices corresponding to parameters at
+ *            which there are derivative constraints. The indices are assumed
+ *            to be a non-decreasing sequence.
+ * \param[out] knots The calculated knot vector. These will be returned as a
+ *             non-decreasing sequence
+ *
+ * \sa Les A. Piegl, Khairan Rajab, Volha Smarodzinana. 2008.
+ * Curve interpolation with directional constraints for engineering design.
+ * Engineering with Computers
+ **/
+template <typename KnotVectorType, typename ParameterVectorType, typename IndexArray>
+void KnotAveragingWithDerivatives(const ParameterVectorType& parameters, const unsigned int degree,
+                                  const IndexArray& derivativeIndices, KnotVectorType& knots) {
+  typedef typename ParameterVectorType::Scalar Scalar;
+
+  DenseIndex numParameters = parameters.size();
+  DenseIndex numDerivatives = derivativeIndices.size();
+
+  if (numDerivatives < 1) {
+    KnotAveraging(parameters, degree, knots);
+    return;
+  }
+
+  DenseIndex startIndex;
+  DenseIndex endIndex;
+
+  DenseIndex numInternalDerivatives = numDerivatives;
+
+  if (derivativeIndices[0] == 0) {
+    startIndex = 0;
+    --numInternalDerivatives;
+  } else {
+    startIndex = 1;
+  }
+  if (derivativeIndices[numDerivatives - 1] == numParameters - 1) {
+    endIndex = numParameters - degree;
+    --numInternalDerivatives;
+  } else {
+    endIndex = numParameters - degree - 1;
+  }
+
+  // There are (endIndex - startIndex + 1) knots obtained from the averaging
+  // and 2 for the first and last parameters.
+  DenseIndex numAverageKnots = endIndex - startIndex + 3;
+  KnotVectorType averageKnots(numAverageKnots);
+  averageKnots[0] = parameters[0];
+
+  int newKnotIndex = 0;
+  for (DenseIndex i = startIndex; i <= endIndex; ++i)
+    averageKnots[++newKnotIndex] = parameters.segment(i, degree).mean();
+  averageKnots[++newKnotIndex] = parameters[numParameters - 1];
+
+  newKnotIndex = -1;
+
+  ParameterVectorType temporaryParameters(numParameters + 1);
+  KnotVectorType derivativeKnots(numInternalDerivatives);
+  for (DenseIndex i = 0; i < numAverageKnots - 1; ++i) {
+    temporaryParameters[0] = averageKnots[i];
+    ParameterVectorType parameterIndices(numParameters);
+    int temporaryParameterIndex = 1;
+    for (DenseIndex j = 0; j < numParameters; ++j) {
+      Scalar parameter = parameters[j];
+      if (parameter >= averageKnots[i] && parameter < averageKnots[i + 1]) {
+        parameterIndices[temporaryParameterIndex] = j;
+        temporaryParameters[temporaryParameterIndex++] = parameter;
+      }
+    }
+    temporaryParameters[temporaryParameterIndex] = averageKnots[i + 1];
+
+    for (int j = 0; j <= temporaryParameterIndex - 2; ++j) {
+      for (DenseIndex k = 0; k < derivativeIndices.size(); ++k) {
+        if (parameterIndices[j + 1] == derivativeIndices[k] && parameterIndices[j + 1] != 0 &&
+            parameterIndices[j + 1] != numParameters - 1) {
+          derivativeKnots[++newKnotIndex] = temporaryParameters.segment(j, 3).mean();
+          break;
+        }
+      }
+    }
+  }
+
+  KnotVectorType temporaryKnots(averageKnots.size() + derivativeKnots.size());
+
+  std::merge(averageKnots.data(), averageKnots.data() + averageKnots.size(), derivativeKnots.data(),
+             derivativeKnots.data() + derivativeKnots.size(), temporaryKnots.data());
+
+  // Number of knots (one for each point and derivative) plus spline order.
+  DenseIndex numKnots = numParameters + numDerivatives + degree + 1;
+  knots.resize(numKnots);
+
+  knots.head(degree).fill(temporaryKnots[0]);
+  knots.tail(degree).fill(temporaryKnots.template tail<1>()[0]);
+  knots.segment(degree, temporaryKnots.size()) = temporaryKnots;
+}
+
+/**
+ * \brief Computes chord length parameters which are required for spline interpolation.
+ * \ingroup Splines_Module
+ *
+ * \param[in] pts The data points to which a spline should be fit.
+ * \param[out] chord_lengths The resulting chord length vector.
+ *
+ * \sa Les Piegl and Wayne Tiller, The NURBS book (2nd ed.), 1997, 9.2.1 Global Curve Interpolation to Point Data
+ **/
+template <typename PointArrayType, typename KnotVectorType>
+void ChordLengths(const PointArrayType& pts, KnotVectorType& chord_lengths) {
+  typedef typename KnotVectorType::Scalar Scalar;
+
+  const DenseIndex n = pts.cols();
+
+  // 1. compute the column-wise norms
+  chord_lengths.resize(pts.cols());
+  chord_lengths[0] = 0;
+  chord_lengths.rightCols(n - 1) =
+      (pts.array().leftCols(n - 1) - pts.array().rightCols(n - 1)).matrix().colwise().norm();
+
+  // 2. compute the partial sums
+  std::partial_sum(chord_lengths.data(), chord_lengths.data() + n, chord_lengths.data());
+
+  // 3. normalize the data
+  chord_lengths /= chord_lengths(n - 1);
+  chord_lengths(n - 1) = Scalar(1);
+}
+
+/**
+ * \brief Spline fitting methods.
+ * \ingroup Splines_Module
+ **/
+template <typename SplineType>
+struct SplineFitting {
+  typedef typename SplineType::KnotVectorType KnotVectorType;
+  typedef typename SplineType::ParameterVectorType ParameterVectorType;
 
-namespace Eigen
-{
   /**
-   * \brief Computes knot averages.
-   * \ingroup Splines_Module
+   * \brief Fits an interpolating Spline to the given data points.
    *
-   * The knots are computed as
-   * \f{align*}
-   *  u_0 & = \hdots = u_p = 0 \\
-   *  u_{m-p} & = \hdots = u_{m} = 1 \\
-   *  u_{j+p} & = \frac{1}{p}\sum_{i=j}^{j+p-1}\bar{u}_i \quad\quad j=1,\hdots,n-p
-   * \f}
-   * where \f$p\f$ is the degree and \f$m+1\f$ the number knots
-   * of the desired interpolating spline.
+   * \param pts The points for which an interpolating spline will be computed.
+   * \param degree The degree of the interpolating spline.
    *
-   * \param[in] parameters The input parameters. During interpolation one for each data point.
-   * \param[in] degree The spline degree which is used during the interpolation.
-   * \param[out] knots The output knot vector.
-   *
-   * \sa Les Piegl and Wayne Tiller, The NURBS book (2nd ed.), 1997, 9.2.1 Global Curve Interpolation to Point Data
+   * \returns A spline interpolating the initially provided points.
    **/
-  template <typename KnotVectorType>
-  void KnotAveraging(const KnotVectorType& parameters, DenseIndex degree, KnotVectorType& knots)
-  {
-    knots.resize(parameters.size()+degree+1);      
+  template <typename PointArrayType>
+  static SplineType Interpolate(const PointArrayType& pts, DenseIndex degree);
 
-    for (DenseIndex j=1; j<parameters.size()-degree; ++j)
-      knots(j+degree) = parameters.segment(j,degree).mean();
+  /**
+   * \brief Fits an interpolating Spline to the given data points.
+   *
+   * \param pts The points for which an interpolating spline will be computed.
+   * \param degree The degree of the interpolating spline.
+   * \param knot_parameters The knot parameters for the interpolation.
+   *
+   * \returns A spline interpolating the initially provided points.
+   **/
+  template <typename PointArrayType>
+  static SplineType Interpolate(const PointArrayType& pts, DenseIndex degree, const KnotVectorType& knot_parameters);
 
-    knots.segment(0,degree+1) = KnotVectorType::Zero(degree+1);
-    knots.segment(knots.size()-degree-1,degree+1) = KnotVectorType::Ones(degree+1);
-  }
+  /**
+   * \brief Fits an interpolating spline to the given data points and
+   * derivatives.
+   *
+   * \param points The points for which an interpolating spline will be computed.
+   * \param derivatives The desired derivatives of the interpolating spline at interpolation
+   *                    points.
+   * \param derivativeIndices An array indicating which point each derivative belongs to. This
+   *                          must be the same size as @a derivatives.
+   * \param degree The degree of the interpolating spline.
+   *
+   * \returns A spline interpolating @a points with @a derivatives at those points.
+   *
+   * \sa Les A. Piegl, Khairan Rajab, Volha Smarodzinana. 2008.
+   * Curve interpolation with directional constraints for engineering design.
+   * Engineering with Computers
+   **/
+  template <typename PointArrayType, typename IndexArray>
+  static SplineType InterpolateWithDerivatives(const PointArrayType& points, const PointArrayType& derivatives,
+                                               const IndexArray& derivativeIndices, const unsigned int degree);
 
   /**
-   * \brief Computes chord length parameters which are required for spline interpolation.
-   * \ingroup Splines_Module
+   * \brief Fits an interpolating spline to the given data points and derivatives.
+   *
+   * \param points The points for which an interpolating spline will be computed.
+   * \param derivatives The desired derivatives of the interpolating spline at interpolation points.
+   * \param derivativeIndices An array indicating which point each derivative belongs to. This
+   *                          must be the same size as @a derivatives.
+   * \param degree The degree of the interpolating spline.
+   * \param parameters The parameters corresponding to the interpolation points.
    *
-   * \param[in] pts The data points to which a spline should be fit.
-   * \param[out] chord_lengths The resulting chord lenggth vector.
+   * \returns A spline interpolating @a points with @a derivatives at those points.
    *
-   * \sa Les Piegl and Wayne Tiller, The NURBS book (2nd ed.), 1997, 9.2.1 Global Curve Interpolation to Point Data
-   **/   
-  template <typename PointArrayType, typename KnotVectorType>
-  void ChordLengths(const PointArrayType& pts, KnotVectorType& chord_lengths)
-  {
-    typedef typename KnotVectorType::Scalar Scalar;
-
-    const DenseIndex n = pts.cols();
-
-    // 1. compute the column-wise norms
-    chord_lengths.resize(pts.cols());
-    chord_lengths[0] = 0;
-    chord_lengths.rightCols(n-1) = (pts.array().leftCols(n-1) - pts.array().rightCols(n-1)).matrix().colwise().norm();
-
-    // 2. compute the partial sums
-    std::partial_sum(chord_lengths.data(), chord_lengths.data()+n, chord_lengths.data());
-
-    // 3. normalize the data
-    chord_lengths /= chord_lengths(n-1);
-    chord_lengths(n-1) = Scalar(1);
+   * \sa Les A. Piegl, Khairan Rajab, Volha Smarodzinana. 2008.
+   * Curve interpolation with directional constraints for engineering design.
+   * Engineering with Computers
+   */
+  template <typename PointArrayType, typename IndexArray>
+  static SplineType InterpolateWithDerivatives(const PointArrayType& points, const PointArrayType& derivatives,
+                                               const IndexArray& derivativeIndices, const unsigned int degree,
+                                               const ParameterVectorType& parameters);
+};
+
+template <typename SplineType>
+template <typename PointArrayType>
+SplineType SplineFitting<SplineType>::Interpolate(const PointArrayType& pts, DenseIndex degree,
+                                                  const KnotVectorType& knot_parameters) {
+  typedef typename SplineType::KnotVectorType::Scalar Scalar;
+  typedef typename SplineType::ControlPointVectorType ControlPointVectorType;
+
+  typedef Matrix<Scalar, Dynamic, Dynamic> MatrixType;
+
+  KnotVectorType knots;
+  KnotAveraging(knot_parameters, degree, knots);
+
+  DenseIndex n = pts.cols();
+  MatrixType A = MatrixType::Zero(n, n);
+  for (DenseIndex i = 1; i < n - 1; ++i) {
+    const DenseIndex span = SplineType::Span(knot_parameters[i], degree, knots);
+
+    // The segment call should somehow be told the spline order at compile time.
+    A.row(i).segment(span - degree, degree + 1) = SplineType::BasisFunctions(knot_parameters[i], degree, knots);
   }
+  A(0, 0) = 1.0;
+  A(n - 1, n - 1) = 1.0;
 
-  /**
-   * \brief Spline fitting methods.
-   * \ingroup Splines_Module
-   **/     
-  template <typename SplineType>
-  struct SplineFitting
-  {
-    typedef typename SplineType::KnotVectorType KnotVectorType;
-
-    /**
-     * \brief Fits an interpolating Spline to the given data points.
-     *
-     * \param pts The points for which an interpolating spline will be computed.
-     * \param degree The degree of the interpolating spline.
-     *
-     * \returns A spline interpolating the initially provided points.
-     **/
-    template <typename PointArrayType>
-    static SplineType Interpolate(const PointArrayType& pts, DenseIndex degree);
-
-    /**
-     * \brief Fits an interpolating Spline to the given data points.
-     *
-     * \param pts The points for which an interpolating spline will be computed.
-     * \param degree The degree of the interpolating spline.
-     * \param knot_parameters The knot parameters for the interpolation.
-     *
-     * \returns A spline interpolating the initially provided points.
-     **/
-    template <typename PointArrayType>
-    static SplineType Interpolate(const PointArrayType& pts, DenseIndex degree, const KnotVectorType& knot_parameters);
-  };
-
-  template <typename SplineType>
-  template <typename PointArrayType>
-  SplineType SplineFitting<SplineType>::Interpolate(const PointArrayType& pts, DenseIndex degree, const KnotVectorType& knot_parameters)
-  {
-    typedef typename SplineType::KnotVectorType::Scalar Scalar;      
-    typedef typename SplineType::ControlPointVectorType ControlPointVectorType;      
+  HouseholderQR<MatrixType> qr(A);
+
+  // Here, we are creating a temporary due to an Eigen issue.
+  ControlPointVectorType ctrls = qr.solve(MatrixType(pts.transpose())).transpose();
+
+  return SplineType(knots, ctrls);
+}
 
-    typedef Matrix<Scalar,Dynamic,Dynamic> MatrixType;
+template <typename SplineType>
+template <typename PointArrayType>
+SplineType SplineFitting<SplineType>::Interpolate(const PointArrayType& pts, DenseIndex degree) {
+  KnotVectorType chord_lengths;  // knot parameters
+  ChordLengths(pts, chord_lengths);
+  return Interpolate(pts, degree, chord_lengths);
+}
 
-    KnotVectorType knots;
-    KnotAveraging(knot_parameters, degree, knots);
+template <typename SplineType>
+template <typename PointArrayType, typename IndexArray>
+SplineType SplineFitting<SplineType>::InterpolateWithDerivatives(const PointArrayType& points,
+                                                                 const PointArrayType& derivatives,
+                                                                 const IndexArray& derivativeIndices,
+                                                                 const unsigned int degree,
+                                                                 const ParameterVectorType& parameters) {
+  typedef typename SplineType::KnotVectorType::Scalar Scalar;
+  typedef typename SplineType::ControlPointVectorType ControlPointVectorType;
 
-    DenseIndex n = pts.cols();
-    MatrixType A = MatrixType::Zero(n,n);
-    for (DenseIndex i=1; i<n-1; ++i)
-    {
-      const DenseIndex span = SplineType::Span(knot_parameters[i], degree, knots);
+  typedef Matrix<Scalar, Dynamic, Dynamic> MatrixType;
 
-      // The segment call should somehow be told the spline order at compile time.
-      A.row(i).segment(span-degree, degree+1) = SplineType::BasisFunctions(knot_parameters[i], degree, knots);
-    }
-    A(0,0) = 1.0;
-    A(n-1,n-1) = 1.0;
+  const DenseIndex n = points.cols() + derivatives.cols();
+
+  KnotVectorType knots;
+
+  KnotAveragingWithDerivatives(parameters, degree, derivativeIndices, knots);
 
-    HouseholderQR<MatrixType> qr(A);
+  // fill matrix
+  MatrixType A = MatrixType::Zero(n, n);
 
-    // Here, we are creating a temporary due to an Eigen issue.
-    ControlPointVectorType ctrls = qr.solve(MatrixType(pts.transpose())).transpose();
+  // Use these dimensions for quicker populating, then transpose for solving.
+  MatrixType b(points.rows(), n);
 
-    return SplineType(knots, ctrls);
+  DenseIndex startRow;
+  DenseIndex derivativeStart;
+
+  // End derivatives.
+  if (derivativeIndices[0] == 0) {
+    A.template block<1, 2>(1, 0) << -1, 1;
+
+    Scalar y = (knots(degree + 1) - knots(0)) / degree;
+    b.col(1) = y * derivatives.col(0);
+
+    startRow = 2;
+    derivativeStart = 1;
+  } else {
+    startRow = 1;
+    derivativeStart = 0;
   }
+  if (derivativeIndices[derivatives.cols() - 1] == points.cols() - 1) {
+    A.template block<1, 2>(n - 2, n - 2) << -1, 1;
 
-  template <typename SplineType>
-  template <typename PointArrayType>
-  SplineType SplineFitting<SplineType>::Interpolate(const PointArrayType& pts, DenseIndex degree)
-  {
-    KnotVectorType chord_lengths; // knot parameters
-    ChordLengths(pts, chord_lengths);
-    return Interpolate(pts, degree, chord_lengths);
+    Scalar y = (knots(knots.size() - 1) - knots(knots.size() - (degree + 2))) / degree;
+    b.col(b.cols() - 2) = y * derivatives.col(derivatives.cols() - 1);
   }
+
+  DenseIndex row = startRow;
+  DenseIndex derivativeIndex = derivativeStart;
+  for (DenseIndex i = 1; i < parameters.size() - 1; ++i) {
+    const DenseIndex span = SplineType::Span(parameters[i], degree, knots);
+
+    if (derivativeIndex < derivativeIndices.size() && derivativeIndices[derivativeIndex] == i) {
+      A.block(row, span - degree, 2, degree + 1) =
+          SplineType::BasisFunctionDerivatives(parameters[i], 1, degree, knots);
+
+      b.col(row++) = points.col(i);
+      b.col(row++) = derivatives.col(derivativeIndex++);
+    } else {
+      A.row(row).segment(span - degree, degree + 1) = SplineType::BasisFunctions(parameters[i], degree, knots);
+      b.col(row++) = points.col(i);
+    }
+  }
+  b.col(0) = points.col(0);
+  b.col(b.cols() - 1) = points.col(points.cols() - 1);
+  A(0, 0) = 1;
+  A(n - 1, n - 1) = 1;
+
+  // Solve
+  FullPivLU<MatrixType> lu(A);
+  ControlPointVectorType controlPoints = lu.solve(MatrixType(b.transpose())).transpose();
+
+  SplineType spline(knots, controlPoints);
+
+  return spline;
+}
+
+template <typename SplineType>
+template <typename PointArrayType, typename IndexArray>
+SplineType SplineFitting<SplineType>::InterpolateWithDerivatives(const PointArrayType& points,
+                                                                 const PointArrayType& derivatives,
+                                                                 const IndexArray& derivativeIndices,
+                                                                 const unsigned int degree) {
+  ParameterVectorType parameters;
+  ChordLengths(points, parameters);
+  return InterpolateWithDerivatives(points, derivatives, derivativeIndices, degree, parameters);
 }
+}  // namespace Eigen
 
-#endif // EIGEN_SPLINE_FITTING_H
+#endif  // EIGEN_SPLINE_FITTING_H
diff --git a/inst/include/unsupported/Eigen/src/Splines/SplineFwd.h b/inst/include/unsupported/Eigen/src/Splines/SplineFwd.h
index 49db8d35..ff648d46 100644
--- a/inst/include/unsupported/Eigen/src/Splines/SplineFwd.h
+++ b/inst/include/unsupported/Eigen/src/Splines/SplineFwd.h
@@ -10,77 +10,100 @@
 #ifndef EIGEN_SPLINES_FWD_H
 #define EIGEN_SPLINES_FWD_H
 
-#include <Eigen/Core>
-
-namespace Eigen
-{
-    template <typename Scalar, int Dim, int Degree = Dynamic> class Spline;
-
-    template < typename SplineType, int DerivativeOrder = Dynamic > struct SplineTraits {};
-
-    /**
-     * \ingroup Splines_Module
-     * \brief Compile-time attributes of the Spline class for Dynamic degree.
-     **/
-    template <typename _Scalar, int _Dim, int _Degree>
-    struct SplineTraits< Spline<_Scalar, _Dim, _Degree>, Dynamic >
-    {
-      typedef _Scalar Scalar; /*!< The spline curve's scalar type. */
-      enum { Dimension = _Dim /*!< The spline curve's dimension. */ };
-      enum { Degree = _Degree /*!< The spline curve's degree. */ };
-
-      enum { OrderAtCompileTime = _Degree==Dynamic ? Dynamic : _Degree+1 /*!< The spline curve's order at compile-time. */ };
-      enum { NumOfDerivativesAtCompileTime = OrderAtCompileTime /*!< The number of derivatives defined for the current spline. */ };
-
-      /** \brief The data type used to store non-zero basis functions. */
-      typedef Array<Scalar,1,OrderAtCompileTime> BasisVectorType;
-
-      /** \brief The data type used to store the values of the basis function derivatives. */
-      typedef Array<Scalar,Dynamic,Dynamic,RowMajor,NumOfDerivativesAtCompileTime,OrderAtCompileTime> BasisDerivativeType;
-      
-      /** \brief The data type used to store the spline's derivative values. */
-      typedef Array<Scalar,Dimension,Dynamic,ColMajor,Dimension,NumOfDerivativesAtCompileTime> DerivativeType;
-
-      /** \brief The point type the spline is representing. */
-      typedef Array<Scalar,Dimension,1> PointType;
-      
-      /** \brief The data type used to store knot vectors. */
-      typedef Array<Scalar,1,Dynamic> KnotVectorType;
-      
-      /** \brief The data type representing the spline's control points. */
-      typedef Array<Scalar,Dimension,Dynamic> ControlPointVectorType;
-    };
-
-    /**
-     * \ingroup Splines_Module
-     * \brief Compile-time attributes of the Spline class for fixed degree.
-     *
-     * The traits class inherits all attributes from the SplineTraits of Dynamic degree.
-     **/
-    template < typename _Scalar, int _Dim, int _Degree, int _DerivativeOrder >
-    struct SplineTraits< Spline<_Scalar, _Dim, _Degree>, _DerivativeOrder > : public SplineTraits< Spline<_Scalar, _Dim, _Degree> >
-    {
-      enum { OrderAtCompileTime = _Degree==Dynamic ? Dynamic : _Degree+1 /*!< The spline curve's order at compile-time. */ };
-      enum { NumOfDerivativesAtCompileTime = _DerivativeOrder==Dynamic ? Dynamic : _DerivativeOrder+1 /*!< The number of derivatives defined for the current spline. */ };
-
-      /** \brief The data type used to store the values of the basis function derivatives. */
-      typedef Array<_Scalar,Dynamic,Dynamic,RowMajor,NumOfDerivativesAtCompileTime,OrderAtCompileTime> BasisDerivativeType;
-      
-      /** \brief The data type used to store the spline's derivative values. */      
-      typedef Array<_Scalar,_Dim,Dynamic,ColMajor,_Dim,NumOfDerivativesAtCompileTime> DerivativeType;
-    };
-
-    /** \brief 2D float B-spline with dynamic degree. */
-    typedef Spline<float,2> Spline2f;
-    
-    /** \brief 3D float B-spline with dynamic degree. */
-    typedef Spline<float,3> Spline3f;
-
-    /** \brief 2D double B-spline with dynamic degree. */
-    typedef Spline<double,2> Spline2d;
-    
-    /** \brief 3D double B-spline with dynamic degree. */
-    typedef Spline<double,3> Spline3d;
-}
-
-#endif // EIGEN_SPLINES_FWD_H
+// IWYU pragma: private
+#include "./InternalHeaderCheck.h"
+#include "../../../../Eigen/Core"
+
+namespace Eigen {
+template <typename Scalar, int Dim, int Degree = Dynamic>
+class Spline;
+
+template <typename SplineType, int DerivativeOrder = Dynamic>
+struct SplineTraits {};
+
+/**
+ * \ingroup Splines_Module
+ * \brief Compile-time attributes of the Spline class for Dynamic degree.
+ **/
+template <typename Scalar_, int Dim_, int Degree_>
+struct SplineTraits<Spline<Scalar_, Dim_, Degree_>, Dynamic> {
+  typedef Scalar_ Scalar; /*!< The spline curve's scalar type. */
+  enum { Dimension = Dim_ /*!< The spline curve's dimension. */ };
+  enum { Degree = Degree_ /*!< The spline curve's degree. */ };
+
+  enum {
+    OrderAtCompileTime = Degree_ == Dynamic ? Dynamic : Degree_ + 1 /*!< The spline curve's order at compile-time. */
+  };
+  enum {
+    NumOfDerivativesAtCompileTime = OrderAtCompileTime /*!< The number of derivatives defined for the current spline. */
+  };
+
+  enum { DerivativeMemoryLayout = Dimension == 1 ? RowMajor : ColMajor /*!< The derivative type's memory layout. */ };
+
+  /** \brief The data type used to store non-zero basis functions. */
+  typedef Array<Scalar, 1, OrderAtCompileTime> BasisVectorType;
+
+  /** \brief The data type used to store the values of the basis function derivatives. */
+  typedef Array<Scalar, Dynamic, Dynamic, RowMajor, NumOfDerivativesAtCompileTime, OrderAtCompileTime>
+      BasisDerivativeType;
+
+  /** \brief The data type used to store the spline's derivative values. */
+  typedef Array<Scalar, Dimension, Dynamic, DerivativeMemoryLayout, Dimension, NumOfDerivativesAtCompileTime>
+      DerivativeType;
+
+  /** \brief The point type the spline is representing. */
+  typedef Array<Scalar, Dimension, 1> PointType;
+
+  /** \brief The data type used to store knot vectors. */
+  typedef Array<Scalar, 1, Dynamic> KnotVectorType;
+
+  /** \brief The data type used to store parameter vectors. */
+  typedef Array<Scalar, 1, Dynamic> ParameterVectorType;
+
+  /** \brief The data type representing the spline's control points. */
+  typedef Array<Scalar, Dimension, Dynamic> ControlPointVectorType;
+};
+
+/**
+ * \ingroup Splines_Module
+ * \brief Compile-time attributes of the Spline class for fixed degree.
+ *
+ * The traits class inherits all attributes from the SplineTraits of Dynamic degree.
+ **/
+template <typename Scalar_, int Dim_, int Degree_, int _DerivativeOrder>
+struct SplineTraits<Spline<Scalar_, Dim_, Degree_>, _DerivativeOrder>
+    : public SplineTraits<Spline<Scalar_, Dim_, Degree_> > {
+  enum {
+    OrderAtCompileTime = Degree_ == Dynamic ? Dynamic : Degree_ + 1 /*!< The spline curve's order at compile-time. */
+  };
+  enum {
+    NumOfDerivativesAtCompileTime =
+        _DerivativeOrder == Dynamic
+            ? Dynamic
+            : _DerivativeOrder + 1 /*!< The number of derivatives defined for the current spline. */
+  };
+
+  enum { DerivativeMemoryLayout = Dim_ == 1 ? RowMajor : ColMajor /*!< The derivative type's memory layout. */ };
+
+  /** \brief The data type used to store the values of the basis function derivatives. */
+  typedef Array<Scalar_, Dynamic, Dynamic, RowMajor, NumOfDerivativesAtCompileTime, OrderAtCompileTime>
+      BasisDerivativeType;
+
+  /** \brief The data type used to store the spline's derivative values. */
+  typedef Array<Scalar_, Dim_, Dynamic, DerivativeMemoryLayout, Dim_, NumOfDerivativesAtCompileTime> DerivativeType;
+};
+
+/** \brief 2D float B-spline with dynamic degree. */
+typedef Spline<float, 2> Spline2f;
+
+/** \brief 3D float B-spline with dynamic degree. */
+typedef Spline<float, 3> Spline3f;
+
+/** \brief 2D double B-spline with dynamic degree. */
+typedef Spline<double, 2> Spline2d;
+
+/** \brief 3D double B-spline with dynamic degree. */
+typedef Spline<double, 3> Spline3d;
+}  // namespace Eigen
+
+#endif  // EIGEN_SPLINES_FWD_H
diff --git a/inst/skeleton/rcppeigen_hello_world.cpp b/inst/skeleton/rcppeigen_hello_world.cpp
index 1dc69b54..5bbdce87 100644
--- a/inst/skeleton/rcppeigen_hello_world.cpp
+++ b/inst/skeleton/rcppeigen_hello_world.cpp
@@ -9,7 +9,7 @@
 // [[Rcpp::depends(RcppEigen)]]
 
 // simple example of creating two matrices and
-// returning the result of an operatioon on them
+// returning the result of an operation on them
 //
 // via the exports attribute we tell Rcpp to make this function
 // available from R
@@ -17,13 +17,18 @@
 // [[Rcpp::export]]
 Eigen::MatrixXd rcppeigen_hello_world() {
     Eigen::MatrixXd m1 = Eigen::MatrixXd::Identity(3, 3);
-    Eigen::MatrixXd m2 = Eigen::MatrixXd::Random(3, 3);
-	                     
+    // Eigen::MatrixXd m2 = Eigen::MatrixXd::Random(3, 3);
+    // Do not use Random() here to not promote use of a non-R RNG
+    Eigen::MatrixXd m2 = Eigen::MatrixXd::Zero(3, 3);
+    for (auto i=0; i<m2.rows(); i++)
+        for (auto j=0; j<m2.cols(); j++)
+            m2(i,j) = R::rnorm(0, 1);
+
     return m1 + 3 * (m1 + m2);
 }
 
 
-// another simple example: outer product of a vector, 
+// another simple example: outer product of a vector,
 // returning a matrix
 //
 // [[Rcpp::export]]
diff --git a/inst/tinytest/cpp/rcppeigen.cpp b/inst/tinytest/cpp/rcppeigen.cpp
new file mode 100644
index 00000000..c9ea8771
--- /dev/null
+++ b/inst/tinytest/cpp/rcppeigen.cpp
@@ -0,0 +1,157 @@
+
+#include <RcppEigen.h>
+
+// [[Rcpp::depends(RcppEigen)]]
+
+// [[Rcpp::export]]
+Rcpp::List fx() {
+    Rcpp::List vecs = Rcpp::List::create(
+        Rcpp::_["Vec<complex>"] = Eigen::VectorXcd::Zero(5),
+        Rcpp::_["Vec<double>"]  = Eigen::VectorXd::Zero(5),
+        Rcpp::_["Vec<float>"]   = Eigen::VectorXf::Zero(5),
+        Rcpp::_["Vec<int>"]     = Eigen::VectorXi::Zero(5)
+    );
+
+    // A VectorX<T> behaves as a matrix with one column but is converted to
+    // a vector object in R, not a matrix of one column.  The distinction is
+    // that VectorX<T> objects are defined at compile time to have one column,
+    // whereas a MatrixX<T> has a dynamic number of columns that is set to 1
+    // during execution of the code.  A MatrixX<T> object can be resized to have
+    // a different number of columns.  A VectorX<T> object cannot.
+    Rcpp::List cols = Rcpp::List::create(
+        Rcpp::_["Col<complex>"] = Eigen::MatrixXcd::Zero(5, 1),
+        Rcpp::_["Col<double>"]  = Eigen::MatrixXd::Zero(5, 1),
+        Rcpp::_["Col<float>"]   = Eigen::MatrixXf::Zero(5, 1),
+        Rcpp::_["Col<int>"]     = Eigen::MatrixXi::Zero(5, 1)
+    );
+
+    Rcpp::List rows = Rcpp::List::create(
+        Rcpp::_["Row<complex>"] = Eigen::RowVectorXcd::Zero(5),
+        Rcpp::_["Row<double>"]  = Eigen::RowVectorXd::Zero(5),
+        Rcpp::_["Row<float>"]   = Eigen::RowVectorXf::Zero(5),
+        Rcpp::_["Row<int>"]     = Eigen::RowVectorXi::Zero(5)
+    );
+
+    Rcpp::List matrices = Rcpp::List::create(
+        Rcpp::_["Mat<complex>"] = Eigen::MatrixXcd::Identity(3, 3),
+        Rcpp::_["Mat<double>"]  = Eigen::MatrixXd::Identity(3, 3),
+        Rcpp::_["Mat<float>"]   = Eigen::MatrixXf::Identity(3, 3),
+        Rcpp::_["Mat<int>"]     = Eigen::MatrixXi::Identity(3, 3)
+    );
+
+    // ArrayXX<t> objects have the same structure as matrices but allow
+    // componentwise arithmetic.  A * B is matrix multiplication for
+    // matrices and componentwise multiplication for arrays.
+    Rcpp::List arrays2 = Rcpp::List::create(
+        Rcpp::_["Arr2<complex>"] = Eigen::ArrayXXcd::Zero(3, 3),
+        Rcpp::_["Arr2<double>"]  = Eigen::ArrayXXd::Zero(3, 3),
+        Rcpp::_["Arr2<float>"]   = Eigen::ArrayXXf::Zero(3, 3),
+        Rcpp::_["Arr2<int>"]     = Eigen::ArrayXXi::Zero(3, 3)
+    );
+
+    // ArrayX<t> objects have the same structure as VectorX<T> objects
+    // but allow componentwise arithmetic, including functions like exp, log,
+    // sqrt, ...
+    Rcpp::List arrays1 = Rcpp::List::create(
+        Rcpp::_["Arr1<complex>"] = Eigen::ArrayXcd::Zero(5),
+        Rcpp::_["Arr1<double>"]  = Eigen::ArrayXd::Zero(5),
+        Rcpp::_["Arr1<float>"]   = Eigen::ArrayXf::Zero(5),
+        Rcpp::_["Arr1<int>"]     = Eigen::ArrayXi::Zero(5)
+    );
+
+    Rcpp::List operations = Rcpp::List::create(
+        Rcpp::_["Op_seq"]  = Eigen::ArrayXd::LinSpaced(6, 1, 10),  // arguments are length.out, start, end
+        Rcpp::_["Op_log"]  = Eigen::ArrayXd::LinSpaced(6, 1, 10).log(),
+        Rcpp::_["Op_exp"]  = Eigen::ArrayXd::LinSpaced(6, 1, 10).exp(),
+        Rcpp::_["Op_sqrt"] = Eigen::ArrayXd::LinSpaced(6, 1, 10).sqrt(),
+        Rcpp::_["Op_cos"]  = Eigen::ArrayXd::LinSpaced(6, 1, 10).cos()
+    );
+
+    Rcpp::List output = Rcpp::List::create(
+    	Rcpp::_["vectors : VectorX<T>"]   = vecs,
+    	Rcpp::_["matrices : MatrixX<T>"]  = matrices,
+    	Rcpp::_["rows : RowVectorX<T>"]   = rows,
+    	Rcpp::_["columns : MatrixX<T>"]   = cols,
+        Rcpp::_["arrays2d : ArrayXX<T>"]  = arrays2,
+        Rcpp::_["arrays1d : ArrayX<T>"]   = arrays1,
+        Rcpp::_["operations : ArrayXd"]   = operations
+        );
+
+    return output ;
+}
+
+// [[Rcpp::export]]
+Rcpp::List fx2(Rcpp::List input) {
+    Eigen::VectorXi                                m1 = input[0] ; /* implicit as */
+    Eigen::VectorXd                                m2 = input[1] ; /* implicit as */
+    Eigen::Matrix<unsigned int, Eigen::Dynamic, 1> m3 = input[0] ; /* implicit as */
+    Eigen::VectorXf                                m4 = input[1] ; /* implicit as */
+
+    Rcpp::List res = Rcpp::List::create(m1.sum(), m2.sum(), m3.sum(), m4.sum());
+
+    return res ;
+}
+
+
+// [[Rcpp::export]]
+Rcpp::List fx3(Rcpp::List input) {
+
+    const Eigen::Map<Eigen::VectorXi>   m1 = input[0] ; // maps share storage and do not allow conversion
+    const Eigen::Map<Eigen::VectorXd>   m2 = input[1] ;
+
+    Rcpp::List res = Rcpp::List::create(m1.sum(), m2.sum());
+
+    return res ;
+}
+
+// [[Rcpp::export]]
+Rcpp::List fx4(Rcpp::List input) {
+
+    const Eigen::Map<Eigen::RowVectorXi>   m1 = input[0] ; // maps share storage, do not allow conversion
+    const Eigen::Map<Eigen::RowVectorXd>   m2 = input[1] ;
+
+    Rcpp::List res = Rcpp::List::create(m1.sum(), m2.sum());
+
+    return res ;
+}
+
+
+// [[Rcpp::export]]
+Rcpp::List fx5(Rcpp::List input) {
+    const Eigen::Map<Eigen::MatrixXi>   m1 = input[0]; // maps share storage, do not allow conversion
+    const Eigen::Map<Eigen::MatrixXd>   m2 = input[1] ;
+    // FIXME: Write a version of as specifically for complex matrices.
+    //    const Eigen::Map<Eigen::MatrixXcd>  m3 = input[2] ;
+
+    Rcpp::List res = Rcpp::List::create(m1.sum(), m2.sum());//, m3.sum());
+
+    return res ;
+}
+
+
+// [[Rcpp::export]]
+Rcpp::List fx6(Rcpp::List input) {
+    const Eigen::Map<Eigen::SparseMatrix<double>>  m1 = input[0]; // maps share storage and do not allow conversion
+
+    Rcpp::List res = Rcpp::List::create(Rcpp::_["nnz"]   = double(m1.nonZeros()),
+                                        Rcpp::_["nr"]    = double(m1.rows()),
+                                        Rcpp::_["nc"]    = double(m1.cols()),
+                                        Rcpp::_["inSz"]  = double(m1.innerSize()),
+                                        Rcpp::_["outSz"] = double(m1.outerSize()),
+                                        Rcpp::_["sum"]   = m1.sum());
+
+    return res ;
+}
+
+
+// [[Rcpp::export]]
+Rcpp::List fx7(Rcpp::List input) {
+    const Eigen::SparseMatrix<double>  m1 = input[0];
+    Rcpp::List res = Rcpp::List::create(Rcpp::_["nnz"]   = double(m1.nonZeros()),
+                                        Rcpp::_["nr"]    = double(m1.rows()),
+                                        Rcpp::_["nc"]    = double(m1.cols()),
+                                        Rcpp::_["inSz"]  = double(m1.innerSize()),
+                                        Rcpp::_["outSz"] = double(m1.outerSize()),
+                                        Rcpp::_["sum"]   = m1.sum());
+    return res ;
+}
diff --git a/inst/tinytest/cpp/solution.cpp b/inst/tinytest/cpp/solution.cpp
new file mode 100644
index 00000000..9c018a27
--- /dev/null
+++ b/inst/tinytest/cpp/solution.cpp
@@ -0,0 +1,37 @@
+
+#include <RcppEigen.h>
+
+// [[Rcpp::depends(RcppEigen)]]
+
+typedef Eigen::ArrayXd                   Ar1;
+typedef Eigen::Map<Ar1>                 MAr1;
+typedef Eigen::ArrayXXd                  Ar2;
+typedef Eigen::Map<Ar2>                 MAr2;
+typedef Eigen::MatrixXd                  Mat;
+typedef Eigen::Map<Mat>                 MMat;
+typedef Eigen::VectorXd                  Vec;
+typedef Eigen::Map<Vec>                 MVec;
+typedef Eigen::PartialPivLU<Mat>        PPLU;
+typedef Eigen::ColPivHouseholderQR<Mat> CPQR;
+
+
+// [[Rcpp::export]]
+Rcpp::List dense_PPLU(MMat A, MVec b) {
+    PPLU           lu(A);
+    Mat            Ainv(lu.inverse());
+    Vec            x(lu.solve(b));
+
+    return Rcpp::List::create(Rcpp::Named("A",    A),
+                              Rcpp::Named("Ainv", Ainv),
+                              Rcpp::Named("b",    b),
+                              Rcpp::Named("x",    x));
+}
+
+// [[Rcpp::export]]
+Rcpp::List dense_CPQR(MMat A, MVec b) {
+    CPQR           qr(A);
+    Mat            Ainv(qr.inverse());
+    Vec            x(qr.solve(b));
+    return Rcpp::List::create(Rcpp::Named("Ainv", Ainv),
+                              Rcpp::Named("x",    x));
+}
diff --git a/inst/tinytest/cpp/sparse.cpp b/inst/tinytest/cpp/sparse.cpp
new file mode 100644
index 00000000..85fdbba2
--- /dev/null
+++ b/inst/tinytest/cpp/sparse.cpp
@@ -0,0 +1,102 @@
+
+#include <RcppEigen.h>
+
+// [[Rcpp::depends(RcppEigen)]]
+
+// [[Rcpp::export]]
+Eigen::SparseMatrix<double> wrapSparseDouble() {
+    Eigen::SparseMatrix<double>  mm(9,3);
+    mm.reserve(9);
+    for (int j = 0; j < 3; ++j) {
+        mm.startVec(j);
+        for (int i = 3 * j; i < 3 * (j + 1); ++i)
+            mm.insertBack(i, j) = 1.;
+    }
+    mm.finalize();
+    return mm;
+}
+
+// [[Rcpp::export]]
+Eigen::SparseMatrix<double, Eigen::ColMajor> wrapSparseDoubleColumnMajor() {
+    Eigen::SparseMatrix<double, Eigen::ColMajor>  mm(9,3);
+    mm.reserve(9);
+    for (int j = 0; j < 3; ++j) {
+        mm.startVec(j);
+        for (int i = 3 * j; i < 3 * (j + 1); ++i)
+            mm.insertBack(i, j) = 1.;
+    }
+    mm.finalize();
+    return mm;
+}
+
+// [[Rcpp::export]]
+Eigen::SparseMatrix<double, Eigen::RowMajor> wrapSparseDoubleRowMajor() {
+    Eigen::SparseMatrix<double, Eigen::RowMajor>  mm(9,3);
+    mm.reserve(9);
+    for (int irow = 0; irow < 9; ++irow) {
+        mm.startVec(irow);
+        mm.insertBack(irow, irow / 3) = static_cast<double>( 9 - irow );
+    }
+    mm.finalize();
+    return mm;
+}
+
+// [[Rcpp::export]]
+Eigen::SparseMatrix<double, Eigen::ColMajor> asSparseDoubleColumnMajor(Eigen::SparseMatrix<double, Eigen::ColMajor> mm) {
+    return mm;
+}
+
+// [[Rcpp::export]]
+double asMappedSparseDoubleColMajor(Eigen::Map<Eigen::SparseMatrix<double, Eigen::ColMajor> > mm) {
+    double s = mm.sum();          // access instantiated sparse matrix
+    return s;
+}
+
+// // [ [ Rcpp::export]]
+// double asMappedSparseDeprecatedDoubleColMajor(Eigen::MappedSparseMatrix<double, Eigen::ColMajor> mm) {
+//     // Deprecated
+//     double s = mm.sum();          // access instantiated sparse matrix
+//     return s;
+// }
+
+// [[Rcpp::export]]
+double asSparseDoubleRowMajor(Eigen::SparseMatrix<double, Eigen::RowMajor> mm) {
+    double s = mm.sum();          // access instantiated sparse matrix
+    return s;
+}
+
+// [[Rcpp::export]]
+double asMappedSparseDoubleRowMajor(Eigen::Map<Eigen::SparseMatrix<double, Eigen::RowMajor> > mm) {
+    double s = mm.sum();          // access instantiated sparse matrix
+    return s;
+}
+
+// // [ [ Rcpp::export ] ]
+// double asMappedSparseDeprecatedDoubleRowMajor(Eigen::MappedSparseMatrix<double, Eigen::RowMajor> mm) {
+//     double s = mm.sum();          // access instantiated sparse matrix
+//     return s;
+// }
+
+// [[Rcpp::export]]
+Rcpp::List sparseCholesky(Rcpp::List input) {
+    using Eigen::VectorXd;
+    using Eigen::MatrixXd;
+    using Eigen::Lower;
+    using Eigen::Map;
+    using Eigen::SparseMatrix;
+    using Eigen::SimplicialLDLT;
+    using Eigen::Success;
+
+    const Map<SparseMatrix<double> > m1 = input[0];
+    const Map<VectorXd>              v1 = input[1];
+    SparseMatrix<double>             m2(m1.cols(), m1.cols());
+    m2.selfadjointView<Lower>().rankUpdate(m1.adjoint());
+
+    SimplicialLDLT<SparseMatrix<double> > ff(m2);
+    VectorXd                        res = ff.solve(m1.adjoint() * v1);
+
+    return Rcpp::List::create(Rcpp::Named("res")   = res,
+                              Rcpp::Named("rows")  = double(ff.rows()),
+                              Rcpp::Named("cols")  = double(ff.cols()));
+
+}
diff --git a/inst/tinytest/cpp/transform.cpp b/inst/tinytest/cpp/transform.cpp
new file mode 100644
index 00000000..5cb40afd
--- /dev/null
+++ b/inst/tinytest/cpp/transform.cpp
@@ -0,0 +1,31 @@
+
+#include <RcppEigen.h>
+
+// [[Rcpp::depends(RcppEigen)]]
+
+typedef Eigen::ArrayXd                   Ar1;
+typedef Eigen::Map<Ar1>                 MAr1;
+typedef Eigen::ArrayXXd                  Ar2;
+typedef Eigen::Map<Ar2>                 MAr2;
+typedef Eigen::MatrixXd                  Mat;
+typedef Eigen::Map<Mat>                 MMat;
+typedef Eigen::VectorXd                  Vec;
+typedef Eigen::Map<Vec>                 MVec;
+
+// [[Rcpp::export]]
+Rcpp::List transformAr1unbounded(Rcpp::NumericVector x_) {
+    MAr1           x(Rcpp::as<MAr1>(x_));
+    return Rcpp::List::create(Rcpp::Named("abs",    x.abs()),
+                              Rcpp::Named("abs2",   x.abs2()),
+                              Rcpp::Named("exp",    x.exp()),
+                              Rcpp::Named("cos",    x.cos()));
+}
+
+// [[Rcpp::export]]
+Rcpp::List transformAr2unbounded(Rcpp::NumericMatrix X_) {
+    MAr2           X(Rcpp::as<MAr2>(X_));
+    return Rcpp::List::create(Rcpp::Named("abs",    X.abs()),
+                              Rcpp::Named("abs2",   X.abs2()),
+                              Rcpp::Named("exp",    X.exp()),
+                              Rcpp::Named("cos",    X.cos()));
+}
diff --git a/inst/unitTests/runit.wrap.R b/inst/tinytest/cpp/wrap.cpp
similarity index 65%
rename from inst/unitTests/runit.wrap.R
rename to inst/tinytest/cpp/wrap.cpp
index 50e284e8..14a11565 100644
--- a/inst/unitTests/runit.wrap.R
+++ b/inst/tinytest/cpp/wrap.cpp
@@ -1,23 +1,10 @@
-#
-# Copyright (C) 2012 - 2013  Douglas Bates, Dirk Eddelbuettel and Romain Francois
-#
-# This file is part of RcppEigen.
-#
-# RcppEigen is free software: you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 2 of the License, or
-# (at your option) any later version.
-#
-# RcppEigen is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with Rcpp.  If not, see <http://www.gnu.org/licenses/>.
-
-incl <- '
-// double
+
+#include <RcppEigen.h>
+
+using namespace Rcpp;
+
+// [[Rcpp::depends(RcppEigen)]]
+
 typedef Eigen::ArrayXd                   Ar1;
 typedef Eigen::Map<Ar1>                 MAr1;
 typedef Eigen::ArrayXXd                  Ar2;
@@ -62,11 +49,9 @@ typedef Eigen::MatrixXcd               cdMat;
 typedef Eigen::Map<cdMat>             McdMat;
 typedef Eigen::VectorXcd               cdVec;
 typedef Eigen::Map<cdVec>             McdVec;
-'
 
-definitions <- list(
-    "wrap_vectors" = list(signature(),
-    '
+// [[Rcpp::export]]
+Rcpp::List wrap_vectors() {
     List vecs = List::create(
         _["Vec<complex>"]       = cdVec::Zero(5),
         _["Vec<double>"]        = Vec::Zero(5),
@@ -146,12 +131,10 @@ definitions <- list(
         _["operations : ArrayXd"]   = operations
         );
     return output;
-    '),
-
+}
 
-    "as_Vec" = list(signature(input_ = "list"),
-    '
-    List input(input_) ;
+// [[Rcpp::export]]
+Rcpp::List as_Vec(Rcpp::List input) {
 
     // Column vector
     iVec       m1 = input[0] ; /* implicit as */
@@ -174,13 +157,10 @@ definitions <- list(
                             m9.sum(), m10.sum());
 
     return res ;
+}
 
-    '),
-
-
-    "as_Array" = list(signature(input_ = "list"),
-    '
-    List input(input_) ;
+// [[Rcpp::export]]
+Rcpp::List as_Array(Rcpp::List input) {
 
     // Column array
     iAr1       m1 = input[0] ; /* implicit as */
@@ -203,13 +183,10 @@ definitions <- list(
                             m9.sum(), m10.sum());
 
     return res ;
+}
 
-    '),
-
-
-    "as_Mat" = list(signature(input_ = "list"),
-    '
-    List input(input_) ;
+// [[Rcpp::export]]
+Rcpp::List as_Mat(Rcpp::List input) {
 
     // Copy to matrix
     iMat       m1 = input[0] ; /* implicit as */
@@ -225,13 +202,10 @@ definitions <- list(
                             m5.sum(), m6.sum());
 
     return res ;
+}
 
-    '),
-
-
-    "as_Array2D" = list(signature(input_ = "list"),
-    '
-    List input(input_) ;
+// [[Rcpp::export]]
+Rcpp::List as_Array2D(Rcpp::List input) {
 
     // Copy to 2D array
     iAr2       m1 = input[0] ; /* implicit as */
@@ -247,98 +221,24 @@ definitions <- list(
                             m5.sum(), m6.sum());
 
     return res ;
-
-    ')
-    )
-
-.setUp <- function() {
-    suppressMessages(require(inline))
-    suppressMessages(require(RcppEigen))
-    cxxargs <- ifelse(Rcpp:::capabilities()[["initializer lists"]],
-                      "-std=c++0x","")
-    tests <- ".rcppeigen.wrap"
-    if( ! exists( tests, globalenv() )) {
-        fun <- RcppEigen:::compile_unit_tests(definitions,
-                                              includes=incl,
-                                              cxxargs = cxxargs)
-        names(fun) <- names(definitions)
-        assign(tests, fun, globalenv())
-    }
-}
-
-
-test.wrapVectors <- function() {
-    res <- .rcppeigen.wrap$wrap_vectors()
-
-    checkEquals(res[[1]][[1]], complex(5))
-    checkEquals(res[[1]][[2]], double(5))
-    checkEquals(res[[1]][[3]], double(5))
-    checkEquals(res[[1]][[4]], integer(5))
-    checkEquals(res[[1]][[5]], integer(5))
-
-    checkEquals(res[[2]][[1]], (1+0i) * diag(nr=3L))
-    checkEquals(res[[2]][[2]], diag(nr=3L))
-    checkEquals(res[[2]][[3]], diag(nr=3L))
-    checkEquals(res[[2]][[4]], matrix(as.integer((diag(nr=3L))),nr=3L))
-    checkEquals(res[[2]][[5]], matrix(as.integer((diag(nr=3L))),nr=3L))
-
-    checkEquals(res[[3]][[1]], matrix(complex(5), nr=1L))
-    checkEquals(res[[3]][[2]], matrix(numeric(5), nr=1L))
-    checkEquals(res[[3]][[3]], matrix(numeric(5), nr=1L))
-    checkEquals(res[[3]][[4]], matrix(integer(5), nr=1L))
-    checkEquals(res[[3]][[5]], matrix(integer(5), nr=1L))
-
-    checkEquals(res[[4]][[1]], as.matrix(complex(5)))
-    checkEquals(res[[4]][[2]], as.matrix(numeric(5)))
-    checkEquals(res[[4]][[3]], as.matrix(numeric(5)))
-    checkEquals(res[[4]][[4]], as.matrix(integer(5)))
-    checkEquals(res[[4]][[5]], as.matrix(integer(5)))
-
-    checkEquals(res[[5]][[1]], matrix(complex(9L), nc=3L))
-    checkEquals(res[[5]][[2]], matrix(numeric(9L), nc=3L))
-    checkEquals(res[[5]][[3]], matrix(numeric(9L), nc=3L))
-    checkEquals(res[[5]][[4]], matrix(integer(9L), nc=3L))
-    checkEquals(res[[5]][[5]], matrix(integer(9L), nc=3L))
-
-    checkEquals(res[[6]][[1]], complex(5))
-    checkEquals(res[[6]][[2]], double(5))
-    checkEquals(res[[6]][[3]], double(5))
-    checkEquals(res[[6]][[4]], integer(5))
-    checkEquals(res[[6]][[5]], integer(5))
-
-    oneTen <- seq(1, 10, length.out=6L)
-
-    checkEquals(res[[7]][[1]], oneTen)
-    checkEquals(res[[7]][[2]], log(oneTen))
-    checkEquals(res[[7]][[3]], exp(oneTen))
-    checkEquals(res[[7]][[4]], sqrt(oneTen))
-    checkEquals(res[[7]][[5]], cos(oneTen))
-}
-
-test.asVec <- function() {
-    res <- .rcppeigen.wrap$as_Vec(list(1:10, as.numeric(1:10)))
-
-    checkEquals(unlist(res), rep.int(55, 10L))
 }
 
-test.asArray <- function() {
-    res <- .rcppeigen.wrap$as_Array(list(1:10, as.numeric(1:10)))
-
-    checkEquals(unlist(res), rep.int(55, 10L))
-}
-
-test.asMat <- function() {
-    integer_mat <- matrix(as.integer(diag(nrow = 5L)))
-    numeric_mat <- diag(nrow = 5L)
-    res <- .rcppeigen.wrap$as_Mat(list(integer_mat, numeric_mat))
-
-    checkEquals(unlist(res), rep.int(5, 6L))
+// wrap large vector, passes for n > 2^31-1 as no dim attribute
+// [[Rcpp::export]]
+Rcpp::IntegerVector vector_large_wrap(R_xlen_t n) {
+    Eigen::VectorXi x(n, 1);
+    for (R_xlen_t i = 0; i < n; ++i) {
+        x(i) = static_cast<int32_t>(i % 10);
+    }
+    return Rcpp::wrap(x);
 }
 
-test.asArray2D <- function() {
-    integer_mat <- matrix(as.integer(diag(nrow = 5L)))
-    numeric_mat <- diag(nrow = 5L)
-    res <- .rcppeigen.wrap$as_Array2D(list(integer_mat, numeric_mat))
-
-    checkEquals(unlist(res), rep.int(5, 6L))
+// wrap large matrix, fails for n > 2^31-1 if dim attribute > 2^31-1
+// [[Rcpp::export]]
+Rcpp::IntegerMatrix matrix_large_wrap(R_xlen_t n) {
+    Eigen::MatrixXi x(n, 1);
+    for (R_xlen_t i = 0; i < n; ++i) {
+        x(i, 0) = static_cast<int32_t>(i % 10);
+    }
+    return Rcpp::wrap(x);
 }
diff --git a/inst/tinytest/test_RcppEigen.R b/inst/tinytest/test_RcppEigen.R
new file mode 100644
index 00000000..82b52ac7
--- /dev/null
+++ b/inst/tinytest/test_RcppEigen.R
@@ -0,0 +1,109 @@
+#!/usr/bin/r -t
+#
+# Copyright (C)  2011 - 2019  Douglas Bates, Dirk Eddelbuettel and Romain Francois
+#
+# This file is part of RcppEigen
+#
+# RcppEigen is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License, or
+# (at your option) any later version.
+#
+# RcppEigen is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with RcppEigen.  If not, see <http://www.gnu.org/licenses/>.
+
+#test.wrap.R <- function(){
+
+#fx <- cxxfunction( , '
+#	' , plugin = "RcppEigen" )
+Rcpp::sourceCpp("cpp/rcppeigen.cpp")
+
+res <- fx()
+
+expect_equal( res[["vectors : VectorX<T>"]][["Vec<complex>"]], complex(5), info = "VectorXcd::Zero(5)")
+expect_equal( res[["vectors : VectorX<T>"]][["Vec<double>"]], double(5), info = "VectorXd::Zero(5)")
+expect_equal( res[["vectors : VectorX<T>"]][["Vec<float>"]], double(5), info = "VectorXf::Zero(5)")
+expect_equal( res[["vectors : VectorX<T>"]][["Vec<int>"]], integer(5), info = "VectorXi::Zero(5)")
+
+expect_equal( res[["matrices : MatrixX<T>"]][["Mat<complex>"]], (1+0i) * diag(nr=3L), info = "MatrixXcd::Identity(3,3)")
+expect_equal( res[["matrices : MatrixX<T>"]][["Mat<double>"]], diag(nr=3L), info = "MatrixXd::Identity(3,3)")
+expect_equal( res[["matrices : MatrixX<T>"]][["Mat<float>"]], diag(nr=3L), info = "MatrixXf::Identity(3,3)")
+expect_equal( res[["matrices : MatrixX<T>"]][["Mat<int>"]], matrix(as.integer((diag(nr=3L))),nr=3L), info = "MatrixXi::Identity(3,3)")
+
+expect_equal( res[["rows : RowVectorX<T>"]][["Row<complex>"]], matrix(complex(5), nr=1L), info = "RowVectorXcd::Zero(5)")
+expect_equal( res[["rows : RowVectorX<T>"]][["Row<double>"]], matrix(numeric(5), nr=1L), info = "RowVectorXd::Zero(5)")
+expect_equal( res[["rows : RowVectorX<T>"]][["Row<float>"]], matrix(numeric(5), nr=1L), info = "RowVectorXf::Zero(5)")
+expect_equal( res[["rows : RowVectorX<T>"]][["Row<int>"]], matrix(integer(5), nr=1L), info = "RowVectorXi::Zero(5)")
+
+expect_equal( res[["columns : MatrixX<T>"]][["Col<complex>"]], as.matrix(complex(5)), info = "MatrixXcd::Zero(5, 1)")
+expect_equal( res[["columns : MatrixX<T>"]][["Col<double>"]], as.matrix(numeric(5)), info = "MatrixXd::Zero(5, 1)")
+expect_equal( res[["columns : MatrixX<T>"]][["Col<float>"]], as.matrix(numeric(5)), info = "MatrixXf::Zero(5, 1)")
+expect_equal( res[["columns : MatrixX<T>"]][["Col<int>"]], as.matrix(integer(5)), info = "MatrixXi::Zero(5, 1)")
+
+expect_equal( res[["arrays2d : ArrayXX<T>"]][["Arr2<complex>"]], matrix(complex(9L), nc=3L), info = "ArrayXXcd::Zero(3,3)")
+expect_equal( res[["arrays2d : ArrayXX<T>"]][["Arr2<double>"]], matrix(numeric(9L), nc=3L), info = "ArrayXXd::Zero(3,3)")
+expect_equal( res[["arrays2d : ArrayXX<T>"]][["Arr2<float>"]], matrix(numeric(9L), nc=3L), info = "ArrayXXf::Zero(3,3)")
+expect_equal( res[["arrays2d : ArrayXX<T>"]][["Arr2<int>"]], matrix(integer(9L), nc=3L), info = "ArrayXXi::Zero(3,3)")
+
+expect_equal( res[["arrays1d : ArrayX<T>"]][["Arr1<complex>"]], complex(5), info = "ArrayXcd::Zero(5)")
+expect_equal( res[["arrays1d : ArrayX<T>"]][["Arr1<double>"]], double(5), info = "ArrayXd::Zero(5)")
+expect_equal( res[["arrays1d : ArrayX<T>"]][["Arr1<float>"]], double(5), info = "ArrayXf::Zero(5)")
+expect_equal( res[["arrays1d : ArrayX<T>"]][["Arr1<int>"]], integer(5), info = "ArrayXi::Zero(5)")
+
+oneTen <- seq(1, 10, length.out=6L)
+
+expect_equal( res[["operations : ArrayXd"]][["Op_seq"]],  oneTen,       info = "Op_seq")
+expect_equal( res[["operations : ArrayXd"]][["Op_log"]],  log(oneTen),  info = "Op_log")
+expect_equal( res[["operations : ArrayXd"]][["Op_exp"]],  exp(oneTen),  info = "Op_exp")
+expect_equal( res[["operations : ArrayXd"]][["Op_sqrt"]], sqrt(oneTen), info = "Op_sqrt")
+expect_equal( res[["operations : ArrayXd"]][["Op_cos"]],  cos(oneTen),  info = "Op_cos")
+
+
+#test.as.Vec <- function(){
+res <- fx2( list( 1:10, as.numeric(1:10) ) )
+expect_equal( unlist( res ), rep(55.0, 4 ), info = "as<Vec>" )
+
+
+
+#test.as.MVec <- function(){
+res <- fx3( list( 1:10, as.numeric(1:10) ) )
+expect_equal( unlist( res ), rep(55.0, 2 ), info = "as<MVec>" )
+
+#test.as.MRowVec <- function(){
+res <- fx4( list( 1:10, as.numeric(1:10) ) )
+expect_equal( unlist( res ), rep(55.0, 2 ), info = "as<MRowVec>" )
+
+
+
+integer_mat <- matrix(as.integer(diag(nr=4L)), nc=4L)
+numeric_mat <- diag(nr=5L)
+complex_mat <- (1+0i) * diag(nr=5L)
+res <- fx5(list(integer_mat, numeric_mat, complex_mat))
+expect_equal(unlist(res), c(4L, 5)#, 5+0i)
+           , info = "as<MMat>" )
+
+
+#test.as.MSpMat <- function() {
+suppressMessages(require("Matrix"))
+data("KNex", package = "Matrix")
+
+KNX <- KNex[[1]]
+res <- fx6(KNex)
+expect_equal(unname(unlist(res)),
+             as.numeric(c(nnzero(KNX), nrow(KNX), ncol(KNX), nrow(KNX), ncol(KNX), sum(KNX@x))),
+             info = "as<MSPMatrix>")
+
+
+#test.as.SpMat <- function() {
+suppressMessages(require("Matrix"))
+data("KNex", package = "Matrix")
+KNX <- KNex[[1]]
+res <- fx7(KNex)
+expect_equal(unname(unlist(res)),
+             as.numeric(c(nnzero(KNX), nrow(KNX), ncol(KNX), nrow(KNX), ncol(KNX), sum(KNX@x))),
+             info = "as<MSPMatrix>")
diff --git a/inst/tinytest/test_fastLm.R b/inst/tinytest/test_fastLm.R
new file mode 100644
index 00000000..fafd8691
--- /dev/null
+++ b/inst/tinytest/test_fastLm.R
@@ -0,0 +1,77 @@
+#!/usr/bin/r -t
+#
+# Copyright (C) 2011 - 2021  Douglas Bates, Dirk Eddelbuettel and Romain Francois
+#
+# This file is part of RcppEigen
+#
+# RcppEigen is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License, or
+# (at your option) any later version.
+#
+# RcppEigen is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with RcppEigen.  If not, see <http://www.gnu.org/licenses/>.
+
+library(RcppEigen)
+
+#test.fastLm <- function() {
+data(trees, package="datasets")
+flm0 <- fastLmPure(cbind(1, log(trees$Girth)), log(trees$Volume), 0L)
+flm1 <- fastLmPure(cbind(1, log(trees$Girth)), log(trees$Volume), 1L)
+flm2 <- fastLmPure(cbind(1, log(trees$Girth)), log(trees$Volume), 2L)
+flm3 <- fastLmPure(cbind(1, log(trees$Girth)), log(trees$Volume), 3L)
+flm4 <- fastLmPure(cbind(1, log(trees$Girth)), log(trees$Volume), 4L)
+flm5 <- fastLmPure(cbind(1, log(trees$Girth)), log(trees$Volume), 5L)
+flm6 <- fastLmPure(cbind(1, log(trees$Girth)), log(trees$Volume), 6L)
+
+fit       <- lm(log(Volume) ~ log(Girth), data=trees)
+fitCoef   <- unname(coef(fit))
+fitStdErr <- unname(coef(summary(fit))[, "Std. Error", drop = TRUE])
+
+expect_equal(flm0$coefficients , fitCoef,   info="fastLm0.coef")
+expect_equal(flm0$se           , fitStdErr, info="fastLm0.stderr")
+expect_equal(flm1$coefficients , fitCoef,   info="fastLm1.coef")
+expect_equal(flm1$se           , fitStdErr, info="fastLm1.stderr")
+expect_equal(flm2$coefficients , fitCoef,   info="fastLm2.coef")
+expect_equal(flm2$se           , fitStdErr, info="fastLm2.stderr")
+expect_equal(flm3$coefficients , fitCoef,   info="fastLm3.coef")
+expect_equal(flm3$se           , fitStdErr, info="fastLm3.stderr")
+expect_equal(flm4$coefficients , fitCoef,   info="fastLm0.coef")
+expect_equal(flm4$se           , fitStdErr, info="fastLm0.stderr")
+expect_equal(flm5$coefficients , fitCoef,   info="fastLm0.coef")
+expect_equal(flm5$se           , fitStdErr, info="fastLm0.stderr")
+expect_equal(flm6$coefficients , fitCoef,   info="fastLm0.coef")
+expect_equal(flm6$se           , fitStdErr, info="fastLm0.stderr")
+
+## check unsupported type
+expect_error(fastLmPure(cbind(1, log(trees$Girth)), log(trees$Volume), 7L))
+
+
+#test.fastLm.formula <- function() {
+data(trees, package="datasets")
+flm <- fastLm(log(Volume) ~ log(Girth), data=trees)
+fit <- lm(log(Volume) ~ log(Girth), data=trees)
+
+expect_equal(flm$coefficients, coef(fit), info="fastLm.formula.coef")
+expect_equal(as.numeric(flm$se), as.numeric(coef(summary(fit))[,2]),
+            info="fastLm.formula.stderr")
+
+## also tickle print and predict methods
+expect_stdout(print(flm))
+expect_stdout(print(summary(flm)))
+vec <- predict(flm, newdata=data.frame(Girth=c(1,2,3), Volume=c(2,3,4)))
+expect_equal(class(vec), "numeric")
+expect_equal(length(vec), 3L)
+vec <- predict(flm, newdata=NULL)
+expect_equal(vec, fitted(flm))
+
+## also generate summary
+flmsum <- summary(flm)
+fitsum <- summary(fit)
+expect_equal(flmsum$coef, fitsum$coef)
+expect_equal(length(flmsum), 12)
diff --git a/inst/tinytest/test_misc.R b/inst/tinytest/test_misc.R
new file mode 100644
index 00000000..24a22b08
--- /dev/null
+++ b/inst/tinytest/test_misc.R
@@ -0,0 +1,38 @@
+#!/usr/bin/r -t
+#
+# Copyright (C) 2021-2025  Dirk Eddelbuettel
+#
+# This file is part of RcppEigen
+#
+# RcppEigen is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License, or
+# (at your option) any later version.
+#
+# RcppEigen is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with RcppEigen.  If not, see <http://www.gnu.org/licenses/>.
+
+library(RcppEigen)
+
+## -- src/RcppEigen.cpp
+eig <- RcppEigen:::eigen_version(FALSE)
+expect_equal(length(eig), 3)           # major minor patch
+expect_equal(names(eig), c("major","minor","patch"))
+eig <- RcppEigen:::eigen_version(TRUE)
+expect_equal(class(eig), "integer")
+expect_equal(length(eig), 1L)
+eigt <- RcppEigen:::eigen_version_typed()
+expect_true(inherits(eigt, "package_version"))
+expect_equal(class(RcppEigen:::Eigen_SSE()), "logical")
+nt <- RcppEigen::EigenNbThreads()
+expect_true(nt >= 0)
+
+## -- R/flags.R
+cxxflags <- RcppEigen:::RcppEigenCxxFlags()
+expect_true(is.character(cxxflags))
+expect_stdout(RcppEigen:::CxxFlags())
diff --git a/inst/tinytest/test_solution.R b/inst/tinytest/test_solution.R
new file mode 100644
index 00000000..0baba25e
--- /dev/null
+++ b/inst/tinytest/test_solution.R
@@ -0,0 +1,48 @@
+#
+# Copyright (C) 2012 - 2019  Douglas Bates, Dirk Eddelbuettel and Romain Francois
+#
+# This file is part of RcppEigen.
+#
+# RcppEigen is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License, or
+# (at your option) any later version.
+#
+# RcppEigen is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Rcpp.  If not, see <http://www.gnu.org/licenses/>.
+
+Rcpp::sourceCpp("cpp/solution.cpp")
+
+#test.smallDense <- function() {
+A <- matrix(c(1,2,3,4), nrow=2L)
+B <- matrix(c(5,6,7,8), nrow=2L)
+b <- c(1,1)
+
+## solutions to dense systems
+res <- dense_PPLU(A, b)
+expect_equal(res$Ainv,  solve(A))
+expect_equal(res$x,     solve(A, b))
+
+res <- dense_CPQR(A, b)
+expect_equal(res$Ainv,  solve(A))
+expect_equal(res$x,     solve(A, b))
+
+
+#test.largeDense <- function() {
+set.seed(1234321)
+N <- 100L
+AA <- matrix(rnorm(N * N), nrow=N)
+bb <- rnorm(N)
+
+res <- dense_PPLU(AA, bb)
+expect_equal(res$Ainv,  solve(AA))
+expect_equal(res$x,     solve(AA, bb))
+
+res <- dense_CPQR(AA, bb)
+expect_equal(res$Ainv,  solve(AA))
+expect_equal(res$x,     solve(AA, bb))
diff --git a/inst/tinytest/test_sparse.R b/inst/tinytest/test_sparse.R
new file mode 100644
index 00000000..a3b7ec68
--- /dev/null
+++ b/inst/tinytest/test_sparse.R
@@ -0,0 +1,136 @@
+#!/usr/bin/r -t
+#
+# Copyright (C)      2011 Douglas Bates, Dirk Eddelbuettel and Romain Francois
+#
+# This file is part of RcppEigen
+#
+# RcppEigen is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License, or
+# (at your option) any later version.
+#
+# RcppEigen is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with RcppEigen.  If not, see <http://www.gnu.org/licenses/>.
+
+Rcpp::sourceCpp("cpp/sparse.cpp")
+
+library(Matrix)
+
+#test.wrapSparse.double.R <- function(){
+res <- wrapSparseDouble()
+rr <- Matrix::t(as(gl(3,3), "sparseMatrix"))
+colnames(rr) <- NULL
+expect_equal(res, rr, info = "wrap<SparseMatrix<double> >")
+
+
+#test.wrapSparse.double.ColMajor.R <- function(){
+res <- wrapSparseDoubleColumnMajor()
+rr <- Matrix::t(as(gl(3,3), "sparseMatrix"))
+colnames(rr) <- NULL
+expect_equal(res, rr, info = "wrap<SparseMatrix<double, Eigen::ColMajor> >")
+
+## test.wrapSparse.int.ColMajor.R <- function(){  ## classes not yet exported from Matrix
+
+##     fx <- cxxfunction( , '
+
+##     Eigen::SparseMatrix<int, Eigen::ColMajor>  mm(9,3);
+##     mm.reserve(9);
+##     for (int j = 0; j < 3; ++j) {
+##         mm.startVec(j);
+##         for (int i = 3 * j; i < 3 * (j + 1); ++i)
+##             mm.insertBack(i, j) = 1;
+##     }
+##     mm.finalize();
+##     return wrap(mm);
+## ' , plugin = "RcppEigen" )
+
+##     #res <- fx()
+##     #rr <- Matrix::t(as(gl(3,3), "sparseMatrix"))
+##     #colnames(rr) <- NULL
+##     #expect_equal( res, rr, info = "wrap<SparseMatrix<double, Eigen::ColMajor> >")
+##     checkException( fx(), info = "wrap<SparseMatrix<int, Eigen::ColMajor> >" )
+## }
+
+#test.wrapSparse.double.RowMajor.R <- function(){
+res <- wrapSparseDoubleRowMajor()
+rr <- new( "dgRMatrix", j=rep(0L:2L, each=3), p=0L:9L, x=as.numeric(9:1), Dim=c(9L,3L) )
+colnames(rr) <- NULL
+expect_equal( res, rr, info = "wrap<SparseMatrix<double, Eigen::RowMajor> >")
+
+
+## test.wrapSparse.int.RowMajor.R <- function(){
+
+##     fx <- cxxfunction( , '
+
+##     Eigen::SparseMatrix<int, Eigen::RowMajor>  mm(9,3);
+##     mm.reserve(9);
+##     for (int irow = 0; irow < 9; ++irow) {
+##         mm.startVec(irow);
+##         mm.insertBack(irow, irow / 3) = 9 - irow;
+##     }
+##     mm.finalize();
+##     return wrap(mm);
+## ' , plugin = "RcppEigen" )
+
+##     #res <- fx()
+##     #rr <- new( "igRMatrix", j=rep(0L:2L, each=3), p=0L:9L, x=9L:1L, Dim=c(9L,3L) )
+##     #colnames(rr) <- NULL
+##     #expect_equal( res, rr, info = "wrap<SparseMatrix<int, Eigen::RowMajor> >")
+##     checkException( fx(), info = "wrap<SparseMatrix<int, Eigen::RowMajor> >" )
+## }
+
+#test.asSparse.double.ColMajor.R <- function(){
+rr <- Matrix::t(as(gl(3,3), "sparseMatrix"))
+colnames(rr) <- NULL
+res <- asSparseDoubleColumnMajor( rr )
+expect_equal( res, rr, info = "as<SparseMatrix<double, Eigen::ColMajor> >")
+
+
+#test.asMappedSparse.double.ColMajor.R <- function(){
+rr <- Matrix::t(as(gl(3,3), "sparseMatrix"))
+colnames(rr) <- NULL
+res <- asMappedSparseDoubleColMajor( rr )
+expect_equal( res, sum(rr), info = "as<Map<SparseMatrix<double, Eigen::ColMajor> > >")
+
+
+#test.asMappedSparse.deprecated.double.ColMajor.R <- function(){
+#fx <- asMappedSparseDeprecatedDoubleColMajor
+#rr <- Matrix::t(as(gl(3,3), "sparseMatrix"))
+#colnames(rr) <- NULL
+#res <- fx( rr )
+#expect_equal( res, sum(rr), info = "as<MappedSparseMatrix<double, Eigen::ColMajor> >")
+
+
+#test.asSparse.double.RowMajor.R <- function(){
+rr <- new( "dgRMatrix", j=rep(0L:2L, each=3), p=0L:9L, x=as.numeric(9:1), Dim=c(9L,3L) )
+colnames(rr) <- NULL
+res <- asSparseDoubleRowMajor( rr )
+expect_equal( res, sum(rr), info = "as<SparseMatrix<double, Eigen::RowMajor> >")
+
+
+#test.asMappedSparse.double.RowMajor.R <- function(){
+rr <- new( "dgRMatrix", j=rep(0L:2L, each=3), p=0L:9L, x=as.numeric(9:1), Dim=c(9L,3L) )
+colnames(rr) <- NULL
+res <- asMappedSparseDoubleRowMajor( rr )
+expect_equal( res, sum(rr), info = "as<Map<SparseMatrix<double, Eigen::RowMajor> > >")
+
+
+#test.asMappedSparse.deprecated.double.RowMajor.R <- function(){
+#rr <- new( "dgRMatrix", j=rep(0L:2L, each=3), p=0L:9L, x=as.numeric(9:1), Dim=c(9L,3L) )
+#colnames(rr) <- NULL
+#res <- asMappedSparseDeprecatedDoubleRowMajor( rr )
+#expect_equal( res, sum(rr), info = "as<MappedSparseMatrix<double, Eigen::RowMajor> >")
+
+
+# test.sparseCholesky.R <- function() {
+suppressMessages(require("Matrix", character.only=TRUE))
+data("KNex", package = "Matrix")
+rr <- sparseCholesky(KNex)
+expect_equal(rr[[1]],
+             as.vector(solve(crossprod(KNex[[1]]), crossprod(KNex[[1]], KNex[[2]])), mode="numeric"),
+             info = "Cholmod solution")
diff --git a/inst/tinytest/test_transform.R b/inst/tinytest/test_transform.R
new file mode 100644
index 00000000..ec24ec28
--- /dev/null
+++ b/inst/tinytest/test_transform.R
@@ -0,0 +1,40 @@
+#
+# Copyright (C) 2012 - 2019  Douglas Bates, Dirk Eddelbuettel and Romain Francois
+#
+# This file is part of RcppEigen.
+#
+# RcppEigen is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License, or
+# (at your option) any later version.
+#
+# RcppEigen is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Rcpp.  If not, see <http://www.gnu.org/licenses/>.
+
+Rcpp::sourceCpp("cpp/transform.cpp")
+
+#test.transformationAr1 <- function() {
+set.seed(1234321)
+x <- rnorm(10L)
+
+res <- transformAr1unbounded(x)
+expect_equal(res$abs,  abs(x))
+expect_equal(res$abs2, x * x)
+expect_equal(res$exp,  exp(x))
+expect_equal(res$cos,  cos(x))
+
+
+#test.transformationAr2 <- function() {
+set.seed(1234321)
+X <- matrix(rnorm(100L), nrow = 10, ncol = 10)
+
+res <- transformAr2unbounded(X)
+expect_equal(res$abs,  abs(X))
+expect_equal(res$abs2, X * X)
+expect_equal(res$exp,  exp(X))
+expect_equal(res$cos,  cos(X))
diff --git a/inst/tinytest/test_wrap.R b/inst/tinytest/test_wrap.R
new file mode 100644
index 00000000..65ac970d
--- /dev/null
+++ b/inst/tinytest/test_wrap.R
@@ -0,0 +1,109 @@
+
+# Copyright (C) 2012 - 2022  Douglas Bates, Dirk Eddelbuettel and Romain Francois
+#
+# This file is part of RcppEigen.
+#
+# RcppEigen is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 2 of the License, or
+# (at your option) any later version.
+#
+# RcppEigen is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with Rcpp.  If not, see <http://www.gnu.org/licenses/>.
+
+Rcpp::sourceCpp("cpp/wrap.cpp")
+
+#test.wrapVectors <- function() {
+res <- wrap_vectors()
+
+expect_equal(res[[1]][[1]], complex(5))
+expect_equal(res[[1]][[2]], double(5))
+expect_equal(res[[1]][[3]], double(5))
+expect_equal(res[[1]][[4]], integer(5))
+expect_equal(res[[1]][[5]], integer(5))
+
+expect_equal(res[[2]][[1]], (1+0i) * diag(nr=3L))
+expect_equal(res[[2]][[2]], diag(nr=3L))
+expect_equal(res[[2]][[3]], diag(nr=3L))
+expect_equal(res[[2]][[4]], matrix(as.integer((diag(nr=3L))),nr=3L))
+expect_equal(res[[2]][[5]], matrix(as.integer((diag(nr=3L))),nr=3L))
+
+expect_equal(res[[3]][[1]], matrix(complex(5), nr=1L))
+expect_equal(res[[3]][[2]], matrix(numeric(5), nr=1L))
+expect_equal(res[[3]][[3]], matrix(numeric(5), nr=1L))
+expect_equal(res[[3]][[4]], matrix(integer(5), nr=1L))
+expect_equal(res[[3]][[5]], matrix(integer(5), nr=1L))
+
+expect_equal(res[[4]][[1]], as.matrix(complex(5)))
+expect_equal(res[[4]][[2]], as.matrix(numeric(5)))
+expect_equal(res[[4]][[3]], as.matrix(numeric(5)))
+expect_equal(res[[4]][[4]], as.matrix(integer(5)))
+expect_equal(res[[4]][[5]], as.matrix(integer(5)))
+
+expect_equal(res[[5]][[1]], matrix(complex(9L), nc=3L))
+expect_equal(res[[5]][[2]], matrix(numeric(9L), nc=3L))
+expect_equal(res[[5]][[3]], matrix(numeric(9L), nc=3L))
+expect_equal(res[[5]][[4]], matrix(integer(9L), nc=3L))
+expect_equal(res[[5]][[5]], matrix(integer(9L), nc=3L))
+
+expect_equal(res[[6]][[1]], complex(5))
+expect_equal(res[[6]][[2]], double(5))
+expect_equal(res[[6]][[3]], double(5))
+expect_equal(res[[6]][[4]], integer(5))
+expect_equal(res[[6]][[5]], integer(5))
+
+oneTen <- seq(1, 10, length.out=6L)
+
+expect_equal(res[[7]][[1]], oneTen)
+expect_equal(res[[7]][[2]], log(oneTen))
+expect_equal(res[[7]][[3]], exp(oneTen))
+expect_equal(res[[7]][[4]], sqrt(oneTen))
+expect_equal(res[[7]][[5]], cos(oneTen))
+
+
+#test.asVec <- function() {
+res <- as_Vec(list(1:10, as.numeric(1:10)))
+expect_equal(unlist(res), rep.int(55, 10L))
+
+#test.asArray <- function() {
+res <- as_Array(list(1:10, as.numeric(1:10)))
+expect_equal(unlist(res), rep.int(55, 10L))
+
+#test.asMat <- function() {
+integer_mat <- matrix(as.integer(diag(nrow = 5L)))
+numeric_mat <- diag(nrow = 5L)
+res <- as_Mat(list(integer_mat, numeric_mat))
+expect_equal(unlist(res), rep.int(5, 6L))
+
+#test.asArray2D <- function() {
+integer_mat <- matrix(as.integer(diag(nrow = 5L)))
+numeric_mat <- diag(nrow = 5L)
+res <- as_Array2D(list(integer_mat, numeric_mat))
+expect_equal(unlist(res), rep.int(5, 6L))
+
+
+## CI systems may have limited memory, and CRAN may not like us creating multi-gb objects
+## so remainer is opt-in
+if (Sys.getenv("RunLargeMemoryTests") != "yes") exit_file("Set 'RunLargeMemoryTests' to 'yes' to run.")
+
+## add test for wrapping of large vectors (PRs #105, 106) which works for vectors
+## but fails for matrices as we violate the 'size_t value permitted for lenth values
+## but not inside a `dim` object of type `integer` (aka `int32_t`)
+n <- 2^31 + 100                         # in excess of limit of 2^31 - 1
+res <- vector_large_wrap(n)
+expect_true(is.vector(res, "integer"))
+expect_equal(length(res), n)
+expect_equal(res[seq_len(2^10)], rep_len(0:9, 2^10))
+
+expect_error(matrix_large_wrap(n))
+n <- 2^31 - 100                         # within limit of 2^31 - 1 for dim given one column
+res <- matrix_large_wrap(n)
+expect_true(is.matrix(res))
+expect_equal(typeof(res), "integer")
+expect_equal(dim(res), c(n,1))
+expect_equal(res[seq_len(2^10)], rep_len(0:9, 2^10))
diff --git a/inst/unitTests/runTests.R b/inst/unitTests/runTests.R
deleted file mode 100644
index 7d41a041..00000000
--- a/inst/unitTests/runTests.R
+++ /dev/null
@@ -1,100 +0,0 @@
-
-pkg <- "RcppEigen"
-
-if ( ! require( "inline", character.only = TRUE, quietly = TRUE ) ){
-    stop( "The inline package is required to run RcppEigen unit tests" )
-}
-
-if ( compareVersion( packageDescription( "inline" )[["Version"]], "0.3.5" ) < 0 ){
-    stop( "RcppEigen unit tests need at least the version 0.3.5 of inline" )
-}
-
-if (require("RUnit", quietly = TRUE)) {
-
-    is_local <- function(){
-    	if ( exists( "argv", globalenv() ) && "--local" %in% argv ) return(TRUE)
-    	if ( "--local" %in% commandArgs(TRUE) ) return(TRUE)
-    	FALSE
-    }
-    if ( is_local() ) path <- getwd()
-
-    library(package=pkg, character.only = TRUE)
-    if (!(exists("path") && file.exists(path)))
-        path <- system.file("unitTests", package = pkg)
-
-    ## --- Testing ---
-
-    ## Define tests
-    testSuite <- defineTestSuite(name=paste(pkg, "unit testing"), dirs = path)
-
-    if (interactive()) {
-        cat("Now have RUnit Test Suite 'testSuite' for package '", pkg, "' :\n", sep='')
-        str(testSuite)
-        cat('', "Consider doing",
-            "\t  tests <- runTestSuite(testSuite)", "\nand later",
-            "\t  printTextProtocol(tests)", '', sep="\n")
-    } else { ## run from shell / Rscript / R CMD Batch / ...
-        ## Run
-        tests <- runTestSuite(testSuite)
-
-        output <- NULL
-
-        process_args <- function(argv){
-            if ( !is.null(argv) && length(argv) > 0 ){
-                rx <- "^--output=(.*)$"
-                g  <- grep( rx, argv, value = TRUE )
-                if ( length(g) ){
-                    sub( rx, "\\1", g[1L] )
-                }
-            }
-        }
-
-        # R CMD check uses this
-        if ( exists( "RcppEigen.unit.test.output.dir", globalenv() ) ){
-            output <- RcppEigen.unit.test.output.dir
-        } else {
-
-            ## give a chance to the user to customize where he/she wants
-            ## the unit tests results to be stored with the --output= command
-            ## line argument
-            if ( exists( "argv",  globalenv() ) ){
-                ## littler
-                output <- process_args(argv)
-            } else {
-                ## Rscript
-                output <- process_args(commandArgs(TRUE))
-            }
-        }
-
-        if( is.null(output) ) {         # if it did not work, use parent dir
-            output <- ".."              # as BDR does not want /tmp to be used
-        }
-
-        ## Print results
-        output.txt  <- file.path( output, sprintf("%s-unitTests.txt", pkg))
-        output.html <- file.path( output, sprintf("%s-unitTests.html", pkg))
-
-        printTextProtocol(tests, fileName=output.txt)
-        message( sprintf( "saving txt unit test report to '%s'", output.txt ) )
-
-        ## Print HTML version to a file
-        ## printHTMLProtocol has problems on Mac OS X
-        if (Sys.info()["sysname"] != "Darwin"){
-            message( sprintf( "saving html unit test report to '%s'", output.html ) )
-            printHTMLProtocol(tests, fileName=output.html)
-        }
-
-        ##  stop() if there are any failures i.e. FALSE to unit test.
-        ## This will cause R CMD check to return error and stop
-        err <- getErrors(tests)
-        if ( (err$nFail + err$nErr) > 0) {
-            stop( sprintf( "unit test problems: %d failures, %d errors", err$nFail, err$nErr) )
-        } else {
-            success <- err$nTestFunc - err$nFail - err$nErr - err$nDeactivated
-            cat( sprintf( "%d / %d\n", success, err$nTestFunc ) )
-        }
-    }
-} else {
-    cat("R package 'RUnit' cannot be loaded -- no unit tests run\n", "for package", pkg,"\n")
-}
-
diff --git a/inst/unitTests/runit.RcppEigen.R b/inst/unitTests/runit.RcppEigen.R
deleted file mode 100644
index 78164ef6..00000000
--- a/inst/unitTests/runit.RcppEigen.R
+++ /dev/null
@@ -1,251 +0,0 @@
-#!/usr/bin/r -t
-#
-# Copyright (C)      2011 Douglas Bates, Dirk Eddelbuettel and Romain Francois
-#
-# This file is part of RcppEigen
-#
-# RcppEigen is free software: you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 2 of the License, or
-# (at your option) any later version.
-#
-# RcppEigen is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with RcppEigen.  If not, see <http://www.gnu.org/licenses/>.
-
-.setUp <- function(){
-    suppressMessages(require(inline))
-}
-
-test.wrap.R <- function(){
-
-    fx <- cxxfunction( , '
-
-    List vecs = List::create(
-        _["Vec<complex>"] = Eigen::VectorXcd::Zero(5),
-        _["Vec<double>"]  = Eigen::VectorXd::Zero(5),
-        _["Vec<float>"]   = Eigen::VectorXf::Zero(5),
-        _["Vec<int>"]     = Eigen::VectorXi::Zero(5)
-    );
-
-    // A VectorX<T> behaves as a matrix with one column but is converted to
-    // a vector object in R, not a matrix of one column.  The distinction is
-    // that VectorX<T> objects are defined at compile time to have one column,
-    // whereas a MatrixX<T> has a dynamic number of columns that is set to 1
-    // during execution of the code.  A MatrixX<T> object can be resized to have
-    // a different number of columns.  A VectorX<T> object cannot.
-    List cols = List::create(
-        _["Col<complex>"] = Eigen::MatrixXcd::Zero(5, 1),
-        _["Col<double>"]  = Eigen::MatrixXd::Zero(5, 1),
-        _["Col<float>"]   = Eigen::MatrixXf::Zero(5, 1),
-        _["Col<int>"]     = Eigen::MatrixXi::Zero(5, 1)
-    );
-
-    List rows = List::create(
-        _["Row<complex>"] = Eigen::RowVectorXcd::Zero(5),
-        _["Row<double>"]  = Eigen::RowVectorXd::Zero(5),
-        _["Row<float>"]   = Eigen::RowVectorXf::Zero(5),
-        _["Row<int>"]     = Eigen::RowVectorXi::Zero(5)
-    );
-
-    List matrices = List::create(
-        _["Mat<complex>"] = Eigen::MatrixXcd::Identity(3, 3),
-        _["Mat<double>"]  = Eigen::MatrixXd::Identity(3, 3),
-        _["Mat<float>"]   = Eigen::MatrixXf::Identity(3, 3),
-        _["Mat<int>"]     = Eigen::MatrixXi::Identity(3, 3)
-    );
-
-    // ArrayXX<t> objects have the same structure as matrices but allow
-    // componentwise arithmetic.  A * B is matrix multiplication for
-    // matrices and componentwise multiplication for arrays.
-    List arrays2 = List::create(
-        _["Arr2<complex>"] = Eigen::ArrayXXcd::Zero(3, 3),
-        _["Arr2<double>"]  = Eigen::ArrayXXd::Zero(3, 3),
-        _["Arr2<float>"]   = Eigen::ArrayXXf::Zero(3, 3),
-        _["Arr2<int>"]     = Eigen::ArrayXXi::Zero(3, 3)
-    );
-
-    // ArrayX<t> objects have the same structure as VectorX<T> objects
-    // but allow componentwise arithmetic, including functions like exp, log,
-    // sqrt, ...
-    List arrays1 = List::create(
-        _["Arr1<complex>"] = Eigen::ArrayXcd::Zero(5),
-        _["Arr1<double>"]  = Eigen::ArrayXd::Zero(5),
-        _["Arr1<float>"]   = Eigen::ArrayXf::Zero(5),
-        _["Arr1<int>"]     = Eigen::ArrayXi::Zero(5)
-    );
-
-    List operations = List::create(
-        _["Op_seq"]  = Eigen::ArrayXd::LinSpaced(6, 1, 10),  // arguments are length.out, start, end
-        _["Op_log"]  = Eigen::ArrayXd::LinSpaced(6, 1, 10).log(),
-        _["Op_exp"]  = Eigen::ArrayXd::LinSpaced(6, 1, 10).exp(),
-        _["Op_sqrt"] = Eigen::ArrayXd::LinSpaced(6, 1, 10).sqrt(),
-        _["Op_cos"]  = Eigen::ArrayXd::LinSpaced(6, 1, 10).cos()
-    );
-
-    List output = List::create(
-    	_["vectors : VectorX<T>"]   = vecs,
-    	_["matrices : MatrixX<T>"]  = matrices,
-    	_["rows : RowVectorX<T>"]   = rows,
-    	_["columns : MatrixX<T>"]   = cols,
-        _["arrays2d : ArrayXX<T>"]  = arrays2,
-        _["arrays1d : ArrayX<T>"]   = arrays1,
-        _["operations : ArrayXd"]   = operations
-        );
-
-    return output ;
-	' , plugin = "RcppEigen" )
-
-    res <- fx()
-
-    checkEquals( res[["vectors : VectorX<T>"]][["Vec<complex>"]], complex(5), msg = "VectorXcd::Zero(5)")
-    checkEquals( res[["vectors : VectorX<T>"]][["Vec<double>"]], double(5), msg = "VectorXd::Zero(5)")
-    checkEquals( res[["vectors : VectorX<T>"]][["Vec<float>"]], double(5), msg = "VectorXf::Zero(5)")
-    checkEquals( res[["vectors : VectorX<T>"]][["Vec<int>"]], integer(5), msg = "VectorXi::Zero(5)")
-
-    checkEquals( res[["matrices : MatrixX<T>"]][["Mat<complex>"]], (1+0i) * diag(nr=3L), msg = "MatrixXcd::Identity(3,3)")
-    checkEquals( res[["matrices : MatrixX<T>"]][["Mat<double>"]], diag(nr=3L), msg = "MatrixXd::Identity(3,3)")
-    checkEquals( res[["matrices : MatrixX<T>"]][["Mat<float>"]], diag(nr=3L), msg = "MatrixXf::Identity(3,3)")
-    checkEquals( res[["matrices : MatrixX<T>"]][["Mat<int>"]], matrix(as.integer((diag(nr=3L))),nr=3L), msg = "MatrixXi::Identity(3,3)")
-
-    checkEquals( res[["rows : RowVectorX<T>"]][["Row<complex>"]], matrix(complex(5), nr=1L), msg = "RowVectorXcd::Zero(5)")
-    checkEquals( res[["rows : RowVectorX<T>"]][["Row<double>"]], matrix(numeric(5), nr=1L), msg = "RowVectorXd::Zero(5)")
-    checkEquals( res[["rows : RowVectorX<T>"]][["Row<float>"]], matrix(numeric(5), nr=1L), msg = "RowVectorXf::Zero(5)")
-    checkEquals( res[["rows : RowVectorX<T>"]][["Row<int>"]], matrix(integer(5), nr=1L), msg = "RowVectorXi::Zero(5)")
-
-    checkEquals( res[["columns : MatrixX<T>"]][["Col<complex>"]], as.matrix(complex(5)), msg = "MatrixXcd::Zero(5, 1)")
-    checkEquals( res[["columns : MatrixX<T>"]][["Col<double>"]], as.matrix(numeric(5)), msg = "MatrixXd::Zero(5, 1)")
-    checkEquals( res[["columns : MatrixX<T>"]][["Col<float>"]], as.matrix(numeric(5)), msg = "MatrixXf::Zero(5, 1)")
-    checkEquals( res[["columns : MatrixX<T>"]][["Col<int>"]], as.matrix(integer(5)), msg = "MatrixXi::Zero(5, 1)")
-
-    checkEquals( res[["arrays2d : ArrayXX<T>"]][["Arr2<complex>"]], matrix(complex(9L), nc=3L), msg = "ArrayXXcd::Zero(3,3)")
-    checkEquals( res[["arrays2d : ArrayXX<T>"]][["Arr2<double>"]], matrix(numeric(9L), nc=3L), msg = "ArrayXXd::Zero(3,3)")
-    checkEquals( res[["arrays2d : ArrayXX<T>"]][["Arr2<float>"]], matrix(numeric(9L), nc=3L), msg = "ArrayXXf::Zero(3,3)")
-    checkEquals( res[["arrays2d : ArrayXX<T>"]][["Arr2<int>"]], matrix(integer(9L), nc=3L), msg = "ArrayXXi::Zero(3,3)")
-
-    checkEquals( res[["arrays1d : ArrayX<T>"]][["Arr1<complex>"]], complex(5), msg = "ArrayXcd::Zero(5)")
-    checkEquals( res[["arrays1d : ArrayX<T>"]][["Arr1<double>"]], double(5), msg = "ArrayXd::Zero(5)")
-    checkEquals( res[["arrays1d : ArrayX<T>"]][["Arr1<float>"]], double(5), msg = "ArrayXf::Zero(5)")
-    checkEquals( res[["arrays1d : ArrayX<T>"]][["Arr1<int>"]], integer(5), msg = "ArrayXi::Zero(5)")
-
-    oneTen <- seq(1, 10, length.out=6L)
-
-    checkEquals( res[["operations : ArrayXd"]][["Op_seq"]],  oneTen,       msg = "Op_seq")
-    checkEquals( res[["operations : ArrayXd"]][["Op_log"]],  log(oneTen),  msg = "Op_log")
-    checkEquals( res[["operations : ArrayXd"]][["Op_exp"]],  exp(oneTen),  msg = "Op_exp")
-    checkEquals( res[["operations : ArrayXd"]][["Op_sqrt"]], sqrt(oneTen), msg = "Op_sqrt")
-    checkEquals( res[["operations : ArrayXd"]][["Op_cos"]],  cos(oneTen),  msg = "Op_cos")
-
-}
-
-test.as.Vec <- function(){
-    fx <- cxxfunction( signature(input_ = "list" ) , '
-
-    List input(input_) ;
-    Eigen::VectorXi                                m1 = input[0] ; /* implicit as */
-    Eigen::VectorXd                                m2 = input[1] ; /* implicit as */
-    Eigen::Matrix<unsigned int, Eigen::Dynamic, 1> m3 = input[0] ; /* implicit as */
-    Eigen::VectorXf                                m4 = input[1] ; /* implicit as */
-
-    List res = List::create(m1.sum(), m2.sum(), m3.sum(), m4.sum());
-
-    return res ;
-
-    ', plugin = "RcppEigen" )
-
-    res <- fx( list( 1:10, as.numeric(1:10) ) )
-    checkEquals( unlist( res ), rep(55.0, 4 ), msg = "as<Vec>" )
-}
-
-test.as.MVec <- function(){
-    fx <- cxxfunction( signature(input_ = "list" ) , '
-
-    List input(input_) ;
-    const Eigen::Map<Eigen::VectorXi>   m1 = input[0] ; // maps share storage and do not allow conversion
-    const Eigen::Map<Eigen::VectorXd>   m2 = input[1] ; 
-
-    List res = List::create(m1.sum(), m2.sum());
-
-    return res ;
-
-    ', plugin = "RcppEigen" )
-
-    res <- fx( list( 1:10, as.numeric(1:10) ) )
-    checkEquals( unlist( res ), rep(55.0, 2 ), msg = "as<MVec>" )
-}
-
-test.as.MMat <- function(){
-    fx <- cxxfunction( signature(input_ = "list" ) , '
-
-    List input(input_) ;
-    const Eigen::Map<Eigen::MatrixXi>   m1 = input[0]; // maps share storage and do not allow conversion
-    const Eigen::Map<Eigen::MatrixXd>   m2 = input[1] ;
-// FIXME: Write a version of as specifically for complex matrices.
-//    const Eigen::Map<Eigen::MatrixXcd>  m3 = input[2] ; 
-
-    List res = List::create(m1.sum(), m2.sum());//, m3.sum());
-
-    return res ;
-
-    ', plugin = "RcppEigen" )
-
-    integer_mat <- matrix(as.integer(diag(nr=4L)), nc=4L)
-    numeric_mat <- diag(nr=5L)
-    complex_mat <- (1+0i) * diag(nr=5L)
-    res <- fx(list(integer_mat, numeric_mat, complex_mat))
-    checkEquals(unlist(res), c(4L, 5)#, 5+0i)
-                , msg = "as<MMat>" )
-}
-
-test.as.MSpMat <- function() {
-    suppressMessages(require("Matrix"))
-    data("KNex", package = "Matrix")
-    fx <- cxxfunction( signature(input_ = "list"), '
-    List input(input_) ;
-    const Eigen::MappedSparseMatrix<double>  m1 = input[0]; // maps share storage and do not allow conversion
-
-    List res = List::create(_["nnz"]   = m1.nonZeros(),
-                            _["nr"]    = m1.rows(),
-                            _["nc"]    = m1.cols(),
-                            _["inSz"]  = m1.innerSize(),
-                            _["outSz"] = m1.outerSize(),
-                            _["sum"]   = m1.sum());
-
-    return res ;
-
-    ', plugin = "RcppEigen" )
-
-    KNX <- KNex[[1]]
-    res <- fx(KNex)
-    checkEquals(unname(unlist(res)),
-                c(nnzero(KNX), nrow(KNX), ncol(KNX),  nrow(KNX), ncol(KNX), sum(KNX@x)),
-                msg = "as<MSPMatrix>")
-}
-
-test.as.SpMat <- function() {
-    suppressMessages(require("Matrix"))
-    data("KNex", package = "Matrix")
-    fx <- cxxfunction( signature(input_ = "list"), '
-    List input(input_) ;
-    const Eigen::SparseMatrix<double>  m1 = input[0];
-
-    List res = List::create(_["nnz"]   = m1.nonZeros(),
-                            _["nr"]    = m1.rows(),
-                            _["nc"]    = m1.cols(),
-                            _["inSz"]  = m1.innerSize(),
-                            _["outSz"] = m1.outerSize(),
-                            _["sum"]   = m1.sum());
-
-    return res ;
-    ', plugin = "RcppEigen" )
-
-    KNX <- KNex[[1]]
-    res <- fx(KNex)
-    checkEquals(unname(unlist(res)),
-                c(nnzero(KNX), nrow(KNX), ncol(KNX),  nrow(KNX), ncol(KNX), sum(KNX@x)),
-                msg = "as<MSPMatrix>")
-}
diff --git a/inst/unitTests/runit.fastLm.R b/inst/unitTests/runit.fastLm.R
deleted file mode 100644
index de84955b..00000000
--- a/inst/unitTests/runit.fastLm.R
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/usr/bin/r -t
-#
-# Copyright (C) 2011 - 2015  Douglas Bates, Dirk Eddelbuettel and Romain Francois
-#
-# This file is part of RcppEigen
-#
-# RcppEigen is free software: you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 2 of the License, or
-# (at your option) any later version.
-#
-# RcppEigen is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with RcppEigen.  If not, see <http://www.gnu.org/licenses/>.
-
-.setUp <- function(){
-    suppressMessages(require(datasets))
-    suppressMessages(require(RcppEigen))
-}
-
-test.fastLm <- function() {
-    data(trees, package="datasets")
-    flm0 <- .Call("RcppEigen_fastLm_Impl",
-                  cbind(1, log(trees$Girth)),
-                  log(trees$Volume), 0L,
-                  PACKAGE="RcppEigen")
-    flm1 <- .Call("RcppEigen_fastLm_Impl",
-                  cbind(1, log(trees$Girth)),
-                  log(trees$Volume), 1L,
-                  PACKAGE="RcppEigen")
-    flm2 <- .Call("RcppEigen_fastLm_Impl",
-                  cbind(1, log(trees$Girth)),
-                  log(trees$Volume), 2L,
-                  PACKAGE="RcppEigen")
-    flm3 <- .Call("RcppEigen_fastLm_Impl",
-                  cbind(1, log(trees$Girth)),
-                  log(trees$Volume), 3L,
-                  PACKAGE="RcppEigen")
-    flm4 <- .Call("RcppEigen_fastLm_Impl",
-                  cbind(1, log(trees$Girth)),
-                  log(trees$Volume), 4L,
-                  PACKAGE="RcppEigen")
-    flm5 <- .Call("RcppEigen_fastLm_Impl",
-                  cbind(1, log(trees$Girth)),
-                  log(trees$Volume), 5L,
-                  PACKAGE="RcppEigen")
-    fit <- lm(log(Volume) ~ log(Girth), data=trees)
-    fitCoef <- unname(coef(fit))
-    fitStdErr <- unname(coef(summary(fit))[, "Std. Error", drop = TRUE])
-    checkEquals(flm0$coefficients, fitCoef, msg="fastLm0.coef")
-    checkEquals(flm0$se, fitStdErr, msg="fastLm0.stderr")
-    checkEquals(flm1$coefficients, fitCoef, msg="fastLm1.coef")
-    checkEquals(flm1$se, fitStdErr, msg="fastLm1.stderr")
-    checkEquals(flm2$coefficients, fitCoef, msg="fastLm2.coef")
-    checkEquals(flm2$se, fitStdErr, msg="fastLm2.stderr")
-    checkEquals(flm3$coefficients, fitCoef, msg="fastLm3.coef")
-    checkEquals(flm3$se, fitStdErr, msg="fastLm3.stderr")
-    checkEquals(flm4$coefficients, fitCoef, msg="fastLm0.coef")
-    checkEquals(flm4$se, fitStdErr, msg="fastLm0.stderr")
-    checkEquals(flm5$coefficients, fitCoef, msg="fastLm0.coef")
-    checkEquals(flm5$se, fitStdErr, msg="fastLm0.stderr")
-}
-
-
-test.fastLm.formula <- function() {
-    data(trees, package="datasets")
-    flm <- fastLm(log(Volume) ~ log(Girth), data=trees)
-    fit <- lm(log(Volume) ~ log(Girth), data=trees)
-
-    checkEquals(flm$coefficients, coef(fit), msg="fastLm.formula.coef")
-    checkEquals(as.numeric(flm$se), as.numeric(coef(summary(fit))[,2]),
-                msg="fastLm.formula.stderr")
-}
-
diff --git a/inst/unitTests/runit.solutions.R b/inst/unitTests/runit.solutions.R
deleted file mode 100644
index f3008f1d..00000000
--- a/inst/unitTests/runit.solutions.R
+++ /dev/null
@@ -1,103 +0,0 @@
-#
-# Copyright (C) 2012 - 2013  Douglas Bates, Dirk Eddelbuettel and Romain Francois
-#
-# This file is part of RcppEigen.
-#
-# RcppEigen is free software: you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 2 of the License, or
-# (at your option) any later version.
-#
-# RcppEigen is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with Rcpp.  If not, see <http://www.gnu.org/licenses/>.
-
-incl <- '
-typedef Eigen::ArrayXd                   Ar1;
-typedef Eigen::Map<Ar1>                 MAr1;
-typedef Eigen::ArrayXXd                  Ar2;
-typedef Eigen::Map<Ar2>                 MAr2;
-typedef Eigen::MatrixXd                  Mat;
-typedef Eigen::Map<Mat>                 MMat;
-typedef Eigen::VectorXd                  Vec;
-typedef Eigen::Map<Vec>                 MVec;
-typedef Eigen::PartialPivLU<Mat>        PPLU;
-typedef Eigen::ColPivHouseholderQR<Mat> CPQR;
-'
-
-definitions <- list(
-    "dense_PPLU" = list(signature(A_="matrix", b_="numeric"),
-    '
-    MMat           A(as<MMat>(A_));
-    MVec           b(as<MVec>(b_));
-    PPLU           lu(A);
-    Mat            Ainv(lu.inverse());
-    Vec            x(lu.solve(b));
-
-    return List::create(Named("A",    A),
-                        Named("Ainv", Ainv),
-                        Named("b",    b),
-                        Named("x",    x));
-    '),
-    "dense_CPQR" = list(signature(A_="matrix", b_="numeric"),
-    '
-    MMat           A(as<MMat>(A_));
-    MVec           b(as<MVec>(b_));
-    CPQR           qr(A);
-    Mat            Ainv(qr.inverse());
-    Vec            x(qr.solve(b));
-    return List::create(Named("Ainv", Ainv),
-                        Named("x",    x));
-    ')
-    )
-
-
-.setUp <- function() {
-    suppressMessages(require(inline))
-    suppressMessages(require(RcppEigen))
-    cxxargs <- ifelse(Rcpp:::capabilities()[["initializer lists"]],
-                      "-std=c++0x","")
-    tests <- ".rcppeigen.solve"
-    if( ! exists( tests, globalenv() )) {
-        fun <- RcppEigen:::compile_unit_tests(definitions,
-                                              includes=incl,
-                                              cxxargs = cxxargs)
-        names(fun) <- names(definitions)
-        assign( tests, fun, globalenv() )
-    }
-}
-
-test.smallDense <- function() {
-    A <- matrix(c(1,2,3,4), nrow=2L)
-    B <- matrix(c(5,6,7,8), nrow=2L)
-    b <- c(1,1)
-
-    ## solutions to dense systems
-    res <- .rcppeigen.solve$dense_PPLU(A, b)
-    checkEquals(res$Ainv,  solve(A))
-    checkEquals(res$x,     solve(A, b))
-
-    res <- .rcppeigen.solve$dense_CPQR(A, b)
-    checkEquals(res$Ainv,  solve(A))
-    checkEquals(res$x,     solve(A, b))
-}
-
-test.largeDense <- function() {
-    set.seed(1234321)
-    N <- 100L
-    AA <- matrix(rnorm(N * N), nrow=N)
-    bb <- rnorm(N)
-
-    res <- .rcppeigen.solve$dense_PPLU(AA, bb)
-    checkEquals(res$Ainv,  solve(AA))
-    checkEquals(res$x,     solve(AA, bb))
-
-    res <- .rcppeigen.solve$dense_CPQR(AA, bb)
-    checkEquals(res$Ainv,  solve(AA))
-    checkEquals(res$x,     solve(AA, bb))
-}
-
diff --git a/inst/unitTests/runit.sparse.R b/inst/unitTests/runit.sparse.R
deleted file mode 100644
index 6f34cf6e..00000000
--- a/inst/unitTests/runit.sparse.R
+++ /dev/null
@@ -1,221 +0,0 @@
-#!/usr/bin/r -t
-#
-# Copyright (C)      2011 Douglas Bates, Dirk Eddelbuettel and Romain Francois
-#
-# This file is part of RcppEigen
-#
-# RcppEigen is free software: you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 2 of the License, or
-# (at your option) any later version.
-#
-# RcppEigen is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with RcppEigen.  If not, see <http://www.gnu.org/licenses/>.
-
-.setUp <- function(){
-    suppressMessages(require(inline))
-}
-
-test.wrapSparse.double.R <- function(){
-
-    fx <- cxxfunction( , '
-
-    Eigen::SparseMatrix<double>  mm(9,3);
-    mm.reserve(9);
-    for (int j = 0; j < 3; ++j) {
-        mm.startVec(j);
-        for (int i = 3 * j; i < 3 * (j + 1); ++i)
-            mm.insertBack(i, j) = 1.;
-    }
-    mm.finalize();
-    return wrap(mm);
-' , plugin = "RcppEigen" )
-
-    res <- fx()
-    rr <- Matrix::t(as(gl(3,3), "sparseMatrix"))
-    colnames(rr) <- NULL
-    checkEquals( res, rr, msg = "wrap<SparseMatrix<double> >")
-}
-
-test.wrapSparse.double.ColMajor.R <- function(){
-
-    fx <- cxxfunction( , '
-
-    Eigen::SparseMatrix<double, Eigen::ColMajor>  mm(9,3);
-    mm.reserve(9);
-    for (int j = 0; j < 3; ++j) {
-        mm.startVec(j);
-        for (int i = 3 * j; i < 3 * (j + 1); ++i)
-            mm.insertBack(i, j) = 1.;
-    }
-    mm.finalize();
-    return wrap(mm);
-' , plugin = "RcppEigen" )
-
-    res <- fx()
-    rr <- Matrix::t(as(gl(3,3), "sparseMatrix"))
-    colnames(rr) <- NULL
-    checkEquals( res, rr, msg = "wrap<SparseMatrix<double, Eigen::ColMajor> >")
-}
-
-## test.wrapSparse.int.ColMajor.R <- function(){  ## classes not yet exported from Matrix
-
-##     fx <- cxxfunction( , '
-
-##     Eigen::SparseMatrix<int, Eigen::ColMajor>  mm(9,3);
-##     mm.reserve(9);
-##     for (int j = 0; j < 3; ++j) {
-##         mm.startVec(j);
-##         for (int i = 3 * j; i < 3 * (j + 1); ++i)
-##             mm.insertBack(i, j) = 1;
-##     }
-##     mm.finalize();
-##     return wrap(mm);
-## ' , plugin = "RcppEigen" )
-
-##     #res <- fx()
-##     #rr <- Matrix::t(as(gl(3,3), "sparseMatrix"))
-##     #colnames(rr) <- NULL
-##     #checkEquals( res, rr, msg = "wrap<SparseMatrix<double, Eigen::ColMajor> >")
-##     checkException( fx(), msg = "wrap<SparseMatrix<int, Eigen::ColMajor> >" )
-## }
-
-test.wrapSparse.double.RowMajor.R <- function(){
-
-    fx <- cxxfunction( , '
-
-    Eigen::SparseMatrix<double, Eigen::RowMajor>  mm(9,3);
-    mm.reserve(9);
-    for (int irow = 0; irow < 9; ++irow) {
-        mm.startVec(irow);
-        mm.insertBack(irow, irow / 3) = static_cast<double>( 9 - irow );
-    }
-    mm.finalize();
-    return wrap(mm);
-' , plugin = "RcppEigen" )
-
-    res <- fx()
-    rr <- new( "dgRMatrix", j=rep(0L:2L, each=3), p=0L:9L, x=as.numeric(9:1), Dim=c(9L,3L) )
-    colnames(rr) <- NULL
-    checkEquals( res, rr, msg = "wrap<SparseMatrix<double, Eigen::RowMajor> >")
-}
-
-## test.wrapSparse.int.RowMajor.R <- function(){
-
-##     fx <- cxxfunction( , '
-
-##     Eigen::SparseMatrix<int, Eigen::RowMajor>  mm(9,3);
-##     mm.reserve(9);
-##     for (int irow = 0; irow < 9; ++irow) {
-##         mm.startVec(irow);
-##         mm.insertBack(irow, irow / 3) = 9 - irow;
-##     }
-##     mm.finalize();
-##     return wrap(mm);
-## ' , plugin = "RcppEigen" )
-
-##     #res <- fx()
-##     #rr <- new( "igRMatrix", j=rep(0L:2L, each=3), p=0L:9L, x=9L:1L, Dim=c(9L,3L) )
-##     #colnames(rr) <- NULL
-##     #checkEquals( res, rr, msg = "wrap<SparseMatrix<int, Eigen::RowMajor> >")
-##     checkException( fx(), msg = "wrap<SparseMatrix<int, Eigen::RowMajor> >" )
-## }
-
-test.asSparse.double.ColMajor.R <- function(){
-
-    fx <- cxxfunction( sig=signature(R_mm="dgCMatrix"), '
-
-    Eigen::SparseMatrix<double, Eigen::ColMajor> mm = Rcpp::as<Eigen::SparseMatrix<double, Eigen::ColMajor> >( R_mm );
-    return wrap(mm);
-' , plugin = "RcppEigen" )
-
-    rr <- Matrix::t(as(gl(3,3), "sparseMatrix"))
-    colnames(rr) <- NULL
-    res <- fx( R_mm = rr )
-    checkEquals( res, rr, msg = "as<SparseMatrix<double, Eigen::ColMajor> >")
-}
-
-test.asMappedSparse.double.ColMajor.R <- function(){
-
-    fx <- cxxfunction( sig=signature(R_mm="dgCMatrix"), '
-
-    typedef Eigen::MappedSparseMatrix<double, Eigen::ColMajor> MapMat;
-    MapMat mm = Rcpp::as<MapMat>( R_mm );
-    return wrap(mm);
-' , plugin = "RcppEigen" )
-
-    rr <- Matrix::t(as(gl(3,3), "sparseMatrix"))
-    colnames(rr) <- NULL
-    res <- fx( R_mm = rr )
-    checkEquals( res, rr, msg = "as<MappedSparseMatrix<double, Eigen::ColMajor> >")
-}
-
-test.asSparse.double.RowMajor.R <- function(){
-    fx <- cxxfunction( sig=signature(R_mm="dgRMatrix"), '
-
-    Eigen::SparseMatrix<double, Eigen::RowMajor> mm = Rcpp::as<Eigen::SparseMatrix<double, Eigen::RowMajor> >( R_mm );
-    return wrap(mm);
-' , plugin = "RcppEigen" )
-
-    rr <- new( "dgRMatrix", j=rep(0L:2L, each=3), p=0L:9L, x=as.numeric(9:1), Dim=c(9L,3L) )
-    colnames(rr) <- NULL
-    res <- fx( R_mm = rr )
-    checkEquals( res, rr, msg = "as<SparseMatrix<double, Eigen::RowMajor> >")
-}
-
-test.asMappedSparse.double.RowMajor.R <- function(){
-    fx <- cxxfunction( sig=signature(R_mm="dgRMatrix"), '
-
-    typedef Eigen::MappedSparseMatrix<double, Eigen::RowMajor> MapMat;
-    MapMat mm = Rcpp::as<MapMat>( R_mm );
-    return wrap(mm);
-' , plugin = "RcppEigen" )
-
-    rr <- new( "dgRMatrix", j=rep(0L:2L, each=3), p=0L:9L, x=as.numeric(9:1), Dim=c(9L,3L) )
-    colnames(rr) <- NULL
-    res <- fx( R_mm = rr )
-    checkEquals( res, rr, msg = "as<MappedSparseMatrix<double, Eigen::RowMajor> >")
-}
-
-
-test.sparseCholesky.R <- function() {
-    suppressMessages(require("Matrix", character.only=TRUE))
-    data("KNex", package = "Matrix")
-
-    fx <- cxxfunction( signature(input_ = "list"), '
-    using Eigen::VectorXd;
-    using Eigen::MatrixXd;
-    using Eigen::Lower;
-    using Eigen::Map;
-    using Eigen::MappedSparseMatrix;
-    using Eigen::SparseMatrix;
-    using Eigen::SimplicialLDLT;
-    using Eigen::Success;
-
-    List input(input_);
-    const MappedSparseMatrix<double> m1 = input[0];
-    const Map<VectorXd>              v1 = input[1];
-    SparseMatrix<double>             m2(m1.cols(), m1.cols());
-    m2.selfadjointView<Lower>().rankUpdate(m1.adjoint());
-
-    SimplicialLDLT<SparseMatrix<double> > ff(m2);
-    VectorXd                        res = ff.solve(m1.adjoint() * v1);
-    
-    return List::create(_["res"]   = res,
-                        _["rows"]  = ff.rows(),
-                        _["cols"]  = ff.cols());
-',
-                      plugin = "RcppEigen")
-
-    rr <- fx(KNex)
-    checkEquals(rr[[1]], as.vector(solve(crossprod(KNex[[1]]),
-                                         crossprod(KNex[[1]], KNex[[2]])),
-                                   mode="numeric"),
-                "Cholmod solution")
-}
-
diff --git a/inst/unitTests/runit.transform.R b/inst/unitTests/runit.transform.R
deleted file mode 100644
index 73223bf3..00000000
--- a/inst/unitTests/runit.transform.R
+++ /dev/null
@@ -1,87 +0,0 @@
-#
-# Copyright (C) 2012 - 2013  Douglas Bates, Dirk Eddelbuettel and Romain Francois
-#
-# This file is part of RcppEigen.
-#
-# RcppEigen is free software: you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 2 of the License, or
-# (at your option) any later version.
-#
-# RcppEigen is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with Rcpp.  If not, see <http://www.gnu.org/licenses/>.
-
-incl <- '
-typedef Eigen::ArrayXd                   Ar1;
-typedef Eigen::Map<Ar1>                 MAr1;
-typedef Eigen::ArrayXXd                  Ar2;
-typedef Eigen::Map<Ar2>                 MAr2;
-typedef Eigen::MatrixXd                  Mat;
-typedef Eigen::Map<Mat>                 MMat;
-typedef Eigen::VectorXd                  Vec;
-typedef Eigen::Map<Vec>                 MVec;
-'
-
-definitions <- list(
-    "ar1_unbounded" = list(signature(x_="numeric"),
-    '
-    MAr1           x(as<MAr1>(x_));
-
-    return List::create(Named("abs",    x.abs()),
-                        Named("abs2",   x.abs2()),
-                        Named("exp",    x.exp()),
-                        Named("cos",    x.cos()));
-    '),
-    "ar2_unbounded" = list(signature(X_="matrix"),
-    '
-    MAr2           X(as<MAr2>(X_));
-
-    return List::create(Named("abs",    X.abs()),
-                        Named("abs2",   X.abs2()),
-                        Named("exp",    X.exp()),
-                        Named("cos",    X.cos()));
-    ')
-    )
-
-.setUp <- function() {
-    suppressMessages(require(inline))
-    suppressMessages(require(RcppEigen))
-    cxxargs <- ifelse(Rcpp:::capabilities()[["initializer lists"]],
-                      "-std=c++0x","")
-    tests <- ".rcppeigen.trans"
-    if( ! exists( tests, globalenv() )) {
-        fun <- RcppEigen:::compile_unit_tests(definitions,
-                                              includes=incl,
-                                              cxxargs = cxxargs)
-        names(fun) <- names(definitions)
-        assign(tests, fun, globalenv())
-    }
-}
-
-test.transformationAr1 <- function() {
-    set.seed(1234321)
-    x <- rnorm(10L)
-
-    res <- .rcppeigen.trans$ar1_unbounded(x)
-    checkEquals(res$abs,  abs(x))
-    checkEquals(res$abs2, x * x)
-    checkEquals(res$exp,  exp(x))
-    checkEquals(res$cos,  cos(x))
-}
-
-test.transformationAr2 <- function() {
-    set.seed(1234321)
-    X <- matrix(rnorm(100L), nrow = 10, ncol = 10)
-
-    res <- .rcppeigen.trans$ar2_unbounded(X)
-    checkEquals(res$abs,  abs(X))
-    checkEquals(res$abs2, X * X)
-    checkEquals(res$exp,  exp(X))
-    checkEquals(res$cos,  cos(X))
-}
-
diff --git a/man/RcppEigen-package.Rd b/man/RcppEigen-package.Rd
index c0976f42..8ca648cf 100644
--- a/man/RcppEigen-package.Rd
+++ b/man/RcppEigen-package.Rd
@@ -16,9 +16,23 @@ Rcpp/Eigen bridge
   other packages.  The C++ source code and the R source code in this
   package are for illustration only.
 
-  As described at the Eigen project's home page,
-  \url{http://eigen.tuxfamily.org}, Eigen is a versatile, fast, reliable
-  and elegant collection of C++ classes for linear algebra.
+  As described at the \href{https://libeigen.gitlab.io/}{Eigen project home
+  page} , Eigen is a C++ template library for linear algebra: matrices,
+  vectors, numerical solvers, and related algorithms.
+}
+\section{Threading}{
+  The Eigen library can take advantage of OpenMP to execute computations in
+  parallel via multi-threaded code. The number of cores uses can be set (or
+  retrieved) explicitly via helper functions \code{EigenSetNbThreads()} and
+  \code{EigenNbThreads()}. A default value is stored at package startup; it
+  recognises R option value \code{Ncpus} and environment variable
+  \code{OMP_THREAD_LIMIT}.  Additional helper functions
+  \code{RcppEigen_throttle_cores()} and \code{RcppEigen_reset_cores()} are
+  available to (temporarily) lower the number of cores uses and to reset to
+  the package default value set at startup.
+}
+\seealso{
+  \code{\link{RcppEigen_throttle_cores}}
 }
 \references{
   Douglas Bates and Dirk Eddelbuettel (2013). Fast and Elegant Numerical
diff --git a/man/RcppEigen_throttle_cores.Rd b/man/RcppEigen_throttle_cores.Rd
new file mode 100644
index 00000000..dba25141
--- /dev/null
+++ b/man/RcppEigen_throttle_cores.Rd
@@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RcppExports.R, R/init.R
+\name{EigenNbThreads}
+\alias{EigenNbThreads}
+\alias{EigenSetNbThreads}
+\alias{RcppEigen_throttle_cores}
+\alias{RcppEigen_reset_cores}
+\title{Throttle (or Reset) (Rcpp)Eigen Core Usage}
+\usage{
+EigenNbThreads()
+
+EigenSetNbThreads(n)
+
+RcppEigen_throttle_cores(n)
+
+RcppEigen_reset_cores()
+}
+\arguments{
+\item{n}{Integer value of desired cores, default is the value set at package
+startup reflecting the smallest value among the total number of available
+cores (or one if compiled without OpenMP support), the value of option
+\code{Ncpus} and the value of environment variable \code{OMP_THREAD_LIMIT}.}
+}
+\value{
+Only \code{EigenNbThreads()} returns a value, the current value of
+the number of cores used. The other functions are invoked for their side
+effect of affecting the count of cores used.
+}
+\description{
+Helper functions to throttle use of cores by RcppEigen-internal code.
+On package load, the initial value is saved and used to reset the value.
+}
+\seealso{
+\code{\link{RcppEigen-package}}
+}
diff --git a/man/fastLm.Rd b/man/fastLm.Rd
index 784099bf..4ae4441c 100644
--- a/man/fastLm.Rd
+++ b/man/fastLm.Rd
@@ -37,7 +37,7 @@ fastLm(X, \dots)
     Cholesky, 3 for the LDLT Cholesky, 4 for the Jacobi singular value
     decomposition (SVD) and 5 for a method based on the
     eigenvalue-eigenvector decomposition of
-    \eqn{\mathbf{X}^\prime\mathbf{X}}{X'X}.  Default is zero.} 
+    \eqn{\mathbf{X}^\prime\mathbf{X}}{X'X}.  Default is zero.}
 
   \item{\dots}{not used}
 }
@@ -47,7 +47,7 @@ fastLm(X, \dots)
 
   The \code{fastLmPure} function provides a reference use case of the \code{Eigen}
   C++ template library via the wrapper functions in the \pkg{RcppEigen} package.
-  
+
   The \code{fastLm} function provides a more standard implementation of
   a linear model fit, offering both a default and a formula interface as
   well as \code{print}, \code{summary} and \code{predict} methods.
@@ -57,7 +57,7 @@ fastLm(X, \dots)
   decomposition, so that it can handle rank-deficient cases
   effectively.  Other methods for determining least squares solutions
   are available according to the value of the \code{method} argument.
-  
+
   An example of the type of situation requiring extra care in checking
   for rank deficiency is a two-way layout with missing cells (see the
   examples section).  These cases require a special pivoting scheme of
@@ -78,7 +78,7 @@ fastLm(X, \dots)
      call argument similar to the \code{\link{lm}} or
   \code{\link[MASS]{rlm}} functions..
 }
-\seealso{\code{\link{lm}}, \code{\link{lm.fit}}} 
+\seealso{\code{\link{lm}}, \code{\link{lm.fit}}}
 \references{
   Douglas Bates and Dirk Eddelbuettel (2013). Fast and Elegant Numerical
   Linear Algebra Using the \pkg{RcppEigen} Package. \emph{Journal of
@@ -86,8 +86,8 @@ fastLm(X, \dots)
   URL http://www.jstatsoft.org/v52/i05/.
 }
 \author{
-  Eigen is described at \url{http://eigen.tuxfamily.org}. RcppEigen is written by
-  Douglas Bates, Dirk Eddelbuettel and Romain Francois.
+  Eigen is described at \url{https://libeigen.gitlab.io/}.
+  RcppEigen is written by Douglas Bates, Dirk Eddelbuettel and Romain Francois.
 }
 \examples{
   data(trees, package="datasets")
diff --git a/patches/eigen-3.3.4.diff b/patches/eigen-3.3.4.diff
new file mode 100644
index 00000000..12cfe6d1
--- /dev/null
+++ b/patches/eigen-3.3.4.diff
@@ -0,0 +1,163 @@
+Only in ./eigen-eigen-88c4604601b9/: bench
+Only in ./eigen-eigen-88c4604601b9/: blas
+Only in ./eigen-eigen-88c4604601b9/: cmake
+Only in ./eigen-eigen-88c4604601b9/: CMakeLists.txt
+Only in ./eigen-eigen-88c4604601b9/: COPYING.BSD
+Only in ./eigen-eigen-88c4604601b9/: COPYING.GPL
+Only in ./eigen-eigen-88c4604601b9/: COPYING.LGPL
+Only in ./eigen-eigen-88c4604601b9/: COPYING.MINPACK
+Only in ./eigen-eigen-88c4604601b9/: COPYING.MPL2
+Only in ./eigen-eigen-88c4604601b9/: COPYING.README
+Only in ./eigen-eigen-88c4604601b9/: CTestConfig.cmake
+Only in ./eigen-eigen-88c4604601b9/: CTestCustom.cmake.in
+Only in ./eigen-eigen-88c4604601b9/: debug
+Only in ./eigen-eigen-88c4604601b9/: demos
+Only in ./eigen-eigen-88c4604601b9/: doc
+diff -r -u ./eigen-eigen-88c4604601b9/Eigen/CholmodSupport ./include/Eigen/CholmodSupport
+--- ./eigen-eigen-88c4604601b9/Eigen/CholmodSupport	2018-01-03 15:55:52.000000000 -0500
++++ ./include/Eigen/CholmodSupport	2018-02-04 14:06:06.527035000 -0500
+@@ -13,7 +13,7 @@
+ #include "src/Core/util/DisableStupidWarnings.h"
+ 
+ extern "C" {
+-  #include <cholmod.h>
++  #include <RcppEigenCholmod.h>
+ }
+ 
+ /** \ingroup Support_modules
+Only in ./eigen-eigen-88c4604601b9/Eigen: CMakeLists.txt
+diff -r -u ./eigen-eigen-88c4604601b9/Eigen/src/Core/arch/CUDA/Half.h ./include/Eigen/src/Core/arch/CUDA/Half.h
+--- ./eigen-eigen-88c4604601b9/Eigen/src/Core/arch/CUDA/Half.h	2018-01-03 15:55:52.000000000 -0500
++++ ./include/Eigen/src/Core/arch/CUDA/Half.h	2018-02-06 08:19:30.904301638 -0500
+@@ -119,12 +119,14 @@
+   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
+     return static_cast<unsigned long>(half_impl::half_to_float(*this));
+   }
++  #if EIGEN_HAS_CXX11
+   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
+     return static_cast<long long>(half_impl::half_to_float(*this));
+   }
+   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
+     return static_cast<unsigned long long>(half_to_float(*this));
+   }
++  #endif
+   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
+     return half_impl::half_to_float(*this);
+   }
+diff -r -u ./eigen-eigen-88c4604601b9/Eigen/src/Core/arch/SSE/Complex.h ./include/Eigen/src/Core/arch/SSE/Complex.h
+--- ./eigen-eigen-88c4604601b9/Eigen/src/Core/arch/SSE/Complex.h	2018-01-03 15:55:52.000000000 -0500
++++ ./include/Eigen/src/Core/arch/SSE/Complex.h	2018-02-05 13:46:26.921642310 -0500
+@@ -98,10 +98,10 @@
+   res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast<const __m64*>(&from));
+ #elif EIGEN_GNUC_AT_LEAST(4,6)
+   // Suppress annoying "may be used uninitialized in this function" warning with gcc >= 4.6
+-  #pragma GCC diagnostic push
+-  #pragma GCC diagnostic ignored "-Wuninitialized"
++  // #pragma GCC diagnostic push
++  // #pragma GCC diagnostic ignored "-Wuninitialized"
+   res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
+-  #pragma GCC diagnostic pop
++  // #pragma GCC diagnostic pop
+ #else
+   res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
+ #endif
+diff -r -u ./eigen-eigen-88c4604601b9/Eigen/src/Core/util/DisableStupidWarnings.h ./include/Eigen/src/Core/util/DisableStupidWarnings.h
+--- ./eigen-eigen-88c4604601b9/Eigen/src/Core/util/DisableStupidWarnings.h	2018-01-03 15:55:52.000000000 -0500
++++ ./include/Eigen/src/Core/util/DisableStupidWarnings.h	2018-02-05 13:46:26.925644780 -0500
+@@ -4,7 +4,6 @@
+ #ifdef _MSC_VER
+   // 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p))
+   // 4101 - unreferenced local variable
+-  // 4127 - conditional expression is constant
+   // 4181 - qualifier applied to reference type ignored
+   // 4211 - nonstandard extension used : redefined extern to static
+   // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data
+@@ -20,7 +19,7 @@
+   #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+     #pragma warning( push )
+   #endif
+-  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
++  #pragma warning( disable : 4100 4101 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
+ 
+ #elif defined __INTEL_COMPILER
+   // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
+@@ -38,21 +37,25 @@
+ #elif defined __clang__
+   // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
+   //     this is really a stupid warning as it warns on compile-time expressions involving enums
+-  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+-    #pragma clang diagnostic push
+-  #endif
+-  #pragma clang diagnostic ignored "-Wconstant-logical-operand"
++  // #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
++  //   #pragma clang diagnostic push
++  // #endif
++  // #pragma clang diagnostic ignored "-Wconstant-logical-operand"
++  // #if __clang_major__ >= 3 && __clang_minor__ >= 5
++  //   #pragma clang diagnostic ignored "-Wabsolute-value"
++  // #endif
+ 
+ #elif defined __GNUC__ && __GNUC__>=6
+ 
+-  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+-    #pragma GCC diagnostic push
+-  #endif
+-  #pragma GCC diagnostic ignored "-Wignored-attributes"
++  // #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
++  //   #pragma GCC diagnostic push
++  // #endif
++  // #pragma GCC diagnostic ignored "-Wignored-attributes"
+ 
+ #endif
+ 
+ #if defined __NVCC__
++  #pragma diag_suppress boolean_controlling_expr_is_constant
+   // Disable the "statement is unreachable" message
+   #pragma diag_suppress code_is_unreachable
+   // Disable the "dynamic initialization in unreachable code" message
+@@ -70,6 +73,7 @@
+   #pragma diag_suppress 2671
+   #pragma diag_suppress 2735
+   #pragma diag_suppress 2737
++  #pragma diag_suppress 2739
+ #endif
+ 
+ #endif // not EIGEN_WARNINGS_DISABLED
+Only in ./eigen-eigen-88c4604601b9/: eigen3.pc.in
+Only in ./eigen-eigen-88c4604601b9/: failtest
+Only in ./eigen-eigen-88c4604601b9/: .hg_archival.txt
+Only in ./eigen-eigen-88c4604601b9/: .hgeol
+Only in ./eigen-eigen-88c4604601b9/: .hgignore
+Only in ./eigen-eigen-88c4604601b9/: .hgtags
+Only in ./eigen-eigen-88c4604601b9/: INSTALL
+Only in ./eigen-eigen-88c4604601b9/: lapack
+Only in ./include: RcppEigenCholmod.h
+Only in ./include: RcppEigenForward.h
+Only in ./include: RcppEigen.h
+Only in ./include: RcppEigenStubs.h
+Only in ./include: RcppEigenWrap.h
+Only in ./eigen-eigen-88c4604601b9/: README.md
+Only in ./eigen-eigen-88c4604601b9/: scripts
+Only in ./eigen-eigen-88c4604601b9/: signature_of_eigen3_matrix_library
+Only in ./eigen-eigen-88c4604601b9/: test
+Only in ./eigen-eigen-88c4604601b9/unsupported: bench
+Only in ./eigen-eigen-88c4604601b9/unsupported: CMakeLists.txt
+Only in ./eigen-eigen-88c4604601b9/unsupported: doc
+Only in ./eigen-eigen-88c4604601b9/unsupported/Eigen: CMakeLists.txt
+Only in ./eigen-eigen-88c4604601b9/unsupported/Eigen/CXX11: CMakeLists.txt
+Only in ./eigen-eigen-88c4604601b9/unsupported/Eigen/src/EulerAngles: CMakeLists.txt
+diff -r -u ./eigen-eigen-88c4604601b9/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h ./include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
+--- ./eigen-eigen-88c4604601b9/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h	2018-01-03 15:55:52.000000000 -0500
++++ ./include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h	2018-02-04 14:10:32.572055000 -0500
+@@ -193,7 +193,9 @@
+         std::string curfile;
+         curfile = m_folder + "/" + m_curs_id->d_name;
+         // Discard if it is a folder
++#if !(defined(__sun) || defined(_AIX) || defined(__hpux) || defined(__sgi) || defined(__HAIKU__))
+         if (m_curs_id->d_type == DT_DIR) continue; //FIXME This may not be available on non BSD systems
++#endif
+ //         struct stat st_buf; 
+ //         stat (curfile.c_str(), &st_buf);
+ //         if (S_ISDIR(st_buf.st_mode)) continue;
+Only in ./eigen-eigen-88c4604601b9/unsupported: README.txt
+Only in ./eigen-eigen-88c4604601b9/unsupported: test
diff --git a/patches/eigen-3.3.5.diff b/patches/eigen-3.3.5.diff
new file mode 100644
index 00000000..ca959bbf
--- /dev/null
+++ b/patches/eigen-3.3.5.diff
@@ -0,0 +1,140 @@
+diff -ru eigen-3.3.5/Eigen/CholmodSupport inst/include/Eigen/CholmodSupport
+--- eigen-3.3.5/Eigen/CholmodSupport	2018-07-23 04:33:42.000000000 -0500
++++ inst/include/Eigen/CholmodSupport	2018-11-22 20:56:05.407463176 -0600
+@@ -13,7 +13,7 @@
+ #include "src/Core/util/DisableStupidWarnings.h"
+ 
+ extern "C" {
+-  #include <cholmod.h>
++  #include <RcppEigenCholmod.h>
+ }
+ 
+ /** \ingroup Support_modules
+Only in eigen-3.3.5/Eigen: CMakeLists.txt
+diff -ru eigen-3.3.5/Eigen/src/Core/arch/CUDA/Half.h inst/include/Eigen/src/Core/arch/CUDA/Half.h
+--- eigen-3.3.5/Eigen/src/Core/arch/CUDA/Half.h	2018-07-23 04:33:42.000000000 -0500
++++ inst/include/Eigen/src/Core/arch/CUDA/Half.h	2018-11-22 20:57:25.834187584 -0600
+@@ -119,12 +119,14 @@
+   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
+     return static_cast<unsigned long>(half_impl::half_to_float(*this));
+   }
++  #if EIGEN_HAS_CXX11
+   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
+     return static_cast<long long>(half_impl::half_to_float(*this));
+   }
+   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
+     return static_cast<unsigned long long>(half_to_float(*this));
+   }
++  #endif
+   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
+     return half_impl::half_to_float(*this);
+   }
+diff -ru eigen-3.3.5/Eigen/src/Core/arch/SSE/Complex.h inst/include/Eigen/src/Core/arch/SSE/Complex.h
+--- eigen-3.3.5/Eigen/src/Core/arch/SSE/Complex.h	2018-07-23 04:33:42.000000000 -0500
++++ inst/include/Eigen/src/Core/arch/SSE/Complex.h	2018-11-22 20:59:03.388691931 -0600
+@@ -98,10 +98,10 @@
+   res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast<const __m64*>(&from));
+ #elif EIGEN_GNUC_AT_LEAST(4,6)
+   // Suppress annoying "may be used uninitialized in this function" warning with gcc >= 4.6
+-  #pragma GCC diagnostic push
+-  #pragma GCC diagnostic ignored "-Wuninitialized"
++  //#pragma GCC diagnostic push
++  //#pragma GCC diagnostic ignored "-Wuninitialized"
+   res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
+-  #pragma GCC diagnostic pop
++  //#pragma GCC diagnostic pop
+ #else
+   res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
+ #endif
+diff -ru eigen-3.3.5/Eigen/src/Core/util/DisableStupidWarnings.h inst/include/Eigen/src/Core/util/DisableStupidWarnings.h
+--- eigen-3.3.5/Eigen/src/Core/util/DisableStupidWarnings.h	2018-07-23 04:33:42.000000000 -0500
++++ inst/include/Eigen/src/Core/util/DisableStupidWarnings.h	2018-11-22 21:02:32.261638153 -0600
+@@ -4,7 +4,6 @@
+ #ifdef _MSC_VER
+   // 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p))
+   // 4101 - unreferenced local variable
+-  // 4127 - conditional expression is constant
+   // 4181 - qualifier applied to reference type ignored
+   // 4211 - nonstandard extension used : redefined extern to static
+   // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data
+@@ -20,7 +19,7 @@
+   #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+     #pragma warning( push )
+   #endif
+-  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
++  #pragma warning( disable : 4100 4101 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
+ 
+ #elif defined __INTEL_COMPILER
+   // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
+@@ -38,21 +37,22 @@
+ #elif defined __clang__
+   // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
+   //     this is really a stupid warning as it warns on compile-time expressions involving enums
+-  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+-    #pragma clang diagnostic push
+-  #endif
+-  #pragma clang diagnostic ignored "-Wconstant-logical-operand"
++  //#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
++  //  #pragma clang diagnostic push
++  //#endif
++  //#pragma clang diagnostic ignored "-Wconstant-logical-operand"
+ 
+ #elif defined __GNUC__ && __GNUC__>=6
+ 
+-  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+-    #pragma GCC diagnostic push
+-  #endif
+-  #pragma GCC diagnostic ignored "-Wignored-attributes"
++  //#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
++  //  #pragma GCC diagnostic push
++  //#endif
++  //#pragma GCC diagnostic ignored "-Wignored-attributes"
+ 
+ #endif
+ 
+ #if defined __NVCC__
++  #pragma diag_suppress boolean_controlling_expr_is_constant
+   // Disable the "statement is unreachable" message
+   #pragma diag_suppress code_is_unreachable
+   // Disable the "dynamic initialization in unreachable code" message
+@@ -70,6 +70,7 @@
+   #pragma diag_suppress 2671
+   #pragma diag_suppress 2735
+   #pragma diag_suppress 2737
++  #pragma diag_suppress 2739
+ #endif
+ 
+ #endif // not EIGEN_WARNINGS_DISABLED
+Only in eigen-3.3.5/unsupported/: bench
+Only in eigen-3.3.5/unsupported/: CMakeLists.txt
+Only in eigen-3.3.5/unsupported/: doc
+Only in eigen-3.3.5/unsupported/Eigen: CMakeLists.txt
+Only in eigen-3.3.5/unsupported/Eigen/CXX11: CMakeLists.txt
+Only in eigen-3.3.5/unsupported/Eigen/src/EulerAngles: CMakeLists.txt
+diff -ru eigen-3.3.5/unsupported/Eigen/src/IterativeSolvers/DGMRES.h inst/include/unsupported/Eigen/src/IterativeSolvers/DGMRES.h
+--- eigen-3.3.5/unsupported/Eigen/src/IterativeSolvers/DGMRES.h	2018-07-23 04:33:42.000000000 -0500
++++ inst/include/unsupported/Eigen/src/IterativeSolvers/DGMRES.h	2018-11-22 21:12:59.577162671 -0600
+@@ -173,7 +173,7 @@
+   /** 
+    * Set the restart value (default is 30)  
+    */
+-  Index set_restart(const Index restart) { m_restart=restart; }
++  void set_restart(const Index restart) { m_restart=restart; }
+   
+   /** 
+    * Set the number of eigenvalues to deflate at each restart 
+diff -ru eigen-3.3.5/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
+--- eigen-3.3.5/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h	2018-07-23 04:33:42.000000000 -0500
++++ inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h	2018-11-22 21:03:35.872739714 -0600
+@@ -193,7 +193,9 @@
+         std::string curfile;
+         curfile = m_folder + "/" + m_curs_id->d_name;
+         // Discard if it is a folder
++#if !(defined(__sun) || defined(_AIX) || defined(__hpux) || defined(__sgi) || defined(__HAIKU__))
+         if (m_curs_id->d_type == DT_DIR) continue; //FIXME This may not be available on non BSD systems
++#endif
+ //         struct stat st_buf; 
+ //         stat (curfile.c_str(), &st_buf);
+ //         if (S_ISDIR(st_buf.st_mode)) continue;
+Only in eigen-3.3.5/unsupported/: README.txt
+Only in eigen-3.3.5/unsupported/: test
diff --git a/patches/eigen-3.3.7.diff b/patches/eigen-3.3.7.diff
new file mode 100644
index 00000000..d71574ad
--- /dev/null
+++ b/patches/eigen-3.3.7.diff
@@ -0,0 +1,140 @@
+diff -rwu eigen-3.3.7/Eigen/CholmodSupport inst/include/Eigen/CholmodSupport
+--- eigen-3.3.7/Eigen/CholmodSupport	2018-12-11 11:57:55.000000000 -0600
++++ inst/include/Eigen/CholmodSupport	2019-11-15 21:02:18.740853178 -0600
+@@ -13,7 +13,7 @@
+ #include "src/Core/util/DisableStupidWarnings.h"
+ 
+ extern "C" {
+-  #include <cholmod.h>
++  #include <RcppEigenCholmod.h>
+ }
+ 
+ /** \ingroup Support_modules
+@@ -45,4 +45,3 @@
+ #include "src/Core/util/ReenableStupidWarnings.h"
+ 
+ #endif // EIGEN_CHOLMODSUPPORT_MODULE_H
+-
+diff -rwu eigen-3.3.7/Eigen/src/Core/arch/CUDA/Half.h inst/include/Eigen/src/Core/arch/CUDA/Half.h
+--- eigen-3.3.7/Eigen/src/Core/arch/CUDA/Half.h	2018-12-11 11:57:55.000000000 -0600
++++ inst/include/Eigen/src/Core/arch/CUDA/Half.h	2019-11-15 20:50:19.577555015 -0600
+@@ -126,12 +126,14 @@
+   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
+     return static_cast<unsigned long>(half_impl::half_to_float(*this));
+   }
++  #if EIGEN_HAS_CXX11
+   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
+     return static_cast<long long>(half_impl::half_to_float(*this));
+   }
+   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
+     return static_cast<unsigned long long>(half_to_float(*this));
+   }
++  #endif
+   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
+     return half_impl::half_to_float(*this);
+   }
+diff -rwu eigen-3.3.7/Eigen/src/Core/arch/SSE/Complex.h inst/include/Eigen/src/Core/arch/SSE/Complex.h
+--- eigen-3.3.7/Eigen/src/Core/arch/SSE/Complex.h	2018-12-11 11:57:55.000000000 -0600
++++ inst/include/Eigen/src/Core/arch/SSE/Complex.h	2019-11-15 20:51:36.228633705 -0600
+@@ -98,10 +98,10 @@
+   res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast<const __m64*>(&from));
+ #elif EIGEN_GNUC_AT_LEAST(4,6)
+   // Suppress annoying "may be used uninitialized in this function" warning with gcc >= 4.6
+-  #pragma GCC diagnostic push
+-  #pragma GCC diagnostic ignored "-Wuninitialized"
++  //#pragma GCC diagnostic push
++  //#pragma GCC diagnostic ignored "-Wuninitialized"
+   res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
+-  #pragma GCC diagnostic pop
++  //#pragma GCC diagnostic pop
+ #else
+   res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
+ #endif
+diff -rwu eigen-3.3.7/Eigen/src/Core/util/DisableStupidWarnings.h inst/include/Eigen/src/Core/util/DisableStupidWarnings.h
+--- eigen-3.3.7/Eigen/src/Core/util/DisableStupidWarnings.h	2018-12-11 11:57:55.000000000 -0600
++++ inst/include/Eigen/src/Core/util/DisableStupidWarnings.h	2019-11-15 20:54:48.602319507 -0600
+@@ -4,7 +4,6 @@
+ #ifdef _MSC_VER
+   // 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p))
+   // 4101 - unreferenced local variable
+-  // 4127 - conditional expression is constant
+   // 4181 - qualifier applied to reference type ignored
+   // 4211 - nonstandard extension used : redefined extern to static
+   // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data
+@@ -20,7 +19,7 @@
+   #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+     #pragma warning( push )
+   #endif
+-  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
++  #pragma warning( disable : 4100 4101 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
+ 
+ #elif defined __INTEL_COMPILER
+   // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
+@@ -38,29 +37,30 @@
+ #elif defined __clang__
+   // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
+   //     this is really a stupid warning as it warns on compile-time expressions involving enums
+-  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+-    #pragma clang diagnostic push
+-  #endif
+-  #pragma clang diagnostic ignored "-Wconstant-logical-operand"
++  //#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
++  //  #pragma clang diagnostic push
++  //#endif
++  //#pragma clang diagnostic ignored "-Wconstant-logical-operand"
+ 
+ #elif defined __GNUC__
+ 
+-  #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) &&  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
+-    #pragma GCC diagnostic push
+-  #endif
+-  // g++ warns about local variables shadowing member functions, which is too strict
+-  #pragma GCC diagnostic ignored "-Wshadow"
+-  #if __GNUC__ == 4 && __GNUC_MINOR__ < 8
+-    // Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
+-    #pragma GCC diagnostic ignored "-Wtype-limits"
+-  #endif
+-  #if __GNUC__>=6
+-    #pragma GCC diagnostic ignored "-Wignored-attributes"
+-  #endif
++  // #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) &&  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
++  //   #pragma GCC diagnostic push
++  // #endif
++  // // g++ warns about local variables shadowing member functions, which is too strict
++  // #pragma GCC diagnostic ignored "-Wshadow"
++  // #if __GNUC__ == 4 && __GNUC_MINOR__ < 8
++  //   // Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
++  //   #pragma GCC diagnostic ignored "-Wtype-limits"
++  // #endif
++  // #if __GNUC__>=6
++  //   #pragma GCC diagnostic ignored "-Wignored-attributes"
++  // #endif
+ 
+ #endif
+ 
+ #if defined __NVCC__
++  #pragma diag_suppress boolean_controlling_expr_is_constant
+   // Disable the "statement is unreachable" message
+   #pragma diag_suppress code_is_unreachable
+   // Disable the "dynamic initialization in unreachable code" message
+@@ -78,6 +78,7 @@
+   #pragma diag_suppress 2671
+   #pragma diag_suppress 2735
+   #pragma diag_suppress 2737
++  #pragma diag_suppress 2739
+ #endif
+ 
+ #endif // not EIGEN_WARNINGS_DISABLED
+diff -rwu eigen-3.3.7/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
+--- eigen-3.3.7/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h	2018-12-11 11:57:55.000000000 -0600
++++ inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h	2019-11-15 20:57:13.664573037 -0600
+@@ -193,7 +193,9 @@
+         std::string curfile;
+         curfile = m_folder + "/" + m_curs_id->d_name;
+         // Discard if it is a folder
++#if !(defined(__sun) || defined(_AIX) || defined(__hpux) || defined(__sgi) || defined(__HAIKU__))
+         if (m_curs_id->d_type == DT_DIR) continue; //FIXME This may not be available on non BSD systems
++#endif
+ //         struct stat st_buf; 
+ //         stat (curfile.c_str(), &st_buf);
+ //         if (S_ISDIR(st_buf.st_mode)) continue;
diff --git a/patches/eigen-3.3.9.diff b/patches/eigen-3.3.9.diff
new file mode 100644
index 00000000..634c6b7e
--- /dev/null
+++ b/patches/eigen-3.3.9.diff
@@ -0,0 +1,147 @@
+diff '--exclude=CMakeLists.txt' -ruw eigen-3.3.9/Eigen/CholmodSupport inst/include/Eigen/CholmodSupport
+--- eigen-3.3.9/Eigen/CholmodSupport	2020-12-04 15:53:41.000000000 -0600
++++ inst/include/Eigen/CholmodSupport	2020-12-05 09:30:13.025906089 -0600
+@@ -13,7 +13,7 @@
+ #include "src/Core/util/DisableStupidWarnings.h"
+ 
+ extern "C" {
+-  #include <cholmod.h>
++  #include <RcppEigenCholmod.h>
+ }
+ 
+ /** \ingroup Support_modules
+@@ -45,4 +45,3 @@
+ #include "src/Core/util/ReenableStupidWarnings.h"
+ 
+ #endif // EIGEN_CHOLMODSUPPORT_MODULE_H
+-
+diff '--exclude=CMakeLists.txt' -ruw eigen-3.3.9/Eigen/src/Core/arch/CUDA/Half.h inst/include/Eigen/src/Core/arch/CUDA/Half.h
+--- eigen-3.3.9/Eigen/src/Core/arch/CUDA/Half.h	2020-12-04 15:53:41.000000000 -0600
++++ inst/include/Eigen/src/Core/arch/CUDA/Half.h	2020-12-05 09:31:13.765176934 -0600
+@@ -127,12 +127,14 @@
+   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long) const {
+     return static_cast<unsigned long>(half_impl::half_to_float(*this));
+   }
++#if EIGEN_HAS_CXX11
+   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(long long) const {
+     return static_cast<long long>(half_impl::half_to_float(*this));
+   }
+   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(unsigned long long) const {
+     return static_cast<unsigned long long>(half_to_float(*this));
+   }
++#endif
+   EIGEN_DEVICE_FUNC EIGEN_EXPLICIT_CAST(float) const {
+     return half_impl::half_to_float(*this);
+   }
+diff '--exclude=CMakeLists.txt' -ruw eigen-3.3.9/Eigen/src/Core/arch/SSE/Complex.h inst/include/Eigen/src/Core/arch/SSE/Complex.h
+--- eigen-3.3.9/Eigen/src/Core/arch/SSE/Complex.h	2020-12-04 15:53:41.000000000 -0600
++++ inst/include/Eigen/src/Core/arch/SSE/Complex.h	2020-12-05 09:32:22.436352085 -0600
+@@ -98,10 +98,10 @@
+   res.v = _mm_loadl_pi(_mm_set1_ps(0.0f), reinterpret_cast<const __m64*>(&from));
+ #elif EIGEN_GNUC_AT_LEAST(4,6)
+   // Suppress annoying "may be used uninitialized in this function" warning with gcc >= 4.6
+-  #pragma GCC diagnostic push
+-  #pragma GCC diagnostic ignored "-Wuninitialized"
++  //#pragma GCC diagnostic push
++  //#pragma GCC diagnostic ignored "-Wuninitialized"
+   res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
+-  #pragma GCC diagnostic pop
++  //#pragma GCC diagnostic pop
+ #else
+   res.v = _mm_loadl_pi(res.v, (const __m64*)&from);
+ #endif
+diff '--exclude=CMakeLists.txt' -ruw eigen-3.3.9/Eigen/src/Core/util/DisableStupidWarnings.h inst/include/Eigen/src/Core/util/DisableStupidWarnings.h
+--- eigen-3.3.9/Eigen/src/Core/util/DisableStupidWarnings.h	2020-12-04 15:53:41.000000000 -0600
++++ inst/include/Eigen/src/Core/util/DisableStupidWarnings.h	2020-12-05 09:37:01.133000285 -0600
+@@ -4,7 +4,6 @@
+ #ifdef _MSC_VER
+   // 4100 - unreferenced formal parameter (occurred e.g. in aligned_allocator::destroy(pointer p))
+   // 4101 - unreferenced local variable
+-  // 4127 - conditional expression is constant
+   // 4181 - qualifier applied to reference type ignored
+   // 4211 - nonstandard extension used : redefined extern to static
+   // 4244 - 'argument' : conversion from 'type1' to 'type2', possible loss of data
+@@ -20,7 +19,7 @@
+   #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+     #pragma warning( push )
+   #endif
+-  #pragma warning( disable : 4100 4101 4127 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
++  #pragma warning( disable : 4100 4101 4181 4211 4244 4273 4324 4503 4512 4522 4700 4714 4717 4800)
+ 
+ #elif defined __INTEL_COMPILER
+   // 2196 - routine is both "inline" and "noinline" ("noinline" assumed)
+@@ -38,32 +37,33 @@
+ #elif defined __clang__
+   // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
+   //     this is really a stupid warning as it warns on compile-time expressions involving enums
+-  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+-    #pragma clang diagnostic push
+-  #endif
+-  #pragma clang diagnostic ignored "-Wconstant-logical-operand"
++  //#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
++  //  #pragma clang diagnostic push
++  //#endif
++  //#pragma clang diagnostic ignored "-Wconstant-logical-operand"
+ 
+ #elif defined __GNUC__
+ 
+-  #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) &&  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
+-    #pragma GCC diagnostic push
+-  #endif
+-  // g++ warns about local variables shadowing member functions, which is too strict
+-  #pragma GCC diagnostic ignored "-Wshadow"
+-  #if __GNUC__ == 4 && __GNUC_MINOR__ < 8
+-    // Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
+-    #pragma GCC diagnostic ignored "-Wtype-limits"
+-  #endif
+-  #if __GNUC__>=6
+-    #pragma GCC diagnostic ignored "-Wignored-attributes"
+-  #endif
+-  #if __GNUC__==7
+-    // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325
+-    #pragma GCC diagnostic ignored "-Wattributes"
+-  #endif
++  // #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) &&  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
++  //   #pragma GCC diagnostic push
++  // #endif
++  // // g++ warns about local variables shadowing member functions, which is too strict
++  // #pragma GCC diagnostic ignored "-Wshadow"
++  // #if __GNUC__ == 4 && __GNUC_MINOR__ < 8
++  //   // Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
++  //   #pragma GCC diagnostic ignored "-Wtype-limits"
++  // #endif
++  // #if __GNUC__>=6
++  //   #pragma GCC diagnostic ignored "-Wignored-attributes"
++  // #endif
++  // #if __GNUC__==7
++  //   // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325
++  //   #pragma GCC diagnostic ignored "-Wattributes"
++  // #endif
+ #endif
+ 
+ #if defined __NVCC__
++  #pragma diag_suppress boolean_controlling_expr_is_constant
+   // Disable the "statement is unreachable" message
+   #pragma diag_suppress code_is_unreachable
+   // Disable the "dynamic initialization in unreachable code" message
+@@ -81,6 +81,7 @@
+   #pragma diag_suppress 2671
+   #pragma diag_suppress 2735
+   #pragma diag_suppress 2737
++  #pragma diag_suppress 2739
+ #endif
+ 
+ #else
+diff '--exclude=CMakeLists.txt' -ruw eigen-3.3.9/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
+--- eigen-3.3.9/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h	2020-12-04 15:53:41.000000000 -0600
++++ inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h	2020-12-05 09:38:56.131615691 -0600
+@@ -193,7 +193,9 @@
+         std::string curfile;
+         curfile = m_folder + "/" + m_curs_id->d_name;
+         // Discard if it is a folder
++#if !(defined(__sun) || defined(_AIX) || defined(__hpux) || defined(__sgi) || defined(__HAIKU__))
+         if (m_curs_id->d_type == DT_DIR) continue; //FIXME This may not be available on non BSD systems
++#endif
+ //         struct stat st_buf; 
+ //         stat (curfile.c_str(), &st_buf);
+ //         if (S_ISDIR(st_buf.st_mode)) continue;
diff --git a/patches/eigen-3.4.0.diff b/patches/eigen-3.4.0.diff
new file mode 100644
index 00000000..26d9215e
--- /dev/null
+++ b/patches/eigen-3.4.0.diff
@@ -0,0 +1,153 @@
+diff '--exclude=CMakeLists.txt' -ruw eigen-3.4.0/Eigen/CholmodSupport inst/include/Eigen/CholmodSupport
+--- eigen-3.4.0/Eigen/CholmodSupport	2021-08-19 04:41:58.000000000 +0800
++++ inst/include/Eigen/CholmodSupport	2021-10-26 12:58:06.061753725 +0800
+@@ -13,7 +13,7 @@
+ #include "src/Core/util/DisableStupidWarnings.h"
+
+ extern "C" {
+-  #include <cholmod.h>
++  #include <RcppEigenCholmod.h>
+ }
+
+ /** \ingroup Support_modules
+diff '--exclude=CMakeLists.txt' -ruw eigen-3.4.0/Eigen/src/CholmodSupport/CholmodSupport.h inst/include/Eigen/src/CholmodSupport/CholmodSupport.h
+--- eigen-3.4.0/Eigen/src/CholmodSupport/CholmodSupport.h	2021-08-19 04:41:58.000000000 +0800
++++ inst/include/Eigen/src/CholmodSupport/CholmodSupport.h	2021-10-26 14:01:03.556869005 +0800
+@@ -167,12 +167,10 @@
+ // template specializations for int and long that call the correct cholmod method
+
+ #define EIGEN_CHOLMOD_SPECIALIZE0(ret, name) \
+-    template<typename _StorageIndex> inline ret cm_ ## name       (cholmod_common &Common) { return cholmod_ ## name   (&Common); } \
+-    template<>                       inline ret cm_ ## name<SuiteSparse_long> (cholmod_common &Common) { return cholmod_l_ ## name (&Common); }
++    template<typename _StorageIndex> inline ret cm_ ## name       (cholmod_common &Common) { return cholmod_ ## name   (&Common); }
+
+ #define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1) \
+-    template<typename _StorageIndex> inline ret cm_ ## name       (t1& a1, cholmod_common &Common) { return cholmod_ ## name   (&a1, &Common); } \
+-    template<>                       inline ret cm_ ## name<SuiteSparse_long> (t1& a1, cholmod_common &Common) { return cholmod_l_ ## name (&a1, &Common); }
++    template<typename _StorageIndex> inline ret cm_ ## name       (t1& a1, cholmod_common &Common) { return cholmod_ ## name   (&a1, &Common); }
+
+ EIGEN_CHOLMOD_SPECIALIZE0(int, start)
+ EIGEN_CHOLMOD_SPECIALIZE0(int, finish)
+@@ -184,15 +182,15 @@
+ EIGEN_CHOLMOD_SPECIALIZE1(cholmod_factor*, analyze, cholmod_sparse, A)
+
+ template<typename _StorageIndex> inline cholmod_dense*  cm_solve         (int sys, cholmod_factor& L, cholmod_dense&  B, cholmod_common &Common) { return cholmod_solve     (sys, &L, &B, &Common); }
+-template<>                       inline cholmod_dense*  cm_solve<SuiteSparse_long>   (int sys, cholmod_factor& L, cholmod_dense&  B, cholmod_common &Common) { return cholmod_l_solve   (sys, &L, &B, &Common); }
++// template<>                       inline cholmod_dense*  cm_solve<SuiteSparse_long>   (int sys, cholmod_factor& L, cholmod_dense&  B, cholmod_common &Common) { return cholmod_l_solve   (sys, &L, &B, &Common); }
+
+ template<typename _StorageIndex> inline cholmod_sparse* cm_spsolve       (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_spsolve   (sys, &L, &B, &Common); }
+-template<>                       inline cholmod_sparse* cm_spsolve<SuiteSparse_long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
++// template<>                       inline cholmod_sparse* cm_spsolve<SuiteSparse_long> (int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common &Common) { return cholmod_l_spsolve (sys, &L, &B, &Common); }
+
+ template<typename _StorageIndex>
+ inline int  cm_factorize_p       (cholmod_sparse*  A, double beta[2], _StorageIndex* fset, std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_factorize_p   (A, beta, fset, fsize, L, &Common); }
+-template<>
+-inline int  cm_factorize_p<SuiteSparse_long> (cholmod_sparse*  A, double beta[2], SuiteSparse_long* fset,          std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
++// template<>
++// inline int  cm_factorize_p<SuiteSparse_long> (cholmod_sparse*  A, double beta[2], SuiteSparse_long* fset,          std::size_t fsize, cholmod_factor* L, cholmod_common &Common) { return cholmod_l_factorize_p (A, beta, fset, fsize, L, &Common); }
+
+ #undef EIGEN_CHOLMOD_SPECIALIZE0
+ #undef EIGEN_CHOLMOD_SPECIALIZE1
+diff '--exclude=CMakeLists.txt' -ruw eigen-3.4.0/Eigen/src/Core/util/DisableStupidWarnings.h inst/include/Eigen/src/Core/util/DisableStupidWarnings.h
+--- eigen-3.4.0/Eigen/src/Core/util/DisableStupidWarnings.h	2021-08-19 04:41:58.000000000 +0800
++++ inst/include/Eigen/src/Core/util/DisableStupidWarnings.h	2021-10-26 14:09:07.167883764 +0800
+@@ -37,40 +37,40 @@
+ #elif defined __clang__
+   // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
+   //     this is really a stupid warning as it warns on compile-time expressions involving enums
+-  #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+-    #pragma clang diagnostic push
+-  #endif
+-  #pragma clang diagnostic ignored "-Wconstant-logical-operand"
+-  #if __clang_major__ >= 3 && __clang_minor__ >= 5
+-    #pragma clang diagnostic ignored "-Wabsolute-value"
+-  #endif
+-  #if __clang_major__ >= 10
+-    #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion"
+-  #endif
+-  #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L
+-    // warning: generic selections are a C11-specific feature
+-    // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h
+-    #pragma clang diagnostic ignored "-Wc11-extensions"
+-  #endif
++  // #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
++  //   #pragma clang diagnostic push
++  // #endif
++  // #pragma clang diagnostic ignored "-Wconstant-logical-operand"
++  // #if __clang_major__ >= 3 && __clang_minor__ >= 5
++  //   #pragma clang diagnostic ignored "-Wabsolute-value"
++  // #endif
++  // #if __clang_major__ >= 10
++  //   #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion"
++  // #endif
++  // #if ( defined(__ALTIVEC__) || defined(__VSX__) ) && __cplusplus < 201103L
++  //   // warning: generic selections are a C11-specific feature
++  //   // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h
++  //   #pragma clang diagnostic ignored "-Wc11-extensions"
++  // #endif
+
+ #elif defined __GNUC__
+
+-  #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) &&  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
+-    #pragma GCC diagnostic push
+-  #endif
+-  // g++ warns about local variables shadowing member functions, which is too strict
+-  #pragma GCC diagnostic ignored "-Wshadow"
+-  #if __GNUC__ == 4 && __GNUC_MINOR__ < 8
+-    // Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
+-    #pragma GCC diagnostic ignored "-Wtype-limits"
+-  #endif
+-  #if __GNUC__>=6
+-    #pragma GCC diagnostic ignored "-Wignored-attributes"
+-  #endif
+-  #if __GNUC__==7
+-    // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325
+-    #pragma GCC diagnostic ignored "-Wattributes"
+-  #endif
++  // #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) &&  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
++  //   #pragma GCC diagnostic push
++  // #endif
++  // // g++ warns about local variables shadowing member functions, which is too strict
++  // #pragma GCC diagnostic ignored "-Wshadow"
++  // #if __GNUC__ == 4 && __GNUC_MINOR__ < 8
++  //   // Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
++  //   #pragma GCC diagnostic ignored "-Wtype-limits"
++  // #endif
++  // #if __GNUC__>=6
++  //   #pragma GCC diagnostic ignored "-Wignored-attributes"
++  // #endif
++  // #if __GNUC__==7
++  //   // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325
++  //   #pragma GCC diagnostic ignored "-Wattributes"
++  // #endif
+ #endif
+
+ #if defined __NVCC__
+diff '--exclude=CMakeLists.txt' -ruw eigen-3.4.0/Eigen/src/Core/util/ReenableStupidWarnings.h inst/include/Eigen/src/Core/util/ReenableStupidWarnings.h
+--- eigen-3.4.0/Eigen/src/Core/util/ReenableStupidWarnings.h	2021-08-19 04:41:58.000000000 +0800
++++ inst/include/Eigen/src/Core/util/ReenableStupidWarnings.h	2021-10-26 14:20:01.194903723 +0800
+@@ -11,9 +11,9 @@
+   #elif defined __INTEL_COMPILER
+     #pragma warning pop
+   #elif defined __clang__
+-    #pragma clang diagnostic pop
++    // #pragma clang diagnostic pop
+   #elif defined __GNUC__  &&  (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
+-    #pragma GCC diagnostic pop
++    // #pragma GCC diagnostic pop
+   #endif
+
+   #if defined __NVCC__
+diff '--exclude=CMakeLists.txt' -ruw eigen-3.4.0/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
+--- eigen-3.4.0/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h	2021-08-19 04:41:58.000000000 +0800
++++ inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h	2021-10-26 13:07:49.922771543 +0800
+@@ -193,7 +193,9 @@
+         std::string curfile;
+         curfile = m_folder + "/" + m_curs_id->d_name;
+         // Discard if it is a folder
++#if !(defined(__sun) || defined(_AIX) || defined(__hpux) || defined(__sgi) || defined(__HAIKU__))
+         if (m_curs_id->d_type == DT_DIR) continue; //FIXME This may not be available on non BSD systems
++#endif
+ //         struct stat st_buf;
+ //         stat (curfile.c_str(), &st_buf);
+ //         if (S_ISDIR(st_buf.st_mode)) continue;
diff --git a/patches/eigen-5.0.0.diff b/patches/eigen-5.0.0.diff
new file mode 100644
index 00000000..e6e48c11
--- /dev/null
+++ b/patches/eigen-5.0.0.diff
@@ -0,0 +1,330 @@
+diff --git c/inst/include/Eigen/CholmodSupport w/inst/include/Eigen/CholmodSupport
+index adc5f8d..bff39e6 100644
+--- c/inst/include/Eigen/CholmodSupport
++++ w/inst/include/Eigen/CholmodSupport
+@@ -12,7 +12,7 @@
+
+ #include "src/Core/util/DisableStupidWarnings.h"
+
+-#include <cholmod.h>
++#include <RcppEigenCholmod.h>
+
+ /** \ingroup Support_modules
+  * \defgroup CholmodSupport_Module CholmodSupport module
+diff --git c/inst/include/Eigen/src/CholmodSupport/CholmodSupport.h w/inst/include/Eigen/src/CholmodSupport/CholmodSupport.h
+index 7e3c881..758fb5a 100644
+--- c/inst/include/Eigen/src/CholmodSupport/CholmodSupport.h
++++ w/inst/include/Eigen/src/CholmodSupport/CholmodSupport.h
+@@ -13,6 +13,10 @@
+ // IWYU pragma: private
+ #include "./InternalHeaderCheck.h"
+
++#ifndef R_MATRIX_CHOLMOD
++# define R_MATRIX_CHOLMOD(_NAME_) cholmod_ ## _NAME_
++#endif
++
+ namespace Eigen {
+
+ namespace internal {
+@@ -84,8 +88,8 @@ cholmod_sparse viewAsCholmod(Ref<SparseMatrix<Scalar_, Options_, StorageIndex_>
+
+   if (internal::is_same<StorageIndex_, int>::value) {
+     res.itype = CHOLMOD_INT;
+-  } else if (internal::is_same<StorageIndex_, SuiteSparse_long>::value) {
+-    res.itype = CHOLMOD_LONG;
++  // } else if (internal::is_same<StorageIndex_, SuiteSparse_long>::value) {
++  //   res.itype = CHOLMOD_LONG;
+   } else {
+     eigen_assert(false && "Index type not supported yet");
+   }
+@@ -172,22 +176,14 @@ namespace internal {
+ #define EIGEN_CHOLMOD_SPECIALIZE0(ret, name)                        \
+   template <typename StorageIndex_>                                 \
+   inline ret cm_##name(cholmod_common& Common) {                    \
+-    return cholmod_##name(&Common);                                 \
+-  }                                                                 \
+-  template <>                                                       \
+-  inline ret cm_##name<SuiteSparse_long>(cholmod_common & Common) { \
+-    return cholmod_l_##name(&Common);                               \
+-  }
++    return R_MATRIX_CHOLMOD(name)(&Common);			    \
++  }
+
+-#define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1)                         \
+-  template <typename StorageIndex_>                                          \
+-  inline ret cm_##name(t1& a1, cholmod_common& Common) {                     \
+-    return cholmod_##name(&a1, &Common);                                     \
+-  }                                                                          \
+-  template <>                                                                \
+-  inline ret cm_##name<SuiteSparse_long>(t1 & a1, cholmod_common & Common) { \
+-    return cholmod_l_##name(&a1, &Common);                                   \
+-  }
++#define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1)                    \
++  template <typename StorageIndex_>                                     \
++  inline ret cm_##name(t1& a1, cholmod_common& Common) {                \
++    return R_MATRIX_CHOLMOD(name) (&a1, &Common);			\
++  }
+
+ EIGEN_CHOLMOD_SPECIALIZE0(int, start)
+ EIGEN_CHOLMOD_SPECIALIZE0(int, finish)
+@@ -201,33 +197,33 @@ EIGEN_CHOLMOD_SPECIALIZE1(cholmod_sparse*, factor_to_sparse, cholmod_factor, L)
+
+ template <typename StorageIndex_>
+ inline cholmod_dense* cm_solve(int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common& Common) {
+-  return cholmod_solve(sys, &L, &B, &Common);
+-}
+-template <>
+-inline cholmod_dense* cm_solve<SuiteSparse_long>(int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common& Common) {
+-  return cholmod_l_solve(sys, &L, &B, &Common);
++  return R_MATRIX_CHOLMOD(solve) (sys, &L, &B, &Common);
+ }
++// template <>
++// inline cholmod_dense* cm_solve<SuiteSparse_long>(int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common& Common) {
++//   return cholmod_l_solve(sys, &L, &B, &Common);
++// }
+
+ template <typename StorageIndex_>
+ inline cholmod_sparse* cm_spsolve(int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common& Common) {
+-  return cholmod_spsolve(sys, &L, &B, &Common);
+-}
+-template <>
+-inline cholmod_sparse* cm_spsolve<SuiteSparse_long>(int sys, cholmod_factor& L, cholmod_sparse& B,
+-                                                    cholmod_common& Common) {
+-  return cholmod_l_spsolve(sys, &L, &B, &Common);
++  return R_MATRIX_CHOLMOD(spsolve) (sys, &L, &B, &Common);
+ }
++// template <>
++// inline cholmod_sparse* cm_spsolve<SuiteSparse_long>(int sys, cholmod_factor& L, cholmod_sparse& B,
++//                                                     cholmod_common& Common) {
++//   return cholmod_l_spsolve(sys, &L, &B, &Common);
++// }
+
+ template <typename StorageIndex_>
+ inline int cm_factorize_p(cholmod_sparse* A, double beta[2], StorageIndex_* fset, std::size_t fsize, cholmod_factor* L,
+                           cholmod_common& Common) {
+-  return cholmod_factorize_p(A, beta, fset, fsize, L, &Common);
+-}
+-template <>
+-inline int cm_factorize_p<SuiteSparse_long>(cholmod_sparse* A, double beta[2], SuiteSparse_long* fset,
+-                                            std::size_t fsize, cholmod_factor* L, cholmod_common& Common) {
+-  return cholmod_l_factorize_p(A, beta, fset, fsize, L, &Common);
++  return R_MATRIX_CHOLMOD(factorize_p) (A, beta, fset, fsize, L, &Common);
+ }
++// template <>
++// inline int cm_factorize_p<SuiteSparse_long>(cholmod_sparse* A, double beta[2], SuiteSparse_long* fset,
++//                                             std::size_t fsize, cholmod_factor* L, cholmod_common& Common) {
++//   return cholmod_l_factorize_p(A, beta, fset, fsize, L, &Common);
++// }
+
+ #undef EIGEN_CHOLMOD_SPECIALIZE0
+ #undef EIGEN_CHOLMOD_SPECIALIZE1
+diff --git c/inst/include/Eigen/src/Core/util/DisableStupidWarnings.h w/inst/include/Eigen/src/Core/util/DisableStupidWarnings.h
+index ab0c542..8c27b14 100644
+--- c/inst/include/Eigen/src/Core/util/DisableStupidWarnings.h
++++ w/inst/include/Eigen/src/Core/util/DisableStupidWarnings.h
+@@ -42,45 +42,45 @@
+ #pragma warning disable 2196 279 1684 2259
+
+ #elif defined __clang__
+-#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+-#pragma clang diagnostic push
+-#endif
+-#if defined(__has_warning)
+-// -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
+-//     this is really a stupid warning as it warns on compile-time expressions involving enums
+-#if __has_warning("-Wconstant-logical-operand")
+-#pragma clang diagnostic ignored "-Wconstant-logical-operand"
+-#endif
+-#if __has_warning("-Wimplicit-int-float-conversion")
+-#pragma clang diagnostic ignored "-Wimplicit-int-float-conversion"
+-#endif
+-#if (defined(__ALTIVEC__) || defined(__VSX__)) && (!defined(__STDC_VERSION__) || (__STDC_VERSION__ < 201112L))
+-// warning: generic selections are a C11-specific feature
+-// ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h
+-#if __has_warning("-Wc11-extensions")
+-#pragma clang diagnostic ignored "-Wc11-extensions"
+-#endif
+-#endif
+-#endif
++// #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
++// #pragma clang diagnostic push
++// #endif
++// #if defined(__has_warning)
++// // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
++// //     this is really a stupid warning as it warns on compile-time expressions involving enums
++// #if __has_warning("-Wconstant-logical-operand")
++// #pragma clang diagnostic ignored "-Wconstant-logical-operand"
++// #endif
++// #if __has_warning("-Wimplicit-int-float-conversion")
++// #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion"
++// #endif
++// #if (defined(__ALTIVEC__) || defined(__VSX__)) && (!defined(__STDC_VERSION__) || (__STDC_VERSION__ < 201112L))
++// // warning: generic selections are a C11-specific feature
++// // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h
++// #if __has_warning("-Wc11-extensions")
++// #pragma clang diagnostic ignored "-Wc11-extensions"
++// #endif
++// #endif
++// #endif
+
+ #elif defined __GNUC__ && !defined(__FUJITSU)
+
+-#if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
+-#pragma GCC diagnostic push
+-#endif
+-// g++ warns about local variables shadowing member functions, which is too strict
+-#pragma GCC diagnostic ignored "-Wshadow"
+-#if __GNUC__ == 4 && __GNUC_MINOR__ < 8
+-// Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
+-#pragma GCC diagnostic ignored "-Wtype-limits"
+-#endif
+-#if __GNUC__ >= 6
+-#pragma GCC diagnostic ignored "-Wignored-attributes"
+-#endif
+-#if __GNUC__ == 7
+-// See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325
+-#pragma GCC diagnostic ignored "-Wattributes"
+-#endif
++// #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
++// #pragma GCC diagnostic push
++// #endif
++// // g++ warns about local variables shadowing member functions, which is too strict
++// #pragma GCC diagnostic ignored "-Wshadow"
++// #if __GNUC__ == 4 && __GNUC_MINOR__ < 8
++// // Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
++// #pragma GCC diagnostic ignored "-Wtype-limits"
++// #endif
++// #if __GNUC__ >= 6
++// #pragma GCC diagnostic ignored "-Wignored-attributes"
++// #endif
++// #if __GNUC__ == 7
++// // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325
++// #pragma GCC diagnostic ignored "-Wattributes"
++// #endif
+ #endif
+
+ #if defined __NVCC__ && defined __CUDACC__
+diff --git c/inst/include/RcppEigenForward.h w/inst/include/RcppEigenForward.h
+index ef347e4..a41212c 100644
+--- c/inst/include/RcppEigenForward.h
++++ w/inst/include/RcppEigenForward.h
+@@ -54,10 +54,10 @@ namespace Rcpp {
+ 	template<typename T> class Exporter< Eigen::Array<T, Eigen::Dynamic, 1> >;
+ 	template<typename T> class Exporter< Eigen::Array<T, 1, Eigen::Dynamic> >;
+     template<typename T> class Exporter< Eigen::Map<Eigen::SparseMatrix<T> > >;
+-	template<typename T> class Exporter< Eigen::MappedSparseMatrix<T> >;  // Deprecated
++    //template<typename T> class Exporter< Eigen::MappedSparseMatrix<T> >;  // Deprecated
+ 	template<typename T> class Exporter< Eigen::SparseMatrix<T> >;
+     template<typename T> class Exporter< Eigen::Map<Eigen::SparseMatrix<T, Eigen::RowMajor> > >;
+-	template<typename T> class Exporter< Eigen::MappedSparseMatrix<T, Eigen::RowMajor> >;  // Deprecated
++    //template<typename T> class Exporter< Eigen::MappedSparseMatrix<T, Eigen::RowMajor> >;  // Deprecated
+ 	template<typename T> class Exporter< Eigen::SparseMatrix<T, Eigen::RowMajor> >;
+
+     } // namespace traits
+diff --git c/inst/include/RcppEigenWrap.h w/inst/include/RcppEigenWrap.h
+index 3c0e04e..4750b61 100644
+--- c/inst/include/RcppEigenWrap.h
++++ w/inst/include/RcppEigenWrap.h
+@@ -310,24 +310,24 @@ namespace Rcpp{
+             IntegerVector d_dims, d_i, d_p;
+             Vector<RTYPE> xx ;
+         };
+-        // Deprecated
+-        template<typename T>
+-        class Exporter<Eigen::MappedSparseMatrix<T> > {
+-        public:
+-            const static int RTYPE = ::Rcpp::traits::r_sexptype_traits<T>::rtype ;
+-            Exporter(SEXP x) : d_x(x), d_dims(d_x.slot("Dim")), d_i(d_x.slot("i")), d_p(d_x.slot("p")), xx( d_x.slot("x") ) {
+-                if (!d_x.is("dgCMatrix"))
+-                    throw std::invalid_argument("Need S4 class dgCMatrix for a mapped sparse matrix");
+-            }
+-            Eigen::MappedSparseMatrix<T> get() {
+-                return Eigen::MappedSparseMatrix<T>(d_dims[0], d_dims[1], d_p[d_dims[1]],
+-                                                    d_p.begin(), d_i.begin(), xx.begin() );
+-            }
+-        protected:
+-            S4            d_x;
+-            IntegerVector d_dims, d_i, d_p;
+-            Vector<RTYPE> xx ;
+-        };
++        // // Deprecated
++        // template<typename T>
++        // class Exporter<Eigen::MappedSparseMatrix<T> > {
++        // public:
++        //     const static int RTYPE = ::Rcpp::traits::r_sexptype_traits<T>::rtype ;
++        //     Exporter(SEXP x) : d_x(x), d_dims(d_x.slot("Dim")), d_i(d_x.slot("i")), d_p(d_x.slot("p")), xx( d_x.slot("x") ) {
++        //         if (!d_x.is("dgCMatrix"))
++        //             throw std::invalid_argument("Need S4 class dgCMatrix for a mapped sparse matrix");
++        //     }
++        //     Eigen::MappedSparseMatrix<T> get() {
++        //         return Eigen::MappedSparseMatrix<T>(d_dims[0], d_dims[1], d_p[d_dims[1]],
++        //                                             d_p.begin(), d_i.begin(), xx.begin() );
++        //     }
++        // protected:
++        //     S4            d_x;
++        //     IntegerVector d_dims, d_i, d_p;
++        //     Vector<RTYPE> xx ;
++        // };
+
+         // Starting from Eigen 3.3 MappedSparseMatrix was deprecated.
+         // The new type is Map<SparseMatrix>.
+@@ -348,24 +348,24 @@ namespace Rcpp{
+             IntegerVector d_dims, d_j, d_p;
+             Vector<RTYPE> xx ;
+         };
+-        // Deprecated
+-        template<typename T>
+-        class Exporter<Eigen::MappedSparseMatrix<T, Eigen::RowMajor> > {
+-        public:
+-            const static int RTYPE = ::Rcpp::traits::r_sexptype_traits<T>::rtype ;
+-            Exporter(SEXP x) : d_x(x), d_dims(d_x.slot("Dim")), d_j(d_x.slot("j")), d_p(d_x.slot("p")), xx( d_x.slot("x") ) {
+-                if (!d_x.is("dgRMatrix"))
+-                    throw std::invalid_argument("Need S4 class dgRMatrix for a mapped sparse matrix");
+-            }
+-            Eigen::MappedSparseMatrix<T, Eigen::RowMajor> get() {
+-                return Eigen::MappedSparseMatrix<T, Eigen::RowMajor>(d_dims[0], d_dims[1], d_p[d_dims[1]],
+-                                                                     d_p.begin(), d_j.begin(), xx.begin() );
+-            }
+-        protected:
+-            S4            d_x;
+-            IntegerVector d_dims, d_j, d_p;
+-            Vector<RTYPE> xx ;
+-        };
++        // // Deprecated
++        // template<typename T>
++        // class Exporter<Eigen::MappedSparseMatrix<T, Eigen::RowMajor> > {
++        // public:
++        //     const static int RTYPE = ::Rcpp::traits::r_sexptype_traits<T>::rtype ;
++        //     Exporter(SEXP x) : d_x(x), d_dims(d_x.slot("Dim")), d_j(d_x.slot("j")), d_p(d_x.slot("p")), xx( d_x.slot("x") ) {
++        //         if (!d_x.is("dgRMatrix"))
++        //             throw std::invalid_argument("Need S4 class dgRMatrix for a mapped sparse matrix");
++        //     }
++        //     Eigen::MappedSparseMatrix<T, Eigen::RowMajor> get() {
++        //         return Eigen::MappedSparseMatrix<T, Eigen::RowMajor>(d_dims[0], d_dims[1], d_p[d_dims[1]],
++        //                                                              d_p.begin(), d_j.begin(), xx.begin() );
++        //     }
++        // protected:
++        //     S4            d_x;
++        //     IntegerVector d_dims, d_j, d_p;
++        //     Vector<RTYPE> xx ;
++        // };
+
+         template<typename T>
+         class Exporter<Eigen::SparseMatrix<T> > {
+diff --git c/inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h w/inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
+index 21b62f6..0ccb566 100644
+--- c/inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
++++ w/inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
+@@ -170,10 +170,10 @@ class MatrixMarketIterator {
+       m_isvalid = false;
+       std::string curfile;
+       curfile = m_folder + "/" + m_curs_id->d_name;
+-      // Discard if it is a folder
+ #if !(defined(__sun) || defined(_AIX) || defined(__hpux) || defined(__sgi) || defined(__HAIKU__))
+-      if (m_curs_id->d_type == DT_DIR) continue;  // FIXME This may not be available on non BSD systems
++      // Discard if it is a folder
+ #endif
++      if (m_curs_id->d_type == DT_DIR) continue;  // FIXME This may not be available on non BSD systems
+       //         struct stat st_buf;
+       //         stat (curfile.c_str(), &st_buf);
+       //         if (S_ISDIR(st_buf.st_mode)) continue;
diff --git a/patches/eigen-5.0.1.diff b/patches/eigen-5.0.1.diff
new file mode 100644
index 00000000..18bf1d60
--- /dev/null
+++ b/patches/eigen-5.0.1.diff
@@ -0,0 +1,222 @@
+diff '--exclude=CMakeLists.txt' -ruw ../eigen/Eigen/CholmodSupport inst/include/Eigen/CholmodSupport
+--- ../eigen/Eigen/CholmodSupport	2026-06-04 19:21:16.915228190 -0500
++++ inst/include/Eigen/CholmodSupport	2026-06-04 19:34:11.433660905 -0500
+@@ -12,7 +12,7 @@
+
+ #include "src/Core/util/DisableStupidWarnings.h"
+
+-#include <cholmod.h>
++#include <RcppEigenCholmod.h>
+
+ /** \ingroup Support_modules
+  * \defgroup CholmodSupport_Module CholmodSupport module
+diff '--exclude=CMakeLists.txt' -ruw ../eigen/Eigen/src/CholmodSupport/CholmodSupport.h inst/include/Eigen/src/CholmodSupport/CholmodSupport.h
+--- ../eigen/Eigen/src/CholmodSupport/CholmodSupport.h	2026-06-04 19:21:16.916228218 -0500
++++ inst/include/Eigen/src/CholmodSupport/CholmodSupport.h	2026-06-04 19:34:11.433726981 -0500
+@@ -13,6 +13,10 @@
+ // IWYU pragma: private
+ #include "./InternalHeaderCheck.h"
+
++#ifndef R_MATRIX_CHOLMOD
++# define R_MATRIX_CHOLMOD(_NAME_) cholmod_ ## _NAME_
++#endif
++
+ namespace Eigen {
+
+ namespace internal {
+@@ -84,8 +88,8 @@
+
+   if (internal::is_same<StorageIndex_, int>::value) {
+     res.itype = CHOLMOD_INT;
+-  } else if (internal::is_same<StorageIndex_, SuiteSparse_long>::value) {
+-    res.itype = CHOLMOD_LONG;
++  // } else if (internal::is_same<StorageIndex_, SuiteSparse_long>::value) {
++  //   res.itype = CHOLMOD_LONG;
+   } else {
+     eigen_assert(false && "Index type not supported yet");
+   }
+@@ -172,21 +176,13 @@
+ #define EIGEN_CHOLMOD_SPECIALIZE0(ret, name)                        \
+   template <typename StorageIndex_>                                 \
+   inline ret cm_##name(cholmod_common& Common) {                    \
+-    return cholmod_##name(&Common);                                 \
+-  }                                                                 \
+-  template <>                                                       \
+-  inline ret cm_##name<SuiteSparse_long>(cholmod_common & Common) { \
+-    return cholmod_l_##name(&Common);                               \
++    return R_MATRIX_CHOLMOD(name)(&Common);			    \
+   }
+
+ #define EIGEN_CHOLMOD_SPECIALIZE1(ret, name, t1, a1)                         \
+   template <typename StorageIndex_>                                          \
+   inline ret cm_##name(t1& a1, cholmod_common& Common) {                     \
+-    return cholmod_##name(&a1, &Common);                                     \
+-  }                                                                          \
+-  template <>                                                                \
+-  inline ret cm_##name<SuiteSparse_long>(t1 & a1, cholmod_common & Common) { \
+-    return cholmod_l_##name(&a1, &Common);                                   \
++    return R_MATRIX_CHOLMOD(name) (&a1, &Common);			\
+   }
+
+ EIGEN_CHOLMOD_SPECIALIZE0(int, start)
+@@ -201,33 +197,33 @@
+
+ template <typename StorageIndex_>
+ inline cholmod_dense* cm_solve(int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common& Common) {
+-  return cholmod_solve(sys, &L, &B, &Common);
+-}
+-template <>
+-inline cholmod_dense* cm_solve<SuiteSparse_long>(int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common& Common) {
+-  return cholmod_l_solve(sys, &L, &B, &Common);
++  return R_MATRIX_CHOLMOD(solve) (sys, &L, &B, &Common);
+ }
++// template <>
++// inline cholmod_dense* cm_solve<SuiteSparse_long>(int sys, cholmod_factor& L, cholmod_dense& B, cholmod_common& Common) {
++//   return cholmod_l_solve(sys, &L, &B, &Common);
++// }
+
+ template <typename StorageIndex_>
+ inline cholmod_sparse* cm_spsolve(int sys, cholmod_factor& L, cholmod_sparse& B, cholmod_common& Common) {
+-  return cholmod_spsolve(sys, &L, &B, &Common);
+-}
+-template <>
+-inline cholmod_sparse* cm_spsolve<SuiteSparse_long>(int sys, cholmod_factor& L, cholmod_sparse& B,
+-                                                    cholmod_common& Common) {
+-  return cholmod_l_spsolve(sys, &L, &B, &Common);
++  return R_MATRIX_CHOLMOD(spsolve) (sys, &L, &B, &Common);
+ }
++// template <>
++// inline cholmod_sparse* cm_spsolve<SuiteSparse_long>(int sys, cholmod_factor& L, cholmod_sparse& B,
++//                                                     cholmod_common& Common) {
++//   return cholmod_l_spsolve(sys, &L, &B, &Common);
++// }
+
+ template <typename StorageIndex_>
+ inline int cm_factorize_p(cholmod_sparse* A, double beta[2], StorageIndex_* fset, std::size_t fsize, cholmod_factor* L,
+                           cholmod_common& Common) {
+-  return cholmod_factorize_p(A, beta, fset, fsize, L, &Common);
+-}
+-template <>
+-inline int cm_factorize_p<SuiteSparse_long>(cholmod_sparse* A, double beta[2], SuiteSparse_long* fset,
+-                                            std::size_t fsize, cholmod_factor* L, cholmod_common& Common) {
+-  return cholmod_l_factorize_p(A, beta, fset, fsize, L, &Common);
++  return R_MATRIX_CHOLMOD(factorize_p) (A, beta, fset, fsize, L, &Common);
+ }
++// template <>
++// inline int cm_factorize_p<SuiteSparse_long>(cholmod_sparse* A, double beta[2], SuiteSparse_long* fset,
++//                                             std::size_t fsize, cholmod_factor* L, cholmod_common& Common) {
++//   return cholmod_l_factorize_p(A, beta, fset, fsize, L, &Common);
++// }
+
+ #undef EIGEN_CHOLMOD_SPECIALIZE0
+ #undef EIGEN_CHOLMOD_SPECIALIZE1
+diff '--exclude=CMakeLists.txt' -ruw ../eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h inst/include/Eigen/src/Core/arch/AltiVec/PacketMath.h
+--- ../eigen/Eigen/src/Core/arch/AltiVec/PacketMath.h   2026-06-04 19:21:16.924228445 -0500
++++ inst/include/Eigen/src/Core/arch/AltiVec/PacketMath.h       2026-06-04 19:48:47.058195837 -0500
+@@ -486,7 +486,7 @@
+   // Ignore partial input memory initialized
+ #if !EIGEN_COMP_LLVM
+ #pragma GCC diagnostic push
+-#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
++//#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+ #endif
+ #ifdef EIGEN_VECTORIZE_VSX
+   return vec_xl(0, const_cast<__UNPACK_TYPE__(Packet)*>(from));
+diff '--exclude=CMakeLists.txt' -ruw ../eigen/Eigen/src/Core/util/DisableStupidWarnings.h inst/include/Eigen/src/Core/util/DisableStupidWarnings.h
+--- ../eigen/Eigen/src/Core/util/DisableStupidWarnings.h	2026-06-04 19:21:16.930228615 -0500
++++ inst/include/Eigen/src/Core/util/DisableStupidWarnings.h	2026-06-04 19:34:11.433784882 -0500
+@@ -42,45 +42,45 @@
+ #pragma warning disable 2196 279 1684 2259
+
+ #elif defined __clang__
+-#ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
+-#pragma clang diagnostic push
+-#endif
+-#if defined(__has_warning)
+-// -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
+-//     this is really a stupid warning as it warns on compile-time expressions involving enums
+-#if __has_warning("-Wconstant-logical-operand")
+-#pragma clang diagnostic ignored "-Wconstant-logical-operand"
+-#endif
+-#if __has_warning("-Wimplicit-int-float-conversion")
+-#pragma clang diagnostic ignored "-Wimplicit-int-float-conversion"
+-#endif
+-#if (defined(__ALTIVEC__) || defined(__VSX__)) && (!defined(__STDC_VERSION__) || (__STDC_VERSION__ < 201112L))
+-// warning: generic selections are a C11-specific feature
+-// ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h
+-#if __has_warning("-Wc11-extensions")
+-#pragma clang diagnostic ignored "-Wc11-extensions"
+-#endif
+-#endif
+-#endif
++// #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS
++// #pragma clang diagnostic push
++// #endif
++// #if defined(__has_warning)
++// // -Wconstant-logical-operand - warning: use of logical && with constant operand; switch to bitwise & or remove constant
++// //     this is really a stupid warning as it warns on compile-time expressions involving enums
++// #if __has_warning("-Wconstant-logical-operand")
++// #pragma clang diagnostic ignored "-Wconstant-logical-operand"
++// #endif
++// #if __has_warning("-Wimplicit-int-float-conversion")
++// #pragma clang diagnostic ignored "-Wimplicit-int-float-conversion"
++// #endif
++// #if (defined(__ALTIVEC__) || defined(__VSX__)) && (!defined(__STDC_VERSION__) || (__STDC_VERSION__ < 201112L))
++// // warning: generic selections are a C11-specific feature
++// // ignoring warnings thrown at vec_ctf in Altivec/PacketMath.h
++// #if __has_warning("-Wc11-extensions")
++// #pragma clang diagnostic ignored "-Wc11-extensions"
++// #endif
++// #endif
++// #endif
+
+ #elif defined __GNUC__ && !defined(__FUJITSU)
+
+-#if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
+-#pragma GCC diagnostic push
+-#endif
+-// g++ warns about local variables shadowing member functions, which is too strict
+-#pragma GCC diagnostic ignored "-Wshadow"
+-#if __GNUC__ == 4 && __GNUC_MINOR__ < 8
+-// Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
+-#pragma GCC diagnostic ignored "-Wtype-limits"
+-#endif
+-#if __GNUC__ >= 6
+-#pragma GCC diagnostic ignored "-Wignored-attributes"
+-#endif
+-#if __GNUC__ == 7
+-// See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325
+-#pragma GCC diagnostic ignored "-Wattributes"
+-#endif
++// #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6))
++// #pragma GCC diagnostic push
++// #endif
++// // g++ warns about local variables shadowing member functions, which is too strict
++// #pragma GCC diagnostic ignored "-Wshadow"
++// #if __GNUC__ == 4 && __GNUC_MINOR__ < 8
++// // Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions:
++// #pragma GCC diagnostic ignored "-Wtype-limits"
++// #endif
++// #if __GNUC__ >= 6
++// #pragma GCC diagnostic ignored "-Wignored-attributes"
++// #endif
++// #if __GNUC__ == 7
++// // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325
++// #pragma GCC diagnostic ignored "-Wattributes"
++// #endif
+ #endif
+
+ #if defined __NVCC__ && defined __CUDACC__
+diff '--exclude=CMakeLists.txt' -ruw ../eigen/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h
+--- ../eigen/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h	2026-06-04 19:21:16.986230202 -0500
++++ inst/include/unsupported/Eigen/src/SparseExtra/MatrixMarketIterator.h	2026-06-04 19:38:28.113063684 -0500
+@@ -171,7 +171,9 @@
+       std::string curfile;
+       curfile = m_folder + "/" + m_curs_id->d_name;
+       // Discard if it is a folder
++#if !(defined(__sun) || defined(_AIX) || defined(__hpux) || defined(__sgi) || defined(__HAIKU__))
+       if (m_curs_id->d_type == DT_DIR) continue;  // FIXME This may not be available on non BSD systems
++#endif
+       //         struct stat st_buf;
+       //         stat (curfile.c_str(), &st_buf);
+       //         if (S_ISDIR(st_buf.st_mode)) continue;
diff --git a/patches/howToDiff.md b/patches/howToDiff.md
new file mode 100644
index 00000000..57e8086c
--- /dev/null
+++ b/patches/howToDiff.md
@@ -0,0 +1,14 @@
+
+### Hint
+
+```sh
+diff --exclude=CMakeLists.txt -ruw eigen-3.4.0/Eigen/ inst/include/Eigen/ > patches/eigen-3.4.0.diff
+diff --exclude=CMakeLists.txt -ruw eigen-3.4.0/unsupported/Eigen/ inst/include/unsupported/Eigen/ >> patches/eigen-3.4.0.diff
+```
+
+or when using a git checkout of eigen (at the appropriate tag and branch)
+
+```sh
+diff --exclude=CMakeLists.txt -ruw ../eigen/Eigen/ inst/include/Eigen > patches/eigen-5.0.1.diff
+diff --exclude=CMakeLists.txt -ruw ../eigen/unsupported/Eigen/ inst/include/unsupported/Eigen/ >> patches/eigen-5.0.1.diff
+```
diff --git a/src/Makevars b/src/Makevars
deleted file mode 100644
index 9c3ce5cc..00000000
--- a/src/Makevars
+++ /dev/null
@@ -1,11 +0,0 @@
-## -*- mode: makefile; -*-
-
-PKG_CXXFLAGS = -I../inst/include
-
-## With Rcpp 0.11.0 and later, we no longer need to set PKG_LIBS for
-## Rcpp as there is no user-facing library. 
-PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
-
-## With R 3.1.0 or later, you can uncomment the following line to tell R to 
-## enable compilation with C++11 (where available)
-#USE_CXX1X = 
diff --git a/src/Makevars.in b/src/Makevars.in
new file mode 100644
index 00000000..05606ca3
--- /dev/null
+++ b/src/Makevars.in
@@ -0,0 +1,5 @@
+## -*- mode: makefile; -*-
+
+PKG_CPPFLAGS = -I../inst/include
+PKG_CXXFLAGS = @OPENMP_FLAG@ @PKG_CXXFLAGS@
+PKG_LIBS= @OPENMP_FLAG@ $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS) @PKG_LIBS@
diff --git a/src/Makevars.win b/src/Makevars.win
index 9c3ce5cc..fdbe13c1 100644
--- a/src/Makevars.win
+++ b/src/Makevars.win
@@ -1,11 +1,5 @@
 ## -*- mode: makefile; -*-
 
-PKG_CXXFLAGS = -I../inst/include
+PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -I../inst/include
 
-## With Rcpp 0.11.0 and later, we no longer need to set PKG_LIBS for
-## Rcpp as there is no user-facing library. 
-PKG_LIBS = $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
-
-## With R 3.1.0 or later, you can uncomment the following line to tell R to 
-## enable compilation with C++11 (where available)
-#USE_CXX1X = 
+PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(LAPACK_LIBS) $(BLAS_LIBS) $(FLIBS)
diff --git a/src/RcppEigen.cpp b/src/RcppEigen.cpp
index 09002113..8d827632 100644
--- a/src/RcppEigen.cpp
+++ b/src/RcppEigen.cpp
@@ -1,8 +1,7 @@
-// -*- mode: C++; c-indent-level: 4; c-basic-offset: 4; tab-width: 8 -*-
-//
+
 // RcppEigen.cpp: Rcpp/Eigen glue
 //
-// Copyright (C) 2011 - 2015  Douglas Bates, Dirk Eddelbuettel and Romain Francois
+// Copyright (C) 2011 - 2025  Douglas Bates, Dirk Eddelbuettel and Romain Francois
 //
 // This file is part of RcppEigen.
 //
@@ -23,21 +22,41 @@
 
 // [[Rcpp::export]]
 Rcpp::IntegerVector eigen_version(bool single) {
-    using Rcpp::_;
-    using Rcpp::IntegerVector;
-	
     if (single) {
-        return Rcpp::wrap( 10000 * EIGEN_WORLD_VERSION +
-                           100 * EIGEN_MAJOR_VERSION + 
-                           EIGEN_MINOR_VERSION ) ;
+        return Rcpp::wrap(10000 * EIGEN_WORLD_VERSION +
+                          100 * EIGEN_MAJOR_VERSION +
+                          EIGEN_MINOR_VERSION) ;
     }
-	
-    return IntegerVector::create(_["major"] = EIGEN_WORLD_VERSION,
-                                 _["minor"] = EIGEN_MAJOR_VERSION,
-                                 _["patch"] = EIGEN_MINOR_VERSION);
+
+    return Rcpp::IntegerVector::create(Rcpp::Named("major") = EIGEN_WORLD_VERSION,
+                                       Rcpp::Named("minor") = EIGEN_MAJOR_VERSION,
+                                       Rcpp::Named("patch") = EIGEN_MINOR_VERSION);
+}
+
+// [[Rcpp::export]]
+Rcpp::List eigen_version_typed() {
+    // create a vector of major, minor, patch
+    auto v = Rcpp::IntegerVector::create(EIGEN_WORLD_VERSION, EIGEN_MAJOR_VERSION, EIGEN_MINOR_VERSION);
+    // and place it in a list (as e.g. packageVersion() in R returns)
+    auto l = Rcpp::List::create(v);
+    // and class it as 'package_version' accessing print() etc methods
+    l.attr("class") = Rcpp::CharacterVector::create("package_version", "numeric_version");
+    return l;
 }
 
 // [[Rcpp::export]]
 bool Eigen_SSE() {
     return Rcpp::wrap(Eigen::SimdInstructionSetsInUse());
 }
+
+//' @rdname RcppEigen_throttle_cores
+// [[Rcpp::export]]
+int EigenNbThreads() {
+    return Eigen::nbThreads();
+}
+
+//' @rdname RcppEigen_throttle_cores
+// [[Rcpp::export]]
+void EigenSetNbThreads(int n) {
+    Eigen::setNbThreads(n);
+}
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
index 2e0c3a35..ee856593 100644
--- a/src/RcppExports.cpp
+++ b/src/RcppExports.cpp
@@ -1,4 +1,4 @@
-// This file was generated by Rcpp::compileAttributes
+// Generated by using Rcpp::compileAttributes() -> do not edit by hand
 // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393
 
 #include "../inst/include/RcppEigen.h"
@@ -6,37 +6,87 @@
 
 using namespace Rcpp;
 
-// fastLm_Impl
-Rcpp::List fastLm_Impl(Rcpp::NumericMatrix X, Rcpp::NumericVector y, int type);
-RcppExport SEXP RcppEigen_fastLm_Impl(SEXP XSEXP, SEXP ySEXP, SEXP typeSEXP) {
-BEGIN_RCPP
-    Rcpp::RObject __result;
-    Rcpp::RNGScope __rngScope;
-    Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type X(XSEXP);
-    Rcpp::traits::input_parameter< Rcpp::NumericVector >::type y(ySEXP);
-    Rcpp::traits::input_parameter< int >::type type(typeSEXP);
-    __result = Rcpp::wrap(fastLm_Impl(X, y, type));
-    return __result;
-END_RCPP
-}
+#ifdef RCPP_USE_GLOBAL_ROSTREAM
+Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get();
+Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
+#endif
+
 // eigen_version
 Rcpp::IntegerVector eigen_version(bool single);
-RcppExport SEXP RcppEigen_eigen_version(SEXP singleSEXP) {
+RcppExport SEXP _RcppEigen_eigen_version(SEXP singleSEXP) {
 BEGIN_RCPP
-    Rcpp::RObject __result;
-    Rcpp::RNGScope __rngScope;
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
     Rcpp::traits::input_parameter< bool >::type single(singleSEXP);
-    __result = Rcpp::wrap(eigen_version(single));
-    return __result;
+    rcpp_result_gen = Rcpp::wrap(eigen_version(single));
+    return rcpp_result_gen;
+END_RCPP
+}
+// eigen_version_typed
+Rcpp::List eigen_version_typed();
+RcppExport SEXP _RcppEigen_eigen_version_typed() {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    rcpp_result_gen = Rcpp::wrap(eigen_version_typed());
+    return rcpp_result_gen;
 END_RCPP
 }
 // Eigen_SSE
 bool Eigen_SSE();
-RcppExport SEXP RcppEigen_Eigen_SSE() {
+RcppExport SEXP _RcppEigen_Eigen_SSE() {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    rcpp_result_gen = Rcpp::wrap(Eigen_SSE());
+    return rcpp_result_gen;
+END_RCPP
+}
+// EigenNbThreads
+int EigenNbThreads();
+RcppExport SEXP _RcppEigen_EigenNbThreads() {
 BEGIN_RCPP
-    Rcpp::RObject __result;
-    Rcpp::RNGScope __rngScope;
-    __result = Rcpp::wrap(Eigen_SSE());
-    return __result;
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    rcpp_result_gen = Rcpp::wrap(EigenNbThreads());
+    return rcpp_result_gen;
 END_RCPP
 }
+// EigenSetNbThreads
+void EigenSetNbThreads(int n);
+RcppExport SEXP _RcppEigen_EigenSetNbThreads(SEXP nSEXP) {
+BEGIN_RCPP
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< int >::type n(nSEXP);
+    EigenSetNbThreads(n);
+    return R_NilValue;
+END_RCPP
+}
+// fastLm_Impl
+Rcpp::List fastLm_Impl(Rcpp::NumericMatrix X, Rcpp::NumericVector y, int type);
+RcppExport SEXP _RcppEigen_fastLm_Impl(SEXP XSEXP, SEXP ySEXP, SEXP typeSEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< Rcpp::NumericMatrix >::type X(XSEXP);
+    Rcpp::traits::input_parameter< Rcpp::NumericVector >::type y(ySEXP);
+    Rcpp::traits::input_parameter< int >::type type(typeSEXP);
+    rcpp_result_gen = Rcpp::wrap(fastLm_Impl(X, y, type));
+    return rcpp_result_gen;
+END_RCPP
+}
+
+static const R_CallMethodDef CallEntries[] = {
+    {"_RcppEigen_eigen_version", (DL_FUNC) &_RcppEigen_eigen_version, 1},
+    {"_RcppEigen_eigen_version_typed", (DL_FUNC) &_RcppEigen_eigen_version_typed, 0},
+    {"_RcppEigen_Eigen_SSE", (DL_FUNC) &_RcppEigen_Eigen_SSE, 0},
+    {"_RcppEigen_EigenNbThreads", (DL_FUNC) &_RcppEigen_EigenNbThreads, 0},
+    {"_RcppEigen_EigenSetNbThreads", (DL_FUNC) &_RcppEigen_EigenSetNbThreads, 1},
+    {"_RcppEigen_fastLm_Impl", (DL_FUNC) &_RcppEigen_fastLm_Impl, 3},
+    {NULL, NULL, 0}
+};
+
+RcppExport void R_init_RcppEigen(DllInfo *dll) {
+    R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
+    R_useDynamicSymbols(dll, FALSE);
+}
diff --git a/src/fastLm.cpp b/src/fastLm.cpp
index 29a8b833..6cc7598f 100644
--- a/src/fastLm.cpp
+++ b/src/fastLm.cpp
@@ -2,7 +2,7 @@
 //
 // fastLm.cpp: Rcpp/Eigen example of a simple lm() alternative
 //
-// Copyright (C) 2011 - 2015  Douglas Bates, Dirk Eddelbuettel and Romain Francois
+// Copyright (C) 2011 - 2022  Douglas Bates, Dirk Eddelbuettel and Romain Francois
 //
 // This file is part of RcppEigen.
 //
@@ -25,6 +25,10 @@
 #include <R_ext/Lapack.h>
 #endif
 
+#ifndef FCONE
+# define FCONE
+#endif
+
 namespace lmsol {
     using Rcpp::_;
     using Rcpp::as;
@@ -44,17 +48,17 @@ namespace lmsol {
 	  m_y(y),
 	  m_n(X.rows()),
 	  m_p(X.cols()),
-	  m_coef(VectorXd::Constant(m_p, ::NA_REAL)),
+	  m_coef(VectorXd::Constant(m_p, ::NA_REAL)), 		// #nocov
 	  m_r(::NA_INTEGER),
 	  m_fitted(m_n),
-	  m_se(VectorXd::Constant(m_p, ::NA_REAL)),
+	  m_se(VectorXd::Constant(m_p, ::NA_REAL)), 		// #nocov
 	  m_usePrescribedThreshold(false) {
     }
 
-    lm& lm::setThreshold(const RealScalar& threshold) {
+    lm& lm::setThreshold(const RealScalar& threshold) { 	// #nocov start
 	m_usePrescribedThreshold = true;
 	m_prescribedThreshold = threshold;
-	return *this;
+	return *this;           				// #nocov end
     }
 
     inline ArrayXd lm::Dplus(const ArrayXd& d) {
@@ -71,16 +75,16 @@ namespace lmsol {
     }
 
     /** Returns the threshold that will be used by certain methods such as rank().
-     * 
+     *
      *  The default value comes from experimenting (see "LU precision
      *  tuning" thread on the Eigen list) and turns out to be
-     *  identical to Higham's formula used already in LDLt. 
+     *  identical to Higham's formula used already in LDLt.
      *
      *  @return The user-prescribed threshold or the default.
      */
     RealScalar lm::threshold() const {
 	return m_usePrescribedThreshold ? m_prescribedThreshold
-	    : numeric_limits<double>::epsilon() * m_p; 
+	    : numeric_limits<double>::epsilon() * m_p;
     }
 
     ColPivQR::ColPivQR(const Map<MatrixXd> &X, const Map<VectorXd> &y)
@@ -94,8 +98,8 @@ namespace lmsol {
 	    m_se       = Pmat * PQR.matrixQR().topRows(m_p).
 		triangularView<Upper>().solve(I_p()).rowwise().norm();
 	    return;
-	} 
-	MatrixXd                     Rinv(PQR.matrixQR().topLeftCorner(m_r, m_r).
+	}
+	MatrixXd                     Rinv(PQR.matrixQR().topLeftCorner(m_r, m_r). 	// #nocov start
 					  triangularView<Upper>().
 					  solve(MatrixXd::Identity(m_r, m_r)));
 	VectorXd                  effects(PQR.householderQ().adjoint() * y);
@@ -106,9 +110,9 @@ namespace lmsol {
 	effects.tail(m_n - m_r).setZero();
 	m_fitted                          = PQR.householderQ() * effects;
 	m_se.head(m_r)                    = Rinv.rowwise().norm();
-	m_se                              = Pmat * m_se;
+	m_se                              = Pmat * m_se; 				// #nocov end
     }
-    
+
     QR::QR(const Map<MatrixXd> &X, const Map<VectorXd> &y) : lm(X, y) {
 	HouseholderQR<MatrixXd> QR(X);
 	m_coef                     = QR.solve(y);
@@ -116,15 +120,15 @@ namespace lmsol {
 	m_se                       = QR.matrixQR().topRows(m_p).
 	    triangularView<Upper>().solve(I_p()).rowwise().norm();
     }
-    
-    
+
+
     Llt::Llt(const Map<MatrixXd> &X, const Map<VectorXd> &y) : lm(X, y) {
 	LLT<MatrixXd>  Ch(XtX().selfadjointView<Lower>());
 	m_coef            = Ch.solve(X.adjoint() * y);
 	m_fitted          = X * m_coef;
 	m_se              = Ch.matrixL().solve(I_p()).colwise().norm();
     }
-    
+
     Ldlt::Ldlt(const Map<MatrixXd> &X, const Map<VectorXd> &y) : lm(X, y) {
 	LDLT<MatrixXd> Ch(XtX().selfadjointView<Lower>());
 	Dplus(Ch.vectorD());	// to set the rank
@@ -136,19 +140,19 @@ namespace lmsol {
 	m_fitted          = X * m_coef;
 	m_se              = Ch.solve(I_p()).diagonal().array().sqrt();
     }
-    
+
     int gesdd(MatrixXd& A, ArrayXd& S, MatrixXd& Vt) {
 	int info, mone = -1, m = A.rows(), n = A.cols();
 	std::vector<int> iwork(8 * n);
 	double wrk;
 	if (m < n || S.size() != n || Vt.rows() != n || Vt.cols() != n)
-	    throw std::invalid_argument("dimension mismatch in gesvd");
+	    throw std::invalid_argument("dimension mismatch in gesvd"); // #nocov
 	F77_CALL(dgesdd)("O", &m, &n, A.data(), &m, S.data(), A.data(),
-			 &m, Vt.data(), &n, &wrk, &mone, &iwork[0], &info);
+			 &m, Vt.data(), &n, &wrk, &mone, &iwork[0], &info FCONE);
 	int lwork(wrk);
 	std::vector<double> work(lwork);
 	F77_CALL(dgesdd)("O", &m, &n, A.data(), &m, S.data(), A.data(),
-			 &m, Vt.data(), &n, &work[0], &lwork, &iwork[0], &info);
+			 &m, Vt.data(), &n, &work[0], &lwork, &iwork[0], &info FCONE);
 	return info;
     }
 
@@ -163,7 +167,7 @@ namespace lmsol {
     }
 
     SVD::SVD(const Map<MatrixXd> &X, const Map<VectorXd> &y) : lm(X, y) {
-	JacobiSVD<MatrixXd>  UDV(X.jacobiSvd(ComputeThinU|ComputeThinV));
+    JacobiSVD<MatrixXd>  UDV(X, ComputeThinU | ComputeThinV);
 	MatrixXd             VDi(UDV.matrixV() *
 				 Dplus(UDV.singularValues().array()).matrix().asDiagonal());
 	m_coef                   = VDi * UDV.matrixU().adjoint() * y;
@@ -222,7 +226,7 @@ namespace lmsol {
             if (!(colnames).isNULL())
                 coef.attr("names") = clone(CharacterVector(colnames));
         }
-	    
+
         VectorXd         resid = y - ans.fitted();
         int               rank = ans.rank();
         int                 df = (rank == ::NA_INTEGER) ? n - X.cols() : n - rank;
@@ -240,9 +244,8 @@ namespace lmsol {
     }
 }
 
-// This defines the R-callable function 'fastLm' 
+// This defines the R-callable function 'fastLm'
 // [[Rcpp::export]]
 Rcpp::List fastLm_Impl(Rcpp::NumericMatrix X, Rcpp::NumericVector y, int type) {
-    return lmsol::fastLm(X, y, type); 
+    return lmsol::fastLm(X, y, type);
 }
-
diff --git a/tests/doRUnit.R b/tests/doRUnit.R
deleted file mode 100644
index 72162cd4..00000000
--- a/tests/doRUnit.R
+++ /dev/null
@@ -1,33 +0,0 @@
-#### doRUnit.R --- Run RUnit tests
-####------------------------------------------------------------------------
-
-### borrowed from package fUtilities in RMetrics
-### http://r-forge.r-project.org/plugins/scmsvn/viewcvs.php/pkg/fUtilities/tests/doRUnit.R?rev=1958&root=rmetrics&view=markup
-
-### Originally follows Gregor Gojanc's example in CRAN package  'gdata'
-### and the corresponding section in the R Wiki:
-###  http://wiki.r-project.org/rwiki/doku.php?id=developers:runit
-
-### MM: Vastly changed:  This should also be "runnable" for *installed*
-##              package which has no ./tests/
-## ----> put the bulk of the code e.g. in  ../inst/unitTests/runTests.R :
-
-if(require("RUnit", quietly = TRUE)) {
-    pkg <- "RcppEigen"
-
-    require( pkg, character.only=TRUE)
-
-    path <- system.file("unitTests", package = pkg)
-
-    stopifnot(file.exists(path), file.info(path.expand(path))$isdir)
-
-    ## without this, we get unit test failures
-    Sys.setenv( R_TESTS = "" )
-
-    RcppEigen.unit.test.output.dir <- getwd()
-
-    source(file.path(path, "runTests.R"), echo = TRUE)
-
-} else {
-    print( "package RUnit not available, cannot run unit tests" )
-}
diff --git a/tests/tinytest.R b/tests/tinytest.R
new file mode 100644
index 00000000..65023dd0
--- /dev/null
+++ b/tests/tinytest.R
@@ -0,0 +1,15 @@
+
+if (requireNamespace("tinytest", quietly=TRUE) &&
+    utils::packageVersion("tinytest") >= "1.0.0") {
+
+    ## Set a seed to make the test deterministic
+    set.seed(42)
+
+    ## R makes us to this
+    Sys.setenv("R_TESTS"="")
+
+    ## there are several more granular ways to test files in a tinytest directory,
+    ## see its package vignette; tests can also run once the package is installed
+    ## using the same command `test_package(pkgName)`, or by director or file
+    tinytest::test_package("RcppEigen", ncpu=getOption("Ncpus", 1))
+}
diff --git a/vignettes/RcppEigen-Introduction.pdf b/vignettes/RcppEigen-Introduction.pdf
new file mode 100644
index 00000000..1fa8017e
Binary files /dev/null and b/vignettes/RcppEigen-Introduction.pdf differ
diff --git a/vignettes/RcppEigen-Introduction.pdf.asis b/vignettes/RcppEigen-Introduction.pdf.asis
new file mode 100644
index 00000000..b80fab78
--- /dev/null
+++ b/vignettes/RcppEigen-Introduction.pdf.asis
@@ -0,0 +1,5 @@
+%\VignetteIndexEntry{RcppEigen-intro}
+%\VignetteKeywords{linear algebra, template programming, C++, R, Rcpp}
+%\VignettePackage{RcppEigen}
+%\VignetteEncoding{UTF-8}
+%\VignetteEngine{Rcpp::asis}
diff --git a/vignettes/rnw/Makefile b/vignettes/rnw/Makefile
new file mode 100644
index 00000000..d29332b1
--- /dev/null
+++ b/vignettes/rnw/Makefile
@@ -0,0 +1,16 @@
+sources := 		RcppEigen-Introduction.Rnw
+vignettes := 		RcppEigen-Introduction.pdf
+
+all:			${vignettes}
+
+${vignettes}: 		${sources}
+			Rscript -e 'utils::Sweave("RcppEigen-Introduction.Rnw")'
+			pdflatex RcppEigen-Introduction.tex
+			bibtex RcppEigen-Introduction.aux
+			pdflatex RcppEigen-Introduction.tex
+			pdflatex RcppEigen-Introduction.tex
+			Rscript -e 'tools::compactPDF("$@", gs_quality="ebook")'
+			cp $@ ..
+
+clean:
+			@rm -rf *.aux *.log *.out *.tex *.pdf *.bbl *.blg pinp.cls auto/
diff --git a/vignettes/RcppEigen-Introduction.Rnw b/vignettes/rnw/RcppEigen-Introduction.Rnw
similarity index 99%
rename from vignettes/RcppEigen-Introduction.Rnw
rename to vignettes/rnw/RcppEigen-Introduction.Rnw
index b0fde203..4929c2ea 100644
--- a/vignettes/RcppEigen-Introduction.Rnw
+++ b/vignettes/rnw/RcppEigen-Introduction.Rnw
@@ -43,13 +43,13 @@ prettyDate <- format(Sys.Date(), "%B %e, %Y")
   University of Wisconsin-Madison \\
   Madison, WI, United States of America \\
   E-mail: \email{bates@stat.wisc.edu} \\
-  URL: \url{http://www.stat.wisc.edu/~bates/}\\
+  URL: \url{https://www.stat.wisc.edu/~bates/}\\
 
   Dirk Eddelbuettel \\
   Debian Project \\
   River Forest, IL, United States of America\\
   E-mail: \email{edd@debian.org}\\
-  URL: \url{http://dirk.eddelbuettel.com}\\
+  URL: \url{https://dirk.eddelbuettel.com}\\
 }
 
 \usepackage{Sweave}
@@ -85,7 +85,7 @@ prettyDate <- format(Sys.Date(), "%B %e, %Y")
 
 \begin{quote} \footnotesize
   This vignette corresponds to a
-  \href{http://www.jstatsoft.org/v52/i05/}{paper published} in the
+  \href{https://www.jstatsoft.org/v52/i05/}{paper published} in the
   \textsl{Journal of Statistical Software}. Currently still identical
   to the paper, this vignette version may over time receive minor updates.
   For citations, please use \citet{JSS:RcppEigen} as provided by \code{citation("RcppEigen")}.
@@ -132,13 +132,13 @@ matrices to represent projections in a visualization application. \pkg{Eigen}
 grew from there and has over the course of about a decade produced three
 major releases with \pkg{Eigen}3 being the current major version. To
 check the minor and patch version numbers, load the \pkg{RcppEigen}
-package and call
+package and call this (internal) helper function:
 \begin{CodeInput}
-R> .Call("eigen_version", FALSE)
+R> RcppEigen:::eigen_version(FALSE)
 \end{CodeInput}
 \begin{CodeOutput}
 major minor patch 
-    3     1     1 
+    3     3     5 
 \end{CodeOutput}
 \pkg{Eigen} is of interest as the \proglang{R} system for statistical
 computation and graphics \citep{R:Main} is itself easily extensible. This is
diff --git a/vignettes/jss.bst b/vignettes/rnw/jss.bst
similarity index 100%
rename from vignettes/jss.bst
rename to vignettes/rnw/jss.bst